From 12c90bd612fe7eb9e991bbfce8ca0ec43a3caccb Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Thu, 18 Jan 2024 09:44:56 -0600
Subject: [PATCH 001/843] [LinkerWrapper] Handle AMDGPU Target-IDs correctly
 when linking (#78359)

Summary:
The linker wrapper's job is to sort various embedded inputs into a list
of files that participate in a single link job. So far, this has been
completely 1-to-1, that is, each input file participates in exactly one
link job. However, support for AMD's target-id requires that one input
file may participate in multiple link jobs. For example, if given a
`gfx90a` static library and a `gfx90a:xnack+` object file input, we
should link the `gfx90a` target into the `gfx90a:xnack+` job. These are
considered separate CPUs, but they are mutually link-compatible.

This patch adds the necessary logic to make this happen. It reworks the
input handling to copy the relevant input files into a separate list,
which moves construction of the final list of link jobs into the
extraction phase. We also copy a file whenever it is needed by more
than one job, since the rest of the workflow expects ownership of its
inputs.
---
 clang/lib/Driver/ToolChains/Clang.cpp         |  2 +-
 clang/test/Driver/amdgpu-openmp-toolchain.c   |  2 +-
 clang/test/Driver/linker-wrapper.c            | 21 +++++
 .../ClangLinkerWrapper.cpp                    | 82 +++++++++++--------
 llvm/include/llvm/Object/OffloadBinary.h      | 25 ++++++
 llvm/lib/Object/OffloadBinary.cpp             | 32 ++++++++
 6 files changed, 126 insertions(+), 38 deletions(-)

diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp
index 997ec2d491d02..6e4fbe6816810 100644
--- a/clang/lib/Driver/ToolChains/Clang.cpp
+++ b/clang/lib/Driver/ToolChains/Clang.cpp
@@ -8700,7 +8700,7 @@ void OffloadPackager::ConstructJob(Compilation &C, const JobAction &JA,
     SmallVector<std::string> Parts{
         "file=" + File.str(),
         "triple=" + TC->getTripleString(),
-        "arch=" + getProcessorFromTargetID(TC->getTriple(), Arch).str(),
+        "arch=" + Arch.str(),
         "kind=" + Kind.str(),
     };
 
diff --git a/clang/test/Driver/amdgpu-openmp-toolchain.c b/clang/test/Driver/amdgpu-openmp-toolchain.c
index f38486ad0cccc..daa41b216089b 100644
--- a/clang/test/Driver/amdgpu-openmp-toolchain.c
+++ b/clang/test/Driver/amdgpu-openmp-toolchain.c
@@ -65,7 +65,7 @@
 // RUN: %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a:sramecc-:xnack+ \
 // RUN:   -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID
 
-// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a,kind=openmp,feature=-sramecc,feature=+xnack
+// CHECK-TARGET-ID: clang-offload-packager{{.*}}arch=gfx90a:sramecc-:xnack+,kind=openmp,feature=-sramecc,feature=+xnack
 
 // RUN: not %clang -### -target x86_64-pc-linux-gnu -fopenmp --offload-arch=gfx90a,gfx90a:xnack+ \
 // RUN:   -nogpulib %s 2>&1 | FileCheck %s --check-prefix=CHECK-TARGET-ID-ERROR
 
diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c
index e51c5ea381d31..a6de616b05e9f 100644
--- a/clang/test/Driver/linker-wrapper.c
+++ b/clang/test/Driver/linker-wrapper.c
@@ -2,6 +2,11 @@
 // REQUIRES: nvptx-registered-target
 // REQUIRES: amdgpu-registered-target
 
+// UNSUPPORTED: system-linux
+
+// An externally visible variable so static libraries extract.
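+// (Static archive members are only extracted when they provide a symbol the
+// link needs, hence the externally visible definition below.)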
+__attribute__((visibility("protected"), used)) int x;
+
 // RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.elf.o
 // RUN: %clang -cc1 %s -triple nvptx64-nvidia-cuda -emit-llvm-bc -o %t.nvptx.bc
 // RUN: %clang -cc1 %s -triple amdgcn-amd-amdhsa -emit-llvm-bc -o %t.amdgpu.bc
@@ -150,3 +155,19 @@
 // RUN:   --linker-path=/usr/bin/lld-link -- %t.o -libpath:./ -out:a.exe 2>&1 | FileCheck %s --check-prefix=COFF
 
 // COFF: "/usr/bin/lld-link" {{.*}}.o -libpath:./ -out:a.exe {{.*}}openmp.image.wrapper{{.*}}
+
+// RUN: clang-offload-packager -o %t-lib.out \
+// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t.o -fembed-offload-object=%t-lib.out
+// RUN: llvm-ar rcs %t.a %t.o
+// RUN: clang-offload-packager -o %t-on.out \
+// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a:xnack+
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t-on.o -fembed-offload-object=%t-on.out
+// RUN: clang-offload-packager -o %t-off.out \
+// RUN:   --image=file=%t.elf.o,kind=openmp,triple=amdgcn-amd-amdhsa,arch=gfx90a:xnack-
+// RUN: %clang -cc1 %s -triple x86_64-unknown-linux-gnu -emit-obj -o %t-off.o -fembed-offload-object=%t-off.out
+// RUN: clang-linker-wrapper --host-triple=x86_64-unknown-linux-gnu --dry-run \
+// RUN:   --linker-path=/usr/bin/ld -- %t-on.o %t-off.o %t.a -o a.out 2>&1 | FileCheck %s --check-prefix=AMD-TARGET-ID
+
+// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack+ -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
+// AMD-TARGET-ID: clang{{.*}} -o {{.*}}.img --target=amdgcn-amd-amdhsa -mcpu=gfx90a:xnack- -O2 -Wl,--no-undefined {{.*}}.o {{.*}}.o
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index 82cec1778bff7..03c83e2f92b32 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -538,7 +538,8 @@ std::unique_ptr<lto::LTO> createLTO(
     const ArgList &Args, const std::vector<std::string> &Features,
     ModuleHook Hook = [](size_t, const Module &) { return true; }) {
   const llvm::Triple Triple(Args.getLastArgValue(OPT_triple_EQ));
-  StringRef Arch = Args.getLastArgValue(OPT_arch_EQ);
+  // We need to remove AMD's target-id from the processor if present.
+  StringRef Arch = Args.getLastArgValue(OPT_arch_EQ).split(":").first;
   lto::Config Conf;
   lto::ThinBackend Backend;
   // TODO: Handle index-only thin-LTO
@@ -1066,24 +1067,14 @@ DerivedArgList getLinkerArgs(ArrayRef<OffloadFile> Input,
 
 /// Transforms all the extracted offloading input files into an image that can
 /// be registered by the runtime.
-Expected<SmallVector<StringRef>>
-linkAndWrapDeviceFiles(SmallVectorImpl<OffloadFile> &LinkerInputFiles,
-                       const InputArgList &Args, char **Argv, int Argc) {
+Expected<SmallVector<StringRef>> linkAndWrapDeviceFiles(
+    SmallVectorImpl<SmallVector<OffloadFile>> &LinkerInputFiles,
+    const InputArgList &Args, char **Argv, int Argc) {
   llvm::TimeTraceScope TimeScope("Handle all device input");
 
-  DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> InputMap;
-  for (auto &File : LinkerInputFiles)
-    InputMap[File].emplace_back(std::move(File));
-  LinkerInputFiles.clear();
-
-  SmallVector<SmallVector<OffloadFile>> InputsForTarget;
-  for (auto &[ID, Input] : InputMap)
-    InputsForTarget.emplace_back(std::move(Input));
-  InputMap.clear();
-
   std::mutex ImageMtx;
   DenseMap<OffloadKind, SmallVector<OffloadingImage>> Images;
-  auto Err = parallelForEachError(InputsForTarget, [&](auto &Input) -> Error {
+  auto Err = parallelForEachError(LinkerInputFiles, [&](auto &Input) -> Error {
     llvm::TimeTraceScope TimeScope("Link device input");
 
     // Each thread needs its own copy of the base arguments to maintain
@@ -1359,8 +1350,9 @@ Expected<bool> getSymbols(StringRef Image, OffloadKind Kind, bool IsArchive,
 /// Search the input files and libraries for embedded device offloading code
 /// and add it to the list of files to be linked. Files coming from static
 /// libraries are only added to the input if they are used by an existing
-/// input file.
-Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
+/// input file. Returns a list of input files intended for a single linking job.
+Expected<SmallVector<SmallVector<OffloadFile>>>
+getDeviceInput(const ArgList &Args) {
   llvm::TimeTraceScope TimeScope("ExtractDeviceCode");
 
   StringRef Root = Args.getLastArgValue(OPT_sysroot_EQ);
@@ -1372,7 +1364,7 @@ Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
   StringSaver Saver(Alloc);
 
   // Try to extract device code from the linker input files.
-  SmallVector<OffloadFile> InputFiles;
+  DenseMap<OffloadFile::TargetID, SmallVector<OffloadFile>> InputFiles;
   DenseMap<OffloadFile::TargetID, DenseSet<StringRef>> Syms;
   bool WholeArchive = Args.hasArg(OPT_wholearchive_flag) ? true : false;
   for (const opt::Arg *Arg : Args.filtered(
@@ -1416,25 +1408,39 @@ Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
       while (Extracted) {
         Extracted = false;
         for (OffloadFile &Binary : Binaries) {
+          // If the binary was previously extracted it will be set to null.
           if (!Binary.getBinary())
             continue;
 
-          // If we don't have an object file for this architecture do not
-          // extract.
-          if (IsArchive && !WholeArchive && !Syms.count(Binary))
-            continue;
-
-          Expected<bool> ExtractOrErr =
-              getSymbols(Binary.getBinary()->getImage(),
-                         Binary.getBinary()->getOffloadKind(), IsArchive, Saver,
-                         Syms[Binary]);
-          if (!ExtractOrErr)
-            return ExtractOrErr.takeError();
-
-          Extracted = !WholeArchive && *ExtractOrErr;
-
-          if (!IsArchive || WholeArchive || Extracted)
-            InputFiles.emplace_back(std::move(Binary));
+          SmallVector<OffloadFile::TargetID> CompatibleTargets = {Binary};
+          for (const auto &[ID, Input] : InputFiles)
+            if (object::areTargetsCompatible(Binary, ID))
+              CompatibleTargets.emplace_back(ID);
+
+          for (const auto &[Index, ID] : llvm::enumerate(CompatibleTargets)) {
+            // Only extract if we have an object matching this target.
+            if (IsArchive && !WholeArchive && !InputFiles.count(ID))
+              continue;
+
+            Expected<bool> ExtractOrErr = getSymbols(
+                Binary.getBinary()->getImage(),
+                Binary.getBinary()->getOffloadKind(), IsArchive, Saver,
+                Syms[ID]);
+            if (!ExtractOrErr)
+              return ExtractOrErr.takeError();
+
+            Extracted = !WholeArchive && *ExtractOrErr;
+
+            // Skip including the file if it is an archive that does not resolve
+            // any symbols.
+            if (IsArchive && !WholeArchive && !Extracted)
+              continue;
+
+            // If another target needs this binary it must be copied instead.
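+            // Only the final compatible target takes ownership of the
+            // original binary; every earlier one receives a deep copy.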
+            if (Index == CompatibleTargets.size() - 1)
+              InputFiles[ID].emplace_back(std::move(Binary));
+            else
+              InputFiles[ID].emplace_back(std::move(Binary.copy()));
+          }
 
           // If we extracted any files we need to check all the symbols again.
           if (Extracted)
@@ -1447,10 +1453,14 @@ Expected<SmallVector<OffloadFile>> getDeviceInput(const ArgList &Args) {
     auto FileOrErr = getInputBitcodeLibrary(Library);
     if (!FileOrErr)
       return FileOrErr.takeError();
-    InputFiles.push_back(std::move(*FileOrErr));
+    InputFiles[*FileOrErr].push_back(std::move(*FileOrErr));
   }
 
-  return std::move(InputFiles);
+  SmallVector<SmallVector<OffloadFile>> InputsForTarget;
+  for (auto &[ID, Input] : InputFiles)
+    InputsForTarget.emplace_back(std::move(Input));
+
+  return std::move(InputsForTarget);
 }
 
 } // namespace
diff --git a/llvm/include/llvm/Object/OffloadBinary.h b/llvm/include/llvm/Object/OffloadBinary.h
index dda1e7f1eafbb..13383d5f07ba0 100644
--- a/llvm/include/llvm/Object/OffloadBinary.h
+++ b/llvm/include/llvm/Object/OffloadBinary.h
@@ -162,6 +162,19 @@ class OffloadFile : public OwningBinary<OffloadBinary> {
               std::unique_ptr<MemoryBuffer> Buffer)
       : OwningBinary<OffloadBinary>(std::move(Binary), std::move(Buffer)) {}
 
+  /// Make a deep copy of this offloading file.
+  OffloadFile copy() const {
+    std::unique_ptr<MemoryBuffer> Buffer = MemoryBuffer::getMemBufferCopy(
+        getBinary()->getMemoryBufferRef().getBuffer());
+
+    // This parsing should never fail because it has already been parsed.
+    auto NewBinaryOrErr = OffloadBinary::create(*Buffer);
+    assert(NewBinaryOrErr && "Failed to parse a copy of the binary?");
+    if (!NewBinaryOrErr)
+      llvm::consumeError(NewBinaryOrErr.takeError());
+    return OffloadFile(std::move(*NewBinaryOrErr), std::move(Buffer));
+  }
+
   /// We use the Triple and Architecture pair to group linker inputs together.
   /// This conversion function lets us use these inputs in a hash-map.
   operator TargetID() const {
@@ -186,6 +199,18 @@ OffloadKind getOffloadKind(StringRef Name);
 /// Convert an offload kind to its string representation.
 StringRef getOffloadKindName(OffloadKind Name);
 
+/// If the target is AMD we check the target IDs for mutual compatibility. A
+/// target-id is a string conforming to the following BNF syntax:
+///
+///   target-id ::= '<arch> ( : <feature> ( '+' | '-' ) )*'
+///
+/// The features 'xnack' and 'sramecc' are currently supported. Each can be in
+/// the state on, off, or (when unspecified) any. A target marked as any can
+/// bind with either on or off. This is used to link mutually compatible
+/// architectures together. Returns false in the case of an exact match.
+bool areTargetsCompatible(const OffloadFile::TargetID &LHS,
+                          const OffloadFile::TargetID &RHS);
+
 } // namespace object
 } // namespace llvm
 
diff --git a/llvm/lib/Object/OffloadBinary.cpp b/llvm/lib/Object/OffloadBinary.cpp
index 1de784c44da10..bfc35e41fe658 100644
--- a/llvm/lib/Object/OffloadBinary.cpp
+++ b/llvm/lib/Object/OffloadBinary.cpp
@@ -343,3 +343,35 @@ StringRef object::getImageKindName(ImageKind Kind) {
     return "";
   }
 }
+
+bool object::areTargetsCompatible(const OffloadFile::TargetID &LHS,
+                                  const OffloadFile::TargetID &RHS) {
+  // Exact matches are not considered compatible because they are the same
+  // target. We are interested in different targets that are compatible.
+  if (LHS == RHS)
+    return false;
+
+  // The triples must match at all times.
+  if (LHS.first != RHS.first)
+    return false;
+
+  // Only the AMDGPU target requires additional checks.
+  llvm::Triple T(LHS.first);
+  if (!T.isAMDGPU())
+    return false;
+
+  // The base processor must always match.
+  if (LHS.second.split(":").first != RHS.second.split(":").first)
+    return false;
+
+  // Check combinations of on / off features that must match.
+  if (LHS.second.contains("xnack+") && RHS.second.contains("xnack-"))
+    return false;
+  if (LHS.second.contains("xnack-") && RHS.second.contains("xnack+"))
+    return false;
+  if (LHS.second.contains("sramecc-") && RHS.second.contains("sramecc+"))
+    return false;
+  if (LHS.second.contains("sramecc+") && RHS.second.contains("sramecc-"))
+    return false;
+  return true;
+}

From 9cd41289989f07d08f14d8a67ccc2d6445cc7d43 Mon Sep 17 00:00:00 2001
From: Aart Bik <39774503+aartbik@users.noreply.github.com>
Date: Thu, 18 Jan 2024 07:52:42 -0800
Subject: [PATCH 002/843] [mlir][sparse] add a 3-d block and fiber test
 (#78529)

---
 .../SparseTensor/CPU/sparse_block3d.mlir      | 122 ++++++++++++++++++
 1 file changed, 122 insertions(+)
 create mode 100755 mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
new file mode 100755
index 0000000000000..df12a6e042dde
--- /dev/null
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
@@ -0,0 +1,122 @@
+//--------------------------------------------------------------------------------------------------
+// WHEN CREATING A NEW TEST, PLEASE JUST COPY & PASTE WITHOUT EDITS.
+//
+// Set-up that's shared across all tests in this directory. In principle, this
+// config could be moved to lit.local.cfg. However, there are downstream users that
+// do not use these LIT config files. Hence why this is kept inline.
+//
+// DEFINE: %{sparsifier_opts} = enable-runtime-library=true
+// DEFINE: %{sparsifier_opts_sve} = enable-arm-sve=true %{sparsifier_opts}
+// DEFINE: %{compile} = mlir-opt %s --sparsifier="%{sparsifier_opts}"
+// DEFINE: %{compile_sve} = mlir-opt %s --sparsifier="%{sparsifier_opts_sve}"
+// DEFINE: %{run_libs} = -shared-libs=%mlir_c_runner_utils,%mlir_runner_utils
+// DEFINE: %{run_opts} = -e main -entry-point-result=void
+// DEFINE: %{run} = mlir-cpu-runner %{run_opts} %{run_libs}
+// DEFINE: %{run_sve} = %mcr_aarch64_cmd --march=aarch64 --mattr="+sve" %{run_opts} %{run_libs}
+//
+// DEFINE: %{env} =
+//--------------------------------------------------------------------------------------------------
+
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation.
+// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and vectorization.
+// REDEFINE: %{sparsifier_opts} = enable-runtime-library=false enable-buffer-initialization=true vl=2 reassociate-fp-reductions=true enable-index-optimizations=true
+// RUN: %{compile} | %{run} | FileCheck %s
+//
+// Do the same run, but now with direct IR generation and VLA vectorization.
+// RUN: %if mlir_arm_sve_tests %{ %{compile_sve} | %{run_sve} | FileCheck %s %}
+
+#Sparse1 = #sparse_tensor.encoding<{
+  map = (i, j, k) -> (
+    i : compressed,
+    j : compressed,
+    k : compressed
+  )
+}>
+
+#Sparse2 = #sparse_tensor.encoding<{
+  map = (i, j, k) -> (
+    i floordiv 2 : compressed,
+    j floordiv 2 : compressed,
+    k floordiv 2 : compressed,
+    i mod 2 : dense,
+    j mod 2 : dense,
+    k mod 2 : dense)
+}>
+
+module {
+
+  //
+  // Main driver that tests sparse tensor storage.
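+  // It converts one dense 4x4x4 tensor into the two encodings above and
+  // prints the entry count and the stored values of each result.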
+ // + func.func @main() { + %c0 = arith.constant 0 : index + %i0 = arith.constant 0 : i32 + + // Setup input dense tensor and convert to two sparse tensors. + %d = arith.constant dense <[ + [ // i=0 + [ 1, 0, 0, 0 ], + [ 0, 0, 0, 0 ], + [ 0, 0, 0, 0 ], + [ 0, 0, 5, 0 ] ], + [ // i=1 + [ 2, 0, 0, 0 ], + [ 0, 0, 0, 0 ], + [ 0, 0, 0, 0 ], + [ 0, 0, 6, 0 ] ], + [ //i=2 + [ 3, 0, 0, 0 ], + [ 0, 0, 0, 0 ], + [ 0, 0, 0, 0 ], + [ 0, 0, 7, 0 ] ], + //i=3 + [ [ 4, 0, 0, 0 ], + [ 0, 0, 0, 0 ], + [ 0, 0, 0, 0 ], + [ 0, 0, 8, 0 ] ] + ]> : tensor<4x4x4xi32> + + %a = sparse_tensor.convert %d : tensor<4x4x4xi32> to tensor<4x4x4xi32, #Sparse1> + %b = sparse_tensor.convert %d : tensor<4x4x4xi32> to tensor<4x4x4xi32, #Sparse2> + + // + // If we store the two "fibers" [1,2,3,4] starting at index (0,0,0) and + // ending at index (3,0,0) and [5,6,7,8] starting at index (0,3,2) and + // ending at index (3,3,2)) with a “DCSR-flavored” along (j,k) with + // dense “fibers” in the i-dim, we end up with 8 stored entries. + // + // CHECK: 8 + // CHECK-NEXT: ( 1, 5, 2, 6, 3, 7, 4, 8 ) + // + %na = sparse_tensor.number_of_entries %a : tensor<4x4x4xi32, #Sparse1> + vector.print %na : index + %ma = sparse_tensor.values %a: tensor<4x4x4xi32, #Sparse1> to memref + %va = vector.transfer_read %ma[%c0], %i0: memref, vector<8xi32> + vector.print %va : vector<8xi32> + + // + // If we store full 2x2x2 3-D blocks in the original index order + // in a compressed fashion, we end up with 4 blocks to incorporate + // all the nonzeros, and thus 32 stored entries. + // + // CHECK: 32 + // CHECK-NEXT: ( 1, 0, 0, 0, 2, 0, 0, 0, 0, 0, 5, 0, 0, 0, 6, 0, 3, 0, 0, 0, 4, 0, 0, 0, 0, 0, 7, 0, 0, 0, 8, 0 ) + // + %nb = sparse_tensor.number_of_entries %b : tensor<4x4x4xi32, #Sparse2> + vector.print %nb : index + %mb = sparse_tensor.values %b: tensor<4x4x4xi32, #Sparse2> to memref + %vb = vector.transfer_read %mb[%c0], %i0: memref, vector<32xi32> + vector.print %vb : vector<32xi32> + + // Release the resources. + bufferization.dealloc_tensor %a : tensor<4x4x4xi32, #Sparse1> + bufferization.dealloc_tensor %b : tensor<4x4x4xi32, #Sparse2> + + return + } +} From f20488687e54a7202b0c90a9d3bee540c5e40f9a Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 18 Jan 2024 16:56:37 +0100 Subject: [PATCH 003/843] [CVP] Add test with nested cycle (NFC) This is a regression test for a miscompile that would have been introduced by an upcoming patch. --- .../nested-cycle.ll | 55 +++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 llvm/test/Transforms/CorrelatedValuePropagation/nested-cycle.ll diff --git a/llvm/test/Transforms/CorrelatedValuePropagation/nested-cycle.ll b/llvm/test/Transforms/CorrelatedValuePropagation/nested-cycle.ll new file mode 100644 index 0000000000000..7e5ce71f129a4 --- /dev/null +++ b/llvm/test/Transforms/CorrelatedValuePropagation/nested-cycle.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S -passes=jump-threading,correlated-propagation < %s | FileCheck %s + +; This runs both jump threading and CVP to query values in the right order. +; The comparison should not fold. 
+ +define void @test(i32 %n, i1 %c, i1 %c2) { +; CHECK-LABEL: define void @test( +; CHECK-SAME: i32 [[N:%.*]], i1 [[C:%.*]], i1 [[C2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[IV]], 0 +; CHECK-NEXT: call void @use(i1 [[CMP]]) +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i32 [[IV]], 1 +; CHECK-NEXT: br label [[LOOP2:%.*]] +; CHECK: loop2: +; CHECK-NEXT: br i1 [[C]], label [[LOOP2_LATCH:%.*]], label [[LOOP2_LATCH]] +; CHECK: loop2.latch: +; CHECK-NEXT: br i1 [[C2]], label [[LOOP_LATCH]], label [[LOOP2]] +; CHECK: loop.latch: +; CHECK-NEXT: [[EXIT_COND:%.*]] = icmp eq i32 [[IV_NEXT]], [[N]] +; CHECK-NEXT: br i1 [[EXIT_COND]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: ret void +; +entry: + br label %loop + +loop: + %iv = phi i32 [ %iv.next, %loop.latch ], [ 0, %entry ] + %cmp = icmp eq i32 %iv, 0 + call void @use(i1 %cmp) + %iv.next = add nuw nsw i32 %iv, 1 + br label %loop2 + +loop2: + br i1 %c, label %loop2.latch, label %loop2.split + +loop2.split: + br label %loop2.latch + +loop2.latch: + br i1 %c2, label %loop.latch, label %loop2 + +loop.latch: + %exit.cond = icmp eq i32 %iv.next, %n + br i1 %exit.cond, label %exit, label %loop + +exit: + ret void +} + +declare void @use(i1) From f4ede08c61cba631de204398a91aa59bac9d5db9 Mon Sep 17 00:00:00 2001 From: Zequan Wu Date: Thu, 18 Jan 2024 11:06:57 -0500 Subject: [PATCH 004/843] [lldb][Format] Fix missing inlined function names in frame formatting. (#78494) This fixes missing inlined function names when formatting frame and the `Block` in `SymbolContext` is a lexical block (e.g. `DW_TAG_lexical_block` in Dwarf). 
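The old code located the containing inlined block but then asked
`sc->block->GetInlinedFunctionInfo()` for the name, which returns null when
`sc->block` is itself a lexical block rather than the inlined block. The fix
(factored into a `FormatInlinedBlock` helper in FormatEntity.cpp) queries the
containing inlined block instead, condensed from the patch below:

  if (Block *inline_block = sc->block->GetContainingInlinedBlock())
    if (const InlineFunctionInfo *info =
            inline_block->GetInlinedFunctionInfo())
      // append " [inlined] " and the inlined function's name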
--- lldb/source/Core/FormatEntity.cpp | 38 ++++++------- .../Language/CPlusPlus/CPlusPlusLanguage.cpp | 2 +- lldb/test/Shell/Settings/Inputs/names.cpp | 8 ++- .../Shell/Settings/TestFrameFormatName.test | 55 +++++++++++++++++++ .../Settings/TestFrameFormatNameWithArgs.test | 26 --------- 5 files changed, 80 insertions(+), 49 deletions(-) create mode 100644 lldb/test/Shell/Settings/TestFrameFormatName.test delete mode 100644 lldb/test/Shell/Settings/TestFrameFormatNameWithArgs.test diff --git a/lldb/source/Core/FormatEntity.cpp b/lldb/source/Core/FormatEntity.cpp index 94986457552d9..3c665c2eb2133 100644 --- a/lldb/source/Core/FormatEntity.cpp +++ b/lldb/source/Core/FormatEntity.cpp @@ -1093,6 +1093,19 @@ static void PrettyPrintFunctionNameWithArgs(Stream &out_stream, out_stream.PutChar(')'); } +static void FormatInlinedBlock(Stream &out_stream, Block *block) { + if (!block) + return; + Block *inline_block = block->GetContainingInlinedBlock(); + if (inline_block) { + if (const InlineFunctionInfo *inline_info = + inline_block->GetInlinedFunctionInfo()) { + out_stream.PutCString(" [inlined] "); + inline_info->GetName().Dump(&out_stream); + } + } +} + bool FormatEntity::FormatStringRef(const llvm::StringRef &format_str, Stream &s, const SymbolContext *sc, const ExecutionContext *exe_ctx, @@ -1592,18 +1605,7 @@ bool FormatEntity::Format(const Entry &entry, Stream &s, if (name) { s.PutCString(name); - - if (sc->block) { - Block *inline_block = sc->block->GetContainingInlinedBlock(); - if (inline_block) { - const InlineFunctionInfo *inline_info = - sc->block->GetInlinedFunctionInfo(); - if (inline_info) { - s.PutCString(" [inlined] "); - inline_info->GetName().Dump(&s); - } - } - } + FormatInlinedBlock(s, sc->block); return true; } } @@ -1638,6 +1640,7 @@ bool FormatEntity::Format(const Entry &entry, Stream &s, name = sc->symbol->GetNameNoArguments(); if (name) { s.PutCString(name.GetCString()); + FormatInlinedBlock(s, sc->block); return true; } } @@ -1678,7 +1681,7 @@ bool FormatEntity::Format(const Entry &entry, Stream &s, if (inline_block) { get_function_vars = false; - inline_info = sc->block->GetInlinedFunctionInfo(); + inline_info = inline_block->GetInlinedFunctionInfo(); if (inline_info) variable_list_sp = inline_block->GetBlockVariableList(true); } @@ -1733,14 +1736,7 @@ bool FormatEntity::Format(const Entry &entry, Stream &s, if (!name) return false; s.PutCString(name); - - if (sc->block && sc->block->GetContainingInlinedBlock()) { - if (const InlineFunctionInfo *inline_info = - sc->block->GetInlinedFunctionInfo()) { - s.PutCString(" [inlined] "); - inline_info->GetName().Dump(&s); - } - } + FormatInlinedBlock(s, sc->block); return true; } case Entry::Type::FunctionAddrOffset: diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 7131ccb9d05ec..21c73e6361904 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -1646,7 +1646,7 @@ bool CPlusPlusLanguage::GetFunctionDisplayName( if (inline_block) { get_function_vars = false; - inline_info = sc->block->GetInlinedFunctionInfo(); + inline_info = inline_block->GetInlinedFunctionInfo(); if (inline_info) variable_list_sp = inline_block->GetBlockVariableList(true); } diff --git a/lldb/test/Shell/Settings/Inputs/names.cpp b/lldb/test/Shell/Settings/Inputs/names.cpp index cf6982abb8f35..79ff93be6b76e 100644 --- a/lldb/test/Shell/Settings/Inputs/names.cpp +++ 
b/lldb/test/Shell/Settings/Inputs/names.cpp @@ -26,6 +26,12 @@ int anon_bar() { return 1; } auto anon_lambda = [] {}; } // namespace +__attribute__((always_inline)) int inlined_foo(const char *str) { + if (bool b = bar()) + return 1; + return 2; +} + int main() { ns::foo(bar); ns::foo("bar"); @@ -37,6 +43,6 @@ int main() { f.foo(anon_bar); f.operator<< <(2 > 1)>(0); f.returns_func_ptr(detail::Quux{}); - + inlined_foo("bar"); return 0; } diff --git a/lldb/test/Shell/Settings/TestFrameFormatName.test b/lldb/test/Shell/Settings/TestFrameFormatName.test new file mode 100644 index 0000000000000..caa3242527c6e --- /dev/null +++ b/lldb/test/Shell/Settings/TestFrameFormatName.test @@ -0,0 +1,55 @@ +# UNSUPPORTED: system-windows +# Test different name formats. + +# RUN: %build %S/Inputs/names.cpp --std c++17 -o %t.out +# RUN: split-file %s %t + +#--- name_with_args.input +# RUN: %lldb -b -s %t/name_with_args.input %t.out | FileCheck %s --check-prefix=NAME_WITH_ARGS +settings set -f frame-format "frame ${function.name-with-args}\n" +break set -n foo +break set -n operator<< +break set -n returns_func_ptr +break set -n inlined_foo +run +# NAME_WITH_ARGS: frame int ns::foo(t={{.*}}) +c +# NAME_WITH_ARGS: frame int ns::foo(str="bar") +c +# NAME_WITH_ARGS: frame int ns::foo<(anonymous namespace)::$_0>(t=(anonymous namespace)::(unnamed class) @ {{.*}}) +c +# NAME_WITH_ARGS: frame int ns::foo(t=({{.*}}`(anonymous namespace)::anon_bar() at {{.*}})) +c +# NAME_WITH_ARGS: frame int ns::foo(str="method") +c +# NAME_WITH_ARGS: frame ns::returns_func_ptr((null)={{.*}}) +c +# NAME_WITH_ARGS: frame void Foo::foo(this={{.*}}, arg=({{.*}}`(anonymous namespace)::anon_bar() at {{.*}})) +c +# NAME_WITH_ARGS: frame void Foo::operator<<<1>(this={{.*}}, (null)=0) +c +# NAME_WITH_ARGS: frame Foo::returns_func_ptr(this={{.*}}, (null)={{.*}}) +c +# NAME_WITH_ARGS: frame main [inlined] inlined_foo(str="bar") +q + +#--- name.input +# RUN: %lldb -b -s %t/name.input %t.out | FileCheck %s --check-prefix=NAME +settings set -f frame-format "frame ${function.name}\n" +break set -n inlined_foo +run +# NAME: frame main [inlined] inlined_foo(char const*) + +#--- name_without_args.input +# RUN: %lldb -b -s %t/name_without_args.input %t.out | FileCheck %s --check-prefix=NAME_WITHOUT_ARGS +settings set -f frame-format "frame ${function.name-without-args}\n" +break set -n inlined_foo +run +# NAME_WITHOUT_ARGS: frame main [inlined] inlined_foo(char const*) + +#--- mangled_name.input +# RUN: %lldb -b -s %t/mangled_name.input %t.out | FileCheck %s --check-prefix=MANGLED_NAME +settings set -f frame-format "frame ${function.mangled-name}\n" +break set -n inlined_foo +run +# MANGLED_NAME: frame main [inlined] inlined_foo(char const*) diff --git a/lldb/test/Shell/Settings/TestFrameFormatNameWithArgs.test b/lldb/test/Shell/Settings/TestFrameFormatNameWithArgs.test deleted file mode 100644 index eeb46cfd6bf9d..0000000000000 --- a/lldb/test/Shell/Settings/TestFrameFormatNameWithArgs.test +++ /dev/null @@ -1,26 +0,0 @@ -# UNSUPPORTED: system-windows -# RUN: %build %S/Inputs/names.cpp --std c++17 -o %t.out -# RUN: %lldb -b -s %s %t.out | FileCheck %s -settings set -f frame-format "frame ${function.name-with-args}\n" -break set -n foo -break set -n operator<< -break set -n returns_func_ptr -run -# CHECK: frame int ns::foo(t={{.*}}) -c -# CHECK: frame int ns::foo(str="bar") -c -# CHECK: frame int ns::foo<(anonymous namespace)::$_0>(t=(anonymous namespace)::(unnamed class) @ {{.*}}) -c -# CHECK: frame int ns::foo(t=({{.*}}`(anonymous 
namespace)::anon_bar() at {{.*}})) -c -# CHECK: frame int ns::foo(str="method") -c -# CHECK: frame ns::returns_func_ptr((null)={{.*}}) -c -# CHECK: frame void Foo::foo(this={{.*}}, arg=({{.*}}`(anonymous namespace)::anon_bar() at {{.*}})) -c -# CHECK: frame void Foo::operator<<<1>(this={{.*}}, (null)=0) -c -# CHECK: frame Foo::returns_func_ptr(this={{.*}}, (null)={{.*}}) -q From de03c46b8bb0b280d2de7ec805df1343bb12eeda Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 18 Jan 2024 08:17:19 -0800 Subject: [PATCH 005/843] [libc] reverts for 32b arm (#78307) These were fixed properly by f1f1875c18b8. - Revert "[libc] temporarily set -Wno-shorten-64-to-32 (#77396)" - Revert "[libc] make off_t 32b for 32b arm (#77350)" --- libc/include/llvm-libc-types/off_t.h | 4 ---- libc/src/sys/mman/linux/CMakeLists.txt | 3 --- 2 files changed, 7 deletions(-) diff --git a/libc/include/llvm-libc-types/off_t.h b/libc/include/llvm-libc-types/off_t.h index a0cbe992189d0..111b29aa68d87 100644 --- a/libc/include/llvm-libc-types/off_t.h +++ b/libc/include/llvm-libc-types/off_t.h @@ -9,10 +9,6 @@ #ifndef __LLVM_LIBC_TYPES_OFF_T_H__ #define __LLVM_LIBC_TYPES_OFF_T_H__ -#if defined(__LP64__) || defined(__riscv) typedef __INT64_TYPE__ off_t; -#else -typedef __INT32_TYPE__ off_t; -#endif // __LP64__ || __riscv #endif // __LLVM_LIBC_TYPES_OFF_T_H__ diff --git a/libc/src/sys/mman/linux/CMakeLists.txt b/libc/src/sys/mman/linux/CMakeLists.txt index 08694ec48be39..163e7dead8887 100644 --- a/libc/src/sys/mman/linux/CMakeLists.txt +++ b/libc/src/sys/mman/linux/CMakeLists.txt @@ -22,9 +22,6 @@ add_entrypoint_object( libc.include.sys_syscall libc.src.__support.OSUtil.osutil libc.src.errno.errno - COMPILE_OPTIONS - # TODO: https://github.com/llvm/llvm-project/issues/77395 - -Wno-shorten-64-to-32 ) add_entrypoint_object( From 3044d754855c339319b041e579c569d4faf32b62 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 18 Jan 2024 08:18:13 -0800 Subject: [PATCH 006/843] [libc][arm] add more math.h entrypoints (#77839) In particular, we have internal customers that would like to use nanf and scalbnf. 
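For reference, a minimal sketch of the two requested functions' standard C99
behavior (illustrative only, not libc-internal API):

  #include <math.h>

  float q = nanf("");            // a quiet NaN of type float
  float s = scalbnf(0.75f, 4);   // 0.75f * 2^4 == 12.0f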
The differences between various entrypoint files can be checked via: $ comm -3 <(grep libc\.src path/to/entrypoints.txt | sort) \ <(grep libc\.src path/to/other/entrypoints.txt | sort) --- libc/config/baremetal/arm/entrypoints.txt | 106 +++++++++++ libc/config/linux/arm/entrypoints.txt | 106 +++++++++++ libc/docs/math/index.rst | 212 +++++++++++----------- 3 files changed, 318 insertions(+), 106 deletions(-) diff --git a/libc/config/baremetal/arm/entrypoints.txt b/libc/config/baremetal/arm/entrypoints.txt index 54f74c6a973ce..f725b1c2394c6 100644 --- a/libc/config/baremetal/arm/entrypoints.txt +++ b/libc/config/baremetal/arm/entrypoints.txt @@ -114,18 +114,124 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.feupdateenv # math.h entrypoints + libc.src.math.acosf + libc.src.math.acoshf + libc.src.math.asinf + libc.src.math.asinhf + libc.src.math.atanf + libc.src.math.atanhf + libc.src.math.ceil + libc.src.math.ceilf + libc.src.math.ceill + libc.src.math.copysign + libc.src.math.copysignf + libc.src.math.copysignl + libc.src.math.cosf + libc.src.math.coshf + libc.src.math.erff + libc.src.math.exp + libc.src.math.exp10 + libc.src.math.exp10f + libc.src.math.exp2 + libc.src.math.exp2f + libc.src.math.expf + libc.src.math.expm1 + libc.src.math.expm1f libc.src.math.fabs libc.src.math.fabsf libc.src.math.fabsl libc.src.math.fdim libc.src.math.fdimf libc.src.math.fdiml + libc.src.math.floor + libc.src.math.floorf + libc.src.math.floorl + libc.src.math.fma + libc.src.math.fmaf libc.src.math.fmax libc.src.math.fmaxf libc.src.math.fmaxl libc.src.math.fmin libc.src.math.fminf libc.src.math.fminl + libc.src.math.fmod + libc.src.math.fmodf + libc.src.math.frexp + libc.src.math.frexpf + libc.src.math.frexpl + libc.src.math.hypot + libc.src.math.hypotf + libc.src.math.ilogb + libc.src.math.ilogbf + libc.src.math.ilogbl + libc.src.math.ldexp + libc.src.math.ldexpf + libc.src.math.ldexpl + libc.src.math.llrint + libc.src.math.llrintf + libc.src.math.llrintl + libc.src.math.llround + libc.src.math.llroundf + libc.src.math.llroundl + libc.src.math.log + libc.src.math.log10 + libc.src.math.log10f + libc.src.math.log1p + libc.src.math.log1pf + libc.src.math.log2 + libc.src.math.log2f + libc.src.math.logb + libc.src.math.logbf + libc.src.math.logbl + libc.src.math.logf + libc.src.math.lrint + libc.src.math.lrintf + libc.src.math.lrintl + libc.src.math.lround + libc.src.math.lroundf + libc.src.math.lroundl + libc.src.math.modf + libc.src.math.modff + libc.src.math.modfl + libc.src.math.nan + libc.src.math.nanf + libc.src.math.nanl + libc.src.math.nearbyint + libc.src.math.nearbyintf + libc.src.math.nearbyintl + libc.src.math.nextafter + libc.src.math.nextafterf + libc.src.math.nextafterl + libc.src.math.nexttoward + libc.src.math.nexttowardf + libc.src.math.nexttowardl + libc.src.math.powf + libc.src.math.remainder + libc.src.math.remainderf + libc.src.math.remainderl + libc.src.math.remquo + libc.src.math.remquof + libc.src.math.remquol + libc.src.math.rint + libc.src.math.rintf + libc.src.math.rintl + libc.src.math.round + libc.src.math.roundf + libc.src.math.roundl + libc.src.math.scalbn + libc.src.math.scalbnf + libc.src.math.scalbnl + libc.src.math.sincosf + libc.src.math.sinf + libc.src.math.sinhf + libc.src.math.sqrt + libc.src.math.sqrtf + libc.src.math.sqrtl + libc.src.math.tanf + libc.src.math.tanhf + libc.src.math.trunc + libc.src.math.truncf + libc.src.math.truncl ) set(TARGET_LLVMLIBC_ENTRYPOINTS diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index 
274d5aa5a0057..c75ac2302d4ac 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -115,18 +115,124 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.feupdateenv # math.h entrypoints + libc.src.math.acosf + libc.src.math.acoshf + libc.src.math.asinf + libc.src.math.asinhf + libc.src.math.atanf + libc.src.math.atanhf + libc.src.math.ceil + libc.src.math.ceilf + libc.src.math.ceill + libc.src.math.copysign + libc.src.math.copysignf + libc.src.math.copysignl + libc.src.math.cosf + libc.src.math.coshf + libc.src.math.erff + libc.src.math.exp + libc.src.math.exp10 + libc.src.math.exp10f + libc.src.math.exp2 + libc.src.math.exp2f + libc.src.math.expf + libc.src.math.expm1 + libc.src.math.expm1f libc.src.math.fabs libc.src.math.fabsf libc.src.math.fabsl libc.src.math.fdim libc.src.math.fdimf libc.src.math.fdiml + libc.src.math.floor + libc.src.math.floorf + libc.src.math.floorl + libc.src.math.fma + libc.src.math.fmaf libc.src.math.fmax libc.src.math.fmaxf libc.src.math.fmaxl libc.src.math.fmin libc.src.math.fminf libc.src.math.fminl + libc.src.math.fmod + libc.src.math.fmodf + libc.src.math.frexp + libc.src.math.frexpf + libc.src.math.frexpl + libc.src.math.hypot + libc.src.math.hypotf + libc.src.math.ilogb + libc.src.math.ilogbf + libc.src.math.ilogbl + libc.src.math.ldexp + libc.src.math.ldexpf + libc.src.math.ldexpl + libc.src.math.llrint + libc.src.math.llrintf + libc.src.math.llrintl + libc.src.math.llround + libc.src.math.llroundf + libc.src.math.llroundl + libc.src.math.log + libc.src.math.log10 + libc.src.math.log10f + libc.src.math.log1p + libc.src.math.log1pf + libc.src.math.log2 + libc.src.math.log2f + libc.src.math.logb + libc.src.math.logbf + libc.src.math.logbl + libc.src.math.logf + libc.src.math.lrint + libc.src.math.lrintf + libc.src.math.lrintl + libc.src.math.lround + libc.src.math.lroundf + libc.src.math.lroundl + libc.src.math.modf + libc.src.math.modff + libc.src.math.modfl + libc.src.math.nan + libc.src.math.nanf + libc.src.math.nanl + libc.src.math.nearbyint + libc.src.math.nearbyintf + libc.src.math.nearbyintl + libc.src.math.nextafter + libc.src.math.nextafterf + libc.src.math.nextafterl + libc.src.math.nexttoward + libc.src.math.nexttowardf + libc.src.math.nexttowardl + libc.src.math.powf + libc.src.math.remainder + libc.src.math.remainderf + libc.src.math.remainderl + libc.src.math.remquo + libc.src.math.remquof + libc.src.math.remquol + libc.src.math.rint + libc.src.math.rintf + libc.src.math.rintl + libc.src.math.round + libc.src.math.roundf + libc.src.math.roundl + libc.src.math.scalbn + libc.src.math.scalbnf + libc.src.math.scalbnl + libc.src.math.sincosf + libc.src.math.sinf + libc.src.math.sinhf + libc.src.math.sqrt + libc.src.math.sqrtf + libc.src.math.sqrtl + libc.src.math.tanf + libc.src.math.tanhf + libc.src.math.trunc + libc.src.math.truncf + libc.src.math.truncl ) set(TARGET_LLVMLIBC_ENTRYPOINTS diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 724ad19dbe442..13cd1dafe01f5 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -108,17 +108,17 @@ Basic Operations | +---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | | x86_64 | aarch64 | aarch32 | riscv64 | x86_64 | aarch64 | x86_64 | aarch64 | aarch32 | riscv32 | AMD | nVidia | +==============+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+ -| ceil | |check| | |check| | | |check| | |check| 
| | | |check| | | | | | +| ceil | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ceilf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| ceilf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ceill | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| ceill | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| copysign | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| copysign | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| copysignf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| copysignf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| copysignl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| copysignl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | copysignf128 | |check| | |check| | | | | | | | | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -136,11 +136,11 @@ Basic Operations +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | fdiml | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| floor | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| floor | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| floorf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| floorf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| floorl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| floorl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | fmax | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -154,125 +154,125 @@ Basic 
Operations +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | fminl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fmod | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| fmod | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fmodf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| fmodf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | fmodl | | | | | | | | | | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| frexp | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| frexp | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| frexpf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| frexpf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| frexpl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| frexpl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ilogb | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| ilogb | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ilogbf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| ilogbf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ilogbl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| ilogbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ldexp | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| ldexp | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ldexpf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| ldexpf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | 
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ldexpl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| ldexpl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llrint | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| llrint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llrintf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| llrintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llrintl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| llrintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llround | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| llround | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llroundf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| llroundf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llroundl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| llroundl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| logb | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| logb | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| logbf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| logbf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| logbl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| logbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lrint | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| lrint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lrintf | |check| | |check| | | |check| | |check| | | | 
|check| | | | | | +| lrintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lrintl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| lrintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lround | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| lround | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lroundf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| lroundf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lroundl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| lroundl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| modf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| modf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| modff | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| modff | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| modfl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| modfl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nan | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nan | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nanf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nanf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nanl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nanl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nearbyint | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nearbyint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | 
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nearbyintf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nearbyintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nearbyintl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nearbyintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nextafter | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nextafter | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nextafterf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nextafterf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nextafterl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nextafterl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nexttoward | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nexttoward | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nexttowardf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nexttowardf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nexttowardl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| nexttowardl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remainder | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| remainder | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remainderf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| remainderf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remainderl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| remainderl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | 
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remquo | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| remquo | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remquof | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| remquof | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remquol | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| remquol | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| rint | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| rint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| rintf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| rintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| rintl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| rintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| round | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| round | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| roundf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| roundf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| roundl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| roundl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| scalbn | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| scalbn | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| scalbnf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| scalbnf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| scalbnl | |check| | |check| | | |check| | |check| | | | |check| 
| | | | | +| scalbnl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| trunc | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| trunc | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| truncf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| truncf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| truncl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| truncl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -286,31 +286,31 @@ Higher Math Functions +============+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+ | acos | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| acosf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| acosf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | acosl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | acosh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| acoshf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| acoshf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | acoshl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | asin | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| asinf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| asinf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | asinl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | asinh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| asinhf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| asinhf | |check| | |check| | |check| | |check| | |check| 
| | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | asinhl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | atan | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| atanf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| atanf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | atanl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -322,7 +322,7 @@ Higher Math Functions +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | atanh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| atanhf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| atanhf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | atanhl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -334,19 +334,19 @@ Higher Math Functions +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | cos | |check| | | | | |check| | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| cosf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| cosf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | cosl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | cosh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| coshf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| coshf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | coshl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | erf | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| erff | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| erff | |check| | |check| | |check| | |check| | 
|check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | erfl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -356,39 +356,39 @@ Higher Math Functions +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | erfcl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| exp | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| exp | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| expf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| expf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | expl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| exp10 | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| exp10 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| exp10f | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| exp10f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | exp10l | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| exp2 | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| exp2 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| exp2f | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| exp2f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | exp2l | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| expm1 | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| expm1 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| expm1f | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| expm1f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | 
+------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | expm1l | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fma | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| fma | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fmaf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| fmaf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | fmal | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| hypot | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| hypot | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| hypotf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| hypotf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | hypotl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -398,69 +398,69 @@ Higher Math Functions +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | lgammal | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| log | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| logf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| logf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | logl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log10 | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| log10 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log10f | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| log10f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | 
+------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | log10l | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log1p | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| log1p | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log1pf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| log1pf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | log1pl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log2 | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| log2 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log2f | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| log2f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | log2l | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | pow | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| powf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| powf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | powl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sin | |check| | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sinf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| sinf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sinl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sincos | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sincosf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| sincosf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | 
+------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sincosl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sinh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sinhf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| sinhf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sinhl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sqrt | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| sqrt | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sqrtf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| sqrtf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sqrtl | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| sqrtl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | tan | |check| | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| tanf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| tanf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | tanl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | tanh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| tanhf | |check| | |check| | | |check| | |check| | | | |check| | | | | | +| tanhf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | tanhl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ From 8bc7c0a058ff0e6495b8e7e4dd850e646228506b Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Fri, 19 Jan 2024 00:05:44 +0800 Subject: [PATCH 007/843] [X86] Fix failures on EXPENSIVE_CHECKS builds Error message ``` *** Bad machine code: Illegal virtual register for instruction *** - function: test__blsi_u32 - basic block: %bb.0 (0x7a61208) - instruction: %5:gr32 = MOV32r0 implicit-def $eflags - operand 0: %5:gr32 Expected 
a GR32_NOREX2 register, but got a GR32 register
```

Reported by RKSimon in #77433

The failure occurs because the compiler emits a MOV32r0 with a GR32
operand when fast-isel is enabled.

```
// X86FastISel.cpp
Register SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass)
```

However, before this patch, the compiler only allowed a GR32_NOREX2
operand because MOV32r0 is a pseudo instruction. In this patch, we relax
the register class of the operand to GR32 because MOV32r0 is always
expanded to XOR32rr, which can use EGPR.

The bug was not introduced by #77433 but caught by it.
---
 llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index a37e7af208904..4442b80861b61 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -1260,6 +1260,10 @@ inline bool canUseApxExtendedReg(const MCInstrDesc &Desc) {
   if (Encoding == X86II::EVEX)
     return true;

+  unsigned Opcode = Desc.Opcode;
+  // MOV32r0 is always expanded to XOR32rr
+  if (Opcode == X86::MOV32r0)
+    return true;
   // To be conservative, egpr is not used for all pseudo instructions
   // because we are not sure what instruction it will become.
   // FIXME: Could we improve it in X86ExpandPseudo?
@@ -1268,7 +1272,6 @@ inline bool canUseApxExtendedReg(const MCInstrDesc &Desc) {
   // MAP OB/TB in legacy encoding space can always use egpr except
   // XSAVE*/XRSTOR*.
-  unsigned Opcode = Desc.Opcode;
   switch (Opcode) {
   default:
     break;

From d124b0224241f58847f06eb588685dc88bb627a0 Mon Sep 17 00:00:00 2001
From: XinWang10 <108658776+XinWang10@users.noreply.github.com>
Date: Thu, 18 Jan 2024 08:27:16 -0800
Subject: [PATCH 008/843] [X86][MC] Fix wrong encoding of promoted BMI
 instructions due to missing NoCD8 (#78386)

Address review comments in #76709

Add `NoCD8` to class `ITy`, and rewrite the promoted instructions with
`ITy` to avoid incorrect encodings caused by a missing `NoCD8`.
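For intuition, the test updates below shrink the memory-operand encodings
from a four-byte displacement (e.g. `0x7b,0x00,0x00,0x00`) to a single byte
(`0x7b`). A rough sketch of the size rule follows; this is an illustration,
not LLVM's actual encoder logic. Under EVEX compressed disp8 (CD8) the
8-bit displacement field is implicitly scaled, so a displacement like 123
only fits in one byte when no scaling is applied, which is what `NoCD8`
declares for these instructions.

```
#include <cstdint>
#include <cstdio>

// Hypothetical helper, not an LLVM API: decide whether a displacement can
// be encoded as disp8. With CD8, the disp8 field stores disp / scale, so
// disp must be an exact multiple of the scale; with NoCD8 (scale == 1) the
// field holds the displacement directly.
static bool fitsDisp8(int32_t disp, int32_t scale) {
  if (disp % scale != 0)
    return false;
  int32_t scaled = disp / scale;
  return scaled >= -128 && scaled <= 127;
}

int main() {
  // Displacement 123, as used throughout the bmi2 tests.
  printf("NoCD8 : %s\n", fitsDisp8(123, 1) ? "disp8" : "disp32");
  // -> disp8, the one-byte encoding expected by the updated tests.
  printf("CD8x4 : %s\n", fitsDisp8(123, 4) ? "disp8" : "disp32");
  // -> disp32, the four-byte encoding the old (wrong) output used.
  return 0;
}
```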
---
 llvm/lib/Target/X86/X86InstrMisc.td        | 45 +++++++++-------------
 llvm/lib/Target/X86/X86InstrSSE.td         |  4 +-
 llvm/lib/Target/X86/X86InstrUtils.td       |  6 +--
 llvm/test/MC/Disassembler/X86/apx/bmi2.txt | 20 +++++-----
 llvm/test/MC/X86/apx/bmi2-att.s            | 20 +++++-----
 llvm/test/MC/X86/apx/bmi2-intel.s          | 20 +++++-----
 6 files changed, 53 insertions(+), 62 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrMisc.td b/llvm/lib/Target/X86/X86InstrMisc.td
index 80cddc570b842..229aa3c6f1f20 100644
--- a/llvm/lib/Target/X86/X86InstrMisc.td
+++ b/llvm/lib/Target/X86/X86InstrMisc.td
@@ -1375,39 +1375,30 @@ let Predicates = [HasBMI2, NoTBM, HasEGPR] in {
                      (MOV8ri (CountTrailingOnes imm:$mask)), sub_8bit))>;
 }

-multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
-                         X86MemOperand x86memop, SDPatternOperator OpNode,
-                         PatFrag ld_frag, string Suffix = ""> {
-  def rr#Suffix : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
-                    !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>,
-                    NoCD8, VVVV, Sched<[WriteALU]>;
-  def rm#Suffix : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
-                    !strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
-                    [(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>,
-                    NoCD8, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
+multiclass PdepPext<string m, X86TypeInfo t, SDPatternOperator node,
+                    string suffix = ""> {
+  def rr#suffix : ITy<0xF5, MRMSrcReg, t, (outs t.RegClass:$dst),
+                      (ins t.RegClass:$src1, t.RegClass:$src2), m, binop_ndd_args,
+                      [(set t.RegClass:$dst, (node t.RegClass:$src1, t.RegClass:$src2))]>,
+                      T8, VVVV, Sched<[WriteALU]>;
+  def rm#suffix : ITy<0xF5, MRMSrcMem, t, (outs t.RegClass:$dst),
+                      (ins t.RegClass:$src1, t.MemOperand:$src2), m, binop_ndd_args,
+                      [(set t.RegClass:$dst, (node t.RegClass:$src1, (t.LoadNode addr:$src2)))]>,
+                      T8, VVVV, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
 }

 let Predicates = [HasBMI2, NoEGPR] in {
-  defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
-                              X86pdep, loadi32>, T8, XD, VEX;
-  defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
-                              X86pdep, loadi64>, T8, XD, REX_W, VEX;
-  defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
-                              X86pext, loadi32>, T8, XS, VEX;
-  defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
-                              X86pext, loadi64>, T8, XS, REX_W, VEX;
+  defm PDEP32 : PdepPext<"pdep", Xi32, X86pdep>, XD, VEX;
+  defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep>, XD, REX_W, VEX;
+  defm PEXT32 : PdepPext<"pext", Xi32, X86pext>, XS, VEX;
+  defm PEXT64 : PdepPext<"pext", Xi64, X86pext>, XS, REX_W, VEX;
 }

 let Predicates = [HasBMI2, HasEGPR] in {
-  defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
-                              X86pdep, loadi32, "_EVEX">, T8, XD, EVEX;
-  defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
-                              X86pdep, loadi64, "_EVEX">, T8, XD, REX_W, EVEX;
-  defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
-                              X86pext, loadi32, "_EVEX">, T8, XS, EVEX;
-  defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
-                              X86pext, loadi64, "_EVEX">, T8, XS, REX_W, EVEX;
+  defm PDEP32 : PdepPext<"pdep", Xi32, X86pdep, "_EVEX">, XD, EVEX;
+  defm PDEP64 : PdepPext<"pdep", Xi64, X86pdep, "_EVEX">, XD, REX_W, EVEX;
+  defm PEXT32 : PdepPext<"pext", Xi32, X86pext, "_EVEX">, XS, EVEX;
+  defm PEXT64 : PdepPext<"pext", Xi64, X86pext, "_EVEX">, XS, REX_W, EVEX;
 }

 //===----------------------------------------------------------------------===//

diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index a8cd1996eeb35..7d94fec9a354d 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -6663,14 +6663,14 @@ let Defs = [ECX, EFLAGS], Uses = [EAX, EDX], hasSideEffects = 0 in {
 class
Crc32r : ITy<0xF1, MRMSrcReg, t, (outs rc:$dst), (ins rc:$src1, t.RegClass:$src2), "crc32", binop_args, [(set rc:$dst, (node rc:$src1, t.RegClass:$src2))]>, - Sched<[WriteCRC32]>, NoCD8 { + Sched<[WriteCRC32]> { let Constraints = "$src1 = $dst"; } class Crc32m : ITy<0xF1, MRMSrcMem, t, (outs rc:$dst), (ins rc:$src1, t.MemOperand:$src2), "crc32", binop_args, [(set rc:$dst, (node rc:$src1, (load addr:$src2)))]>, - Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]>, NoCD8 { + Sched<[WriteCRC32.Folded, WriteCRC32.ReadAfterFold]> { let Constraints = "$src1 = $dst"; } diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td index e79240ae443ad..27aeff1cd3ae2 100644 --- a/llvm/lib/Target/X86/X86InstrUtils.td +++ b/llvm/lib/Target/X86/X86InstrUtils.td @@ -113,9 +113,9 @@ class NDD { Map OpMap = !if(!eq(ndd, 0), map, T_MAP4); } // NF - Helper for NF (no flags update) instructions -class NF: T_MAP4, EVEX, EVEX_NF, NoCD8; +class NF: T_MAP4, EVEX, EVEX_NF; // PL - Helper for promoted legacy instructions -class PL: T_MAP4, EVEX, NoCD8, ExplicitEVEXPrefix; +class PL: T_MAP4, EVEX, ExplicitEVEXPrefix; //===----------------------------------------------------------------------===// // X86 Type infomation definitions @@ -961,7 +961,7 @@ class ITy o, Format f, X86TypeInfo t, dag outs, dag ins, string m, string args, list p> : I<{o{7}, o{6}, o{5}, o{4}, o{3}, o{2}, o{1}, !if(!eq(t.HasEvenOpcode, 1), 0, o{0})}, f, outs, ins, - !strconcat(m, "{", t.InstrSuffix, "}\t", args), p> { + !strconcat(m, "{", t.InstrSuffix, "}\t", args), p>, NoCD8 { let hasSideEffects = 0; let hasREX_W = t.HasREX_W; } diff --git a/llvm/test/MC/Disassembler/X86/apx/bmi2.txt b/llvm/test/MC/Disassembler/X86/apx/bmi2.txt index 0fb11f4061f1b..428cf2d148e2d 100644 --- a/llvm/test/MC/Disassembler/X86/apx/bmi2.txt +++ b/llvm/test/MC/Disassembler/X86/apx/bmi2.txt @@ -13,11 +13,11 @@ # ATT: mulxl 123(%rax,%rbx,4), %ecx, %edx # INTEL: mulx edx, ecx, dword ptr [rax + 4*rbx + 123] -0x62,0xf2,0x77,0x08,0xf6,0x94,0x98,0x7b,0x00,0x00,0x00 +0x62,0xf2,0x77,0x08,0xf6,0x54,0x98,0x7b # ATT: mulxq 123(%rax,%rbx,4), %r9, %r15 # INTEL: mulx r15, r9, qword ptr [rax + 4*rbx + 123] -0x62,0x72,0xb7,0x08,0xf6,0xbc,0x98,0x7b,0x00,0x00,0x00 +0x62,0x72,0xb7,0x08,0xf6,0x7c,0x98,0x7b # ATT: mulxl %r18d, %r22d, %r26d # INTEL: mulx r26d, r22d, r18d @@ -115,11 +115,11 @@ # ATT: rorxl $123, 123(%rax,%rbx,4), %ecx # INTEL: rorx ecx, dword ptr [rax + 4*rbx + 123], 123 -0x62,0xf3,0x7f,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b +0x62,0xf3,0x7f,0x08,0xf0,0x4c,0x98,0x7b,0x7b # ATT: rorxq $123, 123(%rax,%rbx,4), %r9 # INTEL: rorx r9, qword ptr [rax + 4*rbx + 123], 123 -0x62,0x73,0xff,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b +0x62,0x73,0xff,0x08,0xf0,0x4c,0x98,0x7b,0x7b # ATT: rorxl $123, %r18d, %r22d # INTEL: rorx r22d, r18d, 123 @@ -145,7 +145,7 @@ # ATT: sarxl %ecx, 123(%rax,%rbx,4), %edx # INTEL: sarx edx, dword ptr [rax + 4*rbx + 123], ecx -0x62,0xf2,0x76,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00 +0x62,0xf2,0x76,0x08,0xf7,0x54,0x98,0x7b # ATT: sarxq %r9, %r15, %r11 # INTEL: sarx r11, r15, r9 @@ -153,7 +153,7 @@ # ATT: sarxq %r9, 123(%rax,%rbx,4), %r15 # INTEL: sarx r15, qword ptr [rax + 4*rbx + 123], r9 -0x62,0x72,0xb6,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00 +0x62,0x72,0xb6,0x08,0xf7,0x7c,0x98,0x7b # ATT: sarxl %r18d, %r22d, %r26d # INTEL: sarx r26d, r22d, r18d @@ -179,7 +179,7 @@ # ATT: shlxl %ecx, 123(%rax,%rbx,4), %edx # INTEL: shlx edx, dword ptr [rax + 4*rbx + 123], ecx -0x62,0xf2,0x75,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00 
+0x62,0xf2,0x75,0x08,0xf7,0x54,0x98,0x7b # ATT: shlxq %r9, %r15, %r11 # INTEL: shlx r11, r15, r9 @@ -187,7 +187,7 @@ # ATT: shlxq %r9, 123(%rax,%rbx,4), %r15 # INTEL: shlx r15, qword ptr [rax + 4*rbx + 123], r9 -0x62,0x72,0xb5,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00 +0x62,0x72,0xb5,0x08,0xf7,0x7c,0x98,0x7b # ATT: shlxl %r18d, %r22d, %r26d # INTEL: shlx r26d, r22d, r18d @@ -213,7 +213,7 @@ # ATT: shrxl %ecx, 123(%rax,%rbx,4), %edx # INTEL: shrx edx, dword ptr [rax + 4*rbx + 123], ecx -0x62,0xf2,0x77,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00 +0x62,0xf2,0x77,0x08,0xf7,0x54,0x98,0x7b # ATT: shrxq %r9, %r15, %r11 # INTEL: shrx r11, r15, r9 @@ -221,7 +221,7 @@ # ATT: shrxq %r9, 123(%rax,%rbx,4), %r15 # INTEL: shrx r15, qword ptr [rax + 4*rbx + 123], r9 -0x62,0x72,0xb7,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00 +0x62,0x72,0xb7,0x08,0xf7,0x7c,0x98,0x7b # ATT: shrxl %r18d, %r22d, %r26d # INTEL: shrx r26d, r22d, r18d diff --git a/llvm/test/MC/X86/apx/bmi2-att.s b/llvm/test/MC/X86/apx/bmi2-att.s index 14e8566e799d5..20544d7922cfe 100644 --- a/llvm/test/MC/X86/apx/bmi2-att.s +++ b/llvm/test/MC/X86/apx/bmi2-att.s @@ -15,11 +15,11 @@ {evex} mulxq %r9, %r15, %r11 # CHECK: {evex} mulxl 123(%rax,%rbx,4), %ecx, %edx -# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf6,0x94,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf6,0x54,0x98,0x7b] {evex} mulxl 123(%rax,%rbx,4), %ecx, %edx # CHECK: {evex} mulxq 123(%rax,%rbx,4), %r9, %r15 -# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf6,0xbc,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf6,0x7c,0x98,0x7b] {evex} mulxq 123(%rax,%rbx,4), %r9, %r15 # CHECK: mulxl %r18d, %r22d, %r26d @@ -117,11 +117,11 @@ {evex} rorxq $123, %r9, %r15 # CHECK: {evex} rorxl $123, 123(%rax,%rbx,4), %ecx -# CHECK: encoding: [0x62,0xf3,0x7f,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b] +# CHECK: encoding: [0x62,0xf3,0x7f,0x08,0xf0,0x4c,0x98,0x7b,0x7b] {evex} rorxl $123, 123(%rax,%rbx,4), %ecx # CHECK: {evex} rorxq $123, 123(%rax,%rbx,4), %r9 -# CHECK: encoding: [0x62,0x73,0xff,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b] +# CHECK: encoding: [0x62,0x73,0xff,0x08,0xf0,0x4c,0x98,0x7b,0x7b] {evex} rorxq $123, 123(%rax,%rbx,4), %r9 # CHECK: rorxl $123, %r18d, %r22d @@ -147,7 +147,7 @@ {evex} sarxl %ecx, %edx, %r10d # CHECK: {evex} sarxl %ecx, 123(%rax,%rbx,4), %edx -# CHECK: encoding: [0x62,0xf2,0x76,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0xf2,0x76,0x08,0xf7,0x54,0x98,0x7b] {evex} sarxl %ecx, 123(%rax,%rbx,4), %edx # CHECK: {evex} sarxq %r9, %r15, %r11 @@ -155,7 +155,7 @@ {evex} sarxq %r9, %r15, %r11 # CHECK: {evex} sarxq %r9, 123(%rax,%rbx,4), %r15 -# CHECK: encoding: [0x62,0x72,0xb6,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0x72,0xb6,0x08,0xf7,0x7c,0x98,0x7b] {evex} sarxq %r9, 123(%rax,%rbx,4), %r15 # CHECK: sarxl %r18d, %r22d, %r26d @@ -181,7 +181,7 @@ {evex} shlxl %ecx, %edx, %r10d # CHECK: {evex} shlxl %ecx, 123(%rax,%rbx,4), %edx -# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xf7,0x54,0x98,0x7b] {evex} shlxl %ecx, 123(%rax,%rbx,4), %edx # CHECK: {evex} shlxq %r9, %r15, %r11 @@ -189,7 +189,7 @@ {evex} shlxq %r9, %r15, %r11 # CHECK: {evex} shlxq %r9, 123(%rax,%rbx,4), %r15 -# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xf7,0x7c,0x98,0x7b] {evex} shlxq %r9, 123(%rax,%rbx,4), %r15 # CHECK: shlxl %r18d, %r22d, %r26d @@ -215,7 +215,7 @@ {evex} shrxl %ecx, %edx, %r10d # 
CHECK: {evex} shrxl %ecx, 123(%rax,%rbx,4), %edx -# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf7,0x54,0x98,0x7b] {evex} shrxl %ecx, 123(%rax,%rbx,4), %edx # CHECK: {evex} shrxq %r9, %r15, %r11 @@ -223,7 +223,7 @@ {evex} shrxq %r9, %r15, %r11 # CHECK: {evex} shrxq %r9, 123(%rax,%rbx,4), %r15 -# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf7,0x7c,0x98,0x7b] {evex} shrxq %r9, 123(%rax,%rbx,4), %r15 # CHECK: shrxl %r18d, %r22d, %r26d diff --git a/llvm/test/MC/X86/apx/bmi2-intel.s b/llvm/test/MC/X86/apx/bmi2-intel.s index f21004fdd696a..fe96fbc0be8d7 100644 --- a/llvm/test/MC/X86/apx/bmi2-intel.s +++ b/llvm/test/MC/X86/apx/bmi2-intel.s @@ -11,11 +11,11 @@ {evex} mulx r11, r15, r9 # CHECK: {evex} mulx edx, ecx, dword ptr [rax + 4*rbx + 123] -# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf6,0x94,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf6,0x54,0x98,0x7b] {evex} mulx edx, ecx, dword ptr [rax + 4*rbx + 123] # CHECK: {evex} mulx r15, r9, qword ptr [rax + 4*rbx + 123] -# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf6,0xbc,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf6,0x7c,0x98,0x7b] {evex} mulx r15, r9, qword ptr [rax + 4*rbx + 123] # CHECK: mulx r26d, r22d, r18d @@ -113,11 +113,11 @@ {evex} rorx r15, r9, 123 # CHECK: {evex} rorx ecx, dword ptr [rax + 4*rbx + 123], 123 -# CHECK: encoding: [0x62,0xf3,0x7f,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b] +# CHECK: encoding: [0x62,0xf3,0x7f,0x08,0xf0,0x4c,0x98,0x7b,0x7b] {evex} rorx ecx, dword ptr [rax + 4*rbx + 123], 123 # CHECK: {evex} rorx r9, qword ptr [rax + 4*rbx + 123], 123 -# CHECK: encoding: [0x62,0x73,0xff,0x08,0xf0,0x8c,0x98,0x7b,0x00,0x00,0x00,0x7b] +# CHECK: encoding: [0x62,0x73,0xff,0x08,0xf0,0x4c,0x98,0x7b,0x7b] {evex} rorx r9, qword ptr [rax + 4*rbx + 123], 123 # CHECK: rorx r22d, r18d, 123 @@ -143,7 +143,7 @@ {evex} sarx r10d, edx, ecx # CHECK: {evex} sarx edx, dword ptr [rax + 4*rbx + 123], ecx -# CHECK: encoding: [0x62,0xf2,0x76,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0xf2,0x76,0x08,0xf7,0x54,0x98,0x7b] {evex} sarx edx, dword ptr [rax + 4*rbx + 123], ecx # CHECK: {evex} sarx r11, r15, r9 @@ -151,7 +151,7 @@ {evex} sarx r11, r15, r9 # CHECK: {evex} sarx r15, qword ptr [rax + 4*rbx + 123], r9 -# CHECK: encoding: [0x62,0x72,0xb6,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0x72,0xb6,0x08,0xf7,0x7c,0x98,0x7b] {evex} sarx r15, qword ptr [rax + 4*rbx + 123], r9 # CHECK: sarx r26d, r22d, r18d @@ -177,7 +177,7 @@ {evex} shlx r10d, edx, ecx # CHECK: {evex} shlx edx, dword ptr [rax + 4*rbx + 123], ecx -# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0xf2,0x75,0x08,0xf7,0x54,0x98,0x7b] {evex} shlx edx, dword ptr [rax + 4*rbx + 123], ecx # CHECK: {evex} shlx r11, r15, r9 @@ -185,7 +185,7 @@ {evex} shlx r11, r15, r9 # CHECK: {evex} shlx r15, qword ptr [rax + 4*rbx + 123], r9 -# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0x72,0xb5,0x08,0xf7,0x7c,0x98,0x7b] {evex} shlx r15, qword ptr [rax + 4*rbx + 123], r9 # CHECK: shlx r26d, r22d, r18d @@ -211,7 +211,7 @@ {evex} shrx r10d, edx, ecx # CHECK: {evex} shrx edx, dword ptr [rax + 4*rbx + 123], ecx -# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf7,0x94,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0xf2,0x77,0x08,0xf7,0x54,0x98,0x7b] {evex} shrx edx, dword 
ptr [rax + 4*rbx + 123], ecx # CHECK: {evex} shrx r11, r15, r9 @@ -219,7 +219,7 @@ {evex} shrx r11, r15, r9 # CHECK: {evex} shrx r15, qword ptr [rax + 4*rbx + 123], r9 -# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf7,0xbc,0x98,0x7b,0x00,0x00,0x00] +# CHECK: encoding: [0x62,0x72,0xb7,0x08,0xf7,0x7c,0x98,0x7b] {evex} shrx r15, qword ptr [rax + 4*rbx + 123], r9 # CHECK: shrx r26d, r22d, r18d From 914cfa41385606fe81c3afd296a6ca3ab975a97d Mon Sep 17 00:00:00 2001 From: Fehr Mathieu Date: Thu, 18 Jan 2024 16:31:40 +0000 Subject: [PATCH 009/843] [mlir][irdl] Add `irdl.base` op (#76400) The `irdl.base` op represent an attribute constraint that will check that the base of a type or attribute is the expected one (e.g. `IntegerType`) . Example: ```mlir irdl.dialect @cmath { irdl.type @complex { %0 = irdl.base "!builtin.integer" irdl.parameters(%0) } irdl.type @complex_wrapper { %0 = irdl.base @complex irdl.parameters(%0) } } ``` The above program defines a `cmath.complex` type that expects a single parameter, which is a type with base name `builtin.integer`, which is the name of an `IntegerType` type. It also defines a `cmath.complex_wrapper` type that expects a single parameter, which is a type of base type `cmath.complex`. --- mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td | 51 +++++++++++++++ .../include/mlir/Dialect/IRDL/IRDLVerifiers.h | 42 +++++++++++++ mlir/lib/Dialect/IRDL/IR/IRDL.cpp | 33 ++++++++++ mlir/lib/Dialect/IRDL/IR/IRDLOps.cpp | 54 ++++++++++++++++ mlir/lib/Dialect/IRDL/IRDLVerifiers.cpp | 33 ++++++++++ mlir/test/Dialect/IRDL/invalid.irdl.mlir | 43 +++++++++++++ mlir/test/Dialect/IRDL/testd.irdl.mlir | 48 +++++++++++--- mlir/test/Dialect/IRDL/testd.mlir | 63 ++++++++++++++++--- 8 files changed, 350 insertions(+), 17 deletions(-) create mode 100644 mlir/test/Dialect/IRDL/invalid.irdl.mlir diff --git a/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td b/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td index 681425f817442..aa6a8e93c0288 100644 --- a/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td +++ b/mlir/include/mlir/Dialect/IRDL/IR/IRDLOps.td @@ -451,6 +451,57 @@ def IRDL_IsOp : IRDL_ConstraintOp<"is", let assemblyFormat = " $expected ` ` attr-dict "; } +def IRDL_BaseOp : IRDL_ConstraintOp<"base", + [ParentOneOf<["TypeOp", "AttributeOp", "OperationOp"]>, + DeclareOpInterfaceMethods]> { + let summary = "Constraints an attribute/type base"; + let description = [{ + `irdl.base` defines a constraint that only accepts a single type + or attribute base, e.g. an `IntegerType`. The attribute base is defined + either by a symbolic reference to the corresponding IRDL definition, + or by the name of the base. Named bases are prefixed with `!` or `#` + respectively for types and attributes. + + Example: + + ```mlir + irdl.dialect @cmath { + irdl.type @complex { + %0 = irdl.base "!builtin.integer" + irdl.parameters(%0) + } + + irdl.type @complex_wrapper { + %0 = irdl.base @complex + irdl.parameters(%0) + } + } + ``` + + The above program defines a `cmath.complex` type that expects a single + parameter, which is a type with base name `builtin.integer`, which is the + name of an `IntegerType` type. + It also defines a `cmath.complex_wrapper` type that expects a single + parameter, which is a type of base type `cmath.complex`. + }]; + + let arguments = (ins OptionalAttr:$base_ref, + OptionalAttr:$base_name); + let results = (outs IRDL_AttributeType:$output); + let assemblyFormat = " ($base_ref^)? ($base_name^)? 
` ` attr-dict"; + + let builders = [ + OpBuilder<(ins "SymbolRefAttr":$base_ref), [{ + build($_builder, $_state, base_ref, {}); + }]>, + OpBuilder<(ins "StringAttr":$base_name), [{ + build($_builder, $_state, {}, base_name); + }]>, + ]; + + let hasVerifier = 1; +} + def IRDL_ParametricOp : IRDL_ConstraintOp<"parametric", [ParentOneOf<["TypeOp", "AttributeOp", "OperationOp"]>, Pure]> { let summary = "Constraints an attribute/type base and its parameters"; diff --git a/mlir/include/mlir/Dialect/IRDL/IRDLVerifiers.h b/mlir/include/mlir/Dialect/IRDL/IRDLVerifiers.h index f8ce77cbc50e9..9ecb7c0107d7f 100644 --- a/mlir/include/mlir/Dialect/IRDL/IRDLVerifiers.h +++ b/mlir/include/mlir/Dialect/IRDL/IRDLVerifiers.h @@ -99,6 +99,48 @@ class IsConstraint : public Constraint { Attribute expectedAttribute; }; +/// A constraint that checks that an attribute is of a given attribute base +/// (e.g. IntegerAttr). +class BaseAttrConstraint : public Constraint { +public: + BaseAttrConstraint(TypeID baseTypeID, StringRef baseName) + : baseTypeID(baseTypeID), baseName(baseName) {} + + virtual ~BaseAttrConstraint() = default; + + LogicalResult verify(function_ref emitError, + Attribute attr, + ConstraintVerifier &context) const override; + +private: + /// The expected base attribute typeID. + TypeID baseTypeID; + + /// The base attribute name, only used for error reporting. + StringRef baseName; +}; + +/// A constraint that checks that a type is of a given type base (e.g. +/// IntegerType). +class BaseTypeConstraint : public Constraint { +public: + BaseTypeConstraint(TypeID baseTypeID, StringRef baseName) + : baseTypeID(baseTypeID), baseName(baseName) {} + + virtual ~BaseTypeConstraint() = default; + + LogicalResult verify(function_ref emitError, + Attribute attr, + ConstraintVerifier &context) const override; + +private: + /// The expected base type typeID. + TypeID baseTypeID; + + /// The base type name, only used for error reporting. + StringRef baseName; +}; + /// A constraint that checks that an attribute is of a /// specific dynamic attribute definition, and that all of its parameters /// satisfy the given constraints. diff --git a/mlir/lib/Dialect/IRDL/IR/IRDL.cpp b/mlir/lib/Dialect/IRDL/IR/IRDL.cpp index 33c6bb869a643..4eae2b03024c2 100644 --- a/mlir/lib/Dialect/IRDL/IR/IRDL.cpp +++ b/mlir/lib/Dialect/IRDL/IR/IRDL.cpp @@ -117,6 +117,39 @@ LogicalResult AttributesOp::verify() { return success(); } +LogicalResult BaseOp::verify() { + std::optional baseName = getBaseName(); + std::optional baseRef = getBaseRef(); + if (baseName.has_value() == baseRef.has_value()) + return emitOpError() << "the base type or attribute should be specified by " + "either a name or a reference"; + + if (baseName && + (baseName->empty() || ((*baseName)[0] != '!' && (*baseName)[0] != '#'))) + return emitOpError() << "the base type or attribute name should start with " + "'!' or '#'"; + + return success(); +} + +LogicalResult BaseOp::verifySymbolUses(SymbolTableCollection &symbolTable) { + std::optional baseRef = getBaseRef(); + if (!baseRef) + return success(); + + TypeOp typeOp = symbolTable.lookupNearestSymbolFrom(*this, *baseRef); + if (typeOp) + return success(); + + AttributeOp attrOp = + symbolTable.lookupNearestSymbolFrom(*this, *baseRef); + if (attrOp) + return success(); + + return emitOpError() << "'" << *baseRef + << "' does not refer to a type or attribute definition"; +} + /// Parse a value with its variadicity first. By default, the variadicity is /// single. 
/// diff --git a/mlir/lib/Dialect/IRDL/IR/IRDLOps.cpp b/mlir/lib/Dialect/IRDL/IR/IRDLOps.cpp index e172039712f24..0895306b8bce1 100644 --- a/mlir/lib/Dialect/IRDL/IR/IRDLOps.cpp +++ b/mlir/lib/Dialect/IRDL/IR/IRDLOps.cpp @@ -37,6 +37,60 @@ std::unique_ptr IsOp::getVerifier( return std::make_unique(getExpectedAttr()); } +std::unique_ptr BaseOp::getVerifier( + ArrayRef valueToConstr, + DenseMap> const &types, + DenseMap> const + &attrs) { + MLIRContext *ctx = getContext(); + + // Case where the input is a symbol reference. + // This corresponds to the case where the base is an IRDL type or attribute. + if (auto baseRef = getBaseRef()) { + Operation *defOp = + SymbolTable::lookupNearestSymbolFrom(getOperation(), baseRef.value()); + + // Type case. + if (auto typeOp = dyn_cast(defOp)) { + DynamicTypeDefinition *typeDef = types.at(typeOp).get(); + auto name = StringAttr::get(ctx, typeDef->getDialect()->getNamespace() + + "." + typeDef->getName().str()); + return std::make_unique(typeDef->getTypeID(), name); + } + + // Attribute case. + auto attrOp = cast(defOp); + DynamicAttrDefinition *attrDef = attrs.at(attrOp).get(); + auto name = StringAttr::get(ctx, attrDef->getDialect()->getNamespace() + + "." + attrDef->getName().str()); + return std::make_unique(attrDef->getTypeID(), name); + } + + // Case where the input is string literal. + // This corresponds to the case where the base is a registered type or + // attribute. + StringRef baseName = getBaseName().value(); + + // Type case. + if (baseName[0] == '!') { + auto abstractType = AbstractType::lookup(baseName.drop_front(1), ctx); + if (!abstractType) { + emitError() << "no registered type with name " << baseName; + return nullptr; + } + return std::make_unique(abstractType->get().getTypeID(), + abstractType->get().getName()); + } + + auto abstractAttr = AbstractAttribute::lookup(baseName.drop_front(1), ctx); + if (!abstractAttr) { + emitError() << "no registered attribute with name " << baseName; + return nullptr; + } + return std::make_unique(abstractAttr->get().getTypeID(), + abstractAttr->get().getName()); +} + std::unique_ptr ParametricOp::getVerifier( ArrayRef valueToConstr, DenseMap> const &types, diff --git a/mlir/lib/Dialect/IRDL/IRDLVerifiers.cpp b/mlir/lib/Dialect/IRDL/IRDLVerifiers.cpp index fab05d1ffb92f..05dc154eb5b4c 100644 --- a/mlir/lib/Dialect/IRDL/IRDLVerifiers.cpp +++ b/mlir/lib/Dialect/IRDL/IRDLVerifiers.cpp @@ -68,6 +68,39 @@ LogicalResult IsConstraint::verify(function_ref emitError, return failure(); } +LogicalResult +BaseAttrConstraint::verify(function_ref emitError, + Attribute attr, ConstraintVerifier &context) const { + if (attr.getTypeID() == baseTypeID) + return success(); + + if (emitError) + return emitError() << "expected base attribute '" << baseName + << "' but got '" << attr.getAbstractAttribute().getName() + << "'"; + return failure(); +} + +LogicalResult +BaseTypeConstraint::verify(function_ref emitError, + Attribute attr, ConstraintVerifier &context) const { + auto typeAttr = dyn_cast(attr); + if (!typeAttr) { + if (emitError) + return emitError() << "expected type, got attribute '" << attr; + return failure(); + } + + Type type = typeAttr.getValue(); + if (type.getTypeID() == baseTypeID) + return success(); + + if (emitError) + return emitError() << "expected base type '" << baseName << "' but got '" + << type.getAbstractType().getName() << "'"; + return failure(); +} + LogicalResult DynParametricAttrConstraint::verify( function_ref emitError, Attribute attr, ConstraintVerifier &context) const { diff --git 
a/mlir/test/Dialect/IRDL/invalid.irdl.mlir b/mlir/test/Dialect/IRDL/invalid.irdl.mlir new file mode 100644 index 0000000000000..d62bb498a7ad9 --- /dev/null +++ b/mlir/test/Dialect/IRDL/invalid.irdl.mlir @@ -0,0 +1,43 @@ +// RUN: mlir-opt %s -verify-diagnostics -split-input-file + +// Testing invalid IRDL IRs + +func.func private @foo() + +irdl.dialect @testd { + irdl.type @type { + // expected-error@+1 {{'@foo' does not refer to a type or attribute definition}} + %0 = irdl.base @foo + irdl.parameters(%0) + } +} + +// ----- + +irdl.dialect @testd { + irdl.type @type { + // expected-error@+1 {{the base type or attribute name should start with '!' or '#'}} + %0 = irdl.base "builtin.integer" + irdl.parameters(%0) + } +} + +// ----- + +irdl.dialect @testd { + irdl.type @type { + // expected-error@+1 {{the base type or attribute name should start with '!' or '#'}} + %0 = irdl.base "" + irdl.parameters(%0) + } +} + +// ----- + +irdl.dialect @testd { + irdl.type @type { + // expected-error@+1 {{the base type or attribute should be specified by either a name}} + %0 = irdl.base + irdl.parameters(%0) + } +} diff --git a/mlir/test/Dialect/IRDL/testd.irdl.mlir b/mlir/test/Dialect/IRDL/testd.irdl.mlir index 684286e4afeb0..f828d95bdb81d 100644 --- a/mlir/test/Dialect/IRDL/testd.irdl.mlir +++ b/mlir/test/Dialect/IRDL/testd.irdl.mlir @@ -11,6 +11,15 @@ irdl.dialect @testd { irdl.parameters(%0) } + // CHECK: irdl.attribute @parametric_attr { + // CHECK: %[[v0:[^ ]*]] = irdl.any + // CHECK: irdl.parameters(%[[v0]]) + // CHECK: } + irdl.attribute @parametric_attr { + %0 = irdl.any + irdl.parameters(%0) + } + // CHECK: irdl.type @attr_in_type_out { // CHECK: %[[v0:[^ ]*]] = irdl.any // CHECK: irdl.parameters(%[[v0]]) @@ -66,15 +75,40 @@ irdl.dialect @testd { irdl.results(%0) } - // CHECK: irdl.operation @dynbase { - // CHECK: %[[v0:[^ ]*]] = irdl.any - // CHECK: %[[v1:[^ ]*]] = irdl.parametric @parametric<%[[v0]]> + // CHECK: irdl.operation @dyn_type_base { + // CHECK: %[[v1:[^ ]*]] = irdl.base @parametric // CHECK: irdl.results(%[[v1]]) // CHECK: } - irdl.operation @dynbase { - %0 = irdl.any - %1 = irdl.parametric @parametric<%0> - irdl.results(%1) + irdl.operation @dyn_type_base { + %0 = irdl.base @parametric + irdl.results(%0) + } + + // CHECK: irdl.operation @dyn_attr_base { + // CHECK: %[[v1:[^ ]*]] = irdl.base @parametric_attr + // CHECK: irdl.attributes {"attr1" = %[[v1]]} + // CHECK: } + irdl.operation @dyn_attr_base { + %0 = irdl.base @parametric_attr + irdl.attributes {"attr1" = %0} + } + + // CHECK: irdl.operation @named_type_base { + // CHECK: %[[v1:[^ ]*]] = irdl.base "!builtin.integer" + // CHECK: irdl.results(%[[v1]]) + // CHECK: } + irdl.operation @named_type_base { + %0 = irdl.base "!builtin.integer" + irdl.results(%0) + } + + // CHECK: irdl.operation @named_attr_base { + // CHECK: %[[v1:[^ ]*]] = irdl.base "#builtin.integer" + // CHECK: irdl.attributes {"attr1" = %[[v1]]} + // CHECK: } + irdl.operation @named_attr_base { + %0 = irdl.base "#builtin.integer" + irdl.attributes {"attr1" = %0} } // CHECK: irdl.operation @dynparams { diff --git a/mlir/test/Dialect/IRDL/testd.mlir b/mlir/test/Dialect/IRDL/testd.mlir index bb1e9f4635641..333bb96eb2e60 100644 --- a/mlir/test/Dialect/IRDL/testd.mlir +++ b/mlir/test/Dialect/IRDL/testd.mlir @@ -120,24 +120,67 @@ func.func @succeededAnyConstraint() { // ----- //===----------------------------------------------------------------------===// -// Dynamic base constraint +// Base constraints 
//===----------------------------------------------------------------------===// func.func @succeededDynBaseConstraint() { - // CHECK: "testd.dynbase"() : () -> !testd.parametric - "testd.dynbase"() : () -> !testd.parametric - // CHECK: "testd.dynbase"() : () -> !testd.parametric - "testd.dynbase"() : () -> !testd.parametric - // CHECK: "testd.dynbase"() : () -> !testd.parametric> - "testd.dynbase"() : () -> !testd.parametric> + // CHECK: "testd.dyn_type_base"() : () -> !testd.parametric + "testd.dyn_type_base"() : () -> !testd.parametric + // CHECK: "testd.dyn_type_base"() : () -> !testd.parametric + "testd.dyn_type_base"() : () -> !testd.parametric + // CHECK: "testd.dyn_type_base"() : () -> !testd.parametric> + "testd.dyn_type_base"() : () -> !testd.parametric> + // CHECK: "testd.dyn_attr_base"() {attr1 = #testd.parametric_attr} : () -> () + "testd.dyn_attr_base"() {attr1 = #testd.parametric_attr} : () -> () + // CHECK: "testd.dyn_attr_base"() {attr1 = #testd.parametric_attr} : () -> () + "testd.dyn_attr_base"() {attr1 = #testd.parametric_attr} : () -> () return } // ----- -func.func @failedDynBaseConstraint() { - // expected-error@+1 {{expected base type 'testd.parametric' but got 'i32'}} - "testd.dynbase"() : () -> i32 +func.func @failedDynTypeBaseConstraint() { + // expected-error@+1 {{expected base type 'testd.parametric' but got 'builtin.integer'}} + "testd.dyn_type_base"() : () -> i32 + return +} + +// ----- + +func.func @failedDynAttrBaseConstraintNotType() { + // expected-error@+1 {{expected base attribute 'testd.parametric_attr' but got 'builtin.type'}} + "testd.dyn_attr_base"() {attr1 = i32}: () -> () + return +} + +// ----- + + +func.func @succeededNamedBaseConstraint() { + // CHECK: "testd.named_type_base"() : () -> i32 + "testd.named_type_base"() : () -> i32 + // CHECK: "testd.named_type_base"() : () -> i64 + "testd.named_type_base"() : () -> i64 + // CHECK: "testd.named_attr_base"() {attr1 = 0 : i32} : () -> () + "testd.named_attr_base"() {attr1 = 0 : i32} : () -> () + // CHECK: "testd.named_attr_base"() {attr1 = 0 : i64} : () -> () + "testd.named_attr_base"() {attr1 = 0 : i64} : () -> () + return +} + +// ----- + +func.func @failedNamedTypeBaseConstraint() { + // expected-error@+1 {{expected base type 'builtin.integer' but got 'builtin.vector'}} + "testd.named_type_base"() : () -> vector + return +} + +// ----- + +func.func @failedDynAttrBaseConstraintNotType() { + // expected-error@+1 {{expected base attribute 'builtin.integer' but got 'builtin.type'}} + "testd.named_attr_base"() {attr1 = i32}: () -> () return } From 2b804f875579995b1588f1a079e265929163d0e4 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Thu, 18 Jan 2024 10:41:45 -0600 Subject: [PATCH 010/843] [LinkerWrapper][Obvious] Fix move on temporary object Summary: This causes warnings because it is already a temporary and does not need to be moved. 
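As a minimal illustration, using a stand-in type rather than the linker
wrapper's actual classes, moving a value that is already a temporary is at
best redundant and can suppress copy elision; Clang diagnoses the pattern
with `-Wpessimizing-move`:

```
#include <string>
#include <utility>

struct File {
  std::string Data;
  // Returns a fresh temporary (a prvalue), like Binary.copy() in the
  // change below.
  File copy() const { return File{Data}; }
};

File duplicate(const File &F) {
  // warning: moving a temporary object prevents copy elision
  //   File Out = std::move(F.copy());
  File Out = F.copy(); // the temporary already initializes Out directly
  return Out;
}
```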
--- clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp index 03c83e2f92b32..bfb54f58330bd 100644 --- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp +++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp @@ -1439,7 +1439,7 @@ getDeviceInput(const ArgList &Args) { if (Index == CompatibleTargets.size() - 1) InputFiles[ID].emplace_back(std::move(Binary)); else - InputFiles[ID].emplace_back(std::move(Binary.copy())); + InputFiles[ID].emplace_back(Binary.copy()); } // If we extracted any files we need to check all the symbols again. From b75b9d82f576fecbdec98f7cd076be7a72f70dbf Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 18 Jan 2024 16:45:10 +0000 Subject: [PATCH 011/843] [lldb] Correct function names in ProcessGDBRemote::ParseFlagsFields log messages This has to be specified in the string because otherwise we'd get the lambda's name, and I incorrectly used the name of the calling function here. --- .../Process/gdb-remote/ProcessGDBRemote.cpp | 62 ++++++++++--------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 316be471df929..23834d403579c 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -4189,52 +4189,55 @@ static std::vector ParseFlagsFields(XMLNode flags_node, // Note that XML in general requires that each of these attributes only // appears once, so we don't have to handle that here. if (attr_name == "name") { - LLDB_LOG(log, - "ProcessGDBRemote::ParseFlags Found field node name \"{0}\"", - attr_value.data()); + LLDB_LOG( + log, + "ProcessGDBRemote::ParseFlagsFields Found field node name \"{0}\"", + attr_value.data()); name = attr_value; } else if (attr_name == "start") { unsigned parsed_start = 0; if (llvm::to_integer(attr_value, parsed_start)) { if (parsed_start > max_start_bit) { - LLDB_LOG( - log, - "ProcessGDBRemote::ParseFlags Invalid start {0} in field node, " - "cannot be > {1}", - parsed_start, max_start_bit); + LLDB_LOG(log, + "ProcessGDBRemote::ParseFlagsFields Invalid start {0} in " + "field node, " + "cannot be > {1}", + parsed_start, max_start_bit); } else start = parsed_start; } else { - LLDB_LOG(log, - "ProcessGDBRemote::ParseFlags Invalid start \"{0}\" in " - "field node", - attr_value.data()); + LLDB_LOG( + log, + "ProcessGDBRemote::ParseFlagsFields Invalid start \"{0}\" in " + "field node", + attr_value.data()); } } else if (attr_name == "end") { unsigned parsed_end = 0; if (llvm::to_integer(attr_value, parsed_end)) if (parsed_end > max_start_bit) { - LLDB_LOG( - log, - "ProcessGDBRemote::ParseFlags Invalid end {0} in field node, " - "cannot be > {1}", - parsed_end, max_start_bit); + LLDB_LOG(log, + "ProcessGDBRemote::ParseFlagsFields Invalid end {0} in " + "field node, " + "cannot be > {1}", + parsed_end, max_start_bit); } else end = parsed_end; else { - LLDB_LOG( - log, - "ProcessGDBRemote::ParseFlags Invalid end \"{0}\" in field node", - attr_value.data()); + LLDB_LOG(log, + "ProcessGDBRemote::ParseFlagsFields Invalid end \"{0}\" in " + "field node", + attr_value.data()); } } else if (attr_name == "type") { // Type is a known attribute but we do not currently use it and it is // not required. 
} else { - LLDB_LOG(log, - "ProcessGDBRemote::ParseFlags Ignoring unknown attribute " - "\"{0}\" in field node", - attr_name.data()); + LLDB_LOG( + log, + "ProcessGDBRemote::ParseFlagsFields Ignoring unknown attribute " + "\"{0}\" in field node", + attr_name.data()); } return true; // Walk all attributes of the field. @@ -4242,10 +4245,11 @@ static std::vector ParseFlagsFields(XMLNode flags_node, if (name && start && end) { if (*start > *end) { - LLDB_LOG(log, - "ProcessGDBRemote::ParseFlags Start {0} > end {1} in field " - "\"{2}\", ignoring", - *start, *end, name->data()); + LLDB_LOG( + log, + "ProcessGDBRemote::ParseFlagsFields Start {0} > end {1} in field " + "\"{2}\", ignoring", + *start, *end, name->data()); } else { fields.push_back(RegisterFlags::Field(name->str(), *start, *end)); } From 774b9577866239f577180fdd8b78159f0a5794a5 Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy <89994100+VyacheslavLevytskyy@users.noreply.github.com> Date: Thu, 18 Jan 2024 17:48:38 +0100 Subject: [PATCH 012/843] [SPIR-V] improve performance of Module Analysis stage in the part of processing "other instructions" (#76047) The goal of this PR is to fix an issue when Module Analysis stage is not able to complete processing of a really big LLVM source: https://github.com/llvm/llvm-project/issues/76048. There is an example of a bulky LLVM source: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/blob/main/test/SpecConstants/long-spec-const-composite.ll Processing of this file with `llc -mtriple=spirv64-unknown-unknown -O0 long-spec-const-composite.ll -o long-spec-const-composite.spvt` to produce SPIR-V output using LLVM SPIR-V backend takes too long, and I've never been able to see it actually completes. After the patch from this PR applied elapsed time for me is ~30 sec. The fix changes underlying data structure to be `std::set` to trace instructions with identical operands instead of the existing approach of the `findSameInstrInMS()` function. --- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 73 ++++++++++--------- 1 file changed, 37 insertions(+), 36 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 027e4c2b46d87..7222a93357b79 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -263,34 +263,6 @@ void SPIRVModuleAnalysis::processDefInstrs(const Module &M) { [](const SPIRV::DTSortableEntry *E) { return E->getIsFunc(); }, true); } -// True if there is an instruction in the MS list with all the same operands as -// the given instruction has (after the given starting index). -// TODO: maybe it needs to check Opcodes too. 
-static bool findSameInstrInMS(const MachineInstr &A,
-                              SPIRV::ModuleSectionType MSType,
-                              SPIRV::ModuleAnalysisInfo &MAI,
-                              unsigned StartOpIndex = 0) {
-  for (const auto *B : MAI.MS[MSType]) {
-    const unsigned NumAOps = A.getNumOperands();
-    if (NumAOps != B->getNumOperands() || A.getNumDefs() != B->getNumDefs())
-      continue;
-    bool AllOpsMatch = true;
-    for (unsigned i = StartOpIndex; i < NumAOps && AllOpsMatch; ++i) {
-      if (A.getOperand(i).isReg() && B->getOperand(i).isReg()) {
-        Register RegA = A.getOperand(i).getReg();
-        Register RegB = B->getOperand(i).getReg();
-        AllOpsMatch = MAI.getRegisterAlias(A.getMF(), RegA) ==
-                      MAI.getRegisterAlias(B->getMF(), RegB);
-      } else {
-        AllOpsMatch = A.getOperand(i).isIdenticalTo(B->getOperand(i));
-      }
-    }
-    if (AllOpsMatch)
-      return true;
-  }
-  return false;
-}
-
 // Look for IDs declared with Import linkage, and map the corresponding function
 // to the register defining that variable (which will usually be the result of
 // an OpFunction). This lets us call externally imported functions using
@@ -319,15 +291,43 @@ void SPIRVModuleAnalysis::collectFuncNames(MachineInstr &MI,
   }
 }
 
+using InstrSignature = SmallVector<size_t>;
+using InstrTraces = std::set<InstrSignature>;
+
+// Returns a representation of an instruction as a vector of MachineOperand
+// hash values, see llvm::hash_value(const MachineOperand &MO) for details.
+// This creates a signature of the instruction with the same content
+// that MachineOperand::isIdenticalTo uses for comparison.
+static InstrSignature instrToSignature(MachineInstr &MI,
+                                       SPIRV::ModuleAnalysisInfo &MAI) {
+  InstrSignature Signature;
+  for (unsigned i = 0; i < MI.getNumOperands(); ++i) {
+    const MachineOperand &MO = MI.getOperand(i);
+    size_t h;
+    if (MO.isReg()) {
+      Register RegAlias = MAI.getRegisterAlias(MI.getMF(), MO.getReg());
+      // mimic llvm::hash_value(const MachineOperand &MO)
+      h = hash_combine(MO.getType(), (unsigned)RegAlias, MO.getSubReg(),
+                       MO.isDef());
+    } else {
+      h = hash_value(MO);
+    }
+    Signature.push_back(h);
+  }
+  return Signature;
+}
+
 // Collect the given instruction in the specified MS. We assume global register
 // numbering has already occurred by this point. We can directly compare reg
 // arguments when detecting duplicates.
 static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI,
-                              SPIRV::ModuleSectionType MSType,
+                              SPIRV::ModuleSectionType MSType, InstrTraces &IS,
                               bool Append = true) {
   MAI.setSkipEmission(&MI);
-  if (findSameInstrInMS(MI, MSType, MAI))
-    return; // Found a duplicate, so don't add it.
+  InstrSignature MISign = instrToSignature(MI, MAI);
+  auto FoundMI = IS.insert(MISign);
+  if (!FoundMI.second)
+    return; // insert failed, so we found a duplicate; don't add it to MAI.MS
   // No duplicates, so add it.
   if (Append)
     MAI.MS[MSType].push_back(&MI);
@@ -338,6 +338,7 @@ static void collectOtherInstr(MachineInstr &MI, SPIRV::ModuleAnalysisInfo &MAI,
 // Some global instructions make reference to function-local ID regs, so cannot
 // be correctly collected until these registers are globally numbered.
 void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) {
+  InstrTraces IS;
   for (auto F = M.begin(), E = M.end(); F != E; ++F) {
     if ((*F).isDeclaration())
       continue;
@@ -349,20 +350,20 @@ void SPIRVModuleAnalysis::processOtherInstrs(const Module &M) {
         continue;
       const unsigned OpCode = MI.getOpcode();
       if (OpCode == SPIRV::OpName || OpCode == SPIRV::OpMemberName) {
-        collectOtherInstr(MI, MAI, SPIRV::MB_DebugNames);
+        collectOtherInstr(MI, MAI, SPIRV::MB_DebugNames, IS);
       } else if (OpCode == SPIRV::OpEntryPoint) {
-        collectOtherInstr(MI, MAI, SPIRV::MB_EntryPoints);
+        collectOtherInstr(MI, MAI, SPIRV::MB_EntryPoints, IS);
       } else if (TII->isDecorationInstr(MI)) {
-        collectOtherInstr(MI, MAI, SPIRV::MB_Annotations);
+        collectOtherInstr(MI, MAI, SPIRV::MB_Annotations, IS);
         collectFuncNames(MI, &*F);
       } else if (TII->isConstantInstr(MI)) {
         // Now OpSpecConstant*s are not in DT,
         // but they need to be collected anyway.
-        collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars);
+        collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars, IS);
       } else if (OpCode == SPIRV::OpFunction) {
        collectFuncNames(MI, &*F);
       } else if (OpCode == SPIRV::OpTypeForwardPointer) {
-        collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars, false);
+        collectOtherInstr(MI, MAI, SPIRV::MB_TypeConstVars, IS, false);
       }
     }
   }
 }

From cb2f340850db007aebf5012858697ba5afc1ce4e Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Thu, 18 Jan 2024 10:50:08 -0600
Subject: [PATCH 013/843] [CUDA] Disable registering surfaces and textures with the new driver

Summary:
These runtime calls don't seem to be supported anymore, disable them for
now.
---
 clang/test/Driver/linker-wrapper-image.c                | 2 --
 clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp | 3 ++-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang/test/Driver/linker-wrapper-image.c b/clang/test/Driver/linker-wrapper-image.c
index 147d315f8e399..fa2e59e36b382 100644
--- a/clang/test/Driver/linker-wrapper-image.c
+++ b/clang/test/Driver/linker-wrapper-image.c
@@ -121,11 +121,9 @@
 //      CUDA-NEXT: br label %if.end

 //      CUDA: sw.surface:
-// CUDA-NEXT: call void @__cudaRegisterSurface(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %textype, i32 %extern)
 // CUDA-NEXT: br label %if.end

 //      CUDA: sw.texture:
-// CUDA-NEXT: call void @__cudaRegisterTexture(ptr %0, ptr %addr, ptr %name, ptr %name, i32 %textype, i32 %normalized, i32 %extern)
 // CUDA-NEXT: br label %if.end

 //      CUDA: if.end:
diff --git a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
index bfb54f58330bd..5485a4b74bf8a 100644
--- a/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
+++ b/clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp
@@ -916,7 +916,8 @@ wrapDeviceImages(ArrayRef<std::unique_ptr<MemoryBuffer>> Buffers,
   case OFK_Cuda:
     if (Error Err = offloading::wrapCudaBinary(
             M, BuffersToWrap.front(),
-            offloading::getOffloadEntryArray(M, "cuda_offloading_entries")))
+            offloading::getOffloadEntryArray(M, "cuda_offloading_entries"),
+            /*Suffix=*/"", /*EmitSurfacesAndTextures=*/false))
       return std::move(Err);
     break;
   case OFK_HIP:

From b06bc7c6a00c4e8acac4fa76e402c6a7e2035090 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Thu, 18 Jan 2024 09:04:11 -0800
Subject: [PATCH 014/843] [mlir][flang][openacc] Device type support on acc routine op (#78375)

This patch adds support for device_type on the acc.routine operation.
device_type can be specified on seq, worker, vector, gang and bind
information. The support follows the same design as the one for compute
operations, data operations, and the loop operation.
---
 flang/lib/Lower/OpenACC.cpp                   | 138 +++++++--
 flang/test/Lower/OpenACC/acc-routine.f90      |  18 +-
 .../mlir/Dialect/OpenACC/OpenACCOps.td        |  58 +++-
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp       | 290 ++++++++++++++++--
 mlir/test/Dialect/OpenACC/ops.mlir            |   4 +-
 .../Dialect/OpenACC/OpenACCOpsTest.cpp        |  58 ++++
 6 files changed, 498 insertions(+), 68 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index db9ed72bc8725..fd89d27db74dc 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -3469,6 +3469,72 @@ static void genACC(Fortran::lower::AbstractConverter &converter,
   llvm_unreachable("unsupported declarative directive");
 }

+static bool hasDeviceType(llvm::SmallVector &arrayAttr,
+                          mlir::acc::DeviceType deviceType) {
+  for (auto attr : arrayAttr) {
+    auto deviceTypeAttr = mlir::dyn_cast(attr);
+    if (deviceTypeAttr.getValue() == deviceType)
+      return true;
+  }
+  return false;
+}
+
+template
+static std::optional
+getAttributeValueByDeviceType(llvm::SmallVector &attributes,
+                              llvm::SmallVector &deviceTypes,
+                              mlir::acc::DeviceType deviceType) {
+  assert(attributes.size() == deviceTypes.size() &&
+         "expect same number of attributes");
+  for (auto it : llvm::enumerate(deviceTypes)) {
+    auto deviceTypeAttr = mlir::dyn_cast(it.value());
+    if (deviceTypeAttr.getValue() == deviceType) {
+      if constexpr (std::is_same_v) {
+        auto strAttr = mlir::dyn_cast(attributes[it.index()]);
+        return strAttr.getValue();
+      } else if constexpr (std::is_same_v) {
+        auto intAttr =
+            mlir::dyn_cast(attributes[it.index()]);
+        return intAttr.getInt();
+      }
+    }
+  }
+  return std::nullopt;
+}
+
+static bool compareDeviceTypeInfo(
+    mlir::acc::RoutineOp op,
+    llvm::SmallVector &bindNameArrayAttr,
+    llvm::SmallVector &bindNameDeviceTypeArrayAttr,
+    llvm::SmallVector &gangArrayAttr,
+    llvm::SmallVector &gangDimArrayAttr,
+    llvm::SmallVector &gangDimDeviceTypeArrayAttr,
+    llvm::SmallVector &seqArrayAttr,
+    llvm::SmallVector &workerArrayAttr,
+    llvm::SmallVector &vectorArrayAttr) {
+  for (uint32_t dtypeInt = 0;
+       dtypeInt != mlir::acc::getMaxEnumValForDeviceType(); ++dtypeInt) {
+    auto dtype = static_cast(dtypeInt);
+    if (op.getBindNameValue(dtype) !=
+        getAttributeValueByDeviceType(
+            bindNameArrayAttr, bindNameDeviceTypeArrayAttr, dtype))
+      return false;
+    if (op.hasGang(dtype) != hasDeviceType(gangArrayAttr, dtype))
+      return false;
+    if (op.getGangDimValue(dtype) !=
+        getAttributeValueByDeviceType(
+            gangDimArrayAttr, gangDimDeviceTypeArrayAttr, dtype))
+      return false;
+    if (op.hasSeq(dtype) != hasDeviceType(seqArrayAttr, dtype))
+      return false;
+    if (op.hasWorker(dtype) != hasDeviceType(workerArrayAttr, dtype))
+      return false;
+    if (op.hasVector(dtype) != hasDeviceType(vectorArrayAttr, dtype))
+      return false;
+  }
+  return true;
+}
+
 static void attachRoutineInfo(mlir::func::FuncOp func,
                               mlir::SymbolRefAttr routineAttr) {
   llvm::SmallVector routines;
@@ -3518,17 +3584,23 @@ void Fortran::lower::genOpenACCRoutineConstruct(
       funcName = funcOp.getName();
     }
   }
-  bool hasSeq = false, hasGang = false, hasWorker = false, hasVector = false,
-       hasNohost = false;
-  std::optional bindName = std::nullopt;
-  std::optional gangDim = std::nullopt;
+  bool hasNohost = false;
+
+  llvm::SmallVector seqDeviceTypes, vectorDeviceTypes,
+      workerDeviceTypes, bindNameDeviceTypes, bindNames, gangDeviceTypes,
gangDimDeviceTypes, gangDimValues; + + // device_type attribute is set to `none` until a device_type clause is + // encountered. + auto crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( + builder.getContext(), mlir::acc::DeviceType::None); for (const Fortran::parser::AccClause &clause : clauses.v) { if (std::get_if(&clause.u)) { - hasSeq = true; + seqDeviceTypes.push_back(crtDeviceTypeAttr); } else if (const auto *gangClause = std::get_if(&clause.u)) { - hasGang = true; + if (gangClause->v) { const Fortran::parser::AccGangArgList &x = *gangClause->v; for (const Fortran::parser::AccGangArg &gangArg : x.v) { @@ -3539,21 +3611,27 @@ void Fortran::lower::genOpenACCRoutineConstruct( if (!dimValue) mlir::emitError(loc, "dim value must be a constant positive integer"); - gangDim = *dimValue; + gangDimValues.push_back( + builder.getIntegerAttr(builder.getI64Type(), *dimValue)); + gangDimDeviceTypes.push_back(crtDeviceTypeAttr); } } + } else { + gangDeviceTypes.push_back(crtDeviceTypeAttr); } } else if (std::get_if(&clause.u)) { - hasVector = true; + vectorDeviceTypes.push_back(crtDeviceTypeAttr); } else if (std::get_if(&clause.u)) { - hasWorker = true; + workerDeviceTypes.push_back(crtDeviceTypeAttr); } else if (std::get_if(&clause.u)) { hasNohost = true; } else if (const auto *bindClause = std::get_if(&clause.u)) { if (const auto *name = std::get_if(&bindClause->v.u)) { - bindName = converter.mangleName(*name->symbol); + bindNames.push_back( + builder.getStringAttr(converter.mangleName(*name->symbol))); + bindNameDeviceTypes.push_back(crtDeviceTypeAttr); } else if (const auto charExpr = std::get_if( &bindClause->v.u)) { @@ -3562,8 +3640,18 @@ void Fortran::lower::genOpenACCRoutineConstruct( *charExpr); if (!name) mlir::emitError(loc, "Could not retrieve the bind name"); - bindName = *name; + bindNames.push_back(builder.getStringAttr(*name)); + bindNameDeviceTypes.push_back(crtDeviceTypeAttr); } + } else if (const auto *deviceTypeClause = + std::get_if( + &clause.u)) { + const Fortran::parser::AccDeviceTypeExprList &deviceTypeExprList = + deviceTypeClause->v; + assert(deviceTypeExprList.v.size() == 1 && + "expect only one device_type expr"); + crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( + builder.getContext(), getDeviceType(deviceTypeExprList.v.front().v)); } } @@ -3575,12 +3663,11 @@ void Fortran::lower::genOpenACCRoutineConstruct( if (routineOp.getFuncName().str().compare(funcName) == 0) { // If the routine is already specified with the same clauses, just skip // the operation creation. - if (routineOp.getBindName() == bindName && - routineOp.getGang() == hasGang && - routineOp.getWorker() == hasWorker && - routineOp.getVector() == hasVector && routineOp.getSeq() == hasSeq && - routineOp.getNohost() == hasNohost && - routineOp.getGangDim() == gangDim) + if (compareDeviceTypeInfo(routineOp, bindNames, bindNameDeviceTypes, + gangDeviceTypes, gangDimValues, + gangDimDeviceTypes, seqDeviceTypes, + workerDeviceTypes, vectorDeviceTypes) && + routineOp.getNohost() == hasNohost) return; mlir::emitError(loc, "Routine already specified with different clauses"); } @@ -3588,10 +3675,19 @@ void Fortran::lower::genOpenACCRoutineConstruct( modBuilder.create( loc, routineOpName.str(), funcName, - bindName ? builder.getStringAttr(*bindName) : mlir::StringAttr{}, hasGang, - hasWorker, hasVector, hasSeq, hasNohost, /*implicit=*/false, - gangDim ? builder.getIntegerAttr(builder.getIntegerType(32), *gangDim) - : mlir::IntegerAttr{}); + bindNames.empty() ? 
nullptr : builder.getArrayAttr(bindNames), + bindNameDeviceTypes.empty() ? nullptr + : builder.getArrayAttr(bindNameDeviceTypes), + workerDeviceTypes.empty() ? nullptr + : builder.getArrayAttr(workerDeviceTypes), + vectorDeviceTypes.empty() ? nullptr + : builder.getArrayAttr(vectorDeviceTypes), + seqDeviceTypes.empty() ? nullptr : builder.getArrayAttr(seqDeviceTypes), + hasNohost, /*implicit=*/false, + gangDeviceTypes.empty() ? nullptr : builder.getArrayAttr(gangDeviceTypes), + gangDimValues.empty() ? nullptr : builder.getArrayAttr(gangDimValues), + gangDimDeviceTypes.empty() ? nullptr + : builder.getArrayAttr(gangDimDeviceTypes)); if (funcOp) attachRoutineInfo(funcOp, builder.getSymbolRefAttr(routineOpName.str())); diff --git a/flang/test/Lower/OpenACC/acc-routine.f90 b/flang/test/Lower/OpenACC/acc-routine.f90 index 8b94279503334..2fe150e70b0cf 100644 --- a/flang/test/Lower/OpenACC/acc-routine.f90 +++ b/flang/test/Lower/OpenACC/acc-routine.f90 @@ -2,12 +2,14 @@ ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s - +! CHECK: acc.routine @acc_routine_16 func(@_QPacc_routine18) bind("_QPacc_routine17" [#acc.device_type], "_QPacc_routine16" [#acc.device_type]) +! CHECK: acc.routine @acc_routine_15 func(@_QPacc_routine17) worker ([#acc.device_type]) vector ([#acc.device_type]) +! CHECK: acc.routine @acc_routine_14 func(@_QPacc_routine16) gang([#acc.device_type]) seq ([#acc.device_type]) ! CHECK: acc.routine @acc_routine_10 func(@_QPacc_routine11) seq ! CHECK: acc.routine @acc_routine_9 func(@_QPacc_routine10) seq ! CHECK: acc.routine @acc_routine_8 func(@_QPacc_routine9) bind("_QPacc_routine9a") ! CHECK: acc.routine @acc_routine_7 func(@_QPacc_routine8) bind("routine8_") -! CHECK: acc.routine @acc_routine_6 func(@_QPacc_routine7) gang(dim = 1 : i32) +! CHECK: acc.routine @acc_routine_6 func(@_QPacc_routine7) gang(dim: 1 : i64) ! CHECK: acc.routine @acc_routine_5 func(@_QPacc_routine6) nohost ! CHECK: acc.routine @acc_routine_4 func(@_QPacc_routine5) worker ! CHECK: acc.routine @acc_routine_3 func(@_QPacc_routine4) vector @@ -106,3 +108,15 @@ subroutine acc_routine14() subroutine acc_routine15() !$acc routine bind(acc_routine16) end subroutine + +subroutine acc_routine16() + !$acc routine device_type(host) seq dtype(nvidia) gang +end subroutine + +subroutine acc_routine17() + !$acc routine device_type(host) worker dtype(multicore) vector +end subroutine + +subroutine acc_routine18() + !$acc routine device_type(host) bind(acc_routine17) dtype(multicore) bind(acc_routine16) +end subroutine diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 24f129d92805c..7344ab2852b9c 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -1994,27 +1994,63 @@ def OpenACC_RoutineOp : OpenACC_Op<"routine", [IsolatedFromAbove]> { let arguments = (ins SymbolNameAttr:$sym_name, SymbolNameAttr:$func_name, - OptionalAttr:$bind_name, - UnitAttr:$gang, - UnitAttr:$worker, - UnitAttr:$vector, - UnitAttr:$seq, + OptionalAttr:$bindName, + OptionalAttr:$bindNameDeviceType, + OptionalAttr:$worker, + OptionalAttr:$vector, + OptionalAttr:$seq, UnitAttr:$nohost, UnitAttr:$implicit, - OptionalAttr:$gangDim); + OptionalAttr:$gang, + OptionalAttr:$gangDim, + OptionalAttr:$gangDimDeviceType); let extraClassDeclaration = [{ static StringRef getGangDimKeyword() { return "dim"; } + + /// Return true if the op has the worker attribute for the + /// mlir::acc::DeviceType::None device_type. 
+ bool hasWorker(); + /// Return true if the op has the worker attribute for the given + /// device_type. + bool hasWorker(mlir::acc::DeviceType deviceType); + + /// Return true if the op has the vector attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasVector(); + /// Return true if the op has the vector attribute for the given + /// device_type. + bool hasVector(mlir::acc::DeviceType deviceType); + + /// Return true if the op has the seq attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasSeq(); + /// Return true if the op has the seq attribute for the given + /// device_type. + bool hasSeq(mlir::acc::DeviceType deviceType); + + /// Return true if the op has the gang attribute for the + /// mlir::acc::DeviceType::None device_type. + bool hasGang(); + /// Return true if the op has the gang attribute for the given + /// device_type. + bool hasGang(mlir::acc::DeviceType deviceType); + + std::optional getGangDimValue(); + std::optional getGangDimValue(mlir::acc::DeviceType deviceType); + + std::optional getBindNameValue(); + std::optional getBindNameValue(mlir::acc::DeviceType deviceType); }]; let assemblyFormat = [{ $sym_name `func` `(` $func_name `)` oilist ( - `bind` `(` $bind_name `)` - | `gang` `` custom($gang, $gangDim) - | `worker` $worker - | `vector` $vector - | `seq` $seq + `bind` `(` custom($bindName, $bindNameDeviceType) `)` + | `gang` `` custom($gang, $gangDim, $gangDimDeviceType) + | `worker` custom($worker) + | `vector` custom($vector) + | `seq` custom($seq) | `nohost` $nohost | `implicit` $implicit ) attr-dict-with-keyword diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 664a0161b79c1..20465f6bb86ed 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -1033,7 +1033,7 @@ static ParseResult parseDeviceTypeOperandsWithKeywordOnly( return success(); } -bool hasDeviceTypeValues(std::optional arrayAttr) { +static bool hasDeviceTypeValues(std::optional arrayAttr) { if (arrayAttr && *arrayAttr && arrayAttr->size() > 0) return true; return false; @@ -2090,55 +2090,281 @@ LogicalResult acc::DeclareOp::verify() { // RoutineOp //===----------------------------------------------------------------------===// +static bool hasDeviceType(std::optional arrayAttr, + mlir::acc::DeviceType deviceType) { + if (!hasDeviceTypeValues(arrayAttr)) + return false; + + for (auto attr : *arrayAttr) { + auto deviceTypeAttr = mlir::dyn_cast(attr); + if (deviceTypeAttr.getValue() == deviceType) + return true; + } + + return false; +} + +static unsigned getParallelismForDeviceType(acc::RoutineOp op, + acc::DeviceType dtype) { + unsigned parallelism = 0; + parallelism += (op.hasGang(dtype) || op.getGangDimValue(dtype)) ? 1 : 0; + parallelism += op.hasWorker(dtype) ? 1 : 0; + parallelism += op.hasVector(dtype) ? 1 : 0; + parallelism += op.hasSeq(dtype) ? 1 : 0; + return parallelism; +} + LogicalResult acc::RoutineOp::verify() { - int parallelism = 0; - parallelism += getGang() ? 1 : 0; - parallelism += getWorker() ? 1 : 0; - parallelism += getVector() ? 1 : 0; - parallelism += getSeq() ? 
1 : 0; + unsigned baseParallelism = + getParallelismForDeviceType(*this, acc::DeviceType::None); - if (parallelism > 1) + if (baseParallelism > 1) return emitError() << "only one of `gang`, `worker`, `vector`, `seq` can " "be present at the same time"; + for (uint32_t dtypeInt = 0; dtypeInt != acc::getMaxEnumValForDeviceType(); + ++dtypeInt) { + auto dtype = static_cast(dtypeInt); + if (dtype == acc::DeviceType::None) + continue; + unsigned parallelism = getParallelismForDeviceType(*this, dtype); + + if (parallelism > 1 || (baseParallelism == 1 && parallelism == 1)) + return emitError() << "only one of `gang`, `worker`, `vector`, `seq` can " + "be present at the same time"; + } + return success(); } -static ParseResult parseRoutineGangClause(OpAsmParser &parser, UnitAttr &gang, - IntegerAttr &gangDim) { - // Since gang clause exists, ensure that unit attribute is set. - gang = UnitAttr::get(parser.getBuilder().getContext()); +static ParseResult parseBindName(OpAsmParser &parser, mlir::ArrayAttr &bindName, + mlir::ArrayAttr &deviceTypes) { + llvm::SmallVector bindNameAttrs; + llvm::SmallVector deviceTypeAttrs; - // Next, look for dim on gang. Don't initialize `gangDim` yet since - // we leave it without attribute if there is no `dim` specifier. - if (succeeded(parser.parseOptionalLParen())) { - // Look for syntax that looks like `dim = 1 : i32`. - // Thus first look for `dim =` - if (failed(parser.parseKeyword(RoutineOp::getGangDimKeyword())) || - failed(parser.parseEqual())) - return failure(); + if (failed(parser.parseCommaSeparatedList([&]() { + if (parser.parseAttribute(bindNameAttrs.emplace_back())) + return failure(); + if (failed(parser.parseOptionalLSquare())) { + deviceTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + } else { + if (parser.parseAttribute(deviceTypeAttrs.emplace_back()) || + parser.parseRSquare()) + return failure(); + } + return success(); + }))) + return failure(); - int64_t dimValue; - Type valueType; - // Now look for `1 : i32` - if (failed(parser.parseInteger(dimValue)) || - failed(parser.parseColonType(valueType))) - return failure(); + bindName = ArrayAttr::get(parser.getContext(), bindNameAttrs); + deviceTypes = ArrayAttr::get(parser.getContext(), deviceTypeAttrs); + + return success(); +} - gangDim = IntegerAttr::get(valueType, dimValue); +static void printBindName(mlir::OpAsmPrinter &p, mlir::Operation *op, + std::optional bindName, + std::optional deviceTypes) { + llvm::interleaveComma(llvm::zip(*bindName, *deviceTypes), p, + [&](const auto &pair) { + p << std::get<0>(pair); + printSingleDeviceType(p, std::get<1>(pair)); + }); +} + +static ParseResult parseRoutineGangClause(OpAsmParser &parser, + mlir::ArrayAttr &gang, + mlir::ArrayAttr &gangDim, + mlir::ArrayAttr &gangDimDeviceTypes) { + + llvm::SmallVector gangAttrs, gangDimAttrs, + gangDimDeviceTypeAttrs; + bool needCommaBeforeOperands = false; - if (failed(parser.parseRParen())) + // Gang keyword only + if (failed(parser.parseOptionalLParen())) { + gangAttrs.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + gang = ArrayAttr::get(parser.getContext(), gangAttrs); + return success(); + } + + // Parse keyword only attributes + if (succeeded(parser.parseOptionalLSquare())) { + if (failed(parser.parseCommaSeparatedList([&]() { + if (parser.parseAttribute(gangAttrs.emplace_back())) + return failure(); + return success(); + }))) + return failure(); + if (parser.parseRSquare()) return failure(); + 
needCommaBeforeOperands = true; } + if (needCommaBeforeOperands && failed(parser.parseComma())) + return failure(); + + if (failed(parser.parseCommaSeparatedList([&]() { + if (parser.parseKeyword(acc::RoutineOp::getGangDimKeyword()) || + parser.parseColon() || + parser.parseAttribute(gangDimAttrs.emplace_back())) + return failure(); + if (succeeded(parser.parseOptionalLSquare())) { + if (parser.parseAttribute(gangDimDeviceTypeAttrs.emplace_back()) || + parser.parseRSquare()) + return failure(); + } else { + gangDimDeviceTypeAttrs.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + } + return success(); + }))) + return failure(); + + if (failed(parser.parseRParen())) + return failure(); + + gang = ArrayAttr::get(parser.getContext(), gangAttrs); + gangDim = ArrayAttr::get(parser.getContext(), gangDimAttrs); + gangDimDeviceTypes = + ArrayAttr::get(parser.getContext(), gangDimDeviceTypeAttrs); + return success(); } -void printRoutineGangClause(OpAsmPrinter &p, Operation *op, UnitAttr gang, - IntegerAttr gangDim) { - if (gangDim) - p << "(" << RoutineOp::getGangDimKeyword() << " = " << gangDim.getValue() - << " : " << gangDim.getType() << ")"; +void printRoutineGangClause(OpAsmPrinter &p, Operation *op, + std::optional gang, + std::optional gangDim, + std::optional gangDimDeviceTypes) { + + if (!hasDeviceTypeValues(gangDimDeviceTypes) && hasDeviceTypeValues(gang) && + gang->size() == 1) { + auto deviceTypeAttr = mlir::dyn_cast((*gang)[0]); + if (deviceTypeAttr.getValue() == mlir::acc::DeviceType::None) + return; + } + + p << "("; + + printDeviceTypes(p, gang); + + if (hasDeviceTypeValues(gang) && hasDeviceTypeValues(gangDimDeviceTypes)) + p << ", "; + + if (hasDeviceTypeValues(gangDimDeviceTypes)) + llvm::interleaveComma(llvm::zip(*gangDim, *gangDimDeviceTypes), p, + [&](const auto &pair) { + p << acc::RoutineOp::getGangDimKeyword() << ": "; + p << std::get<0>(pair); + printSingleDeviceType(p, std::get<1>(pair)); + }); + + p << ")"; +} + +static ParseResult parseDeviceTypeArrayAttr(OpAsmParser &parser, + mlir::ArrayAttr &deviceTypes) { + llvm::SmallVector attributes; + // Keyword only + if (failed(parser.parseOptionalLParen())) { + attributes.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + deviceTypes = ArrayAttr::get(parser.getContext(), attributes); + return success(); + } + + // Parse device type attributes + if (succeeded(parser.parseOptionalLSquare())) { + if (failed(parser.parseCommaSeparatedList([&]() { + if (parser.parseAttribute(attributes.emplace_back())) + return failure(); + return success(); + }))) + return failure(); + if (parser.parseRSquare() || parser.parseRParen()) + return failure(); + } + deviceTypes = ArrayAttr::get(parser.getContext(), attributes); + return success(); +} + +static void +printDeviceTypeArrayAttr(mlir::OpAsmPrinter &p, mlir::Operation *op, + std::optional deviceTypes) { + + if (hasDeviceTypeValues(deviceTypes) && deviceTypes->size() == 1) { + auto deviceTypeAttr = + mlir::dyn_cast((*deviceTypes)[0]); + if (deviceTypeAttr.getValue() == mlir::acc::DeviceType::None) + return; + } + + if (!hasDeviceTypeValues(deviceTypes)) + return; + + p << "(["; + llvm::interleaveComma(*deviceTypes, p, [&](mlir::Attribute attr) { + auto dTypeAttr = mlir::dyn_cast(attr); + p << dTypeAttr; + }); + p << "])"; +} + +bool RoutineOp::hasWorker() { return hasWorker(mlir::acc::DeviceType::None); } + +bool RoutineOp::hasWorker(mlir::acc::DeviceType deviceType) { + return 
hasDeviceType(getWorker(), deviceType); +} + +bool RoutineOp::hasVector() { return hasVector(mlir::acc::DeviceType::None); } + +bool RoutineOp::hasVector(mlir::acc::DeviceType deviceType) { + return hasDeviceType(getVector(), deviceType); +} + +bool RoutineOp::hasSeq() { return hasSeq(mlir::acc::DeviceType::None); } + +bool RoutineOp::hasSeq(mlir::acc::DeviceType deviceType) { + return hasDeviceType(getSeq(), deviceType); +} + +std::optional RoutineOp::getBindNameValue() { + return getBindNameValue(mlir::acc::DeviceType::None); +} + +std::optional +RoutineOp::getBindNameValue(mlir::acc::DeviceType deviceType) { + if (!hasDeviceTypeValues(getBindNameDeviceType())) + return std::nullopt; + if (auto pos = findSegment(*getBindNameDeviceType(), deviceType)) { + auto attr = (*getBindName())[*pos]; + auto stringAttr = dyn_cast(attr); + return stringAttr.getValue(); + } + return std::nullopt; +} + +bool RoutineOp::hasGang() { return hasGang(mlir::acc::DeviceType::None); } + +bool RoutineOp::hasGang(mlir::acc::DeviceType deviceType) { + return hasDeviceType(getGang(), deviceType); +} + +std::optional RoutineOp::getGangDimValue() { + return getGangDimValue(mlir::acc::DeviceType::None); +} + +std::optional +RoutineOp::getGangDimValue(mlir::acc::DeviceType deviceType) { + if (!hasDeviceTypeValues(getGangDimDeviceType())) + return std::nullopt; + if (auto pos = findSegment(*getGangDimDeviceType(), deviceType)) { + auto intAttr = mlir::dyn_cast((*getGangDim())[*pos]); + return intAttr.getInt(); + } + return std::nullopt; } //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index 8fa37bc98294c..99b44183758d9 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -1656,7 +1656,7 @@ acc.routine @acc_func_rout5 func(@acc_func) bind("acc_func_gpu_worker") worker acc.routine @acc_func_rout6 func(@acc_func) bind("acc_func_gpu_seq") seq acc.routine @acc_func_rout7 func(@acc_func) bind("acc_func_gpu_imp_gang") implicit gang acc.routine @acc_func_rout8 func(@acc_func) bind("acc_func_gpu_vector_nohost") vector nohost -acc.routine @acc_func_rout9 func(@acc_func) bind("acc_func_gpu_gang_dim1") gang(dim = 1 : i32) +acc.routine @acc_func_rout9 func(@acc_func) bind("acc_func_gpu_gang_dim1") gang(dim: 1 : i64) // CHECK-LABEL: func.func @acc_func( // CHECK: attributes {acc.routine_info = #acc.routine_info<[@acc_func_rout1, @acc_func_rout2, @acc_func_rout3, @@ -1669,7 +1669,7 @@ acc.routine @acc_func_rout9 func(@acc_func) bind("acc_func_gpu_gang_dim1") gang( // CHECK: acc.routine @acc_func_rout6 func(@acc_func) bind("acc_func_gpu_seq") seq // CHECK: acc.routine @acc_func_rout7 func(@acc_func) bind("acc_func_gpu_imp_gang") gang implicit // CHECK: acc.routine @acc_func_rout8 func(@acc_func) bind("acc_func_gpu_vector_nohost") vector nohost -// CHECK: acc.routine @acc_func_rout9 func(@acc_func) bind("acc_func_gpu_gang_dim1") gang(dim = 1 : i32) +// CHECK: acc.routine @acc_func_rout9 func(@acc_func) bind("acc_func_gpu_gang_dim1") gang(dim: 1 : i64) // ----- diff --git a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp index d78d7b0fdf676..474f887928992 100644 --- a/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp +++ b/mlir/unittests/Dialect/OpenACC/OpenACCOpsTest.cpp @@ -347,3 +347,61 @@ TEST_F(OpenACCOpsTest, loopOpGangVectorWorkerTest) { } op->removeVectorAttr(); } + +TEST_F(OpenACCOpsTest, routineOpTest) { + OwningOpRef 
op = + b.create(loc, TypeRange{}, ValueRange{}); + + EXPECT_FALSE(op->hasSeq()); + EXPECT_FALSE(op->hasVector()); + EXPECT_FALSE(op->hasWorker()); + + for (auto d : dtypes) { + EXPECT_FALSE(op->hasSeq(d)); + EXPECT_FALSE(op->hasVector(d)); + EXPECT_FALSE(op->hasWorker(d)); + } + + auto dtypeNone = DeviceTypeAttr::get(&context, DeviceType::None); + op->setSeqAttr(b.getArrayAttr({dtypeNone})); + EXPECT_TRUE(op->hasSeq()); + for (auto d : dtypesWithoutNone) + EXPECT_FALSE(op->hasSeq(d)); + op->removeSeqAttr(); + + op->setVectorAttr(b.getArrayAttr({dtypeNone})); + EXPECT_TRUE(op->hasVector()); + for (auto d : dtypesWithoutNone) + EXPECT_FALSE(op->hasVector(d)); + op->removeVectorAttr(); + + op->setWorkerAttr(b.getArrayAttr({dtypeNone})); + EXPECT_TRUE(op->hasWorker()); + for (auto d : dtypesWithoutNone) + EXPECT_FALSE(op->hasWorker(d)); + op->removeWorkerAttr(); + + op->setGangAttr(b.getArrayAttr({dtypeNone})); + EXPECT_TRUE(op->hasGang()); + for (auto d : dtypesWithoutNone) + EXPECT_FALSE(op->hasGang(d)); + op->removeGangAttr(); + + op->setGangDimDeviceTypeAttr(b.getArrayAttr({dtypeNone})); + op->setGangDimAttr(b.getArrayAttr({b.getIntegerAttr(b.getI64Type(), 8)})); + EXPECT_TRUE(op->getGangDimValue().has_value()); + EXPECT_EQ(op->getGangDimValue().value(), 8); + for (auto d : dtypesWithoutNone) + EXPECT_FALSE(op->getGangDimValue(d).has_value()); + op->removeGangDimDeviceTypeAttr(); + op->removeGangDimAttr(); + + op->setBindNameDeviceTypeAttr(b.getArrayAttr({dtypeNone})); + op->setBindNameAttr(b.getArrayAttr({b.getStringAttr("fname")})); + EXPECT_TRUE(op->getBindNameValue().has_value()); + EXPECT_EQ(op->getBindNameValue().value(), "fname"); + for (auto d : dtypesWithoutNone) + EXPECT_FALSE(op->getBindNameValue(d).has_value()); + op->removeBindNameDeviceTypeAttr(); + op->removeBindNameAttr(); +} From ab244b64dfa6a35458693d4dea6223ff3fe7f7a9 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Fri, 12 Jan 2024 10:20:38 -0800 Subject: [PATCH 015/843] [ORC] Specialize MachOBuilder support for the LC_ID_DYLIB command. Provides a natural API for adding LC_ID_DYLIB commands, including the arbitrary install name. --- .../llvm/ExecutionEngine/Orc/MachOBuilder.h | 42 +++++++++++++++++-- 1 file changed, 38 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h index 2bc66b11e2704..d6dcdabab0e59 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h @@ -43,20 +43,20 @@ struct MachOBuilderLoadCommandBase { }; /// MachOBuilder load command wrapper type. 
-template struct MachOBuilderLoadCommand; +template struct MachOBuilderLoadCommandImplBase; #define HANDLE_LOAD_COMMAND(Name, Value, LCStruct) \ template <> \ - struct MachOBuilderLoadCommand \ + struct MachOBuilderLoadCommandImplBase \ : public MachO::LCStruct, public MachOBuilderLoadCommandBase { \ using CmdStruct = LCStruct; \ - MachOBuilderLoadCommand() { \ + MachOBuilderLoadCommandImplBase() { \ memset(&rawStruct(), 0, sizeof(CmdStruct)); \ cmd = Value; \ cmdsize = sizeof(CmdStruct); \ } \ template \ - MachOBuilderLoadCommand(ArgTs &&...Args) \ + MachOBuilderLoadCommandImplBase(ArgTs &&...Args) \ : CmdStruct{Value, sizeof(CmdStruct), std::forward(Args)...} {} \ CmdStruct &rawStruct() { return static_cast(*this); } \ size_t size() const override { return cmdsize; } \ @@ -70,6 +70,40 @@ template struct MachOBuilderLoadCommand; #undef HANDLE_LOAD_COMMAND +template +struct MachOBuilderLoadCommand + : public MachOBuilderLoadCommandImplBase { +public: + MachOBuilderLoadCommand() = default; + + template + MachOBuilderLoadCommand(ArgTs &&...Args) + : MachOBuilderLoadCommand(std::forward(Args)...) {} +}; + +template <> +struct MachOBuilderLoadCommand + : public MachOBuilderLoadCommandImplBase { + + MachOBuilderLoadCommand(std::string Name, uint32_t Timestamp, + uint32_t CurrentVersion, + uint32_t CompatibilityVersion) + : MachOBuilderLoadCommandImplBase( + MachO::dylib{24, Timestamp, CurrentVersion, CompatibilityVersion}), + Name(std::move(Name)) { + cmdsize += (this->Name.size() + 1 + 3) & ~0x3; + } + + size_t write(MutableArrayRef Buf, size_t Offset, + bool SwapStruct) override { + Offset = writeMachOStruct(Buf, Offset, rawStruct(), SwapStruct); + strcpy(Buf.data() + Offset, Name.data()); + return Offset + ((Name.size() + 1 + 3) & ~0x3); + } + + std::string Name; +}; + // Builds MachO objects. template class MachOBuilder { private: From b08aca7a4fb9520db116e1cc9620f9eb7ce57764 Mon Sep 17 00:00:00 2001 From: Ben Langmuir Date: Fri, 12 Jan 2024 11:03:33 -0800 Subject: [PATCH 016/843] [ORC][MachO] Support common load commands in the platform's mach-o header builder Add a HeaderOptions struct that can be used to configure commonly-used load commands LC_ID_DYLIB, LC_LOAD_DYLIB, and LC_RPATH when setupDylib creates a mach-o header. 
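
As a usage sketch (hypothetical driver code, not part of the patch; it
assumes an already-created MachOPlatform `MOP`, a JITDylib `JD`, an
ExecutionSession `ES`, and illustrative dylib names/paths):

    MachOPlatform::HeaderOptions Opts(
        {/*Name=*/"@rpath/libjit.dylib", /*Timestamp=*/0,
         /*CurrentVersion=*/0x10000, /*CompatibilityVersion=*/0x10000});
    // Each entry below becomes an LC_LOAD_DYLIB / LC_RPATH command in the
    // generated mach-o header; the IDDylib above becomes LC_ID_DYLIB.
    Opts.LoadDylibs.push_back({"/usr/lib/libSystem.B.dylib", 0, 0, 0});
    Opts.RPaths.push_back("@loader_path/../lib");

    if (auto Err = MOP.setupJITDylib(JD, std::move(Opts)))
      ES.reportError(std::move(Err));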
--- .../llvm/ExecutionEngine/Orc/MachOBuilder.h | 41 +++++++++++++ .../llvm/ExecutionEngine/Orc/MachOPlatform.h | 46 ++++++++++++-- .../lib/ExecutionEngine/Orc/MachOPlatform.cpp | 60 +++++++++++++------ 3 files changed, 123 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h index d6dcdabab0e59..45dd5fcbebd79 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOBuilder.h @@ -104,6 +104,47 @@ struct MachOBuilderLoadCommand std::string Name; }; +template <> +struct MachOBuilderLoadCommand + : public MachOBuilderLoadCommandImplBase { + + MachOBuilderLoadCommand(std::string Name, uint32_t Timestamp, + uint32_t CurrentVersion, + uint32_t CompatibilityVersion) + : MachOBuilderLoadCommandImplBase( + MachO::dylib{24, Timestamp, CurrentVersion, CompatibilityVersion}), + Name(std::move(Name)) { + cmdsize += (this->Name.size() + 1 + 3) & ~0x3; + } + + size_t write(MutableArrayRef Buf, size_t Offset, + bool SwapStruct) override { + Offset = writeMachOStruct(Buf, Offset, rawStruct(), SwapStruct); + strcpy(Buf.data() + Offset, Name.data()); + return Offset + ((Name.size() + 1 + 3) & ~0x3); + } + + std::string Name; +}; + +template <> +struct MachOBuilderLoadCommand + : public MachOBuilderLoadCommandImplBase { + MachOBuilderLoadCommand(std::string Path) + : MachOBuilderLoadCommandImplBase(12u), Path(std::move(Path)) { + cmdsize += (this->Path.size() + 1 + 3) & ~0x3; + } + + size_t write(MutableArrayRef Buf, size_t Offset, + bool SwapStruct) override { + Offset = writeMachOStruct(Buf, Offset, rawStruct(), SwapStruct); + strcpy(Buf.data() + Offset, Path.data()); + return Offset + ((Path.size() + 1 + 3) & ~0x3); + } + + std::string Path; +}; + // Builds MachO objects. template class MachOBuilder { private: diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h index d9b809ab5b11a..ff1c420d047e5 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/MachOPlatform.h @@ -47,14 +47,38 @@ class MachOPlatform : public Platform { LLVM_MARK_AS_BITMASK_ENUM(/* LargestValue = */ Callable) }; + /// Configuration for the mach-o header of a JITDylib. Specify common load + /// commands that should be added to the header. + struct HeaderOptions { + /// A dylib for use with a dylib command (e.g. LC_ID_DYLIB, LC_LOAD_DYLIB). + struct Dylib { + std::string Name; + uint32_t Timestamp; + uint32_t CurrentVersion; + uint32_t CompatibilityVersion; + }; + + /// Override for LC_IC_DYLIB. If this is nullopt, {JD.getName(), 0, 0, 0} + /// will be used. + std::optional IDDylib; + /// List of LC_LOAD_DYLIBs. + std::vector LoadDylibs; + /// List of LC_RPATHs. + std::vector RPaths; + + HeaderOptions() = default; + HeaderOptions(Dylib D) : IDDylib(std::move(D)) {} + }; + /// Used by setupJITDylib to create MachO header MaterializationUnits for /// JITDylibs. using MachOHeaderMUBuilder = - unique_function(MachOPlatform &MOP)>; + unique_function(MachOPlatform &MOP, + HeaderOptions Opts)>; /// Simple MachO header graph builder. static inline std::unique_ptr - buildSimpleMachOHeaderMU(MachOPlatform &MOP); + buildSimpleMachOHeaderMU(MachOPlatform &MOP, HeaderOptions Opts); /// Try to create a MachOPlatform instance, adding the ORC runtime to the /// given JITDylib. 
@@ -97,6 +121,7 @@ class MachOPlatform : public Platform { static Expected> Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, std::unique_ptr OrcRuntime, + HeaderOptions PlatformJDOpts = {}, MachOHeaderMUBuilder BuildMachOHeaderMU = buildSimpleMachOHeaderMU, std::optional RuntimeAliases = std::nullopt); @@ -104,6 +129,7 @@ class MachOPlatform : public Platform { static Expected> Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, const char *OrcRuntimePath, + HeaderOptions PlatformJDOpts = {}, MachOHeaderMUBuilder BuildMachOHeaderMU = buildSimpleMachOHeaderMU, std::optional RuntimeAliases = std::nullopt); @@ -115,6 +141,11 @@ class MachOPlatform : public Platform { } Error setupJITDylib(JITDylib &JD) override; + + /// Install any platform-specific symbols (e.g. `__dso_handle`) and create a + /// mach-o header based on the given options. + Error setupJITDylib(JITDylib &JD, HeaderOptions Opts); + Error teardownJITDylib(JITDylib &JD) override; Error notifyAdding(ResourceTracker &RT, const MaterializationUnit &MU) override; @@ -258,6 +289,7 @@ class MachOPlatform : public Platform { MachOPlatform(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, std::unique_ptr OrcRuntimeGenerator, + HeaderOptions PlatformJDOpts, MachOHeaderMUBuilder BuildMachOHeaderMU, Error &Err); // Associate MachOPlatform JIT-side runtime support functions with handlers. @@ -336,7 +368,8 @@ class MachOPlatform : public Platform { // Generates a MachO header. class SimpleMachOHeaderMU : public MaterializationUnit { public: - SimpleMachOHeaderMU(MachOPlatform &MOP, SymbolStringPtr HeaderStartSymbol); + SimpleMachOHeaderMU(MachOPlatform &MOP, SymbolStringPtr HeaderStartSymbol, + MachOPlatform::HeaderOptions Opts); StringRef getName() const override { return "MachOHeaderMU"; } void materialize(std::unique_ptr R) override; void discard(const JITDylib &JD, const SymbolStringPtr &Sym) override; @@ -346,6 +379,7 @@ class SimpleMachOHeaderMU : public MaterializationUnit { jitlink::Section &HeaderSection); MachOPlatform &MOP; + MachOPlatform::HeaderOptions Opts; private: struct HeaderSymbol { @@ -365,8 +399,10 @@ class SimpleMachOHeaderMU : public MaterializationUnit { /// Simple MachO header graph builder. inline std::unique_ptr -MachOPlatform::buildSimpleMachOHeaderMU(MachOPlatform &MOP) { - return std::make_unique(MOP, MOP.MachOHeaderStartSymbol); +MachOPlatform::buildSimpleMachOHeaderMU(MachOPlatform &MOP, + HeaderOptions Opts) { + return std::make_unique(MOP, MOP.MachOHeaderStartSymbol, + std::move(Opts)); } struct MachOHeaderInfo { diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp index 6c17f14aa4c7c..377a31e63ec11 100644 --- a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp @@ -255,12 +255,11 @@ struct ObjCImageInfoFlags { namespace llvm { namespace orc { -Expected> -MachOPlatform::Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, - std::unique_ptr OrcRuntime, - MachOHeaderMUBuilder BuildMachOHeaderMU, - std::optional RuntimeAliases) { +Expected> MachOPlatform::Create( + ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, + JITDylib &PlatformJD, std::unique_ptr OrcRuntime, + HeaderOptions PlatformJDOpts, MachOHeaderMUBuilder BuildMachOHeaderMU, + std::optional RuntimeAliases) { // If the target is not supported then bail out immediately. 
if (!supportedTarget(ES.getTargetTriple())) @@ -290,9 +289,9 @@ MachOPlatform::Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, // Create the instance. Error Err = Error::success(); - auto P = std::unique_ptr( - new MachOPlatform(ES, ObjLinkingLayer, PlatformJD, std::move(OrcRuntime), - std::move(BuildMachOHeaderMU), Err)); + auto P = std::unique_ptr(new MachOPlatform( + ES, ObjLinkingLayer, PlatformJD, std::move(OrcRuntime), + std::move(PlatformJDOpts), std::move(BuildMachOHeaderMU), Err)); if (Err) return std::move(Err); return std::move(P); @@ -301,6 +300,7 @@ MachOPlatform::Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, Expected> MachOPlatform::Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, const char *OrcRuntimePath, + HeaderOptions PlatformJDOpts, MachOHeaderMUBuilder BuildMachOHeaderMU, std::optional RuntimeAliases) { @@ -312,11 +312,16 @@ MachOPlatform::Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, return Create(ES, ObjLinkingLayer, PlatformJD, std::move(*OrcRuntimeArchiveGenerator), - std::move(BuildMachOHeaderMU), std::move(RuntimeAliases)); + std::move(PlatformJDOpts), std::move(BuildMachOHeaderMU), + std::move(RuntimeAliases)); } Error MachOPlatform::setupJITDylib(JITDylib &JD) { - if (auto Err = JD.define(BuildMachOHeaderMU(*this))) + return setupJITDylib(JD, /*Opts=*/{}); +} + +Error MachOPlatform::setupJITDylib(JITDylib &JD, HeaderOptions Opts) { + if (auto Err = JD.define(BuildMachOHeaderMU(*this, std::move(Opts)))) return Err; return ES.lookup({&JD}, MachOHeaderStartSymbol).takeError(); @@ -432,7 +437,8 @@ MachOPlatform::MachOPlatform( ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, std::unique_ptr OrcRuntimeGenerator, - MachOHeaderMUBuilder BuildMachOHeaderMU, Error &Err) + HeaderOptions PlatformJDOpts, MachOHeaderMUBuilder BuildMachOHeaderMU, + Error &Err) : ES(ES), PlatformJD(PlatformJD), ObjLinkingLayer(ObjLinkingLayer), BuildMachOHeaderMU(std::move(BuildMachOHeaderMU)) { ErrorAsOutParameter _(&Err); @@ -497,7 +503,8 @@ MachOPlatform::MachOPlatform( // the support methods callable. The bootstrap is now complete. // Step (1) Add header materialization unit and request. 
- if ((Err = PlatformJD.define(this->BuildMachOHeaderMU(*this)))) + if ((Err = PlatformJD.define( + this->BuildMachOHeaderMU(*this, std::move(PlatformJDOpts))))) return; if ((Err = ES.lookup(&PlatformJD, MachOHeaderStartSymbol).takeError())) return; @@ -1669,9 +1676,10 @@ Error MachOPlatform::MachOPlatformPlugin::addSymbolTableRegistration( } template -jitlink::Block &createTrivialHeaderBlock(MachOPlatform &MOP, - jitlink::LinkGraph &G, - jitlink::Section &HeaderSection) { +jitlink::Block &createHeaderBlock(MachOPlatform &MOP, + const MachOPlatform::HeaderOptions &Opts, + JITDylib &JD, jitlink::LinkGraph &G, + jitlink::Section &HeaderSection) { auto HdrInfo = getMachOHeaderInfoFromTriple(MOP.getExecutionSession().getTargetTriple()); MachOBuilder B(HdrInfo.PageSize); @@ -1680,6 +1688,19 @@ jitlink::Block &createTrivialHeaderBlock(MachOPlatform &MOP, B.Header.cputype = HdrInfo.CPUType; B.Header.cpusubtype = HdrInfo.CPUSubType; + if (Opts.IDDylib) + B.template addLoadCommand( + Opts.IDDylib->Name, Opts.IDDylib->Timestamp, + Opts.IDDylib->CurrentVersion, Opts.IDDylib->CompatibilityVersion); + else + B.template addLoadCommand(JD.getName(), 0, 0, 0); + + for (auto &D : Opts.LoadDylibs) + B.template addLoadCommand( + D.Name, D.Timestamp, D.CurrentVersion, D.CompatibilityVersion); + for (auto &P : Opts.RPaths) + B.template addLoadCommand(P); + auto HeaderContent = G.allocateBuffer(B.layout()); B.write(HeaderContent); @@ -1688,10 +1709,11 @@ jitlink::Block &createTrivialHeaderBlock(MachOPlatform &MOP, } SimpleMachOHeaderMU::SimpleMachOHeaderMU(MachOPlatform &MOP, - SymbolStringPtr HeaderStartSymbol) + SymbolStringPtr HeaderStartSymbol, + MachOPlatform::HeaderOptions Opts) : MaterializationUnit( createHeaderInterface(MOP, std::move(HeaderStartSymbol))), - MOP(MOP) {} + MOP(MOP), Opts(std::move(Opts)) {} void SimpleMachOHeaderMU::materialize( std::unique_ptr R) { @@ -1725,7 +1747,7 @@ SimpleMachOHeaderMU::createHeaderBlock(JITDylib &JD, jitlink::LinkGraph &G, switch (MOP.getExecutionSession().getTargetTriple().getArch()) { case Triple::aarch64: case Triple::x86_64: - return createTrivialHeaderBlock(MOP, G, HeaderSection); + return ::createHeaderBlock(MOP, Opts, JD, G, HeaderSection); default: llvm_unreachable("Unsupported architecture"); } From 15c1c85470a17283bd86fe68a702c74599bdcb5c Mon Sep 17 00:00:00 2001 From: Andrei Golubev Date: Thu, 18 Jan 2024 19:16:48 +0200 Subject: [PATCH 017/843] [LLVM][ADT] Convert llvm::hash_code to unsigned explicitly in DenseMapInfo (#77743) The getHashValue() signature returns a value of type 'unsigned' while the hash_code could only be implicitly converted to 'size_t'. Depending on the C++ implementation, this may or may not be a narrowing conversion. On some platform/compiler combination, this becomes a warning. To avoid the warning (and better highlight the narrowing), do an explicit conversion instead. 
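
A minimal sketch of the conversion in question (illustrative only; the
hash_code_like type stands in for llvm::hash_code, which converts
implicitly to size_t):

    #include <cstddef>

    struct hash_code_like {
      std::size_t value;
      operator std::size_t() const { return value; } // implicit, like hash_code
    };

    unsigned getHashValue(hash_code_like val) {
      // return val;  // implicit size_t -> unsigned narrows on LP64 targets
      //              // and may draw a warning on some toolchains
      return static_cast<unsigned>(std::size_t(val)); // explicit, warning-free
    }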
Co-authored-by: Orest Chura
---
 llvm/include/llvm/ADT/Hashing.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/Hashing.h b/llvm/include/llvm/ADT/Hashing.h
index 4e82ba9a6fba2..a5477362a5079 100644
--- a/llvm/include/llvm/ADT/Hashing.h
+++ b/llvm/include/llvm/ADT/Hashing.h
@@ -677,7 +677,9 @@ template <typename T> hash_code hash_value(const std::optional<T> &arg) {
 template <> struct DenseMapInfo<hash_code, void> {
   static inline hash_code getEmptyKey() { return hash_code(-1); }
   static inline hash_code getTombstoneKey() { return hash_code(-2); }
-  static unsigned getHashValue(hash_code val) { return val; }
+  static unsigned getHashValue(hash_code val) {
+    return static_cast<unsigned>(size_t(val));
+  }
   static bool isEqual(hash_code LHS, hash_code RHS) { return LHS == RHS; }
 };

From 964565f42eb8f06dc8610b44d56b432a8c2cb50b Mon Sep 17 00:00:00 2001
From: Jonas Paulsson
Date: Thu, 18 Jan 2024 18:23:57 +0100
Subject: [PATCH 018/843] [SystemZ] i128 cost model (#78528)

Update SystemZTTI to reflect the recent change of handling i128 as a
legal type in vector registers.
---
 .../SystemZ/SystemZTargetTransformInfo.cpp    | 107 +++++++----
 .../SystemZ/SystemZTargetTransformInfo.h      |   2 +
 .../CostModel/SystemZ/i128-cmp-ext-conv.ll    | 143 ++++++++++++++++++
 .../Analysis/CostModel/SystemZ/int-arith.ll   |   6 +
 .../Analysis/CostModel/SystemZ/intrinsics.ll  |   9 ++
 .../Analysis/CostModel/SystemZ/load_store.ll  |  72 +++++----
 .../Analysis/CostModel/SystemZ/logic-i128.ll  |  48 ++++++
 .../Analysis/CostModel/SystemZ/logical.ll     |  12 ++
 8 files changed, 337 insertions(+), 62 deletions(-)
 create mode 100644 llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll
 create mode 100644 llvm/test/Analysis/CostModel/SystemZ/logic-i128.ll

diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index e21d3090ba2fd..9370fb51a96c5 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -75,8 +75,8 @@ InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
   // here, so that constant hoisting will ignore this constant.
   if (BitSize == 0)
     return TTI::TCC_Free;
-  // No cost model for operations on integers larger than 64 bit implemented yet.
-  if (BitSize > 64)
+  // No cost model for operations on integers larger than 128 bit implemented yet.
+  if ((!ST->hasVector() && BitSize > 64) || BitSize > 128)
     return TTI::TCC_Free;

   if (Imm == 0)
@@ -96,7 +96,8 @@ InstructionCost SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
     return 2 * TTI::TCC_Basic;
   }

-  return 4 * TTI::TCC_Basic;
+  // i128 immediates loads from Constant Pool
+  return 2 * TTI::TCC_Basic;
 }

 InstructionCost SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
@@ -479,21 +480,27 @@ InstructionCost SystemZTTIImpl::getArithmeticInstrCost(
     return LIBCALL_COST;

   // Give discount for some combined logical operations if supported.
- if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) { + if (Args.size() == 2) { if (Opcode == Instruction::Xor) { for (const Value *A : Args) { if (const Instruction *I = dyn_cast(A)) if (I->hasOneUse() && - (I->getOpcode() == Instruction::And || - I->getOpcode() == Instruction::Or || + (I->getOpcode() == Instruction::Or || + I->getOpcode() == Instruction::And || I->getOpcode() == Instruction::Xor)) - return 0; + if ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) || + (isInt128InVR(Ty) && + (I->getOpcode() == Instruction::Or || ST->hasVectorEnhancements1()))) + return 0; } } - else if (Opcode == Instruction::Or || Opcode == Instruction::And) { + else if (Opcode == Instruction::And || Opcode == Instruction::Or) { for (const Value *A : Args) { if (const Instruction *I = dyn_cast(A)) - if (I->hasOneUse() && I->getOpcode() == Instruction::Xor) + if ((I->hasOneUse() && I->getOpcode() == Instruction::Xor) && + ((ScalarBits <= 64 && ST->hasMiscellaneousExtensions3()) || + (isInt128InVR(Ty) && + (Opcode == Instruction::And || ST->hasVectorEnhancements1())))) return 0; } } @@ -774,29 +781,63 @@ InstructionCost SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, assert (!Dst->isVectorTy()); if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) { + if (Src->isIntegerTy(128)) + return LIBCALL_COST; if (SrcScalarBits >= 32 || (I != nullptr && isa(I->getOperand(0)))) return 1; return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/; } - if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) && - Src->isIntegerTy(1)) { - if (ST->hasLoadStoreOnCond2()) - return 2; // li 0; loc 1 - - // This should be extension of a compare i1 result, which is done with - // ipm and a varying sequence of instructions. - unsigned Cost = 0; - if (Opcode == Instruction::SExt) - Cost = (DstScalarBits < 64 ? 3 : 4); - if (Opcode == Instruction::ZExt) - Cost = 3; - Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr); - if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy()) - // If operands of an fp-type was compared, this costs +1. - Cost++; - return Cost; + if ((Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI) && + Dst->isIntegerTy(128)) + return LIBCALL_COST; + + if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt)) { + if (Src->isIntegerTy(1)) { + if (DstScalarBits == 128) + return 5 /*branch seq.*/; + + if (ST->hasLoadStoreOnCond2()) + return 2; // li 0; loc 1 + + // This should be extension of a compare i1 result, which is done with + // ipm and a varying sequence of instructions. + unsigned Cost = 0; + if (Opcode == Instruction::SExt) + Cost = (DstScalarBits < 64 ? 3 : 4); + if (Opcode == Instruction::ZExt) + Cost = 3; + Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr); + if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy()) + // If operands of an fp-type was compared, this costs +1. + Cost++; + return Cost; + } + else if (isInt128InVR(Dst)) { + // Extensions from GPR to i128 (in VR) typically costs two instructions, + // but a zero-extending load would be just one extra instruction. + if (Opcode == Instruction::ZExt && I != nullptr) + if (LoadInst *Ld = dyn_cast(I->getOperand(0))) + if (Ld->hasOneUse()) + return 1; + return 2; + } + } + + if (Opcode == Instruction::Trunc && isInt128InVR(Src) && I != nullptr) { + if (LoadInst *Ld = dyn_cast(I->getOperand(0))) + if (Ld->hasOneUse()) + return 0; // Will be converted to GPR load. 
+      bool OnlyTruncatingStores = true;
+      for (const User *U : I->users())
+        if (!isa<StoreInst>(U)) {
+          OnlyTruncatingStores = false;
+          break;
+        }
+      if (OnlyTruncatingStores)
+        return 0;
+      return 2; // Vector element extraction.
     }
   }
   else if (ST->hasVector()) {
@@ -930,7 +971,7 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
       // A loaded value compared with 0 with multiple users becomes Load and
       // Test. The load is then not foldable, so return 0 cost for the ICmp.
       unsigned ScalarBits = ValTy->getScalarSizeInBits();
-      if (I != nullptr && ScalarBits >= 32)
+      if (I != nullptr && (ScalarBits == 32 || ScalarBits == 64))
        if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
          if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
            if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
@@ -943,8 +984,8 @@ InstructionCost SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
       return Cost;
     }
   case Instruction::Select:
-    if (ValTy->isFloatingPointTy())
-      return 4; // No load on condition for FP - costs a conditional jump.
+    if (ValTy->isFloatingPointTy() || isInt128InVR(ValTy))
+      return 4; // No LOC for FP / i128 - costs a conditional jump.
     return 1; // Load On Condition / Select Register.
   }
 }
@@ -1157,6 +1198,10 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
                                   CostKind);
 
+  // FP128 is a legal type but kept in a register pair on older CPUs.
+  if (Src->isFP128Ty() && !ST->hasVectorEnhancements1())
+    return 2;
+
   unsigned NumOps =
     (Src->isVectorTy() ? getNumVectorRegs(Src) : getNumberOfParts(Src));
 
@@ -1177,10 +1222,6 @@ InstructionCost SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
     }
   }
 
-  if (Src->getScalarSizeInBits() == 128)
-    // 128 bit scalars are held in a pair of two 64 bit registers.
- NumOps *= 2; - return NumOps; } diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h index 62c59ddc3f06a..2cccdf6d17dac 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h +++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h @@ -28,6 +28,8 @@ class SystemZTTIImpl : public BasicTTIImplBase { unsigned const LIBCALL_COST = 30; + bool isInt128InVR(Type *Ty) { return Ty->isIntegerTy(128) && ST->hasVector(); } + public: explicit SystemZTTIImpl(const SystemZTargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), diff --git a/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll b/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll new file mode 100644 index 0000000000000..66da6de3bc768 --- /dev/null +++ b/llvm/test/Analysis/CostModel/SystemZ/i128-cmp-ext-conv.ll @@ -0,0 +1,143 @@ +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s +; + +define i128 @fun1(i128 %val1, i128 %val2) { +; CHECK-LABEL: 'fun1' +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp eq i128 %val1, %val2 +; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %v128 = sext i1 %cmp to i128 + %cmp = icmp eq i128 %val1, %val2 + %v128 = sext i1 %cmp to i128 + ret i128 %v128 +} + +define i128 @fun2(i128 %val1, i128 %val2) { +; CHECK-LABEL: 'fun2' +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp eq i128 %val1, %val2 +; CHECK: Cost Model: Found an estimated cost of 5 for instruction: %v128 = zext i1 %cmp to i128 + %cmp = icmp eq i128 %val1, %val2 + %v128 = zext i1 %cmp to i128 + ret i128 %v128 +} + +define i128 @fun3(i128 %val1, i128 %val2, + i128 %val3, i128 %val4) { +; CHECK-LABEL: 'fun3' +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %cmp = icmp eq i128 %val1, %val2 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %add = add i128 %val3, %val4 +; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %sel = select i1 %cmp, i128 %val3, i128 %add + %cmp = icmp eq i128 %val1, %val2 + %add = add i128 %val3, %val4 + %sel = select i1 %cmp, i128 %val3, i128 %add + ret i128 %sel +} + +define i128 @fun4(ptr %src) { +; CHECK-LABEL: 'fun4' +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = sext i64 %v to i128 + %v = load i64, ptr %src, align 8 + %res = sext i64 %v to i128 + ret i128 %res +} + +define i128 @fun5(i64 %lhs, i64 %rhs) { +; CHECK-LABEL: 'fun5' +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = sext i64 %v to i128 + %v = add i64 %lhs, %rhs + %res = sext i64 %v to i128 + ret i128 %res +} + +define i128 @fun6(ptr %src) { +; CHECK-LABEL: 'fun6' +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res = zext i64 %v to i128 + %v = load i64, ptr %src, align 8 + %res = zext i64 %v to i128 + ret i128 %res +} + +define i128 @fun7(i64 %lhs, i64 %rhs) { +; CHECK-LABEL: 'fun7' +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %res = zext i64 %v to i128 + %v = add i64 %lhs, %rhs + %res = zext i64 %v to i128 + ret i128 %res +} + +; Truncating store is free. 
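+; (No separate extraction into a GPR is needed when every user is a store.)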
+define void @fun8(i128 %lhs, i128 %rhs, ptr %dst) { +; CHECK-LABEL: 'fun8' +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %t = trunc i128 %v to i64 + %v = add i128 %lhs, %rhs + %t = trunc i128 %v to i64 + store i64 %t, ptr %dst, align 8 + ret void +} + +; If there is a non-store user, an extraction is needed. +define i64 @fun9(i128 %lhs, i128 %rhs, ptr %dst) { +; CHECK-LABEL: 'fun9' +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %t = trunc i128 %v to i64 + %v = add i128 %lhs, %rhs + %t = trunc i128 %v to i64 + store i64 %t, ptr %dst, align 8 + ret i64 %t +} + +; Truncation of load is free. +define i64 @fun10(ptr %src) { +; CHECK-LABEL: 'fun10' +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %t = trunc i128 %v to i64 + %v = load i128, ptr %src, align 8 + %t = trunc i128 %v to i64 + ret i64 %t +} + +; If the load has another user, the truncation becomes an extract. +define i64 @fun11(ptr %src, i128 %val2, ptr %dst) { +; CHECK-LABEL: 'fun11' +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %t = trunc i128 %v to i64 + %v = load i128, ptr %src, align 8 + %t = trunc i128 %v to i64 + %a = add i128 %v, %val2 + store i128 %a, ptr %dst + ret i64 %t +} + +; Trunction with a GPR use typically requires an extraction. +define i64 @fun12(i128 %lhs, i128 %rhs) { +; CHECK-LABEL: 'fun12' +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %t = trunc i128 %v to i64 + %v = add i128 %lhs, %rhs + %t = trunc i128 %v to i64 + ret i64 %t +} + +; Fp<->Int conversions require libcalls. +define void @fun13() { +; CHECK-LABEL: 'fun13' +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v0 = fptosi fp128 undef to i128 +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v1 = fptosi double undef to i128 +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v2 = fptosi float undef to i128 +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v3 = fptoui fp128 undef to i128 +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v4 = fptoui double undef to i128 +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v5 = fptoui float undef to i128 +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v6 = sitofp i128 undef to fp128 +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v7 = sitofp i128 undef to double +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v8 = sitofp i128 undef to float +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v9 = uitofp i128 undef to fp128 +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v10 = uitofp i128 undef to double +; CHECK: Cost Model: Found an estimated cost of 30 for instruction: %v11 = uitofp i128 undef to float + %v0 = fptosi fp128 undef to i128 + %v1 = fptosi double undef to i128 + %v2 = fptosi float undef to i128 + %v3 = fptoui fp128 undef to i128 + %v4 = fptoui double undef to i128 + %v5 = fptoui float undef to i128 + %v6 = sitofp i128 undef to fp128 + %v7 = sitofp i128 undef to double + %v8 = sitofp i128 undef to float + %v9 = uitofp i128 undef to fp128 + %v10 = uitofp i128 undef to double + %v11 = uitofp i128 undef to float + ret void +} diff --git a/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll b/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll index 71863b923ca38..fc4d19c5cdf9e 100644 --- a/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll +++ 
b/llvm/test/Analysis/CostModel/SystemZ/int-arith.ll @@ -8,6 +8,7 @@ define void @add() { %res1 = add i16 undef, undef %res2 = add i32 undef, undef %res3 = add i64 undef, undef + %resQ = add i128 undef, undef %res4 = add <2 x i8> undef, undef %res5 = add <2 x i16> undef, undef %res6 = add <2 x i32> undef, undef @@ -29,6 +30,7 @@ define void @add() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = add i16 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = add i32 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = add i64 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %resQ = add i128 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = add <2 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = add <2 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = add <2 x i32> undef, undef @@ -54,6 +56,7 @@ define void @sub() { %res1 = sub i16 undef, undef %res2 = sub i32 undef, undef %res3 = sub i64 undef, undef + %resQ = sub i128 undef, undef %res4 = sub <2 x i8> undef, undef %res5 = sub <2 x i16> undef, undef %res6 = sub <2 x i32> undef, undef @@ -75,6 +78,7 @@ define void @sub() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = sub i16 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = sub i32 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = sub i64 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %resQ = sub i128 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = sub <2 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = sub <2 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = sub <2 x i32> undef, undef @@ -100,6 +104,7 @@ define void @mul() { %res1 = mul i16 undef, undef %res2 = mul i32 undef, undef %res3 = mul i64 undef, undef + %resQ = mul i128 undef, undef %res4 = mul <2 x i8> undef, undef %res5 = mul <2 x i16> undef, undef %res6 = mul <2 x i32> undef, undef @@ -121,6 +126,7 @@ define void @mul() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = mul i16 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = mul i32 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = mul i64 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %resQ = mul i128 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = mul <2 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = mul <2 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = mul <2 x i32> undef, undef diff --git a/llvm/test/Analysis/CostModel/SystemZ/intrinsics.ll b/llvm/test/Analysis/CostModel/SystemZ/intrinsics.ll index d3e07fa9735b3..032b78099c571 100644 --- a/llvm/test/Analysis/CostModel/SystemZ/intrinsics.ll +++ b/llvm/test/Analysis/CostModel/SystemZ/intrinsics.ll @@ -3,6 +3,13 @@ ; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z15 \ ; RUN: | FileCheck %s -check-prefixes=CHECK,Z15 +define void @bswap_i128(i128 %arg) { +; CHECK: function 'bswap_i128' +; CHECK: Cost Model: 
Found an estimated cost of 1 for instruction: %swp = tail call i128 @llvm.bswap.i128(i128 %arg) + %swp = tail call i128 @llvm.bswap.i128(i128 %arg) + ret void +} + define void @bswap_i64(i64 %arg, <2 x i64> %arg2) { ; CHECK: function 'bswap_i64' ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %swp1 = tail call i64 @@ -186,6 +193,8 @@ define void @bswap_v8i16_mem(ptr %src, <8 x i16> %arg, ptr %dst) { ret void } +declare i128 @llvm.bswap.i128(i128) + declare i64 @llvm.bswap.i64(i64) declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>) declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>) diff --git a/llvm/test/Analysis/CostModel/SystemZ/load_store.ll b/llvm/test/Analysis/CostModel/SystemZ/load_store.ll index 1766dd3b2859e..4d36c9ed421e0 100644 --- a/llvm/test/Analysis/CostModel/SystemZ/load_store.ll +++ b/llvm/test/Analysis/CostModel/SystemZ/load_store.ll @@ -1,10 +1,13 @@ -; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=zEC12 | FileCheck %s --check-prefixes=CHECK,ZEC12 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 | FileCheck %s --check-prefixes=CHECK,Z13 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z14 | FileCheck %s --check-prefixes=CHECK,Z14 define void @store() { store i8 undef, ptr undef store i16 undef, ptr undef store i32 undef, ptr undef store i64 undef, ptr undef + store i128 undef, ptr undef store float undef, ptr undef store double undef, ptr undef store fp128 undef, ptr undef @@ -37,9 +40,14 @@ define void @store() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store i16 undef, ptr undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store i32 undef, ptr undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store i64 undef, ptr undef +; ZEC12: Cost Model: Found an estimated cost of 2 for instruction: store i128 undef, ptr undef +; Z13: Cost Model: Found an estimated cost of 1 for instruction: store i128 undef, ptr undef +; Z14: Cost Model: Found an estimated cost of 1 for instruction: store i128 undef, ptr undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store float undef, ptr undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store double undef, ptr undef -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: store fp128 undef, ptr undef +; ZEC12: Cost Model: Found an estimated cost of 2 for instruction: store fp128 undef, ptr undef +; Z13: Cost Model: Found an estimated cost of 2 for instruction: store fp128 undef, ptr undef +; Z14: Cost Model: Found an estimated cost of 1 for instruction: store fp128 undef, ptr undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i8> undef, ptr undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i16> undef, ptr undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: store <2 x i32> undef, ptr undef @@ -73,6 +81,7 @@ define void @load() { load i16, ptr undef load i32, ptr undef load i64, ptr undef + load i128, ptr undef load float, ptr undef load double, ptr undef load fp128, ptr undef @@ -105,33 +114,38 @@ define void @load() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %2 = load i16, ptr undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %3 = load i32, ptr undef ; CHECK: Cost 
Model: Found an estimated cost of 1 for instruction: %4 = load i64, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %5 = load float, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %6 = load double, ptr undef -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %7 = load fp128, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %8 = load <2 x i8>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i16>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i32>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <2 x i64>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <2 x float>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <2 x double>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <4 x i8>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x i16>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x i32>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %17 = load <4 x i64>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %18 = load <4 x float>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %19 = load <4 x double>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %20 = load <8 x i8>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i16>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %22 = load <8 x i32>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %23 = load <8 x i64>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %24 = load <8 x float>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %25 = load <8 x double>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %26 = load <16 x i8>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %27 = load <16 x i16>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %28 = load <16 x i32>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %29 = load <16 x i64>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %30 = load <16 x float>, ptr undef -; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %31 = load <16 x double>, ptr undef +; ZEC12: Cost Model: Found an estimated cost of 2 for instruction: %5 = load i128, ptr undef +; Z13: Cost Model: Found an estimated cost of 1 for instruction: %5 = load i128, ptr undef +; Z14: Cost Model: Found an estimated cost of 1 for instruction: %5 = load i128, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %6 = load float, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %7 = load double, ptr undef +; ZEC12: Cost Model: Found an estimated cost of 2 for instruction: %8 = load fp128, ptr undef +; Z13: Cost Model: Found an estimated cost of 2 for instruction: %8 = load fp128, ptr undef +; Z14: Cost Model: Found an estimated cost of 1 for instruction: %8 = load fp128, ptr undef +; CHECK: Cost 
Model: Found an estimated cost of 1 for instruction: %9 = load <2 x i8>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %10 = load <2 x i16>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %11 = load <2 x i32>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %12 = load <2 x i64>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %13 = load <2 x float>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %14 = load <2 x double>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %15 = load <4 x i8>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %16 = load <4 x i16>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %17 = load <4 x i32>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %18 = load <4 x i64>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %19 = load <4 x float>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %20 = load <4 x double>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %21 = load <8 x i8>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %22 = load <8 x i16>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %23 = load <8 x i32>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %24 = load <8 x i64>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %25 = load <8 x float>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %26 = load <8 x double>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %27 = load <16 x i8>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 2 for instruction: %28 = load <16 x i16>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %29 = load <16 x i32>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %30 = load <16 x i64>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 4 for instruction: %31 = load <16 x float>, ptr undef +; CHECK: Cost Model: Found an estimated cost of 8 for instruction: %32 = load <16 x double>, ptr undef ret void; } diff --git a/llvm/test/Analysis/CostModel/SystemZ/logic-i128.ll b/llvm/test/Analysis/CostModel/SystemZ/logic-i128.ll new file mode 100644 index 0000000000000..f4c4fceed717f --- /dev/null +++ b/llvm/test/Analysis/CostModel/SystemZ/logic-i128.ll @@ -0,0 +1,48 @@ +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z13 \ +; RUN: | FileCheck %s -check-prefixes=CHECK,Z13 +; RUN: opt < %s -passes="print" 2>&1 -disable-output -mtriple=systemz-unknown -mcpu=z14 \ +; RUN: | FileCheck %s -check-prefixes=CHECK,Z14 + +define void @fun(i128 %a) { +; CHECK-LABEL: 'fun' +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c0 = xor i128 %l0, -1 +; Z13: Cost Model: Found an estimated cost of 1 for instruction: %res0 = or i128 %a, %c0 +; Z14: Cost Model: Found an estimated cost of 0 for instruction: %res0 = or i128 %a, %c0 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c1 = xor i128 %l1, -1 +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %res1 = and i128 %a, %c1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c2 = and i128 %l2, 
%a +; Z13: Cost Model: Found an estimated cost of 1 for instruction: %res2 = xor i128 %c2, -1 +; Z14: Cost Model: Found an estimated cost of 0 for instruction: %res2 = xor i128 %c2, -1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c3 = or i128 %l3, %a +; CHECK: Cost Model: Found an estimated cost of 0 for instruction: %res3 = xor i128 %c3, -1 +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %c4 = xor i128 %l4, %a +; Z13: Cost Model: Found an estimated cost of 1 for instruction: %res4 = xor i128 %c4, -1 +; Z14: Cost Model: Found an estimated cost of 0 for instruction: %res4 = xor i128 %c4, -1 +; + %l0 = load i128, ptr undef + %c0 = xor i128 %l0, -1 + %res0 = or i128 %a, %c0 + store i128 %res0, ptr undef + + %l1 = load i128, ptr undef + %c1 = xor i128 %l1, -1 + %res1 = and i128 %a, %c1 + store i128 %res1, ptr undef + + %l2 = load i128, ptr undef + %c2 = and i128 %l2, %a + %res2 = xor i128 %c2, -1 + store i128 %res2, ptr undef + + %l3 = load i128, ptr undef + %c3 = or i128 %l3, %a + %res3 = xor i128 %c3, -1 + store i128 %res3, ptr undef + + %l4 = load i128, ptr undef + %c4 = xor i128 %l4, %a + %res4 = xor i128 %c4, -1 + store i128 %res4, ptr undef + + ret void +} diff --git a/llvm/test/Analysis/CostModel/SystemZ/logical.ll b/llvm/test/Analysis/CostModel/SystemZ/logical.ll index 29935d6895fc0..c87a3836ded6b 100644 --- a/llvm/test/Analysis/CostModel/SystemZ/logical.ll +++ b/llvm/test/Analysis/CostModel/SystemZ/logical.ll @@ -5,6 +5,7 @@ define void @and() { %res1 = and i16 undef, undef %res2 = and i32 undef, undef %res3 = and i64 undef, undef + %resQ = and i128 undef, undef %res4 = and <2 x i8> undef, undef %res5 = and <2 x i16> undef, undef %res6 = and <2 x i32> undef, undef @@ -26,6 +27,7 @@ define void @and() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = and i16 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = and i32 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = and i64 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %resQ = and i128 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = and <2 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = and <2 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = and <2 x i32> undef, undef @@ -51,6 +53,7 @@ define void @ashr() { %res1 = ashr i16 undef, undef %res2 = ashr i32 undef, undef %res3 = ashr i64 undef, undef + %resQ = ashr i128 undef, undef %res4 = ashr <2 x i8> undef, undef %res5 = ashr <2 x i16> undef, undef %res6 = ashr <2 x i32> undef, undef @@ -72,6 +75,7 @@ define void @ashr() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = ashr i16 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = ashr i32 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = ashr i64 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %resQ = ashr i128 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = ashr <2 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = ashr <2 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = ashr <2 x i32> undef, undef @@ -97,6 +101,7 @@ define void @lshr() { %res1 = lshr i16 undef, undef 
%res2 = lshr i32 undef, undef %res3 = lshr i64 undef, undef + %resQ = lshr i128 undef, undef %res4 = lshr <2 x i8> undef, undef %res5 = lshr <2 x i16> undef, undef %res6 = lshr <2 x i32> undef, undef @@ -118,6 +123,7 @@ define void @lshr() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = lshr i16 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = lshr i32 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = lshr i64 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %resQ = lshr i128 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = lshr <2 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = lshr <2 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = lshr <2 x i32> undef, undef @@ -143,6 +149,7 @@ define void @or() { %res1 = or i16 undef, undef %res2 = or i32 undef, undef %res3 = or i64 undef, undef + %resQ = or i128 undef, undef %res4 = or <2 x i8> undef, undef %res5 = or <2 x i16> undef, undef %res6 = or <2 x i32> undef, undef @@ -164,6 +171,7 @@ define void @or() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = or i16 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = or i32 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = or i64 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %resQ = or i128 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = or <2 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = or <2 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = or <2 x i32> undef, undef @@ -189,6 +197,7 @@ define void @shl() { %res1 = shl i16 undef, undef %res2 = shl i32 undef, undef %res3 = shl i64 undef, undef + %resQ = shl i128 undef, undef %res4 = shl <2 x i8> undef, undef %res5 = shl <2 x i16> undef, undef %res6 = shl <2 x i32> undef, undef @@ -210,6 +219,7 @@ define void @shl() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = shl i16 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = shl i32 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = shl i64 undef, undef +; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %resQ = shl i128 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = shl <2 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = shl <2 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = shl <2 x i32> undef, undef @@ -235,6 +245,7 @@ define void @xor() { %res1 = xor i16 undef, undef %res2 = xor i32 undef, undef %res3 = xor i64 undef, undef + %resQ = xor i128 undef, undef %res4 = xor <2 x i8> undef, undef %res5 = xor <2 x i16> undef, undef %res6 = xor <2 x i32> undef, undef @@ -256,6 +267,7 @@ define void @xor() { ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res1 = xor i16 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res2 = xor i32 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res3 = xor i64 undef, undef +; CHECK: Cost Model: Found an estimated cost 
of 1 for instruction: %resQ = xor i128 undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res4 = xor <2 x i8> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res5 = xor <2 x i16> undef, undef ; CHECK: Cost Model: Found an estimated cost of 1 for instruction: %res6 = xor <2 x i32> undef, undef From 30da0f5a359ab4a684c5fdf0f4dbed20bae10f99 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Thu, 18 Jan 2024 20:24:13 +0300 Subject: [PATCH 019/843] [clang] Pass `-n` to llvm-cxxfilt in codegen tests This is a follow-up to f4fbbebb5edcaad459ce154c011f71fc38fe4052, which addresses macOS test failure. --- clang/test/CXX/drs/dr1807.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/test/CXX/drs/dr1807.cpp b/clang/test/CXX/drs/dr1807.cpp index 0e16eedac71ef..5544e4695f8d0 100644 --- a/clang/test/CXX/drs/dr1807.cpp +++ b/clang/test/CXX/drs/dr1807.cpp @@ -2,9 +2,9 @@ // RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 // RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 // RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 -// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 -// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 -// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 namespace dr1807 { // dr1807: 3.0 struct S { From 7f0515323670459b8376b866bc73a448f0a5aa6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Dominik=20W=C3=B3jt?= Date: Thu, 18 Jan 2024 18:40:52 +0100 Subject: [PATCH 020/843] [libc++] Un-xfail module tests in picolibc tests (#78580) Some of the module tests now pass after picolibc update. https://github.com/llvm/llvm-project/pull/77908 The remaining tests fail and are now set to xfail on picolibc specifically. 
--- .../libcxx/selftest/modules/std-and-std.compat-module.sh.cpp | 4 ++++ libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp | 4 ++++ libcxx/test/std/modules/std.compat.pass.cpp | 4 ++++ libcxx/utils/libcxx/test/features.py | 1 - 4 files changed, 12 insertions(+), 1 deletion(-) diff --git a/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp b/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp index d56ebb2961d4a..81241d7f43f9a 100644 --- a/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp +++ b/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp @@ -12,6 +12,10 @@ // XFAIL: has-no-cxx-module-support +// picolibc does not provide the required timespec_get function, and the +// "using-if-exists" mechanism apparently did not work here. +// XFAIL: LIBCXX-PICOLIBC-FIXME + // Make sure that the compile flags contain the expected elements. // The tests only look for the expected components and not the exact flags. // Otherwise changing the location of the module would break this test. diff --git a/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp b/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp index e84709853fbca..b74c2f1a249fc 100644 --- a/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp +++ b/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp @@ -12,6 +12,10 @@ // XFAIL: has-no-cxx-module-support +// picolibc does not provide the required timespec_get function, and the +// "using-if-exists" mechanism apparently did not work here. +// XFAIL: LIBCXX-PICOLIBC-FIXME + // Make sure that the compile flags contain the expected elements. // The tests only look for the expected components and not the exact flags. // Otherwise changing the location of the module would break this test. diff --git a/libcxx/test/std/modules/std.compat.pass.cpp b/libcxx/test/std/modules/std.compat.pass.cpp index e840f3c6b629c..40ea979e27346 100644 --- a/libcxx/test/std/modules/std.compat.pass.cpp +++ b/libcxx/test/std/modules/std.compat.pass.cpp @@ -12,6 +12,10 @@ // XFAIL: has-no-cxx-module-support +// picolibc does not provide the required timespec_get function, and the +// "using-if-exists" mechanism apparently did not work here. +// XFAIL: LIBCXX-PICOLIBC-FIXME + // A minimal test to validate import works. // MODULE_DEPENDENCIES: std.compat diff --git a/libcxx/utils/libcxx/test/features.py b/libcxx/utils/libcxx/test/features.py index 8c27d8c810676..1983bc91ab171 100644 --- a/libcxx/utils/libcxx/test/features.py +++ b/libcxx/utils/libcxx/test/features.py @@ -324,7 +324,6 @@ def _getAndroidDeviceApi(cfg): # This is not allowed per C11 7.1.2 Standard headers/6 # Any declaration of a library function shall have external linkage. when=lambda cfg: "__ANDROID__" in compilerMacros(cfg) - or "__PICOLIBC__" in compilerMacros(cfg) or platform.system().lower().startswith("aix") # Avoid building on platforms that don't support modules properly. or not hasCompileFlag(cfg, "-Wno-reserved-module-identifier"), From b689e1678c429b724dd898edb9e24cbb9c437667 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Thu, 18 Jan 2024 18:46:46 +0100 Subject: [PATCH 021/843] [libc++] Renames ABI tag. (#78342) The tag name was long for an ABI tag. The name was misleading too, the tag is first introduced in LLVM 18 in 2024 and not in 2023. 
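
For context, an ABI tag is embedded in the mangled name of every symbol
whose type carries it, which is why a short tag matters. A minimal sketch
of the mechanism (hypothetical example, not part of this patch):

```
struct __attribute__((__abi_tag__("llvm18_nua"))) S {};
void f(S) {}  // mangles roughly as _Z1f1SB10llvm18_nua
```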
--------- Co-authored-by: Louis Dionne --- libcxx/include/__config | 4 ++-- libcxx/include/__ranges/chunk_by_view.h | 3 +-- libcxx/include/__ranges/drop_while_view.h | 3 +-- libcxx/include/__ranges/filter_view.h | 2 +- libcxx/include/__ranges/repeat_view.h | 2 +- libcxx/include/__ranges/single_view.h | 2 +- libcxx/include/__ranges/take_while_view.h | 3 +-- libcxx/include/__ranges/transform_view.h | 3 +-- 8 files changed, 9 insertions(+), 13 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index ee272a62711af..90a4585938a13 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -218,8 +218,8 @@ // https://github.com/llvm/llvm-project/issues/70494 // // To fix the bug we had to change the ABI of some classes to remove [[no_unique_address]] under certain conditions. -// The below macro is used for all classes whose ABI have changed as part of fixing these bugs. -# define _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG __attribute__((__abi_tag__("subobj_fix_2023"))) +// The macro below is used for all classes whose ABI have changed as part of fixing these bugs. +# define _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS __attribute__((__abi_tag__("llvm18_nua"))) // Changes the iterator type of select containers (see below) to a bounded iterator that keeps track of whether it's // within the bounds of the original container and asserts it on every dereference. diff --git a/libcxx/include/__ranges/chunk_by_view.h b/libcxx/include/__ranges/chunk_by_view.h index c5b3240a7d0be..b04a23de99fb2 100644 --- a/libcxx/include/__ranges/chunk_by_view.h +++ b/libcxx/include/__ranges/chunk_by_view.h @@ -54,8 +54,7 @@ namespace ranges { template , iterator_t<_View>> _Pred> requires view<_View> && is_object_v<_Pred> -class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG chunk_by_view - : public view_interface> { +class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS chunk_by_view : public view_interface> { _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View(); _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Pred> __pred_; diff --git a/libcxx/include/__ranges/drop_while_view.h b/libcxx/include/__ranges/drop_while_view.h index b367f735c1417..4e3ef61678f4d 100644 --- a/libcxx/include/__ranges/drop_while_view.h +++ b/libcxx/include/__ranges/drop_while_view.h @@ -45,8 +45,7 @@ namespace ranges { template requires input_range<_View> && is_object_v<_Pred> && indirect_unary_predicate> -class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG drop_while_view - : public view_interface> { +class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS drop_while_view : public view_interface> { public: _LIBCPP_HIDE_FROM_ABI drop_while_view() requires default_initializable<_View> && default_initializable<_Pred> diff --git a/libcxx/include/__ranges/filter_view.h b/libcxx/include/__ranges/filter_view.h index ecb78eee38100..6e6719c14470d 100644 --- a/libcxx/include/__ranges/filter_view.h +++ b/libcxx/include/__ranges/filter_view.h @@ -51,7 +51,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD namespace ranges { template > _Pred> requires view<_View> && is_object_v<_Pred> -class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG filter_view : public view_interface> { +class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS filter_view : public view_interface> { _LIBCPP_NO_UNIQUE_ADDRESS _View __base_ = _View(); _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Pred> __pred_; diff --git a/libcxx/include/__ranges/repeat_view.h b/libcxx/include/__ranges/repeat_view.h index 479eca96acb09..d9759abe1cba6 100644 --- a/libcxx/include/__ranges/repeat_view.h +++ 
b/libcxx/include/__ranges/repeat_view.h @@ -68,7 +68,7 @@ struct __fn; template requires(is_object_v<_Tp> && same_as<_Tp, remove_cv_t<_Tp>> && (__integer_like_with_usable_difference_type<_Bound> || same_as<_Bound, unreachable_sentinel_t>)) -class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG repeat_view : public view_interface> { +class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS repeat_view : public view_interface> { friend struct views::__take::__fn; friend struct views::__drop::__fn; class __iterator; diff --git a/libcxx/include/__ranges/single_view.h b/libcxx/include/__ranges/single_view.h index 0ae2036a66a92..ead597a9be170 100644 --- a/libcxx/include/__ranges/single_view.h +++ b/libcxx/include/__ranges/single_view.h @@ -37,7 +37,7 @@ template template # endif requires is_object_v<_Tp> -class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG single_view : public view_interface> { +class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS single_view : public view_interface> { _LIBCPP_NO_UNIQUE_ADDRESS __movable_box<_Tp> __value_; public: diff --git a/libcxx/include/__ranges/take_while_view.h b/libcxx/include/__ranges/take_while_view.h index 4534210d97943..46cfe4f70ac83 100644 --- a/libcxx/include/__ranges/take_while_view.h +++ b/libcxx/include/__ranges/take_while_view.h @@ -43,8 +43,7 @@ namespace ranges { template requires input_range<_View> && is_object_v<_Pred> && indirect_unary_predicate> -class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG take_while_view - : public view_interface> { +class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS take_while_view : public view_interface> { template class __sentinel; diff --git a/libcxx/include/__ranges/transform_view.h b/libcxx/include/__ranges/transform_view.h index 744f597ccef59..3c8d825789cbc 100644 --- a/libcxx/include/__ranges/transform_view.h +++ b/libcxx/include/__ranges/transform_view.h @@ -67,8 +67,7 @@ template template # endif requires __transform_view_constraints<_View, _Fn> -class _LIBCPP_ABI_2023_OVERLAPPING_SUBOBJECT_FIX_TAG transform_view - : public view_interface> { +class _LIBCPP_ABI_LLVM18_NO_UNIQUE_ADDRESS transform_view : public view_interface> { template class __iterator; template From c21f48e5ad1799db41cc9f16541b8365e3b75e63 Mon Sep 17 00:00:00 2001 From: Natalie Chouinard Date: Thu, 18 Jan 2024 12:52:00 -0500 Subject: [PATCH 022/843] [HLSL][SPIR-V] Add Vulkan to target triple (#76749) Add support for specifying the logical SPIR-V target environment in the triple as Vulkan. When compiling HLSL, this replaces the DirectX Shader Model with a Vulkan environment instead. 
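
As a sketch of the new triple format (mirroring the unit tests added
below), a logical SPIR-V triple for Vulkan parses as:

```
llvm::Triple T("spirv1.6-unknown-vulkan1.3-compute");
// T.getArch()          == llvm::Triple::spirv
// T.getSubArch()       == llvm::Triple::SPIRVSubArch_v16
// T.getOS()            == llvm::Triple::Vulkan
// T.getVulkanVersion() == llvm::VersionTuple(1, 3)
```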
Currently, the only supported combinations of SPIR-V version and Vulkan environment are: - Vulkan 1.2 and SPIR-V 1.5 - Vulkan 1.3 and SPIR-V 1.6 Fixes #70051 --- .../clang/Basic/DiagnosticDriverKinds.td | 4 +- clang/lib/Basic/Targets/SPIR.h | 6 +- clang/lib/Frontend/CompilerInvocation.cpp | 27 +++-- .../test/Driver/hlsl-lang-targets-spirv.hlsl | 28 +++-- clang/test/Driver/hlsl-lang-targets.hlsl | 6 +- llvm/include/llvm/TargetParser/Triple.h | 10 +- llvm/lib/TargetParser/Triple.cpp | 32 +++++- llvm/unittests/TargetParser/TripleTest.cpp | 102 ++++++++---------- 8 files changed, 132 insertions(+), 83 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 8b5232a6df395..0fe8798dfb301 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -735,10 +735,10 @@ def err_drv_dxc_missing_target_profile : Error< def err_drv_hlsl_unsupported_target : Error< "HLSL code generation is unsupported for target '%0'">; def err_drv_hlsl_bad_shader_required_in_target : Error< - "shader %select{model|stage}0 is required in target '%1' for HLSL code generation">; + "%select{shader model|Vulkan environment|shader stage}0 is required as %select{OS|environment}1 in target '%2' for HLSL code generation">; def err_drv_hlsl_bad_shader_unsupported : Error< - "shader %select{model|stage}0 '%1' in target '%2' is invalid for HLSL code generation">; + "%select{shader model|Vulkan environment|shader stage}0 '%1' in target '%2' is invalid for HLSL code generation">; def warn_drv_dxc_missing_dxv : Warning<"dxv not found. " "Resulting DXIL will not be validated or signed for use in release environments.">, InGroup; diff --git a/clang/lib/Basic/Targets/SPIR.h b/clang/lib/Basic/Targets/SPIR.h index 9ab2b7c609363..fa4a3bb1c82ee 100644 --- a/clang/lib/Basic/Targets/SPIR.h +++ b/clang/lib/Basic/Targets/SPIR.h @@ -17,6 +17,7 @@ #include "clang/Basic/TargetInfo.h" #include "clang/Basic/TargetOptions.h" #include "llvm/Support/Compiler.h" +#include "llvm/Support/VersionTuple.h" #include "llvm/TargetParser/Triple.h" #include @@ -301,8 +302,9 @@ class LLVM_LIBRARY_VISIBILITY SPIRVTargetInfo : public BaseSPIRVTargetInfo { : BaseSPIRVTargetInfo(Triple, Opts) { assert(Triple.getArch() == llvm::Triple::spirv && "Invalid architecture for Logical SPIR-V."); - assert(Triple.getOS() == llvm::Triple::ShaderModel && - "Logical SPIR-V requires a valid ShaderModel."); + assert(Triple.getOS() == llvm::Triple::Vulkan && + Triple.getVulkanVersion() != llvm::VersionTuple(0) && + "Logical SPIR-V requires a valid Vulkan environment."); assert(Triple.getEnvironment() >= llvm::Triple::Pixel && Triple.getEnvironment() <= llvm::Triple::Amplification && "Logical SPIR-V environment must be a valid shader stage."); diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 11f3f2c2d6425..b5e192d54465b 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -4236,20 +4236,35 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, // TODO: Revisit restricting SPIR-V to logical once we've figured out how to // handle PhysicalStorageBuffer64 memory model if (T.isDXIL() || T.isSPIRVLogical()) { - enum { ShaderModel, ShaderStage }; + enum { ShaderModel, VulkanEnv, ShaderStage }; + enum { OS, Environment }; + + int ExpectedOS = T.isSPIRVLogical() ? 
VulkanEnv : ShaderModel; + if (T.getOSName().empty()) { Diags.Report(diag::err_drv_hlsl_bad_shader_required_in_target) - << ShaderModel << T.str(); - } else if (!T.isShaderModelOS() || T.getOSVersion() == VersionTuple(0)) { - Diags.Report(diag::err_drv_hlsl_bad_shader_unsupported) - << ShaderModel << T.getOSName() << T.str(); + << ExpectedOS << OS << T.str(); } else if (T.getEnvironmentName().empty()) { Diags.Report(diag::err_drv_hlsl_bad_shader_required_in_target) - << ShaderStage << T.str(); + << ShaderStage << Environment << T.str(); } else if (!T.isShaderStageEnvironment()) { Diags.Report(diag::err_drv_hlsl_bad_shader_unsupported) << ShaderStage << T.getEnvironmentName() << T.str(); } + + if (T.isDXIL()) { + if (!T.isShaderModelOS() || T.getOSVersion() == VersionTuple(0)) { + Diags.Report(diag::err_drv_hlsl_bad_shader_unsupported) + << ShaderModel << T.getOSName() << T.str(); + } + } else if (T.isSPIRVLogical()) { + if (!T.isVulkanOS() || T.getVulkanVersion() == VersionTuple(0)) { + Diags.Report(diag::err_drv_hlsl_bad_shader_unsupported) + << VulkanEnv << T.getOSName() << T.str(); + } + } else { + llvm_unreachable("expected DXIL or SPIR-V target"); + } } else Diags.Report(diag::err_drv_hlsl_unsupported_target) << T.str(); } diff --git a/clang/test/Driver/hlsl-lang-targets-spirv.hlsl b/clang/test/Driver/hlsl-lang-targets-spirv.hlsl index e04d71263770b..b86c2e01f8d80 100644 --- a/clang/test/Driver/hlsl-lang-targets-spirv.hlsl +++ b/clang/test/Driver/hlsl-lang-targets-spirv.hlsl @@ -3,29 +3,39 @@ // Supported targets // // RUN: %clang -target dxil-unknown-shadermodel6.2-compute %s -S -o /dev/null 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-VALID %s -// RUN: %clang -target spirv-unknown-shadermodel6.2-compute %s -S -o /dev/null 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-VALID %s +// RUN: %clang -target spirv-unknown-vulkan-compute %s -S -o /dev/null 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-VALID %s +// RUN: %clang -target spirv-unknown-vulkan1.2-compute %s -S -o /dev/null 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-VALID %s +// RUN: %clang -target spirv-unknown-vulkan1.3-compute %s -S -o /dev/null 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-VALID %s +// RUN: %clang -target spirv1.5-unknown-vulkan1.2-compute %s -S -o /dev/null 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-VALID %s +// RUN: %clang -target spirv1.6-unknown-vulkan1.3-compute %s -S -o /dev/null 2>&1 | FileCheck --allow-empty --check-prefix=CHECK-VALID %s -// Empty shader model +// Empty Vulkan environment // // RUN: not %clang -target spirv %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-NO-OS %s -// Invalid shader models +// Invalid Vulkan environment // -// RUN: not %clang -target spirv--unknown %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-OS %s +// RUN: not %clang -target spirv--shadermodel %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-OS %s +// RUN: not %clang -target spirv-unknown-vulkan1.0-compute %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-OS %s +// RUN: not %clang -target spirv1.5-unknown-vulkan1.3-compute %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-OS %s + +// Invalid SPIR-V version +// RUN: not %clang -target spirv1.0-unknown-vulkan-compute %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-TARGET %s // Empty shader stage // -// RUN: not %clang -target spirv--shadermodel6.2 %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-NO-ENV %s +// RUN: not %clang -target spirv--vulkan %s -S 
-o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-NO-ENV %s // Invalid shader stages // -// RUN: not %clang -target spirv--shadermodel6.2-unknown %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-ENV %s +// RUN: not %clang -target spirv--vulkan-unknown %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-ENV %s // CHECK-VALID-NOT: error: -// CHECK-NO-OS: error: shader model is required in target '{{.*}}' for HLSL code generation -// CHECK-BAD-OS: error: shader model '{{.*}}' in target '{{.*}}' is invalid for HLSL code generation -// CHECK-NO-ENV: error: shader stage is required in target '{{.*}}' for HLSL code generation +// CHECK-NO-OS: error: Vulkan environment is required as OS in target '{{.*}}' for HLSL code generation +// CHECK-BAD-OS: error: Vulkan environment '{{.*}}' in target '{{.*}}' is invalid for HLSL code generation +// CHECK-NO-ENV: error: shader stage is required as environment in target '{{.*}}' for HLSL code generation // CHECK-BAD-ENV: error: shader stage '{{.*}}' in target '{{.*}}' is invalid for HLSL code generation +// CHECK-BAD-TARGET: error: HLSL code generation is unsupported for target '{{.*}}' [shader("compute"), numthreads(1,1,1)] void main() {} diff --git a/clang/test/Driver/hlsl-lang-targets.hlsl b/clang/test/Driver/hlsl-lang-targets.hlsl index 816e84cdc6dcb..f2f4bba8196bc 100644 --- a/clang/test/Driver/hlsl-lang-targets.hlsl +++ b/clang/test/Driver/hlsl-lang-targets.hlsl @@ -1,4 +1,4 @@ -// REQUIRES: dxil-registered-target +// REQUIRES: directx-registered-target // Supported targets // @@ -43,9 +43,9 @@ // RUN: not %clang -target amdgcn %s -S -o /dev/null 2>&1 | FileCheck --check-prefix=CHECK-BAD-TARGET %s // CHECK-VALID-NOT: error: -// CHECK-NO-OS: error: shader model is required in target '{{.*}}' for HLSL code generation +// CHECK-NO-OS: error: shader model is required as OS in target '{{.*}}' for HLSL code generation // CHECK-BAD-OS: error: shader model '{{.*}}' in target '{{.*}}' is invalid for HLSL code generation -// CHECK-NO-ENV: error: shader stage is required in target '{{.*}}' for HLSL code generation +// CHECK-NO-ENV: error: shader stage is required as environment in target '{{.*}}' for HLSL code generation // CHECK-BAD-ENV: error: shader stage '{{.*}}' in target '{{.*}}' is invalid for HLSL code generation // CHECK-BAD-TARGET: error: HLSL code generation is unsupported for target '{{.*}}' diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 052336c8a305c..1f8c7a060e249 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -164,6 +164,7 @@ class Triple { SPIRVSubArch_v13, SPIRVSubArch_v14, SPIRVSubArch_v15, + SPIRVSubArch_v16, }; enum VendorType { UnknownVendor, @@ -224,7 +225,8 @@ class Triple { ShaderModel, // DirectX ShaderModel LiteOS, Serenity, - LastOSType = Serenity + Vulkan, // Vulkan SPIR-V + LastOSType = Vulkan }; enum EnvironmentType { UnknownEnvironment, @@ -410,6 +412,10 @@ class Triple { /// Parse the version number as with getOSVersion. VersionTuple getDriverKitVersion() const; + /// Parse the Vulkan version number from the OSVersion and SPIR-V version + /// (SubArch). This should only be called with Vulkan SPIR-V triples. 
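+  /// Returns a zero VersionTuple when the Vulkan and SPIR-V versions are not
+  /// a supported combination.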
+ VersionTuple getVulkanVersion() const; + /// @} /// @name Direct Component Access /// @{ @@ -775,6 +781,8 @@ class Triple { return getOS() == Triple::ShaderModel; } + bool isVulkanOS() const { return getOS() == Triple::Vulkan; } + bool isShaderStageEnvironment() const { EnvironmentType Env = getEnvironment(); return Env == Triple::Pixel || Env == Triple::Vertex || diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index 40079c1dcd3e7..3212ba77de9bd 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/TargetParser/Triple.h" +#include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" @@ -274,6 +275,7 @@ StringRef Triple::getOSTypeName(OSType Kind) { case ShaderModel: return "shadermodel"; case LiteOS: return "liteos"; case XROS: return "xros"; + case Vulkan: return "vulkan"; } llvm_unreachable("Invalid OSType"); @@ -551,8 +553,7 @@ static Triple::ArchType parseArch(StringRef ArchName) { .Case("hsail64", Triple::hsail64) .Case("spir", Triple::spir) .Case("spir64", Triple::spir64) - .Cases("spirv", "spirv1.0", "spirv1.1", "spirv1.2", - "spirv1.3", "spirv1.4", "spirv1.5", Triple::spirv) + .Cases("spirv", "spirv1.5", "spirv1.6", Triple::spirv) .Cases("spirv32", "spirv32v1.0", "spirv32v1.1", "spirv32v1.2", "spirv32v1.3", "spirv32v1.4", "spirv32v1.5", Triple::spirv32) .Cases("spirv64", "spirv64v1.0", "spirv64v1.1", "spirv64v1.2", @@ -646,6 +647,7 @@ static Triple::OSType parseOS(StringRef OSName) { .StartsWith("shadermodel", Triple::ShaderModel) .StartsWith("liteos", Triple::LiteOS) .StartsWith("serenity", Triple::Serenity) + .StartsWith("vulkan", Triple::Vulkan) .Default(Triple::UnknownOS); } @@ -730,6 +732,7 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { .EndsWith("v1.3", Triple::SPIRVSubArch_v13) .EndsWith("v1.4", Triple::SPIRVSubArch_v14) .EndsWith("v1.5", Triple::SPIRVSubArch_v15) + .EndsWith("v1.6", Triple::SPIRVSubArch_v16) .Default(Triple::NoSubArch); StringRef ARMSubArch = ARM::getCanonicalArchName(SubArchName); @@ -1345,6 +1348,31 @@ VersionTuple Triple::getDriverKitVersion() const { } } +VersionTuple Triple::getVulkanVersion() const { + if (getArch() != spirv || getOS() != Vulkan) + llvm_unreachable("invalid Vulkan SPIR-V triple"); + + VersionTuple VulkanVersion = getOSVersion(); + SubArchType SpirvVersion = getSubArch(); + + llvm::DenseMap ValidVersionMap = { + // Vulkan 1.2 -> SPIR-V 1.5. + {VersionTuple(1, 2), SPIRVSubArch_v15}, + // Vulkan 1.3 -> SPIR-V 1.6. + {VersionTuple(1, 3), SPIRVSubArch_v16}}; + + // If Vulkan version is unset, default to 1.2. 
+ if (VulkanVersion == VersionTuple(0)) + VulkanVersion = VersionTuple(1, 2); + + if (ValidVersionMap.contains(VulkanVersion) && + (ValidVersionMap.lookup(VulkanVersion) == SpirvVersion || + SpirvVersion == NoSubArch)) + return VulkanVersion; + + return VersionTuple(0); +} + void Triple::setTriple(const Twine &Str) { *this = Triple(Str); } diff --git a/llvm/unittests/TargetParser/TripleTest.cpp b/llvm/unittests/TargetParser/TripleTest.cpp index 6f9a35739bff6..4db54a08c0f63 100644 --- a/llvm/unittests/TargetParser/TripleTest.cpp +++ b/llvm/unittests/TargetParser/TripleTest.cpp @@ -325,130 +325,116 @@ TEST(TripleTest, ParsedIDs) { EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); EXPECT_EQ(Triple::UnknownOS, T.getOS()); - T = Triple("spirv-unknown-shadermodel-pixel"); + T = Triple("spirv-unknown-vulkan-pixel"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::Pixel, T.getEnvironment()); - T = Triple("spirv-unknown-shadermodel-vertex"); + T = Triple("spirv-unknown-vulkan-vertex"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::Vertex, T.getEnvironment()); - T = Triple("spirv-unknown-shadermodel-geometry"); + T = Triple("spirv-unknown-vulkan-geometry"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::Geometry, T.getEnvironment()); - T = Triple("spirv-unknown-shadermodel-library"); + T = Triple("spirv-unknown-vulkan-library"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::Library, T.getEnvironment()); - T = Triple("spirv-unknown-shadermodel-raygeneration"); + T = Triple("spirv-unknown-vulkan-raygeneration"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::RayGeneration, T.getEnvironment()); - T = Triple("spirv-unknown-shadermodel-intersection"); + T = Triple("spirv-unknown-vulkan-intersection"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::Intersection, T.getEnvironment()); - T = Triple("spirv-unknown-shadermodel-anyhit"); + T = Triple("spirv-unknown-vulkan-anyhit"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + 
EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::AnyHit, T.getEnvironment()); - T = Triple("spirv-unknown-shadermodel-closesthit"); + T = Triple("spirv-unknown-vulkan-closesthit"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::ClosestHit, T.getEnvironment()); - T = Triple("spirv-unknown-shadermodel-miss"); + T = Triple("spirv-unknown-vulkan-miss"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::Miss, T.getEnvironment()); - T = Triple("spirv-unknown-shadermodel-callable"); + T = Triple("spirv-unknown-vulkan-callable"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::Callable, T.getEnvironment()); - T = Triple("spirv-unknown-shadermodel-mesh"); + T = Triple("spirv-unknown-vulkan-mesh"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::Mesh, T.getEnvironment()); - T = Triple("spirv-unknown-shadermodel-amplification"); + T = Triple("spirv-unknown-vulkan-amplification"); EXPECT_EQ(Triple::spirv, T.getArch()); EXPECT_EQ(Triple::NoSubArch, T.getSubArch()); EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); + EXPECT_EQ(Triple::Vulkan, T.getOS()); + EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion()); EXPECT_EQ(Triple::Amplification, T.getEnvironment()); - T = Triple("spirv1.0-unknown-shadermodel-compute"); - EXPECT_EQ(Triple::spirv, T.getArch()); - EXPECT_EQ(Triple::SPIRVSubArch_v10, T.getSubArch()); - EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); - EXPECT_EQ(Triple::Compute, T.getEnvironment()); - - T = Triple("spirv1.1-unknown-shadermodel-compute"); + T = Triple("spirv1.5-unknown-vulkan1.2-compute"); EXPECT_EQ(Triple::spirv, T.getArch()); - EXPECT_EQ(Triple::SPIRVSubArch_v11, T.getSubArch()); - EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); - EXPECT_EQ(Triple::Compute, T.getEnvironment()); - - T = Triple("spirv1.2-unknown-shadermodel-compute"); - EXPECT_EQ(Triple::spirv, T.getArch()); - EXPECT_EQ(Triple::SPIRVSubArch_v12, T.getSubArch()); - EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); - EXPECT_EQ(Triple::Compute, T.getEnvironment()); - - T = Triple("spirv1.3-unknown-shadermodel-compute"); - EXPECT_EQ(Triple::spirv, T.getArch()); - EXPECT_EQ(Triple::SPIRVSubArch_v13, T.getSubArch()); - EXPECT_EQ(Triple::UnknownVendor, T.getVendor()); - EXPECT_EQ(Triple::ShaderModel, T.getOS()); - EXPECT_EQ(Triple::Compute, T.getEnvironment()); - - T = Triple("spirv1.4-unknown-shadermodel-compute"); - 
 EXPECT_EQ(Triple::spirv, T.getArch());
-  EXPECT_EQ(Triple::SPIRVSubArch_v14, T.getSubArch());
+  EXPECT_EQ(Triple::SPIRVSubArch_v15, T.getSubArch());
   EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
-  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(Triple::Vulkan, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 2), T.getVulkanVersion());
   EXPECT_EQ(Triple::Compute, T.getEnvironment());

-  T = Triple("spirv1.5-unknown-shadermodel-compute");
+  T = Triple("spirv1.6-unknown-vulkan1.3-compute");
   EXPECT_EQ(Triple::spirv, T.getArch());
-  EXPECT_EQ(Triple::SPIRVSubArch_v15, T.getSubArch());
+  EXPECT_EQ(Triple::SPIRVSubArch_v16, T.getSubArch());
   EXPECT_EQ(Triple::UnknownVendor, T.getVendor());
-  EXPECT_EQ(Triple::ShaderModel, T.getOS());
+  EXPECT_EQ(Triple::Vulkan, T.getOS());
+  EXPECT_EQ(VersionTuple(1, 3), T.getVulkanVersion());
   EXPECT_EQ(Triple::Compute, T.getEnvironment());

   T = Triple("x86_64-unknown-fuchsia");

From d950157f7b290e35ce25647e255df9dccbcead2b Mon Sep 17 00:00:00 2001
From: Alexandre Ganea
Date: Thu, 18 Jan 2024 08:21:09 -0500
Subject: [PATCH 022/843] [lldb] Silence warning when building with latest MSVC

Fixes:
```
C:\git\llvm-project\lldb\unittests\Core\DumpDataExtractorTest.cpp(140): warning C4305: 'argument': truncation from 'double' to 'const std::complex<float>::_Ty'
```
---
 lldb/unittests/Core/DumpDataExtractorTest.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/unittests/Core/DumpDataExtractorTest.cpp b/lldb/unittests/Core/DumpDataExtractorTest.cpp
index 8c58fecfee29c..3d1e8bc5e4623 100644
--- a/lldb/unittests/Core/DumpDataExtractorTest.cpp
+++ b/lldb/unittests/Core/DumpDataExtractorTest.cpp
@@ -137,7 +137,7 @@ TEST_F(DumpDataExtractorTest, Formats) {
   TestDump('?', lldb::eFormatChar, "'?'");
   TestDump('\x1A', lldb::eFormatCharPrintable, ".");
   TestDump('#', lldb::eFormatCharPrintable, "#");
-  TestDump(std::complex<float>(1.2, 3.4), lldb::eFormatComplex, "1.2 + 3.4i");
+  TestDump(std::complex<float>(1.2f, 3.4f), lldb::eFormatComplex, "1.2 + 3.4i");
   TestDump(std::complex<double>(4.5, 6.7), lldb::eFormatComplex, "4.5 + 6.7i");

   // long double is not tested here because for some platforms we treat it as 10

From bafdaa171a2806ceff628ed7b64ace2b92c05578 Mon Sep 17 00:00:00 2001
From: Alexandre Ganea
Date: Thu, 18 Jan 2024 09:33:23 -0500
Subject: [PATCH 024/843] [lldb] Silence warning with latest MSVC on Windows

Fixes:
```
[3465/3822] Building CXX object tools\lldb\source\Plugins\SymbolFile\CTF\CMakeFiles\lldbPluginSymbolFileCTF.dir\SymbolFileCTF.cpp.obj
C:\git\llvm-project\lldb\source\Plugins\SymbolFile\CTF\SymbolFileCTF.cpp(606) : warning C4715: 'lldb_private::SymbolFileCTF::CreateType': not all control paths return a value
```
---
 lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp b/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp
index d192944bb9d08..65f5b1a5f1b0a 100644
--- a/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp
+++ b/lldb/source/Plugins/SymbolFile/CTF/SymbolFileCTF.cpp
@@ -603,6 +603,7 @@ llvm::Expected<lldb::TypeSP> SymbolFileCTF::CreateType(CTFType *ctf_type) {
             ctf_type->uid, ctf_type->name, ctf_type->kind),
         llvm::inconvertibleErrorCode());
   }
+  llvm_unreachable("Unexpected CTF type kind");
 }

 llvm::Expected>

From ded8aa61849c88492811186e0f4a7ee56a5d78ea Mon Sep 17 00:00:00 2001
From: Alexandre Ganea
Date: Thu, 18 Jan 2024 09:34:13 -0500
Subject: [PATCH 025/843] [lldb] Silence warning with latest MSVC

Fixes several of these:
```
[3370/3822] Building CXX object
tools\lldb\source\Plugins\Process\U...lldbPluginProcessUtility.dir\NativeRegisterContextDBReg_x86.cpp.ob
C:\git\llvm-project\lldb\source\Plugins\Process\Utility\NativeRegisterContextDBReg_x86.h(23): warning C4589: Constructor of abstract class 'lldb_private::NativeRegisterContextDBReg_x86' ignores initializer for virtual base class 'lldb_private::NativeRegisterContextRegisterInfo'
C:\git\llvm-project\lldb\source\Plugins\Process\Utility\NativeRegisterContextDBReg_x86.h(23): note: virtual base classes are only initialized by the most-derived type
```
---
 lldb/cmake/modules/LLDBConfig.cmake | 1 +
 1 file changed, 1 insertion(+)

diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake
index 7efcc87b9799d..a758261073ac5 100644
--- a/lldb/cmake/modules/LLDBConfig.cmake
+++ b/lldb/cmake/modules/LLDBConfig.cmake
@@ -215,6 +215,7 @@ if( MSVC )
     -wd4251 # Suppress 'warning C4251: T must have dll-interface to be used by clients of class U.'
     -wd4521 # Suppress 'warning C4521: 'type' : multiple copy constructors specified'
     -wd4530 # Suppress 'warning C4530: C++ exception handler used, but unwind semantics are not enabled.'
+    -wd4589 # Suppress 'warning C4589: Constructor of abstract class 'lldb_private::NativeRegisterContextDBReg_x86' ignores initializer for virtual base class 'lldb_private::NativeRegisterContextRegisterInfo''
   )
 endif()

From cb67dc19256565d15f6bed0e9808f4026ca04995 Mon Sep 17 00:00:00 2001
From: Alexandre Ganea
Date: Thu, 18 Jan 2024 09:35:05 -0500
Subject: [PATCH 026/843] [lldb] Silence narrowing conversion warning with MSVC

Fixes:
```
[13/270] Building CXX object tools\lldb\unittests\Core\CMakeFiles\LLDBCoreTests.dir\DumpDataExtractorTest.cpp.obj
C:\git\llvm-project\lldb\unittests\Core\DumpDataExtractorTest.cpp(140): warning C4305: 'argument': truncation from 'double' to 'const std::complex<float>::_Ty'
```
---
 lldb/unittests/DataFormatter/StringPrinterTests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/unittests/DataFormatter/StringPrinterTests.cpp b/lldb/unittests/DataFormatter/StringPrinterTests.cpp
index ac4116139beb2..74136b6763eca 100644
--- a/lldb/unittests/DataFormatter/StringPrinterTests.cpp
+++ b/lldb/unittests/DataFormatter/StringPrinterTests.cpp
@@ -72,7 +72,7 @@ TEST(StringPrinterTests, CxxASCII) {
   EXPECT_EQ(fmt("🥑"), QUOTE("🥑"));

   // Octal (\nnn), hex (\xnn), extended octal (\unnnn or \Unnnnnnnn).
-  EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C"));
+  EXPECT_EQ(fmt(L"\uD55C"), QUOTE(L"\uD55C"));
   EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348"));
   EXPECT_EQ(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal.
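For context, a minimal sketch (not taken from either lldb patch) of the MSVC
C4305 narrowing warning quoted above; the variable names here are illustrative
only:

```
#include <complex>

int main() {
  // MSVC C4305: 1.2 and 3.4 are double literals, so constructing a
  // std::complex<float> from them truncates double -> float.
  std::complex<float> warns(1.2, 3.4);
  // Float literals match the element type, so no warning is emitted.
  std::complex<float> clean(1.2f, 3.4f);
  return static_cast<int>(warns.real()) + static_cast<int>(clean.real());
}
```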
From 0c195e5096c4ded2676c60c89cd93ef619c8537b Mon Sep 17 00:00:00 2001
From: Alex MacLean
Date: Thu, 18 Jan 2024 10:08:56 -0800
Subject: [PATCH 027/843] [NVPTX][NFC] Remove unused parameter of getArgumentAlignment (#78604)

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 9 ++++-----
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h   | 4 ++--
 2 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 9e9c05adce49e..8080e8404e427 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -1666,8 +1666,7 @@ std::string NVPTXTargetLowering::getPrototype(
   return Prototype;
 }

-Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
-                                                const CallBase *CB, Type *Ty,
+Align NVPTXTargetLowering::getArgumentAlignment(const CallBase *CB, Type *Ty,
                                                 unsigned Idx,
                                                 const DataLayout &DL) const {
   if (!CB) {
@@ -1794,7 +1793,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
       if (IsVAArg)
         VAOffset = alignTo(VAOffset, ArgAlign);
     } else {
-      ArgAlign = getArgumentAlignment(Callee, CB, Ty, ParamCount + 1, DL);
+      ArgAlign = getArgumentAlignment(CB, Ty, ParamCount + 1, DL);
     }

     unsigned TypeSize =
@@ -1976,7 +1975,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
                           DeclareRetOps);
       InGlue = Chain.getValue(1);
     } else {
-      retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
+      retAlignment = getArgumentAlignment(CB, RetTy, 0, DL);
       assert(retAlignment && "retAlignment is guaranteed to be set");
       SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
       SDValue DeclareRetOps[] = {
@@ -2107,7 +2106,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
     ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
     assert(VTs.size() == Ins.size() && "Bad value decomposition");

-    Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
+    Align RetAlign = getArgumentAlignment(CB, RetTy, 0, DL);
     auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);

     SmallVector LoadVTs;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 18e6179b06819..5d3fd992812ef 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -637,8 +637,8 @@ class NVPTXTargetLowering : public TargetLowering {
                       SelectionDAG &DAG) const override;
   SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;

-  Align getArgumentAlignment(SDValue Callee, const CallBase *CB, Type *Ty,
-                             unsigned Idx, const DataLayout &DL) const;
+  Align getArgumentAlignment(const CallBase *CB, Type *Ty, unsigned Idx,
+                             const DataLayout &DL) const;
 };
 } // namespace llvm

From 2663d2cb9c9361f0b234c40a0f50c7ba0748eb26 Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Thu, 18 Jan 2024 10:24:47 -0800
Subject: [PATCH 028/843] [RISCV] Adjust select shuffle cost to reflect mask creation cost (#77963)

This is inspired by
https://github.com/llvm/llvm-project/pull/77342#pullrequestreview-1814673242,
and is split off from it, with some differences in style.

A select is a vmerge.vv with the additional cost of materializing the
bitmask vector in a vreg. All masks fit within a single vector register
(e8 + m8 is the worst case), and thus our worst-case cost should be
roughly 3 (two scalar instructions to produce the mask's address, plus
one vector load).
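For illustration, that worst case corresponds to loading the mask from the
constant pool, along these lines (hypothetical codegen, not output from this
patch; label and register choices are made up):

```
lui        a0, %hi(.LCPI0_0)     # scalar op 1: high half of the mask address
addi       a0, a0, %lo(.LCPI0_0) # scalar op 2: low half of the mask address
vlm.v      v0, (a0)              # one vector load of the bitmask into v0
vmerge.vvm v8, v9, v8, v0        # the select itself
```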
Given most shuffles are small, and the mask will instead be produced by
LUI/ADDI + vmv.s.x or ADDI + vmv.s.x, using 2 as the default seems quite
reasonable. At worst, we're not going to be off by much.

The prior lowering scaled the cost of the bitmask with LMUL, which I don't
understand. At m1 it did use the same base cost of 2. (@lukel97 You wrote
the original code here; anything I'm missing?)
---
 .../Target/RISCV/RISCVTargetTransformInfo.cpp |  11 +--
 .../RISCV/shuffle-insert_subvector.ll         |  76 +++++++++----------
 .../CostModel/RISCV/shuffle-select.ll         |   8 +-
 3 files changed, 48 insertions(+), 47 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 87ea297838165..056b947c7a9f9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -84,8 +84,6 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
       Cost += VL;
       break;
     }
-    case RISCV::VMV_S_X:
-      // FIXME: VMV_S_X doesn't use LMUL, the cost should be 1
     default:
       Cost += LMULCost;
     }
@@ -443,10 +441,13 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
       //   vsetivli zero, 8, e8, mf2, ta, ma (ignored)
       //   vmv.s.x v0, a0
       //   vmerge.vvm v8, v9, v8, v0
+      // We use 2 for the cost of the mask materialization as this is the true
+      // cost for small masks and most shuffles are small.  At worst, this cost
+      // should be a very small constant for the constant pool load.  As such,
+      // we may bias towards large selects slightly more than truly warranted.
       return LT.first *
-             (TLI->getLMULCost(LT.second) + // FIXME: should be 1 for li
-              getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM},
-                                      LT.second, CostKind));
+             (2 + getRISCVInstructionCost({RISCV::VMERGE_VVM},
+                                          LT.second, CostKind));
     }
   case TTI::SK_Broadcast: {
     bool HasScalar = (Args.size() > 0) && (Operator::getOpcode(Args[0]) ==
diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll
index af656b4d7976d..9a333dc8b8ddd 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-insert_subvector.ll
@@ -9,13 +9,13 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32>
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <8 x double>
%src512, <8 x double> %src128_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -23,13 +23,13 @@ define void @test_vXf64(<2 x double> %src128, <4 x double> %src256, <8 x double> ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x double> %src128, <2 x double> undef, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x double> %src128, <2 x double> undef, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x double> %src256, <4 x double> undef, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_01 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x double> %src256, <4 x double> %src128_256, <4 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_01 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x double> %src512, <8 x double> %src128_512, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_0123 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x double> %src512, <8 x double> %src256_512, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; @@ -53,13 +53,13 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) ; CHECK-NEXT: Cost Model: Found an estimated 
cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -67,13 +67,13 @@ define void @test_vXi64(<2 x i64> %src128, <4 x i64> %src256, <8 x i64> %src512) ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_256 = shufflevector <2 x i64> %src128, <2 x i64> undef, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src128_512 = shufflevector <2 x i64> %src128, <2 x i64> undef, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <4 x i64> %src256, <4 x i64> undef, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_01 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <4 x i64> %src256, <4 x i64> %src128_256, <4 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_01 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x 
i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <8 x i64> %src512, <8 x i64> %src128_512, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_0123 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <8 x i64> %src512, <8 x i64> %src256_512, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; @@ -102,13 +102,13 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> @@ -116,11 +116,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr ; CHECK-NEXT: Cost Model: Found an estimated cost 
of 4 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -133,13 +133,13 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x float> %src256, <8 x float> undef, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_01 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x float> %src128, <4 x float> %src64_128, <4 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_01 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x float> %src256, <8 x float> %src64_256, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_0123 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x float> %src256, <8 x float> %src128_256, <8 x 
i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_01 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> @@ -147,11 +147,11 @@ define void @test_vXf32(<2 x float> %src64, <4 x float> %src128, <8 x float> %sr ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x float> %src512, <16 x float> %src64_512, <16 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_0123 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_89AB = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_01234567 = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_89ABCDEF = shufflevector <16 x float> %src512, <16 x float> %src128_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; @@ -199,13 +199,13 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> ; CHECK-NEXT: 
Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> @@ -213,11 +213,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -230,13 +230,13 @@ 
define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; SIZE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %src256_512 = shufflevector <8 x i32> %src256, <8 x i32> undef, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_01 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_23 = shufflevector <4 x i32> %src128, <4 x i32> %src64_128, <4 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_01 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <8 x i32> %src256, <8 x i32> %src64_256, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_0123 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <8 x i32> %src256, <8 x i32> %src128_256, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_01 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_23 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_45 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_67 = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> @@ -244,11 +244,11 @@ define void @test_vXi32(<2 x i32> %src64, <4 x i32> %src128, <8 x i32> %src256, ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_AB = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_CD = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_EF = shufflevector <16 x i32> %src512, <16 x i32> %src64_512, <16 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_0123 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_4567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: 
%V512_89AB = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_CDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V512_01234567 = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V512_89ABCDEF = shufflevector <16 x i32> %src512, <16 x i32> %src128_512, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void ; @@ -306,7 +306,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> @@ -314,11 +314,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> -; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x 
i16> %src128_256, <16 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; @@ -341,7 +341,7 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_67 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V128_0123 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V128_4567 = shufflevector <8 x i16> %src128, <8 x i16> %src32_128, <8 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_01 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_23 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_45 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_67 = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> @@ -349,11 +349,11 @@ define void @test_vXi16(<2 x i16> %src32, <4 x i16> %src64, <8 x i16> %src128, < ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_AB = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CD = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_EF = shufflevector <16 x i16> %src256, <16 x i16> %src64_256, <16 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_0123 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_4567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89AB = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_CDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> -; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> +; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V256_01234567 = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V256_89ABCDEF = shufflevector <16 x i16> %src256, <16 x i16> %src128_256, <16 x i32> ; SIZE-NEXT: Cost Model: Found an 
estimated cost of 1 for instruction: ret void ; diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll index 81454cc826e14..0fd1cfc908d97 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-select.ll @@ -32,11 +32,11 @@ define <8 x i8> @select_non_contiguous_v8i8(<8 x i8> %v, <8 x i8> %w) { define <8 x i64> @select_start_v8i64(<8 x i64> %v, <8 x i64> %w) { ; CHECK-LABEL: 'select_start_v8i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %res ; ; CHECK-SIZE-LABEL: 'select_start_v8i64' -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> @@ -45,11 +45,11 @@ define <8 x i64> @select_start_v8i64(<8 x i64> %v, <8 x i64> %w) { define <8 x i64> @select_non_contiguous_v8i64(<8 x i64> %v, <8 x i64> %w) { ; CHECK-LABEL: 'select_non_contiguous_v8i64' -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i64> %res ; ; CHECK-SIZE-LABEL: 'select_non_contiguous_v8i64' -; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> +; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> ; CHECK-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i64> %res ; %res = shufflevector <8 x i64> %v, <8 x i64> %w, <8 x i32> From 45d1cca3394fa4b14561dd6c36104ff8e20e51db Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Thu, 18 Jan 2024 10:35:18 -0800 Subject: [PATCH 029/843] [llvm][utils] Fix SmallString summary provider (#78527) Fixes `SmallString` summary provider, which was incorrectly producing the empty string. Initially I thought the strings I was debugging were empty for unknown reasons, but that was not the case. --- llvm/utils/lldbDataFormatters.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/llvm/utils/lldbDataFormatters.py b/llvm/utils/lldbDataFormatters.py index 0b61db60e80c7..de101abdabc8e 100644 --- a/llvm/utils/lldbDataFormatters.py +++ b/llvm/utils/lldbDataFormatters.py @@ -218,12 +218,14 @@ def get_child_at_index(self, index): def SmallStringSummaryProvider(valobj, internal_dict): - num_elements = valobj.GetNumChildren() + # The underlying SmallVector base class is the first child. 
+ vector = valobj.GetChildAtIndex(0) + num_elements = vector.GetNumChildren() res = '"' - for i in range(0, num_elements): - c = valobj.GetChildAtIndex(i).GetValue() + for i in range(num_elements): + c = vector.GetChildAtIndex(i) if c: - res += c.strip("'") + res += chr(c.GetValueAsUnsigned()) res += '"' return res From 5de1d007ddc5b13a643ebedce42c5c164cd3bec2 Mon Sep 17 00:00:00 2001 From: kyulee-com Date: Thu, 18 Jan 2024 10:46:30 -0800 Subject: [PATCH 030/843] [lld-macho] Fix for objc_msgSend stubs (#78557) This commit corrects the address computation for objc_msgSend stubs. Previously, the address computation was incidentally correct due to objc_msgSend often being the first entry in the got section, resulting in a 0 index. This commit ensures accurate address computation regardless of the objc_msgSend stub's position in the got section. --- lld/MachO/Arch/ARM64Common.h | 5 ++-- lld/test/MachO/arm64-objc-stubs-fix.s | 34 +++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 lld/test/MachO/arm64-objc-stubs-fix.s diff --git a/lld/MachO/Arch/ARM64Common.h b/lld/MachO/Arch/ARM64Common.h index 8dc2412b29bca..9cfccb6cec761 100644 --- a/lld/MachO/Arch/ARM64Common.h +++ b/lld/MachO/Arch/ARM64Common.h @@ -170,9 +170,10 @@ writeObjCMsgSendStub(uint8_t *buf, const uint32_t objcStubsFastCode[8], pageBits(selrefsVA + selectorOffset) - pcPageBits(0)); encodePageOff12(&buf32[1], d, objcStubsFastCode[1], selrefsVA + selectorOffset); + uint64_t gotOffset = msgSendIndex * LP::wordSize; encodePage21(&buf32[2], d, objcStubsFastCode[2], - pageBits(gotAddr) - pcPageBits(2)); - encodePage21(&buf32[3], d, objcStubsFastCode[3], msgSendIndex * LP::wordSize); + pageBits(gotAddr + gotOffset) - pcPageBits(2)); + encodePageOff12(&buf32[3], d, objcStubsFastCode[3], gotAddr + gotOffset); buf32[4] = objcStubsFastCode[4]; buf32[5] = objcStubsFastCode[5]; buf32[6] = objcStubsFastCode[6]; diff --git a/lld/test/MachO/arm64-objc-stubs-fix.s b/lld/test/MachO/arm64-objc-stubs-fix.s new file mode 100644 index 0000000000000..0dbec361f4b7e --- /dev/null +++ b/lld/test/MachO/arm64-objc-stubs-fix.s @@ -0,0 +1,34 @@ +# REQUIRES: aarch64 + +# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %s -o %t.o +# RUN: %lld -arch arm64 -lSystem -fixup_chains -o %t.out %t.o +# RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s --check-prefix=CHECK --check-prefix=FIRST + +# Prepend a dummy entry to check if the address for _objc_msgSend is valid. +# RUN: %lld -arch arm64 -lSystem -fixup_chains -e _dummy -U _dummy -o %t.out %t.o +# RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s --check-prefix=CHECK --check-prefix=SECOND + +# CHECK: Contents of (__TEXT,__objc_stubs) section + +# CHECK-NEXT: _objc_msgSend$foo: +# CHECK-NEXT: adrp x1, 8 ; 0x100008000 +# CHECK-NEXT: ldr x1, [x1] +# CHECK-NEXT: adrp x16, 4 ; 0x100004000 +# FIRST-NEXT: ldr x16, [x16] +# SECOND-NEXT:ldr x16, [x16, #0x8] +# CHECK-NEXT: br x16 +# CHECK-NEXT: brk #0x1 +# CHECK-NEXT: brk #0x1 +# CHECK-NEXT: brk #0x1 + +# CHECK-EMPTY: + +.text +.globl _objc_msgSend +_objc_msgSend: + ret + +.globl _main +_main: + bl _objc_msgSend$foo + ret From 2c9f04c98a1922d711fd1a88563506ee75c771bf Mon Sep 17 00:00:00 2001 From: Alan Zhao Date: Thu, 18 Jan 2024 10:53:54 -0800 Subject: [PATCH 031/843] [clang] Fix parenthesized list initialization of arrays not working with `new` (#76976) This bug is caused by parenthesized list initialization not being implemented in `CodeGenFunction::EmitNewArrayInitializer(...)`. 
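As a concrete illustration (a sketch for this note, compiled with -std=c++20;
not one of the new tests), the construct that previously failed:

```
int main() {
  // C++20 parenthesized list initialization through array new; elements
  // past the explicit initializers are value-initialized.
  int *p = new int[4](1, 2); // yields {1, 2, 0, 0}
  bool ok = p[0] == 1 && p[1] == 2 && p[2] == 0 && p[3] == 0;
  delete[] p;
  return ok ? 0 : 1;
}
```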
Parenthesized list initialization of `struct`s with `operator new` already
works in Clang and is not affected by this bug.

Additionally, fix the test new-delete.cpp as it incorrectly assumes that
using parentheses with operator new to initialize arrays is illegal for C++
versions >= C++17.

Fixes #68198
---
 clang/docs/ReleaseNotes.rst                |  3 +
 clang/lib/CodeGen/CGExprCXX.cpp            | 60 +++++++++-----
 clang/lib/Sema/SemaExprCXX.cpp             |  7 +-
 clang/lib/Sema/SemaInit.cpp                |  2 +-
 clang/test/CodeGen/paren-list-agg-init.cpp | 57 +++++++++++++
 clang/test/CodeGenCXX/new-array-init.cpp   | 94 +++++++++++++++++++++-
 clang/test/SemaCXX/new-delete.cpp          | 31 ++++---
 7 files changed, 221 insertions(+), 33 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 0ea6f93a1f5df..b400d75095421 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -976,6 +976,9 @@ Bug Fixes to C++ Support
   ``std::source_location::current()`` was used in a function template.
   Fixes (`#78128 <https://github.com/llvm/llvm-project/issues/78128>`_)

+- Clang now allows parenthesized initialization of arrays in `operator new[]`.
+  Fixes: (`#68198 <https://github.com/llvm/llvm-project/issues/68198>`_)
+
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 - Fixed an import failure of recursive friend class template.
diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp
index 98ae56e2df881..d136bfc37278f 100644
--- a/clang/lib/CodeGen/CGExprCXX.cpp
+++ b/clang/lib/CodeGen/CGExprCXX.cpp
@@ -1038,11 +1038,25 @@ void CodeGenFunction::EmitNewArrayInitializer(
     return true;
   };

+  const InitListExpr *ILE = dyn_cast<InitListExpr>(Init);
+  const CXXParenListInitExpr *CPLIE = nullptr;
+  const StringLiteral *SL = nullptr;
+  const ObjCEncodeExpr *OCEE = nullptr;
+  const Expr *IgnoreParen = nullptr;
+  if (!ILE) {
+    IgnoreParen = Init->IgnoreParenImpCasts();
+    CPLIE = dyn_cast<CXXParenListInitExpr>(IgnoreParen);
+    SL = dyn_cast<StringLiteral>(IgnoreParen);
+    OCEE = dyn_cast<ObjCEncodeExpr>(IgnoreParen);
+  }
+
   // If the initializer is an initializer list, first do the explicit elements.
-  if (const InitListExpr *ILE = dyn_cast<InitListExpr>(Init)) {
+  if (ILE || CPLIE || SL || OCEE) {
     // Initializing from a (braced) string literal is a special case; the init
     // list element does not initialize a (single) array element.
-    if (ILE->isStringLiteralInit()) {
+    if ((ILE && ILE->isStringLiteralInit()) || SL || OCEE) {
+      if (!ILE)
+        Init = IgnoreParen;
       // Initialize the initial portion of length equal to that of the string
       // literal. The allocation must be for at least this much; we emitted a
       // check for that earlier.
@@ -1054,12 +1068,13 @@
           AggValueSlot::DoesNotOverlap, AggValueSlot::IsNotZeroed,
           AggValueSlot::IsSanitizerChecked);
-      EmitAggExpr(ILE->getInit(0), Slot);
+      EmitAggExpr(ILE ? ILE->getInit(0) : Init, Slot);

       // Move past these elements.
       InitListElements =
-          cast<ConstantArrayType>(ILE->getType()->getAsArrayTypeUnsafe())
-              ->getSize().getZExtValue();
+          cast<ConstantArrayType>(Init->getType()->getAsArrayTypeUnsafe())
+              ->getSize()
+              .getZExtValue();
       CurPtr = Builder.CreateConstInBoundsGEP(
           CurPtr, InitListElements, "string.init.end");
@@ -1073,7 +1088,9 @@
       return;
     }

-    InitListElements = ILE->getNumInits();
+    ArrayRef<const Expr *> InitExprs =
+        ILE ? ILE->inits() : CPLIE->getInitExprs();
+    InitListElements = InitExprs.size();

     // If this is a multi-dimensional array new, we will initialize multiple
     // elements with each init list element.
@@ -1101,7 +1118,8 @@ void CodeGenFunction::EmitNewArrayInitializer( } CharUnits StartAlign = CurPtr.getAlignment(); - for (unsigned i = 0, e = ILE->getNumInits(); i != e; ++i) { + unsigned i = 0; + for (const Expr *IE : InitExprs) { // Tell the cleanup that it needs to destroy up to this // element. TODO: some of these stores can be trivially // observed to be unnecessary. @@ -1111,18 +1129,17 @@ void CodeGenFunction::EmitNewArrayInitializer( // FIXME: If the last initializer is an incomplete initializer list for // an array, and we have an array filler, we can fold together the two // initialization loops. - StoreAnyExprIntoOneUnit(*this, ILE->getInit(i), - ILE->getInit(i)->getType(), CurPtr, + StoreAnyExprIntoOneUnit(*this, IE, IE->getType(), CurPtr, AggValueSlot::DoesNotOverlap); CurPtr = Address(Builder.CreateInBoundsGEP( CurPtr.getElementType(), CurPtr.getPointer(), Builder.getSize(1), "array.exp.next"), CurPtr.getElementType(), - StartAlign.alignmentAtOffset((i + 1) * ElementSize)); + StartAlign.alignmentAtOffset((++i) * ElementSize)); } // The remaining elements are filled with the array filler expression. - Init = ILE->getArrayFiller(); + Init = ILE ? ILE->getArrayFiller() : CPLIE->getArrayFiller(); // Extract the initializer for the individual array elements by pulling // out the array filler from all the nested initializer lists. This avoids @@ -1561,16 +1578,23 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { // 1. Build a call to the allocation function. FunctionDecl *allocator = E->getOperatorNew(); - // If there is a brace-initializer, cannot allocate fewer elements than inits. + // If there is a brace-initializer or C++20 parenthesized initializer, cannot + // allocate fewer elements than inits. unsigned minElements = 0; if (E->isArray() && E->hasInitializer()) { - const InitListExpr *ILE = dyn_cast(E->getInitializer()); - if (ILE && ILE->isStringLiteralInit()) + const Expr *Init = E->getInitializer(); + const InitListExpr *ILE = dyn_cast(Init); + const CXXParenListInitExpr *CPLIE = dyn_cast(Init); + const Expr *IgnoreParen = Init->IgnoreParenImpCasts(); + if ((ILE && ILE->isStringLiteralInit()) || + isa(IgnoreParen) || isa(IgnoreParen)) { minElements = - cast(ILE->getType()->getAsArrayTypeUnsafe()) - ->getSize().getZExtValue(); - else if (ILE) - minElements = ILE->getNumInits(); + cast(Init->getType()->getAsArrayTypeUnsafe()) + ->getSize() + .getZExtValue(); + } else if (ILE || CPLIE) { + minElements = ILE ? ILE->getNumInits() : CPLIE->getInitExprs().size(); + } } llvm::Value *numElements = nullptr; diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 074bde26b9248..8b6a80b45b27b 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1947,11 +1947,11 @@ Sema::ActOnCXXNew(SourceLocation StartLoc, bool UseGlobal, } static bool isLegalArrayNewInitializer(CXXNewInitializationStyle Style, - Expr *Init) { + Expr *Init, bool IsCPlusPlus20) { if (!Init) return true; if (ParenListExpr *PLE = dyn_cast(Init)) - return PLE->getNumExprs() == 0; + return IsCPlusPlus20 || PLE->getNumExprs() == 0; if (isa(Init)) return true; else if (CXXConstructExpr *CCE = dyn_cast(Init)) @@ -2406,7 +2406,8 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, // Array 'new' can't have any initializers except empty parentheses. // Initializer lists are also allowed, in C++11. Rely on the parser for the // dialect distinction. 
- if (ArraySize && !isLegalArrayNewInitializer(InitStyle, Initializer)) { + if (ArraySize && !isLegalArrayNewInitializer(InitStyle, Initializer, + getLangOpts().CPlusPlus20)) { SourceRange InitRange(Exprs.front()->getBeginLoc(), Exprs.back()->getEndLoc()); Diag(StartLoc, diag::err_new_array_init_args) << InitRange; diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 96900efa75fcd..18440a69e3a3d 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -5496,7 +5496,7 @@ static void TryOrBuildParenListInitialization( return; } // ...and value-initialized for each k < i <= n; - if (ArrayLength > Args.size()) { + if (ArrayLength > Args.size() || Entity.isVariableLengthArrayNew()) { InitializedEntity SubEntity = InitializedEntity::InitializeElement( S.getASTContext(), Args.size(), Entity); InitializationKind SubKind = InitializationKind::CreateValue( diff --git a/clang/test/CodeGen/paren-list-agg-init.cpp b/clang/test/CodeGen/paren-list-agg-init.cpp index eb8d91a7f97f6..94d42431d125d 100644 --- a/clang/test/CodeGen/paren-list-agg-init.cpp +++ b/clang/test/CodeGen/paren-list-agg-init.cpp @@ -513,3 +513,60 @@ namespace gh61567 { I(0); } } + +namespace gh68198 { + // CHECK: define {{.*}} void @{{.*foo25.*}} { + // CHECK-NEXT: entry + // CHECK-NEXT: [[ARR_8:%.*arr8.*]] = alloca ptr, align 8 + // CHECK-NEXT: [[CALL_PTR:%.*]] = call noalias noundef nonnull ptr @_Znam(i64 noundef 8) + // CHECK-NEXT: store i32 1, ptr [[CALL_PTR]], align 4 + // CHECK-NEXT: [[ARRAY_EXP_NEXT:%.*]] = getelementptr inbounds i32, ptr [[CALL_PTR]], i64 1 + // CHECK-NEXT: store i32 2, ptr [[ARRAY_EXP_NEXT]], align 4 + // CHECK-NEXT: [[ARRAY_EXP_NEXT1:%.*]] = getelementptr inbounds i32, ptr [[ARRAY_EXP_NEXT]], i64 1 + // CHECK-NEXT: store ptr [[CALL_PTR]], ptr %arr8, align 8 + // CHECK-NEXT: ret void + void foo25() { + int* arr8 = new int[](1, 2); + } + + // CHECK: define {{.*}} void @{{.*foo26.*}} { + // CHECK-NEXT: entry + // CHECK-NEXT: [[ARR_10:%.*arr9.*]] = alloca ptr, align 8 + // CHECK-NEXT: [[CALL_PTR]] = call noalias noundef nonnull ptr @_Znam(i64 noundef 16) + // CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x i32], ptr [[CALL]], i64 0, i64 0 + // CHECK-NEXT: store i32 1, ptr [[ARRAYINIT_BEGIN]], align 4 + // CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds i32, ptr [[ARRAYINIT_BEGIN]], i64 1 + // CHECK-NEXT: store i32 2, ptr [[ARRAYINIT_ELEMENT]], align 4 + // CHECK-NEXT: [[ARRAY_EXP_NEXT:%.*]] = getelementptr inbounds [2 x i32], ptr %call, i64 1 + // CHECK-NEXT: [[ARRAYINIT_BEGIN1:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAY_EXP_NEXT]], i64 0, i64 0 + // CHECK-NEXT: store i32 3, ptr [[ARRAYINIT_BEGIN1]], align 4 + // CHECK-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds i32, ptr [[ARRAYINIT_BEGIN1]], i64 1 + // CHECK-NEXT: store i32 4, ptr [[ARRAYINIT_ELEMENT2]], align 4 + // CHECK-NEXT: [[ARRAY_EXP_NEXT3:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAY_EXP_NEXT]], i64 1 + // CHECK-NEXT: store ptr [[CALL_PTR]], ptr [[ARR_10]], align 8 + // CHECK-NEXT: ret void + void foo26() { + void* arr9 = new int[][2]({1, 2}, {3, 4}); + } + + // CHECK: define {{.*}} void @{{.*foo27.*}} { + // CHECK-NEXT: entry + // CHECK-NEXT: [[ARR_10:%.*arr10.*]] = alloca ptr, align 8 + // CHECK-NEXT: [[CALL_PTR]] = call noalias noundef nonnull ptr @_Znam(i64 noundef 32) + // CHECK-NEXT: [[ARRAYINIT_BEGIN:%.*]] = getelementptr inbounds [2 x i32], ptr [[CALL]], i64 0, i64 0 + // CHECK-NEXT: store i32 5, ptr [[ARRAYINIT_BEGIN]], align 4 + // 
CHECK-NEXT: [[ARRAYINIT_ELEMENT:%.*]] = getelementptr inbounds i32, ptr [[ARRAYINIT_BEGIN]], i64 1 + // CHECK-NEXT: store i32 6, ptr [[ARRAYINIT_ELEMENT]], align 4 + // CHECK-NEXT: [[ARRAY_EXP_NEXT:%.*]] = getelementptr inbounds [2 x i32], ptr %call, i64 1 + // CHECK-NEXT: [[ARRAYINIT_BEGIN1:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAY_EXP_NEXT]], i64 0, i64 0 + // CHECK-NEXT: store i32 7, ptr [[ARRAYINIT_BEGIN1]], align 4 + // CHECK-NEXT: [[ARRAYINIT_ELEMENT2:%.*]] = getelementptr inbounds i32, ptr [[ARRAYINIT_BEGIN1]], i64 1 + // CHECK-NEXT: store i32 8, ptr [[ARRAYINIT_ELEMENT2]], align 4 + // CHECK-NEXT: [[ARRAY_EXP_NEXT3:%.*]] = getelementptr inbounds [2 x i32], ptr [[ARRAY_EXP_NEXT]], i64 1 + // CHECK-NEXT: call void @llvm.memset.p0.i64(ptr align 4 [[ARRAY_EXP_NEXT3]], i8 0, i64 16, i1 false) + // CHECK-NEXT: store ptr [[CALL_PTR]], ptr [[ARR_10]], align 8 + // CHECK-NEXT: ret void + void foo27() { + void* arr10 = new int[4][2]({5, 6}, {7, 8}); + } +} diff --git a/clang/test/CodeGenCXX/new-array-init.cpp b/clang/test/CodeGenCXX/new-array-init.cpp index db740d065e2f7..fe1bdf425ab14 100644 --- a/clang/test/CodeGenCXX/new-array-init.cpp +++ b/clang/test/CodeGenCXX/new-array-init.cpp @@ -1,4 +1,5 @@ // RUN: %clang_cc1 -std=c++11 -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s +// RUN: %clang_cc1 -std=c++20 -triple i386-unknown-unknown %s -emit-llvm -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKCXX20 // RUN: %clang_cc1 -std=c++11 -triple i386-unknown-unknown %s -emit-llvm -fsanitize=signed-integer-overflow -o - | FileCheck --check-prefix=SIO %s // CHECK: @[[ABC4:.*]] = {{.*}} constant [4 x i8] c"abc\00" @@ -15,6 +16,19 @@ void fn(int n) { new int[n] { 1, 2, 3 }; } +#if __cplusplus >= 202002L +// CHECKCXX20-LABEL: define{{.*}} void @_Z8fn_pareni +void fn_paren(int n) { + // CHECKCXX20: icmp ult i{{32|64}} %{{[^ ]+}}, 3 + // CHECKCXX20: store i32 1 + // CHECKCXX20: store i32 2 + // CHECKCXX20: store i32 3 + // CHECKCXX20: sub {{.*}}, 12 + // CHECKCXX20: call void @llvm.memset + new int[n](1, 2, 3); +} +#endif + // CHECK-LABEL: define{{.*}} void @_Z11const_exactv void const_exact() { // CHECK-NOT: icmp ult i{{32|64}} %{{[^ ]+}}, 3 @@ -22,6 +36,15 @@ void const_exact() { new int[3] { 1, 2, 3 }; } +#if __cplusplus >= 202002L +// CHECKCXX20-LABEL: define{{.*}} void @_Z17const_exact_parenv +void const_exact_paren() { + // CHECKCXX20-NOT: icmp ult i{{32|64}} %{{[^ ]+}}, 3 + // CHECKCXX20-NOT: icmp eq ptr + new int[3](1, 2, 3); +} +#endif + // CHECK-LABEL: define{{.*}} void @_Z16const_sufficientv void const_sufficient() { // CHECK-NOT: icmp ult i{{32|64}} %{{[^ ]+}}, 3 @@ -29,6 +52,15 @@ void const_sufficient() { // CHECK: ret void } +#if __cplusplus >= 202002L +// CHECKCXX20-LABEL: define{{.*}} void @_Z22const_sufficient_parenv +void const_sufficient_paren() { + // CHECKCXX20-NOT: icmp ult i{{32|64}} %{{[^ ]+}}, 3 + new int[4](1, 2, 3); + // CHECKCXX20: ret void +} +#endif + // CHECK-LABEL: define{{.*}} void @_Z22check_array_value_initv void check_array_value_init() { struct S; @@ -46,7 +78,7 @@ void check_array_value_init() { // CHECK-LABEL: define{{.*}} void @_Z15string_nonconsti void string_nonconst(int n) { - // CHECK: icmp slt i{{32|64}} %{{[^ ]+}}, 4 + // CHECK: icmp {{s|u}}lt i{{32|64}} %{{[^ ]+}}, 4 // FIXME: Conditionally throw an exception rather than passing -1 to alloc function // CHECK: select // CHECK: %[[PTR:.*]] = call noalias noundef nonnull ptr @_Zna{{.}}(i{{32|64}} @@ -57,6 +89,34 @@ void string_nonconst(int n) { new char[n] { "abc" }; } 
+#if __cplusplus >= 202002L +// CHECKCXX20-LABEL: define{{.*}} void @_Z21string_nonconst_pareni +void string_nonconst_paren(int n) { + // CHECKCXX20: icmp {{s|u}}lt i{{32|64}} %{{[^ ]+}}, 4 + // FIXME: Conditionally throw an exception rather than passing -1 to alloc function + // CHECKCXX20: select + // CHECKCXX20: %[[PTR:.*]] = call noalias noundef nonnull ptr @_Zna{{.}}(i{{32|64}} + // CHECKCXX20: call void @llvm.memcpy{{.*}}(ptr align {{[0-9]+}} %[[PTR]], ptr align {{[0-9]+}} @[[ABC4]], i32 4, + // CHECKCXX20: %[[REST:.*]] = getelementptr inbounds i8, ptr %[[PTR]], i32 4 + // CHECKCXX20: %[[RESTSIZE:.*]] = sub {{.*}}, 4 + // CHECKCXX20: call void @llvm.memset{{.*}}(ptr align {{[0-9]+}} %[[REST]], i8 0, i{{32|64}} %[[RESTSIZE]], + new char[n]("abc"); +} + +// CHECKCXX20-LABEL: define{{.*}} void @_Z33string_nonconst_paren_extra_pareni +void string_nonconst_paren_extra_paren(int n) { + // CHECKCXX20: icmp {{s|u}}lt i{{32|64}} %{{[^ ]+}}, 4 + // FIXME: Conditionally throw an exception rather than passing -1 to alloc function + // CHECKCXX20: select + // CHECKCXX20: %[[PTR:.*]] = call noalias noundef nonnull ptr @_Zna{{.}}(i{{32|64}} + // CHECKCXX20: call void @llvm.memcpy{{.*}}(ptr align {{[0-9]+}} %[[PTR]], ptr align {{[0-9]+}} @[[ABC4]], i32 4, + // CHECKCXX20: %[[REST:.*]] = getelementptr inbounds i8, ptr %[[PTR]], i32 4 + // CHECKCXX20: %[[RESTSIZE:.*]] = sub {{.*}}, 4 + // CHECKCXX20: call void @llvm.memset{{.*}}(ptr align {{[0-9]+}} %[[REST]], i8 0, i{{32|64}} %[[RESTSIZE]], + new char[n](("abc")); +} +#endif + // CHECK-LABEL: define{{.*}} void @_Z12string_exactv void string_exact() { // CHECK-NOT: icmp @@ -66,6 +126,26 @@ void string_exact() { new char[4] { "abc" }; } +#if __cplusplus >= 202002L +// CHECKCXX20-LABEL: define{{.*}} void @_Z18string_exact_parenv +void string_exact_paren() { + // CHECKCXX20-NOT: icmp + // CHECKCXX20: %[[PTR:.*]] = call noalias noundef nonnull ptr @_Zna{{.}}(i{{32|64}} noundef 4) + // CHECKCXX20: call void @llvm.memcpy{{.*}}(ptr align {{[0-9]+}} %[[PTR]], ptr align {{[0-9]+}} @[[ABC4]], i32 4, + // CHECKCXX20-NOT: memset + new char[4]("abc"); +} + +// CHECKCXX20-LABEL: define{{.*}} void @_Z28string_exact_paren_extensionv +void string_exact_paren_extension() { + // CHECKCXX20-NOT: icmp + // CHECKCXX20: %[[PTR:.*]] = call noalias noundef nonnull ptr @_Zna{{.}}(i{{32|64}} noundef 4) + // CHECKCXX20: call void @llvm.memcpy{{.*}}(ptr align {{[0-9]+}} %[[PTR]], ptr align {{[0-9]+}} @[[ABC4]], i32 4, + // CHECKCXX20-NOT: memset + new char[4](__extension__ "abc"); +} +#endif + // CHECK-LABEL: define{{.*}} void @_Z17string_sufficientv void string_sufficient() { // CHECK-NOT: icmp @@ -76,6 +156,18 @@ void string_sufficient() { new char[15] { "abc" }; } +#if __cplusplus >= 202002L +// CHECKCXX20-LABEL: define{{.*}} void @_Z23string_sufficient_parenv +void string_sufficient_paren() { + // CHECKCXX20-NOT: icmp + // CHECKCXX20: %[[PTR:.*]] = call noalias noundef nonnull ptr @_Zna{{.}}(i{{32|64}} noundef 15) + // FIXME: For very large arrays, it would be preferable to emit a small copy and a memset. 
+ // CHECKCXX20: call void @llvm.memcpy{{.*}}(ptr align {{[0-9]+}} %[[PTR]], ptr align {{[0-9]+}} @[[ABC15]], i32 15, + // CHECKCXX20-NOT: memset + new char[15] { "abc" }; +} +#endif + // CHECK-LABEL: define{{.*}} void @_Z10aggr_exactv void aggr_exact() { // CHECK-NOT: icmp diff --git a/clang/test/SemaCXX/new-delete.cpp b/clang/test/SemaCXX/new-delete.cpp index 0270e42b7389f..4f78b7c71a91c 100644 --- a/clang/test/SemaCXX/new-delete.cpp +++ b/clang/test/SemaCXX/new-delete.cpp @@ -1,7 +1,8 @@ -// RUN: %clang_cc1 -fsyntax-only -verify %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++98 -// RUN: %clang_cc1 -fsyntax-only -verify %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++11 -// RUN: %clang_cc1 -fsyntax-only -verify %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++14 -// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx17 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null %std_cxx17- +// RUN: %clang_cc1 -fsyntax-only -verify=expected,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++98 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++11 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++14 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx17,precxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++17 +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx17,cxx20 %s -triple=i686-pc-linux-gnu -Wno-new-returns-null -std=c++20 // FIXME Location is (frontend) // cxx17-note@*:* {{candidate function not viable: requires 2 arguments, but 3 were provided}} @@ -26,6 +27,11 @@ struct U struct V : U { }; +struct W // cxx20-note 2{{candidate constructor}} +{ + int a; + int b; +}; inline void operator delete(void *); // expected-warning {{replacement function 'operator delete' cannot be declared 'inline'}} @@ -340,23 +346,28 @@ namespace PR5918 { // Look for template operator new overloads. 
namespace Test1 { void f() { - (void)new int[10](1, 2); // expected-error {{array 'new' cannot have initialization arguments}} + (void)new int[10](1, 2); // precxx20-error {{array 'new' cannot have initialization arguments}} typedef int T[10]; - (void)new T(1, 2); // expected-error {{array 'new' cannot have initialization arguments}} + (void)new T(1, 2); // precxx20-error {{array 'new' cannot have initialization arguments}} } template void g(unsigned i) { - (void)new T[1](i); // expected-error {{array 'new' cannot have initialization arguments}} + (void)new T[1](i); // precxx20-error {{array 'new' cannot have initialization arguments}} } template void h(unsigned i) { - (void)new T(i); // expected-error {{array 'new' cannot have initialization arguments}} + (void)new T(i); // precxx20-error {{array 'new' cannot have initialization arguments}} } template void h(unsigned); -template void h(unsigned); // expected-note {{in instantiation of function template specialization 'Test1::h' requested here}} +template void h(unsigned); // precxx20-note {{in instantiation of function template specialization 'Test1::h' requested here}} + +void i() { + new W[2](1, 2, 3); // precxx20-error {{array 'new' cannot have initialization arguments}} + // cxx20-error@-1 {{no viable conversion from 'int' to 'W'}} +} } @@ -556,7 +567,7 @@ namespace P12023 { int main() { - CopyCounter* f = new CopyCounter[10](CopyCounter()); // expected-error {{cannot have initialization arguments}} + CopyCounter* f = new CopyCounter[10](CopyCounter()); // precxx20-error {{cannot have initialization arguments}} return 0; } } From 0ac992e0ada60c670498ac3276150e1632ab0039 Mon Sep 17 00:00:00 2001 From: Alexandre Ganea Date: Thu, 18 Jan 2024 13:54:50 -0500 Subject: [PATCH 032/843] [openmp] Revert 64874e5ab5fd102344d43ac9465537a44130bf19 since it was committed by mistake and the PR (https://github.com/llvm/llvm-project/pull/77853) wasn't approved yet. 
--- openmp/runtime/src/kmp.h | 34 ------------------------ openmp/runtime/src/kmp_affinity.cpp | 14 ++++------ openmp/runtime/src/kmp_atomic.h | 12 --------- openmp/runtime/src/kmp_barrier.cpp | 7 +++-- openmp/runtime/src/kmp_dispatch.h | 8 ------ openmp/runtime/src/kmp_io.cpp | 3 +-- openmp/runtime/src/kmp_settings.cpp | 16 +++-------- openmp/runtime/src/kmp_wait_release.h | 2 ++ openmp/runtime/src/z_Windows_NT_util.cpp | 3 +-- 9 files changed, 15 insertions(+), 84 deletions(-) diff --git a/openmp/runtime/src/kmp.h b/openmp/runtime/src/kmp.h index 25fdb2fbccf3e..c287a31e0b1b5 100644 --- a/openmp/runtime/src/kmp.h +++ b/openmp/runtime/src/kmp.h @@ -502,11 +502,6 @@ static inline kmp_sched_t __kmp_sched_without_mods(kmp_sched_t kind) { } /* Type to keep runtime schedule set via OMP_SCHEDULE or omp_set_schedule() */ -#if KMP_COMPILER_MSVC -#pragma warning(push) -// warning C4201: nonstandard extension used: nameless struct/union -#pragma warning(disable : 4201) -#endif typedef union kmp_r_sched { struct { enum sched_type r_sched_type; @@ -514,9 +509,6 @@ typedef union kmp_r_sched { }; kmp_int64 sched; } kmp_r_sched_t; -#if KMP_COMPILER_MSVC -#pragma warning(pop) -#endif extern enum sched_type __kmp_sch_map[]; // map OMP 3.0 schedule types with our // internal schedule types @@ -610,8 +602,6 @@ typedef int PACKED_REDUCTION_METHOD_T; #pragma warning(push) #pragma warning(disable : 271 310) #endif -// Don't include everything related to NT status code, we'll do that explicitely -#define WIN32_NO_STATUS #include #if KMP_MSVC_COMPAT #pragma warning(pop) @@ -1907,20 +1897,12 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info32 { // Because of parm1-4 are used together, performance seems to be better // if they are on the same cache line (not measured though). -#if KMP_COMPILER_MSVC -#pragma warning(push) -// warning C4201: nonstandard extension used: nameless struct/union -#pragma warning(disable : 4201) -#endif struct KMP_ALIGN(32) { kmp_int32 parm1; kmp_int32 parm2; kmp_int32 parm3; kmp_int32 parm4; }; -#if KMP_COMPILER_MSVC -#pragma warning(pop) -#endif #if KMP_WEIGHTED_ITERATIONS_SUPPORTED kmp_uint32 pchunks; @@ -1954,20 +1936,12 @@ typedef struct KMP_ALIGN_CACHE dispatch_private_info64 { // b) all parm1-4 are in the same cache line. // Because of parm1-4 are used together, performance seems to be better // if they are in the same line (not measured though). 
-#if KMP_COMPILER_MSVC -#pragma warning(push) -// warning C4201: nonstandard extension used: nameless struct/union -#pragma warning(disable : 4201) -#endif struct KMP_ALIGN(32) { kmp_int64 parm1; kmp_int64 parm2; kmp_int64 parm3; kmp_int64 parm4; }; -#if KMP_COMPILER_MSVC -#pragma warning(pop) -#endif #if KMP_WEIGHTED_ITERATIONS_SUPPORTED kmp_uint64 pchunks; @@ -2249,11 +2223,6 @@ union KMP_ALIGN_CACHE kmp_barrier_union { typedef union kmp_barrier_union kmp_balign_t; -#if KMP_COMPILER_MSVC -#pragma warning(push) -// warning C4201: nonstandard extension used: nameless struct/union -#pragma warning(disable : 4201) -#endif /* Team barrier needs only non-volatile arrived counter */ union KMP_ALIGN_CACHE kmp_barrier_team_union { double b_align; /* use worst case alignment */ @@ -2270,9 +2239,6 @@ union KMP_ALIGN_CACHE kmp_barrier_team_union { #endif }; }; -#if KMP_COMPILER_MSVC -#pragma warning(pop) -#endif typedef union kmp_barrier_team_union kmp_balign_team_t; diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index 673b9f5793b86..7009730a49ba7 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -127,9 +127,8 @@ const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) { return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread)); case KMP_HW_PROC_GROUP: return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup)); - default: - return KMP_I18N_STR(Unknown); } + return KMP_I18N_STR(Unknown); } const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { @@ -158,9 +157,8 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { return ((plural) ? "threads" : "thread"); case KMP_HW_PROC_GROUP: return ((plural) ? "proc_groups" : "proc_group"); - default: - return ((plural) ? "unknowns" : "unknown"); } + return ((plural) ? 
"unknowns" : "unknown"); } const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { @@ -173,9 +171,8 @@ const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { case KMP_HW_CORE_TYPE_CORE: return "Intel(R) Core(TM) processor"; #endif - default: - return "unknown"; } + return "unknown"; } #if KMP_AFFINITY_SUPPORTED @@ -1249,10 +1246,9 @@ bool kmp_topology_t::filter_hw_subset() { #endif case KMP_HW_CORE_TYPE_UNKNOWN: return 0; - default: - KMP_ASSERT(0); - return 0; } + KMP_ASSERT(0); + return 0; } }; struct core_eff_indexer { diff --git a/openmp/runtime/src/kmp_atomic.h b/openmp/runtime/src/kmp_atomic.h index 1cf17018a958f..4fc51ee4289bd 100644 --- a/openmp/runtime/src/kmp_atomic.h +++ b/openmp/runtime/src/kmp_atomic.h @@ -1029,14 +1029,6 @@ void __kmpc_atomic_cmplx4_rd(kmp_cmplx32 *out, ident_t *id_ref, int gtid, kmp_cmplx32 __kmpc_atomic_cmplx4_rd(ident_t *id_ref, int gtid, kmp_cmplx32 *loc); #endif - -#if KMP_COMPILER_MSVC -#pragma warning(push) -// warning C4190: '__kmpc_atomic_cmplx8_rd' has C-linkage specified, but returns -// UDT '__kmp_cmplx64_t' which is incompatible with C -#pragma warning(disable : 4190) -#endif - kmp_cmplx64 __kmpc_atomic_cmplx8_rd(ident_t *id_ref, int gtid, kmp_cmplx64 *loc); kmp_cmplx80 __kmpc_atomic_cmplx10_rd(ident_t *id_ref, int gtid, @@ -1597,10 +1589,6 @@ kmp_cmplx128_a16_t __kmpc_atomic_cmplx16_a16_swp(ident_t *id_ref, int gtid, #endif #endif -#if KMP_COMPILER_MSVC -#pragma warning(pop) -#endif - // Capture routines for mixed types (RHS=float16) #if KMP_HAVE_QUAD diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index c58eb27b109c5..281b8e9c2883d 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -2405,10 +2405,9 @@ void __kmp_fork_barrier(int gtid, int tid) { #endif /* USE_ITT_BUILD */ if (team) -#ifdef KMP_DEBUG - KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, - (team != NULL) ? team->t.t_id : -1, tid)); -#endif + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, + (team != NULL) ? team->t.t_id : -1, tid)); + // th_team pointer only valid for primary thread here if (KMP_MASTER_TID(tid)) { #if USE_ITT_BUILD && USE_ITT_NOTIFY diff --git a/openmp/runtime/src/kmp_dispatch.h b/openmp/runtime/src/kmp_dispatch.h index 340fe5738c6a8..cf19eb52662ce 100644 --- a/openmp/runtime/src/kmp_dispatch.h +++ b/openmp/runtime/src/kmp_dispatch.h @@ -81,11 +81,6 @@ template struct dispatch_private_infoXX_template { /* parm[1-4] are used in different ways by different scheduling algorithms */ -#if KMP_COMPILER_MSVC -#pragma warning(push) -// warning C4201: nonstandard extension used: nameless struct/union -#pragma warning(disable : 4201) -#endif // KMP_ALIGN(32) ensures ( if the KMP_ALIGN macro is turned on ) // a) parm3 is properly aligned and // b) all parm1-4 are in the same cache line. 
@@ -97,9 +92,6 @@ template struct dispatch_private_infoXX_template { T parm3; T parm4; }; -#if KMP_COMPILER_MSVC -#pragma warning(pop) -#endif #if KMP_WEIGHTED_ITERATIONS_SUPPORTED UT pchunks; // total number of chunks for processes with p-core diff --git a/openmp/runtime/src/kmp_io.cpp b/openmp/runtime/src/kmp_io.cpp index 421bfe7e51f52..578e6e671cdf2 100644 --- a/openmp/runtime/src/kmp_io.cpp +++ b/openmp/runtime/src/kmp_io.cpp @@ -50,7 +50,6 @@ static HANDLE __kmp_stderr = NULL; static int __kmp_console_exists = FALSE; static kmp_str_buf_t __kmp_console_buf; -#if 0 static int is_console(void) { char buffer[128]; DWORD rc = 0; @@ -68,7 +67,6 @@ static int is_console(void) { } return rc > 0 || err == 0; } -#endif void __kmp_close_console(void) { /* wait until user presses return before closing window */ @@ -86,6 +84,7 @@ void __kmp_close_console(void) { static void __kmp_redirect_output(void) { __kmp_acquire_bootstrap_lock(&__kmp_console_lock); + (void)is_console; if (!__kmp_console_exists) { HANDLE ho; HANDLE he; diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index 6ee9a893b0f84..30a4c05fe76b3 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -873,10 +873,6 @@ static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name, case library_throughput: { value = "PASSIVE"; } break; - case library_none: - case library_serial: { - value = NULL; - } break; } } else { switch (__kmp_library) { @@ -889,9 +885,6 @@ static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name, case library_throughput: { value = "throughput"; } break; - case library_none: { - value = NULL; - } break; } } if (value != NULL) { @@ -2010,15 +2003,16 @@ static void __kmp_stg_print_foreign_threads_threadprivate(kmp_str_buf_t *buffer, static inline const char * __kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { switch (type) { + case KMP_HW_CORE_TYPE_UNKNOWN: + return "unknown"; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 case KMP_HW_CORE_TYPE_ATOM: return "intel_atom"; case KMP_HW_CORE_TYPE_CORE: return "intel_core"; #endif - default: - return "unknown"; } + return "unknown"; } #if KMP_AFFINITY_SUPPORTED @@ -4434,8 +4428,6 @@ static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer, case kmp_sch_auto: __kmp_str_buf_print(buffer, "%s,%d'\n", "auto", __kmp_chunk); break; - default: - break; } } else { switch (sched) { @@ -4461,8 +4453,6 @@ static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer, case kmp_sch_auto: __kmp_str_buf_print(buffer, "%s'\n", "auto"); break; - default: - break; } } } // __kmp_stg_print_omp_schedule diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h index 36a28e695dc20..3fcae5687d124 100644 --- a/openmp/runtime/src/kmp_wait_release.h +++ b/openmp/runtime/src/kmp_wait_release.h @@ -1038,6 +1038,7 @@ static inline void __kmp_null_resume_wrapper(kmp_info_t *thr) { case flag_oncore: __kmp_resume_oncore(gtid, RCAST(kmp_flag_oncore *, flag)); break; +#ifdef KMP_DEBUG case flag_unset: KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d is unset\n", type)); break; @@ -1045,6 +1046,7 @@ static inline void __kmp_null_resume_wrapper(kmp_info_t *thr) { KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d does not match any " "known flag type\n", type)); +#endif } } diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp index d75b48b2c1bcf..9e264ab45b87f 100644 --- 
a/openmp/runtime/src/z_Windows_NT_util.cpp +++ b/openmp/runtime/src/z_Windows_NT_util.cpp @@ -22,7 +22,6 @@ number of running threads in the system. */ #include // UNICODE_STRING -#undef WIN32_NO_STATUS #include #include #ifdef _MSC_VER @@ -1636,7 +1635,7 @@ int __kmp_get_load_balance(int max) { // threads on all cores. So, we don't consider the running threads of this // process. if (pid != 0) { - for (ULONG i = 0; i < num; ++i) { + for (int i = 0; i < num; ++i) { THREAD_STATE state = spi->Threads[i].State; // Count threads that have Ready or Running state. // !!! TODO: Why comment does not match the code??? From 1522333c3c7e458b2e3cb6d175a56741e23f4007 Mon Sep 17 00:00:00 2001 From: Ricky Zhou Date: Thu, 18 Jan 2024 10:55:30 -0800 Subject: [PATCH 033/843] [ThinLTO][DebugInfo] Emit full type definitions when importing anonymous types. (#78461) This fixes some cases of missing debuginfo caused by an interaction between: https://github.com/llvm/llvm-project/commit/f0d66559ea345db4b2116cae044aaf3399d7e829, which drops the identifier from a DICompositeType in the module containing its vtable. and https://github.com/llvm/llvm-project/commit/a61f5e379675732666744bcba25efbc9922e016a, which causes ThinLTO to import composite types as declarations when they have an identifier. If a virtual class's DICompositeType has no identifier due to the first change, and contains a nested anonymous type which does have an identifier, then the second change can cause ThinLTO to output the classes's DICompositeType as a type definition that links to a non-defining declaration for the nested type. Since the nested anonyous type does not have a name, debuggers are unable to find the definition for the declaration. Repro case: ``` cat > a.h < a.cc < main.cc < }; } ``` and dwarfdump -i a.out shows a DW_TAG_class_type for A with an incomplete union type (note that there is also a duplicate entry with the full union type that comes after). ``` < 1><0x0000001e> DW_TAG_class_type DW_AT_containing_type <0x0000001e> DW_AT_calling_convention DW_CC_pass_by_reference DW_AT_name (indexed string: 0x00000007)A DW_AT_byte_size 0x00000010 DW_AT_decl_file 0x00000001 /path/to/./a.h DW_AT_decl_line 0x00000001 ... < 2><0x0000002f> DW_TAG_member DW_AT_type <0x00000037> DW_AT_decl_file 0x00000001 /path/to/./a.h DW_AT_decl_line 0x00000007 DW_AT_data_member_location 8 < 2><0x00000037> DW_TAG_union_type DW_AT_export_symbols yes(1) DW_AT_calling_convention DW_CC_pass_by_value DW_AT_declaration yes(1) ``` This change works around this by making ThinLTO always import full definitions for anonymous types. --- llvm/lib/Bitcode/Reader/MetadataLoader.cpp | 37 +++++++++++-------- .../X86/debuginfo-compositetype-import.ll | 15 +++++--- 2 files changed, 30 insertions(+), 22 deletions(-) diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp index 910e97489dbbe..770eb83af17f9 100644 --- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp @@ -1615,27 +1615,32 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata( Metadata *Annotations = nullptr; auto *Identifier = getMDString(Record[15]); // If this module is being parsed so that it can be ThinLTO imported - // into another module, composite types only need to be imported - // as type declarations (unless full type definitions requested). - // Create type declarations up front to save memory. 
Also, buildODRType - // handles the case where this is type ODRed with a definition needed - // by the importing module, in which case the existing definition is - // used. - if (IsImporting && !ImportFullTypeDefinitions && Identifier && + // into another module, composite types only need to be imported as + // type declarations (unless full type definitions are requested). + // Create type declarations up front to save memory. This is only + // done for types which have an Identifier, and are therefore + // subject to the ODR. + // + // buildODRType handles the case where this is type ODRed with a + // definition needed by the importing module, in which case the + // existing definition is used. + // + // We always import full definitions for anonymous composite types, + // as without a name, debuggers cannot easily resolve a declaration + // to its definition. + if (IsImporting && !ImportFullTypeDefinitions && Identifier && Name && (Tag == dwarf::DW_TAG_enumeration_type || Tag == dwarf::DW_TAG_class_type || Tag == dwarf::DW_TAG_structure_type || Tag == dwarf::DW_TAG_union_type)) { Flags = Flags | DINode::FlagFwdDecl; - if (Name) { - // This is a hack around preserving template parameters for simplified - // template names - it should probably be replaced with a - // DICompositeType flag specifying whether template parameters are - // required on declarations of this type. - StringRef NameStr = Name->getString(); - if (!NameStr.contains('<') || NameStr.starts_with("_STN|")) - TemplateParams = getMDOrNull(Record[14]); - } + // This is a hack around preserving template parameters for simplified + // template names - it should probably be replaced with a + // DICompositeType flag specifying whether template parameters are + // required on declarations of this type. 
+ StringRef NameStr = Name->getString(); + if (!NameStr.contains('<') || NameStr.starts_with("_STN|")) + TemplateParams = getMDOrNull(Record[14]); } else { BaseType = getDITypeRefOrNull(Record[6]); OffsetInBits = Record[9]; diff --git a/llvm/test/ThinLTO/X86/debuginfo-compositetype-import.ll b/llvm/test/ThinLTO/X86/debuginfo-compositetype-import.ll index 22e8ec4418eca..133930ce6c037 100644 --- a/llvm/test/ThinLTO/X86/debuginfo-compositetype-import.ll +++ b/llvm/test/ThinLTO/X86/debuginfo-compositetype-import.ll @@ -16,7 +16,8 @@ ; CHECK: distinct !DICompositeType(tag: DW_TAG_enumeration_type, name: "enum", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: 50, size: 32, flags: DIFlagFwdDecl, identifier: "enum") ; CHECK: distinct !DICompositeType(tag: DW_TAG_class_type, name: "class<>", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: 728, size: 448, flags: DIFlagFwdDecl, identifier: "class") ; CHECK: distinct !DICompositeType(tag: DW_TAG_structure_type, name: "struct", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: 309, size: 128, flags: DIFlagFwdDecl, templateParams: !{{[0-9]+}}, identifier: "struct_templ_simplified") -; CHECK: distinct !DICompositeType(tag: DW_TAG_union_type, file: !{{[0-9]+}}, line: 115, size: 384, flags: DIFlagFwdDecl, identifier: "union") +; CHECK: distinct !DICompositeType(tag: DW_TAG_union_type, name: "union", file: !{{[0-9]+}}, line: 115, size: 384, flags: DIFlagFwdDecl, identifier: "union") +; CHECK: distinct !DICompositeType(tag: DW_TAG_union_type, file: !{{[0-9]+}}, line: 1, elements: !{{[0-9]+}}, identifier: "anon_union") ; CHECK: distinct !DICompositeType(tag: DW_TAG_structure_type, name: "struct<>", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: 309, size: 128, flags: DIFlagFwdDecl, identifier: "struct_templ") ; CHECK: distinct !DICompositeType(tag: DW_TAG_structure_type, name: "_STN|struct|<>", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: 309, size: 128, flags: DIFlagFwdDecl, templateParams: !{{[0-9]+}}, identifier: "struct_templ_simplified_mangled") @@ -32,7 +33,8 @@ ; FULL: distinct !DICompositeType(tag: DW_TAG_enumeration_type, name: "enum", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: 50, size: 32, elements: !{{[0-9]+}}, identifier: "enum") ; FULL: distinct !DICompositeType(tag: DW_TAG_class_type, name: "class<>", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: 728, size: 448, elements: !{{[0-9]+}}, identifier: "class") ; FULL: distinct !DICompositeType(tag: DW_TAG_structure_type, name: "struct", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: 309, baseType: !{{[0-9]+}}, size: 128, offset: 64, elements: !{{[0-9]+}}, vtableHolder: !{{[0-9]+}}, templateParams: !{{[0-9]+}}, identifier: "struct_templ_simplified") -; FULL: distinct !DICompositeType(tag: DW_TAG_union_type, file: !{{[0-9]+}}, line: 115, size: 384, elements: !{{[0-9]+}}, identifier: "union") +; FULL: distinct !DICompositeType(tag: DW_TAG_union_type, name: "union", file: !{{[0-9]+}}, line: 115, size: 384, elements: !{{[0-9]+}}, identifier: "union") +; FULL: distinct !DICompositeType(tag: DW_TAG_union_type, file: !{{[0-9]+}}, line: 1, elements: !{{[0-9]+}}, identifier: "anon_union") ; FULL: distinct !DICompositeType(tag: DW_TAG_structure_type, name: "struct<>", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: 309, baseType: !{{[0-9]+}}, size: 128, offset: 64, elements: !{{[0-9]+}}, vtableHolder: !{{[0-9]+}}, templateParams: !{{[0-9]+}}, identifier: "struct_templ") ; FULL: distinct !DICompositeType(tag: DW_TAG_structure_type, name: "_STN|struct|<>", scope: !{{[0-9]+}}, file: !{{[0-9]+}}, line: 309, baseType: !{{[0-9]+}}, 
size: 128, offset: 64, elements: !{{[0-9]+}}, vtableHolder: !{{[0-9]+}}, templateParams: !{{[0-9]+}}, identifier: "struct_templ_simplified_mangled") @@ -59,10 +61,11 @@ entry: !5 = !{} !6 = distinct !DISubprogram(name: "foo", scope: !1, file: !1, line: 1, type: !7, isLocal: false, isDefinition: true, scopeLine: 2, isOptimized: false, unit: !0, retainedNodes: !5) !7 = !DISubroutineType(types: !8) -!8 = !{!9, !10, !11, !12, !13, !14} +!8 = !{!9, !10, !11, !12, !13, !14, !15} !9 = !DICompositeType(tag: DW_TAG_enumeration_type, name: "enum", scope: !1, file: !1, line: 50, size: 32, elements: !5, identifier: "enum") !10 = !DICompositeType(tag: DW_TAG_class_type, name: "class<>", scope: !1, file: !1, line: 728, size: 448, elements: !5, identifier: "class") !11 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "struct", scope: !1, file: !1, line: 309, baseType: !10, size: 128, offset: 64, elements: !5, vtableHolder: !10, templateParams: !5, identifier: "struct_templ_simplified") -!12 = distinct !DICompositeType(tag: DW_TAG_union_type, file: !1, line: 115, size: 384, elements: !5, identifier: "union") -!13 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "struct<>", scope: !1, file: !1, line: 309, baseType: !10, size: 128, offset: 64, elements: !5, vtableHolder: !10, templateParams: !5, identifier: "struct_templ") -!14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "_STN|struct|<>", scope: !1, file: !1, line: 309, baseType: !10, size: 128, offset: 64, elements: !5, vtableHolder: !10, templateParams: !5, identifier: "struct_templ_simplified_mangled") +!12 = distinct !DICompositeType(tag: DW_TAG_union_type, name: "union", file: !1, line: 115, size: 384, elements: !5, identifier: "union") +!13 = distinct !DICompositeType(tag: DW_TAG_union_type, file: !1, line: 1, elements: !5, identifier: "anon_union") +!14 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "struct<>", scope: !1, file: !1, line: 309, baseType: !10, size: 128, offset: 64, elements: !5, vtableHolder: !10, templateParams: !5, identifier: "struct_templ") +!15 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "_STN|struct|<>", scope: !1, file: !1, line: 309, baseType: !10, size: 128, offset: 64, elements: !5, vtableHolder: !10, templateParams: !5, identifier: "struct_templ_simplified_mangled") From fa6c3df6add2559ee6f31762e7667cedc6e31a0b Mon Sep 17 00:00:00 2001 From: Peter Klausler <35819229+klausler@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:05:04 -0800 Subject: [PATCH 034/843] [flang] Don't use \uXXXX encodings unless \-escapes are enabled (#78326) Don't put \uXXXX escapes into the cooked character stream while prescanning; it should always be UTF-8. 
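As a rough illustration of the intended behavior, here is a self-contained sketch
(the names and structure are illustrative stand-ins, not flang's actual
`EmitQuotedChar` implementation):

```cpp
#include <cstdio>

// Hypothetical stand-in for the prescanner's byte emitter.
static void emitByte(unsigned c) { std::putchar(static_cast<int>(c)); }

// Sketch of the corrected policy: \uXXXX / \UXXXXXXXX forms are produced
// only when backslash escapes are enabled; otherwise non-ASCII characters
// go into the cooked stream as raw UTF-8 bytes.
void emitQuotedChar(char32_t ch, bool backslashEscapes, bool useHexEscapes) {
  unsigned c = static_cast<unsigned>(ch);
  if (c <= 0x7f) {
    emitByte(c); // plain ASCII passes through unchanged
  } else if (backslashEscapes && useHexEscapes) {
    std::printf(c > 0xffff ? "\\U%08X" : "\\u%04X", c);
  } else if (c <= 0x7ff) { // two-byte UTF-8 sequence
    emitByte(0xc0 | (c >> 6));
    emitByte(0x80 | (c & 0x3f));
  } else if (c <= 0xffff) { // three-byte UTF-8 sequence
    emitByte(0xe0 | (c >> 12));
    emitByte(0x80 | ((c >> 6) & 0x3f));
    emitByte(0x80 | (c & 0x3f));
  } else { // four-byte UTF-8 sequence
    emitByte(0xf0 | (c >> 18));
    emitByte(0x80 | ((c >> 12) & 0x3f));
    emitByte(0x80 | ((c >> 6) & 0x3f));
    emitByte(0x80 | (c & 0x3f));
  }
}

int main() {
  emitQuotedChar(U'\uD55C', false, true); // emits raw UTF-8, not "\uD55C"
  std::putchar('\n');
  return 0;
}
```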
--- flang/include/flang/Parser/characters.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flang/include/flang/Parser/characters.h b/flang/include/flang/Parser/characters.h index dae0d3e2a0cff..df188d674b9ee 100644 --- a/flang/include/flang/Parser/characters.h +++ b/flang/include/flang/Parser/characters.h @@ -237,7 +237,7 @@ void EmitQuotedChar(char32_t ch, const NORMAL &emit, const INSERTED &insert, }}; if (ch <= 0x7f) { emitOneByte(ch); - } else if (useHexadecimalEscapeSequences) { + } else if (backslashEscapes && useHexadecimalEscapeSequences) { insert('\\'); insert('u'); if (ch > 0xffff) { From a58ad3e2a33eafc821714f9d27b74279fb4d607c Mon Sep 17 00:00:00 2001 From: Haopeng Liu <153236845+haopliu@users.noreply.github.com> Date: Thu, 18 Jan 2024 11:10:16 -0800 Subject: [PATCH 035/843] [clang] Add size filter for stack auto init (#74777) Add a clang flag, "-ftrivial-auto-var-init-max-size=" so that clang skips auto-init a variable if the auto-init memset size exceeds the flag setting (in bytes). Note that this skipping doesn't apply to runtime-sized variables like VLA. Considerations: "__attribute__((uninitialized))" can be used to manually opt variables out. However, there are thousands of large variables (e.g., >=1KB, most of them are arrays and used as buffers) in big codebase. Manually opting them out one by one is not efficient. --- .../clang/Basic/DiagnosticDriverKinds.td | 7 ++ clang/include/clang/Basic/LangOptions.def | 2 + clang/include/clang/Driver/Options.td | 4 + clang/lib/CodeGen/CGDecl.cpp | 14 ++++ clang/lib/Driver/ToolChains/Clang.cpp | 14 ++++ .../CodeGenCXX/auto-var-init-max-size.cpp | 83 +++++++++++++++++++ clang/test/Driver/clang_f_opts.c | 13 +++ 7 files changed, 137 insertions(+) create mode 100644 clang/test/CodeGenCXX/auto-var-init-max-size.cpp diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td index 0fe8798dfb301..090b169a0e724 100644 --- a/clang/include/clang/Basic/DiagnosticDriverKinds.td +++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td @@ -656,6 +656,13 @@ def err_drv_trivial_auto_var_init_stop_after_missing_dependency : Error< def err_drv_trivial_auto_var_init_stop_after_invalid_value : Error< "'-ftrivial-auto-var-init-stop-after=*' only accepts positive integers">; +def err_drv_trivial_auto_var_init_max_size_missing_dependency : Error< + "'-ftrivial-auto-var-init-max-size=*' is used without " + "'-ftrivial-auto-var-init=zero' or '-ftrivial-auto-var-init=pattern'">; + +def err_drv_trivial_auto_var_init_max_size_invalid_value : Error< + "'-ftrivial-auto-var-init-max-size=*' only accepts positive integers (in bytes)">; + def warn_drv_msp430_hwmult_unsupported : Warning< "the given MCU does not support hardware multiply, but '-mhwmult' is set to " "%0">, InGroup; diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 0428b70c60206..04ebffbcba69d 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -378,6 +378,8 @@ ENUM_LANGOPT(TrivialAutoVarInit, TrivialAutoVarInitKind, 2, TrivialAutoVarInitKi "trivial automatic variable initialization") VALUE_LANGOPT(TrivialAutoVarInitStopAfter, 32, 0, "stop trivial automatic variable initialization after the specified number of instances. Must be greater than 0.") +VALUE_LANGOPT(TrivialAutoVarInitMaxSize, 32, 0, + "stop trivial automatic variable initialization if var size exceeds the specified size (in bytes). 
Must be greater than 0.") ENUM_LANGOPT(SignedOverflowBehavior, SignedOverflowBehaviorTy, 2, SOB_Undefined, "signed integer overflow handling") ENUM_LANGOPT(ThreadModel , ThreadModelKind, 2, ThreadModelKind::POSIX, "Thread Model") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index e4fdad8265c86..d2e6c3ff721c2 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3664,6 +3664,10 @@ def ftrivial_auto_var_init_stop_after : Joined<["-"], "ftrivial-auto-var-init-st Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, HelpText<"Stop initializing trivial automatic stack variables after the specified number of instances">, MarshallingInfoInt>; +def ftrivial_auto_var_init_max_size : Joined<["-"], "ftrivial-auto-var-init-max-size=">, Group, + Visibility<[ClangOption, CC1Option, CLOption, DXCOption]>, + HelpText<"Stop initializing trivial automatic stack variables if var size exceeds the specified number of instances (in bytes)">, + MarshallingInfoInt>; def fstandalone_debug : Flag<["-"], "fstandalone-debug">, Group, Visibility<[ClangOption, CLOption, DXCOption]>, HelpText<"Emit full debug info for all types used by the program">; diff --git a/clang/lib/CodeGen/CGDecl.cpp b/clang/lib/CodeGen/CGDecl.cpp index a5da0aa2965a0..bbe14ef4c1724 100644 --- a/clang/lib/CodeGen/CGDecl.cpp +++ b/clang/lib/CodeGen/CGDecl.cpp @@ -1759,20 +1759,34 @@ void CodeGenFunction::emitZeroOrPatternForAutoVarInit(QualType type, const VarDecl &D, Address Loc) { auto trivialAutoVarInit = getContext().getLangOpts().getTrivialAutoVarInit(); + auto trivialAutoVarInitMaxSize = + getContext().getLangOpts().TrivialAutoVarInitMaxSize; CharUnits Size = getContext().getTypeSizeInChars(type); bool isVolatile = type.isVolatileQualified(); if (!Size.isZero()) { + // We skip auto-init variables by their alloc size. Take this as an example: + // "struct Foo {int x; char buff[1024];}" Assume the max-size flag is 1023. + // All Foo type variables will be skipped. Ideally, we only skip the buff + // array and still auto-init X in this example. + // TODO: Improve the size filtering to by member size. 
+ auto allocSize = CGM.getDataLayout().getTypeAllocSize(Loc.getElementType()); switch (trivialAutoVarInit) { case LangOptions::TrivialAutoVarInitKind::Uninitialized: llvm_unreachable("Uninitialized handled by caller"); case LangOptions::TrivialAutoVarInitKind::Zero: if (CGM.stopAutoInit()) return; + if (trivialAutoVarInitMaxSize > 0 && + allocSize > trivialAutoVarInitMaxSize) + return; emitStoresForZeroInit(CGM, D, Loc, isVolatile, Builder); break; case LangOptions::TrivialAutoVarInitKind::Pattern: if (CGM.stopAutoInit()) return; + if (trivialAutoVarInitMaxSize > 0 && + allocSize > trivialAutoVarInitMaxSize) + return; emitStoresForPatternInit(CGM, D, Loc, isVolatile, Builder); break; } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 6e4fbe6816810..fead2e884030e 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3535,6 +3535,20 @@ static void RenderTrivialAutoVarInitOptions(const Driver &D, CmdArgs.push_back( Args.MakeArgString("-ftrivial-auto-var-init-stop-after=" + Val)); } + + if (Arg *A = Args.getLastArg(options::OPT_ftrivial_auto_var_init_max_size)) { + if (!Args.hasArg(options::OPT_ftrivial_auto_var_init) || + StringRef( + Args.getLastArg(options::OPT_ftrivial_auto_var_init)->getValue()) == + "uninitialized") + D.Diag(diag::err_drv_trivial_auto_var_init_max_size_missing_dependency); + A->claim(); + StringRef Val = A->getValue(); + if (std::stoi(Val.str()) <= 0) + D.Diag(diag::err_drv_trivial_auto_var_init_max_size_invalid_value); + CmdArgs.push_back( + Args.MakeArgString("-ftrivial-auto-var-init-max-size=" + Val)); + } } static void RenderOpenCLOptions(const ArgList &Args, ArgStringList &CmdArgs, diff --git a/clang/test/CodeGenCXX/auto-var-init-max-size.cpp b/clang/test/CodeGenCXX/auto-var-init-max-size.cpp new file mode 100644 index 0000000000000..ef38b8227a9a1 --- /dev/null +++ b/clang/test/CodeGenCXX/auto-var-init-max-size.cpp @@ -0,0 +1,83 @@ +// Pattern related max size tests: 1, 1024, 4096 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ftrivial-auto-var-init=pattern -ftrivial-auto-var-init-max-size=1 %s -emit-llvm -o - | FileCheck -check-prefix=PATTERN-COMMON -check-prefix=PATTERN-MAX-1 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ftrivial-auto-var-init=pattern -ftrivial-auto-var-init-max-size=1024 %s -emit-llvm -o - | FileCheck -check-prefix=PATTERN-COMMON -check-prefix=PATTERN-MAX-1024 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ftrivial-auto-var-init=pattern -ftrivial-auto-var-init-max-size=4096 %s -emit-llvm -o - | FileCheck -check-prefix=PATTERN-COMMON -check-prefix=PATTERN-MAX-4096 %s +// +// Zero related max size tests: 1, 1024, 4096 +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ftrivial-auto-var-init=zero -ftrivial-auto-var-init-max-size=1 %s -emit-llvm -o - | FileCheck -check-prefix=ZERO-COMMON -check-prefix=ZERO-MAX-1 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ftrivial-auto-var-init=zero -ftrivial-auto-var-init-max-size=1024 %s -emit-llvm -o - | FileCheck -check-prefix=ZERO-COMMON -check-prefix=ZERO-MAX-1024 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -ftrivial-auto-var-init=zero -ftrivial-auto-var-init-max-size=4096 %s -emit-llvm -o - | FileCheck -check-prefix=ZERO-COMMON -check-prefix=ZERO-MAX-4096 %s + +struct Foo { + int x; // we should try to make sure X is initialized. 
+ char buff[1024]; // this one is fine to skip +}; + +int foo(unsigned n) { + bool var_size_1; + long var_size_8 = 123; + void *var_size_8p; + int var_size_1024[256]; + Foo var_size_1028; + int var_size_4096[1024]; + // VLA, non-constant size + int var_vla[n]; + // builtin, non-constant size + var_size_8p = __builtin_alloca(sizeof(unsigned long long) * n); + // There are 8 variables: var_size_1, var_size_8, var_size_8p, var_size_1024, + // var_size_1028, var_size_4096, var_vla, and a builtin anonymous var ("%5"). + // - COMMON (auto-init regardless of the max size): "var_vla", and "%5" + // - Max size 1: "var_size_1" + // - Max size 1024: "var_size_1", "var_size_8", "var_size_8p", "var_size_1024" + // - Max size 4096: "var_size_1", "var_size_8", "var_size_8p", "var_size_1024", "var_size_1028", "var_size_4096" + // + // PATTERN-MAX-1: store i8 -86, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1-NOT: store i64 -6148914691236517206, ptr %var_size_8, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1-NOT: store ptr inttoptr (i64 -6148914691236517206 to ptr), ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 -86, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 -86, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_4096, i8 -86, i64 4096, i1 false), !annotation [[AUTO_INIT:!.+]] + + // PATTERN-MAX-1024: store i8 -86, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1024: store i64 -6148914691236517206, ptr %var_size_8, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1024: store ptr inttoptr (i64 -6148914691236517206 to ptr), ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1024: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 -86, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1024-NOT: call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 -86, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-1024-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_4096, i8 -86, i64 4096, i1 false), !annotation [[AUTO_INIT:!.+]] + + // PATTERN-MAX-4096: store i8 -86, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-4096: store i64 -6148914691236517206, ptr %var_size_8, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-4096: store ptr inttoptr (i64 -6148914691236517206 to ptr), ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-4096: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 -86, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-4096: call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 -86, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] + // PATTERN-MAX-4096: call void @llvm.memset.p0.i64(ptr align 16 %var_size_4096, i8 -86, i64 4096, i1 false), !annotation [[AUTO_INIT:!.+]] + + // PATTERN-COMMON: call void @llvm.memcpy.p0.p0.i64(ptr align 4 %vla.cur, ptr align 4 @__const._Z3fooj.var_vla, i64 4, i1 false), !annotation [[AUTO_INIT:!.+]] + // PATTERN-COMMON: call void @llvm.memset.p0.i64(ptr align 16 %5, i8 -86, i64 %mul, i1 false), !annotation [[AUTO_INIT:!.+]] + + // ZERO-MAX-1: store i8 0, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1-NOT: store i64 0, ptr %var_size_8, align 
8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1-NOT: store ptr null, ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 0, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 0, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_4096, i8 0, i64 4096, i1 false), !annotation [[AUTO_INIT:!.+]] + + // ZERO-MAX-1024: store i8 0, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1024: store i64 0, ptr %var_size_8, align 8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1024: store ptr null, ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1024: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 0, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1024-NOT: call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 0, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-1024-NOT: call void @llvm.memset.p0.i64(ptr align 16 %var_size_4096, i8 0, i64 4096, i1 false), !annotation [[AUTO_INIT:!.+]] + + // ZERO-MAX-4096: store i8 0, ptr %var_size_1, align 1, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-4096: store i64 0, ptr %var_size_8, align 8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-4096: store ptr null, ptr %var_size_8p, align 8, !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-4096: call void @llvm.memset.p0.i64(ptr align 16 %var_size_1024, i8 0, i64 1024, i1 false), !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-4096: call void @llvm.memset.p0.i64(ptr align 4 %var_size_1028, i8 0, i64 1028, i1 false), !annotation [[AUTO_INIT:!.+]] + // ZERO-MAX-4096: call void @llvm.memset.p0.i64(ptr align 16 %var_size_4096, i8 0, i64 4096, i1 false), !annotation [[AUTO_INIT:!.+]] + + // ZERO-COMMON: call void @llvm.memset.p0.i64(ptr align 16 %vla, i8 0, i64 %3, i1 false), !annotation [[AUTO_INIT:!.+]] + // ZERO-COMMON: call void @llvm.memset.p0.i64(ptr align 16 %5, i8 0, i64 %mul, i1 false), !annotation [[AUTO_INIT:!.+]] + + return 0; +} diff --git a/clang/test/Driver/clang_f_opts.c b/clang/test/Driver/clang_f_opts.c index c8b44e056e58f..33e4aa8386c3b 100644 --- a/clang/test/Driver/clang_f_opts.c +++ b/clang/test/Driver/clang_f_opts.c @@ -585,6 +585,19 @@ // CHECK-TRIVIAL-PATTERN-STOP-AFTER-INVALID-VALUE: only accepts positive integers // CHECK-TRIVIAL-ZERO-STOP-AFTER-INVALID-VALUE: only accepts positive integers +// RUN: %clang -### -S -ftrivial-auto-var-init=pattern -ftrivial-auto-var-init-max-size=1024 %s 2>&1 | FileCheck -check-prefix=CHECK-TRIVIAL-PATTERN-MAX-SIZE %s +// RUN: %clang -### -S -ftrivial-auto-var-init=zero -ftrivial-auto-var-init-max-size=1024 %s 2>&1 | FileCheck -check-prefix=CHECK-TRIVIAL-ZERO-MAX-SIZE %s +// RUN: not %clang -### -S -ftrivial-auto-var-init-max-size=1024 %s 2>&1 | FileCheck -check-prefix=CHECK-TRIVIAL-MAX-SIZE-MISSING-DEPENDENCY %s +// RUN: not %clang -### -S -ftrivial-auto-var-init=pattern -ftrivial-auto-var-init-max-size=0 %s 2>&1 | FileCheck -check-prefix=CHECK-TRIVIAL-PATTERN-MAX-SIZE-INVALID-VALUE %s +// RUN: not %clang -### -S -ftrivial-auto-var-init=zero -ftrivial-auto-var-init-max-size=0 %s 2>&1 | FileCheck -check-prefix=CHECK-TRIVIAL-ZERO-MAX-SIZE-INVALID-VALUE %s +// CHECK-TRIVIAL-PATTERN-MAX-SIZE-NOT: is used without '-ftrivial-auto-var-init' +// CHECK-TRIVIAL-PATTERN-MAX-SIZE-NOT: only accepts positive integers (in bytes) +// 
CHECK-TRIVIAL-ZERO-MAX-SIZE-NOT: is used without '-ftrivial-auto-var-init'
+// CHECK-TRIVIAL-ZERO-MAX-SIZE-NOT: only accepts positive integers (in bytes)
+// CHECK-TRIVIAL-MAX-SIZE-MISSING-DEPENDENCY: used without '-ftrivial-auto-var-init=zero' or
+// CHECK-TRIVIAL-PATTERN-MAX-SIZE-INVALID-VALUE: only accepts positive integers (in bytes)
+// CHECK-TRIVIAL-ZERO-MAX-SIZE-INVALID-VALUE: only accepts positive integers (in bytes)
+
 // RUN: %clang -### -S -fno-temp-file %s 2>&1 | FileCheck -check-prefix=CHECK-NO-TEMP-FILE %s
 // CHECK-NO-TEMP-FILE: "-fno-temp-file"

From cfc9f3695fafddfa6f537d6dac702afab00c3cce Mon Sep 17 00:00:00 2001
From: Alexandre Ganea
Date: Thu, 18 Jan 2024 14:12:51 -0500
Subject: [PATCH 036/843] Revert "[lldb] Silence narrowing conversion warning
 with MSVC"

This reverts commit cb67dc19256565d15f6bed0e9808f4026ca04995.

---
 lldb/unittests/DataFormatter/StringPrinterTests.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/unittests/DataFormatter/StringPrinterTests.cpp b/lldb/unittests/DataFormatter/StringPrinterTests.cpp
index 74136b6763eca..ac4116139beb2 100644
--- a/lldb/unittests/DataFormatter/StringPrinterTests.cpp
+++ b/lldb/unittests/DataFormatter/StringPrinterTests.cpp
@@ -72,7 +72,7 @@ TEST(StringPrinterTests, CxxASCII) {
   EXPECT_EQ(fmt("🥑"), QUOTE("🥑"));

   // Octal (\nnn), hex (\xnn), extended octal (\unnnn or \Unnnnnnnn).
-  EXPECT_EQ(fmt(L"\uD55C"), QUOTE(L"\uD55C"));
+  EXPECT_EQ(fmt("\uD55C"), QUOTE("\uD55C"));
   EXPECT_EQ(fmt("\U00010348"), QUOTE("\U00010348"));
   EXPECT_EQ(fmt("\376"), QUOTE(R"(\xfe)")); // \376 is 254 in decimal.

From 911289a62be068e56a934e069e8e03112a804542 Mon Sep 17 00:00:00 2001
From: Usama Hameed
Date: Thu, 18 Jan 2024 11:13:56 -0800
Subject: [PATCH 037/843] [CompilerRT][ASan] Add new substitutions for tests
 while using lto to (#78523)

explicitly pass the libLTO path. This fixes a failure in swift-ci where an
older libLTO was being picked up from the system instead, causing issues.
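For reference, a test opts in with RUN lines like the `odr-lto.cpp` update
below, e.g. `%clangxx_asan_lto %t-1.o %t-2.o -o %t -flto`. The substitution
prepends `config.lto_flags` from the lit configuration (on Darwin this is
typically a `-Wl,-lto_library,<path to the just-built libLTO.dylib>` spelling,
though the exact flags are configuration-dependent), so the freshly built
libLTO is used rather than whatever the system provides.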
rdar://117474861 --- compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp | 2 +- compiler-rt/test/asan/lit.cfg.py | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp b/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp index 55953d33940a8..90c16776a63b1 100644 --- a/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp +++ b/compiler-rt/test/asan/TestCases/Darwin/odr-lto.cpp @@ -5,7 +5,7 @@ // RUN: %clangxx_asan -DPART=0 -c %s -o %t-1.o -flto -mllvm -asan-use-private-alias // RUN: %clangxx_asan -DPART=1 -c %s -o %t-2.o -flto -mllvm -asan-use-private-alias -// RUN: %clangxx_asan %t-1.o %t-2.o -o %t -flto -mlinker-version=133 +// RUN: %clangxx_asan_lto %t-1.o %t-2.o -o %t -flto -mlinker-version=133 // RUN: %run %t 2>&1 | FileCheck %s #include diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py index d930346602125..83b3cbe789cac 100644 --- a/compiler-rt/test/asan/lit.cfg.py +++ b/compiler-rt/test/asan/lit.cfg.py @@ -101,14 +101,24 @@ def get_required_attr(config, attr_name): config.available_features.add(win_runtime_feature) -def build_invocation(compile_flags): - return " " + " ".join([config.clang] + compile_flags) + " " +def build_invocation(compile_flags, with_lto=False): + lto_flags = [] + if with_lto and config.lto_supported: + lto_flags += config.lto_flags + + return " " + " ".join([config.clang] + lto_flags + compile_flags) + " " config.substitutions.append(("%clang ", build_invocation(target_cflags))) config.substitutions.append(("%clangxx ", build_invocation(target_cxxflags))) config.substitutions.append(("%clang_asan ", build_invocation(clang_asan_cflags))) config.substitutions.append(("%clangxx_asan ", build_invocation(clang_asan_cxxflags))) +config.substitutions.append( + ("%clang_asan_lto ", build_invocation(clang_asan_cflags, True)) +) +config.substitutions.append( + ("%clangxx_asan_lto ", build_invocation(clang_asan_cxxflags, True)) +) if config.asan_dynamic: if config.host_os in ["Linux", "FreeBSD", "NetBSD", "SunOS"]: shared_libasan_path = os.path.join( From a7588bb9bab43420f1c2642c80489f74af88f855 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Thu, 18 Jan 2024 22:22:06 +0300 Subject: [PATCH 038/843] [clang] Pass `-n` to llvm-cxxfilt in even more codegen tests This is another follow-up to f4fbbebb5edcaad459ce154c011f71fc38fe4052 and 30da0f5a359ab4a684c5fdf0f4dbed20bae10f99 --- clang/test/CXX/drs/dr1807.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/test/CXX/drs/dr1807.cpp b/clang/test/CXX/drs/dr1807.cpp index 5544e4695f8d0..81e1e8ca640e3 100644 --- a/clang/test/CXX/drs/dr1807.cpp +++ b/clang/test/CXX/drs/dr1807.cpp @@ -1,7 +1,7 @@ -// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt | FileCheck %s --check-prefixes CHECK,CXX98 -// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 -// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 -// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt | FileCheck %s --check-prefixes CHECK,SINCE-CXX11 +// RUN: %clang_cc1 -std=c++98 %s -triple x86_64-linux-gnu 
-emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,CXX98
+// RUN: %clang_cc1 -std=c++11 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11
+// RUN: %clang_cc1 -std=c++14 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11
+// RUN: %clang_cc1 -std=c++17 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11
 // RUN: %clang_cc1 -std=c++20 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11
 // RUN: %clang_cc1 -std=c++23 %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11
 // RUN: %clang_cc1 -std=c++2c %s -triple x86_64-linux-gnu -emit-llvm -o - -fexceptions -fcxx-exceptions -pedantic-errors | llvm-cxxfilt -n | FileCheck %s --check-prefixes CHECK,SINCE-CXX11

From 99cae9a44fca4cfbd6ee82f196051cbdf6571fa1 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy
Date: Thu, 18 Jan 2024 20:36:27 +0100
Subject: [PATCH 039/843] [flang] Fix seg fault in
 `CodeGenAction::executeAction()` (#78269)

---
 flang/lib/Frontend/FrontendActions.cpp        |   5 +
 flang/unittests/Frontend/CMakeLists.txt       |   1 +
 .../unittests/Frontend/CodeGenActionTest.cpp  | 109 ++++++++++++++++++
 3 files changed, 115 insertions(+)
 create mode 100644 flang/unittests/Frontend/CodeGenActionTest.cpp

diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 74e3992d5ab62..65c4df7388f97 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -1202,6 +1202,11 @@ void CodeGenAction::executeAction() {
   if (!llvmModule)
     generateLLVMIR();

+  // If generating the LLVM module failed, abort! No need for further error
+  // reporting since generateLLVMIR() does this already.
+  if (!llvmModule)
+    return;
+
   // Set the triple based on the targetmachine (this comes compiler invocation
   // and the command-line target option if specified, or the default if not
   // given on the command-line).
diff --git a/flang/unittests/Frontend/CMakeLists.txt b/flang/unittests/Frontend/CMakeLists.txt
index 79a394f161ed1..3bcc37bed7f6d 100644
--- a/flang/unittests/Frontend/CMakeLists.txt
+++ b/flang/unittests/Frontend/CMakeLists.txt
@@ -4,6 +4,7 @@ set(LLVM_LINK_COMPONENTS
 )

 add_flang_unittest(FlangFrontendTests
+  CodeGenActionTest.cpp
   CompilerInstanceTest.cpp
   FrontendActionTest.cpp
 )
diff --git a/flang/unittests/Frontend/CodeGenActionTest.cpp b/flang/unittests/Frontend/CodeGenActionTest.cpp
new file mode 100644
index 0000000000000..9d798c7678ad1
--- /dev/null
+++ b/flang/unittests/Frontend/CodeGenActionTest.cpp
@@ -0,0 +1,109 @@
+//===- unittests/Frontend/CodeGenActionTest.cpp --- FrontendAction tests --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Unit tests for CodeGenAction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/IR/Builders.h"
+#include "flang/Frontend/CompilerInstance.h"
+#include "flang/Frontend/FrontendActions.h"
+#include "flang/Frontend/TextDiagnosticPrinter.h"
+
+#include "gtest/gtest.h"
+
+#include <memory>
+
+using namespace Fortran::frontend;
+
+namespace test {
+class DummyDialect : public ::mlir::Dialect {
+  explicit DummyDialect(::mlir::MLIRContext *context)
+      : ::mlir::Dialect(getDialectNamespace(), context,
+                        ::mlir::TypeID::get<DummyDialect>()) {
+    initialize();
+  }
+
+  void initialize();
+  friend class ::mlir::MLIRContext;
+
+public:
+  ~DummyDialect() override = default;
+  static constexpr ::llvm::StringLiteral getDialectNamespace() {
+    return ::llvm::StringLiteral("dummy");
+  }
+};
+
+namespace dummy {
+class FakeOp : public ::mlir::Op<FakeOp> {
+public:
+  using Op::Op;
+
+  static llvm::StringRef getOperationName() { return "dummy.fake"; }
+
+  static ::llvm::ArrayRef<::llvm::StringRef> getAttributeNames() { return {}; }
+
+  static void build(
+      ::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &odsState) {}
+};
+} // namespace dummy
+} // namespace test
+
+MLIR_DECLARE_EXPLICIT_TYPE_ID(::test::DummyDialect)
+MLIR_DEFINE_EXPLICIT_TYPE_ID(::test::DummyDialect)
+
+namespace test {
+
+void DummyDialect::initialize() { addOperations<::test::dummy::FakeOp>(); }
+} // namespace test
+
+// A test CodeGenAction to verify that we gracefully handle failure to convert
+// from MLIR to LLVM IR.
+class LLVMConversionFailureCodeGenAction : public CodeGenAction {
+public:
+  LLVMConversionFailureCodeGenAction()
+      : CodeGenAction(BackendActionTy::Backend_EmitLL) {
+    mlirCtx = std::make_unique<mlir::MLIRContext>();
+    mlirCtx->loadDialect<test::DummyDialect>();
+
+    mlir::Location loc(mlir::UnknownLoc::get(mlirCtx.get()));
+    mlirModule =
+        std::make_unique<mlir::ModuleOp>(mlir::ModuleOp::create(loc, "mod"));
+
+    mlir::OpBuilder builder(mlirCtx.get());
+    builder.setInsertionPointToStart(&mlirModule->getRegion().front());
+    // Create a fake op to trip conversion to LLVM.
+    builder.create<test::dummy::FakeOp>(loc);
+
+    llvmCtx = std::make_unique<llvm::LLVMContext>();
+  }
+};
+
+TEST(CodeGenAction, GracefullyHandleLLVMConversionFailure) {
+  std::string diagnosticOutput;
+  llvm::raw_string_ostream diagnosticsOS(diagnosticOutput);
+  auto diagPrinter = std::make_unique<TextDiagnosticPrinter>(
+      diagnosticsOS, new clang::DiagnosticOptions());
+
+  CompilerInstance ci;
+  ci.createDiagnostics(diagPrinter.get(), /*ShouldOwnClient=*/false);
+  ci.setInvocation(std::make_shared<CompilerInvocation>());
+  ci.setOutputStream(std::make_unique<llvm::raw_null_ostream>());
+  ci.getInvocation().getCodeGenOpts().OptimizationLevel = 0;
+
+  FrontendInputFile file("/dev/null", InputKind());
+
+  LLVMConversionFailureCodeGenAction action;
+  action.setInstance(&ci);
+  action.setCurrentInput(file);
+
+  consumeError(action.execute());
+  ASSERT_EQ(diagnosticsOS.str(),
+      "error: Lowering to LLVM IR failed\n"
+      "error: failed to create the LLVM module\n");
+}

From c82b7fddfcbd6adfae4faf324a453fb8652efa91 Mon Sep 17 00:00:00 2001
From: Pete Lawrence
Date: Thu, 18 Jan 2024 10:04:26 -1000
Subject: [PATCH 040/843] [lldb] Remove redundant severity substring within a
 diagnostic message. (#76111)

For example, the following message has the severity string "error: " twice.

> "error: <EXPR>:3:1: error: cannot find 'bogus' in scope

This method already appends the severity string in the beginning, but with
this fix, it also removes a secondary instance, if applicable.

Note that this change only removes the *first* redundant substring.
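
As an illustration (hypothetical input, not verbatim tool output), a message
that previously rendered as

  error: error: <EXPR>:3:1: cannot find 'bogus' in scope

now renders with a single severity prefix:

  error: <EXPR>:3:1: cannot find 'bogus' in scope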
I considered putting the removal logic in a loop, but I decided that if
something is generating more than one redundant severity substring, then
that's a problem the message's source should probably fix.

rdar://114203423
---
 lldb/source/Expression/DiagnosticManager.cpp      | 15 ++++++++++++---
 .../TestModulesCompileError.py                    |  2 +-
 2 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/lldb/source/Expression/DiagnosticManager.cpp b/lldb/source/Expression/DiagnosticManager.cpp
index 08977066e3330..9a1100df78db2 100644
--- a/lldb/source/Expression/DiagnosticManager.cpp
+++ b/lldb/source/Expression/DiagnosticManager.cpp
@@ -46,11 +46,20 @@ static const char *StringForSeverity(DiagnosticSeverity severity) {

 std::string DiagnosticManager::GetString(char separator) {
   std::string ret;
+  llvm::raw_string_ostream stream(ret);

   for (const auto &diagnostic : Diagnostics()) {
-    ret.append(StringForSeverity(diagnostic->GetSeverity()));
-    ret.append(std::string(diagnostic->GetMessage()));
-    ret.push_back(separator);
+    llvm::StringRef severity = StringForSeverity(diagnostic->GetSeverity());
+    stream << severity;
+
+    llvm::StringRef message = diagnostic->GetMessage();
+    std::string searchable_message = message.lower();
+    auto severity_pos = message.find(severity);
+    stream << message.take_front(severity_pos);
+
+    if (severity_pos != llvm::StringRef::npos)
+      stream << message.drop_front(severity_pos + severity.size());
+    stream << separator;
   }

   return ret;
diff --git a/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py b/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py
index 36e302be2525b..620b6e44fc852 100644
--- a/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py
+++ b/lldb/test/API/lang/objc/modules-compile-error/TestModulesCompileError.py
@@ -21,7 +21,7 @@ def test(self):
             "expr @import LLDBTestModule",
             error=True,
             substrs=[
-                "module.h:4:1: error: use of undeclared identifier 'syntax_error_for_lldb_to_find'",
+                "module.h:4:1: use of undeclared identifier 'syntax_error_for_lldb_to_find'",
                 "syntax_error_for_lldb_to_find // comment that tests source printing",
                 "could not build module 'LLDBTestModule'",
             ],

From ecd47811b755d13357085bcd7519a66d6c4d8e5c Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Thu, 18 Jan 2024 12:25:32 -0800
Subject: [PATCH 041/843] [sanitizer] Skip /include/c++/ from summary (#78534)

std:: is usually not the cause of the bug. We now display

```
SUMMARY: AddressSanitizer: allocation-size-too-big path/to/allocator_returns_null.cpp:92:7 in main
```

instead of

```
SUMMARY: AddressSanitizer: allocation-size-too-big /usr/lib/../include/c++/13/bits/new_allocator.h:147:27 in std::__new_allocator<char>::allocate(unsigned long, void const*)
```

`/include/c++/` matches both libc++ and libstdc++ include paths.
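
For illustration (paths are hypothetical), a frame whose source path contains
either substring is now treated as internal and skipped when picking the
summary frame:

  #0 ... /usr/lib/../include/c++/13/bits/new_allocator.h:147   <- skipped
  #1 ... path/to/allocator_returns_null.cpp:92                 <- used for SUMMARY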
---
 .../sanitizer_symbolizer_report.cpp               |  3 ++-
 .../TestCases/allocator_returns_null.cpp          | 19 +++++++++++++++++--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp
index 253dc10607a6e..8438e019591b5 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp
@@ -34,7 +34,8 @@ static bool FrameIsInternal(const SymbolizedStack *frame) {
     return true;
   const char *file = frame->info.file;
   const char *module = frame->info.module;
-  if (file && (internal_strstr(file, "/compiler-rt/lib/")))
+  if (file && (internal_strstr(file, "/compiler-rt/lib/") ||
+               internal_strstr(file, "/include/c++/")))
     return true;
   if (module && (internal_strstr(module, "libclang_rt.")))
     return true;
diff --git a/compiler-rt/test/sanitizer_common/TestCases/allocator_returns_null.cpp b/compiler-rt/test/sanitizer_common/TestCases/allocator_returns_null.cpp
index ca6f637b9a3f5..1e4b0ed520b8a 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/allocator_returns_null.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/allocator_returns_null.cpp
@@ -34,17 +34,20 @@
 // RUN:   | FileCheck %s --check-prefix=CHECK-nnCRASH
 // RUN: %env_tool_opts=allocator_may_return_null=1 %run %t new-nothrow 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-NULL
+// RUN: %env_tool_opts=allocator_may_return_null=0 not %run %t vector 2>&1 \
+// RUN:   | FileCheck %s --check-prefix=CHECK-vCRASH

 // TODO(alekseyshl): win32 is disabled due to failing errno tests, fix it there.
 // UNSUPPORTED: ubsan, target={{.*windows-msvc.*}}

 #include
 #include
+#include
+#include
 #include
 #include
 #include
-#include
-#include
+#include

 int main(int argc, char **argv) {
   assert(argc == 2);
@@ -60,6 +63,8 @@ int main(int argc, char **argv) {
       (3UL << 30) + 1;
 #endif

+  std::vector<char> v;
+
   void *x = nullptr;
   if (!strcmp(action, "malloc")) {
     x = malloc(kMaxAllowedMallocSizePlusOne);
@@ -82,6 +87,14 @@ int main(int argc, char **argv) {
     x = operator new(kMaxAllowedMallocSizePlusOne);
   } else if (!strcmp(action, "new-nothrow")) {
     x = operator new(kMaxAllowedMallocSizePlusOne, std::nothrow);
+  } else if (!strcmp(action, "vector")) {
+#if __LP64__ || defined(_WIN64)
+    v.resize(kMaxAllowedMallocSizePlusOne);
+    x = v.data();
+#else
+    // Fake it: 32bit fails early in std.
+    x = malloc(kMaxAllowedMallocSizePlusOne);
+#endif
   } else {
     assert(0);
   }
@@ -117,6 +130,8 @@ int main(int argc, char **argv) {
 // CHECK-nnCRASH: new-nothrow:
 // CHECK-nnCRASH: #{{[0-9]+.*}}allocator_returns_null.cpp
 // CHECK-nnCRASH: {{SUMMARY: .*Sanitizer: allocation-size-too-big.*allocator_returns_null.cpp.*}} in main
+// CHECK-vCRASH: #{{[0-9]+.*}}allocator_returns_null.cpp
+// CHECK-vCRASH: {{SUMMARY: .*Sanitizer: allocation-size-too-big.*allocator_returns_null.cpp.*}} in main
 // CHECK-NULL: {{malloc|calloc|calloc-overflow|realloc|realloc-after-malloc|new-nothrow}}
 // CHECK-NULL: errno: 12, x: 0

From a9ca820529c69674e01d2e90cadc69e361ecf339 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Thu, 18 Jan 2024 14:43:09 -0600
Subject: [PATCH 042/843] [libc] Use clang's scoped atomics if available from
 the compiler (#74769)

Summary:
A recent patch in https://github.com/llvm/llvm-project/pull/72280 provided
`clang` the ability to easily use scoped atomics. These are a special
modifier on atomics that some backends support.
They are intended for providing more fine-grained control over the affected
memory of an atomic action. The default is a "system" scope, e.g. coherence
with the GPU and CPU memory on a heterogeneous system. If we use "device"
scope, that implies that the memory is only ordered with respect to the
current GPU.

These builtins are direct replacements for the GCC atomic builtins in cases
where the backend doesn't do anything with the information, so these should
be a drop-in. This introduces some noise, but hopefully it isn't too
contentious.
---
 libc/src/__support/CPP/atomic.h | 82 +++++++++++++++++++++++++--------
 libc/src/__support/RPC/rpc.h    | 17 ++++---
 2 files changed, 75 insertions(+), 24 deletions(-)

diff --git a/libc/src/__support/CPP/atomic.h b/libc/src/__support/CPP/atomic.h
index 78dc8d2da3c19..1c4478dfeab6d 100644
--- a/libc/src/__support/CPP/atomic.h
+++ b/libc/src/__support/CPP/atomic.h
@@ -26,6 +26,18 @@ enum class MemoryOrder : int {
   SEQ_CST = __ATOMIC_SEQ_CST
 };

+// These are a clang extension, see the clang documentation for more information:
+// https://clang.llvm.org/docs/LanguageExtensions.html#scoped-atomic-builtins.
+enum class MemoryScope : int {
+#if defined(__MEMORY_SCOPE_SYSTEM) && defined(__MEMORY_SCOPE_DEVICE)
+  SYSTEM = __MEMORY_SCOPE_SYSTEM,
+  DEVICE = __MEMORY_SCOPE_DEVICE,
+#else
+  SYSTEM = 0,
+  DEVICE = 0,
+#endif
+};
+
 template <typename T> struct Atomic {
   // For now, we will restrict to only arithmetic types.
   static_assert(is_arithmetic_v<T>, "Only arithmetic types can be atomic.");
@@ -54,48 +66,82 @@ template <typename T> struct Atomic {
   Atomic(const Atomic &) = delete;
   Atomic &operator=(const Atomic &) = delete;

-  // Atomic load
+  // Atomic load.
   operator T() { return __atomic_load_n(&val, int(MemoryOrder::SEQ_CST)); }

-  T load(MemoryOrder mem_ord = MemoryOrder::SEQ_CST) {
-    return __atomic_load_n(&val, int(mem_ord));
+  T load(MemoryOrder mem_ord = MemoryOrder::SEQ_CST,
+         [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) {
+    if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_load_n))
+      return __scoped_atomic_load_n(&val, int(mem_ord), (int)(mem_scope));
+    else
+      return __atomic_load_n(&val, int(mem_ord));
   }

-  // Atomic store
+  // Atomic store.
T operator=(T rhs) { __atomic_store_n(&val, rhs, int(MemoryOrder::SEQ_CST)); return rhs; } - void store(T rhs, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - __atomic_store_n(&val, rhs, int(mem_ord)); + void store(T rhs, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_store_n)) + __scoped_atomic_store_n(&val, rhs, int(mem_ord), (int)(mem_scope)); + else + __atomic_store_n(&val, rhs, int(mem_ord)); } // Atomic compare exchange - bool compare_exchange_strong(T &expected, T desired, - MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { + bool compare_exchange_strong( + T &expected, T desired, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { return __atomic_compare_exchange_n(&val, &expected, desired, false, int(mem_ord), int(mem_ord)); } - T exchange(T desired, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_exchange_n(&val, desired, int(mem_ord)); + T exchange(T desired, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_exchange_n)) + return __scoped_atomic_exchange_n(&val, desired, int(mem_ord), + (int)(mem_scope)); + else + return __atomic_exchange_n(&val, desired, int(mem_ord)); } - T fetch_add(T increment, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_fetch_add(&val, increment, int(mem_ord)); + T fetch_add(T increment, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_add)) + return __scoped_atomic_fetch_add(&val, increment, int(mem_ord), + (int)(mem_scope)); + else + return __atomic_fetch_add(&val, increment, int(mem_ord)); } - T fetch_or(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_fetch_or(&val, mask, int(mem_ord)); + T fetch_or(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_or)) + return __scoped_atomic_fetch_or(&val, mask, int(mem_ord), + (int)(mem_scope)); + else + return __atomic_fetch_or(&val, mask, int(mem_ord)); } - T fetch_and(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_fetch_and(&val, mask, int(mem_ord)); + T fetch_and(T mask, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_and)) + return __scoped_atomic_fetch_and(&val, mask, int(mem_ord), + (int)(mem_scope)); + else + return __atomic_fetch_and(&val, mask, int(mem_ord)); } - T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST) { - return __atomic_fetch_sub(&val, decrement, int(mem_ord)); + T fetch_sub(T decrement, MemoryOrder mem_ord = MemoryOrder::SEQ_CST, + [[maybe_unused]] MemoryScope mem_scope = MemoryScope::DEVICE) { + if constexpr (LIBC_HAS_BUILTIN(__scoped_atomic_fetch_sub)) + return __scoped_atomic_fetch_sub(&val, decrement, int(mem_ord), + (int)(mem_scope)); + else + return __atomic_fetch_sub(&val, decrement, int(mem_ord)); } // Set the value without using an atomic operation. 
This is useful diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h index 08c1dfd10d6d7..7b2c89ac4dce4 100644 --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -109,14 +109,16 @@ template struct Process { /// Retrieve the inbox state from memory shared between processes. LIBC_INLINE uint32_t load_inbox(uint64_t lane_mask, uint32_t index) const { - return gpu::broadcast_value(lane_mask, - inbox[index].load(cpp::MemoryOrder::RELAXED)); + return gpu::broadcast_value( + lane_mask, + inbox[index].load(cpp::MemoryOrder::RELAXED, cpp::MemoryScope::SYSTEM)); } /// Retrieve the outbox state from memory shared between processes. LIBC_INLINE uint32_t load_outbox(uint64_t lane_mask, uint32_t index) const { return gpu::broadcast_value(lane_mask, - outbox[index].load(cpp::MemoryOrder::RELAXED)); + outbox[index].load(cpp::MemoryOrder::RELAXED, + cpp::MemoryScope::SYSTEM)); } /// Signal to the other process that this one is finished with the buffer. @@ -126,7 +128,8 @@ template struct Process { LIBC_INLINE uint32_t invert_outbox(uint32_t index, uint32_t current_outbox) { uint32_t inverted_outbox = !current_outbox; atomic_thread_fence(cpp::MemoryOrder::RELEASE); - outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED); + outbox[index].store(inverted_outbox, cpp::MemoryOrder::RELAXED, + cpp::MemoryScope::SYSTEM); return inverted_outbox; } @@ -241,7 +244,8 @@ template struct Process { uint32_t slot = index / NUM_BITS_IN_WORD; uint32_t bit = index % NUM_BITS_IN_WORD; return bits[slot].fetch_or(static_cast(cond) << bit, - cpp::MemoryOrder::RELAXED) & + cpp::MemoryOrder::RELAXED, + cpp::MemoryScope::DEVICE) & (1u << bit); } @@ -251,7 +255,8 @@ template struct Process { uint32_t slot = index / NUM_BITS_IN_WORD; uint32_t bit = index % NUM_BITS_IN_WORD; return bits[slot].fetch_and(~0u ^ (static_cast(cond) << bit), - cpp::MemoryOrder::RELAXED) & + cpp::MemoryOrder::RELAXED, + cpp::MemoryScope::DEVICE) & (1u << bit); } }; From 96542c018f75b54b35aa3e08f184a4909f8c0c04 Mon Sep 17 00:00:00 2001 From: quic-akaryaki <123192073+quic-akaryaki@users.noreply.github.com> Date: Thu, 18 Jan 2024 15:04:32 -0600 Subject: [PATCH 043/843] [Hexagon] Flip subreg bit for reverse pairs hvx .new (#75873) In .new instructions, the upper vector of a reverse pair (e.g. V4 in V4:5) should be referenced with an odd sss value. --- .../Disassembler/HexagonDisassembler.cpp | 2 ++ .../MCTargetDesc/HexagonMCInstrInfo.cpp | 6 ++++-- llvm/test/MC/Hexagon/hvx-nv-pair-reverse.s | 18 ++++++++++++++++++ 3 files changed, 24 insertions(+), 2 deletions(-) create mode 100644 llvm/test/MC/Hexagon/hvx-nv-pair-reverse.s diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp index c7e22d7d308b0..44a5cd73c6e89 100644 --- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp +++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp @@ -512,6 +512,8 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB, const bool Rev = HexagonMCInstrInfo::IsReverseVecRegPair(Producer); const unsigned ProdPairIndex = Rev ? 
Producer - Hexagon::WR0 : Producer - Hexagon::W0;
+    if (Rev)
+      SubregBit = !SubregBit;
     Producer = (ProdPairIndex << 1) + SubregBit + Hexagon::V0;
   } else if (SubregBit)
     // Hexagon PRM 10.11 New-value operands
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index 9cf004cf4c9a5..a6de2ab9c75a2 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -1036,8 +1036,10 @@ unsigned HexagonMCInstrInfo::SubregisterBit(unsigned Consumer,
                                             unsigned Producer2) {
   // If we're a single vector consumer of a double producer, set subreg bit
   // based on if we're accessing the lower or upper register component
-  if (IsVecRegPair(Producer) && IsVecRegSingle(Consumer))
-    return (Consumer - Hexagon::V0) & 0x1;
+  if (IsVecRegPair(Producer) && IsVecRegSingle(Consumer)) {
+    unsigned Rev = IsReverseVecRegPair(Producer);
+    return ((Consumer - Hexagon::V0) & 0x1) ^ Rev;
+  }
   if (Producer2 != Hexagon::NoRegister)
     return Consumer == Producer;
   return 0;
diff --git a/llvm/test/MC/Hexagon/hvx-nv-pair-reverse.s b/llvm/test/MC/Hexagon/hvx-nv-pair-reverse.s
new file mode 100644
index 0000000000000..ee7ec0143c143
--- /dev/null
+++ b/llvm/test/MC/Hexagon/hvx-nv-pair-reverse.s
@@ -0,0 +1,18 @@
+# RUN: llvm-mc -triple=hexagon -mv69 -mhvx -filetype=obj %s | \
+# RUN:   llvm-objdump --triple=hexagon --mcpu=hexagonv69 --mattr=+hvx -d - | \
+# RUN:   FileCheck %s
+# CHECK: 00000000 <.text>:
+
+{
+  V4:5.w = vadd(V1:0.w, V3:2.w)
+  vmem(r0+#0) = v4.new
+}
+# CHECK-NEXT: 1c6240c5 { v4:5.w = vadd(v1:0.w,v3:2.w)
+# CHECK-NEXT: 2820c023   vmem(r0+#0x0) = v4.new }
+
+{
+  V4:5.w = vadd(V1:0.w, V3:2.w)
+  vmem(r0+#0) = v5.new
+}
+# CHECK-NEXT: 1c6240c5 { v4:5.w = vadd(v1:0.w,v3:2.w)
+# CHECK-NEXT: 2820c022   vmem(r0+#0x0) = v5.new }

From 160a750e63256e58fc171f1b6cccf8b61bb05f42 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Thu, 18 Jan 2024 13:06:30 -0800
Subject: [PATCH 044/843] [clang-repl][test] Suppress memory leak after #76218

`new` was introduced in this patch, but I don't see a matching `delete` to
release the memory.

---
 clang/unittests/Interpreter/InterpreterTest.cpp | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/clang/unittests/Interpreter/InterpreterTest.cpp b/clang/unittests/Interpreter/InterpreterTest.cpp
index 1e0854b3c4af4..d6eb0684ba49d 100644
--- a/clang/unittests/Interpreter/InterpreterTest.cpp
+++ b/clang/unittests/Interpreter/InterpreterTest.cpp
@@ -34,6 +34,12 @@ using namespace clang;
 #define CLANG_INTERPRETER_NO_SUPPORT_EXEC
 #endif

+#if LLVM_ADDRESS_SANITIZER_BUILD || LLVM_HWADDRESS_SANITIZER_BUILD
+#include <sanitizer/lsan_interface.h>
+#else
+extern "C" void __lsan_ignore_object(const void *p) {}
+#endif
+
 int Global = 42;
 // JIT reports symbol not found on Windows without the visibility attribute.
 REPL_EXTERNAL_VISIBILITY int getGlobal() { return Global; }
@@ -311,6 +317,8 @@ TEST(IncrementalProcessing, InstantiateTemplate) {

   auto fn = cantFail(Interp->getSymbolAddress(MangledName)).toPtr();
   EXPECT_EQ(42, fn(NewA.getPtr()));
+  // FIXME: release the memory.
+  __lsan_ignore_object(NewA.getPtr());
 }

 #ifdef CLANG_INTERPRETER_NO_SUPPORT_EXEC

From 741b8363300f0799f7aeabb83910ff8b98f9a919 Mon Sep 17 00:00:00 2001
From: Shoaib Meenai
Date: Thu, 18 Jan 2024 13:31:33 -0800
Subject: [PATCH 045/843] [X86] Fix RTTI proxy emission for 32-bit (#78622)

32-bit x86 doesn't have an appropriate relocation type we can use to
elide the RTTI proxies, so we need to emit them.
This would previously cause crashes when using the relative vtable ABI
for 32-bit x86.
---
 llvm/lib/Target/X86/X86TargetMachine.cpp    |  3 +++
 llvm/lib/Target/X86/X86TargetObjectFile.cpp |  2 +-
 llvm/lib/Target/X86/X86TargetObjectFile.h   | 12 +++++++++---
 llvm/test/MC/ELF/rtti-proxy-i686.ll         | 20 ++++++++++++++++++++
 4 files changed, 33 insertions(+), 4 deletions(-)
 create mode 100644 llvm/test/MC/ELF/rtti-proxy-i686.ll

diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index b92bffbe6239b..9e4cf1ea99682 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -113,6 +113,9 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
   if (TT.isOSBinFormatCOFF())
     return std::make_unique<TargetLoweringObjectFileCOFF>();
+
+  if (TT.getArch() == Triple::x86_64)
+    return std::make_unique<X86_64ELFTargetObjectFile>();
   return std::make_unique<X86ELFTargetObjectFile>();
 }

diff --git a/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/llvm/lib/Target/X86/X86TargetObjectFile.cpp
index 53c692060f08c..e2ddf43e398cb 100644
--- a/llvm/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -57,7 +57,7 @@ const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol(
   return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
 }

-const MCExpr *X86ELFTargetObjectFile::getIndirectSymViaGOTPCRel(
+const MCExpr *X86_64ELFTargetObjectFile::getIndirectSymViaGOTPCRel(
     const GlobalValue *GV, const MCSymbol *Sym, const MCValue &MV,
     int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const {
   int64_t FinalOffset = Offset + MV.getConstant();
diff --git a/llvm/lib/Target/X86/X86TargetObjectFile.h b/llvm/lib/Target/X86/X86TargetObjectFile.h
index ed9390d1fad1a..a0e1762a2cb50 100644
--- a/llvm/lib/Target/X86/X86TargetObjectFile.h
+++ b/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -36,16 +36,22 @@ namespace llvm {
                                           MCStreamer &Streamer) const override;
   };

-  /// This implementation is used for X86 ELF targets that don't
-  /// have a further specialization.
+  /// This implementation is used for X86 ELF targets that don't have a further
+  /// specialization (and as a base class for X86_64, which does).
   class X86ELFTargetObjectFile : public TargetLoweringObjectFileELF {
   public:
     X86ELFTargetObjectFile() {
       PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT;
-      SupportIndirectSymViaGOTPCRel = true;
     }

     /// Describe a TLS variable address within debug info.
     const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
+  };
+
+  /// This implementation is used for X86_64 ELF targets, and defers to
+  /// X86ELFTargetObjectFile for commonalities with 32-bit targets.
+  class X86_64ELFTargetObjectFile : public X86ELFTargetObjectFile {
+  public:
+    X86_64ELFTargetObjectFile() { SupportIndirectSymViaGOTPCRel = true; }

     const MCExpr *
     getIndirectSymViaGOTPCRel(const GlobalValue *GV, const MCSymbol *Sym,
diff --git a/llvm/test/MC/ELF/rtti-proxy-i686.ll b/llvm/test/MC/ELF/rtti-proxy-i686.ll
new file mode 100644
index 0000000000000..afa62ebea4d77
--- /dev/null
+++ b/llvm/test/MC/ELF/rtti-proxy-i686.ll
@@ -0,0 +1,20 @@
+; REQUIRES: x86-registered-target
+
+;; Validate that we produce RTTI proxies for 32-bit x86.
+; RUN: llc %s -mtriple=i686-elf -o - | FileCheck %s
+
+;; Validate that we produce a valid object file.
+; RUN: llc %s -mtriple=i686-elf --filetype=obj -o %t.o +; RUN: llvm-readobj --relocs %t.o | FileCheck --check-prefix=RELOCS %s + +@vtable = dso_local unnamed_addr constant i32 trunc (i64 sub (i64 ptrtoint (ptr @rtti.proxy to i64), i64 ptrtoint (ptr @vtable to i64)) to i32), align 4 +@rtti = external global i8, align 8 +@rtti.proxy = linkonce_odr hidden unnamed_addr constant ptr @rtti + +; CHECK-LABEL: vtable: +; CHECK-NEXT: .long rtti.proxy-vtable + +; CHECK-LABEL: rtti.proxy: +; CHECK-NEXT: .long rtti + +; RELOCS: R_386_32 rtti From 83365152a4b07db245d0741d0f5165c7fb779266 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 18 Jan 2024 21:42:04 +0000 Subject: [PATCH 046/843] [AArch64] Add tests for operations on vectors with 3 elements. --- .../AArch64/vec3-loads-ext-trunc-stores.ll | 310 ++++++++++++++++++ 1 file changed, 310 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll new file mode 100644 index 0000000000000..9eeb194409df6 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -0,0 +1,310 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=arm64-apple-macosx -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64_be -o - %s | FileCheck --check-prefix BE %s + +define <16 x i8> @load_v3i8(ptr %src, ptr %dst) { +; CHECK-LABEL: load_v3i8: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: umov.h w8, v0[0] +; CHECK-NEXT: umov.h w9, v0[1] +; CHECK-NEXT: fmov s0, w8 +; CHECK-NEXT: add x8, x0, #2 +; CHECK-NEXT: mov.b v0[1], w9 +; CHECK-NEXT: ld1.b { v0 }[2], [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; BE-LABEL: load_v3i8: +; BE: // %bb.0: +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ldrh w8, [x0] +; BE-NEXT: strh w8, [sp, #12] +; BE-NEXT: ldr s0, [sp, #12] +; BE-NEXT: rev32 v0.8b, v0.8b +; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: umov w8, v0.h[0] +; BE-NEXT: umov w9, v0.h[1] +; BE-NEXT: fmov s0, w8 +; BE-NEXT: add x8, x0, #2 +; BE-NEXT: mov v0.b[1], w9 +; BE-NEXT: ld1 { v0.b }[2], [x8] +; BE-NEXT: rev64 v0.16b, v0.16b +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret + %l = load <3 x i8>, ptr %src, align 1 + %s = shufflevector <3 x i8> poison, <3 x i8> %l, <16 x i32> + ret <16 x i8> %s +} + +define <4 x i32> @load_v3i8_to_4xi32(ptr %src, ptr %dst) { +; CHECK-LABEL: load_v3i8_to_4xi32: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldrsb w8, [x0, #2] +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: mov.h v0[1], v0[1] +; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: and.16b v0, v0, v1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; BE-LABEL: load_v3i8_to_4xi32: +; BE: // %bb.0: +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ldrh w8, [x0] +; BE-NEXT: movi v1.2d, #0x0000ff000000ff +; BE-NEXT: strh w8, [sp, #12] +; BE-NEXT: ldr s0, [sp, #12] +; BE-NEXT: ldrsb w8, [x0, #2] +; BE-NEXT: rev32 v0.8b, v0.8b +; 
BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: mov v0.h[1], v0.h[1] +; BE-NEXT: mov v0.h[2], w8 +; BE-NEXT: ushll v0.4s, v0.4h, #0 +; BE-NEXT: and v0.16b, v0.16b, v1.16b +; BE-NEXT: rev64 v0.4s, v0.4s +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret + %l = load <3 x i8>, ptr %src, align 1 + %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32> + %e = zext <4 x i8> %s to <4 x i32> + ret <4 x i32> %e +} + +define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src, ptr %dst) { +; CHECK-LABEL: volatile_load_v3i8_to_4xi32: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldrsb w8, [x0, #2] +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: mov.h v0[1], v0[1] +; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: and.16b v0, v0, v1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; BE-LABEL: volatile_load_v3i8_to_4xi32: +; BE: // %bb.0: +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ldrh w8, [x0] +; BE-NEXT: movi v1.2d, #0x0000ff000000ff +; BE-NEXT: strh w8, [sp, #12] +; BE-NEXT: ldr s0, [sp, #12] +; BE-NEXT: ldrsb w8, [x0, #2] +; BE-NEXT: rev32 v0.8b, v0.8b +; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: mov v0.h[1], v0.h[1] +; BE-NEXT: mov v0.h[2], w8 +; BE-NEXT: ushll v0.4s, v0.4h, #0 +; BE-NEXT: and v0.16b, v0.16b, v1.16b +; BE-NEXT: rev64 v0.4s, v0.4s +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret + %l = load volatile <3 x i8>, ptr %src, align 1 + %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32> + %e = zext <4 x i8> %s to <4 x i32> + ret <4 x i32> %e +} + +define <3 x i32> @load_v3i32(ptr %src) { +; CHECK-LABEL: load_v3i32: +; CHECK: ; %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: add x8, x0, #8 +; CHECK-NEXT: ld1.s { v0 }[2], [x8] +; CHECK-NEXT: ret +; +; BE-LABEL: load_v3i32: +; BE: // %bb.0: +; BE-NEXT: ldr d0, [x0] +; BE-NEXT: add x8, x0, #8 +; BE-NEXT: rev64 v0.4s, v0.4s +; BE-NEXT: ld1 { v0.s }[2], [x8] +; BE-NEXT: rev64 v0.4s, v0.4s +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; BE-NEXT: ret + %l = load <3 x i32>, ptr %src, align 1 + ret <3 x i32> %l +} + +define void @store_trunc_from_64bits(ptr %src, ptr %dst) { +; CHECK-LABEL: store_trunc_from_64bits: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: ldrh w8, [x0, #4] +; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: xtn.8b v0, v0 +; CHECK-NEXT: str s0, [sp, #12] +; CHECK-NEXT: ldrh w9, [sp, #12] +; CHECK-NEXT: strb w8, [x1, #2] +; CHECK-NEXT: strh w9, [x1] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; BE-LABEL: store_trunc_from_64bits: +; BE: // %bb.0: // %entry +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ldr s0, [x0] +; BE-NEXT: ldrh w8, [x0, #4] +; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: mov v0.h[2], w8 +; BE-NEXT: xtn v0.8b, v0.8h +; BE-NEXT: rev32 v0.16b, v0.16b +; BE-NEXT: str s0, [sp, #12] +; BE-NEXT: ldrh w9, [sp, #12] +; BE-NEXT: strb w8, [x1, #2] +; BE-NEXT: strh w9, [x1] +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret +entry: + %l = load <3 x i16>, ptr %src, align 1 + %t = trunc <3 x i16> %l to <3 x i8> + store <3 x i8> %t, ptr %dst, align 1 + ret void +} + +define void @load_ext_to_64bits(ptr %src, ptr %dst) { +; CHECK-LABEL: load_ext_to_64bits: +; CHECK: ; 
%bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: add x8, x0, #2 +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: ld1.b { v0 }[4], [x8] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: bic.4h v0, #255, lsl #8 +; CHECK-NEXT: st1.h { v0 }[2], [x8] +; CHECK-NEXT: str s0, [x1] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; BE-LABEL: load_ext_to_64bits: +; BE: // %bb.0: // %entry +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ldrh w8, [x0] +; BE-NEXT: strh w8, [sp, #12] +; BE-NEXT: add x8, x0, #2 +; BE-NEXT: ldr s0, [sp, #12] +; BE-NEXT: rev32 v0.8b, v0.8b +; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: ld1 { v0.b }[4], [x8] +; BE-NEXT: add x8, x1, #4 +; BE-NEXT: bic v0.4h, #255, lsl #8 +; BE-NEXT: rev32 v1.8h, v0.8h +; BE-NEXT: st1 { v0.h }[2], [x8] +; BE-NEXT: str s1, [x1] +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret +entry: + %l = load <3 x i8>, ptr %src, align 1 + %e = zext <3 x i8> %l to <3 x i16> + store <3 x i16> %e, ptr %dst, align 1 + ret void +} + +define void @shift_trunc_store(ptr %src, ptr %dst) { +; CHECK-LABEL: shift_trunc_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: shrn.4h v0, v0, #16 +; CHECK-NEXT: xtn.8b v1, v0 +; CHECK-NEXT: umov.h w8, v0[2] +; CHECK-NEXT: str s1, [sp, #12] +; CHECK-NEXT: ldrh w9, [sp, #12] +; CHECK-NEXT: strb w8, [x1, #2] +; CHECK-NEXT: strh w9, [x1] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; BE-LABEL: shift_trunc_store: +; BE: // %bb.0: +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ld1 { v0.4s }, [x0] +; BE-NEXT: shrn v0.4h, v0.4s, #16 +; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: rev32 v1.16b, v1.16b +; BE-NEXT: str s1, [sp, #12] +; BE-NEXT: ldrh w9, [sp, #12] +; BE-NEXT: strb w8, [x1, #2] +; BE-NEXT: strh w9, [x1] +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret + %l = load <3 x i32>, ptr %src + %s = lshr <3 x i32> %l, + %t = trunc <3 x i32> %s to <3 x i8> + store <3 x i8> %t, ptr %dst, align 1 + ret void +} + +define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { +; CHECK-LABEL: shift_trunc_volatile_store: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: shrn.4h v0, v0, #16 +; CHECK-NEXT: xtn.8b v1, v0 +; CHECK-NEXT: umov.h w8, v0[2] +; CHECK-NEXT: str s1, [sp, #12] +; CHECK-NEXT: ldrh w9, [sp, #12] +; CHECK-NEXT: strb w8, [x1, #2] +; CHECK-NEXT: strh w9, [x1] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; BE-LABEL: shift_trunc_volatile_store: +; BE: // %bb.0: +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ld1 { v0.4s }, [x0] +; BE-NEXT: shrn v0.4h, v0.4s, #16 +; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: rev32 v1.16b, v1.16b +; BE-NEXT: str s1, [sp, #12] +; BE-NEXT: ldrh w9, [sp, #12] +; BE-NEXT: strb w8, [x1, #2] +; BE-NEXT: strh w9, [x1] +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret + %l = load <3 x i32>, ptr %src + %s = lshr <3 x i32> %l, + %t = trunc <3 x i32> %s to <3 x i8> + store volatile <3 x i8> %t, ptr %dst, align 1 + ret void +} From 8f1d94aaea5c18b83cd3b0df3be3a48ef1d3833d Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Thu, 18 Jan 2024 13:49:44 -0800 Subject: [PATCH 047/843] [BOLT] Use continuous output addresses in delta encoding in BAT Make output function 
addresses be delta-encoded wrt the last offset in the previous function.
This reduces the deltas in function start addresses.

Test Plan:
Reduces BAT section size to:
- large binary: 12218860 bytes (0.32x original),
- medium binary: 1606580 bytes (0.27x original),
- small binary: 404 bytes (0.28x original),

Reviewers: rafaelauler

Reviewed By: rafaelauler

Pull Request: https://github.com/llvm/llvm-project/pull/76904
---
 bolt/docs/BAT.md                                 | 12 +++--
 .../bolt/Profile/BoltAddressTranslation.h        |  7 +--
 bolt/lib/Profile/BoltAddressTranslation.cpp      | 44 ++++++++++---------
 3 files changed, 36 insertions(+), 27 deletions(-)

diff --git a/bolt/docs/BAT.md b/bolt/docs/BAT.md
index 96a9a187e0acf..0a2c878ef4ae9 100644
--- a/bolt/docs/BAT.md
+++ b/bolt/docs/BAT.md
@@ -69,11 +69,15 @@ Header:
 The header is followed by Functions table with `NumFuncs` entries.
 Output binary addresses are delta encoded, meaning that only the difference with
-the previous output address is stored. Addresses implicitly start at zero.
+the last previous output address is stored. Addresses implicitly start at zero.
+Output addresses are continuous through function start addresses and function
+internal offsets, and between hot and cold fragments, to better spread deltas
+and save space.
+
 Hot indices are delta encoded, implicitly starting at zero.
 | Entry  | Encoding | Description |
 | ------ | ------| ----------- |
-| `Address` | Delta, ULEB128 | Function address in the output binary |
+| `Address` | Continuous, Delta, ULEB128 | Function address in the output binary |
 | `HotIndex` | Delta, ULEB128 | Cold functions only: index of corresponding hot function in hot functions table |
 | `NumEntries` | ULEB128 | Number of address translation entries for a function |

@@ -82,10 +86,10 @@ function.
 ### Address translation table
 Delta encoding means that only the difference with the previous corresponding
-entry is encoded. Offsets implicitly start at zero.
+entry is encoded. Input offsets implicitly start at zero.
 | Entry  | Encoding | Description |
 | ------ | ------| ----------- |
-| `OutputOffset` | Delta, ULEB128 | Function offset in output binary |
+| `OutputOffset` | Continuous, Delta, ULEB128 | Function offset in output binary |
 | `InputOffset` | Delta, SLEB128 | Function offset in input binary with `BRANCHENTRY` LSB bit |

 `BRANCHENTRY` bit denotes whether a given offset pair is a control flow source
diff --git a/bolt/include/bolt/Profile/BoltAddressTranslation.h b/bolt/include/bolt/Profile/BoltAddressTranslation.h
index 01d3be4ee59be..f6bd61bc88987 100644
--- a/bolt/include/bolt/Profile/BoltAddressTranslation.h
+++ b/bolt/include/bolt/Profile/BoltAddressTranslation.h
@@ -121,13 +121,14 @@ class BoltAddressTranslation {

   /// Write the serialized address translation table for a function.
   template <bool Cold>
-  void writeMaps(std::map<uint64_t, MapTy> &Maps, raw_ostream &OS);
+  void writeMaps(std::map<uint64_t, MapTy> &Maps, uint64_t &PrevAddress,
+                 raw_ostream &OS);

   /// Read the serialized address translation table for a function.
   /// Return a parse error if failed.
  template <bool Cold>
-  void parseMaps(std::vector<uint64_t> &HotFuncs, DataExtractor &DE,
-                 uint64_t &Offset, Error &Err);
+  void parseMaps(std::vector<uint64_t> &HotFuncs, uint64_t &PrevAddress,
+                 DataExtractor &DE, uint64_t &Offset, Error &Err);

   std::map<uint64_t, MapTy> Maps;

diff --git a/bolt/lib/Profile/BoltAddressTranslation.cpp b/bolt/lib/Profile/BoltAddressTranslation.cpp
index 697ff1e5dd0da..d3c33d6e6bc79 100644
--- a/bolt/lib/Profile/BoltAddressTranslation.cpp
+++ b/bolt/lib/Profile/BoltAddressTranslation.cpp
@@ -102,15 +102,17 @@ void BoltAddressTranslation::write(const BinaryContext &BC, raw_ostream &OS) {
     }
   }

-  writeMaps<false>(Maps, OS);
-  writeMaps<true>(Maps, OS);
+  // Output addresses are delta-encoded
+  uint64_t PrevAddress = 0;
+  writeMaps<false>(Maps, PrevAddress, OS);
+  writeMaps<true>(Maps, PrevAddress, OS);

   outs() << "BOLT-INFO: Wrote " << Maps.size() << " BAT maps\n";
 }

 template <bool Cold>
 void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
-                                       raw_ostream &OS) {
+                                       uint64_t &PrevAddress, raw_ostream &OS) {
   const uint32_t NumFuncs =
       llvm::count_if(llvm::make_first_range(Maps), [&](const uint64_t Address) {
         return Cold == ColdPartSource.count(Address);
@@ -119,8 +121,6 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
   LLVM_DEBUG(dbgs() << "Writing " << NumFuncs << (Cold ? " cold" : "")
                     << " functions for BAT.\n");
   size_t PrevIndex = 0;
-  // Output addresses are delta-encoded
-  uint64_t PrevAddress = 0;
   for (auto &MapEntry : Maps) {
     const uint64_t Address = MapEntry.first;
     // Only process cold fragments in cold mode, and vice versa.
@@ -139,12 +139,14 @@ void BoltAddressTranslation::writeMaps(std::map<uint64_t, MapTy> &Maps,
       PrevIndex = HotIndex;
     }
     encodeULEB128(NumEntries, OS);
-    uint64_t InOffset = 0, OutOffset = 0;
+    uint64_t InOffset = 0;
     // Output and Input addresses and delta-encoded
     for (std::pair<const uint32_t, uint32_t> &KeyVal : Map) {
-      encodeULEB128(KeyVal.first - OutOffset, OS);
+      const uint64_t OutputAddress = KeyVal.first + Address;
+      encodeULEB128(OutputAddress - PrevAddress, OS);
+      PrevAddress = OutputAddress;
       encodeSLEB128(KeyVal.second - InOffset, OS);
-      std::tie(OutOffset, InOffset) = KeyVal;
+      InOffset = KeyVal.second;
     }
   }
 }
@@ -170,21 +172,21 @@ std::error_code BoltAddressTranslation::parse(StringRef Buf) {

   Error Err(Error::success());
   std::vector<uint64_t> HotFuncs;
-  parseMaps<false>(HotFuncs, DE, Offset, Err);
-  parseMaps<true>(HotFuncs, DE, Offset, Err);
+  uint64_t PrevAddress = 0;
+  parseMaps<false>(HotFuncs, PrevAddress, DE, Offset, Err);
+  parseMaps<true>(HotFuncs, PrevAddress, DE, Offset, Err);
   outs() << "BOLT-INFO: Parsed " << Maps.size() << " BAT entries\n";
   return errorToErrorCode(std::move(Err));
 }

 template <bool Cold>
 void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,
-                                       DataExtractor &DE, uint64_t &Offset,
-                                       Error &Err) {
+                                       uint64_t &PrevAddress, DataExtractor &DE,
+                                       uint64_t &Offset, Error &Err) {
   const uint32_t NumFunctions = DE.getULEB128(&Offset, &Err);
   LLVM_DEBUG(dbgs() << "Parsing " << NumFunctions << (Cold ? " cold" : "")
                     << " functions\n");
   size_t HotIndex = 0;
-  uint64_t PrevAddress = 0;
   for (uint32_t I = 0; I < NumFunctions; ++I) {
     const uint64_t Address = PrevAddress + DE.getULEB128(&Offset, &Err);
    PrevAddress = Address;
@@ -199,18 +201,20 @@ void BoltAddressTranslation::parseMaps(std::vector<uint64_t> &HotFuncs,

     LLVM_DEBUG(dbgs() << "Parsing " << NumEntries << " entries for 0x"
                       << Twine::utohexstr(Address) << "\n");
-    uint64_t InputOffset = 0, OutputOffset = 0;
+    uint64_t InputOffset = 0;
     for (uint32_t J = 0; J < NumEntries; ++J) {
       const uint64_t OutputDelta = DE.getULEB128(&Offset, &Err);
+      const uint64_t OutputAddress = PrevAddress + OutputDelta;
+      const uint64_t OutputOffset = OutputAddress - Address;
+      PrevAddress = OutputAddress;
       const int64_t InputDelta = DE.getSLEB128(&Offset, &Err);
-      OutputOffset += OutputDelta;
       InputOffset += InputDelta;
       Map.insert(std::pair<uint32_t, uint32_t>(OutputOffset, InputOffset));
-      LLVM_DEBUG(dbgs() << formatv("{0:x} -> {1:x} ({2}/{3}b -> {4}/{5}b)\n",
-                                   OutputOffset, InputOffset, OutputDelta,
-                                   encodeULEB128(OutputDelta, nulls()),
-                                   InputDelta,
-                                   encodeSLEB128(InputDelta, nulls())));
+      LLVM_DEBUG(
+          dbgs() << formatv("{0:x} -> {1:x} ({2}/{3}b -> {4}/{5}b), {6:x}\n",
+                            OutputOffset, InputOffset, OutputDelta,
+                            encodeULEB128(OutputDelta, nulls()), InputDelta,
+                            encodeSLEB128(InputDelta, nulls()), OutputAddress));
     }
     Maps.insert(std::pair<uint64_t, MapTy>(Address, Map));
   }

From da4b8ab7fd8e43c3456ed9881a4eb4ad9da320fa Mon Sep 17 00:00:00 2001
From: Alex Langford
Date: Thu, 18 Jan 2024 14:26:45 -0800
Subject: [PATCH 048/843] [lldb] Stop creating BreakpointEventData raw pointers
 (#78508)

The lifetime of these BreakpointEventData objects is difficult to reason
about. These BreakpointEventData pointers are created and passed along to
`Event` which takes the raw pointer and sticks it in a shared pointer.
Instead of manually managing the lifetime and memory, it would be simpler
to have them be shared pointers from the start.

---
 lldb/include/lldb/Breakpoint/Breakpoint.h |  2 +-
 lldb/source/Breakpoint/Breakpoint.cpp     | 53 ++++++++++-------------
 2 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/lldb/include/lldb/Breakpoint/Breakpoint.h b/lldb/include/lldb/Breakpoint/Breakpoint.h
index 3a8b29aee544c..8c4308ab0bc12 100644
--- a/lldb/include/lldb/Breakpoint/Breakpoint.h
+++ b/lldb/include/lldb/Breakpoint/Breakpoint.h
@@ -672,7 +672,7 @@ class Breakpoint : public std::enable_shared_from_this<Breakpoint>,

   void SendBreakpointChangedEvent(lldb::BreakpointEventType eventKind);

-  void SendBreakpointChangedEvent(BreakpointEventData *data);
+  void SendBreakpointChangedEvent(const lldb::EventDataSP &breakpoint_data_sp);

   Breakpoint(const Breakpoint &) = delete;
   const Breakpoint &operator=(const Breakpoint &) = delete;
diff --git a/lldb/source/Breakpoint/Breakpoint.cpp b/lldb/source/Breakpoint/Breakpoint.cpp
index 6e6b51b562496..af5dcc9cd88d4 100644
--- a/lldb/source/Breakpoint/Breakpoint.cpp
+++ b/lldb/source/Breakpoint/Breakpoint.cpp
@@ -460,17 +460,13 @@ void Breakpoint::ResolveBreakpointInModules(ModuleList &module_list,
   // If this is not an internal breakpoint, set up to record the new
   // locations, then dispatch an event with the new locations.
   if (!IsInternal() && send_event) {
-    BreakpointEventData *new_locations_event = new BreakpointEventData(
-        eBreakpointEventTypeLocationsAdded, shared_from_this());
-
+    std::shared_ptr<BreakpointEventData> new_locations_event =
+        std::make_shared<BreakpointEventData>(
+            eBreakpointEventTypeLocationsAdded, shared_from_this());
     ResolveBreakpointInModules(
         module_list, new_locations_event->GetBreakpointLocationCollection());
-
-    if (new_locations_event->GetBreakpointLocationCollection().GetSize() !=
-        0) {
+    if (new_locations_event->GetBreakpointLocationCollection().GetSize() != 0)
       SendBreakpointChangedEvent(new_locations_event);
-    } else
-      delete new_locations_event;
   } else {
     ElapsedTime elapsed(m_resolve_time);
     m_resolver_sp->ResolveBreakpointInModules(*m_filter_sp, module_list);
@@ -565,12 +561,10 @@ void Breakpoint::ModulesChanged(ModuleList &module_list, bool load,
   // the module list, then remove their breakpoint sites, and their locations
   // if asked to.

-  BreakpointEventData *removed_locations_event;
+  std::shared_ptr<BreakpointEventData> removed_locations_event;
   if (!IsInternal())
-    removed_locations_event = new BreakpointEventData(
+    removed_locations_event = std::make_shared<BreakpointEventData>(
         eBreakpointEventTypeLocationsRemoved, shared_from_this());
-  else
-    removed_locations_event = nullptr;

   for (ModuleSP module_sp : module_list.Modules()) {
     if (m_filter_sp->ModulePasses(module_sp)) {
@@ -795,31 +789,30 @@ void Breakpoint::ModuleReplaced(ModuleSP old_module_sp,
   // about telling the world about removing a location we didn't tell them
   // about adding.

-  BreakpointEventData *locations_event;
+  std::shared_ptr<BreakpointEventData> removed_locations_event;
   if (!IsInternal())
-    locations_event = new BreakpointEventData(
+    removed_locations_event = std::make_shared<BreakpointEventData>(
         eBreakpointEventTypeLocationsRemoved, shared_from_this());
-  else
-    locations_event = nullptr;

   for (BreakpointLocationSP loc_sp :
        locations_to_remove.BreakpointLocations()) {
     m_locations.RemoveLocation(loc_sp);
-    if (locations_event)
-      locations_event->GetBreakpointLocationCollection().Add(loc_sp);
+    if (removed_locations_event)
+      removed_locations_event->GetBreakpointLocationCollection().Add(loc_sp);
   }
-  SendBreakpointChangedEvent(locations_event);
+  SendBreakpointChangedEvent(removed_locations_event);

   // And announce the new ones.
   if (!IsInternal()) {
-    locations_event = new BreakpointEventData(
-        eBreakpointEventTypeLocationsAdded, shared_from_this());
+    std::shared_ptr<BreakpointEventData> added_locations_event =
+        std::make_shared<BreakpointEventData>(
+            eBreakpointEventTypeLocationsAdded, shared_from_this());
     for (BreakpointLocationSP loc_sp :
          locations_to_announce.BreakpointLocations())
-      locations_event->GetBreakpointLocationCollection().Add(loc_sp);
+      added_locations_event->GetBreakpointLocationCollection().Add(loc_sp);

-    SendBreakpointChangedEvent(locations_event);
+    SendBreakpointChangedEvent(added_locations_event);
   }
   m_locations.Compact();
 }
@@ -989,22 +982,22 @@ void Breakpoint::SendBreakpointChangedEvent(
   if (!m_being_created && !IsInternal() &&
       GetTarget().EventTypeHasListeners(
          Target::eBroadcastBitBreakpointChanged)) {
-    BreakpointEventData *data =
-        new Breakpoint::BreakpointEventData(eventKind, shared_from_this());
+    std::shared_ptr<BreakpointEventData> data =
+        std::make_shared<BreakpointEventData>(eventKind, shared_from_this());

     GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, data);
   }
 }

-void Breakpoint::SendBreakpointChangedEvent(BreakpointEventData *data) {
-  if (data == nullptr)
+void Breakpoint::SendBreakpointChangedEvent(
+    const lldb::EventDataSP &breakpoint_data_sp) {
+  if (!breakpoint_data_sp)
     return;

   if (!m_being_created && !IsInternal() &&
       GetTarget().EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged))
-    GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, data);
-  else
-    delete data;
+    GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged,
+                               breakpoint_data_sp);
 }

 const char *Breakpoint::BreakpointEventTypeAsCString(BreakpointEventType type) {

From f2684959145faaddc3717ea3d3fc4dbfb5c1a061 Mon Sep 17 00:00:00 2001
From: Sam Clegg
Date: Thu, 18 Jan 2024 14:39:38 -0800
Subject: [PATCH 049/843] [lld][WebAssembly] Rename fetch() to extract() to
 match ELF linker.
 NFC (#78625)

---
 lld/wasm/Driver.cpp      |  6 +++---
 lld/wasm/SymbolTable.cpp | 10 +++++-----
 lld/wasm/Symbols.cpp     |  4 +++-
 lld/wasm/Symbols.h       |  2 +-
 4 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index a354260c60525..60c6e5c6d5d31 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -705,7 +705,7 @@ static Symbol *handleUndefined(StringRef name, const char *option) {
   sym->isUsedInRegularObj = true;

   if (auto *lazySym = dyn_cast<LazySymbol>(sym)) {
-    lazySym->fetch();
+    lazySym->extract();
     if (!config->whyExtract.empty())
       config->whyExtractRecords.emplace_back(option, sym->getFile(), *sym);
   }
@@ -724,7 +724,7 @@ static void handleLibcall(StringRef name) {
       MemoryBufferRef mb = lazySym->getMemberBuffer();
       if (isBitcode(mb)) {
         if (!config->whyExtract.empty())
          config->whyExtractRecords.emplace_back("<libcall>", sym->getFile(),
                                                  *sym);
-        lazySym->fetch();
+        lazySym->extract();
       }
     }
   }
@@ -965,7 +965,7 @@ static void processStubLibraries() {
             needed->forceExport = true;
             if (auto *lazy = dyn_cast<LazySymbol>(needed)) {
               depsAdded = true;
-              lazy->fetch();
+              lazy->extract();
               if (!config->whyExtract.empty())
                 config->whyExtractRecords.emplace_back(stub_file->getName(),
                                                        sym->getFile(), *sym);
diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp
index b5c138cd76392..1563e66729722 100644
--- a/lld/wasm/SymbolTable.cpp
+++ b/lld/wasm/SymbolTable.cpp
@@ -533,7 +533,7 @@ Symbol *SymbolTable::addUndefinedFunction(StringRef name,
       lazy->setWeak();
       lazy->signature = sig;
     } else {
-      lazy->fetch();
+      lazy->extract();
       if (!config->whyExtract.empty())
        config->whyExtractRecords.emplace_back(toString(file), s->getFile(),
                                                *s);
@@ -586,7 +586,7 @@ Symbol *SymbolTable::addUndefinedData(StringRef name, uint32_t flags,
     if ((flags & WASM_SYMBOL_BINDING_MASK) == WASM_SYMBOL_BINDING_WEAK)
       lazy->setWeak();
     else
-      lazy->fetch();
+      lazy->extract();
   } else if (s->isDefined()) {
     checkDataType(s, file);
   } else if (s->isWeak()) {
@@ -613,7 +613,7 @@ Symbol *SymbolTable::addUndefinedGlobal(StringRef name,
     replaceSymbol<UndefinedGlobal>(s, name, importName, importModule, flags,
                                    file, type);
   else if (auto *lazy = dyn_cast<LazySymbol>(s))
-    lazy->fetch();
+    lazy->extract();
   else if (s->isDefined())
     checkGlobalType(s, file, type);
   else if (s->isWeak())
@@ -639,7 +639,7 @@ Symbol *SymbolTable::addUndefinedTable(StringRef name,
     replaceSymbol<UndefinedTable>(s, name, importName, importModule, flags,
                                   file, type);
   else if (auto *lazy = dyn_cast<LazySymbol>(s))
-    lazy->fetch();
+    lazy->extract();
   else if (s->isDefined())
     checkTableType(s, file, type);
   else if (s->isWeak())
@@ -665,7 +665,7 @@ Symbol *SymbolTable::addUndefinedTag(StringRef name,
     replaceSymbol<UndefinedTag>(s, name, importName, importModule, flags,
                                 file, sig);
   else if (auto *lazy = dyn_cast<LazySymbol>(s))
-    lazy->fetch();
+    lazy->extract();
   else if (s->isDefined())
     checkTagType(s, file, sig);
   else if (s->isWeak())
diff --git a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp
index 2adf72b6965ca..47d8d09ab1bd4 100644
--- a/lld/wasm/Symbols.cpp
+++ b/lld/wasm/Symbols.cpp
@@ -425,7 +425,9 @@ const OutputSectionSymbol *SectionSymbol::getOutputSectionSymbol() const {
   return section->outputSec->sectionSym;
 }

-void LazySymbol::fetch() { cast<ArchiveFile>(file)->addMember(&archiveSymbol); }
+void LazySymbol::extract() {
+  cast<ArchiveFile>(file)->addMember(&archiveSymbol);
+}

 void LazySymbol::setWeak() {
   flags |= (flags & ~WASM_SYMBOL_BINDING_MASK) | WASM_SYMBOL_BINDING_WEAK;
diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h
index 34fff4b962bdc..69ebfdb5bb356 100644
--- a/lld/wasm/Symbols.h
+++ b/lld/wasm/Symbols.h
@@ -502,7 +502,7 @@ class LazySymbol : public Symbol {
       : Symbol(name, LazyKind, flags, file), archiveSymbol(sym) {}

   static bool classof(const Symbol *s) { return s->kind() == LazyKind; }

-  void fetch();
+  void extract();
   void setWeak();
   MemoryBufferRef getMemberBuffer();

From 184c22dd3aa3513244401fcced9a447c2577e2d3 Mon Sep 17 00:00:00 2001
From: Sam Clegg
Date: Thu, 18 Jan 2024 15:01:21 -0800
Subject: [PATCH 050/843] [lld][WebAssembly] Move linker global state into
 context object. NFC (#78629)

See lld/ELF/Config.h

---
 lld/wasm/Config.h              | 22 ++++++++++----------
 lld/wasm/Driver.cpp            | 28 ++++++++++++-------------
 lld/wasm/InputChunks.cpp       |  6 +++---
 lld/wasm/InputFiles.cpp        |  2 +-
 lld/wasm/LTO.cpp               |  2 +-
 lld/wasm/MarkLive.cpp          |  2 +-
 lld/wasm/OutputSections.cpp    |  6 +++---
 lld/wasm/OutputSegment.h       |  2 +-
 lld/wasm/Relocations.cpp       |  4 ++--
 lld/wasm/SymbolTable.cpp       |  5 ++---
 lld/wasm/SyntheticSections.cpp | 12 +++++------
 lld/wasm/Writer.cpp            | 38 +++++++++++++++++-----------------
 12 files changed, 64 insertions(+), 65 deletions(-)

diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h
index 104d470484014..cc8a1dcb69afe 100644
--- a/lld/wasm/Config.h
+++ b/lld/wasm/Config.h
@@ -71,6 +71,11 @@ struct Configuration {
   uint64_t initialHeap;
   uint64_t initialMemory;
   uint64_t maxMemory;
+  // The table offset at which to place function addresses. We reserve zero
+  // for the null function pointer. This gets set to 1 for executables and 0
+  // for shared libraries (since they always added to a dynamic offset at
+  // runtime).
+  uint64_t tableBase;
   uint64_t zStackSize;
   unsigned ltoPartitions;
   unsigned ltoo;
@@ -96,11 +101,13 @@ struct Configuration {
   std::optional<std::vector<std::string>> features;
   std::optional<std::vector<std::string>> extraFeatures;
   llvm::SmallVector<uint8_t> buildIdVector;
+};

-  // The following config options do not directly correspond to any
-  // particular command line options, and should probably be moved to separate
-  // Ctx struct as in ELF/Config.h
+// The only instance of Configuration struct.
+extern Configuration *config;

+// The Ctx object holds all other (non-configuration) global state.
+struct Ctx {
   // True if we are creating position-independent code.
   bool isPic;

@@ -108,12 +115,6 @@
   // requires it to be allocated to table number 0.
   bool legacyFunctionTable = false;

-  // The table offset at which to place function addresses. We reserve zero
-  // for the null function pointer. This gets set to 1 for executables and 0
-  // for shared libraries (since they always added to a dynamic offset at
-  // runtime).
-  uint32_t tableBase = 0;
-
   // Will be set to true if bss data segments should be emitted. In most cases
   // this is not necessary.
   bool emitBssSegments = false;
@@ -124,8 +125,7 @@
       whyExtractRecords;
 };

-// The only instance of Configuration struct.
-extern Configuration *config;
+extern Ctx ctx;

 } // namespace lld::wasm

diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp
index 60c6e5c6d5d31..05a96860966ea 100644
--- a/lld/wasm/Driver.cpp
+++ b/lld/wasm/Driver.cpp
@@ -45,6 +45,7 @@ using namespace llvm::wasm;
 namespace lld::wasm {
 Configuration *config;
+Ctx ctx;

 namespace {

@@ -308,7 +309,7 @@ static std::optional<std::string> searchLibraryBaseName(StringRef name) {
   for (StringRef dir : config->searchPaths) {
     // Currently we don't enable dynamic linking at all unless -shared or -pie
     // are used, so don't even look for .so files in that case..
- if (config->isPic && !config->isStatic) + if (ctx.isPic && !config->isStatic) if (std::optional s = findFile(dir, "lib" + name + ".so")) return s; if (std::optional s = findFile(dir, "lib" + name + ".a")) @@ -571,9 +572,9 @@ static void readConfigs(opt::InputArgList &args) { // This function initialize such members. See Config.h for the details // of these values. static void setConfigs() { - config->isPic = config->pie || config->shared; + ctx.isPic = config->pie || config->shared; - if (config->isPic) { + if (ctx.isPic) { if (config->exportTable) error("-shared/-pie is incompatible with --export-table"); config->importTable = true; @@ -680,7 +681,7 @@ static void checkOptions(opt::InputArgList &args) { warn("-Bsymbolic is only meaningful when combined with -shared"); } - if (config->isPic) { + if (ctx.isPic) { if (config->globalBase) error("--global-base may not be used with -shared/-pie"); if (config->tableBase) @@ -707,7 +708,7 @@ static Symbol *handleUndefined(StringRef name, const char *option) { if (auto *lazySym = dyn_cast(sym)) { lazySym->extract(); if (!config->whyExtract.empty()) - config->whyExtractRecords.emplace_back(option, sym->getFile(), *sym); + ctx.whyExtractRecords.emplace_back(option, sym->getFile(), *sym); } return sym; @@ -722,8 +723,7 @@ static void handleLibcall(StringRef name) { MemoryBufferRef mb = lazySym->getMemberBuffer(); if (isBitcode(mb)) { if (!config->whyExtract.empty()) - config->whyExtractRecords.emplace_back("", sym->getFile(), - *sym); + ctx.whyExtractRecords.emplace_back("", sym->getFile(), *sym); lazySym->extract(); } } @@ -742,7 +742,7 @@ static void writeWhyExtract() { } os << "reference\textracted\tsymbol\n"; - for (auto &entry : config->whyExtractRecords) { + for (auto &entry : ctx.whyExtractRecords) { os << std::get<0>(entry) << '\t' << toString(std::get<1>(entry)) << '\t' << toString(std::get<2>(entry)) << '\n'; } @@ -811,7 +811,7 @@ static void createSyntheticSymbols() { bool is64 = config->is64.value_or(false); - if (config->isPic) { + if (ctx.isPic) { WasmSym::stackPointer = createUndefinedGlobal("__stack_pointer", config->is64.value_or(false) ? &mutableGlobalTypeI64 @@ -850,7 +850,7 @@ static void createSyntheticSymbols() { "__wasm_init_tls")); } - if (config->isPic || + if (ctx.isPic || config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic) { // For PIC code, or when dynamically importing addresses, we create // synthetic functions that apply relocations. These get called from @@ -871,7 +871,7 @@ static void createOptionalSymbols() { if (!config->shared) WasmSym::dataEnd = symtab->addOptionalDataSymbol("__data_end"); - if (!config->isPic) { + if (!ctx.isPic) { WasmSym::stackLow = symtab->addOptionalDataSymbol("__stack_low"); WasmSym::stackHigh = symtab->addOptionalDataSymbol("__stack_high"); WasmSym::globalBase = symtab->addOptionalDataSymbol("__global_base"); @@ -967,8 +967,8 @@ static void processStubLibraries() { depsAdded = true; lazy->extract(); if (!config->whyExtract.empty()) - config->whyExtractRecords.emplace_back(stub_file->getName(), - sym->getFile(), *sym); + ctx.whyExtractRecords.emplace_back(stub_file->getName(), + sym->getFile(), *sym); } } } @@ -1305,7 +1305,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { sym->forceExport = true; } - if (!config->relocatable && !config->isPic) { + if (!config->relocatable && !ctx.isPic) { // Add synthetic dummies for weak undefined functions. Must happen // after LTO otherwise functions may not yet have signatures. 
symtab->handleWeakUndefines(); diff --git a/lld/wasm/InputChunks.cpp b/lld/wasm/InputChunks.cpp index f55f54a9509c7..2074dd59c1dde 100644 --- a/lld/wasm/InputChunks.cpp +++ b/lld/wasm/InputChunks.cpp @@ -378,7 +378,7 @@ void InputChunk::generateRelocationCode(raw_ostream &os) const { uint64_t offset = getVA(rel.Offset) - getInputSectionOffset(); Symbol *sym = file->getSymbol(rel); - if (!config->isPic && sym->isDefined()) + if (!ctx.isPic && sym->isDefined()) continue; LLVM_DEBUG(dbgs() << "gen reloc: type=" << relocTypeToString(rel.Type) @@ -390,7 +390,7 @@ void InputChunk::generateRelocationCode(raw_ostream &os) const { writeSleb128(os, offset, "offset"); // In PIC mode we need to add the __memory_base - if (config->isPic) { + if (ctx.isPic) { writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); if (isTLS()) writeUleb128(os, WasmSym::tlsBase->getGlobalIndex(), "tls_base"); @@ -417,7 +417,7 @@ void InputChunk::generateRelocationCode(raw_ostream &os) const { writeU8(os, opcode_reloc_add, "ADD"); } } else { - assert(config->isPic); + assert(ctx.isPic); const GlobalSymbol* baseSymbol = WasmSym::memoryBase; if (rel.Type == R_WASM_TABLE_INDEX_I32 || rel.Type == R_WASM_TABLE_INDEX_I64) diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp index 97c88587231ba..19c76e4902789 100644 --- a/lld/wasm/InputFiles.cpp +++ b/lld/wasm/InputFiles.cpp @@ -351,7 +351,7 @@ void ObjFile::addLegacyIndirectFunctionTableIfNeeded( // We assume that this compilation unit has unrelocatable references to // this table. - config->legacyFunctionTable = true; + ctx.legacyFunctionTable = true; } static bool shouldMerge(const WasmSection &sec) { diff --git a/lld/wasm/LTO.cpp b/lld/wasm/LTO.cpp index 4b22e4ccc1dc3..e523f0f617153 100644 --- a/lld/wasm/LTO.cpp +++ b/lld/wasm/LTO.cpp @@ -55,7 +55,7 @@ static std::unique_ptr createLTO() { if (config->relocatable) c.RelocModel = std::nullopt; - else if (config->isPic) + else if (ctx.isPic) c.RelocModel = Reloc::PIC_; else c.RelocModel = Reloc::Static; diff --git a/lld/wasm/MarkLive.cpp b/lld/wasm/MarkLive.cpp index a59a80ad2cc3a..773af2f7e2360 100644 --- a/lld/wasm/MarkLive.cpp +++ b/lld/wasm/MarkLive.cpp @@ -187,7 +187,7 @@ bool MarkLive::isCallCtorsLive() { // In Emscripten-style PIC, we call `__wasm_call_ctors` which calls // `__wasm_apply_data_relocs`. 
- if (config->isPic) + if (ctx.isPic) return true; // If there are any init functions, mark `__wasm_call_ctors` live so that diff --git a/lld/wasm/OutputSections.cpp b/lld/wasm/OutputSections.cpp index 9c4383c6144bc..3974146ab7270 100644 --- a/lld/wasm/OutputSections.cpp +++ b/lld/wasm/OutputSections.cpp @@ -107,7 +107,7 @@ void DataSection::finalizeContents() { }); #endif - assert((config->sharedMemory || !config->isPic || config->extendedConst || + assert((config->sharedMemory || !ctx.isPic || config->extendedConst || activeCount <= 1) && "output segments should have been combined by now"); @@ -124,7 +124,7 @@ void DataSection::finalizeContents() { if (segment->initFlags & WASM_DATA_SEGMENT_HAS_MEMINDEX) writeUleb128(os, 0, "memory index"); if ((segment->initFlags & WASM_DATA_SEGMENT_IS_PASSIVE) == 0) { - if (config->isPic && config->extendedConst) { + if (ctx.isPic && config->extendedConst) { writeU8(os, WASM_OPCODE_GLOBAL_GET, "global get"); writeUleb128(os, WasmSym::memoryBase->getGlobalIndex(), "literal (global index)"); @@ -136,7 +136,7 @@ void DataSection::finalizeContents() { } else { WasmInitExpr initExpr; initExpr.Extended = false; - if (config->isPic) { + if (ctx.isPic) { assert(segment->startVA == 0); initExpr.Inst.Opcode = WASM_OPCODE_GLOBAL_GET; initExpr.Inst.Value.Global = WasmSym::memoryBase->getGlobalIndex(); diff --git a/lld/wasm/OutputSegment.h b/lld/wasm/OutputSegment.h index e1e43c1f9a36a..701df1fecc4b8 100644 --- a/lld/wasm/OutputSegment.h +++ b/lld/wasm/OutputSegment.h @@ -27,7 +27,7 @@ class OutputSegment { // to the output binary. However if the memory is imported, and // we can't use memory.fill during startup (due to lack of bulk // memory feature) then we include BSS segments verbatim. - bool requiredInBinary() const { return !isBss || config->emitBssSegments; } + bool requiredInBinary() const { return !isBss || ctx.emitBssSegments; } bool isTLS() const { return name == ".tdata"; } diff --git a/lld/wasm/Relocations.cpp b/lld/wasm/Relocations.cpp index ce41cdcb3e07f..fcffacbc77af0 100644 --- a/lld/wasm/Relocations.cpp +++ b/lld/wasm/Relocations.cpp @@ -19,7 +19,7 @@ using namespace llvm::wasm; namespace lld::wasm { static bool requiresGOTAccess(const Symbol *sym) { - if (!config->isPic && + if (!ctx.isPic && config->unresolvedSymbols != UnresolvedPolicy::ImportDynamic) return false; if (sym->isHidden() || sym->isLocal()) @@ -141,7 +141,7 @@ void scanRelocations(InputChunk *chunk) { break; } - if (config->isPic || + if (ctx.isPic || (sym->isUndefined() && config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic)) { switch (reloc.Type) { diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp index 1563e66729722..9dd599e18975e 100644 --- a/lld/wasm/SymbolTable.cpp +++ b/lld/wasm/SymbolTable.cpp @@ -535,8 +535,7 @@ Symbol *SymbolTable::addUndefinedFunction(StringRef name, } else { lazy->extract(); if (!config->whyExtract.empty()) - config->whyExtractRecords.emplace_back(toString(file), s->getFile(), - *s); + ctx.whyExtractRecords.emplace_back(toString(file), s->getFile(), *s); } } else { auto existingFunction = dyn_cast(s); @@ -774,7 +773,7 @@ void SymbolTable::addLazy(ArchiveFile *file, const Archive::Symbol *sym) { const InputFile *oldFile = s->getFile(); file->addMember(sym); if (!config->whyExtract.empty()) - config->whyExtractRecords.emplace_back(toString(oldFile), s->getFile(), *s); + ctx.whyExtractRecords.emplace_back(toString(oldFile), s->getFile(), *s); } bool SymbolTable::addComdat(StringRef name) { diff --git a/lld/wasm/SyntheticSections.cpp 
b/lld/wasm/SyntheticSections.cpp index 3a9c147161a2d..f95174fe6f269 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -55,7 +55,7 @@ class SubSection { } // namespace bool DylinkSection::isNeeded() const { - return config->isPic || + return ctx.isPic || config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic || !symtab->sharedFiles.empty(); } @@ -174,7 +174,7 @@ void ImportSection::addGOTEntry(Symbol *sym) { return; LLVM_DEBUG(dbgs() << "addGOTEntry: " << toString(*sym) << "\n"); sym->setGOTIndex(numImportedGlobals++); - if (config->isPic) { + if (ctx.isPic) { // Any symbol that is assigned an normal GOT entry must be exported // otherwise the dynamic linker won't be able create the entry that contains // it. @@ -319,7 +319,7 @@ void TableSection::addTable(InputTable *table) { return; // Some inputs require that the indirect function table be assigned to table // number 0. - if (config->legacyFunctionTable && + if (ctx.legacyFunctionTable && isa(WasmSym::indirectFunctionTable) && cast(WasmSym::indirectFunctionTable)->table == table) { if (out.importSec->getNumImportedTables()) { @@ -474,7 +474,7 @@ void GlobalSection::writeBody() { // In the case of dynamic linking, unless we have 'extended-const' // available, these global must to be mutable since they get updated to // the correct runtime value during `__wasm_apply_global_relocs`. - if (!config->extendedConst && config->isPic && !sym->isTLS()) + if (!config->extendedConst && ctx.isPic && !sym->isTLS()) mutable_ = true; // With multi-theadeding any TLS globals must be mutable since they get // set during `__wasm_apply_global_tls_relocs` @@ -487,7 +487,7 @@ void GlobalSection::writeBody() { bool useExtendedConst = false; uint32_t globalIdx; int64_t offset; - if (config->extendedConst && config->isPic) { + if (config->extendedConst && ctx.isPic) { if (auto *d = dyn_cast(sym)) { if (!sym->isTLS()) { globalIdx = WasmSym::memoryBase->getGlobalIndex(); @@ -582,7 +582,7 @@ void ElemSection::writeBody() { WasmInitExpr initExpr; initExpr.Extended = false; - if (config->isPic) { + if (ctx.isPic) { initExpr.Inst.Opcode = WASM_OPCODE_GLOBAL_GET; initExpr.Inst.Value.Global = (config->is64.value_or(false) ? WasmSym::tableBase32 diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp index 805018c58dccb..7c19a1baf4c04 100644 --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -332,7 +332,7 @@ void Writer::layoutMemory() { uint64_t memoryPtr = 0; auto placeStack = [&]() { - if (config->relocatable || config->isPic) + if (config->relocatable || ctx.isPic) return; memoryPtr = alignTo(memoryPtr, stackAlignment); if (WasmSym::stackLow) @@ -415,7 +415,7 @@ void Writer::layoutMemory() { uint64_t staticDataSize = memoryPtr - dataStart; log("mem: static data = " + Twine(staticDataSize)); - if (config->isPic) + if (ctx.isPic) out.dylinkSec->memSize = staticDataSize; if (!config->stackFirst) @@ -489,7 +489,7 @@ void Writer::layoutMemory() { if (max == 0) { // If no maxMemory config was supplied but we are building with // shared memory, we need to pick a sensible upper limit. - if (config->isPic) + if (ctx.isPic) max = maxMemorySetting; else max = memoryPtr; @@ -568,7 +568,7 @@ void Writer::populateTargetFeatures() { SmallSet &allowed = out.targetFeaturesSec->features; bool tlsUsed = false; - if (config->isPic) { + if (ctx.isPic) { // This should not be necessary because all PIC objects should // contain the mutable-globals feature. 
// TODO (https://github.com/llvm/llvm-project/issues/51681) @@ -683,7 +683,7 @@ void Writer::populateTargetFeatures() { // the bss segments, so these segments need to exist in the binary. if (config->emitRelocs || (config->memoryImport.has_value() && !allowed.count("bulk-memory"))) - config->emitBssSegments = true; + ctx.emitBssSegments = true; if (allowed.count("extended-const")) config->extendedConst = true; @@ -734,18 +734,18 @@ static bool shouldImport(Symbol *sym) { return true; if (!sym->isUndefined()) return false; - if (sym->isWeak() && !config->relocatable && !config->isPic) + if (sym->isWeak() && !config->relocatable && !ctx.isPic) return false; // In PIC mode we only need to import functions when they are called directly. // Indirect usage all goes via GOT imports. - if (config->isPic) { + if (ctx.isPic) { if (auto *f = dyn_cast(sym)) if (!f->isCalledDirectly) return false; } - if (config->isPic || config->relocatable || config->importUndefined || + if (ctx.isPic || config->relocatable || config->importUndefined || config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic) return true; if (config->allowUndefinedSymbols.count(sym->getName()) != 0) @@ -873,7 +873,7 @@ void Writer::calculateTypes() { // which calls the constructors and destructors. void Writer::createCommandExportWrappers() { // This logic doesn't currently support Emscripten-style PIC mode. - assert(!config->isPic); + assert(!ctx.isPic); // If there are no ctors and there's no libc `__wasm_call_dtors` to // call, don't wrap the exports. @@ -1072,7 +1072,7 @@ void Writer::combineOutputSegments() { // This restriction does not apply when the extended const extension is // available: https://github.com/WebAssembly/extended-const assert(!config->extendedConst); - assert(config->isPic && !config->sharedMemory); + assert(ctx.isPic && !config->sharedMemory); if (segments.size() <= 1) return; OutputSegment *combined = make(".data"); @@ -1174,7 +1174,7 @@ void Writer::createSyntheticInitFunctions() { } } - if (config->isPic && out.globalSec->needsRelocations()) { + if (ctx.isPic && out.globalSec->needsRelocations()) { WasmSym::applyGlobalRelocs = symtab->addSyntheticFunction( "__wasm_apply_global_relocs", WASM_SYMBOL_VISIBILITY_HIDDEN, make(nullSignature, "__wasm_apply_global_relocs")); @@ -1257,7 +1257,7 @@ void Writer::createInitMemoryFunction() { // (i32.const 1) auto writeGetFlagAddress = [&]() { - if (config->isPic) { + if (ctx.isPic) { writeU8(os, WASM_OPCODE_LOCAL_GET, "local.get"); writeUleb128(os, 0, "local 0"); } else { @@ -1267,7 +1267,7 @@ void Writer::createInitMemoryFunction() { if (config->sharedMemory) { // With PIC code we cache the flag address in local 0 - if (config->isPic) { + if (ctx.isPic) { writeUleb128(os, 1, "num local decls"); writeUleb128(os, 2, "local count"); writeU8(os, is64 ? WASM_TYPE_I64 : WASM_TYPE_I32, "address type"); @@ -1317,7 +1317,7 @@ void Writer::createInitMemoryFunction() { // instructions take as their first argument the destination // address. writePtrConst(os, s->startVA, is64, "destination address"); - if (config->isPic) { + if (ctx.isPic) { writeU8(os, WASM_OPCODE_GLOBAL_GET, "GLOBAL_GET"); writeUleb128(os, WasmSym::memoryBase->getGlobalIndex(), "__memory_base"); @@ -1329,7 +1329,7 @@ void Writer::createInitMemoryFunction() { // global. This allows the runtime to use this static copy of the // TLS data for the first/main thread. 
if (config->sharedMemory && s->isTLS()) { - if (config->isPic) { + if (ctx.isPic) { // Cache the result of the addionion in local 0 writeU8(os, WASM_OPCODE_LOCAL_TEE, "local.tee"); writeUleb128(os, 1, "local 1"); @@ -1339,7 +1339,7 @@ void Writer::createInitMemoryFunction() { writeU8(os, WASM_OPCODE_GLOBAL_SET, "GLOBAL_SET"); writeUleb128(os, WasmSym::tlsBase->getGlobalIndex(), "__tls_base"); - if (config->isPic) { + if (ctx.isPic) { writeU8(os, WASM_OPCODE_LOCAL_GET, "local.tee"); writeUleb128(os, 1, "local 1"); } @@ -1687,7 +1687,7 @@ void Writer::createSyntheticSectionsPostLayout() { void Writer::run() { // For PIC code the table base is assigned dynamically by the loader. // For non-PIC, we start at 1 so that accessing table index 0 always traps. - if (!config->isPic) { + if (!ctx.isPic) { if (WasmSym::definedTableBase) WasmSym::definedTableBase->setVA(config->tableBase); if (WasmSym::definedTableBase32) @@ -1734,7 +1734,7 @@ void Writer::run() { // `__memory_base` import. Unless we support the extended const expression we // can't do addition inside the constant expression, so we much combine the // segments into a single one that can live at `__memory_base`. - if (config->isPic && !config->extendedConst && !config->sharedMemory) { + if (ctx.isPic && !config->extendedConst && !config->sharedMemory) { // In shared memory mode all data segments are passive and initialized // via __wasm_init_memory. log("-- combineOutputSegments"); @@ -1779,7 +1779,7 @@ void Writer::run() { // If the input contains a call to `__wasm_call_ctors`, either in one of // the input objects or an explicit export from the command-line, we // assume ctors and dtors are taken care of already. - if (!config->relocatable && !config->isPic && + if (!config->relocatable && !ctx.isPic && !WasmSym::callCtors->isUsedInRegularObj && !WasmSym::callCtors->isExported()) { log("-- createCommandExportWrappers"); From 8434e5d0a16b11ccdc29fc66a3843a94b0ad19f1 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 18 Jan 2024 15:04:48 -0800 Subject: [PATCH 051/843] [dfsan] Don't clear shadow on dlopen(NULL, flags) This ports msan https://reviews.llvm.org/D14795 to dfsan. dfsan, like msan, clears shadow for globals in a newly opened DSO in case the DSO occupies the address of a previously labeled/poisoned area. The operation should not happen on the main executable. In addition, for a DT_EXEC executable, l_addr is zero and will lead to a null pointer dereference in ForEachMappedRegion. 
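As a minimal illustration (not part of this patch; it assumes Linux/glibc, where dlinfo() and RTLD_DI_LINKMAP are GNU extensions, and linking with -ldl), the following standalone sketch shows the case being guarded against: dlopen(NULL, flags) returns a handle to the already-loaded main executable, and for a DT_EXEC (non-PIE) binary its link_map::l_addr is 0.

  #include <dlfcn.h>
  #include <link.h>
  #include <cstdio>

  int main() {
    // A null filename requests the main executable itself; no new DSO is
    // mapped, so there is no newly mapped shadow that needs clearing.
    void *handle = dlopen(nullptr, RTLD_LAZY);
    link_map *map = nullptr;
    if (handle && dlinfo(handle, RTLD_DI_LINKMAP, &map) == 0 && map) {
      // Prints 0 for a non-PIE executable: walking its mapped regions
      // relative to l_addr would start from a null base, which is the
      // crash the `filename && map` check below avoids.
      std::printf("l_addr = %#lx\n", (unsigned long)map->l_addr);
    }
    return 0;
  }

Hence the interceptor below clears shadow only when a real filename was passed, mirroring the msan behavior being ported here.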
--- compiler-rt/lib/dfsan/dfsan_custom.cpp | 2 +- compiler-rt/test/dfsan/custom.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/dfsan/dfsan_custom.cpp b/compiler-rt/lib/dfsan/dfsan_custom.cpp index c5c14a2d1b0eb..85b796bd6349c 100644 --- a/compiler-rt/lib/dfsan/dfsan_custom.cpp +++ b/compiler-rt/lib/dfsan/dfsan_custom.cpp @@ -842,7 +842,7 @@ __dfsw_dlopen(const char *filename, int flag, dfsan_label filename_label, dfsan_label flag_label, dfsan_label *ret_label) { void *handle = dlopen(filename, flag); link_map *map = GET_LINK_MAP_BY_DLOPEN_HANDLE(handle); - if (map) + if (filename && map) ForEachMappedRegion(map, dfsan_set_zero_label); *ret_label = 0; return handle; diff --git a/compiler-rt/test/dfsan/custom.cpp b/compiler-rt/test/dfsan/custom.cpp index 2ebeb1e451978..4bb818813cf7c 100644 --- a/compiler-rt/test/dfsan/custom.cpp +++ b/compiler-rt/test/dfsan/custom.cpp @@ -1,7 +1,7 @@ // RUN: %clang_dfsan %s -o %t && DFSAN_OPTIONS="strict_data_dependencies=0" %run %t // RUN: %clang_dfsan -DSTRICT_DATA_DEPENDENCIES %s -o %t && %run %t // RUN: %clang_dfsan -DORIGIN_TRACKING -mllvm -dfsan-track-origins=1 -mllvm -dfsan-combine-pointer-labels-on-load=false -DSTRICT_DATA_DEPENDENCIES %s -o %t && %run %t -// RUN: %clang_dfsan -DORIGIN_TRACKING -mllvm -dfsan-track-origins=1 -mllvm -dfsan-combine-pointer-labels-on-load=false %s -o %t && DFSAN_OPTIONS="strict_data_dependencies=0" %run %t +// RUN: %clang_dfsan -DORIGIN_TRACKING -mllvm -dfsan-track-origins=1 -mllvm -dfsan-combine-pointer-labels-on-load=false -no-pie %s -o %t && DFSAN_OPTIONS="strict_data_dependencies=0" %run %t // // Tests custom implementations of various glibc functions. From 5c150e7eeba9db13cc65b329b3c3537b613ae61d Mon Sep 17 00:00:00 2001 From: Daniel Thornburgh Date: Thu, 18 Jan 2024 14:43:12 -0800 Subject: [PATCH 052/843] Revert #76246 and #76083 These cause test build failures on Windows. 
This reverts the following commits: 57ca74843586c9a93c425036c5538aae0a2cfa60 d06ae33ec32122bb526fb35025c1f0cf979f1090 --- .github/workflows/libcxx-build-and-test.yaml | 8 ++ libcxx/CMakeLists.txt | 15 +++- libcxx/cmake/caches/Generic-cxx26.cmake | 1 + .../Generic-hardening-mode-extensive.cmake | 1 + .../cmake/caches/Generic-no-exceptions.cmake | 1 + .../caches/Generic-no-experimental.cmake | 1 + .../cmake/caches/Generic-no-filesystem.cmake | 1 + .../caches/Generic-no-localization.cmake | 1 + .../caches/Generic-no-random_device.cmake | 1 + libcxx/cmake/caches/Generic-no-threads.cmake | 1 + libcxx/cmake/caches/Generic-no-tzdb.cmake | 1 + libcxx/cmake/caches/Generic-no-unicode.cmake | 1 + .../caches/Generic-no-wide-characters.cmake | 1 + libcxx/docs/Modules.rst | 2 +- libcxx/docs/ReleaseNotes/18.rst | 4 - libcxx/docs/TestingLibcxx.rst | 9 +- libcxx/modules/CMakeLists.txt | 28 ++++++ libcxx/modules/CMakeLists.txt.in | 86 +++++++++++++++++++ libcxx/test/CMakeLists.txt | 25 ++++++ libcxx/test/configs/cmake-bridge.cfg.in | 7 ++ libcxx/test/libcxx/module_std.gen.py | 1 - libcxx/test/libcxx/module_std_compat.gen.py | 1 - .../libcxx/selftest/modules/no-modules.sh.cpp | 14 --- .../modules/std-and-std.compat-module.sh.cpp | 26 ------ .../libcxx/selftest/modules/std-module.sh.cpp | 24 ------ .../selftest/modules/std.compat-module.sh.cpp | 28 ------ libcxx/test/lit.local.cfg | 83 ++++++++++++++++++ libcxx/test/std/modules/std.compat.pass.cpp | 7 +- libcxx/test/std/modules/std.pass.cpp | 7 +- libcxx/utils/ci/Dockerfile | 11 +++ libcxx/utils/ci/buildkite-pipeline.yml | 2 + libcxx/utils/ci/run-buildbot | 10 +++ libcxx/utils/libcxx/test/config.py | 4 - libcxx/utils/libcxx/test/features.py | 12 +-- libcxx/utils/libcxx/test/format.py | 69 ++------------- libcxx/utils/libcxx/test/modules.py | 4 +- 36 files changed, 298 insertions(+), 200 deletions(-) create mode 100644 libcxx/modules/CMakeLists.txt.in delete mode 100644 libcxx/test/libcxx/selftest/modules/no-modules.sh.cpp delete mode 100644 libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp delete mode 100644 libcxx/test/libcxx/selftest/modules/std-module.sh.cpp delete mode 100644 libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp create mode 100644 libcxx/test/lit.local.cfg diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 0cab9b841e4ee..1666a687aa5d0 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -35,6 +35,7 @@ concurrency: env: + CMAKE: "/opt/bin/cmake" # LLVM POST-BRANCH bump version # LLVM POST-BRANCH add compiler test for ToT - 1, e.g. "Clang 17" # LLVM RELEASE bump remove compiler ToT - 3, e.g. "Clang 15" @@ -168,18 +169,24 @@ jobs: 'bootstrapping-build' ] machine: [ 'libcxx-runners-8-set' ] + std_modules: [ 'OFF' ] include: - config: 'generic-cxx26' machine: libcxx-runners-8-set + std_modules: 'ON' - config: 'generic-asan' machine: libcxx-runners-8-set + std_modules: 'OFF' - config: 'generic-tsan' machine: libcxx-runners-8-set + std_modules: 'OFF' - config: 'generic-ubsan' machine: libcxx-runners-8-set + std_modules: 'OFF' # Use a larger machine for MSAN to avoid timeout and memory allocation issues. 
- config: 'generic-msan' machine: libcxx-runners-8-set + std_modules: 'OFF' runs-on: ${{ matrix.machine }} steps: - uses: actions/checkout@v4 @@ -189,6 +196,7 @@ jobs: CC: clang-18 CXX: clang++-18 ENABLE_CLANG_TIDY: "OFF" + ENABLE_STD_MODULES: ${{ matrix.std_modules }} - uses: actions/upload-artifact@v3 if: always() with: diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 53de53480266b..a5b5169747008 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -124,6 +124,12 @@ option(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS the shared library they shipped should turn this on and see `include/__availability` for more details." OFF) option(LIBCXX_ENABLE_CLANG_TIDY "Whether to compile and run clang-tidy checks" OFF) +# TODO MODULES Remove this option and test for the requirements (CMake/Clang) instead. +option(LIBCXX_ENABLE_STD_MODULES + "Whether to enable the building the C++23 `std` module. This feature is + experimental and has additional dependencies. Only enable this when + interested in testing or developing this module. See + https://libcxx.llvm.org/Modules.html for more information." OFF) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(LIBCXX_DEFAULT_TEST_CONFIG "llvm-libc++-shared-gcc.cfg.in") @@ -773,6 +779,7 @@ config_define_if_not(LIBCXX_ENABLE_RANDOM_DEVICE _LIBCPP_HAS_NO_RANDOM_DEVICE) config_define_if_not(LIBCXX_ENABLE_LOCALIZATION _LIBCPP_HAS_NO_LOCALIZATION) config_define_if_not(LIBCXX_ENABLE_UNICODE _LIBCPP_HAS_NO_UNICODE) config_define_if_not(LIBCXX_ENABLE_WIDE_CHARACTERS _LIBCPP_HAS_NO_WIDE_CHARACTERS) +config_define_if_not(LIBCXX_ENABLE_STD_MODULES _LIBCPP_HAS_NO_STD_MODULES) config_define_if_not(LIBCXX_ENABLE_TIME_ZONE_DATABASE _LIBCPP_HAS_NO_TIME_ZONE_DATABASE) config_define_if_not(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) @@ -856,7 +863,9 @@ endfunction() add_subdirectory(include) add_subdirectory(src) add_subdirectory(utils) -add_subdirectory(modules) +if (LIBCXX_ENABLE_STD_MODULES) + add_subdirectory(modules) +endif() set(LIBCXX_TEST_DEPS "cxx_experimental") @@ -864,7 +873,9 @@ if (LIBCXX_ENABLE_CLANG_TIDY) list(APPEND LIBCXX_TEST_DEPS cxx-tidy) endif() -list(APPEND LIBCXX_TEST_DEPS generate-cxx-modules) +if (LIBCXX_ENABLE_STD_MODULES) + list(APPEND LIBCXX_TEST_DEPS generate-cxx-modules generate-test-module-std) +endif() if (LIBCXX_INCLUDE_BENCHMARKS) add_subdirectory(benchmarks) diff --git a/libcxx/cmake/caches/Generic-cxx26.cmake b/libcxx/cmake/caches/Generic-cxx26.cmake index 6ba9482af5785..f48d72d493c2f 100644 --- a/libcxx/cmake/caches/Generic-cxx26.cmake +++ b/libcxx/cmake/caches/Generic-cxx26.cmake @@ -1,2 +1,3 @@ +set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_TEST_PARAMS "std=c++26" CACHE STRING "") set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "") diff --git a/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake b/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake index 72263dfd84635..0487377d4e9ba 100644 --- a/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake +++ b/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake @@ -1 +1,2 @@ +set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. 
set(LIBCXX_HARDENING_MODE "extensive" CACHE STRING "") diff --git a/libcxx/cmake/caches/Generic-no-exceptions.cmake b/libcxx/cmake/caches/Generic-no-exceptions.cmake index f0dffef60dba0..f405f7fe99375 100644 --- a/libcxx/cmake/caches/Generic-no-exceptions.cmake +++ b/libcxx/cmake/caches/Generic-no-exceptions.cmake @@ -1,2 +1,3 @@ +set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "") set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-experimental.cmake b/libcxx/cmake/caches/Generic-no-experimental.cmake index f33ed01418990..fe14e7afed7b9 100644 --- a/libcxx/cmake/caches/Generic-no-experimental.cmake +++ b/libcxx/cmake/caches/Generic-no-experimental.cmake @@ -1,2 +1,3 @@ +set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_TEST_PARAMS "enable_experimental=False" CACHE STRING "") set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "") diff --git a/libcxx/cmake/caches/Generic-no-filesystem.cmake b/libcxx/cmake/caches/Generic-no-filesystem.cmake index 4000f3a3e8ef2..db62f86854d94 100644 --- a/libcxx/cmake/caches/Generic-no-filesystem.cmake +++ b/libcxx/cmake/caches/Generic-no-filesystem.cmake @@ -1 +1,2 @@ +set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-localization.cmake b/libcxx/cmake/caches/Generic-no-localization.cmake index 79d6b44c7139a..54a7ec3f1f5b3 100644 --- a/libcxx/cmake/caches/Generic-no-localization.cmake +++ b/libcxx/cmake/caches/Generic-no-localization.cmake @@ -1 +1,2 @@ +set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-random_device.cmake b/libcxx/cmake/caches/Generic-no-random_device.cmake index e9b4cc60cc80e..adfa2458a8edf 100644 --- a/libcxx/cmake/caches/Generic-no-random_device.cmake +++ b/libcxx/cmake/caches/Generic-no-random_device.cmake @@ -1 +1,2 @@ +set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-threads.cmake b/libcxx/cmake/caches/Generic-no-threads.cmake index 616baef1be7be..2aeab22915e00 100644 --- a/libcxx/cmake/caches/Generic-no-threads.cmake +++ b/libcxx/cmake/caches/Generic-no-threads.cmake @@ -1,3 +1,4 @@ +set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_THREADS OFF CACHE BOOL "") set(LIBCXXABI_ENABLE_THREADS OFF CACHE BOOL "") set(LIBCXX_ENABLE_MONOTONIC_CLOCK OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-tzdb.cmake b/libcxx/cmake/caches/Generic-no-tzdb.cmake index 27c826edfecff..c5dc882e58442 100644 --- a/libcxx/cmake/caches/Generic-no-tzdb.cmake +++ b/libcxx/cmake/caches/Generic-no-tzdb.cmake @@ -1 +1,2 @@ +set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. 
set(LIBCXX_ENABLE_TIME_ZONE_DATABASE OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-unicode.cmake b/libcxx/cmake/caches/Generic-no-unicode.cmake index 01160bf218981..880e2d502ad91 100644 --- a/libcxx/cmake/caches/Generic-no-unicode.cmake +++ b/libcxx/cmake/caches/Generic-no-unicode.cmake @@ -1 +1,2 @@ +set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_UNICODE OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-wide-characters.cmake b/libcxx/cmake/caches/Generic-no-wide-characters.cmake index 728d41086a386..5036f6abd52e8 100644 --- a/libcxx/cmake/caches/Generic-no-wide-characters.cmake +++ b/libcxx/cmake/caches/Generic-no-wide-characters.cmake @@ -1 +1,2 @@ +set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_WIDE_CHARACTERS OFF CACHE BOOL "") diff --git a/libcxx/docs/Modules.rst b/libcxx/docs/Modules.rst index 1998cd9d1d267..5099e6095582c 100644 --- a/libcxx/docs/Modules.rst +++ b/libcxx/docs/Modules.rst @@ -115,7 +115,7 @@ directory. First libc++ needs to be build with module support enabled. $ git clone https://github.com/llvm/llvm-project.git $ cd llvm-project $ mkdir build - $ cmake -G Ninja -S runtimes -B build -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" + $ cmake -G Ninja -S runtimes -B build -DLIBCXX_ENABLE_STD_MODULES=ON -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" $ ninja -C build The above ``build`` directory will be referred to as ```` in the diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index 77b7939a0c0ac..983363af54e73 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -89,10 +89,6 @@ Improvements and New Features - The ``_LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE`` macro has been added to make the function ``std::shared_ptr<...>::unique()`` available. -- The cmake option ``LIBCXX_ENABLE_STD_MODULES`` has been removed. The test - infrastructure no longer depends on a modern CMake, it works with the minimal - required LLVM version (3.20.0). - Deprecations and Removals ------------------------- diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index 50ee9d4ee400b..e7645cb5885f1 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -394,7 +394,7 @@ Custom Directives ~~~~~~~~~~~~~~~~~ Lit has many directives built in (e.g., ``DEFINE``, ``UNSUPPORTED``). In addition to those directives, libc++ adds two additional libc++-specific directives that makes -writing tests easier. See `libc++-specific Lit Directives`_ for more information about the ``FILE_DEPENDENCIES``, ``ADDITIONAL_COMPILE_FLAGS``, and ``MODULE_DEPENDENCIES`` libc++-specific directives. +writing tests easier. See `libc++-specific Lit Directives`_ for more information about the ``FILE_DEPENDENCIES`` and ``ADDITIONAL_COMPILE_FLAGS`` libc++-specific directives. .. _libc++-specific Lit Directives: .. list-table:: libc++-specific Lit Directives @@ -417,13 +417,6 @@ writing tests easier. See `libc++-specific Lit Directives`_ for more information - The additional compiler flags specified by a space-separated list to the ``ADDITIONAL_COMPILE_FLAGS`` libc++-specific Lit directive will be added to the end of the ``%{compile_flags}`` substitution for the test that contains it. 
This libc++-specific Lit directive makes it possible to add special compilation flags without having to resort to writing a ``.sh.cpp`` test (see `Lit Meaning of libc++ Test Filenames`_), more powerful but perhaps overkill. - * - ``MODULE_DEPENDENCIES`` - - ``// MODULE_DEPENDENCIES: std std.compat`` - - This directive will build the required C++23 standard library - modules and add the additional compiler flags in - %{compile_flags}. (Libc++ offers these modules in C++20 as an - extension.) - Benchmarks ========== diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt index 31fbadf449f77..fae6448a7eec8 100644 --- a/libcxx/modules/CMakeLists.txt +++ b/libcxx/modules/CMakeLists.txt @@ -1,3 +1,8 @@ +if (CMAKE_VERSION VERSION_LESS 3.26) + message(WARNING "The libc++ modules won't be available because the CMake version is too old. Update to CMake 3.26 or later.") + return() +endif() + # The headers of Table 24: C++ library headers [tab:headers.cpp] # and the headers of Table 25: C++ headers for C library facilities [tab:headers.cpp.c] set(LIBCXX_MODULE_STD_SOURCES @@ -137,6 +142,28 @@ set(LIBCXX_MODULE_STD_COMPAT_SOURCES std.compat/cwctype.inc ) +# TODO MODULES the CMakeLists.txt in the install directory is only temporary +# When that is removed the configured file can use the substitution +# LIBCXX_GENERATED_INCLUDE_TARGET_DIR avoiding this set. +# Also clean up the parts needed to generate the install version. +# - LIBCXX_GENERATED_INCLUDE_DIR contains the libc++ headers +# - LIBCXX_GENERATED_INCLUDE_TARGET_DIR contains the libc++ site config +if ("${LIBCXX_GENERATED_INCLUDE_DIR}" STREQUAL "${LIBCXX_GENERATED_INCLUDE_TARGET_DIR}") + # This typically happens when the target is not installed. + set(LIBCXX_CONFIGURED_INCLUDE_DIRS "${LIBCXX_GENERATED_INCLUDE_DIR}") +else() + # It's important that the arch directory be included first so that its header files + # which interpose on the default include dir be included instead of the default ones. + set(LIBCXX_CONFIGURED_INCLUDE_DIRS + "${LIBCXX_GENERATED_INCLUDE_TARGET_DIR};${LIBCXX_GENERATED_INCLUDE_DIR}" + ) +endif() +configure_file( + "CMakeLists.txt.in" + "${LIBCXX_GENERATED_MODULE_DIR}/CMakeLists.txt" + @ONLY +) + set(LIBCXX_MODULE_STD_INCLUDE_SOURCES) foreach(file ${LIBCXX_MODULE_STD_SOURCES}) set( @@ -166,6 +193,7 @@ configure_file( ) set(_all_modules) +list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/CMakeLists.txt") list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/std.cppm") list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/std.compat.cppm") foreach(file ${LIBCXX_MODULE_STD_SOURCES} ${LIBCXX_MODULE_STD_COMPAT_SOURCES}) diff --git a/libcxx/modules/CMakeLists.txt.in b/libcxx/modules/CMakeLists.txt.in new file mode 100644 index 0000000000000..98168673ebfe9 --- /dev/null +++ b/libcxx/modules/CMakeLists.txt.in @@ -0,0 +1,86 @@ +cmake_minimum_required(VERSION 3.26) + +project(libc++-modules LANGUAGES CXX) + +# Enable CMake's module support +if(CMAKE_VERSION VERSION_LESS "3.28.0") + if(CMAKE_VERSION VERSION_LESS "3.27.0") + set(CMAKE_EXPERIMENTAL_CXX_MODULE_CMAKE_API "2182bf5c-ef0d-489a-91da-49dbc3090d2a") + else() + set(CMAKE_EXPERIMENTAL_CXX_MODULE_CMAKE_API "aa1f7df0-828a-4fcd-9afc-2dc80491aca7") + endif() + set(CMAKE_EXPERIMENTAL_CXX_MODULE_DYNDEP 1) +else() + cmake_policy(VERSION 3.28) +endif() + +# Default to C++ extensions being off. Libc++'s modules support have trouble +# with extensions right now. +set(CMAKE_CXX_EXTENSIONS OFF) + +# Propagates the CMake options to the modules. 
+# +# This uses the std module hard-coded since the std.compat module does not +# depend on these flags. +macro(compile_define_if_not condition def) + if (NOT ${condition}) + target_compile_definitions(std PRIVATE ${def}) + endif() +endmacro() +macro(compile_define_if condition def) + if (${condition}) + target_compile_definitions(std PRIVATE ${def}) + endif() +endmacro() + +### STD + +add_library(std) +target_sources(std + PUBLIC FILE_SET cxx_modules TYPE CXX_MODULES FILES + std.cppm +) + +target_include_directories(std SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@) + +if (NOT @LIBCXX_ENABLE_EXCEPTIONS@) + target_compile_options(std PUBLIC -fno-exceptions) +endif() + +target_compile_options(std + PUBLIC + -nostdinc++ + -Wno-reserved-module-identifier + -Wno-reserved-user-defined-literal + @LIBCXX_COMPILE_FLAGS@ +) +set_target_properties(std + PROPERTIES + OUTPUT_NAME "c++std" +) + +### STD.COMPAT + +add_library(std.compat) +target_sources(std.compat + PUBLIC FILE_SET cxx_modules TYPE CXX_MODULES FILES + std.compat.cppm +) + +target_include_directories(std.compat SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@) + +if (NOT @LIBCXX_ENABLE_EXCEPTIONS@) + target_compile_options(std.compat PUBLIC -fno-exceptions) +endif() + +target_compile_options(std.compat + PUBLIC + -nostdinc++ + -Wno-reserved-module-identifier + -Wno-reserved-user-defined-literal + @LIBCXX_COMPILE_FLAGS@ +) +set_target_properties(std.compat + PROPERTIES + OUTPUT_NAME "c++std.compat" +) diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt index 52620fc55feeb..48dd233462ab3 100644 --- a/libcxx/test/CMakeLists.txt +++ b/libcxx/test/CMakeLists.txt @@ -87,6 +87,31 @@ if (LIBCXX_INCLUDE_TESTS) ${CMAKE_CURRENT_BINARY_DIR} DEPENDS cxx-test-depends) + if(LIBCXX_ENABLE_STD_MODULES) + # Generates the modules used in the test. + # Note the test will regenerate this with the proper setting + # - the right DCMAKE_CXX_STANDARD + # - the right test compilation flags + # Since modules depend on these flags there currently is no way to + # avoid generating these for the tests. The advantage of the + # pre generation is that less build information needs to be shared + # in the bridge. + add_custom_command( + OUTPUT "${CMAKE_BINARY_DIR}/test/__config_module__/CMakeCache.txt" + COMMAND + ${CMAKE_COMMAND} + "-G${CMAKE_GENERATOR}" + "-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}" + "-B${CMAKE_BINARY_DIR}/test/__config_module__" + "-H${LIBCXX_GENERATED_MODULE_DIR}" + "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" + "-DCMAKE_CXX_STANDARD=23" + "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON" + ) + add_custom_target(generate-test-module-std + DEPENDS "${CMAKE_BINARY_DIR}/test/__config_module__/CMakeCache.txt" + COMMENT "Builds generic module std.") + endif() endif() if (LIBCXX_GENERATE_COVERAGE) diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in index 72b2ddf378bb6..0e3c3040c9644 100644 --- a/libcxx/test/configs/cmake-bridge.cfg.in +++ b/libcxx/test/configs/cmake-bridge.cfg.in @@ -31,3 +31,10 @@ config.substitutions.append(('%{target-include}', '@LIBCXX_GENERATED_INCLUDE_TAR config.substitutions.append(('%{lib}', '@LIBCXX_LIBRARY_DIR@')) config.substitutions.append(('%{module}', '@LIBCXX_GENERATED_MODULE_DIR@')) config.substitutions.append(('%{test-tools}', '@LIBCXX_TEST_TOOLS_PATH@')) + +# The test needs to manually rebuild the module. The compiler flags used in the +# test need to be the same as the compiler flags used to generate the module. 
+# In the future, when CMake can generated modules this may no longer be +# necessary. +# TODO MODULES whether it's possible to remove this substitution. +config.substitutions.append(('%{cmake}', '@CMAKE_COMMAND@')) diff --git a/libcxx/test/libcxx/module_std.gen.py b/libcxx/test/libcxx/module_std.gen.py index a9a05a0cd74e6..8e03d6e5b5b52 100644 --- a/libcxx/test/libcxx/module_std.gen.py +++ b/libcxx/test/libcxx/module_std.gen.py @@ -30,7 +30,6 @@ "%{test-tools}/clang_tidy_checks/libcxx-tidy.plugin", "%{cxx}", "%{flags} %{compile_flags}", - "std", ) diff --git a/libcxx/test/libcxx/module_std_compat.gen.py b/libcxx/test/libcxx/module_std_compat.gen.py index 2866066ccedc8..c4792db3d71e6 100644 --- a/libcxx/test/libcxx/module_std_compat.gen.py +++ b/libcxx/test/libcxx/module_std_compat.gen.py @@ -30,7 +30,6 @@ "%{test-tools}/clang_tidy_checks/libcxx-tidy.plugin", "%{cxx}", "%{flags} %{compile_flags}", - "std.compat", ) diff --git a/libcxx/test/libcxx/selftest/modules/no-modules.sh.cpp b/libcxx/test/libcxx/selftest/modules/no-modules.sh.cpp deleted file mode 100644 index 74e65a9072f7b..0000000000000 --- a/libcxx/test/libcxx/selftest/modules/no-modules.sh.cpp +++ /dev/null @@ -1,14 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// Make sure that the compile flags contain no module information. - -// MODULE_DEPENDENCIES: - -// RUN: echo "%{compile_flags}" | grep -v "std.pcm" -// RUN: echo "%{compile_flags}" | grep -v "std.compat.pcm" diff --git a/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp b/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp deleted file mode 100644 index 81241d7f43f9a..0000000000000 --- a/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp +++ /dev/null @@ -1,26 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 -// UNSUPPORTED: clang-modules-build -// UNSUPPORTED: gcc - -// XFAIL: has-no-cxx-module-support - -// picolibc does not provide the required timespec_get function, and the -// "using-if-exists" mechanism apparently did not work here. -// XFAIL: LIBCXX-PICOLIBC-FIXME - -// Make sure that the compile flags contain the expected elements. -// The tests only look for the expected components and not the exact flags. -// Otherwise changing the location of the module would break this test. 
- -// MODULE_DEPENDENCIES: std std.compat - -// RUN: echo "%{compile_flags}" | grep -- "-fmodule-file=std=.*/std.pcm .*/std.pcm" -// RUN: echo "%{compile_flags}" | grep -- "-fmodule-file=std.compat=.*/std.compat.pcm .*/std.compat.pcm" diff --git a/libcxx/test/libcxx/selftest/modules/std-module.sh.cpp b/libcxx/test/libcxx/selftest/modules/std-module.sh.cpp deleted file mode 100644 index ec43994fa1ef9..0000000000000 --- a/libcxx/test/libcxx/selftest/modules/std-module.sh.cpp +++ /dev/null @@ -1,24 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 -// UNSUPPORTED: clang-modules-build -// UNSUPPORTED: gcc - -// XFAIL: has-no-cxx-module-support - -// Make sure that the compile flags contain the expected elements. -// The tests only look for the expected components and not the exact flags. -// Otherwise changing the location of the module would break this test. - -// MODULE_DEPENDENCIES: std - -// RUN: echo "%{compile_flags}" | grep -- "-fmodule-file=std=.*/std.pcm .*/std.pcm" - -// The std module should not provide the std.compat module -// RUN: echo "%{compile_flags}" | grep -v "std.compat.pcm" diff --git a/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp b/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp deleted file mode 100644 index b74c2f1a249fc..0000000000000 --- a/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp +++ /dev/null @@ -1,28 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 -// UNSUPPORTED: clang-modules-build -// UNSUPPORTED: gcc - -// XFAIL: has-no-cxx-module-support - -// picolibc does not provide the required timespec_get function, and the -// "using-if-exists" mechanism apparently did not work here. -// XFAIL: LIBCXX-PICOLIBC-FIXME - -// Make sure that the compile flags contain the expected elements. -// The tests only look for the expected components and not the exact flags. -// Otherwise changing the location of the module would break this test. - -// MODULE_DEPENDENCIES: std.compat - -// RUN: echo "%{compile_flags}" | grep -- "-fmodule-file=std.compat=.*/std.compat.pcm .*/std.compat.pcm" - -// It's unspecified whether std.compat is built on the std module. -// Therefore don't test its presence diff --git a/libcxx/test/lit.local.cfg b/libcxx/test/lit.local.cfg new file mode 100644 index 0000000000000..1ee9086ee22e3 --- /dev/null +++ b/libcxx/test/lit.local.cfg @@ -0,0 +1,83 @@ +# ===----------------------------------------------------------------------===## +# +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +# +# ===----------------------------------------------------------------------===## + +# This configuration builds the C++23 std module. +# It is build when the current lit configuration supports modules. +# +# TODO MODULES Evaluate whether this file can be removed when CMake supports +# modules in libc++. + +import os +import site +import subprocess +import libcxx.test.params, libcxx.test.config, libcxx.test.dsl + + +def getSubstitution(substitution, config): + for orig, replacement in config.substitutions: + if orig == substitution: + return replacement + raise ValueError("Substitution {} is not in the config.".format(substitution)) + + +def appendToSubstitution(substitutions, key, value): + return [(k, v + " " + value) if k == key else (k, v) for (k, v) in substitutions] + + +std = getSubstitution("%{cxx_std}", config) +if std == "cxx26": + std = "26" +elif std == "cxx23": + std = "23" +elif std == "cxx20": + std = "20" +else: + std = "" + +if ( + std + and not "libcpp-has-no-std-modules" in config.available_features + and not "clang-modules-build" in config.available_features +): + build = os.path.join(config.test_exec_root, "__config_module__") + config.substitutions = appendToSubstitution( + config.substitutions, + "%{compile_flags}", + "-fprebuilt-module-path=" + + os.path.join(config.test_exec_root, "__config_module__/CMakeFiles/std.dir"), + ) + + cmake = getSubstitution("%{cmake}", config) + flags = getSubstitution("%{flags}", config) + if "c++experimental" in config.available_features: + flags = f"{flags} -D_LIBCPP_ENABLE_EXPERIMENTAL" + + subprocess.check_call( + [cmake, f"-DCMAKE_CXX_STANDARD={std}", f"-DCMAKE_CXX_FLAGS={flags}", build], + env={}, + ) + subprocess.check_call([cmake, "--build", build, "--", "-v"], env={}) + config.substitutions = appendToSubstitution( + config.substitutions, + "%{link_flags}", + os.path.join(build, "libc++std.a"), + ) + + config.substitutions = appendToSubstitution( + config.substitutions, + "%{compile_flags}", + "-fprebuilt-module-path=" + + os.path.join( + config.test_exec_root, "__config_module__/CMakeFiles/std.compat.dir" + ), + ) + config.substitutions = appendToSubstitution( + config.substitutions, + "%{link_flags}", + os.path.join(build, "libc++std.compat.a"), + ) diff --git a/libcxx/test/std/modules/std.compat.pass.cpp b/libcxx/test/std/modules/std.compat.pass.cpp index 40ea979e27346..9a2f330d722ed 100644 --- a/libcxx/test/std/modules/std.compat.pass.cpp +++ b/libcxx/test/std/modules/std.compat.pass.cpp @@ -7,10 +7,9 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 -// UNSUPPORTED: clang-modules-build -// UNSUPPORTED: gcc -// XFAIL: has-no-cxx-module-support +// UNSUPPORTED: libcpp-has-no-std-modules +// UNSUPPORTED: clang-modules-build // picolibc does not provide the required timespec_get function, and the // "using-if-exists" mechanism apparently did not work here. @@ -18,8 +17,6 @@ // A minimal test to validate import works. 
-// MODULE_DEPENDENCIES: std.compat - import std.compat; int main(int, char**) { return !(::strlen("Hello modular world") == 19); } diff --git a/libcxx/test/std/modules/std.pass.cpp b/libcxx/test/std/modules/std.pass.cpp index ca05825b3a186..8ec3ce27322b7 100644 --- a/libcxx/test/std/modules/std.pass.cpp +++ b/libcxx/test/std/modules/std.pass.cpp @@ -7,15 +7,12 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 -// UNSUPPORTED: clang-modules-build -// UNSUPPORTED: gcc -// XFAIL: has-no-cxx-module-support +// UNSUPPORTED: libcpp-has-no-std-modules +// UNSUPPORTED: clang-modules-build // A minimal test to validate import works. -// MODULE_DEPENDENCIES: std - import std; int main(int, char**) { diff --git a/libcxx/utils/ci/Dockerfile b/libcxx/utils/ci/Dockerfile index 225de937cc869..e4bda4f06742c 100644 --- a/libcxx/utils/ci/Dockerfile +++ b/libcxx/utils/ci/Dockerfile @@ -152,6 +152,17 @@ RUN < {self.tmp_prefix}.all_partitions """ ) From 80fcf486edefc776ec680375b7fe2d15d65c4428 Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Fri, 19 Jan 2024 00:46:12 +0100 Subject: [PATCH 053/843] [llvm-lib][Object][COFF] Use ARM64 machine type for import library descriptor objects. (#78537) --- llvm/lib/Object/COFFImportFile.cpp | 61 +++++++++++--------- llvm/test/tools/llvm-lib/arm64ec-implib.test | 12 ++-- 2 files changed, 42 insertions(+), 31 deletions(-) diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp index b60e32f498eb5..60556c149bf73 100644 --- a/llvm/lib/Object/COFFImportFile.cpp +++ b/llvm/lib/Object/COFFImportFile.cpp @@ -150,7 +150,7 @@ namespace { class ObjectFactory { using u16 = support::ulittle16_t; using u32 = support::ulittle32_t; - MachineTypes Machine; + MachineTypes NativeMachine; BumpPtrAllocator Alloc; StringRef ImportName; StringRef Library; @@ -159,7 +159,7 @@ class ObjectFactory { public: ObjectFactory(StringRef S, MachineTypes M) - : Machine(M), ImportName(S), Library(llvm::sys::path::stem(S)), + : NativeMachine(M), ImportName(S), Library(llvm::sys::path::stem(S)), ImportDescriptorSymbolName(("__IMPORT_DESCRIPTOR_" + Library).str()), NullThunkSymbolName(("\x7f" + Library + "_NULL_THUNK_DATA").str()) {} @@ -182,10 +182,14 @@ class ObjectFactory { // Create a short import file which is described in PE/COFF spec 7. Import // Library Format. NewArchiveMember createShortImport(StringRef Sym, uint16_t Ordinal, - ImportType Type, ImportNameType NameType); + ImportType Type, ImportNameType NameType, + MachineTypes Machine); // Create a weak external file which is described in PE/COFF Aux Format 3. - NewArchiveMember createWeakExternal(StringRef Sym, StringRef Weak, bool Imp); + NewArchiveMember createWeakExternal(StringRef Sym, StringRef Weak, bool Imp, + MachineTypes Machine); + + bool is64Bit() const { return COFF::is64Bit(NativeMachine); } }; } // namespace @@ -197,7 +201,7 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { // COFF Header coff_file_header Header{ - u16(Machine), + u16(NativeMachine), u16(NumberOfSections), u32(0), u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) + @@ -208,7 +212,7 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { (ImportName.size() + 1)), u32(NumberOfSymbols), u16(0), - u16(is64Bit(Machine) ? C_Invalid : IMAGE_FILE_32BIT_MACHINE), + u16(is64Bit() ? 
C_Invalid : IMAGE_FILE_32BIT_MACHINE), }; append(Buffer, Header); @@ -250,11 +254,11 @@ ObjectFactory::createImportDescriptor(std::vector &Buffer) { const coff_relocation RelocationTable[NumberOfRelocations] = { {u32(offsetof(coff_import_directory_table_entry, NameRVA)), u32(2), - u16(getImgRelRelocation(Machine))}, + u16(getImgRelRelocation(NativeMachine))}, {u32(offsetof(coff_import_directory_table_entry, ImportLookupTableRVA)), - u32(3), u16(getImgRelRelocation(Machine))}, + u32(3), u16(getImgRelRelocation(NativeMachine))}, {u32(offsetof(coff_import_directory_table_entry, ImportAddressTableRVA)), - u32(4), u16(getImgRelRelocation(Machine))}, + u32(4), u16(getImgRelRelocation(NativeMachine))}, }; append(Buffer, RelocationTable); @@ -336,7 +340,7 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { // COFF Header coff_file_header Header{ - u16(Machine), + u16(NativeMachine), u16(NumberOfSections), u32(0), u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) + @@ -344,7 +348,7 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { sizeof(coff_import_directory_table_entry)), u32(NumberOfSymbols), u16(0), - u16(is64Bit(Machine) ? C_Invalid : IMAGE_FILE_32BIT_MACHINE), + u16(is64Bit() ? C_Invalid : IMAGE_FILE_32BIT_MACHINE), }; append(Buffer, Header); @@ -393,11 +397,11 @@ ObjectFactory::createNullImportDescriptor(std::vector &Buffer) { NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { const uint32_t NumberOfSections = 2; const uint32_t NumberOfSymbols = 1; - uint32_t VASize = is64Bit(Machine) ? 8 : 4; + uint32_t VASize = is64Bit() ? 8 : 4; // COFF Header coff_file_header Header{ - u16(Machine), + u16(NativeMachine), u16(NumberOfSections), u32(0), u32(sizeof(Header) + (NumberOfSections * sizeof(coff_section)) + @@ -407,7 +411,7 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { VASize), u32(NumberOfSymbols), u16(0), - u16(is64Bit(Machine) ? C_Invalid : IMAGE_FILE_32BIT_MACHINE), + u16(is64Bit() ? C_Invalid : IMAGE_FILE_32BIT_MACHINE), }; append(Buffer, Header); @@ -422,8 +426,7 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { u32(0), u16(0), u16(0), - u32((is64Bit(Machine) ? IMAGE_SCN_ALIGN_8BYTES - : IMAGE_SCN_ALIGN_4BYTES) | + u32((is64Bit() ? IMAGE_SCN_ALIGN_8BYTES : IMAGE_SCN_ALIGN_4BYTES) | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)}, {{'.', 'i', 'd', 'a', 't', 'a', '$', '4'}, @@ -436,8 +439,7 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { u32(0), u16(0), u16(0), - u32((is64Bit(Machine) ? IMAGE_SCN_ALIGN_8BYTES - : IMAGE_SCN_ALIGN_4BYTES) | + u32((is64Bit() ? 
IMAGE_SCN_ALIGN_8BYTES : IMAGE_SCN_ALIGN_4BYTES) | IMAGE_SCN_CNT_INITIALIZED_DATA | IMAGE_SCN_MEM_READ | IMAGE_SCN_MEM_WRITE)}, }; @@ -445,12 +447,12 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { // .idata$5, ILT append(Buffer, u32(0)); - if (is64Bit(Machine)) + if (is64Bit()) append(Buffer, u32(0)); // .idata$4, IAT append(Buffer, u32(0)); - if (is64Bit(Machine)) + if (is64Bit()) append(Buffer, u32(0)); // Symbol Table @@ -475,7 +477,8 @@ NewArchiveMember ObjectFactory::createNullThunk(std::vector &Buffer) { NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, uint16_t Ordinal, ImportType ImportType, - ImportNameType NameType) { + ImportNameType NameType, + MachineTypes Machine) { size_t ImpSize = ImportName.size() + Sym.size() + 2; // +2 for NULs size_t Size = sizeof(coff_import_header) + ImpSize; char *Buf = Alloc.Allocate(Size); @@ -501,7 +504,8 @@ NewArchiveMember ObjectFactory::createShortImport(StringRef Sym, } NewArchiveMember ObjectFactory::createWeakExternal(StringRef Sym, - StringRef Weak, bool Imp) { + StringRef Weak, bool Imp, + MachineTypes Machine) { std::vector Buffer; const uint32_t NumberOfSections = 1; const uint32_t NumberOfSymbols = 5; @@ -585,8 +589,11 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path, ArrayRef Exports, MachineTypes Machine, bool MinGW) { + MachineTypes NativeMachine = + isArm64EC(Machine) ? IMAGE_FILE_MACHINE_ARM64 : Machine; + std::vector Members; - ObjectFactory OF(llvm::sys::path::filename(ImportName), Machine); + ObjectFactory OF(llvm::sys::path::filename(ImportName), NativeMachine); std::vector ImportDescriptor; Members.push_back(OF.createImportDescriptor(ImportDescriptor)); @@ -620,13 +627,15 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path, return Name.takeError(); if (!E.AliasTarget.empty() && *Name != E.AliasTarget) { - Members.push_back(OF.createWeakExternal(E.AliasTarget, *Name, false)); - Members.push_back(OF.createWeakExternal(E.AliasTarget, *Name, true)); + Members.push_back( + OF.createWeakExternal(E.AliasTarget, *Name, false, Machine)); + Members.push_back( + OF.createWeakExternal(E.AliasTarget, *Name, true, Machine)); continue; } Members.push_back( - OF.createShortImport(*Name, E.Ordinal, ImportType, NameType)); + OF.createShortImport(*Name, E.Ordinal, ImportType, NameType, Machine)); } return writeArchive(Path, Members, SymtabWritingMode::NormalSymtab, diff --git a/llvm/test/tools/llvm-lib/arm64ec-implib.test b/llvm/test/tools/llvm-lib/arm64ec-implib.test index 3c74b4bf66076..2672f8d38b7f7 100644 --- a/llvm/test/tools/llvm-lib/arm64ec-implib.test +++ b/llvm/test/tools/llvm-lib/arm64ec-implib.test @@ -5,28 +5,30 @@ RUN: llvm-lib -machine:arm64ec -def:test.def -out:test.lib RUN: llvm-nm --print-armap test.lib | FileCheck -check-prefix=ARMAP %s -ARMAP: Archive EC map +ARMAP: Archive map ARMAP-NEXT: __IMPORT_DESCRIPTOR_test in test.dll ARMAP-NEXT: __NULL_IMPORT_DESCRIPTOR in test.dll +ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll +ARMAP-EMPTY: +ARMAP-NEXT: Archive EC map ARMAP-NEXT: __imp_dataexp in test.dll ARMAP-NEXT: __imp_funcexp in test.dll ARMAP-NEXT: funcexp in test.dll -ARMAP-NEXT: test_NULL_THUNK_DATA in test.dll RUN: llvm-readobj test.lib | FileCheck -check-prefix=READOBJ %s READOBJ: File: test.lib(test.dll) -READOBJ-NEXT: Format: COFF-ARM64EC +READOBJ-NEXT: Format: COFF-ARM64{{$}} READOBJ-NEXT: Arch: aarch64 READOBJ-NEXT: AddressSize: 64bit READOBJ-EMPTY: READOBJ-NEXT: File: test.lib(test.dll) -READOBJ-NEXT: Format: COFF-ARM64EC +READOBJ-NEXT: Format: 
COFF-ARM64{{$}} READOBJ-NEXT: Arch: aarch64 READOBJ-NEXT: AddressSize: 64bit READOBJ-EMPTY: READOBJ-NEXT: File: test.lib(test.dll) -READOBJ-NEXT: Format: COFF-ARM64EC +READOBJ-NEXT: Format: COFF-ARM64{{$}} READOBJ-NEXT: Arch: aarch64 READOBJ-NEXT: AddressSize: 64bit READOBJ-EMPTY: From 3c5845703c85f221ca12880ce9107a0c44cf7b27 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Thu, 18 Jan 2024 15:53:13 -0800 Subject: [PATCH 054/843] [lld][WebAssembly] Move input vectors from symtab to ctx. NFC (#78640) Also convert from std::vector to SmallVector. This matches the ELF linker where these were moved into the ctx object in 9a572164d592e and converted to SmallVector in ba948c5a9c524b. --- lld/wasm/Config.h | 15 +++++++++++++++ lld/wasm/Driver.cpp | 10 +++++----- lld/wasm/MapFile.cpp | 2 +- lld/wasm/MarkLive.cpp | 12 ++++++------ lld/wasm/SymbolTable.cpp | 24 ++++++++++++------------ lld/wasm/SymbolTable.h | 8 -------- lld/wasm/SyntheticSections.cpp | 8 ++++---- lld/wasm/Writer.cpp | 32 ++++++++++++++++---------------- 8 files changed, 59 insertions(+), 52 deletions(-) diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index cc8a1dcb69afe..dc7ca265e9a2c 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -23,6 +23,13 @@ enum class CodeGenOptLevel; namespace lld::wasm { class InputFile; +class StubFile; +class ObjFile; +class SharedFile; +class BitcodeFile; +class InputTable; +class InputGlobal; +class InputFunction; class Symbol; // For --unresolved-symbols. @@ -108,6 +115,14 @@ extern Configuration *config; // The Ctx object hold all other (non-configuration) global state. struct Ctx { + llvm::SmallVector objectFiles; + llvm::SmallVector stubFiles; + llvm::SmallVector sharedFiles; + llvm::SmallVector bitcodeFiles; + llvm::SmallVector syntheticFunctions; + llvm::SmallVector syntheticGlobals; + llvm::SmallVector syntheticTables; + // True if we are creating position-independent code. bool isPic; diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 05a96860966ea..4a4f9a9622794 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -899,7 +899,7 @@ static void createOptionalSymbols() { static void processStubLibrariesPreLTO() { log("-- processStubLibrariesPreLTO"); - for (auto &stub_file : symtab->stubFiles) { + for (auto &stub_file : ctx.stubFiles) { LLVM_DEBUG(llvm::dbgs() << "processing stub file: " << stub_file->getName() << "\n"); for (auto [name, deps]: stub_file->symbolDependencies) { @@ -924,7 +924,7 @@ static void processStubLibraries() { bool depsAdded = false; do { depsAdded = false; - for (auto &stub_file : symtab->stubFiles) { + for (auto &stub_file : ctx.stubFiles) { LLVM_DEBUG(llvm::dbgs() << "processing stub file: " << stub_file->getName() << "\n"); for (auto [name, deps]: stub_file->symbolDependencies) { @@ -1075,7 +1075,7 @@ static void wrapSymbols(ArrayRef wrapped) { } // Update pointers in input files. - parallelForEach(symtab->objectFiles, [&](InputFile *file) { + parallelForEach(ctx.objectFiles, [&](InputFile *file) { MutableArrayRef syms = file->getMutableSymbols(); for (size_t i = 0, e = syms.size(); i != e; ++i) if (Symbol *s = map.lookup(syms[i])) @@ -1091,7 +1091,7 @@ static void splitSections() { // splitIntoPieces needs to be called on each MergeInputChunk // before calling finalizeContents(). 
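// The pieces produced by this split are the unit of deduplication for
// mergeable (string-like) sections, so the split has to run over every
// object file before output layout begins.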
LLVM_DEBUG(llvm::dbgs() << "splitSections\n"); - parallelForEach(symtab->objectFiles, [](ObjFile *file) { + parallelForEach(ctx.objectFiles, [](ObjFile *file) { for (InputChunk *seg : file->segments) { if (auto *s = dyn_cast(seg)) s->splitIntoPieces(); @@ -1263,7 +1263,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { // We only need to add libcall symbols to the link before LTO if the symbol's // definition is in bitcode. Any other required libcall symbols will be added // to the link after LTO when we add the LTO object file to the link. - if (!symtab->bitcodeFiles.empty()) + if (!ctx.bitcodeFiles.empty()) for (auto *s : lto::LTO::getRuntimeLibcallSymbols()) handleLibcall(s); if (errorCount()) diff --git a/lld/wasm/MapFile.cpp b/lld/wasm/MapFile.cpp index 288c189854dbd..c96b64cb64838 100644 --- a/lld/wasm/MapFile.cpp +++ b/lld/wasm/MapFile.cpp @@ -51,7 +51,7 @@ static void writeHeader(raw_ostream &os, int64_t vma, uint64_t lma, // Returns a list of all symbols that we want to print out. static std::vector getSymbols() { std::vector v; - for (InputFile *file : symtab->objectFiles) + for (InputFile *file : ctx.objectFiles) for (Symbol *b : file->getSymbols()) if (auto *dr = dyn_cast(b)) if ((!isa(dr)) && dr->isLive() && diff --git a/lld/wasm/MarkLive.cpp b/lld/wasm/MarkLive.cpp index 773af2f7e2360..b8ab7741ff1cb 100644 --- a/lld/wasm/MarkLive.cpp +++ b/lld/wasm/MarkLive.cpp @@ -97,7 +97,7 @@ void MarkLive::run() { enqueue(WasmSym::callDtors); // Enqueue constructors in objects explicitly live from the command-line. - for (const ObjFile *obj : symtab->objectFiles) + for (const ObjFile *obj : ctx.objectFiles) if (obj->isLive()) enqueueInitFunctions(obj); @@ -151,7 +151,7 @@ void markLive() { // Report garbage-collected sections. if (config->printGcSections) { - for (const ObjFile *obj : symtab->objectFiles) { + for (const ObjFile *obj : ctx.objectFiles) { for (InputChunk *c : obj->functions) if (!c->live) message("removing unused section " + toString(c)); @@ -168,13 +168,13 @@ void markLive() { if (!t->live) message("removing unused section " + toString(t)); } - for (InputChunk *c : symtab->syntheticFunctions) + for (InputChunk *c : ctx.syntheticFunctions) if (!c->live) message("removing unused section " + toString(c)); - for (InputGlobal *g : symtab->syntheticGlobals) + for (InputGlobal *g : ctx.syntheticGlobals) if (!g->live) message("removing unused section " + toString(g)); - for (InputTable *t : symtab->syntheticTables) + for (InputTable *t : ctx.syntheticTables) if (!t->live) message("removing unused section " + toString(t)); } @@ -192,7 +192,7 @@ bool MarkLive::isCallCtorsLive() { // If there are any init functions, mark `__wasm_call_ctors` live so that // it can call them. 
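// Init functions are recorded per input object in its wasm linking
// metadata, so scanning each object file's WasmLinkingData below finds
// every candidate.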
- for (const ObjFile *file : symtab->objectFiles) { + for (const ObjFile *file : ctx.objectFiles) { const WasmLinkingData &l = file->getWasmObj()->linkingData(); for (const WasmInitFunc &f : l.InitFunctions) { auto *sym = file->getFunctionSymbol(f.Symbol); diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp index 9dd599e18975e..9988490e14b0b 100644 --- a/lld/wasm/SymbolTable.cpp +++ b/lld/wasm/SymbolTable.cpp @@ -34,14 +34,14 @@ void SymbolTable::addFile(InputFile *file, StringRef symName) { // .so file if (auto *f = dyn_cast(file)) { - sharedFiles.push_back(f); + ctx.sharedFiles.push_back(f); return; } // stub file if (auto *f = dyn_cast(file)) { f->parse(); - stubFiles.push_back(f); + ctx.stubFiles.push_back(f); return; } @@ -52,7 +52,7 @@ void SymbolTable::addFile(InputFile *file, StringRef symName) { if (auto *f = dyn_cast(file)) { // This order, first adding to `bitcodeFiles` and then parsing is necessary. // See https://github.com/llvm/llvm-project/pull/73095 - bitcodeFiles.push_back(f); + ctx.bitcodeFiles.push_back(f); f->parse(symName); return; } @@ -60,7 +60,7 @@ void SymbolTable::addFile(InputFile *file, StringRef symName) { // Regular object file auto *f = cast(file); f->parse(false); - objectFiles.push_back(f); + ctx.objectFiles.push_back(f); } // This function is where all the optimizations of link-time @@ -74,18 +74,18 @@ void SymbolTable::compileBitcodeFiles() { // Prevent further LTO objects being included BitcodeFile::doneLTO = true; - if (bitcodeFiles.empty()) + if (ctx.bitcodeFiles.empty()) return; // Compile bitcode files and replace bitcode symbols. lto.reset(new BitcodeCompiler); - for (BitcodeFile *f : bitcodeFiles) + for (BitcodeFile *f : ctx.bitcodeFiles) lto->add(*f); for (StringRef filename : lto->compile()) { auto *obj = make(MemoryBufferRef(filename, "lto.tmp"), ""); obj->parse(true); - objectFiles.push_back(obj); + ctx.objectFiles.push_back(obj); } } @@ -218,7 +218,7 @@ DefinedFunction *SymbolTable::addSyntheticFunction(StringRef name, InputFunction *function) { LLVM_DEBUG(dbgs() << "addSyntheticFunction: " << name << "\n"); assert(!find(name)); - syntheticFunctions.emplace_back(function); + ctx.syntheticFunctions.emplace_back(function); return replaceSymbol(insertName(name).first, name, flags, nullptr, function); } @@ -255,7 +255,7 @@ DefinedGlobal *SymbolTable::addSyntheticGlobal(StringRef name, uint32_t flags, LLVM_DEBUG(dbgs() << "addSyntheticGlobal: " << name << " -> " << global << "\n"); assert(!find(name)); - syntheticGlobals.emplace_back(global); + ctx.syntheticGlobals.emplace_back(global); return replaceSymbol(insertName(name).first, name, flags, nullptr, global); } @@ -267,7 +267,7 @@ DefinedGlobal *SymbolTable::addOptionalGlobalSymbol(StringRef name, return nullptr; LLVM_DEBUG(dbgs() << "addOptionalGlobalSymbol: " << name << " -> " << global << "\n"); - syntheticGlobals.emplace_back(global); + ctx.syntheticGlobals.emplace_back(global); return replaceSymbol(s, name, WASM_SYMBOL_VISIBILITY_HIDDEN, nullptr, global); } @@ -280,7 +280,7 @@ DefinedTable *SymbolTable::addSyntheticTable(StringRef name, uint32_t flags, assert(!s || s->isUndefined()); if (!s) s = insertName(name).first; - syntheticTables.emplace_back(table); + ctx.syntheticTables.emplace_back(table); return replaceSymbol(s, name, flags, nullptr, table); } @@ -855,7 +855,7 @@ InputFunction *SymbolTable::replaceWithUnreachable(Symbol *sym, StringRef debugName) { auto *func = make(sig, sym->getName(), debugName); func->setBody(unreachableFn); - syntheticFunctions.emplace_back(func); 
+ ctx.syntheticFunctions.emplace_back(func); // Mark new symbols as local. For relocatable output we don't want them // to be exported outside the object file. replaceSymbol(sym, debugName, WASM_SYMBOL_BINDING_LOCAL, diff --git a/lld/wasm/SymbolTable.h b/lld/wasm/SymbolTable.h index 59eda1c0b6740..c5518ee23da26 100644 --- a/lld/wasm/SymbolTable.h +++ b/lld/wasm/SymbolTable.h @@ -101,14 +101,6 @@ class SymbolTable { void handleWeakUndefines(); DefinedFunction *createUndefinedStub(const WasmSignature &sig); - std::vector objectFiles; - std::vector stubFiles; - std::vector sharedFiles; - std::vector bitcodeFiles; - std::vector syntheticFunctions; - std::vector syntheticGlobals; - std::vector syntheticTables; - private: std::pair insert(StringRef name, const InputFile *file); std::pair insertName(StringRef name); diff --git a/lld/wasm/SyntheticSections.cpp b/lld/wasm/SyntheticSections.cpp index f95174fe6f269..72e255951608e 100644 --- a/lld/wasm/SyntheticSections.cpp +++ b/lld/wasm/SyntheticSections.cpp @@ -57,7 +57,7 @@ class SubSection { bool DylinkSection::isNeeded() const { return ctx.isPic || config->unresolvedSymbols == UnresolvedPolicy::ImportDynamic || - !symtab->sharedFiles.empty(); + !ctx.sharedFiles.empty(); } void DylinkSection::writeBody() { @@ -72,10 +72,10 @@ void DylinkSection::writeBody() { sub.writeTo(os); } - if (symtab->sharedFiles.size()) { + if (ctx.sharedFiles.size()) { SubSection sub(WASM_DYLINK_NEEDED); - writeUleb128(sub.os, symtab->sharedFiles.size(), "Needed"); - for (auto *so : symtab->sharedFiles) + writeUleb128(sub.os, ctx.sharedFiles.size(), "Needed"); + for (auto *so : ctx.sharedFiles) writeStr(sub.os, llvm::sys::path::filename(so->getName()), "so name"); sub.writeTo(os); } diff --git a/lld/wasm/Writer.cpp b/lld/wasm/Writer.cpp index 7c19a1baf4c04..d1a06c9ac9c2a 100644 --- a/lld/wasm/Writer.cpp +++ b/lld/wasm/Writer.cpp @@ -133,7 +133,7 @@ class Writer { void Writer::calculateCustomSections() { log("calculateCustomSections"); bool stripDebug = config->stripDebug || config->stripAll; - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { for (InputChunk *section : file->customSections) { // Exclude COMDAT sections that are not selected for inclusion if (section->discarded) @@ -207,7 +207,7 @@ void Writer::createRelocSections() { } void Writer::populateProducers() { - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { const WasmProducerInfo &info = file->getWasmObj()->getProducerInfo(); out.producersSec->addInfo(info); } @@ -591,7 +591,7 @@ void Writer::populateTargetFeatures() { } // Find the sets of used, required, and disallowed features - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { StringRef fileName(file->getName()); for (auto &feature : file->getWasmObj()->getTargetFeatures()) { switch (feature.Prefix) { @@ -654,7 +654,7 @@ void Writer::populateTargetFeatures() { } // Validate the required and disallowed constraints for each file - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { StringRef fileName(file->getName()); SmallSet objectFeatures; for (const auto &feature : file->getWasmObj()->getTargetFeatures()) { @@ -832,7 +832,7 @@ void Writer::populateSymtab() { if (sym->isUsedInRegularObj && sym->isLive()) out.linkingSec->addToSymtab(sym); - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { LLVM_DEBUG(dbgs() << "Local symtab entries: " << file->getName() << "\n"); for (Symbol 
*sym : file->getSymbols()) if (sym->isLocal() && !isa(sym) && sym->isLive()) @@ -848,7 +848,7 @@ void Writer::calculateTypes() { // 4. The signatures of all imported tags // 5. The signatures of all defined tags - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { ArrayRef types = file->getWasmObj()->types(); for (uint32_t i = 0; i < types.size(); i++) if (file->typeIsUsed[i]) @@ -939,7 +939,7 @@ static void finalizeIndirectFunctionTable() { } static void scanRelocations() { - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { LLVM_DEBUG(dbgs() << "scanRelocations: " << file->getName() << "\n"); for (InputChunk *chunk : file->functions) scanRelocations(chunk); @@ -955,37 +955,37 @@ void Writer::assignIndexes() { // global are effected by the number of imports. out.importSec->seal(); - for (InputFunction *func : symtab->syntheticFunctions) + for (InputFunction *func : ctx.syntheticFunctions) out.functionSec->addFunction(func); - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { LLVM_DEBUG(dbgs() << "Functions: " << file->getName() << "\n"); for (InputFunction *func : file->functions) out.functionSec->addFunction(func); } - for (InputGlobal *global : symtab->syntheticGlobals) + for (InputGlobal *global : ctx.syntheticGlobals) out.globalSec->addGlobal(global); - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { LLVM_DEBUG(dbgs() << "Globals: " << file->getName() << "\n"); for (InputGlobal *global : file->globals) out.globalSec->addGlobal(global); } - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { LLVM_DEBUG(dbgs() << "Tags: " << file->getName() << "\n"); for (InputTag *tag : file->tags) out.tagSec->addTag(tag); } - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { LLVM_DEBUG(dbgs() << "Tables: " << file->getName() << "\n"); for (InputTable *table : file->tables) out.tableSec->addTable(table); } - for (InputTable *table : symtab->syntheticTables) + for (InputTable *table : ctx.syntheticTables) out.tableSec->addTable(table); out.globalSec->assignIndexes(); @@ -1022,7 +1022,7 @@ OutputSegment *Writer::createOutputSegment(StringRef name) { } void Writer::createOutputSegments() { - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { for (InputChunk *segment : file->segments) { if (!segment->live) continue; @@ -1639,7 +1639,7 @@ void Writer::calculateInitFunctions() { if (!config->relocatable && !WasmSym::callCtors->isLive()) return; - for (ObjFile *file : symtab->objectFiles) { + for (ObjFile *file : ctx.objectFiles) { const WasmLinkingData &l = file->getWasmObj()->linkingData(); for (const WasmInitFunc &f : l.InitFunctions) { FunctionSymbol *sym = file->getFunctionSymbol(f.Symbol); From 8649328060b4e748502d1d859f9c9c1bd3c2bccc Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Fri, 19 Jan 2024 06:57:06 +0700 Subject: [PATCH 055/843] [RISCV] Add support for new unprivileged extensions defined in profiles spec (#77458) This adds minimal support for 7 new unprivileged extensions that were defined as a part of the RISC-V Profiles specification here: https://github.com/riscv/riscv-profiles/blob/main/profiles.adoc#7-new-isa-extensions * Ziccif: Main memory supports instruction fetch with atomicity requirement * Ziccrse: Main memory supports forward progress on LR/SC sequences * Ziccamoa: Main memory supports all atomics in A * Zicclsm: Main memory supports 
misaligned loads/stores
* Za64rs: Reservation set size of 64 bytes
* Za128rs: Reservation set size of 128 bytes
* Zic64b: Cache block size is 64 bytes

As stated in the specification, these extensions don't add any new features but describe existing features. So this patch only adds parsing and subtarget features.
---
 .../test/Preprocessor/riscv-target-features.c | 63 +++++++++++++++++++
 llvm/docs/RISCVUsage.rst | 12 ++++
 llvm/docs/ReleaseNotes.rst | 3 +
 llvm/lib/Support/RISCVISAInfo.cpp | 7 +++
 llvm/lib/Target/RISCV/RISCVFeatures.td | 26 ++++++++
 llvm/test/CodeGen/RISCV/attributes.ll | 14 +++++
 llvm/test/MC/RISCV/attribute-arch.s | 23 ++++++-
 llvm/unittests/Support/RISCVISAInfoTest.cpp | 7 +++
 8 files changed, 154 insertions(+), 1 deletion(-)

diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c
index fd0970bd87fd1..8dc02f7efefbd 100644
--- a/clang/test/Preprocessor/riscv-target-features.c
+++ b/clang/test/Preprocessor/riscv-target-features.c
@@ -52,6 +52,8 @@
// CHECK-NOT: __riscv_xtheadsync {{.*$}}
// CHECK-NOT: __riscv_xtheadvdot {{.*$}}
// CHECK-NOT: __riscv_xventanacondops {{.*$}}
+// CHECK-NOT: __riscv_za128rs {{.*$}}
+// CHECK-NOT: __riscv_za64rs {{.*$}}
// CHECK-NOT: __riscv_zawrs {{.*$}}
// CHECK-NOT: __riscv_zba {{.*$}}
@@ -73,9 +75,14 @@
// CHECK-NOT: __riscv_zfinx {{.*$}}
// CHECK-NOT: __riscv_zhinx {{.*$}}
// CHECK-NOT: __riscv_zhinxmin {{.*$}}
+// CHECK-NOT: __riscv_zic64b {{.*$}}
// CHECK-NOT: __riscv_zicbom {{.*$}}
// CHECK-NOT: __riscv_zicbop {{.*$}}
// CHECK-NOT: __riscv_zicboz {{.*$}}
+// CHECK-NOT: __riscv_ziccamoa {{.*$}}
+// CHECK-NOT: __riscv_ziccif {{.*$}}
+// CHECK-NOT: __riscv_zicclsm {{.*$}}
+// CHECK-NOT: __riscv_ziccrse {{.*$}}
// CHECK-NOT: __riscv_zicntr {{.*$}}
// CHECK-NOT: __riscv_zicsr {{.*$}}
// CHECK-NOT: __riscv_zifencei {{.*$}}
@@ -473,6 +480,22 @@
// RUN: -o - | FileCheck --check-prefix=CHECK-XVENTANACONDOPS-EXT %s
// CHECK-XVENTANACONDOPS-EXT: __riscv_xventanacondops 1000000{{$}}
+// RUN: %clang --target=riscv32-unknown-linux-gnu \
+// RUN: -march=rv32iza128rs -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZA128RS-EXT %s
+// RUN: %clang --target=riscv64-unknown-linux-gnu \
+// RUN: -march=rv64iza128rs -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZA128RS-EXT %s
+// CHECK-ZA128RS-EXT: __riscv_za128rs 1000000{{$}}
+
+// RUN: %clang --target=riscv32-unknown-linux-gnu \
+// RUN: -march=rv32iza64rs -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZA64RS-EXT %s
+// RUN: %clang --target=riscv64-unknown-linux-gnu \
+// RUN: -march=rv64iza64rs -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZA64RS-EXT %s
+// CHECK-ZA64RS-EXT: __riscv_za64rs 1000000{{$}}
+
// RUN: %clang --target=riscv32-unknown-linux-gnu \
// RUN: -march=rv32izawrs -x c -E -dM %s \
// RUN: -o - | FileCheck --check-prefix=CHECK-ZAWRS-EXT %s
@@ -667,6 +690,14 @@
// RUN: -o - | FileCheck --check-prefix=CHECK-ZHINXMIN-EXT %s
// CHECK-ZHINXMIN-EXT: __riscv_zhinxmin 1000000{{$}}
+// RUN: %clang --target=riscv32-unknown-linux-gnu \
+// RUN: -march=rv32izic64b -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZIC64B-EXT %s
+// RUN: %clang --target=riscv64-unknown-linux-gnu \
+// RUN: -march=rv64izic64b -x c -E -dM %s \
+// RUN: -o - | FileCheck --check-prefix=CHECK-ZIC64B-EXT %s
+// CHECK-ZIC64B-EXT: __riscv_zic64b 1000000{{$}}
+
// RUN: %clang --target=riscv32-unknown-linux-gnu \
// RUN: -march=rv32izicbom -x c -E -dM %s \
// 
RUN: -o - | FileCheck --check-prefix=CHECK-ZICBOM-EXT %s @@ -691,6 +722,38 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICBOZ-EXT %s // CHECK-ZICBOZ-EXT: __riscv_zicboz 1000000{{$}} +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32iziccamoa -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICCAMOA-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64iziccamoa -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICCAMOA-EXT %s +// CHECK-ZICCAMOA-EXT: __riscv_ziccamoa 1000000{{$}} + +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32iziccif -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICCIF-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64iziccif -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICCIF-EXT %s +// CHECK-ZICCIF-EXT: __riscv_ziccif 1000000{{$}} + +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32izicclsm -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICCLSM-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64izicclsm -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICCLSM-EXT %s +// CHECK-ZICCLSM-EXT: __riscv_zicclsm 1000000{{$}} + +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32iziccrse -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICCRSE-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64iziccrse -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICCRSE-EXT %s +// CHECK-ZICCRSE-EXT: __riscv_ziccrse 1000000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32izicntr -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZICNTR-EXT %s diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index 40fbaaa24e839..fc4d97b134371 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -97,6 +97,8 @@ on support follow. ``Svnapot`` Assembly Support ``Svpbmt`` Supported ``V`` Supported + ``Za128rs`` Supported (`See note <#riscv-profiles-extensions-note>`__) + ``Za64rs`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Zawrs`` Assembly Support ``Zba`` Supported ``Zbb`` Supported @@ -118,9 +120,14 @@ on support follow. ``Zfinx`` Supported ``Zhinx`` Supported ``Zhinxmin`` Supported + ``Zic64b`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Zicbom`` Assembly Support ``Zicbop`` Supported ``Zicboz`` Assembly Support + ``Ziccamoa`` Supported (`See note <#riscv-profiles-extensions-note>`__) + ``Ziccif`` Supported (`See note <#riscv-profiles-extensions-note>`__) + ``Zicclsm`` Supported (`See note <#riscv-profiles-extensions-note>`__) + ``Ziccrse`` Supported (`See note <#riscv-profiles-extensions-note>`__) ``Zicntr`` (`See Note <#riscv-i2p1-note>`__) ``Zicsr`` (`See Note <#riscv-i2p1-note>`__) ``Zifencei`` (`See Note <#riscv-i2p1-note>`__) @@ -205,6 +212,11 @@ Supported ``zicntr``, ``zicsr``, ``zifencei``, ``zihpm`` Between versions 2.0 and 2.1 of the base I specification, a backwards incompatible change was made to remove selected instructions and CSRs from the base ISA. These instructions were grouped into a set of new extensions, but were no longer required by the base ISA. This change is partially described in "Preface to Document Version 20190608-Base-Ratified" from the specification document (the ``zicntr`` and ``zihpm`` bits are not mentioned). 
LLVM currently implements version 2.1 of the base specification. To maintain compatibility, instructions from these extensions are accepted without being in the ``-march`` string. LLVM also allows the explicit specification of the extensions in an ``-march`` string. +.. _riscv-profiles-extensions-note: + +``Za128rs``, ``Za64rs``, ``Zic64b``, ``Ziccamoa``, ``Ziccif``, ``Zicclsm``, ``Ziccrse`` + These extensions are defined as part of the `RISC-V Profiles specification `_. They do not introduce any new features themselves, but instead describe existing hardware features. + Experimental Extensions ======================= diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index e02f68f07d93b..4345d01021f17 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -157,6 +157,9 @@ Changes to the RISC-V Backend * ``-mcpu=sifive-p450`` was added. * CodeGen of RV32E/RV64E was supported experimentally. * CodeGen of ilp32e/lp64e was supported experimentally. +* Support was added for the Ziccif, Ziccrse, Ziccamoa, Zicclsm, Za64rs, Za128rs + and Zic64b extensions which were introduced as a part of the RISC-V Profiles + specification. Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index d991878a5f1ec..8c9eb1bddb3cb 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -88,6 +88,8 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"xtheadvdot", {1, 0}}, {"xventanacondops", {1, 0}}, + {"za128rs", {1, 0}}, + {"za64rs", {1, 0}}, {"zawrs", {1, 0}}, {"zba", {1, 0}}, @@ -116,9 +118,14 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"zhinx", {1, 0}}, {"zhinxmin", {1, 0}}, + {"zic64b", {1, 0}}, {"zicbom", {1, 0}}, {"zicbop", {1, 0}}, {"zicboz", {1, 0}}, + {"ziccamoa", {1, 0}}, + {"ziccif", {1, 0}}, + {"zicclsm", {1, 0}}, + {"ziccrse", {1, 0}}, {"zicntr", {2, 0}}, {"zicsr", {2, 0}}, {"zifencei", {2, 0}}, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index fa334c69ddc98..72780937dd887 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -93,6 +93,22 @@ def HasStdExtZifencei : Predicate<"Subtarget->hasStdExtZifencei()">, AssemblerPredicate<(all_of FeatureStdExtZifencei), "'Zifencei' (fence.i)">; +def FeatureStdExtZiccamoa + : SubtargetFeature<"ziccamoa", "HasStdExtZiccamoa", "true", + "'Ziccamoa' (Main Memory Supports All Atomics in A)">; + +def FeatureStdExtZiccif + : SubtargetFeature<"ziccif", "HasStdExtZiccif", "true", + "'Ziccif' (Main Memory Supports Instruction Fetch with Atomicity Requirement)">; + +def FeatureStdExtZicclsm + : SubtargetFeature<"zicclsm", "HasStdExtZicclsm", "true", + "'Zicclsm' (Main Memory Supports Misaligned Loads/Stores)">; + +def FeatureStdExtZiccrse + : SubtargetFeature<"ziccrse", "HasStdExtZiccrse", "true", + "'Ziccrse' (Main Memory Supports Forward Progress on LR/SC Sequences)">; + def FeatureStdExtZicntr : SubtargetFeature<"zicntr", "HasStdExtZicntr", "true", "'Zicntr' (Base Counters and Timers)", @@ -517,6 +533,10 @@ def HasStdExtZfhOrZvfh "'Zfh' (Half-Precision Floating-Point) or " "'Zvfh' (Vector Half-Precision Floating-Point)">; +def FeatureStdExtZic64b + : SubtargetFeature<"zic64b", "HasStdExtZic64b", "true", + "'Zic64b' (Cache Block Size Is 64 Bytes)">; + def FeatureStdExtZicbom : SubtargetFeature<"zicbom", "HasStdExtZicbom", "true", "'Zicbom' (Cache-Block Management 
Instructions)">; @@ -561,6 +581,12 @@ def HasStdExtZtso : Predicate<"Subtarget->hasStdExtZtso()">, "'Ztso' (Memory Model - Total Store Order)">; def NotHasStdExtZtso : Predicate<"!Subtarget->hasStdExtZtso()">; +def FeatureStdExtZa64rs : SubtargetFeature<"za64rs", "HasStdExtZa64rs", "true", + "'Za64rs' (Reservation Set Size of at Most 64 Bytes)">; + +def FeatureStdExtZa128rs : SubtargetFeature<"za128rs", "HasStdExtZa128rs", "true", + "'Za128rs' (Reservation Set Size of at Most 128 Bytes)">; + def FeatureStdExtZawrs : SubtargetFeature<"zawrs", "HasStdExtZawrs", "true", "'Zawrs' (Wait on Reservation Set)">; def HasStdExtZawrs : Predicate<"Subtarget->hasStdExtZawrs()">, diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 60ef404ac345d..3e55e0fb4e686 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -130,6 +130,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zkn,+zkr,+zkt %s -o - | FileCheck --check-prefixes=CHECK,RV64COMBINEINTOZK %s ; RUN: llc -mtriple=riscv64 -mattr=+zbkb,+zbkc,+zbkx,+zkne,+zknd,+zknh %s -o - | FileCheck --check-prefixes=CHECK,RV64COMBINEINTOZKN %s ; RUN: llc -mtriple=riscv64 -mattr=+zbkb,+zbkc,+zbkx,+zksed,+zksh %s -o - | FileCheck --check-prefixes=CHECK,RV64COMBINEINTOZKS %s +; RUN: llc -mtriple=riscv64 -mattr=+zic64b %s -o - | FileCheck --check-prefixes=CHECK,RV64ZIC64B %s ; RUN: llc -mtriple=riscv64 -mattr=+zicbom %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICBOM %s ; RUN: llc -mtriple=riscv64 -mattr=+zicboz %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICBOZ %s ; RUN: llc -mtriple=riscv64 -mattr=+zicbop %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICBOP %s @@ -149,6 +150,8 @@ ; RUN: llc -mtriple=riscv64 -mattr=+xtheadmempair %s -o - | FileCheck --check-prefix=RV64XTHEADMEMPAIR %s ; RUN: llc -mtriple=riscv64 -mattr=+xtheadsync %s -o - | FileCheck --check-prefix=RV64XTHEADSYNC %s ; RUN: llc -mtriple=riscv64 -mattr=+xtheadvdot %s -o - | FileCheck --check-prefixes=CHECK,RV64XTHEADVDOT %s +; RUN: llc -mtriple=riscv64 -mattr=+za64rs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZA64RS %s +; RUN: llc -mtriple=riscv64 -mattr=+za128rs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZA128RS %s ; RUN: llc -mtriple=riscv64 -mattr=+zawrs %s -o - | FileCheck --check-prefixes=CHECK,RV64ZAWRS %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-ztso %s -o - | FileCheck --check-prefixes=CHECK,RV64ZTSO %s ; RUN: llc -mtriple=riscv64 -mattr=+zca %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCA %s @@ -156,6 +159,10 @@ ; RUN: llc -mtriple=riscv64 -mattr=+zcd %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCD %s ; RUN: llc -mtriple=riscv64 -mattr=+zcmp %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCMP %s ; RUN: llc -mtriple=riscv64 -mattr=+zcmt %s -o - | FileCheck --check-prefixes=CHECK,RV64ZCMT %s +; RUN: llc -mtriple=riscv64 -mattr=+ziccamoa %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICCAMOA %s +; RUN: llc -mtriple=riscv64 -mattr=+ziccif %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICCIF %s +; RUN: llc -mtriple=riscv64 -mattr=+zicclsm %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICCLSM %s +; RUN: llc -mtriple=riscv64 -mattr=+ziccrse %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICCRSE %s ; RUN: llc -mtriple=riscv64 -mattr=+zicsr %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICSR %s ; RUN: llc -mtriple=riscv64 -mattr=+zifencei %s -o - | FileCheck --check-prefixes=CHECK,RV64ZIFENCEI %s ; RUN: llc -mtriple=riscv64 -mattr=+zicntr %s -o - | FileCheck --check-prefixes=CHECK,RV64ZICNTR 
%s @@ -319,8 +326,11 @@ ; RV64COMBINEINTOZK: .attribute 5, "rv64i2p1_zbkb1p0_zbkc1p0_zbkx1p0_zk1p0_zkn1p0_zknd1p0_zkne1p0_zknh1p0_zkr1p0_zkt1p0" ; RV64COMBINEINTOZKN: .attribute 5, "rv64i2p1_zbkb1p0_zbkc1p0_zbkx1p0_zkn1p0_zknd1p0_zkne1p0_zknh1p0" ; RV64COMBINEINTOZKS: .attribute 5, "rv64i2p1_zbkb1p0_zbkc1p0_zbkx1p0_zks1p0_zksed1p0_zksh1p0" +; RV64ZIC64B: .attribute 5, "rv64i2p1_zic64b1p0" ; RV64ZICBOM: .attribute 5, "rv64i2p1_zicbom1p0" ; RV64ZICBOZ: .attribute 5, "rv64i2p1_zicboz1p0" +; RV64ZA64RS: .attribute 5, "rv64i2p1_za64rs1p0" +; RV64ZA128RS: .attribute 5, "rv64i2p1_za128rs1p0" ; RV64ZAWRS: .attribute 5, "rv64i2p1_zawrs1p0" ; RV64ZICBOP: .attribute 5, "rv64i2p1_zicbop1p0" ; RV64SVNAPOT: .attribute 5, "rv64i2p1_svnapot1p0" @@ -345,6 +355,10 @@ ; RV64ZCD: .attribute 5, "rv64i2p1_f2p2_d2p2_zicsr2p0_zca1p0_zcd1p0" ; RV64ZCMP: .attribute 5, "rv64i2p1_zca1p0_zcmp1p0" ; RV64ZCMT: .attribute 5, "rv64i2p1_zicsr2p0_zca1p0_zcmt1p0" +; RV64ZICCAMOA: .attribute 5, "rv64i2p1_ziccamoa1p0" +; RV64ZICCIF: .attribute 5, "rv64i2p1_ziccif1p0" +; RV64ZICCLSM: .attribute 5, "rv64i2p1_zicclsm1p0" +; RV64ZICCRSE: .attribute 5, "rv64i2p1_ziccrse1p0" ; RV64ZICSR: .attribute 5, "rv64i2p1_zicsr2p0" ; RV64ZIFENCEI: .attribute 5, "rv64i2p1_zifencei2p0" ; RV64ZICNTR: .attribute 5, "rv64i2p1_zicntr2p0_zicsr2p0" diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index 0e508bb80f6b9..b1a03bbfd74da 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -91,6 +91,9 @@ .attribute arch, "rv32ifdzve64d" # CHECK: attribute 5, "rv32i2p1_f2p2_d2p2_zicsr2p0_zve32f1p0_zve32x1p0_zve64d1p0_zve64f1p0_zve64x1p0_zvl32b1p0_zvl64b1p0" +.attribute arch, "rv32izic64b" +# CHECK: attribute 5, "rv32i2p1_zic64b1p0" + .attribute arch, "rv32izicbom" # CHECK: attribute 5, "rv32i2p1_zicbom1p0" @@ -100,6 +103,18 @@ .attribute arch, "rv32izicbop" # CHECK: attribute 5, "rv32i2p1_zicbop1p0" +.attribute arch, "rv32iziccamoa" +# CHECK: attribute 5, "rv32i2p1_ziccamoa1p0" + +.attribute arch, "rv32iziccif" +# CHECK: attribute 5, "rv32i2p1_ziccif1p0" + +.attribute arch, "rv32izicclsm" +# CHECK: attribute 5, "rv32i2p1_zicclsm1p0" + +.attribute arch, "rv32iziccrse" +# CHECK: attribute 5, "rv32i2p1_ziccrse1p0" + ## Experimental extensions require version string to be explicitly specified .attribute arch, "rv32izba1p0" @@ -125,7 +140,7 @@ .attribute arch, "rv32i_zve64x_zvkn1p0" # CHECK: attribute 5, "rv32i2p1_zicsr2p0_zve32x1p0_zve64x1p0_zvkb1p0_zvkn1p0_zvkned1p0_zvknhb1p0_zvkt1p0_zvl32b1p0_zvl64b1p0" - + .attribute arch, "rv32i_zve64x_zvknc1p0" # CHECK: attribute 5, "rv32i2p1_zicsr2p0_zvbc1p0_zve32x1p0_zve64x1p0_zvkb1p0_zvkn1p0_zvknc1p0_zvkned1p0_zvknhb1p0_zvkt1p0_zvl32b1p0_zvl64b1p0" @@ -249,6 +264,12 @@ .attribute arch, "rv64i_xsfvcp" # CHECK: attribute 5, "rv64i2p1_zicsr2p0_zve32x1p0_zvl32b1p0_xsfvcp1p0" +.attribute arch, "rv32iza128rs1p0" +# CHECK: attribute 5, "rv32i2p1_za128rs1p0" + +.attribute arch, "rv32iza64rs1p0" +# CHECK: attribute 5, "rv32i2p1_za64rs1p0" + .attribute arch, "rv32izawrs1p0" # CHECK: attribute 5, "rv32i2p1_zawrs1p0" diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp index c8e77cdccf4e8..0b8bbc7c9027e 100644 --- a/llvm/unittests/Support/RISCVISAInfoTest.cpp +++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp @@ -675,9 +675,14 @@ R"(All available -march extensions for RISC-V c 2.0 v 1.0 h 1.0 + zic64b 1.0 zicbom 1.0 zicbop 1.0 zicboz 1.0 + ziccamoa 1.0 + ziccif 1.0 + zicclsm 1.0 + ziccrse 1.0 zicntr 2.0 
zicsr 2.0
zifencei 2.0
@@ -685,6 +690,8 @@ R"(All available -march extensions for RISC-V
zihintpause 2.0
zihpm 2.0
zmmul 1.0
+ za128rs 1.0
+ za64rs 1.0
zawrs 1.0
zfa 1.0
zfh 1.0

From 20a3484b1569e8a586a15af5060e2d796093a54f Mon Sep 17 00:00:00 2001
From: Piyou Chen
Date: Fri, 19 Jan 2024 08:22:25 +0800
Subject: [PATCH 056/843] [RISCV] Add statistic support for VSETVL insertion pass (#78543)

This patch makes the VSETVL insertion pass track the number of inserted and removed VSETVL instructions, reported via the `-stats` option.
---
 llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
index 3b15c71ef167c..a14f9a2835473 100644
--- a/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInsertVSETVLI.cpp
@@ -26,6 +26,7 @@
#include "RISCV.h"
#include "RISCVSubtarget.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include
@@ -34,6 +35,9 @@ using namespace llvm;
#define DEBUG_TYPE "riscv-insert-vsetvli"
#define RISCV_INSERT_VSETVLI_NAME "RISC-V Insert VSETVLI pass"
+STATISTIC(NumInsertedVSETVL, "Number of VSETVL inst inserted");
+STATISTIC(NumRemovedVSETVL, "Number of VSETVL inst removed");
+
static cl::opt DisableInsertVSETVLPHIOpt(
 "riscv-disable-insert-vsetvl-phi-opt", cl::init(false), cl::Hidden,
 cl::desc("Disable looking through phis when inserting vsetvlis."));
@@ -911,6 +915,7 @@ void RISCVInsertVSETVLI::insertVSETVLI(MachineBasicBlock &MBB,
 MachineBasicBlock::iterator InsertPt, DebugLoc DL,
 const VSETVLIInfo &Info, const VSETVLIInfo &PrevInfo) {
+ ++NumInsertedVSETVL;
 if (PrevInfo.isValid() && !PrevInfo.isUnknown()) {
 // Use X0, X0 form if the AVL is the same and the SEW+LMUL gives the same
 // VLMAX.
@@ -1578,6 +1583,7 @@ void RISCVInsertVSETVLI::doLocalPostpass(MachineBasicBlock &MBB) {
 Used = getDemanded(MI, MRI, ST);
 }
+ NumRemovedVSETVL += ToDelete.size();
 for (auto *MI : ToDelete)
 MI->eraseFromParent();
 }

From 2a54098de1af2e0d0d7e63d79e8eae8e8d78db92 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Thu, 18 Jan 2024 16:33:31 -0800
Subject: [PATCH 057/843] Revert "[ASan][libc++] Turn on ASan annotations for short strings" (#78627)

Reverts llvm/llvm-project#75882

To recover build bots:
https://lab.llvm.org/buildbot/#/builders/239/builds/5361
https://lab.llvm.org/buildbot/#/builders/168/builds/18126
---
 libcxx/include/string | 14 +-
 .../asan_deque_integration.pass.cpp | 182 ------------------
 .../strings/basic.string/asan_short.pass.cpp | 56 ------
 .../asan_vector_integration.pass.cpp | 182 ------------------
 libcxx/test/support/asan_testing.h | 29 ++-
 5 files changed, 34 insertions(+), 429 deletions(-)
 delete mode 100644 libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp
 delete mode 100644 libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp

diff --git a/libcxx/include/string b/libcxx/include/string
index 4116f350a8047..e97139206d4fa 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -659,6 +659,7 @@ _LIBCPP_PUSH_MACROS
#else
# define _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS
#endif
+#define _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED false

_LIBCPP_BEGIN_NAMESPACE_STD

@@ -1895,17 +1896,22 @@ private:
#endif
 }

+ // ASan: short string is poisoned if and only if this function returns true.
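+ // Since _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED is defined to false
+ // earlier in this header, this currently always returns false, keeping
+ // short strings unpoisoned.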
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __asan_short_string_is_annotated() const _NOEXCEPT { + return _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED && !__libcpp_is_constant_evaluated(); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_new(size_type __current_size) const _NOEXCEPT { (void) __current_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated()) + if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + capacity() + 1, data() + __current_size + 1); #endif } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_delete() const _NOEXCEPT { #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated()) + if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + size() + 1, data() + capacity() + 1); #endif } @@ -1913,7 +1919,7 @@ private: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_increase(size_type __n) const _NOEXCEPT { (void) __n; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated()) + if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + size() + 1, data() + size() + 1 + __n); #endif } @@ -1921,7 +1927,7 @@ private: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_shrink(size_type __old_size) const _NOEXCEPT { (void) __old_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated()) + if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + __old_size + 1, data() + size() + 1); #endif } diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp deleted file mode 100644 index b914609f35ddf..0000000000000 --- a/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp +++ /dev/null @@ -1,182 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// REQUIRES: asan -// UNSUPPORTED: c++03 - -#include -#include -#include -#include -#include "test_macros.h" -#include "asan_testing.h" -#include "min_allocator.h" - -// This tests exists to check if strings work well with deque, as those -// may be partialy annotated, we cannot simply call -// is_double_ended_contiguous_container_asan_correct, as it assumes that -// object memory inside is not annotated, so we check everything in a more careful way. - -template -void verify_inside(D const& d) { - for (size_t i = 0; i < d.size(); ++i) { - assert(is_string_asan_correct(d[i])); - } -} - -template -S get_s(char c) { - S s; - for (size_t i = 0; i < N; ++i) - s.push_back(c); - - return s; -} - -template -void test_string() { - size_t const N = sizeof(S) < 256 ? 
(4096 / sizeof(S)) : 16; - - { - C d1a(1), d1b(N), d1c(N + 1), d1d(32 * N); - verify_inside(d1a); - verify_inside(d1b); - verify_inside(d1c); - verify_inside(d1d); - } - { - C d2; - for (size_t i = 0; i < 16 * N; ++i) { - d2.push_back(get_s(i % 10 + 'a')); - verify_inside(d2); - d2.push_back(get_s(i % 10 + 'b')); - verify_inside(d2); - - d2.pop_front(); - verify_inside(d2); - } - } - { - C d3; - for (size_t i = 0; i < 16 * N; ++i) { - d3.push_front(get_s(i % 10 + 'a')); - verify_inside(d3); - d3.push_front(get_s(i % 10 + 'b')); - verify_inside(d3); - - d3.pop_back(); - verify_inside(d3); - } - } - { - C d4; - for (size_t i = 0; i < 16 * N; ++i) { - // When there is no SSO, all elements inside should not be poisoned, - // so we can verify deque poisoning. - d4.push_front(get_s(i % 10 + 'a')); - verify_inside(d4); - assert(is_double_ended_contiguous_container_asan_correct(d4)); - d4.push_back(get_s(i % 10 + 'b')); - verify_inside(d4); - assert(is_double_ended_contiguous_container_asan_correct(d4)); - } - } - { - C d5; - for (size_t i = 0; i < 5 * N; ++i) { - // In d4 we never had poisoned memory inside deque. - // Here we start with SSO, so part of the inside of the container, - // will be poisoned. - d5.push_front(S()); - verify_inside(d5); - } - for (size_t i = 0; i < d5.size(); ++i) { - // We change the size to have long string. - // Memory owne by deque should not be poisoned by string. - d5[i].resize(1000); - verify_inside(d5); - } - - assert(is_double_ended_contiguous_container_asan_correct(d5)); - - d5.erase(d5.begin() + 2); - verify_inside(d5); - - d5.erase(d5.end() - 2); - verify_inside(d5); - - assert(is_double_ended_contiguous_container_asan_correct(d5)); - } - { - C d6a; - assert(is_double_ended_contiguous_container_asan_correct(d6a)); - - C d6b(N + 2, get_s('a')); - d6b.push_front(get_s('b')); - while (!d6b.empty()) { - d6b.pop_back(); - assert(is_double_ended_contiguous_container_asan_correct(d6b)); - } - - C d6c(N + 2, get_s('c')); - while (!d6c.empty()) { - d6c.pop_back(); - assert(is_double_ended_contiguous_container_asan_correct(d6c)); - } - } - { - C d7(9 * N + 2); - - d7.insert(d7.begin() + 1, S()); - verify_inside(d7); - - d7.insert(d7.end() - 3, S()); - verify_inside(d7); - - d7.insert(d7.begin() + 2 * N, get_s('a')); - verify_inside(d7); - - d7.insert(d7.end() - 2 * N, get_s('b')); - verify_inside(d7); - - d7.insert(d7.begin() + 2 * N, 3 * N, get_s('c')); - verify_inside(d7); - - // It may not be short for big element types, but it will be checked correctly: - d7.insert(d7.end() - 2 * N, 3 * N, get_s('d')); - verify_inside(d7); - - d7.erase(d7.begin() + 2); - verify_inside(d7); - - d7.erase(d7.end() - 2); - verify_inside(d7); - } -} - -template -void test_container() { - test_string>, S>(); - test_string>, S>(); - test_string>, S>(); -} - -int main(int, char**) { - // Those tests support only types based on std::basic_string. 
- test_container(); - test_container(); -#if TEST_STD_VER >= 11 - test_container(); - test_container(); -#endif -#if TEST_STD_VER >= 20 - test_container(); -#endif - - return 0; -} diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp deleted file mode 100644 index 53c70bed189b5..0000000000000 --- a/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp +++ /dev/null @@ -1,56 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// REQUIRES: asan -// UNSUPPORTED: c++03 - -// - -// Basic test if ASan annotations work for short strings. - -#include -#include -#include - -#include "asan_testing.h" -#include "min_allocator.h" -#include "test_iterators.h" -#include "test_macros.h" - -extern "C" void __sanitizer_set_death_callback(void (*callback)(void)); - -void do_exit() { exit(0); } - -int main(int, char**) { - { - typedef cpp17_input_iterator MyInputIter; - // Should not trigger ASan. - std::basic_string, safe_allocator> v; - char i[] = {'a', 'b', 'c', 'd'}; - - v.insert(v.begin(), MyInputIter(i), MyInputIter(i + 4)); - assert(v[0] == 'a'); - assert(is_string_asan_correct(v)); - } - - __sanitizer_set_death_callback(do_exit); - { - using T = char; - using C = std::basic_string, safe_allocator>; - const T t[] = {'a', 'b', 'c', 'd', 'e', 'f', 'g'}; - C c(std::begin(t), std::end(t)); - assert(is_string_asan_correct(c)); - assert(__sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) != - 0); - volatile T foo = c[c.size() + 1]; // should trigger ASAN. Use volatile to prevent being optimized away. - assert(false); // if we got here, ASAN didn't trigger - ((void)foo); - } - - return 0; -} diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp deleted file mode 100644 index 5b1900fb00d5b..0000000000000 --- a/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp +++ /dev/null @@ -1,182 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// REQUIRES: asan -// UNSUPPORTED: c++03 - -#include -#include -#include -#include -#include "test_macros.h" -#include "asan_testing.h" -#include "min_allocator.h" - -// This tests exists to check if strings work well with vector, as those -// may be partialy annotated, we cannot simply call -// is_contiguous_container_asan_correct, as it assumes that -// object memory inside is not annotated, so we check everything in a more careful way. 
- -template -void verify_inside(D const& d) { - for (size_t i = 0; i < d.size(); ++i) { - assert(is_string_asan_correct(d[i])); - } -} - -template -S get_s(char c) { - S s; - for (size_t i = 0; i < N; ++i) - s.push_back(c); - - return s; -} - -template -void test_string() { - size_t const N = sizeof(S) < 256 ? (4096 / sizeof(S)) : 16; - - { - C d1a(1), d1b(N), d1c(N + 1), d1d(32 * N); - verify_inside(d1a); - verify_inside(d1b); - verify_inside(d1c); - verify_inside(d1d); - } - { - C d2; - for (size_t i = 0; i < 16 * N; ++i) { - d2.push_back(get_s(i % 10 + 'a')); - verify_inside(d2); - d2.push_back(get_s(i % 10 + 'b')); - verify_inside(d2); - - d2.erase(d2.cbegin()); - verify_inside(d2); - } - } - { - C d3; - for (size_t i = 0; i < 16 * N; ++i) { - d3.push_back(get_s(i % 10 + 'a')); - verify_inside(d3); - d3.push_back(get_s(i % 10 + 'b')); - verify_inside(d3); - - d3.pop_back(); - verify_inside(d3); - } - } - { - C d4; - for (size_t i = 0; i < 16 * N; ++i) { - // When there is no SSO, all elements inside should not be poisoned, - // so we can verify vector poisoning. - d4.push_back(get_s(i % 10 + 'a')); - verify_inside(d4); - assert(is_contiguous_container_asan_correct(d4)); - d4.push_back(get_s(i % 10 + 'b')); - verify_inside(d4); - assert(is_contiguous_container_asan_correct(d4)); - } - } - { - C d5; - for (size_t i = 0; i < 5 * N; ++i) { - // In d4 we never had poisoned memory inside vector. - // Here we start with SSO, so part of the inside of the container, - // will be poisoned. - d5.push_back(S()); - verify_inside(d5); - } - for (size_t i = 0; i < d5.size(); ++i) { - // We change the size to have long string. - // Memory owne by vector should not be poisoned by string. - d5[i].resize(1000); - verify_inside(d5); - } - - assert(is_contiguous_container_asan_correct(d5)); - - d5.erase(d5.begin() + 2); - verify_inside(d5); - - d5.erase(d5.end() - 2); - verify_inside(d5); - - assert(is_contiguous_container_asan_correct(d5)); - } - { - C d6a; - assert(is_contiguous_container_asan_correct(d6a)); - - C d6b(N + 2, get_s('a')); - d6b.push_back(get_s('b')); - while (!d6b.empty()) { - d6b.pop_back(); - assert(is_contiguous_container_asan_correct(d6b)); - } - - C d6c(N + 2, get_s('c')); - while (!d6c.empty()) { - d6c.pop_back(); - assert(is_contiguous_container_asan_correct(d6c)); - } - } - { - C d7(9 * N + 2); - - d7.insert(d7.begin() + 1, S()); - verify_inside(d7); - - d7.insert(d7.end() - 3, S()); - verify_inside(d7); - - d7.insert(d7.begin() + 2 * N, get_s('a')); - verify_inside(d7); - - d7.insert(d7.end() - 2 * N, get_s('b')); - verify_inside(d7); - - d7.insert(d7.begin() + 2 * N, 3 * N, get_s('c')); - verify_inside(d7); - - // It may not be short for big element types, but it will be checked correctly: - d7.insert(d7.end() - 2 * N, 3 * N, get_s('d')); - verify_inside(d7); - - d7.erase(d7.begin() + 2); - verify_inside(d7); - - d7.erase(d7.end() - 2); - verify_inside(d7); - } -} - -template -void test_container() { - test_string>, S>(); - test_string>, S>(); - test_string>, S>(); -} - -int main(int, char**) { - // Those tests support only types based on std::basic_string. 
-  test_container<std::string>();
-  test_container<std::wstring>();
-#if TEST_STD_VER >= 11
-  test_container<std::u16string>();
-  test_container<std::u32string>();
-#endif
-#if TEST_STD_VER >= 20
-  test_container<std::u8string>();
-#endif
-
-  return 0;
-}
diff --git a/libcxx/test/support/asan_testing.h b/libcxx/test/support/asan_testing.h
index 3785c1f9c20de..6bfc8280a4ead 100644
--- a/libcxx/test/support/asan_testing.h
+++ b/libcxx/test/support/asan_testing.h
@@ -56,16 +56,35 @@ TEST_CONSTEXPR bool is_double_ended_contiguous_container_asan_correct(const std:
 #endif
 
 #if TEST_HAS_FEATURE(address_sanitizer)
+template <class S>
+bool is_string_short(S const& s) {
+  // We do not have access to __is_long(), but we can check if strings
+  // buffer is inside strings memory. If strings memory contains its content,
+  // SSO is in use. To check it, we can just confirm that the beginning is in
+  // the string object memory block.
+  // &s - beginning of objects memory
+  // &s[0] - beginning of the buffer
+  // (&s+1) - end of objects memory
+  return (void*)std::addressof(s) <= (void*)std::addressof(s[0]) &&
+         (void*)std::addressof(s[0]) < (void*)(std::addressof(s) + 1);
+}
+
 template <class ChrT, class TraitsT, class Alloc>
 TEST_CONSTEXPR bool is_string_asan_correct(const std::basic_string<ChrT, TraitsT, Alloc>& c) {
   if (TEST_IS_CONSTANT_EVALUATED)
     return true;
-  if (std::__asan_annotate_container_with_allocator<Alloc>::value)
-    return __sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) != 0;
-  else
-    return __sanitizer_verify_contiguous_container(
-        c.data(), c.data() + c.capacity() + 1, c.data() + c.capacity() + 1) != 0;
+  if (!is_string_short(c) || _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED) {
+    if (std::__asan_annotate_container_with_allocator<Alloc>::value)
+      return __sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) !=
+             0;
+    else
+      return __sanitizer_verify_contiguous_container(
+          c.data(), c.data() + c.capacity() + 1, c.data() + c.capacity() + 1) != 0;
+  } else {
+    return __sanitizer_verify_contiguous_container(std::addressof(c), std::addressof(c) + 1, std::addressof(c) + 1) !=
+           0;
+  }
 }
 #else
 #  include
From 0b533fb1ca726822f25bf07fd9f632d464d8a36a Mon Sep 17 00:00:00 2001
From: Mehdi Amini
Date: Fri, 20 Oct 2023 05:13:12 -0700
Subject: [PATCH 058/843] Apply clang-tidy fixes for
 performance-unnecessary-value-param in LinalgInterfaces.cpp (NFC)

---
 mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
index 291784072896c..7eed7928456d5 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgInterfaces.cpp
@@ -514,7 +514,7 @@ static llvm::SmallDenseSet<int64_t> getPreservedDims(AffineMap map) {
 }
 
 static SmallVector<int64_t>
-getConstantsFromExprList(SmallVector<AffineExpr> exprs) {
+getConstantsFromExprList(const SmallVector<AffineExpr> &exprs) {
   SmallVector<int64_t> vals;
   for (auto e : exprs) {
     auto constantExpr = dyn_cast<AffineConstantExpr>(e);
From 5a467b84c4b324f4f452633b26852c072f8ad140 Mon Sep 17 00:00:00 2001
From: Mehdi Amini
Date: Fri, 20 Oct 2023 05:14:41 -0700
Subject: [PATCH 059/843] Apply clang-tidy fixes for llvm-else-after-return in
 LinalgOps.cpp (NFC)

---
 mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
index ec823d06c98f3..e86b9762d8581 100644
--- a/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
+++ b/mlir/lib/Dialect/Linalg/IR/LinalgOps.cpp
@@ -2043,7 +2043,8 @@ static
LogicalResult appendMangledType(llvm::raw_string_ostream &ss, Type t) { if (failed(appendMangledType(ss, vec.getElementType()))) return failure(); return success(); - } else if (t.isSignlessIntOrIndexOrFloat()) { + } + if (t.isSignlessIntOrIndexOrFloat()) { ss << t; return success(); } From 42427d805a6a1d48c8a8be56b5548c82119661ea Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 05:20:42 -0700 Subject: [PATCH 060/843] Apply clang-tidy fixes for bugprone-macro-parentheses in LinalgTransformOps.cpp (NFC) --- mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index f7cfe8abddb2e..a648888622d16 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -53,7 +53,7 @@ using namespace mlir::transform; #define DEBUG_TYPE "linalg-transforms" #define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") #define DBGSNL() (llvm::dbgs() << "\n") -#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") +#define LDBG(X) LLVM_DEBUG(DBGS() << (X) << "\n") /// Attempts to apply the pattern specified as template argument to the given /// operation. The pattern is expected to have a `returningMatchAndRewrite` From de5cedefab2224e6832b843bfb5913f2a01ec2b7 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 05:21:30 -0700 Subject: [PATCH 061/843] Apply clang-tidy fixes for llvm-qualified-auto in LinalgTransformOps.cpp (NFC) --- mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index a648888622d16..140bdd1f2db36 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -658,7 +658,7 @@ tileAndFuseFirstExtractUse(RewriterBase &rewriter, Diagnostic &diag, } #ifndef NDEBUG - for (auto tiledOp : tileAndFuseResult->tiledOps) { + for (auto *tiledOp : tileAndFuseResult->tiledOps) { LLVM_DEBUG(DBGS() << "tiledProducer: " << *tiledOp << "\n"); } #endif From 60caa8ef74bfd9fe63d299dcb95b21205037624b Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 05:27:29 -0700 Subject: [PATCH 062/843] Apply clang-tidy fixes for performance-unnecessary-value-param in ConvertConv2DToImg2Col.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp b/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp index e7629d79494bd..420b04b3ee28c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ConvertConv2DToImg2Col.cpp @@ -26,7 +26,7 @@ namespace mlir { namespace linalg { static bool hasAllOneValues(DenseIntElementsAttr attr) { return llvm::all_of( - attr, [](APInt element) { return element.getSExtValue() == 1; }); + attr, [](const APInt &element) { return element.getSExtValue() == 1; }); } static Value createAdd(Location loc, Value x, Value y, OpBuilder &builder) { From f9da4c6eadf478a3828362bdfa9ec5136e8c131b Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Thu, 18 Jan 2024 16:41:58 -0800 Subject: [PATCH 063/843] [SLP][NFC]Add a test with 
extending the types for vectorized stores/insertelement instructions, NFC. --- .../X86/store-insertelement-minbitwidth.ll | 109 ++++++++++++++++++ 1 file changed, 109 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll diff --git a/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll new file mode 100644 index 0000000000000..c1dd90d0e9a7b --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/store-insertelement-minbitwidth.ll @@ -0,0 +1,109 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt < %s -S -mtriple=x86_64-unknown -passes=slp-vectorizer -mattr=+avx -pass-remarks-output=%t | FileCheck %s +; RUN: FileCheck --input-file=%t --check-prefix=YAML %s + +; YAML-LABEL: --- !Passed +; YAML-NEXT: Pass: slp-vectorizer +; YAML-NEXT: Name: StoresVectorized +; YAML-NEXT: Function: stores +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'Stores SLP vectorized with cost ' +; YAML-NEXT: - Cost: '-3' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '6' +define void @stores(ptr noalias %in, ptr noalias %inn, ptr noalias %out) { +; CHECK-LABEL: @stores( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[INN:%.*]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: store <4 x i64> [[TMP5]], ptr [[OUT:%.*]], align 4 +; CHECK-NEXT: ret void +; + %load.1 = load i8, ptr %in, align 1 + %gep.1 = getelementptr inbounds i8, ptr %in, i64 1 + %load.2 = load i8, ptr %gep.1, align 1 + %gep.2 = getelementptr inbounds i8, ptr %in, i64 2 + %load.3 = load i8, ptr %gep.2, align 1 + %gep.3 = getelementptr inbounds i8, ptr %in, i64 3 + %load.4 = load i8, ptr %gep.3, align 1 + %load.5 = load i8, ptr %inn, align 1 + %gep.4 = getelementptr inbounds i8, ptr %inn, i64 1 + %load.6 = load i8, ptr %gep.4, align 1 + %gep.5 = getelementptr inbounds i8, ptr %inn, i64 2 + %load.7 = load i8, ptr %gep.5, align 1 + %gep.6 = getelementptr inbounds i8, ptr %inn, i64 3 + %load.8 = load i8, ptr %gep.6, align 1 + %z1 = zext i8 %load.1 to i64 + %z2 = zext i8 %load.2 to i64 + %z3 = zext i8 %load.3 to i64 + %z4 = zext i8 %load.4 to i64 + %z5 = zext i8 %load.5 to i64 + %z6 = zext i8 %load.6 to i64 + %z7 = zext i8 %load.7 to i64 + %z8 = zext i8 %load.8 to i64 + %add1 = add i64 %z1, %z5 + %add2 = add i64 %z2, %z6 + %add3 = add i64 %z3, %z7 + %add4 = add i64 %z4, %z8 + %gep.8 = getelementptr inbounds i64, ptr %out, i64 1 + %gep.9 = getelementptr inbounds i64, ptr %out, i64 2 + %gep.10 = getelementptr inbounds i64, ptr %out, i64 3 + store i64 %add1, ptr %out, align 4 + store i64 %add2, ptr %gep.8, align 4 + store i64 %add3, ptr %gep.9, align 4 + store i64 %add4, ptr %gep.10, align 4 + ret void +} + +; YAML-LABEL: --- !Passed +; YAML-NEXT: Pass: slp-vectorizer +; YAML-NEXT: Name: VectorizedList +; YAML-NEXT: Function: insertelems +; YAML-NEXT: Args: +; YAML-NEXT: - String: 'SLP vectorized with cost ' +; YAML-NEXT: - Cost: '-5' +; YAML-NEXT: - String: ' and with tree size ' +; YAML-NEXT: - TreeSize: '6' +define <4 x i64> @insertelems(ptr noalias %in, ptr noalias %inn) { +; CHECK-LABEL: @insertelems( +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, ptr [[IN:%.*]], align 1 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x 
i8>, ptr [[INN:%.*]], align 1 +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i64> +; CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP2]] to <4 x i64> +; CHECK-NEXT: [[TMP5:%.*]] = add <4 x i64> [[TMP3]], [[TMP4]] +; CHECK-NEXT: ret <4 x i64> [[TMP5]] +; + %load.1 = load i8, ptr %in, align 1 + %gep.1 = getelementptr inbounds i8, ptr %in, i64 1 + %load.2 = load i8, ptr %gep.1, align 1 + %gep.2 = getelementptr inbounds i8, ptr %in, i64 2 + %load.3 = load i8, ptr %gep.2, align 1 + %gep.3 = getelementptr inbounds i8, ptr %in, i64 3 + %load.4 = load i8, ptr %gep.3, align 1 + %load.5 = load i8, ptr %inn, align 1 + %gep.4 = getelementptr inbounds i8, ptr %inn, i64 1 + %load.6 = load i8, ptr %gep.4, align 1 + %gep.5 = getelementptr inbounds i8, ptr %inn, i64 2 + %load.7 = load i8, ptr %gep.5, align 1 + %gep.6 = getelementptr inbounds i8, ptr %inn, i64 3 + %load.8 = load i8, ptr %gep.6, align 1 + %z1 = zext i8 %load.1 to i64 + %z2 = zext i8 %load.2 to i64 + %z3 = zext i8 %load.3 to i64 + %z4 = zext i8 %load.4 to i64 + %z5 = zext i8 %load.5 to i64 + %z6 = zext i8 %load.6 to i64 + %z7 = zext i8 %load.7 to i64 + %z8 = zext i8 %load.8 to i64 + %add1 = add i64 %z1, %z5 + %add2 = add i64 %z2, %z6 + %add3 = add i64 %z3, %z7 + %add4 = add i64 %z4, %z8 + %ins1 = insertelement <4 x i64> poison, i64 %add1, i32 0 + %ins2 = insertelement <4 x i64> %ins1, i64 %add2, i32 1 + %ins3 = insertelement <4 x i64> %ins2, i64 %add3, i32 2 + %ins4 = insertelement <4 x i64> %ins3, i64 %add4, i32 3 + ret <4 x i64> %ins4 +} From 8bf624af4776cdaad82c0fc4da9e9b0f14e9ea10 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Thu, 18 Jan 2024 16:38:38 -0800 Subject: [PATCH 064/843] [RISCV] Key VectorIntrinsicCostTable by SEW [nfc-ish] Previously, we'd keyed the table by the vector type, but we were actually assigning the same cost for all the types with a common element type. Unless we'd missed an entry, this means that effectively we were performing an SEW lookup. Restructure the table to make this SEW dependence more explicit, and in the process greatly reduce the size of the table. --- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 554 ++---------------- 1 file changed, 62 insertions(+), 492 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index 056b947c7a9f9..4ea3a51930899 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -656,494 +656,61 @@ InstructionCost RISCVTTIImpl::getGatherScatterOpCost( // instruction counts with the following adjustments made: // * One vsetvli is considered free. 
static const CostTblEntry VectorIntrinsicCostTable[]{ - {Intrinsic::floor, MVT::v2f32, 9}, - {Intrinsic::floor, MVT::v4f32, 9}, - {Intrinsic::floor, MVT::v8f32, 9}, - {Intrinsic::floor, MVT::v16f32, 9}, - {Intrinsic::floor, MVT::nxv1f32, 9}, - {Intrinsic::floor, MVT::nxv2f32, 9}, - {Intrinsic::floor, MVT::nxv4f32, 9}, - {Intrinsic::floor, MVT::nxv8f32, 9}, - {Intrinsic::floor, MVT::nxv16f32, 9}, - {Intrinsic::floor, MVT::v2f64, 9}, - {Intrinsic::floor, MVT::v4f64, 9}, - {Intrinsic::floor, MVT::v8f64, 9}, - {Intrinsic::floor, MVT::v16f64, 9}, - {Intrinsic::floor, MVT::nxv1f64, 9}, - {Intrinsic::floor, MVT::nxv2f64, 9}, - {Intrinsic::floor, MVT::nxv4f64, 9}, - {Intrinsic::floor, MVT::nxv8f64, 9}, - {Intrinsic::ceil, MVT::v2f32, 9}, - {Intrinsic::ceil, MVT::v4f32, 9}, - {Intrinsic::ceil, MVT::v8f32, 9}, - {Intrinsic::ceil, MVT::v16f32, 9}, - {Intrinsic::ceil, MVT::nxv1f32, 9}, - {Intrinsic::ceil, MVT::nxv2f32, 9}, - {Intrinsic::ceil, MVT::nxv4f32, 9}, - {Intrinsic::ceil, MVT::nxv8f32, 9}, - {Intrinsic::ceil, MVT::nxv16f32, 9}, - {Intrinsic::ceil, MVT::v2f64, 9}, - {Intrinsic::ceil, MVT::v4f64, 9}, - {Intrinsic::ceil, MVT::v8f64, 9}, - {Intrinsic::ceil, MVT::v16f64, 9}, - {Intrinsic::ceil, MVT::nxv1f64, 9}, - {Intrinsic::ceil, MVT::nxv2f64, 9}, - {Intrinsic::ceil, MVT::nxv4f64, 9}, - {Intrinsic::ceil, MVT::nxv8f64, 9}, - {Intrinsic::trunc, MVT::v2f32, 7}, - {Intrinsic::trunc, MVT::v4f32, 7}, - {Intrinsic::trunc, MVT::v8f32, 7}, - {Intrinsic::trunc, MVT::v16f32, 7}, - {Intrinsic::trunc, MVT::nxv1f32, 7}, - {Intrinsic::trunc, MVT::nxv2f32, 7}, - {Intrinsic::trunc, MVT::nxv4f32, 7}, - {Intrinsic::trunc, MVT::nxv8f32, 7}, - {Intrinsic::trunc, MVT::nxv16f32, 7}, - {Intrinsic::trunc, MVT::v2f64, 7}, - {Intrinsic::trunc, MVT::v4f64, 7}, - {Intrinsic::trunc, MVT::v8f64, 7}, - {Intrinsic::trunc, MVT::v16f64, 7}, - {Intrinsic::trunc, MVT::nxv1f64, 7}, - {Intrinsic::trunc, MVT::nxv2f64, 7}, - {Intrinsic::trunc, MVT::nxv4f64, 7}, - {Intrinsic::trunc, MVT::nxv8f64, 7}, - {Intrinsic::round, MVT::v2f32, 9}, - {Intrinsic::round, MVT::v4f32, 9}, - {Intrinsic::round, MVT::v8f32, 9}, - {Intrinsic::round, MVT::v16f32, 9}, - {Intrinsic::round, MVT::nxv1f32, 9}, - {Intrinsic::round, MVT::nxv2f32, 9}, - {Intrinsic::round, MVT::nxv4f32, 9}, - {Intrinsic::round, MVT::nxv8f32, 9}, - {Intrinsic::round, MVT::nxv16f32, 9}, - {Intrinsic::round, MVT::v2f64, 9}, - {Intrinsic::round, MVT::v4f64, 9}, - {Intrinsic::round, MVT::v8f64, 9}, - {Intrinsic::round, MVT::v16f64, 9}, - {Intrinsic::round, MVT::nxv1f64, 9}, - {Intrinsic::round, MVT::nxv2f64, 9}, - {Intrinsic::round, MVT::nxv4f64, 9}, - {Intrinsic::round, MVT::nxv8f64, 9}, - {Intrinsic::roundeven, MVT::v2f32, 9}, - {Intrinsic::roundeven, MVT::v4f32, 9}, - {Intrinsic::roundeven, MVT::v8f32, 9}, - {Intrinsic::roundeven, MVT::v16f32, 9}, - {Intrinsic::roundeven, MVT::nxv1f32, 9}, - {Intrinsic::roundeven, MVT::nxv2f32, 9}, - {Intrinsic::roundeven, MVT::nxv4f32, 9}, - {Intrinsic::roundeven, MVT::nxv8f32, 9}, - {Intrinsic::roundeven, MVT::nxv16f32, 9}, - {Intrinsic::roundeven, MVT::v2f64, 9}, - {Intrinsic::roundeven, MVT::v4f64, 9}, - {Intrinsic::roundeven, MVT::v8f64, 9}, - {Intrinsic::roundeven, MVT::v16f64, 9}, - {Intrinsic::roundeven, MVT::nxv1f64, 9}, - {Intrinsic::roundeven, MVT::nxv2f64, 9}, - {Intrinsic::roundeven, MVT::nxv4f64, 9}, - {Intrinsic::roundeven, MVT::nxv8f64, 9}, - {Intrinsic::rint, MVT::v2f32, 7}, - {Intrinsic::rint, MVT::v4f32, 7}, - {Intrinsic::rint, MVT::v8f32, 7}, - {Intrinsic::rint, MVT::v16f32, 7}, - {Intrinsic::rint, MVT::nxv1f32, 7}, - 
{Intrinsic::rint, MVT::nxv2f32, 7}, - {Intrinsic::rint, MVT::nxv4f32, 7}, - {Intrinsic::rint, MVT::nxv8f32, 7}, - {Intrinsic::rint, MVT::nxv16f32, 7}, - {Intrinsic::rint, MVT::v2f64, 7}, - {Intrinsic::rint, MVT::v4f64, 7}, - {Intrinsic::rint, MVT::v8f64, 7}, - {Intrinsic::rint, MVT::v16f64, 7}, - {Intrinsic::rint, MVT::nxv1f64, 7}, - {Intrinsic::rint, MVT::nxv2f64, 7}, - {Intrinsic::rint, MVT::nxv4f64, 7}, - {Intrinsic::rint, MVT::nxv8f64, 7}, - {Intrinsic::lrint, MVT::v2i32, 1}, - {Intrinsic::lrint, MVT::v4i32, 1}, - {Intrinsic::lrint, MVT::v8i32, 1}, - {Intrinsic::lrint, MVT::v16i32, 1}, - {Intrinsic::lrint, MVT::nxv1i32, 1}, - {Intrinsic::lrint, MVT::nxv2i32, 1}, - {Intrinsic::lrint, MVT::nxv4i32, 1}, - {Intrinsic::lrint, MVT::nxv8i32, 1}, - {Intrinsic::lrint, MVT::nxv16i32, 1}, - {Intrinsic::lrint, MVT::v2i64, 1}, - {Intrinsic::lrint, MVT::v4i64, 1}, - {Intrinsic::lrint, MVT::v8i64, 1}, - {Intrinsic::lrint, MVT::v16i64, 1}, - {Intrinsic::lrint, MVT::nxv1i64, 1}, - {Intrinsic::lrint, MVT::nxv2i64, 1}, - {Intrinsic::lrint, MVT::nxv4i64, 1}, - {Intrinsic::lrint, MVT::nxv8i64, 1}, - {Intrinsic::llrint, MVT::v2i64, 1}, - {Intrinsic::llrint, MVT::v4i64, 1}, - {Intrinsic::llrint, MVT::v8i64, 1}, - {Intrinsic::llrint, MVT::v16i64, 1}, - {Intrinsic::llrint, MVT::nxv1i64, 1}, - {Intrinsic::llrint, MVT::nxv2i64, 1}, - {Intrinsic::llrint, MVT::nxv4i64, 1}, - {Intrinsic::llrint, MVT::nxv8i64, 1}, - {Intrinsic::nearbyint, MVT::v2f32, 9}, - {Intrinsic::nearbyint, MVT::v4f32, 9}, - {Intrinsic::nearbyint, MVT::v8f32, 9}, - {Intrinsic::nearbyint, MVT::v16f32, 9}, - {Intrinsic::nearbyint, MVT::nxv1f32, 9}, - {Intrinsic::nearbyint, MVT::nxv2f32, 9}, - {Intrinsic::nearbyint, MVT::nxv4f32, 9}, - {Intrinsic::nearbyint, MVT::nxv8f32, 9}, - {Intrinsic::nearbyint, MVT::nxv16f32, 9}, - {Intrinsic::nearbyint, MVT::v2f64, 9}, - {Intrinsic::nearbyint, MVT::v4f64, 9}, - {Intrinsic::nearbyint, MVT::v8f64, 9}, - {Intrinsic::nearbyint, MVT::v16f64, 9}, - {Intrinsic::nearbyint, MVT::nxv1f64, 9}, - {Intrinsic::nearbyint, MVT::nxv2f64, 9}, - {Intrinsic::nearbyint, MVT::nxv4f64, 9}, - {Intrinsic::nearbyint, MVT::nxv8f64, 9}, - {Intrinsic::bswap, MVT::v2i16, 3}, - {Intrinsic::bswap, MVT::v4i16, 3}, - {Intrinsic::bswap, MVT::v8i16, 3}, - {Intrinsic::bswap, MVT::v16i16, 3}, - {Intrinsic::bswap, MVT::nxv1i16, 3}, - {Intrinsic::bswap, MVT::nxv2i16, 3}, - {Intrinsic::bswap, MVT::nxv4i16, 3}, - {Intrinsic::bswap, MVT::nxv8i16, 3}, - {Intrinsic::bswap, MVT::nxv16i16, 3}, - {Intrinsic::bswap, MVT::v2i32, 12}, - {Intrinsic::bswap, MVT::v4i32, 12}, - {Intrinsic::bswap, MVT::v8i32, 12}, - {Intrinsic::bswap, MVT::v16i32, 12}, - {Intrinsic::bswap, MVT::nxv1i32, 12}, - {Intrinsic::bswap, MVT::nxv2i32, 12}, - {Intrinsic::bswap, MVT::nxv4i32, 12}, - {Intrinsic::bswap, MVT::nxv8i32, 12}, - {Intrinsic::bswap, MVT::nxv16i32, 12}, - {Intrinsic::bswap, MVT::v2i64, 31}, - {Intrinsic::bswap, MVT::v4i64, 31}, - {Intrinsic::bswap, MVT::v8i64, 31}, - {Intrinsic::bswap, MVT::v16i64, 31}, - {Intrinsic::bswap, MVT::nxv1i64, 31}, - {Intrinsic::bswap, MVT::nxv2i64, 31}, - {Intrinsic::bswap, MVT::nxv4i64, 31}, - {Intrinsic::bswap, MVT::nxv8i64, 31}, - {Intrinsic::vp_bswap, MVT::v2i16, 3}, - {Intrinsic::vp_bswap, MVT::v4i16, 3}, - {Intrinsic::vp_bswap, MVT::v8i16, 3}, - {Intrinsic::vp_bswap, MVT::v16i16, 3}, - {Intrinsic::vp_bswap, MVT::nxv1i16, 3}, - {Intrinsic::vp_bswap, MVT::nxv2i16, 3}, - {Intrinsic::vp_bswap, MVT::nxv4i16, 3}, - {Intrinsic::vp_bswap, MVT::nxv8i16, 3}, - {Intrinsic::vp_bswap, MVT::nxv16i16, 3}, - {Intrinsic::vp_bswap, MVT::v2i32, 12}, 
- {Intrinsic::vp_bswap, MVT::v4i32, 12}, - {Intrinsic::vp_bswap, MVT::v8i32, 12}, - {Intrinsic::vp_bswap, MVT::v16i32, 12}, - {Intrinsic::vp_bswap, MVT::nxv1i32, 12}, - {Intrinsic::vp_bswap, MVT::nxv2i32, 12}, - {Intrinsic::vp_bswap, MVT::nxv4i32, 12}, - {Intrinsic::vp_bswap, MVT::nxv8i32, 12}, - {Intrinsic::vp_bswap, MVT::nxv16i32, 12}, - {Intrinsic::vp_bswap, MVT::v2i64, 31}, - {Intrinsic::vp_bswap, MVT::v4i64, 31}, - {Intrinsic::vp_bswap, MVT::v8i64, 31}, - {Intrinsic::vp_bswap, MVT::v16i64, 31}, - {Intrinsic::vp_bswap, MVT::nxv1i64, 31}, - {Intrinsic::vp_bswap, MVT::nxv2i64, 31}, - {Intrinsic::vp_bswap, MVT::nxv4i64, 31}, - {Intrinsic::vp_bswap, MVT::nxv8i64, 31}, - {Intrinsic::vp_fshl, MVT::v2i8, 7}, - {Intrinsic::vp_fshl, MVT::v4i8, 7}, - {Intrinsic::vp_fshl, MVT::v8i8, 7}, - {Intrinsic::vp_fshl, MVT::v16i8, 7}, - {Intrinsic::vp_fshl, MVT::nxv1i8, 7}, - {Intrinsic::vp_fshl, MVT::nxv2i8, 7}, - {Intrinsic::vp_fshl, MVT::nxv4i8, 7}, - {Intrinsic::vp_fshl, MVT::nxv8i8, 7}, - {Intrinsic::vp_fshl, MVT::nxv16i8, 7}, - {Intrinsic::vp_fshl, MVT::nxv32i8, 7}, - {Intrinsic::vp_fshl, MVT::nxv64i8, 7}, - {Intrinsic::vp_fshl, MVT::v2i16, 7}, - {Intrinsic::vp_fshl, MVT::v4i16, 7}, - {Intrinsic::vp_fshl, MVT::v8i16, 7}, - {Intrinsic::vp_fshl, MVT::v16i16, 7}, - {Intrinsic::vp_fshl, MVT::nxv1i16, 7}, - {Intrinsic::vp_fshl, MVT::nxv2i16, 7}, - {Intrinsic::vp_fshl, MVT::nxv4i16, 7}, - {Intrinsic::vp_fshl, MVT::nxv8i16, 7}, - {Intrinsic::vp_fshl, MVT::nxv16i16, 7}, - {Intrinsic::vp_fshl, MVT::nxv32i16, 7}, - {Intrinsic::vp_fshl, MVT::v2i32, 7}, - {Intrinsic::vp_fshl, MVT::v4i32, 7}, - {Intrinsic::vp_fshl, MVT::v8i32, 7}, - {Intrinsic::vp_fshl, MVT::v16i32, 7}, - {Intrinsic::vp_fshl, MVT::nxv1i32, 7}, - {Intrinsic::vp_fshl, MVT::nxv2i32, 7}, - {Intrinsic::vp_fshl, MVT::nxv4i32, 7}, - {Intrinsic::vp_fshl, MVT::nxv8i32, 7}, - {Intrinsic::vp_fshl, MVT::nxv16i32, 7}, - {Intrinsic::vp_fshl, MVT::v2i64, 7}, - {Intrinsic::vp_fshl, MVT::v4i64, 7}, - {Intrinsic::vp_fshl, MVT::v8i64, 7}, - {Intrinsic::vp_fshl, MVT::v16i64, 7}, - {Intrinsic::vp_fshl, MVT::nxv1i64, 7}, - {Intrinsic::vp_fshl, MVT::nxv2i64, 7}, - {Intrinsic::vp_fshl, MVT::nxv4i64, 7}, - {Intrinsic::vp_fshl, MVT::nxv8i64, 7}, - {Intrinsic::vp_fshr, MVT::v2i8, 7}, - {Intrinsic::vp_fshr, MVT::v4i8, 7}, - {Intrinsic::vp_fshr, MVT::v8i8, 7}, - {Intrinsic::vp_fshr, MVT::v16i8, 7}, - {Intrinsic::vp_fshr, MVT::nxv1i8, 7}, - {Intrinsic::vp_fshr, MVT::nxv2i8, 7}, - {Intrinsic::vp_fshr, MVT::nxv4i8, 7}, - {Intrinsic::vp_fshr, MVT::nxv8i8, 7}, - {Intrinsic::vp_fshr, MVT::nxv16i8, 7}, - {Intrinsic::vp_fshr, MVT::nxv32i8, 7}, - {Intrinsic::vp_fshr, MVT::nxv64i8, 7}, - {Intrinsic::vp_fshr, MVT::v2i16, 7}, - {Intrinsic::vp_fshr, MVT::v4i16, 7}, - {Intrinsic::vp_fshr, MVT::v8i16, 7}, - {Intrinsic::vp_fshr, MVT::v16i16, 7}, - {Intrinsic::vp_fshr, MVT::nxv1i16, 7}, - {Intrinsic::vp_fshr, MVT::nxv2i16, 7}, - {Intrinsic::vp_fshr, MVT::nxv4i16, 7}, - {Intrinsic::vp_fshr, MVT::nxv8i16, 7}, - {Intrinsic::vp_fshr, MVT::nxv16i16, 7}, - {Intrinsic::vp_fshr, MVT::nxv32i16, 7}, - {Intrinsic::vp_fshr, MVT::v2i32, 7}, - {Intrinsic::vp_fshr, MVT::v4i32, 7}, - {Intrinsic::vp_fshr, MVT::v8i32, 7}, - {Intrinsic::vp_fshr, MVT::v16i32, 7}, - {Intrinsic::vp_fshr, MVT::nxv1i32, 7}, - {Intrinsic::vp_fshr, MVT::nxv2i32, 7}, - {Intrinsic::vp_fshr, MVT::nxv4i32, 7}, - {Intrinsic::vp_fshr, MVT::nxv8i32, 7}, - {Intrinsic::vp_fshr, MVT::nxv16i32, 7}, - {Intrinsic::vp_fshr, MVT::v2i64, 7}, - {Intrinsic::vp_fshr, MVT::v4i64, 7}, - {Intrinsic::vp_fshr, MVT::v8i64, 7}, - {Intrinsic::vp_fshr, 
MVT::v16i64, 7}, - {Intrinsic::vp_fshr, MVT::nxv1i64, 7}, - {Intrinsic::vp_fshr, MVT::nxv2i64, 7}, - {Intrinsic::vp_fshr, MVT::nxv4i64, 7}, - {Intrinsic::vp_fshr, MVT::nxv8i64, 7}, - {Intrinsic::bitreverse, MVT::v2i8, 17}, - {Intrinsic::bitreverse, MVT::v4i8, 17}, - {Intrinsic::bitreverse, MVT::v8i8, 17}, - {Intrinsic::bitreverse, MVT::v16i8, 17}, - {Intrinsic::bitreverse, MVT::nxv1i8, 17}, - {Intrinsic::bitreverse, MVT::nxv2i8, 17}, - {Intrinsic::bitreverse, MVT::nxv4i8, 17}, - {Intrinsic::bitreverse, MVT::nxv8i8, 17}, - {Intrinsic::bitreverse, MVT::nxv16i8, 17}, - {Intrinsic::bitreverse, MVT::v2i16, 24}, - {Intrinsic::bitreverse, MVT::v4i16, 24}, - {Intrinsic::bitreverse, MVT::v8i16, 24}, - {Intrinsic::bitreverse, MVT::v16i16, 24}, - {Intrinsic::bitreverse, MVT::nxv1i16, 24}, - {Intrinsic::bitreverse, MVT::nxv2i16, 24}, - {Intrinsic::bitreverse, MVT::nxv4i16, 24}, - {Intrinsic::bitreverse, MVT::nxv8i16, 24}, - {Intrinsic::bitreverse, MVT::nxv16i16, 24}, - {Intrinsic::bitreverse, MVT::v2i32, 33}, - {Intrinsic::bitreverse, MVT::v4i32, 33}, - {Intrinsic::bitreverse, MVT::v8i32, 33}, - {Intrinsic::bitreverse, MVT::v16i32, 33}, - {Intrinsic::bitreverse, MVT::nxv1i32, 33}, - {Intrinsic::bitreverse, MVT::nxv2i32, 33}, - {Intrinsic::bitreverse, MVT::nxv4i32, 33}, - {Intrinsic::bitreverse, MVT::nxv8i32, 33}, - {Intrinsic::bitreverse, MVT::nxv16i32, 33}, - {Intrinsic::bitreverse, MVT::v2i64, 52}, - {Intrinsic::bitreverse, MVT::v4i64, 52}, - {Intrinsic::bitreverse, MVT::v8i64, 52}, - {Intrinsic::bitreverse, MVT::v16i64, 52}, - {Intrinsic::bitreverse, MVT::nxv1i64, 52}, - {Intrinsic::bitreverse, MVT::nxv2i64, 52}, - {Intrinsic::bitreverse, MVT::nxv4i64, 52}, - {Intrinsic::bitreverse, MVT::nxv8i64, 52}, - {Intrinsic::vp_bitreverse, MVT::v2i8, 17}, - {Intrinsic::vp_bitreverse, MVT::v4i8, 17}, - {Intrinsic::vp_bitreverse, MVT::v8i8, 17}, - {Intrinsic::vp_bitreverse, MVT::v16i8, 17}, - {Intrinsic::vp_bitreverse, MVT::nxv1i8, 17}, - {Intrinsic::vp_bitreverse, MVT::nxv2i8, 17}, - {Intrinsic::vp_bitreverse, MVT::nxv4i8, 17}, - {Intrinsic::vp_bitreverse, MVT::nxv8i8, 17}, - {Intrinsic::vp_bitreverse, MVT::nxv16i8, 17}, - {Intrinsic::vp_bitreverse, MVT::v2i16, 24}, - {Intrinsic::vp_bitreverse, MVT::v4i16, 24}, - {Intrinsic::vp_bitreverse, MVT::v8i16, 24}, - {Intrinsic::vp_bitreverse, MVT::v16i16, 24}, - {Intrinsic::vp_bitreverse, MVT::nxv1i16, 24}, - {Intrinsic::vp_bitreverse, MVT::nxv2i16, 24}, - {Intrinsic::vp_bitreverse, MVT::nxv4i16, 24}, - {Intrinsic::vp_bitreverse, MVT::nxv8i16, 24}, - {Intrinsic::vp_bitreverse, MVT::nxv16i16, 24}, - {Intrinsic::vp_bitreverse, MVT::v2i32, 33}, - {Intrinsic::vp_bitreverse, MVT::v4i32, 33}, - {Intrinsic::vp_bitreverse, MVT::v8i32, 33}, - {Intrinsic::vp_bitreverse, MVT::v16i32, 33}, - {Intrinsic::vp_bitreverse, MVT::nxv1i32, 33}, - {Intrinsic::vp_bitreverse, MVT::nxv2i32, 33}, - {Intrinsic::vp_bitreverse, MVT::nxv4i32, 33}, - {Intrinsic::vp_bitreverse, MVT::nxv8i32, 33}, - {Intrinsic::vp_bitreverse, MVT::nxv16i32, 33}, - {Intrinsic::vp_bitreverse, MVT::v2i64, 52}, - {Intrinsic::vp_bitreverse, MVT::v4i64, 52}, - {Intrinsic::vp_bitreverse, MVT::v8i64, 52}, - {Intrinsic::vp_bitreverse, MVT::v16i64, 52}, - {Intrinsic::vp_bitreverse, MVT::nxv1i64, 52}, - {Intrinsic::vp_bitreverse, MVT::nxv2i64, 52}, - {Intrinsic::vp_bitreverse, MVT::nxv4i64, 52}, - {Intrinsic::vp_bitreverse, MVT::nxv8i64, 52}, - {Intrinsic::ctpop, MVT::v2i8, 12}, - {Intrinsic::ctpop, MVT::v4i8, 12}, - {Intrinsic::ctpop, MVT::v8i8, 12}, - {Intrinsic::ctpop, MVT::v16i8, 12}, - {Intrinsic::ctpop, MVT::nxv1i8, 
12}, - {Intrinsic::ctpop, MVT::nxv2i8, 12}, - {Intrinsic::ctpop, MVT::nxv4i8, 12}, - {Intrinsic::ctpop, MVT::nxv8i8, 12}, - {Intrinsic::ctpop, MVT::nxv16i8, 12}, - {Intrinsic::ctpop, MVT::v2i16, 19}, - {Intrinsic::ctpop, MVT::v4i16, 19}, - {Intrinsic::ctpop, MVT::v8i16, 19}, - {Intrinsic::ctpop, MVT::v16i16, 19}, - {Intrinsic::ctpop, MVT::nxv1i16, 19}, - {Intrinsic::ctpop, MVT::nxv2i16, 19}, - {Intrinsic::ctpop, MVT::nxv4i16, 19}, - {Intrinsic::ctpop, MVT::nxv8i16, 19}, - {Intrinsic::ctpop, MVT::nxv16i16, 19}, - {Intrinsic::ctpop, MVT::v2i32, 20}, - {Intrinsic::ctpop, MVT::v4i32, 20}, - {Intrinsic::ctpop, MVT::v8i32, 20}, - {Intrinsic::ctpop, MVT::v16i32, 20}, - {Intrinsic::ctpop, MVT::nxv1i32, 20}, - {Intrinsic::ctpop, MVT::nxv2i32, 20}, - {Intrinsic::ctpop, MVT::nxv4i32, 20}, - {Intrinsic::ctpop, MVT::nxv8i32, 20}, - {Intrinsic::ctpop, MVT::nxv16i32, 20}, - {Intrinsic::ctpop, MVT::v2i64, 21}, - {Intrinsic::ctpop, MVT::v4i64, 21}, - {Intrinsic::ctpop, MVT::v8i64, 21}, - {Intrinsic::ctpop, MVT::v16i64, 21}, - {Intrinsic::ctpop, MVT::nxv1i64, 21}, - {Intrinsic::ctpop, MVT::nxv2i64, 21}, - {Intrinsic::ctpop, MVT::nxv4i64, 21}, - {Intrinsic::ctpop, MVT::nxv8i64, 21}, - {Intrinsic::vp_ctpop, MVT::v2i8, 12}, - {Intrinsic::vp_ctpop, MVT::v4i8, 12}, - {Intrinsic::vp_ctpop, MVT::v8i8, 12}, - {Intrinsic::vp_ctpop, MVT::v16i8, 12}, - {Intrinsic::vp_ctpop, MVT::nxv1i8, 12}, - {Intrinsic::vp_ctpop, MVT::nxv2i8, 12}, - {Intrinsic::vp_ctpop, MVT::nxv4i8, 12}, - {Intrinsic::vp_ctpop, MVT::nxv8i8, 12}, - {Intrinsic::vp_ctpop, MVT::nxv16i8, 12}, - {Intrinsic::vp_ctpop, MVT::v2i16, 19}, - {Intrinsic::vp_ctpop, MVT::v4i16, 19}, - {Intrinsic::vp_ctpop, MVT::v8i16, 19}, - {Intrinsic::vp_ctpop, MVT::v16i16, 19}, - {Intrinsic::vp_ctpop, MVT::nxv1i16, 19}, - {Intrinsic::vp_ctpop, MVT::nxv2i16, 19}, - {Intrinsic::vp_ctpop, MVT::nxv4i16, 19}, - {Intrinsic::vp_ctpop, MVT::nxv8i16, 19}, - {Intrinsic::vp_ctpop, MVT::nxv16i16, 19}, - {Intrinsic::vp_ctpop, MVT::v2i32, 20}, - {Intrinsic::vp_ctpop, MVT::v4i32, 20}, - {Intrinsic::vp_ctpop, MVT::v8i32, 20}, - {Intrinsic::vp_ctpop, MVT::v16i32, 20}, - {Intrinsic::vp_ctpop, MVT::nxv1i32, 20}, - {Intrinsic::vp_ctpop, MVT::nxv2i32, 20}, - {Intrinsic::vp_ctpop, MVT::nxv4i32, 20}, - {Intrinsic::vp_ctpop, MVT::nxv8i32, 20}, - {Intrinsic::vp_ctpop, MVT::nxv16i32, 20}, - {Intrinsic::vp_ctpop, MVT::v2i64, 21}, - {Intrinsic::vp_ctpop, MVT::v4i64, 21}, - {Intrinsic::vp_ctpop, MVT::v8i64, 21}, - {Intrinsic::vp_ctpop, MVT::v16i64, 21}, - {Intrinsic::vp_ctpop, MVT::nxv1i64, 21}, - {Intrinsic::vp_ctpop, MVT::nxv2i64, 21}, - {Intrinsic::vp_ctpop, MVT::nxv4i64, 21}, - {Intrinsic::vp_ctpop, MVT::nxv8i64, 21}, - {Intrinsic::vp_ctlz, MVT::v2i8, 19}, - {Intrinsic::vp_ctlz, MVT::v4i8, 19}, - {Intrinsic::vp_ctlz, MVT::v8i8, 19}, - {Intrinsic::vp_ctlz, MVT::v16i8, 19}, - {Intrinsic::vp_ctlz, MVT::nxv1i8, 19}, - {Intrinsic::vp_ctlz, MVT::nxv2i8, 19}, - {Intrinsic::vp_ctlz, MVT::nxv4i8, 19}, - {Intrinsic::vp_ctlz, MVT::nxv8i8, 19}, - {Intrinsic::vp_ctlz, MVT::nxv16i8, 19}, - {Intrinsic::vp_ctlz, MVT::nxv32i8, 19}, - {Intrinsic::vp_ctlz, MVT::nxv64i8, 19}, - {Intrinsic::vp_ctlz, MVT::v2i16, 28}, - {Intrinsic::vp_ctlz, MVT::v4i16, 28}, - {Intrinsic::vp_ctlz, MVT::v8i16, 28}, - {Intrinsic::vp_ctlz, MVT::v16i16, 28}, - {Intrinsic::vp_ctlz, MVT::nxv1i16, 28}, - {Intrinsic::vp_ctlz, MVT::nxv2i16, 28}, - {Intrinsic::vp_ctlz, MVT::nxv4i16, 28}, - {Intrinsic::vp_ctlz, MVT::nxv8i16, 28}, - {Intrinsic::vp_ctlz, MVT::nxv16i16, 28}, - {Intrinsic::vp_ctlz, MVT::nxv32i16, 28}, - {Intrinsic::vp_ctlz, MVT::v2i32, 
31}, - {Intrinsic::vp_ctlz, MVT::v4i32, 31}, - {Intrinsic::vp_ctlz, MVT::v8i32, 31}, - {Intrinsic::vp_ctlz, MVT::v16i32, 31}, - {Intrinsic::vp_ctlz, MVT::nxv1i32, 31}, - {Intrinsic::vp_ctlz, MVT::nxv2i32, 31}, - {Intrinsic::vp_ctlz, MVT::nxv4i32, 31}, - {Intrinsic::vp_ctlz, MVT::nxv8i32, 31}, - {Intrinsic::vp_ctlz, MVT::nxv16i32, 31}, - {Intrinsic::vp_ctlz, MVT::v2i64, 35}, - {Intrinsic::vp_ctlz, MVT::v4i64, 35}, - {Intrinsic::vp_ctlz, MVT::v8i64, 35}, - {Intrinsic::vp_ctlz, MVT::v16i64, 35}, - {Intrinsic::vp_ctlz, MVT::nxv1i64, 35}, - {Intrinsic::vp_ctlz, MVT::nxv2i64, 35}, - {Intrinsic::vp_ctlz, MVT::nxv4i64, 35}, - {Intrinsic::vp_ctlz, MVT::nxv8i64, 35}, - {Intrinsic::vp_cttz, MVT::v2i8, 16}, - {Intrinsic::vp_cttz, MVT::v4i8, 16}, - {Intrinsic::vp_cttz, MVT::v8i8, 16}, - {Intrinsic::vp_cttz, MVT::v16i8, 16}, - {Intrinsic::vp_cttz, MVT::nxv1i8, 16}, - {Intrinsic::vp_cttz, MVT::nxv2i8, 16}, - {Intrinsic::vp_cttz, MVT::nxv4i8, 16}, - {Intrinsic::vp_cttz, MVT::nxv8i8, 16}, - {Intrinsic::vp_cttz, MVT::nxv16i8, 16}, - {Intrinsic::vp_cttz, MVT::nxv32i8, 16}, - {Intrinsic::vp_cttz, MVT::nxv64i8, 16}, - {Intrinsic::vp_cttz, MVT::v2i16, 23}, - {Intrinsic::vp_cttz, MVT::v4i16, 23}, - {Intrinsic::vp_cttz, MVT::v8i16, 23}, - {Intrinsic::vp_cttz, MVT::v16i16, 23}, - {Intrinsic::vp_cttz, MVT::nxv1i16, 23}, - {Intrinsic::vp_cttz, MVT::nxv2i16, 23}, - {Intrinsic::vp_cttz, MVT::nxv4i16, 23}, - {Intrinsic::vp_cttz, MVT::nxv8i16, 23}, - {Intrinsic::vp_cttz, MVT::nxv16i16, 23}, - {Intrinsic::vp_cttz, MVT::nxv32i16, 23}, - {Intrinsic::vp_cttz, MVT::v2i32, 24}, - {Intrinsic::vp_cttz, MVT::v4i32, 24}, - {Intrinsic::vp_cttz, MVT::v8i32, 24}, - {Intrinsic::vp_cttz, MVT::v16i32, 24}, - {Intrinsic::vp_cttz, MVT::nxv1i32, 24}, - {Intrinsic::vp_cttz, MVT::nxv2i32, 24}, - {Intrinsic::vp_cttz, MVT::nxv4i32, 24}, - {Intrinsic::vp_cttz, MVT::nxv8i32, 24}, - {Intrinsic::vp_cttz, MVT::nxv16i32, 24}, - {Intrinsic::vp_cttz, MVT::v2i64, 25}, - {Intrinsic::vp_cttz, MVT::v4i64, 25}, - {Intrinsic::vp_cttz, MVT::v8i64, 25}, - {Intrinsic::vp_cttz, MVT::v16i64, 25}, - {Intrinsic::vp_cttz, MVT::nxv1i64, 25}, - {Intrinsic::vp_cttz, MVT::nxv2i64, 25}, - {Intrinsic::vp_cttz, MVT::nxv4i64, 25}, - {Intrinsic::vp_cttz, MVT::nxv8i64, 25}, + {Intrinsic::floor, MVT::f32, 9}, + {Intrinsic::floor, MVT::f64, 9}, + {Intrinsic::ceil, MVT::f32, 9}, + {Intrinsic::ceil, MVT::f64, 9}, + {Intrinsic::trunc, MVT::f32, 7}, + {Intrinsic::trunc, MVT::f64, 7}, + {Intrinsic::round, MVT::f32, 9}, + {Intrinsic::round, MVT::f64, 9}, + {Intrinsic::roundeven, MVT::f32, 9}, + {Intrinsic::roundeven, MVT::f64, 9}, + {Intrinsic::rint, MVT::f32, 7}, + {Intrinsic::rint, MVT::f64, 7}, + {Intrinsic::lrint, MVT::i32, 1}, + {Intrinsic::lrint, MVT::i64, 1}, + {Intrinsic::llrint, MVT::i64, 1}, + {Intrinsic::nearbyint, MVT::f32, 9}, + {Intrinsic::nearbyint, MVT::f64, 9}, + {Intrinsic::bswap, MVT::i16, 3}, + {Intrinsic::bswap, MVT::i32, 12}, + {Intrinsic::bswap, MVT::i64, 31}, + {Intrinsic::vp_bswap, MVT::i16, 3}, + {Intrinsic::vp_bswap, MVT::i32, 12}, + {Intrinsic::vp_bswap, MVT::i64, 31}, + {Intrinsic::vp_fshl, MVT::i8, 7}, + {Intrinsic::vp_fshl, MVT::i16, 7}, + {Intrinsic::vp_fshl, MVT::i32, 7}, + {Intrinsic::vp_fshl, MVT::i64, 7}, + {Intrinsic::vp_fshr, MVT::i8, 7}, + {Intrinsic::vp_fshr, MVT::i16, 7}, + {Intrinsic::vp_fshr, MVT::i32, 7}, + {Intrinsic::vp_fshr, MVT::i64, 7}, + {Intrinsic::bitreverse, MVT::i8, 17}, + {Intrinsic::bitreverse, MVT::i16, 24}, + {Intrinsic::bitreverse, MVT::i32, 33}, + {Intrinsic::bitreverse, MVT::i64, 52}, + {Intrinsic::vp_bitreverse, MVT::i8, 
17}, + {Intrinsic::vp_bitreverse, MVT::i16, 24}, + {Intrinsic::vp_bitreverse, MVT::i32, 33}, + {Intrinsic::vp_bitreverse, MVT::i64, 52}, + {Intrinsic::ctpop, MVT::i8, 12}, + {Intrinsic::ctpop, MVT::i16, 19}, + {Intrinsic::ctpop, MVT::i32, 20}, + {Intrinsic::ctpop, MVT::i64, 21}, + {Intrinsic::vp_ctpop, MVT::i8, 12}, + {Intrinsic::vp_ctpop, MVT::i16, 19}, + {Intrinsic::vp_ctpop, MVT::i32, 20}, + {Intrinsic::vp_ctpop, MVT::i64, 21}, + {Intrinsic::vp_ctlz, MVT::i8, 19}, + {Intrinsic::vp_ctlz, MVT::i16, 28}, + {Intrinsic::vp_ctlz, MVT::i32, 31}, + {Intrinsic::vp_ctlz, MVT::i64, 35}, + {Intrinsic::vp_cttz, MVT::i8, 16}, + {Intrinsic::vp_cttz, MVT::i16, 23}, + {Intrinsic::vp_cttz, MVT::i32, 24}, + {Intrinsic::vp_cttz, MVT::i64, 25}, }; static unsigned getISDForVPIntrinsicID(Intrinsic::ID ID) { @@ -1251,10 +818,13 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, } if (ST->hasVInstructions() && RetTy->isVectorTy()) { - auto LT = getTypeLegalizationCost(RetTy); - if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable, - ICA.getID(), LT.second)) - return LT.first * Entry->Cost; + if (auto LT = getTypeLegalizationCost(RetTy); + LT.second.isVector()) { + MVT EltTy = LT.second.getVectorElementType(); + if (const auto *Entry = CostTableLookup(VectorIntrinsicCostTable, + ICA.getID(), EltTy)) + return LT.first * Entry->Cost; + } } return BaseT::getIntrinsicInstrCost(ICA, CostKind); From 9299ca797ae6d0c5ac008ef2126ab53f214daa21 Mon Sep 17 00:00:00 2001 From: yonillasky Date: Fri, 19 Jan 2024 03:46:15 +0200 Subject: [PATCH 065/843] [Coroutines] Fix inline comment about frame layout (#78626) `ResumeIndex` isn't part of the frame struct header, so it necessarily appears after the promise. Co-authored-by: Yoni Lavi --- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index 429344652a4c4..db09337215c10 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -1289,8 +1289,8 @@ static void buildFrameDebugInfo(Function &F, coro::Shape &Shape, // struct f.frame { // ResumeFnTy ResumeFnAddr; // ResumeFnTy DestroyFnAddr; -// int ResumeIndex; // ... promise (if present) ... +// int ResumeIndex; // ... spills ... // }; static StructType *buildFrameType(Function &F, coro::Shape &Shape, From e81c981fe35483a4b7474e110ff1b5d02bc2cb00 Mon Sep 17 00:00:00 2001 From: Younan Zhang Date: Fri, 19 Jan 2024 10:54:47 +0800 Subject: [PATCH 066/843] [clangd] Don't collect templated decls for builtin templates (#78466) Builtin templates e.g. `__make_integer_seq`, `__type_pack_element` are such that they don't have alias *Decls*. [D133262](https://reviews.llvm.org/D133262) marked these as alias templates, resulting in an attempt to collect their null "using" Decls within our `TargetFinder`. This fixes https://github.com/clangd/clangd/issues/1906. 
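For context, a minimal sketch (not part of the patch) of the two builtins involved; both behave like alias templates, yet there is no alias declaration behind them to report:

    template <class T, T... Ints> struct integer_sequence {};
    // __make_integer_seq<S, T, N> names S<T, 0, 1, ..., N-1>:
    using Seq = __make_integer_seq<integer_sequence, int, 3>; // integer_sequence<int, 0, 1, 2>
    // __type_pack_element<I, Ts...> names the I-th type of the pack:
    using Elem = __type_pack_element<1, char, int, long>;     // int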
---
 clang-tools-extra/clangd/FindTarget.cpp       | 13 ++++++---
 .../clangd/unittests/FindTargetTests.cpp      | 27 +++++++++++++++++++
 2 files changed, 37 insertions(+), 3 deletions(-)

diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp
index 839cf6332fe8b..3d73e77b16aff 100644
--- a/clang-tools-extra/clangd/FindTarget.cpp
+++ b/clang-tools-extra/clangd/FindTarget.cpp
@@ -443,9 +443,16 @@ struct TargetFinder {
         Outer.add(TST->getAliasedType(), Flags | Rel::Underlying);
         // Don't *traverse* the alias, which would result in traversing the
         // template of the underlying type.
-        Outer.report(
-            TST->getTemplateName().getAsTemplateDecl()->getTemplatedDecl(),
-            Flags | Rel::Alias | Rel::TemplatePattern);
+
+        TemplateDecl *TD = TST->getTemplateName().getAsTemplateDecl();
+        // Builtin templates e.g. __make_integer_seq, __type_pack_element
+        // are such that they don't have alias *decls*. Even then, we still
+        // traverse their desugared *types* so that instantiated decls are
+        // collected.
+        if (llvm::isa<BuiltinTemplateDecl>(TD))
+          return;
+        Outer.report(TD->getTemplatedDecl(),
+                     Flags | Rel::Alias | Rel::TemplatePattern);
       }
       // specializations of template template parameters aren't instantiated
       // into decls, so they must refer to the parameter itself.
diff --git a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
index fbd10c4a47a79..29cff68cf03b2 100644
--- a/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
+++ b/clang-tools-extra/clangd/unittests/FindTargetTests.cpp
@@ -709,6 +709,33 @@ TEST_F(TargetDeclTest, TypeAliasTemplate) {
                Rel::Alias | Rel::TemplatePattern});
 }
 
+TEST_F(TargetDeclTest, BuiltinTemplates) {
+  Code = R"cpp(
+    template <class T, T... Index> struct integer_sequence {};
+    [[__make_integer_seq]]<integer_sequence, int, 3> X;
+  )cpp";
+  EXPECT_DECLS(
+      "TemplateSpecializationTypeLoc",
+      {"struct integer_sequence", Rel::TemplatePattern | Rel::Underlying},
+      {"template<> struct integer_sequence<int, <0, 1, 2>>",
+       Rel::TemplateInstantiation | Rel::Underlying});
+
+  // Dependent context.
+  Code = R"cpp(
+    template <class T, T... Index> struct integer_sequence;
+
+    template <class T, int N>
+    using make_integer_sequence = [[__make_integer_seq]]<integer_sequence, T, N>;
+  )cpp";
+  EXPECT_DECLS("TemplateSpecializationTypeLoc");
+
+  Code = R"cpp(
+    template <int N, class... Pack>
+    using type_pack_element = [[__type_pack_element]]<N, Pack...>;
+  )cpp";
+  EXPECT_DECLS("TemplateSpecializationTypeLoc");
+}
+
 TEST_F(TargetDeclTest, MemberOfTemplate) {
   Code = R"cpp(
     template <typename T> struct Foo {
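As a sketch of the intended semantics, derived from the two lit tests in this patch rather than additional data: merging profiles that carry the header keeps it, and merging plain profiles must no longer invent it.

    # merge of two BAT-collected profiles: header is preserved
    boltedcollection
    1 main 451 1 SolveCubic 0 0 302

    # merge of two plain profiles: no header, counters only
    1 main 451 1 SolveCubic 0 0 302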
---
 bolt/lib/Core/BinaryEmitter.cpp               |   1 +
 bolt/test/X86/merge-fdata-bat-mode.test       |   3 +-
 bolt/test/X86/merge-fdata-nobat-mode.test     |   6 +
 bolt/tools/merge-fdata/merge-fdata.cpp        |   2 +-
 clang/include/clang/Driver/Options.td         |   4 +
 clang/lib/Driver/ToolChains/Gnu.cpp           |  29 +++
 cross-project-tests/lit.cfg.py                |  14 +-
 cross-project-tests/lit.site.cfg.py.in        |   4 +
 lldb/test/API/lit.cfg.py                      |   5 +
 lldb/test/API/lit.site.cfg.py.in              |   8 +
 lldb/test/Shell/helper/toolchain.py           |   5 +
 lldb/test/Shell/lit.site.cfg.py.in            |   9 +
 llvm/CMakeLists.txt                           |   4 +
 llvm/include/llvm/MC/MCFragment.h             |  22 ++
 llvm/include/llvm/MC/MCObjectStreamer.h       |   2 +
 llvm/include/llvm/MC/MCStreamer.h             |   6 +
 llvm/lib/MC/MCAssembler.cpp                   | 118 ++++++----
 llvm/lib/MC/MCFragment.cpp                    |  12 +
 llvm/lib/MC/MCObjectStreamer.cpp              |   5 +
 llvm/lib/MC/MCStreamer.cpp                    |   2 +
 .../lib/Target/X86/AsmParser/X86AsmParser.cpp |  24 ++
 llvm/test/MC/X86/directive-avoid_end_align.s  | 208 ++++++++++++++++++
 22 files changed, 452 insertions(+), 41 deletions(-)
 create mode 100644 bolt/test/X86/merge-fdata-nobat-mode.test
 create mode 100644 llvm/test/MC/X86/directive-avoid_end_align.s

diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp
index 3bff3125a57a8..190c700e2c303 100644
--- a/bolt/lib/Core/BinaryEmitter.cpp
+++ b/bolt/lib/Core/BinaryEmitter.cpp
@@ -485,6 +485,7 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF,
       // This assumes the second instruction in the macro-op pair will get
       // assigned to its own MCRelaxableFragment. Since all JCC instructions
       // are relaxable, we should be safe.
+      Streamer.emitNeverAlignCodeAtEnd(/*Alignment to avoid=*/64, *BC.STI);
     }
 
     if (!EmitCodeOnly) {
diff --git a/bolt/test/X86/merge-fdata-bat-mode.test b/bolt/test/X86/merge-fdata-bat-mode.test
index ea3e4c3e4afcc..41738e196b5d3 100644
--- a/bolt/test/X86/merge-fdata-bat-mode.test
+++ b/bolt/test/X86/merge-fdata-bat-mode.test
@@ -4,6 +4,5 @@ RUN: merge-fdata %S/Inputs/bat_profile_1.fdata \
 RUN:             %S/Inputs/bat_profile_2.fdata \
 RUN:   | FileCheck %s --check-prefix=CHECK-FDATA
 
+CHECK-FDATA: boltedcollection
 CHECK-FDATA: 1 main 451 1 SolveCubic 0 0 302
-
-
diff --git a/bolt/test/X86/merge-fdata-nobat-mode.test b/bolt/test/X86/merge-fdata-nobat-mode.test
new file mode 100644
index 0000000000000..870d9f880e286
--- /dev/null
+++ b/bolt/test/X86/merge-fdata-nobat-mode.test
@@ -0,0 +1,6 @@
+# Check that merge-fdata tool doesn't spuriously print boltedcollection
+
+RUN: merge-fdata %S/Inputs/blarge.fdata %S/Inputs/blarge.fdata \
+RUN:   | FileCheck %s --check-prefix=CHECK-FDATA
+
+CHECK-FDATA-NOT: boltedcollection
diff --git a/bolt/tools/merge-fdata/merge-fdata.cpp b/bolt/tools/merge-fdata/merge-fdata.cpp
index e21aeba73aca2..104991dabc5f0 100644
--- a/bolt/tools/merge-fdata/merge-fdata.cpp
+++ b/bolt/tools/merge-fdata/merge-fdata.cpp
@@ -329,7 +329,7 @@ void mergeLegacyProfiles(const SmallVectorImpl<std::string> &Filenames) {
     MergedProfile.insert_or_assign(Key, Count);
   }
 
-  if (BoltedCollection)
+  if (BoltedCollection.value_or(false))
     output() << "boltedcollection\n";
   for (const auto &[Key, Value] : MergedProfile)
     output() << Key << " " << Value << "\n";
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index d2e6c3ff721c2..a6da4b9160b3c 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -5256,6 +5256,10 @@ def pg : Flag<["-"], "pg">, HelpText<"Enable mcount instrumentation">,
   MarshallingInfoFlag<CodeGenOpts<"InstrumentForProfiling">>;
 def pipe : Flag<["-", "--"], "pipe">,
   HelpText<"Use pipes between commands, 
when possible">;
+// Facebook T92898286
+def post_link_optimize : Flag<["--"], "post-link-optimize">,
+  HelpText<"Apply post-link optimizations using BOLT">;
+// End Facebook T92898286
 def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">;
 def prebind : Flag<["-"], "prebind">;
 def preload : Flag<["-"], "preload">;
diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp
index e5e1b1d772697..8f30b5f7da237 100644
--- a/clang/lib/Driver/ToolChains/Gnu.cpp
+++ b/clang/lib/Driver/ToolChains/Gnu.cpp
@@ -668,12 +668,41 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA,
     }
   }
 
+  // Facebook T92898286
+  if (Args.hasArg(options::OPT_post_link_optimize))
+    CmdArgs.push_back("-q");
+  // End Facebook T92898286
+
   Args.AddAllArgs(CmdArgs, options::OPT_T);
 
   const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath());
   C.addCommand(std::make_unique<Command>(JA, *this,
                                          ResponseFileSupport::AtFileCurCP(),
                                          Exec, CmdArgs, Inputs, Output));
+  // Facebook T92898286
+  if (!Args.hasArg(options::OPT_post_link_optimize) || !Output.isFilename())
+    return;
+
+  const char *MvExec = Args.MakeArgString(ToolChain.GetProgramPath("mv"));
+  ArgStringList MoveCmdArgs;
+  MoveCmdArgs.push_back(Output.getFilename());
+  const char *PreBoltBin =
+      Args.MakeArgString(Twine(Output.getFilename()) + ".pre-bolt");
+  MoveCmdArgs.push_back(PreBoltBin);
+  C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
+                                         MvExec, MoveCmdArgs, std::nullopt));
+
+  ArgStringList BoltCmdArgs;
+  const char *BoltExec =
+      Args.MakeArgString(ToolChain.GetProgramPath("llvm-bolt"));
+  BoltCmdArgs.push_back(PreBoltBin);
+  BoltCmdArgs.push_back("-reorder-blocks=reverse");
+  BoltCmdArgs.push_back("-update-debug-sections");
+  BoltCmdArgs.push_back("-o");
+  BoltCmdArgs.push_back(Output.getFilename());
+  C.addCommand(std::make_unique<Command>(JA, *this, ResponseFileSupport::None(),
+                                         BoltExec, BoltCmdArgs, std::nullopt));
+  // End Facebook T92898286
 }
 
 void tools::gnutools::Assembler::ConstructJob(Compilation &C,
diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py
index 774c4eaf4d976..619634578dfe6 100644
--- a/cross-project-tests/lit.cfg.py
+++ b/cross-project-tests/lit.cfg.py
@@ -84,7 +84,13 @@ def get_required_attr(config, attr_name):
 # use_clang() and use_lld() respectively, so set them to "", if needed.
 if not hasattr(config, "clang_src_dir"):
     config.clang_src_dir = ""
-llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects))
+# Facebook T92898286
+should_test_bolt = get_required_attr(config, "llvm_test_bolt")
+if should_test_bolt:
+    llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects), additional_flags=["--post-link-optimize"])
+else:
+    llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects))
+# End Facebook T92898286
 
 if not hasattr(config, "lld_src_dir"):
     config.lld_src_dir = ""
@@ -293,3 +299,9 @@ def get_clang_default_dwarf_version_string(triple):
 # Allow 'REQUIRES: XXX-registered-target' in tests.
 for arch in config.targets_to_build:
     config.available_features.add(arch.lower() + "-registered-target")
+
+# Facebook T92898286
+# Ensure the user's PYTHONPATH is included.
+if "PYTHONPATH" in os.environ: + config.environment["PYTHONPATH"] = os.environ["PYTHONPATH"] +# End Facebook T92898286 diff --git a/cross-project-tests/lit.site.cfg.py.in b/cross-project-tests/lit.site.cfg.py.in index 39458dfc79afd..2d53cd377f033 100644 --- a/cross-project-tests/lit.site.cfg.py.in +++ b/cross-project-tests/lit.site.cfg.py.in @@ -21,6 +21,10 @@ config.mlir_src_root = "@MLIR_SOURCE_DIR@" config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 12675edc0fd3b..5358df98366fd 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -244,6 +244,11 @@ def delete_module_cache(path): if is_configured("lldb_framework_dir"): dotest_cmd += ["--framework", config.lldb_framework_dir] +# Facebook T92898286 +if is_configured("llvm_test_bolt"): + dotest_cmd += ["-E", '"--post-link-optimize"'] +# End Facebook T92898286 + if ( "lldb-repro-capture" in config.available_features or "lldb-repro-replay" in config.available_features diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 053331dc4881f..1da91d8fb5508 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -1,5 +1,9 @@ @LIT_SITE_CFG_IN_HEADER@ +#Facebook T92898286 +import lit.util +#End Facebook T92898286 + config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -39,6 +43,10 @@ config.libcxx_include_target_dir = "@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@" config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-api") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-api") +# Facebook T92898286 +config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") +# End Facebook T92898286 + # Plugins lldb_build_intel_pt = '@LLDB_BUILD_INTEL_PT@' if lldb_build_intel_pt == '1': diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 255955fc70d8c..7b7be06643166 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -165,6 +165,11 @@ def use_support_substitutions(config): if config.cmake_sysroot: host_flags += ["--sysroot={}".format(config.cmake_sysroot)] + # Facebook T92898286 + if config.llvm_test_bolt: + host_flags += ["--post-link-optimize"] + # End Facebook T92898286 + host_flags = " ".join(host_flags) config.substitutions.append(("%clang_host", "%clang " + host_flags)) config.substitutions.append(("%clangxx_host", "%clangxx " + host_flags)) diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in index 736dfc335732b..3b4d99aa076a4 100644 --- a/lldb/test/Shell/lit.site.cfg.py.in +++ b/lldb/test/Shell/lit.site.cfg.py.in @@ -1,5 +1,10 @@ @LIT_SITE_CFG_IN_HEADER@ +#Facebook T92898286 +import lit.util +#End Facebook T92898286 + + config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -30,6 +35,10 @@ config.lldb_system_debugserver = @LLDB_USE_SYSTEM_DEBUGSERVER@ config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell") +# Facebook T92898286 +config.llvm_test_bolt = 
lit.util.pythonize_bool("@LLVM_TEST_BOLT@")
+# End Facebook T92898286
+
 import lit.llvm
 lit.llvm.initialize(lit_config, config)
 
diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt
index 61ab69d237470..a1f7a94e62afa 100644
--- a/llvm/CMakeLists.txt
+++ b/llvm/CMakeLists.txt
@@ -689,6 +689,10 @@ set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH
 option(LLVM_USE_SPLIT_DWARF
   "Use -gsplit-dwarf when compiling llvm and --gdb-index when linking." OFF)
 
+# Facebook T92898286
+option(LLVM_TEST_BOLT "Enable BOLT testing in non-BOLT tests that use clang" OFF)
+# End Facebook T92898286
+
 # Define an option controlling whether we should build for 32-bit on 64-bit
 # platforms, where supported.
 if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT (WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX"))
diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h
index a9b19dc56f16a..256d98423e030 100644
--- a/llvm/include/llvm/MC/MCFragment.h
+++ b/llvm/include/llvm/MC/MCFragment.h
@@ -33,6 +33,7 @@ class MCFragment : public ilist_node_with_parent<MCFragment, MCSection> {
 public:
   enum FragmentType : uint8_t {
     FT_Align,
+    FT_NeverAlign,
     FT_Data,
     FT_CompactEncodedInst,
     FT_Fill,
@@ -344,6 +345,27 @@ class MCAlignFragment : public MCFragment {
   }
 };
 
+class MCNeverAlignFragment : public MCFragment {
+  /// The alignment the end of the next fragment should avoid.
+  unsigned Alignment;
+
+  /// When emitting Nops some subtargets have specific nop encodings.
+  const MCSubtargetInfo &STI;
+
+public:
+  MCNeverAlignFragment(unsigned Alignment, const MCSubtargetInfo &STI,
+                       MCSection *Sec = nullptr)
+      : MCFragment(FT_NeverAlign, false, Sec), Alignment(Alignment), STI(STI) {}
+
+  unsigned getAlignment() const { return Alignment; }
+
+  const MCSubtargetInfo &getSubtargetInfo() const { return STI; }
+
+  static bool classof(const MCFragment *F) {
+    return F->getKind() == MCFragment::FT_NeverAlign;
+  }
+};
+
 class MCFillFragment : public MCFragment {
   uint8_t ValueSize;
   /// Value to use for filling bytes.
diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h
index 5e5b4b3150170..3c6fd9301647d 100644
--- a/llvm/include/llvm/MC/MCObjectStreamer.h
+++ b/llvm/include/llvm/MC/MCObjectStreamer.h
@@ -157,6 +157,8 @@ class MCObjectStreamer : public MCStreamer {
                             unsigned MaxBytesToEmit = 0) override;
   void emitCodeAlignment(Align ByteAlignment, const MCSubtargetInfo *STI,
                          unsigned MaxBytesToEmit = 0) override;
+  void emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+                               const MCSubtargetInfo &STI) override;
   void emitValueToOffset(const MCExpr *Offset, unsigned char Value,
                          SMLoc Loc) override;
   void emitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column,
diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h
index 3bf2d22e18235..87691a6ec0e08 100644
--- a/llvm/include/llvm/MC/MCStreamer.h
+++ b/llvm/include/llvm/MC/MCStreamer.h
@@ -885,6 +885,12 @@ class MCStreamer {
   virtual void emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI,
                                  unsigned MaxBytesToEmit = 0);
 
+  /// If the end of the fragment following this NeverAlign fragment ever gets
+  /// aligned to \p ByteAlignment, this fragment emits a single nop before the
+  /// following fragment to break this end-alignment.
+  virtual void emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+                                       const MCSubtargetInfo &STI);
+
   /// Emit some number of copies of \p Value until the byte offset \p
   /// Offset is reached.
/// diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index ad30b5ce9e631..62baeb93ea7d0 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -298,6 +298,43 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, const MCFixup &Fixup, return IsResolved; } +/// Check if the branch crosses the boundary. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch cross the boundary. +static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + uint64_t EndAddr = StartAddr + Size; + return (StartAddr >> Log2(BoundaryAlignment)) != + ((EndAddr - 1) >> Log2(BoundaryAlignment)); +} + +/// Check if the branch is against the boundary. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch is against the boundary. +static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + uint64_t EndAddr = StartAddr + Size; + return (EndAddr & (BoundaryAlignment.value() - 1)) == 0; +} + +/// Check if the branch needs padding. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch needs padding. +static bool needPadding(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) || + isAgainstBoundary(StartAddr, Size, BoundaryAlignment); +} + uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, const MCFragment &F) const { assert(getBackendPtr() && "Requires assembler backend"); @@ -358,6 +395,41 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, return Size; } + case MCFragment::FT_NeverAlign: { + // Disclaimer: NeverAlign fragment size depends on the size of its immediate + // successor, but NeverAlign need not be a MCRelaxableFragment. + // NeverAlign fragment size is recomputed if the successor is relaxed: + // - If RelaxableFragment is relaxed, it gets invalidated by marking its + // predecessor as LastValidFragment. + // - This forces the assembler to call MCAsmLayout::layoutFragment on that + // relaxable fragment, which in turn will always ask the predecessor to + // compute its size (see "computeFragmentSize(prev)" in layoutFragment). + // + // In short, the simplest way to ensure that computeFragmentSize() is sane + // is to establish the following rule: it should never examine fragments + // after the current fragment in the section. If we logically need to + // examine any fragment after the current fragment, we need to do that using + // relaxation, inside MCAssembler::layoutSectionOnce. 
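+    // Concretely: if the fragment that immediately follows would end exactly
+    // on a multiple of the avoided alignment (Offset + NextFragSize), this
+    // fragment reports a minimum-nop size so that end is shifted off the
+    // boundary; otherwise it occupies no bytes.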
+    const MCNeverAlignFragment &NAF = cast<MCNeverAlignFragment>(F);
+    const MCFragment *NF = F.getNextNode();
+    uint64_t Offset = Layout.getFragmentOffset(&NAF);
+    size_t NextFragSize = 0;
+    if (const auto *NextFrag = dyn_cast<MCDataFragment>(NF)) {
+      NextFragSize = NextFrag->getContents().size();
+    } else if (const auto *NextFrag = dyn_cast<MCRelaxableFragment>(NF)) {
+      NextFragSize = NextFrag->getContents().size();
+    } else {
+      llvm_unreachable("Didn't find the expected fragment after NeverAlign");
+    }
+    // Check if the next fragment ends at the alignment we want to avoid.
+    if (isAgainstBoundary(Offset, NextFragSize, Align(NAF.getAlignment()))) {
+      // Avoid this alignment by introducing a minimum-size nop.
+      assert(getBackend().getMinimumNopSize() != NAF.getAlignment());
+      return getBackend().getMinimumNopSize();
+    }
+    return 0;
+  }
+
   case MCFragment::FT_Org: {
     const MCOrgFragment &OF = cast<MCOrgFragment>(F);
     MCValue Value;
@@ -581,6 +653,15 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm,
     break;
   }
 
+  case MCFragment::FT_NeverAlign: {
+    const MCNeverAlignFragment &NAF = cast<MCNeverAlignFragment>(F);
+    if (!Asm.getBackend().writeNopData(OS, FragmentSize,
+                                       &NAF.getSubtargetInfo()))
+      report_fatal_error("unable to write nop sequence of " +
+                         Twine(FragmentSize) + " bytes");
+    break;
+  }
+
   case MCFragment::FT_Data:
     ++stats::EmittedDataFragments;
     OS << cast<MCDataFragment>(F).getContents();
@@ -1052,43 +1133,6 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) {
   return OldSize != LF.getContents().size();
 }
 
-/// Check if the branch crosses the boundary.
-///
-/// \param StartAddr start address of the fused/unfused branch.
-/// \param Size size of the fused/unfused branch.
-/// \param BoundaryAlignment alignment requirement of the branch.
-/// \returns true if the branch cross the boundary.
-static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size,
-                             Align BoundaryAlignment) {
-  uint64_t EndAddr = StartAddr + Size;
-  return (StartAddr >> Log2(BoundaryAlignment)) !=
-         ((EndAddr - 1) >> Log2(BoundaryAlignment));
-}
-
-/// Check if the branch is against the boundary.
-///
-/// \param StartAddr start address of the fused/unfused branch.
-/// \param Size size of the fused/unfused branch.
-/// \param BoundaryAlignment alignment requirement of the branch.
-/// \returns true if the branch is against the boundary.
-static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size,
-                              Align BoundaryAlignment) {
-  uint64_t EndAddr = StartAddr + Size;
-  return (EndAddr & (BoundaryAlignment.value() - 1)) == 0;
-}
-
-/// Check if the branch needs padding.
-///
-/// \param StartAddr start address of the fused/unfused branch.
-/// \param Size size of the fused/unfused branch.
-/// \param BoundaryAlignment alignment requirement of the branch.
-/// \returns true if the branch needs padding.
-static bool needPadding(uint64_t StartAddr, uint64_t Size,
-                        Align BoundaryAlignment) {
-  return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) ||
-         isAgainstBoundary(StartAddr, Size, BoundaryAlignment);
-}
-
 bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout,
                                      MCBoundaryAlignFragment &BF) {
   // BoundaryAlignFragment that doesn't need to align any fragment should not be
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index a8da46dbd8727..2626da7e0391a 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -274,6 +274,9 @@ void MCFragment::destroy() {
     case FT_Align:
       delete cast<MCAlignFragment>(this);
       return;
+    case FT_NeverAlign:
+      delete cast<MCNeverAlignFragment>(this);
+      return;
     case FT_Data:
       delete cast<MCDataFragment>(this);
       return;
@@ -342,6 +345,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
   OS << "<";
   switch (getKind()) {
   case MCFragment::FT_Align: OS << "MCAlignFragment"; break;
+  case MCFragment::FT_NeverAlign:
+    OS << "MCNeverAlignFragment";
+    break;
   case MCFragment::FT_Data: OS << "MCDataFragment"; break;
   case MCFragment::FT_CompactEncodedInst:
     OS << "MCCompactEncodedInstFragment"; break;
@@ -381,6 +387,12 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
        << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">";
     break;
   }
+  case MCFragment::FT_NeverAlign: {
+    const MCNeverAlignFragment *NAF = cast<MCNeverAlignFragment>(this);
+    OS << "\n       ";
+    OS << " Alignment:" << NAF->getAlignment() << ">";
+    break;
+  }
   case MCFragment::FT_Data: {
     const auto *DF = cast<MCDataFragment>(this);
     OS << "\n       ";
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index d11ccfb5e269f..28927ec57a8f9 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -659,6 +659,11 @@ void MCObjectStreamer::emitCodeAlignment(Align Alignment,
   cast<MCAlignFragment>(getCurrentFragment())->setEmitNops(true, STI);
 }
 
+void MCObjectStreamer::emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+                                               const MCSubtargetInfo &STI) {
+  insert(new MCNeverAlignFragment(ByteAlignment, STI));
+}
+
 void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset,
                                          unsigned char Value,
                                          SMLoc Loc) {
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index bc145aef0b8c5..5738ae252ff27 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -1234,6 +1234,8 @@ void MCStreamer::emitValueToAlignment(Align Alignment, int64_t Value,
                                       unsigned MaxBytesToEmit) {}
 void MCStreamer::emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI,
                                    unsigned MaxBytesToEmit) {}
+void MCStreamer::emitNeverAlignCodeAtEnd(unsigned ByteAlignment,
+                                         const MCSubtargetInfo &STI) {}
 void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value,
                                    SMLoc Loc) {}
 void MCStreamer::emitBundleAlignMode(Align Alignment) {}
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index 051f6caa8c047..2d66d43455df5 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -1152,6 +1152,7 @@ class X86AsmParser : public MCTargetAsmParser {
   bool parseDirectiveArch();
   bool parseDirectiveNops(SMLoc L);
   bool parseDirectiveEven(SMLoc L);
+  bool parseDirectiveAvoidEndAlign(SMLoc L);
   bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
 
   /// CodeView FPO data directives.
@@ -4533,6 +4534,8 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return false; } else if (IDVal == ".nops") return parseDirectiveNops(DirectiveID.getLoc()); + else if (IDVal == ".avoid_end_align") + return parseDirectiveAvoidEndAlign(DirectiveID.getLoc()); else if (IDVal == ".even") return parseDirectiveEven(DirectiveID.getLoc()); else if (IDVal == ".cv_fpo_proc") @@ -4627,6 +4630,27 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) { return false; } +/// Directive for NeverAlign fragment testing, not for general usage! +/// parseDirectiveAvoidEndAlign +/// ::= .avoid_end_align alignment +bool X86AsmParser::parseDirectiveAvoidEndAlign(SMLoc L) { + int64_t Alignment = 0; + SMLoc AlignmentLoc; + AlignmentLoc = getTok().getLoc(); + if (getParser().checkForValidSection() || + getParser().parseAbsoluteExpression(Alignment)) + return true; + + if (getParser().parseEOL("unexpected token in directive")) + return true; + + if (Alignment <= 0) + return Error(AlignmentLoc, "expected a positive alignment"); + + getParser().getStreamer().emitNeverAlignCodeAtEnd(Alignment, getSTI()); + return false; +} + /// ParseDirectiveCode /// ::= .code16 | .code32 | .code64 bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { diff --git a/llvm/test/MC/X86/directive-avoid_end_align.s b/llvm/test/MC/X86/directive-avoid_end_align.s new file mode 100644 index 0000000000000..1d748401edc12 --- /dev/null +++ b/llvm/test/MC/X86/directive-avoid_end_align.s @@ -0,0 +1,208 @@ +# RUN: llvm-mc -triple=x86_64 -filetype=obj %s | llvm-objdump --no-show-raw-insn -d - | FileCheck %s +# RUN: not llvm-mc -triple=x86_64 --defsym ERR=1 %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR + +# avoid_end_align has no effect since test doesn't end at alignment boundary: +.avoid_end_align 64 +# CHECK-NOT: nop + testl %eax, %eax +# CHECK: testl %eax, %eax + je .LBB0 + +.fill 58, 1, 0x00 +# NeverAlign followed by MCDataFragment: +# avoid_end_align inserts nop because `test` would end at alignment boundary: +.avoid_end_align 64 +# CHECK: 3e: nop + testl %eax, %eax +# CHECK-NEXT: 3f: testl %eax, %eax + je .LBB0 +# CHECK-NEXT: 41: je +.LBB0: + retq + +.p2align 6 +.L0: +.nops 57 + int3 +# NeverAlign followed by RelaxableFragment: +.avoid_end_align 64 +# CHECK: ba: nop + cmpl $(.L1-.L0), %eax +# CHECK-NEXT: bb: cmpl + je .L0 +# CHECK-NEXT: c1: je +.nops 65 +.L1: + +############################################################################### +# Experiment A: +# Check that NeverAlign doesn't introduce infinite loops in layout. +# Control: +# 1. NeverAlign fragment is not added, +# 2. Short formats of cmp and jcc are used (3 and 2 bytes respectively), +# 3. cmp and jcc are placed such that to be split by 64B alignment boundary. +# 4. jcc would be relaxed to a longer format if at least one byte is added +# between .L10 and je itself, e.g. by adding a NeverAlign padding byte, +# or relaxing cmp instruction. +# 5. cmp would be relaxed to a longer format if at least one byte is added +# between .L11 and .L12, e.g. due to relaxing jcc instruction. +.p2align 6 +# CHECK: 140: int3 +.fill 2, 1, 0xcc +.L10: +.nops 122 + int3 +# CHECK: 1bc: int3 +# no avoid_end_align here +# CHECK-NOT: nop + cmp $(.L12-.L11), %eax +# CHECK: 1bd: cmpl +.L11: + je .L10 +# CHECK-NEXT: 1c0: je +.nops 125 +.L12: + +# Experiment: +# Same setup as control, except NeverAlign fragment is added before cmp. +# Expected effect: +# 1. NeverAlign pads cmp+jcc by one byte since cmp and jcc are split by a 64B +# alignment boundary, +# 2. 
This extra byte forces jcc relaxation to a longer format (Control rule #4), +# 3. This results in an cmp relaxation (Control rule #5), +# 4. Which in turn makes NeverAlign fragment unnecessary as cmp and jcc +# are no longer split by an alignment boundary (cmp crosses the boundary). +# 5. NeverAlign padding is removed. +# 6. cmp and jcc instruction remain in relaxed form. +# 7. Relaxation converges, layout succeeds. +.p2align 6 +# CHECK: 240: int3 +.fill 2, 1, 0xcc +.L20: +.nops 122 + int3 +# CHECK: 2bc: int3 +.avoid_end_align 64 +# CHECK-NOT: nop + cmp $(.L22-.L21), %eax +# CHECK-NEXT: 2bd: cmpl +.L21: + je .L20 +# CHECK-NEXT: 2c3: je +.nops 125 +.L22: + +############################################################################### +# Experiment B: similar to exp A, but we check that once NeverAlign padding is +# removed from the layout (exp A, experiment step 5), the increased distance +# between the symbols L33 and L34 triggers the relaxation of instruction at +# label L32. +# +# Control 1: using a one-byte instruction at L33 (site of NeverAlign) leads to +# steps 2-3 of exp A, experiment: +# 2. This extra byte forces jcc relaxation to a longer format (Control rule #4), +# 3. This results in an cmp relaxation (Control rule #5), +# => short cmp under L32 +.p2align 6 +# CHECK: 380: int3 +.fill 2, 1, 0xcc +.L30: +.nops 122 + int3 +# CHECK: 3fc: int3 + hlt +#.avoid_end_align 64 +.L33: + cmp $(.L32-.L31), %eax +# CHECK: 3fe: cmpl +.L31: + je .L30 +# CHECK-NEXT: 404: je +.nops 114 +.p2align 1 + int3 + int3 +# CHECK: 47c: int3 +.L34: +.nops 9 +.L32: + cmp $(.L33-.L34), %eax +# CHECK: 487: cmp +# note that the size of cmp is 48a-487 == 3 bytes (distance is exactly -128) + int3 +# CHECK-NEXT: 48a: int3 + +# Control 2: leaving out a byte at L43 (site of NeverAlign), plus +# relaxed jcc and cmp leads to a relaxed cmp under L42 (-129 as cmp's immediate) +.p2align 6 +# CHECK: 4c0: int3 +.fill 2, 1, 0xcc +.L40: +.nops 122 + int3 +# CHECK: 53c: int3 +# int3 +#.avoid_end_align 64 +.L43: + cmp $(.L42-.L41+0x100), %eax +# CHECK: 53d: cmpl +.L41: + je .L40+0x100 +# CHECK-NEXT: 543: je +.nops 114 +.p2align 1 + int3 + int3 +# CHECK: 5bc: int3 +.L44: +.nops 9 +.L42: + cmp $(.L43-.L44), %eax +# CHECK: 5c7: cmp +# note that the size of cmp is 5cd-5c7 == 6 bytes (distance is exactly -129) + int3 +# CHECK-NEXT: 5cd: int3 + +# Experiment +# Checking if removing NeverAlign padding at L53 as a result of alignment and +# relaxation of cmp and jcc following it (see exp A), thus reproducing the case +# in Control 2 (getting a relaxed cmp under L52), is handled correctly. 
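+# (Encoding note for the -128/-129 distances checked in Experiment B: the
+# short cmp form sign-extends an 8-bit immediate, which covers [-128, 127].
+# A distance of exactly -128 still fits the 3-byte cmp, while -129 forces
+# the 6-byte imm32 form; that is what the size notes under L32/L42/L52
+# observe.)
+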
+.p2align 6 +# CHECK: 600: int3 +.fill 2, 1, 0xcc +.L50: +.nops 122 + int3 +# CHECK: 67c: int3 +.avoid_end_align 64 +.L53: +# CHECK-NOT: nop + cmp $(.L52-.L51), %eax +# CHECK-NEXT: 67d: cmpl +.L51: + je .L50 +# CHECK-NEXT: 683: je +.nops 114 +.p2align 1 + int3 + int3 +# CHECK: 6fc: int3 +.L54: +.nops 9 +.L52: + cmp $(.L53-.L54), %eax +# CHECK: 707: cmp +# note that the size of cmp is 70d-707 == 6 bytes (distance is exactly -129) + int3 +# CHECK-NEXT: 70d: int3 + +.ifdef ERR +# ERR: {{.*}}.s:[[#@LINE+1]]:17: error: unknown token in expression +.avoid_end_align +# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected absolute expression +.avoid_end_align x +# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected a positive alignment +.avoid_end_align 0 +# ERR: {{.*}}.s:[[#@LINE+1]]:20: error: unexpected token in directive +.avoid_end_align 64, 0 +.endif From 9fec33aadc56c8c4ad3778a92dc0aaa3201a63ae Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Thu, 18 Jan 2024 19:59:09 -0800 Subject: [PATCH 068/843] Revert "[BOLT] Fix unconditional output of boltedcollection in merge-fdata (#78653)" This reverts commit 82bc33ea3f1a539be50ed46919dc53fc6b685da9. Accidentally pushed unrelated changes. --- bolt/lib/Core/BinaryEmitter.cpp | 1 - bolt/test/X86/merge-fdata-bat-mode.test | 3 +- bolt/test/X86/merge-fdata-nobat-mode.test | 6 - bolt/tools/merge-fdata/merge-fdata.cpp | 2 +- clang/include/clang/Driver/Options.td | 4 - clang/lib/Driver/ToolChains/Gnu.cpp | 29 --- cross-project-tests/lit.cfg.py | 14 +- cross-project-tests/lit.site.cfg.py.in | 4 - lldb/test/API/lit.cfg.py | 5 - lldb/test/API/lit.site.cfg.py.in | 8 - lldb/test/Shell/helper/toolchain.py | 5 - lldb/test/Shell/lit.site.cfg.py.in | 9 - llvm/CMakeLists.txt | 4 - llvm/include/llvm/MC/MCFragment.h | 22 -- llvm/include/llvm/MC/MCObjectStreamer.h | 2 - llvm/include/llvm/MC/MCStreamer.h | 6 - llvm/lib/MC/MCAssembler.cpp | 118 ++++------ llvm/lib/MC/MCFragment.cpp | 12 - llvm/lib/MC/MCObjectStreamer.cpp | 5 - llvm/lib/MC/MCStreamer.cpp | 2 - .../lib/Target/X86/AsmParser/X86AsmParser.cpp | 24 -- llvm/test/MC/X86/directive-avoid_end_align.s | 208 ------------------ 22 files changed, 41 insertions(+), 452 deletions(-) delete mode 100644 bolt/test/X86/merge-fdata-nobat-mode.test delete mode 100644 llvm/test/MC/X86/directive-avoid_end_align.s diff --git a/bolt/lib/Core/BinaryEmitter.cpp b/bolt/lib/Core/BinaryEmitter.cpp index 190c700e2c303..3bff3125a57a8 100644 --- a/bolt/lib/Core/BinaryEmitter.cpp +++ b/bolt/lib/Core/BinaryEmitter.cpp @@ -485,7 +485,6 @@ void BinaryEmitter::emitFunctionBody(BinaryFunction &BF, FunctionFragment &FF, // This assumes the second instruction in the macro-op pair will get // assigned to its own MCRelaxableFragment. Since all JCC instructions // are relaxable, we should be safe. 
- Streamer.emitNeverAlignCodeAtEnd(/*Alignment to avoid=*/64, *BC.STI); } if (!EmitCodeOnly) { diff --git a/bolt/test/X86/merge-fdata-bat-mode.test b/bolt/test/X86/merge-fdata-bat-mode.test index 41738e196b5d3..ea3e4c3e4afcc 100644 --- a/bolt/test/X86/merge-fdata-bat-mode.test +++ b/bolt/test/X86/merge-fdata-bat-mode.test @@ -4,5 +4,6 @@ RUN: merge-fdata %S/Inputs/bat_profile_1.fdata \ RUN: %S/Inputs/bat_profile_2.fdata \ RUN: | FileCheck %s --check-prefix=CHECK-FDATA -CHECK-FDATA: boltedcollection CHECK-FDATA: 1 main 451 1 SolveCubic 0 0 302 + + diff --git a/bolt/test/X86/merge-fdata-nobat-mode.test b/bolt/test/X86/merge-fdata-nobat-mode.test deleted file mode 100644 index 870d9f880e286..0000000000000 --- a/bolt/test/X86/merge-fdata-nobat-mode.test +++ /dev/null @@ -1,6 +0,0 @@ -# Check that merge-fdata tool doesn't spuriously print boltedcollection - -RUN: merge-fdata %S/Inputs/blarge.fdata %S/Inputs/blarge.fdata \ -RUN: | FileCheck %s --check-prefix=CHECK-FDATA - -CHECK-FDATA-NOT: boltedcollection diff --git a/bolt/tools/merge-fdata/merge-fdata.cpp b/bolt/tools/merge-fdata/merge-fdata.cpp index 104991dabc5f0..e21aeba73aca2 100644 --- a/bolt/tools/merge-fdata/merge-fdata.cpp +++ b/bolt/tools/merge-fdata/merge-fdata.cpp @@ -329,7 +329,7 @@ void mergeLegacyProfiles(const SmallVectorImpl &Filenames) { MergedProfile.insert_or_assign(Key, Count); } - if (BoltedCollection.value_or(false)) + if (BoltedCollection) output() << "boltedcollection\n"; for (const auto &[Key, Value] : MergedProfile) output() << Key << " " << Value << "\n"; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index a6da4b9160b3c..d2e6c3ff721c2 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5256,10 +5256,6 @@ def pg : Flag<["-"], "pg">, HelpText<"Enable mcount instrumentation">, MarshallingInfoFlag>; def pipe : Flag<["-", "--"], "pipe">, HelpText<"Use pipes between commands, when possible">; -// Facebook T92898286 -def post_link_optimize : Flag<["--"], "post-link-optimize">, - HelpText<"Apply post-link optimizations using BOLT">; -// End Facebook T92898286 def prebind__all__twolevel__modules : Flag<["-"], "prebind_all_twolevel_modules">; def prebind : Flag<["-"], "prebind">; def preload : Flag<["-"], "preload">; diff --git a/clang/lib/Driver/ToolChains/Gnu.cpp b/clang/lib/Driver/ToolChains/Gnu.cpp index 8f30b5f7da237..e5e1b1d772697 100644 --- a/clang/lib/Driver/ToolChains/Gnu.cpp +++ b/clang/lib/Driver/ToolChains/Gnu.cpp @@ -668,41 +668,12 @@ void tools::gnutools::Linker::ConstructJob(Compilation &C, const JobAction &JA, } } - // Facebook T92898286 - if (Args.hasArg(options::OPT_post_link_optimize)) - CmdArgs.push_back("-q"); - // End Facebook T92898286 - Args.AddAllArgs(CmdArgs, options::OPT_T); const char *Exec = Args.MakeArgString(ToolChain.GetLinkerPath()); C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::AtFileCurCP(), Exec, CmdArgs, Inputs, Output)); - // Facebook T92898286 - if (!Args.hasArg(options::OPT_post_link_optimize) || !Output.isFilename()) - return; - - const char *MvExec = Args.MakeArgString(ToolChain.GetProgramPath("mv")); - ArgStringList MoveCmdArgs; - MoveCmdArgs.push_back(Output.getFilename()); - const char *PreBoltBin = - Args.MakeArgString(Twine(Output.getFilename()) + ".pre-bolt"); - MoveCmdArgs.push_back(PreBoltBin); - C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - MvExec, MoveCmdArgs, std::nullopt)); - - ArgStringList BoltCmdArgs; - const char *BoltExec = - 
Args.MakeArgString(ToolChain.GetProgramPath("llvm-bolt")); - BoltCmdArgs.push_back(PreBoltBin); - BoltCmdArgs.push_back("-reorder-blocks=reverse"); - BoltCmdArgs.push_back("-update-debug-sections"); - BoltCmdArgs.push_back("-o"); - BoltCmdArgs.push_back(Output.getFilename()); - C.addCommand(std::make_unique(JA, *this, ResponseFileSupport::None(), - BoltExec, BoltCmdArgs, std::nullopt)); - // End Facebook T92898286 } void tools::gnutools::Assembler::ConstructJob(Compilation &C, diff --git a/cross-project-tests/lit.cfg.py b/cross-project-tests/lit.cfg.py index 619634578dfe6..774c4eaf4d976 100644 --- a/cross-project-tests/lit.cfg.py +++ b/cross-project-tests/lit.cfg.py @@ -84,13 +84,7 @@ def get_required_attr(config, attr_name): # use_clang() and use_lld() respectively, so set them to "", if needed. if not hasattr(config, "clang_src_dir"): config.clang_src_dir = "" -# Facebook T92898286 -should_test_bolt = get_required_attr(config, "llvm_test_bolt") -if should_test_bolt: - llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects), additional_flags=["--post-link-optimize"]) -else: - llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects)) -# End Facebook T92898286 +llvm_config.use_clang(required=("clang" in config.llvm_enabled_projects)) if not hasattr(config, "lld_src_dir"): config.lld_src_dir = "" @@ -299,9 +293,3 @@ def get_clang_default_dwarf_version_string(triple): # Allow 'REQUIRES: XXX-registered-target' in tests. for arch in config.targets_to_build: config.available_features.add(arch.lower() + "-registered-target") - -# Facebook T92898286 -# Ensure the user's PYTHONPATH is included. -if "PYTHONPATH" in os.environ: - config.environment["PYTHONPATH"] = os.environ["PYTHONPATH"] -# End Facebook T92898286 diff --git a/cross-project-tests/lit.site.cfg.py.in b/cross-project-tests/lit.site.cfg.py.in index 2d53cd377f033..39458dfc79afd 100644 --- a/cross-project-tests/lit.site.cfg.py.in +++ b/cross-project-tests/lit.site.cfg.py.in @@ -21,10 +21,6 @@ config.mlir_src_root = "@MLIR_SOURCE_DIR@" config.llvm_use_sanitizer = "@LLVM_USE_SANITIZER@" -# Facebook T92898286 -config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") -# End Facebook T92898286 - import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/lldb/test/API/lit.cfg.py b/lldb/test/API/lit.cfg.py index 5358df98366fd..12675edc0fd3b 100644 --- a/lldb/test/API/lit.cfg.py +++ b/lldb/test/API/lit.cfg.py @@ -244,11 +244,6 @@ def delete_module_cache(path): if is_configured("lldb_framework_dir"): dotest_cmd += ["--framework", config.lldb_framework_dir] -# Facebook T92898286 -if is_configured("llvm_test_bolt"): - dotest_cmd += ["-E", '"--post-link-optimize"'] -# End Facebook T92898286 - if ( "lldb-repro-capture" in config.available_features or "lldb-repro-replay" in config.available_features diff --git a/lldb/test/API/lit.site.cfg.py.in b/lldb/test/API/lit.site.cfg.py.in index 1da91d8fb5508..053331dc4881f 100644 --- a/lldb/test/API/lit.site.cfg.py.in +++ b/lldb/test/API/lit.site.cfg.py.in @@ -1,9 +1,5 @@ @LIT_SITE_CFG_IN_HEADER@ -#Facebook T92898286 -import lit.util -#End Facebook T92898286 - config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -43,10 +39,6 @@ config.libcxx_include_target_dir = "@LIBCXX_GENERATED_INCLUDE_TARGET_DIR@" config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-api") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", 
"lldb-api") -# Facebook T92898286 -config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") -# End Facebook T92898286 - # Plugins lldb_build_intel_pt = '@LLDB_BUILD_INTEL_PT@' if lldb_build_intel_pt == '1': diff --git a/lldb/test/Shell/helper/toolchain.py b/lldb/test/Shell/helper/toolchain.py index 7b7be06643166..255955fc70d8c 100644 --- a/lldb/test/Shell/helper/toolchain.py +++ b/lldb/test/Shell/helper/toolchain.py @@ -165,11 +165,6 @@ def use_support_substitutions(config): if config.cmake_sysroot: host_flags += ["--sysroot={}".format(config.cmake_sysroot)] - # Facebook T92898286 - if config.llvm_test_bolt: - host_flags += ["--post-link-optimize"] - # End Facebook T92898286 - host_flags = " ".join(host_flags) config.substitutions.append(("%clang_host", "%clang " + host_flags)) config.substitutions.append(("%clangxx_host", "%clangxx " + host_flags)) diff --git a/lldb/test/Shell/lit.site.cfg.py.in b/lldb/test/Shell/lit.site.cfg.py.in index 3b4d99aa076a4..736dfc335732b 100644 --- a/lldb/test/Shell/lit.site.cfg.py.in +++ b/lldb/test/Shell/lit.site.cfg.py.in @@ -1,10 +1,5 @@ @LIT_SITE_CFG_IN_HEADER@ -#Facebook T92898286 -import lit.util -#End Facebook T92898286 - - config.llvm_src_root = "@LLVM_SOURCE_DIR@" config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") @@ -35,10 +30,6 @@ config.lldb_system_debugserver = @LLDB_USE_SYSTEM_DEBUGSERVER@ config.lldb_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_LLDB@", "lldb-shell") config.clang_module_cache = os.path.join("@LLDB_TEST_MODULE_CACHE_CLANG@", "lldb-shell") -# Facebook T92898286 -config.llvm_test_bolt = lit.util.pythonize_bool("@LLVM_TEST_BOLT@") -# End Facebook T92898286 - import lit.llvm lit.llvm.initialize(lit_config, config) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index a1f7a94e62afa..61ab69d237470 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -689,10 +689,6 @@ set(LLVM_LIB_FUZZING_ENGINE "" CACHE PATH option(LLVM_USE_SPLIT_DWARF "Use -gsplit-dwarf when compiling llvm and --gdb-index when linking." OFF) -# Facebook T92898286 -option(LLVM_TEST_BOLT "Enable BOLT testing in non-BOLT tests that use clang" OFF) -# End Facebook T92898286 - # Define an option controlling whether we should build for 32-bit on 64-bit # platforms, where supported. if( CMAKE_SIZEOF_VOID_P EQUAL 8 AND NOT (WIN32 OR ${CMAKE_SYSTEM_NAME} MATCHES "AIX")) diff --git a/llvm/include/llvm/MC/MCFragment.h b/llvm/include/llvm/MC/MCFragment.h index 256d98423e030..a9b19dc56f16a 100644 --- a/llvm/include/llvm/MC/MCFragment.h +++ b/llvm/include/llvm/MC/MCFragment.h @@ -33,7 +33,6 @@ class MCFragment : public ilist_node_with_parent { public: enum FragmentType : uint8_t { FT_Align, - FT_NeverAlign, FT_Data, FT_CompactEncodedInst, FT_Fill, @@ -345,27 +344,6 @@ class MCAlignFragment : public MCFragment { } }; -class MCNeverAlignFragment : public MCFragment { - /// The alignment the end of the next fragment should avoid. - unsigned Alignment; - - /// When emitting Nops some subtargets have specific nop encodings. 
- const MCSubtargetInfo &STI; - -public: - MCNeverAlignFragment(unsigned Alignment, const MCSubtargetInfo &STI, - MCSection *Sec = nullptr) - : MCFragment(FT_NeverAlign, false, Sec), Alignment(Alignment), STI(STI) {} - - unsigned getAlignment() const { return Alignment; } - - const MCSubtargetInfo &getSubtargetInfo() const { return STI; } - - static bool classof(const MCFragment *F) { - return F->getKind() == MCFragment::FT_NeverAlign; - } -}; - class MCFillFragment : public MCFragment { uint8_t ValueSize; /// Value to use for filling bytes. diff --git a/llvm/include/llvm/MC/MCObjectStreamer.h b/llvm/include/llvm/MC/MCObjectStreamer.h index 3c6fd9301647d..5e5b4b3150170 100644 --- a/llvm/include/llvm/MC/MCObjectStreamer.h +++ b/llvm/include/llvm/MC/MCObjectStreamer.h @@ -157,8 +157,6 @@ class MCObjectStreamer : public MCStreamer { unsigned MaxBytesToEmit = 0) override; void emitCodeAlignment(Align ByteAlignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit = 0) override; - void emitNeverAlignCodeAtEnd(unsigned ByteAlignment, - const MCSubtargetInfo &STI) override; void emitValueToOffset(const MCExpr *Offset, unsigned char Value, SMLoc Loc) override; void emitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column, diff --git a/llvm/include/llvm/MC/MCStreamer.h b/llvm/include/llvm/MC/MCStreamer.h index 87691a6ec0e08..3bf2d22e18235 100644 --- a/llvm/include/llvm/MC/MCStreamer.h +++ b/llvm/include/llvm/MC/MCStreamer.h @@ -885,12 +885,6 @@ class MCStreamer { virtual void emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit = 0); - /// If the end of the fragment following this NeverAlign fragment ever gets - /// aligned to \p ByteAlignment, this fragment emits a single nop before the - /// following fragment to break this end-alignment. - virtual void emitNeverAlignCodeAtEnd(unsigned ByteAlignment, - const MCSubtargetInfo &STI); - /// Emit some number of copies of \p Value until the byte offset \p /// Offset is reached. /// diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp index 62baeb93ea7d0..ad30b5ce9e631 100644 --- a/llvm/lib/MC/MCAssembler.cpp +++ b/llvm/lib/MC/MCAssembler.cpp @@ -298,43 +298,6 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout, const MCFixup &Fixup, return IsResolved; } -/// Check if the branch crosses the boundary. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. -/// \param BoundaryAlignment alignment requirement of the branch. -/// \returns true if the branch cross the boundary. -static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - uint64_t EndAddr = StartAddr + Size; - return (StartAddr >> Log2(BoundaryAlignment)) != - ((EndAddr - 1) >> Log2(BoundaryAlignment)); -} - -/// Check if the branch is against the boundary. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. -/// \param BoundaryAlignment alignment requirement of the branch. -/// \returns true if the branch is against the boundary. -static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - uint64_t EndAddr = StartAddr + Size; - return (EndAddr & (BoundaryAlignment.value() - 1)) == 0; -} - -/// Check if the branch needs padding. -/// -/// \param StartAddr start address of the fused/unfused branch. -/// \param Size size of the fused/unfused branch. -/// \param BoundaryAlignment alignment requirement of the branch. 
-/// \returns true if the branch needs padding. -static bool needPadding(uint64_t StartAddr, uint64_t Size, - Align BoundaryAlignment) { - return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) || - isAgainstBoundary(StartAddr, Size, BoundaryAlignment); -} - uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, const MCFragment &F) const { assert(getBackendPtr() && "Requires assembler backend"); @@ -395,41 +358,6 @@ uint64_t MCAssembler::computeFragmentSize(const MCAsmLayout &Layout, return Size; } - case MCFragment::FT_NeverAlign: { - // Disclaimer: NeverAlign fragment size depends on the size of its immediate - // successor, but NeverAlign need not be a MCRelaxableFragment. - // NeverAlign fragment size is recomputed if the successor is relaxed: - // - If RelaxableFragment is relaxed, it gets invalidated by marking its - // predecessor as LastValidFragment. - // - This forces the assembler to call MCAsmLayout::layoutFragment on that - // relaxable fragment, which in turn will always ask the predecessor to - // compute its size (see "computeFragmentSize(prev)" in layoutFragment). - // - // In short, the simplest way to ensure that computeFragmentSize() is sane - // is to establish the following rule: it should never examine fragments - // after the current fragment in the section. If we logically need to - // examine any fragment after the current fragment, we need to do that using - // relaxation, inside MCAssembler::layoutSectionOnce. - const MCNeverAlignFragment &NAF = cast(F); - const MCFragment *NF = F.getNextNode(); - uint64_t Offset = Layout.getFragmentOffset(&NAF); - size_t NextFragSize = 0; - if (const auto *NextFrag = dyn_cast(NF)) { - NextFragSize = NextFrag->getContents().size(); - } else if (const auto *NextFrag = dyn_cast(NF)) { - NextFragSize = NextFrag->getContents().size(); - } else { - llvm_unreachable("Didn't find the expected fragment after NeverAlign"); - } - // Check if the next fragment ends at the alignment we want to avoid. - if (isAgainstBoundary(Offset, NextFragSize, Align(NAF.getAlignment()))) { - // Avoid this alignment by introducing minimum nop. - assert(getBackend().getMinimumNopSize() != NAF.getAlignment()); - return getBackend().getMinimumNopSize(); - } - return 0; - } - case MCFragment::FT_Org: { const MCOrgFragment &OF = cast(F); MCValue Value; @@ -653,15 +581,6 @@ static void writeFragment(raw_ostream &OS, const MCAssembler &Asm, break; } - case MCFragment::FT_NeverAlign: { - const MCNeverAlignFragment &NAF = cast(F); - if (!Asm.getBackend().writeNopData(OS, FragmentSize, - &NAF.getSubtargetInfo())) - report_fatal_error("unable to write nop sequence of " + - Twine(FragmentSize) + " bytes"); - break; - } - case MCFragment::FT_Data: ++stats::EmittedDataFragments; OS << cast(F).getContents(); @@ -1133,6 +1052,43 @@ bool MCAssembler::relaxLEB(MCAsmLayout &Layout, MCLEBFragment &LF) { return OldSize != LF.getContents().size(); } +/// Check if the branch crosses the boundary. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch cross the boundary. +static bool mayCrossBoundary(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + uint64_t EndAddr = StartAddr + Size; + return (StartAddr >> Log2(BoundaryAlignment)) != + ((EndAddr - 1) >> Log2(BoundaryAlignment)); +} + +/// Check if the branch is against the boundary. 
+/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch is against the boundary. +static bool isAgainstBoundary(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + uint64_t EndAddr = StartAddr + Size; + return (EndAddr & (BoundaryAlignment.value() - 1)) == 0; +} + +/// Check if the branch needs padding. +/// +/// \param StartAddr start address of the fused/unfused branch. +/// \param Size size of the fused/unfused branch. +/// \param BoundaryAlignment alignment requirement of the branch. +/// \returns true if the branch needs padding. +static bool needPadding(uint64_t StartAddr, uint64_t Size, + Align BoundaryAlignment) { + return mayCrossBoundary(StartAddr, Size, BoundaryAlignment) || + isAgainstBoundary(StartAddr, Size, BoundaryAlignment); +} + bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout, MCBoundaryAlignFragment &BF) { // BoundaryAlignFragment that doesn't need to align any fragment should not be diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp index 2626da7e0391a..a8da46dbd8727 100644 --- a/llvm/lib/MC/MCFragment.cpp +++ b/llvm/lib/MC/MCFragment.cpp @@ -274,9 +274,6 @@ void MCFragment::destroy() { case FT_Align: delete cast(this); return; - case FT_NeverAlign: - delete cast(this); - return; case FT_Data: delete cast(this); return; @@ -345,9 +342,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { OS << "<"; switch (getKind()) { case MCFragment::FT_Align: OS << "MCAlignFragment"; break; - case MCFragment::FT_NeverAlign: - OS << "MCNeverAlignFragment"; - break; case MCFragment::FT_Data: OS << "MCDataFragment"; break; case MCFragment::FT_CompactEncodedInst: OS << "MCCompactEncodedInstFragment"; break; @@ -387,12 +381,6 @@ LLVM_DUMP_METHOD void MCFragment::dump() const { << " MaxBytesToEmit:" << AF->getMaxBytesToEmit() << ">"; break; } - case MCFragment::FT_NeverAlign: { - const MCNeverAlignFragment *NAF = cast(this); - OS << "\n "; - OS << " Alignment:" << NAF->getAlignment() << ">"; - break; - } case MCFragment::FT_Data: { const auto *DF = cast(this); OS << "\n "; diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp index 28927ec57a8f9..d11ccfb5e269f 100644 --- a/llvm/lib/MC/MCObjectStreamer.cpp +++ b/llvm/lib/MC/MCObjectStreamer.cpp @@ -659,11 +659,6 @@ void MCObjectStreamer::emitCodeAlignment(Align Alignment, cast(getCurrentFragment())->setEmitNops(true, STI); } -void MCObjectStreamer::emitNeverAlignCodeAtEnd(unsigned ByteAlignment, - const MCSubtargetInfo &STI) { - insert(new MCNeverAlignFragment(ByteAlignment, STI)); -} - void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value, SMLoc Loc) { diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp index 5738ae252ff27..bc145aef0b8c5 100644 --- a/llvm/lib/MC/MCStreamer.cpp +++ b/llvm/lib/MC/MCStreamer.cpp @@ -1234,8 +1234,6 @@ void MCStreamer::emitValueToAlignment(Align Alignment, int64_t Value, unsigned MaxBytesToEmit) {} void MCStreamer::emitCodeAlignment(Align Alignment, const MCSubtargetInfo *STI, unsigned MaxBytesToEmit) {} -void MCStreamer::emitNeverAlignCodeAtEnd(unsigned ByteAlignment, - const MCSubtargetInfo &STI) {} void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value, SMLoc Loc) {} void MCStreamer::emitBundleAlignMode(Align Alignment) {} diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp 
b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp index 2d66d43455df5..051f6caa8c047 100644 --- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp +++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp @@ -1152,7 +1152,6 @@ class X86AsmParser : public MCTargetAsmParser { bool parseDirectiveArch(); bool parseDirectiveNops(SMLoc L); bool parseDirectiveEven(SMLoc L); - bool parseDirectiveAvoidEndAlign(SMLoc L); bool ParseDirectiveCode(StringRef IDVal, SMLoc L); /// CodeView FPO data directives. @@ -4534,8 +4533,6 @@ bool X86AsmParser::ParseDirective(AsmToken DirectiveID) { return false; } else if (IDVal == ".nops") return parseDirectiveNops(DirectiveID.getLoc()); - else if (IDVal == ".avoid_end_align") - return parseDirectiveAvoidEndAlign(DirectiveID.getLoc()); else if (IDVal == ".even") return parseDirectiveEven(DirectiveID.getLoc()); else if (IDVal == ".cv_fpo_proc") @@ -4630,27 +4627,6 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) { return false; } -/// Directive for NeverAlign fragment testing, not for general usage! -/// parseDirectiveAvoidEndAlign -/// ::= .avoid_end_align alignment -bool X86AsmParser::parseDirectiveAvoidEndAlign(SMLoc L) { - int64_t Alignment = 0; - SMLoc AlignmentLoc; - AlignmentLoc = getTok().getLoc(); - if (getParser().checkForValidSection() || - getParser().parseAbsoluteExpression(Alignment)) - return true; - - if (getParser().parseEOL("unexpected token in directive")) - return true; - - if (Alignment <= 0) - return Error(AlignmentLoc, "expected a positive alignment"); - - getParser().getStreamer().emitNeverAlignCodeAtEnd(Alignment, getSTI()); - return false; -} - /// ParseDirectiveCode /// ::= .code16 | .code32 | .code64 bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) { diff --git a/llvm/test/MC/X86/directive-avoid_end_align.s b/llvm/test/MC/X86/directive-avoid_end_align.s deleted file mode 100644 index 1d748401edc12..0000000000000 --- a/llvm/test/MC/X86/directive-avoid_end_align.s +++ /dev/null @@ -1,208 +0,0 @@ -# RUN: llvm-mc -triple=x86_64 -filetype=obj %s | llvm-objdump --no-show-raw-insn -d - | FileCheck %s -# RUN: not llvm-mc -triple=x86_64 --defsym ERR=1 %s -o /dev/null 2>&1 | FileCheck %s --check-prefix=ERR - -# avoid_end_align has no effect since test doesn't end at alignment boundary: -.avoid_end_align 64 -# CHECK-NOT: nop - testl %eax, %eax -# CHECK: testl %eax, %eax - je .LBB0 - -.fill 58, 1, 0x00 -# NeverAlign followed by MCDataFragment: -# avoid_end_align inserts nop because `test` would end at alignment boundary: -.avoid_end_align 64 -# CHECK: 3e: nop - testl %eax, %eax -# CHECK-NEXT: 3f: testl %eax, %eax - je .LBB0 -# CHECK-NEXT: 41: je -.LBB0: - retq - -.p2align 6 -.L0: -.nops 57 - int3 -# NeverAlign followed by RelaxableFragment: -.avoid_end_align 64 -# CHECK: ba: nop - cmpl $(.L1-.L0), %eax -# CHECK-NEXT: bb: cmpl - je .L0 -# CHECK-NEXT: c1: je -.nops 65 -.L1: - -############################################################################### -# Experiment A: -# Check that NeverAlign doesn't introduce infinite loops in layout. -# Control: -# 1. NeverAlign fragment is not added, -# 2. Short formats of cmp and jcc are used (3 and 2 bytes respectively), -# 3. cmp and jcc are placed such that to be split by 64B alignment boundary. -# 4. jcc would be relaxed to a longer format if at least one byte is added -# between .L10 and je itself, e.g. by adding a NeverAlign padding byte, -# or relaxing cmp instruction. -# 5. cmp would be relaxed to a longer format if at least one byte is added -# between .L11 and .L12, e.g. 
due to relaxing jcc instruction. -.p2align 6 -# CHECK: 140: int3 -.fill 2, 1, 0xcc -.L10: -.nops 122 - int3 -# CHECK: 1bc: int3 -# no avoid_end_align here -# CHECK-NOT: nop - cmp $(.L12-.L11), %eax -# CHECK: 1bd: cmpl -.L11: - je .L10 -# CHECK-NEXT: 1c0: je -.nops 125 -.L12: - -# Experiment: -# Same setup as control, except NeverAlign fragment is added before cmp. -# Expected effect: -# 1. NeverAlign pads cmp+jcc by one byte since cmp and jcc are split by a 64B -# alignment boundary, -# 2. This extra byte forces jcc relaxation to a longer format (Control rule #4), -# 3. This results in an cmp relaxation (Control rule #5), -# 4. Which in turn makes NeverAlign fragment unnecessary as cmp and jcc -# are no longer split by an alignment boundary (cmp crosses the boundary). -# 5. NeverAlign padding is removed. -# 6. cmp and jcc instruction remain in relaxed form. -# 7. Relaxation converges, layout succeeds. -.p2align 6 -# CHECK: 240: int3 -.fill 2, 1, 0xcc -.L20: -.nops 122 - int3 -# CHECK: 2bc: int3 -.avoid_end_align 64 -# CHECK-NOT: nop - cmp $(.L22-.L21), %eax -# CHECK-NEXT: 2bd: cmpl -.L21: - je .L20 -# CHECK-NEXT: 2c3: je -.nops 125 -.L22: - -############################################################################### -# Experiment B: similar to exp A, but we check that once NeverAlign padding is -# removed from the layout (exp A, experiment step 5), the increased distance -# between the symbols L33 and L34 triggers the relaxation of instruction at -# label L32. -# -# Control 1: using a one-byte instruction at L33 (site of NeverAlign) leads to -# steps 2-3 of exp A, experiment: -# 2. This extra byte forces jcc relaxation to a longer format (Control rule #4), -# 3. This results in an cmp relaxation (Control rule #5), -# => short cmp under L32 -.p2align 6 -# CHECK: 380: int3 -.fill 2, 1, 0xcc -.L30: -.nops 122 - int3 -# CHECK: 3fc: int3 - hlt -#.avoid_end_align 64 -.L33: - cmp $(.L32-.L31), %eax -# CHECK: 3fe: cmpl -.L31: - je .L30 -# CHECK-NEXT: 404: je -.nops 114 -.p2align 1 - int3 - int3 -# CHECK: 47c: int3 -.L34: -.nops 9 -.L32: - cmp $(.L33-.L34), %eax -# CHECK: 487: cmp -# note that the size of cmp is 48a-487 == 3 bytes (distance is exactly -128) - int3 -# CHECK-NEXT: 48a: int3 - -# Control 2: leaving out a byte at L43 (site of NeverAlign), plus -# relaxed jcc and cmp leads to a relaxed cmp under L42 (-129 as cmp's immediate) -.p2align 6 -# CHECK: 4c0: int3 -.fill 2, 1, 0xcc -.L40: -.nops 122 - int3 -# CHECK: 53c: int3 -# int3 -#.avoid_end_align 64 -.L43: - cmp $(.L42-.L41+0x100), %eax -# CHECK: 53d: cmpl -.L41: - je .L40+0x100 -# CHECK-NEXT: 543: je -.nops 114 -.p2align 1 - int3 - int3 -# CHECK: 5bc: int3 -.L44: -.nops 9 -.L42: - cmp $(.L43-.L44), %eax -# CHECK: 5c7: cmp -# note that the size of cmp is 5cd-5c7 == 6 bytes (distance is exactly -129) - int3 -# CHECK-NEXT: 5cd: int3 - -# Experiment -# Checking if removing NeverAlign padding at L53 as a result of alignment and -# relaxation of cmp and jcc following it (see exp A), thus reproducing the case -# in Control 2 (getting a relaxed cmp under L52), is handled correctly. 
-.p2align 6 -# CHECK: 600: int3 -.fill 2, 1, 0xcc -.L50: -.nops 122 - int3 -# CHECK: 67c: int3 -.avoid_end_align 64 -.L53: -# CHECK-NOT: nop - cmp $(.L52-.L51), %eax -# CHECK-NEXT: 67d: cmpl -.L51: - je .L50 -# CHECK-NEXT: 683: je -.nops 114 -.p2align 1 - int3 - int3 -# CHECK: 6fc: int3 -.L54: -.nops 9 -.L52: - cmp $(.L53-.L54), %eax -# CHECK: 707: cmp -# note that the size of cmp is 70d-707 == 6 bytes (distance is exactly -129) - int3 -# CHECK-NEXT: 70d: int3 - -.ifdef ERR -# ERR: {{.*}}.s:[[#@LINE+1]]:17: error: unknown token in expression -.avoid_end_align -# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected absolute expression -.avoid_end_align x -# ERR: {{.*}}.s:[[#@LINE+1]]:18: error: expected a positive alignment -.avoid_end_align 0 -# ERR: {{.*}}.s:[[#@LINE+1]]:20: error: unexpected token in directive -.avoid_end_align 64, 0 -.endif From 6735ce9d25acc091214cc90ff553f1d302f6b14e Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Thu, 18 Jan 2024 16:22:55 -0800 Subject: [PATCH 069/843] [BOLT] Fix unconditional output of boltedcollection in merge-fdata (#78653) Fix the bug where merge-fdata unconditionally outputs boltedcollection line, regardless of whether input files have it set. Test Plan: Added bolt/test/X86/merge-fdata-nobat-mode.test which fails without this fix. --- bolt/test/X86/merge-fdata-bat-mode.test | 3 +-- bolt/test/X86/merge-fdata-nobat-mode.test | 6 ++++++ bolt/tools/merge-fdata/merge-fdata.cpp | 2 +- 3 files changed, 8 insertions(+), 3 deletions(-) create mode 100644 bolt/test/X86/merge-fdata-nobat-mode.test diff --git a/bolt/test/X86/merge-fdata-bat-mode.test b/bolt/test/X86/merge-fdata-bat-mode.test index ea3e4c3e4afcc..41738e196b5d3 100644 --- a/bolt/test/X86/merge-fdata-bat-mode.test +++ b/bolt/test/X86/merge-fdata-bat-mode.test @@ -4,6 +4,5 @@ RUN: merge-fdata %S/Inputs/bat_profile_1.fdata \ RUN: %S/Inputs/bat_profile_2.fdata \ RUN: | FileCheck %s --check-prefix=CHECK-FDATA +CHECK-FDATA: boltedcollection CHECK-FDATA: 1 main 451 1 SolveCubic 0 0 302 - - diff --git a/bolt/test/X86/merge-fdata-nobat-mode.test b/bolt/test/X86/merge-fdata-nobat-mode.test new file mode 100644 index 0000000000000..870d9f880e286 --- /dev/null +++ b/bolt/test/X86/merge-fdata-nobat-mode.test @@ -0,0 +1,6 @@ +# Check that merge-fdata tool doesn't spuriously print boltedcollection + +RUN: merge-fdata %S/Inputs/blarge.fdata %S/Inputs/blarge.fdata \ +RUN: | FileCheck %s --check-prefix=CHECK-FDATA + +CHECK-FDATA-NOT: boltedcollection diff --git a/bolt/tools/merge-fdata/merge-fdata.cpp b/bolt/tools/merge-fdata/merge-fdata.cpp index e21aeba73aca2..104991dabc5f0 100644 --- a/bolt/tools/merge-fdata/merge-fdata.cpp +++ b/bolt/tools/merge-fdata/merge-fdata.cpp @@ -329,7 +329,7 @@ void mergeLegacyProfiles(const SmallVectorImpl &Filenames) { MergedProfile.insert_or_assign(Key, Count); } - if (BoltedCollection) + if (BoltedCollection.value_or(false)) output() << "boltedcollection\n"; for (const auto &[Key, Value] : MergedProfile) output() << Key << " " << Value << "\n"; From 4d566e57a2403b32992cbdae133fb644866f0070 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Thu, 18 Jan 2024 15:22:33 +0530 Subject: [PATCH 070/843] [AMDGPU] Precommit lit test. 
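
The new test builds a full 32-register VGPR tuple at -O0 so that the
allocator must find 32 contiguous VGPRs. As the test's own comment notes,
gfx900, gfx906, and gfx90a allocate the tuple fine, but gfx908 reserves an
extra VGPR for AGPR-to-VGPR copies, and the added pressure makes register
allocation fail with "ran out of registers during register allocation".
The test records the current behavior of the working subtargets and the
gfx908 crash ahead of the fix.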
--- .../AMDGPU/vgpr-large-tuple-alloc-error.ll | 545 ++++++++++++++++++ 1 file changed, 545 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll new file mode 100644 index 0000000000000..f3276719ac13c --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll @@ -0,0 +1,545 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX900 %s +; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s +; RUN: not --crash llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX908-ERR %s +; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90a %s + +; This test crashes for gfx908 while allocating the tuple. Compared to the other subtargets, +; gfx908 marks an extra VGPR reserved for AGPR to VGPR copy that puts more register pressure. + +; GFX908-ERR: error: ran out of registers during register allocation + +define i32 @test_tuple(<16 x i64> %0) { +; GFX900-LABEL: test_tuple: +; GFX900: ; %bb.0: +; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX900-NEXT: s_mov_b64 exec, s[4:5] +; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX900-NEXT: v_writelane_b32 v31, s36, 0 +; GFX900-NEXT: v_writelane_b32 v31, s37, 1 +; GFX900-NEXT: v_writelane_b32 v31, s38, 2 +; GFX900-NEXT: v_writelane_b32 v31, s39, 3 +; GFX900-NEXT: v_writelane_b32 v31, s40, 4 +; GFX900-NEXT: v_writelane_b32 v31, s41, 5 +; GFX900-NEXT: v_writelane_b32 v31, s42, 6 +; GFX900-NEXT: v_writelane_b32 v31, s43, 7 +; GFX900-NEXT: v_writelane_b32 v31, s44, 8 +; GFX900-NEXT: v_writelane_b32 v31, s45, 9 +; GFX900-NEXT: v_writelane_b32 v31, s46, 10 +; 
GFX900-NEXT: v_writelane_b32 v31, s47, 11 +; GFX900-NEXT: v_writelane_b32 v31, s48, 12 +; GFX900-NEXT: v_writelane_b32 v31, s49, 13 +; GFX900-NEXT: v_writelane_b32 v31, s50, 14 +; GFX900-NEXT: v_writelane_b32 v31, s51, 15 +; GFX900-NEXT: v_writelane_b32 v31, s52, 16 +; GFX900-NEXT: v_writelane_b32 v31, s53, 17 +; GFX900-NEXT: v_writelane_b32 v31, s54, 18 +; GFX900-NEXT: v_writelane_b32 v31, s55, 19 +; GFX900-NEXT: v_writelane_b32 v31, s56, 20 +; GFX900-NEXT: v_writelane_b32 v31, s57, 21 +; GFX900-NEXT: v_writelane_b32 v31, s58, 22 +; GFX900-NEXT: v_writelane_b32 v31, s59, 23 +; GFX900-NEXT: v_writelane_b32 v31, s60, 24 +; GFX900-NEXT: v_writelane_b32 v31, s61, 25 +; GFX900-NEXT: v_writelane_b32 v31, s62, 26 +; GFX900-NEXT: v_writelane_b32 v31, s63, 27 +; GFX900-NEXT: v_writelane_b32 v31, s64, 28 +; GFX900-NEXT: v_writelane_b32 v31, s65, 29 +; GFX900-NEXT: v_writelane_b32 v31, s66, 30 +; GFX900-NEXT: v_writelane_b32 v31, s67, 31 +; GFX900-NEXT: v_mov_b32_e32 v32, v0 +; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec +; GFX900-NEXT: v_mov_b32_e32 v33, v1 +; GFX900-NEXT: v_mov_b32_e32 v34, v2 +; GFX900-NEXT: v_mov_b32_e32 v35, v3 +; GFX900-NEXT: v_mov_b32_e32 v36, v4 +; GFX900-NEXT: v_mov_b32_e32 v37, v5 +; GFX900-NEXT: v_mov_b32_e32 v38, v6 +; GFX900-NEXT: v_mov_b32_e32 v39, v7 +; GFX900-NEXT: v_mov_b32_e32 v40, v8 +; GFX900-NEXT: v_mov_b32_e32 v41, v9 +; GFX900-NEXT: v_mov_b32_e32 v42, v10 +; GFX900-NEXT: v_mov_b32_e32 v43, v11 +; GFX900-NEXT: v_mov_b32_e32 v44, v12 +; GFX900-NEXT: v_mov_b32_e32 v45, v13 +; GFX900-NEXT: v_mov_b32_e32 v46, v14 +; GFX900-NEXT: v_mov_b32_e32 v47, v15 +; GFX900-NEXT: v_mov_b32_e32 v48, v16 +; GFX900-NEXT: v_mov_b32_e32 v49, v17 +; GFX900-NEXT: v_mov_b32_e32 v50, v18 +; GFX900-NEXT: v_mov_b32_e32 v51, v19 +; GFX900-NEXT: v_mov_b32_e32 v52, v20 +; GFX900-NEXT: v_mov_b32_e32 v53, v21 +; GFX900-NEXT: v_mov_b32_e32 v54, v22 +; GFX900-NEXT: v_mov_b32_e32 v55, v23 +; GFX900-NEXT: v_mov_b32_e32 v56, v24 +; GFX900-NEXT: v_mov_b32_e32 v57, v25 +; 
GFX900-NEXT: v_mov_b32_e32 v58, v26 +; GFX900-NEXT: v_mov_b32_e32 v59, v27 +; GFX900-NEXT: v_mov_b32_e32 v60, v28 +; GFX900-NEXT: v_mov_b32_e32 v61, v29 +; GFX900-NEXT: v_mov_b32_e32 v62, v30 +; GFX900-NEXT: ; kill: def $vgpr63 killed $vgpr0 killed $exec +; GFX900-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: v_mov_b32_e32 v0, 0 +; GFX900-NEXT: v_readlane_b32 s67, v31, 31 +; GFX900-NEXT: v_readlane_b32 s66, v31, 30 +; GFX900-NEXT: v_readlane_b32 s65, v31, 29 +; GFX900-NEXT: v_readlane_b32 s64, v31, 28 +; GFX900-NEXT: v_readlane_b32 s63, v31, 27 +; GFX900-NEXT: v_readlane_b32 s62, v31, 26 +; GFX900-NEXT: v_readlane_b32 s61, v31, 25 +; GFX900-NEXT: v_readlane_b32 s60, v31, 24 +; GFX900-NEXT: v_readlane_b32 s59, v31, 23 +; GFX900-NEXT: v_readlane_b32 s58, v31, 22 +; GFX900-NEXT: v_readlane_b32 s57, v31, 21 +; GFX900-NEXT: v_readlane_b32 s56, v31, 20 +; GFX900-NEXT: v_readlane_b32 s55, v31, 19 +; GFX900-NEXT: v_readlane_b32 s54, v31, 18 +; GFX900-NEXT: v_readlane_b32 s53, v31, 17 +; GFX900-NEXT: v_readlane_b32 s52, v31, 16 +; GFX900-NEXT: v_readlane_b32 s51, v31, 15 +; GFX900-NEXT: v_readlane_b32 s50, v31, 14 +; GFX900-NEXT: v_readlane_b32 s49, v31, 13 +; GFX900-NEXT: v_readlane_b32 s48, v31, 12 +; GFX900-NEXT: v_readlane_b32 s47, v31, 11 +; GFX900-NEXT: v_readlane_b32 s46, v31, 10 +; GFX900-NEXT: v_readlane_b32 s45, v31, 9 +; GFX900-NEXT: v_readlane_b32 s44, v31, 8 +; GFX900-NEXT: v_readlane_b32 s43, v31, 7 +; GFX900-NEXT: v_readlane_b32 s42, v31, 6 +; GFX900-NEXT: v_readlane_b32 s41, v31, 5 +; GFX900-NEXT: v_readlane_b32 s40, v31, 4 +; GFX900-NEXT: v_readlane_b32 s39, v31, 3 +; GFX900-NEXT: v_readlane_b32 s38, v31, 2 +; GFX900-NEXT: v_readlane_b32 s37, v31, 1 +; GFX900-NEXT: v_readlane_b32 s36, v31, 0 +; GFX900-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX900-NEXT: 
s_mov_b64 exec, s[4:5] +; GFX900-NEXT: s_waitcnt vmcnt(0) +; GFX900-NEXT: s_setpc_b64 s[30:31] +; +; GFX906-LABEL: test_tuple: +; GFX906: ; %bb.0: +; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX906-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX906-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX906-NEXT: s_mov_b64 exec, s[4:5] +; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: v_writelane_b32 v31, s36, 0 +; GFX906-NEXT: v_writelane_b32 v31, s37, 1 +; GFX906-NEXT: v_writelane_b32 v31, s38, 2 +; GFX906-NEXT: v_writelane_b32 v31, s39, 3 +; GFX906-NEXT: v_writelane_b32 v31, s40, 4 +; GFX906-NEXT: v_writelane_b32 v31, s41, 5 +; GFX906-NEXT: v_writelane_b32 v31, s42, 6 +; GFX906-NEXT: v_writelane_b32 v31, s43, 7 +; GFX906-NEXT: v_writelane_b32 v31, s44, 8 +; GFX906-NEXT: v_writelane_b32 v31, s45, 9 +; GFX906-NEXT: v_writelane_b32 v31, s46, 10 +; GFX906-NEXT: v_writelane_b32 v31, s47, 11 +; GFX906-NEXT: v_writelane_b32 v31, s48, 12 +; GFX906-NEXT: v_writelane_b32 v31, s49, 13 +; GFX906-NEXT: v_writelane_b32 v31, s50, 14 +; GFX906-NEXT: v_writelane_b32 v31, s51, 15 +; GFX906-NEXT: v_writelane_b32 v31, s52, 16 +; GFX906-NEXT: v_writelane_b32 v31, s53, 17 +; GFX906-NEXT: v_writelane_b32 v31, s54, 18 +; GFX906-NEXT: v_writelane_b32 v31, s55, 19 +; GFX906-NEXT: v_writelane_b32 v31, s56, 20 +; GFX906-NEXT: v_writelane_b32 v31, s57, 21 +; GFX906-NEXT: v_writelane_b32 v31, s58, 22 +; GFX906-NEXT: v_writelane_b32 v31, s59, 23 +; GFX906-NEXT: v_writelane_b32 v31, s60, 24 +; GFX906-NEXT: v_writelane_b32 v31, s61, 25 +; GFX906-NEXT: v_writelane_b32 v31, s62, 26 +; GFX906-NEXT: v_writelane_b32 v31, s63, 27 +; GFX906-NEXT: v_writelane_b32 v31, s64, 28 +; GFX906-NEXT: v_writelane_b32 v31, s65, 29 +; GFX906-NEXT: v_writelane_b32 v31, s66, 30 +; GFX906-NEXT: v_writelane_b32 v31, s67, 31 +; GFX906-NEXT: v_mov_b32_e32 v32, v0 +; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: 
$sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec +; GFX906-NEXT: v_mov_b32_e32 v33, v1 +; GFX906-NEXT: v_mov_b32_e32 v34, v2 +; GFX906-NEXT: v_mov_b32_e32 v35, v3 +; GFX906-NEXT: v_mov_b32_e32 v36, v4 +; GFX906-NEXT: v_mov_b32_e32 v37, v5 +; GFX906-NEXT: v_mov_b32_e32 v38, v6 +; GFX906-NEXT: v_mov_b32_e32 v39, v7 +; GFX906-NEXT: v_mov_b32_e32 v40, v8 +; GFX906-NEXT: v_mov_b32_e32 v41, v9 +; GFX906-NEXT: v_mov_b32_e32 v42, v10 +; GFX906-NEXT: v_mov_b32_e32 v43, v11 +; GFX906-NEXT: v_mov_b32_e32 v44, v12 +; GFX906-NEXT: v_mov_b32_e32 v45, v13 +; GFX906-NEXT: v_mov_b32_e32 v46, v14 +; GFX906-NEXT: v_mov_b32_e32 v47, v15 +; GFX906-NEXT: v_mov_b32_e32 v48, v16 +; GFX906-NEXT: v_mov_b32_e32 v49, v17 +; GFX906-NEXT: v_mov_b32_e32 v50, v18 +; GFX906-NEXT: v_mov_b32_e32 v51, v19 +; GFX906-NEXT: v_mov_b32_e32 v52, v20 +; GFX906-NEXT: v_mov_b32_e32 v53, v21 +; GFX906-NEXT: v_mov_b32_e32 v54, v22 +; GFX906-NEXT: v_mov_b32_e32 v55, v23 +; GFX906-NEXT: v_mov_b32_e32 v56, v24 +; GFX906-NEXT: v_mov_b32_e32 v57, v25 +; GFX906-NEXT: v_mov_b32_e32 v58, v26 +; GFX906-NEXT: v_mov_b32_e32 v59, v27 +; GFX906-NEXT: v_mov_b32_e32 v60, v28 +; GFX906-NEXT: v_mov_b32_e32 v61, v29 +; GFX906-NEXT: v_mov_b32_e32 v62, v30 +; GFX906-NEXT: ; kill: def $vgpr63 killed $vgpr0 killed $exec +; GFX906-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: v_mov_b32_e32 v0, 0 +; GFX906-NEXT: v_readlane_b32 s67, v31, 31 +; GFX906-NEXT: v_readlane_b32 s66, v31, 30 +; GFX906-NEXT: v_readlane_b32 s65, v31, 29 +; GFX906-NEXT: v_readlane_b32 s64, v31, 28 +; GFX906-NEXT: v_readlane_b32 s63, v31, 27 +; GFX906-NEXT: v_readlane_b32 s62, v31, 26 +; GFX906-NEXT: v_readlane_b32 s61, v31, 25 +; GFX906-NEXT: v_readlane_b32 s60, v31, 24 +; GFX906-NEXT: v_readlane_b32 s59, v31, 23 +; GFX906-NEXT: v_readlane_b32 s58, v31, 22 +; GFX906-NEXT: v_readlane_b32 s57, v31, 21 +; GFX906-NEXT: v_readlane_b32 s56, v31, 20 +; GFX906-NEXT: v_readlane_b32 s55, v31, 19 +; GFX906-NEXT: v_readlane_b32 s54, v31, 18 +; GFX906-NEXT: v_readlane_b32 s53, v31, 17 +; GFX906-NEXT: 
v_readlane_b32 s52, v31, 16 +; GFX906-NEXT: v_readlane_b32 s51, v31, 15 +; GFX906-NEXT: v_readlane_b32 s50, v31, 14 +; GFX906-NEXT: v_readlane_b32 s49, v31, 13 +; GFX906-NEXT: v_readlane_b32 s48, v31, 12 +; GFX906-NEXT: v_readlane_b32 s47, v31, 11 +; GFX906-NEXT: v_readlane_b32 s46, v31, 10 +; GFX906-NEXT: v_readlane_b32 s45, v31, 9 +; GFX906-NEXT: v_readlane_b32 s44, v31, 8 +; GFX906-NEXT: v_readlane_b32 s43, v31, 7 +; GFX906-NEXT: v_readlane_b32 s42, v31, 6 +; GFX906-NEXT: v_readlane_b32 s41, v31, 5 +; GFX906-NEXT: v_readlane_b32 s40, v31, 4 +; GFX906-NEXT: v_readlane_b32 s39, v31, 3 +; GFX906-NEXT: v_readlane_b32 s38, v31, 2 +; GFX906-NEXT: v_readlane_b32 s37, v31, 1 +; GFX906-NEXT: v_readlane_b32 s36, v31, 0 +; GFX906-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX906-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: s_mov_b64 exec, s[4:5] +; GFX906-NEXT: s_waitcnt vmcnt(0) +; GFX906-NEXT: s_setpc_b64 s[30:31] +; +; GFX90a-LABEL: test_tuple: +; GFX90a: ; %bb.0: +; GFX90a-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX90a-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90a-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90a-NEXT: s_mov_b64 exec, s[4:5] +; GFX90a-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload 
Reuse +; GFX90a-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse +; GFX90a-NEXT: v_writelane_b32 v31, s36, 0 +; GFX90a-NEXT: v_writelane_b32 v31, s37, 1 +; GFX90a-NEXT: v_writelane_b32 v31, s38, 2 +; GFX90a-NEXT: v_writelane_b32 v31, s39, 3 +; GFX90a-NEXT: v_writelane_b32 v31, s40, 4 +; GFX90a-NEXT: v_writelane_b32 v31, s41, 5 +; GFX90a-NEXT: v_writelane_b32 v31, s42, 6 +; GFX90a-NEXT: v_writelane_b32 v31, s43, 7 +; GFX90a-NEXT: v_writelane_b32 v31, s44, 8 +; GFX90a-NEXT: v_writelane_b32 v31, s45, 9 +; GFX90a-NEXT: v_writelane_b32 v31, s46, 10 +; GFX90a-NEXT: v_writelane_b32 v31, s47, 11 +; GFX90a-NEXT: v_writelane_b32 v31, s48, 12 +; GFX90a-NEXT: v_writelane_b32 v31, s49, 13 +; GFX90a-NEXT: v_writelane_b32 v31, s50, 14 +; GFX90a-NEXT: v_writelane_b32 v31, s51, 15 +; GFX90a-NEXT: v_writelane_b32 v31, s52, 16 +; GFX90a-NEXT: v_writelane_b32 v31, s53, 17 +; GFX90a-NEXT: v_writelane_b32 v31, s54, 18 +; GFX90a-NEXT: v_writelane_b32 v31, s55, 19 +; GFX90a-NEXT: v_writelane_b32 v31, s56, 20 +; GFX90a-NEXT: v_writelane_b32 v31, s57, 21 +; GFX90a-NEXT: v_writelane_b32 v31, s58, 22 +; GFX90a-NEXT: v_writelane_b32 v31, s59, 23 +; GFX90a-NEXT: v_writelane_b32 v31, s60, 24 +; GFX90a-NEXT: v_writelane_b32 v31, s61, 25 +; GFX90a-NEXT: v_writelane_b32 v31, s62, 26 +; GFX90a-NEXT: v_writelane_b32 v31, s63, 27 +; GFX90a-NEXT: v_writelane_b32 v31, s64, 28 +; GFX90a-NEXT: v_writelane_b32 v31, s65, 29 +; GFX90a-NEXT: v_writelane_b32 v31, s66, 30 +; GFX90a-NEXT: v_writelane_b32 v31, s67, 31 +; GFX90a-NEXT: v_mov_b32_e32 v32, v0 +; GFX90a-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; implicit-def: $sgpr4 +; GFX90a-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec +; GFX90a-NEXT: v_mov_b32_e32 v33, v1 +; GFX90a-NEXT: v_mov_b32_e32 v34, v2 +; GFX90a-NEXT: v_mov_b32_e32 v35, v3 +; GFX90a-NEXT: v_mov_b32_e32 v36, v4 +; GFX90a-NEXT: v_mov_b32_e32 v37, v5 +; GFX90a-NEXT: v_mov_b32_e32 v38, v6 +; GFX90a-NEXT: v_mov_b32_e32 v39, v7 +; GFX90a-NEXT: v_mov_b32_e32 v40, v8 +; GFX90a-NEXT: v_mov_b32_e32 v41, v9 +; GFX90a-NEXT: v_mov_b32_e32 v42, v10 +; GFX90a-NEXT: v_mov_b32_e32 v43, v11 +; 
GFX90a-NEXT: v_mov_b32_e32 v44, v12 +; GFX90a-NEXT: v_mov_b32_e32 v45, v13 +; GFX90a-NEXT: v_mov_b32_e32 v46, v14 +; GFX90a-NEXT: v_mov_b32_e32 v47, v15 +; GFX90a-NEXT: v_mov_b32_e32 v48, v16 +; GFX90a-NEXT: v_mov_b32_e32 v49, v17 +; GFX90a-NEXT: v_mov_b32_e32 v50, v18 +; GFX90a-NEXT: v_mov_b32_e32 v51, v19 +; GFX90a-NEXT: v_mov_b32_e32 v52, v20 +; GFX90a-NEXT: v_mov_b32_e32 v53, v21 +; GFX90a-NEXT: v_mov_b32_e32 v54, v22 +; GFX90a-NEXT: v_mov_b32_e32 v55, v23 +; GFX90a-NEXT: v_mov_b32_e32 v56, v24 +; GFX90a-NEXT: v_mov_b32_e32 v57, v25 +; GFX90a-NEXT: v_mov_b32_e32 v58, v26 +; GFX90a-NEXT: v_mov_b32_e32 v59, v27 +; GFX90a-NEXT: v_mov_b32_e32 v60, v28 +; GFX90a-NEXT: v_mov_b32_e32 v61, v29 +; GFX90a-NEXT: v_mov_b32_e32 v62, v30 +; GFX90a-NEXT: ; kill: def $vgpr63 killed $vgpr0 killed $exec +; GFX90a-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX90a-NEXT: s_waitcnt vmcnt(0) +; GFX90a-NEXT: v_mov_b32_e32 v0, 0 +; GFX90a-NEXT: v_readlane_b32 s67, v31, 31 +; GFX90a-NEXT: v_readlane_b32 s66, v31, 30 +; GFX90a-NEXT: v_readlane_b32 s65, v31, 29 +; GFX90a-NEXT: v_readlane_b32 s64, v31, 28 +; GFX90a-NEXT: v_readlane_b32 s63, v31, 27 +; GFX90a-NEXT: v_readlane_b32 s62, v31, 26 +; GFX90a-NEXT: v_readlane_b32 s61, v31, 25 +; GFX90a-NEXT: v_readlane_b32 s60, v31, 24 +; GFX90a-NEXT: v_readlane_b32 s59, v31, 23 +; GFX90a-NEXT: v_readlane_b32 s58, v31, 22 +; GFX90a-NEXT: v_readlane_b32 s57, v31, 21 +; GFX90a-NEXT: v_readlane_b32 s56, v31, 20 +; GFX90a-NEXT: v_readlane_b32 s55, v31, 19 +; GFX90a-NEXT: v_readlane_b32 s54, v31, 18 +; GFX90a-NEXT: v_readlane_b32 s53, v31, 17 +; GFX90a-NEXT: v_readlane_b32 s52, v31, 16 +; GFX90a-NEXT: v_readlane_b32 s51, v31, 15 +; GFX90a-NEXT: v_readlane_b32 s50, v31, 14 +; GFX90a-NEXT: v_readlane_b32 s49, v31, 13 +; GFX90a-NEXT: v_readlane_b32 s48, v31, 12 +; GFX90a-NEXT: v_readlane_b32 s47, v31, 11 +; GFX90a-NEXT: v_readlane_b32 s46, v31, 10 +; GFX90a-NEXT: v_readlane_b32 s45, v31, 9 +; GFX90a-NEXT: v_readlane_b32 s44, v31, 8 +; GFX90a-NEXT: v_readlane_b32 s43, v31, 7 +; GFX90a-NEXT: v_readlane_b32 s42, v31, 6 +; GFX90a-NEXT: v_readlane_b32 s41, v31, 5 +; GFX90a-NEXT: v_readlane_b32 s40, v31, 4 +; GFX90a-NEXT: v_readlane_b32 s39, v31, 3 +; GFX90a-NEXT: v_readlane_b32 s38, v31, 2 +; GFX90a-NEXT: v_readlane_b32 s37, v31, 1 +; GFX90a-NEXT: v_readlane_b32 s36, v31, 0 +; GFX90a-NEXT: v_accvgpr_read_b32 v63, a15 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX90a-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX90a-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX90a-NEXT: buffer_load_dword v31, off, 
s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90a-NEXT: s_mov_b64 exec, s[4:5] +; GFX90a-NEXT: s_waitcnt vmcnt(0) +; GFX90a-NEXT: s_setpc_b64 s[30:31] + %2 = shufflevector <16 x i64> %0, <16 x i64> zeroinitializer, <8 x i32> + ret i32 0 +} From 1f61507401995b434a0720d3f9869659466d9b31 Mon Sep 17 00:00:00 2001 From: Chen Zheng Date: Thu, 18 Jan 2024 22:29:45 -0500 Subject: [PATCH 071/843] [NFC][PowerPC] remove the redundant spill related flags setting --- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 538e0e6b3d420..15f6b65dea83c 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -1973,14 +1973,6 @@ void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL, unsigned Opcode = getLoadOpcodeForSpill(RC); NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opcode), DestReg), FrameIdx)); - PPCFunctionInfo *FuncInfo = MF.getInfo(); - - if (PPC::CRRCRegClass.hasSubClassEq(RC) || - PPC::CRBITRCRegClass.hasSubClassEq(RC)) - FuncInfo->setSpillsCR(); - - if (isXFormMemOp(Opcode)) - FuncInfo->setHasNonRISpills(); } void PPCInstrInfo::loadRegFromStackSlotNoUpd( @@ -1992,9 +1984,6 @@ void PPCInstrInfo::loadRegFromStackSlotNoUpd( DebugLoc DL; if (MI != MBB.end()) DL = MI->getDebugLoc(); - PPCFunctionInfo *FuncInfo = MF.getInfo(); - FuncInfo->setHasSpills(); - LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs); for (unsigned i = 0, e = NewMIs.size(); i != e; ++i) From 4fc75062745eb5232ea60c37b9ffe61177efa12a Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Fri, 19 Jan 2024 05:30:29 +0100 Subject: [PATCH 072/843] Revert "[flang] Fix seg fault `CodeGenAction::executeAction()` (#78269)" (#78667) This reverts commit 99cae9a44fca4cfbd6ee82f196051cbdf6571fa1. Temporarily until I reproduce and fix a linker issue: ``` FAILED: tools/flang/unittests/Frontend/FlangFrontendTests ... /usr/bin/ld: tools/flang/unittests/Frontend/CMakeFiles/FlangFrontendTests.dir/CodeGenActionTest.cpp.o: undefined reference to symbol '_ZN4llvm11LLVMContextC1Ev' /usr/bin/ld: /work1/omp-nightly/build/git/trunk18.0/build/llvm-project/lib/libLLVMCore.so.18git: error adding symbols: DSO missing from command line ``` --- flang/lib/Frontend/FrontendActions.cpp | 5 - flang/unittests/Frontend/CMakeLists.txt | 1 - .../unittests/Frontend/CodeGenActionTest.cpp | 109 ------------------ 3 files changed, 115 deletions(-) delete mode 100644 flang/unittests/Frontend/CodeGenActionTest.cpp diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp index 65c4df7388f97..74e3992d5ab62 100644 --- a/flang/lib/Frontend/FrontendActions.cpp +++ b/flang/lib/Frontend/FrontendActions.cpp @@ -1202,11 +1202,6 @@ void CodeGenAction::executeAction() { if (!llvmModule) generateLLVMIR(); - // If generating the LLVM module failed, abort! No need for further error - // reporting since generateLLVMIR() does this already. - if (!llvmModule) - return; - // Set the triple based on the targetmachine (this comes compiler invocation // and the command-line target option if specified, or the default if not // given on the command-line). 
diff --git a/flang/unittests/Frontend/CMakeLists.txt b/flang/unittests/Frontend/CMakeLists.txt index 3bcc37bed7f6d..79a394f161ed1 100644 --- a/flang/unittests/Frontend/CMakeLists.txt +++ b/flang/unittests/Frontend/CMakeLists.txt @@ -4,7 +4,6 @@ set(LLVM_LINK_COMPONENTS ) add_flang_unittest(FlangFrontendTests - CodeGenActionTest.cpp CompilerInstanceTest.cpp FrontendActionTest.cpp ) diff --git a/flang/unittests/Frontend/CodeGenActionTest.cpp b/flang/unittests/Frontend/CodeGenActionTest.cpp deleted file mode 100644 index 9d798c7678ad1..0000000000000 --- a/flang/unittests/Frontend/CodeGenActionTest.cpp +++ /dev/null @@ -1,109 +0,0 @@ -//===- unittests/Frontend/CodeGenActionTest.cpp --- FrontendAction tests --===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Unit tests for CodeGenAction. -// -//===----------------------------------------------------------------------===// - -#include "mlir/IR/Builders.h" -#include "flang/Frontend/CompilerInstance.h" -#include "flang/Frontend/FrontendActions.h" -#include "flang/Frontend/TextDiagnosticPrinter.h" - -#include "gtest/gtest.h" - -#include - -using namespace Fortran::frontend; - -namespace test { -class DummyDialect : public ::mlir::Dialect { - explicit DummyDialect(::mlir::MLIRContext *context) - : ::mlir::Dialect(getDialectNamespace(), context, - ::mlir::TypeID::get()) { - initialize(); - } - - void initialize(); - friend class ::mlir::MLIRContext; - -public: - ~DummyDialect() override = default; - static constexpr ::llvm::StringLiteral getDialectNamespace() { - return ::llvm::StringLiteral("dummy"); - } -}; - -namespace dummy { -class FakeOp : public ::mlir::Op { -public: - using Op::Op; - - static llvm::StringRef getOperationName() { return "dummy.fake"; } - - static ::llvm::ArrayRef<::llvm::StringRef> getAttributeNames() { return {}; } - - static void build( - ::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &odsState) {} -}; -} // namespace dummy -} // namespace test - -MLIR_DECLARE_EXPLICIT_TYPE_ID(::test::DummyDialect) -MLIR_DEFINE_EXPLICIT_TYPE_ID(::test::DummyDialect) - -namespace test { - -void DummyDialect::initialize() { addOperations<::test::dummy::FakeOp>(); } -} // namespace test - -// A test CodeGenAction to verify that we gracefully handle failure to convert -// from MLIR to LLVM IR. -class LLVMConversionFailureCodeGenAction : public CodeGenAction { -public: - LLVMConversionFailureCodeGenAction() - : CodeGenAction(BackendActionTy::Backend_EmitLL) { - mlirCtx = std::make_unique(); - mlirCtx->loadDialect(); - - mlir::Location loc(mlir::UnknownLoc::get(mlirCtx.get())); - mlirModule = - std::make_unique(mlir::ModuleOp::create(loc, "mod")); - - mlir::OpBuilder builder(mlirCtx.get()); - builder.setInsertionPointToStart(&mlirModule->getRegion().front()); - // Create a fake op to trip conversion to LLVM. 
- builder.create(loc);
-
- llvmCtx = std::make_unique();
- }
-};
-
-TEST(CodeGenAction, GracefullyHandleLLVMConversionFailure) {
- std::string diagnosticOutput;
- llvm::raw_string_ostream diagnosticsOS(diagnosticOutput);
- auto diagPrinter = std::make_unique(
- diagnosticsOS, new clang::DiagnosticOptions());
-
- CompilerInstance ci;
- ci.createDiagnostics(diagPrinter.get(), /*ShouldOwnClient=*/false);
- ci.setInvocation(std::make_shared());
- ci.setOutputStream(std::make_unique());
- ci.getInvocation().getCodeGenOpts().OptimizationLevel = 0;
-
- FrontendInputFile file("/dev/null", InputKind());
-
- LLVMConversionFailureCodeGenAction action;
- action.setInstance(&ci);
- action.setCurrentInput(file);
-
- consumeError(action.execute());
- ASSERT_EQ(diagnosticsOS.str(),
- "error: Lowering to LLVM IR failed\n"
- "error: failed to create the LLVM module\n");
-}

From b8967e003e202cba1b77412478a1990c9dcccdca Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Thu, 18 Jan 2024 21:20:28 -0800
Subject: [PATCH 073/843] [flang][openacc] Support multiple device_type when
 lowering (#78634)

routine, data, parallel, serial, kernels and loop constructs all support
the device_type clause. This clause takes a list of device_type values.
Previously the lowering code was assuming that the list is a single item.
This PR updates the lowering to handle any number of device_types.
---
 flang/lib/Lower/OpenACC.cpp | 262 +++++++++++--------
 flang/test/Lower/OpenACC/acc-device-type.f90 | 4 +
 flang/test/Lower/OpenACC/acc-loop.f90 | 6 +
 flang/test/Lower/OpenACC/acc-routine.f90 | 5 +
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 5 +-
 5 files changed, 175 insertions(+), 107 deletions(-)

diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index fd89d27db74dc..682ca06cabd6f 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -1470,15 +1470,19 @@ genAsyncClause(Fortran::lower::AbstractConverter &converter,
 llvm::SmallVector &async,
 llvm::SmallVector &asyncDeviceTypes,
 llvm::SmallVector &asyncOnlyDeviceTypes,
- mlir::acc::DeviceTypeAttr deviceTypeAttr,
+ llvm::SmallVector &deviceTypeAttrs,
 Fortran::lower::StatementContext &stmtCtx) {
 const auto &asyncClauseValue = asyncClause->v;
 if (asyncClauseValue) { // async has a value.
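// (Editorial sketch, not part of the diff: the hunk below is the commit's
// core pattern for scalar clauses. The clause expression is evaluated once,
// and the resulting value is then recorded once per device_type attribute
// currently in effect, keeping the value vector and its device-type vector
// index-aligned. Assuming the surrounding declarations, the new shape is:
//
//   mlir::Value asyncValue = /* genExprValue(...), evaluated once */;
//   for (mlir::Attribute dt : deviceTypeAttrs) {
//     async.push_back(asyncValue);    // same value for every device type
//     asyncDeviceTypes.push_back(dt); // parallel attribute array
//   }
//
// When the clause carries no value, only asyncOnlyDeviceTypes grows.)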
- async.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(*asyncClauseValue), stmtCtx))); - asyncDeviceTypes.push_back(deviceTypeAttr); + mlir::Value asyncValue = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*asyncClauseValue), stmtCtx)); + for (auto deviceTypeAttr : deviceTypeAttrs) { + async.push_back(asyncValue); + asyncDeviceTypes.push_back(deviceTypeAttr); + } } else { - asyncOnlyDeviceTypes.push_back(deviceTypeAttr); + for (auto deviceTypeAttr : deviceTypeAttrs) + asyncOnlyDeviceTypes.push_back(deviceTypeAttr); } } @@ -1504,10 +1508,9 @@ getDeviceType(Fortran::common::OpenACCDeviceType device) { } static void gatherDeviceTypeAttrs( - fir::FirOpBuilder &builder, mlir::Location clauseLocation, + fir::FirOpBuilder &builder, const Fortran::parser::AccClause::DeviceType *deviceTypeClause, - llvm::SmallVector &deviceTypes, - Fortran::lower::StatementContext &stmtCtx) { + llvm::SmallVector &deviceTypes) { const Fortran::parser::AccDeviceTypeExprList &deviceTypeExprList = deviceTypeClause->v; for (const auto &deviceTypeExpr : deviceTypeExprList.v) @@ -1560,20 +1563,25 @@ genWaitClause(Fortran::lower::AbstractConverter &converter, llvm::SmallVector &waitOperandsDeviceTypes, llvm::SmallVector &waitOnlyDeviceTypes, llvm::SmallVector &waitOperandsSegments, - mlir::Value &waitDevnum, mlir::acc::DeviceTypeAttr deviceTypeAttr, + mlir::Value &waitDevnum, + llvm::SmallVector deviceTypeAttrs, Fortran::lower::StatementContext &stmtCtx) { const auto &waitClauseValue = waitClause->v; if (waitClauseValue) { // wait has a value. const Fortran::parser::AccWaitArgument &waitArg = *waitClauseValue; const auto &waitList = std::get>(waitArg.t); - auto crtWaitOperands = waitOperands.size(); + llvm::SmallVector waitValues; for (const Fortran::parser::ScalarIntExpr &value : waitList) { - waitOperands.push_back(fir::getBase(converter.genExprValue( + waitValues.push_back(fir::getBase(converter.genExprValue( *Fortran::semantics::GetExpr(value), stmtCtx))); } - waitOperandsDeviceTypes.push_back(deviceTypeAttr); - waitOperandsSegments.push_back(waitOperands.size() - crtWaitOperands); + for (auto deviceTypeAttr : deviceTypeAttrs) { + for (auto value : waitValues) + waitOperands.push_back(value); + waitOperandsDeviceTypes.push_back(deviceTypeAttr); + waitOperandsSegments.push_back(waitValues.size()); + } // TODO: move to device_type model. const auto &waitDevnumValue = @@ -1582,7 +1590,8 @@ genWaitClause(Fortran::lower::AbstractConverter &converter, waitDevnum = fir::getBase(converter.genExprValue( *Fortran::semantics::GetExpr(*waitDevnumValue), stmtCtx)); } else { - waitOnlyDeviceTypes.push_back(deviceTypeAttr); + for (auto deviceTypeAttr : deviceTypeAttrs) + waitOnlyDeviceTypes.push_back(deviceTypeAttr); } } @@ -1610,91 +1619,112 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, // device_type attribute is set to `none` until a device_type clause is // encountered. 
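// (Sketch under assumed names, mirroring the hunks below: once a device_type
// clause is encountered, crtDeviceTypes is refilled via gatherDeviceTypeAttrs
// and every subsequent clause fans out over it. A clause carrying N operand
// values under M active device types therefore appends N*M entries to the
// flat operand vector plus M segment sizes, so the flat list can later be
// sliced back apart per device type, e.g. for wait(...):
//
//   for (mlir::Attribute dt : crtDeviceTypes) {          // M iterations
//     for (mlir::Value v : waitValues)                   // N values each
//       waitOperands.push_back(v);
//     waitOperandsDeviceTypes.push_back(dt);
//     waitOperandsSegments.push_back(waitValues.size()); // segment length
//   }
// )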
- auto crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( - builder.getContext(), mlir::acc::DeviceType::None); + llvm::SmallVector crtDeviceTypes; + crtDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + builder.getContext(), mlir::acc::DeviceType::None)); for (const Fortran::parser::AccClause &clause : accClauseList.v) { mlir::Location clauseLocation = converter.genLocation(clause.source); if (const auto *gangClause = std::get_if(&clause.u)) { if (gangClause->v) { - auto crtGangOperands = gangOperands.size(); const Fortran::parser::AccGangArgList &x = *gangClause->v; + mlir::SmallVector gangValues; + mlir::SmallVector gangArgs; for (const Fortran::parser::AccGangArg &gangArg : x.v) { if (const auto *num = std::get_if(&gangArg.u)) { - gangOperands.push_back(fir::getBase(converter.genExprValue( + gangValues.push_back(fir::getBase(converter.genExprValue( *Fortran::semantics::GetExpr(num->v), stmtCtx))); - gangArgTypes.push_back(mlir::acc::GangArgTypeAttr::get( + gangArgs.push_back(mlir::acc::GangArgTypeAttr::get( builder.getContext(), mlir::acc::GangArgType::Num)); } else if (const auto *staticArg = std::get_if( &gangArg.u)) { const Fortran::parser::AccSizeExpr &sizeExpr = staticArg->v; if (sizeExpr.v) { - gangOperands.push_back(fir::getBase(converter.genExprValue( + gangValues.push_back(fir::getBase(converter.genExprValue( *Fortran::semantics::GetExpr(*sizeExpr.v), stmtCtx))); } else { // * was passed as value and will be represented as a special // constant. - gangOperands.push_back(builder.createIntegerConstant( + gangValues.push_back(builder.createIntegerConstant( clauseLocation, builder.getIndexType(), starCst)); } - gangArgTypes.push_back(mlir::acc::GangArgTypeAttr::get( + gangArgs.push_back(mlir::acc::GangArgTypeAttr::get( builder.getContext(), mlir::acc::GangArgType::Static)); } else if (const auto *dim = std::get_if( &gangArg.u)) { - gangOperands.push_back(fir::getBase(converter.genExprValue( + gangValues.push_back(fir::getBase(converter.genExprValue( *Fortran::semantics::GetExpr(dim->v), stmtCtx))); - gangArgTypes.push_back(mlir::acc::GangArgTypeAttr::get( + gangArgs.push_back(mlir::acc::GangArgTypeAttr::get( builder.getContext(), mlir::acc::GangArgType::Dim)); } } - gangOperandsSegments.push_back(gangOperands.size() - crtGangOperands); - gangOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + for (auto crtDeviceTypeAttr : crtDeviceTypes) { + for (const auto &pair : llvm::zip(gangValues, gangArgs)) { + gangOperands.push_back(std::get<0>(pair)); + gangArgTypes.push_back(std::get<1>(pair)); + } + gangOperandsSegments.push_back(gangValues.size()); + gangOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + } } else { - gangDeviceTypes.push_back(crtDeviceTypeAttr); + for (auto crtDeviceTypeAttr : crtDeviceTypes) + gangDeviceTypes.push_back(crtDeviceTypeAttr); } } else if (const auto *workerClause = std::get_if(&clause.u)) { if (workerClause->v) { - workerNumOperands.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(*workerClause->v), stmtCtx))); - workerNumOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + mlir::Value workerNumValue = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*workerClause->v), stmtCtx)); + for (auto crtDeviceTypeAttr : crtDeviceTypes) { + workerNumOperands.push_back(workerNumValue); + workerNumOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + } } else { - workerNumDeviceTypes.push_back(crtDeviceTypeAttr); + for (auto crtDeviceTypeAttr : crtDeviceTypes) + workerNumDeviceTypes.push_back(crtDeviceTypeAttr); } } 
else if (const auto *vectorClause = std::get_if(&clause.u)) { if (vectorClause->v) { - vectorOperands.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(*vectorClause->v), stmtCtx))); - vectorOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + mlir::Value vectorValue = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*vectorClause->v), stmtCtx)); + for (auto crtDeviceTypeAttr : crtDeviceTypes) { + vectorOperands.push_back(vectorValue); + vectorOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + } } else { - vectorDeviceTypes.push_back(crtDeviceTypeAttr); + for (auto crtDeviceTypeAttr : crtDeviceTypes) + vectorDeviceTypes.push_back(crtDeviceTypeAttr); } } else if (const auto *tileClause = std::get_if(&clause.u)) { const Fortran::parser::AccTileExprList &accTileExprList = tileClause->v; - auto crtTileOperands = tileOperands.size(); + llvm::SmallVector tileValues; for (const auto &accTileExpr : accTileExprList.v) { const auto &expr = std::get>( accTileExpr.t); if (expr) { - tileOperands.push_back(fir::getBase(converter.genExprValue( + tileValues.push_back(fir::getBase(converter.genExprValue( *Fortran::semantics::GetExpr(*expr), stmtCtx))); } else { // * was passed as value and will be represented as a special // constant. mlir::Value tileStar = builder.createIntegerConstant( clauseLocation, builder.getIntegerType(32), starCst); - tileOperands.push_back(tileStar); + tileValues.push_back(tileStar); } } - tileOperandsDeviceTypes.push_back(crtDeviceTypeAttr); - tileOperandsSegments.push_back(tileOperands.size() - crtTileOperands); + for (auto crtDeviceTypeAttr : crtDeviceTypes) { + for (auto value : tileValues) + tileOperands.push_back(value); + tileOperandsDeviceTypes.push_back(crtDeviceTypeAttr); + tileOperandsSegments.push_back(tileValues.size()); + } } else if (const auto *privateClause = std::get_if( &clause.u)) { @@ -1707,21 +1737,20 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, genReductions(reductionClause->v, converter, semanticsContext, stmtCtx, reductionOperands, reductionRecipes); } else if (std::get_if(&clause.u)) { - seqDeviceTypes.push_back(crtDeviceTypeAttr); + for (auto crtDeviceTypeAttr : crtDeviceTypes) + seqDeviceTypes.push_back(crtDeviceTypeAttr); } else if (std::get_if( &clause.u)) { - independentDeviceTypes.push_back(crtDeviceTypeAttr); + for (auto crtDeviceTypeAttr : crtDeviceTypes) + independentDeviceTypes.push_back(crtDeviceTypeAttr); } else if (std::get_if(&clause.u)) { - autoDeviceTypes.push_back(crtDeviceTypeAttr); + for (auto crtDeviceTypeAttr : crtDeviceTypes) + autoDeviceTypes.push_back(crtDeviceTypeAttr); } else if (const auto *deviceTypeClause = std::get_if( &clause.u)) { - const Fortran::parser::AccDeviceTypeExprList &deviceTypeExprList = - deviceTypeClause->v; - assert(deviceTypeExprList.v.size() == 1 && - "expect only one device_type expr"); - crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( - builder.getContext(), getDeviceType(deviceTypeExprList.v.front().v)); + crtDeviceTypes.clear(); + gatherDeviceTypeAttrs(builder, deviceTypeClause, crtDeviceTypes); } else if (const auto *collapseClause = std::get_if( &clause.u)) { @@ -1729,14 +1758,18 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, const auto &force = std::get(arg.t); if (force) TODO(clauseLocation, "OpenACC collapse force modifier"); + const auto &intExpr = std::get(arg.t); const auto *expr = Fortran::semantics::GetExpr(intExpr); const std::optional collapseValue = Fortran::evaluate::ToInt64(*expr); assert(collapseValue && 
"expect integer value for the collapse clause"); - collapseValues.push_back(*collapseValue); - collapseDeviceTypes.push_back(crtDeviceTypeAttr); + + for (auto crtDeviceTypeAttr : crtDeviceTypes) { + collapseValues.push_back(*collapseValue); + collapseDeviceTypes.push_back(crtDeviceTypeAttr); + } } } @@ -1923,45 +1956,56 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, // device_type attribute is set to `none` until a device_type clause is // encountered. + llvm::SmallVector crtDeviceTypes; auto crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( builder.getContext(), mlir::acc::DeviceType::None); + crtDeviceTypes.push_back(crtDeviceTypeAttr); - // Lower clauses values mapped to operands. - // Keep track of each group of operands separatly as clauses can appear + // Lower clauses values mapped to operands and array attributes. + // Keep track of each group of operands separately as clauses can appear // more than once. for (const Fortran::parser::AccClause &clause : accClauseList.v) { mlir::Location clauseLocation = converter.genLocation(clause.source); if (const auto *asyncClause = std::get_if(&clause.u)) { genAsyncClause(converter, asyncClause, async, asyncDeviceTypes, - asyncOnlyDeviceTypes, crtDeviceTypeAttr, stmtCtx); + asyncOnlyDeviceTypes, crtDeviceTypes, stmtCtx); } else if (const auto *waitClause = std::get_if(&clause.u)) { genWaitClause(converter, waitClause, waitOperands, waitOperandsDeviceTypes, waitOnlyDeviceTypes, - waitOperandsSegments, waitDevnum, crtDeviceTypeAttr, - stmtCtx); + waitOperandsSegments, waitDevnum, crtDeviceTypes, stmtCtx); } else if (const auto *numGangsClause = std::get_if( &clause.u)) { - auto crtNumGangs = numGangs.size(); + llvm::SmallVector numGangValues; for (const Fortran::parser::ScalarIntExpr &expr : numGangsClause->v) - numGangs.push_back(fir::getBase(converter.genExprValue( + numGangValues.push_back(fir::getBase(converter.genExprValue( *Fortran::semantics::GetExpr(expr), stmtCtx))); - numGangsDeviceTypes.push_back(crtDeviceTypeAttr); - numGangsSegments.push_back(numGangs.size() - crtNumGangs); + for (auto crtDeviceTypeAttr : crtDeviceTypes) { + for (auto value : numGangValues) + numGangs.push_back(value); + numGangsDeviceTypes.push_back(crtDeviceTypeAttr); + numGangsSegments.push_back(numGangValues.size()); + } } else if (const auto *numWorkersClause = std::get_if( &clause.u)) { - numWorkers.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(numWorkersClause->v), stmtCtx))); - numWorkersDeviceTypes.push_back(crtDeviceTypeAttr); + mlir::Value numWorkerValue = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(numWorkersClause->v), stmtCtx)); + for (auto crtDeviceTypeAttr : crtDeviceTypes) { + numWorkers.push_back(numWorkerValue); + numWorkersDeviceTypes.push_back(crtDeviceTypeAttr); + } } else if (const auto *vectorLengthClause = std::get_if( &clause.u)) { - vectorLength.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(vectorLengthClause->v), stmtCtx))); - vectorLengthDeviceTypes.push_back(crtDeviceTypeAttr); + mlir::Value vectorLengthValue = fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(vectorLengthClause->v), stmtCtx)); + for (auto crtDeviceTypeAttr : crtDeviceTypes) { + vectorLength.push_back(vectorLengthValue); + vectorLengthDeviceTypes.push_back(crtDeviceTypeAttr); + } } else if (const auto *ifClause = std::get_if(&clause.u)) { genIfClause(converter, clauseLocation, ifClause, ifCond, stmtCtx); @@ -2115,12 +2159,8 @@ 
createComputeOp(Fortran::lower::AbstractConverter &converter, } else if (const auto *deviceTypeClause = std::get_if( &clause.u)) { - const Fortran::parser::AccDeviceTypeExprList &deviceTypeExprList = - deviceTypeClause->v; - assert(deviceTypeExprList.v.size() == 1 && - "expect only one device_type expr"); - crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( - builder.getContext(), getDeviceType(deviceTypeExprList.v.front().v)); + crtDeviceTypes.clear(); + gatherDeviceTypeAttrs(builder, deviceTypeClause, crtDeviceTypes); } } @@ -2239,10 +2279,11 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, // device_type attribute is set to `none` until a device_type clause is // encountered. - auto crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( - builder.getContext(), mlir::acc::DeviceType::None); + llvm::SmallVector crtDeviceTypes; + crtDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + builder.getContext(), mlir::acc::DeviceType::None)); - // Lower clauses values mapped to operands. + // Lower clauses values mapped to operands and array attributes. // Keep track of each group of operands separately as clauses can appear // more than once. for (const Fortran::parser::AccClause &clause : accClauseList.v) { @@ -2323,19 +2364,23 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, } else if (const auto *asyncClause = std::get_if(&clause.u)) { genAsyncClause(converter, asyncClause, async, asyncDeviceTypes, - asyncOnlyDeviceTypes, crtDeviceTypeAttr, stmtCtx); + asyncOnlyDeviceTypes, crtDeviceTypes, stmtCtx); } else if (const auto *waitClause = std::get_if(&clause.u)) { genWaitClause(converter, waitClause, waitOperands, waitOperandsDeviceTypes, waitOnlyDeviceTypes, - waitOperandsSegments, waitDevnum, crtDeviceTypeAttr, - stmtCtx); + waitOperandsSegments, waitDevnum, crtDeviceTypes, stmtCtx); } else if(const auto *defaultClause = std::get_if(&clause.u)) { if ((defaultClause->v).v == llvm::acc::DefaultValue::ACC_Default_none) hasDefaultNone = true; else if ((defaultClause->v).v == llvm::acc::DefaultValue::ACC_Default_present) hasDefaultPresent = true; + } else if (const auto *deviceTypeClause = + std::get_if( + &clause.u)) { + crtDeviceTypes.clear(); + gatherDeviceTypeAttrs(builder, deviceTypeClause, crtDeviceTypes); } } @@ -2727,8 +2772,7 @@ genACCInitShutdownOp(Fortran::lower::AbstractConverter &converter, } else if (const auto *deviceTypeClause = std::get_if( &clause.u)) { - gatherDeviceTypeAttrs(builder, clauseLocation, deviceTypeClause, - deviceTypes, stmtCtx); + gatherDeviceTypeAttrs(builder, deviceTypeClause, deviceTypes); } } @@ -2777,8 +2821,7 @@ void genACCSetOp(Fortran::lower::AbstractConverter &converter, } else if (const auto *deviceTypeClause = std::get_if( &clause.u)) { - gatherDeviceTypeAttrs(builder, clauseLocation, deviceTypeClause, - deviceTypes, stmtCtx); + gatherDeviceTypeAttrs(builder, deviceTypeClause, deviceTypes); } } @@ -2835,8 +2878,7 @@ genACCUpdateOp(Fortran::lower::AbstractConverter &converter, } else if (const auto *deviceTypeClause = std::get_if( &clause.u)) { - gatherDeviceTypeAttrs(builder, clauseLocation, deviceTypeClause, - deviceTypes, stmtCtx); + gatherDeviceTypeAttrs(builder, deviceTypeClause, deviceTypes); } else if (const auto *hostClause = std::get_if(&clause.u)) { genDataOperandOperations( @@ -3592,15 +3634,16 @@ void Fortran::lower::genOpenACCRoutineConstruct( // device_type attribute is set to `none` until a device_type clause is // encountered. 
- auto crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( - builder.getContext(), mlir::acc::DeviceType::None); + llvm::SmallVector crtDeviceTypes; + crtDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get( + builder.getContext(), mlir::acc::DeviceType::None)); for (const Fortran::parser::AccClause &clause : clauses.v) { if (std::get_if(&clause.u)) { - seqDeviceTypes.push_back(crtDeviceTypeAttr); + for (auto crtDeviceTypeAttr : crtDeviceTypes) + seqDeviceTypes.push_back(crtDeviceTypeAttr); } else if (const auto *gangClause = std::get_if(&clause.u)) { - if (gangClause->v) { const Fortran::parser::AccGangArgList &x = *gangClause->v; for (const Fortran::parser::AccGangArg &gangArg : x.v) { @@ -3611,27 +3654,36 @@ void Fortran::lower::genOpenACCRoutineConstruct( if (!dimValue) mlir::emitError(loc, "dim value must be a constant positive integer"); - gangDimValues.push_back( - builder.getIntegerAttr(builder.getI64Type(), *dimValue)); - gangDimDeviceTypes.push_back(crtDeviceTypeAttr); + mlir::Attribute gangDimAttr = + builder.getIntegerAttr(builder.getI64Type(), *dimValue); + for (auto crtDeviceTypeAttr : crtDeviceTypes) { + gangDimValues.push_back(gangDimAttr); + gangDimDeviceTypes.push_back(crtDeviceTypeAttr); + } } } } else { - gangDeviceTypes.push_back(crtDeviceTypeAttr); + for (auto crtDeviceTypeAttr : crtDeviceTypes) + gangDeviceTypes.push_back(crtDeviceTypeAttr); } } else if (std::get_if(&clause.u)) { - vectorDeviceTypes.push_back(crtDeviceTypeAttr); + for (auto crtDeviceTypeAttr : crtDeviceTypes) + vectorDeviceTypes.push_back(crtDeviceTypeAttr); } else if (std::get_if(&clause.u)) { - workerDeviceTypes.push_back(crtDeviceTypeAttr); + for (auto crtDeviceTypeAttr : crtDeviceTypes) + workerDeviceTypes.push_back(crtDeviceTypeAttr); } else if (std::get_if(&clause.u)) { hasNohost = true; } else if (const auto *bindClause = std::get_if(&clause.u)) { if (const auto *name = std::get_if(&bindClause->v.u)) { - bindNames.push_back( - builder.getStringAttr(converter.mangleName(*name->symbol))); - bindNameDeviceTypes.push_back(crtDeviceTypeAttr); + mlir::Attribute bindNameAttr = + builder.getStringAttr(converter.mangleName(*name->symbol)); + for (auto crtDeviceTypeAttr : crtDeviceTypes) { + bindNames.push_back(bindNameAttr); + bindNameDeviceTypes.push_back(crtDeviceTypeAttr); + } } else if (const auto charExpr = std::get_if( &bindClause->v.u)) { @@ -3640,18 +3692,18 @@ void Fortran::lower::genOpenACCRoutineConstruct( *charExpr); if (!name) mlir::emitError(loc, "Could not retrieve the bind name"); - bindNames.push_back(builder.getStringAttr(*name)); - bindNameDeviceTypes.push_back(crtDeviceTypeAttr); + + mlir::Attribute bindNameAttr = builder.getStringAttr(*name); + for (auto crtDeviceTypeAttr : crtDeviceTypes) { + bindNames.push_back(bindNameAttr); + bindNameDeviceTypes.push_back(crtDeviceTypeAttr); + } } } else if (const auto *deviceTypeClause = std::get_if( &clause.u)) { - const Fortran::parser::AccDeviceTypeExprList &deviceTypeExprList = - deviceTypeClause->v; - assert(deviceTypeExprList.v.size() == 1 && - "expect only one device_type expr"); - crtDeviceTypeAttr = mlir::acc::DeviceTypeAttr::get( - builder.getContext(), getDeviceType(deviceTypeExprList.v.front().v)); + crtDeviceTypes.clear(); + gatherDeviceTypeAttrs(builder, deviceTypeClause, crtDeviceTypes); } } diff --git a/flang/test/Lower/OpenACC/acc-device-type.f90 b/flang/test/Lower/OpenACC/acc-device-type.f90 index 871dbc95f60fc..ae01d0dc5fcde 100644 --- a/flang/test/Lower/OpenACC/acc-device-type.f90 +++ 
b/flang/test/Lower/OpenACC/acc-device-type.f90 @@ -40,5 +40,9 @@ subroutine sub1() ! CHECK: acc.parallel num_gangs({%c2{{.*}} : i32}, {%c1{{.*}} : i32, %c1{{.*}} : i32, %c1{{.*}} : i32} [#acc.device_type]) + !$acc parallel device_type(nvidia, default) num_gangs(1, 1, 1) + !$acc end parallel + +! CHECK: acc.parallel num_gangs({%c1{{.*}} : i32, %c1{{.*}} : i32, %c1{{.*}} : i32} [#acc.device_type], {%c1{{.*}} : i32, %c1{{.*}} : i32, %c1{{.*}} : i32} [#acc.device_type]) end subroutine diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90 index 42e14afb35f52..59c2513332a97 100644 --- a/flang/test/Lower/OpenACC/acc-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-loop.f90 @@ -326,4 +326,10 @@ program acc_loop ! CHECK: acc.loop gang([#acc.device_type], {num=%c8{{.*}} : i32} [#acc.device_type]) + !$acc loop device_type(nvidia, default) gang + DO i = 1, n + END DO + +! CHECK: acc.loop gang([#acc.device_type, #acc.device_type]) { + end program diff --git a/flang/test/Lower/OpenACC/acc-routine.f90 b/flang/test/Lower/OpenACC/acc-routine.f90 index 2fe150e70b0cf..1170af18bc334 100644 --- a/flang/test/Lower/OpenACC/acc-routine.f90 +++ b/flang/test/Lower/OpenACC/acc-routine.f90 @@ -2,6 +2,7 @@ ! RUN: bbc -fopenacc -emit-hlfir %s -o - | FileCheck %s +! CHECK: acc.routine @acc_routine_17 func(@_QPacc_routine19) bind("_QPacc_routine17" [#acc.device_type], "_QPacc_routine17" [#acc.device_type], "_QPacc_routine16" [#acc.device_type]) ! CHECK: acc.routine @acc_routine_16 func(@_QPacc_routine18) bind("_QPacc_routine17" [#acc.device_type], "_QPacc_routine16" [#acc.device_type]) ! CHECK: acc.routine @acc_routine_15 func(@_QPacc_routine17) worker ([#acc.device_type]) vector ([#acc.device_type]) ! CHECK: acc.routine @acc_routine_14 func(@_QPacc_routine16) gang([#acc.device_type]) seq ([#acc.device_type]) @@ -120,3 +121,7 @@ subroutine acc_routine17() subroutine acc_routine18() !$acc routine device_type(host) bind(acc_routine17) dtype(multicore) bind(acc_routine16) end subroutine + +subroutine acc_routine19() + !$acc routine device_type(host,default) bind(acc_routine17) dtype(multicore) bind(acc_routine16) +end subroutine diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index 20465f6bb86ed..bc03adbcae64d 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -1449,7 +1449,8 @@ void printGangClause(OpAsmPrinter &p, Operation *op, std::optional segments, std::optional gangOnlyDeviceTypes) { - if (operands.begin() == operands.end() && gangOnlyDeviceTypes && + if (operands.begin() == operands.end() && + hasDeviceTypeValues(gangOnlyDeviceTypes) && gangOnlyDeviceTypes->size() == 1) { auto deviceTypeAttr = mlir::dyn_cast((*gangOnlyDeviceTypes)[0]); @@ -1464,7 +1465,7 @@ void printGangClause(OpAsmPrinter &p, Operation *op, hasDeviceTypeValues(deviceTypes)) p << ", "; - if (deviceTypes) { + if (hasDeviceTypeValues(deviceTypes)) { unsigned opIdx = 0; llvm::interleaveComma(llvm::enumerate(*deviceTypes), p, [&](auto it) { p << "{"; From dbc09553149dee5ff5d4a8f544f56df5f01c584a Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Thu, 18 Jan 2024 21:45:58 -0800 Subject: [PATCH 074/843] [libc] Provide sys/queue.h (#78081) This header first appeared in 4.4BSD and is provided by a number of C libraries including Newlib. Several of our embedded projects use this header and so to make LLVM libc a drop-in replacement, we need to provide it as well. 
For the initial commit, we only implement singly linked variants (SLIST and STAILQ). The doubly linked variants (LIST, TAILQ and CIRCLEQ) can be implemented in the future as needed. --- libc/config/baremetal/arm/headers.txt | 1 + libc/config/baremetal/riscv/headers.txt | 1 + libc/config/linux/riscv/headers.txt | 1 + libc/config/linux/x86_64/headers.txt | 1 + libc/include/CMakeLists.txt | 8 + libc/include/llvm-libc-macros/CMakeLists.txt | 28 +- .../llvm-libc-macros/containerof-macro.h | 20 ++ .../include/llvm-libc-macros/offsetof-macro.h | 15 + .../llvm-libc-macros/sys-queue-macros.h | 260 ++++++++++++++++++ libc/include/sys/queue.h | 14 + libc/test/CMakeLists.txt | 1 + libc/test/include/CMakeLists.txt | 16 ++ libc/test/include/sys/queue_test.cpp | 168 +++++++++++ 13 files changed, 533 insertions(+), 1 deletion(-) create mode 100644 libc/include/llvm-libc-macros/containerof-macro.h create mode 100644 libc/include/llvm-libc-macros/offsetof-macro.h create mode 100644 libc/include/llvm-libc-macros/sys-queue-macros.h create mode 100644 libc/include/sys/queue.h create mode 100644 libc/test/include/CMakeLists.txt create mode 100644 libc/test/include/sys/queue_test.cpp diff --git a/libc/config/baremetal/arm/headers.txt b/libc/config/baremetal/arm/headers.txt index e7be1fd80e875..6ff51f9786772 100644 --- a/libc/config/baremetal/arm/headers.txt +++ b/libc/config/baremetal/arm/headers.txt @@ -8,4 +8,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.stdlib libc.include.string libc.include.strings + libc.include.sys_queue ) diff --git a/libc/config/baremetal/riscv/headers.txt b/libc/config/baremetal/riscv/headers.txt index e7be1fd80e875..6ff51f9786772 100644 --- a/libc/config/baremetal/riscv/headers.txt +++ b/libc/config/baremetal/riscv/headers.txt @@ -8,4 +8,5 @@ set(TARGET_PUBLIC_HEADERS libc.include.stdlib libc.include.string libc.include.strings + libc.include.sys_queue ) diff --git a/libc/config/linux/riscv/headers.txt b/libc/config/linux/riscv/headers.txt index 3e2b1630f1695..9c70a3bde74f0 100644 --- a/libc/config/linux/riscv/headers.txt +++ b/libc/config/linux/riscv/headers.txt @@ -31,6 +31,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.sys_mman libc.include.sys_prctl libc.include.sys_random + libc.include.sys_queue libc.include.sys_resource libc.include.sys_select libc.include.sys_socket diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt index 3e2b1630f1695..a85f87b2a3ee9 100644 --- a/libc/config/linux/x86_64/headers.txt +++ b/libc/config/linux/x86_64/headers.txt @@ -30,6 +30,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.sys_ioctl libc.include.sys_mman libc.include.sys_prctl + libc.include.sys_queue libc.include.sys_random libc.include.sys_resource libc.include.sys_select diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 2c2d1b9b0fd15..9a06f829eee18 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -345,6 +345,14 @@ add_gen_header( .llvm_libc_common_h ) +add_header( + sys_queue + HDR + sys/queue.h + DEPENDS + .llvm-libc-macros.sys_queue_macros +) + add_gen_header( sys_random DEF_FILE sys/random.h.def diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt index 9c9e6bfd12564..7b2616d4311d9 100644 --- a/libc/include/llvm-libc-macros/CMakeLists.txt +++ b/libc/include/llvm-libc-macros/CMakeLists.txt @@ -4,7 +4,7 @@ function(add_macro_header name) "MACRO_HEADER" "" # Optional arguments "HDR" # Single value arguments - "" # Multi-value arguments + "DEPENDS" # Multi-value 
arguments ${ARGN} ) if(TARGET libc.include.llvm-libc-macros.${LIBC_TARGET_OS}.${name}) @@ -14,12 +14,15 @@ function(add_macro_header name) ${MACRO_HEADER_HDR} DEPENDS .${LIBC_TARGET_OS}.${name} + ${MACRO_HEADER_DEPENDS} ) else() add_header( ${name} HDR ${MACRO_HEADER_HDR} + DEPENDS + ${MACRO_HEADER_DEPENDS} ) endif() endfunction(add_macro_header) @@ -70,6 +73,20 @@ add_macro_header( math-macros.h ) +add_macro_header( + offsetof_macro + HDR + offsetof-macro.h +) + +add_macro_header( + containerof_macro + HDR + containerof-macro.h + DEPENDS + .offsetof_macro +) + add_macro_header( sched_macros HDR @@ -118,6 +135,15 @@ add_macro_header( sys-mman-macros.h ) +add_macro_header( + sys_queue_macros + HDR + sys-queue-macros.h + DEPENDS + .null_macro + .containerof_macro +) + add_macro_header( sys_random_macros HDR diff --git a/libc/include/llvm-libc-macros/containerof-macro.h b/libc/include/llvm-libc-macros/containerof-macro.h new file mode 100644 index 0000000000000..ea91fa7097a4f --- /dev/null +++ b/libc/include/llvm-libc-macros/containerof-macro.h @@ -0,0 +1,20 @@ +//===-- Definition of the containerof macro -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __LLVM_LIBC_MACROS_CONTAINEROF_MACRO_H +#define __LLVM_LIBC_MACROS_CONTAINEROF_MACRO_H + +#include + +#define __containerof(ptr, type, member) \ + ({ \ + const __typeof(((type *)0)->member) *__ptr = (ptr); \ + (type *)(void *)((const char *)__ptr - offsetof(type, member)); \ + }) + +#endif // __LLVM_LIBC_MACROS_CONTAINEROF_MACRO_H diff --git a/libc/include/llvm-libc-macros/offsetof-macro.h b/libc/include/llvm-libc-macros/offsetof-macro.h new file mode 100644 index 0000000000000..eeceb3db110b6 --- /dev/null +++ b/libc/include/llvm-libc-macros/offsetof-macro.h @@ -0,0 +1,15 @@ +//===-- Definition of the offsetof macro ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __LLVM_LIBC_MACROS_OFFSETOF_MACRO_H +#define __LLVM_LIBC_MACROS_OFFSETOF_MACRO_H + +#define __need_offsetof +#include + +#endif // __LLVM_LIBC_MACROS_OFFSETOF_MACRO_H diff --git a/libc/include/llvm-libc-macros/sys-queue-macros.h b/libc/include/llvm-libc-macros/sys-queue-macros.h new file mode 100644 index 0000000000000..f41e7363e418b --- /dev/null +++ b/libc/include/llvm-libc-macros/sys-queue-macros.h @@ -0,0 +1,260 @@ +//===-- Macros defined in sys/queue.h header file -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __LLVM_LIBC_MACROS_SYS_QUEUE_MACROS_H +#define __LLVM_LIBC_MACROS_SYS_QUEUE_MACROS_H + +#include +#include + +#ifdef __cplusplus +#define QUEUE_TYPEOF(type) type +#else +#define QUEUE_TYPEOF(type) struct type +#endif + +// Singly-linked list definitions. 
+ +#define SLIST_HEAD(name, type) \ + struct name { \ + struct type *first; \ + } + +#define SLIST_CLASS_HEAD(name, type) \ + struct name { \ + class type *first; \ + } + +#define SLIST_HEAD_INITIALIZER(head) \ + { NULL } + +#define SLIST_ENTRY(type) \ + struct { \ + struct type *next; \ + } + +#define SLIST_CLASS_ENTRY(type) \ + struct { \ + class type *next; \ + } + +// Singly-linked list access methods. + +#define SLIST_EMPTY(head) ((head)->first == NULL) +#define SLIST_FIRST(head) ((head)->first) +#define SLIST_NEXT(elem, field) ((elem)->field.next) + +#define SLIST_FOREACH(var, head, field) \ + for ((var) = SLIST_FIRST(head); (var); (var) = SLIST_NEXT(var, field)) + +#define SLIST_FOREACH_FROM(var, head, field) \ + if (!(var)) \ + (var) = SLIST_FIRST(head); \ + for (; (var); (var) = SLIST_NEXT(var, field)) + +#define SLIST_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = SLIST_FIRST(head); \ + (var) && ((tvar) = SLIST_NEXT(var, field), 1); (var) = (tvar)) + +#define SLIST_FOREACH_FROM_SAFE(var, head, field, tvar) \ + if (!(var)) \ + (var) = SLIST_FIRST(head); \ + for (; (var) && ((tvar) = SLIST_NEXT(var, field), 1); (var) = (tvar)) + +// Singly-linked list functions. + +#define SLIST_CONCAT(head1, head2, type, field) \ + do { \ + if (SLIST_EMPTY(head1)) { \ + if ((SLIST_FIRST(head1) = SLIST_FIRST(head2)) != NULL) \ + SLIST_INIT(head2); \ + } else if (!SLIST_EMPTY(head2)) { \ + QUEUE_TYPEOF(type) *cur = SLIST_FIRST(head1); \ + while (SLIST_NEXT(cur, field) != NULL) \ + cur = SLIST_NEXT(cur, field); \ + SLIST_NEXT(cur, field) = SLIST_FIRST(head2); \ + SLIST_INIT(head2); \ + } \ + } while (0) + +#define SLIST_INIT(head) \ + do { \ + SLIST_FIRST(head) = NULL; \ + } while (0) + +#define SLIST_INSERT_AFTER(slistelem, elem, field) \ + do { \ + SLIST_NEXT(elem, field) = SLIST_NEXT(slistelem, field); \ + SLIST_NEXT(slistelem, field) = (elem); \ + } while (0) + +#define SLIST_INSERT_HEAD(head, elem, field) \ + do { \ + SLIST_NEXT(elem, field) = SLIST_FIRST(head); \ + SLIST_FIRST(head) = (elem); \ + } while (0) + +#define SLIST_REMOVE(head, elem, type, field) \ + do { \ + if (SLIST_FIRST(head) == (elem)) { \ + SLIST_REMOVE_HEAD(head, field); \ + } else { \ + QUEUE_TYPEOF(type) *cur = SLIST_FIRST(head); \ + while (SLIST_NEXT(cur, field) != (elem)) \ + cur = SLIST_NEXT(cur, field); \ + SLIST_REMOVE_AFTER(cur, field); \ + } \ + } while (0) + +#define SLIST_REMOVE_AFTER(elem, field) \ + do { \ + SLIST_NEXT(elem, field) = SLIST_NEXT(SLIST_NEXT(elem, field), field); \ + } while (0) + +#define SLIST_REMOVE_HEAD(head, field) \ + do { \ + SLIST_FIRST(head) = SLIST_NEXT(SLIST_FIRST(head), field); \ + } while (0) + +#define SLIST_SWAP(head1, head2, type) \ + do { \ + QUEUE_TYPEOF(type) *first = SLIST_FIRST(head1); \ + SLIST_FIRST(head1) = SLIST_FIRST(head2); \ + SLIST_FIRST(head2) = first; \ + } while (0) + +// Singly-linked tail queue definitions. + +#define STAILQ_HEAD(name, type) \ + struct name { \ + struct type *first; \ + struct type **last; \ + } + +#define STAILQ_CLASS_HEAD(name, type) \ + struct name { \ + class type *first; \ + class type **last; \ + } + +#define STAILQ_HEAD_INITIALIZER(head) \ + { NULL, &(head).first } + +#define STAILQ_ENTRY(type) \ + struct { \ + struct type *next; \ + } + +#define STAILQ_CLASS_ENTRY(type) \ + struct { \ + class type *next; \ + } + +// Singly-linked tail queue access methods. + +#define STAILQ_EMPTY(head) ((head)->first == NULL) +#define STAILQ_FIRST(head) ((head)->first) +#define STAILQ_LAST(head, type, field) \ + (STAILQ_EMPTY(head) ? 
NULL : __containerof((head)->last, type, field.next)) +#define STAILQ_NEXT(elem, field) ((elem)->field.next) + +#define STAILQ_FOREACH(var, head, field) \ + for ((var) = STAILQ_FIRST(head); (var); (var) = STAILQ_NEXT(var, field)) + +#define STAILQ_FOREACH_FROM(var, head, field) \ + if (!(var)) \ + (var) = STAILQ_FIRST(head); \ + for (; (var); (var) = STAILQ_NEXT(var, field)) + +#define STAILQ_FOREACH_SAFE(var, head, field, tvar) \ + for ((var) = STAILQ_FIRST(head); \ + (var) && ((tvar) = STAILQ_NEXT(var, field), 1); (var) = (tvar)) + +#define STAILQ_FOREACH_FROM_SAFE(var, head, field, tvar) \ + if (!(var)) \ + (var) = STAILQ_FIRST(head); \ + for (; (var) && ((tvar) = STAILQ_NEXT(var, field), 1); (var) = (tvar)) + +// Singly-linked tail queue functions. + +#define STAILQ_CONCAT(head1, head2, type, field) \ + do { \ + if (!STAILQ_EMPTY(head2)) { \ + *(head1)->last = (head2)->first; \ + (head1)->last = (head2)->last; \ + STAILQ_INIT(head2); \ + } \ + } while (0) + +#define STAILQ_INIT(head) \ + do { \ + STAILQ_FIRST(head) = NULL; \ + (head)->last = &STAILQ_FIRST(head); \ + } while (0) + +#define STAILQ_INSERT_AFTER(head, listelem, elem, field) \ + do { \ + if ((STAILQ_NEXT(elem, field) = STAILQ_NEXT(listelem, field)) == NULL) \ + (head)->last = &STAILQ_NEXT(elem, field); \ + STAILQ_NEXT(listelem, field) = (elem); \ + } while (0) + +#define STAILQ_INSERT_HEAD(head, elem, field) \ + do { \ + if ((STAILQ_NEXT(elem, field) = STAILQ_FIRST(head)) == NULL) \ + (head)->last = &STAILQ_NEXT(elem, field); \ + STAILQ_FIRST(head) = (elem); \ + } while (0) + +#define STAILQ_INSERT_TAIL(head, elem, field) \ + do { \ + STAILQ_NEXT(elem, field) = NULL; \ + *(head)->last = (elem); \ + (head)->last = &STAILQ_NEXT(elem, field); \ + } while (0) + +#define STAILQ_REMOVE(head, elem, type, field) \ + do { \ + if (STAILQ_FIRST(head) == (elem)) { \ + STAILQ_REMOVE_HEAD(head, field); \ + } else { \ + QUEUE_TYPEOF(type) *cur = STAILQ_FIRST(head); \ + while (STAILQ_NEXT(cur, field) != (elem)) \ + cur = STAILQ_NEXT(cur, field); \ + STAILQ_REMOVE_AFTER(head, cur, field); \ + } \ + } while (0) + +#define STAILQ_REMOVE_AFTER(head, elem, field) \ + do { \ + if ((STAILQ_NEXT(elem, field) = \ + STAILQ_NEXT(STAILQ_NEXT(elem, field), field)) == NULL) \ + (head)->last = &STAILQ_NEXT(elem, field); \ + } while (0) + +#define STAILQ_REMOVE_HEAD(head, field) \ + do { \ + if ((STAILQ_FIRST(head) = STAILQ_NEXT(STAILQ_FIRST(head), field)) == NULL) \ + (head)->last = &STAILQ_FIRST(head); \ + } while (0) + +#define STAILQ_SWAP(head1, head2, type) \ + do { \ + QUEUE_TYPEOF(type) *first = STAILQ_FIRST(head1); \ + QUEUE_TYPEOF(type) **last = (head1)->last; \ + STAILQ_FIRST(head1) = STAILQ_FIRST(head2); \ + (head1)->last = (head2)->last; \ + STAILQ_FIRST(head2) = first; \ + (head2)->last = last; \ + if (STAILQ_EMPTY(head1)) \ + (head1)->last = &STAILQ_FIRST(head1); \ + if (STAILQ_EMPTY(head2)) \ + (head2)->last = &STAILQ_FIRST(head2); \ + } while (0) + +#endif // __LLVM_LIBC_MACROS_SYS_QUEUE_MACROS_H diff --git a/libc/include/sys/queue.h b/libc/include/sys/queue.h new file mode 100644 index 0000000000000..2a4dc37712d6d --- /dev/null +++ b/libc/include/sys/queue.h @@ -0,0 +1,14 @@ +//===-- BSD sys/queue.h ---------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SYS_QUEUE_H
+#define LLVM_LIBC_SYS_QUEUE_H
+
+#include <llvm-libc-macros/sys-queue-macros.h>
+
+#endif // LLVM_LIBC_SYS_QUEUE_H
diff --git a/libc/test/CMakeLists.txt b/libc/test/CMakeLists.txt
index 5e8231ef27f46..f22f2b183aca9 100644
--- a/libc/test/CMakeLists.txt
+++ b/libc/test/CMakeLists.txt
@@ -14,6 +14,7 @@ if(LIBC_TARGET_ARCHITECTURE_IS_GPU AND
   return()
 endif()
 
+add_subdirectory(include)
 add_subdirectory(src)
 add_subdirectory(utils)
diff --git a/libc/test/include/CMakeLists.txt b/libc/test/include/CMakeLists.txt
new file mode 100644
index 0000000000000..95a3aba9d95d9
--- /dev/null
+++ b/libc/test/include/CMakeLists.txt
@@ -0,0 +1,16 @@
+add_custom_target(libc_include_tests)
+
+add_libc_unittest(
+  sys_queue_test
+  SUITE
+    libc_include_tests
+  SRCS
+    sys/queue_test.cpp
+  DEPENDS
+    libc.include.llvm-libc-macros.sys_queue_macros
+    libc.src.__support.char_vector
+    libc.src.__support.CPP.string
+  COMPILE_OPTIONS
+    # This is needed because the __containerof macro uses a statement expression.
+    -Wno-gnu-statement-expression-from-macro-expansion
+)
diff --git a/libc/test/include/sys/queue_test.cpp b/libc/test/include/sys/queue_test.cpp
new file mode 100644
index 0000000000000..48c0e811c6154
--- /dev/null
+++ b/libc/test/include/sys/queue_test.cpp
@@ -0,0 +1,168 @@
+//===-- Unittests for queue -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/CPP/string.h"
+#include "src/__support/char_vector.h"
+#include "test/UnitTest/Test.h"
+
+#include "llvm-libc-macros/sys-queue-macros.h"
+
+using LIBC_NAMESPACE::CharVector;
+using LIBC_NAMESPACE::cpp::string;
+
+namespace LIBC_NAMESPACE {
+
+TEST(LlvmLibcQueueTest, SList) {
+  struct Entry {
+    char c;
+    SLIST_ENTRY(Entry) entries;
+  };
+
+  SLIST_HEAD(Head, Entry);
+
+  Head head = SLIST_HEAD_INITIALIZER(head);
+
+  struct Contains : public testing::Matcher<Head> {
+    string s;
+    Contains(string s) : s(s) {}
+    bool match(Head head) {
+      Entry *e;
+      CharVector v;
+      SLIST_FOREACH(e, &head, entries) { v.append(e->c); }
+      return s == v.c_str();
+    }
+  };
+
+  Entry e1 = {'a', {NULL}};
+  SLIST_INSERT_HEAD(&head, &e1, entries);
+
+  ASSERT_THAT(head, Contains("a"));
+
+  Entry e2 = {'b', {NULL}};
+  SLIST_INSERT_AFTER(&e1, &e2, entries);
+
+  ASSERT_THAT(head, Contains("ab"));
+
+  Head head2 = SLIST_HEAD_INITIALIZER(head);
+
+  Entry e3 = {'c', {NULL}};
+  SLIST_INSERT_HEAD(&head2, &e3, entries);
+
+  ASSERT_THAT(head2, Contains("c"));
+
+  SLIST_SWAP(&head, &head2, Entry);
+
+  ASSERT_THAT(head2, Contains("ab"));
+
+  SLIST_CONCAT(&head2, &head, Entry, entries);
+
+  ASSERT_THAT(head2, Contains("abc"));
+
+  SLIST_CONCAT(&head, &head2, Entry, entries);
+
+  ASSERT_THAT(head, Contains("abc"));
+
+  Entry *e = NULL, *tmp = NULL;
+  SLIST_FOREACH_SAFE(e, &head, entries, tmp) {
+    if (e == &e2) {
+      SLIST_REMOVE(&head, e, Entry, entries);
+    }
+  }
+
+  ASSERT_THAT(head, Contains("ac"));
+
+  while (!SLIST_EMPTY(&head)) {
+    e = SLIST_FIRST(&head);
+    SLIST_REMOVE_HEAD(&head, entries);
+  }
+
+  ASSERT_TRUE(SLIST_EMPTY(&head));
+}
+
+TEST(LlvmLibcQueueTest, STailQ) {
+  struct Entry {
+    char c;
+    STAILQ_ENTRY(Entry) entries;
+  };
+
+  STAILQ_HEAD(Head, Entry);
+
+  Head head = STAILQ_HEAD_INITIALIZER(head);
+
+  struct Contains : public testing::Matcher<Head> {
+    string s;
+    Contains(string s) : s(s) {}
+    bool match(Head head) {
+      Entry *e;
+      CharVector v;
+      STAILQ_FOREACH(e, &head, entries) { v.append(e->c); }
+      return s == v.c_str();
+    }
+  };
+
+  STAILQ_INIT(&head);
+  ASSERT_TRUE(STAILQ_EMPTY(&head));
+
+  Entry e1 = {'a', {NULL}};
+  STAILQ_INSERT_HEAD(&head, &e1, entries);
+
+  ASSERT_THAT(head, Contains("a"));
+
+  Entry e2 = {'b', {NULL}};
+  STAILQ_INSERT_TAIL(&head, &e2, entries);
+
+  ASSERT_THAT(head, Contains("ab"));
+
+  Entry e3 = {'c', {NULL}};
+  STAILQ_INSERT_AFTER(&head, &e2, &e3, entries);
+
+  ASSERT_THAT(head, Contains("abc"));
+
+  Head head2 = STAILQ_HEAD_INITIALIZER(head);
+
+  Entry e4 = {'d', {NULL}};
+  STAILQ_INSERT_HEAD(&head2, &e4, entries);
+
+  ASSERT_THAT(head2, Contains("d"));
+
+  STAILQ_SWAP(&head, &head2, Entry);
+
+  ASSERT_THAT(head2, Contains("abc"));
+
+  STAILQ_CONCAT(&head2, &head, Entry, entries);
+
+  ASSERT_EQ(STAILQ_FIRST(&head2), &e1);
+  ASSERT_EQ(STAILQ_LAST(&head2, Entry, entries), &e4);
+
+  ASSERT_THAT(head2, Contains("abcd"));
+
+  STAILQ_CONCAT(&head, &head2, Entry, entries);
+
+  ASSERT_EQ(STAILQ_FIRST(&head), &e1);
+  ASSERT_EQ(STAILQ_LAST(&head, Entry, entries), &e4);
+
+  ASSERT_THAT(head, Contains("abcd"));
+
+  Entry *e = NULL, *tmp = NULL;
+  STAILQ_FOREACH_SAFE(e, &head, entries, tmp) {
+    if (e == &e2) {
+      STAILQ_REMOVE(&head, e, Entry, entries);
+    }
+  }
+
+  ASSERT_THAT(head, Contains("acd"));
+
+  while (!STAILQ_EMPTY(&head)) {
+    e = STAILQ_FIRST(&head);
+    STAILQ_REMOVE_HEAD(&head, entries);
+  }
+
+  ASSERT_TRUE(STAILQ_EMPTY(&head));
+}
+
+} // namespace LIBC_NAMESPACE

From 595d780b492d3de3f5653851e7d64c122472d71a Mon Sep 17 00:00:00 2001
From: Billy Zhu
Date: Thu, 18 Jan 2024 21:57:20 -0800
Subject: [PATCH 075/843] [MLIR][ODS] Check hasProperties when generating
 populateDefaultAttrs (#78525)

Currently ODS generates `populateDefaultAttrs` or
`populateDefaultProperties` based on whether the dialect opted into
usePropertiesForAttributes. But since individual ops might be opted into
using properties (as long as they have at least one property), it should
actually just check whether the op itself uses properties. Otherwise
`populateDefaultAttrs` will overwrite existing attrs inside properties
when creating an op. Understandably this becomes moot once everything
switches over to using properties, but this fixes it for now.

This PR makes ODS generate `populateDefaultProperties` as long as the op
itself uses properties.

---
 mlir/test/mlir-tblgen/op-attribute.td       | 13 +++++++++++++
 mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/mlir/test/mlir-tblgen/op-attribute.td b/mlir/test/mlir-tblgen/op-attribute.td
index 07636f53c1e15..b5b8619e7c9be 100644
--- a/mlir/test/mlir-tblgen/op-attribute.td
+++ b/mlir/test/mlir-tblgen/op-attribute.td
@@ -173,6 +173,8 @@ def AOp : NS_Op<"a_op", []> {
 // DEF: ::llvm::ArrayRef<::mlir::NamedAttribute> attributes
 // DEF: odsState.addAttributes(attributes);
 
+// DEF: void AOp::populateDefaultAttrs
+
 // Test the above but with prefix.
 
 def Test2_Dialect : Dialect {
@@ -287,6 +289,17 @@ def AgetOp : Op {
 // DEF: ::llvm::ArrayRef<::mlir::NamedAttribute> attributes
 // DEF: odsState.addAttributes(attributes);
 
+// Test the above but using properties.
+def ApropOp : NS_Op<"a_prop_op", []> {
+  let arguments = (ins
+    Property<"unsigned">:$aAttr,
+    DefaultValuedAttr:$bAttr
+  );
+}
+
+// DEF-LABEL: ApropOp definitions
+// DEF: void ApropOp::populateDefaultProperties
+
 def SomeTypeAttr : TypeAttrBase<"SomeType", "some type attribute">;
 
 def BOp : NS_Op<"b_op", []> {
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index cd37c8dcd3d5e..71326049af057 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -2506,7 +2506,7 @@ void OpEmitter::genPopulateDefaultAttributes() {
       }))
     return;
 
-  if (op.getDialect().usePropertiesForAttributes()) {
+  if (emitHelper.hasProperties()) {
     SmallVector<MethodParameter> paramList;
     paramList.emplace_back("::mlir::OperationName", "opName");
     paramList.emplace_back("Properties &", "properties");

From 0ad83bc26c37048aa88ad036a4d853d847d37ec7 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 18 Jan 2024 22:35:24 -0800
Subject: [PATCH 076/843] [RISCV] Don't look through EXTRACT_ELEMENT in
 lowerScalarInsert if the element types are different. (#78668)

If the element type of the vector we're extracting from doesn't match
the type we're inserting into, we can't directly insert or extract the
subvector.

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   | 28 +++++++++++--------
 .../CodeGen/RISCV/rvv/fold-binary-reduce.ll   | 23 +++++++++++++++
 2 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 019ea93aac9dc..b41e2f40dc72f 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4040,19 +4040,23 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL, MVT VT,
   if (Scalar.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
       isNullConstant(Scalar.getOperand(1))) {
     SDValue ExtractedVal = Scalar.getOperand(0);
-    MVT ExtractedVT = ExtractedVal.getSimpleValueType();
-    MVT ExtractedContainerVT = ExtractedVT;
-    if (ExtractedContainerVT.isFixedLengthVector()) {
-      ExtractedContainerVT = getContainerForFixedLengthVector(
-          DAG, ExtractedContainerVT, Subtarget);
-      ExtractedVal = convertToScalableVector(ExtractedContainerVT, ExtractedVal,
-                                             DAG, Subtarget);
-    }
-    if (ExtractedContainerVT.bitsLE(VT))
-      return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Passthru, ExtractedVal,
+    // The element types must be the same.
+    if (ExtractedVal.getValueType().getVectorElementType() ==
+        VT.getVectorElementType()) {
+      MVT ExtractedVT = ExtractedVal.getSimpleValueType();
+      MVT ExtractedContainerVT = ExtractedVT;
+      if (ExtractedContainerVT.isFixedLengthVector()) {
+        ExtractedContainerVT = getContainerForFixedLengthVector(
+            DAG, ExtractedContainerVT, Subtarget);
+        ExtractedVal = convertToScalableVector(ExtractedContainerVT,
+                                               ExtractedVal, DAG, Subtarget);
+      }
+      if (ExtractedContainerVT.bitsLE(VT))
+        return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, Passthru,
+                           ExtractedVal, DAG.getConstant(0, DL, XLenVT));
+      return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtractedVal,
                          DAG.getConstant(0, DL, XLenVT));
-    return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtractedVal,
-                       DAG.getConstant(0, DL, XLenVT));
+    }
   }
 
diff --git a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
index 8fd8c2548ff34..351c0bab9dca8 100644
--- a/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/fold-binary-reduce.ll
@@ -343,3 +343,26 @@ declare i64 @llvm.smax.i64(i64, i64)
 declare i64 @llvm.smin.i64(i64, i64)
 declare float @llvm.maxnum.f32(float ,float)
 declare float @llvm.minnum.f32(float ,float)
+
+define void @crash(<2 x i32> %0) {
+; CHECK-LABEL: crash:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    vsetivli zero, 1, e32, m1, ta, ma
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    vsetivli zero, 4, e16, mf2, ta, ma
+; CHECK-NEXT:    vmv.v.i v8, 0
+; CHECK-NEXT:    vmv.s.x v9, a0
+; CHECK-NEXT:    vredsum.vs v8, v8, v9
+; CHECK-NEXT:    vmv.x.s a0, v8
+; CHECK-NEXT:    sb a0, 0(zero)
+; CHECK-NEXT:    ret
+entry:
+  %1 = extractelement <2 x i32> %0, i64 0
+  %2 = tail call i16 @llvm.vector.reduce.add.v4i16(<4 x i16> zeroinitializer)
+  %3 = zext i16 %2 to i32
+  %op.rdx = add i32 %1, %3
+  %conv18.us = trunc i32 %op.rdx to i8
+  store i8 %conv18.us, ptr null, align 1
+  ret void
+}
+declare i16 @llvm.vector.reduce.add.v4i16(<4 x i16>)

From 461679f2925220f350c6bafb2d661c309c5e3ae8 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Thu, 18 Jan 2024 23:15:24 -0800
Subject: [PATCH 077/843] [RISCV] Re-order riscv-target-features.c to put
 non-experimental extensions together. (#78675)

Drop -menable-experimental-extensions where it isn't needed.

This file has sections for non-experimental and experimental extensions,
but we keep forgetting to move things when we change the extension
status.
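For example, after the reordering a ratified extension is checked without
the flag, while a still-experimental one keeps it (a sketch mirroring the
RUN lines in the diff below):
```c
// Ratified (non-experimental) section: no extra flag needed.
// RUN: %clang --target=riscv32 \
// RUN:   -march=rv32ismaia1p0 -x c -E -dM %s \
// RUN:   -o - | FileCheck --check-prefix=CHECK-SMAIA-EXT %s

// Experimental section: the flag is still required.
// RUN: %clang --target=riscv32 -menable-experimental-extensions \
// RUN:   -march=rv32i_zacas1p0 -x c -E -dM %s \
// RUN:   -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s
```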
--- .../test/Preprocessor/riscv-target-features.c | 296 +++++++++--------- 1 file changed, 149 insertions(+), 147 deletions(-) diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 8dc02f7efefbd..d5ec93e292bf2 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -19,6 +19,8 @@ // CHECK-NOT: __riscv_m {{.*$}} // CHECK-NOT: __riscv_mul {{.*$}} // CHECK-NOT: __riscv_muldiv {{.*$}} +// CHECK-NOT: __riscv_smaia {{.*$}} +// CHECK-NOT: __riscv_ssaia {{.*$}} // CHECK-NOT: __riscv_svinval {{.*$}} // CHECK-NOT: __riscv_svnapot {{.*$}} // CHECK-NOT: __riscv_svpbmt {{.*$}} @@ -70,6 +72,7 @@ // CHECK-NOT: __riscv_zcmp {{.*$}} // CHECK-NOT: __riscv_zcmt {{.*$}} // CHECK-NOT: __riscv_zdinx {{.*$}} +// CHECK-NOT: __riscv_zfa {{.*$}} // CHECK-NOT: __riscv_zfh {{.*$}} // CHECK-NOT: __riscv_zfhmin {{.*$}} // CHECK-NOT: __riscv_zfinx {{.*$}} @@ -100,12 +103,27 @@ // CHECK-NOT: __riscv_zksh {{.*$}} // CHECK-NOT: __riscv_zkt {{.*$}} // CHECK-NOT: __riscv_zmmul {{.*$}} +// CHECK-NOT: __riscv_zvbb {{.*$}} +// CHECK-NOT: __riscv_zvbc {{.*$}} // CHECK-NOT: __riscv_zve32f {{.*$}} // CHECK-NOT: __riscv_zve32x {{.*$}} // CHECK-NOT: __riscv_zve64d {{.*$}} // CHECK-NOT: __riscv_zve64f {{.*$}} // CHECK-NOT: __riscv_zve64x {{.*$}} // CHECK-NOT: __riscv_zvfh {{.*$}} +// CHECK-NOT: __riscv_zvkg {{.*$}} +// CHECK-NOT: __riscv_zvkn {{.*$}} +// CHECK-NOT: __riscv_zvknc {{.*$}} +// CHECK-NOT: __riscv_zvkned {{.*$}} +// CHECK-NOT: __riscv_zvkng {{.*$}} +// CHECK-NOT: __riscv_zvknha {{.*$}} +// CHECK-NOT: __riscv_zvknhb {{.*$}} +// CHECK-NOT: __riscv_zvks {{.*$}} +// CHECK-NOT: __riscv_zvksc {{.*$}} +// CHECK-NOT: __riscv_zvksed {{.*$}} +// CHECK-NOT: __riscv_zvksg {{.*$}} +// CHECK-NOT: __riscv_zvksh {{.*$}} +// CHECK-NOT: __riscv_zvkt {{.*$}} // CHECK-NOT: __riscv_zvl32b {{.*$}} // CHECK-NOT: __riscv_zvl64b {{.*$}} // CHECK-NOT: __riscv_zvl128b {{.*$}} @@ -121,34 +139,16 @@ // Experimental extensions -// CHECK-NOT: __riscv_smaia {{.*$}} -// CHECK-NOT: __riscv_ssaia {{.*$}} // CHECK-NOT: __riscv_zacas {{.*$}} -// CHECK-NOT: __riscv_zfa {{.*$}} +// CHECK-NOT: __riscv_zcmop {{.*$}} // CHECK-NOT: __riscv_zfbfmin {{.*$}} // CHECK-NOT: __riscv_zicfilp {{.*$}} // CHECK-NOT: __riscv_zicfiss {{.*$}} // CHECK-NOT: __riscv_zicond {{.*$}} // CHECK-NOT: __riscv_zimop {{.*$}} -// CHECK-NOT: __riscv_zcmop {{.*$}} // CHECK-NOT: __riscv_ztso {{.*$}} -// CHECK-NOT: __riscv_zvbb {{.*$}} -// CHECK-NOT: __riscv_zvbc {{.*$}} // CHECK-NOT: __riscv_zvfbfmin {{.*$}} // CHECK-NOT: __riscv_zvfbfwma {{.*$}} -// CHECK-NOT: __riscv_zvkg {{.*$}} -// CHECK-NOT: __riscv_zvkn {{.*$}} -// CHECK-NOT: __riscv_zvknc {{.*$}} -// CHECK-NOT: __riscv_zvkned {{.*$}} -// CHECK-NOT: __riscv_zvkng {{.*$}} -// CHECK-NOT: __riscv_zvknha {{.*$}} -// CHECK-NOT: __riscv_zvknhb {{.*$}} -// CHECK-NOT: __riscv_zvks {{.*$}} -// CHECK-NOT: __riscv_zvksc {{.*$}} -// CHECK-NOT: __riscv_zvksed {{.*$}} -// CHECK-NOT: __riscv_zvksg {{.*$}} -// CHECK-NOT: __riscv_zvksh {{.*$}} -// CHECK-NOT: __riscv_zvkt {{.*$}} // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32ia -x c -E -dM %s \ @@ -1092,32 +1092,22 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVL65536b %s // CHECK-ZVL65536b: __riscv_v_min_vlen 65536 -// Experimental extensions - -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32ismaia1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SMAIA-EXT %s 
-// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64ismaia1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SMAIA-EXT %s // CHECK-SMAIA-EXT: __riscv_smaia 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32issaia1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SSAIA-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64issaia1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-SSAIA-EXT %s // CHECK-SSAIA-EXT: __riscv_ssaia 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_zacas1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_zacas1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s -// CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}} - // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32izfa -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZFA-EXT %s @@ -1126,242 +1116,262 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZFA-EXT %s // CHECK-ZFA-EXT: __riscv_zfa 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32izfbfmin1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZFBFMIN-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64izfbfmin1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZFBFMIN-EXT %s -// CHECK-ZFBFMIN-EXT: __riscv_zfbfmin 1000000{{$}} - -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_zicfilp0p4 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZICFILP-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_zicfilp0p4 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZICFILP-EXT %s -// CHECK-ZICFILP-EXT: __riscv_zicfilp 4000{{$}} - -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_zicond1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_zicond1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s -// CHECK-ZICOND-EXT: __riscv_zicond 1000000{{$}} - -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_zimop0p1 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZIMOP-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_zimop0p1 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZIMOP-EXT %s -// CHECK-ZIMOP-EXT: __riscv_zimop 1000{{$}} - -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32i_zcmop0p2 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZCMOP-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64i_zcmop0p2 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZCMOP-EXT %s -// CHECK-ZCMOP-EXT: __riscv_zcmop 2000{{$}} - -// RUN: %clang --target=riscv32-unknown-linux-gnu -menable-experimental-extensions \ -// RUN: -march=rv32iztso0p1 -x c -E -dM %s 
\ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZTSO-EXT %s -// RUN: %clang --target=riscv64-unknown-linux-gnu -menable-experimental-extensions \ -// RUN: -march=rv64iztso0p1 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZTSO-EXT %s -// CHECK-ZTSO-EXT: __riscv_ztso 1000{{$}} - -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve64x_zvbb1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVBB-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve64x_zvbb1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVBB-EXT %s // CHECK-ZVBB-EXT: __riscv_zvbb 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve64x_zvbc1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVBC-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve64x_zvbc1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVBC-EXT %s // CHECK-ZVBC-EXT: __riscv_zvbc 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32ifzvfbfmin1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZVFBFMIN-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64ifzvfbfmin1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZVFBFMIN-EXT %s -// CHECK-ZVFBFMIN-EXT: __riscv_zvfbfmin 1000000{{$}} - -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ -// RUN: -march=rv32ifzvfbfwma1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZVFBFWMA-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ -// RUN: -march=rv64ifzvfbfwma1p0 -x c -E -dM %s \ -// RUN: -o - | FileCheck --check-prefix=CHECK-ZVFBFWMA-EXT %s -// CHECK-ZVFBFWMA-EXT: __riscv_zvfbfwma 1000000{{$}} - -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve32x_zvkg1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKG-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve32x_zvkg1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKG-EXT %s // CHECK-ZVKG-EXT: __riscv_zvkg 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve64x_zvkn1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKN-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve64x_zvkn1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKN-EXT %s // CHECK-ZVKN-EXT: __riscv_zvkn 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32iv_zvbb1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKN %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64iv_zvbb1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKN %s // CHECK-COMBINE-INTO-ZVKN: 
__riscv_zvkn 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve64x_zvknc1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKNC-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve64x_zvknc1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKNC-EXT %s // CHECK-ZVKNC-EXT: __riscv_zvknc 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32iv_zvbb1p0_zvbc1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKNC %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64iv_zvbb1p0_zvbc1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKNC %s // CHECK-COMBINE-INTO-ZVKNC: __riscv_zvkn 1000000{{$}} // CHECK-COMBINE-INTO-ZVKNC: __riscv_zvknc 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve32x_zvkned1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKNED-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve32x_zvkned1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKNED-EXT %s // CHECK-ZVKNED-EXT: __riscv_zvkned 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve64x_zvkng1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKNG-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve64x_zvkng1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKNG-EXT %s // CHECK-ZVKNG-EXT: __riscv_zvkng 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32iv_zvbb1p0_zvkg1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKNG %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64iv_zvbb1p0_zvkg1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKNG %s // CHECK-COMBINE-INTO-ZVKNG: __riscv_zvkn 1000000{{$}} // CHECK-COMBINE-INTO-ZVKNG: __riscv_zvkng 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve32x_zvknha1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKNHA-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve32x_zvknha1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKNHA-EXT %s // CHECK-ZVKNHA-EXT: __riscv_zvknha 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve64x_zvknhb1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKNHB-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve64x_zvknhb1p0 -x 
c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKNHB-EXT %s // CHECK-ZVKNHB-EXT: __riscv_zvknhb 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve64x_zvks1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKS-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve64x_zvks1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKS-EXT %s // CHECK-ZVKS-EXT: __riscv_zvks 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32iv_zvbb1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKS %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64iv_zvbb1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKS %s // CHECK-COMBINE-INTO-ZVKS: __riscv_zvks 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve64x_zvksc1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKSC-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve64x_zvksc1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKSC-EXT %s // CHECK-ZVKSC-EXT: __riscv_zvksc 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32iv_zvbb1p0_zvbc1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKSC %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64iv_zvbb1p0_zvbc1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKSC %s // CHECK-COMBINE-INTO-ZVKSC: __riscv_zvks 1000000{{$}} // CHECK-COMBINE-INTO-ZVKSC: __riscv_zvksc 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve32x_zvksed1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKSED-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve32x_zvksed1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKSED-EXT %s // CHECK-ZVKSED-EXT: __riscv_zvksed 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve64x_zvksg1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKSG-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve64x_zvksg1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKSG-EXT %s // CHECK-ZVKSG-EXT: __riscv_zvksg 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32iv_zvbb1p0_zvkg1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKSG %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ 
// RUN: -march=rv64iv_zvbb1p0_zvkg1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKSG %s // CHECK-COMBINE-INTO-ZVKSG: __riscv_zvks 1000000{{$}} // CHECK-COMBINE-INTO-ZVKSG: __riscv_zvksg 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve32x_zvksh1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKSH-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve32x_zvksh1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKSH-EXT %s // CHECK-ZVKSH-EXT: __riscv_zvksh 1000000{{$}} -// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve32x_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKT-EXT %s -// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: %clang --target=riscv64 \ // RUN: -march=rv64i_zve32x_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKT-EXT %s // CHECK-ZVKT-EXT: __riscv_zvkt 1000000{{$}} +// Experimental extensions + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_zacas1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_zacas1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZACAS-EXT %s +// CHECK-ZACAS-EXT: __riscv_zacas 1000000{{$}} + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32izfbfmin1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZFBFMIN-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64izfbfmin1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZFBFMIN-EXT %s +// CHECK-ZFBFMIN-EXT: __riscv_zfbfmin 1000000{{$}} + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_zicfilp0p4 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICFILP-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_zicfilp0p4 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICFILP-EXT %s +// CHECK-ZICFILP-EXT: __riscv_zicfilp 4000{{$}} + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_zicond1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_zicond1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZICOND-EXT %s +// CHECK-ZICOND-EXT: __riscv_zicond 1000000{{$}} + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_zimop0p1 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZIMOP-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_zimop0p1 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZIMOP-EXT %s +// CHECK-ZIMOP-EXT: __riscv_zimop 1000{{$}} + +// RUN: %clang --target=riscv32 -menable-experimental-extensions \ +// RUN: -march=rv32i_zcmop0p2 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZCMOP-EXT %s +// RUN: %clang --target=riscv64 -menable-experimental-extensions \ +// RUN: -march=rv64i_zcmop0p2 -x c 
-E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZCMOP-EXT %s
+// CHECK-ZCMOP-EXT: __riscv_zcmop 2000{{$}}
+
+// RUN: %clang --target=riscv32-unknown-linux-gnu -menable-experimental-extensions \
+// RUN:   -march=rv32iztso0p1 -x c -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZTSO-EXT %s
+// RUN: %clang --target=riscv64-unknown-linux-gnu -menable-experimental-extensions \
+// RUN:   -march=rv64iztso0p1 -x c -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZTSO-EXT %s
+// CHECK-ZTSO-EXT: __riscv_ztso 1000{{$}}
+
+// RUN: %clang --target=riscv32 -menable-experimental-extensions \
+// RUN:   -march=rv32ifzvfbfmin1p0 -x c -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZVFBFMIN-EXT %s
+// RUN: %clang --target=riscv64 -menable-experimental-extensions \
+// RUN:   -march=rv64ifzvfbfmin1p0 -x c -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZVFBFMIN-EXT %s
+// CHECK-ZVFBFMIN-EXT: __riscv_zvfbfmin 1000000{{$}}
+
+// RUN: %clang --target=riscv32 -menable-experimental-extensions \
+// RUN:   -march=rv32ifzvfbfwma1p0 -x c -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZVFBFWMA-EXT %s
+// RUN: %clang --target=riscv64 -menable-experimental-extensions \
+// RUN:   -march=rv64ifzvfbfwma1p0 -x c -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZVFBFWMA-EXT %s
+// CHECK-ZVFBFWMA-EXT: __riscv_zvfbfwma 1000000{{$}}
+
+// RUN: %clang -target riscv32 -menable-experimental-extensions \
+// RUN:   -march=rv32izicfiss0p4 -x c -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZICFISS-EXT %s
+// RUN: %clang -target riscv64 -menable-experimental-extensions \
+// RUN:   -march=rv64izicfiss0p4 -x c -E -dM %s \
+// RUN:   -o - | FileCheck --check-prefix=CHECK-ZICFISS-EXT %s
+// CHECK-ZICFISS-EXT: __riscv_zicfiss 4000{{$}}
+
+// Misaligned
+
 // RUN: %clang --target=riscv32-unknown-linux-gnu -march=rv32i -x c -E -dM %s \
 // RUN:   -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-AVOID
 // RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64i -x c -E -dM %s \
@@ -1373,11 +1383,3 @@
 // RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64i -E -dM %s \
 // RUN:   -munaligned-access -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST
 // CHECK-MISALIGNED-FAST: __riscv_misaligned_fast 1
-
-// RUN: %clang -target riscv32 -menable-experimental-extensions \
-// RUN:   -march=rv32izicfiss0p4 -x c -E -dM %s \
-// RUN:   -o - | FileCheck --check-prefix=CHECK-ZICFISS-EXT %s
-// RUN: %clang -target riscv64 -menable-experimental-extensions \
-// RUN:   -march=rv64izicfiss0p4 -x c -E -dM %s \
-// RUN:   -o - | FileCheck --check-prefix=CHECK-ZICFISS-EXT %s
-// CHECK-ZICFISS-EXT: __riscv_zicfiss 4000{{$}}

From 498e1c2257da552abc58aa75f6b9f776bf826f86 Mon Sep 17 00:00:00 2001
From: Utkarsh Saxena
Date: Fri, 19 Jan 2024 08:23:25 +0100
Subject: [PATCH 078/843] [coroutine] Create coroutine body in the correct
 eval context (#78589)

Fixes: https://github.com/llvm/llvm-project/issues/78290
See the bug for more context.
```cpp
Gen ACoroutine() {
  if constexpr (0) // removing this makes clang compile.
    co_return;
  co_await Gen{};
}
```
We fail to emit the symbol for the constructor of `promise_type` if the
first coroutine statement happens to be inside the disabled branch of
`if constexpr`.

This happens because the promise object is built when we see the first
coroutine statement, which here is in the
`ExpressionEvaluationContext::DiscardedStatement` context due to
`if constexpr (0)`. This makes clang believe that the promise
constructor is only odr-used and not really "used".
The expression evaluation context for the coroutine body should not
depend on the context in which the first coroutine statement appears. We
override the context to `PotentiallyEvaluated`.

---------

Co-authored-by: cor3ntin
---
 clang/docs/ReleaseNotes.rst                   |  3 ++
 clang/lib/Sema/SemaCoroutine.cpp              |  4 +++
 clang/test/SemaCXX/coroutine-promise-ctor.cpp | 34 +++++++++++++++++++
 3 files changed, 41 insertions(+)
 create mode 100644 clang/test/SemaCXX/coroutine-promise-ctor.cpp

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index b400d75095421..63f756f30873c 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -804,6 +804,9 @@ Bug Fixes in This Version
   Objective-C++ property accesses to not be converted to a function call
   to the getter in the placement-args of new-expressions.
   Fixes (`#65053 <https://github.com/llvm/llvm-project/issues/65053>`_)
+- Fix an issue with missing symbol definitions when the first coroutine
+  statement appears in a discarded ``if constexpr`` branch.
+  Fixes (`#78290 <https://github.com/llvm/llvm-project/issues/78290>`_)
 
 Bug Fixes to Compiler Builtins
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp
index 0e0f8f67dcd73..4e600fd29ee73 100644
--- a/clang/lib/Sema/SemaCoroutine.cpp
+++ b/clang/lib/Sema/SemaCoroutine.cpp
@@ -21,6 +21,7 @@
 #include "clang/AST/StmtCXX.h"
 #include "clang/Basic/Builtins.h"
 #include "clang/Lex/Preprocessor.h"
+#include "clang/Sema/EnterExpressionEvaluationContext.h"
 #include "clang/Sema/Initialization.h"
 #include "clang/Sema/Overload.h"
 #include "clang/Sema/ScopeInfo.h"
@@ -773,6 +774,9 @@ bool Sema::checkFinalSuspendNoThrow(const Stmt *FinalSuspend) {
 
 bool Sema::ActOnCoroutineBodyStart(Scope *SC, SourceLocation KWLoc,
                                    StringRef Keyword) {
+  // Ignore previous expr evaluation contexts.
+  EnterExpressionEvaluationContext PotentiallyEvaluated(
+      *this, Sema::ExpressionEvaluationContext::PotentiallyEvaluated);
   if (!checkCoroutineContext(*this, KWLoc, Keyword))
     return false;
   auto *ScopeInfo = getCurFunction();
diff --git a/clang/test/SemaCXX/coroutine-promise-ctor.cpp b/clang/test/SemaCXX/coroutine-promise-ctor.cpp
new file mode 100644
index 0000000000000..7a06299712cb7
--- /dev/null
+++ b/clang/test/SemaCXX/coroutine-promise-ctor.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -std=c++20 -ast-dump %s | FileCheck %s
+#include "Inputs/std-coroutine.h"
+
+// Github issue: https://github.com/llvm/llvm-project/issues/78290
+namespace GH78290 {
+class Gen {
+ public:
+  class promise_type {
+   public:
+    template <typename... Args>
+    explicit promise_type(Args...) {}
+    // CHECK:      CXXConstructorDecl {{.*}} used promise_type 'void ()' {{.*}}
+    // CHECK-NEXT: TemplateArgument pack
+    // CHECK-NEXT: CompoundStmt {{.*}}
+    Gen get_return_object() { return {}; }
+
+    void unhandled_exception() {}
+    void return_void() {}
+    std::suspend_always await_transform(Gen gen) { return {}; }
+
+    std::suspend_always initial_suspend() { return {}; }
+    // CHECK: CXXMethodDecl {{.*}} used initial_suspend {{.*}}
+    std::suspend_always final_suspend() noexcept { return {}; }
+    // CHECK: CXXMethodDecl {{.*}} used final_suspend {{.*}}
+  };
+};
+
+Gen CoroutineBody() {
+  if constexpr (0) {
+    co_await Gen{};
+  }
+  co_await Gen{};
+}
+} // namespace GH78290

From 430e145fc3964dac0bbf355f27616190d403dd83 Mon Sep 17 00:00:00 2001
From: Jason Molenda
Date: Thu, 18 Jan 2024 23:26:15 -0800
Subject: [PATCH 079/843] Clean up PlatformDarwinKernel::GetSharedModule,
 document (#78652)

PlatformDarwinKernel::GetSharedModule, which can find a kernel or kext
from a local filesystem scan, needed a little cleanup.
The method which finds kernels was (1) not looking for the SymbolFileSpec when creating a Module, and (2) adding that newly created Module to a Target, which GetSharedModule should not be doing - after auditing many other subclass implementations of this method, I haven't found any others doing it. Platform::GetSharedModule didn't have a headerdoc so it took a little work to piece together the intended behaviors. This is addressing a bug where PlatformDarwinKernel::GetSharedModuleKernel would find the ObjectFile for a kernel, create a Module, and add it to the Target. Then up in DynamicLoaderDarwinKernel, it would check if the Module had a SymbolFile FileSpec, and because it did not, it would do its own search for a binary & dSYM, find them, and then add that to the Target. Now we have two copies of the Module in the Target, one with a dSYM and the other without, and only one of them has its load addresses set. GetSharedModule should not be adding binaries to the Target, and it should set the SymbolFile FileSpec when it is creating the Module. rdar://120895951 --- lldb/include/lldb/Target/Platform.h | 26 ++++++++ .../Platform/MacOSX/PlatformDarwinKernel.cpp | 66 +++++++------------ 2 files changed, 49 insertions(+), 43 deletions(-) diff --git a/lldb/include/lldb/Target/Platform.h b/lldb/include/lldb/Target/Platform.h index 129e4565d9ff9..13196ff74d663 100644 --- a/lldb/include/lldb/Target/Platform.h +++ b/lldb/include/lldb/Target/Platform.h @@ -288,6 +288,32 @@ class Platform : public PluginInterface { LocateExecutableScriptingResources(Target *target, Module &module, Stream &feedback_stream); + /// \param[in] module_spec + /// The ModuleSpec of a binary to find. + /// + /// \param[in] process + /// A Process. + /// + /// \param[out] module_sp + /// A Module that matches the ModuleSpec, if one is found. + /// + /// \param[in] module_search_paths_ptr + /// Locations to possibly look for a binary that matches the ModuleSpec. + /// + /// \param[out] old_modules + /// Existing Modules in the Process' Target image list which match + /// the FileSpec. + /// + /// \param[out] did_create_ptr + /// Optional boolean, nullptr may be passed for this argument. + /// If this method is returning a *new* ModuleSP, this + /// will be set to true. + /// If this method is returning a ModuleSP that is already in the + /// Target's image list, it will be false. + /// + /// \return + /// The Status object for any errors found while searching for + /// the binary. 
   virtual Status GetSharedModule(
       const ModuleSpec &module_spec, Process *process,
       lldb::ModuleSP &module_sp, const FileSpecList *module_search_paths_ptr,
diff --git a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp
index e2839f3285cce..1f121f5c4e5d0 100644
--- a/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp
+++ b/lldb/source/Plugins/Platform/MacOSX/PlatformDarwinKernel.cpp
@@ -791,9 +791,10 @@ Status PlatformDarwinKernel::GetSharedModuleKernel(
     const ModuleSpec &module_spec, Process *process, ModuleSP &module_sp,
     const FileSpecList *module_search_paths_ptr,
     llvm::SmallVectorImpl<lldb::ModuleSP> *old_modules, bool *did_create_ptr) {
-  Status error;
-  module_sp.reset();
+  assert(module_sp.get() == nullptr);
   UpdateKextandKernelsLocalScan();
+  if (did_create_ptr)
+    *did_create_ptr = false;
 
   // First try all kernel binaries that have a dSYM next to them
   for (auto possible_kernel : m_kernel_binaries_with_dsyms) {
@@ -803,22 +804,19 @@ Status PlatformDarwinKernel::GetSharedModuleKernel(
       module_sp.reset(new Module(kern_spec));
       if (module_sp && module_sp->GetObjectFile() &&
           module_sp->MatchesModuleSpec(kern_spec)) {
-        // module_sp is an actual kernel binary we want to add.
-        if (process) {
-          const bool notify = false;
-          process->GetTarget().GetImages().AppendIfNeeded(module_sp, notify);
-          error.Clear();
-          return error;
-        } else {
-          error = ModuleList::GetSharedModule(kern_spec, module_sp, nullptr,
-                                              nullptr, nullptr);
-          if (module_sp && module_sp->GetObjectFile() &&
-              module_sp->GetObjectFile()->GetType() !=
-                  ObjectFile::Type::eTypeCoreFile) {
-            return error;
-          }
-          module_sp.reset();
-        }
+        // The dSYM is next to the binary (that's the only
+        // way it ends up in the index), but it might be a
+        // .dSYM.yaa that needs to be expanded, don't just
+        // append ".dSYM" to the filename for the SymbolFile.
+        FileSpecList search_paths =
+            process->GetTarget().GetDebugFileSearchPaths();
+        FileSpec dsym_fspec =
+            PluginManager::LocateExecutableSymbolFile(kern_spec, search_paths);
+        if (FileSystem::Instance().Exists(dsym_fspec))
+          module_sp->SetSymbolFileFileSpec(dsym_fspec);
+        if (did_create_ptr)
+          *did_create_ptr = true;
+        return {};
       }
    }
  }
@@ -836,36 +834,18 @@ Status PlatformDarwinKernel::GetSharedModuleKernel(
       module_sp.reset(new Module(kern_spec));
       if (module_sp && module_sp->GetObjectFile() &&
           module_sp->MatchesModuleSpec(kern_spec)) {
-        // module_sp is an actual kernel binary we want to add.
-        if (process) {
-          const bool notify = false;
-          process->GetTarget().GetImages().AppendIfNeeded(module_sp, notify);
-          error.Clear();
-          return error;
-        } else {
-          error = ModuleList::GetSharedModule(kern_spec, module_sp, nullptr,
-                                              nullptr, nullptr);
-          if (module_sp && module_sp->GetObjectFile() &&
-              module_sp->GetObjectFile()->GetType() !=
-                  ObjectFile::Type::eTypeCoreFile) {
-            return error;
-          }
-          module_sp.reset();
-        }
+        if (did_create_ptr)
+          *did_create_ptr = true;
+        return {};
       }
     }
   }
 
-  // Give the generic methods, including possibly calling into  DebugSymbols
+  // Give the generic methods, including possibly calling into DebugSymbols
   // framework on macOS systems, a chance.
-  error = PlatformDarwin::GetSharedModule(module_spec, process, module_sp,
-                                          module_search_paths_ptr, old_modules,
-                                          did_create_ptr);
-  if (error.Success() && module_sp.get()) {
-    return error;
-  }
-
-  return error;
+  return PlatformDarwin::GetSharedModule(module_spec, process, module_sp,
+                                         module_search_paths_ptr, old_modules,
+                                         did_create_ptr);
 }
 
 std::vector

From 407db48eb4d387ae272bfbef9b12868806f1e830 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Fri, 19 Jan 2024 15:08:39 +0800
Subject: [PATCH 080/843] [Release Notes] [C++20] [Modules] Summarize modules
 related change to the release note for clang 18

Given that clang 18 is going to be released, this patch collects the
modules-related changes in the release notes so that users have a clear
view of the state of modules.

---
 clang/docs/ReleaseNotes.rst | 40 +++++++++++++++++++++++++++++++++----
 1 file changed, 36 insertions(+), 4 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 63f756f30873c..392f694065a24 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -97,10 +97,6 @@ C/C++ Language Potentially Breaking Changes
   outlined in "The Equality Operator You Are Looking For" (`P2468 <http://wg21.link/p2468r2>`_).
   Fixes (`#68901: <https://github.com/llvm/llvm-project/issues/68901>`_).
 
-- Remove the hardcoded path to the imported modules for C++20 named modules. Now we
-  require all the dependent modules to be specified from the command line.
-  See (`#62707: <https://github.com/llvm/llvm-project/issues/62707>`_).
-
 C++ Specific Potentially Breaking Changes
 -----------------------------------------
 - The name mangling rules for function templates has been changed to take into
@@ -141,6 +137,13 @@ C++ Specific Potentially Breaking Changes
   when targetting MSVC to match the behavior of MSVC.
   (`MSVC Docs `_)
 
+- Remove the hardcoded path to the imported modules for C++20 named modules. Now we
+  require all the dependent modules to be specified from the command line.
+  See (`#62707: <https://github.com/llvm/llvm-project/issues/62707>`_).
+
+- Forbid `import XXX;` in C++ from finding a module `XXX` that comes from
+  explicit clang modules.
+  See (`#64755: <https://github.com/llvm/llvm-project/issues/64755>`_).
+
 ABI Changes in This Version
 ---------------------------
 - Following the SystemV ABI for x86-64, ``__int128`` arguments will no longer
@@ -596,6 +599,17 @@ Improvements to Clang's diagnostics
      13 | new (__builtin_memset) S {};
        |      ^
 
+- Clang now diagnoses `import` before module declarations, but not in the
+  global module fragment.
+  (`#67627: <https://github.com/llvm/llvm-project/issues/67627>`_).
+
+- Clang now diagnoses `#include` of headers with angle brackets in module
+  purviews, which is usually not intended.
+  (`#68615: <https://github.com/llvm/llvm-project/issues/68615>`_)
+
+- Clang no longer mentions the invisible namespace when diagnosing invisible
+  declarations inside a namespace; the original diagnostic message was
+  confusing.
+  (`#73893: <https://github.com/llvm/llvm-project/issues/73893>`_)
 
 Improvements to Clang's time-trace
 ----------------------------------
@@ -982,6 +996,16 @@ Bug Fixes to C++ Support
 - Clang now allows parenthesized initialization of arrays in `operator new[]`.
   Fixes: (`#68198 <https://github.com/llvm/llvm-project/issues/68198>`_)
 
+- Fix crash when importing the same module with a dynamic initializer twice
+  under different visibility.
+  Fixes (`#67893 <https://github.com/llvm/llvm-project/issues/67893>`_)
+
+- Fix a false-positive ODR violation for different definitions for `std::align_val_t`.
+  Fixes (`#76638 <https://github.com/llvm/llvm-project/issues/76638>`_)
+
+- Remove recorded `#pragma once` state for headers included in named modules.
+  Fixes (`#77995 <https://github.com/llvm/llvm-project/issues/77995>`_)
+
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 - Fixed an import failure of recursive friend class template.
@@ -1350,6 +1374,14 @@ Improvements
 - Checkers can query constraint bounds to improve diagnostic messages.
   (`#74141 <https://github.com/llvm/llvm-project/pull/74141>`_)
 
+- Improved the generated initializers for modules. Now the calls to initialize
+  functions from imported module units can be omitted if the initialize
+  function is known to be empty.
+  (`#56794 <https://github.com/llvm/llvm-project/issues/56794>`_)
+
+- Clang now allows exporting declarations within a linkage-specification.
+  (`#71347 <https://github.com/llvm/llvm-project/issues/71347>`_)
+
 Moved checkers
 ^^^^^^^^^^^^^^
 

From c6a6547798ca641b985456997cdf986bb99b0707 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?=
Date: Fri, 19 Jan 2024 10:01:29 +0200
Subject: [PATCH 081/843] [compiler-rt] Add a prefix on the windows mmap
 symbols (#78037)

For Windows, the compiler-rt profile library contains a polyfill
reimplementation of the mmap family of functions.

Previously, the runtime library exposed those symbols, like "mmap", in
the user symbol namespace. This could cause misdetections by configure
scripts that check for the "mmap" function just by linking, without
including headers.

Add a prefix on the symbols, and make an undeclared function static.

This fixes such an issue reported at
https://github.com/mstorsjo/llvm-mingw/issues/390.

---
 compiler-rt/lib/profile/WindowsMMap.c | 3 +--
 compiler-rt/lib/profile/WindowsMMap.h | 6 ++++++
 2 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/compiler-rt/lib/profile/WindowsMMap.c b/compiler-rt/lib/profile/WindowsMMap.c
index 07c0a689feaea..9d7da835b1ed3 100644
--- a/compiler-rt/lib/profile/WindowsMMap.c
+++ b/compiler-rt/lib/profile/WindowsMMap.c
@@ -124,8 +124,7 @@ int madvise(void *addr, size_t length, int advice)
   return 0;
 }
 
-COMPILER_RT_VISIBILITY
-int lock(HANDLE handle, DWORD lockType, BOOL blocking) {
+static int lock(HANDLE handle, DWORD lockType, BOOL blocking) {
   DWORD flags = lockType;
   if (!blocking)
     flags |= LOCKFILE_FAIL_IMMEDIATELY;
diff --git a/compiler-rt/lib/profile/WindowsMMap.h b/compiler-rt/lib/profile/WindowsMMap.h
index 68b8de2398d60..1df1a0be0b02b 100644
--- a/compiler-rt/lib/profile/WindowsMMap.h
+++ b/compiler-rt/lib/profile/WindowsMMap.h
@@ -60,6 +60,12 @@
 #  define DWORD_LO(x) (x)
 #endif
 
+#define mmap __llvm_profile_mmap
+#define munmap __llvm_profile_munmap
+#define msync __llvm_profile_msync
+#define madvise __llvm_profile_madvise
+#define flock __llvm_profile_flock
+
 void *mmap(void *start, size_t length, int prot, int flags, int fd,
            off_t offset);
 

From 3b54337be54107a5f196bb98dfc858b10ec35b8a Mon Sep 17 00:00:00 2001
From: denglesberg-splunk <125308860+denglesberg-splunk@users.noreply.github.com>
Date: Fri, 19 Jan 2024 03:12:54 -0500
Subject: [PATCH 082/843] Replace `exec_tools` with `tools` in bazel genrule.
 (#77510)

As of the Bazel 6.x series, there is no difference between `exec_tools`
and `tools`. Bazel 7 removes the `exec_tools` attribute entirely. This
commit updates the rule to use the canonical attribute name so that
`clang-tidy` builds with bazel 7.0.0, though it does not change the
default bazel version, which remains at 6.1.2.

See also https://github.com/bazelbuild/bazel/issues/19132 for more
information.
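The rename itself is mechanical; a minimal sketch of the pattern (the
`name` shown here is hypothetical, the real target is in the hunk below):
```python
genrule(
    name = "confusables_inc_gen",  # hypothetical name, for illustration only
    srcs = ["misc/ConfusableTable/confusables.txt"],
    outs = ["Confusables.inc"],
    cmd = "$(location :confusable_table_builder) $(SRCS) $(OUTS)",
    tools = [":confusable_table_builder"],  # previously: exec_tools
)
```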
---
 .../clang-tools-extra/clang-tidy/BUILD.bazel | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
index bbabc5397e989..317863de3b36c 100644
--- a/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang-tools-extra/clang-tidy/BUILD.bazel
@@ -57,7 +57,7 @@ genrule(
     srcs = ["misc/ConfusableTable/confusables.txt"],
     outs = ["Confusables.inc"],
     cmd = "$(location :confusable_table_builder) $(SRCS) $(OUTS)",
-    exec_tools = [":confusable_table_builder"],
+    tools = [":confusable_table_builder"],
     visibility = ["//visibility:private"],
 )
 

From c03c4e2b1459cc5a3c40534f4a7f99144126fbf4 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Fri, 19 Jan 2024 00:19:31 -0800
Subject: [PATCH 083/843] [tools] Use SmallString::operator std::string (NFC)

---
 llvm/tools/bugpoint/ExecutionDriver.cpp          |  4 ++--
 llvm/tools/bugpoint/Miscompilation.cpp           | 17 +++++++----------
 llvm/tools/bugpoint/OptimizerDriver.cpp          |  2 +-
 llvm/tools/bugpoint/ToolRunner.cpp               |  4 ++--
 llvm/tools/dsymutil/dsymutil.cpp                 |  4 ++--
 llvm/tools/llvm-config/llvm-config.cpp           |  6 +++---
 llvm/tools/llvm-cov/SourceCoverageView.cpp       |  4 ++--
 llvm/tools/llvm-cov/gcov.cpp                     | 10 ++++------
 .../tools/llvm-exegesis/lib/BenchmarkRunner.cpp  |  2 +-
 llvm/tools/llvm-lto/llvm-lto.cpp                 |  2 +-
 llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp         |  2 +-
 llvm/tools/llvm-rc/llvm-rc.cpp                   |  2 +-
 12 files changed, 27 insertions(+), 32 deletions(-)

diff --git a/llvm/tools/bugpoint/ExecutionDriver.cpp b/llvm/tools/bugpoint/ExecutionDriver.cpp
index 2b06e8f3b3651..18187ad97dddd 100644
--- a/llvm/tools/bugpoint/ExecutionDriver.cpp
+++ b/llvm/tools/bugpoint/ExecutionDriver.cpp
@@ -299,7 +299,7 @@ Expected<std::string> BugDriver::executeProgram(const Module &Program,
            << "!\n";
     exit(1);
   }
-  BitcodeFile = std::string(UniqueFilename.str());
+  BitcodeFile = std::string(UniqueFilename);
 
   if (writeProgramToFile(BitcodeFile, UniqueFD, Program)) {
     errs() << ToolName << ": Error emitting bitcode to file '" << BitcodeFile
@@ -324,7 +324,7 @@ Expected<std::string> BugDriver::executeProgram(const Module &Program,
            << "\n";
     exit(1);
   }
-  OutputFile = std::string(UniqueFile.str());
+  OutputFile = std::string(UniqueFile);
 
   // Figure out which shared objects to run, if any.
std::vector SharedObjs(AdditionalSOs); diff --git a/llvm/tools/bugpoint/Miscompilation.cpp b/llvm/tools/bugpoint/Miscompilation.cpp index 130c6fabd2d48..22806bab83eec 100644 --- a/llvm/tools/bugpoint/Miscompilation.cpp +++ b/llvm/tools/bugpoint/Miscompilation.cpp @@ -956,8 +956,7 @@ static Expected TestCodeGenerator(BugDriver &BD, << "Error making unique filename: " << EC.message() << "\n"; exit(1); } - if (BD.writeProgramToFile(std::string(TestModuleBC.str()), TestModuleFD, - *Test)) { + if (BD.writeProgramToFile(std::string(TestModuleBC), TestModuleFD, *Test)) { errs() << "Error writing bitcode to `" << TestModuleBC.str() << "'\nExiting."; exit(1); @@ -976,8 +975,7 @@ static Expected TestCodeGenerator(BugDriver &BD, exit(1); } - if (BD.writeProgramToFile(std::string(SafeModuleBC.str()), SafeModuleFD, - *Safe)) { + if (BD.writeProgramToFile(std::string(SafeModuleBC), SafeModuleFD, *Safe)) { errs() << "Error writing bitcode to `" << SafeModuleBC << "'\nExiting."; exit(1); } @@ -985,7 +983,7 @@ static Expected TestCodeGenerator(BugDriver &BD, FileRemover SafeModuleBCRemover(SafeModuleBC.str(), !SaveTemps); Expected SharedObject = - BD.compileSharedObject(std::string(SafeModuleBC.str())); + BD.compileSharedObject(std::string(SafeModuleBC)); if (Error E = SharedObject.takeError()) return std::move(E); @@ -994,7 +992,7 @@ static Expected TestCodeGenerator(BugDriver &BD, // Run the code generator on the `Test' code, loading the shared library. // The function returns whether or not the new output differs from reference. Expected Result = BD.diffProgram( - BD.getProgram(), std::string(TestModuleBC.str()), *SharedObject, false); + BD.getProgram(), std::string(TestModuleBC), *SharedObject, false); if (Error E = Result.takeError()) return std::move(E); @@ -1051,8 +1049,7 @@ Error BugDriver::debugCodeGenerator() { exit(1); } - if (writeProgramToFile(std::string(TestModuleBC.str()), TestModuleFD, - *ToCodeGen)) { + if (writeProgramToFile(std::string(TestModuleBC), TestModuleFD, *ToCodeGen)) { errs() << "Error writing bitcode to `" << TestModuleBC << "'\nExiting."; exit(1); } @@ -1068,13 +1065,13 @@ Error BugDriver::debugCodeGenerator() { exit(1); } - if (writeProgramToFile(std::string(SafeModuleBC.str()), SafeModuleFD, + if (writeProgramToFile(std::string(SafeModuleBC), SafeModuleFD, *ToNotCodeGen)) { errs() << "Error writing bitcode to `" << SafeModuleBC << "'\nExiting."; exit(1); } Expected SharedObject = - compileSharedObject(std::string(SafeModuleBC.str())); + compileSharedObject(std::string(SafeModuleBC)); if (Error E = SharedObject.takeError()) return E; diff --git a/llvm/tools/bugpoint/OptimizerDriver.cpp b/llvm/tools/bugpoint/OptimizerDriver.cpp index f7239f5dc61bd..ce324594724ca 100644 --- a/llvm/tools/bugpoint/OptimizerDriver.cpp +++ b/llvm/tools/bugpoint/OptimizerDriver.cpp @@ -141,7 +141,7 @@ bool BugDriver::runPasses(Module &Program, << ": Error making unique filename: " << EC.message() << "\n"; return true; } - OutputFilename = std::string(UniqueFilename.str()); + OutputFilename = std::string(UniqueFilename); // set up the input file name Expected Temp = diff --git a/llvm/tools/bugpoint/ToolRunner.cpp b/llvm/tools/bugpoint/ToolRunner.cpp index c6733aecd31d4..e45c89b746aeb 100644 --- a/llvm/tools/bugpoint/ToolRunner.cpp +++ b/llvm/tools/bugpoint/ToolRunner.cpp @@ -442,7 +442,7 @@ Expected LLC::OutputCode(const std::string &Bitcode, errs() << "Error making unique filename: " << EC.message() << "\n"; exit(1); } - OutputAsmFile = std::string(UniqueFile.str()); + OutputAsmFile = 
std::string(UniqueFile); std::vector LLCArgs; LLCArgs.push_back(LLCPath); @@ -772,7 +772,7 @@ Error CC::MakeSharedObject(const std::string &InputFile, FileType fileType, errs() << "Error making unique filename: " << EC.message() << "\n"; exit(1); } - OutputFile = std::string(UniqueFilename.str()); + OutputFile = std::string(UniqueFilename); std::vector CCArgs; diff --git a/llvm/tools/dsymutil/dsymutil.cpp b/llvm/tools/dsymutil/dsymutil.cpp index df0df3634882e..b0e988c6f8e4b 100644 --- a/llvm/tools/dsymutil/dsymutil.cpp +++ b/llvm/tools/dsymutil/dsymutil.cpp @@ -601,9 +601,9 @@ getOutputFileName(StringRef InputFile, const DsymutilOptions &Options) { } sys::path::append(Path, "Contents", "Resources"); - std::string ResourceDir = std::string(Path.str()); + std::string ResourceDir = std::string(Path); sys::path::append(Path, "DWARF", sys::path::filename(DwarfFile)); - return OutputLocation(std::string(Path.str()), ResourceDir); + return OutputLocation(std::string(Path), ResourceDir); } int dsymutil_main(int argc, char **argv, const llvm::ToolContext &) { diff --git a/llvm/tools/llvm-config/llvm-config.cpp b/llvm/tools/llvm-config/llvm-config.cpp index f31098eb67cca..d5b76b1bb6c16 100644 --- a/llvm/tools/llvm-config/llvm-config.cpp +++ b/llvm/tools/llvm-config/llvm-config.cpp @@ -359,18 +359,18 @@ int main(int argc, char **argv) { { SmallString<256> Path(LLVM_INSTALL_INCLUDEDIR); sys::fs::make_absolute(ActivePrefix, Path); - ActiveIncludeDir = std::string(Path.str()); + ActiveIncludeDir = std::string(Path); } { SmallString<256> Path(LLVM_TOOLS_INSTALL_DIR); sys::fs::make_absolute(ActivePrefix, Path); - ActiveBinDir = std::string(Path.str()); + ActiveBinDir = std::string(Path); } ActiveLibDir = ActivePrefix + "/lib" + LLVM_LIBDIR_SUFFIX; { SmallString<256> Path(LLVM_INSTALL_PACKAGE_DIR); sys::fs::make_absolute(ActivePrefix, Path); - ActiveCMakeDir = std::string(Path.str()); + ActiveCMakeDir = std::string(Path); } ActiveIncludeOption = "-I" + ActiveIncludeDir; } diff --git a/llvm/tools/llvm-cov/SourceCoverageView.cpp b/llvm/tools/llvm-cov/SourceCoverageView.cpp index c910edd1db782..b92c62df7950a 100644 --- a/llvm/tools/llvm-cov/SourceCoverageView.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageView.cpp @@ -48,7 +48,7 @@ std::string CoveragePrinter::getOutputPath(StringRef Path, StringRef Extension, sys::path::append(FullPath, PathFilename); sys::path::native(FullPath); - return std::string(FullPath.str()); + return std::string(FullPath); } Expected @@ -163,7 +163,7 @@ std::string SourceCoverageView::getSourceName() const { SmallString<128> SourceText(SourceName); sys::path::remove_dots(SourceText, /*remove_dot_dot=*/true); sys::path::native(SourceText); - return std::string(SourceText.str()); + return std::string(SourceText); } void SourceCoverageView::addExpansion( diff --git a/llvm/tools/llvm-cov/gcov.cpp b/llvm/tools/llvm-cov/gcov.cpp index 9a1ebebc87fc8..00ea12415b220 100644 --- a/llvm/tools/llvm-cov/gcov.cpp +++ b/llvm/tools/llvm-cov/gcov.cpp @@ -35,12 +35,10 @@ static void reportCoverage(StringRef SourceFile, StringRef ObjectDir, // A file was given. Ignore the source file and look next to this file. sys::path::replace_extension(CoverageFileStem, ""); - std::string GCNO = InputGCNO.empty() - ? std::string(CoverageFileStem.str()) + ".gcno" - : InputGCNO; - std::string GCDA = InputGCDA.empty() - ? std::string(CoverageFileStem.str()) + ".gcda" - : InputGCDA; + std::string GCNO = + InputGCNO.empty() ? std::string(CoverageFileStem) + ".gcno" : InputGCNO; + std::string GCDA = + InputGCDA.empty() ? 
std::string(CoverageFileStem) + ".gcda" : InputGCDA; GCOVFile GF; // Open .gcda and .gcda without requiring a NUL terminator. The concurrent diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index bc43c2d06324c..eb1393ad94ff9 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -639,7 +639,7 @@ BenchmarkRunner::writeObjectFile(StringRef Buffer, StringRef FileName) const { raw_fd_ostream OFS(ResultFD, true /*ShouldClose*/); OFS.write(Buffer.data(), Buffer.size()); OFS.flush(); - return std::string(ResultPath.str()); + return std::string(ResultPath); } BenchmarkRunner::FunctionExecutor::~FunctionExecutor() {} diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp index 4141aa85d0860..735b3763f5b2f 100644 --- a/llvm/tools/llvm-lto/llvm-lto.cpp +++ b/llvm/tools/llvm-lto/llvm-lto.cpp @@ -526,7 +526,7 @@ static std::string getThinLTOOutputFile(StringRef Path, StringRef OldPrefix, if (std::error_code EC = llvm::sys::fs::create_directories(ParentPath)) error(EC, "error creating the directory '" + ParentPath + "'"); } - return std::string(NewPath.str()); + return std::string(NewPath); } namespace thinlto { diff --git a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp index 5810c70a664ff..9da14cc52d6cb 100644 --- a/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp +++ b/llvm/tools/llvm-pdbutil/llvm-pdbutil.cpp @@ -1574,7 +1574,7 @@ int main(int Argc, const char **Argv) { if (opts::yaml2pdb::YamlPdbOutputFile.empty()) { SmallString<16> OutputFilename(opts::yaml2pdb::InputFilename.getValue()); sys::path::replace_extension(OutputFilename, ".pdb"); - opts::yaml2pdb::YamlPdbOutputFile = std::string(OutputFilename.str()); + opts::yaml2pdb::YamlPdbOutputFile = std::string(OutputFilename); } yamlToPdb(opts::yaml2pdb::InputFilename); } else if (opts::DiaDumpSubcommand) { diff --git a/llvm/tools/llvm-rc/llvm-rc.cpp b/llvm/tools/llvm-rc/llvm-rc.cpp index ec36676636498..1c3379a3a9678 100644 --- a/llvm/tools/llvm-rc/llvm-rc.cpp +++ b/llvm/tools/llvm-rc/llvm-rc.cpp @@ -561,7 +561,7 @@ RcOptions parseRcOptions(ArrayRef ArgsArr, SmallString<128> OutputFile(Opts.InputFile); llvm::sys::fs::make_absolute(OutputFile); llvm::sys::path::replace_extension(OutputFile, "res"); - OutArgsInfo.push_back(std::string(OutputFile.str())); + OutArgsInfo.push_back(std::string(OutputFile)); } if (!Opts.IsDryRun) { if (OutArgsInfo.size() != 1) From 4aea9f63a0a71ae20ad203e6e36c5894d95ae2a3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 19 Jan 2024 00:19:33 -0800 Subject: [PATCH 084/843] [Remarks] Use StringRef::consume_{front,back} (NFC) --- llvm/lib/Remarks/YAMLRemarkParser.cpp | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Remarks/YAMLRemarkParser.cpp b/llvm/lib/Remarks/YAMLRemarkParser.cpp index 218b6691398bb..a287ef5742556 100644 --- a/llvm/lib/Remarks/YAMLRemarkParser.cpp +++ b/llvm/lib/Remarks/YAMLRemarkParser.cpp @@ -302,11 +302,8 @@ Expected YAMLRemarkParser::parseStr(yaml::KeyValueNode &Node) { } else Result = Value->getRawValue(); - if (Result.front() == '\'') - Result = Result.drop_front(); - - if (Result.back() == '\'') - Result = Result.drop_back(); + Result.consume_front("\'"); + Result.consume_back("\'"); return Result; } @@ -456,11 +453,8 @@ Expected YAMLStrTabRemarkParser::parseStr(yaml::KeyValueNode &Node) { else return Str.takeError(); - if (Result.front() == '\'') - Result = 
Result.drop_front(); - - if (Result.back() == '\'') - Result = Result.drop_back(); + Result.consume_front("\'"); + Result.consume_back("\'"); return Result; } From 7243607867393a2b8ccd477e95e6f62d00f3206f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 19 Jan 2024 00:19:35 -0800 Subject: [PATCH 085/843] [Support] Use llvm::children and llvm::inverse_children (NFC) --- .../llvm/Support/GenericLoopInfoImpl.h | 29 +++++++------------ 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/Support/GenericLoopInfoImpl.h b/llvm/include/llvm/Support/GenericLoopInfoImpl.h index 15af9cd794e2b..1e0d0ee446fc4 100644 --- a/llvm/include/llvm/Support/GenericLoopInfoImpl.h +++ b/llvm/include/llvm/Support/GenericLoopInfoImpl.h @@ -115,7 +115,7 @@ bool LoopBase::hasDedicatedExits() const { SmallVector UniqueExitBlocks; getUniqueExitBlocks(UniqueExitBlocks); for (BlockT *EB : UniqueExitBlocks) - for (BlockT *Predecessor : children>(EB)) + for (BlockT *Predecessor : inverse_children(EB)) if (!contains(Predecessor)) return false; // All the requirements are met. @@ -208,10 +208,7 @@ BlockT *LoopBase::getLoopPreheader() const { return nullptr; // Make sure there is only one exit out of the preheader. - typedef GraphTraits BlockTraits; - typename BlockTraits::ChildIteratorType SI = BlockTraits::child_begin(Out); - ++SI; - if (SI != BlockTraits::child_end(Out)) + if (llvm::size(llvm::children(Out)) != 1) return nullptr; // Multiple exits from the block, must not be a preheader. // The predecessor has exactly one successor, so it is a preheader. @@ -231,7 +228,7 @@ BlockT *LoopBase::getLoopPredecessor() const { // Loop over the predecessors of the header node... BlockT *Header = getHeader(); - for (const auto Pred : children>(Header)) { + for (const auto Pred : inverse_children(Header)) { if (!contains(Pred)) { // If the block is not in the loop... if (Out && Out != Pred) return nullptr; // Multiple predecessors outside the loop @@ -249,7 +246,7 @@ BlockT *LoopBase::getLoopLatch() const { assert(!isInvalid() && "Loop not in a valid state!"); BlockT *Header = getHeader(); BlockT *Latch = nullptr; - for (const auto Pred : children>(Header)) { + for (const auto Pred : inverse_children(Header)) { if (contains(Pred)) { if (Latch) return nullptr; @@ -331,20 +328,16 @@ void LoopBase::verifyLoop() const { // Check the individual blocks. for (BlockT *BB : depth_first_ext(getHeader(), VisitSet)) { - assert(std::any_of(GraphTraits::child_begin(BB), - GraphTraits::child_end(BB), - [&](BlockT *B) { return contains(B); }) && + assert(llvm::any_of(children(BB), + [&](BlockT *B) { return contains(B); }) && "Loop block has no in-loop successors!"); - assert(std::any_of(GraphTraits>::child_begin(BB), - GraphTraits>::child_end(BB), - [&](BlockT *B) { return contains(B); }) && + assert(llvm::any_of(inverse_children(BB), + [&](BlockT *B) { return contains(B); }) && "Loop block has no in-loop predecessors!"); SmallVector OutsideLoopPreds; - for (BlockT *B : - llvm::make_range(GraphTraits>::child_begin(BB), - GraphTraits>::child_end(BB))) + for (BlockT *B : inverse_children(BB)) if (!contains(B)) OutsideLoopPreds.push_back(B); @@ -496,7 +489,7 @@ static void discoverAndMapSubloop(LoopT *L, ArrayRef Backedges, // within this subloop tree itself. Note that a predecessor may directly // reach another subloop that is not yet discovered to be a subloop of // this loop, which we must traverse. 
- for (const auto Pred : children>(PredBB)) { + for (const auto Pred : inverse_children(PredBB)) { if (LI->getLoopFor(Pred) != Subloop) ReverseCFGWorklist.push_back(Pred); } @@ -579,7 +572,7 @@ void LoopInfoBase::analyze(const DomTreeBase &DomTree) { SmallVector Backedges; // Check each predecessor of the potential loop header. - for (const auto Backedge : children>(Header)) { + for (const auto Backedge : inverse_children(Header)) { // If Header dominates predBB, this is a new loop. Collect the backedges. const DomTreeNodeBase *BackedgeNode = DomTree.getNode(Backedge); if (BackedgeNode && DomTree.dominates(DomNode, BackedgeNode)) From c6cfd5350ec72625283eb99d7055391ed7e9fb7e Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 19 Jan 2024 00:19:36 -0800 Subject: [PATCH 086/843] [llvm] Use StringRef::contains (NFC) --- llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp | 4 +--- llvm/lib/IR/AutoUpgrade.cpp | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp index 1f6724988ae97..fca18e1957884 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVCodeViewReader.cpp @@ -217,9 +217,7 @@ Error LVCodeViewReader::resolveSymbolName(const coff_section *CoffSection, // and they are printed only if the command line option 'internal=system'. bool LVCodeViewReader::isSystemEntry(LVElement *Element, StringRef Name) const { Name = Name.empty() ? Element->getName() : Name; - auto Find = [=](const char *String) -> bool { - return StringRef::npos != Name.find(String); - }; + auto Find = [=](const char *String) -> bool { return Name.contains(String); }; auto Starts = [=](const char *Pattern) -> bool { return Name.starts_with(Pattern); }; diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index d2338e0eaa8d4..ffb8e3a91b668 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -2781,7 +2781,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } bool IsPS2PD = SrcTy->getElementType()->isFloatTy(); - bool IsUnsigned = (StringRef::npos != Name.find("cvtu")); + bool IsUnsigned = Name.contains("cvtu"); if (IsPS2PD) Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd"); else if (CI->arg_size() == 4 && @@ -2961,7 +2961,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Value *SV = Builder.CreateShuffleVector(CI->getArgOperand(0), ShuffleMask); - bool DoSext = (StringRef::npos != Name.find("pmovsx")); + bool DoSext = Name.contains("pmovsx"); Rep = DoSext ? Builder.CreateSExt(SV, DstTy) : Builder.CreateZExt(SV, DstTy); // If there are 3 arguments, it's a masked intrinsic so we need a select. From 508c6aa8f3e4a0e6ee5845a8fbf46b8cccd96ca5 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Fri, 19 Jan 2024 09:27:03 +0100 Subject: [PATCH 087/843] [libc][NFC] Fix "type qualifiers ignored on cast result type" GCC warning (#78509) GCC complains about "type qualifiers ignored on cast result type". Upon investigation the correct fix was to remove all `volatile` and use the `-frounding-math` option. 
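To sketch why the option can replace `volatile` (a hypothetical,
standalone illustration, not code from this patch): `volatile` blocks
constant folding by forcing the value through a memory-backed object,
while `-frounding-math` blocks it by telling the compiler it may not
assume the default rounding mode, so the conversions must be kept as
run-time operations:

    // Build with and without -frounding-math to compare.
    #include <cstdio>

    int main() {
      long double x = 1.0L + 0x1.0p-60L;

      // Old pattern: launder through volatile so the cast happens at run time.
      volatile float f_old = static_cast<float>(x);

      // New pattern: a plain cast; under -frounding-math the compiler may not
      // fold it, because the rounding mode is only known at run time.
      float f_new = static_cast<float>(x);

      std::printf("%a %a\n", static_cast<double>(f_old),
                  static_cast<double>(f_new));
      return 0;
    }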
--- libc/test/src/__support/FPUtil/CMakeLists.txt | 3 ++ .../__support/FPUtil/dyadic_float_test.cpp | 34 +++++++------------ .../test/src/__support/FPUtil/BUILD.bazel | 1 + 3 files changed, 16 insertions(+), 22 deletions(-) diff --git a/libc/test/src/__support/FPUtil/CMakeLists.txt b/libc/test/src/__support/FPUtil/CMakeLists.txt index 411b8281470cf..897434ceff600 100644 --- a/libc/test/src/__support/FPUtil/CMakeLists.txt +++ b/libc/test/src/__support/FPUtil/CMakeLists.txt @@ -9,6 +9,9 @@ add_fp_unittest( dyadic_float_test.cpp DEPENDS libc.src.__support.FPUtil.dyadic_float + COMPILE_OPTIONS + # Prevent constant folding with a default rounding mode. + "-frounding-math" ) add_libc_test( diff --git a/libc/test/src/__support/FPUtil/dyadic_float_test.cpp b/libc/test/src/__support/FPUtil/dyadic_float_test.cpp index 3d70fb926b0f1..a9f9842c50305 100644 --- a/libc/test/src/__support/FPUtil/dyadic_float_test.cpp +++ b/libc/test/src/__support/FPUtil/dyadic_float_test.cpp @@ -20,49 +20,39 @@ using Sign = LIBC_NAMESPACE::fputil::Sign; TEST(LlvmLibcDyadicFloatTest, BasicConversions) { Float128 x(Sign::POS, /*exponent*/ 0, /*mantissa*/ Float128::MantissaType(1)); - volatile float xf = float(x); - volatile double xd = double(x); - ASSERT_FP_EQ(1.0f, xf); - ASSERT_FP_EQ(1.0, xd); + ASSERT_FP_EQ(1.0f, float(x)); + ASSERT_FP_EQ(1.0, double(x)); Float128 y(0x1.0p-53); - volatile float yf = float(y); - volatile double yd = double(y); - ASSERT_FP_EQ(0x1.0p-53f, yf); - ASSERT_FP_EQ(0x1.0p-53, yd); + ASSERT_FP_EQ(0x1.0p-53f, float(y)); + ASSERT_FP_EQ(0x1.0p-53, double(y)); Float128 z = quick_add(x, y); - EXPECT_FP_EQ_ALL_ROUNDING(xf + yf, float(z)); - EXPECT_FP_EQ_ALL_ROUNDING(xd + yd, double(z)); + EXPECT_FP_EQ_ALL_ROUNDING(float(x) + float(y), float(z)); + EXPECT_FP_EQ_ALL_ROUNDING(double(x) + double(y), double(z)); } TEST(LlvmLibcDyadicFloatTest, QuickAdd) { Float192 x(Sign::POS, /*exponent*/ 0, /*mantissa*/ Float192::MantissaType(0x123456)); - volatile double xd = double(x); - ASSERT_FP_EQ(0x1.23456p20, xd); + ASSERT_FP_EQ(0x1.23456p20, double(x)); Float192 y(0x1.abcdefp-20); - volatile double yd = double(y); - ASSERT_FP_EQ(0x1.abcdefp-20, yd); + ASSERT_FP_EQ(0x1.abcdefp-20, double(y)); Float192 z = quick_add(x, y); - - EXPECT_FP_EQ_ALL_ROUNDING(xd + yd, (volatile double)(z)); + EXPECT_FP_EQ_ALL_ROUNDING(double(x) + double(y), double(z)); } TEST(LlvmLibcDyadicFloatTest, QuickMul) { Float256 x(Sign::POS, /*exponent*/ 0, /*mantissa*/ Float256::MantissaType(0x123456)); - volatile double xd = double(x); - ASSERT_FP_EQ(0x1.23456p20, xd); + ASSERT_FP_EQ(0x1.23456p20, double(x)); Float256 y(0x1.abcdefp-25); - volatile double yd = double(y); - ASSERT_FP_EQ(0x1.abcdefp-25, yd); + ASSERT_FP_EQ(0x1.abcdefp-25, double(y)); Float256 z = quick_mul(x, y); - - EXPECT_FP_EQ_ALL_ROUNDING(xd * yd, double(z)); + EXPECT_FP_EQ_ALL_ROUNDING(double(x) * double(y), double(z)); } diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/__support/FPUtil/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/test/src/__support/FPUtil/BUILD.bazel index 4f206b21e478b..461d5127a42a7 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/__support/FPUtil/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/test/src/__support/FPUtil/BUILD.bazel @@ -22,6 +22,7 @@ libc_test( libc_test( name = "dyadic_float_test", srcs = ["dyadic_float_test.cpp"], + copts = ["-frounding-math"], deps = [ "//libc:__support_fputil_dyadic_float", "//libc:__support_uint", From 8b4bb15f6d879fd8655f9e41fee224a8a59f238c Mon Sep 17 00:00:00 2001 From: Timm Baeder 
Date: Fri, 19 Jan 2024 09:27:30 +0100 Subject: [PATCH 088/843] [clang][Interp] Implement integral->complex casts (#75590) Allocate storage for them, initialize the first member with the given value and the second member to 0. --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 27 ++++++++++++++++++++++++ clang/test/AST/Interp/complex.cpp | 9 ++++---- 2 files changed, 31 insertions(+), 5 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 138ffed392fca..82ef743e6a781 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -294,6 +294,29 @@ bool ByteCodeExprGen::VisitCastExpr(const CastExpr *CE) { case CK_ToVoid: return discard(SubExpr); + case CK_IntegralRealToComplex: + case CK_FloatingRealToComplex: { + // We're creating a complex value here, so we need to + // allocate storage for it. + if (!Initializing) { + std::optional LocalIndex = + allocateLocal(CE, /*IsExtended=*/true); + if (!LocalIndex) + return false; + if (!this->emitGetPtrLocal(*LocalIndex, CE)) + return false; + } + + // Init the complex value to {SubExpr, 0}. + if (!this->visitArrayElemInit(0, SubExpr)) + return false; + // Zero-init the second element. + PrimType T = classifyPrim(SubExpr->getType()); + if (!this->visitZeroInitializer(T, SubExpr->getType(), SubExpr)) + return false; + return this->emitInitElem(T, 1, SubExpr); + } + default: assert(false && "Cast not implemented"); } @@ -846,6 +869,10 @@ bool ByteCodeExprGen::VisitInitListExpr(const InitListExpr *E) { if (T->isAnyComplexType()) { unsigned NumInits = E->getNumInits(); + + if (NumInits == 1) + return this->delegate(E->inits()[0]); + QualType ElemQT = E->getType()->getAs()->getElementType(); PrimType ElemT = classifyPrim(ElemQT); if (NumInits == 0) { diff --git a/clang/test/AST/Interp/complex.cpp b/clang/test/AST/Interp/complex.cpp index e63693a0cfaa1..99c0dd141d0b7 100644 --- a/clang/test/AST/Interp/complex.cpp +++ b/clang/test/AST/Interp/complex.cpp @@ -55,21 +55,20 @@ static_assert(ignoredCast() == 0, ""); static_assert((int)I1 == 1, ""); static_assert((float)D == 1.0f, ""); +static_assert(__real((_Complex unsigned)5) == 5); +static_assert(__imag((_Complex unsigned)5) == 0); /// Standalone complex expressions. static_assert(__real((_Complex float){1.0, 3.0}) == 1.0, ""); -#if 0 -/// FIXME: This should work in the new interpreter. constexpr _Complex double D2 = {12}; static_assert(__real(D2) == 12, ""); -static_assert(__imag(D2) == 12, ""); +static_assert(__imag(D2) == 0, ""); constexpr _Complex int I3 = {15}; static_assert(__real(I3) == 15, ""); -static_assert(__imag(I3) == 15, ""); -#endif +static_assert(__imag(I3) == 0, ""); /// FIXME: This should work in the new interpreter as well. 
 // constexpr _Complex _BitInt(8) A = 0;// = {4};

From 28d64c1237c28c0341700441eb2f275457f29e6a Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet
Date: Fri, 19 Jan 2024 09:34:28 +0100
Subject: [PATCH 089/843] [libc][NFC] Simplify FPBits expressions (#78590)

---
 libc/src/__support/FPUtil/aarch64/FEnvImpl.h         | 4 ++--
 libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h | 4 ++--
 libc/src/__support/FPUtil/arm/FEnvImpl.h             | 4 ++--
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
index 47c65af452809..23cde88c9c7c5 100644
--- a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h
@@ -155,8 +155,8 @@ LIBC_INLINE int set_except(int excepts) {
 LIBC_INLINE int raise_except(int excepts) {
   float zero = 0.0f;
   float one = 1.0f;
-  float largeValue = float(FPBits(FPBits::MAX_NORMAL));
-  float smallValue = float(FPBits(FPBits::MIN_NORMAL));
+  float largeValue = FPBits::max_normal();
+  float smallValue = FPBits::min_normal();
   auto divfunc = [](float a, float b) {
     __asm__ __volatile__("ldr s0, %0\n\t"
                          "ldr s1, %1\n\t"
diff --git a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
index 6a0ac5fef0cac..ea1fd68a5fcdf 100644
--- a/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
+++ b/libc/src/__support/FPUtil/aarch64/fenv_darwin_impl.h
@@ -161,8 +161,8 @@ LIBC_INLINE int set_except(int excepts) {
 LIBC_INLINE int raise_except(int excepts) {
   float zero = 0.0f;
   float one = 1.0f;
-  float large_value = float(FPBits(FPBits::MAX_NORMAL));
-  float small_value = float(FPBits(FPBits::MIN_NORMAL));
+  float large_value = FPBits::max_normal();
+  float small_value = FPBits::min_normal();
   auto divfunc = [](float a, float b) {
     __asm__ __volatile__("ldr s0, %0\n\t"
                          "ldr s1, %1\n\t"
diff --git a/libc/src/__support/FPUtil/arm/FEnvImpl.h b/libc/src/__support/FPUtil/arm/FEnvImpl.h
index dcc5602864722..1a89de50b6b60 100644
--- a/libc/src/__support/FPUtil/arm/FEnvImpl.h
+++ b/libc/src/__support/FPUtil/arm/FEnvImpl.h
@@ -135,8 +135,8 @@ LIBC_INLINE int set_except(int excepts) {
 LIBC_INLINE int raise_except(int excepts) {
   float zero = 0.0f;
   float one = 1.0f;
-  float large_value = float(FPBits(FPBits::MAX_NORMAL));
-  float small_value = float(FPBits(FPBits::MIN_NORMAL));
+  float large_value = FPBits::max_normal();
+  float small_value = FPBits::min_normal();
   auto divfunc = [](float a, float b) {
     __asm__ __volatile__("flds s0, %0\n\t"
                          "flds s1, %1\n\t"

From 14f0c06f48aca3e6f015944c88097b404005645e Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet
Date: Fri, 19 Jan 2024 09:36:03 +0100
Subject: [PATCH 090/843] [libc] Fix is_subnormal for Intel Extended Precision
 (#78592)

Also turn a set of `get_biased_exponent() == 0` checks into
`is_subnormal()`, which is clearer.
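The rule the patch converges on can be sketched as follows (an
illustration with a made-up helper, not libc code): a value is
subnormal exactly when its biased exponent field is all zeroes,
independent of the significand. For the 80-bit x86 extended format,
whose integer bit is explicit, this also counts pseudo-denormals
(exponent zero, integer bit set) as subnormal:

    #include <cstdint>

    // `sign_exp` stands for the top 16-bit word of an x86 80-bit value:
    // 1 sign bit followed by 15 biased-exponent bits. The 64-bit
    // significand (including its explicit integer bit) is not consulted.
    bool is_subnormal_x87(uint16_t sign_exp) {
      return (sign_exp & 0x7fff) == 0; // biased exponent == 0
    }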
--- libc/src/__support/FPUtil/FPBits.h | 28 +++++-------------- libc/src/__support/FPUtil/NormalFloat.h | 4 +-- libc/src/__support/FPUtil/generic/FMA.h | 6 ++-- libc/src/__support/FPUtil/generic/sqrt.h | 2 +- .../FPUtil/generic/sqrt_80_bit_long_double.h | 2 +- .../FPUtil/x86_64/NextAfterLongDouble.h | 3 +- libc/utils/MPFRWrapper/MPFRUtils.cpp | 8 +++--- 7 files changed, 19 insertions(+), 34 deletions(-) diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index 3ee6289b74964..be700285de828 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -386,8 +386,11 @@ struct FPRepBase : public internal::FPLayout { bits = (value & FP_MASK); } - LIBC_INLINE constexpr bool is_zero() const { - return (bits & EXP_SIG_MASK) == 0; + LIBC_INLINE constexpr bool is_zero() const { return exp_sig_bits() == 0; } + + LIBC_INLINE + constexpr bool is_subnormal() const { + return exp_bits() == encode(BiasedExponent::BITS_ALL_ZEROES()); } LIBC_INLINE constexpr bool is_neg() const { return sign().is_neg(); } @@ -435,19 +438,11 @@ template struct FPRep : public FPRepBase { return exp_sig_bits() == encode(BiasedExponent::BITS_ALL_ONES(), Significand::ZERO()); } - LIBC_INLINE constexpr bool is_zero() const { - return exp_sig_bits() == - encode(BiasedExponent::BITS_ALL_ZEROES(), Significand::ZERO()); - } LIBC_INLINE constexpr bool is_finite() const { return exp_bits() != encode(BiasedExponent::BITS_ALL_ONES()); } - LIBC_INLINE - constexpr bool is_subnormal() const { - return exp_bits() == encode(BiasedExponent::BITS_ALL_ZEROES()); - } LIBC_INLINE constexpr bool is_normal() const { - return is_finite() && !is_subnormal(); + return is_finite() && !UP::is_subnormal(); } LIBC_INLINE static constexpr StorageType zero(Sign sign = Sign::POS) { @@ -488,7 +483,7 @@ template struct FPRep : public FPRepBase { // The function return mantissa with the implicit bit set iff the current // value is a valid normal number. LIBC_INLINE constexpr StorageType get_explicit_mantissa() { - if (is_subnormal()) + if (UP::is_subnormal()) return sig_bits(); return (StorageType(1) << UP::SIG_LEN) | sig_bits(); } @@ -550,18 +545,9 @@ struct FPRep : public FPRepBase { return exp_sig_bits() == encode(BiasedExponent::BITS_ALL_ONES(), Significand::MSB()); } - LIBC_INLINE constexpr bool is_zero() const { - return exp_sig_bits() == - encode(BiasedExponent::BITS_ALL_ZEROES(), Significand::ZERO()); - } LIBC_INLINE constexpr bool is_finite() const { return !is_inf() && !is_nan(); } - LIBC_INLINE - constexpr bool is_subnormal() const { - return exp_sig_bits() > - encode(BiasedExponent::BITS_ALL_ZEROES(), Significand::ZERO()); - } LIBC_INLINE constexpr bool is_normal() const { const auto exp = exp_bits(); if (exp == encode(BiasedExponent::BITS_ALL_ZEROES()) || diff --git a/libc/src/__support/FPUtil/NormalFloat.h b/libc/src/__support/FPUtil/NormalFloat.h index 8b1612e1b47c6..e7abc53ce6129 100644 --- a/libc/src/__support/FPUtil/NormalFloat.h +++ b/libc/src/__support/FPUtil/NormalFloat.h @@ -153,7 +153,7 @@ template struct NormalFloat { } // Normalize subnormal numbers. 
- if (bits.get_biased_exponent() == 0) { + if (bits.is_subnormal()) { unsigned shift = evaluate_normalization_shift(bits.get_mantissa()); mantissa = StorageType(bits.get_mantissa()) << shift; exponent = 1 - FPBits::EXP_BIAS - shift; @@ -186,7 +186,7 @@ NormalFloat::init_from_bits(FPBits bits) { return; } - if (bits.get_biased_exponent() == 0) { + if (bits.is_subnormal()) { if (bits.get_implicit_bit() == 0) { // Since we ignore zero value, the mantissa in this case is non-zero. int normalization_shift = diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h index 9c67c645d5243..6285cac1983d1 100644 --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -104,15 +104,15 @@ template <> LIBC_INLINE double fma(double x, double y, double z) { int z_exp = 0; // Normalize denormal inputs. - if (LIBC_UNLIKELY(FPBits(x).get_biased_exponent() == 0)) { + if (LIBC_UNLIKELY(FPBits(x).is_subnormal())) { x_exp -= 52; x *= 0x1.0p+52; } - if (LIBC_UNLIKELY(FPBits(y).get_biased_exponent() == 0)) { + if (LIBC_UNLIKELY(FPBits(y).is_subnormal())) { y_exp -= 52; y *= 0x1.0p+52; } - if (LIBC_UNLIKELY(FPBits(z).get_biased_exponent() == 0)) { + if (LIBC_UNLIKELY(FPBits(z).is_subnormal())) { z_exp -= 52; z *= 0x1.0p+52; } diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h index f273b678edf52..21ae9d081d3f1 100644 --- a/libc/src/__support/FPUtil/generic/sqrt.h +++ b/libc/src/__support/FPUtil/generic/sqrt.h @@ -97,7 +97,7 @@ LIBC_INLINE cpp::enable_if_t, T> sqrt(T x) { StorageType x_mant = bits.get_mantissa(); // Step 1a: Normalize denormal input and append hidden bit to the mantissa - if (bits.get_biased_exponent() == 0) { + if (bits.is_subnormal()) { ++x_exp; // let x_exp be the correct exponent of ONE bit. internal::normalize(x_exp, x_mant); } else { diff --git a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h index 4fe9b49ff41cf..4f8d136938f56 100644 --- a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h +++ b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h @@ -65,7 +65,7 @@ LIBC_INLINE long double sqrt(long double x) { // Step 1a: Normalize denormal input if (bits.get_implicit_bit()) { x_mant |= ONE; - } else if (bits.get_biased_exponent() == 0) { + } else if (bits.is_subnormal()) { normalize(x_exp, x_mant); } diff --git a/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h b/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h index 5f15bac5df77f..512f5de4e7931 100644 --- a/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h +++ b/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h @@ -38,8 +38,7 @@ LIBC_INLINE long double nextafter(long double from, long double to) { return to; // Convert pseudo subnormal number to normal number. - if (from_bits.get_implicit_bit() == 1 && - from_bits.get_biased_exponent() == 0) { + if (from_bits.get_implicit_bit() == 1 && from_bits.is_subnormal()) { from_bits.set_biased_exponent(1); } diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index fca83c4cdc52f..b6ca525db6cf7 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -457,9 +457,9 @@ class MPFRNumber { int thisExponent = FPBits(thisAsT).get_exponent(); int inputExponent = FPBits(input).get_exponent(); // Adjust the exponents for denormal numbers. 
- if (FPBits(thisAsT).get_biased_exponent() == 0) + if (FPBits(thisAsT).is_subnormal()) ++thisExponent; - if (FPBits(input).get_biased_exponent() == 0) + if (FPBits(input).is_subnormal()) ++inputExponent; if (thisAsT * input < 0 || thisExponent == inputExponent) { @@ -481,9 +481,9 @@ class MPFRNumber { int minExponent = FPBits(min).get_exponent(); int maxExponent = FPBits(max).get_exponent(); // Adjust the exponents for denormal numbers. - if (FPBits(min).get_biased_exponent() == 0) + if (FPBits(min).is_subnormal()) ++minExponent; - if (FPBits(max).get_biased_exponent() == 0) + if (FPBits(max).is_subnormal()) ++maxExponent; MPFRNumber minMPFR(min); From c875567af352716e703a49c9445f5dc1d3cf0063 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 19 Jan 2024 00:39:22 -0800 Subject: [PATCH 091/843] [asan,test] Disable alloca_loop_unpoisoning.cpp on s390{{.*}} The test (from https://reviews.llvm.org/D7098) is about the interaction of VLA and alloca where the VLA causes alloca to have the same address. This is mostly about behavior checking and less about instrumentation correctness, so I think it is fair to disable it for a platform that does not work after StackSafetyAnalysis is enabled by default (#77210). --- compiler-rt/test/asan/TestCases/alloca_loop_unpoisoning.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler-rt/test/asan/TestCases/alloca_loop_unpoisoning.cpp b/compiler-rt/test/asan/TestCases/alloca_loop_unpoisoning.cpp index af6ca2163195a..ac25a4faa2dc1 100644 --- a/compiler-rt/test/asan/TestCases/alloca_loop_unpoisoning.cpp +++ b/compiler-rt/test/asan/TestCases/alloca_loop_unpoisoning.cpp @@ -2,6 +2,7 @@ // RUN: %env_asan_opts=detect_stack_use_after_return=0 %run %t 2>&1 // // REQUIRES: stable-runtime +// UNSUPPORTED: target=s390{{.*}} // This testcase checks that allocas and VLAs inside loop are correctly unpoisoned. 
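For context, the pattern the test exercises has roughly this shape (a
sketch, not the actual test source): the VLA's lifetime ends with each
iteration, so the next iteration's alloca may land at the same stack
address, and ASan must have unpoisoned the previous redzones for the
accesses to stay clean:

    #include <alloca.h>
    #include <string.h>

    void loop_body(int n) {
      for (int i = 1; i <= n; ++i) {
        char vla[i];                   // VLA scoped to this iteration
        char *buf = (char *)alloca(i); // may reuse the VLA's old address
        memset(vla, 0, i);
        memset(buf, 0, i);             // must not trip stale poisoning
      }
    }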
From ee9c9f3b9622594c4ed6f5ebdf2985d7a9a3876a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Fri, 19 Jan 2024 08:54:12 +0000 Subject: [PATCH 092/843] [mlir][vector] Add 2 invalid tests for vector.xfer Ops (#78608) --- mlir/test/Dialect/Vector/invalid.mlir | 20 ++++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir index 3bee9e0081c3b..9c73f5a8f74a8 100644 --- a/mlir/test/Dialect/Vector/invalid.mlir +++ b/mlir/test/Dialect/Vector/invalid.mlir @@ -1,7 +1,5 @@ // RUN: mlir-opt %s -split-input-file -verify-diagnostics -// ----- - func.func @broadcast_to_scalar(%arg0: f32) -> f32 { // expected-error@+1 {{custom op 'vector.broadcast' invalid kind of type specified}} %0 = vector.broadcast %arg0 : f32 to f32 @@ -383,6 +381,15 @@ func.func @test_vector.transfer_read(%arg0: memref) { // ----- +func.func @test_vector.transfer_read(%arg0: memref) { + %c3 = arith.constant 3 : index + %cst = arith.constant 3.0 : f32 + // expected-error@+1 {{requires 2 indices}} + %0 = vector.transfer_read %arg0[%c3], %cst { permutation_map = affine_map<()->(0)> } : memref, vector<128xf32> +} + +// ----- + func.func @test_vector.transfer_read(%arg0: memref) { %c3 = arith.constant 3 : index %cst = arith.constant 3.0 : f32 @@ -538,6 +545,15 @@ func.func @test_vector.transfer_write(%arg0: memref) { // ----- +func.func @test_vector.transfer_write(%arg0: memref) { + %c3 = arith.constant 3 : index + %cst = arith.constant dense<3.0> : vector<128 x f32> + // expected-error@+1 {{requires 2 indices}} + vector.transfer_write %cst, %arg0[%c3] {permutation_map = affine_map<()->(0)>} : vector<128xf32>, memref +} + +// ----- + func.func @test_vector.transfer_write(%arg0: memref) { %c3 = arith.constant 3 : index %cst = arith.constant dense<3.0> : vector<128 x f32> From 508c4efe1e9d95661b322818ae4d6a05b1913504 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Fri, 19 Jan 2024 09:56:01 +0100 Subject: [PATCH 093/843] [libc][NFC] Use Sign in NormalFloat (#78579) --- .../FPUtil/DivisionAndRemainderOperations.h | 2 +- libc/src/__support/FPUtil/NormalFloat.h | 4 +-- libc/test/src/math/LdExpTest.h | 31 ++++++++++--------- libc/test/src/math/smoke/LdExpTest.h | 31 ++++++++++--------- 4 files changed, 35 insertions(+), 33 deletions(-) diff --git a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h index 1c09e7906165f..1798310c3e31e 100644 --- a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h +++ b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h @@ -78,7 +78,7 @@ LIBC_INLINE T remquo(T x, T y, int &q) { } } - NormalFloat remainder(exp + normaly.exponent, mx, 0); + NormalFloat remainder(Sign::POS, exp + normaly.exponent, mx); // Since NormalFloat to native type conversion is a truncation operation // currently, the remainder value in the native type is correct as is. diff --git a/libc/src/__support/FPUtil/NormalFloat.h b/libc/src/__support/FPUtil/NormalFloat.h index e7abc53ce6129..cfa9e14175105 100644 --- a/libc/src/__support/FPUtil/NormalFloat.h +++ b/libc/src/__support/FPUtil/NormalFloat.h @@ -46,8 +46,8 @@ template struct NormalFloat { Sign sign = Sign::POS; - LIBC_INLINE NormalFloat(int32_t e, StorageType m, bool s) - : exponent(e), mantissa(m), sign(s ? 
Sign::NEG : Sign::POS) { + LIBC_INLINE NormalFloat(Sign s, int32_t e, StorageType m) + : exponent(e), mantissa(m), sign(s) { if (mantissa >= ONE) return; diff --git a/libc/test/src/math/LdExpTest.h b/libc/test/src/math/LdExpTest.h index b507bb4ec56e5..25120ba3646fd 100644 --- a/libc/test/src/math/LdExpTest.h +++ b/libc/test/src/math/LdExpTest.h @@ -60,8 +60,8 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { } void testOverflow(LdExpFunc func) { - NormalFloat x(FPBits::MAX_BIASED_EXPONENT - 10, NormalFloat::ONE + 0xF00BA, - 0); + NormalFloat x(Sign::POS, FPBits::MAX_BIASED_EXPONENT - 10, + NormalFloat::ONE + 0xF00BA); for (int32_t exp = 10; exp < 100; ++exp) { ASSERT_FP_EQ(inf, func(T(x), exp)); ASSERT_FP_EQ(neg_inf, func(-T(x), exp)); @@ -75,7 +75,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { int32_t exp_array[] = {base_exponent + 5, base_exponent + 4, base_exponent + 3, base_exponent + 2, base_exponent + 1}; - T x = NormalFloat(0, MANTISSA, 0); + T x = NormalFloat(Sign::POS, 0, MANTISSA); for (int32_t exp : exp_array) { ASSERT_FP_EQ(func(x, -exp), x > 0 ? zero : neg_zero); } @@ -88,20 +88,21 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { int32_t exp_array[] = {base_exponent + 5, base_exponent + 4, base_exponent + 3, base_exponent + 2, base_exponent + 1}; - T x = NormalFloat(-FPBits::EXP_BIAS, MANTISSA, 0); + T x = NormalFloat(Sign::POS, -FPBits::EXP_BIAS, MANTISSA); for (int32_t exp : exp_array) { ASSERT_FP_EQ(func(x, -exp), x > 0 ? zero : neg_zero); } } void testNormalOperation(LdExpFunc func) { - T val_array[] = { - // Normal numbers - NormalFloat(100, MANTISSA, 0), NormalFloat(-100, MANTISSA, 0), - NormalFloat(100, MANTISSA, 1), NormalFloat(-100, MANTISSA, 1), - // Subnormal numbers - NormalFloat(-FPBits::EXP_BIAS, MANTISSA, 0), - NormalFloat(-FPBits::EXP_BIAS, MANTISSA, 1)}; + T val_array[] = {// Normal numbers + NormalFloat(Sign::POS, 100, MANTISSA), + NormalFloat(Sign::POS, -100, MANTISSA), + NormalFloat(Sign::NEG, 100, MANTISSA), + NormalFloat(Sign::NEG, -100, MANTISSA), + // Subnormal numbers + NormalFloat(Sign::POS, -FPBits::EXP_BIAS, MANTISSA), + NormalFloat(Sign::NEG, -FPBits::EXP_BIAS, MANTISSA)}; for (int32_t exp = 0; exp <= FPBits::FRACTION_LEN; ++exp) { for (T x : val_array) { // We compare the result of ldexp with the result @@ -120,14 +121,14 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { } // Normal which trigger mantissa overflow. - T x = NormalFloat(-FPBits::EXP_BIAS + 1, - StorageType(2) * NormalFloat::ONE - StorageType(1), 0); + T x = NormalFloat(Sign::POS, -FPBits::EXP_BIAS + 1, + StorageType(2) * NormalFloat::ONE - StorageType(1)); ASSERT_FP_EQ(func(x, -1), x / 2); ASSERT_FP_EQ(func(-x, -1), -x / 2); // Start with a normal number high exponent but pass a very low number for // exp. The result should be a subnormal number. - x = NormalFloat(FPBits::EXP_BIAS, NormalFloat::ONE, 0); + x = NormalFloat(Sign::POS, FPBits::EXP_BIAS, NormalFloat::ONE); int exp = -FPBits::MAX_BIASED_EXPONENT - 5; T result = func(x, exp); FPBits result_bits(result); @@ -141,7 +142,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { // Start with a subnormal number but pass a very high number for exponent. // The result should not be infinity. 
- x = NormalFloat(-FPBits::EXP_BIAS + 1, NormalFloat::ONE >> 10, 0); + x = NormalFloat(Sign::POS, -FPBits::EXP_BIAS + 1, NormalFloat::ONE >> 10); exp = FPBits::MAX_BIASED_EXPONENT + 5; ASSERT_FALSE(FPBits(func(x, exp)).is_inf()); // But if the exp is large enough to oversome than the normalization shift, diff --git a/libc/test/src/math/smoke/LdExpTest.h b/libc/test/src/math/smoke/LdExpTest.h index b507bb4ec56e5..25120ba3646fd 100644 --- a/libc/test/src/math/smoke/LdExpTest.h +++ b/libc/test/src/math/smoke/LdExpTest.h @@ -60,8 +60,8 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { } void testOverflow(LdExpFunc func) { - NormalFloat x(FPBits::MAX_BIASED_EXPONENT - 10, NormalFloat::ONE + 0xF00BA, - 0); + NormalFloat x(Sign::POS, FPBits::MAX_BIASED_EXPONENT - 10, + NormalFloat::ONE + 0xF00BA); for (int32_t exp = 10; exp < 100; ++exp) { ASSERT_FP_EQ(inf, func(T(x), exp)); ASSERT_FP_EQ(neg_inf, func(-T(x), exp)); @@ -75,7 +75,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { int32_t exp_array[] = {base_exponent + 5, base_exponent + 4, base_exponent + 3, base_exponent + 2, base_exponent + 1}; - T x = NormalFloat(0, MANTISSA, 0); + T x = NormalFloat(Sign::POS, 0, MANTISSA); for (int32_t exp : exp_array) { ASSERT_FP_EQ(func(x, -exp), x > 0 ? zero : neg_zero); } @@ -88,20 +88,21 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { int32_t exp_array[] = {base_exponent + 5, base_exponent + 4, base_exponent + 3, base_exponent + 2, base_exponent + 1}; - T x = NormalFloat(-FPBits::EXP_BIAS, MANTISSA, 0); + T x = NormalFloat(Sign::POS, -FPBits::EXP_BIAS, MANTISSA); for (int32_t exp : exp_array) { ASSERT_FP_EQ(func(x, -exp), x > 0 ? zero : neg_zero); } } void testNormalOperation(LdExpFunc func) { - T val_array[] = { - // Normal numbers - NormalFloat(100, MANTISSA, 0), NormalFloat(-100, MANTISSA, 0), - NormalFloat(100, MANTISSA, 1), NormalFloat(-100, MANTISSA, 1), - // Subnormal numbers - NormalFloat(-FPBits::EXP_BIAS, MANTISSA, 0), - NormalFloat(-FPBits::EXP_BIAS, MANTISSA, 1)}; + T val_array[] = {// Normal numbers + NormalFloat(Sign::POS, 100, MANTISSA), + NormalFloat(Sign::POS, -100, MANTISSA), + NormalFloat(Sign::NEG, 100, MANTISSA), + NormalFloat(Sign::NEG, -100, MANTISSA), + // Subnormal numbers + NormalFloat(Sign::POS, -FPBits::EXP_BIAS, MANTISSA), + NormalFloat(Sign::NEG, -FPBits::EXP_BIAS, MANTISSA)}; for (int32_t exp = 0; exp <= FPBits::FRACTION_LEN; ++exp) { for (T x : val_array) { // We compare the result of ldexp with the result @@ -120,14 +121,14 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { } // Normal which trigger mantissa overflow. - T x = NormalFloat(-FPBits::EXP_BIAS + 1, - StorageType(2) * NormalFloat::ONE - StorageType(1), 0); + T x = NormalFloat(Sign::POS, -FPBits::EXP_BIAS + 1, + StorageType(2) * NormalFloat::ONE - StorageType(1)); ASSERT_FP_EQ(func(x, -1), x / 2); ASSERT_FP_EQ(func(-x, -1), -x / 2); // Start with a normal number high exponent but pass a very low number for // exp. The result should be a subnormal number. - x = NormalFloat(FPBits::EXP_BIAS, NormalFloat::ONE, 0); + x = NormalFloat(Sign::POS, FPBits::EXP_BIAS, NormalFloat::ONE); int exp = -FPBits::MAX_BIASED_EXPONENT - 5; T result = func(x, exp); FPBits result_bits(result); @@ -141,7 +142,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { // Start with a subnormal number but pass a very high number for exponent. // The result should not be infinity. 
-    x = NormalFloat(-FPBits::EXP_BIAS + 1, NormalFloat::ONE >> 10, 0);
+    x = NormalFloat(Sign::POS, -FPBits::EXP_BIAS + 1, NormalFloat::ONE >> 10);
     exp = FPBits::MAX_BIASED_EXPONENT + 5;
     ASSERT_FALSE(FPBits(func(x, exp)).is_inf());
     // But if the exp is large enough to oversome than the normalization shift,

From 4e7cf1b1ed3824d1298d4232922f54c6295eab50 Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Fri, 19 Jan 2024 10:08:03 +0100
Subject: [PATCH 094/843] [clang][Interp] Add an EvaluationResult class
 (#71315)

Add an `EvaluationResult` class. This contains the result either as a
`Pointer` or as an `APValue`. This way, we can inspect the result of the
evaluation and diagnose problems with it (e.g. uninitialized fields in
global initializers or pointers pointing to things they shouldn't point
to).

---
 clang/lib/AST/CMakeLists.txt              |   1 +
 clang/lib/AST/ExprConstant.cpp            |  23 ++-
 clang/lib/AST/Interp/ByteCodeEmitter.cpp  |  13 +-
 clang/lib/AST/Interp/ByteCodeEmitter.h    |  10 +-
 clang/lib/AST/Interp/ByteCodeExprGen.cpp  |  12 +-
 clang/lib/AST/Interp/ByteCodeExprGen.h    |   4 -
 clang/lib/AST/Interp/ByteCodeStmtGen.cpp  |   2 +-
 clang/lib/AST/Interp/Context.cpp          | 106 ++++++++----
 clang/lib/AST/Interp/Context.h            |   3 +
 clang/lib/AST/Interp/EvalEmitter.cpp      | 174 +++++--------------
 clang/lib/AST/Interp/EvalEmitter.h        |  15 +-
 clang/lib/AST/Interp/EvaluationResult.cpp | 196 ++++++++++++++++++++++
 clang/lib/AST/Interp/EvaluationResult.h   | 111 ++++++++++++
 clang/lib/AST/Interp/Interp.cpp           |  92 ----------
 clang/lib/AST/Interp/Interp.h             |  16 +-
 clang/lib/AST/Interp/Opcodes.td           |   2 -
 clang/lib/AST/Interp/Pointer.cpp          | 154 ++++++++++++++---
 clang/lib/AST/Interp/Pointer.h            |   3 +-
 clang/test/AST/Interp/records.cpp         |   1 -
 19 files changed, 599 insertions(+), 339 deletions(-)
 create mode 100644 clang/lib/AST/Interp/EvaluationResult.cpp
 create mode 100644 clang/lib/AST/Interp/EvaluationResult.h

diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt
index fe3f8c485ec1c..ebcb3952198a5 100644
--- a/clang/lib/AST/CMakeLists.txt
+++ b/clang/lib/AST/CMakeLists.txt
@@ -75,6 +75,7 @@ add_clang_library(clangAST
   Interp/Function.cpp
   Interp/InterpBuiltin.cpp
   Interp/Floating.cpp
+  Interp/EvaluationResult.cpp
   Interp/Interp.cpp
   Interp/InterpBlock.cpp
   Interp/InterpFrame.cpp
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 0884988c48b4e..f1d07d022b258 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -15439,11 +15439,13 @@ static bool EvaluateAsRValue(EvalInfo &Info, const Expr *E, APValue &Result) {
   if (Info.EnableNewConstInterp) {
     if (!Info.Ctx.getInterpContext().evaluateAsRValue(Info, E, Result))
       return false;
-  } else {
-    if (!::Evaluate(Result, Info, E))
-      return false;
+    return CheckConstantExpression(Info, E->getExprLoc(), E->getType(), Result,
+                                   ConstantExprKind::Normal);
   }

+  if (!::Evaluate(Result, Info, E))
+    return false;
+
   // Implicit lvalue-to-rvalue cast.
   if (E->isGLValue()) {
     LValue LV;
@@ -15671,6 +15673,13 @@ bool Expr::EvaluateAsConstantExpr(EvalResult &Result, const ASTContext &Ctx,
   EvalInfo Info(Ctx, Result, EM);
   Info.InConstantContext = true;

+  if (Info.EnableNewConstInterp) {
+    if (!Info.Ctx.getInterpContext().evaluate(Info, this, Result.Val))
+      return false;
+    return CheckConstantExpression(Info, getExprLoc(),
+                                   getStorageType(Ctx, this), Result.Val, Kind);
+  }
+
   // The type of the object we're initializing is 'const T' for a class NTTP.
QualType T = getType(); if (Kind == ConstantExprKind::ClassTemplateArgument) @@ -15746,10 +15755,16 @@ bool Expr::EvaluateAsInitializer(APValue &Value, const ASTContext &Ctx, Info.setEvaluatingDecl(VD, Value); Info.InConstantContext = IsConstantInitialization; + SourceLocation DeclLoc = VD->getLocation(); + QualType DeclTy = VD->getType(); + if (Info.EnableNewConstInterp) { auto &InterpCtx = const_cast(Ctx).getInterpContext(); if (!InterpCtx.evaluateAsInitializer(Info, VD, Value)) return false; + + return CheckConstantExpression(Info, DeclLoc, DeclTy, Value, + ConstantExprKind::Normal); } else { LValue LVal; LVal.set(VD); @@ -15779,8 +15794,6 @@ bool Expr::EvaluateAsInitializer(APValue &Value, const ASTContext &Ctx, llvm_unreachable("Unhandled cleanup; missing full expression marker?"); } - SourceLocation DeclLoc = VD->getLocation(); - QualType DeclTy = VD->getType(); return CheckConstantExpression(Info, DeclLoc, DeclTy, Value, ConstantExprKind::Normal) && CheckMemoryLeaks(Info); diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.cpp b/clang/lib/AST/Interp/ByteCodeEmitter.cpp index 045263447cbc9..fd2a92d9d3f91 100644 --- a/clang/lib/AST/Interp/ByteCodeEmitter.cpp +++ b/clang/lib/AST/Interp/ByteCodeEmitter.cpp @@ -20,8 +20,7 @@ using namespace clang; using namespace clang::interp; -Expected -ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { +Function *ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { // Set up argument indices. unsigned ParamOffset = 0; SmallVector ParamTypes; @@ -120,10 +119,6 @@ ByteCodeEmitter::compileFunc(const FunctionDecl *FuncDecl) { // Compile the function body. if (!IsEligibleForCompilation || !visitFunc(FuncDecl)) { - // Return a dummy function if compilation failed. - if (BailLocation) - return llvm::make_error(*BailLocation); - Func->setIsFullyCompiled(true); return Func; } @@ -183,12 +178,6 @@ int32_t ByteCodeEmitter::getOffset(LabelTy Label) { return 0ull; } -bool ByteCodeEmitter::bail(const SourceLocation &Loc) { - if (!BailLocation) - BailLocation = Loc; - return false; -} - /// Helper to write bytecode and bail out if 32-bit offsets become invalid. /// Pointers will be automatically marshalled as 32-bit IDs. template diff --git a/clang/lib/AST/Interp/ByteCodeEmitter.h b/clang/lib/AST/Interp/ByteCodeEmitter.h index 5520f8c300610..03de286582c91 100644 --- a/clang/lib/AST/Interp/ByteCodeEmitter.h +++ b/clang/lib/AST/Interp/ByteCodeEmitter.h @@ -17,7 +17,6 @@ #include "PrimType.h" #include "Program.h" #include "Source.h" -#include "llvm/Support/Error.h" namespace clang { namespace interp { @@ -32,7 +31,7 @@ class ByteCodeEmitter { public: /// Compiles the function into the module. - llvm::Expected compileFunc(const FunctionDecl *FuncDecl); + Function *compileFunc(const FunctionDecl *FuncDecl); protected: ByteCodeEmitter(Context &Ctx, Program &P) : Ctx(Ctx), P(P) {} @@ -49,11 +48,6 @@ class ByteCodeEmitter { virtual bool visitExpr(const Expr *E) = 0; virtual bool visitDecl(const VarDecl *E) = 0; - /// Bails out if a given node cannot be compiled. - bool bail(const Stmt *S) { return bail(S->getBeginLoc()); } - bool bail(const Decl *D) { return bail(D->getBeginLoc()); } - bool bail(const SourceLocation &Loc); - /// Emits jumps. bool jumpTrue(const LabelTy &Label); bool jumpFalse(const LabelTy &Label); @@ -81,8 +75,6 @@ class ByteCodeEmitter { LabelTy NextLabel = 0; /// Offset of the next local variable. unsigned NextLocalOffset = 0; - /// Location of a failure. - std::optional BailLocation; /// Label information for linker. 
llvm::DenseMap LabelOffsets; /// Location of label relocations. diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index 82ef743e6a781..0651828113268 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -398,7 +398,7 @@ bool ByteCodeExprGen::VisitBinaryOperator(const BinaryOperator *BO) { } if (!LT || !RT || !T) - return this->bail(BO); + return false; // Pointer arithmetic special case. if (BO->getOpcode() == BO_Add || BO->getOpcode() == BO_Sub) { @@ -478,7 +478,7 @@ bool ByteCodeExprGen::VisitBinaryOperator(const BinaryOperator *BO) { case BO_LAnd: llvm_unreachable("Already handled earlier"); default: - return this->bail(BO); + return false; } llvm_unreachable("Unhandled binary op"); @@ -531,7 +531,7 @@ bool ByteCodeExprGen::VisitPointerArithBinOp(const BinaryOperator *E) { else if (Op == BO_Sub) return this->emitSubOffset(OffsetType, E); - return this->bail(E); + return false; } template @@ -2389,7 +2389,9 @@ bool ByteCodeExprGen::visitDecl(const VarDecl *VD) { // Return the value if (VarT) return this->emitRet(*VarT, VD); - return this->emitRetValue(VD); + + // Return non-primitive values as pointers here. + return this->emitRet(PT_Ptr, VD); } template @@ -2409,7 +2411,7 @@ bool ByteCodeExprGen::visitVarDecl(const VarDecl *VD) { std::optional GlobalIndex = P.createGlobal(VD, Init); if (!GlobalIndex) - return this->bail(VD); + return false; assert(Init); { diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index 48005ce05724b..df4cb736299cb 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -191,10 +191,6 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, if (!visitInitializer(Init)) return false; - if ((Init->getType()->isArrayType() || Init->getType()->isRecordType()) && - !this->emitCheckGlobalCtor(Init)) - return false; - return this->emitPopPtr(Init); } diff --git a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp index 38067be73e254..a2d8c4e13010c 100644 --- a/clang/lib/AST/Interp/ByteCodeStmtGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeStmtGen.cpp @@ -282,7 +282,7 @@ bool ByteCodeStmtGen::visitStmt(const Stmt *S) { default: { if (auto *Exp = dyn_cast(S)) return this->discard(Exp); - return this->bail(S); + return false; } } } diff --git a/clang/lib/AST/Interp/Context.cpp b/clang/lib/AST/Interp/Context.cpp index 17abb71635839..75a300bcbace1 100644 --- a/clang/lib/AST/Interp/Context.cpp +++ b/clang/lib/AST/Interp/Context.cpp @@ -30,18 +30,8 @@ Context::~Context() {} bool Context::isPotentialConstantExpr(State &Parent, const FunctionDecl *FD) { assert(Stk.empty()); Function *Func = P->getFunction(FD); - if (!Func || !Func->hasBody()) { - if (auto R = ByteCodeStmtGen(*this, *P).compileFunc(FD)) { - Func = *R; - } else { - handleAllErrors(R.takeError(), [&Parent](ByteCodeGenError &Err) { - Parent.FFDiag(Err.getRange().getBegin(), - diag::err_experimental_clang_interp_failed) - << Err.getRange(); - }); - return false; - } - } + if (!Func || !Func->hasBody()) + Func = ByteCodeStmtGen(*this, *P).compileFunc(FD); APValue DummyResult; if (!Run(Parent, Func, DummyResult)) { @@ -54,36 +44,90 @@ bool Context::isPotentialConstantExpr(State &Parent, const FunctionDecl *FD) { bool Context::evaluateAsRValue(State &Parent, const Expr *E, APValue &Result) { assert(Stk.empty()); ByteCodeExprGen C(*this, *P, Parent, Stk, Result); - if (Check(Parent, C.interpretExpr(E))) { - 
assert(Stk.empty()); -#ifndef NDEBUG - // Make sure we don't rely on some value being still alive in - // InterpStack memory. + + auto Res = C.interpretExpr(E); + + if (Res.isInvalid()) { Stk.clear(); + return false; + } + + assert(Stk.empty()); +#ifndef NDEBUG + // Make sure we don't rely on some value being still alive in + // InterpStack memory. + Stk.clear(); #endif - return true; + + // Implicit lvalue-to-rvalue conversion. + if (E->isGLValue()) { + std::optional RValueResult = Res.toRValue(); + if (!RValueResult) { + return false; + } + Result = *RValueResult; + } else { + Result = Res.toAPValue(); } + return true; +} + +bool Context::evaluate(State &Parent, const Expr *E, APValue &Result) { + assert(Stk.empty()); + ByteCodeExprGen C(*this, *P, Parent, Stk, Result); + + auto Res = C.interpretExpr(E); + if (Res.isInvalid()) { + Stk.clear(); + return false; + } + + assert(Stk.empty()); +#ifndef NDEBUG + // Make sure we don't rely on some value being still alive in + // InterpStack memory. Stk.clear(); - return false; +#endif + Result = Res.toAPValue(); + return true; } bool Context::evaluateAsInitializer(State &Parent, const VarDecl *VD, APValue &Result) { assert(Stk.empty()); ByteCodeExprGen C(*this, *P, Parent, Stk, Result); - if (Check(Parent, C.interpretDecl(VD))) { - assert(Stk.empty()); -#ifndef NDEBUG - // Make sure we don't rely on some value being still alive in - // InterpStack memory. + + auto Res = C.interpretDecl(VD); + if (Res.isInvalid()) { Stk.clear(); -#endif - return true; + return false; } + assert(Stk.empty()); +#ifndef NDEBUG + // Make sure we don't rely on some value being still alive in + // InterpStack memory. Stk.clear(); - return false; +#endif + + // Ensure global variables are fully initialized. + if (shouldBeGloballyIndexed(VD) && !Res.isInvalid() && + (VD->getType()->isRecordType() || VD->getType()->isArrayType())) { + assert(Res.isLValue()); + + if (!Res.checkFullyInitialized(C.getState())) + return false; + + // lvalue-to-rvalue conversion. + std::optional RValueResult = Res.toRValue(); + if (!RValueResult) + return false; + Result = *RValueResult; + + } else + Result = Res.toAPValue(); + return true; } const LangOptions &Context::getLangOpts() const { return Ctx.getLangOpts(); } @@ -234,12 +278,8 @@ const Function *Context::getOrCreateFunction(const FunctionDecl *FD) { return Func; if (!Func || WasNotDefined) { - if (auto R = ByteCodeStmtGen(*this, *P).compileFunc(FD)) - Func = *R; - else { - llvm::consumeError(R.takeError()); - return nullptr; - } + if (auto F = ByteCodeStmtGen(*this, *P).compileFunc(FD)) + Func = F; } return Func; diff --git a/clang/lib/AST/Interp/Context.h b/clang/lib/AST/Interp/Context.h index 7649caef22428..ab83a8d132246 100644 --- a/clang/lib/AST/Interp/Context.h +++ b/clang/lib/AST/Interp/Context.h @@ -51,6 +51,9 @@ class Context final { /// Evaluates a toplevel expression as an rvalue. bool evaluateAsRValue(State &Parent, const Expr *E, APValue &Result); + /// Like evaluateAsRvalue(), but does no implicit lvalue-to-rvalue conversion. + bool evaluate(State &Parent, const Expr *E, APValue &Result); + /// Evaluates a toplevel initializer. 
bool evaluateAsInitializer(State &Parent, const VarDecl *VD, APValue &Result); diff --git a/clang/lib/AST/Interp/EvalEmitter.cpp b/clang/lib/AST/Interp/EvalEmitter.cpp index 0ff0bde8fd17e..a60f893de8bda 100644 --- a/clang/lib/AST/Interp/EvalEmitter.cpp +++ b/clang/lib/AST/Interp/EvalEmitter.cpp @@ -19,7 +19,7 @@ using namespace clang::interp; EvalEmitter::EvalEmitter(Context &Ctx, Program &P, State &Parent, InterpStack &Stk, APValue &Result) - : Ctx(Ctx), P(P), S(Parent, P, Stk, Ctx, this), Result(Result) { + : Ctx(Ctx), P(P), S(Parent, P, Stk, Ctx, this), EvalResult(&Ctx) { // Create a dummy frame for the interpreter which does not have locals. S.Current = new InterpFrame(S, /*Func=*/nullptr, /*Caller=*/nullptr, CodePtr()); @@ -33,20 +33,22 @@ EvalEmitter::~EvalEmitter() { } } -llvm::Expected EvalEmitter::interpretExpr(const Expr *E) { - if (this->visitExpr(E)) - return true; - if (BailLocation) - return llvm::make_error(*BailLocation); - return false; +EvaluationResult EvalEmitter::interpretExpr(const Expr *E) { + EvalResult.setSource(E); + + if (!this->visitExpr(E)) + EvalResult.setInvalid(); + + return std::move(this->EvalResult); } -llvm::Expected EvalEmitter::interpretDecl(const VarDecl *VD) { - if (this->visitDecl(VD)) - return true; - if (BailLocation) - return llvm::make_error(*BailLocation); - return false; +EvaluationResult EvalEmitter::interpretDecl(const VarDecl *VD) { + EvalResult.setSource(VD); + + if (!this->visitDecl(VD)) + EvalResult.setInvalid(); + + return std::move(this->EvalResult); } void EvalEmitter::emitLabel(LabelTy Label) { @@ -77,12 +79,6 @@ Scope::Local EvalEmitter::createLocal(Descriptor *D) { return {Off, D}; } -bool EvalEmitter::bail(const SourceLocation &Loc) { - if (!BailLocation) - BailLocation = Loc; - return false; -} - bool EvalEmitter::jumpTrue(const LabelTy &Label) { if (isActive()) { if (S.Stk.pop()) @@ -116,125 +112,37 @@ template bool EvalEmitter::emitRet(const SourceInfo &Info) { if (!isActive()) return true; using T = typename PrimConv::T; - return ReturnValue(S.Stk.pop(), Result); + EvalResult.setValue(S.Stk.pop().toAPValue()); + return true; } -bool EvalEmitter::emitRetVoid(const SourceInfo &Info) { return true; } +template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { + if (!isActive()) + return true; + EvalResult.setPointer(S.Stk.pop()); + return true; +} +template <> bool EvalEmitter::emitRet(const SourceInfo &Info) { + if (!isActive()) + return true; + EvalResult.setFunctionPointer(S.Stk.pop()); + return true; +} + +bool EvalEmitter::emitRetVoid(const SourceInfo &Info) { + EvalResult.setValid(); + return true; +} bool EvalEmitter::emitRetValue(const SourceInfo &Info) { - // Method to recursively traverse composites. 
- std::function Composite; - Composite = [this, &Composite](QualType Ty, const Pointer &Ptr, APValue &R) { - if (const auto *AT = Ty->getAs()) - Ty = AT->getValueType(); - - if (const auto *RT = Ty->getAs()) { - const auto *Record = Ptr.getRecord(); - assert(Record && "Missing record descriptor"); - - bool Ok = true; - if (RT->getDecl()->isUnion()) { - const FieldDecl *ActiveField = nullptr; - APValue Value; - for (const auto &F : Record->fields()) { - const Pointer &FP = Ptr.atField(F.Offset); - QualType FieldTy = F.Decl->getType(); - if (FP.isActive()) { - if (std::optional T = Ctx.classify(FieldTy)) { - TYPE_SWITCH(*T, Ok &= ReturnValue(FP.deref(), Value)); - } else { - Ok &= Composite(FieldTy, FP, Value); - } - break; - } - } - R = APValue(ActiveField, Value); - } else { - unsigned NF = Record->getNumFields(); - unsigned NB = Record->getNumBases(); - unsigned NV = Ptr.isBaseClass() ? 0 : Record->getNumVirtualBases(); - - R = APValue(APValue::UninitStruct(), NB, NF); - - for (unsigned I = 0; I < NF; ++I) { - const Record::Field *FD = Record->getField(I); - QualType FieldTy = FD->Decl->getType(); - const Pointer &FP = Ptr.atField(FD->Offset); - APValue &Value = R.getStructField(I); - - if (std::optional T = Ctx.classify(FieldTy)) { - TYPE_SWITCH(*T, Ok &= ReturnValue(FP.deref(), Value)); - } else { - Ok &= Composite(FieldTy, FP, Value); - } - } - - for (unsigned I = 0; I < NB; ++I) { - const Record::Base *BD = Record->getBase(I); - QualType BaseTy = Ctx.getASTContext().getRecordType(BD->Decl); - const Pointer &BP = Ptr.atField(BD->Offset); - Ok &= Composite(BaseTy, BP, R.getStructBase(I)); - } - - for (unsigned I = 0; I < NV; ++I) { - const Record::Base *VD = Record->getVirtualBase(I); - QualType VirtBaseTy = Ctx.getASTContext().getRecordType(VD->Decl); - const Pointer &VP = Ptr.atField(VD->Offset); - Ok &= Composite(VirtBaseTy, VP, R.getStructBase(NB + I)); - } - } - return Ok; - } - - if (Ty->isIncompleteArrayType()) { - R = APValue(APValue::UninitArray(), 0, 0); - return true; - } - - if (const auto *AT = Ty->getAsArrayTypeUnsafe()) { - const size_t NumElems = Ptr.getNumElems(); - QualType ElemTy = AT->getElementType(); - R = APValue(APValue::UninitArray{}, NumElems, NumElems); - - bool Ok = true; - for (unsigned I = 0; I < NumElems; ++I) { - APValue &Slot = R.getArrayInitializedElt(I); - const Pointer &EP = Ptr.atIndex(I); - if (std::optional T = Ctx.classify(ElemTy)) { - TYPE_SWITCH(*T, Ok &= ReturnValue(EP.deref(), Slot)); - } else { - Ok &= Composite(ElemTy, EP.narrow(), Slot); - } - } - return Ok; - } - - // Complex types. - if (const auto *CT = Ty->getAs()) { - QualType ElemTy = CT->getElementType(); - std::optional ElemT = Ctx.classify(ElemTy); - assert(ElemT); - - if (ElemTy->isIntegerType()) { - INT_TYPE_SWITCH(*ElemT, { - auto V1 = Ptr.atIndex(0).deref(); - auto V2 = Ptr.atIndex(1).deref(); - Result = APValue(V1.toAPSInt(), V2.toAPSInt()); - return true; - }); - } else if (ElemTy->isFloatingType()) { - Result = APValue(Ptr.atIndex(0).deref().getAPFloat(), - Ptr.atIndex(1).deref().getAPFloat()); - return true; - } - return false; - } - llvm_unreachable("invalid value to return"); - }; - - // Return the composite type. 
const auto &Ptr = S.Stk.pop(); - return Composite(Ptr.getType(), Ptr, Result); + if (std::optional APV = Ptr.toRValue(S.getCtx())) { + EvalResult.setValue(*APV); + return true; + } + + EvalResult.setInvalid(); + return false; } bool EvalEmitter::emitGetPtrLocal(uint32_t I, const SourceInfo &Info) { diff --git a/clang/lib/AST/Interp/EvalEmitter.h b/clang/lib/AST/Interp/EvalEmitter.h index 5a9be18c34a03..deb2ebc4e61fa 100644 --- a/clang/lib/AST/Interp/EvalEmitter.h +++ b/clang/lib/AST/Interp/EvalEmitter.h @@ -13,6 +13,7 @@ #ifndef LLVM_CLANG_AST_INTERP_EVALEMITTER_H #define LLVM_CLANG_AST_INTERP_EVALEMITTER_H +#include "EvaluationResult.h" #include "InterpState.h" #include "PrimType.h" #include "Source.h" @@ -33,8 +34,10 @@ class EvalEmitter : public SourceMapper { using AddrTy = uintptr_t; using Local = Scope::Local; - llvm::Expected interpretExpr(const Expr *E); - llvm::Expected interpretDecl(const VarDecl *VD); + EvaluationResult interpretExpr(const Expr *E); + EvaluationResult interpretDecl(const VarDecl *VD); + + InterpState &getState() { return S; } protected: EvalEmitter(Context &Ctx, Program &P, State &Parent, InterpStack &Stk, @@ -51,10 +54,6 @@ class EvalEmitter : public SourceMapper { virtual bool visitExpr(const Expr *E) = 0; virtual bool visitDecl(const VarDecl *VD) = 0; - bool bail(const Stmt *S) { return bail(S->getBeginLoc()); } - bool bail(const Decl *D) { return bail(D->getBeginLoc()); } - bool bail(const SourceLocation &Loc); - /// Emits jumps. bool jumpTrue(const LabelTy &Label); bool jumpFalse(const LabelTy &Label); @@ -86,7 +85,7 @@ class EvalEmitter : public SourceMapper { /// Callee evaluation state. InterpState S; /// Location to write the result to. - APValue &Result; + EvaluationResult EvalResult; /// Temporaries which require storage. llvm::DenseMap> Locals; @@ -100,8 +99,6 @@ class EvalEmitter : public SourceMapper { // The emitter always tracks the current instruction and sets OpPC to a token // value which is mapped to the location of the opcode being evaluated. CodePtr OpPC; - /// Location of a failure. - std::optional BailLocation; /// Location of the current instruction. SourceInfo CurrentSource; diff --git a/clang/lib/AST/Interp/EvaluationResult.cpp b/clang/lib/AST/Interp/EvaluationResult.cpp new file mode 100644 index 0000000000000..a14dc87f1dfde --- /dev/null +++ b/clang/lib/AST/Interp/EvaluationResult.cpp @@ -0,0 +1,196 @@ +//===----- EvaluationResult.cpp - Result class for the VM ------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "EvaluationResult.h" +#include "Context.h" +#include "InterpState.h" +#include "Record.h" +#include "clang/AST/ExprCXX.h" + +namespace clang { +namespace interp { + +APValue EvaluationResult::toAPValue() const { + assert(!empty()); + switch (Kind) { + case LValue: + // Either a pointer or a function pointer. 
+ if (const auto *P = std::get_if(&Value)) + return P->toAPValue(); + else if (const auto *FP = std::get_if(&Value)) + return FP->toAPValue(); + else + llvm_unreachable("Unhandled LValue type"); + break; + case RValue: + return std::get(Value); + case Valid: + return APValue(); + default: + llvm_unreachable("Unhandled result kind?"); + } +} + +std::optional EvaluationResult::toRValue() const { + if (Kind == RValue) + return toAPValue(); + + assert(Kind == LValue); + + // We have a pointer and want an RValue. + if (const auto *P = std::get_if(&Value)) + return P->toRValue(*Ctx); + else if (const auto *FP = std::get_if(&Value)) // Nope + return FP->toAPValue(); + llvm_unreachable("Unhandled lvalue kind"); +} + +static void DiagnoseUninitializedSubobject(InterpState &S, SourceLocation Loc, + const FieldDecl *SubObjDecl) { + assert(SubObjDecl && "Subobject declaration does not exist"); + S.FFDiag(Loc, diag::note_constexpr_uninitialized) + << /*(name)*/ 1 << SubObjDecl; + S.Note(SubObjDecl->getLocation(), + diag::note_constexpr_subobject_declared_here); +} + +static bool CheckFieldsInitialized(InterpState &S, SourceLocation Loc, + const Pointer &BasePtr, const Record *R); + +static bool CheckArrayInitialized(InterpState &S, SourceLocation Loc, + const Pointer &BasePtr, + const ConstantArrayType *CAT) { + bool Result = true; + size_t NumElems = CAT->getSize().getZExtValue(); + QualType ElemType = CAT->getElementType(); + + if (ElemType->isRecordType()) { + const Record *R = BasePtr.getElemRecord(); + for (size_t I = 0; I != NumElems; ++I) { + Pointer ElemPtr = BasePtr.atIndex(I).narrow(); + Result &= CheckFieldsInitialized(S, Loc, ElemPtr, R); + } + } else if (const auto *ElemCAT = dyn_cast(ElemType)) { + for (size_t I = 0; I != NumElems; ++I) { + Pointer ElemPtr = BasePtr.atIndex(I).narrow(); + Result &= CheckArrayInitialized(S, Loc, ElemPtr, ElemCAT); + } + } else { + for (size_t I = 0; I != NumElems; ++I) { + if (!BasePtr.atIndex(I).isInitialized()) { + DiagnoseUninitializedSubobject(S, Loc, BasePtr.getField()); + Result = false; + } + } + } + + return Result; +} + +static bool CheckFieldsInitialized(InterpState &S, SourceLocation Loc, + const Pointer &BasePtr, const Record *R) { + assert(R); + bool Result = true; + // Check all fields of this record are initialized. + for (const Record::Field &F : R->fields()) { + Pointer FieldPtr = BasePtr.atField(F.Offset); + QualType FieldType = F.Decl->getType(); + + if (FieldType->isRecordType()) { + Result &= CheckFieldsInitialized(S, Loc, FieldPtr, FieldPtr.getRecord()); + } else if (FieldType->isIncompleteArrayType()) { + // Nothing to do here. + } else if (FieldType->isArrayType()) { + const auto *CAT = + cast(FieldType->getAsArrayTypeUnsafe()); + Result &= CheckArrayInitialized(S, Loc, FieldPtr, CAT); + } else if (!FieldPtr.isInitialized()) { + DiagnoseUninitializedSubobject(S, Loc, F.Decl); + Result = false; + } + } + + // Check Fields in all bases + for (const Record::Base &B : R->bases()) { + Pointer P = BasePtr.atField(B.Offset); + if (!P.isInitialized()) { + S.FFDiag(BasePtr.getDeclDesc()->asDecl()->getLocation(), + diag::note_constexpr_uninitialized_base) + << B.Desc->getType(); + return false; + } + Result &= CheckFieldsInitialized(S, Loc, P, B.R); + } + + // TODO: Virtual bases + + return Result; +} + +bool EvaluationResult::checkFullyInitialized(InterpState &S) const { + assert(Source); + assert(isLValue()); + + // Our Source must be a VarDecl. 
+ const Decl *SourceDecl = Source.dyn_cast(); + assert(SourceDecl); + const auto *VD = cast(SourceDecl); + assert(VD->getType()->isRecordType() || VD->getType()->isArrayType()); + SourceLocation InitLoc = VD->getAnyInitializer()->getExprLoc(); + + const Pointer &Ptr = *std::get_if(&Value); + assert(!Ptr.isZero()); + + if (const Record *R = Ptr.getRecord()) + return CheckFieldsInitialized(S, InitLoc, Ptr, R); + const auto *CAT = + cast(Ptr.getType()->getAsArrayTypeUnsafe()); + return CheckArrayInitialized(S, InitLoc, Ptr, CAT); + + return true; +} + +void EvaluationResult::dump() const { + assert(Ctx); + auto &OS = llvm::errs(); + const ASTContext &ASTCtx = Ctx->getASTContext(); + + switch (Kind) { + case Empty: + OS << "Empty\n"; + break; + case RValue: + OS << "RValue: "; + std::get(Value).dump(OS, ASTCtx); + break; + case LValue: { + assert(Source); + QualType SourceType; + if (const auto *D = Source.dyn_cast()) { + if (const auto *VD = dyn_cast(D)) + SourceType = VD->getType(); + } else if (const auto *E = Source.dyn_cast()) { + SourceType = E->getType(); + } + + OS << "LValue: "; + if (const auto *P = std::get_if(&Value)) + P->toAPValue().printPretty(OS, ASTCtx, SourceType); + else if (const auto *FP = std::get_if(&Value)) // Nope + FP->toAPValue().printPretty(OS, ASTCtx, SourceType); + OS << "\n"; + break; + } + + default: + llvm_unreachable("Can't print that."); + } +} + +} // namespace interp +} // namespace clang diff --git a/clang/lib/AST/Interp/EvaluationResult.h b/clang/lib/AST/Interp/EvaluationResult.h new file mode 100644 index 0000000000000..2b9fc16f1a0ab --- /dev/null +++ b/clang/lib/AST/Interp/EvaluationResult.h @@ -0,0 +1,111 @@ +//===------ EvaluationResult.h - Result class for the VM -------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_INTERP_EVALUATION_RESULT_H +#define LLVM_CLANG_AST_INTERP_EVALUATION_RESULT_H + +#include "FunctionPointer.h" +#include "Pointer.h" +#include "clang/AST/APValue.h" +#include "clang/AST/Decl.h" +#include "clang/AST/Expr.h" +#include +#include + +namespace clang { +namespace interp { +class EvalEmitter; +class Context; + +/// Defines the result of an evaluation. +/// +/// The result might be in different forms--one of the pointer types, +/// an APValue, or nothing. +/// +/// We use this class to inspect and diagnose the result, as well as +/// convert it to the requested form. +class EvaluationResult final { +public: + enum ResultKind { + Empty, // Initial state. + LValue, // Result is an lvalue/pointer. + RValue, // Result is an rvalue. + Invalid, // Result is invalid. + Valid, // Result is valid and empty. + }; + + using DeclTy = llvm::PointerUnion; + +private: + const Context *Ctx = nullptr; + std::variant Value; + ResultKind Kind = Empty; + DeclTy Source = nullptr; // Currently only needed for dump(). + + EvaluationResult(ResultKind Kind) : Kind(Kind) { + // Leave everything empty. Can be used as an + // error marker or for void return values. 
+ assert(Kind == Valid || Kind == Invalid); + } + + void setSource(DeclTy D) { Source = D; } + + void setValue(const APValue &V) { + assert(empty()); + assert(!V.isLValue()); + Value = std::move(V); + Kind = RValue; + } + void setPointer(const Pointer P) { + assert(empty()); + Value = P; + Kind = LValue; + } + void setFunctionPointer(const FunctionPointer &P) { + assert(empty()); + Value = P; + Kind = LValue; + } + void setInvalid() { + assert(empty()); + Kind = Invalid; + } + void setValid() { + assert(empty()); + Kind = Valid; + } + +public: + EvaluationResult(const Context *Ctx) : Ctx(Ctx) {} + + bool empty() const { return Kind == Empty; } + bool isInvalid() const { return Kind == Invalid; } + bool isLValue() const { return Kind == LValue; } + bool isRValue() const { return Kind == RValue; } + + /// Returns an APValue for the evaluation result. The returned + /// APValue might be an LValue or RValue. + APValue toAPValue() const; + + /// If the result is an LValue, convert that to an RValue + /// and return it. This may fail, e.g. if the result is an + /// LValue and we can't read from it. + std::optional toRValue() const; + + bool checkFullyInitialized(InterpState &S) const; + + /// Dump to stderr. + void dump() const; + + friend class EvalEmitter; +}; + +} // namespace interp +} // namespace clang + +#endif diff --git a/clang/lib/AST/Interp/Interp.cpp b/clang/lib/AST/Interp/Interp.cpp index ac03837408a4d..807b860f3565d 100644 --- a/clang/lib/AST/Interp/Interp.cpp +++ b/clang/lib/AST/Interp/Interp.cpp @@ -498,98 +498,6 @@ bool CheckPure(InterpState &S, CodePtr OpPC, const CXXMethodDecl *MD) { return false; } -static void DiagnoseUninitializedSubobject(InterpState &S, const SourceInfo &SI, - const FieldDecl *SubObjDecl) { - assert(SubObjDecl && "Subobject declaration does not exist"); - S.FFDiag(SI, diag::note_constexpr_uninitialized) - << /*(name)*/ 1 << SubObjDecl; - S.Note(SubObjDecl->getLocation(), - diag::note_constexpr_subobject_declared_here); -} - -static bool CheckFieldsInitialized(InterpState &S, CodePtr OpPC, - const Pointer &BasePtr, const Record *R); - -static bool CheckArrayInitialized(InterpState &S, CodePtr OpPC, - const Pointer &BasePtr, - const ConstantArrayType *CAT) { - bool Result = true; - size_t NumElems = CAT->getSize().getZExtValue(); - QualType ElemType = CAT->getElementType(); - - if (ElemType->isRecordType()) { - const Record *R = BasePtr.getElemRecord(); - for (size_t I = 0; I != NumElems; ++I) { - Pointer ElemPtr = BasePtr.atIndex(I).narrow(); - Result &= CheckFieldsInitialized(S, OpPC, ElemPtr, R); - } - } else if (const auto *ElemCAT = dyn_cast(ElemType)) { - for (size_t I = 0; I != NumElems; ++I) { - Pointer ElemPtr = BasePtr.atIndex(I).narrow(); - Result &= CheckArrayInitialized(S, OpPC, ElemPtr, ElemCAT); - } - } else { - for (size_t I = 0; I != NumElems; ++I) { - if (!BasePtr.atIndex(I).isInitialized()) { - DiagnoseUninitializedSubobject(S, S.Current->getSource(OpPC), - BasePtr.getField()); - Result = false; - } - } - } - - return Result; -} - -static bool CheckFieldsInitialized(InterpState &S, CodePtr OpPC, - const Pointer &BasePtr, const Record *R) { - assert(R); - bool Result = true; - // Check all fields of this record are initialized. 
- for (const Record::Field &F : R->fields()) { - Pointer FieldPtr = BasePtr.atField(F.Offset); - QualType FieldType = F.Decl->getType(); - - if (FieldType->isRecordType()) { - Result &= CheckFieldsInitialized(S, OpPC, FieldPtr, FieldPtr.getRecord()); - } else if (FieldType->isIncompleteArrayType()) { - // Nothing to do here. - } else if (FieldType->isArrayType()) { - const auto *CAT = - cast(FieldType->getAsArrayTypeUnsafe()); - Result &= CheckArrayInitialized(S, OpPC, FieldPtr, CAT); - } else if (!FieldPtr.isInitialized()) { - DiagnoseUninitializedSubobject(S, S.Current->getSource(OpPC), F.Decl); - Result = false; - } - } - - // Check Fields in all bases - for (const Record::Base &B : R->bases()) { - Pointer P = BasePtr.atField(B.Offset); - if (!P.isInitialized()) { - S.FFDiag(BasePtr.getDeclDesc()->asDecl()->getLocation(), - diag::note_constexpr_uninitialized_base) - << B.Desc->getType(); - return false; - } - Result &= CheckFieldsInitialized(S, OpPC, P, B.R); - } - - // TODO: Virtual bases - - return Result; -} - -bool CheckCtorCall(InterpState &S, CodePtr OpPC, const Pointer &This) { - assert(!This.isZero()); - if (const Record *R = This.getRecord()) - return CheckFieldsInitialized(S, OpPC, This, R); - const auto *CAT = - cast(This.getType()->getAsArrayTypeUnsafe()); - return CheckArrayInitialized(S, OpPC, This, CAT); -} - bool CheckPotentialReinterpretCast(InterpState &S, CodePtr OpPC, const Pointer &Ptr) { if (!S.inConstantContext()) diff --git a/clang/lib/AST/Interp/Interp.h b/clang/lib/AST/Interp/Interp.h index 5321f9617feff..65c54ed9c89b6 100644 --- a/clang/lib/AST/Interp/Interp.h +++ b/clang/lib/AST/Interp/Interp.h @@ -111,9 +111,6 @@ bool CheckThis(InterpState &S, CodePtr OpPC, const Pointer &This); /// Checks if a method is pure virtual. bool CheckPure(InterpState &S, CodePtr OpPC, const CXXMethodDecl *MD); -/// Checks that all fields are initialized after a constructor call. -bool CheckCtorCall(InterpState &S, CodePtr OpPC, const Pointer &This); - /// Checks if reinterpret casts are legal in the current context. 
bool CheckPotentialReinterpretCast(InterpState &S, CodePtr OpPC, const Pointer &Ptr); @@ -1061,8 +1058,12 @@ inline bool InitGlobalTempComp(InterpState &S, CodePtr OpPC, const Pointer &P = S.Stk.peek(); APValue *Cached = Temp->getOrCreateValue(true); - *Cached = P.toRValue(S.getCtx()); - return true; + if (std::optional APV = P.toRValue(S.getCtx())) { + *Cached = *APV; + return true; + } + + return false; } template ::T> @@ -1863,11 +1864,6 @@ inline bool ArrayElemPtrPop(InterpState &S, CodePtr OpPC) { return NarrowPtr(S, OpPC); } -inline bool CheckGlobalCtor(InterpState &S, CodePtr OpPC) { - const Pointer &Obj = S.Stk.peek(); - return CheckCtorCall(S, OpPC, Obj); -} - inline bool Call(InterpState &S, CodePtr OpPC, const Function *Func) { if (Func->hasThisPointer()) { size_t ThisOffset = diff --git a/clang/lib/AST/Interp/Opcodes.td b/clang/lib/AST/Interp/Opcodes.td index 6e0f1c939460d..24747b6b98c16 100644 --- a/clang/lib/AST/Interp/Opcodes.td +++ b/clang/lib/AST/Interp/Opcodes.td @@ -375,8 +375,6 @@ def GetLocal : AccessOpcode { let HasCustomEval = 1; } // [] -> [Pointer] def SetLocal : AccessOpcode { let HasCustomEval = 1; } -def CheckGlobalCtor : Opcode {} - // [] -> [Value] def GetGlobal : AccessOpcode; def GetGlobalUnchecked : AccessOpcode; diff --git a/clang/lib/AST/Interp/Pointer.cpp b/clang/lib/AST/Interp/Pointer.cpp index e979b99b0fdd0..5af1d6d52e93e 100644 --- a/clang/lib/AST/Interp/Pointer.cpp +++ b/clang/lib/AST/Interp/Pointer.cpp @@ -231,33 +231,143 @@ bool Pointer::hasSameArray(const Pointer &A, const Pointer &B) { return hasSameBase(A, B) && A.Base == B.Base && A.getFieldDesc()->IsArray; } -APValue Pointer::toRValue(const Context &Ctx) const { - // Primitives. - if (getFieldDesc()->isPrimitive()) { - PrimType PT = *Ctx.classify(getType()); - TYPE_SWITCH(PT, return deref().toAPValue()); - llvm_unreachable("Unhandled PrimType?"); - } +std::optional Pointer::toRValue(const Context &Ctx) const { + // Method to recursively traverse composites. + std::function Composite; + Composite = [&Composite, &Ctx](QualType Ty, const Pointer &Ptr, APValue &R) { + if (const auto *AT = Ty->getAs()) + Ty = AT->getValueType(); + + // Invalid pointers. + if (Ptr.isDummy() || !Ptr.isLive() || + (!Ptr.isUnknownSizeArray() && Ptr.isOnePastEnd())) + return false; - APValue Result; - // Records. - if (getFieldDesc()->isRecord()) { - const Record *R = getRecord(); - Result = - APValue(APValue::UninitStruct(), R->getNumBases(), R->getNumFields()); - - for (unsigned I = 0; I != R->getNumFields(); ++I) { - const Pointer &FieldPtr = this->atField(R->getField(I)->Offset); - Result.getStructField(I) = FieldPtr.toRValue(Ctx); + // Primitive values. 
+ if (std::optional T = Ctx.classify(Ty)) { + if (T == PT_Ptr || T == PT_FnPtr) { + R = Ptr.toAPValue(); + } else { + TYPE_SWITCH(*T, R = Ptr.deref().toAPValue()); + } + return true; } - for (unsigned I = 0; I != R->getNumBases(); ++I) { - const Pointer &BasePtr = this->atField(R->getBase(I)->Offset); - Result.getStructBase(I) = BasePtr.toRValue(Ctx); + if (const auto *RT = Ty->getAs()) { + const auto *Record = Ptr.getRecord(); + assert(Record && "Missing record descriptor"); + + bool Ok = true; + if (RT->getDecl()->isUnion()) { + const FieldDecl *ActiveField = nullptr; + APValue Value; + for (const auto &F : Record->fields()) { + const Pointer &FP = Ptr.atField(F.Offset); + QualType FieldTy = F.Decl->getType(); + if (FP.isActive()) { + if (std::optional T = Ctx.classify(FieldTy)) { + TYPE_SWITCH(*T, Value = FP.deref().toAPValue()); + } else { + Ok &= Composite(FieldTy, FP, Value); + } + break; + } + } + R = APValue(ActiveField, Value); + } else { + unsigned NF = Record->getNumFields(); + unsigned NB = Record->getNumBases(); + unsigned NV = Ptr.isBaseClass() ? 0 : Record->getNumVirtualBases(); + + R = APValue(APValue::UninitStruct(), NB, NF); + + for (unsigned I = 0; I < NF; ++I) { + const Record::Field *FD = Record->getField(I); + QualType FieldTy = FD->Decl->getType(); + const Pointer &FP = Ptr.atField(FD->Offset); + APValue &Value = R.getStructField(I); + + if (std::optional T = Ctx.classify(FieldTy)) { + TYPE_SWITCH(*T, Value = FP.deref().toAPValue()); + } else { + Ok &= Composite(FieldTy, FP, Value); + } + } + + for (unsigned I = 0; I < NB; ++I) { + const Record::Base *BD = Record->getBase(I); + QualType BaseTy = Ctx.getASTContext().getRecordType(BD->Decl); + const Pointer &BP = Ptr.atField(BD->Offset); + Ok &= Composite(BaseTy, BP, R.getStructBase(I)); + } + + for (unsigned I = 0; I < NV; ++I) { + const Record::Base *VD = Record->getVirtualBase(I); + QualType VirtBaseTy = Ctx.getASTContext().getRecordType(VD->Decl); + const Pointer &VP = Ptr.atField(VD->Offset); + Ok &= Composite(VirtBaseTy, VP, R.getStructBase(NB + I)); + } + } + return Ok; } - } - // TODO: Arrays + if (Ty->isIncompleteArrayType()) { + R = APValue(APValue::UninitArray(), 0, 0); + return true; + } + + if (const auto *AT = Ty->getAsArrayTypeUnsafe()) { + const size_t NumElems = Ptr.getNumElems(); + QualType ElemTy = AT->getElementType(); + R = APValue(APValue::UninitArray{}, NumElems, NumElems); + + bool Ok = true; + for (unsigned I = 0; I < NumElems; ++I) { + APValue &Slot = R.getArrayInitializedElt(I); + const Pointer &EP = Ptr.atIndex(I); + if (std::optional T = Ctx.classify(ElemTy)) { + TYPE_SWITCH(*T, Slot = EP.deref().toAPValue()); + } else { + Ok &= Composite(ElemTy, EP.narrow(), Slot); + } + } + return Ok; + } + // Complex types. + if (const auto *CT = Ty->getAs()) { + QualType ElemTy = CT->getElementType(); + std::optional ElemT = Ctx.classify(ElemTy); + assert(ElemT); + + if (ElemTy->isIntegerType()) { + INT_TYPE_SWITCH(*ElemT, { + auto V1 = Ptr.atIndex(0).deref(); + auto V2 = Ptr.atIndex(1).deref(); + R = APValue(V1.toAPSInt(), V2.toAPSInt()); + return true; + }); + } else if (ElemTy->isFloatingType()) { + R = APValue(Ptr.atIndex(0).deref().getAPFloat(), + Ptr.atIndex(1).deref().getAPFloat()); + return true; + } + return false; + } + + llvm_unreachable("invalid value to return"); + }; + + if (isZero()) + return APValue(static_cast(nullptr), CharUnits::Zero(), {}, false, + true); + + if (isDummy() || !isLive()) + return std::nullopt; + + // Return the composite type. 
+ APValue Result; + if (!Composite(getType(), *this, Result)) + return std::nullopt; return Result; } diff --git a/clang/lib/AST/Interp/Pointer.h b/clang/lib/AST/Interp/Pointer.h index a8f6e62fa76d3..8ccaff41ded8d 100644 --- a/clang/lib/AST/Interp/Pointer.h +++ b/clang/lib/AST/Interp/Pointer.h @@ -98,7 +98,7 @@ class Pointer { } /// Converts the pointer to an APValue that is an rvalue. - APValue toRValue(const Context &Ctx) const; + std::optional toRValue(const Context &Ctx) const; /// Offsets a pointer inside an array. [[nodiscard]] Pointer atIndex(unsigned Idx) const { @@ -379,6 +379,7 @@ class Pointer { return *reinterpret_cast(Pointee->rawData() + Base + sizeof(InitMapPtr)); + assert(Offset + sizeof(T) <= Pointee->getDescriptor()->getAllocSize()); return *reinterpret_cast(Pointee->rawData() + Offset); } diff --git a/clang/test/AST/Interp/records.cpp b/clang/test/AST/Interp/records.cpp index 800dc6f910be6..c6aac7938a297 100644 --- a/clang/test/AST/Interp/records.cpp +++ b/clang/test/AST/Interp/records.cpp @@ -170,7 +170,6 @@ class Bar { // expected-note {{definition of 'Bar' is not complete}} \ // ref-error {{has incomplete type 'const Bar'}} }; constexpr Bar B; // expected-error {{must be initialized by a constant expression}} \ - // expected-error {{failed to evaluate an expression}} \ // ref-error {{must be initialized by a constant expression}} constexpr Bar *pb = nullptr; From 805035931ececce8a856f31c576e8565ba078f93 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 19 Jan 2024 09:08:30 +0000 Subject: [PATCH 095/843] [gn build] Port 4e7cf1b1ed38 --- llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn index 2076394ffe085..ff793190bd280 100644 --- a/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/AST/BUILD.gn @@ -99,6 +99,7 @@ static_library("AST") { "Interp/Descriptor.cpp", "Interp/Disasm.cpp", "Interp/EvalEmitter.cpp", + "Interp/EvaluationResult.cpp", "Interp/Floating.cpp", "Interp/Frame.cpp", "Interp/Function.cpp", From 4fc128f817b425657049922175619addb04c8f41 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 19 Jan 2024 10:20:41 +0100 Subject: [PATCH 096/843] [mlir][bufferization][NFC] Clean up code (#78594) Clean up code and remove dead code. --- .../Dialect/Bufferization/IR/BufferizableOpInterface.h | 3 --- .../Bufferization/IR/BufferizableOpInterface.cpp | 7 ------- .../Transforms/BufferDeallocationSimplification.cpp | 10 ++-------- 3 files changed, 2 insertions(+), 18 deletions(-) diff --git a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h index 63e2d19e68ef9..226a2fbd08563 100644 --- a/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h +++ b/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h @@ -375,9 +375,6 @@ struct BufferizationOptions { SmallVector stateInitializers; }; -/// Return `true` if the given value is a BlockArgument of a func::FuncOp. -bool isFunctionArgument(Value value); - /// Traversal parameters for `findValueInReverseUseDefChain`. 
struct TraversalConfig {
  /// Specifies if leaves (that do not have further OpOperands to follow)
diff --git a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
index 4b1dfee4a2b92..6ca9702cbbc66 100644
--- a/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
+++ b/mlir/lib/Dialect/Bufferization/IR/BufferizableOpInterface.cpp
@@ -759,13 +759,6 @@ LogicalResult BufferizationOptions::createMemCpy(OpBuilder &b, Location loc,
 // Bufferization-specific IRMapping support with debugging.
 //===----------------------------------------------------------------------===//
 
-bool bufferization::isFunctionArgument(Value value) {
-  auto bbArg = llvm::dyn_cast<BlockArgument>(value);
-  if (!bbArg)
-    return false;
-  return isa<func::FuncOp>(bbArg.getOwner()->getParentOp());
-}
-
 BaseMemRefType bufferization::getMemRefType(Value value,
                                             const BufferizationOptions &options,
                                             MemRefLayoutAttrInterface layout,
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
index 75d65193809f1..4d1e21c2e406c 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp
@@ -240,13 +240,6 @@ struct RemoveRetainedMemrefsGuaranteedToNotAlias
   LogicalResult matchAndRewrite(DeallocOp deallocOp,
                                 PatternRewriter &rewriter) const override {
     SmallVector<Value> newRetainedMemrefs, replacements;
-    Value falseValue;
-    auto getOrCreateFalse = [&]() -> Value {
-      if (!falseValue)
-        falseValue = rewriter.create<arith::ConstantOp>(
-            deallocOp.getLoc(), rewriter.getBoolAttr(false));
-      return falseValue;
-    };
 
     for (auto retainedMemref : deallocOp.getRetained()) {
       if (potentiallyAliasesMemref(aliasAnalysis, deallocOp.getMemrefs(),
@@ -256,7 +249,8 @@ struct RemoveRetainedMemrefsGuaranteedToNotAlias
         continue;
       }
 
-      replacements.push_back(getOrCreateFalse());
+      replacements.push_back(rewriter.create<arith::ConstantOp>(
+          deallocOp.getLoc(), rewriter.getBoolAttr(false)));
     }
 
     if (newRetainedMemrefs.size() == deallocOp.getRetained().size())

From 10317da20309378365e116ce5d4f87c40d6ad25d Mon Sep 17 00:00:00 2001
From: Kareem Ergawy
Date: Fri, 19 Jan 2024 10:40:25 +0100
Subject: [PATCH 097/843] [flang][re-apply] Fix seg fault in
 CodeGenAction::executeAction() (#78672)

If `generateLLVMIR()` fails, we still continue using the module we
failed to generate, which causes a seg fault whenever LLVM code
generation fails for one reason or another. This commit fixes the
issue.

Re-applies PR #78269 and adds the LLVM and MLIR dependencies that were
missed in that PR. The missing libs were: `LLVMCore` & `MLIRIR`.

This reverts commit 4fc75062745eb5232ea60c37b9ffe61177efa12a.
---
 flang/lib/Frontend/FrontendActions.cpp        |   5 +
 flang/unittests/Frontend/CMakeLists.txt       |   3 +
 .../unittests/Frontend/CodeGenActionTest.cpp  | 109 ++++++++++++++++++
 3 files changed, 117 insertions(+)
 create mode 100644 flang/unittests/Frontend/CodeGenActionTest.cpp

diff --git a/flang/lib/Frontend/FrontendActions.cpp b/flang/lib/Frontend/FrontendActions.cpp
index 74e3992d5ab62..65c4df7388f97 100644
--- a/flang/lib/Frontend/FrontendActions.cpp
+++ b/flang/lib/Frontend/FrontendActions.cpp
@@ -1202,6 +1202,11 @@ void CodeGenAction::executeAction() {
   if (!llvmModule)
     generateLLVMIR();
 
+  // If generating the LLVM module failed, abort! No need for further error
+  // reporting since generateLLVMIR() does this already.
+ if (!llvmModule) + return; + // Set the triple based on the targetmachine (this comes compiler invocation // and the command-line target option if specified, or the default if not // given on the command-line). diff --git a/flang/unittests/Frontend/CMakeLists.txt b/flang/unittests/Frontend/CMakeLists.txt index 79a394f161ed1..22c568af3d121 100644 --- a/flang/unittests/Frontend/CMakeLists.txt +++ b/flang/unittests/Frontend/CMakeLists.txt @@ -1,9 +1,11 @@ set(LLVM_LINK_COMPONENTS ${LLVM_TARGETS_TO_BUILD} TargetParser + Core ) add_flang_unittest(FlangFrontendTests + CodeGenActionTest.cpp CompilerInstanceTest.cpp FrontendActionTest.cpp ) @@ -18,4 +20,5 @@ target_link_libraries(FlangFrontendTests FortranSemantics FortranCommon FortranEvaluate + MLIRIR ) diff --git a/flang/unittests/Frontend/CodeGenActionTest.cpp b/flang/unittests/Frontend/CodeGenActionTest.cpp new file mode 100644 index 0000000000000..9d798c7678ad1 --- /dev/null +++ b/flang/unittests/Frontend/CodeGenActionTest.cpp @@ -0,0 +1,109 @@ +//===- unittests/Frontend/CodeGenActionTest.cpp --- FrontendAction tests --===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Unit tests for CodeGenAction. +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Builders.h" +#include "flang/Frontend/CompilerInstance.h" +#include "flang/Frontend/FrontendActions.h" +#include "flang/Frontend/TextDiagnosticPrinter.h" + +#include "gtest/gtest.h" + +#include + +using namespace Fortran::frontend; + +namespace test { +class DummyDialect : public ::mlir::Dialect { + explicit DummyDialect(::mlir::MLIRContext *context) + : ::mlir::Dialect(getDialectNamespace(), context, + ::mlir::TypeID::get()) { + initialize(); + } + + void initialize(); + friend class ::mlir::MLIRContext; + +public: + ~DummyDialect() override = default; + static constexpr ::llvm::StringLiteral getDialectNamespace() { + return ::llvm::StringLiteral("dummy"); + } +}; + +namespace dummy { +class FakeOp : public ::mlir::Op { +public: + using Op::Op; + + static llvm::StringRef getOperationName() { return "dummy.fake"; } + + static ::llvm::ArrayRef<::llvm::StringRef> getAttributeNames() { return {}; } + + static void build( + ::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &odsState) {} +}; +} // namespace dummy +} // namespace test + +MLIR_DECLARE_EXPLICIT_TYPE_ID(::test::DummyDialect) +MLIR_DEFINE_EXPLICIT_TYPE_ID(::test::DummyDialect) + +namespace test { + +void DummyDialect::initialize() { addOperations<::test::dummy::FakeOp>(); } +} // namespace test + +// A test CodeGenAction to verify that we gracefully handle failure to convert +// from MLIR to LLVM IR. +class LLVMConversionFailureCodeGenAction : public CodeGenAction { +public: + LLVMConversionFailureCodeGenAction() + : CodeGenAction(BackendActionTy::Backend_EmitLL) { + mlirCtx = std::make_unique(); + mlirCtx->loadDialect(); + + mlir::Location loc(mlir::UnknownLoc::get(mlirCtx.get())); + mlirModule = + std::make_unique(mlir::ModuleOp::create(loc, "mod")); + + mlir::OpBuilder builder(mlirCtx.get()); + builder.setInsertionPointToStart(&mlirModule->getRegion().front()); + // Create a fake op to trip conversion to LLVM. 
+  builder.create<test::dummy::FakeOp>(loc);
+
+  llvmCtx = std::make_unique<llvm::LLVMContext>();
+  }
+};
+
+TEST(CodeGenAction, GracefullyHandleLLVMConversionFailure) {
+  std::string diagnosticOutput;
+  llvm::raw_string_ostream diagnosticsOS(diagnosticOutput);
+  auto diagPrinter = std::make_unique<TextDiagnosticPrinter>(
+      diagnosticsOS, new clang::DiagnosticOptions());
+
+  CompilerInstance ci;
+  ci.createDiagnostics(diagPrinter.get(), /*ShouldOwnClient=*/false);
+  ci.setInvocation(std::make_shared<CompilerInvocation>());
+  ci.setOutputStream(std::make_unique<llvm::raw_null_ostream>());
+  ci.getInvocation().getCodeGenOpts().OptimizationLevel = 0;
+
+  FrontendInputFile file("/dev/null", InputKind());
+
+  LLVMConversionFailureCodeGenAction action;
+  action.setInstance(&ci);
+  action.setCurrentInput(file);
+
+  consumeError(action.execute());
+  ASSERT_EQ(diagnosticsOS.str(),
+            "error: Lowering to LLVM IR failed\n"
+            "error: failed to create the LLVM module\n");
+}

From 4619147911c2a955bb605618bc518b45da994a81 Mon Sep 17 00:00:00 2001
From: Hans Wennborg
Date: Fri, 19 Jan 2024 10:59:05 +0100
Subject: [PATCH 098/843] Revert "[sanitizer] Skip /include/c++/ from summary
 (#78534)"

The test fails on Darwin; see the comment on the PR.

> std:: usually is not a cause of the bug.
>
> We now display
> ```
> SUMMARY: AddressSanitizer: allocation-size-too-big path/to/allocator_returns_null.cpp:92:7 in main
> ```
> instead of
>
> ```
> SUMMARY: AddressSanitizer: allocation-size-too-big /usr/lib/../include/c++/13/bits/new_allocator.h:147:27 in std::__new_allocator<char>::allocate(unsigned long, void const*)
> ```
>
> `/include/c++/` matches both libc++ and libstdc++ include paths.

This reverts commit ecd47811b755d13357085bcd7519a66d6c4d8e5c.
---
 .../sanitizer_symbolizer_report.cpp           |  3 +--
 .../TestCases/allocator_returns_null.cpp      | 19 ++-----------------
 2 files changed, 3 insertions(+), 19 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp
index 8438e019591b5..253dc10607a6e 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp
@@ -34,8 +34,7 @@ static bool FrameIsInternal(const SymbolizedStack *frame) {
     return true;
   const char *file = frame->info.file;
   const char *module = frame->info.module;
-  if (file && (internal_strstr(file, "/compiler-rt/lib/") ||
-               internal_strstr(file, "/include/c++/")))
+  if (file && (internal_strstr(file, "/compiler-rt/lib/")))
     return true;
   if (module && (internal_strstr(module, "libclang_rt.")))
     return true;
diff --git a/compiler-rt/test/sanitizer_common/TestCases/allocator_returns_null.cpp b/compiler-rt/test/sanitizer_common/TestCases/allocator_returns_null.cpp
index 1e4b0ed520b8a..ca6f637b9a3f5 100644
--- a/compiler-rt/test/sanitizer_common/TestCases/allocator_returns_null.cpp
+++ b/compiler-rt/test/sanitizer_common/TestCases/allocator_returns_null.cpp
@@ -34,20 +34,17 @@
 // RUN:   | FileCheck %s --check-prefix=CHECK-nnCRASH
 // RUN: %env_tool_opts=allocator_may_return_null=1 %run %t new-nothrow 2>&1 \
 // RUN:   | FileCheck %s --check-prefix=CHECK-NULL
-// RUN: %env_tool_opts=allocator_may_return_null=0 not %run %t vector 2>&1 \
-// RUN:   | FileCheck %s --check-prefix=CHECK-vCRASH
 
 // TODO(alekseyshl): win32 is disabled due to failing errno tests, fix it there.
 // UNSUPPORTED: ubsan, target={{.*windows-msvc.*}}
 
 #include <assert.h>
 #include <errno.h>
-#include <limits>
-#include <new>
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
-#include <vector>
+#include <limits>
+#include <new>
 
 int main(int argc, char **argv) {
   assert(argc == 2);
@@ -63,8 +60,6 @@ int main(int argc, char **argv) {
       (3UL << 30) + 1;
 #endif
 
-  std::vector<char> v;
-
   void *x = nullptr;
   if (!strcmp(action, "malloc")) {
     x = malloc(kMaxAllowedMallocSizePlusOne);
@@ -87,14 +82,6 @@ int main(int argc, char **argv) {
     x = operator new(kMaxAllowedMallocSizePlusOne);
   } else if (!strcmp(action, "new-nothrow")) {
    x = operator new(kMaxAllowedMallocSizePlusOne, std::nothrow);
-  } else if (!strcmp(action, "vector")) {
-#if __LP64__ || defined(_WIN64)
-    v.resize(kMaxAllowedMallocSizePlusOne);
-    x = v.data();
-#else
-    // Fake it: 32bit fails early in std.
-    x = malloc(kMaxAllowedMallocSizePlusOne);
-#endif
   } else {
     assert(0);
   }
@@ -130,8 +117,6 @@ int main(int argc, char **argv) {
 // CHECK-nnCRASH: new-nothrow:
 // CHECK-nnCRASH: #{{[0-9]+.*}}allocator_returns_null.cpp
 // CHECK-nnCRASH: {{SUMMARY: .*Sanitizer: allocation-size-too-big.*allocator_returns_null.cpp.*}} in main
-// CHECK-vCRASH: #{{[0-9]+.*}}allocator_returns_null.cpp
-// CHECK-vCRASH: {{SUMMARY: .*Sanitizer: allocation-size-too-big.*allocator_returns_null.cpp.*}} in main
 // CHECK-NULL: {{malloc|calloc|calloc-overflow|realloc|realloc-after-malloc|new-nothrow}}
 // CHECK-NULL: errno: 12, x: 0

From f670112a591c83498e348f3ff2f73f02baf8e303 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Fri, 19 Jan 2024 02:00:33 -0800
Subject: [PATCH 099/843] [llvm-exegesis] Add support for validation counters
 (#76653)

This patch adds support for validation counters. Validation counters
can be used to measure events that occur during snippet execution, such
as cache misses, to ensure that certain assumed invariants about the
benchmark actually hold. Validation counters are set up within a perf
event group, so they are turned on and off at exactly the same time as
the "group leader" counter that measures the desired value.
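For context, the sketch below illustrates the perf event group mechanism
this message relies on: a group leader plus one member counter that the
kernel resets, enables, and disables as a unit, so the member observes
exactly the same interval as the leader. This is a minimal standalone
example for a Linux host, not code from this patch; the event choices,
the `openCounter` helper, and the busy loop are assumptions made for
illustration (the patch's actual wiring lives in the `PerfHelper.cpp`
changes below).

```
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <linux/perf_event.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <unistd.h>

// Open one counter; GroupFD == -1 creates a group leader, any other value
// joins that leader's group so the kernel schedules both together.
static int openCounter(uint32_t Type, uint64_t Config, int GroupFD) {
  perf_event_attr Attr;
  std::memset(&Attr, 0, sizeof(Attr));
  Attr.size = sizeof(Attr);
  Attr.type = Type;
  Attr.config = Config;
  Attr.disabled = 1; // Start stopped; enabled explicitly below.
  return static_cast<int>(syscall(SYS_perf_event_open, &Attr, /*pid=*/0,
                                  /*cpu=*/-1, GroupFD, /*flags=*/0));
}

int main() {
  // The leader measures cycles; the "validation" member counts retired
  // instructions over exactly the same interval.
  int Leader = openCounter(PERF_TYPE_HARDWARE, PERF_COUNT_HW_CPU_CYCLES, -1);
  int Member =
      openCounter(PERF_TYPE_HARDWARE, PERF_COUNT_HW_INSTRUCTIONS, Leader);
  if (Leader < 0 || Member < 0) {
    std::perror("perf_event_open");
    return 1;
  }

  // PERF_IOC_FLAG_GROUP applies each ioctl to the whole group at once.
  ioctl(Leader, PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP);
  ioctl(Leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
  for (volatile unsigned I = 0; I < 1000000; ++I)
    ; // Stand-in for the benchmarked snippet.
  ioctl(Leader, PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP);

  uint64_t Cycles = 0, Instructions = 0;
  read(Leader, &Cycles, sizeof(Cycles));
  read(Member, &Instructions, sizeof(Instructions));
  std::printf("cycles: %llu\ninstructions-retired: %llu\n",
              (unsigned long long)Cycles, (unsigned long long)Instructions);
  return 0;
}
```

Because both file descriptors belong to one group, a validation read such
as instructions-retired is guaranteed to cover precisely the region the
leader measured, which is the invariant the patch depends on.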
--- .../llvm-exegesis/X86/validation-counters.asm | 12 +++ .../llvm-exegesis/lib/BenchmarkResult.cpp | 37 ++++++++ .../tools/llvm-exegesis/lib/BenchmarkResult.h | 10 ++- .../llvm-exegesis/lib/BenchmarkRunner.cpp | 86 +++++++++++++++---- .../tools/llvm-exegesis/lib/BenchmarkRunner.h | 16 +++- .../lib/LatencyBenchmarkRunner.cpp | 41 ++++++--- .../lib/LatencyBenchmarkRunner.h | 2 + llvm/tools/llvm-exegesis/lib/PerfHelper.cpp | 43 ++++++++-- llvm/tools/llvm-exegesis/lib/PerfHelper.h | 9 +- llvm/tools/llvm-exegesis/lib/Target.cpp | 35 +++++--- llvm/tools/llvm-exegesis/lib/Target.h | 10 +-- .../llvm-exegesis/lib/UopsBenchmarkRunner.cpp | 32 +++++-- .../llvm-exegesis/lib/UopsBenchmarkRunner.h | 6 +- llvm/tools/llvm-exegesis/lib/X86/Target.cpp | 22 ++++- .../llvm-exegesis/lib/X86/X86Counter.cpp | 2 +- llvm/tools/llvm-exegesis/llvm-exegesis.cpp | 12 ++- .../tools/llvm-exegesis/ClusteringTest.cpp | 36 ++++---- .../Mips/BenchmarkResultTest.cpp | 12 +-- .../llvm-exegesis/X86/BenchmarkResultTest.cpp | 12 +-- 19 files changed, 333 insertions(+), 102 deletions(-) create mode 100644 llvm/test/tools/llvm-exegesis/X86/validation-counters.asm diff --git a/llvm/test/tools/llvm-exegesis/X86/validation-counters.asm b/llvm/test/tools/llvm-exegesis/X86/validation-counters.asm new file mode 100644 index 0000000000000..7d0c940519e6a --- /dev/null +++ b/llvm/test/tools/llvm-exegesis/X86/validation-counters.asm @@ -0,0 +1,12 @@ +# REQUIRES: exegesis-can-measure-latency, exegesis-can-measure-uops, x86_64-linux + +# Check that when specifying validation counters, the validation counter is +# collected and the information is displayed in the output. Test across +# multiple configurations that need to be wired up separately for validation +# counter support. + +# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -opcode-name=ADD64rr --validation-counter=instructions-retired | FileCheck %s +# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=latency -opcode-name=ADD64rr --validation-counter=instructions-retired -execution-mode=subprocess | FileCheck %s +# RUN: llvm-exegesis -mtriple=x86_64-unknown-unknown -mode=uops -opcode-name=ADD64rr --validation-counter=instructions-retired -execution-mode=subprocess | FileCheck %s + +# CHECK: instructions-retired: {{[0-9]+}} diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp index 02c4da11e032d..60264ca87a988 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp @@ -14,6 +14,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/bit.h" #include "llvm/ObjectYAML/YAML.h" +#include "llvm/Support/Errc.h" #include "llvm/Support/FileOutputBuffer.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/Format.h" @@ -192,6 +193,41 @@ template <> struct SequenceElementTraits { static const bool flow = false; }; +const char *validationEventToString(exegesis::ValidationEvent VE) { + switch (VE) { + case exegesis::ValidationEvent::InstructionRetired: + return "instructions-retired"; + } +} + +Expected stringToValidationEvent(StringRef Input) { + if (Input == "instructions-retired") + return exegesis::ValidationEvent::InstructionRetired; + else + return make_error("Invalid validation event string", + errc::invalid_argument); +} + +template <> +struct CustomMappingTraits> { + static void inputOne(IO &Io, StringRef KeyStr, + std::map &VI) { + Expected Key = stringToValidationEvent(KeyStr); + if (!Key) { + Io.setError("Key is not a valid validation 
event"); + return; + } + Io.mapRequired(KeyStr.str().c_str(), VI[*Key]); + } + + static void output(IO &Io, std::map &VI) { + for (auto &IndividualVI : VI) { + Io.mapRequired(validationEventToString(IndividualVI.first), + IndividualVI.second); + } + } +}; + // exegesis::Measure is rendererd as a flow instead of a list. // e.g. { "key": "the key", "value": 0123 } template <> struct MappingTraits { @@ -203,6 +239,7 @@ template <> struct MappingTraits { } Io.mapRequired("value", Obj.PerInstructionValue); Io.mapOptional("per_snippet_value", Obj.PerSnippetValue); + Io.mapOptional("validation_counters", Obj.ValidationCounters); } static const bool flow = true; }; diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h index 0d08febae20cb..c983d6d6e00d9 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h @@ -32,6 +32,8 @@ class Error; namespace exegesis { +enum ValidationEvent { InstructionRetired }; + enum class BenchmarkPhaseSelectorE { PrepareSnippet, PrepareAndAssembleSnippet, @@ -77,8 +79,10 @@ struct BenchmarkKey { struct BenchmarkMeasure { // A helper to create an unscaled BenchmarkMeasure. - static BenchmarkMeasure Create(std::string Key, double Value) { - return {Key, Value, Value}; + static BenchmarkMeasure + Create(std::string Key, double Value, + std::map ValCounters) { + return {Key, Value, Value, ValCounters}; } std::string Key; // This is the per-instruction value, i.e. measured quantity scaled per @@ -87,6 +91,8 @@ struct BenchmarkMeasure { // This is the per-snippet value, i.e. measured quantity for one repetition of // the whole snippet. double PerSnippetValue; + // These are the validation counter values. + std::map ValidationCounters; }; // The result of an instruction benchmark. diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp index eb1393ad94ff9..2b512284f9409 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.cpp @@ -54,9 +54,11 @@ namespace exegesis { BenchmarkRunner::BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode, BenchmarkPhaseSelectorE BenchmarkPhaseSelector, - ExecutionModeE ExecutionMode) + ExecutionModeE ExecutionMode, + ArrayRef ValCounters) : State(State), Mode(Mode), BenchmarkPhaseSelector(BenchmarkPhaseSelector), - ExecutionMode(ExecutionMode), Scratch(std::make_unique()) {} + ExecutionMode(ExecutionMode), ValidationCounters(ValCounters), + Scratch(std::make_unique()) {} BenchmarkRunner::~BenchmarkRunner() = default; @@ -71,7 +73,9 @@ void BenchmarkRunner::FunctionExecutor::accumulateCounterValues( } Expected> -BenchmarkRunner::FunctionExecutor::runAndSample(const char *Counters) const { +BenchmarkRunner::FunctionExecutor::runAndSample( + const char *Counters, ArrayRef ValidationCounters, + SmallVectorImpl &ValidationCounterValues) const { // We sum counts when there are several counters for a single ProcRes // (e.g. P23 on SandyBridge). 
llvm::SmallVector CounterValues; @@ -79,8 +83,8 @@ BenchmarkRunner::FunctionExecutor::runAndSample(const char *Counters) const { StringRef(Counters).split(CounterNames, '+'); for (auto &CounterName : CounterNames) { CounterName = CounterName.trim(); - Expected> ValueOrError = - runWithCounter(CounterName); + Expected> ValueOrError = runWithCounter( + CounterName, ValidationCounters, ValidationCounterValues); if (!ValueOrError) return ValueOrError.takeError(); accumulateCounterValues(ValueOrError.get(), &CounterValues); @@ -120,11 +124,13 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { (*Result)[I] += NewValues[I]; } - Expected> - runWithCounter(StringRef CounterName) const override { + Expected> runWithCounter( + StringRef CounterName, ArrayRef ValidationCounters, + SmallVectorImpl &ValidationCounterValues) const override { const ExegesisTarget &ET = State.getExegesisTarget(); char *const ScratchPtr = Scratch->ptr(); - auto CounterOrError = ET.createCounter(CounterName, State); + auto CounterOrError = + ET.createCounter(CounterName, State, ValidationCounters); if (!CounterOrError) return CounterOrError.takeError(); @@ -156,6 +162,14 @@ class InProcessFunctionExecutorImpl : public BenchmarkRunner::FunctionExecutor { } } + auto ValidationValuesOrErr = Counter->readValidationCountersOrError(); + if (!ValidationValuesOrErr) + return ValidationValuesOrErr.takeError(); + + ArrayRef RealValidationValues = *ValidationValuesOrErr; + for (size_t I = 0; I < RealValidationValues.size(); ++I) + ValidationCounterValues[I] = RealValidationValues[I]; + return Counter->readOrError(Function.getFunctionBytes()); } @@ -266,7 +280,9 @@ class SubProcessFunctionExecutorImpl } Error createSubProcessAndRunBenchmark( - StringRef CounterName, SmallVectorImpl &CounterValues) const { + StringRef CounterName, SmallVectorImpl &CounterValues, + ArrayRef ValidationCounters, + SmallVectorImpl &ValidationCounterValues) const { int PipeFiles[2]; int PipeSuccessOrErr = socketpair(AF_UNIX, SOCK_DGRAM, 0, PipeFiles); if (PipeSuccessOrErr != 0) { @@ -306,8 +322,8 @@ class SubProcessFunctionExecutorImpl } const ExegesisTarget &ET = State.getExegesisTarget(); - auto CounterOrError = - ET.createCounter(CounterName, State, ParentOrChildPID); + auto CounterOrError = ET.createCounter( + CounterName, State, ValidationCounters, ParentOrChildPID); if (!CounterOrError) return CounterOrError.takeError(); @@ -362,6 +378,14 @@ class SubProcessFunctionExecutorImpl return CounterValueOrErr.takeError(); CounterValues = std::move(*CounterValueOrErr); + auto ValidationValuesOrErr = Counter->readValidationCountersOrError(); + if (!ValidationValuesOrErr) + return ValidationValuesOrErr.takeError(); + + ArrayRef RealValidationValues = *ValidationValuesOrErr; + for (size_t I = 0; I < RealValidationValues.size(); ++I) + ValidationCounterValues[I] = RealValidationValues[I]; + return Error::success(); } // The child exited, but not successfully @@ -460,15 +484,15 @@ class SubProcessFunctionExecutorImpl exit(0); } - Expected> - runWithCounter(StringRef CounterName) const override { + Expected> runWithCounter( + StringRef CounterName, ArrayRef ValidationCounters, + SmallVectorImpl &ValidationCounterValues) const override { SmallVector Value(1, 0); - Error PossibleBenchmarkError = - createSubProcessAndRunBenchmark(CounterName, Value); + Error PossibleBenchmarkError = createSubProcessAndRunBenchmark( + CounterName, Value, ValidationCounters, ValidationCounterValues); - if (PossibleBenchmarkError) { + if 
(PossibleBenchmarkError) return std::move(PossibleBenchmarkError); - } return Value; } @@ -642,6 +666,34 @@ BenchmarkRunner::writeObjectFile(StringRef Buffer, StringRef FileName) const { return std::string(ResultPath); } +static bool EventLessThan(const std::pair LHS, + const ValidationEvent RHS) { + return static_cast(LHS.first) < static_cast(RHS); +} + +Error BenchmarkRunner::getValidationCountersToRun( + SmallVector &ValCountersToRun) const { + const PfmCountersInfo &PCI = State.getPfmCounters(); + ValCountersToRun.reserve(ValidationCounters.size()); + + ValCountersToRun.reserve(ValidationCounters.size()); + ArrayRef TargetValidationEvents(PCI.ValidationEvents, + PCI.NumValidationEvents); + for (const ValidationEvent RequestedValEvent : ValidationCounters) { + auto ValCounterIt = + lower_bound(TargetValidationEvents, RequestedValEvent, EventLessThan); + if (ValCounterIt == TargetValidationEvents.end() || + ValCounterIt->first != RequestedValEvent) + return make_error("Cannot create validation counter"); + + assert(ValCounterIt->first == RequestedValEvent && + "The array of validation events from the target should be sorted"); + ValCountersToRun.push_back(ValCounterIt->second); + } + + return Error::success(); +} + BenchmarkRunner::FunctionExecutor::~FunctionExecutor() {} } // namespace exegesis diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h index d746a0f775646..aab4b54242e4b 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkRunner.h @@ -38,7 +38,8 @@ class BenchmarkRunner { explicit BenchmarkRunner(const LLVMState &State, Benchmark::ModeE Mode, BenchmarkPhaseSelectorE BenchmarkPhaseSelector, - ExecutionModeE ExecutionMode); + ExecutionModeE ExecutionMode, + ArrayRef ValCounters); virtual ~BenchmarkRunner(); @@ -93,14 +94,18 @@ class BenchmarkRunner { virtual ~FunctionExecutor(); Expected> - runAndSample(const char *Counters) const; + runAndSample(const char *Counters, + ArrayRef ValidationCounters, + SmallVectorImpl &ValidationCounterValues) const; protected: static void accumulateCounterValues(const llvm::SmallVectorImpl &NewValues, llvm::SmallVectorImpl *Result); virtual Expected> - runWithCounter(StringRef CounterName) const = 0; + runWithCounter(StringRef CounterName, + ArrayRef ValidationCounters, + SmallVectorImpl &ValidationCounterValues) const = 0; }; protected: @@ -109,6 +114,11 @@ class BenchmarkRunner { const BenchmarkPhaseSelectorE BenchmarkPhaseSelector; const ExecutionModeE ExecutionMode; + SmallVector ValidationCounters; + + Error + getValidationCountersToRun(SmallVector &ValCountersToRun) const; + private: virtual Expected> runMeasurements(const FunctionExecutor &Executor) const = 0; diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp index eda450579a583..eb7f70fb2b840 100644 --- a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.cpp @@ -22,8 +22,9 @@ LatencyBenchmarkRunner::LatencyBenchmarkRunner( const LLVMState &State, Benchmark::ModeE Mode, BenchmarkPhaseSelectorE BenchmarkPhaseSelector, Benchmark::ResultAggregationModeE ResultAgg, ExecutionModeE ExecutionMode, - unsigned BenchmarkRepeatCount) - : BenchmarkRunner(State, Mode, BenchmarkPhaseSelector, ExecutionMode) { + ArrayRef ValCounters, unsigned BenchmarkRepeatCount) + : BenchmarkRunner(State, Mode, BenchmarkPhaseSelector, ExecutionMode, + ValCounters) { 
assert((Mode == Benchmark::Latency || Mode == Benchmark::InverseThroughput) && "invalid mode"); ResultAggMode = ResultAgg; @@ -72,11 +73,21 @@ Expected> LatencyBenchmarkRunner::runMeasurements( // ResultAggMode. llvm::SmallVector AccumulatedValues; double MinVariance = std::numeric_limits::infinity(); - const char *CounterName = State.getPfmCounters().CycleCounter; + const PfmCountersInfo &PCI = State.getPfmCounters(); + const char *CounterName = PCI.CycleCounter; + + SmallVector ValCountersToRun; + Error ValCounterErr = getValidationCountersToRun(ValCountersToRun); + if (ValCounterErr) + return std::move(ValCounterErr); + + SmallVector ValCounterValues(ValCountersToRun.size(), 0); // Values count for each run. int ValuesCount = 0; for (size_t I = 0; I < NumMeasurements; ++I) { - auto ExpectedCounterValues = Executor.runAndSample(CounterName); + SmallVector IterationValCounterValues(ValCountersToRun.size(), -1); + auto ExpectedCounterValues = Executor.runAndSample( + CounterName, ValCountersToRun, IterationValCounterValues); if (!ExpectedCounterValues) return ExpectedCounterValues.takeError(); ValuesCount = ExpectedCounterValues.get().size(); @@ -90,8 +101,15 @@ Expected> LatencyBenchmarkRunner::runMeasurements( MinVariance = Variance; } } + + for (size_t I = 0; I < ValCounterValues.size(); ++I) + ValCounterValues[I] += IterationValCounterValues[I]; } + std::map ValidationInfo; + for (size_t I = 0; I < ValidationCounters.size(); ++I) + ValidationInfo[ValidationCounters[I]] = ValCounterValues[I]; + std::string ModeName; switch (Mode) { case Benchmark::Latency: @@ -112,25 +130,26 @@ Expected> LatencyBenchmarkRunner::runMeasurements( std::vector Result; Result.reserve(AccumulatedValues.size()); for (const int64_t Value : AccumulatedValues) - Result.push_back(BenchmarkMeasure::Create(ModeName, Value)); + Result.push_back( + BenchmarkMeasure::Create(ModeName, Value, ValidationInfo)); return std::move(Result); } case Benchmark::Min: { std::vector Result; - Result.push_back( - BenchmarkMeasure::Create(ModeName, findMin(AccumulatedValues))); + Result.push_back(BenchmarkMeasure::Create( + ModeName, findMin(AccumulatedValues), ValidationInfo)); return std::move(Result); } case Benchmark::Max: { std::vector Result; - Result.push_back( - BenchmarkMeasure::Create(ModeName, findMax(AccumulatedValues))); + Result.push_back(BenchmarkMeasure::Create( + ModeName, findMax(AccumulatedValues), ValidationInfo)); return std::move(Result); } case Benchmark::Mean: { std::vector Result; - Result.push_back( - BenchmarkMeasure::Create(ModeName, findMean(AccumulatedValues))); + Result.push_back(BenchmarkMeasure::Create( + ModeName, findMean(AccumulatedValues), ValidationInfo)); return std::move(Result); } } diff --git a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h index fc159d7d9b5e9..6e21197b698b9 100644 --- a/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h +++ b/llvm/tools/llvm-exegesis/lib/LatencyBenchmarkRunner.h @@ -15,6 +15,7 @@ #define LLVM_TOOLS_LLVM_EXEGESIS_LATENCY_H #include "BenchmarkRunner.h" +#include "Target.h" namespace llvm { namespace exegesis { @@ -25,6 +26,7 @@ class LatencyBenchmarkRunner : public BenchmarkRunner { BenchmarkPhaseSelectorE BenchmarkPhaseSelector, Benchmark::ResultAggregationModeE ResultAggMode, ExecutionModeE ExecutionMode, + ArrayRef ValCounters, unsigned BenchmarkRepeatCount); ~LatencyBenchmarkRunner() override; diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp 
b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp index 1dab8809d3028..6b8df01e9f544 100644 --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.cpp @@ -113,9 +113,8 @@ ConfiguredEvent::ConfiguredEvent(PerfEvent &&EventToConfigure) } #ifdef HAVE_LIBPFM -void ConfiguredEvent::initRealEvent(const pid_t ProcessID) { +void ConfiguredEvent::initRealEvent(const pid_t ProcessID, const int GroupFD) { const int CPU = -1; - const int GroupFD = -1; const uint32_t Flags = 0; perf_event_attr AttrCopy = *Event.attribute(); FileDescriptor = perf_event_open(&AttrCopy, ProcessID, CPU, GroupFD, Flags); @@ -147,7 +146,7 @@ ConfiguredEvent::readOrError(StringRef /*unused*/) const { ConfiguredEvent::~ConfiguredEvent() { close(FileDescriptor); } #else -void ConfiguredEvent::initRealEvent(pid_t ProcessID) {} +void ConfiguredEvent::initRealEvent(pid_t ProcessID, const int GroupFD) {} Expected> ConfiguredEvent::readOrError(StringRef /*unused*/) const { @@ -158,9 +157,14 @@ ConfiguredEvent::readOrError(StringRef /*unused*/) const { ConfiguredEvent::~ConfiguredEvent() = default; #endif // HAVE_LIBPFM -CounterGroup::CounterGroup(PerfEvent &&E, pid_t ProcessID) +CounterGroup::CounterGroup(PerfEvent &&E, std::vector &&ValEvents, + pid_t ProcessID) : EventCounter(std::move(E)) { IsDummyEvent = EventCounter.isDummyEvent(); + + for (auto &&ValEvent : ValEvents) + ValidationEventCounters.emplace_back(std::move(ValEvent)); + if (!IsDummyEvent) initRealEvent(ProcessID); } @@ -168,16 +172,19 @@ CounterGroup::CounterGroup(PerfEvent &&E, pid_t ProcessID) #ifdef HAVE_LIBPFM void CounterGroup::initRealEvent(pid_t ProcessID) { EventCounter.initRealEvent(ProcessID); + + for (auto &ValCounter : ValidationEventCounters) + ValCounter.initRealEvent(ProcessID, getFileDescriptor()); } void CounterGroup::start() { if (!IsDummyEvent) - ioctl(getFileDescriptor(), PERF_EVENT_IOC_RESET, 0); + ioctl(getFileDescriptor(), PERF_EVENT_IOC_RESET, PERF_IOC_FLAG_GROUP); } void CounterGroup::stop() { if (!IsDummyEvent) - ioctl(getFileDescriptor(), PERF_EVENT_IOC_DISABLE, 0); + ioctl(getFileDescriptor(), PERF_EVENT_IOC_DISABLE, PERF_IOC_FLAG_GROUP); } llvm::Expected> @@ -188,6 +195,25 @@ CounterGroup::readOrError(StringRef FunctionBytes) const { return SmallVector(1, 42); } +llvm::Expected> +CounterGroup::readValidationCountersOrError() const { + llvm::SmallVector Result; + for (const auto &ValCounter : ValidationEventCounters) { + Expected> ValueOrError = + ValCounter.readOrError(StringRef()); + + if (!ValueOrError) + return ValueOrError.takeError(); + + // Reading a validation counter will only return a single value, so it is + // safe to only append the first value here. Also assert that this is true. 
+ assert(ValueOrError->size() == 1 && + "Validation counters should only return a single value"); + Result.push_back((*ValueOrError)[0]); + } + return Result; +} + int CounterGroup::numValues() const { return 1; } #else @@ -208,6 +234,11 @@ CounterGroup::readOrError(StringRef /*unused*/) const { llvm::errc::io_error); } +llvm::Expected> +CounterGroup::readValidationCountersOrError() const { + return SmallVector(0); +} + int CounterGroup::numValues() const { return 1; } #endif diff --git a/llvm/tools/llvm-exegesis/lib/PerfHelper.h b/llvm/tools/llvm-exegesis/lib/PerfHelper.h index e9b73da383628..7d050b85c9a8c 100644 --- a/llvm/tools/llvm-exegesis/lib/PerfHelper.h +++ b/llvm/tools/llvm-exegesis/lib/PerfHelper.h @@ -83,7 +83,7 @@ class ConfiguredEvent { public: ConfiguredEvent(PerfEvent &&EventToConfigure); - void initRealEvent(const pid_t ProcessID); + void initRealEvent(const pid_t ProcessID, const int GroupFD = -1); Expected> readOrError(StringRef FunctionBytes) const; int getFileDescriptor() const { return FileDescriptor; } bool isDummyEvent() const { @@ -107,7 +107,8 @@ class ConfiguredEvent { class CounterGroup { public: // event: the PerfEvent to measure. - explicit CounterGroup(PerfEvent &&event, pid_t ProcessID = 0); + explicit CounterGroup(PerfEvent &&event, std::vector &&ValEvents, + pid_t ProcessID = 0); CounterGroup(const CounterGroup &) = delete; CounterGroup(CounterGroup &&other) = default; @@ -129,6 +130,9 @@ class CounterGroup { virtual llvm::Expected> readOrError(StringRef FunctionBytes = StringRef()) const; + virtual llvm::Expected> + readValidationCountersOrError() const; + virtual int numValues() const; int getFileDescriptor() const { return EventCounter.getFileDescriptor(); } @@ -136,6 +140,7 @@ class CounterGroup { protected: ConfiguredEvent EventCounter; bool IsDummyEvent; + std::vector ValidationEventCounters; private: void initRealEvent(pid_t ProcessID); diff --git a/llvm/tools/llvm-exegesis/lib/Target.cpp b/llvm/tools/llvm-exegesis/lib/Target.cpp index 8f1c5a157eea3..58cf1b96fea4e 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/Target.cpp @@ -37,6 +37,7 @@ const ExegesisTarget *ExegesisTarget::lookup(Triple TT) { Expected> ExegesisTarget::createCounter(StringRef CounterName, const LLVMState &, + ArrayRef ValidationCounters, const pid_t ProcessID) const { pfm::PerfEvent Event(CounterName); if (!Event.valid()) @@ -45,7 +46,18 @@ ExegesisTarget::createCounter(StringRef CounterName, const LLVMState &, .concat(CounterName) .concat("'")); - return std::make_unique(std::move(Event), ProcessID); + std::vector ValidationEvents; + for (const char *ValCounterName : ValidationCounters) { + ValidationEvents.emplace_back(ValCounterName); + if (!ValidationEvents.back().valid()) + return llvm::make_error( + llvm::Twine("Unable to create validation counter with name '") + .concat(ValCounterName) + .concat("'")); + } + + return std::make_unique( + std::move(Event), std::move(ValidationEvents), ProcessID); } void ExegesisTarget::registerTarget(ExegesisTarget *Target) { @@ -79,7 +91,7 @@ ExegesisTarget::createBenchmarkRunner( Benchmark::ModeE Mode, const LLVMState &State, BenchmarkPhaseSelectorE BenchmarkPhaseSelector, BenchmarkRunner::ExecutionModeE ExecutionMode, - unsigned BenchmarkRepeatCount, + unsigned BenchmarkRepeatCount, ArrayRef ValidationCounters, Benchmark::ResultAggregationModeE ResultAggMode) const { PfmCountersInfo PfmCounters = State.getPfmCounters(); switch (Mode) { @@ -101,9 +113,9 @@ ExegesisTarget::createBenchmarkRunner( 
"benchmarking or --use-dummy-perf-counters to not query " "the kernel for real event counts.")); } - return createLatencyBenchmarkRunner(State, Mode, BenchmarkPhaseSelector, - ResultAggMode, ExecutionMode, - BenchmarkRepeatCount); + return createLatencyBenchmarkRunner( + State, Mode, BenchmarkPhaseSelector, ResultAggMode, ExecutionMode, + ValidationCounters, BenchmarkRepeatCount); case Benchmark::Uops: if (BenchmarkPhaseSelector == BenchmarkPhaseSelectorE::Measure && !PfmCounters.UopsCounter && !PfmCounters.IssueCounters) @@ -113,7 +125,8 @@ ExegesisTarget::createBenchmarkRunner( "benchmarking or --use-dummy-perf-counters to not query the kernel " "for real event counts."); return createUopsBenchmarkRunner(State, BenchmarkPhaseSelector, - ResultAggMode, ExecutionMode); + ResultAggMode, ExecutionMode, + ValidationCounters); } return nullptr; } @@ -133,18 +146,20 @@ std::unique_ptr ExegesisTarget::createLatencyBenchmarkRunner( BenchmarkPhaseSelectorE BenchmarkPhaseSelector, Benchmark::ResultAggregationModeE ResultAggMode, BenchmarkRunner::ExecutionModeE ExecutionMode, + ArrayRef ValidationCounters, unsigned BenchmarkRepeatCount) const { return std::make_unique( State, Mode, BenchmarkPhaseSelector, ResultAggMode, ExecutionMode, - BenchmarkRepeatCount); + ValidationCounters, BenchmarkRepeatCount); } std::unique_ptr ExegesisTarget::createUopsBenchmarkRunner( const LLVMState &State, BenchmarkPhaseSelectorE BenchmarkPhaseSelector, Benchmark::ResultAggregationModeE /*unused*/, - BenchmarkRunner::ExecutionModeE ExecutionMode) const { - return std::make_unique(State, BenchmarkPhaseSelector, - ExecutionMode); + BenchmarkRunner::ExecutionModeE ExecutionMode, + ArrayRef ValidationCounters) const { + return std::make_unique( + State, BenchmarkPhaseSelector, ExecutionMode, ValidationCounters); } static_assert(std::is_trivial_v, diff --git a/llvm/tools/llvm-exegesis/lib/Target.h b/llvm/tools/llvm-exegesis/lib/Target.h index da6d44611eca7..3d6169c965021 100644 --- a/llvm/tools/llvm-exegesis/lib/Target.h +++ b/llvm/tools/llvm-exegesis/lib/Target.h @@ -39,10 +39,6 @@ extern cl::OptionCategory Options; extern cl::OptionCategory BenchmarkOptions; extern cl::OptionCategory AnalysisOptions; -enum ValidationEvent { - InstructionRetired -}; - struct PfmCountersInfo { // An optional name of a performance counter that can be used to measure // cycles. @@ -86,6 +82,7 @@ class ExegesisTarget { // Targets can use this to create target-specific perf counters. 
virtual Expected> createCounter(StringRef CounterName, const LLVMState &State, + ArrayRef ValidationCounters, const pid_t ProcessID = 0) const; // Targets can use this to add target-specific passes in assembleToStream(); @@ -270,6 +267,7 @@ class ExegesisTarget { BenchmarkPhaseSelectorE BenchmarkPhaseSelector, BenchmarkRunner::ExecutionModeE ExecutionMode, unsigned BenchmarkRepeatCount, + ArrayRef ValidationCounters, Benchmark::ResultAggregationModeE ResultAggMode = Benchmark::Min) const; // Returns the ExegesisTarget for the given triple or nullptr if the target @@ -314,11 +312,13 @@ class ExegesisTarget { BenchmarkPhaseSelectorE BenchmarkPhaseSelector, Benchmark::ResultAggregationModeE ResultAggMode, BenchmarkRunner::ExecutionModeE ExecutionMode, + ArrayRef ValidationCounters, unsigned BenchmarkRepeatCount) const; std::unique_ptr virtual createUopsBenchmarkRunner( const LLVMState &State, BenchmarkPhaseSelectorE BenchmarkPhaseSelector, Benchmark::ResultAggregationModeE ResultAggMode, - BenchmarkRunner::ExecutionModeE ExecutionMode) const; + BenchmarkRunner::ExecutionModeE ExecutionMode, + ArrayRef ValidationCounters) const; const ExegesisTarget *Next = nullptr; const ArrayRef CpuPfmCounters; diff --git a/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.cpp b/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.cpp index 6351fdd3345a8..d6fb59970b94a 100644 --- a/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.cpp +++ b/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.cpp @@ -19,25 +19,45 @@ Expected> UopsBenchmarkRunner::runMeasurements(const FunctionExecutor &Executor) const { std::vector Result; const PfmCountersInfo &PCI = State.getPfmCounters(); + + SmallVector ValCountersToRun; + Error ValCounterErr = getValidationCountersToRun(ValCountersToRun); + if (ValCounterErr) + return std::move(ValCounterErr); + // Uops per port. for (const auto *IssueCounter = PCI.IssueCounters, *IssueCounterEnd = PCI.IssueCounters + PCI.NumIssueCounters; IssueCounter != IssueCounterEnd; ++IssueCounter) { + SmallVector ValCounterPortValues(ValCountersToRun.size(), -1); if (!IssueCounter->Counter) continue; - auto ExpectedCounterValue = Executor.runAndSample(IssueCounter->Counter); + auto ExpectedCounterValue = Executor.runAndSample( + IssueCounter->Counter, ValCountersToRun, ValCounterPortValues); if (!ExpectedCounterValue) return ExpectedCounterValue.takeError(); - Result.push_back(BenchmarkMeasure::Create(IssueCounter->ProcResName, - (*ExpectedCounterValue)[0])); + + std::map ValidationInfo; + for (size_t I = 0; I < ValidationCounters.size(); ++I) + ValidationInfo[ValidationCounters[I]] = ValCounterPortValues[I]; + + Result.push_back(BenchmarkMeasure::Create( + IssueCounter->ProcResName, (*ExpectedCounterValue)[0], ValidationInfo)); } // NumMicroOps. 
if (const char *const UopsCounter = PCI.UopsCounter) { - auto ExpectedCounterValue = Executor.runAndSample(UopsCounter); + SmallVector ValCounterUopsValues(ValCountersToRun.size(), -1); + auto ExpectedCounterValue = Executor.runAndSample( + UopsCounter, ValCountersToRun, ValCounterUopsValues); if (!ExpectedCounterValue) return ExpectedCounterValue.takeError(); - Result.push_back( - BenchmarkMeasure::Create("NumMicroOps", (*ExpectedCounterValue)[0])); + + std::map ValidationInfo; + for (size_t I = 0; I < ValidationCounters.size(); ++I) + ValidationInfo[ValidationCounters[I]] = ValCounterUopsValues[I]; + + Result.push_back(BenchmarkMeasure::Create( + "NumMicroOps", (*ExpectedCounterValue)[0], ValidationInfo)); } return std::move(Result); } diff --git a/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.h b/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.h index 337f093670122..ef47b7fe8a655 100644 --- a/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.h +++ b/llvm/tools/llvm-exegesis/lib/UopsBenchmarkRunner.h @@ -15,6 +15,7 @@ #define LLVM_TOOLS_LLVM_EXEGESIS_UOPSBENCHMARKRUNNER_H #include "BenchmarkRunner.h" +#include "Target.h" namespace llvm { namespace exegesis { @@ -23,9 +24,10 @@ class UopsBenchmarkRunner : public BenchmarkRunner { public: UopsBenchmarkRunner(const LLVMState &State, BenchmarkPhaseSelectorE BenchmarkPhaseSelector, - ExecutionModeE ExecutionMode) + ExecutionModeE ExecutionMode, + ArrayRef ValCounters) : BenchmarkRunner(State, Benchmark::Uops, BenchmarkPhaseSelector, - ExecutionMode) {} + ExecutionMode, ValCounters) {} ~UopsBenchmarkRunner() override; static constexpr const size_t kMinNumDifferentAddresses = 6; diff --git a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp index 25df89fb0cf31..adb345f023a8e 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/Target.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/Target.cpp @@ -45,6 +45,9 @@ #include #include #include +#ifdef HAVE_LIBPFM +#include +#endif // HAVE_LIBPFM #endif #define GET_AVAILABLE_OPCODE_CHECKER @@ -681,6 +684,7 @@ class ExegesisX86Target : public ExegesisTarget { Expected> createCounter(StringRef CounterName, const LLVMState &State, + ArrayRef ValidationCounters, const pid_t ProcessID) const override { // If LbrSamplingPeriod was provided, then ignore the // CounterName because we only have one for LBR. @@ -689,6 +693,13 @@ class ExegesisX86Target : public ExegesisTarget { // __linux__ (for now) #if defined(HAVE_LIBPFM) && defined(LIBPFM_HAS_FIELD_CYCLES) && \ defined(__linux__) + // TODO(boomanaiden154): Add in support for using validation counters when + // using LBR counters. 
+ if (ValidationCounters.size() > 0) + return llvm::make_error( + "Using LBR is not currently supported with validation counters", + llvm::errc::invalid_argument); + return std::make_unique( X86LbrPerfEvent(LbrSamplingPeriod)); #else @@ -698,7 +709,8 @@ class ExegesisX86Target : public ExegesisTarget { llvm::errc::invalid_argument); #endif } - return ExegesisTarget::createCounter(CounterName, State, ProcessID); + return ExegesisTarget::createCounter(CounterName, State, ValidationCounters, + ProcessID); } enum ArgumentRegisters { CodeSize = X86::R12, AuxiliaryMemoryFD = X86::R13 }; @@ -1243,7 +1255,7 @@ std::vector ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const { std::vector ConfigurePerfCounterCode; if (SaveRegisters) - saveSyscallRegisters(ConfigurePerfCounterCode, 2); + saveSyscallRegisters(ConfigurePerfCounterCode, 3); ConfigurePerfCounterCode.push_back( loadImmediate(X86::RDI, 64, APInt(64, getAuxiliaryMemoryStartAddress()))); ConfigurePerfCounterCode.push_back(MCInstBuilder(X86::MOV32rm) @@ -1255,9 +1267,13 @@ ExegesisX86Target::configurePerfCounter(long Request, bool SaveRegisters) const .addReg(0)); ConfigurePerfCounterCode.push_back( loadImmediate(X86::RSI, 64, APInt(64, Request))); +#ifdef HAVE_LIBPFM + ConfigurePerfCounterCode.push_back( + loadImmediate(X86::RDX, 64, APInt(64, PERF_IOC_FLAG_GROUP))); +#endif // HAVE_LIBPFM generateSyscall(SYS_ioctl, ConfigurePerfCounterCode); if (SaveRegisters) - restoreSyscallRegisters(ConfigurePerfCounterCode, 2); + restoreSyscallRegisters(ConfigurePerfCounterCode, 3); return ConfigurePerfCounterCode; } diff --git a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp index 96fb0f085a1ee..26be9f846a210 100644 --- a/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp +++ b/llvm/tools/llvm-exegesis/lib/X86/X86Counter.cpp @@ -141,7 +141,7 @@ X86LbrPerfEvent::X86LbrPerfEvent(unsigned SamplingPeriod) { } X86LbrCounter::X86LbrCounter(pfm::PerfEvent &&NewEvent) - : CounterGroup(std::move(NewEvent)) { + : CounterGroup(std::move(NewEvent), {}) { MMappedBuffer = mmap(nullptr, kMappedBufferSize, PROT_READ | PROT_WRITE, MAP_SHARED, getFileDescriptor(), 0); if (MMappedBuffer == MAP_FAILED) diff --git a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp index ffbf94ce0fcb2..2a121bec98d95 100644 --- a/llvm/tools/llvm-exegesis/llvm-exegesis.cpp +++ b/llvm/tools/llvm-exegesis/llvm-exegesis.cpp @@ -268,6 +268,16 @@ static cl::opt BenchmarkRepeatCount( "before aggregating the results"), cl::cat(BenchmarkOptions), cl::init(30)); +static cl::list ValidationCounters( + "validation-counter", + cl::desc( + "The name of a validation counter to run concurrently with the main " + "counter to validate benchmarking assumptions"), + cl::CommaSeparated, cl::cat(BenchmarkOptions), + cl::values(clEnumValN(ValidationEvent::InstructionRetired, + "instructions-retired", + "Count retired instructions"))); + static ExitOnError ExitOnErr("llvm-exegesis error: "); // Helper function that logs the error(s) and exits. 
@@ -501,7 +511,7 @@ void benchmarkMain() { const std::unique_ptr Runner = ExitOnErr(State.getExegesisTarget().createBenchmarkRunner( BenchmarkMode, State, BenchmarkPhaseSelector, ExecutionMode, - BenchmarkRepeatCount, ResultAggMode)); + BenchmarkRepeatCount, ValidationCounters, ResultAggMode)); if (!Runner) { ExitWithError("cannot create benchmark runner"); } diff --git a/llvm/unittests/tools/llvm-exegesis/ClusteringTest.cpp b/llvm/unittests/tools/llvm-exegesis/ClusteringTest.cpp index 25fe813502e54..26bb6c5d2e4c2 100644 --- a/llvm/unittests/tools/llvm-exegesis/ClusteringTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/ClusteringTest.cpp @@ -32,17 +32,17 @@ TEST(ClusteringTest, Clusters3D) { // Cluster around (x=0, y=1, z=2): points {0, 3}. Points[0].Measurements = { - {"x", 0.01, 0.0}, {"y", 1.02, 0.0}, {"z", 1.98, 0.0}}; + {"x", 0.01, 0.0, {}}, {"y", 1.02, 0.0, {}}, {"z", 1.98, 0.0, {}}}; Points[3].Measurements = { - {"x", -0.01, 0.0}, {"y", 1.02, 0.0}, {"z", 1.98, 0.0}}; + {"x", -0.01, 0.0, {}}, {"y", 1.02, 0.0, {}}, {"z", 1.98, 0.0, {}}}; // Cluster around (x=1, y=1, z=2): points {1, 4}. Points[1].Measurements = { - {"x", 1.01, 0.0}, {"y", 1.02, 0.0}, {"z", 1.98, 0.0}}; + {"x", 1.01, 0.0, {}}, {"y", 1.02, 0.0, {}}, {"z", 1.98, 0.0, {}}}; Points[4].Measurements = { - {"x", 0.99, 0.0}, {"y", 1.02, 0.0}, {"z", 1.98, 0.0}}; + {"x", 0.99, 0.0, {}}, {"y", 1.02, 0.0, {}}, {"z", 1.98, 0.0, {}}}; // Cluster around (x=0, y=0, z=0): points {5}, marked as noise. Points[5].Measurements = { - {"x", 0.0, 0.0}, {"y", 0.01, 0.0}, {"z", -0.02, 0.0}}; + {"x", 0.0, 0.0, {}}, {"y", 0.01, 0.0, {}}, {"z", -0.02, 0.0, {}}}; // Error cluster: points {2} Points[2].Error = "oops"; @@ -71,8 +71,8 @@ TEST(ClusteringTest, Clusters3D) { TEST(ClusteringTest, Clusters3D_InvalidSize) { std::vector Points(6); Points[0].Measurements = { - {"x", 0.01, 0.0}, {"y", 1.02, 0.0}, {"z", 1.98, 0.0}}; - Points[1].Measurements = {{"y", 1.02, 0.0}, {"z", 1.98, 0.0}}; + {"x", 0.01, 0.0, {}}, {"y", 1.02, 0.0, {}}, {"z", 1.98, 0.0, {}}}; + Points[1].Measurements = {{"y", 1.02, 0.0, {}}, {"z", 1.98, 0.0, {}}}; auto Error = BenchmarkClustering::create( Points, BenchmarkClustering::ModeE::Dbscan, 2, 0.25) @@ -83,8 +83,8 @@ TEST(ClusteringTest, Clusters3D_InvalidSize) { TEST(ClusteringTest, Clusters3D_InvalidOrder) { std::vector Points(6); - Points[0].Measurements = {{"x", 0.01, 0.0}, {"y", 1.02, 0.0}}; - Points[1].Measurements = {{"y", 1.02, 0.0}, {"x", 1.98, 0.0}}; + Points[0].Measurements = {{"x", 0.01, 0.0, {}}, {"y", 1.02, 0.0, {}}}; + Points[1].Measurements = {{"y", 1.02, 0.0, {}}, {"x", 1.98, 0.0, {}}}; auto Error = BenchmarkClustering::create( Points, BenchmarkClustering::ModeE::Dbscan, 2, 0.25) @@ -110,12 +110,9 @@ TEST(ClusteringTest, Ordering) { TEST(ClusteringTest, Ordering1) { std::vector Points(3); - Points[0].Measurements = { - {"x", 0.0, 0.0}}; - Points[1].Measurements = { - {"x", 1.0, 0.0}}; - Points[2].Measurements = { - {"x", 2.0, 0.0}}; + Points[0].Measurements = {{"x", 0.0, 0.0, {}}}; + Points[1].Measurements = {{"x", 1.0, 0.0, {}}}; + Points[2].Measurements = {{"x", 2.0, 0.0, {}}}; auto Clustering = BenchmarkClustering::create( Points, BenchmarkClustering::ModeE::Dbscan, 2, 1.1); @@ -127,12 +124,9 @@ TEST(ClusteringTest, Ordering1) { TEST(ClusteringTest, Ordering2) { std::vector Points(3); - Points[0].Measurements = { - {"x", 0.0, 0.0}}; - Points[1].Measurements = { - {"x", 2.0, 0.0}}; - Points[2].Measurements = { - {"x", 1.0, 0.0}}; + Points[0].Measurements = {{"x", 0.0, 0.0, {}}}; + Points[1].Measurements = {{"x", 
2.0, 0.0, {}}};
+  Points[2].Measurements = {{"x", 1.0, 0.0, {}}};
 
   auto Clustering = BenchmarkClustering::create(
       Points, BenchmarkClustering::ModeE::Dbscan, 2, 1.1);
diff --git a/llvm/unittests/tools/llvm-exegesis/Mips/BenchmarkResultTest.cpp b/llvm/unittests/tools/llvm-exegesis/Mips/BenchmarkResultTest.cpp
index 201e0a8e7acce..3d02b8f648411 100644
--- a/llvm/unittests/tools/llvm-exegesis/Mips/BenchmarkResultTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/Mips/BenchmarkResultTest.cpp
@@ -65,8 +65,8 @@ TEST_F(MipsBenchmarkResultTest, WriteToAndReadFromDisk) {
   ToDisk.CpuName = "cpu_name";
   ToDisk.LLVMTriple = "llvm_triple";
   ToDisk.NumRepetitions = 1;
-  ToDisk.Measurements.push_back(BenchmarkMeasure{"a", 1, 1});
-  ToDisk.Measurements.push_back(BenchmarkMeasure{"b", 2, 2});
+  ToDisk.Measurements.push_back(BenchmarkMeasure{"a", 1, 1, {}});
+  ToDisk.Measurements.push_back(BenchmarkMeasure{"b", 2, 2, {}});
   ToDisk.Error = "error";
   ToDisk.Info = "info";
 
@@ -124,10 +124,10 @@ TEST_F(MipsBenchmarkResultTest, WriteToAndReadFromDisk) {
 
 TEST_F(MipsBenchmarkResultTest, PerInstructionStats) {
   PerInstructionStats Stats;
-  Stats.push(BenchmarkMeasure{"a", 0.5, 0.0});
-  Stats.push(BenchmarkMeasure{"a", 1.5, 0.0});
-  Stats.push(BenchmarkMeasure{"a", -1.0, 0.0});
-  Stats.push(BenchmarkMeasure{"a", 0.0, 0.0});
+  Stats.push(BenchmarkMeasure{"a", 0.5, 0.0, {}});
+  Stats.push(BenchmarkMeasure{"a", 1.5, 0.0, {}});
+  Stats.push(BenchmarkMeasure{"a", -1.0, 0.0, {}});
+  Stats.push(BenchmarkMeasure{"a", 0.0, 0.0, {}});
   EXPECT_EQ(Stats.min(), -1.0);
   EXPECT_EQ(Stats.max(), 1.5);
   EXPECT_EQ(Stats.avg(), 0.25); // (0.5+1.5-1.0+0.0) / 4
diff --git a/llvm/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp
index 616f7bac54bc4..fafd08131dffc 100644
--- a/llvm/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp
+++ b/llvm/unittests/tools/llvm-exegesis/X86/BenchmarkResultTest.cpp
@@ -84,8 +84,8 @@ TEST(BenchmarkResultTest, WriteToAndReadFromDisk) {
   ToDisk.CpuName = "cpu_name";
   ToDisk.LLVMTriple = "llvm_triple";
   ToDisk.NumRepetitions = 1;
-  ToDisk.Measurements.push_back(BenchmarkMeasure{"a", 1, 1});
-  ToDisk.Measurements.push_back(BenchmarkMeasure{"b", 2, 2});
+  ToDisk.Measurements.push_back(BenchmarkMeasure{"a", 1, 1, {}});
+  ToDisk.Measurements.push_back(BenchmarkMeasure{"b", 2, 2, {}});
   ToDisk.Error = "error";
   ToDisk.Info = "info";
 
@@ -162,10 +162,10 @@ TEST(BenchmarkResultTest, WriteToAndReadFromDisk) {
 
 TEST(BenchmarkResultTest, PerInstructionStats) {
   PerInstructionStats Stats;
-  Stats.push(BenchmarkMeasure{"a", 0.5, 0.0});
-  Stats.push(BenchmarkMeasure{"a", 1.5, 0.0});
-  Stats.push(BenchmarkMeasure{"a", -1.0, 0.0});
-  Stats.push(BenchmarkMeasure{"a", 0.0, 0.0});
+  Stats.push(BenchmarkMeasure{"a", 0.5, 0.0, {}});
+  Stats.push(BenchmarkMeasure{"a", 1.5, 0.0, {}});
+  Stats.push(BenchmarkMeasure{"a", -1.0, 0.0, {}});
+  Stats.push(BenchmarkMeasure{"a", 0.0, 0.0, {}});
   EXPECT_EQ(Stats.min(), -1.0);
   EXPECT_EQ(Stats.max(), 1.5);
   EXPECT_EQ(Stats.avg(), 0.25); // (0.5+1.5-1.0+0.0) / 4

From 7ded34576f348d8f5b915deb53e8bfa45f72c79c Mon Sep 17 00:00:00 2001
From: Twice
Date: Fri, 19 Jan 2024 19:06:11 +0900
Subject: [PATCH 100/843] [libc++] Implement LWG3940: std::expected<void, E>::value() also needs E to be copy constructible (#71819)

This patch includes the fix for [LWG3940](https://cplusplus.github.io/LWG/issue3940) (`std::expected<void, E>::value()` also needs `E` to be copy constructible).

---
 libcxx/docs/Status/Cxx2cIssues.csv            |  2 +-
 libcxx/include/__expected/expected.h          |  2 +
 .../expected.void/value.lwg3940.verify.cpp    | 50 +++++++++++++++++++
 .../expected.void/observers/value.pass.cpp    | 11 ----
 4 files changed, 53 insertions(+), 12 deletions(-)
 create mode 100644 libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp

diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv
index fe0f13f6e8cb2..e03b7c783bc38 100644
--- a/libcxx/docs/Status/Cxx2cIssues.csv
+++ b/libcxx/docs/Status/Cxx2cIssues.csv
@@ -15,7 +15,7 @@
 "`3927 <https://wg21.link/LWG3927>`__","Unclear preconditions for ``operator[]`` for sequence containers","Varna June 2023","|Nothing To Do|","",""
 "`3935 <https://wg21.link/LWG3935>`__","``template<class T> constexpr complex& operator=(const complex<T>&)`` has no specification","Varna June 2023","|Complete|","3.4",""
 "`3938 <https://wg21.link/LWG3938>`__","Cannot use ``std::expected`` monadic ops with move-only ``error_type``","Varna June 2023","|Complete|","18.0",""
-"`3940 <https://wg21.link/LWG3940>`__","``std::expected<void, E>::value()`` also needs ``E`` to be copy constructible","Varna June 2023","","",""
+"`3940 <https://wg21.link/LWG3940>`__","``std::expected<void, E>::value()`` also needs ``E`` to be copy constructible","Varna June 2023","|Complete|","18.0",""
 "","","","","",""
 "`2392 <https://wg21.link/LWG2392>`__","""character type"" is used but not defined","Kona November 2023","","",""
 "`3203 <https://wg21.link/LWG3203>`__","``span`` element access invalidation","Kona November 2023","","",""
diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h
index 97f92bdfe44c4..9f36c41b78edf 100644
--- a/libcxx/include/__expected/expected.h
+++ b/libcxx/include/__expected/expected.h
@@ -1150,12 +1150,14 @@ class expected<_Tp, _Err> {
   }
 
   _LIBCPP_HIDE_FROM_ABI constexpr void value() const& {
+    static_assert(is_copy_constructible_v<_Err>);
     if (!__has_val_) {
       std::__throw_bad_expected_access<_Err>(__union_.__unex_);
     }
   }
 
   _LIBCPP_HIDE_FROM_ABI constexpr void value() && {
+    static_assert(is_copy_constructible_v<_Err> && is_move_constructible_v<_Err>);
     if (!__has_val_) {
       std::__throw_bad_expected_access<_Err>(std::move(__union_.__unex_));
     }
diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp
new file mode 100644
index 0000000000000..9c1c1d2e648d6
--- /dev/null
+++ b/libcxx/test/libcxx/utilities/expected/expected.void/value.lwg3940.verify.cpp
@@ -0,0 +1,50 @@
+//===----------------------------------------------------------------------===//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
+// UNSUPPORTED: no-rtti
+// UNSUPPORTED: no-exceptions
+
+// constexpr void value() const &;
+// Mandates: is_copy_constructible_v<E> is true.
+
+// constexpr void value() &&;
+// Mandates: is_copy_constructible_v<E> is true and is_move_constructible_v<E> is true.
+
+#include <expected>
+
+#include "MoveOnly.h"
+
+struct CopyOnly {
+  CopyOnly()                = default;
+  CopyOnly(const CopyOnly&) = default;
+  CopyOnly(CopyOnly&&)      = delete;
+};
+
+void test() {
+  // MoveOnly type as error_type
+  std::expected<void, MoveOnly> e(std::unexpect, 5);
+
+  e.value(); // expected-note {{in instantiation of member function 'std::expected<void, MoveOnly>::value' requested here}}
+  // expected-error@*:* {{static assertion failed due to requirement 'is_copy_constructible_v<MoveOnly>'}}
+  // expected-error@*:* {{call to deleted constructor of 'MoveOnly'}}
+
+  std::move(e)
+      .value(); // expected-note {{in instantiation of member function 'std::expected<void, MoveOnly>::value' requested here}}
+  // expected-error@*:* {{static assertion failed due to requirement 'is_copy_constructible_v<MoveOnly>'}}
+
+  // CopyOnly type as error_type
+  std::expected<void, CopyOnly> e2(std::unexpect);
+  // expected-error@*:* {{call to deleted constructor of 'CopyOnly'}}
+
+  e2.value();
+
+  std::move(e2)
+      .value(); // expected-note {{in instantiation of member function 'std::expected<void, CopyOnly>::value' requested here}}
+  // expected-error@*:* {{static assertion failed due to requirement 'is_move_constructible_v<CopyOnly>'}}
+  // expected-error@*:* {{call to deleted constructor of 'CopyOnly'}}
+}
diff --git a/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp
index 0f02db19c7b75..a24d67c57e19b 100644
--- a/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.void/observers/value.pass.cpp
@@ -65,17 +65,6 @@ void testException() {
     }
   }
 
-  // MoveOnly
-  {
-    std::expected<void, MoveOnly> e(std::unexpect, 5);
-    try {
-      std::move(e).value();
-      assert(false);
-    } catch (const std::bad_expected_access<MoveOnly>& ex) {
-      assert(ex.error() == 5);
-    }
-  }
-
 #endif // TEST_HAS_NO_EXCEPTIONS
 }

From 3d90e1fa94d17c3b27c89731760f28791bb30943 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng
Date: Fri, 19 Jan 2024 18:08:09 +0800
Subject: [PATCH 101/843] [TableGen] Integrate TableGen-based macro fusion (#73115)

`Fusion` now inherits from `SubtargetFeature`, so each `Fusion` definition also defines a corresponding `SubtargetFeature`. A `getMacroFusions` method is added to `TargetSubtargetInfo`; it returns the list of `MacroFusionPredTy` predicates to be evaluated by `MacroFusionMutation`. `getMacroFusions` is auto-generated whenever the target has `Fusion` definitions.

---
 .../llvm/CodeGen/TargetSubtargetInfo.h        |  4 +
 llvm/include/llvm/Target/Target.td            | 92 +++++++++----------
 llvm/include/llvm/Target/TargetSchedule.td    | 37 ++++----
 llvm/test/TableGen/MacroFusion.td             | 16 +++-
 llvm/utils/TableGen/CodeGenTarget.cpp         |  1 +
 llvm/utils/TableGen/CodeGenTarget.h           |  6 ++
 llvm/utils/TableGen/SubtargetEmitter.cpp      | 23 +++++
 7 files changed, 115 insertions(+), 64 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
index 55ef95c285431..a064dec7d8ab3 100644
--- a/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
+++ b/llvm/include/llvm/CodeGen/TargetSubtargetInfo.h
@@ -16,6 +16,7 @@
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MacroFusion.h"
 #include "llvm/CodeGen/PBQPRAConstraint.h"
 #include "llvm/CodeGen/SchedulerRegistry.h"
 #include "llvm/IR/GlobalValue.h"
@@ -323,6 +324,9 @@ class TargetSubtargetInfo : public MCSubtargetInfo {
   /// helps removing redundant copies generated by register allocator when
   /// handling complex eviction chains.
virtual bool enableSpillageCopyElimination() const { return false; } + + /// Get the list of MacroFusion predicates. + virtual std::vector getMacroFusions() const { return {}; }; }; } // end namespace llvm diff --git a/llvm/include/llvm/Target/Target.td b/llvm/include/llvm/Target/Target.td index 3b1d2f45267e9..0d97a47190b19 100644 --- a/llvm/include/llvm/Target/Target.td +++ b/llvm/include/llvm/Target/Target.td @@ -459,6 +459,52 @@ class DwarfRegAlias { Register DwarfAlias = reg; } +//===----------------------------------------------------------------------===// +// SubtargetFeature - A characteristic of the chip set. +// +class SubtargetFeature i = []> { + // Name - Feature name. Used by command line (-mattr=) to determine the + // appropriate target chip. + // + string Name = n; + + // FieldName - Field in XXXSubtarget to be set by feature. + // + string FieldName = f; + + // Value - Value the XXXSubtarget field to be set to by feature. + // + // A value of "true" or "false" implies the field is a bool. Otherwise, + // it is assumed to be an integer. the integer value may be the name of an + // enum constant. If multiple features use the same integer field, the + // field will be set to the maximum value of all enabled features that + // share the field. + // + string Value = v; + + // Desc - Feature description. Used by command line (-mattr=) to display help + // information. + // + string Desc = d; + + // Implies - Features that this feature implies are present. If one of those + // features isn't set, then this one shouldn't be set either. + // + list Implies = i; +} + +/// Specifies a Subtarget feature that this instruction is deprecated on. +class Deprecated { + SubtargetFeature DeprecatedFeatureMask = dep; +} + +/// A custom predicate used to determine if an instruction is +/// deprecated or not. +class ComplexDeprecationPredicate { + string ComplexDeprecationPredicate = dep; +} + //===----------------------------------------------------------------------===// // Pull in the common support for MCPredicate (portable scheduling predicates). // @@ -1680,52 +1726,6 @@ class Target { int AllowRegisterRenaming = 0; } -//===----------------------------------------------------------------------===// -// SubtargetFeature - A characteristic of the chip set. -// -class SubtargetFeature i = []> { - // Name - Feature name. Used by command line (-mattr=) to determine the - // appropriate target chip. - // - string Name = n; - - // FieldName - Field in XXXSubtarget to be set by feature. - // - string FieldName = f; - - // Value - Value the XXXSubtarget field to be set to by feature. - // - // A value of "true" or "false" implies the field is a bool. Otherwise, - // it is assumed to be an integer. the integer value may be the name of an - // enum constant. If multiple features use the same integer field, the - // field will be set to the maximum value of all enabled features that - // share the field. - // - string Value = v; - - // Desc - Feature description. Used by command line (-mattr=) to display help - // information. - // - string Desc = d; - - // Implies - Features that this feature implies are present. If one of those - // features isn't set, then this one shouldn't be set either. - // - list Implies = i; -} - -/// Specifies a Subtarget feature that this instruction is deprecated on. -class Deprecated { - SubtargetFeature DeprecatedFeatureMask = dep; -} - -/// A custom predicate used to determine if an instruction is -/// deprecated or not. 
-class ComplexDeprecationPredicate { - string ComplexDeprecationPredicate = dep; -} - //===----------------------------------------------------------------------===// // Processor chip sets - These values represent each of the chip sets supported // by the scheduler. Each Processor definition requires corresponding diff --git a/llvm/include/llvm/Target/TargetSchedule.td b/llvm/include/llvm/Target/TargetSchedule.td index 2016d452afb6f..032de72851782 100644 --- a/llvm/include/llvm/Target/TargetSchedule.td +++ b/llvm/include/llvm/Target/TargetSchedule.td @@ -658,7 +658,8 @@ def OneUse : OneUsePred; // return true; // } // ``` -class Fusion predicates> { +class Fusion predicates> + : SubtargetFeature { list Predicates = predicates; } @@ -679,21 +680,23 @@ class Fusion predicates> { // return true; // } // ``` -class SimpleFusion prolog = [], list epilog = []> - : Fusion, - WildcardTrue, - FirstFusionPredicateWithMCInstPredicate, - SecondFusionPredicateWithMCInstPredicate< - CheckAny<[ - CheckIsVRegOperand<0>, - CheckSameRegOperand<0, 1> - ]>>, - OneUse, - TieReg<0, 1>, - ], - epilog)>; + : Fusion, + WildcardTrue, + FirstFusionPredicateWithMCInstPredicate, + SecondFusionPredicateWithMCInstPredicate< + CheckAny<[ + CheckIsVRegOperand<0>, + CheckSameRegOperand<0, 1> + ]>>, + OneUse, + TieReg<0, 1>, + ], + epilog)>; diff --git a/llvm/test/TableGen/MacroFusion.td b/llvm/test/TableGen/MacroFusion.td index f984a142839c9..4aa6c8d9acb27 100644 --- a/llvm/test/TableGen/MacroFusion.td +++ b/llvm/test/TableGen/MacroFusion.td @@ -1,4 +1,5 @@ // RUN: llvm-tblgen -gen-macro-fusion-pred -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-PREDICATOR +// RUN: llvm-tblgen -gen-subtarget -I %p/../../include %s | FileCheck %s --check-prefix=CHECK-SUBTARGET include "llvm/Target/Target.td" @@ -33,7 +34,8 @@ let Namespace = "Test" in { def Inst0 : TestInst<0>; def Inst1 : TestInst<1>; -def TestFusion: SimpleFusion, +def TestFusion: SimpleFusion<"test-fusion", "HasTestFusion", "Test Fusion", + CheckOpcode<[Inst0]>, CheckAll<[ CheckOpcode<[Inst1]>, CheckRegOperand<0, X0> @@ -95,3 +97,15 @@ def TestFusion: SimpleFusion, // CHECK-PREDICATOR-NEXT: } // end namespace llvm // CHECK-PREDICATOR-EMPTY: // CHECK-PREDICATOR-NEXT: #endif + +// Check that we have generated target subfeature. +// CHECK-SUBTARGET: { "test-fusion", "Test Fusion", Test::TestFusion + +// Check that we have generated `getMacroFusions()` function. 
+// CHECK-SUBTARGET: std::vector getMacroFusions() const override; + +// CHECK-SUBTARGET: std::vector TestGenSubtargetInfo::getMacroFusions() const { +// CHECK-SUBTARGET-NEXT: std::vector Fusions; +// CHECK-SUBTARGET-NEXT: if (hasFeature(Test::TestFusion)) Fusions.push_back(llvm::isTestFusion); +// CHECK-SUBTARGET-NEXT: return Fusions; +// CHECK-SUBTARGET-NEXT: } diff --git a/llvm/utils/TableGen/CodeGenTarget.cpp b/llvm/utils/TableGen/CodeGenTarget.cpp index 53efa66f9dfc1..37fa30349eea9 100644 --- a/llvm/utils/TableGen/CodeGenTarget.cpp +++ b/llvm/utils/TableGen/CodeGenTarget.cpp @@ -291,6 +291,7 @@ CodeGenTarget::CodeGenTarget(RecordKeeper &records) if (Targets.size() != 1) PrintFatalError("Multiple subclasses of Target defined!"); TargetRec = Targets[0]; + MacroFusions = Records.getAllDerivedDefinitions("Fusion"); } CodeGenTarget::~CodeGenTarget() { diff --git a/llvm/utils/TableGen/CodeGenTarget.h b/llvm/utils/TableGen/CodeGenTarget.h index a2b559d53b19c..d7388dceca37f 100644 --- a/llvm/utils/TableGen/CodeGenTarget.h +++ b/llvm/utils/TableGen/CodeGenTarget.h @@ -64,6 +64,8 @@ class CodeGenTarget { mutable std::vector RegAltNameIndices; mutable SmallVector LegalValueTypes; CodeGenHwModes CGH; + std::vector MacroFusions; + void ReadRegAltNameIndices() const; void ReadInstructions() const; void ReadLegalValueTypes() const; @@ -149,6 +151,10 @@ class CodeGenTarget { const CodeGenHwModes &getHwModes() const { return CGH; } + bool hasMacroFusion() const { return !MacroFusions.empty(); } + + const std::vector getMacroFusions() const { return MacroFusions; } + private: DenseMap> & getInstructions() const { diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index f7a7172d61fc6..39225182a4c25 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -133,6 +133,7 @@ class SubtargetEmitter { void EmitMCInstrAnalysisPredicateFunctions(raw_ostream &OS); void EmitSchedModel(raw_ostream &OS); + void emitGetMacroFusions(const std::string &ClassName, raw_ostream &OS); void EmitHwModeCheck(const std::string &ClassName, raw_ostream &OS); void ParseFeaturesFunction(raw_ostream &OS); @@ -1786,6 +1787,24 @@ void SubtargetEmitter::EmitHwModeCheck(const std::string &ClassName, OS << " return 0;\n}\n"; } +void SubtargetEmitter::emitGetMacroFusions(const std::string &ClassName, + raw_ostream &OS) { + if (!TGT.hasMacroFusion()) + return; + + OS << "std::vector " << ClassName + << "::getMacroFusions() const {\n"; + OS.indent(2) << "std::vector Fusions;\n"; + for (auto *Fusion : TGT.getMacroFusions()) { + std::string Name = Fusion->getNameInitAsString(); + OS.indent(2) << "if (hasFeature(" << Target << "::" << Name + << ")) Fusions.push_back(llvm::is" << Name << ");\n"; + } + + OS.indent(2) << "return Fusions;\n"; + OS << "}\n"; +} + // Produces a subtarget specific function for parsing // the subtarget features string. 
void SubtargetEmitter::ParseFeaturesFunction(raw_ostream &OS) { @@ -1987,6 +2006,9 @@ void SubtargetEmitter::run(raw_ostream &OS) { << " const;\n"; if (TGT.getHwModes().getNumModeIds() > 1) OS << " unsigned getHwMode() const override;\n"; + if (TGT.hasMacroFusion()) + OS << " std::vector getMacroFusions() const " + "override;\n"; STIPredicateExpander PE(Target); PE.setByRef(false); @@ -2044,6 +2066,7 @@ void SubtargetEmitter::run(raw_ostream &OS) { EmitSchedModelHelpers(ClassName, OS); EmitHwModeCheck(ClassName, OS); + emitGetMacroFusions(ClassName, OS); OS << "} // end namespace llvm\n\n"; From 9dd0eb9c9c207e7ea17912616c5cea58aa5c514d Mon Sep 17 00:00:00 2001 From: Tobias Gysi Date: Fri, 19 Jan 2024 11:10:57 +0100 Subject: [PATCH 102/843] [mlir][llvm] Drop unreachable basic block during import (#78467) This revision updates the LLVM IR import to support unreachable basic blocks. An unreachable block may dominate itself and a value defined inside the block may thus be used before its definition. The import does not support such dependencies. We thus delete the unreachable basic blocks before the import. This is possible since MLIR does not have basic block labels that can be reached using an indirect call and unreachable blocks can indeed be deleted safely. Additionally, add a small poison constant import test. --- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 46 +++++++++++++------ mlir/test/Target/LLVMIR/Import/constant.ll | 10 ++++ mlir/test/Target/LLVMIR/Import/exception.ll | 41 +++++++++-------- .../LLVMIR/Import/unreachable-blocks.ll | 35 ++++++++++++++ 4 files changed, 97 insertions(+), 35 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/Import/unreachable-blocks.ll diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index e905408a1e089..928d8077175cc 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -26,6 +26,7 @@ #include "mlir/Interfaces/DataLayoutInterfaces.h" #include "mlir/Tools/mlir-translate/Translation.h" +#include "llvm/ADT/DepthFirstIterator.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/StringSet.h" @@ -132,18 +133,17 @@ static LogicalResult convertInstructionImpl(OpBuilder &odsBuilder, return failure(); } -/// Get a topologically sorted list of blocks for the given function. +/// Get a topologically sorted list of blocks for the given basic blocks. static SetVector -getTopologicallySortedBlocks(llvm::Function *func) { +getTopologicallySortedBlocks(ArrayRef basicBlocks) { SetVector blocks; - for (llvm::BasicBlock &bb : *func) { - if (!blocks.contains(&bb)) { - llvm::ReversePostOrderTraversal traversal(&bb); + for (llvm::BasicBlock *basicBlock : basicBlocks) { + if (!blocks.contains(basicBlock)) { + llvm::ReversePostOrderTraversal traversal(basicBlock); blocks.insert(traversal.begin(), traversal.end()); } } - assert(blocks.size() == func->size() && "some blocks are not sorted"); - + assert(blocks.size() == basicBlocks.size() && "some blocks are not sorted"); return blocks; } @@ -1859,11 +1859,26 @@ LogicalResult ModuleImport::processFunction(llvm::Function *func) { if (func->isDeclaration()) return success(); - // Eagerly create all blocks. - for (llvm::BasicBlock &bb : *func) { - Block *block = - builder.createBlock(&funcOp.getBody(), funcOp.getBody().end()); - mapBlock(&bb, block); + // Collect the set of basic blocks reachable from the function's entry block. 
+ // This step is crucial as LLVM IR can contain unreachable blocks that + // self-dominate. As a result, an operation might utilize a variable it + // defines, which the import does not support. Given that MLIR lacks block + // label support, we can safely remove unreachable blocks, as there are no + // indirect branch instructions that could potentially target these blocks. + llvm::df_iterator_default_set reachable; + for (llvm::BasicBlock *basicBlock : llvm::depth_first_ext(func, reachable)) + (void)basicBlock; + + // Eagerly create all reachable blocks. + SmallVector reachableBasicBlocks; + for (llvm::BasicBlock &basicBlock : *func) { + // Skip unreachable blocks. + if (!reachable.contains(&basicBlock)) + continue; + Region &body = funcOp.getBody(); + Block *block = builder.createBlock(&body, body.end()); + mapBlock(&basicBlock, block); + reachableBasicBlocks.push_back(&basicBlock); } // Add function arguments to the entry block. @@ -1876,10 +1891,11 @@ LogicalResult ModuleImport::processFunction(llvm::Function *func) { // Process the blocks in topological order. The ordered traversal ensures // operands defined in a dominating block have a valid mapping to an MLIR // value once a block is translated. - SetVector blocks = getTopologicallySortedBlocks(func); + SetVector blocks = + getTopologicallySortedBlocks(reachableBasicBlocks); setConstantInsertionPointToStart(lookupBlock(blocks.front())); - for (llvm::BasicBlock *bb : blocks) - if (failed(processBasicBlock(bb, lookupBlock(bb)))) + for (llvm::BasicBlock *basicBlock : blocks) + if (failed(processBasicBlock(basicBlock, lookupBlock(basicBlock)))) return failure(); // Process the debug intrinsics that require a delayed conversion after diff --git a/mlir/test/Target/LLVMIR/Import/constant.ll b/mlir/test/Target/LLVMIR/Import/constant.ll index cd2d00ec0aa78..3c46f5b20c31c 100644 --- a/mlir/test/Target/LLVMIR/Import/constant.ll +++ b/mlir/test/Target/LLVMIR/Import/constant.ll @@ -47,6 +47,16 @@ define void @undef_constant(i32 %arg0) { ; // ----- +; CHECK-LABEL: @poison_constant +define void @poison_constant(double %arg0) { + ; CHECK: %[[POISON:.+]] = llvm.mlir.poison : f64 + ; CHECK: llvm.fadd %[[POISON]], %{{.*}} : f64 + %1 = fadd double poison, %arg0 + ret void +} + +; // ----- + ; CHECK-LABEL: @null_constant define ptr @null_constant() { ; CHECK: %[[NULL:[0-9]+]] = llvm.mlir.zero : !llvm.ptr diff --git a/mlir/test/Target/LLVMIR/Import/exception.ll b/mlir/test/Target/LLVMIR/Import/exception.ll index de227645cc15b..440d89ec147f7 100644 --- a/mlir/test/Target/LLVMIR/Import/exception.ll +++ b/mlir/test/Target/LLVMIR/Import/exception.ll @@ -12,34 +12,35 @@ define i32 @invokeLandingpad() personality ptr @__gxx_personality_v0 { ; CHECK: %[[a1:[0-9]+]] = llvm.mlir.addressof @_ZTIii : !llvm.ptr ; CHECK: %[[a3:[0-9]+]] = llvm.alloca %{{[0-9]+}} x i8 {alignment = 1 : i64} : (i32) -> !llvm.ptr %1 = alloca i8 - ; CHECK: llvm.invoke @foo(%[[a3]]) to ^bb2 unwind ^bb1 : (!llvm.ptr) -> () - invoke void @foo(ptr %1) to label %4 unwind label %2 + ; CHECK: llvm.invoke @foo(%[[a3]]) to ^[[bb1:.*]] unwind ^[[bb4:.*]] : (!llvm.ptr) -> () + invoke void @foo(ptr %1) to label %bb1 unwind label %bb4 -; CHECK: ^bb1: +; CHECK: ^[[bb1]]: +bb1: + ; CHECK: %{{[0-9]+}} = llvm.invoke @bar(%[[a3]]) to ^[[bb2:.*]] unwind ^[[bb4]] : (!llvm.ptr) -> !llvm.ptr + %2 = invoke ptr @bar(ptr %1) to label %bb2 unwind label %bb4 + +; CHECK: ^[[bb2]]: +bb2: + ; CHECK: llvm.invoke @vararg_foo(%[[a3]], %{{.*}}) to ^[[bb3:.*]] unwind ^[[bb4]] vararg(!llvm.func) : (!llvm.ptr, i32) -> () + 
invoke void (ptr, ...) @vararg_foo(ptr %1, i32 0) to label %bb3 unwind label %bb4 + +; CHECK: ^[[bb3]]: +bb3: + ; CHECK: llvm.invoke %{{.*}}(%[[a3]], %{{.*}}) to ^[[bb5:.*]] unwind ^[[bb4]] vararg(!llvm.func) : !llvm.ptr, (!llvm.ptr, i32) -> () + invoke void (ptr, ...) undef(ptr %1, i32 0) to label %bb5 unwind label %bb4 + +; CHECK: ^[[bb4]]: +bb4: ; CHECK: %{{[0-9]+}} = llvm.landingpad (catch %{{[0-9]+}} : !llvm.ptr) (catch %[[a1]] : !llvm.ptr) (filter %{{[0-9]+}} : !llvm.array<1 x i1>) : !llvm.struct<(ptr, i32)> %3 = landingpad { ptr, i32 } catch ptr @_ZTIi catch ptr @_ZTIii filter [1 x i1] [i1 1] resume { ptr, i32 } %3 -; CHECK: ^bb2: +; CHECK: ^[[bb5]]: +bb5: ; CHECK: llvm.return %{{[0-9]+}} : i32 ret i32 1 - -; CHECK: ^bb3: - ; CHECK: %{{[0-9]+}} = llvm.invoke @bar(%[[a3]]) to ^bb2 unwind ^bb1 : (!llvm.ptr) -> !llvm.ptr - %6 = invoke ptr @bar(ptr %1) to label %4 unwind label %2 - -; CHECK: ^bb4: - ; CHECK: llvm.invoke @vararg_foo(%[[a3]], %{{.*}}) to ^bb2 unwind ^bb1 vararg(!llvm.func) : (!llvm.ptr, i32) -> () - invoke void (ptr, ...) @vararg_foo(ptr %1, i32 0) to label %4 unwind label %2 - -; CHECK: ^bb5: - ; CHECK: llvm.invoke %{{.*}}(%[[a3]], %{{.*}}) to ^bb2 unwind ^bb1 vararg(!llvm.func) : !llvm.ptr, (!llvm.ptr, i32) -> () - invoke void (ptr, ...) undef(ptr %1, i32 0) to label %4 unwind label %2 - -; CHECK: ^bb6: - ; CHECK: llvm.return %{{[0-9]+}} : i32 - ret i32 0 } declare i32 @foo2() diff --git a/mlir/test/Target/LLVMIR/Import/unreachable-blocks.ll b/mlir/test/Target/LLVMIR/Import/unreachable-blocks.ll new file mode 100644 index 0000000000000..8a84f4b5c19b8 --- /dev/null +++ b/mlir/test/Target/LLVMIR/Import/unreachable-blocks.ll @@ -0,0 +1,35 @@ +; RUN: mlir-translate -import-llvm %s | FileCheck %s + +; Test unreachable blocks are dropped. + +; CHECK-LABEL: llvm.func @unreachable_block +define void @unreachable_block(float %0) { +.entry: + ; CHECK: llvm.return + ret void + +unreachable: + ; CHECK-NOT: llvm.fadd + %1 = fadd float %0, %1 + br label %unreachable +} + +; Test unreachable blocks with back edges are supported. + +; CHECK-LABEL: llvm.func @back_edge +define i32 @back_edge(i32 %0) { +.entry: + ; CHECK: llvm.br ^[[RET:.*]](%{{.*}}) + br label %ret +ret: + ; CHECK: ^[[RET]](%{{.*}}: i32) + %1 = phi i32 [ %0, %.entry ], [ %2, %unreachable ] + ; CHECK: llvm.return %{{.*}} : i32 + ret i32 %1 + +unreachable: + ; CHECK-NOT: add + %2 = add i32 %0, %2 + %3 = icmp eq i32 %2, 42 + br i1 %3, label %ret, label %unreachable +} From 2759cfa0c35478b5a3bc1e51a1ed817d1b1dc39f Mon Sep 17 00:00:00 2001 From: Leon Clark Date: Fri, 19 Jan 2024 10:16:46 +0000 Subject: [PATCH 103/843] [AMDGPU] Remove unnecessary add instructions in ctlz.i8 (#77615) Add custom lowering for ctlz.i8 to avoid multiple add/sub operations. 
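For reference, the pattern this change affects is a narrow count-leading-zeros call. The snippet below is an illustrative sketch only, not one of the updated test cases, and the function name is made up:

```llvm
; Previously, an i8 ctlz was promoted to a 32-bit ctlz whose result was
; then corrected with add/sub of the bit-width difference (24 for i8).
; The custom lowering instead shifts the zero-extended source into the
; high bits first, so the 32-bit count is already the correct i8 answer
; and no compensating arithmetic is emitted.
define i8 @ctlz_i8_example(i8 %x) {
  %r = call i8 @llvm.ctlz.i8(i8 %x, i1 false)
  ret i8 %r
}
declare i8 @llvm.ctlz.i8(i8, i1)
```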
--------- Co-authored-by: Leon Clark Co-authored-by: Matt Arsenault --- llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 28 +++++++ llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 + llvm/test/CodeGen/AMDGPU/ctlz.ll | 31 ++++--- llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll | 81 +++++++++---------- 4 files changed, 83 insertions(+), 59 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 6f26e0e82ece1..55d95154c7587 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -446,6 +446,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM, {ISD::CTTZ, ISD::CTTZ_ZERO_UNDEF, ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, MVT::i64, Custom); + for (auto VT : {MVT::i8, MVT::i16}) + setOperationAction({ISD::CTLZ, ISD::CTLZ_ZERO_UNDEF}, VT, Custom); + static const MVT::SimpleValueType VectorIntTypes[] = { MVT::v2i32, MVT::v3i32, MVT::v4i32, MVT::v5i32, MVT::v6i32, MVT::v7i32, MVT::v9i32, MVT::v10i32, MVT::v11i32, MVT::v12i32}; @@ -1398,6 +1401,11 @@ void AMDGPUTargetLowering::ReplaceNodeResults(SDNode *N, if (SDValue Lowered = lowerFEXP(SDValue(N, 0), DAG)) Results.push_back(Lowered); return; + case ISD::CTLZ: + case ISD::CTLZ_ZERO_UNDEF: + if (auto Lowered = lowerCTLZResults(SDValue(N, 0u), DAG)) + Results.push_back(Lowered); + return; default: return; } @@ -3063,6 +3071,26 @@ static bool isCttzOpc(unsigned Opc) { return Opc == ISD::CTTZ || Opc == ISD::CTTZ_ZERO_UNDEF; } +SDValue AMDGPUTargetLowering::lowerCTLZResults(SDValue Op, + SelectionDAG &DAG) const { + auto SL = SDLoc(Op); + auto Arg = Op.getOperand(0u); + auto ResultVT = Op.getValueType(); + + if (ResultVT != MVT::i8 && ResultVT != MVT::i16) + return {}; + + assert(isCtlzOpc(Op.getOpcode())); + assert(ResultVT == Arg.getValueType()); + + auto const LeadingZeroes = 32u - ResultVT.getFixedSizeInBits(); + auto NewOp = DAG.getNode(ISD::ZERO_EXTEND, SL, MVT::i32, Arg); + auto ShiftVal = DAG.getConstant(LeadingZeroes, SL, MVT::i32); + NewOp = DAG.getNode(ISD::SHL, SL, MVT::i32, NewOp, ShiftVal); + NewOp = DAG.getNode(Op.getOpcode(), SL, MVT::i32, NewOp); + return DAG.getNode(ISD::TRUNCATE, SL, ResultVT, NewOp); +} + SDValue AMDGPUTargetLowering::LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h index db8dc922c974c..f10a357125e56 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -84,6 +84,8 @@ class AMDGPUTargetLowering : public TargetLowering { SDNodeFlags Flags) const; SDValue lowerFEXP(SDValue Op, SelectionDAG &DAG) const; + SDValue lowerCTLZResults(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerCTLZ_CTTZ(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP32(SDValue Op, SelectionDAG &DAG, bool Signed) const; diff --git a/llvm/test/CodeGen/AMDGPU/ctlz.ll b/llvm/test/CodeGen/AMDGPU/ctlz.ll index b426a15669417..9307d8952293b 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz.ll @@ -492,9 +492,9 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 ; SI-NEXT: v_min_u32_e32 v0, 32, v0 -; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; 
SI-NEXT: s_endpgm ; @@ -512,10 +512,9 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; VI-NEXT: s_mov_b32 s4, s0 ; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_min_u32_e32 v0, 32, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 -; VI-NEXT: v_add_u16_e32 v0, -8, v0 ; VI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; @@ -523,7 +522,7 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -532,14 +531,15 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: FFBH_UINT * T0.W, T0.X, -; EG-NEXT: CNDE_INT T0.W, T0.X, literal.x, PV.W, -; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: 32(4.484155e-44), 3(4.203895e-45) -; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, -; EG-NEXT: -24(nan), 0(0.000000e+00) +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: FFBH_UINT T1.W, PV.W, +; EG-NEXT: AND_INT * T2.W, KC0[2].Y, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: CNDE_INT * T0.W, T0.W, literal.x, PV.W, +; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: LSHL * T1.W, T1.W, literal.y, +; EG-NEXT: LSHL * T1.W, T2.W, literal.y, ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) ; EG-NEXT: LSHL T0.X, PV.W, PS, ; EG-NEXT: LSHL * T0.W, literal.x, PS, @@ -556,10 +556,9 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_ubyte v1, v0, s[2:3] ; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX10-NEXT: v_ffbh_u32_e32 v1, v1 ; GFX10-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX10-NEXT: v_add_nc_u32_e32 v1, -16, v1 -; GFX10-NEXT: v_add_nc_u16 v1, v1, -8 ; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; @@ -583,12 +582,10 @@ define amdgpu_kernel void @v_ctlz_i8(ptr addrspace(1) noalias %out, ptr addrspac ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_u8 v1, v0, s[2:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 +; GFX11-NEXT: v_lshlrev_b32_e32 v1, 24, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_clz_i32_u32_e32 v1, v1 ; GFX11-NEXT: v_min_u32_e32 v1, 32, v1 -; GFX11-NEXT: v_add_nc_u32_e32 v1, -16, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_add_nc_u16 v1, v1, -8 ; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX11-NEXT: s_nop 0 ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll index 5be8882764ce7..2830e5258e92b 100644 --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -314,9 +314,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s2, 0xff -; SI-NEXT: 
s_flbit_i32_b32 s2, s2 -; SI-NEXT: s_sub_i32 s4, s2, 24 +; SI-NEXT: s_lshl_b32 s2, s2, 24 +; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 @@ -327,12 +326,11 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: s_load_dword s2, s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_and_b32 s2, s2, 0xff +; VI-NEXT: s_lshl_b32 s2, s2, 24 ; VI-NEXT: s_flbit_i32_b32 s2, s2 -; VI-NEXT: s_add_i32 s2, s2, -16 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_add_u16_e64 v2, s2, -8 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -349,13 +347,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, 0.0, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: FFBH_UINT T0.W, T0.X, +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: FFBH_UINT T0.W, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, -; EG-NEXT: -24(nan), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: LSHL * T1.W, T1.W, literal.y, +; EG-NEXT: LSHL * T1.W, PS, literal.y, ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) ; EG-NEXT: LSHL T0.X, PV.W, PS, ; EG-NEXT: LSHL * T0.W, literal.x, PS, @@ -391,9 +389,8 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_and_b32 s2, s2, 0xffff -; SI-NEXT: s_flbit_i32_b32 s2, s2 -; SI-NEXT: s_add_i32 s4, s2, -16 +; SI-NEXT: s_lshl_b32 s2, s2, 16 +; SI-NEXT: s_flbit_i32_b32 s4, s2 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 @@ -426,13 +423,13 @@ define amdgpu_kernel void @s_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, 0.0, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: FFBH_UINT T0.W, T0.X, +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: FFBH_UINT T0.W, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, -; EG-NEXT: -16(nan), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: LSHL * T1.W, T1.W, literal.y, +; EG-NEXT: LSHL * T1.W, PS, literal.y, ; EG-NEXT: 65535(9.183409e-41), 3(4.203895e-45) ; EG-NEXT: LSHL T0.X, PV.W, PS, ; EG-NEXT: LSHL * T0.W, literal.x, PS, @@ -590,8 +587,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_ffbh_u32_e32 v1, v0 -; SI-NEXT: v_subrev_i32_e32 v1, vcc, 24, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 24, v0 +; SI-NEXT: v_ffbh_u32_e32 v1, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 @@ -605,9 +602,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_ffbh_u32_sdwa v1, v0 
dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 -; VI-NEXT: v_add_u32_e32 v1, vcc, -16, v1 -; VI-NEXT: v_add_u16_e32 v1, -8, v1 +; VI-NEXT: v_lshlrev_b32_e32 v1, 24, v0 +; VI-NEXT: v_ffbh_u32_e32 v1, v1 ; VI-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 ; VI-NEXT: v_cndmask_b32_e32 v2, 32, v1, vcc ; VI-NEXT: v_mov_b32_e32 v0, s0 @@ -619,7 +615,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -628,10 +624,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8_with_select(ptr addrspace(1) noa ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: FFBH_UINT * T0.W, T0.X, -; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: -24(nan), 3(4.203895e-45) +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: FFBH_UINT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, @@ -686,8 +683,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; SI-NEXT: v_lshlrev_b32_e32 v0, 8, v0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_or_b32_e32 v0, v0, v1 -; SI-NEXT: v_ffbh_u32_e32 v1, v0 -; SI-NEXT: v_add_i32_e32 v1, vcc, -16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v0 +; SI-NEXT: v_ffbh_u32_e32 v1, v1 ; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SI-NEXT: v_cndmask_b32_e32 v0, 32, v1, vcc ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 @@ -722,7 +719,7 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; EG: ; %bb.0: ; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] ; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 15, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: ALU 16, @9, KC0[CB0:0-32], KC1[] ; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X ; EG-NEXT: CF_END ; EG-NEXT: PAD @@ -731,10 +728,11 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i16_with_select(ptr addrspace(1) no ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: MOV * T0.X, KC0[2].Z, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: FFBH_UINT * T0.W, T0.X, -; EG-NEXT: ADD_INT T0.W, PV.W, literal.x, -; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.y, -; EG-NEXT: -16(nan), 3(4.203895e-45) +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; EG-NEXT: FFBH_UINT T0.W, PV.W, +; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) ; EG-NEXT: CNDE_INT * T0.W, T0.X, literal.x, PV.W, ; EG-NEXT: 32(4.484155e-44), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, @@ -1103,8 +1101,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 ; SI-NEXT: v_ffbh_u32_e32 v0, v0 -; SI-NEXT: v_subrev_i32_e32 v0, vcc, 24, v0 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -1117,9 +1115,8 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: s_waitcnt vmcnt(0) -; 
VI-NEXT: v_ffbh_u32_e32 v0, v0 -; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 -; VI-NEXT: v_add_u16_e32 v2, -8, v0 +; VI-NEXT: v_lshlrev_b32_e32 v0, 24, v0 +; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_byte v[0:1], v2 @@ -1138,13 +1135,13 @@ define amdgpu_kernel void @v_ctlz_zero_undef_i8(ptr addrspace(1) noalias %out, p ; EG-NEXT: ALU clause starting at 8: ; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, T0.X, ; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: FFBH_UINT T0.W, T0.X, +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 24(3.363116e-44), 0(0.000000e+00) +; EG-NEXT: FFBH_UINT T0.W, PV.W, ; EG-NEXT: AND_INT * T1.W, KC0[2].Y, literal.x, ; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.W, PV.W, literal.x, -; EG-NEXT: -24(nan), 0(0.000000e+00) ; EG-NEXT: AND_INT T0.W, PV.W, literal.x, -; EG-NEXT: LSHL * T1.W, T1.W, literal.y, +; EG-NEXT: LSHL * T1.W, PS, literal.y, ; EG-NEXT: 255(3.573311e-43), 3(4.203895e-45) ; EG-NEXT: LSHL T0.X, PV.W, PS, ; EG-NEXT: LSHL * T0.W, literal.x, PS, From abdb61f5fd0fa6f184aa2c7f3b676879a71df8a8 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 19 Jan 2024 10:27:53 +0000 Subject: [PATCH 104/843] [VPlan] Introduce VPSingleDefRecipe. (#77023) This patch introduces a new common base class for recipes defining a single result VPValue. This has been discussed/mentioned at various previous reviews as potential follow-up and helps to replace various getVPSingleValue calls. PR: https://github.com/llvm/llvm-project/pull/77023 --- .../Transforms/Vectorize/LoopVectorize.cpp | 35 ++-- llvm/lib/Transforms/Vectorize/VPlan.h | 191 ++++++++++++------ .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +- .../Transforms/Vectorize/VPlanTransforms.cpp | 35 ++-- 4 files changed, 163 insertions(+), 102 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 77d54c9d08f4e..fa1aa15fa822d 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8963,16 +8963,17 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( "AnyOf reductions are not allowed for in-loop reductions"); // Collect the chain of "link" recipes for the reduction starting at PhiR. - SetVector Worklist; + SetVector Worklist; Worklist.insert(PhiR); for (unsigned I = 0; I != Worklist.size(); ++I) { - VPRecipeBase *Cur = Worklist[I]; - for (VPUser *U : Cur->getVPSingleValue()->users()) { - auto *UserRecipe = dyn_cast(U); - if (!UserRecipe) + VPSingleDefRecipe *Cur = Worklist[I]; + for (VPUser *U : Cur->users()) { + auto *UserRecipe = dyn_cast(U); + if (!UserRecipe) { + assert(isa(U) && + "U must either be a VPSingleDef or VPLiveOut"); continue; - assert(UserRecipe->getNumDefinedValues() == 1 && - "recipes must define exactly one result value"); + } Worklist.insert(UserRecipe); } } @@ -8982,10 +8983,8 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // (PreviousLink) to tell which of the two operands of a Link will remain // scalar and which will be reduced. For minmax by select(cmp), Link will be // the select instructions. - VPRecipeBase *PreviousLink = PhiR; // Aka Worklist[0]. - for (VPRecipeBase *CurrentLink : Worklist.getArrayRef().drop_front()) { - VPValue *PreviousLinkV = PreviousLink->getVPSingleValue(); - + VPSingleDefRecipe *PreviousLink = PhiR; // Aka Worklist[0]. 
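
The loop that follows is where the new base class pays off: because a VPSingleDefRecipe *is* its one defined VPValue, every `getVPSingleValue()` hop between a link and its value disappears. A minimal stand-alone C++ sketch of that idea — all names here are hypothetical illustrations, not VPlan's real API:

    #include <cassert>
    #include <string>
    #include <vector>

    struct Value { std::string Name; };

    // Old shape: a def may produce several values, so callers must ask for
    // "the" value and assert there is exactly one.
    struct MultiDefNode {
      std::vector<Value *> Defined;
      Value *getSingleValue() const {
        assert(Defined.size() == 1 && "node must define exactly one value");
        return Defined.front();
      }
    };

    // New shape: a node that defines exactly one value simply *is* that
    // value, so call sites can use the node directly.
    struct SingleDefNode : MultiDefNode, Value {
      explicit SingleDefNode(std::string N) {
        Name = std::move(N);
        Defined.push_back(this); // registers itself as its own result
      }
    };

    int main() {
      SingleDefNode N("phi");
      // Old call site: N.getSingleValue()->Name.  New call site: N.Name.
      assert(N.getSingleValue() == static_cast<Value *>(&N));
      return 0;
    }
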
+ for (VPSingleDefRecipe *CurrentLink : Worklist.getArrayRef().drop_front()) { Instruction *CurrentLinkI = CurrentLink->getUnderlyingInstr(); // Index of the first operand which holds a non-mask vector operand. @@ -9000,7 +8999,7 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( "Expected instruction to be a call to the llvm.fmuladd intrinsic"); assert(((MinVF.isScalar() && isa(CurrentLink)) || isa(CurrentLink)) && - CurrentLink->getOperand(2) == PreviousLinkV && + CurrentLink->getOperand(2) == PreviousLink && "expected a call where the previous link is the added operand"); // If the instruction is a call to the llvm.fmuladd intrinsic then we @@ -9031,15 +9030,15 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( // Note that for non-commutable operands (cmp-selects), the semantics of // the cmp-select are captured in the recurrence kind. unsigned VecOpId = - CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLinkV + CurrentLink->getOperand(IndexOfFirstOperand) == PreviousLink ? IndexOfFirstOperand + 1 : IndexOfFirstOperand; VecOp = CurrentLink->getOperand(VecOpId); - assert(VecOp != PreviousLinkV && + assert(VecOp != PreviousLink && CurrentLink->getOperand(CurrentLink->getNumOperands() - 1 - (VecOpId - IndexOfFirstOperand)) == - PreviousLinkV && - "PreviousLinkV must be the operand other than VecOp"); + PreviousLink && + "PreviousLink must be the operand other than VecOp"); } BasicBlock *BB = CurrentLinkI->getParent(); @@ -9051,13 +9050,13 @@ void LoopVectorizationPlanner::adjustRecipesForReductions( } VPReductionRecipe *RedRecipe = new VPReductionRecipe( - RdxDesc, CurrentLinkI, PreviousLinkV, VecOp, CondOp); + RdxDesc, CurrentLinkI, PreviousLink, VecOp, CondOp); // Append the recipe to the end of the VPBasicBlock because we need to // ensure that it comes after all of it's inputs, including CondOp. // Note that this transformation may leave over dead recipes (including // CurrentLink), which will be cleaned by a later VPlan transform. LinkVPBB->appendRecipe(RedRecipe); - CurrentLink->getVPSingleValue()->replaceAllUsesWith(RedRecipe); + CurrentLink->replaceAllUsesWith(RedRecipe); PreviousLink = RedRecipe; } } diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 4b4f4911eb641..bdbb61b88ecb2 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -12,10 +12,12 @@ /// VPBlockBase, together implementing a Hierarchical CFG; /// 2. Pure virtual VPRecipeBase serving as the base class for recipes contained /// within VPBasicBlocks; -/// 3. VPInstruction, a concrete Recipe and VPUser modeling a single planned +/// 3. Pure virtual VPSingleDefRecipe serving as a base class for recipes that +/// also inherit from VPValue. +/// 4. VPInstruction, a concrete Recipe and VPUser modeling a single planned /// instruction; -/// 4. The VPlan class holding a candidate for vectorization; -/// 5. The VPlanPrinter class providing a way to print a plan in dot format; +/// 5. The VPlan class holding a candidate for vectorization; +/// 6. The VPlanPrinter class providing a way to print a plan in dot format; /// These are documented in docs/VectorizationPlan.rst. // //===----------------------------------------------------------------------===// @@ -700,8 +702,8 @@ class VPLiveOut : public VPUser { /// VPRecipeBase is a base class modeling a sequence of one or more output IR /// instructions. VPRecipeBase owns the VPValues it defines through VPDef /// and is responsible for deleting its defined values. 
Single-value -/// VPRecipeBases that also inherit from VPValue must make sure to inherit from -/// VPRecipeBase before VPValue. +/// recipes must inherit from VPSingleDef instead of inheriting from both +/// VPRecipeBase and VPValue separately. class VPRecipeBase : public ilist_node_with_parent, public VPDef, public VPUser { @@ -762,15 +764,6 @@ class VPRecipeBase : public ilist_node_with_parent, /// \returns an iterator pointing to the element after the erased one iplist::iterator eraseFromParent(); - /// Returns the underlying instruction, if the recipe is a VPValue or nullptr - /// otherwise. - Instruction *getUnderlyingInstr() { - return cast(getVPSingleValue()->getUnderlyingValue()); - } - const Instruction *getUnderlyingInstr() const { - return cast(getVPSingleValue()->getUnderlyingValue()); - } - /// Method to support type inquiry through isa, cast, and dyn_cast. static inline bool classof(const VPDef *D) { // All VPDefs are also VPRecipeBases. @@ -819,10 +812,80 @@ class VPRecipeBase : public ilist_node_with_parent, } \ static inline bool classof(const VPRecipeBase *R) { \ return R->getVPDefID() == VPDefID; \ + } \ + static inline bool classof(const VPSingleDefRecipe *R) { \ + return R->getVPDefID() == VPDefID; \ } +/// VPSingleDef is a base class for recipes for modeling a sequence of one or +/// more output IR that define a single result VPValue. +/// Note that VPRecipeBase must be inherited from before VPValue. +class VPSingleDefRecipe : public VPRecipeBase, public VPValue { +public: + template + VPSingleDefRecipe(const unsigned char SC, IterT Operands, DebugLoc DL = {}) + : VPRecipeBase(SC, Operands, DL), VPValue(this) {} + + VPSingleDefRecipe(const unsigned char SC, ArrayRef Operands, + DebugLoc DL = {}) + : VPRecipeBase(SC, Operands, DL), VPValue(this) {} + + template + VPSingleDefRecipe(const unsigned char SC, IterT Operands, Value *UV, + DebugLoc DL = {}) + : VPRecipeBase(SC, Operands, DL), VPValue(this, UV) {} + + static inline bool classof(const VPRecipeBase *R) { + switch (R->getVPDefID()) { + case VPRecipeBase::VPDerivedIVSC: + case VPRecipeBase::VPExpandSCEVSC: + case VPRecipeBase::VPInstructionSC: + case VPRecipeBase::VPReductionSC: + case VPRecipeBase::VPReplicateSC: + case VPRecipeBase::VPScalarIVStepsSC: + case VPRecipeBase::VPVectorPointerSC: + case VPRecipeBase::VPWidenCallSC: + case VPRecipeBase::VPWidenCanonicalIVSC: + case VPRecipeBase::VPWidenCastSC: + case VPRecipeBase::VPWidenGEPSC: + case VPRecipeBase::VPWidenSC: + case VPRecipeBase::VPWidenSelectSC: + case VPRecipeBase::VPBlendSC: + case VPRecipeBase::VPPredInstPHISC: + case VPRecipeBase::VPCanonicalIVPHISC: + case VPRecipeBase::VPActiveLaneMaskPHISC: + case VPRecipeBase::VPFirstOrderRecurrencePHISC: + case VPRecipeBase::VPWidenPHISC: + case VPRecipeBase::VPWidenIntOrFpInductionSC: + case VPRecipeBase::VPWidenPointerInductionSC: + case VPRecipeBase::VPReductionPHISC: + return true; + case VPRecipeBase::VPInterleaveSC: + case VPRecipeBase::VPBranchOnMaskSC: + case VPRecipeBase::VPWidenMemoryInstructionSC: + // TODO: Widened stores don't define a value, but widened loads do. Split + // the recipes to be able to make widened loads VPSingleDefRecipes. + return false; + } + llvm_unreachable("Unhandled VPDefID"); + } + + static inline bool classof(const VPUser *U) { + auto *R = dyn_cast(U); + return R && classof(R); + } + + /// Returns the underlying instruction. 
+ Instruction *getUnderlyingInstr() { + return cast(getUnderlyingValue()); + } + const Instruction *getUnderlyingInstr() const { + return cast(getUnderlyingValue()); + } +}; + /// Class to record LLVM IR flag for a recipe along with it. -class VPRecipeWithIRFlags : public VPRecipeBase { +class VPRecipeWithIRFlags : public VPSingleDefRecipe { enum class OperationType : unsigned char { Cmp, OverflowingBinOp, @@ -886,14 +949,14 @@ class VPRecipeWithIRFlags : public VPRecipeBase { public: template VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, DebugLoc DL = {}) - : VPRecipeBase(SC, Operands, DL) { + : VPSingleDefRecipe(SC, Operands, DL) { OpType = OperationType::Other; AllFlags = 0; } template VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, Instruction &I) - : VPRecipeWithIRFlags(SC, Operands, I.getDebugLoc()) { + : VPSingleDefRecipe(SC, Operands, &I, I.getDebugLoc()) { if (auto *Op = dyn_cast(&I)) { OpType = OperationType::Cmp; CmpPredicate = Op->getPredicate(); @@ -915,32 +978,35 @@ class VPRecipeWithIRFlags : public VPRecipeBase { } else if (auto *Op = dyn_cast(&I)) { OpType = OperationType::FPMathOp; FMFs = Op->getFastMathFlags(); + } else { + OpType = OperationType::Other; + AllFlags = 0; } } template VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, CmpInst::Predicate Pred, DebugLoc DL = {}) - : VPRecipeBase(SC, Operands, DL), OpType(OperationType::Cmp), + : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::Cmp), CmpPredicate(Pred) {} template VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, WrapFlagsTy WrapFlags, DebugLoc DL = {}) - : VPRecipeBase(SC, Operands, DL), OpType(OperationType::OverflowingBinOp), - WrapFlags(WrapFlags) {} + : VPSingleDefRecipe(SC, Operands, DL), + OpType(OperationType::OverflowingBinOp), WrapFlags(WrapFlags) {} template VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, FastMathFlags FMFs, DebugLoc DL = {}) - : VPRecipeBase(SC, Operands, DL), OpType(OperationType::FPMathOp), + : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::FPMathOp), FMFs(FMFs) {} protected: template VPRecipeWithIRFlags(const unsigned char SC, IterT Operands, GEPFlagsTy GEPFlags, DebugLoc DL = {}) - : VPRecipeBase(SC, Operands, DL), OpType(OperationType::GEPOp), + : VPSingleDefRecipe(SC, Operands, DL), OpType(OperationType::GEPOp), GEPFlags(GEPFlags) {} public: @@ -1056,7 +1122,7 @@ class VPRecipeWithIRFlags : public VPRecipeBase { /// While as any Recipe it may generate a sequence of IR instructions when /// executed, these instructions would always form a single-def expression as /// the VPInstruction is also a single def-use vertex. 
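
The large `classof` switch added just above is what makes this work with LLVM-style RTTI: `isa<VPSingleDefRecipe>` and `dyn_cast<VPSingleDefRecipe>` consult it, because "single-def" is a family of concrete recipe kinds rather than one kind with its own ID. A stripped-down, library-free sketch of the mechanism — the kinds and helpers below are invented for illustration:

    #include <cstdio>

    struct Node {
      enum KindTy { AddSC, MulSC, StoreSC } Kind;
      explicit Node(KindTy K) : Kind(K) {}
    };

    // "Single-def" spans several kinds, so its classof enumerates the
    // members, exactly like the switch in the patch.
    struct SingleDefNode : Node {
      using Node::Node;
      static bool classof(const Node *N) {
        switch (N->Kind) {
        case Node::AddSC:
        case Node::MulSC:
          return true;   // these kinds define exactly one value
        case Node::StoreSC:
          return false;  // a store defines no value
        }
        return false;
      }
    };

    // Hand-rolled stand-ins for llvm::isa / llvm::dyn_cast.
    template <typename To, typename From> bool isa(const From *V) {
      return To::classof(V);
    }
    template <typename To, typename From> To *dyn_cast(From *V) {
      return isa<To>(V) ? static_cast<To *>(V) : nullptr;
    }

    int main() {
      Node Add(Node::AddSC), Store(Node::StoreSC);
      std::printf("%d %d\n", (int)isa<SingleDefNode>(&Add),           // 1
                  (int)(dyn_cast<SingleDefNode>(&Store) != nullptr)); // 0
      return 0;
    }
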
-class VPInstruction : public VPRecipeWithIRFlags, public VPValue { +class VPInstruction : public VPRecipeWithIRFlags { friend class VPlanSlp; public: @@ -1103,7 +1169,7 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue { VPInstruction(unsigned Opcode, ArrayRef Operands, DebugLoc DL, const Twine &Name = "") : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, DL), - VPValue(this), Opcode(Opcode), Name(Name.str()) {} + Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, std::initializer_list Operands, DebugLoc DL = {}, const Twine &Name = "") @@ -1115,7 +1181,7 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue { VPInstruction(unsigned Opcode, std::initializer_list Operands, WrapFlagsTy WrapFlags, DebugLoc DL = {}, const Twine &Name = "") : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, WrapFlags, DL), - VPValue(this), Opcode(Opcode), Name(Name.str()) {} + Opcode(Opcode), Name(Name.str()) {} VPInstruction(unsigned Opcode, std::initializer_list Operands, FastMathFlags FMFs, DebugLoc DL = {}, const Twine &Name = ""); @@ -1205,13 +1271,13 @@ class VPInstruction : public VPRecipeWithIRFlags, public VPValue { /// VPWidenRecipe is a recipe for producing a copy of vector type its /// ingredient. This recipe covers most of the traditional vectorization cases /// where each ingredient transforms into a vectorized version of itself. -class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue { +class VPWidenRecipe : public VPRecipeWithIRFlags { unsigned Opcode; public: template VPWidenRecipe(Instruction &I, iterator_range Operands) - : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), VPValue(this, &I), + : VPRecipeWithIRFlags(VPDef::VPWidenSC, Operands, I), Opcode(I.getOpcode()) {} ~VPWidenRecipe() override = default; @@ -1231,7 +1297,7 @@ class VPWidenRecipe : public VPRecipeWithIRFlags, public VPValue { }; /// VPWidenCastRecipe is a recipe to create vector cast instructions. -class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPValue { +class VPWidenCastRecipe : public VPRecipeWithIRFlags { /// Cast instruction opcode. Instruction::CastOps Opcode; @@ -1241,8 +1307,8 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPValue { public: VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy, CastInst &UI) - : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), VPValue(this, &UI), - Opcode(Opcode), ResultTy(ResultTy) { + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op, UI), Opcode(Opcode), + ResultTy(ResultTy) { assert(UI.getOpcode() == Opcode && "opcode of underlying cast doesn't match"); assert(UI.getType() == ResultTy && @@ -1250,8 +1316,8 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPValue { } VPWidenCastRecipe(Instruction::CastOps Opcode, VPValue *Op, Type *ResultTy) - : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), VPValue(this, nullptr), - Opcode(Opcode), ResultTy(ResultTy) {} + : VPRecipeWithIRFlags(VPDef::VPWidenCastSC, Op), Opcode(Opcode), + ResultTy(ResultTy) {} ~VPWidenCastRecipe() override = default; @@ -1273,7 +1339,7 @@ class VPWidenCastRecipe : public VPRecipeWithIRFlags, public VPValue { }; /// A recipe for widening Call instructions. -class VPWidenCallRecipe : public VPRecipeBase, public VPValue { +class VPWidenCallRecipe : public VPSingleDefRecipe { /// ID of the vector intrinsic to call when widening the call. If set the /// Intrinsic::not_intrinsic, a library call will be used instead. 
Intrinsic::ID VectorIntrinsicID; @@ -1288,7 +1354,7 @@ class VPWidenCallRecipe : public VPRecipeBase, public VPValue { VPWidenCallRecipe(CallInst &I, iterator_range CallArguments, Intrinsic::ID VectorIntrinsicID, Function *Variant = nullptr) - : VPRecipeBase(VPDef::VPWidenCallSC, CallArguments), VPValue(this, &I), + : VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, &I), VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {} ~VPWidenCallRecipe() override = default; @@ -1306,11 +1372,11 @@ class VPWidenCallRecipe : public VPRecipeBase, public VPValue { }; /// A recipe for widening select instructions. -struct VPWidenSelectRecipe : public VPRecipeBase, public VPValue { +struct VPWidenSelectRecipe : public VPSingleDefRecipe { template VPWidenSelectRecipe(SelectInst &I, iterator_range Operands) - : VPRecipeBase(VPDef::VPWidenSelectSC, Operands, I.getDebugLoc()), - VPValue(this, &I) {} + : VPSingleDefRecipe(VPDef::VPWidenSelectSC, Operands, &I, + I.getDebugLoc()) {} ~VPWidenSelectRecipe() override = default; @@ -1335,7 +1401,7 @@ struct VPWidenSelectRecipe : public VPRecipeBase, public VPValue { }; /// A recipe for handling GEP instructions. -class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue { +class VPWidenGEPRecipe : public VPRecipeWithIRFlags { bool isPointerLoopInvariant() const { return getOperand(0)->isDefinedOutsideVectorRegions(); } @@ -1353,8 +1419,7 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue { public: template VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range Operands) - : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP), - VPValue(this, GEP) {} + : VPRecipeWithIRFlags(VPDef::VPWidenGEPSC, Operands, *GEP) {} ~VPWidenGEPRecipe() override = default; @@ -1373,7 +1438,7 @@ class VPWidenGEPRecipe : public VPRecipeWithIRFlags, public VPValue { /// A recipe to compute the pointers for widened memory accesses of IndexTy for /// all parts. If IsReverse is true, compute pointers for accessing the input in /// reverse order per part. -class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public VPValue { +class VPVectorPointerRecipe : public VPRecipeWithIRFlags { Type *IndexedTy; bool IsReverse; @@ -1382,7 +1447,7 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public VPValue { bool IsInBounds, DebugLoc DL) : VPRecipeWithIRFlags(VPDef::VPVectorPointerSC, ArrayRef(Ptr), GEPFlagsTy(IsInBounds), DL), - VPValue(this), IndexedTy(IndexedTy), IsReverse(IsReverse) {} + IndexedTy(IndexedTy), IsReverse(IsReverse) {} VP_CLASSOF_IMPL(VPDef::VPVectorPointerSC) @@ -1424,11 +1489,11 @@ class VPVectorPointerRecipe : public VPRecipeWithIRFlags, public VPValue { /// * VPWidenPointerInductionRecipe: Generate vector and scalar values for a /// pointer induction. Produces either a vector PHI per-part or scalar values /// per-lane based on the canonical induction. -class VPHeaderPHIRecipe : public VPRecipeBase, public VPValue { +class VPHeaderPHIRecipe : public VPSingleDefRecipe { protected: VPHeaderPHIRecipe(unsigned char VPDefID, Instruction *UnderlyingInstr, VPValue *Start = nullptr, DebugLoc DL = {}) - : VPRecipeBase(VPDefID, {}, DL), VPValue(this, UnderlyingInstr) { + : VPSingleDefRecipe(VPDefID, ArrayRef(), UnderlyingInstr, DL) { if (Start) addOperand(Start); } @@ -1709,14 +1774,13 @@ class VPReductionPHIRecipe : public VPHeaderPHIRecipe { /// A recipe for vectorizing a phi-node as a sequence of mask-based select /// instructions. 
-class VPBlendRecipe : public VPRecipeBase, public VPValue { +class VPBlendRecipe : public VPSingleDefRecipe { public: /// The blend operation is a User of the incoming values and of their /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value /// might be incoming with a full mask for which there is no VPValue. VPBlendRecipe(PHINode *Phi, ArrayRef Operands) - : VPRecipeBase(VPDef::VPBlendSC, Operands, Phi->getDebugLoc()), - VPValue(this, Phi) { + : VPSingleDefRecipe(VPDef::VPBlendSC, Operands, Phi, Phi->getDebugLoc()) { assert(Operands.size() > 0 && ((Operands.size() == 1) || (Operands.size() % 2 == 0)) && "Expected either a single incoming value or a positive even number " @@ -1843,14 +1907,15 @@ class VPInterleaveRecipe : public VPRecipeBase { /// A recipe to represent inloop reduction operations, performing a reduction on /// a vector operand into a scalar value, and adding the result to a chain. /// The Operands are {ChainOp, VecOp, [Condition]}. -class VPReductionRecipe : public VPRecipeBase, public VPValue { +class VPReductionRecipe : public VPSingleDefRecipe { /// The recurrence decriptor for the reduction in question. const RecurrenceDescriptor &RdxDesc; public: VPReductionRecipe(const RecurrenceDescriptor &R, Instruction *I, VPValue *ChainOp, VPValue *VecOp, VPValue *CondOp) - : VPRecipeBase(VPDef::VPReductionSC, {ChainOp, VecOp}), VPValue(this, I), + : VPSingleDefRecipe(VPDef::VPReductionSC, + ArrayRef({ChainOp, VecOp}), I), RdxDesc(R) { if (CondOp) addOperand(CondOp); @@ -1883,7 +1948,7 @@ class VPReductionRecipe : public VPRecipeBase, public VPValue { /// copies of the original scalar type, one per lane, instead of producing a /// single copy of widened type for all lanes. If the instruction is known to be /// uniform only one copy, per lane zero, will be generated. -class VPReplicateRecipe : public VPRecipeWithIRFlags, public VPValue { +class VPReplicateRecipe : public VPRecipeWithIRFlags { /// Indicator if only a single replica per lane is needed. bool IsUniform; @@ -1895,7 +1960,7 @@ class VPReplicateRecipe : public VPRecipeWithIRFlags, public VPValue { VPReplicateRecipe(Instruction *I, iterator_range Operands, bool IsUniform, VPValue *Mask = nullptr) : VPRecipeWithIRFlags(VPDef::VPReplicateSC, Operands, *I), - VPValue(this, I), IsUniform(IsUniform), IsPredicated(Mask) { + IsUniform(IsUniform), IsPredicated(Mask) { if (Mask) addOperand(Mask); } @@ -1993,12 +2058,12 @@ class VPBranchOnMaskRecipe : public VPRecipeBase { /// order to merge values that are set under such a branch and feed their uses. /// The phi nodes can be scalar or vector depending on the users of the value. /// This recipe works in concert with VPBranchOnMaskRecipe. -class VPPredInstPHIRecipe : public VPRecipeBase, public VPValue { +class VPPredInstPHIRecipe : public VPSingleDefRecipe { public: /// Construct a VPPredInstPHIRecipe given \p PredInst whose value needs a phi /// nodes after merging back from a Branch-on-Mask. VPPredInstPHIRecipe(VPValue *PredV) - : VPRecipeBase(VPDef::VPPredInstPHISC, PredV), VPValue(this) {} + : VPSingleDefRecipe(VPDef::VPPredInstPHISC, PredV) {} ~VPPredInstPHIRecipe() override = default; VP_CLASSOF_IMPL(VPDef::VPPredInstPHISC) @@ -2119,14 +2184,13 @@ class VPWidenMemoryInstructionRecipe : public VPRecipeBase { }; /// Recipe to expand a SCEV expression. 
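
The hunk below, like the ones around it, is the same mechanical rewrite applied to one more recipe: instead of inheriting VPValue itself and remembering to pass `this` (plus any underlying IR value) along, the recipe now lets the shared VPSingleDefRecipe constructor do that hand-off once. The shape of the change, reduced to a stand-alone sketch with invented names:

    #include <cassert>

    struct RecipeBase {
      unsigned DefID;
      explicit RecipeBase(unsigned ID) : DefID(ID) {}
    };
    struct Value {
      void *DefiningRecipe; // back-pointer to whichever recipe defines us
      void *UnderlyingIR;   // optional underlying IR value, may be null
      explicit Value(void *Def, void *UV = nullptr)
          : DefiningRecipe(Def), UnderlyingIR(UV) {}
    };

    // Before: each recipe wires up Value by hand in its initializer list.
    struct OldStyleRecipe : RecipeBase, Value {
      explicit OldStyleRecipe(void *UV) : RecipeBase(1), Value(this, UV) {}
    };

    // After: one shared base performs the hand-off; concrete recipes only
    // forward their kind ID and underlying value.
    struct SingleDefRecipe : RecipeBase, Value {
      SingleDefRecipe(unsigned ID, void *UV = nullptr)
          : RecipeBase(ID), Value(this, UV) {}
    };
    struct NewStyleRecipe : SingleDefRecipe {
      explicit NewStyleRecipe(void *UV) : SingleDefRecipe(1, UV) {}
    };

    int main() {
      int FakeIR = 0;
      NewStyleRecipe R(&FakeIR);
      assert(R.UnderlyingIR == &FakeIR); // base received the underlying IR
      assert(R.DefID == 1);              // ...and the forwarded kind ID
      return 0;
    }
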
-class VPExpandSCEVRecipe : public VPRecipeBase, public VPValue { +class VPExpandSCEVRecipe : public VPSingleDefRecipe { const SCEV *Expr; ScalarEvolution &SE; public: VPExpandSCEVRecipe(const SCEV *Expr, ScalarEvolution &SE) - : VPRecipeBase(VPDef::VPExpandSCEVSC, {}), VPValue(this), Expr(Expr), - SE(SE) {} + : VPSingleDefRecipe(VPDef::VPExpandSCEVSC, {}), Expr(Expr), SE(SE) {} ~VPExpandSCEVRecipe() override = default; @@ -2225,11 +2289,10 @@ class VPActiveLaneMaskPHIRecipe : public VPHeaderPHIRecipe { }; /// A Recipe for widening the canonical induction variable of the vector loop. -class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { +class VPWidenCanonicalIVRecipe : public VPSingleDefRecipe { public: VPWidenCanonicalIVRecipe(VPCanonicalIVPHIRecipe *CanonicalIV) - : VPRecipeBase(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}), - VPValue(this) {} + : VPSingleDefRecipe(VPDef::VPWidenCanonicalIVSC, {CanonicalIV}) {} ~VPWidenCanonicalIVRecipe() override = default; @@ -2256,7 +2319,7 @@ class VPWidenCanonicalIVRecipe : public VPRecipeBase, public VPValue { /// A recipe for converting the canonical IV value to the corresponding value of /// an IV with different start and step values, using Start + CanonicalIV * /// Step. -class VPDerivedIVRecipe : public VPRecipeBase, public VPValue { +class VPDerivedIVRecipe : public VPSingleDefRecipe { /// If not nullptr, the result of the induction will get truncated to /// TruncResultTy. Type *TruncResultTy; @@ -2271,8 +2334,8 @@ class VPDerivedIVRecipe : public VPRecipeBase, public VPValue { VPDerivedIVRecipe(const InductionDescriptor &IndDesc, VPValue *Start, VPCanonicalIVPHIRecipe *CanonicalIV, VPValue *Step, Type *TruncResultTy) - : VPRecipeBase(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}), - VPValue(this), TruncResultTy(TruncResultTy), Kind(IndDesc.getKind()), + : VPSingleDefRecipe(VPDef::VPDerivedIVSC, {Start, CanonicalIV, Step}), + TruncResultTy(TruncResultTy), Kind(IndDesc.getKind()), FPBinOp(dyn_cast_or_null(IndDesc.getInductionBinOp())) { } @@ -2309,7 +2372,7 @@ class VPDerivedIVRecipe : public VPRecipeBase, public VPValue { /// A recipe for handling phi nodes of integer and floating-point inductions, /// producing their scalar values. 
-class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, public VPValue { +class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags { Instruction::BinaryOps InductionOpcode; public: @@ -2317,7 +2380,7 @@ class VPScalarIVStepsRecipe : public VPRecipeWithIRFlags, public VPValue { Instruction::BinaryOps Opcode, FastMathFlags FMFs) : VPRecipeWithIRFlags(VPDef::VPScalarIVStepsSC, ArrayRef({IV, Step}), FMFs), - VPValue(this), InductionOpcode(Opcode) {} + InductionOpcode(Opcode) {} VPScalarIVStepsRecipe(const InductionDescriptor &IndDesc, VPValue *IV, VPValue *Step) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index c553e2c9e7683..78e64e7ffb126 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -258,7 +258,7 @@ VPInstruction::VPInstruction(unsigned Opcode, CmpInst::Predicate Pred, const Twine &Name) : VPRecipeWithIRFlags(VPDef::VPInstructionSC, ArrayRef({A, B}), Pred, DL), - VPValue(this), Opcode(Opcode), Name(Name.str()) { + Opcode(Opcode), Name(Name.str()) { assert(Opcode == Instruction::ICmp && "only ICmp predicates supported at the moment"); } @@ -267,7 +267,7 @@ VPInstruction::VPInstruction(unsigned Opcode, std::initializer_list Operands, FastMathFlags FMFs, DebugLoc DL, const Twine &Name) : VPRecipeWithIRFlags(VPDef::VPInstructionSC, Operands, FMFs, DL), - VPValue(this), Opcode(Opcode), Name(Name.str()) { + Opcode(Opcode), Name(Name.str()) { // Make sure the VPInstruction is a floating-point operation. assert(isFPMathOp() && "this op can't take fast-math flags"); } diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index b3694e74a3850..38671bafb4277 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -103,7 +103,7 @@ static bool sinkScalarOperands(VPlan &Plan) { bool Changed = false; // First, collect the operands of all recipes in replicate blocks as seeds for // sinking. - SetVector> WorkList; + SetVector> WorkList; for (VPRegionBlock *VPR : VPBlockUtils::blocksOnly(Iter)) { VPBasicBlock *EntryVPBB = VPR->getEntryBasicBlock(); if (!VPR->isReplicator() || EntryVPBB->getSuccessors().size() != 2) @@ -113,7 +113,8 @@ static bool sinkScalarOperands(VPlan &Plan) { continue; for (auto &Recipe : *VPBB) { for (VPValue *Op : Recipe.operands()) - if (auto *Def = Op->getDefiningRecipe()) + if (auto *Def = + dyn_cast_or_null(Op->getDefiningRecipe())) WorkList.insert(std::make_pair(VPBB, Def)); } } @@ -122,7 +123,7 @@ static bool sinkScalarOperands(VPlan &Plan) { // Try to sink each replicate or scalar IV steps recipe in the worklist. for (unsigned I = 0; I != WorkList.size(); ++I) { VPBasicBlock *SinkTo; - VPRecipeBase *SinkCandidate; + VPSingleDefRecipe *SinkCandidate; std::tie(SinkTo, SinkCandidate) = WorkList[I]; if (SinkCandidate->getParent() == SinkTo || SinkCandidate->mayHaveSideEffects() || @@ -146,12 +147,11 @@ static bool sinkScalarOperands(VPlan &Plan) { return false; if (UI->getParent() == SinkTo) return true; - NeedsDuplicating = - UI->onlyFirstLaneUsed(SinkCandidate->getVPSingleValue()); + NeedsDuplicating = UI->onlyFirstLaneUsed(SinkCandidate); // We only know how to duplicate VPRecipeRecipes for now. 
return NeedsDuplicating && isa(SinkCandidate); }; - if (!all_of(SinkCandidate->getVPSingleValue()->users(), CanSinkWithUser)) + if (!all_of(SinkCandidate->users(), CanSinkWithUser)) continue; if (NeedsDuplicating) { @@ -163,14 +163,14 @@ static bool sinkScalarOperands(VPlan &Plan) { // TODO: add ".cloned" suffix to name of Clone's VPValue. Clone->insertBefore(SinkCandidate); - SinkCandidate->getVPSingleValue()->replaceUsesWithIf( - Clone, [SinkTo](VPUser &U, unsigned) { - return cast(&U)->getParent() != SinkTo; - }); + SinkCandidate->replaceUsesWithIf(Clone, [SinkTo](VPUser &U, unsigned) { + return cast(&U)->getParent() != SinkTo; + }); } SinkCandidate->moveBefore(*SinkTo, SinkTo->getFirstNonPhi()); for (VPValue *Op : SinkCandidate->operands()) - if (auto *Def = Op->getDefiningRecipe()) + if (auto *Def = + dyn_cast_or_null(Op->getDefiningRecipe())) WorkList.insert(std::make_pair(SinkTo, Def)); Changed = true; } @@ -412,16 +412,15 @@ void VPlanTransforms::removeRedundantInductionCasts(VPlan &Plan) { auto &Casts = IV->getInductionDescriptor().getCastInsts(); VPValue *FindMyCast = IV; for (Instruction *IRCast : reverse(Casts)) { - VPRecipeBase *FoundUserCast = nullptr; + VPSingleDefRecipe *FoundUserCast = nullptr; for (auto *U : FindMyCast->users()) { - auto *UserCast = cast(U); - if (UserCast->getNumDefinedValues() == 1 && - UserCast->getVPSingleValue()->getUnderlyingValue() == IRCast) { + auto *UserCast = dyn_cast(U); + if (UserCast && UserCast->getUnderlyingValue() == IRCast) { FoundUserCast = UserCast; break; } } - FindMyCast = FoundUserCast->getVPSingleValue(); + FindMyCast = FoundUserCast; } FindMyCast->replaceAllUsesWith(IV); } @@ -1139,7 +1138,7 @@ void VPlanTransforms::addActiveLaneMask( "Must have widened canonical IV when tail folding!"); auto *WideCanonicalIV = cast(*FoundWidenCanonicalIVUser); - VPRecipeBase *LaneMask; + VPSingleDefRecipe *LaneMask; if (UseActiveLaneMaskForControlFlow) { LaneMask = addVPLaneMaskPhiAndUpdateExitBranch( Plan, DataAndControlFlowWithoutRuntimeCheck); @@ -1164,7 +1163,7 @@ void VPlanTransforms::addActiveLaneMask( assert(CompareToReplace->getOperand(0) == WideCanonicalIV && "WidenCanonicalIV must be the first operand of the compare"); - CompareToReplace->replaceAllUsesWith(LaneMask->getVPSingleValue()); + CompareToReplace->replaceAllUsesWith(LaneMask); CompareToReplace->eraseFromParent(); } } From 0185c764563f5b58b315cf80325c52e2aedf5b90 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= Date: Fri, 19 Jan 2024 11:32:02 +0100 Subject: [PATCH 105/843] [AMDGPU] Fix test for expensive-checks build (#78687) --- .../AMDGPU/vgpr-mark-last-scratch-load.mir | 132 +++++++++--------- 1 file changed, 69 insertions(+), 63 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.mir b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.mir index ab80b7d20984b..cee45216968df 100644 --- a/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.mir +++ b/llvm/test/CodeGen/AMDGPU/vgpr-mark-last-scratch-load.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3 -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -o - %s -run-pass=greedy -run-pass=amdgpu-mark-last-scratch-load | FileCheck -check-prefix=CHECK %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -o - %s -run-pass=greedy -run-pass=amdgpu-mark-last-scratch-load -verify-machineinstrs | FileCheck -check-prefix=CHECK %s --- | define amdgpu_cs void @test_spill_12x32() "amdgpu-num-vgpr"="12" { @@ -15,6 +15,8 @@ 
--- name: test_spill_12x32 tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 @@ -22,31 +24,31 @@ body: | ; CHECK-LABEL: name: test_spill_12x32 ; CHECK: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr1, %stack.1, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr2, %stack.2, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr3, %stack.3, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr4, %stack.4, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr5, %stack.5, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr6, %stack.6, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr7, %stack.7, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr8, %stack.8, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr9, %stack.9, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr10, %stack.10, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr11, %stack.11, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr1, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr2, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr3, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr4, %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr5, %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr6, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr7, %stack.7, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr8, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr9, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr10, %stack.10, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr11, %stack.11, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.0, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.1, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.2, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.3, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.4, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.5, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE6:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.6, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE7:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.7, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.7, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE8:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.8, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.8, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE9:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.9, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.9, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE10:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.10, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.10, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE11:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.11, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.11, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.5, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE6:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.6, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE7:%[0-9]+]]:vgpr_32 = 
SI_SPILL_V32_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.7, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE8:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.8, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE9:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.9, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE10:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.10, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE11:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.11, addrspace 5) ; CHECK-NEXT: S_ENDPGM 0, implicit [[SI_SPILL_V32_RESTORE]], implicit [[SI_SPILL_V32_RESTORE1]], implicit [[SI_SPILL_V32_RESTORE2]], implicit [[SI_SPILL_V32_RESTORE3]], implicit [[SI_SPILL_V32_RESTORE4]], implicit [[SI_SPILL_V32_RESTORE5]], implicit [[SI_SPILL_V32_RESTORE6]], implicit [[SI_SPILL_V32_RESTORE7]], implicit [[SI_SPILL_V32_RESTORE8]], implicit [[SI_SPILL_V32_RESTORE9]], implicit [[SI_SPILL_V32_RESTORE10]], implicit [[SI_SPILL_V32_RESTORE11]] %0:vgpr_32 = COPY $vgpr0 %1:vgpr_32 = COPY $vgpr1 @@ -67,6 +69,8 @@ body: | --- name: test_spill_384 tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' body: | bb.0: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 @@ -74,9 +78,9 @@ body: | ; CHECK-LABEL: name: test_spill_384 ; CHECK: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: SI_SPILL_V384_SAVE $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, %stack.0, $sp_reg, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V384_SAVE $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11, %stack.0, $sgpr32, 0, implicit $exec :: (store (s384) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 - ; CHECK-NEXT: [[SI_SPILL_V384_RESTORE:%[0-9]+]]:vreg_384 = SI_SPILL_V384_RESTORE %stack.0, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s384) from %stack.0, align 4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V384_RESTORE:%[0-9]+]]:vreg_384 = SI_SPILL_V384_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s384) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_ENDPGM 0, implicit [[SI_SPILL_V384_RESTORE]] %0:vreg_384 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 INLINEASM &"", 1 /* sideeffect attdialect */, 12 /* clobber */, implicit-def early-clobber $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11 @@ -86,30 +90,32 @@ body: | --- name: test_loop_12 tracksRegLiveness: true +machineFunctionInfo: + stackPtrOffsetReg: '$sgpr32' body: | ; CHECK-LABEL: name: test_loop_12 ; CHECK: bb.0: ; CHECK-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr12, %stack.0, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.0, 
addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr11, %stack.1, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr10, %stack.2, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr9, %stack.3, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr8, %stack.4, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr7, %stack.5, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr6, %stack.6, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr5, %stack.7, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr4, %stack.8, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr3, %stack.9, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr2, %stack.10, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr1, %stack.11, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.12, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr12, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr11, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr10, %stack.2, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr9, %stack.3, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.3, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr8, %stack.4, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.4, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr7, %stack.5, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.5, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr6, %stack.6, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.6, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr5, %stack.7, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.7, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr4, %stack.8, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.8, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr3, %stack.9, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.9, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr2, %stack.10, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.10, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr1, %stack.11, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.11, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE $vgpr0, %stack.12, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.12, addrspace 5) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: [[V_MOV_B32_e32_3:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_3]], %stack.16, $sp_reg, 0, implicit $exec :: (store (s32) into 
%stack.16, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_3]], %stack.16, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5) ; CHECK-NEXT: %res5:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: %res6:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: %res7:vgpr_32 = V_MOV_B32_e32 0, implicit $exec @@ -118,7 +124,7 @@ body: | ; CHECK-NEXT: %res10:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: %res11:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK-NEXT: %res12:vgpr_32 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.12, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5) ; CHECK-NEXT: %vcmp:sreg_32 = V_CMP_LT_I32_e64 0, [[SI_SPILL_V32_RESTORE]], implicit $exec ; CHECK-NEXT: %mask:sreg_32 = COPY $exec_lo, implicit-def $exec_lo ; CHECK-NEXT: %sand:sreg_32 = S_AND_B32 %mask, %vcmp, implicit-def dead $scc @@ -154,39 +160,39 @@ body: | ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.4(0x04000000), %bb.3(0x7c000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_1]], %stack.14, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.11, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_1]], %stack.14, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.14, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.11, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.11, addrspace 5) ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_]], 0, [[SI_SPILL_V32_RESTORE1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.14, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.14, addrspace 5) - ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_]], %stack.13, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.10, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5) + ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.14, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.14, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_]], %stack.13, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.13, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.10, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.10, addrspace 5) ; CHECK-NEXT: [[V_MOV_B32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_1]], 0, [[SI_SPILL_V32_RESTORE2]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.9, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE3:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.9, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.9, addrspace 5) ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_2]], 0, 
[[SI_SPILL_V32_RESTORE3]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_2]], %stack.15, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5) - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.8, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_2]], %stack.15, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.15, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE4:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.8, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.8, addrspace 5) ; CHECK-NEXT: [[V_MOV_B32_e32_4:%[0-9]+]]:vgpr_32 = V_ADD_F32_e64 0, [[V_MOV_B32_e32_4]], 0, [[SI_SPILL_V32_RESTORE4]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.7, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE5:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.7, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.7, addrspace 5) ; CHECK-NEXT: %res5:vgpr_32 = V_ADD_F32_e64 0, %res5, 0, [[SI_SPILL_V32_RESTORE5]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE6:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE6:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.6, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.6, addrspace 5) ; CHECK-NEXT: %res6:vgpr_32 = V_ADD_F32_e64 0, %res6, 0, [[SI_SPILL_V32_RESTORE6]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE7:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE7:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.5, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.5, addrspace 5) ; CHECK-NEXT: %res7:vgpr_32 = V_ADD_F32_e64 0, %res7, 0, [[SI_SPILL_V32_RESTORE7]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE8:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE8:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.4, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.4, addrspace 5) ; CHECK-NEXT: %res8:vgpr_32 = V_ADD_F32_e64 0, %res8, 0, [[SI_SPILL_V32_RESTORE8]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE9:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE9:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.3, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.3, addrspace 5) ; CHECK-NEXT: %res9:vgpr_32 = V_ADD_F32_e64 0, %res9, 0, [[SI_SPILL_V32_RESTORE9]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE10:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE10:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.2, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: %res10:vgpr_32 = V_ADD_F32_e64 0, %res10, 0, [[SI_SPILL_V32_RESTORE10]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE11:%[0-9]+]]:vgpr_32 = 
SI_SPILL_V32_RESTORE %stack.1, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE11:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; CHECK-NEXT: %res11:vgpr_32 = V_ADD_F32_e64 0, %res11, 0, [[SI_SPILL_V32_RESTORE11]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE12:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE12:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, addrspace 5) ; CHECK-NEXT: %res12:vgpr_32 = V_ADD_F32_e64 0, %res12, 0, [[SI_SPILL_V32_RESTORE12]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.13, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5) + ; CHECK-NEXT: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.13, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.13, addrspace 5) ; CHECK-NEXT: %count:sgpr_32 = nuw nsw S_ADD_I32 %count, 1, implicit-def dead $scc - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE13:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.12, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE13:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.12, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.12, addrspace 5) ; CHECK-NEXT: %vcmp2:sreg_32 = V_CMP_GE_I32_e64 %count, [[SI_SPILL_V32_RESTORE13]], implicit $exec - ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.15, $sp_reg, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5) + ; CHECK-NEXT: [[V_MOV_B32_e32_2:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.15, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.15, addrspace 5) ; CHECK-NEXT: %mask2:sgpr_32 = S_OR_B32 %vcmp2, %mask2, implicit-def $scc ; CHECK-NEXT: $exec_lo = S_ANDN2_B32_term $exec_lo, %mask2, implicit-def $scc ; CHECK-NEXT: S_CBRANCH_EXECNZ %bb.3, implicit $exec @@ -196,11 +202,11 @@ body: | ; CHECK-NEXT: successors: %bb.2(0x80000000) ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $exec_lo = S_OR_B32 $exec_lo, %mask2, implicit-def $scc - ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_4]], %stack.16, $sp_reg, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5) + ; CHECK-NEXT: SI_SPILL_V32_SAVE [[V_MOV_B32_e32_4]], %stack.16, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.16, addrspace 5) ; CHECK-NEXT: S_BRANCH %bb.2 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE14:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.16, $sp_reg, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.16, addrspace 5) + ; CHECK-NEXT: [[SI_SPILL_V32_RESTORE14:%[0-9]+]]:vgpr_32 = SI_SPILL_V32_RESTORE %stack.16, $sgpr32, 0, implicit $exec :: ("amdgpu-last-use" load (s32) from %stack.16, addrspace 5) ; CHECK-NEXT: EXP_DONE 0, [[V_MOV_B32_e32_]], [[V_MOV_B32_e32_1]], [[V_MOV_B32_e32_2]], [[SI_SPILL_V32_RESTORE14]], -1, 0, 15, implicit $exec ; CHECK-NEXT: EXP_DONE 0, %res5, %res6, %res7, %res8, -1, 0, 15, implicit $exec ; CHECK-NEXT: EXP_DONE 0, %res9, %res10, %res11, %res12, -1, 0, 15, implicit $exec From a90347ecc7e32c25a3b3820723f8405369f3abdc Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Thu, 18 Jan 2024 15:39:57 +0000 Subject: [PATCH 106/843] Revert "Revert "[Flang][OpenMP] NFC: 
Minor refactoring of Reduction lowering code" (#73139)" This reverts commit c2b3f16fb595fa88bfd21b455785c59ac6a21ed4. --- flang/lib/Lower/OpenMP.cpp | 778 ++++++++++++++++++++----------------- 1 file changed, 424 insertions(+), 354 deletions(-) diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index 4b0582a6f129f..0aaf8f189a0ec 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -724,276 +724,438 @@ static void checkMapType(mlir::Location location, mlir::Type type) { TODO(location, "OMPD_target_data MapOperand BoxType"); } -static std::string getReductionName(llvm::StringRef name, mlir::Type ty) { - return (llvm::Twine(name) + - (ty.isIntOrIndex() ? llvm::Twine("_i_") : llvm::Twine("_f_")) + - llvm::Twine(ty.getIntOrFloatBitWidth())) - .str(); -} +class ReductionProcessor { +public: + enum IntrinsicProc { MAX, MIN, IAND, IOR, IEOR }; + static IntrinsicProc + getReductionType(const Fortran::parser::ProcedureDesignator &pd) { + auto redType = llvm::StringSwitch>( + getRealName(pd).ToString()) + .Case("max", IntrinsicProc::MAX) + .Case("min", IntrinsicProc::MIN) + .Case("iand", IntrinsicProc::IAND) + .Case("ior", IntrinsicProc::IOR) + .Case("ieor", IntrinsicProc::IEOR) + .Default(std::nullopt); + assert(redType && "Invalid Reduction"); + return *redType; + } -static std::string getReductionName( - Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, - mlir::Type ty) { - std::string reductionName; + static bool supportedIntrinsicProcReduction( + const Fortran::parser::ProcedureDesignator &pd) { + const auto *name{Fortran::parser::Unwrap(pd)}; + assert(name && "Invalid Reduction Intrinsic."); + auto redType = llvm::StringSwitch>( + getRealName(name).ToString()) + .Case("max", IntrinsicProc::MAX) + .Case("min", IntrinsicProc::MIN) + .Case("iand", IntrinsicProc::IAND) + .Case("ior", IntrinsicProc::IOR) + .Case("ieor", IntrinsicProc::IEOR) + .Default(std::nullopt); + if (redType) + return true; + return false; + } - switch (intrinsicOp) { - case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: - reductionName = "add_reduction"; - break; - case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: - reductionName = "multiply_reduction"; - break; - case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: - return "and_reduction"; - case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: - return "eqv_reduction"; - case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: - return "or_reduction"; - case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: - return "neqv_reduction"; - default: - reductionName = "other_reduction"; - break; + static const Fortran::semantics::SourceName + getRealName(const Fortran::parser::Name *name) { + return name->symbol->GetUltimate().name(); } - return getReductionName(reductionName, ty); -} + static const Fortran::semantics::SourceName + getRealName(const Fortran::parser::ProcedureDesignator &pd) { + const auto *name{Fortran::parser::Unwrap(pd)}; + assert(name && "Invalid Reduction Intrinsic."); + return getRealName(name); + } -/// This function returns the identity value of the operator \p reductionOpName. 
-/// For example: -/// 0 + x = x, -/// 1 * x = x -static int getOperationIdentity(llvm::StringRef reductionOpName, - mlir::Location loc) { - if (reductionOpName.contains("add") || reductionOpName.contains("or") || - reductionOpName.contains("neqv")) - return 0; - if (reductionOpName.contains("multiply") || reductionOpName.contains("and") || - reductionOpName.contains("eqv")) - return 1; - TODO(loc, "Reduction of some intrinsic operators is not supported"); -} + static std::string getReductionName(llvm::StringRef name, mlir::Type ty) { + return (llvm::Twine(name) + + (ty.isIntOrIndex() ? llvm::Twine("_i_") : llvm::Twine("_f_")) + + llvm::Twine(ty.getIntOrFloatBitWidth())) + .str(); + } + + static std::string getReductionName( + Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, + mlir::Type ty) { + std::string reductionName; + + switch (intrinsicOp) { + case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: + reductionName = "add_reduction"; + break; + case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: + reductionName = "multiply_reduction"; + break; + case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: + return "and_reduction"; + case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: + return "eqv_reduction"; + case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: + return "or_reduction"; + case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: + return "neqv_reduction"; + default: + reductionName = "other_reduction"; + break; + } -static mlir::Value getReductionInitValue(mlir::Location loc, mlir::Type type, - llvm::StringRef reductionOpName, - fir::FirOpBuilder &builder) { - assert((fir::isa_integer(type) || fir::isa_real(type) || - type.isa()) && - "only integer, logical and real types are currently supported"); - if (reductionOpName.contains("max")) { - if (auto ty = type.dyn_cast()) { - const llvm::fltSemantics &sem = ty.getFloatSemantics(); - return builder.createRealConstant( - loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/true)); + return getReductionName(reductionName, ty); + } + + /// This function returns the identity value of the operator \p + /// reductionOpName. 
For example: + /// 0 + x = x, + /// 1 * x = x + static int getOperationIdentity( + Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, + mlir::Location loc) { + switch (intrinsicOp) { + case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: + case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: + case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: + return 0; + case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: + case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: + case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: + return 1; + default: + TODO(loc, "Reduction of some intrinsic operators is not supported"); } - unsigned bits = type.getIntOrFloatBitWidth(); - int64_t minInt = llvm::APInt::getSignedMinValue(bits).getSExtValue(); - return builder.createIntegerConstant(loc, type, minInt); - } else if (reductionOpName.contains("min")) { - if (auto ty = type.dyn_cast()) { - const llvm::fltSemantics &sem = ty.getFloatSemantics(); - return builder.createRealConstant( - loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/false)); + } + + static mlir::Value getIntrinsicProcInitValue( + mlir::Location loc, mlir::Type type, + const Fortran::parser::ProcedureDesignator &procDesignator, + fir::FirOpBuilder &builder) { + assert((fir::isa_integer(type) || fir::isa_real(type) || + type.isa()) && + "only integer, logical and real types are currently supported"); + switch (getReductionType(procDesignator)) { + case IntrinsicProc::MAX: { + if (auto ty = type.dyn_cast()) { + const llvm::fltSemantics &sem = ty.getFloatSemantics(); + return builder.createRealConstant( + loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/true)); + } + unsigned bits = type.getIntOrFloatBitWidth(); + int64_t minInt = llvm::APInt::getSignedMinValue(bits).getSExtValue(); + return builder.createIntegerConstant(loc, type, minInt); } - unsigned bits = type.getIntOrFloatBitWidth(); - int64_t maxInt = llvm::APInt::getSignedMaxValue(bits).getSExtValue(); - return builder.createIntegerConstant(loc, type, maxInt); - } else if (reductionOpName.contains("ior")) { - unsigned bits = type.getIntOrFloatBitWidth(); - int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); - return builder.createIntegerConstant(loc, type, zeroInt); - } else if (reductionOpName.contains("ieor")) { - unsigned bits = type.getIntOrFloatBitWidth(); - int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); - return builder.createIntegerConstant(loc, type, zeroInt); - } else if (reductionOpName.contains("iand")) { - unsigned bits = type.getIntOrFloatBitWidth(); - int64_t allOnInt = llvm::APInt::getAllOnes(bits).getSExtValue(); - return builder.createIntegerConstant(loc, type, allOnInt); - } else { + case IntrinsicProc::MIN: { + if (auto ty = type.dyn_cast()) { + const llvm::fltSemantics &sem = ty.getFloatSemantics(); + return builder.createRealConstant( + loc, type, llvm::APFloat::getLargest(sem, /*Negative=*/false)); + } + unsigned bits = type.getIntOrFloatBitWidth(); + int64_t maxInt = llvm::APInt::getSignedMaxValue(bits).getSExtValue(); + return builder.createIntegerConstant(loc, type, maxInt); + } + case IntrinsicProc::IOR: { + unsigned bits = type.getIntOrFloatBitWidth(); + int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); + return builder.createIntegerConstant(loc, type, zeroInt); + } + case IntrinsicProc::IEOR: { + unsigned bits = type.getIntOrFloatBitWidth(); + int64_t zeroInt = llvm::APInt::getZero(bits).getSExtValue(); + return 
builder.createIntegerConstant(loc, type, zeroInt); + } + case IntrinsicProc::IAND: { + unsigned bits = type.getIntOrFloatBitWidth(); + int64_t allOnInt = llvm::APInt::getAllOnes(bits).getSExtValue(); + return builder.createIntegerConstant(loc, type, allOnInt); + } + } + llvm_unreachable("Unknown Reduction Intrinsic"); + } + + static mlir::Value getIntrinsicOpInitValue( + mlir::Location loc, mlir::Type type, + Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, + fir::FirOpBuilder &builder) { if (type.isa()) return builder.create( loc, type, - builder.getFloatAttr( - type, (double)getOperationIdentity(reductionOpName, loc))); + builder.getFloatAttr(type, + (double)getOperationIdentity(intrinsicOp, loc))); if (type.isa()) { mlir::Value intConst = builder.create( loc, builder.getI1Type(), builder.getIntegerAttr(builder.getI1Type(), - getOperationIdentity(reductionOpName, loc))); + getOperationIdentity(intrinsicOp, loc))); return builder.createConvert(loc, type, intConst); } return builder.create( loc, type, - builder.getIntegerAttr(type, - getOperationIdentity(reductionOpName, loc))); + builder.getIntegerAttr(type, getOperationIdentity(intrinsicOp, loc))); } -} - -template -static mlir::Value getReductionOperation(fir::FirOpBuilder &builder, - mlir::Type type, mlir::Location loc, - mlir::Value op1, mlir::Value op2) { - assert(type.isIntOrIndexOrFloat() && - "only integer and float types are currently supported"); - if (type.isIntOrIndex()) - return builder.create(loc, op1, op2); - return builder.create(loc, op1, op2); -} -static mlir::omp::ReductionDeclareOp -createMinimalReductionDecl(fir::FirOpBuilder &builder, - llvm::StringRef reductionOpName, mlir::Type type, - mlir::Location loc) { - mlir::ModuleOp module = builder.getModule(); - mlir::OpBuilder modBuilder(module.getBodyRegion()); - - mlir::omp::ReductionDeclareOp decl = - modBuilder.create(loc, reductionOpName, - type); - builder.createBlock(&decl.getInitializerRegion(), - decl.getInitializerRegion().end(), {type}, {loc}); - builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); - mlir::Value init = getReductionInitValue(loc, type, reductionOpName, builder); - builder.create(loc, init); - - builder.createBlock(&decl.getReductionRegion(), - decl.getReductionRegion().end(), {type, type}, - {loc, loc}); - - return decl; -} + template + static mlir::Value getReductionOperation(fir::FirOpBuilder &builder, + mlir::Type type, mlir::Location loc, + mlir::Value op1, mlir::Value op2) { + assert(type.isIntOrIndexOrFloat() && + "only integer and float types are currently supported"); + if (type.isIntOrIndex()) + return builder.create(loc, op1, op2); + return builder.create(loc, op1, op2); + } -/// Creates an OpenMP reduction declaration and inserts it into the provided -/// symbol table. The declaration has a constant initializer with the neutral -/// value `initValue`, and the reduction combiner carried over from `reduce`. -/// TODO: Generalize this for non-integer types, add atomic region. -static mlir::omp::ReductionDeclareOp -createReductionDecl(fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, - const Fortran::parser::ProcedureDesignator &procDesignator, - mlir::Type type, mlir::Location loc) { - mlir::OpBuilder::InsertionGuard guard(builder); - mlir::ModuleOp module = builder.getModule(); - - auto decl = - module.lookupSymbol(reductionOpName); - if (decl) - return decl; + /// Creates an OpenMP reduction declaration and inserts it into the provided + /// symbol table. 
The declaration has a constant initializer with the neutral + /// value `initValue`, and the reduction combiner carried over from `reduce`. + /// TODO: Generalize this for non-integer types, add atomic region. + static mlir::omp::ReductionDeclareOp createReductionDecl( + fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, + const Fortran::parser::ProcedureDesignator &procDesignator, + mlir::Type type, mlir::Location loc) { + mlir::OpBuilder::InsertionGuard guard(builder); + mlir::ModuleOp module = builder.getModule(); + + auto decl = + module.lookupSymbol(reductionOpName); + if (decl) + return decl; - decl = createMinimalReductionDecl(builder, reductionOpName, type, loc); - builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); - mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); - mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); + mlir::OpBuilder modBuilder(module.getBodyRegion()); - mlir::Value reductionOp; - if (const auto *name{ - Fortran::parser::Unwrap(procDesignator)}) { - if (name->source == "max") { + decl = modBuilder.create( + loc, reductionOpName, type); + builder.createBlock(&decl.getInitializerRegion(), + decl.getInitializerRegion().end(), {type}, {loc}); + builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); + mlir::Value init = + getIntrinsicProcInitValue(loc, type, procDesignator, builder); + builder.create(loc, init); + + builder.createBlock(&decl.getReductionRegion(), + decl.getReductionRegion().end(), {type, type}, + {loc, loc}); + + builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); + mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); + mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); + + mlir::Value reductionOp; + switch (getReductionType(procDesignator)) { + case IntrinsicProc::MAX: reductionOp = getReductionOperation( builder, type, loc, op1, op2); - } else if (name->source == "min") { + break; + case IntrinsicProc::MIN: reductionOp = getReductionOperation( builder, type, loc, op1, op2); - } else if (name->source == "ior") { + break; + case IntrinsicProc::IOR: assert((type.isIntOrIndex()) && "only integer is expected"); reductionOp = builder.create(loc, op1, op2); - } else if (name->source == "ieor") { + break; + case IntrinsicProc::IEOR: assert((type.isIntOrIndex()) && "only integer is expected"); reductionOp = builder.create(loc, op1, op2); - } else if (name->source == "iand") { + break; + case IntrinsicProc::IAND: assert((type.isIntOrIndex()) && "only integer is expected"); reductionOp = builder.create(loc, op1, op2); - } else { - TODO(loc, "Reduction of some intrinsic operators is not supported"); + break; } + + builder.create(loc, reductionOp); + return decl; } - builder.create(loc, reductionOp); - return decl; -} + /// Creates an OpenMP reduction declaration and inserts it into the provided + /// symbol table. The declaration has a constant initializer with the neutral + /// value `initValue`, and the reduction combiner carried over from `reduce`. + /// TODO: Generalize this for non-integer types, add atomic region. 
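A quick standalone check of the neutral elements chosen above, outside of MLIR entirely. This is an illustrative C++ sketch, not part of the patch; it mirrors only the integer cases (for floating-point max/min, getIntrinsicProcInitValue instead uses the largest finite negative/positive value):

    #include <cassert>
    #include <climits>

    int main() {
      int x = 42;
      // Identity elements per getOperationIdentity: combining with them is a no-op.
      assert(0 + x == x);  // Add, OR, NEQV start from 0
      assert(1 * x == x);  // Multiply, AND, EQV start from 1
      // Identity elements per getIntrinsicProcInitValue (integer cases).
      assert((x | 0) == x);   // ior starts from zero
      assert((x ^ 0) == x);   // ieor starts from zero
      assert((x & ~0) == x);  // iand starts from all-ones
      assert((x > INT_MIN ? x : INT_MIN) == x);  // max starts from the signed min
      assert((x < INT_MAX ? x : INT_MAX) == x);  // min starts from the signed max
      return 0;
    }
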
+ static mlir::omp::ReductionDeclareOp createReductionDecl( + fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, + Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, + mlir::Type type, mlir::Location loc) { + mlir::OpBuilder::InsertionGuard guard(builder); + mlir::ModuleOp module = builder.getModule(); + + auto decl = + module.lookupSymbol(reductionOpName); + if (decl) + return decl; -/// Creates an OpenMP reduction declaration and inserts it into the provided -/// symbol table. The declaration has a constant initializer with the neutral -/// value `initValue`, and the reduction combiner carried over from `reduce`. -/// TODO: Generalize this for non-integer types, add atomic region. -static mlir::omp::ReductionDeclareOp createReductionDecl( - fir::FirOpBuilder &builder, llvm::StringRef reductionOpName, - Fortran::parser::DefinedOperator::IntrinsicOperator intrinsicOp, - mlir::Type type, mlir::Location loc) { - mlir::OpBuilder::InsertionGuard guard(builder); - mlir::ModuleOp module = builder.getModule(); - - auto decl = - module.lookupSymbol(reductionOpName); - if (decl) - return decl; + mlir::OpBuilder modBuilder(module.getBodyRegion()); - decl = createMinimalReductionDecl(builder, reductionOpName, type, loc); - builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); - mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); - mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); + decl = modBuilder.create( + loc, reductionOpName, type); + builder.createBlock(&decl.getInitializerRegion(), + decl.getInitializerRegion().end(), {type}, {loc}); + builder.setInsertionPointToEnd(&decl.getInitializerRegion().back()); + mlir::Value init = getIntrinsicOpInitValue(loc, type, intrinsicOp, builder); + builder.create(loc, init); - mlir::Value reductionOp; - switch (intrinsicOp) { - case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: - reductionOp = - getReductionOperation( - builder, type, loc, op1, op2); - break; - case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: - reductionOp = - getReductionOperation( - builder, type, loc, op1, op2); - break; - case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: { - mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); - mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); + builder.createBlock(&decl.getReductionRegion(), + decl.getReductionRegion().end(), {type, type}, + {loc, loc}); - mlir::Value andiOp = builder.create(loc, op1I1, op2I1); + builder.setInsertionPointToEnd(&decl.getReductionRegion().back()); + mlir::Value op1 = decl.getReductionRegion().front().getArgument(0); + mlir::Value op2 = decl.getReductionRegion().front().getArgument(1); - reductionOp = builder.createConvert(loc, type, andiOp); - break; - } - case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: { - mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); - mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); + mlir::Value reductionOp; + switch (intrinsicOp) { + case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: + reductionOp = + getReductionOperation( + builder, type, loc, op1, op2); + break; + case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: + reductionOp = + getReductionOperation( + builder, type, loc, op1, op2); + break; + case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: { + mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); + 
mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); - mlir::Value oriOp = builder.create(loc, op1I1, op2I1); + mlir::Value andiOp = + builder.create(loc, op1I1, op2I1); - reductionOp = builder.createConvert(loc, type, oriOp); - break; - } - case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: { - mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); - mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); + reductionOp = builder.createConvert(loc, type, andiOp); + break; + } + case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: { + mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); + mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); - mlir::Value cmpiOp = builder.create( - loc, mlir::arith::CmpIPredicate::eq, op1I1, op2I1); + mlir::Value oriOp = builder.create(loc, op1I1, op2I1); - reductionOp = builder.createConvert(loc, type, cmpiOp); - break; - } - case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: { - mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); - mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); + reductionOp = builder.createConvert(loc, type, oriOp); + break; + } + case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: { + mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); + mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); - mlir::Value cmpiOp = builder.create( - loc, mlir::arith::CmpIPredicate::ne, op1I1, op2I1); + mlir::Value cmpiOp = builder.create( + loc, mlir::arith::CmpIPredicate::eq, op1I1, op2I1); - reductionOp = builder.createConvert(loc, type, cmpiOp); - break; - } - default: - TODO(loc, "Reduction of some intrinsic operators is not supported"); + reductionOp = builder.createConvert(loc, type, cmpiOp); + break; + } + case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: { + mlir::Value op1I1 = builder.createConvert(loc, builder.getI1Type(), op1); + mlir::Value op2I1 = builder.createConvert(loc, builder.getI1Type(), op2); + + mlir::Value cmpiOp = builder.create( + loc, mlir::arith::CmpIPredicate::ne, op1I1, op2I1); + + reductionOp = builder.createConvert(loc, type, cmpiOp); + break; + } + default: + TODO(loc, "Reduction of some intrinsic operators is not supported"); + } + + builder.create(loc, reductionOp); + return decl; } - builder.create(loc, reductionOp); - return decl; -} + /// Creates a reduction declaration and associates it with an OpenMP block + /// directive. 
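The declarations built here are cached by symbol name (the lookupSymbol early return above), so the mangling from getReductionName is what keeps exactly one ReductionDeclareOp per operator/type pair. A self-contained sketch of that naming scheme; reductionName below is illustrative and not a function in the patch:

    #include <iostream>
    #include <string>

    // Mirrors ReductionProcessor::getReductionName: "<base>_i_<bits>" for
    // integer/index types and "<base>_f_<bits>" for floating-point types.
    static std::string reductionName(const std::string &base, bool isInteger,
                                     unsigned bitWidth) {
      return base + (isInteger ? "_i_" : "_f_") + std::to_string(bitWidth);
    }

    int main() {
      std::cout << reductionName("max", true, 32) << '\n';            // max_i_32
      std::cout << reductionName("add_reduction", false, 64) << '\n'; // add_reduction_f_64
      return 0;
    }
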
+ static void addReductionDecl( + mlir::Location currentLocation, + Fortran::lower::AbstractConverter &converter, + const Fortran::parser::OmpReductionClause &reduction, + llvm::SmallVectorImpl &reductionVars, + llvm::SmallVectorImpl &reductionDeclSymbols) { + fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + mlir::omp::ReductionDeclareOp decl; + const auto &redOperator{ + std::get(reduction.t)}; + const auto &objectList{ + std::get(reduction.t)}; + if (const auto &redDefinedOp = + std::get_if(&redOperator.u)) { + const auto &intrinsicOp{ + std::get( + redDefinedOp->u)}; + switch (intrinsicOp) { + case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: + case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: + case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: + case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: + case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: + case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: + break; + + default: + TODO(currentLocation, + "Reduction of some intrinsic operators is not supported"); + break; + } + for (const Fortran::parser::OmpObject &ompObject : objectList.v) { + if (const auto *name{ + Fortran::parser::Unwrap(ompObject)}) { + if (const Fortran::semantics::Symbol * symbol{name->symbol}) { + mlir::Value symVal = converter.getSymbolAddress(*symbol); + if (auto declOp = symVal.getDefiningOp()) + symVal = declOp.getBase(); + mlir::Type redType = + symVal.getType().cast().getEleTy(); + reductionVars.push_back(symVal); + if (redType.isa()) + decl = createReductionDecl( + firOpBuilder, + getReductionName(intrinsicOp, firOpBuilder.getI1Type()), + intrinsicOp, redType, currentLocation); + else if (redType.isIntOrIndexOrFloat()) { + decl = createReductionDecl(firOpBuilder, + getReductionName(intrinsicOp, redType), + intrinsicOp, redType, currentLocation); + } else { + TODO(currentLocation, "Reduction of some types is not supported"); + } + reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get( + firOpBuilder.getContext(), decl.getSymName())); + } + } + } + } else if (const auto *reductionIntrinsic = + std::get_if( + &redOperator.u)) { + if (ReductionProcessor::supportedIntrinsicProcReduction( + *reductionIntrinsic)) { + for (const Fortran::parser::OmpObject &ompObject : objectList.v) { + if (const auto *name{ + Fortran::parser::Unwrap(ompObject)}) { + if (const Fortran::semantics::Symbol * symbol{name->symbol}) { + mlir::Value symVal = converter.getSymbolAddress(*symbol); + if (auto declOp = symVal.getDefiningOp()) + symVal = declOp.getBase(); + mlir::Type redType = + symVal.getType().cast().getEleTy(); + reductionVars.push_back(symVal); + assert(redType.isIntOrIndexOrFloat() && + "Unsupported reduction type"); + decl = createReductionDecl( + firOpBuilder, + getReductionName(getRealName(*reductionIntrinsic).ToString(), + redType), + *reductionIntrinsic, redType, currentLocation); + reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get( + firOpBuilder.getContext(), decl.getSymName())); + } + } + } + } + } + } +}; static mlir::omp::ScheduleModifier translateScheduleModifier(const Fortran::parser::OmpScheduleModifierType &m) { @@ -1176,101 +1338,6 @@ static mlir::Value getIfClauseOperand( ifVal); } -/// Creates a reduction declaration and associates it with an OpenMP block -/// directive. 
-static void -addReductionDecl(mlir::Location currentLocation, - Fortran::lower::AbstractConverter &converter, - const Fortran::parser::OmpReductionClause &reduction, - llvm::SmallVectorImpl &reductionVars, - llvm::SmallVectorImpl &reductionDeclSymbols) { - fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); - mlir::omp::ReductionDeclareOp decl; - const auto &redOperator{ - std::get(reduction.t)}; - const auto &objectList{std::get(reduction.t)}; - if (const auto &redDefinedOp = - std::get_if(&redOperator.u)) { - const auto &intrinsicOp{ - std::get( - redDefinedOp->u)}; - switch (intrinsicOp) { - case Fortran::parser::DefinedOperator::IntrinsicOperator::Add: - case Fortran::parser::DefinedOperator::IntrinsicOperator::Multiply: - case Fortran::parser::DefinedOperator::IntrinsicOperator::AND: - case Fortran::parser::DefinedOperator::IntrinsicOperator::EQV: - case Fortran::parser::DefinedOperator::IntrinsicOperator::OR: - case Fortran::parser::DefinedOperator::IntrinsicOperator::NEQV: - break; - - default: - TODO(currentLocation, - "Reduction of some intrinsic operators is not supported"); - break; - } - for (const Fortran::parser::OmpObject &ompObject : objectList.v) { - if (const auto *name{ - Fortran::parser::Unwrap(ompObject)}) { - if (const Fortran::semantics::Symbol * symbol{name->symbol}) { - mlir::Value symVal = converter.getSymbolAddress(*symbol); - if (auto declOp = symVal.getDefiningOp()) - symVal = declOp.getBase(); - mlir::Type redType = - symVal.getType().cast().getEleTy(); - reductionVars.push_back(symVal); - if (redType.isa()) - decl = createReductionDecl( - firOpBuilder, - getReductionName(intrinsicOp, firOpBuilder.getI1Type()), - intrinsicOp, redType, currentLocation); - else if (redType.isIntOrIndexOrFloat()) { - decl = createReductionDecl(firOpBuilder, - getReductionName(intrinsicOp, redType), - intrinsicOp, redType, currentLocation); - } else { - TODO(currentLocation, "Reduction of some types is not supported"); - } - reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get( - firOpBuilder.getContext(), decl.getSymName())); - } - } - } - } else if (const auto *reductionIntrinsic = - std::get_if( - &redOperator.u)) { - if (const auto *name{Fortran::parser::Unwrap( - reductionIntrinsic)}) { - if ((name->source != "max") && (name->source != "min") && - (name->source != "ior") && (name->source != "ieor") && - (name->source != "iand")) { - TODO(currentLocation, - "Reduction of intrinsic procedures is not supported"); - } - std::string intrinsicOp = name->ToString(); - for (const Fortran::parser::OmpObject &ompObject : objectList.v) { - if (const auto *name{ - Fortran::parser::Unwrap(ompObject)}) { - if (const Fortran::semantics::Symbol * symbol{name->symbol}) { - mlir::Value symVal = converter.getSymbolAddress(*symbol); - if (auto declOp = symVal.getDefiningOp()) - symVal = declOp.getBase(); - mlir::Type redType = - symVal.getType().cast().getEleTy(); - reductionVars.push_back(symVal); - assert(redType.isIntOrIndexOrFloat() && - "Unsupported reduction type"); - decl = createReductionDecl( - firOpBuilder, getReductionName(intrinsicOp, redType), - *reductionIntrinsic, redType, currentLocation); - reductionDeclSymbols.push_back(mlir::SymbolRefAttr::get( - firOpBuilder.getContext(), decl.getSymName())); - } - } - } - } - } -} - static void addUseDeviceClause(Fortran::lower::AbstractConverter &converter, const Fortran::parser::OmpObjectList &useDeviceClause, @@ -1864,8 +1931,9 @@ bool ClauseProcessor::processReduction( return findRepeatableClause( [&](const 
ClauseTy::Reduction *reductionClause, const Fortran::parser::CharBlock &) { - addReductionDecl(currentLocation, converter, reductionClause->v, - reductionVars, reductionDeclSymbols); + ReductionProcessor rp; + rp.addReductionDecl(currentLocation, converter, reductionClause->v, + reductionVars, reductionDeclSymbols); }); } @@ -3891,48 +3959,50 @@ void Fortran::lower::genOpenMPReduction( } else if (const auto *reductionIntrinsic = std::get_if( &redOperator.u)) { - if (const auto *name{Fortran::parser::Unwrap( - reductionIntrinsic)}) { - std::string redName = name->ToString(); - if ((name->source != "max") && (name->source != "min") && - (name->source != "ior") && (name->source != "ieor") && - (name->source != "iand")) { - continue; - } - for (const Fortran::parser::OmpObject &ompObject : objectList.v) { - if (const auto *name{Fortran::parser::Unwrap( - ompObject)}) { - if (const Fortran::semantics::Symbol * symbol{name->symbol}) { - mlir::Value reductionVal = converter.getSymbolAddress(*symbol); - if (auto declOp = - reductionVal.getDefiningOp()) - reductionVal = declOp.getBase(); - for (const mlir::OpOperand &reductionValUse : - reductionVal.getUses()) { - if (auto loadOp = mlir::dyn_cast( - reductionValUse.getOwner())) { - mlir::Value loadVal = loadOp.getRes(); - // Max is lowered as a compare -> select. - // Match the pattern here. - mlir::Operation *reductionOp = - findReductionChain(loadVal, &reductionVal); - if (reductionOp == nullptr) - continue; - - if (redName == "max" || redName == "min") { - assert(mlir::isa(reductionOp) && - "Selection Op not found in reduction intrinsic"); - mlir::Operation *compareOp = - getCompareFromReductionOp(reductionOp, loadVal); - updateReduction(compareOp, firOpBuilder, loadVal, - reductionVal); - } - if (redName == "ior" || redName == "ieor" || - redName == "iand") { + if (!ReductionProcessor::supportedIntrinsicProcReduction( + *reductionIntrinsic)) + continue; + ReductionProcessor::IntrinsicProc redIntrinsicProc = + ReductionProcessor::getReductionType(*reductionIntrinsic); + for (const Fortran::parser::OmpObject &ompObject : objectList.v) { + if (const auto *name{ + Fortran::parser::Unwrap(ompObject)}) { + if (const Fortran::semantics::Symbol * symbol{name->symbol}) { + mlir::Value reductionVal = converter.getSymbolAddress(*symbol); + if (auto declOp = reductionVal.getDefiningOp()) + reductionVal = declOp.getBase(); + for (const mlir::OpOperand &reductionValUse : + reductionVal.getUses()) { + if (auto loadOp = mlir::dyn_cast( + reductionValUse.getOwner())) { + mlir::Value loadVal = loadOp.getRes(); + // Max is lowered as a compare -> select. + // Match the pattern here. 
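// Illustration (not patch content): for a Fortran `r = max(r, v)` the lowered
// IR being matched here has the shape
//   %cond = arith.cmpf ogt, %r, %v : f32
//   %sel  = arith.select %cond, %r, %v : f32
// findReductionChain walks from the load of the reduction variable to the
// select, and getCompareFromReductionOp then recovers %cond so that
// updateReduction can rewrite the chain.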
+ mlir::Operation *reductionOp = + findReductionChain(loadVal, &reductionVal); + if (reductionOp == nullptr) + continue; + + if (redIntrinsicProc == + ReductionProcessor::IntrinsicProc::MAX || + redIntrinsicProc == + ReductionProcessor::IntrinsicProc::MIN) { + assert(mlir::isa(reductionOp) && + "Selection Op not found in reduction intrinsic"); + mlir::Operation *compareOp = + getCompareFromReductionOp(reductionOp, loadVal); + updateReduction(compareOp, firOpBuilder, loadVal, + reductionVal); + } + if (redIntrinsicProc == + ReductionProcessor::IntrinsicProc::IOR || + redIntrinsicProc == + ReductionProcessor::IntrinsicProc::IEOR || + redIntrinsicProc == + ReductionProcessor::IntrinsicProc::IAND) { - updateReduction(reductionOp, firOpBuilder, loadVal, - reductionVal); - } + updateReduction(reductionOp, firOpBuilder, loadVal, + reductionVal); } } } From 3d4128f7ff1cd44b95e5057295704f86c0f0b162 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Sof=C3=ADa=20Rodr=C3=ADguez?= Date: Fri, 19 Jan 2024 11:56:53 +0100 Subject: [PATCH 107/843] [clang][ExtractAPI] Record availability information only for the target platform (#76823) Currently, ExtractAPI provides availability information for all platforms within a given domain. With this change, we narrow down the output to include availability details only for the specified target platform, so users can generate the symbol graph with only the availability information they need, omitting information of the other platforms. This change reverts the functionality introduced in [`57c9780`](https://github.com/llvm/llvm-project/commit/57c9780). rdar://120419037 --- clang/include/clang/ExtractAPI/API.h | 245 +++++++++--------- .../clang/ExtractAPI/AvailabilityInfo.h | 55 ++-- .../clang/ExtractAPI/ExtractAPIVisitor.h | 165 ++++++------ clang/lib/ExtractAPI/API.cpp | 115 ++++---- clang/lib/ExtractAPI/AvailabilityInfo.cpp | 63 ++--- .../Serialization/SymbolGraphSerializer.cpp | 56 ++-- clang/test/ExtractAPI/availability.c | 20 -- clang/tools/libclang/CXExtractAPI.cpp | 4 +- 8 files changed, 338 insertions(+), 385 deletions(-) diff --git a/clang/include/clang/ExtractAPI/API.h b/clang/include/clang/ExtractAPI/API.h index b4c0e0ad39cdf..f4a6374161685 100644 --- a/clang/include/clang/ExtractAPI/API.h +++ b/clang/include/clang/ExtractAPI/API.h @@ -224,7 +224,7 @@ struct APIRecord { StringRef USR; StringRef Name; PresumedLoc Location; - AvailabilitySet Availabilities; + AvailabilityInfo Availability; LinkageInfo Linkage; /// Documentation comment lines attached to this symbol declaration. 
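The behavioral change is easier to see outside the diff: each record previously carried availability entries for every annotated platform, and now carries a single entry for the platform being targeted. A simplified, self-contained model of that narrowing; these types are illustrative, not Clang's actual AvailabilityInfo:

    #include <optional>
    #include <string>
    #include <vector>

    struct PlatformAvailability {
      std::string Domain;      // e.g. "macos", "ios"
      std::string Introduced;  // version numbers, simplified to strings here
      std::string Deprecated;
      std::string Obsoleted;
    };

    // Before: a record kept one entry per annotated platform.
    using AvailabilitySet = std::vector<PlatformAvailability>;

    // After: only the entry matching the target platform is recorded.
    std::optional<PlatformAvailability>
    availabilityForTarget(const AvailabilitySet &All, const std::string &Target) {
      for (const PlatformAvailability &A : All)
        if (A.Domain == Target)
          return A;
      return std::nullopt;  // no annotation for this platform
    }

    int main() {
      AvailabilitySet S{{"macos", "12.0", "", ""}, {"ios", "15.0", "", ""}};
      return availabilityForTarget(S, "macos") ? 0 : 1;
    }
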
@@ -256,12 +256,12 @@ struct APIRecord { APIRecord() = delete; APIRecord(RecordKind Kind, StringRef USR, StringRef Name, - PresumedLoc Location, AvailabilitySet Availabilities, + PresumedLoc Location, AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) : USR(USR), Name(Name), Location(Location), - Availabilities(std::move(Availabilities)), Linkage(Linkage), + Availability(std::move(Availability)), Linkage(Linkage), Comment(Comment), Declaration(Declaration), SubHeading(SubHeading), IsFromSystemHeader(IsFromSystemHeader), Kind(Kind) {} @@ -274,10 +274,10 @@ struct APIRecord { struct NamespaceRecord : APIRecord { NamespaceRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) - : APIRecord(RK_Namespace, USR, Name, Loc, std::move(Availabilities), + : APIRecord(RK_Namespace, USR, Name, Loc, std::move(Availability), Linkage, Comment, Declaration, SubHeading, IsFromSystemHeader) {} @@ -291,23 +291,23 @@ struct GlobalFunctionRecord : APIRecord { FunctionSignature Signature; GlobalFunctionRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, bool IsFromSystemHeader) - : APIRecord(RK_GlobalFunction, USR, Name, Loc, std::move(Availabilities), + : APIRecord(RK_GlobalFunction, USR, Name, Loc, std::move(Availability), Linkage, Comment, Declaration, SubHeading, IsFromSystemHeader), Signature(Signature) {} GlobalFunctionRecord(RecordKind Kind, StringRef USR, StringRef Name, - PresumedLoc Loc, AvailabilitySet Availabilities, + PresumedLoc Loc, AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, bool IsFromSystemHeader) - : APIRecord(Kind, USR, Name, Loc, std::move(Availabilities), Linkage, + : APIRecord(Kind, USR, Name, Loc, std::move(Availability), Linkage, Comment, Declaration, SubHeading, IsFromSystemHeader), Signature(Signature) {} @@ -323,14 +323,14 @@ struct GlobalFunctionTemplateRecord : GlobalFunctionRecord { Template Templ; GlobalFunctionTemplateRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, Template Template, bool IsFromSystemHeader) : GlobalFunctionRecord(RK_GlobalFunctionTemplate, USR, Name, Loc, - std::move(Availabilities), Linkage, Comment, + std::move(Availability), Linkage, Comment, Declaration, SubHeading, Signature, IsFromSystemHeader), Templ(Template) {} @@ -343,12 +343,12 @@ struct GlobalFunctionTemplateRecord : GlobalFunctionRecord { struct GlobalFunctionTemplateSpecializationRecord : GlobalFunctionRecord { GlobalFunctionTemplateSpecializationRecord( StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, 
DeclarationFragments SubHeading, FunctionSignature Signature, bool IsFromSystemHeader) : GlobalFunctionRecord(RK_GlobalFunctionTemplateSpecialization, USR, Name, - Loc, std::move(Availabilities), Linkage, Comment, + Loc, std::move(Availability), Linkage, Comment, Declaration, SubHeading, Signature, IsFromSystemHeader) {} @@ -360,20 +360,20 @@ struct GlobalFunctionTemplateSpecializationRecord : GlobalFunctionRecord { /// This holds information associated with global functions. struct GlobalVariableRecord : APIRecord { GlobalVariableRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) - : APIRecord(RK_GlobalVariable, USR, Name, Loc, std::move(Availabilities), + : APIRecord(RK_GlobalVariable, USR, Name, Loc, std::move(Availability), Linkage, Comment, Declaration, SubHeading, IsFromSystemHeader) {} GlobalVariableRecord(RecordKind Kind, StringRef USR, StringRef Name, - PresumedLoc Loc, AvailabilitySet Availabilities, + PresumedLoc Loc, AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) - : APIRecord(Kind, USR, Name, Loc, std::move(Availabilities), Linkage, + : APIRecord(Kind, USR, Name, Loc, std::move(Availability), Linkage, Comment, Declaration, SubHeading, IsFromSystemHeader) {} static bool classof(const APIRecord *Record) { @@ -388,13 +388,13 @@ struct GlobalVariableTemplateRecord : GlobalVariableRecord { Template Templ; GlobalVariableTemplateRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, class Template Template, bool IsFromSystemHeader) : GlobalVariableRecord(RK_GlobalVariableTemplate, USR, Name, Loc, - std::move(Availabilities), Linkage, Comment, + std::move(Availability), Linkage, Comment, Declaration, SubHeading, IsFromSystemHeader), Templ(Template) {} @@ -406,11 +406,11 @@ struct GlobalVariableTemplateRecord : GlobalVariableRecord { struct GlobalVariableTemplateSpecializationRecord : GlobalVariableRecord { GlobalVariableTemplateSpecializationRecord( StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) : GlobalVariableRecord(RK_GlobalVariableTemplateSpecialization, USR, Name, - Loc, std::move(Availabilities), Linkage, Comment, + Loc, std::move(Availability), Linkage, Comment, Declaration, SubHeading, IsFromSystemHeader) {} static bool classof(const APIRecord *Record) { @@ -424,12 +424,12 @@ struct GlobalVariableTemplatePartialSpecializationRecord GlobalVariableTemplatePartialSpecializationRecord( StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, class Template Template, bool IsFromSystemHeader) : GlobalVariableRecord(RK_GlobalVariableTemplatePartialSpecialization, - USR, Name, Loc, std::move(Availabilities), Linkage, + USR, Name, Loc, 
std::move(Availability), Linkage, Comment, Declaration, SubHeading, IsFromSystemHeader), Templ(Template) {} @@ -442,10 +442,10 @@ struct GlobalVariableTemplatePartialSpecializationRecord /// This holds information associated with enum constants. struct EnumConstantRecord : APIRecord { EnumConstantRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) - : APIRecord(RK_EnumConstant, USR, Name, Loc, std::move(Availabilities), + : APIRecord(RK_EnumConstant, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader) {} @@ -462,10 +462,10 @@ struct EnumRecord : APIRecord { SmallVector> Constants; EnumRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) - : APIRecord(RK_Enum, USR, Name, Loc, std::move(Availabilities), + : APIRecord(RK_Enum, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader) {} @@ -480,10 +480,10 @@ struct EnumRecord : APIRecord { /// This holds information associated with struct fields. struct StructFieldRecord : APIRecord { StructFieldRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) - : APIRecord(RK_StructField, USR, Name, Loc, std::move(Availabilities), + : APIRecord(RK_StructField, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader) {} @@ -500,10 +500,10 @@ struct StructRecord : APIRecord { SmallVector> Fields; StructRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) - : APIRecord(RK_Struct, USR, Name, Loc, std::move(Availabilities), + : APIRecord(RK_Struct, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader) {} @@ -519,21 +519,21 @@ struct CXXFieldRecord : APIRecord { AccessControl Access; CXXFieldRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, AccessControl Access, bool IsFromSystemHeader) - : APIRecord(RK_CXXField, USR, Name, Loc, std::move(Availabilities), + : APIRecord(RK_CXXField, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader), Access(Access) {} CXXFieldRecord(RecordKind Kind, StringRef USR, StringRef Name, - PresumedLoc Loc, AvailabilitySet Availabilities, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, AccessControl Access, bool IsFromSystemHeader) - : APIRecord(Kind, USR, Name, Loc, std::move(Availabilities), + : 
APIRecord(Kind, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader), Access(Access) {} @@ -550,13 +550,13 @@ struct CXXFieldTemplateRecord : CXXFieldRecord { Template Templ; CXXFieldTemplateRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, AccessControl Access, Template Template, bool IsFromSystemHeader) : CXXFieldRecord(RK_CXXFieldTemplate, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, Access, IsFromSystemHeader), Templ(Template) {} @@ -572,11 +572,11 @@ struct CXXMethodRecord : APIRecord { CXXMethodRecord() = delete; CXXMethodRecord(RecordKind Kind, StringRef USR, StringRef Name, - PresumedLoc Loc, AvailabilitySet Availabilities, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader) - : APIRecord(Kind, USR, Name, Loc, std::move(Availabilities), + : APIRecord(Kind, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader), Signature(Signature), Access(Access) {} @@ -586,14 +586,13 @@ struct CXXMethodRecord : APIRecord { struct CXXConstructorRecord : CXXMethodRecord { CXXConstructorRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, - const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader) : CXXMethodRecord(RK_CXXConstructorMethod, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, Signature, Access, IsFromSystemHeader) {} static bool classof(const APIRecord *Record) { return Record->getKind() == RK_CXXConstructorMethod; @@ -605,13 +604,13 @@ struct CXXConstructorRecord : CXXMethodRecord { struct CXXDestructorRecord : CXXMethodRecord { CXXDestructorRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader) : CXXMethodRecord(RK_CXXDestructorMethod, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, Signature, Access, IsFromSystemHeader) {} static bool classof(const APIRecord *Record) { return Record->getKind() == RK_CXXDestructorMethod; @@ -623,14 +622,14 @@ struct CXXDestructorRecord : CXXMethodRecord { struct CXXStaticMethodRecord : CXXMethodRecord { CXXStaticMethodRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader) : CXXMethodRecord(RK_CXXStaticMethod, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, Signature, Access, 
IsFromSystemHeader) {} static bool classof(const APIRecord *Record) { return Record->getKind() == RK_CXXStaticMethod; @@ -642,14 +641,14 @@ struct CXXStaticMethodRecord : CXXMethodRecord { struct CXXInstanceMethodRecord : CXXMethodRecord { CXXInstanceMethodRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader) : CXXMethodRecord(RK_CXXInstanceMethod, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, Signature, Access, IsFromSystemHeader) {} static bool classof(const APIRecord *Record) { @@ -664,14 +663,14 @@ struct CXXMethodTemplateRecord : CXXMethodRecord { Template Templ; CXXMethodTemplateRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, Template Template, bool IsFromSystemHeader) : CXXMethodRecord(RK_CXXMethodTemplate, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, Signature, Access, IsFromSystemHeader), Templ(Template) {} @@ -683,12 +682,12 @@ struct CXXMethodTemplateRecord : CXXMethodRecord { struct CXXMethodTemplateSpecializationRecord : CXXMethodRecord { CXXMethodTemplateSpecializationRecord( StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader) : CXXMethodRecord(RK_CXXMethodTemplateSpecialization, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, Signature, Access, IsFromSystemHeader) {} static bool classof(const APIRecord *Record) { @@ -711,13 +710,13 @@ struct ObjCPropertyRecord : APIRecord { bool IsOptional; ObjCPropertyRecord(RecordKind Kind, StringRef USR, StringRef Name, - PresumedLoc Loc, AvailabilitySet Availabilities, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, AttributeKind Attributes, StringRef GetterName, StringRef SetterName, bool IsOptional, bool IsFromSystemHeader) - : APIRecord(Kind, USR, Name, Loc, std::move(Availabilities), + : APIRecord(Kind, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader), Attributes(Attributes), GetterName(GetterName), SetterName(SetterName), @@ -731,7 +730,7 @@ struct ObjCPropertyRecord : APIRecord { struct ObjCInstancePropertyRecord : ObjCPropertyRecord { ObjCInstancePropertyRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, @@ -739,7 +738,7 @@ struct ObjCInstancePropertyRecord : ObjCPropertyRecord { StringRef SetterName, bool IsOptional, bool IsFromSystemHeader) : ObjCPropertyRecord(RK_ObjCInstanceProperty, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + 
std::move(Availability), Comment, Declaration, SubHeading, Attributes, GetterName, SetterName, IsOptional, IsFromSystemHeader) {} @@ -753,7 +752,7 @@ struct ObjCInstancePropertyRecord : ObjCPropertyRecord { struct ObjCClassPropertyRecord : ObjCPropertyRecord { ObjCClassPropertyRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, @@ -761,7 +760,7 @@ struct ObjCClassPropertyRecord : ObjCPropertyRecord { StringRef SetterName, bool IsOptional, bool IsFromSystemHeader) : ObjCPropertyRecord(RK_ObjCClassProperty, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, Attributes, GetterName, SetterName, IsOptional, IsFromSystemHeader) {} @@ -779,12 +778,12 @@ struct ObjCInstanceVariableRecord : APIRecord { AccessControl Access; ObjCInstanceVariableRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, AccessControl Access, bool IsFromSystemHeader) - : APIRecord(RK_ObjCIvar, USR, Name, Loc, std::move(Availabilities), + : APIRecord(RK_ObjCIvar, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader), Access(Access) {} @@ -804,11 +803,11 @@ struct ObjCMethodRecord : APIRecord { ObjCMethodRecord() = delete; ObjCMethodRecord(RecordKind Kind, StringRef USR, StringRef Name, - PresumedLoc Loc, AvailabilitySet Availabilities, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, bool IsFromSystemHeader) - : APIRecord(Kind, USR, Name, Loc, std::move(Availabilities), + : APIRecord(Kind, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader), Signature(Signature) {} @@ -818,13 +817,13 @@ struct ObjCMethodRecord : APIRecord { struct ObjCInstanceMethodRecord : ObjCMethodRecord { ObjCInstanceMethodRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, bool IsFromSystemHeader) : ObjCMethodRecord(RK_ObjCInstanceMethod, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, Signature, IsFromSystemHeader) {} static bool classof(const APIRecord *Record) { return Record->getKind() == RK_ObjCInstanceMethod; @@ -836,13 +835,13 @@ struct ObjCInstanceMethodRecord : ObjCMethodRecord { struct ObjCClassMethodRecord : ObjCMethodRecord { ObjCClassMethodRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, bool IsFromSystemHeader) : ObjCMethodRecord(RK_ObjCClassMethod, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, Signature, IsFromSystemHeader) {} static bool classof(const APIRecord *Record) { @@ -880,13 +879,13 @@ struct StaticFieldRecord : CXXFieldRecord { SymbolReference 
Context; StaticFieldRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference Context, AccessControl Access, bool IsFromSystemHeader) - : CXXFieldRecord(RK_StaticField, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, - SubHeading, Access, IsFromSystemHeader), + : CXXFieldRecord(RK_StaticField, USR, Name, Loc, std::move(Availability), + Comment, Declaration, SubHeading, Access, + IsFromSystemHeader), Context(Context) {} static bool classof(const APIRecord *Record) { @@ -905,11 +904,11 @@ struct ObjCContainerRecord : APIRecord { ObjCContainerRecord() = delete; ObjCContainerRecord(RecordKind Kind, StringRef USR, StringRef Name, - PresumedLoc Loc, AvailabilitySet Availabilities, + PresumedLoc Loc, AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) - : APIRecord(Kind, USR, Name, Loc, std::move(Availabilities), Linkage, + : APIRecord(Kind, USR, Name, Loc, std::move(Availability), Linkage, Comment, Declaration, SubHeading, IsFromSystemHeader) {} virtual ~ObjCContainerRecord() = 0; @@ -922,11 +921,11 @@ struct CXXClassRecord : APIRecord { AccessControl Access; CXXClassRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, RecordKind Kind, AccessControl Access, bool IsFromSystemHeader) - : APIRecord(Kind, USR, Name, Loc, std::move(Availabilities), + : APIRecord(Kind, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader), Access(Access) {} @@ -943,11 +942,11 @@ struct ClassTemplateRecord : CXXClassRecord { Template Templ; ClassTemplateRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, AccessControl Access, bool IsFromSystemHeader) - : CXXClassRecord(USR, Name, Loc, std::move(Availabilities), Comment, + : CXXClassRecord(USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, RK_ClassTemplate, Access, IsFromSystemHeader), Templ(Template) {} @@ -960,10 +959,10 @@ struct ClassTemplateRecord : CXXClassRecord { struct ClassTemplateSpecializationRecord : CXXClassRecord { ClassTemplateSpecializationRecord( StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, AccessControl Access, bool IsFromSystemHeader) - : CXXClassRecord(USR, Name, Loc, std::move(Availabilities), Comment, + : CXXClassRecord(USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, RK_ClassTemplateSpecialization, Access, IsFromSystemHeader) {} @@ -976,10 +975,10 @@ struct ClassTemplatePartialSpecializationRecord : CXXClassRecord { Template Templ; ClassTemplatePartialSpecializationRecord( StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo 
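ObjCContainerRecord above remains an abstract base by way of a pure virtual destructor ("virtual ~ObjCContainerRecord() = 0;"). A sketch of that C++ idiom with simplified types; note the out-of-line definition is still required because derived destructors call it:

struct Container {
  virtual ~Container() = 0; // pure: Container itself cannot be instantiated
};
Container::~Container() {} // definition still needed; derived dtors call it

struct Protocol : Container {};

int main() {
  // Container C; // would not compile: abstract class
  Protocol P;     // a concrete derived type is fine
  (void)P;
}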
Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, AccessControl Access, bool IsFromSystemHeader) - : CXXClassRecord(USR, Name, Loc, std::move(Availabilities), Comment, + : CXXClassRecord(USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, RK_ClassTemplateSpecialization, Access, IsFromSystemHeader), Templ(Template) {} @@ -993,11 +992,11 @@ struct ConceptRecord : APIRecord { Template Templ; ConceptRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, bool IsFromSystemHeader) - : APIRecord(RK_Concept, USR, Name, Loc, std::move(Availabilities), + : APIRecord(RK_Concept, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader), Templ(Template) {} @@ -1010,12 +1009,12 @@ struct ObjCCategoryRecord : ObjCContainerRecord { bool IsFromExternalModule = false; ObjCCategoryRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference Interface, bool IsFromSystemHeader) : ObjCContainerRecord(RK_ObjCCategory, USR, Name, Loc, - std::move(Availabilities), LinkageInfo::none(), + std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader), Interface(Interface) {} @@ -1035,13 +1034,13 @@ struct ObjCInterfaceRecord : ObjCContainerRecord { SmallVector Categories; ObjCInterfaceRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference SuperClass, bool IsFromSystemHeader) : ObjCContainerRecord(RK_ObjCInterface, USR, Name, Loc, - std::move(Availabilities), Linkage, Comment, + std::move(Availability), Linkage, Comment, Declaration, SubHeading, IsFromSystemHeader), SuperClass(SuperClass) {} @@ -1056,11 +1055,11 @@ struct ObjCInterfaceRecord : ObjCContainerRecord { /// This holds information associated with Objective-C protocols. 
struct ObjCProtocolRecord : ObjCContainerRecord { ObjCProtocolRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) : ObjCContainerRecord(RK_ObjCProtocol, USR, Name, Loc, - std::move(Availabilities), LinkageInfo::none(), + std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader) {} @@ -1078,7 +1077,7 @@ struct MacroDefinitionRecord : APIRecord { DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) - : APIRecord(RK_MacroDefinition, USR, Name, Loc, AvailabilitySet(), + : APIRecord(RK_MacroDefinition, USR, Name, Loc, AvailabilityInfo(), LinkageInfo(), {}, Declaration, SubHeading, IsFromSystemHeader) {} @@ -1099,11 +1098,11 @@ struct TypedefRecord : APIRecord { SymbolReference UnderlyingType; TypedefRecord(StringRef USR, StringRef Name, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference UnderlyingType, bool IsFromSystemHeader) - : APIRecord(RK_Typedef, USR, Name, Loc, std::move(Availabilities), + : APIRecord(RK_Typedef, USR, Name, Loc, std::move(Availability), LinkageInfo(), Comment, Declaration, SubHeading, IsFromSystemHeader), UnderlyingType(UnderlyingType) {} @@ -1191,7 +1190,7 @@ class APISet { public: NamespaceRecord *addNamespace(APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, @@ -1204,13 +1203,13 @@ class APISet { /// to generate the USR for \c D and keep it alive in APISet. GlobalVariableRecord * addGlobalVar(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeadin, bool IsFromSystemHeaderg); GlobalVariableTemplateRecord * addGlobalVariableTemplate(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, @@ -1224,14 +1223,14 @@ class APISet { /// to generate the USR for \c D and keep it alive in APISet. 
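MacroDefinitionRecord is the one record with no availability attributes, so it now passes a value-initialized AvailabilityInfo() where it previously passed AvailabilitySet(). A sketch, with hypothetical stand-in types, of why that is enough: serialization later skips any record whose availability isDefault():

#include <optional>
#include <string>

// Minimal stand-ins: a record with default availability serializes to
// std::nullopt so the symbol graph omits the key entirely.
struct AvailabilityInfo {
  std::string Domain;
  bool UnconditionallyDeprecated = false;
  bool UnconditionallyUnavailable = false;
  bool isDefault() const {
    return !UnconditionallyDeprecated && !UnconditionallyUnavailable;
  }
};

std::optional<std::string> serializeAvailability(const AvailabilityInfo &A) {
  if (A.isDefault())
    return std::nullopt; // nothing to emit for default availability
  return std::string("availability: ") + A.Domain;
}

int main() {
  AvailabilityInfo MacroInfo; // MacroDefinitionRecord passes AvailabilityInfo()
  return serializeAvailability(MacroInfo).has_value() ? 1 : 0; // returns 0
}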
GlobalFunctionRecord * addGlobalFunction(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, bool IsFromSystemHeader); GlobalFunctionTemplateRecord *addGlobalFunctionTemplate( StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, Template Template, bool IsFromSystemHeader); @@ -1239,7 +1238,7 @@ class APISet { GlobalFunctionTemplateSpecializationRecord * addGlobalFunctionTemplateSpecialization( StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, bool IsFromSystemHeader); @@ -1252,7 +1251,7 @@ class APISet { /// to generate the USR for \c D and keep it alive in APISet. EnumConstantRecord * addEnumConstant(EnumRecord *Enum, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availability, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader); @@ -1263,7 +1262,7 @@ class APISet { /// APISet itself, and APISet::recordUSR(const Decl *D) is a helper method /// to generate the USR for \c D and keep it alive in APISet. EnumRecord *addEnum(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader); @@ -1275,7 +1274,7 @@ class APISet { /// to generate the USR for \c D and keep it alive in APISet. StructFieldRecord * addStructField(StructRecord *Struct, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availability, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader); @@ -1286,7 +1285,7 @@ class APISet { /// APISet itself, and APISet::recordUSR(const Decl *D) is a helper method /// to generate the USR for \c D and keep it alive in APISet. 
StructRecord *addStruct(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, @@ -1294,14 +1293,14 @@ class APISet { StaticFieldRecord * addStaticField(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference Context, AccessControl Access, bool IsFromSystemHeaderg); CXXFieldRecord *addCXXField(APIRecord *CXXClass, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, @@ -1309,12 +1308,12 @@ class APISet { CXXFieldTemplateRecord *addCXXFieldTemplate( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, AccessControl Access, Template Template, bool IsFromSystemHeader); CXXClassRecord *addCXXClass(APIRecord *Parent, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availability, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, @@ -1323,76 +1322,76 @@ class APISet { ClassTemplateRecord * addClassTemplate(APIRecord *Parent, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availability, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, AccessControl Access, bool IsFromSystemHeader); ClassTemplateSpecializationRecord *addClassTemplateSpecialization( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, AccessControl Access, bool IsFromSystemHeader); ClassTemplatePartialSpecializationRecord * addClassTemplatePartialSpecialization( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, AccessControl Access, bool IsFromSystemHeader); GlobalVariableTemplateSpecializationRecord * addGlobalVariableTemplateSpecialization( StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader); GlobalVariableTemplatePartialSpecializationRecord * addGlobalVariableTemplatePartialSpecialization( StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, bool IsFromSystemHeader); CXXMethodRecord *addCXXInstanceMethod( APIRecord *Parent, StringRef Name, 
StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader); CXXMethodRecord *addCXXStaticMethod( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader); CXXMethodRecord *addCXXSpecialMethod( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader); CXXMethodTemplateRecord *addCXXMethodTemplate( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, Template Template, bool IsFromSystemHeader); CXXMethodTemplateSpecializationRecord *addCXXMethodTemplateSpec( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader); ConceptRecord *addConcept(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, @@ -1406,7 +1405,7 @@ class APISet { /// to generate the USR for \c D and keep it alive in APISet. ObjCCategoryRecord * addObjCCategory(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference Interface, bool IsFromSystemHeader, bool IsFromExternalModule); @@ -1419,7 +1418,7 @@ class APISet { /// to generate the USR for \c D and keep it alive in APISet. ObjCInterfaceRecord * addObjCInterface(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference SuperClass, bool IsFromSystemHeader); @@ -1432,7 +1431,7 @@ class APISet { /// to generate the USR for \c D and keep it alive in APISet. ObjCMethodRecord * addObjCMethod(ObjCContainerRecord *Container, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availability, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, bool IsInstanceMethod, bool IsFromSystemHeader); @@ -1445,7 +1444,7 @@ class APISet { /// to generate the USR for \c D and keep it alive in APISet. 
ObjCPropertyRecord * addObjCProperty(ObjCContainerRecord *Container, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availability, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, ObjCPropertyRecord::AttributeKind Attributes, @@ -1460,7 +1459,7 @@ class APISet { /// to generate the USR for \c D and keep it alive in APISet. ObjCInstanceVariableRecord *addObjCInstanceVariable( ObjCContainerRecord *Container, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availability, const DocComment &Comment, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, ObjCInstanceVariableRecord::AccessControl Access, bool IsFromSystemHeader); @@ -1473,7 +1472,7 @@ class APISet { /// to generate the USR for \c D and keep it alive in APISet. ObjCProtocolRecord * addObjCProtocol(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader); @@ -1498,7 +1497,7 @@ class APISet { /// to generate the USR for \c D and keep it alive in APISet. TypedefRecord * addTypedef(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference UnderlyingType, bool IsFromSystemHeader); diff --git a/clang/include/clang/ExtractAPI/AvailabilityInfo.h b/clang/include/clang/ExtractAPI/AvailabilityInfo.h index 0af373135b667..3b8d6f46ed56d 100644 --- a/clang/include/clang/ExtractAPI/AvailabilityInfo.h +++ b/clang/include/clang/ExtractAPI/AvailabilityInfo.h @@ -16,56 +16,31 @@ #define LLVM_CLANG_EXTRACTAPI_AVAILABILITY_INFO_H #include "clang/AST/Decl.h" -#include "llvm/ADT/SmallVector.h" #include "llvm/Support/Error.h" #include "llvm/Support/VersionTuple.h" #include "llvm/Support/raw_ostream.h" -using llvm::VersionTuple; - namespace clang { namespace extractapi { -/// Stores availability attributes of a symbol in a given domain. +/// Stores availability attributes of a symbol. struct AvailabilityInfo { /// The domain for which this availability info item applies std::string Domain; VersionTuple Introduced; VersionTuple Deprecated; VersionTuple Obsoleted; - bool Unavailable; - - AvailabilityInfo() = default; - - AvailabilityInfo(StringRef Domain, VersionTuple I, VersionTuple D, - VersionTuple O, bool U) - : Domain(Domain), Introduced(I), Deprecated(D), Obsoleted(O), - Unavailable(U) {} -}; - -class AvailabilitySet { -private: - using AvailabilityList = llvm::SmallVector; - AvailabilityList Availabilities; - bool UnconditionallyDeprecated = false; bool UnconditionallyUnavailable = false; -public: - AvailabilitySet(const Decl *Decl); - AvailabilitySet() = default; - - AvailabilityList::const_iterator begin() const { - return Availabilities.begin(); - } - - AvailabilityList::const_iterator end() const { return Availabilities.end(); } + AvailabilityInfo() = default; + /// Determine if this AvailabilityInfo represents the default availability. + bool isDefault() const { return *this == AvailabilityInfo(); } /// Check if the symbol is unconditionally deprecated. /// /// i.e. 
\code __attribute__((deprecated)) \endcode bool isUnconditionallyDeprecated() const { return UnconditionallyDeprecated; } - /// Check if the symbol is unconditionally unavailable. /// /// i.e. \code __attribute__((unavailable)) \endcode @@ -73,10 +48,28 @@ class AvailabilitySet { return UnconditionallyUnavailable; } - /// Determine if this AvailabilitySet represents default availability. - bool isDefault() const { return Availabilities.empty(); } + AvailabilityInfo(StringRef Domain, VersionTuple I, VersionTuple D, + VersionTuple O, bool UD, bool UU) + : Domain(Domain), Introduced(I), Deprecated(D), Obsoleted(O), + UnconditionallyDeprecated(UD), UnconditionallyUnavailable(UU) {} + + friend bool operator==(const AvailabilityInfo &Lhs, + const AvailabilityInfo &Rhs); + +public: + static AvailabilityInfo createFromDecl(const Decl *Decl); }; +inline bool operator==(const AvailabilityInfo &Lhs, + const AvailabilityInfo &Rhs) { + return std::tie(Lhs.Introduced, Lhs.Deprecated, Lhs.Obsoleted, + Lhs.UnconditionallyDeprecated, + Lhs.UnconditionallyUnavailable) == + std::tie(Rhs.Introduced, Rhs.Deprecated, Rhs.Obsoleted, + Rhs.UnconditionallyDeprecated, + Rhs.UnconditionallyUnavailable); +} + } // namespace extractapi } // namespace clang diff --git a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h index a344fa7d5d8a7..1f76add1faae8 100644 --- a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h +++ b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h @@ -18,6 +18,7 @@ #include "clang/AST/DeclTemplate.h" #include "clang/Basic/OperatorKinds.h" #include "clang/Basic/Specifiers.h" +#include "clang/ExtractAPI/AvailabilityInfo.h" #include "clang/ExtractAPI/DeclarationFragments.h" #include "llvm/ADT/FunctionExtras.h" @@ -263,7 +264,6 @@ bool ExtractAPIVisitorBase::VisitVarDecl(const VarDecl *Decl) { DeclarationFragmentsBuilder::getFragmentsForVar(Decl); DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); - if (Decl->isStaticDataMember()) { SymbolReference Context; // getDeclContext() should return a RecordDecl since we @@ -272,13 +272,14 @@ bool ExtractAPIVisitorBase::VisitVarDecl(const VarDecl *Decl) { Context.Name = Record->getName(); Context.USR = API.recordUSR(Record); auto Access = DeclarationFragmentsBuilder::getAccessControl(Decl); - API.addStaticField(Name, USR, Loc, AvailabilitySet(Decl), Linkage, Comment, - Declaration, SubHeading, Context, Access, - isInSystemHeader(Decl)); + API.addStaticField(Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), + Linkage, Comment, Declaration, SubHeading, Context, + Access, isInSystemHeader(Decl)); } else // Add the global variable record to the API set. 
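The rewritten AvailabilityInfo.h above collapses the per-domain list into one flat struct and defines isDefault() as equality with a value-initialized instance. As written in the hunk, operator== ties only the version tuples and the two unconditional flags, so Domain alone does not affect equality. A compilable model, with std::tuple standing in for llvm::VersionTuple:

#include <cassert>
#include <string>
#include <tuple>

// std::tuple stands in for llvm::VersionTuple here.
using VersionTuple = std::tuple<unsigned, unsigned>;

struct AvailabilityInfo {
  std::string Domain;
  VersionTuple Introduced, Deprecated, Obsoleted;
  bool UnconditionallyDeprecated = false;
  bool UnconditionallyUnavailable = false;

  // Mirrors the hunk: equality compares the version tuples and the two
  // unconditional flags, but not Domain.
  friend bool operator==(const AvailabilityInfo &Lhs,
                         const AvailabilityInfo &Rhs) {
    return std::tie(Lhs.Introduced, Lhs.Deprecated, Lhs.Obsoleted,
                    Lhs.UnconditionallyDeprecated,
                    Lhs.UnconditionallyUnavailable) ==
           std::tie(Rhs.Introduced, Rhs.Deprecated, Rhs.Obsoleted,
                    Rhs.UnconditionallyDeprecated,
                    Rhs.UnconditionallyUnavailable);
  }

  bool isDefault() const { return *this == AvailabilityInfo(); }
};

int main() {
  AvailabilityInfo Info;
  Info.Domain = "ios";       // Domain alone does not break equality...
  assert(Info.isDefault());  // ...so this still counts as default
  Info.UnconditionallyDeprecated = true;
  assert(!Info.isDefault()); // but a flag does
}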
- API.addGlobalVar(Name, USR, Loc, AvailabilitySet(Decl), Linkage, Comment, - Declaration, SubHeading, isInSystemHeader(Decl)); + API.addGlobalVar(Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), + Linkage, Comment, Declaration, SubHeading, + isInSystemHeader(Decl)); return true; } @@ -333,19 +334,19 @@ bool ExtractAPIVisitorBase::VisitFunctionDecl( DeclarationFragmentsBuilder::getSubHeading(Decl); FunctionSignature Signature = DeclarationFragmentsBuilder::getFunctionSignature(Decl); - if (Decl->getTemplateSpecializationInfo()) API.addGlobalFunctionTemplateSpecialization( - Name, USR, Loc, AvailabilitySet(Decl), Linkage, Comment, + Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Linkage, + Comment, DeclarationFragmentsBuilder:: getFragmentsForFunctionTemplateSpecialization(Decl), SubHeading, Signature, isInSystemHeader(Decl)); else // Add the function record to the API set. API.addGlobalFunction( - Name, USR, Loc, AvailabilitySet(Decl), Linkage, Comment, - DeclarationFragmentsBuilder::getFragmentsForFunction(Decl), SubHeading, - Signature, isInSystemHeader(Decl)); + Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Linkage, + Comment, DeclarationFragmentsBuilder::getFragmentsForFunction(Decl), + SubHeading, Signature, isInSystemHeader(Decl)); return true; } @@ -379,10 +380,9 @@ bool ExtractAPIVisitorBase::VisitEnumDecl(const EnumDecl *Decl) { DeclarationFragmentsBuilder::getFragmentsForEnum(Decl); DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); - - EnumRecord *EnumRecord = - API.addEnum(API.copyString(Name), USR, Loc, AvailabilitySet(Decl), - Comment, Declaration, SubHeading, isInSystemHeader(Decl)); + EnumRecord *EnumRecord = API.addEnum( + API.copyString(Name), USR, Loc, AvailabilityInfo::createFromDecl(Decl), + Comment, Declaration, SubHeading, isInSystemHeader(Decl)); // Now collect information about the enumerators in this enum. getDerivedExtractAPIVisitor().recordEnumConstants(EnumRecord, @@ -496,8 +496,9 @@ bool ExtractAPIVisitorBase::VisitNamespaceDecl( DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); APIRecord *Parent = determineParentRecord(Decl->getDeclContext()); - API.addNamespace(Parent, Name, USR, Loc, AvailabilitySet(Decl), Linkage, - Comment, Declaration, SubHeading, isInSystemHeader(Decl)); + API.addNamespace(Parent, Name, USR, Loc, + AvailabilityInfo::createFromDecl(Decl), Linkage, Comment, + Declaration, SubHeading, isInSystemHeader(Decl)); return true; } @@ -527,10 +528,9 @@ bool ExtractAPIVisitorBase::VisitRecordDecl(const RecordDecl *Decl) { DeclarationFragmentsBuilder::getFragmentsForStruct(Decl); DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); - StructRecord *StructRecord = - API.addStruct(Name, USR, Loc, AvailabilitySet(Decl), Comment, Declaration, - SubHeading, isInSystemHeader(Decl)); + API.addStruct(Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), + Comment, Declaration, SubHeading, isInSystemHeader(Decl)); // Now collect information about the fields in this struct. 
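Every visitor call site switches from the converting constructor AvailabilitySet(Decl) to the named factory AvailabilityInfo::createFromDecl(Decl). A reduced sketch of the factory shape, with stand-in Decl and availability types:

#include <string>

struct Decl {
  std::string Platform; // pretend: the platform named by the decl's attrs
};

struct AvailabilityInfo {
  std::string Domain;

  // A named factory instead of a converting constructor: the Decl-walking
  // logic is explicit at call sites, and the struct stays a plain value.
  static AvailabilityInfo createFromDecl(const Decl *D) {
    AvailabilityInfo Info;
    Info.Domain = D->Platform; // real code walks redecls and attributes
    return Info;
  }
};

int main() {
  Decl D{"macos"};
  AvailabilityInfo Info = AvailabilityInfo::createFromDecl(&D);
  (void)Info;
}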
getDerivedExtractAPIVisitor().recordStructFields(StructRecord, @@ -578,13 +578,13 @@ bool ExtractAPIVisitorBase::VisitCXXRecordDecl( DeclarationFragmentsBuilder::getFragmentsForRedeclarableTemplate( Decl->getDescribedClassTemplate())); CXXClassRecord = API.addClassTemplate( - Parent, Name, USR, Loc, AvailabilitySet(Decl), Comment, Declaration, - SubHeading, Template(Decl->getDescribedClassTemplate()), Access, - isInSystemHeader(Decl)); + Parent, Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Comment, + Declaration, SubHeading, Template(Decl->getDescribedClassTemplate()), + Access, isInSystemHeader(Decl)); } else CXXClassRecord = API.addCXXClass( - Parent, Name, USR, Loc, AvailabilitySet(Decl), Comment, Declaration, - SubHeading, Kind, Access, isInSystemHeader(Decl)); + Parent, Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Comment, + Declaration, SubHeading, Kind, Access, isInSystemHeader(Decl)); CXXClassRecord->Bases = getBases(Decl); @@ -624,7 +624,7 @@ bool ExtractAPIVisitorBase::VisitCXXMethodDecl( FunctionTemplateDecl *TemplateDecl = Decl->getDescribedFunctionTemplate(); API.addCXXMethodTemplate( API.findRecordForUSR(ParentUSR), Decl->getName(), USR, Loc, - AvailabilitySet(Decl), Comment, + AvailabilityInfo::createFromDecl(Decl), Comment, DeclarationFragmentsBuilder::getFragmentsForFunctionTemplate( TemplateDecl), SubHeading, DeclarationFragmentsBuilder::getFunctionSignature(Decl), @@ -632,24 +632,27 @@ bool ExtractAPIVisitorBase::VisitCXXMethodDecl( Template(TemplateDecl), isInSystemHeader(Decl)); } else if (Decl->getTemplateSpecializationInfo()) API.addCXXMethodTemplateSpec( - Parent, Decl->getName(), USR, Loc, AvailabilitySet(Decl), Comment, + Parent, Decl->getName(), USR, Loc, + AvailabilityInfo::createFromDecl(Decl), Comment, DeclarationFragmentsBuilder:: getFragmentsForFunctionTemplateSpecialization(Decl), SubHeading, Signature, Access, isInSystemHeader(Decl)); else if (Decl->isOverloadedOperator()) API.addCXXInstanceMethod( Parent, API.copyString(Decl->getNameAsString()), USR, Loc, - AvailabilitySet(Decl), Comment, + AvailabilityInfo::createFromDecl(Decl), Comment, DeclarationFragmentsBuilder::getFragmentsForOverloadedOperator(Decl), SubHeading, Signature, Access, isInSystemHeader(Decl)); else if (Decl->isStatic()) API.addCXXStaticMethod( - Parent, Decl->getName(), USR, Loc, AvailabilitySet(Decl), Comment, + Parent, Decl->getName(), USR, Loc, + AvailabilityInfo::createFromDecl(Decl), Comment, DeclarationFragmentsBuilder::getFragmentsForCXXMethod(Decl), SubHeading, Signature, Access, isInSystemHeader(Decl)); else API.addCXXInstanceMethod( - Parent, Decl->getName(), USR, Loc, AvailabilitySet(Decl), Comment, + Parent, Decl->getName(), USR, Loc, + AvailabilityInfo::createFromDecl(Decl), Comment, DeclarationFragmentsBuilder::getFragmentsForCXXMethod(Decl), SubHeading, Signature, Access, isInSystemHeader(Decl)); @@ -678,13 +681,12 @@ bool ExtractAPIVisitorBase::VisitCXXConstructorDecl( FunctionSignature Signature = DeclarationFragmentsBuilder::getFunctionSignature(Decl); AccessControl Access = DeclarationFragmentsBuilder::getAccessControl(Decl); - SmallString<128> ParentUSR; index::generateUSRForDecl(dyn_cast(Decl->getDeclContext()), ParentUSR); API.addCXXInstanceMethod(API.findRecordForUSR(ParentUSR), Name, USR, Loc, - AvailabilitySet(Decl), Comment, Declaration, - SubHeading, Signature, Access, + AvailabilityInfo::createFromDecl(Decl), Comment, + Declaration, SubHeading, Signature, Access, isInSystemHeader(Decl)); return true; } @@ -711,13 +713,12 @@ bool 
ExtractAPIVisitorBase::VisitCXXDestructorDecl( FunctionSignature Signature = DeclarationFragmentsBuilder::getFunctionSignature(Decl); AccessControl Access = DeclarationFragmentsBuilder::getAccessControl(Decl); - SmallString<128> ParentUSR; index::generateUSRForDecl(dyn_cast(Decl->getDeclContext()), ParentUSR); API.addCXXInstanceMethod(API.findRecordForUSR(ParentUSR), Name, USR, Loc, - AvailabilitySet(Decl), Comment, Declaration, - SubHeading, Signature, Access, + AvailabilityInfo::createFromDecl(Decl), Comment, + Declaration, SubHeading, Signature, Access, isInSystemHeader(Decl)); return true; } @@ -740,8 +741,9 @@ bool ExtractAPIVisitorBase::VisitConceptDecl(const ConceptDecl *Decl) { DeclarationFragmentsBuilder::getFragmentsForConcept(Decl); DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); - API.addConcept(Name, USR, Loc, AvailabilitySet(Decl), Comment, Declaration, - SubHeading, Template(Decl), isInSystemHeader(Decl)); + API.addConcept(Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), + Comment, Declaration, SubHeading, Template(Decl), + isInSystemHeader(Decl)); return true; } @@ -768,8 +770,9 @@ bool ExtractAPIVisitorBase::VisitClassTemplateSpecializationDecl( APIRecord *Parent = determineParentRecord(Decl->getDeclContext()); auto *ClassTemplateSpecializationRecord = API.addClassTemplateSpecialization( - Parent, Name, USR, Loc, AvailabilitySet(Decl), Comment, Declaration, - SubHeading, DeclarationFragmentsBuilder::getAccessControl(Decl), + Parent, Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Comment, + Declaration, SubHeading, + DeclarationFragmentsBuilder::getAccessControl(Decl), isInSystemHeader(Decl)); ClassTemplateSpecializationRecord->Bases = getBases(Decl); @@ -797,12 +800,11 @@ bool ExtractAPIVisitorBase:: getFragmentsForClassTemplatePartialSpecialization(Decl); DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); - APIRecord *Parent = determineParentRecord(Decl->getDeclContext()); auto *ClassTemplatePartialSpecRecord = API.addClassTemplatePartialSpecialization( - Parent, Name, USR, Loc, AvailabilitySet(Decl), Comment, Declaration, - SubHeading, Template(Decl), + Parent, Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), + Comment, Declaration, SubHeading, Template(Decl), DeclarationFragmentsBuilder::getAccessControl(Decl), isInSystemHeader(Decl)); @@ -845,12 +847,13 @@ bool ExtractAPIVisitorBase::VisitVarTemplateDecl( ParentUSR); if (Decl->getDeclContext()->getDeclKind() == Decl::CXXRecord) API.addCXXFieldTemplate(API.findRecordForUSR(ParentUSR), Name, USR, Loc, - AvailabilitySet(Decl), Comment, Declaration, - SubHeading, + AvailabilityInfo::createFromDecl(Decl), Comment, + Declaration, SubHeading, DeclarationFragmentsBuilder::getAccessControl(Decl), Template(Decl), isInSystemHeader(Decl)); else - API.addGlobalVariableTemplate(Name, USR, Loc, AvailabilitySet(Decl), + API.addGlobalVariableTemplate(Name, USR, Loc, + AvailabilityInfo::createFromDecl(Decl), Linkage, Comment, Declaration, SubHeading, Template(Decl), isInSystemHeader(Decl)); return true; @@ -880,10 +883,9 @@ bool ExtractAPIVisitorBase::VisitVarTemplateSpecializationDecl( Decl); DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); - API.addGlobalVariableTemplateSpecialization( - Name, USR, Loc, AvailabilitySet(Decl), Linkage, Comment, Declaration, - SubHeading, isInSystemHeader(Decl)); + Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Linkage, Comment, + Declaration, SubHeading, 
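The constructor and destructor visitors above attach the method to its parent by generating the parent's USR and looking the record up, rather than holding a pointer. A sketch of that lookup scheme with a plain std::map (the real code uses index::generateUSRForDecl and a DenseMap; names below are stand-ins):

#include <map>
#include <string>

struct APIRecord {
  std::string Name;
};

std::map<std::string, APIRecord *> USRBasedLookupTable;

APIRecord *findRecordForUSR(const std::string &USR) {
  auto It = USRBasedLookupTable.find(USR);
  return It == USRBasedLookupTable.end() ? nullptr : It->second;
}

int main() {
  APIRecord Class{"MyClass"};
  USRBasedLookupTable["c:@S@MyClass"] = &Class; // USR derived from the decl
  // A constructor/destructor visitor would now attach its record here:
  APIRecord *Parent = findRecordForUSR("c:@S@MyClass");
  (void)Parent;
}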
isInSystemHeader(Decl)); return true; } @@ -910,10 +912,9 @@ bool ExtractAPIVisitorBase::VisitVarTemplatePartialSpecializationDecl( getFragmentsForVarTemplatePartialSpecialization(Decl); DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); - API.addGlobalVariableTemplatePartialSpecialization( - Name, USR, Loc, AvailabilitySet(Decl), Linkage, Comment, Declaration, - SubHeading, Template(Decl), isInSystemHeader(Decl)); + Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Linkage, Comment, + Declaration, SubHeading, Template(Decl), isInSystemHeader(Decl)); return true; } @@ -942,9 +943,8 @@ bool ExtractAPIVisitorBase::VisitFunctionTemplateDecl( FunctionSignature Signature = DeclarationFragmentsBuilder::getFunctionSignature( Decl->getTemplatedDecl()); - API.addGlobalFunctionTemplate( - Name, USR, Loc, AvailabilitySet(Decl), Linkage, Comment, + Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Linkage, Comment, DeclarationFragmentsBuilder::getFragmentsForFunctionTemplate(Decl), SubHeading, Signature, Template(Decl), isInSystemHeader(Decl)); @@ -983,8 +983,8 @@ bool ExtractAPIVisitorBase::VisitObjCInterfaceDecl( } ObjCInterfaceRecord *ObjCInterfaceRecord = API.addObjCInterface( - Name, USR, Loc, AvailabilitySet(Decl), Linkage, Comment, Declaration, - SubHeading, SuperClass, isInSystemHeader(Decl)); + Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Linkage, Comment, + Declaration, SubHeading, SuperClass, isInSystemHeader(Decl)); // Record all methods (selectors). This doesn't include automatically // synthesized property methods. @@ -1023,9 +1023,9 @@ bool ExtractAPIVisitorBase::VisitObjCProtocolDecl( DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); - ObjCProtocolRecord *ObjCProtocolRecord = - API.addObjCProtocol(Name, USR, Loc, AvailabilitySet(Decl), Comment, - Declaration, SubHeading, isInSystemHeader(Decl)); + ObjCProtocolRecord *ObjCProtocolRecord = API.addObjCProtocol( + Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Comment, + Declaration, SubHeading, isInSystemHeader(Decl)); getDerivedExtractAPIVisitor().recordObjCMethods(ObjCProtocolRecord, Decl->methods()); @@ -1080,7 +1080,8 @@ bool ExtractAPIVisitorBase::VisitTypedefNameDecl( TypedefUnderlyingTypeResolver(Context).getSymbolReferenceForType(Type, API); - API.addTypedef(Name, USR, Loc, AvailabilitySet(Decl), Comment, + API.addTypedef(Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), + Comment, DeclarationFragmentsBuilder::getFragmentsForTypedef(Decl), DeclarationFragmentsBuilder::getSubHeading(Decl), SymRef, isInSystemHeader(Decl)); @@ -1122,8 +1123,9 @@ bool ExtractAPIVisitorBase::VisitObjCCategoryDecl( } ObjCCategoryRecord *ObjCCategoryRecord = API.addObjCCategory( - Name, USR, Loc, AvailabilitySet(Decl), Comment, Declaration, SubHeading, - Interface, isInSystemHeader(Decl), IsFromExternalModule); + Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Comment, + Declaration, SubHeading, Interface, isInSystemHeader(Decl), + IsFromExternalModule); getDerivedExtractAPIVisitor().recordObjCMethods(ObjCCategoryRecord, Decl->methods()); @@ -1160,9 +1162,9 @@ void ExtractAPIVisitorBase::recordEnumConstants( DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Constant); - API.addEnumConstant(EnumRecord, Name, USR, Loc, AvailabilitySet(Constant), - Comment, Declaration, SubHeading, - isInSystemHeader(Constant)); + API.addEnumConstant(EnumRecord, Name, USR, Loc, + AvailabilityInfo::createFromDecl(Constant), Comment, + 
Declaration, SubHeading, isInSystemHeader(Constant)); } } @@ -1189,9 +1191,9 @@ void ExtractAPIVisitorBase::recordStructFields( DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Field); - API.addStructField(StructRecord, Name, USR, Loc, AvailabilitySet(Field), - Comment, Declaration, SubHeading, - isInSystemHeader(Field)); + API.addStructField(StructRecord, Name, USR, Loc, + AvailabilityInfo::createFromDecl(Field), Comment, + Declaration, SubHeading, isInSystemHeader(Field)); } } @@ -1223,8 +1225,8 @@ bool ExtractAPIVisitorBase::VisitFieldDecl(const FieldDecl *Decl) { index::generateUSRForDecl(dyn_cast(Decl->getDeclContext()), ParentUSR); API.addCXXField(API.findRecordForUSR(ParentUSR), Name, USR, Loc, - AvailabilitySet(Decl), Comment, Declaration, SubHeading, - Access, isInSystemHeader(Decl)); + AvailabilityInfo::createFromDecl(Decl), Comment, Declaration, + SubHeading, Access, isInSystemHeader(Decl)); return true; } @@ -1255,13 +1257,13 @@ bool ExtractAPIVisitorBase::VisitCXXConversionDecl( ParentUSR); if (Decl->isStatic()) API.addCXXStaticMethod(API.findRecordForUSR(ParentUSR), Name, USR, Loc, - AvailabilitySet(Decl), Comment, Declaration, - SubHeading, Signature, Access, + AvailabilityInfo::createFromDecl(Decl), Comment, + Declaration, SubHeading, Signature, Access, isInSystemHeader(Decl)); else API.addCXXInstanceMethod(API.findRecordForUSR(ParentUSR), Name, USR, Loc, - AvailabilitySet(Decl), Comment, Declaration, - SubHeading, Signature, Access, + AvailabilityInfo::createFromDecl(Decl), Comment, + Declaration, SubHeading, Signature, Access, isInSystemHeader(Decl)); return true; } @@ -1295,8 +1297,9 @@ void ExtractAPIVisitorBase::recordObjCMethods( FunctionSignature Signature = DeclarationFragmentsBuilder::getFunctionSignature(Method); - API.addObjCMethod(Container, Name, USR, Loc, AvailabilitySet(Method), - Comment, Declaration, SubHeading, Signature, + API.addObjCMethod(Container, Name, USR, Loc, + AvailabilityInfo::createFromDecl(Method), Comment, + Declaration, SubHeading, Signature, Method->isInstanceMethod(), isInSystemHeader(Method)); } } @@ -1334,8 +1337,8 @@ void ExtractAPIVisitorBase::recordObjCProperties( Attributes |= ObjCPropertyRecord::ReadOnly; API.addObjCProperty( - Container, Name, USR, Loc, AvailabilitySet(Property), Comment, - Declaration, SubHeading, + Container, Name, USR, Loc, AvailabilityInfo::createFromDecl(Property), + Comment, Declaration, SubHeading, static_cast(Attributes), GetterName, SetterName, Property->isOptional(), !(Property->getPropertyAttributes() & @@ -1370,9 +1373,9 @@ void ExtractAPIVisitorBase::recordObjCInstanceVariables( ObjCInstanceVariableRecord::AccessControl Access = Ivar->getCanonicalAccessControl(); - API.addObjCInstanceVariable(Container, Name, USR, Loc, - AvailabilitySet(Ivar), Comment, Declaration, - SubHeading, Access, isInSystemHeader(Ivar)); + API.addObjCInstanceVariable( + Container, Name, USR, Loc, AvailabilityInfo::createFromDecl(Ivar), + Comment, Declaration, SubHeading, Access, isInSystemHeader(Ivar)); } } diff --git a/clang/lib/ExtractAPI/API.cpp b/clang/lib/ExtractAPI/API.cpp index 71c655ba5b5b3..a7709fff85ffe 100644 --- a/clang/lib/ExtractAPI/API.cpp +++ b/clang/lib/ExtractAPI/API.cpp @@ -45,7 +45,7 @@ RecordTy *addTopLevelRecord(DenseMap &USRLookupTable, NamespaceRecord * APISet::addNamespace(APIRecord *Parent, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availability, + PresumedLoc Loc, AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, 
DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) { @@ -61,17 +61,17 @@ APISet::addNamespace(APIRecord *Parent, StringRef Name, StringRef USR, GlobalVariableRecord * APISet::addGlobalVar(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Fragments, DeclarationFragments SubHeading, bool IsFromSystemHeader) { return addTopLevelRecord(USRBasedLookupTable, GlobalVariables, USR, Name, Loc, - std::move(Availabilities), Linkage, Comment, - Fragments, SubHeading, IsFromSystemHeader); + std::move(Availability), Linkage, Comment, Fragments, + SubHeading, IsFromSystemHeader); } GlobalVariableTemplateRecord *APISet::addGlobalVariableTemplate( StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, bool IsFromSystemHeader) { @@ -83,19 +83,18 @@ GlobalVariableTemplateRecord *APISet::addGlobalVariableTemplate( GlobalFunctionRecord *APISet::addGlobalFunction( StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Fragments, DeclarationFragments SubHeading, FunctionSignature Signature, bool IsFromSystemHeader) { return addTopLevelRecord(USRBasedLookupTable, GlobalFunctions, USR, Name, Loc, - std::move(Availabilities), Linkage, Comment, - Fragments, SubHeading, Signature, - IsFromSystemHeader); + std::move(Availability), Linkage, Comment, Fragments, + SubHeading, Signature, IsFromSystemHeader); } GlobalFunctionTemplateRecord *APISet::addGlobalFunctionTemplate( StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, Template Template, bool IsFromSystemHeader) { @@ -108,7 +107,7 @@ GlobalFunctionTemplateRecord *APISet::addGlobalFunctionTemplate( GlobalFunctionTemplateSpecializationRecord * APISet::addGlobalFunctionTemplateSpecialization( StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, bool IsFromSystemHeader) { @@ -120,14 +119,14 @@ APISet::addGlobalFunctionTemplateSpecialization( EnumConstantRecord *APISet::addEnumConstant(EnumRecord *Enum, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) { auto Record = std::make_unique( - USR, Name, Loc, std::move(Availabilities), Comment, Declaration, - SubHeading, IsFromSystemHeader); + USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, + IsFromSystemHeader); Record->ParentInformation = APIRecord::HierarchyInformation( Enum->USR, Enum->Name, Enum->getKind(), Enum); USRBasedLookupTable.insert({USR, Record.get()}); @@ -135,26 +134,26 @@ EnumConstantRecord 
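These APISet adders all funnel through the addTopLevelRecord helper, which forwards its trailing arguments to the record constructor, registers the record under its USR, and returns a non-owning pointer. A simplified, compilable model of that helper (the ownership container is an assumption; the real helper stores into per-kind record maps):

#include <map>
#include <memory>
#include <string>
#include <utility>
#include <vector>

struct APIRecord {
  std::string USR;
  virtual ~APIRecord() = default;
};

struct GlobalVariableRecord : APIRecord {
  std::string Name;
  explicit GlobalVariableRecord(std::string Name) : Name(std::move(Name)) {}
};

// Forwards Args to RecordTy's constructor, registers the record by USR, and
// returns a raw pointer, mirroring the shape used in API.cpp.
template <typename RecordTy, typename... Args>
RecordTy *addTopLevelRecord(std::map<std::string, APIRecord *> &LookupTable,
                            std::vector<std::unique_ptr<APIRecord>> &Storage,
                            std::string USR, Args &&...A) {
  auto Record = std::make_unique<RecordTy>(std::forward<Args>(A)...);
  Record->USR = USR;
  RecordTy *Raw = Record.get();
  LookupTable.insert({std::move(USR), Raw});
  Storage.push_back(std::move(Record));
  return Raw;
}

int main() {
  std::map<std::string, APIRecord *> USRBasedLookupTable;
  std::vector<std::unique_ptr<APIRecord>> Storage;
  auto *GV = addTopLevelRecord<GlobalVariableRecord>(USRBasedLookupTable,
                                                     Storage, "c:@gv", "gv");
  (void)GV;
}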
*APISet::addEnumConstant(EnumRecord *Enum, StringRef Name, } EnumRecord *APISet::addEnum(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) { return addTopLevelRecord(USRBasedLookupTable, Enums, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, IsFromSystemHeader); } StructFieldRecord *APISet::addStructField(StructRecord *Struct, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) { auto Record = std::make_unique( - USR, Name, Loc, std::move(Availabilities), Comment, Declaration, - SubHeading, IsFromSystemHeader); + USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, + IsFromSystemHeader); Record->ParentInformation = APIRecord::HierarchyInformation( Struct->USR, Struct->Name, Struct->getKind(), Struct); USRBasedLookupTable.insert({USR, Record.get()}); @@ -162,37 +161,37 @@ StructFieldRecord *APISet::addStructField(StructRecord *Struct, StringRef Name, } StructRecord *APISet::addStruct(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) { return addTopLevelRecord(USRBasedLookupTable, Structs, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, IsFromSystemHeader); } StaticFieldRecord * APISet::addStaticField(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference Context, AccessControl Access, bool IsFromSystemHeader) { return addTopLevelRecord(USRBasedLookupTable, StaticFields, USR, Name, Loc, - std::move(Availabilities), Linkage, Comment, + std::move(Availability), Linkage, Comment, Declaration, SubHeading, Context, Access, IsFromSystemHeader); } CXXFieldRecord * APISet::addCXXField(APIRecord *CXXClass, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availabilities, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, AccessControl Access, bool IsFromSystemHeader) { auto *Record = addTopLevelRecord( - USRBasedLookupTable, CXXFields, USR, Name, Loc, std::move(Availabilities), + USRBasedLookupTable, CXXFields, USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, Access, IsFromSystemHeader); Record->ParentInformation = APIRecord::HierarchyInformation( CXXClass->USR, CXXClass->Name, CXXClass->getKind(), CXXClass); @@ -201,7 +200,7 @@ APISet::addCXXField(APIRecord *CXXClass, StringRef Name, StringRef USR, CXXFieldTemplateRecord *APISet::addCXXFieldTemplate( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, AccessControl Access, Template Template, 
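Child records such as enum constants and struct fields are handled differently from the top-level adders: the freshly built record first stores a HierarchyInformation handle back to its parent, then is registered. A reduced model with stand-in types:

#include <memory>
#include <string>

struct APIRecord;

struct HierarchyInformation {
  std::string ParentUSR;
  std::string ParentName;
  APIRecord *ParentRecord = nullptr;
};

struct APIRecord {
  std::string USR;
  std::string Name;
  HierarchyInformation ParentInformation;
};

// Reduced addEnumConstant: link the child back to its parent before handing
// it off; the real code also inserts it into USRBasedLookupTable.
std::unique_ptr<APIRecord> addEnumConstant(APIRecord *Enum, std::string USR,
                                           std::string Name) {
  auto Record = std::make_unique<APIRecord>();
  Record->USR = std::move(USR);
  Record->Name = std::move(Name);
  Record->ParentInformation = {Enum->USR, Enum->Name, Enum};
  return Record;
}

int main() {
  APIRecord Enum{"c:@E@Color", "Color", {}};
  auto Constant = addEnumConstant(&Enum, "c:@E@Color@Red", "Red");
  (void)Constant;
}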
bool IsFromSystemHeader) { auto *Record = @@ -216,14 +215,13 @@ CXXFieldTemplateRecord *APISet::addCXXFieldTemplate( CXXClassRecord * APISet::addCXXClass(APIRecord *Parent, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availabilities, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, APIRecord::RecordKind Kind, AccessControl Access, bool IsFromSystemHeader) { - auto *Record = - addTopLevelRecord(USRBasedLookupTable, CXXClasses, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, - SubHeading, Kind, Access, IsFromSystemHeader); + auto *Record = addTopLevelRecord( + USRBasedLookupTable, CXXClasses, USR, Name, Loc, std::move(Availability), + Comment, Declaration, SubHeading, Kind, Access, IsFromSystemHeader); if (Parent) Record->ParentInformation = APIRecord::HierarchyInformation( Parent->USR, Parent->Name, Parent->getKind(), Parent); @@ -232,7 +230,7 @@ APISet::addCXXClass(APIRecord *Parent, StringRef Name, StringRef USR, ClassTemplateRecord *APISet::addClassTemplate( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, AccessControl Access, bool IsFromSystemHeader) { auto *Record = @@ -247,7 +245,7 @@ ClassTemplateRecord *APISet::addClassTemplate( ClassTemplateSpecializationRecord *APISet::addClassTemplateSpecialization( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, AccessControl Access, bool IsFromSystemHeader) { auto *Record = @@ -263,7 +261,7 @@ ClassTemplateSpecializationRecord *APISet::addClassTemplateSpecialization( ClassTemplatePartialSpecializationRecord * APISet::addClassTemplatePartialSpecialization( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, AccessControl Access, bool IsFromSystemHeader) { auto *Record = addTopLevelRecord( @@ -279,7 +277,7 @@ APISet::addClassTemplatePartialSpecialization( GlobalVariableTemplateSpecializationRecord * APISet::addGlobalVariableTemplateSpecialization( StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) { return addTopLevelRecord(USRBasedLookupTable, @@ -291,7 +289,7 @@ APISet::addGlobalVariableTemplateSpecialization( GlobalVariableTemplatePartialSpecializationRecord * APISet::addGlobalVariableTemplatePartialSpecialization( StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, Template Template, bool IsFromSystemHeader) { @@ -302,7 +300,8 @@ APISet::addGlobalVariableTemplatePartialSpecialization( } ConceptRecord *APISet::addConcept(StringRef Name, StringRef USR, - 
PresumedLoc Loc, AvailabilitySet Availability, + PresumedLoc Loc, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, @@ -314,7 +313,7 @@ ConceptRecord *APISet::addConcept(StringRef Name, StringRef USR, CXXMethodRecord *APISet::addCXXInstanceMethod( APIRecord *CXXClassRecord, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader) { @@ -331,7 +330,7 @@ CXXMethodRecord *APISet::addCXXInstanceMethod( CXXMethodRecord *APISet::addCXXStaticMethod( APIRecord *CXXClassRecord, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader) { @@ -348,7 +347,7 @@ CXXMethodRecord *APISet::addCXXStaticMethod( CXXMethodTemplateRecord *APISet::addCXXMethodTemplate( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, Template Template, bool IsFromSystemHeader) { @@ -364,7 +363,7 @@ CXXMethodTemplateRecord *APISet::addCXXMethodTemplate( CXXMethodTemplateSpecializationRecord *APISet::addCXXMethodTemplateSpec( APIRecord *Parent, StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availability, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, AccessControl Access, bool IsFromSystemHeader) { @@ -381,14 +380,14 @@ CXXMethodTemplateSpecializationRecord *APISet::addCXXMethodTemplateSpec( ObjCCategoryRecord *APISet::addObjCCategory( StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference Interface, bool IsFromSystemHeader, bool IsFromExternalModule) { // Create the category record. 
auto *Record = addTopLevelRecord(USRBasedLookupTable, ObjCCategories, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, Interface, IsFromSystemHeader); Record->IsFromExternalModule = IsFromExternalModule; @@ -402,31 +401,31 @@ ObjCCategoryRecord *APISet::addObjCCategory( ObjCInterfaceRecord * APISet::addObjCInterface(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, LinkageInfo Linkage, + AvailabilityInfo Availability, LinkageInfo Linkage, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference SuperClass, bool IsFromSystemHeader) { return addTopLevelRecord(USRBasedLookupTable, ObjCInterfaces, USR, Name, Loc, - std::move(Availabilities), Linkage, Comment, + std::move(Availability), Linkage, Comment, Declaration, SubHeading, SuperClass, IsFromSystemHeader); } ObjCMethodRecord *APISet::addObjCMethod( ObjCContainerRecord *Container, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availabilities, const DocComment &Comment, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, FunctionSignature Signature, bool IsInstanceMethod, bool IsFromSystemHeader) { std::unique_ptr Record; if (IsInstanceMethod) Record = std::make_unique( - USR, Name, Loc, std::move(Availabilities), Comment, Declaration, + USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, Signature, IsFromSystemHeader); else Record = std::make_unique( - USR, Name, Loc, std::move(Availabilities), Comment, Declaration, + USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, Signature, IsFromSystemHeader); Record->ParentInformation = APIRecord::HierarchyInformation( @@ -437,7 +436,7 @@ ObjCMethodRecord *APISet::addObjCMethod( ObjCPropertyRecord *APISet::addObjCProperty( ObjCContainerRecord *Container, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availabilities, const DocComment &Comment, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, ObjCPropertyRecord::AttributeKind Attributes, StringRef GetterName, StringRef SetterName, bool IsOptional, bool IsInstanceProperty, @@ -445,12 +444,12 @@ ObjCPropertyRecord *APISet::addObjCProperty( std::unique_ptr Record; if (IsInstanceProperty) Record = std::make_unique( - USR, Name, Loc, std::move(Availabilities), Comment, Declaration, + USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, Attributes, GetterName, SetterName, IsOptional, IsFromSystemHeader); else Record = std::make_unique( - USR, Name, Loc, std::move(Availabilities), Comment, Declaration, + USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, Attributes, GetterName, SetterName, IsOptional, IsFromSystemHeader); Record->ParentInformation = APIRecord::HierarchyInformation( @@ -461,12 +460,12 @@ ObjCPropertyRecord *APISet::addObjCProperty( ObjCInstanceVariableRecord *APISet::addObjCInstanceVariable( ObjCContainerRecord *Container, StringRef Name, StringRef USR, - PresumedLoc Loc, AvailabilitySet Availabilities, const DocComment &Comment, + PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, ObjCInstanceVariableRecord::AccessControl Access, bool IsFromSystemHeader) { auto Record = 
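addObjCMethod above (and addObjCProperty just below) picks the concrete record type from a boolean and builds it through a unique_ptr. A cut-down version of that dispatch:

#include <memory>

struct ObjCMethodRecord {
  virtual ~ObjCMethodRecord() = default;
};
struct ObjCInstanceMethodRecord : ObjCMethodRecord {};
struct ObjCClassMethodRecord : ObjCMethodRecord {};

// One entry point, two concrete record types, selected by the flag.
std::unique_ptr<ObjCMethodRecord> makeObjCMethod(bool IsInstanceMethod) {
  if (IsInstanceMethod)
    return std::make_unique<ObjCInstanceMethodRecord>();
  return std::make_unique<ObjCClassMethodRecord>();
}

int main() {
  auto M = makeObjCMethod(/*IsInstanceMethod=*/true);
  (void)M;
}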
std::make_unique( - USR, Name, Loc, std::move(Availabilities), Comment, Declaration, - SubHeading, Access, IsFromSystemHeader); + USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, + Access, IsFromSystemHeader); Record->ParentInformation = APIRecord::HierarchyInformation( Container->USR, Container->Name, Container->getKind(), Container); USRBasedLookupTable.insert({USR, Record.get()}); @@ -475,13 +474,13 @@ ObjCInstanceVariableRecord *APISet::addObjCInstanceVariable( ObjCProtocolRecord *APISet::addObjCProtocol(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader) { return addTopLevelRecord(USRBasedLookupTable, ObjCProtocols, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, IsFromSystemHeader); } @@ -496,12 +495,12 @@ APISet::addMacroDefinition(StringRef Name, StringRef USR, PresumedLoc Loc, TypedefRecord * APISet::addTypedef(StringRef Name, StringRef USR, PresumedLoc Loc, - AvailabilitySet Availabilities, const DocComment &Comment, + AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, SymbolReference UnderlyingType, bool IsFromSystemHeader) { return addTopLevelRecord(USRBasedLookupTable, Typedefs, USR, Name, Loc, - std::move(Availabilities), Comment, Declaration, + std::move(Availability), Comment, Declaration, SubHeading, UnderlyingType, IsFromSystemHeader); } diff --git a/clang/lib/ExtractAPI/AvailabilityInfo.cpp b/clang/lib/ExtractAPI/AvailabilityInfo.cpp index 1df852fdbf930..18e4d16b45bb6 100644 --- a/clang/lib/ExtractAPI/AvailabilityInfo.cpp +++ b/clang/lib/ExtractAPI/AvailabilityInfo.cpp @@ -1,50 +1,35 @@ #include "clang/ExtractAPI/AvailabilityInfo.h" +#include "clang/AST/ASTContext.h" #include "clang/AST/Attr.h" +#include "clang/Basic/TargetInfo.h" #include "llvm/ADT/STLExtras.h" -using namespace clang; -using namespace extractapi; +using namespace clang::extractapi; +using namespace llvm; -AvailabilitySet::AvailabilitySet(const Decl *Decl) { - // Collect availability attributes from all redeclrations. - for (const auto *RD : Decl->redecls()) { - if (const auto *A = RD->getAttr()) { - if (!A->isImplicit()) { - this->Availabilities.clear(); - UnconditionallyUnavailable = true; - } - } +AvailabilityInfo AvailabilityInfo::createFromDecl(const Decl *Decl) { + ASTContext &Context = Decl->getASTContext(); + StringRef PlatformName = Context.getTargetInfo().getPlatformName(); + AvailabilityInfo Availability; - if (const auto *A = RD->getAttr()) { - if (!A->isImplicit()) { - this->Availabilities.clear(); - UnconditionallyDeprecated = true; - } + // Collect availability attributes from all redeclarations. 
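+  // For illustration, assuming a macOS target: given
+  //   void f(void) __attribute__((availability(macos, introduced=10.15)))
+  //                __attribute__((availability(ios, introduced=13.0)));
+  // only the macos attribute is recorded below; the ios attribute fails the
+  // platform-name check and is skipped.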
+ for (const auto *RD : Decl->redecls()) { + for (const auto *A : RD->specific_attrs()) { + if (A->getPlatform()->getName() != PlatformName) + continue; + Availability = + AvailabilityInfo(A->getPlatform()->getName(), A->getIntroduced(), + A->getDeprecated(), A->getObsoleted(), false, false); + break; } - for (const auto *Attr : RD->specific_attrs()) { - StringRef Domain = Attr->getPlatform()->getName(); - auto *Availability = - llvm::find_if(Availabilities, [Domain](const AvailabilityInfo &Info) { - return Domain.equals(Info.Domain); - }); - if (Availability != Availabilities.end()) { - // Get the highest introduced version for all redeclarations. - if (Availability->Introduced < Attr->getIntroduced()) - Availability->Introduced = Attr->getIntroduced(); + if (const auto *A = RD->getAttr()) + if (!A->isImplicit()) + Availability.UnconditionallyUnavailable = true; - // Get the lowest deprecated version for all redeclarations. - if (Availability->Deprecated > Attr->getDeprecated()) - Availability->Deprecated = Attr->getDeprecated(); - - // Get the lowest obsoleted version for all redeclarations. - if (Availability->Obsoleted > Attr->getObsoleted()) - Availability->Obsoleted = Attr->getObsoleted(); - } else { - Availabilities.emplace_back(Domain, Attr->getIntroduced(), - Attr->getDeprecated(), Attr->getObsoleted(), - Attr->getUnavailable()); - } - } + if (const auto *A = RD->getAttr()) + if (!A->isImplicit()) + Availability.UnconditionallyDeprecated = true; } + return Availability; } diff --git a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp index 53b22297ee0ea..ee424a16fc1cf 100644 --- a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp +++ b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp @@ -147,46 +147,40 @@ Object serializeSourceRange(const PresumedLoc &BeginLoc, /// Serialize the availability attributes of a symbol. /// /// Availability information contains the introduced, deprecated, and obsoleted -/// versions of the symbol for a given domain (roughly corresponds to a -/// platform) as semantic versions, if not default. Availability information -/// also contains flags to indicate if the symbol is unconditionally unavailable -/// or deprecated, i.e. \c __attribute__((unavailable)) and \c -/// __attribute__((deprecated)). +/// versions of the symbol as semantic versions, if not default. +/// Availability information also contains flags to indicate if the symbol is +/// unconditionally unavailable or deprecated, +/// i.e. \c __attribute__((unavailable)) and \c __attribute__((deprecated)). /// /// \returns \c std::nullopt if the symbol has default availability attributes, -/// or an \c Array containing the formatted availability information. -std::optional -serializeAvailability(const AvailabilitySet &Availabilities) { - if (Availabilities.isDefault()) +/// or an \c Array containing an object with the formatted availability +/// information. 
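+///
+/// For example, a symbol introduced in iOS 13.0 would serialize roughly as:
+/// \code
+///   { "domain": "ios",
+///     "introduced": { "major": 13, "minor": 0, "patch": 0 } }
+/// \endcode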
+std::optional serializeAvailability(const AvailabilityInfo &Avail) { + if (Avail.isDefault()) return std::nullopt; + Object Availability; Array AvailabilityArray; - - if (Availabilities.isUnconditionallyDeprecated()) { + Availability["domain"] = Avail.Domain; + serializeObject(Availability, "introduced", + serializeSemanticVersion(Avail.Introduced)); + serializeObject(Availability, "deprecated", + serializeSemanticVersion(Avail.Deprecated)); + serializeObject(Availability, "obsoleted", + serializeSemanticVersion(Avail.Obsoleted)); + if (Avail.isUnconditionallyDeprecated()) { Object UnconditionallyDeprecated; UnconditionallyDeprecated["domain"] = "*"; UnconditionallyDeprecated["isUnconditionallyDeprecated"] = true; AvailabilityArray.emplace_back(std::move(UnconditionallyDeprecated)); } - - // Note unconditionally unavailable records are skipped. - - for (const auto &AvailInfo : Availabilities) { - Object Availability; - Availability["domain"] = AvailInfo.Domain; - if (AvailInfo.Unavailable) - Availability["isUnconditionallyUnavailable"] = true; - else { - serializeObject(Availability, "introduced", - serializeSemanticVersion(AvailInfo.Introduced)); - serializeObject(Availability, "deprecated", - serializeSemanticVersion(AvailInfo.Deprecated)); - serializeObject(Availability, "obsoleted", - serializeSemanticVersion(AvailInfo.Obsoleted)); - } - AvailabilityArray.emplace_back(std::move(Availability)); + if (Avail.isUnconditionallyUnavailable()) { + Object UnconditionallyUnavailable; + UnconditionallyUnavailable["domain"] = "*"; + UnconditionallyUnavailable["isUnconditionallyUnavailable"] = true; + AvailabilityArray.emplace_back(std::move(UnconditionallyUnavailable)); } - + AvailabilityArray.emplace_back(std::move(Availability)); return AvailabilityArray; } @@ -738,7 +732,7 @@ bool SymbolGraphSerializer::shouldSkip(const APIRecord &Record) const { return true; // Skip unconditionally unavailable symbols - if (Record.Availabilities.isUnconditionallyUnavailable()) + if (Record.Availability.isUnconditionallyUnavailable()) return true; // Filter out symbols prefixed with an underscored as they are understood to @@ -764,7 +758,7 @@ SymbolGraphSerializer::serializeAPIRecord(const RecordTy &Record) const { Obj, "location", serializeSourceLocation(Record.Location, /*IncludeFileURI=*/true)); serializeArray(Obj, "availability", - serializeAvailability(Record.Availabilities)); + serializeAvailability(Record.Availability)); serializeObject(Obj, "docComment", serializeDocComment(Record.Comment)); serializeArray(Obj, "declarationFragments", serializeDeclarationFragments(Record.Declaration)); diff --git a/clang/test/ExtractAPI/availability.c b/clang/test/ExtractAPI/availability.c index 4bda94ba7c2bf..3c1ef5c45b634 100644 --- a/clang/test/ExtractAPI/availability.c +++ b/clang/test/ExtractAPI/availability.c @@ -300,22 +300,6 @@ void e(void) __attribute__((availability(tvos, unavailable))); "minor": 0, "patch": 0 } - }, - { - "domain": "ios", - "introduced": { - "major": 13, - "minor": 0, - "patch": 0 - } - }, - { - "domain": "tvos", - "introduced": { - "major": 15, - "minor": 0, - "patch": 0 - } } ], "declarationFragments": [ @@ -394,10 +378,6 @@ void e(void) __attribute__((availability(tvos, unavailable))); "minor": 0, "patch": 0 } - }, - { - "domain": "tvos", - "isUnconditionallyUnavailable": true } ], "declarationFragments": [ diff --git a/clang/tools/libclang/CXExtractAPI.cpp b/clang/tools/libclang/CXExtractAPI.cpp index 9128e891538a8..05098c96829fc 100644 --- a/clang/tools/libclang/CXExtractAPI.cpp +++ 
b/clang/tools/libclang/CXExtractAPI.cpp @@ -80,8 +80,8 @@ struct LibClangExtractAPIVisitor } ObjCInterfaceRecord *ObjCInterfaceRecord = API.addObjCInterface( - Name, USR, Loc, AvailabilitySet(Decl), Linkage, Comment, Declaration, - SubHeading, SuperClass, isInSystemHeader(Decl)); + Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Linkage, + Comment, Declaration, SubHeading, SuperClass, isInSystemHeader(Decl)); // Record all methods (selectors). This doesn't include automatically // synthesized property methods. From 2c78f3b86007fbf56a6f40b647b5cb757c082215 Mon Sep 17 00:00:00 2001 From: Mariusz Sikora Date: Fri, 19 Jan 2024 12:08:17 +0100 Subject: [PATCH 108/843] [AMDGPU][GFX12] Add tests for flat_atomic_pk (#78683) --- llvm/test/MC/AMDGPU/gfx12_asm_vflat.s | 42 +++++++++++++++++++ .../Disassembler/AMDGPU/gfx12_dasm_vflat.txt | 42 +++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s index bac3655d19ec6..c814b42c12755 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vflat.s @@ -432,6 +432,48 @@ flat_atomic_xor_b64 v[1:2], v[0:1], v[2:3] offset:-64 th:TH_ATOMIC_RETURN flat_atomic_xor_b64 v[1:2], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN // GFX12: encoding: [0x7c,0xc0,0x12,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:-8000000 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85] + +flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:8000000 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a] + +flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +flat_atomic_pk_add_f16 v[0:1], v2 offset:-64 +// GFX12: encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +flat_atomic_pk_add_f16 v[0:1], v2 offset:64 +// GFX12: encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + +flat_atomic_pk_add_f16 v[0:1], v2 +// GFX12: encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] + +flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:-8000000 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85] + +flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:8000000 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a] + +flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] + +flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN +// GFX12: encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] + +flat_atomic_pk_add_bf16 v[0:1], v2 offset:-64 +// GFX12: encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] + +flat_atomic_pk_add_bf16 v[0:1], v2 offset:64 +// GFX12: encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] + +flat_atomic_pk_add_bf16 v[0:1], v2 +// GFX12: encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] + flat_load_b128 v[1:4], v[0:1] offset:-64 // GFX12: encoding: [0x7c,0xc0,0x05,0xec,0x01,0x00,0x00,0x00,0x00,0xc0,0xff,0xff] diff --git 
a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt index b579297235d85..181f2bd646069 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vflat.txt @@ -240,6 +240,48 @@ # GFX12: flat_atomic_xor_b64 v[1:2], v[0:1], v[2:3] offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0xc0,0x12,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] 0x7c,0xc0,0x12,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 +# GFX12: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:-8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85] +0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85 + +# GFX12: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a] +0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a + +# GFX12: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX12: flat_atomic_pk_add_f16 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX12: flat_atomic_pk_add_f16 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX12: flat_atomic_pk_add_f16 v[0:1], v2 offset:64 ; encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX12: flat_atomic_pk_add_f16 v[0:1], v2 ; encoding: [0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] +0x7c,0x40,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00 + +# GFX12: flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:-8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85] +0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0xee,0x85 + +# GFX12: flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:8000000 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a] +0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x00,0x12,0x7a + +# GFX12: flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:-64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0xc0,0xff,0xff + +# GFX12: flat_atomic_pk_add_bf16 v1, v[0:1], v2 offset:64 th:TH_ATOMIC_RETURN ; encoding: [0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x16,0xec,0x01,0x00,0x10,0x01,0x00,0x40,0x00,0x00 + +# GFX12: flat_atomic_pk_add_bf16 v[0:1], v2 offset:-64 ; encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff] +0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0xc0,0xff,0xff + +# GFX12: flat_atomic_pk_add_bf16 v[0:1], v2 offset:64 ; encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00] +0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x40,0x00,0x00 + +# GFX12: flat_atomic_pk_add_bf16 v[0:1], v2 ; encoding: [0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00] +0x7c,0x80,0x16,0xec,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x00 + # GFX12: flat_load_b128 v[1:4], v[0:1] offset:64 ; encoding: [0x7c,0xc0,0x05,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00] 
0x7c,0xc0,0x05,0xec,0x01,0x00,0x00,0x00,0x00,0x40,0x00,0x00

From 12b676de728ee9046ac5fea49e27b9bf1cde4a70 Mon Sep 17 00:00:00 2001
From: Han-Chung Wang
Date: Fri, 19 Jan 2024 03:15:13 -0800
Subject: [PATCH 109/843] [mlir][vector] Drop innermost unit dims on transfer_write. (#78554)

---
 .../Vector/Transforms/VectorTransforms.cpp    | 219 +++++++++++++-----
 ...tor-transfer-collapse-inner-most-dims.mlir |  53 +++++
 2 files changed, 218 insertions(+), 54 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
index bd02c07981466..9c734e8cc2ad1 100644
--- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
+++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp
@@ -1152,8 +1152,78 @@ struct FoldI1Select : public OpRewritePattern<arith::SelectOp> {
   }
 };
 
-// Drop inner most contiguous unit dimensions from transfer_read operand.
-class DropInnerMostUnitDims : public OpRewritePattern<vector::TransferReadOp> {
+/// Returns the number of dims that can be folded away from transfer ops. It
+/// returns a failure if it cannot determine the number of dims to be folded.
+/// Example 1: it returns "2" if `srcType` is memref<512x16x1x1xf32> and
+/// `vectorType` is vector<16x16x1x1xf32>, because the two innermost dims can
+/// be dropped by memref.subview ops.
+/// Example 2: it returns "1" if `srcType` is the same memref type but with
+/// [8192, 16, 8, 1] strides, as only the innermost unit dim then has stride 1.
+static FailureOr<size_t>
+getTransferFoldableInnerUnitDims(MemRefType srcType, VectorType vectorType) {
+  SmallVector<int64_t> srcStrides;
+  int64_t srcOffset;
+  if (failed(getStridesAndOffset(srcType, srcStrides, srcOffset)))
+    return failure();
+
+  // According to vector.transfer_read/write semantics, the vector can be a
+  // slice. Thus, we have to offset the check index with `rankDiff` in
+  // `srcStrides` and source dim sizes.
+  size_t result = 0;
+  int rankDiff = srcType.getRank() - vectorType.getRank();
+  for (int64_t i = 0, e = vectorType.getRank(); i < e; ++i) {
+    // Check that the inner dim size is 1 for both memref type and vector slice.
+    // It can be folded only if they are 1 and the stride is 1.
+    int dim = vectorType.getRank() - i - 1;
+    if (srcStrides[dim + rankDiff] != 1 ||
+        srcType.getDimSize(dim + rankDiff) != 1 ||
+        vectorType.getDimSize(dim) != 1)
+      break;
+    result++;
+  }
+  return result;
+}
+
+/// Returns a MemRef type that drops inner `dimsToDrop` dimensions from
+/// `srcType`. E.g., if `srcType` is memref<512x16x1x1xf32> and `dimsToDrop` is
+/// two, it returns memref<512x16xf32> type.
+static MemRefType getMemRefTypeWithDroppingInnerDims(OpBuilder &builder,
+                                                     MemRefType srcType,
+                                                     size_t dimsToDrop) {
+  MemRefType resultMemrefType;
+  MemRefLayoutAttrInterface layout = srcType.getLayout();
+  if (isa<AffineMapAttr>(layout) && layout.isIdentity()) {
+    return MemRefType::get(srcType.getShape().drop_back(dimsToDrop),
+                           srcType.getElementType(), nullptr,
+                           srcType.getMemorySpace());
+  }
+  MemRefLayoutAttrInterface updatedLayout;
+  if (auto strided = dyn_cast<StridedLayoutAttr>(layout)) {
+    auto strides = llvm::to_vector(strided.getStrides().drop_back(dimsToDrop));
+    updatedLayout = StridedLayoutAttr::get(strided.getContext(),
+                                           strided.getOffset(), strides);
+    return MemRefType::get(srcType.getShape().drop_back(dimsToDrop),
+                           srcType.getElementType(), updatedLayout,
+                           srcType.getMemorySpace());
+  }
+
+  // Non-strided layout case.
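+  // Sketch of the substitution below, assuming dimsToDrop == 1 and a layout
+  // such as affine_map<(d0, d1, d2) -> (d0 * 16 + d1 + d2)>: replacing d2
+  // with the constant 0 and reducing the dim count yields
+  // affine_map<(d0, d1) -> (d0 * 16 + d1)>.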
+ AffineMap map = srcType.getLayout().getAffineMap(); + int numSymbols = map.getNumSymbols(); + for (size_t i = 0; i < dimsToDrop; ++i) { + int dim = srcType.getRank() - i - 1; + map = map.replace(builder.getAffineDimExpr(dim), + builder.getAffineConstantExpr(0), map.getNumDims() - 1, + numSymbols); + } + return MemRefType::get(srcType.getShape().drop_back(dimsToDrop), + srcType.getElementType(), updatedLayout, + srcType.getMemorySpace()); +} + +/// Drop inner most contiguous unit dimensions from transfer_read operand. +class DropInnerMostUnitDimsTransferRead + : public OpRewritePattern { using OpRewritePattern::OpRewritePattern; LogicalResult matchAndRewrite(vector::TransferReadOp readOp, @@ -1177,29 +1247,12 @@ class DropInnerMostUnitDims : public OpRewritePattern { if (targetType.getRank() <= 1) return failure(); - SmallVector srcStrides; - int64_t srcOffset; - if (failed(getStridesAndOffset(srcType, srcStrides, srcOffset))) - return failure(); - - // According to vector.transfer_read semantics, the result can be a slice. - // It pads the indices with `1` starting from beginning. Thus, we have to - // offset the check index with `rankDiff` in `srcStrides` and source dim - // sizes. - size_t dimsToDrop = 0; - int rankDiff = srcType.getRank() - targetType.getRank(); - for (int64_t i = 0, e = targetType.getRank(); i < e; ++i) { - // Check that the inner dim size is 1 for both memref/tensor type and - // vector slice. It can be folded only if they are 1 and the stride is 1. - int dim = targetType.getRank() - i - 1; - if (srcStrides[dim + rankDiff] == 1 && - srcType.getDimSize(dim + rankDiff) == 1 && - targetType.getDimSize(dim) == 1) { - dimsToDrop++; - } else { - break; - } - } + FailureOr maybeDimsToDrop = + getTransferFoldableInnerUnitDims(srcType, targetType); + if (failed(maybeDimsToDrop)) + return failure(); + + size_t dimsToDrop = maybeDimsToDrop.value(); if (dimsToDrop == 0) return failure(); @@ -1207,35 +1260,9 @@ class DropInnerMostUnitDims : public OpRewritePattern { VectorType::get(targetType.getShape().drop_back(dimsToDrop), targetType.getElementType()); - MemRefType resultMemrefType; - MemRefLayoutAttrInterface layout = srcType.getLayout(); - if (isa(layout) && layout.isIdentity()) { - resultMemrefType = MemRefType::get( - srcType.getShape().drop_back(dimsToDrop), srcType.getElementType(), - nullptr, srcType.getMemorySpace()); - } else { - MemRefLayoutAttrInterface updatedLayout; - if (auto strided = dyn_cast(layout)) { - auto strides = - llvm::to_vector(strided.getStrides().drop_back(dimsToDrop)); - updatedLayout = StridedLayoutAttr::get(strided.getContext(), - strided.getOffset(), strides); - } else { - AffineMap map = srcType.getLayout().getAffineMap(); - int numSymbols = map.getNumSymbols(); - for (size_t i = 0; i < dimsToDrop; ++i) { - int dim = srcType.getRank() - i - 1; - map = map.replace(rewriter.getAffineDimExpr(dim), - rewriter.getAffineConstantExpr(0), - map.getNumDims() - 1, numSymbols); - } - } - resultMemrefType = MemRefType::get( - srcType.getShape().drop_back(dimsToDrop), srcType.getElementType(), - updatedLayout, srcType.getMemorySpace()); - } - auto loc = readOp.getLoc(); + MemRefType resultMemrefType = + getMemRefTypeWithDroppingInnerDims(rewriter, srcType, dimsToDrop); SmallVector offsets(srcType.getRank(), 0); SmallVector strides(srcType.getRank(), 1); @@ -1261,6 +1288,88 @@ class DropInnerMostUnitDims : public OpRewritePattern { } }; +/// Drop inner most contiguous unit dimensions from transfer_write operand. 
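+/// This mirrors the transfer_read pattern above: the destination memref is
+/// rank-reduced with a memref.subview and the vector operand is shape_cast
+/// to the matching rank.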
+/// E.g., +/// vector.transfer_write %arg1, %arg0[%c0, %arg2, %c0, %c0, %c0] +/// {in_bounds = [true, true, true, true, true]} +/// : vector<1x16x16x1x1xf32>, memref<1x512x16x1x1xf32> +/// +/// will be replaced with +/// +/// %subview = memref.subview %arg0 +/// [0, 0, 0, 0, 0] [1, 512, 16, 1, 1] [1, 1, 1, 1, 1] +/// : memref<1x512x16x1x1xf32> to memref<1x512x16xf32> +/// %0 = vector.shape_cast %arg1 : vector<1x16x16x1x1xf32> +/// to vector<1x16x16xf32> +/// vector.transfer_write %0, %subview[%c0, %arg2, %c0] +/// {in_bounds = [true, true, true]} +/// : vector<1x16x16xf32>, memref<1x512x16xf32> +class DropInnerMostUnitDimsTransferWrite + : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(vector::TransferWriteOp writeOp, + PatternRewriter &rewriter) const override { + // TODO: support 0-d corner case. + if (writeOp.getTransferRank() == 0) + return failure(); + + // TODO: support mask. + if (writeOp.getMask()) + return failure(); + + auto srcType = dyn_cast(writeOp.getSource().getType()); + if (!srcType || !srcType.hasStaticShape()) + return failure(); + + if (!writeOp.getPermutationMap().isMinorIdentity()) + return failure(); + + auto targetType = writeOp.getVectorType(); + if (targetType.getRank() <= 1) + return failure(); + + FailureOr maybeDimsToDrop = + getTransferFoldableInnerUnitDims(srcType, targetType); + if (failed(maybeDimsToDrop)) + return failure(); + + size_t dimsToDrop = maybeDimsToDrop.value(); + if (dimsToDrop == 0) + return failure(); + + auto resultTargetVecType = + VectorType::get(targetType.getShape().drop_back(dimsToDrop), + targetType.getElementType()); + + MemRefType resultMemrefType = + getMemRefTypeWithDroppingInnerDims(rewriter, srcType, dimsToDrop); + SmallVector offsets(srcType.getRank(), 0); + SmallVector strides(srcType.getRank(), 1); + ArrayAttr inBoundsAttr = + writeOp.getInBounds() + ? rewriter.getArrayAttr( + writeOp.getInBoundsAttr().getValue().drop_back(dimsToDrop)) + : ArrayAttr(); + + Location loc = writeOp.getLoc(); + Value rankedReducedView = rewriter.create( + loc, resultMemrefType, writeOp.getSource(), offsets, srcType.getShape(), + strides); + auto permMap = getTransferMinorIdentityMap( + cast(rankedReducedView.getType()), resultTargetVecType); + + auto shapeCast = rewriter.createOrFold( + loc, resultTargetVecType, writeOp.getVector()); + rewriter.replaceOpWithNewOp( + writeOp, shapeCast, rankedReducedView, + writeOp.getIndices().drop_back(dimsToDrop), AffineMapAttr::get(permMap), + // TODO: support mask. + /*mask=*/Value(), inBoundsAttr); + return success(); + } +}; + /// Canonicalization of a `vector.contraction %a, %b, %c` with row-major matmul /// semantics to a contraction suitable for MMT (matrix matrix multiplication /// with the RHS transposed) lowering. 
@@ -1696,7 +1805,9 @@ void mlir::vector::populateVectorReductionToContractPatterns( void mlir::vector:: populateVectorTransferCollapseInnerMostContiguousDimsPatterns( RewritePatternSet &patterns, PatternBenefit benefit) { - patterns.add(patterns.getContext(), benefit); + patterns.add(patterns.getContext(), + benefit); } void mlir::vector::populateSinkVectorBroadcastPatterns( diff --git a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir index 0d2743b9fe2e7..d6d69c8af8850 100644 --- a/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir +++ b/mlir/test/Dialect/Vector/vector-transfer-collapse-inner-most-dims.mlir @@ -76,3 +76,56 @@ func.func @contiguous_inner_most_dim_out_of_bounds_2d(%arg0: memref<1x1xf32>) -> // CHECK-NOT: memref.subview // CHECK: %[[READ:.+]] = vector.transfer_read %[[SRC]] // CHECK: return %[[READ]] : vector<4x8xf32> + +// ----- + +func.func @drop_two_inner_most_dim_for_transfer_write(%arg0: memref<1x512x16x1x1xf32>, %arg1: vector<1x16x16x1x1xf32>, %arg2: index) { + %c0 = arith.constant 0 : index + vector.transfer_write %arg1, %arg0[%c0, %arg2, %c0, %c0, %c0] + {in_bounds = [true, true, true, true, true]} + : vector<1x16x16x1x1xf32>, memref<1x512x16x1x1xf32> + return +} +// CHECK: func.func @drop_two_inner_most_dim_for_transfer_write +// CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[VEC:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[IDX:[a-zA-Z0-9]+]] +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[DEST]] +// CHECK-SAME: memref<1x512x16x1x1xf32> to memref<1x512x16xf32> +// CHECK: %[[CAST:.+]] = vector.shape_cast %[[VEC]] : vector<1x16x16x1x1xf32> to vector<1x16x16xf32> +// CHECK: vector.transfer_write %[[CAST]], %[[SUBVIEW]] +// CHECK-SAME: [%[[C0]], %[[IDX]], %[[C0]]] + +// ----- + +func.func @drop_inner_most_dim_for_transfer_write(%arg0: memref<1x512x16x1xf32, strided<[8192, 16, 1, 1], offset: ?>>, %arg1: vector<1x16x16x1xf32>, %arg2: index) { + %c0 = arith.constant 0 : index + vector.transfer_write %arg1, %arg0[%c0, %arg2, %c0, %c0] + {in_bounds = [true, true, true, true]} + : vector<1x16x16x1xf32>, memref<1x512x16x1xf32, strided<[8192, 16, 1, 1], offset: ?>> + return +} +// CHECK: func.func @drop_inner_most_dim_for_transfer_write +// CHECK-SAME: %[[DEST:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[VEC:[a-zA-Z0-9]+]] +// CHECK-SAME: %[[IDX:[a-zA-Z0-9]+]] +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK: %[[SUBVIEW:.+]] = memref.subview %[[DEST]] +// CHECK-SAME: memref<1x512x16x1xf32, strided<[8192, 16, 1, 1], offset: ?>> to memref<1x512x16xf32, strided<[8192, 16, 1], offset: ?>> +// CHECK: %[[CAST:.+]] = vector.shape_cast %[[VEC]] : vector<1x16x16x1xf32> to vector<1x16x16xf32> +// CHECK: vector.transfer_write %[[CAST]], %[[SUBVIEW]] +// CHECK-SAME: [%[[C0]], %[[IDX]], %[[C0]]] + +// ----- + +func.func @non_unit_strides(%arg0: memref<512x16x1xf32, strided<[8192, 16, 4], offset: ?>>, %arg1: vector<16x16x1xf32>, %arg2: index) { + %c0 = arith.constant 0 : index + vector.transfer_write %arg1, %arg0[%arg2, %c0, %c0] + {in_bounds = [true, true, true]} + : vector<16x16x1xf32>, memref<512x16x1xf32, strided<[8192, 16, 4], offset: ?>> + return +} +// The inner most unit dims can not be dropped if the strides are not ones. 
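+// With stride 4 the trailing unit dim is not contiguous with the dims before
+// it, so getTransferFoldableInnerUnitDims reports zero foldable dims and the
+// pattern bails out without rewriting the op.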
+// CHECK: func.func @non_unit_strides
+// CHECK-NOT: memref.subview

From 228aecbcf106a50c30b1f8f1915d61850860cbcd Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Fri, 19 Jan 2024 11:22:43 +0000
Subject: [PATCH 110/843] [llvm][AArch64] Copy all operands when expanding BLR_BTI bundle (#78267)

Fixes #77915

Previously I based the operand copying on expandCALL_RVMARKER but did not
understand it properly at the time. This led to me dropping the arguments
of the function being branched to.

This fixes that by copying all operands from the BLR_BTI to the BL/BLR
without skipping anything.

I've updated the existing test by adding function arguments.
---
 .../AArch64/AArch64ExpandPseudoInsts.cpp      |  6 +++--
 .../AArch64/blr-bti-preserves-operands.mir    | 24 +++++++++++++++++++
 .../AArch64/blr-bti-preserves-regmask.mir     | 23 ------------------
 3 files changed, 28 insertions(+), 25 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir
 delete mode 100644 llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir

diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index bb7f4d907ffd7..c45cb8eba8972 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -842,9 +842,11 @@ bool AArch64ExpandPseudo::expandCALL_BTI(MachineBasicBlock &MBB,
   unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR;
   MachineInstr *Call =
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr();
-  Call->addOperand(CallTarget);
+
+  for (const MachineOperand &MO : MI.operands())
+    Call->addOperand(MO);
+
   Call->setCFIType(*MBB.getParent(), MI.getCFIType());
-  Call->copyImplicitOps(*MBB.getParent(), MI);
 
   MachineInstr *BTI =
       BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::HINT))
diff --git a/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir b/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir
new file mode 100644
index 0000000000000..3b2f10348bd77
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir
@@ -0,0 +1,24 @@
+# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=aarch64-expand-pseudo -o - %s | FileCheck %s
+
+# When expanding a BLR_BTI, we should copy all the operands to the branch in the
+# bundle. Otherwise we could end up using a register after the BL which was
+# clobbered by the function that was called, or overwriting an argument to that
+# function before we make the call.
+# CHECK: BUNDLE implicit-def $lr, implicit-def $w30, implicit-def $sp, implicit-def $wsp, implicit $x0, implicit $w1, implicit $sp {
+# CHECK: BL @_setjmp, $x0, $w1, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp
+# CHECK: HINT 36
+# CHECK: }
+
+--- |
+  define void @a() {
+    ret void
+  }
+
+  declare void @_setjmp(...)
+...
+---
+name: a
+body: |
+  bb.0:
+    BLR_BTI @_setjmp, $x0, $w1, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp
+...
diff --git a/llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir b/llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir
deleted file mode 100644
index 91652c6e20c8f..0000000000000
--- a/llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir
+++ /dev/null
@@ -1,23 +0,0 @@
-# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=aarch64-expand-pseudo -o - %s | FileCheck %s
-
-# When expanding a BLR_BTI, we should keep the regmask that was attached to it.
-# Otherwise we could end up using a register after the BL which was clobbered by -# the function that was called. -# CHECK: BUNDLE implicit-def $lr, implicit-def $w30, implicit-def $sp, implicit-def $wsp, implicit $sp { -# CHECK: BL @_setjmp, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp -# CHECK: HINT 36 -# CHECK: } - ---- | - define void @a() { - ret void - } - - declare void @_setjmp(...) -... ---- -name: a -body: | - bb.0: - BLR_BTI @_setjmp, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp -... From 35121add2e3d5748e2d6de517632e35ce08f41c2 Mon Sep 17 00:00:00 2001 From: Benjamin Chetioui Date: Fri, 19 Jan 2024 11:32:19 +0000 Subject: [PATCH 111/843] [mlir][NFC] Remove unused variable. --- mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp index 9c734e8cc2ad1..12aa11e9e33f5 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorTransforms.cpp @@ -1190,7 +1190,6 @@ getTransferFoldableInnerUnitDims(MemRefType srcType, VectorType vectorType) { static MemRefType getMemRefTypeWithDroppingInnerDims(OpBuilder &builder, MemRefType srcType, size_t dimsToDrop) { - MemRefType resultMemrefType; MemRefLayoutAttrInterface layout = srcType.getLayout(); if (isa(layout) && layout.isIdentity()) { return MemRefType::get(srcType.getShape().drop_back(dimsToDrop), From 01ba627f431c3622a973c1cb5a593c101021f805 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Fri, 19 Jan 2024 12:38:22 +0100 Subject: [PATCH 112/843] [llvm-jitlink] Refactor GOT and stubs registration (NFC) (#78365) Add methods `registerGOTEntry()` and `registerStubEntry()` in `Session::FileInfo` to factor out generic code from the individual object type implementations. --- llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp | 35 +++----- llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp | 79 ++++++++----------- .../tools/llvm-jitlink/llvm-jitlink-macho.cpp | 30 ++----- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 30 +++++++ llvm/tools/llvm-jitlink/llvm-jitlink.h | 10 +++ 5 files changed, 89 insertions(+), 95 deletions(-) diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp index 283e655205d78..5271fdb556590 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-coff.cpp @@ -108,34 +108,17 @@ Error registerCOFFGraphInfo(Session &S, LinkGraph &G) { if (Sym->getAddress() > LastSym->getAddress()) LastSym = Sym; - if (isGOTSection) { - if (Sym->isSymbolZeroFill()) - return make_error("zero-fill atom in GOT section", - inconvertibleErrorCode()); - - // If this is a GOT symbol with size (i.e. not the GOT start symbol) - // then add it to the GOT entry info table. 
- if (Sym->getSize() != 0) { - if (auto TS = getCOFFGOTTarget(G, Sym->getBlock())) - FileInfo.GOTEntryInfos[TS->getName()] = { - Sym->getSymbolContent(), Sym->getAddress().getValue(), - Sym->getTargetFlags()}; - else - return TS.takeError(); + if (isGOTSection || isStubsSection) { + if (isGOTSection) { + // Skip the GOT start symbol + if (Sym->getSize() != 0) + if (Error E = FileInfo.registerGOTEntry(G, *Sym, getCOFFGOTTarget)) + return E; + } else { + if (Error E = FileInfo.registerStubEntry(G, *Sym, getCOFFStubTarget)) + return E; } SectionContainsContent = true; - } else if (isStubsSection) { - if (Sym->isSymbolZeroFill()) - return make_error("zero-fill atom in Stub section", - inconvertibleErrorCode()); - - if (auto TS = getCOFFStubTarget(G, Sym->getBlock())) - FileInfo.StubInfos[TS->getName()] = {Sym->getSymbolContent(), - Sym->getAddress().getValue(), - Sym->getTargetFlags()}; - else - return TS.takeError(); - SectionContainsContent = true; } if (Sym->hasName()) { diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp index a02468758b33f..c6b4218aad7af 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp @@ -72,13 +72,29 @@ static Expected getELFStubTarget(LinkGraph &G, Block &B) { return getELFGOTTarget(G, GOTSym.getBlock()); } -static Expected getELFAArch32StubTargetName(LinkGraph &G, - Block &B) { +static Expected getELFAArch32StubTarget(LinkGraph &G, Block &B) { auto E = getFirstRelocationEdge(G, B); if (!E) return E.takeError(); - Symbol &StubTarget = E->getTarget(); - return StubTarget.getName().str(); + return E->getTarget(); +} + +enum SectionType { GOT, Stubs, AArch32Stubs, Other }; + +static Error registerSymbol(LinkGraph &G, Symbol &Sym, Session::FileInfo &FI, + SectionType SecType) { + switch (SecType) { + case GOT: + if (Sym.getSize() == 0) + return Error::success(); // Skip the GOT start symbol + return FI.registerGOTEntry(G, Sym, getELFGOTTarget); + case Stubs: + return FI.registerStubEntry(G, Sym, getELFStubTarget); + case AArch32Stubs: + return FI.registerStubEntry(G, Sym, getELFAArch32StubTarget); + case Other: + return Error::success(); + } } namespace llvm { @@ -113,9 +129,16 @@ Error registerELFGraphInfo(Session &S, LinkGraph &G) { "\"", inconvertibleErrorCode()); - bool isGOTSection = isELFGOTSection(Sec); - bool isStubsSection = isELFStubsSection(Sec); - bool isAArch32StubsSection = isELFAArch32StubsSection(Sec); + SectionType SecType; + if (isELFGOTSection(Sec)) { + SecType = GOT; + } else if (isELFStubsSection(Sec)) { + SecType = Stubs; + } else if (isELFAArch32StubsSection(Sec)) { + SecType = AArch32Stubs; + } else { + SecType = Other; + } bool SectionContainsContent = false; bool SectionContainsZeroFill = false; @@ -128,45 +151,9 @@ Error registerELFGraphInfo(Session &S, LinkGraph &G) { if (Sym->getAddress() > LastSym->getAddress()) LastSym = Sym; - if (isGOTSection) { - if (Sym->isSymbolZeroFill()) - return make_error("zero-fill atom in GOT section", - inconvertibleErrorCode()); - - // If this is a GOT symbol with size (i.e. not the GOT start symbol) - // then add it to the GOT entry info table. 
- if (Sym->getSize() != 0) { - if (auto TS = getELFGOTTarget(G, Sym->getBlock())) - FileInfo.GOTEntryInfos[TS->getName()] = { - Sym->getSymbolContent(), Sym->getAddress().getValue(), - Sym->getTargetFlags()}; - else - return TS.takeError(); - } - SectionContainsContent = true; - } else if (isStubsSection) { - if (Sym->isSymbolZeroFill()) - return make_error("zero-fill atom in Stub section", - inconvertibleErrorCode()); - - if (auto TS = getELFStubTarget(G, Sym->getBlock())) - FileInfo.StubInfos[TS->getName()] = {Sym->getSymbolContent(), - Sym->getAddress().getValue(), - Sym->getTargetFlags()}; - else - return TS.takeError(); - SectionContainsContent = true; - } else if (isAArch32StubsSection) { - if (Sym->isSymbolZeroFill()) - return make_error("zero-fill atom in Stub section", - inconvertibleErrorCode()); - - if (auto Name = getELFAArch32StubTargetName(G, Sym->getBlock())) - FileInfo.StubInfos[*Name] = {Sym->getSymbolContent(), - Sym->getAddress().getValue(), - Sym->getTargetFlags()}; - else - return Name.takeError(); + if (SecType != Other) { + if (Error Err = registerSymbol(G, *Sym, FileInfo, SecType)) + return Err; SectionContainsContent = true; } diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp index 7dcadd94c2365..2c60c802293a1 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-macho.cpp @@ -111,29 +111,13 @@ Error registerMachOGraphInfo(Session &S, LinkGraph &G) { FirstSym = Sym; if (Sym->getAddress() > LastSym->getAddress()) LastSym = Sym; - if (isGOTSection) { - if (Sym->isSymbolZeroFill()) - return make_error("zero-fill atom in GOT section", - inconvertibleErrorCode()); - - if (auto TS = getMachOGOTTarget(G, Sym->getBlock())) - FileInfo.GOTEntryInfos[TS->getName()] = {Sym->getSymbolContent(), - Sym->getAddress().getValue(), - Sym->getTargetFlags()}; - else - return TS.takeError(); - SectionContainsContent = true; - } else if (isStubsSection) { - if (Sym->isSymbolZeroFill()) - return make_error("zero-fill atom in Stub section", - inconvertibleErrorCode()); - - if (auto TS = getMachOStubTarget(G, Sym->getBlock())) - FileInfo.StubInfos[TS->getName()] = {Sym->getSymbolContent(), - Sym->getAddress().getValue(), - Sym->getTargetFlags()}; - else - return TS.takeError(); + if (isGOTSection || isStubsSection) { + Error Err = + isGOTSection + ? 
FileInfo.registerGOTEntry(G, *Sym, getMachOGOTTarget) + : FileInfo.registerStubEntry(G, *Sym, getMachOStubTarget); + if (Err) + return Err; SectionContainsContent = true; } else if (Sym->hasName()) { if (Sym->isSymbolZeroFill()) { diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index aa032c97485fb..8c18610313ce8 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1183,6 +1183,36 @@ Error Session::loadAndLinkDynamicLibrary(JITDylib &JD, StringRef LibPath) { return Error::success(); } +Error Session::FileInfo::registerGOTEntry( + LinkGraph &G, Symbol &Sym, GetSymbolTargetFunction GetSymbolTarget) { + if (Sym.isSymbolZeroFill()) + return make_error("Unexpected zero-fill symbol in section " + + Sym.getBlock().getSection().getName(), + inconvertibleErrorCode()); + auto TS = GetSymbolTarget(G, Sym.getBlock()); + if (!TS) + return TS.takeError(); + GOTEntryInfos[TS->getName()] = {Sym.getSymbolContent(), + Sym.getAddress().getValue(), + Sym.getTargetFlags()}; + return Error::success(); +} + +Error Session::FileInfo::registerStubEntry( + LinkGraph &G, Symbol &Sym, GetSymbolTargetFunction GetSymbolTarget) { + if (Sym.isSymbolZeroFill()) + return make_error("Unexpected zero-fill symbol in section " + + Sym.getBlock().getSection().getName(), + inconvertibleErrorCode()); + auto TS = GetSymbolTarget(G, Sym.getBlock()); + if (!TS) + return TS.takeError(); + StubInfos[TS->getName()] = {Sym.getSymbolContent(), + Sym.getAddress().getValue(), + Sym.getTargetFlags()}; + return Error::success(); +} + Expected Session::findFileInfo(StringRef FileName) { auto FileInfoItr = FileInfos.find(FileName); if (FileInfoItr == FileInfos.end()) diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.h b/llvm/tools/llvm-jitlink/llvm-jitlink.h index 3ff406b7b82df..93a00266b1504 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.h +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.h @@ -51,6 +51,16 @@ struct Session { StringMap SectionInfos; StringMap StubInfos; StringMap GOTEntryInfos; + + using Symbol = jitlink::Symbol; + using LinkGraph = jitlink::LinkGraph; + using GetSymbolTargetFunction = + unique_function(LinkGraph &G, jitlink::Block &)>; + + Error registerGOTEntry(LinkGraph &G, Symbol &Sym, + GetSymbolTargetFunction GetSymbolTarget); + Error registerStubEntry(LinkGraph &G, Symbol &Sym, + GetSymbolTargetFunction GetSymbolTarget); }; using DynLibJDMap = std::map; From 74f6ae9f245f2ae9e832ba297f451e04e008ada8 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 19 Jan 2024 12:00:16 +0000 Subject: [PATCH 113/843] [LTO] Require asserts for discard-value-names.ll test. The test requires asserts, as it depends on the default value for -lto-discard-value-names at the moment. --- llvm/test/tools/lto/discard-value-names.ll | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llvm/test/tools/lto/discard-value-names.ll b/llvm/test/tools/lto/discard-value-names.ll index 7e2c82119028a..723b0701ae22c 100644 --- a/llvm/test/tools/lto/discard-value-names.ll +++ b/llvm/test/tools/lto/discard-value-names.ll @@ -5,7 +5,12 @@ ; RUN: %ld64 -lto_library %llvmshlibdir/libLTO.dylib -dylib -arch x86_64 -macos_version_min 10.10.0 -o %t.dylib %t.o -save-temps -undefined dynamic_lookup -exported_symbol _bar -lSystem -mllvm -lto-discard-value-names=false ; RUN: llvm-dis %t.dylib.lto.opt.bc -o - | FileCheck --check-prefix=KEEP %s +; The test requires asserts, as it depends on the default value for +; -lto-discard-value-names at the moment. 
; FIXME: -lto-discard-value-names is ignored at the moment. + +; REQUIRES: asserts + ; DISCARD: %cmp.i = icmp ; DISCARD: %add = add i32 From 879cbe06edf915db2ad3dfee19215d72d5ce76d5 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Jan 2024 12:01:31 +0000 Subject: [PATCH 114/843] [AMDGPU] Fix predicates for BUFFER_ATOMIC_CSUB pattern (#78701) Use OtherPredicates to avoid interfering with other uses of SubtargetPredicate for GFX12. --- llvm/lib/Target/AMDGPU/BUFInstructions.td | 2 +- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 732f451e6b96a..c3e5be8334a69 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -1712,7 +1712,7 @@ defm : SIBufferAtomicPat<"SIbuffer_atomic_xor", i64, "BUFFER_ATOMIC_XOR_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_inc", i64, "BUFFER_ATOMIC_INC_X2">; defm : SIBufferAtomicPat<"SIbuffer_atomic_dec", i64, "BUFFER_ATOMIC_DEC_X2">; -let SubtargetPredicate = HasAtomicCSubNoRtnInsts in +let OtherPredicates = [HasAtomicCSubNoRtnInsts] in defm : SIBufferAtomicPat<"SIbuffer_atomic_csub", i32, "BUFFER_ATOMIC_CSUB", ["noret"]>; let SubtargetPredicate = isGFX12Plus in { diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll index 4f1be11309eb0..2b0584d39a3be 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.csub.ll @@ -16,7 +16,7 @@ main_body: ; GCN-LABEL: {{^}}buffer_atomic_csub_no_rtn: ; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen -; GFX12PLUS: buffer_atomic_csub_u32 v0, v1, s[0:3], null idxen +; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen define amdgpu_ps void @buffer_atomic_csub_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 { main_body: %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 0, i1 0) @@ -34,7 +34,7 @@ main_body: ; GCN-LABEL: {{^}}buffer_atomic_csub_off4_slc_no_rtn: ; PREGFX12: buffer_atomic_csub v0, v1, s[0:3], 0 idxen offset:4 slc -; GFX12PLUS: buffer_atomic_csub_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT +; GFX12PLUS: buffer_atomic_sub_clamp_u32 v0, v1, s[0:3], null idxen offset:4 th:TH_ATOMIC_NT define amdgpu_ps void @buffer_atomic_csub_off4_slc_no_rtn(<4 x i32> inreg %rsrc, i32 %data, i32 %vindex) #0 { main_body: %ret = call i32 @llvm.amdgcn.buffer.atomic.csub(i32 %data, <4 x i32> %rsrc, i32 %vindex, i32 4, i1 1) From 955417ade2648c2b1a4e5f0be697f61570590a88 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Fri, 19 Jan 2024 12:06:30 +0000 Subject: [PATCH 115/843] Revert "[llvm][AArch64] Copy all operands when expanding BLR_BTI bundle (#78267)" This reverts commit 228aecbcf106a50c30b1f8f1915d61850860cbcd. 
Failing expensive checks: https://lab.llvm.org/buildbot/#/builders/16/builds/59798 --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 6 ++--- .../AArch64/blr-bti-preserves-operands.mir | 24 ------------------- .../AArch64/blr-bti-preserves-regmask.mir | 23 ++++++++++++++++++ 3 files changed, 25 insertions(+), 28 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir create mode 100644 llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index c45cb8eba8972..bb7f4d907ffd7 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -842,11 +842,9 @@ bool AArch64ExpandPseudo::expandCALL_BTI(MachineBasicBlock &MBB, unsigned Opc = CallTarget.isGlobal() ? AArch64::BL : AArch64::BLR; MachineInstr *Call = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr(); - - for (const MachineOperand &MO : MI.operands()) - Call->addOperand(MO); - + Call->addOperand(CallTarget); Call->setCFIType(*MBB.getParent(), MI.getCFIType()); + Call->copyImplicitOps(*MBB.getParent(), MI); MachineInstr *BTI = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::HINT)) diff --git a/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir b/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir deleted file mode 100644 index 3b2f10348bd77..0000000000000 --- a/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir +++ /dev/null @@ -1,24 +0,0 @@ -# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=aarch64-expand-pseudo -o - %s | FileCheck %s - -# When expanding a BLR_BTI, we should copy all the operands to the branch in the -# bundle. Otherwise we could end up using a register after the BL which was -# clobbered by the function that was called, or overwriting an argument to that -# function before we make the call. -# CHECK: BUNDLE implicit-def $lr, implicit-def $w30, implicit-def $sp, implicit-def $wsp, implicit $x0, implicit $w1, implicit $sp { -# CHECK: BL @_setjmp, $x0, $w1, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp -# CHECK: HINT 36 -# CHECK: } - ---- | - define void @a() { - ret void - } - - declare void @_setjmp(...) -... ---- -name: a -body: | - bb.0: - BLR_BTI @_setjmp, $x0, $w1, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp -... diff --git a/llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir b/llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir new file mode 100644 index 0000000000000..91652c6e20c8f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir @@ -0,0 +1,23 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=aarch64-expand-pseudo -o - %s | FileCheck %s + +# When expanding a BLR_BTI, we should keep the regmask that was attached to it. +# Otherwise we could end up using a register after the BL which was clobbered by +# the function that was called. +# CHECK: BUNDLE implicit-def $lr, implicit-def $w30, implicit-def $sp, implicit-def $wsp, implicit $sp { +# CHECK: BL @_setjmp, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp +# CHECK: HINT 36 +# CHECK: } + +--- | + define void @a() { + ret void + } + + declare void @_setjmp(...) +... +--- +name: a +body: | + bb.0: + BLR_BTI @_setjmp, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp +... 
From 80ccc72ec729623f6898d50f75447c3057557a8f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Jan 2024 12:19:29 +0000 Subject: [PATCH 116/843] [AMDGPU] Remove GFX12 encoding hack (#78702) This is no longer needed now that we have implemented GFX12 encoding for all instructions. --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 48fdb803e9d73..5a9401103da82 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9136,12 +9136,6 @@ int SIInstrInfo::pseudoToMCOpcode(int Opcode) const { int MCOp = AMDGPU::getMCOpcode(Opcode, Gen); - // TODO-GFX12: Remove this. - // Hack to allow some GFX12 codegen tests to run before all the encodings are - // implemented. - if (MCOp == (uint16_t)-1 && Gen == SIEncodingFamily::GFX12) - MCOp = AMDGPU::getMCOpcode(Opcode, SIEncodingFamily::GFX11); - // -1 means that Opcode is already a native instruction. if (MCOp == -1) return Opcode; From b4f24be7ef9bb08f6c3d6e96c998a942d95e0038 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Fri, 19 Jan 2024 13:22:02 +0100 Subject: [PATCH 117/843] [mlir][bufferization] Simplify helper `potentiallyAliasesMemref` (#78690) This commit simplifies a helper function in the ownership-based buffer deallocation pass. Fixes a potential double-free (depending on the scheduling of patterns). --- .../BufferDeallocationSimplification.cpp | 26 ++++++++----------- .../buffer-deallocation-simplification.mlir | 11 ++++++++ 2 files changed, 22 insertions(+), 15 deletions(-) diff --git a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp index 4d1e21c2e406c..e30779868b475 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/BufferDeallocationSimplification.cpp @@ -77,18 +77,12 @@ static bool distinctAllocAndBlockArgument(Value v1, Value v2) { return areDistinct(v1Base, v2Base) || areDistinct(v2Base, v1Base); } -/// Checks if `memref` may or must alias a MemRef in `otherList`. It is often a -/// requirement of optimization patterns that there cannot be any aliasing -/// memref in order to perform the desired simplification. The `allowSelfAlias` -/// argument indicates whether `memref` may be present in `otherList` which -/// makes this helper function applicable to situations where we already know -/// that `memref` is in the list but also when we don't want it in the list. +/// Checks if `memref` may potentially alias a MemRef in `otherList`. It is +/// often a requirement of optimization patterns that there cannot be any +/// aliasing memref in order to perform the desired simplification. 
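+/// Callers that previously relied on the removed `allowSelfAlias` flag must
+/// now exclude the queried memref from `otherList` themselves, as the split
+/// pattern below does by erasing the current entry before the check.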
static bool potentiallyAliasesMemref(AliasAnalysis &analysis, - ValueRange otherList, Value memref, - bool allowSelfAlias) { + ValueRange otherList, Value memref) { for (auto other : otherList) { - if (allowSelfAlias && other == memref) - continue; if (distinctAllocAndBlockArgument(other, memref)) continue; if (!analysis.alias(other, memref).isNo()) @@ -243,7 +237,7 @@ struct RemoveRetainedMemrefsGuaranteedToNotAlias for (auto retainedMemref : deallocOp.getRetained()) { if (potentiallyAliasesMemref(aliasAnalysis, deallocOp.getMemrefs(), - retainedMemref, false)) { + retainedMemref)) { newRetainedMemrefs.push_back(retainedMemref); replacements.push_back({}); continue; @@ -314,11 +308,13 @@ struct SplitDeallocWhenNotAliasingAnyOther SmallVector remainingMemrefs, remainingConditions; SmallVector> updatedConditions; - for (auto [memref, cond] : - llvm::zip(deallocOp.getMemrefs(), deallocOp.getConditions())) { + for (int64_t i = 0, e = deallocOp.getMemrefs().size(); i < e; ++i) { + Value memref = deallocOp.getMemrefs()[i]; + Value cond = deallocOp.getConditions()[i]; + SmallVector otherMemrefs(deallocOp.getMemrefs()); + otherMemrefs.erase(otherMemrefs.begin() + i); // Check if `memref` can split off into a separate bufferization.dealloc. - if (potentiallyAliasesMemref(aliasAnalysis, deallocOp.getMemrefs(), - memref, true)) { + if (potentiallyAliasesMemref(aliasAnalysis, otherMemrefs, memref)) { // `memref` alias with other memrefs, do not split off. remainingMemrefs.push_back(memref); remainingConditions.push_back(cond); diff --git a/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation-simplification.mlir b/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation-simplification.mlir index e192e9870becd..eee69acbe821b 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation-simplification.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/buffer-deallocation-simplification.mlir @@ -160,3 +160,14 @@ func.func @alloc_and_bbarg(%arg0: memref<5xf32>, %arg1: index, %arg2: index, %ar // CHECK: bufferization.dealloc (%[[view]] : memref) // CHECK-NOT: retain // CHECK: scf.yield %[[alloc]], %[[true]] + +// ----- + +func.func @duplicate_memref(%arg0: memref<5xf32>, %arg1: memref<6xf32>, %c: i1) -> i1 { + %0 = bufferization.dealloc (%arg0, %arg0 : memref<5xf32>, memref<5xf32>) if (%c, %c) retain (%arg1 : memref<6xf32>) + return %0 : i1 +} + +// CHECK-LABEL: func @duplicate_memref( +// CHECK: %[[r:.*]] = bufferization.dealloc (%{{.*}} : memref<5xf32>) if (%{{.*}}) retain (%{{.*}} : memref<6xf32>) +// CHECK: return %[[r]] From 24e5229230786b650a24d70bf7b10611ceb910fb Mon Sep 17 00:00:00 2001 From: Daniil Dudkin Date: Fri, 19 Jan 2024 15:57:02 +0300 Subject: [PATCH 118/843] [clang-apply-replacements] Deduplicate Implementation of `collectReplacementsFromDirectory` (NFC) (#78630) * Convert `collectReplacementsFromDirectory` into a function template. * Employ explicit specialization to maintain implementation in the source file. * Utilize the function template in the source file to eliminate code duplication. * Update the documentation for the function. 
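
The shape of the change, sketched with abbreviated parameter lists (the real
declarations in ApplyReplacements.h also take the collected file list and a
DiagnosticsEngine):

  // Header: the deleted primary template makes any instantiation other than
  // the two declared specializations a compile-time error.
  template <typename TranslationUnits>
  std::error_code collectReplacementsFromDirectory(llvm::StringRef Directory,
                                                   TranslationUnits &TUs) = delete;
  template <>
  std::error_code collectReplacementsFromDirectory(llvm::StringRef Directory,
                                                   TUReplacements &TUs);
  template <>
  std::error_code collectReplacementsFromDirectory(llvm::StringRef Directory,
                                                   TUDiagnostics &TUs);

Both specializations are defined in the source file and forward to a single
shared function template in a detail namespace, so the YAML-walking logic
exists only once.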
--- .../Tooling/ApplyReplacements.h | 14 ++++- .../lib/Tooling/ApplyReplacements.cpp | 59 ++++++------------- 2 files changed, 28 insertions(+), 45 deletions(-) diff --git a/clang-tools-extra/clang-apply-replacements/include/clang-apply-replacements/Tooling/ApplyReplacements.h b/clang-tools-extra/clang-apply-replacements/include/clang-apply-replacements/Tooling/ApplyReplacements.h index 1f0d873719140..7a8408bcdff8f 100644 --- a/clang-tools-extra/clang-apply-replacements/include/clang-apply-replacements/Tooling/ApplyReplacements.h +++ b/clang-tools-extra/clang-apply-replacements/include/clang-apply-replacements/Tooling/ApplyReplacements.h @@ -51,19 +51,27 @@ using FileToChangesMap = /// Directories starting with '.' are ignored during traversal. /// /// \param[in] Directory Directory to begin search for serialized -/// TranslationUnitReplacements. +/// TranslationUnitReplacements or TranslationUnitDiagnostics. /// \param[out] TUs Collection of all found and deserialized /// TranslationUnitReplacements or TranslationUnitDiagnostics. -/// \param[out] TUFiles Collection of all TranslationUnitReplacement files -/// found in \c Directory. +/// \param[out] TUFiles Collection of all TranslationUnitReplacement or +/// TranslationUnitDiagnostics files found in \c Directory. /// \param[in] Diagnostics DiagnosticsEngine used for error output. /// /// \returns An error_code indicating success or failure in navigating the /// directory structure. +template +std::error_code collectReplacementsFromDirectory( + const llvm::StringRef Directory, TranslationUnits &TUs, + TUReplacementFiles &TUFiles, + clang::DiagnosticsEngine &Diagnostics) = delete; + +template <> std::error_code collectReplacementsFromDirectory( const llvm::StringRef Directory, TUReplacements &TUs, TUReplacementFiles &TUFiles, clang::DiagnosticsEngine &Diagnostics); +template <> std::error_code collectReplacementsFromDirectory( const llvm::StringRef Directory, TUDiagnostics &TUs, TUReplacementFiles &TUFiles, clang::DiagnosticsEngine &Diagnostics); diff --git a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp index b490780f48529..87ed1b8797cb0 100644 --- a/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp +++ b/clang-tools-extra/clang-apply-replacements/lib/Tooling/ApplyReplacements.cpp @@ -38,8 +38,10 @@ static void eatDiagnostics(const SMDiagnostic &, void *) {} namespace clang { namespace replace { -std::error_code collectReplacementsFromDirectory( - const llvm::StringRef Directory, TUReplacements &TUs, +namespace detail { +template +static std::error_code collectReplacementsFromDirectory( + const llvm::StringRef Directory, TranslationUnits &TUs, TUReplacementFiles &TUFiles, clang::DiagnosticsEngine &Diagnostics) { using namespace llvm::sys::fs; using namespace llvm::sys::path; @@ -68,7 +70,7 @@ std::error_code collectReplacementsFromDirectory( } yaml::Input YIn(Out.get()->getBuffer(), nullptr, &eatDiagnostics); - tooling::TranslationUnitReplacements TU; + typename TranslationUnits::value_type TU; YIn >> TU; if (YIn.error()) { // File doesn't appear to be a header change description. Ignore it. 
@@ -81,49 +83,22 @@ std::error_code collectReplacementsFromDirectory( return ErrorCode; } +} // namespace detail +template <> std::error_code collectReplacementsFromDirectory( - const llvm::StringRef Directory, TUDiagnostics &TUs, + const llvm::StringRef Directory, TUReplacements &TUs, TUReplacementFiles &TUFiles, clang::DiagnosticsEngine &Diagnostics) { - using namespace llvm::sys::fs; - using namespace llvm::sys::path; - - std::error_code ErrorCode; - - for (recursive_directory_iterator I(Directory, ErrorCode), E; - I != E && !ErrorCode; I.increment(ErrorCode)) { - if (filename(I->path())[0] == '.') { - // Indicate not to descend into directories beginning with '.' - I.no_push(); - continue; - } - - if (extension(I->path()) != ".yaml") - continue; - - TUFiles.push_back(I->path()); - - ErrorOr> Out = - MemoryBuffer::getFile(I->path()); - if (std::error_code BufferError = Out.getError()) { - errs() << "Error reading " << I->path() << ": " << BufferError.message() - << "\n"; - continue; - } - - yaml::Input YIn(Out.get()->getBuffer(), nullptr, &eatDiagnostics); - tooling::TranslationUnitDiagnostics TU; - YIn >> TU; - if (YIn.error()) { - // File doesn't appear to be a header change description. Ignore it. - continue; - } - - // Only keep files that properly parse. - TUs.push_back(TU); - } + return detail::collectReplacementsFromDirectory(Directory, TUs, TUFiles, + Diagnostics); +} - return ErrorCode; +template <> +std::error_code collectReplacementsFromDirectory( + const llvm::StringRef Directory, TUDiagnostics &TUs, + TUReplacementFiles &TUFiles, clang::DiagnosticsEngine &Diagnostics) { + return detail::collectReplacementsFromDirectory(Directory, TUs, TUFiles, + Diagnostics); } /// Extract replacements from collected TranslationUnitReplacements and From 689da340edaa9f61dfae90e94ac69bfe189ee78f Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Fri, 19 Jan 2024 11:06:21 +0000 Subject: [PATCH 119/843] [NFC][LV] Test precommit for interleaved linear args --- .../LoopVectorize/AArch64/masked-call.ll | 594 +++++++++++++----- .../AArch64/uniform-args-call-variants.ll | 173 ++++- .../AArch64/vector-call-linear-args.ll | 208 +++++- 3 files changed, 787 insertions(+), 188 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll index d2ef5b2d14bc5..1e79c3e1e8dc2 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/masked-call.ll @@ -2,6 +2,7 @@ ; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=TFNONE ; RUN: opt < %s -passes=loop-vectorize,instsimplify,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TFCOMMON,TFALWAYS ; RUN: opt < %s -passes=loop-vectorize,instsimplify,simplifycfg -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-else-scalar-epilogue -S | FileCheck %s --check-prefixes=TFCOMMON,TFFALLBACK +; RUN: opt < %s -passes=loop-vectorize,instsimplify,simplifycfg -force-vector-interleave=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefixes=TFA_INTERLEAVE target triple = "aarch64-unknown-linux-gnu" @@ -19,17 +20,17 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] 
-; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFNONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; TFNONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector( [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 8 -; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; TFNONE-NEXT: [[TMP7:%.*]] = call @foo_vector( [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: store [[TMP7]], ptr [[TMP8]], align 8 +; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; TFNONE: middle.block: @@ -60,19 +61,19 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] ; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TFCOMMON-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; TFCOMMON-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TFCOMMON-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFCOMMON-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFCOMMON-NEXT: br label [[VECTOR_BODY:%.*]] ; TFCOMMON: vector.body: ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFCOMMON-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) -; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFCOMMON-NEXT: [[TMP8:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; TFCOMMON-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], 
ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFCOMMON-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; TFCOMMON-NEXT: [[TMP11:%.*]] = extractelement [[TMP10]], i32 0 @@ -80,6 +81,54 @@ define void @test_widen(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON: for.cond.cleanup: ; TFCOMMON-NEXT: ret void ; +; TFA_INTERLEAVE-LABEL: @test_widen( +; TFA_INTERLEAVE-NEXT: entry: +; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] +; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025) +; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] +; TFA_INTERLEAVE: vector.body: +; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP9]], i64 [[TMP11]] +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK2]], poison) +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i64 [[TMP17]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP13]], ptr [[TMP15]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP14]], ptr [[TMP18]], i32 8, [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = 
add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX_NEXT]], [[TMP20]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP21]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP23]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; TFA_INTERLEAVE: for.cond.cleanup: +; TFA_INTERLEAVE-NEXT: ret void +; entry: br label %for.body @@ -111,20 +160,20 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] -; TFNONE-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP10:%.*]] = mul i64 [[TMP9]], 2 +; TFNONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; TFNONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; TFNONE-NEXT: [[TMP5:%.*]] = icmp ugt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) -; TFNONE-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_LOAD]], [[TMP5]]) -; TFNONE-NEXT: [[TMP7:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFNONE-NEXT: [[PREDPHI:%.*]] = select [[TMP7]], zeroinitializer, [[TMP6]] -; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[PREDPHI]], ptr [[TMP8]], align 8 -; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]] +; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; TFNONE-NEXT: [[TMP7:%.*]] = icmp ugt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; TFNONE-NEXT: [[TMP8:%.*]] = call @foo_vector( [[WIDE_LOAD]], [[TMP7]]) +; TFNONE-NEXT: [[TMP9:%.*]] = xor [[TMP7]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFNONE-NEXT: [[PREDPHI:%.*]] = select [[TMP9]], zeroinitializer, [[TMP8]] +; TFNONE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: store [[PREDPHI]], ptr [[TMP10]], align 8 +; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; TFNONE-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; TFNONE-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; TFNONE: middle.block: @@ -161,25 +210,25 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] ; 
TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TFCOMMON-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() -; TFCOMMON-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 2 +; TFCOMMON-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFCOMMON-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFCOMMON-NEXT: br label [[VECTOR_BODY:%.*]] ; TFCOMMON: vector.body: ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) -; TFCOMMON-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; TFCOMMON-NEXT: [[TMP8:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP7]]) -; TFCOMMON-NEXT: [[TMP9:%.*]] = xor [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFCOMMON-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; TFCOMMON-NEXT: [[TMP12:%.*]] = or [[TMP7]], [[TMP10]] -; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], zeroinitializer, [[TMP8]] -; TFCOMMON-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP11]], i32 8, [[TMP12]]) -; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP14]] +; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFCOMMON-NEXT: [[TMP8:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; TFCOMMON-NEXT: [[TMP9:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], zeroinitializer +; TFCOMMON-NEXT: [[TMP10:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP9]]) +; TFCOMMON-NEXT: [[TMP11:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFCOMMON-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer +; TFCOMMON-NEXT: [[TMP13:%.*]] = or [[TMP9]], [[TMP12]] +; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP12]], zeroinitializer, [[TMP10]] +; TFCOMMON-NEXT: [[TMP14:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP14]], i32 8, [[TMP13]]) +; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFCOMMON-NEXT: [[TMP15:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; TFCOMMON-NEXT: [[TMP16:%.*]] = extractelement [[TMP15]], i32 0 @@ -187,6 +236,66 @@ define void @test_if_then(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON: for.cond.cleanup: ; 
TFCOMMON-NEXT: ret void ; +; TFA_INTERLEAVE-LABEL: @test_if_then( +; TFA_INTERLEAVE-NEXT: entry: +; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] +; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025) +; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] +; TFA_INTERLEAVE: vector.body: +; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT5:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 [[TMP11]] +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK2]], poison) +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = icmp ugt [[WIDE_MASKED_LOAD3]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP14]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP15]]) +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[TMP16]]) +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = xor [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP19]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP20]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = or [[TMP15]], [[TMP21]] +; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = or [[TMP16]], [[TMP22]] +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP21]], 
zeroinitializer, [[TMP17]] +; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP22]], zeroinitializer, [[TMP18]] +; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = mul i64 [[TMP26]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP25]], i64 [[TMP27]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP25]], i32 8, [[TMP23]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI4]], ptr [[TMP28]], i32 8, [[TMP24]]) +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT5]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = add i64 [[INDEX_NEXT]], [[TMP30]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP31]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[TMP32:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP33:%.*]] = extractelement [[TMP32]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP33]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; TFA_INTERLEAVE: for.cond.cleanup: +; TFA_INTERLEAVE-NEXT: ret void +; entry: br label %for.body @@ -229,21 +338,21 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] -; TFNONE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; TFNONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; TFNONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; TFNONE-NEXT: [[TMP5:%.*]] = icmp ugt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) -; TFNONE-NEXT: [[TMP6:%.*]] = xor [[TMP5]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFNONE-NEXT: [[TMP7:%.*]] = call @foo_vector( zeroinitializer, [[TMP6]]) -; TFNONE-NEXT: [[TMP8:%.*]] = call @foo_vector( [[WIDE_LOAD]], [[TMP5]]) -; TFNONE-NEXT: [[PREDPHI:%.*]] = select [[TMP6]], [[TMP7]], [[TMP8]] -; TFNONE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[PREDPHI]], ptr [[TMP9]], align 8 -; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; TFNONE-NEXT: [[TMP7:%.*]] = icmp ugt [[WIDE_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; TFNONE-NEXT: [[TMP8:%.*]] = xor [[TMP7]], shufflevector ( 
insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFNONE-NEXT: [[TMP9:%.*]] = call @foo_vector( zeroinitializer, [[TMP8]]) +; TFNONE-NEXT: [[TMP10:%.*]] = call @foo_vector( [[WIDE_LOAD]], [[TMP7]]) +; TFNONE-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[TMP9]], [[TMP10]] +; TFNONE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: store [[PREDPHI]], ptr [[TMP11]], align 8 +; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; TFNONE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; TFNONE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TFNONE: middle.block: @@ -283,26 +392,26 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] ; TFCOMMON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TFCOMMON-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TFCOMMON-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; TFCOMMON-NEXT: [[TMP15:%.*]] = mul i64 [[TMP14]], 2 +; TFCOMMON-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFCOMMON-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFCOMMON-NEXT: br label [[VECTOR_BODY:%.*]] ; TFCOMMON: vector.body: ; TFCOMMON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFCOMMON-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFCOMMON-NEXT: [[TMP6:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) -; TFCOMMON-NEXT: [[TMP7:%.*]] = xor [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; TFCOMMON-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer -; TFCOMMON-NEXT: [[TMP9:%.*]] = call @foo_vector( zeroinitializer, [[TMP8]]) -; TFCOMMON-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; TFCOMMON-NEXT: [[TMP11:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP10]]) -; TFCOMMON-NEXT: [[TMP13:%.*]] = or [[TMP8]], [[TMP10]] -; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP8]], [[TMP9]], [[TMP11]] -; TFCOMMON-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP12]], i32 8, [[TMP13]]) -; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP15]] +; TFCOMMON-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFCOMMON-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFCOMMON-NEXT: [[TMP8:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; TFCOMMON-NEXT: [[TMP9:%.*]] = xor [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFCOMMON-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer +; TFCOMMON-NEXT: [[TMP11:%.*]] = call @foo_vector( zeroinitializer, [[TMP10]]) +; 
TFCOMMON-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], zeroinitializer +; TFCOMMON-NEXT: [[TMP13:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP12]]) +; TFCOMMON-NEXT: [[TMP14:%.*]] = or [[TMP10]], [[TMP12]] +; TFCOMMON-NEXT: [[PREDPHI:%.*]] = select [[TMP10]], [[TMP11]], [[TMP13]] +; TFCOMMON-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFCOMMON-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP15]], i32 8, [[TMP14]]) +; TFCOMMON-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; TFCOMMON-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFCOMMON-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; TFCOMMON-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 @@ -310,6 +419,68 @@ define void @test_widen_if_then_else(ptr noalias %a, ptr readnone %b) #4 { ; TFCOMMON: for.cond.cleanup: ; TFCOMMON-NEXT: ret void ; +; TFA_INTERLEAVE-LABEL: @test_widen_if_then_else( +; TFA_INTERLEAVE-NEXT: entry: +; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] +; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025) +; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] +; TFA_INTERLEAVE: vector.body: +; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT5:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT6:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP9]], i64 [[TMP11]] +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK2]], poison) +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = icmp ugt [[WIDE_MASKED_LOAD]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = icmp ugt [[WIDE_MASKED_LOAD3]], shufflevector ( insertelement ( poison, i64 50, i64 0), poison, zeroinitializer) +; 
TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = xor [[TMP13]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = xor [[TMP14]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP15]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP16]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = call @foo_vector( zeroinitializer, [[TMP17]]) +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = call @foo_vector( zeroinitializer, [[TMP18]]) +; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP14]], zeroinitializer +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[TMP21]]) +; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[TMP22]]) +; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = or [[TMP17]], [[TMP21]] +; TFA_INTERLEAVE-NEXT: [[TMP26:%.*]] = or [[TMP18]], [[TMP22]] +; TFA_INTERLEAVE-NEXT: [[PREDPHI:%.*]] = select [[TMP17]], [[TMP19]], [[TMP23]] +; TFA_INTERLEAVE-NEXT: [[PREDPHI4:%.*]] = select [[TMP18]], [[TMP20]], [[TMP24]] +; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = getelementptr inbounds i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = mul i64 [[TMP28]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i64, ptr [[TMP27]], i64 [[TMP29]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI]], ptr [[TMP27]], i32 8, [[TMP25]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[PREDPHI4]], ptr [[TMP30]], i32 8, [[TMP26]]) +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT5]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP32:%.*]] = mul i64 [[TMP31]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP33:%.*]] = add i64 [[INDEX_NEXT]], [[TMP32]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT6]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP33]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[TMP34:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP35:%.*]] = extractelement [[TMP34]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP35]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; TFA_INTERLEAVE: for.cond.cleanup: +; TFA_INTERLEAVE-NEXT: ret void +; entry: br label %for.body @@ -355,17 +526,17 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] -; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFNONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; TFNONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFNONE-NEXT: 
[[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) -; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 8 -; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; TFNONE-NEXT: [[TMP7:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) +; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: store [[TMP7]], ptr [[TMP8]], align 8 +; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TFNONE: middle.block: @@ -413,17 +584,17 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; TFFALLBACK-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] ; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] -; TFFALLBACK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TFFALLBACK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFFALLBACK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]] ; TFFALLBACK: vector.body: ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFFALLBACK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; TFFALLBACK-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) -; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: store [[TMP5]], ptr [[TMP6]], align 8 -; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; TFFALLBACK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; TFFALLBACK-NEXT: [[TMP7:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) +; TFFALLBACK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: store [[TMP7]], ptr [[TMP8]], align 8 +; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; TFFALLBACK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; TFFALLBACK-NEXT: br i1 [[TMP9]], label [[SCALAR_PH]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; TFFALLBACK: scalar.ph: @@ -442,6 +613,22 @@ define void @test_widen_nomask(ptr noalias %a, ptr readnone %b) #4 { ; TFFALLBACK: for.cond.cleanup: ; TFFALLBACK-NEXT: ret void ; +; TFA_INTERLEAVE-LABEL: @test_widen_nomask( +; TFA_INTERLEAVE-NEXT: entry: +; TFA_INTERLEAVE-NEXT: br label [[FOR_BODY:%.*]] +; TFA_INTERLEAVE: for.body: +; TFA_INTERLEAVE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[GEP:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDVARS_IV]] +; TFA_INTERLEAVE-NEXT: [[LOAD:%.*]] = load i64, ptr [[GEP]], align 8 +; TFA_INTERLEAVE-NEXT: [[CALL:%.*]] = call i64 @foo(i64 [[LOAD]]) #[[ATTR5:[0-9]+]] +; TFA_INTERLEAVE-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, 
ptr [[A:%.*]], i64 [[INDVARS_IV]] +; TFA_INTERLEAVE-NEXT: store i64 [[CALL]], ptr [[ARRAYIDX]], align 8 +; TFA_INTERLEAVE-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 +; TFA_INTERLEAVE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 +; TFA_INTERLEAVE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] +; TFA_INTERLEAVE: for.cond.cleanup: +; TFA_INTERLEAVE-NEXT: ret void +; entry: br label %for.body @@ -475,17 +662,17 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] -; TFNONE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFNONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; TFNONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; TFNONE-NEXT: [[TMP5:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) -; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP5]], ptr [[TMP6]], align 8 -; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; TFNONE-NEXT: [[TMP7:%.*]] = call @foo_vector_nomask( [[WIDE_LOAD]]) +; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: store [[TMP7]], ptr [[TMP8]], align 8 +; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; TFNONE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; TFNONE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; TFNONE: middle.block: @@ -516,19 +703,19 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFALWAYS-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] ; TFALWAYS-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TFALWAYS-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TFALWAYS-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; TFALWAYS-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TFALWAYS-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFALWAYS-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFALWAYS-NEXT: br label [[VECTOR_BODY:%.*]] ; TFALWAYS: vector.body: ; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFALWAYS-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFALWAYS-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) -; TFALWAYS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( 
[[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; TFALWAYS-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFALWAYS-NEXT: [[TMP8:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; TFALWAYS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFALWAYS-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; TFALWAYS-NEXT: [[TMP11:%.*]] = extractelement [[TMP10]], i32 0 @@ -546,19 +733,19 @@ define void @test_widen_optmask(ptr noalias %a, ptr readnone %b) #4 { ; TFFALLBACK-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] ; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TFFALLBACK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; TFFALLBACK-NEXT: [[TMP9:%.*]] = mul i64 [[TMP8]], 2 +; TFFALLBACK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFFALLBACK-NEXT: br label [[VECTOR_BODY:%.*]] ; TFFALLBACK: vector.body: ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFFALLBACK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFFALLBACK-NEXT: [[TMP6:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) -; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP9]] +; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFFALLBACK-NEXT: [[TMP8:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; TFFALLBACK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFFALLBACK-NEXT: [[TMP10:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; TFFALLBACK-NEXT: [[TMP11:%.*]] = extractelement [[TMP10]], i32 0 @@ -566,6 +753,54 @@ define void @test_widen_optmask(ptr noalias %a, ptr 
readnone %b) #4 { ; TFFALLBACK: for.cond.cleanup: ; TFFALLBACK-NEXT: ret void ; +; TFA_INTERLEAVE-LABEL: @test_widen_optmask( +; TFA_INTERLEAVE-NEXT: entry: +; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] +; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025) +; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] +; TFA_INTERLEAVE: vector.body: +; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr i64, ptr [[TMP9]], i64 [[TMP11]] +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2i64.p0(ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK2]], poison) +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = call @foo_vector( [[WIDE_MASKED_LOAD3]], [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = mul i64 [[TMP16]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i64, ptr [[TMP15]], i64 [[TMP17]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP13]], ptr [[TMP15]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP14]], ptr [[TMP18]], i32 8, [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = add i64 [[INDEX_NEXT]], [[TMP20]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call 
@llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP21]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement [[TMP22]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP23]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; TFA_INTERLEAVE: for.cond.cleanup: +; TFA_INTERLEAVE-NEXT: ret void +; entry: br label %for.body @@ -599,30 +834,30 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFNONE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 2 ; TFNONE-NEXT: [[N_MOD_VF:%.*]] = urem i64 1025, [[TMP3]] ; TFNONE-NEXT: [[N_VEC:%.*]] = sub i64 1025, [[N_MOD_VF]] -; TFNONE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; TFNONE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; TFNONE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; TFNONE-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 ; TFNONE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[M:%.*]], i64 0 ; TFNONE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; TFNONE-NEXT: br label [[VECTOR_BODY:%.*]] ; TFNONE: vector.body: ; TFNONE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFNONE-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] -; TFNONE-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP4]], align 8 -; TFNONE-NEXT: [[TMP5:%.*]] = fmul [[WIDE_LOAD]], [[BROADCAST_SPLAT]] -; TFNONE-NEXT: [[TMP6:%.*]] = fptoui [[WIDE_LOAD]] to -; TFNONE-NEXT: [[TMP7:%.*]] = call @foo_vector( [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; TFNONE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFNONE-NEXT: store [[TMP7]], ptr [[TMP8]], align 8 -; TFNONE-NEXT: [[TMP9]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP5]]) -; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; TFNONE-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] +; TFNONE-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP6]], align 8 +; TFNONE-NEXT: [[TMP7:%.*]] = fmul [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; TFNONE-NEXT: [[TMP8:%.*]] = fptoui [[WIDE_LOAD]] to +; TFNONE-NEXT: [[TMP9:%.*]] = call @foo_vector( [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; TFNONE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFNONE-NEXT: store [[TMP9]], ptr [[TMP10]], align 8 +; TFNONE-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP7]]) +; TFNONE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; TFNONE-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; TFNONE-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; TFNONE: middle.block: ; TFNONE-NEXT: br i1 false, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; TFNONE: scalar.ph: ; TFNONE-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] -; TFNONE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP9]], 
[[MIDDLE_BLOCK]] ] +; TFNONE-NEXT: [[BC_MERGE_RDX:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; TFNONE-NEXT: br label [[FOR_BODY:%.*]] ; TFNONE: for.body: ; TFNONE-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] @@ -638,7 +873,7 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFNONE-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], 1025 ; TFNONE-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; TFNONE: for.cond.cleanup: -; TFNONE-NEXT: [[MULADD_LCSSA:%.*]] = phi double [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] +; TFNONE-NEXT: [[MULADD_LCSSA:%.*]] = phi double [ [[MULADD]], [[FOR_BODY]] ], [ [[TMP11]], [[MIDDLE_BLOCK]] ] ; TFNONE-NEXT: ret double [[MULADD_LCSSA]] ; ; TFALWAYS-LABEL: @test_widen_fmuladd_and_call( @@ -651,8 +886,8 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFALWAYS-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] ; TFALWAYS-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TFALWAYS-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TFALWAYS-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; TFALWAYS-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 +; TFALWAYS-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFALWAYS-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFALWAYS-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[M:%.*]], i64 0 ; TFALWAYS-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -660,23 +895,23 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFALWAYS: vector.body: ; TFALWAYS-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFALWAYS-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; TFALWAYS-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]] -; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFALWAYS-NEXT: [[TMP6:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] -; TFALWAYS-NEXT: [[TMP7:%.*]] = fptoui [[WIDE_MASKED_LOAD]] to -; TFALWAYS-NEXT: [[TMP8:%.*]] = call @foo_vector( [[TMP7]], [[ACTIVE_LANE_MASK]]) -; TFALWAYS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFALWAYS-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) -; TFALWAYS-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP10]]) -; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]] +; TFALWAYS-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; TFALWAYS-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]] +; TFALWAYS-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, 
[[ACTIVE_LANE_MASK]], poison) +; TFALWAYS-NEXT: [[TMP8:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; TFALWAYS-NEXT: [[TMP9:%.*]] = fptoui [[WIDE_MASKED_LOAD]] to +; TFALWAYS-NEXT: [[TMP10:%.*]] = call @foo_vector( [[TMP9]], [[ACTIVE_LANE_MASK]]) +; TFALWAYS-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFALWAYS-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP10]], ptr [[TMP11]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFALWAYS-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) +; TFALWAYS-NEXT: [[TMP13]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP12]]) +; TFALWAYS-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; TFALWAYS-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFALWAYS-NEXT: [[TMP14:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; TFALWAYS-NEXT: [[TMP15:%.*]] = extractelement [[TMP14]], i32 0 ; TFALWAYS-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; TFALWAYS: for.cond.cleanup: -; TFALWAYS-NEXT: ret double [[TMP11]] +; TFALWAYS-NEXT: ret double [[TMP13]] ; ; TFFALLBACK-LABEL: @test_widen_fmuladd_and_call( ; TFFALLBACK-NEXT: entry: @@ -688,8 +923,8 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFFALLBACK-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] ; TFFALLBACK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] ; TFFALLBACK-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] -; TFFALLBACK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64() -; TFFALLBACK-NEXT: [[TMP13:%.*]] = mul i64 [[TMP12]], 2 +; TFFALLBACK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFFALLBACK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 2 ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) ; TFFALLBACK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[M:%.*]], i64 0 ; TFFALLBACK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -697,23 +932,82 @@ define double @test_widen_fmuladd_and_call(ptr noalias %a, ptr readnone %b, doub ; TFFALLBACK: vector.body: ; TFFALLBACK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; TFFALLBACK-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] -; TFFALLBACK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; TFFALLBACK-NEXT: [[TMP6:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] -; TFFALLBACK-NEXT: [[TMP7:%.*]] = fptoui [[WIDE_MASKED_LOAD]] to -; TFFALLBACK-NEXT: [[TMP8:%.*]] = call @foo_vector( [[TMP7]], [[ACTIVE_LANE_MASK]]) -; TFFALLBACK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP8]], ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]]) -; TFFALLBACK-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], shufflevector ( insertelement ( 
poison, double -0.000000e+00, i64 0), poison, zeroinitializer) -; TFFALLBACK-NEXT: [[TMP11]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP10]]) -; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP13]] +; TFFALLBACK-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ] +; TFFALLBACK-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFFALLBACK-NEXT: [[TMP8:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; TFFALLBACK-NEXT: [[TMP9:%.*]] = fptoui [[WIDE_MASKED_LOAD]] to +; TFFALLBACK-NEXT: [[TMP10:%.*]] = call @foo_vector( [[TMP9]], [[ACTIVE_LANE_MASK]]) +; TFFALLBACK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFFALLBACK-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP10]], ptr [[TMP11]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFFALLBACK-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP8]], shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) +; TFFALLBACK-NEXT: [[TMP13]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP12]]) +; TFFALLBACK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP6]] ; TFFALLBACK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) ; TFFALLBACK-NEXT: [[TMP14:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) ; TFFALLBACK-NEXT: [[TMP15:%.*]] = extractelement [[TMP14]], i32 0 ; TFFALLBACK-NEXT: br i1 [[TMP15]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; TFFALLBACK: for.cond.cleanup: -; TFFALLBACK-NEXT: ret double [[TMP11]] +; TFFALLBACK-NEXT: ret double [[TMP13]] +; +; TFA_INTERLEAVE-LABEL: @test_widen_fmuladd_and_call( +; TFA_INTERLEAVE-NEXT: entry: +; TFA_INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP4:%.*]] = sub i64 [[TMP3]], 1 +; TFA_INTERLEAVE-NEXT: [[N_RND_UP:%.*]] = add i64 1025, [[TMP4]] +; TFA_INTERLEAVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] +; TFA_INTERLEAVE-NEXT: [[N_VEC:%.*]] = sub i64 [[N_RND_UP]], [[N_MOD_VF]] +; TFA_INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 4 +; TFA_INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 2 +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP8]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, double [[M:%.*]], i64 0 +; TFA_INTERLEAVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; TFA_INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] +; TFA_INTERLEAVE: vector.body: +; TFA_INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT4:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ 
[[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi double [ 0.000000e+00, [[ENTRY]] ], [ [[TMP26:%.*]], [[VECTOR_BODY]] ] +; TFA_INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP11:%.*]] = mul i64 [[TMP10]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP9]], i64 [[TMP11]] +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP9]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; TFA_INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP12]], i32 8, [[ACTIVE_LANE_MASK2]], poison) +; TFA_INTERLEAVE-NEXT: [[TMP13:%.*]] = fmul [[WIDE_MASKED_LOAD]], [[BROADCAST_SPLAT]] +; TFA_INTERLEAVE-NEXT: [[TMP14:%.*]] = fmul [[WIDE_MASKED_LOAD3]], [[BROADCAST_SPLAT]] +; TFA_INTERLEAVE-NEXT: [[TMP15:%.*]] = fptoui [[WIDE_MASKED_LOAD]] to +; TFA_INTERLEAVE-NEXT: [[TMP16:%.*]] = fptoui [[WIDE_MASKED_LOAD3]] to +; TFA_INTERLEAVE-NEXT: [[TMP17:%.*]] = call @foo_vector( [[TMP15]], [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: [[TMP18:%.*]] = call @foo_vector( [[TMP16]], [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] +; TFA_INTERLEAVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i64, ptr [[TMP19]], i64 [[TMP21]] +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP17]], ptr [[TMP19]], i32 8, [[ACTIVE_LANE_MASK]]) +; TFA_INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2i64.p0( [[TMP18]], ptr [[TMP22]], i32 8, [[ACTIVE_LANE_MASK2]]) +; TFA_INTERLEAVE-NEXT: [[TMP23:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP13]], shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP24:%.*]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[VEC_PHI]], [[TMP23]]) +; TFA_INTERLEAVE-NEXT: [[TMP25:%.*]] = select [[ACTIVE_LANE_MASK2]], [[TMP14]], shufflevector ( insertelement ( poison, double -0.000000e+00, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP26]] = call double @llvm.vector.reduce.fadd.nxv2f64(double [[TMP24]], [[TMP25]]) +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT:%.*]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[INDEX_NEXT4]] = add i64 [[INDEX]], [[TMP6]] +; TFA_INTERLEAVE-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64() +; TFA_INTERLEAVE-NEXT: [[TMP28:%.*]] = mul i64 [[TMP27]], 2 +; TFA_INTERLEAVE-NEXT: [[TMP29:%.*]] = add i64 [[INDEX_NEXT]], [[TMP28]] +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX_NEXT]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP29]], i64 1025) +; TFA_INTERLEAVE-NEXT: [[TMP30:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) +; TFA_INTERLEAVE-NEXT: [[TMP31:%.*]] = extractelement [[TMP30]], i32 0 +; TFA_INTERLEAVE-NEXT: br i1 [[TMP31]], label [[FOR_COND_CLEANUP:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; TFA_INTERLEAVE: for.cond.cleanup: +; TFA_INTERLEAVE-NEXT: ret 
double [[TMP26]] ; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll index bda20ae18bc96..17edfe513dd01 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/uniform-args-call-variants.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 ; RUN: opt < %s -passes=loop-vectorize,simplifycfg,instcombine -force-vector-interleave=1 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s +; RUN: opt < %s -passes=loop-vectorize,simplifycfg,instcombine -force-vector-interleave=2 -prefer-predicate-over-epilogue=predicate-dont-vectorize -S | FileCheck %s --check-prefix=INTERLEAVE target triple = "aarch64-unknown-linux-gnu" @@ -9,28 +10,73 @@ define void @test_uniform(ptr noalias %dst, ptr readonly %src, i64 %uniform , i6 ; CHECK-LABEL: define void @test_uniform ; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP4:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP4]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP2]]) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP6:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 
0 ; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; INTERLEAVE-LABEL: define void @test_uniform +; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; INTERLEAVE-NEXT: entry: +; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 2 +; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP6]]) +; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1 +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]]) +; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] +; INTERLEAVE: vector.body: +; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 1 +; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i64 [[TMP12]] +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK2]], poison) +; INTERLEAVE-NEXT: [[TMP14:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: [[TMP15:%.*]] = call @foo_uniform( [[WIDE_MASKED_LOAD3]], i64 [[UNIFORM]], [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP18:%.*]] = shl i64 [[TMP17]], 1 +; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i64 [[TMP18]] +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP14]], ptr [[TMP16]], i32 8, [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP15]], ptr [[TMP19]], i32 8, [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; INTERLEAVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP21:%.*]] = shl i64 [[TMP20]], 1 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP22]], i64 
[[TMP7]]) +; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; INTERLEAVE-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; INTERLEAVE: for.cond.cleanup: +; INTERLEAVE-NEXT: ret void +; entry: br label %for.body @@ -53,28 +99,73 @@ define void @test_uniform_smaller_scalar(ptr noalias %dst, ptr readonly %src, i3 ; CHECK-LABEL: define void @test_uniform_smaller_scalar ; CHECK-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[TMP6]], 1 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[TMP2]], 1 +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) ; CHECK-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP3]], i32 8, [[ACTIVE_LANE_MASK]], poison) -; CHECK-NEXT: [[TMP4:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] -; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP4]], ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]]) -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP7]] -; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP2]]) +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP5]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; CHECK-NEXT: [[TMP6:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; CHECK-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP6]], ptr [[TMP7]], i32 8, [[ACTIVE_LANE_MASK]]) +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) ; CHECK-NEXT: [[TMP8:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 ; CHECK-NEXT: br i1 [[TMP8]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; INTERLEAVE-LABEL: define void @test_uniform_smaller_scalar +; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i32 [[UNIFORM:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; INTERLEAVE-NEXT: entry: +; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP1:%.*]] = shl i64 [[TMP0]], 2 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP3:%.*]] = shl i64 
[[TMP2]], 2 +; INTERLEAVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP3]]) +; INTERLEAVE-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP6:%.*]] = shl i64 [[TMP5]], 2 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 [[TMP6]]) +; INTERLEAVE-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP9:%.*]] = shl i64 [[TMP8]], 1 +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 [[N]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP9]], i64 [[N]]) +; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] +; INTERLEAVE: vector.body: +; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT4:%.*]], [[VECTOR_BODY]] ] +; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP12:%.*]] = shl i64 [[TMP11]], 1 +; INTERLEAVE-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP10]], i64 [[TMP12]] +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP10]], i32 8, [[ACTIVE_LANE_MASK]], poison) +; INTERLEAVE-NEXT: [[WIDE_MASKED_LOAD3:%.*]] = call @llvm.masked.load.nxv2f64.p0(ptr [[TMP13]], i32 8, [[ACTIVE_LANE_MASK2]], poison) +; INTERLEAVE-NEXT: [[TMP14:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: [[TMP15:%.*]] = call @bar_uniform( [[WIDE_MASKED_LOAD3]], i32 [[UNIFORM]], [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP18:%.*]] = shl i64 [[TMP17]], 1 +; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds double, ptr [[TMP16]], i64 [[TMP18]] +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP14]], ptr [[TMP16]], i32 8, [[ACTIVE_LANE_MASK]]) +; INTERLEAVE-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP15]], ptr [[TMP19]], i32 8, [[ACTIVE_LANE_MASK2]]) +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] +; INTERLEAVE-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; INTERLEAVE-NEXT: [[TMP21:%.*]] = shl i64 [[TMP20]], 1 +; INTERLEAVE-NEXT: [[TMP22:%.*]] = add i64 [[INDEX]], [[TMP21]] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP4]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT4]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[TMP22]], i64 [[TMP7]]) +; INTERLEAVE-NEXT: [[TMP23:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; INTERLEAVE-NEXT: br i1 [[TMP23]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP3:![0-9]+]] +; INTERLEAVE: for.cond.cleanup: +; INTERLEAVE-NEXT: ret void +; entry: br label %for.body @@ -112,6 +203,48 @@ define void @test_uniform_not_invariant(ptr noalias %dst, ptr readonly %src, i64 ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void ; +; INTERLEAVE-LABEL: define void @test_uniform_not_invariant +; INTERLEAVE-SAME: (ptr noalias [[DST:%.*]], ptr readonly [[SRC:%.*]], i64 
[[N:%.*]]) #[[ATTR0]] { +; INTERLEAVE-NEXT: entry: +; INTERLEAVE-NEXT: [[TMP0:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 2) +; INTERLEAVE-NEXT: [[TMP1:%.*]] = call i64 @llvm.usub.sat.i64(i64 [[N]], i64 2) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 0, i64 [[N]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_ENTRY1:%.*]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 1, i64 [[N]]) +; INTERLEAVE-NEXT: br label [[VECTOR_BODY:%.*]] +; INTERLEAVE: vector.body: +; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[PRED_STORE_CONTINUE4:%.*]] ] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi <1 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[PRED_STORE_CONTINUE4]] ] +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK2:%.*]] = phi <1 x i1> [ [[ACTIVE_LANE_MASK_ENTRY1]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT5:%.*]], [[PRED_STORE_CONTINUE4]] ] +; INTERLEAVE-NEXT: [[TMP2:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK]], i64 0 +; INTERLEAVE-NEXT: br i1 [[TMP2]], label [[PRED_STORE_IF:%.*]], label [[PRED_STORE_CONTINUE:%.*]] +; INTERLEAVE: pred.store.if: +; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[SRC]], i64 [[INDEX]] +; INTERLEAVE-NEXT: [[TMP4:%.*]] = load double, ptr [[TMP3]], align 8 +; INTERLEAVE-NEXT: [[TMP5:%.*]] = call double @foo(double [[TMP4]], i64 [[INDEX]]) #[[ATTR5:[0-9]+]] +; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[INDEX]] +; INTERLEAVE-NEXT: store double [[TMP5]], ptr [[TMP6]], align 8 +; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE]] +; INTERLEAVE: pred.store.continue: +; INTERLEAVE-NEXT: [[TMP7:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK2]], i64 0 +; INTERLEAVE-NEXT: br i1 [[TMP7]], label [[PRED_STORE_IF3:%.*]], label [[PRED_STORE_CONTINUE4]] +; INTERLEAVE: pred.store.if3: +; INTERLEAVE-NEXT: [[TMP8:%.*]] = or disjoint i64 [[INDEX]], 1 +; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[SRC]], i64 [[TMP8]] +; INTERLEAVE-NEXT: [[TMP10:%.*]] = load double, ptr [[TMP9]], align 8 +; INTERLEAVE-NEXT: [[TMP11:%.*]] = call double @foo(double [[TMP10]], i64 [[TMP8]]) #[[ATTR5]] +; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[DST]], i64 [[TMP8]] +; INTERLEAVE-NEXT: store double [[TMP11]], ptr [[TMP12]], align 8 +; INTERLEAVE-NEXT: br label [[PRED_STORE_CONTINUE4]] +; INTERLEAVE: pred.store.continue4: +; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 2 +; INTERLEAVE-NEXT: [[TMP13:%.*]] = or disjoint i64 [[INDEX]], 1 +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 [[INDEX]], i64 [[TMP0]]) +; INTERLEAVE-NEXT: [[ACTIVE_LANE_MASK_NEXT5]] = call <1 x i1> @llvm.get.active.lane.mask.v1i1.i64(i64 [[TMP13]], i64 [[TMP1]]) +; INTERLEAVE-NEXT: [[TMP14:%.*]] = extractelement <1 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; INTERLEAVE-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; INTERLEAVE: for.cond.cleanup: +; INTERLEAVE-NEXT: ret void +; entry: br label %for.body diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll index 16506b3a57573..29440ca174248 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-call-linear-args.ll @@ -1,7 +1,10 @@ -; NOTE: Assertions have 
been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(foo|bar|baz|quux|goo)" --version 2 +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --filter "call.*(foo|bar|baz|quux|goo)|extractelement" --version 2 ; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=NEON +; RUN: opt < %s -passes=loop-vectorize -force-vector-interleave=2 -S | FileCheck %s --check-prefixes=NEON_INTERLEAVE ; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=SVE_OR_NEON +; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=2 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s --check-prefixes=SVE_OR_NEON_INTERLEAVE ; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=1 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s --check-prefixes=SVE_TF +; RUN: opt < %s -mattr=+sve -passes=loop-vectorize -force-vector-interleave=2 -S -prefer-predicate-over-epilogue=predicate-dont-vectorize | FileCheck %s --check-prefixes=SVE_TF_INTERLEAVE target triple = "aarch64-unknown-linux-gnu" @@ -10,19 +13,46 @@ target triple = "aarch64-unknown-linux-gnu" define void @test_linear8(ptr noalias %a, ptr readnone %b, i64 %n) { ; NEON-LABEL: define void @test_linear8 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { -; NEON: [[TMP3:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP2:%.*]]) +; NEON: [[TMP2:%.*]] = extractelement <2 x ptr> [[TMP1:%.*]], i32 0 +; NEON: [[TMP3:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP2]]) ; NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR0:[0-9]+]] ; +; NEON_INTERLEAVE-LABEL: define void @test_linear8 +; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { +; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP2:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP5:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = call <2 x i64> @vec_foo_linear8_nomask_neon(ptr [[TMP4]]) +; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR0:[0-9]+]] +; ; SVE_OR_NEON-LABEL: define void @test_linear8 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_foo_linear8_nomask_sve(ptr [[TMP14:%.*]]) +; SVE_OR_NEON: [[TMP14:%.*]] = extractelement [[TMP13:%.*]], i32 0 +; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_foo_linear8_nomask_sve(ptr [[TMP14]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]] ; +; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear8 +; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; SVE_OR_NEON_INTERLEAVE: [[TMP33:%.*]] = extractelement [[TMP31:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP34:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP47:%.*]] = extractelement [[TMP45:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] +; ; SVE_TF-LABEL: define void @test_linear8 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { -; SVE_TF: 
[[TMP21:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP20:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF: [[TMP20:%.*]] = extractelement [[TMP19:%.*]], i32 0 +; SVE_TF: [[TMP21:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP20]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF: [[TMP25:%.*]] = extractelement [[TMP24:%.*]], i32 0 ; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] ; +; SVE_TF_INTERLEAVE-LABEL: define void @test_linear8 +; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; SVE_TF_INTERLEAVE: [[TMP33:%.*]] = extractelement [[TMP31:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP34:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = call @vec_foo_linear8_mask_sve(ptr [[TMP33]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP47:%.*]] = extractelement [[TMP45:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] +; entry: br label %for.body @@ -43,18 +73,35 @@ for.cond.cleanup: define void @test_vector_linear4(ptr noalias %a, ptr readnone %b, ptr readonly %c, i64 %n) { ; NEON-LABEL: define void @test_vector_linear4 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) { -; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]]) +; NEON: [[TMP4:%.*]] = extractelement <4 x ptr> [[TMP3:%.*]], i32 0 +; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP4]]) ; NEON: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]] ; +; NEON_INTERLEAVE-LABEL: define void @test_vector_linear4 +; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) { +; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <4 x ptr> [[TMP6:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD:%.*]], ptr [[TMP8]]) +; NEON_INTERLEAVE: [[TMP10:%.*]] = call <4 x i32> @vec_baz_vector_linear4_nomask_neon(<4 x i32> [[WIDE_LOAD2:%.*]], ptr [[TMP8]]) +; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR1:[0-9]+]] +; ; SVE_OR_NEON-LABEL: define void @test_vector_linear4 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP17:%.*]] = call @vec_baz_vector_linear4_nomask_sve( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]]) +; SVE_OR_NEON: [[TMP16:%.*]] = extractelement [[TMP15:%.*]], i32 0 +; SVE_OR_NEON: [[TMP17:%.*]] = call @vec_baz_vector_linear4_nomask_sve( [[WIDE_LOAD:%.*]], ptr [[TMP16]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; +; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_vector_linear4 +; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] +; ; SVE_TF-LABEL: define void @test_vector_linear4 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_TF: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] ; +; SVE_TF_INTERLEAVE-LABEL: define void @test_vector_linear4 +; 
SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @baz(i32 [[INPUT:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] +; entry: br label %for.body @@ -79,14 +126,28 @@ define void @test_linear8_bad_stride(ptr noalias %a, ptr readnone %b, i64 %n) { ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { ; NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2:[0-9]+]] ; +; NEON_INTERLEAVE-LABEL: define void @test_linear8_bad_stride +; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { +; NEON_INTERLEAVE: [[TMP4:%.*]] = call i64 @foo(ptr [[TMP2:%.*]]) #[[ATTR2:[0-9]+]] +; NEON_INTERLEAVE: [[TMP5:%.*]] = call i64 @foo(ptr [[TMP3:%.*]]) #[[ATTR2]] +; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]] +; ; SVE_OR_NEON-LABEL: define void @test_linear8_bad_stride ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4:[0-9]+]] ; +; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear8_bad_stride +; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]] +; ; SVE_TF-LABEL: define void @test_linear8_bad_stride ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]] ; +; SVE_TF_INTERLEAVE-LABEL: define void @test_linear8_bad_stride +; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6:[0-9]+]] +; entry: br label %for.body @@ -107,18 +168,35 @@ for.cond.cleanup: define void @test_linear16_wide_stride(ptr noalias %a, ptr readnone %b, i64 %n) { ; NEON-LABEL: define void @test_linear16_wide_stride ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { -; NEON: [[TMP4:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP3:%.*]]) +; NEON: [[TMP3:%.*]] = extractelement <2 x ptr> [[TMP2:%.*]], i32 0 +; NEON: [[TMP4:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP3]]) ; NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]] ; +; NEON_INTERLEAVE-LABEL: define void @test_linear16_wide_stride +; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) { +; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <2 x ptr> [[TMP4:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP7:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]]) +; NEON_INTERLEAVE: [[TMP8:%.*]] = call <2 x i64> @vec_foo_linear16_nomask_neon(ptr [[TMP6]]) +; NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR2]] +; ; SVE_OR_NEON-LABEL: define void @test_linear16_wide_stride ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP16:%.*]] = call @vec_foo_linear16_nomask_sve(ptr [[TMP15:%.*]]) +; SVE_OR_NEON: [[TMP15:%.*]] = extractelement [[TMP14:%.*]], i32 0 +; SVE_OR_NEON: [[TMP16:%.*]] = call @vec_foo_linear16_nomask_sve(ptr [[TMP15]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR4]] ; +; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear16_wide_stride +; 
SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6]] +; ; SVE_TF-LABEL: define void @test_linear16_wide_stride ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_TF: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6]] ; +; SVE_TF_INTERLEAVE-LABEL: define void @test_linear16_wide_stride +; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i64 @foo(ptr [[GEPB:%.*]]) #[[ATTR6]] +; entry: br label %for.body @@ -140,19 +218,52 @@ for.cond.cleanup: define void @test_linear4_linear8(ptr noalias %a, ptr readnone %b, ptr readonly %c, i64 %n) { ; NEON-LABEL: define void @test_linear4_linear8 ; NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) { -; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP3:%.*]], ptr [[TMP4:%.*]]) +; NEON: [[TMP3:%.*]] = extractelement <4 x ptr> [[TMP1:%.*]], i32 0 +; NEON: [[TMP4:%.*]] = extractelement <4 x ptr> [[TMP2:%.*]], i32 0 +; NEON: [[TMP5:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP3]], ptr [[TMP4]]) ; NEON: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] ; +; NEON_INTERLEAVE-LABEL: define void @test_linear4_linear8 +; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) { +; NEON_INTERLEAVE: [[TMP6:%.*]] = extractelement <4 x ptr> [[TMP2:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP7:%.*]] = extractelement <4 x ptr> [[TMP4:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP8:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]]) +; NEON_INTERLEAVE: [[TMP9:%.*]] = call <4 x i32> @vec_quux_linear4_linear8_nomask_neon(ptr [[TMP6]], ptr [[TMP7]]) +; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR3:[0-9]+]] +; ; SVE_OR_NEON-LABEL: define void @test_linear4_linear8 ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP17:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP15:%.*]], ptr [[TMP16:%.*]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; SVE_OR_NEON: [[TMP15:%.*]] = extractelement [[TMP13:%.*]], i32 0 +; SVE_OR_NEON: [[TMP16:%.*]] = extractelement [[TMP14:%.*]], i32 0 +; SVE_OR_NEON: [[TMP17:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP15]], ptr [[TMP16]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR5:[0-9]+]] ; +; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear4_linear8 +; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_OR_NEON_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP31:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP36:%.*]] = extractelement [[TMP33:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[TMP37:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_OR_NEON_INTERLEAVE: [[TMP38:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_OR_NEON_INTERLEAVE: 
[[TMP50:%.*]] = extractelement [[TMP48:%.*]], i32 0 +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] +; ; SVE_TF-LABEL: define void @test_linear4_linear8 ; SVE_TF-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_TF: [[TMP23:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP21:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF: [[TMP21:%.*]] = extractelement [[TMP19:%.*]], i32 0 +; SVE_TF: [[TMP22:%.*]] = extractelement [[TMP20:%.*]], i32 0 +; SVE_TF: [[TMP23:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP21]], ptr [[TMP22]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF: [[TMP27:%.*]] = extractelement [[TMP26:%.*]], i32 0 ; SVE_TF: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] ; +; SVE_TF_INTERLEAVE-LABEL: define void @test_linear4_linear8 +; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], ptr readnone [[B:%.*]], ptr readonly [[C:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_TF_INTERLEAVE: [[TMP35:%.*]] = extractelement [[TMP31:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP36:%.*]] = extractelement [[TMP33:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[TMP37:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP38:%.*]] = call @vec_quux_linear4_linear8_mask_sve(ptr [[TMP35]], ptr [[TMP36]], [[ACTIVE_LANE_MASK2:%.*]]) +; SVE_TF_INTERLEAVE: [[TMP50:%.*]] = extractelement [[TMP48:%.*]], i32 0 +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @quux(ptr [[GEPC:%.*]], ptr [[GEPB:%.*]]) #[[ATTR7:[0-9]+]] +; entry: br label %for.body @@ -174,18 +285,35 @@ for.cond.cleanup: define void @test_linear3_non_ptr(ptr noalias %a, i64 %n) { ; NEON-LABEL: define void @test_linear3_non_ptr ; NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) { -; NEON: [[TMP3:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP2:%.*]]) +; NEON: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1:%.*]], i32 0 +; NEON: [[TMP3:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP2]]) ; NEON: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR4:[0-9]+]] ; +; NEON_INTERLEAVE-LABEL: define void @test_linear3_non_ptr +; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) { +; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linear3_nomask_neon(i32 [[TMP4]]) +; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR4:[0-9]+]] +; ; SVE_OR_NEON-LABEL: define void @test_linear3_non_ptr ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_bar_linear3_nomask_sve(i32 [[TMP14:%.*]]) +; SVE_OR_NEON: [[TMP14:%.*]] = extractelement [[TMP13:%.*]], i32 0 +; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_bar_linear3_nomask_sve(i32 [[TMP14]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR6:[0-9]+]] ; +; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear3_non_ptr +; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR8:[0-9]+]] +; ; SVE_TF-LABEL: define void @test_linear3_non_ptr ; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) 
#[[ATTR8:[0-9]+]] ; +; SVE_TF_INTERLEAVE-LABEL: define void @test_linear3_non_ptr +; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[TREBLED:%.*]]) #[[ATTR8:[0-9]+]] +; entry: br label %for.body @@ -207,18 +335,35 @@ for.cond.cleanup: define void @test_linearn5_non_ptr_neg_stride(ptr noalias %a, i64 %n) { ; NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride ; NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) { -; NEON: [[TMP3:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP2:%.*]]) +; NEON: [[TMP2:%.*]] = extractelement <4 x i32> [[TMP1:%.*]], i32 0 +; NEON: [[TMP3:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP2]]) ; NEON: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR5:[0-9]+]] ; +; NEON_INTERLEAVE-LABEL: define void @test_linearn5_non_ptr_neg_stride +; NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) { +; NEON_INTERLEAVE: [[TMP4:%.*]] = extractelement <4 x i32> [[TMP2:%.*]], i32 0 +; NEON_INTERLEAVE: [[TMP5:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]]) +; NEON_INTERLEAVE: [[TMP6:%.*]] = call <4 x i32> @vec_bar_linearn5_nomask_neon(i32 [[TMP4]]) +; NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR5:[0-9]+]] +; ; SVE_OR_NEON-LABEL: define void @test_linearn5_non_ptr_neg_stride ; SVE_OR_NEON-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_bar_linearn5_nomask_sve(i32 [[TMP14:%.*]]) +; SVE_OR_NEON: [[TMP14:%.*]] = extractelement [[TMP13:%.*]], i32 0 +; SVE_OR_NEON: [[TMP15:%.*]] = call @vec_bar_linearn5_nomask_sve(i32 [[TMP14]]) ; SVE_OR_NEON: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR7:[0-9]+]] ; +; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linearn5_non_ptr_neg_stride +; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_OR_NEON_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR9:[0-9]+]] +; ; SVE_TF-LABEL: define void @test_linearn5_non_ptr_neg_stride ; SVE_TF-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { ; SVE_TF: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR9:[0-9]+]] ; +; SVE_TF_INTERLEAVE-LABEL: define void @test_linearn5_non_ptr_neg_stride +; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[A:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { +; SVE_TF_INTERLEAVE: [[DATA:%.*]] = call i32 @bar(i32 [[NEGSTRIDE:%.*]]) #[[ATTR9:[0-9]+]] +; entry: br label %for.body @@ -240,19 +385,46 @@ for.cond.cleanup: define void @test_linear8_return_void(ptr noalias %in, ptr noalias %out, i64 %n) { ; NEON-LABEL: define void @test_linear8_return_void ; NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) { -; NEON: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP4:%.*]]) +; NEON: [[TMP4:%.*]] = extractelement <2 x ptr> [[TMP3:%.*]], i32 0 +; NEON: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP4]]) ; NEON: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]] ; +; NEON_INTERLEAVE-LABEL: define void @test_linear8_return_void +; NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) { +; NEON_INTERLEAVE: [[TMP8:%.*]] = extractelement <2 x ptr> [[TMP6:%.*]], i32 0 +; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD:%.*]], ptr [[TMP8]]) +; NEON_INTERLEAVE: call void @vec_goo_linear8_nomask_neon(<2 x i64> [[WIDE_LOAD2:%.*]], ptr 
[[TMP8]])
+; NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR6:[0-9]+]]
+;
 ; SVE_OR_NEON-LABEL: define void @test_linear8_return_void
 ; SVE_OR_NEON-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_OR_NEON: call void @vec_goo_linear8_nomask_sve( [[WIDE_LOAD:%.*]], ptr [[TMP16:%.*]])
+; SVE_OR_NEON: [[TMP16:%.*]] = extractelement [[TMP15:%.*]], i32 0
+; SVE_OR_NEON: call void @vec_goo_linear8_nomask_sve( [[WIDE_LOAD:%.*]], ptr [[TMP16]])
 ; SVE_OR_NEON: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR8:[0-9]+]]
 ;
+; SVE_OR_NEON_INTERLEAVE-LABEL: define void @test_linear8_return_void
+; SVE_OR_NEON_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_OR_NEON_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP37:%.*]], i32 0
+; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK:%.*]])
+; SVE_OR_NEON_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]])
+; SVE_OR_NEON_INTERLEAVE: [[TMP45:%.*]] = extractelement [[TMP43:%.*]], i32 0
+; SVE_OR_NEON_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]]
+;
 ; SVE_TF-LABEL: define void @test_linear8_return_void
 ; SVE_TF-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
-; SVE_TF: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22:%.*]], [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[TMP22:%.*]] = extractelement [[TMP21:%.*]], i32 0
+; SVE_TF: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP22]], [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF: [[TMP24:%.*]] = extractelement [[TMP23:%.*]], i32 0
 ; SVE_TF: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]]
 ;
+; SVE_TF_INTERLEAVE-LABEL: define void @test_linear8_return_void
+; SVE_TF_INTERLEAVE-SAME: (ptr noalias [[IN:%.*]], ptr noalias [[OUT:%.*]], i64 [[N:%.*]]) #[[ATTR0]] {
+; SVE_TF_INTERLEAVE: [[TMP39:%.*]] = extractelement [[TMP37:%.*]], i32 0
+; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK:%.*]])
+; SVE_TF_INTERLEAVE: call void @vec_goo_linear8_mask_sve( [[WIDE_MASKED_LOAD4:%.*]], ptr [[TMP39]], [[ACTIVE_LANE_MASK2:%.*]])
+; SVE_TF_INTERLEAVE: [[TMP45:%.*]] = extractelement [[TMP43:%.*]], i32 0
+; SVE_TF_INTERLEAVE: call void @goo(i64 [[NUM:%.*]], ptr [[GEP_OUT:%.*]]) #[[ATTR10:[0-9]+]]
+;
 entry:
   br label %for.body

From 1ab418beb3cc9c31ebb2d5779069426d761ceb8f Mon Sep 17 00:00:00 2001
From: Sam McCall
Date: Fri, 19 Jan 2024 14:02:04 +0100
Subject: [PATCH 120/843] [Tooling] Fix FixedCompilationDatabase with header
 compile flags (#73913)

Summary:
The logic to strip positional args feels very fragile, but it's
terribly useful when you want to use a tool on a file and have the
exact argv. Today this does not work with header-parsing actions,
because those are "precompile" rather than "compile" actions, even
though from tooling's perspective they are all the same.
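For illustration only, a minimal sketch of the scenario this fixes,
using the public clang::tooling entry points exercised by the new
unit test below; the tool and file names are hypothetical:

  // Hypothetical driver, e.g. invoked as:
  //   my-tool somefile.h -- -xc++-header -c somefile.h -DDEF3
  #include "clang/Tooling/CompilationDatabase.h"
  #include "llvm/ADT/StringExtras.h"
  #include "llvm/Support/raw_ostream.h"

  int main(int argc, const char **argv) {
    std::string ErrorMsg;
    std::unique_ptr<clang::tooling::FixedCompilationDatabase> DB =
        clang::tooling::FixedCompilationDatabase::loadFromCommandLine(
            argc, argv, ErrorMsg);
    if (!DB) {
      llvm::errs() << ErrorMsg << "\n";
      return 1;
    }
    // Previously the header job's flags (e.g. -xc++-header) were lost
    // because a header compile is a "precompile" action; with this fix
    // the command line survives positional-arg stripping intact.
    for (const auto &Cmd : DB->getCompileCommands("somefile.h"))
      llvm::errs() << llvm::join(Cmd.CommandLine, " ") << "\n";
    return 0;
  }
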
Reviewers: kadircet Subscribers: --- clang/lib/Tooling/CompilationDatabase.cpp | 4 +++- .../Tooling/CompilationDatabaseTest.cpp | 20 ++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/clang/lib/Tooling/CompilationDatabase.cpp b/clang/lib/Tooling/CompilationDatabase.cpp index 87ad8f25a1ab4..9d8f0d03a7d5c 100644 --- a/clang/lib/Tooling/CompilationDatabase.cpp +++ b/clang/lib/Tooling/CompilationDatabase.cpp @@ -156,6 +156,7 @@ struct CompileJobAnalyzer { bool CollectChildren = Collect; switch (A->getKind()) { case driver::Action::CompileJobClass: + case driver::Action::PrecompileJobClass: CollectChildren = true; break; @@ -293,7 +294,8 @@ static bool stripPositionalArgs(std::vector Args, // -flto* flags make the BackendJobClass, which still needs analyzer. if (Cmd.getSource().getKind() == driver::Action::AssembleJobClass || Cmd.getSource().getKind() == driver::Action::BackendJobClass || - Cmd.getSource().getKind() == driver::Action::CompileJobClass) { + Cmd.getSource().getKind() == driver::Action::CompileJobClass || + Cmd.getSource().getKind() == driver::Action::PrecompileJobClass) { CompileAnalyzer.run(&Cmd.getSource()); } } diff --git a/clang/unittests/Tooling/CompilationDatabaseTest.cpp b/clang/unittests/Tooling/CompilationDatabaseTest.cpp index 5173d472486bf..45062cf7c16f6 100644 --- a/clang/unittests/Tooling/CompilationDatabaseTest.cpp +++ b/clang/unittests/Tooling/CompilationDatabaseTest.cpp @@ -647,12 +647,30 @@ TEST(ParseFixedCompilationDatabase, HandlesPositionalArgs) { FixedCompilationDatabase::loadFromCommandLine(Argc, Argv, ErrorMsg); ASSERT_TRUE((bool)Database); ASSERT_TRUE(ErrorMsg.empty()); + std::vector Result = Database->getCompileCommands("source"); + ASSERT_EQ(1ul, Result.size()); + ASSERT_EQ(".", Result[0].Directory); + ASSERT_THAT(Result[0].CommandLine, + ElementsAre(EndsWith("clang-tool"), "-c", "-DDEF3", "source")); + EXPECT_EQ(2, Argc); +} + +TEST(ParseFixedCompilationDatabase, HandlesPositionalArgsHeader) { + const char *Argv[] = {"1", "2", "--", "-xc++-header", + "-c", "somefile.h", "-DDEF3"}; + int Argc = std::size(Argv); + std::string ErrorMsg; + std::unique_ptr Database = + FixedCompilationDatabase::loadFromCommandLine(Argc, Argv, ErrorMsg); + ASSERT_TRUE((bool)Database); + ASSERT_TRUE(ErrorMsg.empty()); std::vector Result = Database->getCompileCommands("source"); ASSERT_EQ(1ul, Result.size()); ASSERT_EQ(".", Result[0].Directory); ASSERT_THAT(Result[0].CommandLine, - ElementsAre(EndsWith("clang-tool"), "-c", "-DDEF3", "source")); + ElementsAre(EndsWith("clang-tool"), "-xc++-header", "-c", + "-DDEF3", "source")); EXPECT_EQ(2, Argc); } From 9ad7d8f0e4628f0f2d70a6c30299fe8be6bc18c4 Mon Sep 17 00:00:00 2001 From: Danila Malyutin Date: Fri, 19 Jan 2024 17:15:36 +0400 Subject: [PATCH 121/843] [Statepoint] Optimize Location structure size (#78600) Reduce its size from 24 to 12 bytes. Improves memory consumption when dealing with statepoint-heavy code. 
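For illustration, a standalone sketch of the size arithmetic; the
struct names are illustrative, padding is ABI-dependent, and the
asserts assume a common 64-bit target:

  #include <cstdint>

  // Old layout: int-backed enum plus two 4-byte fields and an 8-byte
  // offset; 8-byte alignment pads the struct to 24 bytes.
  struct LocationOld {
    int Type;
    unsigned Size;
    unsigned Reg;
    int64_t Offset;
  };

  // New layout: three 2-byte fields and a 4-byte offset; 4-byte
  // alignment pads the struct to 12 bytes, halving each record.
  struct LocationNew {
    unsigned short Type;
    unsigned short Size;
    unsigned short Reg;
    int32_t Offset;
  };

  static_assert(sizeof(LocationOld) == 24, "assumes a common 64-bit ABI");
  static_assert(sizeof(LocationNew) == 12, "assumes a common 64-bit ABI");
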
--- llvm/include/llvm/CodeGen/StackMaps.h | 13 +++--- llvm/lib/CodeGen/StackMaps.cpp | 40 ++++++++----------- .../CodeGen/X86/statepoint-fixup-undef.mir | 9 ++--- 3 files changed, 28 insertions(+), 34 deletions(-) diff --git a/llvm/include/llvm/CodeGen/StackMaps.h b/llvm/include/llvm/CodeGen/StackMaps.h index 467e31f17bc82..ca5dfa5666003 100644 --- a/llvm/include/llvm/CodeGen/StackMaps.h +++ b/llvm/include/llvm/CodeGen/StackMaps.h @@ -259,7 +259,7 @@ class StatepointOpers { class StackMaps { public: struct Location { - enum LocationType { + enum LocationType : unsigned short { Unprocessed, Register, Direct, @@ -268,12 +268,13 @@ class StackMaps { ConstantIndex }; LocationType Type = Unprocessed; - unsigned Size = 0; - unsigned Reg = 0; - int64_t Offset = 0; + unsigned short Size = 0; + unsigned short Reg = 0; + int32_t Offset = 0; Location() = default; - Location(LocationType Type, unsigned Size, unsigned Reg, int64_t Offset) + Location(LocationType Type, unsigned short Size, unsigned short Reg, + int32_t Offset) : Type(Type), Size(Size), Reg(Reg), Offset(Offset) {} }; @@ -369,7 +370,7 @@ class StackMaps { MachineInstr::const_mop_iterator parseOperand(MachineInstr::const_mop_iterator MOI, MachineInstr::const_mop_iterator MOE, LocationVec &Locs, - LiveOutVec &LiveOuts) const; + LiveOutVec &LiveOuts); /// Specialized parser of statepoint operands. /// They do not directly correspond to StackMap record entries. diff --git a/llvm/lib/CodeGen/StackMaps.cpp b/llvm/lib/CodeGen/StackMaps.cpp index f9115e4348784..f45bcaf16ab70 100644 --- a/llvm/lib/CodeGen/StackMaps.cpp +++ b/llvm/lib/CodeGen/StackMaps.cpp @@ -207,7 +207,7 @@ static unsigned getDwarfRegNum(unsigned Reg, const TargetRegisterInfo *TRI) { MachineInstr::const_mop_iterator StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, MachineInstr::const_mop_iterator MOE, LocationVec &Locs, - LiveOutVec &LiveOuts) const { + LiveOutVec &LiveOuts) { const TargetRegisterInfo *TRI = AP.MF->getSubtarget().getRegisterInfo(); if (MOI->isImm()) { switch (MOI->getImm()) { @@ -238,7 +238,22 @@ StackMaps::parseOperand(MachineInstr::const_mop_iterator MOI, ++MOI; assert(MOI->isImm() && "Expected constant operand."); int64_t Imm = MOI->getImm(); - Locs.emplace_back(Location::Constant, sizeof(int64_t), 0, Imm); + if (isInt<32>(Imm)) { + Locs.emplace_back(Location::Constant, sizeof(int64_t), 0, Imm); + } else { + // ConstPool is intentionally a MapVector of 'uint64_t's (as + // opposed to 'int64_t's). We should never be in a situation + // where we have to insert either the tombstone or the empty + // keys into a map, and for a DenseMap these are + // (uint64_t)0 and (uint64_t)-1. They can be and are + // represented using 32 bit integers. + assert((uint64_t)Imm != DenseMapInfo::getEmptyKey() && + (uint64_t)Imm != DenseMapInfo::getTombstoneKey() && + "empty and tombstone keys should fit in 32 bits!"); + auto Result = ConstPool.insert(std::make_pair(Imm, Imm)); + Locs.emplace_back(Location::ConstantIndex, sizeof(int64_t), 0, + Result.first - ConstPool.begin()); + } break; } } @@ -497,27 +512,6 @@ void StackMaps::recordStackMapOpers(const MCSymbol &MILabel, while (MOI != MOE) MOI = parseOperand(MOI, MOE, Locations, LiveOuts); - // Move large constants into the constant pool. - for (auto &Loc : Locations) { - // Constants are encoded as sign-extended integers. - // -1 is directly encoded as .long 0xFFFFFFFF with no constant pool. 
- if (Loc.Type == Location::Constant && !isInt<32>(Loc.Offset)) { - Loc.Type = Location::ConstantIndex; - // ConstPool is intentionally a MapVector of 'uint64_t's (as - // opposed to 'int64_t's). We should never be in a situation - // where we have to insert either the tombstone or the empty - // keys into a map, and for a DenseMap these are - // (uint64_t)0 and (uint64_t)-1. They can be and are - // represented using 32 bit integers. - assert((uint64_t)Loc.Offset != DenseMapInfo::getEmptyKey() && - (uint64_t)Loc.Offset != - DenseMapInfo::getTombstoneKey() && - "empty and tombstone keys should fit in 32 bits!"); - auto Result = ConstPool.insert(std::make_pair(Loc.Offset, Loc.Offset)); - Loc.Offset = Result.first - ConstPool.begin(); - } - } - // Create an expression to calculate the offset of the callsite from function // entry. const MCExpr *CSOffsetExpr = MCBinaryExpr::createSub( diff --git a/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir b/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir index 0adccba88a327..30a68e6c2efd2 100644 --- a/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir +++ b/llvm/test/CodeGen/X86/statepoint-fixup-undef.mir @@ -124,12 +124,11 @@ body: | ; STACKMAP-NEXT: .byte 0 ; STACKMAP-NEXT: .short 0 ; STACKMAP-NEXT: .long 1 - ; STACKMAP-NEXT: .long 1 + ; STACKMAP-NEXT: .long 0 ; STACKMAP-NEXT: .long 1 ; STACKMAP-NEXT: .quad test_undef ; STACKMAP-NEXT: .quad 88 ; STACKMAP-NEXT: .quad 1 - ; STACKMAP-NEXT: .quad 4278124286 ; STACKMAP-NEXT: .quad 2 ; STACKMAP-NEXT: .long .Ltmp0-test_undef ; STACKMAP-NEXT: .short 0 @@ -182,13 +181,13 @@ body: | ; STACKMAP-NEXT: .short 3 ; STACKMAP-NEXT: .short 0 ; STACKMAP-NEXT: .long 0 - ; This is entry we're looking for, reference to constant pool entry 0xFEFEFEFE - ; STACKMAP-NEXT: .byte 5 + ; This is a constant 0xFEFEFEFE we are looking for + ; STACKMAP-NEXT: .byte 4 ; STACKMAP-NEXT: .byte 0 ; STACKMAP-NEXT: .short 8 ; STACKMAP-NEXT: .short 0 ; STACKMAP-NEXT: .short 0 - ; STACKMAP-NEXT: .long 0 + ; STACKMAP-NEXT: .long -16843010 ; STACKMAP-NEXT: .byte 4 ; STACKMAP-NEXT: .byte 0 ; STACKMAP-NEXT: .short 8 From 061eb62a9051ed06935135ffa3ea03eb250fd55e Mon Sep 17 00:00:00 2001 From: Sirraide <74590115+Sirraide@users.noreply.github.com> Date: Fri, 19 Jan 2024 14:29:23 +0100 Subject: [PATCH 122/843] [Clang] [NFC] Remove default argument in ASTUnit.h (#78566) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This removes a default argument that is currently broken in C++23 mode due to `std::default_delete` now being `constexpr`. This is a known problem (see #74963, #59966, #69996, and a couple more), fixing which will probably take some time, so this at least makes it possible to compile `ASTUnit.h` in C++23 mode. Note that we can’t simply include the header that provides the definition of the class causing the problem either, as that would create a circular dependency. 
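For context, a minimal sketch of the failure mode (illustrative only, not code from this patch, and whether it is diagnosed depends on the standard library): a default argument of type std::unique_ptr<T> can force the deleter to be instantiated where T is still incomplete, which the constexpr std::default_delete machinery in C++23 can turn into a hard error.

    #include <memory>

    struct ActionLike; // forward declaration only, as in ASTUnit.h;
                       // 'ActionLike' is a hypothetical stand-in name

    // Potentially ill-formed in C++23 mode: the '= nullptr' default argument
    // may require std::default_delete<ActionLike> to be instantiated here,
    // where ActionLike is incomplete.
    void run(std::unique_ptr<ActionLike> Act = nullptr);

    // The shape of the fix: drop the default and have call sites pass the
    // argument explicitly, where the complete type is available.
    void runFixed(std::unique_ptr<ActionLike> Act);

This mirrors the change below: the ASTUnit.h declaration loses '= nullptr', and the one call site in CIndexCodeCompletion.cpp now passes /*SyntaxOnlyAction=*/nullptr explicitly.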
--- clang/include/clang/Frontend/ASTUnit.h | 2 +- clang/tools/libclang/CIndexCodeCompletion.cpp | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Frontend/ASTUnit.h b/clang/include/clang/Frontend/ASTUnit.h index fe99b3d5adbfa..6af712afdcb6d 100644 --- a/clang/include/clang/Frontend/ASTUnit.h +++ b/clang/include/clang/Frontend/ASTUnit.h @@ -902,7 +902,7 @@ class ASTUnit { SourceManager &SourceMgr, FileManager &FileMgr, SmallVectorImpl &StoredDiagnostics, SmallVectorImpl &OwnedBuffers, - std::unique_ptr Act = nullptr); + std::unique_ptr Act); /// Save this translation unit to a file with the given name. /// diff --git a/clang/tools/libclang/CIndexCodeCompletion.cpp b/clang/tools/libclang/CIndexCodeCompletion.cpp index 196c64e617227..3c5f390f6d888 100644 --- a/clang/tools/libclang/CIndexCodeCompletion.cpp +++ b/clang/tools/libclang/CIndexCodeCompletion.cpp @@ -765,7 +765,8 @@ clang_codeCompleteAt_Impl(CXTranslationUnit TU, const char *complete_filename, IncludeBriefComments, Capture, CXXIdx->getPCHContainerOperations(), *Results->Diag, Results->LangOpts, *Results->SourceMgr, *Results->FileMgr, - Results->Diagnostics, Results->TemporaryBuffers); + Results->Diagnostics, Results->TemporaryBuffers, + /*SyntaxOnlyAction=*/nullptr); Results->DiagnosticsWrappers.resize(Results->Diagnostics.size()); From 42fb1fac9e72d080b6be5e4dac0689659a801638 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Fri, 19 Jan 2024 13:33:03 +0000 Subject: [PATCH 123/843] [VPlan] Use DebugLoc from recipe in VPWidenCallRecipe (NFCI). Instead of using the debug location of the underlying instruction, use the debug location from the recipe. This removes an unneeded dependency of the underlying instruction. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 6 ++++-- llvm/lib/Transforms/Vectorize/VPlan.h | 4 ++-- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 2 +- llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp | 6 +++--- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index fa1aa15fa822d..6ca93e15719fb 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -8261,7 +8261,8 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, }, Range); if (ShouldUseVectorIntrinsic) - return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID); + return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), ID, + CI->getDebugLoc()); Function *Variant = nullptr; std::optional MaskPos; @@ -8314,7 +8315,8 @@ VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, } return new VPWidenCallRecipe(*CI, make_range(Ops.begin(), Ops.end()), - Intrinsic::not_intrinsic, Variant); + Intrinsic::not_intrinsic, CI->getDebugLoc(), + Variant); } return nullptr; diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index bdbb61b88ecb2..0c6214868d847 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -1352,9 +1352,9 @@ class VPWidenCallRecipe : public VPSingleDefRecipe { public: template VPWidenCallRecipe(CallInst &I, iterator_range CallArguments, - Intrinsic::ID VectorIntrinsicID, + Intrinsic::ID VectorIntrinsicID, DebugLoc DL = {}, Function *Variant = nullptr) - : VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, &I), + : VPSingleDefRecipe(VPDef::VPWidenCallSC, CallArguments, &I, DL), 
VectorIntrinsicID(VectorIntrinsicID), Variant(Variant) {} ~VPWidenCallRecipe() override = default; diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 78e64e7ffb126..bbeb5da2cfec3 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -580,7 +580,7 @@ void VPWidenCallRecipe::execute(VPTransformState &State) { auto &CI = *cast(getUnderlyingInstr()); assert(!isa(CI) && "DbgInfoIntrinsic should have been dropped during VPlan construction"); - State.setDebugLocFrom(CI.getDebugLoc()); + State.setDebugLocFrom(getDebugLoc()); bool UseIntrinsic = VectorIntrinsicID != Intrinsic::not_intrinsic; FunctionType *VFTy = nullptr; diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 38671bafb4277..8e6b48cdb2c8a 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -74,9 +74,9 @@ void VPlanTransforms::VPInstructionsToVPRecipes( } else if (GetElementPtrInst *GEP = dyn_cast(Inst)) { NewRecipe = new VPWidenGEPRecipe(GEP, Ingredient.operands()); } else if (CallInst *CI = dyn_cast(Inst)) { - NewRecipe = - new VPWidenCallRecipe(*CI, drop_end(Ingredient.operands()), - getVectorIntrinsicIDForCall(CI, &TLI)); + NewRecipe = new VPWidenCallRecipe( + *CI, drop_end(Ingredient.operands()), + getVectorIntrinsicIDForCall(CI, &TLI), CI->getDebugLoc()); } else if (SelectInst *SI = dyn_cast(Inst)) { NewRecipe = new VPWidenSelectRecipe(*SI, Ingredient.operands()); } else if (auto *CI = dyn_cast(Inst)) { From 340054e561bc7fa407de5c591d503ad52fbffaff Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 19 Jan 2024 13:48:44 +0000 Subject: [PATCH 124/843] [AArch64][SME] Remove combination of private-ZA and preserves_za. (#78563) The new Clang attributes no longer support the combination of having a private-ZA function that preserves ZA. The use of __arm_preserves("za") means that ZA is shared and preserved. There wasn't that much benefit to the special handling of this, because in practice it only meant that we'd avoid restoring the lazy-save afterwards, but it still needed setting up a lazy-save (with the possibility of using a 0-sized buffer). Perhaps a new attribute will be added in the future to support this case, at which point we can revert back some of the changes removed in this patch. But for now removing this code simplifies things. --- .../Target/AArch64/AArch64ISelLowering.cpp | 76 ++++++++----------- .../AArch64/sme-lazy-save-call-remarks.ll | 15 +--- .../CodeGen/AArch64/sme-lazy-save-call.ll | 46 ----------- 3 files changed, 34 insertions(+), 103 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 8a6f1dc7487ba..7b9b6a7a42812 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -7609,16 +7609,6 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, bool RequiresLazySave = CallerAttrs.requiresLazySave(CalleeAttrs); if (RequiresLazySave) { - SDValue NumZaSaveSlices; - if (!CalleeAttrs.preservesZA()) { - // Set up a lazy save mechanism by storing the runtime live slices - // (worst-case SVL) to the TPIDR2 stack object. 
- NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, - DAG.getConstant(1, DL, MVT::i32)); - } else if (CalleeAttrs.preservesZA()) { - NumZaSaveSlices = DAG.getConstant(0, DL, MVT::i64); - } - unsigned TPIDR2Obj = FuncInfo->getLazySaveTPIDR2Obj(); MachinePointerInfo MPI = MachinePointerInfo::getStack(MF, TPIDR2Obj); SDValue TPIDR2ObjAddr = DAG.getFrameIndex(TPIDR2Obj, @@ -7626,6 +7616,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, SDValue NumZaSaveSlicesAddr = DAG.getNode(ISD::ADD, DL, TPIDR2ObjAddr.getValueType(), TPIDR2ObjAddr, DAG.getConstant(8, DL, TPIDR2ObjAddr.getValueType())); + SDValue NumZaSaveSlices = DAG.getNode(AArch64ISD::RDSVL, DL, MVT::i64, + DAG.getConstant(1, DL, MVT::i32)); Chain = DAG.getTruncStore(Chain, DL, NumZaSaveSlices, NumZaSaveSlicesAddr, MPI, MVT::i16); Chain = DAG.getNode( @@ -7638,14 +7630,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, CLI.CB) : OptimizationRemarkAnalysis("sme", "SMELazySaveZA", &MF.getFunction()); - DescribeCallsite(R) << " sets up a lazy save for ZA"; - if (CalleeAttrs.preservesZA()) - R << ", but callee preserves ZA, so we request 0 slices to be saved"; - else - R << ", and we request that all slices be saved"; - R << ore::setExtraArgs() - << ore::NV("CalleePreservesZA", CalleeAttrs.preservesZA()); - return R; + return DescribeCallsite(R) << " sets up a lazy save for ZA"; }); } @@ -8081,34 +8066,33 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, } if (RequiresLazySave) { - if (!CalleeAttrs.preservesZA()) { - // Unconditionally resume ZA. - Result = DAG.getNode( - AArch64ISD::SMSTART, DL, MVT::Other, Result, - DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), - DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); - - // Conditionally restore the lazy save using a pseudo node. - unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); - SDValue RegMask = DAG.getRegisterMask( - TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); - SDValue RestoreRoutine = DAG.getTargetExternalSymbol( - "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); - SDValue TPIDR2_EL0 = DAG.getNode( - ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, - DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); - - // Copy the address of the TPIDR2 block into X0 before 'calling' the - // RESTORE_ZA pseudo. - SDValue Glue; - SDValue TPIDR2Block = DAG.getFrameIndex( - FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); - Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); - Result = DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, - {Result, TPIDR2_EL0, - DAG.getRegister(AArch64::X0, MVT::i64), - RestoreRoutine, RegMask, Result.getValue(1)}); - } + // Unconditionally resume ZA. + Result = DAG.getNode( + AArch64ISD::SMSTART, DL, MVT::Other, Result, + DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); + + // Conditionally restore the lazy save using a pseudo node. 
+ unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); + SDValue RegMask = DAG.getRegisterMask( + TRI->SMEABISupportRoutinesCallPreservedMaskFromX0()); + SDValue RestoreRoutine = DAG.getTargetExternalSymbol( + "__arm_tpidr2_restore", getPointerTy(DAG.getDataLayout())); + SDValue TPIDR2_EL0 = DAG.getNode( + ISD::INTRINSIC_W_CHAIN, DL, MVT::i64, Result, + DAG.getConstant(Intrinsic::aarch64_sme_get_tpidr2, DL, MVT::i32)); + + // Copy the address of the TPIDR2 block into X0 before 'calling' the + // RESTORE_ZA pseudo. + SDValue Glue; + SDValue TPIDR2Block = DAG.getFrameIndex( + FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + Result = DAG.getCopyToReg(Result, DL, AArch64::X0, TPIDR2Block, Glue); + Result = + DAG.getNode(AArch64ISD::RESTORE_ZA, DL, MVT::Other, + {Result, TPIDR2_EL0, DAG.getRegister(AArch64::X0, MVT::i64), + RestoreRoutine, RegMask, Result.getValue(1)}); + // Finally reset the TPIDR2_EL0 register to 0. Result = DAG.getNode( ISD::INTRINSIC_VOID, DL, MVT::Other, Result, diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll index 6762a768fd5bd..d999311301f94 100644 --- a/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call-remarks.ll @@ -2,31 +2,24 @@ ; RUN: llc -mtriple=aarch64 -mattr=+sme --pass-remarks-analysis=sme -o /dev/null < %s 2>&1 | FileCheck %s declare void @private_za_callee() -declare void @private_za_preserved_callee() "aarch64_pstate_za_preserved" declare float @llvm.cos.f32(float) define void @test_lazy_save_1_callee() nounwind "aarch64_pstate_za_shared" { -; CHECK: remark: :0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved +; CHECK: remark: :0:0: call from 'test_lazy_save_1_callee' to 'private_za_callee' sets up a lazy save for ZA call void @private_za_callee() ret void } define void @test_lazy_save_2_callees() nounwind "aarch64_pstate_za_shared" { -; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved +; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA call void @private_za_callee() -; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA, and we request that all slices be saved +; CHECK: remark: :0:0: call from 'test_lazy_save_2_callees' to 'private_za_callee' sets up a lazy save for ZA call void @private_za_callee() ret void } -define void @test_lazy_save_preserved_callee() nounwind "aarch64_pstate_za_shared" { -; CHECK: remark: :0:0: call from 'test_lazy_save_preserved_callee' to 'private_za_preserved_callee' sets up a lazy save for ZA, but callee preserves ZA, so we request 0 slices to be saved - call void @private_za_preserved_callee() - ret void -} - define float @test_lazy_save_expanded_intrinsic(float %a) nounwind "aarch64_pstate_za_shared" { -; CHECK: remark: :0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA, and we request that all slices be saved +; CHECK: remark: :0:0: call from 'test_lazy_save_expanded_intrinsic' to 'cosf' sets up a lazy save for ZA %res = call float @llvm.cos.f32(float %a) ret float %res } diff --git a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll index a173fd40feb28..9625e139bd0bc 100644 --- 
a/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll +++ b/llvm/test/CodeGen/AArch64/sme-lazy-save-call.ll @@ -2,7 +2,6 @@ ; RUN: llc -mtriple=aarch64 -mattr=+sme < %s | FileCheck %s declare void @private_za_callee() -declare void @private_za_preserved_callee() "aarch64_pstate_za_preserved" declare float @llvm.cos.f32(float) ; Test lazy-save mechanism for a single callee. @@ -170,48 +169,3 @@ define void @test_lazy_save_and_conditional_smstart() nounwind "aarch64_pstate_z call void @private_za_callee() ret void } - - -; Test lazy-save mechanism for an aarch64_pstate_za_shared caller -; calling a callee with aarch64_pstate_za_preserved. -define void @za_shared_caller_za_preserved_callee() nounwind "aarch64_pstate_za_shared" "aarch64_pstate_sm_compatible" { -; CHECK-LABEL: za_shared_caller_za_preserved_callee: -; CHECK: // %bb.0: -; CHECK-NEXT: stp d15, d14, [sp, #-96]! // 16-byte Folded Spill -; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill -; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill -; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill -; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK-NEXT: add x29, sp, #64 -; CHECK-NEXT: str x19, [sp, #80] // 8-byte Folded Spill -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: rdsvl x8, #1 -; CHECK-NEXT: mov x9, sp -; CHECK-NEXT: msub x8, x8, x8, x9 -; CHECK-NEXT: mov sp, x8 -; CHECK-NEXT: sub x9, x29, #80 -; CHECK-NEXT: stp x8, xzr, [x29, #-80] -; CHECK-NEXT: msr TPIDR2_EL0, x9 -; CHECK-NEXT: bl __arm_sme_state -; CHECK-NEXT: and x19, x0, #0x1 -; CHECK-NEXT: tbz w19, #0, .LBB4_2 -; CHECK-NEXT: // %bb.1: -; CHECK-NEXT: smstop sm -; CHECK-NEXT: .LBB4_2: -; CHECK-NEXT: bl private_za_preserved_callee -; CHECK-NEXT: tbz w19, #0, .LBB4_4 -; CHECK-NEXT: // %bb.3: -; CHECK-NEXT: smstart sm -; CHECK-NEXT: .LBB4_4: -; CHECK-NEXT: msr TPIDR2_EL0, xzr -; CHECK-NEXT: sub sp, x29, #64 -; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK-NEXT: ldr x19, [sp, #80] // 8-byte Folded Reload -; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload -; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload -; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload -; CHECK-NEXT: ldp d15, d14, [sp], #96 // 16-byte Folded Reload -; CHECK-NEXT: ret - call void @private_za_preserved_callee() - ret void -} From 7017efa1a1780e1bb0c2a49a14ac45818ea43338 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 17 Jan 2024 15:59:24 +0000 Subject: [PATCH 125/843] Fix typo "widended" --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 4 ++-- llvm/lib/Transforms/Utils/SimplifyIndVar.cpp | 2 +- .../Transforms/LoopVectorize/optimal-epilog-vectorization.ll | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 315604498a82f..929b5d004782d 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -14386,8 +14386,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, case ISD::FP_ROUND: return performFPRoundCombine(N, DCI); case ISD::LOAD: { - if (SDValue Widended = widenLoad(cast(N), DCI)) - return Widended; + if (SDValue Widened = widenLoad(cast(N), DCI)) + return Widened; [[fallthrough]]; } default: { diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp index 42e7c4006b427..0ed3324a27b6c 100644 --- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp @@ 
-1753,7 +1753,7 @@ Instruction *WidenIV::widenIVUse(WidenIV::NarrowIVDefUse DU, SCEVExpander &Rewri } // This narrow use can be widened by a sext if it's non-negative or its narrow - // def was widended by a sext. Same for zext. + // def was widened by a sext. Same for zext. auto canWidenBySExt = [&]() { return DU.NeverNegative || getExtendKind(DU.NarrowDef) == ExtendKind::Sign; }; diff --git a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll index 5410d5d2f6471..8888bf529909e 100644 --- a/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/optimal-epilog-vectorization.ll @@ -662,7 +662,7 @@ outer.latch: br label %outer.header } -; Check handling of widended/truncated inductions. +; Check handling of widened/truncated inductions. define void @f4(ptr noalias %A, i32 signext %n) { ; CHECK-LABEL: @f4( ; CHECK-NEXT: iter.check: From ea9d75aa2ad64330ffc030b7ce0fcc16b55cf3bb Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Jan 2024 10:43:08 +0000 Subject: [PATCH 126/843] [AMDGPU] Misc formatting fixes. NFC. --- llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 1 + llvm/lib/Target/AMDGPU/DSInstructions.td | 2 +- llvm/lib/Target/AMDGPU/FLATInstructions.td | 8 ++++---- llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 -- llvm/lib/Target/AMDGPU/VOP1Instructions.td | 1 - 6 files changed, 7 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index d1470538d7033..4c35649cec6c8 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -3193,7 +3193,7 @@ bool AMDGPUDAGToDAGISel::isVGPRImm(const SDNode * N) const { return !AllUsesAcceptSReg && (Limit < 10); } -bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const { +bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode *N) const { auto Ld = cast(N); const MachineMemOperand *MMO = Ld->getMemOperand(); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index c94367cbfe8bb..fdee74d58d269 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -5422,6 +5422,7 @@ bool AMDGPUInstructionSelector::selectNamedBarrierInst( I.eraseFromParent(); return true; } + bool AMDGPUInstructionSelector::selectSBarrierLeave(MachineInstr &I) const { MachineBasicBlock *BB = I.getParent(); const DebugLoc &DL = I.getDebugLoc(); diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td index c5ebf4c6716a3..d09e1ef3bcb27 100644 --- a/llvm/lib/Target/AMDGPU/DSInstructions.td +++ b/llvm/lib/Target/AMDGPU/DSInstructions.td @@ -1267,7 +1267,7 @@ let AssemblerPredicate = isGFX11Only, DecoderNamespace = "GFX11" in { multiclass DS_Real_gfx11 op> { def _gfx11 : Base_DS_Real_gfx6_gfx7_gfx10_gfx11_gfx12(NAME), - SIEncodingFamily.GFX11>; + SIEncodingFamily.GFX11>; } multiclass DS_Real_Renamed_gfx11 op, DS_Pseudo backing_pseudo, string real_name> { diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td index 688a440e2c396..cb830b128df8e 100644 --- a/llvm/lib/Target/AMDGPU/FLATInstructions.td +++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td @@ -2690,8 +2690,8 @@ defm FLAT_ATOMIC_COND_SUB_U32 : VFLAT_Real_Atomics_gfx12<0x050, 
"FLAT_ATOMI defm FLAT_ATOMIC_MIN_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x051, "FLAT_ATOMIC_FMIN", "flat_atomic_min_num_f32", true, "flat_atomic_min_f32">; defm FLAT_ATOMIC_MAX_NUM_F32 : VFLAT_Real_Atomics_gfx12<0x052, "FLAT_ATOMIC_FMAX", "flat_atomic_max_num_f32", true, "flat_atomic_max_f32">; defm FLAT_ATOMIC_ADD_F32 : VFLAT_Real_Atomics_gfx12<0x056>; -defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059, "FLAT_ATOMIC_PK_ADD_F16", "flat_atomic_pk_add_f16">; -defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a, "FLAT_ATOMIC_PK_ADD_BF16", "flat_atomic_pk_add_bf16">; +defm FLAT_ATOMIC_PK_ADD_F16 : VFLAT_Real_Atomics_gfx12<0x059>; +defm FLAT_ATOMIC_PK_ADD_BF16 : VFLAT_Real_Atomics_gfx12<0x05a>; // ENC_VGLOBAL. defm GLOBAL_LOAD_U8 : VGLOBAL_Real_AllAddr_gfx12<0x010, "GLOBAL_LOAD_UBYTE", "global_load_u8", true>; @@ -2762,8 +2762,8 @@ let WaveSizePredicate = isWave64, DecoderNamespace = "GFX12W64" in { } defm GLOBAL_ATOMIC_ORDERED_ADD_B64 : VGLOBAL_Real_Atomics_gfx12<0x073>; -defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059, "GLOBAL_ATOMIC_PK_ADD_F16", "global_atomic_pk_add_f16">; -defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a, "GLOBAL_ATOMIC_PK_ADD_BF16", "global_atomic_pk_add_bf16">; +defm GLOBAL_ATOMIC_PK_ADD_F16 : VGLOBAL_Real_Atomics_gfx12<0x059>; +defm GLOBAL_ATOMIC_PK_ADD_BF16 : VGLOBAL_Real_Atomics_gfx12<0x05a>; defm GLOBAL_INV : VFLAT_Real_Base_gfx12<0x02b>; defm GLOBAL_WB : VFLAT_Real_Base_gfx12<0x02c>; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 414039f3b2a7f..eae4800ade0dc 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -505,8 +505,6 @@ def S_WAKEUP_BARRIER_IMM : SOP1_Pseudo <"s_wakeup_barrier", (outs), (ins SplitBarrier:$src0), "$src0", []>{ let SchedRW = [WriteBarrier]; let isConvergent = 1; - - } } // End has_sdst = 0 diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index d9cb656c5fc0d..95a1d86963473 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -705,7 +705,6 @@ class VOP1_DPP16_Gen op, VOP1_DPP_Pseudo ps, GFXGen Gen, VOPProfile p = let DecoderNamespace = "DPP"#Gen.DecoderNamespace; } - class VOP1_DPP8 op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> : VOP_DPP8 { let hasSideEffects = ps.hasSideEffects; From 6f371149c1c9f24bede8a799b7c2c9562740aa62 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 19 Jan 2024 14:55:31 +0100 Subject: [PATCH 127/843] [AsmParser] Don't require value numbers to be consecutive (#78171) Currently, the IR parser requires that %n style numbered values are consecutive. This means that the IR becomes invalid as soon as you remove an instruction, argument or block. This makes it very annoying to modify IR without running it through instnamer first. I don't think there is any good reason to impose this requirement. This PR relaxes it to allow value IDs to be non-consecutive, but it still keeps the requirement that they're increasing (i.e. you can't skip a value number and then assign it later). This only implements support for skipping numbers for local values. We should extend this to global values in the future as well. 
--- llvm/docs/LangRef.rst | 12 +- llvm/include/llvm/AsmParser/LLParser.h | 32 +++++- llvm/lib/AsmParser/LLParser.cpp | 106 ++++++++++++------ .../call-nonzero-program-addrspace-2.ll | 2 +- llvm/test/Assembler/invalid-arg-num-1.ll | 6 - llvm/test/Assembler/invalid-arg-num-2.ll | 6 - llvm/test/Assembler/invalid-arg-num-3.ll | 6 - .../test/Assembler/invalid-block-label-num.ll | 7 -- .../Assembler/skip-value-numbers-invalid.ll | 31 +++++ llvm/test/Assembler/skip-value-numbers.ll | 79 +++++++++++++ 10 files changed, 214 insertions(+), 73 deletions(-) delete mode 100644 llvm/test/Assembler/invalid-arg-num-1.ll delete mode 100644 llvm/test/Assembler/invalid-arg-num-2.ll delete mode 100644 llvm/test/Assembler/invalid-arg-num-3.ll delete mode 100644 llvm/test/Assembler/invalid-block-label-num.ll create mode 100644 llvm/test/Assembler/skip-value-numbers-invalid.ll create mode 100644 llvm/test/Assembler/skip-value-numbers.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 27429ad1f43c9..bc9cfb557c540 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -128,11 +128,13 @@ lexical features of LLVM: #. Comments are delimited with a '``;``' and go until the end of line. #. Unnamed temporaries are created when the result of a computation is not assigned to a named value. -#. Unnamed temporaries are numbered sequentially (using a per-function - incrementing counter, starting with 0). Note that basic blocks and unnamed - function parameters are included in this numbering. For example, if the - entry basic block is not given a label name and all function parameters are - named, then it will get number 0. +#. By default, unnamed temporaries are numbered sequentially (using a + per-function incrementing counter, starting with 0). However, when explicitly + specifying temporary numbers, it is allowed to skip over numbers. + + Note that basic blocks and unnamed function parameters are included in this + numbering. For example, if the entry basic block is not given a label name + and all function parameters are named, then it will get number 0. It also shows a convention that we follow in this document. When demonstrating instructions, we will follow an instruction with a comment diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h index 54bc3e582e01a..b0d02eac8e672 100644 --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -203,6 +203,9 @@ namespace llvm { bool error(LocTy L, const Twine &Msg) const { return Lex.Error(L, Msg); } bool tokError(const Twine &Msg) const { return error(Lex.getLoc(), Msg); } + bool checkValueID(LocTy L, StringRef Kind, StringRef Prefix, + unsigned NextID, unsigned ID) const; + /// Restore the internal name and slot mappings using the mappings that /// were created at an earlier parsing stage. void restoreParsingState(const SlotMapping *Slots); @@ -448,19 +451,35 @@ namespace llvm { bool parseFunctionType(Type *&Result); bool parseTargetExtType(Type *&Result); + class NumberedValues { + DenseMap Vals; + unsigned NextUnusedID = 0; + + public: + unsigned getNext() const { return NextUnusedID; } + Value *get(unsigned ID) const { return Vals.lookup(ID); } + void add(unsigned ID, Value *V) { + assert(ID >= NextUnusedID && "Invalid value ID"); + Vals.insert({ID, V}); + NextUnusedID = ID + 1; + } + }; + // Function Semantic Analysis. 
class PerFunctionState { LLParser &P; Function &F; std::map > ForwardRefVals; std::map > ForwardRefValIDs; - std::vector NumberedVals; + NumberedValues NumberedVals; /// FunctionNumber - If this is an unnamed function, this is the slot /// number of it, otherwise it is -1. int FunctionNumber; + public: - PerFunctionState(LLParser &p, Function &f, int functionNumber); + PerFunctionState(LLParser &p, Function &f, int functionNumber, + ArrayRef UnnamedArgNums); ~PerFunctionState(); Function &getFunction() const { return F; } @@ -590,9 +609,12 @@ namespace llvm { ArgInfo(LocTy L, Type *ty, AttributeSet Attr, const std::string &N) : Loc(L), Ty(ty), Attrs(Attr), Name(N) {} }; - bool parseArgumentList(SmallVectorImpl &ArgList, bool &IsVarArg); - bool parseFunctionHeader(Function *&Fn, bool IsDefine); - bool parseFunctionBody(Function &Fn); + bool parseArgumentList(SmallVectorImpl &ArgList, + SmallVectorImpl &UnnamedArgNums, + bool &IsVarArg); + bool parseFunctionHeader(Function *&Fn, bool IsDefine, + SmallVectorImpl &UnnamedArgNums); + bool parseFunctionBody(Function &Fn, ArrayRef UnnamedArgNums); bool parseBasicBlock(PerFunctionState &PFS); enum TailCallType { TCT_None, TCT_Tail, TCT_MustTail }; diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 7127791fbf2fc..2e859c42d5575 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -600,7 +600,8 @@ bool LLParser::parseDeclare() { } Function *F; - if (parseFunctionHeader(F, false)) + SmallVector UnnamedArgNums; + if (parseFunctionHeader(F, false, UnnamedArgNums)) return true; for (auto &MD : MDs) F->addMetadata(MD.first, *MD.second); @@ -614,8 +615,10 @@ bool LLParser::parseDefine() { Lex.Lex(); Function *F; - return parseFunctionHeader(F, true) || parseOptionalFunctionMetadata(*F) || - parseFunctionBody(*F); + SmallVector UnnamedArgNums; + return parseFunctionHeader(F, true, UnnamedArgNums) || + parseOptionalFunctionMetadata(*F) || + parseFunctionBody(*F, UnnamedArgNums); } /// parseGlobalType @@ -2953,6 +2956,15 @@ bool LLParser::parseOptionalOperandBundles( return false; } +bool LLParser::checkValueID(LocTy Loc, StringRef Kind, StringRef Prefix, + unsigned NextID, unsigned ID) const { + if (ID < NextID) + return error(Loc, Kind + " expected to be numbered '" + Prefix + + Twine(NextID) + "' or greater"); + + return false; +} + /// parseArgumentList - parse the argument list for a function type or function /// prototype. 
/// ::= '(' ArgTypeListI ')' @@ -2963,6 +2975,7 @@ bool LLParser::parseOptionalOperandBundles( /// ::= ArgType (',' ArgType)* /// bool LLParser::parseArgumentList(SmallVectorImpl &ArgList, + SmallVectorImpl &UnnamedArgNums, bool &IsVarArg) { unsigned CurValID = 0; IsVarArg = false; @@ -2989,12 +3002,19 @@ bool LLParser::parseArgumentList(SmallVectorImpl &ArgList, if (Lex.getKind() == lltok::LocalVar) { Name = Lex.getStrVal(); Lex.Lex(); - } else if (Lex.getKind() == lltok::LocalVarID) { - if (Lex.getUIntVal() != CurValID) - return error(TypeLoc, "argument expected to be numbered '%" + - Twine(CurValID) + "'"); - ++CurValID; - Lex.Lex(); + } else { + unsigned ArgID; + if (Lex.getKind() == lltok::LocalVarID) { + ArgID = Lex.getUIntVal(); + if (checkValueID(TypeLoc, "argument", "%", CurValID, ArgID)) + return true; + Lex.Lex(); + } else { + ArgID = CurValID; + } + + UnnamedArgNums.push_back(ArgID); + CurValID = ArgID + 1; } if (!FunctionType::isValidArgumentType(ArgTy)) @@ -3023,13 +3043,17 @@ bool LLParser::parseArgumentList(SmallVectorImpl &ArgList, Name = Lex.getStrVal(); Lex.Lex(); } else { + unsigned ArgID; if (Lex.getKind() == lltok::LocalVarID) { - if (Lex.getUIntVal() != CurValID) - return error(TypeLoc, "argument expected to be numbered '%" + - Twine(CurValID) + "'"); + ArgID = Lex.getUIntVal(); + if (checkValueID(TypeLoc, "argument", "%", CurValID, ArgID)) + return true; Lex.Lex(); + } else { + ArgID = CurValID; } - ++CurValID; + UnnamedArgNums.push_back(ArgID); + CurValID = ArgID + 1; Name = ""; } @@ -3055,7 +3079,8 @@ bool LLParser::parseFunctionType(Type *&Result) { SmallVector ArgList; bool IsVarArg; - if (parseArgumentList(ArgList, IsVarArg)) + SmallVector UnnamedArgNums; + if (parseArgumentList(ArgList, UnnamedArgNums, IsVarArg)) return true; // Reject names on the arguments lists. @@ -3291,13 +3316,18 @@ bool LLParser::parseTargetExtType(Type *&Result) { //===----------------------------------------------------------------------===// LLParser::PerFunctionState::PerFunctionState(LLParser &p, Function &f, - int functionNumber) + int functionNumber, + ArrayRef UnnamedArgNums) : P(p), F(f), FunctionNumber(functionNumber) { // Insert unnamed arguments into the NumberedVals list. - for (Argument &A : F.args()) - if (!A.hasName()) - NumberedVals.push_back(&A); + auto It = UnnamedArgNums.begin(); + for (Argument &A : F.args()) { + if (!A.hasName()) { + unsigned ArgNum = *It++; + NumberedVals.add(ArgNum, &A); + } + } } LLParser::PerFunctionState::~PerFunctionState() { @@ -3378,7 +3408,7 @@ Value *LLParser::PerFunctionState::getVal(const std::string &Name, Type *Ty, Value *LLParser::PerFunctionState::getVal(unsigned ID, Type *Ty, LocTy Loc) { // Look this name up in the normal function symbol table. - Value *Val = ID < NumberedVals.size() ? NumberedVals[ID] : nullptr; + Value *Val = NumberedVals.get(ID); // If this is a forward reference for the value, see if we already created a // forward ref record. @@ -3426,11 +3456,11 @@ bool LLParser::PerFunctionState::setInstName(int NameID, if (NameStr.empty()) { // If neither a name nor an ID was specified, just use the next ID. 
if (NameID == -1) - NameID = NumberedVals.size(); + NameID = NumberedVals.getNext(); - if (unsigned(NameID) != NumberedVals.size()) - return P.error(NameLoc, "instruction expected to be numbered '%" + - Twine(NumberedVals.size()) + "'"); + if (P.checkValueID(NameLoc, "instruction", "%", NumberedVals.getNext(), + NameID)) + return true; auto FI = ForwardRefValIDs.find(NameID); if (FI != ForwardRefValIDs.end()) { @@ -3445,7 +3475,7 @@ bool LLParser::PerFunctionState::setInstName(int NameID, ForwardRefValIDs.erase(FI); } - NumberedVals.push_back(Inst); + NumberedVals.add(NameID, Inst); return false; } @@ -3492,15 +3522,15 @@ BasicBlock *LLParser::PerFunctionState::defineBB(const std::string &Name, int NameID, LocTy Loc) { BasicBlock *BB; if (Name.empty()) { - if (NameID != -1 && unsigned(NameID) != NumberedVals.size()) { - P.error(Loc, "label expected to be numbered '" + - Twine(NumberedVals.size()) + "'"); - return nullptr; + if (NameID != -1) { + if (P.checkValueID(Loc, "label", "", NumberedVals.getNext(), NameID)) + return nullptr; + } else { + NameID = NumberedVals.getNext(); } - BB = getBB(NumberedVals.size(), Loc); + BB = getBB(NameID, Loc); if (!BB) { - P.error(Loc, "unable to create block numbered '" + - Twine(NumberedVals.size()) + "'"); + P.error(Loc, "unable to create block numbered '" + Twine(NameID) + "'"); return nullptr; } } else { @@ -3517,8 +3547,8 @@ BasicBlock *LLParser::PerFunctionState::defineBB(const std::string &Name, // Remove the block from forward ref sets. if (Name.empty()) { - ForwardRefValIDs.erase(NumberedVals.size()); - NumberedVals.push_back(BB); + ForwardRefValIDs.erase(NameID); + NumberedVals.add(NameID, BB); } else { // BB forward references are already in the function symbol table. ForwardRefVals.erase(Name); @@ -5962,7 +5992,8 @@ bool LLParser::parseTypeAndBasicBlock(BasicBlock *&BB, LocTy &Loc, /// OptionalCallingConv OptRetAttrs OptUnnamedAddr Type GlobalName /// '(' ArgList ')' OptAddrSpace OptFuncAttrs OptSection OptionalAlign /// OptGC OptionalPrefix OptionalPrologue OptPersonalityFn -bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) { +bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine, + SmallVectorImpl &UnnamedArgNums) { // parse the linkage. LocTy LinkageLoc = Lex.getLoc(); unsigned Linkage; @@ -6050,7 +6081,7 @@ bool LLParser::parseFunctionHeader(Function *&Fn, bool IsDefine) { Constant *PersonalityFn = nullptr; Comdat *C; - if (parseArgumentList(ArgList, IsVarArg) || + if (parseArgumentList(ArgList, UnnamedArgNums, IsVarArg) || parseOptionalUnnamedAddr(UnnamedAddr) || parseOptionalProgramAddrSpace(AddrSpace) || parseFnAttributeValuePairs(FuncAttrs, FwdRefAttrGrps, false, @@ -6245,7 +6276,8 @@ bool LLParser::PerFunctionState::resolveForwardRefBlockAddresses() { /// parseFunctionBody /// ::= '{' BasicBlock+ UseListOrderDirective* '}' -bool LLParser::parseFunctionBody(Function &Fn) { +bool LLParser::parseFunctionBody(Function &Fn, + ArrayRef UnnamedArgNums) { if (Lex.getKind() != lltok::lbrace) return tokError("expected '{' in function body"); Lex.Lex(); // eat the {. @@ -6253,7 +6285,7 @@ bool LLParser::parseFunctionBody(Function &Fn) { int FunctionNumber = -1; if (!Fn.hasName()) FunctionNumber = NumberedVals.size()-1; - PerFunctionState PFS(*this, Fn, FunctionNumber); + PerFunctionState PFS(*this, Fn, FunctionNumber, UnnamedArgNums); // Resolve block addresses and allow basic blocks to be forward-declared // within this function. 
diff --git a/llvm/test/Assembler/call-nonzero-program-addrspace-2.ll b/llvm/test/Assembler/call-nonzero-program-addrspace-2.ll index f5c5437d8221e..bc600d56db51b 100644 --- a/llvm/test/Assembler/call-nonzero-program-addrspace-2.ll +++ b/llvm/test/Assembler/call-nonzero-program-addrspace-2.ll @@ -3,7 +3,7 @@ ; Check that numbered variables in a nonzero program address space 200 can be used in a call instruction -define i8 @test_unnamed(ptr, ptr addrspace(42) %0) { +define i8 @test_unnamed(ptr, ptr addrspace(42) %1) { ; Calls with explicit address spaces are fine: call addrspace(0) i8 %0(i32 0) call addrspace(42) i8 %1(i32 0) diff --git a/llvm/test/Assembler/invalid-arg-num-1.ll b/llvm/test/Assembler/invalid-arg-num-1.ll deleted file mode 100644 index ee13c339dec44..0000000000000 --- a/llvm/test/Assembler/invalid-arg-num-1.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s - -; CHECK: error: argument expected to be numbered '%1' -define void @foo(i32 %0, i32 %5) { - ret void -} diff --git a/llvm/test/Assembler/invalid-arg-num-2.ll b/llvm/test/Assembler/invalid-arg-num-2.ll deleted file mode 100644 index 6ecb0033512d0..0000000000000 --- a/llvm/test/Assembler/invalid-arg-num-2.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s - -; CHECK: error: argument expected to be numbered '%1' -define void @foo(i8 %0, i32 %named, i32 %2) { - ret void -} diff --git a/llvm/test/Assembler/invalid-arg-num-3.ll b/llvm/test/Assembler/invalid-arg-num-3.ll deleted file mode 100644 index f1638365630be..0000000000000 --- a/llvm/test/Assembler/invalid-arg-num-3.ll +++ /dev/null @@ -1,6 +0,0 @@ -; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s - -; CHECK: error: argument expected to be numbered '%0' -define void @foo(i8 %1) { - ret void -} diff --git a/llvm/test/Assembler/invalid-block-label-num.ll b/llvm/test/Assembler/invalid-block-label-num.ll deleted file mode 100644 index 4f02300849e68..0000000000000 --- a/llvm/test/Assembler/invalid-block-label-num.ll +++ /dev/null @@ -1,7 +0,0 @@ -; RUN: not llvm-as < %s 2>&1 | FileCheck %s - -define void @f () { -1: -; CHECK: error: label expected to be numbered '0' - ret void -} diff --git a/llvm/test/Assembler/skip-value-numbers-invalid.ll b/llvm/test/Assembler/skip-value-numbers-invalid.ll new file mode 100644 index 0000000000000..67e6b10196bc8 --- /dev/null +++ b/llvm/test/Assembler/skip-value-numbers-invalid.ll @@ -0,0 +1,31 @@ +; RUN: split-file %s %t +; RUN: not llvm-as < %s %t/instr_smaller_id.ll 2>&1 | FileCheck %s --check-prefix=INSTR-SMALLER-ID +; RUN: not llvm-as < %s %t/arg_smaller_id.ll 2>&1 | FileCheck %s --check-prefix=ARG-SMALLER-ID +; RUN: not llvm-as < %s %t/block_smaller_id.ll 2>&1 | FileCheck %s --check-prefix=BLOCK-SMALLER-ID + +;--- instr_smaller_id.ll + +; INSTR-SMALLER-ID: error: instruction expected to be numbered '%11' or greater +define i32 @test() { + %10 = add i32 1, 2 + %5 = add i32 %10, 3 + ret i32 %5 +} + +;--- arg_smaller_id.ll + +; ARG-SMALLER-ID: error: argument expected to be numbered '%11' or greater +define i32 @test(i32 %10, i32 %5) { + ret i32 %5 +} + +;--- block_smaller_id.ll + +; BLOCK-SMALLER-ID: error: label expected to be numbered '11' or greater +define i32 @test() { +10: + br label %5 + +5: + ret i32 0 +} diff --git a/llvm/test/Assembler/skip-value-numbers.ll b/llvm/test/Assembler/skip-value-numbers.ll new file mode 100644 index 0000000000000..d74d8d502709b --- /dev/null +++ b/llvm/test/Assembler/skip-value-numbers.ll @@ -0,0 +1,79 @@ +; 
NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: -p --version 4 +; RUN: opt -S < %s | FileCheck %s + +define i32 @instr() { +; CHECK-LABEL: define i32 @instr() { +; CHECK-NEXT: %1 = add i32 1, 2 +; CHECK-NEXT: %2 = add i32 %1, 3 +; CHECK-NEXT: %3 = add i32 %2, 4 +; CHECK-NEXT: ret i32 %3 +; + %10 = add i32 1, 2 + %20 = add i32 %10, 3 + %30 = add i32 %20, 4 + ret i32 %30 +} + +define i32 @instr_some_implicit() { +; CHECK-LABEL: define i32 @instr_some_implicit() { +; CHECK-NEXT: %1 = add i32 1, 2 +; CHECK-NEXT: %2 = add i32 3, 4 +; CHECK-NEXT: %3 = add i32 %1, %2 +; CHECK-NEXT: ret i32 %3 +; + add i32 1, 2 + %10 = add i32 3, 4 + add i32 %1, %10 + ret i32 %11 +} + +define i32 @args(i32 %0, i32 %10, i32 %20) { +; CHECK-LABEL: define i32 @args(i32 %0, i32 %1, i32 %2) { +; CHECK-NEXT: %4 = add i32 %0, %1 +; CHECK-NEXT: %5 = add i32 %4, %2 +; CHECK-NEXT: ret i32 %5 +; + %30 = add i32 %0, %10 + %31 = add i32 %30, %20 + ret i32 %31 +} + +define i32 @args_some_implicit(i32, i32 %10, i32) { +; CHECK-LABEL: define i32 @args_some_implicit(i32 %0, i32 %1, i32 %2) { +; CHECK-NEXT: %4 = add i32 %0, %1 +; CHECK-NEXT: %5 = add i32 %2, %4 +; CHECK-NEXT: ret i32 %5 +; + add i32 %0, %10 + %20 = add i32 %11, %13 + ret i32 %20 +} + +define i32 @blocks() { +; CHECK-LABEL: define i32 @blocks() { +; CHECK-NEXT: br label %1 +; CHECK: 1: +; CHECK-NEXT: ret i32 0 +; +10: + br label %20 + +20: + ret i32 0 +} + +define i32 @blocks_some_implicit() { +; CHECK-LABEL: define i32 @blocks_some_implicit() { +; CHECK-NEXT: br label %1 +; CHECK: 1: +; CHECK-NEXT: br label %2 +; CHECK: 2: +; CHECK-NEXT: ret i32 0 +; + br label %10 + +10: + br label %11 + + ret i32 0 +} From 9ff4be640fb1b3a64a8bc73020d67816f1c09ea0 Mon Sep 17 00:00:00 2001 From: Alexey Lapshin Date: Fri, 19 Jan 2024 16:57:09 +0300 Subject: [PATCH 128/843] [DWARFLinker][NFC] Decrease DWARFLinker dependence on DwarfStreamer. (#77932) This patch is extracted from #74725. The DwarfStreamer interface looks overcomplicated and has unnecessary dependencies. This patch avoids creation of DwarfStreamer by DWARFLinker and simplifies interface. 
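The net effect on clients is that streamer creation moves out of the linker. A rough sketch of the resulting call pattern (the real call sites are the dsymutil and llvm-dwarfutil changes below; the surrounding setup is elided):

    // The caller now constructs the streamer itself...
    Expected<std::unique_ptr<classic::DwarfStreamer>> Streamer =
        classic::DwarfStreamer::createStreamer(TheTriple, FileType, OutFile,
                                               /*Translator=*/nullptr,
                                               WarningHandler);
    if (!Streamer)
      return Streamer.takeError();

    // ...keeps ownership of it, and hands the linker only the interface.
    Linker->setOutputDWARFEmitter(Streamer->get());

Previously DWARFLinker::createEmitter() created and owned this object internally; after this patch the classic linker holds only a borrowed DwarfEmitter*, which is also what lets the parallel linker drop the emitter entirely in favor of the setOutputDWARFHandler section callback.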
--- .../llvm/DWARFLinker/Classic/DWARFLinker.h | 24 +--- .../llvm/DWARFLinker/Classic/DWARFStreamer.h | 18 ++- .../llvm/DWARFLinker/DWARFLinkerBase.h | 47 +++++++ .../llvm/DWARFLinker/Parallel/DWARFLinker.h | 59 +++++---- llvm/lib/DWARFLinker/CMakeLists.txt | 1 + llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp | 33 ++--- .../lib/DWARFLinker/Classic/DWARFStreamer.cpp | 90 ++++++++++--- llvm/lib/DWARFLinker/DWARFLinkerBase.cpp | 64 +++++++++ .../DWARFLinker/Parallel/DWARFEmitterImpl.cpp | 66 ---------- .../DWARFLinker/Parallel/DWARFEmitterImpl.h | 23 +--- .../Parallel/DWARFLinkerCompileUnit.cpp | 14 +- .../Parallel/DWARFLinkerCompileUnit.h | 7 +- .../Parallel/DWARFLinkerGlobalData.h | 29 +++- .../DWARFLinker/Parallel/DWARFLinkerImpl.cpp | 92 +++++-------- .../DWARFLinker/Parallel/DWARFLinkerImpl.h | 23 ++-- .../Parallel/DWARFLinkerTypeUnit.cpp | 12 +- .../Parallel/DWARFLinkerTypeUnit.h | 2 +- .../DWARFLinker/Parallel/OutputSections.cpp | 53 -------- .../lib/DWARFLinker/Parallel/OutputSections.h | 124 ++++++------------ llvm/tools/dsymutil/DwarfLinkerForBinary.cpp | 36 +++-- llvm/tools/dsymutil/DwarfLinkerForBinary.h | 4 +- llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp | 25 +++- 22 files changed, 421 insertions(+), 425 deletions(-) create mode 100644 llvm/lib/DWARFLinker/DWARFLinkerBase.cpp diff --git a/llvm/include/llvm/DWARFLinker/Classic/DWARFLinker.h b/llvm/include/llvm/DWARFLinker/Classic/DWARFLinker.h index 46d3f3148818b..b1d3f03394f5e 100644 --- a/llvm/include/llvm/DWARFLinker/Classic/DWARFLinker.h +++ b/llvm/include/llvm/DWARFLinker/Classic/DWARFLinker.h @@ -42,7 +42,8 @@ class DwarfEmitter { virtual ~DwarfEmitter() = default; /// Emit section named SecName with data SecData. - virtual void emitSectionContents(StringRef SecData, StringRef SecName) = 0; + virtual void emitSectionContents(StringRef SecData, + DebugSectionKind SecKind) = 0; /// Emit the abbreviation table \p Abbrevs to the .debug_abbrev section. virtual void @@ -188,17 +189,6 @@ class DwarfEmitter { /// Dump the file to the disk. virtual void finish() = 0; - - /// Emit the swift_ast section stored in \p Buffer. - virtual void emitSwiftAST(StringRef Buffer) = 0; - - /// Emit the swift reflection section stored in \p Buffer. - virtual void emitSwiftReflectionSection( - llvm::binaryformat::Swift5ReflectionSectionKind ReflSectionKind, - StringRef Buffer, uint32_t Alignment, uint32_t Size) = 0; - - /// Returns underlying AsmPrinter. - virtual AsmPrinter &getAsmPrinter() const = 0; }; class DwarfStreamer; @@ -232,10 +222,10 @@ class DWARFLinker : public DWARFLinkerBase { StringsTranslator); } - Error createEmitter(const Triple &TheTriple, OutputFileType FileType, - raw_pwrite_stream &OutFile); - - DwarfEmitter *getEmitter(); + /// Set output DWARF emitter. + void setOutputDWARFEmitter(DwarfEmitter *Emitter) { + TheDwarfEmitter = Emitter; + } /// Add object file to be linked. Pre-load compile unit die. Call /// \p OnCUDieLoaded for each compile unit die. If specified \p File @@ -762,7 +752,7 @@ class DWARFLinker : public DWARFLinkerBase { BumpPtrAllocator DIEAlloc; /// @} - std::unique_ptr TheDwarfEmitter; + DwarfEmitter *TheDwarfEmitter = nullptr; std::vector ObjectContexts; /// The CIEs that have been emitted in the output section. 
The actual CIE diff --git a/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h b/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h index f010c348f1214..bebdfd5e60257 100644 --- a/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h +++ b/llvm/include/llvm/DWARFLinker/Classic/DWARFStreamer.h @@ -45,18 +45,23 @@ class DwarfStreamer : public DwarfEmitter { public: DwarfStreamer(DWARFLinkerBase::OutputFileType OutFileType, raw_pwrite_stream &OutFile, - std::function Translator, + DWARFLinkerBase::TranslatorFuncTy Translator, DWARFLinkerBase::MessageHandlerTy Warning) : OutFile(OutFile), OutFileType(OutFileType), Translator(Translator), WarningHandler(Warning) {} virtual ~DwarfStreamer() = default; + static Expected> createStreamer( + const Triple &TheTriple, DWARFLinkerBase::OutputFileType FileType, + raw_pwrite_stream &OutFile, DWARFLinkerBase::TranslatorFuncTy Translator, + DWARFLinkerBase::MessageHandlerTy Warning); + Error init(Triple TheTriple, StringRef Swift5ReflectionSegmentName); /// Dump the file to the disk. void finish() override; - AsmPrinter &getAsmPrinter() const override { return *Asm; } + AsmPrinter &getAsmPrinter() const { return *Asm; } /// Set the current output section to debug_info and change /// the MC Dwarf version to \p DwarfVersion. @@ -77,7 +82,8 @@ class DwarfStreamer : public DwarfEmitter { unsigned DwarfVersion) override; /// Emit contents of section SecName From Obj. - void emitSectionContents(StringRef SecData, StringRef SecName) override; + void emitSectionContents(StringRef SecData, + DebugSectionKind SecKind) override; /// Emit the string table described by \p Pool into .debug_str table. void emitStrings(const NonRelocatableStringpool &Pool) override; @@ -91,12 +97,12 @@ class DwarfStreamer : public DwarfEmitter { void emitLineStrings(const NonRelocatableStringpool &Pool) override; /// Emit the swift_ast section stored in \p Buffer. - void emitSwiftAST(StringRef Buffer) override; + void emitSwiftAST(StringRef Buffer); /// Emit the swift reflection section stored in \p Buffer. void emitSwiftReflectionSection( llvm::binaryformat::Swift5ReflectionSectionKind ReflSectionKind, - StringRef Buffer, uint32_t Alignment, uint32_t Size) override; + StringRef Buffer, uint32_t Alignment, uint32_t Size); /// Emit debug ranges(.debug_ranges, .debug_rnglists) header. MCSymbol *emitDwarfDebugRangeListHeader(const CompileUnit &Unit) override; @@ -215,6 +221,8 @@ class DwarfStreamer : public DwarfEmitter { WarningHandler(Warning, Context, nullptr); } + MCSection *getMCSection(DebugSectionKind SecKind); + void emitMacroTableImpl(const DWARFDebugMacro *MacroTable, const Offset2UnitMap &UnitMacroMap, OffsetsStringPool &StringPool, uint64_t &OutOffset); diff --git a/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h b/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h index 626fb53d90f9f..5c811b668f0a3 100644 --- a/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h +++ b/llvm/include/llvm/DWARFLinker/DWARFLinkerBase.h @@ -23,6 +23,53 @@ class DWARFUnit; namespace dwarf_linker { +/// List of tracked debug tables. 
+enum class DebugSectionKind : uint8_t { + DebugInfo = 0, + DebugLine, + DebugFrame, + DebugRange, + DebugRngLists, + DebugLoc, + DebugLocLists, + DebugARanges, + DebugAbbrev, + DebugMacinfo, + DebugMacro, + DebugAddr, + DebugStr, + DebugLineStr, + DebugStrOffsets, + DebugPubNames, + DebugPubTypes, + DebugNames, + AppleNames, + AppleNamespaces, + AppleObjC, + AppleTypes, + NumberOfEnumEntries // must be last +}; + +static constexpr size_t SectionKindsNum = + static_cast(DebugSectionKind::NumberOfEnumEntries); + +static constexpr StringLiteral SectionNames[SectionKindsNum] = { + "debug_info", "debug_line", "debug_frame", "debug_ranges", + "debug_rnglists", "debug_loc", "debug_loclists", "debug_aranges", + "debug_abbrev", "debug_macinfo", "debug_macro", "debug_addr", + "debug_str", "debug_line_str", "debug_str_offsets", "debug_pubnames", + "debug_pubtypes", "debug_names", "apple_names", "apple_namespac", + "apple_objc", "apple_types"}; + +/// Return the name of the section. +static constexpr const StringLiteral & +getSectionName(DebugSectionKind SectionKind) { + return SectionNames[static_cast(SectionKind)]; +} + +/// Recognise the table name and match it with the DebugSectionKind. +std::optional parseDebugTableName(StringRef Name); + /// The base interface for DWARFLinker implementations. class DWARFLinkerBase { public: diff --git a/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h b/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h index c38a9906940ec..5312712a4a5b8 100644 --- a/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h +++ b/llvm/include/llvm/DWARFLinker/Parallel/DWARFLinker.h @@ -89,30 +89,34 @@ namespace llvm { namespace dwarf_linker { namespace parallel { -/// ExtraDwarfEmitter allows adding extra data to the DWARFLinker output. -/// The finish() method should be called after all extra data are emitted. -class ExtraDwarfEmitter { -public: - virtual ~ExtraDwarfEmitter() = default; - - /// Dump the file to the disk. - virtual void finish() = 0; - - /// Emit section named SecName with data SecData. - virtual void emitSectionContents(StringRef SecData, StringRef SecName) = 0; - - /// Emit the swift_ast section stored in \p Buffer. - virtual void emitSwiftAST(StringRef Buffer) = 0; - - /// Emit the swift reflection section stored in \p Buffer. - virtual void emitSwiftReflectionSection( - llvm::binaryformat::Swift5ReflectionSectionKind ReflSectionKind, - StringRef Buffer, uint32_t Alignment, uint32_t Size) = 0; - - /// Returns underlying AsmPrinter. - virtual AsmPrinter &getAsmPrinter() const = 0; +/// This structure keeps data of the concrete section. +struct SectionDescriptorBase { + SectionDescriptorBase(DebugSectionKind SectionKind, dwarf::FormParams Format, + llvm::endianness Endianess) + : SectionKind(SectionKind), Format(Format), Endianess(Endianess) {} + virtual ~SectionDescriptorBase() = default; + /// Returns section content. + virtual StringRef getContents() = 0; + /// Returns section kind. + DebugSectionKind getKind() { return SectionKind; } + /// Returns section name. + const StringLiteral &getName() const { return getSectionName(SectionKind); } + /// Returns endianess used by section. + llvm::endianness getEndianess() const { return Endianess; } + /// Returns FormParams used by section. + dwarf::FormParams getFormParams() const { return Format; } + +protected: + /// The section kind. + DebugSectionKind SectionKind = DebugSectionKind::NumberOfEnumEntries; + /// Output format. 
+ dwarf::FormParams Format = {4, 4, dwarf::DWARF32}; + llvm::endianness Endianess = llvm::endianness::little; }; +using SectionHandlerTy = + std::function Section)>; + class DWARFLinker : public DWARFLinkerBase { public: virtual ~DWARFLinker() = default; @@ -122,12 +126,11 @@ class DWARFLinker : public DWARFLinkerBase { createLinker(MessageHandlerTy ErrorHandler, MessageHandlerTy WarningHandler, TranslatorFuncTy StringsTranslator = nullptr); - /// Creates emitter for output dwarf. - virtual Error createEmitter(const Triple &TheTriple, OutputFileType FileType, - raw_pwrite_stream &OutFile) = 0; - - /// Returns previously created dwarf emitter. May be nullptr. - virtual ExtraDwarfEmitter *getEmitter() = 0; + /// Set output DWARF handler. Result of linking DWARF is set of sections + /// containing final debug info. DWARFLinkerBase::link() pass generated + /// sections using specified \p SectionHandler. + virtual void setOutputDWARFHandler(const Triple &TargetTriple, + SectionHandlerTy SectionHandler) = 0; }; } // end of namespace parallel diff --git a/llvm/lib/DWARFLinker/CMakeLists.txt b/llvm/lib/DWARFLinker/CMakeLists.txt index 66bbf72165d64..f9d523757cd7c 100644 --- a/llvm/lib/DWARFLinker/CMakeLists.txt +++ b/llvm/lib/DWARFLinker/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_component_library(LLVMDWARFLinker + DWARFLinkerBase.cpp Utils.cpp ADDITIONAL_HEADER_DIRS diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index db294169ceb9b..d2b4db3b0c6fb 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -2644,19 +2644,22 @@ uint64_t DWARFLinker::DIECloner::cloneAllCompileUnits( void DWARFLinker::copyInvariantDebugSection(DWARFContext &Dwarf) { TheDwarfEmitter->emitSectionContents(Dwarf.getDWARFObj().getLocSection().Data, - "debug_loc"); + DebugSectionKind::DebugLoc); TheDwarfEmitter->emitSectionContents( - Dwarf.getDWARFObj().getRangesSection().Data, "debug_ranges"); + Dwarf.getDWARFObj().getRangesSection().Data, + DebugSectionKind::DebugRange); TheDwarfEmitter->emitSectionContents( - Dwarf.getDWARFObj().getFrameSection().Data, "debug_frame"); + Dwarf.getDWARFObj().getFrameSection().Data, DebugSectionKind::DebugFrame); TheDwarfEmitter->emitSectionContents(Dwarf.getDWARFObj().getArangesSection(), - "debug_aranges"); + DebugSectionKind::DebugARanges); TheDwarfEmitter->emitSectionContents( - Dwarf.getDWARFObj().getAddrSection().Data, "debug_addr"); + Dwarf.getDWARFObj().getAddrSection().Data, DebugSectionKind::DebugAddr); TheDwarfEmitter->emitSectionContents( - Dwarf.getDWARFObj().getRnglistsSection().Data, "debug_rnglists"); + Dwarf.getDWARFObj().getRnglistsSection().Data, + DebugSectionKind::DebugRngLists); TheDwarfEmitter->emitSectionContents( - Dwarf.getDWARFObj().getLoclistsSection().Data, "debug_loclists"); + Dwarf.getDWARFObj().getLoclistsSection().Data, + DebugSectionKind::DebugLocLists); } void DWARFLinker::addObjectFile(DWARFFile &File, ObjFileLoaderTy Loader, @@ -2848,7 +2851,7 @@ Error DWARFLinker::link() { SizeByObject[OptContext.File.FileName].Input = getDebugInfoSize(*OptContext.File.Dwarf); SizeByObject[OptContext.File.FileName].Output = - DIECloner(*this, TheDwarfEmitter.get(), OptContext.File, DIEAlloc, + DIECloner(*this, TheDwarfEmitter, OptContext.File, DIEAlloc, OptContext.CompileUnits, Options.Update, DebugStrPool, DebugLineStrPool, StringOffsetPool) .cloneAllCompileUnits(*OptContext.File.Dwarf, OptContext.File, @@ -3011,7 +3014,7 @@ Error 
DWARFLinker::cloneModuleUnit(LinkContext &Context, RefModuleUnit &Unit, UnitListTy CompileUnits; CompileUnits.emplace_back(std::move(Unit.Unit)); assert(TheDwarfEmitter); - DIECloner(*this, TheDwarfEmitter.get(), Unit.File, DIEAlloc, CompileUnits, + DIECloner(*this, TheDwarfEmitter, Unit.File, DIEAlloc, CompileUnits, Options.Update, DebugStrPool, DebugLineStrPool, StringOffsetPool) .cloneAllCompileUnits(*Unit.File.Dwarf, Unit.File, Unit.File.Dwarf->isLittleEndian()); @@ -3030,16 +3033,4 @@ void DWARFLinker::verifyInput(const DWARFFile &File) { } } -Error DWARFLinker::createEmitter(const Triple &TheTriple, - OutputFileType FileType, - raw_pwrite_stream &OutFile) { - - TheDwarfEmitter = std::make_unique( - FileType, OutFile, StringsTranslator, WarningHandler); - - return TheDwarfEmitter->init(TheTriple, "__DWARF"); -} - -DwarfEmitter *DWARFLinker::getEmitter() { return TheDwarfEmitter.get(); } - } // namespace llvm diff --git a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp index 020bbb06449d3..6d522370e440d 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp @@ -30,6 +30,18 @@ using namespace llvm; using namespace dwarf_linker; using namespace dwarf_linker::classic; +Expected> DwarfStreamer::createStreamer( + const Triple &TheTriple, DWARFLinkerBase::OutputFileType FileType, + raw_pwrite_stream &OutFile, DWARFLinkerBase::TranslatorFuncTy Translator, + DWARFLinkerBase::MessageHandlerTy Warning) { + std::unique_ptr Streamer = + std::make_unique(FileType, OutFile, Translator, Warning); + if (Error Err = Streamer->init(TheTriple, "__DWARF")) + return std::move(Err); + + return std::move(Streamer); +} + Error DwarfStreamer::init(Triple TheTriple, StringRef Swift5ReflectionSegmentName) { std::string ErrorStr; @@ -212,30 +224,72 @@ void DwarfStreamer::emitDIE(DIE &Die) { } /// Emit contents of section SecName From Obj. 
-void DwarfStreamer::emitSectionContents(StringRef SecData, StringRef SecName) { - MCSection *Section = - StringSwitch(SecName) - .Case("debug_line", MC->getObjectFileInfo()->getDwarfLineSection()) - .Case("debug_loc", MC->getObjectFileInfo()->getDwarfLocSection()) - .Case("debug_ranges", - MC->getObjectFileInfo()->getDwarfRangesSection()) - .Case("debug_frame", MC->getObjectFileInfo()->getDwarfFrameSection()) - .Case("debug_aranges", - MC->getObjectFileInfo()->getDwarfARangesSection()) - .Case("debug_addr", MC->getObjectFileInfo()->getDwarfAddrSection()) - .Case("debug_rnglists", - MC->getObjectFileInfo()->getDwarfRnglistsSection()) - .Case("debug_loclists", - MC->getObjectFileInfo()->getDwarfLoclistsSection()) - .Default(nullptr); - - if (Section) { +void DwarfStreamer::emitSectionContents(StringRef SecData, + DebugSectionKind SecKind) { + if (SecData.empty()) + return; + + if (MCSection *Section = getMCSection(SecKind)) { MS->switchSection(Section); MS->emitBytes(SecData); } } +MCSection *DwarfStreamer::getMCSection(DebugSectionKind SecKind) { + switch (SecKind) { + case DebugSectionKind::DebugInfo: + return MC->getObjectFileInfo()->getDwarfInfoSection(); + case DebugSectionKind::DebugLine: + return MC->getObjectFileInfo()->getDwarfLineSection(); + case DebugSectionKind::DebugFrame: + return MC->getObjectFileInfo()->getDwarfFrameSection(); + case DebugSectionKind::DebugRange: + return MC->getObjectFileInfo()->getDwarfRangesSection(); + case DebugSectionKind::DebugRngLists: + return MC->getObjectFileInfo()->getDwarfRnglistsSection(); + case DebugSectionKind::DebugLoc: + return MC->getObjectFileInfo()->getDwarfLocSection(); + case DebugSectionKind::DebugLocLists: + return MC->getObjectFileInfo()->getDwarfLoclistsSection(); + case DebugSectionKind::DebugARanges: + return MC->getObjectFileInfo()->getDwarfARangesSection(); + case DebugSectionKind::DebugAbbrev: + return MC->getObjectFileInfo()->getDwarfAbbrevSection(); + case DebugSectionKind::DebugMacinfo: + return MC->getObjectFileInfo()->getDwarfMacinfoSection(); + case DebugSectionKind::DebugMacro: + return MC->getObjectFileInfo()->getDwarfMacroSection(); + case DebugSectionKind::DebugAddr: + return MC->getObjectFileInfo()->getDwarfAddrSection(); + case DebugSectionKind::DebugStr: + return MC->getObjectFileInfo()->getDwarfStrSection(); + case DebugSectionKind::DebugLineStr: + return MC->getObjectFileInfo()->getDwarfLineStrSection(); + case DebugSectionKind::DebugStrOffsets: + return MC->getObjectFileInfo()->getDwarfStrOffSection(); + case DebugSectionKind::DebugPubNames: + return MC->getObjectFileInfo()->getDwarfPubNamesSection(); + case DebugSectionKind::DebugPubTypes: + return MC->getObjectFileInfo()->getDwarfPubTypesSection(); + case DebugSectionKind::DebugNames: + return MC->getObjectFileInfo()->getDwarfDebugNamesSection(); + case DebugSectionKind::AppleNames: + return MC->getObjectFileInfo()->getDwarfAccelNamesSection(); + case DebugSectionKind::AppleNamespaces: + return MC->getObjectFileInfo()->getDwarfAccelNamespaceSection(); + case DebugSectionKind::AppleObjC: + return MC->getObjectFileInfo()->getDwarfAccelObjCSection(); + case DebugSectionKind::AppleTypes: + return MC->getObjectFileInfo()->getDwarfAccelTypesSection(); + case DebugSectionKind::NumberOfEnumEntries: + llvm_unreachable("Unknown DebugSectionKind value"); + break; + } + + return nullptr; +} + /// Emit the debug_str section stored in \p Pool. 
void DwarfStreamer::emitStrings(const NonRelocatableStringpool &Pool) { Asm->OutStreamer->switchSection(MOFI->getDwarfStrSection()); diff --git a/llvm/lib/DWARFLinker/DWARFLinkerBase.cpp b/llvm/lib/DWARFLinker/DWARFLinkerBase.cpp new file mode 100644 index 0000000000000..7ba98d43274fa --- /dev/null +++ b/llvm/lib/DWARFLinker/DWARFLinkerBase.cpp @@ -0,0 +1,64 @@ +//=== DWARFLinkerBase.cpp -------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/DWARFLinker/DWARFLinkerBase.h" +#include "llvm/ADT/StringSwitch.h" + +using namespace llvm; +using namespace llvm::dwarf_linker; + +std::optional +llvm::dwarf_linker::parseDebugTableName(llvm::StringRef SecName) { + return llvm::StringSwitch>( + SecName.substr(SecName.find_first_not_of("._"))) + .Case(getSectionName(DebugSectionKind::DebugInfo), + DebugSectionKind::DebugInfo) + .Case(getSectionName(DebugSectionKind::DebugLine), + DebugSectionKind::DebugLine) + .Case(getSectionName(DebugSectionKind::DebugFrame), + DebugSectionKind::DebugFrame) + .Case(getSectionName(DebugSectionKind::DebugRange), + DebugSectionKind::DebugRange) + .Case(getSectionName(DebugSectionKind::DebugRngLists), + DebugSectionKind::DebugRngLists) + .Case(getSectionName(DebugSectionKind::DebugLoc), + DebugSectionKind::DebugLoc) + .Case(getSectionName(DebugSectionKind::DebugLocLists), + DebugSectionKind::DebugLocLists) + .Case(getSectionName(DebugSectionKind::DebugARanges), + DebugSectionKind::DebugARanges) + .Case(getSectionName(DebugSectionKind::DebugAbbrev), + DebugSectionKind::DebugAbbrev) + .Case(getSectionName(DebugSectionKind::DebugMacinfo), + DebugSectionKind::DebugMacinfo) + .Case(getSectionName(DebugSectionKind::DebugMacro), + DebugSectionKind::DebugMacro) + .Case(getSectionName(DebugSectionKind::DebugAddr), + DebugSectionKind::DebugAddr) + .Case(getSectionName(DebugSectionKind::DebugStr), + DebugSectionKind::DebugStr) + .Case(getSectionName(DebugSectionKind::DebugLineStr), + DebugSectionKind::DebugLineStr) + .Case(getSectionName(DebugSectionKind::DebugStrOffsets), + DebugSectionKind::DebugStrOffsets) + .Case(getSectionName(DebugSectionKind::DebugPubNames), + DebugSectionKind::DebugPubNames) + .Case(getSectionName(DebugSectionKind::DebugPubTypes), + DebugSectionKind::DebugPubTypes) + .Case(getSectionName(DebugSectionKind::DebugNames), + DebugSectionKind::DebugNames) + .Case(getSectionName(DebugSectionKind::AppleNames), + DebugSectionKind::AppleNames) + .Case(getSectionName(DebugSectionKind::AppleNamespaces), + DebugSectionKind::AppleNamespaces) + .Case(getSectionName(DebugSectionKind::AppleObjC), + DebugSectionKind::AppleObjC) + .Case(getSectionName(DebugSectionKind::AppleTypes), + DebugSectionKind::AppleTypes) + .Default(std::nullopt); +} diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp index 115167f0c7dc9..b9edcb63a3401 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.cpp @@ -120,72 +120,6 @@ Error DwarfEmitterImpl::init(Triple TheTriple, return Error::success(); } -void DwarfEmitterImpl::emitSwiftAST(StringRef Buffer) { - MCSection *SwiftASTSection = MOFI->getDwarfSwiftASTSection(); - SwiftASTSection->setAlignment(Align(32)); - 
MS->switchSection(SwiftASTSection); - MS->emitBytes(Buffer); -} - -/// Emit the swift reflection section stored in \p Buffer. -void DwarfEmitterImpl::emitSwiftReflectionSection( - llvm::binaryformat::Swift5ReflectionSectionKind ReflSectionKind, - StringRef Buffer, uint32_t Alignment, uint32_t) { - MCSection *ReflectionSection = - MOFI->getSwift5ReflectionSection(ReflSectionKind); - if (ReflectionSection == nullptr) - return; - ReflectionSection->setAlignment(Align(Alignment)); - MS->switchSection(ReflectionSection); - MS->emitBytes(Buffer); -} - -void DwarfEmitterImpl::emitSectionContents(StringRef SecData, - StringRef SecName) { - if (SecData.empty()) - return; - - if (MCSection *Section = switchSection(SecName)) { - MS->switchSection(Section); - - MS->emitBytes(SecData); - } -} - -MCSection *DwarfEmitterImpl::switchSection(StringRef SecName) { - return StringSwitch(SecName) - .Case("debug_info", MC->getObjectFileInfo()->getDwarfInfoSection()) - .Case("debug_abbrev", MC->getObjectFileInfo()->getDwarfAbbrevSection()) - .Case("debug_line", MC->getObjectFileInfo()->getDwarfLineSection()) - .Case("debug_loc", MC->getObjectFileInfo()->getDwarfLocSection()) - .Case("debug_ranges", MC->getObjectFileInfo()->getDwarfRangesSection()) - .Case("debug_frame", MC->getObjectFileInfo()->getDwarfFrameSection()) - .Case("debug_aranges", MC->getObjectFileInfo()->getDwarfARangesSection()) - .Case("debug_rnglists", - MC->getObjectFileInfo()->getDwarfRnglistsSection()) - .Case("debug_loclists", - MC->getObjectFileInfo()->getDwarfLoclistsSection()) - .Case("debug_macro", MC->getObjectFileInfo()->getDwarfMacroSection()) - .Case("debug_macinfo", MC->getObjectFileInfo()->getDwarfMacinfoSection()) - .Case("debug_addr", MC->getObjectFileInfo()->getDwarfAddrSection()) - .Case("debug_str", MC->getObjectFileInfo()->getDwarfStrSection()) - .Case("debug_line_str", MC->getObjectFileInfo()->getDwarfLineStrSection()) - .Case("debug_str_offsets", - MC->getObjectFileInfo()->getDwarfStrOffSection()) - .Case("debug_pubnames", - MC->getObjectFileInfo()->getDwarfPubNamesSection()) - .Case("debug_pubtypes", - MC->getObjectFileInfo()->getDwarfPubTypesSection()) - .Case("debug_names", MC->getObjectFileInfo()->getDwarfDebugNamesSection()) - .Case("apple_names", MC->getObjectFileInfo()->getDwarfAccelNamesSection()) - .Case("apple_namespac", - MC->getObjectFileInfo()->getDwarfAccelNamespaceSection()) - .Case("apple_objc", MC->getObjectFileInfo()->getDwarfAccelObjCSection()) - .Case("apple_types", MC->getObjectFileInfo()->getDwarfAccelTypesSection()) - - .Default(nullptr); -} - void DwarfEmitterImpl::emitAbbrevs( const SmallVector> &Abbrevs, unsigned DwarfVersion) { diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.h b/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.h index 89a33fe941915..71351f1d8c1ce 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFEmitterImpl.h @@ -45,7 +45,7 @@ using CompUnitIDToIdx = DenseMap; /// This class emits DWARF data to the output stream. It emits already /// generated section data and specific data, which could not be generated /// by CompileUnit. -class DwarfEmitterImpl : public ExtraDwarfEmitter { +class DwarfEmitterImpl { public: DwarfEmitterImpl(DWARFLinker::OutputFileType OutFileType, raw_pwrite_stream &OutFile) @@ -58,21 +58,7 @@ class DwarfEmitterImpl : public ExtraDwarfEmitter { const Triple &getTargetTriple() { return MC->getTargetTriple(); } /// Dump the file to the disk. 
- void finish() override { MS->finish(); } - - /// Returns AsmPrinter. - AsmPrinter &getAsmPrinter() const override { return *Asm; } - - /// Emit the swift_ast section stored in \p Buffer. - void emitSwiftAST(StringRef Buffer) override; - - /// Emit the swift reflection section stored in \p Buffer. - void emitSwiftReflectionSection( - llvm::binaryformat::Swift5ReflectionSectionKind ReflSectionKind, - StringRef Buffer, uint32_t Alignment, uint32_t) override; - - /// Emit specified section data. - void emitSectionContents(StringRef SecData, StringRef SecName) override; + void finish() { MS->finish(); } /// Emit abbreviations. void emitAbbrevs(const SmallVector> &Abbrevs, @@ -115,8 +101,6 @@ class DwarfEmitterImpl : public ExtraDwarfEmitter { const StringEntryToDwarfStringPoolEntryMap &Strings, uint64_t &NextOffset, MCSection *OutSection); - MCSection *switchSection(StringRef SecName); - /// \defgroup MCObjects MC layer objects constructed by the streamer /// @{ std::unique_ptr MRI; @@ -135,7 +119,8 @@ class DwarfEmitterImpl : public ExtraDwarfEmitter { /// The output file we stream the linked Dwarf to. raw_pwrite_stream &OutFile; - DWARFLinker::OutputFileType OutFileType = DWARFLinker::OutputFileType::Object; + DWARFLinkerBase::OutputFileType OutFileType = + DWARFLinkerBase::OutputFileType::Object; uint64_t DebugInfoSectionSize = 0; }; diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp index 6ed284a66a856..212264714c2a5 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp @@ -1229,8 +1229,9 @@ void CompileUnit::cloneDieAttrExpression( } } -Error CompileUnit::cloneAndEmit(std::optional TargetTriple, - TypeUnit *ArtificialTypeUnit) { +Error CompileUnit::cloneAndEmit( + std::optional> TargetTriple, + TypeUnit *ArtificialTypeUnit) { BumpPtrAllocator Allocator; DWARFDie OrigUnitDIE = getOrigUnit().getUnitDIE(); @@ -1247,18 +1248,17 @@ Error CompileUnit::cloneAndEmit(std::optional TargetTriple, std::nullopt, std::nullopt, Allocator, ArtificialTypeUnit); setOutUnitDIE(OutCUDie.first); - if (getGlobalData().getOptions().NoOutput || (OutCUDie.first == nullptr)) + if (!TargetTriple.has_value() || (OutCUDie.first == nullptr)) return Error::success(); - assert(TargetTriple.has_value()); - if (Error Err = cloneAndEmitLineTable(*TargetTriple)) + if (Error Err = cloneAndEmitLineTable((*TargetTriple).get())) return Err; if (Error Err = cloneAndEmitDebugMacro()) return Err; getOrCreateSectionDescriptor(DebugSectionKind::DebugInfo); - if (Error Err = emitDebugInfo(*TargetTriple)) + if (Error Err = emitDebugInfo((*TargetTriple).get())) return Err; // ASSUMPTION: .debug_info section should already be emitted at this point. 
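The hunk above replaces the old NoOutput flag with an optional target triple: cloning always happens, but emission is skipped when no triple was provided through setOutputDWARFHandler(). A minimal sketch of that convention follows; it is an illustration under assumed names (cloneAndMaybeEmit, a Triple stand-in), not code from this patch.

#include <functional>
#include <optional>

struct Triple {}; // stand-in for llvm::Triple

// Presence of the triple now encodes "output was requested"; absence
// replaces the old NoOutput flag.
void cloneAndMaybeEmit(
    std::optional<std::reference_wrapper<const Triple>> TargetTriple) {
  // ... cloning work happens unconditionally here ...
  if (!TargetTriple.has_value())
    return; // no triple: output generation was not requested
  const Triple &T = TargetTriple->get(); // unwrap the reference_wrapper
  (void)T; // ... emit sections for this target here ...
}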
@@ -1514,7 +1514,7 @@ TypeEntry *CompileUnit::createTypeDIEandCloneAttributes( return Entry; } -Error CompileUnit::cloneAndEmitLineTable(Triple &TargetTriple) { +Error CompileUnit::cloneAndEmitLineTable(const Triple &TargetTriple) { const DWARFDebugLine::LineTable *InputLineTable = getContaingFile().Dwarf->getLineTableForUnit(&getOrigUnit()); if (InputLineTable == nullptr) { diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.h index abd978e7c0e4b..20e20222a4ac0 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.h @@ -401,8 +401,9 @@ class alignas(8) CompileUnit : public DwarfUnit { const RangesTy &getFunctionRanges() const { return Ranges; } /// Clone and emit this compilation unit. - Error cloneAndEmit(std::optional TargetTriple, - TypeUnit *ArtificialTypeUnit); + Error + cloneAndEmit(std::optional> TargetTriple, + TypeUnit *ArtificialTypeUnit); /// Clone and emit debug locations(.debug_loc/.debug_loclists). Error cloneAndEmitDebugLocations(); @@ -422,7 +423,7 @@ class alignas(8) CompileUnit : public DwarfUnit { BumpPtrAllocator &Allocator, TypeUnit *ArtificialTypeUnit); // Clone and emit line table. - Error cloneAndEmitLineTable(Triple &TargetTriple); + Error cloneAndEmitLineTable(const Triple &TargetTriple); /// Clone attribute location axpression. void cloneDieAttrExpression(const DWARFExpression &InputExpression, diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h index b641343ac808c..38c261a8106fc 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerGlobalData.h @@ -39,9 +39,6 @@ struct DWARFLinkerOptions { /// Verify the input DWARF. bool VerifyInputDWARF = false; - /// Do not emit output. - bool NoOutput = false; - /// Do not unique types according to ODR bool NoODR = false; @@ -59,13 +56,14 @@ struct DWARFLinkerOptions { unsigned Threads = 1; /// The accelerator table kinds - SmallVector AccelTables; + SmallVector AccelTables; /// Prepend path for the clang modules. std::string PrependPath; /// input verification handler(it might be called asynchronously). - DWARFLinker::InputVerificationHandlerTy InputVerificationHandler = nullptr; + DWARFLinkerBase::InputVerificationHandlerTy InputVerificationHandler = + nullptr; /// A list of all .swiftinterface files referenced by the debug /// info, mapping Module name to path on disk. The entries need to @@ -74,12 +72,12 @@ struct DWARFLinkerOptions { /// this is dsymutil specific fag. /// /// (it might be called asynchronously). - DWARFLinker::SwiftInterfacesMapTy *ParseableSwiftInterfaces = nullptr; + DWARFLinkerBase::SwiftInterfacesMapTy *ParseableSwiftInterfaces = nullptr; /// A list of remappings to apply to file paths. /// /// (it might be called asynchronously). - DWARFLinker::ObjectPrefixMapTy *ObjectPrefixMap = nullptr; + DWARFLinkerBase::ObjectPrefixMapTy *ObjectPrefixMap = nullptr; }; class DWARFLinkerImpl; @@ -147,6 +145,19 @@ class LinkingGlobalData { }); } + /// Set target triple. + void setTargetTriple(const Triple &TargetTriple) { + this->TargetTriple = TargetTriple; + } + + /// Optionally return target triple. 
+ std::optional> getTargetTriple() { + if (TargetTriple) + return std::cref(*TargetTriple); + + return std::nullopt; + } + protected: llvm::parallel::PerThreadBumpPtrAllocator Allocator; StringPool Strings; @@ -154,6 +165,10 @@ class LinkingGlobalData { DWARFLinkerOptions Options; MessageHandlerTy WarningHandler; MessageHandlerTy ErrorHandler; + + /// Triple for output data. May be not set if generation of output + /// data is not requested. + std::optional TargetTriple; }; } // end of namespace parallel diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp index b0b819cf97785..f127214927a5d 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp @@ -32,11 +32,9 @@ DWARFLinkerImpl::DWARFLinkerImpl(MessageHandlerTy ErrorHandler, DWARFLinkerImpl::LinkContext::LinkContext(LinkingGlobalData &GlobalData, DWARFFile &File, StringMap &ClangModules, - std::atomic &UniqueUnitID, - std::optional TargetTriple) + std::atomic &UniqueUnitID) : OutputSections(GlobalData), InputDWARFFile(File), - ClangModules(ClangModules), TargetTriple(TargetTriple), - UniqueUnitID(UniqueUnitID) { + ClangModules(ClangModules), UniqueUnitID(UniqueUnitID) { if (File.Dwarf) { if (!File.Dwarf->compile_units().empty()) @@ -63,25 +61,10 @@ void DWARFLinkerImpl::LinkContext::addModulesCompileUnit( ModulesCompileUnits.emplace_back(std::move(Unit)); } -Error DWARFLinkerImpl::createEmitter(const Triple &TheTriple, - OutputFileType FileType, - raw_pwrite_stream &OutFile) { - - TheDwarfEmitter = std::make_unique(FileType, OutFile); - - return TheDwarfEmitter->init(TheTriple, "__DWARF"); -} - -ExtraDwarfEmitter *DWARFLinkerImpl::getEmitter() { - return TheDwarfEmitter.get(); -} - void DWARFLinkerImpl::addObjectFile(DWARFFile &File, ObjFileLoaderTy Loader, CompileUnitHandlerTy OnCUDieLoaded) { ObjectContexts.emplace_back(std::make_unique( - GlobalData, File, ClangModules, UniqueUnitID, - (TheDwarfEmitter.get() == nullptr ? std::optional(std::nullopt) - : TheDwarfEmitter->getTargetTriple()))); + GlobalData, File, ClangModules, UniqueUnitID)); if (ObjectContexts.back()->InputDWARFFile.Dwarf) { for (const std::unique_ptr &CU : @@ -117,8 +100,9 @@ Error DWARFLinkerImpl::link() { 0, dwarf::DwarfFormat::DWARF32}; llvm::endianness GlobalEndianness = llvm::endianness::native; - if (TheDwarfEmitter) { - GlobalEndianness = TheDwarfEmitter->getTargetTriple().isLittleEndian() + if (std::optional> CurTriple = + GlobalData.getTargetTriple()) { + GlobalEndianness = (*CurTriple).get().isLittleEndian() ? llvm::endianness::little : llvm::endianness::big; } @@ -147,7 +131,7 @@ Error DWARFLinkerImpl::link() { if (GlobalData.getOptions().VerifyInputDWARF) verifyInput(Context->InputDWARFFile); - if (!TheDwarfEmitter) + if (!GlobalData.getTargetTriple()) GlobalEndianness = Context->getEndianness(); GlobalFormat.AddrSize = std::max(GlobalFormat.AddrSize, Context->getFormParams().AddrSize); @@ -173,9 +157,9 @@ Error DWARFLinkerImpl::link() { } if (GlobalFormat.AddrSize == 0) { - if (TheDwarfEmitter) - GlobalFormat.AddrSize = - TheDwarfEmitter->getTargetTriple().isArch32Bit() ? 4 : 8; + if (std::optional> TargetTriple = + GlobalData.getTargetTriple()) + GlobalFormat.AddrSize = (*TargetTriple).get().isArch32Bit() ? 4 : 8; else GlobalFormat.AddrSize = 8; } @@ -225,12 +209,10 @@ Error DWARFLinkerImpl::link() { ->getValue() .load() ->Children.empty()) { - std::optional OutTriple = TheDwarfEmitter.get() == nullptr - ? 
std::optional(std::nullopt) - : TheDwarfEmitter->getTargetTriple(); - - if (Error Err = ArtificialTypeUnit.get()->finishCloningAndEmit(OutTriple)) - return Err; + if (GlobalData.getTargetTriple().has_value()) + if (Error Err = ArtificialTypeUnit.get()->finishCloningAndEmit( + (*GlobalData.getTargetTriple()).get())) + return Err; } // At this stage each compile units are cloned to their own set of debug @@ -258,8 +240,6 @@ Error DWARFLinkerImpl::validateAndUpdateOptions() { return createStringError(std::errc::invalid_argument, "target DWARF version is not set"); - GlobalData.Options.NoOutput = TheDwarfEmitter.get() == nullptr; - if (GlobalData.getOptions().Verbose && GlobalData.getOptions().Threads != 1) { GlobalData.Options.Threads = 1; GlobalData.warn( @@ -690,7 +670,8 @@ void DWARFLinkerImpl::LinkContext::linkSingleCompileUnit( if (CU.isClangModule() || GlobalData.getOptions().UpdateIndexTablesOnly || CU.getContaingFile().Addresses->hasValidRelocs()) { - if (Error Err = CU.cloneAndEmit(TargetTriple, ArtificialTypeUnit)) + if (Error Err = CU.cloneAndEmit(GlobalData.getTargetTriple(), + ArtificialTypeUnit)) return std::move(Err); } @@ -727,7 +708,7 @@ void DWARFLinkerImpl::LinkContext::linkSingleCompileUnit( } Error DWARFLinkerImpl::LinkContext::emitInvariantSections() { - if (GlobalData.getOptions().NoOutput) + if (!GlobalData.getTargetTriple().has_value()) return Error::success(); getOrCreateSectionDescriptor(DebugSectionKind::DebugLoc).OS @@ -749,7 +730,7 @@ Error DWARFLinkerImpl::LinkContext::emitInvariantSections() { } Error DWARFLinkerImpl::LinkContext::cloneAndEmitDebugFrame() { - if (GlobalData.getOptions().NoOutput) + if (!GlobalData.getTargetTriple().has_value()) return Error::success(); if (InputDWARFFile.Dwarf.get() == nullptr) @@ -870,8 +851,9 @@ void DWARFLinkerImpl::LinkContext::emitFDE(uint32_t CIEOffset, } void DWARFLinkerImpl::glueCompileUnitsAndWriteToTheOutput() { - if (GlobalData.getOptions().NoOutput) + if (!GlobalData.getTargetTriple().has_value()) return; + assert(SectionHandler); // Go through all object files, all compile units and assign // offsets to them. @@ -1154,21 +1136,23 @@ void DWARFLinkerImpl::emitCommonSectionsAndWriteCompileUnitsToTheOutput() { AccelTableKind::DebugNames)) CommonSections.getOrCreateSectionDescriptor(DebugSectionKind::DebugNames); - const Triple &TargetTriple = TheDwarfEmitter->getTargetTriple(); - // Emit .debug_str and .debug_line_str sections. TG.spawn([&]() { emitStringSections(); }); if (llvm::is_contained(GlobalData.Options.AccelTables, AccelTableKind::Apple)) { // Emit apple accelerator sections. - TG.spawn([&]() { emitAppleAcceleratorSections(TargetTriple); }); + TG.spawn([&]() { + emitAppleAcceleratorSections((*GlobalData.getTargetTriple()).get()); + }); } if (llvm::is_contained(GlobalData.Options.AccelTables, AccelTableKind::DebugNames)) { // Emit .debug_names section. - TG.spawn([&]() { emitDWARFv5DebugNamesSection(TargetTriple); }); + TG.spawn([&]() { + emitDWARFv5DebugNamesSection((*GlobalData.getTargetTriple()).get()); + }); } // Write compile units to the output file. @@ -1419,33 +1403,17 @@ void DWARFLinkerImpl::cleanupDataAfterDWARFOutputIsWritten() { } void DWARFLinkerImpl::writeCompileUnitsToTheOutput() { - bool HasAbbreviations = false; - // Enumerate all sections and store them into the final emitter. 
forEachObjectSectionsSet([&](OutputSections &Sections) { - Sections.forEach([&](SectionDescriptor &OutSection) { - if (!HasAbbreviations && !OutSection.getContents().empty() && - OutSection.getKind() == DebugSectionKind::DebugAbbrev) - HasAbbreviations = true; - + Sections.forEach([&](std::shared_ptr OutSection) { // Emit section content. - TheDwarfEmitter->emitSectionContents(OutSection.getContents(), - OutSection.getName()); - OutSection.clearSectionContent(); + SectionHandler(OutSection); }); }); - - if (!HasAbbreviations) { - const SmallVector> Abbreviations; - TheDwarfEmitter->emitAbbrevs(Abbreviations, 3); - } } void DWARFLinkerImpl::writeCommonSectionsToTheOutput() { - CommonSections.forEach([&](SectionDescriptor &OutSection) { - // Emit section content. - TheDwarfEmitter->emitSectionContents(OutSection.getContents(), - OutSection.getName()); - OutSection.clearSectionContent(); + CommonSections.forEach([&](std::shared_ptr OutSection) { + SectionHandler(OutSection); }); } diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h index b4331df5e323d..527c7a0ec3642 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.h @@ -29,12 +29,6 @@ class DWARFLinkerImpl : public DWARFLinker { MessageHandlerTy WarningHandler, TranslatorFuncTy StringsTranslator); - /// Create debug info emitter. - Error createEmitter(const Triple &TheTriple, OutputFileType FileType, - raw_pwrite_stream &OutFile) override; - - ExtraDwarfEmitter *getEmitter() override; - /// Add object file to be linked. Pre-load compile unit die. Call /// \p OnCUDieLoaded for each compile unit die. If specified \p File /// has reference to the Clang module then such module would be @@ -49,6 +43,14 @@ class DWARFLinkerImpl : public DWARFLinker { /// Link debug info for added files. Error link() override; + /// Set output DWARF handler. May be not set if output generation is not + /// necessary. + void setOutputDWARFHandler(const Triple &TargetTriple, + SectionHandlerTy SectionHandler) override { + GlobalData.setTargetTriple(TargetTriple); + this->SectionHandler = SectionHandler; + } + /// \defgroup Methods setting various linking options: /// /// @{ @@ -190,8 +192,6 @@ class DWARFLinkerImpl : public DWARFLinker { StringMap &ClangModules; - std::optional TargetTriple; - /// Flag indicating that new inter-connected compilation units were /// discovered. It is used for restarting units processing /// if new inter-connected units were found. @@ -204,8 +204,7 @@ class DWARFLinkerImpl : public DWARFLinker { LinkContext(LinkingGlobalData &GlobalData, DWARFFile &File, StringMap &ClangModules, - std::atomic &UniqueUnitID, - std::optional TargetTriple); + std::atomic &UniqueUnitID); /// Check whether specified \p CUDie is a Clang module reference. /// if \p Quiet is false then display error messages. @@ -364,8 +363,8 @@ class DWARFLinkerImpl : public DWARFLinker { /// Common sections. OutputSections CommonSections; - /// The emitter of final dwarf file. - std::unique_ptr TheDwarfEmitter; + /// Hanler for output sections. + SectionHandlerTy SectionHandler = nullptr; /// Overall compile units number. 
uint64_t OverallNumberOfCU = 0; diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.cpp index 397411895a8e3..3030aa2c39b23 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.cpp @@ -337,11 +337,11 @@ uint8_t TypeUnit::getSizeByAttrForm(dwarf::Form Form) const { llvm_unreachable("Unsupported Attr Form"); } -Error TypeUnit::finishCloningAndEmit(std::optional TargetTriple) { +Error TypeUnit::finishCloningAndEmit(const Triple &TargetTriple) { BumpPtrAllocator Allocator; createDIETree(Allocator); - if (getGlobalData().getOptions().NoOutput || (getOutUnitDIE() == nullptr)) + if (getOutUnitDIE() == nullptr) return Error::success(); // Create sections ahead so that they should not be created asynchronously @@ -360,14 +360,12 @@ Error TypeUnit::finishCloningAndEmit(std::optional TargetTriple) { // Add task for emitting .debug_line section. if (!LineTable.Prologue.FileNames.empty()) { - Tasks.push_back([&]() -> Error { - assert(TargetTriple.has_value()); - return emitDebugLine(*TargetTriple, LineTable); - }); + Tasks.push_back( + [&]() -> Error { return emitDebugLine(TargetTriple, LineTable); }); } // Add task for emitting .debug_info section. - Tasks.push_back([&]() -> Error { return emitDebugInfo(*TargetTriple); }); + Tasks.push_back([&]() -> Error { return emitDebugInfo(TargetTriple); }); // Add task for emitting Pub accelerator sections. if (llvm::is_contained(GlobalData.getOptions().AccelTables, diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.h b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.h index 0944de8d1315a..328da5c0570b1 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.h +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerTypeUnit.h @@ -31,7 +31,7 @@ class TypeUnit : public DwarfUnit { void createDIETree(BumpPtrAllocator &Allocator); /// Emits resulting dwarf based on information from DIE tree. - Error finishCloningAndEmit(std::optional TargetTriple); + Error finishCloningAndEmit(const Triple &TargetTriple); /// Returns global type pool. 
TypePool &getTypePool() { return Types; } diff --git a/llvm/lib/DWARFLinker/Parallel/OutputSections.cpp b/llvm/lib/DWARFLinker/Parallel/OutputSections.cpp index cd1205b60f85b..d03f9b40d4902 100644 --- a/llvm/lib/DWARFLinker/Parallel/OutputSections.cpp +++ b/llvm/lib/DWARFLinker/Parallel/OutputSections.cpp @@ -15,59 +15,6 @@ using namespace llvm; using namespace dwarf_linker; using namespace dwarf_linker::parallel; -std::optional -dwarf_linker::parallel::parseDebugTableName(llvm::StringRef SecName) { - return llvm::StringSwitch>( - SecName.substr(SecName.find_first_not_of("._"))) - .Case(getSectionName(DebugSectionKind::DebugInfo), - DebugSectionKind::DebugInfo) - .Case(getSectionName(DebugSectionKind::DebugLine), - DebugSectionKind::DebugLine) - .Case(getSectionName(DebugSectionKind::DebugFrame), - DebugSectionKind::DebugFrame) - .Case(getSectionName(DebugSectionKind::DebugRange), - DebugSectionKind::DebugRange) - .Case(getSectionName(DebugSectionKind::DebugRngLists), - DebugSectionKind::DebugRngLists) - .Case(getSectionName(DebugSectionKind::DebugLoc), - DebugSectionKind::DebugLoc) - .Case(getSectionName(DebugSectionKind::DebugLocLists), - DebugSectionKind::DebugLocLists) - .Case(getSectionName(DebugSectionKind::DebugARanges), - DebugSectionKind::DebugARanges) - .Case(getSectionName(DebugSectionKind::DebugAbbrev), - DebugSectionKind::DebugAbbrev) - .Case(getSectionName(DebugSectionKind::DebugMacinfo), - DebugSectionKind::DebugMacinfo) - .Case(getSectionName(DebugSectionKind::DebugMacro), - DebugSectionKind::DebugMacro) - .Case(getSectionName(DebugSectionKind::DebugAddr), - DebugSectionKind::DebugAddr) - .Case(getSectionName(DebugSectionKind::DebugStr), - DebugSectionKind::DebugStr) - .Case(getSectionName(DebugSectionKind::DebugLineStr), - DebugSectionKind::DebugLineStr) - .Case(getSectionName(DebugSectionKind::DebugStrOffsets), - DebugSectionKind::DebugStrOffsets) - .Case(getSectionName(DebugSectionKind::DebugPubNames), - DebugSectionKind::DebugPubNames) - .Case(getSectionName(DebugSectionKind::DebugPubTypes), - DebugSectionKind::DebugPubTypes) - .Case(getSectionName(DebugSectionKind::DebugNames), - DebugSectionKind::DebugNames) - .Case(getSectionName(DebugSectionKind::AppleNames), - DebugSectionKind::AppleNames) - .Case(getSectionName(DebugSectionKind::AppleNamespaces), - DebugSectionKind::AppleNamespaces) - .Case(getSectionName(DebugSectionKind::AppleObjC), - DebugSectionKind::AppleObjC) - .Case(getSectionName(DebugSectionKind::AppleTypes), - DebugSectionKind::AppleTypes) - .Default(std::nullopt); - - return std::nullopt; -} - DebugDieRefPatch::DebugDieRefPatch(uint64_t PatchOffset, CompileUnit *SrcCU, CompileUnit *RefCU, uint32_t RefIdx) : SectionPatch({PatchOffset}), diff --git a/llvm/lib/DWARFLinker/Parallel/OutputSections.h b/llvm/lib/DWARFLinker/Parallel/OutputSections.h index b9df2228920a9..a21e4b2b75a50 100644 --- a/llvm/lib/DWARFLinker/Parallel/OutputSections.h +++ b/llvm/lib/DWARFLinker/Parallel/OutputSections.h @@ -34,51 +34,6 @@ namespace parallel { class TypeUnit; -/// List of tracked debug tables. 
-enum class DebugSectionKind : uint8_t { - DebugInfo = 0, - DebugLine, - DebugFrame, - DebugRange, - DebugRngLists, - DebugLoc, - DebugLocLists, - DebugARanges, - DebugAbbrev, - DebugMacinfo, - DebugMacro, - DebugAddr, - DebugStr, - DebugLineStr, - DebugStrOffsets, - DebugPubNames, - DebugPubTypes, - DebugNames, - AppleNames, - AppleNamespaces, - AppleObjC, - AppleTypes, - NumberOfEnumEntries // must be last -}; -constexpr static size_t SectionKindsNum = - static_cast(DebugSectionKind::NumberOfEnumEntries); - -static constexpr StringLiteral SectionNames[SectionKindsNum] = { - "debug_info", "debug_line", "debug_frame", "debug_ranges", - "debug_rnglists", "debug_loc", "debug_loclists", "debug_aranges", - "debug_abbrev", "debug_macinfo", "debug_macro", "debug_addr", - "debug_str", "debug_line_str", "debug_str_offsets", "debug_pubnames", - "debug_pubtypes", "debug_names", "apple_names", "apple_namespac", - "apple_objc", "apple_types"}; - -static constexpr const StringLiteral & -getSectionName(DebugSectionKind SectionKind) { - return SectionNames[static_cast(SectionKind)]; -} - -/// Recognise the table name and match it with the DebugSectionKind. -std::optional parseDebugTableName(StringRef Name); - /// There are fields(sizes, offsets) which should be updated after /// sections are generated. To remember offsets and related data /// the descendants of SectionPatch structure should be used. @@ -194,12 +149,13 @@ class OutputSections; /// This structure is used to keep data of the concrete section. /// Like data bits, list of patches, format. -struct SectionDescriptor { +struct SectionDescriptor : SectionDescriptorBase { friend OutputSections; SectionDescriptor(DebugSectionKind SectionKind, LinkingGlobalData &GlobalData, dwarf::FormParams Format, llvm::endianness Endianess) - : OS(Contents), ListDebugStrPatch(&GlobalData.getAllocator()), + : SectionDescriptorBase(SectionKind, Format, Endianess), OS(Contents), + ListDebugStrPatch(&GlobalData.getAllocator()), ListDebugLineStrPatch(&GlobalData.getAllocator()), ListDebugRangePatch(&GlobalData.getAllocator()), ListDebugLocPatch(&GlobalData.getAllocator()), @@ -211,10 +167,9 @@ struct SectionDescriptor { ListDebugTypeStrPatch(&GlobalData.getAllocator()), ListDebugTypeLineStrPatch(&GlobalData.getAllocator()), ListDebugTypeDeclFilePatch(&GlobalData.getAllocator()), - GlobalData(GlobalData), SectionKind(SectionKind), Format(Format), - Endianess(Endianess) {} + GlobalData(GlobalData) {} - /// Erase whole section contents(data bits, list of patches). + /// Erase whole section content(data bits, list of patches). void clearAllSectionData(); /// Erase only section output data bits. @@ -263,7 +218,7 @@ struct SectionDescriptor { void setSizesForSectionCreatedByAsmPrinter(); /// Returns section content. - StringRef getContents() { + StringRef getContents() override { if (SectionOffsetInsideAsmPrinterOutputStart == 0) return StringRef(Contents.data(), Contents.size()); @@ -311,18 +266,6 @@ struct SectionDescriptor { /// Write specified \p Value of \p AttrForm to the \p PatchOffset. void apply(uint64_t PatchOffset, dwarf::Form AttrForm, uint64_t Val); - /// Returns section kind. - DebugSectionKind getKind() { return SectionKind; } - - /// Returns section name. - const StringLiteral &getName() const { return getSectionName(SectionKind); } - - /// Returns endianess used by section. - llvm::endianness getEndianess() const { return Endianess; } - - /// Returns FormParams used by section. 
- dwarf::FormParams getFormParams() const { return Format; } - /// Returns integer value of \p Size located by specified \p PatchOffset. uint64_t getIntVal(uint64_t PatchOffset, unsigned Size); @@ -344,9 +287,6 @@ struct SectionDescriptor { LinkingGlobalData &GlobalData; - /// The section kind. - DebugSectionKind SectionKind = DebugSectionKind::NumberOfEnumEntries; - /// Section data bits. OutSectionDataTy Contents; @@ -355,10 +295,6 @@ struct SectionDescriptor { /// real section content inside elf file. size_t SectionOffsetInsideAsmPrinterOutputStart = 0; size_t SectionOffsetInsideAsmPrinterOutputEnd = 0; - - /// Output format. - dwarf::FormParams Format = {4, 4, dwarf::DWARF32}; - llvm::endianness Endianess = llvm::endianness::little; }; /// This class keeps contents and offsets to the debug sections. Any objects @@ -387,7 +323,7 @@ class OutputSections { .str() .c_str()); - return It->second; + return *It->second; } /// Returns descriptor for the specified section of \p SectionKind. @@ -402,7 +338,9 @@ class OutputSections { .str() .c_str()); - return It->second; + assert(It->second.get() != nullptr); + + return *It->second; } /// Returns descriptor for the specified section of \p SectionKind. @@ -414,7 +352,7 @@ class OutputSections { if (It == SectionDescriptors.end()) return std::nullopt; - return &It->second; + return It->second.get(); } /// Returns descriptor for the specified section of \p SectionKind. @@ -426,26 +364,44 @@ class OutputSections { if (It == SectionDescriptors.end()) return std::nullopt; - return &It->second; + return It->second.get(); } /// Returns descriptor for the specified section of \p SectionKind. /// If descriptor does not exist then creates it. SectionDescriptor & getOrCreateSectionDescriptor(DebugSectionKind SectionKind) { - return SectionDescriptors - .try_emplace(SectionKind, SectionKind, GlobalData, Format, Endianness) - .first->second; + SectionsSetTy::iterator It = SectionDescriptors.find(SectionKind); + + if (It == SectionDescriptors.end()) { + SectionDescriptor *Section = + new SectionDescriptor(SectionKind, GlobalData, Format, Endianness); + auto Result = SectionDescriptors.try_emplace(SectionKind, Section); + assert(Result.second); + + It = Result.first; + } + + return *It->second; } /// Erases data of all sections. void eraseSections() { for (auto &Section : SectionDescriptors) - Section.second.clearAllSectionData(); + Section.second->clearAllSectionData(); } /// Enumerate all sections and call \p Handler for each. void forEach(function_ref Handler) { + for (auto &Section : SectionDescriptors) { + assert(Section.second.get() != nullptr); + Handler(*(Section.second)); + } + } + + /// Enumerate all sections and call \p Handler for each. 
+ void forEach( + function_ref Section)> Handler) { for (auto &Section : SectionDescriptors) Handler(Section.second); } @@ -456,10 +412,11 @@ class OutputSections { void assignSectionsOffsetAndAccumulateSize( std::array &SectionSizesAccumulator) { for (auto &Section : SectionDescriptors) { - Section.second.StartOffset = SectionSizesAccumulator[static_cast( - Section.second.getKind())]; - SectionSizesAccumulator[static_cast(Section.second.getKind())] += - Section.second.getContents().size(); + Section.second->StartOffset = + SectionSizesAccumulator[static_cast( + Section.second->getKind())]; + SectionSizesAccumulator[static_cast( + Section.second->getKind())] += Section.second->getContents().size(); } } @@ -505,7 +462,8 @@ class OutputSections { llvm::endianness Endianness = llvm::endianness::native; /// All keeping sections. - using SectionsSetTy = std::map; + using SectionsSetTy = + std::map>; SectionsSetTy SectionDescriptors; }; diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp index 4ad188b0796f5..d09dcf678e1e9 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp @@ -533,9 +533,8 @@ Error DwarfLinkerForBinary::copySwiftInterfaces(StringRef Architecture) const { return Error::success(); } -template void DwarfLinkerForBinary::copySwiftReflectionMetadata( - const llvm::dsymutil::DebugMapObject *Obj, OutStreamer *Streamer, + const llvm::dsymutil::DebugMapObject *Obj, classic::DwarfStreamer *Streamer, std::vector &SectionToOffsetInDwarf, std::vector &RelocationsToApply) { @@ -649,14 +648,32 @@ bool DwarfLinkerForBinary::linkImpl( }, Options.Translator ? TranslationLambda : nullptr); + std::unique_ptr Streamer; if (!Options.NoOutput) { - if (Error Err = GeneralLinker->createEmitter(Map.getTriple(), ObjectType, - OutFile)) { - handleAllErrors(std::move(Err), [&](const ErrorInfoBase &EI) { + if (Expected> StreamerOrErr = + classic::DwarfStreamer::createStreamer( + Map.getTriple(), ObjectType, OutFile, Options.Translator, + [&](const Twine &Warning, StringRef Context, + const DWARFDie *DIE) { + reportWarning(Warning, Context, DIE); + })) + Streamer = std::move(*StreamerOrErr); + else { + handleAllErrors(StreamerOrErr.takeError(), [&](const ErrorInfoBase &EI) { reportError(EI.message(), "dwarf streamer init"); }); return false; } + + if constexpr (std::is_same::value) { + GeneralLinker->setOutputDWARFHandler( + Map.getTriple(), + [&](std::shared_ptr Section) { + Streamer->emitSectionContents(Section->getContents(), + Section->getKind()); + }); + } else + GeneralLinker->setOutputDWARFEmitter(Streamer.get()); } remarks::RemarkLinker RL; @@ -749,7 +766,7 @@ bool DwarfLinkerForBinary::linkImpl( auto SectionToOffsetInDwarf = calculateStartOfStrippableReflectionSections(Map); for (const auto &Obj : Map.objects()) - copySwiftReflectionMetadata(Obj.get(), GeneralLinker->getEmitter(), + copySwiftReflectionMetadata(Obj.get(), Streamer.get(), SectionToOffsetInDwarf, RelocationsToApply); } @@ -796,7 +813,7 @@ bool DwarfLinkerForBinary::linkImpl( // Copy the module into the .swift_ast section. 
if (!Options.NoOutput) - GeneralLinker->getEmitter()->emitSwiftAST((*ErrorOrMem)->getBuffer()); + Streamer->emitSwiftAST((*ErrorOrMem)->getBuffer()); continue; } @@ -850,10 +867,9 @@ bool DwarfLinkerForBinary::linkImpl( ObjectType == Linker::OutputFileType::Object) return MachOUtils::generateDsymCompanion( Options.VFS, Map, Options.Translator, - *GeneralLinker->getEmitter()->getAsmPrinter().OutStreamer, OutFile, - RelocationsToApply); + *Streamer->getAsmPrinter().OutStreamer, OutFile, RelocationsToApply); - GeneralLinker->getEmitter()->finish(); + Streamer->finish(); return true; } diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.h b/llvm/tools/dsymutil/DwarfLinkerForBinary.h index 052c6a899fd76..14b9625117c2d 100644 --- a/llvm/tools/dsymutil/DwarfLinkerForBinary.h +++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.h @@ -253,9 +253,9 @@ class DwarfLinkerForBinary { Error copySwiftInterfaces(StringRef Architecture) const; - template void copySwiftReflectionMetadata( - const llvm::dsymutil::DebugMapObject *Obj, OutStreamer *Streamer, + const llvm::dsymutil::DebugMapObject *Obj, + classic::DwarfStreamer *Streamer, std::vector &SectionToOffsetInDwarf, std::vector &RelocationsToApply); diff --git a/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp b/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp index d677bb0962669..19eb1bed7578b 100644 --- a/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp +++ b/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp @@ -336,9 +336,26 @@ Error linkDebugInfoImpl(object::ObjectFile &File, const Options &Options, Linker::createLinker(ReportErr, ReportWarn); Triple TargetTriple = File.makeTriple(); - if (Error Err = DebugInfoLinker->createEmitter( - TargetTriple, Linker::OutputFileType::Object, OutStream)) - return Err; + std::unique_ptr Streamer; + if (Expected> StreamerOrErr = + classic::DwarfStreamer::createStreamer( + TargetTriple, Linker::OutputFileType::Object, OutStream, nullptr, + ReportWarn)) + Streamer = std::move(*StreamerOrErr); + else + return StreamerOrErr.takeError(); + + if constexpr (std::is_same::value) { + DebugInfoLinker->setOutputDWARFHandler( + TargetTriple, + [&](std::shared_ptr + Section) { + Streamer->emitSectionContents(Section->getContents(), + Section->getKind()); + }); + } else + DebugInfoLinker->setOutputDWARFEmitter(Streamer.get()); DebugInfoLinker->setEstimatedObjfilesAmount(1); DebugInfoLinker->setNumThreads(Options.NumThreads); @@ -445,7 +462,7 @@ Error linkDebugInfoImpl(object::ObjectFile &File, const Options &Options, if (Error Err = DebugInfoLinker->link()) return Err; - DebugInfoLinker->getEmitter()->finish(); + Streamer->finish(); return Error::success(); } From d1a2f11febfcdb2cc56a8c29012f948727c152d0 Mon Sep 17 00:00:00 2001 From: trevyn <230691+trevyn@users.noreply.github.com> Date: Fri, 19 Jan 2024 18:02:00 +0400 Subject: [PATCH 129/843] [builtins] Mark `int_lib.h` builtins as `static` (#69305) Mark the following symbols as `static` to prevent duplicate definitions: `__builtin_ctz` `__builtin_clz` `__builtin_clzll` `__builtin_sadd_overflow` >Without these then all of these functions show up in all object files which include int_lib.h on Windows. This'll help prevent duplicate symbols by ensuring they're not exported. 
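As background (standard C/C++ linkage rules, not anything specific to this patch): a function defined in a header with external linkage is compiled into every translation unit that includes the header, so the linker sees the same exported symbol many times over; marking the definition `static` gives it internal linkage, and each object file keeps a private, non-exported copy instead. A hypothetical header illustrating the difference (not the actual int_lib.h code):

// dup_symbols.h -- hypothetical illustration
int helper() { return 42; }           // external linkage: every TU including
                                      // this header exports `helper`, causing
                                      // duplicate-symbol link errors
static int helper_ok() { return 42; } // internal linkage: each TU keeps its
                                      // own non-exported copy, so no clash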
See: https://github.com/rust-lang/compiler-builtins/issues/167 https://reviews.llvm.org/D34599 --- compiler-rt/lib/builtins/int_lib.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/builtins/int_lib.h b/compiler-rt/lib/builtins/int_lib.h index 04ea2d910574b..f6c1b7cff4b99 100644 --- a/compiler-rt/lib/builtins/int_lib.h +++ b/compiler-rt/lib/builtins/int_lib.h @@ -119,14 +119,14 @@ COMPILER_RT_ABI tu_int __udivmodti4(tu_int a, tu_int b, tu_int *rem); #if defined(_MSC_VER) && !defined(__clang__) #include -int __inline __builtin_ctz(uint32_t value) { +static int __inline __builtin_ctz(uint32_t value) { unsigned long trailing_zero = 0; if (_BitScanForward(&trailing_zero, value)) return trailing_zero; return 32; } -int __inline __builtin_clz(uint32_t value) { +static int __inline __builtin_clz(uint32_t value) { unsigned long leading_zero = 0; if (_BitScanReverse(&leading_zero, value)) return 31 - leading_zero; @@ -134,14 +134,14 @@ int __inline __builtin_clz(uint32_t value) { } #if defined(_M_ARM) || defined(_M_X64) -int __inline __builtin_clzll(uint64_t value) { +static int __inline __builtin_clzll(uint64_t value) { unsigned long leading_zero = 0; if (_BitScanReverse64(&leading_zero, value)) return 63 - leading_zero; return 64; } #else -int __inline __builtin_clzll(uint64_t value) { +static int __inline __builtin_clzll(uint64_t value) { if (value == 0) return 64; uint32_t msh = (uint32_t)(value >> 32); @@ -154,7 +154,7 @@ int __inline __builtin_clzll(uint64_t value) { #define __builtin_clzl __builtin_clzll -bool __inline __builtin_sadd_overflow(int x, int y, int *result) { +static bool __inline __builtin_sadd_overflow(int x, int y, int *result) { if ((x < 0) != (y < 0)) { *result = x + y; return false; From 0f20d5a8b10b98c83c73b6b7808b7bf88aaaaa2e Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 19 Jan 2024 14:59:14 +0100 Subject: [PATCH 130/843] [AsmParser] Deduplicate argument list parsing code (NFC) The handling for parsing the first argument and all later arguments was duplicated. Consolidate them into a single loop. --- llvm/lib/AsmParser/LLParser.cpp | 53 +++++---------------------------- 1 file changed, 7 insertions(+), 46 deletions(-) diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index 2e859c42d5575..ea7f9a00d719f 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -2982,49 +2982,8 @@ bool LLParser::parseArgumentList(SmallVectorImpl &ArgList, assert(Lex.getKind() == lltok::lparen); Lex.Lex(); // eat the (. 
- if (Lex.getKind() == lltok::rparen) { - // empty - } else if (Lex.getKind() == lltok::dotdotdot) { - IsVarArg = true; - Lex.Lex(); - } else { - LocTy TypeLoc = Lex.getLoc(); - Type *ArgTy = nullptr; - AttrBuilder Attrs(M->getContext()); - std::string Name; - - if (parseType(ArgTy) || parseOptionalParamAttrs(Attrs)) - return true; - - if (ArgTy->isVoidTy()) - return error(TypeLoc, "argument can not have void type"); - - if (Lex.getKind() == lltok::LocalVar) { - Name = Lex.getStrVal(); - Lex.Lex(); - } else { - unsigned ArgID; - if (Lex.getKind() == lltok::LocalVarID) { - ArgID = Lex.getUIntVal(); - if (checkValueID(TypeLoc, "argument", "%", CurValID, ArgID)) - return true; - Lex.Lex(); - } else { - ArgID = CurValID; - } - - UnnamedArgNums.push_back(ArgID); - CurValID = ArgID + 1; - } - - if (!FunctionType::isValidArgumentType(ArgTy)) - return error(TypeLoc, "invalid type for function argument"); - - ArgList.emplace_back(TypeLoc, ArgTy, - AttributeSet::get(ArgTy->getContext(), Attrs), - std::move(Name)); - - while (EatIfPresent(lltok::comma)) { + if (Lex.getKind() != lltok::rparen) { + do { // Handle ... at end of arg list. if (EatIfPresent(lltok::dotdotdot)) { IsVarArg = true; @@ -3032,13 +2991,16 @@ bool LLParser::parseArgumentList(SmallVectorImpl &ArgList, } // Otherwise must be an argument type. - TypeLoc = Lex.getLoc(); + LocTy TypeLoc = Lex.getLoc(); + Type *ArgTy = nullptr; + AttrBuilder Attrs(M->getContext()); if (parseType(ArgTy) || parseOptionalParamAttrs(Attrs)) return true; if (ArgTy->isVoidTy()) return error(TypeLoc, "argument can not have void type"); + std::string Name; if (Lex.getKind() == lltok::LocalVar) { Name = Lex.getStrVal(); Lex.Lex(); @@ -3054,7 +3016,6 @@ bool LLParser::parseArgumentList(SmallVectorImpl &ArgList, } UnnamedArgNums.push_back(ArgID); CurValID = ArgID + 1; - Name = ""; } if (!ArgTy->isFirstClassType()) @@ -3063,7 +3024,7 @@ bool LLParser::parseArgumentList(SmallVectorImpl &ArgList, ArgList.emplace_back(TypeLoc, ArgTy, AttributeSet::get(ArgTy->getContext(), Attrs), std::move(Name)); - } + } while (EatIfPresent(lltok::comma)); } return parseToken(lltok::rparen, "expected ')' at end of argument list"); From aac1a41c11b4e727df07a6119cddebdc783212c7 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 19 Jan 2024 14:02:18 +0000 Subject: [PATCH 131/843] [llvm-exegesis] Fix MSVC "not all control paths return a value" warning. NFC. --- llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp index 60264ca87a988..969d9c163f8ab 100644 --- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp +++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp @@ -198,6 +198,7 @@ const char *validationEventToString(exegesis::ValidationEvent VE) { case exegesis::ValidationEvent::InstructionRetired: return "instructions-retired"; } + llvm_unreachable("Unhandled exegesis::ValidationEvent enum"); } Expected stringToValidationEvent(StringRef Input) { From 836dcdb84ab91aa2b69a6cec412d83c840a7196d Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 19 Jan 2024 14:03:42 +0000 Subject: [PATCH 132/843] [llvm-jitlink] Fix MSVC "not all control paths return a value" warning. NFC. 
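Both MSVC fixes in this series use the same idiom: when a switch covers every enumerator and each case returns, Clang and GCC can prove control never falls off the end of the function, but MSVC still emits C4715 ("not all control paths return a value"). Adding llvm_unreachable() after the switch documents the invariant and silences the warning without introducing a bogus default case. A rough sketch of the pattern, using a hypothetical enum rather than code from either patch:

#include "llvm/Support/ErrorHandling.h" // for llvm_unreachable

enum class Color { Red, Green };

static const char *toString(Color C) {
  switch (C) {
  case Color::Red:
    return "red";
  case Color::Green:
    return "green";
  }
  // All enumerators are handled above; this statement is never reached, but
  // it convinces MSVC that no control path falls off the end.
  llvm_unreachable("Unhandled Color enum");
}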
---
 llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp
index c6b4218aad7af..9f7d66d7b30a1 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp
@@ -95,6 +95,7 @@ static Error registerSymbol(LinkGraph &G, Symbol &Sym, Session::FileInfo &FI,
   case Other:
     return Error::success();
   }
+  llvm_unreachable("Unhandled SectionType enum");
 }

 namespace llvm {

From eaa8def929b17ea4c7a13a7bf860ac07bfd5bf14 Mon Sep 17 00:00:00 2001
From: jeanPerier
Date: Fri, 19 Jan 2024 15:09:25 +0100
Subject: [PATCH 133/843] [flang] Expand parent component in procedure pointer component ref (#78593)

For simplicity, lowering relies on semantics expansion of parent
components in designators. This was not done in `call x%p()` where `p`
is a procedure pointer component of a parent component of `x`.

Do it, and turn the lowering TODO into a new lowering TODO for
`call bar(x%type_bound_procedure)` (passing a type bound procedure is
allowed as an extension, but lowering does not handle this extension
yet; this is a lowering issue that will be addressed in a different
patch).
---
 flang/include/flang/Semantics/expression.h    |  4 +--
 .../lib/Lower/ConvertProcedureDesignator.cpp  |  6 ++--
 flang/lib/Semantics/expression.cpp            | 25 +++++++++++-----
 .../HLFIR/proc-pointer-comp-in-parent.f90     | 30 +++++++++++++++++++
 4 files changed, 53 insertions(+), 12 deletions(-)
 create mode 100644 flang/test/Lower/HLFIR/proc-pointer-comp-in-parent.f90

diff --git a/flang/include/flang/Semantics/expression.h b/flang/include/flang/Semantics/expression.h
index 790d0a4d6d064..a330e241c2cda 100644
--- a/flang/include/flang/Semantics/expression.h
+++ b/flang/include/flang/Semantics/expression.h
@@ -327,8 +327,8 @@ class ExpressionAnalyzer {
       const parser::SectionSubscript &);
   std::vector AnalyzeSectionSubscripts(
       const std::list &);
-  std::optional CreateComponent(
-      DataRef &&, const Symbol &, const semantics::Scope &);
+  std::optional CreateComponent(DataRef &&, const Symbol &,
+      const semantics::Scope &, bool C919bAlreadyEnforced = false);
   MaybeExpr CompleteSubscripts(ArrayRef &&);
   MaybeExpr ApplySubscripts(DataRef &&, std::vector &&);
   void CheckConstantSubscripts(ArrayRef &);
diff --git a/flang/lib/Lower/ConvertProcedureDesignator.cpp b/flang/lib/Lower/ConvertProcedureDesignator.cpp
index 0806f78450dd6..2446be3a1908b 100644
--- a/flang/lib/Lower/ConvertProcedureDesignator.cpp
+++ b/flang/lib/Lower/ConvertProcedureDesignator.cpp
@@ -113,10 +113,10 @@ static hlfir::EntityWithAttributes designateProcedurePointerComponent(
   auto recordType =
       hlfir::getFortranElementType(base.getType()).cast();
   mlir::Type fieldType = recordType.getType(fieldName);
-  // FIXME: semantics is not expanding intermediate parent components in:
-  // call x%p() where p is a component of a parent type of x type.
+  // Note: semantics turns x%p() into x%t%p() when the procedure pointer
+  // component is part of parent component t.
if (!fieldType) - TODO(loc, "reference to procedure pointer component from parent type"); + TODO(loc, "passing type bound procedure (extension)"); mlir::Type designatorType = fir::ReferenceType::get(fieldType); mlir::Value compRef = builder.create( loc, designatorType, base, fieldName, diff --git a/flang/lib/Semantics/expression.cpp b/flang/lib/Semantics/expression.cpp index bfc380183e23f..44e16ac938737 100644 --- a/flang/lib/Semantics/expression.cpp +++ b/flang/lib/Semantics/expression.cpp @@ -1296,9 +1296,11 @@ static NamedEntity IgnoreAnySubscripts(Designator &&designator) { } // Components of parent derived types are explicitly represented as such. -std::optional ExpressionAnalyzer::CreateComponent( - DataRef &&base, const Symbol &component, const semantics::Scope &scope) { - if (IsAllocatableOrPointer(component) && base.Rank() > 0) { // C919b +std::optional ExpressionAnalyzer::CreateComponent(DataRef &&base, + const Symbol &component, const semantics::Scope &scope, + bool C919bAlreadyEnforced) { + if (!C919bAlreadyEnforced && IsAllocatableOrPointer(component) && + base.Rank() > 0) { // C919b Say("An allocatable or pointer component reference must be applied to a scalar base"_err_en_US); } if (&component.owner() == &scope) { @@ -1313,7 +1315,7 @@ std::optional ExpressionAnalyzer::CreateComponent( parentType->derivedTypeSpec().scope()}) { return CreateComponent( DataRef{Component{std::move(base), *parentComponent}}, - component, *parentScope); + component, *parentScope, C919bAlreadyEnforced); } } } @@ -2391,9 +2393,18 @@ auto ExpressionAnalyzer::AnalyzeProcedureComponentRef( ProcedureDesignator{*resolution}, std::move(arguments)}; } else if (dataRef.has_value()) { if (sym->attrs().test(semantics::Attr::NOPASS)) { - return CalleeAndArguments{ - ProcedureDesignator{Component{std::move(*dataRef), *sym}}, - std::move(arguments)}; + const auto *dtSpec{GetDerivedTypeSpec(dtExpr->GetType())}; + if (dtSpec && dtSpec->scope()) { + if (auto component{CreateComponent(std::move(*dataRef), *sym, + *dtSpec->scope(), /*C919bAlreadyEnforced=*/true)}) { + return CalleeAndArguments{ + ProcedureDesignator{std::move(*component)}, + std::move(arguments)}; + } + } + Say(sc.component.source, + "Component is not in scope of base derived type"_err_en_US); + return std::nullopt; } else { AddPassArg(arguments, Expr{Designator{std::move(*dataRef)}}, diff --git a/flang/test/Lower/HLFIR/proc-pointer-comp-in-parent.f90 b/flang/test/Lower/HLFIR/proc-pointer-comp-in-parent.f90 new file mode 100644 index 0000000000000..5b37b6a8651dd --- /dev/null +++ b/flang/test/Lower/HLFIR/proc-pointer-comp-in-parent.f90 @@ -0,0 +1,30 @@ +! Test that parent components are made explicit in reference to +! procedure pointer from parent type. +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s + +module type_defs + interface + subroutine s1 + end subroutine + real function s2() + end function + end interface + type :: t + procedure(s1), pointer, nopass :: p1 + procedure(s2), pointer, nopass :: p2 + end type + type, extends(t) :: t2 + end type +end module + +! CHECK-LABEL: func.func @_QPtest( +subroutine test (x) +use type_defs, only : t2 +type(t2) :: x +call x%p1() +! CHECK: %[[T_REF1:.*]] = hlfir.designate %{{.*}}{"t"} +! CHECK: hlfir.designate %[[T_REF1]]{"p1"} +print *, x%p2() +! CHECK: %[[T_REF2:.*]] = hlfir.designate %{{.*}}{"t"} +! 
CHECK: hlfir.designate %[[T_REF2]]{"p2"} +end subroutine From 5a7f9a5a9c85c9b7851bbf267f5a12b9211f810e Mon Sep 17 00:00:00 2001 From: Yi Wu <43659785+yi-wu-arm@users.noreply.github.com> Date: Fri, 19 Jan 2024 14:18:57 +0000 Subject: [PATCH 134/843] [flang] use setsid to assign the child to a new session to prevent zombies, as it will be cleaned up by the init process (#77944) When using `setsid()` in a child process created by `fork()`, a new session is created, and the child becomes a session leader. If the parent process terminates before the child, the child becomes an orphan and is adopted by the `init` process. The `init` process will eventually clean up the child process once it exits. However, killing the parent does not automatically kill the child; the child will continue running until it exits. Proper cleanup involves waiting for the child process to exit using `wait()` or `waitpid()` in the parent process to avoid zombie processes, but this approach is not valid for `EXECUTE_COMMAND_LINE` with async mode. Fix: https://github.com/llvm/llvm-project/issues/77803 --- flang/runtime/execute.cpp | 15 +++++++++++++-- flang/unittests/Runtime/CommandTest.cpp | 18 ++++++++++++++++++ 2 files changed, 31 insertions(+), 2 deletions(-) diff --git a/flang/runtime/execute.cpp b/flang/runtime/execute.cpp index d38cb8384bc86..c84930c5c3287 100644 --- a/flang/runtime/execute.cpp +++ b/flang/runtime/execute.cpp @@ -181,8 +181,6 @@ void RTNAME(ExecuteCommandLine)(const Descriptor &command, bool wait, } FreeMemory(wcmd); #else - // terminated children do not become zombies - signal(SIGCHLD, SIG_IGN); pid_t pid{fork()}; if (pid < 0) { if (!cmdstat) { @@ -192,6 +190,19 @@ void RTNAME(ExecuteCommandLine)(const Descriptor &command, bool wait, CheckAndCopyCharsToDescriptor(cmdmsg, "Fork failed"); } } else if (pid == 0) { + // Create a new session, let init process take care of zombie child + if (setsid() == -1) { + if (!cmdstat) { + terminator.Crash("setsid() failed with errno: %d, asynchronous " + "process initiation failed.", + errno); + } else { + StoreIntToDescriptor(cmdstat, ASYNC_NO_SUPPORT_ERR, terminator); + CheckAndCopyCharsToDescriptor(cmdmsg, + "setsid() failed, asynchronous process initiation failed."); + } + exit(EXIT_FAILURE); + } int status{std::system(newCmd)}; TerminationCheck(status, cmdstat, cmdmsg, terminator); exit(status); diff --git a/flang/unittests/Runtime/CommandTest.cpp b/flang/unittests/Runtime/CommandTest.cpp index 4c3618d0e3d90..b2f6fe6177ed5 100644 --- a/flang/unittests/Runtime/CommandTest.cpp +++ b/flang/unittests/Runtime/CommandTest.cpp @@ -404,6 +404,24 @@ TEST_F(ZeroArguments, ECLInvalidCommandParentNotTerminatedAsync) { CheckDescriptorEqStr(cmdMsg.get(), "No change"); } +TEST_F(ZeroArguments, ECLInvalidCommandAsyncDontAffectSync) { + OwningPtr command{CharDescriptor("echo hi")}; + + EXPECT_NO_FATAL_FAILURE(RTNAME(ExecuteCommandLine)( + *command.get(), false, nullptr, nullptr, nullptr)); + EXPECT_NO_FATAL_FAILURE(RTNAME(ExecuteCommandLine)( + *command.get(), true, nullptr, nullptr, nullptr)); +} + +TEST_F(ZeroArguments, ECLInvalidCommandAsyncDontAffectAsync) { + OwningPtr command{CharDescriptor("echo hi")}; + + EXPECT_NO_FATAL_FAILURE(RTNAME(ExecuteCommandLine)( + *command.get(), false, nullptr, nullptr, nullptr)); + EXPECT_NO_FATAL_FAILURE(RTNAME(ExecuteCommandLine)( + *command.get(), false, nullptr, nullptr, nullptr)); +} + static const char *oneArgArgv[]{"aProgram", "anArgumentOfLength20"}; class OneArgument : public CommandFixture { protected: From d54dfdd1b53ff72344287d250c2b67329792c840 Mon
Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 19 Jan 2024 15:19:58 +0100 Subject: [PATCH 135/843] [Clang] Fix build with GCC 14 on ARM (#78704) GCC 14 defines `__arm_streaming` as a macro expanding to `[[arm::streaming]]`. Due to the nested macro use, this gets expanded prior to concatenation. It doesn't look like C++ has a really clean way to prevent macro expansion. The best I have found is to use `EMPTY ## X` where `EMPTY` is an empty macro argument, so this is the hack I'm implementing here. Fixes https://github.com/llvm/llvm-project/issues/78691. --- clang/include/clang/Basic/AttributeCommonInfo.h | 2 +- clang/include/clang/Basic/TokenKinds.def | 3 ++- clang/utils/TableGen/ClangAttrEmitter.cpp | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Basic/AttributeCommonInfo.h b/clang/include/clang/Basic/AttributeCommonInfo.h index d787e4959bfee..ef2ddf525c981 100644 --- a/clang/include/clang/Basic/AttributeCommonInfo.h +++ b/clang/include/clang/Basic/AttributeCommonInfo.h @@ -260,7 +260,7 @@ inline bool doesKeywordAttributeTakeArgs(tok::TokenKind Kind) { switch (Kind) { default: return false; -#define KEYWORD_ATTRIBUTE(NAME, HASARG) \ +#define KEYWORD_ATTRIBUTE(NAME, HASARG, ...) \ case tok::kw_##NAME: \ return HASARG; #include "clang/Basic/RegularKeywordAttrInfo.inc" diff --git a/clang/include/clang/Basic/TokenKinds.def b/clang/include/clang/Basic/TokenKinds.def index d15e4970b7d8f..c10e2adfbe6e9 100644 --- a/clang/include/clang/Basic/TokenKinds.def +++ b/clang/include/clang/Basic/TokenKinds.def @@ -760,8 +760,9 @@ KEYWORD(__builtin_available , KEYALL) KEYWORD(__builtin_sycl_unique_stable_name, KEYSYCL) // Keywords defined by Attr.td. +// The "EMPTY ## X" is used to prevent early macro-expansion of the keyword. #ifndef KEYWORD_ATTRIBUTE -#define KEYWORD_ATTRIBUTE(X, ...) KEYWORD(X, KEYALL) +#define KEYWORD_ATTRIBUTE(X, HASARG, EMPTY) KEYWORD(EMPTY ## X, KEYALL) #endif #include "clang/Basic/RegularKeywordAttrInfo.inc" diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index a1827f8ce0ead..3888e6c08ab0f 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -3589,7 +3589,7 @@ void EmitClangRegularKeywordAttributeInfo(RecordKeeper &Records, OS << "KEYWORD_ATTRIBUTE(" << S.getSpellingRecord().getValueAsString("Name") << ", " - << (HasArgs ? "true" : "false") << ")\n"; + << (HasArgs ? "true" : "false") << ", )\n"; } OS << "#undef KEYWORD_ATTRIBUTE\n"; } From a2a0089ac3a5781ba74d4d319c87c9e8b46d4eda Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 19 Jan 2024 14:21:26 +0000 Subject: [PATCH 136/843] [X86] movsd/movss/movd/movq - add support for constant comments (#78601) If we're loading a constant value, print the constant (and the zero upper elements) instead of just the shuffle mask. This did require me to move the shuffle mask handling into addConstantComments as we can't handle this in the MC layer. 
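To make the new output concrete, here is a minimal standalone sketch of the comment layout (illustrative C++, not the LLVM implementation: the helper name is invented, and printf's two-digit exponent differs slightly from APFloat's formatting):

#include <cstdio>

// Print a comment such as [1.0E+00,0.0E+00,0.0E+00,0.0E+00] for a scalar
// constant load: element 0 is the loaded constant and the remaining
// VecWidth/SclWidth - 1 elements are the zeroed upper lanes.
static void printScalarLoadComment(double Val, int SclWidth, int VecWidth) {
  std::printf("[%.1E", Val);
  for (int I = 1, E = VecWidth / SclWidth; I < E; ++I)
    std::printf(",%.1E", 0.0);
  std::printf("]\n");
}

int main() {
  printScalarLoadComment(1.0, 32, 128); // movss-style: 4 x 32-bit lanes
  printScalarLoadComment(1.0, 64, 128); // movsd-style: 2 x 64-bit lanes
  return 0;
}

The zeroed tail reflects that scalar loads such as movss/movsd clear the upper lanes of the destination register, which is what the old mem[0],zero,... shuffle comments encoded.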
--- lld/test/MachO/lto-mattrs.ll | 3 +- .../X86/MCTargetDesc/X86InstComments.cpp | 32 +- llvm/lib/Target/X86/X86MCInstLower.cpp | 90 ++ .../CodeGen/X86/2008-09-25-sseregparm-1.ll | 4 +- llvm/test/CodeGen/X86/GlobalISel/fconstant.ll | 4 +- .../X86/asm-reg-type-mismatch-avx512.ll | 2 +- llvm/test/CodeGen/X86/atomic-fp.ll | 36 +- llvm/test/CodeGen/X86/avx512-cmp.ll | 2 +- .../test/CodeGen/X86/avx512-fma-intrinsics.ll | 19 +- .../test/CodeGen/X86/avx512-insert-extract.ll | 4 +- .../CodeGen/X86/avx512-intrinsics-upgrade.ll | 48 +- llvm/test/CodeGen/X86/avx512-mov.ll | 24 +- .../test/CodeGen/X86/avx512-regcall-NoMask.ll | 12 +- llvm/test/CodeGen/X86/avx512-vec-cmp.ll | 12 +- llvm/test/CodeGen/X86/bc-extract.ll | 2 +- llvm/test/CodeGen/X86/bfloat.ll | 4 +- llvm/test/CodeGen/X86/buildvec-insertvec.ll | 2 +- llvm/test/CodeGen/X86/cmov-fp.ll | 48 +- llvm/test/CodeGen/X86/cmovcmov.ll | 4 +- llvm/test/CodeGen/X86/combine-fabs.ll | 4 +- .../CodeGen/X86/combineIncDecVector-crash.ll | 2 +- llvm/test/CodeGen/X86/cvtv2f32.ll | 2 +- llvm/test/CodeGen/X86/dagcombine-select.ll | 12 +- llvm/test/CodeGen/X86/deopt-intrinsic.ll | 2 +- llvm/test/CodeGen/X86/extract-fp.ll | 12 +- llvm/test/CodeGen/X86/extractelement-fp.ll | 4 +- llvm/test/CodeGen/X86/extractelement-load.ll | 8 +- llvm/test/CodeGen/X86/fadd-combines.ll | 2 +- llvm/test/CodeGen/X86/fast-isel-constpool.ll | 32 +- llvm/test/CodeGen/X86/fdiv-combine-vec.ll | 24 +- llvm/test/CodeGen/X86/fdiv-combine.ll | 12 +- .../CodeGen/X86/fma-intrinsics-canonical.ll | 32 +- .../CodeGen/X86/fma-intrinsics-x86-upgrade.ll | 64 +- llvm/test/CodeGen/X86/fma-intrinsics-x86.ll | 68 +- llvm/test/CodeGen/X86/fma-scalar-memfold.ll | 128 +- llvm/test/CodeGen/X86/fma.ll | 132 +- llvm/test/CodeGen/X86/fma_patterns.ll | 12 +- llvm/test/CodeGen/X86/fmf-flags.ll | 2 +- llvm/test/CodeGen/X86/fminimum-fmaximum.ll | 24 +- .../X86/fold-int-pow2-with-fmul-or-fdiv.ll | 112 +- llvm/test/CodeGen/X86/fp-intrinsics-fma.ll | 24 +- llvm/test/CodeGen/X86/fp-intrinsics.ll | 116 +- llvm/test/CodeGen/X86/fp-logic.ll | 12 +- .../X86/fp-strict-scalar-fptoint-fp16.ll | 4 +- .../CodeGen/X86/fp-strict-scalar-fptoint.ll | 24 +- llvm/test/CodeGen/X86/fp-undef.ll | 50 +- llvm/test/CodeGen/X86/fpclamptosat.ll | 16 +- llvm/test/CodeGen/X86/fpclamptosat_vec.ll | 20 +- llvm/test/CodeGen/X86/fptosi-sat-scalar.ll | 56 +- .../test/CodeGen/X86/fptosi-sat-vector-128.ll | 30 +- llvm/test/CodeGen/X86/fptoui-sat-scalar.ll | 46 +- .../test/CodeGen/X86/fptoui-sat-vector-128.ll | 24 +- llvm/test/CodeGen/X86/ftrunc.ll | 4 +- llvm/test/CodeGen/X86/half.ll | 14 +- llvm/test/CodeGen/X86/insertelement-ones.ll | 6 +- llvm/test/CodeGen/X86/ldexp.ll | 4 +- .../test/CodeGen/X86/load-scalar-as-vector.ll | 16 +- llvm/test/CodeGen/X86/logical-load-fold.ll | 8 +- llvm/test/CodeGen/X86/lsr-static-addr.ll | 4 +- .../X86/machine-trace-metrics-crash.ll | 4 +- llvm/test/CodeGen/X86/masked-iv-safe.ll | 48 +- llvm/test/CodeGen/X86/masked-iv-unsafe.ll | 78 +- .../X86/merge-consecutive-loads-128.ll | 2 +- llvm/test/CodeGen/X86/neg_fp.ll | 2 +- llvm/test/CodeGen/X86/nontemporal-4.ll | 6 +- llvm/test/CodeGen/X86/oss-fuzz-25184.ll | 2 +- llvm/test/CodeGen/X86/peep-test-0.ll | 2 +- llvm/test/CodeGen/X86/pow.ll | 20 +- llvm/test/CodeGen/X86/powi-int32min.ll | 2 +- llvm/test/CodeGen/X86/pr23103.ll | 2 +- llvm/test/CodeGen/X86/pr37879.ll | 2 +- llvm/test/CodeGen/X86/pr40539.ll | 4 +- llvm/test/CodeGen/X86/pr44749.ll | 6 +- llvm/test/CodeGen/X86/pr59258.ll | 16 +- llvm/test/CodeGen/X86/pr59305.ll | 6 +- llvm/test/CodeGen/X86/pseudo_cmov_lower2.ll | 
8 +- llvm/test/CodeGen/X86/recip-fastmath.ll | 32 +- llvm/test/CodeGen/X86/recip-fastmath2.ll | 64 +- llvm/test/CodeGen/X86/recip-pic.ll | 2 +- llvm/test/CodeGen/X86/scalar-fp-to-i64.ll | 24 +- llvm/test/CodeGen/X86/scalarize-fp.ll | 48 +- .../CodeGen/X86/select-of-fp-constants.ll | 6 +- llvm/test/CodeGen/X86/select.ll | 2 +- llvm/test/CodeGen/X86/select_const.ll | 2 +- llvm/test/CodeGen/X86/setcc-combine.ll | 2 +- llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll | 4 +- .../CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll | 4 +- llvm/test/CodeGen/X86/sqrt-fastmath.ll | 12 +- llvm/test/CodeGen/X86/sse-fcopysign.ll | 4 +- .../CodeGen/X86/sse-intrinsics-fast-isel.ll | 184 +-- llvm/test/CodeGen/X86/sse-load-ret.ll | 4 +- llvm/test/CodeGen/X86/sse-minmax.ll | 66 +- llvm/test/CodeGen/X86/sse1.ll | 16 +- .../CodeGen/X86/sse2-intrinsics-fast-isel.ll | 384 ++--- .../X86/sse2-intrinsics-x86-upgrade.ll | 24 +- llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll | 4 +- llvm/test/CodeGen/X86/sse41.ll | 56 +- .../CodeGen/X86/stack-folding-int-avx2.ll | 6 +- llvm/test/CodeGen/X86/swifterror.ll | 32 +- .../CodeGen/X86/vec-strict-fptoint-128.ll | 56 +- .../CodeGen/X86/vec-strict-fptoint-256.ll | 16 +- .../CodeGen/X86/vec-strict-fptoint-512.ll | 4 +- llvm/test/CodeGen/X86/vec_fp_to_int.ll | 34 +- .../vector-constrained-fp-intrinsics-fma.ll | 20 +- .../X86/vector-constrained-fp-intrinsics.ll | 1250 ++++++++--------- llvm/test/CodeGen/X86/vector-reduce-fadd.ll | 6 +- .../CodeGen/X86/vector-shuffle-combining.ll | 8 +- llvm/test/CodeGen/X86/vselect-zero.ll | 16 +- .../CodeGen/X86/widen-load-of-small-alloca.ll | 4 +- llvm/test/CodeGen/X86/x86-64-varargs.ll | 4 +- 110 files changed, 2092 insertions(+), 2038 deletions(-) diff --git a/lld/test/MachO/lto-mattrs.ll b/lld/test/MachO/lto-mattrs.ll index 87d0d550110f2..f658b485a1792 100644 --- a/lld/test/MachO/lto-mattrs.ll +++ b/lld/test/MachO/lto-mattrs.ll @@ -19,8 +19,7 @@ ; NO-FMA: <_foo>: ; NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1 ; NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0 -; NO-FMA-NEXT: vmovss [[#]](%rip), %xmm2 ## xmm2 = -; NO-FMA-NEXT: ## 0x +; NO-FMA-NEXT: vmovss [[#]](%rip), %xmm2 ## 0x ; NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0 ; NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0 ; NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0 diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp index 20b37d5a99902..619328af12719 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp @@ -1212,15 +1212,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VMOVSDZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); - [[fallthrough]]; - - case X86::MOVSDrm_alt: - case X86::MOVSDrm: - case X86::VMOVSDrm_alt: - case X86::VMOVSDrm: - case X86::VMOVSDZrm: - case X86::VMOVSDZrm_alt: - DecodeScalarMoveMask(2, nullptr == Src2Name, ShuffleMask); + DecodeScalarMoveMask(2, false, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -1229,15 +1221,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VMOVSSZrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); - [[fallthrough]]; - - case X86::MOVSSrm: - case X86::MOVSSrm_alt: - case X86::VMOVSSrm: - case X86::VMOVSSrm_alt: - case X86::VMOVSSZrm: - case X86::VMOVSSZrm_alt: - DecodeScalarMoveMask(4, nullptr == Src2Name, ShuffleMask); + DecodeScalarMoveMask(4, 
false, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; @@ -1248,22 +1232,10 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, case X86::VMOVZPQILo2PQIrr: case X86::VMOVZPQILo2PQIZrr: Src1Name = getRegName(MI->getOperand(1).getReg()); - [[fallthrough]]; - - case X86::MOVQI2PQIrm: - case X86::VMOVQI2PQIrm: - case X86::VMOVQI2PQIZrm: DecodeZeroMoveLowMask(2, ShuffleMask); DestName = getRegName(MI->getOperand(0).getReg()); break; - case X86::MOVDI2PDIrm: - case X86::VMOVDI2PDIrm: - case X86::VMOVDI2PDIZrm: - DecodeZeroMoveLowMask(4, ShuffleMask); - DestName = getRegName(MI->getOperand(0).getReg()); - break; - case X86::EXTRQI: if (MI->getOperand(2).isImm() && MI->getOperand(3).isImm()) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 048902681103b..fe3d16a949342 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1806,6 +1806,96 @@ static void addConstantComments(const MachineInstr *MI, break; } + case X86::MOVSDrm: + case X86::MOVSSrm: + case X86::VMOVSDrm: + case X86::VMOVSSrm: + case X86::VMOVSDZrm: + case X86::VMOVSSZrm: + case X86::MOVSDrm_alt: + case X86::MOVSSrm_alt: + case X86::VMOVSDrm_alt: + case X86::VMOVSSrm_alt: + case X86::VMOVSDZrm_alt: + case X86::VMOVSSZrm_alt: + case X86::MOVDI2PDIrm: + case X86::MOVQI2PQIrm: + case X86::VMOVDI2PDIrm: + case X86::VMOVQI2PQIrm: + case X86::VMOVDI2PDIZrm: + case X86::VMOVQI2PQIZrm: { + assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && + "Unexpected number of operands!"); + int SclWidth = 32; + int VecWidth = 128; + + switch (MI->getOpcode()) { + default: + llvm_unreachable("Invalid opcode"); + case X86::MOVSDrm: + case X86::VMOVSDrm: + case X86::VMOVSDZrm: + case X86::MOVSDrm_alt: + case X86::VMOVSDrm_alt: + case X86::VMOVSDZrm_alt: + case X86::MOVQI2PQIrm: + case X86::VMOVQI2PQIrm: + case X86::VMOVQI2PQIZrm: + SclWidth = 64; + VecWidth = 128; + break; + case X86::MOVSSrm: + case X86::VMOVSSrm: + case X86::VMOVSSZrm: + case X86::MOVSSrm_alt: + case X86::VMOVSSrm_alt: + case X86::VMOVSSZrm_alt: + case X86::MOVDI2PDIrm: + case X86::VMOVDI2PDIrm: + case X86::VMOVDI2PDIZrm: + SclWidth = 32; + VecWidth = 128; + break; + } + std::string Comment; + raw_string_ostream CS(Comment); + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + + if (auto *C = + X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { + if (SclWidth == C->getType()->getScalarSizeInBits()) { + if (auto *CI = dyn_cast(C)) { + CS << "["; + printConstant(CI->getValue(), CS); + for (int I = 1, E = VecWidth / SclWidth; I < E; ++I) { + CS << ",0"; + } + CS << "]"; + OutStreamer.AddComment(CS.str()); + break; // early-out + } + if (auto *CF = dyn_cast(C)) { + CS << "["; + printConstant(CF->getValue(), CS); + APFloat ZeroFP = APFloat::getZero(CF->getValue().getSemantics()); + for (int I = 1, E = VecWidth / SclWidth; I < E; ++I) { + CS << ","; + printConstant(ZeroFP, CS); + } + CS << "]"; + OutStreamer.AddComment(CS.str()); + break; // early-out + } + } + } + + // We didn't find a constant load, fallback to a shuffle mask decode. + CS << (SclWidth == 32 ? 
"mem[0],zero,zero,zero" : "mem[0],zero"); + OutStreamer.AddComment(CS.str()); + break; + } + #define MOV_CASE(Prefix, Suffix) \ case X86::Prefix##MOVAPD##Suffix##rm: \ case X86::Prefix##MOVAPS##Suffix##rm: \ diff --git a/llvm/test/CodeGen/X86/2008-09-25-sseregparm-1.ll b/llvm/test/CodeGen/X86/2008-09-25-sseregparm-1.ll index a2dd55767a7ef..6288f7e1d039c 100644 --- a/llvm/test/CodeGen/X86/2008-09-25-sseregparm-1.ll +++ b/llvm/test/CodeGen/X86/2008-09-25-sseregparm-1.ll @@ -5,7 +5,7 @@ define inreg double @foo1() nounwind { ; CHECK-LABEL: foo1: ; CHECK: # %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; CHECK-NEXT: retl ret double 1.0 } @@ -13,7 +13,7 @@ define inreg double @foo1() nounwind { define inreg float @foo2() nounwind { ; CHECK-LABEL: foo2: ; CHECK: # %bb.0: -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: retl ret float 1.0 } diff --git a/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll b/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll index 1e08c804af586..a9b2037e9947a 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/fconstant.ll @@ -8,7 +8,7 @@ define void @test_float(ptr %a , float %b) { ; CHECK64_SMALL-LABEL: test_float: ; CHECK64_SMALL: # %bb.0: # %entry -; CHECK64_SMALL-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK64_SMALL-NEXT: movss {{.*#+}} xmm1 = [5.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK64_SMALL-NEXT: addss %xmm0, %xmm1 ; CHECK64_SMALL-NEXT: movd %xmm1, %eax ; CHECK64_SMALL-NEXT: movl %eax, (%rdi) @@ -26,7 +26,7 @@ define void @test_float(ptr %a , float %b) { ; CHECK32: # %bb.0: # %entry ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK32-NEXT: movss {{.*#+}} xmm0 = [5.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK32-NEXT: movd %ecx, %xmm1 ; CHECK32-NEXT: addss %xmm0, %xmm1 ; CHECK32-NEXT: movd %xmm1, %ecx diff --git a/llvm/test/CodeGen/X86/asm-reg-type-mismatch-avx512.ll b/llvm/test/CodeGen/X86/asm-reg-type-mismatch-avx512.ll index 053ca11b95a50..56b05418afa94 100644 --- a/llvm/test/CodeGen/X86/asm-reg-type-mismatch-avx512.ll +++ b/llvm/test/CodeGen/X86/asm-reg-type-mismatch-avx512.ll @@ -5,7 +5,7 @@ define i64 @test1() nounwind { ; CHECK-LABEL: test1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: #APP -; CHECK-NEXT: vmovq {{.*#+}} xmm16 = mem[0],zero +; CHECK-NEXT: vmovq 0, %xmm16 ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: vmovq %xmm16, %rax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/atomic-fp.ll b/llvm/test/CodeGen/X86/atomic-fp.ll index d933ffec623b9..1094edd19af43 100644 --- a/llvm/test/CodeGen/X86/atomic-fp.ll +++ b/llvm/test/CodeGen/X86/atomic-fp.ll @@ -207,28 +207,28 @@ define dso_local void @fadd_32g() nounwind { ; ; X86-SSE2-LABEL: fadd_32g: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X86-SSE2-NEXT: addss glob32, %xmm0 ; X86-SSE2-NEXT: movss %xmm0, glob32 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: fadd_32g: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X86-AVX-NEXT: vaddss glob32, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovss %xmm0, glob32 ; X86-AVX-NEXT: retl ; ; X64-SSE-LABEL: fadd_32g: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero +; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-SSE-NEXT: addss glob32(%rip), %xmm0 ; X64-SSE-NEXT: movss %xmm0, glob32(%rip) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: fadd_32g: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX-NEXT: vaddss glob32(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovss %xmm0, glob32(%rip) ; X64-AVX-NEXT: retq @@ -319,14 +319,14 @@ define dso_local void @fadd_64g() nounwind { ; ; X64-SSE-LABEL: fadd_64g: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; X64-SSE-NEXT: addsd glob64(%rip), %xmm0 ; X64-SSE-NEXT: movsd %xmm0, glob64(%rip) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: fadd_64g: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; X64-AVX-NEXT: vaddsd glob64(%rip), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovsd %xmm0, glob64(%rip) ; X64-AVX-NEXT: retq @@ -368,14 +368,14 @@ define dso_local void @fadd_32imm() nounwind { ; ; X86-SSE2-LABEL: fadd_32imm: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X86-SSE2-NEXT: addss -559038737, %xmm0 ; X86-SSE2-NEXT: movss %xmm0, -559038737 ; X86-SSE2-NEXT: retl ; ; X86-AVX-LABEL: fadd_32imm: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X86-AVX-NEXT: vaddss -559038737, %xmm0, %xmm0 ; X86-AVX-NEXT: vmovss %xmm0, -559038737 ; X86-AVX-NEXT: retl @@ -383,7 +383,7 @@ define dso_local void @fadd_32imm() nounwind { ; X64-SSE-LABEL: fadd_32imm: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF -; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-SSE-NEXT: addss (%rax), %xmm0 ; X64-SSE-NEXT: movss %xmm0, (%rax) ; X64-SSE-NEXT: retq @@ -391,7 +391,7 @@ define dso_local void @fadd_32imm() nounwind { ; X64-AVX-LABEL: fadd_32imm: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF -; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX-NEXT: vaddss (%rax), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovss %xmm0, (%rax) ; X64-AVX-NEXT: retq @@ -483,7 +483,7 @@ define dso_local void @fadd_64imm() nounwind { ; X64-SSE-LABEL: fadd_64imm: ; X64-SSE: # %bb.0: ; X64-SSE-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF -; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; X64-SSE-NEXT: addsd (%rax), %xmm0 ; X64-SSE-NEXT: movsd %xmm0, (%rax) ; X64-SSE-NEXT: retq @@ -491,7 +491,7 @@ define dso_local void @fadd_64imm() nounwind { ; X64-AVX-LABEL: fadd_64imm: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: movl $3735928559, %eax # imm = 0xDEADBEEF -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; X64-AVX-NEXT: vaddsd (%rax), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovsd %xmm0, (%rax) ; X64-AVX-NEXT: retq @@ -534,7 +534,7 @@ define dso_local void @fadd_32stack() nounwind { ; X86-SSE2-LABEL: fadd_32stack: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pushl %eax -; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = 
[1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X86-SSE2-NEXT: addss (%esp), %xmm0 ; X86-SSE2-NEXT: movss %xmm0, (%esp) ; X86-SSE2-NEXT: popl %eax @@ -543,7 +543,7 @@ define dso_local void @fadd_32stack() nounwind { ; X86-AVX-LABEL: fadd_32stack: ; X86-AVX: # %bb.0: ; X86-AVX-NEXT: pushl %eax -; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X86-AVX-NEXT: vaddss (%esp), %xmm0, %xmm0 ; X86-AVX-NEXT: vmovss %xmm0, (%esp) ; X86-AVX-NEXT: popl %eax @@ -551,14 +551,14 @@ define dso_local void @fadd_32stack() nounwind { ; ; X64-SSE-LABEL: fadd_32stack: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-SSE-NEXT: addss -{{[0-9]+}}(%rsp), %xmm0 ; X64-SSE-NEXT: movss %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: fadd_32stack: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-AVX-NEXT: vaddss -{{[0-9]+}}(%rsp), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovss %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: retq @@ -650,14 +650,14 @@ define dso_local void @fadd_64stack() nounwind { ; ; X64-SSE-LABEL: fadd_64stack: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; X64-SSE-NEXT: addsd -{{[0-9]+}}(%rsp), %xmm0 ; X64-SSE-NEXT: movsd %xmm0, -{{[0-9]+}}(%rsp) ; X64-SSE-NEXT: retq ; ; X64-AVX-LABEL: fadd_64stack: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; X64-AVX-NEXT: vaddsd -{{[0-9]+}}(%rsp), %xmm0, %xmm0 ; X64-AVX-NEXT: vmovsd %xmm0, -{{[0-9]+}}(%rsp) ; X64-AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/avx512-cmp.ll b/llvm/test/CodeGen/X86/avx512-cmp.ll index 919edb334b367..0c3d9d6f7277c 100644 --- a/llvm/test/CodeGen/X86/avx512-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-cmp.ll @@ -70,7 +70,7 @@ define float @test5(float %p) #0 { ; ALL-NEXT: retq ; ALL-NEXT: LBB3_1: ## %if.end ; ALL-NEXT: vcmpltss %xmm0, %xmm1, %k1 -; ALL-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: vmovss {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; ALL-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} ; ALL-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll index def5ad51d732c..c5a994e6846a4 100644 --- a/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-fma-intrinsics.ll @@ -1150,19 +1150,12 @@ define <16 x float>@test_int_x86_avx512_mask_vfnmadd_ps_512(<16 x float> %x0, <1 ; This test case used to crash due to combineFMA not bitcasting results of isFNEG. 
define <4 x float> @foo() { -; X86-LABEL: foo: -; X86: # %bb.0: # %entry -; X86-NEXT: vmovss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00] -; X86-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vfmsub213ss {rd-sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x38,0xab,0xc0] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: foo: -; X64: # %bb.0: # %entry -; X64-NEXT: vmovss (%rax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00] -; X64-NEXT: # xmm0 = mem[0],zero,zero,zero -; X64-NEXT: vfmsub213ss {rd-sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x38,0xab,0xc0] -; X64-NEXT: retq # encoding: [0xc3] +; CHECK-LABEL: foo: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00] +; CHECK-NEXT: vfmsub213ss {rd-sae}, %xmm0, %xmm0, %xmm0 # encoding: [0x62,0xf2,0x7d,0x38,0xab,0xc0] +; CHECK-NEXT: ret{{[l|q]}} # encoding: [0xc3] entry: %0 = load <4 x float>, ptr undef, align 16 %sub = fsub <4 x float> , %0 diff --git a/llvm/test/CodeGen/X86/avx512-insert-extract.ll b/llvm/test/CodeGen/X86/avx512-insert-extract.ll index 4a2dd7673f4e7..abfe3e6428e66 100644 --- a/llvm/test/CodeGen/X86/avx512-insert-extract.ll +++ b/llvm/test/CodeGen/X86/avx512-insert-extract.ll @@ -2175,7 +2175,7 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind { ; KNL-NEXT: movzwl %ax, %eax ; KNL-NEXT: vmovd %eax, %xmm1 ; KNL-NEXT: vcvtph2ps %xmm1, %xmm1 -; KNL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; KNL-NEXT: vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0] ; KNL-NEXT: vucomiss %xmm2, %xmm1 ; KNL-NEXT: setb %al ; KNL-NEXT: andl $1, %eax @@ -2217,7 +2217,7 @@ define void @test_concat_v2i1(ptr %arg, ptr %arg1, ptr %arg2) nounwind { ; SKX-NEXT: movzwl %ax, %eax ; SKX-NEXT: vmovd %eax, %xmm1 ; SKX-NEXT: vcvtph2ps %xmm1, %xmm1 -; SKX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SKX-NEXT: vmovss {{.*#+}} xmm2 = [6.0E+0,0.0E+0,0.0E+0,0.0E+0] ; SKX-NEXT: vucomiss %xmm2, %xmm1 ; SKX-NEXT: setb %al ; SKX-NEXT: kmovd %eax, %k0 diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index a5a4bf1e53631..6c9c28bc9e55e 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -10104,10 +10104,10 @@ define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vmovss (%edx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x02] -; X86-NEXT: ## xmm0 = mem[0],zero,zero,zero -; X86-NEXT: vmovss (%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x09] -; X86-NEXT: ## xmm1 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x02] +; X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x09] ; X86-NEXT: vfmadd213ss %xmm0, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xc8] ; X86-NEXT: ## xmm1 = (xmm0 * xmm1) + xmm0 ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] @@ -10117,10 +10117,10 @@ define void @fmadd_ss_mask_memfold(ptr %a, ptr %b, i8 %c) { ; ; 
X64-LABEL: fmadd_ss_mask_memfold: ; X64: ## %bb.0: -; X64-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; X64-NEXT: ## xmm0 = mem[0],zero,zero,zero -; X64-NEXT: vmovss (%rsi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0e] -; X64-NEXT: ## xmm1 = mem[0],zero,zero,zero +; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] +; X64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0e] ; X64-NEXT: vfmadd213ss %xmm0, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0xc8] ; X64-NEXT: ## xmm1 = (xmm0 * xmm1) + xmm0 ; X64-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] @@ -10152,8 +10152,8 @@ define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vmovss (%edx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x02] -; X86-NEXT: ## xmm0 = mem[0],zero,zero,zero +; X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x02] ; X86-NEXT: vfmadd231ss (%ecx), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0x01] ; X86-NEXT: ## xmm0 = (xmm0 * mem) + xmm0 ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] @@ -10163,8 +10163,8 @@ define void @fmadd_ss_maskz_memfold(ptr %a, ptr %b, i8 %c) { ; ; X64-LABEL: fmadd_ss_maskz_memfold: ; X64: ## %bb.0: -; X64-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; X64-NEXT: ## xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] ; X64-NEXT: vfmadd231ss (%rsi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0x06] ; X64-NEXT: ## xmm0 = (xmm0 * mem) + xmm0 ; X64-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] @@ -10196,10 +10196,10 @@ define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vmovsd (%edx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x02] -; X86-NEXT: ## xmm0 = mem[0],zero -; X86-NEXT: vmovsd (%ecx), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x09] -; X86-NEXT: ## xmm1 = mem[0],zero +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x02] +; X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x09] ; X86-NEXT: vfmadd213sd %xmm0, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xc8] ; X86-NEXT: ## xmm1 = (xmm0 * xmm1) + xmm0 ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] @@ -10209,10 +10209,10 @@ define void @fmadd_sd_mask_memfold(ptr %a, ptr %b, i8 %c) { ; ; X64-LABEL: fmadd_sd_mask_memfold: ; X64: ## %bb.0: -; X64-NEXT: vmovsd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] -; X64-NEXT: ## xmm0 = mem[0],zero -; X64-NEXT: vmovsd (%rsi), %xmm1 
## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0e] -; X64-NEXT: ## xmm1 = mem[0],zero +; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] +; X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x0e] ; X64-NEXT: vfmadd213sd %xmm0, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0xc8] ; X64-NEXT: ## xmm1 = (xmm0 * xmm1) + xmm0 ; X64-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] @@ -10240,8 +10240,8 @@ define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) { ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ## encoding: [0x0f,0xb6,0x44,0x24,0x0c] ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x08] ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ## encoding: [0x8b,0x54,0x24,0x04] -; X86-NEXT: vmovsd (%edx), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x02] -; X86-NEXT: ## xmm0 = mem[0],zero +; X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x02] ; X86-NEXT: vfmadd231sd (%ecx), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xb9,0x01] ; X86-NEXT: ## xmm0 = (xmm0 * mem) + xmm0 ; X86-NEXT: kmovw %eax, %k1 ## encoding: [0xc5,0xf8,0x92,0xc8] @@ -10251,8 +10251,8 @@ define void @fmadd_sd_maskz_memfold(ptr %a, ptr %b, i8 %c) { ; ; X64-LABEL: fmadd_sd_maskz_memfold: ; X64: ## %bb.0: -; X64-NEXT: vmovsd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] -; X64-NEXT: ## xmm0 = mem[0],zero +; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] ; X64-NEXT: vfmadd231sd (%rsi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xb9,0x06] ; X64-NEXT: ## xmm0 = (xmm0 * mem) + xmm0 ; X64-NEXT: kmovw %edx, %k1 ## encoding: [0xc5,0xf8,0x92,0xca] diff --git a/llvm/test/CodeGen/X86/avx512-mov.ll b/llvm/test/CodeGen/X86/avx512-mov.ll index 88682cea75466..895317cd73d82 100644 --- a/llvm/test/CodeGen/X86/avx512-mov.ll +++ b/llvm/test/CodeGen/X86/avx512-mov.ll @@ -31,8 +31,8 @@ define <2 x i64> @test3(i64 %x) { define <4 x i32> @test4(ptr %x) { ; CHECK-LABEL: test4: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load i32, ptr %x %res = insertelement <4 x i32>undef, i32 %y, i32 0 @@ -60,8 +60,8 @@ define void @test6(double %x, ptr %y) { define float @test7(ptr %x) { ; CHECK-LABEL: test7: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load i32, ptr %x %res = bitcast i32 %y to float @@ -89,8 +89,8 @@ define i64 @test9(<2 x i64> %x) { define <4 x i32> @test10(ptr %x) { ; CHECK-LABEL: test10: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: ## EVEX TO VEX Compression encoding: 
[0xc5,0xfa,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load i32, ptr %x, align 4 %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0 @@ -100,8 +100,8 @@ define <4 x i32> @test10(ptr %x) { define <4 x float> @test11(ptr %x) { ; CHECK-LABEL: test11: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load float, ptr %x, align 4 %res = insertelement <4 x float>zeroinitializer, float %y, i32 0 @@ -111,8 +111,8 @@ define <4 x float> @test11(ptr %x) { define <2 x double> @test12(ptr %x) { ; CHECK-LABEL: test12: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovsd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load double, ptr %x, align 8 %res = insertelement <2 x double>zeroinitializer, double %y, i32 0 @@ -140,8 +140,8 @@ define <4 x i32> @test14(i32 %x) { define <4 x i32> @test15(ptr %x) { ; CHECK-LABEL: test15: ; CHECK: ## %bb.0: -; CHECK-NEXT: vmovss (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; CHECK-NEXT: ## xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] ; CHECK-NEXT: retq ## encoding: [0xc3] %y = load i32, ptr %x, align 4 %res = insertelement <4 x i32>zeroinitializer, i32 %y, i32 0 diff --git a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll index 7a534721bae05..2081d201704f3 100644 --- a/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll +++ b/llvm/test/CodeGen/X86/avx512-regcall-NoMask.ll @@ -340,7 +340,7 @@ define dso_local x86_regcallcc float @test_CallargRetFloat(float %a) { ; X32: # %bb.0: ; X32-NEXT: subl $28, %esp ; X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill -; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm4 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X32-NEXT: vaddss %xmm4, %xmm0, %xmm0 ; X32-NEXT: calll _test_argRetFloat ; X32-NEXT: vaddss %xmm4, %xmm0, %xmm0 @@ -355,7 +355,7 @@ define dso_local x86_regcallcc float @test_CallargRetFloat(float %a) { ; WIN64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm8, 0 ; WIN64-NEXT: .seh_endprologue -; WIN64-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero +; WIN64-NEXT: vmovss {{.*#+}} xmm8 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; WIN64-NEXT: vaddss %xmm0, %xmm8, %xmm0 ; WIN64-NEXT: callq test_argRetFloat ; WIN64-NEXT: vaddss %xmm0, %xmm8, %xmm0 @@ -370,7 +370,7 @@ define dso_local x86_regcallcc float @test_CallargRetFloat(float %a) { ; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -32 -; LINUXOSX64-NEXT: vmovss {{.*#+}} xmm8 = mem[0],zero,zero,zero +; LINUXOSX64-NEXT: vmovss {{.*#+}} xmm8 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; LINUXOSX64-NEXT: vaddss %xmm0, %xmm8, %xmm0 ; LINUXOSX64-NEXT: callq test_argRetFloat ; LINUXOSX64-NEXT: vaddss %xmm0, %xmm8, %xmm0 @@ -410,7 +410,7 @@ define dso_local x86_regcallcc double @test_CallargRetDouble(double %a) { ; X32: # %bb.0: ; X32-NEXT: subl $28, %esp ; 
X32-NEXT: vmovups %xmm4, (%esp) # 16-byte Spill -; X32-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; X32-NEXT: vmovsd {{.*#+}} xmm4 = [1.0E+0,0.0E+0] ; X32-NEXT: vaddsd %xmm4, %xmm0, %xmm0 ; X32-NEXT: calll _test_argRetDouble ; X32-NEXT: vaddsd %xmm4, %xmm0, %xmm0 @@ -425,7 +425,7 @@ define dso_local x86_regcallcc double @test_CallargRetDouble(double %a) { ; WIN64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill ; WIN64-NEXT: .seh_savexmm %xmm8, 0 ; WIN64-NEXT: .seh_endprologue -; WIN64-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero +; WIN64-NEXT: vmovsd {{.*#+}} xmm8 = [1.0E+0,0.0E+0] ; WIN64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 ; WIN64-NEXT: callq test_argRetDouble ; WIN64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 @@ -440,7 +440,7 @@ define dso_local x86_regcallcc double @test_CallargRetDouble(double %a) { ; LINUXOSX64-NEXT: vmovaps %xmm8, (%rsp) # 16-byte Spill ; LINUXOSX64-NEXT: .cfi_def_cfa_offset 32 ; LINUXOSX64-NEXT: .cfi_offset %xmm8, -32 -; LINUXOSX64-NEXT: vmovsd {{.*#+}} xmm8 = mem[0],zero +; LINUXOSX64-NEXT: vmovsd {{.*#+}} xmm8 = [1.0E+0,0.0E+0] ; LINUXOSX64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 ; LINUXOSX64-NEXT: callq test_argRetDouble ; LINUXOSX64-NEXT: vaddsd %xmm0, %xmm8, %xmm0 diff --git a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll index e4c62fca5bd57..973f4ee215649 100644 --- a/llvm/test/CodeGen/X86/avx512-vec-cmp.ll +++ b/llvm/test/CodeGen/X86/avx512-vec-cmp.ll @@ -1434,8 +1434,8 @@ define <4 x i32> @zext_bool_logic(<4 x i64> %cond1, <4 x i64> %cond2, <4 x i32> define void @half_vec_compare(ptr %x, ptr %y) { ; KNL-LABEL: half_vec_compare: ; KNL: ## %bb.0: ## %entry -; KNL-NEXT: vmovd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] -; KNL-NEXT: ## xmm0 = mem[0],zero,zero,zero +; KNL-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; KNL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] ; KNL-NEXT: vpsrld $16, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xd0,0x10] ; KNL-NEXT: vpextrw $0, %xmm1, %eax ## encoding: [0xc5,0xf9,0xc5,0xc1,0x00] ; KNL-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0] @@ -1466,8 +1466,8 @@ define void @half_vec_compare(ptr %x, ptr %y) { ; ; AVX512BW-LABEL: half_vec_compare: ; AVX512BW: ## %bb.0: ## %entry -; AVX512BW-NEXT: vmovd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] -; AVX512BW-NEXT: ## xmm0 = mem[0],zero,zero,zero +; AVX512BW-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512BW-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] ; AVX512BW-NEXT: vpsrld $16, %xmm0, %xmm1 ## encoding: [0xc5,0xf1,0x72,0xd0,0x10] ; AVX512BW-NEXT: vpextrw $0, %xmm1, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xc1,0x00] ; AVX512BW-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0] @@ -1498,8 +1498,8 @@ define void @half_vec_compare(ptr %x, ptr %y) { ; ; SKX-LABEL: half_vec_compare: ; SKX: ## %bb.0: ## %entry -; SKX-NEXT: vmovd (%rdi), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] -; SKX-NEXT: ## xmm0 = mem[0],zero,zero,zero +; SKX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SKX-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x07] ; SKX-NEXT: vpsrld $16, %xmm0, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x72,0xd0,0x10] ; SKX-NEXT: vpextrw $0, %xmm1, %eax ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xc5,0xc1,0x00] ; SKX-NEXT: movzwl %ax, %eax ## encoding: [0x0f,0xb7,0xc0] diff --git a/llvm/test/CodeGen/X86/bc-extract.ll b/llvm/test/CodeGen/X86/bc-extract.ll index 506ba906800a6..23091a2da9c58 
100644 --- a/llvm/test/CodeGen/X86/bc-extract.ll +++ b/llvm/test/CodeGen/X86/bc-extract.ll @@ -10,7 +10,7 @@ define float @extractFloat1() nounwind { ; ; X64-LABEL: extractFloat1: ; X64: # %bb.0: # %entry -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-NEXT: retq entry: %tmp0 = bitcast <1 x double> to <2 x float> diff --git a/llvm/test/CodeGen/X86/bfloat.ll b/llvm/test/CodeGen/X86/bfloat.ll index 9d2ef51b0a8fb..f2d3c4fb34199 100644 --- a/llvm/test/CodeGen/X86/bfloat.ll +++ b/llvm/test/CodeGen/X86/bfloat.ll @@ -1007,11 +1007,11 @@ define <32 x bfloat> @pr63017_2() nounwind { ; SSE2-NEXT: movzwl (%rax), %eax ; SSE2-NEXT: shll $16, %eax ; SSE2-NEXT: movl %eax, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill -; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: jmp .LBB12_3 ; SSE2-NEXT: .LBB12_1: -; SSE2-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movd {{.*#+}} xmm1 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: movd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Folded Spill ; SSE2-NEXT: .LBB12_3: # %else diff --git a/llvm/test/CodeGen/X86/buildvec-insertvec.ll b/llvm/test/CodeGen/X86/buildvec-insertvec.ll index 3fdfde8576f77..ae70b6a5a4665 100644 --- a/llvm/test/CodeGen/X86/buildvec-insertvec.ll +++ b/llvm/test/CodeGen/X86/buildvec-insertvec.ll @@ -50,7 +50,7 @@ define <4 x float> @test_negative_zero_1(<4 x float> %A) { ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movaps %xmm0, %xmm2 ; SSE2-NEXT: unpckhps {{.*#+}} xmm2 = xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] ; SSE2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/cmov-fp.ll b/llvm/test/CodeGen/X86/cmov-fp.ll index 749b96e25b4ca..26e720ffcebcc 100644 --- a/llvm/test/CodeGen/X86/cmov-fp.ll +++ b/llvm/test/CodeGen/X86/cmov-fp.ll @@ -19,7 +19,7 @@ define double @test1(i32 %a, i32 %b, double %x) nounwind { ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: jmp .LBB0_3 ; SSE-NEXT: .LBB0_1: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = [9.9E+1,0.0E+0] ; SSE-NEXT: .LBB0_3: ; SSE-NEXT: movsd %xmm0, (%esp) ; SSE-NEXT: fldl (%esp) @@ -82,7 +82,7 @@ define double @test2(i32 %a, i32 %b, double %x) nounwind { ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: jmp .LBB1_3 ; SSE-NEXT: .LBB1_1: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = [9.9E+1,0.0E+0] ; SSE-NEXT: .LBB1_3: ; SSE-NEXT: movsd %xmm0, (%esp) ; SSE-NEXT: fldl (%esp) @@ -145,7 +145,7 @@ define double @test3(i32 %a, i32 %b, double %x) nounwind { ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: jmp .LBB2_3 ; SSE-NEXT: .LBB2_1: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = [9.9E+1,0.0E+0] ; SSE-NEXT: .LBB2_3: ; SSE-NEXT: movsd %xmm0, (%esp) ; SSE-NEXT: fldl (%esp) @@ -208,7 +208,7 @@ define double @test4(i32 %a, i32 %b, double %x) nounwind { ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: jmp .LBB3_3 ; SSE-NEXT: .LBB3_1: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = [9.9E+1,0.0E+0] ; SSE-NEXT: .LBB3_3: ; SSE-NEXT: movsd %xmm0, (%esp) ; SSE-NEXT: fldl (%esp) @@ 
-271,7 +271,7 @@ define double @test5(i32 %a, i32 %b, double %x) nounwind { ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: jmp .LBB4_3 ; SSE-NEXT: .LBB4_1: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = [9.9E+1,0.0E+0] ; SSE-NEXT: .LBB4_3: ; SSE-NEXT: movsd %xmm0, (%esp) ; SSE-NEXT: fldl (%esp) @@ -338,7 +338,7 @@ define double @test6(i32 %a, i32 %b, double %x) nounwind { ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: jmp .LBB5_3 ; SSE-NEXT: .LBB5_1: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = [9.9E+1,0.0E+0] ; SSE-NEXT: .LBB5_3: ; SSE-NEXT: movsd %xmm0, (%esp) ; SSE-NEXT: fldl (%esp) @@ -405,7 +405,7 @@ define double @test7(i32 %a, i32 %b, double %x) nounwind { ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: jmp .LBB6_3 ; SSE-NEXT: .LBB6_1: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = [9.9E+1,0.0E+0] ; SSE-NEXT: .LBB6_3: ; SSE-NEXT: movsd %xmm0, (%esp) ; SSE-NEXT: fldl (%esp) @@ -472,7 +472,7 @@ define double @test8(i32 %a, i32 %b, double %x) nounwind { ; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: jmp .LBB7_3 ; SSE-NEXT: .LBB7_1: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = [9.9E+1,0.0E+0] ; SSE-NEXT: .LBB7_3: ; SSE-NEXT: movsd %xmm0, (%esp) ; SSE-NEXT: fldl (%esp) @@ -536,7 +536,7 @@ define float @test9(i32 %a, i32 %b, float %x) nounwind { ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: jmp .LBB8_3 ; SSE-NEXT: .LBB8_1: -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0] ; SSE-NEXT: .LBB8_3: ; SSE-NEXT: movss %xmm0, (%esp) ; SSE-NEXT: flds (%esp) @@ -553,7 +553,7 @@ define float @test9(i32 %a, i32 %b, float %x) nounwind { ; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; NOSSE2-NEXT: jmp .LBB8_3 ; NOSSE2-NEXT: .LBB8_1: -; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NOSSE2-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0] ; NOSSE2-NEXT: .LBB8_3: ; NOSSE2-NEXT: movss %xmm0, (%esp) ; NOSSE2-NEXT: flds (%esp) @@ -601,7 +601,7 @@ define float @test10(i32 %a, i32 %b, float %x) nounwind { ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: jmp .LBB9_3 ; SSE-NEXT: .LBB9_1: -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0] ; SSE-NEXT: .LBB9_3: ; SSE-NEXT: movss %xmm0, (%esp) ; SSE-NEXT: flds (%esp) @@ -618,7 +618,7 @@ define float @test10(i32 %a, i32 %b, float %x) nounwind { ; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; NOSSE2-NEXT: jmp .LBB9_3 ; NOSSE2-NEXT: .LBB9_1: -; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; NOSSE2-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0] ; NOSSE2-NEXT: .LBB9_3: ; NOSSE2-NEXT: movss %xmm0, (%esp) ; NOSSE2-NEXT: flds (%esp) @@ -666,7 +666,7 @@ define float @test11(i32 %a, i32 %b, float %x) nounwind { ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: jmp .LBB10_3 ; SSE-NEXT: .LBB10_1: -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0] ; SSE-NEXT: .LBB10_3: ; SSE-NEXT: movss %xmm0, (%esp) ; SSE-NEXT: flds (%esp) @@ -683,7 +683,7 @@ define float @test11(i32 %a, i32 %b, float %x) nounwind { ; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; NOSSE2-NEXT: jmp .LBB10_3 ; NOSSE2-NEXT: .LBB10_1: -; NOSSE2-NEXT: movss {{.*#+}} xmm0 = 
mem[0],zero,zero,zero
+; NOSSE2-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0]
 ; NOSSE2-NEXT: .LBB10_3:
 ; NOSSE2-NEXT: movss %xmm0, (%esp)
 ; NOSSE2-NEXT: flds (%esp)
@@ -731,7 +731,7 @@ define float @test12(i32 %a, i32 %b, float %x) nounwind {
 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT: jmp .LBB11_3
 ; SSE-NEXT: .LBB11_1:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: .LBB11_3:
 ; SSE-NEXT: movss %xmm0, (%esp)
 ; SSE-NEXT: flds (%esp)
@@ -748,7 +748,7 @@ define float @test12(i32 %a, i32 %b, float %x) nounwind {
 ; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; NOSSE2-NEXT: jmp .LBB11_3
 ; NOSSE2-NEXT: .LBB11_1:
-; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NOSSE2-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0]
 ; NOSSE2-NEXT: .LBB11_3:
 ; NOSSE2-NEXT: movss %xmm0, (%esp)
 ; NOSSE2-NEXT: flds (%esp)
@@ -796,7 +796,7 @@ define float @test13(i32 %a, i32 %b, float %x) nounwind {
 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT: jmp .LBB12_3
 ; SSE-NEXT: .LBB12_1:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: .LBB12_3:
 ; SSE-NEXT: movss %xmm0, (%esp)
 ; SSE-NEXT: flds (%esp)
@@ -813,7 +813,7 @@ define float @test13(i32 %a, i32 %b, float %x) nounwind {
 ; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; NOSSE2-NEXT: jmp .LBB12_3
 ; NOSSE2-NEXT: .LBB12_1:
-; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NOSSE2-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0]
 ; NOSSE2-NEXT: .LBB12_3:
 ; NOSSE2-NEXT: movss %xmm0, (%esp)
 ; NOSSE2-NEXT: flds (%esp)
@@ -863,7 +863,7 @@ define float @test14(i32 %a, i32 %b, float %x) nounwind {
 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT: jmp .LBB13_3
 ; SSE-NEXT: .LBB13_1:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: .LBB13_3:
 ; SSE-NEXT: movss %xmm0, (%esp)
 ; SSE-NEXT: flds (%esp)
@@ -880,7 +880,7 @@ define float @test14(i32 %a, i32 %b, float %x) nounwind {
 ; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; NOSSE2-NEXT: jmp .LBB13_3
 ; NOSSE2-NEXT: .LBB13_1:
-; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NOSSE2-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0]
 ; NOSSE2-NEXT: .LBB13_3:
 ; NOSSE2-NEXT: movss %xmm0, (%esp)
 ; NOSSE2-NEXT: flds (%esp)
@@ -930,7 +930,7 @@ define float @test15(i32 %a, i32 %b, float %x) nounwind {
 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT: jmp .LBB14_3
 ; SSE-NEXT: .LBB14_1:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: .LBB14_3:
 ; SSE-NEXT: movss %xmm0, (%esp)
 ; SSE-NEXT: flds (%esp)
@@ -947,7 +947,7 @@ define float @test15(i32 %a, i32 %b, float %x) nounwind {
 ; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; NOSSE2-NEXT: jmp .LBB14_3
 ; NOSSE2-NEXT: .LBB14_1:
-; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NOSSE2-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0]
 ; NOSSE2-NEXT: .LBB14_3:
 ; NOSSE2-NEXT: movss %xmm0, (%esp)
 ; NOSSE2-NEXT: flds (%esp)
@@ -997,7 +997,7 @@ define float @test16(i32 %a, i32 %b, float %x) nounwind {
 ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; SSE-NEXT: jmp .LBB15_3
 ; SSE-NEXT: .LBB15_1:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: .LBB15_3:
 ; SSE-NEXT: movss %xmm0, (%esp)
 ; SSE-NEXT: flds (%esp)
@@ -1014,7 +1014,7 @@ define float @test16(i32 %a, i32 %b, float %x) nounwind {
 ; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; NOSSE2-NEXT: jmp .LBB15_3
 ; NOSSE2-NEXT: .LBB15_1:
-; NOSSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NOSSE2-NEXT: movss {{.*#+}} xmm0 = [9.9E+1,0.0E+0,0.0E+0,0.0E+0]
 ; NOSSE2-NEXT: .LBB15_3:
 ; NOSSE2-NEXT: movss %xmm0, (%esp)
 ; NOSSE2-NEXT: flds (%esp)
diff --git a/llvm/test/CodeGen/X86/cmovcmov.ll b/llvm/test/CodeGen/X86/cmovcmov.ll
index ab863dee69010..d2d1c4db4608d 100644
--- a/llvm/test/CodeGen/X86/cmovcmov.ll
+++ b/llvm/test/CodeGen/X86/cmovcmov.ll
@@ -217,7 +217,7 @@ define dso_local float @test_zext_fcmp_une(float %a, float %b) nounwind {
 ; CMOV-LABEL: test_zext_fcmp_une:
 ; CMOV: # %bb.0: # %entry
 ; CMOV-NEXT: cmpneqss %xmm1, %xmm0
-; CMOV-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CMOV-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CMOV-NEXT: andps %xmm1, %xmm0
 ; CMOV-NEXT: retq
 ;
@@ -255,7 +255,7 @@ define dso_local float @test_zext_fcmp_oeq(float %a, float %b) nounwind {
 ; CMOV-LABEL: test_zext_fcmp_oeq:
 ; CMOV: # %bb.0: # %entry
 ; CMOV-NEXT: cmpeqss %xmm1, %xmm0
-; CMOV-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CMOV-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CMOV-NEXT: andps %xmm1, %xmm0
 ; CMOV-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/combine-fabs.ll b/llvm/test/CodeGen/X86/combine-fabs.ll
index e668c87003202..a862ea16a748f 100644
--- a/llvm/test/CodeGen/X86/combine-fabs.ll
+++ b/llvm/test/CodeGen/X86/combine-fabs.ll
@@ -11,12 +11,12 @@
 define float @combine_fabs_constant() {
 ; SSE-LABEL: combine_fabs_constant:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm0 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: combine_fabs_constant:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: retq
 %1 = call float @llvm.fabs.f32(float -2.0)
 ret float %1
diff --git a/llvm/test/CodeGen/X86/combineIncDecVector-crash.ll b/llvm/test/CodeGen/X86/combineIncDecVector-crash.ll
index f98e38d92099a..2d6ce2017dc0d 100644
--- a/llvm/test/CodeGen/X86/combineIncDecVector-crash.ll
+++ b/llvm/test/CodeGen/X86/combineIncDecVector-crash.ll
@@ -18,7 +18,7 @@ define void @TestvMeth(i32 %0, i64 %1) gc "statepoint-example" !prof !1 {
 ; CHECK-NEXT: movl $400, %ecx # imm = 0x190
 ; CHECK-NEXT: callq newarray@PLT
 ; CHECK-NEXT: .Ltmp0:
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.5E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: addss (%rax), %xmm0
 ; CHECK-NEXT: movdqu (%rax), %xmm1
 ; CHECK-NEXT: pcmpeqd %xmm2, %xmm2
diff --git a/llvm/test/CodeGen/X86/cvtv2f32.ll b/llvm/test/CodeGen/X86/cvtv2f32.ll
index b4b63caf4e91c..3875b72d5d68a 100644
--- a/llvm/test/CodeGen/X86/cvtv2f32.ll
+++ b/llvm/test/CodeGen/X86/cvtv2f32.ll
@@ -9,7 +9,7 @@ define <2 x float> @uitofp_2i32_cvt_buildvector(i32 %x, i32 %y, <2 x float> %v)
 ; X86-LABEL: uitofp_2i32_cvt_buildvector:
 ; X86: # %bb.0:
 ; X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; X86-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: movsd {{.*#+}} xmm2 = [4.503599627370496E+15,0.0E+0]
 ; X86-NEXT: orpd %xmm2, %xmm1
 ; X86-NEXT: subsd %xmm2, %xmm1
 ; X86-NEXT: cvtsd2ss %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/dagcombine-select.ll b/llvm/test/CodeGen/X86/dagcombine-select.ll
index d29f161c76fb8..1380c02663ee0 100644
--- a/llvm/test/CodeGen/X86/dagcombine-select.ll
+++ b/llvm/test/CodeGen/X86/dagcombine-select.ll
@@ -279,10 +279,10 @@ define double @fsub_constant_sel_constants(i1 %cond) {
 ; CHECK-NEXT: testb $1, %dil
 ; CHECK-NEXT: jne .LBB20_1
 ; CHECK-NEXT: # %bb.2:
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [-1.8200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: retq
 ; CHECK-NEXT: .LBB20_1:
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [9.0999999999999996E+0,0.0E+0]
 ; CHECK-NEXT: retq
 %sel = select i1 %cond, double -4.0, double 23.3
 %bo = fsub double 5.1, %sel
@@ -295,10 +295,10 @@ define double @fdiv_constant_sel_constants(i1 %cond) {
 ; CHECK-NEXT: testb $1, %dil
 ; CHECK-NEXT: jne .LBB21_1
 ; CHECK-NEXT: # %bb.2:
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [2.188841201716738E-1,0.0E+0]
 ; CHECK-NEXT: retq
 ; CHECK-NEXT: .LBB21_1:
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [-1.2749999999999999E+0,0.0E+0]
 ; CHECK-NEXT: retq
 %sel = select i1 %cond, double -4.0, double 23.3
 %bo = fdiv double 5.1, %sel
@@ -311,10 +311,10 @@ define double @frem_constant_sel_constants(i1 %cond) {
 ; CHECK-NEXT: testb $1, %dil
 ; CHECK-NEXT: jne .LBB22_1
 ; CHECK-NEXT: # %bb.2:
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [5.0999999999999996E+0,0.0E+0]
 ; CHECK-NEXT: retq
 ; CHECK-NEXT: .LBB22_1:
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0999999999999996E+0,0.0E+0]
 ; CHECK-NEXT: retq
 %sel = select i1 %cond, double -4.0, double 23.3
 %bo = frem double 5.1, %sel
diff --git a/llvm/test/CodeGen/X86/deopt-intrinsic.ll b/llvm/test/CodeGen/X86/deopt-intrinsic.ll
index b99482f0fb038..d610cc859f1ce 100644
--- a/llvm/test/CodeGen/X86/deopt-intrinsic.ll
+++ b/llvm/test/CodeGen/X86/deopt-intrinsic.ll
@@ -27,7 +27,7 @@ define i8 @caller_1() {
 ; CHECK-NEXT: ##{{.+}}
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: {{.+cfi.+}}
-; CHECK-NEXT: movss {{[a-zA-Z0-9_]+}}(%rip), %xmm0 ## xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{[a-zA-Z0-9_]+}}(%rip), %xmm0 ## xmm0 = [5.0E+2,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: movl $42, %edi
 ; CHECK-NEXT: callq ___llvm_deoptimize
 ; CHECK-NEXT: {{Ltmp[0-9]+}}:
diff --git a/llvm/test/CodeGen/X86/extract-fp.ll b/llvm/test/CodeGen/X86/extract-fp.ll
index 58aaab9c83f95..fd4f2171807bf 100644
--- a/llvm/test/CodeGen/X86/extract-fp.ll
+++ b/llvm/test/CodeGen/X86/extract-fp.ll
@@ -16,7 +16,7 @@ define float @ext_fsub_v4f32(<4 x float> %x) {
 ; CHECK-LABEL: ext_fsub_v4f32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: subss %xmm0, %xmm1
 ; CHECK-NEXT: movaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
@@ -51,7 +51,7 @@ define float @ext_fdiv_v4f32_constant_op0(<4 x float> %x) {
 ; CHECK-LABEL: ext_fdiv_v4f32_constant_op0:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: divss %xmm0, %xmm1
 ; CHECK-NEXT: movaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
@@ -64,7 +64,7 @@ define float @ext_frem_v4f32(<4 x float> %x) {
 ; CHECK-LABEL: ext_frem_v4f32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: jmp fmodf@PLT # TAILCALL
 %bo = frem <4 x float> %x,
 %ext = extractelement <4 x float> %bo, i32 2
@@ -76,7 +76,7 @@ define float @ext_frem_v4f32_constant_op0(<4 x float> %x) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: jmp fmodf@PLT # TAILCALL
 %bo = frem <4 x float> , %x
 %ext = extractelement <4 x float> %bo, i32 1
@@ -109,7 +109,7 @@ define double @ext_maximum_v4f64(<2 x double> %x) nounwind {
 ; CHECK-LABEL: ext_maximum_v4f64:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.3E+1,0.0E+0]
 ; CHECK-NEXT: maxsd %xmm0, %xmm1
 ; CHECK-NEXT: movapd %xmm1, %xmm0
 ; CHECK-NEXT: retq
@@ -122,7 +122,7 @@ define float @ext_minimum_v4f32(<4 x float> %x) nounwind {
 ; CHECK-LABEL: ext_minimum_v4f32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: minss %xmm0, %xmm1
 ; CHECK-NEXT: movaps %xmm1, %xmm0
 ; CHECK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll
index 5bc6d00022e23..38162f676e7ee 100644
--- a/llvm/test/CodeGen/X86/extractelement-fp.ll
+++ b/llvm/test/CodeGen/X86/extractelement-fp.ll
@@ -320,7 +320,7 @@ define <3 x double> @extvselectsetcc_crash(<2 x double> %x) {
 ; X64-LABEL: extvselectsetcc_crash:
 ; X64: # %bb.0:
 ; X64-NEXT: vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; X64-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X64-NEXT: vmovsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
 ; X64-NEXT: vandpd %xmm2, %xmm1, %xmm1
 ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X64-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
@@ -329,7 +329,7 @@ define <3 x double> @extvselectsetcc_crash(<2 x double> %x) {
 ; X86-LABEL: extvselectsetcc_crash:
 ; X86: # %bb.0:
 ; X86-NEXT: vcmpeqpd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
-; X86-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; X86-NEXT: vmovsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
 ; X86-NEXT: vandpd %xmm2, %xmm1, %xmm1
 ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
 ; X86-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,3,3]
diff --git a/llvm/test/CodeGen/X86/extractelement-load.ll b/llvm/test/CodeGen/X86/extractelement-load.ll
index 538b8ed10f25b..9d573ef2a8fad 100644
--- a/llvm/test/CodeGen/X86/extractelement-load.ll
+++ b/llvm/test/CodeGen/X86/extractelement-load.ll
@@ -139,7 +139,7 @@ define float @t6(ptr%a0) {
 ; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X32-SSE2-NEXT: xorps %xmm1, %xmm1
 ; X32-SSE2-NEXT: cmpeqss %xmm0, %xmm1
-; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X32-SSE2-NEXT: andps %xmm1, %xmm2
 ; X32-SSE2-NEXT: andnps %xmm0, %xmm1
 ; X32-SSE2-NEXT: orps %xmm2, %xmm1
@@ -154,7 +154,7 @@ define float @t6(ptr%a0) {
 ; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
 ; X64-SSSE3-NEXT: xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0
-; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X64-SSSE3-NEXT: andps %xmm0, %xmm2
 ; X64-SSSE3-NEXT: andnps %xmm1, %xmm0
 ; X64-SSSE3-NEXT: orps %xmm2, %xmm0
@@ -239,7 +239,7 @@ define float @PR43971_1(ptr%a0) nounwind {
 ; X32-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
 ; X32-SSE2-NEXT: xorps %xmm1, %xmm1
 ; X32-SSE2-NEXT: cmpeqss %xmm0, %xmm1
-; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X32-SSE2-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X32-SSE2-NEXT: andps %xmm1, %xmm2
 ; X32-SSE2-NEXT: andnps %xmm0, %xmm1
 ; X32-SSE2-NEXT: orps %xmm2, %xmm1
@@ -253,7 +253,7 @@ define float @PR43971_1(ptr%a0) nounwind {
 ; X64-SSSE3-NEXT: movshdup {{.*#+}} xmm1 = mem[1,1,3,3]
 ; X64-SSSE3-NEXT: xorps %xmm0, %xmm0
 ; X64-SSSE3-NEXT: cmpeqss %xmm1, %xmm0
-; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X64-SSSE3-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X64-SSSE3-NEXT: andps %xmm0, %xmm2
 ; X64-SSSE3-NEXT: andnps %xmm1, %xmm0
 ; X64-SSSE3-NEXT: orps %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/fadd-combines.ll b/llvm/test/CodeGen/X86/fadd-combines.ll
index 15512e997d14d..1082177e3da19 100644
--- a/llvm/test/CodeGen/X86/fadd-combines.ll
+++ b/llvm/test/CodeGen/X86/fadd-combines.ll
@@ -236,7 +236,7 @@ define <4 x float> @fadd_fadd_x_x_fadd_x_x_4f32(<4 x float> %x) #0 {
 define float @fadd_const_multiuse_attr(float %x) #0 {
 ; CHECK-LABEL: fadd_const_multiuse_attr:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: addss %xmm0, %xmm1
 ; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: addss %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/fast-isel-constpool.ll b/llvm/test/CodeGen/X86/fast-isel-constpool.ll
index 9e4cbb61308ea..dc746e1daac50 100644
--- a/llvm/test/CodeGen/X86/fast-isel-constpool.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-constpool.ll
@@ -17,7 +17,7 @@
 define float @constpool_float(float %x) {
 ; CHECK-LABEL: constpool_float:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.65E+2,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: addss %xmm1, %xmm0
 ; CHECK-NEXT: retq
 ;
@@ -39,7 +39,7 @@ define float @constpool_float(float %x) {
 ;
 ; AVX-LABEL: constpool_float:
 ; AVX: ## %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.65E+2,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 ;
@@ -53,8 +53,8 @@ define float @constpool_float(float %x) {
 ; X86-LARGE: ## %bb.0:
 ; X86-LARGE-NEXT: pushl %eax ## encoding: [0x50]
 ; X86-LARGE-NEXT: .cfi_def_cfa_offset 8
-; X86-LARGE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08]
-; X86-LARGE-NEXT: ## xmm0 = mem[0],zero,zero,zero
+; X86-LARGE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-LARGE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08]
 ; X86-LARGE-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ## encoding: [0xf3,0x0f,0x58,0x05,A,A,A,A]
 ; X86-LARGE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-LARGE-NEXT: movss %xmm0, (%esp) ## encoding: [0xf3,0x0f,0x11,0x04,0x24]
@@ -69,7 +69,7 @@ define float @constpool_float(float %x) {
 define double @constpool_double(double %x) nounwind {
 ; CHECK-LABEL: constpool_double:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [8.4999999999999998E-1,0.0E+0]
 ; CHECK-NEXT: addsd %xmm1, %xmm0
 ; CHECK-NEXT: retq
 ;
@@ -91,7 +91,7 @@ define double @constpool_double(double %x) nounwind {
 ;
 ; AVX-LABEL: constpool_double:
 ; AVX: ## %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [8.4999999999999998E-1,0.0E+0]
 ; AVX-NEXT: vaddsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: retq
 ;
@@ -104,8 +104,8 @@ define double @constpool_double(double %x) nounwind {
 ; X86-LARGE-LABEL: constpool_double:
 ; X86-LARGE: ## %bb.0:
 ; X86-LARGE-NEXT: subl $12, %esp ## encoding: [0x83,0xec,0x0c]
-; X86-LARGE-NEXT: movsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xf2,0x0f,0x10,0x44,0x24,0x10]
-; X86-LARGE-NEXT: ## xmm0 = mem[0],zero
+; X86-LARGE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-LARGE-NEXT: ## encoding: [0xf2,0x0f,0x10,0x44,0x24,0x10]
 ; X86-LARGE-NEXT: addsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ## encoding: [0xf2,0x0f,0x58,0x05,A,A,A,A]
 ; X86-LARGE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
 ; X86-LARGE-NEXT: movsd %xmm0, (%esp) ## encoding: [0xf2,0x0f,0x11,0x04,0x24]
@@ -120,7 +120,7 @@ define double @constpool_double(double %x) nounwind {
 define void @constpool_float_no_fp_args(ptr %x) nounwind {
 ; CHECK-LABEL: constpool_float_no_fp_args:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.65E+2,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: addss (%rdi), %xmm0
 ; CHECK-NEXT: movss %xmm0, (%rdi)
 ; CHECK-NEXT: retq
@@ -147,7 +147,7 @@ define void @constpool_float_no_fp_args(ptr %x) nounwind {
 ;
 ; AVX-LABEL: constpool_float_no_fp_args:
 ; AVX: ## %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.65E+2,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vaddss (%rdi), %xmm0, %xmm0
 ; AVX-NEXT: vmovss %xmm0, (%rdi)
 ; AVX-NEXT: retq
@@ -163,9 +163,9 @@ define void @constpool_float_no_fp_args(ptr %x) nounwind {
 ; X86-LARGE-LABEL: constpool_float_no_fp_args:
 ; X86-LARGE: ## %bb.0:
 ; X86-LARGE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-LARGE-NEXT: movss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A]
+; X86-LARGE-NEXT: movss {{.*#+}} xmm0 = [1.65E+2,0.0E+0,0.0E+0,0.0E+0]
+; X86-LARGE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A]
 ; X86-LARGE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-LARGE-NEXT: ## xmm0 = mem[0],zero,zero,zero
 ; X86-LARGE-NEXT: addss (%eax), %xmm0 ## encoding: [0xf3,0x0f,0x58,0x00]
 ; X86-LARGE-NEXT: movss %xmm0, (%eax) ## encoding: [0xf3,0x0f,0x11,0x00]
 ; X86-LARGE-NEXT: retl ## encoding: [0xc3]
@@ -178,7 +178,7 @@ define void @constpool_float_no_fp_args(ptr %x) nounwind {
 define void @constpool_double_no_fp_args(ptr %x) nounwind {
 ; CHECK-LABEL: constpool_double_no_fp_args:
 ; CHECK: ## %bb.0:
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [8.4999999999999998E-1,0.0E+0]
 ; CHECK-NEXT: addsd (%rdi), %xmm0
 ; CHECK-NEXT: movsd %xmm0, (%rdi)
 ; CHECK-NEXT: retq
@@ -205,7 +205,7 @@ define void @constpool_double_no_fp_args(ptr %x) nounwind {
 ;
 ; AVX-LABEL: constpool_double_no_fp_args:
 ; AVX: ## %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [8.4999999999999998E-1,0.0E+0]
 ; AVX-NEXT: vaddsd (%rdi), %xmm0, %xmm0
 ; AVX-NEXT: vmovsd %xmm0, (%rdi)
 ; AVX-NEXT: retq
@@ -221,9 +221,9 @@ define void @constpool_double_no_fp_args(ptr %x) nounwind {
 ; X86-LARGE-LABEL: constpool_double_no_fp_args:
 ; X86-LARGE: ## %bb.0:
 ; X86-LARGE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-LARGE-NEXT: movsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ## encoding: [0xf2,0x0f,0x10,0x05,A,A,A,A]
+; X86-LARGE-NEXT: movsd {{.*#+}} xmm0 = [8.4999999999999998E-1,0.0E+0]
+; X86-LARGE-NEXT: ## encoding: [0xf2,0x0f,0x10,0x05,A,A,A,A]
 ; X86-LARGE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4
-; X86-LARGE-NEXT: ## xmm0 = mem[0],zero
 ; X86-LARGE-NEXT: addsd (%eax), %xmm0 ## encoding: [0xf2,0x0f,0x58,0x00]
 ; X86-LARGE-NEXT: movsd %xmm0, (%eax) ## encoding: [0xf2,0x0f,0x11,0x00]
 ; X86-LARGE-NEXT: retl ## encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/fdiv-combine-vec.ll b/llvm/test/CodeGen/X86/fdiv-combine-vec.ll
index 5e119207ccf1b..5c25d0214114f 100644
--- a/llvm/test/CodeGen/X86/fdiv-combine-vec.ll
+++ b/llvm/test/CodeGen/X86/fdiv-combine-vec.ll
@@ -5,7 +5,7 @@
 define <2 x double> @splat_fdiv_v2f64(<2 x double> %x, double %y) {
 ; SSE-LABEL: splat_fdiv_v2f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
 ; SSE-NEXT: divsd %xmm1, %xmm2
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm2 = xmm2[0,0]
 ; SSE-NEXT: mulpd %xmm2, %xmm0
@@ -13,7 +13,7 @@ define <2 x double> @splat_fdiv_v2f64(<2 x double> %x, double %y) {
 ;
 ; AVX-LABEL: splat_fdiv_v2f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
 ; AVX-NEXT: vdivsd %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
 ; AVX-NEXT: vmulpd %xmm1, %xmm0, %xmm0
@@ -27,7 +27,7 @@ define <2 x double> @splat_fdiv_v2f64(<2 x double> %x, double %y) {
 define <4 x double> @splat_fdiv_v4f64(<4 x double> %x, double %y) {
 ; SSE-LABEL: splat_fdiv_v4f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0]
 ; SSE-NEXT: divsd %xmm2, %xmm3
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm3 = xmm3[0,0]
 ; SSE-NEXT: mulpd %xmm3, %xmm0
@@ -36,7 +36,7 @@ define <4 x double> @splat_fdiv_v4f64(<4 x double> %x, double %y) {
 ;
 ; AVX-LABEL: splat_fdiv_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
 ; AVX-NEXT: vdivsd %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0]
 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
@@ -51,7 +51,7 @@ define <4 x double> @splat_fdiv_v4f64(<4 x double> %x, double %y) {
 define <4 x float> @splat_fdiv_v4f32(<4 x float> %x, float %y) {
 ; SSE-LABEL: splat_fdiv_v4f32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: divss %xmm1, %xmm2
 ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0,0,0]
 ; SSE-NEXT: mulps %xmm2, %xmm0
@@ -59,7 +59,7 @@ define <4 x float> @splat_fdiv_v4f32(<4 x float> %x, float %y) {
 ;
 ; AVX-LABEL: splat_fdiv_v4f32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX-NEXT: vmulps %xmm1, %xmm0, %xmm0
@@ -73,7 +73,7 @@ define <4 x float> @splat_fdiv_v4f32(<4 x float> %x, float %y) {
 define <8 x float> @splat_fdiv_v8f32(<8 x float> %x, float %y) {
 ; SSE-LABEL: splat_fdiv_v8f32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: divss %xmm2, %xmm3
 ; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,0,0,0]
 ; SSE-NEXT: mulps %xmm3, %xmm0
@@ -82,7 +82,7 @@ define <8 x float> @splat_fdiv_v8f32(<8 x float> %x, float %y) {
 ;
 ; AVX-LABEL: splat_fdiv_v8f32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1
@@ -99,7 +99,7 @@ define <4 x float> @splat_fdiv_v4f32_estimate(<4 x float> %x, float %y) #0 {
 ; SSE: # %bb.0:
 ; SSE-NEXT: rcpss %xmm1, %xmm2
 ; SSE-NEXT: mulss %xmm2, %xmm1
-; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: subss %xmm1, %xmm3
 ; SSE-NEXT: mulss %xmm2, %xmm3
 ; SSE-NEXT: addss %xmm2, %xmm3
@@ -111,7 +111,7 @@ define <4 x float> @splat_fdiv_v4f32_estimate(<4 x float> %x, float %y) #0 {
 ; AVX: # %bb.0:
 ; AVX-NEXT: vrcpss %xmm1, %xmm1, %xmm2
 ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
 ; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
@@ -129,7 +129,7 @@ define <8 x float> @splat_fdiv_v8f32_estimate(<8 x float> %x, float %y) #0 {
 ; SSE: # %bb.0:
 ; SSE-NEXT: rcpss %xmm2, %xmm3
 ; SSE-NEXT: mulss %xmm3, %xmm2
-; SSE-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm4 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: subss %xmm2, %xmm4
 ; SSE-NEXT: mulss %xmm3, %xmm4
 ; SSE-NEXT: addss %xmm3, %xmm4
@@ -142,7 +142,7 @@ define <8 x float> @splat_fdiv_v8f32_estimate(<8 x float> %x, float %y) #0 {
 ; AVX: # %bb.0:
 ; AVX-NEXT: vrcpss %xmm1, %xmm1, %xmm2
 ; AVX-NEXT: vmulss %xmm2, %xmm1, %xmm1
-; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vsubss %xmm1, %xmm3, %xmm1
 ; AVX-NEXT: vmulss %xmm1, %xmm2, %xmm1
 ; AVX-NEXT: vaddss %xmm1, %xmm2, %xmm1
diff --git a/llvm/test/CodeGen/X86/fdiv-combine.ll b/llvm/test/CodeGen/X86/fdiv-combine.ll
index a97f0891e002b..322ef6b393922 100644
--- a/llvm/test/CodeGen/X86/fdiv-combine.ll
+++ b/llvm/test/CodeGen/X86/fdiv-combine.ll
@@ -20,7 +20,7 @@ define float @div1_arcp(float %x, float %y, float %z) {
 define float @div2_arcp_all(float %x, float %y, float %z) {
 ; CHECK-LABEL: div2_arcp_all:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: divss %xmm2, %xmm3
 ; CHECK-NEXT: mulss %xmm3, %xmm0
 ; CHECK-NEXT: mulss %xmm1, %xmm0
@@ -67,7 +67,7 @@ define float @div2_arcp_partial2(float %x, float %y, float %z) {
 define float @div2_arcp_partial3(float %x, float %y, float %z) {
 ; CHECK-LABEL: div2_arcp_partial3:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: divss %xmm2, %xmm3
 ; CHECK-NEXT: mulss %xmm3, %xmm0
 ; CHECK-NEXT: mulss %xmm1, %xmm0
@@ -85,7 +85,7 @@ define float @div2_arcp_partial3(float %x, float %y, float %z) {
 define double @div3_arcp(double %x, double %y, double %z) {
 ; CHECK-LABEL: div3_arcp:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
 ; CHECK-NEXT: divsd %xmm1, %xmm2
 ; CHECK-NEXT: mulsd %xmm2, %xmm0
 ; CHECK-NEXT: addsd %xmm2, %xmm0
@@ -102,10 +102,10 @@ define float @div_select_constant_fold(i1 zeroext %arg) {
 ; CHECK-NEXT: testl %edi, %edi
 ; CHECK-NEXT: jne .LBB6_1
 ; CHECK-NEXT: # %bb.2:
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: retq
 ; CHECK-NEXT: .LBB6_1:
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [2.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: retq
 %tmp = select i1 %arg, float 5.000000e+00, float 6.000000e+00
 %B2 = fdiv nnan float %tmp, 2.000000e+00
@@ -115,7 +115,7 @@ define float @div_select_constant_fold(i1 zeroext %arg) {
 define float @div_select_constant_fold_zero(i1 zeroext %arg) {
 ; CHECK-LABEL: div_select_constant_fold_zero:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [+Inf,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: retq
 %tmp = select i1 %arg, float 5.000000e+00, float 6.000000e+00
 %B2 = fdiv float %tmp, 0.000000e+00
diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-canonical.ll b/llvm/test/CodeGen/X86/fma-intrinsics-canonical.ll
index 1de8fb15c7403..e420215121728 100644
--- a/llvm/test/CodeGen/X86/fma-intrinsics-canonical.ll
+++ b/llvm/test/CodeGen/X86/fma-intrinsics-canonical.ll
@@ -71,8 +71,8 @@ define <4 x float> @test_mm_fmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float>
 ; CHECK-FMA-WIN-LABEL: test_mm_fmadd_ss:
 ; CHECK-FMA-WIN: # %bb.0: # %entry
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -101,8 +101,8 @@ define <2 x double> @test_mm_fmadd_sd(<2 x double> %a, <2 x double> %b, <2 x dou
 ; CHECK-FMA-WIN-LABEL: test_mm_fmadd_sd:
 ; CHECK-FMA-WIN: # %bb.0: # %entry
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -183,8 +183,8 @@ define <4 x float> @test_mm_fmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float>
 ; CHECK-FMA-WIN-LABEL: test_mm_fmsub_ss:
 ; CHECK-FMA-WIN: # %bb.0: # %entry
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -214,8 +214,8 @@ define <2 x double> @test_mm_fmsub_sd(<2 x double> %a, <2 x double> %b, <2 x dou
 ; CHECK-FMA-WIN-LABEL: test_mm_fmsub_sd:
 ; CHECK-FMA-WIN: # %bb.0: # %entry
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -297,8 +297,8 @@ define <4 x float> @test_mm_fnmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float
 ; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_ss:
 ; CHECK-FMA-WIN: # %bb.0: # %entry
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -328,8 +328,8 @@ define <2 x double> @test_mm_fnmadd_sd(<2 x double> %a, <2 x double> %b, <2 x do
 ; CHECK-FMA-WIN-LABEL: test_mm_fnmadd_sd:
 ; CHECK-FMA-WIN: # %bb.0: # %entry
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -413,8 +413,8 @@ define <4 x float> @test_mm_fnmsub_ss(<4 x float> %a, <4 x float> %b, <4 x float
 ; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_ss:
 ; CHECK-FMA-WIN: # %bb.0: # %entry
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -445,8 +445,8 @@ define <2 x double> @test_mm_fnmsub_sd(<2 x double> %a, <2 x double> %b, <2 x do
 ; CHECK-FMA-WIN-LABEL: test_mm_fnmsub_sd:
 ; CHECK-FMA-WIN: # %bb.0: # %entry
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/fma-intrinsics-x86-upgrade.ll
index 726de36023ce6..7d99b6a610f67 100644
--- a/llvm/test/CodeGen/X86/fma-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/fma-intrinsics-x86-upgrade.ll
@@ -20,8 +20,8 @@ define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -47,8 +47,8 @@ define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -73,8 +73,8 @@ define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -100,8 +100,8 @@ define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -227,8 +227,8 @@ define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -254,8 +254,8 @@ define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -280,8 +280,8 @@ define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -307,8 +307,8 @@ define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -434,8 +434,8 @@ define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -461,8 +461,8 @@ define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -487,8 +487,8 @@ define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -514,8 +514,8 @@ define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double>
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -641,8 +641,8 @@ define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -668,8 +668,8 @@ define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -694,8 +694,8 @@ define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -721,8 +721,8 @@ define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double>
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll b/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll
index eb351ec81d383..94229c9370e2a 100644
--- a/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll
+++ b/llvm/test/CodeGen/X86/fma-intrinsics-x86.ll
@@ -20,8 +20,8 @@ define <4 x float> @test_x86_fma_vfmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -51,8 +51,8 @@ define <4 x float> @test_x86_fma_vfmadd_bac_ss(<4 x float> %a0, <4 x float> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x99,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -82,8 +82,8 @@ define <4 x float> @test_x86_fma_vfmadd_ss_231(<4 x float> %a0, <4 x float> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_ss_231:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%r8), %xmm0 # encoding: [0xc4,0xc1,0x78,0x28,0x00]
-; CHECK-FMA-WIN-NEXT: vmovss (%rcx), %xmm1 # encoding: [0xc5,0xfa,0x10,0x09]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc5,0xfa,0x10,0x09]
 ; CHECK-FMA-WIN-NEXT: vfmadd231ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0xb9,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm1 * mem) + xmm0
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -111,8 +111,8 @@ define <2 x double> @test_x86_fma_vfmadd_sd(<2 x double> %a0, <2 x double> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -142,8 +142,8 @@ define <2 x double> @test_x86_fma_vfmadd_bac_sd(<2 x double> %a0, <2 x double> %
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmadd_bac_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x99,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -268,8 +268,8 @@ define <4 x float> @test_x86_fma_vfmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -300,8 +300,8 @@ define <4 x float> @test_x86_fma_vfmsub_bac_ss(<4 x float> %a0, <4 x float> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9b,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -330,8 +330,8 @@ define <2 x double> @test_x86_fma_vfmsub_sd(<2 x double> %a0, <2 x double> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -362,8 +362,8 @@ define <2 x double> @test_x86_fma_vfmsub_bac_sd(<2 x double> %a0, <2 x double> %
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfmsub_bac_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9b,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = (xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -493,8 +493,8 @@ define <4 x float> @test_x86_fma_vfnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -525,8 +525,8 @@ define <4 x float> @test_x86_fma_vfnmadd_bac_ss(<4 x float> %a0, <4 x float> %a1
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmadd132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9d,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -555,8 +555,8 @@ define <2 x double> @test_x86_fma_vfnmadd_sd(<2 x double> %a0, <2 x double> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -587,8 +587,8 @@ define <2 x double> @test_x86_fma_vfnmadd_bac_sd(<2 x double> %a0, <2 x double>
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmadd_bac_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmadd132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9d,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) + xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -718,8 +718,8 @@ define <4 x float> @test_x86_fma_vfnmsub_ss(<4 x float> %a0, <4 x float> %a1, <4
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rcx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -751,8 +751,8 @@ define <4 x float> @test_x86_fma_vfnmsub_bac_ss(<4 x float> %a0, <4 x float> %a1
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_ss:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovaps (%rdx), %xmm0 # encoding: [0xc5,0xf8,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovss (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7a,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmsub132ss (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0x71,0x9f,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -782,8 +782,8 @@ define <2 x double> @test_x86_fma_vfnmsub_sd(<2 x double> %a0, <2 x double> %a1,
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rcx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x01]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rdx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x02]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
@@ -815,8 +815,8 @@ define <2 x double> @test_x86_fma_vfnmsub_bac_sd(<2 x double> %a0, <2 x double>
 ; CHECK-FMA-WIN-LABEL: test_x86_fma_vfnmsub_bac_sd:
 ; CHECK-FMA-WIN: # %bb.0:
 ; CHECK-FMA-WIN-NEXT: vmovapd (%rdx), %xmm0 # encoding: [0xc5,0xf9,0x28,0x02]
-; CHECK-FMA-WIN-NEXT: vmovsd (%r8), %xmm1 # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
-; CHECK-FMA-WIN-NEXT: # xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-FMA-WIN-NEXT: # encoding: [0xc4,0xc1,0x7b,0x10,0x08]
 ; CHECK-FMA-WIN-NEXT: vfnmsub132sd (%rcx), %xmm1, %xmm0 # encoding: [0xc4,0xe2,0xf1,0x9f,0x01]
 ; CHECK-FMA-WIN-NEXT: # xmm0 = -(xmm0 * mem) - xmm1
 ; CHECK-FMA-WIN-NEXT: retq # encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/fma-scalar-memfold.ll b/llvm/test/CodeGen/X86/fma-scalar-memfold.ll
index 508bb4b299b7e..e81d80a457928 100644
--- a/llvm/test/CodeGen/X86/fma-scalar-memfold.ll
+++ b/llvm/test/CodeGen/X86/fma-scalar-memfold.ll
@@ -17,8 +17,8 @@ declare <2 x double> @llvm.x86.fma.vfnmsub.sd(<2 x double>, <2 x double>, <2 x d
 define void @fmadd_aab_ss(ptr %a, ptr %b) {
 ; AVX2-LABEL: fmadd_aab_ss:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: # encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX2-NEXT: vfmadd213ss (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xa9,0x06]
 ; AVX2-NEXT: # xmm0 = (xmm0 * xmm0) + mem
 ; AVX2-NEXT: vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
@@ -26,8 +26,8 @@ define void @fmadd_aab_ss(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fmadd_aab_ss:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX512-NEXT: vfmadd213ss (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xa9,0x06]
 ; AVX512-NEXT: # xmm0 = (xmm0 * xmm0) + mem
 ; AVX512-NEXT: vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
@@ -54,8 +54,8 @@ define void @fmadd_aab_ss(ptr %a, ptr %b) {
 define void @fmadd_aba_ss(ptr %a, ptr %b) {
 ; AVX2-LABEL: fmadd_aba_ss:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: # encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX2-NEXT: vfmadd231ss (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xb9,0x06]
 ; AVX2-NEXT: # xmm0 = (xmm0 * mem) + xmm0
 ; AVX2-NEXT: vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
@@ -63,8 +63,8 @@ define void @fmadd_aba_ss(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fmadd_aba_ss:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX512-NEXT: vfmadd231ss (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xb9,0x06]
 ; AVX512-NEXT: # xmm0 = (xmm0 * mem) + xmm0
 ; AVX512-NEXT: vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
@@ -91,8 +91,8 @@ define void @fmadd_aba_ss(ptr %a, ptr %b) {
 define void @fmsub_aab_ss(ptr %a, ptr %b) {
 ; AVX2-LABEL: fmsub_aab_ss:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: # encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX2-NEXT: vfmsub213ss (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xab,0x06]
 ; AVX2-NEXT: # xmm0 = (xmm0 * xmm0) - mem
 ; AVX2-NEXT: vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
@@ -100,8 +100,8 @@ define void @fmsub_aab_ss(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fmsub_aab_ss:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX512-NEXT: vfmsub213ss (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xab,0x06]
 ; AVX512-NEXT: # xmm0 = (xmm0 * xmm0) - mem
 ; AVX512-NEXT: vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
@@ -128,8 +128,8 @@ define void @fmsub_aab_ss(ptr %a, ptr %b) {
 define void @fmsub_aba_ss(ptr %a, ptr %b) {
 ; AVX2-LABEL: fmsub_aba_ss:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: # encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX2-NEXT: vfmsub231ss (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xbb,0x06]
 ; AVX2-NEXT: # xmm0 = (xmm0 * mem) - xmm0
 ; AVX2-NEXT: vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
@@ -137,8 +137,8 @@ define void @fmsub_aba_ss(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fmsub_aba_ss:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX512-NEXT: vfmsub231ss (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xbb,0x06]
 ; AVX512-NEXT: # xmm0 = (xmm0 * mem) - xmm0
 ; AVX512-NEXT: vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
@@ -165,8 +165,8 @@ define void @fmsub_aba_ss(ptr %a, ptr %b) {
 define void @fnmadd_aab_ss(ptr %a, ptr %b) {
 ; AVX2-LABEL: fnmadd_aab_ss:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: # encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX2-NEXT: vfnmadd213ss (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xad,0x06]
 ; AVX2-NEXT: # xmm0 = -(xmm0 * xmm0) + mem
 ; AVX2-NEXT: vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
@@ -174,8 +174,8 @@ define void @fnmadd_aab_ss(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fnmadd_aab_ss:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX512-NEXT: vfnmadd213ss (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xad,0x06]
 ; AVX512-NEXT: # xmm0 = -(xmm0 * xmm0) + mem
 ; AVX512-NEXT: vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
@@ -202,8 +202,8 @@ define void @fnmadd_aab_ss(ptr %a, ptr %b) {
 define void @fnmadd_aba_ss(ptr %a, ptr %b) {
 ; AVX2-LABEL: fnmadd_aba_ss:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: # encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX2-NEXT: vfnmadd231ss (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xbd,0x06]
 ; AVX2-NEXT: # xmm0 = -(xmm0 * mem) + xmm0
 ; AVX2-NEXT: vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
@@ -211,8 +211,8 @@ define void @fnmadd_aba_ss(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fnmadd_aba_ss:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX512-NEXT: vfnmadd231ss (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xbd,0x06]
 ; AVX512-NEXT: # xmm0 = -(xmm0 * mem) + xmm0
 ; AVX512-NEXT: vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
@@ -239,8 +239,8 @@ define void @fnmadd_aba_ss(ptr %a, ptr %b) {
 define void @fnmsub_aab_ss(ptr %a, ptr %b) {
 ; AVX2-LABEL: fnmsub_aab_ss:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: # encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX2-NEXT: vfnmsub213ss (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xaf,0x06]
 ; AVX2-NEXT: # xmm0 = -(xmm0 * xmm0) - mem
 ; AVX2-NEXT: vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
@@ -248,8 +248,8 @@ define void @fnmsub_aab_ss(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fnmsub_aab_ss:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX512-NEXT: vfnmsub213ss (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xaf,0x06]
 ; AVX512-NEXT: # xmm0 = -(xmm0 * xmm0) - mem
 ; AVX512-NEXT: vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
@@ -276,8 +276,8 @@ define void @fnmsub_aab_ss(ptr %a, ptr %b) {
 define void @fnmsub_aba_ss(ptr %a, ptr %b) {
 ; AVX2-LABEL: fnmsub_aba_ss:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX2-NEXT: # encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX2-NEXT: vfnmsub231ss (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0x79,0xbf,0x06]
 ; AVX2-NEXT: # xmm0 = -(xmm0 * mem) - xmm0
 ; AVX2-NEXT: vmovss %xmm0, (%rdi) # encoding: [0xc5,0xfa,0x11,0x07]
@@ -285,8 +285,8 @@ define void @fnmsub_aba_ss(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fnmsub_aba_ss:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
 ; AVX512-NEXT: vfnmsub231ss (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0xbf,0x06]
 ; AVX512-NEXT: # xmm0 = -(xmm0 * mem) - xmm0
 ; AVX512-NEXT: vmovss %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x07]
@@ -313,8 +313,8 @@ define void @fnmsub_aba_ss(ptr %a, ptr %b) {
 define void @fmadd_aab_sd(ptr %a, ptr %b) {
 ; AVX2-LABEL: fmadd_aab_sd:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovsd (%rdi), %xmm0 # encoding: [0xc5,0xfb,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: # encoding: [0xc5,0xfb,0x10,0x07]
 ; AVX2-NEXT: vfmadd213sd (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0xa9,0x06]
 ; AVX2-NEXT: # xmm0 = (xmm0 * xmm0) + mem
 ; AVX2-NEXT: vmovsd %xmm0, (%rdi) # encoding: [0xc5,0xfb,0x11,0x07]
@@ -322,8 +322,8 @@ define void @fmadd_aab_sd(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fmadd_aab_sd:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
 ; AVX512-NEXT: vfmadd213sd (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xa9,0x06]
 ; AVX512-NEXT: # xmm0 = (xmm0 * xmm0) + mem
 ; AVX512-NEXT: vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07]
@@ -346,8 +346,8 @@ define void @fmadd_aab_sd(ptr %a, ptr %b) {
 define void @fmadd_aba_sd(ptr %a, ptr %b) {
 ; AVX2-LABEL: fmadd_aba_sd:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovsd (%rdi), %xmm0 # encoding: [0xc5,0xfb,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: # encoding: [0xc5,0xfb,0x10,0x07]
 ; AVX2-NEXT: vfmadd231sd (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0xb9,0x06]
 ; AVX2-NEXT: # xmm0 = (xmm0 * mem) + xmm0
 ; AVX2-NEXT: vmovsd %xmm0, (%rdi) # encoding: [0xc5,0xfb,0x11,0x07]
@@ -355,8 +355,8 @@ define void @fmadd_aba_sd(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fmadd_aba_sd:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
 ; AVX512-NEXT: vfmadd231sd (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xb9,0x06]
 ; AVX512-NEXT: # xmm0 = (xmm0 * mem) + xmm0
 ; AVX512-NEXT: vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07]
@@ -379,8 +379,8 @@ define void @fmadd_aba_sd(ptr %a, ptr %b) {
 define void @fmsub_aab_sd(ptr %a, ptr %b) {
 ; AVX2-LABEL: fmsub_aab_sd:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovsd (%rdi), %xmm0 # encoding: [0xc5,0xfb,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: # encoding: [0xc5,0xfb,0x10,0x07]
 ; AVX2-NEXT: vfmsub213sd (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0xab,0x06]
 ; AVX2-NEXT: # xmm0 = (xmm0 * xmm0) - mem
 ; AVX2-NEXT: vmovsd %xmm0, (%rdi) # encoding: [0xc5,0xfb,0x11,0x07]
@@ -388,8 +388,8 @@ define void @fmsub_aab_sd(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fmsub_aab_sd:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
 ; AVX512-NEXT: vfmsub213sd (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xab,0x06]
 ; AVX512-NEXT: # xmm0 = (xmm0 * xmm0) - mem
 ; AVX512-NEXT: vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07]
@@ -412,8 +412,8 @@ define void @fmsub_aab_sd(ptr %a, ptr %b) {
 define void @fmsub_aba_sd(ptr %a, ptr %b) {
 ; AVX2-LABEL: fmsub_aba_sd:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovsd (%rdi), %xmm0 # encoding: [0xc5,0xfb,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: # encoding: [0xc5,0xfb,0x10,0x07]
 ; AVX2-NEXT: vfmsub231sd (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0xbb,0x06]
 ; AVX2-NEXT: # xmm0 = (xmm0 * mem) - xmm0
 ; AVX2-NEXT: vmovsd %xmm0, (%rdi) # encoding: [0xc5,0xfb,0x11,0x07]
@@ -421,8 +421,8 @@ define void @fmsub_aba_sd(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fmsub_aba_sd:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
 ; AVX512-NEXT: vfmsub231sd (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xbb,0x06]
 ; AVX512-NEXT: # xmm0 = (xmm0 * mem) - xmm0
 ; AVX512-NEXT: vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07]
@@ -445,8 +445,8 @@ define void @fmsub_aba_sd(ptr %a, ptr %b) {
 define void @fnmadd_aab_sd(ptr %a, ptr %b) {
 ; AVX2-LABEL: fnmadd_aab_sd:
 ; AVX2: # %bb.0:
-; AVX2-NEXT: vmovsd (%rdi), %xmm0 # encoding: [0xc5,0xfb,0x10,0x07]
-; AVX2-NEXT: # xmm0 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX2-NEXT: # encoding: [0xc5,0xfb,0x10,0x07]
 ; AVX2-NEXT: vfnmadd213sd (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0xad,0x06]
 ; AVX2-NEXT: # xmm0 = -(xmm0 * xmm0) + mem
 ; AVX2-NEXT: vmovsd %xmm0, (%rdi) # encoding: [0xc5,0xfb,0x11,0x07]
@@ -454,8 +454,8 @@ define void @fnmadd_aab_sd(ptr %a, ptr %b) {
 ;
 ; AVX512-LABEL: fnmadd_aab_sd:
 ; AVX512: # %bb.0:
-; AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07]
-; AVX512-NEXT: # xmm0 = mem[0],zero
+; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX512-NEXT: # EVEX TO VEX Compression encoding:
[0xc5,0xfb,0x10,0x07] ; AVX512-NEXT: vfnmadd213sd (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xad,0x06] ; AVX512-NEXT: # xmm0 = -(xmm0 * xmm0) + mem ; AVX512-NEXT: vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07] @@ -478,8 +478,8 @@ define void @fnmadd_aab_sd(ptr %a, ptr %b) { define void @fnmadd_aba_sd(ptr %a, ptr %b) { ; AVX2-LABEL: fnmadd_aba_sd: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovsd (%rdi), %xmm0 # encoding: [0xc5,0xfb,0x10,0x07] -; AVX2-NEXT: # xmm0 = mem[0],zero +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: # encoding: [0xc5,0xfb,0x10,0x07] ; AVX2-NEXT: vfnmadd231sd (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0xbd,0x06] ; AVX2-NEXT: # xmm0 = -(xmm0 * mem) + xmm0 ; AVX2-NEXT: vmovsd %xmm0, (%rdi) # encoding: [0xc5,0xfb,0x11,0x07] @@ -487,8 +487,8 @@ define void @fnmadd_aba_sd(ptr %a, ptr %b) { ; ; AVX512-LABEL: fnmadd_aba_sd: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] -; AVX512-NEXT: # xmm0 = mem[0],zero +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] ; AVX512-NEXT: vfnmadd231sd (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xbd,0x06] ; AVX512-NEXT: # xmm0 = -(xmm0 * mem) + xmm0 ; AVX512-NEXT: vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07] @@ -511,8 +511,8 @@ define void @fnmadd_aba_sd(ptr %a, ptr %b) { define void @fnmsub_aab_sd(ptr %a, ptr %b) { ; AVX2-LABEL: fnmsub_aab_sd: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovsd (%rdi), %xmm0 # encoding: [0xc5,0xfb,0x10,0x07] -; AVX2-NEXT: # xmm0 = mem[0],zero +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: # encoding: [0xc5,0xfb,0x10,0x07] ; AVX2-NEXT: vfnmsub213sd (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0xaf,0x06] ; AVX2-NEXT: # xmm0 = -(xmm0 * xmm0) - mem ; AVX2-NEXT: vmovsd %xmm0, (%rdi) # encoding: [0xc5,0xfb,0x11,0x07] @@ -520,8 +520,8 @@ define void @fnmsub_aab_sd(ptr %a, ptr %b) { ; ; AVX512-LABEL: fnmsub_aab_sd: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] -; AVX512-NEXT: # xmm0 = mem[0],zero +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] ; AVX512-NEXT: vfnmsub213sd (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xaf,0x06] ; AVX512-NEXT: # xmm0 = -(xmm0 * xmm0) - mem ; AVX512-NEXT: vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07] @@ -544,8 +544,8 @@ define void @fnmsub_aab_sd(ptr %a, ptr %b) { define void @fnmsub_aba_sd(ptr %a, ptr %b) { ; AVX2-LABEL: fnmsub_aba_sd: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovsd (%rdi), %xmm0 # encoding: [0xc5,0xfb,0x10,0x07] -; AVX2-NEXT: # xmm0 = mem[0],zero +; AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX2-NEXT: # encoding: [0xc5,0xfb,0x10,0x07] ; AVX2-NEXT: vfnmsub231sd (%rsi), %xmm0, %xmm0 # encoding: [0xc4,0xe2,0xf9,0xbf,0x06] ; AVX2-NEXT: # xmm0 = -(xmm0 * mem) - xmm0 ; AVX2-NEXT: vmovsd %xmm0, (%rdi) # encoding: [0xc5,0xfb,0x11,0x07] @@ -553,8 +553,8 @@ define void @fnmsub_aba_sd(ptr %a, ptr %b) { ; ; AVX512-LABEL: fnmsub_aba_sd: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] -; AVX512-NEXT: # xmm0 = mem[0],zero +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: # EVEX TO VEX Compression encoding: 
[0xc5,0xfb,0x10,0x07] ; AVX512-NEXT: vfnmsub231sd (%rsi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xf9,0xbf,0x06] ; AVX512-NEXT: # xmm0 = -(xmm0 * mem) - xmm0 ; AVX512-NEXT: vmovsd %xmm0, (%rdi) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x07] diff --git a/llvm/test/CodeGen/X86/fma.ll b/llvm/test/CodeGen/X86/fma.ll index 03de1533e1d64..c55f50e97786a 100644 --- a/llvm/test/CodeGen/X86/fma.ll +++ b/llvm/test/CodeGen/X86/fma.ll @@ -12,10 +12,10 @@ define float @test_f32(float %a, float %b, float %c) #0 { ; FMA32-LABEL: test_f32: ; FMA32: ## %bb.0: ; FMA32-NEXT: pushl %eax ## encoding: [0x50] -; FMA32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08] -; FMA32-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMA32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] -; FMA32-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMA32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FMA32-NEXT: ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08] +; FMA32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; FMA32-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] ; FMA32-NEXT: vfmadd213ss {{[0-9]+}}(%esp), %xmm0, %xmm1 ## encoding: [0xc4,0xe2,0x79,0xa9,0x4c,0x24,0x10] ; FMA32-NEXT: ## xmm1 = (xmm0 * xmm1) + mem ; FMA32-NEXT: vmovss %xmm1, (%esp) ## encoding: [0xc5,0xfa,0x11,0x0c,0x24] @@ -60,10 +60,10 @@ define float @test_f32_reassoc(float %a, float %b, float %c) #0 { ; FMA32-LABEL: test_f32_reassoc: ; FMA32: ## %bb.0: ; FMA32-NEXT: pushl %eax ## encoding: [0x50] -; FMA32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08] -; FMA32-NEXT: ## xmm0 = mem[0],zero,zero,zero -; FMA32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] -; FMA32-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMA32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FMA32-NEXT: ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08] +; FMA32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; FMA32-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] ; FMA32-NEXT: vfmadd213ss {{[0-9]+}}(%esp), %xmm0, %xmm1 ## encoding: [0xc4,0xe2,0x79,0xa9,0x4c,0x24,0x10] ; FMA32-NEXT: ## xmm1 = (xmm0 * xmm1) + mem ; FMA32-NEXT: vmovss %xmm1, (%esp) ## encoding: [0xc5,0xfa,0x11,0x0c,0x24] @@ -74,8 +74,8 @@ define float @test_f32_reassoc(float %a, float %b, float %c) #0 { ; FMACALL32-LABEL: test_f32_reassoc: ; FMACALL32: ## %bb.0: ; FMACALL32-NEXT: pushl %eax ## encoding: [0x50] -; FMACALL32-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08] -; FMACALL32-NEXT: ## xmm0 = mem[0],zero,zero,zero +; FMACALL32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FMACALL32-NEXT: ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08] ; FMACALL32-NEXT: vmulss {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x59,0x44,0x24,0x0c] ; FMACALL32-NEXT: vaddss {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x58,0x44,0x24,0x10] ; FMACALL32-NEXT: vmovss %xmm0, (%esp) ## encoding: [0xc5,0xfa,0x11,0x04,0x24] @@ -114,10 +114,10 @@ define double @test_f64(double %a, double %b, double %c) #0 { ; FMA32-LABEL: test_f64: ; FMA32: ## %bb.0: ## %entry ; FMA32-NEXT: subl $12, %esp ## encoding: [0x83,0xec,0x0c] -; FMA32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x10] -; FMA32-NEXT: ## xmm0 = mem[0],zero -; FMA32-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x18] -; FMA32-NEXT: ## xmm1 = mem[0],zero +; FMA32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; 
FMA32-NEXT: ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x10] +; FMA32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; FMA32-NEXT: ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x18] ; FMA32-NEXT: vfmadd213sd {{[0-9]+}}(%esp), %xmm0, %xmm1 ## encoding: [0xc4,0xe2,0xf9,0xa9,0x4c,0x24,0x20] ; FMA32-NEXT: ## xmm1 = (xmm0 * xmm1) + mem ; FMA32-NEXT: vmovsd %xmm1, (%esp) ## encoding: [0xc5,0xfb,0x11,0x0c,0x24] @@ -263,30 +263,30 @@ define float @test_f32_cst() #0 { ; ; FMA64-LABEL: test_f32_cst: ; FMA64: ## %bb.0: ## %entry -; FMA64-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] +; FMA64-NEXT: vmovss {{.*#+}} xmm0 = [1.2E+1,0.0E+0,0.0E+0,0.0E+0] +; FMA64-NEXT: ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] ; FMA64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; FMA64-NEXT: ## xmm0 = mem[0],zero,zero,zero ; FMA64-NEXT: retq ## encoding: [0xc3] ; ; FMACALL64-LABEL: test_f32_cst: ; FMACALL64: ## %bb.0: ## %entry -; FMACALL64-NEXT: movss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A] +; FMACALL64-NEXT: movss {{.*#+}} xmm0 = [1.2E+1,0.0E+0,0.0E+0,0.0E+0] +; FMACALL64-NEXT: ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A] ; FMACALL64-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; FMACALL64-NEXT: ## xmm0 = mem[0],zero,zero,zero ; FMACALL64-NEXT: retq ## encoding: [0xc3] ; ; AVX512-LABEL: test_f32_cst: ; AVX512: ## %bb.0: ## %entry -; AVX512-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = [1.2E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] ; AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512-NEXT: ## xmm0 = mem[0],zero,zero,zero ; AVX512-NEXT: retq ## encoding: [0xc3] ; ; AVX512VL-LABEL: test_f32_cst: ; AVX512VL: ## %bb.0: ## %entry -; AVX512VL-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] +; AVX512VL-NEXT: vmovss {{.*#+}} xmm0 = [1.2E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX512VL-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] ; AVX512VL-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; AVX512VL-NEXT: ## xmm0 = mem[0],zero,zero,zero ; AVX512VL-NEXT: retq ## encoding: [0xc3] entry: %call = call float @llvm.fma.f32(float 3.0, float 3.0, float 3.0) @@ -450,8 +450,8 @@ define <4 x float> @test_v4f32(<4 x float> %a, <4 x float> %b, <4 x float> %c) # ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x18,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] ; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x14,0x20] @@ -778,12 +778,12 @@ define <8 x 
float> @test_v8f32(<8 x float> %a, <8 x float> %b, <8 x float> %c) # ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x10] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x28,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c] -; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x1c] ; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x18,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] ; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x24,0x20] @@ -1382,16 +1382,16 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: calll _fmaf ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fmaf-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstps {{[0-9]+}}(%esp) ## encoding: [0xd9,0x5c,0x24,0x40] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfa,0x10,0x44,0x24,0x3c] ; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x38,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0],mem[0],xmm0[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] -; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x2c] ; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x28,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x4c] -; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfa,0x10,0x54,0x24,0x4c] ; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x48,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0],mem[0],xmm2[2,3] ; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0x44,0x24,0x34,0x20] @@ -1407,8 +1407,8 @@ define <16 x float> @test_v16f32(<16 x float> %a, <16 x float> %b, <16 x float> ; FMACALL32_BDVER2-NEXT: vinsertps $48, {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc4,0xe3,0x69,0x21,0x54,0x24,0x40,0x30] ; 
FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1,2],mem[0] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c] -; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x5c] ; FMACALL32_BDVER2-NEXT: vinsertps $16, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x58,0x10] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0],mem[0],xmm1[2,3] ; FMACALL32_BDVER2-NEXT: vinsertps $32, {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc4,0xe3,0x71,0x21,0x4c,0x24,0x54,0x20] @@ -1512,8 +1512,8 @@ define <2 x double> @test_v2f64(<2 x double> %a, <2 x double> %b, <2 x double> % ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x28] ; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x20] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: addl $108, %esp ## encoding: [0x83,0xc4,0x6c] @@ -1733,10 +1733,10 @@ define <4 x double> @test_v4f64(<4 x double> %a, <4 x double> %b, <4 x double> % ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x18] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x30] +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x28] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x18] @@ -1928,8 +1928,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: ## imm = 0x160 ; FMACALL32_BDVER2-NEXT: vmovaps %ymm0, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x84,0x24,0xe0,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: vmovsd 56(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x38] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x45,0x38] ; FMACALL32_BDVER2-NEXT: vmovaps %ymm2, {{[-0-9]+}}(%e{{[sb]}}p) ## 32-byte Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfc,0x29,0x94,0x24,0x00,0x01,0x00,0x00] ; FMACALL32_BDVER2-NEXT: 
vextractf128 $1, %ymm3, %xmm2 ## encoding: [0xc4,0xe3,0x7d,0x19,0xda,0x01] @@ -1949,8 +1949,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovsd 48(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x30] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x45,0x30] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x94,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] @@ -1963,8 +1963,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovsd 40(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x28] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x45,0x28] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0x88,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] @@ -1977,8 +1977,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovsd 32(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x20] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x45,0x20] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xc0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] @@ -1998,8 +1998,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovsd 24(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x18] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x45,0x18] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0xbc,0x24,0xa0,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] @@ -2011,8 +2011,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vmovups %xmm0, 
(%esp) ## encoding: [0xc5,0xf8,0x11,0x04,0x24] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovsd 16(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x10] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x45,0x10] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x30] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] @@ -2025,8 +2025,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovsd 8(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x08] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x45,0x08] ; FMACALL32_BDVER2-NEXT: fstpt {{[-0-9]+}}(%e{{[sb]}}p) ## 10-byte Folded Spill ; FMACALL32_BDVER2-NEXT: ## encoding: [0xdb,0x7c,0x24,0x20] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] @@ -2039,8 +2039,8 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: vzeroupper ## encoding: [0xc5,0xf8,0x77] ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 -; FMACALL32_BDVER2-NEXT: vmovsd 64(%ebp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x45,0x40] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x45,0x40] ; FMACALL32_BDVER2-NEXT: vmovsd %xmm0, {{[0-9]+}}(%esp) ## encoding: [0xc5,0xfb,0x11,0x44,0x24,0x10] ; FMACALL32_BDVER2-NEXT: vmovaps {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 ## 16-byte Reload ; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xf8,0x28,0x84,0x24,0x30,0x01,0x00,0x00] @@ -2070,21 +2070,21 @@ define <8 x double> @test_v8f64(<8 x double> %a, <8 x double> %b, <8 x double> % ; FMACALL32_BDVER2-NEXT: calll _fma ## encoding: [0xe8,A,A,A,A] ; FMACALL32_BDVER2-NEXT: ## fixup A - offset: 1, value: _fma-4, kind: FK_PCRel_4 ; FMACALL32_BDVER2-NEXT: fstpl {{[0-9]+}}(%esp) ## encoding: [0xdd,0x5c,0x24,0x68] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60] -; FMACALL32_BDVER2-NEXT: ## xmm0 = mem[0],zero -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50] -; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x44,0x24,0x60] +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x50] ; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x16,0x44,0x24,0x58] ; FMACALL32_BDVER2-NEXT: ## xmm0 = xmm0[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x48] ; 
FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm2 ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70] -; FMACALL32_BDVER2-NEXT: ## xmm2 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x54,0x24,0x70] ; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm2, %xmm2 ## encoding: [0xc5,0xe8,0x16,0x54,0x24,0x68] ; FMACALL32_BDVER2-NEXT: ## xmm2 = xmm2[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ## encoding: [0xc4,0xe3,0x7d,0x18,0xc1,0x01] -; FMACALL32_BDVER2-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] -; FMACALL32_BDVER2-NEXT: ## xmm1 = mem[0],zero +; FMACALL32_BDVER2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; FMACALL32_BDVER2-NEXT: ## encoding: [0xc5,0xfb,0x10,0x8c,0x24,0x80,0x00,0x00,0x00] ; FMACALL32_BDVER2-NEXT: vmovhps {{[0-9]+}}(%esp), %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x16,0x4c,0x24,0x78] ; FMACALL32_BDVER2-NEXT: ## xmm1 = xmm1[0,1],mem[0,1] ; FMACALL32_BDVER2-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ## encoding: [0xc4,0xe3,0x75,0x18,0xca,0x01] diff --git a/llvm/test/CodeGen/X86/fma_patterns.ll b/llvm/test/CodeGen/X86/fma_patterns.ll index aa99672b8fc6a..0ffcb8c46cef9 100644 --- a/llvm/test/CodeGen/X86/fma_patterns.ll +++ b/llvm/test/CodeGen/X86/fma_patterns.ll @@ -1269,7 +1269,7 @@ define <4 x float> @test_v4f32_mul_y_sub_x_negone_undefs(<4 x float> %x, <4 x fl define float @test_f32_interp(float %x, float %y, float %t) { ; FMA-INFS-LABEL: test_f32_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; FMA-INFS-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; FMA-INFS-NEXT: vsubss %xmm2, %xmm3, %xmm3 ; FMA-INFS-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; FMA-INFS-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 @@ -1277,7 +1277,7 @@ define float @test_f32_interp(float %x, float %y, float %t) { ; ; FMA4-INFS-LABEL: test_f32_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; FMA4-INFS-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; FMA4-INFS-NEXT: vsubss %xmm2, %xmm3, %xmm3 ; FMA4-INFS-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; FMA4-INFS-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 @@ -1285,7 +1285,7 @@ define float @test_f32_interp(float %x, float %y, float %t) { ; ; AVX512-INFS-LABEL: test_f32_interp: ; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX512-INFS-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX512-INFS-NEXT: vsubss %xmm2, %xmm3, %xmm3 ; AVX512-INFS-NEXT: vmulss %xmm3, %xmm1, %xmm1 ; AVX512-INFS-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 @@ -1416,7 +1416,7 @@ define <8 x float> @test_v8f32_interp(<8 x float> %x, <8 x float> %y, <8 x float define double @test_f64_interp(double %x, double %y, double %t) { ; FMA-INFS-LABEL: test_f64_interp: ; FMA-INFS: # %bb.0: -; FMA-INFS-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; FMA-INFS-NEXT: vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0] ; FMA-INFS-NEXT: vsubsd %xmm2, %xmm3, %xmm3 ; FMA-INFS-NEXT: vmulsd %xmm3, %xmm1, %xmm1 ; FMA-INFS-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 @@ -1424,7 +1424,7 @@ define double @test_f64_interp(double %x, double %y, double %t) { ; ; FMA4-INFS-LABEL: test_f64_interp: ; FMA4-INFS: # %bb.0: -; FMA4-INFS-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; FMA4-INFS-NEXT: vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0] ; 
FMA4-INFS-NEXT: vsubsd %xmm2, %xmm3, %xmm3 ; FMA4-INFS-NEXT: vmulsd %xmm3, %xmm1, %xmm1 ; FMA4-INFS-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm2) + xmm1 @@ -1432,7 +1432,7 @@ define double @test_f64_interp(double %x, double %y, double %t) { ; ; AVX512-INFS-LABEL: test_f64_interp: ; AVX512-INFS: # %bb.0: -; AVX512-INFS-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX512-INFS-NEXT: vmovsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0] ; AVX512-INFS-NEXT: vsubsd %xmm2, %xmm3, %xmm3 ; AVX512-INFS-NEXT: vmulsd %xmm3, %xmm1, %xmm1 ; AVX512-INFS-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm2 * xmm0) + xmm1 diff --git a/llvm/test/CodeGen/X86/fmf-flags.ll b/llvm/test/CodeGen/X86/fmf-flags.ll index daaf643201e92..24dabfc18b9e3 100644 --- a/llvm/test/CodeGen/X86/fmf-flags.ll +++ b/llvm/test/CodeGen/X86/fmf-flags.ll @@ -51,7 +51,7 @@ define dso_local float @fast_fmuladd_opts(float %a , float %b , float %c) { define dso_local double @not_so_fast_mul_add(double %x) { ; X64-LABEL: not_so_fast_mul_add: ; X64: # %bb.0: -; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm1 = [4.2000000000000002E+0,0.0E+0] ; X64-NEXT: mulsd %xmm0, %xmm1 ; X64-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; X64-NEXT: movsd %xmm1, mul1(%rip) diff --git a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll index b927f92897a20..55502f37da1a6 100644 --- a/llvm/test/CodeGen/X86/fminimum-fmaximum.ll +++ b/llvm/test/CodeGen/X86/fminimum-fmaximum.ll @@ -119,12 +119,12 @@ define <4 x float> @test_fmaximum_scalarize(<4 x float> %x, <4 x float> %y) "no- define float @test_fmaximum_nan0(float %x, float %y) { ; SSE2-LABEL: test_fmaximum_nan0: ; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fmaximum_nan0: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: retq ; ; X86-LABEL: test_fmaximum_nan0: @@ -138,12 +138,12 @@ define float @test_fmaximum_nan0(float %x, float %y) { define float @test_fmaximum_nan1(float %x, float %y) { ; SSE2-LABEL: test_fmaximum_nan1: ; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fmaximum_nan1: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: retq ; ; X86-LABEL: test_fmaximum_nan1: @@ -608,12 +608,12 @@ define <2 x double> @test_fminimum_scalarize(<2 x double> %x, <2 x double> %y) " define float @test_fminimum_nan0(float %x, float %y) { ; SSE2-LABEL: test_fminimum_nan0: ; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimum_nan0: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: retq ; ; X86-LABEL: test_fminimum_nan0: @@ -627,12 +627,12 @@ define float @test_fminimum_nan0(float %x, float %y) { define float @test_fminimum_nan1(float %x, float %y) { ; SSE2-LABEL: test_fminimum_nan1: ; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimum_nan1: ; AVX: # %bb.0: -; AVX-NEXT: vmovss 
{{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: retq ; ; X86-LABEL: test_fminimum_nan1: @@ -816,12 +816,12 @@ define double @test_fminimum_zero1(double %x, double %y) nounwind { define double @test_fminimum_zero2(double %x, double %y) { ; SSE2-LABEL: test_fminimum_zero2: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] ; SSE2-NEXT: retq ; ; AVX-LABEL: test_fminimum_zero2: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] ; AVX-NEXT: retq ; ; X86-LABEL: test_fminimum_zero2: @@ -1187,7 +1187,7 @@ define <4 x float> @test_fmaximum_vector_non_zero(<4 x float> %x) { define <2 x double> @test_fminimum_vector_nan(<2 x double> %x) { ; SSE2-LABEL: test_fminimum_vector_nan: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: movsd {{.*#+}} xmm2 = [NaN,0.0E+0] ; SSE2-NEXT: xorpd %xmm1, %xmm1 ; SSE2-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] ; SSE2-NEXT: minpd %xmm0, %xmm1 @@ -1197,7 +1197,7 @@ define <2 x double> @test_fminimum_vector_nan(<2 x double> %x) { ; ; AVX-LABEL: test_fminimum_vector_nan: ; AVX: # %bb.0: -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [NaN,0.0E+0] ; AVX-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; AVX-NEXT: vminpd %xmm0, %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index 063182fcecf3e..abaefaee33ed6 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -49,26 +49,26 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3] ; CHECK-SSE-NEXT: movd %xmm1, %edi -; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3] ; CHECK-SSE-NEXT: movd %xmm0, %edi -; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; CHECK-SSE-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-SSE-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-SSE-NEXT: movd %xmm0, %edi -; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; CHECK-SSE-NEXT: # xmm0 = mem[1,1,1,1] ; CHECK-SSE-NEXT: movd %xmm0, %edi -; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; CHECK-SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-SSE-NEXT: unpcklps {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -85,26 +85,26 @@ define <4 x float> @fmul_pow2_ldexp_4xfloat(<4 x i32> %i) { ; CHECK-AVX-NEXT: .cfi_def_cfa_offset 48 ; CHECK-AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX-NEXT: vextractps $1, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX-NEXT: callq ldexpf@PLT ; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-AVX-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-AVX-NEXT: vmovd %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX-NEXT: callq ldexpf@PLT ; CHECK-AVX-NEXT: vinsertps $16, (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; CHECK-AVX-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[2,3] ; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-AVX-NEXT: vextractps $2, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX-NEXT: callq ldexpf@PLT ; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] ; CHECK-AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-AVX-NEXT: vmovaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-AVX-NEXT: vextractps $3, %xmm0, %edi -; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX-NEXT: vmovss {{.*#+}} xmm0 = [9.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX-NEXT: callq ldexpf@PLT ; CHECK-AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] @@ -407,13 +407,13 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 80 ; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-SSE-NEXT: pextrw $7, %xmm0, %edi -; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-SSE-NEXT: pextrw $6, %xmm0, %edi -; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -421,13 +421,13 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-SSE-NEXT: pextrw $5, %xmm0, %edi -; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-SSE-NEXT: pextrw $4, %xmm0, %edi -; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; 
CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -437,13 +437,13 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-SSE-NEXT: pextrw $3, %xmm0, %edi -; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-SSE-NEXT: pextrw $2, %xmm0, %edi -; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -451,14 +451,14 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-SSE-NEXT: pextrw $1, %xmm0, %edi -; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-SSE-NEXT: movd %xmm0, %eax ; CHECK-SSE-NEXT: movzwl %ax, %edi -; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-SSE-NEXT: movd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-SSE-NEXT: callq ldexpf@PLT ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -477,13 +477,13 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-NEXT: .cfi_def_cfa_offset 80 ; CHECK-AVX2-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-AVX2-NEXT: vpextrw $7, %xmm0, %edi -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: callq ldexpf@PLT ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-AVX2-NEXT: vpextrw $6, %xmm0, %edi -; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: callq ldexpf@PLT ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -491,13 +491,13 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-AVX2-NEXT: vpextrw $5, %xmm0, %edi -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: callq ldexpf@PLT ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-AVX2-NEXT: 
vpextrw $4, %xmm0, %edi -; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: callq ldexpf@PLT ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -507,13 +507,13 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-AVX2-NEXT: vpextrw $3, %xmm0, %edi -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: callq ldexpf@PLT ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-AVX2-NEXT: vpextrw $2, %xmm0, %edi -; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: callq ldexpf@PLT ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -521,14 +521,14 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX2-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-AVX2-NEXT: vpextrw $1, %xmm0, %edi -; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: callq ldexpf@PLT ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX2-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-AVX2-NEXT: vmovd %xmm0, %eax ; CHECK-AVX2-NEXT: movzwl %ax, %edi -; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX2-NEXT: vmovd {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX2-NEXT: callq ldexpf@PLT ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT ; CHECK-AVX2-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -547,7 +547,7 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX512F-NEXT: .cfi_def_cfa_offset 80 ; CHECK-AVX512F-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; CHECK-AVX512F-NEXT: vpextrw $7, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX512F-NEXT: callq ldexpf@PLT ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax @@ -555,7 +555,7 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; CHECK-AVX512F-NEXT: vpextrw $6, %xmm0, %edi -; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0] ; CHECK-AVX512F-NEXT: callq ldexpf@PLT ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax @@ -565,7 +565,7 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) { ; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; 
CHECK-AVX512F-NEXT: vpextrw $5, %xmm0, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX512F-NEXT: callq ldexpf@PLT
 ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
@@ -573,7 +573,7 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-AVX512F-NEXT: vpextrw $4, %xmm0, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX512F-NEXT: callq ldexpf@PLT
 ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
@@ -585,7 +585,7 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-AVX512F-NEXT: vpextrw $3, %xmm0, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX512F-NEXT: callq ldexpf@PLT
 ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
@@ -593,7 +593,7 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-AVX512F-NEXT: vpextrw $2, %xmm0, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX512F-NEXT: callq ldexpf@PLT
 ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
@@ -603,7 +603,7 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; CHECK-AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-AVX512F-NEXT: vpextrw $1, %xmm0, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX512F-NEXT: callq ldexpf@PLT
 ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
@@ -612,7 +612,7 @@ define <8 x half> @fmul_pow2_ldexp_8xhalf(<8 x i16> %i) {
 ; CHECK-AVX512F-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload
 ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
 ; CHECK-AVX512F-NEXT: movzwl %ax, %edi
-; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX512F-NEXT: callq ldexpf@PLT
 ; CHECK-AVX512F-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-AVX512F-NEXT: vmovd %xmm0, %eax
@@ -1270,7 +1270,7 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
 ; CHECK-SSE-NEXT: addss %xmm1, %xmm1
 ; CHECK-SSE-NEXT: .LBB22_3:
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-SSE-NEXT: divss %xmm1, %xmm0
 ; CHECK-SSE-NEXT: retq
 ;
@@ -1290,7 +1290,7 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT: .LBB22_3:
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-AVX2-NEXT: retq
 ;
@@ -1301,7 +1301,7 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
 ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
 ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-NO-FASTFMA-NEXT: retq
 ;
@@ -1310,7 +1310,7 @@ define float @fdiv_pow_shl_cnt_fail_maybe_z(i64 %cnt) nounwind {
 ; CHECK-FMA-NEXT: movl $8, %eax
 ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
 ; CHECK-FMA-NEXT: vcvtusi2ss %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-FMA-NEXT: retq
 %shl = shl i64 8, %cnt
@@ -1327,7 +1327,7 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
 ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
 ; CHECK-SSE-NEXT: shlq %cl, %rax
 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-SSE-NEXT: divss %xmm1, %xmm0
 ; CHECK-SSE-NEXT: retq
 ;
@@ -1338,7 +1338,7 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
 ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
 ; CHECK-AVX2-NEXT: shlq %cl, %rax
 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-AVX2-NEXT: retq
 ;
@@ -1349,7 +1349,7 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
 ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
 ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-NO-FASTFMA-NEXT: retq
 ;
@@ -1358,7 +1358,7 @@ define float @fdiv_pow_shl_cnt_fail_neg_int(i64 %cnt) nounwind {
 ; CHECK-FMA-NEXT: movl $8, %eax
 ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
 ; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-9.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-FMA-NEXT: retq
 %shl = shl i64 8, %cnt
@@ -1376,7 +1376,7 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
 ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $rcx
 ; CHECK-SSE-NEXT: shlq %cl, %rax
 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-SSE-NEXT: divss %xmm1, %xmm0
 ; CHECK-SSE-NEXT: retq
 ;
@@ -1388,7 +1388,7 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
 ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $rcx
 ; CHECK-AVX2-NEXT: shlq %cl, %rax
 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-AVX2-NEXT: retq
 ;
@@ -1400,7 +1400,7 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
 ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $rcx
 ; CHECK-NO-FASTFMA-NEXT: shlq %cl, %rax
 ; CHECK-NO-FASTFMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-NO-FASTFMA-NEXT: retq
 ;
@@ -1410,7 +1410,7 @@ define float @fdiv_pow_shl_cnt(i64 %cnt_in) nounwind {
 ; CHECK-FMA-NEXT: movl $8, %eax
 ; CHECK-FMA-NEXT: shlxq %rdi, %rax, %rax
 ; CHECK-FMA-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-FMA-NEXT: retq
 %cnt = and i64 %cnt_in, 31
@@ -1431,7 +1431,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm0
 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
 ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-SSE-NEXT: divss %xmm0, %xmm1
 ; CHECK-SSE-NEXT: movaps %xmm1, %xmm0
 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
@@ -1448,7 +1448,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
 ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
 ; CHECK-AVX2-NEXT: popq %rax
@@ -1464,7 +1464,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax
@@ -1479,7 +1479,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bounds(i32 %cnt) nounwind {
 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [8.192E+3,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-FMA-NEXT: vmovd %xmm0, %eax
@@ -1547,7 +1547,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; CHECK-SSE-NEXT: cvtsi2ss %eax, %xmm0
 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
 ; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-SSE-NEXT: movss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-SSE-NEXT: divss %xmm0, %xmm1
 ; CHECK-SSE-NEXT: movaps %xmm1, %xmm0
 ; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
@@ -1565,7 +1565,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; CHECK-AVX2-NEXT: vcvtsi2ss %eax, %xmm0, %xmm0
 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
 ; CHECK-AVX2-NEXT: callq __extendhfsf2@PLT
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-AVX2-NEXT: callq __truncsfhf2@PLT
 ; CHECK-AVX2-NEXT: popq %rax
@@ -1582,7 +1582,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; CHECK-NO-FASTFMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-NO-FASTFMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-NO-FASTFMA-NEXT: vmovd %xmm0, %eax
@@ -1598,7 +1598,7 @@ define half @fdiv_pow_shl_cnt_fail_out_of_bound2(i16 %cnt) nounwind {
 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-FMA-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero
 ; CHECK-FMA-NEXT: vcvtph2ps %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-FMA-NEXT: vcvtps2ph $4, %xmm0, %xmm0
 ; CHECK-FMA-NEXT: vmovd %xmm0, %eax
@@ -1642,7 +1642,7 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
 ; CHECK-SSE-NEXT: # kill: def $cl killed $cl killed $ecx
 ; CHECK-SSE-NEXT: shll %cl, %eax
 ; CHECK-SSE-NEXT: cvtsi2ss %rax, %xmm1
-; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-SSE-NEXT: movss {{.*#+}} xmm0 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-SSE-NEXT: divss %xmm1, %xmm0
 ; CHECK-SSE-NEXT: retq
 ;
@@ -1653,7 +1653,7 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
 ; CHECK-AVX2-NEXT: # kill: def $cl killed $cl killed $ecx
 ; CHECK-AVX2-NEXT: shll %cl, %eax
 ; CHECK-AVX2-NEXT: vcvtsi2ss %rax, %xmm0, %xmm0
-; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-AVX2-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-AVX2-NEXT: retq
 ;
@@ -1664,7 +1664,7 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
 ; CHECK-NO-FASTFMA-NEXT: # kill: def $cl killed $cl killed $ecx
 ; CHECK-NO-FASTFMA-NEXT: shll %cl, %eax
 ; CHECK-NO-FASTFMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
-; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NO-FASTFMA-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NO-FASTFMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-NO-FASTFMA-NEXT: retq
 ;
@@ -1673,7 +1673,7 @@ define float @fdiv_pow_shl_cnt32_out_of_bounds2(i32 %cnt) nounwind {
 ; CHECK-FMA-NEXT: movl $1, %eax
 ; CHECK-FMA-NEXT: shlxl %edi, %eax, %eax
 ; CHECK-FMA-NEXT: vcvtusi2ss %eax, %xmm0, %xmm0
-; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-FMA-NEXT: vmovss {{.*#+}} xmm1 = [1.00974148E-28,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-FMA-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; CHECK-FMA-NEXT: retq
 %shl = shl nuw i32 1, %cnt
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
index 2253d7cbaf8b6..71d49481ebb8e 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics-fma.ll
@@ -385,7 +385,7 @@ entry:
 define float @f15() #0 {
 ; NOFMA-LABEL: f15:
 ; NOFMA: # %bb.0: # %entry
-; NOFMA-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; NOFMA-NEXT: movss {{.*#+}} xmm1 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; NOFMA-NEXT: movaps %xmm1, %xmm0
 ; NOFMA-NEXT: mulss %xmm1, %xmm0
 ; NOFMA-NEXT: addss %xmm1, %xmm0
@@ -393,13 +393,13 @@ define float @f15() #0 {
 ;
 ; FMA-LABEL: f15:
 ; FMA: # %bb.0: # %entry
-; FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FMA-NEXT: vmovss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
 ; FMA-NEXT: retq
 ;
 ; FMA4-LABEL: f15:
 ; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FMA4-NEXT: vmovss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
 ; FMA4-NEXT: retq
 entry:
@@ -417,7 +417,7 @@ entry:
 define double @f16() #0 {
 ; NOFMA-LABEL: f16:
 ; NOFMA: # %bb.0: # %entry
-; NOFMA-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; NOFMA-NEXT: movsd {{.*#+}} xmm1 = [4.2100000000000001E+1,0.0E+0]
 ; NOFMA-NEXT: movapd %xmm1, %xmm0
 ; NOFMA-NEXT: mulsd %xmm1, %xmm0
 ; NOFMA-NEXT: addsd %xmm1, %xmm0
@@ -425,13 +425,13 @@ define double @f16() #0 {
 ;
 ; FMA-LABEL: f16:
 ; FMA: # %bb.0: # %entry
-; FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FMA-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
 ; FMA-NEXT: retq
 ;
 ; FMA4-LABEL: f16:
 ; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
 ; FMA4-NEXT: retq
 entry:
@@ -451,7 +451,7 @@ define float @f17() #0 {
 ; NOFMA: # %bb.0: # %entry
 ; NOFMA-NEXT: pushq %rax
 ; NOFMA-NEXT: .cfi_def_cfa_offset 16
-; NOFMA-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; NOFMA-NEXT: movss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; NOFMA-NEXT: movaps %xmm0, %xmm1
 ; NOFMA-NEXT: movaps %xmm0, %xmm2
 ; NOFMA-NEXT: callq fmaf@PLT
@@ -461,13 +461,13 @@ define float @f17() #0 {
 ;
 ; FMA-LABEL: f17:
 ; FMA: # %bb.0: # %entry
-; FMA-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FMA-NEXT: vmovss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; FMA-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
 ; FMA-NEXT: retq
 ;
 ; FMA4-LABEL: f17:
 ; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; FMA4-NEXT: vmovss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; FMA4-NEXT: vfmaddss {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
 ; FMA4-NEXT: retq
 entry:
@@ -487,7 +487,7 @@ define double @f18() #0 {
 ; NOFMA: # %bb.0: # %entry
 ; NOFMA-NEXT: pushq %rax
 ; NOFMA-NEXT: .cfi_def_cfa_offset 16
-; NOFMA-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; NOFMA-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; NOFMA-NEXT: movaps %xmm0, %xmm1
 ; NOFMA-NEXT: movaps %xmm0, %xmm2
 ; NOFMA-NEXT: callq fma@PLT
@@ -497,13 +497,13 @@ define double @f18() #0 {
 ;
 ; FMA-LABEL: f18:
 ; FMA: # %bb.0: # %entry
-; FMA-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FMA-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; FMA-NEXT: vfmadd213sd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
 ; FMA-NEXT: retq
 ;
 ; FMA4-LABEL: f18:
 ; FMA4: # %bb.0: # %entry
-; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; FMA4-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; FMA4-NEXT: vfmaddsd {{.*#+}} xmm0 = (xmm0 * xmm0) + xmm0
 ; FMA4-NEXT: retq
 entry:
diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll
index 5f77e2cb46cbf..d2b45ee1e03e6 100644
--- a/llvm/test/CodeGen/X86/fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll
@@ -26,7 +26,7 @@ define double @f1() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
 ; X86-SSE-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: fldl (%esp)
@@ -37,13 +37,13 @@ define double @f1() #0 {
 ;
 ; SSE-LABEL: f1:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
 ; SSE-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: f1:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
 ; AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry:
@@ -130,7 +130,7 @@ define double @f3(double %a, double %b) #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
 ; X86-SSE-NEXT: movapd %xmm0, %xmm1
 ; X86-SSE-NEXT: subsd {{[0-9]+}}(%esp), %xmm1
 ; X86-SSE-NEXT: mulsd {{[0-9]+}}(%esp), %xmm1
@@ -144,7 +144,7 @@ define double @f3(double %a, double %b) #0 {
 ;
 ; SSE-LABEL: f3:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0]
 ; SSE-NEXT: movapd %xmm2, %xmm3
 ; SSE-NEXT: subsd %xmm0, %xmm3
 ; SSE-NEXT: mulsd %xmm1, %xmm3
@@ -154,7 +154,7 @@ define double @f3(double %a, double %b) #0 {
 ;
 ; AVX-LABEL: f3:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0]
 ; AVX-NEXT: vsubsd %xmm0, %xmm2, %xmm0
 ; AVX-NEXT: vmulsd %xmm1, %xmm0, %xmm0
 ; AVX-NEXT: vsubsd %xmm0, %xmm2, %xmm0
@@ -264,7 +264,7 @@ define double @f5() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-SSE-NEXT: sqrtsd %xmm0, %xmm0
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: fldl (%esp)
@@ -275,13 +275,13 @@ define double @f5() #0 {
 ;
 ; SSE-LABEL: f5:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: sqrtsd %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: f5:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry:
@@ -311,9 +311,9 @@ define double @f6() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $28, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 32
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: calll pow
 ; X86-SSE-NEXT: addl $28, %esp
@@ -324,8 +324,8 @@ define double @f6() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0]
 ; SSE-NEXT: callq pow@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
@@ -335,8 +335,8 @@ define double @f6() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0]
 ; AVX-NEXT: callq pow@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -368,7 +368,7 @@ define double @f7() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: movl $3, {{[0-9]+}}(%esp)
 ; X86-SSE-NEXT: calll __powidf2
@@ -380,7 +380,7 @@ define double @f7() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; SSE-NEXT: movl $3, %edi
 ; SSE-NEXT: callq __powidf2@PLT
 ; SSE-NEXT: popq %rax
@@ -391,7 +391,7 @@ define double @f7() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: movl $3, %edi
 ; AVX-NEXT: callq __powidf2@PLT
 ; AVX-NEXT: popq %rax
@@ -423,7 +423,7 @@ define double @f8() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: calll sin
 ; X86-SSE-NEXT: addl $12, %esp
@@ -434,7 +434,7 @@ define double @f8() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: callq sin@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
@@ -444,7 +444,7 @@ define double @f8() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq sin@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -474,7 +474,7 @@ define double @f9() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: calll cos
 ; X86-SSE-NEXT: addl $12, %esp
@@ -485,7 +485,7 @@ define double @f9() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: callq cos@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
@@ -495,7 +495,7 @@ define double @f9() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq cos@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -525,7 +525,7 @@ define double @f10() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: calll exp
 ; X86-SSE-NEXT: addl $12, %esp
@@ -536,7 +536,7 @@ define double @f10() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: callq exp@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
@@ -546,7 +546,7 @@ define double @f10() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq exp@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -576,7 +576,7 @@ define double @f11() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: calll exp2
 ; X86-SSE-NEXT: addl $12, %esp
@@ -587,7 +587,7 @@ define double @f11() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; SSE-NEXT: callq exp2@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
@@ -597,7 +597,7 @@ define double @f11() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq exp2@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -627,7 +627,7 @@ define double @f12() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: calll log
 ; X86-SSE-NEXT: addl $12, %esp
@@ -638,7 +638,7 @@ define double @f12() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: callq log@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
@@ -648,7 +648,7 @@ define double @f12() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -678,7 +678,7 @@ define double @f13() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: calll log10
 ; X86-SSE-NEXT: addl $12, %esp
@@ -689,7 +689,7 @@ define double @f13() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: callq log10@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
@@ -699,7 +699,7 @@ define double @f13() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log10@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -729,7 +729,7 @@ define double @f14() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: calll log2
 ; X86-SSE-NEXT: addl $12, %esp
@@ -740,7 +740,7 @@ define double @f14() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: callq log2@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
@@ -750,7 +750,7 @@ define double @f14() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log2@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -780,7 +780,7 @@ define double @f15() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: calll rint
 ; X86-SSE-NEXT: addl $12, %esp
@@ -791,7 +791,7 @@ define double @f15() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; SSE-NEXT: callq rint@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
@@ -799,7 +799,7 @@ define double @f15() #0 {
 ;
 ; AVX-LABEL: f15:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry:
@@ -828,7 +828,7 @@ define double @f16() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $12, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 16
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: calll nearbyint
 ; X86-SSE-NEXT: addl $12, %esp
@@ -839,7 +839,7 @@ define double @f16() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; SSE-NEXT: callq nearbyint@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
@@ -847,7 +847,7 @@ define double @f16() #0 {
 ;
 ; AVX-LABEL: f16:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry:
@@ -877,9 +877,9 @@ define double @f19() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: subl $28, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 32
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [1.0E+1,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp)
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp)
 ; X86-SSE-NEXT: calll fmod
 ; X86-SSE-NEXT: addl $28, %esp
@@ -890,8 +890,8 @@ define double @f19() #0 {
 ; SSE: # %bb.0: # %entry
 ; SSE-NEXT: pushq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 16
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0]
 ; SSE-NEXT: callq fmod@PLT
 ; SSE-NEXT: popq %rax
 ; SSE-NEXT: .cfi_def_cfa_offset 8
@@ -901,8 +901,8 @@ define double @f19() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0]
 ; AVX-NEXT: callq fmod@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -1301,7 +1301,7 @@ define i32 @f20u(double %x) #0 {
 ; X86-SSE-LABEL: f20u:
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = [2.147483648E+9,0.0E+0]
 ; X86-SSE-NEXT: comisd %xmm0, %xmm2
 ; X86-SSE-NEXT: xorpd %xmm1, %xmm1
 ; X86-SSE-NEXT: ja .LBB24_2
@@ -1378,7 +1378,7 @@ define i64 @f20u64(double %x) #0 {
 ; X86-SSE-NEXT: subl $20, %esp
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 24
 ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; X86-SSE-NEXT: comisd %xmm0, %xmm1
 ; X86-SSE-NEXT: jbe .LBB25_2
 ; X86-SSE-NEXT: # %bb.1: # %entry
@@ -1406,7 +1406,7 @@ define i64 @f20u64(double %x) #0 {
 ;
 ; SSE-LABEL: f20u64:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0]
 ; SSE-NEXT: comisd %xmm2, %xmm0
 ; SSE-NEXT: xorpd %xmm1, %xmm1
 ; SSE-NEXT: jb .LBB25_2
@@ -1423,7 +1423,7 @@ define i64 @f20u64(double %x) #0 {
 ;
 ; AVX1-LABEL: f20u64:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX1-NEXT: vcomisd %xmm1, %xmm0
 ; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT: jb .LBB25_2
@@ -1537,7 +1537,7 @@ define float @f21() #0 {
 ; X86-SSE: # %bb.0: # %entry
 ; X86-SSE-NEXT: pushl %eax
 ; X86-SSE-NEXT: .cfi_def_cfa_offset 8
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; X86-SSE-NEXT: cvtsd2ss %xmm0, %xmm0
 ; X86-SSE-NEXT: movss %xmm0, (%esp)
 ; X86-SSE-NEXT: flds (%esp)
@@ -1548,13 +1548,13 @@ define float @f21() #0 {
 ;
 ; SSE-LABEL: f21:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; SSE-NEXT: cvtsd2ss %xmm0, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: f21:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry:
diff --git a/llvm/test/CodeGen/X86/fp-logic.ll b/llvm/test/CodeGen/X86/fp-logic.ll
index 7fef4269f6565..522a1589caf09 100644
--- a/llvm/test/CodeGen/X86/fp-logic.ll
+++ b/llvm/test/CodeGen/X86/fp-logic.ll
@@ -99,7 +99,7 @@ define float @f6(float %x, i32 %y) {
 define float @f7(float %x) {
 ; CHECK-LABEL: f7:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.20389539E-45,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: andps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %bc1 = bitcast float %x to i32
@@ -113,7 +113,7 @@ define float @f7(float %x) {
 define float @f8(float %x) {
 ; CHECK-LABEL: f8:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [5.60519386E-45,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: andps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %bc1 = bitcast float %x to i32
@@ -177,7 +177,7 @@ define float @xor(float %x, float %y) {
 define float @f7_or(float %x) {
 ; CHECK-LABEL: f7_or:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.20389539E-45,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: orps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %bc1 = bitcast float %x to i32
@@ -189,7 +189,7 @@ define float @f7_or(float %x) {
 define float @f7_xor(float %x) {
 ; CHECK-LABEL: f7_xor:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.20389539E-45,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: xorps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %bc1 = bitcast float %x to i32
@@ -215,7 +215,7 @@ define double @doubles(double %x, double %y) {
 define double @f7_double(double %x) {
 ; CHECK-LABEL: f7_double:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.4821969375237396E-323,0.0E+0]
 ; CHECK-NEXT: andps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %bc1 = bitcast double %x to i64
@@ -231,7 +231,7 @@ define double @f7_double(double %x) {
 define float @movmsk(float %x) {
 ; CHECK-LABEL: movmsk:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [-0.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: andps %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %bc1 = bitcast float %x to i32
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
index fac14d8f14e8a..bd3cb377bfce6 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint-fp16.ll
@@ -347,7 +347,7 @@ define i64 @fptoui_f16toi64(half %x) #0 {
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: pushq %rax
 ; SSE2-NEXT: callq __extendhfsf2@PLT
-; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE2-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE2-NEXT: comiss %xmm2, %xmm0
 ; SSE2-NEXT: xorps %xmm1, %xmm1
 ; SSE2-NEXT: jb .LBB9_2
@@ -369,7 +369,7 @@ define i64 @fptoui_f16toi64(half %x) #0 {
 ; F16C-NEXT: movzwl %ax, %eax
 ; F16C-NEXT: vmovd %eax, %xmm0
 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0
-; F16C-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; F16C-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; F16C-NEXT: vcomiss %xmm1, %xmm0
 ; F16C-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; F16C-NEXT: jb .LBB9_2
diff --git a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
index 25a946465ff3f..ecdc507a882c3 100644
--- a/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
+++ b/llvm/test/CodeGen/X86/fp-strict-scalar-fptoint.ll
@@ -447,7 +447,7 @@ define i32 @fptoui_f32toi32(float %x) #0 {
 ; SSE-X86-LABEL: fptoui_f32toi32:
 ; SSE-X86: # %bb.0:
 ; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-X86-NEXT: movss {{.*#+}} xmm2 = [2.14748365E+9,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-X86-NEXT: comiss %xmm0, %xmm2
 ; SSE-X86-NEXT: xorps %xmm1, %xmm1
 ; SSE-X86-NEXT: ja .LBB8_2
@@ -543,7 +543,7 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; SSE-X86-NEXT: andl $-8, %esp
 ; SSE-X86-NEXT: subl $16, %esp
 ; SSE-X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE-X86-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-X86-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-X86-NEXT: comiss %xmm0, %xmm1
 ; SSE-X86-NEXT: jbe .LBB9_2
 ; SSE-X86-NEXT: # %bb.1:
@@ -572,7 +572,7 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ;
 ; SSE-X64-LABEL: fptoui_f32toi64:
 ; SSE-X64: # %bb.0:
-; SSE-X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-X64-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-X64-NEXT: comiss %xmm2, %xmm0
 ; SSE-X64-NEXT: xorps %xmm1, %xmm1
 ; SSE-X64-NEXT: jb .LBB9_2
@@ -597,7 +597,7 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; AVX1-X86-NEXT: andl $-8, %esp
 ; AVX1-X86-NEXT: subl $8, %esp
 ; AVX1-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-X86-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX1-X86-NEXT: vcomiss %xmm0, %xmm1
 ; AVX1-X86-NEXT: jbe .LBB9_2
 ; AVX1-X86-NEXT: # %bb.1:
@@ -620,7 +620,7 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ;
 ; AVX1-X64-LABEL: fptoui_f32toi64:
 ; AVX1-X64: # %bb.0:
-; AVX1-X64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-X64-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX1-X64-NEXT: vcomiss %xmm1, %xmm0
 ; AVX1-X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; AVX1-X64-NEXT: jb .LBB9_2
@@ -645,7 +645,7 @@ define i64 @fptoui_f32toi64(float %x) #0 {
 ; AVX512-X86-NEXT: andl $-8, %esp
 ; AVX512-X86-NEXT: subl $8, %esp
 ; AVX512-X86-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX512-X86-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-X86-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512-X86-NEXT: xorl %edx, %edx
 ; AVX512-X86-NEXT: vcomiss %xmm0, %xmm1
 ; AVX512-X86-NEXT: setbe %dl
@@ -1087,7 +1087,7 @@ define i32 @fptoui_f64toi32(double %x) #0 {
 ; SSE-X86-LABEL: fptoui_f64toi32:
 ; SSE-X86: # %bb.0:
 ; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-X86-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE-X86-NEXT: movsd {{.*#+}} xmm2 = [2.147483648E+9,0.0E+0]
 ; SSE-X86-NEXT: comisd %xmm0, %xmm2
 ; SSE-X86-NEXT: xorpd %xmm1, %xmm1
 ; SSE-X86-NEXT: ja .LBB17_2
@@ -1183,7 +1183,7 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; SSE-X86-NEXT: andl $-8, %esp
 ; SSE-X86-NEXT: subl $16, %esp
 ; SSE-X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; SSE-X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-X86-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; SSE-X86-NEXT: comisd %xmm0, %xmm1
 ; SSE-X86-NEXT: jbe .LBB18_2
 ; SSE-X86-NEXT: # %bb.1:
@@ -1212,7 +1212,7 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ;
 ; SSE-X64-LABEL: fptoui_f64toi64:
 ; SSE-X64: # %bb.0:
-; SSE-X64-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE-X64-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0]
 ; SSE-X64-NEXT: comisd %xmm2, %xmm0
 ; SSE-X64-NEXT: xorpd %xmm1, %xmm1
 ; SSE-X64-NEXT: jb .LBB18_2
@@ -1237,7 +1237,7 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; AVX1-X86-NEXT: andl $-8, %esp
 ; AVX1-X86-NEXT: subl $8, %esp
 ; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-X86-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX1-X86-NEXT: vcomisd %xmm0, %xmm1
 ; AVX1-X86-NEXT: jbe .LBB18_2
 ; AVX1-X86-NEXT: # %bb.1:
@@ -1260,7 +1260,7 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ;
 ; AVX1-X64-LABEL: fptoui_f64toi64:
 ; AVX1-X64: # %bb.0:
-; AVX1-X64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-X64-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX1-X64-NEXT: vcomisd %xmm1, %xmm0
 ; AVX1-X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; AVX1-X64-NEXT: jb .LBB18_2
@@ -1285,7 +1285,7 @@ define i64 @fptoui_f64toi64(double %x) #0 {
 ; AVX512-X86-NEXT: andl $-8, %esp
 ; AVX512-X86-NEXT: subl $8, %esp
 ; AVX512-X86-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX512-X86-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512-X86-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX512-X86-NEXT: xorl %edx, %edx
 ; AVX512-X86-NEXT: vcomisd %xmm0, %xmm1
 ; AVX512-X86-NEXT: setbe %dl
diff --git a/llvm/test/CodeGen/X86/fp-undef.ll b/llvm/test/CodeGen/X86/fp-undef.ll
index 2ae51c6c97e9b..227f0073e103b 100644
--- a/llvm/test/CodeGen/X86/fp-undef.ll
+++ b/llvm/test/CodeGen/X86/fp-undef.ll
@@ -8,7 +8,7 @@
 define float @fadd_undef_op0(float %x) {
 ; ANY-LABEL: fadd_undef_op0:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fadd float undef, %x
 ret float %r
@@ -17,7 +17,7 @@ define float @fadd_undef_op0(float %x) {
 define float @fadd_undef_op1(float %x) {
 ; ANY-LABEL: fadd_undef_op1:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fadd float %x, undef
 ret float %r
@@ -26,7 +26,7 @@ define float @fadd_undef_op1(float %x) {
 define float @fsub_undef_op0(float %x) {
 ; ANY-LABEL: fsub_undef_op0:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fsub float undef, %x
 ret float %r
@@ -35,7 +35,7 @@ define float @fsub_undef_op0(float %x) {
 define float @fsub_undef_op1(float %x) {
 ; ANY-LABEL: fsub_undef_op1:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fsub float %x, undef
 ret float %r
@@ -44,7 +44,7 @@ define float @fsub_undef_op1(float %x) {
 define float @fmul_undef_op0(float %x) {
 ; ANY-LABEL: fmul_undef_op0:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fmul float undef, %x
 ret float %r
@@ -53,7 +53,7 @@ define float @fmul_undef_op0(float %x) {
 define float @fmul_undef_op1(float %x) {
 ; ANY-LABEL: fmul_undef_op1:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fmul float %x, undef
 ret float %r
@@ -62,7 +62,7 @@ define float @fmul_undef_op1(float %x) {
 define float @fdiv_undef_op0(float %x) {
 ; ANY-LABEL: fdiv_undef_op0:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fdiv float undef, %x
 ret float %r
@@ -71,7 +71,7 @@ define float @fdiv_undef_op0(float %x) {
 define float @fdiv_undef_op1(float %x) {
 ; ANY-LABEL: fdiv_undef_op1:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fdiv float %x, undef
 ret float %r
@@ -80,7 +80,7 @@ define float @fdiv_undef_op1(float %x) {
 define float @frem_undef_op0(float %x) {
 ; ANY-LABEL: frem_undef_op0:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = frem float undef, %x
 ret float %r
@@ -89,7 +89,7 @@ define float @frem_undef_op0(float %x) {
 define float @frem_undef_op1(float %x) {
 ; ANY-LABEL: frem_undef_op1:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = frem float %x, undef
 ret float %r
@@ -232,7 +232,7 @@ define float @fadd_undef_op0_nnan_constant(float %x) {
 define float @fadd_undef_op1_constant(float %x) {
 ; ANY-LABEL: fadd_undef_op1_constant:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fadd float 2.0, undef
 ret float %r
@@ -249,7 +249,7 @@ define float @fsub_undef_op0_fast_constant(float %x) {
 define float @fsub_undef_op1_constant(float %x) {
 ; ANY-LABEL: fsub_undef_op1_constant:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fsub float 4.0, undef
 ret float %r
@@ -266,7 +266,7 @@ define float @fmul_undef_op0_nnan_constant(float %x) {
 define float @fmul_undef_op1_constant(float %x) {
 ; ANY-LABEL: fmul_undef_op1_constant:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fmul float 6.0, undef
 ret float %r
@@ -283,7 +283,7 @@ define float @fdiv_undef_op0_fast_constant(float %x) {
 define float @fdiv_undef_op1_constant(float %x) {
 ; ANY-LABEL: fdiv_undef_op1_constant:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = fdiv float 8.0, undef
 ret float %r
@@ -300,7 +300,7 @@ define float @frem_undef_op0_nnan_constant(float %x) {
 define float @frem_undef_op1_constant(float %x) {
 ; ANY-LABEL: frem_undef_op1_constant:
 ; ANY: # %bb.0:
-; ANY-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; ANY-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0]
 ; ANY-NEXT: retq
 %r = frem float 10.0, undef
 ret float %r
@@ -311,7 +311,7 @@ define float @frem_undef_op1_constant(float %x) {
 define double @fadd_undef_op0_constant_nan(double %x) {
 ; ANY-LABEL: fadd_undef_op0_constant_nan:
 ; ANY: # %bb.0:
-; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
 ; ANY-NEXT: retq
 %r = fadd double undef, 0x7FF8000000000000
 ret double %r
@@ -328,7 +328,7 @@ define double @fadd_undef_op1_fast_constant_nan(double %x) {
 define double @fsub_undef_op0_constant_nan(double %x) {
 ; ANY-LABEL: fsub_undef_op0_constant_nan:
 ; ANY: # %bb.0:
-; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
 ; ANY-NEXT: retq
 %r = fsub double undef, 0xFFF8000000000010
 ret double %r
@@ -345,7 +345,7 @@ define double @fsub_undef_op1_nnan_constant_nan(double %x) {
 define double @fmul_undef_op0_constant_nan(double %x) {
 ; ANY-LABEL: fmul_undef_op0_constant_nan:
 ; ANY: # %bb.0:
-; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
 ; ANY-NEXT: retq
 %r = fmul double undef, 0x7FF8000000000100
 ret double %r
@@ -362,7 +362,7 @@ define double @fmul_undef_op1_fast_constant_nan(double %x) {
 define double @fdiv_undef_op0_constant_nan(double %x) {
 ; ANY-LABEL: fdiv_undef_op0_constant_nan:
 ; ANY: # %bb.0:
-; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
 ; ANY-NEXT: retq
 %r = fdiv double undef, 0xFFF8000000000110
 ret double %r
@@ -379,7 +379,7 @@ define double @fdiv_undef_op1_nnan_constant_nan(double %x) {
 define double @frem_undef_op0_constant_nan(double %x) {
 ; ANY-LABEL: frem_undef_op0_constant_nan:
 ; ANY: # %bb.0:
-; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
 ; ANY-NEXT: retq
 %r = frem double undef, 0x7FF8000000001000
 ret double %r
@@ -398,7 +398,7 @@ define double @frem_undef_op1_fast_constant_nan(double %x) {
 define double @fadd_undef_op0_constant_inf(double %x) {
 ; ANY-LABEL: fadd_undef_op0_constant_inf:
 ; ANY: # %bb.0:
-; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
 ; ANY-NEXT: retq
 %r = fadd double undef, 0x7FF0000000000000
 ret double %r
@@ -415,7 +415,7 @@ define double @fadd_undef_op1_fast_constant_inf(double %x) {
 define double @fsub_undef_op0_constant_inf(double %x) {
 ; ANY-LABEL: fsub_undef_op0_constant_inf:
 ; ANY: # %bb.0:
-; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
 ; ANY-NEXT: retq
 %r = fsub double undef, 0xFFF0000000000000
 ret double %r
@@ -432,7 +432,7 @@ define double @fsub_undef_op1_ninf_constant_inf(double %x) {
 define double @fmul_undef_op0_constant_inf(double %x) {
 ; ANY-LABEL: fmul_undef_op0_constant_inf:
 ; ANY: # %bb.0:
-; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
 ; ANY-NEXT: retq
 %r = fmul double undef, 0x7FF0000000000000
 ret double %r
@@ -449,7 +449,7 @@ define double @fmul_undef_op1_fast_constant_inf(double %x) {
 define double @fdiv_undef_op0_constant_inf(double %x) {
 ; ANY-LABEL: fdiv_undef_op0_constant_inf:
 ; ANY: # %bb.0:
-; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
 ; ANY-NEXT: retq
 %r = fdiv double undef, 0xFFF0000000000000
 ret double %r
@@ -466,7 +466,7 @@ define double @fdiv_undef_op1_ninf_constant_inf(double %x) {
 define double @frem_undef_op0_constant_inf(double %x) {
 ; ANY-LABEL: frem_undef_op0_constant_inf:
 ; ANY: # %bb.0:
-; ANY-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; ANY-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0]
 ; ANY-NEXT: retq
 %r = frem double undef, 0x7FF0000000000000
 ret double %r
diff --git a/llvm/test/CodeGen/X86/fpclamptosat.ll b/llvm/test/CodeGen/X86/fpclamptosat.ll
index 2564e7f974cd8..3f5ec7b530fe0 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat.ll
@@ -217,9 +217,9 @@ entry:
 define i16 @stest_f64i16(double %x) nounwind {
 ; CHECK-LABEL: stest_f64i16:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-3.2768E+4,0.0E+0]
 ; CHECK-NEXT: maxsd %xmm0, %xmm1
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [3.2767E+4,0.0E+0]
 ; CHECK-NEXT: minsd %xmm1, %xmm0
 ; CHECK-NEXT: cvttsd2si %xmm0, %eax
 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
@@ -276,9 +276,9 @@ entry:
 define i16 @stest_f32i16(float %x) nounwind {
 ; CHECK-LABEL: stest_f32i16:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [-3.2768E+4,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: maxss %xmm0, %xmm1
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [3.2767E+4,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: minss %xmm1, %xmm0
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
@@ -815,9 +815,9 @@ entry:
 define i16 @stest_f64i16_mm(double %x) nounwind {
 ; CHECK-LABEL: stest_f64i16_mm:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-3.2768E+4,0.0E+0]
 ; CHECK-NEXT: maxsd %xmm0, %xmm1
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [3.2767E+4,0.0E+0]
 ; CHECK-NEXT: minsd %xmm1, %xmm0
 ; CHECK-NEXT: cvttsd2si %xmm0, %eax
 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
@@ -869,9 +869,9 @@ entry:
 define i16 @stest_f32i16_mm(float %x) nounwind {
 ; CHECK-LABEL: stest_f32i16_mm:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [-3.2768E+4,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: maxss %xmm0, %xmm1
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [3.2767E+4,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: minss %xmm1, %xmm0
 ; CHECK-NEXT: cvttss2si %xmm0, %eax
 ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 78ccc983d1637..42d6e8139c6aa 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -79,7 +79,7 @@ entry:
 define <2 x i32> @utest_f64i32(<2 x double> %x) nounwind {
 ; SSE-LABEL: utest_f64i32:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; SSE-NEXT: movapd %xmm0, %xmm2
 ; SSE-NEXT: subsd %xmm1, %xmm2
 ; SSE-NEXT: cvttsd2si %xmm2, %rax
@@ -115,7 +115,7 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) nounwind {
 ;
 ; AVX2-LABEL: utest_f64i32:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vcvttsd2si %xmm2, %rax
 ; AVX2-NEXT: vcvttsd2si %xmm0, %rcx
@@ -348,7 +348,7 @@ entry:
 define <4 x i32> @utest_f32i32(<4 x float> %x) nounwind {
 ; SSE-LABEL: utest_f32i32:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: subss %xmm2, %xmm1
 ; SSE-NEXT: cvttss2si %xmm1, %rax
@@ -419,7 +419,7 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) nounwind {
 ; AVX2-LABEL: utest_f32i32:
 ; AVX2: # %bb.0: # %entry
 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT: vcvttss2si %xmm3, %rax
 ; AVX2-NEXT: vcvttss2si %xmm2, %rcx
@@ -854,7 +854,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
 ; AVX2-NEXT: movzwl %ax, %eax
 ; AVX2-NEXT: vmovd %eax, %xmm1
 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm2
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT: vcvttss2si %xmm3, %rax
 ; AVX2-NEXT: vcvttss2si %xmm2, %rcx
@@ -2737,7 +2737,7 @@ entry:
 define <2 x i32> @utest_f64i32_mm(<2 x double> %x) nounwind {
 ; SSE-LABEL: utest_f64i32_mm:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; SSE-NEXT: movapd %xmm0, %xmm2
 ; SSE-NEXT: subsd %xmm1, %xmm2
 ; SSE-NEXT: cvttsd2si %xmm2, %rax
@@ -2773,7 +2773,7 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) nounwind {
 ;
 ; AVX2-LABEL: utest_f64i32_mm:
 ; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX2-NEXT: vsubsd %xmm1, %xmm0, %xmm2
 ; AVX2-NEXT: vcvttsd2si %xmm2, %rax
 ; AVX2-NEXT: vcvttsd2si %xmm0, %rcx
@@ -3001,7 +3001,7 @@ entry:
 define <4 x i32> @utest_f32i32_mm(<4 x float> %x) nounwind {
 ; SSE-LABEL: utest_f32i32_mm:
 ; SSE: # %bb.0: # %entry
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: subss %xmm2, %xmm1
 ; SSE-NEXT: cvttss2si %xmm1, %rax
@@ -3072,7 +3072,7 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) nounwind {
 ; AVX2-LABEL: utest_f32i32_mm:
 ; AVX2: # %bb.0: # %entry
 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT: vcvttss2si %xmm3, %rax
 ; AVX2-NEXT: vcvttss2si %xmm2, %rcx
@@ -3502,7 +3502,7 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
 ; AVX2-NEXT: movzwl %ax, %eax
 ; AVX2-NEXT: vmovd %eax, %xmm1
 ; AVX2-NEXT: vcvtph2ps %xmm1, %xmm2
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT: vcvttss2si %xmm3, %rax
 ; AVX2-NEXT: vcvttss2si %xmm2, %rcx
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
index 2eb351c8fac9e..04fce7badb951 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
@@ -72,7 +72,7 @@ define i1 @test_signed_i1_f32(float %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i1_f32:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT: xorps %xmm1, %xmm1
 ; X86-SSE-NEXT: minss %xmm0, %xmm1
@@ -82,7 +82,7 @@ define i1 @test_signed_i1_f32(float %f) nounwind {
 ;
 ; X64-LABEL: test_signed_i1_f32:
 ; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm1 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X64-NEXT: maxss %xmm0, %xmm1
 ; X64-NEXT: xorps %xmm0, %xmm0
 ; X64-NEXT: minss %xmm1, %xmm0
@@ -143,9 +143,9 @@ define i8 @test_signed_i8_f32(float %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i8_f32:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [-1.28E+2,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = [1.27E+2,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT: minss %xmm0, %xmm1
 ; X86-SSE-NEXT: cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT: # kill: def $al killed $al killed $eax
@@ -153,9 +153,9 @@ define i8 @test_signed_i8_f32(float %f) nounwind {
 ;
 ; X64-LABEL: test_signed_i8_f32:
 ; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm1 = [-1.28E+2,0.0E+0,0.0E+0,0.0E+0]
 ; X64-NEXT: maxss %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = [1.27E+2,0.0E+0,0.0E+0,0.0E+0]
 ; X64-NEXT: minss %xmm1, %xmm0
 ; X64-NEXT: cvttss2si %xmm0, %eax
 ; X64-NEXT: # kill: def $al killed $al killed $eax
@@ -215,9 +215,9 @@ define i13 @test_signed_i13_f32(float %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i13_f32:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [-4.096E+3,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = [4.095E+3,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT: minss %xmm0, %xmm1
 ; X86-SSE-NEXT: cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax
@@ -225,9 +225,9 @@ define i13 @test_signed_i13_f32(float %f) nounwind {
 ;
 ; X64-LABEL: test_signed_i13_f32:
 ; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm1 = [-4.096E+3,0.0E+0,0.0E+0,0.0E+0]
 ; X64-NEXT: maxss %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = [4.095E+3,0.0E+0,0.0E+0,0.0E+0]
 ; X64-NEXT: minss %xmm1, %xmm0
 ; X64-NEXT: cvttss2si %xmm0, %eax
 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
@@ -287,9 +287,9 @@ define i16 @test_signed_i16_f32(float %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i16_f32:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [-3.2768E+4,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = [3.2767E+4,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT: minss %xmm0, %xmm1
 ; X86-SSE-NEXT: cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax
@@ -297,9 +297,9 @@ define i16 @test_signed_i16_f32(float %f) nounwind {
 ;
 ; X64-LABEL: test_signed_i16_f32:
 ; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm1 = [-3.2768E+4,0.0E+0,0.0E+0,0.0E+0]
 ; X64-NEXT: maxss %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = [3.2767E+4,0.0E+0,0.0E+0,0.0E+0]
 ; X64-NEXT: minss %xmm1, %xmm0
 ; X64-NEXT: cvttss2si %xmm0, %eax
 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1092,7 +1092,7 @@ define i1 @test_signed_i1_f64(double %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i1_f64:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [-1.0E+0,0.0E+0]
 ; X86-SSE-NEXT: maxsd {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT: xorpd %xmm1, %xmm1
 ; X86-SSE-NEXT: minsd %xmm0, %xmm1
@@ -1102,7 +1102,7 @@ define i1 @test_signed_i1_f64(double %f) nounwind {
 ;
 ; X64-LABEL: test_signed_i1_f64:
 ; X64: # %bb.0:
-; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: movsd {{.*#+}} xmm1 = [-1.0E+0,0.0E+0]
 ; X64-NEXT: maxsd %xmm0, %xmm1
 ; X64-NEXT: xorpd %xmm0, %xmm0
 ; X64-NEXT: minsd %xmm1, %xmm0
@@ -1163,9 +1163,9 @@ define i8 @test_signed_i8_f64(double %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i8_f64:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [-1.28E+2,0.0E+0]
 ; X86-SSE-NEXT: maxsd {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = [1.27E+2,0.0E+0]
 ; X86-SSE-NEXT: minsd %xmm0, %xmm1
 ; X86-SSE-NEXT: cvttsd2si %xmm1, %eax
 ; X86-SSE-NEXT: # kill: def $al killed $al killed $eax
@@ -1173,9 +1173,9 @@ define i8 @test_signed_i8_f64(double %f) nounwind {
 ;
 ; X64-LABEL: test_signed_i8_f64:
 ; X64: # %bb.0:
-; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: movsd {{.*#+}} xmm1 = [-1.28E+2,0.0E+0]
 ; X64-NEXT: maxsd %xmm0, %xmm1
-; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movsd {{.*#+}} xmm0 = [1.27E+2,0.0E+0]
 ; X64-NEXT: minsd %xmm1, %xmm0
 ; X64-NEXT: cvttsd2si %xmm0, %eax
 ; X64-NEXT: # kill: def $al killed $al killed $eax
@@ -1235,9 +1235,9 @@ define i13 @test_signed_i13_f64(double %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i13_f64:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [-4.096E+3,0.0E+0]
 ; X86-SSE-NEXT: maxsd {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = [4.095E+3,0.0E+0]
 ; X86-SSE-NEXT: minsd %xmm0, %xmm1
 ; X86-SSE-NEXT: cvttsd2si %xmm1, %eax
 ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1245,9 +1245,9 @@ define i13 @test_signed_i13_f64(double %f) nounwind {
 ;
 ; X64-LABEL: test_signed_i13_f64:
 ; X64: # %bb.0:
-; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: movsd {{.*#+}} xmm1 = [-4.096E+3,0.0E+0]
 ; X64-NEXT: maxsd %xmm0, %xmm1
-; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movsd {{.*#+}} xmm0 = [4.095E+3,0.0E+0]
 ; X64-NEXT: minsd %xmm1, %xmm0
 ; X64-NEXT: cvttsd2si %xmm0, %eax
 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1307,9 +1307,9 @@ define i16 @test_signed_i16_f64(double %f) nounwind {
 ;
 ; X86-SSE-LABEL: test_signed_i16_f64:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [-3.2768E+4,0.0E+0]
 ; X86-SSE-NEXT: maxsd {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = [3.2767E+4,0.0E+0]
 ; X86-SSE-NEXT: minsd %xmm0, %xmm1
 ; X86-SSE-NEXT: cvttsd2si %xmm1, %eax
 ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax
@@ -1317,9 +1317,9 @@ define i16 @test_signed_i16_f64(double %f) nounwind {
 ;
 ; X64-LABEL: test_signed_i16_f64:
 ; X64: # %bb.0:
-; X64-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X64-NEXT: movsd {{.*#+}} xmm1 = [-3.2768E+4,0.0E+0]
 ; X64-NEXT: maxsd %xmm0, %xmm1
-; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movsd {{.*#+}} xmm0 = [3.2767E+4,0.0E+0]
 ; X64-NEXT: minsd %xmm1, %xmm0
 ; X64-NEXT: cvttsd2si %xmm0, %eax
 ; X64-NEXT: # kill: def $ax killed $ax killed $eax
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
index 2856cfa01fad1..eaa1293ed2f98 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-vector-128.ll
@@ -17,7 +17,7 @@ define <4 x i1> @test_signed_v4i1_v4f32(<4 x float> %f) nounwind {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm2 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: ucomiss %xmm1, %xmm1
 ; CHECK-NEXT: maxss %xmm2, %xmm1
@@ -60,10 +60,10 @@ define <4 x i1> @test_signed_v4i1_v4f32(<4 x float> %f) nounwind {
 define <4 x i8> @test_signed_v4i8_v4f32(<4 x float> %f) nounwind {
 ; CHECK-LABEL: test_signed_v4i8_v4f32:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [-1.28E+2,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: movaps %xmm1, %xmm3
 ; CHECK-NEXT: maxss %xmm0, %xmm3
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm2 = [1.27E+2,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: movaps %xmm2, %xmm4
 ; CHECK-NEXT: minss %xmm3, %xmm4
 ; CHECK-NEXT: cvttss2si %xmm4, %eax
@@ -105,10 +105,10 @@ define <4 x i16> @test_signed_v4i16_v4f32(<4 x float> %f) nounwind {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm2 = [-3.2768E+4,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: movaps %xmm2, %xmm3
 ; CHECK-NEXT: maxss %xmm1, %xmm3
-; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm4 = [3.2767E+4,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: movaps %xmm4, %xmm1
 ; CHECK-NEXT: minss %xmm3, %xmm1
 ; CHECK-NEXT: cvttss2si %xmm1, %eax
@@ -144,7 +144,7 @@ define <4 x i32> @test_signed_v4i32_v4f32(<4 x float> %f) nounwind {
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3]
 ; CHECK-NEXT: cvttss2si %xmm1, %edx
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm2 = [2.14748352E+9,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: ucomiss %xmm2, %xmm1
 ; CHECK-NEXT: movl $2147483647, %eax # imm = 0x7FFFFFFF
 ; CHECK-NEXT: cmoval %eax, %edx
@@ -186,7 +186,7 @@ define <4 x i64> @test_signed_v4i64_v4f32(<4 x float> %f) nounwind {
 ; CHECK-LABEL: test_signed_v4i64_v4f32:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: cvttss2si %xmm0, %rdx
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [9.22337149E+18,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: ucomiss %xmm1, %xmm0
 ; CHECK-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF
 ; CHECK-NEXT: cmovaq %rax, %rdx
@@ -347,7 +347,7 @@ declare <2 x i128> @llvm.fptosi.sat.v2i128.v2f64(<2 x double>)
 define <2 x i1> @test_signed_v2i1_v2f64(<2 x double> %f) nounwind {
 ; CHECK-LABEL: test_signed_v2i1_v2f64:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = [-1.0E+0,0.0E+0]
 ; CHECK-NEXT: movapd %xmm0, %xmm1
 ; CHECK-NEXT: maxsd %xmm2, %xmm1
 ; CHECK-NEXT: xorpd %xmm3, %xmm3
@@ -374,10 +374,10 @@ define <2 x i1> @test_signed_v2i1_v2f64(<2 x double> %f) nounwind {
 define <2 x i8> @test_signed_v2i8_v2f64(<2 x double> %f) nounwind {
 ; CHECK-LABEL: test_signed_v2i8_v2f64:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-1.28E+2,0.0E+0]
 ; CHECK-NEXT: movapd %xmm1, %xmm2
 ; CHECK-NEXT: maxsd %xmm0, %xmm2
-; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm3 = [1.27E+2,0.0E+0]
 ; CHECK-NEXT: movapd %xmm3, %xmm4
 ; CHECK-NEXT: minsd %xmm2, %xmm4
 ; CHECK-NEXT: cvttsd2si %xmm4, %eax
@@ -397,12 +397,12 @@ define <2 x i8> @test_signed_v2i8_v2f64(<2 x double> %f) nounwind {
 define <2 x i16> @test_signed_v2i16_v2f64(<2 x double> %f) nounwind {
 ; CHECK-LABEL: test_signed_v2i16_v2f64:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-3.2768E+4,0.0E+0]
 ; CHECK-NEXT: movapd %xmm1, %xmm2
 ; CHECK-NEXT: maxsd %xmm0, %xmm1
 ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1]
 ; CHECK-NEXT: maxsd %xmm0, %xmm2
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [3.2767E+4,0.0E+0]
 ; CHECK-NEXT: movapd %xmm0, %xmm3
 ; CHECK-NEXT: minsd %xmm2, %xmm3
 ; CHECK-NEXT: cvttsd2si %xmm3, %eax
@@ -418,10 +418,10 @@ define <2 x i16> @test_signed_v2i16_v2f64(<2 x double> %f) nounwind {
 define <2 x i32> @test_signed_v2i32_v2f64(<2 x double> %f) nounwind {
 ; CHECK-LABEL: test_signed_v2i32_v2f64:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = [-2.147483648E+9,0.0E+0]
 ; CHECK-NEXT: movapd %xmm0, %xmm1
 ; CHECK-NEXT: maxsd %xmm2, %xmm1
-; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm3 = [2.147483647E+9,0.0E+0]
 ; CHECK-NEXT: minsd %xmm3, %xmm1
 ; CHECK-NEXT: cvttsd2si %xmm1, %eax
 ; CHECK-NEXT: xorl %ecx, %ecx
@@ -446,7 +446,7 @@ define <2 x i64> @test_signed_v2i64_v2f64(<2 x double> %f) nounwind {
 ; CHECK-LABEL: test_signed_v2i64_v2f64:
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: cvttsd2si %xmm0, %rax
-; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547748E+18,0.0E+0]
 ; CHECK-NEXT: ucomisd %xmm2, %xmm0
 ; CHECK-NEXT: movabsq $9223372036854775807, %rcx # imm = 0x7FFFFFFFFFFFFFFF
 ; CHECK-NEXT: cmovaq %rcx, %rax
diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
index e334af71397ff..fefc92c313511 100644
--- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
@@ -62,7 +62,7 @@ define i1 @test_unsigned_i1_f32(float %f) nounwind {
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: xorps %xmm0, %xmm0
 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT: minss %xmm0, %xmm1
 ; X86-SSE-NEXT: cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT: # kill: def $al killed $al killed $eax
@@ -72,7 +72,7 @@ define i1 @test_unsigned_i1_f32(float %f) nounwind {
 ; X64: # %bb.0:
 ; X64-NEXT: xorps %xmm1, %xmm1
 ; X64-NEXT: maxss %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X64-NEXT: minss %xmm1, %xmm0
 ; X64-NEXT: cvttss2si %xmm0, %eax
 ; X64-NEXT: # kill: def $al killed $al killed $eax
@@ -125,7 +125,7 @@ define i8 @test_unsigned_i8_f32(float %f) nounwind {
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: xorps %xmm0, %xmm0
 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0
-; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = [2.55E+2,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT: minss %xmm0, %xmm1
 ; X86-SSE-NEXT: cvttss2si %xmm1, %eax
 ; X86-SSE-NEXT: # kill: def $al killed $al killed $eax
@@ -135,7 +135,7 @@ define i8 @test_unsigned_i8_f32(float %f) nounwind {
 ; X64: # %bb.0:
 ; X64-NEXT: xorps %xmm1, %xmm1
 ; X64-NEXT: maxss %xmm0, %xmm1
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = [2.55E+2,0.0E+0,0.0E+0,0.0E+0]
 ; X64-NEXT: minss %xmm1, %xmm0
 ; X64-NEXT: cvttss2si %xmm0, %eax
 ; X64-NEXT: # kill: def $al killed $al killed $eax
@@ -187,7 +187,7 @@ define i13 @test_unsigned_i13_f32(float %f) nounwind {
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT:
xorps %xmm0, %xmm0 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = [8.191E+3,0.0E+0,0.0E+0,0.0E+0] ; X86-SSE-NEXT: minss %xmm0, %xmm1 ; X86-SSE-NEXT: cvttss2si %xmm1, %eax ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax @@ -197,7 +197,7 @@ define i13 @test_unsigned_i13_f32(float %f) nounwind { ; X64: # %bb.0: ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: maxss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm0 = [8.191E+3,0.0E+0,0.0E+0,0.0E+0] ; X64-NEXT: minss %xmm1, %xmm0 ; X64-NEXT: cvttss2si %xmm0, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax @@ -249,7 +249,7 @@ define i16 @test_unsigned_i16_f32(float %f) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: xorps %xmm0, %xmm0 ; X86-SSE-NEXT: maxss {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = [6.5535E+4,0.0E+0,0.0E+0,0.0E+0] ; X86-SSE-NEXT: minss %xmm0, %xmm1 ; X86-SSE-NEXT: cvttss2si %xmm1, %eax ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax @@ -259,7 +259,7 @@ define i16 @test_unsigned_i16_f32(float %f) nounwind { ; X64: # %bb.0: ; X64-NEXT: xorps %xmm1, %xmm1 ; X64-NEXT: maxss %xmm0, %xmm1 -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm0 = [6.5535E+4,0.0E+0,0.0E+0,0.0E+0] ; X64-NEXT: minss %xmm1, %xmm0 ; X64-NEXT: cvttss2si %xmm0, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax @@ -474,7 +474,7 @@ define i50 @test_unsigned_i50_f32(float %f) nounwind { ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $16, %esp ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; X86-SSE-NEXT: ucomiss %xmm0, %xmm2 ; X86-SSE-NEXT: xorps %xmm1, %xmm1 ; X86-SSE-NEXT: jbe .LBB6_2 @@ -598,7 +598,7 @@ define i64 @test_unsigned_i64_f32(float %f) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $20, %esp ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; X86-SSE-NEXT: ucomiss %xmm0, %xmm2 ; X86-SSE-NEXT: xorps %xmm1, %xmm1 ; X86-SSE-NEXT: jbe .LBB7_2 @@ -997,7 +997,7 @@ define i1 @test_unsigned_i1_f64(double %f) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: xorpd %xmm0, %xmm0 ; X86-SSE-NEXT: maxsd {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] ; X86-SSE-NEXT: minsd %xmm0, %xmm1 ; X86-SSE-NEXT: cvttsd2si %xmm1, %eax ; X86-SSE-NEXT: # kill: def $al killed $al killed $eax @@ -1007,7 +1007,7 @@ define i1 @test_unsigned_i1_f64(double %f) nounwind { ; X64: # %bb.0: ; X64-NEXT: xorpd %xmm1, %xmm1 ; X64-NEXT: maxsd %xmm0, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; X64-NEXT: minsd %xmm1, %xmm0 ; X64-NEXT: cvttsd2si %xmm0, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax @@ -1060,7 +1060,7 @@ define i8 @test_unsigned_i8_f64(double %f) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: xorpd %xmm0, %xmm0 ; X86-SSE-NEXT: maxsd {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = [2.55E+2,0.0E+0] ; X86-SSE-NEXT: minsd %xmm0, %xmm1 ; X86-SSE-NEXT: cvttsd2si %xmm1, %eax ; X86-SSE-NEXT: 
# kill: def $al killed $al killed $eax @@ -1070,7 +1070,7 @@ define i8 @test_unsigned_i8_f64(double %f) nounwind { ; X64: # %bb.0: ; X64-NEXT: xorpd %xmm1, %xmm1 ; X64-NEXT: maxsd %xmm0, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm0 = [2.55E+2,0.0E+0] ; X64-NEXT: minsd %xmm1, %xmm0 ; X64-NEXT: cvttsd2si %xmm0, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax @@ -1122,7 +1122,7 @@ define i13 @test_unsigned_i13_f64(double %f) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: xorpd %xmm0, %xmm0 ; X86-SSE-NEXT: maxsd {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = [8.191E+3,0.0E+0] ; X86-SSE-NEXT: minsd %xmm0, %xmm1 ; X86-SSE-NEXT: cvttsd2si %xmm1, %eax ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax @@ -1132,7 +1132,7 @@ define i13 @test_unsigned_i13_f64(double %f) nounwind { ; X64: # %bb.0: ; X64-NEXT: xorpd %xmm1, %xmm1 ; X64-NEXT: maxsd %xmm0, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm0 = [8.191E+3,0.0E+0] ; X64-NEXT: minsd %xmm1, %xmm0 ; X64-NEXT: cvttsd2si %xmm0, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax @@ -1184,7 +1184,7 @@ define i16 @test_unsigned_i16_f64(double %f) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: xorpd %xmm0, %xmm0 ; X86-SSE-NEXT: maxsd {{[0-9]+}}(%esp), %xmm0 -; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = [6.5535E+4,0.0E+0] ; X86-SSE-NEXT: minsd %xmm0, %xmm1 ; X86-SSE-NEXT: cvttsd2si %xmm1, %eax ; X86-SSE-NEXT: # kill: def $ax killed $ax killed $eax @@ -1194,7 +1194,7 @@ define i16 @test_unsigned_i16_f64(double %f) nounwind { ; X64: # %bb.0: ; X64-NEXT: xorpd %xmm1, %xmm1 ; X64-NEXT: maxsd %xmm0, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm0 = [6.5535E+4,0.0E+0] ; X64-NEXT: minsd %xmm1, %xmm0 ; X64-NEXT: cvttsd2si %xmm0, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax @@ -1320,7 +1320,7 @@ define i32 @test_unsigned_i32_f64(double %f) nounwind { ; X64: # %bb.0: ; X64-NEXT: xorpd %xmm1, %xmm1 ; X64-NEXT: maxsd %xmm0, %xmm1 -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm0 = [4.294967295E+9,0.0E+0] ; X64-NEXT: minsd %xmm1, %xmm0 ; X64-NEXT: cvttsd2si %xmm0, %rax ; X64-NEXT: # kill: def $eax killed $eax killed $rax @@ -1402,7 +1402,7 @@ define i50 @test_unsigned_i50_f64(double %f) nounwind { ; X86-SSE-NEXT: pushl %esi ; X86-SSE-NEXT: subl $16, %esp ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0] ; X86-SSE-NEXT: ucomisd %xmm0, %xmm2 ; X86-SSE-NEXT: xorpd %xmm1, %xmm1 ; X86-SSE-NEXT: jbe .LBB16_2 @@ -1522,7 +1522,7 @@ define i64 @test_unsigned_i64_f64(double %f) nounwind { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: subl $20, %esp ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0] ; X86-SSE-NEXT: ucomisd %xmm0, %xmm2 ; X86-SSE-NEXT: xorpd %xmm1, %xmm1 ; X86-SSE-NEXT: jbe .LBB17_2 @@ -2455,7 +2455,7 @@ define i50 @test_unsigned_i50_f16(half %f) nounwind { ; X86-SSE-NEXT: calll __extendhfsf2 ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; X86-SSE-NEXT: ucomiss %xmm2, 
%xmm0 ; X86-SSE-NEXT: xorps %xmm1, %xmm1 ; X86-SSE-NEXT: jae .LBB26_2 @@ -2596,7 +2596,7 @@ define i64 @test_unsigned_i64_f16(half %f) nounwind { ; X86-SSE-NEXT: calll __extendhfsf2 ; X86-SSE-NEXT: fstps {{[0-9]+}}(%esp) ; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; X86-SSE-NEXT: ucomiss %xmm2, %xmm0 ; X86-SSE-NEXT: xorps %xmm1, %xmm1 ; X86-SSE-NEXT: jae .LBB27_2 diff --git a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll index 0cced636dddba..4305886168abe 100644 --- a/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-vector-128.ll @@ -19,7 +19,7 @@ define <4 x i1> @test_unsigned_v4i1_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm0[3,3] ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: maxss %xmm2, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: minss %xmm3, %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %eax ; CHECK-NEXT: movd %eax, %xmm1 @@ -54,7 +54,7 @@ define <4 x i8> @test_unsigned_v4i8_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: xorps %xmm3, %xmm3 ; CHECK-NEXT: maxss %xmm0, %xmm3 -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = [2.55E+2,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: movaps %xmm2, %xmm4 ; CHECK-NEXT: minss %xmm3, %xmm4 ; CHECK-NEXT: cvttss2si %xmm4, %eax @@ -99,7 +99,7 @@ define <4 x i16> @test_unsigned_v4i16_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: xorps %xmm3, %xmm3 ; CHECK-NEXT: maxss %xmm1, %xmm3 -; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm4 = [6.5535E+4,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: movaps %xmm4, %xmm1 ; CHECK-NEXT: minss %xmm3, %xmm1 ; CHECK-NEXT: cvttss2si %xmm1, %eax @@ -139,7 +139,7 @@ define <4 x i32> @test_unsigned_v4i32_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: xorps %xmm2, %xmm2 ; CHECK-NEXT: ucomiss %xmm2, %xmm1 ; CHECK-NEXT: cmovbl %eax, %edx -; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm3 = [4.29496704E+9,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: ucomiss %xmm3, %xmm1 ; CHECK-NEXT: movl $-1, %ecx ; CHECK-NEXT: cmoval %ecx, %edx @@ -177,7 +177,7 @@ define <4 x i32> @test_unsigned_v4i32_v4f32(<4 x float> %f) nounwind { define <4 x i64> @test_unsigned_v4i64_v4f32(<4 x float> %f) nounwind { ; CHECK-LABEL: test_unsigned_v4i64_v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: movaps %xmm0, %xmm2 ; CHECK-NEXT: subss %xmm1, %xmm2 ; CHECK-NEXT: cvttss2si %xmm2, %rax @@ -190,7 +190,7 @@ define <4 x i64> @test_unsigned_v4i64_v4f32(<4 x float> %f) nounwind { ; CHECK-NEXT: xorps %xmm3, %xmm3 ; CHECK-NEXT: ucomiss %xmm3, %xmm0 ; CHECK-NEXT: cmovbq %rax, %rdx -; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm4 = [1.8446743E+19,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: ucomiss %xmm4, %xmm0 ; CHECK-NEXT: movq $-1, %rcx ; CHECK-NEXT: cmovaq %rcx, %rdx @@ -352,7 +352,7 @@ define <2 x i1> @test_unsigned_v2i1_v2f64(<2 x double> %f) nounwind { ; CHECK-NEXT: xorpd %xmm2, %xmm2 ; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: maxsd %xmm2, %xmm1 -; CHECK-NEXT: 
movsd {{.*#+}} xmm3 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0] ; CHECK-NEXT: minsd %xmm3, %xmm1 ; CHECK-NEXT: cvttsd2si %xmm1, %rax ; CHECK-NEXT: movq %rax, %xmm1 @@ -374,7 +374,7 @@ define <2 x i8> @test_unsigned_v2i8_v2f64(<2 x double> %f) nounwind { ; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: xorpd %xmm2, %xmm2 ; CHECK-NEXT: maxsd %xmm0, %xmm2 -; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm3 = [2.55E+2,0.0E+0] ; CHECK-NEXT: movapd %xmm3, %xmm4 ; CHECK-NEXT: minsd %xmm2, %xmm4 ; CHECK-NEXT: cvttsd2si %xmm4, %eax @@ -399,7 +399,7 @@ define <2 x i16> @test_unsigned_v2i16_v2f64(<2 x double> %f) nounwind { ; CHECK-NEXT: unpckhpd {{.*#+}} xmm0 = xmm0[1,1] ; CHECK-NEXT: xorpd %xmm2, %xmm2 ; CHECK-NEXT: maxsd %xmm0, %xmm2 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [6.5535E+4,0.0E+0] ; CHECK-NEXT: movapd %xmm0, %xmm3 ; CHECK-NEXT: minsd %xmm2, %xmm3 ; CHECK-NEXT: cvttsd2si %xmm3, %eax @@ -418,7 +418,7 @@ define <2 x i32> @test_unsigned_v2i32_v2f64(<2 x double> %f) nounwind { ; CHECK-NEXT: xorpd %xmm2, %xmm2 ; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: maxsd %xmm0, %xmm1 -; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm3 = [4.294967295E+9,0.0E+0] ; CHECK-NEXT: movapd %xmm3, %xmm4 ; CHECK-NEXT: minsd %xmm1, %xmm4 ; CHECK-NEXT: cvttsd2si %xmm4, %rax @@ -438,7 +438,7 @@ define <2 x i32> @test_unsigned_v2i32_v2f64(<2 x double> %f) nounwind { define <2 x i64> @test_unsigned_v2i64_v2f64(<2 x double> %f) nounwind { ; CHECK-LABEL: test_unsigned_v2i64_v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0] ; CHECK-NEXT: movapd %xmm0, %xmm1 ; CHECK-NEXT: subsd %xmm2, %xmm1 ; CHECK-NEXT: cvttsd2si %xmm1, %rax @@ -451,7 +451,7 @@ define <2 x i64> @test_unsigned_v2i64_v2f64(<2 x double> %f) nounwind { ; CHECK-NEXT: xorpd %xmm3, %xmm3 ; CHECK-NEXT: ucomisd %xmm3, %xmm0 ; CHECK-NEXT: cmovbq %rax, %rdx -; CHECK-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm4 = [1.844674407370955E+19,0.0E+0] ; CHECK-NEXT: ucomisd %xmm4, %xmm0 ; CHECK-NEXT: movq $-1, %rcx ; CHECK-NEXT: cmovaq %rcx, %rdx diff --git a/llvm/test/CodeGen/X86/ftrunc.ll b/llvm/test/CodeGen/X86/ftrunc.ll index 08705e9cdc59c..da44b5ec1371e 100644 --- a/llvm/test/CodeGen/X86/ftrunc.ll +++ b/llvm/test/CodeGen/X86/ftrunc.ll @@ -122,7 +122,7 @@ define <4 x float> @trunc_unsigned_v4f32(<4 x float> %x) #0 { define <2 x double> @trunc_unsigned_v2f64(<2 x double> %x) #0 { ; SSE2-LABEL: trunc_unsigned_v2f64: ; SSE2: # %bb.0: -; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0] ; SSE2-NEXT: movapd %xmm0, %xmm1 ; SSE2-NEXT: subsd %xmm2, %xmm1 ; SSE2-NEXT: cvttsd2si %xmm1, %rax @@ -170,7 +170,7 @@ define <4 x double> @trunc_unsigned_v4f64(<4 x double> %x) #0 { ; SSE2-LABEL: trunc_unsigned_v4f64: ; SSE2: # %bb.0: ; SSE2-NEXT: movapd %xmm1, %xmm2 -; SSE2-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE2-NEXT: movsd {{.*#+}} xmm3 = [9.2233720368547758E+18,0.0E+0] ; SSE2-NEXT: subsd %xmm3, %xmm1 ; SSE2-NEXT: cvttsd2si %xmm1, %rax ; SSE2-NEXT: cvttsd2si %xmm2, %rcx diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index 7225257203161..b42f6fdea34b6 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -334,7 +334,7 @@ define i64 @test_fptoui_i64(ptr %p) #0 { ; CHECK-I686-NEXT: calll __extendhfsf2 ; 
CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) ; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; CHECK-I686-NEXT: ucomiss %xmm1, %xmm0 ; CHECK-I686-NEXT: jae .LBB9_2 ; CHECK-I686-NEXT: # %bb.1: @@ -1066,12 +1066,12 @@ define void @main.158() #0 { ; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0 ; CHECK-LIBCALL-NEXT: callq __truncsfhf2@PLT ; CHECK-LIBCALL-NEXT: callq __extendhfsf2@PLT -; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm1 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-LIBCALL-NEXT: ucomiss %xmm0, %xmm1 ; CHECK-LIBCALL-NEXT: xorps %xmm0, %xmm0 ; CHECK-LIBCALL-NEXT: jae .LBB20_2 ; CHECK-LIBCALL-NEXT: # %bb.1: # %entry -; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-LIBCALL-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; CHECK-LIBCALL-NEXT: .LBB20_2: # %entry ; CHECK-LIBCALL-NEXT: callq __truncsfhf2@PLT ; CHECK-LIBCALL-NEXT: pextrw $0, %xmm0, %eax @@ -1085,11 +1085,11 @@ define void @main.158() #0 { ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm1 ; BWON-F16C-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; BWON-F16C-NEXT: vcvtph2ps %xmm1, %xmm1 -; BWON-F16C-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; BWON-F16C-NEXT: vmovss {{.*#+}} xmm2 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0] ; BWON-F16C-NEXT: vucomiss %xmm1, %xmm2 ; BWON-F16C-NEXT: jae .LBB20_2 ; BWON-F16C-NEXT: # %bb.1: # %entry -; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; BWON-F16C-NEXT: vmovss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; BWON-F16C-NEXT: .LBB20_2: # %entry ; BWON-F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; BWON-F16C-NEXT: vmovd %xmm0, %eax @@ -1105,12 +1105,12 @@ define void @main.158() #0 { ; CHECK-I686-NEXT: movw %ax, (%esp) ; CHECK-I686-NEXT: calll __extendhfsf2 ; CHECK-I686-NEXT: fstps {{[0-9]+}}(%esp) -; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = [8.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-I686-NEXT: ucomiss {{[0-9]+}}(%esp), %xmm0 ; CHECK-I686-NEXT: xorps %xmm0, %xmm0 ; CHECK-I686-NEXT: jae .LBB20_2 ; CHECK-I686-NEXT: # %bb.1: # %entry -; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-I686-NEXT: movss {{.*#+}} xmm0 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; CHECK-I686-NEXT: .LBB20_2: # %entry ; CHECK-I686-NEXT: movss %xmm0, (%esp) ; CHECK-I686-NEXT: calll __truncsfhf2 diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll index 5470fae5fd581..ed7cbb0a8430d 100644 --- a/llvm/test/CodeGen/X86/insertelement-ones.ll +++ b/llvm/test/CodeGen/X86/insertelement-ones.ll @@ -150,7 +150,7 @@ define <4 x i32> @insert_v4i32_01x3(<4 x i32> %a) { define <8 x i32> @insert_v8i32_x12345x7(<8 x i32> %a) { ; SSE2-LABEL: insert_v8i32_x12345x7: ; SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; SSE2-NEXT: movl $-1, %eax ; SSE2-NEXT: movd %eax, %xmm2 @@ -160,7 +160,7 @@ define <8 x i32> @insert_v8i32_x12345x7(<8 x i32> %a) { ; ; SSE3-LABEL: insert_v8i32_x12345x7: ; SSE3: # %bb.0: -; SSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE3-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; SSE3-NEXT: movl $-1, %eax ; SSE3-NEXT: movd 
%eax, %xmm2 @@ -170,7 +170,7 @@ define <8 x i32> @insert_v8i32_x12345x7(<8 x i32> %a) { ; ; SSSE3-LABEL: insert_v8i32_x12345x7: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] ; SSSE3-NEXT: movl $-1, %eax ; SSSE3-NEXT: movd %eax, %xmm2 diff --git a/llvm/test/CodeGen/X86/ldexp.ll b/llvm/test/CodeGen/X86/ldexp.ll index ec128fc6686c8..2be5dec156690 100644 --- a/llvm/test/CodeGen/X86/ldexp.ll +++ b/llvm/test/CodeGen/X86/ldexp.ll @@ -5,7 +5,7 @@ define float @ldexp_f32(i8 zeroext %x) { ; X64-LABEL: ldexp_f32: ; X64: # %bb.0: -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-NEXT: jmp ldexpf@PLT # TAILCALL ; ; WIN32-LABEL: ldexp_f32: @@ -86,7 +86,7 @@ define float @ldexp_f32(i8 zeroext %x) { define double @ldexp_f64(i8 zeroext %x) { ; X64-LABEL: ldexp_f64: ; X64: # %bb.0: -; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; X64-NEXT: jmp ldexp@PLT # TAILCALL ; ; WIN32-LABEL: ldexp_f64: diff --git a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll index 5ebcde3053a7b..3edbcd1fe18eb 100644 --- a/llvm/test/CodeGen/X86/load-scalar-as-vector.ll +++ b/llvm/test/CodeGen/X86/load-scalar-as-vector.ll @@ -587,13 +587,13 @@ define <2 x double> @fsub_op1_constant(ptr %p) nounwind { define <4 x float> @fsub_op0_constant(ptr %p) nounwind { ; SSE-LABEL: fsub_op0_constant: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; SSE-NEXT: subss (%rdi), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fsub_op0_constant: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vsubss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %x = load float, ptr %p @@ -641,13 +641,13 @@ define <2 x double> @fdiv_op1_constant(ptr %p) nounwind { define <4 x float> @fdiv_op0_constant(ptr %p) nounwind { ; SSE-LABEL: fdiv_op0_constant: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; SSE-NEXT: divss (%rdi), %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: fdiv_op0_constant: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vdivss (%rdi), %xmm0, %xmm0 ; AVX-NEXT: retq %x = load float, ptr %p @@ -661,7 +661,7 @@ define <4 x float> @frem_op1_constant(ptr %p) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: pushq %rax ; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; SSE-NEXT: callq fmodf@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: retq @@ -670,7 +670,7 @@ define <4 x float> @frem_op1_constant(ptr %p) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: pushq %rax ; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq fmodf@PLT ; AVX-NEXT: popq %rax ; AVX-NEXT: retq @@ -685,7 +685,7 @@ define <2 x double> @frem_op0_constant(ptr %p) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: pushq %rax ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movsd 
{{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; SSE-NEXT: callq fmod@PLT ; SSE-NEXT: popq %rax ; SSE-NEXT: retq @@ -694,7 +694,7 @@ define <2 x double> @frem_op0_constant(ptr %p) nounwind { ; AVX: # %bb.0: ; AVX-NEXT: pushq %rax ; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; AVX-NEXT: callq fmod@PLT ; AVX-NEXT: popq %rax ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/logical-load-fold.ll b/llvm/test/CodeGen/X86/logical-load-fold.ll index 3890c18694196..1c3f209fc1e66 100644 --- a/llvm/test/CodeGen/X86/logical-load-fold.ll +++ b/llvm/test/CodeGen/X86/logical-load-fold.ll @@ -14,14 +14,14 @@ define double @load_double_no_fold(double %x, double %y) { ; SSE2-LABEL: load_double_no_fold: ; SSE2: # %bb.0: ; SSE2-NEXT: cmplesd %xmm0, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE2-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; SSE2-NEXT: andpd %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: load_double_no_fold: ; AVX: # %bb.0: ; AVX-NEXT: vcmplesd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0] ; AVX-NEXT: vandpd %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq @@ -35,14 +35,14 @@ define float @load_float_no_fold(float %x, float %y) { ; SSE2-LABEL: load_float_no_fold: ; SSE2: # %bb.0: ; SSE2-NEXT: cmpless %xmm0, %xmm1 -; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: andps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; AVX-LABEL: load_float_no_fold: ; AVX: # %bb.0: ; AVX-NEXT: vcmpless %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vandps %xmm1, %xmm0, %xmm0 ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/lsr-static-addr.ll b/llvm/test/CodeGen/X86/lsr-static-addr.ll index 67d6e6d812e11..a06ead491bf7d 100644 --- a/llvm/test/CodeGen/X86/lsr-static-addr.ll +++ b/llvm/test/CodeGen/X86/lsr-static-addr.ll @@ -11,7 +11,7 @@ define void @foo(i64 %n) nounwind { ; CHECK-NEXT: jle .LBB0_3 ; CHECK-NEXT: # %bb.1: # %for.body.preheader ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [2.2999999999999998E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_2: # %for.body ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -30,7 +30,7 @@ define void @foo(i64 %n) nounwind { ; ATOM-NEXT: jle .LBB0_3 ; ATOM-NEXT: # %bb.1: # %for.body.preheader ; ATOM-NEXT: xorl %eax, %eax -; ATOM-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; ATOM-NEXT: movsd {{.*#+}} xmm0 = [2.2999999999999998E+0,0.0E+0] ; ATOM-NEXT: .p2align 4, 0x90 ; ATOM-NEXT: .LBB0_2: # %for.body ; ATOM-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll index 8b626a937ed3d..5828f06bf1c39 100644 --- a/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll +++ b/llvm/test/CodeGen/X86/machine-trace-metrics-crash.ll @@ -19,7 +19,7 @@ define void @PR24199(i32 %a0) { ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %if.then ; CHECK-NEXT: xorps %xmm0, %xmm0 @@ -30,7 +30,7 @@ define void 
@PR24199(i32 %a0) { ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 4-byte Reload ; CHECK-NEXT: # xmm2 = mem[0],zero,zero,zero ; CHECK-NEXT: mulss %xmm0, %xmm2 -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: addss %xmm1, %xmm0 ; CHECK-NEXT: addss %xmm2, %xmm0 ; CHECK-NEXT: movss %xmm0, (%rax) diff --git a/llvm/test/CodeGen/X86/masked-iv-safe.ll b/llvm/test/CodeGen/X86/masked-iv-safe.ll index f5190cff44714..4e4ad3a0161e0 100644 --- a/llvm/test/CodeGen/X86/masked-iv-safe.ll +++ b/llvm/test/CodeGen/X86/masked-iv-safe.ll @@ -8,9 +8,9 @@ define void @count_up(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: count_up: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq $-80, %rax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -54,9 +54,9 @@ define void @count_down(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: count_down: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl $80, %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -100,9 +100,9 @@ define void @count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: count_up_signed: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq $-80, %rax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB2_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -148,9 +148,9 @@ define void @count_down_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: count_down_signed: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl $80, %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB3_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -196,9 +196,9 @@ define void @another_count_up(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_up: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq $-8, %rax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; 
CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB4_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -246,9 +246,9 @@ define void @another_count_down(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_down: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq $-2040, %rax # imm = 0xF808 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: movq %rdi, %rcx ; CHECK-NEXT: movq %rdi, %rdx ; CHECK-NEXT: .p2align 4, 0x90 @@ -300,9 +300,9 @@ define void @another_count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_up_signed: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq $-8, %rax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB6_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -348,9 +348,9 @@ define void @another_count_down_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_down_signed: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl $8, %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB7_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/masked-iv-unsafe.ll b/llvm/test/CodeGen/X86/masked-iv-unsafe.ll index 045c42627a397..f10db3424c2ea 100644 --- a/llvm/test/CodeGen/X86/masked-iv-unsafe.ll +++ b/llvm/test/CodeGen/X86/masked-iv-unsafe.ll @@ -8,9 +8,9 @@ define void @count_up(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: count_up: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl $10, %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -61,9 +61,9 @@ define void @count_down(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: count_down: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl $10, %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB1_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -117,9 +117,9 @@ define void @count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: movl $10, %eax ; CHECK-NEXT: movl $167772160, 
%ecx # imm = 0xA000000 ; CHECK-NEXT: movl $2560, %edx # imm = 0xA00 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB2_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -177,9 +177,9 @@ define void @count_down_signed(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: movq $-10, %rax ; CHECK-NEXT: movl $167772160, %ecx # imm = 0xA000000 ; CHECK-NEXT: movl $2560, %edx # imm = 0xA00 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB3_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -235,9 +235,9 @@ define void @another_count_up(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_up: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB4_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -288,9 +288,9 @@ return: define void @another_count_down(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_down: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB5_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -342,9 +342,9 @@ define void @another_count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: another_count_up_signed: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: movq %rdi, %rdx ; CHECK-NEXT: .p2align 4, 0x90 @@ -406,9 +406,9 @@ define void @another_count_down_signed(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: movq %rsi, %rcx ; CHECK-NEXT: shlq $24, %rcx ; CHECK-NEXT: shlq $8, %rsi -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB7_1: 
# %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -464,9 +464,9 @@ define void @yet_another_count_down(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: yet_another_count_down: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq $-2040, %rax # imm = 0xF808 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: movq %rdi, %rcx ; CHECK-NEXT: movq %rdi, %rdx ; CHECK-NEXT: .p2align 4, 0x90 @@ -518,9 +518,9 @@ define void @yet_another_count_up(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: yet_another_count_up: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB9_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -572,9 +572,9 @@ define void @still_another_count_down(ptr %d, i64 %n) nounwind { ; CHECK-LABEL: still_another_count_down: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl $10, %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB10_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 @@ -626,9 +626,9 @@ define void @yet_another_count_up_signed(ptr %d, i64 %n) nounwind { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq $-10, %rax ; CHECK-NEXT: xorl %ecx, %ecx -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: xorl %edx, %edx ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB11_1: # %loop @@ -687,9 +687,9 @@ define void @yet_another_count_down_signed(ptr %d, i64 %n) nounwind { ; CHECK-NEXT: movl $10, %eax ; CHECK-NEXT: movl $167772160, %ecx # imm = 0xA000000 ; CHECK-NEXT: movl $2560, %edx # imm = 0xA00 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0000000000000001E-1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.2999999999999998E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.5E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB12_1: # %loop ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll index 1e31ee7ad6b59..595f8491b405c 100644 --- a/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/llvm/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -1074,7 +1074,7 @@ define void @merge_4i32_i32_combine(ptr %dst, 
ptr %src) { ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE1-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; X86-SSE1-NEXT: andps %xmm0, %xmm1 ; X86-SSE1-NEXT: movaps %xmm1, (%eax) ; X86-SSE1-NEXT: retl diff --git a/llvm/test/CodeGen/X86/neg_fp.ll b/llvm/test/CodeGen/X86/neg_fp.ll index 0d50ebd475f01..8020982509819 100644 --- a/llvm/test/CodeGen/X86/neg_fp.ll +++ b/llvm/test/CodeGen/X86/neg_fp.ll @@ -30,7 +30,7 @@ define double @negation_propagation(ptr %arg, double %arg1, double %arg2) nounwi ; CHECK-NEXT: movl %esp, %ebp ; CHECK-NEXT: andl $-8, %esp ; CHECK-NEXT: subl $8, %esp -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; CHECK-NEXT: divsd 12(%ebp), %xmm0 ; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero ; CHECK-NEXT: mulsd %xmm0, %xmm1 diff --git a/llvm/test/CodeGen/X86/nontemporal-4.ll b/llvm/test/CodeGen/X86/nontemporal-4.ll index 0f42a9a9cb719..6a8df2445690a 100644 --- a/llvm/test/CodeGen/X86/nontemporal-4.ll +++ b/llvm/test/CodeGen/X86/nontemporal-4.ll @@ -78,7 +78,7 @@ define void @test_constant_v2i64_align1(ptr %dst) nounwind { ; ; SSE4A-LABEL: test_constant_v2i64_align1: ; SSE4A: # %bb.0: -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [4.9406564584124654E-324,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: xorl %eax, %eax ; SSE4A-NEXT: movntiq %rax, (%rdi) @@ -340,7 +340,7 @@ define void @test_constant_v4i64_align1(ptr %dst) nounwind { ; ; SSE4A-LABEL: test_constant_v4i64_align1: ; SSE4A: # %bb.0: -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: xorl %eax, %eax ; SSE4A-NEXT: movntiq %rax, (%rdi) @@ -905,7 +905,7 @@ define void @test_constant_v8i64_align1(ptr %dst) nounwind { ; ; SSE4A-LABEL: test_constant_v8i64_align1: ; SSE4A: # %bb.0: -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) ; SSE4A-NEXT: xorl %eax, %eax ; SSE4A-NEXT: movntiq %rax, (%rdi) diff --git a/llvm/test/CodeGen/X86/oss-fuzz-25184.ll b/llvm/test/CodeGen/X86/oss-fuzz-25184.ll index 8e6b343ab0e0b..87e20e566ae77 100644 --- a/llvm/test/CodeGen/X86/oss-fuzz-25184.ll +++ b/llvm/test/CodeGen/X86/oss-fuzz-25184.ll @@ -6,7 +6,7 @@ define <2 x double> @test_fpext() { ; CHECK-LABEL: test_fpext: ; CHECK: ## %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.9406564584124654E-324,0.0E+0] ; CHECK-NEXT: retq %tmp12 = insertelement <4 x float> undef, float 0.000000e+00, i32 3 %tmp5 = fpext <4 x float> %tmp12 to <4 x double> diff --git a/llvm/test/CodeGen/X86/peep-test-0.ll b/llvm/test/CodeGen/X86/peep-test-0.ll index d31d2620bf9fe..71dadf7edea3a 100644 --- a/llvm/test/CodeGen/X86/peep-test-0.ll +++ b/llvm/test/CodeGen/X86/peep-test-0.ll @@ -7,7 +7,7 @@ define void @loop(i64 %n, ptr nocapture %d) nounwind { ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: shlq $4, %rax ; CHECK-NEXT: addq %rsi, %rax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %bb ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/pow.ll b/llvm/test/CodeGen/X86/pow.ll index fc1593005debc..b3a5268460aa3 100644 
--- a/llvm/test/CodeGen/X86/pow.ll +++ b/llvm/test/CodeGen/X86/pow.ll @@ -87,7 +87,7 @@ define <2 x double> @pow_v2f64_one_fourth_fmf(<2 x double> %x) nounwind { define float @pow_f32_one_fourth_not_enough_fmf(float %x) nounwind { ; CHECK-LABEL: pow_f32_one_fourth_not_enough_fmf: ; CHECK: # %bb.0: -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.5E-1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: jmp powf@PLT # TAILCALL %r = call afn ninf float @llvm.pow.f32(float %x, float 2.5e-01) ret float %r @@ -96,7 +96,7 @@ define float @pow_f32_one_fourth_not_enough_fmf(float %x) nounwind { define double @pow_f64_one_fourth_not_enough_fmf(double %x) nounwind { ; CHECK-LABEL: pow_f64_one_fourth_not_enough_fmf: ; CHECK: # %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.5E-1,0.0E+0] ; CHECK-NEXT: jmp pow@PLT # TAILCALL %r = call nsz ninf double @llvm.pow.f64(double %x, double 2.5e-01) ret double %r @@ -108,23 +108,23 @@ define <4 x float> @pow_v4f32_one_fourth_not_enough_fmf(<4 x float> %x) nounwind ; CHECK-NEXT: subq $56, %rsp ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3] -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.5E-1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.5E-1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: unpcklps (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.5E-1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1] -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.5E-1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -142,12 +142,12 @@ define <2 x double> @pow_v2f64_one_fourth_not_enough_fmf(<2 x double> %x) nounwi ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.5E-1,0.0E+0] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload ; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1] -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.5E-1,0.0E+0] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; CHECK-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] @@ -205,7 +205,7 @@ define x86_fp80 @pow_f80_one_third_fmf(x86_fp80 %x) nounwind { define double @pow_f64_not_exactly_one_third_fmf(double %x) nounwind { ; CHECK-LABEL: pow_f64_not_exactly_one_third_fmf: ; 
CHECK: # %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.3333333333333337E-1,0.0E+0] ; CHECK-NEXT: jmp pow@PLT # TAILCALL %r = call nsz nnan ninf afn double @llvm.pow.f64(double %x, double 0x3fd5555555555556) ret double %r @@ -216,7 +216,7 @@ define double @pow_f64_not_exactly_one_third_fmf(double %x) nounwind { define double @pow_f64_not_enough_fmf(double %x) nounwind { ; CHECK-LABEL: pow_f64_not_enough_fmf: ; CHECK: # %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.3333333333333331E-1,0.0E+0] ; CHECK-NEXT: jmp pow@PLT # TAILCALL %r = call nsz ninf afn double @llvm.pow.f64(double %x, double 0x3fd5555555555555) ret double %r diff --git a/llvm/test/CodeGen/X86/powi-int32min.ll b/llvm/test/CodeGen/X86/powi-int32min.ll index 3a1304c61c49a..b3093a08c496f 100644 --- a/llvm/test/CodeGen/X86/powi-int32min.ll +++ b/llvm/test/CodeGen/X86/powi-int32min.ll @@ -5,7 +5,7 @@ define float @test_powi(ptr %p) nounwind { ; CHECK: # %bb.0: # %bb ; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; CHECK-COUNT-31: mulss %xmm1, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: divss %xmm1, %xmm0 ; CHECK-NEXT: retq bb: diff --git a/llvm/test/CodeGen/X86/pr23103.ll b/llvm/test/CodeGen/X86/pr23103.ll index 2146d32f8cc18..2142ae8657aeb 100644 --- a/llvm/test/CodeGen/X86/pr23103.ll +++ b/llvm/test/CodeGen/X86/pr23103.ll @@ -14,7 +14,7 @@ define <1 x double> @pr23103(ptr align 8 %Vp) { ; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; CHECK-NEXT: callq foo@PLT -; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr37879.ll b/llvm/test/CodeGen/X86/pr37879.ll index 06bf74d13ffae..60ca7c5b6d22b 100644 --- a/llvm/test/CodeGen/X86/pr37879.ll +++ b/llvm/test/CodeGen/X86/pr37879.ll @@ -7,7 +7,7 @@ define double @foo(ptr nocapture readonly) #0 { ; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: vcvtsi2sd %rax, %xmm0, %xmm1 ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] ; CHECK-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1} ; CHECK-NEXT: retq %2 = load i64, ptr undef, align 8 diff --git a/llvm/test/CodeGen/X86/pr40539.ll b/llvm/test/CodeGen/X86/pr40539.ll index f92d4a90bf852..56d80a025fa08 100644 --- a/llvm/test/CodeGen/X86/pr40539.ll +++ b/llvm/test/CodeGen/X86/pr40539.ll @@ -40,10 +40,10 @@ define zeroext i1 @_Z8test_cosv() { ; CHECK-NEXT: subl $8, %esp ; CHECK-NEXT: .cfi_def_cfa_offset 12 ; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = [8.70000004E-1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: divss {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 ; CHECK-NEXT: movss %xmm0, {{[0-9]+}}(%esp) -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [8.60000014E-1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: flds {{[0-9]+}}(%esp) ; CHECK-NEXT: #APP ; CHECK-NEXT: fcos diff --git a/llvm/test/CodeGen/X86/pr44749.ll b/llvm/test/CodeGen/X86/pr44749.ll index cc9963dc2d8cc..9df2a4b14dcf4 100644 --- a/llvm/test/CodeGen/X86/pr44749.ll +++ b/llvm/test/CodeGen/X86/pr44749.ll @@ -15,10 +15,10 @@ define i32 @a() { ; CHECK-NEXT: setne %al ; CHECK-NEXT: movzbl %al, 
%eax
 ; CHECK-NEXT: cvtsi2sd %eax, %xmm0
-; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = [1.0E+2,0.0E+0]
 ; CHECK-NEXT: subsd %xmm2, %xmm0
-; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm3 = [1.0E+0,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = [3.1400000000000001E+0,0.0E+0]
 ; CHECK-NEXT: cmplesd %xmm1, %xmm0
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: andpd %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/pr59258.ll b/llvm/test/CodeGen/X86/pr59258.ll
index 61ddb24eaaf87..e5f5ca71739df 100644
--- a/llvm/test/CodeGen/X86/pr59258.ll
+++ b/llvm/test/CodeGen/X86/pr59258.ll
@@ -90,14 +90,14 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind {
 ; CHECK-NEXT: callq fmaxf@PLT
 ; CHECK-NEXT: callq __truncsfhf2@PLT
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: callq __truncsfhf2@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: callq __truncsfhf2@PLT
 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -106,14 +106,14 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind {
 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: callq __truncsfhf2@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT: movd (%rsp), %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: callq __truncsfhf2@PLT
 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -124,14 +124,14 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind {
 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: callq __truncsfhf2@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: callq __truncsfhf2@PLT
 ; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
@@ -140,14 +140,14 @@ define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind {
 ; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
 ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: callq __truncsfhf2@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
 ; CHECK-NEXT: callq __extendhfsf2@PLT
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: callq __truncsfhf2@PLT
 ; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
diff --git a/llvm/test/CodeGen/X86/pr59305.ll b/llvm/test/CodeGen/X86/pr59305.ll
index cb98e9d5dda8e..c2f6d21a41d4d 100644
--- a/llvm/test/CodeGen/X86/pr59305.ll
+++ b/llvm/test/CodeGen/X86/pr59305.ll
@@ -8,17 +8,17 @@ define double @foo(double %0) #0 {
 ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
 ; CHECK-NEXT: movl $1024, %edi # imm = 0x400
 ; CHECK-NEXT: callq fesetround@PLT
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
 ; CHECK-NEXT: divsd (%rsp), %xmm1 # 8-byte Folded Reload
 ; CHECK-NEXT: movsd %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: movl $1024, %edi # imm = 0x400
 ; CHECK-NEXT: callq fesetround@PLT
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0]
 ; CHECK-NEXT: divsd (%rsp), %xmm0 # 8-byte Folded Reload
 ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; CHECK-NEXT: movl $1024, %edi # imm = 0x400
 ; CHECK-NEXT: callq fesetround@PLT
-; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
 ; CHECK-NEXT: divsd (%rsp), %xmm2 # 8-byte Folded Reload
 ; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 8-byte Reload
 ; CHECK-NEXT: # xmm0 = mem[0],zero
diff --git a/llvm/test/CodeGen/X86/pseudo_cmov_lower2.ll b/llvm/test/CodeGen/X86/pseudo_cmov_lower2.ll
index 6a009ec0efed7..f2c1269143fbc 100644
--- a/llvm/test/CodeGen/X86/pseudo_cmov_lower2.ll
+++ b/llvm/test/CodeGen/X86/pseudo_cmov_lower2.ll
@@ -29,7 +29,7 @@ define double @foo2(float %p1, double %p2, double %p3) nounwind {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: xorps %xmm3, %xmm3
 ; CHECK-NEXT: ucomiss %xmm3, %xmm0
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.25E+0,0.0E+0]
 ; CHECK-NEXT: jae .LBB1_1
 ; CHECK-NEXT: # %bb.2: # %entry
 ; CHECK-NEXT: addsd %xmm0, %xmm2
@@ -122,7 +122,7 @@ define double @foo5(float %p1, double %p2, double %p3) nounwind {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: xorps %xmm3, %xmm3
 ; CHECK-NEXT: ucomiss %xmm3, %xmm0
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.25E+0,0.0E+0]
 ; CHECK-NEXT: jae .LBB4_1
 ; CHECK-NEXT: # %bb.2: # %select.false
 ; CHECK-NEXT: addsd %xmm2, %xmm0
@@ -156,7 +156,7 @@ define double @foo6(float %p1, double %p2, double %p3) nounwind {
 ; CHECK-NEXT: movaps %xmm0, %xmm3
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: ucomiss %xmm0, %xmm3
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.25E+0,0.0E+0]
 ; CHECK-NEXT: jae .LBB5_1
 ; CHECK-NEXT: # %bb.2: # %select.false
 ; CHECK-NEXT: addsd %xmm2, %xmm0
@@ -203,7 +203,7 @@ define double @foo1_g(float %p1, double %p2, double %p3) nounwind !dbg !4 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: xorps %xmm3, %xmm3
 ; CHECK-NEXT: ucomiss %xmm3, %xmm0
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.25E+0,0.0E+0]
 ; CHECK-NEXT: jae .LBB6_1
 ; CHECK-NEXT: # %bb.2: # %entry
 ; CHECK-NEXT: addsd %xmm2, %xmm0
diff --git a/llvm/test/CodeGen/X86/recip-fastmath.ll b/llvm/test/CodeGen/X86/recip-fastmath.ll
index 7e9bbc5556424..bb8c074f97e5e 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath.ll
@@ -21,14 +21,14 @@
 define float @f32_no_estimate(float %x) #0 {
 ; SSE-LABEL: f32_no_estimate:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: divss %xmm0, %xmm1
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: f32_no_estimate:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
 %div = fdiv fast float 1.0, %x
@@ -40,7 +40,7 @@ define float @f32_one_step(float %x) #1 {
 ; SSE: # %bb.0:
 ; SSE-NEXT: rcpss %xmm0, %xmm2
 ; SSE-NEXT: mulss %xmm2, %xmm0
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: subss %xmm0, %xmm1
 ; SSE-NEXT: mulss %xmm2, %xmm1
 ; SSE-NEXT: addss %xmm2, %xmm1
@@ -51,7 +51,7 @@ define float @f32_one_step(float %x) #1 {
 ; AVX-RECIP: # %bb.0:
 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
@@ -73,7 +73,7 @@ define float @f32_one_step(float %x) #1 {
 ;
 ; BTVER2-LABEL: f32_one_step:
 ; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0
@@ -85,7 +85,7 @@ define float @f32_one_step(float %x) #1 {
 ; SANDY: # %bb.0:
 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0
 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0
@@ -102,7 +102,7 @@ define float @f32_one_step(float %x) #1 {
 ; HASWELL-NO-FMA: # %bb.0:
 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
@@ -211,7 +211,7 @@ define float @f32_two_step(float %x) #2 {
 ; SSE-NEXT: rcpss %xmm0, %xmm2
 ; SSE-NEXT: movaps %xmm0, %xmm3
 ; SSE-NEXT: mulss %xmm2, %xmm3
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: movaps %xmm1, %xmm4
 ; SSE-NEXT: subss %xmm3, %xmm4
 ; SSE-NEXT: mulss %xmm2, %xmm4
@@ -227,7 +227,7 @@ define float @f32_two_step(float %x) #2 {
 ; AVX-RECIP: # %bb.0:
 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2
 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2
 ; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1
@@ -240,7 +240,7 @@ define float @f32_two_step(float %x) #2 {
 ; FMA-RECIP-LABEL: f32_two_step:
 ; FMA-RECIP: # %bb.0:
 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; FMA-RECIP-NEXT: vmovaps %xmm1, %xmm3
 ; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
 ; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
@@ -251,7 +251,7 @@ define float @f32_two_step(float %x) #2 {
 ; BDVER2-LABEL: f32_two_step:
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; BDVER2-NEXT: vfmsubss {{.*#+}} xmm3 = (xmm0 * xmm1) - xmm2
 ; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm1 * xmm3) + xmm1
 ; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm1) - xmm2
@@ -260,7 +260,7 @@ define float @f32_two_step(float %x) #2 {
 ;
 ; BTVER2-LABEL: f32_two_step:
 ; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2
 ; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2
@@ -276,7 +276,7 @@ define float @f32_two_step(float %x) #2 {
 ; SANDY: # %bb.0:
 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SANDY-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2
 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2
 ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1
@@ -289,7 +289,7 @@ define float @f32_two_step(float %x) #2 {
 ; HASWELL-LABEL: f32_two_step:
 ; HASWELL: # %bb.0:
 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; HASWELL-NEXT: vmovaps %xmm1, %xmm3
 ; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
 ; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
@@ -301,7 +301,7 @@ define float @f32_two_step(float %x) #2 {
 ; HASWELL-NO-FMA: # %bb.0:
 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2
 ; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1
@@ -314,7 +314,7 @@ define float @f32_two_step(float %x) #2 {
 ; AVX512-LABEL: f32_two_step:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512-NEXT: vmovaps %xmm1, %xmm3
 ; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm3 = (xmm0 * xmm3) - xmm2
 ; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm3 = -(xmm3 * xmm1) + xmm1
diff --git a/llvm/test/CodeGen/X86/recip-fastmath2.ll b/llvm/test/CodeGen/X86/recip-fastmath2.ll
index 2a5e46bba2c00..042069703af21 100644
--- a/llvm/test/CodeGen/X86/recip-fastmath2.ll
+++ b/llvm/test/CodeGen/X86/recip-fastmath2.ll
@@ -32,7 +32,7 @@ define float @f32_one_step_2(float %x) #1 {
 ; SSE-LABEL: f32_one_step_2:
 ; SSE: # %bb.0:
 ; SSE-NEXT: rcpss %xmm0, %xmm2
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: movaps %xmm2, %xmm3
 ; SSE-NEXT: mulss %xmm1, %xmm3
 ; SSE-NEXT: mulss %xmm3, %xmm0
@@ -45,7 +45,7 @@ define float @f32_one_step_2(float %x) #1 {
 ; AVX-RECIP-LABEL: f32_one_step_2:
 ; AVX-RECIP: # %bb.0:
 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0
 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
@@ -56,7 +56,7 @@ define float @f32_one_step_2(float %x) #1 {
 ; FMA-RECIP-LABEL: f32_one_step_2:
 ; FMA-RECIP: # %bb.0:
 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
 ; FMA-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
@@ -65,7 +65,7 @@ define float @f32_one_step_2(float %x) #1 {
 ; BDVER2-LABEL: f32_one_step_2:
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BDVER2-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
 ; BDVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm2
 ; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
@@ -73,7 +73,7 @@ define float @f32_one_step_2(float %x) #1 {
 ;
 ; BTVER2-LABEL: f32_one_step_2:
 ; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; BTVER2-NEXT: vmulss %xmm3, %xmm0, %xmm0
@@ -85,7 +85,7 @@ define float @f32_one_step_2(float %x) #1 {
 ; SANDY-LABEL: f32_one_step_2:
 ; SANDY: # %bb.0:
 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0
 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0
@@ -96,7 +96,7 @@ define float @f32_one_step_2(float %x) #1 {
 ; HASWELL-LABEL: f32_one_step_2:
 ; HASWELL: # %bb.0:
 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
 ; HASWELL-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
@@ -105,7 +105,7 @@ define float @f32_one_step_2(float %x) #1 {
 ; HASWELL-NO-FMA-LABEL: f32_one_step_2:
 ; HASWELL-NO-FMA: # %bb.0:
 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0
 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
@@ -116,7 +116,7 @@ define float @f32_one_step_2(float %x) #1 {
 ; AVX512-LABEL: f32_one_step_2:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm2
 ; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm1 * xmm0) + xmm3
@@ -130,11 +130,11 @@ define float @f32_one_step_2_divs(float %x) #1 {
 ; SSE: # %bb.0:
 ; SSE-NEXT: rcpss %xmm0, %xmm1
 ; SSE-NEXT: mulss %xmm1, %xmm0
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: subss %xmm0, %xmm2
 ; SSE-NEXT: mulss %xmm1, %xmm2
 ; SSE-NEXT: addss %xmm1, %xmm2
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm0 = [3.456E+3,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: mulss %xmm2, %xmm0
 ; SSE-NEXT: mulss %xmm2, %xmm0
 ; SSE-NEXT: retq
@@ -143,7 +143,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
 ; AVX-RECIP: # %bb.0:
 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
 ; AVX-RECIP-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; AVX-RECIP-NEXT: vaddss %xmm0, %xmm1, %xmm0
@@ -171,7 +171,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
 ;
 ; BTVER2-LABEL: f32_one_step_2_divs:
 ; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; BTVER2-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm0
 ; BTVER2-NEXT: vsubss %xmm0, %xmm2, %xmm0
@@ -185,7 +185,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
 ; SANDY: # %bb.0:
 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0
 ; SANDY-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; SANDY-NEXT: vaddss %xmm0, %xmm1, %xmm0
@@ -206,7 +206,7 @@ define float @f32_one_step_2_divs(float %x) #1 {
 ; HASWELL-NO-FMA: # %bb.0:
 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm0
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm0, %xmm1, %xmm0
 ; HASWELL-NO-FMA-NEXT: vaddss %xmm0, %xmm1, %xmm0
@@ -233,11 +233,11 @@ define float @f32_two_step_2(float %x) #2 {
 ; SSE-NEXT: rcpss %xmm0, %xmm1
 ; SSE-NEXT: movaps %xmm0, %xmm2
 ; SSE-NEXT: mulss %xmm1, %xmm2
-; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: subss %xmm2, %xmm3
 ; SSE-NEXT: mulss %xmm1, %xmm3
 ; SSE-NEXT: addss %xmm1, %xmm3
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: movaps %xmm3, %xmm2
 ; SSE-NEXT: mulss %xmm1, %xmm2
 ; SSE-NEXT: mulss %xmm2, %xmm0
@@ -251,11 +251,11 @@ define float @f32_two_step_2(float %x) #2 {
 ; AVX-RECIP: # %bb.0:
 ; AVX-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; AVX-RECIP-NEXT: vmulss %xmm1, %xmm0, %xmm2
-; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-RECIP-NEXT: vsubss %xmm2, %xmm3, %xmm2
 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm2
 ; AVX-RECIP-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-RECIP-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; AVX-RECIP-NEXT: vmulss %xmm3, %xmm0, %xmm0
 ; AVX-RECIP-NEXT: vsubss %xmm0, %xmm2, %xmm0
@@ -266,10 +266,10 @@ define float @f32_two_step_2(float %x) #2 {
 ; FMA-RECIP-LABEL: f32_two_step_2:
 ; FMA-RECIP: # %bb.0:
 ; FMA-RECIP-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; FMA-RECIP-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
 ; FMA-RECIP-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
-; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; FMA-RECIP-NEXT: vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
 ; FMA-RECIP-NEXT: vmulss %xmm1, %xmm2, %xmm3
 ; FMA-RECIP-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
 ; FMA-RECIP-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
@@ -279,7 +279,7 @@ define float @f32_two_step_2(float %x) #2 {
 ; BDVER2: # %bb.0:
 ; BDVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; BDVER2-NEXT: vfmsubss {{.*#+}} xmm2 = (xmm0 * xmm1) - mem
-; BDVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; BDVER2-NEXT: vmovss {{.*#+}} xmm4 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
 ; BDVER2-NEXT: vfnmaddss {{.*#+}} xmm1 = -(xmm1 * xmm2) + xmm1
 ; BDVER2-NEXT: vmulss %xmm4, %xmm1, %xmm3
 ; BDVER2-NEXT: vfmsubss {{.*#+}} xmm0 = (xmm0 * xmm3) - xmm4
@@ -288,9 +288,9 @@ define float @f32_two_step_2(float %x) #2 {
 ;
 ; BTVER2-LABEL: f32_two_step_2:
 ; BTVER2: # %bb.0:
-; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; BTVER2-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; BTVER2-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; BTVER2-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero
+; BTVER2-NEXT: vmovss {{.*#+}} xmm4 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
 ; BTVER2-NEXT: vmulss %xmm1, %xmm0, %xmm2
 ; BTVER2-NEXT: vsubss %xmm2, %xmm3, %xmm2
 ; BTVER2-NEXT: vmulss %xmm2, %xmm1, %xmm2
@@ -306,11 +306,11 @@ define float @f32_two_step_2(float %x) #2 {
 ; SANDY: # %bb.0:
 ; SANDY-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; SANDY-NEXT: vmulss %xmm1, %xmm0, %xmm2
-; SANDY-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SANDY-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SANDY-NEXT: vsubss %xmm2, %xmm3, %xmm2
 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm2
 ; SANDY-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; SANDY-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SANDY-NEXT: vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
 ; SANDY-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; SANDY-NEXT: vmulss %xmm3, %xmm0, %xmm0
 ; SANDY-NEXT: vsubss %xmm0, %xmm2, %xmm0
@@ -321,10 +321,10 @@ define float @f32_two_step_2(float %x) #2 {
 ; HASWELL-LABEL: f32_two_step_2:
 ; HASWELL: # %bb.0:
 ; HASWELL-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; HASWELL-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
 ; HASWELL-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
-; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; HASWELL-NEXT: vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
 ; HASWELL-NEXT: vmulss %xmm1, %xmm2, %xmm3
 ; HASWELL-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
 ; HASWELL-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
@@ -334,11 +334,11 @@ define float @f32_two_step_2(float %x) #2 {
 ; HASWELL-NO-FMA: # %bb.0:
 ; HASWELL-NO-FMA-NEXT: vrcpss %xmm0, %xmm0, %xmm1
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm1, %xmm0, %xmm2
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm3 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; HASWELL-NO-FMA-NEXT: vsubss %xmm2, %xmm3, %xmm2
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm2
 ; HASWELL-NO-FMA-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; HASWELL-NO-FMA-NEXT: vmovss {{.*#+}} xmm2 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm2, %xmm1, %xmm3
 ; HASWELL-NO-FMA-NEXT: vmulss %xmm3, %xmm0, %xmm0
 ; HASWELL-NO-FMA-NEXT: vsubss %xmm0, %xmm2, %xmm0
@@ -349,10 +349,10 @@ define float @f32_two_step_2(float %x) #2 {
 ; AVX512-LABEL: f32_two_step_2:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vrcpss %xmm0, %xmm0, %xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512-NEXT: vfmsub231ss {{.*#+}} xmm2 = (xmm0 * xmm1) - xmm2
 ; AVX512-NEXT: vfnmadd132ss {{.*#+}} xmm2 = -(xmm2 * xmm1) + xmm1
-; AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512-NEXT: vmovss {{.*#+}} xmm1 = [6.789E+3,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512-NEXT: vmulss %xmm1, %xmm2, %xmm3
 ; AVX512-NEXT: vfmsub213ss {{.*#+}} xmm0 = (xmm3 * xmm0) - xmm1
 ; AVX512-NEXT: vfnmadd213ss {{.*#+}} xmm0 = -(xmm2 * xmm0) + xmm3
diff --git a/llvm/test/CodeGen/X86/recip-pic.ll b/llvm/test/CodeGen/X86/recip-pic.ll
index c04aa394c309e..d01ecc1e2ce1e 100644
--- a/llvm/test/CodeGen/X86/recip-pic.ll
+++ b/llvm/test/CodeGen/X86/recip-pic.ll
@@ -11,7 +11,7 @@ define fastcc float @foo(float %x) unnamed_addr #0 {
 ; CHECK-NEXT: .cfi_adjust_cfa_offset -4
 ; CHECK-NEXT: .Ltmp0:
 ; CHECK-NEXT: addl $_GLOBAL_OFFSET_TABLE_+(.Ltmp0-.L0$pb), %eax
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: divss %xmm0, %xmm1
 ; CHECK-NEXT: movaps %xmm1, %xmm0
 ; CHECK-NEXT: movss %xmm1, (%eax)
diff --git a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
index 718b8b558c9b2..86d4be9cb7af6 100644
--- a/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
+++ b/llvm/test/CodeGen/X86/scalar-fp-to-i64.ll
@@ -68,7 +68,7 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-AVX512F-WIN-NEXT: andl $-8, %esp
 ; X86-AVX512F-WIN-NEXT: subl $8, %esp
 ; X86-AVX512F-WIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX512F-WIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512F-WIN-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; X86-AVX512F-WIN-NEXT: xorl %edx, %edx
 ; X86-AVX512F-WIN-NEXT: vucomiss %xmm0, %xmm1
 ; X86-AVX512F-WIN-NEXT: setbe %dl
@@ -89,7 +89,7 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-AVX512F-LIN: # %bb.0:
 ; X86-AVX512F-LIN-NEXT: subl $12, %esp
 ; X86-AVX512F-LIN-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-AVX512F-LIN-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512F-LIN-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; X86-AVX512F-LIN-NEXT: xorl %edx, %edx
 ; X86-AVX512F-LIN-NEXT: vucomiss %xmm0, %xmm1
 ; X86-AVX512F-LIN-NEXT: setbe %dl
@@ -112,7 +112,7 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-SSE3-WIN-NEXT: andl $-8, %esp
 ; X86-SSE3-WIN-NEXT: subl $8, %esp
 ; X86-SSE3-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE3-WIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE3-WIN-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE3-WIN-NEXT: ucomiss %xmm0, %xmm1
 ; X86-SSE3-WIN-NEXT: jbe LBB0_2
 ; X86-SSE3-WIN-NEXT: # %bb.1:
@@ -135,7 +135,7 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-SSE3-LIN: # %bb.0:
 ; X86-SSE3-LIN-NEXT: subl $12, %esp
 ; X86-SSE3-LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE3-LIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE3-LIN-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE3-LIN-NEXT: ucomiss %xmm0, %xmm1
 ; X86-SSE3-LIN-NEXT: jbe .LBB0_2
 ; X86-SSE3-LIN-NEXT: # %bb.1:
@@ -182,7 +182,7 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-SSE2-WIN-NEXT: andl $-8, %esp
 ; X86-SSE2-WIN-NEXT: subl $16, %esp
 ; X86-SSE2-WIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-WIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE2-WIN-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE2-WIN-NEXT: ucomiss %xmm0, %xmm1
 ; X86-SSE2-WIN-NEXT: jbe LBB0_2
 ; X86-SSE2-WIN-NEXT: # %bb.1:
@@ -211,7 +211,7 @@ define i64 @f_to_u64(float %a) nounwind {
 ; X86-SSE2-LIN: # %bb.0:
 ; X86-SSE2-LIN-NEXT: subl $20, %esp
 ; X86-SSE2-LIN-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; X86-SSE2-LIN-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE2-LIN-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE2-LIN-NEXT: ucomiss %xmm0, %xmm1
 ; X86-SSE2-LIN-NEXT: jbe .LBB0_2
 ; X86-SSE2-LIN-NEXT: # %bb.1:
@@ -501,7 +501,7 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-AVX512F-WIN-NEXT: andl $-8, %esp
 ; X86-AVX512F-WIN-NEXT: subl $8, %esp
 ; X86-AVX512F-WIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX512F-WIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX512F-WIN-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; X86-AVX512F-WIN-NEXT: xorl %edx, %edx
 ; X86-AVX512F-WIN-NEXT: vucomisd %xmm0, %xmm1
 ; X86-AVX512F-WIN-NEXT: setbe %dl
@@ -522,7 +522,7 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-AVX512F-LIN: # %bb.0:
 ; X86-AVX512F-LIN-NEXT: subl $12, %esp
 ; X86-AVX512F-LIN-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; X86-AVX512F-LIN-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX512F-LIN-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; X86-AVX512F-LIN-NEXT: xorl %edx, %edx
 ; X86-AVX512F-LIN-NEXT: vucomisd %xmm0, %xmm1
 ; X86-AVX512F-LIN-NEXT: setbe %dl
@@ -545,7 +545,7 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-SSE3-WIN-NEXT: andl $-8, %esp
 ; X86-SSE3-WIN-NEXT: subl $8, %esp
 ; X86-SSE3-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE3-WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE3-WIN-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; X86-SSE3-WIN-NEXT: ucomisd %xmm0, %xmm1
 ; X86-SSE3-WIN-NEXT: jbe LBB2_2
 ; X86-SSE3-WIN-NEXT: # %bb.1:
@@ -568,7 +568,7 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-SSE3-LIN: # %bb.0:
 ; X86-SSE3-LIN-NEXT: subl $12, %esp
 ; X86-SSE3-LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE3-LIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE3-LIN-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; X86-SSE3-LIN-NEXT: ucomisd %xmm0, %xmm1
 ; X86-SSE3-LIN-NEXT: jbe .LBB2_2
 ; X86-SSE3-LIN-NEXT: # %bb.1:
@@ -615,7 +615,7 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-SSE2-WIN-NEXT: andl $-8, %esp
 ; X86-SSE2-WIN-NEXT: subl $16, %esp
 ; X86-SSE2-WIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE2-WIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE2-WIN-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; X86-SSE2-WIN-NEXT: ucomisd %xmm0, %xmm1
 ; X86-SSE2-WIN-NEXT: jbe LBB2_2
 ; X86-SSE2-WIN-NEXT: # %bb.1:
@@ -644,7 +644,7 @@ define i64 @d_to_u64(double %a) nounwind {
 ; X86-SSE2-LIN: # %bb.0:
 ; X86-SSE2-LIN-NEXT: subl $20, %esp
 ; X86-SSE2-LIN-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-SSE2-LIN-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE2-LIN-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; X86-SSE2-LIN-NEXT: ucomisd %xmm0, %xmm1
 ; X86-SSE2-LIN-NEXT: jbe .LBB2_2
 ; X86-SSE2-LIN-NEXT: # %bb.1:
diff --git a/llvm/test/CodeGen/X86/scalarize-fp.ll b/llvm/test/CodeGen/X86/scalarize-fp.ll
index 8379d20b603e3..ef04437c7951f 100644
--- a/llvm/test/CodeGen/X86/scalarize-fp.ll
+++ b/llvm/test/CodeGen/X86/scalarize-fp.ll
@@ -38,14 +38,14 @@ define <4 x float> @load_fadd_op1_constant_v4f32(ptr %p) nounwind {
 define <4 x float> @fsub_op0_constant_v4f32(float %x) nounwind {
 ; SSE-LABEL: fsub_op0_constant_v4f32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: subss %xmm0, %xmm1
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: fsub_op0_constant_v4f32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
 %v = insertelement <4 x float> undef, float %x, i32 0
@@ -56,13 +56,13 @@ define <4 x float> @fsub_op0_constant_v4f32(float %x) nounwind {
 define <4 x float> @load_fsub_op0_constant_v4f32(ptr %p) nounwind {
 ; SSE-LABEL: load_fsub_op0_constant_v4f32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: subss (%rdi), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: load_fsub_op0_constant_v4f32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vsubss (%rdi), %xmm0, %xmm0
 ; AVX-NEXT: retq
 %x = load float, ptr %p
@@ -140,14 +140,14 @@ define <4 x float> @load_fdiv_op1_constant_v4f32(ptr %p) nounwind {
 define <4 x float> @fdiv_op0_constant_v4f32(float %x) nounwind {
 ; SSE-LABEL: fdiv_op0_constant_v4f32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: divss %xmm0, %xmm1
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: fdiv_op0_constant_v4f32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
 %v = insertelement <4 x float> undef, float %x, i32 0
@@ -158,13 +158,13 @@ define <4 x float> @fdiv_op0_constant_v4f32(float %x) nounwind {
 define <4 x float> @load_fdiv_op0_constant_v4f32(ptr %p) nounwind {
 ; SSE-LABEL: load_fdiv_op0_constant_v4f32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: divss (%rdi), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: load_fdiv_op0_constant_v4f32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss (%rdi), %xmm0, %xmm0
 ; AVX-NEXT: retq
 %x = load float, ptr %p
@@ -209,14 +209,14 @@ define <4 x double> @load_fadd_op1_constant_v4f64(ptr %p) nounwind {
 define <4 x double> @fsub_op0_constant_v4f64(double %x) nounwind {
 ; SSE-LABEL: fsub_op0_constant_v4f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: subsd %xmm0, %xmm1
 ; SSE-NEXT: movapd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: fsub_op0_constant_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
 %v = insertelement <4 x double> undef, double %x, i32 0
@@ -227,13 +227,13 @@ define <4 x double> @fsub_op0_constant_v4f64(double %x) nounwind {
 define <4 x double> @load_fsub_op0_constant_v4f64(ptr %p) nounwind {
 ; SSE-LABEL: load_fsub_op0_constant_v4f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: subsd (%rdi), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: load_fsub_op0_constant_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: vsubsd (%rdi), %xmm0, %xmm0
 ; AVX-NEXT: retq
 %x = load double, ptr %p
@@ -311,14 +311,14 @@ define <4 x double> @load_fdiv_op1_constant_v4f64(ptr %p) nounwind {
 define <4 x double> @fdiv_op0_constant_v4f64(double %x) nounwind {
 ; SSE-LABEL: fdiv_op0_constant_v4f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: divsd %xmm0, %xmm1
 ; SSE-NEXT: movapd %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: fdiv_op0_constant_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
 %v = insertelement <4 x double> undef, double %x, i32 0
@@ -329,13 +329,13 @@ define <4 x double> @fdiv_op0_constant_v4f64(double %x) nounwind {
 define <4 x double> @load_fdiv_op0_constant_v4f64(ptr %p) nounwind {
 ; SSE-LABEL: load_fdiv_op0_constant_v4f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: divsd (%rdi), %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: load_fdiv_op0_constant_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: vdivsd (%rdi), %xmm0, %xmm0
 ; AVX-NEXT: retq
 %x = load double, ptr %p
@@ -524,7 +524,7 @@ define <2 x double> @fadd_splat_const_op1_v2f64(<2 x double> %vx) {
 define <4 x double> @fsub_const_op0_splat_v4f64(double %x) {
 ; SSE-LABEL: fsub_const_op0_splat_v4f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [8.0E+0,0.0E+0]
 ; SSE-NEXT: subsd %xmm0, %xmm1
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
 ; SSE-NEXT: movapd %xmm1, %xmm0
@@ -532,7 +532,7 @@ define <4 x double> @fsub_const_op0_splat_v4f64(double %x) {
 ;
 ; AVX-LABEL: fsub_const_op0_splat_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [8.0E+0,0.0E+0]
 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -565,7 +565,7 @@ define <4 x float> @fmul_splat_const_op1_v4f32(<4 x float> %vx, <4 x float> %vy)
 define <8 x float> @fdiv_splat_const_op0_v8f32(<8 x float> %vy) {
 ; SSE-LABEL: fdiv_splat_const_op0_v8f32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = [4.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: divss %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE-NEXT: movaps %xmm1, %xmm0
@@ -573,7 +573,7 @@ define <8 x float> @fdiv_splat_const_op0_v8f32(<8 x float> %vy) {
 ;
 ; AVX-LABEL: fdiv_splat_const_op0_v8f32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -700,7 +700,7 @@ define <2 x double> @splat0_fadd_const_op1_v2f64(<2 x double> %vx) {
 define <4 x double> @splat0_fsub_const_op0_v4f64(double %x) {
 ; SSE-LABEL: splat0_fsub_const_op0_v4f64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [-4.2E+1,0.0E+0]
 ; SSE-NEXT: subsd %xmm0, %xmm1
 ; SSE-NEXT: unpcklpd {{.*#+}} xmm1 = xmm1[0,0]
 ; SSE-NEXT: movapd %xmm1, %xmm0
@@ -708,7 +708,7 @@ define <4 x double> @splat0_fsub_const_op0_v4f64(double %x) {
 ;
 ; AVX-LABEL: splat0_fsub_const_op0_v4f64:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [-4.2E+1,0.0E+0]
 ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vmovddup {{.*#+}} xmm0 = xmm0[0,0]
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
@@ -756,7 +756,7 @@ define <8 x float> @splat0_fdiv_const_op1_v8f32(<8 x float> %vx) {
 define <8 x float> @splat0_fdiv_const_op0_v8f32(<8 x float> %vx) {
 ; SSE-LABEL: splat0_fdiv_const_op0_v8f32:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: divss %xmm0, %xmm1
 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
 ; SSE-NEXT: movaps %xmm1, %xmm0
@@ -764,7 +764,7 @@ define <8 x float> @splat0_fdiv_const_op0_v8f32(<8 x float> %vx) {
 ;
 ; AVX-LABEL: splat0_fdiv_const_op0_v8f32:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
diff --git a/llvm/test/CodeGen/X86/select-of-fp-constants.ll b/llvm/test/CodeGen/X86/select-of-fp-constants.ll
index 3ddeeee1bce04..76b8ea8e2b8a2 100644
--- a/llvm/test/CodeGen/X86/select-of-fp-constants.ll
+++ b/llvm/test/CodeGen/X86/select-of-fp-constants.ll
@@ -42,7 +42,7 @@ define float @icmp_select_fp_constants(i32 %x) nounwind readnone {
 define float @fcmp_select_fp_constants(float %x) nounwind readnone {
 ; X86-SSE-LABEL: fcmp_select_fp_constants:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [-4.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X86-SSE-NEXT: cmpneqss {{[0-9]+}}(%esp), %xmm0
 ; X86-SSE-NEXT: movd %xmm0, %eax
 ; X86-SSE-NEXT: andl $1, %eax
@@ -51,7 +51,7 @@ define float @fcmp_select_fp_constants(float %x) nounwind readnone {
 ;
 ; X86-AVX2-LABEL: fcmp_select_fp_constants:
 ; X86-AVX2: # %bb.0:
-; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX2-NEXT: vmovss {{.*#+}} xmm0 = [-4.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X86-AVX2-NEXT: vcmpneqss {{[0-9]+}}(%esp), %xmm0, %xmm0
 ; X86-AVX2-NEXT: vmovd %xmm0, %eax
 ; X86-AVX2-NEXT: andl $1, %eax
@@ -85,7 +85,7 @@ define float @fcmp_select_fp_constants(float %x) nounwind readnone {
 ; X64-AVX512F-LABEL: fcmp_select_fp_constants:
 ; X64-AVX512F: # %bb.0:
 ; X64-AVX512F-NEXT: vcmpneqss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
-; X64-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX512F-NEXT: vmovss {{.*#+}} xmm0 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; X64-AVX512F-NEXT: vmovss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1}
 ; X64-AVX512F-NEXT: retq
 %c = fcmp une float %x, -4.0
diff --git a/llvm/test/CodeGen/X86/select.ll b/llvm/test/CodeGen/X86/select.ll
index 6d2b73e2108ba..ca5558561a65b 100644
--- a/llvm/test/CodeGen/X86/select.ll
+++ b/llvm/test/CodeGen/X86/select.ll
@@ -177,7 +177,7 @@ entry:
 define signext i8 @test4(ptr nocapture %P, double %F) nounwind readonly {
 ; CHECK-LABEL: test4:
 ; CHECK: ## %bb.0: ## %entry
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: ucomisd %xmm0, %xmm1
 ; CHECK-NEXT: seta %al
diff --git a/llvm/test/CodeGen/X86/select_const.ll b/llvm/test/CodeGen/X86/select_const.ll
index eba22036701b4..d604923b48a11 100644
--- a/llvm/test/CodeGen/X86/select_const.ll
+++ b/llvm/test/CodeGen/X86/select_const.ll
@@ -958,7 +958,7 @@ define float @select_undef_fp(float %x) {
 ;
 ; X64-LABEL: select_undef_fp:
 ; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = [4.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X64-NEXT: retq
 %f = select i1 undef, float 4.0, float %x
 ret float %f
diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll
index 780a769bc9e2b..0745881b2f3a3 100644
--- a/llvm/test/CodeGen/X86/setcc-combine.ll
+++ b/llvm/test/CodeGen/X86/setcc-combine.ll
@@ -488,7 +488,7 @@ define double @ogt_no_zero(double %x) {
 ; CHECK: # %bb.0:
 ; CHECK-NEXT: movapd {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0]
 ; CHECK-NEXT: xorpd %xmm0, %xmm1
-; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm2 = [1.0E+0,0.0E+0]
 ; CHECK-NEXT: cmpltsd %xmm0, %xmm2
 ; CHECK-NEXT: andpd %xmm2, %xmm0
 ; CHECK-NEXT: andnpd %xmm1, %xmm2
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
index 6d6a7b897c332..9f2071ff14b87 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tune.ll
@@ -14,7 +14,7 @@ define float @f32_no_daz(float %f) #0 {
 ; NHM-NEXT: rsqrtss %xmm0, %xmm1
 ; NHM-NEXT: movaps %xmm0, %xmm2
 ; NHM-NEXT: mulss %xmm1, %xmm2
-; NHM-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NHM-NEXT: movss {{.*#+}} xmm3 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
 ; NHM-NEXT: mulss %xmm2, %xmm3
 ; NHM-NEXT: mulss %xmm1, %xmm2
 ; NHM-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
@@ -221,7 +221,7 @@ define float @f32_daz(float %f) #1 {
 ; NHM-NEXT: rsqrtss %xmm0, %xmm1
 ; NHM-NEXT: movaps %xmm0, %xmm2
 ; NHM-NEXT: mulss %xmm1, %xmm2
-; NHM-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; NHM-NEXT: movss {{.*#+}} xmm3 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
 ; NHM-NEXT: mulss %xmm2, %xmm3
 ; NHM-NEXT: mulss %xmm1, %xmm2
 ; NHM-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll
index 99b45f83ac909..85f7733e671a7 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-tunecpu-attr.ll
@@ -7,7 +7,7 @@ define float @f32_tune_nhm(float %f) #0 {
 ; CHECK-NEXT: rsqrtss %xmm0, %xmm1
 ; CHECK-NEXT: movaps %xmm0, %xmm2
 ; CHECK-NEXT: mulss %xmm1, %xmm2
-; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm3 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: mulss %xmm2, %xmm3
 ; CHECK-NEXT: mulss %xmm1, %xmm2
 ; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
@@ -44,7 +44,7 @@ define float @f32_tune_x86_64(float %f) #3 {
 ; CHECK-NEXT: rsqrtss %xmm0, %xmm1
 ; CHECK-NEXT: movaps %xmm0, %xmm2
 ; CHECK-NEXT: mulss %xmm1, %xmm2
-; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm3 = [-5.0E-1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: mulss %xmm2, %xmm3
 ; CHECK-NEXT: mulss %xmm1, %xmm2
 ; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
index c0ad1a31c7d8d..af2f66d1e9bd0 100644
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -289,14 +289,14 @@ define float @f32_no_estimate(float %x) #0 {
 ; SSE-LABEL: f32_no_estimate:
 ; SSE: # %bb.0:
 ; SSE-NEXT: sqrtss %xmm0, %xmm1
-; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: divss %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
 ; AVX-LABEL: f32_no_estimate:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
 %sqrt = tail call float @llvm.sqrt.f32(float %x)
@@ -940,7 +940,7 @@ define double @sqrt_simplify_before_recip(double %x, ptr %p) nounwind {
 ; SSE-LABEL: sqrt_simplify_before_recip:
 ; SSE: # %bb.0:
 ; SSE-NEXT: sqrtsd %xmm0, %xmm0
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
 ; SSE-NEXT: divsd %xmm0, %xmm1
 ; SSE-NEXT: movsd %xmm1, (%rdi)
 ; SSE-NEXT: retq
@@ -948,7 +948,7 @@ define double @sqrt_simplify_before_recip(double %x, ptr %p) nounwind {
 ; AVX-LABEL: sqrt_simplify_before_recip:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+0,0.0E+0]
 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm1
 ; AVX-NEXT: vmovsd %xmm1, (%rdi)
 ; AVX-NEXT: retq
@@ -987,7 +987,7 @@ define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
 ; SSE-LABEL: sqrt_simplify_before_recip_order:
 ; SSE: # %bb.0:
 ; SSE-NEXT: sqrtsd %xmm0, %xmm0
-; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; SSE-NEXT: divsd %xmm0, %xmm1
 ; SSE-NEXT: movsd %xmm1, (%rdi)
 ; SSE-NEXT: retq
@@ -995,7 +995,7 @@ define double @sqrt_simplify_before_recip_order(double %x, ptr %p) nounwind {
 ; AVX-LABEL: sqrt_simplify_before_recip_order:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: vdivsd %xmm0, %xmm1, %xmm1
 ; AVX-NEXT: vmovsd %xmm1, (%rdi)
 ; AVX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sse-fcopysign.ll b/llvm/test/CodeGen/X86/sse-fcopysign.ll
index 59150d41e38e8..3eadcad145b65 100644
--- a/llvm/test/CodeGen/X86/sse-fcopysign.ll
+++ b/llvm/test/CodeGen/X86/sse-fcopysign.ll
@@ -188,7 +188,7 @@ define float @cst1() nounwind {
 ;
 ; X64-LABEL: cst1:
 ; X64: # %bb.0:
-; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-NEXT: movss {{.*#+}} xmm0 = [-1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; X64-NEXT: retq
 %tmp = tail call float @llvm.copysign.f32( float 1.0, float -2.0 )
 ret float %tmp
@@ -203,7 +203,7 @@ define double @cst2() nounwind {
 ;
 ; X64-LABEL: cst2:
 ; X64: # %bb.0:
-; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X64-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0]
 ; X64-NEXT: retq
 %tmp1 = fadd float -1.0, -1.0
 %tmp2 = fpext float %tmp1 to double
diff --git a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
index 7ada1c04626f3..db2f78bf5eedc 100644
--- a/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ b/llvm/test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1199,8 +1199,8 @@ define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
 ; X86-SSE-LABEL: test_mm_load_ps1:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x00]
 ; X86-SSE-NEXT: shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
 ; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
 ; X86-SSE-NEXT: retl # encoding: [0xc3]
@@ -1219,8 +1219,8 @@ define <4 x float> @test_mm_load_ps1(float* %a0) nounwind {
 ;
 ; X64-SSE-LABEL: test_mm_load_ps1:
 ; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
-; X64-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x07]
 ; X64-SSE-NEXT: shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
 ; X64-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
 ; X64-SSE-NEXT: retq # encoding: [0xc3]
@@ -1246,40 +1246,40 @@ define <4 x float> @test_mm_load_ss(float* %a0) nounwind {
 ; X86-SSE-LABEL: test_mm_load_ss:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x00]
 ; X86-SSE-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_load_ss:
 ; X86-AVX1: # %bb.0:
 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT: vmovss (%eax), %xmm0 # encoding: [0xc5,0xfa,0x10,0x00]
-; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x00]
 ; X86-AVX1-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_load_ss:
 ; X86-AVX512: # %bb.0:
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vmovss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00]
-; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00]
 ; X86-AVX512-NEXT: retl # encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: test_mm_load_ss:
 ; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
-; X64-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x07]
 ; X64-SSE-NEXT: retq # encoding: [0xc3]
 ;
 ; X64-AVX1-LABEL: test_mm_load_ss:
 ; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07]
-; X64-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x07]
 ; X64-AVX1-NEXT: retq # encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: test_mm_load_ss:
 ; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
-; X64-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07]
 ; X64-AVX512-NEXT: retq # encoding: [0xc3]
 %ld = load float, float* %a0, align 1
 %res0 = insertelement <4 x float> undef, float %ld, i32 0
@@ -1293,8 +1293,8 @@ define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
 ; X86-SSE-LABEL: test_mm_load1_ps:
 ; X86-SSE: # %bb.0:
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x00]
 ; X86-SSE-NEXT: shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
 ; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
 ; X86-SSE-NEXT: retl # encoding: [0xc3]
@@ -1313,8 +1313,8 @@ define <4 x float> @test_mm_load1_ps(float* %a0) nounwind {
 ;
 ; X64-SSE-LABEL: test_mm_load1_ps:
 ; X64-SSE: # %bb.0:
-; X64-SSE-NEXT: movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07]
-; X64-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x07]
 ; X64-SSE-NEXT: shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
 ; X64-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
 ; X64-SSE-NEXT: retq # encoding: [0xc3]
@@ -2002,16 +2002,16 @@ define void @test_MM_SET_FLUSH_ZERO_MODE(i32 %a0) nounwind {
 define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
 ; X86-SSE-LABEL: test_mm_set_ps:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
-; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
 ; X86-SSE-NEXT: unpcklps %xmm1, %xmm0 # encoding: [0x0f,0x14,0xc1]
 ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x08]
-; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x04]
-; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x08]
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x04]
 ; X86-SSE-NEXT: unpcklps %xmm2, %xmm1 # encoding: [0x0f,0x14,0xca]
 ; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1]
 ; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
@@ -2020,36 +2020,36 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n
 ;
 ; X86-AVX1-LABEL: test_mm_set_ps:
 ; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
-; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
-; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
 ; X86-AVX1-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
 ; X86-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
-; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
 ; X86-AVX1-NEXT: vinsertps $32, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
 ; X86-AVX1-NEXT: # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
 ; X86-AVX1-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
 ; X86-AVX1-NEXT: # xmm0 = xmm0[0,1,2],xmm1[0]
 ; X86-AVX1-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_set_ps:
 ; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
-; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
-; X86-AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10]
+; X86-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c]
 ; X86-AVX512-NEXT: vinsertps $16, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x10]
 ; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
-; X86-AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x08]
 ; X86-AVX512-NEXT: vinsertps $32, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x20]
 ; X86-AVX512-NEXT: # xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
 ; X86-AVX512-NEXT: vinsertps $48, %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0x30]
 ; X86-AVX512-NEXT: # xmm0 = xmm0[0,1,2],xmm1[0]
 ; X86-AVX512-NEXT: retl # encoding: [0xc3]
@@ -2094,24 +2094,24 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n
 define <4 x float> @test_mm_set_ps1(float %a0) nounwind {
 ; X86-SSE-LABEL: test_mm_set_ps1:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
 ; X86-SSE-NEXT: shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
 ; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
 ; X86-SSE-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_set_ps1:
 ; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
-; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
 ; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
 ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
 ; X86-AVX1-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_set_ps1:
 ; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
-; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
 ; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
 ; X86-AVX512-NEXT: retl # encoding: [0xc3]
 ;
@@ -2206,8 +2206,8 @@ define void @test_MM_SET_ROUNDING_MODE(i32 %a0) nounwind {
 define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 ; X86-SSE-LABEL: test_mm_set_ss:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
-; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
 ; X86-SSE-NEXT: xorps %xmm0, %xmm0 # encoding: [0x0f,0x57,0xc0]
 ; X86-SSE-NEXT: movss %xmm1, %xmm0 # encoding: [0xf3,0x0f,0x10,0xc1]
 ; X86-SSE-NEXT: # xmm0 = xmm1[0],xmm0[1,2,3]
@@ -2215,8 +2215,8 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 ;
 ; X86-AVX1-LABEL: test_mm_set_ss:
 ; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
-; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
 ; X86-AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
 ; X86-AVX1-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
 ; X86-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
@@ -2224,8 +2224,8 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 ;
 ; X86-AVX512-LABEL: test_mm_set_ss:
 ; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
-; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
 ; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 # encoding: [0xc5,0xf0,0x57,0xc9]
 ; X86-AVX512-NEXT: vblendps $1, %xmm0, %xmm1, %xmm0 # encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x01]
 ; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[1,2,3]
@@ -2255,24 +2255,24 @@ define <4 x float> @test_mm_set_ss(float %a0) nounwind {
 define <4 x float> @test_mm_set1_ps(float %a0) nounwind {
 ; X86-SSE-LABEL: test_mm_set1_ps:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
 ; X86-SSE-NEXT: shufps $0, %xmm0, %xmm0 # encoding: [0x0f,0xc6,0xc0,0x00]
 ; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0]
 ; X86-SSE-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_set1_ps:
 ; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
-; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
 ; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00]
 ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0]
 ; X86-AVX1-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_set1_ps:
 ; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
-; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04]
 ; X86-AVX512-NEXT: vbroadcastss %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0]
 ; X86-AVX512-NEXT: retl # encoding: [0xc3]
 ;
@@ -2335,16 +2335,16 @@ define void @test_mm_setcsr(i32 %a0) nounwind {
 define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind {
 ; X86-SSE-LABEL: test_mm_setr_ps:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
-; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
 ; X86-SSE-NEXT: unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
 ; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
-; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
 ; X86-SSE-NEXT: unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2]
 ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
@@ -2353,14 +2353,14 @@ define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3)
 ;
 ; X86-AVX1-LABEL: test_mm_setr_ps:
 ; X86-AVX1: # %bb.0:
-;
X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] -; X86-AVX1-NEXT: # xmm1 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08] -; X86-AVX1-NEXT: # xmm2 = mem[0],zero,zero,zero -; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm3 # encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04] -; X86-AVX1-NEXT: # xmm3 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10] +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08] +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04] ; X86-AVX1-NEXT: vinsertps $16, %xmm2, %xmm3, %xmm2 # encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10] ; X86-AVX1-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; X86-AVX1-NEXT: vinsertps $32, %xmm1, %xmm2, %xmm1 # encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20] @@ -2371,14 +2371,14 @@ define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) ; ; X86-AVX512-LABEL: test_mm_setr_ps: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] -; X86-AVX512-NEXT: # xmm1 = mem[0],zero,zero,zero -; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm2 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08] -; X86-AVX512-NEXT: # xmm2 = mem[0],zero,zero,zero -; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm3 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04] -; X86-AVX512-NEXT: # xmm3 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x10] +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x0c] +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x54,0x24,0x08] +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x5c,0x24,0x04] ; X86-AVX512-NEXT: vinsertps $16, %xmm2, %xmm3, %xmm2 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x61,0x21,0xd2,0x10] ; X86-AVX512-NEXT: # xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; X86-AVX512-NEXT: vinsertps $32, %xmm1, %xmm2, %xmm1 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x69,0x21,0xc9,0x20] @@ -2520,8 +2520,8 @@ define float @test_mm_sqrt_ss_scalar(float %a0) { ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %eax # encoding: [0x50] ; X86-SSE-NEXT: .cfi_def_cfa_offset 8 -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08] -; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08] ; X86-SSE-NEXT: sqrtss %xmm0, %xmm0 # encoding: 
[0xf3,0x0f,0x51,0xc0] ; X86-SSE-NEXT: movss %xmm0, (%esp) # encoding: [0xf3,0x0f,0x11,0x04,0x24] ; X86-SSE-NEXT: flds (%esp) # encoding: [0xd9,0x04,0x24] @@ -2533,8 +2533,8 @@ define float @test_mm_sqrt_ss_scalar(float %a0) { ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: pushl %eax # encoding: [0x50] ; X86-AVX1-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08] ; X86-AVX1-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x51,0xc0] ; X86-AVX1-NEXT: vmovss %xmm0, (%esp) # encoding: [0xc5,0xfa,0x11,0x04,0x24] ; X86-AVX1-NEXT: flds (%esp) # encoding: [0xd9,0x04,0x24] @@ -2546,8 +2546,8 @@ define float @test_mm_sqrt_ss_scalar(float %a0) { ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: pushl %eax # encoding: [0x50] ; X86-AVX512-NEXT: .cfi_def_cfa_offset 8 -; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x08] ; X86-AVX512-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x51,0xc0] ; X86-AVX512-NEXT: vmovss %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x11,0x04,0x24] ; X86-AVX512-NEXT: flds (%esp) # encoding: [0xd9,0x04,0x24] diff --git a/llvm/test/CodeGen/X86/sse-load-ret.ll b/llvm/test/CodeGen/X86/sse-load-ret.ll index f1c7c9014cadb..0e9b24360e2f8 100644 --- a/llvm/test/CodeGen/X86/sse-load-ret.ll +++ b/llvm/test/CodeGen/X86/sse-load-ret.ll @@ -33,10 +33,10 @@ define double @test3(i1 %B) { ; CHECK-NEXT: testb $1, 8(%ebp) ; CHECK-NEXT: jne .LBB2_1 ; CHECK-NEXT: # %bb.2: -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [5.2301123123000002E+2,0.0E+0] ; CHECK-NEXT: jmp .LBB2_3 ; CHECK-NEXT: .LBB2_1: -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.2341200000000001E+2,0.0E+0] ; CHECK-NEXT: .LBB2_3: ; CHECK-NEXT: movsd %xmm0, (%esp) ; CHECK-NEXT: fldl (%esp) diff --git a/llvm/test/CodeGen/X86/sse-minmax.ll b/llvm/test/CodeGen/X86/sse-minmax.ll index 93edca04b775b..1c14b7400a358 100644 --- a/llvm/test/CodeGen/X86/sse-minmax.ll +++ b/llvm/test/CodeGen/X86/sse-minmax.ll @@ -690,7 +690,7 @@ define double @olt_y(double %x) { define double @ogt_inverse_y(double %x) { ; STRICT-LABEL: ogt_inverse_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: minsd %xmm0, %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq @@ -702,7 +702,7 @@ define double @ogt_inverse_y(double %x) { ; ; FINITE-LABEL: ogt_inverse_y: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; FINITE-NEXT: minsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -714,7 +714,7 @@ define double @ogt_inverse_y(double %x) { define double @olt_inverse_y(double %x) { ; STRICT-LABEL: olt_inverse_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: maxsd %xmm0, %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq @@ -726,7 +726,7 @@ define double 
@olt_inverse_y(double %x) { ; ; FINITE-LABEL: olt_inverse_y: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; FINITE-NEXT: maxsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -738,7 +738,7 @@ define double @olt_inverse_y(double %x) { define double @oge_y(double %x) { ; STRICT-LABEL: oge_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: movapd %xmm1, %xmm2 ; STRICT-NEXT: cmplesd %xmm0, %xmm2 ; STRICT-NEXT: andpd %xmm2, %xmm0 @@ -758,7 +758,7 @@ define double @oge_y(double %x) { define double @ole_y(double %x) { ; STRICT-LABEL: ole_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: movapd %xmm0, %xmm2 ; STRICT-NEXT: cmplesd %xmm1, %xmm2 ; STRICT-NEXT: andpd %xmm2, %xmm0 @@ -778,7 +778,7 @@ define double @ole_y(double %x) { define double @oge_inverse_y(double %x) { ; STRICT-LABEL: oge_inverse_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: movapd %xmm2, %xmm1 ; STRICT-NEXT: cmplesd %xmm0, %xmm1 ; STRICT-NEXT: andpd %xmm1, %xmm2 @@ -794,7 +794,7 @@ define double @oge_inverse_y(double %x) { ; ; FINITE-LABEL: oge_inverse_y: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; FINITE-NEXT: minsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -806,7 +806,7 @@ define double @oge_inverse_y(double %x) { define double @ole_inverse_y(double %x) { ; STRICT-LABEL: ole_inverse_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: movapd %xmm0, %xmm1 ; STRICT-NEXT: cmplesd %xmm2, %xmm1 ; STRICT-NEXT: andpd %xmm1, %xmm2 @@ -822,7 +822,7 @@ define double @ole_inverse_y(double %x) { ; ; FINITE-LABEL: ole_inverse_y: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; FINITE-NEXT: maxsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -834,7 +834,7 @@ define double @ole_inverse_y(double %x) { define double @ugt_y(double %x) { ; STRICT-LABEL: ugt_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: movapd %xmm0, %xmm2 ; STRICT-NEXT: cmpnlesd %xmm1, %xmm2 ; STRICT-NEXT: andpd %xmm2, %xmm0 @@ -854,7 +854,7 @@ define double @ugt_y(double %x) { define double @ult_y(double %x) { ; STRICT-LABEL: ult_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: movapd %xmm1, %xmm2 ; STRICT-NEXT: cmpnlesd %xmm0, %xmm2 ; STRICT-NEXT: andpd %xmm2, %xmm0 @@ -874,7 +874,7 @@ define double @ult_y(double %x) { define double @ugt_inverse_y(double %x) { ; STRICT-LABEL: ugt_inverse_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: movapd %xmm0, %xmm1 ; STRICT-NEXT: cmpnlesd %xmm2, %xmm1 ; STRICT-NEXT: andpd %xmm1, %xmm2 @@ -890,7 +890,7 @@ define double @ugt_inverse_y(double %x) { ; ; FINITE-LABEL: ugt_inverse_y: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = 
[-0.0E+0,0.0E+0] ; FINITE-NEXT: minsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -902,7 +902,7 @@ define double @ugt_inverse_y(double %x) { define double @ult_inverse_y(double %x) { ; STRICT-LABEL: ult_inverse_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm2 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: movapd %xmm2, %xmm1 ; STRICT-NEXT: cmpnlesd %xmm0, %xmm1 ; STRICT-NEXT: andpd %xmm1, %xmm2 @@ -918,7 +918,7 @@ define double @ult_inverse_y(double %x) { ; ; FINITE-LABEL: ult_inverse_y: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; FINITE-NEXT: maxsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -930,7 +930,7 @@ define double @ult_inverse_y(double %x) { define double @uge_y(double %x) { ; STRICT-LABEL: uge_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: maxsd %xmm0, %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq @@ -947,7 +947,7 @@ define double @uge_y(double %x) { define double @ule_y(double %x) { ; STRICT-LABEL: ule_y: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; STRICT-NEXT: minsd %xmm0, %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq @@ -974,7 +974,7 @@ define double @uge_inverse_y(double %x) { ; ; FINITE-LABEL: uge_inverse_y: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; FINITE-NEXT: minsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -996,7 +996,7 @@ define double @ule_inverse_y(double %x) { ; ; FINITE-LABEL: ule_inverse_y: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [-0.0E+0,0.0E+0] ; FINITE-NEXT: maxsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -1010,7 +1010,7 @@ define double @ule_inverse_y(double %x) { define double @clampTo3k_a(double %x) { ; STRICT-LABEL: clampTo3k_a: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [3.0E+3,0.0E+0] ; STRICT-NEXT: minsd %xmm0, %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq @@ -1022,7 +1022,7 @@ define double @clampTo3k_a(double %x) { ; ; FINITE-LABEL: clampTo3k_a: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [3.0E+3,0.0E+0] ; FINITE-NEXT: minsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -1044,7 +1044,7 @@ define double @clampTo3k_b(double %x) { ; ; FINITE-LABEL: clampTo3k_b: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [3.0E+3,0.0E+0] ; FINITE-NEXT: minsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -1056,7 +1056,7 @@ define double @clampTo3k_b(double %x) { define double @clampTo3k_c(double %x) { ; STRICT-LABEL: clampTo3k_c: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [3.0E+3,0.0E+0] ; STRICT-NEXT: maxsd %xmm0, %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq @@ -1068,7 +1068,7 @@ define double @clampTo3k_c(double %x) { ; ; FINITE-LABEL: clampTo3k_c: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = 
[3.0E+3,0.0E+0] ; FINITE-NEXT: maxsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -1090,7 +1090,7 @@ define double @clampTo3k_d(double %x) { ; ; FINITE-LABEL: clampTo3k_d: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [3.0E+3,0.0E+0] ; FINITE-NEXT: maxsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -1102,7 +1102,7 @@ define double @clampTo3k_d(double %x) { define double @clampTo3k_e(double %x) { ; STRICT-LABEL: clampTo3k_e: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [3.0E+3,0.0E+0] ; STRICT-NEXT: maxsd %xmm0, %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq @@ -1114,7 +1114,7 @@ define double @clampTo3k_e(double %x) { ; ; FINITE-LABEL: clampTo3k_e: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [3.0E+3,0.0E+0] ; FINITE-NEXT: maxsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -1136,7 +1136,7 @@ define double @clampTo3k_f(double %x) { ; ; FINITE-LABEL: clampTo3k_f: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [3.0E+3,0.0E+0] ; FINITE-NEXT: maxsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -1148,7 +1148,7 @@ define double @clampTo3k_f(double %x) { define double @clampTo3k_g(double %x) { ; STRICT-LABEL: clampTo3k_g: ; STRICT: # %bb.0: -; STRICT-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; STRICT-NEXT: movsd {{.*#+}} xmm1 = [3.0E+3,0.0E+0] ; STRICT-NEXT: minsd %xmm0, %xmm1 ; STRICT-NEXT: movapd %xmm1, %xmm0 ; STRICT-NEXT: retq @@ -1160,7 +1160,7 @@ define double @clampTo3k_g(double %x) { ; ; FINITE-LABEL: clampTo3k_g: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [3.0E+3,0.0E+0] ; FINITE-NEXT: minsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -1182,7 +1182,7 @@ define double @clampTo3k_h(double %x) { ; ; FINITE-LABEL: clampTo3k_h: ; FINITE: # %bb.0: -; FINITE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; FINITE-NEXT: movsd {{.*#+}} xmm1 = [3.0E+3,0.0E+0] ; FINITE-NEXT: minsd %xmm0, %xmm1 ; FINITE-NEXT: movapd %xmm1, %xmm0 ; FINITE-NEXT: retq @@ -1344,7 +1344,7 @@ define <3 x float> @test_minps_illegal_v3f32(<3 x float> %x, <3 x float> %y) { define float @ossfuzz13838(float %x) { ; ALL-LABEL: ossfuzz13838: ; ALL: # %bb.0: # %bb -; ALL-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; ALL-NEXT: movss {{.*#+}} xmm0 = [2.55E+2,0.0E+0,0.0E+0,0.0E+0] ; ALL-NEXT: retq bb: %cmp2 = fcmp fast olt float %x, 2.550000e+02 diff --git a/llvm/test/CodeGen/X86/sse1.ll b/llvm/test/CodeGen/X86/sse1.ll index f048002bf56f2..8ac86d11d89e6 100644 --- a/llvm/test/CodeGen/X86/sse1.ll +++ b/llvm/test/CodeGen/X86/sse1.ll @@ -52,17 +52,17 @@ define <4 x float> @vselect(ptr%p, <4 x i32> %q) { ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: jne .LBB1_5 ; X86-NEXT: .LBB1_4: -; X86-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: movss {{.*#+}} xmm2 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: jne .LBB1_8 ; X86-NEXT: .LBB1_7: -; X86-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: movss {{.*#+}} xmm3 = [4.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X86-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: je .LBB1_10 ; X86-NEXT: jmp .LBB1_11 ; X86-NEXT: .LBB1_1: -; X86-NEXT: 
movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: movss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: je .LBB1_4 ; X86-NEXT: .LBB1_5: # %entry @@ -75,7 +75,7 @@ define <4 x float> @vselect(ptr%p, <4 x i32> %q) { ; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X86-NEXT: jne .LBB1_11 ; X86-NEXT: .LBB1_10: -; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X86-NEXT: .LBB1_11: # %entry ; X86-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] @@ -91,17 +91,17 @@ define <4 x float> @vselect(ptr%p, <4 x i32> %q) { ; X64-NEXT: testl %ecx, %ecx ; X64-NEXT: jne .LBB1_5 ; X64-NEXT: .LBB1_4: -; X64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm2 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-NEXT: testl %r8d, %r8d ; X64-NEXT: jne .LBB1_8 ; X64-NEXT: .LBB1_7: -; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm3 = [4.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] ; X64-NEXT: testl %esi, %esi ; X64-NEXT: je .LBB1_10 ; X64-NEXT: jmp .LBB1_11 ; X64-NEXT: .LBB1_1: -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm1 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-NEXT: testl %ecx, %ecx ; X64-NEXT: je .LBB1_4 ; X64-NEXT: .LBB1_5: # %entry @@ -114,7 +114,7 @@ define <4 x float> @vselect(ptr%p, <4 x i32> %q) { ; X64-NEXT: testl %esi, %esi ; X64-NEXT: jne .LBB1_11 ; X64-NEXT: .LBB1_10: -; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; X64-NEXT: .LBB1_11: # %entry ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X64-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0] diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index eba390733794e..adf4fc28208e7 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -1679,20 +1679,20 @@ define <2 x double> @test_mm_cvtsi32_sd(<2 x double> %a0, i32 %a1) nounwind { define <2 x i64> @test_mm_cvtsi32_si128(i32 %a0) nounwind { ; X86-SSE-LABEL: test_mm_cvtsi32_si128: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04] -; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_cvtsi32_si128: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_cvtsi32_si128: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_cvtsi32_si128: @@ -2000,58 +2000,58 @@ 
define <2 x double> @test_mm_load_sd(ptr %a0) nounwind { ; X86-SSE-LABEL: test_mm_load_sd: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movsd (%eax), %xmm0 # encoding: [0xf2,0x0f,0x10,0x00] -; X86-SSE-NEXT: # xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x00] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_load_sd: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovsd (%eax), %xmm0 # encoding: [0xc5,0xfb,0x10,0x00] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_load_sd: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_load_sd: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movsd (%rdi), %xmm0 # encoding: [0xf2,0x0f,0x10,0x07] -; X64-SSE-NEXT: # xmm0 = mem[0],zero +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x07] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX1-LABEL: test_mm_load_sd: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovsd (%rdi), %xmm0 # encoding: [0xc5,0xfb,0x10,0x07] -; X64-AVX1-NEXT: # xmm0 = mem[0],zero +; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_load_sd: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] -; X64-AVX512-NEXT: # xmm0 = mem[0],zero +; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] ; ; X32-SSE-LABEL: test_mm_load_sd: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movsd (%edi), %xmm0 # encoding: [0x67,0xf2,0x0f,0x10,0x07] -; X32-SSE-NEXT: # xmm0 = mem[0],zero +; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-SSE-NEXT: # encoding: [0x67,0xf2,0x0f,0x10,0x07] ; X32-SSE-NEXT: retq # encoding: [0xc3] ; ; X32-AVX1-LABEL: test_mm_load_sd: ; X32-AVX1: # %bb.0: -; X32-AVX1-NEXT: vmovsd (%edi), %xmm0 # encoding: [0x67,0xc5,0xfb,0x10,0x07] -; X32-AVX1-NEXT: # xmm0 = mem[0],zero +; X32-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X32-AVX1-NEXT: # encoding: [0x67,0xc5,0xfb,0x10,0x07] ; X32-AVX1-NEXT: retq # encoding: [0xc3] ; ; X32-AVX512-LABEL: test_mm_load_sd: ; X32-AVX512: # %bb.0: -; X32-AVX512-NEXT: vmovsd (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x10,0x07] -; X32-AVX512-NEXT: # xmm0 = mem[0],zero +; X32-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X32-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x10,0x07] ; X32-AVX512-NEXT: retq # encoding: [0xc3] %ld = load double, ptr %a0, align 1 %res0 = insertelement <2 x double> undef, double %ld, i32 0 @@ -2115,8 +2115,8 @@ define <2 x double> @test_mm_load1_pd(ptr %a0) nounwind { ; X86-SSE-LABEL: test_mm_load1_pd: ; X86-SSE: # 
%bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movsd (%eax), %xmm0 # encoding: [0xf2,0x0f,0x10,0x00] -; X86-SSE-NEXT: # xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x00] ; X86-SSE-NEXT: movlhps %xmm0, %xmm0 # encoding: [0x0f,0x16,0xc0] ; X86-SSE-NEXT: # xmm0 = xmm0[0,0] ; X86-SSE-NEXT: retl # encoding: [0xc3] @@ -2137,8 +2137,8 @@ define <2 x double> @test_mm_load1_pd(ptr %a0) nounwind { ; ; X64-SSE-LABEL: test_mm_load1_pd: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movsd (%rdi), %xmm0 # encoding: [0xf2,0x0f,0x10,0x07] -; X64-SSE-NEXT: # xmm0 = mem[0],zero +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x07] ; X64-SSE-NEXT: movlhps %xmm0, %xmm0 # encoding: [0x0f,0x16,0xc0] ; X64-SSE-NEXT: # xmm0 = xmm0[0,0] ; X64-SSE-NEXT: retq # encoding: [0xc3] @@ -2157,8 +2157,8 @@ define <2 x double> @test_mm_load1_pd(ptr %a0) nounwind { ; ; X32-SSE-LABEL: test_mm_load1_pd: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movsd (%edi), %xmm0 # encoding: [0x67,0xf2,0x0f,0x10,0x07] -; X32-SSE-NEXT: # xmm0 = mem[0],zero +; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-SSE-NEXT: # encoding: [0x67,0xf2,0x0f,0x10,0x07] ; X32-SSE-NEXT: movlhps %xmm0, %xmm0 # encoding: [0x0f,0x16,0xc0] ; X32-SSE-NEXT: # xmm0 = xmm0[0,0] ; X32-SSE-NEXT: retq # encoding: [0xc3] @@ -2246,58 +2246,58 @@ define <2 x i64> @test_mm_loadl_epi64(<2 x i64> %a0, ptr %a1) nounwind { ; X86-SSE-LABEL: test_mm_loadl_epi64: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movsd (%eax), %xmm0 # encoding: [0xf2,0x0f,0x10,0x00] -; X86-SSE-NEXT: # xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x00] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_loadl_epi64: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovsd (%eax), %xmm0 # encoding: [0xc5,0xfb,0x10,0x00] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_loadl_epi64: ; X86-AVX512: # %bb.0: ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_loadl_epi64: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movsd (%rdi), %xmm0 # encoding: [0xf2,0x0f,0x10,0x07] -; X64-SSE-NEXT: # xmm0 = mem[0],zero +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x07] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX1-LABEL: test_mm_loadl_epi64: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovsd (%rdi), %xmm0 # encoding: [0xc5,0xfb,0x10,0x07] -; X64-AVX1-NEXT: # xmm0 = mem[0],zero +; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_loadl_epi64: ; X64-AVX512: # %bb.0: -; X64-AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xfb,0x10,0x07] -; X64-AVX512-NEXT: # xmm0 = mem[0],zero +; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] ; ; X32-SSE-LABEL: test_mm_loadl_epi64: ; X32-SSE: # %bb.0: -; X32-SSE-NEXT: movsd (%edi), %xmm0 # encoding: [0x67,0xf2,0x0f,0x10,0x07] -; X32-SSE-NEXT: # xmm0 = mem[0],zero +; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-SSE-NEXT: # encoding: [0x67,0xf2,0x0f,0x10,0x07] ; X32-SSE-NEXT: retq # encoding: [0xc3] ; ; X32-AVX1-LABEL: test_mm_loadl_epi64: ; X32-AVX1: # %bb.0: -; X32-AVX1-NEXT: vmovsd (%edi), %xmm0 # encoding: [0x67,0xc5,0xfb,0x10,0x07] -; X32-AVX1-NEXT: # xmm0 = mem[0],zero +; X32-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X32-AVX1-NEXT: # encoding: [0x67,0xc5,0xfb,0x10,0x07] ; X32-AVX1-NEXT: retq # encoding: [0xc3] ; ; X32-AVX512-LABEL: test_mm_loadl_epi64: ; X32-AVX512: # %bb.0: -; X32-AVX512-NEXT: vmovsd (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x10,0x07] -; X32-AVX512-NEXT: # xmm0 = mem[0],zero +; X32-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X32-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x10,0x07] ; X32-AVX512-NEXT: retq # encoding: [0xc3] %ld = load i64, ptr %a1, align 1 %res0 = insertelement <2 x i64> undef, i64 %ld, i32 0 @@ -2540,58 +2540,58 @@ define <2 x i64> @test_mm_loadu_si64(ptr nocapture readonly %A) { ; X86-SSE-LABEL: test_mm_loadu_si64: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movsd (%eax), %xmm0 # encoding: [0xf2,0x0f,0x10,0x00] -; X86-SSE-NEXT: # xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x00] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_loadu_si64: ; X86-AVX1: # %bb.0: # %entry ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovsd (%eax), %xmm0 # encoding: [0xc5,0xfb,0x10,0x00] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_loadu_si64: ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovsd (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_loadu_si64: ; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movsd (%rdi), %xmm0 # encoding: [0xf2,0x0f,0x10,0x07] -; X64-SSE-NEXT: # xmm0 = mem[0],zero +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x07] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX1-LABEL: test_mm_loadu_si64: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: vmovsd (%rdi), %xmm0 # encoding: [0xc5,0xfb,0x10,0x07] -; X64-AVX1-NEXT: # xmm0 = mem[0],zero +; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_loadu_si64: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: vmovsd (%rdi), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] -; 
X64-AVX512-NEXT: # xmm0 = mem[0],zero +; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] ; ; X32-SSE-LABEL: test_mm_loadu_si64: ; X32-SSE: # %bb.0: # %entry -; X32-SSE-NEXT: movsd (%edi), %xmm0 # encoding: [0x67,0xf2,0x0f,0x10,0x07] -; X32-SSE-NEXT: # xmm0 = mem[0],zero +; X32-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X32-SSE-NEXT: # encoding: [0x67,0xf2,0x0f,0x10,0x07] ; X32-SSE-NEXT: retq # encoding: [0xc3] ; ; X32-AVX1-LABEL: test_mm_loadu_si64: ; X32-AVX1: # %bb.0: # %entry -; X32-AVX1-NEXT: vmovsd (%edi), %xmm0 # encoding: [0x67,0xc5,0xfb,0x10,0x07] -; X32-AVX1-NEXT: # xmm0 = mem[0],zero +; X32-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X32-AVX1-NEXT: # encoding: [0x67,0xc5,0xfb,0x10,0x07] ; X32-AVX1-NEXT: retq # encoding: [0xc3] ; ; X32-AVX512-LABEL: test_mm_loadu_si64: ; X32-AVX512: # %bb.0: # %entry -; X32-AVX512-NEXT: vmovsd (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x10,0x07] -; X32-AVX512-NEXT: # xmm0 = mem[0],zero +; X32-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X32-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfb,0x10,0x07] ; X32-AVX512-NEXT: retq # encoding: [0xc3] entry: %0 = load i64, ptr %A, align 1 @@ -2603,58 +2603,58 @@ define <2 x i64> @test_mm_loadu_si32(ptr nocapture readonly %A) { ; X86-SSE-LABEL: test_mm_loadu_si32: ; X86-SSE: # %bb.0: # %entry ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-SSE-NEXT: movss (%eax), %xmm0 # encoding: [0xf3,0x0f,0x10,0x00] -; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x00] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_loadu_si32: ; X86-AVX1: # %bb.0: # %entry ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX1-NEXT: vmovss (%eax), %xmm0 # encoding: [0xc5,0xfa,0x10,0x00] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x00] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_loadu_si32: ; X86-AVX512: # %bb.0: # %entry ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-AVX512-NEXT: vmovss (%eax), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x00] ; X86-AVX512-NEXT: retl # encoding: [0xc3] ; ; X64-SSE-LABEL: test_mm_loadu_si32: ; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movss (%rdi), %xmm0 # encoding: [0xf3,0x0f,0x10,0x07] -; X64-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x07] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX1-LABEL: test_mm_loadu_si32: ; X64-AVX1: # %bb.0: # %entry -; X64-AVX1-NEXT: vmovss (%rdi), %xmm0 # encoding: [0xc5,0xfa,0x10,0x07] -; X64-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x07] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_loadu_si32: ; X64-AVX512: # %bb.0: # %entry -; X64-AVX512-NEXT: vmovss (%rdi), %xmm0 # EVEX TO 
VEX Compression encoding: [0xc5,0xfa,0x10,0x07] -; X64-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x07] ; X64-AVX512-NEXT: retq # encoding: [0xc3] ; ; X32-SSE-LABEL: test_mm_loadu_si32: ; X32-SSE: # %bb.0: # %entry -; X32-SSE-NEXT: movss (%edi), %xmm0 # encoding: [0x67,0xf3,0x0f,0x10,0x07] -; X32-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-SSE-NEXT: # encoding: [0x67,0xf3,0x0f,0x10,0x07] ; X32-SSE-NEXT: retq # encoding: [0xc3] ; ; X32-AVX1-LABEL: test_mm_loadu_si32: ; X32-AVX1: # %bb.0: # %entry -; X32-AVX1-NEXT: vmovss (%edi), %xmm0 # encoding: [0x67,0xc5,0xfa,0x10,0x07] -; X32-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; X32-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-AVX1-NEXT: # encoding: [0x67,0xc5,0xfa,0x10,0x07] ; X32-AVX1-NEXT: retq # encoding: [0xc3] ; ; X32-AVX512-LABEL: test_mm_loadu_si32: ; X32-AVX512: # %bb.0: # %entry -; X32-AVX512-NEXT: vmovss (%edi), %xmm0 # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfa,0x10,0x07] -; X32-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; X32-AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X32-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0x67,0xc5,0xfa,0x10,0x07] ; X32-AVX512-NEXT: retq # encoding: [0xc3] entry: %0 = load i32, ptr %A, align 1 @@ -3937,16 +3937,16 @@ define <2 x i64> @test_mm_set_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, define <2 x i64> @test_mm_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind { ; X86-SSE-LABEL: test_mm_set_epi32: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04] -; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x08] -; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04] +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x08] ; X86-SSE-NEXT: unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8] ; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x0c] -; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10] -; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x0c] +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10] ; X86-SSE-NEXT: unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1] @@ -3955,8 +3955,8 @@ define <2 x i64> @test_mm_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind ; ; X86-AVX1-LABEL: test_mm_set_epi32: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x10] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x10] ; X86-AVX1-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: 
[0xc4,0xe3,0x79,0x22,0x44,0x24,0x0c,0x01] ; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x02] ; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x03] @@ -3964,8 +3964,8 @@ define <2 x i64> @test_mm_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind ; ; X86-AVX512-LABEL: test_mm_set_epi32: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x10] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x10] ; X86-AVX512-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x0c,0x01] ; X86-AVX512-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x02] ; X86-AVX512-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x03] @@ -4043,16 +4043,16 @@ define <2 x i64> @test_mm_set_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind { ; X86-SSE-LABEL: test_mm_set_epi64x: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04] -; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08] -; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04] +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x08] ; X86-SSE-NEXT: unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8] ; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x0c] -; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x10] -; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x0c] +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x10] ; X86-SSE-NEXT: unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1] @@ -4061,8 +4061,8 @@ define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind { ; ; X86-AVX1-LABEL: test_mm_set_epi64x: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x0c] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x0c] ; X86-AVX1-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x10,0x01] ; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x02] ; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x03] @@ -4070,8 +4070,8 @@ define <2 x 
i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind { ; ; X86-AVX512-LABEL: test_mm_set_epi64x: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x0c] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x0c] ; X86-AVX512-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x10,0x01] ; X86-AVX512-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x04,0x02] ; X86-AVX512-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x03] @@ -4132,30 +4132,30 @@ define <2 x i64> @test_mm_set_epi64x(i64 %a0, i64 %a1) nounwind { define <2 x double> @test_mm_set_pd(double %a0, double %a1) nounwind { ; X86-SSE-LABEL: test_mm_set_pd: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movsd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf2,0x0f,0x10,0x44,0x24,0x0c] -; X86-SSE-NEXT: # xmm0 = mem[0],zero -; X86-SSE-NEXT: movsd {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf2,0x0f,0x10,0x4c,0x24,0x04] -; X86-SSE-NEXT: # xmm1 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x44,0x24,0x0c] +; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x4c,0x24,0x04] ; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_set_pd: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfb,0x10,0x44,0x24,0x0c] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero -; X86-AVX1-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x04] -; X86-AVX1-NEXT: # xmm1 = mem[0],zero +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x44,0x24,0x0c] +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x04] ; X86-AVX1-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x16,0xc1] ; X86-AVX1-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_set_pd: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x0c] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero -; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x04] -; X86-AVX512-NEXT: # xmm1 = mem[0],zero +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x0c] +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x04] ; X86-AVX512-NEXT: vmovlhps %xmm1, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf8,0x16,0xc1] ; X86-AVX512-NEXT: # xmm0 = xmm0[0],xmm1[0] ; X86-AVX512-NEXT: retl # encoding: [0xc3] @@ -4205,24 +4205,24 @@ define <2 x double> @test_mm_set_pd(double %a0, double %a1) nounwind { define <2 x double> @test_mm_set_pd1(double %a0) nounwind { ; X86-SSE-LABEL: test_mm_set_pd1: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movsd {{[0-9]+}}(%esp), %xmm0 # encoding: 
[0xf2,0x0f,0x10,0x44,0x24,0x04] -; X86-SSE-NEXT: # xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x44,0x24,0x04] ; X86-SSE-NEXT: movlhps %xmm0, %xmm0 # encoding: [0x0f,0x16,0xc0] ; X86-SSE-NEXT: # xmm0 = xmm0[0,0] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_set_pd1: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfb,0x10,0x44,0x24,0x04] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x44,0x24,0x04] ; X86-AVX1-NEXT: vmovddup %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x12,0xc0] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_set_pd1: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x04] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x04] ; X86-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0] ; X86-AVX512-NEXT: # xmm0 = xmm0[0,0] ; X86-AVX512-NEXT: retl # encoding: [0xc3] @@ -4270,24 +4270,24 @@ define <2 x double> @test_mm_set_pd1(double %a0) nounwind { define <2 x double> @test_mm_set_sd(double %a0) nounwind { ; X86-SSE-LABEL: test_mm_set_sd: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movq {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x7e,0x44,0x24,0x04] -; X86-SSE-NEXT: # xmm0 = mem[0],zero +; X86-SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x7e,0x44,0x24,0x04] ; X86-SSE-NEXT: movq %xmm0, %xmm0 # encoding: [0xf3,0x0f,0x7e,0xc0] ; X86-SSE-NEXT: # xmm0 = xmm0[0],zero ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_set_sd: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovq {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x7e,0x44,0x24,0x04] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero +; X86-AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x7e,0x44,0x24,0x04] ; X86-AVX1-NEXT: vmovq %xmm0, %xmm0 # encoding: [0xc5,0xfa,0x7e,0xc0] ; X86-AVX1-NEXT: # xmm0 = xmm0[0],zero ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_set_sd: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovq {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x44,0x24,0x04] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero +; X86-AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0x44,0x24,0x04] ; X86-AVX512-NEXT: vmovq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfa,0x7e,0xc0] ; X86-AVX512-NEXT: # xmm0 = xmm0[0],zero ; X86-AVX512-NEXT: retl # encoding: [0xc3] @@ -4513,16 +4513,16 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind { define <2 x i64> @test_mm_set1_epi32(i32 %a0) nounwind { ; X86-SSE-LABEL: test_mm_set1_epi32: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %xmm0 # encoding: [0x66,0x0f,0x6e,0x44,0x24,0x04] -; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0x66,0x0f,0x6e,0x44,0x24,0x04] ; X86-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00] ; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_set1_epi32: ; 
X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xfa,0x10,0x44,0x24,0x04] ; X86-AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0xc6,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] ; X86-AVX1-NEXT: retl # encoding: [0xc3] @@ -4583,10 +4583,10 @@ define <2 x i64> @test_mm_set1_epi32(i32 %a0) nounwind { define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind { ; X86-SSE-LABEL: test_mm_set1_epi64x: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %xmm0 # encoding: [0x66,0x0f,0x6e,0x44,0x24,0x04] -; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movd {{[0-9]+}}(%esp), %xmm1 # encoding: [0x66,0x0f,0x6e,0x4c,0x24,0x08] -; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0x66,0x0f,0x6e,0x44,0x24,0x04] +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-SSE-NEXT: # encoding: [0x66,0x0f,0x6e,0x4c,0x24,0x08] ; X86-SSE-NEXT: punpckldq %xmm1, %xmm0 # encoding: [0x66,0x0f,0x62,0xc1] ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X86-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44] @@ -4595,8 +4595,8 @@ define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind { ; ; X86-AVX1-LABEL: test_mm_set1_epi64x: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX1-NEXT: # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04] ; X86-AVX1-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x01] ; X86-AVX1-NEXT: vpshufd $68, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x44] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,1,0,1] @@ -4604,8 +4604,8 @@ define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind { ; ; X86-AVX512-LABEL: test_mm_set1_epi64x: ; X86-AVX512: # %bb.0: -; X86-AVX512-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04] -; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04] ; X86-AVX512-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x01] ; X86-AVX512-NEXT: vpbroadcastq %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0] ; X86-AVX512-NEXT: retl # encoding: [0xc3] @@ -4655,24 +4655,24 @@ define <2 x i64> @test_mm_set1_epi64x(i64 %a0) nounwind { define <2 x double> @test_mm_set1_pd(double %a0) nounwind { ; X86-SSE-LABEL: test_mm_set1_pd: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movsd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf2,0x0f,0x10,0x44,0x24,0x04] -; X86-SSE-NEXT: # xmm0 = mem[0],zero +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x44,0x24,0x04] ; X86-SSE-NEXT: movlhps %xmm0, %xmm0 # encoding: [0x0f,0x16,0xc0] ; X86-SSE-NEXT: # xmm0 = xmm0[0,0] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_set1_pd: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfb,0x10,0x44,0x24,0x04] -; X86-AVX1-NEXT: # xmm0 = mem[0],zero +; 
X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x44,0x24,0x04]
 ; X86-AVX1-NEXT: vmovddup %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x12,0xc0]
 ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0]
 ; X86-AVX1-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_set1_pd:
 ; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x04]
-; X86-AVX512-NEXT: # xmm0 = mem[0],zero
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x04]
 ; X86-AVX512-NEXT: vmovddup %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x12,0xc0]
 ; X86-AVX512-NEXT: # xmm0 = xmm0[0,0]
 ; X86-AVX512-NEXT: retl # encoding: [0xc3]
@@ -5328,16 +5328,16 @@ define <2 x i64> @test_mm_setr_epi16(i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4
 define <2 x i64> @test_mm_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwind {
 ; X86-SSE-LABEL: test_mm_setr_epi32:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
-; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
 ; X86-SSE-NEXT: unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
 ; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
-; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
 ; X86-SSE-NEXT: unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2]
 ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
@@ -5346,8 +5346,8 @@ define <2 x i64> @test_mm_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwin
 ;
 ; X86-AVX1-LABEL: test_mm_setr_epi32:
 ; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04]
-; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04]
 ; X86-AVX1-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x01]
 ; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x0c,0x02]
 ; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x10,0x03]
@@ -5355,8 +5355,8 @@ define <2 x i64> @test_mm_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwin
 ;
 ; X86-AVX512-LABEL: test_mm_setr_epi32:
 ; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04]
-; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04]
 ; X86-AVX512-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x01]
 ; X86-AVX512-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x0c,0x02]
 ; X86-AVX512-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x10,0x03]
@@ -5434,16 +5434,16 @@ define <2 x i64> @test_mm_setr_epi32(i32 %a0, i32 %a1, i32 %a2, i32 %a3) nounwin
 define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
 ; X86-SSE-LABEL: test_mm_setr_epi64x:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
-; X86-SSE-NEXT: # xmm1 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x0c]
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x10]
 ; X86-SSE-NEXT: unpcklps %xmm0, %xmm1 # encoding: [0x0f,0x14,0xc8]
 ; X86-SSE-NEXT: # xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm2 # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
-; X86-SSE-NEXT: # xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x44,0x24,0x04]
+; X86-SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: # encoding: [0xf3,0x0f,0x10,0x54,0x24,0x08]
 ; X86-SSE-NEXT: unpcklps %xmm2, %xmm0 # encoding: [0x0f,0x14,0xc2]
 ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
 ; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
@@ -5452,8 +5452,8 @@ define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
 ;
 ; X86-AVX1-LABEL: test_mm_setr_epi64x:
 ; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04]
-; X86-AVX1-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04]
 ; X86-AVX1-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x01]
 ; X86-AVX1-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x0c,0x02]
 ; X86-AVX1-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 # encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x10,0x03]
@@ -5461,8 +5461,8 @@ define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
 ;
 ; X86-AVX512-LABEL: test_mm_setr_epi64x:
 ; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: vmovd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04]
-; X86-AVX512-NEXT: # xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6e,0x44,0x24,0x04]
 ; X86-AVX512-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x08,0x01]
 ; X86-AVX512-NEXT: vpinsrd $2, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x0c,0x02]
 ; X86-AVX512-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x22,0x44,0x24,0x10,0x03]
@@ -5523,30 +5523,30 @@ define <2 x i64> @test_mm_setr_epi64x(i64 %a0, i64 %a1) nounwind {
 define <2 x double> @test_mm_setr_pd(double %a0, double %a1) nounwind {
 ; X86-SSE-LABEL: test_mm_setr_pd:
 ; X86-SSE: # %bb.0:
-; X86-SSE-NEXT: movsd {{[0-9]+}}(%esp), %xmm1 # encoding: [0xf2,0x0f,0x10,0x4c,0x24,0x0c]
-; X86-SSE-NEXT: # xmm1 = mem[0],zero
-; X86-SSE-NEXT: movsd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xf2,0x0f,0x10,0x44,0x24,0x04]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; X86-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x4c,0x24,0x0c]
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x44,0x24,0x04]
 ; X86-SSE-NEXT: movlhps %xmm1, %xmm0 # encoding: [0x0f,0x16,0xc1]
 ; X86-SSE-NEXT: # xmm0 = xmm0[0],xmm1[0]
 ; X86-SSE-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: test_mm_setr_pd:
 ; X86-AVX1: # %bb.0:
-; X86-AVX1-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # encoding: [0xc5,0xfb,0x10,0x44,0x24,0x0c]
-; X86-AVX1-NEXT: # xmm0 = mem[0],zero
-; X86-AVX1-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 # encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x04]
-; X86-AVX1-NEXT: # xmm1 = mem[0],zero
+; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x44,0x24,0x0c]
+; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x04]
 ; X86-AVX1-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 # encoding: [0xc5,0xf0,0x16,0xc0]
 ; X86-AVX1-NEXT: # xmm0 = xmm1[0],xmm0[0]
 ; X86-AVX1-NEXT: retl # encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: test_mm_setr_pd:
 ; X86-AVX512: # %bb.0:
-; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x0c]
-; X86-AVX512-NEXT: # xmm0 = mem[0],zero
-; X86-AVX512-NEXT: vmovsd {{[0-9]+}}(%esp), %xmm1 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x04]
-; X86-AVX512-NEXT: # xmm1 = mem[0],zero
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x44,0x24,0x0c]
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x4c,0x24,0x04]
 ; X86-AVX512-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
 ; X86-AVX512-NEXT: # xmm0 = xmm1[0],xmm0[0]
 ; X86-AVX512-NEXT: retl # encoding: [0xc3]
@@ -5925,8 +5925,8 @@ define double @test_mm_sqrt_sd_scalar(double %a0) nounwind {
 ; X86-SSE-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5]
 ; X86-SSE-NEXT: andl $-8, %esp # encoding: [0x83,0xe4,0xf8]
 ; X86-SSE-NEXT: subl $8, %esp # encoding: [0x83,0xec,0x08]
-; X86-SSE-NEXT: movsd 8(%ebp), %xmm0 # encoding: [0xf2,0x0f,0x10,0x45,0x08]
-; X86-SSE-NEXT: # xmm0 = mem[0],zero
+; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; X86-SSE-NEXT: # encoding: [0xf2,0x0f,0x10,0x45,0x08]
 ; X86-SSE-NEXT: sqrtsd %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x51,0xc0]
 ; X86-SSE-NEXT: movsd %xmm0, (%esp) # encoding: [0xf2,0x0f,0x11,0x04,0x24]
 ; X86-SSE-NEXT: fldl (%esp) # encoding: [0xdd,0x04,0x24]
@@ -5940,8 +5940,8 @@ define double @test_mm_sqrt_sd_scalar(double %a0) nounwind {
 ; X86-AVX1-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5]
 ; X86-AVX1-NEXT: andl $-8, %esp # encoding: [0x83,0xe4,0xf8]
 ; X86-AVX1-NEXT: subl $8, %esp # encoding: [0x83,0xec,0x08]
-; X86-AVX1-NEXT: vmovsd 8(%ebp), %xmm0 # encoding: [0xc5,0xfb,0x10,0x45,0x08]
-; X86-AVX1-NEXT: # xmm0 = mem[0],zero
+; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX1-NEXT: # encoding: [0xc5,0xfb,0x10,0x45,0x08]
 ; X86-AVX1-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x51,0xc0]
 ; X86-AVX1-NEXT: vmovsd %xmm0, (%esp) # encoding: [0xc5,0xfb,0x11,0x04,0x24]
 ; X86-AVX1-NEXT: fldl (%esp) # encoding: [0xdd,0x04,0x24]
@@ -5955,8 +5955,8 @@ define double @test_mm_sqrt_sd_scalar(double %a0) nounwind {
 ; X86-AVX512-NEXT: movl %esp, %ebp # encoding: [0x89,0xe5]
 ; X86-AVX512-NEXT: andl $-8, %esp # encoding: [0x83,0xe4,0xf8]
 ; X86-AVX512-NEXT: subl $8, %esp # encoding: [0x83,0xec,0x08]
-; X86-AVX512-NEXT: vmovsd 8(%ebp), %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x45,0x08]
-; X86-AVX512-NEXT: # xmm0 = mem[0],zero
+; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; X86-AVX512-NEXT: # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x45,0x08]
 ; X86-AVX512-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x51,0xc0]
 ; X86-AVX512-NEXT: vmovsd %xmm0, (%esp) # EVEX TO VEX Compression encoding: [0xc5,0xfb,0x11,0x04,0x24]
 ; X86-AVX512-NEXT: fldl (%esp) # encoding: [0xdd,0x04,0x24]
diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
index 21dcddf2a05cf..f6b0df153c260 100644
--- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -711,8 +711,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X86-SSE-LABEL: test_x86_sse2_cvtss2sd_load:
 ; X86-SSE: ## %bb.0:
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: movss (%eax), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x08]
-; X86-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x08]
 ; X86-SSE-NEXT: cvtss2sd %xmm1, %xmm1 ## encoding: [0xf3,0x0f,0x5a,0xc9]
 ; X86-SSE-NEXT: movsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x10,0xc1]
 ; X86-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1]
@@ -721,8 +721,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X86-AVX1-LABEL: test_x86_sse2_cvtss2sd_load:
 ; X86-AVX1: ## %bb.0:
 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT: vmovss (%eax), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x08]
-; X86-AVX1-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x08]
 ; X86-AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0xc9]
 ; X86-AVX1-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
 ; X86-AVX1-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3]
@@ -731,8 +731,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ; X86-AVX512-LABEL: test_x86_sse2_cvtss2sd_load:
 ; X86-AVX512: ## %bb.0:
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vmovss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
-; X86-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
 ; X86-AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
 ; X86-AVX512-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
 ; X86-AVX512-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3]
@@ -740,8 +740,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ;
 ; X64-SSE-LABEL: test_x86_sse2_cvtss2sd_load:
 ; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: movss (%rdi), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x0f]
-; X64-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x0f]
 ; X64-SSE-NEXT: cvtss2sd %xmm1, %xmm1 ## encoding: [0xf3,0x0f,0x5a,0xc9]
 ; X64-SSE-NEXT: movsd %xmm1, %xmm0 ## encoding: [0xf2,0x0f,0x10,0xc1]
 ; X64-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1]
@@ -749,8 +749,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ;
 ; X64-AVX1-LABEL: test_x86_sse2_cvtss2sd_load:
 ; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vmovss (%rdi), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x0f]
-; X64-AVX1-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X64-AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x0f]
 ; X64-AVX1-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf2,0x5a,0xc9]
 ; X64-AVX1-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
 ; X64-AVX1-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3]
@@ -758,8 +758,8 @@ define <2 x double> @test_x86_sse2_cvtss2sd_load(<2 x double> %a0, ptr %p1) {
 ;
 ; X64-AVX512-LABEL: test_x86_sse2_cvtss2sd_load:
 ; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vmovss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
-; X64-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X64-AVX512-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
 ; X64-AVX512-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf2,0x5a,0xc9]
 ; X64-AVX512-NEXT: vblendps $3, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x03]
 ; X64-AVX512-NEXT: ## xmm0 = xmm1[0,1],xmm0[2,3]
diff --git a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
index f94dd2e97dc1a..a88510591ceb8 100644
--- a/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
+++ b/llvm/test/CodeGen/X86/sse3-avx-addsub-2.ll
@@ -439,7 +439,7 @@ define <4 x float> @test15(<4 x float> %A, <4 x float> %B) {
 define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
 ; SSE-LABEL: test16:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm3 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: movaps %xmm0, %xmm2
 ; SSE-NEXT: subss %xmm3, %xmm2
 ; SSE-NEXT: movaps %xmm0, %xmm4
@@ -460,7 +460,7 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) {
 ;
 ; AVX-LABEL: test16:
 ; AVX: # %bb.0:
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vsubss %xmm2, %xmm0, %xmm3
 ; AVX-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
 ; AVX-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll
index f1ae16a7c96b1..703834729205a 100644
--- a/llvm/test/CodeGen/X86/sse41.ll
+++ b/llvm/test/CodeGen/X86/sse41.ll
@@ -343,24 +343,24 @@ declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) noun
 define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind {
 ; X86-SSE-LABEL: blendps_not_insertps_1:
 ; X86-SSE: ## %bb.0:
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
-; X86-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
 ; X86-SSE-NEXT: blendps $1, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0c,0xc1,0x01]
 ; X86-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-SSE-NEXT: retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: blendps_not_insertps_1:
 ; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX1-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
 ; X86-AVX1-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
 ; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX1-NEXT: retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: blendps_not_insertps_1:
 ; X86-AVX512: ## %bb.0:
-; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
 ; X86-AVX512-NEXT: vblendps $1, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x01]
 ; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX512-NEXT: retl ## encoding: [0xc3]
@@ -386,24 +386,24 @@ define <4 x float> @blendps_not_insertps_1(<4 x float> %t1, float %t2) nounwind
 define <4 x float> @insertps_or_blendps(<4 x float> %t1, float %t2) minsize nounwind {
 ; X86-SSE-LABEL: insertps_or_blendps:
 ; X86-SSE: ## %bb.0:
-; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
-; X86-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x4c,0x24,0x04]
 ; X86-SSE-NEXT: movss %xmm1, %xmm0 ## encoding: [0xf3,0x0f,0x10,0xc1]
 ; X86-SSE-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-SSE-NEXT: retl ## encoding: [0xc3]
 ;
 ; X86-AVX1-LABEL: insertps_or_blendps:
 ; X86-AVX1: ## %bb.0:
-; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX1-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
 ; X86-AVX1-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xfa,0x10,0xc1]
 ; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX1-NEXT: retl ## encoding: [0xc3]
 ;
 ; X86-AVX512-LABEL: insertps_or_blendps:
 ; X86-AVX512: ## %bb.0:
-; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
-; X86-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovss {{[0-9]+}}(%esp), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x4c,0x24,0x04]
 ; X86-AVX512-NEXT: vmovss %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0xc1]
 ; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1,2,3]
 ; X86-AVX512-NEXT: retl ## encoding: [0xc3]
@@ -1639,8 +1639,8 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ; X86-SSE: ## %bb.0:
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x08]
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ## encoding: [0x8b,0x4c,0x24,0x04]
-; X86-SSE-NEXT: movss (%ecx,%eax,4), %xmm4 ## encoding: [0xf3,0x0f,0x10,0x24,0x81]
-; X86-SSE-NEXT: ## xmm4 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss (%ecx,%eax,4), %xmm4 ## xmm4 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x24,0x81]
 ; X86-SSE-NEXT: insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]
 ; X86-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
 ; X86-SSE-NEXT: insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30]
@@ -1692,8 +1692,8 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl
 ;
 ; X64-SSE-LABEL: insertps_from_broadcast_multiple_use:
 ; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: movss (%rdi,%rsi,4), %xmm4 ## encoding: [0xf3,0x0f,0x10,0x24,0xb7]
-; X64-SSE-NEXT: ## xmm4 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movss (%rdi,%rsi,4), %xmm4 ## xmm4 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x24,0xb7]
 ; X64-SSE-NEXT: insertps $48, %xmm4, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc4,0x30]
 ; X64-SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm4[0]
 ; X64-SSE-NEXT: insertps $48, %xmm4, %xmm1 ## encoding: [0x66,0x0f,0x3a,0x21,0xcc,0x30]
@@ -1758,8 +1758,8 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, ptr %b) {
 ; X86-SSE-LABEL: insertps_with_undefs:
 ; X86-SSE: ## %bb.0:
 ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-SSE-NEXT: movss (%eax), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x08]
-; X86-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: movss (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x08]
 ; X86-SSE-NEXT: movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
 ; X86-SSE-NEXT: ## xmm1 = xmm1[0],xmm0[0]
 ; X86-SSE-NEXT: movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
@@ -1768,8 +1768,8 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, ptr %b) {
 ; X86-AVX1-LABEL: insertps_with_undefs:
 ; X86-AVX1: ## %bb.0:
 ; X86-AVX1-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX1-NEXT: vmovss (%eax), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x08]
-; X86-AVX1-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: vmovss (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x08]
 ; X86-AVX1-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0]
 ; X86-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[0]
 ; X86-AVX1-NEXT: retl ## encoding: [0xc3]
@@ -1777,16 +1777,16 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, ptr %b) {
 ; X86-AVX512-LABEL: insertps_with_undefs:
 ; X86-AVX512: ## %bb.0:
 ; X86-AVX512-NEXT: movl {{[0-9]+}}(%esp), %eax ## encoding: [0x8b,0x44,0x24,0x04]
-; X86-AVX512-NEXT: vmovss (%eax), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
-; X86-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: vmovss (%eax), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x08]
 ; X86-AVX512-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
 ; X86-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[0]
 ; X86-AVX512-NEXT: retl ## encoding: [0xc3]
 ;
 ; X64-SSE-LABEL: insertps_with_undefs:
 ; X64-SSE: ## %bb.0:
-; X64-SSE-NEXT: movss (%rdi), %xmm1 ## encoding: [0xf3,0x0f,0x10,0x0f]
-; X64-SSE-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: movss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x0f]
 ; X64-SSE-NEXT: movlhps %xmm0, %xmm1 ## encoding: [0x0f,0x16,0xc8]
 ; X64-SSE-NEXT: ## xmm1 = xmm1[0],xmm0[0]
 ; X64-SSE-NEXT: movaps %xmm1, %xmm0 ## encoding: [0x0f,0x28,0xc1]
@@ -1794,16 +1794,16 @@ define <4 x float> @insertps_with_undefs(<4 x float> %a, ptr %b) {
 ;
 ; X64-AVX1-LABEL: insertps_with_undefs:
 ; X64-AVX1: ## %bb.0:
-; X64-AVX1-NEXT: vmovss (%rdi), %xmm1 ## encoding: [0xc5,0xfa,0x10,0x0f]
-; X64-AVX1-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X64-AVX1-NEXT: vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x0f]
 ; X64-AVX1-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## encoding: [0xc5,0xf0,0x16,0xc0]
 ; X64-AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[0]
 ; X64-AVX1-NEXT: retq ## encoding: [0xc3]
 ;
 ; X64-AVX512-LABEL: insertps_with_undefs:
 ; X64-AVX512: ## %bb.0:
-; X64-AVX512-NEXT: vmovss (%rdi), %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
-; X64-AVX512-NEXT: ## xmm1 = mem[0],zero,zero,zero
+; X64-AVX512-NEXT: vmovss (%rdi), %xmm1 ## xmm1 = mem[0],zero,zero,zero
+; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x0f]
 ; X64-AVX512-NEXT: vmovlhps %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x16,0xc0]
 ; X64-AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[0]
 ; X64-AVX512-NEXT: retq ## encoding: [0xc3]
diff --git a/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll b/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll
index e7302ea1d5351..5a55eb247b688 100644
--- a/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll
+++ b/llvm/test/CodeGen/X86/stack-folding-int-avx2.ll
@@ -17,7 +17,7 @@ define <4 x double> @stack_fold_broadcastsd_ymm(<2 x double> %a0) {
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: #NO_APP
 ; CHECK-NEXT: vbroadcastsd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = [4.9406564584124654E-324,0.0E+0]
 ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: retq
 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -35,7 +35,7 @@ define <4 x float> @stack_fold_broadcastss(<4 x float> %a0) {
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: #NO_APP
 ; CHECK-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
-; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
 ; CHECK-NEXT: retq
 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
@@ -53,7 +53,7 @@ define <8 x float> @stack_fold_broadcastss_ymm(<4 x float> %a0) {
 ; CHECK-NEXT: nop
 ; CHECK-NEXT: #NO_APP
 ; CHECK-NEXT: vbroadcastss {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 16-byte Folded Reload
-; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: retq
 %1 = tail call <2 x i64> asm sideeffect "nop", "=x,~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15},~{flags}"()
diff --git a/llvm/test/CodeGen/X86/swifterror.ll b/llvm/test/CodeGen/X86/swifterror.ll
index 8fff6405d0d89..75252309790b1 100644
--- a/llvm/test/CodeGen/X86/swifterror.ll
+++ b/llvm/test/CodeGen/X86/swifterror.ll
@@ -17,7 +17,7 @@ define float @foo(ptr swifterror %error_ptr_ref) {
 ; CHECK-APPLE-NEXT: movl $16, %edi
 ; CHECK-APPLE-NEXT: callq _malloc
 ; CHECK-APPLE-NEXT: movb $1, 8(%rax)
-; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-APPLE-NEXT: movq %rax, %r12
 ; CHECK-APPLE-NEXT: popq %rax
 ; CHECK-APPLE-NEXT: retq
@@ -30,7 +30,7 @@ define float @foo(ptr swifterror %error_ptr_ref) {
 ; CHECK-O0-NEXT: callq _malloc
 ; CHECK-O0-NEXT: movq %rax, %r12
 ; CHECK-O0-NEXT: movb $1, 8(%rax)
-; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-O0-NEXT: popq %rax
 ; CHECK-O0-NEXT: retq
 ;
@@ -83,7 +83,7 @@ define float @caller(ptr %error_ref) {
 ; CHECK-APPLE-NEXT: movb %al, (%rbx)
 ; CHECK-APPLE-NEXT: LBB1_2: ## %handler
 ; CHECK-APPLE-NEXT: callq _free
-; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-APPLE-NEXT: addq $8, %rsp
 ; CHECK-APPLE-NEXT: popq %rbx
 ; CHECK-APPLE-NEXT: popq %r12
@@ -112,7 +112,7 @@ define float @caller(ptr %error_ref) {
 ; CHECK-O0-NEXT: LBB1_2: ## %handler
 ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
 ; CHECK-O0-NEXT: callq _free
-; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-O0-NEXT: addq $32, %rsp
 ; CHECK-O0-NEXT: popq %r12
 ; CHECK-O0-NEXT: retq
@@ -187,7 +187,7 @@ define float @caller2(ptr %error_ref) {
 ; CHECK-APPLE-NEXT: LBB2_4: ## %handler
 ; CHECK-APPLE-NEXT: movq %r12, %rdi
 ; CHECK-APPLE-NEXT: callq _free
-; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-APPLE-NEXT: addq $8, %rsp
 ; CHECK-APPLE-NEXT: popq %rbx
 ; CHECK-APPLE-NEXT: popq %r12
@@ -215,7 +215,7 @@ define float @caller2(ptr %error_ref) {
 ; CHECK-O0-NEXT: ## in Loop: Header=BB2_1 Depth=1
 ; CHECK-O0-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 ## 4-byte Reload
 ; CHECK-O0-NEXT: ## xmm0 = mem[0],zero,zero,zero
-; CHECK-O0-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-O0-NEXT: ucomiss %xmm1, %xmm0
 ; CHECK-O0-NEXT: jbe LBB2_1
 ; CHECK-O0-NEXT: ## %bb.3: ## %bb_end
@@ -226,7 +226,7 @@ define float @caller2(ptr %error_ref) {
 ; CHECK-O0-NEXT: LBB2_4: ## %handler
 ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
 ; CHECK-O0-NEXT: callq _free
-; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-O0-NEXT: addq $32, %rsp
 ; CHECK-O0-NEXT: popq %r12
 ; CHECK-O0-NEXT: retq
@@ -310,7 +310,7 @@ define float @foo_if(ptr swifterror %error_ptr_ref, i32 %cc) {
 ; CHECK-APPLE-NEXT: movl $16, %edi
 ; CHECK-APPLE-NEXT: callq _malloc
 ; CHECK-APPLE-NEXT: movb $1, 8(%rax)
-; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-APPLE-NEXT: movq %rax, %r12
 ; CHECK-APPLE-NEXT: popq %rax
 ; CHECK-APPLE-NEXT: retq
@@ -330,7 +330,7 @@ define float @foo_if(ptr swifterror %error_ptr_ref, i32 %cc) {
 ; CHECK-O0-NEXT: callq _malloc
 ; CHECK-O0-NEXT: movq %rax, %r12
 ; CHECK-O0-NEXT: movb $1, 8(%rax)
-; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-O0-NEXT: popq %rax
 ; CHECK-O0-NEXT: retq
 ; CHECK-O0-NEXT: LBB3_2: ## %normal
@@ -444,7 +444,7 @@ define float @foo_loop(ptr swifterror %error_ptr_ref, i32 %cc, float %cc2) {
 ; CHECK-O0-NEXT: ## xmm0 = mem[0],zero,zero,zero
 ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax ## 8-byte Reload
 ; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
-; CHECK-O0-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-O0-NEXT: ucomiss %xmm1, %xmm0
 ; CHECK-O0-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) ## 8-byte Spill
 ; CHECK-O0-NEXT: jbe LBB4_1
@@ -632,7 +632,7 @@ define float @caller3(ptr %error_ref) {
 ; CHECK-APPLE-NEXT: movb %al, (%rbx)
 ; CHECK-APPLE-NEXT: LBB6_2: ## %handler
 ; CHECK-APPLE-NEXT: callq _free
-; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-APPLE-NEXT: addq $40, %rsp
 ; CHECK-APPLE-NEXT: popq %rbx
 ; CHECK-APPLE-NEXT: popq %r12
@@ -663,7 +663,7 @@ define float @caller3(ptr %error_ref) {
 ; CHECK-O0-NEXT: LBB6_2: ## %handler
 ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
 ; CHECK-O0-NEXT: callq _free
-; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-O0-NEXT: addq $48, %rsp
 ; CHECK-O0-NEXT: popq %r12
 ; CHECK-O0-NEXT: retq
@@ -757,7 +757,7 @@ define float @caller_with_multiple_swifterror_values(ptr %error_ref, ptr %error_
 ; CHECK-APPLE-NEXT: movb %al, (%rbx)
 ; CHECK-APPLE-NEXT: LBB7_4: ## %handler2
 ; CHECK-APPLE-NEXT: callq _free
-; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-APPLE-NEXT: leaq -24(%rbp), %rsp
 ; CHECK-APPLE-NEXT: popq %rbx
 ; CHECK-APPLE-NEXT: popq %r12
@@ -810,7 +810,7 @@ define float @caller_with_multiple_swifterror_values(ptr %error_ref, ptr %error_
 ; CHECK-O0-NEXT: LBB7_4: ## %handler2
 ; CHECK-O0-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi ## 8-byte Reload
 ; CHECK-O0-NEXT: callq _free
-; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-O0-NEXT: leaq -8(%rbp), %rsp
 ; CHECK-O0-NEXT: popq %r12
 ; CHECK-O0-NEXT: popq %rbp
@@ -1033,7 +1033,7 @@ define swiftcc float @foo_swiftcc(ptr swifterror %error_ptr_ref) {
 ; CHECK-APPLE-NEXT: movl $16, %edi
 ; CHECK-APPLE-NEXT: callq _malloc
 ; CHECK-APPLE-NEXT: movb $1, 8(%rax)
-; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-APPLE-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-APPLE-NEXT: movq %rax, %r12
 ; CHECK-APPLE-NEXT: popq %rax
 ; CHECK-APPLE-NEXT: retq
@@ -1046,7 +1046,7 @@ define swiftcc float @foo_swiftcc(ptr swifterror %error_ptr_ref) {
 ; CHECK-O0-NEXT: callq _malloc
 ; CHECK-O0-NEXT: movq %rax, %r12
 ; CHECK-O0-NEXT: movb $1, 8(%rax)
-; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-O0-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-O0-NEXT: popq %rax
 ; CHECK-O0-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
index 349d94d930651..0981c45e9d803 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-128.ll
@@ -222,7 +222,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; SSE-32-NEXT: .cfi_def_cfa_register %ebp
 ; SSE-32-NEXT: andl $-8, %esp
 ; SSE-32-NEXT: subl $24, %esp
-; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-32-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; SSE-32-NEXT: comisd %xmm1, %xmm0
 ; SSE-32-NEXT: movapd %xmm1, %xmm2
 ; SSE-32-NEXT: jae .LBB1_2
@@ -280,7 +280,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ;
 ; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i64:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
+; SSE-64-NEXT: movsd {{.*#+}} xmm3 = [9.2233720368547758E+18,0.0E+0]
 ; SSE-64-NEXT: comisd %xmm3, %xmm0
 ; SSE-64-NEXT: xorpd %xmm2, %xmm2
 ; SSE-64-NEXT: xorpd %xmm1, %xmm1
@@ -323,7 +323,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX-32-NEXT: andl $-8, %esp
 ; AVX-32-NEXT: subl $16, %esp
 ; AVX-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX-32-NEXT: vcomisd %xmm1, %xmm2
 ; AVX-32-NEXT: vmovapd %xmm1, %xmm3
 ; AVX-32-NEXT: jae .LBB1_2
@@ -364,7 +364,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ;
 ; AVX-64-LABEL: strict_vector_fptoui_v2f64_to_v2i64:
 ; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX-64-NEXT: vcomisd %xmm1, %xmm0
 ; AVX-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; AVX-64-NEXT: vxorpd %xmm3, %xmm3, %xmm3
@@ -405,7 +405,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512F-32-NEXT: andl $-8, %esp
 ; AVX512F-32-NEXT: subl $16, %esp
 ; AVX512F-32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0]
 ; AVX512F-32-NEXT: xorl %eax, %eax
 ; AVX512F-32-NEXT: vcomisd %xmm2, %xmm1
 ; AVX512F-32-NEXT: setae %al
@@ -458,7 +458,7 @@ define <2 x i64> @strict_vector_fptoui_v2f64_to_v2i64(<2 x double> %a) #0 {
 ; AVX512VL-32-NEXT: andl $-8, %esp
 ; AVX512VL-32-NEXT: subl $16, %esp
 ; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero
+; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0]
 ; AVX512VL-32-NEXT: xorl %eax, %eax
 ; AVX512VL-32-NEXT: vcomisd %xmm2, %xmm1
 ; AVX512VL-32-NEXT: setae %al
@@ -893,7 +893,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; SSE-32-NEXT: .cfi_def_cfa_register %ebp
 ; SSE-32-NEXT: andl $-8, %esp
 ; SSE-32-NEXT: subl $24, %esp
-; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-32-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-32-NEXT: comiss %xmm1, %xmm0
 ; SSE-32-NEXT: movaps %xmm1, %xmm2
 ; SSE-32-NEXT: jae .LBB4_2
@@ -951,7 +951,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ;
 ; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-64-NEXT: movss {{.*#+}} xmm3 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-64-NEXT: comiss %xmm3, %xmm0
 ; SSE-64-NEXT: xorps %xmm2, %xmm2
 ; SSE-64-NEXT: xorps %xmm1, %xmm1
@@ -994,7 +994,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX-32-NEXT: andl $-8, %esp
 ; AVX-32-NEXT: subl $16, %esp
 ; AVX-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-32-NEXT: vcomiss %xmm1, %xmm2
 ; AVX-32-NEXT: vmovaps %xmm1, %xmm3
 ; AVX-32-NEXT: jae .LBB4_2
@@ -1035,7 +1035,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ;
 ; AVX-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64:
 ; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-64-NEXT: vcomiss %xmm1, %xmm0
 ; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; AVX-64-NEXT: vxorps %xmm3, %xmm3, %xmm3
@@ -1076,7 +1076,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512F-32-NEXT: andl $-8, %esp
 ; AVX512F-32-NEXT: subl $16, %esp
 ; AVX512F-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512F-32-NEXT: xorl %eax, %eax
 ; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1
 ; AVX512F-32-NEXT: setae %al
@@ -1129,7 +1129,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64(<2 x float> %a) #0 {
 ; AVX512VL-32-NEXT: andl $-8, %esp
 ; AVX512VL-32-NEXT: subl $16, %esp
 ; AVX512VL-32-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512VL-32-NEXT: xorl %eax, %eax
 ; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1
 ; AVX512VL-32-NEXT: setae %al
@@ -1201,7 +1201,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(ptr %x) strictfp {
 ; SSE-32-NEXT: subl $24, %esp
 ; SSE-32-NEXT: movl 8(%ebp), %eax
 ; SSE-32-NEXT: movaps (%eax), %xmm0
-; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-32-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-32-NEXT: comiss %xmm1, %xmm0
 ; SSE-32-NEXT: movaps %xmm1, %xmm2
 ; SSE-32-NEXT: jae .LBB5_2
@@ -1260,7 +1260,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(ptr %x) strictfp {
 ; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i64_load128:
 ; SSE-64: # %bb.0:
 ; SSE-64-NEXT: movaps (%rdi), %xmm1
-; SSE-64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-64-NEXT: movss {{.*#+}} xmm3 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-64-NEXT: comiss %xmm3, %xmm1
 ; SSE-64-NEXT: xorps %xmm2, %xmm2
 ; SSE-64-NEXT: xorps %xmm0, %xmm0
@@ -1304,7 +1304,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(ptr %x) strictfp {
 ; AVX-32-NEXT: movl 8(%ebp), %eax
 ; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-32-NEXT: vcomiss %xmm1, %xmm2
 ; AVX-32-NEXT: vmovaps %xmm1, %xmm3
 ; AVX-32-NEXT: jae .LBB5_2
@@ -1347,7 +1347,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(ptr %x) strictfp {
 ; AVX-64: # %bb.0:
 ; AVX-64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX-64-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
-; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-64-NEXT: vcomiss %xmm1, %xmm3
 ; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; AVX-64-NEXT: vxorps %xmm4, %xmm4, %xmm4
@@ -1389,7 +1389,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(ptr %x) strictfp {
 ; AVX512F-32-NEXT: movl 8(%ebp), %eax
 ; AVX512F-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512F-32-NEXT: vmovss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512F-32-NEXT: xorl %eax, %eax
 ; AVX512F-32-NEXT: vcomiss %xmm2, %xmm1
 ; AVX512F-32-NEXT: setae %al
@@ -1443,7 +1443,7 @@ define <2 x i64> @strict_vector_fptoui_v2f32_to_v2i64_load128(ptr %x) strictfp {
 ; AVX512VL-32-NEXT: movl 8(%ebp), %eax
 ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
 ; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512VL-32-NEXT: xorl %eax, %eax
 ; AVX512VL-32-NEXT: vcomiss %xmm2, %xmm1
 ; AVX512VL-32-NEXT: setae %al
@@ -1561,7 +1561,7 @@ define <2 x i32> @strict_vector_fptosi_v2f64_to_v2i32(<2 x double> %a) #0 {
 define <2 x i32> @strict_vector_fptoui_v2f64_to_v2i32(<2 x double> %a) #0 {
 ; SSE-32-LABEL: strict_vector_fptoui_v2f64_to_v2i32:
 ; SSE-32: # %bb.0:
-; SSE-32-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
+; SSE-32-NEXT: movsd {{.*#+}} xmm3 = [2.147483648E+9,0.0E+0]
 ; SSE-32-NEXT: comisd %xmm3, %xmm0
 ; SSE-32-NEXT: xorpd %xmm2, %xmm2
 ; SSE-32-NEXT: xorpd %xmm1, %xmm1
@@ -1717,7 +1717,7 @@ define <2 x i32> @strict_vector_fptosi_v2f32_to_v2i32(<2 x float> %a) #0 {
 define <2 x i32> @strict_vector_fptoui_v2f32_to_v2i32(<2 x float> %a) #0 {
 ; SSE-32-LABEL: strict_vector_fptoui_v2f32_to_v2i32:
 ; SSE-32: # %bb.0:
-; SSE-32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-32-NEXT: movss {{.*#+}} xmm3 = [2.14748365E+9,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-32-NEXT: comiss %xmm3, %xmm0
 ; SSE-32-NEXT: xorps %xmm2, %xmm2
 ; SSE-32-NEXT: xorps %xmm1, %xmm1
@@ -2384,7 +2384,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; SSE-32-NEXT: .cfi_def_cfa_register %ebp
 ; SSE-32-NEXT: andl $-8, %esp
 ; SSE-32-NEXT: subl $24, %esp
-; SSE-32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; SSE-32-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; SSE-32-NEXT: comisd %xmm1, %xmm0
 ; SSE-32-NEXT: movapd %xmm1, %xmm2
 ; SSE-32-NEXT: jae .LBB19_2
@@ -2442,7 +2442,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ;
 ; SSE-64-LABEL: strict_vector_fptoui_v2f64_to_v2i1:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
+; SSE-64-NEXT: movsd {{.*#+}} xmm3 = [9.2233720368547758E+18,0.0E+0]
 ; SSE-64-NEXT: comisd %xmm3, %xmm0
 ; SSE-64-NEXT: xorpd %xmm2, %xmm2
 ; SSE-64-NEXT: xorpd %xmm1, %xmm1
@@ -2485,7 +2485,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ; AVX-32-NEXT: andl $-8, %esp
 ; AVX-32-NEXT: subl $16, %esp
 ; AVX-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX-32-NEXT: vcomisd %xmm1, %xmm2
 ; AVX-32-NEXT: vmovapd %xmm1, %xmm3
 ; AVX-32-NEXT: jae .LBB19_2
@@ -2526,7 +2526,7 @@ define <2 x i1> @strict_vector_fptoui_v2f64_to_v2i1(<2 x double> %a) #0 {
 ;
 ; AVX-64-LABEL: strict_vector_fptoui_v2f64_to_v2i1:
 ; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX-64-NEXT: vcomisd %xmm1, %xmm0
 ; AVX-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; AVX-64-NEXT: vxorpd %xmm3, %xmm3, %xmm3
@@ -2758,7 +2758,7 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; SSE-32-NEXT: .cfi_def_cfa_register %ebp
 ; SSE-32-NEXT: andl $-8, %esp
 ; SSE-32-NEXT: subl $24, %esp
-; SSE-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-32-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-32-NEXT: comiss %xmm1, %xmm0
 ; SSE-32-NEXT: movaps %xmm1, %xmm2
 ; SSE-32-NEXT: jae .LBB21_2
@@ -2816,7 +2816,7 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ;
 ; SSE-64-LABEL: strict_vector_fptoui_v2f32_to_v2i1:
 ; SSE-64: # %bb.0:
-; SSE-64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; SSE-64-NEXT: movss {{.*#+}} xmm3 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-64-NEXT: comiss %xmm3, %xmm0
 ; SSE-64-NEXT: xorps %xmm2, %xmm2
 ; SSE-64-NEXT: xorps %xmm1, %xmm1
@@ -2859,7 +2859,7 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ; AVX-32-NEXT: andl $-8, %esp
 ; AVX-32-NEXT: subl $16, %esp
 ; AVX-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-32-NEXT: vcomiss %xmm1, %xmm2
 ; AVX-32-NEXT: vmovaps %xmm1, %xmm3
 ; AVX-32-NEXT: jae .LBB21_2
@@ -2900,7 +2900,7 @@ define <2 x i1> @strict_vector_fptoui_v2f32_to_v2i1(<2 x float> %a) #0 {
 ;
 ; AVX-64-LABEL: strict_vector_fptoui_v2f32_to_v2i1:
 ; AVX-64: # %bb.0:
-; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-64-NEXT: vcomiss %xmm1, %xmm0
 ; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; AVX-64-NEXT: vxorps %xmm3, %xmm3, %xmm3
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
index b28211bb4388f..cba3b09445148 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-256.ll
@@ -227,7 +227,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX-32-NEXT: andl $-8, %esp
 ; AVX-32-NEXT: subl $32, %esp
 ; AVX-32-NEXT: vshufpd {{.*#+}} xmm2 = xmm0[1,0]
-; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-32-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX-32-NEXT: vcomisd %xmm1, %xmm2
 ; AVX-32-NEXT: vmovapd %xmm1, %xmm3
 ; AVX-32-NEXT: jae .LBB1_2
@@ -306,7 +306,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX-64-LABEL: strict_vector_fptoui_v4f64_to_v4i64:
 ; AVX-64: # %bb.0:
 ; AVX-64-NEXT: vextractf128 $1, %ymm0, %xmm3
-; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-64-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX-64-NEXT: vcomisd %xmm1, %xmm3
 ; AVX-64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
 ; AVX-64-NEXT: vxorpd %xmm4, %xmm4, %xmm4
@@ -379,7 +379,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512F-32-NEXT: .cfi_offset %ebx, -12
 ; AVX512F-32-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX512F-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512F-32-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX512F-32-NEXT: xorl %eax, %eax
 ; AVX512F-32-NEXT: vcomisd %xmm1, %xmm3
 ; AVX512F-32-NEXT: setae %al
@@ -472,7 +472,7 @@ define <4 x i64> @strict_vector_fptoui_v4f64_to_v4i64(<4 x double> %a) #0 {
 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12
 ; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX512VL-32-NEXT: xorl %eax, %eax
 ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3
 ; AVX512VL-32-NEXT: setae %al
@@ -758,7 +758,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX-32-NEXT: andl $-8, %esp
 ; AVX-32-NEXT: subl $32, %esp
 ; AVX-32-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
-; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-32-NEXT: vcomiss %xmm1, %xmm2
 ; AVX-32-NEXT: vmovaps %xmm1, %xmm3
 ; AVX-32-NEXT: jae .LBB3_2
@@ -837,7 +837,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX-64-LABEL: strict_vector_fptoui_v4f32_to_v4i64:
 ; AVX-64: # %bb.0:
 ; AVX-64-NEXT: vshufps {{.*#+}} xmm3 = xmm0[3,3,3,3]
-; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-64-NEXT: vcomiss %xmm1, %xmm3
 ; AVX-64-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; AVX-64-NEXT: vxorps %xmm4, %xmm4, %xmm4
@@ -909,7 +909,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512F-32-NEXT: subl $40, %esp
 ; AVX512F-32-NEXT: .cfi_offset %ebx, -12
 ; AVX512F-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512F-32-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512F-32-NEXT: xorl %eax, %eax
 ; AVX512F-32-NEXT: vcomiss %xmm1, %xmm2
 ; AVX512F-32-NEXT: setae %al
@@ -1002,7 +1002,7 @@ define <4 x i64> @strict_vector_fptoui_v4f32_to_v4i64(<4 x float> %a) #0 {
 ; AVX512VL-32-NEXT: subl $40, %esp
 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12
 ; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512VL-32-NEXT: xorl %eax, %eax
 ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm2
 ; AVX512VL-32-NEXT: setae %al
diff --git a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
index 17c5ff7955106..ff00779d90e5c 100644
--- a/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
+++ b/llvm/test/CodeGen/X86/vec-strict-fptoint-512.ll
@@ -151,7 +151,7 @@ define <8 x i64> @strict_vector_fptoui_v8f64_to_v8i64(<8 x double> %a) #0 {
 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12
 ; AVX512VL-32-NEXT: vextractf32x4 $3, %zmm0, %xmm2
 ; AVX512VL-32-NEXT: vshufpd {{.*#+}} xmm3 = xmm2[1,0]
-; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX512VL-32-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX512VL-32-NEXT: xorl %eax, %eax
 ; AVX512VL-32-NEXT: vcomisd %xmm1, %xmm3
 ; AVX512VL-32-NEXT: setae %al
@@ -443,7 +443,7 @@ define <8 x i64> @strict_vector_fptoui_v8f32_to_v8i64(<8 x float> %a) #0 {
 ; AVX512VL-32-NEXT: .cfi_offset %ebx, -12
 ; AVX512VL-32-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX512VL-32-NEXT: vshufps {{.*#+}} xmm3 = xmm2[3,3,3,3]
-; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX512VL-32-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX512VL-32-NEXT: xorl %eax, %eax
 ; AVX512VL-32-NEXT: vcomiss %xmm1, %xmm3
 ; AVX512VL-32-NEXT: setae %al
diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
index a49f7e9909760..cb19e202e7947 100644
--- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll
+++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll
@@ -250,7 +250,7 @@ define <4 x i32> @fptosi_4f64_to_4i32(<4 x double> %a) {
 define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
 ; SSE-LABEL: fptoui_2f64_to_2i64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0]
 ; SSE-NEXT: movapd %xmm0, %xmm1
 ; SSE-NEXT: subsd %xmm2, %xmm1
 ; SSE-NEXT: cvttsd2si %xmm1, %rax
@@ -275,7 +275,7 @@ define <2 x i64> @fptoui_2f64_to_2i64(<2 x double> %a) {
 ;
 ; VEX-LABEL: fptoui_2f64_to_2i64:
 ; VEX: # %bb.0:
-; VEX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; VEX-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; VEX-NEXT: vsubsd %xmm1, %xmm0, %xmm2
 ; VEX-NEXT: vcvttsd2si %xmm2, %rax
 ; VEX-NEXT: vcvttsd2si %xmm0, %rcx
@@ -511,7 +511,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
 ; SSE-LABEL: fptoui_4f64_to_4i64:
 ; SSE: # %bb.0:
 ; SSE-NEXT: movapd %xmm0, %xmm2
-; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero
+; SSE-NEXT: movsd {{.*#+}} xmm3 = [9.2233720368547758E+18,0.0E+0]
 ; SSE-NEXT: subsd %xmm3, %xmm0
 ; SSE-NEXT: cvttsd2si %xmm0, %rax
 ; SSE-NEXT: cvttsd2si %xmm2, %rcx
@@ -555,7 +555,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
 ; AVX1-LABEL: fptoui_4f64_to_4i64:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX1-NEXT: vsubsd %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT: vcvttsd2si %xmm3, %rax
 ; AVX1-NEXT: vcvttsd2si %xmm2, %rcx
@@ -598,7 +598,7 @@ define <4 x i64> @fptoui_4f64_to_4i64(<4 x double> %a) {
 ; AVX2-LABEL: fptoui_4f64_to_4i64:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX2-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0]
 ; AVX2-NEXT: vsubsd %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT: vcvttsd2si %xmm3, %rax
 ; AVX2-NEXT: vcvttsd2si %xmm2, %rcx
@@ -1279,7 +1279,7 @@ define <4 x i32> @fptoui_4f32_to_4i32(<4 x float> %a) {
 define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
 ; SSE-LABEL: fptoui_2f32_to_2i64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: subss %xmm2, %xmm1
 ; SSE-NEXT: cvttss2si %xmm1, %rax
@@ -1304,7 +1304,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
 ;
 ; VEX-LABEL: fptoui_2f32_to_2i64:
 ; VEX: # %bb.0:
-; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; VEX-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2
 ; VEX-NEXT: vcvttss2si %xmm2, %rax
 ; VEX-NEXT: vcvttss2si %xmm0, %rcx
@@ -1365,7 +1365,7 @@ define <2 x i64> @fptoui_2f32_to_2i64(<4 x float> %a) {
 define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
 ; SSE-LABEL: fptoui_4f32_to_2i64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: subss %xmm2, %xmm1
 ; SSE-NEXT: cvttss2si %xmm1, %rax
@@ -1391,7 +1391,7 @@ define <2 x i64> @fptoui_4f32_to_2i64(<4 x float> %a) {
 ; VEX-LABEL: fptoui_4f32_to_2i64:
 ; VEX: # %bb.0:
 ; VEX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; VEX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; VEX-NEXT: vmovss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; VEX-NEXT: vsubss %xmm2, %xmm1, %xmm3
 ; VEX-NEXT: vcvttss2si %xmm3, %rax
 ; VEX-NEXT: vcvttss2si %xmm1, %rcx
@@ -1520,7 +1520,7 @@ define <8 x i32> @fptoui_8f32_to_8i32(<8 x float> %a) {
 define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
 ; SSE-LABEL: fptoui_4f32_to_4i64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: movaps %xmm0, %xmm2
 ; SSE-NEXT: subss %xmm1, %xmm2
 ; SSE-NEXT: cvttss2si %xmm2, %rax
@@ -1567,7 +1567,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
 ; AVX1-LABEL: fptoui_4f32_to_4i64:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT: vcvttss2si %xmm3, %rax
 ; AVX1-NEXT: vcvttss2si %xmm2, %rcx
@@ -1610,7 +1610,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
 ; AVX2-LABEL: fptoui_4f32_to_4i64:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT: vcvttss2si %xmm3, %rax
 ; AVX2-NEXT: vcvttss2si %xmm2, %rcx
@@ -1704,7 +1704,7 @@ define <4 x i64> @fptoui_4f32_to_4i64(<8 x float> %a) {
 define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
 ; SSE-LABEL: fptoui_8f32_to_4i64:
 ; SSE: # %bb.0:
-; SSE-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: movaps %xmm0, %xmm2
 ; SSE-NEXT: subss %xmm1, %xmm2
 ; SSE-NEXT: cvttss2si %xmm2, %rax
@@ -1751,7 +1751,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
 ; AVX1-LABEL: fptoui_8f32_to_4i64:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX1-NEXT: vsubss %xmm1, %xmm2, %xmm3
 ; AVX1-NEXT: vcvttss2si %xmm3, %rax
 ; AVX1-NEXT: vcvttss2si %xmm2, %rcx
@@ -1794,7 +1794,7 @@ define <4 x i64> @fptoui_8f32_to_4i64(<8 x float> %a) {
 ; AVX2-LABEL: fptoui_8f32_to_4i64:
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vshufps {{.*#+}} xmm2 = xmm0[3,3,3,3]
-; AVX2-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX2-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX2-NEXT: vsubss %xmm1, %xmm2, %xmm3
 ; AVX2-NEXT: vcvttss2si %xmm3, %rax
 ; AVX2-NEXT: vcvttss2si %xmm2, %rcx
@@ -2684,7 +2684,7 @@ define <2 x i64> @fptoui_2f32_to_2i64_load(ptr %x) {
 ; SSE-LABEL: fptoui_2f32_to_2i64_load:
 ; SSE: # %bb.0:
 ; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
-; SSE-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; SSE-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: subss %xmm2, %xmm0
 ; SSE-NEXT: cvttss2si %xmm0, %rax
@@ -2709,7 +2709,7 @@ define <2 x i64> @fptoui_2f32_to_2i64_load(ptr %x) {
 ; VEX-LABEL: fptoui_2f32_to_2i64_load:
 ; VEX: # %bb.0:
 ; VEX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; VEX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; VEX-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; VEX-NEXT: vsubss %xmm1, %xmm0, %xmm2
 ; VEX-NEXT: vcvttss2si %xmm2, %rax
 ; VEX-NEXT: vcvttss2si %xmm0, %rcx
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll
index 5f39f33dd6a5c..ff208678c9bc7 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics-fma.ll
@@ -4,8 +4,8 @@ define <1 x float> @constrained_vector_fma_v1f32() #0 {
 ; CHECK-LABEL: constrained_vector_fma_v1f32:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = [5.0E-1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [2.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm0 = (xmm1 * xmm0) + mem
 ; CHECK-NEXT: retq
 entry:
@@ -38,14 +38,14 @@ entry:
 define <3 x float> @constrained_vector_fma_v3f32() #0 {
 ; CHECK-LABEL: constrained_vector_fma_v3f32:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [5.0E-1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: vmovss {{.*#+}} xmm1 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [2.5E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: vmovss {{.*#+}} xmm2 = [5.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm2 = (xmm0 * xmm2) + mem
-; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: vmovss {{.*#+}} xmm3 = [4.5E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: vfmadd213ss {{.*#+}} xmm3 = (xmm0 * xmm3) + mem
 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm3[0],xmm2[2,3]
 ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
@@ -63,8 +63,8 @@ entry:
 define <3 x double> @constrained_vector_fma_v3f64() #0 {
 ; CHECK-LABEL: constrained_vector_fma_v3f64:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [5.0E-1,0.0E+0]
+; CHECK-NEXT: vmovsd {{.*#+}} xmm1 = [3.5E+0,0.0E+0]
 ; CHECK-NEXT: vfmadd213sd {{.*#+}} xmm1 = (xmm0 * xmm1) + mem
 ; CHECK-NEXT: vmovapd {{.*#+}} xmm0 = [2.5E+0,1.5E+0]
 ; CHECK-NEXT: vmovapd {{.*#+}} xmm2 = [5.5E+0,4.5E+0]
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index cdabd7fab081c..acf45fc4bbeba 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -7,13 +7,13 @@ define <1 x float> @constrained_vector_fdiv_v1f32() #0 {
 ; CHECK-LABEL: constrained_vector_fdiv_v1f32:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: divss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; CHECK-NEXT: retq
 ;
 ; AVX-LABEL: constrained_vector_fdiv_v1f32:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry:
@@ -49,12 +49,12 @@ entry:
 define <3 x float> @constrained_vector_fdiv_v3f32() #0 {
 ; CHECK-LABEL: constrained_vector_fdiv_v3f32:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm2 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: divss %xmm1, %xmm2
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: divss %xmm1, %xmm0
-; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm3 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: divss %xmm1, %xmm3
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
@@ -62,12 +62,12 @@ define <3 x float> @constrained_vector_fdiv_v3f32() #0 {
 ;
 ; AVX-LABEL: constrained_vector_fdiv_v3f32:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss %xmm0, %xmm2, %xmm2
-; AVX-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm3 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vdivss %xmm0, %xmm3, %xmm0
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[2,3]
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
@@ -86,7 +86,7 @@ define <3 x double> @constrained_vector_fdiv_v3f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.0E+0,2.0E+0]
 ; CHECK-NEXT: divpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0]
 ; CHECK-NEXT: divsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movapd %xmm0, %xmm1
@@ -97,7 +97,7 @@ define <3 x double> @constrained_vector_fdiv_v3f64() #0 {
 ;
 ; AVX-LABEL: constrained_vector_fdiv_v3f64:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0]
 ; AVX-NEXT: vdivsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
 ; AVX-NEXT: vmovapd {{.*#+}} xmm1 = [1.0E+0,2.0E+0]
 ; AVX-NEXT: vdivpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
@@ -150,8 +150,8 @@ define <1 x float> @constrained_vector_frem_v1f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fmodf@PLT
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -161,8 +161,8 @@ define <1 x float> @constrained_vector_frem_v1f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.0E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq fmodf@PLT
 ;
AVX-NEXT: popq %rax ; AVX-NEXT: .cfi_def_cfa_offset 8 @@ -181,12 +181,12 @@ define <2 x double> @constrained_vector_frem_v2f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [2.0E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] @@ -198,12 +198,12 @@ define <2 x double> @constrained_vector_frem_v2f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $24, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [2.0E+0,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; AVX-NEXT: callq fmod@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; AVX-NEXT: callq fmod@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] @@ -224,16 +224,16 @@ define <3 x float> @constrained_vector_frem_v3f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [1.0E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq fmodf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -248,16 +248,16 @@ define <3 x float> @constrained_vector_frem_v3f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.0E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq fmodf@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = 
[1.0E+0,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.0E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq fmodf@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.0E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq fmodf@PLT ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] @@ -280,16 +280,16 @@ define <3 x double> @constrained_vector_frem_v3f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [2.0E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) @@ -306,18 +306,18 @@ define <3 x double> @constrained_vector_frem_v3f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [2.0E+0,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; AVX-NEXT: callq fmod@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; AVX-NEXT: callq fmod@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; AVX-NEXT: vzeroupper ; AVX-NEXT: callq fmod@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -339,22 +339,22 @@ define <4 x double> @constrained_vector_frem_v4f64() #0 { ; CHECK: # %bb.0: ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [2.0E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: 
movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.0E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; CHECK-NEXT: callq fmod@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -368,22 +368,22 @@ define <4 x double> @constrained_vector_frem_v4f64() #0 { ; AVX: # %bb.0: ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.0E+0,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; AVX-NEXT: callq fmod@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; AVX-NEXT: callq fmod@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [2.0E+0,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; AVX-NEXT: callq fmod@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.0E+0,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [1.0E+1,0.0E+0] ; AVX-NEXT: callq fmod@PLT ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] @@ -404,13 +404,13 @@ define <4 x double> @constrained_vector_frem_v4f64() #0 { define <1 x float> @constrained_vector_fmul_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fmul_v1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [+Inf,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fmul_v1f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [+Inf,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -447,10 +447,10 @@ entry: define <3 x float> @constrained_vector_fmul_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fmul_v3f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = [+Inf,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm2 = [1.0E+2,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: mulss %xmm1, %xmm2 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: mulss %xmm1, %xmm0 ; CHECK-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -459,7 +459,7 @@ define <3 x float> @constrained_vector_fmul_v3f32() #0 { ; ; AVX-LABEL: constrained_vector_fmul_v3f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [+Inf,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2 ; AVX-NEXT: vmulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -481,7 +481,7 @@ define <3 x double> @constrained_vector_fmul_v3f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [1.7976931348623157E+308,1.7976931348623157E+308] ; CHECK-NEXT: mulpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [1.7976931348623157E+308,0.0E+0] ; CHECK-NEXT: mulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movapd %xmm0, %xmm1 @@ -492,7 +492,7 @@ define <3 x double> @constrained_vector_fmul_v3f64() #0 { ; ; AVX-LABEL: constrained_vector_fmul_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.7976931348623157E+308,0.0E+0] ; AVX-NEXT: vmulsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [1.7976931348623157E+308,1.7976931348623157E+308] ; AVX-NEXT: # xmm1 = mem[0,0] @@ -537,13 +537,13 @@ entry: define <1 x float> @constrained_vector_fadd_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fadd_v1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [+Inf,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fadd_v1f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [+Inf,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -581,9 +581,9 @@ define <3 x float> @constrained_vector_fadd_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fadd_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorps %xmm1, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: addss %xmm2, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [2.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: addss %xmm2, %xmm0 ; CHECK-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -593,7 +593,7 @@ define <3 x float> @constrained_vector_fadd_v3f32() #0 { ; AVX-LABEL: constrained_vector_fadd_v3f32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vaddss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -671,13 +671,13 @@ entry: define <1 x float> @constrained_vector_fsub_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fsub_v1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [+Inf,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: subss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: retq ; ; 
AVX-LABEL: constrained_vector_fsub_v1f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [+Inf,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -715,7 +715,7 @@ define <3 x float> @constrained_vector_fsub_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fsub_v3f32: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorps %xmm0, %xmm0 -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: movaps %xmm1, %xmm2 ; CHECK-NEXT: subss %xmm0, %xmm2 ; CHECK-NEXT: movaps %xmm1, %xmm0 @@ -728,7 +728,7 @@ define <3 x float> @constrained_vector_fsub_v3f32() #0 { ; AVX-LABEL: constrained_vector_fsub_v3f32: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vsubss %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm2 ; AVX-NEXT: vsubss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 @@ -749,7 +749,7 @@ define <3 x double> @constrained_vector_fsub_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fsub_v3f64: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: xorpd %xmm0, %xmm0 -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [-1.7976931348623157E+308,0.0E+0] ; CHECK-NEXT: subsd %xmm0, %xmm1 ; CHECK-NEXT: movapd {{.*#+}} xmm0 = [-1.7976931348623157E+308,-1.7976931348623157E+308] ; CHECK-NEXT: subpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -763,7 +763,7 @@ define <3 x double> @constrained_vector_fsub_v3f64() #0 { ; AVX-LABEL: constrained_vector_fsub_v3f64: ; AVX: # %bb.0: # %entry ; AVX-NEXT: vxorpd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [-1.7976931348623157E+308,0.0E+0] ; AVX-NEXT: vsubsd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: vmovddup {{.*#+}} xmm1 = [-1.7976931348623157E+308,-1.7976931348623157E+308] ; AVX-NEXT: # xmm1 = mem[0,0] @@ -808,13 +808,13 @@ entry: define <1 x float> @constrained_vector_sqrt_v1f32() #0 { ; CHECK-LABEL: constrained_vector_sqrt_v1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: sqrtss %xmm0, %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_sqrt_v1f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -846,11 +846,11 @@ entry: define <3 x float> @constrained_vector_sqrt_v3f32() #0 { ; CHECK-LABEL: constrained_vector_sqrt_v3f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: sqrtss %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: sqrtss %xmm0, %xmm0 -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: sqrtss %xmm2, %xmm2 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -858,11 +858,11 @@ define <3 x float> @constrained_vector_sqrt_v3f32() #0 { ; ; 
AVX-LABEL: constrained_vector_sqrt_v3f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vsqrtss %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vsqrtss %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] @@ -878,7 +878,7 @@ entry: define <3 x double> @constrained_vector_sqrt_v3f64() #0 { ; CHECK-LABEL: constrained_vector_sqrt_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; CHECK-NEXT: sqrtsd %xmm0, %xmm1 ; CHECK-NEXT: sqrtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-NEXT: movsd %xmm1, -{{[0-9]+}}(%rsp) @@ -890,7 +890,7 @@ define <3 x double> @constrained_vector_sqrt_v3f64() #0 { ; ; AVX-LABEL: constrained_vector_sqrt_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; AVX-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vsqrtpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -928,8 +928,8 @@ define <1 x float> @constrained_vector_pow_v1f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -939,8 +939,8 @@ define <1 x float> @constrained_vector_pow_v1f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq powf@PLT ; AVX-NEXT: popq %rax ; AVX-NEXT: .cfi_def_cfa_offset 8 @@ -959,12 +959,12 @@ define <2 x double> @constrained_vector_pow_v2f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] @@ -976,12 +976,12 @@ define <2 x double> @constrained_vector_pow_v2f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $24, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: 
vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; AVX-NEXT: callq pow@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; AVX-NEXT: callq pow@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] @@ -1002,16 +1002,16 @@ define <3 x float> @constrained_vector_pow_v3f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq powf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -1026,16 +1026,16 @@ define <3 x float> @constrained_vector_pow_v3f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq powf@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq powf@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [3.0E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq powf@PLT ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] @@ -1058,16 +1058,16 @@ define <3 x double> @constrained_vector_pow_v3f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; 
CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) @@ -1084,18 +1084,18 @@ define <3 x double> @constrained_vector_pow_v3f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; AVX-NEXT: callq pow@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; AVX-NEXT: callq pow@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; AVX-NEXT: vzeroupper ; AVX-NEXT: callq pow@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -1117,22 +1117,22 @@ define <4 x double> @constrained_vector_pow_v4f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2399999999999999E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; CHECK-NEXT: callq pow@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1146,22 +1146,22 @@ define <4 x double> @constrained_vector_pow_v4f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = 
mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2399999999999999E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; AVX-NEXT: callq pow@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; AVX-NEXT: callq pow@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; AVX-NEXT: callq pow@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] ; AVX-NEXT: callq pow@PLT ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] @@ -1185,7 +1185,7 @@ define <1 x float> @constrained_vector_powi_v1f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powisf2@PLT ; CHECK-NEXT: popq %rax @@ -1196,7 +1196,7 @@ define <1 x float> @constrained_vector_powi_v1f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powisf2@PLT ; AVX-NEXT: popq %rax @@ -1216,11 +1216,11 @@ define <2 x double> @constrained_vector_powi_v2f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload @@ -1233,11 +1233,11 @@ define <2 x double> @constrained_vector_powi_v2f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $24, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload @@ -1259,15 +1259,15 @@ define <3 x float> @constrained_vector_powi_v3f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss 
{{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powisf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powisf2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powisf2@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload @@ -1283,15 +1283,15 @@ define <3 x float> @constrained_vector_powi_v3f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powisf2@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powisf2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powisf2@PLT ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload @@ -1315,15 +1315,15 @@ define <3 x double> @constrained_vector_powi_v3f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) @@ -1341,17 +1341,17 @@ define <3 x double> @constrained_vector_powi_v3f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: vzeroupper ; AVX-NEXT: callq __powidf2@PLT @@ -1374,21 +1374,21 @@ define <4 x double> @constrained_vector_powi_v4f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 
= [4.2200000000000003E+1,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2399999999999999E+1,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] ; CHECK-NEXT: movl $3, %edi ; CHECK-NEXT: callq __powidf2@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 @@ -1403,21 +1403,21 @@ define <4 x double> @constrained_vector_powi_v4f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2399999999999999E+1,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; AVX-NEXT: movl $3, %edi ; AVX-NEXT: callq __powidf2@PLT ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -1441,7 +1441,7 @@ define <1 x float> @constrained_vector_sin_v1f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -1451,7 +1451,7 @@ define <1 x float> @constrained_vector_sin_v1f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq sinf@PLT ; AVX-NEXT: popq %rax ; AVX-NEXT: .cfi_def_cfa_offset 8 @@ -1469,10 +1469,10 @@ define <2 x double> @constrained_vector_sin_v2f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] @@ -1484,10 +1484,10 @@ define <2 x double> 
@constrained_vector_sin_v2f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $24, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; AVX-NEXT: callq sin@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; AVX-NEXT: callq sin@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] @@ -1507,13 +1507,13 @@ define <3 x float> @constrained_vector_sin_v3f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq sinf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -1528,13 +1528,13 @@ define <3 x float> @constrained_vector_sin_v3f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq sinf@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq sinf@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq sinf@PLT ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] @@ -1556,13 +1556,13 @@ define <3 x double> @constrained_vector_sin_v3f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) @@ -1579,15 +1579,15 @@ define <3 x double> @constrained_vector_sin_v3f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; AVX-NEXT: callq sin@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; AVX-NEXT: callq sin@PLT ; 
AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; AVX-NEXT: vzeroupper ; AVX-NEXT: callq sin@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -1608,18 +1608,18 @@ define <4 x double> @constrained_vector_sin_v4f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] ; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; CHECK-NEXT: callq sin@PLT ; CHECK-NEXT: movaps %xmm0, %xmm1 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1633,18 +1633,18 @@ define <4 x double> @constrained_vector_sin_v4f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] ; AVX-NEXT: callq sin@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; AVX-NEXT: callq sin@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; AVX-NEXT: callq sin@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; AVX-NEXT: callq sin@PLT ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] @@ -1666,7 +1666,7 @@ define <1 x float> @constrained_vector_cos_v1f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq cosf@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -1676,7 +1676,7 @@ define <1 x float> @constrained_vector_cos_v1f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq cosf@PLT ; AVX-NEXT: popq %rax ; AVX-NEXT: .cfi_def_cfa_offset 8 @@ -1694,10 +1694,10 @@ define <2 x double> @constrained_vector_cos_v2f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} 
xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] @@ -1709,10 +1709,10 @@ define <2 x double> @constrained_vector_cos_v2f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $24, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; AVX-NEXT: callq cos@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; AVX-NEXT: callq cos@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] @@ -1732,13 +1732,13 @@ define <3 x float> @constrained_vector_cos_v3f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq cosf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq cosf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq cosf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -1753,13 +1753,13 @@ define <3 x float> @constrained_vector_cos_v3f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq cosf@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq cosf@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq cosf@PLT ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] @@ -1781,13 +1781,13 @@ define <3 x double> @constrained_vector_cos_v3f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; CHECK-NEXT: callq cos@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) @@ -1804,15 +1804,15 @@ define <3 x double> 
@constrained_vector_cos_v3f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq cos@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq cos@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: callq cos@PLT
 ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
@@ -1833,18 +1833,18 @@ define <4 x double> @constrained_vector_cos_v4f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq cos@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq cos@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; CHECK-NEXT: callq cos@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq cos@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -1858,18 +1858,18 @@ define <4 x double> @constrained_vector_cos_v4f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; AVX-NEXT: callq cos@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: callq cos@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq cos@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq cos@PLT
 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -1891,7 +1891,7 @@ define <1 x float> @constrained_vector_exp_v1f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq expf@PLT
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -1901,7 +1901,7 @@ define <1 x float> @constrained_vector_exp_v1f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq expf@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -1919,10 +1919,10 @@ define <2 x double> @constrained_vector_exp_v2f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq exp@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq exp@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -1934,10 +1934,10 @@ define <2 x double> @constrained_vector_exp_v2f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $24, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 32
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq exp@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq exp@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -1957,13 +1957,13 @@ define <3 x float> @constrained_vector_exp_v3f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq expf@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq expf@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq expf@PLT
 ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -1978,13 +1978,13 @@ define <3 x float> @constrained_vector_exp_v3f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq expf@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq expf@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq expf@PLT
 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -2006,13 +2006,13 @@ define <3 x double> @constrained_vector_exp_v3f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq exp@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq exp@PLT
 ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq exp@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
@@ -2029,15 +2029,15 @@ define <3 x double> @constrained_vector_exp_v3f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq exp@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq exp@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: callq exp@PLT
 ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
@@ -2058,18 +2058,18 @@ define <4 x double> @constrained_vector_exp_v4f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq exp@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq exp@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; CHECK-NEXT: callq exp@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq exp@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -2083,18 +2083,18 @@ define <4 x double> @constrained_vector_exp_v4f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; AVX-NEXT: callq exp@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: callq exp@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq exp@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq exp@PLT
 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2116,7 +2116,7 @@ define <1 x float> @constrained_vector_exp2_v1f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq exp2f@PLT
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -2126,7 +2126,7 @@ define <1 x float> @constrained_vector_exp2_v1f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq exp2f@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -2144,10 +2144,10 @@ define <2 x double> @constrained_vector_exp2_v2f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq exp2@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq exp2@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2159,10 +2159,10 @@ define <2 x double> @constrained_vector_exp2_v2f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $24, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 32
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq exp2@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq exp2@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2182,13 +2182,13 @@ define <3 x float> @constrained_vector_exp2_v3f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq exp2f@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq exp2f@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq exp2f@PLT
 ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -2203,13 +2203,13 @@ define <3 x float> @constrained_vector_exp2_v3f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq exp2f@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq exp2f@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq exp2f@PLT
 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -2231,13 +2231,13 @@ define <3 x double> @constrained_vector_exp2_v3f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq exp2@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq exp2@PLT
 ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq exp2@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
@@ -2254,15 +2254,15 @@ define <3 x double> @constrained_vector_exp2_v3f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq exp2@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq exp2@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: callq exp2@PLT
 ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
@@ -2283,18 +2283,18 @@ define <4 x double> @constrained_vector_exp2_v4f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq exp2@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq exp2@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2399999999999999E+1,0.0E+0]
 ; CHECK-NEXT: callq exp2@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; CHECK-NEXT: callq exp2@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -2308,18 +2308,18 @@ define <4 x double> @constrained_vector_exp2_v4f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2399999999999999E+1,0.0E+0]
 ; AVX-NEXT: callq exp2@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; AVX-NEXT: callq exp2@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: callq exp2@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq exp2@PLT
 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2341,7 +2341,7 @@ define <1 x float> @constrained_vector_log_v1f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq logf@PLT
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -2351,7 +2351,7 @@ define <1 x float> @constrained_vector_log_v1f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq logf@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -2369,10 +2369,10 @@ define <2 x double> @constrained_vector_log_v2f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq log@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq log@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2384,10 +2384,10 @@ define <2 x double> @constrained_vector_log_v2f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $24, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 32
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq log@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2407,13 +2407,13 @@ define <3 x float> @constrained_vector_log_v3f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq logf@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq logf@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq logf@PLT
 ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -2428,13 +2428,13 @@ define <3 x float> @constrained_vector_log_v3f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq logf@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq logf@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq logf@PLT
 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -2456,13 +2456,13 @@ define <3 x double> @constrained_vector_log_v3f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq log@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq log@PLT
 ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq log@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
@@ -2479,15 +2479,15 @@ define <3 x double> @constrained_vector_log_v3f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq log@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: callq log@PLT
 ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
@@ -2508,18 +2508,18 @@ define <4 x double> @constrained_vector_log_v4f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq log@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq log@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; CHECK-NEXT: callq log@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq log@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -2533,18 +2533,18 @@ define <4 x double> @constrained_vector_log_v4f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; AVX-NEXT: callq log@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: callq log@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq log@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log@PLT
 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2566,7 +2566,7 @@ define <1 x float> @constrained_vector_log10_v1f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq log10f@PLT
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -2576,7 +2576,7 @@ define <1 x float> @constrained_vector_log10_v1f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq log10f@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -2594,10 +2594,10 @@ define <2 x double> @constrained_vector_log10_v2f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq log10@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq log10@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2609,10 +2609,10 @@ define <2 x double> @constrained_vector_log10_v2f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $24, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 32
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq log10@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log10@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2632,13 +2632,13 @@ define <3 x float> @constrained_vector_log10_v3f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq log10f@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq log10f@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq log10f@PLT
 ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -2653,13 +2653,13 @@ define <3 x float> @constrained_vector_log10_v3f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq log10f@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq log10f@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq log10f@PLT
 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -2681,13 +2681,13 @@ define <3 x double> @constrained_vector_log10_v3f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq log10@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq log10@PLT
 ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq log10@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
@@ -2704,15 +2704,15 @@ define <3 x double> @constrained_vector_log10_v3f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq log10@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log10@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: callq log10@PLT
 ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
@@ -2733,18 +2733,18 @@ define <4 x double> @constrained_vector_log10_v4f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq log10@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq log10@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; CHECK-NEXT: callq log10@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq log10@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -2758,18 +2758,18 @@ define <4 x double> @constrained_vector_log10_v4f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; AVX-NEXT: callq log10@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: callq log10@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq log10@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log10@PLT
 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2791,7 +2791,7 @@ define <1 x float> @constrained_vector_log2_v1f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq log2f@PLT
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -2801,7 +2801,7 @@ define <1 x float> @constrained_vector_log2_v1f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq log2f@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -2819,10 +2819,10 @@ define <2 x double> @constrained_vector_log2_v2f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq log2@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq log2@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2834,10 +2834,10 @@ define <2 x double> @constrained_vector_log2_v2f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $24, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 32
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq log2@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log2@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -2857,13 +2857,13 @@ define <3 x float> @constrained_vector_log2_v3f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq log2f@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq log2f@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq log2f@PLT
 ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -2878,13 +2878,13 @@ define <3 x float> @constrained_vector_log2_v3f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq log2f@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq log2f@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq log2f@PLT
 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -2906,13 +2906,13 @@ define <3 x double> @constrained_vector_log2_v3f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq log2@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq log2@PLT
 ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq log2@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
@@ -2929,15 +2929,15 @@ define <3 x double> @constrained_vector_log2_v3f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq log2@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log2@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: callq log2@PLT
 ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
@@ -2958,18 +2958,18 @@ define <4 x double> @constrained_vector_log2_v4f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq log2@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq log2@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; CHECK-NEXT: callq log2@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq log2@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -2983,18 +2983,18 @@ define <4 x double> @constrained_vector_log2_v4f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; AVX-NEXT: callq log2@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: callq log2@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; AVX-NEXT: callq log2@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq log2@PLT
 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -3016,7 +3016,7 @@ define <1 x float> @constrained_vector_rint_v1f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq rintf@PLT
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -3024,7 +3024,7 @@ define <1 x float> @constrained_vector_rint_v1f32() #0 {
 ;
 ; AVX-LABEL: constrained_vector_rint_v1f32:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry:
@@ -3040,10 +3040,10 @@ define <2 x double> @constrained_vector_rint_v2f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq rint@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq rint@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -3068,13 +3068,13 @@ define <3 x float> @constrained_vector_rint_v3f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq rintf@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq rintf@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq rintf@PLT
 ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -3087,11 +3087,11 @@ define <3 x float> @constrained_vector_rint_v3f32() #0 {
 ;
 ; AVX-LABEL: constrained_vector_rint_v3f32:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vroundss $4, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vroundss $4, %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vroundss $4, %xmm2, %xmm2, %xmm2
 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
@@ -3109,13 +3109,13 @@ define <3 x double> @constrained_vector_rint_v3f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq rint@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq rint@PLT
 ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq rint@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
@@ -3130,7 +3130,7 @@ define <3 x double> @constrained_vector_rint_v3f64() #0 {
 ;
 ; AVX-LABEL: constrained_vector_rint_v3f64:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: vroundsd $4, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vroundpd $4, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3148,18 +3148,18 @@ define <4 x double> @constrained_vector_rint_v4f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq rint@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq rint@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2399999999999999E+1,0.0E+0]
 ; CHECK-NEXT: callq rint@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; CHECK-NEXT: callq rint@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -3187,7 +3187,7 @@ define <1 x float> @constrained_vector_nearbyint_v1f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq nearbyintf@PLT
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -3195,7 +3195,7 @@ define <1 x float> @constrained_vector_nearbyint_v1f32() #0 {
 ;
 ; AVX-LABEL: constrained_vector_nearbyint_v1f32:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: retq
 entry:
@@ -3211,10 +3211,10 @@ define <2 x double> @constrained_vector_nearbyint_v2f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq nearbyint@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq nearbyint@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -3239,13 +3239,13 @@ define <3 x float> @constrained_vector_nearbyint_v3f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq nearbyintf@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq nearbyintf@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq nearbyintf@PLT
 ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -3258,11 +3258,11 @@ define <3 x float> @constrained_vector_nearbyint_v3f32() #0 {
 ;
 ; AVX-LABEL: constrained_vector_nearbyint_v3f32:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vroundss $12, %xmm0, %xmm0, %xmm0
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vroundss $12, %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: vroundss $12, %xmm2, %xmm2, %xmm2
 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3]
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3]
@@ -3280,13 +3280,13 @@ define <3 x double> @constrained_vector_nearby_v3f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq nearbyint@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq nearbyint@PLT
 ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq nearbyint@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
@@ -3301,7 +3301,7 @@ define <3 x double> @constrained_vector_nearby_v3f64() #0 {
 ;
 ; AVX-LABEL: constrained_vector_nearby_v3f64:
 ; AVX: # %bb.0: # %entry
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; AVX-NEXT: vroundsd $12, %xmm0, %xmm0, %xmm0
 ; AVX-NEXT: vroundpd $12, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
@@ -3319,18 +3319,18 @@ define <4 x double> @constrained_vector_nearbyint_v4f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0]
 ; CHECK-NEXT: callq nearbyint@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
 ; CHECK-NEXT: callq nearbyint@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2399999999999999E+1,0.0E+0]
 ; CHECK-NEXT: callq nearbyint@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0]
 ; CHECK-NEXT: callq nearbyint@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -3358,8 +3358,8 @@ define <1 x float> @constrained_vector_maxnum_v1f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.1E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fmaxf@PLT
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -3369,8 +3369,8 @@ define <1 x float> @constrained_vector_maxnum_v1f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.1E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq fmaxf@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -3387,12 +3387,12 @@ define <2 x double> @constrained_vector_maxnum_v2f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; CHECK-NEXT: callq fmax@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.3E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; CHECK-NEXT: callq fmax@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -3404,12 +3404,12 @@ define <2 x double> @constrained_vector_maxnum_v2f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $24, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 32
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; AVX-NEXT: callq fmax@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.3E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; AVX-NEXT: callq fmax@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -3429,16 +3429,16 @@ define <3 x float> @constrained_vector_maxnum_v3f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.5E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fmaxf@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.1E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fmaxf@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fmaxf@PLT
 ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -3453,16 +3453,16 @@ define <3 x float> @constrained_vector_maxnum_v3f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.5E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq fmaxf@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.1E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq fmaxf@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq fmaxf@PLT
 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -3484,16 +3484,16 @@ define <3 x double> @constrained_vector_max_v3f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.4E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; CHECK-NEXT: callq fmax@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.3E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; CHECK-NEXT: callq fmax@PLT
 ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.5E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq fmax@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
@@ -3510,18 +3510,18 @@ define <3 x double> @constrained_vector_max_v3f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.4E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; AVX-NEXT: callq fmax@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.3E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; AVX-NEXT: callq fmax@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.5E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: callq fmax@PLT
 ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
@@ -3542,22 +3542,22 @@ define <4 x double> @constrained_vector_maxnum_v4f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.5E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; CHECK-NEXT: callq fmax@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.4E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; CHECK-NEXT: callq fmax@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.7E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.3E+1,0.0E+0]
 ; CHECK-NEXT: callq fmax@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.6E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq fmax@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -3571,22 +3571,22 @@ define <4 x double> @constrained_vector_maxnum_v4f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.7E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.3E+1,0.0E+0]
 ; AVX-NEXT: callq fmax@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.6E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq fmax@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.5E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; AVX-NEXT: callq fmax@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.4E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; AVX-NEXT: callq fmax@PLT
 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -3609,8 +3609,8 @@ define <1 x float> @constrained_vector_minnum_v1f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.1E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: .cfi_def_cfa_offset 8
@@ -3620,8 +3620,8 @@ define <1 x float> @constrained_vector_minnum_v1f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: pushq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 16
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.1E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq fminf@PLT
 ; AVX-NEXT: popq %rax
 ; AVX-NEXT: .cfi_def_cfa_offset 8
@@ -3638,12 +3638,12 @@ define <2 x double> @constrained_vector_minnum_v2f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; CHECK-NEXT: callq fmin@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.3E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; CHECK-NEXT: callq fmin@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -3655,12 +3655,12 @@ define <2 x double> @constrained_vector_minnum_v2f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $24, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 32
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; AVX-NEXT: callq fmin@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.3E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; AVX-NEXT: callq fmin@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -3680,16 +3680,16 @@ define <3 x float> @constrained_vector_minnum_v3f32() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.5E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.1E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: callq fminf@PLT
 ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
@@ -3704,16 +3704,16 @@ define <3 x float> @constrained_vector_minnum_v3f32() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.5E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq fminf@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.1E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq fminf@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; AVX-NEXT: callq fminf@PLT
 ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload
 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3]
@@ -3735,16 +3735,16 @@ define <3 x double> @constrained_vector_min_v3f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 32
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.4E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; CHECK-NEXT: callq fmin@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.3E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; CHECK-NEXT: callq fmin@PLT
 ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.5E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq fmin@PLT
 ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp)
@@ -3761,18 +3761,18 @@ define <3 x double> @constrained_vector_min_v3f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.4E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; AVX-NEXT: callq fmin@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.3E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; AVX-NEXT: callq fmin@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.5E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: vzeroupper
 ; AVX-NEXT: callq fmin@PLT
 ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload
@@ -3793,22 +3793,22 @@ define <4 x double> @constrained_vector_minnum_v4f64() #0 {
 ; CHECK: # %bb.0: # %entry
 ; CHECK-NEXT: subq $40, %rsp
 ; CHECK-NEXT: .cfi_def_cfa_offset 48
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.5E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; CHECK-NEXT: callq fmin@PLT
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.4E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; CHECK-NEXT: callq fmin@PLT
 ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload
 ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0]
 ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.7E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.3E+1,0.0E+0]
 ; CHECK-NEXT: callq fmin@PLT
 ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero
+; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.6E+1,0.0E+0]
+; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; CHECK-NEXT: callq fmin@PLT
 ; CHECK-NEXT: movaps %xmm0, %xmm1
 ; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
@@ -3822,22 +3822,22 @@ define <4 x double> @constrained_vector_minnum_v4f64() #0 {
 ; AVX: # %bb.0: # %entry
 ; AVX-NEXT: subq $40, %rsp
 ; AVX-NEXT: .cfi_def_cfa_offset 48
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.7E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.3E+1,0.0E+0]
 ; AVX-NEXT: callq fmin@PLT
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.6E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0]
 ; AVX-NEXT: callq fmin@PLT
 ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
 ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.5E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.1E+1,0.0E+0]
 ; AVX-NEXT: callq fmin@PLT
 ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
-; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.4E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.0E+1,0.0E+0]
 ; AVX-NEXT: callq fmin@PLT
 ; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
 ; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
@@ -4478,8 +4478,8 @@ entry:
 define <1 x i64> @constrained_vector_fptoui_v1i64_v1f32() #0 {
 ; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f32:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: comiss %xmm0, %xmm2
 ; CHECK-NEXT: xorps %xmm1, %xmm1
 ; CHECK-NEXT: ja .LBB115_2
@@ -4496,8 +4496,8 @@ define <1 x i64> @constrained_vector_fptoui_v1i64_v1f32() #0 {
 ;
 ; AVX1-LABEL: constrained_vector_fptoui_v1i64_v1f32:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
+; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; AVX1-NEXT: vcomiss %xmm0, %xmm1
 ; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
 ; AVX1-NEXT: ja .LBB115_2
@@ -4526,8 +4526,8 @@ entry:
 define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 {
 ; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f32:
 ; CHECK: # %bb.0: # %entry
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0]
+; CHECK-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: comiss %xmm2, %xmm1
 ; CHECK-NEXT: xorps %xmm0, %xmm0
 ; CHECK-NEXT: xorps %xmm3, %xmm3
@@ -4542,7 +4542,7 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 {
 ; CHECK-NEXT: shlq $63, %rcx
 ; CHECK-NEXT: xorq %rax, %rcx
 ; CHECK-NEXT: movq %rcx, %xmm2
-; CHECK-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; CHECK-NEXT: movss {{.*#+}} xmm3 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0]
 ; CHECK-NEXT: comiss %xmm3, %xmm1
 ; CHECK-NEXT: ja .LBB116_4
 ; CHECK-NEXT: # %bb.3: # %entry
@@ -4560,8 +4560,8 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 {
 ;
 ; AVX1-LABEL: constrained_vector_fptoui_v2i64_v2f32:
 ; AVX1: # %bb.0: # %entry
-; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+;
AVX1-NEXT: vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; AVX1-NEXT: vcomiss %xmm2, %xmm0 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -4576,7 +4576,7 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f32() #0 { ; AVX1-NEXT: shlq $63, %rcx ; AVX1-NEXT: xorq %rax, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm3 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX1-NEXT: vcomiss %xmm3, %xmm0 ; AVX1-NEXT: ja .LBB116_4 ; AVX1-NEXT: # %bb.3: # %entry @@ -4617,8 +4617,8 @@ entry: define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: comiss %xmm2, %xmm1 ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: xorps %xmm3, %xmm3 @@ -4632,7 +4632,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: shlq $63, %rax ; CHECK-NEXT: xorq %rcx, %rax -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: comiss %xmm2, %xmm1 ; CHECK-NEXT: xorps %xmm3, %xmm3 ; CHECK-NEXT: ja .LBB117_4 @@ -4645,7 +4645,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; CHECK-NEXT: movzbl %dl, %edx ; CHECK-NEXT: shlq $63, %rdx ; CHECK-NEXT: xorq %rcx, %rdx -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: comiss %xmm2, %xmm1 ; CHECK-NEXT: ja .LBB117_6 ; CHECK-NEXT: # %bb.5: # %entry @@ -4661,8 +4661,8 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; ; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm2 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; AVX1-NEXT: vcomiss %xmm2, %xmm0 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -4677,7 +4677,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; AVX1-NEXT: shlq $63, %rcx ; AVX1-NEXT: xorq %rax, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm3 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX1-NEXT: vcomiss %xmm3, %xmm0 ; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: ja .LBB117_4 @@ -4692,7 +4692,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f32() #0 { ; AVX1-NEXT: xorq %rax, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm3 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm3 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX1-NEXT: vcomiss %xmm3, %xmm0 ; AVX1-NEXT: ja .LBB117_6 ; AVX1-NEXT: # %bb.5: # %entry @@ -4730,8 +4730,8 @@ entry: define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero 
-; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm2 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: comiss %xmm0, %xmm2 ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: xorps %xmm3, %xmm3 @@ -4745,7 +4745,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: shlq $63, %rax ; CHECK-NEXT: xorq %rcx, %rax -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: comiss %xmm0, %xmm2 ; CHECK-NEXT: xorps %xmm4, %xmm4 ; CHECK-NEXT: ja .LBB118_4 @@ -4760,7 +4760,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; CHECK-NEXT: shlq $63, %rcx ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: movq %rcx, %xmm0 -; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm4 = [4.5E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: comiss %xmm4, %xmm2 ; CHECK-NEXT: xorps %xmm5, %xmm5 ; CHECK-NEXT: ja .LBB118_6 @@ -4775,7 +4775,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; CHECK-NEXT: shlq $63, %rcx ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: movq %rcx, %xmm3 -; CHECK-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm4 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: comiss %xmm4, %xmm2 ; CHECK-NEXT: ja .LBB118_8 ; CHECK-NEXT: # %bb.7: # %entry @@ -4793,8 +4793,8 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; ; AVX1-LABEL: constrained_vector_fptoui_v4i64_v4f32: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm2 = [4.5E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [9.22337203E+18,0.0E+0,0.0E+0,0.0E+0] ; AVX1-NEXT: vcomiss %xmm2, %xmm0 ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3 @@ -4808,7 +4808,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; AVX1-NEXT: movzbl %al, %eax ; AVX1-NEXT: shlq $63, %rax ; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm3 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX1-NEXT: vcomiss %xmm3, %xmm0 ; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: ja .LBB118_4 @@ -4823,7 +4823,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; AVX1-NEXT: shlq $63, %rcx ; AVX1-NEXT: xorq %rax, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm3 -; AVX1-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm4 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX1-NEXT: vcomiss %xmm4, %xmm0 ; AVX1-NEXT: vxorps %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: ja .LBB118_6 @@ -4838,7 +4838,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f32() #0 { ; AVX1-NEXT: shlq $63, %rcx ; AVX1-NEXT: xorq %rax, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm3 -; AVX1-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; AVX1-NEXT: vmovss {{.*#+}} xmm4 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX1-NEXT: vcomiss %xmm4, %xmm0 ; AVX1-NEXT: ja .LBB118_8 ; AVX1-NEXT: # %bb.7: # %entry @@ -5032,8 +5032,8 @@ entry: define <1 x i64> @constrained_vector_fptoui_v1i64_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v1i64_v1f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; 
CHECK-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0] ; CHECK-NEXT: comisd %xmm0, %xmm2 ; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: ja .LBB123_2 @@ -5050,8 +5050,8 @@ define <1 x i64> @constrained_vector_fptoui_v1i64_v1f64() #0 { ; ; AVX1-LABEL: constrained_vector_fptoui_v1i64_v1f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0] ; AVX1-NEXT: vcomisd %xmm0, %xmm1 ; AVX1-NEXT: vxorpd %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: ja .LBB123_2 @@ -5080,8 +5080,8 @@ entry: define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v2i64_v2f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0] ; CHECK-NEXT: comisd %xmm2, %xmm1 ; CHECK-NEXT: xorpd %xmm0, %xmm0 ; CHECK-NEXT: xorpd %xmm3, %xmm3 @@ -5096,7 +5096,7 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() #0 { ; CHECK-NEXT: shlq $63, %rcx ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: movq %rcx, %xmm2 -; CHECK-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm3 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: comisd %xmm3, %xmm1 ; CHECK-NEXT: ja .LBB124_4 ; CHECK-NEXT: # %bb.3: # %entry @@ -5114,8 +5114,8 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() #0 { ; ; AVX1-LABEL: constrained_vector_fptoui_v2i64_v2f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [9.2233720368547758E+18,0.0E+0] ; AVX1-NEXT: vcomisd %xmm2, %xmm0 ; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3 @@ -5130,7 +5130,7 @@ define <2 x i64> @constrained_vector_fptoui_v2i64_v2f64() #0 { ; AVX1-NEXT: shlq $63, %rcx ; AVX1-NEXT: xorq %rax, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = [4.2100000000000001E+1,0.0E+0] ; AVX1-NEXT: vcomisd %xmm3, %xmm0 ; AVX1-NEXT: ja .LBB124_4 ; AVX1-NEXT: # %bb.3: # %entry @@ -5172,8 +5172,8 @@ entry: define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v3i64_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [9.2233720368547758E+18,0.0E+0] ; CHECK-NEXT: comisd %xmm2, %xmm1 ; CHECK-NEXT: xorpd %xmm0, %xmm0 ; CHECK-NEXT: xorpd %xmm3, %xmm3 @@ -5187,7 +5187,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: shlq $63, %rax ; CHECK-NEXT: xorq %rcx, %rax -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0] ; CHECK-NEXT: comisd %xmm2, %xmm1 ; CHECK-NEXT: xorpd %xmm3, %xmm3 ; CHECK-NEXT: ja .LBB125_4 @@ -5200,7 +5200,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; CHECK-NEXT: movzbl %dl, %edx ; CHECK-NEXT: shlq $63, %rdx ; CHECK-NEXT: xorq %rcx, %rdx -; CHECK-NEXT: movsd {{.*#+}} xmm2 
= mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [4.2299999999999997E+1,0.0E+0] ; CHECK-NEXT: comisd %xmm2, %xmm1 ; CHECK-NEXT: ja .LBB125_6 ; CHECK-NEXT: # %bb.5: # %entry @@ -5216,8 +5216,8 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; ; AVX1-LABEL: constrained_vector_fptoui_v3i64_v3f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = [4.2200000000000003E+1,0.0E+0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [9.2233720368547758E+18,0.0E+0] ; AVX1-NEXT: vcomisd %xmm2, %xmm0 ; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3 @@ -5232,7 +5232,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; AVX1-NEXT: shlq $63, %rcx ; AVX1-NEXT: xorq %rax, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm2 -; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = [4.2100000000000001E+1,0.0E+0] ; AVX1-NEXT: vcomisd %xmm3, %xmm0 ; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: ja .LBB125_4 @@ -5247,7 +5247,7 @@ define <3 x i64> @constrained_vector_fptoui_v3i64_v3f64() #0 { ; AVX1-NEXT: xorq %rax, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm3 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] -; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = [4.2299999999999997E+1,0.0E+0] ; AVX1-NEXT: vcomisd %xmm3, %xmm0 ; AVX1-NEXT: ja .LBB125_6 ; AVX1-NEXT: # %bb.5: # %entry @@ -5285,8 +5285,8 @@ entry: define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; CHECK-LABEL: constrained_vector_fptoui_v4i64_v4f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm2 = [9.2233720368547758E+18,0.0E+0] ; CHECK-NEXT: comisd %xmm0, %xmm2 ; CHECK-NEXT: xorpd %xmm1, %xmm1 ; CHECK-NEXT: xorpd %xmm3, %xmm3 @@ -5300,7 +5300,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; CHECK-NEXT: movzbl %al, %eax ; CHECK-NEXT: shlq $63, %rax ; CHECK-NEXT: xorq %rcx, %rax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: comisd %xmm0, %xmm2 ; CHECK-NEXT: xorpd %xmm4, %xmm4 ; CHECK-NEXT: ja .LBB126_4 @@ -5315,7 +5315,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; CHECK-NEXT: shlq $63, %rcx ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: movq %rcx, %xmm0 -; CHECK-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm4 = [4.2399999999999999E+1,0.0E+0] ; CHECK-NEXT: comisd %xmm4, %xmm2 ; CHECK-NEXT: xorpd %xmm5, %xmm5 ; CHECK-NEXT: ja .LBB126_6 @@ -5330,7 +5330,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; CHECK-NEXT: shlq $63, %rcx ; CHECK-NEXT: xorq %rax, %rcx ; CHECK-NEXT: movq %rcx, %xmm3 -; CHECK-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm4 = [4.2299999999999997E+1,0.0E+0] ; CHECK-NEXT: comisd %xmm4, %xmm2 ; CHECK-NEXT: ja .LBB126_8 ; CHECK-NEXT: # %bb.7: # %entry @@ -5348,8 +5348,8 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; ; AVX1-LABEL: constrained_vector_fptoui_v4i64_v4f64: ; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm2 = [4.2399999999999999E+1,0.0E+0] +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = 
[9.2233720368547758E+18,0.0E+0] ; AVX1-NEXT: vcomisd %xmm2, %xmm0 ; AVX1-NEXT: vxorpd %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: vxorpd %xmm3, %xmm3, %xmm3 @@ -5363,7 +5363,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; AVX1-NEXT: movzbl %al, %eax ; AVX1-NEXT: shlq $63, %rax ; AVX1-NEXT: xorq %rcx, %rax -; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm3 = [4.2299999999999997E+1,0.0E+0] ; AVX1-NEXT: vcomisd %xmm3, %xmm0 ; AVX1-NEXT: vxorpd %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: ja .LBB126_4 @@ -5378,7 +5378,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; AVX1-NEXT: shlq $63, %rcx ; AVX1-NEXT: xorq %rax, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm3 -; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = [4.2200000000000003E+1,0.0E+0] ; AVX1-NEXT: vcomisd %xmm4, %xmm0 ; AVX1-NEXT: vxorpd %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: ja .LBB126_6 @@ -5393,7 +5393,7 @@ define <4 x i64> @constrained_vector_fptoui_v4i64_v4f64() #0 { ; AVX1-NEXT: shlq $63, %rcx ; AVX1-NEXT: xorq %rax, %rcx ; AVX1-NEXT: vmovq %rcx, %xmm3 -; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm4 = [4.2100000000000001E+1,0.0E+0] ; AVX1-NEXT: vcomisd %xmm4, %xmm0 ; AVX1-NEXT: ja .LBB126_8 ; AVX1-NEXT: # %bb.7: # %entry @@ -5443,13 +5443,13 @@ entry: define <1 x float> @constrained_vector_fptrunc_v1f64() #0 { ; CHECK-LABEL: constrained_vector_fptrunc_v1f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptrunc_v1f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -5481,24 +5481,24 @@ entry: define <3 x float> @constrained_vector_fptrunc_v3f64() #0 { ; CHECK-LABEL: constrained_vector_fptrunc_v3f64: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; CHECK-NEXT: cvtsd2ss %xmm0, %xmm1 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] ; CHECK-NEXT: cvtsd2ss %xmm0, %xmm0 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [4.2299999999999997E+1,0.0E+0] ; CHECK-NEXT: cvtsd2ss %xmm1, %xmm1 ; CHECK-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fptrunc_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] ; AVX-NEXT: vcvtsd2ss %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2100000000000001E+1,0.0E+0] ; AVX-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2299999999999997E+1,0.0E+0] ; AVX-NEXT: vcvtsd2ss %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX-NEXT: retq @@ -5535,13 +5535,13 @@ entry: define <1 x double> @constrained_vector_fpext_v1f32() #0 { ; CHECK-LABEL: constrained_vector_fpext_v1f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} 
xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: cvtss2sd %xmm0, %xmm0 ; CHECK-NEXT: retq ; ; AVX-LABEL: constrained_vector_fpext_v1f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -5571,11 +5571,11 @@ entry: define <3 x double> @constrained_vector_fpext_v3f32() #0 { ; CHECK-LABEL: constrained_vector_fpext_v3f32: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: cvtss2sd %xmm0, %xmm1 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: cvtss2sd %xmm0, %xmm0 -; CHECK-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm2 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: cvtss2sd %xmm2, %xmm2 ; CHECK-NEXT: movsd %xmm2, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl -{{[0-9]+}}(%rsp) @@ -5584,12 +5584,12 @@ define <3 x double> @constrained_vector_fpext_v3f32() #0 { ; ; AVX-LABEL: constrained_vector_fpext_v3f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vcvtss2sd %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vcvtss2sd %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX-NEXT: retq @@ -5625,7 +5625,7 @@ define <1 x float> @constrained_vector_ceil_v1f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq ceilf@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -5633,7 +5633,7 @@ define <1 x float> @constrained_vector_ceil_v1f32() #0 { ; ; AVX-LABEL: constrained_vector_ceil_v1f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -5648,10 +5648,10 @@ define <2 x double> @constrained_vector_ceil_v2f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.8999999999999999E+0,0.0E+0] ; CHECK-NEXT: callq ceil@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.1000000000000001E+0,0.0E+0] ; CHECK-NEXT: callq ceil@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] @@ -5675,13 +5675,13 @@ define <3 x float> @constrained_vector_ceil_v3f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = 
[3.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq ceilf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq ceilf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [2.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq ceilf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -5694,11 +5694,11 @@ define <3 x float> @constrained_vector_ceil_v3f32() #0 { ; ; AVX-LABEL: constrained_vector_ceil_v3f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $10, %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $10, %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm2 = [2.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $10, %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] @@ -5715,13 +5715,13 @@ define <3 x double> @constrained_vector_ceil_v3f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.8999999999999999E+0,0.0E+0] ; CHECK-NEXT: callq ceil@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.1000000000000001E+0,0.0E+0] ; CHECK-NEXT: callq ceil@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.5E+0,0.0E+0] ; CHECK-NEXT: callq ceil@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) @@ -5736,7 +5736,7 @@ define <3 x double> @constrained_vector_ceil_v3f64() #0 { ; ; AVX-LABEL: constrained_vector_ceil_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.5E+0,0.0E+0] ; AVX-NEXT: vroundsd $10, %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vroundpd $10, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5753,7 +5753,7 @@ define <1 x float> @constrained_vector_floor_v1f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq floorf@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -5761,7 +5761,7 @@ define <1 x float> @constrained_vector_floor_v1f32() #0 { ; ; AVX-LABEL: constrained_vector_floor_v1f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -5777,10 +5777,10 @@ define <2 x double> @constrained_vector_floor_v2f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = 
mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.8999999999999999E+0,0.0E+0] ; CHECK-NEXT: callq floor@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.1000000000000001E+0,0.0E+0] ; CHECK-NEXT: callq floor@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] @@ -5804,13 +5804,13 @@ define <3 x float> @constrained_vector_floor_v3f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq floorf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq floorf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [2.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq floorf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -5823,11 +5823,11 @@ define <3 x float> @constrained_vector_floor_v3f32() #0 { ; ; AVX-LABEL: constrained_vector_floor_v3f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $9, %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $9, %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm2 = [2.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $9, %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] @@ -5844,13 +5844,13 @@ define <3 x double> @constrained_vector_floor_v3f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.8999999999999999E+0,0.0E+0] ; CHECK-NEXT: callq floor@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.1000000000000001E+0,0.0E+0] ; CHECK-NEXT: callq floor@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.5E+0,0.0E+0] ; CHECK-NEXT: callq floor@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) @@ -5865,7 +5865,7 @@ define <3 x double> @constrained_vector_floor_v3f64() #0 { ; ; AVX-LABEL: constrained_vector_floor_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.5E+0,0.0E+0] ; AVX-NEXT: vroundsd $9, %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vroundpd $9, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -5882,7 +5882,7 @@ define <1 x float> @constrained_vector_round_v1f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 
= [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq roundf@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -5892,7 +5892,7 @@ define <1 x float> @constrained_vector_round_v1f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: pushq %rax ; AVX-NEXT: .cfi_def_cfa_offset 16 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq roundf@PLT ; AVX-NEXT: popq %rax ; AVX-NEXT: .cfi_def_cfa_offset 8 @@ -5909,10 +5909,10 @@ define <2 x double> @constrained_vector_round_v2f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.8999999999999999E+0,0.0E+0] ; CHECK-NEXT: callq round@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.1000000000000001E+0,0.0E+0] ; CHECK-NEXT: callq round@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] @@ -5924,10 +5924,10 @@ define <2 x double> @constrained_vector_round_v2f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $24, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 32 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.8999999999999999E+0,0.0E+0] ; AVX-NEXT: callq round@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.1000000000000001E+0,0.0E+0] ; AVX-NEXT: callq round@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] @@ -5946,13 +5946,13 @@ define <3 x float> @constrained_vector_round_v3f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq roundf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq roundf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [2.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq roundf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -5967,13 +5967,13 @@ define <3 x float> @constrained_vector_round_v3f32() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq roundf@PLT ; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq roundf@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [2.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: callq roundf@PLT ; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] @@ -5995,13 +5995,13 @@ define <3 x double> @constrained_vector_round_v3f64() #0 
{ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.8999999999999999E+0,0.0E+0] ; CHECK-NEXT: callq round@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.1000000000000001E+0,0.0E+0] ; CHECK-NEXT: callq round@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.5E+0,0.0E+0] ; CHECK-NEXT: callq round@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) @@ -6018,15 +6018,15 @@ define <3 x double> @constrained_vector_round_v3f64() #0 { ; AVX: # %bb.0: # %entry ; AVX-NEXT: subq $40, %rsp ; AVX-NEXT: .cfi_def_cfa_offset 48 -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.8999999999999999E+0,0.0E+0] ; AVX-NEXT: callq round@PLT ; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.1000000000000001E+0,0.0E+0] ; AVX-NEXT: callq round@PLT ; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload ; AVX-NEXT: # xmm0 = xmm0[0],mem[0] ; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.5E+0,0.0E+0] ; AVX-NEXT: vzeroupper ; AVX-NEXT: callq round@PLT ; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload @@ -6046,7 +6046,7 @@ define <1 x float> @constrained_vector_trunc_v1f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: pushq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 16 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq truncf@PLT ; CHECK-NEXT: popq %rax ; CHECK-NEXT: .cfi_def_cfa_offset 8 @@ -6054,7 +6054,7 @@ define <1 x float> @constrained_vector_trunc_v1f32() #0 { ; ; AVX-LABEL: constrained_vector_trunc_v1f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq entry: @@ -6069,10 +6069,10 @@ define <2 x double> @constrained_vector_trunc_v2f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.8999999999999999E+0,0.0E+0] ; CHECK-NEXT: callq trunc@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.1000000000000001E+0,0.0E+0] ; CHECK-NEXT: callq trunc@PLT ; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload ; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] @@ -6096,13 +6096,13 @@ define <3 x float> @constrained_vector_trunc_v3f32() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $40, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 48 -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq truncf@PLT ; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq truncf@PLT ; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill -; CHECK-NEXT: 
movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [2.5E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: callq truncf@PLT ; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload ; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] @@ -6115,11 +6115,11 @@ define <3 x float> @constrained_vector_trunc_v3f32() #0 { ; ; AVX-LABEL: constrained_vector_trunc_v3f32: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [3.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $11, %xmm0, %xmm0, %xmm0 -; AVX-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [1.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $11, %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm2 = [2.5E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vroundss $11, %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1],xmm0[0],xmm1[3] @@ -6136,13 +6136,13 @@ define <3 x double> @constrained_vector_trunc_v3f64() #0 { ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: subq $24, %rsp ; CHECK-NEXT: .cfi_def_cfa_offset 32 -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.8999999999999999E+0,0.0E+0] ; CHECK-NEXT: callq trunc@PLT ; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.1000000000000001E+0,0.0E+0] ; CHECK-NEXT: callq trunc@PLT ; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.5E+0,0.0E+0] ; CHECK-NEXT: callq trunc@PLT ; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) @@ -6157,7 +6157,7 @@ define <3 x double> @constrained_vector_trunc_v3f64() #0 { ; ; AVX-LABEL: constrained_vector_trunc_v3f64: ; AVX: # %bb.0: # %entry -; AVX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [1.5E+0,0.0E+0] ; AVX-NEXT: vroundsd $11, %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vroundpd $11, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll index 9dd2a045087da..606beeaff750e 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-fadd.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-fadd.ll @@ -1901,17 +1901,17 @@ define double @test_v16f64_undef(<16 x double> %a0) { define float @PR64627() { ; SSE-LABEL: PR64627: ; SSE: # %bb.0: -; SSE-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; SSE-NEXT: movss {{.*#+}} xmm0 = [5.0E+0,0.0E+0,0.0E+0,0.0E+0] ; SSE-NEXT: retq ; ; AVX-LABEL: PR64627: ; AVX: # %bb.0: -; AVX-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [5.0E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: retq ; ; AVX512-LABEL: PR64627: ; AVX512: # %bb.0: -; AVX512-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = [5.0E+0,0.0E+0,0.0E+0,0.0E+0] ; AVX512-NEXT: retq %1 = bitcast i5 0 to <5 x i1> %2 = select <5 x i1> %1, <5 x float> zeroinitializer, <5 x float> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index be4253b6d5d10..d0334dfb66e8a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -3482,7 +3482,7 
@@ define void @SpinningCube() { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 ; SSE2-NEXT: movaps {{.*#+}} xmm0 = -; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSE2-NEXT: movapd {{.*#+}} xmm2 = ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: xorps %xmm3, %xmm3 @@ -3501,7 +3501,7 @@ define void @SpinningCube() { ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 ; SSSE3-NEXT: movaps {{.*#+}} xmm0 = -; SSSE3-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSSE3-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSSE3-NEXT: movapd {{.*#+}} xmm2 = ; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSSE3-NEXT: xorps %xmm3, %xmm3 @@ -3521,7 +3521,7 @@ define void @SpinningCube() { ; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 ; SSE41-NEXT: movaps {{.*#+}} xmm0 = ; SSE41-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> -; SSE41-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; SSE41-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSE41-NEXT: movaps %xmm1, %xmm3 ; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0] ; SSE41-NEXT: movaps %xmm0, %xmm4 @@ -3540,7 +3540,7 @@ define void @SpinningCube() { ; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> -; AVX-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] ; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] ; AVX-NEXT: vaddps %xmm2, %xmm3, %xmm2 diff --git a/llvm/test/CodeGen/X86/vselect-zero.ll b/llvm/test/CodeGen/X86/vselect-zero.ll index 1b576b28ce831..3a53a7b852233 100644 --- a/llvm/test/CodeGen/X86/vselect-zero.ll +++ b/llvm/test/CodeGen/X86/vselect-zero.ll @@ -117,7 +117,7 @@ define double @fsel_nonzero_false_val(double %x, double %y, double %z) { ; SSE: # %bb.0: ; SSE-NEXT: cmpeqsd %xmm1, %xmm0 ; SSE-NEXT: andpd %xmm0, %xmm2 -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] ; SSE-NEXT: andnpd %xmm1, %xmm0 ; SSE-NEXT: orpd %xmm2, %xmm0 ; SSE-NEXT: retq @@ -133,7 +133,7 @@ define double @fsel_nonzero_false_val(double %x, double %y, double %z) { ; AVX512-LABEL: fsel_nonzero_false_val: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpeqsd %xmm1, %xmm0, %k1 -; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; AVX512-NEXT: vmovsd %xmm2, %xmm0, %xmm0 {%k1} ; AVX512-NEXT: retq %cond = fcmp oeq double %x, %y @@ -145,7 +145,7 @@ define double @fsel_nonzero_true_val(double %x, double %y, double %z) { ; SSE-LABEL: fsel_nonzero_true_val: ; SSE: # %bb.0: ; SSE-NEXT: cmpeqsd %xmm1, %xmm0 -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] ; SSE-NEXT: andpd %xmm0, %xmm1 ; SSE-NEXT: andnpd %xmm2, %xmm0 ; SSE-NEXT: orpd %xmm1, %xmm0 @@ -188,7 +188,7 @@ define double @fsel_nonzero_constants(double %x, double %y) { ; AVX512-LABEL: fsel_nonzero_constants: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmpeqsd %xmm1, %xmm0, %k1 -; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; AVX512-NEXT: vmovsd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} ; AVX512-NEXT: retq %cond = fcmp oeq double %x, %y @@ -200,7 +200,7 
@@ define <2 x double> @vsel_nonzero_constants(<2 x double> %x, <2 x double> %y) { ; SSE2-LABEL: vsel_nonzero_constants: ; SSE2: # %bb.0: ; SSE2-NEXT: cmplepd %xmm0, %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE2-NEXT: movsd {{.*#+}} xmm2 = [4.2E+1,0.0E+0] ; SSE2-NEXT: movapd %xmm1, %xmm0 ; SSE2-NEXT: andnpd %xmm2, %xmm0 ; SSE2-NEXT: andpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 @@ -210,7 +210,7 @@ define <2 x double> @vsel_nonzero_constants(<2 x double> %x, <2 x double> %y) { ; SSE42-LABEL: vsel_nonzero_constants: ; SSE42: # %bb.0: ; SSE42-NEXT: cmplepd %xmm0, %xmm1 -; SSE42-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE42-NEXT: movsd {{.*#+}} xmm2 = [4.2E+1,0.0E+0] ; SSE42-NEXT: movapd %xmm1, %xmm0 ; SSE42-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 ; SSE42-NEXT: movapd %xmm2, %xmm0 @@ -219,14 +219,14 @@ define <2 x double> @vsel_nonzero_constants(<2 x double> %x, <2 x double> %y) { ; AVX-LABEL: vsel_nonzero_constants: ; AVX: # %bb.0: ; AVX-NEXT: vcmplepd %xmm0, %xmm1, %xmm0 -; AVX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [4.2E+1,0.0E+0] ; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; AVX-NEXT: retq ; ; AVX512-LABEL: vsel_nonzero_constants: ; AVX512: # %bb.0: ; AVX512-NEXT: vcmplepd %xmm0, %xmm1, %k1 -; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] ; AVX512-NEXT: vmovapd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} ; AVX512-NEXT: retq %cond = fcmp oge <2 x double> %x, %y diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll index 8c7535f616913..929671d674e5e 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll @@ -1334,7 +1334,7 @@ define void @load_16byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst ; no @load_32byte_chunk_of_32byte_alloca ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; ALL: {{.*}} -; X86-NO-SHLD: {{.*}} -; X86-SHLD: {{.*}} ; X64-NO-SHLD: {{.*}} ; X64-SHLD: {{.*}} +; X86-NO-SHLD: {{.*}} +; X86-SHLD: {{.*}} diff --git a/llvm/test/CodeGen/X86/x86-64-varargs.ll b/llvm/test/CodeGen/X86/x86-64-varargs.ll index 884baf174d0aa..f947327d4c562 100644 --- a/llvm/test/CodeGen/X86/x86-64-varargs.ll +++ b/llvm/test/CodeGen/X86/x86-64-varargs.ll @@ -579,9 +579,9 @@ define i32 @main() nounwind { ; CHECK-X32: # %bb.0: # %entry ; CHECK-X32-NEXT: pushq %rax ; CHECK-X32-NEXT: movl $12, (%esp) -; CHECK-X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-X32-NEXT: movsd {{.*#+}} xmm0 = [4.5E+15,0.0E+0] ; CHECK-X32-NEXT: movabsq $123456677890, %r8 # imm = 0x1CBE976802 -; CHECK-X32-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; CHECK-X32-NEXT: movsd {{.*#+}} xmm1 = [1.2450000047683716E+0,0.0E+0] ; CHECK-X32-NEXT: movl $1, %edi ; CHECK-X32-NEXT: movl $2, %esi ; CHECK-X32-NEXT: movl $3, %edx From 4d11f04b20f0bd7488e19e8f178ba028412fa519 Mon Sep 17 00:00:00 2001 From: Alexey Bataev <5361294+alexey-bataev@users.noreply.github.com> Date: Fri, 19 Jan 2024 09:29:01 -0500 Subject: [PATCH 137/843] [InstCombine] Try to fold trunc(shuffle(zext)) to just a shuffle (#78636) Tries to remove extra trunc/ext instruction for shufflevector instructions. 
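For illustration, here is a hand-written sketch of the kind of IR this enables narrowing for (the value names and element types are hypothetical, not taken from this patch's tests). A truncated shuffle of widened operands, e.g.

  %z0 = zext <4 x i8> %a to <4 x i32>
  %z1 = zext <4 x i8> %b to <4 x i32>
  %s = shufflevector <4 x i32> %z0, <4 x i32> %z1, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  %r = trunc <4 x i32> %s to <4 x i8>

should now fold to a single shuffle of the narrow operands:

  %r = shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>

The trunc disappears because the shufflevector is evaluated directly in the destination type, mirroring the existing handling of binary operators in EvaluateInDifferentType.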
--- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 10 ++++++++++ .../InstCombine/logical-select-inseltpoison.ll | 7 ++----- llvm/test/Transforms/InstCombine/logical-select.ll | 7 ++----- 3 files changed, 14 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 6629ca840a67c..3470e61cd597f 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -103,6 +103,13 @@ Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty, } } break; + case Instruction::ShuffleVector: { + Value *Op0 = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned); + Value *Op1 = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned); + Res = new ShuffleVectorInst(Op0, Op1, + cast<ShuffleVectorInst>(I)->getShuffleMask()); + break; + } default: // TODO: Can handle more cases here. llvm_unreachable("Unreachable!"); @@ -363,6 +370,9 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombinerImpl &IC, I->getOpcode() == Instruction::FPToSI); return Ty->getScalarSizeInBits() >= MinBitWidth; } + case Instruction::ShuffleVector: + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); default: // TODO: Can handle more cases here. break; diff --git a/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll b/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll index 29e2cb42e1bef..b3d147621b59e 100644 --- a/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll @@ -671,11 +671,8 @@ define <4 x i32> @computesignbits_through_shuffles(<4 x float> %x, <4 x float> % define <4 x i32> @computesignbits_through_two_input_shuffle(<4 x i32> %x, <4 x i32> %y, <4 x i1> %cond1, <4 x i1> %cond2) { ; CHECK-LABEL: @computesignbits_through_two_input_shuffle( -; CHECK-NEXT: [[SEXT1:%.*]] = sext <4 x i1> [[COND1:%.*]] to <4 x i32> -; CHECK-NEXT: [[SEXT2:%.*]] = sext <4 x i1> [[COND2:%.*]] to <4 x i32> -; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i32> [[SEXT1]], <4 x i32> [[SEXT2]], <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[COND]] to <4 x i1> -; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] +; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i1> [[COND1:%.*]], <4 x i1> [[COND2:%.*]], <4 x i32> +; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[COND]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[SEL]] ; %sext1 = sext <4 x i1> %cond1 to <4 x i32> diff --git a/llvm/test/Transforms/InstCombine/logical-select.ll b/llvm/test/Transforms/InstCombine/logical-select.ll index fcca9588767dd..af1a3e1455e49 100644 --- a/llvm/test/Transforms/InstCombine/logical-select.ll +++ b/llvm/test/Transforms/InstCombine/logical-select.ll @@ -707,11 +707,8 @@ define <4 x i32> @computesignbits_through_shuffles(<4 x float> %x, <4 x float> % define <4 x i32> @computesignbits_through_two_input_shuffle(<4 x i32> %x, <4 x i32> %y, <4 x i1> %cond1, <4 x i1> %cond2) { ; CHECK-LABEL: @computesignbits_through_two_input_shuffle( -; CHECK-NEXT: [[SEXT1:%.*]] = sext <4 x i1> [[COND1:%.*]] to <4 x i32> -; CHECK-NEXT: [[SEXT2:%.*]] = sext <4 x i1> [[COND2:%.*]] to <4 x i32> -; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i32> [[SEXT1]], <4 x i32> [[SEXT2]], <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[COND]] to <4 x i1> -; CHECK-NEXT: [[SEL:%.*]] = 
select <4 x i1> [[TMP1]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] +; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i1> [[COND1:%.*]], <4 x i1> [[COND2:%.*]], <4 x i32> +; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[COND]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[SEL]] ; %sext1 = sext <4 x i1> %cond1 to <4 x i32> From a0b911745494e3c2f53a27f23c536314818ce0b1 Mon Sep 17 00:00:00 2001 From: Manish Kausik H <46352931+Nirhar@users.noreply.github.com> Date: Fri, 19 Jan 2024 20:00:20 +0530 Subject: [PATCH 138/843] LoopDeletion: Move EH pad check before the isLoopNeverExecuted check (#78189) This commit modifies `LoopDeletion::deleteLoopIfDead` to check if the exit block of a loop is an EH pad before checking if the loop gets executed. This handles the case where an unreachable loop has a landingpad as an Exit block, and the loop gets deleted, leaving the landingpad without an edge from an unwind clause. Fixes #76852. --- llvm/lib/Transforms/Scalar/LoopDeletion.cpp | 14 +++---- .../loop-with-ehpad-not-executed.ll | 37 +++++++++++++++++++ 2 files changed, 44 insertions(+), 7 deletions(-) create mode 100644 llvm/test/Transforms/LoopDeletion/loop-with-ehpad-not-executed.ll diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp index c041e3621a16b..bfe9374cf2f8c 100644 --- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp @@ -452,6 +452,13 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, BasicBlock *ExitBlock = L->getUniqueExitBlock(); + // We can't directly branch to an EH pad. Don't bother handling this edge + // case. + if (ExitBlock && ExitBlock->isEHPad()) { + LLVM_DEBUG(dbgs() << "Cannot delete loop exiting to EH pad.\n"); + return LoopDeletionResult::Unmodified; + } + if (ExitBlock && isLoopNeverExecuted(L)) { LLVM_DEBUG(dbgs() << "Loop is proven to never execute, delete it!\n"); // We need to forget the loop before setting the incoming values of the exit @@ -487,13 +494,6 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT, return LoopDeletionResult::Unmodified; } - // We can't directly branch to an EH pad. Don't bother handling this edge - // case. - if (ExitBlock && ExitBlock->isEHPad()) { - LLVM_DEBUG(dbgs() << "Cannot delete loop exiting to EH pad.\n"); - return LoopDeletionResult::Unmodified; - } - // Finally, we have to check that the loop really is dead.
bool Changed = false; if (!isLoopDead(L, SE, ExitingBlocks, ExitBlock, Changed, Preheader, LI)) { diff --git a/llvm/test/Transforms/LoopDeletion/loop-with-ehpad-not-executed.ll b/llvm/test/Transforms/LoopDeletion/loop-with-ehpad-not-executed.ll new file mode 100644 index 0000000000000..e5871df96d3a9 --- /dev/null +++ b/llvm/test/Transforms/LoopDeletion/loop-with-ehpad-not-executed.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt %s -passes=loop-deletion -S | FileCheck %s + +define void @wombat() personality ptr null { +; CHECK-LABEL: define void @wombat() personality ptr null { +; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 false, label [[BB1:%.*]], label [[BB4:%.*]] +; CHECK: bb1: +; CHECK-NEXT: br label [[BB2:%.*]] +; CHECK: bb2: +; CHECK-NEXT: [[INVOKE:%.*]] = invoke double null() +; CHECK-NEXT: to label [[BB2]] unwind label [[BB3:%.*]] +; CHECK: bb3: +; CHECK-NEXT: [[LANDINGPAD:%.*]] = landingpad { ptr, i32 } +; CHECK-NEXT: cleanup +; CHECK-NEXT: ret void +; CHECK: bb4: +; CHECK-NEXT: ret void +; +bb: + br i1 false, label %bb1, label %bb4 + +bb1: ; preds = %bb + br label %bb2 + +bb2: ; preds = %bb1, %bb2 + %invoke = invoke double null() + to label %bb2 unwind label %bb3 + +bb3: ; preds = %bb2 + %landingpad = landingpad { ptr, i32 } + cleanup + ret void + +bb4: ; preds = %bb + ret void +} From ed123880826cd9a4acdcd05409b1fac4203a436a Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Jan 2024 14:36:27 +0000 Subject: [PATCH 139/843] [AMDGPU] Do not emit `V_DOT2C_F32_F16_e32` on GFX12 (#78709) That instruction is not supported on GFX12. Added a testcase which previously crashed without this change. Co-authored-by: pvanhout --- clang/test/CodeGenOpenCL/amdgpu-features.cl | 4 ++-- llvm/lib/Target/AMDGPU/AMDGPU.td | 1 - llvm/lib/TargetParser/TargetParser.cpp | 1 - llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll | 4 ++++ 4 files changed, 6 insertions(+), 4 deletions(-) diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 7495bca72a9df..1ba2b129f6895 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -100,8 +100,8 @@ // GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1201: 
"target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" +// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 7b7fa906b2b1a..92985f971f17a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1487,7 +1487,6 @@ def FeatureISAVersion12 : FeatureSet< [FeatureGFX12, FeatureLDSBankCount32, FeatureDLInsts, - FeatureDot5Insts, FeatureDot7Insts, FeatureDot8Insts, FeatureDot9Insts, diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 2cfe23676d20f..f6d5bfe913b41 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -275,7 +275,6 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, case GK_GFX1201: case GK_GFX1200: Features["ci-insts"] = true; - Features["dot5-insts"] = true; Features["dot7-insts"] = true; Features["dot8-insts"] = true; Features["dot9-insts"] = true; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll index 240997aeb9a68..26e6bde97f499 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.ll @@ -3,12 +3,14 @@ ; RUN: llc -mtriple=amdgcn -mcpu=gfx1011 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1012 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX10 +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GCN,GFX12 declare float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float %c, i1 %clamp) ; GCN-LABEL: {{^}}test_llvm_amdgcn_fdot2_clamp ; GFX9: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} ; GFX10: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} +; GFX12: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}} clamp{{$}} define amdgpu_kernel void @test_llvm_amdgcn_fdot2_clamp( ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -27,6 +29,7 @@ entry: ; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, 
v{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX940: v_dot2c_f32_f16_e32 v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} ; GFX10: {{v_dot2c_f32_f16|v_dot2acc_f32_f16}} v{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX12: v_dot2_f32_f16 v{{[0-9]+}}, s{{[0-9]+}}, s{{[0-9]+}}, v{{[0-9]+}}{{$}} define amdgpu_kernel void @test_llvm_amdgcn_fdot2_no_clamp( ptr addrspace(1) %r, ptr addrspace(1) %a, @@ -44,6 +47,7 @@ entry: ; GFX9-LABEL: {{^}}fdot2_inline_literal ; GFX906: v_dot2_f32_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0 ; GFX940: v_dot2c_f32_f16_e32 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}{{$}} +; GFX12: v_dot2_f32_f16 v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}}, 1.0{{$}} define float @fdot2_inline_literal(<2 x half> %a, <2 x half> %b) { %ret = tail call float @llvm.amdgcn.fdot2(<2 x half> %a, <2 x half> %b, float 1.0, i1 false) ret float %ret From 0d51c8704c647e3eec833b1435d268d8bb693b4e Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Fri, 19 Jan 2024 22:36:24 +0800 Subject: [PATCH 140/843] [X86] Fix -Wsign-compare in X86MCInstLower.cpp (NFC) llvm-project/llvm/lib/Target/X86/X86MCInstLower.cpp:1867:20: error: comparison of integers of different signs: 'int' and 'unsigned int' [-Werror,-Wsign-compare] if (SclWidth == C->getType()->getScalarSizeInBits()) { ~~~~~~~~ ^ ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ 1 error generated. --- llvm/lib/Target/X86/X86MCInstLower.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index fe3d16a949342..6e8e512336997 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1864,7 +1864,7 @@ static void addConstantComments(const MachineInstr *MI, if (auto *C = X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { - if (SclWidth == C->getType()->getScalarSizeInBits()) { + if ((unsigned)SclWidth == C->getType()->getScalarSizeInBits()) { if (auto *CI = dyn_cast(C)) { CS << "["; printConstant(CI->getValue(), CS); From 535b197b8e96b816998ad4b4ee45e011fa05fba9 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Fri, 19 Jan 2024 14:42:19 +0000 Subject: [PATCH 141/843] [gn build] Port 9ff4be640fb1 --- llvm/utils/gn/secondary/llvm/lib/DWARFLinker/BUILD.gn | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/DWARFLinker/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/DWARFLinker/BUILD.gn index 1540e7b4165ba..f84c6863278de 100644 --- a/llvm/utils/gn/secondary/llvm/lib/DWARFLinker/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/DWARFLinker/BUILD.gn @@ -7,5 +7,8 @@ static_library("DWARFLinker") { "//llvm/lib/Object", "//llvm/lib/Support", ] - sources = [ "Utils.cpp" ] + sources = [ + "DWARFLinkerBase.cpp", + "Utils.cpp", + ] } From 9350860824a8badbbfe2ba81804e163543da2173 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Fri, 19 Jan 2024 16:08:16 +0100 Subject: [PATCH 142/843] [AsmParser] Add support for reading incomplete IR (part 1) (#78421) Add an `-allow-incomplete-ir` flag to the IR parser, which allows reading IR with missing declarations. This is intended to produce a best-effort interpretation of the IR, along the same lines of what we would manually do when taking, for example, a function from `-print-after-all` output and fixing it up to be valid IR. This patch only supports dropping references to undeclared metadata, either by dropping metadata attachments from instructions/functions, or by dropping calls to certain intrinsics (like debug intrinsics). 
I will implement support for inserting missing function/global declarations in a followup patch. We don't have real use lists for metadata, so the approach here is to iterate over the whole IR and identify metadata that needs to be dropped. This does not support all possible cases, but should handle anything that's relevant for the function-only IR use case. --- llvm/include/llvm/AsmParser/LLParser.h | 1 + llvm/include/llvm/IR/GlobalObject.h | 1 + llvm/include/llvm/IR/Instruction.h | 3 + llvm/include/llvm/IR/Metadata.h | 7 ++ llvm/include/llvm/IR/Value.h | 3 + llvm/lib/AsmParser/LLParser.cpp | 74 +++++++++++++++++-- llvm/lib/IR/Metadata.cpp | 34 ++++++--- .../incomplete-ir-metadata-unsupported.ll | 9 +++ llvm/test/Assembler/incomplete-ir-metadata.ll | 34 +++++++++ 9 files changed, 151 insertions(+), 15 deletions(-) create mode 100644 llvm/test/Assembler/incomplete-ir-metadata-unsupported.ll create mode 100644 llvm/test/Assembler/incomplete-ir-metadata.ll diff --git a/llvm/include/llvm/AsmParser/LLParser.h b/llvm/include/llvm/AsmParser/LLParser.h index b0d02eac8e672..cf358c384f520 100644 --- a/llvm/include/llvm/AsmParser/LLParser.h +++ b/llvm/include/llvm/AsmParser/LLParser.h @@ -331,6 +331,7 @@ namespace llvm { // Top-Level Entities bool parseTopLevelEntities(); + void dropUnknownMetadataReferences(); bool validateEndOfModule(bool UpgradeDebugInfo); bool validateEndOfIndex(); bool parseTargetDefinitions(DataLayoutCallbackTy DataLayoutCallback); diff --git a/llvm/include/llvm/IR/GlobalObject.h b/llvm/include/llvm/IR/GlobalObject.h index ae8e616824448..b6a974d8bb9f0 100644 --- a/llvm/include/llvm/IR/GlobalObject.h +++ b/llvm/include/llvm/IR/GlobalObject.h @@ -133,6 +133,7 @@ class GlobalObject : public GlobalValue { using Value::addMetadata; using Value::clearMetadata; using Value::eraseMetadata; + using Value::eraseMetadataIf; using Value::getAllMetadata; using Value::getMetadata; using Value::hasMetadata; diff --git a/llvm/include/llvm/IR/Instruction.h b/llvm/include/llvm/IR/Instruction.h index 0211b5076131c..fcd2ba838e7fd 100644 --- a/llvm/include/llvm/IR/Instruction.h +++ b/llvm/include/llvm/IR/Instruction.h @@ -384,6 +384,9 @@ class Instruction : public User, void copyMetadata(const Instruction &SrcInst, ArrayRef WL = ArrayRef()); + /// Erase all metadata that matches the predicate. + void eraseMetadataIf(function_ref Pred); + /// If the instruction has "branch_weights" MD_prof metadata and the MDNode /// has three operands (including name string), swap the order of the /// metadata. diff --git a/llvm/include/llvm/IR/Metadata.h b/llvm/include/llvm/IR/Metadata.h index befb6975ca18d..db1f44fea3b45 100644 --- a/llvm/include/llvm/IR/Metadata.h +++ b/llvm/include/llvm/IR/Metadata.h @@ -417,6 +417,8 @@ class ReplaceableMetadataImpl { /// is resolved. void resolveAllUses(bool ResolveUsers = true); + unsigned getNumUses() const { return UseMap.size(); } + private: void addRef(void *Ref, OwnerTy Owner); void dropRef(void *Ref); @@ -1243,6 +1245,11 @@ class MDNode : public Metadata { bool isReplaceable() const { return isTemporary() || isAlwaysReplaceable(); } bool isAlwaysReplaceable() const { return getMetadataID() == DIAssignIDKind; } + unsigned getNumTemporaryUses() const { + assert(isTemporary() && "Only for temporaries"); + return Context.getReplaceableUses()->getNumUses(); + } + /// RAUW a temporary. /// /// \pre \a isTemporary() must be \c true. 
diff --git a/llvm/include/llvm/IR/Value.h b/llvm/include/llvm/IR/Value.h index 61f9d34eef356..945081b77e953 100644 --- a/llvm/include/llvm/IR/Value.h +++ b/llvm/include/llvm/IR/Value.h @@ -618,6 +618,9 @@ class Value { /// \returns true if any metadata was removed. bool eraseMetadata(unsigned KindID); + /// Erase all metadata attachments matching the given predicate. + void eraseMetadataIf(function_ref Pred); + /// Erase all metadata attached to this Value. void clearMetadata(); diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp index ea7f9a00d719f..d6c5993797de1 100644 --- a/llvm/lib/AsmParser/LLParser.cpp +++ b/llvm/lib/AsmParser/LLParser.cpp @@ -13,8 +13,8 @@ #include "llvm/AsmParser/LLParser.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/AsmParser/LLToken.h" #include "llvm/AsmParser/SlotMapping.h" @@ -32,7 +32,9 @@ #include "llvm/IR/GlobalIFunc.h" #include "llvm/IR/GlobalObject.h" #include "llvm/IR/InlineAsm.h" +#include "llvm/IR/InstIterator.h" #include "llvm/IR/Instructions.h" +#include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" @@ -54,6 +56,12 @@ using namespace llvm; +static cl::opt AllowIncompleteIR( + "allow-incomplete-ir", cl::init(false), cl::Hidden, + cl::desc( + "Allow incomplete IR on a best effort basis (references to unknown " + "metadata will be dropped)")); + static std::string getTypeString(Type *T) { std::string Result; raw_string_ostream Tmp(Result); @@ -123,6 +131,55 @@ void LLParser::restoreParsingState(const SlotMapping *Slots) { std::make_pair(I.first, std::make_pair(I.second, LocTy()))); } +static void dropIntrinsicWithUnknownMetadataArgument(IntrinsicInst *II) { + // White-list intrinsics that are safe to drop. + if (!isa(II) && + II->getIntrinsicID() != Intrinsic::experimental_noalias_scope_decl) + return; + + SmallVector MVs; + for (Value *V : II->args()) + if (auto *MV = dyn_cast(V)) + if (auto *MD = dyn_cast(MV->getMetadata())) + if (MD->isTemporary()) + MVs.push_back(MV); + + if (!MVs.empty()) { + assert(II->use_empty() && "Cannot have uses"); + II->eraseFromParent(); + + // Also remove no longer used MetadataAsValue wrappers. + for (MetadataAsValue *MV : MVs) + if (MV->use_empty()) + delete MV; + } +} + +void LLParser::dropUnknownMetadataReferences() { + auto Pred = [](unsigned MDKind, MDNode *Node) { return Node->isTemporary(); }; + for (Function &F : *M) { + F.eraseMetadataIf(Pred); + for (Instruction &I : make_early_inc_range(instructions(F))) { + I.eraseMetadataIf(Pred); + + if (auto *II = dyn_cast(&I)) + dropIntrinsicWithUnknownMetadataArgument(II); + } + } + + for (GlobalVariable &GV : M->globals()) + GV.eraseMetadataIf(Pred); + + for (const auto &[ID, Info] : make_early_inc_range(ForwardRefMDNodes)) { + // Check whether there is only a single use left, which would be in our + // own NumberedMetadata. + if (Info.first->getNumTemporaryUses() == 1) { + NumberedMetadata.erase(ID); + ForwardRefMDNodes.erase(ID); + } + } +} + /// validateEndOfModule - Do final validity and basic correctness checks at the /// end of the module. 
bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { @@ -284,6 +341,9 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { "use of undefined value '@" + Twine(ForwardRefValIDs.begin()->first) + "'"); + if (AllowIncompleteIR && !ForwardRefMDNodes.empty()) + dropUnknownMetadataReferences(); + if (!ForwardRefMDNodes.empty()) return error(ForwardRefMDNodes.begin()->second.second, "use of undefined metadata '!" + @@ -297,10 +357,14 @@ bool LLParser::validateEndOfModule(bool UpgradeDebugInfo) { for (auto *Inst : InstsWithTBAATag) { MDNode *MD = Inst->getMetadata(LLVMContext::MD_tbaa); - assert(MD && "UpgradeInstWithTBAATag should have a TBAA tag"); - auto *UpgradedMD = UpgradeTBAANode(*MD); - if (MD != UpgradedMD) - Inst->setMetadata(LLVMContext::MD_tbaa, UpgradedMD); + // With incomplete IR, the tbaa metadata may have been dropped. + if (!AllowIncompleteIR) + assert(MD && "UpgradeInstWithTBAATag should have a TBAA tag"); + if (MD) { + auto *UpgradedMD = UpgradeTBAANode(*MD); + if (MD != UpgradedMD) + Inst->setMetadata(LLVMContext::MD_tbaa, UpgradedMD); + } } // Look for intrinsic functions and CallInst that need to be upgraded. We use diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp index eff4fd371a73f..37017a222d485 100644 --- a/llvm/lib/IR/Metadata.cpp +++ b/llvm/lib/IR/Metadata.cpp @@ -1533,6 +1533,21 @@ bool Value::eraseMetadata(unsigned KindID) { return Changed; } +void Value::eraseMetadataIf(function_ref Pred) { + if (!HasMetadata) + return; + + auto &MetadataStore = getContext().pImpl->ValueMetadata; + MDAttachments &Info = MetadataStore.find(this)->second; + assert(!Info.empty() && "bit out of sync with hash table"); + Info.remove_if([Pred](const MDAttachments::Attachment &I) { + return Pred(I.MDKind, I.Node); + }); + + if (Info.empty()) + clearMetadata(); +} + void Value::clearMetadata() { if (!HasMetadata) return; @@ -1556,6 +1571,13 @@ MDNode *Instruction::getMetadataImpl(StringRef Kind) const { return Value::getMetadata(KindID); } +void Instruction::eraseMetadataIf(function_ref Pred) { + if (DbgLoc && Pred(LLVMContext::MD_dbg, DbgLoc.getAsMDNode())) + DbgLoc = {}; + + Value::eraseMetadataIf(Pred); +} + void Instruction::dropUnknownNonDebugMetadata(ArrayRef KnownIDs) { if (!Value::hasMetadata()) return; // Nothing to remove! @@ -1566,17 +1588,9 @@ void Instruction::dropUnknownNonDebugMetadata(ArrayRef KnownIDs) { // A DIAssignID attachment is debug metadata, don't drop it. KnownSet.insert(LLVMContext::MD_DIAssignID); - auto &MetadataStore = getContext().pImpl->ValueMetadata; - MDAttachments &Info = MetadataStore.find(this)->second; - assert(!Info.empty() && "bit out of sync with hash table"); - Info.remove_if([&KnownSet](const MDAttachments::Attachment &I) { - return !KnownSet.count(I.MDKind); + Value::eraseMetadataIf([&KnownSet](unsigned MDKind, MDNode *Node) { + return !KnownSet.count(MDKind); }); - - if (Info.empty()) { - // Drop our entry at the store. 
-    clearMetadata();
-  }
 }
 
 void Instruction::updateDIAssignIDMapping(DIAssignID *ID) {
diff --git a/llvm/test/Assembler/incomplete-ir-metadata-unsupported.ll b/llvm/test/Assembler/incomplete-ir-metadata-unsupported.ll
new file mode 100644
index 0000000000000..e320472921ed9
--- /dev/null
+++ b/llvm/test/Assembler/incomplete-ir-metadata-unsupported.ll
@@ -0,0 +1,9 @@
+; RUN: not llvm-as -allow-incomplete-ir < %s 2>&1 | FileCheck %s
+
+; CHECK: error: use of undefined metadata '!1'
+define void @test(ptr %p) {
+  %v = load i8, ptr %p, !noalias !0
+  ret void
+}
+
+!0 = !{!1}
diff --git a/llvm/test/Assembler/incomplete-ir-metadata.ll b/llvm/test/Assembler/incomplete-ir-metadata.ll
new file mode 100644
index 0000000000000..fdf7b4ee0ebd2
--- /dev/null
+++ b/llvm/test/Assembler/incomplete-ir-metadata.ll
@@ -0,0 +1,34 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -allow-incomplete-ir < %s | FileCheck %s
+
+@g = global i8 0, !exclude !4
+
+define void @test(ptr %p) !dbg !3 {
+; CHECK-LABEL: define void @test(
+; CHECK-SAME: ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[V1:%.*]] = load i8, ptr [[P]], align 1
+; CHECK-NEXT:    [[V2:%.*]] = load i8, ptr [[P]], align 1
+; CHECK-NEXT:    [[V3:%.*]] = load i8, ptr [[P]], align 1, !noalias [[META0:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.experimental.noalias.scope.decl(metadata [[META0]])
+; CHECK-NEXT:    ret void
+;
+  %v1 = load i8, ptr %p, !noalias !0
+  %v2 = load i8, ptr %p, !tbaa !1
+  %v3 = load i8, ptr %p, !dbg !2, !noalias !100
+  call void @llvm.experimental.noalias.scope.decl(metadata !5)
+  call void @llvm.dbg.value(metadata i32 0, metadata !7, metadata !8)
+  call void @llvm.experimental.noalias.scope.decl(metadata !100)
+  ret void
+}
+
+declare void @llvm.experimental.noalias.scope.decl(metadata)
+declare void @llvm.dbg.value(metadata, metadata, metadata)
+
+!100 = !{!101}
+!101 = !{!101, !102}
+!102 = !{!102}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]}
+; CHECK: [[META2]] = distinct !{[[META2]]}
+;.

From 497a8604b39f8b5736c389f6d7ccd8242a122cbf Mon Sep 17 00:00:00 2001
From: Vinayak Dev <104419489+vinayakdsci@users.noreply.github.com>
Date: Fri, 19 Jan 2024 20:38:24 +0530
Subject: [PATCH 143/843] [FileCheck]: Fix diagnostics for NOT prefixes
 (#78412)

Fixes #70221

Fix a bug in FileCheck so that the correct error message is emitted when
multiple prefixes are provided through --check-prefixes and one of them
is a PREFIX-NOT. Earlier, only the first of the provided prefixes was
displayed as the erroneous prefix, while the actual error might be on a
prefix occurring later in the prefix list in the input file. Now, the
right NOT prefix is shown in the error message.
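As an illustration (a hypothetical invocation, not one of the tests in
this patch): given a check file

  LEADING: placeholder1
  MIDDLE-NOT: placeholder2

run with --check-prefixes=LEADING,MIDDLE over input containing
"placeholder2", FileCheck used to blame the first prefix, reporting
roughly "error: LEADING-NOT: excluded string found in input" even though
LEADING itself was fine; it now reports "error: MIDDLE-NOT: excluded
string found in input", naming the prefix that actually failed.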
--- llvm/lib/FileCheck/FileCheck.cpp | 61 ++++++++-------- llvm/lib/FileCheck/FileCheckImpl.h | 20 ++++-- llvm/test/FileCheck/check-ignore-case.txt | 2 +- .../FileCheck/check-not-custom-prefix.txt | 69 +++++++++++++++++++ .../test/FileCheck/dump-input/annotations.txt | 2 +- llvm/test/FileCheck/implicit-check-not.txt | 14 ++-- 6 files changed, 126 insertions(+), 42 deletions(-) create mode 100644 llvm/test/FileCheck/check-not-custom-prefix.txt diff --git a/llvm/lib/FileCheck/FileCheck.cpp b/llvm/lib/FileCheck/FileCheck.cpp index b728c14d288aa..0d5bdf7903a20 100644 --- a/llvm/lib/FileCheck/FileCheck.cpp +++ b/llvm/lib/FileCheck/FileCheck.cpp @@ -1784,7 +1784,7 @@ bool FileCheck::readCheckFile( PatternContext->createLineVariable(); - std::vector ImplicitNegativeChecks; + std::vector ImplicitNegativeChecks; for (StringRef PatternString : Req.ImplicitCheckNot) { // Create a buffer with fake command line content in order to display the // command line option responsible for the specific implicit CHECK-NOT. @@ -1807,14 +1807,15 @@ bool FileCheck::readCheckFile( } } - ImplicitNegativeChecks.push_back( - Pattern(Check::CheckNot, PatternContext.get())); - ImplicitNegativeChecks.back().parsePattern(PatternInBuffer, - "IMPLICIT-CHECK", SM, Req); + ImplicitNegativeChecks.emplace_back( + Pattern(Check::CheckNot, PatternContext.get()), + StringRef("IMPLICIT-CHECK")); + ImplicitNegativeChecks.back().DagNotPat.parsePattern( + PatternInBuffer, "IMPLICIT-CHECK", SM, Req); } - std::vector DagNotMatches = ImplicitNegativeChecks; - + std::vector DagNotMatches = + ImplicitNegativeChecks; // LineNumber keeps track of the line on which CheckPrefix instances are // found. unsigned LineNumber = 1; @@ -1926,7 +1927,7 @@ bool FileCheck::readCheckFile( // Handle CHECK-DAG/-NOT. 
if (CheckTy == Check::CheckDAG || CheckTy == Check::CheckNot) { - DagNotMatches.push_back(P); + DagNotMatches.emplace_back(P, UsedPrefix); continue; } @@ -2165,7 +2166,7 @@ size_t FileCheckString::Check(const SourceMgr &SM, StringRef Buffer, FileCheckRequest &Req, std::vector *Diags) const { size_t LastPos = 0; - std::vector NotStrings; + std::vector NotStrings; // IsLabelScanMode is true when we are scanning forward to find CHECK-LABEL // bounds; we have not processed variable definitions within the bounded block @@ -2302,17 +2303,19 @@ bool FileCheckString::CheckSame(const SourceMgr &SM, StringRef Buffer) const { return false; } -bool FileCheckString::CheckNot(const SourceMgr &SM, StringRef Buffer, - const std::vector &NotStrings, - const FileCheckRequest &Req, - std::vector *Diags) const { +bool FileCheckString::CheckNot( + const SourceMgr &SM, StringRef Buffer, + const std::vector &NotStrings, + const FileCheckRequest &Req, std::vector *Diags) const { bool DirectiveFail = false; - for (const Pattern *Pat : NotStrings) { - assert((Pat->getCheckTy() == Check::CheckNot) && "Expect CHECK-NOT!"); - Pattern::MatchResult MatchResult = Pat->match(Buffer, SM); - if (Error Err = reportMatchResult(/*ExpectedMatch=*/false, SM, Prefix, - Pat->getLoc(), *Pat, 1, Buffer, - std::move(MatchResult), Req, Diags)) { + for (auto NotInfo : NotStrings) { + assert((NotInfo->DagNotPat.getCheckTy() == Check::CheckNot) && + "Expect CHECK-NOT!"); + Pattern::MatchResult MatchResult = NotInfo->DagNotPat.match(Buffer, SM); + if (Error Err = reportMatchResult( + /*ExpectedMatch=*/false, SM, NotInfo->DagNotPrefix, + NotInfo->DagNotPat.getLoc(), NotInfo->DagNotPat, 1, Buffer, + std::move(MatchResult), Req, Diags)) { cantFail(handleErrors(std::move(Err), [&](const ErrorReported &E) {})); DirectiveFail = true; continue; @@ -2321,10 +2324,11 @@ bool FileCheckString::CheckNot(const SourceMgr &SM, StringRef Buffer, return DirectiveFail; } -size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, - std::vector &NotStrings, - const FileCheckRequest &Req, - std::vector *Diags) const { +size_t +FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, + std::vector &NotStrings, + const FileCheckRequest &Req, + std::vector *Diags) const { if (DagNotStrings.empty()) return 0; @@ -2344,13 +2348,14 @@ size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, // group, so we don't use a range-based for loop here. for (auto PatItr = DagNotStrings.begin(), PatEnd = DagNotStrings.end(); PatItr != PatEnd; ++PatItr) { - const Pattern &Pat = *PatItr; + const Pattern &Pat = PatItr->DagNotPat; + const StringRef DNPrefix = PatItr->DagNotPrefix; assert((Pat.getCheckTy() == Check::CheckDAG || Pat.getCheckTy() == Check::CheckNot) && "Invalid CHECK-DAG or CHECK-NOT!"); if (Pat.getCheckTy() == Check::CheckNot) { - NotStrings.push_back(&Pat); + NotStrings.push_back(&*PatItr); continue; } @@ -2367,7 +2372,7 @@ size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, // With a group of CHECK-DAGs, a single mismatching means the match on // that group of CHECK-DAGs fails immediately. 
if (MatchResult.TheError || Req.VerboseVerbose) { - if (Error Err = reportMatchResult(/*ExpectedMatch=*/true, SM, Prefix, + if (Error Err = reportMatchResult(/*ExpectedMatch=*/true, SM, DNPrefix, Pat.getLoc(), Pat, 1, MatchBuffer, std::move(MatchResult), Req, Diags)) { cantFail( @@ -2430,13 +2435,13 @@ size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer, } if (!Req.VerboseVerbose) cantFail(printMatch( - /*ExpectedMatch=*/true, SM, Prefix, Pat.getLoc(), Pat, 1, Buffer, + /*ExpectedMatch=*/true, SM, DNPrefix, Pat.getLoc(), Pat, 1, Buffer, Pattern::MatchResult(MatchPos, MatchLen, Error::success()), Req, Diags)); // Handle the end of a CHECK-DAG group. if (std::next(PatItr) == PatEnd || - std::next(PatItr)->getCheckTy() == Check::CheckNot) { + std::next(PatItr)->DagNotPat.getCheckTy() == Check::CheckNot) { if (!NotStrings.empty()) { // If there are CHECK-NOTs between two CHECK-DAGs or from CHECK to // CHECK-DAG, verify that there are no 'not' strings occurred in that diff --git a/llvm/lib/FileCheck/FileCheckImpl.h b/llvm/lib/FileCheck/FileCheckImpl.h index c15461684ea39..c772eddd8ecd5 100644 --- a/llvm/lib/FileCheck/FileCheckImpl.h +++ b/llvm/lib/FileCheck/FileCheckImpl.h @@ -823,9 +823,19 @@ struct FileCheckString { /// The location in the match file that the check string was specified. SMLoc Loc; - /// All of the strings that are disallowed from occurring between this match - /// string and the previous one (or start of file). - std::vector DagNotStrings; + /// Hold the information about the DAG/NOT strings in the program, which are + /// not explicitly stored otherwise. This allows for better and more accurate + /// diagnostic messages. + struct DagNotPrefixInfo { + Pattern DagNotPat; + StringRef DagNotPrefix; + + DagNotPrefixInfo(const Pattern &P, StringRef S) + : DagNotPat(P), DagNotPrefix(S) {} + }; + + /// Hold the DAG/NOT strings occurring in the input file. + std::vector DagNotStrings; FileCheckString(const Pattern &P, StringRef S, SMLoc L) : Pat(P), Prefix(S), Loc(L) {} @@ -845,12 +855,12 @@ struct FileCheckString { /// \p Buffer. Errors are reported against \p SM and diagnostics recorded in /// \p Diags according to the verbosity level set in \p Req. bool CheckNot(const SourceMgr &SM, StringRef Buffer, - const std::vector &NotStrings, + const std::vector &NotStrings, const FileCheckRequest &Req, std::vector *Diags) const; /// Matches "dag strings" and their mixed "not strings". 
size_t CheckDag(const SourceMgr &SM, StringRef Buffer, - std::vector &NotStrings, + std::vector &NotStrings, const FileCheckRequest &Req, std::vector *Diags) const; }; diff --git a/llvm/test/FileCheck/check-ignore-case.txt b/llvm/test/FileCheck/check-ignore-case.txt index c3b4d97ab5e41..47999ff293515 100644 --- a/llvm/test/FileCheck/check-ignore-case.txt +++ b/llvm/test/FileCheck/check-ignore-case.txt @@ -43,6 +43,6 @@ One Line To Match # LINE: {{o}}ne line # LINE-SAME: {{t}}o match -# ERROR: command line:1:{{[0-9]+}}: error: CHECK-NOT: excluded string found in input +# ERROR: command line:1:{{[0-9]+}}: error: IMPLICIT-CHECK-NOT: excluded string found in input # ERROR-NEXT: -implicit-check-not='sTrInG' # ERROR: note: found here diff --git a/llvm/test/FileCheck/check-not-custom-prefix.txt b/llvm/test/FileCheck/check-not-custom-prefix.txt new file mode 100644 index 0000000000000..ddd8a7109ded9 --- /dev/null +++ b/llvm/test/FileCheck/check-not-custom-prefix.txt @@ -0,0 +1,69 @@ +; Test two trailing NOT strings +; RUN: rm -f %t && \ +; RUN: echo "LEADING: placeholder1" >>%t && echo "MIDDLE-NOT: placeholder2" >>%t && echo "TRAILING-NOT: placeholder3" >>%t && \ +; RUN: %ProtectFileCheckOutput not FileCheck --strict-whitespace --check-prefixes LEADING,MIDDLE,TRAILING --dump-input=never --input-file %t %t 2>&1 | \ +; RUN: FileCheck --check-prefix TEST1 %s + +; Test NOT string occurring in between two allowable strings +; RUN: rm -f %t && \ +; RUN: echo "LEADING: placeholder1" >>%t && echo "MIDDLE-NOT: placeholder2" >>%t && echo "TRAILING: placeholder3" >>%t && \ +; RUN: %ProtectFileCheckOutput not FileCheck --strict-whitespace --check-prefixes LEADING,MIDDLE,TRAILING --dump-input=never --input-file %t %t 2>&1 | \ +; RUN: FileCheck --check-prefix TEST2 %s + +; Test first prefix found being the NOT string +; RUN: rm -f %t && \ +; RUN: echo "LEADING-NOT: placeholder1" >>%t && echo "MIDDLE: placeholder2" >>%t && echo "TRAILING: placeholder3" >>%t && \ +; RUN: %ProtectFileCheckOutput not FileCheck --strict-whitespace --check-prefixes LEADING,MIDDLE,TRAILING --dump-input=never --input-file %t %t 2>&1 | \ +; RUN: FileCheck --check-prefix TEST3 %s + +; Test all given prefixes being NOT strings +; RUN: rm -f %t && \ +; RUN: echo "LEADING-NOT: placeholder1" >>%t && echo "MIDDLE-NOT: placeholder2" >>%t && echo "TRAILING-NOT: placeholder3" >>%t && \ +; RUN: %ProtectFileCheckOutput not FileCheck --strict-whitespace --check-prefixes LEADING,MIDDLE,TRAILING --dump-input=never --input-file %t %t 2>&1 | \ +; RUN: FileCheck --check-prefix TEST4 %s + +; TEST1: error: MIDDLE-NOT: excluded string found in input +; TEST1-NEXT: MIDDLE-NOT: placeholder2 +; TEST1-NEXT: {{^}} ^{{$}} +; TEST1-NEXT: note: found here +; TEST1-NEXT: MIDDLE-NOT: placeholder2 +; TEST1-NEXT: {{^}} ^~~~~~~~~~~~{{$}} +; TEST1-NEXT: error: TRAILING-NOT: excluded string found in input +; TEST1-NEXT: TRAILING-NOT: placeholder3 +; TEST1-NEXT: {{^}} ^{{$}} +; TEST1-NEXT: note: found here +; TEST1-NEXT: TRAILING-NOT: placeholder3 +; TEST1-NEXT: {{^}} ^~~~~~~~~~~~{{$}} + +; TEST2: error: MIDDLE-NOT: excluded string found in input +; TEST2-NEXT: MIDDLE-NOT: placeholder2 +; TEST2-NEXT: {{^}} ^{{$}} +; TEST2-NEXT: note: found here +; TEST2-NEXT: MIDDLE-NOT: placeholder2 +; TEST2-NEXT: {{^}} ^~~~~~~~~~~~{{$}} + +; TEST3: error: LEADING-NOT: excluded string found in input +; TEST3-NEXT: LEADING-NOT: placeholder1 +; TEST3-NEXT: {{^}} ^{{$}} +; TEST3-NEXT: note: found here +; TEST3-NEXT: LEADING-NOT: placeholder1 +; TEST3-NEXT: {{^}} ^~~~~~~~~~~~{{$}} + +; 
TEST4: error: LEADING-NOT: excluded string found in input +; TEST4-NEXT: LEADING-NOT: placeholder1 +; TEST4-NEXT: {{^}} ^{{$}} +; TEST4-NEXT: note: found here +; TEST4-NEXT: LEADING-NOT: placeholder1 +; TEST4-NEXT: {{^}} ^~~~~~~~~~~~{{$}} +; TEST4-NEXT: error: MIDDLE-NOT: excluded string found in input +; TEST4-NEXT: MIDDLE-NOT: placeholder2 +; TEST4-NEXT: {{^}} ^{{$}} +; TEST4-NEXT: note: found here +; TEST4-NEXT: MIDDLE-NOT: placeholder2 +; TEST4-NEXT: {{^}} ^~~~~~~~~~~~{{$}} +; TEST4-NEXT: error: TRAILING-NOT: excluded string found in input +; TEST4-NEXT: TRAILING-NOT: placeholder3 +; TEST4-NEXT: {{^}} ^{{$}} +; TEST4-NEXT: note: found here +; TEST4-NEXT: TRAILING-NOT: placeholder3 +; TEST4-NEXT: {{^}} ^~~~~~~~~~~~{{$}} diff --git a/llvm/test/FileCheck/dump-input/annotations.txt b/llvm/test/FileCheck/dump-input/annotations.txt index 45bf54698bca0..367ea3377e6ae 100644 --- a/llvm/test/FileCheck/dump-input/annotations.txt +++ b/llvm/test/FileCheck/dump-input/annotations.txt @@ -650,7 +650,7 @@ ; RUN: -implicit-check-not='{{remark:|error:}}' ; Verbose diagnostics are suppressed but not errors. -; IMPNOT:{{.*}}command line:1:22: error: CHECK-NOT: excluded string found in input +; IMPNOT:{{.*}}command line:1:22: error: IMPLICIT-CHECK-NOT: excluded string found in input ; IMPNOT:<<<<<< ; IMPNOT-NEXT: 1: hello world again! diff --git a/llvm/test/FileCheck/implicit-check-not.txt b/llvm/test/FileCheck/implicit-check-not.txt index 95dd9fa782df8..eaca0d37fcb29 100644 --- a/llvm/test/FileCheck/implicit-check-not.txt +++ b/llvm/test/FileCheck/implicit-check-not.txt @@ -21,26 +21,26 @@ warning: aaa ; CHECK-PASS: warning: aaa -; CHECK-ERROR1: command line:1:22: error: CHECK-FAIL1-NOT: excluded string found in input +; CHECK-ERROR1: command line:1:22: error: IMPLICIT-CHECK-NOT: excluded string found in input ; CHECK-ERROR1-NEXT: -implicit-check-not='warning:' ; CHECK-ERROR1: note: found here ; CHECK-FAIL2: warning: aaa ; CHECK-FAIL3: warning: aaa -; CHECK-ERROR4: command line:1:22: error: CHECK-FAIL1-NOT: excluded string found in input +; CHECK-ERROR4: command line:1:22: error: IMPLICIT-CHECK-NOT: excluded string found in input ; CHECK-ERROR4-NEXT: {{-implicit-check-not='\{\{aaa\|bbb\|ccc\}\}'}} ; CHECK-ERROR4: note: found here -; CHECK-ERROR5: command line:1:22: error: CHECK-FAIL1-NOT: excluded string found in input +; CHECK-ERROR5: command line:1:22: error: IMPLICIT-CHECK-NOT: excluded string found in input ; CHECK-ERROR5-NEXT: -implicit-check-not='aaa' ; CHECK-ERROR5: note: found here warning: bbb ; CHECK-PASS: warning: bbb ; CHECK-FAIL1: warning: bbb -; CHECK-ERROR2: command line:1:22: error: CHECK-FAIL2-NOT: excluded string found in input +; CHECK-ERROR2: command line:1:22: error: IMPLICIT-CHECK-NOT: excluded string found in input ; CHECK-ERROR2-NEXT: -implicit-check-not='warning:' ; CHECK-ERROR2: note: found here ; CHECK-FAIL3: warning: bbb -; CHECK-ERROR6: command line:1:22: error: CHECK-FAIL2-NOT: excluded string found in input +; CHECK-ERROR6: command line:1:22: error: IMPLICIT-CHECK-NOT: excluded string found in input ; CHECK-ERROR6-NEXT: -implicit-check-not='bbb' ; CHECK-ERROR6: note: found here @@ -48,9 +48,9 @@ warning: ccc ; CHECK-PASS: warning: ccc ; CHECK-FAIL1: warning: ccc ; CHECK-FAIL2: warning: ccc -; CHECK-ERROR3: command line:1:22: error: CHECK-FAIL3-NOT: excluded string found in input +; CHECK-ERROR3: command line:1:22: error: IMPLICIT-CHECK-NOT: excluded string found in input ; CHECK-ERROR3-NEXT: -implicit-check-not='warning:' ; CHECK-ERROR3: note: found here -; CHECK-ERROR7: 
command line:1:22: error: CHECK-FAIL3-NOT: excluded string found in input +; CHECK-ERROR7: command line:1:22: error: IMPLICIT-CHECK-NOT: excluded string found in input ; CHECK-ERROR7-NEXT: -implicit-check-not='ccc' ; CHECK-ERROR7: note: found here From aac1d9710b5123e2d4fca0b5e0e2b66cbb6b0fb3 Mon Sep 17 00:00:00 2001 From: Kiran Chandramohan Date: Fri, 19 Jan 2024 15:17:21 +0000 Subject: [PATCH 144/843] [Flang][OpenMP] Consider renames when processing reduction intrinsics (#70822) Fixes #68654 Depends on https://github.com/llvm/llvm-project/pull/70790 --- flang/lib/Lower/OpenMP.cpp | 3 +++ flang/lib/Semantics/check-omp-structure.cpp | 20 ++++++++-------- flang/lib/Semantics/resolve-directives.cpp | 23 ++++++++++++------- .../Lower/OpenMP/wsloop-reduction-max-2.f90 | 19 +++++++++++++++ 4 files changed, 48 insertions(+), 17 deletions(-) create mode 100644 flang/test/Lower/OpenMP/wsloop-reduction-max-2.f90 diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index 0aaf8f189a0ec..7dd25f75d9eb7 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -745,6 +745,9 @@ class ReductionProcessor { const Fortran::parser::ProcedureDesignator &pd) { const auto *name{Fortran::parser::Unwrap(pd)}; assert(name && "Invalid Reduction Intrinsic."); + if (!name->symbol->GetUltimate().attrs().test( + Fortran::semantics::Attr::INTRINSIC)) + return false; auto redType = llvm::StringSwitch>( getRealName(name).ToString()) .Case("max", IntrinsicProc::MAX) diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp index c430375d5ed01..22e9d5d4dd799 100644 --- a/flang/lib/Semantics/check-omp-structure.cpp +++ b/flang/lib/Semantics/check-omp-structure.cpp @@ -2299,18 +2299,20 @@ bool OmpStructureChecker::CheckReductionOperators( }, [&](const parser::ProcedureDesignator &procD) { const parser::Name *name{std::get_if(&procD.u)}; - if (name) { - if (name->source == "max" || name->source == "min" || - name->source == "iand" || name->source == "ior" || - name->source == "ieor") { + if (name && name->symbol) { + const SourceName &realName{name->symbol->GetUltimate().name()}; + if (realName == "max" || realName == "min" || + realName == "iand" || realName == "ior" || + realName == "ieor") { ok = true; - } else { - context_.Say(GetContext().clauseSource, - "Invalid reduction identifier in REDUCTION " - "clause."_err_en_US, - ContextDirectiveAsFortran()); } } + if (!ok) { + context_.Say(GetContext().clauseSource, + "Invalid reduction identifier in REDUCTION " + "clause."_err_en_US, + ContextDirectiveAsFortran()); + } }, }, definedOp.u); diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index e31193f490f79..2c570bc3abeb2 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -481,21 +481,28 @@ class OmpAttributeVisitor : DirectiveAttributeVisitor { bool Pre(const parser::OmpClause::Reduction &x) { const parser::OmpReductionOperator &opr{ std::get(x.v.t)}; + auto createDummyProcSymbol = [&](const parser::Name *name) { + // If name resolution failed, create a dummy symbol + const auto namePair{ + currScope().try_emplace(name->source, Attrs{}, ProcEntityDetails{})}; + auto &newSymbol{*namePair.first->second}; + name->symbol = &newSymbol; + }; if (const auto *procD{parser::Unwrap(opr.u)}) { if (const auto *name{parser::Unwrap(procD->u)}) { if (!name->symbol) { - const auto namePair{currScope().try_emplace( - name->source, Attrs{}, 
ProcEntityDetails{})}; - auto &symbol{*namePair.first->second}; - name->symbol = &symbol; - name->symbol->set(Symbol::Flag::OmpReduction); - AddToContextObjectWithDSA(*name->symbol, Symbol::Flag::OmpReduction); + if (!ResolveName(name)) { + createDummyProcSymbol(name); + } } } if (const auto *procRef{ parser::Unwrap(procD->u)}) { - ResolveOmp(*procRef->v.thing.component.symbol, - Symbol::Flag::OmpReduction, currScope()); + if (!procRef->v.thing.component.symbol) { + if (!ResolveName(&procRef->v.thing.component)) { + createDummyProcSymbol(&procRef->v.thing.component); + } + } } } const auto &objList{std::get(x.v.t)}; diff --git a/flang/test/Lower/OpenMP/wsloop-reduction-max-2.f90 b/flang/test/Lower/OpenMP/wsloop-reduction-max-2.f90 new file mode 100644 index 0000000000000..7e079470df847 --- /dev/null +++ b/flang/test/Lower/OpenMP/wsloop-reduction-max-2.f90 @@ -0,0 +1,19 @@ +! RUN: bbc -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s +! RUN: %flang_fc1 -emit-hlfir -fopenmp -o - %s 2>&1 | FileCheck %s + +! CHECK: omp.wsloop reduction(@max_i_32 +! CHECK: omp.reduction + +module m1 + intrinsic max +end module m1 +program main + use m1, ren=>max + n=0 + !$omp parallel do reduction(ren:n) + do i=1,100 + n=max(n,i) + end do + if (n/=100) print *,101 + print *,'pass' +end program main From 89226ecbb9975eea1c8dee4744f719555fc39036 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Jan 2024 15:30:07 +0000 Subject: [PATCH 145/843] [AMDGPU] Do not widen scalar loads on GFX12 (#78724) GFX12 has subword scalar loads so there is no need to do this. --- .../AMDGPU/AMDGPULateCodeGenPrepare.cpp | 10 +- .../AMDGPU/amdgpu-late-codegenprepare.ll | 84 +++++++----- .../AMDGPU/indirect-call-known-callees.ll | 123 ++++++++++++------ 3 files changed, 149 insertions(+), 68 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp index 1983e9f8d4af7..69fdeaebe0a01 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULateCodeGenPrepare.cpp @@ -13,9 +13,11 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "AMDGPUTargetMachine.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/UniformityAnalysis.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/InitializePasses.h" @@ -58,6 +60,7 @@ class AMDGPULateCodeGenPrepare } void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.setPreservesAll(); @@ -90,7 +93,11 @@ bool AMDGPULateCodeGenPrepare::runOnFunction(Function &F) { if (skipFunction(F)) return false; - // TODO: Skip this on GFX12 which does have scalar sub-dword loads. 
+ const TargetPassConfig &TPC = getAnalysis(); + const TargetMachine &TM = TPC.getTM(); + const GCNSubtarget &ST = TM.getSubtarget(F); + if (ST.hasScalarSubwordLoads()) + return false; AC = &getAnalysis().getAssumptionCache(F); UA = &getAnalysis().getUniformityInfo(); @@ -181,6 +188,7 @@ bool AMDGPULateCodeGenPrepare::visitLoadInst(LoadInst &LI) { INITIALIZE_PASS_BEGIN(AMDGPULateCodeGenPrepare, DEBUG_TYPE, "AMDGPU IR late optimizations", false, false) +INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(UniformityInfoWrapperPass) INITIALIZE_PASS_END(AMDGPULateCodeGenPrepare, DEBUG_TYPE, diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll index 0de0ac7b77a77..83016f1d2d3c8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-late-codegenprepare.ll @@ -1,15 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -amdgpu-late-codegenprepare %s | FileCheck %s -check-prefix=GFX9 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-late-codegenprepare %s | FileCheck %s -check-prefix=GFX12 ; Make sure we don't crash when trying to create a bitcast between ; address spaces define amdgpu_kernel void @constant_from_offset_cast_generic_null() { -; CHECK-LABEL: @constant_from_offset_cast_generic_null( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 4), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 -; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1 -; CHECK-NEXT: ret void +; GFX9-LABEL: @constant_from_offset_cast_generic_null( +; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 4), align 4 +; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 +; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 +; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1 +; GFX9-NEXT: ret void +; +; GFX12-LABEL: @constant_from_offset_cast_generic_null( +; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 6), align 1 +; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1 +; GFX12-NEXT: ret void ; %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr null to ptr addrspace(4)), i64 6), align 1 store i8 %load, ptr addrspace(1) undef @@ -17,12 +23,17 @@ define amdgpu_kernel void @constant_from_offset_cast_generic_null() { } define amdgpu_kernel void @constant_from_offset_cast_global_null() { -; CHECK-LABEL: @constant_from_offset_cast_global_null( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 4), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 -; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1 -; CHECK-NEXT: ret void +; GFX9-LABEL: @constant_from_offset_cast_global_null( +; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) 
getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 4), align 4 +; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 +; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 +; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1 +; GFX9-NEXT: ret void +; +; GFX12-LABEL: @constant_from_offset_cast_global_null( +; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 6), align 1 +; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1 +; GFX12-NEXT: ret void ; %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) null to ptr addrspace(4)), i64 6), align 1 store i8 %load, ptr addrspace(1) undef @@ -32,12 +43,17 @@ define amdgpu_kernel void @constant_from_offset_cast_global_null() { @gv = unnamed_addr addrspace(1) global [64 x i8] undef, align 4 define amdgpu_kernel void @constant_from_offset_cast_global_gv() { -; CHECK-LABEL: @constant_from_offset_cast_global_gv( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 4), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 -; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1 -; CHECK-NEXT: ret void +; GFX9-LABEL: @constant_from_offset_cast_global_gv( +; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 4), align 4 +; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 +; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 +; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1 +; GFX9-NEXT: ret void +; +; GFX12-LABEL: @constant_from_offset_cast_global_gv( +; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 6), align 1 +; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1 +; GFX12-NEXT: ret void ; %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr addrspace(1) @gv to ptr addrspace(4)), i64 6), align 1 store i8 %load, ptr addrspace(1) undef @@ -45,12 +61,17 @@ define amdgpu_kernel void @constant_from_offset_cast_global_gv() { } define amdgpu_kernel void @constant_from_offset_cast_generic_inttoptr() { -; CHECK-LABEL: @constant_from_offset_cast_generic_inttoptr( -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 4), align 4 -; CHECK-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 -; CHECK-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 -; CHECK-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1 -; CHECK-NEXT: ret void +; GFX9-LABEL: @constant_from_offset_cast_generic_inttoptr( +; GFX9-NEXT: [[TMP1:%.*]] = load i32, ptr addrspace(4) getelementptr (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 4), align 4 +; GFX9-NEXT: [[TMP2:%.*]] = lshr i32 [[TMP1]], 16 +; GFX9-NEXT: [[TMP3:%.*]] = trunc i32 [[TMP2]] to i8 +; GFX9-NEXT: store i8 [[TMP3]], ptr addrspace(1) undef, align 1 +; GFX9-NEXT: ret void +; +; GFX12-LABEL: @constant_from_offset_cast_generic_inttoptr( +; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) getelementptr 
inbounds (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 6), align 1 +; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1 +; GFX12-NEXT: ret void ; %load = load i8, ptr addrspace(4) getelementptr inbounds (i8, ptr addrspace(4) addrspacecast (ptr inttoptr (i64 128 to ptr) to ptr addrspace(4)), i64 6), align 1 store i8 %load, ptr addrspace(1) undef @@ -58,10 +79,15 @@ define amdgpu_kernel void @constant_from_offset_cast_generic_inttoptr() { } define amdgpu_kernel void @constant_from_inttoptr() { -; CHECK-LABEL: @constant_from_inttoptr( -; CHECK-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 4 -; CHECK-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1 -; CHECK-NEXT: ret void +; GFX9-LABEL: @constant_from_inttoptr( +; GFX9-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 4 +; GFX9-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1 +; GFX9-NEXT: ret void +; +; GFX12-LABEL: @constant_from_inttoptr( +; GFX12-NEXT: [[LOAD:%.*]] = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 1 +; GFX12-NEXT: store i8 [[LOAD]], ptr addrspace(1) undef, align 1 +; GFX12-NEXT: ret void ; %load = load i8, ptr addrspace(4) inttoptr (i64 128 to ptr addrspace(4)), align 1 store i8 %load, ptr addrspace(1) undef diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index fe7323eeadf8a..9965d214cc9b3 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck %s -check-prefix=GFX9 +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefix=GFX12 ; We have an indirect call with a known set of callees, which are ; known to not need any special inputs. 
The ABI still needs to use the @@ -8,35 +9,63 @@ ; FIXME: Passing real values for workitem ID, and 0s that can be undef define amdgpu_kernel void @indirect_call_known_no_special_inputs() { -; CHECK-LABEL: indirect_call_known_no_special_inputs: -; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_add_u32 flat_scratch_lo, s4, s7 -; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 -; CHECK-NEXT: s_add_u32 s0, s0, s7 -; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: s_load_dword s7, s[4:5], 0x0 -; CHECK-NEXT: s_getpc_b64 s[4:5] -; CHECK-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 -; CHECK-NEXT: s_getpc_b64 s[8:9] -; CHECK-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4 -; CHECK-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12 -; CHECK-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 -; CHECK-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 -; CHECK-NEXT: s_mov_b64 s[8:9], 0 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_and_b32 s4, 1, s7 -; CHECK-NEXT: s_cmp_eq_u32 s4, 1 -; CHECK-NEXT: v_mov_b32_e32 v31, v0 -; CHECK-NEXT: s_cselect_b32 s5, s13, s11 -; CHECK-NEXT: s_cselect_b32 s4, s12, s10 -; CHECK-NEXT: s_mov_b32 s12, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: s_mov_b32 s32, 0 -; CHECK-NEXT: s_swappc_b64 s[30:31], s[4:5] -; CHECK-NEXT: s_endpgm +; GFX9-LABEL: indirect_call_known_no_special_inputs: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_add_u32 flat_scratch_lo, s4, s7 +; GFX9-NEXT: s_addc_u32 flat_scratch_hi, s5, 0 +; GFX9-NEXT: s_add_u32 s0, s0, s7 +; GFX9-NEXT: s_addc_u32 s1, s1, 0 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: s_load_dword s7, s[4:5], 0x0 +; GFX9-NEXT: s_getpc_b64 s[4:5] +; GFX9-NEXT: s_add_u32 s4, s4, wobble@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s5, s5, wobble@gotpcrel32@hi+12 +; GFX9-NEXT: s_getpc_b64 s[8:9] +; GFX9-NEXT: s_add_u32 s8, s8, snork@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s9, s9, snork@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[10:11], s[8:9], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0x0 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_and_b32 s4, 1, s7 +; GFX9-NEXT: s_cmp_eq_u32 s4, 1 +; GFX9-NEXT: v_mov_b32_e32 v31, v0 +; GFX9-NEXT: s_cselect_b32 s5, s13, s11 +; GFX9-NEXT: s_cselect_b32 s4, s12, s10 +; GFX9-NEXT: s_mov_b32 s12, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX12-LABEL: indirect_call_known_no_special_inputs: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_getpc_b64 s[4:5] +; GFX12-NEXT: s_sext_i32_i16 s5, s5 +; GFX12-NEXT: s_add_co_u32 s4, s4, snork@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, snork@gotpcrel32@hi+16 +; GFX12-NEXT: s_mov_b64 s[2:3], 0 +; GFX12-NEXT: s_getpc_b64 s[6:7] +; GFX12-NEXT: s_sext_i32_i16 s7, s7 +; GFX12-NEXT: s_add_co_u32 s6, s6, wobble@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s7, s7, wobble@gotpcrel32@hi+16 +; GFX12-NEXT: s_load_u8 s1, s[2:3], 0x0 +; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 +; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 +; GFX12-NEXT: v_mov_b32_e32 v31, v0 +; GFX12-NEXT: s_mov_b64 s[8:9], 0 +; GFX12-NEXT: s_mov_b32 s12, s0 +; GFX12-NEXT: s_mov_b32 s32, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_b32 s1, 1, s1 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: 
s_cmp_eq_u32 s1, 1 +; GFX12-NEXT: s_cselect_b32 s3, s5, s3 +; GFX12-NEXT: s_cselect_b32 s2, s4, s2 +; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_endpgm bb: %cond = load i1, ptr addrspace(4) null @@ -46,19 +75,37 @@ bb: } define void @wobble() { -; CHECK-LABEL: wobble: -; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: wobble: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: wobble: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: ret void } define void @snork() { -; CHECK-LABEL: snork: -; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: snork: +; GFX9: ; %bb.0: ; %bb +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: snork: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] bb: ret void } From 97747467f1320b6e80495c5961a0176e0738863b Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Jan 2024 15:30:41 +0000 Subject: [PATCH 146/843] [AMDGPU] Update hazard recognition for new GFX12 wait counters (#78722) In most cases the hazards no longer apply, so just assert that we are not on GFX12. --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 27 ++++++++++++++----- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 59ba19c71432e..ebad40b641820 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1143,6 +1143,7 @@ bool GCNHazardRecognizer::fixVcmpxPermlaneHazards(MachineInstr *MI) { bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { if (!ST.hasVMEMtoScalarWriteHazard()) return false; + assert(!ST.hasExtendedWaitCounts()); if (!SIInstrInfo::isSALU(*MI) && !SIInstrInfo::isSMRD(*MI)) return false; @@ -1189,6 +1190,7 @@ bool GCNHazardRecognizer::fixVMEMtoScalarWriteHazards(MachineInstr *MI) { bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { if (!ST.hasSMEMtoVectorWriteHazard()) return false; + assert(!ST.hasExtendedWaitCounts()); if (!SIInstrInfo::isVALU(*MI)) return false; @@ -1273,7 +1275,11 @@ bool GCNHazardRecognizer::fixSMEMtoVectorWriteHazards(MachineInstr *MI) { } bool GCNHazardRecognizer::fixVcmpxExecWARHazard(MachineInstr *MI) { - if (!ST.hasVcmpxExecWARHazard() || !SIInstrInfo::isVALU(*MI)) + if (!ST.hasVcmpxExecWARHazard()) + return false; + assert(!ST.hasExtendedWaitCounts()); + + if (!SIInstrInfo::isVALU(*MI)) return false; const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -1343,6 +1349,7 @@ bool GCNHazardRecognizer::fixLdsBranchVmemWARHazard(MachineInstr *MI) { return false; assert(ST.hasLdsBranchVmemWARHazard()); + assert(!ST.hasExtendedWaitCounts()); auto IsHazardInst = [](const MachineInstr &MI) { if (SIInstrInfo::isDS(MI)) @@ -1452,6 +1459,8 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { return I.readsRegister(VDSTReg, &TRI) || I.modifiesRegister(VDSTReg, 
&TRI); }; bool LdsdirCanWait = ST.hasLdsWaitVMSRC(); + // TODO: On GFX12 the hazard should expire on S_WAIT_LOADCNT/SAMPLECNT/BVHCNT + // according to the type of VMEM instruction. auto IsExpiredFn = [this, LdsdirCanWait](const MachineInstr &I, int) { return SIInstrInfo::isVALU(I) || SIInstrInfo::isEXP(I) || (I.getOpcode() == AMDGPU::S_WAITCNT && !I.getOperand(0).getImm()) || @@ -1477,11 +1486,11 @@ bool GCNHazardRecognizer::fixLdsDirectVMEMHazard(MachineInstr *MI) { } bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { - if (!ST.isWave64()) - return false; if (!ST.hasVALUPartialForwardingHazard()) return false; - if (!SIInstrInfo::isVALU(*MI)) + assert(!ST.hasExtendedWaitCounts()); + + if (!ST.isWave64() || !SIInstrInfo::isVALU(*MI)) return false; SmallSetVector SrcVGPRs; @@ -1628,6 +1637,8 @@ bool GCNHazardRecognizer::fixVALUPartialForwardingHazard(MachineInstr *MI) { bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { if (!ST.hasVALUTransUseHazard()) return false; + assert(!ST.hasExtendedWaitCounts()); + if (!SIInstrInfo::isVALU(*MI)) return false; @@ -1767,6 +1778,7 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { bool GCNHazardRecognizer::fixShift64HighRegBug(MachineInstr *MI) { if (!ST.hasShift64HighRegBug()) return false; + assert(!ST.hasExtendedWaitCounts()); switch (MI->getOpcode()) { default: @@ -1896,6 +1908,7 @@ int GCNHazardRecognizer::checkFPAtomicToDenormModeHazard(MachineInstr *MI) { if (!ST.hasFPAtomicToDenormModeHazard()) return 0; + assert(!ST.hasExtendedWaitCounts()); if (MI->getOpcode() != AMDGPU::S_DENORM_MODE) return 0; @@ -2721,11 +2734,11 @@ bool GCNHazardRecognizer::ShouldPreferAnother(SUnit *SU) { } bool GCNHazardRecognizer::fixVALUMaskWriteHazard(MachineInstr *MI) { - if (!ST.isWave64()) - return false; if (!ST.hasVALUMaskWriteHazard()) return false; - if (!SIInstrInfo::isSALU(*MI)) + assert(!ST.hasExtendedWaitCounts()); + + if (!ST.isWave64() || !SIInstrInfo::isSALU(*MI)) return false; // The hazard sequence is three instructions: From e21b0b083e14dd0714a33c2d8190b377d58532fe Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Jan 2024 15:45:53 +0000 Subject: [PATCH 147/843] [AMDGPU] Remove gws feature from GFX12 (#78711) This was already done for LLVM. This patch just updates the Clang builtin handling to match. 
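For illustration only (not part of this patch): a minimal, self-contained C++ sketch of how
dropping a subtarget feature from the feature map surfaces as a Clang "needs target feature"
diagnostic. The helper name featureMapFor is invented for this example; it stands in for
AMDGPU::fillAMDGPUFeatureMap, and the check stands in for Sema's required-feature validation.

#include <iostream>
#include <map>
#include <string>

// Hypothetical stand-in for AMDGPU::fillAMDGPUFeatureMap: after this patch,
// gfx1200 no longer sets the "gws" feature.
static std::map<std::string, bool> featureMapFor(const std::string &CPU) {
  std::map<std::string, bool> Features;
  Features["gfx12-insts"] = (CPU == "gfx1200");
  Features["gws"] = (CPU != "gfx1200"); // removed for GFX12 by this patch
  return Features;
}

int main() {
  // A builtin such as __builtin_amdgcn_ds_gws_init declares "gws" as a
  // required target feature; the frontend rejects the call when the
  // target's feature map lacks it.
  auto Features = featureMapFor("gfx1200");
  if (!Features["gws"])
    std::cout << "error: '__builtin_amdgcn_ds_gws_init' needs target "
                 "feature gws\n";
  return 0;
}

As the diff below shows, builtins-amdgcn-gfx12-err.cl is repurposed to check these new gws
diagnostics, while its pre-existing barrier immediate-argument checks move unchanged into the
new builtins-amdgcn-gfx12-param-err.cl file.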
--- .../builtins-amdgcn-gfx12-err.cl | 27 ++++++------------- .../builtins-amdgcn-gfx12-param-err.cl | 24 +++++++++++++++++ llvm/lib/TargetParser/TargetParser.cpp | 1 - 3 files changed, 32 insertions(+), 20 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl index 5e0153c42825e..bcaea9a2482d1 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-err.cl @@ -2,23 +2,12 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -S -emit-llvm -o - %s -kernel void builtins_amdgcn_s_barrier_signal_err(global int* in, global int* out, int barrier) { - - __builtin_amdgcn_s_barrier_signal(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal' must be a constant integer}} - __builtin_amdgcn_s_barrier_wait(-1); - *out = *in; -} - -kernel void builtins_amdgcn_s_barrier_wait_err(global int* in, global int* out, int barrier) { - - __builtin_amdgcn_s_barrier_signal(-1); - __builtin_amdgcn_s_barrier_wait(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_wait' must be a constant integer}} - *out = *in; -} - -kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global int* out, int barrier) { - - __builtin_amdgcn_s_barrier_signal_isfirst(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' must be a constant integer}} - __builtin_amdgcn_s_barrier_wait(-1); - *out = *in; +typedef unsigned int uint; + +kernel void test_builtins_amdgcn_gws_insts(uint a, uint b) { + __builtin_amdgcn_ds_gws_init(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_init' needs target feature gws}} + __builtin_amdgcn_ds_gws_barrier(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_barrier' needs target feature gws}} + __builtin_amdgcn_ds_gws_sema_v(a); // expected-error {{'__builtin_amdgcn_ds_gws_sema_v' needs target feature gws}} + __builtin_amdgcn_ds_gws_sema_br(a, b); // expected-error {{'__builtin_amdgcn_ds_gws_sema_br' needs target feature gws}} + __builtin_amdgcn_ds_gws_sema_p(a); // expected-error {{'__builtin_amdgcn_ds_gws_sema_p' needs target feature gws}} } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl new file mode 100644 index 0000000000000..5e0153c42825e --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-param-err.cl @@ -0,0 +1,24 @@ +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -verify -S -emit-llvm -o - %s + +kernel void builtins_amdgcn_s_barrier_signal_err(global int* in, global int* out, int barrier) { + + __builtin_amdgcn_s_barrier_signal(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal' must be a constant integer}} + __builtin_amdgcn_s_barrier_wait(-1); + *out = *in; +} + +kernel void builtins_amdgcn_s_barrier_wait_err(global int* in, global int* out, int barrier) { + + __builtin_amdgcn_s_barrier_signal(-1); + __builtin_amdgcn_s_barrier_wait(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_wait' must be a constant integer}} + *out = *in; +} + +kernel void builtins_amdgcn_s_barrier_signal_isfirst_err(global int* in, global int* out, int barrier) { + + __builtin_amdgcn_s_barrier_signal_isfirst(barrier); // expected-error {{'__builtin_amdgcn_s_barrier_signal_isfirst' must be a constant integer}} + 
__builtin_amdgcn_s_barrier_wait(-1); + *out = *in; +} diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index f6d5bfe913b41..3cbe974ff3142 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -294,7 +294,6 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx12-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["image-insts"] = true; - Features["gws"] = true; break; case GK_GFX1151: case GK_GFX1150: From 5dbb30d950e4760a11ad97ee4b91c0325075860f Mon Sep 17 00:00:00 2001 From: Kareem Ergawy Date: Fri, 19 Jan 2024 16:47:24 +0100 Subject: [PATCH 148/843] [MLIR][OpenMP] Better error reporting for unsupported `nowait` (#78551) Provides some context for failing to generate LLVM IR for `target enter|exit|update` directives when `nowait` is provided. This is directly helpful for flang users since they would get this error message if they tried to use `nowait`. Before that we had a very generic message. This is a follow-up to https://github.com/llvm/llvm-project/pull/78269, please only review the latest commit (the one with the same commit message as the PR title). --- .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 9 +++-- .../Target/LLVMIR/omptarget-nowait-llvm.mlir | 39 +++++++++++++++++++ 2 files changed, 45 insertions(+), 3 deletions(-) create mode 100644 mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index a7e320771189f..23e101f1e4527 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -1884,7 +1884,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, }) .Case([&](omp::EnterDataOp enterDataOp) { if (enterDataOp.getNowait()) - return failure(); + return (LogicalResult)(enterDataOp.emitError( + "`nowait` is not supported yet")); if (auto ifExprVar = enterDataOp.getIfExpr()) ifCond = moduleTranslation.lookupValue(ifExprVar); @@ -1900,7 +1901,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, }) .Case([&](omp::ExitDataOp exitDataOp) { if (exitDataOp.getNowait()) - return failure(); + return (LogicalResult)(exitDataOp.emitError( + "`nowait` is not supported yet")); if (auto ifExprVar = exitDataOp.getIfExpr()) ifCond = moduleTranslation.lookupValue(ifExprVar); @@ -1917,7 +1919,8 @@ convertOmpTargetData(Operation *op, llvm::IRBuilderBase &builder, }) .Case([&](omp::UpdateDataOp updateDataOp) { if (updateDataOp.getNowait()) - return failure(); + return (LogicalResult)(updateDataOp.emitError( + "`nowait` is not supported yet")); if (auto ifExprVar = updateDataOp.getIfExpr()) ifCond = moduleTranslation.lookupValue(ifExprVar); diff --git a/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir b/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir new file mode 100644 index 0000000000000..50c2f0702e63e --- /dev/null +++ b/mlir/test/Target/LLVMIR/omptarget-nowait-llvm.mlir @@ -0,0 +1,39 @@ +// RUN: not mlir-translate -mlir-to-llvmir -split-input-file %s 2>&1 | FileCheck %s + +llvm.func @_QPopenmp_target_data_update() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr + %2 = omp.map_info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> 
!llvm.ptr {name = ""} + + // CHECK: error: `nowait` is not supported yet + omp.target_update_data motion_entries(%2 : !llvm.ptr) nowait + + llvm.return +} + +// ----- + +llvm.func @_QPopenmp_target_data_enter() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr + %2 = omp.map_info var_ptr(%1 : !llvm.ptr, i32) map_clauses(to) capture(ByRef) -> !llvm.ptr {name = ""} + + // CHECK: error: `nowait` is not supported yet + omp.target_enter_data map_entries(%2 : !llvm.ptr) nowait + + llvm.return +} + + +// ----- + +llvm.func @_QPopenmp_target_data_exit() { + %0 = llvm.mlir.constant(1 : i64) : i64 + %1 = llvm.alloca %0 x i32 {bindc_name = "i", in_type = i32, operand_segment_sizes = array, uniq_name = "_QFopenmp_target_dataEi"} : (i64) -> !llvm.ptr + %2 = omp.map_info var_ptr(%1 : !llvm.ptr, i32) map_clauses(from) capture(ByRef) -> !llvm.ptr {name = ""} + + // CHECK: error: `nowait` is not supported yet + omp.target_exit_data map_entries(%2 : !llvm.ptr) nowait + + llvm.return +} From 1abf2570b305a972d30af75a12dd4e835fb34c26 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Jan 2024 15:40:17 +0000 Subject: [PATCH 149/843] [AMDGPU] Make use of CPol::SWZ_* in SelectionDAG. NFC. For GlobalISel this was already done in AMDGPUInstructionSelector::selectBufferLoadLds. --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 929b5d004782d..cc0c4d4e36eaa 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9453,6 +9453,7 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, case Intrinsic::amdgcn_raw_ptr_buffer_load_lds: case Intrinsic::amdgcn_struct_buffer_load_lds: case Intrinsic::amdgcn_struct_ptr_buffer_load_lds: { + assert(!AMDGPU::isGFX12Plus(*Subtarget)); unsigned Opc; bool HasVIndex = IntrinsicID == Intrinsic::amdgcn_struct_buffer_load_lds || @@ -9505,8 +9506,8 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, unsigned Aux = Op.getConstantOperandVal(8 + OpOffset); Ops.push_back( DAG.getTargetConstant(Aux & AMDGPU::CPol::ALL, DL, MVT::i8)); // cpol - Ops.push_back( - DAG.getTargetConstant((Aux >> 3) & 1, DL, MVT::i8)); // swz + Ops.push_back(DAG.getTargetConstant( + Aux & AMDGPU::CPol::SWZ_pregfx12 ? 1 : 0, DL, MVT::i8)); // swz Ops.push_back(M0Val.getValue(0)); // Chain Ops.push_back(M0Val.getValue(1)); // Glue From e89a7c41ba2d94866157056a88cfc083fa9d9cb5 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Fri, 19 Jan 2024 15:53:06 +0000 Subject: [PATCH 150/843] [AMDGPU] Update comment on SIInstrInfo::isLegalFLATOffset for GFX12 --- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 5a9401103da82..f4ca27808a304 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -8968,8 +8968,9 @@ bool SIInstrInfo::splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, // Depending on the used address space and instructions, some immediate offsets // are allowed and some are not. -// In general, flat instruction offsets can only be non-negative, global and -// scratch instruction offsets can also be negative. 
+// Pre-GFX12, flat instruction offsets can only be non-negative, global and +// scratch instruction offsets can also be negative. On GFX12, offsets can be +// negative for all variants. // // There are several bugs related to these offsets: // On gfx10.1, flat instructions that go into the global address space cannot From 40a631f452726606e1d43b5d2744aa983949af78 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Fri, 19 Jan 2024 16:02:24 +0000 Subject: [PATCH 151/843] [Clang] Refactor diagnostics for SME builtins. (#78258) The arm_sme.td file was still using `IsSharedZA` and `IsPreservesZA`, which should be changed to match the new state attributes added in #76971. This patch adds `IsInZA`, `IsOutZA` and `IsInOutZA` as the state for the Clang builtins and fixes up the code in SemaChecking and SveEmitter to match. Note that the code is written in such a way that it can be easily extended with ZT0 state (to follow in a future patch). --- clang/include/clang/Basic/arm_sme.td | 572 +++++++++--------- clang/include/clang/Basic/arm_sve_sme_incl.td | 13 +- clang/lib/Sema/SemaChecking.cpp | 35 +- .../Sema/aarch64-incompat-sm-builtin-calls.c | 6 + clang/utils/TableGen/SveEmitter.cpp | 22 +- 5 files changed, 329 insertions(+), 319 deletions(-) diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index aac3bd486de92..4fb50b8e4e4e5 100644 --- a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -21,19 +21,19 @@ include "arm_sve_sme_incl.td" multiclass ZALoad ch> { let TargetGuard = "sme" in { def NAME # _H : MInst<"svld1_hor_" # n_suffix, "vimPQ", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_horiz", ch>; def NAME # _H_VNUM : MInst<"svld1_hor_vnum_" # n_suffix, "vimPQl", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_horiz", ch>; def NAME # _V : MInst<"svld1_ver_" # n_suffix, "vimPQ", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_vert", ch>; def NAME # _V_VNUM : MInst<"svld1_ver_vnum_" # n_suffix, "vimPQl", t, - [IsLoad, IsOverloadNone, IsStreaming, IsSharedZA], + [IsLoad, IsOverloadNone, IsStreaming, IsInOutZA], MemEltTyDefault, i_prefix # "_vert", ch>; } } @@ -45,11 +45,11 @@ defm SVLD1_ZA64 : ZALoad<"za64", "l", "aarch64_sme_ld1d", [ImmCheck<0, ImmCheck0 defm SVLD1_ZA128 : ZALoad<"za128", "q", "aarch64_sme_ld1q", [ImmCheck<0, ImmCheck0_15>]>; def SVLDR_VNUM_ZA : MInst<"svldr_vnum_za", "vmQl", "", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA], + [IsOverloadNone, IsStreamingCompatible, IsInOutZA], MemEltTyDefault, "aarch64_sme_ldr">; def SVLDR_ZA : MInst<"svldr_za", "vmQ", "", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA], + [IsOverloadNone, IsStreamingCompatible, IsInOutZA], MemEltTyDefault, "aarch64_sme_ldr", []>; //////////////////////////////////////////////////////////////////////////////// @@ -58,19 +58,19 @@ def SVLDR_ZA : MInst<"svldr_za", "vmQ", "", multiclass ZAStore ch> { let TargetGuard = "sme" in { def NAME # _H : MInst<"svst1_hor_" # n_suffix, "vimP%", t, - [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_horiz", ch>; def NAME # _H_VNUM : MInst<"svst1_hor_vnum_" # n_suffix, "vimP%l", t, - [IsStore, IsOverloadNone, IsStreaming, 
IsSharedZA, IsPreservesZA], + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_horiz", ch>; def NAME # _V : MInst<"svst1_ver_" # n_suffix, "vimP%", t, - [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_vert", ch>; def NAME # _V_VNUM : MInst<"svst1_ver_vnum_" # n_suffix, "vimP%l", t, - [IsStore, IsOverloadNone, IsStreaming, IsSharedZA, IsPreservesZA], + [IsStore, IsOverloadNone, IsStreaming, IsInZA], MemEltTyDefault, i_prefix # "_vert", ch>; } } @@ -82,11 +82,11 @@ defm SVST1_ZA64 : ZAStore<"za64", "l", "aarch64_sme_st1d", [ImmCheck<0, ImmCheck defm SVST1_ZA128 : ZAStore<"za128", "q", "aarch64_sme_st1q", [ImmCheck<0, ImmCheck0_15>]>; def SVSTR_VNUM_ZA : MInst<"svstr_vnum_za", "vm%l", "", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], + [IsOverloadNone, IsStreamingCompatible, IsInZA], MemEltTyDefault, "aarch64_sme_str">; def SVSTR_ZA : MInst<"svstr_za", "vm%", "", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], + [IsOverloadNone, IsStreamingCompatible, IsInZA], MemEltTyDefault, "aarch64_sme_str", []>; //////////////////////////////////////////////////////////////////////////////// @@ -96,11 +96,11 @@ multiclass ZARead ch> let TargetGuard = "sme" in { def NAME # _H : SInst<"svread_hor_" # n_suffix # "[_{d}]", "ddPim", t, MergeOp1, i_prefix # "_horiz", - [IsReadZA, IsStreaming, IsSharedZA, IsPreservesZA], ch>; + [IsReadZA, IsStreaming, IsInZA], ch>; def NAME # _V : SInst<"svread_ver_" # n_suffix # "[_{d}]", "ddPim", t, MergeOp1, i_prefix # "_vert", - [IsReadZA, IsStreaming, IsSharedZA, IsPreservesZA], ch>; + [IsReadZA, IsStreaming, IsInZA], ch>; } } @@ -117,11 +117,11 @@ multiclass ZAWrite ch let TargetGuard = "sme" in { def NAME # _H : SInst<"svwrite_hor_" # n_suffix # "[_{d}]", "vimPd", t, MergeOp1, i_prefix # "_horiz", - [IsWriteZA, IsStreaming, IsSharedZA], ch>; + [IsWriteZA, IsStreaming, IsInOutZA], ch>; def NAME # _V : SInst<"svwrite_ver_" # n_suffix # "[_{d}]", "vimPd", t, MergeOp1, i_prefix # "_vert", - [IsWriteZA, IsStreaming, IsSharedZA], ch>; + [IsWriteZA, IsStreaming, IsInOutZA], ch>; } } @@ -136,10 +136,10 @@ defm SVWRITE_ZA128 : ZAWrite<"za128", "csilUcUsUiUlhbfd", "aarch64_sme_writeq", let TargetGuard = "sme" in { def SVZERO_MASK_ZA : SInst<"svzero_mask_za", "vi", "", MergeNone, "aarch64_sme_zero", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA], + [IsOverloadNone, IsStreamingCompatible, IsInOutZA], [ImmCheck<0, ImmCheck0_255>]>; def SVZERO_ZA : SInst<"svzero_za", "v", "", MergeNone, "aarch64_sme_zero", - [IsOverloadNone, IsStreamingCompatible, IsSharedZA]>; + [IsOverloadNone, IsStreamingCompatible, IsOutZA]>; } //////////////////////////////////////////////////////////////////////////////// @@ -149,7 +149,7 @@ multiclass ZACount { let TargetGuard = "sme" in { def NAME : SInst<"sv" # n_suffix, "nv", "", MergeNone, "aarch64_sme_" # n_suffix, - [IsOverloadNone, IsStreamingCompatible, IsPreservesZA]>; + [IsOverloadNone, IsStreamingCompatible]>; } } @@ -164,13 +164,13 @@ defm SVCNTSD : ZACount<"cntsd">; multiclass ZAAdd { let TargetGuard = "sme" in { def NAME # _ZA32: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPd", "iUi", MergeOp1, - "aarch64_sme_" # n_suffix, [IsStreaming, IsSharedZA], + "aarch64_sme_" # n_suffix, [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } let TargetGuard = "sme-i16i64" in { def NAME # _ZA64: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPd", "lUl", MergeOp1, - "aarch64_sme_" # 
n_suffix, [IsStreaming, IsSharedZA], + "aarch64_sme_" # n_suffix, [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; } } @@ -186,7 +186,7 @@ multiclass ZAIntOuterProd { def NAME # _ZA32_B: SInst<"sv" # n_suffix2 # "_za32[_{d}]", "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "c", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } @@ -194,7 +194,7 @@ multiclass ZAIntOuterProd { def NAME # _ZA64_H: SInst<"sv" # n_suffix2 # "_za64[_{d}]", "viPPdd", !cond(!eq(n_suffix1, "s") : "", true: "U") # "s", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; } } @@ -213,7 +213,7 @@ multiclass ZAIntOuterProdMixedSigns { "viPPd" # !cond(!eq(n_suffix1, "su") : "u", true: "x"), !cond(!eq(n_suffix1, "su") : "", true: "U") # "c", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } @@ -222,7 +222,7 @@ multiclass ZAIntOuterProdMixedSigns { "viPPd" # !cond(!eq(n_suffix1, "su") : "u", true: "x"), !cond(!eq(n_suffix1, "su") : "", true: "U") # "s", MergeOp1, "aarch64_sme_" # n_suffix1 # n_suffix2 # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; } } @@ -239,24 +239,24 @@ multiclass ZAFPOuterProd { let TargetGuard = "sme" in { def NAME # _ZA32_B: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "h", MergeOp1, "aarch64_sme_" # n_suffix # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; def NAME # _ZA32_H: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "b", MergeOp1, "aarch64_sme_" # n_suffix # "_wide", - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; def NAME # _ZA32_S: SInst<"sv" # n_suffix # "_za32[_{d}]", "viPPdd", "f", MergeOp1, "aarch64_sme_" # n_suffix, - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_3>]>; } let TargetGuard = "sme-f64f64" in { def NAME # _ZA64_D: SInst<"sv" # n_suffix # "_za64[_{d}]", "viPPdd", "d", MergeOp1, "aarch64_sme_" # n_suffix, - [IsStreaming, IsSharedZA], + [IsStreaming, IsInOutZA], [ImmCheck<0, ImmCheck0_7>]>; } } @@ -269,29 +269,29 @@ defm SVMOPS : ZAFPOuterProd<"mops">; multiclass ZAAddSub { let TargetGuard = "sme2" in { - def NAME # _WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x2", "vm2d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x4", "vm4d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _WRITE_SINGLE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x2", "vm2d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_SINGLE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write[_single]_za32[_{d}]_vg1x4", "vm4d", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x2", "vm22", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _WRITE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # 
"_write_za32[_{d}]_vg1x4", "vm44", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _WRITE_ZA32_VG1X2_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x2", "vm22", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_write_za32[_{d}]_vg1x4", "vm44", "iUi", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _ZA32_VG1x2_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x2", "vm2", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA32_VG1X4_I32 : Inst<"sv" # n_suffix # "_za32[_{d}]_vg1x4", "vm4", "iUif", MergeNone, "aarch64_sme_" # n_suffix # "_za32_vg1x4", [IsStreaming, IsInOutZA], []>; let TargetGuard = "sme-i16i64" in { - def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _WRITE_SINGLE_ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x2", "vm2d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_SINGLE_ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_write[_single]_za64[_{d}]_vg1x4", "vm4d", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_single_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _WRITE_ZA64_VG1x2_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x2", "vm22", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _WRITE_ZA64_VG1x4_I64 : Inst<"sv" # n_suffix # "_write_za64[_{d}]_vg1x4", "vm44", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_write_za_vg1x4", [IsStreaming, IsInOutZA], []>; - def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _ZA64_VG1X2_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_I64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "lUl", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", 
[IsStreaming, IsInOutZA], []>; } let TargetGuard = "sme-f64f64" in { - def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsSharedZA], []>; - def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsSharedZA], []>; + def NAME # _ZA64_VG1X2_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x2", "vm2", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def NAME # _ZA64_VG1X4_F64 : Inst<"sv" # n_suffix # "_za64[_{d}]_vg1x4", "vm4", "d", MergeNone, "aarch64_sme_" # n_suffix # "_za64_vg1x4", [IsStreaming, IsInOutZA], []>; } } } @@ -306,12 +306,12 @@ defm SVSUB : ZAAddSub<"sub">; // multiclass ZAWrite_VG checks> { - def NAME # _VG2_H : Inst<"svwrite_hor_" # n # "[_{d}]_vg2", "vim2", t, MergeNone, i # "_hor_vg2", [IsSharedZA, IsStreaming], checks>; - def NAME # _VG2_V : Inst<"svwrite_ver_" # n # "[_{d}]_vg2", "vim2", t, MergeNone, i # "_ver_vg2", [IsSharedZA, IsStreaming], checks>; - def NAME # _VG4_H : Inst<"svwrite_hor_" # n # "[_{d}]_vg4", "vim4", t, MergeNone, i # "_hor_vg4", [IsSharedZA, IsStreaming], checks>; - def NAME # _VG4_V : Inst<"svwrite_ver_" # n # "[_{d}]_vg4", "vim4", t, MergeNone, i # "_ver_vg4", [IsSharedZA, IsStreaming], checks>; - def NAME # _VG1x2 : Inst<"svwrite_" # n # "[_{d}]_vg1x2", "vm2", t, MergeNone, i # "_vg1x2", [IsSharedZA, IsStreaming], []>; - def NAME # _VG1x4 : Inst<"svwrite_" # n # "[_{d}]_vg1x4", "vm4", t, MergeNone, i # "_vg1x4", [IsSharedZA, IsStreaming], []>; + def NAME # _VG2_H : Inst<"svwrite_hor_" # n # "[_{d}]_vg2", "vim2", t, MergeNone, i # "_hor_vg2", [IsInOutZA, IsStreaming], checks>; + def NAME # _VG2_V : Inst<"svwrite_ver_" # n # "[_{d}]_vg2", "vim2", t, MergeNone, i # "_ver_vg2", [IsInOutZA, IsStreaming], checks>; + def NAME # _VG4_H : Inst<"svwrite_hor_" # n # "[_{d}]_vg4", "vim4", t, MergeNone, i # "_hor_vg4", [IsInOutZA, IsStreaming], checks>; + def NAME # _VG4_V : Inst<"svwrite_ver_" # n # "[_{d}]_vg4", "vim4", t, MergeNone, i # "_ver_vg4", [IsInOutZA, IsStreaming], checks>; + def NAME # _VG1x2 : Inst<"svwrite_" # n # "[_{d}]_vg1x2", "vm2", t, MergeNone, i # "_vg1x2", [IsInOutZA, IsStreaming], []>; + def NAME # _VG1x4 : Inst<"svwrite_" # n # "[_{d}]_vg1x4", "vm4", t, MergeNone, i # "_vg1x4", [IsInOutZA, IsStreaming], []>; } let TargetGuard = "sme2" in { @@ -322,12 +322,12 @@ let TargetGuard = "sme2" in { } multiclass ZARead_VG checks> { - def NAME # _VG2_H : Inst<"svread_hor_" # n # "_{d}_vg2", "2im", t, MergeNone, i # "_hor_vg2", [IsSharedZA, IsPreservesZA, IsStreaming], checks>; - def NAME # _VG2_V : Inst<"svread_ver_" # n # "_{d}_vg2", "2im", t, MergeNone, i # "_ver_vg2", [IsSharedZA, IsPreservesZA, IsStreaming], checks>; - def NAME # _VG4_H : Inst<"svread_hor_" # n # "_{d}_vg4", "4im", t, MergeNone, i # "_hor_vg4", [IsSharedZA, IsPreservesZA, IsStreaming], checks>; - def NAME # _VG4_V : Inst<"svread_ver_" # n # "_{d}_vg4", "4im", t, MergeNone, i # "_ver_vg4", [IsSharedZA, IsPreservesZA, IsStreaming], checks>; - def NAME # _VG1x2 : Inst<"svread_" # n # "_{d}_vg1x2", "2m", t, MergeNone, i # "_vg1x2", [IsSharedZA, IsPreservesZA, IsStreaming], []>; - def NAME # _VG1x4 : Inst<"svread_" # n # "_{d}_vg1x4", "4m", t, MergeNone, i # "_vg1x4", [IsSharedZA, IsPreservesZA, IsStreaming], []>; + def NAME # _VG2_H : Inst<"svread_hor_" # n # "_{d}_vg2", "2im", t, MergeNone, i # "_hor_vg2", [IsInZA, IsStreaming], checks>; + 
def NAME # _VG2_V : Inst<"svread_ver_" # n # "_{d}_vg2", "2im", t, MergeNone, i # "_ver_vg2", [IsInZA, IsStreaming], checks>; + def NAME # _VG4_H : Inst<"svread_hor_" # n # "_{d}_vg4", "4im", t, MergeNone, i # "_hor_vg4", [IsInZA, IsStreaming], checks>; + def NAME # _VG4_V : Inst<"svread_ver_" # n # "_{d}_vg4", "4im", t, MergeNone, i # "_ver_vg4", [IsInZA, IsStreaming], checks>; + def NAME # _VG1x2 : Inst<"svread_" # n # "_{d}_vg1x2", "2m", t, MergeNone, i # "_vg1x2", [IsInZA, IsStreaming], []>; + def NAME # _VG1x4 : Inst<"svread_" # n # "_{d}_vg1x4", "4m", t, MergeNone, i # "_vg1x4", [IsInZA, IsStreaming], []>; } let TargetGuard = "sme2" in { @@ -342,331 +342,331 @@ let TargetGuard = "sme2" in { // let TargetGuard = "sme2" in { - def SVSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "s", MergeNone, "aarch64_sme_smopa_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; - def SVUSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "Us", MergeNone, "aarch64_sme_umopa_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + def SVSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "s", MergeNone, "aarch64_sme_smopa_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + def SVUSMOPA : Inst<"svmopa_za32[_{d}]_m", "viPPdd", "Us", MergeNone, "aarch64_sme_umopa_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; - def SVSMOPS : Inst<"svmops_za32[_{d}]_m", "viPPdd", "s", MergeNone, "aarch64_sme_smops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; - def SVUSMOPS : Inst<"svmops_za32[_{d}]_m", "viPPdd", "Us", MergeNone, "aarch64_sme_umops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + def SVSMOPS : Inst<"svmops_za32[_{d}]_m", "viPPdd", "s", MergeNone, "aarch64_sme_smops_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + def SVUSMOPS : Inst<"svmops_za32[_{d}]_m", "viPPdd", "Us", MergeNone, "aarch64_sme_umops_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; - def SVBMOPA : Inst<"svbmopa_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmopa_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + def SVBMOPA : Inst<"svbmopa_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmopa_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; - def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmops_za32", [IsSharedZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; + def SVBMOPS : Inst<"svbmops_za32[_{d}]_m", "viPPdd", "iUi", MergeNone, "aarch64_sme_bmops_za32", [IsInOutZA, IsStreaming], [ImmCheck<0, ImmCheck0_3>]>; // VERTICAL DOT-PRODUCT - def SVVDOT_LANE_ZA32_VG1x2_S : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVVDOT_LANE_ZA32_VG1x4_S : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vm4di", "c", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVVDOT_LANE_ZA32_VG1x2_U : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVVDOT_LANE_ZA32_VG1x4_U : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vm4di", "Uc", MergeNone, "aarch64_sme_uvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVVDOT_LANE_ZA32_VG1x2_F : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "hb", MergeNone, "aarch64_sme_fvdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def 
SVSUVDOT_LANE_ZA32_VG1x4 : Inst<"svsuvdot_lane_za32[_{d}]_vg1x4", "vm4di", "c", MergeNone, "aarch64_sme_suvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVUSVDOT_LANE_ZA32_VG1x4 : Inst<"svusvdot_lane_za32[_{d}]_vg1x4", "vm4di", "Uc", MergeNone, "aarch64_sme_usvdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x2_S : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x4_S : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vm4di", "c", MergeNone, "aarch64_sme_svdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x2_U : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x4_U : Inst<"svvdot_lane_za32[_{d}]_vg1x4", "vm4di", "Uc", MergeNone, "aarch64_sme_uvdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVVDOT_LANE_ZA32_VG1x2_F : Inst<"svvdot_lane_za32[_{d}]_vg1x2", "vm2di", "hb", MergeNone, "aarch64_sme_fvdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVSUVDOT_LANE_ZA32_VG1x4 : Inst<"svsuvdot_lane_za32[_{d}]_vg1x4", "vm4di", "c", MergeNone, "aarch64_sme_suvdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVUSVDOT_LANE_ZA32_VG1x4 : Inst<"svusvdot_lane_za32[_{d}]_vg1x4", "vm4di", "Uc", MergeNone, "aarch64_sme_usvdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; // Multi-vector signed & unsigned integer dot-product - def SVDOT_MULTI_ZA32_VG1x2_S : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "cs", MergeNone, "aarch64_sme_sdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVDOT_MULTI_ZA32_VG1x4_S : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "cs", MergeNone, "aarch64_sme_sdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVDOT_MULTI_ZA32_VG1x2_U : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "UcUs", MergeNone, "aarch64_sme_udot_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVDOT_MULTI_ZA32_VG1x4_U : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "UcUs", MergeNone, "aarch64_sme_udot_za32_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVDOT_SINGLE_ZA32_VG1x2_S : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "cs", MergeNone, "aarch64_sme_sdot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVDOT_SINGLE_ZA32_VG1x4_S : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "cs", MergeNone, "aarch64_sme_sdot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVDOT_SINGLE_ZA32_VG1x2_U : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "UcUs", MergeNone, "aarch64_sme_udot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVDOT_SINGLE_ZA32_VG1x4_U : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "UcUs", MergeNone, "aarch64_sme_udot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVDOT_LANE_ZA32_VG1x2_S : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "cs", MergeNone, "aarch64_sme_sdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVDOT_LANE_ZA32_VG1x4_S : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "cs", MergeNone, "aarch64_sme_sdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVDOT_LANE_ZA32_VG1x2_U : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "UcUs", MergeNone, "aarch64_sme_udot_lane_za32_vg1x2", 
[IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVDOT_LANE_ZA32_VG1x4_U : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "UcUs", MergeNone, "aarch64_sme_udot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - - def SVUSDOT_SINGLE_ZA32_VG1x2 : Inst<"svusdot[_single]_za32[_{d}]_vg1x2", "vm2.dx", "Uc", MergeNone, "aarch64_sme_usdot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVUSDOT_SINGLE_ZA32_VG1x4 : Inst<"svusdot[_single]_za32[_{d}]_vg1x4", "vm4.dx", "Uc", MergeNone, "aarch64_sme_usdot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVUSDOT_MULTI_ZA32_VG1x2 : Inst<"svusdot_za32[_{d}]_vg1x2", "vm2.d2.x", "Uc", MergeNone, "aarch64_sme_usdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVUSDOT_MULTI_ZA32_VG1x4 : Inst<"svusdot_za32[_{d}]_vg1x4", "vm4.d4.x", "Uc", MergeNone, "aarch64_sme_usdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVUSDOT_LANE_ZA32_VG1x2 : Inst<"svusdot_lane_za32[_{d}]_vg1x2", "vm2.dxi", "Uc", MergeNone, "aarch64_sme_usdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVUSDOT_LANE_ZA32_VG1x4 : Inst<"svusdot_lane_za32[_{d}]_vg1x4", "vm4.dxi", "Uc", MergeNone, "aarch64_sme_usdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - - def SVSUDOT_SINGLE_ZA32_VG1x2 : Inst<"svsudot[_single]_za32[_{d}]_vg1x2", "vm2.du", "c", MergeNone, "aarch64_sme_sudot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVSUDOT_SINGLE_ZA32_VG1x4 : Inst<"svsudot[_single]_za32[_{d}]_vg1x4", "vm4.du", "c", MergeNone, "aarch64_sme_sudot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVDOT_MULTI_ZA32_VG1x2_S : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "cs", MergeNone, "aarch64_sme_sdot_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVDOT_MULTI_ZA32_VG1x4_S : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "cs", MergeNone, "aarch64_sme_sdot_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVDOT_MULTI_ZA32_VG1x2_U : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "UcUs", MergeNone, "aarch64_sme_udot_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVDOT_MULTI_ZA32_VG1x4_U : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "UcUs", MergeNone, "aarch64_sme_udot_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVDOT_SINGLE_ZA32_VG1x2_S : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "cs", MergeNone, "aarch64_sme_sdot_single_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVDOT_SINGLE_ZA32_VG1x4_S : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "cs", MergeNone, "aarch64_sme_sdot_single_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVDOT_SINGLE_ZA32_VG1x2_U : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "UcUs", MergeNone, "aarch64_sme_udot_single_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVDOT_SINGLE_ZA32_VG1x4_U : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "UcUs", MergeNone, "aarch64_sme_udot_single_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVDOT_LANE_ZA32_VG1x2_S : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "cs", MergeNone, "aarch64_sme_sdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_S : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "cs", MergeNone, "aarch64_sme_sdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x2_U : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "UcUs", MergeNone, "aarch64_sme_udot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_U : Inst<"svdot_lane_za32[_{d}]_vg1x4", 
"vm4di", "UcUs", MergeNone, "aarch64_sme_udot_lane_za32_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + + def SVUSDOT_SINGLE_ZA32_VG1x2 : Inst<"svusdot[_single]_za32[_{d}]_vg1x2", "vm2.dx", "Uc", MergeNone, "aarch64_sme_usdot_single_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVUSDOT_SINGLE_ZA32_VG1x4 : Inst<"svusdot[_single]_za32[_{d}]_vg1x4", "vm4.dx", "Uc", MergeNone, "aarch64_sme_usdot_single_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVUSDOT_MULTI_ZA32_VG1x2 : Inst<"svusdot_za32[_{d}]_vg1x2", "vm2.d2.x", "Uc", MergeNone, "aarch64_sme_usdot_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVUSDOT_MULTI_ZA32_VG1x4 : Inst<"svusdot_za32[_{d}]_vg1x4", "vm4.d4.x", "Uc", MergeNone, "aarch64_sme_usdot_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVUSDOT_LANE_ZA32_VG1x2 : Inst<"svusdot_lane_za32[_{d}]_vg1x2", "vm2.dxi", "Uc", MergeNone, "aarch64_sme_usdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVUSDOT_LANE_ZA32_VG1x4 : Inst<"svusdot_lane_za32[_{d}]_vg1x4", "vm4.dxi", "Uc", MergeNone, "aarch64_sme_usdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + + def SVSUDOT_SINGLE_ZA32_VG1x2 : Inst<"svsudot[_single]_za32[_{d}]_vg1x2", "vm2.du", "c", MergeNone, "aarch64_sme_sudot_single_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVSUDOT_SINGLE_ZA32_VG1x4 : Inst<"svsudot[_single]_za32[_{d}]_vg1x4", "vm4.du", "c", MergeNone, "aarch64_sme_sudot_single_za32_vg1x4", [IsStreaming, IsInOutZA], []>; // Multi-multi sudot builtins are mapped to usdot, with zn & zm operands swapped - def SVSUDOT_MULTI_ZA32_VG1x2 : Inst<"svsudot_za32[_{d}]_vg1x2", "vm2.d2.u", "c", MergeNone, "aarch64_sme_usdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVSUDOT_MULTI_ZA32_VG1x4 : Inst<"svsudot_za32[_{d}]_vg1x4", "vm4.d4.u", "c", MergeNone, "aarch64_sme_usdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>; + def SVSUDOT_MULTI_ZA32_VG1x2 : Inst<"svsudot_za32[_{d}]_vg1x2", "vm2.d2.u", "c", MergeNone, "aarch64_sme_usdot_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVSUDOT_MULTI_ZA32_VG1x4 : Inst<"svsudot_za32[_{d}]_vg1x4", "vm4.d4.u", "c", MergeNone, "aarch64_sme_usdot_za32_vg1x4", [IsStreaming, IsInOutZA], []>; - def SVSUDOT_LANE_ZA32_VG1x2 : Inst<"svsudot_lane_za32[_{d}]_vg1x2", "vm2.dui", "c", MergeNone, "aarch64_sme_sudot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVSUDOT_LANE_ZA32_VG1x4 : Inst<"svsudot_lane_za32[_{d}]_vg1x4", "vm4.dui", "c", MergeNone, "aarch64_sme_sudot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVSUDOT_LANE_ZA32_VG1x2 : Inst<"svsudot_lane_za32[_{d}]_vg1x2", "vm2.dui", "c", MergeNone, "aarch64_sme_sudot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVSUDOT_LANE_ZA32_VG1x4 : Inst<"svsudot_lane_za32[_{d}]_vg1x4", "vm4.dui", "c", MergeNone, "aarch64_sme_sudot_lane_za32_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; // Multi-vector half-precision/BFloat16 floating-point dot-product - def SVDOT_MULTI_ZA32_VG1x2_F16 : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "bh", MergeNone, "aarch64_sme_fdot_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVDOT_MULTI_ZA32_VG1x4_F16 : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "bh", MergeNone, "aarch64_sme_fdot_za32_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVDOT_SINGLE_ZA32_VG1x2_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "bh", MergeNone, "aarch64_sme_fdot_single_za32_vg1x2", [IsStreaming, IsSharedZA], []>; - def 
SVDOT_SINGLE_ZA32_VG1x4_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "bh", MergeNone, "aarch64_sme_fdot_single_za32_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVDOT_LANE_ZA32_VG1x2_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "bh", MergeNone, "aarch64_sme_fdot_lane_za32_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVDOT_LANE_ZA32_VG1x4_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "bh", MergeNone, "aarch64_sme_fdot_lane_za32_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_MULTI_ZA32_VG1x2_F16 : Inst<"svdot_za32[_{d}]_vg1x2", "vm22", "bh", MergeNone, "aarch64_sme_fdot_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVDOT_MULTI_ZA32_VG1x4_F16 : Inst<"svdot_za32[_{d}]_vg1x4", "vm44", "bh", MergeNone, "aarch64_sme_fdot_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVDOT_SINGLE_ZA32_VG1x2_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x2", "vm2d", "bh", MergeNone, "aarch64_sme_fdot_single_za32_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVDOT_SINGLE_ZA32_VG1x4_F16 : Inst<"svdot[_single]_za32[_{d}]_vg1x4", "vm4d", "bh", MergeNone, "aarch64_sme_fdot_single_za32_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVDOT_LANE_ZA32_VG1x2_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x2", "vm2di", "bh", MergeNone, "aarch64_sme_fdot_lane_za32_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVDOT_LANE_ZA32_VG1x4_F16 : Inst<"svdot_lane_za32[_{d}]_vg1x4", "vm4di", "bh", MergeNone, "aarch64_sme_fdot_lane_za32_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; } let TargetGuard = "sme2,sme-i16i64" in { - def SVVDOT_LANE_ZA64_VG1x4_S : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_svdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; - def SVVDOT_LANE_ZA64_VG1x4_U : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; - - def SVDOT_MULTI_ZA64_VG1x2_S16 : Inst<"svdot_za64[_{d}]_vg1x2", "vm22", "s", MergeNone, "aarch64_sme_sdot_za64_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVDOT_MULTI_ZA64_VG1x4_S16 : Inst<"svdot_za64[_{d}]_vg1x4", "vm44", "s", MergeNone, "aarch64_sme_sdot_za64_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVDOT_MULTI_ZA64_VG1x2_U16 : Inst<"svdot_za64[_{d}]_vg1x2", "vm22", "Us", MergeNone, "aarch64_sme_udot_za64_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVDOT_MULTI_ZA64_VG1x4_U16 : Inst<"svdot_za64[_{d}]_vg1x4", "vm44", "Us", MergeNone, "aarch64_sme_udot_za64_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVDOT_SINGLE_ZA64_VG1x2_S16 : Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vm2d", "s", MergeNone, "aarch64_sme_sdot_single_za64_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVDOT_SINGLE_ZA64_VG1x4_S16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vm4d", "s", MergeNone, "aarch64_sme_sdot_single_za64_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVDOT_SINGLE_ZA64_VG1x2_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vm2d", "Us", MergeNone, "aarch64_sme_udot_single_za64_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVDOT_SINGLE_ZA64_VG1x4_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vm4d", "Us", MergeNone, "aarch64_sme_udot_single_za64_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVDOT_LANE_ZA64_VG1x2_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_sdot_lane_za64_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; - def SVDOT_LANE_ZA64_VG1x4_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", 
MergeNone, "aarch64_sme_sdot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; - def SVDOT_LANE_ZA64_VG1x2_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vm2di", "Us", MergeNone, "aarch64_sme_udot_lane_za64_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; - def SVDOT_LANE_ZA64_VG1x4_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_udot_lane_za64_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVVDOT_LANE_ZA64_VG1x4_S : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_svdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVVDOT_LANE_ZA64_VG1x4_U : Inst<"svvdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_uvdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; + + def SVDOT_MULTI_ZA64_VG1x2_S16 : Inst<"svdot_za64[_{d}]_vg1x2", "vm22", "s", MergeNone, "aarch64_sme_sdot_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVDOT_MULTI_ZA64_VG1x4_S16 : Inst<"svdot_za64[_{d}]_vg1x4", "vm44", "s", MergeNone, "aarch64_sme_sdot_za64_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVDOT_MULTI_ZA64_VG1x2_U16 : Inst<"svdot_za64[_{d}]_vg1x2", "vm22", "Us", MergeNone, "aarch64_sme_udot_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVDOT_MULTI_ZA64_VG1x4_U16 : Inst<"svdot_za64[_{d}]_vg1x4", "vm44", "Us", MergeNone, "aarch64_sme_udot_za64_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVDOT_SINGLE_ZA64_VG1x2_S16 : Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vm2d", "s", MergeNone, "aarch64_sme_sdot_single_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVDOT_SINGLE_ZA64_VG1x4_S16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vm4d", "s", MergeNone, "aarch64_sme_sdot_single_za64_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVDOT_SINGLE_ZA64_VG1x2_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x2", "vm2d", "Us", MergeNone, "aarch64_sme_udot_single_za64_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVDOT_SINGLE_ZA64_VG1x4_U16 : Inst<"svdot[_single]_za64[_{d}]_vg1x4", "vm4d", "Us", MergeNone, "aarch64_sme_udot_single_za64_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVDOT_LANE_ZA64_VG1x2_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vm2di", "s", MergeNone, "aarch64_sme_sdot_lane_za64_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x4_S16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vm4di", "s", MergeNone, "aarch64_sme_sdot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x2_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x2", "vm2di", "Us", MergeNone, "aarch64_sme_udot_lane_za64_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVDOT_LANE_ZA64_VG1x4_U16 : Inst<"svdot_lane_za64[_{d}]_vg1x4", "vm4di", "Us", MergeNone, "aarch64_sme_udot_lane_za64_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; } // FMLA/FMLS let TargetGuard = "sme2" in { - def SVMLA_MULTI_VG1x2_F32 : Inst<"svmla_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVMLA_MULTI_VG1x4_F32 : Inst<"svmla_za32[_{d}]_vg1x4", "vm44", "f", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVMLS_MULTI_VG1x2_F32 : Inst<"svmls_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVMLS_MULTI_VG1x4_F32 : Inst<"svmls_za32[_{d}]_vg1x4", "vm44", "f", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA], []>; - - def SVMLA_SINGLE_VG1x2_F32 : 
Inst<"svmla[_single]_za32[_{d}]_vg1x2", "vm2d", "f", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVMLA_SINGLE_VG1x4_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x4", "vm4d", "f", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVMLS_SINGLE_VG1x2_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x2", "vm2d", "f", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVMLS_SINGLE_VG1x4_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x4", "vm4d", "f", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA], []>; - - def SVMLA_LANE_VG1x2_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x2", "vm2di", "f", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVMLA_LANE_VG1x4_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x4", "vm4di", "f", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVMLS_LANE_VG1x2_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x2", "vm2di", "f", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; - def SVMLS_LANE_VG1x4_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x4", "vm4di", "f", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVMLA_MULTI_VG1x2_F32 : Inst<"svmla_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVMLA_MULTI_VG1x4_F32 : Inst<"svmla_za32[_{d}]_vg1x4", "vm44", "f", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVMLS_MULTI_VG1x2_F32 : Inst<"svmls_za32[_{d}]_vg1x2", "vm22", "f", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVMLS_MULTI_VG1x4_F32 : Inst<"svmls_za32[_{d}]_vg1x4", "vm44", "f", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>; + + def SVMLA_SINGLE_VG1x2_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x2", "vm2d", "f", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVMLA_SINGLE_VG1x4_F32 : Inst<"svmla[_single]_za32[_{d}]_vg1x4", "vm4d", "f", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVMLS_SINGLE_VG1x2_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x2", "vm2d", "f", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVMLS_SINGLE_VG1x4_F32 : Inst<"svmls[_single]_za32[_{d}]_vg1x4", "vm4d", "f", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>; + + def SVMLA_LANE_VG1x2_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x2", "vm2di", "f", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVMLA_LANE_VG1x4_F32 : Inst<"svmla_lane_za32[_{d}]_vg1x4", "vm4di", "f", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVMLS_LANE_VG1x2_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x2", "vm2di", "f", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; + def SVMLS_LANE_VG1x4_F32 : Inst<"svmls_lane_za32[_{d}]_vg1x4", "vm4di", "f", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_3>]>; } let TargetGuard = "sme2,sme-f64f64" in { - def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsSharedZA], 
[]>; - def SVMLS_MULTI_VG1x2_F64 : Inst<"svmls_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVMLS_MULTI_VG1x4_F64 : Inst<"svmls_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsSharedZA], []>; - - def SVMLA_SINGLE_VG1x2_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x2", "vm2d", "d", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVMLA_SINGLE_VG1x4_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x4", "vm4d", "d", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsSharedZA], []>; - def SVMLS_SINGLE_VG1x2_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x2", "vm2d", "d", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsSharedZA], []>; - def SVMLS_SINGLE_VG1x4_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x4", "vm4d", "d", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsSharedZA], []>; - - def SVMLA_LANE_VG1x2_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x2", "vm2di", "d", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; - def SVMLA_LANE_VG1x4_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; - def SVMLS_LANE_VG1x2_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x2", "vm2di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; - def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVMLA_MULTI_VG1x2_F64 : Inst<"svmla_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmla_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVMLA_MULTI_VG1x4_F64 : Inst<"svmla_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmla_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVMLS_MULTI_VG1x2_F64 : Inst<"svmls_za64[_{d}]_vg1x2", "vm22", "d", MergeNone, "aarch64_sme_fmls_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVMLS_MULTI_VG1x4_F64 : Inst<"svmls_za64[_{d}]_vg1x4", "vm44", "d", MergeNone, "aarch64_sme_fmls_vg1x4", [IsStreaming, IsInOutZA], []>; + + def SVMLA_SINGLE_VG1x2_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x2", "vm2d", "d", MergeNone, "aarch64_sme_fmla_single_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVMLA_SINGLE_VG1x4_F64 : Inst<"svmla[_single]_za64[_{d}]_vg1x4", "vm4d", "d", MergeNone, "aarch64_sme_fmla_single_vg1x4", [IsStreaming, IsInOutZA], []>; + def SVMLS_SINGLE_VG1x2_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x2", "vm2d", "d", MergeNone, "aarch64_sme_fmls_single_vg1x2", [IsStreaming, IsInOutZA], []>; + def SVMLS_SINGLE_VG1x4_F64 : Inst<"svmls[_single]_za64[_{d}]_vg1x4", "vm4d", "d", MergeNone, "aarch64_sme_fmls_single_vg1x4", [IsStreaming, IsInOutZA], []>; + + def SVMLA_LANE_VG1x2_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x2", "vm2di", "d", MergeNone, "aarch64_sme_fmla_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVMLA_LANE_VG1x4_F64 : Inst<"svmla_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmla_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVMLS_LANE_VG1x2_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x2", "vm2di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; + def SVMLS_LANE_VG1x4_F64 : Inst<"svmls_lane_za64[_{d}]_vg1x4", "vm4di", "d", MergeNone, "aarch64_sme_fmls_lane_vg1x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_1>]>; 
 }
 
 // FMLAL/FMLSL/UMLAL/SMLAL
 // SMLALL/UMLALL/USMLALL/SUMLALL
 let TargetGuard = "sme2" in {
   // MULTI MLAL
-  def SVMLAL_MULTI_VG2x2_F16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlal_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_MULTI_VG2x4_F16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, "aarch64_sme_fmlal_vg2x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_MULTI_VG2x2_S16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "s", MergeNone, "aarch64_sme_smlal_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_MULTI_VG2x4_S16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "s", MergeNone, "aarch64_sme_smlal_vg2x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_MULTI_VG2x2_U16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "Us", MergeNone, "aarch64_sme_umlal_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_MULTI_VG2x4_U16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "Us", MergeNone, "aarch64_sme_umlal_vg2x4", [IsStreaming, IsSharedZA], []>;
-
-  def SVMLAL_MULTI_VG4x2_S8 : Inst<"svmla_za32[_{d}]_vg4x2", "vm22", "c", MergeNone, "aarch64_sme_smla_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_MULTI_VG4x2_U8 : Inst<"svmla_za32[_{d}]_vg4x2", "vm22", "Uc", MergeNone, "aarch64_sme_umla_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_MULTI_VG4x4_S8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_MULTI_VG4x4_U8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+  def SVMLAL_MULTI_VG2x2_F16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlal_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_MULTI_VG2x4_F16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, "aarch64_sme_fmlal_vg2x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_MULTI_VG2x2_S16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "s", MergeNone, "aarch64_sme_smlal_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_MULTI_VG2x4_S16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "s", MergeNone, "aarch64_sme_smlal_vg2x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_MULTI_VG2x2_U16 : Inst<"svmla_za32[_{d}]_vg2x2", "vm22", "Us", MergeNone, "aarch64_sme_umlal_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_MULTI_VG2x4_U16 : Inst<"svmla_za32[_{d}]_vg2x4", "vm44", "Us", MergeNone, "aarch64_sme_umlal_vg2x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLAL_MULTI_VG4x2_S8 : Inst<"svmla_za32[_{d}]_vg4x2", "vm22", "c", MergeNone, "aarch64_sme_smla_za32_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_MULTI_VG4x2_U8 : Inst<"svmla_za32[_{d}]_vg4x2", "vm22", "Uc", MergeNone, "aarch64_sme_umla_za32_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_MULTI_VG4x4_S8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smla_za32_vg4x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_MULTI_VG4x4_U8 : Inst<"svmla_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umla_za32_vg4x4", [IsStreaming, IsInOutZA], []>;
 
   // MULTI MLSL
-  def SVMLSL_MULTI_VG2x2_F16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlsl_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_MULTI_VG2x4_F16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, "aarch64_sme_fmlsl_vg2x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_MULTI_VG2x2_S16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "s", MergeNone, "aarch64_sme_smlsl_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_MULTI_VG2x4_S16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "s", MergeNone, "aarch64_sme_smlsl_vg2x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_MULTI_VG2x2_U16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "Us", MergeNone, "aarch64_sme_umlsl_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_MULTI_VG2x4_U16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "Us", MergeNone, "aarch64_sme_umlsl_vg2x4", [IsStreaming, IsSharedZA], []>;
-
-  def SVMLSL_MULTI_VG4x2_S8 : Inst<"svmls_za32[_{d}]_vg4x2", "vm22", "c", MergeNone, "aarch64_sme_smls_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_MULTI_VG4x2_U8 : Inst<"svmls_za32[_{d}]_vg4x2", "vm22", "Uc", MergeNone, "aarch64_sme_umls_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_MULTI_VG4x4_S8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smls_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_MULTI_VG4x4_U8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umls_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+  def SVMLSL_MULTI_VG2x2_F16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "bh", MergeNone, "aarch64_sme_fmlsl_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_MULTI_VG2x4_F16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "bh", MergeNone, "aarch64_sme_fmlsl_vg2x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_MULTI_VG2x2_S16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "s", MergeNone, "aarch64_sme_smlsl_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_MULTI_VG2x4_S16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "s", MergeNone, "aarch64_sme_smlsl_vg2x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_MULTI_VG2x2_U16 : Inst<"svmls_za32[_{d}]_vg2x2", "vm22", "Us", MergeNone, "aarch64_sme_umlsl_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_MULTI_VG2x4_U16 : Inst<"svmls_za32[_{d}]_vg2x4", "vm44", "Us", MergeNone, "aarch64_sme_umlsl_vg2x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLSL_MULTI_VG4x2_S8 : Inst<"svmls_za32[_{d}]_vg4x2", "vm22", "c", MergeNone, "aarch64_sme_smls_za32_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_MULTI_VG4x2_U8 : Inst<"svmls_za32[_{d}]_vg4x2", "vm22", "Uc", MergeNone, "aarch64_sme_umls_za32_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_MULTI_VG4x4_S8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "c", MergeNone, "aarch64_sme_smls_za32_vg4x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_MULTI_VG4x4_U8 : Inst<"svmls_za32[_{d}]_vg4x4", "vm44", "Uc", MergeNone, "aarch64_sme_umls_za32_vg4x4", [IsStreaming, IsInOutZA], []>;
 
   // SINGLE MLAL
-  def SVMLAL_SINGLE_VG2x1_F16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG2x2_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG2x4_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG2x1_S16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "s", MergeNone, "aarch64_sme_smlal_single_vg2x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG2x2_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "s", MergeNone, "aarch64_sme_smlal_single_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG2x4_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "s", MergeNone, "aarch64_sme_smlal_single_vg2x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG2x1_U16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG2x2_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG2x4_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x4", [IsStreaming, IsSharedZA], []>;
-
-  def SVMLAL_SINGLE_VG4x1_S8 : Inst<"svmla_za32[_{d}]_vg4x1", "vmdd", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG4x1_U8 : Inst<"svmla_za32[_{d}]_vg4x1", "vmdd", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG4x2_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vm2d", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG4x2_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vm2d", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG4x4_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG4x4_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+  def SVMLAL_SINGLE_VG2x1_F16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG2x2_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG2x4_F16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "bh", MergeNone, "aarch64_sme_fmlal_single_vg2x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG2x1_S16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "s", MergeNone, "aarch64_sme_smlal_single_vg2x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG2x2_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "s", MergeNone, "aarch64_sme_smlal_single_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG2x4_S16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "s", MergeNone, "aarch64_sme_smlal_single_vg2x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG2x1_U16 : Inst<"svmla_za32[_{d}]_vg2x1", "vmdd", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG2x2_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x2", "vm2d", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG2x4_U16 : Inst<"svmla[_single]_za32[_{d}]_vg2x4", "vm4d", "Us", MergeNone, "aarch64_sme_umlal_single_vg2x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLAL_SINGLE_VG4x1_S8 : Inst<"svmla_za32[_{d}]_vg4x1", "vmdd", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG4x1_U8 : Inst<"svmla_za32[_{d}]_vg4x1", "vmdd", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG4x2_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vm2d", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG4x2_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x2", "vm2d", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG4x4_S8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smla_za32_single_vg4x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG4x4_U8 : Inst<"svmla[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umla_za32_single_vg4x4", [IsStreaming, IsInOutZA], []>;
 
   // SINGLE MLSL
-  def SVMLSL_SINGLE_VG2x1_F16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG2x2_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG2x4_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG2x1_S16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG2x2_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG2x4_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG2x1_U16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG2x2_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG2x4_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x4", [IsStreaming, IsSharedZA], []>;
-
-  def SVMLSL_SINGLE_VG4x1_S8 : Inst<"svmls_za32[_{d}]_vg4x1", "vmdd", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG4x1_U8 : Inst<"svmls_za32[_{d}]_vg4x1", "vmdd", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG4x2_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vm2d", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG4x2_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vm2d", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG4x4_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG4x4_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+  def SVMLSL_SINGLE_VG2x1_F16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG2x2_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG2x4_F16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "bh", MergeNone, "aarch64_sme_fmlsl_single_vg2x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG2x1_S16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG2x2_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG2x4_S16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "s", MergeNone, "aarch64_sme_smlsl_single_vg2x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG2x1_U16 : Inst<"svmls_za32[_{d}]_vg2x1", "vmdd", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG2x2_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x2", "vm2d", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG2x4_U16 : Inst<"svmls[_single]_za32[_{d}]_vg2x4", "vm4d", "Us", MergeNone, "aarch64_sme_umlsl_single_vg2x4", [IsStreaming, IsInOutZA], []>;
+
+  def SVMLSL_SINGLE_VG4x1_S8 : Inst<"svmls_za32[_{d}]_vg4x1", "vmdd", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG4x1_U8 : Inst<"svmls_za32[_{d}]_vg4x1", "vmdd", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG4x2_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vm2d", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG4x2_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x2", "vm2d", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG4x4_S8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "c", MergeNone, "aarch64_sme_smls_za32_single_vg4x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG4x4_U8 : Inst<"svmls[_single]_za32[_{d}]_vg4x4", "vm4d", "Uc", MergeNone, "aarch64_sme_umls_za32_single_vg4x4", [IsStreaming, IsInOutZA], []>;
 
   // INDEXED MLAL
-  def SVMLAL_LANE_VG2x1_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG2x2_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG2x4_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG2x1_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG2x2_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG2x4_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG2x1_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG2x2_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG2x4_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-
-  def SVMLAL_LANE_VG4x1_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmddi", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
-  def SVMLAL_LANE_VG4x1_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmddi", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
-  def SVMLAL_LANE_VG4x2_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vm2di", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
-  def SVMLAL_LANE_VG4x2_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vm2di", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
-  def SVMLAL_LANE_VG4x4_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
-  def SVMLAL_LANE_VG4x4_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLAL_LANE_VG2x1_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x2_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x4_F16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "bh", MergeNone, "aarch64_sme_fmlal_lane_vg2x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x1_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x2_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x4_S16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "s", MergeNone, "aarch64_sme_smlal_lane_vg2x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x1_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x1", "vmddi", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x2_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x2", "vm2di", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG2x4_U16 : Inst<"svmla_lane_za32[_{d}]_vg2x4", "vm4di", "Us", MergeNone, "aarch64_sme_umlal_lane_vg2x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+
+  def SVMLAL_LANE_VG4x1_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmddi", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLAL_LANE_VG4x1_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x1", "vmddi", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLAL_LANE_VG4x2_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vm2di", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLAL_LANE_VG4x2_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x2", "vm2di", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLAL_LANE_VG4x4_S8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smla_za32_lane_vg4x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLAL_LANE_VG4x4_U8 : Inst<"svmla_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umla_za32_lane_vg4x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
 
   // INDEXED MLSL
-  def SVMLSL_LANE_VG2x1_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG2x2_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG2x4_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG2x1_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG2x2_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG2x4_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG2x1_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG2x2_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG2x4_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-
-  def SVMLSL_LANE_VG4x1_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmddi", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
-  def SVMLSL_LANE_VG4x1_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmddi", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
-  def SVMLSL_LANE_VG4x2_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vm2di", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
-  def SVMLSL_LANE_VG4x2_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vm2di", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
-  def SVMLSL_LANE_VG4x4_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
-  def SVMLSL_LANE_VG4x4_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLSL_LANE_VG2x1_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x2_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x4_F16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "bh", MergeNone, "aarch64_sme_fmlsl_lane_vg2x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x1_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x2_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x4_S16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "s", MergeNone, "aarch64_sme_smlsl_lane_vg2x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x1_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x1", "vmddi", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x2_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x2", "vm2di", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG2x4_U16 : Inst<"svmls_lane_za32[_{d}]_vg2x4", "vm4di", "Us", MergeNone, "aarch64_sme_umlsl_lane_vg2x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+
+  def SVMLSL_LANE_VG4x1_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmddi", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLSL_LANE_VG4x1_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x1", "vmddi", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLSL_LANE_VG4x2_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vm2di", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLSL_LANE_VG4x2_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x2", "vm2di", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLSL_LANE_VG4x4_S8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "c", MergeNone, "aarch64_sme_smls_za32_lane_vg4x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
+  def SVMLSL_LANE_VG4x4_U8 : Inst<"svmls_lane_za32[_{d}]_vg4x4", "vm4di", "Uc", MergeNone, "aarch64_sme_umls_za32_lane_vg4x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>;
 
   // SINGLE SUMLALL
   // Single sumla maps to usmla, with zn & zm operands swapped
-  def SVSUMLALL_SINGLE_VG4x1 : Inst<"svsumla_za32[_{d}]_vg4x1", "vmdu", "c", MergeNone, "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>;
+  def SVSUMLALL_SINGLE_VG4x1 : Inst<"svsumla_za32[_{d}]_vg4x1", "vmdu", "c", MergeNone, "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsInOutZA], []>;
 
-  def SVSUMLALL_SINGLE_VG4x2 : Inst<"svsumla[_single]_za32[_{d}]_vg4x2", "vm2.du", "c", MergeNone, "aarch64_sme_sumla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVSUMLALL_SINGLE_VG4x4 : Inst<"svsumla[_single]_za32[_{d}]_vg4x4", "vm4.du", "c", MergeNone, "aarch64_sme_sumla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+  def SVSUMLALL_SINGLE_VG4x2 : Inst<"svsumla[_single]_za32[_{d}]_vg4x2", "vm2.du", "c", MergeNone, "aarch64_sme_sumla_za32_single_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVSUMLALL_SINGLE_VG4x4 : Inst<"svsumla[_single]_za32[_{d}]_vg4x4", "vm4.du", "c", MergeNone, "aarch64_sme_sumla_za32_single_vg4x4", [IsStreaming, IsInOutZA], []>;
 
   // Multi-multi sumla builtins are mapped to usmla, with zn & zm operands swapped
-  def SVSUMLALL_MULTI_VG4x2 : Inst<"svsumla_za32[_{d}]_vg4x2", "vm2.d2.u", "c", MergeNone, "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVSUMLALL_MULTI_VG4x4 : Inst<"svsumla_za32[_{d}]_vg4x4", "vm4.d4.u", "c", MergeNone, "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsSharedZA], []>;
+  def SVSUMLALL_MULTI_VG4x2 : Inst<"svsumla_za32[_{d}]_vg4x2", "vm2.d2.u", "c", MergeNone, "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVSUMLALL_MULTI_VG4x4 : Inst<"svsumla_za32[_{d}]_vg4x4", "vm4.d4.u", "c", MergeNone, "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsInOutZA], []>;
 
   // INDEXED SUMLALL
"vmdui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; - def SVSUMLALL_LANE_VG4x2 : Inst<"svsumla_lane_za32[_{d}]_vg4x2", "vm2ui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; - def SVSUMLALL_LANE_VG4x4 : Inst<"svsumla_lane_za32[_{d}]_vg4x4", "vm4ui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVSUMLALL_LANE_VG4x1 : Inst<"svsumla_lane_za32[_{d}]_vg4x1", "vmdui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVSUMLALL_LANE_VG4x2 : Inst<"svsumla_lane_za32[_{d}]_vg4x2", "vm2ui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVSUMLALL_LANE_VG4x4 : Inst<"svsumla_lane_za32[_{d}]_vg4x4", "vm4ui", "c", MergeNone, "aarch64_sme_sumla_za32_lane_vg4x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>; // SINGLE USMLALL - def SVUSMLALL_SINGLE_VG4x1 : Inst<"svusmla_za32[_{d}]_vg4x1", "vmdx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsSharedZA], []>; - def SVUSMLALL_SINGLE_VG4x2 : Inst<"svusmla[_single]_za32[_{d}]_vg4x2", "vm2.dx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x2", [IsStreaming, IsSharedZA], []>; - def SVUSMLALL_SINGLE_VG4x4 : Inst<"svusmla[_single]_za32[_{d}]_vg4x4", "vm4.dx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVUSMLALL_SINGLE_VG4x1 : Inst<"svusmla_za32[_{d}]_vg4x1", "vmdx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x1", [IsStreaming, IsInOutZA], []>; + def SVUSMLALL_SINGLE_VG4x2 : Inst<"svusmla[_single]_za32[_{d}]_vg4x2", "vm2.dx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x2", [IsStreaming, IsInOutZA], []>; + def SVUSMLALL_SINGLE_VG4x4 : Inst<"svusmla[_single]_za32[_{d}]_vg4x4", "vm4.dx", "Uc", MergeNone, "aarch64_sme_usmla_za32_single_vg4x4", [IsStreaming, IsInOutZA], []>; // MULTI USMLALL - def SVUSMLALL_MULTI_VG4x2 : Inst<"svusmla_za32[_{d}]_vg4x2", "vm2.d2.x", "Uc", MergeNone, "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsSharedZA], []>; - def SVUSMLALL_MULTI_VG4x4 : Inst<"svusmla_za32[_{d}]_vg4x4", "vm4.d4.x", "Uc", MergeNone, "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVUSMLALL_MULTI_VG4x2 : Inst<"svusmla_za32[_{d}]_vg4x2", "vm2.d2.x", "Uc", MergeNone, "aarch64_sme_usmla_za32_vg4x2", [IsStreaming, IsInOutZA], []>; + def SVUSMLALL_MULTI_VG4x4 : Inst<"svusmla_za32[_{d}]_vg4x4", "vm4.d4.x", "Uc", MergeNone, "aarch64_sme_usmla_za32_vg4x4", [IsStreaming, IsInOutZA], []>; // INDEXED USMLALL - def SVUSMLALL_LANE_VG4x1 : Inst<"svusmla_lane_za32[_{d}]_vg4x1", "vmdxi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; - def SVUSMLALL_LANE_VG4x2 : Inst<"svusmla_lane_za32[_{d}]_vg4x2", "vm2xi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; - def SVUSMLALL_LANE_VG4x4 : Inst<"svusmla_lane_za32[_{d}]_vg4x4", "vm4xi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVUSMLALL_LANE_VG4x1 : Inst<"svusmla_lane_za32[_{d}]_vg4x1", "vmdxi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVUSMLALL_LANE_VG4x2 : Inst<"svusmla_lane_za32[_{d}]_vg4x2", "vm2xi", "Uc", 
MergeNone, "aarch64_sme_usmla_za32_lane_vg4x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>; + def SVUSMLALL_LANE_VG4x4 : Inst<"svusmla_lane_za32[_{d}]_vg4x4", "vm4xi", "Uc", MergeNone, "aarch64_sme_usmla_za32_lane_vg4x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_15>]>; } let TargetGuard = "sme2,sme-i16i64" in { // MULTI MLAL - def SVMLAL_MULTI_VG4x2_S16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smla_za64_vg4x2", [IsStreaming, IsSharedZA], []>; - def SVMLAL_MULTI_VG4x2_U16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x2", [IsStreaming, IsSharedZA], []>; - def SVMLAL_MULTI_VG4x4_S16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smla_za64_vg4x4", [IsStreaming, IsSharedZA], []>; - def SVMLAL_MULTI_VG4x4_U16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVMLAL_MULTI_VG4x2_S16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smla_za64_vg4x2", [IsStreaming, IsInOutZA], []>; + def SVMLAL_MULTI_VG4x2_U16 : Inst<"svmla_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x2", [IsStreaming, IsInOutZA], []>; + def SVMLAL_MULTI_VG4x4_S16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smla_za64_vg4x4", [IsStreaming, IsInOutZA], []>; + def SVMLAL_MULTI_VG4x4_U16 : Inst<"svmla_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umla_za64_vg4x4", [IsStreaming, IsInOutZA], []>; // MULTI MLSL - def SVMLSL_MULTI_VG4x2_S16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smls_za64_vg4x2", [IsStreaming, IsSharedZA], []>; - def SVMLSL_MULTI_VG4x2_U16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x2", [IsStreaming, IsSharedZA], []>; - def SVMLSL_MULTI_VG4x4_S16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smls_za64_vg4x4", [IsStreaming, IsSharedZA], []>; - def SVMLSL_MULTI_VG4x4_U16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x4", [IsStreaming, IsSharedZA], []>; + def SVMLSL_MULTI_VG4x2_S16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "s", MergeNone, "aarch64_sme_smls_za64_vg4x2", [IsStreaming, IsInOutZA], []>; + def SVMLSL_MULTI_VG4x2_U16 : Inst<"svmls_za64[_{d}]_vg4x2", "vm22", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x2", [IsStreaming, IsInOutZA], []>; + def SVMLSL_MULTI_VG4x4_S16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "s", MergeNone, "aarch64_sme_smls_za64_vg4x4", [IsStreaming, IsInOutZA], []>; + def SVMLSL_MULTI_VG4x4_U16 : Inst<"svmls_za64[_{d}]_vg4x4", "vm44", "Us", MergeNone, "aarch64_sme_umls_za64_vg4x4", [IsStreaming, IsInOutZA], []>; // SINGLE MLAL - def SVMLAL_SINGLE_VG4x1_S16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>; - def SVMLAL_SINGLE_VG4x1_U16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>; - def SVMLAL_SINGLE_VG4x2_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>; - def SVMLAL_SINGLE_VG4x2_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>; - def SVMLAL_SINGLE_VG4x4_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x4", 
-  def SVMLAL_SINGLE_VG4x4_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLAL_SINGLE_VG4x4_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+  def SVMLAL_SINGLE_VG4x1_S16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG4x1_U16 : Inst<"svmla_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG4x2_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG4x2_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG4x4_S16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smla_za64_single_vg4x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLAL_SINGLE_VG4x4_U16 : Inst<"svmla[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umla_za64_single_vg4x4", [IsStreaming, IsInOutZA], []>;
 
   // SINGLE MLSL
-  def SVMLSL_SINGLE_VG4x1_S16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG4x1_U16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x1", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG4x2_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG4x2_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x2", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG4x4_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
-  def SVMLSL_SINGLE_VG4x4_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x4", [IsStreaming, IsSharedZA], []>;
+  def SVMLSL_SINGLE_VG4x1_S16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG4x1_U16 : Inst<"svmls_za64[_{d}]_vg4x1", "vmdd", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x1", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG4x2_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG4x2_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x2", "vm2d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x2", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG4x4_S16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "s", MergeNone, "aarch64_sme_smls_za64_single_vg4x4", [IsStreaming, IsInOutZA], []>;
+  def SVMLSL_SINGLE_VG4x4_U16 : Inst<"svmls[_single]_za64[_{d}]_vg4x4", "vm4d", "Us", MergeNone, "aarch64_sme_umls_za64_single_vg4x4", [IsStreaming, IsInOutZA], []>;
 
   // INDEXED MLAL
-  def SVMLAL_LANE_VG4x1_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG4x1_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG4x2_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG4x2_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG4x4_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLAL_LANE_VG4x4_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG4x1_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG4x1_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG4x2_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG4x2_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG4x4_S16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smla_za64_lane_vg4x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLAL_LANE_VG4x4_U16 : Inst<"svmla_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umla_za64_lane_vg4x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
 
   // INDEXED MLSL
-  def SVMLSL_LANE_VG4x1_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG4x1_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x1", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG4x2_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG4x2_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x2", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG4x4_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
-  def SVMLSL_LANE_VG4x4_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x4", [IsStreaming, IsSharedZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG4x1_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG4x1_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x1", "vmddi", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x1", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
+  def SVMLSL_LANE_VG4x2_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x2", "vm2di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>;
"vm2di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x2", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x4_S16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "s", MergeNone, "aarch64_sme_smls_za64_lane_vg4x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>; + def SVMLSL_LANE_VG4x4_U16 : Inst<"svmls_lane_za64[_{d}]_vg4x4", "vm4di", "Us", MergeNone, "aarch64_sme_umls_za64_lane_vg4x4", [IsStreaming, IsInOutZA], [ImmCheck<3, ImmCheck0_7>]>; } // // Spill and fill of ZT0 // let TargetGuard = "sme2" in { - def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], [ImmCheck<0, ImmCheck0_0>]>; - def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>]>; + def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<0, ImmCheck0_0>]>; + def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<0, ImmCheck0_0>]>; } // // Zero ZT0 // let TargetGuard = "sme2" in { - def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsSharedZA], [ImmCheck<0, ImmCheck0_0>]>; + def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<0, ImmCheck0_0>]>; } // // lookup table expand four contiguous registers // let TargetGuard = "sme2" in { - def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt_{d}_x4", "4.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; - def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt_{d}_x4", "4.di[i", "sUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>; + def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt_{d}_x4", "4.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt_{d}_x4", "4.di[i", "sUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>; } // // lookup table expand one register // let TargetGuard = "sme2" in { - def SVLUTI2_LANE_ZT : Inst<"svluti2_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>; - def SVLUTI4_LANE_ZT : Inst<"svluti4_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>; + def SVLUTI2_LANE_ZT : Inst<"svluti2_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>; + def SVLUTI4_LANE_ZT : Inst<"svluti4_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>; } // // lookup table expand two contiguous registers // let TargetGuard = "sme2" in { - def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", 
[IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>; - def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsSharedZA, IsPreservesZA], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>; + def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; } diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index ad29864440c96..c05684c0b248a 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -222,12 +222,13 @@ def ReverseMergeAnyBinOp : FlagType<0x800000000>; // e.g. Implement SUBR_X def ReverseMergeAnyAccOp : FlagType<0x1000000000>; // e.g. Implement MSB_X using MLS_X. def IsStreaming : FlagType<0x2000000000>; def IsStreamingCompatible : FlagType<0x4000000000>; -def IsSharedZA : FlagType<0x8000000000>; -def IsPreservesZA : FlagType<0x10000000000>; -def IsReadZA : FlagType<0x20000000000>; -def IsWriteZA : FlagType<0x40000000000>; -def IsReductionQV : FlagType<0x80000000000>; -def IsStreamingOrSVE2p1 : FlagType<0x80000000000>; // Use for intrinsics that are common between sme/sme2 and sve2p1. +def IsReadZA : FlagType<0x8000000000>; +def IsWriteZA : FlagType<0x10000000000>; +def IsReductionQV : FlagType<0x20000000000>; +def IsStreamingOrSVE2p1 : FlagType<0x40000000000>; // Use for intrinsics that are common between sme/sme2 and sve2p1. +def IsInZA : FlagType<0x80000000000>; +def IsOutZA : FlagType<0x100000000000>; +def IsInOutZA : FlagType<0x200000000000>; // These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h class ImmCheckType { diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index ace3e386988f0..3e0d94e817634 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3005,6 +3005,15 @@ enum ArmStreamingType { ArmStreamingOrSVE2p1 }; +enum ArmSMEState : unsigned { + ArmNoState = 0, + + ArmInZA = 0b01, + ArmOutZA = 0b10, + ArmInOutZA = 0b11, + ArmZAMask = 0b11, +}; + bool Sema::ParseSVEImmChecks( CallExpr *TheCall, SmallVector, 3> &ImmChecks) { // Perform all the immediate checks for this builtin call. 
@@ -3189,26 +3198,20 @@ static void checkArmStreamingBuiltin(Sema &S, CallExpr *TheCall,
   }
 }
 
-static bool hasSMEZAState(const FunctionDecl *FD) {
-  if (auto *Attr = FD->getAttr<ArmNewAttr>())
-    if (Attr->isNewZA())
-      return true;
-  if (const auto *T = FD->getType()->getAs<FunctionProtoType>()) {
-    FunctionType::ArmStateValue State =
-        FunctionType::getArmZAState(T->getAArch64SMEAttributes());
-    if (State != FunctionType::ARM_None)
-      return true;
-  }
-  return false;
+static bool hasArmZAState(const FunctionDecl *FD) {
+  const auto *T = FD->getType()->getAs<FunctionProtoType>();
+  return (T && FunctionType::getArmZAState(T->getAArch64SMEAttributes()) !=
+                   FunctionType::ARM_None) ||
+         (FD->hasAttr<ArmNewAttr>() && FD->getAttr<ArmNewAttr>()->isNewZA());
 }
 
-static bool hasSMEZAState(unsigned BuiltinID) {
+static ArmSMEState getSMEState(unsigned BuiltinID) {
   switch (BuiltinID) {
   default:
-    return false;
-#define GET_SME_BUILTIN_HAS_ZA_STATE
+    return ArmNoState;
+#define GET_SME_BUILTIN_GET_STATE
 #include "clang/Basic/arm_sme_builtins_za_state.inc"
-#undef GET_SME_BUILTIN_HAS_ZA_STATE
+#undef GET_SME_BUILTIN_GET_STATE
   }
 }
 
@@ -3225,7 +3228,7 @@ bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) {
   if (BuiltinType)
     checkArmStreamingBuiltin(*this, TheCall, FD, *BuiltinType);
 
-  if (hasSMEZAState(BuiltinID) && !hasSMEZAState(FD))
+  if ((getSMEState(BuiltinID) & ArmZAMask) && !hasArmZAState(FD))
     Diag(TheCall->getBeginLoc(),
          diag::warn_attribute_arm_za_builtin_no_za_state)
         << TheCall->getSourceRange();
diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
index 476da8534ce76..9e4b3920e543b 100644
--- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
+++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
@@ -102,3 +102,9 @@ svint8_t missing_za(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streami
   // expected-warning@+1 {{builtin call is not valid when calling from a function without active ZA state}}
   return svread_hor_za8_s8_m(zd, pg, 0, slice_base);
 }
+
+__arm_new("za")
+svint8_t new_za(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming {
+  // expected-no-warning
+  return svread_hor_za8_s8_m(zd, pg, 0, slice_base);
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index 99b7148c17496..b67c8e524287f 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1720,21 +1720,21 @@ void SVEEmitter::createBuiltinZAState(raw_ostream &OS) {
   for (auto *R : RV)
     createIntrinsic(R, Defs);
 
-  std::map<bool, std::set<std::string>> DefsZAState;
-
-  uint64_t IsSharedZAFlag = getEnumValueForFlag("IsSharedZA");
+  std::map<std::string, std::set<std::string>> IntrinsicsPerState;
   for (auto &Def : Defs) {
-    bool HasZAState = Def->isFlagSet(IsSharedZAFlag);
-    DefsZAState[HasZAState].insert(Def->getMangledName());
+    if (Def->isFlagSet(getEnumValueForFlag("IsInZA")))
+      IntrinsicsPerState["ArmInZA"].insert(Def->getMangledName());
+    else if (Def->isFlagSet(getEnumValueForFlag("IsOutZA")))
+      IntrinsicsPerState["ArmOutZA"].insert(Def->getMangledName());
+    else if (Def->isFlagSet(getEnumValueForFlag("IsInOutZA")))
+      IntrinsicsPerState["ArmInOutZA"].insert(Def->getMangledName());
   }
 
-  OS << "#ifdef GET_SME_BUILTIN_HAS_ZA_STATE\n";
-
-  for (auto HasZA : {true, false}) {
-    auto Names = DefsZAState[HasZA];
-    for (auto Name : Names)
+  OS << "#ifdef GET_SME_BUILTIN_GET_STATE\n";
+  for (auto &KV : IntrinsicsPerState) {
+    for (StringRef Name : KV.second)
       OS << "case SME::BI__builtin_sme_" << Name << ":\n";
-    OS << "  return " << (HasZA ? "true" : "false") << ";\n";
+    OS << "  return " << KV.first << ";\n";
   }
   OS << "#endif\n\n";
 }

From 5f41cef58f72ebe950cc9777a8e29d1d28e543d4 Mon Sep 17 00:00:00 2001
From: Sander de Smalen
Date: Fri, 19 Jan 2024 16:15:38 +0000
Subject: [PATCH 152/843] [AArch64] NFC: Simplify discombobulating 'requiresSMChange' interface (#78703)

Having it return a `std::optional<bool>` is unnecessarily confusing.
This patch changes it to a simple 'bool'.

This patch also removes the 'BodyOverridesInterface' operand because
there is only a single use for this, which is easily rewritten.
---
 .../Target/AArch64/AArch64ISelLowering.cpp | 15 +++--
 .../AArch64/AArch64TargetTransformInfo.cpp |  5 +-
 .../AArch64/Utils/AArch64SMEAttributes.cpp | 20 ++-----
 .../AArch64/Utils/AArch64SMEAttributes.h   |  9 +--
 .../Target/AArch64/SMEAttributesTest.cpp   | 59 ++++---------
 5 files changed, 28 insertions(+), 80 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 7b9b6a7a42812..09e42b72be63c 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7635,8 +7635,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
   }
 
   SDValue PStateSM;
-  std::optional<bool> RequiresSMChange =
-      CallerAttrs.requiresSMChange(CalleeAttrs);
+  bool RequiresSMChange = CallerAttrs.requiresSMChange(CalleeAttrs);
   if (RequiresSMChange) {
     if (CallerAttrs.hasStreamingInterfaceOrBody())
       PStateSM = DAG.getConstant(1, DL, MVT::i64);
@@ -7910,8 +7909,9 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   SDValue InGlue;
   if (RequiresSMChange) {
-    SDValue NewChain = changeStreamingMode(DAG, DL, *RequiresSMChange, Chain,
-                                           InGlue, PStateSM, true);
+    SDValue NewChain =
+        changeStreamingMode(DAG, DL, CalleeAttrs.hasStreamingInterface(), Chain,
+                            InGlue, PStateSM, true);
     Chain = NewChain.getValue(0);
     InGlue = NewChain.getValue(1);
   }
@@ -8061,8 +8061,8 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
 
   if (RequiresSMChange) {
     assert(PStateSM && "Expected a PStateSM to be set");
-    Result = changeStreamingMode(DAG, DL, !*RequiresSMChange, Result, InGlue,
-                                 PStateSM, false);
+    Result = changeStreamingMode(DAG, DL, !CalleeAttrs.hasStreamingInterface(),
+                                 Result, InGlue, PStateSM, false);
   }
 
   if (RequiresLazySave) {
@@ -25463,8 +25463,7 @@ bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
   if (auto *Base = dyn_cast<CallBase>(&Inst)) {
     auto CallerAttrs = SMEAttrs(*Inst.getFunction());
     auto CalleeAttrs = SMEAttrs(*Base);
-    if (CallerAttrs.requiresSMChange(CalleeAttrs,
-                                     /*BodyOverridesInterface=*/false) ||
+    if (CallerAttrs.requiresSMChange(CalleeAttrs) ||
         CallerAttrs.requiresLazySave(CalleeAttrs))
       return true;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index d358a5c8bd949..08ae536fe9bbf 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -236,8 +236,9 @@ bool AArch64TTIImpl::areInlineCompatible(const Function *Caller,
     return false;
 
   if (CallerAttrs.requiresLazySave(CalleeAttrs) ||
-      CallerAttrs.requiresSMChange(CalleeAttrs,
-                                   /*BodyOverridesInterface=*/true)) {
+      (CallerAttrs.requiresSMChange(CalleeAttrs) &&
+       (!CallerAttrs.hasStreamingInterfaceOrBody() ||
+        !CalleeAttrs.hasStreamingBody()))) {
     if (hasPossibleIncompatibleOps(Callee))
       return false;
   }
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index ccdec78d78086..9693b6a664be2 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -82,27 +82,17 @@ SMEAttrs::SMEAttrs(const AttributeList &Attrs) {
     Bitmask |= encodeZT0State(StateValue::New);
 }
 
-std::optional<bool>
-SMEAttrs::requiresSMChange(const SMEAttrs &Callee,
-                           bool BodyOverridesInterface) const {
-  // If the transition is not through a call (e.g. when considering inlining)
-  // and Callee has a streaming body, then we can ignore the interface of
-  // Callee.
-  if (BodyOverridesInterface && Callee.hasStreamingBody()) {
-    return hasStreamingInterfaceOrBody() ? std::nullopt
-                                         : std::optional<bool>(true);
-  }
-
+bool SMEAttrs::requiresSMChange(const SMEAttrs &Callee) const {
   if (Callee.hasStreamingCompatibleInterface())
-    return std::nullopt;
+    return false;
 
   // Both non-streaming
   if (hasNonStreamingInterfaceAndBody() && Callee.hasNonStreamingInterface())
-    return std::nullopt;
+    return false;
 
   // Both streaming
   if (hasStreamingInterfaceOrBody() && Callee.hasStreamingInterface())
-    return std::nullopt;
+    return false;
 
-  return Callee.hasStreamingInterface();
+  return true;
 }
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
index af2854856fb97..6f622f1996a3a 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
@@ -75,14 +75,7 @@ class SMEAttrs {
 
   /// \return true if a call from Caller -> Callee requires a change in
   /// streaming mode.
-  /// If \p BodyOverridesInterface is true and Callee has a streaming body,
-  /// then requiresSMChange considers a call to Callee as having a Streaming
-  /// interface. This can be useful when considering e.g. inlining, where we
-  /// explicitly want the body to overrule the interface (because after inlining
-  /// the interface is no longer relevant).
-  std::optional<bool>
-  requiresSMChange(const SMEAttrs &Callee,
-                   bool BodyOverridesInterface = false) const;
+  bool requiresSMChange(const SMEAttrs &Callee) const;
 
   // Interfaces to query PSTATE.ZA
   bool hasNewZABody() const { return Bitmask & ZA_New; }
diff --git a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
index 2f7201464ba2f..294e557181424 100644
--- a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
+++ b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp
@@ -193,86 +193,51 @@ TEST(SMEAttributes, Transitions) {
   ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::Normal)));
   // Normal -> Normal + LocallyStreaming
   ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::Normal | SA::SM_Body)));
-  ASSERT_EQ(*SA(SA::Normal)
-                 .requiresSMChange(SA(SA::Normal | SA::SM_Body),
-                                   /*BodyOverridesInterface=*/true),
-            true);
 
   // Normal -> Streaming
-  ASSERT_EQ(*SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled)), true);
+  ASSERT_TRUE(SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled)));
   // Normal -> Streaming + LocallyStreaming
-  ASSERT_EQ(*SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)),
-            true);
-  ASSERT_EQ(*SA(SA::Normal)
-                 .requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body),
-                                   /*BodyOverridesInterface=*/true),
-            true);
+  ASSERT_TRUE(
+      SA(SA::Normal).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)));
   // Normal -> Streaming-compatible
   ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::SM_Compatible)));
   // Normal -> Streaming-compatible + LocallyStreaming
   ASSERT_FALSE(
       SA(SA::Normal).requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body)));
-  ASSERT_EQ(*SA(SA::Normal)
-                 .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body),
-                                   /*BodyOverridesInterface=*/true),
-            true);
 
   // Streaming -> Normal
-  ASSERT_EQ(*SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal)), false);
+  ASSERT_TRUE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal)));
   // Streaming -> Normal + LocallyStreaming
-  ASSERT_EQ(*SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal | SA::SM_Body)),
-            false);
-  ASSERT_FALSE(SA(SA::SM_Enabled)
-                   .requiresSMChange(SA(SA::Normal | SA::SM_Body),
-                                     /*BodyOverridesInterface=*/true));
+  ASSERT_TRUE(
+      SA(SA::SM_Enabled).requiresSMChange(SA(SA::Normal | SA::SM_Body)));
 
   // Streaming -> Streaming
   ASSERT_FALSE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Enabled)));
   // Streaming -> Streaming + LocallyStreaming
   ASSERT_FALSE(
      SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)));
-  ASSERT_FALSE(SA(SA::SM_Enabled)
-                   .requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body),
-                                     /*BodyOverridesInterface=*/true));
   // Streaming -> Streaming-compatible
   ASSERT_FALSE(SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Compatible)));
   // Streaming -> Streaming-compatible + LocallyStreaming
   ASSERT_FALSE(
      SA(SA::SM_Enabled).requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body)));
-  ASSERT_FALSE(SA(SA::SM_Enabled)
-                   .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body),
-                                     /*BodyOverridesInterface=*/true));
 
   // Streaming-compatible -> Normal
-  ASSERT_EQ(*SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal)), false);
-  ASSERT_EQ(
-      *SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal | SA::SM_Body)),
-      false);
-  ASSERT_EQ(*SA(SA::SM_Compatible)
-                 .requiresSMChange(SA(SA::Normal | SA::SM_Body),
-                                   /*BodyOverridesInterface=*/true),
-            true);
+  ASSERT_TRUE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal)));
+  ASSERT_TRUE(
+      SA(SA::SM_Compatible).requiresSMChange(SA(SA::Normal | SA::SM_Body)));
 
   // Streaming-compatible -> Streaming
ASSERT_EQ(*SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled)), true); + ASSERT_TRUE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled))); // Streaming-compatible -> Streaming + LocallyStreaming - ASSERT_EQ( - *SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body)), - true); - ASSERT_EQ(*SA(SA::SM_Compatible) - .requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body), - /*BodyOverridesInterface=*/true), - true); + ASSERT_TRUE( + SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Enabled | SA::SM_Body))); // Streaming-compatible -> Streaming-compatible ASSERT_FALSE(SA(SA::SM_Compatible).requiresSMChange(SA(SA::SM_Compatible))); // Streaming-compatible -> Streaming-compatible + LocallyStreaming ASSERT_FALSE(SA(SA::SM_Compatible) .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body))); - ASSERT_EQ(*SA(SA::SM_Compatible) - .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body), - /*BodyOverridesInterface=*/true), - true); } From 205e15c176f540dc37f04507355a77d642ddcb22 Mon Sep 17 00:00:00 2001 From: Marius Brehler Date: Fri, 19 Jan 2024 17:37:46 +0100 Subject: [PATCH 153/843] [mlir][docs] Fix broken link --- mlir/docs/Dialects/LLVM.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/docs/Dialects/LLVM.md b/mlir/docs/Dialects/LLVM.md index 5bbccda7cf6bd..a49ba35db9a68 100644 --- a/mlir/docs/Dialects/LLVM.md +++ b/mlir/docs/Dialects/LLVM.md @@ -52,7 +52,7 @@ module attributes {llvm.data_layout = "e", LLVM functions are represented by a special operation, `llvm.func`, that has syntax similar to that of the built-in function operation but supports LLVM-related features such as linkage and variadic argument lists. See detailed -description in the operation list [below](#llvmfunc-mlirllvmllvmfuncop). +description in the operation list [below](#llvmfunc-llvmllvmfuncop). ### PHI Nodes and Block Arguments From cebe4de66fb7effa9f8a521c2b9ed2c2b890e5b9 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Fri, 19 Jan 2024 10:37:47 -0600 Subject: [PATCH 154/843] [libc] Fix test failing on GPU using deprecated 'add_unittest' Summary: We use `add_libc_test' now because it works for both hermetic and unit tests. If the test needs to be unit test only you use `UNIT_TEST_ONLY` as an argument. --- libc/test/include/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/test/include/CMakeLists.txt b/libc/test/include/CMakeLists.txt index 95a3aba9d95d9..183e7ecde719e 100644 --- a/libc/test/include/CMakeLists.txt +++ b/libc/test/include/CMakeLists.txt @@ -1,6 +1,6 @@ add_custom_target(libc_include_tests) -add_libc_unittest( +add_libc_test( sys_queue_test SUITE libc_include_tests From d0d072710468316edbd4130e50f1146c5a6aca89 Mon Sep 17 00:00:00 2001 From: Jordan Rupprecht Date: Fri, 19 Jan 2024 08:50:05 -0800 Subject: [PATCH 155/843] [lldb][test] Apply @expectedFailureAll/@skipIf early for debug_info tests (#73067) The @expectedFailureAll and @skipIf decorators will mark the test case as xfail/skip if _all_ conditions passed in match, including debug_info. * If debug_info is not one of the matching conditions, we can immediately evaluate the check and decide if it should be decorated. * If debug_info *is* present as a match condition, we need to defer whether or not to decorate until when the `LLDBTestCaseFactory` metaclass expands the test case into its potential variants. This is still early enough that the standard `unittest` framework will recognize the test as xfail/skip by the time the test actually runs. 
TestDecorators exhibits the edge cases more thoroughly. With the exception of `@expectedFailureIf` (added by this commit), all those test cases pass prior to this commit. This is a followup to 212a60ec37322f853e91e171b305479b1abff2f2. --- .../Python/lldbsuite/test/decorators.py | 53 ++++++++- .../Python/lldbsuite/test/lldbtest.py | 22 ++++ lldb/test/API/test_utils/TestDecorators.py | 110 +++++++++++++++++- 3 files changed, 179 insertions(+), 6 deletions(-) diff --git a/lldb/packages/Python/lldbsuite/test/decorators.py b/lldb/packages/Python/lldbsuite/test/decorators.py index a4cee1f576162..0fb146913388e 100644 --- a/lldb/packages/Python/lldbsuite/test/decorators.py +++ b/lldb/packages/Python/lldbsuite/test/decorators.py @@ -113,6 +113,21 @@ def _compiler_supports( return True +def expectedFailureIf(condition, bugnumber=None): + def expectedFailure_impl(func): + if isinstance(func, type) and issubclass(func, unittest2.TestCase): + raise Exception("Decorator can only be used to decorate a test method") + + if condition: + return unittest2.expectedFailure(func) + return func + + if callable(bugnumber): + return expectedFailure_impl(bugnumber) + else: + return expectedFailure_impl + + def expectedFailureIfFn(expected_fn, bugnumber=None): def expectedFailure_impl(func): if isinstance(func, type) and issubclass(func, unittest2.TestCase): @@ -174,6 +189,34 @@ def wrapper(*args, **kwargs): return skipTestIfFn_impl +def _xfailForDebugInfo(expected_fn, bugnumber=None): + def expectedFailure_impl(func): + if isinstance(func, type) and issubclass(func, unittest2.TestCase): + raise Exception("Decorator can only be used to decorate a test method") + + func.__xfail_for_debug_info_cat_fn__ = expected_fn + return func + + if callable(bugnumber): + return expectedFailure_impl(bugnumber) + else: + return expectedFailure_impl + + +def _skipForDebugInfo(expected_fn, bugnumber=None): + def skipImpl(func): + if isinstance(func, type) and issubclass(func, unittest2.TestCase): + raise Exception("Decorator can only be used to decorate a test method") + + func.__skip_for_debug_info_cat_fn__ = expected_fn + return func + + if callable(bugnumber): + return skipImpl(bugnumber) + else: + return skipImpl + + def _decorateTest( mode, bugnumber=None, @@ -191,7 +234,7 @@ def _decorateTest( dwarf_version=None, setting=None, ): - def fn(self): + def fn(actual_debug_info=None): skip_for_os = _match_decorator_property( lldbplatform.translate(oslist), lldbplatformutil.getPlatform() ) @@ -204,7 +247,7 @@ def fn(self): skip_for_arch = _match_decorator_property( archs, lldbplatformutil.getArchitecture() ) - skip_for_debug_info = _match_decorator_property(debug_info, self.getDebugInfo()) + skip_for_debug_info = _match_decorator_property(debug_info, actual_debug_info) skip_for_triple = _match_decorator_property( triple, lldb.selected_platform.GetTriple() ) @@ -279,9 +322,13 @@ def fn(self): return reason_str if mode == DecorateMode.Skip: + if debug_info: + return _skipForDebugInfo(fn, bugnumber) return skipTestIfFn(fn, bugnumber) elif mode == DecorateMode.Xfail: - return expectedFailureIfFn(fn, bugnumber) + if debug_info: + return _xfailForDebugInfo(fn, bugnumber) + return expectedFailureIf(fn(), bugnumber) else: return None diff --git a/lldb/packages/Python/lldbsuite/test/lldbtest.py b/lldb/packages/Python/lldbsuite/test/lldbtest.py index dc4e322c675dc..3abc713398490 100644 --- a/lldb/packages/Python/lldbsuite/test/lldbtest.py +++ b/lldb/packages/Python/lldbsuite/test/lldbtest.py @@ -1667,6 +1667,11 @@ def __new__(cls, name, bases, 
attrs): if original_testcase.NO_DEBUG_INFO_TESTCASE: return original_testcase + # Default implementation for skip/xfail reason based on the debug category, + # where "None" means to run the test as usual. + def no_reason(_): + return None + newattrs = {} for attrname, attrvalue in attrs.items(): if attrname.startswith("test") and not getattr( @@ -1688,6 +1693,12 @@ def __new__(cls, name, bases, attrs): if can_replicate ] + xfail_for_debug_info_cat_fn = getattr( + attrvalue, "__xfail_for_debug_info_cat_fn__", no_reason + ) + skip_for_debug_info_cat_fn = getattr( + attrvalue, "__skip_for_debug_info_cat_fn__", no_reason + ) for cat in categories: @decorators.add_test_categories([cat]) @@ -1698,6 +1709,17 @@ def test_method(self, attrvalue=attrvalue): method_name = attrname + "_" + cat test_method.__name__ = method_name test_method.debug_info = cat + + xfail_reason = xfail_for_debug_info_cat_fn(cat) + if xfail_reason: + test_method = unittest2.expectedFailure(xfail_reason)( + test_method + ) + + skip_reason = skip_for_debug_info_cat_fn(cat) + if skip_reason: + test_method = unittest2.skip(skip_reason)(test_method) + newattrs[method_name] = test_method else: diff --git a/lldb/test/API/test_utils/TestDecorators.py b/lldb/test/API/test_utils/TestDecorators.py index 97d144b6d4441..eb09db69de349 100644 --- a/lldb/test/API/test_utils/TestDecorators.py +++ b/lldb/test/API/test_utils/TestDecorators.py @@ -1,11 +1,115 @@ -from lldbsuite.test.lldbtest import Base +import re + +from lldbsuite.test.lldbtest import TestBase from lldbsuite.test.decorators import * -class TestDecorators(Base): +def expectedFailureDwarf(bugnumber=None): + return expectedFailureAll(bugnumber, debug_info="dwarf") + + +class TestDecoratorsNoDebugInfoClass(TestBase): NO_DEBUG_INFO_TESTCASE = True @expectedFailureAll(debug_info="dwarf") - def test_decorator_skip_no_debug_info(self): + def test_decorator_xfail(self): """Test that specifying a debug info category works for a NO_DEBUG_INFO_TESTCASE""" + + @expectedFailureDwarf + def test_decorator_xfail_bare_decorator(self): + """Same as test_decorator_xfail, but with a custom decorator w/ a bare syntax""" + + @expectedFailureDwarf() + def test_decorator_xfail_decorator_empty_args(self): + """Same as test_decorator_xfail, but with a custom decorator w/ no args""" + + @add_test_categories(["dwarf"]) + def test_add_test_categories(self): + # Note: the "dwarf" test category is ignored, because we don't generate any debug info test variants + self.assertIsNone(self.getDebugInfo()) + + @expectedFailureAll + def test_xfail_regexp(self): + """Test that expectedFailureAll can be empty (but please just use expectedFailure)""" + self.fail() + + @expectedFailureAll(compiler=re.compile(".*")) + def test_xfail_regexp(self): + """Test that xfail can take a regex as a matcher""" + self.fail() + + @expectedFailureAll(compiler=no_match(re.compile(".*"))) + def test_xfail_no_match(self): + """Test that xfail can take a no_match matcher""" + pass + + @expectedFailureIf(condition=True) + def test_xfail_condition_true(self): + self.fail() + + @expectedFailureIf(condition=False) + def test_xfail_condition_false(self): + pass + + +class TestDecorators(TestBase): + @expectedFailureAll(debug_info="dwarf") + def test_decorator_xfail(self): + """Test that expectedFailureAll fails for the debug_info variant""" + if self.getDebugInfo() == "dwarf": + self.fail() + + @skipIf(debug_info="dwarf") + def test_decorator_skip(self): + """Test that skipIf skips the debug_info variant""" + 
self.assertNotEqual(self.getDebugInfo(), "dwarf") + + @expectedFailureDwarf + def test_decorator_xfail2(self): + """Same as test_decorator_xfail, but with a custom decorator w/ a bare syntax""" + if self.getDebugInfo() == "dwarf": + self.fail() + + @expectedFailureDwarf() + def test_decorator_xfail3(self): + """Same as test_decorator_xfail, but with a custom decorator w/ no args""" + if self.getDebugInfo() == "dwarf": + self.fail() + + @add_test_categories(["dwarf"]) + def test_add_test_categories(self): + """Test that add_test_categories limits the kinds of debug info test variants""" + self.assertEqual(self.getDebugInfo(), "dwarf") + + @expectedFailureAll(compiler="fake", debug_info="dwarf") + def test_decorator_xfail_all(self): + """Test that expectedFailureAll requires all conditions to match to be xfail""" + + @skipIf(compiler="fake", debug_info="dwarf") + def test_decorator_skip2(self): + """Test that expectedFailureAll fails for the debug_info variant""" + # Note: the following assertion would fail, if this were not skipped: + # self.assertNotEqual(self.getDebugInfo(), "dwarf") + + @expectedFailureAll + def test_xfail_regexp(self): + """Test that xfail can be empty""" + self.fail() + + @expectedFailureAll(compiler=re.compile(".*")) + def test_xfail_regexp(self): + """Test that xfail can take a regex as a matcher""" + self.fail() + + @expectedFailureAll(compiler=no_match(re.compile(".*"))) + def test_xfail_no_match(self): + """Test that xfail can take a no_match matcher""" + pass + + @expectedFailureIf(condition=True) + def test_xfail_condition_true(self): + self.fail() + + @expectedFailureIf(condition=False) + def test_xfail_condition_false(self): pass From c80d68a67621b776d7bfbc5447cbbb36f4095e92 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Fri, 19 Jan 2024 12:04:34 -0500 Subject: [PATCH 156/843] [libc] Add float.h header. 
(#78737) --- libc/config/baremetal/arm/headers.txt | 1 + libc/config/baremetal/riscv/headers.txt | 1 + libc/config/darwin/arm/headers.txt | 1 + libc/config/darwin/x86_64/headers.txt | 1 + libc/config/gpu/headers.txt | 1 + libc/config/linux/aarch64/headers.txt | 1 + libc/config/linux/arm/headers.txt | 1 + libc/config/linux/riscv/headers.txt | 1 + libc/config/linux/x86_64/headers.txt | 1 + libc/include/CMakeLists.txt | 8 ++++++++ libc/include/float.h.def | 14 +++++++++++++ libc/include/llvm-libc-macros/CMakeLists.txt | 6 ++++++ libc/include/llvm-libc-macros/float-macros.h | 21 ++++++++++++++++++++ libc/spec/stdc.td | 10 ++++++++++ 14 files changed, 68 insertions(+) create mode 100644 libc/include/float.h.def create mode 100644 libc/include/llvm-libc-macros/float-macros.h diff --git a/libc/config/baremetal/arm/headers.txt b/libc/config/baremetal/arm/headers.txt index 6ff51f9786772..38899fabd980c 100644 --- a/libc/config/baremetal/arm/headers.txt +++ b/libc/config/baremetal/arm/headers.txt @@ -2,6 +2,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.ctype libc.include.fenv libc.include.errno + libc.include.float libc.include.inttypes libc.include.math libc.include.stdio diff --git a/libc/config/baremetal/riscv/headers.txt b/libc/config/baremetal/riscv/headers.txt index 6ff51f9786772..38899fabd980c 100644 --- a/libc/config/baremetal/riscv/headers.txt +++ b/libc/config/baremetal/riscv/headers.txt @@ -2,6 +2,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.ctype libc.include.fenv libc.include.errno + libc.include.float libc.include.inttypes libc.include.math libc.include.stdio diff --git a/libc/config/darwin/arm/headers.txt b/libc/config/darwin/arm/headers.txt index 7366c530304d4..2dd54b7c1f505 100644 --- a/libc/config/darwin/arm/headers.txt +++ b/libc/config/darwin/arm/headers.txt @@ -2,6 +2,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.ctype libc.include.errno libc.include.fenv + libc.include.float libc.include.inttypes libc.include.math libc.include.stdlib diff --git a/libc/config/darwin/x86_64/headers.txt b/libc/config/darwin/x86_64/headers.txt index b33fa45e42080..510d62688a45f 100644 --- a/libc/config/darwin/x86_64/headers.txt +++ b/libc/config/darwin/x86_64/headers.txt @@ -3,6 +3,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.errno # Fenv is currently disabled. 
#libc.include.fenv + libc.include.float libc.include.inttypes libc.include.math libc.include.stdlib diff --git a/libc/config/gpu/headers.txt b/libc/config/gpu/headers.txt index dae01310fe9c3..e68e18b87b10e 100644 --- a/libc/config/gpu/headers.txt +++ b/libc/config/gpu/headers.txt @@ -2,6 +2,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.assert libc.include.ctype libc.include.string + libc.include.float libc.include.inttypes libc.include.math libc.include.fenv diff --git a/libc/config/linux/aarch64/headers.txt b/libc/config/linux/aarch64/headers.txt index cfca5959b5ffa..6a4a4aaeb0a8d 100644 --- a/libc/config/linux/aarch64/headers.txt +++ b/libc/config/linux/aarch64/headers.txt @@ -4,6 +4,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.errno libc.include.features libc.include.fenv + libc.include.float libc.include.inttypes libc.include.math libc.include.pthread diff --git a/libc/config/linux/arm/headers.txt b/libc/config/linux/arm/headers.txt index bd08d8f8fa437..9e6ee51675916 100644 --- a/libc/config/linux/arm/headers.txt +++ b/libc/config/linux/arm/headers.txt @@ -2,6 +2,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.ctype libc.include.fenv libc.include.errno + libc.include.float libc.include.inttypes libc.include.math libc.include.stdlib diff --git a/libc/config/linux/riscv/headers.txt b/libc/config/linux/riscv/headers.txt index 9c70a3bde74f0..dc1daa48f6c87 100644 --- a/libc/config/linux/riscv/headers.txt +++ b/libc/config/linux/riscv/headers.txt @@ -6,6 +6,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.fcntl libc.include.features libc.include.fenv + libc.include.float libc.include.inttypes libc.include.math libc.include.pthread diff --git a/libc/config/linux/x86_64/headers.txt b/libc/config/linux/x86_64/headers.txt index a85f87b2a3ee9..b0e0219a30e67 100644 --- a/libc/config/linux/x86_64/headers.txt +++ b/libc/config/linux/x86_64/headers.txt @@ -6,6 +6,7 @@ set(TARGET_PUBLIC_HEADERS libc.include.fcntl libc.include.features libc.include.fenv + libc.include.float libc.include.inttypes libc.include.math libc.include.pthread diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 9a06f829eee18..14aa9ec6d73f3 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -75,6 +75,14 @@ add_gen_header( .llvm-libc-types.imaxdiv_t ) +add_gen_header( + float + DEF_FILE float.h.def + GEN_HDR float.h + DEPENDS + .llvm-libc-macros.float_macros +) + add_gen_header( math DEF_FILE math.h.def diff --git a/libc/include/float.h.def b/libc/include/float.h.def new file mode 100644 index 0000000000000..6d3599d78c69e --- /dev/null +++ b/libc/include/float.h.def @@ -0,0 +1,14 @@ +//===-- C standard library header float.h ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_FLOAT_H +#define LLVM_LIBC_FLOAT_H + +#include + +#endif // LLVM_LIBC_FLOAT_H diff --git a/libc/include/llvm-libc-macros/CMakeLists.txt b/libc/include/llvm-libc-macros/CMakeLists.txt index 7b2616d4311d9..d965a6a9443ed 100644 --- a/libc/include/llvm-libc-macros/CMakeLists.txt +++ b/libc/include/llvm-libc-macros/CMakeLists.txt @@ -67,6 +67,12 @@ add_macro_header( file-seek-macros.h ) +add_macro_header( + float_macros + HDR + float-macros.h +) + add_macro_header( math_macros HDR diff --git a/libc/include/llvm-libc-macros/float-macros.h b/libc/include/llvm-libc-macros/float-macros.h new file mode 100644 index 0000000000000..1db27f117a46f --- /dev/null +++ b/libc/include/llvm-libc-macros/float-macros.h @@ -0,0 +1,21 @@ +//===-- Definition of macros from float.h ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef __LLVM_LIBC_MACROS_FLOAT_MACROS_H +#define __LLVM_LIBC_MACROS_FLOAT_MACROS_H + +#undef FLT_MANT_DIG +#define FLT_MANT_DIG __FLT_MANT_DIG__ + +#undef DBL_MANT_DIG +#define DBL_MANT_DIG __DBL_MANT_DIG__ + +#undef LDBL_MANT_DIG +#define LDBL_MANT_DIG __LDBL_MANT_DIG__ + +#endif // __LLVM_LIBC_MACROS_FLOAT_MACROS_H diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 714dc21f95ba5..99e9b3ca65ca5 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -847,6 +847,15 @@ def StdC : StandardSpec<"stdc"> { ] >; + HeaderSpec Float = HeaderSpec< + "float.h", + [ + Macro<"FLT_MANT_DIG">, + Macro<"DBL_MANT_DIG">, + Macro<"LDBL_MANT_DIG">, + ] + >; + NamedType SigAtomicT = NamedType<"sig_atomic_t">; HeaderSpec Signal = HeaderSpec< "signal.h", @@ -1149,6 +1158,7 @@ def StdC : StandardSpec<"stdc"> { CType, Errno, Fenv, + Float, Math, String, StdIO, From b6677835fed3a204fa043e079a135c4a225d2c0e Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Fri, 19 Jan 2024 09:19:09 -0800 Subject: [PATCH 157/843] [AsmPrinter][DebugNames] Implement DW_IDX_parent entries (#77457) This implements the ideas discussed in [1]. To summarize, this commit changes AsmPrinter so that it outputs DW_IDX_parent information for debug_name entries. It will enable debuggers to speed up queries for fully qualified types (based on a DWARFDeclContext) significantly, as debuggers will no longer need to parse the entire CU in order to inspect the parent chain of a DIE. Instead, a debugger can simply take the parent DIE offset from the accelerator table and peek at its name in the debug_info/debug_str sections. The implementation uses two types of DW_FORM for the DW_IDX_parent attribute: 1. DW_FORM_ref4, which points to the accelerator table entry for the parent. 2. DW_FORM_flag_present, when the entry has a parent that is not in the table (that is, the parent doesn't have a name, or isn't allowed to be in the table as per the DWARF spec). This is space-efficient, since it takes 0 bytes. The implementation works by: 1. Changing how abbreviations are encoded (so that they encode which form, if any, was used to encode IDX_Parent) 2. Creating an MCLabel per accelerator table entry, so that they may be referred by IDX_parent references. 
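To make the new encoding rule concrete before diving into the diff: the only per-entry decision is which DWARF form carries DW_IDX_parent. Below is a minimal, self-contained sketch of that rule with simplified stand-in types; the actual implementation is `getFormForIdxParent` in the AccelTable.cpp hunk further down, which works on `OffsetAndUnitID` keys and a `DenseSet`.

```
#include <cstdint>
#include <optional>
#include <set>

// Stand-in for the two DWARF v5 forms an entry can use for DW_IDX_parent.
enum class ParentForm { Ref4, FlagPresent };

// Selection rule:
//   no parent information        -> omit the attribute entirely (nullopt)
//   parent has its own entry     -> DW_FORM_ref4, a 4-byte ref to that entry
//   parent exists but unindexed  -> DW_FORM_flag_present, which occupies
//                                   zero bytes in the entry
std::optional<ParentForm>
formForIdxParent(std::optional<uint64_t> ParentOffset,
                 const std::set<uint64_t> &IndexedOffsets) {
  if (!ParentOffset)
    return std::nullopt;
  return IndexedOffsets.count(*ParentOffset) ? ParentForm::Ref4
                                             : ParentForm::FlagPresent;
}
```

Because DW_FORM_flag_present takes zero bytes, entries whose parent is unnamed or otherwise unindexed cost nothing beyond the extra abbreviation.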
When all patches related to this are merged, we are able to show that evaluating an expression such as: ``` lldb --batch -o 'b CodeGenFunction::GenerateCode' -o run -o 'expr Fn' -- \ clang++ -c -g test.cpp -o /dev/null ``` is far faster: from ~5000 ms to ~1500ms. Building llvm-project + clang with and without this patch, and looking at its impact on object file size: ``` ls -la $(find build_stage2_Debug_idx_parent_assert_dwarf5 -name \*.cpp.o) | awk '{s+=$5} END {printf "%\047d\n", s}' 11,507,327,592 -la $(find build_stage2_Debug_no_idx_parent_assert_dwarf5 -name \*.cpp.o) | awk '{s+=$5} END {printf "%\047d\n", s}' 11,436,446,616 ``` That is, an increase of 0.62% in total object file size. Looking only at debug_names: ``` $stage1_build/bin/llvm-objdump --section-headers $(find build_stage2_Debug_idx_parent_assert_dwarf5 -name \*.cpp.o) | grep __debug_names | awk '{s+="0x"$3} END {printf "%\047d\n", s}' 440,772,348 $stage1_build/bin/llvm-objdump --section-headers $(find build_stage2_Debug_no_idx_parent_assert_dwarf5 -name \*.cpp.o) | grep __debug_names | awk '{s+="0x"$3} END {printf "%\047d\n", s}' 369,867,920 ``` That is an increase of 19%. DWARF Linkers need to be changed in order to support this. This commit already brings support to "base" linker, but it does not attempt to modify the parallel linker. Accelerator entries refer to the corresponding DIE offset, and this patch also requires the parent DIE offset -- it's not clear how the parallel linker can access this. It may be obvious to someone familiar with it, but it would be nice to get help from its authors. [1]: https://discourse.llvm.org/t/rfc-improve-dwarf-5-debug-names-type-lookup-parsing-speed/74151/ --- llvm/include/llvm/CodeGen/AccelTable.h | 53 +++++++- llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp | 127 ++++++++++++++++-- llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp | 18 ++- .../DWARFLinker/Parallel/DWARFLinkerImpl.cpp | 3 +- llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp | 16 ++- .../DebugInfo/Generic/debug-names-one-cu.ll | 37 ++++- .../test/DebugInfo/X86/debug-names-dwarf64.ll | 16 ++- .../DebugInfo/X86/debug-names-end-of-list.ll | 6 +- .../X86/debug-names-parents-same-offset.ll | 66 +++++++++ llvm/test/DebugInfo/X86/debug-names-types.ll | 58 +++++--- .../ARM/accel-imported-declarations.test | 2 + .../ARM/dwarf5-dwarf4-combination-macho.test | 12 +- 12 files changed, 356 insertions(+), 58 deletions(-) create mode 100644 llvm/test/DebugInfo/X86/debug-names-parents-same-offset.ll diff --git a/llvm/include/llvm/CodeGen/AccelTable.h b/llvm/include/llvm/CodeGen/AccelTable.h index 0638fbffda4f3..e6a661696354b 100644 --- a/llvm/include/llvm/CodeGen/AccelTable.h +++ b/llvm/include/llvm/CodeGen/AccelTable.h @@ -255,6 +255,20 @@ class AppleAccelTableData : public AccelTableData { static uint32_t hash(StringRef Buffer) { return djbHash(Buffer); } }; +/// Helper class to identify an entry in DWARF5AccelTable based on their DIE +/// offset and UnitID. +struct OffsetAndUnitID : std::pair { + using Base = std::pair; + OffsetAndUnitID(Base B) : Base(B) {} + + OffsetAndUnitID(uint64_t Offset, uint32_t UnitID) : Base(Offset, UnitID) {} + uint64_t offset() const { return first; }; + uint32_t unitID() const { return second; }; +}; + +template <> +struct DenseMapInfo : DenseMapInfo {}; + /// The Data class implementation for DWARF v5 accelerator table. Unlike the /// Apple Data classes, this class is just a DIE wrapper, and does not know to /// serialize itself. 
The complete serialization logic is in the @@ -270,9 +284,12 @@ class DWARF5AccelTableData : public AccelTableData { DWARF5AccelTableData(const DIE &Die, const uint32_t UnitID, const bool IsTU = false); - DWARF5AccelTableData(const uint64_t DieOffset, const unsigned DieTag, - const unsigned UnitID, const bool IsTU = false) - : OffsetVal(DieOffset), DieTag(DieTag), UnitID(UnitID), IsTU(IsTU) {} + DWARF5AccelTableData(const uint64_t DieOffset, + const std::optional DefiningParentOffset, + const unsigned DieTag, const unsigned UnitID, + const bool IsTU = false) + : OffsetVal(DieOffset), ParentOffset(DefiningParentOffset), + DieTag(DieTag), UnitID(UnitID), IsTU(IsTU) {} #ifndef NDEBUG void print(raw_ostream &OS) const override; @@ -282,19 +299,44 @@ class DWARF5AccelTableData : public AccelTableData { assert(isNormalized() && "Accessing DIE Offset before normalizing."); return std::get(OffsetVal); } + + OffsetAndUnitID getDieOffsetAndUnitID() const { + return {getDieOffset(), UnitID}; + } + unsigned getDieTag() const { return DieTag; } unsigned getUnitID() const { return UnitID; } bool isTU() const { return IsTU; } void normalizeDIEToOffset() { assert(!isNormalized() && "Accessing offset after normalizing."); - OffsetVal = std::get(OffsetVal)->getOffset(); + const DIE *Entry = std::get(OffsetVal); + ParentOffset = getDefiningParentDieOffset(*Entry); + OffsetVal = Entry->getOffset(); } bool isNormalized() const { return std::holds_alternative(OffsetVal); } + std::optional getParentDieOffset() const { + if (auto OffsetAndId = getParentDieOffsetAndUnitID()) + return OffsetAndId->offset(); + return {}; + } + + std::optional getParentDieOffsetAndUnitID() const { + assert(isNormalized() && "Accessing DIE Offset before normalizing."); + if (!ParentOffset) + return std::nullopt; + return OffsetAndUnitID(*ParentOffset, getUnitID()); + } + + /// If `Die` has a non-null parent and the parent is not a declaration, + /// return its offset. + static std::optional getDefiningParentDieOffset(const DIE &Die); + protected: std::variant OffsetVal; + std::optional ParentOffset; uint32_t DieTag : 16; uint32_t UnitID : 15; uint32_t IsTU : 1; @@ -341,7 +383,8 @@ class DWARF5AccelTable : public AccelTable { void addTypeEntries(DWARF5AccelTable &Table) { for (auto &Entry : Table.getEntries()) { for (auto *Data : Entry.second.getValues()) { - addName(Entry.second.Name, Data->getDieOffset(), Data->getDieTag(), + addName(Entry.second.Name, Data->getDieOffset(), + Data->getParentDieOffset(), Data->getDieTag(), Data->getUnitID(), true); } } diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp index b72c17aa6f54a..1024aabf2ab0f 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp @@ -13,6 +13,7 @@ #include "llvm/CodeGen/AccelTable.h" #include "DwarfCompileUnit.h" #include "DwarfUnit.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/Twine.h" #include "llvm/BinaryFormat/Dwarf.h" @@ -207,7 +208,7 @@ class Dwarf5AccelTableWriter : public AccelTableWriter { }; Header Header; - DenseMap> + DenseMap> Abbreviations; ArrayRef> CompUnits; ArrayRef> TypeUnits; @@ -220,6 +221,8 @@ class Dwarf5AccelTableWriter : public AccelTableWriter { MCSymbol *EntryPool = Asm->createTempSymbol("names_entries"); // Indicates if this module is built with Split Dwarf enabled. bool IsSplitDwarf = false; + /// Stores the DIE offsets which are indexed by this table. 
+ DenseSet IndexedOffsets; void populateAbbrevsMap(); @@ -228,8 +231,11 @@ class Dwarf5AccelTableWriter : public AccelTableWriter { void emitBuckets() const; void emitStringOffsets() const; void emitAbbrevs() const; - void emitEntry(const DWARF5AccelTableData &Entry) const; - void emitData() const; + void emitEntry( + const DWARF5AccelTableData &Entry, + const DenseMap &DIEOffsetToAccelEntryLabel, + DenseSet &EmittedAccelEntrySymbols) const; + void emitData(); public: Dwarf5AccelTableWriter( @@ -395,23 +401,72 @@ void Dwarf5AccelTableWriter::Header::emit(Dwarf5AccelTableWriter &Ctx) { Asm->OutStreamer->emitBytes({AugmentationString, AugmentationStringSize}); } -static uint32_t constexpr LowerBitSize = dwarf::DW_IDX_type_hash; +std::optional +DWARF5AccelTableData::getDefiningParentDieOffset(const DIE &Die) { + if (auto *Parent = Die.getParent(); + Parent && !Parent->findAttribute(dwarf::Attribute::DW_AT_declaration)) + return Parent->getOffset(); + return {}; +} + +enum IdxParentEncoding : uint8_t { + NoIndexedParent = 0, /// Parent information present but parent isn't indexed. + Ref4 = 1, /// Parent information present and parent is indexed. + NoParent = 2, /// Parent information missing. +}; + +static uint32_t constexpr NumBitsIdxParent = 2; + +uint8_t encodeIdxParent(const std::optional MaybeParentForm) { + if (!MaybeParentForm) + return NoParent; + switch (*MaybeParentForm) { + case dwarf::Form::DW_FORM_flag_present: + return NoIndexedParent; + case dwarf::Form::DW_FORM_ref4: + return Ref4; + default: + // This is not crashing on bad input: we should only reach this if the + // internal compiler logic is faulty; see getFormForIdxParent. + llvm_unreachable("Bad form for IDX_parent"); + } +} + +static uint32_t constexpr ParentBitOffset = dwarf::DW_IDX_type_hash; +static uint32_t constexpr TagBitOffset = ParentBitOffset + NumBitsIdxParent; static uint32_t getTagFromAbbreviationTag(const uint32_t AbbrvTag) { - return AbbrvTag >> LowerBitSize; + return AbbrvTag >> TagBitOffset; } /// Constructs a unique AbbrevTag that captures what a DIE accesses. /// Using this tag we can emit a unique abbreviation for each DIE. static uint32_t constructAbbreviationTag( const unsigned Tag, - const std::optional &EntryRet) { + const std::optional &EntryRet, + std::optional MaybeParentForm) { uint32_t AbbrvTag = 0; if (EntryRet) AbbrvTag |= 1 << EntryRet->Encoding.Index; AbbrvTag |= 1 << dwarf::DW_IDX_die_offset; - AbbrvTag |= Tag << LowerBitSize; + AbbrvTag |= 1 << dwarf::DW_IDX_parent; + AbbrvTag |= encodeIdxParent(MaybeParentForm) << ParentBitOffset; + AbbrvTag |= Tag << TagBitOffset; return AbbrvTag; } + +static std::optional +getFormForIdxParent(const DenseSet &IndexedOffsets, + std::optional ParentOffset) { + // No parent information + if (!ParentOffset) + return std::nullopt; + // Parent is indexed by this table. + if (IndexedOffsets.contains(*ParentOffset)) + return dwarf::Form::DW_FORM_ref4; + // Parent is not indexed by this table. 
+ return dwarf::Form::DW_FORM_flag_present; +} + void Dwarf5AccelTableWriter::populateAbbrevsMap() { for (auto &Bucket : Contents.getBuckets()) { for (auto *Hash : Bucket) { @@ -419,12 +474,17 @@ void Dwarf5AccelTableWriter::populateAbbrevsMap() { std::optional EntryRet = getIndexForEntry(*Value); unsigned Tag = Value->getDieTag(); - uint32_t AbbrvTag = constructAbbreviationTag(Tag, EntryRet); + std::optional MaybeParentForm = getFormForIdxParent( + IndexedOffsets, Value->getParentDieOffsetAndUnitID()); + uint32_t AbbrvTag = + constructAbbreviationTag(Tag, EntryRet, MaybeParentForm); if (Abbreviations.count(AbbrvTag) == 0) { - SmallVector UA; + SmallVector UA; if (EntryRet) UA.push_back(EntryRet->Encoding); UA.push_back({dwarf::DW_IDX_die_offset, dwarf::DW_FORM_ref4}); + if (MaybeParentForm) + UA.push_back({dwarf::DW_IDX_parent, *MaybeParentForm}); Abbreviations.try_emplace(AbbrvTag, UA); } } @@ -496,15 +556,34 @@ void Dwarf5AccelTableWriter::emitAbbrevs() const { } void Dwarf5AccelTableWriter::emitEntry( - const DWARF5AccelTableData &Entry) const { + const DWARF5AccelTableData &Entry, + const DenseMap &DIEOffsetToAccelEntryLabel, + DenseSet &EmittedAccelEntrySymbols) const { std::optional EntryRet = getIndexForEntry(Entry); - uint32_t AbbrvTag = constructAbbreviationTag(Entry.getDieTag(), EntryRet); + std::optional MaybeParentOffset = + Entry.getParentDieOffsetAndUnitID(); + std::optional MaybeParentForm = + getFormForIdxParent(IndexedOffsets, MaybeParentOffset); + uint32_t AbbrvTag = + constructAbbreviationTag(Entry.getDieTag(), EntryRet, MaybeParentForm); auto AbbrevIt = Abbreviations.find(AbbrvTag); assert(AbbrevIt != Abbreviations.end() && "Why wasn't this abbrev generated?"); assert(getTagFromAbbreviationTag(AbbrevIt->first) == Entry.getDieTag() && "Invalid Tag"); + + auto EntrySymbolIt = + DIEOffsetToAccelEntryLabel.find(Entry.getDieOffsetAndUnitID()); + assert(EntrySymbolIt != DIEOffsetToAccelEntryLabel.end()); + MCSymbol *EntrySymbol = EntrySymbolIt->getSecond(); + + // Emit the label for this Entry, so that IDX_parents may refer to it. + // Note: a DIE may have multiple accelerator Entries; this check avoids + // creating/emitting multiple labels for the same DIE. + if (EmittedAccelEntrySymbols.insert(EntrySymbol).second) + Asm->OutStreamer->emitLabel(EntrySymbol); + Asm->emitULEB128(AbbrevIt->first, "Abbreviation code"); for (const auto &AttrEnc : AbbrevIt->second) { @@ -520,20 +599,34 @@ void Dwarf5AccelTableWriter::emitEntry( assert(AttrEnc.Form == dwarf::DW_FORM_ref4); Asm->emitInt32(Entry.getDieOffset()); break; + case dwarf::DW_IDX_parent: { + if (AttrEnc.Form == dwarf::Form::DW_FORM_flag_present) + break; + auto ParentSymbolIt = DIEOffsetToAccelEntryLabel.find(*MaybeParentOffset); + assert(ParentSymbolIt != DIEOffsetToAccelEntryLabel.end()); + Asm->emitLabelDifference(ParentSymbolIt->getSecond(), EntryPool, 4); + break; + } default: llvm_unreachable("Unexpected index attribute!"); } } } -void Dwarf5AccelTableWriter::emitData() const { +void Dwarf5AccelTableWriter::emitData() { + DenseMap DIEOffsetToAccelEntryLabel; + + for (OffsetAndUnitID Offset : IndexedOffsets) + DIEOffsetToAccelEntryLabel.insert({Offset, Asm->createTempSymbol("")}); + Asm->OutStreamer->emitLabel(EntryPool); + DenseSet EmittedAccelEntrySymbols; for (auto &Bucket : Contents.getBuckets()) { for (auto *Hash : Bucket) { // Remember to emit the label for our offset. 
Asm->OutStreamer->emitLabel(Hash->Sym); - for (const auto *Value : Hash->Values) - emitEntry(*static_cast(Value)); + for (const auto *Value : Hash->getValues()) + emitEntry(*Value, DIEOffsetToAccelEntryLabel, EmittedAccelEntrySymbols); Asm->OutStreamer->AddComment("End of list: " + Hash->Name.getString()); Asm->emitInt8(0); } @@ -555,6 +648,12 @@ Dwarf5AccelTableWriter::Dwarf5AccelTableWriter( CompUnits(CompUnits), TypeUnits(TypeUnits), getIndexForEntry(std::move(getIndexForEntry)), IsSplitDwarf(IsSplitDwarf) { + + for (auto &Bucket : Contents.getBuckets()) + for (auto *Hash : Bucket) + for (auto *Value : Hash->getValues()) + IndexedOffsets.insert(Value->getDieOffsetAndUnitID()); + populateAbbrevsMap(); } diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index d2b4db3b0c6fb..6c85f9ac24bec 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -2240,14 +2240,20 @@ void DWARFLinker::emitAcceleratorEntriesForUnit(CompileUnit &Unit) { } break; case AccelTableKind::DebugNames: { for (const auto &Namespace : Unit.getNamespaces()) - DebugNames.addName(Namespace.Name, Namespace.Die->getOffset(), - Namespace.Die->getTag(), Unit.getUniqueID()); + DebugNames.addName( + Namespace.Name, Namespace.Die->getOffset(), + DWARF5AccelTableData::getDefiningParentDieOffset(*Namespace.Die), + Namespace.Die->getTag(), Unit.getUniqueID()); for (const auto &Pubname : Unit.getPubnames()) - DebugNames.addName(Pubname.Name, Pubname.Die->getOffset(), - Pubname.Die->getTag(), Unit.getUniqueID()); + DebugNames.addName( + Pubname.Name, Pubname.Die->getOffset(), + DWARF5AccelTableData::getDefiningParentDieOffset(*Pubname.Die), + Pubname.Die->getTag(), Unit.getUniqueID()); for (const auto &Pubtype : Unit.getPubtypes()) - DebugNames.addName(Pubtype.Name, Pubtype.Die->getOffset(), - Pubtype.Die->getTag(), Unit.getUniqueID()); + DebugNames.addName( + Pubtype.Name, Pubtype.Die->getOffset(), + DWARF5AccelTableData::getDefiningParentDieOffset(*Pubtype.Die), + Pubtype.Die->getTag(), Unit.getUniqueID()); } break; } } diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp index f127214927a5d..56828e75ac0b4 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp @@ -1358,7 +1358,8 @@ void DWARFLinkerImpl::emitDWARFv5DebugNamesSection(const Triple &TargetTriple) { case DwarfUnit::AccelType::Namespace: case DwarfUnit::AccelType::Type: { DebugNames->addName(*DebugStrStrings.getExistingEntry(Info.String), - Info.OutOffset, Info.Tag, CU->getUniqueID()); + Info.OutOffset, std::nullopt /*ParentDIEOffset*/, + Info.Tag, CU->getUniqueID()); } break; default: diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp index d25b732fdba3f..c4c14f5e2c9d3 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "llvm/DebugInfo/DWARF/DWARFVerifier.h" #include "llvm/ADT/IntervalMap.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallSet.h" #include "llvm/BinaryFormat/Dwarf.h" #include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h" @@ -1272,6 +1273,20 @@ unsigned DWARFVerifier::verifyNameIndexAttribute( NI.getUnitOffset(), Abbr.Code, AttrEnc.Form, dwarf::DW_FORM_data8); return 1; } + return 0; + } + + if 
(AttrEnc.Index == dwarf::DW_IDX_parent) { + constexpr static auto AllowedForms = {dwarf::Form::DW_FORM_flag_present, + dwarf::Form::DW_FORM_ref4}; + if (!is_contained(AllowedForms, AttrEnc.Form)) { + error() << formatv("NameIndex @ {0:x}: Abbreviation {1:x}: DW_IDX_parent " + "uses an unexpected form {2} (should be " + "DW_FORM_ref4 or DW_FORM_flag_present).\n", + NI.getUnitOffset(), Abbr.Code, AttrEnc.Form); + return 1; + } + return 0; } // A list of known index attributes and their expected form classes. @@ -1286,7 +1301,6 @@ unsigned DWARFVerifier::verifyNameIndexAttribute( {dwarf::DW_IDX_compile_unit, DWARFFormValue::FC_Constant, {"constant"}}, {dwarf::DW_IDX_type_unit, DWARFFormValue::FC_Constant, {"constant"}}, {dwarf::DW_IDX_die_offset, DWARFFormValue::FC_Reference, {"reference"}}, - {dwarf::DW_IDX_parent, DWARFFormValue::FC_Constant, {"constant"}}, }; ArrayRef TableRef(Table); diff --git a/llvm/test/DebugInfo/Generic/debug-names-one-cu.ll b/llvm/test/DebugInfo/Generic/debug-names-one-cu.ll index 469be45c312e8..92212a24bd153 100644 --- a/llvm/test/DebugInfo/Generic/debug-names-one-cu.ll +++ b/llvm/test/DebugInfo/Generic/debug-names-one-cu.ll @@ -7,22 +7,49 @@ ; CHECK: CU count: 1 ; CHECK: Local TU count: 0 ; CHECK: Foreign TU count: 0 -; CHECK: Name count: 1 +; CHECK: Name count: 3 ; CHECK: CU[0]: 0x{{[0-9a-f]*}} +; CHECK: Abbreviation [[ABBREV_STRUCT:0x[0-9a-f]*]] { +; CHECK-NEXT: Tag: DW_TAG_structure_type +; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: } + ; CHECK: Abbreviation [[ABBREV:0x[0-9a-f]*]] ; CHECK-NEXT: Tag: DW_TAG_variable ; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present +; CHECK-NEXT: } + +; The entry for A::B must not have an IDX_Parent, since A is only a forward +; declaration. +; CHECK: String: 0x{{[0-9a-f]*}} "B" +; CHECK-NEXT: Entry +; CHECK-NEXT: Abbrev: [[ABBREV_STRUCT]] +; CHECK-NEXT: Tag: DW_TAG_structure_type +; CHECK-NEXT: DW_IDX_die_offset: 0x{{[0-9a-f]*}} +; CHECK-NEXT: } + +; CHECK: String: 0x{{[0-9a-f]*}} "someA_B" +; CHECK-NEXT: Entry +; CHECK-NEXT: Abbrev: [[ABBREV]] +; CHECK-NEXT: Tag: DW_TAG_variable +; CHECK-NEXT: DW_IDX_die_offset: 0x{{[0-9a-f]*}} +; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: } ; CHECK: String: 0x{{[0-9a-f]*}} "foobar" ; CHECK-NEXT: Entry ; CHECK-NEXT: Abbrev: [[ABBREV]] ; CHECK-NEXT: Tag: DW_TAG_variable ; CHECK-NEXT: DW_IDX_die_offset: 0x{{[0-9a-f]*}} +; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: } ; VERIFY: No errors. 
@foobar = common dso_local global ptr null, align 8, !dbg !0 +@someA_B = common dso_local global ptr null, align 8, !dbg !18 !llvm.dbg.cu = !{!2} !llvm.module.flags = !{!7, !8, !9} @@ -33,9 +60,15 @@ !2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, producer: "clang version 7.0.0 (trunk 325496) (llvm/trunk 325732)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, globals: !5) !3 = !DIFile(filename: "/tmp/cu1.c", directory: "/tmp") !4 = !{} -!5 = !{!0} +!5 = !{!0, !18} !6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) !7 = !{i32 2, !"Dwarf Version", i32 4} !8 = !{i32 2, !"Debug Info Version", i32 3} !9 = !{i32 1, !"wchar_size", i32 4} !10 = !{!"clang version 7.0.0 (trunk 325496) (llvm/trunk 325732)"} + +!13 = !{} +!15 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "B", scope: !16, file: !3, line: 3, size: 8, elements: !13, identifier: "type_A::B") +!16 = !DICompositeType(tag: DW_TAG_structure_type, name: "A", file: !3, line: 1, size: 8, flags: DIFlagFwdDecl, identifier: "type_A") +!17 = distinct !DIGlobalVariable(name: "someA_B", scope: !2, file: !3, line: 1, type: !15, isLocal: false, isDefinition: true) +!18 = !DIGlobalVariableExpression(var: !17, expr: !DIExpression()) diff --git a/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll index b94bdbcb12010..a6855d77a34ac 100644 --- a/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll +++ b/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll @@ -30,21 +30,25 @@ ; CHECK-NEXT: CU[0]: 0x00000000 ; CHECK-NEXT: ] ; CHECK-NEXT: Abbreviations [ +; CHECK-NEXT: Abbreviation [[ABBREV_LABEL:0x[0-9a-f]*]] { +; CHECK-NEXT: Tag: DW_TAG_label +; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: DW_IDX_parent: DW_FORM_ref4 +; CHECK-NEXT: } ; CHECK-NEXT: Abbreviation [[ABBREV:0x[0-9a-f]*]] { ; CHECK-NEXT: Tag: DW_TAG_base_type ; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-NEXT: } ; CHECK-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { ; CHECK-NEXT: Tag: DW_TAG_variable ; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-NEXT: } ; CHECK-NEXT: Abbreviation [[ABBREV_SP:0x[0-9a-f]*]] { ; CHECK-NEXT: Tag: DW_TAG_subprogram ; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 -; CHECK-NEXT: } -; CHECK-NEXT: Abbreviation [[ABBREV_LABEL:0x[0-9a-f]*]] { -; CHECK-NEXT: Tag: DW_TAG_label -; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-NEXT: } ; CHECK-NEXT: ] ; CHECK-NEXT: Bucket 0 [ @@ -55,6 +59,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV]] ; CHECK-NEXT: Tag: DW_TAG_base_type ; CHECK-NEXT: DW_IDX_die_offset: [[TYPEDIE]] +; CHECK-NEXT: DW_IDX_parent: true ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] @@ -66,6 +71,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV1]] ; CHECK-NEXT: Tag: DW_TAG_variable ; CHECK-NEXT: DW_IDX_die_offset: [[VARDIE]] +; CHECK-NEXT: DW_IDX_parent: true ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: Name 3 { @@ -75,6 +81,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV_SP]] ; CHECK-NEXT: Tag: DW_TAG_subprogram ; CHECK-NEXT: DW_IDX_die_offset: [[SPDIE]] +; CHECK-NEXT: DW_IDX_parent: true ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] @@ -89,6 +96,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV_LABEL]] ; CHECK-NEXT: Tag: DW_TAG_label ; CHECK-NEXT: DW_IDX_die_offset: [[LABELDIE]] +; CHECK-NEXT: DW_IDX_parent: 0x{{.*}} ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] diff --git 
a/llvm/test/DebugInfo/X86/debug-names-end-of-list.ll b/llvm/test/DebugInfo/X86/debug-names-end-of-list.ll index f57168a8eff38..b8d4046201c34 100644 --- a/llvm/test/DebugInfo/X86/debug-names-end-of-list.ll +++ b/llvm/test/DebugInfo/X86/debug-names-end-of-list.ll @@ -6,8 +6,10 @@ ; CHECK: .section .debug_names,"",@progbits ; CHECK: .Lnames_entries0: -; CHECK: .byte 0 # End of list: int -; CHECK: .byte 0 # End of list: foo +; CHECK: .byte 0 +; CHECK-NEXT: # End of list: int +; CHECK: .byte 0 +; CHECK-NEXT: # End of list: foo @foo = common dso_local global i32 0, align 4, !dbg !5 diff --git a/llvm/test/DebugInfo/X86/debug-names-parents-same-offset.ll b/llvm/test/DebugInfo/X86/debug-names-parents-same-offset.ll new file mode 100644 index 0000000000000..9170cfc8508fa --- /dev/null +++ b/llvm/test/DebugInfo/X86/debug-names-parents-same-offset.ll @@ -0,0 +1,66 @@ +; RUN: llc -mtriple=x86_64 -generate-type-units -dwarf-version=5 -filetype=obj %s -o %t +; RUN: llvm-dwarfdump -debug-names %t | FileCheck %s + +; Two structs that have different names but are structurally identical: +; namespace MyNamespace { +; struct MyStruct1 { +; char c1; +; }; +; struct MyStruct2 { +; char c2; +; }; +; } // namespace MyNamespace +; MyNamespace::MyStruct1 gv1; +; MyNamespace::MyStruct2 gv2; + +; Using two TUs, this should produce DIE structures with the same offset. +; We test that accelerator table generation works with this. + +; CHECK: String: {{.*}} "MyStruct2" +; CHECK-NEXT: Entry @ {{.*}} { +; CHECK-NEXT: Abbrev: [[ABBREV:0x.*]] +; CHECK-NEXT: Tag: DW_TAG_structure_type +; CHECK-NEXT: DW_IDX_type_unit: 0x01 +; CHECK-NEXT: DW_IDX_die_offset: [[DieOffset:0x.*]] +; CHECK-NEXT: DW_IDX_parent: [[Parent1:0x.*]] +; CHECK: String: {{.*}} "MyStruct1" +; CHECK-NEXT: Entry @ {{.*}} { +; CHECK-NEXT: Abbrev: [[ABBREV]] +; CHECK-NEXT: Tag: DW_TAG_structure_type +; CHECK-NEXT: DW_IDX_type_unit: 0x00 +; CHECK-NEXT: DW_IDX_die_offset: [[DieOffset:0x.*]] +; CHECK-NOT: DW_IDX_parent: [[Parent1]] +; CHECK-NEXT: DW_IDX_parent: 0x{{.*}} + + +%"struct.MyNamespace::MyStruct1" = type { i8 } +%"struct.MyNamespace::MyStruct2" = type { i8 } + +@gv1 = dso_local global %"struct.MyNamespace::MyStruct1" zeroinitializer, align 1, !dbg !0 +@gv2 = dso_local global %"struct.MyNamespace::MyStruct2" zeroinitializer, align 1, !dbg !5 + +!llvm.dbg.cu = !{!2} +!llvm.module.flags = !{!15, !16, !17, !18, !19} +!llvm.ident = !{!20} + +!0 = !DIGlobalVariableExpression(var: !1, expr: !DIExpression()) +!1 = distinct !DIGlobalVariable(name: "gv1", scope: !2, file: !3, line: 10, type: !12, isLocal: false, isDefinition: true) +!2 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !3, producer: "blah", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, globals: !4, splitDebugInlining: false) +!3 = !DIFile(filename: "two_tus.cpp", directory: "blah", checksumkind: CSK_MD5, checksum: "69acf04f32811fe7fd35449b58c3f5b1") +!4 = !{!0, !5} +!5 = !DIGlobalVariableExpression(var: !6, expr: !DIExpression()) +!6 = distinct !DIGlobalVariable(name: "gv2", scope: !2, file: !3, line: 11, type: !7, isLocal: false, isDefinition: true) +!7 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "MyStruct2", scope: !8, file: !3, line: 5, size: 8, flags: DIFlagTypePassByValue, elements: !9, identifier: "_ZTSN11MyNamespace9MyStruct2E") +!8 = !DINamespace(name: "MyNamespace", scope: null) +!9 = !{!10} +!10 = !DIDerivedType(tag: DW_TAG_member, name: "c2", scope: !7, file: !3, line: 6, baseType: !11, size: 8) +!11 = !DIBasicType(name: "char", size: 8, 
encoding: DW_ATE_signed_char) +!12 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "MyStruct1", scope: !8, file: !3, line: 2, size: 8, flags: DIFlagTypePassByValue, elements: !13, identifier: "_ZTSN11MyNamespace9MyStruct1E") +!13 = !{!14} +!14 = !DIDerivedType(tag: DW_TAG_member, name: "c1", scope: !12, file: !3, line: 3, baseType: !11, size: 8) +!15 = !{i32 7, !"Dwarf Version", i32 5} +!16 = !{i32 2, !"Debug Info Version", i32 3} +!17 = !{i32 1, !"wchar_size", i32 4} +!18 = !{i32 7, !"uwtable", i32 2} +!19 = !{i32 7, !"frame-pointer", i32 2} +!20 = !{!"blah"} diff --git a/llvm/test/DebugInfo/X86/debug-names-types.ll b/llvm/test/DebugInfo/X86/debug-names-types.ll index 501b7efd88eb9..c44e82578f14a 100644 --- a/llvm/test/DebugInfo/X86/debug-names-types.ll +++ b/llvm/test/DebugInfo/X86/debug-names-types.ll @@ -37,27 +37,32 @@ ; CHECK-NEXT: LocalTU[0]: 0x00000000 ; CHECK-NEXT: ] ; CHECK: Abbreviations [ -; CHECK-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; CHECK-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { ; CHECK-NEXT: Tag: DW_TAG_structure_type +; CHECK-NEXT: DW_IDX_type_unit: DW_FORM_data1 ; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-NEXT: } -; CHECK-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { -; CHECK-NEXT: Tag: DW_TAG_structure_type +; CHECK-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] { +; CHECK-NEXT: Tag: DW_TAG_base_type ; CHECK-NEXT: DW_IDX_type_unit: DW_FORM_data1 ; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-NEXT: } ; CHECK-NEXT: Abbreviation [[ABBREV:0x[0-9a-f]*]] { ; CHECK-NEXT: Tag: DW_TAG_base_type ; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-NEXT: } -; CHECK-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { -; CHECK-NEXT: Tag: DW_TAG_subprogram +; CHECK-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { +; CHECK-NEXT: Tag: DW_TAG_structure_type ; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-NEXT: } -; CHECK-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] { -; CHECK-NEXT: Tag: DW_TAG_base_type -; CHECK-NEXT: DW_IDX_type_unit: DW_FORM_data1 +; CHECK-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { +; CHECK-NEXT: Tag: DW_TAG_subprogram ; CHECK-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-NEXT: } ; CHECK-NEXT: ] ; CHECK-NEXT: Bucket 0 [ @@ -68,6 +73,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV]] ; CHECK-NEXT: Tag: DW_TAG_base_type ; CHECK-NEXT: DW_IDX_die_offset: 0x0000003e +; CHECK-NEXT: DW_IDX_parent: true ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] @@ -80,11 +86,13 @@ ; CHECK-NEXT: Tag: DW_TAG_structure_type ; CHECK-NEXT: DW_IDX_type_unit: 0x00 ; CHECK-NEXT: DW_IDX_die_offset: 0x00000023 +; CHECK-NEXT: DW_IDX_parent: true ; CHECK-NEXT: } -; CHECK-NEXT: Entry @ 0xaa { +; CHECK-NEXT: Entry @ {{.+}} { ; CHECK-NEXT: Abbrev: [[ABBREV1]] ; CHECK-NEXT: Tag: DW_TAG_structure_type ; CHECK-NEXT: DW_IDX_die_offset: 0x00000042 +; CHECK-NEXT: DW_IDX_parent: true ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] @@ -96,6 +104,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV2]] ; CHECK-NEXT: Tag: DW_TAG_subprogram ; CHECK-NEXT: DW_IDX_die_offset: 0x00000023 +; CHECK-NEXT: DW_IDX_parent: true ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] @@ -108,6 +117,7 @@ ; CHECK-NEXT: Tag: DW_TAG_base_type ; CHECK-NEXT: DW_IDX_type_unit: 0x00 ; CHECK-NEXT: DW_IDX_die_offset: 0x00000038 +; CHECK-NEXT: DW_IDX_parent: true ; CHECK-NEXT: } ; 
CHECK-NEXT: } ; CHECK-NEXT: ] @@ -120,7 +130,7 @@ ; CHECK-SPLIT: Foreign TU count: 1 ; CHECK-SPLIT-NEXT: Bucket count: 4 ; CHECK-SPLIT-NEXT: Name count: 4 -; CHECK-SPLIT-NEXT: Abbreviations table size: 0x28 +; CHECK-SPLIT-NEXT: Abbreviations table size: 0x32 ; CHECK-SPLIT-NEXT: Augmentation: 'LLVM0700' ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: Compilation Unit offsets [ @@ -130,27 +140,32 @@ ; CHECK-SPLIT-NEXT: ForeignTU[0]: 0x675d23e4f33235f2 ; CHECK-SPLIT-NEXT: ] ; CHECK-SPLIT-NEXT: Abbreviations [ -; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV:0x[0-9a-f]*]] { +; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { ; CHECK-SPLIT-NEXT: Tag: DW_TAG_structure_type +; CHECK-SPLIT-NEXT: DW_IDX_type_unit: DW_FORM_data1 ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-SPLIT-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-SPLIT-NEXT: } -; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV1:0x[0-9a-f]*]] { -; CHECK-SPLIT-NEXT: Tag: DW_TAG_structure_type +; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] { +; CHECK-SPLIT-NEXT: Tag: DW_TAG_base_type ; CHECK-SPLIT-NEXT: DW_IDX_type_unit: DW_FORM_data1 ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-SPLIT-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV2:0x[0-9a-f]*]] { ; CHECK-SPLIT-NEXT: Tag: DW_TAG_base_type ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-SPLIT-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-SPLIT-NEXT: } -; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { -; CHECK-SPLIT-NEXT: Tag: DW_TAG_subprogram +; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV:0x[0-9a-f]*]] { +; CHECK-SPLIT-NEXT: Tag: DW_TAG_structure_type ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-SPLIT-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-SPLIT-NEXT: } -; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV4:0x[0-9a-f]*]] { -; CHECK-SPLIT-NEXT: Tag: DW_TAG_base_type -; CHECK-SPLIT-NEXT: DW_IDX_type_unit: DW_FORM_data1 +; CHECK-SPLIT-NEXT: Abbreviation [[ABBREV3:0x[0-9a-f]*]] { +; CHECK-SPLIT-NEXT: Tag: DW_TAG_subprogram ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: DW_FORM_ref4 +; CHECK-SPLIT-NEXT: DW_IDX_parent: DW_FORM_flag_present ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: ] ; CHECK-SPLIT-NEXT: Bucket 0 [ @@ -161,6 +176,7 @@ ; CHECK-SPLIT-NEXT: Abbrev: [[ABBREV2]] ; CHECK-SPLIT-NEXT: Tag: DW_TAG_base_type ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000035 +; CHECK-SPLIT-NEXT: DW_IDX_parent: true ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: ] @@ -173,11 +189,13 @@ ; CHECK-SPLIT-NEXT: Tag: DW_TAG_structure_type ; CHECK-SPLIT-NEXT: DW_IDX_type_unit: 0x00 ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000021 +; CHECK-SPLIT-NEXT: DW_IDX_parent: true ; CHECK-SPLIT-NEXT: } -; CHECK-SPLIT-NEXT: Entry @ 0xae { +; CHECK-SPLIT-NEXT: Entry @ {{.*}} { ; CHECK-SPLIT-NEXT: Abbrev: [[ABBREV]] ; CHECK-SPLIT-NEXT: Tag: DW_TAG_structure_type ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000039 +; CHECK-SPLIT-NEXT: DW_IDX_parent: true ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: ] @@ -189,6 +207,7 @@ ; CHECK-SPLIT-NEXT: Abbrev: [[ABBREV3]] ; CHECK-SPLIT-NEXT: Tag: DW_TAG_subprogram ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x0000001a +; CHECK-SPLIT-NEXT: DW_IDX_parent: true ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: ] @@ -201,6 +220,7 @@ ; CHECK-SPLIT-NEXT: Tag: DW_TAG_base_type ; CHECK-SPLIT-NEXT: DW_IDX_type_unit: 0x00 ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000036 +; CHECK-SPLIT-NEXT: DW_IDX_parent: true ; CHECK-SPLIT-NEXT: } ; 
CHECK-SPLIT-NEXT: }
; CHECK-SPLIT-NEXT: ]

diff --git a/llvm/test/tools/dsymutil/ARM/accel-imported-declarations.test b/llvm/test/tools/dsymutil/ARM/accel-imported-declarations.test
index 057e89d060b1d..a8bf191e036f7 100644
--- a/llvm/test/tools/dsymutil/ARM/accel-imported-declarations.test
+++ b/llvm/test/tools/dsymutil/ARM/accel-imported-declarations.test
@@ -23,11 +23,13 @@ DWARF-NEXT: Entry {{.*}} {
 DWARF-NEXT: Abbrev: {{.*}}
 DWARF-NEXT: Tag: DW_TAG_namespace
 DWARF: DW_IDX_die_offset: [[NAMESPACE]]
+DWARF-NEXT: DW_IDX_parent: 0x{{.*}}
 DWARF-NEXT: }
 DWARF-NEXT: Entry {{.*}} {
 DWARF-NEXT: Abbrev: {{.*}}
 DWARF: Tag: DW_TAG_imported_declaration
 DWARF: DW_IDX_die_offset: 0x0000005c
+DWARF-NEXT: DW_IDX_parent: 0x{{.*}}
 DWARF-NEXT: }
 DWARF-NEXT: }

diff --git a/llvm/test/tools/dsymutil/ARM/dwarf5-dwarf4-combination-macho.test b/llvm/test/tools/dsymutil/ARM/dwarf5-dwarf4-combination-macho.test
index 3704ec9802d70..5a37b4247b5bf 100644
--- a/llvm/test/tools/dsymutil/ARM/dwarf5-dwarf4-combination-macho.test
+++ b/llvm/test/tools/dsymutil/ARM/dwarf5-dwarf4-combination-macho.test
@@ -31,14 +31,14 @@
 RUN: rm -rf %t.dir && mkdir -p %t.dir
 RUN: dsymutil -y %p/dummy-debug-map-amr64.map -oso-prepend-path=%p/../Inputs/DWARF5-DWARF4-combination -o %t.dir/dwarf5-dwarf4-combination-macho.dSYM
-RUN: llvm-dwarfdump %t.dir/dwarf5-dwarf4-combination-macho.dSYM -a --verbose | FileCheck %s
+RUN: llvm-dwarfdump %t.dir/dwarf5-dwarf4-combination-macho.dSYM -a --verbose | FileCheck %s --check-prefixes=CHECK,WITH-PARENTS

 RUN: rm -rf %t.dir && mkdir -p %t.dir
 RUN: dsymutil --no-odr --linker parallel -y %p/dummy-debug-map-amr64.map \
 RUN: -oso-prepend-path=%p/../Inputs/DWARF5-DWARF4-combination \
 RUN: -o %t.dir/dwarf5-dwarf4-combination-macho.dSYM
 RUN: llvm-dwarfdump %t.dir/dwarf5-dwarf4-combination-macho.dSYM \
-RUN: -a --verbose | FileCheck %s
+RUN: -a --verbose | FileCheck %s --check-prefixes=CHECK,NO-PARENTS

 ### Uncomment following when llvm-dwarfdump will dump address ranges
 ### correctly for severall compile units case.
@@ -219,7 +219,10 @@ CHECK-NEXT: 0x00000028: 000000dc "int"
 CHECK: .debug_names contents:
 CHECK-NEXT: Name Index @ 0x0 {
 CHECK-NEXT: Header {
-CHECK-NEXT: Length: 0xC4
+; FIXME: when the parallel dwarf linker is able to generate DW_IDX_parent,
+; these headers should be the same.
+WITH-PARENTS-NEXT: Length: 0xC8
+NO-PARENTS-NEXT: Length: 0xC4
 CHECK-NEXT: Format: DWARF32
 CHECK-NEXT: Version: 5
 CHECK-NEXT: CU count: 2
@@ -227,6 +230,7 @@ CHECK-NEXT: Local TU count: 0
 CHECK-NEXT: Foreign TU count: 0
 CHECK-NEXT: Bucket count: 5
 CHECK-NEXT: Name count: 5
-CHECK-NEXT: Abbreviations table size: 0x13
+WITH-PARENTS-NEXT: Abbreviations table size: 0x17
+NO-PARENTS-NEXT: Abbreviations table size: 0x13
 CHECK-NEXT: Augmentation: 'LLVM0700'
 CHECK-NEXT: }

From 5b0e45c8cec8d6a95c22fcd20e7161d57ccafca1 Mon Sep 17 00:00:00 2001
From: Sam Clegg
Date: Fri, 19 Jan 2024 09:32:22 -0800
Subject: [PATCH 158/843] [lld][WebAssembly] Fix use of undefined funcs under
 --warn-unresolved-symbols (#78643)

When undefined functions exist in the final link, we need to create stub
functions (otherwise direct calls to those functions could not be
generated). We were creating those stubs when
`--unresolved-symbols=ignore-all` was passed but overlooked the fact that
`--warn-unresolved-symbols` essentially has the same effect (i.e.
undefined functions can exist in the final link).
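As a concrete illustration of the equivalence described above, here is a
minimal, self-contained C++ sketch. The enum values mirror lld/wasm's
UnresolvedPolicy, but the helper is a hypothetical name used only for this
example; the actual logic lives in lld/wasm/Relocations.cpp below.

    #include <cassert>

    enum class UnresolvedPolicy { ReportError, Warn, Ignore, ImportDynamic };

    // Both Warn and Ignore let undefined functions survive into the final
    // link, so both must synthesize stub bodies for direct calls.
    bool allowsUndefinedFunctions(UnresolvedPolicy p) {
      return p == UnresolvedPolicy::Warn || p == UnresolvedPolicy::Ignore;
    }

    int main() {
      assert(allowsUndefinedFunctions(UnresolvedPolicy::Warn));
      assert(allowsUndefinedFunctions(UnresolvedPolicy::Ignore));
      assert(!allowsUndefinedFunctions(UnresolvedPolicy::ReportError));
      return 0;
    }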
Fixes: #53987
---
 lld/test/wasm/unresolved-symbols.s | 17 +++++++++++------
 lld/wasm/Relocations.cpp | 23 ++++++++++++-----------
 2 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/lld/test/wasm/unresolved-symbols.s b/lld/test/wasm/unresolved-symbols.s
index 5de54d76c6de8..6e183e1878b3a 100644
--- a/lld/test/wasm/unresolved-symbols.s
+++ b/lld/test/wasm/unresolved-symbols.s
@@ -18,10 +18,15 @@
 # RUN: FileCheck -check-prefix=ERR1 %s

 ## Ignore all should not produce error and should not produce
-# any imports. It should create a stub function in the place of the missing
-# function symbol.
+## any imports. It should create a stub function in the place of the missing
+## function symbol.
 # RUN: wasm-ld %t1.o -o %t2.wasm --unresolved-symbols=ignore-all
 # RUN: obj2yaml %t2.wasm | FileCheck -check-prefix=IGNORE %s
+
+## --warn-unresolved-symbols should behave the same
+# RUN: wasm-ld %t1.o -o %t2.wasm --warn-unresolved-symbols
+# RUN: obj2yaml %t2.wasm | FileCheck -check-prefix=IGNORE %s
+
 # IGNORE-NOT: - Type: IMPORT
 # IGNORE-NOT: - Type: ELEM
 #
@@ -52,10 +57,10 @@
 # IGNORE-NEXT: - Index: 3
 # IGNORE-NEXT: Name: get_func_addr

-## --import-undefined should handle unresolved funtions symbols
-# by importing them but still report errors/warning for missing data symbols.
-# `--allow-undefined` should behave like `--import-undefined` +
-# `--unresolve-symbols=ignore`
+## --import-undefined should handle unresolved functions symbols
+## by importing them but still report errors/warning for missing data symbols.
+## `--allow-undefined` should behave like `--import-undefined` +
+## `--unresolve-symbols=ignore`
 # RUN: wasm-ld %t1.o -o %t3.wasm --import-undefined --unresolved-symbols=ignore-all
 # RUN: obj2yaml %t3.wasm | FileCheck -check-prefix=IMPORT %s
 # IMPORT: - Type: IMPORT
diff --git a/lld/wasm/Relocations.cpp b/lld/wasm/Relocations.cpp
index fcffacbc77af0..09b0a24ff011a 100644
--- a/lld/wasm/Relocations.cpp
+++ b/lld/wasm/Relocations.cpp
@@ -54,21 +54,22 @@ static void reportUndefined(Symbol *sym) {
   case UnresolvedPolicy::Ignore:
     LLVM_DEBUG(dbgs() << "ignoring undefined symbol: " + toString(*sym) +
                       "\n");
-    if (!config->importUndefined) {
-      if (auto *f = dyn_cast<UndefinedFunction>(sym)) {
-        if (!f->stubFunction) {
-          f->stubFunction = symtab->createUndefinedStub(*f->getSignature());
-          f->stubFunction->markLive();
-          // Mark the function itself as a stub which prevents it from being
-          // assigned a table entry.
-          f->isStub = true;
-        }
-      }
-    }
     break;
   case UnresolvedPolicy::ImportDynamic:
     break;
   }
+
+  if (auto *f = dyn_cast<UndefinedFunction>(sym)) {
+    if (!f->stubFunction &&
+        config->unresolvedSymbols != UnresolvedPolicy::ImportDynamic &&
+        !config->importUndefined) {
+      f->stubFunction = symtab->createUndefinedStub(*f->getSignature());
+      f->stubFunction->markLive();
+      // Mark the function itself as a stub which prevents it from being
+      // assigned a table entry.
+      f->isStub = true;
+    }
+  }
 }
 }

From 0784b1eefa36d4acbb0dacd2d18796e26313b6c5 Mon Sep 17 00:00:00 2001
From: Thurston Dang
Date: Fri, 19 Jan 2024 17:33:54 +0000
Subject: [PATCH 159/843] Re-exec TSan with no ASLR if memory layout is
 incompatible on Linux (#78351)

TSan's shadow mappings only support 30 bits of ASLR entropy on x86
Linux, and it is not practical to support the maximum of 32 bits (due to
pointer compression and the overhead of shadow mappings). Instead, this
patch changes TSan to re-exec without ASLR if it encounters an
incompatible memory layout, as suggested by Dmitry in
https://github.com/google/sanitizers/issues/1716.
If ASLR is already disabled but the memory layout is still incompatible, it will abort. This patch involves a bit of refactoring, because the old code is: 1. InitializePlatformEarly() 2. InitializeAllocator() 3. InitializePlatform(): CheckAndProtect() but it may already segfault during InitializeAllocator() if the memory layout is incompatible, before we get a chance to check in CheckAndProtect(). This patch adds CheckAndProtect() during InitializePlatformEarly(), before the allocator is initialized. Naturally, it is necessary to ensure that CheckAndProtect() does *not* allow the heap regions to be occupied here, hence we generalize CheckAndProtect() to optionally check the heap regions. We keep the original behavior of CheckAndProtect() in InitializePlatform() as a last line of defense. We need to be careful not to prematurely abort if ASLR is disabled but TSan was going to re-exec for other reasons (e.g., unlimited stack size); we implement this by moving all the re-exec logic into ReExecIfNeeded(). --- compiler-rt/lib/tsan/rtl/tsan_platform.h | 2 +- .../lib/tsan/rtl/tsan_platform_linux.cpp | 138 ++++++++++++------ .../lib/tsan/rtl/tsan_platform_mac.cpp | 5 +- .../lib/tsan/rtl/tsan_platform_posix.cpp | 43 +++++- 4 files changed, 138 insertions(+), 50 deletions(-) diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform.h b/compiler-rt/lib/tsan/rtl/tsan_platform.h index 84ff4bfade09a..377f8aeb8d66e 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_platform.h +++ b/compiler-rt/lib/tsan/rtl/tsan_platform.h @@ -1024,7 +1024,7 @@ inline uptr RestoreAddr(uptr addr) { void InitializePlatform(); void InitializePlatformEarly(); -void CheckAndProtect(); +bool CheckAndProtect(bool protect, bool ignore_heap, bool print_warnings); void InitializeShadowMemoryPlatform(); void WriteMemoryProfile(char *buf, uptr buf_size, u64 uptime_ns); int ExtractResolvFDs(void *state, int *fds, int nfd); diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp index b45adea45b27a..9a66a7feb5b39 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp @@ -214,6 +214,86 @@ void InitializeShadowMemoryPlatform() { #endif // #if !SANITIZER_GO +# if !SANITIZER_GO +static void ReExecIfNeeded() { + // Go maps shadow memory lazily and works fine with limited address space. + // Unlimited stack is not a problem as well, because the executable + // is not compiled with -pie. + bool reexec = false; + // TSan doesn't play well with unlimited stack size (as stack + // overlaps with shadow memory). If we detect unlimited stack size, + // we re-exec the program with limited stack size as a best effort. + if (StackSizeIsUnlimited()) { + const uptr kMaxStackSize = 32 * 1024 * 1024; + VReport(1, + "Program is run with unlimited stack size, which wouldn't " + "work with ThreadSanitizer.\n" + "Re-execing with stack size limited to %zd bytes.\n", + kMaxStackSize); + SetStackSizeLimitInBytes(kMaxStackSize); + reexec = true; + } + + if (!AddressSpaceIsUnlimited()) { + Report( + "WARNING: Program is run with limited virtual address space," + " which wouldn't work with ThreadSanitizer.\n"); + Report("Re-execing with unlimited virtual address space.\n"); + SetAddressSpaceUnlimited(); + reexec = true; + } + + // ASLR personality check. 
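+  // (personality(0xffffffff) only queries the current persona without
+  // modifying it; ADDR_NO_RANDOMIZE being set means ASLR is already
+  // disabled for this process.)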
+ int old_personality = personality(0xffffffff); + bool aslr_on = + (old_personality != -1) && ((old_personality & ADDR_NO_RANDOMIZE) == 0); + +# if SANITIZER_ANDROID && (defined(__aarch64__) || defined(__x86_64__)) + // After patch "arm64: mm: support ARCH_MMAP_RND_BITS." is introduced in + // linux kernel, the random gap between stack and mapped area is increased + // from 128M to 36G on 39-bit aarch64. As it is almost impossible to cover + // this big range, we should disable randomized virtual space on aarch64. + if (aslr_on) { + VReport(1, + "WARNING: Program is run with randomized virtual address " + "space, which wouldn't work with ThreadSanitizer on Android.\n" + "Re-execing with fixed virtual address space.\n"); + CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1); + reexec = true; + } +# endif + + if (reexec) { + // Don't check the address space since we're going to re-exec anyway. + } else if (!CheckAndProtect(false, false, false)) { + if (aslr_on) { + // Disable ASLR if the memory layout was incompatible. + // Alternatively, we could just keep re-execing until we get lucky + // with a compatible randomized layout, but the risk is that if it's + // not an ASLR-related issue, we will be stuck in an infinite loop of + // re-execing (unless we change ReExec to pass a parameter of the + // number of retries allowed.) + VReport(1, + "WARNING: ThreadSanitizer: memory layout is incompatible, " + "possibly due to high-entropy ASLR.\n" + "Re-execing with fixed virtual address space.\n" + "N.B. reducing ASLR entropy is preferable.\n"); + CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1); + reexec = true; + } else { + VReport(1, + "FATAL: ThreadSanitizer: memory layout is incompatible, " + "even though ASLR is disabled.\n" + "Please file a bug.\n"); + Die(); + } + } + + if (reexec) + ReExec(); +} +# endif + void InitializePlatformEarly() { vmaSize = (MostSignificantSetBitIndex(GET_CURRENT_FRAME()) + 1); @@ -284,6 +364,10 @@ void InitializePlatformEarly() { } # endif # endif + +# if !SANITIZER_GO + ReExecIfNeeded(); +# endif } void InitializePlatform() { @@ -294,52 +378,22 @@ void InitializePlatform() { // is not compiled with -pie. #if !SANITIZER_GO { - bool reexec = false; - // TSan doesn't play well with unlimited stack size (as stack - // overlaps with shadow memory). If we detect unlimited stack size, - // we re-exec the program with limited stack size as a best effort. - if (StackSizeIsUnlimited()) { - const uptr kMaxStackSize = 32 * 1024 * 1024; - VReport(1, "Program is run with unlimited stack size, which wouldn't " - "work with ThreadSanitizer.\n" - "Re-execing with stack size limited to %zd bytes.\n", - kMaxStackSize); - SetStackSizeLimitInBytes(kMaxStackSize); - reexec = true; - } - - if (!AddressSpaceIsUnlimited()) { - Report("WARNING: Program is run with limited virtual address space," - " which wouldn't work with ThreadSanitizer.\n"); - Report("Re-execing with unlimited virtual address space.\n"); - SetAddressSpaceUnlimited(); - reexec = true; - } -#if SANITIZER_ANDROID && (defined(__aarch64__) || defined(__x86_64__)) - // After patch "arm64: mm: support ARCH_MMAP_RND_BITS." is introduced in - // linux kernel, the random gap between stack and mapped area is increased - // from 128M to 36G on 39-bit aarch64. As it is almost impossible to cover - // this big range, we should disable randomized virtual space on aarch64. - // ASLR personality check. 
- int old_personality = personality(0xffffffff); - if (old_personality != -1 && (old_personality & ADDR_NO_RANDOMIZE) == 0) { - VReport(1, "WARNING: Program is run with randomized virtual address " - "space, which wouldn't work with ThreadSanitizer.\n" - "Re-execing with fixed virtual address space.\n"); - CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1); - reexec = true; - } - -#endif -#if SANITIZER_LINUX && (defined(__aarch64__) || defined(__loongarch_lp64)) +# if SANITIZER_LINUX && (defined(__aarch64__) || defined(__loongarch_lp64)) // Initialize the xor key used in {sig}{set,long}jump. InitializeLongjmpXorKey(); -#endif - if (reexec) - ReExec(); +# endif + } + + // Earlier initialization steps already re-exec'ed until we got a compatible + // memory layout, so we don't expect any more issues here. + if (!CheckAndProtect(true, true, true)) { + Printf( + "FATAL: ThreadSanitizer: unexpectedly found incompatible memory " + "layout.\n"); + Printf("FATAL: Please file a bug.\n"); + Die(); } - CheckAndProtect(); InitTlsSize(); #endif // !SANITIZER_GO } diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp index 1aac0fb27520c..07d83e1a9a9ff 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_platform_mac.cpp @@ -239,7 +239,10 @@ static uptr longjmp_xor_key = 0; void InitializePlatform() { DisableCoreDumperIfNecessary(); #if !SANITIZER_GO - CheckAndProtect(); + if (!CheckAndProtect(true, true, true)) { + Printf("FATAL: ThreadSanitizer: found incompatible memory layout.\n"); + Die(); + } InitializeThreadStateStorage(); diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_posix.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_posix.cpp index e7dcd664dc0d2..7d5a992dc665e 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_platform_posix.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_platform_posix.cpp @@ -94,22 +94,51 @@ static void ProtectRange(uptr beg, uptr end) { } } -void CheckAndProtect() { +// CheckAndProtect will check if the memory layout is compatible with TSan. +// Optionally (if 'protect' is true), it will set the memory regions between +// app memory to be inaccessible. +// 'ignore_heap' means it will not consider heap memory allocations to be a +// conflict. Set this based on whether we are calling CheckAndProtect before +// or after the allocator has initialized the heap. +bool CheckAndProtect(bool protect, bool ignore_heap, bool print_warnings) { // Ensure that the binary is indeed compiled with -pie. MemoryMappingLayout proc_maps(true); MemoryMappedSegment segment; while (proc_maps.Next(&segment)) { - if (IsAppMem(segment.start)) continue; + if (segment.start >= HeapMemBeg() && segment.end <= HeapEnd()) { + if (ignore_heap) { + continue; + } else { + return false; + } + } + + // Note: IsAppMem includes if it is heap memory, hence we must + // put this check after the heap bounds check. + if (IsAppMem(segment.start) && IsAppMem(segment.end - 1)) + continue; + + // Guard page after the heap end if (segment.start >= HeapMemEnd() && segment.start < HeapEnd()) continue; + if (segment.protection == 0) // Zero page or mprotected. continue; + if (segment.start >= VdsoBeg()) // vdso break; - Printf("FATAL: ThreadSanitizer: unexpected memory mapping 0x%zx-0x%zx\n", - segment.start, segment.end); - Die(); + + // Debug output can break tests. Suppress this message in most cases. 
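+    // (The speculative early check from ReExecIfNeeded() passes
+    // print_warnings=false; the final checks in InitializePlatform()
+    // pass true, so a genuine conflict is still reported.)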
+ if (print_warnings) + Printf( + "WARNING: ThreadSanitizer: unexpected memory mapping 0x%zx-0x%zx\n", + segment.start, segment.end); + + return false; } + if (!protect) + return true; + # if SANITIZER_IOS && !SANITIZER_IOSSIM ProtectRange(HeapMemEnd(), ShadowBeg()); ProtectRange(ShadowEnd(), MetaShadowBeg()); @@ -135,8 +164,10 @@ void CheckAndProtect() { // Older s390x kernels may not support 5-level page tables. TryProtectRange(user_addr_max_l4, user_addr_max_l5); #endif + + return true; } -#endif +# endif } // namespace __tsan From 42b160356fe5d3b41bf07c428d0142d3721b1d44 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Fri, 19 Jan 2024 10:21:52 -0800 Subject: [PATCH 160/843] [mlir][transform] Add an op for replacing values with function calls (#78398) Adds `transform.func.cast_and_call` that takes a set of inputs and outputs and replaces the uses of those outputs with a call to a function at a specified insertion point. The idea with this operation is to allow users to author independent IR outside of a to-be-compiled module, and then match and replace a slice of the program with a call to the external function. Additionally adds a mechanism for populating a type converter with a set of conversion materialization functions that allow insertion of casts on the inputs/outputs to and from the types of the function signature. --- .../Func/TransformOps/FuncTransformOps.td | 72 +++++++ .../MemRef/TransformOps/MemRefTransformOps.td | 3 +- .../Tensor/TransformOps/TensorTransformOps.td | 18 ++ .../Transform/IR/TransformInterfaces.td | 27 ++- .../Func/TransformOps/FuncTransformOps.cpp | 191 ++++++++++++++++++ .../TransformOps/TensorTransformOps.cpp | 40 ++++ .../lib/Dialect/Transform/IR/TransformOps.cpp | 4 + mlir/test/Dialect/Func/func-transform.mlir | 120 +++++++++++ .../Dialect/Tensor/transform-op-casting.mlir | 65 ++++++ .../TestTransformDialectExtension.td | 3 +- 10 files changed, 538 insertions(+), 5 deletions(-) create mode 100644 mlir/test/Dialect/Func/func-transform.mlir create mode 100644 mlir/test/Dialect/Tensor/transform-op-casting.mlir diff --git a/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td b/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td index 7a7e991c78618..c36fdd1505562 100644 --- a/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td +++ b/mlir/include/mlir/Dialect/Func/TransformOps/FuncTransformOps.td @@ -12,6 +12,8 @@ include "mlir/Dialect/Transform/IR/TransformDialect.td" include "mlir/Dialect/Transform/IR/TransformInterfaces.td" include "mlir/Dialect/Transform/IR/TransformTypes.td" +include "mlir/Interfaces/SideEffectInterfaces.td" +include "mlir/IR/RegionKindInterface.td" include "mlir/IR/OpBase.td" def ApplyFuncToLLVMConversionPatternsOp : Op, + DeclareOpInterfaceMethods, + AttrSizedOperandSegments, + ReportTrackingListenerFailuresOpTrait] + # GraphRegionNoTerminator.traits> { + let summary = "Casts values to the signature of a function and replaces them " + "with a call"; + let description = [{ + This transform takes value handles to a set of `inputs` and `outputs` and + attempts to cast them to the function signature of the attached function + op, then builds a call to the function and replaces the users of the + outputs. It is the responsibility of the user to ensure that the slice of + the program replaced by this operation makes sense, i.e. there is no + verification that the inputs to this operation have any relation to the + outputs outside of basic dominance requirements needed for the call. 
+ + The casting materialization functions are specified in the graph region of + this op. They must implement the `TypeConverterBuilderOpInterface`. The + order of ops within the region is irrelevant. + + The target function can be specified by a symbol name or by a handle to the + operation. + + This transform only reads the operand handles and only replaces the users of + the outputs with the results of the call. No handles are consumed and no + operations are removed. Users are expected to run cleanup separately if + desired. + + Warning: The replacement of the uses of the outputs could invalidate certain + restricted value handle types (e.g. `transform.block_arg` if it existed, by + replacing the use with something not coming from a block argument). The + value will still exist in such cases but wouldn't verify against the type. + See the discussion here for more information: + https://github.com/llvm/llvm-project/pull/78398#discussion_r1455070087 + + This transform will emit a silenceable failure if: + - The set of outputs isn't unique + - The handle for the insertion point does not include exactly one operation + - The insertion point op does not dominate any of the output users + - The insertion point op is not dominated by any of the inputs + - The function signature does not match the number of inputs/outputs + + This transform will emit a definite failure if it fails to resolve the + target function, or if it fails to materialize the conversion casts of + either the inputs to the function argument types, or the call results to + the output types. + }]; + + let arguments = (ins + TransformHandleTypeInterface:$insertion_point, + UnitAttr:$insert_after, + Optional:$inputs, + Optional:$outputs, + OptionalAttr:$function_name, + Optional:$function); + let results = (outs TransformHandleTypeInterface:$result); + let regions = (region MaxSizedRegion<1>:$conversions); + + let assemblyFormat = [{ + ($function_name^)? ($function^)? + ( `(` $inputs^ `)` )? + ( `->` $outputs^ )? + (`after` $insert_after^):(`before`)? $insertion_point + ($conversions^)? attr-dict `:` functional-type(operands, results) + }]; + let hasVerifier = 1; +} + #endif // FUNC_TRANSFORM_OPS diff --git a/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td b/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td index 76309b9b8a964..29383a3825be8 100644 --- a/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td +++ b/mlir/include/mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.td @@ -18,7 +18,8 @@ include "mlir/IR/OpBase.td" def MemrefToLLVMTypeConverterOp : Op]> { + ["getTypeConverter", + "getTypeConverterType"]>]> { let description = [{ This operation provides an "LLVMTypeConverter" that lowers memref types to LLVM types. diff --git a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td index 8556d9570fd12..39e1d7fa3494a 100644 --- a/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td +++ b/mlir/include/mlir/Dialect/Tensor/TransformOps/TensorTransformOps.td @@ -169,4 +169,22 @@ def MakeLoopIndependentOp }]; } +def TypeConversionCastShapeDynamicDimsOp : Op]> { + let description = [{ + Populates a type converter with conversion materialization functions that + cast a tensor value between two cast-compatible tensors. See `tensor.cast` + for more information on cast compatibility between tensors. 
+ + If `ignore_dynamic_info` is not set, this will set an additional constraint + that source materializations do not cast dynamic dimensions to static ones. + }]; + let arguments = (ins UnitAttr:$ignore_dynamic_info); + + let assemblyFormat = + "(`ignore_dynamic_info` $ignore_dynamic_info^)? attr-dict"; +} + #endif // TENSOR_TRANSFORM_OPS diff --git a/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.td b/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.td index f29efaee620d8..8f7b8f1999e0c 100644 --- a/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.td +++ b/mlir/include/mlir/Dialect/Transform/IR/TransformInterfaces.td @@ -284,8 +284,14 @@ def TypeConverterBuilderOpInterface : OpInterface<"TypeConverterBuilderOpInterface"> { let description = [{ This interface should be implemented by ops that specify a type converter - for a dialect conversion. Such ops can be used with - "apply_conversion_patterns". + for a dialect conversion, or to populate a type converter with + conversions. + + When such ops are intended to be used with "apply_conversion_patterns" or + other operations that expect a type converter, a non-default implementation + of `getTypeConverter` should be implemented. For use with "cast_and_call" + like ops that construct a type converter iteratively, non-default + `populateTypeMaterializations` should be implemented. }]; let cppNamespace = "::mlir::transform"; @@ -297,7 +303,11 @@ def TypeConverterBuilderOpInterface }], /*returnType=*/"std::unique_ptr<::mlir::TypeConverter>", /*name=*/"getTypeConverter", - /*arguments=*/(ins) + /*arguments=*/(ins), + /*methodBody=*/"", + /*defaultImplementation=*/[{ + return std::make_unique<::mlir::TypeConverter>(); + }] >, StaticInterfaceMethod< /*desc=*/[{ @@ -310,6 +320,17 @@ def TypeConverterBuilderOpInterface /*methodBody=*/"", /*defaultImplementation=*/[{ return "TypeConverter"; }] >, + InterfaceMethod< + /*desc=*/[{ + Populate the given type converter with source/target materialization + functions. + }], + /*returnType=*/"void", + /*name=*/"populateTypeMaterializations", + /*arguments=*/(ins "::mlir::TypeConverter &":$converter), + /*methodBody=*/"", + /*defaultImplementation=*/[{ return; }] + >, ]; } diff --git a/mlir/lib/Dialect/Func/TransformOps/FuncTransformOps.cpp b/mlir/lib/Dialect/Func/TransformOps/FuncTransformOps.cpp index 9e9b6bcea790d..9e79b086c0be8 100644 --- a/mlir/lib/Dialect/Func/TransformOps/FuncTransformOps.cpp +++ b/mlir/lib/Dialect/Func/TransformOps/FuncTransformOps.cpp @@ -15,6 +15,7 @@ #include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Dialect/Transform/IR/TransformInterfaces.h" #include "mlir/Dialect/Transform/IR/TransformOps.h" +#include "mlir/Transforms/DialectConversion.h" using namespace mlir; @@ -36,6 +37,196 @@ transform::ApplyFuncToLLVMConversionPatternsOp::verifyTypeConverter( return success(); } +//===----------------------------------------------------------------------===// +// CastAndCallOp +//===----------------------------------------------------------------------===// + +DiagnosedSilenceableFailure +transform::CastAndCallOp::apply(transform::TransformRewriter &rewriter, + transform::TransformResults &results, + transform::TransformState &state) { + SmallVector inputs; + if (getInputs()) + llvm::append_range(inputs, state.getPayloadValues(getInputs())); + + SetVector outputs; + if (getOutputs()) { + for (auto output : state.getPayloadValues(getOutputs())) + outputs.insert(output); + + // Verify that the set of output values to be replaced is unique. 
+ if (outputs.size() != + llvm::range_size(state.getPayloadValues(getOutputs()))) { + return emitSilenceableFailure(getLoc()) + << "cast and call output values must be unique"; + } + } + + // Get the insertion point for the call. + auto insertionOps = state.getPayloadOps(getInsertionPoint()); + if (!llvm::hasSingleElement(insertionOps)) { + return emitSilenceableFailure(getLoc()) + << "Only one op can be specified as an insertion point"; + } + bool insertAfter = getInsertAfter(); + Operation *insertionPoint = *insertionOps.begin(); + + // Check that all inputs dominate the insertion point, and the insertion + // point dominates all users of the outputs. + DominanceInfo dom(insertionPoint); + for (Value output : outputs) { + for (Operation *user : output.getUsers()) { + // If we are inserting after the insertion point operation, the + // insertion point operation must properly dominate the user. Otherwise + // basic dominance is enough. + bool doesDominate = insertAfter + ? dom.properlyDominates(insertionPoint, user) + : dom.dominates(insertionPoint, user); + if (!doesDominate) { + return emitDefiniteFailure() + << "User " << user << " is not dominated by insertion point " + << insertionPoint; + } + } + } + + for (Value input : inputs) { + // If we are inserting before the insertion point operation, the + // input must properly dominate the insertion point operation. Otherwise + // basic dominance is enough. + bool doesDominate = insertAfter + ? dom.dominates(input, insertionPoint) + : dom.properlyDominates(input, insertionPoint); + if (!doesDominate) { + return emitDefiniteFailure() + << "input " << input << " does not dominate insertion point " + << insertionPoint; + } + } + + // Get the function to call. This can either be specified by symbol or as a + // transform handle. + func::FuncOp targetFunction = nullptr; + if (getFunctionName()) { + targetFunction = SymbolTable::lookupNearestSymbolFrom( + insertionPoint, *getFunctionName()); + if (!targetFunction) { + return emitDefiniteFailure() + << "unresolved symbol " << *getFunctionName(); + } + } else if (getFunction()) { + auto payloadOps = state.getPayloadOps(getFunction()); + if (!llvm::hasSingleElement(payloadOps)) { + return emitDefiniteFailure() << "requires a single function to call"; + } + targetFunction = dyn_cast(*payloadOps.begin()); + if (!targetFunction) { + return emitDefiniteFailure() << "invalid non-function callee"; + } + } else { + llvm_unreachable("Invalid CastAndCall op without a function to call"); + return emitDefiniteFailure(); + } + + // Verify that the function argument and result lengths match the inputs and + // outputs given to this op. + if (targetFunction.getNumArguments() != inputs.size()) { + return emitSilenceableFailure(targetFunction.getLoc()) + << "mismatch between number of function arguments " + << targetFunction.getNumArguments() << " and number of inputs " + << inputs.size(); + } + if (targetFunction.getNumResults() != outputs.size()) { + return emitSilenceableFailure(targetFunction.getLoc()) + << "mismatch between number of function results " + << targetFunction->getNumResults() << " and number of outputs " + << outputs.size(); + } + + // Gather all specified converters. 
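+  // (Each op in the optional region contributes source/target
+  // materialization callbacks; these drive the cast insertions below.)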
+ mlir::TypeConverter converter; + if (!getRegion().empty()) { + for (Operation &op : getRegion().front()) { + cast(&op) + .populateTypeMaterializations(converter); + } + } + + if (insertAfter) + rewriter.setInsertionPointAfter(insertionPoint); + else + rewriter.setInsertionPoint(insertionPoint); + + for (auto [input, type] : + llvm::zip_equal(inputs, targetFunction.getArgumentTypes())) { + if (input.getType() != type) { + Value newInput = converter.materializeSourceConversion( + rewriter, input.getLoc(), type, input); + if (!newInput) { + return emitDefiniteFailure() << "Failed to materialize conversion of " + << input << " to type " << type; + } + input = newInput; + } + } + + auto callOp = rewriter.create(insertionPoint->getLoc(), + targetFunction, inputs); + + // Cast the call results back to the expected types. If any conversions fail + // this is a definite failure as the call has been constructed at this point. + for (auto [output, newOutput] : + llvm::zip_equal(outputs, callOp.getResults())) { + Value convertedOutput = newOutput; + if (output.getType() != newOutput.getType()) { + convertedOutput = converter.materializeTargetConversion( + rewriter, output.getLoc(), output.getType(), newOutput); + if (!convertedOutput) { + return emitDefiniteFailure() + << "Failed to materialize conversion of " << newOutput + << " to type " << output.getType(); + } + } + rewriter.replaceAllUsesExcept(output, convertedOutput, callOp); + } + results.set(cast(getResult()), {callOp}); + return DiagnosedSilenceableFailure::success(); +} + +LogicalResult transform::CastAndCallOp::verify() { + if (!getRegion().empty()) { + for (Operation &op : getRegion().front()) { + if (!isa(&op)) { + InFlightDiagnostic diag = emitOpError() + << "expected children ops to implement " + "TypeConverterBuilderOpInterface"; + diag.attachNote(op.getLoc()) << "op without interface"; + return diag; + } + } + } + if (!getFunction() && !getFunctionName()) { + return emitOpError() << "expected a function handle or name to call"; + } + if (getFunction() && getFunctionName()) { + return emitOpError() << "function handle and name are mutually exclusive"; + } + return success(); +} + +void transform::CastAndCallOp::getEffects( + SmallVectorImpl &effects) { + transform::onlyReadsHandle(getInsertionPoint(), effects); + if (getInputs()) + transform::onlyReadsHandle(getInputs(), effects); + if (getOutputs()) + transform::onlyReadsHandle(getOutputs(), effects); + if (getFunction()) + transform::onlyReadsHandle(getFunction(), effects); + transform::producesHandle(getResult(), effects); + transform::modifiesPayload(effects); +} + //===----------------------------------------------------------------------===// // Transform op registration //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp b/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp index ed27423870471..38f1824a3634a 100644 --- a/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp +++ b/mlir/lib/Dialect/Tensor/TransformOps/TensorTransformOps.cpp @@ -15,6 +15,8 @@ #include "mlir/Dialect/Tensor/Utils/Utils.h" #include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Dialect/Transform/IR/TransformInterfaces.h" +#include "mlir/IR/Builders.h" +#include "mlir/Transforms/DialectConversion.h" using namespace mlir; using namespace tensor; @@ -128,6 +130,44 @@ void transform::ApplyRewriteTensorOpsAsConstantPatternsOp::populatePatterns( 
tensor::populateRewriteAsConstantPatterns(patterns); } +//===----------------------------------------------------------------------===// +// TypeConversionCastTensorShapeOp +//===----------------------------------------------------------------------===// + +void transform::TypeConversionCastShapeDynamicDimsOp:: + populateTypeMaterializations(TypeConverter &converter) { + bool ignoreDynamicInfo = getIgnoreDynamicInfo(); + converter.addSourceMaterialization([ignoreDynamicInfo]( + OpBuilder &builder, Type resultType, + ValueRange inputs, + Location loc) -> std::optional { + if (inputs.size() != 1) { + return std::nullopt; + } + Value input = inputs[0]; + if (!ignoreDynamicInfo && + !tensor::preservesStaticInformation(resultType, input.getType())) { + return std::nullopt; + } + if (!tensor::CastOp::areCastCompatible(input.getType(), resultType)) { + return std::nullopt; + } + return builder.create(loc, resultType, input).getResult(); + }); + converter.addTargetMaterialization([](OpBuilder &builder, Type resultType, + ValueRange inputs, + Location loc) -> std::optional { + if (inputs.size() != 1) { + return std::nullopt; + } + Value input = inputs[0]; + if (!tensor::CastOp::areCastCompatible(input.getType(), resultType)) { + return std::nullopt; + } + return builder.create(loc, resultType, input).getResult(); + }); +} + //===----------------------------------------------------------------------===// // MakeLoopIndependentOp //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp index 485d4448e7c36..f2a57383cc5bf 100644 --- a/mlir/lib/Dialect/Transform/IR/TransformOps.cpp +++ b/mlir/lib/Dialect/Transform/IR/TransformOps.cpp @@ -16,10 +16,12 @@ #include "mlir/Dialect/Transform/IR/TransformDialect.h" #include "mlir/Dialect/Transform/IR/TransformInterfaces.h" #include "mlir/Dialect/Transform/IR/TransformTypes.h" +#include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/Diagnostics.h" #include "mlir/IR/Dominance.h" #include "mlir/IR/PatternMatch.h" #include "mlir/IR/Verifier.h" +#include "mlir/Interfaces/CallInterfaces.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" #include "mlir/Interfaces/FunctionImplementation.h" #include "mlir/Interfaces/FunctionInterfaces.h" @@ -30,11 +32,13 @@ #include "mlir/Transforms/DialectConversion.h" #include "mlir/Transforms/GreedyPatternRewriteDriver.h" #include "mlir/Transforms/LoopInvariantCodeMotionUtils.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/TypeSwitch.h" #include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" #include #define DEBUG_TYPE "transform-dialect" diff --git a/mlir/test/Dialect/Func/func-transform.mlir b/mlir/test/Dialect/Func/func-transform.mlir new file mode 100644 index 0000000000000..6aab07b0cb38a --- /dev/null +++ b/mlir/test/Dialect/Func/func-transform.mlir @@ -0,0 +1,120 @@ +// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file | FileCheck %s + +// CHECK-LABEL: func.func @basic_cast_and_call +func.func @basic_cast_and_call() { + // CHECK-NEXT: call @second() + "test.foo"() : () -> () + // CHECK-NEXT: test.foo + // CHECK-NEXT: call @third() + func.return +} + +func.func @second() { + "test.bar"() : () -> () + func.return +} + +func.func private @third() + +module attributes {transform.with_named_sequence} { + transform.named_sequence 
@__transform_main(%arg0: !transform.any_op) { + %funcs = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %f:3 = transform.split_handle %funcs : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + %foo = transform.structured.match ops{["test.foo"]} in %f#0 : (!transform.any_op) -> !transform.any_op + transform.func.cast_and_call @second before %foo : (!transform.any_op) -> !transform.any_op + transform.func.cast_and_call %f#2 after %foo : (!transform.any_op, !transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func.func @non_empty_arg_and_out +func.func @non_empty_arg_and_out(%arg0 : index) -> i32 { + // CHECK-NEXT: %[[FOO:.+]] = "test.foo" + %0 = "test.foo"(%arg0) : (index) -> (index) + // CHECK-NEXT: %[[CALL:.+]] = call @second(%[[FOO]]) : (index) -> i32 + %1 = "test.bar"(%0) : (index) -> (i32) + // CHECK: return %[[CALL]] : i32 + func.return %1 : i32 +} + +func.func private @second(%arg1 : index) -> i32 + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %funcs = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %f:2 = transform.split_handle %funcs : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %foo = transform.structured.match ops{["test.foo"]} in %f#0 : (!transform.any_op) -> !transform.any_op + %bar = transform.structured.match ops{["test.bar"]} in %f#0 : (!transform.any_op) -> !transform.any_op + %in = transform.get_result %foo[0] : (!transform.any_op) -> !transform.any_value + %out = transform.get_result %bar[0] : (!transform.any_op) -> !transform.any_value + transform.func.cast_and_call %f#1(%in) -> %out before %bar + : (!transform.any_op, !transform.any_value, + !transform.any_value, !transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func.func @multi_arg_and_result +func.func @multi_arg_and_result(%arg0 : index) -> (index, index) { + // CHECK-NEXT: %[[FOO:.+]] = "test.foo" + %0 = "test.foo"(%arg0) : (index) -> (index) + %1 = "test.bar"(%0) : (index) -> (index) + %2 = "test.bar"(%0) : (index) -> (index) + // CHECK: %[[CALL:.+]]:2 = call @second(%[[FOO]], %[[FOO]]) : (index, index) -> (index, index) + // CHECK: return %[[CALL]]#0, %[[CALL]]#1 : index, index + func.return %1, %2 : index, index +} + +func.func private @second(%arg1: index, %arg2: index) -> (index, index) + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %funcs = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %f:2 = transform.split_handle %funcs : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %foo = transform.structured.match ops{["test.foo"]} in %f#0 : (!transform.any_op) -> !transform.any_op + %bars = transform.structured.match ops{["test.bar"]} in %f#0 : (!transform.any_op) -> !transform.any_op + %in0 = transform.get_result %foo[0] : (!transform.any_op) -> !transform.any_value + %in1 = transform.get_result %foo[0] : (!transform.any_op) -> !transform.any_value + %ins = transform.merge_handles %in0, %in1 : !transform.any_value + + %outs = transform.get_result %bars[0] : (!transform.any_op) -> !transform.any_value + + transform.func.cast_and_call %f#1(%ins) -> %outs after %foo + : (!transform.any_op, !transform.any_value, + !transform.any_value, 
!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// ----- + +// CHECK-LABEL: func.func @nested_call +func.func @nested_call() { + // CHECK-NEXT: %[[ARG:.+]] = "test.arg" + // CHECK-NEXT: test.foo + %0 = "test.arg"() : () -> (index) + "test.foo"() ({ + // CHECK-NEXT: call @second(%[[ARG]]) : (index) -> () + "test.bar"(%0) : (index) -> () + }) : () -> () +} + +func.func private @second(%arg1: index) -> () + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %funcs = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %f:2 = transform.split_handle %funcs : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %arg = transform.structured.match ops{["test.arg"]} in %f#0 : (!transform.any_op) -> !transform.any_op + %bar = transform.structured.match ops{["test.bar"]} in %f#0 : (!transform.any_op) -> !transform.any_op + %in = transform.get_result %arg[0] : (!transform.any_op) -> !transform.any_value + + transform.func.cast_and_call %f#1(%in) before %bar + : (!transform.any_op, !transform.any_value, !transform.any_op) -> !transform.any_op + transform.yield + } +} diff --git a/mlir/test/Dialect/Tensor/transform-op-casting.mlir b/mlir/test/Dialect/Tensor/transform-op-casting.mlir new file mode 100644 index 0000000000000..16a1fa2b0ba9c --- /dev/null +++ b/mlir/test/Dialect/Tensor/transform-op-casting.mlir @@ -0,0 +1,65 @@ +// RUN: mlir-opt %s --transform-interpreter -allow-unregistered-dialect --split-input-file | FileCheck %s + +func.func @cast_to_dynamic(%arg0: tensor<10x13xf32>, %arg1: tensor<3x13xf32>) -> tensor<13x13xf32> { + %0 = tensor.concat dim(0) %arg0, %arg1 : (tensor<10x13xf32>, tensor<3x13xf32>) -> tensor<13x13xf32> + func.return %0 : tensor<13x13xf32> +} + +func.func private @concat_replacement(%arg0: tensor, %arg1: tensor) -> tensor + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %funcs = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %f:2 = transform.split_handle %funcs : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %concat = transform.structured.match ops{["tensor.concat"]} in %f#0 : (!transform.any_op) -> !transform.any_op + %ins = transform.get_operand %concat[all] : (!transform.any_op) -> !transform.any_value + %out = transform.get_result %concat[all] : (!transform.any_op) -> !transform.any_value + transform.func.cast_and_call %f#1(%ins) -> %out before %concat { + transform.type_conversion.tensor.cast_shape_dynamic_dims + } : (!transform.any_op, !transform.any_value, + !transform.any_value, !transform.any_op) -> !transform.any_op + transform.apply_dce to %f#0 : !transform.any_op + transform.yield + } +} + +// CHECK-LABEL: func.func @cast_to_dynamic +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor<10x13xf32> +// CHECK-SAME: %[[ARG1:[a-zA-Z0-9]+]]: tensor<3x13xf32> +// CHECK-DAG: %[[CAST0:.+]] = tensor.cast %[[ARG0]] : tensor<10x13xf32> to tensor +// CHECK-DAG: %[[CAST1:.+]] = tensor.cast %[[ARG1]] : tensor<3x13xf32> to tensor +// CHECK: %[[CALL:.+]] = call @concat_replacement(%[[CAST0]], %[[CAST1]]) +// CHECK: %[[CAST_RES:.+]] = tensor.cast %[[CALL]] : tensor to tensor<13x13xf32> +// CHECK: return %[[CAST_RES]] : tensor<13x13xf32> + +// ----- + +func.func @cast_to_static(%arg0: tensor) -> tensor { + %0 = tensor.collapse_shape %arg0 [[0, 1]] : tensor into tensor + func.return %0 : 
tensor +} + +func.func private @collapse_replacement(%arg0: tensor<4x5xf32>) -> tensor<20xf32> + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg0: !transform.any_op) { + %funcs = transform.structured.match ops{["func.func"]} in %arg0 : (!transform.any_op) -> !transform.any_op + %f:2 = transform.split_handle %funcs : (!transform.any_op) -> (!transform.any_op, !transform.any_op) + %concat = transform.structured.match ops{["tensor.collapse_shape"]} in %f#0 : (!transform.any_op) -> !transform.any_op + %ins = transform.get_operand %concat[all] : (!transform.any_op) -> !transform.any_value + %out = transform.get_result %concat[all] : (!transform.any_op) -> !transform.any_value + transform.func.cast_and_call %f#1(%ins) -> %out before %concat { + transform.type_conversion.tensor.cast_shape_dynamic_dims ignore_dynamic_info + } : (!transform.any_op, !transform.any_value, + !transform.any_value, !transform.any_op) -> !transform.any_op + transform.apply_dce to %f#0 : !transform.any_op + transform.yield + } +} + +// CHECK-LABEL: func.func @cast_to_static +// CHECK-SAME: %[[ARG0:[a-zA-Z0-9]+]]: tensor +// CHECK-DAG: %[[CAST_IN:.+]] = tensor.cast %[[ARG0]] : tensor to tensor<4x5xf32> +// CHECK: %[[CALL:.+]] = call @collapse_replacement(%[[CAST_IN]]) +// CHECK: %[[CAST_RES:.+]] = tensor.cast %[[CALL]] : tensor<20xf32> to tensor +// CHECK: return %[[CAST_RES]] : tensor diff --git a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td index 54036f7929d1b..c00cc560e83e9 100644 --- a/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td +++ b/mlir/test/lib/Dialect/Transform/TestTransformDialectExtension.td @@ -502,7 +502,8 @@ def ApplyTestConversionPatternsOp def TestTypeConverterOp : Op]> { + [DeclareOpInterfaceMethods]> { let arguments = (ins); let results = (outs); let assemblyFormat = "attr-dict"; From 43531e719636e5960d8592a184e10af885be6869 Mon Sep 17 00:00:00 2001 From: Durgadoss R Date: Sat, 20 Jan 2024 00:12:33 +0530 Subject: [PATCH 161/843] [LLVM][NVPTX] Add cp.async.bulk.commit/wait intrinsics (#78698) This patch adds NVVM intrinsics and NVPTX codegen for the bulk variants of the async-copy commit/wait instructions. lit tests are added to verify the generated PTX. 
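As a rough illustration of how the commit/wait pair is used in practice,
here is a hedged CUDA C++ sketch using inline PTX. The wrapper function is
illustrative only, assumes an sm_90 target, and is not part of this patch.

    // Commit all prior bulk async copies into one group, then block until
    // the reads of every committed group have completed.
    __device__ void commit_and_drain_bulk_copies() {
      asm volatile("cp.async.bulk.commit_group;");
      asm volatile("cp.async.bulk.wait_group.read 0;");
    }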
PTX Doc link: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group

Signed-off-by: Durgadoss R
---
 llvm/include/llvm/IR/IntrinsicsNVVM.td | 10 ++++++++
 llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 16 +++++++++++++
 llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll | 28 ++++++++++++++++++++++
 3 files changed, 54 insertions(+)

diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td
index bcc3bf2695f8a..5a5ba2592e146 100644
--- a/llvm/include/llvm/IR/IntrinsicsNVVM.td
+++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td
@@ -1454,6 +1454,16 @@ def int_nvvm_cp_async_wait_all :
   ClangBuiltin<"__nvvm_cp_async_wait_all">,
   Intrinsic<[],[],[]>;

+// cp.async.bulk variants of the commit/wait group
+def int_nvvm_cp_async_bulk_commit_group :
+  Intrinsic<[],[],[]>;
+
+def int_nvvm_cp_async_bulk_wait_group :
+  Intrinsic<[],[llvm_i32_ty],[ImmArg<ArgIndex<0>>]>;
+
+def int_nvvm_cp_async_bulk_wait_group_read :
+  Intrinsic<[],[llvm_i32_ty],[ImmArg<ArgIndex<0>>]>;
+
 // mbarrier
 def int_nvvm_mbarrier_init : ClangBuiltin<"__nvvm_mbarrier_init">,
   Intrinsic<[],[llvm_ptr_ty, llvm_i32_ty],[IntrConvergent, IntrNoCallback]>;
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index 15a136faf4574..33f1e4a43e072 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -411,6 +411,22 @@ def CP_ASYNC_WAIT_ALL :
   [(int_nvvm_cp_async_wait_all)]>,
   Requires<[hasPTX<70>, hasSM<80>]>;

+// cp.async.bulk variants of the commit/wait group
+def CP_ASYNC_BULK_COMMIT_GROUP :
+  NVPTXInst<(outs), (ins), "cp.async.bulk.commit_group;",
+            [(int_nvvm_cp_async_bulk_commit_group)]>,
+  Requires<[hasPTX<80>, hasSM<90>]>;
+
+def CP_ASYNC_BULK_WAIT_GROUP :
+  NVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group $n;",
+            [(int_nvvm_cp_async_bulk_wait_group (i32 timm:$n))]>,
+  Requires<[hasPTX<80>, hasSM<90>]>;
+
+def CP_ASYNC_BULK_WAIT_GROUP_READ :
+  NVPTXInst<(outs), (ins i32imm:$n), "cp.async.bulk.wait_group.read $n;",
+            [(int_nvvm_cp_async_bulk_wait_group_read (i32 timm:$n))]>,
+  Requires<[hasPTX<80>, hasSM<90>]>;
+
 //-----------------------------------
 // MBarrier Functions
 //-----------------------------------
diff --git a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
index 181fbf2112910..c405cc97674d6 100644
--- a/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
+++ b/llvm/test/CodeGen/NVPTX/intrinsics-sm90.ll
@@ -144,6 +144,31 @@ define void @test_barrier_cluster_aligned() {
   ret void
 }

+; CHECK-LABEL: test_cp_async_bulk_commit_group(
+define void @test_cp_async_bulk_commit_group() {
+; CHECK: cp.async.bulk.commit_group;
+  call void @llvm.nvvm.cp.async.bulk.commit.group()
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_wait_group(
+define void @test_cp_async_bulk_wait_group() {
+; CHECK: cp.async.bulk.wait_group 8;
+  call void @llvm.nvvm.cp.async.bulk.wait.group(i32 8)
+; CHECK: cp.async.bulk.wait_group 0;
+  call void @llvm.nvvm.cp.async.bulk.wait.group(i32 0)
+  ret void
+}
+
+; CHECK-LABEL: test_cp_async_bulk_wait_group_read(
+define void @test_cp_async_bulk_wait_group_read() {
+; CHECK: cp.async.bulk.wait_group.read 8;
+  call void @llvm.nvvm.cp.async.bulk.wait.group.read(i32 8)
+; CHECK: cp.async.bulk.wait_group.read 0;
+  call void @llvm.nvvm.cp.async.bulk.wait.group.read(i32 0)
+  ret void
+}
+
 declare i1 @llvm.nvvm.isspacep.shared.cluster(ptr %p);
 declare ptr @llvm.nvvm.mapa(ptr %p, i32 %r);
 declare ptr addrspace(3)
@llvm.nvvm.mapa.shared.cluster(ptr addrspace(3) %p, i32 %r); @@ -167,3 +192,6 @@ declare void @llvm.nvvm.barrier.cluster.arrive.aligned() declare void @llvm.nvvm.barrier.cluster.arrive.relaxed.aligned() declare void @llvm.nvvm.barrier.cluster.wait.aligned() declare void @llvm.nvvm.fence.sc.cluster() +declare void @llvm.nvvm.cp.async.bulk.commit.group() +declare void @llvm.nvvm.cp.async.bulk.wait.group(i32) +declare void @llvm.nvvm.cp.async.bulk.wait.group.read(i32) From 2521e9785dd640920d97b110a8e5b6886e09b851 Mon Sep 17 00:00:00 2001 From: Jeremy Kun <2467754+j2kun@users.noreply.github.com> Date: Fri, 19 Jan 2024 10:55:43 -0800 Subject: [PATCH 162/843] [mlir][transform]: fix broken bazel build (#78757) Broken by https://github.com/llvm/llvm-project/commit/42b160356fe5d3b41bf07c428d0142d3721b1d44 --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 3facb043c02c1..25e47a2c9c8a1 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -4568,6 +4568,7 @@ cc_library( ":LLVMCommonConversion", ":LLVMDialect", ":TransformDialect", + ":TransformUtils", ], ) From 5330daad41d761aaa1aea554dbeca7c050da7fb6 Mon Sep 17 00:00:00 2001 From: Min-Yih Hsu Date: Fri, 19 Jan 2024 11:09:35 -0800 Subject: [PATCH 163/843] [RISCV] Add support for Smepmp 1.0 (#78489) Smepmp is a supervisor extension that prevents privileged processes from accessing unprivileged program and data. Spec: https://github.com/riscv/riscv-tee/blob/main/Smepmp/Smepmp.pdf --- clang/test/Preprocessor/riscv-target-features.c | 9 +++++++++ llvm/docs/RISCVUsage.rst | 1 + llvm/docs/ReleaseNotes.rst | 1 + llvm/lib/Support/RISCVISAInfo.cpp | 1 + llvm/lib/Target/RISCV/RISCVFeatures.td | 4 ++++ llvm/test/CodeGen/RISCV/attributes.ll | 4 ++++ llvm/test/MC/RISCV/attribute-arch.s | 3 +++ llvm/unittests/Support/RISCVISAInfoTest.cpp | 1 + 8 files changed, 24 insertions(+) diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index d5ec93e292bf2..4fb3f0bab36b0 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -21,6 +21,7 @@ // CHECK-NOT: __riscv_muldiv {{.*$}} // CHECK-NOT: __riscv_smaia {{.*$}} // CHECK-NOT: __riscv_ssaia {{.*$}} +// CHECK-NOT: __riscv_smepmp {{.*$}} // CHECK-NOT: __riscv_svinval {{.*$}} // CHECK-NOT: __riscv_svnapot {{.*$}} // CHECK-NOT: __riscv_svpbmt {{.*$}} @@ -1108,6 +1109,14 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-SSAIA-EXT %s // CHECK-SSAIA-EXT: __riscv_ssaia 1000000{{$}} +// RUN: %clang --target=riscv32 \ +// RUN: -march=rv32ismepmp1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SMEPMP-EXT %s +// RUN: %clang --target=riscv64 \ +// RUN: -march=rv64ismepmp1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-SMEPMP-EXT %s +// CHECK-SMEPMP-EXT: __riscv_smepmp 1000000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32izfa -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZFA-EXT %s diff --git a/llvm/docs/RISCVUsage.rst b/llvm/docs/RISCVUsage.rst index fc4d97b134371..6fdc945ad2707 100644 --- a/llvm/docs/RISCVUsage.rst +++ b/llvm/docs/RISCVUsage.rst @@ -92,6 +92,7 @@ on support follow. 
``H`` Assembly Support ``M`` Supported ``Smaia`` Supported + ``Smepmp`` Supported ``Ssaia`` Supported ``Svinval`` Assembly Support ``Svnapot`` Assembly Support diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 4345d01021f17..82cf130ffd186 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -160,6 +160,7 @@ Changes to the RISC-V Backend * Support was added for the Ziccif, Ziccrse, Ziccamoa, Zicclsm, Za64rs, Za128rs and Zic64b extensions which were introduced as a part of the RISC-V Profiles specification. +* The Smepmp 1.0 extension is now supported. Changes to the WebAssembly Backend ---------------------------------- diff --git a/llvm/lib/Support/RISCVISAInfo.cpp b/llvm/lib/Support/RISCVISAInfo.cpp index 8c9eb1bddb3cb..3c02492e99f1d 100644 --- a/llvm/lib/Support/RISCVISAInfo.cpp +++ b/llvm/lib/Support/RISCVISAInfo.cpp @@ -55,6 +55,7 @@ static const RISCVSupportedExtension SupportedExtensions[] = { {"m", {2, 0}}, {"smaia", {1, 0}}, + {"smepmp", {1, 0}}, {"ssaia", {1, 0}}, {"svinval", {1, 0}}, {"svnapot", {1, 0}}, diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 72780937dd887..115da8c4a1a93 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -748,6 +748,10 @@ def FeatureStdExtSsaia "'Ssaia' (Advanced Interrupt Architecture Supervisor " "Level)", []>; +def FeatureStdExtSmepmp + : SubtargetFeature<"smepmp", "HasStdExtSmepmp", "true", + "'Smepmp' (Enhanced Physical Memory Protection)", []>; + def HasHalfFPLoadStoreMove : Predicate<"Subtarget->hasHalfFPLoadStoreMove()">, AssemblerPredicate<(any_of FeatureStdExtZfh, FeatureStdExtZfhmin, diff --git a/llvm/test/CodeGen/RISCV/attributes.ll b/llvm/test/CodeGen/RISCV/attributes.ll index 3e55e0fb4e686..b90bef7525379 100644 --- a/llvm/test/CodeGen/RISCV/attributes.ll +++ b/llvm/test/CodeGen/RISCV/attributes.ll @@ -89,6 +89,7 @@ ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zcmop %s -o - | FileCheck --check-prefix=RV32ZCMOP %s ; RUN: llc -mtriple=riscv32 -mattr=+smaia %s -o - | FileCheck --check-prefixes=CHECK,RV32SMAIA %s ; RUN: llc -mtriple=riscv32 -mattr=+ssaia %s -o - | FileCheck --check-prefixes=CHECK,RV32SSAIA %s +; RUN: llc -mtriple=riscv32 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV32SMEPMP %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZFBFMIN %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFMIN %s ; RUN: llc -mtriple=riscv32 -mattr=+experimental-zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV32ZVFBFWMA %s @@ -190,6 +191,7 @@ ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zcmop %s -o - | FileCheck --check-prefix=RV64ZCMOP %s ; RUN: llc -mtriple=riscv64 -mattr=+smaia %s -o - | FileCheck --check-prefixes=CHECK,RV64SMAIA %s ; RUN: llc -mtriple=riscv64 -mattr=+ssaia %s -o - | FileCheck --check-prefixes=CHECK,RV64SSAIA %s +; RUN: llc -mtriple=riscv64 -mattr=+smepmp %s -o - | FileCheck --check-prefixes=CHECK,RV64SMEPMP %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZFBFMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfmin %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFMIN %s ; RUN: llc -mtriple=riscv64 -mattr=+experimental-zvfbfwma %s -o - | FileCheck --check-prefixes=CHECK,RV64ZVFBFWMA %s @@ -286,6 +288,7 @@ ; RV32ZCMOP: .attribute 5, "rv32i2p1_zca1p0_zcmop0p2" ; RV32SMAIA: 
.attribute 5, "rv32i2p1_smaia1p0" ; RV32SSAIA: .attribute 5, "rv32i2p1_ssaia1p0" +; RV32SMEPMP: .attribute 5, "rv32i2p1_smepmp1p0" ; RV32ZFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0" ; RV32ZVFBFMIN: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl32b1p0" ; RV32ZVFBFWMA: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvfbfwma1p0_zvl32b1p0" @@ -386,6 +389,7 @@ ; RV64ZCMOP: .attribute 5, "rv64i2p1_zca1p0_zcmop0p2" ; RV64SMAIA: .attribute 5, "rv64i2p1_smaia1p0" ; RV64SSAIA: .attribute 5, "rv64i2p1_ssaia1p0" +; RV64SMEPMP: .attribute 5, "rv64i2p1_smepmp1p0" ; RV64ZFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0" ; RV64ZVFBFMIN: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvl32b1p0" ; RV64ZVFBFWMA: .attribute 5, "rv64i2p1_f2p2_zicsr2p0_zfbfmin1p0_zve32f1p0_zve32x1p0_zvfbfmin1p0_zvfbfwma1p0_zvl32b1p0" diff --git a/llvm/test/MC/RISCV/attribute-arch.s b/llvm/test/MC/RISCV/attribute-arch.s index b1a03bbfd74da..5f9a7cabcc768 100644 --- a/llvm/test/MC/RISCV/attribute-arch.s +++ b/llvm/test/MC/RISCV/attribute-arch.s @@ -294,6 +294,9 @@ .attribute arch, "rv32i_ssaia1p0" # CHECK: attribute 5, "rv32i2p1_ssaia1p0" +.attribute arch, "rv32i_smepmp1p0" +# CHECK: attribute 5, "rv32i2p1_smepmp1p0" + .attribute arch, "rv32i_zfbfmin1p0" # CHECK: .attribute 5, "rv32i2p1_f2p2_zicsr2p0_zfbfmin1p0" diff --git a/llvm/unittests/Support/RISCVISAInfoTest.cpp b/llvm/unittests/Support/RISCVISAInfoTest.cpp index 0b8bbc7c9027e..9b7112fa2bfeb 100644 --- a/llvm/unittests/Support/RISCVISAInfoTest.cpp +++ b/llvm/unittests/Support/RISCVISAInfoTest.cpp @@ -760,6 +760,7 @@ R"(All available -march extensions for RISC-V zhinx 1.0 zhinxmin 1.0 smaia 1.0 + smepmp 1.0 ssaia 1.0 svinval 1.0 svnapot 1.0 From 76ffa8f63a817758d49c45297591e05760155044 Mon Sep 17 00:00:00 2001 From: Jeremy Kun <2467754+j2kun@users.noreply.github.com> Date: Fri, 19 Jan 2024 11:33:36 -0800 Subject: [PATCH 164/843] [mlir][transform]: fix broken bazel build for TensorTransformOps (#78766) --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 25e47a2c9c8a1..94093570cfeaf 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -7262,8 +7262,7 @@ cc_library( ":TensorTransforms", ":TensorUtils", ":TransformDialect", - ":ValueBoundsOpInterface", - "//llvm:Support", + ":TransformUtils", ], ) From c067524852816c9f6a99ddbb888cd33b9c5a2042 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 19 Jan 2024 11:34:00 -0800 Subject: [PATCH 165/843] [SHT_LLVM_BB_ADDR_MAP] Add assertion and clarify docstring (#77374) This patch adds an assertion to readBBAddrMapImpl to confirm that PGOAnalyses and BBAddrMaps are of the same size when PGO information is requested (part of the API contract). This patch also updates the docstring for readBBAddrMap to better clarify what is guaranteed. 
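As a hedged illustration of that contract (not part of the patch -- the caller below is hypothetical, while `readBBAddrMap`, `BBAddrMap`, and `PGOAnalysisMap` are the names the patch touches), a consumer may only zip the two vectors together after a successful call:

```cpp
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Support/Error.h"
#include <cassert>
#include <optional>
#include <vector>

using namespace llvm;
using namespace llvm::object;

// Sketch of a caller relying on the documented guarantee.
void inspectBBAddrMaps(const ELFObjectFileBase &Obj) {
  std::vector<PGOAnalysisMap> PGOAnalyses;
  Expected<std::vector<BBAddrMap>> MapsOrErr =
      Obj.readBBAddrMap(/*TextSectionIndex=*/std::nullopt, &PGOAnalyses);
  if (!MapsOrErr) {
    consumeError(MapsOrErr.takeError());
    // Upon failure, the out-parameter is emptied.
    assert(PGOAnalyses.empty());
    return;
  }
  // PGO information was requested, so the two vectors have the same
  // length and can be iterated in lockstep.
  assert(MapsOrErr->size() == PGOAnalyses.size());
}
```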
--- llvm/include/llvm/Object/ELFObjectFile.h | 11 ++++++----- llvm/lib/Object/ELFObjectFile.cpp | 4 ++++ 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/Object/ELFObjectFile.h b/llvm/include/llvm/Object/ELFObjectFile.h index 86f42654f8af8..7124df50b561d 100644 --- a/llvm/include/llvm/Object/ELFObjectFile.h +++ b/llvm/include/llvm/Object/ELFObjectFile.h @@ -110,11 +110,12 @@ class ELFObjectFileBase : public ObjectFile { Expected> readDynsymVersions() const; /// Returns a vector of all BB address maps in the object file. When - // `TextSectionIndex` is specified, only returns the BB address maps - // corresponding to the section with that index. When `PGOAnalyses`is - // specified, the vector is cleared then filled with extra PGO data. - // `PGOAnalyses` will always be the same length as the return value on - // success, otherwise it is empty. + /// `TextSectionIndex` is specified, only returns the BB address maps + /// corresponding to the section with that index. When `PGOAnalyses`is + /// specified (PGOAnalyses is not nullptr), the vector is cleared then filled + /// with extra PGO data. `PGOAnalyses` will always be the same length as the + /// return value when it is requested assuming no error occurs. Upon failure, + /// `PGOAnalyses` will be emptied. Expected> readBBAddrMap(std::optional TextSectionIndex = std::nullopt, std::vector *PGOAnalyses = nullptr) const; diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp index ae21b81c10c82..28b96c341e3ff 100644 --- a/llvm/lib/Object/ELFObjectFile.cpp +++ b/llvm/lib/Object/ELFObjectFile.cpp @@ -830,6 +830,10 @@ Expected> static readBBAddrMapImpl( std::move(BBAddrMapOrErr->begin(), BBAddrMapOrErr->end(), std::back_inserter(BBAddrMaps)); } + if (PGOAnalyses) + assert(PGOAnalyses->size() == BBAddrMaps.size() && + "The same number of BBAddrMaps and PGOAnalysisMaps should be " + "returned when PGO information is requested"); return BBAddrMaps; } From 2b31a673de51061d0407b79127054a5083659efc Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 19 Jan 2024 11:34:16 -0800 Subject: [PATCH 166/843] [llvm-exegesis] Make duplicate snippet repetitor produce whole snippets (#77224) Currently, the duplicate snippet repetitor will truncate snippets that do not exactly divide the minimum number of instructions. This patch corrects that behavior by making the duplicate snippet repetitor duplicate the snippet in its entirety until the minimum number of instructions has been reached. This makes the behavior consistent with the loop snippet repetitor, which will execute at least `--num-repetitions` (soon to be renamed `--min-instructions`) instructions. --- .../llvm-exegesis/lib/SnippetRepetitor.cpp | 8 ++++---- .../llvm-exegesis/X86/SnippetRepetitorTest.cpp | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp index 1872f550c3f31..c1660c2d4a1b8 100644 --- a/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp +++ b/llvm/tools/llvm-exegesis/lib/SnippetRepetitor.cpp @@ -30,10 +30,10 @@ class DuplicateSnippetRepetitor : public SnippetRepetitor { CleanupMemory](FunctionFiller &Filler) { auto Entry = Filler.getEntry(); if (!Instructions.empty()) { - // Add the whole snippet at least once. 
- Entry.addInstructions(Instructions); - for (unsigned I = Instructions.size(); I < MinInstructions; ++I) { - Entry.addInstruction(Instructions[I % Instructions.size()]); + const unsigned NumRepetitions = + divideCeil(MinInstructions, Instructions.size()); + for (unsigned I = 0; I < NumRepetitions; ++I) { + Entry.addInstructions(Instructions); } } Entry.addReturn(State.getExegesisTarget(), CleanupMemory); diff --git a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp index d2382ec0cddc4..25e8836087c15 100644 --- a/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp +++ b/llvm/unittests/tools/llvm-exegesis/X86/SnippetRepetitorTest.cpp @@ -38,9 +38,11 @@ class X86SnippetRepetitorTest : public X86TestBase { MF = &createVoidVoidPtrMachineFunction("TestFn", Mod.get(), MMI.get()); } - void TestCommon(Benchmark::RepetitionModeE RepetitionMode) { + void TestCommon(Benchmark::RepetitionModeE RepetitionMode, + unsigned SnippetInstructions = 1) { const auto Repetitor = SnippetRepetitor::Create(RepetitionMode, State); - const std::vector Instructions = {MCInstBuilder(X86::NOOP)}; + const std::vector Instructions(SnippetInstructions, + MCInstBuilder(X86::NOOP)); FunctionFiller Sink(*MF, {X86::EAX}); const auto Fill = Repetitor->Repeat(Instructions, kMinInstructions, kLoopBodySize, false); @@ -74,6 +76,18 @@ TEST_F(X86SnippetRepetitorTest, Duplicate) { HasOpcode(X86::NOOP), HasOpcode(X86::RET64))); } +TEST_F(X86SnippetRepetitorTest, DuplicateSnippetInstructionCount) { + TestCommon(Benchmark::Duplicate, 2); + // Duplicating a snippet of two instructions with the minimum number of + // instructions set to three duplicates the snippet twice for a total of + // four instructions. + ASSERT_EQ(MF->getNumBlockIDs(), 1u); + EXPECT_THAT(MF->getBlockNumbered(0)->instrs(), + ElementsAre(HasOpcode(X86::NOOP), HasOpcode(X86::NOOP), + HasOpcode(X86::NOOP), HasOpcode(X86::NOOP), + HasOpcode(X86::RET64))); +} + TEST_F(X86SnippetRepetitorTest, Loop) { TestCommon(Benchmark::Loop); // Duplicating creates an entry block, a loop body and a ret block. From 924701311aa79180e86ad8ce43d253f27d25ec7d Mon Sep 17 00:00:00 2001 From: Mital Ashok Date: Fri, 19 Jan 2024 20:10:51 +0000 Subject: [PATCH 167/843] [SemaCXX] Implement CWG2137 (list-initialization from objects of the same type) (#77768) Closes #77638, #24186 Rebased from , see there for more information. Implements wording change in [CWG2137](https://wg21.link/CWG2137) in the first commit. This also implements an approach to [CWG2311](https://wg21.link/CWG2311) in the second commit, because too much code that relies on `T{ T_prvalue}` being an elision would break. Because that issue is still open and the CWG issue doesn't provide wording to fix the issue, there may be different behaviours on other compilers. 
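To make the change concrete, here is a minimal example distilled from the tests this patch adds (the type names are illustrative; it assumes a C++17-or-later compiler with this change applied):

```cpp
#include <initializer_list>

struct Q {
  Q();
  Q(Q &&);
  Q(std::initializer_list<Q>) = delete;
};

struct NonMovable {
  NonMovable(int);
  NonMovable(NonMovable &&) = delete;
};

void demo() {
  // CWG2137: list-initialization from an object of the same (non-aggregate)
  // type now considers initializer-list constructors first, so this selects
  // the deleted constructor and is ill-formed:
  //   Q x = Q{Q()}; // error: call to deleted constructor of 'Q'

  // CWG2311 approach: when no initializer-list constructor is viable, the
  // copy is still elided as if written T(e), so no move is required.
  NonMovable a = NonMovable{NonMovable(0)}; // OK: no move constructor call
  (void)a;
}
```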
--- clang/docs/ReleaseNotes.rst | 5 ++ clang/lib/Sema/SemaInit.cpp | 40 ++++++--- clang/lib/Sema/SemaOverload.cpp | 38 ++++++--- clang/test/CXX/drs/dr14xx.cpp | 10 --- clang/test/CXX/drs/dr21xx.cpp | 45 ++++++++++ clang/test/CXX/drs/dr23xx.cpp | 85 +++++++++++++++++++ clang/www/cxx_dr_status.html | 4 +- .../pairs.pair/ctor.pair_U_V_move.pass.cpp | 21 ++++- 8 files changed, 215 insertions(+), 33 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 392f694065a24..585e1535b1585 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -229,6 +229,11 @@ C++2c Feature Support Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +- Implemented `CWG2137 `_ which allows + list-initialization from objects of the same type. +- Implemented `CWG2311 `_: given a prvalue ``e`` of object type + ``T``, ``T{e}`` will try to resolve an initializer list constructor and will use it if successful (CWG2137). + Otherwise, if there is no initializer list constructor, the copy will be elided as if it was ``T(e)``. - Implemented `CWG2598 `_ and `CWG2096 `_, making unions (that have either no members or at least one literal member) literal types. diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 18440a69e3a3d..3170f41e8033f 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -4200,7 +4200,7 @@ static OverloadingResult ResolveConstructorOverload( /// \param IsListInit Is this list-initialization? /// \param IsInitListCopy Is this non-list-initialization resulting from a /// list-initialization from {x} where x is the same -/// type as the entity? +/// aggregate type as the entity? static void TryConstructorInitialization(Sema &S, const InitializedEntity &Entity, const InitializationKind &Kind, @@ -4230,6 +4230,14 @@ static void TryConstructorInitialization(Sema &S, Entity.getKind() != InitializedEntity::EK_LambdaToBlockConversionBlockElement); + bool CopyElisionPossible = false; + auto ElideConstructor = [&] { + // Convert qualifications if necessary. + Sequence.AddQualificationConversionStep(DestType, VK_PRValue); + if (ILE) + Sequence.RewrapReferenceInitList(DestType, ILE); + }; + // C++17 [dcl.init]p17: // - If the initializer expression is a prvalue and the cv-unqualified // version of the source type is the same class as the class of the @@ -4242,11 +4250,17 @@ static void TryConstructorInitialization(Sema &S, if (S.getLangOpts().CPlusPlus17 && !RequireActualConstructor && UnwrappedArgs.size() == 1 && UnwrappedArgs[0]->isPRValue() && S.Context.hasSameUnqualifiedType(UnwrappedArgs[0]->getType(), DestType)) { - // Convert qualifications if necessary. - Sequence.AddQualificationConversionStep(DestType, VK_PRValue); - if (ILE) - Sequence.RewrapReferenceInitList(DestType, ILE); - return; + if (ILE && !DestType->isAggregateType()) { + // CWG2311: T{ prvalue_of_type_T } is not eligible for copy elision + // Make this an elision if this won't call an initializer-list + // constructor. (Always on an aggregate type or check constructors first.) 
+ assert(!IsInitListCopy && + "IsInitListCopy only possible with aggregate types"); + CopyElisionPossible = true; + } else { + ElideConstructor(); + return; + } } const RecordType *DestRecordType = DestType->getAs(); @@ -4291,6 +4305,12 @@ static void TryConstructorInitialization(Sema &S, S, Kind.getLocation(), Args, CandidateSet, DestType, Ctors, Best, CopyInitialization, AllowExplicit, /*OnlyListConstructors=*/true, IsListInit, RequireActualConstructor); + + if (CopyElisionPossible && Result == OR_No_Viable_Function) { + // No initializer list candidate + ElideConstructor(); + return; + } } // C++11 [over.match.list]p1: @@ -4572,9 +4592,9 @@ static void TryListInitialization(Sema &S, return; } - // C++11 [dcl.init.list]p3, per DR1467: - // - If T is a class type and the initializer list has a single element of - // type cv U, where U is T or a class derived from T, the object is + // C++11 [dcl.init.list]p3, per DR1467 and DR2137: + // - If T is an aggregate class and the initializer list has a single element + // of type cv U, where U is T or a class derived from T, the object is // initialized from that element (by copy-initialization for // copy-list-initialization, or by direct-initialization for // direct-list-initialization). @@ -4585,7 +4605,7 @@ static void TryListInitialization(Sema &S, // - Otherwise, if T is an aggregate, [...] (continue below). if (S.getLangOpts().CPlusPlus11 && InitList->getNumInits() == 1 && !IsDesignatedInit) { - if (DestType->isRecordType()) { + if (DestType->isRecordType() && DestType->isAggregateType()) { QualType InitType = InitList->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, DestType) || S.IsDerivedFrom(InitList->getBeginLoc(), InitType, DestType)) { diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index bbbd0abc82d74..6ee5f26d55c3a 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1568,19 +1568,37 @@ TryUserDefinedConversion(Sema &S, Expr *From, QualType ToType, // called for those cases. if (CXXConstructorDecl *Constructor = dyn_cast(ICS.UserDefined.ConversionFunction)) { - QualType FromCanon - = S.Context.getCanonicalType(From->getType().getUnqualifiedType()); + QualType FromType; + SourceLocation FromLoc; + // C++11 [over.ics.list]p6, per DR2137: + // C++17 [over.ics.list]p6: + // If C is not an initializer-list constructor and the initializer list + // has a single element of type cv U, where U is X or a class derived + // from X, the implicit conversion sequence has Exact Match rank if U is + // X, or Conversion rank if U is derived from X. + if (const auto *InitList = dyn_cast(From); + InitList && InitList->getNumInits() == 1 && + !S.isInitListConstructor(Constructor)) { + const Expr *SingleInit = InitList->getInit(0); + FromType = SingleInit->getType(); + FromLoc = SingleInit->getBeginLoc(); + } else { + FromType = From->getType(); + FromLoc = From->getBeginLoc(); + } + QualType FromCanon = + S.Context.getCanonicalType(FromType.getUnqualifiedType()); QualType ToCanon = S.Context.getCanonicalType(ToType).getUnqualifiedType(); if (Constructor->isCopyConstructor() && (FromCanon == ToCanon || - S.IsDerivedFrom(From->getBeginLoc(), FromCanon, ToCanon))) { + S.IsDerivedFrom(FromLoc, FromCanon, ToCanon))) { // Turn this into a "standard" conversion sequence, so that it // gets ranked with standard conversion sequences. 
DeclAccessPair Found = ICS.UserDefined.FoundConversionFunction; ICS.setStandard(); ICS.Standard.setAsIdentityConversion(); - ICS.Standard.setFromType(From->getType()); + ICS.Standard.setFromType(FromType); ICS.Standard.setAllToTypes(ToType); ICS.Standard.CopyConstructor = Constructor; ICS.Standard.FoundCopyConstructor = Found; @@ -5306,18 +5324,18 @@ TryListConversion(Sema &S, InitListExpr *From, QualType ToType, IsDesignatedInit) return Result; - // Per DR1467: - // If the parameter type is a class X and the initializer list has a single - // element of type cv U, where U is X or a class derived from X, the - // implicit conversion sequence is the one required to convert the element - // to the parameter type. + // Per DR1467 and DR2137: + // If the parameter type is an aggregate class X and the initializer list + // has a single element of type cv U, where U is X or a class derived from + // X, the implicit conversion sequence is the one required to convert the + // element to the parameter type. // // Otherwise, if the parameter type is a character array [... ] // and the initializer list has a single element that is an // appropriately-typed string literal (8.5.2 [dcl.init.string]), the // implicit conversion sequence is the identity conversion. if (From->getNumInits() == 1 && !IsDesignatedInit) { - if (ToType->isRecordType()) { + if (ToType->isRecordType() && ToType->isAggregateType()) { QualType InitType = From->getInit(0)->getType(); if (S.Context.hasSameUnqualifiedType(InitType, ToType) || S.IsDerivedFrom(From->getBeginLoc(), InitType, ToType)) diff --git a/clang/test/CXX/drs/dr14xx.cpp b/clang/test/CXX/drs/dr14xx.cpp index d262f6f9dcab7..4c29d03a6e117 100644 --- a/clang/test/CXX/drs/dr14xx.cpp +++ b/clang/test/CXX/drs/dr14xx.cpp @@ -488,16 +488,6 @@ namespace dr1467 { // dr1467: 3.7 c++11 } } // nonaggregate - namespace SelfInitIsNotListInit { - struct S { - S(); - explicit S(S &); - S(const S &); - }; - S s1; - S s2 = {s1}; // ok, not list-initialization so we pick the non-explicit constructor - } - struct NestedInit { int a, b, c; }; NestedInit ni[1] = {{NestedInit{1, 2, 3}}}; diff --git a/clang/test/CXX/drs/dr21xx.cpp b/clang/test/CXX/drs/dr21xx.cpp index a7e50df3f374b..87040246aa5cd 100644 --- a/clang/test/CXX/drs/dr21xx.cpp +++ b/clang/test/CXX/drs/dr21xx.cpp @@ -11,6 +11,16 @@ // cxx98-error@-1 {{variadic macros are a C99 feature}} #endif +namespace std { + __extension__ typedef __SIZE_TYPE__ size_t; + + template struct initializer_list { + const E *p; size_t n; + initializer_list(const E *p, size_t n); + initializer_list(); + }; +} + namespace dr2100 { // dr2100: 12 template struct X {}; template struct A { @@ -132,6 +142,41 @@ namespace dr2126 { // dr2126: 12 #endif } +namespace dr2137 { // dr2137: 18 +#if __cplusplus >= 201103L + struct Q { + Q(); + Q(Q&&); + Q(std::initializer_list) = delete; // #dr2137-Qcons + }; + + Q x = Q { Q() }; + // since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} + // since-cxx11-note@#dr2137-Qcons {{'Q' has been explicitly marked deleted here}} + + int f(Q); // #dr2137-f + int y = f({ Q() }); + // since-cxx11-error@-1 {{call to deleted constructor of 'Q'}} + // since-cxx11-note@#dr2137-Qcons {{'Q' has been explicitly marked deleted here}} + // since-cxx11-note@#dr2137-f {{passing argument to parameter here}} + + struct U { + U(); + U(const U&); + }; + + struct Derived : U { + Derived(); + Derived(const Derived&); + } d; + + int g(Derived); + int g(U(&&)[1]) = delete; + + int z = g({ d }); +#endif +} + namespace dr2140 { // dr2140: 9 
#if __cplusplus >= 201103L union U { int a; decltype(nullptr) b; }; diff --git a/clang/test/CXX/drs/dr23xx.cpp b/clang/test/CXX/drs/dr23xx.cpp index 03077ae9239a4..d8556998315c7 100644 --- a/clang/test/CXX/drs/dr23xx.cpp +++ b/clang/test/CXX/drs/dr23xx.cpp @@ -6,6 +6,16 @@ // RUN: %clang_cc1 -std=c++23 %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s // RUN: %clang_cc1 -std=c++2c %s -verify=expected,since-cxx11,since-cxx14,since-cxx17,since-cxx20 -fexceptions -fcxx-exceptions -pedantic-errors 2>&1 | FileCheck %s +namespace std { + __extension__ typedef __SIZE_TYPE__ size_t; + + template struct initializer_list { + const E *p; size_t n; + initializer_list(const E *p, size_t n); + initializer_list(); + }; +} + #if __cplusplus >= 201103L namespace dr2303 { // dr2303: 12 template @@ -47,6 +57,81 @@ void g() { } //namespace dr2303 #endif +namespace dr2311 { // dr2311: 18 open +#if __cplusplus >= 201707L +template +void test() { + // Ensure none of these try to call a move constructor. + T a = T{T(0)}; + T b{T(0)}; + auto c{T(0)}; + T d = {T(0)}; + auto e = {T(0)}; +#if __cplusplus >= 202302L + auto f = auto{T(0)}; +#endif + void(*fn)(T); + fn({T(0)}); +} + +struct NonMovable { + NonMovable(int); + NonMovable(NonMovable&&) = delete; +}; +struct NonMovableNonApplicableIList { + NonMovableNonApplicableIList(int); + NonMovableNonApplicableIList(NonMovableNonApplicableIList&&) = delete; + NonMovableNonApplicableIList(std::initializer_list); +}; +struct ExplicitMovable { + ExplicitMovable(int); + explicit ExplicitMovable(ExplicitMovable&&); +}; +struct ExplicitNonMovable { + ExplicitNonMovable(int); + explicit ExplicitNonMovable(ExplicitNonMovable&&) = delete; +}; +struct ExplicitNonMovableNonApplicableIList { + ExplicitNonMovableNonApplicableIList(int); + explicit ExplicitNonMovableNonApplicableIList(ExplicitNonMovableNonApplicableIList&&) = delete; + ExplicitNonMovableNonApplicableIList(std::initializer_list); +}; +struct CopyOnly { + CopyOnly(int); + CopyOnly(const CopyOnly&); + CopyOnly(CopyOnly&&) = delete; +}; +struct ExplicitCopyOnly { + ExplicitCopyOnly(int); + explicit ExplicitCopyOnly(const ExplicitCopyOnly&); + explicit ExplicitCopyOnly(ExplicitCopyOnly&&) = delete; +}; + +template void test(); +template void test(); +template void test(); +template void test(); +template void test(); +template void test(); +template void test(); + +struct any { + template + any(T&&); +}; + +template +struct X { + X(); + X(T) = delete; // #dr2311-X +}; + +X> x{ X>() }; +// since-cxx17-error@-1 {{call to deleted constructor of 'X>'}} +// since-cxx17-note@#dr2311-X {{'X' has been explicitly marked deleted here}} +#endif +} + // dr2331: na // dr2335 is in dr2335.cxx diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index 3ee359945582f..b29a746bcf6e8 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -12630,7 +12630,7 @@

C++ defect report implementation status

 2137
 CD4
 List-initialization from object of same type
-Unknown
+Clang 18
 2138
@@ -13674,7 +13674,7 @@

C++ defect report implementation status

2311 open Missed case for guaranteed copy elision - Not resolved + Clang 18 2312 diff --git a/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp b/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp index 3b2d093eb34d4..30fdb19fd3aeb 100644 --- a/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp +++ b/libcxx/test/std/utilities/utility/pairs/pairs.pair/ctor.pair_U_V_move.pass.cpp @@ -121,7 +121,26 @@ int main(int, char**) test_pair_rv(); test_pair_rv(); - test_pair_rv(); + /* For ExplicitTypes::CopyOnly, two of the viable candidates for initializing from a non-const xvalue are: + * pair(const pair&); // (defaulted copy constructor) + * template explicit pair(const pair&&); [U1 = ExplicitTypes::CopyOnly, U2 = int] + * This results in diverging behavior for test_convertible which uses copy-list-initialization + * Prior to CWG2137, this would have selected the first (non-explicit) ctor as explicit ctors would not be considered + * Afterwards, it should select the second since it is a better match, and then failed because it is explicit + * + * This may change with future defect reports, and some compilers only have partial support for CWG2137, + * so use std::is_convertible directly to avoid a copy-list-initialization + */ + { + using P1 = std::pair; + using P2 = std::pair; + using UP1 = std::pair&&; + using UP2 = std::pair&&; + static_assert(std::is_constructible::value, ""); + static_assert(std::is_convertible::value, ""); + static_assert(std::is_constructible::value, ""); + static_assert(std::is_convertible::value, ""); + } test_pair_rv(); test_pair_rv(); From 9ae28fb9d3b8d8f3db7cc55c164aa1ceb4df300c Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 19 Jan 2024 12:15:08 -0800 Subject: [PATCH 168/843] [RISCV] Prevent RISCVMergeBaseOffsetOpt from calling getVRegDef on a physical register. (#78762) Fixes #78679. --- .../lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 16 +++-- .../test/CodeGen/RISCV/fold-addi-loadstore.ll | 66 +++++++++++++++++++ 2 files changed, 75 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index ae46d5554d350..866b2453579cc 100644 --- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -181,7 +181,7 @@ bool RISCVMergeBaseOffsetOpt::foldLargeOffset(MachineInstr &Hi, Register Reg = Rs == GAReg ? Rt : Rs; // Can't fold if the register has more than one use. 
- if (!MRI->hasOneUse(Reg)) + if (!Reg.isVirtual() || !MRI->hasOneUse(Reg)) return false; // This can point to an ADDI(W) or a LUI: MachineInstr &OffsetTail = *MRI->getVRegDef(Reg); @@ -192,9 +192,11 @@ bool RISCVMergeBaseOffsetOpt::foldLargeOffset(MachineInstr &Hi, MachineOperand &AddiImmOp = OffsetTail.getOperand(2); if (AddiImmOp.getTargetFlags() != RISCVII::MO_None) return false; + Register AddiReg = OffsetTail.getOperand(1).getReg(); + if (!AddiReg.isVirtual()) + return false; int64_t OffLo = AddiImmOp.getImm(); - MachineInstr &OffsetLui = - *MRI->getVRegDef(OffsetTail.getOperand(1).getReg()); + MachineInstr &OffsetLui = *MRI->getVRegDef(AddiReg); MachineOperand &LuiImmOp = OffsetLui.getOperand(1); if (OffsetLui.getOpcode() != RISCV::LUI || LuiImmOp.getTargetFlags() != RISCVII::MO_None || @@ -246,14 +248,14 @@ bool RISCVMergeBaseOffsetOpt::foldShiftedOffset(MachineInstr &Hi, TailShXAdd.getOpcode() == RISCV::SH3ADD) && "Expected SHXADD instruction!"); - // The first source is the shifted operand. - Register Rs1 = TailShXAdd.getOperand(1).getReg(); - if (GAReg != TailShXAdd.getOperand(2).getReg()) return false; + // The first source is the shifted operand. + Register Rs1 = TailShXAdd.getOperand(1).getReg(); + // Can't fold if the register has more than one use. - if (!MRI->hasOneUse(Rs1)) + if (!Rs1.isVirtual() || !MRI->hasOneUse(Rs1)) return false; // This can point to an ADDI X0, C. MachineInstr &OffsetTail = *MRI->getVRegDef(Rs1); diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll index 7c2f775bca14e..74652cbc73b60 100644 --- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll +++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll @@ -964,3 +964,69 @@ for.body: ; preds = %for.body.lr.ph, %fo } declare void @f(ptr) + +@g = external dso_local global [100 x [100 x i8]] + +; This test used to crash due to calling getVRegDef on X0. 
+define i32 @crash() { +; RV32I-LABEL: crash: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: li a0, 1 +; RV32I-NEXT: lui a1, %hi(g) +; RV32I-NEXT: addi a1, a1, %lo(g) +; RV32I-NEXT: add a0, a1, a0 +; RV32I-NEXT: lbu a0, 400(a0) +; RV32I-NEXT: seqz a0, a0 +; RV32I-NEXT: sw a0, 0(zero) +; RV32I-NEXT: li a0, 0 +; RV32I-NEXT: ret +; +; RV32I-MEDIUM-LABEL: crash: +; RV32I-MEDIUM: # %bb.0: # %entry +; RV32I-MEDIUM-NEXT: li a0, 1 +; RV32I-MEDIUM-NEXT: .Lpcrel_hi14: +; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(g) +; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi14) +; RV32I-MEDIUM-NEXT: add a0, a1, a0 +; RV32I-MEDIUM-NEXT: lbu a0, 400(a0) +; RV32I-MEDIUM-NEXT: seqz a0, a0 +; RV32I-MEDIUM-NEXT: sw a0, 0(zero) +; RV32I-MEDIUM-NEXT: li a0, 0 +; RV32I-MEDIUM-NEXT: ret +; +; RV64I-LABEL: crash: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: li a0, 1 +; RV64I-NEXT: lui a1, %hi(g) +; RV64I-NEXT: addi a1, a1, %lo(g) +; RV64I-NEXT: add a0, a1, a0 +; RV64I-NEXT: lbu a0, 400(a0) +; RV64I-NEXT: seqz a0, a0 +; RV64I-NEXT: sw a0, 0(zero) +; RV64I-NEXT: li a0, 0 +; RV64I-NEXT: ret +; +; RV64I-MEDIUM-LABEL: crash: +; RV64I-MEDIUM: # %bb.0: # %entry +; RV64I-MEDIUM-NEXT: li a0, 1 +; RV64I-MEDIUM-NEXT: .Lpcrel_hi14: +; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(g) +; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi14) +; RV64I-MEDIUM-NEXT: add a0, a1, a0 +; RV64I-MEDIUM-NEXT: lbu a0, 400(a0) +; RV64I-MEDIUM-NEXT: seqz a0, a0 +; RV64I-MEDIUM-NEXT: sw a0, 0(zero) +; RV64I-MEDIUM-NEXT: li a0, 0 +; RV64I-MEDIUM-NEXT: ret +entry: + %idxprom7.peel = sext i32 1 to i64 + br label %for.inc.peel + +for.inc.peel: ; preds = %entry + %arrayidx8.3.peel = getelementptr [100 x [100 x i8]], ptr @g, i64 0, i64 4, i64 %idxprom7.peel + %0 = load i8, ptr %arrayidx8.3.peel, align 1 + %tobool.not.3.peel = icmp eq i8 %0, 0 + %spec.select = select i1 %tobool.not.3.peel, i32 1, i32 0 + store i32 %spec.select, ptr null, align 4 + ret i32 0 +} From 89592061a4d53a5b78ca033fb13ba9f9f27ab1b7 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Fri, 19 Jan 2024 15:13:58 -0500 Subject: [PATCH 169/843] Remove an unused API; NFC Not only is this unused, it's really confusing having getAPValueResult() and getResultAsAPValue() as sibling APIs --- clang/include/clang/AST/Expr.h | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 9de3e35609467..9820bd11da860 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -1136,7 +1136,6 @@ class ConstantExpr final return ConstantExprBits.APValueKind != APValue::None; } APValue getAPValueResult() const; - APValue &getResultAsAPValue() const { return APValueResult(); } llvm::APSInt getResultAsAPSInt() const; // Iterators child_range children() { return child_range(&SubExpr, &SubExpr+1); } From 2c0d20668a11dd4cc9a678706cd3f0312ed97b23 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Fri, 19 Jan 2024 12:28:06 -0800 Subject: [PATCH 170/843] [libc] remove extra -Werror (#78761) -Werror is now a global default as of commit c52b467875e2 ("Reapply "[libc] build with -Werror (#73966)" (#74506)") --- libc/src/signal/linux/CMakeLists.txt | 1 - 1 file changed, 1 deletion(-) diff --git a/libc/src/signal/linux/CMakeLists.txt b/libc/src/signal/linux/CMakeLists.txt index d0cb0a79312b8..77a2453b25a0a 100644 --- a/libc/src/signal/linux/CMakeLists.txt +++ b/libc/src/signal/linux/CMakeLists.txt @@ -41,7 +41,6 @@ add_object_library( -fomit-frame-pointer -O3 -Wframe-larger-than=0 - -Werror -Wno-attributes # asan creates asan.module_ctor which 
uses stack space, causing warnings. -fno-sanitize=address From f5e58a038033c8736fed91a7e89a5faad462abcc Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Fri, 19 Jan 2024 12:39:35 -0800 Subject: [PATCH 171/843] [lld][ELF] Simplify handleLibcall. NFC (#78659) I noticed this while working on #78658 --- lld/ELF/Driver.cpp | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 5ccc65600dcb9..07f4263c90e62 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2081,13 +2081,7 @@ static void handleUndefinedGlob(StringRef arg) { static void handleLibcall(StringRef name) { Symbol *sym = symtab.find(name); - if (!sym || !sym->isLazy()) - return; - - MemoryBufferRef mb; - mb = cast(sym)->file->mb; - - if (isBitcode(mb)) + if (sym && sym->isLazy() && isa(sym->file)) sym->extract(); } From 4482fd846af849f399f915accca58d1c904243ef Mon Sep 17 00:00:00 2001 From: Pranav Kant Date: Fri, 19 Jan 2024 21:02:20 +0000 Subject: [PATCH 172/843] Revert "[InstCombine] Try to fold trunc(shuffle(zext)) to just a shuffle (#78636)" This reverts commit 4d11f04b20f0bd7488e19e8f178ba028412fa519. This breaks some programs as mentioned in #78636 --- llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp | 10 ---------- .../InstCombine/logical-select-inseltpoison.ll | 7 +++++-- llvm/test/Transforms/InstCombine/logical-select.ll | 7 +++++-- 3 files changed, 10 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 3470e61cd597f..6629ca840a67c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -103,13 +103,6 @@ Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty, } } break; - case Instruction::ShuffleVector: { - Value *Op0 = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned); - Value *Op1 = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned); - Res = new ShuffleVectorInst(Op0, Op1, - cast(I)->getShuffleMask()); - break; - } default: // TODO: Can handle more cases here. llvm_unreachable("Unreachable!"); @@ -370,9 +363,6 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombinerImpl &IC, I->getOpcode() == Instruction::FPToSI); return Ty->getScalarSizeInBits() >= MinBitWidth; } - case Instruction::ShuffleVector: - return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && - canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); default: // TODO: Can handle more cases here. 
break; diff --git a/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll b/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll index b3d147621b59e..29e2cb42e1bef 100644 --- a/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll @@ -671,8 +671,11 @@ define <4 x i32> @computesignbits_through_shuffles(<4 x float> %x, <4 x float> % define <4 x i32> @computesignbits_through_two_input_shuffle(<4 x i32> %x, <4 x i32> %y, <4 x i1> %cond1, <4 x i1> %cond2) { ; CHECK-LABEL: @computesignbits_through_two_input_shuffle( -; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i1> [[COND1:%.*]], <4 x i1> [[COND2:%.*]], <4 x i32> -; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[COND]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] +; CHECK-NEXT: [[SEXT1:%.*]] = sext <4 x i1> [[COND1:%.*]] to <4 x i32> +; CHECK-NEXT: [[SEXT2:%.*]] = sext <4 x i1> [[COND2:%.*]] to <4 x i32> +; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i32> [[SEXT1]], <4 x i32> [[SEXT2]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[COND]] to <4 x i1> +; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[SEL]] ; %sext1 = sext <4 x i1> %cond1 to <4 x i32> diff --git a/llvm/test/Transforms/InstCombine/logical-select.ll b/llvm/test/Transforms/InstCombine/logical-select.ll index af1a3e1455e49..fcca9588767dd 100644 --- a/llvm/test/Transforms/InstCombine/logical-select.ll +++ b/llvm/test/Transforms/InstCombine/logical-select.ll @@ -707,8 +707,11 @@ define <4 x i32> @computesignbits_through_shuffles(<4 x float> %x, <4 x float> % define <4 x i32> @computesignbits_through_two_input_shuffle(<4 x i32> %x, <4 x i32> %y, <4 x i1> %cond1, <4 x i1> %cond2) { ; CHECK-LABEL: @computesignbits_through_two_input_shuffle( -; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i1> [[COND1:%.*]], <4 x i1> [[COND2:%.*]], <4 x i32> -; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[COND]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] +; CHECK-NEXT: [[SEXT1:%.*]] = sext <4 x i1> [[COND1:%.*]] to <4 x i32> +; CHECK-NEXT: [[SEXT2:%.*]] = sext <4 x i1> [[COND2:%.*]] to <4 x i32> +; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i32> [[SEXT1]], <4 x i32> [[SEXT2]], <4 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[COND]] to <4 x i1> +; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[SEL]] ; %sext1 = sext <4 x i1> %cond1 to <4 x i32> From 9175dd9cbcad01a47acea9f1b99a0c96bf1a9a29 Mon Sep 17 00:00:00 2001 From: Eric Miotto Date: Fri, 19 Jan 2024 13:32:32 -0800 Subject: [PATCH 173/843] [CMake] Detect properly new linker introduced in Xcode 15 (#77806) As explained in [1], this linker is functionally equivalent to the classic one (`ld64`) for build system purposes -- in particular to enable the use of order files to link `clang`. For this reason, in addition to fixing the detection rename `LLVM_LINKER_IS_LD64` to `LLVM_LINKER_IS_APPLE` to make the result of such detection more clear -- this should not cause any issue to downstream users, from a quick search in SourceGraph [2], only Swift uses the value of this variable (which I will take care of updating in due time). 
[1]: https://developer.apple.com/documentation/xcode-release-notes/xcode-15-release-notes#Linking [2]: https://sourcegraph.com/search?q=context:global+LLVM_LINKER_IS_LD64+lang:cmake+fork:no+-file:AddLLVM.cmake+-file:clang/tools/driver/CMakeLists.txt&patternType=standard&sm=1&groupBy=repo rdar://120740222 --- clang/tools/driver/CMakeLists.txt | 4 ++-- llvm/cmake/modules/AddLLVM.cmake | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/tools/driver/CMakeLists.txt b/clang/tools/driver/CMakeLists.txt index 2182486f93a55..d70b92b0984e5 100644 --- a/clang/tools/driver/CMakeLists.txt +++ b/clang/tools/driver/CMakeLists.txt @@ -103,10 +103,10 @@ if (APPLE) endif() if(CLANG_ORDER_FILE AND - (LLVM_LINKER_IS_LD64 OR LLVM_LINKER_IS_GOLD OR LLVM_LINKER_IS_LLD)) + (LLVM_LINKER_IS_APPLE OR LLVM_LINKER_IS_GOLD OR LLVM_LINKER_IS_LLD)) include(LLVMCheckLinkerFlag) - if (LLVM_LINKER_IS_LD64 OR (LLVM_LINKER_IS_LLD AND APPLE)) + if (LLVM_LINKER_IS_APPLE OR (LLVM_LINKER_IS_LLD AND APPLE)) set(LINKER_ORDER_FILE_OPTION "-Wl,-order_file,${CLANG_ORDER_FILE}") elseif (LLVM_LINKER_IS_GOLD) set(LINKER_ORDER_FILE_OPTION "-Wl,--section-ordering-file,${CLANG_ORDER_FILE}") diff --git a/llvm/cmake/modules/AddLLVM.cmake b/llvm/cmake/modules/AddLLVM.cmake index 14c0837c35964..5e98961855282 100644 --- a/llvm/cmake/modules/AddLLVM.cmake +++ b/llvm/cmake/modules/AddLLVM.cmake @@ -211,10 +211,10 @@ if (NOT DEFINED LLVM_LINKER_DETECTED AND NOT WIN32) ) if(APPLE) - if("${stderr}" MATCHES "PROJECT:ld64") + if("${stderr}" MATCHES "PROGRAM:ld") set(LLVM_LINKER_DETECTED YES CACHE INTERNAL "") - set(LLVM_LINKER_IS_LD64 YES CACHE INTERNAL "") - message(STATUS "Linker detection: ld64") + set(LLVM_LINKER_IS_APPLE YES CACHE INTERNAL "") + message(STATUS "Linker detection: Apple") elseif("${stderr}" MATCHES "^LLD" OR "${stdout}" MATCHES "^LLD") set(LLVM_LINKER_DETECTED YES CACHE INTERNAL "") From 5954b9dca21bb0c69b9e991b2ddb84c8b05ecba3 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Fri, 19 Jan 2024 13:36:59 -0800 Subject: [PATCH 174/843] [InstrProf] Adding utility weights to BalancedPartitioning (#72717) Adding weights to utility nodes in BP so that we can give more importance to certain utilities. This is useful when we optimize several objectives jointly. --- .../llvm/Support/BalancedPartitioning.h | 26 ++++++- llvm/lib/ProfileData/InstrProf.cpp | 9 +-- llvm/lib/Support/BalancedPartitioning.cpp | 57 +++++++++------ .../ProfileData/BPFunctionNodeTest.cpp | 30 +++++--- .../Support/BalancedPartitioningTest.cpp | 73 +++++++++++++++++-- 5 files changed, 146 insertions(+), 49 deletions(-) diff --git a/llvm/include/llvm/Support/BalancedPartitioning.h b/llvm/include/llvm/Support/BalancedPartitioning.h index a8464ac0fe60e..19eb6d41e45e3 100644 --- a/llvm/include/llvm/Support/BalancedPartitioning.h +++ b/llvm/include/llvm/Support/BalancedPartitioning.h @@ -57,8 +57,27 @@ class BPFunctionNode { friend class BalancedPartitioning; public: + /// The type of ID using IDT = uint64_t; - using UtilityNodeT = uint32_t; + /// The type of UtilityNode + struct UtilityNodeT { + UtilityNodeT(uint32_t Id, uint32_t Weight = 1) : Id(Id), Weight(Weight) {} + uint32_t Id; + uint32_t Weight; + + // Deduplicate utility nodes for a given function. + // TODO: One may experiment with accumulating the weights of duplicates. 
+ static void sortAndDeduplicate(SmallVector &UNs) { + llvm::sort(UNs, [](const UtilityNodeT &L, const UtilityNodeT &R) { + return L.Id < R.Id; + }); + UNs.erase(std::unique(UNs.begin(), UNs.end(), + [](const UtilityNodeT &L, const UtilityNodeT &R) { + return L.Id == R.Id; + }), + UNs.end()); + } + }; /// \param UtilityNodes the set of utility nodes (must be unique'd) BPFunctionNode(IDT Id, ArrayRef UtilityNodes) @@ -78,8 +97,7 @@ class BPFunctionNode { uint64_t InputOrderIndex = 0; friend class BPFunctionNodeTest_Basic_Test; - friend class BalancedPartitioningTest_Basic_Test; - friend class BalancedPartitioningTest_Large_Test; + friend class BalancedPartitioningTest; }; /// Algorithm parameters; default values are tuned on real-world binaries @@ -188,6 +206,8 @@ class BalancedPartitioning { float CachedGainRL; /// Whether \p CachedGainLR and \p CachedGainRL are valid bool CachedGainIsValid = false; + /// The weight of this utility node + uint32_t Weight = 1; }; protected: diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index 2640027455e0d..ba4fb227381f6 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -923,15 +923,15 @@ std::vector TemporalProfTraceTy::createBPFunctionNodes( const int N = Log2_64(LargestTraceSize) + 1; - // TODO: We need to use the Trace.Weight field to give more weight to more + // TODO: We may use the Trace.Weight field to give more weight to more // important utilities DenseMap> FuncGroups; - for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) { + for (uint32_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) { auto &Trace = Traces[TraceIdx].FunctionNameRefs; for (size_t Timestamp = 0; Timestamp < Trace.size(); Timestamp++) { for (int I = Log2_64(Timestamp + 1); I < N; I++) { auto FunctionId = Trace[Timestamp]; - UtilityNodeT GroupId = TraceIdx * N + I; + UtilityNodeT GroupId(TraceIdx * N + I); FuncGroups[FunctionId].push_back(GroupId); } } @@ -940,8 +940,7 @@ std::vector TemporalProfTraceTy::createBPFunctionNodes( std::vector Nodes; for (auto Id : FunctionIds) { auto &UNs = FuncGroups[Id]; - llvm::sort(UNs); - UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end()); + UtilityNodeT::sortAndDeduplicate(UNs); Nodes.emplace_back(Id, UNs); } return Nodes; diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp index cb6ba61179941..cd71ddec47647 100644 --- a/llvm/lib/Support/BalancedPartitioning.cpp +++ b/llvm/lib/Support/BalancedPartitioning.cpp @@ -21,8 +21,13 @@ using namespace llvm; #define DEBUG_TYPE "balanced-partitioning" void BPFunctionNode::dump(raw_ostream &OS) const { - OS << formatv("{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}", Id, - make_range(UtilityNodes.begin(), UtilityNodes.end()), Bucket); + OS << "{ID=" << Id << " Utilities={"; + for (auto &N : UtilityNodes) + OS << N.Id << " ,"; + OS << "}"; + if (Bucket.has_value()) + OS << " Bucket=" << Bucket.value(); + OS << "}"; } template @@ -166,34 +171,40 @@ void BalancedPartitioning::runIterations(const FunctionNodeRange Nodes, unsigned RecDepth, unsigned LeftBucket, unsigned RightBucket, std::mt19937 &RNG) const { - unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end()); - DenseMap UtilityNodeIndex; + // Count the degree of each utility node. 
+ DenseMap UtilityNodeIndex; for (auto &N : Nodes) for (auto &UN : N.UtilityNodes) - ++UtilityNodeIndex[UN]; + ++UtilityNodeIndex[UN.Id]; // Remove utility nodes if they have just one edge or are connected to all - // functions + // functions. + unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end()); for (auto &N : Nodes) llvm::erase_if(N.UtilityNodes, [&](auto &UN) { - return UtilityNodeIndex[UN] == 1 || UtilityNodeIndex[UN] == NumNodes; + return UtilityNodeIndex[UN.Id] == 1 || + UtilityNodeIndex[UN.Id] == NumNodes; }); - // Renumber utility nodes so they can be used to index into Signatures + // Renumber utility nodes so they can be used to index into Signatures. UtilityNodeIndex.clear(); for (auto &N : Nodes) for (auto &UN : N.UtilityNodes) - UN = UtilityNodeIndex.insert({UN, UtilityNodeIndex.size()}).first->second; + UN.Id = UtilityNodeIndex.insert({UN.Id, UtilityNodeIndex.size()}) + .first->second; - // Initialize signatures + // Initialize signatures. SignaturesT Signatures(/*Size=*/UtilityNodeIndex.size()); for (auto &N : Nodes) { for (auto &UN : N.UtilityNodes) { - assert(UN < Signatures.size()); - if (N.Bucket == LeftBucket) { - Signatures[UN].LeftCount++; - } else { - Signatures[UN].RightCount++; - } + assert(UN.Id < Signatures.size()); + if (N.Bucket == LeftBucket) + Signatures[UN.Id].LeftCount++; + else + Signatures[UN.Id].RightCount++; + // Identical utility nodes (having the same UN.Id) have the same weight + // (unless there are hash collisions mapping utilities to the same Id); + // thus, we get a new weight only when the signature is uninitialized. + Signatures[UN.Id].Weight = UN.Weight; } } @@ -221,9 +232,11 @@ unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes, Signature.CachedGainLR = 0.f; Signature.CachedGainRL = 0.f; if (L > 0) - Signature.CachedGainLR = Cost - logCost(L - 1, R + 1); + Signature.CachedGainLR = + (Cost - logCost(L - 1, R + 1)) * Signature.Weight; if (R > 0) - Signature.CachedGainRL = Cost - logCost(L + 1, R - 1); + Signature.CachedGainRL = + (Cost - logCost(L + 1, R - 1)) * Signature.Weight; Signature.CachedGainIsValid = true; } @@ -282,14 +295,14 @@ bool BalancedPartitioning::moveFunctionNode(BPFunctionNode &N, // Update signatures and invalidate gain cache if (FromLeftToRight) { for (auto &UN : N.UtilityNodes) { - auto &Signature = Signatures[UN]; + auto &Signature = Signatures[UN.Id]; Signature.LeftCount--; Signature.RightCount++; Signature.CachedGainIsValid = false; } } else { for (auto &UN : N.UtilityNodes) { - auto &Signature = Signatures[UN]; + auto &Signature = Signatures[UN.Id]; Signature.LeftCount++; Signature.RightCount--; Signature.CachedGainIsValid = false; @@ -318,8 +331,8 @@ float BalancedPartitioning::moveGain(const BPFunctionNode &N, const SignaturesT &Signatures) { float Gain = 0.f; for (auto &UN : N.UtilityNodes) - Gain += (FromLeftToRight ? Signatures[UN].CachedGainLR - : Signatures[UN].CachedGainRL); + Gain += (FromLeftToRight ? 
Signatures[UN.Id].CachedGainLR + : Signatures[UN.Id].CachedGainRL); return Gain; } diff --git a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp index 6af6f1bcdc40a..787e3041fb5e7 100644 --- a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp +++ b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp @@ -13,8 +13,8 @@ #include "gtest/gtest.h" using testing::Field; +using testing::Matcher; using testing::UnorderedElementsAre; -using testing::UnorderedElementsAreArray; namespace llvm { @@ -24,29 +24,35 @@ void PrintTo(const BPFunctionNode &Node, std::ostream *OS) { } TEST(BPFunctionNodeTest, Basic) { + auto UNIdsAre = [](auto... Ids) { + return UnorderedElementsAre(Field("Id", &BPFunctionNode::UtilityNodeT::Id, + std::forward(Ids))...); + }; auto NodeIs = [](BPFunctionNode::IDT Id, - ArrayRef UNs) { - return AllOf(Field("Id", &BPFunctionNode::Id, Id), - Field("UtilityNodes", &BPFunctionNode::UtilityNodes, - UnorderedElementsAreArray(UNs))); + Matcher> UNsMatcher) { + return AllOf( + Field("Id", &BPFunctionNode::Id, Id), + Field("UtilityNodes", &BPFunctionNode::UtilityNodes, UNsMatcher)); }; auto Nodes = TemporalProfTraceTy::createBPFunctionNodes({ TemporalProfTraceTy({0, 1, 2, 3}), }); - EXPECT_THAT(Nodes, - UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}), - NodeIs(2, {1, 2}), NodeIs(3, {2}))); + EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, UNIdsAre(0, 1, 2)), + NodeIs(1, UNIdsAre(1, 2)), + NodeIs(2, UNIdsAre(1, 2)), + NodeIs(3, UNIdsAre(2)))); Nodes = TemporalProfTraceTy::createBPFunctionNodes({ TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2}), }); - EXPECT_THAT(Nodes, - UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}), - NodeIs(2, {1, 2, 4, 5}), NodeIs(3, {2}), - NodeIs(4, {2, 3, 4, 5}))); + EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, UNIdsAre(0, 1, 2)), + NodeIs(1, UNIdsAre(1, 2)), + NodeIs(2, UNIdsAre(1, 2, 4, 5)), + NodeIs(3, UNIdsAre(2)), + NodeIs(4, UNIdsAre(2, 3, 4, 5)))); } } // end namespace llvm diff --git a/llvm/unittests/Support/BalancedPartitioningTest.cpp b/llvm/unittests/Support/BalancedPartitioningTest.cpp index ebe518a8e89ca..79e89ae201480 100644 --- a/llvm/unittests/Support/BalancedPartitioningTest.cpp +++ b/llvm/unittests/Support/BalancedPartitioningTest.cpp @@ -37,6 +37,20 @@ class BalancedPartitioningTest : public ::testing::Test { Ids.push_back(N.Id); return Ids; } + + static testing::Matcher NodeIdIs(BPFunctionNode::IDT Id) { + return Field("Id", &BPFunctionNode::Id, Id); + }; + + static testing::Matcher + NodeBucketIs(std::optional Bucket) { + return Field("Bucket", &BPFunctionNode::Bucket, Bucket); + }; + + static testing::Matcher + NodeIs(BPFunctionNode::IDT Id, std::optional Bucket) { + return AllOf(NodeIdIs(Id), NodeBucketIs(Bucket)); + }; }; TEST_F(BalancedPartitioningTest, Basic) { @@ -48,11 +62,6 @@ TEST_F(BalancedPartitioningTest, Basic) { Bp.run(Nodes); - auto NodeIs = [](BPFunctionNode::IDT Id, std::optional Bucket) { - return AllOf(Field("Id", &BPFunctionNode::Id, Id), - Field("Bucket", &BPFunctionNode::Bucket, Bucket)); - }; - EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, 0), NodeIs(1, 1), NodeIs(2, 2), NodeIs(3, 3), NodeIs(4, 4))); @@ -79,8 +88,7 @@ TEST_F(BalancedPartitioningTest, Large) { Bp.run(Nodes); - EXPECT_THAT( - Nodes, Each(Not(Field("Bucket", &BPFunctionNode::Bucket, std::nullopt)))); + EXPECT_THAT(Nodes, Each(Not(NodeBucketIs(std::nullopt)))); EXPECT_THAT(getIds(Nodes), UnorderedElementsAreArray(OrigIds)); } @@ -97,4 +105,55 @@ 
TEST_F(BalancedPartitioningTest, MoveGain) {
                           30.f);
 }
 
+TEST_F(BalancedPartitioningTest, Weight1) {
+  std::vector UNs = {
+      BPFunctionNode::UtilityNodeT(0, 100),
+      BPFunctionNode::UtilityNodeT(1, 100),
+      BPFunctionNode::UtilityNodeT(2, 100),
+      BPFunctionNode::UtilityNodeT(3, 1),
+      BPFunctionNode::UtilityNodeT(4, 1),
+  };
+  std::vector Nodes = {
+      BPFunctionNode(0, {UNs[0], UNs[3]}), BPFunctionNode(1, {UNs[1], UNs[3]}),
+      BPFunctionNode(2, {UNs[2], UNs[3]}), BPFunctionNode(3, {UNs[0], UNs[4]}),
+      BPFunctionNode(4, {UNs[1], UNs[4]}), BPFunctionNode(5, {UNs[2], UNs[4]}),
+  };
+
+  Bp.run(Nodes);
+
+  // Check that nodes that share important UNs are ordered together
+  auto NodesRef = ArrayRef(Nodes);
+  auto Groups = {NodesRef.slice(0, 2), NodesRef.slice(2, 2),
+                 NodesRef.slice(4, 2)};
+  EXPECT_THAT(Groups, UnorderedElementsAre(
+                          UnorderedElementsAre(NodeIdIs(0), NodeIdIs(3)),
+                          UnorderedElementsAre(NodeIdIs(1), NodeIdIs(4)),
+                          UnorderedElementsAre(NodeIdIs(2), NodeIdIs(5))));
+}
+
+TEST_F(BalancedPartitioningTest, Weight2) {
+  std::vector UNs = {
+      BPFunctionNode::UtilityNodeT(0, 1),
+      BPFunctionNode::UtilityNodeT(1, 1),
+      BPFunctionNode::UtilityNodeT(2, 1),
+      BPFunctionNode::UtilityNodeT(3, 100),
+      BPFunctionNode::UtilityNodeT(4, 100),
+  };
+  std::vector Nodes = {
+      BPFunctionNode(0, {UNs[0], UNs[3]}), BPFunctionNode(1, {UNs[1], UNs[4]}),
+      BPFunctionNode(2, {UNs[2], UNs[3]}), BPFunctionNode(3, {UNs[0], UNs[4]}),
+      BPFunctionNode(4, {UNs[1], UNs[3]}), BPFunctionNode(5, {UNs[2], UNs[4]}),
+  };
+
+  Bp.run(Nodes);
+
+  // Check that nodes that share important UNs are ordered together
+  auto NodesRef = ArrayRef(Nodes);
+  auto Groups = {NodesRef.slice(0, 3), NodesRef.slice(3, 3)};
+  EXPECT_THAT(Groups,
+              UnorderedElementsAre(
+                  UnorderedElementsAre(NodeIdIs(0), NodeIdIs(2), NodeIdIs(4)),
+                  UnorderedElementsAre(NodeIdIs(1), NodeIdIs(3), NodeIdIs(5))));
+}
+
 } // end namespace llvm

From 0388ab3e29de843dea823b6ef0c6d0ccc56b0a16 Mon Sep 17 00:00:00 2001
From: Danila Malyutin
Date: Sat, 20 Jan 2024 01:44:00 +0400
Subject: [PATCH 175/843] [Statepoint][NFC] Use uint16_t and add an assert
 (#78717)

Use a fixed width integer type and assert that DwarfRegNum fits in 16
bits. This is a follow-up to review comments on #78600.
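For illustration, the added guard boils down to the following narrowing pattern (a sketch only -- `toDwarfRegNum16` is a made-up helper; `isUInt` comes from `llvm/Support/MathExtras.h`):

```cpp
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

// Dwarf register numbers are now stored in uint16_t fields, so a value
// obtained from the target must be checked before the narrowing copy.
static uint16_t toDwarfRegNum16(int64_t RegNum) {
  assert(RegNum >= 0 && llvm::isUInt<16>(RegNum) &&
         "Invalid Dwarf register number.");
  return static_cast<uint16_t>(RegNum);
}
```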
--- llvm/include/llvm/CodeGen/StackMaps.h | 18 ++++++++---------- llvm/lib/CodeGen/StackMaps.cpp | 2 +- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/llvm/include/llvm/CodeGen/StackMaps.h b/llvm/include/llvm/CodeGen/StackMaps.h index ca5dfa5666003..578020ca5501a 100644 --- a/llvm/include/llvm/CodeGen/StackMaps.h +++ b/llvm/include/llvm/CodeGen/StackMaps.h @@ -259,7 +259,7 @@ class StatepointOpers { class StackMaps { public: struct Location { - enum LocationType : unsigned short { + enum LocationType : uint16_t { Unprocessed, Register, Direct, @@ -268,24 +268,22 @@ class StackMaps { ConstantIndex }; LocationType Type = Unprocessed; - unsigned short Size = 0; - unsigned short Reg = 0; + uint16_t Size = 0; + uint16_t Reg = 0; int32_t Offset = 0; Location() = default; - Location(LocationType Type, unsigned short Size, unsigned short Reg, - int32_t Offset) + Location(LocationType Type, uint16_t Size, uint16_t Reg, int32_t Offset) : Type(Type), Size(Size), Reg(Reg), Offset(Offset) {} }; struct LiveOutReg { - unsigned short Reg = 0; - unsigned short DwarfRegNum = 0; - unsigned short Size = 0; + uint16_t Reg = 0; + uint16_t DwarfRegNum = 0; + uint16_t Size = 0; LiveOutReg() = default; - LiveOutReg(unsigned short Reg, unsigned short DwarfRegNum, - unsigned short Size) + LiveOutReg(uint16_t Reg, uint16_t DwarfRegNum, uint16_t Size) : Reg(Reg), DwarfRegNum(DwarfRegNum), Size(Size) {} }; diff --git a/llvm/lib/CodeGen/StackMaps.cpp b/llvm/lib/CodeGen/StackMaps.cpp index f45bcaf16ab70..90aa93e442cf3 100644 --- a/llvm/lib/CodeGen/StackMaps.cpp +++ b/llvm/lib/CodeGen/StackMaps.cpp @@ -200,7 +200,7 @@ static unsigned getDwarfRegNum(unsigned Reg, const TargetRegisterInfo *TRI) { break; } - assert(RegNum >= 0 && "Invalid Dwarf register number."); + assert(RegNum >= 0 && isUInt<16>(RegNum) && "Invalid Dwarf register number."); return (unsigned)RegNum; } From 58780b811c23df3d928d8452ee21c862dde754a2 Mon Sep 17 00:00:00 2001 From: Konstantin Varlamov Date: Fri, 19 Jan 2024 13:48:13 -0800 Subject: [PATCH 176/843] [libc++][hardening] In production hardening modes, trap rather than abort (#78561) In the hardening modes that can be used in production (`fast` and `extensive`), make a failed assertion invoke a trap instruction rather than calling verbose abort. In the debug mode, still keep calling verbose abort to provide a better user experience and to allow us to keep our existing testing infrastructure for verifying assertion messages. Since the debug mode by definition enables all assertions, we can be sure that we still check all the assertion messages in the library when running the test suite in the debug mode. The main motivation to use trapping in production is to achieve better code generation and reduce the binary size penalty. This way, the assertion handler can compile to a single instruction, whereas the existing mechanism with verbose abort results in generating a function call that in general cannot be optimized away (made worse by the fact that it's a variadic function, imposing an additional penalty). See the [RFC](https://discourse.llvm.org/t/rfc-hardening-in-libc/73925) for more details. Note that this mechanism can now be completely [overridden at CMake configuration time](https://github.com/llvm/llvm-project/pull/77883). This patch also significantly refactors `check_assertion.h` and expands its test coverage. The main changes: - when overriding `verbose_abort`, don't do matching inside the function -- just print the error message to `stderr`. 
This removes the need to set a global matcher and allows to do matching in the parent process after the child finishes; - remove unused logic for matching source locations and for using wildcards; - make matchers simple functors; - introduce `DeathTestResult` that keeps data about the test run, primarily to make it easier to test. In addition to the refactoring, `check_assertion.h` can now recognize when a process exits due to a trap. --- libcxx/docs/BuildingLibcxx.rst | 41 +- libcxx/docs/ReleaseNotes/18.rst | 21 +- .../alg.sorting/assert.min.max.pass.cpp | 2 +- .../assert.sort.invalid_comparator.pass.cpp | 2 +- ...tomize_verbose_abort.compile-time.pass.cpp | 2 +- ...customize_verbose_abort.link-time.pass.cpp | 2 +- .../assertions/default_verbose_abort.pass.cpp | 4 +- ...assertions_enables_extensive_mode.pass.cpp | 5 +- .../assertions/modes/extensive.pass.cpp | 1 - .../libcxx/assertions/modes/fast.pass.cpp | 1 - .../override_with_extensive_mode.pass.cpp | 5 +- .../modes/override_with_fast_mode.pass.cpp | 5 +- .../array/array.zero/assert.back.pass.cpp | 2 +- .../array/array.zero/assert.front.pass.cpp | 2 +- .../array.zero/assert.subscript.pass.cpp | 2 +- .../deque/assert.pop_back.empty.pass.cpp | 2 +- .../assert.erase_iter.end.pass.cpp | 2 +- .../assert.pop_back.empty.pass.cpp | 2 +- .../vector/assert.back.empty.pass.cpp | 2 +- .../vector/assert.cback.empty.pass.cpp | 2 +- .../vector/assert.cfront.empty.pass.cpp | 2 +- .../vector/assert.cindex.oob.pass.cpp | 2 +- .../vector/assert.front.empty.pass.cpp | 2 +- .../vector/assert.index.oob.pass.cpp | 2 +- .../vector/assert.pop_back.empty.pass.cpp | 2 +- .../unord/unord.map/assert.bucket.pass.cpp | 2 +- .../unord.map/assert.bucket_size.pass.cpp | 2 +- .../unord.map/assert.max_load_factor.pass.cpp | 2 +- .../unord.multimap/assert.bucket.pass.cpp | 2 +- .../assert.bucket_size.pass.cpp | 2 +- .../assert.max_load_factor.pass.cpp | 2 +- .../unord.multiset/assert.bucket.pass.cpp | 2 +- .../assert.bucket_size.pass.cpp | 2 +- .../assert.max_load_factor.pass.cpp | 2 +- .../unord/unord.set/assert.bucket.pass.cpp | 2 +- .../unord.set/assert.bucket_size.pass.cpp | 2 +- .../unord.set/assert.max_load_factor.pass.cpp | 2 +- .../mdspan/extents/assert.conversion.pass.cpp | 2 +- .../extents/assert.ctor_from_array.pass.cpp | 2 +- .../assert.ctor_from_integral.pass.cpp | 2 +- .../extents/assert.ctor_from_span.pass.cpp | 2 +- .../views/mdspan/extents/assert.obs.pass.cpp | 2 +- .../layout_left/assert.conversion.pass.cpp | 2 +- .../layout_left/assert.ctor.extents.pass.cpp | 2 +- .../assert.ctor.layout_right.pass.cpp | 2 +- .../assert.ctor.layout_stride.pass.cpp | 2 +- .../assert.index_operator.pass.cpp | 2 +- .../mdspan/layout_left/assert.stride.pass.cpp | 2 +- .../layout_right/assert.conversion.pass.cpp | 2 +- .../layout_right/assert.ctor.extents.pass.cpp | 2 +- .../assert.ctor.layout_left.pass.cpp | 2 +- .../assert.ctor.layout_stride.pass.cpp | 2 +- .../assert.index_operator.pass.cpp | 2 +- .../layout_right/assert.stride.pass.cpp | 2 +- .../layout_stride/assert.conversion.pass.cpp | 2 +- ...ert.ctor.extents_array.non_unique.pass.cpp | 2 +- .../assert.ctor.extents_array.pass.cpp | 2 +- ...sert.ctor.extents_span.non_unique.pass.cpp | 2 +- .../assert.ctor.extents_span.pass.cpp | 2 +- .../assert.index_operator.pass.cpp | 2 +- .../layout_stride/assert.stride.pass.cpp | 2 +- .../mdspan/mdspan/assert.conversion.pass.cpp | 2 +- .../mdspan/assert.index_operator.pass.cpp | 2 +- .../views/mdspan/mdspan/assert.size.pass.cpp | 2 +- .../span.cons/assert.iter_sent.pass.cpp | 2 +- 
 .../span.cons/assert.iter_size.pass.cpp | 2 +-
 .../span.cons/assert.other_span.pass.cpp | 2 +-
 .../span.cons/assert.range.pass.cpp | 2 +-
 .../views.span/span.elem/assert.back.pass.cpp | 2 +-
 .../span.elem/assert.front.pass.cpp | 2 +-
 .../span.elem/assert.op_idx.pass.cpp | 2 +-
 .../views.span/span.sub/assert.first.pass.cpp | 2 +-
 .../views.span/span.sub/assert.last.pass.cpp | 2 +-
 .../span.sub/assert.subspan.pass.cpp | 2 +-
 .../path.itr/assert.iterator.pass.cpp | 2 +-
 .../libcxx/iterators/assert.advance.pass.cpp | 2 +-
 .../libcxx/iterators/assert.next.pass.cpp | 2 +-
 .../libcxx/iterators/assert.prev.pass.cpp | 2 +-
 .../bounded_iter/dereference.pass.cpp | 2 +-
 .../counted.iterator/assert.pass.cpp | 2 +-
 .../iterators.common/assert.pass.cpp | 2 +-
 .../range.chunk.by/assert.begin.pass.cpp | 2 +-
 .../range.chunk.by/assert.find-next.pass.cpp | 2 +-
 .../range.chunk.by/assert.find-prev.pass.cpp | 2 +-
 .../range.chunk.by.iter/assert.deref.pass.cpp | 2 +-
 .../assert.increment.pass.cpp | 2 +-
 .../range.drop.while/assert.begin.pass.cpp | 2 +-
 .../assert.equal.pass.cpp | 2 +-
 .../assert.equal.pass.cpp | 2 +-
 .../string.access/assert.back.pass.cpp | 2 +-
 .../string.access/assert.cback.pass.cpp | 2 +-
 .../string.access/assert.cfront.pass.cpp | 2 +-
 .../string.access/assert.cindex.pass.cpp | 2 +-
 .../string.access/assert.front.pass.cpp | 2 +-
 .../string.access/assert.index.pass.cpp | 2 +-
 .../assert.erase_iter.null.pass.cpp | 2 +-
 .../string.modifiers/assert.pop_back.pass.cpp | 2 +-
 .../string.view/assert.ctor.length.pass.cpp | 2 +-
 .../string.view/assert.ctor.pointer.pass.cpp | 2 +-
 .../assert.set_exception.pass.cpp | 2 +-
 ...sert.set_exception_at_thread_exit.pass.cpp | 2 +-
 .../thread.barrier/assert.arrive.pass.cpp | 2 +-
 .../thread.barrier/assert.ctor.pass.cpp | 2 +-
 .../assert.arrive_and_wait.pass.cpp | 2 +-
 .../thread.latch/assert.count_down.pass.cpp | 2 +-
 .../thread/thread.latch/assert.ctor.pass.cpp | 2 +-
 .../thread.semaphore/assert.ctor.pass.cpp | 2 +-
 .../thread.semaphore/assert.release.pass.cpp | 2 +-
 ...ert.exception_guard.no_exceptions.pass.cpp | 2 +-
 .../expected.expected/assert.arrow.pass.cpp | 2 +-
 .../expected.expected/assert.deref.pass.cpp | 2 +-
 .../expected.expected/assert.error.pass.cpp | 2 +-
 .../expected.void/assert.deref.pass.cpp | 2 +-
 .../expected.void/assert.error.pass.cpp | 2 +-
 .../assert.dereference.pass.cpp | 2 +-
 .../assert.op_arrow.pass.cpp | 2 +-
 libcxx/test/support/check_assertion.h | 399 ++++++++++--------
 .../test_check_assertion.pass.cpp | 127 ++++--
 .../vendor/llvm/default_assertion_handler.in | 12 +-
 119 files changed, 483 insertions(+), 354 deletions(-)

diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index 11e250e3f3735..dcd05a28f36d1 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -485,28 +485,25 @@ LLVM-specific options
 
 .. _assertion-handler:
 
 Overriding the default assertion handler
-==========================================
-
-When the library wants to terminate due to an unforeseen condition (such as
-a hardening assertion failure), the program is aborted through a special verbose
-termination function. The library provides a default function that prints an
-error message and calls ``std::abort()``. Note that this function is provided by
-the static or shared library, so it is only available when deploying to
-a platform where the compiled library is sufficiently recent. On older
-platforms, the program will terminate in an unspecified unsuccessful manner, but
-the quality of diagnostics won't be great.
-
-However, vendors can also override that mechanism at CMake configuration time.
-When a hardening assertion fails, the library invokes the
-``_LIBCPP_ASSERTION_HANDLER`` macro. A vendor may provide a header that contains
-a custom definition of this macro and specify the path to the header via the
-``LIBCXX_ASSERTION_HANDLER_FILE`` CMake variable. If provided, this header will
-be included by the library and replace the default implementation. The header
-must not include any standard library headers (directly or transitively) because
-doing so will almost always create a circular dependency. The
-``_LIBCPP_ASSERTION_HANDLER(message)`` macro takes a single parameter that
-contains an error message explaining the hardening failure and some details
-about the source location that triggered it.
+========================================
+
+When the library wants to terminate due to a hardening assertion failure, the
+program is aborted by invoking a trap instruction (or in debug mode, by
+a special verbose termination function that prints an error message and calls
+``std::abort()``). This is done to minimize the code size impact of enabling
+hardening in the library. However, vendors can also override that mechanism at
+CMake configuration time.
+
+Under the hood, a hardening assertion will invoke the
+``_LIBCPP_ASSERTION_HANDLER`` macro upon failure. A vendor may provide a header
+that contains a custom definition of this macro and specify the path to the
+header via the ``LIBCXX_ASSERTION_HANDLER_FILE`` CMake variable. If provided,
+this header will be included by the library and replace the default
+implementation. The header must not include any standard library headers
+(directly or transitively) because doing so will almost always create a circular
+dependency. The ``_LIBCPP_ASSERTION_HANDLER(message)`` macro takes a single
+parameter that contains an error message explaining the hardening failure and
+some details about the source location that triggered it.
 
 When a hardening assertion fails, it means that the program is about to invoke
 library undefined behavior. For this reason, the custom assertion handler is

diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst
index 983363af54e73..37f2a77ca4f85 100644
--- a/libcxx/docs/ReleaseNotes/18.rst
+++ b/libcxx/docs/ReleaseNotes/18.rst
@@ -107,13 +107,18 @@ Deprecations and Removals
   macro is provided to restore the previous behavior, and it will be supported in the LLVM 18 release only.
   In LLVM 19 and beyond, ``_LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT`` will not be honored anymore.
 
-- The only supported way to customize the assertion handler that gets invoked when a hardening assertion fails
-  is now by setting the ``LIBCXX_ASSERTION_HANDLER_FILE`` CMake variable and providing a custom header. See
-  the documentation on overriding the default assertion handler for details.
+- Overriding `__libcpp_verbose_abort` no longer has any effect on library assertions. The only supported way
+  to customize the assertion handler that gets invoked when a hardening assertion fails is now by setting the
+  ``LIBCXX_ASSERTION_HANDLER_FILE`` CMake variable and providing a custom header. See the documentation on
+  overriding the default assertion handler for details.
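To make the new mechanism concrete, a vendor-supplied handler header could look
roughly like the sketch below. This is illustrative only, not part of the patch:
the `vendor_log_failure` hook is a hypothetical placeholder, and (per the
documentation above) the header must not include any standard library headers.

    // my_assertion_handler.h -- minimal sketch of a custom handler header.
    extern "C" void vendor_log_failure(const char* message); // hypothetical hook

    #define _LIBCPP_ASSERTION_HANDLER(message)                           \
      do {                                                               \
        vendor_log_failure(message); /* record the failure out-of-band */\
        __builtin_trap();            /* then terminate, as hardened      \
                                        builds do by default */          \
      } while (false)

Such a header would then be selected when configuring libc++, e.g. with
`-DLIBCXX_ASSERTION_HANDLER_FILE=/path/to/my_assertion_handler.h`.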
The ability to override `__libcpp_verbose_abort` will + be removed in an upcoming release in favor of the new overriding mechanism. + +- In safe mode (which is now equivalent to the ``extensive`` hardening mode), a failed assertion will now + generate a trap rather than a call to verbose abort. - The ``_LIBCPP_AVAILABILITY_CUSTOM_VERBOSE_ABORT_PROVIDED`` macro is not honored anymore in LLVM 18. - Please see the updated documentation about the hardening modes in libc++ and in particular the - ``_LIBCPP_VERBOSE_ABORT`` macro for details. + Please see the updated documentation about the hardening modes in libc++ and in particular on + overriding the default assertion handler. - The headers ````, ````, ````, ````, ````, ````, ````, @@ -136,13 +141,15 @@ Deprecations and Removals Upcoming Deprecations and Removals ---------------------------------- +- The ability to override ``__libcpp_verbose_abort`` will be removed in an upcoming release. + LLVM 19 ~~~~~~~ - The ``LIBCXX_ENABLE_ASSERTIONS`` CMake variable that was used to enable the safe mode will be deprecated and setting it will trigger an error; use the ``LIBCXX_HARDENING_MODE`` variable with the value ``extensive`` instead. Similarly, - the ``_LIBCPP_ENABLE_ASSERTIONS`` macro will be deprecated (setting it to ``1`` still enables the extensive mode the - LLVM 19 release while also issuing a deprecation warning). See :ref:`the hardening documentation + the ``_LIBCPP_ENABLE_ASSERTIONS`` macro will be deprecated (setting it to ``1`` still enables the extensive mode in + the LLVM 19 release while also issuing a deprecation warning). See :ref:`the hardening documentation ` for more details. - The base template for ``std::char_traits`` has been marked as deprecated and will be removed in LLVM 19. If you diff --git a/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp b/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp index bd9dfd4549c4e..7e765d7e84683 100644 --- a/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp +++ b/libcxx/test/libcxx/algorithms/alg.sorting/assert.min.max.pass.cpp @@ -11,7 +11,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator.pass.cpp b/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator.pass.cpp index 96c2821c4a654..f90e2a3a1a71f 100644 --- a/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator.pass.cpp +++ b/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator.pass.cpp @@ -11,7 +11,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: !libcpp-hardening-mode=debug -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DEBUG_STRICT_WEAK_ORDERING_CHECK // When the debug mode is enabled, this test fails because we actually catch on the fly that the comparator is not // a strict-weak ordering before we catch that we'd dereference out-of-bounds inside std::sort, which leads to different diff --git a/libcxx/test/libcxx/assertions/customize_verbose_abort.compile-time.pass.cpp b/libcxx/test/libcxx/assertions/customize_verbose_abort.compile-time.pass.cpp index d09881e8cecb9..69154c3f7eaff 
100644 --- a/libcxx/test/libcxx/assertions/customize_verbose_abort.compile-time.pass.cpp +++ b/libcxx/test/libcxx/assertions/customize_verbose_abort.compile-time.pass.cpp @@ -22,6 +22,6 @@ void my_abort(char const*, ...) { } int main(int, char**) { - _LIBCPP_ASSERT(false, "message"); + _LIBCPP_VERBOSE_ABORT("%s", "message"); return EXIT_FAILURE; } diff --git a/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp b/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp index 219c43874e77d..585ab73f2cb26 100644 --- a/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp +++ b/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp @@ -19,6 +19,6 @@ void std::__libcpp_verbose_abort(char const*, ...) { } int main(int, char**) { - _LIBCPP_ASSERT(false, "message"); + std::__libcpp_verbose_abort("%s", "message"); return EXIT_FAILURE; } diff --git a/libcxx/test/libcxx/assertions/default_verbose_abort.pass.cpp b/libcxx/test/libcxx/assertions/default_verbose_abort.pass.cpp index 870f43da4b8f0..0cc4b1e005226 100644 --- a/libcxx/test/libcxx/assertions/default_verbose_abort.pass.cpp +++ b/libcxx/test/libcxx/assertions/default_verbose_abort.pass.cpp @@ -7,7 +7,9 @@ //===----------------------------------------------------------------------===// // Test that the default verbose termination function aborts the program. +// XFAIL: availability-verbose_abort-missing +#include <__verbose_abort> #include #include @@ -19,6 +21,6 @@ void signal_handler(int signal) { int main(int, char**) { if (std::signal(SIGABRT, signal_handler) != SIG_ERR) - _LIBCPP_ASSERT(false, "foo"); + std::__libcpp_verbose_abort("%s", "foo"); return EXIT_FAILURE; } diff --git a/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp b/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp index a40ae84fa8e85..a91ba04176374 100644 --- a/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp +++ b/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp @@ -12,8 +12,9 @@ // `check_assertion.h` is only available starting from C++11 and requires Unix headers and regex support. // UNSUPPORTED: c++03, !has-unix-headers, no-localization -// The ability to set a custom abort message is required to compare the assertion message. -// XFAIL: availability-verbose_abort-missing +// The ability to set a custom abort message is required to compare the assertion message (which only happens in the +// debug mode). +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // Note that GCC doesn't support `-Wno-macro-redefined`. // ADDITIONAL_COMPILE_FLAGS: -U_LIBCPP_HARDENING_MODE -D_LIBCPP_ENABLE_ASSERTIONS=1 diff --git a/libcxx/test/libcxx/assertions/modes/extensive.pass.cpp b/libcxx/test/libcxx/assertions/modes/extensive.pass.cpp index e9e494eae341b..5743f95e472d7 100644 --- a/libcxx/test/libcxx/assertions/modes/extensive.pass.cpp +++ b/libcxx/test/libcxx/assertions/modes/extensive.pass.cpp @@ -14,7 +14,6 @@ // UNSUPPORTED: c++03 // `check_assertion.h` requires Unix headers. 
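For reference, the link-time customization exercised by
customize_verbose_abort.link-time.pass.cpp above boils down to providing your
own definition of the function. A minimal sketch, assuming the declaration from
`<__verbose_abort>` (the function is declared noreturn, so it must terminate):

    #include <__verbose_abort>
    #include <cstdarg>
    #include <cstdio>
    #include <cstdlib>

    void std::__libcpp_verbose_abort(char const* format, ...) {
      std::va_list args;
      va_start(args, format);
      std::vfprintf(stderr, format, args); // forward the formatted message
      va_end(args);
      std::abort();                        // never return to the caller
    }

As the release notes above point out, this override is deprecated and no longer
affects hardening assertions; it only customizes the verbose-abort path itself.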
// REQUIRES: has-unix-headers -// XFAIL: availability-verbose_abort-missing #include #include "check_assertion.h" diff --git a/libcxx/test/libcxx/assertions/modes/fast.pass.cpp b/libcxx/test/libcxx/assertions/modes/fast.pass.cpp index 33377f03fe870..85181859fdad0 100644 --- a/libcxx/test/libcxx/assertions/modes/fast.pass.cpp +++ b/libcxx/test/libcxx/assertions/modes/fast.pass.cpp @@ -14,7 +14,6 @@ // UNSUPPORTED: c++03 // `check_assertion.h` requires Unix headers. // REQUIRES: has-unix-headers -// XFAIL: availability-verbose_abort-missing #include #include "check_assertion.h" diff --git a/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp b/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp index f5323c671c9b1..f78d5d70e5890 100644 --- a/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp +++ b/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp @@ -10,8 +10,9 @@ // `check_assertion.h` is only available starting from C++11 and requires Unix headers and regex support. // UNSUPPORTED: c++03, !has-unix-headers, no-localization -// The ability to set a custom abort message is required to compare the assertion message. -// XFAIL: availability-verbose_abort-missing +// The ability to set a custom abort message is required to compare the assertion message (which only happens in the +// debug mode). +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // ADDITIONAL_COMPILE_FLAGS: -U_LIBCPP_HARDENING_MODE -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_EXTENSIVE #include diff --git a/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp b/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp index 5ee22cc45f3bf..27542ee32bef9 100644 --- a/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp +++ b/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp @@ -10,8 +10,9 @@ // `check_assertion.h` is only available starting from C++11 and requires Unix headers and regex support. // UNSUPPORTED: c++03, !has-unix-headers, no-localization -// The ability to set a custom abort message is required to compare the assertion message. -// XFAIL: availability-verbose_abort-missing +// The ability to set a custom abort message is required to compare the assertion message (which only happens in the +// debug mode). 
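These XFAIL annotations gain a `libcpp-hardening-mode=debug` conjunct because
only the debug mode still routes assertion failures through the verbose-abort
function; the other modes now trap, which `check_assertion.h` can detect in the
parent process. Roughly, the classification after waitpid() could look like the
sketch below (the names are made up for illustration and are not the actual
`check_assertion.h` API):

    #include <csignal>
    #include <sys/wait.h>

    enum class DeathCause { Trap, Abort, Unknown };

    DeathCause classify_death(int status) {
      if (!WIFSIGNALED(status))
        return DeathCause::Unknown;
      int sig = WTERMSIG(status);
      if (sig == SIGILL || sig == SIGTRAP) // __builtin_trap() raises one of
        return DeathCause::Trap;           // these, depending on the target
      if (sig == SIGABRT)                  // std::abort() from verbose abort
        return DeathCause::Abort;
      return DeathCause::Unknown;
    }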
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // ADDITIONAL_COMPILE_FLAGS: -U_LIBCPP_HARDENING_MODE -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST #include diff --git a/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.back.pass.cpp b/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.back.pass.cpp index 7bd399fca7e0b..c20b0c9a804f2 100644 --- a/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.back.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.back.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // test that array::back() triggers an assertion diff --git a/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.front.pass.cpp b/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.front.pass.cpp index fcfb26ff42726..1ea0035ee4cc5 100644 --- a/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.front.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.front.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // test that array::back() triggers an assertion diff --git a/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.subscript.pass.cpp b/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.subscript.pass.cpp index e221e361405ad..ab65ce223f7c5 100644 --- a/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.subscript.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/array/array.zero/assert.subscript.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // test that array::operator[] triggers an assertion diff --git a/libcxx/test/libcxx/containers/sequences/deque/assert.pop_back.empty.pass.cpp b/libcxx/test/libcxx/containers/sequences/deque/assert.pop_back.empty.pass.cpp index e0f102b3cc1f8..19cd41f4cf3a2 100644 --- a/libcxx/test/libcxx/containers/sequences/deque/assert.pop_back.empty.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/deque/assert.pop_back.empty.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/containers/sequences/list/list.modifiers/assert.erase_iter.end.pass.cpp b/libcxx/test/libcxx/containers/sequences/list/list.modifiers/assert.erase_iter.end.pass.cpp index a6471f7389b87..531dd8de399de 100644 --- a/libcxx/test/libcxx/containers/sequences/list/list.modifiers/assert.erase_iter.end.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/list/list.modifiers/assert.erase_iter.end.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git 
a/libcxx/test/libcxx/containers/sequences/list/list.modifiers/assert.pop_back.empty.pass.cpp b/libcxx/test/libcxx/containers/sequences/list/list.modifiers/assert.pop_back.empty.pass.cpp index 04e933430a7b0..d7c6999162270 100644 --- a/libcxx/test/libcxx/containers/sequences/list/list.modifiers/assert.pop_back.empty.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/list/list.modifiers/assert.pop_back.empty.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.back.empty.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.back.empty.pass.cpp index b68b5dece6840..169ad1def9e6f 100644 --- a/libcxx/test/libcxx/containers/sequences/vector/assert.back.empty.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/vector/assert.back.empty.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.cback.empty.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.cback.empty.pass.cpp index 0c1571f67a53c..5ceb4a16b9340 100644 --- a/libcxx/test/libcxx/containers/sequences/vector/assert.cback.empty.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/vector/assert.cback.empty.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.cfront.empty.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.cfront.empty.pass.cpp index 6d0f4eacf44c0..20f94f1d3f0fa 100644 --- a/libcxx/test/libcxx/containers/sequences/vector/assert.cfront.empty.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/vector/assert.cfront.empty.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.cindex.oob.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.cindex.oob.pass.cpp index 47a531fb3cf75..3a9a7add3e30d 100644 --- a/libcxx/test/libcxx/containers/sequences/vector/assert.cindex.oob.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/vector/assert.cindex.oob.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.front.empty.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.front.empty.pass.cpp index 687f8fa8664a6..85364c778ad64 100644 --- a/libcxx/test/libcxx/containers/sequences/vector/assert.front.empty.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/vector/assert.front.empty.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: 
has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.index.oob.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.index.oob.pass.cpp index 1f3cae3479a1b..14cb89625f064 100644 --- a/libcxx/test/libcxx/containers/sequences/vector/assert.index.oob.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/vector/assert.index.oob.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/sequences/vector/assert.pop_back.empty.pass.cpp b/libcxx/test/libcxx/containers/sequences/vector/assert.pop_back.empty.pass.cpp index 4ed3fab6d22bb..ad80be69897ed 100644 --- a/libcxx/test/libcxx/containers/sequences/vector/assert.pop_back.empty.pass.cpp +++ b/libcxx/test/libcxx/containers/sequences/vector/assert.pop_back.empty.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/containers/unord/unord.map/assert.bucket.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/assert.bucket.pass.cpp index 51718c013ba56..8d23673dfc50f 100644 --- a/libcxx/test/libcxx/containers/unord/unord.map/assert.bucket.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.map/assert.bucket.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/unord/unord.map/assert.bucket_size.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/assert.bucket_size.pass.cpp index de6aa442c6b3b..0a0bb888182e9 100644 --- a/libcxx/test/libcxx/containers/unord/unord.map/assert.bucket_size.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.map/assert.bucket_size.pass.cpp @@ -17,7 +17,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/unord/unord.map/assert.max_load_factor.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.map/assert.max_load_factor.pass.cpp index 7b6a83559f9ee..452a3ae8c3818 100644 --- a/libcxx/test/libcxx/containers/unord/unord.map/assert.max_load_factor.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.map/assert.max_load_factor.pass.cpp @@ -18,7 +18,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.bucket.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.bucket.pass.cpp index 2b2859c1e1940..ec3892f444cd6 
100644 --- a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.bucket.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.bucket.pass.cpp @@ -17,7 +17,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.bucket_size.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.bucket_size.pass.cpp index 6100ef37a33b1..ea5904b94019e 100644 --- a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.bucket_size.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.bucket_size.pass.cpp @@ -17,7 +17,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.max_load_factor.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.max_load_factor.pass.cpp index 8c80821b823d4..65f7c91d293d6 100644 --- a/libcxx/test/libcxx/containers/unord/unord.multimap/assert.max_load_factor.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.multimap/assert.max_load_factor.pass.cpp @@ -18,7 +18,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.bucket.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.bucket.pass.cpp index 40c68fd6c0c09..49fdb723b4230 100644 --- a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.bucket.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.bucket.pass.cpp @@ -17,7 +17,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.bucket_size.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.bucket_size.pass.cpp index 8599b53ae30c8..5709fd1e30d9a 100644 --- a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.bucket_size.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.bucket_size.pass.cpp @@ -17,7 +17,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.max_load_factor.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.max_load_factor.pass.cpp index 0dedebed4acfc..c54bb833e7240 100644 --- a/libcxx/test/libcxx/containers/unord/unord.multiset/assert.max_load_factor.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.multiset/assert.max_load_factor.pass.cpp @@ -18,7 +18,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: 
availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/containers/unord/unord.set/assert.bucket.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/assert.bucket.pass.cpp index d5a3cd9a5c1b3..c0bfb00efc68a 100644 --- a/libcxx/test/libcxx/containers/unord/unord.set/assert.bucket.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.set/assert.bucket.pass.cpp @@ -17,7 +17,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/containers/unord/unord.set/assert.bucket_size.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/assert.bucket_size.pass.cpp index f8f5feea48aab..3e25b448cb872 100644 --- a/libcxx/test/libcxx/containers/unord/unord.set/assert.bucket_size.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.set/assert.bucket_size.pass.cpp @@ -17,7 +17,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/containers/unord/unord.set/assert.max_load_factor.pass.cpp b/libcxx/test/libcxx/containers/unord/unord.set/assert.max_load_factor.pass.cpp index 002e5f3962146..ec1f1f3bd5ce1 100644 --- a/libcxx/test/libcxx/containers/unord/unord.set/assert.max_load_factor.pass.cpp +++ b/libcxx/test/libcxx/containers/unord/unord.set/assert.max_load_factor.pass.cpp @@ -18,7 +18,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/containers/views/mdspan/extents/assert.conversion.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/extents/assert.conversion.pass.cpp index e8cde6bf8856c..03ed1f78b778b 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/extents/assert.conversion.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/extents/assert.conversion.pass.cpp @@ -7,7 +7,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_array.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_array.pass.cpp index 886375e54ca3c..906a6ec87bd44 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_array.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_array.pass.cpp @@ -7,7 +7,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_integral.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_integral.pass.cpp index e03bb28ba1cfb..af484a300ea8b 100644 --- 
a/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_integral.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_integral.pass.cpp @@ -7,7 +7,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_span.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_span.pass.cpp index d2e1910b19462..f09515cf1142d 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_span.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/extents/assert.ctor_from_span.pass.cpp @@ -7,7 +7,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // Test construction from span: // diff --git a/libcxx/test/libcxx/containers/views/mdspan/extents/assert.obs.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/extents/assert.obs.pass.cpp index 44e6d48e9e178..c473879d87b71 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/extents/assert.obs.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/extents/assert.obs.pass.cpp @@ -7,7 +7,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.conversion.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.conversion.pass.cpp index 94d49f13e9b8a..2cdfa2667cfd4 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.conversion.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.conversion.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.extents.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.extents.pass.cpp index e73a167a51be3..6ac7411f1df01 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.extents.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.extents.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.layout_right.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.layout_right.pass.cpp index d11cf9c2362dd..ceb3242beb9c2 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.layout_right.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.layout_right.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: 
has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.layout_stride.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.layout_stride.pass.cpp index 60c08a86d2003..cc9358c853db4 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.layout_stride.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.ctor.layout_stride.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // ADDITIONAL_COMPILE_FLAGS: -Wno-ctad-maybe-unsupported // FIXME: https://github.com/llvm/llvm-project/issues/64719 diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.index_operator.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.index_operator.pass.cpp index c514a532a1a0d..79e424fbb52cb 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.index_operator.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.index_operator.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.stride.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.stride.pass.cpp index e54cf8678ea71..61a5149c6881a 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.stride.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_left/assert.stride.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.conversion.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.conversion.pass.cpp index f3ee1a63be368..4695ea834692e 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.conversion.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.conversion.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.extents.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.extents.pass.cpp index 824c0b0f5793b..797a8e3845c2f 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.extents.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.extents.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // 
UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.layout_left.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.layout_left.pass.cpp index 29903595a2a7f..e4044a94b7f68 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.layout_left.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.layout_left.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.layout_stride.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.layout_stride.pass.cpp index a14459823ff6a..18d974333a580 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.layout_stride.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.ctor.layout_stride.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // ADDITIONAL_COMPILE_FLAGS: -Wno-ctad-maybe-unsupported // FIXME: https://github.com/llvm/llvm-project/issues/64719 diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.index_operator.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.index_operator.pass.cpp index 756234f157f2c..7fae6f87caf7c 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.index_operator.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.index_operator.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.stride.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.stride.pass.cpp index 9cc5ad72f9349..8e3049c4736f0 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.stride.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_right/assert.stride.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.conversion.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.conversion.pass.cpp index c6d3e4fe97f61..81f6321ef519c 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.conversion.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.conversion.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: 
libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // XFAIL: target=powerpc{{.*}}le-unknown-linux-gnu // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_array.non_unique.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_array.non_unique.pass.cpp index 34b333ec3d145..97a6d56e4f839 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_array.non_unique.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_array.non_unique.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: !libcpp-hardening-mode=debug -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_array.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_array.pass.cpp index eab1c67ed6979..31b38bd8079ce 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_array.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_array.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_span.non_unique.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_span.non_unique.pass.cpp index 9e1de537682d8..fd0701e9ee3a7 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_span.non_unique.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_span.non_unique.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: !libcpp-hardening-mode=debug -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_span.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_span.pass.cpp index 3b7338449a7f6..4cd0d8ff896ab 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_span.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.ctor.extents_span.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.index_operator.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.index_operator.pass.cpp index f507b14ca45eb..b5244a60af0f7 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.index_operator.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.index_operator.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: 
has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: !libcpp-hardening-mode=debug -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.stride.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.stride.pass.cpp index 5ae141389da4e..ee2b731da203b 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.stride.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/layout_stride/assert.stride.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.conversion.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.conversion.pass.cpp index 860bf19af45ca..53aec7bb714ea 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.conversion.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.conversion.pass.cpp @@ -7,7 +7,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.index_operator.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.index_operator.pass.cpp index 174bddb56d53c..9dd957986f14d 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.index_operator.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.index_operator.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.size.pass.cpp b/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.size.pass.cpp index ee3114e228a0d..5682de61fbb32 100644 --- a/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.size.pass.cpp +++ b/libcxx/test/libcxx/containers/views/mdspan/mdspan/assert.size.pass.cpp @@ -7,7 +7,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.iter_sent.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.iter_sent.pass.cpp index c268ae51e79ac..fa2a98e33b8f0 100644 --- a/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.iter_sent.pass.cpp +++ b/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.iter_sent.pass.cpp @@ -20,7 +20,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.iter_size.pass.cpp 
b/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.iter_size.pass.cpp index 3c800e9f2fdf9..4461bad8ff504 100644 --- a/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.iter_size.pass.cpp +++ b/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.iter_size.pass.cpp @@ -17,7 +17,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.other_span.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.other_span.pass.cpp index f1b7d2183625b..c288c2acd6b59 100644 --- a/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.other_span.pass.cpp +++ b/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.other_span.pass.cpp @@ -15,7 +15,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.range.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.range.pass.cpp index 18ed7ce213b90..1f9af3e03009a 100644 --- a/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.range.pass.cpp +++ b/libcxx/test/libcxx/containers/views/views.span/span.cons/assert.range.pass.cpp @@ -15,7 +15,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.back.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.back.pass.cpp index d15b024149775..ea98fe81ee2f8 100644 --- a/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.back.pass.cpp +++ b/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.back.pass.cpp @@ -15,7 +15,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.front.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.front.pass.cpp index 588123af6499b..2660ca1f90c14 100644 --- a/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.front.pass.cpp +++ b/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.front.pass.cpp @@ -15,7 +15,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.op_idx.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.op_idx.pass.cpp index 8a74739c79ddf..e7d7958844181 100644 --- a/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.op_idx.pass.cpp +++ b/libcxx/test/libcxx/containers/views/views.span/span.elem/assert.op_idx.pass.cpp @@ -15,7 +15,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// 
XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.first.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.first.pass.cpp index 01d5658291f1d..eb6b442d64849 100644 --- a/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.first.pass.cpp +++ b/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.first.pass.cpp @@ -15,7 +15,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.last.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.last.pass.cpp index 4e6587443c6a7..141735db02a60 100644 --- a/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.last.pass.cpp +++ b/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.last.pass.cpp @@ -15,7 +15,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.subspan.pass.cpp b/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.subspan.pass.cpp index 52e4fe9ecc5c7..7ed228e96a483 100644 --- a/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.subspan.pass.cpp +++ b/libcxx/test/libcxx/containers/views/views.span/span.sub/assert.subspan.pass.cpp @@ -23,7 +23,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/input.output/filesystems/class.path/path.itr/assert.iterator.pass.cpp b/libcxx/test/libcxx/input.output/filesystems/class.path/path.itr/assert.iterator.pass.cpp index 6c138752b5054..38047957de8e5 100644 --- a/libcxx/test/libcxx/input.output/filesystems/class.path/path.itr/assert.iterator.pass.cpp +++ b/libcxx/test/libcxx/input.output/filesystems/class.path/path.itr/assert.iterator.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: !libcpp-hardening-mode=debug -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/iterators/assert.advance.pass.cpp b/libcxx/test/libcxx/iterators/assert.advance.pass.cpp index b7e6962b443ef..e9d2f27008260 100644 --- a/libcxx/test/libcxx/iterators/assert.advance.pass.cpp +++ b/libcxx/test/libcxx/iterators/assert.advance.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/iterators/assert.next.pass.cpp b/libcxx/test/libcxx/iterators/assert.next.pass.cpp index da4f4ea3150c3..242a0c6f0f7ce 100644 --- a/libcxx/test/libcxx/iterators/assert.next.pass.cpp +++ b/libcxx/test/libcxx/iterators/assert.next.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: 
availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/iterators/assert.prev.pass.cpp b/libcxx/test/libcxx/iterators/assert.prev.pass.cpp index 6a71cad9ef048..a5a04f1bbeb6b 100644 --- a/libcxx/test/libcxx/iterators/assert.prev.pass.cpp +++ b/libcxx/test/libcxx/iterators/assert.prev.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp b/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp index 095e8a8fef6ef..8eee4ad2f319a 100644 --- a/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp +++ b/libcxx/test/libcxx/iterators/bounded_iter/dereference.pass.cpp @@ -14,7 +14,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp index f803b2cad75be..2fafe4727185d 100644 --- a/libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp +++ b/libcxx/test/libcxx/iterators/predef.iterators/counted.iterator/assert.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp b/libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp index ea4574fc1a9cc..01c0fb4048320 100644 --- a/libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp +++ b/libcxx/test/libcxx/iterators/predef.iterators/iterators.common/assert.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.begin.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.begin.pass.cpp index 0ba4745be0826..57af366317091 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.begin.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.begin.pass.cpp @@ -10,7 +10,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: no-exceptions // UNSUPPORTED: !libcpp-hardening-mode=debug -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.find-next.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.find-next.pass.cpp index 9e5536bf51be2..1c91ee7198952 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.find-next.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.find-next.pass.cpp @@ -10,7 
+10,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: no-exceptions // UNSUPPORTED: !libcpp-hardening-mode=debug -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.find-prev.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.find-prev.pass.cpp index 4f746cfec2f1a..2605bf6dde074 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.find-prev.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/assert.find-prev.pass.cpp @@ -10,7 +10,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: no-exceptions // UNSUPPORTED: !libcpp-hardening-mode=debug -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/assert.deref.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/assert.deref.pass.cpp index 80c08efc56c05..8ed84ca8b56a1 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/assert.deref.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/assert.deref.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: !libcpp-hardening-mode=debug -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/assert.increment.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/assert.increment.pass.cpp index 7381ad2886293..1a804b71b5e5e 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/assert.increment.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.chunk.by/range.chunk.by.iter/assert.increment.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: !libcpp-hardening-mode=debug -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.drop.while/assert.begin.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.drop.while/assert.begin.pass.cpp index a2b11fb54ed9d..205cf40746207 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.drop.while/assert.begin.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.drop.while/assert.begin.pass.cpp @@ -14,7 +14,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // UNSUPPORTED: no-exceptions // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/assert.equal.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/assert.equal.pass.cpp index 86ee2841f249e..22ede4143ffa4 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/assert.equal.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/range.lazy.split.inner/assert.equal.pass.cpp @@ -9,7 
+9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/range.lazy.split.outer/assert.equal.pass.cpp b/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/range.lazy.split.outer/assert.equal.pass.cpp index d4a3bbc513c49..b6cbf5241f744 100644 --- a/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/range.lazy.split.outer/assert.equal.pass.cpp +++ b/libcxx/test/libcxx/ranges/range.adaptors/range.lazy.split/range.lazy.split.outer/assert.equal.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/strings/basic.string/string.access/assert.back.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.access/assert.back.pass.cpp index 42beb28c47b13..36a485a1e4d00 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.access/assert.back.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.access/assert.back.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/strings/basic.string/string.access/assert.cback.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.access/assert.cback.pass.cpp index 7a3d3e1661636..d810acd67e7e7 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.access/assert.cback.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.access/assert.cback.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/strings/basic.string/string.access/assert.cfront.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.access/assert.cfront.pass.cpp index 66432dc099b5f..12e7ef3328b04 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.access/assert.cfront.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.access/assert.cfront.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/strings/basic.string/string.access/assert.cindex.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.access/assert.cindex.pass.cpp index 41bb48b6f574c..3983352712963 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.access/assert.cindex.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.access/assert.cindex.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git 
a/libcxx/test/libcxx/strings/basic.string/string.access/assert.front.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.access/assert.front.pass.cpp index 1033e17961b24..24df3fcad0c5c 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.access/assert.front.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.access/assert.front.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/strings/basic.string/string.access/assert.index.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.access/assert.index.pass.cpp index 776bba32da184..d26997d8d24c2 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.access/assert.index.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.access/assert.index.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include #include diff --git a/libcxx/test/libcxx/strings/basic.string/string.modifiers/assert.erase_iter.null.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.modifiers/assert.erase_iter.null.pass.cpp index 0b4cb9db51c94..036e75965c488 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.modifiers/assert.erase_iter.null.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.modifiers/assert.erase_iter.null.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/strings/basic.string/string.modifiers/assert.pop_back.pass.cpp b/libcxx/test/libcxx/strings/basic.string/string.modifiers/assert.pop_back.pass.cpp index 2bab67bdc3edc..54c011c4d54a0 100644 --- a/libcxx/test/libcxx/strings/basic.string/string.modifiers/assert.pop_back.pass.cpp +++ b/libcxx/test/libcxx/strings/basic.string/string.modifiers/assert.pop_back.pass.cpp @@ -13,7 +13,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/strings/string.view/assert.ctor.length.pass.cpp b/libcxx/test/libcxx/strings/string.view/assert.ctor.length.pass.cpp index addcae5ec1863..af8b393f9e0a7 100644 --- a/libcxx/test/libcxx/strings/string.view/assert.ctor.length.pass.cpp +++ b/libcxx/test/libcxx/strings/string.view/assert.ctor.length.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // Construct a string_view from an invalid length // constexpr basic_string_view( const _CharT* s, size_type len ) diff --git a/libcxx/test/libcxx/strings/string.view/assert.ctor.pointer.pass.cpp b/libcxx/test/libcxx/strings/string.view/assert.ctor.pointer.pass.cpp index b38e0585d0312..1810ec1ca8ac9 100644 --- a/libcxx/test/libcxx/strings/string.view/assert.ctor.pointer.pass.cpp +++ 
b/libcxx/test/libcxx/strings/string.view/assert.ctor.pointer.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // Construct a string_view from a null pointer // constexpr basic_string_view( const CharT* s ); diff --git a/libcxx/test/libcxx/thread/futures/futures.promise/assert.set_exception.pass.cpp b/libcxx/test/libcxx/thread/futures/futures.promise/assert.set_exception.pass.cpp index fe7b3616b5a76..6d5eb5ef9931f 100644 --- a/libcxx/test/libcxx/thread/futures/futures.promise/assert.set_exception.pass.cpp +++ b/libcxx/test/libcxx/thread/futures/futures.promise/assert.set_exception.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, no-threads // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/thread/futures/futures.promise/assert.set_exception_at_thread_exit.pass.cpp b/libcxx/test/libcxx/thread/futures/futures.promise/assert.set_exception_at_thread_exit.pass.cpp index f28db8caba949..1bffde5e3ebd1 100644 --- a/libcxx/test/libcxx/thread/futures/futures.promise/assert.set_exception_at_thread_exit.pass.cpp +++ b/libcxx/test/libcxx/thread/futures/futures.promise/assert.set_exception_at_thread_exit.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, no-threads // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // diff --git a/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp b/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp index 58bc4b844eb59..419a603a037f8 100644 --- a/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.barrier/assert.arrive.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // REQUIRES: has-unix-headers diff --git a/libcxx/test/libcxx/thread/thread.barrier/assert.ctor.pass.cpp b/libcxx/test/libcxx/thread/thread.barrier/assert.ctor.pass.cpp index 3af43bdbe7b13..0b4fb1d675eaa 100644 --- a/libcxx/test/libcxx/thread/thread.barrier/assert.ctor.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.barrier/assert.ctor.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // REQUIRES: has-unix-headers diff --git a/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp b/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp index 442ae0d962631..e61679554a62e 100644 --- a/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.latch/assert.arrive_and_wait.pass.cpp @@ -18,7 +18,7 @@ // REQUIRES: has-unix-headers // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include 
diff --git a/libcxx/test/libcxx/thread/thread.latch/assert.count_down.pass.cpp b/libcxx/test/libcxx/thread/thread.latch/assert.count_down.pass.cpp index 1547dd6390d6c..6220cba02af19 100644 --- a/libcxx/test/libcxx/thread/thread.latch/assert.count_down.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.latch/assert.count_down.pass.cpp @@ -19,7 +19,7 @@ // REQUIRES: has-unix-headers // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/thread/thread.latch/assert.ctor.pass.cpp b/libcxx/test/libcxx/thread/thread.latch/assert.ctor.pass.cpp index b20911e0c3b9f..5f1ea19d82a50 100644 --- a/libcxx/test/libcxx/thread/thread.latch/assert.ctor.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.latch/assert.ctor.pass.cpp @@ -18,7 +18,7 @@ // REQUIRES: has-unix-headers // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/thread/thread.semaphore/assert.ctor.pass.cpp b/libcxx/test/libcxx/thread/thread.semaphore/assert.ctor.pass.cpp index 91b1e35429088..1e33add779496 100644 --- a/libcxx/test/libcxx/thread/thread.semaphore/assert.ctor.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.semaphore/assert.ctor.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // REQUIRES: has-unix-headers diff --git a/libcxx/test/libcxx/thread/thread.semaphore/assert.release.pass.cpp b/libcxx/test/libcxx/thread/thread.semaphore/assert.release.pass.cpp index 09b4de3665359..a5a01a3847878 100644 --- a/libcxx/test/libcxx/thread/thread.semaphore/assert.release.pass.cpp +++ b/libcxx/test/libcxx/thread/thread.semaphore/assert.release.pass.cpp @@ -9,7 +9,7 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17 // REQUIRES: libcpp-hardening-mode={{extensive|debug}} -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // REQUIRES: has-unix-headers diff --git a/libcxx/test/libcxx/utilities/assert.exception_guard.no_exceptions.pass.cpp b/libcxx/test/libcxx/utilities/assert.exception_guard.no_exceptions.pass.cpp index cf6b7bef3ebce..c6ddb8bd252a4 100644 --- a/libcxx/test/libcxx/utilities/assert.exception_guard.no_exceptions.pass.cpp +++ b/libcxx/test/libcxx/utilities/assert.exception_guard.no_exceptions.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03 // REQUIRES: libcpp-hardening-mode=debug -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // ADDITIONAL_COMPILE_FLAGS: -fno-exceptions #include <__utility/exception_guard.h> diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/assert.arrow.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/assert.arrow.pass.cpp index 27dcdfe951230..47481bcbef8a8 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.expected/assert.arrow.pass.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.expected/assert.arrow.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: 
availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // constexpr const T* operator->() const noexcept; // constexpr T* operator->() noexcept; diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/assert.deref.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/assert.deref.pass.cpp index 01078ac27e96a..5ab43d38ccb15 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.expected/assert.deref.pass.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.expected/assert.deref.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // constexpr const T& operator*() const & noexcept; // constexpr T& operator*() & noexcept; diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/assert.error.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/assert.error.pass.cpp index ff574b350cc9e..92bf305994c18 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.expected/assert.error.pass.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.expected/assert.error.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // constexpr const E& error() const & noexcept; // constexpr E& error() & noexcept; diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/assert.deref.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/assert.deref.pass.cpp index e5d7ac3eae698..6f1ba075b3245 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.void/assert.deref.pass.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.void/assert.deref.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // constexpr void operator*() const noexcept; // diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/assert.error.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/assert.error.pass.cpp index 7bf4844b29b01..a1c92ff85f33a 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.void/assert.error.pass.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.void/assert.error.pass.cpp @@ -9,7 +9,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // constexpr const E& error() const & noexcept; // constexpr E& error() & noexcept; diff --git a/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/assert.dereference.pass.cpp b/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/assert.dereference.pass.cpp index 2ba5f299bc34e..31938b3f8fbaa 100644 --- a/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/assert.dereference.pass.cpp +++ b/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/assert.dereference.pass.cpp @@ -16,7 
+16,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/assert.op_arrow.pass.cpp b/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/assert.op_arrow.pass.cpp index f93569979d188..52009628327db 100644 --- a/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/assert.op_arrow.pass.cpp +++ b/libcxx/test/libcxx/utilities/optional/optional.object/optional.object.observe/assert.op_arrow.pass.cpp @@ -14,7 +14,7 @@ // REQUIRES: has-unix-headers // UNSUPPORTED: c++03, c++11, c++14 // UNSUPPORTED: libcpp-hardening-mode=none -// XFAIL: availability-verbose_abort-missing +// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing #include diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h index 83a46548fa925..485f8103c7ad8 100644 --- a/libcxx/test/support/check_assertion.h +++ b/libcxx/test/support/check_assertion.h @@ -15,7 +15,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -27,118 +29,206 @@ #include "test_allocator.h" #if TEST_STD_VER < 11 -# error "C++11 or greater is required to use this header" +# error "C++11 or greater is required to use this header" #endif -struct AssertionInfoMatcher { - static const int any_line = -1; - static constexpr const char* any_file = "*"; - static constexpr const char* any_msg = "*"; +// When printing the assertion message to `stderr`, delimit it with a marker to make it easier to match the message +// later. +static constexpr const char* Marker = "###"; - constexpr AssertionInfoMatcher() : is_empty_(true), msg_(any_msg, __builtin_strlen(any_msg)), file_(any_file, __builtin_strlen(any_file)), line_(any_line) { } - constexpr AssertionInfoMatcher(const char* msg, const char* file = any_file, int line = any_line) - : is_empty_(false), msg_(msg, __builtin_strlen(msg)), file_(file, __builtin_strlen(file)), line_(line) {} +// (success, error-message-if-failed) +using MatchResult = std::pair; +using Matcher = std::function; - bool Matches(char const* file, int line, char const* message) const { - assert(!empty() && "empty matcher"); +MatchResult MatchAssertionMessage(const std::string& text, std::string_view expected_message) { + // Extract information from the error message. This has to stay synchronized with how we format assertions in the + // library. + std::regex assertion_format(".*###\\n(.*):(\\d+): assertion (.*) failed: (.*)\\n###"); - if (CheckLineMatches(line) && CheckFileMatches(file) && CheckMessageMatches(message)) - return true; - // Write to stdout because that's the file descriptor captured by the parent - // process. 
- std::printf("Failed to match assertion info!\n%s\nVS\n%s:%d (%s)\n", ToString().data(), file, line, message); - return false; + std::smatch match_result; + bool has_match = std::regex_match(text, match_result, assertion_format); + assert(has_match); + assert(match_result.size() == 5); + + const std::string& file = match_result[1]; + int line = std::stoi(match_result[2]); + // Omitting `expression` in `match_result[3]` + const std::string& assertion_message = match_result[4]; + + bool result = assertion_message == expected_message; + if (!result) { + std::stringstream matching_error; + matching_error // + << "Expected message: '" << expected_message.data() << "'\n" // + << "Actual message: '" << assertion_message.c_str() << "'\n" // + << "Source location: " << file << ":" << std::to_string(line) << "\n"; + return MatchResult(/*success=*/false, matching_error.str()); } - std::string ToString() const { - std::string result = "msg = \""; result += msg_; result += "\"\n"; - result += "line = " + (line_ == any_line ? "'*'" : std::to_string(line_)) + "\n"; - result += "file = " + (file_ == any_file ? "'*'" : std::string(file_)); - return result; + return MatchResult(/*success=*/true, /*maybe_error=*/""); +} + +Matcher MakeAssertionMessageMatcher(std::string_view assertion_message) { + return [=](const std::string& text) { // + return MatchAssertionMessage(text, assertion_message); + }; +} + +Matcher MakeAnyMatcher() { + return [](const std::string&) { // + return MatchResult(/*success=*/true, /*maybe_error=*/""); + }; +} + +enum class DeathCause { + // Valid causes + VerboseAbort = 1, + StdTerminate, + Trap, + // Invalid causes + DidNotDie, + SetupFailure, + Unknown +}; + +bool IsValidCause(DeathCause cause) { + switch (cause) { + case DeathCause::VerboseAbort: + case DeathCause::StdTerminate: + case DeathCause::Trap: + return true; + default: + return false; } +} - bool empty() const { return is_empty_; } -private: - bool CheckLineMatches(int got_line) const { - if (line_ == any_line) - return true; - return got_line == line_; +std::string ToString(DeathCause cause) { + switch (cause) { + case DeathCause::VerboseAbort: + return "verbose abort"; + case DeathCause::StdTerminate: + return "`std::terminate`"; + case DeathCause::Trap: + return "trap"; + case DeathCause::DidNotDie: + return ""; + case DeathCause::SetupFailure: + return ""; + case DeathCause::Unknown: + return ""; } - bool CheckFileMatches(std::string_view got_file) const { - assert(!empty() && "empty matcher"); - if (file_ == any_file) - return true; - std::size_t found_at = got_file.find(file_); - if (found_at == std::string_view::npos) - return false; - // require the match start at the beginning of the file or immediately after - // a directory separator. - if (found_at != 0) { - char last_char = got_file[found_at - 1]; - if (last_char != '/' && last_char != '\\') - return false; - } - // require the match goes until the end of the string. 
- return got_file.substr(found_at) == file_; + assert(false && "Unreachable"); +} + +TEST_NORETURN void StopChildProcess(DeathCause cause) { std::exit(static_cast(cause)); } + +DeathCause ConvertToDeathCause(int val) { + if (val < static_cast(DeathCause::VerboseAbort) || val > static_cast(DeathCause::Unknown)) { + return DeathCause::Unknown; } + return static_cast(val); +} + +enum class Outcome { + Success, + UnexpectedCause, + UnexpectedErrorMessage, + InvalidCause, +}; - bool CheckMessageMatches(std::string_view got_msg) const { - assert(!empty() && "empty matcher"); - if (msg_ == any_msg) - return true; - std::size_t found_at = got_msg.find(msg_); - if (found_at == std::string_view::npos) - return false; - return found_at == 0 && got_msg.size() == msg_.size(); +std::string ToString(Outcome outcome) { + switch (outcome) { + case Outcome::Success: + return "success"; + case Outcome::UnexpectedCause: + return "unexpected death cause"; + case Outcome::UnexpectedErrorMessage: + return "unexpected error message"; + case Outcome::InvalidCause: + return "invalid death cause"; } + + assert(false && "Unreachable"); +} + +class DeathTestResult { +public: + DeathTestResult() = default; + DeathTestResult(Outcome set_outcome, DeathCause set_cause, const std::string& set_failure_description = "") + : outcome_(set_outcome), cause_(set_cause), failure_description_(set_failure_description) {} + + bool success() const { return outcome() == Outcome::Success; } + Outcome outcome() const { return outcome_; } + DeathCause cause() const { return cause_; } + const std::string& failure_description() const { return failure_description_; } + private: - bool is_empty_; - std::string_view msg_; - std::string_view file_; - int line_; + Outcome outcome_ = Outcome::Success; + DeathCause cause_ = DeathCause::Unknown; + std::string failure_description_; }; -static constexpr AssertionInfoMatcher AnyMatcher(AssertionInfoMatcher::any_msg); +class DeathTest { +public: + DeathTest() = default; + DeathTest(DeathTest const&) = delete; + DeathTest& operator=(DeathTest const&) = delete; -inline AssertionInfoMatcher& GlobalMatcher() { - static AssertionInfoMatcher GMatch; - return GMatch; -} + template + DeathTestResult Run(DeathCause expected_cause, Func&& func, const Matcher& matcher) { + std::set_terminate([] { StopChildProcess(DeathCause::StdTerminate); }); -struct DeathTest { - enum ResultKind { - RK_DidNotDie, RK_MatchFound, RK_MatchFailure, RK_Terminate, RK_SetupFailure, RK_Unknown - }; + DeathCause cause = Run(func); - static const char* ResultKindToString(ResultKind RK) { -#define CASE(K) case K: return #K - switch (RK) { - CASE(RK_MatchFailure); - CASE(RK_DidNotDie); - CASE(RK_SetupFailure); - CASE(RK_MatchFound); - CASE(RK_Unknown); - CASE(RK_Terminate); + if (!IsValidCause(cause)) { + return DeathTestResult(Outcome::InvalidCause, cause, ToString(cause)); } - return "not a result kind"; - } - static bool IsValidResultKind(int val) { - return val >= RK_DidNotDie && val <= RK_Unknown; + if (expected_cause != cause) { + std::stringstream failure_description; + failure_description // + << "Child died, but with a different death cause\n" // + << "Expected cause: " << ToString(expected_cause) << "\n" // + << "Actual cause: " << ToString(cause) << "\n"; + return DeathTestResult(Outcome::UnexpectedCause, cause, failure_description.str()); + } + + MatchResult match_result = matcher(GetChildStdErr()); + if (!match_result.first) { + auto failure_description = std::string("Child died, but with a different error message\n") + 
match_result.second; + return DeathTestResult(Outcome::UnexpectedErrorMessage, cause, failure_description); + } + + return DeathTestResult(Outcome::Success, cause); } - DeathTest(AssertionInfoMatcher const& Matcher) : matcher_(Matcher) {} + void PrintFailureDetails(std::string_view failure_description, std::string_view stmt, DeathCause cause) const { + std::fprintf( + stderr, "Failure: EXPECT_DEATH( %s ) failed!\n(reason: %s)\n\n", stmt.data(), failure_description.data()); + + if (cause != DeathCause::Unknown) { + std::fprintf(stderr, "child exit code: %d\n", GetChildExitCode()); + } + std::fprintf(stderr, "---------- standard err ----------\n%s", GetChildStdErr().c_str()); + std::fprintf(stderr, "\n----------------------------------\n"); + std::fprintf(stderr, "---------- standard out ----------\n%s", GetChildStdOut().c_str()); + std::fprintf(stderr, "\n----------------------------------\n"); + }; + +private: + int GetChildExitCode() const { return exit_code_; } + std::string const& GetChildStdOut() const { return stdout_from_child_; } + std::string const& GetChildStdErr() const { return stderr_from_child_; } template <class Func> - ResultKind Run(Func&& f) { + DeathCause Run(Func&& f) { int pipe_res = pipe(stdout_pipe_fd_); assert(pipe_res != -1 && "failed to create pipe"); pipe_res = pipe(stderr_pipe_fd_); assert(pipe_res != -1 && "failed to create pipe"); pid_t child_pid = fork(); - assert(child_pid != -1 && - "failed to fork a process to perform a death test"); + assert(child_pid != -1 && "failed to fork a process to perform a death test"); child_pid_ = child_pid; if (child_pid_ == 0) { RunForChild(std::forward<Func>(f)); @@ -147,10 +237,6 @@ struct DeathTest { return RunForParent(); } - int getChildExitCode() const { return exit_code_; } - std::string const& getChildStdOut() const { return stdout_from_child_; } - std::string const& getChildStdErr() const { return stderr_from_child_; } -private: template <class Func> TEST_NORETURN void RunForChild(Func&& f) { close(GetStdOutReadFD()); // don't need to read from the pipe in the child. @@ -158,14 +244,13 @@ struct DeathTest { auto DupFD = [](int DestFD, int TargetFD) { int dup_result = dup2(DestFD, TargetFD); if (dup_result == -1) - std::exit(RK_SetupFailure); + StopChildProcess(DeathCause::SetupFailure); }; DupFD(GetStdOutWriteFD(), STDOUT_FILENO); DupFD(GetStdErrWriteFD(), STDERR_FILENO); - GlobalMatcher() = matcher_; f(); - std::exit(RK_DidNotDie); + StopChildProcess(DeathCause::DidNotDie); } static std::string ReadChildIOUntilEnd(int FD) { @@ -190,7 +275,7 @@ struct DeathTest { close(GetStdErrReadFD()); } - ResultKind RunForParent() { + DeathCause RunForParent() { CaptureIOFromChild(); int status_value; @@ -199,35 +284,27 @@ struct DeathTest { if (WIFEXITED(status_value)) { exit_code_ = WEXITSTATUS(status_value); - if (!IsValidResultKind(exit_code_)) - return RK_Unknown; - return static_cast<ResultKind>(exit_code_); + return ConvertToDeathCause(exit_code_); } - return RK_Unknown; - } - DeathTest(DeathTest const&) = delete; - DeathTest& operator=(DeathTest const&) = delete; + if (WIFSIGNALED(status_value)) { + exit_code_ = WTERMSIG(status_value); + // `__builtin_trap` generates `SIGILL` on x86 and `SIGTRAP` on ARM.
+ if (exit_code_ == SIGILL || exit_code_ == SIGTRAP) { + return DeathCause::Trap; + } + } - int GetStdOutReadFD() const { - return stdout_pipe_fd_[0]; + return DeathCause::Unknown; } - int GetStdOutWriteFD() const { - return stdout_pipe_fd_[1]; - } + int GetStdOutReadFD() const { return stdout_pipe_fd_[0]; } + int GetStdOutWriteFD() const { return stdout_pipe_fd_[1]; } + int GetStdErrReadFD() const { return stderr_pipe_fd_[0]; } + int GetStdErrWriteFD() const { return stderr_pipe_fd_[1]; } - int GetStdErrReadFD() const { - return stderr_pipe_fd_[0]; - } - - int GetStdErrWriteFD() const { - return stderr_pipe_fd_[1]; - } -private: - AssertionInfoMatcher matcher_; pid_t child_pid_ = -1; - int exit_code_ = -1; + int exit_code_ = -1; int stdout_pipe_fd_[2]; int stderr_pipe_fd_[2]; std::string stdout_from_child_; @@ -235,82 +312,56 @@ struct DeathTest { }; #ifdef _LIBCPP_VERSION -void std::__libcpp_verbose_abort(char const* printf_format, ...) { - // Extract information from the error message. This has to stay synchronized with how we format assertions in the - // library. +void std::__libcpp_verbose_abort(char const* format, ...) { va_list args; - va_start(args, printf_format); - char const* message = va_arg(args, char const*); + va_start(args, format); - std::regex message_format("(.*):(\\d+): assertion (.*) failed: (.*)\\n"); + std::fprintf(stderr, "%s\n", Marker); + std::vfprintf(stderr, format, args); + std::fprintf(stderr, "%s", Marker); - std::cmatch match_result; - bool has_match = std::regex_match(message, match_result, message_format); - assert(has_match); - assert(match_result.size() == 5); - - std::string file = match_result[1]; - int line = std::stoi(match_result[2]); - // Omitting `expression` in `match_result[3]` - std::string failure_reason = match_result[4]; + va_end(args); - if (GlobalMatcher().Matches(file.c_str(), line, failure_reason.c_str())) { - std::exit(DeathTest::RK_MatchFound); - } - std::exit(DeathTest::RK_MatchFailure); + StopChildProcess(DeathCause::VerboseAbort); } #endif // _LIBCPP_VERSION -[[noreturn]] inline void terminate_handler() { - std::exit(DeathTest::RK_Terminate); -} - template -inline bool ExpectDeath(const char* stmt, Func&& func, AssertionInfoMatcher Matcher) { - std::set_terminate(terminate_handler); - DeathTest DT(Matcher); - DeathTest::ResultKind RK = DT.Run(func); - auto OnFailure = [&](const char* msg) { - std::fprintf(stderr, "EXPECT_DEATH( %s ) failed! 
(%s)\n\n", stmt, msg); - if (RK != DeathTest::RK_Unknown) { - std::fprintf(stderr, "child exit code: %d\n", DT.getChildExitCode()); - } - if (!DT.getChildStdErr().empty()) { - std::fprintf(stderr, "---------- standard err ----------\n%s\n", DT.getChildStdErr().c_str()); - } - if (!DT.getChildStdOut().empty()) { - std::fprintf(stderr, "---------- standard out ----------\n%s\n", DT.getChildStdOut().c_str()); - } - return false; - }; - switch (RK) { - case DeathTest::RK_MatchFound: - case DeathTest::RK_Terminate: - return true; - case DeathTest::RK_SetupFailure: - return OnFailure("child failed to setup test environment"); - case DeathTest::RK_Unknown: - return OnFailure("reason unknown"); - case DeathTest::RK_DidNotDie: - return OnFailure("child did not die"); - case DeathTest::RK_MatchFailure: - return OnFailure("matcher failed"); +bool ExpectDeath(DeathCause expected_cause, const char* stmt, Func&& func, const Matcher& matcher) { + assert(IsValidCause(expected_cause)); + + DeathTest test_case; + DeathTestResult test_result = test_case.Run(expected_cause, func, matcher); + if (!test_result.success()) { + test_case.PrintFailureDetails(test_result.failure_description(), stmt, test_result.cause()); } - assert(false && "unreachable"); + + return test_result.success(); } template -inline bool ExpectDeath(const char* stmt, Func&& func) { - return ExpectDeath(stmt, func, AnyMatcher); +bool ExpectDeath(DeathCause expected_cause, const char* stmt, Func&& func) { + return ExpectDeath(expected_cause, stmt, func, MakeAnyMatcher()); } -/// Assert that the specified expression throws a libc++ debug exception. -#define EXPECT_DEATH(...) assert((ExpectDeath(#__VA_ARGS__, [&]() { __VA_ARGS__; } ))) - -#define EXPECT_STD_TERMINATE(...) assert(ExpectDeath(#__VA_ARGS__, __VA_ARGS__)) - -#define EXPECT_DEATH_MATCHES(Matcher, ...) assert((ExpectDeath(#__VA_ARGS__, [&]() { __VA_ARGS__; }, Matcher))) - -#define TEST_LIBCPP_ASSERT_FAILURE(expr, message) assert((ExpectDeath(#expr, [&]() { (void)(expr); }, AssertionInfoMatcher(message)))) +// clang-format off + +/// Assert that the specified expression aborts with the expected cause and, optionally, error message. +#define EXPECT_DEATH(...) \ + assert(( ExpectDeath(DeathCause::VerboseAbort, #__VA_ARGS__, [&]() { __VA_ARGS__; } ) )) +#define EXPECT_DEATH_MATCHES(matcher, ...) \ + assert(( ExpectDeath(DeathCause::VerboseAbort, #__VA_ARGS__, [&]() { __VA_ARGS__; }, matcher) )) +#define EXPECT_STD_TERMINATE(...) 
\ + assert( ExpectDeath(DeathCause::StdTerminate, #__VA_ARGS__, __VA_ARGS__) ) + +#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG +#define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \ + assert(( ExpectDeath(DeathCause::VerboseAbort, #expr, [&]() { (void)(expr); }, MakeAssertionMessageMatcher(message)) )) +#else +#define TEST_LIBCPP_ASSERT_FAILURE(expr, message) \ + assert(( ExpectDeath(DeathCause::Trap, #expr, [&]() { (void)(expr); }) )) +#endif // _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG + +// clang-format on #endif // TEST_SUPPORT_CHECK_ASSERTION_H diff --git a/libcxx/test/support/test.support/test_check_assertion.pass.cpp b/libcxx/test/support/test.support/test_check_assertion.pass.cpp index 7cf0e0966ce89..d1ac6717267f3 100644 --- a/libcxx/test/support/test.support/test_check_assertion.pass.cpp +++ b/libcxx/test/support/test.support/test_check_assertion.pass.cpp @@ -18,47 +18,110 @@ #include "check_assertion.h" template -inline bool TestDeathTest(const char* stmt, Func&& func, DeathTest::ResultKind ExpectResult, AssertionInfoMatcher Matcher = AnyMatcher) { - DeathTest DT(Matcher); - DeathTest::ResultKind RK = DT.Run(func); - auto OnFailure = [&](std::string msg) { - std::fprintf(stderr, "EXPECT_DEATH( %s ) failed! (%s)\n\n", stmt, msg.c_str()); - if (!DT.getChildStdErr().empty()) { - std::fprintf(stderr, "---------- standard err ----------\n%s\n", DT.getChildStdErr().c_str()); - } - if (!DT.getChildStdOut().empty()) { - std::fprintf(stderr, "---------- standard out ----------\n%s\n", DT.getChildStdOut().c_str()); +bool TestDeathTest( + Outcome expected_outcome, DeathCause expected_cause, const char* stmt, Func&& func, const Matcher& matcher) { + auto get_matcher = [&] { +#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG + return matcher; +#else + (void)matcher; + return MakeAnyMatcher(); +#endif + }; + + DeathTest test_case; + DeathTestResult test_result = test_case.Run(expected_cause, func, get_matcher()); + std::string maybe_failure_description; + + Outcome outcome = test_result.outcome(); + if (expected_outcome != outcome) { + maybe_failure_description += + std::string("Test outcome was different from expected; expected ") + ToString(expected_outcome) + + ", got: " + ToString(outcome); + } + + DeathCause cause = test_result.cause(); + if (expected_cause != cause) { + auto failure_description = + std::string("Cause of death was different from expected; expected ") + ToString(expected_cause) + + ", got: " + ToString(cause); + if (maybe_failure_description.empty()) { + maybe_failure_description = failure_description; + } else { + maybe_failure_description += std::string("; ") + failure_description; } + } + + if (!maybe_failure_description.empty()) { + test_case.PrintFailureDetails(maybe_failure_description, stmt, test_result.cause()); return false; - }; - if (RK != ExpectResult) - return OnFailure(std::string("expected result did not occur: expected ") + DeathTest::ResultKindToString(ExpectResult) + " got: " + DeathTest::ResultKindToString(RK)); + } + return true; } -#define TEST_DEATH_TEST(RK, ...) assert((TestDeathTest(#__VA_ARGS__, [&]() { __VA_ARGS__; }, RK, AnyMatcher ))) - -#define TEST_DEATH_TEST_MATCHES(RK, Matcher, ...) 
assert((TestDeathTest(#__VA_ARGS__, [&]() { __VA_ARGS__; }, RK, Matcher))) -void my_libcpp_assert() { - _LIBCPP_ASSERT(false, "other"); -} +// clang-format off -void test_no_match_found() { - AssertionInfoMatcher ExpectMatch("my message"); - TEST_DEATH_TEST_MATCHES(DeathTest::RK_MatchFailure, ExpectMatch, my_libcpp_assert()); -} +#define TEST_DEATH_TEST(outcome, cause, ...) \ + assert(( TestDeathTest(outcome, cause, #__VA_ARGS__, [&]() { __VA_ARGS__; }, MakeAnyMatcher()) )) +#define TEST_DEATH_TEST_MATCHES(outcome, cause, matcher, ...) \ + assert(( TestDeathTest(outcome, cause, #__VA_ARGS__, [&]() { __VA_ARGS__; }, matcher) )) -void test_did_not_die() { - TEST_DEATH_TEST(DeathTest::RK_DidNotDie, ((void)0)); -} +// clang-format on -void test_unknown() { - TEST_DEATH_TEST(DeathTest::RK_Unknown, std::exit(13)); -} +#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG +DeathCause assertion_death_cause = DeathCause::VerboseAbort; +#else +DeathCause assertion_death_cause = DeathCause::Trap; +#endif int main(int, char**) { - test_no_match_found(); - test_did_not_die(); - test_unknown(); + auto fail_assert = [] { _LIBCPP_ASSERT(false, "Some message"); }; + Matcher good_matcher = MakeAssertionMessageMatcher("Some message"); + Matcher bad_matcher = MakeAssertionMessageMatcher("Bad expected message"); + + // Test the implementation of death tests. We're bypassing the assertions added by the actual `EXPECT_DEATH` macros + // which allows us to test failure cases (where the assertion would fail) as well. + { + // Success -- `std::terminate`. + TEST_DEATH_TEST(Outcome::Success, DeathCause::StdTerminate, std::terminate()); + + // Success -- trapping. + TEST_DEATH_TEST(Outcome::Success, DeathCause::Trap, __builtin_trap()); + + // Success -- assertion failure with any matcher. + TEST_DEATH_TEST_MATCHES(Outcome::Success, assertion_death_cause, MakeAnyMatcher(), fail_assert()); + + // Success -- assertion failure with a specific matcher. + TEST_DEATH_TEST_MATCHES(Outcome::Success, assertion_death_cause, good_matcher, fail_assert()); + +#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG + // Failure -- error message doesn't match. + TEST_DEATH_TEST_MATCHES(Outcome::UnexpectedErrorMessage, assertion_death_cause, bad_matcher, fail_assert()); +#endif + + // Invalid cause -- child did not die. + TEST_DEATH_TEST(Outcome::InvalidCause, DeathCause::DidNotDie, ((void)0)); + + // Invalid cause -- unknown. + TEST_DEATH_TEST(Outcome::InvalidCause, DeathCause::Unknown, std::exit(13)); + } + + // Test the `EXPECT_DEATH` macros themselves. Since they assert success, we can only test successful cases. + { + auto invoke_abort = [] { _LIBCPP_VERBOSE_ABORT("contains some message"); }; + + auto simple_matcher = [](const std::string& text) { + bool success = text.find("some") != std::string::npos; + return MatchResult(success, ""); + }; + + EXPECT_DEATH(invoke_abort()); + EXPECT_DEATH_MATCHES(MakeAnyMatcher(), invoke_abort()); + EXPECT_DEATH_MATCHES(simple_matcher, invoke_abort()); + EXPECT_STD_TERMINATE([] { std::terminate(); }); + TEST_LIBCPP_ASSERT_FAILURE(fail_assert(), "Some message"); + } + return 0; } diff --git a/libcxx/vendor/llvm/default_assertion_handler.in b/libcxx/vendor/llvm/default_assertion_handler.in index 111d305a16f7c..8bc0553c078b3 100644 --- a/libcxx/vendor/llvm/default_assertion_handler.in +++ b/libcxx/vendor/llvm/default_assertion_handler.in @@ -17,7 +17,15 @@ # pragma GCC system_header #endif -// TODO(hardening): in production, trap rather than abort. 
-#define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_ABORT("%s", message) +#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG + +# define _LIBCPP_ASSERTION_HANDLER(message) _LIBCPP_VERBOSE_ABORT("%s", message) + +#else + +// TODO(hardening): use `__builtin_verbose_trap(message)` once that becomes available. +# define _LIBCPP_ASSERTION_HANDLER(message) ((void)message, __builtin_trap()) + +#endif // _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG #endif // _LIBCPP___ASSERTION_HANDLER From 2bfa5ca9277a7320358501c8fa0f1741c5134859 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Fri, 19 Jan 2024 13:51:35 -0800 Subject: [PATCH 177/843] [lld][WebAssembly] Reset context object after each link (#78770) This mirrors how the ELF linker works. I wasn't able to find anywhere where this is currently tested. Followup to #78640, which triggered a regression. --- lld/wasm/Config.h | 4 +++- lld/wasm/Driver.cpp | 15 +++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/lld/wasm/Config.h b/lld/wasm/Config.h index dc7ca265e9a2c..97c508bda6a1c 100644 --- a/lld/wasm/Config.h +++ b/lld/wasm/Config.h @@ -124,7 +124,7 @@ struct Ctx { llvm::SmallVector syntheticTables; // True if we are creating position-independent code. - bool isPic; + bool isPic = false; // True if we have an MVP input that uses __indirect_function_table and which // requires it to be allocated to table number 0. @@ -138,6 +138,8 @@ struct Ctx { llvm::SmallVector, 0> whyExtractRecords; + + void reset(); }; extern Ctx ctx; diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 4a4f9a9622794..716a2d9ebfe30 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -47,6 +47,20 @@ namespace lld::wasm { Configuration *config; Ctx ctx; +void Ctx::reset() { + objectFiles.clear(); + stubFiles.clear(); + sharedFiles.clear(); + bitcodeFiles.clear(); + syntheticFunctions.clear(); + syntheticGlobals.clear(); + syntheticTables.clear(); + whyExtractRecords.clear(); + isPic = false; + legacyFunctionTable = false; + emitBssSegments = false; +} + namespace { // Create enum with OPT_xxx values for each option in Options.td @@ -90,6 +104,7 @@ bool link(ArrayRef args, llvm::raw_ostream &stdoutOS, auto *ctx = new CommonLinkerContext; ctx->e.initialize(stdoutOS, stderrOS, exitEarly, disableOutput); + ctx->e.cleanupCallback = []() { wasm::ctx.reset(); }; ctx->e.logName = args::getFilenameWithoutExe(args[0]); ctx->e.errorLimitExceededMsg = "too many errors emitted, stopping now (use " "-error-limit=0 to see all errors)"; From cd05ade13a66d4fb2daff489b2c02ac1ae2070d1 Mon Sep 17 00:00:00 2001 From: bd1976bris Date: Fri, 19 Jan 2024 21:57:40 +0000 Subject: [PATCH 178/843] Add a "don't override" mapping for -fvisibility-from-dllstorageclass (#74629) `-fvisibility-from-dllstorageclass` allows for overriding the visibility of globals from their DLL storage class. The visibility to apply can be customised for the different classes of globals via a set of dependent options that specify the mapping values: - `-fvisibility-dllexport=` - `-fvisibility-nodllstorageclass=` - `-fvisibility-externs-dllimport=` - `-fvisibility-externs-nodllstorageclass=` Currently, one of the existing LLVM visibilities, `hidden`, `protected`, `default`, can be used as a mapping value. This change adds a new mapping value: `keep`, which specifies that the visibility should not be overridden for that class of globals. 
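To make the new mapping concrete before the diffs below, here is a minimal hedged sketch (the names `MappedVisibility`, `parseMappingValue` and `applyOverride` are invented for illustration; the real implementation is the `getLLVMVisibility`/`setLLVMVisibility` pair added to CodeGenModule.cpp in this patch). The key idea is that `keep` is modelled as the absence of an override:

#include <optional>
#include <string>

enum class MappedVisibility { Default, Hidden, Protected };

// Parse one mapping value; `keep` means "no override requested".
std::optional<MappedVisibility> parseMappingValue(const std::string &Value) {
  if (Value == "keep")
    return std::nullopt;
  if (Value == "default")
    return MappedVisibility::Default;
  if (Value == "hidden")
    return MappedVisibility::Hidden;
  if (Value == "protected")
    return MappedVisibility::Protected;
  return std::nullopt; // invalid spellings are rejected by the driver
}

// Apply an override only when one was requested; globals mapped through
// `keep` retain whatever visibility IR-gen already assigned to them.
void applyOverride(MappedVisibility &Current,
                   std::optional<MappedVisibility> Override) {
  if (Override)
    Current = *Override;
}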
The behaviour of `-fvisibility-from-dllstorageclass` is otherwise unchanged and existing uses of this set of options will be unaffected. The background to this change is that currently the PS4 and PS5 compilers effectively ignore visibility - dllimport/export is the supported method for export control in C/C++ source code. Now, we would like to support visibility attributes and options in our frontend, in addition to dllimport/export. To support this, we will override the visibility of globals with explicit dllimport/export annotations but use the `keep` setting for globals which do not have an explicit dllimport/export. There are also some minor improvements to the existing options: - Make the `LANGOPTS` `BENIGN` as they don't involve the AST. - Correct/clarify the help text for the options. --- clang/include/clang/Basic/LangOptions.def | 20 ++--- clang/include/clang/Basic/LangOptions.h | 11 +++ clang/include/clang/Driver/Options.td | 23 +++--- clang/lib/CodeGen/CodeGenModule.cpp | 81 ++++++++++++------- .../CodeGenCXX/visibility-dllstorageclass.cpp | 40 ++++++++- .../test/Driver/visibility-dllstorageclass.c | 34 +++++++- 6 files changed, 160 insertions(+), 49 deletions(-) diff --git a/clang/include/clang/Basic/LangOptions.def index 04ebffbcba69d..f181da9aea6f9 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -359,16 +359,16 @@ BENIGN_ENUM_LANGOPT(TypeVisibilityMode, Visibility, 3, DefaultVisibility, "default visibility for types [-ftype-visibility]") LANGOPT(SetVisibilityForExternDecls, 1, 0, "apply global symbol visibility to external declarations without an explicit visibility") -LANGOPT(VisibilityFromDLLStorageClass, 1, 0, "set the visiblity of globals from their DLL storage class [-fvisibility-from-dllstorageclass]") -ENUM_LANGOPT(DLLExportVisibility, Visibility, 3, DefaultVisibility, "visibility for functions and variables with dllexport annotations [-fvisibility-from-dllstorageclass]") -ENUM_LANGOPT(NoDLLStorageClassVisibility, Visibility, 3, HiddenVisibility, "visibility for functions and variables without an explicit DLL storage class [-fvisibility-from-dllstorageclass]") -ENUM_LANGOPT(ExternDeclDLLImportVisibility, Visibility, 3, DefaultVisibility, "visibility for external declarations with dllimport annotations [-fvisibility-from-dllstorageclass]") -ENUM_LANGOPT(ExternDeclNoDLLStorageClassVisibility, Visibility, 3, HiddenVisibility, "visibility for external declarations without an explicit DLL storage class [-fvisibility-from-dllstorageclass]") +BENIGN_LANGOPT(VisibilityFromDLLStorageClass, 1, 0, "override the visibility of globals based on their final DLL storage class [-fvisibility-from-dllstorageclass]") +BENIGN_ENUM_LANGOPT(DLLExportVisibility, VisibilityFromDLLStorageClassKinds, 3, VisibilityFromDLLStorageClassKinds::Default, "how to adjust the visibility for functions and variables with dllexport annotations [-fvisibility-dllexport]") +BENIGN_ENUM_LANGOPT(NoDLLStorageClassVisibility, VisibilityFromDLLStorageClassKinds, 3, VisibilityFromDLLStorageClassKinds::Hidden, "how to adjust the visibility for functions and variables without an explicit DLL storage class [-fvisibility-nodllstorageclass]") +BENIGN_ENUM_LANGOPT(ExternDeclDLLImportVisibility, VisibilityFromDLLStorageClassKinds, 3, VisibilityFromDLLStorageClassKinds::Default, "how to adjust the visibility for external declarations with dllimport annotations [-fvisibility-externs-dllimport]")
+BENIGN_ENUM_LANGOPT(ExternDeclNoDLLStorageClassVisibility, VisibilityFromDLLStorageClassKinds, 3, VisibilityFromDLLStorageClassKinds::Hidden, + "how to adjust the visibility for external declarations without an explicit DLL storage class [-fvisibility-externs-nodllstorageclass]") BENIGN_LANGOPT(SemanticInterposition , 1, 0, "semantic interposition") BENIGN_LANGOPT(HalfNoSemanticInterposition, 1, 0, "Like -fno-semantic-interposition but don't use local aliases") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 9f986fce2d441..976a00d1a6fa5 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -381,6 +381,17 @@ class LangOptions : public LangOptionsBase { All, }; + enum class VisibilityFromDLLStorageClassKinds { + /// Keep the IR-gen assigned visibility. + Keep, + /// Override the IR-gen assigned visibility with default visibility. + Default, + /// Override the IR-gen assigned visibility with hidden visibility. + Hidden, + /// Override the IR-gen assigned visibility with protected visibility. + Protected, + }; + enum class StrictFlexArraysLevelKind { /// Any trailing array member is a FAM. Default = 0, diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index d2e6c3ff721c2..f1687d823f6e0 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3862,27 +3862,32 @@ def dA : Flag<["-"], "dA">, Alias; defm visibility_from_dllstorageclass : BoolFOption<"visibility-from-dllstorageclass", LangOpts<"VisibilityFromDLLStorageClass">, DefaultFalse, PosFlag, + "Override the visibility of globals based on their final DLL storage class.">, NegFlag>; +class MarshallingInfoVisibilityFromDLLStorage + : MarshallingInfoEnum, + Values<"keep,hidden,protected,default">, + NormalizedValuesScope<"LangOptions::VisibilityFromDLLStorageClassKinds">, + NormalizedValues<["Keep", "Hidden", "Protected", "Default"]> {} def fvisibility_dllexport_EQ : Joined<["-"], "fvisibility-dllexport=">, Group, Visibility<[ClangOption, CC1Option]>, - HelpText<"The visibility for dllexport definitions [-fvisibility-from-dllstorageclass]">, - MarshallingInfoVisibility, "DefaultVisibility">, + HelpText<"The visibility for dllexport definitions. If Keep is specified the visibility is not adjusted [-fvisibility-from-dllstorageclass]">, + MarshallingInfoVisibilityFromDLLStorage, "Default">, ShouldParseIf; def fvisibility_nodllstorageclass_EQ : Joined<["-"], "fvisibility-nodllstorageclass=">, Group, Visibility<[ClangOption, CC1Option]>, - HelpText<"The visibility for definitions without an explicit DLL export class [-fvisibility-from-dllstorageclass]">, - MarshallingInfoVisibility, "HiddenVisibility">, + HelpText<"The visibility for definitions without an explicit DLL storage class. If Keep is specified the visibility is not adjusted [-fvisibility-from-dllstorageclass]">, + MarshallingInfoVisibilityFromDLLStorage, "Hidden">, ShouldParseIf; def fvisibility_externs_dllimport_EQ : Joined<["-"], "fvisibility-externs-dllimport=">, Group, Visibility<[ClangOption, CC1Option]>, - HelpText<"The visibility for dllimport external declarations [-fvisibility-from-dllstorageclass]">, - MarshallingInfoVisibility, "DefaultVisibility">, + HelpText<"The visibility for dllimport external declarations. 
If Keep is specified the visibility is not adjusted [-fvisibility-from-dllstorageclass]">, + MarshallingInfoVisibilityFromDLLStorage, "Default">, ShouldParseIf; def fvisibility_externs_nodllstorageclass_EQ : Joined<["-"], "fvisibility-externs-nodllstorageclass=">, Group, Visibility<[ClangOption, CC1Option]>, - HelpText<"The visibility for external declarations without an explicit DLL dllstorageclass [-fvisibility-from-dllstorageclass]">, - MarshallingInfoVisibility, "HiddenVisibility">, + HelpText<"The visibility for external declarations without an explicit DLL storage class. If Keep is specified the visibility is not adjusted [-fvisibility-from-dllstorageclass]">, + MarshallingInfoVisibilityFromDLLStorage, "Hidden">, ShouldParseIf; def fvisibility_EQ : Joined<["-"], "fvisibility=">, Group, Visibility<[ClangOption, CC1Option]>, diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 482c2108a988a..77c931842dd21 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -722,43 +722,70 @@ void InstrProfStats::reportDiagnostics(DiagnosticsEngine &Diags, } } +static std::optional +getLLVMVisibility(clang::LangOptions::VisibilityFromDLLStorageClassKinds K) { + // Map to LLVM visibility. + switch (K) { + case clang::LangOptions::VisibilityFromDLLStorageClassKinds::Keep: + return std::nullopt; + case clang::LangOptions::VisibilityFromDLLStorageClassKinds::Default: + return llvm::GlobalValue::DefaultVisibility; + case clang::LangOptions::VisibilityFromDLLStorageClassKinds::Hidden: + return llvm::GlobalValue::HiddenVisibility; + case clang::LangOptions::VisibilityFromDLLStorageClassKinds::Protected: + return llvm::GlobalValue::ProtectedVisibility; + } + llvm_unreachable("unknown option value!"); +} + +void setLLVMVisibility(llvm::GlobalValue &GV, + std::optional V) { + if (!V) + return; + + // Reset DSO locality before setting the visibility. This removes + // any effects that visibility options and annotations may have + // had on the DSO locality. Setting the visibility will implicitly set + // appropriate globals to DSO Local; however, this will be pessimistic + // w.r.t. to the normal compiler IRGen. 
+ GV.setDSOLocal(false); + GV.setVisibility(*V); +} + static void setVisibilityFromDLLStorageClass(const clang::LangOptions &LO, llvm::Module &M) { if (!LO.VisibilityFromDLLStorageClass) return; - llvm::GlobalValue::VisibilityTypes DLLExportVisibility = - CodeGenModule::GetLLVMVisibility(LO.getDLLExportVisibility()); - llvm::GlobalValue::VisibilityTypes NoDLLStorageClassVisibility = - CodeGenModule::GetLLVMVisibility(LO.getNoDLLStorageClassVisibility()); - llvm::GlobalValue::VisibilityTypes ExternDeclDLLImportVisibility = - CodeGenModule::GetLLVMVisibility(LO.getExternDeclDLLImportVisibility()); - llvm::GlobalValue::VisibilityTypes ExternDeclNoDLLStorageClassVisibility = - CodeGenModule::GetLLVMVisibility( - LO.getExternDeclNoDLLStorageClassVisibility()); + std::optional DLLExportVisibility = + getLLVMVisibility(LO.getDLLExportVisibility()); + + std::optional + NoDLLStorageClassVisibility = + getLLVMVisibility(LO.getNoDLLStorageClassVisibility()); + + std::optional + ExternDeclDLLImportVisibility = + getLLVMVisibility(LO.getExternDeclDLLImportVisibility()); + + std::optional + ExternDeclNoDLLStorageClassVisibility = + getLLVMVisibility(LO.getExternDeclNoDLLStorageClassVisibility()); for (llvm::GlobalValue &GV : M.global_values()) { if (GV.hasAppendingLinkage() || GV.hasLocalLinkage()) continue; - // Reset DSO locality before setting the visibility. This removes - // any effects that visibility options and annotations may have - // had on the DSO locality. Setting the visibility will implicitly set - // appropriate globals to DSO Local; however, this will be pessimistic - // w.r.t. to the normal compiler IRGen. - GV.setDSOLocal(false); - - if (GV.isDeclarationForLinker()) { - GV.setVisibility(GV.getDLLStorageClass() == - llvm::GlobalValue::DLLImportStorageClass - ? ExternDeclDLLImportVisibility - : ExternDeclNoDLLStorageClassVisibility); - } else { - GV.setVisibility(GV.getDLLStorageClass() == - llvm::GlobalValue::DLLExportStorageClass - ? DLLExportVisibility - : NoDLLStorageClassVisibility); - } + if (GV.isDeclarationForLinker()) + setLLVMVisibility(GV, GV.getDLLStorageClass() == + llvm::GlobalValue::DLLImportStorageClass + ? ExternDeclDLLImportVisibility + : ExternDeclNoDLLStorageClassVisibility); + else + setLLVMVisibility(GV, GV.getDLLStorageClass() == + llvm::GlobalValue::DLLExportStorageClass + ? DLLExportVisibility + : NoDLLStorageClassVisibility); GV.setDLLStorageClass(llvm::GlobalValue::DefaultStorageClass); } diff --git a/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp b/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp index 6520e7039728e..a44ff1316d94c 100644 --- a/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp +++ b/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp @@ -1,7 +1,7 @@ // REQUIRES: x86-registered-target -// Test that -fvisibility-from-dllstorageclass maps DLL storage class to visibility -// and that it overrides the effect of visibility options and annotations. +//// Test that -fvisibility-from-dllstorageclass maps DLL storage class to visibility +//// and that it overrides the effect of visibility options and annotations. 
// RUN: %clang_cc1 -triple x86_64-unknown-windows-itanium -fdeclspec \ // RUN: -fvisibility=hidden \ @@ -32,12 +32,35 @@ // RUN: -x c++ %s -S -emit-llvm -o - | \ // RUN: FileCheck %s --check-prefixes=ALL_DEFAULT +// RUN: %clang_cc1 -triple x86_64-unknown-windows-itanium -fdeclspec \ +// RUN: -fvisibility=hidden \ +// RUN: -fapply-global-visibility-to-externs \ +// RUN: -fvisibility-from-dllstorageclass \ +// RUN: -fvisibility-dllexport=keep \ +// RUN: -fvisibility-nodllstorageclass=keep \ +// RUN: -fvisibility-externs-dllimport=keep \ +// RUN: -fvisibility-externs-nodllstorageclass=keep \ +// RUN: -x c++ %s -S -emit-llvm -o - | \ +// RUN: FileCheck %s --check-prefixes=ALL_KEEP + +//// Show that omitting -fvisibility-from-dllstorageclass causes the other options to be ignored. +// RUN: %clang_cc1 -triple x86_64-unknown-windows-itanium -fdeclspec \ +// RUN: -fvisibility=hidden \ +// RUN: -fapply-global-visibility-to-externs \ +// RUN: -fvisibility-dllexport=protected \ +// RUN: -fvisibility-nodllstorageclass=protected \ +// RUN: -fvisibility-externs-dllimport=protected \ +// RUN: -fvisibility-externs-nodllstorageclass=protected \ +// RUN: -x c++ %s -S -emit-llvm -o - | \ +// RUN: FileCheck %s --check-prefixes=ALL_KEEP + // Local static void l() {} void use_locals(){l();} // DEFAULTS-DAG: define internal void @_ZL1lv() // EXPLICIT-DAG: define internal void @_ZL1lv() // ALL_DEFAULT-DAG: define internal void @_ZL1lv() +// ALL_KEEP-DAG: define internal void @_ZL1lv() // Function void f() {} @@ -48,6 +71,8 @@ void __declspec(dllexport) exported_f() {} // EXPLICIT-DAG: define hidden void @_Z10exported_fv() // ALL_DEFAULT-DAG: define void @_Z1fv() // ALL_DEFAULT-DAG: define void @_Z10exported_fv() +// ALL_KEEP-DAG: define hidden void @_Z1fv() +// ALL_KEEP-DAG: define hidden void @_Z10exported_fv() // Variable int d = 123; @@ -58,6 +83,8 @@ __declspec(dllexport) int exported_d = 123; // EXPLICIT-DAG: @exported_d = hidden global // ALL_DEFAULT-DAG: @d = global // ALL_DEFAULT-DAG: @exported_d = global +// ALL_KEEP-DAG: @d = hidden global +// ALL_KEEP-DAG: @exported_d = hidden global // Alias extern "C" void aliased() {} @@ -69,6 +96,8 @@ void __declspec(dllexport) a_exported() __attribute__((alias("aliased"))); // EXPLICIT-DAG: @_Z10a_exportedv = hidden alias // ALL_DEFAULT-DAG: @_Z1av = alias // ALL_DEFAULT-DAG: @_Z10a_exportedv = alias +// ALL_KEEP-DAG: @_Z1av = hidden alias +// ALL_KEEP-DAG: @_Z10a_exportedv = dso_local alias // Declaration extern void e(); @@ -79,6 +108,8 @@ extern void __declspec(dllimport) imported_e(); // EXPLICIT-DAG: declare hidden void @_Z10imported_ev() // ALL_DEFAULT-DAG: declare void @_Z1ev() // ALL_DEFAULT-DAG: declare void @_Z10imported_ev() +// ALL_KEEP-DAG: declare hidden void @_Z1ev() +// ALL_KEEP-DAG: declare void @_Z10imported_ev() // Weak Declaration __attribute__((weak)) @@ -91,6 +122,8 @@ extern void __declspec(dllimport) imported_w(); // EXPLICIT-DAG: declare extern_weak hidden void @_Z10imported_wv() // ALL_DEFAULT-DAG: declare extern_weak void @_Z1wv() // ALL_DEFAULT-DAG: declare extern_weak void @_Z10imported_wv() +// ALL_KEEP-DAG: declare extern_weak hidden void @_Z1wv() +// ALL_KEEP-DAG: declare extern_weak void @_Z10imported_wv() void use_declarations(){e(); imported_e(); w(); imported_w();} @@ -101,11 +134,14 @@ struct __attribute__((type_visibility("protected"))) t { }; void t::foo() {} // DEFAULTS-DAG: @_ZTV1t = hidden unnamed_addr constant +// ALL_KEEP-DAG: @_ZTV1t = protected unnamed_addr constant int v __attribute__ ((__visibility__ ("protected"))) 
= 123; // DEFAULTS-DAG: @v = hidden global +// ALL_KEEP-DAG: @v = protected global #pragma GCC visibility push(protected) int p = 345; #pragma GCC visibility pop // DEFAULTS-DAG: @p = hidden global +// ALL_KEEP-DAG: @p = protected global diff --git a/clang/test/Driver/visibility-dllstorageclass.c b/clang/test/Driver/visibility-dllstorageclass.c index 1e495f694a5a5..65d13d348cc97 100644 --- a/clang/test/Driver/visibility-dllstorageclass.c +++ b/clang/test/Driver/visibility-dllstorageclass.c @@ -1,4 +1,4 @@ -// Check behaviour of -fvisibility-from-dllstorageclass options +//// Check behaviour of -fvisibility-from-dllstorageclass options // RUN: %clang -target x86_64-unknown-windows-itanium -fdeclspec \ // RUN: -Werror -S -### %s 2>&1 | \ @@ -85,3 +85,35 @@ // ALL-SAME: "-fvisibility-nodllstorageclass=protected" // ALL-SAME: "-fvisibility-externs-dllimport=hidden" // ALL-SAME: "-fvisibility-externs-nodllstorageclass=protected" + +//// Test that "keep" can be specified, which means that the visibility of +//// the matching globals will not be adjusted. + +// RUN: %clang -target x86_64-unknown-windows-itanium -fdeclspec \ +// RUN: -fvisibility-from-dllstorageclass \ +// RUN: -fvisibility-dllexport=keep \ +// RUN: -fvisibility-nodllstorageclass=keep \ +// RUN: -fvisibility-externs-dllimport=keep \ +// RUN: -fvisibility-externs-nodllstorageclass=keep \ +// RUN: -Werror -S -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=KEEP + +// KEEP: "-fvisibility-from-dllstorageclass" +// KEEP-SAME: "-fvisibility-dllexport=keep" +// KEEP-SAME: "-fvisibility-nodllstorageclass=keep" +// KEEP-SAME: "-fvisibility-externs-dllimport=keep" +// KEEP-SAME: "-fvisibility-externs-nodllstorageclass=keep" + +// RUN: %clang -target x86_64-unknown-windows-itanium -fdeclspec \ +// RUN: -fvisibility-from-dllstorageclass \ +// RUN: -fvisibility-dllexport=default \ +// RUN: -fvisibility-dllexport=keep \ +// RUN: -fvisibility-nodllstorageclass=default \ +// RUN: -fvisibility-nodllstorageclass=keep \ +// RUN: -fvisibility-externs-dllimport=default \ +// RUN: -fvisibility-externs-dllimport=keep \ +// RUN: -fvisibility-externs-nodllstorageclass=default \ +// RUN: -fvisibility-externs-nodllstorageclass=keep \ +// RUN: -Werror -S -### %s 2>&1 | \ +// RUN: FileCheck %s --check-prefixes=KEEP + From 86eaf6083b2cd27b8811f4791ad2eb8dacbb0e5f Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Fri, 19 Jan 2024 15:11:18 -0700 Subject: [PATCH 179/843] [X86] Refine X86DAGToDAGISel::isSExtAbsoluteSymbolRef() (#76191) We just need to check if the global is large or not. In the kernel code model, globals are in the negative 2GB of the address space, so globals can be a sign extended 32-bit immediate. In other code models, small globals are in the low 2GB of the address space, so sign extending them is equivalent to zero extending them. 
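As a quick illustration of that equivalence (a standalone sketch, not
part of the patch; the addresses are made-up values):

  #include <cassert>
  #include <cstdint>
  int main() {
    // Small/medium model: a small global's address is below 2^31, so the
    // top bit of the 32-bit immediate is clear and sext == zext.
    uint32_t SmallAddr = 0x7fffffffu;
    assert((uint64_t)(int64_t)(int32_t)SmallAddr == (uint64_t)SmallAddr);
    // Kernel model: addresses live in the negative 2GB, which is exactly
    // the range a sign-extended 32-bit immediate can reach.
    uint64_t KernelAddr = 0xffffffff80000000ull;
    assert((int64_t)(int32_t)(uint32_t)KernelAddr == (int64_t)KernelAddr);
    return 0;
  }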
---
 llvm/lib/Target/X86/X86ISelDAGToDAG.cpp          | 16 ++++++++++------
 ...imm-small-model.ll => relocimm-code-model.ll} |  8 ++++++--
 2 files changed, 16 insertions(+), 8 deletions(-)
 rename llvm/test/CodeGen/X86/{relocimm-small-model.ll => relocimm-code-model.ll} (62%)

diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index 853d654457ef3..833f058253d88 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -3188,12 +3188,16 @@ bool X86DAGToDAGISel::isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const {
   if (!GA)
     return false;
 
-  std::optional<ConstantRange> CR = GA->getGlobal()->getAbsoluteSymbolRange();
-  if (!CR)
-    return Width == 32 && TM.getCodeModel() == CodeModel::Small;
-
-  return CR->getSignedMin().sge(-1ull << Width) &&
-         CR->getSignedMax().slt(1ull << Width);
+  auto *GV = GA->getGlobal();
+  std::optional<ConstantRange> CR = GV->getAbsoluteSymbolRange();
+  if (CR)
+    return CR->getSignedMin().sge(-1ull << Width) &&
+           CR->getSignedMax().slt(1ull << Width);
+  // In the kernel code model, globals are in the negative 2GB of the address
+  // space, so globals can be a sign extended 32-bit immediate.
+  // In other code models, small globals are in the low 2GB of the address
+  // space, so sign extending them is equivalent to zero extending them.
+  return Width == 32 && !TM.isLargeGlobalValue(GV);
 }
 
 X86::CondCode X86DAGToDAGISel::getCondFromNode(SDNode *N) const {
diff --git a/llvm/test/CodeGen/X86/relocimm-small-model.ll b/llvm/test/CodeGen/X86/relocimm-code-model.ll
similarity index 62%
rename from llvm/test/CodeGen/X86/relocimm-small-model.ll
rename to llvm/test/CodeGen/X86/relocimm-code-model.ll
index 5c128e15c1e26..82483abc1ed0a 100644
--- a/llvm/test/CodeGen/X86/relocimm-small-model.ll
+++ b/llvm/test/CodeGen/X86/relocimm-code-model.ll
@@ -1,5 +1,9 @@
 ; RUN: llc < %s | FileCheck %s --check-prefix=CHECK-SMALL
-; RUN: llc --code-model=medium < %s | FileCheck %s --check-prefix=CHECK-MEDIUM
+; RUN: llc --code-model=medium -large-data-threshold=100 < %s | FileCheck %s --check-prefix=CHECK-SMALL
+; RUN: llc --code-model=medium < %s | FileCheck %s --check-prefix=CHECK-LARGE
+; RUN: llc --code-model=large -large-data-threshold=100 < %s | FileCheck %s --check-prefix=CHECK-SMALL
+; RUN: llc --code-model=large < %s | FileCheck %s --check-prefix=CHECK-LARGE
+; RUN: llc --code-model=kernel < %s | FileCheck %s --check-prefix=CHECK-SMALL
 
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
@@ -9,7 +13,7 @@ target triple = "x86_64-unknown-linux-gnu"
 declare void @f()
 
 define void @foo(i64 %b) {
-; CHECK-MEDIUM: cmpq %rax, %rdi
+; CHECK-LARGE: cmpq %rax, %rdi
 ; CHECK-SMALL: cmpq $a, %rdi
 entry:
   %cmp = icmp eq i64 %b, ptrtoint (ptr @a to i64)

From f9bc1ee3fcf913df7e3f0684a8f3b98dc55dff73 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Fri, 19 Jan 2024 14:28:31 -0800
Subject: [PATCH 180/843] [llvm-objdump] Add support for symbolizing PGOBBAddrMap Info (#76386)

This patch adds support for symbolizing PGO information contained within
the SHT_LLVM_BB_ADDR_MAP section in llvm-objdump. The outputs are simply
the raw values contained within the section.
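For illustration, the labels printed under --symbolize-operands take
roughly the following shape (these lines mirror the FileCheck
expectations in the test added below; the function name and numbers come
from that test, not from a real binary):

  <foo>:
  <BB3> (Entry count: 1000, Frequency: 1000, Successors: BB1:22222222, BB2:33333333, BB3:aaaaaaaa):
  <BB1> (Frequency: 133, Successors: BB2:11111111, BB3:eeeeeeee):
  <BB2> (Frequency: 18, Successors: BB3:ffffffff):
  <BB5> (Frequency: 1000):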
---
 .../llvm-objdump/X86/elf-pgoanalysismap.yaml | 180 ++++++++++++++++++
 llvm/tools/llvm-objdump/llvm-objdump.cpp     | 100 ++++++++--
 2 files changed, 263 insertions(+), 17 deletions(-)
 create mode 100644 llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml

diff --git a/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml b/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml
new file mode 100644
index 0000000000000..c4bf443f920a2
--- /dev/null
+++ b/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml
@@ -0,0 +1,180 @@
+## Test that in the presence of SHT_LLVM_BB_ADDR_MAP sections which also
+## contain PGO data, --symbolize-operands is able to label the basic blocks
+## correctly.
+
+## Check the case where we only have entry counts.
+
+# RUN: yaml2obj --docnum=1 %s -o %t1
+# RUN: llvm-objdump %t1 -d --symbolize-operands --no-show-raw-insn --no-leading-addr | \
+# RUN:   FileCheck %s --check-prefix=ENTRYCOUNT
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name: .text.foo
+    Type: SHT_PROGBITS
+    Address: 0x0
+    Flags: [SHF_ALLOC, SHF_EXECINSTR]
+    Content: '50'
+  - Name: .llvm_bb_addr_map.foo
+    Type: SHT_LLVM_BB_ADDR_MAP
+    Link: .text.foo
+    Entries:
+      - Version: 2
+        Address: 0x0
+        Feature: 0x1
+        BBEntries:
+          - ID: 3
+            AddressOffset: 0x0
+            Size: 0x1
+            Metadata: 0x1
+    PGOAnalyses:
+      - FuncEntryCount: 1000
+Symbols:
+  - Name: foo
+    Section: .text.foo
+    Value: 0x0
+
+# ENTRYCOUNT: <foo>:
+# ENTRYCOUNT: <BB3> (Entry count: 1000):
+
+## Check the case where we have entry points and block frequency information.
+
+# RUN: yaml2obj %s --docnum=2 -o %t2
+# RUN: llvm-objdump %t2 -d --symbolize-operands --no-show-raw-insn --no-leading-addr | \
+# RUN:   FileCheck %s --check-prefix=ENTRYCOUNT-BLOCKFREQ
+
+--- !ELF
+FileHeader:
+  Class: ELFCLASS64
+  Data: ELFDATA2LSB
+  Type: ET_EXEC
+  Machine: EM_X86_64
+Sections:
+  - Name: .text.foo
+    Type: SHT_PROGBITS
+    Address: 0x0
+    Flags: [SHF_ALLOC, SHF_EXECINSTR]
+    Content: '503b0505200000907d02ebf5c3'
+  - Name: .llvm_bb_addr_map.foo
+    Type: SHT_LLVM_BB_ADDR_MAP
+    Link: .text.foo
+    Entries:
+      - Version: 2
+        Address: 0x0
+        Feature: 0x3
+        BBEntries:
+          - ID: 3
+            AddressOffset: 0x0
+            Size: 0x1
+            Metadata: 0x1
+          - ID: 1
+            AddressOffset: 0x0
+            Size: 0x6
+            Metadata: 0x0
+          - ID: 2
+            AddressOffset: 0x1
+            Size: 0x4
+            Metadata: 0x0
+          - ID: 5
+            AddressOffset: 0x0
+            Size: 0x1
+            Metadata: 0x2
+    PGOAnalyses:
+      - FuncEntryCount: 1000
+        PGOBBEntries:
+          - BBFreq: 1000
+          - BBFreq: 133
+          - BBFreq: 18
+          - BBFreq: 1000
+Symbols:
+  - Name: foo
+    Section: .text.foo
+    Value: 0x0
+
+# ENTRYCOUNT-BLOCKFREQ: <foo>:
+# ENTRYCOUNT-BLOCKFREQ: <BB3> (Entry count: 1000, Frequency: 1000):
+# ENTRYCOUNT-BLOCKFREQ: <BB1> (Frequency: 133):
+# ENTRYCOUNT-BLOCKFREQ: <BB2> (Frequency: 18):
+# ENTRYCOUNT-BLOCKFREQ: <BB5> (Frequency: 1000):
+
+## Check the case where we have entry points, block frequency, and branch
+## probability information.
+ +# RUN: yaml2obj %s --docnum=3 -o %t3 +# RUN: llvm-objdump %t3 -d --symbolize-operands --no-show-raw-insn --no-leading-addr | \ +# RUN: FileCheck %s --check-prefix=ENTRY-FREQ-PROB + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_X86_64 +Sections: + - Name: .text.foo + Type: SHT_PROGBITS + Address: 0x0 + Flags: [SHF_ALLOC, SHF_EXECINSTR] + Content: '503b0505200000907d02ebf5c3' + - Name: .llvm_bb_addr_map.foo + Type: SHT_LLVM_BB_ADDR_MAP + Link: .text.foo + Entries: + - Version: 2 + Address: 0x0 + Feature: 0x7 + BBEntries: + - ID: 3 + AddressOffset: 0x0 + Size: 0x1 + Metadata: 0x1 + - ID: 1 + AddressOffset: 0x0 + Size: 0x6 + Metadata: 0x0 + - ID: 2 + AddressOffset: 0x1 + Size: 0x4 + Metadata: 0x0 + - ID: 5 + AddressOffset: 0x0 + Size: 0x1 + Metadata: 0x2 + PGOAnalyses: + - FuncEntryCount: 1000 + PGOBBEntries: + - BBFreq: 1000 + Successors: + - ID: 1 + BrProb: 0x22222222 + - ID: 2 + BrProb: 0x33333333 + - ID: 3 + BrProb: 0xaaaaaaaa + - BBFreq: 133 + Successors: + - ID: 2 + BrProb: 0x11111111 + - ID: 3 + BrProb: 0xeeeeeeee + - BBFreq: 18 + Successors: + - ID: 3 + BrProb: 0xffffffff + - BBFreq: 1000 + Successors: [] +Symbols: + - Name: foo + Section: .text.foo + Value: 0x0 + +# ENTRY-FREQ-PROB: : +# ENTRY-FREQ-PROB: (Entry count: 1000, Frequency: 1000, Successors: BB1:22222222, BB2:33333333, BB3:aaaaaaaa): +# ENTRY-FREQ-PROB: (Frequency: 133, Successors: BB2:11111111, BB3:eeeeeeee): +# ENTRY-FREQ-PROB: (Frequency: 18, Successors: BB3:ffffffff): +# ENTRY-FREQ-PROB: (Frequency: 1000): diff --git a/llvm/tools/llvm-objdump/llvm-objdump.cpp b/llvm/tools/llvm-objdump/llvm-objdump.cpp index 8944965ccf80a..22b427f57658e 100644 --- a/llvm/tools/llvm-objdump/llvm-objdump.cpp +++ b/llvm/tools/llvm-objdump/llvm-objdump.cpp @@ -1263,10 +1263,57 @@ static SymbolInfoTy createDummySymbolInfo(const ObjectFile &Obj, return SymbolInfoTy(Addr, Name, Type); } -static void -collectBBAddrMapLabels(const std::unordered_map &AddrToBBAddrMap, - uint64_t SectionAddr, uint64_t Start, uint64_t End, - std::unordered_map> &Labels) { +struct BBAddrMapLabel { + std::string BlockLabel; + std::string PGOAnalysis; +}; + +static std::string constructPGOLabelString(const PGOAnalysisMap &PGOMap, + size_t BBEntryIndex) { + std::string PGOString; + raw_string_ostream PGOSS(PGOString); + + PGOSS << " ("; + if (PGOMap.FeatEnable.FuncEntryCount && BBEntryIndex == 0) { + PGOSS << "Entry count: " << Twine(PGOMap.FuncEntryCount); + if (PGOMap.FeatEnable.BBFreq || PGOMap.FeatEnable.BrProb) { + PGOSS << ", "; + } + } + + if (PGOMap.FeatEnable.BBFreq || PGOMap.FeatEnable.BrProb) { + assert(BBEntryIndex < PGOMap.BBEntries.size() && + "Expected PGOAnalysisMap and BBAddrMap to have the same entires"); + const PGOAnalysisMap::PGOBBEntry &PGOBBEntry = + PGOMap.BBEntries[BBEntryIndex]; + + if (PGOMap.FeatEnable.BBFreq) { + PGOSS << "Frequency: " << Twine(PGOBBEntry.BlockFreq.getFrequency()); + if (PGOMap.FeatEnable.BrProb && PGOBBEntry.Successors.size() > 0) { + PGOSS << ", "; + } + } + if (PGOMap.FeatEnable.BrProb && PGOBBEntry.Successors.size() > 0) { + PGOSS << "Successors: "; + interleaveComma( + PGOBBEntry.Successors, PGOSS, + [&PGOSS](const PGOAnalysisMap::PGOBBEntry::SuccessorEntry &SE) { + PGOSS << "BB" << SE.ID << ":"; + PGOSS.write_hex(SE.Prob.getNumerator()); + }); + } + } + PGOSS << ")"; + + return PGOString; +} + +static void collectBBAddrMapLabels( + const std::unordered_map &AddrToBBAddrMap, + const std::unordered_map &AddrToPGOAnalysisMap, + uint64_t SectionAddr, uint64_t Start, 
uint64_t End, + std::unordered_map> &Labels, + const StringRef FileName) { if (AddrToBBAddrMap.empty()) return; Labels.clear(); @@ -1275,11 +1322,21 @@ collectBBAddrMapLabels(const std::unordered_map &AddrToBBAd auto Iter = AddrToBBAddrMap.find(StartAddress); if (Iter == AddrToBBAddrMap.end()) return; - for (const BBAddrMap::BBEntry &BBEntry : Iter->second.getBBEntries()) { + auto PGOIter = AddrToPGOAnalysisMap.find(StartAddress); + + for (size_t I = 0; I < Iter->second.getBBEntries().size(); ++I) { + const BBAddrMap::BBEntry &BBEntry = Iter->second.getBBEntries()[I]; uint64_t BBAddress = BBEntry.Offset + Iter->second.getFunctionAddress(); if (BBAddress >= EndAddress) continue; - Labels[BBAddress].push_back(("BB" + Twine(BBEntry.ID)).str()); + + std::string LabelString = ("BB" + Twine(BBEntry.ID)).str(); + std::string PGOString; + + if (PGOIter != AddrToPGOAnalysisMap.end()) + PGOString = constructPGOLabelString(PGOIter->second, I); + + Labels[BBAddress].push_back({LabelString, PGOString}); } } @@ -1637,18 +1694,24 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, LLVM_DEBUG(LVP.dump()); std::unordered_map AddrToBBAddrMap; + std::unordered_map AddrToPGOAnalysisMap; auto ReadBBAddrMap = [&](std::optional SectionIndex = std::nullopt) { AddrToBBAddrMap.clear(); if (const auto *Elf = dyn_cast(&Obj)) { - auto BBAddrMapsOrErr = Elf->readBBAddrMap(SectionIndex); + std::vector PGOAnalyses; + auto BBAddrMapsOrErr = Elf->readBBAddrMap(SectionIndex, &PGOAnalyses); if (!BBAddrMapsOrErr) { reportWarning(toString(BBAddrMapsOrErr.takeError()), Obj.getFileName()); return; } - for (auto &FunctionBBAddrMap : *BBAddrMapsOrErr) - AddrToBBAddrMap.emplace(FunctionBBAddrMap.Addr, - std::move(FunctionBBAddrMap)); + for (const auto &[FunctionBBAddrMap, FunctionPGOAnalysis] : + zip_equal(*std::move(BBAddrMapsOrErr), std::move(PGOAnalyses))) { + uint64_t Addr = FunctionBBAddrMap.Addr; + AddrToBBAddrMap.emplace(Addr, std::move(FunctionBBAddrMap)); + if (FunctionPGOAnalysis.FeatEnable.anyEnabled()) + AddrToPGOAnalysisMap.emplace(Addr, std::move(FunctionPGOAnalysis)); + } } }; @@ -1977,14 +2040,15 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, FOS.SetUnbuffered(); std::unordered_map AllLabels; - std::unordered_map> BBAddrMapLabels; + std::unordered_map> BBAddrMapLabels; if (SymbolizeOperands) { collectLocalBranchTargets(Bytes, DT->InstrAnalysis.get(), DT->DisAsm.get(), DT->InstPrinter.get(), PrimaryTarget.SubtargetInfo.get(), SectionAddr, Index, End, AllLabels); - collectBBAddrMapLabels(AddrToBBAddrMap, SectionAddr, Index, End, - BBAddrMapLabels); + collectBBAddrMapLabels(AddrToBBAddrMap, AddrToPGOAnalysisMap, + SectionAddr, Index, End, BBAddrMapLabels, + FileName); } if (DT->InstrAnalysis) @@ -2082,8 +2146,9 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, // Print local label if there's any. 
auto Iter1 = BBAddrMapLabels.find(SectionAddr + Index); if (Iter1 != BBAddrMapLabels.end()) { - for (StringRef Label : Iter1->second) - FOS << "<" << Label << ">:\n"; + for (const auto &BBLabel : Iter1->second) + FOS << "<" << BBLabel.BlockLabel << ">" << BBLabel.PGOAnalysis + << ":\n"; } else { auto Iter2 = AllLabels.find(SectionAddr + Index); if (Iter2 != AllLabels.end()) @@ -2260,7 +2325,7 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, } else if (!Disp) { *TargetOS << TargetName; } else if (BBAddrMapLabelAvailable) { - *TargetOS << BBAddrMapLabels[Target].front(); + *TargetOS << BBAddrMapLabels[Target].front().BlockLabel; } else if (LabelAvailable) { *TargetOS << AllLabels[Target]; } else { @@ -2276,7 +2341,8 @@ disassembleObject(ObjectFile &Obj, const ObjectFile &DbgObj, } } else if (BBAddrMapLabelAvailable) { - *TargetOS << " <" << BBAddrMapLabels[Target].front() << ">"; + *TargetOS << " <" << BBAddrMapLabels[Target].front().BlockLabel + << ">"; } else if (LabelAvailable) { *TargetOS << " <" << AllLabels[Target] << ">"; } From 39e024d9e2ec00de00d6815596572579e4c03beb Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Fri, 19 Jan 2024 14:42:03 -0800 Subject: [PATCH 181/843] [lld][WebAssembly] Use the archive offset with --whole-archive (#78791) This essentially ports 0b1413a8 from the ELF linker. --- lld/test/wasm/lto/thin-archivecollision.ll | 34 ++++++++++++++++++++++ lld/wasm/Driver.cpp | 11 +++---- 2 files changed, 40 insertions(+), 5 deletions(-) create mode 100644 lld/test/wasm/lto/thin-archivecollision.ll diff --git a/lld/test/wasm/lto/thin-archivecollision.ll b/lld/test/wasm/lto/thin-archivecollision.ll new file mode 100644 index 0000000000000..5b3a423de4020 --- /dev/null +++ b/lld/test/wasm/lto/thin-archivecollision.ll @@ -0,0 +1,34 @@ +; Based on lld/test/ELF/lto/thin-archivecollision.ll + +; RUN: opt -module-summary %s -o %t.o +; RUN: mkdir -p %t1 %t2 +; RUN: opt -module-summary %p/Inputs/thin1.ll -o %t1/t.coll.o +; RUN: opt -module-summary %p/Inputs/thin2.ll -o %t2/t.coll.o + +; RUN: rm -f %t.a +; RUN: llvm-ar rcs %t.a %t1/t.coll.o %t2/t.coll.o +; RUN: wasm-ld %t.o %t.a -o %t +; RUN: obj2yaml %t | FileCheck %s + +; Check we handle this case correctly even in presence of --whole-archive. +; RUN: wasm-ld %t.o --whole-archive %t.a -o %t +; RUN: obj2yaml %t | FileCheck %s + +; CHECK: Name: _start +; CHECK: Name: foo +; CHECK: Name: blah + +target datalayout = "e-m:e-p:32:32-p10:8:8-p20:8:8-i64:64-n32:64-S128" +target triple = "wasm32-unknown-unknown" + +define i32 @_start() #0 { +entry: + %call = call i32 @foo(i32 23) + %call1 = call i32 @blah(i32 37) + ret i32 0 +} + +declare i32 @foo(i32) #1 +declare i32 @blah(i32) #1 + +attributes #0 = { noinline optnone } diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 716a2d9ebfe30..32c042b5695a4 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -233,19 +233,20 @@ static void readImportFile(StringRef filename) { // Returns slices of MB by parsing MB as an archive file. // Each slice consists of a member file in the archive. 
-std::vector static getArchiveMembers(MemoryBufferRef mb) { +std::vector> static getArchiveMembers( + MemoryBufferRef mb) { std::unique_ptr file = CHECK(Archive::create(mb), mb.getBufferIdentifier() + ": failed to parse archive"); - std::vector v; + std::vector> v; Error err = Error::success(); for (const Archive::Child &c : file->children(err)) { MemoryBufferRef mbref = CHECK(c.getMemoryBufferRef(), mb.getBufferIdentifier() + ": could not get the buffer for a child of the archive"); - v.push_back(mbref); + v.push_back(std::make_pair(mbref, c.getChildOffset())); } if (err) fatal(mb.getBufferIdentifier() + @@ -273,8 +274,8 @@ void LinkerDriver::addFile(StringRef path) { // Handle -whole-archive. if (inWholeArchive) { - for (MemoryBufferRef &m : getArchiveMembers(mbref)) { - auto *object = createObjectFile(m, path); + for (const auto &[m, offset] : getArchiveMembers(mbref)) { + auto *object = createObjectFile(m, path, offset); // Mark object as live; object members are normally not // live by default but -whole-archive is designed to treat // them as such. From 66cea7143afd401f9d8c70966d21a6d19c65da9d Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 19 Jan 2024 14:07:49 -0800 Subject: [PATCH 182/843] [RISCV] Add test case for #78783. NFC --- llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll index 0e4e04af4a3fe..f58b01c7395af 100644 --- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll +++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll @@ -91,3 +91,15 @@ for.body: ; preds = %for.body, %for.body %niter.ncmp.1 = icmp eq i64 %niter.next.1, %unroll_iter br i1 %niter.ncmp.1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body } + +; TODO: We should not change 4294967295 to -1 here. +define i64 @bug(i32 %x) { +; CHECK-LABEL: @bug( +; CHECK-NEXT: [[A:%.*]] = sext i32 [[X:%.*]] to i64 +; CHECK-NEXT: [[B:%.*]] = and i64 [[A]], -1 +; CHECK-NEXT: ret i64 [[B]] +; + %a = sext i32 %x to i64 + %b = and i64 %a, 4294967295 + ret i64 %b +} From 9396891271fd85b4f8922b16dd71e9433dc5fcb3 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Fri, 19 Jan 2024 14:21:09 -0800 Subject: [PATCH 183/843] [RISCV] Don't look for sext in RISCVCodeGenPrepare::visitAnd. We want to know the upper 33 bits of the And Input are zero. SExt only guarantees they are the same. We originally checked for SExt or ZExt when we were using isImpliedByDomCondition because a ZExt may have been changed to SExt before we visited the And. We are no longer using isImpliedByDomCondition so we can only look for zext with the nneg flag. While here, switch to PatternMatch to simplify the code. 
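For instance, with a plain sext the fold is unsound (an illustrative IR
sketch mirroring the regression test updated below):

  define i64 @ex(i32 %x) {
    %a = sext i32 %x to i64
    %b = and i64 %a, 4294967295
    ret i64 %b
  }

For %x = -1, %a is all ones and %b is 4294967295; rewriting the mask to
the sign-extended constant -1 would instead make %b equal to %a. With a
zext nneg operand the upper 33 bits of %a are known zero, so the rewrite
is safe.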
Fixes #78783 --- llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp | 18 ++++++------------ .../test/CodeGen/RISCV/riscv-codegenprepare.ll | 4 ++-- 2 files changed, 8 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp index 6434532afd409..53fcc527e615d 100644 --- a/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp +++ b/llvm/lib/Target/RISCV/RISCVCodeGenPrepare.cpp @@ -21,6 +21,7 @@ #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstVisitor.h" #include "llvm/IR/Intrinsics.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" @@ -67,20 +68,13 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) { if (!BO.getType()->isIntegerTy(64)) return false; - auto canBeSignExtend = [](Instruction *I) { - if (isa(I)) - return true; - if (isa(I)) - return I->hasNonNeg(); - return false; - }; + using namespace PatternMatch; - // Left hand side should be a sext or zext nneg. - Instruction *LHS = dyn_cast(BO.getOperand(0)); - if (!LHS || !canBeSignExtend(LHS)) + // Left hand side should be a zext nneg. + Value *LHSSrc; + if (!match(BO.getOperand(0), m_NNegZExt(m_Value(LHSSrc)))) return false; - Value *LHSSrc = LHS->getOperand(0); if (!LHSSrc->getType()->isIntegerTy(32)) return false; @@ -100,7 +94,7 @@ bool RISCVCodeGenPrepare::visitAnd(BinaryOperator &BO) { // Sign extend the constant and replace the And operand. C = SignExtend64<32>(C); - BO.setOperand(1, ConstantInt::get(LHS->getType(), C)); + BO.setOperand(1, ConstantInt::get(RHS->getType(), C)); return true; } diff --git a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll index f58b01c7395af..2179a0d26cf98 100644 --- a/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll +++ b/llvm/test/CodeGen/RISCV/riscv-codegenprepare.ll @@ -92,11 +92,11 @@ for.body: ; preds = %for.body, %for.body br i1 %niter.ncmp.1, label %for.cond.cleanup.loopexit.unr-lcssa, label %for.body } -; TODO: We should not change 4294967295 to -1 here. +; Make sure we do not change 4294967295 to -1 here. define i64 @bug(i32 %x) { ; CHECK-LABEL: @bug( ; CHECK-NEXT: [[A:%.*]] = sext i32 [[X:%.*]] to i64 -; CHECK-NEXT: [[B:%.*]] = and i64 [[A]], -1 +; CHECK-NEXT: [[B:%.*]] = and i64 [[A]], 4294967295 ; CHECK-NEXT: ret i64 [[B]] ; %a = sext i32 %x to i64 From 593395f0da67e9c5e2e6de6fe364c313458a642a Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Fri, 19 Jan 2024 15:10:57 -0800 Subject: [PATCH 184/843] [dsymutil] Fix spurious warnings in MachODebugMapParser (#78794) When the MachODebugMapParser encounters an object file that cannot be found on disk, it currently leaves the parser in an incoherent state, resulting in spurious warnings that can in turn slow down dsymutil. This fixes #78411. 
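The shape of the fix, reduced to a toy (hypothetical helper names, not
dsymutil's actual API): per-object parser state must be flushed before
the early return taken when the object cannot be opened.

  #include <cstdio>
  static bool openObject(const char *Path) { return false; } // simulate a miss
  static void flushPreviousObjectState() { std::puts("flush previous object"); }
  static void switchToNewObject(const char *Path) {
    flushPreviousObjectState(); // moved ahead of the early return
    if (!openObject(Path)) {
      std::printf("warning: %s unable to open object file\n", Path);
      return; // previously the stale state survived this return
    }
    // ... begin collecting symbols for Path ...
  }
  int main() { switchToNewObject("/tmp/missing/bar.o"); return 0; }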
rdar://117515153 --- .../dsymutil/ARM/missing-object-warning.test | 5 +++++ .../dsymutil/Inputs/private/tmp/missing/foo.o | Bin 0 -> 1792 bytes .../Inputs/private/tmp/missing/foobar.out | Bin 0 -> 17336 bytes llvm/tools/dsymutil/MachODebugMapParser.cpp | 5 ++--- 4 files changed, 7 insertions(+), 3 deletions(-) create mode 100644 llvm/test/tools/dsymutil/ARM/missing-object-warning.test create mode 100644 llvm/test/tools/dsymutil/Inputs/private/tmp/missing/foo.o create mode 100755 llvm/test/tools/dsymutil/Inputs/private/tmp/missing/foobar.out diff --git a/llvm/test/tools/dsymutil/ARM/missing-object-warning.test b/llvm/test/tools/dsymutil/ARM/missing-object-warning.test new file mode 100644 index 0000000000000..62b3ecb31888b --- /dev/null +++ b/llvm/test/tools/dsymutil/ARM/missing-object-warning.test @@ -0,0 +1,5 @@ +RUN: dsymutil -oso-prepend-path %p/../Inputs --dump-debug-map %p/../Inputs/private/tmp/missing/foobar.out 2>&1 | FileCheck %s + +CHECK: bar.o unable to open object file +CHECK-NOT: could not find object file symbol for symbol _bar +CHECK-NOT: could not find object file symbol for symbol _main diff --git a/llvm/test/tools/dsymutil/Inputs/private/tmp/missing/foo.o b/llvm/test/tools/dsymutil/Inputs/private/tmp/missing/foo.o new file mode 100644 index 0000000000000000000000000000000000000000..333b39c45b145218f7a902acba130d0e084f087d GIT binary patch literal 1792 zcmb7E%}>-&5Pzk+xUgD5OoSMcJ)qG<+U~NzF8Bd)QH&4>OCq{t3!Oy=$V*w9^F}5u9W`-^}k#-@LvL-@p7k%^1gkP^5tO zc@Q~Zpg!Og^pBwDZP7vo3xQ()6xhg}7z9$3z<3-)cH}QkEiE4Sv6UeM@d!GKq}XPh zgIZB)hPGB$R88}YCi;~JlM8oRDKX9sigZzbt%L-8jAuH_F6Jchrl_z=dY16E+gm_? zv>W63ffrUJ@wN$PkMzr9`%@bY%TOHEHhic_;%VUo8M{t+t7$S0RHDBK&(&6SOiALs zAiM#p-#H;(*k41{lX$xcyuB0R1?!E!e(xv3yG#9jJr+-QZRjgd)|^MCQ&W_g3YCUR zrqjt$ump3(YyS!~ZIw8`=>A!zV_-@0d*>;8obZ+a2kG?@cm-JN1&;O-;70@oLKuv{ zrGEB7qn`zy2S&miWUT$iuL`S2Q;uyeyyKORTsL<~=t=X!(ba)j`KIgS@+DCg<+>TTURl#bnCzHkHEjvSV8e4) z4LuMst29z94;4!Ch^!Tj(PC*>F4Xe1(r|gXte5q?RvXd_ny%|t*|O`3I+JEh%~QQ~ zsbVw@%WW8*blbIU)v3+EoEKf!@}+9!zAw$H`doD>Ulfa?UwgBgn^#ArgCo-tzCPMTYawj;d&kSR1ec;UZL;L^+It~HYv|& z-J5^H$!D~a&lV6_xW8dj?@=>}eyE;_`(~b3+osL;r8aKz&zdxlVLdT+ELL^NAjom8 zQf1x+Ijo0SWBm2-c^}+E1N2$=T(hAC&#*Pnb|~cUPX<2adMNkN-q!Q#?saP(e0{}hA3eYQ{X@4SW}T8Fl56LhTAHvgrwcDZZicdcIjPiQ*jtQG zUX&Z5%nyTGLl8WZB0@k22mv7=1cZPP5CTF#2nYcoAOwVf5D)@FKnMr{As_^VfDjM@ zLO=)z0U;m+gn$qb0zyCt2mv7=1cZPP5CTF#2nYcoAOwVf5D)@FKnMr{As_^VfDjM@ zLO=-oZwOebymdULzfKxkIiV)z`lCnY_Njx%Ppljp-nVk>#~O8Urc!xjvQ8bNordI`PpqAx2oxECJEq7{gG264fH8X*Q*}v`gm@|)l_>g}m{48dT{61ncrq{QiIT zrLPM3%&&rai^0<0ydWQ3{{iHK>vu2k9r#@T%?0Z}f_q@?YoT1<_Zj=E^@DltpZ)KA z*D~{c-=BXM;uQMae=r~1Kd+@O!VdTO{nX)oe?9V{@AI;Z@7J^NCxpu{QS%ZtFHw7! zm~eBu5;><-a`KNQ6yi)Y(I^P=q_VXn#4!k=)x~{)%^xWvs8}FnGA5Uk;V dlYQT9IQw2CJCu6i_%n^Aqbr6!yn5z0^(R~|BuD@N literal 0 HcmV?d00001 diff --git a/llvm/tools/dsymutil/MachODebugMapParser.cpp b/llvm/tools/dsymutil/MachODebugMapParser.cpp index 524a6795c360e..6a9f25681cdd1 100644 --- a/llvm/tools/dsymutil/MachODebugMapParser.cpp +++ b/llvm/tools/dsymutil/MachODebugMapParser.cpp @@ -186,6 +186,8 @@ void MachODebugMapParser::addCommonSymbols() { /// everything up to add symbols to the new one. 
void MachODebugMapParser::switchToNewDebugMapObject( StringRef Filename, sys::TimePoint Timestamp) { + addCommonSymbols(); + resetParserState(); SmallString<80> Path(PathPrefix); sys::path::append(Path, Filename); @@ -206,9 +208,6 @@ void MachODebugMapParser::switchToNewDebugMapObject( return; } - addCommonSymbols(); - resetParserState(); - CurrentDebugMapObject = &Result->addDebugMapObject(Path, Timestamp, MachO::N_OSO); From 30aa9fb4c1da33892a38f952fbdf6e7e45e5953a Mon Sep 17 00:00:00 2001 From: spupyrev Date: Fri, 19 Jan 2024 15:01:43 -0800 Subject: [PATCH 185/843] Revert "[InstrProf] Adding utility weights to BalancedPartitioning (#72717)" This reverts commit 5954b9dca21bb0c69b9e991b2ddb84c8b05ecba3 due to broken Windows build --- .../llvm/Support/BalancedPartitioning.h | 26 +------ llvm/lib/ProfileData/InstrProf.cpp | 9 ++- llvm/lib/Support/BalancedPartitioning.cpp | 57 ++++++--------- .../ProfileData/BPFunctionNodeTest.cpp | 30 +++----- .../Support/BalancedPartitioningTest.cpp | 73 ++----------------- 5 files changed, 49 insertions(+), 146 deletions(-) diff --git a/llvm/include/llvm/Support/BalancedPartitioning.h b/llvm/include/llvm/Support/BalancedPartitioning.h index 19eb6d41e45e3..a8464ac0fe60e 100644 --- a/llvm/include/llvm/Support/BalancedPartitioning.h +++ b/llvm/include/llvm/Support/BalancedPartitioning.h @@ -57,27 +57,8 @@ class BPFunctionNode { friend class BalancedPartitioning; public: - /// The type of ID using IDT = uint64_t; - /// The type of UtilityNode - struct UtilityNodeT { - UtilityNodeT(uint32_t Id, uint32_t Weight = 1) : Id(Id), Weight(Weight) {} - uint32_t Id; - uint32_t Weight; - - // Deduplicate utility nodes for a given function. - // TODO: One may experiment with accumulating the weights of duplicates. - static void sortAndDeduplicate(SmallVector &UNs) { - llvm::sort(UNs, [](const UtilityNodeT &L, const UtilityNodeT &R) { - return L.Id < R.Id; - }); - UNs.erase(std::unique(UNs.begin(), UNs.end(), - [](const UtilityNodeT &L, const UtilityNodeT &R) { - return L.Id == R.Id; - }), - UNs.end()); - } - }; + using UtilityNodeT = uint32_t; /// \param UtilityNodes the set of utility nodes (must be unique'd) BPFunctionNode(IDT Id, ArrayRef UtilityNodes) @@ -97,7 +78,8 @@ class BPFunctionNode { uint64_t InputOrderIndex = 0; friend class BPFunctionNodeTest_Basic_Test; - friend class BalancedPartitioningTest; + friend class BalancedPartitioningTest_Basic_Test; + friend class BalancedPartitioningTest_Large_Test; }; /// Algorithm parameters; default values are tuned on real-world binaries @@ -206,8 +188,6 @@ class BalancedPartitioning { float CachedGainRL; /// Whether \p CachedGainLR and \p CachedGainRL are valid bool CachedGainIsValid = false; - /// The weight of this utility node - uint32_t Weight = 1; }; protected: diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp index ba4fb227381f6..2640027455e0d 100644 --- a/llvm/lib/ProfileData/InstrProf.cpp +++ b/llvm/lib/ProfileData/InstrProf.cpp @@ -923,15 +923,15 @@ std::vector TemporalProfTraceTy::createBPFunctionNodes( const int N = Log2_64(LargestTraceSize) + 1; - // TODO: We may use the Trace.Weight field to give more weight to more + // TODO: We need to use the Trace.Weight field to give more weight to more // important utilities DenseMap> FuncGroups; - for (uint32_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) { + for (size_t TraceIdx = 0; TraceIdx < Traces.size(); TraceIdx++) { auto &Trace = Traces[TraceIdx].FunctionNameRefs; for (size_t Timestamp = 0; Timestamp < Trace.size(); 
Timestamp++) { for (int I = Log2_64(Timestamp + 1); I < N; I++) { auto FunctionId = Trace[Timestamp]; - UtilityNodeT GroupId(TraceIdx * N + I); + UtilityNodeT GroupId = TraceIdx * N + I; FuncGroups[FunctionId].push_back(GroupId); } } @@ -940,7 +940,8 @@ std::vector TemporalProfTraceTy::createBPFunctionNodes( std::vector Nodes; for (auto Id : FunctionIds) { auto &UNs = FuncGroups[Id]; - UtilityNodeT::sortAndDeduplicate(UNs); + llvm::sort(UNs); + UNs.erase(std::unique(UNs.begin(), UNs.end()), UNs.end()); Nodes.emplace_back(Id, UNs); } return Nodes; diff --git a/llvm/lib/Support/BalancedPartitioning.cpp b/llvm/lib/Support/BalancedPartitioning.cpp index cd71ddec47647..cb6ba61179941 100644 --- a/llvm/lib/Support/BalancedPartitioning.cpp +++ b/llvm/lib/Support/BalancedPartitioning.cpp @@ -21,13 +21,8 @@ using namespace llvm; #define DEBUG_TYPE "balanced-partitioning" void BPFunctionNode::dump(raw_ostream &OS) const { - OS << "{ID=" << Id << " Utilities={"; - for (auto &N : UtilityNodes) - OS << N.Id << " ,"; - OS << "}"; - if (Bucket.has_value()) - OS << " Bucket=" << Bucket.value(); - OS << "}"; + OS << formatv("{{ID={0} Utilities={{{1:$[,]}} Bucket={2}}", Id, + make_range(UtilityNodes.begin(), UtilityNodes.end()), Bucket); } template @@ -171,40 +166,34 @@ void BalancedPartitioning::runIterations(const FunctionNodeRange Nodes, unsigned RecDepth, unsigned LeftBucket, unsigned RightBucket, std::mt19937 &RNG) const { - // Count the degree of each utility node. - DenseMap UtilityNodeIndex; + unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end()); + DenseMap UtilityNodeIndex; for (auto &N : Nodes) for (auto &UN : N.UtilityNodes) - ++UtilityNodeIndex[UN.Id]; + ++UtilityNodeIndex[UN]; // Remove utility nodes if they have just one edge or are connected to all - // functions. - unsigned NumNodes = std::distance(Nodes.begin(), Nodes.end()); + // functions for (auto &N : Nodes) llvm::erase_if(N.UtilityNodes, [&](auto &UN) { - return UtilityNodeIndex[UN.Id] == 1 || - UtilityNodeIndex[UN.Id] == NumNodes; + return UtilityNodeIndex[UN] == 1 || UtilityNodeIndex[UN] == NumNodes; }); - // Renumber utility nodes so they can be used to index into Signatures. + // Renumber utility nodes so they can be used to index into Signatures UtilityNodeIndex.clear(); for (auto &N : Nodes) for (auto &UN : N.UtilityNodes) - UN.Id = UtilityNodeIndex.insert({UN.Id, UtilityNodeIndex.size()}) - .first->second; + UN = UtilityNodeIndex.insert({UN, UtilityNodeIndex.size()}).first->second; - // Initialize signatures. + // Initialize signatures SignaturesT Signatures(/*Size=*/UtilityNodeIndex.size()); for (auto &N : Nodes) { for (auto &UN : N.UtilityNodes) { - assert(UN.Id < Signatures.size()); - if (N.Bucket == LeftBucket) - Signatures[UN.Id].LeftCount++; - else - Signatures[UN.Id].RightCount++; - // Identical utility nodes (having the same UN.Id) have the same weight - // (unless there are hash collisions mapping utilities to the same Id); - // thus, we get a new weight only when the signature is uninitialized. 
- Signatures[UN.Id].Weight = UN.Weight; + assert(UN < Signatures.size()); + if (N.Bucket == LeftBucket) { + Signatures[UN].LeftCount++; + } else { + Signatures[UN].RightCount++; + } } } @@ -232,11 +221,9 @@ unsigned BalancedPartitioning::runIteration(const FunctionNodeRange Nodes, Signature.CachedGainLR = 0.f; Signature.CachedGainRL = 0.f; if (L > 0) - Signature.CachedGainLR = - (Cost - logCost(L - 1, R + 1)) * Signature.Weight; + Signature.CachedGainLR = Cost - logCost(L - 1, R + 1); if (R > 0) - Signature.CachedGainRL = - (Cost - logCost(L + 1, R - 1)) * Signature.Weight; + Signature.CachedGainRL = Cost - logCost(L + 1, R - 1); Signature.CachedGainIsValid = true; } @@ -295,14 +282,14 @@ bool BalancedPartitioning::moveFunctionNode(BPFunctionNode &N, // Update signatures and invalidate gain cache if (FromLeftToRight) { for (auto &UN : N.UtilityNodes) { - auto &Signature = Signatures[UN.Id]; + auto &Signature = Signatures[UN]; Signature.LeftCount--; Signature.RightCount++; Signature.CachedGainIsValid = false; } } else { for (auto &UN : N.UtilityNodes) { - auto &Signature = Signatures[UN.Id]; + auto &Signature = Signatures[UN]; Signature.LeftCount++; Signature.RightCount--; Signature.CachedGainIsValid = false; @@ -331,8 +318,8 @@ float BalancedPartitioning::moveGain(const BPFunctionNode &N, const SignaturesT &Signatures) { float Gain = 0.f; for (auto &UN : N.UtilityNodes) - Gain += (FromLeftToRight ? Signatures[UN.Id].CachedGainLR - : Signatures[UN.Id].CachedGainRL); + Gain += (FromLeftToRight ? Signatures[UN].CachedGainLR + : Signatures[UN].CachedGainRL); return Gain; } diff --git a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp index 787e3041fb5e7..6af6f1bcdc40a 100644 --- a/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp +++ b/llvm/unittests/ProfileData/BPFunctionNodeTest.cpp @@ -13,8 +13,8 @@ #include "gtest/gtest.h" using testing::Field; -using testing::Matcher; using testing::UnorderedElementsAre; +using testing::UnorderedElementsAreArray; namespace llvm { @@ -24,35 +24,29 @@ void PrintTo(const BPFunctionNode &Node, std::ostream *OS) { } TEST(BPFunctionNodeTest, Basic) { - auto UNIdsAre = [](auto... 
Ids) { - return UnorderedElementsAre(Field("Id", &BPFunctionNode::UtilityNodeT::Id, - std::forward(Ids))...); - }; auto NodeIs = [](BPFunctionNode::IDT Id, - Matcher> UNsMatcher) { - return AllOf( - Field("Id", &BPFunctionNode::Id, Id), - Field("UtilityNodes", &BPFunctionNode::UtilityNodes, UNsMatcher)); + ArrayRef UNs) { + return AllOf(Field("Id", &BPFunctionNode::Id, Id), + Field("UtilityNodes", &BPFunctionNode::UtilityNodes, + UnorderedElementsAreArray(UNs))); }; auto Nodes = TemporalProfTraceTy::createBPFunctionNodes({ TemporalProfTraceTy({0, 1, 2, 3}), }); - EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, UNIdsAre(0, 1, 2)), - NodeIs(1, UNIdsAre(1, 2)), - NodeIs(2, UNIdsAre(1, 2)), - NodeIs(3, UNIdsAre(2)))); + EXPECT_THAT(Nodes, + UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}), + NodeIs(2, {1, 2}), NodeIs(3, {2}))); Nodes = TemporalProfTraceTy::createBPFunctionNodes({ TemporalProfTraceTy({0, 1, 2, 3, 4}), TemporalProfTraceTy({4, 2}), }); - EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, UNIdsAre(0, 1, 2)), - NodeIs(1, UNIdsAre(1, 2)), - NodeIs(2, UNIdsAre(1, 2, 4, 5)), - NodeIs(3, UNIdsAre(2)), - NodeIs(4, UNIdsAre(2, 3, 4, 5)))); + EXPECT_THAT(Nodes, + UnorderedElementsAre(NodeIs(0, {0, 1, 2}), NodeIs(1, {1, 2}), + NodeIs(2, {1, 2, 4, 5}), NodeIs(3, {2}), + NodeIs(4, {2, 3, 4, 5}))); } } // end namespace llvm diff --git a/llvm/unittests/Support/BalancedPartitioningTest.cpp b/llvm/unittests/Support/BalancedPartitioningTest.cpp index 79e89ae201480..ebe518a8e89ca 100644 --- a/llvm/unittests/Support/BalancedPartitioningTest.cpp +++ b/llvm/unittests/Support/BalancedPartitioningTest.cpp @@ -37,20 +37,6 @@ class BalancedPartitioningTest : public ::testing::Test { Ids.push_back(N.Id); return Ids; } - - static testing::Matcher NodeIdIs(BPFunctionNode::IDT Id) { - return Field("Id", &BPFunctionNode::Id, Id); - }; - - static testing::Matcher - NodeBucketIs(std::optional Bucket) { - return Field("Bucket", &BPFunctionNode::Bucket, Bucket); - }; - - static testing::Matcher - NodeIs(BPFunctionNode::IDT Id, std::optional Bucket) { - return AllOf(NodeIdIs(Id), NodeBucketIs(Bucket)); - }; }; TEST_F(BalancedPartitioningTest, Basic) { @@ -62,6 +48,11 @@ TEST_F(BalancedPartitioningTest, Basic) { Bp.run(Nodes); + auto NodeIs = [](BPFunctionNode::IDT Id, std::optional Bucket) { + return AllOf(Field("Id", &BPFunctionNode::Id, Id), + Field("Bucket", &BPFunctionNode::Bucket, Bucket)); + }; + EXPECT_THAT(Nodes, UnorderedElementsAre(NodeIs(0, 0), NodeIs(1, 1), NodeIs(2, 2), NodeIs(3, 3), NodeIs(4, 4))); @@ -88,7 +79,8 @@ TEST_F(BalancedPartitioningTest, Large) { Bp.run(Nodes); - EXPECT_THAT(Nodes, Each(Not(NodeBucketIs(std::nullopt)))); + EXPECT_THAT( + Nodes, Each(Not(Field("Bucket", &BPFunctionNode::Bucket, std::nullopt)))); EXPECT_THAT(getIds(Nodes), UnorderedElementsAreArray(OrigIds)); } @@ -105,55 +97,4 @@ TEST_F(BalancedPartitioningTest, MoveGain) { 30.f); } -TEST_F(BalancedPartitioningTest, Weight1) { - std::vector UNs = { - BPFunctionNode::UtilityNodeT(0, 100), - BPFunctionNode::UtilityNodeT(1, 100), - BPFunctionNode::UtilityNodeT(2, 100), - BPFunctionNode::UtilityNodeT(3, 1), - BPFunctionNode::UtilityNodeT(4, 1), - }; - std::vector Nodes = { - BPFunctionNode(0, {UNs[0], UNs[3]}), BPFunctionNode(1, {UNs[1], UNs[3]}), - BPFunctionNode(2, {UNs[2], UNs[3]}), BPFunctionNode(3, {UNs[0], UNs[4]}), - BPFunctionNode(4, {UNs[1], UNs[4]}), BPFunctionNode(5, {UNs[2], UNs[4]}), - }; - - Bp.run(Nodes); - - // Check that nodes that share important UNs are ordered together - auto NodesRef = 
ArrayRef(Nodes);
-  auto Groups = {NodesRef.slice(0, 2), NodesRef.slice(2, 2),
-                 NodesRef.slice(4, 2)};
-  EXPECT_THAT(Groups, UnorderedElementsAre(
-                          UnorderedElementsAre(NodeIdIs(0), NodeIdIs(3)),
-                          UnorderedElementsAre(NodeIdIs(1), NodeIdIs(4)),
-                          UnorderedElementsAre(NodeIdIs(2), NodeIdIs(5))));
-}
-
-TEST_F(BalancedPartitioningTest, Weight2) {
-  std::vector<BPFunctionNode::UtilityNodeT> UNs = {
-      BPFunctionNode::UtilityNodeT(0, 1),
-      BPFunctionNode::UtilityNodeT(1, 1),
-      BPFunctionNode::UtilityNodeT(2, 1),
-      BPFunctionNode::UtilityNodeT(3, 100),
-      BPFunctionNode::UtilityNodeT(4, 100),
-  };
-  std::vector<BPFunctionNode> Nodes = {
-      BPFunctionNode(0, {UNs[0], UNs[3]}), BPFunctionNode(1, {UNs[1], UNs[4]}),
-      BPFunctionNode(2, {UNs[2], UNs[3]}), BPFunctionNode(3, {UNs[0], UNs[4]}),
-      BPFunctionNode(4, {UNs[1], UNs[3]}), BPFunctionNode(5, {UNs[2], UNs[4]}),
-  };
-
-  Bp.run(Nodes);
-
-  // Check that nodes that share important UNs are ordered together
-  auto NodesRef = ArrayRef(Nodes);
-  auto Groups = {NodesRef.slice(0, 3), NodesRef.slice(3, 3)};
-  EXPECT_THAT(Groups,
-              UnorderedElementsAre(
-                  UnorderedElementsAre(NodeIdIs(0), NodeIdIs(2), NodeIdIs(4)),
-                  UnorderedElementsAre(NodeIdIs(1), NodeIdIs(3), NodeIdIs(5))));
-}
-
 } // end namespace llvm

From b7360fbe8ca0c9411e89fafd654856c484f84f5e Mon Sep 17 00:00:00 2001
From: erman-gurses <99776114+erman-gurses@users.noreply.github.com>
Date: Fri, 19 Jan 2024 18:44:45 -0500
Subject: [PATCH 186/843] [mlir][amdgpu] Shared memory access optimization pass (#75627)

It implements a transformation to optimize accesses to shared memory.

Reference: https://reviews.llvm.org/D127457

_This change adds a transformation and pass to the NvGPU dialect that
attempts to optimize reads/writes from a memref representing GPU shared
memory in order to avoid bank conflicts. Given a value representing a
shared memory memref, it traverses all reads/writes within the parent op
and, subject to suitable conditions, rewrites all last dimension index
values such that element locations in the final (col) dimension are given
by `newColIdx = col % vecSize + perm[row](col / vecSize, row)` where
`perm` is a permutation function indexed by `row` and `vecSize` is the
vector access size in elements (currently assumes 128bit vectorized
accesses, but this can be made a parameter).
This specific transformation can help optimize typical distributed & vectorized accesses common to loading matrix multiplication operands to/from shared memory._ --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 17 ++ .../mlir/Dialect/AMDGPU/Transforms/Passes.h | 3 +- .../mlir/Dialect/AMDGPU/Transforms/Passes.td | 13 + .../Dialect/AMDGPU/Transforms/Transforms.h | 54 ++++ .../mlir/Dialect/AMDGPU/Transforms/Utils.h | 24 ++ mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 15 ++ .../Dialect/AMDGPU/Transforms/CMakeLists.txt | 2 + .../Transforms/OptimizeSharedMemory.cpp | 243 ++++++++++++++++++ mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp | 39 +++ .../AMDGPU/optimize_shmem_reads_writes.mlir | 57 ++++ 10 files changed, 466 insertions(+), 1 deletion(-) create mode 100644 mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h create mode 100644 mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h create mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp create mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp create mode 100644 mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index ffb302fcedd73..b4bf1b5191232 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -29,6 +29,23 @@ def AMDGPU_Dialect : Dialect { "gpu::GPUDialect" ]; let useDefaultAttributePrinterParser = 1; + + let extraClassDeclaration = [{ + /// Return true if the given MemRefType has an integer address + /// space that matches the ROCDL shared memory address space or + /// is a gpu::AddressSpaceAttr attribute with value 'workgroup`. + static bool hasSharedMemoryAddressSpace(MemRefType type); + + /// Return true if the given Attribute has an integer address + /// space that matches the ROCDL shared memory address space or + /// is a gpu::AddressSpaceAttr attribute with value 'workgroup`. + static bool isSharedMemoryAddressSpace(Attribute type); + + /// Defines the MemRef memory space attribute numeric value that indicates + /// a memref is located in shared memory. This should correspond to the + /// value used in ROCDL. 
+ static constexpr unsigned kSharedMemoryAddressSpace = 3; + }]; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h index 8dd5ff1a4b198..11d182ba5823e 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h @@ -20,7 +20,8 @@ namespace mlir { class ConversionTarget; namespace amdgpu { -#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS +#define GEN_PASS_DECL + #define GEN_PASS_REGISTRATION #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc" diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td index e6b27aa842dfc..c8059e6d316e8 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td @@ -30,4 +30,17 @@ def AmdgpuEmulateAtomicsPass : Pass<"amdgpu-emulate-atomics"> { "Chipset that these operations will run on">]; } +def OptimizeSharedMemory : Pass<"amdgpu-optimize-shared-memory"> { + let summary = "Optimizes accesses to shared memory memrefs in order to reduce bank conflicts."; + let description = [{ + This pass adds a transformation and pass to the AMDGPU dialect that + attempts to optimize reads/writes from a memref representing GPU shared + memory in order to avoid bank conflicts. + }]; + + let dependentDialects = [ + "memref::MemRefDialect", "vector::VectorDialect" + ]; +} + #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_ diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h new file mode 100644 index 0000000000000..140bc12deed69 --- /dev/null +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h @@ -0,0 +1,54 @@ +//===- Transforms.h - AMDGPU Dialect transformations --------------*- +// C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file declares functions that assist transformations for the amdgpu +// dialect. +// +//===----------------------------------------------------------------------===// +#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_ +#define MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_ + +#include "mlir/IR/Operation.h" +#include "mlir/Support/LogicalResult.h" + +namespace mlir { +class RewriterBase; + +namespace amdgpu { + +/// +/// Passes +/// + +/// Optimizes vectorized accesses to a shared memory buffer specified by +/// memrefValue. This transformation assumes the following: +/// 1) All relevant accesses to `memrefValue` are contained with `parentOp`. +/// 2) The function will fail precondition checks if any subviews are +/// taken of `memrefValue`. All reads/writes to `memrefValue` should occur +/// through `memrefValue` directly. +/// +/// Shared memory bank conflicts occur when multiple threads attempt to read or +/// write locations assigned to the same shared memory bank. For `2^N` byte +/// vectorized accesses, we need to be concerned with conflicts among threads +/// identified as `(tid) -> tid.floordiv(2^{7-N})`. 
As such, this transformation +/// changes any indexed memory access (vector.load, memref.load, etc) +/// such that the final dimension's index value is permuted such that +/// `newColIndex = oldColIndex % vectorSize + +/// perm[rowIndex](oldColIndex/vectorSize, rowIndex)` where `rowIndex` is the +/// index for the second-to last dimension and `perm[rowIndex]` is a permutation +/// function that depends on the row Index. The permutation function is chosen +/// to ensure that sequential distributed+vectorized reads/writes down a single +/// dimension of the memref have minimal conflicts. +mlir::LogicalResult optimizeSharedMemoryReadsAndWrites(Operation *parentOp, + Value memrefValue); + +} // namespace amdgpu +} // namespace mlir + +#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_ diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h new file mode 100644 index 0000000000000..6be57ca54b15f --- /dev/null +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h @@ -0,0 +1,24 @@ +//===- Utils.h - Transform utilities -----------------------------*- C++-*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/IR/Operation.h" + +namespace mlir { +namespace amdgpu { + +/// Get and set the indices that the given load/store operation is operating on. +/// Preconditions: +/// - The Op must have memory affects +/// - Considers memref::LoadOp, vector::LoadOp, vector::TransferReadOp +/// - Considers memref::StoreOp, vector::StoreOp, vector::TransferWriteOp +/// - Excludes subview op +std::optional getIndices(Operation *op); +void setIndices(Operation *op, ArrayRef indices); + +} // namespace amdgpu +} // namespace mlir diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 2575ad4984814..4e72fbf56b80a 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -43,6 +43,21 @@ void AMDGPUDialect::initialize() { >(); } +bool amdgpu::AMDGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) { + if (!memorySpace) + return false; + if (auto intAttr = llvm::dyn_cast(memorySpace)) + return intAttr.getInt() == AMDGPUDialect::kSharedMemoryAddressSpace; + if (auto gpuAttr = llvm::dyn_cast(memorySpace)) + return gpuAttr.getValue() == gpu::AddressSpace::Workgroup; + return false; +} + +bool amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) { + Attribute memorySpace = type.getMemorySpace(); + return isSharedMemoryAddressSpace(memorySpace); +} + //===----------------------------------------------------------------------===// // 8-bit float ops //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt index e11b6cc88bf22..a1a91270bc55c 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt @@ -1,5 +1,7 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms EmulateAtomics.cpp + OptimizeSharedMemory.cpp + Utils.cpp ADDITIONAL_HEADER_DIRS {$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp 
new file mode 100644
index 0000000000000..c7001fc6d57d5
--- /dev/null
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp
@@ -0,0 +1,243 @@
+//===- OptimizeSharedMemory.cpp - MLIR AMDGPU pass implementation ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements transforms to optimize accesses to shared memory.
+// It is inspired by
+// https://github.com/llvm/llvm-project/blob/main/mlir/lib/Dialect/NVGPU/Transforms/OptimizeSharedMemory.cpp
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h"
+#include "mlir/Dialect/AMDGPU/Transforms/Utils.h"
+#include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Func/IR/FuncOps.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/Interfaces/SideEffectInterfaces.h"
+#include "mlir/Support/LogicalResult.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/MathExtras.h"
+
+namespace mlir {
+namespace amdgpu {
+#define GEN_PASS_DEF_OPTIMIZESHAREDMEMORY
+#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc"
+} // namespace amdgpu
+} // namespace mlir
+
+using namespace mlir;
+using namespace mlir::amdgpu;
+
+/// The size of a shared memory line according to AMD documentation.
+/// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/instinct-mi200-cdna2-instruction-set-architecture.pdf
+constexpr int64_t kSharedMemoryLineSizeBytes = 64;
+/// We optimize for 64-bit accesses, but this can be made an argument in the
+/// future.
+constexpr int64_t kDefaultVectorSizeBits = 64;
+
+/// Uses `srcIndexValue` to permute `tgtIndexValue` via
+/// `result = xor(floordiv(srcIdxVal, permuteEveryN),
+///               floordiv(tgtIdxVal, vectorSize))
+///          + tgtIdxVal % vectorSize`
+/// This is done using an optimized sequence of `arith` operations.
+static Value permuteVectorOffset(OpBuilder &b, Location loc,
+                                 ArrayRef<Value> indices, MemRefType memrefTy,
+                                 int64_t srcDim, int64_t tgtDim) {
+  // Adjust the src index to change how often the permutation changes
+  // if necessary.
+  Value src = indices[srcDim];
+
+  // We only want to permute every N iterations of the target dim where N is
+  // ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)).
+  const int64_t permuteEveryN = std::max<int64_t>(
+      1, kSharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) *
+                                        memrefTy.getElementTypeBitWidth()) /
+                                       8));
+
+  // clang-format off
+  // Index bit representation (b0 = least significant bit) for the target dim
+  // of a `memref` is as follows:
+  //   N := log2(kDefaultVectorSizeBits/elementSizeBits)
+  //   M := log2(dimSize(tgtDim))
+  // then
+  //   bits[0:N] = sub-vector element offset
+  //   bits[N:M] = vector index
+  // clang-format on
+  int64_t n =
+      llvm::Log2_64(kDefaultVectorSizeBits / memrefTy.getElementTypeBitWidth());
+  int64_t m = llvm::Log2_64(memrefTy.getDimSize(tgtDim));
+
+  // Capture bits[0:(M-N)] of src by first creating a (M-N) mask.
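+  // Worked example (editorial note, not part of the original patch): for the
+  // test's memref<256x32xf16, 3> with 64-bit vectors, n = log2(64/16) = 2 and
+  // m = log2(32) = 5, so the mask below is (1 << 3) - 1 = 7 and srcBits is
+  // later shifted left by n = 2 (these are the constants 7 and 2 in the
+  // FileCheck expectations).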
+  int64_t mask = (1LL << (m - n)) - 1;
+  if (permuteEveryN > 1)
+    mask = mask << llvm::Log2_64(permuteEveryN);
+  Value srcBits = b.create<arith::ConstantIndexOp>(loc, mask);
+  srcBits = b.create<arith::AndIOp>(loc, src, srcBits);
+
+  // Use the src bits to permute the target bits b[N:M] containing the
+  // vector offset.
+  if (permuteEveryN > 1) {
+    int64_t shlBits = n - llvm::Log2_64(permuteEveryN);
+    if (shlBits > 0) {
+      Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, shlBits);
+      srcBits = b.createOrFold<arith::ShLIOp>(loc, srcBits, finalShiftVal);
+    } else if (shlBits < 0) {
+      Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, -1 * shlBits);
+      srcBits = b.createOrFold<arith::ShRUIOp>(loc, srcBits, finalShiftVal);
+    }
+  } else {
+    Value finalShiftVal = b.create<arith::ConstantIndexOp>(loc, n);
+    srcBits = b.createOrFold<arith::ShLIOp>(loc, srcBits, finalShiftVal);
+  }
+
+  Value permutedVectorIdx =
+      b.create<arith::XOrIOp>(loc, indices[tgtDim], srcBits);
+  return permutedVectorIdx;
+}
+
+static void transformIndices(OpBuilder &builder, Location loc,
+                             SmallVector<Value, 4> &indices,
+                             MemRefType memrefTy, int64_t srcDim,
+                             int64_t tgtDim) {
+  indices[tgtDim] =
+      permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim);
+}
+
+/// Return all operations within `parentOp` that read from or write to
+/// `shmMemRef`.
+static LogicalResult
+getShmReadAndWriteOps(Operation *parentOp, Value shmMemRef,
+                      SmallVector<Operation *, 16> &readOps,
+                      SmallVector<Operation *, 16> &writeOps) {
+  parentOp->walk([&](Operation *op) {
+    MemoryEffectOpInterface iface = dyn_cast<MemoryEffectOpInterface>(op);
+    if (!iface)
+      return;
+    std::optional<MemoryEffects::EffectInstance> effect =
+        iface.getEffectOnValue<MemoryEffects::Read>(shmMemRef);
+    if (effect) {
+      readOps.push_back(op);
+      return;
+    }
+    effect = iface.getEffectOnValue<MemoryEffects::Write>(shmMemRef);
+    if (effect)
+      writeOps.push_back(op);
+  });
+
+  // Restrict to a supported set of ops. We also require at least 2D access,
+  // although this could be relaxed.
+  if (llvm::any_of(readOps, [](Operation *op) {
+        return !isa<memref::LoadOp, vector::LoadOp, vector::TransferReadOp>(
+                   op) ||
+               amdgpu::getIndices(op)->size() < 2;
+      }))
+    return failure();
+  if (llvm::any_of(writeOps, [](Operation *op) {
+        return !isa<memref::StoreOp, vector::StoreOp, vector::TransferWriteOp>(
+                   op) ||
+               amdgpu::getIndices(op)->size() < 2;
+      }))
+    return failure();
+
+  return success();
+}
+
+mlir::LogicalResult
+mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp,
+                                                 Value memrefValue) {
+  auto memRefType = dyn_cast<MemRefType>(memrefValue.getType());
+  if (!memRefType ||
+      !amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(memRefType))
+    return failure();
+
+  // Abort if the given value has any sub-views; we do not do any alias
+  // analysis.
+  bool hasSubView = false;
+  parentOp->walk([&](memref::SubViewOp subView) { hasSubView = true; });
+  if (hasSubView)
+    return failure();
+
+  // Check whether the transform is necessary given the assumption of 64-bit
+  // accesses: if dim[rank-1] is small enough that a full thread group's rows
+  // fit in a single shared memory line, there is nothing to do.
+  const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1);
+  const int64_t rowsPerLine =
+      (8 * kSharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) /
+      rowSize;
+  const int64_t threadGroupSize =
+      1LL << (7 - llvm::Log2_64(kDefaultVectorSizeBits / 8));
+  if (rowsPerLine >= threadGroupSize)
+    return failure();
+
+  // Get sets of operations within the function that read/write to shared
+  // memory.
+  SmallVector<Operation *, 16> shmReadOps;
+  SmallVector<Operation *, 16> shmWriteOps;
+  if (failed(getShmReadAndWriteOps(parentOp, memrefValue, shmReadOps,
+                                   shmWriteOps)))
+    return failure();
+
+  if (shmReadOps.empty() || shmWriteOps.empty())
+    return failure();
+
+  OpBuilder builder(parentOp->getContext());
+
+  int64_t tgtDim = memRefType.getRank() - 1;
+  int64_t srcDim = memRefType.getRank() - 2;
+
+  // Transform indices for the ops writing to shared memory.
+  while (!shmWriteOps.empty()) {
+    Operation *shmWriteOp = shmWriteOps.pop_back_val();
+    builder.setInsertionPoint(shmWriteOp);
+
+    auto indices = amdgpu::getIndices(shmWriteOp);
+    SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
+    transformIndices(builder, shmWriteOp->getLoc(), transformedIndices,
+                     memRefType, srcDim, tgtDim);
+    amdgpu::setIndices(shmWriteOp, transformedIndices);
+  }
+
+  // Transform indices for the ops reading from shared memory.
+  while (!shmReadOps.empty()) {
+    Operation *shmReadOp = shmReadOps.pop_back_val();
+    builder.setInsertionPoint(shmReadOp);
+
+    auto indices = amdgpu::getIndices(shmReadOp);
+    SmallVector<Value, 4> transformedIndices(indices->begin(), indices->end());
+    transformIndices(builder, shmReadOp->getLoc(), transformedIndices,
+                     memRefType, srcDim, tgtDim);
+    amdgpu::setIndices(shmReadOp, transformedIndices);
+  }
+
+  return success();
+}
+
+struct OptimizeSharedMemoryPass
+    : public amdgpu::impl::OptimizeSharedMemoryBase<OptimizeSharedMemoryPass> {
+public:
+  OptimizeSharedMemoryPass() = default;
+
+  void runOnOperation() override {
+    Operation *op = getOperation();
+    SmallVector<memref::AllocOp> shmAllocOps;
+    op->walk([&](memref::AllocOp allocOp) {
+      if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(
+              allocOp.getType()))
+        return;
+      shmAllocOps.push_back(allocOp);
+    });
+    for (auto allocOp : shmAllocOps) {
+      if (failed(optimizeSharedMemoryReadsAndWrites(getOperation(),
+                                                    allocOp.getMemref())))
+        return;
+    }
+  }
+};
diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp
new file mode 100644
index 0000000000000..8163eeafdf1f0
--- /dev/null
+++ b/mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp
@@ -0,0 +1,39 @@
+#include "mlir/Dialect/AMDGPU/Transforms/Utils.h"
+
+#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
+#include "mlir/Dialect/MemRef/IR/MemRef.h"
+#include "mlir/Dialect/Vector/IR/VectorOps.h"
+
+using namespace mlir;
+using namespace mlir::amdgpu;
+
+std::optional<Operation::operand_range> amdgpu::getIndices(Operation *op) {
+  if (auto loadOp = dyn_cast<memref::LoadOp>(op))
+    return loadOp.getIndices();
+  if (auto storeOp = dyn_cast<memref::StoreOp>(op))
+    return storeOp.getIndices();
+  if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
+    return vectorReadOp.getIndices();
+  if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
+    return vectorStoreOp.getIndices();
+  if (auto transferReadOp = dyn_cast<vector::TransferReadOp>(op))
+    return transferReadOp.getIndices();
+  if (auto transferWriteOp = dyn_cast<vector::TransferWriteOp>(op))
+    return transferWriteOp.getIndices();
+  return std::nullopt;
+}
+
+void amdgpu::setIndices(Operation *op, ArrayRef<Value> indices) {
+  if (auto loadOp = dyn_cast<memref::LoadOp>(op))
+    return loadOp.getIndicesMutable().assign(indices);
+  if (auto storeOp = dyn_cast<memref::StoreOp>(op))
+    return storeOp.getIndicesMutable().assign(indices);
+  if (auto vectorReadOp = dyn_cast<vector::LoadOp>(op))
+    return vectorReadOp.getIndicesMutable().assign(indices);
+  if (auto vectorStoreOp = dyn_cast<vector::StoreOp>(op))
+    return vectorStoreOp.getIndicesMutable().assign(indices);
+  if (auto transferReadOp = dyn_cast<vector::TransferReadOp>(op))
+    return transferReadOp.getIndicesMutable().assign(indices);
+  if (auto transferWriteOp = dyn_cast<vector::TransferWriteOp>(op))
+    return transferWriteOp.getIndicesMutable().assign(indices);
+}
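(Editorial aside: the sketch below is not part of the patch. It shows how the `optimizeSharedMemoryReadsAndWrites` entry point declared in Transforms.h can be driven directly from client code, mirroring what `OptimizeSharedMemoryPass::runOnOperation` above does.)

```cpp
// Minimal sketch: apply the AMDGPU shared-memory swizzle to every
// workgroup-space allocation in a function, using only the entry points
// introduced by this patch.
#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h"
#include "mlir/Dialect/Func/IR/FuncOps.h"
#include "mlir/Dialect/MemRef/IR/MemRef.h"

static void swizzleSharedAllocations(mlir::func::FuncOp func) {
  func->walk([&](mlir::memref::AllocOp allocOp) {
    // Only memrefs in the shared/LDS (workgroup) address space qualify.
    if (!mlir::amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(
            allocOp.getType()))
      return;
    // Failure only means a precondition (no subviews, >= 2-D accesses,
    // supported op set) did not hold; the IR is left untouched then.
    (void)mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(
        func.getOperation(), allocOp.getMemref());
  });
}
```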
diff --git a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir new file mode 100644 index 0000000000000..41111dddda520 --- /dev/null +++ b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir @@ -0,0 +1,57 @@ +// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(amdgpu-optimize-shared-memory))' | FileCheck %s + + // CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index) + func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, + %readRow: index, %readCol: index, + %writeRow: index, %writeCol: index, + %fragRow: index, %fragCol: index, + %fragColPerm: index, + %stRow: index, %stCol: index) { + // CHECK: %[[cst:.+]] = arith.constant 0.000000e+00 : f16 + %cst = arith.constant 0.000000e+00 : f16 + + // CHECK: [[shmA:%.+]] = memref.alloc + // CHECK: [[shmB:%.+]] = memref.alloc + %shmA = memref.alloc() {alignment = 64 : i64} : memref<128x32xf16, 3> + %shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3> + + // CHECK: %[[D0:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> + %0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> + // CHECK: [[c7:%.+]] = arith.constant 7 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] + // CHECK: vector.transfer_write %[[D0:.+]], [[shmB]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3> + vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3> + gpu.barrier + gpu.barrier + // CHECK: [[c7:%.+]] = arith.constant 7 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] + // CHECK: vector.load [[shmB:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<256x32xf16, 3>, vector<8xf16> + %1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16> + + // CHECK: %[[D2:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> + %2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> + // CHECK: [[c7:%.+]] = arith.constant 7 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] + // CHECK: vector.transfer_write %[[D2:.+]], [[shmA:%.+]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3> + vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3> + gpu.barrier + gpu.barrier + // CHECK: 
[[c7:%.+]] = arith.constant 7 : index + // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] + // CHECK: [[c2:%.+]] = arith.constant 2 : index + // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] + // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] + // CHECK: vector.load [[shmA:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<128x32xf16, 3>, vector<8xf16> + %3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16> + return + } + \ No newline at end of file From b86d02375eb42db49e375484fe26cb837b316cbf Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Fri, 19 Jan 2024 15:45:22 -0800 Subject: [PATCH 187/843] [libc] Redo the install targets (#78795) Prior to this change, we wouldn't build headers that aren't referenced by other parts of the libc which would result in a build error during installation. To address this, we make the header target a dependency of the libc archive. Additionally, we also redo the install targets, moving the install targets closer to build targets and simplifying the hierarchy and generally matching what we do for other runtimes. --- libc/CMakeLists.txt | 28 ---------------------- libc/include/CMakeLists.txt | 10 ++++++++ libc/lib/CMakeLists.txt | 36 +++++++++++++++++++++++----- libc/utils/gpu/server/CMakeLists.txt | 2 +- 4 files changed, 41 insertions(+), 35 deletions(-) diff --git a/libc/CMakeLists.txt b/libc/CMakeLists.txt index 114d2dc65dec2..6d385849b6a64 100644 --- a/libc/CMakeLists.txt +++ b/libc/CMakeLists.txt @@ -348,19 +348,6 @@ foreach(entrypoint IN LISTS TARGET_LLVMLIBC_ENTRYPOINTS) list(APPEND TARGET_ENTRYPOINT_NAME_LIST ${entrypoint_name}) endforeach() -set(LIBC_INSTALL_DEPENDS) -if(LLVM_LIBC_FULL_BUILD) - set(LIBC_INSTALL_DEPENDS "install-libc-static-archives;install-libc-headers") - if(NOT LIBC_TARGET_OS_IS_BAREMETAL) - # For now we will disable libc-startup installation for baremetal. The - # correct way to do it would be to make a hookable startup for baremetal - # and install it as part of the libc installation. 
- list(APPEND LIBC_INSTALL_DEPENDS "libc-startup") - endif() -else() - set(LIBC_INSTALL_DEPENDS install-libc-static-archives) -endif() - add_subdirectory(include) add_subdirectory(config) add_subdirectory(src) @@ -388,18 +375,3 @@ endif() if (LIBC_INCLUDE_DOCS) add_subdirectory(docs) endif() - - -if(LLVM_LIBC_FULL_BUILD) - add_llvm_install_targets( - install-libc-headers - DEPENDS libc-headers - COMPONENT libc-headers - ) -endif() - -add_llvm_install_targets( - install-libc - DEPENDS ${LIBC_INSTALL_DEPENDS} - COMPONENT libc -) diff --git a/libc/include/CMakeLists.txt b/libc/include/CMakeLists.txt index 14aa9ec6d73f3..cec8b86516b51 100644 --- a/libc/include/CMakeLists.txt +++ b/libc/include/CMakeLists.txt @@ -606,3 +606,13 @@ foreach(target IN LISTS all_install_header_targets) COMPONENT libc-headers) endif() endforeach() + +if(LLVM_LIBC_FULL_BUILD) + add_custom_target(install-libc-headers + DEPENDS libc-headers + COMMAND "${CMAKE_COMMAND}" + -DCMAKE_INSTALL_COMPONENT=libc-headers + -P "${CMAKE_BINARY_DIR}/cmake_install.cmake") + # Stripping is a no-op for headers + add_custom_target(install-libc-headers-stripped DEPENDS install-libc-headers) +endif() diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt index f257582ed9bdf..66aaa34632b39 100644 --- a/libc/lib/CMakeLists.txt +++ b/libc/lib/CMakeLists.txt @@ -33,6 +33,9 @@ foreach(archive IN ZIP_LISTS PROPERTIES ARCHIVE_OUTPUT_NAME ${archive_0} ) + if(LLVM_LIBC_FULL_BUILD) + target_link_libraries(${archive_1} PUBLIC libc-headers) + endif() list(APPEND added_archive_targets ${archive_1}) endforeach() @@ -48,11 +51,32 @@ endif() install( TARGETS ${added_archive_targets} ARCHIVE DESTINATION ${LIBC_INSTALL_LIBRARY_DIR} - COMPONENT libc-static-archives + COMPONENT libc ) -add_llvm_install_targets( - install-libc-static-archives - DEPENDS ${added_archive_targets} - COMPONENT libc-static-archives -) +if(NOT LIBC_TARGET_OS_IS_BAREMETAL) + # For now we will disable libc-startup installation for baremetal. The + # correct way to do it would be to make a hookable startup for baremetal + # and install it as part of the libc installation. + set(startup_target "libc-startup") +endif() + +if(LLVM_LIBC_FULL_BUILD) + set(header_install_target install-libc-headers) +endif() + +add_custom_target(install-libc + DEPENDS ${added_archive_targets} + ${startup_target} + ${header_install_target} + COMMAND "${CMAKE_COMMAND}" + -DCMAKE_INSTALL_COMPONENT=libc + -P "${LIBCXX_BINARY_DIR}/cmake_install.cmake") +add_custom_target(install-libc-stripped + DEPENDS ${added_archive_targets} + ${startup_target} + ${header_install_target} + COMMAND "${CMAKE_COMMAND}" + -DCMAKE_INSTALL_COMPONENT=libc + -DCMAKE_INSTALL_DO_STRIP=1 + -P "${LIBCXX_BINARY_DIR}/cmake_install.cmake") diff --git a/libc/utils/gpu/server/CMakeLists.txt b/libc/utils/gpu/server/CMakeLists.txt index a1c4229d386f8..3d9b2bcab4dbc 100644 --- a/libc/utils/gpu/server/CMakeLists.txt +++ b/libc/utils/gpu/server/CMakeLists.txt @@ -14,7 +14,7 @@ target_compile_definitions(llvmlibc_rpc_server PUBLIC # Install the server and associated header. 
 install(FILES ${CMAKE_CURRENT_SOURCE_DIR}/rpc_server.h
         DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/gpu-none-llvm/
-        COMPONENT libc)
+        COMPONENT libc-headers)
 install(TARGETS llvmlibc_rpc_server
         ARCHIVE DESTINATION "lib${LLVM_LIBDIR_SUFFIX}"
         COMPONENT libc)

From c17aa14f4ca101db247d124b4b05506247aa330f Mon Sep 17 00:00:00 2001
From: "Xiangxi Guo (Ryan)"
Date: Fri, 19 Jan 2024 15:54:33 -0800
Subject: [PATCH 188/843] [mlir][index] Fold `cmp(x, x)` when `x` isn't a
 constant (#78812)

Such cases show up in the middle of optimization passes, e.g., after some
rewrites and then CSE. The current folder can fold such cases when the
inputs are constant; this patch improves it to fold even if the inputs are
non-constant.
---
 mlir/lib/Dialect/Index/IR/IndexOps.cpp        | 22 +++++++++++++++++++
 .../Dialect/Index/index-canonicalize.mlir     | 20 +++++++++++++++++
 2 files changed, 42 insertions(+)

diff --git a/mlir/lib/Dialect/Index/IR/IndexOps.cpp b/mlir/lib/Dialect/Index/IR/IndexOps.cpp
index b506397742772..42401dae217ce 100644
--- a/mlir/lib/Dialect/Index/IR/IndexOps.cpp
+++ b/mlir/lib/Dialect/Index/IR/IndexOps.cpp
@@ -578,6 +578,24 @@ static std::optional<bool> foldCmpOfMaxOrMin(Operation *lhsOp,
                                  lhsRange, ConstantIntRanges::constant(cstB));
 }

+/// Return the result of `cmp(pred, x, x)`.
+static bool compareSameArgs(IndexCmpPredicate pred) {
+  switch (pred) {
+  case IndexCmpPredicate::EQ:
+  case IndexCmpPredicate::SGE:
+  case IndexCmpPredicate::SLE:
+  case IndexCmpPredicate::UGE:
+  case IndexCmpPredicate::ULE:
+    return true;
+  case IndexCmpPredicate::NE:
+  case IndexCmpPredicate::SGT:
+  case IndexCmpPredicate::SLT:
+  case IndexCmpPredicate::UGT:
+  case IndexCmpPredicate::ULT:
+    return false;
+  }
+}
+
 OpFoldResult CmpOp::fold(FoldAdaptor adaptor) {
   // Attempt to fold if both inputs are constant.
   auto lhs = dyn_cast_if_present<IntegerAttr>(adaptor.getLhs());
@@ -606,6 +624,10 @@ OpFoldResult CmpOp::fold(FoldAdaptor adaptor) {
     return BoolAttr::get(getContext(), *result64);
   }

+  // Fold `cmp(x, x)`
+  if (getLhs() == getRhs())
+    return BoolAttr::get(getContext(), compareSameArgs(getPred()));
+
   return {};
 }

diff --git a/mlir/test/Dialect/Index/index-canonicalize.mlir b/mlir/test/Dialect/Index/index-canonicalize.mlir
index db03505350b77..37aa33bfde952 100644
--- a/mlir/test/Dialect/Index/index-canonicalize.mlir
+++ b/mlir/test/Dialect/Index/index-canonicalize.mlir
@@ -499,6 +499,26 @@ func.func @cmp(%arg0: index) -> (i1, i1, i1, i1, i1, i1) {
   return %0, %1, %2, %3, %5, %7 : i1, i1, i1, i1, i1, i1
 }

+// CHECK-LABEL: @cmp_same_args
+func.func @cmp_same_args(%a: index) -> (i1, i1, i1, i1, i1, i1, i1, i1, i1, i1) {
+  %0 = index.cmp eq(%a, %a)
+  %1 = index.cmp sge(%a, %a)
+  %2 = index.cmp sle(%a, %a)
+  %3 = index.cmp uge(%a, %a)
+  %4 = index.cmp ule(%a, %a)
+  %5 = index.cmp ne(%a, %a)
+  %6 = index.cmp sgt(%a, %a)
+  %7 = index.cmp slt(%a, %a)
+  %8 = index.cmp ugt(%a, %a)
+  %9 = index.cmp ult(%a, %a)
+
+  // CHECK-DAG: %[[TRUE:.*]] = index.bool.constant true
+  // CHECK-DAG: %[[FALSE:.*]] = index.bool.constant false
+  // CHECK-NEXT: return %[[TRUE]], %[[TRUE]], %[[TRUE]], %[[TRUE]], %[[TRUE]],
+  // CHECK-SAME: %[[FALSE]], %[[FALSE]], %[[FALSE]], %[[FALSE]], %[[FALSE]]
+  return %0, %1, %2, %3, %4, %5, %6, %7, %8, %9 : i1, i1, i1, i1, i1, i1, i1, i1, i1, i1
+}
+
 // CHECK-LABEL: @cmp_nofold
 func.func @cmp_nofold() -> i1 {
   %lhs = index.constant 1
From 8bef2f27a0f7df05c7879186cc50fc8ec4a81132 Mon Sep 17 00:00:00 2001
From: Walter Erquinigo
Date: Fri, 19 Jan 2024 18:55:40 -0500
Subject: [PATCH 189/843] [lldb-dap] Add a CMake variable for defining a
 welcome message (#78811)

lldb-dap instances managed by other extensions benefit from having a
welcome message with, for example, a basic user guide or a
troubleshooting message. This PR adds a CMake variable for defining such
a message in a simple way. The message appears upon initialization but
before initCommands are executed, as they might cause a failure and
prevent the message from being displayed.
---
 lldb/tools/lldb-dap/CMakeLists.txt |  6 ++++++
 lldb/tools/lldb-dap/lldb-dap.cpp   | 20 ++++++++++++++++----
 2 files changed, 22 insertions(+), 4 deletions(-)

diff --git a/lldb/tools/lldb-dap/CMakeLists.txt b/lldb/tools/lldb-dap/CMakeLists.txt
index c71c80981890b..554567eb3b0e2 100644
--- a/lldb/tools/lldb-dap/CMakeLists.txt
+++ b/lldb/tools/lldb-dap/CMakeLists.txt
@@ -46,6 +46,12 @@ add_lldb_tool(lldb-dap
     Support
   )

+if(LLDB_DAP_WELCOME_MESSAGE)
+  target_compile_definitions(lldb-dap
+    PRIVATE
+    -DLLDB_DAP_WELCOME_MESSAGE=\"${LLDB_DAP_WELCOME_MESSAGE}\")
+endif()
+
 if(LLDB_BUILD_FRAMEWORK)
   # In the build-tree, we know the exact path to the framework directory.
   # The installed framework can be in different locations.
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index bb3500c21e745..828cc67c42ddd 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -108,6 +108,14 @@ typedef void (*RequestCallback)(const llvm::json::Object &command);

 enum LaunchMethod { Launch, Attach, AttachForSuspendedLaunch };

+/// Prints a welcome message in the editor if the preprocessor variable
+/// LLDB_DAP_WELCOME_MESSAGE is defined.
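+/// The message is printed before the user's initCommands run, since a failing
+/// initCommand would otherwise prevent it from being shown.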
+static void PrintWelcomeMessage() { +#ifdef LLDB_DAP_WELCOME_MESSAGE + g_dap.SendOutput(OutputType::Console, LLDB_DAP_WELCOME_MESSAGE); +#endif +} + lldb::SBValueList *GetTopLevelScope(int64_t variablesReference) { switch (variablesReference) { case VARREF_LOCALS: @@ -657,6 +665,8 @@ void request_attach(const llvm::json::Object &request) { g_dap.SetFrameFormat(GetString(arguments, "customFrameFormat")); g_dap.SetThreadFormat(GetString(arguments, "customThreadFormat")); + PrintWelcomeMessage(); + // This is a hack for loading DWARF in .o files on Mac where the .o files // in the debug map of the main executable have relative paths which require // the lldb-dap binary to have its working directory set to that relative @@ -666,7 +676,7 @@ void request_attach(const llvm::json::Object &request) { // Run any initialize LLDB commands the user specified in the launch.json if (llvm::Error err = g_dap.RunInitCommands()) { - response["success"] = false; + kkkk response["success"] = false; EmplaceSafeString(response, "message", llvm::toString(std::move(err))); g_dap.SendJSON(llvm::json::Value(std::move(response))); return; @@ -1838,10 +1848,12 @@ void request_launch(const llvm::json::Object &request) { g_dap.SetFrameFormat(GetString(arguments, "customFrameFormat")); g_dap.SetThreadFormat(GetString(arguments, "customThreadFormat")); + PrintWelcomeMessage(); + // This is a hack for loading DWARF in .o files on Mac where the .o files - // in the debug map of the main executable have relative paths which require - // the lldb-dap binary to have its working directory set to that relative - // root for the .o files in order to be able to load debug info. + // in the debug map of the main executable have relative paths which + // require the lldb-dap binary to have its working directory set to that + // relative root for the .o files in order to be able to load debug info. if (!debuggerRoot.empty()) llvm::sys::fs::set_current_path(debuggerRoot); From 904cf66ec1d4089e5e661eb996487ba132b97664 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Sat, 20 Jan 2024 08:09:20 +0800 Subject: [PATCH 190/843] [lldb] Fix build error in lldb-dap.cpp (NFC) llvm-project/lldb/tools/lldb-dap/lldb-dap.cpp:679:5: error: unknown type name 'kkkk' kkkk response["success"] = false; ^ --- lldb/tools/lldb-dap/lldb-dap.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index 828cc67c42ddd..01494dcc7da00 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -676,7 +676,7 @@ void request_attach(const llvm::json::Object &request) { // Run any initialize LLDB commands the user specified in the launch.json if (llvm::Error err = g_dap.RunInitCommands()) { - kkkk response["success"] = false; + response["success"] = false; EmplaceSafeString(response, "message", llvm::toString(std::move(err))); g_dap.SendJSON(llvm::json::Value(std::move(response))); return; From 46845074557484a82f4dc73647dad399e1c00e89 Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Fri, 19 Jan 2024 16:11:08 -0800 Subject: [PATCH 191/843] [lldb][DWARFUnit] Implement PeekDIEName query (#78486) This allows us to query the AT_Name of a DIE without parsing the entire CU. Part of the ongoing effort to support IDX_Parent in accelerator tables [1]. 
[1]: https://discourse.llvm.org/t/rfc-improve-dwarf-5-debug-names-type-lookup-parsing-speed/74151/44
---
 .../SymbolFile/DWARF/DWARFDebugInfo.cpp       |  6 ++
 .../Plugins/SymbolFile/DWARF/DWARFDebugInfo.h |  5 ++
 .../SymbolFile/DWARF/DWARFFormValue.cpp       | 24 +++---
 .../Plugins/SymbolFile/DWARF/DWARFFormValue.h |  6 ++
 .../Plugins/SymbolFile/DWARF/DWARFUnit.cpp    | 24 ++++++
 .../Plugins/SymbolFile/DWARF/DWARFUnit.h      |  5 ++
 .../SymbolFile/DWARF/DWARFDIETest.cpp         | 83 +++++++++++++++++++
 7 files changed, 143 insertions(+), 10 deletions(-)

diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.cpp
index 553b6a4c551d2..340b9acf80d02 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.cpp
@@ -191,3 +191,9 @@ DWARFDebugInfo::GetDIE(const DIERef &die_ref) {
     return cu->GetNonSkeletonUnit().GetDIE(die_ref.die_offset());
   return DWARFDIE(); // Not found
 }
+
+llvm::StringRef DWARFDebugInfo::PeekDIEName(const DIERef &die_ref) {
+  if (DWARFUnit *cu = GetUnit(die_ref))
+    return cu->GetNonSkeletonUnit().PeekDIEName(die_ref.die_offset());
+  return llvm::StringRef();
+}
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h
index d5e48f312ea0e..a8b5abc3beed2 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFDebugInfo.h
@@ -43,6 +43,11 @@ class DWARFDebugInfo {
   bool ContainsTypeUnits();
   DWARFDIE GetDIE(const DIERef &die_ref);

+  /// Returns the AT_Name of this DIE, if it exists, without parsing the entire
+  /// compile unit. An empty string is returned upon error or if the
+  /// attribute is not present.
+  llvm::StringRef PeekDIEName(const DIERef &die_ref);
+
   enum {
     eDumpFlag_Verbose = (1 << 0),  // Verbose dumping
     eDumpFlag_ShowForm = (1 << 1), // Show the DW_form type
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp
index 0a7029a55c047..e1f73f1997e36 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.cpp
@@ -18,8 +18,6 @@
 #include "DWARFFormValue.h"
 #include "DWARFUnit.h"

-class DWARFUnit;
-
 using namespace lldb_private;
 using namespace lldb_private::dwarf;
 using namespace lldb_private::plugin::dwarf;
@@ -502,7 +500,8 @@ dw_addr_t DWARFFormValue::Address() const {
                            &offset, index_size);
 }

-DWARFDIE DWARFFormValue::Reference() const {
+std::pair<DWARFUnit *, uint64_t>
+DWARFFormValue::ReferencedUnitAndOffset() const {
   uint64_t value = m_value.value.uval;
   switch (m_form) {
   case DW_FORM_ref1:
@@ -516,9 +515,9 @@
     if (!m_unit->ContainsDIEOffset(value)) {
       m_unit->GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError(
           "DW_FORM_ref* DIE reference {0:x16} is outside of its CU", value);
-      return {};
+      return {nullptr, 0};
     }
-    return const_cast<DWARFUnit *>(m_unit)->GetDIE(value);
+    return {const_cast<DWARFUnit *>(m_unit), value};

   case DW_FORM_ref_addr: {
     DWARFUnit *ref_cu =
@@ -527,24 +526,29 @@
     if (!ref_cu) {
       m_unit->GetSymbolFileDWARF().GetObjectFile()->GetModule()->ReportError(
           "DW_FORM_ref_addr DIE reference {0:x16} has no matching CU", value);
-      return {};
+      return {nullptr, 0};
     }
-    return ref_cu->GetDIE(value);
+    return {ref_cu, value};
   }

   case DW_FORM_ref_sig8: {
     DWARFTypeUnit *tu =
         m_unit->GetSymbolFileDWARF().DebugInfo().GetTypeUnitForHash(value);
     if (!tu)
-      return {};
+      return {nullptr, 0};
-    return tu->GetDIE(tu->GetTypeOffset());
+    return {tu, tu->GetTypeOffset()};
   }

   default:
-    return {};
+    return {nullptr, 0};
   }
 }

+DWARFDIE DWARFFormValue::Reference() const {
+  auto [unit, offset] = ReferencedUnitAndOffset();
+  return unit ? unit->GetDIE(offset) : DWARFDIE();
+}
+
 uint64_t DWARFFormValue::Reference(dw_offset_t base_offset) const {
   uint64_t value = m_value.value.uval;
   switch (m_form) {
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h
index 445749a6aac3a..fdd5b3c278a4e 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFFormValue.h
@@ -60,6 +60,12 @@ class DWARFFormValue {
                          const DWARFUnit *u);
   std::optional<uint8_t> GetFixedSize() const;
   DWARFDIE Reference() const;
+
+  /// If this is a reference to another DIE, return the corresponding DWARFUnit
+  /// and DIE offset such that Unit->GetDIE(offset) produces the desired DIE.
+  /// Otherwise, a nullptr and unspecified offset are returned.
+  std::pair<DWARFUnit *, uint64_t> ReferencedUnitAndOffset() const;
+
   uint64_t Reference(dw_offset_t offset) const;
   bool Boolean() const { return m_value.value.uval != 0; }
   uint64_t Unsigned() const { return m_value.value.uval; }
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
index 0e2f4d45543bb..7a40361cdede7 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.cpp
@@ -663,6 +663,30 @@ DWARFUnit::GetDIE(dw_offset_t die_offset) {
   return DWARFDIE(); // Not found
 }

+llvm::StringRef DWARFUnit::PeekDIEName(dw_offset_t die_offset) {
+  DWARFDebugInfoEntry die;
+  if (!die.Extract(GetData(), this, &die_offset))
+    return llvm::StringRef();
+
+  // Does die contain a DW_AT_name?
+  if (const char *name =
+          die.GetAttributeValueAsString(this, DW_AT_name, nullptr))
+    return name;
+
+  // Does its DW_AT_specification or DW_AT_abstract_origin contain an AT_Name?
+  for (auto attr : {DW_AT_specification, DW_AT_abstract_origin}) {
+    DWARFFormValue form_value;
+    if (!die.GetAttributeValue(this, attr, form_value))
+      continue;
+    auto [unit, offset] = form_value.ReferencedUnitAndOffset();
+    if (unit)
+      if (auto name = unit->PeekDIEName(offset); !name.empty())
+        return name;
+  }
+
+  return llvm::StringRef();
+}
+
 DWARFUnit &DWARFUnit::GetNonSkeletonUnit() {
   ExtractUnitDIEIfNeeded();
   if (m_dwo)
diff --git a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
index 3f528e913d8cf..bc225a52e1d03 100644
--- a/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
+++ b/lldb/source/Plugins/SymbolFile/DWARF/DWARFUnit.h
@@ -187,6 +187,11 @@ class DWARFUnit : public UserID {

   DWARFDIE GetDIE(dw_offset_t die_offset);

+  /// Returns the AT_Name of the DIE at `die_offset`, if it exists, without
+  /// parsing the entire compile unit. An empty string is returned upon
+  /// error or if the attribute is not present.
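+  /// DW_AT_specification and DW_AT_abstract_origin references are followed
+  /// (recursively), so names attached to a related declaration DIE are still
+  /// found.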
+ llvm::StringRef PeekDIEName(dw_offset_t die_offset); + DWARFUnit &GetNonSkeletonUnit(); static uint8_t GetAddressByteSize(const DWARFUnit *cu); diff --git a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp index 8497855b2f3db..bcb211815f9f3 100644 --- a/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp +++ b/lldb/unittests/SymbolFile/DWARF/DWARFDIETest.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "Plugins/SymbolFile/DWARF/DWARFDIE.h" +#include "Plugins/SymbolFile/DWARF/DWARFDebugInfo.h" #include "TestingSupport/Symbol/YAMLModuleTester.h" #include "llvm/ADT/STLExtras.h" #include "gmock/gmock.h" @@ -104,3 +105,85 @@ TEST(DWARFDIETest, ChildIteration) { DWARFDIE no_children_die(unit, die_child0); EXPECT_TRUE(no_children_die.children().empty()); } + +TEST(DWARFDIETest, PeekName) { + const char *yamldata = R"( +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_EXEC + Machine: EM_386 +DWARF: + debug_str: + - 'NameType1' + - 'NameType2' + debug_abbrev: + - Table: + - Code: 0x00000001 + Tag: DW_TAG_compile_unit + Children: DW_CHILDREN_yes + Attributes: + - Attribute: DW_AT_language + Form: DW_FORM_data2 + - Code: 0x00000002 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_name + Form: DW_FORM_strp + - Code: 0x00000003 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_abstract_origin + Form: DW_FORM_ref1 + - Code: 0x00000004 + Tag: DW_TAG_base_type + Children: DW_CHILDREN_no + Attributes: + - Attribute: DW_AT_specification + Form: DW_FORM_ref1 + debug_info: + - Version: 4 + AddrSize: 8 + Entries: + - AbbrCode: 0x00000001 + Values: + - Value: 0x000000000000000C + - AbbrCode: 0x00000002 + Values: + - Value: 0x0000000000000000 # Name = NameType1 + - AbbrCode: 0x00000002 + Values: + - Value: 0x000000000000000a # Name = NameType2 + - AbbrCode: 0x00000003 + Values: + - Value: 0x000000000000000e # Ref abstract origin to NameType1 DIE. + - AbbrCode: 0x00000004 + Values: + - Value: 0x0000000000000013 # Ref specification to NameType2 DIE. + - AbbrCode: 0x00000000 +)"; + + YAMLModuleTester t(yamldata); + auto *symbol_file = + llvm::cast(t.GetModule()->GetSymbolFile()); + auto &debug_info = symbol_file->DebugInfo(); + + DIERef first_die(std::nullopt, DIERef::Section::DebugInfo, + 11 /*FirstDIEOffset*/); + EXPECT_EQ(debug_info.PeekDIEName(first_die), ""); + + DIERef second_die(std::nullopt, DIERef::Section::DebugInfo, 14); + EXPECT_EQ(debug_info.PeekDIEName(second_die), "NameType1"); + + DIERef third_die(std::nullopt, DIERef::Section::DebugInfo, 19); + EXPECT_EQ(debug_info.PeekDIEName(third_die), "NameType2"); + + DIERef fourth_die(std::nullopt, DIERef::Section::DebugInfo, 24); + EXPECT_EQ(debug_info.PeekDIEName(fourth_die), "NameType1"); + + DIERef fifth_die(std::nullopt, DIERef::Section::DebugInfo, 26); + EXPECT_EQ(debug_info.PeekDIEName(fifth_die), "NameType2"); +} From 7071a25d122e72fbb71cc2c34a0ec654035e75f4 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 19 Jan 2024 16:15:52 -0800 Subject: [PATCH 192/843] [ELF] Rename LazyObject to LazySymbol. NFC LazySymbol (used by wasm port) is a more accurate name (the struct is not about an object). However, the ELF port calls this LazyObject likely because we used to have LazyArchive (removed by https://reviews.llvm.org/D119074), and LazyObject has a similar naming convention (LazyObjectSymbol would be better, but it is too long). 
Reviewers: dschuff Pull Request: https://github.com/llvm/llvm-project/pull/78809 --- lld/ELF/InputFiles.cpp | 4 ++-- lld/ELF/Symbols.cpp | 6 +++--- lld/ELF/Symbols.h | 22 +++++++++++----------- 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index cc2c5916e05c2..147afcf306980 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1739,7 +1739,7 @@ void BitcodeFile::parseLazy() { for (auto [i, irSym] : llvm::enumerate(obj->symbols())) if (!irSym.isUndefined()) { auto *sym = symtab.insert(saver().save(irSym.getName())); - sym->resolve(LazyObject{*this}); + sym->resolve(LazySymbol{*this}); symbols[i] = sym; } } @@ -1821,7 +1821,7 @@ template void ObjFile::parseLazy() { if (eSyms[i].st_shndx == SHN_UNDEF) continue; symbols[i] = symtab.insert(CHECK(eSyms[i].getName(stringTable), this)); - symbols[i]->resolve(LazyObject{*this}); + symbols[i]->resolve(LazySymbol{*this}); if (!lazy) break; } diff --git a/lld/ELF/Symbols.cpp b/lld/ELF/Symbols.cpp index 734ca6bfcb403..cd2b9e22ab322 100644 --- a/lld/ELF/Symbols.cpp +++ b/lld/ELF/Symbols.cpp @@ -40,7 +40,7 @@ LLVM_ATTRIBUTE_UNUSED static inline void assertSymbols() { AssertSymbol(); AssertSymbol(); AssertSymbol(); - AssertSymbol(); + AssertSymbol(); } // Returns a symbol for an error message. @@ -146,7 +146,7 @@ static uint64_t getSymVA(const Symbol &sym, int64_t addend) { case Symbol::SharedKind: case Symbol::UndefinedKind: return 0; - case Symbol::LazyObjectKind: + case Symbol::LazyKind: llvm_unreachable("lazy symbol reached writer"); case Symbol::CommonKind: llvm_unreachable("common symbol reached writer"); @@ -623,7 +623,7 @@ void Symbol::resolve(const Defined &other) { other.overwrite(*this); } -void Symbol::resolve(const LazyObject &other) { +void Symbol::resolve(const LazySymbol &other) { if (isPlaceholder()) { other.overwrite(*this); return; diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h index e34a31e12f99a..ec541de5c4613 100644 --- a/lld/ELF/Symbols.h +++ b/lld/ELF/Symbols.h @@ -37,7 +37,7 @@ class InputSectionBase; class SharedSymbol; class Symbol; class Undefined; -class LazyObject; +class LazySymbol; class InputFile; void printTraceSymbol(const Symbol &sym, StringRef name); @@ -76,7 +76,7 @@ class Symbol { CommonKind, SharedKind, UndefinedKind, - LazyObjectKind, + LazyKind, }; Kind kind() const { return static_cast(symbolKind); } @@ -181,7 +181,7 @@ class Symbol { bool isLocal() const { return binding == llvm::ELF::STB_LOCAL; } - bool isLazy() const { return symbolKind == LazyObjectKind; } + bool isLazy() const { return symbolKind == LazyKind; } // True if this is an undefined weak symbol. This only works once // all input files have been added. @@ -237,7 +237,7 @@ class Symbol { void resolve(const Undefined &other); void resolve(const CommonSymbol &other); void resolve(const Defined &other); - void resolve(const LazyObject &other); + void resolve(const LazySymbol &other); void resolve(const SharedSymbol &other); // If this is a lazy symbol, extract an input file and add the symbol @@ -471,7 +471,7 @@ class SharedSymbol : public Symbol { uint32_t alignment; }; -// LazyObject symbols represent symbols in object files between --start-lib and +// LazySymbol symbols represent symbols in object files between --start-lib and // --end-lib options. LLD also handles traditional archives as if all the files // in the archive are surrounded by --start-lib and --end-lib. // @@ -480,14 +480,14 @@ class SharedSymbol : public Symbol { // and the lazy. 
We represent that with a lazy symbol with a weak binding. This // means that code looking for undefined symbols normally also has to take lazy // symbols into consideration. -class LazyObject : public Symbol { +class LazySymbol : public Symbol { public: - LazyObject(InputFile &file) - : Symbol(LazyObjectKind, &file, {}, llvm::ELF::STB_GLOBAL, + LazySymbol(InputFile &file) + : Symbol(LazyKind, &file, {}, llvm::ELF::STB_GLOBAL, llvm::ELF::STV_DEFAULT, llvm::ELF::STT_NOTYPE) {} - void overwrite(Symbol &sym) const { Symbol::overwrite(sym, LazyObjectKind); } + void overwrite(Symbol &sym) const { Symbol::overwrite(sym, LazyKind); } - static bool classof(const Symbol *s) { return s->kind() == LazyObjectKind; } + static bool classof(const Symbol *s) { return s->kind() == LazyKind; } }; // Some linker-generated symbols need to be created as @@ -541,7 +541,7 @@ union SymbolUnion { alignas(CommonSymbol) char b[sizeof(CommonSymbol)]; alignas(Undefined) char c[sizeof(Undefined)]; alignas(SharedSymbol) char d[sizeof(SharedSymbol)]; - alignas(LazyObject) char e[sizeof(LazyObject)]; + alignas(LazySymbol) char e[sizeof(LazySymbol)]; }; template Defined *makeDefined(T &&...args) { From 123ab34abc68e2e64720f9690aed5cb9e62a035d Mon Sep 17 00:00:00 2001 From: "Stephan T. Lavavej" Date: Fri, 19 Jan 2024 16:17:03 -0800 Subject: [PATCH 193/843] [libc++][test] Move format.functions ASCII tests to `libcxx/test/libcxx` (#78661) As @cpplearner explained in microsoft/STL#4328: > libc++'s "ascii" mode (controlled by the `_LIBCPP_HAS_NO_UNICODE` > macro) means "every code unit outside ASCII is treated as a valid > printable character". AFAIK we \[MSVC's STL\] don't support such a mode. Because these files are testing a non-Standard mode, they should be moved from `libcxx/test/std` to `libcxx/test/libcxx`. 
--- .../utilities/format/format.functions/ascii.pass.cpp | 0 .../format/format.functions/escaped_output.ascii.pass.cpp | 2 +- 2 files changed, 1 insertion(+), 1 deletion(-) rename libcxx/test/{std => libcxx}/utilities/format/format.functions/ascii.pass.cpp (100%) rename libcxx/test/{std => libcxx}/utilities/format/format.functions/escaped_output.ascii.pass.cpp (99%) diff --git a/libcxx/test/std/utilities/format/format.functions/ascii.pass.cpp b/libcxx/test/libcxx/utilities/format/format.functions/ascii.pass.cpp similarity index 100% rename from libcxx/test/std/utilities/format/format.functions/ascii.pass.cpp rename to libcxx/test/libcxx/utilities/format/format.functions/ascii.pass.cpp diff --git a/libcxx/test/std/utilities/format/format.functions/escaped_output.ascii.pass.cpp b/libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp similarity index 99% rename from libcxx/test/std/utilities/format/format.functions/escaped_output.ascii.pass.cpp rename to libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp index 1d92c7c96c2a7..32352774a08f9 100644 --- a/libcxx/test/std/utilities/format/format.functions/escaped_output.ascii.pass.cpp +++ b/libcxx/test/libcxx/utilities/format/format.functions/escaped_output.ascii.pass.cpp @@ -226,7 +226,7 @@ void test_char() { static_assert(sizeof(CharT) == 4, "add support for unexpected size"); // Unicode fitting in a 32-bit wchar_t - constexpr wchar_t x = 0x1ffff; + constexpr wchar_t x = 0x1ffff; constexpr std::uint32_t y = 0x1ffff; static_assert(x == y); From c71a5bf940d2246c53d4cbb32cff21e52cc5635d Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 19 Jan 2024 16:18:28 -0800 Subject: [PATCH 194/843] [msan] Unpoison indirect outputs for userspace when -msan-handle-asm-conservative is specified (#77393) KMSAN defaults to `msan-handle-asm-conservative`, which inserts `__msan_instrument_asm_store` calls to unpoison indirect outputs in inline assembly (e.g. `=m` constraints in source). ```c unsigned f() { unsigned v; // __msan_instrument_asm_store unpoisons v before invoking the asm. asm("movl $1,%0" : "=m"(v)); return v; } ``` Extend the mechanism to userspace, but require explicit `-mllvm -msan-handle-asm-conservative` for experiments for now. As https://docs.kernel.org/dev-tools/kmsan.html#inline-assembly-instrumentation says, this approach may mask certain errors (an indirect output may not actually be initialized), but it also helps to avoid a lot of false positives. Link: https://github.com/google/sanitizers/issues/192 --- .../Instrumentation/MemorySanitizer.cpp | 16 ++- .../MemorySanitizer/msan_asm_conservative.ll | 106 ++++++++++-------- 2 files changed, 73 insertions(+), 49 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 5b3f26a5f54eb..15bca538860d9 100644 --- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -4103,7 +4103,11 @@ struct MemorySanitizerVisitor : public InstVisitor { // do the usual thing: check argument shadow and mark all outputs as // clean. Note that any side effects of the inline asm that are not // immediately visible in its constraints are not handled. - if (ClHandleAsmConservative && MS.CompileKernel) + // For now, handle inline asm by default for KMSAN. + bool HandleAsm = ClHandleAsmConservative.getNumOccurrences() + ? 
ClHandleAsmConservative + : MS.CompileKernel; + if (HandleAsm) visitAsmInstruction(CB); else visitInstruction(CB); @@ -4557,7 +4561,15 @@ struct MemorySanitizerVisitor : public InstVisitor { return; Value *SizeVal = IRB.CreateTypeSize(MS.IntptrTy, DL.getTypeStoreSize(ElemTy)); - IRB.CreateCall(MS.MsanInstrumentAsmStoreFn, {Operand, SizeVal}); + if (MS.CompileKernel) { + IRB.CreateCall(MS.MsanInstrumentAsmStoreFn, {Operand, SizeVal}); + } else { + // ElemTy, derived from elementtype(), does not encode the alignment of + // the pointer. Conservatively assume that the shadow memory is unaligned. + auto [ShadowPtr, _] = + getShadowOriginPtrUserspace(Operand, IRB, IRB.getInt8Ty(), Align(1)); + IRB.CreateAlignedStore(getCleanShadow(ElemTy), ShadowPtr, Align(1)); + } } /// Get the number of output arguments returned by pointers. diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll index 1239698f3ac32..9a501ee6954c9 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll @@ -1,10 +1,14 @@ ; Test for handling of asm constraints in MSan instrumentation. +; RUN: opt < %s -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | \ +; RUN: FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | \ +; RUN: FileCheck --check-prefixes=CHECK,USER-CONS %s ; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: "-check-prefix=CHECK" %s +; RUN: -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck \ +; RUN: --check-prefixes=CHECK,KMSAN %s ; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck \ -; RUN: "-check-prefixes=CHECK,CHECK-CONS" %s +; RUN: -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck \ +; RUN: --check-prefixes=CHECK,KMSAN,CHECK-CONS %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" @@ -46,9 +50,9 @@ entry: ; CHECK: [[IS1_F1:%.*]] = load i32, ptr @is1, align 4 ; CHECK: call void @__msan_warning ; CHECK: call i32 asm "",{{.*}}(i32 [[IS1_F1]]) -; CHECK: [[PACK1_F1:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}}) -; CHECK: [[EXT1_F1:%.*]] = extractvalue { ptr, ptr } [[PACK1_F1]], 0 -; CHECK: store i32 0, ptr [[EXT1_F1]] +; KMSAN: [[PACK1_F1:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}}) +; KMSAN: [[EXT1_F1:%.*]] = extractvalue { ptr, ptr } [[PACK1_F1]], 0 +; KMSAN: store i32 0, ptr [[EXT1_F1]] ; Two input registers, two output registers: @@ -69,14 +73,14 @@ entry: ; CHECK: [[IS1_F2:%.*]] = load i32, ptr @is1, align 4 ; CHECK: [[IS2_F2:%.*]] = load i32, ptr @is2, align 4 ; CHECK: call void @__msan_warning -; CHECK: call void @__msan_warning +; KMSAN: call void @__msan_warning ; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[IS1_F2]], i32 [[IS2_F2]]) -; CHECK: [[PACK1_F2:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}}) -; CHECK: [[EXT1_F2:%.*]] = extractvalue { ptr, ptr } [[PACK1_F2]], 0 -; CHECK: store i32 0, ptr [[EXT1_F2]] -; CHECK: [[PACK2_F2:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}}) -; CHECK: [[EXT2_F2:%.*]] = extractvalue { ptr, ptr } [[PACK2_F2]], 0 -; CHECK: store i32 0, ptr 
[[EXT2_F2]] +; KMSAN: [[PACK1_F2:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}}) +; KMSAN: [[EXT1_F2:%.*]] = extractvalue { ptr, ptr } [[PACK1_F2]], 0 +; KMSAN: store i32 0, ptr [[EXT1_F2]] +; KMSAN: [[PACK2_F2:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}}) +; KMSAN: [[EXT2_F2:%.*]] = extractvalue { ptr, ptr } [[PACK2_F2]], 0 +; KMSAN: store i32 0, ptr [[EXT2_F2]] ; Input same as output, used twice: ; asm("" : "=r" (id1), "=r" (id2) : "r" (id1), "r" (id2)); @@ -96,14 +100,14 @@ entry: ; CHECK: [[ID1_F3:%.*]] = load i32, ptr @id1, align 4 ; CHECK: [[ID2_F3:%.*]] = load i32, ptr @id2, align 4 ; CHECK: call void @__msan_warning -; CHECK: call void @__msan_warning +; KMSAN: call void @__msan_warning ; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[ID1_F3]], i32 [[ID2_F3]]) -; CHECK: [[PACK1_F3:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}}) -; CHECK: [[EXT1_F3:%.*]] = extractvalue { ptr, ptr } [[PACK1_F3]], 0 -; CHECK: store i32 0, ptr [[EXT1_F3]] -; CHECK: [[PACK2_F3:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}}) -; CHECK: [[EXT2_F3:%.*]] = extractvalue { ptr, ptr } [[PACK2_F3]], 0 -; CHECK: store i32 0, ptr [[EXT2_F3]] +; KMSAN: [[PACK1_F3:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}}) +; KMSAN: [[EXT1_F3:%.*]] = extractvalue { ptr, ptr } [[PACK1_F3]], 0 +; KMSAN: store i32 0, ptr [[EXT1_F3]] +; KMSAN: [[PACK2_F3:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}}) +; KMSAN: [[EXT2_F3:%.*]] = extractvalue { ptr, ptr } [[PACK2_F3]], 0 +; KMSAN: store i32 0, ptr [[EXT2_F3]] ; One of the input registers is also an output: @@ -124,14 +128,14 @@ entry: ; CHECK: [[ID1_F4:%.*]] = load i32, ptr @id1, align 4 ; CHECK: [[IS1_F4:%.*]] = load i32, ptr @is1, align 4 ; CHECK: call void @__msan_warning -; CHECK: call void @__msan_warning +; KMSAN: call void @__msan_warning ; CHECK: call { i32, i32 } asm "",{{.*}}(i32 [[ID1_F4]], i32 [[IS1_F4]]) -; CHECK: [[PACK1_F4:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}}) -; CHECK: [[EXT1_F4:%.*]] = extractvalue { ptr, ptr } [[PACK1_F4]], 0 -; CHECK: store i32 0, ptr [[EXT1_F4]] -; CHECK: [[PACK2_F4:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}}) -; CHECK: [[EXT2_F4:%.*]] = extractvalue { ptr, ptr } [[PACK2_F4]], 0 -; CHECK: store i32 0, ptr [[EXT2_F4]] +; KMSAN: [[PACK1_F4:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}}) +; KMSAN: [[EXT1_F4:%.*]] = extractvalue { ptr, ptr } [[PACK1_F4]], 0 +; KMSAN: store i32 0, ptr [[EXT1_F4]] +; KMSAN: [[PACK2_F4:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}}) +; KMSAN: [[EXT2_F4:%.*]] = extractvalue { ptr, ptr } [[PACK2_F4]], 0 +; KMSAN: store i32 0, ptr [[EXT2_F4]] ; One input register, three output registers: @@ -153,15 +157,15 @@ entry: ; CHECK: [[IS1_F5:%.*]] = load i32, ptr @is1, align 4 ; CHECK: call void @__msan_warning ; CHECK: call { i32, i32, i32 } asm "",{{.*}}(i32 [[IS1_F5]]) -; CHECK: [[PACK1_F5:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}}) -; CHECK: [[EXT1_F5:%.*]] = extractvalue { ptr, ptr } [[PACK1_F5]], 0 -; CHECK: store i32 0, ptr [[EXT1_F5]] -; CHECK: [[PACK2_F5:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}}) -; CHECK: [[EXT2_F5:%.*]] = extractvalue { ptr, ptr } [[PACK2_F5]], 0 -; CHECK: store i32 0, ptr [[EXT2_F5]] -; CHECK: [[PACK3_F5:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id3{{.*}}) -; CHECK: [[EXT3_F5:%.*]] = 
extractvalue { ptr, ptr } [[PACK3_F5]], 0 -; CHECK: store i32 0, ptr [[EXT3_F5]] +; KMSAN: [[PACK1_F5:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id1{{.*}}) +; KMSAN: [[EXT1_F5:%.*]] = extractvalue { ptr, ptr } [[PACK1_F5]], 0 +; KMSAN: store i32 0, ptr [[EXT1_F5]] +; KMSAN: [[PACK2_F5:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id2{{.*}}) +; KMSAN: [[EXT2_F5:%.*]] = extractvalue { ptr, ptr } [[PACK2_F5]], 0 +; KMSAN: store i32 0, ptr [[EXT2_F5]] +; KMSAN: [[PACK3_F5:%.*]] = call {{.*}} @__msan_metadata_ptr_for_store_4({{.*}}@id3{{.*}}) +; KMSAN: [[EXT3_F5:%.*]] = extractvalue { ptr, ptr } [[PACK3_F5]], 0 +; KMSAN: store i32 0, ptr [[EXT3_F5]] ; 2 input memory args, 2 output memory args: @@ -173,6 +177,8 @@ entry: } ; CHECK-LABEL: @f_2i_2o_mem +; USER-CONS: store i32 0, ptr inttoptr (i64 xor (i64 ptrtoint (ptr @id1 to i64), i64 87960930222080) to ptr), align 1 +; USER-CONS: store i32 0, ptr inttoptr (i64 xor (i64 ptrtoint (ptr @id2 to i64), i64 87960930222080) to ptr), align 1 ; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id1{{.*}}, i64 4) ; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id2{{.*}}, i64 4) ; CHECK: call void asm "", "=*m,=*m,*m,*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) @id1, ptr elementtype(i32) @id2, ptr elementtype(i32) @is1, ptr elementtype(i32) @is2) @@ -190,6 +196,7 @@ entry: ; CHECK-LABEL: @f_1i_1o_memreg ; CHECK: [[IS1_F7:%.*]] = load i32, ptr @is1, align 4 +; USER-CONS: store i32 0, ptr inttoptr (i64 xor (i64 ptrtoint (ptr @id1 to i64), i64 87960930222080) to ptr), align 1 ; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@id1{{.*}}, i64 4) ; CHECK: call void @__msan_warning ; CHECK: call i32 asm "", "=r,=*m,r,*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) @id1, i32 [[IS1_F7]], ptr elementtype(i32) @is1) @@ -208,6 +215,7 @@ entry: } ; CHECK-LABEL: @f_3o_reg_mem_reg +; USER-CONS: store i32 0, ptr inttoptr (i64 xor (i64 ptrtoint (ptr @id2 to i64), i64 87960930222080) to ptr), align 1 ; CHECK-CONS: call void @__msan_instrument_asm_store(ptr @id2, i64 4) ; CHECK: call { i32, i32 } asm "", "=r,=*m,=r,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(i32) @id2) @@ -232,10 +240,11 @@ entry: ; CHECK: [[PAIR1_F9:%.*]] = load {{.*}} @pair1 ; CHECK: [[C1_F9:%.*]] = load {{.*}} @c1 ; CHECK: [[MEMCPY_S1_F9:%.*]] = load {{.*}} @memcpy_s1 +; USER-CONS: store { i32, i32 } zeroinitializer, ptr inttoptr (i64 xor (i64 ptrtoint (ptr @pair2 to i64), i64 87960930222080) to ptr), align 1 ; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@pair2{{.*}}, i64 8) ; CHECK: call void @__msan_warning -; CHECK: call void @__msan_warning -; CHECK: call void @__msan_warning +; KMSAN: call void @__msan_warning +; KMSAN: call void @__msan_warning ; CHECK: call { i8, ptr } asm "", "=*r,=r,=r,r,r,r,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(%struct.pair) @pair2, {{.*}}[[PAIR1_F9]], i8 [[C1_F9]], {{.*}} [[MEMCPY_S1_F9]]) ; Three inputs and three outputs of different types: a pair, a char, a function pointer. 
@@ -248,9 +257,12 @@ entry: } ; CHECK-LABEL: @f_3i_3o_complex_mem -; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@pair2{{.*}}, i64 8) -; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@c2{{.*}}, i64 1) -; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@memcpy_d1{{.*}}, i64 8) +; USER-CONS: store { i32, i32 } zeroinitializer, ptr inttoptr (i64 xor (i64 ptrtoint (ptr @pair2 to i64), i64 87960930222080) to ptr), align 1 +; USER-CONS-NEXT: store i8 0, ptr inttoptr (i64 xor (i64 ptrtoint (ptr @c2 to i64), i64 87960930222080) to ptr), align 1 +; USER-CONS-NEXT: store i64 0, ptr inttoptr (i64 xor (i64 ptrtoint (ptr @memcpy_d1 to i64), i64 87960930222080) to ptr), align 1 +; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@pair2{{.*}}, i64 8) +; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@c2{{.*}}, i64 1) +; CHECK-CONS: call void @__msan_instrument_asm_store({{.*}}@memcpy_d1{{.*}}, i64 8) ; CHECK: call void asm "", "=*m,=*m,=*m,*m,*m,*m,~{dirflag},~{fpsr},~{flags}"(ptr elementtype(%struct.pair) @pair2, ptr elementtype(i8) @c2, ptr elementtype(ptr) @memcpy_d1, ptr elementtype(%struct.pair) @pair1, ptr elementtype(i8) @c1, ptr elementtype(ptr) @memcpy_s1) @@ -278,8 +290,8 @@ cleanup: ; preds = %entry, %skip_label } ; CHECK-LABEL: @asm_goto -; CHECK: [[LOAD_ARG:%.*]] = load {{.*}} %_msarg -; CHECK: [[CMP:%.*]] = icmp ne {{.*}} [[LOAD_ARG]], 0 -; CHECK: br {{.*}} [[CMP]], label %[[LABEL:.*]], label -; CHECK: [[LABEL]]: -; CHECK-NEXT: call void @__msan_warning +; KMSAN: [[LOAD_ARG:%.*]] = load {{.*}} %_msarg +; KMSAN: [[CMP:%.*]] = icmp ne {{.*}} [[LOAD_ARG]], 0 +; KMSAN: br {{.*}} [[CMP]], label %[[LABEL:.*]], label +; KMSAN: [[LABEL]]: +; KMSAN-NEXT: call void @__msan_warning From bcc9b9d80c61ea6521215e9826281041a3f73b05 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Fri, 19 Jan 2024 16:20:29 -0800 Subject: [PATCH 195/843] [lld][WebAssembly] Match the ELF linker in transitioning away from archive indexes. (#78658) The ELF linker transitioned away from archive indexes in https://reviews.llvm.org/D117284. This paves the way for supporting `--start-lib`/`--end-lib` (See #77960) The ELF linker unified library handling with `--start-lib`/`--end-lib` and removed the ArchiveFile class in https://reviews.llvm.org/D119074. --- lld/docs/ReleaseNotes.rst | 4 ++ lld/test/wasm/archive-no-index.s | 14 ---- lld/test/wasm/bad-archive-member.s | 2 +- lld/wasm/Driver.cpp | 29 ++++----- lld/wasm/InputFiles.cpp | 101 ++++++++++++++--------------- lld/wasm/InputFiles.h | 41 +++--------- lld/wasm/SymbolTable.cpp | 23 ++++--- lld/wasm/SymbolTable.h | 2 +- lld/wasm/Symbols.cpp | 16 ++--- lld/wasm/Symbols.h | 9 +-- 10 files changed, 99 insertions(+), 142 deletions(-) delete mode 100644 lld/test/wasm/archive-no-index.s diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index c322b776ff58f..01669543cd50c 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -50,5 +50,9 @@ MachO Improvements WebAssembly Improvements ------------------------ +* Indexes are no longer required on archive files. Instead symbol information + is read from object files within the archive. This matches the behaviour of + the ELF linker. 
+ Fixes ##### diff --git a/lld/test/wasm/archive-no-index.s b/lld/test/wasm/archive-no-index.s deleted file mode 100644 index 99ca5a367d3c6..0000000000000 --- a/lld/test/wasm/archive-no-index.s +++ /dev/null @@ -1,14 +0,0 @@ -# Tests error on archive file without a symbol table -# RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown -o %t.o %s -# RUN: llvm-as -o %t.archive.o %S/Inputs/archive1.ll -# RUN: rm -f %t.a -# RUN: llvm-ar crS %t.a %t.archive.o - -# RUN: not wasm-ld -o out.wasm %t.o %t.a 2>&1 | FileCheck %s - - .globl _start -_start: - .functype _start () -> () - end_function - -# CHECK: archive has no index; run ranlib to add one diff --git a/lld/test/wasm/bad-archive-member.s b/lld/test/wasm/bad-archive-member.s index 029027a8517a3..77bf16871ca5b 100644 --- a/lld/test/wasm/bad-archive-member.s +++ b/lld/test/wasm/bad-archive-member.s @@ -5,7 +5,7 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux -o %t.dir/elf.o %s # RUN: llvm-ar rcs %t.dir/libfoo.a %t.dir/elf.o # RUN: not wasm-ld %t.dir/libfoo.a -o /dev/null 2>&1 | FileCheck %s -# CHECK: error: unknown file type: {{.*}}libfoo.a(elf.o) +# CHECK: warning: {{.*}}libfoo.a: archive member 'elf.o' is neither Wasm object file nor LLVM bitcode .globl _start _start: diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 32c042b5695a4..edf1979c1d302 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -272,9 +272,11 @@ void LinkerDriver::addFile(StringRef path) { if (fs::exists(importFile)) readImportFile(importFile.str()); + auto members = getArchiveMembers(mbref); + // Handle -whole-archive. if (inWholeArchive) { - for (const auto &[m, offset] : getArchiveMembers(mbref)) { + for (const auto &[m, offset] : members) { auto *object = createObjectFile(m, path, offset); // Mark object as live; object members are normally not // live by default but -whole-archive is designed to treat @@ -289,12 +291,15 @@ void LinkerDriver::addFile(StringRef path) { std::unique_ptr file = CHECK(Archive::create(mbref), path + ": failed to parse archive"); - if (!file->isEmpty() && !file->hasSymbolTable()) { - error(mbref.getBufferIdentifier() + - ": archive has no index; run ranlib to add one"); + for (const auto &[m, offset] : members) { + auto magic = identify_magic(m.getBuffer()); + if (magic == file_magic::wasm_object || magic == file_magic::bitcode) + files.push_back(createObjectFile(m, path, offset, true)); + else + warn(path + ": archive member '" + m.getBufferIdentifier() + + "' is neither Wasm object file nor LLVM bitcode"); } - files.push_back(make(mbref)); return; } case file_magic::bitcode: @@ -732,16 +737,10 @@ static Symbol *handleUndefined(StringRef name, const char *option) { static void handleLibcall(StringRef name) { Symbol *sym = symtab->find(name); - if (!sym) - return; - - if (auto *lazySym = dyn_cast(sym)) { - MemoryBufferRef mb = lazySym->getMemberBuffer(); - if (isBitcode(mb)) { - if (!config->whyExtract.empty()) - ctx.whyExtractRecords.emplace_back("", sym->getFile(), *sym); - lazySym->extract(); - } + if (sym && sym->isLazy() && isa(sym->getFile())) { + if (!config->whyExtract.empty()) + ctx.whyExtractRecords.emplace_back("", sym->getFile(), *sym); + cast(sym)->extract(); } } diff --git a/lld/wasm/InputFiles.cpp b/lld/wasm/InputFiles.cpp index 19c76e4902789..f5e946aca8b2a 100644 --- a/lld/wasm/InputFiles.cpp +++ b/lld/wasm/InputFiles.cpp @@ -75,7 +75,7 @@ std::optional readFile(StringRef path) { } InputFile *createObjectFile(MemoryBufferRef mb, StringRef archiveName, - uint64_t offsetInArchive) { + uint64_t 
offsetInArchive, bool lazy) { file_magic magic = identify_magic(mb.getBuffer()); if (magic == file_magic::wasm_object) { std::unique_ptr bin = @@ -83,18 +83,11 @@ InputFile *createObjectFile(MemoryBufferRef mb, StringRef archiveName, auto *obj = cast(bin.get()); if (obj->isSharedObject()) return make(mb); - return make(mb, archiveName); + return make(mb, archiveName, lazy); } - if (magic == file_magic::bitcode) - return make(mb, archiveName, offsetInArchive); - - std::string name = mb.getBufferIdentifier().str(); - if (!archiveName.empty()) { - name = archiveName.str() + "(" + name + ")"; - } - - fatal("unknown file type: " + name); + assert(magic == file_magic::bitcode); + return make(mb, archiveName, offsetInArchive, lazy); } // Relocations contain either symbol or type indices. This function takes a @@ -391,9 +384,30 @@ static bool shouldMerge(const WasmSegment &seg) { return true; } -void ObjFile::parse(bool ignoreComdats) { - // Parse a memory buffer as a wasm file. - LLVM_DEBUG(dbgs() << "Parsing object: " << toString(this) << "\n"); +void ObjFile::parseLazy() { + LLVM_DEBUG(dbgs() << "ObjFile::parseLazy: " << toString(this) << "\n"); + for (const SymbolRef &sym : wasmObj->symbols()) { + const WasmSymbol &wasmSym = wasmObj->getWasmSymbol(sym.getRawDataRefImpl()); + if (!wasmSym.isDefined()) + continue; + symtab->addLazy(wasmSym.Info.Name, this); + // addLazy() may trigger this->extract() if an existing symbol is an + // undefined symbol. If that happens, this function has served its purpose, + // and we can exit from the loop early. + if (!lazy) + break; + } +} + +ObjFile::ObjFile(MemoryBufferRef m, StringRef archiveName, bool lazy) + : InputFile(ObjectKind, m) { + this->lazy = lazy; + this->archiveName = std::string(archiveName); + + // If this isn't part of an archive, it's eagerly linked, so mark it live. + if (archiveName.empty()) + markLive(); + std::unique_ptr bin = CHECK(createBinary(mb), toString(this)); auto *obj = dyn_cast(bin.get()); @@ -406,6 +420,11 @@ void ObjFile::parse(bool ignoreComdats) { wasmObj.reset(obj); checkArch(obj->getArch()); +} + +void ObjFile::parse(bool ignoreComdats) { + // Parse a memory buffer as a wasm file. + LLVM_DEBUG(dbgs() << "ObjFile::parse: " << toString(this) << "\n"); // Build up a map of function indices to table indices for use when // verifying the existing table index relocations @@ -717,43 +736,6 @@ void StubFile::parse() { } } -void ArchiveFile::parse() { - // Parse a MemoryBufferRef as an archive file. - LLVM_DEBUG(dbgs() << "Parsing library: " << toString(this) << "\n"); - file = CHECK(Archive::create(mb), toString(this)); - - // Read the symbol table to construct Lazy symbols. - int count = 0; - for (const Archive::Symbol &sym : file->symbols()) { - symtab->addLazy(this, &sym); - ++count; - } - LLVM_DEBUG(dbgs() << "Read " << count << " symbols\n"); - (void) count; -} - -void ArchiveFile::addMember(const Archive::Symbol *sym) { - const Archive::Child &c = - CHECK(sym->getMember(), - "could not get the member for symbol " + sym->getName()); - - // Don't try to load the same member twice (this can happen when members - // mutually reference each other). 
-  if (!seen.insert(c.getChildOffset()).second)
-    return;
-
-  LLVM_DEBUG(dbgs() << "loading lazy: " << sym->getName() << "\n");
-  LLVM_DEBUG(dbgs() << "from archive: " << toString(this) << "\n");
-
-  MemoryBufferRef mb =
-      CHECK(c.getMemoryBufferRef(),
-            "could not get the buffer for the member defining symbol " +
-                sym->getName());
-
-  InputFile *obj = createObjectFile(mb, getName(), c.getChildOffset());
-  symtab->addFile(obj, sym->getName());
-}
-
 static uint8_t mapVisibility(GlobalValue::VisibilityTypes gvVisibility) {
   switch (gvVisibility) {
   case GlobalValue::DefaultVisibility:
@@ -790,8 +772,9 @@ static Symbol *createBitcodeSymbol(const std::vector &keptComdats,
 }
 
 BitcodeFile::BitcodeFile(MemoryBufferRef m, StringRef archiveName,
-                         uint64_t offsetInArchive)
+                         uint64_t offsetInArchive, bool lazy)
     : InputFile(BitcodeKind, m) {
+  this->lazy = lazy;
   this->archiveName = std::string(archiveName);
 
   std::string path = mb.getBufferIdentifier().str();
@@ -817,6 +800,20 @@ BitcodeFile::BitcodeFile(MemoryBufferRef m, StringRef archiveName,
 
 bool BitcodeFile::doneLTO = false;
 
+void BitcodeFile::parseLazy() {
+  for (auto [i, irSym] : llvm::enumerate(obj->symbols())) {
+    if (irSym.isUndefined())
+      continue;
+    StringRef name = saver().save(irSym.getName());
+    symtab->addLazy(name, this);
+    // addLazy() may trigger this->extract() if an existing symbol is an
+    // undefined symbol. If that happens, this function has served its purpose,
+    // and we can exit from the loop early.
+    if (!lazy)
+      break;
+  }
+}
+
 void BitcodeFile::parse(StringRef symName) {
   if (doneLTO) {
     error(toString(this) + ": attempt to add bitcode file after LTO (" + symName + ")");
diff --git a/lld/wasm/InputFiles.h b/lld/wasm/InputFiles.h
index d9a8b53066032..fd3d5e5ef4796 100644
--- a/lld/wasm/InputFiles.h
+++ b/lld/wasm/InputFiles.h
@@ -14,7 +14,6 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/DenseSet.h"
 #include "llvm/LTO/LTO.h"
-#include "llvm/Object/Archive.h"
 #include "llvm/Object/Wasm.h"
 #include "llvm/Support/MemoryBuffer.h"
 #include "llvm/TargetParser/Triple.h"
@@ -45,7 +44,6 @@ class InputFile {
   enum Kind {
     ObjectKind,
     SharedKind,
-    ArchiveKind,
     BitcodeKind,
     StubKind,
   };
@@ -69,6 +67,11 @@ class InputFile {
   void markLive() { live = true; }
   bool isLive() const { return live; }
 
+  // True if this file exists inside an archive file and has not yet been
+  // extracted.
+  // TODO(sbc): Use this to implement --start-lib/--end-lib.
+  bool lazy = false;
+
 protected:
   InputFile(Kind k, MemoryBufferRef m)
       : mb(m), fileKind(k), live(!config->gcSections) {}
@@ -85,35 +88,14 @@ class InputFile {
   bool live;
 };
 
-// .a file (ar archive)
-class ArchiveFile : public InputFile {
-public:
-  explicit ArchiveFile(MemoryBufferRef m) : InputFile(ArchiveKind, m) {}
-  static bool classof(const InputFile *f) { return f->kind() == ArchiveKind; }
-
-  void addMember(const llvm::object::Archive::Symbol *sym);
-
-  void parse();
-
-private:
-  std::unique_ptr file;
-  llvm::DenseSet seen;
-};
-
 // .o file (wasm object file)
 class ObjFile : public InputFile {
 public:
-  explicit ObjFile(MemoryBufferRef m, StringRef archiveName)
-      : InputFile(ObjectKind, m) {
-    this->archiveName = std::string(archiveName);
-
-    // If this isn't part of an archive, it's eagerly linked, so mark it live.
- if (archiveName.empty()) - markLive(); - } + ObjFile(MemoryBufferRef m, StringRef archiveName, bool lazy = false); static bool classof(const InputFile *f) { return f->kind() == ObjectKind; } void parse(bool ignoreComdats = false); + void parseLazy(); // Returns the underlying wasm file. const WasmObjectFile *getWasmObj() const { return wasmObj.get(); } @@ -173,10 +155,11 @@ class SharedFile : public InputFile { class BitcodeFile : public InputFile { public: BitcodeFile(MemoryBufferRef m, StringRef archiveName, - uint64_t offsetInArchive); + uint64_t offsetInArchive, bool lazy); static bool classof(const InputFile *f) { return f->kind() == BitcodeKind; } void parse(StringRef symName); + void parseLazy(); std::unique_ptr obj; // Set to true once LTO is complete in order prevent further bitcode objects @@ -196,14 +179,10 @@ class StubFile : public InputFile { llvm::DenseMap> symbolDependencies; }; -inline bool isBitcode(MemoryBufferRef mb) { - return identify_magic(mb.getBuffer()) == llvm::file_magic::bitcode; -} - // Will report a fatal() error if the input buffer is not a valid bitcode // or wasm object file. InputFile *createObjectFile(MemoryBufferRef mb, StringRef archiveName = "", - uint64_t offsetInArchive = 0); + uint64_t offsetInArchive = 0, bool lazy = false); // Opens a given file. std::optional readFile(StringRef path); diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp index 9988490e14b0b..c98aa3ee3a7a3 100644 --- a/lld/wasm/SymbolTable.cpp +++ b/lld/wasm/SymbolTable.cpp @@ -26,9 +26,13 @@ SymbolTable *symtab; void SymbolTable::addFile(InputFile *file, StringRef symName) { log("Processing: " + toString(file)); - // .a file - if (auto *f = dyn_cast(file)) { - f->parse(); + // Lazy object file + if (file->lazy) { + if (auto *f = dyn_cast(file)) { + f->parseLazy(); + } else { + cast(file)->parseLazy(); + } return; } @@ -737,16 +741,15 @@ TableSymbol *SymbolTable::resolveIndirectFunctionTable(bool required) { return nullptr; } -void SymbolTable::addLazy(ArchiveFile *file, const Archive::Symbol *sym) { - LLVM_DEBUG(dbgs() << "addLazy: " << sym->getName() << "\n"); - StringRef name = sym->getName(); +void SymbolTable::addLazy(StringRef name, InputFile *file) { + LLVM_DEBUG(dbgs() << "addLazy: " << name << "\n"); Symbol *s; bool wasInserted; std::tie(s, wasInserted) = insertName(name); if (wasInserted) { - replaceSymbol(s, name, 0, file, *sym); + replaceSymbol(s, name, 0, file); return; } @@ -763,15 +766,15 @@ void SymbolTable::addLazy(ArchiveFile *file, const Archive::Symbol *sym) { if (auto *f = dyn_cast(s)) oldSig = f->signature; LLVM_DEBUG(dbgs() << "replacing existing weak undefined symbol\n"); - auto newSym = replaceSymbol(s, name, WASM_SYMBOL_BINDING_WEAK, - file, *sym); + auto newSym = + replaceSymbol(s, name, WASM_SYMBOL_BINDING_WEAK, file); newSym->signature = oldSig; return; } LLVM_DEBUG(dbgs() << "replacing existing undefined\n"); const InputFile *oldFile = s->getFile(); - file->addMember(sym); + replaceSymbol(s, name, 0, file)->extract(); if (!config->whyExtract.empty()) ctx.whyExtractRecords.emplace_back(toString(oldFile), s->getFile(), *s); } diff --git a/lld/wasm/SymbolTable.h b/lld/wasm/SymbolTable.h index c5518ee23da26..42ebb8be8eb3f 100644 --- a/lld/wasm/SymbolTable.h +++ b/lld/wasm/SymbolTable.h @@ -83,7 +83,7 @@ class SymbolTable { TableSymbol *resolveIndirectFunctionTable(bool required); - void addLazy(ArchiveFile *f, const llvm::object::Archive::Symbol *sym); + void addLazy(StringRef name, InputFile *f); bool addComdat(StringRef name); diff --git 
a/lld/wasm/Symbols.cpp b/lld/wasm/Symbols.cpp index 47d8d09ab1bd4..ace6bade02d43 100644 --- a/lld/wasm/Symbols.cpp +++ b/lld/wasm/Symbols.cpp @@ -13,6 +13,7 @@ #include "InputFiles.h" #include "OutputSections.h" #include "OutputSegment.h" +#include "SymbolTable.h" #include "lld/Common/ErrorHandler.h" #include "lld/Common/Memory.h" #include "llvm/Demangle/Demangle.h" @@ -426,23 +427,16 @@ const OutputSectionSymbol *SectionSymbol::getOutputSectionSymbol() const { } void LazySymbol::extract() { - cast(file)->addMember(&archiveSymbol); + if (file->lazy) { + file->lazy = false; + symtab->addFile(file, name); + } } void LazySymbol::setWeak() { flags |= (flags & ~WASM_SYMBOL_BINDING_MASK) | WASM_SYMBOL_BINDING_WEAK; } -MemoryBufferRef LazySymbol::getMemberBuffer() { - Archive::Child c = - CHECK(archiveSymbol.getMember(), - "could not get the member for symbol " + toString(*this)); - - return CHECK(c.getMemoryBufferRef(), - "could not get the buffer for the member defining symbol " + - toString(*this)); -} - void printTraceSymbolUndefined(StringRef name, const InputFile* file) { message(toString(file) + ": reference to " + name); } diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h index 69ebfdb5bb356..de52c92d34e78 100644 --- a/lld/wasm/Symbols.h +++ b/lld/wasm/Symbols.h @@ -497,14 +497,12 @@ class UndefinedTag : public TagSymbol { // symbols into consideration. class LazySymbol : public Symbol { public: - LazySymbol(StringRef name, uint32_t flags, InputFile *file, - const llvm::object::Archive::Symbol &sym) - : Symbol(name, LazyKind, flags, file), archiveSymbol(sym) {} + LazySymbol(StringRef name, uint32_t flags, InputFile *file) + : Symbol(name, LazyKind, flags, file) {} static bool classof(const Symbol *s) { return s->kind() == LazyKind; } void extract(); void setWeak(); - MemoryBufferRef getMemberBuffer(); // Lazy symbols can have a signature because they can replace an // UndefinedFunction in which case we need to be able to preserve the @@ -512,9 +510,6 @@ class LazySymbol : public Symbol { // TODO(sbc): This repetition of the signature field is inelegant. Revisit // the use of class hierarchy to represent symbol taxonomy. 
const WasmSignature *signature = nullptr; - -private: - llvm::object::Archive::Symbol archiveSymbol; }; // linker-generated symbols From ddad7e30973c00f375620e4a646ed30cb724ef16 Mon Sep 17 00:00:00 2001 From: Jeremy Kun <2467754+j2kun@users.noreply.github.com> Date: Fri, 19 Jan 2024 16:31:08 -0800 Subject: [PATCH 196/843] [mlir][amdgpu] Fix bazel build (#78820) Broken by https://github.com/llvm/llvm-project/commit/b7360fbe8ca0c9411e89fafd654856c484f84f5e --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 94093570cfeaf..cd3236b3f5a3d 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -1530,10 +1530,16 @@ cc_library( ":AMDGPUUtils", ":ArithDialect", ":ControlFlowDialect", + ":FuncDialect", + ":GPUDialect", ":IR", + ":MemRefDialect", ":Pass", + ":Support", ":TransformUtils", ":Transforms", + ":VectorDialect", + "//llvm:Support", ], ) @@ -6272,7 +6278,7 @@ cc_library( cc_library( name = "NVVMToLLVM", - srcs = glob(["lib/Conversion/NVVMToLLVM/NVVMToLLVM.cpp"]), + srcs = ["lib/Conversion/NVVMToLLVM/NVVMToLLVM.cpp"], hdrs = glob(["include/mlir/Conversion/NVVMToLLVM/*.h"]), includes = ["include"], deps = [ From ab0d8fc4a6ec202159c36c892671f1568be4ae70 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Sat, 20 Jan 2024 08:38:22 +0800 Subject: [PATCH 197/843] Reland "[CodeGen] Support start/stop in CodeGenPassBuilder (#70912)" (#78570) Unfortunately the legacy pass system can't recognize `no-op-module` and `no-op-function` so it causes test failure in `CodeGenTests`. Add a workaround in function `PassInfo *getPassInfo(StringRef PassName)`, `TargetPassConfig.cpp`. --- .../include/llvm/CodeGen/CodeGenPassBuilder.h | 133 +++++++++++++++--- llvm/include/llvm/CodeGen/TargetPassConfig.h | 15 ++ llvm/lib/CodeGen/TargetPassConfig.cpp | 109 +++++--------- llvm/lib/Passes/PassBuilder.cpp | 4 +- 4 files changed, 164 insertions(+), 97 deletions(-) diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h index 78ee7bef02ab1..12088f6fc35e0 100644 --- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -44,6 +44,7 @@ #include "llvm/CodeGen/ShadowStackGCLowering.h" #include "llvm/CodeGen/SjLjEHPrepare.h" #include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/UnreachableBlockElim.h" #include "llvm/CodeGen/WasmEHPrepare.h" #include "llvm/CodeGen/WinEHPrepare.h" @@ -176,73 +177,80 @@ template class CodeGenPassBuilder { // Function object to maintain state while adding codegen IR passes. 
class AddIRPass { public: - AddIRPass(ModulePassManager &MPM) : MPM(MPM) {} + AddIRPass(ModulePassManager &MPM, const DerivedT &PB) : MPM(MPM), PB(PB) {} ~AddIRPass() { if (!FPM.isEmpty()) MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); } - template void operator()(PassT &&Pass) { + template + void operator()(PassT &&Pass, StringRef Name = PassT::name()) { static_assert((is_detected::value || is_detected::value) && "Only module pass and function pass are supported."); + if (!PB.runBeforeAdding(Name)) + return; + // Add Function Pass if constexpr (is_detected::value) { FPM.addPass(std::forward(Pass)); + + for (auto &C : PB.AfterCallbacks) + C(Name); } else { // Add Module Pass if (!FPM.isEmpty()) { MPM.addPass(createModuleToFunctionPassAdaptor(std::move(FPM))); FPM = FunctionPassManager(); } + MPM.addPass(std::forward(Pass)); + + for (auto &C : PB.AfterCallbacks) + C(Name); } } private: ModulePassManager &MPM; FunctionPassManager FPM; + const DerivedT &PB; }; // Function object to maintain state while adding codegen machine passes. class AddMachinePass { public: - AddMachinePass(MachineFunctionPassManager &PM) : PM(PM) {} + AddMachinePass(MachineFunctionPassManager &PM, const DerivedT &PB) + : PM(PM), PB(PB) {} template void operator()(PassT &&Pass) { static_assert( is_detected::value, "Machine function pass must define a static member variable `Key`."); - for (auto &C : BeforeCallbacks) - if (!C(&PassT::Key)) - return; + + if (!PB.runBeforeAdding(PassT::name())) + return; + PM.addPass(std::forward(Pass)); - for (auto &C : AfterCallbacks) - C(&PassT::Key); + + for (auto &C : PB.AfterCallbacks) + C(PassT::name()); } template void insertPass(MachinePassKey *ID, PassT Pass) { - AfterCallbacks.emplace_back( + PB.AfterCallbacks.emplace_back( [this, ID, Pass = std::move(Pass)](MachinePassKey *PassID) { if (PassID == ID) this->PM.addPass(std::move(Pass)); }); } - void disablePass(MachinePassKey *ID) { - BeforeCallbacks.emplace_back( - [ID](MachinePassKey *PassID) { return PassID != ID; }); - } - MachineFunctionPassManager releasePM() { return std::move(PM); } private: MachineFunctionPassManager &PM; - SmallVector, 4> - BeforeCallbacks; - SmallVector, 4> - AfterCallbacks; + const DerivedT &PB; }; LLVMTargetMachine &TM; @@ -473,6 +481,25 @@ template class CodeGenPassBuilder { const DerivedT &derived() const { return static_cast(*this); } + + bool runBeforeAdding(StringRef Name) const { + bool ShouldAdd = true; + for (auto &C : BeforeCallbacks) + ShouldAdd &= C(Name); + return ShouldAdd; + } + + void setStartStopPasses(const TargetPassConfig::StartStopInfo &Info) const; + + Error verifyStartStop(const TargetPassConfig::StartStopInfo &Info) const; + + mutable SmallVector, 4> + BeforeCallbacks; + mutable SmallVector, 4> AfterCallbacks; + + /// Helper variable for `-start-before/-start-after/-stop-before/-stop-after` + mutable bool Started = true; + mutable bool Stopped = true; }; template @@ -480,13 +507,17 @@ Error CodeGenPassBuilder::buildPipeline( ModulePassManager &MPM, MachineFunctionPassManager &MFPM, raw_pwrite_stream &Out, raw_pwrite_stream *DwoOut, CodeGenFileType FileType) const { - AddIRPass addIRPass(MPM); + auto StartStopInfo = TargetPassConfig::getStartStopInfo(*PIC); + if (!StartStopInfo) + return StartStopInfo.takeError(); + setStartStopPasses(*StartStopInfo); + AddIRPass addIRPass(MPM, derived()); // `ProfileSummaryInfo` is always valid. 
addIRPass(RequireAnalysisPass()); addIRPass(RequireAnalysisPass()); addISelPasses(addIRPass); - AddMachinePass addPass(MFPM); + AddMachinePass addPass(MFPM, derived()); if (auto Err = addCoreISelPasses(addPass)) return std::move(Err); @@ -499,6 +530,68 @@ Error CodeGenPassBuilder::buildPipeline( }); addPass(FreeMachineFunctionPass()); + return verifyStartStop(*StartStopInfo); +} + +template +void CodeGenPassBuilder::setStartStopPasses( + const TargetPassConfig::StartStopInfo &Info) const { + if (!Info.StartPass.empty()) { + Started = false; + BeforeCallbacks.emplace_back([this, &Info, AfterFlag = Info.StartAfter, + Count = 0u](StringRef ClassName) mutable { + if (Count == Info.StartInstanceNum) { + if (AfterFlag) { + AfterFlag = false; + Started = true; + } + return Started; + } + + auto PassName = PIC->getPassNameForClassName(ClassName); + if (Info.StartPass == PassName && ++Count == Info.StartInstanceNum) + Started = !Info.StartAfter; + + return Started; + }); + } + + if (!Info.StopPass.empty()) { + Stopped = false; + BeforeCallbacks.emplace_back([this, &Info, AfterFlag = Info.StopAfter, + Count = 0u](StringRef ClassName) mutable { + if (Count == Info.StopInstanceNum) { + if (AfterFlag) { + AfterFlag = false; + Stopped = true; + } + return !Stopped; + } + + auto PassName = PIC->getPassNameForClassName(ClassName); + if (Info.StopPass == PassName && ++Count == Info.StopInstanceNum) + Stopped = !Info.StopAfter; + return !Stopped; + }); + } +} + +template +Error CodeGenPassBuilder::verifyStartStop( + const TargetPassConfig::StartStopInfo &Info) const { + if (Started && Stopped) + return Error::success(); + + if (!Started) + return make_error( + "Can't find start pass \"" + + PIC->getPassNameForClassName(Info.StartPass) + "\".", + std::make_error_code(std::errc::invalid_argument)); + if (!Stopped) + return make_error( + "Can't find stop pass \"" + + PIC->getPassNameForClassName(Info.StopPass) + "\".", + std::make_error_code(std::errc::invalid_argument)); return Error::success(); } diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h index 66365419aa330..de6a760c4e4fd 100644 --- a/llvm/include/llvm/CodeGen/TargetPassConfig.h +++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -15,6 +15,7 @@ #include "llvm/Pass.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/Error.h" #include #include @@ -176,6 +177,20 @@ class TargetPassConfig : public ImmutablePass { static std::string getLimitedCodeGenPipelineReason(const char *Separator = "/"); + struct StartStopInfo { + bool StartAfter; + bool StopAfter; + unsigned StartInstanceNum; + unsigned StopInstanceNum; + StringRef StartPass; + StringRef StopPass; + }; + + /// Returns pass name in `-stop-before` or `-stop-after` + /// NOTE: New pass manager migration only + static Expected + getStartStopInfo(PassInstrumentationCallbacks &PIC); + void setDisableVerify(bool Disable) { setOpt(DisableVerify, Disable); } bool getEnableTailMerge() const { return EnableTailMerge; } diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 3bbc792f4cbf4..76ba8da547e6b 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -504,81 +504,6 @@ CGPassBuilderOption llvm::getCGPassBuilderOption() { return Opt; } -static void registerPartialPipelineCallback(PassInstrumentationCallbacks &PIC, - LLVMTargetMachine &LLVMTM) { - StringRef StartBefore; - StringRef StartAfter; - StringRef StopBefore; - StringRef StopAfter; - - unsigned 
StartBeforeInstanceNum = 0; - unsigned StartAfterInstanceNum = 0; - unsigned StopBeforeInstanceNum = 0; - unsigned StopAfterInstanceNum = 0; - - std::tie(StartBefore, StartBeforeInstanceNum) = - getPassNameAndInstanceNum(StartBeforeOpt); - std::tie(StartAfter, StartAfterInstanceNum) = - getPassNameAndInstanceNum(StartAfterOpt); - std::tie(StopBefore, StopBeforeInstanceNum) = - getPassNameAndInstanceNum(StopBeforeOpt); - std::tie(StopAfter, StopAfterInstanceNum) = - getPassNameAndInstanceNum(StopAfterOpt); - - if (StartBefore.empty() && StartAfter.empty() && StopBefore.empty() && - StopAfter.empty()) - return; - - std::tie(StartBefore, std::ignore) = - LLVMTM.getPassNameFromLegacyName(StartBefore); - std::tie(StartAfter, std::ignore) = - LLVMTM.getPassNameFromLegacyName(StartAfter); - std::tie(StopBefore, std::ignore) = - LLVMTM.getPassNameFromLegacyName(StopBefore); - std::tie(StopAfter, std::ignore) = - LLVMTM.getPassNameFromLegacyName(StopAfter); - if (!StartBefore.empty() && !StartAfter.empty()) - report_fatal_error(Twine(StartBeforeOptName) + Twine(" and ") + - Twine(StartAfterOptName) + Twine(" specified!")); - if (!StopBefore.empty() && !StopAfter.empty()) - report_fatal_error(Twine(StopBeforeOptName) + Twine(" and ") + - Twine(StopAfterOptName) + Twine(" specified!")); - - PIC.registerShouldRunOptionalPassCallback( - [=, EnableCurrent = StartBefore.empty() && StartAfter.empty(), - EnableNext = std::optional(), StartBeforeCount = 0u, - StartAfterCount = 0u, StopBeforeCount = 0u, - StopAfterCount = 0u](StringRef P, Any) mutable { - bool StartBeforePass = !StartBefore.empty() && P.contains(StartBefore); - bool StartAfterPass = !StartAfter.empty() && P.contains(StartAfter); - bool StopBeforePass = !StopBefore.empty() && P.contains(StopBefore); - bool StopAfterPass = !StopAfter.empty() && P.contains(StopAfter); - - // Implement -start-after/-stop-after - if (EnableNext) { - EnableCurrent = *EnableNext; - EnableNext.reset(); - } - - // Using PIC.registerAfterPassCallback won't work because if this - // callback returns false, AfterPassCallback is also skipped. 
- if (StartAfterPass && StartAfterCount++ == StartAfterInstanceNum) { - assert(!EnableNext && "Error: assign to EnableNext more than once"); - EnableNext = true; - } - if (StopAfterPass && StopAfterCount++ == StopAfterInstanceNum) { - assert(!EnableNext && "Error: assign to EnableNext more than once"); - EnableNext = false; - } - - if (StartBeforePass && StartBeforeCount++ == StartBeforeInstanceNum) - EnableCurrent = true; - if (StopBeforePass && StopBeforeCount++ == StopBeforeInstanceNum) - EnableCurrent = false; - return EnableCurrent; - }); -} - void llvm::registerCodeGenCallback(PassInstrumentationCallbacks &PIC, LLVMTargetMachine &LLVMTM) { @@ -605,8 +530,40 @@ void llvm::registerCodeGenCallback(PassInstrumentationCallbacks &PIC, return true; }); +} - registerPartialPipelineCallback(PIC, LLVMTM); +Expected +TargetPassConfig::getStartStopInfo(PassInstrumentationCallbacks &PIC) { + auto [StartBefore, StartBeforeInstanceNum] = + getPassNameAndInstanceNum(StartBeforeOpt); + auto [StartAfter, StartAfterInstanceNum] = + getPassNameAndInstanceNum(StartAfterOpt); + auto [StopBefore, StopBeforeInstanceNum] = + getPassNameAndInstanceNum(StopBeforeOpt); + auto [StopAfter, StopAfterInstanceNum] = + getPassNameAndInstanceNum(StopAfterOpt); + + if (!StartBefore.empty() && !StartAfter.empty()) + return make_error( + Twine(StartBeforeOptName) + " and " + StartAfterOptName + " specified!", + std::make_error_code(std::errc::invalid_argument)); + if (!StopBefore.empty() && !StopAfter.empty()) + return make_error( + Twine(StopBeforeOptName) + " and " + StopAfterOptName + " specified!", + std::make_error_code(std::errc::invalid_argument)); + + StartStopInfo Result; + Result.StartPass = StartBefore.empty() ? StartAfter : StartBefore; + Result.StopPass = StopBefore.empty() ? StopAfter : StopBefore; + Result.StartInstanceNum = + StartBefore.empty() ? StartAfterInstanceNum : StartBeforeInstanceNum; + Result.StopInstanceNum = + StopBefore.empty() ? StopAfterInstanceNum : StopBeforeInstanceNum; + Result.StartAfter = !StartAfter.empty(); + Result.StopAfter = !StopAfter.empty(); + Result.StartInstanceNum += Result.StartInstanceNum == 0; + Result.StopInstanceNum += Result.StopInstanceNum == 0; + return Result; } // Out of line constructor provides default values for pass options and diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index d309ed999bd20..8d3f69be50383 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -93,6 +93,7 @@ #include "llvm/CodeGen/ShadowStackGCLowering.h" #include "llvm/CodeGen/SjLjEHPrepare.h" #include "llvm/CodeGen/StackProtector.h" +#include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TypePromotion.h" #include "llvm/CodeGen/WasmEHPrepare.h" #include "llvm/CodeGen/WinEHPrepare.h" @@ -316,7 +317,8 @@ namespace { /// We currently only use this for --print-before/after. bool shouldPopulateClassToPassNames() { return PrintPipelinePasses || !printBeforePasses().empty() || - !printAfterPasses().empty() || !isFilterPassesEmpty(); + !printAfterPasses().empty() || !isFilterPassesEmpty() || + TargetPassConfig::hasLimitedCodeGenPipeline(); } // A pass for testing -print-on-crash. 
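As a rough sketch of how the start/stop support above is exercised (illustrative
only: new-pass-manager codegen in `llc` is still experimental, and the pass
names below are placeholders for whatever passes the target pipeline contains):

  # Stop the codegen pipeline right after instruction selection, emitting MIR.
  llc -enable-new-pm -stop-after=finalize-isel -o snippet.mir snippet.ll

  # Resume before the second instance of a repeated pass; the ",N" suffix is
  # the instance number that getPassNameAndInstanceNum parses.
  llc -enable-new-pm -start-before=dead-mi-elimination,2 -o snippet.s snippet.mir

With this change, passing both -start-before and -start-after (or both stop
flags) makes getStartStopInfo return an Error instead of reaching
report_fatal_error.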
From e611a4cf8060bf0a95b4acd9e136733425da085a Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 19 Jan 2024 16:41:43 -0800 Subject: [PATCH 198/843] Revert "[mlir][amdgpu] Shared memory access optimization pass" (#78822) Reverts llvm/llvm-project#75627 ; it broke the bot: https://lab.llvm.org/buildbot/#/builders/61/builds/53218 --- mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td | 17 -- .../mlir/Dialect/AMDGPU/Transforms/Passes.h | 3 +- .../mlir/Dialect/AMDGPU/Transforms/Passes.td | 13 - .../Dialect/AMDGPU/Transforms/Transforms.h | 54 ---- .../mlir/Dialect/AMDGPU/Transforms/Utils.h | 24 -- mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp | 15 -- .../Dialect/AMDGPU/Transforms/CMakeLists.txt | 2 - .../Transforms/OptimizeSharedMemory.cpp | 243 ------------------ mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp | 39 --- .../AMDGPU/optimize_shmem_reads_writes.mlir | 57 ---- 10 files changed, 1 insertion(+), 466 deletions(-) delete mode 100644 mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h delete mode 100644 mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h delete mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp delete mode 100644 mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp delete mode 100644 mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir diff --git a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td index b4bf1b5191232..ffb302fcedd73 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td +++ b/mlir/include/mlir/Dialect/AMDGPU/IR/AMDGPU.td @@ -29,23 +29,6 @@ def AMDGPU_Dialect : Dialect { "gpu::GPUDialect" ]; let useDefaultAttributePrinterParser = 1; - - let extraClassDeclaration = [{ - /// Return true if the given MemRefType has an integer address - /// space that matches the ROCDL shared memory address space or - /// is a gpu::AddressSpaceAttr attribute with value 'workgroup`. - static bool hasSharedMemoryAddressSpace(MemRefType type); - - /// Return true if the given Attribute has an integer address - /// space that matches the ROCDL shared memory address space or - /// is a gpu::AddressSpaceAttr attribute with value 'workgroup`. - static bool isSharedMemoryAddressSpace(Attribute type); - - /// Defines the MemRef memory space attribute numeric value that indicates - /// a memref is located in shared memory. This should correspond to the - /// value used in ROCDL. 
- static constexpr unsigned kSharedMemoryAddressSpace = 3; - }]; } //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h index 11d182ba5823e..8dd5ff1a4b198 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.h @@ -20,8 +20,7 @@ namespace mlir { class ConversionTarget; namespace amdgpu { -#define GEN_PASS_DECL - +#define GEN_PASS_DECL_AMDGPUEMULATEATOMICSPASS #define GEN_PASS_REGISTRATION #include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc" diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td index c8059e6d316e8..e6b27aa842dfc 100644 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td +++ b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Passes.td @@ -30,17 +30,4 @@ def AmdgpuEmulateAtomicsPass : Pass<"amdgpu-emulate-atomics"> { "Chipset that these operations will run on">]; } -def OptimizeSharedMemory : Pass<"amdgpu-optimize-shared-memory"> { - let summary = "Optimizes accesses to shared memory memrefs in order to reduce bank conflicts."; - let description = [{ - This pass adds a transformation and pass to the AMDGPU dialect that - attempts to optimize reads/writes from a memref representing GPU shared - memory in order to avoid bank conflicts. - }]; - - let dependentDialects = [ - "memref::MemRefDialect", "vector::VectorDialect" - ]; -} - #endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_PASSES_TD_ diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h deleted file mode 100644 index 140bc12deed69..0000000000000 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Transforms.h +++ /dev/null @@ -1,54 +0,0 @@ -//===- Transforms.h - AMDGPU Dialect transformations --------------*- -// C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file declares functions that assist transformations for the amdgpu -// dialect. -// -//===----------------------------------------------------------------------===// -#ifndef MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_ -#define MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_ - -#include "mlir/IR/Operation.h" -#include "mlir/Support/LogicalResult.h" - -namespace mlir { -class RewriterBase; - -namespace amdgpu { - -/// -/// Passes -/// - -/// Optimizes vectorized accesses to a shared memory buffer specified by -/// memrefValue. This transformation assumes the following: -/// 1) All relevant accesses to `memrefValue` are contained with `parentOp`. -/// 2) The function will fail precondition checks if any subviews are -/// taken of `memrefValue`. All reads/writes to `memrefValue` should occur -/// through `memrefValue` directly. -/// -/// Shared memory bank conflicts occur when multiple threads attempt to read or -/// write locations assigned to the same shared memory bank. For `2^N` byte -/// vectorized accesses, we need to be concerned with conflicts among threads -/// identified as `(tid) -> tid.floordiv(2^{7-N})`. 
As such, this transformation -/// changes any indexed memory access (vector.load, memref.load, etc) -/// such that the final dimension's index value is permuted such that -/// `newColIndex = oldColIndex % vectorSize + -/// perm[rowIndex](oldColIndex/vectorSize, rowIndex)` where `rowIndex` is the -/// index for the second-to last dimension and `perm[rowIndex]` is a permutation -/// function that depends on the row Index. The permutation function is chosen -/// to ensure that sequential distributed+vectorized reads/writes down a single -/// dimension of the memref have minimal conflicts. -mlir::LogicalResult optimizeSharedMemoryReadsAndWrites(Operation *parentOp, - Value memrefValue); - -} // namespace amdgpu -} // namespace mlir - -#endif // MLIR_DIALECT_AMDGPU_TRANSFORMS_TRANSFORMS_H_ diff --git a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h b/mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h deleted file mode 100644 index 6be57ca54b15f..0000000000000 --- a/mlir/include/mlir/Dialect/AMDGPU/Transforms/Utils.h +++ /dev/null @@ -1,24 +0,0 @@ -//===- Utils.h - Transform utilities -----------------------------*- C++-*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "mlir/IR/Operation.h" - -namespace mlir { -namespace amdgpu { - -/// Get and set the indices that the given load/store operation is operating on. -/// Preconditions: -/// - The Op must have memory affects -/// - Considers memref::LoadOp, vector::LoadOp, vector::TransferReadOp -/// - Considers memref::StoreOp, vector::StoreOp, vector::TransferWriteOp -/// - Excludes subview op -std::optional getIndices(Operation *op); -void setIndices(Operation *op, ArrayRef indices); - -} // namespace amdgpu -} // namespace mlir diff --git a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp index 4e72fbf56b80a..2575ad4984814 100644 --- a/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp +++ b/mlir/lib/Dialect/AMDGPU/IR/AMDGPUDialect.cpp @@ -43,21 +43,6 @@ void AMDGPUDialect::initialize() { >(); } -bool amdgpu::AMDGPUDialect::isSharedMemoryAddressSpace(Attribute memorySpace) { - if (!memorySpace) - return false; - if (auto intAttr = llvm::dyn_cast(memorySpace)) - return intAttr.getInt() == AMDGPUDialect::kSharedMemoryAddressSpace; - if (auto gpuAttr = llvm::dyn_cast(memorySpace)) - return gpuAttr.getValue() == gpu::AddressSpace::Workgroup; - return false; -} - -bool amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(MemRefType type) { - Attribute memorySpace = type.getMemorySpace(); - return isSharedMemoryAddressSpace(memorySpace); -} - //===----------------------------------------------------------------------===// // 8-bit float ops //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt index a1a91270bc55c..e11b6cc88bf22 100644 --- a/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt +++ b/mlir/lib/Dialect/AMDGPU/Transforms/CMakeLists.txt @@ -1,7 +1,5 @@ add_mlir_dialect_library(MLIRAMDGPUTransforms EmulateAtomics.cpp - OptimizeSharedMemory.cpp - Utils.cpp ADDITIONAL_HEADER_DIRS {$MLIR_MAIN_INCLUDE_DIR}/mlir/Dialect/AMDGPU/Transforms diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp 
b/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp deleted file mode 100644 index c7001fc6d57d5..0000000000000 --- a/mlir/lib/Dialect/AMDGPU/Transforms/OptimizeSharedMemory.cpp +++ /dev/null @@ -1,243 +0,0 @@ -//===- OptimizeSharedMemory.cpp - MLIR AMDGPU pass implementation ---------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file implements transforms to optimize accesses to shared memory. -// It is inspired by -// https://github.com/llvm/llvm-project/blob/main/mlir/lib/Dialect/NVGPU/Transforms/OptimizeSharedMemory.cpp -// -//===----------------------------------------------------------------------===// - -#include "mlir/Dialect/AMDGPU/Transforms/Passes.h" - -#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" -#include "mlir/Dialect/AMDGPU/Transforms/Transforms.h" -#include "mlir/Dialect/AMDGPU/Transforms/Utils.h" -#include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Dialect/Func/IR/FuncOps.h" -#include "mlir/Dialect/GPU/IR/GPUDialect.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" -#include "mlir/Interfaces/SideEffectInterfaces.h" -#include "mlir/Support/LogicalResult.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Support/MathExtras.h" - -namespace mlir { -namespace amdgpu { -#define GEN_PASS_DEF_OPTIMIZESHAREDMEMORY -#include "mlir/Dialect/AMDGPU/Transforms/Passes.h.inc" -} // namespace amdgpu -} // namespace mlir - -using namespace mlir; -using namespace mlir::amdgpu; - -/// The size of a shared memory line according to AMD documentation. -/// https://www.amd.com/content/dam/amd/en/documents/instinct-tech-docs/instruction-set-architectures/instinct-mi200-cdna2-instruction-set-architecture.pdf -constexpr int64_t kSharedMemoryLineSizeBytes = 64; -/// We optimize for 64bit accesses, but this can be made an argument in the -/// future. -constexpr int64_t kDefaultVectorSizeBits = 64; - -/// Uses `srcIndexValue` to permute `tgtIndexValue` via -/// `result = xor(floordiv(srcIdxVal,permuteEveryN), -/// floordiv(tgtIdxVal,vectorSize))) -/// + tgtIdxVal % vectorSize` -/// This is done using an optimized sequence of `arith` operations. -static Value permuteVectorOffset(OpBuilder &b, Location loc, - ArrayRef indices, MemRefType memrefTy, - int64_t srcDim, int64_t tgtDim) { - // Adjust the src index to change how often the permutation changes - // if necessary. - Value src = indices[srcDim]; - - // We only want to permute every N iterations of the target dim where N is - // ceil(sharedMemoryLineSizeBytes / dimSizeBytes(tgtDim)). - const int64_t permuteEveryN = std::max( - 1, kSharedMemoryLineSizeBytes / ((memrefTy.getDimSize(tgtDim) * - memrefTy.getElementTypeBitWidth()) / - 8)); - - // clang-format off - // Index bit representation (b0 = least significant bit) for dim(1) - // of a `memref` is as follows: - // N := log2(128/elementSizeBits) - // M := log2(dimSize(1)) - // then - // bits[0:N] = sub-vector element offset - // bits[N:M] = vector index - // clang-format on - int64_t n = - llvm::Log2_64(kDefaultVectorSizeBits / memrefTy.getElementTypeBitWidth()); - int64_t m = llvm::Log2_64(memrefTy.getDimSize(tgtDim)); - - // Capture bits[0:(M-N)] of src by first creating a (M-N) mask. 
- int64_t mask = (1LL << (m - n)) - 1; - if (permuteEveryN > 1) - mask = mask << llvm::Log2_64(permuteEveryN); - Value srcBits = b.create(loc, mask); - srcBits = b.create(loc, src, srcBits); - - // Use the src bits to permute the target bits b[N:M] containing the - // vector offset. - if (permuteEveryN > 1) { - int64_t shlBits = n - llvm::Log2_64(permuteEveryN); - if (shlBits > 0) { - Value finalShiftVal = b.create(loc, shlBits); - srcBits = b.createOrFold(loc, srcBits, finalShiftVal); - } else if (shlBits < 0) { - Value finalShiftVal = b.create(loc, -1 * shlBits); - srcBits = b.createOrFold(loc, srcBits, finalShiftVal); - } - } else { - Value finalShiftVal = b.create(loc, n); - srcBits = b.createOrFold(loc, srcBits, finalShiftVal); - } - - Value permutedVectorIdx = - b.create(loc, indices[tgtDim], srcBits); - return permutedVectorIdx; -} - -static void transformIndices(OpBuilder &builder, Location loc, - SmallVector &indices, - MemRefType memrefTy, int64_t srcDim, - int64_t tgtDim) { - indices[tgtDim] = - permuteVectorOffset(builder, loc, indices, memrefTy, srcDim, tgtDim); -} - -/// Return all operations within `parentOp` that read from or write to -/// `shmMemRef`. -static LogicalResult -getShmReadAndWriteOps(Operation *parentOp, Value shmMemRef, - SmallVector &readOps, - SmallVector &writeOps) { - parentOp->walk([&](Operation *op) { - MemoryEffectOpInterface iface = dyn_cast(op); - if (!iface) - return; - std::optional effect = - iface.getEffectOnValue(shmMemRef); - if (effect) { - readOps.push_back(op); - return; - } - effect = iface.getEffectOnValue(shmMemRef); - if (effect) - writeOps.push_back(op); - }); - - // Restrict to a supported set of ops. We also require at least 2D access, - // although this could be relaxed. - if (llvm::any_of(readOps, [](Operation *op) { - return !isa( - op) || - amdgpu::getIndices(op)->size() < 2; - })) - return failure(); - if (llvm::any_of(writeOps, [](Operation *op) { - return !isa( - op) || - amdgpu::getIndices(op)->size() < 2; - })) - return failure(); - - return success(); -} - -mlir::LogicalResult -mlir::amdgpu::optimizeSharedMemoryReadsAndWrites(Operation *parentOp, - Value memrefValue) { - auto memRefType = dyn_cast(memrefValue.getType()); - if (!memRefType || - !amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace(memRefType)) - return failure(); - - // Abort if the given value has any sub-views; we do not do any alias - // analysis. - bool hasSubView = false; - parentOp->walk([&](memref::SubViewOp subView) { hasSubView = true; }); - if (hasSubView) - return failure(); - - // Check if this is necessary given the assumption of 128b accesses: - // If dim[rank-1] is small enough to fit 8 rows in a 128B line. - const int64_t rowSize = memRefType.getDimSize(memRefType.getRank() - 1); - const int64_t rowsPerLine = - (8 * kSharedMemoryLineSizeBytes / memRefType.getElementTypeBitWidth()) / - rowSize; - const int64_t threadGroupSize = - 1LL << (7 - llvm::Log2_64(kDefaultVectorSizeBits / 8)); - if (rowsPerLine >= threadGroupSize) - return failure(); - - // Get sets of operations within the function that read/write to shared - // memory. 
- SmallVector shmReadOps; - SmallVector shmWriteOps; - if (failed(getShmReadAndWriteOps(parentOp, memrefValue, shmReadOps, - shmWriteOps))) - return failure(); - - if (shmReadOps.empty() || shmWriteOps.empty()) - return failure(); - - OpBuilder builder(parentOp->getContext()); - - int64_t tgtDim = memRefType.getRank() - 1; - int64_t srcDim = memRefType.getRank() - 2; - - // Transform indices for the ops writing to shared memory. - while (!shmWriteOps.empty()) { - Operation *shmWriteOp = shmWriteOps.pop_back_val(); - builder.setInsertionPoint(shmWriteOp); - - auto indices = amdgpu::getIndices(shmWriteOp); - SmallVector transformedIndices(indices->begin(), indices->end()); - transformIndices(builder, shmWriteOp->getLoc(), transformedIndices, - memRefType, srcDim, tgtDim); - amdgpu::setIndices(shmWriteOp, transformedIndices); - } - - // Transform indices for the ops reading from shared memory. - while (!shmReadOps.empty()) { - Operation *shmReadOp = shmReadOps.pop_back_val(); - builder.setInsertionPoint(shmReadOp); - - auto indices = amdgpu::getIndices(shmReadOp); - SmallVector transformedIndices(indices->begin(), indices->end()); - transformIndices(builder, shmReadOp->getLoc(), transformedIndices, - memRefType, srcDim, tgtDim); - amdgpu::setIndices(shmReadOp, transformedIndices); - } - - return success(); -} - -struct OptimizeSharedMemoryPass - : public amdgpu::impl::OptimizeSharedMemoryBase { -public: - OptimizeSharedMemoryPass() = default; - - void runOnOperation() override { - Operation *op = getOperation(); - SmallVector shmAllocOps; - op->walk([&](memref::AllocOp allocOp) { - if (!amdgpu::AMDGPUDialect::hasSharedMemoryAddressSpace( - allocOp.getType())) - return; - shmAllocOps.push_back(allocOp); - }); - for (auto allocOp : shmAllocOps) { - if (failed(optimizeSharedMemoryReadsAndWrites(getOperation(), - allocOp.getMemref()))) - return; - } - } -}; diff --git a/mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp b/mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp deleted file mode 100644 index 8163eeafdf1f0..0000000000000 --- a/mlir/lib/Dialect/AMDGPU/Transforms/Utils.cpp +++ /dev/null @@ -1,39 +0,0 @@ -#include "mlir/Dialect/AMDGPU/Transforms/Utils.h" - -#include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h" -#include "mlir/Dialect/MemRef/IR/MemRef.h" -#include "mlir/Dialect/Vector/IR/VectorOps.h" - -using namespace mlir; -using namespace mlir::amdgpu; - -std::optional amdgpu::getIndices(Operation *op) { - if (auto loadOp = dyn_cast(op)) - return loadOp.getIndices(); - if (auto storeOp = dyn_cast(op)) - return storeOp.getIndices(); - if (auto vectorReadOp = dyn_cast(op)) - return vectorReadOp.getIndices(); - if (auto vectorStoreOp = dyn_cast(op)) - return vectorStoreOp.getIndices(); - if (auto transferReadOp = dyn_cast(op)) - return transferReadOp.getIndices(); - if (auto transferWriteOp = dyn_cast(op)) - return transferWriteOp.getIndices(); - return std::nullopt; -} - -void amdgpu::setIndices(Operation *op, ArrayRef indices) { - if (auto loadOp = dyn_cast(op)) - return loadOp.getIndicesMutable().assign(indices); - if (auto storeOp = dyn_cast(op)) - return storeOp.getIndicesMutable().assign(indices); - if (auto vectorReadOp = dyn_cast(op)) - return vectorReadOp.getIndicesMutable().assign(indices); - if (auto vectorStoreOp = dyn_cast(op)) - return vectorStoreOp.getIndicesMutable().assign(indices); - if (auto transferReadOp = dyn_cast(op)) - return transferReadOp.getIndicesMutable().assign(indices); - if (auto transferWriteOp = dyn_cast(op)) - return transferWriteOp.getIndicesMutable().assign(indices); -} 
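For reference, the index permutation computed by the reverted pass reduces, for
the memrefs in the deleted test below (f16 elements, rows of 32 elements, and
64-bit vector accesses, so permuteEveryN == 1, n == 2, and the mask is 7), to
the and/shift/xor sequence the test's CHECK lines look for. A minimal
standalone C++ sketch of that arithmetic, under exactly those assumptions:

  // newCol = col ^ ((row & 7) << 2): the low row bits permute the
  // vector-index bits of the column, spreading accesses across banks.
  unsigned permuteColumn(unsigned row, unsigned col) {
    unsigned srcBits = row & 7;       // arith.andi %row, %c7
    unsigned xorBits = srcBits << 2;  // arith.shli %srcBits, %c2
    return col ^ xorBits;             // arith.xori %col, %xorBits
  }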
diff --git a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir b/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir deleted file mode 100644 index 41111dddda520..0000000000000 --- a/mlir/test/Dialect/AMDGPU/optimize_shmem_reads_writes.mlir +++ /dev/null @@ -1,57 +0,0 @@ -// RUN: mlir-opt %s --pass-pipeline='builtin.module(func.func(amdgpu-optimize-shared-memory))' | FileCheck %s - - // CHECK: @optimize_shmem([[arg0:%.+]]: memref<{{.*}}>, [[readRow:%.+]]: index, [[readCol:%.+]]: index, [[writeRow:%.+]]: index, [[writeCol:%.+]]: index, [[fragRow:%.+]]: index, [[fragCol:%.+]]: index, [[fragColPerm:%.+]]: index, [[stRow:%.+]]: index, [[stCol:%.+]]: index) - func.func @optimize_shmem(%arg0: memref<4096x4096xf16>, - %readRow: index, %readCol: index, - %writeRow: index, %writeCol: index, - %fragRow: index, %fragCol: index, - %fragColPerm: index, - %stRow: index, %stCol: index) { - // CHECK: %[[cst:.+]] = arith.constant 0.000000e+00 : f16 - %cst = arith.constant 0.000000e+00 : f16 - - // CHECK: [[shmA:%.+]] = memref.alloc - // CHECK: [[shmB:%.+]] = memref.alloc - %shmA = memref.alloc() {alignment = 64 : i64} : memref<128x32xf16, 3> - %shmB = memref.alloc() {alignment = 64 : i64} : memref<256x32xf16, 3> - - // CHECK: %[[D0:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - %0 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] - // CHECK: vector.transfer_write %[[D0:.+]], [[shmB]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3> - vector.transfer_write %0, %shmB[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<256x32xf16, 3> - gpu.barrier - gpu.barrier - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] - // CHECK: vector.load [[shmB:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<256x32xf16, 3>, vector<8xf16> - %1 = vector.load %shmB[%fragRow, %fragColPerm] : memref<256x32xf16, 3>, vector<8xf16> - - // CHECK: %[[D2:.+]] = vector.transfer_read [[arg0:%.+]][[[readRow:%.+]], [[readCol:%.+]]], [[cst:.+]] {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - %2 = vector.transfer_read %arg0[%readRow, %readCol], %cst {in_bounds = [true, true]} : memref<4096x4096xf16>, vector<1x8xf16> - // CHECK: [[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[stRow:%.+]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[stColPerm:%.+]] = arith.xori [[stCol:%.+]], [[xorBits]] - // CHECK: vector.transfer_write %[[D2:.+]], [[shmA:%.+]][[[writeRow:%.+]], [[writeCol:%.+]]] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3> - vector.transfer_write %2, %shmA[%writeRow, %writeCol] {in_bounds = [true, true]} : vector<1x8xf16>, memref<128x32xf16, 3> - gpu.barrier - gpu.barrier - // CHECK: 
[[c7:%.+]] = arith.constant 7 : index - // CHECK: [[srcBits:%.+]] = arith.andi [[fragRow]], [[c7]] - // CHECK: [[c2:%.+]] = arith.constant 2 : index - // CHECK: [[xorBits:%.+]] = arith.shli [[srcBits]], [[c2]] - // CHECK: [[fragColPerm:%.+]] = arith.xori [[fragCol:%.+]], [[xorBits]] - // CHECK: vector.load [[shmA:%.+]][[[fragRow:%.+]], [[fragColPerm]]] : memref<128x32xf16, 3>, vector<8xf16> - %3 = vector.load %shmA[%fragRow, %fragColPerm] : memref<128x32xf16, 3>, vector<8xf16> - return - } - \ No newline at end of file From bd3838ff6b4310fb8ff68649ef87e5e962bab1fd Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Fri, 19 Jan 2024 17:03:19 -0800 Subject: [PATCH 199/843] Skip TestThreadLocal.py on darwin temporarily for linker issue The new static linker in Xcode 15 does not emit the necessary symbols for file static thread local storage, causing this test to fail when used. The old static linker is still available as ld-classic in Xcode 15, but it has to be invoked specially, and the new static linker will be fixed at some point. I may try to add linker name and versioning information in lldb/packages/Python/lldbsuite/test/decorators.py like we do with the compiler / compiler_version, so it can be xfailed for known problematic static linker name / versions, but until I get that sorted I'm skipping this test to unblock the CI bots. --- lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py b/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py index 636e4819ea298..9b128ba6097ac 100644 --- a/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py +++ b/lldb/test/API/lang/cpp/thread_local/TestThreadLocal.py @@ -9,6 +9,7 @@ class PlatformProcessCrashInfoTestCase(TestBase): @expectedFailureAll(oslist=["windows", "linux", "freebsd", "netbsd"]) + @skipIfDarwin # rdar://120795095 def test_thread_local(self): # Set a breakpoint on the first instruction of the main function, # before the TLS initialization has run. From a387bce4bcbaeb28bf4510817ce54602e2f7a21d Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Fri, 19 Jan 2024 17:23:51 -0800 Subject: [PATCH 200/843] [MLGO] Upstream the corpus extraction tooling (#72319) This patch upstreams some of the MLGO utilities, particularly the corpus extraction tooling, into LLVM proper. The motivation for this patch is available in the RFC. 
https://discourse.llvm.org/t/rfc-upstreaming-elements-of-the-mlgo-tooling/74939 --- llvm/CMakeLists.txt | 1 + llvm/utils/mlgo-utils/CMakeLists.txt | 11 + llvm/utils/mlgo-utils/README.md | 12 + llvm/utils/mlgo-utils/mlgo/__init__.py | 6 + .../mlgo/corpus/combine_training_corpus.py | 48 +++ .../corpus/combine_training_corpus_lib.py | 38 ++ .../mlgo-utils/mlgo/corpus/extract_ir.py | 165 ++++++++ .../mlgo-utils/mlgo/corpus/extract_ir_lib.py | 395 ++++++++++++++++++ .../mlgo-utils/mlgo/corpus/make_corpus.py | 54 +++ .../mlgo-utils/mlgo/corpus/make_corpus_lib.py | 77 ++++ llvm/utils/mlgo-utils/pyproject.toml | 25 ++ .../corpus/combine_training_corpus_test.py | 132 ++++++ .../tests/corpus/extract_ir_test.py | 339 +++++++++++++++ .../tests/corpus/make_corpus_test.py | 71 ++++ llvm/utils/mlgo-utils/tests/lit.cfg | 15 + llvm/utils/mlgo-utils/tests/lit.local.cfg | 14 + llvm/utils/mlgo-utils/tests/lit.site.cfg.in | 10 + 17 files changed, 1413 insertions(+) create mode 100644 llvm/utils/mlgo-utils/CMakeLists.txt create mode 100644 llvm/utils/mlgo-utils/README.md create mode 100644 llvm/utils/mlgo-utils/mlgo/__init__.py create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py create mode 100644 llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py create mode 100644 llvm/utils/mlgo-utils/pyproject.toml create mode 100644 llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py create mode 100644 llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py create mode 100644 llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py create mode 100644 llvm/utils/mlgo-utils/tests/lit.cfg create mode 100644 llvm/utils/mlgo-utils/tests/lit.local.cfg create mode 100644 llvm/utils/mlgo-utils/tests/lit.site.cfg.in diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 61ab69d237470..1d230004e6c34 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -1197,6 +1197,7 @@ if( LLVM_INCLUDE_UTILS ) add_subdirectory(utils/UnicodeData) add_subdirectory(utils/yaml-bench) add_subdirectory(utils/split-file) + add_subdirectory(utils/mlgo-utils) if( LLVM_INCLUDE_TESTS ) add_subdirectory(${LLVM_THIRD_PARTY_DIR}/unittest ${CMAKE_CURRENT_BINARY_DIR}/third-party/unittest) endif() diff --git a/llvm/utils/mlgo-utils/CMakeLists.txt b/llvm/utils/mlgo-utils/CMakeLists.txt new file mode 100644 index 0000000000000..7b303c7639401 --- /dev/null +++ b/llvm/utils/mlgo-utils/CMakeLists.txt @@ -0,0 +1,11 @@ +configure_lit_site_cfg( + "${CMAKE_CURRENT_SOURCE_DIR}/tests/lit.site.cfg.in" + "${CMAKE_CURRENT_BINARY_DIR}/lit.site.cfg" +) + +add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests" + ${CMAKE_CURRENT_BINARY_DIR} + DEPENDS "FileCheck" "not" "count" +) + +set_target_properties(check-mlgo-utils PROPERTIES FOLDER "Tests") diff --git a/llvm/utils/mlgo-utils/README.md b/llvm/utils/mlgo-utils/README.md new file mode 100644 index 0000000000000..12e9375f23eda --- /dev/null +++ b/llvm/utils/mlgo-utils/README.md @@ -0,0 +1,12 @@ +# MLGO Python Utilities + +This folder contains MLGO Python utilities, particularly infrastructure +to help enable ML applications within LLVM, especially tooling to extract +corpora that can be used in downstream projects to train ML models and perform +other tasks that benefit from having a 
large amount of data.
+
+### Python Versioning
+
+Due to type annotations, the MLGO tooling currently requires Python 3.8 or
+newer, deviating from the current LLVM project-wide minimum supported
+version of Python 3.6.
diff --git a/llvm/utils/mlgo-utils/mlgo/__init__.py b/llvm/utils/mlgo-utils/mlgo/__init__.py
new file mode 100644
index 0000000000000..bcb5de2ff4d57
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/__init__.py
@@ -0,0 +1,6 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+__versioninfo__ = (18, 0, 0)
+__version__ = ".".join(str(v) for v in __versioninfo__) + "dev"
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
new file mode 100644
index 0000000000000..9aabd87b4688e
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py
@@ -0,0 +1,48 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+r"""Combine multiple training corpora into a single training corpus.
+
+Currently this only supports the case where the corpora share the same
+configurables except for the "modules" field.
+
+Usage: suppose we want to combine the training corpora corpus1 and corpus2
+into combinedcorpus; we first structure the files as follows:
+
+combinedcorpus
+combinedcorpus/corpus1
+combinedcorpus/corpus2
+
+Running this script with
+
+python3 \
+mlgo/corpus/combine_training_corpus.py \
+  --root_dir=$PATH_TO_combinedcorpus
+
+generates the combinedcorpus/corpus_description.json file. In this way corpus1
+and corpus2 are combined into combinedcorpus.
+"""
+
+from absl import app
+from absl import flags
+
+from mlgo.corpus import combine_training_corpus_lib
+
+flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.")
+
+FLAGS = flags.FLAGS
+
+
+def main(argv):
+    if len(argv) > 1:
+        raise app.UsageError("Too many command-line arguments.")
+
+    combine_training_corpus_lib.combine_corpus(FLAGS.root_dir)
+
+
+def entrypoint():
+    app.run(main)
+
+
+if __name__ == "__main__":
+    entrypoint()
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py
new file mode 100644
index 0000000000000..e2ae8699ec318
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py
@@ -0,0 +1,38 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Library for combining training corpora."""
+
+import os
+import json
+import glob
+
+from absl import logging
+
+_FILE_NAME = "corpus_description.json"
+
+
+def combine_corpus(root_dir: str) -> None:
+    module_names = []
+    output_corpus_description = {}
+
+    corpus_description_glob = os.path.join(root_dir, "*/" + _FILE_NAME)
+    for corpus_description_path in glob.glob(corpus_description_glob):
+        logging.info("processing %s", corpus_description_path)
+
+        with open(corpus_description_path, encoding="utf-8") as f:
+            corpus_description = json.load(f)
+            sub_dir = os.path.basename(os.path.dirname(corpus_description_path))
+            module_names.extend(
+                [os.path.join(sub_dir, name) for name in corpus_description["modules"]]
+            )
+            del corpus_description["modules"]
+            if len(output_corpus_description) == 0:
+                output_corpus_description = corpus_description
+            elif corpus_description != output_corpus_description:
+                raise ValueError("Input corpora differ by more than modules.")
+
+    output_corpus_description["modules"] = module_names
+
+    with open(os.path.join(root_dir, _FILE_NAME), "w", encoding="utf-8") as f:
+        json.dump(output_corpus_description, f, indent=2)
diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
new file mode 100644
index 0000000000000..9463e61dc534f
--- /dev/null
+++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py
@@ -0,0 +1,165 @@
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+"""Extract IR for training.
+
+Extract IR for training, either from a compile_commands.json file produced by
+cmake, or a linker parameter list file.
+
+Only run with
+'python mlgo/corpus/extract_ir.py ...'
+
+The compilation is assumed to have been performed with clang, using
+-fembed-bitcode=all passed to cc1 (i.e. pass clang -Xclang=-fembed-bitcode=all).
+
+In a distributed ThinLTO case, the compilation is assumed to have been performed
+specifying -mllvm -lto-embed-bitcode=post-merge-pre-opt.
+
+In a local ThinLTO case, the compilation is assumed to have been performed
+specifying -Wl,--save-temps=import -Wl,--thinlto-emit-index-files.
+
+To change the logging verbosity, pass an integer representing the desired
+verbosity to the --verbosity flag. Use 0 for all logs, status information,
+and detailed debug information, -1 for solely warnings, and -2 to not produce
+any output.
+"""
+
+import json
+import multiprocessing
+
+from absl import app
+from absl import flags
+from absl import logging
+
+from mlgo.corpus import extract_ir_lib
+
+flags.DEFINE_string(
+    "input",
+    None,
+    "Input file or directory - either compile_commands.json, a linker parameter "
+    "list, or a path to a directory containing object files.",
+)
+flags.DEFINE_enum(
+    "input_type",
+    "json",
+    ["json", "params", "directory"],
+    "Input file type - json, params, or directory. 'params' refers to an lld "
+    "params file.",
+)
+flags.DEFINE_string("output_dir", None, "Output directory")
+flags.DEFINE_integer(
+    "num_workers",
+    None,
+    "Number of parallel workers for objcopy. `None` for maximum available.",
+)
+flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy")
+flags.DEFINE_string(
+    "obj_base_dir",
+    "",
+    "Base directory for object files. 
Defaults to current working dir.", +) +flags.DEFINE_string( + "cmd_filter", + None, + "Include only those modules with a command line matching this regexp. " + "Setting it to None for not filtering. Note that the regexp is applied " + "independently for each separate command line option. For example, ^-Oz$ " + "will match Oz - built binaries. Does not work with thinlto_build=lld.", +) +flags.DEFINE_enum( + "thinlto_build", + None, + ["distributed", "local"], + "Set if the build was performed with either 'distributed' or " + "'local' ThinLTO. This ensures the thinlto.bc files are also copied. " + "The build is assumed to have had " + "-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed " + "case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files " + "passed in the local case.", +) +flags.DEFINE_string( + "cmd_section_name", + ".llvmcmd", + "The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmcmd is correct. For Mach-O object files, one should use " + "something like __LLVM,__cmdline", +) +flags.DEFINE_string( + "bitcode_section_name", + ".llvmbc", + "The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmbc is correct. For Mach-O object files, one should use " + "__LLVM,__bitcode", +) + +flags.mark_flag_as_required("output_dir") + +FLAGS = flags.FLAGS + + +def main(argv): + if len(argv) > 1: + raise app.UsageError("Too many command-line arguments.") + + objs = [] + if FLAGS.input is not None and FLAGS.thinlto_build == "local": + raise ValueError("--thinlto_build=local cannot be run with --input") + if FLAGS.input is None: + if FLAGS.thinlto_build != "local": + raise ValueError("--input or --thinlto_build=local must be provided") + objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir) + elif FLAGS.input_type == "json": + with open(FLAGS.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_from_compile_commands( + json.load(f), FLAGS.output_dir + ) + elif FLAGS.input_type == "params": + if not FLAGS.obj_base_dir: + logging.info( + "-obj_base_dir is unspecified, assuming current directory." + "If no objects are found, use this option to specify the root" + "directory for the object file paths in the input file." + ) + with open(FLAGS.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_from_lld_params( + [l.strip() for l in f.readlines()], FLAGS.obj_base_dir, FLAGS.output_dir + ) + elif FLAGS.input_type == "directory": + logging.warning( + "Using the directory input is only recommended if the build system" + "your project uses does not support any structured output that" + "ml-compiler-opt understands. 
If your build system provides a" + "structured compilation database, use that instead" + ) + objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir) + else: + logging.error("Unknown input type: %s", FLAGS.input_type) + + relative_output_paths = extract_ir_lib.run_extraction( + objs, + FLAGS.num_workers, + FLAGS.llvm_objcopy_path, + FLAGS.cmd_filter, + FLAGS.thinlto_build, + FLAGS.cmd_section_name, + FLAGS.bitcode_section_name, + ) + + extract_ir_lib.write_corpus_manifest( + FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir + ) + + logging.info( + "Converted %d files out of %d", + len(objs) - relative_output_paths.count(None), + len(objs), + ) + + +def entrypoint(): + multiprocessing.set_start_method("fork") + app.run(main) + + +if __name__ == "__main__": + entrypoint() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py new file mode 100644 index 0000000000000..9c828ce1eb631 --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py @@ -0,0 +1,395 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Library functions for IR extraction.""" + +import os +import pathlib +import re +import shutil +import subprocess +import multiprocessing +import functools +import json + +from typing import Dict, List, Optional + +from absl import logging + +_UNSPECIFIED_OVERRIDE = [""] + + +# TODO(ml-compiler-opt): maybe we can also convert here the cmdline file,from a +# \0 - separated list of strings, to a \n one. +def should_include_module(cmdline: str, match_regexp: Optional[str]) -> bool: + """Determine if the module should be included.""" + if match_regexp is None: + return True + lines = cmdline.split("\0") + return any(len(re.findall(match_regexp, l)) for l in lines) + + +def get_thinlto_index(cmdline: str, basedir: str) -> Optional[str]: + opts = cmdline.split("\0") + for option in opts: + if option.startswith("-fthinlto-index"): + return os.path.join(basedir, option.split("=")[1]) + return None + + +class TrainingIRExtractor: + """IR and command line extraction from an object file.""" + + def __init__(self, obj_relative_path, output_base_dir, obj_base_dir=None): + """Set up a TrainingIRExtractor. + + Args: + obj_relative_path: relative path to the input object file. It will be also + used to construct the absolute path of the output IR and cmd files, by + appending it to output_base_dir. + output_base_dir: the directory under which the output will be produced. + obj_base_dir: the base directory for all the input object files. + """ + self._obj_relative_path = obj_relative_path + self._output_base_dir = output_base_dir + self._obj_base_dir = obj_base_dir if obj_base_dir is not None else "" + + def obj_base_dir(self): + return self._obj_base_dir + + def output_base_dir(self): + return self._output_base_dir + + def relative_output_path(self): + return self._obj_relative_path + + def input_obj(self): + return os.path.join(self.obj_base_dir(), self._obj_relative_path) + + def lld_src_bc(self): + # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport') + # IR bitcode saved by lld. It is hardcoded into lld. 
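        # (Editorial sketch, not part of the original patch; paths below are
        # illustrative.) Assuming obj_base_dir="/build" and
        # obj_relative_path="lib/a.o", the lookups here and in
        # lld_src_thinlto() below resolve roughly as:
        #
        #   ex = TrainingIRExtractor("lib/a.o", "/corpus", "/build")
        #   ex.lld_src_bc()       # "/build/lib/a.o.3.import.bc"
        #   ex.lld_src_thinlto()  # "/build/lib/a.o.thinlto.bc"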
+        return os.path.join(
+            self._obj_base_dir, self._obj_relative_path + ".3.import.bc"
+        )
+
+    def lld_src_thinlto(self):
+        return os.path.join(self._obj_base_dir, self._obj_relative_path + ".thinlto.bc")
+
+    def dest_dir(self):
+        return os.path.join(
+            self.output_base_dir(), os.path.dirname(self._obj_relative_path)
+        )
+
+    def module_name(self):
+        return os.path.basename(self._obj_relative_path)
+
+    def cmd_file(self):
+        return os.path.join(self.dest_dir(), self.module_name() + ".cmd")
+
+    def bc_file(self):
+        return os.path.join(self.dest_dir(), self.module_name() + ".bc")
+
+    def thinlto_index_file(self):
+        return os.path.join(self.dest_dir(), self.module_name() + ".thinlto.bc")
+
+    def _get_extraction_cmd_command(
+        self, llvm_objcopy_path: str, cmd_section_name: str
+    ):
+        """Get llvm-objcopy and process args to produce a command string that,
+        when invoked, will extract the cmd section into the self.cmd_file() file.
+        """
+        return [
+            llvm_objcopy_path,
+            "--dump-section=" + cmd_section_name + "=" + self.cmd_file(),
+            self.input_obj(),
+            "/dev/null",
+        ]
+
+    def _get_extraction_bc_command(
+        self, llvm_objcopy_path: str, bitcode_section_name: str
+    ):
+        """Get llvm-objcopy and process args to produce a command string that,
+        when invoked, will extract the bitcode section into the self.bc_file()
+        file.
+        """
+        return [
+            llvm_objcopy_path,
+            "--dump-section=" + bitcode_section_name + "=" + self.bc_file(),
+            self.input_obj(),
+            "/dev/null",
+        ]
+
+    def _extract_clang_artifacts(
+        self,
+        llvm_objcopy_path: str,
+        cmd_filter: str,
+        is_thinlto: bool,
+        cmd_section_name: str,
+        bitcode_section_name: str,
+    ) -> Optional[str]:
+        """Run llvm-objcopy to extract the .bc and command line."""
+        if not os.path.exists(self.input_obj()):
+            logging.info("%s does not exist.", self.input_obj())
+            return None
+        os.makedirs(self.dest_dir(), exist_ok=True)
+        try:
+            subprocess.check_output(
+                self._get_extraction_cmd_command(llvm_objcopy_path, cmd_section_name),
+                stderr=subprocess.STDOUT,
+                encoding="utf-8",
+            )
+            if cmd_filter is not None or is_thinlto:
+                with open(self.cmd_file(), encoding="utf-8") as f:
+                    lines = f.readlines()
+                assert len(lines) == 1
+                cmdline = lines[0]
+                if not should_include_module(cmdline, cmd_filter):
+                    logging.info(
+                        "Excluding module %s because it does not match the filter",
+                        self.input_obj(),
+                    )
+                    os.remove(self.cmd_file())
+                    return None
+                if is_thinlto:
+                    index_file = get_thinlto_index(cmdline, self.obj_base_dir())
+                    shutil.copy(index_file, self.thinlto_index_file())
+
+            subprocess.check_output(
+                self._get_extraction_bc_command(
+                    llvm_objcopy_path, bitcode_section_name
+                ),
+                stderr=subprocess.STDOUT,
+                encoding="utf-8",
+            )
+        except subprocess.CalledProcessError as e:
+            # This may happen if the .o file was built from asm (.S source).
+ logging.warning("%s was not processed: %s", self.input_obj(), e) + logging.info(e.output) + return None + assert ( + os.path.exists(self.cmd_file()) + and os.path.exists(self.bc_file()) + and (not is_thinlto or os.path.exists(self.thinlto_index_file())) + ) + return self.relative_output_path() + + def _extract_lld_artifacts(self) -> Optional[str]: + """Extract the .bc file with ThinLTO index from an lld ThinLTO invocation.""" + if not os.path.exists(self.lld_src_bc()): + logging.info("%s does not exist.", self.lld_src_bc()) + return None + if not os.path.exists(self.lld_src_thinlto()): + logging.info("%s does not exist.", self.lld_src_thinlto()) + return None + os.makedirs(self.dest_dir(), exist_ok=True) + + # Copy over the files + shutil.copy(self.lld_src_bc(), self.bc_file()) + shutil.copy(self.lld_src_thinlto(), self.thinlto_index_file()) + + assert os.path.exists(self.bc_file()) + assert os.path.exists(self.thinlto_index_file()) + return self._obj_relative_path + + def extract( + self, + llvm_objcopy_path: Optional[str] = None, + cmd_filter: Optional[str] = None, + thinlto_build: Optional[str] = None, + cmd_section_name: Optional[str] = ".llvmcmd", + bitcode_section_name: Optional[str] = ".llvmbc", + ) -> Optional[str]: + if thinlto_build == "local": + return self._extract_lld_artifacts() + return self._extract_clang_artifacts( + llvm_objcopy_path=llvm_objcopy_path, + cmd_filter=cmd_filter, + is_thinlto=thinlto_build == "distributed", + cmd_section_name=cmd_section_name, + bitcode_section_name=bitcode_section_name, + ) + + +def convert_compile_command_to_objectfile( + command: Dict[str, str], output_dir: str +) -> Optional[TrainingIRExtractor]: + obj_base_dir = command["directory"] + if "arguments" in command: + cmd_parts = command["arguments"] + elif "command" in command: + cmd_parts = command["command"].split() + else: + logging.info("compile_commands element has no command and arguments") + return None + + try: + obj_index = cmd_parts.index("-o") + 1 + except ValueError: + # This could happen if there are non-clang commands in compile_commands.json + logging.info("Command has no -o option: %s", " ".join(cmd_parts)) + return None + obj_rel_path = cmd_parts[obj_index] + # TODO(mtrofin): is the obj_base_dir correct for thinlto index bc files? + return TrainingIRExtractor( + obj_relative_path=obj_rel_path, + output_base_dir=output_dir, + obj_base_dir=obj_base_dir, + ) + + +def load_from_compile_commands( + json_array: List[Dict[str, str]], output_dir: str +) -> List[TrainingIRExtractor]: + objs = [ + convert_compile_command_to_objectfile(cmd, output_dir) for cmd in json_array + ] + # Filter out None, in case there were non-clang commands in the .json + return [obj for obj in objs if obj is not None] + + +def load_from_lld_params( + params_array: List[str], obj_base_dir: str, output_dir: str +) -> List[TrainingIRExtractor]: + """Create an ObjectFile array based on lld's parameters.""" + # yank out -o and the output. After that, anything not starting with '-', and + # ending in a '.o', is an object file. 
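    # (Editorial sketch, not part of the original patch; values below are
    # illustrative.) For example, the params list:
    #
    #   ["-o", "out/exe", "lib/a.o", "libm.a", "-lpthread", "lib/b.o"]
    #
    # yields extractors only for lib/a.o and lib/b.o: "-o" and its value are
    # removed, and entries that start with "-" or lack a ".o" suffix are
    # skipped.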
+ try: + minus_o_idx = params_array.index("-o") + del params_array[minus_o_idx : minus_o_idx + 2] + just_obj_paths = [ + o for o in params_array if not o.startswith("-") and o.endswith(".o") + ] + except ValueError: + logging.info("This params file does not have an explicit -o option.") + just_obj_paths = params_array + + def make_obj(obj_file: str) -> TrainingIRExtractor: + return TrainingIRExtractor( + obj_relative_path=obj_file, + output_base_dir=output_dir, + obj_base_dir=obj_base_dir, + ) + + return [make_obj(obj_file) for obj_file in just_obj_paths] + + +def load_from_directory( + obj_base_dir: str, output_dir: str +) -> List[TrainingIRExtractor]: + """Create an object file array by globbing an entire drectory. + + Args: + obj_base_dir: The base build directory that all object files will be + written out as being relative to. + output_dir: The output directory where extracted .bc and .cmd files should + be placed. + """ + paths = [str(p) for p in pathlib.Path(obj_base_dir).glob("**/*.o")] + + def make_spec(obj_file: str): + return TrainingIRExtractor( + obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir), + output_base_dir=output_dir, + obj_base_dir=obj_base_dir, + ) + + return [make_spec(path) for path in paths] + + +def load_for_lld_thinlto( + obj_base_dir: str, output_dir: str +) -> List[TrainingIRExtractor]: + # .3.import.bc is the suffix attached to post-merge-pre-opt ('postimport') + # IR bitcode saved by lld. It is hardcoded into lld. ThinLTO index files + # are also emitted next to the postimport bitcode, with the suffix + # .thinlto.bc instead + paths = [str(p) for p in pathlib.Path(obj_base_dir).glob("**/*.3.import.bc")] + + def make_spec(obj_file: str): + return TrainingIRExtractor( + # Cut away .3.import.bc + obj_relative_path=os.path.relpath(obj_file, start=obj_base_dir)[:-12], + output_base_dir=output_dir, + obj_base_dir=obj_base_dir, + ) + + return [make_spec(path) for path in paths] + + +def run_extraction( + objs: List[TrainingIRExtractor], + num_workers: int, + llvm_objcopy_path: str, + cmd_filter: str, + thinlto_build: str, + cmd_section_name: str, + bitcode_section_name: str, +): + """Extracts all specified object files into the corpus directory. + + Args: + objs: A list of TrainingIRExtractor Objects that represent the object files + to extract bitcode/commands from. + num_workers: The number of parallel processes to spawn to run the + extraction. + llvm_objcopy_path: The path to the llvm-objcopy to use for dumping sections. + cmd_filter: A regular expression that is used to select for compilations + performed with specific flags. If you want to include all compilations, + set this to None. + thinlto_build: Whether or not this is a ThinLTO build, and if so, the type. + Set this to None if the build was not done with ThinLTO. + cmd_section_name: The name of the command line section created by the + bitcode embedding. + bitcode_section_name: The name of the bitcode section created by the + bitcode embedding. 
+ """ + extract_artifacts = functools.partial( + TrainingIRExtractor.extract, + llvm_objcopy_path=llvm_objcopy_path, + cmd_filter=cmd_filter, + thinlto_build=thinlto_build, + cmd_section_name=cmd_section_name, + bitcode_section_name=bitcode_section_name, + ) + + with multiprocessing.Pool(num_workers) as pool: + relative_output_paths = pool.map(extract_artifacts, objs) + pool.close() + pool.join() + return relative_output_paths + + +def write_corpus_manifest( + thinlto_build: str, relative_output_paths: List[str], output_dir: str +): + """Writes a corpus_manifest.json containing all necessary information about + the corpus. + + Args: + thinlto_build: Whether or not the build was done with ThinLTO and if so, + what kind of ThinLTO. Set this to none if the build was not performed with + ThinLTO. + relative_output_paths: The relative (to the corpus directory) output paths + of all the bitcode files that should be placed in the corpus manifest + output_dir: The corpus directory where the corpus manifest should be + placed. + """ + # This comes first rather than later so global_command_override is at the top + # of the .json after being written + if thinlto_build == "local": + corpus_description = {"global_command_override": _UNSPECIFIED_OVERRIDE} + else: + corpus_description = {} + + corpus_description.update( + { + "has_thinlto": thinlto_build is not None, + "modules": [path for path in relative_output_paths if path is not None], + } + ) + + with open( + os.path.join(output_dir, "corpus_description.json"), "w", encoding="utf-8" + ) as f: + json.dump(corpus_description, f, indent=2) diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py new file mode 100644 index 0000000000000..edb0ecd853de2 --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py @@ -0,0 +1,54 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Tool for making a corpus from arbitrary bitcode. + +To create a corpus from a set of bitcode files in an input directory, run +the following command: + +PYTHONPATH=$PYTHONPATH:. python3 ./compiler_opt/tools/make_corpus.py \ + --input_dir= \ + --output_dir= \ + --default_args="" +""" + +from absl import app +from absl import flags +from absl import logging + +from mlgo.corpus import make_corpus_lib + +flags.DEFINE_string("input_dir", None, "The input directory.") +flags.DEFINE_string("output_dir", None, "The output directory.") +flags.DEFINE_string( + "default_args", + "", + "The compiler flags to compile with when using downstream tooling.", +) + +flags.mark_flag_as_required("input_dir") +flags.mark_flag_as_required("output_dir") + +FLAGS = flags.FLAGS + + +def main(_): + logging.warning( + "Using this tool does not guarantee that the bitcode is taken at " + "the correct stage for consumption during model training. Make " + "sure to validate assumptions about where the bitcode is coming " + "from before using it in production." 
+ ) + relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir) + make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir, FLAGS.output_dir) + make_corpus_lib.write_corpus_manifest( + relative_paths, FLAGS.output_dir, FLAGS.default_args.split() + ) + + +def entrypoint(): + app.run(main) + + +if __name__ == "__main__": + entrypoint() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py new file mode 100644 index 0000000000000..697c97ebf6ee2 --- /dev/null +++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus_lib.py @@ -0,0 +1,77 @@ +# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +"""Library functions for making a corpus from arbitrary bitcode.""" + +import pathlib +import os +import shutil +import json + +from typing import List, Optional + +BITCODE_EXTENSION = ".bc" + + +def load_bitcode_from_directory(bitcode_base_dir: str) -> List[str]: + """Finds bitcode files to extract from a given directory. + + Args: + bitcode_base_dir: The base directory where the bitcode to be copied + is from. + output_dir: The directory to place the bitcode in. + + Returns an array of paths representing the relative path to the bitcode + file from the base direcotry. + """ + paths = [ + str(p)[: -len(BITCODE_EXTENSION)] + for p in pathlib.Path(bitcode_base_dir).glob("**/*" + BITCODE_EXTENSION) + ] + + return [os.path.relpath(full_path, start=bitcode_base_dir) for full_path in paths] + + +def copy_bitcode( + relative_paths: List[str], bitcode_base_dir: str, output_dir: str +) -> None: + """Copies bitcode files from the base directory to the output directory. + + Args: + relative_paths: An array of relative paths to bitcode files that are copied + over to the output directory, preserving relative location. + bitcode_base_dir: The base directory where the bitcode is located. + output_dir: The output directory to place the bitcode in. + """ + for relative_path in relative_paths: + base_path = os.path.join(bitcode_base_dir, relative_path + BITCODE_EXTENSION) + destination_path = os.path.join(output_dir, relative_path + BITCODE_EXTENSION) + os.makedirs(os.path.dirname(destination_path), exist_ok=True) + shutil.copy(base_path, destination_path) + + +def write_corpus_manifest( + relative_output_paths: List[str], + output_dir: str, + default_args: Optional[List[str]] = None, +) -> None: + """Creates a corpus manifest describing the bitcode that has been found. + + Args: + relative_output_paths: A list of paths to each bitcode file relative to the + output directory. + outout_dir: The output directory where the corpus is being created. 
+ default_args: An array of compiler flags that should be used to compile + the bitcode when using further downstream tooling.""" + if default_args is None: + default_args = [] + corpus_description = { + "global_command_override": default_args, + "has_thinlto": False, + "modules": [path for path in relative_output_paths if path is not None], + } + + with open( + os.path.join(output_dir, "corpus_description.json"), "w", encoding="utf-8" + ) as description_file: + json.dump(corpus_description, description_file, indent=2) diff --git a/llvm/utils/mlgo-utils/pyproject.toml b/llvm/utils/mlgo-utils/pyproject.toml new file mode 100644 index 0000000000000..be2af86cd05df --- /dev/null +++ b/llvm/utils/mlgo-utils/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +requires = ["setuptools>=61.0"] +build-backend = "setuptools.build_meta" + +[project] +name = "mlgo" +description = "Tooling for ML in LLVM" +readme = "README.md" +requires-python = ">=3.8,<3.11" +dependencies = [ + "absl-py>=1.0.0" +] +dynamic = ["version"] +license = {text = "Apache-2.0 WITH LLVM-exception"} +classifiers = [ + "License :: OSI Approved :: Apache Software License" +] + +[tool.setuptools.dynamic] +version = {attr = "mlgo.__version__"} + +[project.scripts] +combine_training_corpus = "mlgo.combine_training_corpus:entrypoint" +extract_ir = "mlgo.extract_ir:entrypoint" +make_corpus = "mlgo.make_corpus:entrypoint" diff --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py new file mode 100644 index 0000000000000..dbfdc78a87533 --- /dev/null +++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py @@ -0,0 +1,132 @@ +# REQUIRES: python-38, absl + +## Test the functionality of combine_training_corpus_lib + +import json +import os +import sys + +from mlgo.corpus import combine_training_corpus_lib + +## Test that combining two training corpora works as expected + +# RUN: rm -rf %t.dir && mkdir %t.dir +# RUN: mkdir %t.dir/subcorpus1 +# RUN: mkdir %t.dir/subcorpus2 +# RUN: %python %s test_combine_corpus %t.dir | FileCheck %s --check-prefix CHECK-COMBINE-CORPUS + + +def test_combine_corpus(corpus_dir): + subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1") + subcorpus2_dir = os.path.join(corpus_dir, "subcorpus2") + subcorpus1_description = { + "has_thinlto": False, + "modules": ["test1.o", "test2.o"], + } + subcorpus2_description = { + "has_thinlto": False, + "modules": ["test3.o", "test4.o"], + } + with open( + os.path.join(subcorpus1_dir, "corpus_description.json"), "w" + ) as corpus1_description_handle: + json.dump(subcorpus1_description, corpus1_description_handle) + with open( + os.path.join(subcorpus2_dir, "corpus_description.json"), "w" + ) as corpus2_description_handle: + json.dump(subcorpus2_description, corpus2_description_handle) + combine_training_corpus_lib.combine_corpus(corpus_dir) + with open( + os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8" + ) as combined_corpus_description_file: + combined_corpus_description = json.load(combined_corpus_description_file) + print(combined_corpus_description["has_thinlto"]) + # CHECK-COMBINE-CORPUS: False + for module in sorted(combined_corpus_description["modules"]): + print(module) + # CHECK-COMBINE-CORPUS: subcorpus1/test1.o + # CHECK-COMBINE-CORPUS: subcorpus1/test2.o + # CHECK-COMBINE-CORPUS: subcorpus2/test3.o + # CHECK-COMBINE-CORPUS: subcorpus2/test4.o + + +## Test that we handle the empty folder case gracefully + +# RUN: rm -rf %t.dir && mkdir %t.dir +# 
RUN: mkdir %t.dir/subcorpus1 +# RUN: mkdir %t.dir/empty_dir +# RUN: %python %s test_empty_folder %t.dir | FileCheck %s --check-prefix CHECK-EMPTY-DIR + + +def test_empty_folder(corpus_dir): + subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1") + subcorpus1_description = {"modules": ["test1.o", "test2.o"]} + with open( + os.path.join(subcorpus1_dir, "corpus_description.json"), "w" + ) as subcorpus1_description_handle: + json.dump(subcorpus1_description, subcorpus1_description_handle) + combine_training_corpus_lib.combine_corpus(corpus_dir) + with open( + os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8" + ) as combined_corpus_description_file: + combined_corpus_description = json.load(combined_corpus_description_file) + print(len(combined_corpus_description["modules"])) + # CHECK-EMPTY-DIR: 2 + + +## Test that we ignore extra files that will not end up contributing to the +## corpus. + +# RUN: rm -rf %t.dir && mkdir %t.dir +# RUN: mkdir %t.dir/subcorpus1 +# RUN: touch %t.dir/empty.log +# RUN: %python %s test_ignore_extra_file %t.dir | FileCheck %s --check-prefix CHECK-IGNORE-EXTRA-FILE + + +def test_ignore_extra_file(corpus_dir): + subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1") + subcorpus1_description = {"modules": ["test1.o", "test2.o"]} + with open( + os.path.join(subcorpus1_dir, "corpus_description.json"), "w" + ) as subcorpus1_description_handle: + json.dump(subcorpus1_description, subcorpus1_description_handle) + combine_training_corpus_lib.combine_corpus(corpus_dir) + with open( + os.path.join(corpus_dir, "corpus_description.json"), encoding="utf-8" + ) as combined_corpus_description_file: + combined_corpus_description = json.load(combined_corpus_description_file) + print(len(combined_corpus_description["modules"])) + # CHECK-IGNORE-EXTRA-FILE: 2 + + +## Test that we raise an error in the case where the corpora differ in a +## substantial way. + +# RUN: rm -rf %t.dir && mkdir %t.dir +# RUN: mkdir %t.dir/subcorpus1 +# RUN: mkdir %t.dir/subcorpus2 +# RUN: %python %s test_different_corpora %t.dir | FileCheck %s --check-prefix CHECK-DIFFERENT-CORPORA + + +def test_different_corpora(corpus_dir): + subcorpus1_dir = os.path.join(corpus_dir, "subcorpus1") + subcorpus2_dir = os.path.join(corpus_dir, "subcorpus2") + subcorpus1_description = {"has_thinlto": False, "modules": ["test1.o"]} + subcorpus2_description = {"has_thinlto": True, "modules": ["test2.o"]} + with open( + os.path.join(subcorpus1_dir, "corpus_description.json"), "w" + ) as subcorpus1_description_handle: + json.dump(subcorpus1_description, subcorpus1_description_handle) + with open( + os.path.join(subcorpus2_dir, "corpus_description.json"), "w" + ) as subcorpus2_description_handle: + json.dump(subcorpus2_description, subcorpus2_description_handle) + try: + combine_training_corpus_lib.combine_corpus(corpus_dir) + except ValueError: + print("ValueError") + # CHECK-DIFFERENT-CORPORA: ValueError + + +if __name__ == "__main__": + globals()[sys.argv[1]](*sys.argv[2:]) diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py new file mode 100644 index 0000000000000..3ed52ec21de95 --- /dev/null +++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py @@ -0,0 +1,339 @@ +# REQUIRES: python-38, absl + +## Test the functionality of extract_ir_lib + +import os.path +import sys + +from mlgo.corpus import extract_ir_lib + +## Test that we can convert a compilation database with a single compilation +## command in it. 
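## (Editorial note, not part of the original patch.) The dictionary passed in
## below follows the usual compile_commands.json entry shape, e.g.:
##
##   {"directory": "/output/directory",
##    "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o",
##    "file": "/some/path/lib/foo/bar.cc"}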
+ +# RUN: %python %s test_one_conversion | FileCheck %s --check-prefix CHECK-ONE-CONVERSION + + +def test_one_conversion(): + obj = extract_ir_lib.convert_compile_command_to_objectfile( + { + "directory": "/output/directory", + "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o", + "file": "/some/path/lib/foo/bar.cc", + }, + "/corpus/destination/path", + ) + print(obj.input_obj()) + # CHECK-ONE-CONVERSION: /output/directory/lib/bar.o + print(obj.relative_output_path()) + # CHECK-ONE-CONVERSION: lib/bar.o + print(obj.cmd_file()) + # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.cmd + print(obj.bc_file()) + # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.bc + print(obj.thinlto_index_file()) + # CHECK-ONE-CONVERSION: /corpus/destination/path/lib/bar.o.thinlto.bc + + +## Test that we can convert an arguments style compilation database + +# RUN: %python %s test_one_conversion_arguments_style | FileCheck %s --check-prefix CHECK-ARGUMENTS-STYLE + + +def test_one_conversion_arguments_style(): + obj = extract_ir_lib.convert_compile_command_to_objectfile( + { + "directory": "/output/directory", + "arguments": [ + "-cc1", + "-c", + "/some/path/lib/foo/bar.cc", + "-o", + "lib/bar.o", + ], + "file": "/some/path/lib/foo/bar.cc", + }, + "/corpus/destination/path", + ) + print(obj.input_obj()) + # CHECK-ARGUMENTS-STYLE: /output/directory/lib/bar.o + print(obj.relative_output_path()) + # CHECK-ARGUMENTS-STYLE: lib/bar.o + print(obj.cmd_file()) + # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.cmd + print(obj.bc_file()) + # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.bc + print(obj.thinlto_index_file()) + # CHECK-ARGUMENTS-STYLE: /corpus/destination/path/lib/bar.o.thinlto.bc + + +## Test that converting multiple files works as well + +# RUN: %python %s test_multiple_conversion | FileCheck %s --check-prefix CHECK-MULTIPLE-CONVERSION + + +def test_multiple_conversion(): + res = extract_ir_lib.load_from_compile_commands( + [ + { + "directory": "/output/directory", + "command": "-cc1 -c /some/path/lib/foo/bar.cc -o lib/bar.o", + "file": "/some/path/lib/foo/bar.cc", + }, + { + "directory": "/output/directory", + "command": "-cc1 -c /some/path/lib/foo/baz.cc -o lib/other/baz.o", + "file": "/some/path/lib/foo/baz.cc", + }, + ], + "/corpus/destination/path", + ) + res = list(res) + print(res[0].input_obj()) + # CHECK-MULTIPLE-CONVERSION: /output/directory/lib/bar.o + print(res[0].relative_output_path()) + # CHECK-MULTIPLE-CONVERSION: lib/bar.o + print(res[0].cmd_file()) + # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.cmd + print(res[0].bc_file()) + # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.bc + print(res[0].thinlto_index_file()) + # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/bar.o.thinlto.bc + + print(res[1].input_obj(), "/output/directory/lib/other/baz.o") + # CHECK-MULTIPLE-CONVERSION: /output/directory/lib/other/baz.o + print(res[1].relative_output_path(), "lib/other/baz.o") + # CHECK-MULTIPLE-CONVERSION: lib/other/baz.o + print(res[1].cmd_file()) + # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.cmd + print(res[1].bc_file()) + # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.bc + print(res[1].thinlto_index_file()) + # CHECK-MULTIPLE-CONVERSION: /corpus/destination/path/lib/other/baz.o.thinlto.bc + + +## Test that we generate the correct objcopy commands for extracting commands + +# RUN: %python %s test_command_extraction | FileCheck %s --check-prefix 
CHECK-COMMAND-EXTRACT + + +def test_command_extraction(): + obj = extract_ir_lib.TrainingIRExtractor( + obj_relative_path="lib/obj_file.o", + output_base_dir="/where/corpus/goes", + obj_base_dir="/foo/bar", + ) + extraction_cmd1 = obj._get_extraction_cmd_command( + "/bin/llvm_objcopy_path", ".llvmcmd" + ) + for part in extraction_cmd1: + print(part) + # CHECK-COMMAND-EXTRACT: /bin/llvm_objcopy_path + # CHECK-COMMAND-EXTRACT: --dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd + # CHECK-COMMAND-EXTRACT: /foo/bar/lib/obj_file.o + # CHECK-COMMAND-EXTRACT: /dev/null + + extraction_cmd2 = obj._get_extraction_bc_command( + "/bin/llvm_objcopy_path", ".llvmbc" + ) + for part in extraction_cmd2: + print(part) + # CHECK-COMMAND-EXTRACT: /bin/llvm_objcopy_path + # CHECK-COMMAND-EXTRACT: --dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc + # CHECK-COMMAND-EXTRACT: /foo/bar/lib/obj_file.o + # CHECK-COMMAND-EXTRACT: /dev/null + + +## Test that we generate the correct extraction commands without specifying +## an output base directory. + +# RUN: %python %s test_command_extraction_no_basedir | FileCheck %s --check-prefix CHECK-COMMAND-EXTRACT-NOBASEDIR + + +def test_command_extraction_no_basedir(): + obj = extract_ir_lib.TrainingIRExtractor("lib/obj_file.o", "/where/corpus/goes") + extraction_cmd1 = obj._get_extraction_cmd_command( + "/bin/llvm_objcopy_path", ".llvmcmd" + ) + for part in extraction_cmd1: + print(part) + # CHECK-COMMAND-EXTRACT-NOBASEDIR: /bin/llvm_objcopy_path + # CHECK-COMMAND-EXTRACT-NOBASEDIR: --dump-section=.llvmcmd=/where/corpus/goes/lib/obj_file.o.cmd + # CHECK-COMMAND-EXTRACT-NOBASEDIR: lib/obj_file.o + # CHECK-COMMAND-EXTRACT-NOBASEDIR: /dev/null + + extraction_cmd2 = obj._get_extraction_bc_command( + "/bin/llvm_objcopy_path", ".llvmbc" + ) + for part in extraction_cmd2: + print(part) + # CHECK-COMMAND-EXTRACT-NOBASEDIR: /bin/llvm_objcopy_path + # CHECK-COMMAND-EXTRACT-NOBASEDIR: --dump-section=.llvmbc=/where/corpus/goes/lib/obj_file.o.bc + # CHECK-COMMAND-EXTRACT-NOBASEDIR: lib/obj_file.o + # CHECK-COMMAND-EXTRACT-NOBASEDIR: /dev/null + + +## Test that we can extract a corpus from lld parameters + +# RUN: %python %s test_lld_params | FileCheck %s --check-prefix CHECK-LLD-PARAMS + + +def test_lld_params(): + lld_opts = [ + "-o", + "output/dir/exe", + "lib/obj1.o", + "somelib.a", + "-W,blah", + "lib/dir/obj2.o", + ] + obj = extract_ir_lib.load_from_lld_params(lld_opts, "/some/path", "/tmp/out") + print(obj[0].input_obj()) + # CHECK-LLD-PARAMS: /some/path/lib/obj1.o + print(obj[0].relative_output_path()) + # CHECK-LLD-PARAMS: lib/obj1.o + print(obj[0].cmd_file()) + # CHECK-LLD-PARAMS: /tmp/out/lib/obj1.o.cmd + print(obj[0].thinlto_index_file()) + # CHECK-LLD-PARAMS: /tmp/out/lib/obj1.o.thinlto.bc + print(obj[1].input_obj()) + # CHECK-LLD-PARMAS: /some/path/lib/dir/obj2.o + + +## Test that we can load a corpus from a directory containing object files + +# RUN: rm -rf %t.dir && mkdir %t.dir +# RUN: mkdir %t.dir/subdir +# RUN: touch %t.dir/subdir/test1.o +# RUN: touch %t.dir/subdir/test2.o +# RUN: %python %s test_load_from_directory %t.dir | FileCheck %s --check-prefix CHECK-LOAD-DIR + + +def test_load_from_directory(tempdir): + objs = extract_ir_lib.load_from_directory(tempdir, "/output") + for index, obj in enumerate(sorted(objs, key=lambda x: x._obj_relative_path)): + print(obj._obj_relative_path, f"subdir/test{index + 1:d}.o") + # CHECK-LOAD-DIR: subdir/test1.o + # Explicitly check for equality here as we can not check within + # FileCheck the exact value as lit 
substitutions do not work in
+        # FileCheck lines.
+        print(obj._obj_base_dir == tempdir)
+        # CHECK-LOAD-DIR: True
+        print(obj._output_base_dir)
+        # CHECK-LOAD-DIR: /output
+
+
+## Test that we can load a corpus in the lld thinLTO case
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: touch %t.dir/1.3.import.bc
+# RUN: touch %t.dir/2.3.import.bc
+# RUN: touch %t.dir/3.3.import.bc
+# RUN: touch %t.dir/1.thinlto.bc
+# RUN: touch %t.dir/2.thinlto.bc
+# RUN: touch %t.dir/3.thinlto.bc
+# RUN: %python %s test_lld_thinlto_discovery %t.dir | FileCheck %s --check-prefix CHECK-LLD-THINLTO-DISCOVERY
+
+
+def test_lld_thinlto_discovery(tempdir):
+    obj = extract_ir_lib.load_for_lld_thinlto(tempdir, "/output")
+    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+        print(o._obj_relative_path)
+        # Explicitly check for equality as we can not check within FileCheck
+        # using the lit substitution for the temp dir
+        print(o._obj_base_dir == tempdir)
+        print(o._output_base_dir)  # outdir
+    # CHECK-LLD-THINLTO-DISCOVERY: 1
+    # CHECK-LLD-THINLTO-DISCOVERY: True
+    # CHECK-LLD-THINLTO-DISCOVERY: /output
+    # CHECK-LLD-THINLTO-DISCOVERY: 2
+    # CHECK-LLD-THINLTO-DISCOVERY: True
+    # CHECK-LLD-THINLTO-DISCOVERY: /output
+    # CHECK-LLD-THINLTO-DISCOVERY: 3
+    # CHECK-LLD-THINLTO-DISCOVERY: True
+    # CHECK-LLD-THINLTO-DISCOVERY: /output
+
+
+## Test that we can load a corpus in the nested lld thinLTO case
+
+# RUN: mkdir %t.dir/nest
+# RUN: mv %t.dir/*.bc %t.dir/nest
+# RUN: %python %s test_lld_thinlto_discovery_nested %t.dir | FileCheck %s --check-prefix CHECK-LLD-THINLTO-DISCOVERY-NESTED
+
+
+def test_lld_thinlto_discovery_nested(outer):
+    obj = extract_ir_lib.load_for_lld_thinlto(outer, "/output")
+    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+        print(o._obj_relative_path)
+        print(o._obj_base_dir == outer)
+        print(o._output_base_dir)
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/1
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/2
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: nest/3
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: True
+    # CHECK-LLD-THINLTO-DISCOVERY-NESTED: /output
+
+
+## Test that the lld extraction works as expected
+
+# RUN: rm -rf %t.dir.out && mkdir %t.dir.out
+# RUN: %python %s test_lld_thinlto_extraction %t.dir %t.dir.out | FileCheck %s --check-prefix CHECK-LLD-THINLTO-EXTRACTION-PY
+# ls %t.dir.out/nest | FileCheck %s --check-prefix CHECK-LLD-THINLTO-EXTRACTION
+
+# CHECK-LLD-THINLTO-EXTRACTION: 1
+# CHECK-LLD-THINLTO-EXTRACTION: 2
+# CHECK-LLD-THINLTO-EXTRACTION: 3
+# CHECK-LLD-THINLTO-EXTRACTION: 1.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 2.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 3.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 1.thinlto.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 2.thinlto.bc
+# CHECK-LLD-THINLTO-EXTRACTION: 3.thinlto.bc
+
+
+def test_lld_thinlto_extraction(outer, outdir):
+    obj = extract_ir_lib.load_for_lld_thinlto(outer, outdir)
+    for i, o in enumerate(sorted(obj, key=lambda x: x._obj_relative_path)):
+        mod_path = o.extract(thinlto_build="local")
+        print(mod_path)
+    # CHECK-LLD-THINLTO-EXTRACTION-PY: 1
+    # CHECK-LLD-THINLTO-EXTRACTION-PY: 2
+    # CHECK-LLD-THINLTO-EXTRACTION-PY: 3
+
+
+## Test that filtering works correctly
+
+# RUN: %python %s test_filtering | FileCheck %s --check-prefix CHECK-TEST-FILTERING
+
+
+def test_filtering():
+    cmdline = "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/out.o"
+    print(extract_ir_lib.should_include_module(cmdline, None))
+    # CHECK-TEST-FILTERING: True
+    print(extract_ir_lib.should_include_module(cmdline, ".*"))
+    # CHECK-TEST-FILTERING: True
+    print(extract_ir_lib.should_include_module(cmdline, "^-Oz$"))
+    # CHECK-TEST-FILTERING: True
+    print(extract_ir_lib.should_include_module(cmdline, "^-O3$"))
+    # CHECK-TEST-FILTERING: False
+
+
+## Test that we extract the thinLTO index correctly
+
+# RUN: %python %s test_thinlto_index_extractor | FileCheck %s --check-prefix CHECK-THINLTO-INDEX-EXTRACTOR
+
+
+def test_thinlto_index_extractor():
+    cmdline = (
+        "-cc1\0x/y/foobar.cpp\0-Oz\0-Ifoo\0-o\0bin/"
+        "out.o\0-fthinlto-index=foo/bar.thinlto.bc"
+    )
+    print(extract_ir_lib.get_thinlto_index(cmdline, "/the/base/dir"))
+    # CHECK-THINLTO-INDEX-EXTRACTOR: /the/base/dir/foo/bar.thinlto.bc
+
+
+if __name__ == "__main__":
+    globals()[sys.argv[1]](*sys.argv[2:])
diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py
new file mode 100644
index 0000000000000..0f970414b1aec
--- /dev/null
+++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py
@@ -0,0 +1,71 @@
+# REQUIRES: python-38, absl
+
+## Test the functionality of make_corpus_lib
+
+import json
+import os
+import sys
+
+from mlgo.corpus import make_corpus_lib
+
+## Test that when we load the bitcode from a directory using the
+## load_bitcode_from_directory function, we get the expected results.
+
+# RUN: rm -rf %t.dir && mkdir %t.dir
+# RUN: mkdir %t.dir/nested
+# RUN: touch %t.dir/nested/test1.bc
+# RUN: touch %t.dir/nested/test2.bc
+# RUN: %python %s test_load_bitcode_from_directory %t.dir | FileCheck %s --check-prefix CHECK-LOAD
+
+
+def test_load_bitcode_from_directory(work_dir):
+    relative_paths = make_corpus_lib.load_bitcode_from_directory(work_dir)
+    relative_paths = sorted(relative_paths)
+    for relative_path in relative_paths:
+        print(relative_path)
+        # CHECK-LOAD: nested/test1
+        # CHECK-LOAD: nested/test2
+
+
+## Test that when we copy the bitcode given a list of relative paths, the
+## appropriate files are copied over.
+
+# RUN: rm -rf %t.dir1 && mkdir %t.dir1
+# RUN: %python %s test_copy_bitcode %t.dir %t.dir1
+# RUN: ls %t.dir1/nested | FileCheck %s --check-prefix CHECK-COPY
+
+# CHECK-COPY: test1.bc
+# CHECK-COPY: test2.bc
+
+
+def test_copy_bitcode(directory, output_dir):
+    relative_paths = ["nested/test1", "nested/test2"]
+    make_corpus_lib.copy_bitcode(relative_paths, directory, output_dir)
+
+
+## Test that we get the expected corpus manifest when writing a corpus
+## manifest to the specified directory.
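## (Editorial sketch, not part of the original patch.) With the arguments used
## below, the emitted corpus_description.json should look roughly like:
##
##   {"global_command_override": ["-O3", "-c"],
##    "has_thinlto": false,
##    "modules": ["test/test1", "test/test2"]}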
+ +# RUN: %python %s test_write_corpus_manifest %t.dir1 | FileCheck %s --check-prefix CHECK-MANIFEST + + +def test_write_corpus_manifest(output_dir): + relative_output_paths = ["test/test1", "test/test2"] + default_args = ["-O3", "-c"] + make_corpus_lib.write_corpus_manifest( + relative_output_paths, output_dir, default_args + ) + with open( + os.path.join(output_dir, "corpus_description.json"), encoding="utf-8" + ) as corpus_description_file: + corpus_description = json.load(corpus_description_file) + print(corpus_description["global_command_override"]) + # CHECK-MANIFEST: ['-O3', '-c'] + print(corpus_description["has_thinlto"]) + # CHECK-MANIFEST: False + print(corpus_description["modules"]) + # CHECK-MANIFEST: ['test/test1', 'test/test2'] + + +if __name__ == "__main__": + globals()[sys.argv[1]](*sys.argv[2:]) diff --git a/llvm/utils/mlgo-utils/tests/lit.cfg b/llvm/utils/mlgo-utils/tests/lit.cfg new file mode 100644 index 0000000000000..055f0945942fc --- /dev/null +++ b/llvm/utils/mlgo-utils/tests/lit.cfg @@ -0,0 +1,15 @@ +import lit.formats + +from lit.llvm import llvm_config + +config.name = "mlgo-utils" +config.test_format = lit.formats.ShTest(execute_external=False) + +config.suffixes = [".py"] + +config.test_source_root = os.path.dirname(__file__) +config.test_exec_root = config.obj_root + +config.environment["PYTHONPATH"] = os.path.join(config.src_root, "utils", "mlgo-utils") + +llvm_config.use_default_substitutions() diff --git a/llvm/utils/mlgo-utils/tests/lit.local.cfg b/llvm/utils/mlgo-utils/tests/lit.local.cfg new file mode 100644 index 0000000000000..90cdf8ba618ed --- /dev/null +++ b/llvm/utils/mlgo-utils/tests/lit.local.cfg @@ -0,0 +1,14 @@ +import sys + +# TODO(boomanaiden154): Remove this flag once the minimum Python version for +# the entire project has been bumped to 3.8. +if sys.version_info > (3,8): + config.available_features.add("python-38") + +# TODO(boomanaiden154): Remove this flag once the scripts are converted to +# not use absl anymore. +try: + import absl + config.available_features.add("absl") +except: + pass diff --git a/llvm/utils/mlgo-utils/tests/lit.site.cfg.in b/llvm/utils/mlgo-utils/tests/lit.site.cfg.in new file mode 100644 index 0000000000000..22e1524e6a8fd --- /dev/null +++ b/llvm/utils/mlgo-utils/tests/lit.site.cfg.in @@ -0,0 +1,10 @@ +@LIT_SITE_CFG_IN_HEADER@ + +config.src_root = "@LLVM_SOURCE_DIR@" +config.obj_root = "@LLVM_BINARY_DIR@" +config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") + +import lit.llvm +lit.llvm.initialize(lit_config, config) + +lit_config.load_config(config, "@LLVM_SOURCE_DIR@/utils/mlgo-utils/tests/lit.cfg") From fd49ef1eb3de7eb2be5f7ddd469a005917cc1988 Mon Sep 17 00:00:00 2001 From: Ben Dunbobbin Date: Sat, 20 Jan 2024 01:35:06 +0000 Subject: [PATCH 201/843] Removed a late added test-case from the tests for #74629 This test-case was generating invalid IR and causing the test to fail. Given that the reviewer initially accepted the change without this test case I feel that it is appropriate to remove this test case, to get the build to pass, and find a way to reimplement this test-case in a later commit. 
--- clang/test/CodeGenCXX/visibility-dllstorageclass.cpp | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp b/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp index a44ff1316d94c..3ea00bd488992 100644 --- a/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp +++ b/clang/test/CodeGenCXX/visibility-dllstorageclass.cpp @@ -43,17 +43,6 @@ // RUN: -x c++ %s -S -emit-llvm -o - | \ // RUN: FileCheck %s --check-prefixes=ALL_KEEP -//// Show that omitting -fvisibility-from-dllstorageclass causes the other options to be ignored. -// RUN: %clang_cc1 -triple x86_64-unknown-windows-itanium -fdeclspec \ -// RUN: -fvisibility=hidden \ -// RUN: -fapply-global-visibility-to-externs \ -// RUN: -fvisibility-dllexport=protected \ -// RUN: -fvisibility-nodllstorageclass=protected \ -// RUN: -fvisibility-externs-dllimport=protected \ -// RUN: -fvisibility-externs-nodllstorageclass=protected \ -// RUN: -x c++ %s -S -emit-llvm -o - | \ -// RUN: FileCheck %s --check-prefixes=ALL_KEEP - // Local static void l() {} void use_locals(){l();} From 99ffe71921f5286d17f3c07810f639e919cf32d9 Mon Sep 17 00:00:00 2001 From: Micah Weston Date: Fri, 19 Jan 2024 20:40:05 -0500 Subject: [PATCH 202/843] [llvm-objdump] Disables running pgo-analysis-map symbolizing on windows. --- llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml b/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml index c4bf443f920a2..a3dc0a7b6425c 100644 --- a/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml +++ b/llvm/test/tools/llvm-objdump/X86/elf-pgoanalysismap.yaml @@ -2,6 +2,9 @@ ## contain PGO data, --symbolize-operands is able to label the basic blocks ## correctly. +## Fails on windows (https://github.com/llvm/llvm-project/issues/60013). +# UNSUPPORTED: system-windows + ## Check the case where we only have entry counts. 
# RUN: yaml2obj --docnum=1 %s -o %t1 From 3b61f5a1bc43bb0e303bf6da120a7920616799a6 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 05:29:48 -0700 Subject: [PATCH 203/843] Apply clang-tidy fixes for performance-unnecessary-value-param in DataLayoutPropagation.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp index 9e71a0765bbb5..6a971b37cad7c 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DataLayoutPropagation.cpp @@ -369,7 +369,7 @@ static GenericOp packGenericOp(RewriterBase &rewriter, GenericOp genericOp, /// } -> tensor static FailureOr bubbleUpPackOpThroughGenericOp(RewriterBase &rewriter, tensor::PackOp packOp, - ControlPropagationFn controlFn) { + const ControlPropagationFn &controlFn) { auto genericOp = packOp.getSource().getDefiningOp(); if (!genericOp) return failure(); From f19f2139741e51963fd22637ba70e4063998a184 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 05:32:33 -0700 Subject: [PATCH 204/843] Apply clang-tidy fixes for llvm-else-after-return in DropUnitDims.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp index e6f4ed5b51b1e..559ffda4494d2 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/DropUnitDims.cpp @@ -457,8 +457,8 @@ LogicalResult linalg::dropUnitDims(RewriterBase &rewriter, GenericOp genericOp, Type operandType = operand.get().getType(); if (auto memrefOperandType = dyn_cast_or_null(operandType)) { return memrefOperandType.getLayout().isIdentity(); - } else if (auto tensorOperandType = - dyn_cast(operandType)) { + } + if (auto tensorOperandType = dyn_cast(operandType)) { return tensorOperandType.getEncoding() == nullptr; } return false; From 46ce993dd414a0ac3b4f6d13ffaff161f3e60efa Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 05:33:46 -0700 Subject: [PATCH 205/843] Apply clang-tidy fixes for llvm-else-after-return in ElementwiseOpFusion.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp index e4cb2f223f3c7..286b07669a47f 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/ElementwiseOpFusion.cpp @@ -1403,11 +1403,10 @@ static Value getCollapsedOpOperand(Location loc, LinalgOp op, return builder .create(loc, operand, operandReassociation) .getResult(); - } else { - return builder - .create(loc, operand, operandReassociation) - .getResult(); } + return builder + .create(loc, operand, operandReassociation) + .getResult(); } /// Modify the `linalg.index` operations in the original generic op, to its From 197a73f0192593da1a211b17f18f843174e18787 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 05:38:46 -0700 Subject: [PATCH 206/843] Apply clang-tidy fixes for llvm-include-order in Fusion.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff 
--git a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp index e48188fe516d3..a85532b2f755a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Fusion.cpp @@ -29,8 +29,8 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" -#include #include +#include #define DEBUG_TYPE "linalg-fusion" From b1d4265a5f5345065e5c2aedf50787da05ddcc08 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 05:47:10 -0700 Subject: [PATCH 207/843] Apply clang-tidy fixes for llvm-qualified-auto in Promotion.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp index 9311fd0bb2a47..ac896d6c30d04 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Promotion.cpp @@ -331,7 +331,7 @@ promoteSubViews(ImplicitLocOpBuilder &b, // Copy data into the promoted buffers. Use callback if provided. for (auto v : options.subViews) { - auto info = promotionInfoMap.find(v.first); + auto *info = promotionInfoMap.find(v.first); if (info == promotionInfoMap.end()) continue; if (options.operandsNumbersToCopyIn.count(v.first) == 0) From 01c5b5c1040ac8ee95d7a07d84546750a4e9e1c3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 19 Jan 2024 18:54:06 -0800 Subject: [PATCH 208/843] [ObjCopy] Use StringRef::consume_front (NFC) --- llvm/lib/ObjCopy/CommonConfig.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/lib/ObjCopy/CommonConfig.cpp b/llvm/lib/ObjCopy/CommonConfig.cpp index f44e70d996b2e..ff3cb4d44c126 100644 --- a/llvm/lib/ObjCopy/CommonConfig.cpp +++ b/llvm/lib/ObjCopy/CommonConfig.cpp @@ -20,11 +20,7 @@ NameOrPattern::create(StringRef Pattern, MatchStyle MS, return NameOrPattern(Pattern); case MatchStyle::Wildcard: { SmallVector Data; - bool IsPositiveMatch = true; - if (Pattern[0] == '!') { - IsPositiveMatch = false; - Pattern = Pattern.drop_front(); - } + bool IsPositiveMatch = !Pattern.consume_front("!"); Expected GlobOrErr = GlobPattern::create(Pattern); // If we couldn't create it as a glob, report the error, but try again From fbd00a47754c1279145b86dffb347d30493f4445 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 19 Jan 2024 18:54:07 -0800 Subject: [PATCH 209/843] [Analysis] Use llvm::children and llvm::inverse_children (NFC) --- llvm/include/llvm/Analysis/RegionInfoImpl.h | 33 +++++++-------------- 1 file changed, 10 insertions(+), 23 deletions(-) diff --git a/llvm/include/llvm/Analysis/RegionInfoImpl.h b/llvm/include/llvm/Analysis/RegionInfoImpl.h index ec79b35ae324a..c5e8821858fd2 100644 --- a/llvm/include/llvm/Analysis/RegionInfoImpl.h +++ b/llvm/include/llvm/Analysis/RegionInfoImpl.h @@ -163,9 +163,7 @@ typename RegionBase::BlockT *RegionBase::getEnteringBlock() const { assert(!AllowRepeats && "Unexpected parameter value."); return DT->getNode(Pred) && !contains(Pred) ? 
Pred : nullptr; }; - BlockT *entry = getEntry(); - return find_singleton(make_range(InvBlockTraits::child_begin(entry), - InvBlockTraits::child_end(entry)), + return find_singleton(llvm::inverse_children(getEntry()), isEnteringBlock); } @@ -177,10 +175,7 @@ bool RegionBase::getExitingBlocks( if (!exit) return CoverAll; - for (PredIterTy PI = InvBlockTraits::child_begin(exit), - PE = InvBlockTraits::child_end(exit); - PI != PE; ++PI) { - BlockT *Pred = *PI; + for (BlockT *Pred : llvm::inverse_children(exit)) { if (contains(Pred)) { Exitings.push_back(Pred); continue; @@ -202,8 +197,7 @@ typename RegionBase::BlockT *RegionBase::getExitingBlock() const { assert(!AllowRepeats && "Unexpected parameter value."); return contains(Pred) ? Pred : nullptr; }; - return find_singleton(make_range(InvBlockTraits::child_begin(exit), - InvBlockTraits::child_end(exit)), + return find_singleton(llvm::inverse_children(exit), isContained); } @@ -244,16 +238,14 @@ void RegionBase::verifyBBInRegion(BlockT *BB) const { BlockT *entry = getEntry(), *exit = getExit(); - for (BlockT *Succ : - make_range(BlockTraits::child_begin(BB), BlockTraits::child_end(BB))) { + for (BlockT *Succ : llvm::children(BB)) { if (!contains(Succ) && exit != Succ) report_fatal_error("Broken region found: edges leaving the region must go " "to the exit node!"); } if (entry != BB) { - for (BlockT *Pred : make_range(InvBlockTraits::child_begin(BB), - InvBlockTraits::child_end(BB))) { + for (BlockT *Pred : llvm::inverse_children(BB)) { // Allow predecessors that are unreachable, as these are ignored during // region analysis. if (!contains(Pred) && DT->isReachableFromEntry(Pred)) @@ -271,8 +263,7 @@ void RegionBase::verifyWalk(BlockT *BB, std::set *visited) const { verifyBBInRegion(BB); - for (BlockT *Succ : - make_range(BlockTraits::child_begin(BB), BlockTraits::child_end(BB))) { + for (BlockT *Succ : llvm::children(BB)) { if (Succ != exit && visited->find(Succ) == visited->end()) verifyWalk(Succ, visited); } @@ -453,8 +444,7 @@ typename Tr::RegionT *RegionBase::getExpandedRegion() const { RegionT *R = RI->getRegionFor(exit); if (R->getEntry() != exit) { - for (BlockT *Pred : make_range(InvBlockTraits::child_begin(getExit()), - InvBlockTraits::child_end(getExit()))) + for (BlockT *Pred : llvm::inverse_children(getExit())) if (!contains(Pred)) return nullptr; if (Tr::getNumSuccessors(exit) == 1) @@ -465,8 +455,7 @@ typename Tr::RegionT *RegionBase::getExpandedRegion() const { while (R->getParent() && R->getParent()->getEntry() == exit) R = R->getParent(); - for (BlockT *Pred : make_range(InvBlockTraits::child_begin(getExit()), - InvBlockTraits::child_end(getExit()))) { + for (BlockT *Pred : llvm::inverse_children(getExit())) { if (!(contains(Pred) || R->contains(Pred))) return nullptr; } @@ -553,8 +542,7 @@ void RegionInfoBase::verifyBBMap(const RegionT *R) const { template bool RegionInfoBase::isCommonDomFrontier(BlockT *BB, BlockT *entry, BlockT *exit) const { - for (BlockT *P : make_range(InvBlockTraits::child_begin(BB), - InvBlockTraits::child_end(BB))) { + for (BlockT *P : llvm::inverse_children(BB)) { if (DT->dominates(entry, P) && !DT->dominates(exit, P)) return false; } @@ -837,8 +825,7 @@ RegionInfoBase::getMaxRegionExit(BlockT *BB) const { ExitR->getParent()->getEntry() == Exit) ExitR = ExitR->getParent(); - for (BlockT *Pred : make_range(InvBlockTraits::child_begin(Exit), - InvBlockTraits::child_end(Exit))) { + for (BlockT *Pred : llvm::inverse_children(Exit)) { if (!R->contains(Pred) && !ExitR->contains(Pred)) break; } From 
c0396e15c348b17b6de7005ee057160812e29ad8 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 19 Jan 2024 18:54:09 -0800 Subject: [PATCH 210/843] [AArch64] Use StringRef::contains_insensitive (NFC) --- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 4198ac6c1c039..e9d96f3b838d4 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3858,7 +3858,7 @@ bool AArch64AsmParser::parseSysAlias(StringRef Name, SMLoc NameLoc, Lex(); // Eat operand. - bool ExpectRegister = (Op.lower().find("all") == StringRef::npos); + bool ExpectRegister = !Op.contains_insensitive("all"); bool HasRegister = false; // Check for the optional register operand. From b7a66d0faeb1d2838e89c3632ba7835e6c2af6cc Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 19 Jan 2024 18:54:11 -0800 Subject: [PATCH 211/843] [llvm] Use SmallString::operator std::string (NFC) --- llvm/include/llvm/ADT/APFixedPoint.h | 2 +- llvm/include/llvm/ADT/StringExtras.h | 2 +- llvm/include/llvm/IR/ModuleSummaryIndex.h | 2 +- llvm/include/llvm/Object/MachO.h | 2 +- llvm/include/llvm/TableGen/StringToOffsetTable.h | 2 +- llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp | 2 +- llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp | 2 +- llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp | 2 +- llvm/lib/DebugInfo/Symbolize/Symbolize.cpp | 2 +- llvm/lib/ExecutionEngine/ExecutionEngine.cpp | 2 +- .../lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp | 2 +- llvm/lib/LTO/LTO.cpp | 2 +- llvm/lib/LTO/ThinLTOCodeGenerator.cpp | 6 +++--- llvm/lib/LineEditor/LineEditor.cpp | 2 +- llvm/lib/Object/Archive.cpp | 2 +- llvm/lib/ProfileData/GCOV.cpp | 2 +- llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp | 2 +- llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp | 4 ++-- llvm/lib/Transforms/Utils/NameAnonGlobals.cpp | 2 +- llvm/lib/WindowsDriver/MSVCPaths.cpp | 6 +++--- 20 files changed, 25 insertions(+), 25 deletions(-) diff --git a/llvm/include/llvm/ADT/APFixedPoint.h b/llvm/include/llvm/ADT/APFixedPoint.h index 5442968d20e4a..b0c510865f444 100644 --- a/llvm/include/llvm/ADT/APFixedPoint.h +++ b/llvm/include/llvm/ADT/APFixedPoint.h @@ -235,7 +235,7 @@ class APFixedPoint { std::string toString() const { SmallString<40> S; toString(S); - return std::string(S.str()); + return std::string(S); } void print(raw_ostream &) const; diff --git a/llvm/include/llvm/ADT/StringExtras.h b/llvm/include/llvm/ADT/StringExtras.h index c6fb8b6528089..a24368924bc90 100644 --- a/llvm/include/llvm/ADT/StringExtras.h +++ b/llvm/include/llvm/ADT/StringExtras.h @@ -332,7 +332,7 @@ inline std::string toString(const APInt &I, unsigned Radix, bool Signed, bool formatAsCLiteral = false) { SmallString<40> S; I.toString(S, Radix, Signed, formatAsCLiteral); - return std::string(S.str()); + return std::string(S); } inline std::string toString(const APSInt &I, unsigned Radix) { diff --git a/llvm/include/llvm/IR/ModuleSummaryIndex.h b/llvm/include/llvm/IR/ModuleSummaryIndex.h index 66c7d10d823d9..395c9e332eb8f 100644 --- a/llvm/include/llvm/IR/ModuleSummaryIndex.h +++ b/llvm/include/llvm/IR/ModuleSummaryIndex.h @@ -1715,7 +1715,7 @@ class ModuleSummaryIndex { SmallString<256> NewName(Name); NewName += ".llvm."; NewName += Suffix; - return std::string(NewName.str()); + return std::string(NewName); } /// Helper to obtain the unpromoted name 
for a global value (or the original diff --git a/llvm/include/llvm/Object/MachO.h b/llvm/include/llvm/Object/MachO.h index f91f21d837ce7..24f9954584ed5 100644 --- a/llvm/include/llvm/Object/MachO.h +++ b/llvm/include/llvm/Object/MachO.h @@ -826,7 +826,7 @@ class MachOObjectFile : public ObjectFile { Version = utostr(major) + "." + utostr(minor); if (update != 0) Version += "." + utostr(update); - return std::string(std::string(Version.str())); + return std::string(std::string(Version)); } /// If the input path is a .dSYM bundle (as created by the dsymutil tool), diff --git a/llvm/include/llvm/TableGen/StringToOffsetTable.h b/llvm/include/llvm/TableGen/StringToOffsetTable.h index 7fcf20abed616..66bcc81c94b59 100644 --- a/llvm/include/llvm/TableGen/StringToOffsetTable.h +++ b/llvm/include/llvm/TableGen/StringToOffsetTable.h @@ -45,7 +45,7 @@ class StringToOffsetTable { // Escape the string. SmallString<256> Str; raw_svector_ostream(Str).write_escaped(AggregateString); - AggregateString = std::string(Str.str()); + AggregateString = std::string(Str); O << " \""; unsigned CharsPrinted = 0; diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp index 6c85f9ac24bec..bcb8b94c1fa5a 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp @@ -218,7 +218,7 @@ static void analyzeImportedModule( ReportWarning(Twine("Conflicting parseable interfaces for Swift Module ") + *Name + ": " + Entry + " and " + Path, DIE); - Entry = std::string(ResolvedPath.str()); + Entry = std::string(ResolvedPath); } /// The distinct types of work performed by the work loop in diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp index 212264714c2a5..fe2748f79c06d 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp @@ -294,7 +294,7 @@ void CompileUnit::analyzeImportedModule(const DWARFDebugInfoEntry *DieEntry) { ": " + Entry + " and " + Path + ".", &Die); } - Entry = std::string(ResolvedPath.str()); + Entry = std::string(ResolvedPath); } } diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp index 78792cf838916..28f05644a3aa1 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp @@ -1456,7 +1456,7 @@ bool DWARFDebugLine::Prologue::getFileNameByIndex( // sys::path::append skips empty strings. 
sys::path::append(FilePath, Style, IncludeDir, FileName); - Result = std::string(FilePath.str()); + Result = std::string(FilePath); return true; } diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp index 37806be448ae1..5f29226c14b70 100644 --- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp +++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp @@ -303,7 +303,7 @@ std::string getDarwinDWARFResourceForPath(const std::string &Path, } sys::path::append(ResourceName, "Contents", "Resources", "DWARF"); sys::path::append(ResourceName, Basename); - return std::string(ResourceName.str()); + return std::string(ResourceName); } bool checkFileCRC(StringRef Path, uint32_t CRCHash) { diff --git a/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/llvm/lib/ExecutionEngine/ExecutionEngine.cpp index 7abb9e9a6786b..31c290ad50530 100644 --- a/llvm/lib/ExecutionEngine/ExecutionEngine.cpp +++ b/llvm/lib/ExecutionEngine/ExecutionEngine.cpp @@ -197,7 +197,7 @@ std::string ExecutionEngine::getMangledName(const GlobalValue *GV) { : GV->getParent()->getDataLayout(); Mangler::getNameWithPrefix(FullName, GV->getName(), DL); - return std::string(FullName.str()); + return std::string(FullName); } void ExecutionEngine::addGlobalMapping(const GlobalValue *GV, void *Addr) { diff --git a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp index 5e0623102d335..f7852b0ca62e5 100644 --- a/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp +++ b/llvm/lib/ExecutionEngine/Orc/TargetProcess/JITLoaderPerf.cpp @@ -323,7 +323,7 @@ static Error InitDebuggingDir(PerfState &State) { return make_error(std::move(ErrStr), inconvertibleErrorCode()); } - State.JitPath = std::string(UniqueDebugDir.str()); + State.JitPath = std::string(UniqueDebugDir); return Error::success(); } diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 6a1e53b96998c..b38c568d10cd0 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1573,7 +1573,7 @@ std::string lto::getThinLTOOutputFile(StringRef Path, StringRef OldPrefix, llvm::errs() << "warning: could not create directory '" << ParentPath << "': " << EC.message() << '\n'; } - return std::string(NewPath.str()); + return std::string(NewPath); } namespace { diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 443439b71e756..535faf5f78047 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -947,11 +947,11 @@ ThinLTOCodeGenerator::writeGeneratedObject(int count, StringRef CacheEntryPath, // Cache is enabled, hard-link the entry (or copy if hard-link fails). auto Err = sys::fs::create_hard_link(CacheEntryPath, OutputPath); if (!Err) - return std::string(OutputPath.str()); + return std::string(OutputPath); // Hard linking failed, try to copy. Err = sys::fs::copy_file(CacheEntryPath, OutputPath); if (!Err) - return std::string(OutputPath.str()); + return std::string(OutputPath); // Copy failed (could be because the CacheEntry was removed from the cache // in the meantime by another process), fall back and try to write down the // buffer to the output. 
@@ -964,7 +964,7 @@ ThinLTOCodeGenerator::writeGeneratedObject(int count, StringRef CacheEntryPath, if (Err) report_fatal_error(Twine("Can't open output '") + OutputPath + "'\n"); OS << OutputBuffer.getBuffer(); - return std::string(OutputPath.str()); + return std::string(OutputPath); } // Main entry point for the ThinLTO processing diff --git a/llvm/lib/LineEditor/LineEditor.cpp b/llvm/lib/LineEditor/LineEditor.cpp index bb408411a330e..d0d138bb1f9d8 100644 --- a/llvm/lib/LineEditor/LineEditor.cpp +++ b/llvm/lib/LineEditor/LineEditor.cpp @@ -25,7 +25,7 @@ std::string LineEditor::getDefaultHistoryPath(StringRef ProgName) { SmallString<32> Path; if (sys::path::home_directory(Path)) { sys::path::append(Path, "." + ProgName + "-history"); - return std::string(Path.str()); + return std::string(Path); } return std::string(); } diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp index 4ac4d727afb69..e447e5b23316f 100644 --- a/llvm/lib/Object/Archive.cpp +++ b/llvm/lib/Object/Archive.cpp @@ -567,7 +567,7 @@ Expected Archive::Child::getFullName() const { SmallString<128> FullName = sys::path::parent_path( Parent->getMemoryBufferRef().getBufferIdentifier()); sys::path::append(FullName, Name); - return std::string(FullName.str()); + return std::string(FullName); } Expected Archive::Child::getBuffer() const { diff --git a/llvm/lib/ProfileData/GCOV.cpp b/llvm/lib/ProfileData/GCOV.cpp index f7bf42e5c4d25..fcfeb5b0f5844 100644 --- a/llvm/lib/ProfileData/GCOV.cpp +++ b/llvm/lib/ProfileData/GCOV.cpp @@ -666,7 +666,7 @@ static std::string mangleCoveragePath(StringRef Filename, bool PreservePaths) { if (S < I) Result.append(S, I); - return std::string(Result.str()); + return std::string(Result); } std::string Context::getCoveragePath(StringRef filename, diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp index 1c140edb07ac7..5d7ec0fb03098 100644 --- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp +++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -67,7 +67,7 @@ class LibOptTable : public opt::GenericOptTable { static std::string getDefaultOutputPath(const NewArchiveMember &FirstMember) { SmallString<128> Val = StringRef(FirstMember.Buf->getBufferIdentifier()); sys::path::replace_extension(Val, ".lib"); - return std::string(Val.str()); + return std::string(Val); } static std::vector getSearchPaths(opt::InputArgList *Args, diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp index 1ff0a34bae243..c7f6f2a43c17f 100644 --- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp @@ -535,7 +535,7 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU, SmallString<128> Filename = GCovFile->getString(); sys::path::replace_extension(Filename, Notes ? 
"gcno" : "gcda"); - return std::string(Filename.str()); + return std::string(Filename); } } @@ -546,7 +546,7 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU, if (sys::fs::current_path(CurPath)) return std::string(FName); sys::path::append(CurPath, FName); - return std::string(CurPath.str()); + return std::string(CurPath); } bool GCOVProfiler::runOnModule( diff --git a/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp b/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp index f41a14cdfbec5..9655cb9cf6f40 100644 --- a/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp +++ b/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp @@ -54,7 +54,7 @@ class ModuleHasher { Hasher.final(Hash); SmallString<32> Result; MD5::stringifyResult(Hash, Result); - TheHash = std::string(Result.str()); + TheHash = std::string(Result); return TheHash; } }; diff --git a/llvm/lib/WindowsDriver/MSVCPaths.cpp b/llvm/lib/WindowsDriver/MSVCPaths.cpp index b69ee11c1931b..634cfcb15f1d9 100644 --- a/llvm/lib/WindowsDriver/MSVCPaths.cpp +++ b/llvm/lib/WindowsDriver/MSVCPaths.cpp @@ -328,7 +328,7 @@ bool appendArchToWindowsSDKLibPath(int SDKMajor, SmallString<128> LibPath, } } - path = std::string(LibPath.str()); + path = std::string(LibPath); return true; } @@ -383,7 +383,7 @@ std::string getSubDirectoryPath(SubDirectoryType Type, ToolsetLayout VSLayout, sys::path::append(Path, "lib", SubdirName); break; } - return std::string(Path.str()); + return std::string(Path); } bool useUniversalCRT(ToolsetLayout VSLayout, const std::string &VCToolChainPath, @@ -720,7 +720,7 @@ bool findVCToolChainViaRegistry(std::string &Path, ToolsetLayout &VSLayout) { SmallString<256> VCPath(StringRef(VSInstallPath.c_str(), pos)); sys::path::append(VCPath, "VC"); - Path = std::string(VCPath.str()); + Path = std::string(VCPath); VSLayout = ToolsetLayout::OlderVS; return true; } From 02232307ce18c095ef0bf26b5cef23e4efbc1e4b Mon Sep 17 00:00:00 2001 From: Ben Shi <2283975856@qq.com> Date: Sat, 20 Jan 2024 11:02:40 +0800 Subject: [PATCH 212/843] [clang][analyzer] Improve modeling of 'fdopen' in StdLibraryFunctionsChecker (#78680) --- .../Checkers/StdLibraryFunctionsChecker.cpp | 19 ++++++++++--------- .../Analysis/std-c-library-functions-POSIX.c | 2 +- clang/test/Analysis/stream-errno.c | 10 ++++++++++ clang/test/Analysis/stream-note.c | 1 + 4 files changed, 22 insertions(+), 10 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index 83f972c7072ec..d0eb5091444f6 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -2171,6 +2171,16 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( .ArgConstraint(NotNull(ArgNo(0))) .ArgConstraint(NotNull(ArgNo(1)))); + // FILE *fdopen(int fd, const char *mode); + addToFunctionSummaryMap( + "fdopen", + Signature(ArgTypes{IntTy, ConstCharPtrTy}, RetType{FilePtrTy}), + Summary(NoEvalCall) + .Case({NotNull(Ret)}, ErrnoMustNotBeChecked, GenericSuccessMsg) + .Case({IsNull(Ret)}, ErrnoNEZeroIrrelevant, GenericFailureMsg) + .ArgConstraint(ArgumentCondition(0, WithinRange, Range(0, IntMax))) + .ArgConstraint(NotNull(ArgNo(1)))); + // FILE *tmpfile(void); addToFunctionSummaryMap( "tmpfile", Signature(ArgTypes{}, RetType{FilePtrTy}), @@ -2853,15 +2863,6 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( "pathconf", Signature(ArgTypes{ConstCharPtrTy, IntTy}, RetType{LongTy}), 
Summary(NoEvalCall).ArgConstraint(NotNull(ArgNo(0)))); - // FILE *fdopen(int fd, const char *mode); - // FIXME: Improve for errno modeling. - addToFunctionSummaryMap( - "fdopen", - Signature(ArgTypes{IntTy, ConstCharPtrTy}, RetType{FilePtrTy}), - Summary(NoEvalCall) - .ArgConstraint(ArgumentCondition(0, WithinRange, Range(0, IntMax))) - .ArgConstraint(NotNull(ArgNo(1)))); - // void rewinddir(DIR *dir); addToFunctionSummaryMap( "rewinddir", Signature(ArgTypes{DirPtrTy}, RetType{VoidTy}), diff --git a/clang/test/Analysis/std-c-library-functions-POSIX.c b/clang/test/Analysis/std-c-library-functions-POSIX.c index 8a26e0b0f2728..51b136d9ba356 100644 --- a/clang/test/Analysis/std-c-library-functions-POSIX.c +++ b/clang/test/Analysis/std-c-library-functions-POSIX.c @@ -17,6 +17,7 @@ // RUN: -triple i686-unknown-linux 2>&1 | FileCheck %s // CHECK: Loaded summary for: FILE *fopen(const char *restrict pathname, const char *restrict mode) +// CHECK: Loaded summary for: FILE *fdopen(int fd, const char *mode) // CHECK: Loaded summary for: FILE *tmpfile(void) // CHECK: Loaded summary for: FILE *freopen(const char *restrict pathname, const char *restrict mode, FILE *restrict stream) // CHECK: Loaded summary for: int fclose(FILE *stream) @@ -78,7 +79,6 @@ // CHECK: Loaded summary for: int close(int fildes) // CHECK: Loaded summary for: long fpathconf(int fildes, int name) // CHECK: Loaded summary for: long pathconf(const char *path, int name) -// CHECK: Loaded summary for: FILE *fdopen(int fd, const char *mode) // CHECK: Loaded summary for: void rewinddir(DIR *dir) // CHECK: Loaded summary for: void seekdir(DIR *dirp, long loc) // CHECK: Loaded summary for: int rand_r(unsigned int *seedp) diff --git a/clang/test/Analysis/stream-errno.c b/clang/test/Analysis/stream-errno.c index 13981e9094d1c..fab6a58b3275a 100644 --- a/clang/test/Analysis/stream-errno.c +++ b/clang/test/Analysis/stream-errno.c @@ -18,6 +18,16 @@ void check_fopen(void) { if (errno) {} // expected-warning{{An undefined value may be read from 'errno' [unix.Errno]}} } +void check_fdopen(int Fd) { + FILE *F = fdopen(Fd, "r"); + if (!F) { + clang_analyzer_eval(errno != 0); // expected-warning{{TRUE}} + if (errno) {} // no-warning + } else { + if (errno) {} // expected-warning{{An undefined value may be read from 'errno' [unix.Errno]}} + } +} + void check_tmpfile(void) { FILE *F = tmpfile(); if (!F) { diff --git a/clang/test/Analysis/stream-note.c b/clang/test/Analysis/stream-note.c index e412015eb6839..abb4784c078aa 100644 --- a/clang/test/Analysis/stream-note.c +++ b/clang/test/Analysis/stream-note.c @@ -56,6 +56,7 @@ void check_note_freopen(void) { void check_note_fdopen(int fd) { FILE *F = fdopen(fd, "r"); // expected-note {{Stream opened here}} + // stdargs-note@-1 {{'fdopen' is successful}} if (!F) // expected-note@-1 {{'F' is non-null}} // expected-note@-2 {{Taking false branch}} From b3ea9b398fe656fab5d78d6b2c58bba975badc07 Mon Sep 17 00:00:00 2001 From: antangelo Date: Fri, 19 Jan 2024 23:06:44 -0500 Subject: [PATCH 213/843] Reland "[clang] Fix CTAD for aggregates for nested template classes" (#78670) Reland of #78387 Use the template pattern in determining whether to synthesize the aggregate deduction guide, and update DeclareImplicitDeductionGuideFromInitList to substitute outer template arguments. The tests in the original patch made an assumption about the size of a pointer type, and this led to them failing on targets with 32-bit pointers. The tests have been updated to not depend on the size of any type. 
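For illustration, the shape of code this reland accepts is roughly the following (a minimal sketch with hypothetical names Outer/Inner, in the spirit of the updated tests rather than copied from them):

    template <class X> struct Outer {
      template <class Y> struct Inner { X x; Y y; }; // aggregate member template
    };
    Outer<int>::Inner in{1, 2.0}; // aggregate CTAD now deduces Outer<int>::Inner<double>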
This only requires updates to the test file, no functionality has otherwise changed between this and the original patch. --- clang/docs/ReleaseNotes.rst | 3 ++ clang/lib/Sema/SemaInit.cpp | 9 ++++- clang/lib/Sema/SemaTemplate.cpp | 21 +++++++++--- .../nested-implicit-deduction-guides.cpp | 34 ++++++++++++++++--- 4 files changed, 58 insertions(+), 9 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 585e1535b1585..08e79984fe336 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1001,6 +1001,9 @@ Bug Fixes to C++ Support - Clang now allows parenthesized initialization of arrays in `operator new[]`. Fixes: (`#68198 `_) +- Fixes CTAD for aggregates on nested template classes. Fixes: + (`#77599 `_) + - Fix crash when importing the same module with an dynamic initializer twice in different visibility. Fixes (`#67893 `_) diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 3170f41e8033f..91e4cb7b68a24 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -10751,7 +10751,14 @@ QualType Sema::DeduceTemplateSpecializationFromInitializer( bool HasAnyDeductionGuide = false; auto SynthesizeAggrGuide = [&](InitListExpr *ListInit) { - auto *RD = cast(Template->getTemplatedDecl()); + auto *Pattern = Template; + while (Pattern->getInstantiatedFromMemberTemplate()) { + if (Pattern->isMemberSpecialization()) + break; + Pattern = Pattern->getInstantiatedFromMemberTemplate(); + } + + auto *RD = cast(Pattern->getTemplatedDecl()); if (!(RD->getDefinition() && RD->isAggregate())) return; QualType Ty = Context.getRecordType(RD); diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 0655d36335206..839d508b911f0 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -2418,6 +2418,9 @@ struct ConvertConstructorToDeductionGuideTransform { QualType Result = SemaRef.BuildFunctionType(DeducedType, ParamTypes, Loc, DeductionGuideName, EPI); TypeSourceInfo *TSI = SemaRef.Context.getTrivialTypeSourceInfo(Result, Loc); + if (NestedPattern) + TSI = SemaRef.SubstType(TSI, OuterInstantiationArgs, Loc, + DeductionGuideName); FunctionProtoTypeLoc FPTL = TSI->getTypeLoc().castAs(); @@ -2425,9 +2428,13 @@ struct ConvertConstructorToDeductionGuideTransform { // Build the parameters, needed during deduction / substitution. SmallVector Params; for (auto T : ParamTypes) { - ParmVarDecl *NewParam = ParmVarDecl::Create( - SemaRef.Context, DC, Loc, Loc, nullptr, T, - SemaRef.Context.getTrivialTypeSourceInfo(T, Loc), SC_None, nullptr); + auto *TSI = SemaRef.Context.getTrivialTypeSourceInfo(T, Loc); + if (NestedPattern) + TSI = SemaRef.SubstType(TSI, OuterInstantiationArgs, Loc, + DeclarationName()); + ParmVarDecl *NewParam = + ParmVarDecl::Create(SemaRef.Context, DC, Loc, Loc, nullptr, + TSI->getType(), TSI, SC_None, nullptr); NewParam->setScopeInfo(0, Params.size()); FPTL.setParam(Params.size(), NewParam); Params.push_back(NewParam); @@ -2670,8 +2677,14 @@ FunctionTemplateDecl *Sema::DeclareImplicitDeductionGuideFromInitList( if (BuildingDeductionGuides.isInvalid()) return nullptr; - return cast( + ClassTemplateDecl *Pattern = + Transform.NestedPattern ? 
Transform.NestedPattern : Transform.Template; + ContextRAII SavedContext(*this, Pattern->getTemplatedDecl()); + + auto *DG = cast( Transform.buildSimpleDeductionGuide(ParamTypes)); + SavedContext.pop(); + return DG; } void Sema::DeclareImplicitDeductionGuides(TemplateDecl *Template, diff --git a/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp b/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp index c44ec6918c7af..38b6706595a11 100644 --- a/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp +++ b/clang/test/SemaTemplate/nested-implicit-deduction-guides.cpp @@ -1,5 +1,4 @@ // RUN: %clang_cc1 -std=c++20 -verify %s -// expected-no-diagnostics template struct S { template struct N { @@ -36,11 +35,14 @@ using NonTypeParam = decltype(ntp); using NonTypeParam = non_type_param::B; template -concept C = (sizeof(T) == sizeof(A)); +concept True = true; + +template +concept False = false; template struct concepts { template struct B { - template Z> B(Y, Z); + template Z> B(Y, Z); }; }; @@ -50,7 +52,7 @@ using Concepts = concepts::B; template struct requires_clause { template struct B { - template requires (sizeof(Z) == sizeof(X)) + template requires true B(Y, Z); }; }; @@ -58,3 +60,27 @@ template struct requires_clause { requires_clause::B req(1, 2); using RC = decltype(req); using RC = requires_clause::B; + +template struct nested_init_list { + template Y> + struct B { + X x; + Y y; + }; + + template + struct concept_fail { // #INIT_LIST_INNER_INVALID + X x; + F f; + }; +}; + +nested_init_list::B nil {1, 2}; +using NIL = decltype(nil); +using NIL = nested_init_list::B; + +// expected-error@+1 {{no viable constructor or deduction guide for deduction of template arguments of 'concept_fail'}} +nested_init_list::concept_fail nil_invalid{1, ""}; +// expected-note@#INIT_LIST_INNER_INVALID {{candidate template ignored: substitution failure [with F = const char *]: constraints not satisfied for class template 'concept_fail' [with F = const char *]}} +// expected-note@#INIT_LIST_INNER_INVALID {{candidate function template not viable: requires 1 argument, but 2 were provided}} +// expected-note@#INIT_LIST_INNER_INVALID {{candidate function template not viable: requires 0 arguments, but 2 were provided}} From ed276dff464a91587e5b2a24a99a7b00f7b6bf7e Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Sat, 20 Jan 2024 06:07:19 +0200 Subject: [PATCH 214/843] [libc++][memory] P2868R1: Removing deprecated typedef `std::allocator::is_always_equal` (#78562) Implements: - https://wg21.link/P2868R1 - https://wg21.link/LWG3170 --------- Co-authored-by: Zingam --- libcxx/.clang-format | 1 + libcxx/docs/ReleaseNotes/18.rst | 4 +++ libcxx/docs/Status/Cxx23Issues.csv | 2 +- libcxx/docs/Status/Cxx2cPapers.csv | 2 +- libcxx/docs/UsingLibcxx.rst | 3 ++ libcxx/include/__memory/allocator.h | 8 +++-- libcxx/include/memory | 2 +- ...cator_types.deprecated_in_cxx23.verify.cpp | 33 ++++++++++++++++++ .../allocator_types.pass.cpp | 2 ++ ...llocator_types.removed_in_cxx26.verify.cpp | 34 +++++++++++++++++++ 10 files changed, 86 insertions(+), 5 deletions(-) create mode 100644 libcxx/test/std/utilities/memory/default.allocator/allocator_types.deprecated_in_cxx23.verify.cpp create mode 100644 libcxx/test/std/utilities/memory/default.allocator/allocator_types.removed_in_cxx26.verify.cpp diff --git a/libcxx/.clang-format b/libcxx/.clang-format index 56bdf2b5f9116..39ae1322ffa8a 100644 --- a/libcxx/.clang-format +++ b/libcxx/.clang-format @@ -28,6 +28,7 @@ AttributeMacros: [ '_LIBCPP_DEPRECATED_IN_CXX14', 
'_LIBCPP_DEPRECATED_IN_CXX17', '_LIBCPP_DEPRECATED_IN_CXX20', + '_LIBCPP_DEPRECATED_IN_CXX23', '_LIBCPP_DEPRECATED', '_LIBCPP_DISABLE_EXTENTSION_WARNING', '_LIBCPP_EXCLUDE_FROM_EXPLICIT_INSTANTIATION', diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index 37f2a77ca4f85..87eae54307712 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -59,6 +59,7 @@ Implemented Papers - P2821R5 - ``span.at()`` - P0521R0 - Proposed Resolution for CA 14 (``shared_ptr`` ``use_count/unique``) - P1759R6 - Native handles and file streams +- P2868R3 - Remove Deprecated ``std::allocator`` Typedef From C++26 - P2517R1 - Add a conditional ``noexcept`` specification to ``std::apply`` @@ -86,6 +87,9 @@ Improvements and New Features - The ``_LIBCPP_ENABLE_CXX26_REMOVED_STRING_RESERVE`` macro has been added to make the function ``std::basic_string<...>::reserve()`` available. +- The ``_LIBCPP_ENABLE_CXX26_REMOVED_ALLOCATOR_MEMBERS`` macro has been added to make + the function ``allocator::is_always_equal`` available. + - The ``_LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE`` macro has been added to make the function ``std::shared_ptr<...>::unique()`` available. diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv index b24ecc5525a14..70480b3382058 100644 --- a/libcxx/docs/Status/Cxx23Issues.csv +++ b/libcxx/docs/Status/Cxx23Issues.csv @@ -15,7 +15,7 @@ "`2743 `__","P0083R3 ``node_handle`` private members missing ""exposition only"" comment","November 2020","|Nothing To Do|","" "`2820 `__","Clarify ```` macros","November 2020","|Nothing To Do|","" "`3120 `__","Unclear behavior of ``monotonic_buffer_resource::release()``","November 2020","","" -"`3170 `__","``is_always_equal`` added to ``std::allocator`` makes the standard library treat derived types as always equal","November 2020","","" +"`3170 `__","``is_always_equal`` added to ``std::allocator`` makes the standard library treat derived types as always equal","November 2020","|Complete|","18.0" "`3036 `__","``polymorphic_allocator::destroy`` is extraneous","November 2020","","" "`3171 `__","LWG2989 breaks ``directory_entry`` stream insertion","November 2020","|Complete|","14.0" "`3306 `__","``ranges::advance`` violates its preconditions","November 2020","|Complete|","14.0","|ranges|" diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 5701717f39766..762cbc6d487c6 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -36,7 +36,7 @@ "`P0952R2 `__","LWG","A new specification for ``std::generate_canonical``","Kona November 2023","","","" "`P2447R6 `__","LWG","``std::span`` over an initializer list","Kona November 2023","","","" "`P2821R5 `__","LWG","``span.at()``","Kona November 2023","|Complete|","18.0","" -"`P2868R3 `__","LWG","Remove Deprecated ``std::allocator`` Typedef From C++26","Kona November 2023","","","" +"`P2868R3 `__","LWG","Remove Deprecated ``std::allocator`` Typedef From C++26","Kona November 2023","|Complete|","18.0","" "`P2870R3 `__","LWG","Remove ``basic_string::reserve()`` From C++26","Kona November 2023","|Complete|","18.0","" "`P2871R3 `__","LWG","Remove Deprecated Unicode Conversion Facets from C++26","Kona November 2023","|Complete|","18.0","" "`P2819R2 `__","LWG","Add tuple protocol to complex","Kona November 2023","","","" diff --git a/libcxx/docs/UsingLibcxx.rst b/libcxx/docs/UsingLibcxx.rst index daf39c9faa449..3b1be286c1698 100644 --- a/libcxx/docs/UsingLibcxx.rst +++ 
b/libcxx/docs/UsingLibcxx.rst @@ -283,6 +283,9 @@ C++26 Specific Configuration Macros This macro is used to re-enable the function ``std::basic_string<...>::reserve()``. +**_LIBCPP_ENABLE_CXX26_REMOVED_ALLOCATOR_MEMBERS**: + This macro is used to re-enable redundant member of ``allocator::is_always_equal`` + Libc++ Extensions ================= diff --git a/libcxx/include/__memory/allocator.h b/libcxx/include/__memory/allocator.h index 8d54e0f0897dd..4e6303914c38a 100644 --- a/libcxx/include/__memory/allocator.h +++ b/libcxx/include/__memory/allocator.h @@ -107,7 +107,9 @@ class _LIBCPP_TEMPLATE_VIS allocator : private __non_trivial_if::v typedef ptrdiff_t difference_type; typedef _Tp value_type; typedef true_type propagate_on_container_move_assignment; - typedef true_type is_always_equal; +#if _LIBCPP_STD_VER <= 23 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_ALLOCATOR_MEMBERS) + _LIBCPP_DEPRECATED_IN_CXX23 typedef true_type is_always_equal; +#endif _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator() _NOEXCEPT = default; @@ -185,7 +187,9 @@ class _LIBCPP_TEMPLATE_VIS allocator typedef ptrdiff_t difference_type; typedef const _Tp value_type; typedef true_type propagate_on_container_move_assignment; - typedef true_type is_always_equal; +#if _LIBCPP_STD_VER <= 23 || defined(_LIBCPP_ENABLE_CXX26_REMOVED_ALLOCATOR_MEMBERS) + _LIBCPP_DEPRECATED_IN_CXX23 typedef true_type is_always_equal; +#endif _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 allocator() _NOEXCEPT = default; diff --git a/libcxx/include/memory b/libcxx/include/memory index ee245d5fd2dcb..19c11ee949872 100644 --- a/libcxx/include/memory +++ b/libcxx/include/memory @@ -139,7 +139,7 @@ public: template struct rebind {typedef allocator other;}; // deprecated in C++17, removed in C++20 typedef true_type propagate_on_container_move_assignment; - typedef true_type is_always_equal; + typedef true_type is_always_equal; // Deprecated in C++23, removed in C++26 constexpr allocator() noexcept; // constexpr in C++20 constexpr allocator(const allocator&) noexcept; // constexpr in C++20 diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator_types.deprecated_in_cxx23.verify.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.deprecated_in_cxx23.verify.cpp new file mode 100644 index 0000000000000..ad431f94732ee --- /dev/null +++ b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.deprecated_in_cxx23.verify.cpp @@ -0,0 +1,33 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: c++23 + +// + +// template +// class allocator +// { +// ... +// typedef true_type is_always_equal; // Deprecated in C++23, removed in C++26 +// ... 
+// }; + +#include + +void test() { + { + typedef std::allocator::is_always_equal IAE; // expected-warning {{'is_always_equal' is deprecated}} + } + { + typedef std::allocator::is_always_equal IAE; // expected-warning {{'is_always_equal' is deprecated}} + } + { + typedef std::allocator::is_always_equal IAE; // expected-warning {{'is_always_equal' is deprecated}} + } +} diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator_types.pass.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.pass.cpp index 74adc6943594d..7085a1d3fc602 100644 --- a/libcxx/test/std/utilities/memory/default.allocator/allocator_types.pass.cpp +++ b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.pass.cpp @@ -46,7 +46,9 @@ void test() { static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); static_assert((std::is_same::value), ""); +#if TEST_STD_VER <= 23 static_assert((std::is_same::value), ""); +#endif #if TEST_STD_VER <= 17 static_assert((std::is_same::value), ""); diff --git a/libcxx/test/std/utilities/memory/default.allocator/allocator_types.removed_in_cxx26.verify.cpp b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.removed_in_cxx26.verify.cpp new file mode 100644 index 0000000000000..72cc21402327a --- /dev/null +++ b/libcxx/test/std/utilities/memory/default.allocator/allocator_types.removed_in_cxx26.verify.cpp @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// Check that the following nested types are removed in C++26: + +// template +// class allocator +// { +// ... +// typedef true_type is_always_equal; // Deprecated in C++23, removed in C++26 +// ... 
+// }; + +#include + +template +void check() { + using IAE = typename std::allocator::is_always_equal; // expected-error 3 {{no type named 'is_always_equal'}} +} + +void test() { + check(); + check(); + check(); +} From 552f5549de38d69116d29657132aea85f640dee1 Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Sat, 20 Jan 2024 06:08:24 +0200 Subject: [PATCH 215/843] [libc++][any] LWG3305: `any_cast` (#78215) Implements: https://wg21.link/LWG3305 - https://eel.is/c++draft/any.nonmembers --------- Co-authored-by: Zingam --- libcxx/docs/Status/Cxx2cIssues.csv | 2 +- libcxx/include/any | 3 ++ .../any.nonmembers/any.cast/void.verify.cpp | 34 +++++++++++++++++++ 3 files changed, 38 insertions(+), 1 deletion(-) create mode 100644 libcxx/test/std/utilities/any/any.nonmembers/any.cast/void.verify.cpp diff --git a/libcxx/docs/Status/Cxx2cIssues.csv b/libcxx/docs/Status/Cxx2cIssues.csv index e03b7c783bc38..b69b094832541 100644 --- a/libcxx/docs/Status/Cxx2cIssues.csv +++ b/libcxx/docs/Status/Cxx2cIssues.csv @@ -19,7 +19,7 @@ "","","","","","" "`2392 `__","""character type"" is used but not defined","Kona November 2023","","","" "`3203 `__","``span`` element access invalidation","Kona November 2023","","","" -"`3305 `__","``any_cast``","Kona November 2023","","","" +"`3305 `__","``any_cast``","Kona November 2023","|Complete|","18.0","" "`3431 `__","``<=>`` for containers should require ``three_way_comparable`` instead of ``<=>``","Kona November 2023","","","" "`3749 `__","``common_iterator`` should handle integer-class difference types","Kona November 2023","","","" "`3809 `__","Is ``std::subtract_with_carry_engine`` supposed to work","Kona November 2023","","","" diff --git a/libcxx/include/any b/libcxx/include/any index b9e0a8d94550c..378dfb6e21b53 100644 --- a/libcxx/include/any +++ b/libcxx/include/any @@ -98,6 +98,7 @@ namespace std { #include <__type_traits/is_nothrow_move_constructible.h> #include <__type_traits/is_reference.h> #include <__type_traits/is_same.h> +#include <__type_traits/is_void.h> #include <__type_traits/remove_cv.h> #include <__type_traits/remove_cvref.h> #include <__type_traits/remove_reference.h> @@ -555,6 +556,7 @@ inline _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_ANY_CAST _ValueType template inline _LIBCPP_HIDE_FROM_ABI add_pointer_t> any_cast(any const* __any) _NOEXCEPT { + static_assert(!is_void_v<_ValueType>, "_ValueType may not be void."); static_assert(!is_reference<_ValueType>::value, "_ValueType may not be a reference."); return std::any_cast<_ValueType>(const_cast(__any)); } @@ -572,6 +574,7 @@ inline _LIBCPP_HIDE_FROM_ABI _RetType __pointer_or_func_cast(void*, /*IsFunction template _LIBCPP_HIDE_FROM_ABI add_pointer_t<_ValueType> any_cast(any* __any) _NOEXCEPT { using __any_imp::_Action; + static_assert(!is_void_v<_ValueType>, "_ValueType may not be void."); static_assert(!is_reference<_ValueType>::value, "_ValueType may not be a reference."); typedef add_pointer_t<_ValueType> _ReturnType; if (__any && __any->__h_) { diff --git a/libcxx/test/std/utilities/any/any.nonmembers/any.cast/void.verify.cpp b/libcxx/test/std/utilities/any/any.nonmembers/any.cast/void.verify.cpp new file mode 100644 index 0000000000000..9530d63e07b0a --- /dev/null +++ b/libcxx/test/std/utilities/any/any.nonmembers/any.cast/void.verify.cpp @@ -0,0 +1,34 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// + +// template +// const T* any_cast(const any* operand) noexcept; + +// template +// T* any_cast(any* operand) noexcept; + +#include + +void test() { + { + const std::any ca = 1; + + // expected-error-re@any:* {{static assertion failed{{.*}}_ValueType may not be void.}} + std::any_cast(&ca); // expected-note {{requested here}} + } + { + std::any a = 1; + + // expected-error-re@any:* {{static assertion failed{{.*}}_ValueType may not be void.}} + std::any_cast(&a); // expected-note {{requested here}} + } +} From dbbeee6b8357c5a68543f612f3b2b607f1911b4c Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Sat, 20 Jan 2024 06:09:46 +0200 Subject: [PATCH 216/843] [libc++][span] P2447R4: `std::span` over an initializer list (#78157) Implements: https://wg21.link/P2447R6 - https://eel.is/c++draft/span.syn - https://eel.is/c++draft/span.overview - https://eel.is/c++draft/span.cons - https://eel.is/c++draft/diff --------- Co-authored-by: Zingam --- libcxx/docs/FeatureTestMacroTable.rst | 2 +- libcxx/docs/ReleaseNotes/18.rst | 1 + libcxx/docs/Status/Cxx2cPapers.csv | 2 +- libcxx/include/span | 16 +++ libcxx/include/version | 2 +- .../views/views.span/span.cons/array.pass.cpp | 4 + .../initializer_list.assert.pass.cpp | 32 +++++ .../span.cons/initializer_list.pass.cpp | 125 +++++++++++++++--- .../span.cons/initializer_list.verify.cpp | 48 +++++++ .../span.cons/iterator_len.verify.cpp | 19 ++- .../span.version.compile.pass.cpp | 16 +-- .../version.version.compile.pass.cpp | 16 +-- .../generate_feature_test_macro_components.py | 1 - 13 files changed, 238 insertions(+), 46 deletions(-) create mode 100644 libcxx/test/std/containers/views/views.span/span.cons/initializer_list.assert.pass.cpp create mode 100644 libcxx/test/std/containers/views/views.span/span.cons/initializer_list.verify.cpp diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 893a3b13ca06e..82e51d4e7eb53 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -438,7 +438,7 @@ Status --------------------------------------------------- ----------------- ``__cpp_lib_span_at`` ``202311L`` --------------------------------------------------- ----------------- - ``__cpp_lib_span_initializer_list`` *unimplemented* + ``__cpp_lib_span_initializer_list`` ``202311L`` --------------------------------------------------- ----------------- ``__cpp_lib_sstream_from_string_view`` *unimplemented* --------------------------------------------------- ----------------- diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index 87eae54307712..311a06e34c49d 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -61,6 +61,7 @@ Implemented Papers - P1759R6 - Native handles and file streams - P2868R3 - Remove Deprecated ``std::allocator`` Typedef From C++26 - P2517R1 - Add a conditional ``noexcept`` specification to ``std::apply`` +- P2447R6 - ``span`` over initializer list Improvements and New Features diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index 762cbc6d487c6..bdfdfdd2d53c8 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -34,7 +34,7 @@ "`P2918R2 `__","LWG","Runtime format strings II","Kona November 2023","|Complete|","18.0","|format|" "`P2909R4 
`__","LWG","Fix formatting of code units as integers (Dude, where’s my ``char``?)","Kona November 2023","|Complete|","18.0","|format| |DR|" "`P0952R2 `__","LWG","A new specification for ``std::generate_canonical``","Kona November 2023","","","" -"`P2447R6 `__","LWG","``std::span`` over an initializer list","Kona November 2023","","","" +"`P2447R6 `__","LWG","``std::span`` over an initializer list","Kona November 2023","|Complete|","18.0","" "`P2821R5 `__","LWG","``span.at()``","Kona November 2023","|Complete|","18.0","" "`P2868R3 `__","LWG","Remove Deprecated ``std::allocator`` Typedef From C++26","Kona November 2023","|Complete|","18.0","" "`P2870R3 `__","LWG","Remove ``basic_string::reserve()`` From C++26","Kona November 2023","|Complete|","18.0","" diff --git a/libcxx/include/span b/libcxx/include/span index 007a32597f965..32364b4270be9 100644 --- a/libcxx/include/span +++ b/libcxx/include/span @@ -68,6 +68,7 @@ public: constexpr span(const array& arr) noexcept; template constexpr explicit(Extent != dynamic_extent) span(R&& r); + constexpr explicit(extent != dynamic_extent) span(std::initializer_list il); // Since C++26 constexpr span(const span& other) noexcept = default; template constexpr explicit(Extent != dynamic_extent) span(const span& s) noexcept; @@ -228,6 +229,15 @@ public: requires(_Sz == 0) _LIBCPP_HIDE_FROM_ABI constexpr span() noexcept : __data_{nullptr} {} +# if _LIBCPP_STD_VER >= 26 + _LIBCPP_HIDE_FROM_ABI constexpr explicit span(std::initializer_list __il) + requires is_const_v + : __data_{__il.begin()} { + _LIBCPP_ASSERT_VALID_INPUT_RANGE( + _Extent == __il.size(), "Size mismatch in span's constructor _Extent != __il.size()."); + } +# endif + constexpr span(const span&) noexcept = default; constexpr span& operator=(const span&) noexcept = default; @@ -397,6 +407,12 @@ public: // [span.cons], span constructors, copy, assignment, and destructor _LIBCPP_HIDE_FROM_ABI constexpr span() noexcept : __data_{nullptr}, __size_{0} {} +# if _LIBCPP_STD_VER >= 26 + _LIBCPP_HIDE_FROM_ABI constexpr span(std::initializer_list __il) + requires is_const_v + : __data_{__il.begin()}, __size_{__il.size()} {} +# endif + constexpr span(const span&) noexcept = default; constexpr span& operator=(const span&) noexcept = default; diff --git a/libcxx/include/version b/libcxx/include/version index c96647894dce6..7b2f487fe7020 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -507,7 +507,7 @@ __cpp_lib_within_lifetime 202306L // # define __cpp_lib_saturation_arithmetic 202311L // # define __cpp_lib_smart_ptr_owner_equality 202306L # define __cpp_lib_span_at 202311L -// # define __cpp_lib_span_initializer_list 202311L +# define __cpp_lib_span_initializer_list 202311L // # define __cpp_lib_sstream_from_string_view 202306L // # define __cpp_lib_submdspan 202306L // # define __cpp_lib_text_encoding 202306L diff --git a/libcxx/test/std/containers/views/views.span/span.cons/array.pass.cpp b/libcxx/test/std/containers/views/views.span/span.cons/array.pass.cpp index 8fa7692c3b637..c02f42400b6e1 100644 --- a/libcxx/test/std/containers/views/views.span/span.cons/array.pass.cpp +++ b/libcxx/test/std/containers/views/views.span/span.cons/array.pass.cpp @@ -94,7 +94,11 @@ constexpr bool testSpan() assert(s4.data() == val && s4.size() == 2); std::span s5 = {{1,2}}; +#if TEST_STD_VER >= 26 + std::span s6({1, 2}); +#else std::span s6 = {{1,2}}; +#endif assert(s5.size() == 2); // and it dangles assert(s6.size() == 2); // and it dangles diff --git 
a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.assert.pass.cpp b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.assert.pass.cpp new file mode 100644 index 0000000000000..262e56c74287d --- /dev/null +++ b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.assert.pass.cpp @@ -0,0 +1,32 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// REQUIRES: has-unix-headers +// REQUIRES: libcpp-hardening-mode={{extensive|debug}} +// XFAIL: availability-verbose_abort-missing + +// + +// constexpr explicit(extent != dynamic_extent) span(std::initializer_list il); // Since C++26 + +#include +#include +#include + +#include "check_assertion.h" + +int main(int, char**) { + TEST_LIBCPP_ASSERT_FAILURE( + (std::span({1, 2, 3, 9084, 5})), "Size mismatch in span's constructor _Extent != __il.size()."); + TEST_LIBCPP_ASSERT_FAILURE((std::span(std::initializer_list{1, 2, 3, 9084, 5})), + "Size mismatch in span's constructor _Extent != __il.size()."); + + return 0; +} diff --git a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp index d0f4cc795f3e2..74a5094f61261 100644 --- a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp +++ b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.pass.cpp @@ -5,39 +5,134 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// + // UNSUPPORTED: c++03, c++11, c++14, c++17 // -#include +// constexpr explicit(extent != dynamic_extent) span(std::initializer_list il); // Since C++26 + +#include #include #include +#include +#include +#include + +#include "test_convertible.h" +#include "test_macros.h" + +#if TEST_STD_VER >= 26 + +// SFINAE + +template +concept ConstElementType = std::is_const_v; + +static_assert(ConstElementType>); +static_assert(!ConstElementType>); +static_assert(ConstElementType>); +static_assert(!ConstElementType>); + +// Constructor constraings + +template +concept HasInitializerListCtr = requires(I il) { std::span{il}; }; + +static_assert(HasInitializerListCtr, const int>); +static_assert(!HasInitializerListCtr, int>); +static_assert(HasInitializerListCtr, const int, 94>); +static_assert(!HasInitializerListCtr, int, 94>); + +// Constructor conditionally explicit + +static_assert(!test_convertible, std::initializer_list>(), + "This constructor must be explicit"); +static_assert(std::is_constructible_v, std::initializer_list>); +static_assert(test_convertible, std::initializer_list>(), + "This constructor must not be explicit"); +static_assert(std::is_constructible_v, std::initializer_list>); + +#endif struct Sink { - constexpr Sink() = default; - constexpr Sink(Sink*) {} + constexpr Sink() = default; + constexpr Sink(Sink*) {} }; -constexpr std::size_t count(std::span sp) { - return sp.size(); -} +constexpr std::size_t count(std::span sp) { return sp.size(); } -template -constexpr std::size_t countn(std::span sp) { - return sp.size(); +template +constexpr 
std::size_t count_n(std::span sp) { + return sp.size(); } constexpr bool test() { +#if TEST_STD_VER >= 26 + // Dynamic extent + { + Sink a[10]; + + assert(count({a}) == 1); + assert(count({a, a + 10}) == 2); + assert(count({a, a + 1, a + 2}) == 3); + assert(count(std::initializer_list{a[0], a[1], a[2], a[3]}) == 4); + } +#else + { Sink a[10]; + assert(count({a}) == 10); - assert(count({a, a+10}) == 10); - assert(countn<10>({a}) == 10); - return true; + assert(count({a, a + 10}) == 10); + assert(count_n<10>({a}) == 10); + } +#endif + + return true; +} + +// Test P2447R4 "Annex C examples" + +constexpr int three(std::span sp) { return sp.size(); } + +constexpr int four(std::span sp) { return sp.size(); } + +bool test_P2447R4_annex_c_examples() { + // 1. Overload resolution is affected + // --> tested in "initializer_list.verify.cpp" + + // 2. The `initializer_list` ctor has high precedence + // --> tested in "initializer_list.verify.cpp" + + // 3. Implicit two-argument construction with a highly convertible value_type +#if TEST_STD_VER >= 26 + { + void* a[10]; + assert(three({a, 0}) == 2); + } + { + std::any a[10]; + assert(four({a, a + 10}) == 2); + } +#else + { + void* a[10]; + assert(three({a, 0}) == 0); + } + { + std::any a[10]; + assert(four({a, a + 10}) == 10); + } +#endif + + return true; } int main(int, char**) { - test(); - static_assert(test()); + assert(test()); + static_assert(test()); + + assert(test_P2447R4_annex_c_examples()); - return 0; + return 0; } diff --git a/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.verify.cpp b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.verify.cpp new file mode 100644 index 0000000000000..93bcd0551cdb0 --- /dev/null +++ b/libcxx/test/std/containers/views/views.span/span.cons/initializer_list.verify.cpp @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// + +// constexpr explicit(extent != dynamic_extent) span(std::initializer_list il); // Since C++26 + +#include +#include + +#include "test_macros.h" + +// Test P2447R4 "Annex C examples" + +void one(std::pair); +void one(std::span); + +void two(std::span); + +void test_P2447R4_annex_c_examples() { + // 1. Overload resolution is affected +#if TEST_STD_VER >= 26 + // expected-error@+1 {{call to 'one' is ambiguous}} + one({1, 2}); +#else + // expected-no-diagnostics + one({1, 2}); +#endif + +// 2. The `initializer_list` ctor has high precedence +#if TEST_STD_VER >= 26 + // expected-error@+1 {{chosen constructor is explicit in copy-initialization}} + two({{1, 2}}); +#else + // expected-no-diagnostics + two({{1, 2}}); +#endif + + // 3. 
Implicit two-argument construction with a highly convertible value_type + // --> tested in "initializer_list.pass.cpp" +} diff --git a/libcxx/test/std/containers/views/views.span/span.cons/iterator_len.verify.cpp b/libcxx/test/std/containers/views/views.span/span.cons/iterator_len.verify.cpp index 3836c97e94c6d..176ca20079269 100644 --- a/libcxx/test/std/containers/views/views.span/span.cons/iterator_len.verify.cpp +++ b/libcxx/test/std/containers/views/views.span/span.cons/iterator_len.verify.cpp @@ -13,22 +13,31 @@ // constexpr explicit(Extent != dynamic_extent) span(It first, size_type count); // If Extent is not equal to dynamic_extent, then count shall be equal to Extent. // +// constexpr explicit(extent != dynamic_extent) span(std::initializer_list il); // Since C++26 #include #include +#include "test_macros.h" + template std::span createImplicitSpan(T* ptr, std::size_t len) { return {ptr, len}; // expected-error {{chosen constructor is explicit in copy-initialization}} } -void f() { +void test() { // explicit constructor necessary int arr[] = {1, 2, 3}; createImplicitSpan(arr, 3); - std::span sp = {0, 0}; // expected-error {{no matching constructor for initialization of 'std::span'}} - std::span sp2 = {0, 0}; // expected-error {{no matching constructor for initialization of 'std::span'}} - std::span csp = {0, 0}; // expected-error {{no matching constructor for initialization of 'std::span'}} - std::span csp2 = {0, 0}; // expected-error {{no matching constructor for initialization of 'std::span'}} + // expected-error@+1 {{no matching constructor for initialization of 'std::span'}} + std::span sp = {0, 0}; + // expected-error@+1 {{no matching constructor for initialization of 'std::span'}} + std::span sp2 = {0, 0}; +#if TEST_STD_VER < 26 + // expected-error@+1 {{no matching constructor for initialization of 'std::span'}} + std::span csp = {0, 0}; + // expected-error@+1 {{no matching constructor for initialization of 'std::span'}} + std::span csp2 = {0, 0}; +#endif } diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp index dbbbaf4ec7c22..e1694308f12a7 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/span.version.compile.pass.cpp @@ -116,17 +116,11 @@ # error "__cpp_lib_span_at should have the value 202311L in c++26" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_span_initializer_list -# error "__cpp_lib_span_initializer_list should be defined in c++26" -# endif -# if __cpp_lib_span_initializer_list != 202311L -# error "__cpp_lib_span_initializer_list should have the value 202311L in c++26" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_span_initializer_list -# error "__cpp_lib_span_initializer_list should not be defined because it is unimplemented in libc++!" 
-# endif
+# ifndef __cpp_lib_span_initializer_list
+# error "__cpp_lib_span_initializer_list should be defined in c++26"
+# endif
+# if __cpp_lib_span_initializer_list != 202311L
+# error "__cpp_lib_span_initializer_list should have the value 202311L in c++26"
 # endif
 
 #endif // TEST_STD_VER > 23
diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
index d5a0839b30f82..fa188533446b4 100644
--- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
+++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp
@@ -7294,17 +7294,11 @@
 # error "__cpp_lib_span_at should have the value 202311L in c++26"
 # endif
 
-# if !defined(_LIBCPP_VERSION)
-# ifndef __cpp_lib_span_initializer_list
-# error "__cpp_lib_span_initializer_list should be defined in c++26"
-# endif
-# if __cpp_lib_span_initializer_list != 202311L
-# error "__cpp_lib_span_initializer_list should have the value 202311L in c++26"
-# endif
-# else // _LIBCPP_VERSION
-# ifdef __cpp_lib_span_initializer_list
-# error "__cpp_lib_span_initializer_list should not be defined because it is unimplemented in libc++!"
-# endif
+# ifndef __cpp_lib_span_initializer_list
+# error "__cpp_lib_span_initializer_list should be defined in c++26"
+# endif
+# if __cpp_lib_span_initializer_list != 202311L
+# error "__cpp_lib_span_initializer_list should have the value 202311L in c++26"
 # endif
 
 # if !defined(_LIBCPP_VERSION)
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index 8ee92909dfa53..958c34edde005 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -1093,7 +1093,6 @@ def add_version_header(tc):
             "name": "__cpp_lib_span_initializer_list",
             "values": {"c++26": 202311},  # P2447R6 std::span over an initializer list
             "headers": ["span"],
-            "unimplemented": True,
         },
         {
             "name": "__cpp_lib_spanstream",

From 58d5a486ec641156dcf420d67e075dc0a766fc5e Mon Sep 17 00:00:00 2001
From: Sam Clegg
Date: Fri, 19 Jan 2024 21:45:30 -0800
Subject: [PATCH 217/843] [lld][WebAssembly] Fix regression in function
 signature checking (#78831)

Follow-up to #78658, which caused a regression in Emscripten.

When a lazy symbol is added that resolves an existing undefined symbol, we
don't need or want to replace the undefined symbol with the lazy one.
Instead we call `extract`, which replaces the undefined symbol with the
defined one.

Replacing the undefined symbol with a lazy one before extracting the
archive member doesn't normally matter, but in the case of a function
symbol it means that `addDefinedFunction` sees the existing symbol as lazy
and simply replaces it, skipping the signature check.

Note that this is consistent with both the ELF code in
`Symbol::resolve(const LazySymbol &other)` and the wasm code prior to
#78658, neither of which replace the existing symbol with the lazy one in
this case.
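To make the resolution-order hazard concrete before the diff, here is a
minimal, self-contained C++ sketch. It is an illustration only, not lld's
real data structures: `SymbolTable`, `Kind`, `addLazyBuggy`, and
`addLazyFixed` are invented names, and the signature strings merely mimic
the test below. It shows how flipping the undefined symbol to a lazy
placeholder before extraction discards the expected signature, so the
later definition is accepted without a mismatch warning.

#include <cassert>
#include <functional>
#include <map>
#include <string>

// Toy model (invented for illustration; not lld's real API). A symbol is
// Undefined, Lazy (waiting in an archive), or Defined.
enum class Kind { Undefined, Lazy, Defined };

struct Symbol {
  Kind kind = Kind::Undefined;
  std::string signature; // expected (while Undefined) or actual (once Defined)
};

struct SymbolTable {
  std::map<std::string, Symbol> symbols;
  int mismatches = 0;

  void addUndefined(const std::string &name, const std::string &expectedSig) {
    symbols[name] = {Kind::Undefined, expectedSig};
  }

  void addDefined(const std::string &name, const std::string &sig) {
    Symbol &s = symbols[name];
    // The signature check only fires against an undefined reference; a lazy
    // placeholder is "simply replaced", which is the regression.
    if (s.kind == Kind::Undefined && s.signature != sig)
      ++mismatches;
    s = {Kind::Defined, sig};
  }

  // Buggy order: mark the symbol lazy first, then extract the member, so the
  // expected signature is gone by the time the definition arrives.
  void addLazyBuggy(const std::string &name,
                    const std::function<void()> &extract) {
    symbols[name].kind = Kind::Lazy;
    extract();
  }

  // Fixed order: extract directly; the undefined symbol stays in place until
  // the archive member's real definition is added.
  void addLazyFixed(const std::string &,
                    const std::function<void()> &extract) {
    extract();
  }
};

int main() {
  auto link = [](bool fixed) {
    SymbolTable st;
    st.addUndefined("ret32", "(i32, i64, i32) -> i32"); // from main.o
    auto extract = [&st] { st.addDefined("ret32", "(f32) -> i32"); };
    fixed ? st.addLazyFixed("ret32", extract)
          : st.addLazyBuggy("ret32", extract);
    return st.mismatches;
  };
  assert(link(/*fixed=*/false) == 0); // regression: mismatch warning lost
  assert(link(/*fixed=*/true) == 1);  // fixed: mismatch reported
}

Running it, the buggy path reports no mismatch while the fixed path
reports one, which mirrors the ARCHIVE check lines added to
signature-mismatch.s below.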
---
 lld/test/wasm/signature-mismatch.s | 11 ++++++++++-
 lld/wasm/SymbolTable.cpp           |  2 +-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/lld/test/wasm/signature-mismatch.s b/lld/test/wasm/signature-mismatch.s
index 5d305efca2464..7dc1b8ced3530 100644
--- a/lld/test/wasm/signature-mismatch.s
+++ b/lld/test/wasm/signature-mismatch.s
@@ -8,6 +8,11 @@
 # RUN: wasm-ld -r -o %t.reloc.o %t.main.o %t.ret32.o %t.call.o 2>&1 | FileCheck %s -check-prefix=WARN
 # RUN: obj2yaml %t.reloc.o | FileCheck %s -check-prefix=RELOC
 
+# RUN: rm -f %t.a
+# RUN: ar crS %t.a %t.ret32.o %t.call.o
+# RUN: wasm-ld --export=call_ret32 --export=ret32 -o %t2.wasm %t.main.o %t.a 2>&1 | FileCheck %s -check-prefix=ARCHIVE
+# RUN: obj2yaml %t2.wasm | FileCheck %s -check-prefix=YAML
+
 # RUN: not wasm-ld --fatal-warnings -o %t.wasm %t.main.o %t.ret32.o %t.call.o 2>&1 | FileCheck %s -check-prefix=ERROR
 
 .functype ret32 (i32, i64, i32) -> (i32)
@@ -42,6 +47,10 @@ ret32_address_main:
 # WARN-NEXT: >>> defined as (i32, i64, i32) -> i32 in {{.*}}.main.o
 # WARN-NEXT: >>> defined as (f32) -> i32 in {{.*}}.ret32.o
 
+# ARCHIVE: warning: function signature mismatch: ret32
+# ARCHIVE-NEXT: >>> defined as (i32, i64, i32) -> i32 in {{.*}}.main.o
+# ARCHIVE-NEXT: >>> defined as (f32) -> i32 in {{.*}}.ret32.o
+
 # ERROR: error: function signature mismatch: ret32
 # ERROR-NEXT: >>> defined as (i32, i64, i32) -> i32 in {{.*}}.main.o
 # ERROR-NEXT: >>> defined as (f32) -> i32 in {{.*}}.ret32.o
@@ -56,7 +65,7 @@ ret32_address_main:
 
 # YAML: - Type: CUSTOM
 # YAML-NEXT: Name: name
-# YAML-NEXT: FunctionNames: 
+# YAML-NEXT: FunctionNames:
 # YAML-NEXT: - Index: 0
 # YAML-NEXT: Name: 'signature_mismatch:ret32'
 # YAML-NEXT: - Index: 1
diff --git a/lld/wasm/SymbolTable.cpp b/lld/wasm/SymbolTable.cpp
index c98aa3ee3a7a3..00c347ea3ef24 100644
--- a/lld/wasm/SymbolTable.cpp
+++ b/lld/wasm/SymbolTable.cpp
@@ -774,7 +774,7 @@ void SymbolTable::addLazy(StringRef name, InputFile *file) {
     LLVM_DEBUG(dbgs() << "replacing existing undefined\n");
     const InputFile *oldFile = s->getFile();
-    replaceSymbol<LazySymbol>(s, name, 0, file)->extract();
+    LazySymbol(name, 0, file).extract();
     if (!config->whyExtract.empty())
       ctx.whyExtractRecords.emplace_back(toString(oldFile), s->getFile(), *s);
   }

From 963d7b4b2a0a80e29d4a862c3629a21de0af975f Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Fri, 19 Jan 2024 22:02:21 -0800
Subject: [PATCH 218/843] [ELF] Improve --thinlto-index-only and
 --thinlto-emit-index-files tests

---
 lld/test/ELF/lto/thinlto-emit-imports.ll |  36 +------
 lld/test/ELF/lto/thinlto-emit-index.ll   |  50 +++++-----
 lld/test/ELF/lto/thinlto-index-file.ll   |  27 ------
 lld/test/ELF/lto/thinlto-index-only.ll   | 112 ++++++++++++++++-------
 4 files changed, 106 insertions(+), 119 deletions(-)
 delete mode 100644 lld/test/ELF/lto/thinlto-index-file.ll

diff --git a/lld/test/ELF/lto/thinlto-emit-imports.ll b/lld/test/ELF/lto/thinlto-emit-imports.ll
index fcddd18cc7447..6d0e1e65047db 100644
--- a/lld/test/ELF/lto/thinlto-emit-imports.ll
+++ b/lld/test/ELF/lto/thinlto-emit-imports.ll
@@ -1,32 +1,10 @@
 ; REQUIRES: x86
+;; Test a few properties not tested by thinlto-index-only.ll
 
-; Generate summary sections and test lld handling.
 ; RUN: opt -module-summary %s -o %t1.o
 ; RUN: opt -module-summary %p/Inputs/thinlto.ll -o %t2.o
-
-; Include a file with an empty module summary index, to ensure that the expected
-; output files are created regardless, for a distributed build system.
 ; RUN: opt -module-summary %p/Inputs/thinlto_empty.ll -o %t3.o
 
-; Ensure lld generates imports files if requested for distributed backends.
-; RUN: rm -f %t3.o.imports %t3.o.thinlto.bc
-; RUN: ld.lld --plugin-opt=thinlto-index-only --plugin-opt=thinlto-emit-imports-files -shared %t1.o %t2.o %t3.o -o %t4
-
-; The imports file for this module contains the bitcode file for
-; Inputs/thinlto.ll
-; RUN: count 1 < %t1.o.imports
-; RUN: FileCheck %s --check-prefix=IMPORTS1 < %t1.o.imports
-; IMPORTS1: thinlto-emit-imports.ll.tmp2.o
-
-; The imports file for Input/thinlto.ll is empty as it does not import anything.
-; RUN: count 0 < %t2.o.imports
-
-; The imports file for Input/thinlto_empty.ll is empty but should exist.
-; RUN: count 0 < %t3.o.imports
-
-; The index file should be created even for the input with an empty summary.
-; RUN: ls %t3.o.thinlto.bc
-
 ; Ensure lld generates error if unable to write to imports file.
 ; RUN: rm -f %t3.o.imports
 ; RUN: touch %t3.o.imports
@@ -34,22 +12,12 @@
 ; RUN: not ld.lld --plugin-opt=thinlto-index-only --plugin-opt=thinlto-emit-imports-files -shared %t1.o %t2.o %t3.o -o /dev/null 2>&1 | FileCheck -DMSG=%errc_EACCES %s --check-prefix=ERR
 ; ERR: cannot open {{.*}}3.o.imports: [[MSG]]
 
-; Ensure lld doesn't generate import files when thinlto-index-only is not enabled
-; RUN: rm -f %t1.o.imports
-; RUN: rm -f %t2.o.imports
-; RUN: rm -f %t3.o.imports
+; RUN: rm -f %t1.o.imports %t2.o.imports %t3.o.imports
 ; RUN: ld.lld --plugin-opt=thinlto-emit-imports-files -shared %t1.o %t2.o %t3.o -o %t4
 ; RUN: not ls %t1.o.imports
 ; RUN: not ls %t2.o.imports
 ; RUN: not ls %t3.o.imports
 
-; Check that imports files are generated also when --thinlto-index-only
-; is specified without --plugin-opt=.
-; RUN: rm -f %t1.o.imports
-; RUN: ld.lld --thinlto-index-only --thinlto-emit-imports-files -shared %t1.o %t2.o %t3.o -o %t4
-; RUN: count 1 < %t1.o.imports
-; RUN: FileCheck %s --check-prefix=IMPORTS1 < %t1.o.imports
-
 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
 
diff --git a/lld/test/ELF/lto/thinlto-emit-index.ll b/lld/test/ELF/lto/thinlto-emit-index.ll
index ae8b4ddeb1bf1..ba2ac2ceb689f 100644
--- a/lld/test/ELF/lto/thinlto-emit-index.ll
+++ b/lld/test/ELF/lto/thinlto-emit-index.ll
@@ -3,40 +3,40 @@
 ;; Mostly copied/updated from thinlto-index-only.ll
 ;; First ensure that the ThinLTO handling in lld handles
 ;; bitcode without summary sections gracefully and generates index file.
-; RUN: rm -rf %t.dir && mkdir %t.dir && cd %t.dir
+; RUN: rm -rf %t && mkdir %t && cd %t
+; RUN: mkdir d
 ; RUN: llvm-as %s -o 1.o
-; RUN: llvm-as %p/Inputs/thinlto.ll -o 2.o
-; RUN: ld.lld --thinlto-emit-index-files -shared 1.o 2.o -o 3
-; RUN: ls 2.o.thinlto.bc
+; RUN: llvm-as %p/Inputs/thinlto.ll -o d/2.o
+; RUN: ld.lld --thinlto-emit-index-files -shared 1.o d/2.o -o 3
+; RUN: ls d/2.o.thinlto.bc
 ; RUN: ls 3
 
-; RUN: ld.lld -shared 1.o 2.o -o 3
+; RUN: ld.lld -shared 1.o d/2.o -o 3
 ; RUN: llvm-nm 3 | FileCheck %s --check-prefix=NM
 
 ;; Basic ThinLTO tests.
 ; RUN: opt -module-summary %s -o 1.o
-; RUN: opt -module-summary %p/Inputs/thinlto.ll -o 2.o
+; RUN: opt -module-summary %p/Inputs/thinlto.ll -o d/2.o
 ; RUN: opt -module-summary %p/Inputs/thinlto_empty.ll -o 3.o
+; RUN: cp 3.o 4.o
 
 ;; Ensure lld generates an index and also a binary if requested.
-; RUN: ld.lld --thinlto-emit-index-files -shared 1.o 2.o -o 4 -; RUN: llvm-bcanalyzer -dump 1.o.thinlto.bc | FileCheck %s --check-prefix=BACKEND1 -; RUN: llvm-bcanalyzer -dump 2.o.thinlto.bc | FileCheck %s --check-prefix=BACKEND2 +; RUN: ld.lld --thinlto-emit-index-files -shared 1.o --start-lib d/2.o 3.o --end-lib 4.o -o 4 ; RUN: ls 4 +; RUN: llvm-bcanalyzer -dump 1.o.thinlto.bc | FileCheck %s --check-prefix=BACKEND1 +; RUN: llvm-bcanalyzer -dump d/2.o.thinlto.bc | FileCheck %s --check-prefix=BACKEND2 +; RUN: llvm-dis < 3.o.thinlto.bc | FileCheck %s --check-prefix=BACKEND3 +; RUN: llvm-dis < 4.o.thinlto.bc | FileCheck %s --check-prefix=BACKEND4 + +; IMPORTS1: d/2.o ;; Ensure lld generates an index and not a binary if both emit-index and index-only are present. -; RUN: ld.lld --thinlto-emit-index-files --thinlto-index-only -shared 1.o 2.o -o 5 +; RUN: ld.lld --thinlto-emit-index-files --thinlto-index-only -shared 1.o d/2.o -o 5 ; RUN: not ls 5 -;; Ensure lld generates an index even if the file is wrapped in --start-lib/--end-lib -; RUN: rm -f 2.o.thinlto.bc -; RUN: ld.lld --thinlto-emit-index-files -shared 1.o 3.o --start-lib 2.o --end-lib -o 6 -; RUN: llvm-dis < 2.o.thinlto.bc | grep -q '\^0 = module:' -; RUN: ls 6 - ;; Test that LLD generates an empty index even for lazy object file that is not added to link. ;; Test that LLD also generates empty imports file with the --thinlto-emit-imports-files option. ; RUN: rm -f 1.o.thinlto.bc 1.o.imports -; RUN: ld.lld --thinlto-emit-index-files -shared 2.o --start-lib 1.o --end-lib \ +; RUN: ld.lld --thinlto-emit-index-files -shared d/2.o --start-lib 1.o --end-lib \ ; RUN: --thinlto-emit-imports-files -o 7 ; RUN: ls 7 ; RUN: ls 1.o.thinlto.bc @@ -50,19 +50,19 @@ ; RUN: ls 1.o.thinlto.bc ;; Test that LLD errors out when run with suffix replacement, or prefix replacement -; RUN: not ld.lld --thinlto-emit-index-files -shared 2.o --start-lib 1.o --end-lib \ +; RUN: not ld.lld --thinlto-emit-index-files -shared d/2.o --start-lib 1.o --end-lib \ ; RUN: --thinlto-prefix-replace="abc;xyz" 2>&1 | FileCheck %s --check-prefix=ERR1 ; ERR1: --thinlto-prefix-replace is not supported with --thinlto-emit-index-files -; RUN: not ld.lld --thinlto-emit-index-files -shared 2.o --start-lib 1.o --end-lib \ +; RUN: not ld.lld --thinlto-emit-index-files -shared d/2.o --start-lib 1.o --end-lib \ ; RUN: --thinlto-object-suffix-replace="abc;xyz" 2>&1 | FileCheck %s --check-prefix=ERR2 ; ERR2: --thinlto-object-suffix-replace is not supported with --thinlto-emit-index-files ;; But not when passed with index only as well -; RUN: ld.lld --thinlto-emit-index-files -shared 2.o --start-lib 1.o --end-lib \ +; RUN: ld.lld --thinlto-emit-index-files -shared d/2.o --start-lib 1.o --end-lib \ ; RUN: --thinlto-prefix-replace="abc;xyz" --thinlto-index-only -; RUN: ld.lld --thinlto-emit-index-files -shared 2.o --start-lib 1.o --end-lib \ +; RUN: ld.lld --thinlto-emit-index-files -shared d/2.o --start-lib 1.o --end-lib \ ; RUN: --thinlto-object-suffix-replace="abc;xyz" --thinlto-index-only ; NM: T f @@ -71,7 +71,7 @@ ;; Inputs/thinlto.ll, as it imports from the latter. 
; BACKEND1: Date: Fri, 19 Jan 2024 22:24:09 -0800 Subject: [PATCH 219/843] [Driver] Use SmallString::operator std::string (NFC) --- clang/lib/Driver/Driver.cpp | 22 +++++++++---------- clang/lib/Driver/SanitizerArgs.cpp | 2 +- clang/lib/Driver/ToolChain.cpp | 8 +++---- clang/lib/Driver/ToolChains/AVR.cpp | 2 +- clang/lib/Driver/ToolChains/CSKYToolChain.cpp | 2 +- clang/lib/Driver/ToolChains/Cuda.cpp | 2 +- clang/lib/Driver/ToolChains/Fuchsia.cpp | 4 ++-- clang/lib/Driver/ToolChains/Hexagon.cpp | 2 +- clang/lib/Driver/ToolChains/MSP430.cpp | 2 +- clang/lib/Driver/ToolChains/MSVC.cpp | 2 +- clang/lib/Driver/ToolChains/MipsLinux.cpp | 2 +- clang/lib/Driver/ToolChains/OpenBSD.cpp | 4 ++-- clang/lib/Driver/ToolChains/PS4CPU.cpp | 2 +- .../lib/Driver/ToolChains/RISCVToolchain.cpp | 2 +- 14 files changed, 29 insertions(+), 29 deletions(-) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 35d563b9a87fa..10f7b99a59bd6 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -186,7 +186,7 @@ std::string Driver::GetResourcesPath(StringRef BinaryPath, CLANG_VERSION_MAJOR_STRING); } - return std::string(P.str()); + return std::string(P); } Driver::Driver(StringRef ClangExecutable, StringRef TargetTriple, @@ -1831,7 +1831,7 @@ void Driver::generateCompilationDiagnostics( ScriptOS << "\n# Additional information: " << AdditionalInformation << "\n"; if (Report) - Report->TemporaryFiles.push_back(std::string(Script.str())); + Report->TemporaryFiles.push_back(std::string(Script)); Diag(clang::diag::note_drv_command_failed_diag_msg) << Script; } @@ -6073,17 +6073,17 @@ std::string Driver::GetFilePath(StringRef Name, const ToolChain &TC) const { SmallString<128> R(ResourceDir); llvm::sys::path::append(R, Name); if (llvm::sys::fs::exists(Twine(R))) - return std::string(R.str()); + return std::string(R); SmallString<128> P(TC.getCompilerRTPath()); llvm::sys::path::append(P, Name); if (llvm::sys::fs::exists(Twine(P))) - return std::string(P.str()); + return std::string(P); SmallString<128> D(Dir); llvm::sys::path::append(D, "..", Name); if (llvm::sys::fs::exists(Twine(D))) - return std::string(D.str()); + return std::string(D); if (auto P = SearchPaths(TC.getLibraryPaths())) return *P; @@ -6120,11 +6120,11 @@ std::string Driver::GetProgramPath(StringRef Name, const ToolChain &TC) const { if (llvm::sys::fs::is_directory(PrefixDir)) { SmallString<128> P(PrefixDir); if (ScanDirForExecutable(P, Name)) - return std::string(P.str()); + return std::string(P); } else { SmallString<128> P((PrefixDir + Name).str()); if (llvm::sys::fs::can_execute(Twine(P))) - return std::string(P.str()); + return std::string(P); } } @@ -6140,7 +6140,7 @@ std::string Driver::GetProgramPath(StringRef Name, const ToolChain &TC) const { for (const auto &Path : List) { SmallString<128> P(Path); if (ScanDirForExecutable(P, TargetSpecificExecutable)) - return std::string(P.str()); + return std::string(P); } // Fall back to the path @@ -6160,7 +6160,7 @@ std::string Driver::GetTemporaryPath(StringRef Prefix, StringRef Suffix) const { return ""; } - return std::string(Path.str()); + return std::string(Path); } std::string Driver::GetTemporaryDirectory(StringRef Prefix) const { @@ -6171,7 +6171,7 @@ std::string Driver::GetTemporaryDirectory(StringRef Prefix) const { return ""; } - return std::string(Path.str()); + return std::string(Path); } std::string Driver::GetClPchPath(Compilation &C, StringRef BaseName) const { @@ -6193,7 +6193,7 @@ std::string Driver::GetClPchPath(Compilation &C, StringRef 
BaseName) const { Output = BaseName; llvm::sys::path::replace_extension(Output, ".pch"); } - return std::string(Output.str()); + return std::string(Output); } const ToolChain &Driver::getToolChain(const ArgList &Args, diff --git a/clang/lib/Driver/SanitizerArgs.cpp b/clang/lib/Driver/SanitizerArgs.cpp index 9d6ea371f9f6d..56d497eb4c32b 100644 --- a/clang/lib/Driver/SanitizerArgs.cpp +++ b/clang/lib/Driver/SanitizerArgs.cpp @@ -190,7 +190,7 @@ static void addDefaultIgnorelists(const Driver &D, SanitizerMask Kinds, clang::SmallString<64> Path(D.ResourceDir); llvm::sys::path::append(Path, "share", BL.File); if (D.getVFS().exists(Path)) - IgnorelistFiles.push_back(std::string(Path.str())); + IgnorelistFiles.push_back(std::string(Path)); else if (BL.Mask == SanitizerKind::CFI && DiagnoseErrors) // If cfi_ignorelist.txt cannot be found in the resource dir, driver // should fail. diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index 96a57927339a9..388030592b483 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -606,7 +606,7 @@ std::string ToolChain::getCompilerRTPath() const { } else { llvm::sys::path::append(Path, "lib", getOSLibName()); } - return std::string(Path.str()); + return std::string(Path); } std::string ToolChain::getCompilerRTBasename(const ArgList &Args, @@ -659,7 +659,7 @@ std::string ToolChain::getCompilerRT(const ArgList &Args, StringRef Component, SmallString<128> P(LibPath); llvm::sys::path::append(P, CRTBasename); if (getVFS().exists(P)) - return std::string(P.str()); + return std::string(P); } // Fall back to the old expected compiler-rt name if the new one does not @@ -668,7 +668,7 @@ std::string ToolChain::getCompilerRT(const ArgList &Args, StringRef Component, buildCompilerRTBasename(Args, Component, Type, /*AddArch=*/true); SmallString<128> Path(getCompilerRTPath()); llvm::sys::path::append(Path, CRTBasename); - return std::string(Path.str()); + return std::string(Path); } const char *ToolChain::getCompilerRTArgString(const llvm::opt::ArgList &Args, @@ -783,7 +783,7 @@ ToolChain::path_list ToolChain::getArchSpecificLibPaths() const { llvm::sys::path::append(Path, "lib"); for (auto &S : SS) llvm::sys::path::append(Path, S); - Paths.push_back(std::string(Path.str())); + Paths.push_back(std::string(Path)); }; AddPath({getTriple().str()}); diff --git a/clang/lib/Driver/ToolChains/AVR.cpp b/clang/lib/Driver/ToolChains/AVR.cpp index a4ce2258c3d81..bb5c0e6db9978 100644 --- a/clang/lib/Driver/ToolChains/AVR.cpp +++ b/clang/lib/Driver/ToolChains/AVR.cpp @@ -430,7 +430,7 @@ AVRToolChain::getCompilerRT(const llvm::opt::ArgList &Args, StringRef Component, SmallString<256> Path(ToolChain::getCompilerRTPath()); llvm::sys::path::append(Path, "avr"); llvm::sys::path::append(Path, File.str()); - return std::string(Path.str()); + return std::string(Path); } void AVR::Linker::ConstructJob(Compilation &C, const JobAction &JA, diff --git a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp index 0c280347b2af6..feb3bc922920f 100644 --- a/clang/lib/Driver/ToolChains/CSKYToolChain.cpp +++ b/clang/lib/Driver/ToolChains/CSKYToolChain.cpp @@ -125,7 +125,7 @@ std::string CSKYToolChain::computeSysRoot() const { if (!llvm::sys::fs::exists(SysRootDir)) return std::string(); - return std::string(SysRootDir.str()); + return std::string(SysRootDir); } void CSKY::Linker::ConstructJob(Compilation &C, const JobAction &JA, diff --git a/clang/lib/Driver/ToolChains/Cuda.cpp 
b/clang/lib/Driver/ToolChains/Cuda.cpp index f7a208575cb0c..1462576ca870e 100644 --- a/clang/lib/Driver/ToolChains/Cuda.cpp +++ b/clang/lib/Driver/ToolChains/Cuda.cpp @@ -894,7 +894,7 @@ std::string CudaToolChain::getInputFilename(const InputInfo &Input) const { // these particular file names. SmallString<256> Filename(ToolChain::getInputFilename(Input)); llvm::sys::path::replace_extension(Filename, "cubin"); - return std::string(Filename.str()); + return std::string(Filename); } llvm::opt::DerivedArgList * diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 6525cf8356d18..14b838500becc 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -258,7 +258,7 @@ Fuchsia::Fuchsia(const Driver &D, const llvm::Triple &Triple, if (!D.SysRoot.empty()) { SmallString<128> P(D.SysRoot); llvm::sys::path::append(P, "lib"); - getFilePaths().push_back(std::string(P.str())); + getFilePaths().push_back(std::string(P)); } auto FilePaths = [&](const Multilib &M) -> std::vector { @@ -266,7 +266,7 @@ Fuchsia::Fuchsia(const Driver &D, const llvm::Triple &Triple, if (std::optional Path = getStdlibPath()) { SmallString<128> P(*Path); llvm::sys::path::append(P, M.gccSuffix()); - FP.push_back(std::string(P.str())); + FP.push_back(std::string(P)); } return FP; }; diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp index 6a2a105bee32e..d1eed931be5f1 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.cpp +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp @@ -550,7 +550,7 @@ std::string HexagonToolChain::getCompilerRTPath() const { if (!SelectedMultilibs.empty()) { Dir += SelectedMultilibs.back().gccSuffix(); } - return std::string(Dir.str()); + return std::string(Dir); } void HexagonToolChain::getHexagonLibraryPaths(const ArgList &Args, diff --git a/clang/lib/Driver/ToolChains/MSP430.cpp b/clang/lib/Driver/ToolChains/MSP430.cpp index 8dc23521f400a..07e875c64960e 100644 --- a/clang/lib/Driver/ToolChains/MSP430.cpp +++ b/clang/lib/Driver/ToolChains/MSP430.cpp @@ -142,7 +142,7 @@ std::string MSP430ToolChain::computeSysRoot() const { else llvm::sys::path::append(Dir, getDriver().Dir, ".."); - return std::string(Dir.str()); + return std::string(Dir); } void MSP430ToolChain::AddClangSystemIncludeArgs(const ArgList &DriverArgs, diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 8e1e95173836f..4914604f0a286 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -585,7 +585,7 @@ bool MSVCToolChain::getUniversalCRTLibraryPath(const ArgList &Args, llvm::SmallString<128> LibPath(UniversalCRTSdkPath); llvm::sys::path::append(LibPath, "Lib", UCRTVersion, "ucrt", ArchName); - Path = std::string(LibPath.str()); + Path = std::string(LibPath); return true; } diff --git a/clang/lib/Driver/ToolChains/MipsLinux.cpp b/clang/lib/Driver/ToolChains/MipsLinux.cpp index eacdcbf730b62..4183eccceedb4 100644 --- a/clang/lib/Driver/ToolChains/MipsLinux.cpp +++ b/clang/lib/Driver/ToolChains/MipsLinux.cpp @@ -138,5 +138,5 @@ std::string MipsLLVMToolChain::getCompilerRT(const ArgList &Args, } llvm::sys::path::append( Path, Twine("libclang_rt." 
+ Component + "-" + "mips" + Suffix)); - return std::string(Path.str()); + return std::string(Path); } diff --git a/clang/lib/Driver/ToolChains/OpenBSD.cpp b/clang/lib/Driver/ToolChains/OpenBSD.cpp index 21b9004ef2d52..fd6aa4d7e6844 100644 --- a/clang/lib/Driver/ToolChains/OpenBSD.cpp +++ b/clang/lib/Driver/ToolChains/OpenBSD.cpp @@ -371,7 +371,7 @@ std::string OpenBSD::getCompilerRT(const ArgList &Args, StringRef Component, if (Component == "builtins") { SmallString<128> Path(getDriver().SysRoot); llvm::sys::path::append(Path, "/usr/lib/libcompiler_rt.a"); - return std::string(Path.str()); + return std::string(Path); } SmallString<128> P(getDriver().ResourceDir); std::string CRTBasename = @@ -379,7 +379,7 @@ std::string OpenBSD::getCompilerRT(const ArgList &Args, StringRef Component, llvm::sys::path::append(P, "lib", CRTBasename); // Checks if this is the base system case which uses a different location. if (getVFS().exists(P)) - return std::string(P.str()); + return std::string(P); return ToolChain::getCompilerRT(Args, Component, Type); } diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 5fd82d1da1992..287d2e98931da 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -292,7 +292,7 @@ toolchains::PS4PS5Base::PS4PS5Base(const Driver &D, const llvm::Triple &Triple, << Twine(Platform, " system libraries").str() << SDKLibDir << Whence; return; } - getFilePaths().push_back(std::string(SDKLibDir.str())); + getFilePaths().push_back(std::string(SDKLibDir)); } void toolchains::PS4PS5Base::AddClangSystemIncludeArgs( diff --git a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp index 7e6abd1444287..5e4fa4d5331ff 100644 --- a/clang/lib/Driver/ToolChains/RISCVToolchain.cpp +++ b/clang/lib/Driver/ToolChains/RISCVToolchain.cpp @@ -141,7 +141,7 @@ std::string RISCVToolChain::computeSysRoot() const { if (!llvm::sys::fs::exists(SysRootDir)) return std::string(); - return std::string(SysRootDir.str()); + return std::string(SysRootDir); } void RISCV::Linker::ConstructJob(Compilation &C, const JobAction &JA, From 71ca5c54a2af47fd9c1caab95fe5decd0a0a06d1 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Fri, 19 Jan 2024 22:24:11 -0800 Subject: [PATCH 220/843] [CodeGen] Use a range-based for loop with llvm::predecessors (NFC) --- llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp index eb372655e5f16..5ed7b020862a7 100644 --- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp +++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp @@ -1851,8 +1851,7 @@ bool AssignmentTrackingLowering::join( // invalidated later, we will remove it when we revisit this block. This // is essentially the same as initialising all LocKinds and Assignments to // an implicit ⊥ value which is the identity value for the join operation. 
-  for (auto I = pred_begin(&BB), E = pred_end(&BB); I != E; I++) {
-    const BasicBlock *Pred = *I;
+  for (const BasicBlock *Pred : predecessors(&BB)) {
     if (Visited.count(Pred))
       VisitedPreds.push_back(Pred);
   }

From 896cfcc585eee3a52d67c6d706b4e2af5eee258d Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Fri, 19 Jan 2024 22:24:13 -0800
Subject: [PATCH 221/843] [IR] Use StringRef::consume_front (NFC)

---
 llvm/lib/IR/VFABIDemangler.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/IR/VFABIDemangler.cpp b/llvm/lib/IR/VFABIDemangler.cpp
index 7fb2ff6f150d4..cdfb9fbfaa084 100644
--- a/llvm/lib/IR/VFABIDemangler.cpp
+++ b/llvm/lib/IR/VFABIDemangler.cpp
@@ -36,8 +36,7 @@ static ParseRet tryParseISA(StringRef &MangledName, VFISAKind &ISA) {
   if (MangledName.empty())
     return ParseRet::Error;
 
-  if (MangledName.starts_with(VFABI::_LLVM_)) {
-    MangledName = MangledName.drop_front(strlen(VFABI::_LLVM_));
+  if (MangledName.consume_front(VFABI::_LLVM_)) {
     ISA = VFISAKind::LLVM;
   } else {
     ISA = StringSwitch<VFISAKind>(MangledName.take_front(1))

From df017dc66269ed758a741a6519d4a0efe3161780 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Fri, 19 Jan 2024 22:24:14 -0800
Subject: [PATCH 222/843] [ADT] Use llvm::is_contained (NFC)

---
 llvm/include/llvm/ADT/SetVector.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/ADT/SetVector.h b/llvm/include/llvm/ADT/SetVector.h
index ff083556cf169..69931e02953d8 100644
--- a/llvm/include/llvm/ADT/SetVector.h
+++ b/llvm/include/llvm/ADT/SetVector.h
@@ -162,7 +162,7 @@ class SetVector {
   bool insert(const value_type &X) {
     if constexpr (canBeSmall())
       if (isSmall()) {
-        if (llvm::find(vector_, X) == vector_.end()) {
+        if (!llvm::is_contained(vector_, X)) {
           vector_.push_back(X);
           if (vector_.size() > N)
             makeBig();

From 17a777a4a5863bfe5bb1c2602486d6ca9bfb0199 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Fri, 19 Jan 2024 22:24:16 -0800
Subject: [PATCH 223/843] [Analysis] Use llvm::children and
 llvm::inverse_children (NFC)

---
 llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
index 0c05be0b9b8cd..8acb75e872541 100644
--- a/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
+++ b/llvm/include/llvm/Analysis/BlockFrequencyInfoImpl.h
@@ -1532,8 +1532,7 @@ void BlockFrequencyInfoImpl<BT>::findReachableBlocks(
   SmallPtrSet<const BlockT *, 8> InverseReachable;
   for (const BlockT &BB : *F) {
     // An exit block is a block without any successors
-    bool HasSucc = GraphTraits<const BlockT *>::child_begin(&BB) !=
-                   GraphTraits<const BlockT *>::child_end(&BB);
+    bool HasSucc = !llvm::children<const BlockT *>(&BB).empty();
     if (!HasSucc && Reachable.count(&BB)) {
       Queue.push(&BB);
       InverseReachable.insert(&BB);
@@ -1542,7 +1541,7 @@
   while (!Queue.empty()) {
     const BlockT *SrcBB = Queue.front();
     Queue.pop();
-    for (const BlockT *DstBB : children<Inverse<const BlockT *>>(SrcBB)) {
+    for (const BlockT *DstBB : inverse_children<const BlockT *>(SrcBB)) {
       auto EP = BPI->getEdgeProbability(DstBB, SrcBB);
       if (EP.isZero())
         continue;

From 34feb2263e22c6eb75f229591e73028b3d09f04c Mon Sep 17 00:00:00 2001
From: Hristo Hristov
Date: Sat, 20 Jan 2024 08:33:00 +0200
Subject: =?UTF-8?q?[libc++][spaceship][NFC]=20Status=20pag?=
 =?UTF-8?q?e=20update:=20*libc++*=20Spaceship=20Operator=20Status=20(`oper?=
 =?UTF-8?q?ator<=3D>`)=C2=B6=20(#78832)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit
Co-authored-by: Zingam --- libcxx/docs/Status/SpaceshipPapers.csv | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/libcxx/docs/Status/SpaceshipPapers.csv b/libcxx/docs/Status/SpaceshipPapers.csv index 09fc67b04c91b..1a00dc2f274f3 100644 --- a/libcxx/docs/Status/SpaceshipPapers.csv +++ b/libcxx/docs/Status/SpaceshipPapers.csv @@ -1,11 +1,12 @@ "Number","Name","Status","First released version" `P1614R2 `_,The Mothership has Landed,|In Progress|, -`P2404R3 `_,"Relaxing equality_comparable_with's, totally_ordered_with's, and three_way_comparable_with's common reference requirements to support move-only types",, +`P2404R3 `_,"Relaxing ``equality_comparable_with``'s, ``totally_ordered_with``'s, and ``three_way_comparable_with``'s common reference requirements to support move-only types",, `LWG3330 `_,Include from most library headers,"|Complete|","13.0" -`LWG3347 `_,"std::pair now requires T and U to be less-than-comparable",|Nothing To Do|, +`LWG3347 `_,"``std::pair`` now requires ``T`` and ``U`` to be *``less-than-comparable``*",|Nothing To Do|, `LWG3350 `_,Simplify return type of lexicographical_compare_three_way,|Nothing To Do|, -`LWG3360 `_,three_way_comparable_with is inconsistent with similar concepts,|Nothing To Do|, +`LWG3360 `_,``three_way_comparable_with`` is inconsistent with similar concepts,|Nothing To Do|, `LWG3380 `_,common_type and comparison categories,|Nothing To Do|, `LWG3395 `_,Definition for three-way comparison needs to be updated,|Nothing To Do|, `P0905R1 `_,Symmetry for spaceship,, -`P1120R0 `_,Consistency improvements for <=> and other comparison operators,, +`P1120R0 `_,Consistency improvements for ``<=>`` and other comparison operators,, +`LWG3431 `_,``<=>`` for containers should require ``three_way_comparable`` instead of ``<=>``,, From 15b089cb023eaf7f80bcd5cd0e0e5fdd2fa2cbd0 Mon Sep 17 00:00:00 2001 From: Jeff Niu Date: Fri, 19 Jan 2024 23:23:41 -0800 Subject: [PATCH 225/843] [mlir] Make `printAlias` hooks public (NFC) (#78833) These are very useful when writing custom parsers and printers for aggregate types or attributes that might want to print aliases. --- mlir/include/mlir/IR/OpImplementation.h | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mlir/include/mlir/IR/OpImplementation.h b/mlir/include/mlir/IR/OpImplementation.h index 86ed14e7ca843..5333d7446df5c 100644 --- a/mlir/include/mlir/IR/OpImplementation.h +++ b/mlir/include/mlir/IR/OpImplementation.h @@ -180,6 +180,14 @@ class AsmPrinter { /// provide a valid type for the attribute. virtual void printAttributeWithoutType(Attribute attr); + /// Print the alias for the given attribute, return failure if no alias could + /// be printed. + virtual LogicalResult printAlias(Attribute attr); + + /// Print the alias for the given type, return failure if no alias could + /// be printed. + virtual LogicalResult printAlias(Type type); + /// Print the given string as a keyword, or a quoted and escaped string if it /// has any special or non-printable characters in it. virtual void printKeywordOrString(StringRef keyword); @@ -295,14 +303,6 @@ class AsmPrinter { AsmPrinter(const AsmPrinter &) = delete; void operator=(const AsmPrinter &) = delete; - /// Print the alias for the given attribute, return failure if no alias could - /// be printed. - virtual LogicalResult printAlias(Attribute attr); - - /// Print the alias for the given type, return failure if no alias could - /// be printed. 
- virtual LogicalResult printAlias(Type type); - /// The internal implementation of the printer. Impl *impl{nullptr}; }; @@ -1597,9 +1597,9 @@ class OpAsmParser : public AsmParser { //===--------------------------------------------------------------------===// struct Argument { - UnresolvedOperand ssaName; // SourceLoc, SSA name, result #. - Type type; // Type. - DictionaryAttr attrs; // Attributes if present. + UnresolvedOperand ssaName; // SourceLoc, SSA name, result #. + Type type; // Type. + DictionaryAttr attrs; // Attributes if present. std::optional sourceLoc; // Source location specifier if present. }; From 3e3d74af86869278c4bc3fa015f4e0bda15f09e0 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 19 Jan 2024 22:42:17 -0800 Subject: [PATCH 226/843] Reapply "[sanitizer] Skip /include/c++/ from summary (#78534)" Keep linux only test. This reverts commit 4619147911c2a955bb605618bc518b45da994a81. --- .../sanitizer_symbolizer_report.cpp | 3 +- .../Linux/allocator_returns_null_std.cpp | 30 +++++++++++++++++++ 2 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 compiler-rt/test/sanitizer_common/TestCases/Linux/allocator_returns_null_std.cpp diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp index 253dc10607a6e..8438e019591b5 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_symbolizer_report.cpp @@ -34,7 +34,8 @@ static bool FrameIsInternal(const SymbolizedStack *frame) { return true; const char *file = frame->info.file; const char *module = frame->info.module; - if (file && (internal_strstr(file, "/compiler-rt/lib/"))) + if (file && (internal_strstr(file, "/compiler-rt/lib/") || + internal_strstr(file, "/include/c++/"))) return true; if (module && (internal_strstr(module, "libclang_rt."))) return true; diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/allocator_returns_null_std.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/allocator_returns_null_std.cpp new file mode 100644 index 0000000000000..812cf049f2a9b --- /dev/null +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/allocator_returns_null_std.cpp @@ -0,0 +1,30 @@ +// Test the behavior of malloc called from std:: when the allocation size +// exceeds the sanitizer's allocator max allowed one. + +// RUN: %clangxx -O0 %s -o %t +// RUN: %env_tool_opts=allocator_may_return_null=0 not %run %t 2>&1 | FileCheck %s + +// UBSAN has no allocator. +// UNSUPPORTED: ubsan + +// REQUIRES: x86_64-target-arch + +#include +#include +#include + +int main(int argc, char **argv) { + // The maximum value of all supported sanitizers (search for + // kMaxAllowedMallocSize). For ASan + LSan, ASan limit is used. 
+ constexpr size_t kMaxAllowedMallocSizePlusOne = (1ULL << 40) + 1; + + std::vector v; + v.resize(kMaxAllowedMallocSizePlusOne); + + fprintf(stderr, "x: %lx\n", (long)v.data()); + + return 0; +} + +// CHECK: #{{[0-9]+.*}}allocator_returns_null_std.cpp +// CHECK: {{SUMMARY: .*Sanitizer: allocation-size-too-big.*allocator_returns_null_std.cpp.*}} in main From 14ca0ac9153ec47f006b29a79c3a27841c3dcb08 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Fri, 19 Jan 2024 23:26:36 -0800 Subject: [PATCH 227/843] [fuzzer,test] Remove old debug logging --- compiler-rt/test/fuzzer/lit.cfg.py | 1 - 1 file changed, 1 deletion(-) diff --git a/compiler-rt/test/fuzzer/lit.cfg.py b/compiler-rt/test/fuzzer/lit.cfg.py index 34179fbd3d163..4e203236b1670 100644 --- a/compiler-rt/test/fuzzer/lit.cfg.py +++ b/compiler-rt/test/fuzzer/lit.cfg.py @@ -7,7 +7,6 @@ config.suffixes = [".test"] config.test_source_root = os.path.dirname(__file__) config.available_features.add(config.target_arch) -lit_config.note(f'arch feature "{config.target_arch}" available') # Choose between lit's internal shell pipeline runner and a real shell. If # LIT_USE_INTERNAL_SHELL is in the environment, we use that as an override. From 4b500147f071d5ee4abb968f55fc4f411a2d5283 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Fri, 19 Jan 2024 23:44:40 -0800 Subject: [PATCH 228/843] [ELF] Improve LTO tests Make it easy to change --save-temps filenames to follow COFF (https://reviews.llvm.org/D137217). --- lld/test/ELF/lto/emit-asm.ll | 15 +++-- lld/test/ELF/lto/obj-path.ll | 70 +++++++++++++++++--- lld/test/ELF/lto/parallel-internalize.ll | 13 ++-- lld/test/ELF/lto/parallel.ll | 12 ++-- lld/test/ELF/lto/thinlto-obj-path.ll | 65 ------------------- lld/test/ELF/lto/thinlto.ll | 83 ++++++++++++------------ 6 files changed, 124 insertions(+), 134 deletions(-) delete mode 100644 lld/test/ELF/lto/thinlto-obj-path.ll diff --git a/lld/test/ELF/lto/emit-asm.ll b/lld/test/ELF/lto/emit-asm.ll index 3f635b8dbe7f7..03cf32d904520 100644 --- a/lld/test/ELF/lto/emit-asm.ll +++ b/lld/test/ELF/lto/emit-asm.ll @@ -1,12 +1,13 @@ ; REQUIRES: x86 -; RUN: llvm-as %s -o %t.o -; RUN: ld.lld --lto-emit-asm -shared %t.o -o - | FileCheck %s -; RUN: ld.lld --plugin-opt=emit-asm --plugin-opt=lto-partitions=2 -shared %t.o -o %t2.s -; RUN: cat %t2.s %t2.s1 | FileCheck %s +; RUN: rm -rf %t && mkdir %t && cd %t +; RUN: llvm-as %s -o a.bc +; RUN: ld.lld --lto-emit-asm -shared a.bc -o - | FileCheck %s +; RUN: ld.lld --plugin-opt=emit-asm --plugin-opt=lto-partitions=2 -shared a.bc -o out.s +; RUN: cat out.s out.s1 | FileCheck %s -; RUN: ld.lld --lto-emit-asm --save-temps -shared %t.o -o %t3.s -; RUN: FileCheck --input-file %t3.s %s -; RUN: llvm-dis %t3.s.0.4.opt.bc -o - | FileCheck --check-prefix=OPT %s +; RUN: ld.lld --lto-emit-asm --save-temps -shared a.bc -o out.s +; RUN: FileCheck --input-file out.s %s +; RUN: llvm-dis out.s.0.4.opt.bc -o - | FileCheck --check-prefix=OPT %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/lld/test/ELF/lto/obj-path.ll b/lld/test/ELF/lto/obj-path.ll index 69c0a21fb79ab..23b593bd36fa8 100644 --- a/lld/test/ELF/lto/obj-path.ll +++ b/lld/test/ELF/lto/obj-path.ll @@ -2,18 +2,19 @@ ;; Test --lto-obj-path= for regular LTO. 
; RUN: rm -rf %t && split-file %s %t && cd %t +; RUN: mkdir d ; RUN: opt 1.ll -o 1.bc -; RUN: opt 2.ll -o 2.bc +; RUN: opt 2.ll -o d/2.bc -; RUN: rm -f 4.o -; RUN: ld.lld --lto-obj-path=4.o -shared 1.bc 2.bc -o 3 +; RUN: rm -f objpath.o +; RUN: ld.lld --lto-obj-path=objpath.o -shared 1.bc d/2.bc -o 3 ; RUN: llvm-nm 3 | FileCheck %s --check-prefix=NM -; RUN: llvm-objdump -d 4.o | FileCheck %s -; RUN: ls 3* 4* | count 2 +; RUN: llvm-objdump -d objpath.o | FileCheck %s +; RUN: ls 3* objpath* | count 2 -; RUN: rm -f 3 4.o -; RUN: ld.lld --thinlto-index-only=3.txt --lto-obj-path=4.o -shared 1.bc 2.bc -o 3 -; RUN: llvm-objdump -d 4.o | FileCheck %s +; RUN: rm -f 3 objpath.o +; RUN: ld.lld --thinlto-index-only=3.txt --lto-obj-path=objpath.o -shared 1.bc d/2.bc -o 3 +; RUN: llvm-objdump -d objpath.o | FileCheck %s ; RUN: not ls 3 ; NM: T f @@ -23,6 +24,59 @@ ; CHECK: : ; CHECK: : +;; Test --lto-obj-path= for ThinLTO. +; RUN: opt -module-summary 1.ll -o 1.bc +; RUN: opt -module-summary 2.ll -o d/2.bc + +; RUN: ld.lld --plugin-opt=obj-path=objpath.o -shared 1.bc d/2.bc -o 3 +; RUN: llvm-nm 3 | FileCheck %s --check-prefix=NM3 +; RUN: llvm-objdump -d objpath.o1 | FileCheck %s --check-prefix=CHECK1 +; RUN: llvm-objdump -d objpath.o2 | FileCheck %s --check-prefix=CHECK2 + +; NM3: T f +; NM3-NEXT: T g + +; CHECK1: file format elf64-x86-64 +; CHECK1-EMPTY: +; CHECK1-NEXT: Disassembly of section .text.f: +; CHECK1-EMPTY: +; CHECK1-NEXT: : +; CHECK1-NEXT: retq +; CHECK1-NOT: {{.}} + +; CHECK2: file format elf64-x86-64 +; CHECK2-EMPTY: +; CHECK2-NEXT: Disassembly of section .text.g: +; CHECK2-EMPTY: +; CHECK2-NEXT: : +; CHECK2-NEXT: retq +; CHECK2-NOT: {{.}} + +;; With --thinlto-index-only, --lto-obj-path= creates just one file. +; RUN: rm -f objpath.o objpath.o1 objpath.o2 +; RUN: ld.lld --thinlto-index-only --lto-obj-path=objpath.o -shared 1.bc d/2.bc -o /dev/null +; RUN: llvm-objdump -d objpath.o | FileCheck %s --check-prefix=EMPTY +; RUN: not ls objpath.o1 +; RUN: not ls objpath.o2 + +;; Test --plugin-opt=obj-path=. +; RUN: rm -f objpath.o +; RUN: ld.lld --plugin-opt=thinlto-index-only --plugin-opt=obj-path=objpath.o -shared 1.bc d/2.bc -o /dev/null +; RUN: llvm-objdump -d objpath.o | FileCheck %s --check-prefix=EMPTY + +;; Ensure lld emits empty combined module if specific obj-path. +; RUN: mkdir obj +; RUN: ld.lld --plugin-opt=obj-path=objpath.o -shared 1.bc d/2.bc -o obj/out --save-temps +; RUN: ls obj/out.lto.o obj/out1.lto.o obj/out2.lto.o + +;; Ensure lld does not emit empty combined module by default. 
+; RUN: rm -fr obj && mkdir obj +; RUN: ld.lld -shared 1.bc d/2.bc -o obj/out --save-temps +; RUN: ls obj/out*.lto.* | count 2 + +; EMPTY: file format elf64-x86-64 +; EMPTY-NOT: {{.}} + ;--- 1.ll target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/lld/test/ELF/lto/parallel-internalize.ll b/lld/test/ELF/lto/parallel-internalize.ll index 1d3399b92b825..9cd080360d396 100644 --- a/lld/test/ELF/lto/parallel-internalize.ll +++ b/lld/test/ELF/lto/parallel-internalize.ll @@ -1,11 +1,10 @@ ; REQUIRES: x86 -; RUN: llvm-as -o %t.bc %s -; RUN: rm -f %t.lto.o %t1.lto.o -; RUN: ld.lld --lto-partitions=2 -save-temps -o %t %t.bc \ -; RUN: -e foo --lto-O0 -; RUN: llvm-readobj --symbols --dyn-syms %t | FileCheck %s -; RUN: llvm-nm %t.lto.o | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-nm %t1.lto.o | FileCheck --check-prefix=CHECK1 %s +; RUN: rm -rf %t && mkdir %t && cd %t +; RUN: llvm-as -o a.bc %s +; RUN: ld.lld --lto-partitions=2 -save-temps -o out a.bc -e foo --lto-O0 +; RUN: llvm-readobj --symbols --dyn-syms out | FileCheck %s +; RUN: llvm-nm out.lto.o | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-nm out1.lto.o | FileCheck --check-prefix=CHECK1 %s ; CHECK: Symbols [ ; CHECK-NEXT: Symbol { diff --git a/lld/test/ELF/lto/parallel.ll b/lld/test/ELF/lto/parallel.ll index d89431e8b4a16..c6c9a185677b8 100644 --- a/lld/test/ELF/lto/parallel.ll +++ b/lld/test/ELF/lto/parallel.ll @@ -1,11 +1,11 @@ ; REQUIRES: x86 -; RUN: llvm-as -o %t.bc %s -; RUN: rm -f %t.lto.o %t1.lto.o -; RUN: ld.lld --lto-partitions=2 -save-temps -o %t %t.bc -shared -; RUN: llvm-nm %t.lto.o | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-nm %t1.lto.o | FileCheck --check-prefix=CHECK1 %s +; RUN: rm -rf %t && mkdir %t && cd %t +; RUN: llvm-as -o a.bc %s +; RUN: ld.lld --lto-partitions=2 -save-temps -o out a.bc -shared +; RUN: llvm-nm out.lto.o | FileCheck --check-prefix=CHECK0 %s +; RUN: llvm-nm out1.lto.o | FileCheck --check-prefix=CHECK1 %s -; RUN: not ld.lld --lto-partitions=0 %t.bc -o /dev/null 2>&1 | FileCheck --check-prefix=INVALID %s +; RUN: not ld.lld --lto-partitions=0 a.bc -o /dev/null 2>&1 | FileCheck --check-prefix=INVALID %s ; INVALID: --lto-partitions: number of threads must be > 0 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" diff --git a/lld/test/ELF/lto/thinlto-obj-path.ll b/lld/test/ELF/lto/thinlto-obj-path.ll deleted file mode 100644 index 4058dd0f1156a..0000000000000 --- a/lld/test/ELF/lto/thinlto-obj-path.ll +++ /dev/null @@ -1,65 +0,0 @@ -; REQUIRES: x86 - -; RUN: rm -rf %t && mkdir %t && cd %t -; RUN: opt -module-summary %s -o 1.bc -; RUN: opt -module-summary %p/Inputs/thinlto.ll -o 2.bc - -; RUN: ld.lld --plugin-opt=obj-path=4.o -shared 1.bc 2.bc -o 3 -; RUN: llvm-nm 3 | FileCheck %s --check-prefix=NM3 -; RUN: llvm-objdump -d 4.o1 | FileCheck %s --check-prefix=CHECK1 -; RUN: llvm-objdump -d 4.o2 | FileCheck %s --check-prefix=CHECK2 - -; NM3: T f -; NM3-NEXT: T g - -; CHECK1: file format elf64-x86-64 -; CHECK1-EMPTY: -; CHECK1-NEXT: Disassembly of section .text.f: -; CHECK1-EMPTY: -; CHECK1-NEXT: : -; CHECK1-NEXT: retq -; CHECK1-NOT: {{.}} - -; CHECK2: file format elf64-x86-64 -; CHECK2-EMPTY: -; CHECK2-NEXT: Disassembly of section .text.g: -; CHECK2-EMPTY: -; CHECK2-NEXT: : -; CHECK2-NEXT: retq -; CHECK2-NOT: {{.}} - -;; With --thinlto-index-only, --lto-obj-path= creates just one file. 
-; RUN: rm -f 4.o 4.o1 4.o2 -; RUN: ld.lld --thinlto-index-only --lto-obj-path=4.o -shared 1.bc 2.bc -o /dev/null -; RUN: llvm-objdump -d 4.o | FileCheck %s --check-prefix=EMPTY -; RUN: not ls 4.o1 -; RUN: not ls 4.o2 - -;; Test --plugin-opt=obj-path=. -; RUN: rm -f 4.o -; RUN: ld.lld --plugin-opt=thinlto-index-only --plugin-opt=obj-path=4.o -shared 1.bc 2.bc -o /dev/null -; RUN: llvm-objdump -d 4.o | FileCheck %s --check-prefix=EMPTY - -;; Ensure lld emits empty combined module if specific obj-path. -; RUN: rm -fr objpath && mkdir objpath -; RUN: ld.lld --plugin-opt=obj-path=4.o -shared 1.bc 2.bc -o objpath/a.out --save-temps -; RUN: ls objpath/a.out*.lto.* | count 3 - -;; Ensure lld does not emit empty combined module in default. -; RUN: rm -fr objpath && mkdir objpath -; RUN: ld.lld -shared 1.bc 2.bc -o objpath/a.out --save-temps -; RUN: ls objpath/a.out*.lto.* | count 2 - -; EMPTY: file format elf64-x86-64 -; EMPTY-NOT: {{.}} - -target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" -target triple = "x86_64-unknown-linux-gnu" - -declare void @g(...) - -define void @f() { -entry: - call void (...) @g() - ret void -} diff --git a/lld/test/ELF/lto/thinlto.ll b/lld/test/ELF/lto/thinlto.ll index 5d5df641566b1..4145007d9b201 100644 --- a/lld/test/ELF/lto/thinlto.ll +++ b/lld/test/ELF/lto/thinlto.ll @@ -1,68 +1,69 @@ ; REQUIRES: x86 ; Basic ThinLTO tests. -; RUN: opt -module-summary %s -o %t1.o -; RUN: opt -module-summary %p/Inputs/thinlto.ll -o %t2.o +; RUN: rm -rf %t && mkdir %t && cd %t +; RUN: mkdir d e +; RUN: opt -module-summary %s -o 1.o +; RUN: opt -module-summary %p/Inputs/thinlto.ll -o d/2.o ; First force single-threaded mode -; RUN: rm -f %t31.lto.o %t32.lto.o -; RUN: ld.lld -save-temps --thinlto-jobs=1 -shared %t1.o %t2.o -o %t3 -; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: ld.lld -save-temps --thinlto-jobs=1 -shared 1.o d/2.o -o e/out +; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ; Next force multi-threaded mode -; RUN: rm -f %t31.lto.o %t32.lto.o -; RUN: ld.lld -save-temps --thinlto-jobs=2 -shared %t1.o %t2.o -o %t3 -; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: ld.lld -save-temps --thinlto-jobs=2 -shared 1.o d/2.o -o e/out +; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ;; --plugin-opt=jobs= is an alias. -; RUN: rm -f %t31.lto.o %t32.lto.o -; RUN: ld.lld -save-temps --plugin-opt=jobs=2 -shared %t1.o %t2.o -o %t3 -; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: ld.lld -save-temps --plugin-opt=jobs=2 -shared 1.o d/2.o -o e/out +; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ;; --thinlto-jobs= defaults to --threads=. 
-; RUN: rm -f %t31.lto.o %t32.lto.o -; RUN: ld.lld -save-temps --threads=2 -shared %t1.o %t2.o -o %t3 -; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: ld.lld -save-temps --threads=2 -shared 1.o d/2.o -o e/out +; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ;; --thinlto-jobs= overrides --threads=. -; RUN: rm -f %t31.lto.o %t32.lto.o -; RUN: ld.lld -save-temps --threads=1 --plugin-opt=jobs=2 -shared %t1.o %t2.o -o %t3 -; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: ld.lld -save-temps --threads=1 --plugin-opt=jobs=2 -shared 1.o d/2.o -o e/out +; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ; Test with all threads, on all cores, on all CPU sockets -; RUN: rm -f %t31.lto.o %t32.lto.o -; RUN: ld.lld -save-temps --thinlto-jobs=all -shared %t1.o %t2.o -o %t3 -; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: ld.lld -save-temps --thinlto-jobs=all -shared 1.o d/2.o -o e/out +; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ; Test with many more threads than the system has -; RUN: rm -f %t31.lto.o %t32.lto.o -; RUN: ld.lld -save-temps --thinlto-jobs=100 -shared %t1.o %t2.o -o %t3 -; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: ld.lld -save-temps --thinlto-jobs=100 -shared 1.o d/2.o -o e/out +; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ; Test with a bad value -; RUN: rm -f %t31.lto.o %t32.lto.o -; RUN: not ld.lld -save-temps --thinlto-jobs=foo -shared %t1.o %t2.o -o %t3 2>&1 | FileCheck %s --check-prefix=BAD-JOBS +; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: not ld.lld -save-temps --thinlto-jobs=foo -shared 1.o d/2.o -o e/out 2>&1 | FileCheck %s --check-prefix=BAD-JOBS ; BAD-JOBS: error: --thinlto-jobs: invalid job count: foo ; Then check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meanning one thread per hardware core -- not SMT) -; RUN: ld.lld -shared -save-temps %t1.o %t2.o -o %t3 -; RUN: llvm-nm %t31.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm %t32.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: ld.lld -shared -save-temps 1.o d/2.o -o e/out +; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ; Check that -save-temps is usable with thin archives -; RUN: rm -fr %t.dir -; RUN: mkdir -p %t.dir -; RUN: cp %t2.o %t.dir/t.o -; RUN: llvm-ar rcsT %t.dir/t.a %t.dir/t.o -; RUN: ld.lld -save-temps %t1.o %t.dir/t.a -o %t.null -; RUN: ls %t.dir/t.a*.0.preopt.bc +; RUN: mkdir dir +; RUN: cp d/2.o dir/t.o +; RUN: llvm-ar rcsT dir/t.a dir/t.o +; RUN: ld.lld -save-temps 1.o dir/t.a -o %t.null +; RUN: ls dir/t.a*.0.preopt.bc ; NM1: T f ; NM2: T g From 296fbee5af89e8e4c31dd98f48a9770c4eb3ca4d Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Sat, 20 Jan 2024 08:48:09 +0100 Subject: [PATCH 229/843] 
[clang-tidy] Fix crash in modernize-loop-convert when int is used as iterator (#78796) Fix crash when built-in type (like int) is used as iterator, or when call to begin() return integer. Closes #78381 --- .../clang-tidy/modernize/LoopConvertCheck.cpp | 12 ++++++++---- clang-tools-extra/docs/ReleaseNotes.rst | 5 +++-- .../checkers/modernize/loop-convert-basic.cpp | 13 +++++++++++++ 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp index 8beaa62c78ba0..f0791da143ad9 100644 --- a/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/LoopConvertCheck.cpp @@ -942,11 +942,15 @@ bool LoopConvertCheck::isConvertible(ASTContext *Context, CanonicalInitVarType->isPointerType()) { // If the initializer and the variable are both pointers check if the // un-qualified pointee types match, otherwise we don't use auto. - if (!Context->hasSameUnqualifiedType( - CanonicalBeginType->getPointeeType(), - CanonicalInitVarType->getPointeeType())) - return false; + return Context->hasSameUnqualifiedType( + CanonicalBeginType->getPointeeType(), + CanonicalInitVarType->getPointeeType()); } + + if (CanonicalBeginType->isBuiltinType() || + CanonicalInitVarType->isBuiltinType()) + return false; + } else if (FixerKind == LFK_PseudoArray) { if (const auto *EndCall = Nodes.getNodeAs(EndCallName)) { // This call is required to obtain the container. diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index d77267588db91..7f618e71afd1c 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -410,7 +410,8 @@ Changes in existing checks - Improved :doc:`modernize-loop-convert ` to support for-loops with iterators initialized by free functions like ``begin``, ``end``, or ``size`` - and avoid crash for array of dependent array. + and avoid crash for array of dependent array and non-dereferenceable builtin + types used as iterators. - Improved :doc:`modernize-make-shared ` check to support @@ -502,7 +503,7 @@ Changes in existing checks ` check to take do-while loops into account for the `AllowIntegerConditions` and `AllowPointerConditions` options. It also now provides more consistent - suggestions when parentheses are added to the return value or expressions. + suggestions when parentheses are added to the return value or expressions. It also ignores false-positives for comparison containing bool bitfield. - Improved :doc:`readability-misleading-indentation diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp index e2b9336d620f5..c29fbc9f9b23b 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/loop-convert-basic.cpp @@ -954,3 +954,16 @@ void dependenceArrayTest() { } } // namespace PseudoArray + +namespace PR78381 { + struct blocked_range { + int begin() const; + int end() const; + }; + + void test() { + blocked_range r; + for (auto i = r.begin(); i!=r.end(); ++i) { + } + } +} From a7d7da6e45992b79fe712c1e228cc57c9f27fa7a Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Sat, 20 Jan 2024 00:08:23 -0800 Subject: [PATCH 230/843] [clang-format] Add SkipMacroDefinitionBody option (#78682) Closes #67991. 
See also: #70338 Co-authored-by: @tomekpaszek --- clang/docs/ClangFormatStyleOptions.rst | 5 + clang/docs/ReleaseNotes.rst | 1 + clang/include/clang/Format/Format.h | 5 + clang/lib/Format/Format.cpp | 2 + clang/lib/Format/UnwrappedLineParser.cpp | 9 ++ clang/unittests/Format/ConfigParseTest.cpp | 1 + clang/unittests/Format/FormatTest.cpp | 132 +++++++++++++++++++++ 7 files changed, 155 insertions(+) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 8bc13e45bf2f5..4dc0de3a90f26 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -4999,6 +4999,11 @@ the configuration (without a prefix: ``Auto``). int bar; int bar; } // namespace b } // namespace b +.. _SkipMacroDefinitionBody: + +**SkipMacroDefinitionBody** (``Boolean``) :versionbadge:`clang-format 18` :ref:`¶ ` + Do not format macro definition body. + .. _SortIncludes: **SortIncludes** (``SortIncludesOptions``) :versionbadge:`clang-format 3.8` :ref:`¶ ` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 08e79984fe336..4d152d220289b 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1237,6 +1237,7 @@ clang-format - Add ``PenaltyBreakScopeResolution`` option. - Add ``.clang-format-ignore`` files. - Add ``AlignFunctionPointers`` sub-option for ``AlignConsecutiveDeclarations``. +- Add ``SkipMacroDefinitionBody`` option. libclang -------- diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index 6fd7947bd2179..bc9eecd42f9eb 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -3932,6 +3932,10 @@ struct FormatStyle { /// \version 13 unsigned ShortNamespaceLines; + /// Do not format macro definition body. + /// \version 18 + bool SkipMacroDefinitionBody; + /// Include sorting options. enum SortIncludesOptions : int8_t { /// Includes are never sorted. 
@@ -4895,6 +4899,7 @@ struct FormatStyle { RequiresExpressionIndentation == R.RequiresExpressionIndentation && SeparateDefinitionBlocks == R.SeparateDefinitionBlocks && ShortNamespaceLines == R.ShortNamespaceLines && + SkipMacroDefinitionBody == R.SkipMacroDefinitionBody && SortIncludes == R.SortIncludes && SortJavaStaticImport == R.SortJavaStaticImport && SpaceAfterCStyleCast == R.SpaceAfterCStyleCast && diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 7c2f4dcf3d230..ff326dc784783 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -1085,6 +1085,7 @@ template <> struct MappingTraits { Style.RequiresExpressionIndentation); IO.mapOptional("SeparateDefinitionBlocks", Style.SeparateDefinitionBlocks); IO.mapOptional("ShortNamespaceLines", Style.ShortNamespaceLines); + IO.mapOptional("SkipMacroDefinitionBody", Style.SkipMacroDefinitionBody); IO.mapOptional("SortIncludes", Style.SortIncludes); IO.mapOptional("SortJavaStaticImport", Style.SortJavaStaticImport); IO.mapOptional("SortUsingDeclarations", Style.SortUsingDeclarations); @@ -1556,6 +1557,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.RequiresExpressionIndentation = FormatStyle::REI_OuterScope; LLVMStyle.SeparateDefinitionBlocks = FormatStyle::SDS_Leave; LLVMStyle.ShortNamespaceLines = 1; + LLVMStyle.SkipMacroDefinitionBody = false; LLVMStyle.SortIncludes = FormatStyle::SI_CaseSensitive; LLVMStyle.SortJavaStaticImport = FormatStyle::SJSIO_Before; LLVMStyle.SortUsingDeclarations = FormatStyle::SUD_LexicographicNumeric; diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index 50d41c9f57a67..c08ce86449b6e 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -1170,6 +1170,15 @@ void UnwrappedLineParser::parsePPDefine() { assert((int)Line->PPLevel >= 0); Line->InMacroBody = true; + if (Style.SkipMacroDefinitionBody) { + do { + FormatTok->Finalized = true; + nextToken(); + } while (!eof()); + addUnwrappedLine(); + return; + } + if (FormatTok->is(tok::identifier) && Tokens->peekNextToken()->is(tok::colon)) { nextToken(); diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 172aaab5988ce..2a8d79359a49b 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -184,6 +184,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(ReflowComments); CHECK_PARSE_BOOL(RemoveBracesLLVM); CHECK_PARSE_BOOL(RemoveSemicolon); + CHECK_PARSE_BOOL(SkipMacroDefinitionBody); CHECK_PARSE_BOOL(SpacesInSquareBrackets); CHECK_PARSE_BOOL(SpaceInEmptyBlock); CHECK_PARSE_BOOL(SpacesInContainerLiterals); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index c229d9bc56def..44896c10b0b25 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -24387,6 +24387,138 @@ TEST_F(FormatTest, WhitespaceSensitiveMacros) { verifyNoChange("FOO(String-ized&Messy+But: :Still=Intentional);", Style); } +TEST_F(FormatTest, SkipMacroDefinitionBody) { + auto Style = getLLVMStyle(); + Style.SkipMacroDefinitionBody = true; + + verifyFormat("#define A", "#define A", Style); + verifyFormat("#define A a aa", "#define A a aa", Style); + verifyNoChange("#define A b", Style); + verifyNoChange("#define A ( args )", Style); + verifyNoChange("#define A ( args ) = func ( args )", Style); + verifyNoChange("#define A ( args ) { 
int a = 1 ; }", Style); + verifyNoChange("#define A ( args ) \\\n" + " {\\\n" + " int a = 1 ;\\\n" + "}", + Style); + + verifyNoChange("#define A x:", Style); + verifyNoChange("#define A a. b", Style); + + // Surrounded with formatted code. + verifyFormat("int a;\n" + "#define A a\n" + "int a;", + "int a ;\n" + "#define A a\n" + "int a ;", + Style); + + // Columns are not broken when a limit is set. + Style.ColumnLimit = 10; + verifyFormat("#define A a a a a", " # define A a a a a ", Style); + verifyNoChange("#define A a a a a", Style); + + Style.ColumnLimit = 15; + verifyFormat("#define A // a\n" + " // very\n" + " // long\n" + " // comment", + "#define A //a very long comment", Style); + Style.ColumnLimit = 0; + + // Multiline definition. + verifyNoChange("#define A \\\n" + "Line one with spaces . \\\n" + " Line two.", + Style); + verifyNoChange("#define A \\\n" + "a a \\\n" + "a \\\n" + "a", + Style); + Style.AlignEscapedNewlines = FormatStyle::ENAS_Left; + verifyNoChange("#define A \\\n" + "a a \\\n" + "a \\\n" + "a", + Style); + Style.AlignEscapedNewlines = FormatStyle::ENAS_Right; + verifyNoChange("#define A \\\n" + "a a \\\n" + "a \\\n" + "a", + Style); + + // Adjust indendations but don't change the definition. + Style.IndentPPDirectives = FormatStyle::PPDIS_None; + verifyNoChange("#if A\n" + "#define A a\n" + "#endif", + Style); + verifyFormat("#if A\n" + "#define A a\n" + "#endif", + "#if A\n" + " #define A a\n" + "#endif", + Style); + Style.IndentPPDirectives = FormatStyle::PPDIS_AfterHash; + verifyNoChange("#if A\n" + "# define A a\n" + "#endif", + Style); + verifyFormat("#if A\n" + "# define A a\n" + "#endif", + "#if A\n" + " #define A a\n" + "#endif", + Style); + Style.IndentPPDirectives = FormatStyle::PPDIS_BeforeHash; + verifyNoChange("#if A\n" + " #define A a\n" + "#endif", + Style); + verifyFormat("#if A\n" + " #define A a\n" + "#endif", + "#if A\n" + " # define A a\n" + "#endif", + Style); + + Style.IndentPPDirectives = FormatStyle::PPDIS_None; + // SkipMacroDefinitionBody should not affect other PP directives + verifyFormat("#if !defined(A)\n" + "#define A a\n" + "#endif", + "#if ! defined ( A )\n" + " #define A a\n" + "#endif", + Style); + + // With comments. + verifyFormat("/* */ #define A a // a a", "/* */ # define A a // a a", + Style); + verifyNoChange("/* */ #define A a // a a", Style); + + verifyFormat("int a; // a\n" + "#define A // a\n" + "int aaa; // a", + "int a; // a\n" + "#define A // a\n" + "int aaa; // a", + Style); + + // multiline macro definitions + verifyNoChange("#define A a\\\n" + " A a \\\n " + " A a", + Style); +} + TEST_F(FormatTest, VeryLongNamespaceCommentSplit) { // These tests are not in NamespaceEndCommentsFixerTest because that doesn't // test its interaction with line wrapping From 9eb0f86c279f40a792ec27bf0e9b491b8c90a640 Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Sat, 20 Jan 2024 12:23:08 +0400 Subject: [PATCH 231/843] [clang] Implement CWG1878 "`operator auto` template" (#78103) C++14 introduced deduced return type for regular functions, but shortly after [CWG1878](https://wg21.link/cwg1878) was filed and resolved to disallow deduced return types in conversion function templates. So this patch diagnoses such usage of deduced return type in C++14 mode onwards. 
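For illustration, the kind of declaration this now rejects (the snippet is illustrative; the full matrix of cases, including constrained `auto`, is in the tests below):

```c++
struct S {
  // OK: a non-template conversion function may still have a deduced type.
  operator auto() const { return 42; }

  // error: 'auto' not allowed in declaration of conversion function template
  template <typename T>
  operator auto() const { return short(); }

  // error: 'decltype(auto)' not allowed in declaration of conversion function template
  template <typename T>
  operator decltype(auto)() const { return 0.0; }
};
```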
Fixes #51776 --- clang/docs/ReleaseNotes.rst | 2 + .../clang/Basic/DiagnosticSemaKinds.td | 3 +- clang/lib/Sema/SemaDeclCXX.cpp | 15 +++++- clang/test/CXX/drs/dr18xx.cpp | 45 +++++++++++++++-- .../SemaCXX/deduced-return-type-cxx14.cpp | 50 ++++++++++++++++--- clang/www/cxx_dr_status.html | 2 +- 6 files changed, 102 insertions(+), 15 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 4d152d220289b..4c69bdf131aaa 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -575,6 +575,8 @@ Improvements to Clang's diagnostics - Clang now diagnoses unexpanded packs within the template argument lists of function template specializations. - Clang now diagnoses attempts to bind a bitfield to an NTTP of a reference type as erroneous converted constant expression and not as a reference to subobject. +- Clang now diagnoses ``auto`` and ``decltype(auto)`` in declarations of conversion function template + (`CWG1878: `_) - Clang now diagnoses the requirement that non-template friend declarations with requires clauses and template friend declarations with a constraint that depends on a template parameter from an enclosing template must be a definition. diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 03b0122d1c08f..23692758e9b8a 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2411,7 +2411,8 @@ def err_auto_not_allowed : Error< "|in type allocated by 'new'|in K&R-style function parameter" "|in template parameter|in friend declaration|in function prototype that is " "not a function declaration|in requires expression parameter" - "|in array declaration}1">; + "|in array declaration" + "|in declaration of conversion function template}1">; def err_dependent_deduced_tst : Error< "typename specifier refers to " "%select{class template|function template|variable template|alias template|" diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 828a0ec3dd042..df5bd55e7c283 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -11321,9 +11321,20 @@ Decl *Sema::ActOnConversionDeclarator(CXXConversionDecl *Conversion) { << ClassType << ConvType; } - if (FunctionTemplateDecl *ConversionTemplate - = Conversion->getDescribedFunctionTemplate()) + if (FunctionTemplateDecl *ConversionTemplate = + Conversion->getDescribedFunctionTemplate()) { + if (const auto *ConvTypePtr = ConvType->getAs()) { + ConvType = ConvTypePtr->getPointeeType(); + } + if (ConvType->isUndeducedAutoType()) { + Diag(Conversion->getTypeSpecStartLoc(), diag::err_auto_not_allowed) + << getReturnTypeLoc(Conversion).getSourceRange() + << llvm::to_underlying(ConvType->getAs()->getKeyword()) + << /* in declaration of conversion function template= */ 24; + } + return ConversionTemplate; + } return Conversion; } diff --git a/clang/test/CXX/drs/dr18xx.cpp b/clang/test/CXX/drs/dr18xx.cpp index 0245f03986dd7..f2c056e13ecad 100644 --- a/clang/test/CXX/drs/dr18xx.cpp +++ b/clang/test/CXX/drs/dr18xx.cpp @@ -1,10 +1,10 @@ // RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,cxx98 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,cxx11-17,since-cxx11 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++14 -triple 
x86_64-unknown-unknown %s -verify=expected,cxx98-14,cxx11-17,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx17,cxx11-17,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,cxx98-14,cxx11-17,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,cxx11-17,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors #if __cplusplus == 199711L #define static_assert(...) 
__extension__ _Static_assert(__VA_ARGS__) @@ -319,6 +319,41 @@ namespace dr1872 { // dr1872: 9 #endif } +namespace dr1878 { // dr1878: 18 +#if __cplusplus >= 201402L +#if __cplusplus >= 202002L +template +concept C = true; +#endif + +struct S { + template + operator auto() const { return short(); } + // since-cxx14-error@-1 {{'auto' not allowed in declaration of conversion function template}} + template + operator const auto() const { return int(); } + // since-cxx14-error@-1 {{'auto' not allowed in declaration of conversion function template}} + template + operator const auto&() const { return char(); } + // since-cxx14-error@-1 {{'auto' not allowed in declaration of conversion function template}} + template + operator const auto*() const { return long(); } + // since-cxx14-error@-1 {{'auto' not allowed in declaration of conversion function template}} + template + operator decltype(auto)() const { return unsigned(); } + // since-cxx14-error@-1 {{'decltype(auto)' not allowed in declaration of conversion function template}} +#if __cplusplus >= 202002L + template + operator C auto() const { return float(); } + // since-cxx20-error@-1 {{'auto' not allowed in declaration of conversion function template}} + template + operator C decltype(auto)() const { return double(); } + // since-cxx20-error@-1 {{'decltype(auto)' not allowed in declaration of conversion function template}} +#endif +}; +#endif +} + namespace dr1881 { // dr1881: 7 struct A { int a : 4; }; struct B : A { int b : 3; }; diff --git a/clang/test/SemaCXX/deduced-return-type-cxx14.cpp b/clang/test/SemaCXX/deduced-return-type-cxx14.cpp index eac9c587869f5..c0d43911b8c71 100644 --- a/clang/test/SemaCXX/deduced-return-type-cxx14.cpp +++ b/clang/test/SemaCXX/deduced-return-type-cxx14.cpp @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify=expected,cxx20_23,cxx23 %s -// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify=expected,cxx20_23,cxx23 %s -fdelayed-template-parsing -DDELAYED_TEMPLATE_PARSING +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify=expected,since-cxx20,since-cxx14,cxx20_23,cxx23 %s +// RUN: %clang_cc1 -std=c++23 -fsyntax-only -verify=expected,since-cxx20,since-cxx14,cxx20_23,cxx23 %s -fdelayed-template-parsing -DDELAYED_TEMPLATE_PARSING -// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify=expected,cxx14_20,cxx20_23 %s -// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify=expected,cxx14_20,cxx20_23 %s -fdelayed-template-parsing -DDELAYED_TEMPLATE_PARSING +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify=expected,since-cxx20,since-cxx14,cxx14_20,cxx20_23 %s +// RUN: %clang_cc1 -std=c++20 -fsyntax-only -verify=expected,since-cxx20,since-cxx14,cxx14_20,cxx20_23 %s -fdelayed-template-parsing -DDELAYED_TEMPLATE_PARSING -// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify=expected,cxx14_20,cxx14 %s -// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify=expected,cxx14_20,cxx14 %s -fdelayed-template-parsing -DDELAYED_TEMPLATE_PARSING +// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify=expected,since-cxx14,cxx14_20,cxx14 %s +// RUN: %clang_cc1 -std=c++14 -fsyntax-only -verify=expected,since-cxx14,cxx14_20,cxx14 %s -fdelayed-template-parsing -DDELAYED_TEMPLATE_PARSING auto f(); // expected-note {{previous}} int f(); // expected-error {{differ only in their return type}} @@ -686,3 +686,41 @@ auto f(auto x) { // cxx14-error {{'auto' not allowed in function prototype}} } } + +#if __cplusplus >= 202002L +template +concept C = true; +#endif + +struct DeducedTargetTypeOfConversionFunction { + operator auto() const { 
return char(); } + operator const auto() const { return float(); } + operator const auto&() const { return int(); } + // expected-warning@-1 {{returning reference to local temporary object}} + operator decltype(auto)() const { return double(); } +#if __cplusplus >= 202002L + operator C auto() const { return unsigned(); } + operator C decltype(auto)() const { return long(); } +#endif + + template + operator auto() const { return short(); } + // since-cxx14-error@-1 {{'auto' not allowed in declaration of conversion function template}} + template + operator const auto() const { return int(); } + // since-cxx14-error@-1 {{'auto' not allowed in declaration of conversion function template}} + template + operator const auto&() const { return char(); } + // since-cxx14-error@-1 {{'auto' not allowed in declaration of conversion function template}} + template + operator decltype(auto)() const { return unsigned(); } + // since-cxx14-error@-1 {{'decltype(auto)' not allowed in declaration of conversion function template}} +#if __cplusplus >= 202002L + template + operator C auto() const { return float(); } + // since-cxx20-error@-1 {{'auto' not allowed in declaration of conversion function template}} + template + operator C decltype(auto)() const { return double(); } + // since-cxx20-error@-1 {{'decltype(auto)' not allowed in declaration of conversion function template}} +#endif +}; diff --git a/clang/www/cxx_dr_status.html b/clang/www/cxx_dr_status.html index b29a746bcf6e8..f0d835fc091ce 100755 --- a/clang/www/cxx_dr_status.html +++ b/clang/www/cxx_dr_status.html @@ -11076,7 +11076,7 @@

C++ defect report implementation status

1878 CD4 operator auto template - Unknown + Clang 18 1879 From 6a433d77b1f49ddeb03e27394a9b7cbf6e472d1a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Sat, 20 Jan 2024 09:57:03 +0100 Subject: [PATCH 232/843] [llvm-jitlink] Allow optional stub-kind filter in stub_addr() expressions (#78369) We use `jitlink-check` lines in LIT tests as the primary tool for testing JITLink backends. Parsing and evaluation of the expressions is implemented in `RuntimeDyldChecker`. The `stub_addr(obj, name)` expression allows to obtain the linker-generated stub for the external symbol `name` in object file `obj`. This patch adds support for a filter parameter to select one out of many stubs. This is necessary for the AArch32 JITLink backend, which must be able to emit two different kinds of stubs depending on the instruction set state (Arm/Thumb) of the relocation site. Since the new parameter is optional, we don't have to update existing tests. Filters are regular expressions without brackets that match exactly one existing stub. Given object file `armv7.o` with two stubs for external function `ext` of kinds `armv7_abs_le` and `thumbv7_abs_le`, we get the following filter results e.g.: ``` stub_addr(armv7.o, ext, thumb) thumbv7_abs_le stub_addr(armv7.o, ext, thumbv7) thumbv7_abs_le stub_addr(armv7.o, ext, armv7_abs_le) armv7_abs_le stub_addr(armv7.o, ext, v7_.*_le) Error: "ext" has 2 candidate stubs in file "armv7.o". Please refine stub-kind filter "v7_.*_le" for disambiguation (encountered kinds are "thumbv7_abs_le", "armv7_abs_le"). stub_addr(armv7.o, ext, v8) Error: "ext" has 2 stubs in file "armv7.o", but none of them matches the stub-kind filter "v8" (all encountered kinds are "thumbv7_abs_le", "armv7_abs_le"). ``` --- .../llvm/ExecutionEngine/RuntimeDyldChecker.h | 2 +- .../RuntimeDyld/RuntimeDyldChecker.cpp | 27 ++++-- .../RuntimeDyld/RuntimeDyldCheckerImpl.h | 3 +- llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp | 2 +- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 86 +++++++++++++++++-- llvm/tools/llvm-jitlink/llvm-jitlink.h | 7 +- llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp | 11 ++- 7 files changed, 114 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h b/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h index 80e4bbf494339..034c134a13731 100644 --- a/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h +++ b/llvm/include/llvm/ExecutionEngine/RuntimeDyldChecker.h @@ -154,7 +154,7 @@ class RuntimeDyldChecker { using GetSectionInfoFunction = std::function( StringRef FileName, StringRef SectionName)>; using GetStubInfoFunction = std::function( - StringRef StubContainer, StringRef TargetName)>; + StringRef StubContainer, StringRef TargetName, StringRef StubKindFilter)>; using GetGOTInfoFunction = std::function( StringRef GOTContainer, StringRef TargetName)>; diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp index 7fadbdd6a1fff..11fb21a9c1c0a 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp @@ -400,6 +400,15 @@ class RuntimeDyldCheckerExprEval { StringRef Symbol; std::tie(Symbol, RemainingExpr) = parseSymbol(RemainingExpr); + // Parse optional parameter to filter by stub kind + StringRef KindNameFilter; + if (RemainingExpr.starts_with(",")) { + RemainingExpr = RemainingExpr.substr(1).ltrim(); + size_t ClosingBracket = RemainingExpr.find(")"); + KindNameFilter = 
RemainingExpr.substr(0, ClosingBracket); + RemainingExpr = RemainingExpr.substr(ClosingBracket); + } + if (!RemainingExpr.starts_with(")")) return std::make_pair( unexpectedToken(RemainingExpr, Expr, "expected ')'"), ""); @@ -407,8 +416,9 @@ class RuntimeDyldCheckerExprEval { uint64_t StubAddr; std::string ErrorMsg; - std::tie(StubAddr, ErrorMsg) = Checker.getStubOrGOTAddrFor( - StubContainerName, Symbol, PCtx.IsInsideLoad, IsStubAddr); + std::tie(StubAddr, ErrorMsg) = + Checker.getStubOrGOTAddrFor(StubContainerName, Symbol, KindNameFilter, + PCtx.IsInsideLoad, IsStubAddr); if (ErrorMsg != "") return std::make_pair(EvalResult(ErrorMsg), ""); @@ -985,11 +995,14 @@ std::pair RuntimeDyldCheckerImpl::getSectionAddr( } std::pair RuntimeDyldCheckerImpl::getStubOrGOTAddrFor( - StringRef StubContainerName, StringRef SymbolName, bool IsInsideLoad, - bool IsStubAddr) const { - - auto StubInfo = IsStubAddr ? GetStubInfo(StubContainerName, SymbolName) - : GetGOTInfo(StubContainerName, SymbolName); + StringRef StubContainerName, StringRef SymbolName, StringRef StubKindFilter, + bool IsInsideLoad, bool IsStubAddr) const { + + assert((StubKindFilter.empty() || IsStubAddr) && + "Kind name filter only supported for stubs"); + auto StubInfo = + IsStubAddr ? GetStubInfo(StubContainerName, SymbolName, StubKindFilter) + : GetGOTInfo(StubContainerName, SymbolName); if (!StubInfo) { std::string ErrMsg; diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h index 9f44a9389f473..bda554e9e5b67 100644 --- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h +++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCheckerImpl.h @@ -64,7 +64,8 @@ class RuntimeDyldCheckerImpl { std::pair getStubOrGOTAddrFor(StringRef StubContainerName, StringRef Symbol, - bool IsInsideLoad, bool IsStubAddr) const; + StringRef StubKindFilter, bool IsInsideLoad, + bool IsStubAddr) const; std::optional getSectionLoadAddress(void *LocalAddr) const; diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp index 9f7d66d7b30a1..a8c804a459e3c 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-elf.cpp @@ -91,7 +91,7 @@ static Error registerSymbol(LinkGraph &G, Symbol &Sym, Session::FileInfo &FI, case Stubs: return FI.registerStubEntry(G, Sym, getELFStubTarget); case AArch32Stubs: - return FI.registerStubEntry(G, Sym, getELFAArch32StubTarget); + return FI.registerMultiStubEntry(G, Sym, getELFAArch32StubTarget); case Other: return Error::success(); } diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 8c18610313ce8..d233ebdb5a3a8 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -331,8 +331,12 @@ operator<<(raw_ostream &OS, const Session::FileInfo &FI) { OS << " Section \"" << SIKV.first() << "\": " << SIKV.second << "\n"; for (auto &GOTKV : FI.GOTEntryInfos) OS << " GOT \"" << GOTKV.first() << "\": " << GOTKV.second << "\n"; - for (auto &StubKV : FI.StubInfos) - OS << " Stub \"" << StubKV.first() << "\": " << StubKV.second << "\n"; + for (auto &StubKVs : FI.StubInfos) { + OS << " Stubs \"" << StubKVs.first() << "\":"; + for (auto MemRegion : StubKVs.second) + OS << " " << MemRegion; + OS << "\n"; + } return OS; } @@ -1207,9 +1211,35 @@ Error Session::FileInfo::registerStubEntry( auto TS = GetSymbolTarget(G, Sym.getBlock()); if (!TS) return TS.takeError(); 
- StubInfos[TS->getName()] = {Sym.getSymbolContent(), - Sym.getAddress().getValue(), - Sym.getTargetFlags()}; + + SmallVector &Entry = StubInfos[TS->getName()]; + Entry.insert(Entry.begin(), + {Sym.getSymbolContent(), Sym.getAddress().getValue(), + Sym.getTargetFlags()}); + return Error::success(); +} + +Error Session::FileInfo::registerMultiStubEntry( + LinkGraph &G, Symbol &Sym, GetSymbolTargetFunction GetSymbolTarget) { + if (Sym.isSymbolZeroFill()) + return make_error("Unexpected zero-fill symbol in section " + + Sym.getBlock().getSection().getName(), + inconvertibleErrorCode()); + + auto Target = GetSymbolTarget(G, Sym.getBlock()); + if (!Target) + return Target.takeError(); + + SmallVector &Entry = StubInfos[Target->getName()]; + Entry.emplace_back(Sym.getSymbolContent(), Sym.getAddress().getValue(), + Sym.getTargetFlags()); + + // Let's keep stubs ordered by ascending address. + std::sort(Entry.begin(), Entry.end(), + [](const MemoryRegionInfo &L, const MemoryRegionInfo &R) { + return L.getTargetAddress() < R.getTargetAddress(); + }); + return Error::success(); } @@ -1235,8 +1265,14 @@ Session::findSectionInfo(StringRef FileName, StringRef SectionName) { return SecInfoItr->second; } +static StringRef detectStubKind(const Session::MemoryRegionInfo &Stub) { + // Implement acutal stub kind detection + return ""; +} + Expected -Session::findStubInfo(StringRef FileName, StringRef TargetName) { +Session::findStubInfo(StringRef FileName, StringRef TargetName, + StringRef KindNameFilter) { auto FI = findFileInfo(FileName); if (!FI) return FI.takeError(); @@ -1246,7 +1282,38 @@ Session::findStubInfo(StringRef FileName, StringRef TargetName) { "\" registered for file \"" + FileName + "\"", inconvertibleErrorCode()); - return StubInfoItr->second; + auto &StubsForTarget = StubInfoItr->second; + assert(!StubsForTarget.empty() && "At least 1 stub in each entry"); + if (KindNameFilter.empty() && StubsForTarget.size() == 1) + return StubsForTarget[0]; // Regular single-stub match + + std::string KindsStr; + SmallVector Matches; + Regex KindNameMatcher(KindNameFilter.empty() ? ".*" : KindNameFilter); + for (MemoryRegionInfo &Stub : StubsForTarget) { + StringRef Kind = detectStubKind(Stub); + if (KindNameMatcher.match(Kind)) + Matches.push_back(&Stub); + KindsStr += "\"" + (Kind.empty() ? "" : Kind.str()) + "\", "; + } + if (Matches.empty()) + return make_error( + "\"" + TargetName + "\" has " + Twine(StubsForTarget.size()) + + " stubs in file \"" + FileName + + "\", but none of them matches the stub-kind filter \"" + + KindNameFilter + "\" (all encountered kinds are " + + StringRef(KindsStr.data(), KindsStr.size() - 2) + ").", + inconvertibleErrorCode()); + if (Matches.size() > 1) + return make_error( + "\"" + TargetName + "\" has " + Twine(Matches.size()) + + " candidate stubs in file \"" + FileName + + "\". 
Please refine stub-kind filter \"" + KindNameFilter + + "\" for disambiguation (encountered kinds are " + + StringRef(KindsStr.data(), KindsStr.size() - 2) + ").", + inconvertibleErrorCode()); + + return *Matches[0]; } Expected @@ -2015,8 +2082,9 @@ static Error runChecks(Session &S, Triple TT, SubtargetFeatures Features) { return S.findSectionInfo(FileName, SectionName); }; - auto GetStubInfo = [&S](StringRef FileName, StringRef SectionName) { - return S.findStubInfo(FileName, SectionName); + auto GetStubInfo = [&S](StringRef FileName, StringRef SectionName, + StringRef KindNameFilter) { + return S.findStubInfo(FileName, SectionName, KindNameFilter); }; auto GetGOTInfo = [&S](StringRef FileName, StringRef SectionName) { diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.h b/llvm/tools/llvm-jitlink/llvm-jitlink.h index 93a00266b1504..e09c15adace20 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.h +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.h @@ -49,7 +49,7 @@ struct Session { struct FileInfo { StringMap SectionInfos; - StringMap StubInfos; + StringMap> StubInfos; StringMap GOTEntryInfos; using Symbol = jitlink::Symbol; @@ -61,6 +61,8 @@ struct Session { GetSymbolTargetFunction GetSymbolTarget); Error registerStubEntry(LinkGraph &G, Symbol &Sym, GetSymbolTargetFunction GetSymbolTarget); + Error registerMultiStubEntry(LinkGraph &G, Symbol &Sym, + GetSymbolTargetFunction GetSymbolTarget); }; using DynLibJDMap = std::map; @@ -74,7 +76,8 @@ struct Session { Expected findSectionInfo(StringRef FileName, StringRef SectionName); Expected findStubInfo(StringRef FileName, - StringRef TargetName); + StringRef TargetName, + StringRef KindNameFilter); Expected findGOTEntryInfo(StringRef FileName, StringRef TargetName); diff --git a/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp b/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp index 107b555a99faa..4cb76f4347422 100644 --- a/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp +++ b/llvm/tools/llvm-rtdyld/llvm-rtdyld.cpp @@ -926,7 +926,8 @@ static int linkAndVerify() { }; auto GetStubInfo = [&Dyld, &StubMap](StringRef StubContainer, - StringRef SymbolName) + StringRef SymbolName, + StringRef KindNameFilter) -> Expected { if (!StubMap.count(StubContainer)) return make_error("Stub container not found: " + @@ -947,6 +948,11 @@ static int linkAndVerify() { return StubMemInfo; }; + auto GetGOTInfo = [&GetStubInfo](StringRef StubContainer, + StringRef SymbolName) { + return GetStubInfo(StubContainer, SymbolName, ""); + }; + // We will initialize this below once we have the first object file and can // know the endianness. std::unique_ptr Checker; @@ -977,8 +983,7 @@ static int linkAndVerify() { if (!Checker) Checker = std::make_unique( - IsSymbolValid, GetSymbolInfo, GetSectionInfo, GetStubInfo, - GetStubInfo, + IsSymbolValid, GetSymbolInfo, GetSectionInfo, GetStubInfo, GetGOTInfo, Obj.isLittleEndian() ? llvm::endianness::little : llvm::endianness::big, TheTriple, MCPU, SubtargetFeatures(), dbgs()); From 1ad1f981a62213c1fc3145e8330f3c9b0dbe2b85 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Sat, 20 Jan 2024 10:11:05 +0100 Subject: [PATCH 233/843] [clang-tidy] Add readability-redundant-casting check (#70595) Detects explicit type casting operations that involve the same source and destination types, and subsequently recommend their removal. Covers a range of explicit casting operations. Its primary objective is to enhance code readability and maintainability by eliminating unnecessary type casting. 
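A quick sketch of the diagnostic and its fix-it (hypothetical snippet; the complete set of covered and deliberately ignored cases lives in the test file added below):

```c++
void f(int aa, int bb) {
  // warning: redundant explicit casting to the same type 'int' as the
  // sub-expression, remove this casting [readability-redundant-casting]
  int c = static_cast<int>(aa + bb) * aa;

  // After applying the fix-it: the cast is removed and parentheses are
  // inserted so the grouping of the sub-expression cannot change.
  int d = (aa + bb) * aa;
}
```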
Closes #67534 --- .../clang-tidy/readability/CMakeLists.txt | 1 + .../readability/ReadabilityTidyModule.cpp | 3 + .../readability/RedundantCastingCheck.cpp | 252 ++++++++++++++++++ .../readability/RedundantCastingCheck.h | 38 +++ .../clang-tidy/utils/FixItHintUtils.h | 1 + clang-tools-extra/docs/ReleaseNotes.rst | 6 + .../docs/clang-tidy/checks/list.rst | 1 + .../checks/readability/redundant-casting.rst | 37 +++ .../readability/redundant-casting.cpp | 223 ++++++++++++++++ 9 files changed, 562 insertions(+) create mode 100644 clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp create mode 100644 clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h create mode 100644 clang-tools-extra/docs/clang-tidy/checks/readability/redundant-casting.rst create mode 100644 clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp diff --git a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt index fa571d5dd7650..636860f54c73c 100644 --- a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt @@ -35,6 +35,7 @@ add_clang_library(clangTidyReadabilityModule QualifiedAutoCheck.cpp ReadabilityTidyModule.cpp RedundantAccessSpecifiersCheck.cpp + RedundantCastingCheck.cpp RedundantControlFlowCheck.cpp RedundantDeclarationCheck.cpp RedundantFunctionPtrDereferenceCheck.cpp diff --git a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp index f769752c5de5f..2f5d47546197f 100644 --- a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp @@ -38,6 +38,7 @@ #include "OperatorsRepresentationCheck.h" #include "QualifiedAutoCheck.h" #include "RedundantAccessSpecifiersCheck.h" +#include "RedundantCastingCheck.h" #include "RedundantControlFlowCheck.h" #include "RedundantDeclarationCheck.h" #include "RedundantFunctionPtrDereferenceCheck.h" @@ -117,6 +118,8 @@ class ReadabilityModule : public ClangTidyModule { "readability-qualified-auto"); CheckFactories.registerCheck( "readability-redundant-access-specifiers"); + CheckFactories.registerCheck( + "readability-redundant-casting"); CheckFactories.registerCheck( "readability-redundant-function-ptr-dereference"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp new file mode 100644 index 0000000000000..b9ff0e81cbc52 --- /dev/null +++ b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.cpp @@ -0,0 +1,252 @@ +//===--- RedundantCastingCheck.cpp - clang-tidy ---------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RedundantCastingCheck.h" +#include "../utils/FixItHintUtils.h" +#include "clang/AST/ASTContext.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "clang/Lex/Lexer.h" + +using namespace clang::ast_matchers; + +namespace clang::tidy::readability { + +static bool areTypesEqual(QualType S, QualType D) { + if (S == D) + return true; + + const auto *TS = S->getAs(); + const auto *TD = D->getAs(); + if (TS != TD) + return false; + + QualType PtrS = S->getPointeeType(); + QualType PtrD = D->getPointeeType(); + + if (!PtrS.isNull() && !PtrD.isNull()) + return areTypesEqual(PtrS, PtrD); + + const DeducedType *DT = S->getContainedDeducedType(); + if (DT && DT->isDeduced()) + return D == DT->getDeducedType(); + + return false; +} + +static bool areTypesEqual(QualType TypeS, QualType TypeD, + bool IgnoreTypeAliases) { + const QualType CTypeS = TypeS.getCanonicalType(); + const QualType CTypeD = TypeD.getCanonicalType(); + if (CTypeS != CTypeD) + return false; + + return IgnoreTypeAliases || areTypesEqual(TypeS.getLocalUnqualifiedType(), + TypeD.getLocalUnqualifiedType()); +} + +static bool areBinaryOperatorOperandsTypesEqualToOperatorResultType( + const Expr *E, bool IgnoreTypeAliases) { + if (!E) + return true; + const Expr *WithoutImplicitAndParen = E->IgnoreParenImpCasts(); + if (!WithoutImplicitAndParen) + return true; + if (const auto *B = dyn_cast(WithoutImplicitAndParen)) { + const QualType Type = WithoutImplicitAndParen->getType(); + if (Type.isNull()) + return true; + + const QualType NonReferenceType = Type.getNonReferenceType(); + const QualType LHSType = B->getLHS()->IgnoreImplicit()->getType(); + if (LHSType.isNull() || !areTypesEqual(LHSType.getNonReferenceType(), + NonReferenceType, IgnoreTypeAliases)) + return false; + const QualType RHSType = B->getRHS()->IgnoreImplicit()->getType(); + if (RHSType.isNull() || !areTypesEqual(RHSType.getNonReferenceType(), + NonReferenceType, IgnoreTypeAliases)) + return false; + } + return true; +} + +static const Decl *getSourceExprDecl(const Expr *SourceExpr) { + const Expr *CleanSourceExpr = SourceExpr->IgnoreParenImpCasts(); + if (const auto *E = dyn_cast(CleanSourceExpr)) { + return E->getDecl(); + } + + if (const auto *E = dyn_cast(CleanSourceExpr)) { + return E->getCalleeDecl(); + } + + if (const auto *E = dyn_cast(CleanSourceExpr)) { + return E->getMemberDecl(); + } + return nullptr; +} + +RedundantCastingCheck::RedundantCastingCheck(StringRef Name, + ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + IgnoreMacros(Options.getLocalOrGlobal("IgnoreMacros", true)), + IgnoreTypeAliases(Options.getLocalOrGlobal("IgnoreTypeAliases", false)) {} + +void RedundantCastingCheck::storeOptions(ClangTidyOptions::OptionMap &Opts) { + Options.store(Opts, "IgnoreMacros", IgnoreMacros); + Options.store(Opts, "IgnoreTypeAliases", IgnoreTypeAliases); +} + +void RedundantCastingCheck::registerMatchers(MatchFinder *Finder) { + auto SimpleType = qualType(hasCanonicalType( + qualType(anyOf(builtinType(), references(builtinType()), + references(pointsTo(qualType())), pointsTo(qualType()))))); + + auto BitfieldMemberExpr = memberExpr(member(fieldDecl(isBitField()))); + + Finder->addMatcher( + explicitCastExpr( + unless(hasCastKind(CK_ConstructorConversion)), + unless(hasCastKind(CK_UserDefinedConversion)), + unless(cxxFunctionalCastExpr(hasDestinationType(unless(SimpleType)))), + + 
hasDestinationType(qualType().bind("dstType")), + hasSourceExpression(anyOf( + expr(unless(initListExpr()), unless(BitfieldMemberExpr), + hasType(qualType().bind("srcType"))) + .bind("source"), + initListExpr(unless(hasInit(1, expr())), + hasInit(0, expr(unless(BitfieldMemberExpr), + hasType(qualType().bind("srcType"))) + .bind("source")))))) + .bind("cast"), + this); +} + +void RedundantCastingCheck::check(const MatchFinder::MatchResult &Result) { + const auto *SourceExpr = Result.Nodes.getNodeAs("source"); + auto TypeD = *Result.Nodes.getNodeAs("dstType"); + + if (SourceExpr->getValueKind() == VK_LValue && + TypeD.getCanonicalType()->isRValueReferenceType()) + return; + + const auto TypeS = + Result.Nodes.getNodeAs("srcType")->getNonReferenceType(); + TypeD = TypeD.getNonReferenceType(); + + if (!areTypesEqual(TypeS, TypeD, IgnoreTypeAliases)) + return; + + if (!areBinaryOperatorOperandsTypesEqualToOperatorResultType( + SourceExpr, IgnoreTypeAliases)) + return; + + const auto *CastExpr = Result.Nodes.getNodeAs("cast"); + if (IgnoreMacros && + (CastExpr->getBeginLoc().isMacroID() || + CastExpr->getEndLoc().isMacroID() || CastExpr->getExprLoc().isMacroID())) + return; + + { + auto Diag = diag(CastExpr->getExprLoc(), + "redundant explicit casting to the same type %0 as the " + "sub-expression, remove this casting"); + Diag << TypeD; + + const SourceManager &SM = *Result.SourceManager; + const SourceLocation SourceExprBegin = + SM.getExpansionLoc(SourceExpr->getBeginLoc()); + const SourceLocation SourceExprEnd = + SM.getExpansionLoc(SourceExpr->getEndLoc()); + + if (SourceExprBegin != CastExpr->getBeginLoc()) + Diag << FixItHint::CreateRemoval(SourceRange( + CastExpr->getBeginLoc(), SourceExprBegin.getLocWithOffset(-1))); + + const SourceLocation NextToken = Lexer::getLocForEndOfToken( + SourceExprEnd, 0U, SM, Result.Context->getLangOpts()); + + if (SourceExprEnd != CastExpr->getEndLoc()) { + Diag << FixItHint::CreateRemoval( + SourceRange(NextToken, CastExpr->getEndLoc())); + } + + if (utils::fixit::areParensNeededForStatement(*SourceExpr)) { + Diag << FixItHint::CreateInsertion(SourceExprBegin, "(") + << FixItHint::CreateInsertion(NextToken, ")"); + } + } + + const auto *SourceExprDecl = getSourceExprDecl(SourceExpr); + if (!SourceExprDecl) + return; + + if (const auto *D = dyn_cast(SourceExprDecl)) { + diag(D->getLocation(), + "source type originates from the invocation of this constructor", + DiagnosticIDs::Note); + return; + } + + if (const auto *D = dyn_cast(SourceExprDecl)) { + diag(D->getLocation(), + "source type originates from the invocation of this " + "%select{function|method}0", + DiagnosticIDs::Note) + << isa(D) << D->getReturnTypeSourceRange(); + return; + } + + if (const auto *D = dyn_cast(SourceExprDecl)) { + diag(D->getLocation(), + "source type originates from referencing this member", + DiagnosticIDs::Note) + << SourceRange(D->getTypeSpecStartLoc(), D->getTypeSpecEndLoc()); + return; + } + + if (const auto *D = dyn_cast(SourceExprDecl)) { + diag(D->getLocation(), + "source type originates from referencing this parameter", + DiagnosticIDs::Note) + << SourceRange(D->getTypeSpecStartLoc(), D->getTypeSpecEndLoc()); + return; + } + + if (const auto *D = dyn_cast(SourceExprDecl)) { + diag(D->getLocation(), + "source type originates from referencing this variable", + DiagnosticIDs::Note) + << SourceRange(D->getTypeSpecStartLoc(), D->getTypeSpecEndLoc()); + return; + } + + if (const auto *D = dyn_cast(SourceExprDecl)) { + diag(D->getLocation(), + "source type originates from 
referencing this enum constant", + DiagnosticIDs::Note); + return; + } + + if (const auto *D = dyn_cast(SourceExprDecl)) { + diag(D->getLocation(), + "source type originates from referencing this bound variable", + DiagnosticIDs::Note); + return; + } + + if (const auto *D = dyn_cast(SourceExprDecl)) { + diag(D->getLocation(), + "source type originates from referencing this non-type template " + "parameter", + DiagnosticIDs::Note); + return; + } +} + +} // namespace clang::tidy::readability diff --git a/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h new file mode 100644 index 0000000000000..fdcfede05d436 --- /dev/null +++ b/clang-tools-extra/clang-tidy/readability/RedundantCastingCheck.h @@ -0,0 +1,38 @@ +//===--- RedundantCastingCheck.h - clang-tidy -------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTCASTINGCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTCASTINGCHECK_H + +#include "../ClangTidyCheck.h" + +namespace clang::tidy::readability { + +/// Detects explicit type casting operations that involve the same source and +/// destination types, and subsequently recommend their removal. +/// +/// For the user-facing documentation see: +/// http://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-casting.html +class RedundantCastingCheck : public ClangTidyCheck { +public: + RedundantCastingCheck(StringRef Name, ClangTidyContext *Context); + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + void storeOptions(ClangTidyOptions::OptionMap &Opts) override; + std::optional getCheckTraversalKind() const override { + return TK_IgnoreUnlessSpelledInSource; + } + +private: + const bool IgnoreMacros; + const bool IgnoreTypeAliases; +}; + +} // namespace clang::tidy::readability + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTCASTINGCHECK_H diff --git a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.h b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.h index d25bea2c6d297..2b96b2b2ce600 100644 --- a/clang-tools-extra/clang-tidy/utils/FixItHintUtils.h +++ b/clang-tools-extra/clang-tidy/utils/FixItHintUtils.h @@ -51,6 +51,7 @@ std::string formatDereference(const Expr &ExprNode, const ASTContext &Context); // \brief Checks whatever a expression require extra () to be always used in // safe way in any other expression. bool areParensNeededForStatement(const Stmt &Node); + } // namespace clang::tidy::utils::fixit #endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_UTILS_FIXITHINTUTILS_H diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 7f618e71afd1c..ed8d01d65542f 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -235,6 +235,12 @@ New checks Finds return statements with ``void`` values used within functions with ``void`` result types. +- New :doc:`readability-redundant-casting + ` check. + + Detects explicit type casting operations that involve the same source and + destination types, and subsequently recommend their removal. 
+ - New :doc:`readability-reference-to-constructed-temporary ` check. diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 5f21449cfc3df..503b9fac82732 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -365,6 +365,7 @@ Clang-Tidy Checks :doc:`readability-operators-representation `, "Yes" :doc:`readability-qualified-auto `, "Yes" :doc:`readability-redundant-access-specifiers `, "Yes" + :doc:`readability-redundant-casting `, "Yes" :doc:`readability-redundant-control-flow `, "Yes" :doc:`readability-redundant-declaration `, "Yes" :doc:`readability-redundant-function-ptr-dereference `, "Yes" diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-casting.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-casting.rst new file mode 100644 index 0000000000000..23eaa225f03a3 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-casting.rst @@ -0,0 +1,37 @@ +.. title:: clang-tidy - readability-redundant-casting + +readability-redundant-casting +============================= + +Detects explicit type casting operations that involve the same source and +destination types, and subsequently recommend their removal. Covers a range of +explicit casting operations, including ``static_cast``, ``const_cast``, C-style +casts, and ``reinterpret_cast``. Its primary objective is to enhance code +readability and maintainability by eliminating unnecessary type casting. + +.. code-block:: c++ + + int value = 42; + int result = static_cast(value); + +In this example, the ``static_cast(value)`` is redundant, as it performs +a cast from an ``int`` to another ``int``. + +Casting operations involving constructor conversions, user-defined conversions, +functional casts, type-dependent casts, casts between distinct type aliases that +refer to the same underlying type, as well as bitfield-related casts and casts +directly from lvalue to rvalue, are all disregarded by the check. + +Options +------- + +.. option:: IgnoreMacros + + If set to `true`, the check will not give warnings inside macros. Default + is `true`. + +.. option:: IgnoreTypeAliases + + When set to `false`, the check will consider type aliases, and when set to + `true`, it will resolve all type aliases and operate on the underlying + types. Default is `false`. 
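To make the `IgnoreTypeAliases` option concrete, a short sketch (behavior assumed from the documentation above; the authoritative cases are in the test file that follows):

```c++
struct A {};
using TypeA = A;

void f(A &value) {
  // With the default IgnoreTypeAliases: false this is not flagged, because
  // 'TypeA' and 'A' are treated as distinct spellings. With
  // IgnoreTypeAliases: true the alias is resolved to the same underlying
  // type and the cast is reported as redundant.
  TypeA &a = static_cast<TypeA &>(value);
}
```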
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp new file mode 100644 index 0000000000000..30cac6bd5cca0 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-casting.cpp @@ -0,0 +1,223 @@ +// RUN: %check_clang_tidy -std=c++11-or-later %s readability-redundant-casting %t -- -- -fno-delayed-template-parsing +// RUN: %check_clang_tidy -std=c++11-or-later -check-suffix=,MACROS %s readability-redundant-casting %t -- \ +// RUN: -config='{CheckOptions: { readability-redundant-casting.IgnoreMacros: false }}' \ +// RUN: -- -fno-delayed-template-parsing +// RUN: %check_clang_tidy -std=c++11-or-later -check-suffix=,ALIASES %s readability-redundant-casting %t -- \ +// RUN: -config='{CheckOptions: { readability-redundant-casting.IgnoreTypeAliases: true }}' \ +// RUN: -- -fno-delayed-template-parsing + +struct A {}; +struct B : A {}; +A getA(); + +void testRedundantStaticCasting(A& value) { + A& a1 = static_cast(value); + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant explicit casting to the same type 'A' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-3]]:36: note: source type originates from referencing this parameter + // CHECK-FIXES: {{^}} A& a1 = value; +} + +void testRedundantConstCasting1(A& value) { + A& a2 = const_cast(value); + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant explicit casting to the same type 'A' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-3]]:36: note: source type originates from referencing this parameter + // CHECK-FIXES: {{^}} A& a2 = value; +} + +void testRedundantConstCasting2(const A& value) { + const A& a3 = const_cast(value); + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: redundant explicit casting to the same type 'const A' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-3]]:42: note: source type originates from referencing this parameter + // CHECK-FIXES: {{^}} const A& a3 = value; +} + +void testRedundantReinterpretCasting(A& value) { + A& a4 = reinterpret_cast(value); + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant explicit casting to the same type 'A' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-3]]:41: note: source type originates from referencing this parameter + // CHECK-FIXES: {{^}} A& a4 = value; +} + +void testRedundantCCasting(A& value) { + A& a5 = (A&)(value); + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant explicit casting to the same type 'A' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-3]]:31: note: source type originates from referencing this parameter + // CHECK-FIXES: {{^}} A& a5 = value; +} + +void testDoubleCasting(A& value) { + A& a6 = static_cast(reinterpret_cast(value)); + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant explicit casting to the same type 'A' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-2]]:27: warning: redundant explicit casting to the same type 'A' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-4]]:27: note: source type originates from referencing this parameter + // CHECK-FIXES: {{^}} A& a6 = value; +} + +void testDiffrentTypesCast(B& 
value) { + A& a7 = static_cast(value); +} + +void testCastingWithAuto() { + auto a = getA(); + A& a8 = static_cast(a); + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant explicit casting to the same type 'A' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-3]]:8: note: source type originates from referencing this variable + // CHECK-FIXES: {{^}} A& a8 = a; +} + +void testCastingWithConstAuto() { + const auto a = getA(); + const A& a9 = static_cast(a); + // CHECK-MESSAGES: :[[@LINE-1]]:17: warning: redundant explicit casting to the same type 'const A' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-3]]:14: note: source type originates from referencing this variable + // CHECK-FIXES: {{^}} const A& a9 = a; +} + +void testCastingWithAutoPtr(A& ptr) { + auto* a = &ptr; + A* a10 = static_cast(a); + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: redundant explicit casting to the same type 'A *' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-3]]:9: note: source type originates from referencing this variable + // CHECK-FIXES: {{^}} A* a10 = a; +} + +template +void testRedundantTemplateCasting(T& value) { + A& a = static_cast(value); + T& t = static_cast(value); + // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: redundant explicit casting to the same type 'T' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-4]]:38: note: source type originates from referencing this parameter + // CHECK-FIXES: {{^}} T& t = value; +} + +void testTemplate() { + A value; + testRedundantTemplateCasting(value); +} + +void testValidRefConstCast() { + const auto a = getA(); + A& a11 = const_cast(a); +} + +void testValidPtrConstCast(const A* ptr) { + A* a12 = const_cast(ptr); +} + +#define CAST(X) static_cast(X) + +void testMacroCasting(int value) { + int a = CAST(value); + // CHECK-MESSAGES-MACROS: :[[@LINE-1]]:11: warning: redundant explicit casting to the same type 'int' as the sub-expression, remove this casting [readability-redundant-casting] +} + +#define PTR_NAME name + +void testMacroCasting(A* PTR_NAME) { + A* a13 = static_cast(PTR_NAME); + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: redundant explicit casting to the same type 'A *' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-FIXES: {{^}} A* a13 = PTR_NAME; +} + +struct CastBool { + operator bool() const { + return true; + } +}; + +void testUserOperatorCast(const CastBool& value) { + bool b = static_cast(value); +} + +using TypeA = A; + +void testTypedefCast(A& value) { + TypeA& a = static_cast(value); + // CHECK-MESSAGES-ALIASES: :[[@LINE-1]]:14: warning: redundant explicit casting to the same type 'TypeA' (aka 'A') as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-FIXES-ALIASES: {{^}} TypeA& a = value; +} + +void testTypedefCast2(TypeA& value) { + A& a = static_cast(value); + // CHECK-MESSAGES-ALIASES: :[[@LINE-1]]:10: warning: redundant explicit casting to the same type 'A' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-FIXES-ALIASES: {{^}} A& a = value; +} + +void testFunctionalCastWithPrimitive(int a) { + int b = int(a); + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant explicit casting to the same type 'int' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-FIXES: {{^}} int b = 
a; +} + +void testFunctionalCastWithInitExpr(unsigned a) { + unsigned b = ~unsigned{!a}; + unsigned c = unsigned{0}; +} + +void testBinaryOperator(char c) { + int a = int(c - 'C'); +} + +struct BIT { + bool b:1; +}; + +template +void make(Args&& ...); + +void testBinaryOperator(BIT b) { + make((bool)b.b); +} + +struct Class { + using Iterator = const char*; + + Iterator begin() { + return static_cast(first()); +// CHECK-MESSAGES-ALIASES: :[[@LINE-1]]:12: warning: redundant explicit casting to the same type 'Iterator' (aka 'const char *') as the sub-expression, remove this casting [readability-redundant-casting] +// CHECK-MESSAGES-ALIASES: :[[@LINE+4]]:15: note: source type originates from the invocation of this method +// CHECK-FIXES-ALIASES: {{^}} return first(); + } + + const char* first(); +}; + +void testAddOperation(int aa, int bb) { + int c = static_cast(aa + bb) * aa; + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant explicit casting to the same type 'int' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-FIXES: {{^}} int c = (aa + bb) * aa; +} + +void testAddOperationWithParen(int a, int b) { + int c = static_cast((a+b))*a; + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant explicit casting to the same type 'int' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-FIXES: {{^}} int c = (a+b)*a; +} + +void testRValueCast(int&& a) { + int&& b = static_cast(a); + int&& c = static_cast(10); + // CHECK-MESSAGES: :[[@LINE-1]]:13: warning: redundant explicit casting to the same type 'int' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-FIXES: {{^}} int&& c = 10; +} + +template +void testRedundantNTTPCasting() { + int a = static_cast(V); + // CHECK-MESSAGES: :[[@LINE-1]]:11: warning: redundant explicit casting to the same type 'int' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-4]]:15: note: source type originates from referencing this non-type template parameter + // CHECK-FIXES: {{^}} int a = V; +} + +template +void testValidNTTPCasting() { + int a = static_cast(V); +} + +template +void testRedundantDependentNTTPCasting() { + T a = static_cast(V); + // CHECK-MESSAGES: :[[@LINE-1]]:9: warning: redundant explicit casting to the same type 'T' as the sub-expression, remove this casting [readability-redundant-casting] + // CHECK-MESSAGES: :[[@LINE-4]]:25: note: source type originates from referencing this non-type template parameter + // CHECK-FIXES: {{^}} T a = V; +} From 14d59527e7b36cbab7fb92699a52843392aa22c0 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Sat, 20 Jan 2024 09:11:11 +0000 Subject: [PATCH 234/843] [gn build] Port 1ad1f981a622 --- .../secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn index 37e75a9720e63..62ae7e2aeb5bb 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn @@ -43,6 +43,7 @@ static_library("readability") { "QualifiedAutoCheck.cpp", "ReadabilityTidyModule.cpp", "RedundantAccessSpecifiersCheck.cpp", + "RedundantCastingCheck.cpp", "RedundantControlFlowCheck.cpp", "RedundantDeclarationCheck.cpp", "RedundantFunctionPtrDereferenceCheck.cpp", From 
7a8f5d97afbff948fa2437c81d08a5963ee7d8cf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Constantin?= <60141446+felix642@users.noreply.github.com> Date: Sat, 20 Jan 2024 04:45:59 -0500 Subject: [PATCH 235/843] =?UTF-8?q?[clang-tidy]=C2=A0Added=20new=20check?= =?UTF-8?q?=20to=20detect=20redundant=20inline=20keyword=20(#73069)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This check finds usages of the inline keyword where it is already implicitly defined by the compiler and suggests its removal. Fixes #72397 --- .../clang-tidy/readability/CMakeLists.txt | 1 + .../readability/ReadabilityTidyModule.cpp | 3 + .../RedundantInlineSpecifierCheck.cpp | 132 +++++++++++++++++ .../RedundantInlineSpecifierCheck.h | 42 ++++++ clang-tools-extra/docs/ReleaseNotes.rst | 5 + .../docs/clang-tidy/checks/list.rst | 1 + .../redundant-inline-specifier.rst | 32 ++++ .../redundant-inline-specifier.cpp | 137 ++++++++++++++++++ 8 files changed, 353 insertions(+) create mode 100644 clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp create mode 100644 clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h create mode 100644 clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst create mode 100644 clang-tools-extra/test/clang-tidy/checkers/readability/redundant-inline-specifier.cpp diff --git a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt index 636860f54c73c..a6c8cbd8eb448 100644 --- a/clang-tools-extra/clang-tidy/readability/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/readability/CMakeLists.txt @@ -22,6 +22,7 @@ add_clang_library(clangTidyReadabilityModule IdentifierLengthCheck.cpp IdentifierNamingCheck.cpp ImplicitBoolConversionCheck.cpp + RedundantInlineSpecifierCheck.cpp InconsistentDeclarationParameterNameCheck.cpp IsolateDeclarationCheck.cpp MagicNumbersCheck.cpp diff --git a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp index 2f5d47546197f..87b299bf1ef1c 100644 --- a/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/readability/ReadabilityTidyModule.cpp @@ -42,6 +42,7 @@ #include "RedundantControlFlowCheck.h" #include "RedundantDeclarationCheck.h" #include "RedundantFunctionPtrDereferenceCheck.h" +#include "RedundantInlineSpecifierCheck.h" #include "RedundantMemberInitCheck.h" #include "RedundantPreprocessorCheck.h" #include "RedundantSmartptrGetCheck.h" @@ -100,6 +101,8 @@ class ReadabilityModule : public ClangTidyModule { "readability-identifier-naming"); CheckFactories.registerCheck<ImplicitBoolConversionCheck>( "readability-implicit-bool-conversion"); +CheckFactories.registerCheck<RedundantInlineSpecifierCheck>( + "readability-redundant-inline-specifier"); CheckFactories.registerCheck<InconsistentDeclarationParameterNameCheck>( "readability-inconsistent-declaration-parameter-name"); CheckFactories.registerCheck<IsolateDeclarationCheck>( diff --git a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp new file mode 100644 index 0000000000000..0e8d17d444247 --- /dev/null +++ b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.cpp @@ -0,0 +1,132 @@ +//===--- RedundantInlineSpecifierCheck.cpp - clang-tidy--------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "RedundantInlineSpecifierCheck.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DeclCXX.h" +#include "clang/AST/DeclTemplate.h" +#include "clang/AST/ExprCXX.h" +#include "clang/ASTMatchers/ASTMatchers.h" +#include "clang/Basic/Diagnostic.h" +#include "clang/Basic/SourceLocation.h" +#include "clang/Basic/SourceManager.h" +#include "clang/Lex/Token.h" + +#include "../utils/LexerUtils.h" + +using namespace clang::ast_matchers; + +namespace clang::tidy::readability { + +namespace { +AST_POLYMORPHIC_MATCHER(isInlineSpecified, + AST_POLYMORPHIC_SUPPORTED_TYPES(FunctionDecl, + VarDecl)) { + if (const auto *FD = dyn_cast<FunctionDecl>(&Node)) + return FD->isInlineSpecified(); + if (const auto *VD = dyn_cast<VarDecl>(&Node)) + return VD->isInlineSpecified(); + llvm_unreachable("Not a valid polymorphic type"); +} + +AST_POLYMORPHIC_MATCHER_P(isInternalLinkage, + AST_POLYMORPHIC_SUPPORTED_TYPES(FunctionDecl, + VarDecl), + bool, strictMode) { + if (!strictMode) + return false; + if (const auto *FD = dyn_cast<FunctionDecl>(&Node)) + return FD->getStorageClass() == SC_Static || FD->isInAnonymousNamespace(); + if (const auto *VD = dyn_cast<VarDecl>(&Node)) + return VD->isInAnonymousNamespace(); + llvm_unreachable("Not a valid polymorphic type"); +} +} // namespace + +static SourceLocation getInlineTokenLocation(SourceRange RangeLocation, + const SourceManager &Sources, + const LangOptions &LangOpts) { + SourceLocation Loc = RangeLocation.getBegin(); + if (Loc.isMacroID()) + return {}; + + Token FirstToken; + Lexer::getRawToken(Loc, FirstToken, Sources, LangOpts, true); + std::optional<Token> CurrentToken = FirstToken; + while (CurrentToken && CurrentToken->getLocation() < RangeLocation.getEnd() && + CurrentToken->isNot(tok::eof)) { + if (CurrentToken->is(tok::raw_identifier) && + CurrentToken->getRawIdentifier() == "inline") + return CurrentToken->getLocation(); + + CurrentToken = + Lexer::findNextToken(CurrentToken->getLocation(), Sources, LangOpts); + } + return {}; +} + +void RedundantInlineSpecifierCheck::registerMatchers(MatchFinder *Finder) { + Finder->addMatcher( + functionDecl(isInlineSpecified(), + anyOf(isConstexpr(), isDeleted(), isDefaulted(), + isInternalLinkage(StrictMode), + allOf(isDefinition(), hasAncestor(recordDecl())))) + .bind("fun_decl"), + this); + + if (StrictMode) + Finder->addMatcher( + functionTemplateDecl( + has(functionDecl(allOf(isInlineSpecified(), isDefinition())))) + .bind("templ_decl"), + this); + + if (getLangOpts().CPlusPlus17) { + Finder->addMatcher( + varDecl(isInlineSpecified(), + anyOf(isInternalLinkage(StrictMode), + allOf(isConstexpr(), hasAncestor(recordDecl())))) + .bind("var_decl"), + this); + } +} + +template <typename T> +void RedundantInlineSpecifierCheck::handleMatchedDecl( + const T *MatchedDecl, const SourceManager &Sources, + const MatchFinder::MatchResult &Result, StringRef Message) { + SourceLocation Loc = getInlineTokenLocation( + MatchedDecl->getSourceRange(), Sources, Result.Context->getLangOpts()); + if (Loc.isValid()) + diag(Loc, Message) << MatchedDecl << FixItHint::CreateRemoval(Loc); +} + +void RedundantInlineSpecifierCheck::check( + const MatchFinder::MatchResult &Result) { + const SourceManager &Sources = *Result.SourceManager; + + if (const auto *MatchedDecl = + Result.Nodes.getNodeAs<FunctionDecl>("fun_decl")) { + handleMatchedDecl( + MatchedDecl, Sources, Result, + "function %0 has inline specifier but is implicitly 
inlined"); + } else if (const auto *MatchedDecl = + Result.Nodes.getNodeAs<VarDecl>("var_decl")) { + handleMatchedDecl( + MatchedDecl, Sources, Result, + "variable %0 has inline specifier but is implicitly inlined"); + } else if (const auto *MatchedDecl = + Result.Nodes.getNodeAs<FunctionTemplateDecl>("templ_decl")) { + handleMatchedDecl( + MatchedDecl, Sources, Result, + "function %0 has inline specifier but is implicitly inlined"); + } +} + +} // namespace clang::tidy::readability diff --git a/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h new file mode 100644 index 0000000000000..cc0bfa9685d0c --- /dev/null +++ b/clang-tools-extra/clang-tidy/readability/RedundantInlineSpecifierCheck.h @@ -0,0 +1,42 @@ +//===--- RedundantInlineSpecifierCheck.h - clang-tidy ------------*-C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTINLINESPECIFIERCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTINLINESPECIFIERCHECK_H + +#include "../ClangTidyCheck.h" + +namespace clang::tidy::readability { + +/// Detects redundant ``inline`` specifiers on function and variable +/// declarations. +/// +/// For the user-facing documentation see: +/// http://clang.llvm.org/extra/clang-tidy/checks/readability/redundant-inline-specifier.html +class RedundantInlineSpecifierCheck : public ClangTidyCheck { +public: + RedundantInlineSpecifierCheck(StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context), + StrictMode(Options.getLocalOrGlobal("StrictMode", false)) {} + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + std::optional<TraversalKind> getCheckTraversalKind() const override { + return TK_IgnoreUnlessSpelledInSource; + } + +private: + template <typename T> + void handleMatchedDecl(const T *MatchedDecl, const SourceManager &Sources, + const ast_matchers::MatchFinder::MatchResult &Result, + StringRef Message); + const bool StrictMode; +}; + +} // namespace clang::tidy::readability + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_READABILITY_REDUNDANTINLINESPECIFIERCHECK_H diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index ed8d01d65542f..c8e93231fd11b 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -240,6 +240,11 @@ New checks Detects explicit type casting operations that involve the same source and destination types, and subsequently recommends their removal. + +- New :doc:`readability-redundant-inline-specifier + <clang-tidy/checks/readability/redundant-inline-specifier>` check. + + Detects redundant ``inline`` specifiers on function and variable declarations. - New :doc:`readability-reference-to-constructed-temporary <clang-tidy/checks/readability/reference-to-constructed-temporary>` check. 
diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index 503b9fac82732..e972e376ebc5a 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -354,6 +354,7 @@ Clang-Tidy Checks :doc:`readability-identifier-length <readability/identifier-length>`, :doc:`readability-identifier-naming <readability/identifier-naming>`, "Yes" :doc:`readability-implicit-bool-conversion <readability/implicit-bool-conversion>`, "Yes" + :doc:`readability-redundant-inline-specifier <readability/redundant-inline-specifier>`, "Yes" :doc:`readability-inconsistent-declaration-parameter-name <readability/inconsistent-declaration-parameter-name>`, "Yes" :doc:`readability-isolate-declaration <readability/isolate-declaration>`, "Yes" :doc:`readability-magic-numbers <readability/magic-numbers>`, diff --git a/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst new file mode 100644 index 0000000000000..eee324cddab48 --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/readability/redundant-inline-specifier.rst @@ -0,0 +1,32 @@ +.. title:: clang-tidy - readability-redundant-inline-specifier + +readability-redundant-inline-specifier +====================================== + +Detects redundant ``inline`` specifiers on function and variable declarations. + +Examples: + +.. code-block:: c++ + + constexpr inline void f() {} + +In the example above the keyword ``inline`` is redundant since ``constexpr`` +functions are implicitly inlined. + +.. code-block:: c++ + + class MyClass { + inline void myMethod() {} + }; + +In the example above the keyword ``inline`` is redundant since member functions +defined entirely inside a class/struct/union definition are implicitly inlined. + +Options +------- + +.. option:: StrictMode + + If set to `true`, the check will also flag redundant ``inline`` specifiers + on functions and variables that already have internal linkage. 
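+
+For example, with ``StrictMode`` enabled the check additionally flags the
+``inline`` specifier below, mirroring the check's own test suite:
+
+.. code-block:: c++
+
+  // `static` already gives the function internal linkage, so `inline` is
+  // redundant here.
+  static inline int fn0(int i) { return i - 1; }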
diff --git a/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-inline-specifier.cpp b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-inline-specifier.cpp new file mode 100644 index 0000000000000..cdd98d8fdc20f --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/readability/redundant-inline-specifier.cpp @@ -0,0 +1,137 @@ +// RUN: %check_clang_tidy -std=c++17 %s readability-redundant-inline-specifier %t +// RUN: %check_clang_tidy -std=c++17 -check-suffixes=,STRICT %s readability-redundant-inline-specifier %t -- -config="{CheckOptions: {readability-redundant-inline-specifier.StrictMode: 'true'}}" + +template <typename T> inline T f() +// CHECK-MESSAGES-STRICT: :[[@LINE-1]]:23: warning: function 'f' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] +// CHECK-FIXES-STRICT: template <typename T> T f() +{ + return T{}; +} + +template <> inline double f<double>() = delete; +// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: function 'f<double>' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] +// CHECK-FIXES: template <> double f<double>() = delete; + +inline int g(float a) +{ + return static_cast<int>(a - 5.F); +} + +inline int g(double) = delete; +// CHECK-MESSAGES: :[[@LINE-1]]:1: warning: function 'g' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] +// CHECK-FIXES: int g(double) = delete; + +class C +{ + public: + inline C& operator=(const C&) = delete; + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: function 'operator=' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] + // CHECK-FIXES: C& operator=(const C&) = delete; + + inline C(const C&) = default; + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: function 'C' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] + // CHECK-FIXES: C(const C&) = default; + + constexpr inline C& operator=(int a); + // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: function 'operator=' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] + // CHECK-FIXES: constexpr C& operator=(int a); + + inline C() {} + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: function 'C' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] + // CHECK-FIXES: C() {} + + constexpr inline C(int); + // CHECK-MESSAGES: :[[@LINE-1]]:15: warning: function 'C' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] + // CHECK-FIXES: constexpr C(int); + + inline int Get42() const { return 42; } + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: function 'Get42' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] + // CHECK-FIXES: int Get42() const { return 42; } + + static inline constexpr int C_STATIC = 42; + // CHECK-MESSAGES: :[[@LINE-1]]:12: warning: variable 'C_STATIC' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] + // CHECK-FIXES: static constexpr int C_STATIC = 42; + + static constexpr int C_STATIC_2 = 42; +}; + +constexpr inline int Get42() { return 42; } +// CHECK-MESSAGES: :[[@LINE-1]]:11: warning: function 'Get42' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] +// CHECK-FIXES: constexpr int Get42() { return 42; } + + +static constexpr inline int NAMESPACE_STATIC = 42; + +inline static int fn0(int i) +// CHECK-MESSAGES-STRICT: :[[@LINE-1]]:1: warning: function 'fn0' has inline specifier but is implicitly inlined 
[readability-redundant-inline-specifier] +// CHECK-FIXES-STRICT: static int fn0(int i) +{ + return i - 1; +} + +static constexpr inline int fn1(int i) +// CHECK-MESSAGES: :[[@LINE-1]]:18: warning: function 'fn1' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] +// CHECK-FIXES: static constexpr int fn1(int i) +{ + return i - 1; +} + +namespace +{ + inline int fn2(int i) + // CHECK-MESSAGES-STRICT: :[[@LINE-1]]:5: warning: function 'fn2' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] + // CHECK-FIXES-STRICT: int fn2(int i) + { + return i - 1; + } + + inline constexpr int fn3(int i) + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: function 'fn3' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] + // CHECK-FIXES: constexpr int fn3(int i) + { + return i - 1; + } + + inline constexpr int MY_CONSTEXPR_VAR = 42; + // CHECK-MESSAGES-STRICT: :[[@LINE-1]]:5: warning: variable 'MY_CONSTEXPR_VAR' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] + // CHECK-FIXES-STRICT: constexpr int MY_CONSTEXPR_VAR = 42; +} + +namespace ns +{ + inline int fn4(int i) + { + return i - 1; + } + + inline constexpr int fn5(int i) + // CHECK-MESSAGES: :[[@LINE-1]]:5: warning: function 'fn5' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] + // CHECK-FIXES: constexpr int fn5(int i) + { + return i - 1; + } +} + +auto fn6 = [](){}; + +template <typename T> inline T fn7(); + +template <typename T> T fn7() +{ + return T{}; +} + +template <typename T> T fn8(); + +template <typename T> inline T fn8() +// CHECK-MESSAGES-STRICT: :[[@LINE-1]]:23: warning: function 'fn8' has inline specifier but is implicitly inlined [readability-redundant-inline-specifier] +// CHECK-FIXES-STRICT: template <typename T> T fn8() +{ + return T{}; +} + +#define INLINE_MACRO() inline void fn9() { } +INLINE_MACRO() + +#define INLINE_KW inline +INLINE_KW void fn10() { } From d70bfeb4e1461dca6c5d5a56e5aa304daa692ed7 Mon Sep 17 00:00:00 2001 From: Bharathi Ramana Joshi Date: Sat, 20 Jan 2024 15:19:10 +0530 Subject: [PATCH 236/843] [MLIR][Presburger] Implement IntegerRelation::setId (#77872) --- .../Analysis/Presburger/IntegerRelation.h | 5 +++ .../Analysis/Presburger/IntegerRelation.cpp | 8 +++++ .../Presburger/IntegerRelationTest.cpp | 34 +++++++++++++++++++ 3 files changed, 47 insertions(+) diff --git a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h index 8e2c9fca0a17c..c476a022a4827 100644 --- a/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h +++ b/mlir/include/mlir/Analysis/Presburger/IntegerRelation.h @@ -122,6 +122,11 @@ class IntegerRelation { /// current space; this will result in an assert failure. void setSpaceExceptLocals(const PresburgerSpace &oSpace); + /// Set the identifier for the ith variable of the specified kind of the + /// IntegerRelation's PresburgerSpace. The index is relative to the kind of + /// the variable. + void setId(VarKind kind, unsigned i, Identifier id); + /// Returns a copy of the space without locals. 
PresburgerSpace getSpaceWithoutLocals() const { return PresburgerSpace::getRelationSpace(space.getNumDomainVars(), diff --git a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp index f78e21ccd38eb..7d2a63d17676f 100644 --- a/mlir/lib/Analysis/Presburger/IntegerRelation.cpp +++ b/mlir/lib/Analysis/Presburger/IntegerRelation.cpp @@ -66,6 +66,14 @@ void IntegerRelation::setSpaceExceptLocals(const PresburgerSpace &oSpace) { space.insertVar(VarKind::Local, 0, newNumLocals); } +void IntegerRelation::setId(VarKind kind, unsigned i, Identifier id) { + assert(space.isUsingIds() && + "space must be using identifiers to set an identifier"); + assert(kind != VarKind::Local && "local variables cannot have identifiers"); + assert(i < space.getNumVarKind(kind) && "invalid variable index"); + space.getId(kind, i) = id; +} + void IntegerRelation::append(const IntegerRelation &other) { assert(space.isEqual(other.getSpace()) && "Spaces must be equal."); diff --git a/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp b/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp index dff092b3204bb..00d2204c9c8ef 100644 --- a/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp +++ b/mlir/unittests/Analysis/Presburger/IntegerRelationTest.cpp @@ -451,3 +451,37 @@ TEST(IntegerRelationTest, mergeAndAlignCommonSuffixSymbols) { EXPECT_EQ(otherSpace.getId(VarKind::Range, 1), Identifier(&otherIdentifiers[3])); } + +TEST(IntegerRelationTest, setId) { + IntegerRelation rel = parseRelationFromSet( + "(x, y, z)[A, B, C, D] : (x + A - C - y + D - z >= 0)", 2); + PresburgerSpace space = PresburgerSpace::getRelationSpace(2, 1, 4, 0); + space.resetIds(); + + // Attach identifiers. + int identifiers[7] = {'x', 'y', 'z', 'A', 'B', 'C', 'D'}; + space.getId(VarKind::Domain, 0) = Identifier(&identifiers[0]); + space.getId(VarKind::Domain, 1) = Identifier(&identifiers[1]); + space.getId(VarKind::Range, 0) = Identifier(&identifiers[2]); + space.getId(VarKind::Symbol, 0) = Identifier(&identifiers[3]); + space.getId(VarKind::Symbol, 1) = Identifier(&identifiers[4]); + space.getId(VarKind::Symbol, 2) = Identifier(&identifiers[5]); + space.getId(VarKind::Symbol, 3) = Identifier(&identifiers[6]); + rel.setSpace(space); + + int newIdentifiers[3] = {1, 2, 3}; + rel.setId(VarKind::Domain, 1, Identifier(&newIdentifiers[0])); + rel.setId(VarKind::Range, 0, Identifier(&newIdentifiers[1])); + rel.setId(VarKind::Symbol, 2, Identifier(&newIdentifiers[2])); + + space = rel.getSpace(); + // Check that new identifiers are set correctly. + EXPECT_EQ(space.getId(VarKind::Domain, 1), Identifier(&newIdentifiers[0])); + EXPECT_EQ(space.getId(VarKind::Range, 0), Identifier(&newIdentifiers[1])); + EXPECT_EQ(space.getId(VarKind::Symbol, 2), Identifier(&newIdentifiers[2])); + // Check that old identifiers are not changed. 
+ EXPECT_EQ(space.getId(VarKind::Domain, 0), Identifier(&identifiers[0])); + EXPECT_EQ(space.getId(VarKind::Symbol, 0), Identifier(&identifiers[3])); + EXPECT_EQ(space.getId(VarKind::Symbol, 1), Identifier(&identifiers[4])); + EXPECT_EQ(space.getId(VarKind::Symbol, 3), Identifier(&identifiers[6])); +} From 920bb5430a96c346f7afcbe288e3546513b58012 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Sat, 20 Jan 2024 09:52:13 +0000 Subject: [PATCH 237/843] [gn build] Port 7a8f5d97afbf --- .../secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn index 62ae7e2aeb5bb..6345d09b41251 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/readability/BUILD.gn @@ -47,6 +47,7 @@ static_library("readability") { "RedundantControlFlowCheck.cpp", "RedundantDeclarationCheck.cpp", "RedundantFunctionPtrDereferenceCheck.cpp", + "RedundantInlineSpecifierCheck.cpp", "RedundantMemberInitCheck.cpp", "RedundantPreprocessorCheck.cpp", "RedundantSmartptrGetCheck.cpp", From 63d7ca924feea1948329ba80c4dc4fad56112be3 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Sat, 20 Jan 2024 11:44:42 +0000 Subject: [PATCH 238/843] [AMDGPU] Add GFX12 llvm.amdgcn.s.wait.*cnt intrinsics (#78723) --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 11 +++ llvm/lib/Target/AMDGPU/SOPInstructions.td | 21 +++-- .../AMDGPU/llvm.amdgcn.s.wait.gfx12.ll | 94 +++++++++++++++++++ 3 files changed, 119 insertions(+), 7 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 9302e590a6fc9..21765cdd13a15 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -311,6 +311,17 @@ def int_amdgcn_iglp_opt : ClangBuiltin<"__builtin_amdgcn_iglp_opt">, def int_amdgcn_s_waitcnt : ClangBuiltin<"__builtin_amdgcn_s_waitcnt">, Intrinsic<[], [llvm_i32_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +// GFX12 intrinsics +class AMDGPUWaitIntrinsic : + Intrinsic<[], [llvm_i16_ty], [ImmArg<ArgIndex<0>>, IntrNoMem, IntrHasSideEffects, IntrWillReturn, IntrNoCallback, IntrNoFree]>; +def int_amdgcn_s_wait_bvhcnt : AMDGPUWaitIntrinsic; +def int_amdgcn_s_wait_dscnt : AMDGPUWaitIntrinsic; +def int_amdgcn_s_wait_expcnt : AMDGPUWaitIntrinsic; +def int_amdgcn_s_wait_kmcnt : AMDGPUWaitIntrinsic; +def int_amdgcn_s_wait_loadcnt : AMDGPUWaitIntrinsic; +def int_amdgcn_s_wait_samplecnt : AMDGPUWaitIntrinsic; +def int_amdgcn_s_wait_storecnt : AMDGPUWaitIntrinsic; + def int_amdgcn_div_scale : DefaultAttrsIntrinsic< // 1st parameter: Numerator // 2nd parameter: Denominator diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index eae4800ade0dc..d9c6b1a35e272 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1720,23 +1720,30 @@ let SubtargetPredicate = HasVGPRSingleUseHintInsts in { let SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 in { def S_WAIT_LOADCNT : - SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_loadcnt timm:$simm16)]>; def S_WAIT_LOADCNT_DSCNT : 
SOPP_Pseudo<"s_wait_loadcnt_dscnt", (ins s16imm:$simm16), "$simm16">; def S_WAIT_STORECNT : - SOPP_Pseudo<"s_wait_storecnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_storecnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_storecnt timm:$simm16)]>; def S_WAIT_STORECNT_DSCNT : SOPP_Pseudo<"s_wait_storecnt_dscnt", (ins s16imm:$simm16), "$simm16">; def S_WAIT_SAMPLECNT : - SOPP_Pseudo<"s_wait_samplecnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_samplecnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_samplecnt timm:$simm16)]>; def S_WAIT_BVHCNT : - SOPP_Pseudo<"s_wait_bvhcnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_bvhcnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_bvhcnt timm:$simm16)]>; def S_WAIT_EXPCNT : - SOPP_Pseudo<"s_wait_expcnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_expcnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_expcnt timm:$simm16)]>; def S_WAIT_DSCNT : - SOPP_Pseudo<"s_wait_dscnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_dscnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_dscnt timm:$simm16)]>; def S_WAIT_KMCNT : - SOPP_Pseudo<"s_wait_kmcnt", (ins s16imm:$simm16), "$simm16">; + SOPP_Pseudo<"s_wait_kmcnt", (ins s16imm:$simm16), "$simm16", + [(int_amdgcn_s_wait_kmcnt timm:$simm16)]>; } // End SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll new file mode 100644 index 0000000000000..f03dbb9eb1645 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.wait.gfx12.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12 +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s -check-prefix=GFX12 + +define amdgpu_ps void @test_bvhcnt() { +; GFX12-LABEL: test_bvhcnt: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.wait.bvhcnt(i16 0) + ret void +} + +define amdgpu_ps void @test_dscnt() { +; GFX12-LABEL: test_dscnt: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_dscnt 0x0 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.wait.dscnt(i16 0) + ret void +} + +define amdgpu_ps void @test_expcnt() { +; GFX12-LABEL: test_expcnt: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.wait.expcnt(i16 0) + ret void +} + +define amdgpu_ps void @test_kmcnt() { +; GFX12-LABEL: test_kmcnt: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.wait.kmcnt(i16 0) + ret void +} + +define amdgpu_ps void @test_loadcnt() { +; GFX12-LABEL: test_loadcnt: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.wait.loadcnt(i16 0) + ret void +} + +define amdgpu_ps void @test_loadcnt_dscnt() { +; GFX12-LABEL: test_loadcnt_dscnt: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.wait.loadcnt(i16 0) + call void @llvm.amdgcn.s.wait.dscnt(i16 0) + ret void +} + +define amdgpu_ps void @test_samplecnt() { +; GFX12-LABEL: test_samplecnt: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_endpgm + call void 
@llvm.amdgcn.s.wait.samplecnt(i16 0) + ret void +} + +define amdgpu_ps void @test_storecnt() { +; GFX12-LABEL: test_storecnt: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.wait.storecnt(i16 0) + ret void +} + +define amdgpu_ps void @test_storecnt_dscnt() { +; GFX12-LABEL: test_storecnt_dscnt: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: s_endpgm + call void @llvm.amdgcn.s.wait.storecnt(i16 0) + call void @llvm.amdgcn.s.wait.dscnt(i16 0) + ret void +} + +declare void @llvm.amdgcn.s.wait.bvhcnt(i16) +declare void @llvm.amdgcn.s.wait.dscnt(i16) +declare void @llvm.amdgcn.s.wait.expcnt(i16) +declare void @llvm.amdgcn.s.wait.kmcnt(i16) +declare void @llvm.amdgcn.s.wait.loadcnt(i16) +declare void @llvm.amdgcn.s.wait.samplecnt(i16) +declare void @llvm.amdgcn.s.wait.storecnt(i16) From fd3346dba825f6b9c2873bdeafe34da8f8b4f3e1 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Sat, 20 Jan 2024 20:05:22 +0800 Subject: [PATCH 239/843] [clang-tidy] fix modernize-use-auto incorrect fix hints for pointer (#77943) --- .../clang-tidy/modernize/UseAutoCheck.cpp | 56 ++++++++++++++----- clang-tools-extra/docs/ReleaseNotes.rst | 4 ++ .../modernize/use-auto-for-pointer.cpp | 29 ++++++++++ 3 files changed, 74 insertions(+), 15 deletions(-) create mode 100644 clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-for-pointer.cpp diff --git a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp index 7af30e688b6a7..aec67808846b1 100644 --- a/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/UseAutoCheck.cpp @@ -8,10 +8,12 @@ #include "UseAutoCheck.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/TypeLoc.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" #include "clang/Basic/CharInfo.h" #include "clang/Tooling/FixIt.h" +#include "llvm/ADT/STLExtras.h" using namespace clang; using namespace clang::ast_matchers; @@ -333,6 +335,25 @@ void UseAutoCheck::replaceIterators(const DeclStmt *D, ASTContext *Context) { << FixItHint::CreateReplacement(Range, "auto"); } +static void ignoreTypeLocClasses( + TypeLoc &Loc, + std::initializer_list<TypeLoc::TypeLocClass> const &LocClasses) { + while (llvm::is_contained(LocClasses, Loc.getTypeLocClass())) + Loc = Loc.getNextTypeLoc(); +} + +static bool isMutliLevelPointerToTypeLocClasses( + TypeLoc Loc, + std::initializer_list<TypeLoc::TypeLocClass> const &LocClasses) { + ignoreTypeLocClasses(Loc, {TypeLoc::Paren, TypeLoc::Qualified}); + TypeLoc::TypeLocClass TLC = Loc.getTypeLocClass(); + if (TLC != TypeLoc::Pointer && TLC != TypeLoc::MemberPointer) + return false; + ignoreTypeLocClasses(Loc, {TypeLoc::Paren, TypeLoc::Qualified, + TypeLoc::Pointer, TypeLoc::MemberPointer}); + return llvm::is_contained(LocClasses, Loc.getTypeLocClass()); +} + void UseAutoCheck::replaceExpr( const DeclStmt *D, ASTContext *Context, llvm::function_ref<QualType(const Type *)> GetType, StringRef Message) { @@ -342,6 +363,10 @@ void UseAutoCheck::replaceExpr( return; const QualType FirstDeclType = FirstDecl->getType().getCanonicalType(); + TypeSourceInfo *TSI = FirstDecl->getTypeSourceInfo(); + + if (TSI == nullptr) + return; std::vector<FixItHint> StarRemovals; for (const auto *Dec : D->decls()) { @@ -383,17 +408,11 @@ void UseAutoCheck::replaceExpr( // is the same as the initializer, just more CV-qualified. 
However, TypeLoc // information is not reliable where CV qualifiers are concerned so we can't // do anything about this case for now. - TypeLoc Loc = FirstDecl->getTypeSourceInfo()->getTypeLoc(); - if (!RemoveStars) { - while (Loc.getTypeLocClass() == TypeLoc::Pointer || - Loc.getTypeLocClass() == TypeLoc::Qualified) - Loc = Loc.getNextTypeLoc(); - } - while (Loc.getTypeLocClass() == TypeLoc::LValueReference || - Loc.getTypeLocClass() == TypeLoc::RValueReference || - Loc.getTypeLocClass() == TypeLoc::Qualified) { - Loc = Loc.getNextTypeLoc(); - } + TypeLoc Loc = TSI->getTypeLoc(); + if (!RemoveStars) + ignoreTypeLocClasses(Loc, {TypeLoc::Pointer, TypeLoc::Qualified}); + ignoreTypeLocClasses(Loc, {TypeLoc::LValueReference, TypeLoc::RValueReference, + TypeLoc::Qualified}); SourceRange Range(Loc.getSourceRange()); if (MinTypeNameLength != 0 && @@ -405,12 +424,19 @@ void UseAutoCheck::replaceExpr( auto Diag = diag(Range.getBegin(), Message); + bool ShouldReplenishVariableName = isMutliLevelPointerToTypeLocClasses( + TSI->getTypeLoc(), {TypeLoc::FunctionProto, TypeLoc::ConstantArray}); + // Space after 'auto' to handle cases where the '*' in the pointer type is // next to the identifier. This avoids changing 'int *p' into 'autop'. - // FIXME: This doesn't work for function pointers because the variable name - // is inside the type. - Diag << FixItHint::CreateReplacement(Range, RemoveStars ? "auto " : "auto") - << StarRemovals; + llvm::StringRef Auto = ShouldReplenishVariableName + ? (RemoveStars ? "auto " : "auto *") + : (RemoveStars ? "auto " : "auto"); + std::string ReplenishedVariableName = + ShouldReplenishVariableName ? FirstDecl->getQualifiedNameAsString() : ""; + std::string Replacement = + (Auto + llvm::StringRef{ReplenishedVariableName}).str(); + Diag << FixItHint::CreateReplacement(Range, Replacement) << StarRemovals; } void UseAutoCheck::check(const MatchFinder::MatchResult &Result) { diff --git a/clang-tools-extra/docs/ReleaseNotes.rst index c8e93231fd11b..bdbdee31f39d1 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -434,6 +434,10 @@ Changes in existing checks false-positives when constructing the container with ``count`` copies of elements with value ``value``. +- Improved :doc:`modernize-use-auto + <clang-tidy/checks/modernize/use-auto>` to avoid creating incorrect fix hints + for pointer-to-array and pointer-to-function types. + - Improved :doc:`modernize-use-emplace <clang-tidy/checks/modernize/use-emplace>` to not replace aggregates that ``emplace`` cannot construct with aggregate initialization. 
diff --git a/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-for-pointer.cpp b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-for-pointer.cpp new file mode 100644 index 0000000000000..8a3e0bab26c12 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/modernize/use-auto-for-pointer.cpp @@ -0,0 +1,29 @@ +// RUN: %check_clang_tidy -check-suffix=REMOVE %s modernize-use-auto %t -- \ +// RUN: -config="{CheckOptions: {modernize-use-auto.RemoveStars: 'true', modernize-use-auto.MinTypeNameLength: '0'}}" +// RUN: %check_clang_tidy %s modernize-use-auto %t -- \ +// RUN: -config="{CheckOptions: {modernize-use-auto.RemoveStars: 'false', modernize-use-auto.MinTypeNameLength: '0'}}" + +void pointerToFunction() { + void (*(*(f1)))() = static_cast<void (**)()>(nullptr); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use auto when initializing + // CHECK-FIXES-REMOVE: auto f1 = + // CHECK-FIXES: auto *f1 = +} + +void pointerToArray() { + int(*a1)[2] = new int[10][2]; + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use auto when initializing + // CHECK-FIXES-REMOVE: auto a1 = + // CHECK-FIXES: auto *a1 = +} + +void memberFunctionPointer() { + class A { + void f(); + }; + void(A::* a1)() = static_cast<void(A::*)()>(nullptr); + // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: use auto when initializing + // CHECK-FIXES-REMOVE: auto a1 = + // CHECK-FIXES: auto *a1 = +} + From a8a3711e745286fd26f726b3397dbe5fb03ea465 Mon Sep 17 00:00:00 2001 From: Kerry McLaughlin Date: Sat, 20 Jan 2024 12:06:00 +0000 Subject: [PATCH 240/843] [AArch64][SME2] Preserve ZT0 state around function calls (#78321) If a function has ZT0 state and calls a function which does not preserve ZT0, the caller must save and restore ZT0 around the call. If the caller shares ZT0 state and the callee does not share ZA, we must additionally call SMSTOP/SMSTART ZA around the call. This patch adds new AArch64 ISD nodes for spilling and filling ZT0. Where requiresPreservingZT0 is true, ZT0 state will be preserved across a call. --- .../Target/AArch64/AArch64ISelLowering.cpp | 48 +++++- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 + .../lib/Target/AArch64/AArch64SMEInstrInfo.td | 10 +- .../AArch64/Utils/AArch64SMEAttributes.h | 9 + llvm/test/CodeGen/AArch64/sme-zt0-state.ll | 155 ++++++++++++++++++ .../Target/AArch64/SMEAttributesTest.cpp | 36 ++++ 6 files changed, 256 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sme-zt0-state.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 09e42b72be63c..96ea692d03f56 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2341,6 +2341,8 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::SMSTART) MAKE_CASE(AArch64ISD::SMSTOP) MAKE_CASE(AArch64ISD::RESTORE_ZA) + MAKE_CASE(AArch64ISD::RESTORE_ZT) + MAKE_CASE(AArch64ISD::SAVE_ZT) MAKE_CASE(AArch64ISD::CALL) MAKE_CASE(AArch64ISD::ADRP) MAKE_CASE(AArch64ISD::ADR) @@ -7654,6 +7656,34 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, }); } + SDValue ZTFrameIdx; + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool ShouldPreserveZT0 = CallerAttrs.requiresPreservingZT0(CalleeAttrs); + + // If the caller has ZT0 state which will not be preserved by the callee, + // spill ZT0 before the call. 
+ if (ShouldPreserveZT0) { + unsigned ZTObj = MFI.CreateSpillStackObject(64, Align(16)); + ZTFrameIdx = DAG.getFrameIndex( + ZTObj, + DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout())); + + Chain = DAG.getNode(AArch64ISD::SAVE_ZT, DL, DAG.getVTList(MVT::Other), + {Chain, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); + } + + // If caller shares ZT0 but the callee is not shared ZA, we need to stop + // PSTATE.ZA before the call if there is no lazy-save active. + bool DisableZA = CallerAttrs.requiresDisablingZABeforeCall(CalleeAttrs); + assert((!DisableZA || !RequiresLazySave) && + "Lazy-save should have PSTATE.SM=1 on entry to the function"); + + if (DisableZA) + Chain = DAG.getNode( + AArch64ISD::SMSTOP, DL, MVT::Other, Chain, + DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), + DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); + // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) @@ -8065,13 +8095,19 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Result, InGlue, PStateSM, false); } - if (RequiresLazySave) { + if (CallerAttrs.requiresEnablingZAAfterCall(CalleeAttrs)) // Unconditionally resume ZA. Result = DAG.getNode( AArch64ISD::SMSTART, DL, MVT::Other, Result, DAG.getTargetConstant((int32_t)(AArch64SVCR::SVCRZA), DL, MVT::i32), DAG.getConstant(0, DL, MVT::i64), DAG.getConstant(1, DL, MVT::i64)); + if (ShouldPreserveZT0) + Result = + DAG.getNode(AArch64ISD::RESTORE_ZT, DL, DAG.getVTList(MVT::Other), + {Result, DAG.getConstant(0, DL, MVT::i32), ZTFrameIdx}); + + if (RequiresLazySave) { // Conditionally restore the lazy save using a pseudo node. unsigned FI = FuncInfo->getLazySaveTPIDR2Obj(); SDValue RegMask = DAG.getRegisterMask( @@ -8100,7 +8136,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, DAG.getConstant(0, DL, MVT::i64)); } - if (RequiresSMChange || RequiresLazySave) { + if (RequiresSMChange || RequiresLazySave || ShouldPreserveZT0) { for (unsigned I = 0; I < InVals.size(); ++I) { // The smstart/smstop is chained as part of the call, but when the // resulting chain is discarded (which happens when the call is not part @@ -23977,6 +24013,14 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N, return DAG.getMergeValues( {A, DAG.getZExtOrTrunc(B, DL, MVT::i1), A.getValue(2)}, DL); } + case Intrinsic::aarch64_sme_ldr_zt: + return DAG.getNode(AArch64ISD::RESTORE_ZT, SDLoc(N), + DAG.getVTList(MVT::Other), N->getOperand(0), + N->getOperand(2), N->getOperand(3)); + case Intrinsic::aarch64_sme_str_zt: + return DAG.getNode(AArch64ISD::SAVE_ZT, SDLoc(N), + DAG.getVTList(MVT::Other), N->getOperand(0), + N->getOperand(2), N->getOperand(3)); default: break; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index 6047a3b7b2864..abecc3560ccbb 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -61,6 +61,8 @@ enum NodeType : unsigned { SMSTART, SMSTOP, RESTORE_ZA, + RESTORE_ZT, + SAVE_ZT, // Produces the full sequence of instructions for getting the thread pointer // offset of a variable into X0, using the TLSDesc model. 
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td index 380f6e1fcfdae..eeae5303a3f89 100644 --- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td @@ -22,6 +22,12 @@ def AArch64_restore_za : SDNode<"AArch64ISD::RESTORE_ZA", SDTypeProfile<0, 3, [SDTCisInt<0>, SDTCisPtrTy<1>]>, [SDNPHasChain, SDNPSideEffect, SDNPVariadic, SDNPOptInGlue]>; +def AArch64_restore_zt : SDNode<"AArch64ISD::RESTORE_ZT", SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect, SDNPMayLoad]>; +def AArch64_save_zt : SDNode<"AArch64ISD::SAVE_ZT", SDTypeProfile<0, 2, + [SDTCisInt<0>, SDTCisPtrTy<1>]>, + [SDNPHasChain, SDNPSideEffect, SDNPMayStore]>; //===----------------------------------------------------------------------===// // Instruction naming conventions. @@ -543,8 +549,8 @@ defm UMOPS_MPPZZ_HtoS : sme2_int_mopx_tile<"umops", 0b101, int_aarch64_sme_umops defm ZERO_T : sme2_zero_zt<"zero", 0b0001>; -defm LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100, int_aarch64_sme_ldr_zt>; -defm STR_TX : sme2_spill_fill_vector<"str", 0b11111100, int_aarch64_sme_str_zt>; +defm LDR_TX : sme2_spill_fill_vector<"ldr", 0b01111100, AArch64_restore_zt>; +defm STR_TX : sme2_spill_fill_vector<"str", 0b11111100, AArch64_save_zt>; def MOVT_XTI : sme2_movt_zt_to_scalar<"movt", 0b0011111>; def MOVT_TIX : sme2_movt_scalar_to_zt<"movt", 0b0011111>; diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h index 6f622f1996a3a..8af219bb361fd 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h @@ -112,6 +112,15 @@ class SMEAttrs { State == StateValue::InOut || State == StateValue::Preserved; } bool hasZT0State() const { return isNewZT0() || sharesZT0(); } + bool requiresPreservingZT0(const SMEAttrs &Callee) const { + return hasZT0State() && !Callee.sharesZT0(); + } + bool requiresDisablingZABeforeCall(const SMEAttrs &Callee) const { + return hasZT0State() && !hasZAState() && Callee.hasPrivateZAInterface(); + } + bool requiresEnablingZAAfterCall(const SMEAttrs &Callee) const { + return requiresLazySave(Callee) || requiresDisablingZABeforeCall(Callee); + } }; } // namespace llvm diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll new file mode 100644 index 0000000000000..88eaf19ec488f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll @@ -0,0 +1,155 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -start-after=simplifycfg -enable-tail-merge=false -verify-machineinstrs < %s | FileCheck %s + +declare void @callee(); + +; +; Private-ZA Callee +; + +; Expect spill & fill of ZT0 around call +; Expect smstop/smstart za around call +define void @zt0_in_caller_no_state_callee() "aarch64_in_zt0" nounwind { +; CHECK-LABEL: zt0_in_caller_no_state_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret + call void @callee(); + ret void; +} + +; Expect spill & fill of ZT0 around call 
+; Expect setup and restore lazy-save around call +; Expect smstart za after call +define void @za_zt0_shared_caller_no_state_callee() "aarch64_pstate_za_shared" "aarch64_in_zt0" nounwind { +; CHECK-LABEL: za_zt0_shared_caller_no_state_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x9, x8, x8, x9 +; CHECK-NEXT: mov sp, x9 +; CHECK-NEXT: sub x10, x29, #16 +; CHECK-NEXT: sub x19, x29, #80 +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x9, [x29, #-16] +; CHECK-NEXT: sturh w8, [x29, #-8] +; CHECK-NEXT: msr TPIDR2_EL0, x10 +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: mrs x8, TPIDR2_EL0 +; CHECK-NEXT: sub x0, x29, #16 +; CHECK-NEXT: cbnz x8, .LBB1_2 +; CHECK-NEXT: // %bb.1: +; CHECK-NEXT: bl __arm_tpidr2_restore +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: msr TPIDR2_EL0, xzr +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee(); + ret void; +} + +; +; Shared-ZA Callee +; + +; Caller and callee have shared ZT0 state, no spill/fill of ZT0 required +define void @zt0_shared_caller_zt0_shared_callee() "aarch64_in_zt0" nounwind { +; CHECK-LABEL: zt0_shared_caller_zt0_shared_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: bl callee +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() "aarch64_in_zt0"; + ret void; +} + +; Expect spill & fill of ZT0 around call +define void @za_zt0_shared_caller_za_shared_callee() "aarch64_pstate_za_shared" "aarch64_in_zt0" nounwind { +; CHECK-LABEL: za_zt0_shared_caller_za_shared_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x19, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: sub x19, x29, #80 +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: bl callee +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldr x19, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() "aarch64_pstate_za_shared"; + ret void; +} + +; Caller and callee have shared ZA & ZT0 +define void @za_zt0_shared_caller_za_zt0_shared_callee() "aarch64_pstate_za_shared" "aarch64_in_zt0" nounwind { +; CHECK-LABEL: za_zt0_shared_caller_za_zt0_shared_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! 
// 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: rdsvl x8, #1 +; CHECK-NEXT: mov x9, sp +; CHECK-NEXT: msub x8, x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: stur wzr, [x29, #-4] +; CHECK-NEXT: sturh wzr, [x29, #-6] +; CHECK-NEXT: stur x8, [x29, #-16] +; CHECK-NEXT: bl callee +; CHECK-NEXT: mov sp, x29 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + call void @callee() "aarch64_pstate_za_shared" "aarch64_in_zt0"; + ret void; +} + +; New-ZA Callee + +; Expect spill & fill of ZT0 around call +; Expect smstop/smstart za around call +define void @zt0_in_caller_zt0_new_callee() "aarch64_in_zt0" nounwind { +; CHECK-LABEL: zt0_in_caller_zt0_new_callee: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #80 +; CHECK-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str zt0, [x19] +; CHECK-NEXT: smstop za +; CHECK-NEXT: bl callee +; CHECK-NEXT: smstart za +; CHECK-NEXT: ldr zt0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #80 +; CHECK-NEXT: ret + call void @callee() "aarch64_new_zt0"; + ret void; +} diff --git a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp index 294e557181424..044de72449ec8 100644 --- a/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp +++ b/llvm/unittests/Target/AArch64/SMEAttributesTest.cpp @@ -191,6 +191,9 @@ TEST(SMEAttributes, Basics) { TEST(SMEAttributes, Transitions) { // Normal -> Normal ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::Normal))); + ASSERT_FALSE(SA(SA::Normal).requiresPreservingZT0(SA(SA::Normal))); + ASSERT_FALSE(SA(SA::Normal).requiresDisablingZABeforeCall(SA(SA::Normal))); + ASSERT_FALSE(SA(SA::Normal).requiresEnablingZAAfterCall(SA(SA::Normal))); // Normal -> Normal + LocallyStreaming ASSERT_FALSE(SA(SA::Normal).requiresSMChange(SA(SA::Normal | SA::SM_Body))); @@ -240,4 +243,37 @@ TEST(SMEAttributes, Transitions) { // Streaming-compatible -> Streaming-compatible + LocallyStreaming ASSERT_FALSE(SA(SA::SM_Compatible) .requiresSMChange(SA(SA::SM_Compatible | SA::SM_Body))); + + SA Private_ZA = SA(SA::Normal); + SA ZA_Shared = SA(SA::ZA_Shared); + SA ZT0_Shared = SA(SA::encodeZT0State(SA::StateValue::In)); + SA ZA_ZT0_Shared = SA(SA::ZA_Shared | SA::encodeZT0State(SA::StateValue::In)); + + // Shared ZA -> Private ZA Interface + ASSERT_FALSE(ZA_Shared.requiresDisablingZABeforeCall(Private_ZA)); + ASSERT_TRUE(ZA_Shared.requiresEnablingZAAfterCall(Private_ZA)); + + // Shared ZT0 -> Private ZA Interface + ASSERT_TRUE(ZT0_Shared.requiresDisablingZABeforeCall(Private_ZA)); + ASSERT_TRUE(ZT0_Shared.requiresPreservingZT0(Private_ZA)); + ASSERT_TRUE(ZT0_Shared.requiresEnablingZAAfterCall(Private_ZA)); + + // Shared ZA & ZT0 -> Private ZA Interface + ASSERT_FALSE(ZA_ZT0_Shared.requiresDisablingZABeforeCall(Private_ZA)); + ASSERT_TRUE(ZA_ZT0_Shared.requiresPreservingZT0(Private_ZA)); + ASSERT_TRUE(ZA_ZT0_Shared.requiresEnablingZAAfterCall(Private_ZA)); + + // Shared ZA -> Shared ZA Interface + ASSERT_FALSE(ZA_Shared.requiresDisablingZABeforeCall(ZT0_Shared)); + ASSERT_FALSE(ZA_Shared.requiresEnablingZAAfterCall(ZT0_Shared)); + + // Shared ZT0 -> Shared ZA Interface + ASSERT_FALSE(ZT0_Shared.requiresDisablingZABeforeCall(ZT0_Shared)); + ASSERT_FALSE(ZT0_Shared.requiresPreservingZT0(ZT0_Shared)); + ASSERT_FALSE(ZT0_Shared.requiresEnablingZAAfterCall(ZT0_Shared)); + + // Shared ZA & ZT0 -> Shared ZA Interface + 
ASSERT_FALSE(ZA_ZT0_Shared.requiresDisablingZABeforeCall(ZT0_Shared)); + ASSERT_FALSE(ZA_ZT0_Shared.requiresPreservingZT0(ZT0_Shared)); + ASSERT_FALSE(ZA_ZT0_Shared.requiresEnablingZAAfterCall(ZT0_Shared)); } From fcb6737f82246c6046526f699c9a82a96f71ab55 Mon Sep 17 00:00:00 2001 From: Hirofumi Nakamura Date: Sat, 20 Jan 2024 21:15:58 +0900 Subject: [PATCH 241/843] [clang-format] Support of TableGen identifiers beginning with a number. (#78571) TableGen allows identifiers beginning with a number. This patch adds support for recognizing such identifiers. --- clang/lib/Format/FormatTokenLexer.cpp | 42 ++++++++++++++++++- clang/lib/Format/FormatTokenLexer.h | 4 ++ clang/unittests/Format/TokenAnnotatorTest.cpp | 23 ++++++++++ 3 files changed, 68 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/FormatTokenLexer.cpp b/clang/lib/Format/FormatTokenLexer.cpp index 25ac9be57c81a..52a55ea23b5f2 100644 --- a/clang/lib/Format/FormatTokenLexer.cpp +++ b/clang/lib/Format/FormatTokenLexer.cpp @@ -93,8 +93,10 @@ ArrayRef<FormatToken *> FormatTokenLexer::lex() { // string literals are correctly identified. handleCSharpVerbatimAndInterpolatedStrings(); } - if (Style.isTableGen()) + if (Style.isTableGen()) { handleTableGenMultilineString(); + handleTableGenNumericLikeIdentifier(); + } if (Tokens.back()->NewlinesBefore > 0 || Tokens.back()->IsMultiline) FirstInLineIndex = Tokens.size() - 1; } while (Tokens.back()->isNot(tok::eof)); @@ -804,6 +806,44 @@ void FormatTokenLexer::handleTableGenMultilineString() { FirstLineText, MultiLineString->OriginalColumn, Style.TabWidth, Encoding); } +void FormatTokenLexer::handleTableGenNumericLikeIdentifier() { + FormatToken *Tok = Tokens.back(); + // TableGen identifiers can begin with digits. Such tokens are lexed as + // numeric_constant now. + if (Tok->isNot(tok::numeric_constant)) + return; + StringRef Text = Tok->TokenText; + // The following check is based on llvm::TGLexer::LexToken. + // That lexes the token as a number if any of the following holds: + // 1. It starts with '+', '-'. + // 2. All the characters are digits. + // 3. The first non-digit character is 'b', and the next is '0' or '1'. + // 4. The first non-digit character is 'x', and the next is a hex digit. + // Note that in cases 3 and 4, if the next character does not exist in + // this token, the token is an identifier. + if (Text.size() < 1 || Text[0] == '+' || Text[0] == '-') + return; + const auto NonDigitPos = Text.find_if([](char C) { return !isdigit(C); }); + // All the characters are digits + if (NonDigitPos == StringRef::npos) + return; + char FirstNonDigit = Text[NonDigitPos]; + if (NonDigitPos < Text.size() - 1) { + char TheNext = Text[NonDigitPos + 1]; + // Regarded as a binary number. + if (FirstNonDigit == 'b' && (TheNext == '0' || TheNext == '1')) + return; + // Regarded as a hex number. + if (FirstNonDigit == 'x' && isxdigit(TheNext)) + return; + } + if (isalpha(FirstNonDigit) || FirstNonDigit == '_') { + // This is actually an identifier in TableGen. + Tok->Tok.setKind(tok::identifier); + Tok->Tok.setIdentifierInfo(nullptr); + } +} + void FormatTokenLexer::handleTemplateStrings() { FormatToken *BacktickToken = Tokens.back(); diff --git a/clang/lib/Format/FormatTokenLexer.h b/clang/lib/Format/FormatTokenLexer.h index 1dec6bbc41514..65dd733bd5335 100644 --- a/clang/lib/Format/FormatTokenLexer.h +++ b/clang/lib/Format/FormatTokenLexer.h @@ -97,6 +97,10 @@ class FormatTokenLexer { // Handles TableGen multiline strings. It has the form [{ ... }]. 
void handleTableGenMultilineString(); + +// Handles TableGen numeric-like identifiers. + // They have the form [0-9]*[_a-zA-Z]([_a-zA-Z0-9]*), but only when the + // token cannot be lexed as an integer. + void handleTableGenNumericLikeIdentifier(); void tryParsePythonComment(); diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 117d8fe8f7dc1..64b2abac5cce5 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -2209,6 +2209,29 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) { EXPECT_EQ(Tokens[0]->ColumnWidth, sizeof("[{ It can break\n") - 1); EXPECT_TRUE(Tokens[0]->IsMultiline); EXPECT_EQ(Tokens[0]->LastLineColumnWidth, sizeof(" the string. }]") - 1); + + // Identifier tokens. In TableGen, identifiers can begin with a number. + // In ambiguous cases, the lexer tries to lex the token as a number. + // Even if that attempt fails, it does not fall back to identifier lexing; + // the token is regarded as an error. + // The ambiguity is not documented. The results of these tests are based on + // the implementation of llvm::TGLexer::LexToken. + Tokens = Annotate("1234"); + EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown); + Tokens = Annotate("0x1abC"); + EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown); + // This is invalid number syntax, but not an identifier. + Tokens = Annotate("0x1234x"); + EXPECT_TOKEN(Tokens[0], tok::numeric_constant, TT_Unknown); + Tokens = Annotate("identifier"); + EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); + // Identifiers beginning with a number. + Tokens = Annotate("0x"); + EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); + Tokens = Annotate("2dVector"); + EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); + Tokens = Annotate("01234Vector"); + EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown); } TEST_F(TokenAnnotatorTest, UnderstandConstructors) { From d0986519d58e6d71656019cfa6604efa4bf6d3e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Sat, 20 Jan 2024 16:15:44 +0200 Subject: [PATCH 242/843] [LLD] [COFF] Preserve directives and export names from LTO objects (#78802) The export names are saved as StringRefs pointing into the COFF directives. For LTO objects, that memory can be owned by the LTO InputFile, which gets destructed when the LTO compilation is done. With LTO objects produced by an older version of LLVM, which must be upgraded when loaded, the directives string therefore gets destructed; with LTO objects of a matching version (the common case), the directives string happens to point into memory that survives the LTO compilation. Test this by linking a bundled binary LTO object file from an older version of LLVM. This fixes issue #78591, and downstream issue https://github.com/mstorsjo/llvm-mingw/issues/392. 
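A minimal sketch of the fix (the two code lines below appear verbatim in the diff; `saver` is the linker's bump-allocated StringSaver, so the saved copy outlives the LTO input file):

    // Before: `directives` may point into memory owned by the LTO InputFile,
    // which can be freed once the bitcode has been upgraded and compiled.
    directives = obj->getCOFFLinkerOpts();
    // After: copy the directives into allocator-backed storage first.
    directives = saver.save(obj->getCOFFLinkerOpts());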
--- lld/COFF/InputFiles.cpp | 2 +- lld/test/COFF/Inputs/lto-directives.obj | Bin 0 -> 2316 bytes lld/test/COFF/lto-directives.test | 15 +++++++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 lld/test/COFF/Inputs/lto-directives.obj create mode 100644 lld/test/COFF/lto-directives.test diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index cd744800cb0de..22cc0e3e5dbaf 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -1081,7 +1081,7 @@ void BitcodeFile::parse() { if (objSym.isUsed()) ctx.config.gcroot.push_back(sym); } - directives = obj->getCOFFLinkerOpts(); + directives = saver.save(obj->getCOFFLinkerOpts()); } void BitcodeFile::parseLazy() { diff --git a/lld/test/COFF/Inputs/lto-directives.obj b/lld/test/COFF/Inputs/lto-directives.obj new file mode 100644 index 0000000000000000000000000000000000000000..a413b0ab92f4e43833627109e869a402ad98d5b7 GIT binary patch literal 2316 zcma)8e{2)i9e<8P&Jc4pkhMv3cibgmO<74UPDt$7k!&Bdri)ZV*_2I8_;;N)n2TfL zBt?~vb9Y7!EFk~kKbT~%Okz_z1u1_p2~Bp+P2J2zDXJ=zS&>Zw7!+94vI%L`w(lAC z|DN=H@B4lC-uHcfe7;}X&SlpQHX*bPA>`4r9dG^OFM-egweXw9qp@KX>@iw|o&nQX z$DlfpUjo8}_Di-x{gDQ}!E{*_*Y4Y+F{s?LNJs$*5`m=?;S-;o$R3 z`=Q;^tloCH&R5V}Z=$(367-iUgCX9$zjOaw&)NACmuR^Fde77F4y=Z@f`<0Ifta6k zq_oUFm|9-desp!tW$>M#JG|%8*LI;jKH7B=Y0=SUgsgC<^={j<|G-f3XnUtfYwLeU z-p-M$F><*=EDGdmmRfcb)e5V_O zs*QS-A&`fvGQ^UbTt(DIg{)?XC4pMAQL9$sQI05P2+($)A)mrY3)0MJuxXw=zTt3}DD5B~9oE#6e;9oCC_&-u^I$ zSJOKm<;ZVyq=IqOCQhrm_N+7l^X0^@f_N+K4;P zFV?;m?L@}19CEBTa{JSIgfxl+R)lD@!|D;ONB@mb>~+}3VB0`J_IC}>&2Ink zn@jbaXM2;)mXvEZ&~@v z-TD^3<-~jSd2)>*?l9z{n+mY!%{N8!3TIkSTW*S$xY}IYYMOLm9|W<1AfyT#h)F!a z%Y{vLVw2Cdf!b(HI;oMyo24*hHzmCOWD9>%#{ZGUH(9l(pg`(5u!xdd!Af2rm)#BD z&T*y!XG+!P8?rIU853%AQtpOFe>i9UMs1##wO@=%{D>6h#YhmFa!HY9DUz02ZoQ*> zf^^?G@r^lr-Kwk5_$GKkcao(XwP>SmGek+CN@V~j`A9_tnvWXixxp)xM_iDAokZU>eLzM#MB13Mu8;*Y@8jG@do*ROI{IF=*;LHgS<=M}7 zfTvOv?9pH|ome;}oLJZ6n_2vx6aSuV&)v2`b-jG2Lau=Uju!L0IJ6+P#%1FyXDZ5; z8#z-!1|Ka&$HKS<1FOa%fFn}N-=eoz+#!_(QAiVH#eMRqBar82EBB1yE5U99e->3UQmMEJNsqAX7zaK20uX$g+)E z%MpPNFN;@{{OUl-J(gp{GtJn9Lpqt3&c@*G^`y!&D_fGXafKTMI29pcS_ktZyPE=2 zZ{^5iLKB*%5qcA_9;O^Lm(aFs4;vhGyD)p{($|O5`c9vv$6&uW$9xi`y{fmu#O^^j z8}$a=;gb`iGk)h0jvn?;`p3>hBW}-_|9EgTG<^K+!y!d@haVe_fb_G_Xv90_4!uiH z`oj~Wr$cm4e;3=;M|TDz5q@H@yZhbINbroO%X|7%_t@Cv^XUK|KK-8G8|i|NwAO2jzI7M= zzzJoc+fa%__rQriosS9FbZ4zD?!>!C! zfm(DED;c8j9BA&GO81o3Rh&BLJ&lQ4!j*Q0zJ|Wm04r9sTouh%porB&Y0&}&Jtr&u z+fBdd09i_eyMNs#pB=5zPPW%+P9%b)io8$x@@3p9CWH-HNdqv+ECcq@Wb!?Uf_1C zlUvG$@ Date: Sat, 20 Jan 2024 09:20:22 -0500 Subject: [PATCH 243/843] Fix typo in AttrDocs.td (__single_inhertiance => __single_inheritance) (#78838) --- clang/include/clang/Basic/AttrDocs.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/AttrDocs.td b/clang/include/clang/Basic/AttrDocs.td index 9e8190614fbe8..7e633f8e2635a 100644 --- a/clang/include/clang/Basic/AttrDocs.td +++ b/clang/include/clang/Basic/AttrDocs.td @@ -3621,7 +3621,7 @@ Attribute ``trivial_abi`` has no effect in the following cases: def MSInheritanceDocs : Documentation { let Category = DocCatDecl; - let Heading = "__single_inhertiance, __multiple_inheritance, __virtual_inheritance"; + let Heading = "__single_inheritance, __multiple_inheritance, __virtual_inheritance"; let Content = [{ This collection of keywords is enabled under ``-fms-extensions`` and controls the pointer-to-member representation used on ``*-*-win32`` targets. 
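The documentation retitled above describes a subtle feature, so a brief illustration may help; this sketch is not part of the patch, and the class names are invented. Under ``-fms-extensions`` on a win32 target, these keywords fix the member-pointer representation of a class before it is defined:

```cpp
// Check with: clang++ -fms-extensions --target=x86_64-pc-windows-msvc -fsyntax-only
class __single_inheritance S;   // member pointers to S use the smallest model
int S::*DataMember;             // declarable while S is still incomplete

class __multiple_inheritance M; // wider model, permits multiple bases
void (M::*MemberFunc)();

class __virtual_inheritance V;  // widest model, permits virtual bases
int V::*VDataMember;
```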
From 0f80f5e362fb43a9335bd154c5f7976a96e32cfc Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Sat, 20 Jan 2024 22:43:15 +0800 Subject: [PATCH 244/843] [NFC] fix typo in clang/include/clang/Sema/ScopeInfo.h --- clang/include/clang/Sema/ScopeInfo.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Sema/ScopeInfo.h b/clang/include/clang/Sema/ScopeInfo.h index b2f6e3289f41f..6eaa74382685b 100644 --- a/clang/include/clang/Sema/ScopeInfo.h +++ b/clang/include/clang/Sema/ScopeInfo.h @@ -66,7 +66,7 @@ namespace sema { /// parsed. class CompoundScopeInfo { public: - /// Whether this compound stamement contains `for' or `while' loops + /// Whether this compound statement contains `for' or `while' loops /// with empty bodies. bool HasEmptyLoopBodies = false; @@ -168,7 +168,7 @@ class FunctionScopeInfo { /// to build, the initial and final coroutine suspend points bool NeedsCoroutineSuspends : 1; - /// An enumeration represeting the kind of the first coroutine statement + /// An enumeration representing the kind of the first coroutine statement /// in the function. One of co_return, co_await, or co_yield. unsigned char FirstCoroutineStmtKind : 2; @@ -220,7 +220,7 @@ class FunctionScopeInfo { /// The initial and final coroutine suspend points. std::pair CoroutineSuspends; - /// The stack of currently active compound stamement scopes in the + /// The stack of currently active compound statement scopes in the /// function. SmallVector CompoundScopes; From b9a1e2ab8dead4863834f70cdae56104ec92d041 Mon Sep 17 00:00:00 2001 From: nsurbay <21074287+nsurbay@users.noreply.github.com> Date: Sat, 20 Jan 2024 18:07:14 +0100 Subject: [PATCH 245/843] [AArch64] Rename LDAPRpre to LDAPRpost (#77340) The feature FEAT_LRCPC3 introduces post-increment versions of the LDAPR and LDIAPP instructions. The current disassembly of these instructions is correct, but the opcode name is misleading, with a 'pre' suffix instead of 'post'.
see : - https://developer.arm.com/documentation/ddi0602/2023-12/Base-Instructions/LDAPR--Load-Acquire-RCpc-Register- - https://developer.arm.com/documentation/ddi0602/2023-12/Base-Instructions/LDIAPP--Load-Acquire-RCpc-ordered-Pair-of-registers- --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 28 ++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index ddbe840079a57..6a744689b79ec 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -9380,15 +9380,15 @@ let Predicates = [HasLSE128] in { //===----------------------------------------------------------------------===// let Predicates = [HasRCPC3] in { - // size opc opc2 - def STILPWpre: BaseLRCPC3IntegerLoadStorePair<0b10, 0b00, 0b0000, (outs GPR64sp:$wback), (ins GPR32:$Rt, GPR32:$Rt2, GPR64sp:$Rn), "stilp", "\t$Rt, $Rt2, [$Rn, #-8]!", "$Rn = $wback">; - def STILPXpre: BaseLRCPC3IntegerLoadStorePair<0b11, 0b00, 0b0000, (outs GPR64sp:$wback), (ins GPR64:$Rt, GPR64:$Rt2, GPR64sp:$Rn), "stilp", "\t$Rt, $Rt2, [$Rn, #-16]!", "$Rn = $wback">; - def STILPW: BaseLRCPC3IntegerLoadStorePair<0b10, 0b00, 0b0001, (outs), (ins GPR32:$Rt, GPR32:$Rt2, GPR64sp:$Rn), "stilp", "\t$Rt, $Rt2, [$Rn]", "">; - def STILPX: BaseLRCPC3IntegerLoadStorePair<0b11, 0b00, 0b0001, (outs), (ins GPR64:$Rt, GPR64:$Rt2, GPR64sp:$Rn), "stilp", "\t$Rt, $Rt2, [$Rn]", "">; - def LDIAPPWpre: BaseLRCPC3IntegerLoadStorePair<0b10, 0b01, 0b0000, (outs GPR64sp:$wback, GPR32:$Rt, GPR32:$Rt2), (ins GPR64sp:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn], #8", "$Rn = $wback">; - def LDIAPPXpre: BaseLRCPC3IntegerLoadStorePair<0b11, 0b01, 0b0000, (outs GPR64sp:$wback, GPR64:$Rt, GPR64:$Rt2), (ins GPR64sp:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn], #16", "$Rn = $wback">; - def LDIAPPW: BaseLRCPC3IntegerLoadStorePair<0b10, 0b01, 0b0001, (outs GPR32:$Rt, GPR32:$Rt2), (ins GPR64sp0:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn]", "">; - def LDIAPPX: BaseLRCPC3IntegerLoadStorePair<0b11, 0b01, 0b0001, (outs GPR64:$Rt, GPR64:$Rt2), (ins GPR64sp0:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn]", "">; + // size opc opc2 + def STILPWpre: BaseLRCPC3IntegerLoadStorePair<0b10, 0b00, 0b0000, (outs GPR64sp:$wback), (ins GPR32:$Rt, GPR32:$Rt2, GPR64sp:$Rn), "stilp", "\t$Rt, $Rt2, [$Rn, #-8]!", "$Rn = $wback">; + def STILPXpre: BaseLRCPC3IntegerLoadStorePair<0b11, 0b00, 0b0000, (outs GPR64sp:$wback), (ins GPR64:$Rt, GPR64:$Rt2, GPR64sp:$Rn), "stilp", "\t$Rt, $Rt2, [$Rn, #-16]!", "$Rn = $wback">; + def STILPW: BaseLRCPC3IntegerLoadStorePair<0b10, 0b00, 0b0001, (outs), (ins GPR32:$Rt, GPR32:$Rt2, GPR64sp:$Rn), "stilp", "\t$Rt, $Rt2, [$Rn]", "">; + def STILPX: BaseLRCPC3IntegerLoadStorePair<0b11, 0b00, 0b0001, (outs), (ins GPR64:$Rt, GPR64:$Rt2, GPR64sp:$Rn), "stilp", "\t$Rt, $Rt2, [$Rn]", "">; + def LDIAPPWpost: BaseLRCPC3IntegerLoadStorePair<0b10, 0b01, 0b0000, (outs GPR64sp:$wback, GPR32:$Rt, GPR32:$Rt2), (ins GPR64sp:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn], #8", "$Rn = $wback">; + def LDIAPPXpost: BaseLRCPC3IntegerLoadStorePair<0b11, 0b01, 0b0000, (outs GPR64sp:$wback, GPR64:$Rt, GPR64:$Rt2), (ins GPR64sp:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn], #16", "$Rn = $wback">; + def LDIAPPW: BaseLRCPC3IntegerLoadStorePair<0b10, 0b01, 0b0001, (outs GPR32:$Rt, GPR32:$Rt2), (ins GPR64sp0:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn]", "">; + def LDIAPPX: BaseLRCPC3IntegerLoadStorePair<0b11, 0b01, 0b0001, (outs GPR64:$Rt, GPR64:$Rt2), (ins GPR64sp0:$Rn), "ldiapp", "\t$Rt, $Rt2, [$Rn]", "">; def : 
Pat<(AArch64ldiapp GPR64sp:$Rn), (LDIAPPX GPR64sp:$Rn)>; def : Pat<(AArch64stilp GPR64:$Rt, GPR64:$Rt2, GPR64sp:$Rn), (STILPX GPR64:$Rt, GPR64:$Rt2, GPR64sp:$Rn)>; @@ -9397,11 +9397,11 @@ let Predicates = [HasRCPC3] in { def : InstAlias<"stilp\t$Rt, $Rt2, [$Rn, #0]", (STILPW GPR32: $Rt, GPR32: $Rt2, GPR64sp:$Rn)>; def : InstAlias<"stilp\t$Rt, $Rt2, [$Rn, #0]", (STILPX GPR64: $Rt, GPR64: $Rt2, GPR64sp:$Rn)>; - // size opc - def STLRWpre: BaseLRCPC3IntegerLoadStore<0b10, 0b10, (outs GPR64sp:$wback), (ins GPR32:$Rt, GPR64sp:$Rn), "stlr", "\t$Rt, [$Rn, #-4]!", "$Rn = $wback">; - def STLRXpre: BaseLRCPC3IntegerLoadStore<0b11, 0b10, (outs GPR64sp:$wback), (ins GPR64:$Rt, GPR64sp:$Rn), "stlr", "\t$Rt, [$Rn, #-8]!", "$Rn = $wback">; - def LDAPRWpre: BaseLRCPC3IntegerLoadStore<0b10, 0b11, (outs GPR64sp:$wback, GPR32:$Rt), (ins GPR64sp:$Rn), "ldapr", "\t$Rt, [$Rn], #4", "$Rn = $wback">; - def LDAPRXpre: BaseLRCPC3IntegerLoadStore<0b11, 0b11, (outs GPR64sp:$wback, GPR64:$Rt), (ins GPR64sp:$Rn), "ldapr", "\t$Rt, [$Rn], #8", "$Rn = $wback">; + // size opc + def STLRWpre: BaseLRCPC3IntegerLoadStore<0b10, 0b10, (outs GPR64sp:$wback), (ins GPR32:$Rt, GPR64sp:$Rn), "stlr", "\t$Rt, [$Rn, #-4]!", "$Rn = $wback">; + def STLRXpre: BaseLRCPC3IntegerLoadStore<0b11, 0b10, (outs GPR64sp:$wback), (ins GPR64:$Rt, GPR64sp:$Rn), "stlr", "\t$Rt, [$Rn, #-8]!", "$Rn = $wback">; + def LDAPRWpost: BaseLRCPC3IntegerLoadStore<0b10, 0b11, (outs GPR64sp:$wback, GPR32:$Rt), (ins GPR64sp:$Rn), "ldapr", "\t$Rt, [$Rn], #4", "$Rn = $wback">; + def LDAPRXpost: BaseLRCPC3IntegerLoadStore<0b11, 0b11, (outs GPR64sp:$wback, GPR64:$Rt), (ins GPR64sp:$Rn), "ldapr", "\t$Rt, [$Rn], #8", "$Rn = $wback">; } let Predicates = [HasRCPC3, HasNEON] in { From 818de32f31e8075657dd27938e4aeb1a46f3f631 Mon Sep 17 00:00:00 2001 From: kelbon <58717435+kelbon@users.noreply.github.com> Date: Sat, 20 Jan 2024 18:37:35 +0100 Subject: [PATCH 246/843] Warning for incorrect use of 'pure' attribute (#78200) This adds a warning when applying the `pure` attribute along with the `const` attribute, or when applying the `pure` attribute to a function with a `void` return type (including constructors and destructors). Fixes https://github.com/llvm/llvm-project/issues/77482 --- clang/docs/ReleaseNotes.rst | 2 +- .../clang/Basic/DiagnosticSemaKinds.td | 7 +++++ clang/lib/Sema/SemaDecl.cpp | 28 +++++++++++++++++++ clang/test/Analysis/call-invalidation.cpp | 8 +++--- clang/test/CodeGen/function-attributes.c | 6 ++-- clang/test/CodeGen/pragma-weak.c | 6 ++-- clang/test/Import/attr/Inputs/S.cpp | 2 +- clang/test/Import/attr/test.cpp | 4 +-- clang/test/Index/attributes.c | 4 +-- .../Interpreter/disambiguate-decl-stmt.cpp | 4 +-- clang/test/Sema/attr-print.c | 8 +++--- clang/test/Sema/incorrect_pure.cpp | 14 ++++++++++ clang/test/SemaCXX/attr-print.cpp | 12 ++++---- .../SemaCXX/cxx0x-cursory-default-delete.cpp | 2 +- clang/test/SemaCXX/cxx11-attr-print.cpp | 8 +++--- .../test/SemaCXX/warn-unused-value-cxx11.cpp | 2 +- 16 files changed, 83 insertions(+), 34 deletions(-) create mode 100644 clang/test/Sema/incorrect_pure.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 4c69bdf131aaa..8bb26fcae18d6 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -581,7 +581,7 @@ Improvements to Clang's diagnostics and template friend declarations with a constraint that depends on a template parameter from an enclosing template must be a definition. 
- Clang now diagnoses function/variable templates that shadow their own template parameters, e.g. ``template void T();``. - +- Clang now diagnoses incorrect usage of ``const`` and ``pure`` attributes, so ``-Wignored-attributes`` diagnoses more cases. - Clang now emits more descriptive diagnostics for 'unusual' expressions (e.g. incomplete index expressions on matrix types or builtin functions without an argument list) as placement-args to new-expressions. diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 23692758e9b8a..dd24767b62f3c 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -692,6 +692,13 @@ def warn_maybe_falloff_nonvoid_function : Warning< def warn_falloff_nonvoid_function : Warning< "non-void function does not return a value">, InGroup; +def warn_const_attr_with_pure_attr : Warning< + "'const' attribute imposes more restrictions; 'pure' attribute ignored">, + InGroup; +def warn_pure_function_returns_void : Warning< + "'%select{pure|const}0' attribute on function returning 'void'; attribute ignored">, + InGroup; + def err_maybe_falloff_nonvoid_block : Error< "non-void block does not return a value in all control paths">; def err_falloff_nonvoid_block : Error< diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index cbb8ed8ce34d1..8dff2cdc063df 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -11801,6 +11801,32 @@ static bool CheckMultiVersionFunction(Sema &S, FunctionDecl *NewFD, OldDecl, Previous); } +static void CheckConstPureAttributesUsage(Sema &S, FunctionDecl *NewFD) { + bool IsPure = NewFD->hasAttr(); + bool IsConst = NewFD->hasAttr(); + + // If there are no pure or const attributes, there's nothing to check. + if (!IsPure && !IsConst) + return; + + // If the function is marked both pure and const, we retain the const + // attribute because it makes stronger guarantees than the pure attribute, and + // we drop the pure attribute explicitly to prevent later confusion about + // semantics. + if (IsPure && IsConst) { + S.Diag(NewFD->getLocation(), diag::warn_const_attr_with_pure_attr); + NewFD->dropAttrs(); + } + + // Constructors and destructors are functions which return void, so are + // handled here as well. + if (NewFD->getReturnType()->isVoidType()) { + S.Diag(NewFD->getLocation(), diag::warn_pure_function_returns_void) + << IsConst; + NewFD->dropAttrs(); + } +} + /// Perform semantic checking of a new function declaration. /// /// Performs semantic analysis of the new function declaration @@ -11898,6 +11924,8 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, NewFD->setInvalidDecl(); } + CheckConstPureAttributesUsage(*this, NewFD); + // C++11 [dcl.constexpr]p8: // A constexpr specifier for a non-static member function that is not // a constructor declares that member function to be const. 
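To make the new diagnostics concrete, here is an illustrative sketch that is not part of the patch's test suite; the function names are invented. `const` promises that the result depends only on the argument values, `pure` additionally permits reading global state, and both exist so calls can be merged or removed; a function returning `void` can only matter through side effects, which is exactly what the attributes claim not to have:

```cpp
int GlobalCounter;

// OK: the result may depend on reads of global memory, but there are no
// side effects, so repeated calls can be folded.
__attribute__((pure)) int currentCount() { return GlobalCounter; }

// OK: the result depends on the arguments alone.
__attribute__((const)) int square(int X) { return X * X; }

// Warns after this patch: a void return defeats the point of the
// attribute, so it is ignored.
__attribute__((pure)) void resetLog();
__attribute__((const)) void flushCache();

// Warns after this patch: 'const' already implies everything 'pure'
// promises, so the weaker 'pure' attribute is dropped.
__attribute__((const, pure)) int increment(int X) { return X + 1; }
```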
diff --git a/clang/test/Analysis/call-invalidation.cpp b/clang/test/Analysis/call-invalidation.cpp index ef6505e19cf80..727217f228b05 100644 --- a/clang/test/Analysis/call-invalidation.cpp +++ b/clang/test/Analysis/call-invalidation.cpp @@ -90,8 +90,8 @@ void testConstReferenceStruct() { } -void usePointerPure(int * const *) __attribute__((pure)); -void usePointerConst(int * const *) __attribute__((const)); +int usePointerPure(int * const *) __attribute__((pure)); +int usePointerConst(int * const *) __attribute__((const)); void testPureConst() { extern int global; @@ -104,11 +104,11 @@ void testPureConst() { clang_analyzer_eval(x == 42); // expected-warning{{TRUE}} clang_analyzer_eval(global == -5); // expected-warning{{TRUE}} - usePointerPure(&p); + (void)usePointerPure(&p); clang_analyzer_eval(x == 42); // expected-warning{{TRUE}} clang_analyzer_eval(global == -5); // expected-warning{{TRUE}} - usePointerConst(&p); + (void)usePointerConst(&p); clang_analyzer_eval(x == 42); // expected-warning{{TRUE}} clang_analyzer_eval(global == -5); // expected-warning{{TRUE}} diff --git a/clang/test/CodeGen/function-attributes.c b/clang/test/CodeGen/function-attributes.c index 845f3baf7a4ee..bb8670a8530e0 100644 --- a/clang/test/CodeGen/function-attributes.c +++ b/clang/test/CodeGen/function-attributes.c @@ -57,9 +57,9 @@ int f12(int arg) { return arg ? 0 : f10_t(); } -// CHECK: define{{.*}} void @f13() [[NUW_OS_RN:#[0-9]+]] -void f13(void) __attribute__((pure)) __attribute__((const)); -void f13(void){} +// CHECK: define{{.*}} i32 @f13() [[NUW_OS_RN:#[0-9]+]] +int f13(void) __attribute__((const)); +int f13(void){ return 0; } // [irgen] clang isn't setting the optsize bit on functions diff --git a/clang/test/CodeGen/pragma-weak.c b/clang/test/CodeGen/pragma-weak.c index 52328bf9ff1be..ea508a485ef61 100644 --- a/clang/test/CodeGen/pragma-weak.c +++ b/clang/test/CodeGen/pragma-weak.c @@ -16,7 +16,7 @@ // CHECK-DAG: @declfirstattr = weak{{.*}} alias void (), ptr @__declfirstattr // CHECK-DAG: @mix2 = weak{{.*}} alias void (), ptr @__mix2 // CHECK-DAG: @a1 = weak{{.*}} alias void (), ptr @__a1 -// CHECK-DAG: @xxx = weak{{.*}} alias void (), ptr @__xxx +// CHECK-DAG: @xxx = weak{{.*}} alias i32 (), ptr @__xxx // CHECK-DAG: @undecfunc_alias1 = weak{{.*}} alias void (), ptr @undecfunc // CHECK-DAG: @undecfunc_alias2 = weak{{.*}} alias void (), ptr @undecfunc // CHECK-DAG: @undecfunc_alias3 = weak{{.*}} alias void (), ptr @undecfunc @@ -137,8 +137,8 @@ void __a1(void) {} // CHECK: define{{.*}} void @__a1() [[NI:#[0-9]+]] #pragma weak xxx = __xxx -__attribute((pure,noinline,const)) void __xxx(void) { } -// CHECK: void @__xxx() [[RN:#[0-9]+]] +__attribute((noinline,const)) int __xxx(void) { return 0; } +// CHECK: i32 @__xxx() [[RN:#[0-9]+]] ///////////// PR28611: Try multiple aliases of same undeclared symbol or alias #pragma weak undecfunc_alias1 = undecfunc diff --git a/clang/test/Import/attr/Inputs/S.cpp b/clang/test/Import/attr/Inputs/S.cpp index 28d70c544a7ca..cf9af91838e8e 100644 --- a/clang/test/Import/attr/Inputs/S.cpp +++ b/clang/test/Import/attr/Inputs/S.cpp @@ -1,4 +1,4 @@ -extern void f() __attribute__((const)); +extern char f() __attribute__((const)); struct S { struct { diff --git a/clang/test/Import/attr/test.cpp b/clang/test/Import/attr/test.cpp index c9b2d6ed3433a..9e2d6ee1cbac9 100644 --- a/clang/test/Import/attr/test.cpp +++ b/clang/test/Import/attr/test.cpp @@ -14,13 +14,13 @@ // CHECK-NEXT: LoopHintAttr // CHECK-SAME: line:10:9 -extern void f() __attribute__((const)); +extern char f() 
__attribute__((const)); struct S; void stmt(); void expr() { - f(); + (void)f(); struct S s; } diff --git a/clang/test/Index/attributes.c b/clang/test/Index/attributes.c index a5d10a1835914..0a9b0bf50aabe 100644 --- a/clang/test/Index/attributes.c +++ b/clang/test/Index/attributes.c @@ -4,8 +4,8 @@ struct __attribute__((packed)) Test2 { char a; }; -void pure_fn() __attribute__((pure)); -void const_fn() __attribute__((const)); +char pure_fn() __attribute__((pure)); +char const_fn() __attribute__((const)); void noduplicate_fn() __attribute__((noduplicate)); enum __attribute((flag_enum)) FlagEnum { diff --git a/clang/test/Interpreter/disambiguate-decl-stmt.cpp b/clang/test/Interpreter/disambiguate-decl-stmt.cpp index a49d7013c540a..1f4d5e267288b 100644 --- a/clang/test/Interpreter/disambiguate-decl-stmt.cpp +++ b/clang/test/Interpreter/disambiguate-decl-stmt.cpp @@ -95,10 +95,10 @@ Ns::Ns::Fs(); Ns::Ns::Ns(); struct Attrs1 { Attrs1(); }; -Attrs1::Attrs1() __attribute((pure)) = default; +Attrs1::Attrs1() __attribute((noreturn)) = default; struct Attrs2 { Attrs2(); }; -__attribute((pure)) Attrs2::Attrs2() = default; +__attribute((noreturn)) Attrs2::Attrs2() = default; // Extra semicolon namespace N {}; diff --git a/clang/test/Sema/attr-print.c b/clang/test/Sema/attr-print.c index ffa27de4a5d30..8492356e5d2e5 100644 --- a/clang/test/Sema/attr-print.c +++ b/clang/test/Sema/attr-print.c @@ -9,11 +9,11 @@ __declspec(align(4)) int y; // CHECK: short arr[3] __attribute__((aligned)); short arr[3] __attribute__((aligned)); -// CHECK: void foo(void) __attribute__((const)); -void foo(void) __attribute__((const)); +// CHECK: int foo(void) __attribute__((const)); +int foo(void) __attribute__((const)); -// CHECK: void bar(void) __attribute__((__const)); -void bar(void) __attribute__((__const)); +// CHECK: int bar(void) __attribute__((__const)); +int bar(void) __attribute__((__const)); // CHECK: int * __ptr32 p32; int * __ptr32 p32; diff --git a/clang/test/Sema/incorrect_pure.cpp b/clang/test/Sema/incorrect_pure.cpp new file mode 100644 index 0000000000000..69ae41c421300 --- /dev/null +++ b/clang/test/Sema/incorrect_pure.cpp @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 -fsyntax-only -verify %s + +[[gnu::pure]] void foo(); // expected-warning{{'pure' attribute on function returning 'void'; attribute ignored}} + +[[gnu::const]] void bar(); // expected-warning{{'const' attribute on function returning 'void'; attribute ignored}} + +struct A { + [[gnu::pure]] A(); // expected-warning{{'pure' attribute on function returning 'void'; attribute ignored}} + + [[gnu::const]] A(int); // expected-warning{{'const' attribute on function returning 'void'; attribute ignored}} + [[gnu::pure]] ~A(); // expected-warning{{'pure' attribute on function returning 'void'; attribute ignored}} + + [[gnu::const]] [[gnu::pure]] int m(); // expected-warning{{'const' attribute imposes more restrictions; 'pure' attribute ignored}} +}; diff --git a/clang/test/SemaCXX/attr-print.cpp b/clang/test/SemaCXX/attr-print.cpp index dff290be696be..c7c58a25504ee 100644 --- a/clang/test/SemaCXX/attr-print.cpp +++ b/clang/test/SemaCXX/attr-print.cpp @@ -6,15 +6,15 @@ int x __attribute__((aligned(4))); // CHECK: __declspec(align(4)) int y; __declspec(align(4)) int y; -// CHECK: void foo() __attribute__((const)); -void foo() __attribute__((const)); +// CHECK: int foo() __attribute__((const)); +int foo() __attribute__((const)); -// CHECK: void bar() __attribute__((__const)); -void bar() __attribute__((__const)); +// CHECK: int bar() __attribute__((__const)); +int 
bar() __attribute__((__const)); // FIXME: Print this with correct format. -// CHECK: void foo1() __attribute__((noinline)) __attribute__((pure)); -void foo1() __attribute__((noinline, pure)); +// CHECK: int foo1() __attribute__((noinline)) __attribute__((pure)); +int foo1() __attribute__((noinline, pure)); // CHECK: typedef int Small1 __attribute__((mode(byte))); typedef int Small1 __attribute__((mode(byte))); diff --git a/clang/test/SemaCXX/cxx0x-cursory-default-delete.cpp b/clang/test/SemaCXX/cxx0x-cursory-default-delete.cpp index 9d68a0e5d358f..6ae146f0d08c7 100644 --- a/clang/test/SemaCXX/cxx0x-cursory-default-delete.cpp +++ b/clang/test/SemaCXX/cxx0x-cursory-default-delete.cpp @@ -194,7 +194,7 @@ struct except_spec_d_match : except_spec_a, except_spec_b { // gcc-compatibility: allow attributes on default definitions // (but not normal definitions) struct S { S(); }; -S::S() __attribute((pure)) = default; +S::S() __attribute((noreturn)) = default; using size_t = decltype(sizeof(0)); void *operator new(size_t) = delete; // expected-error {{deleted definition must be first declaration}} expected-note {{implicit}} diff --git a/clang/test/SemaCXX/cxx11-attr-print.cpp b/clang/test/SemaCXX/cxx11-attr-print.cpp index 7095c462031d9..c988972aeb1a5 100644 --- a/clang/test/SemaCXX/cxx11-attr-print.cpp +++ b/clang/test/SemaCXX/cxx11-attr-print.cpp @@ -30,11 +30,11 @@ alignas(4) int cxx11_alignas; // CHECK: int c11_alignas _Alignas(int); _Alignas(int) int c11_alignas; -// CHECK: void foo() __attribute__((const)); -void foo() __attribute__((const)); +// CHECK: int foo() __attribute__((const)); +int foo() __attribute__((const)); -// CHECK: void bar() __attribute__((__const)); -void bar() __attribute__((__const)); +// CHECK: int bar() __attribute__((__const)); +int bar() __attribute__((__const)); // FIXME: It's unfortunate that the string literal prints with the below three // cases given that the string is only exposed via the [[nodiscard]] spelling. diff --git a/clang/test/SemaCXX/warn-unused-value-cxx11.cpp b/clang/test/SemaCXX/warn-unused-value-cxx11.cpp index a6f41c3fd6b3b..687278a98f4e1 100644 --- a/clang/test/SemaCXX/warn-unused-value-cxx11.cpp +++ b/clang/test/SemaCXX/warn-unused-value-cxx11.cpp @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -std=c++11 -fsyntax-only -verify -Wunused-value %s -void f() __attribute__((const)); +int f() __attribute__((const)); namespace PR18571 { // Unevaluated contexts should not trigger unused result warnings. From ec0ac85e58f0a80cc52a132336b132ffe7b50b59 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Sat, 20 Jan 2024 12:53:03 -0600 Subject: [PATCH 247/843] [Clang][Obvious] Correctly disable Windows on linker-wrapper test --- clang/test/Driver/linker-wrapper.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/Driver/linker-wrapper.c b/clang/test/Driver/linker-wrapper.c index a6de616b05e9f..632df63c797e4 100644 --- a/clang/test/Driver/linker-wrapper.c +++ b/clang/test/Driver/linker-wrapper.c @@ -2,7 +2,7 @@ // REQUIRES: nvptx-registered-target // REQUIRES: amdgpu-registered-target -// UNSUPPORTED: system-linux +// REQUIRES: system-linux // An externally visible variable so static libraries extract. __attribute__((visibility("protected"), used)) int x; From 0880742a60e9436e439eaee05bf0c8acf4c48a62 Mon Sep 17 00:00:00 2001 From: Nathan Sidwell Date: Sat, 20 Jan 2024 14:23:37 -0500 Subject: [PATCH 248/843] [NFC] Rename internal fns (#77994) Internal functions should use a lowerCaseName, thus renamed. 
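For readers unfamiliar with the convention being applied, the LLVM Coding Standards use UpperCamelCase nouns for type and variable names and lowerCamelCase verb phrases for function names, including file-local static helpers like those renamed below. A tiny self-contained sketch of the style follows; it is illustrative only, with invented names, and is not code from the patch:

```cpp
#include <string>

// Types: UpperCamelCase nouns.
struct UpgradeResult {
  bool Changed; // Variables, including members: UpperCamelCase.
};

// Functions: lowerCamelCase verb phrases; 'static' marks them as internal
// to the translation unit, like the helpers renamed in this patch.
static UpgradeResult upgradeIntrinsicName(const std::string &Name) {
  // An old-style UpperCamelCase function name signals a pending rename.
  bool Changed = !Name.empty() && Name[0] >= 'A' && Name[0] <= 'Z';
  return {Changed};
}

int main() {
  return upgradeIntrinsicName("UpgradePTESTIntrinsic").Changed ? 0 : 1;
}
```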
--- llvm/lib/IR/AutoUpgrade.cpp | 268 +++++++++++++++++------------------- 1 file changed, 130 insertions(+), 138 deletions(-) diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index ffb8e3a91b668..b90bbe71ac189 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -53,7 +53,7 @@ static void rename(GlobalValue *GV) { GV->setName(GV->getName() + ".old"); } // Upgrade the declarations of the SSE4.1 ptest intrinsics whose arguments have // changed their type from v4f32 to v2i64. -static bool UpgradePTESTIntrinsic(Function* F, Intrinsic::ID IID, +static bool upgradePTESTIntrinsic(Function *F, Intrinsic::ID IID, Function *&NewFn) { // Check whether this is an old version of the function, which received // v4f32 arguments. @@ -69,7 +69,7 @@ static bool UpgradePTESTIntrinsic(Function* F, Intrinsic::ID IID, // Upgrade the declarations of intrinsic functions whose 8-bit immediate mask // arguments have changed their type from i32 to i8. -static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID, +static bool upgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID, Function *&NewFn) { // Check that the last argument is an i32. Type *LastArgType = F->getFunctionType()->getParamType( @@ -85,7 +85,7 @@ static bool UpgradeX86IntrinsicsWith8BitMask(Function *F, Intrinsic::ID IID, // Upgrade the declaration of fp compare intrinsics that change return type // from scalar to vXi1 mask. -static bool UpgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID, +static bool upgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID, Function *&NewFn) { // Check if the return type is a vector. if (F->getReturnType()->isVectorTy()) @@ -96,7 +96,7 @@ static bool UpgradeX86MaskedFPCompare(Function *F, Intrinsic::ID IID, return true; } -static bool UpgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID, +static bool upgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID, Function *&NewFn) { if (F->getReturnType()->getScalarType()->isBFloatTy()) return false; @@ -106,7 +106,7 @@ static bool UpgradeX86BF16Intrinsic(Function *F, Intrinsic::ID IID, return true; } -static bool UpgradeX86BF16DPIntrinsic(Function *F, Intrinsic::ID IID, +static bool upgradeX86BF16DPIntrinsic(Function *F, Intrinsic::ID IID, Function *&NewFn) { if (F->getFunctionType()->getParamType(1)->getScalarType()->isBFloatTy()) return false; @@ -116,7 +116,7 @@ static bool UpgradeX86BF16DPIntrinsic(Function *F, Intrinsic::ID IID, return true; } -static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { +static bool shouldUpgradeX86Intrinsic(Function *F, StringRef Name) { // All of the intrinsics matches below should be marked with which llvm // version started autoupgrading them. At some point in the future we would // like to use this information to remove upgrade code for some older @@ -483,13 +483,13 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) { Name.starts_with("vcvtph2ps.")); // Added in 11.0 } -static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, +static bool upgradeX86IntrinsicFunction(Function *F, StringRef Name, Function *&NewFn) { // Only handle intrinsics that start with "x86.". 
if (!Name.consume_front("x86.")) return false; - if (ShouldUpgradeX86Intrinsic(F, Name)) { + if (shouldUpgradeX86Intrinsic(F, Name)) { NewFn = nullptr; return true; } @@ -515,7 +515,7 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, .Case("nzc", Intrinsic::x86_sse41_ptestnzc) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) - return UpgradePTESTIntrinsic(F, ID, NewFn); + return upgradePTESTIntrinsic(F, ID, NewFn); return false; } @@ -533,7 +533,7 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, .Case("avx2.mpsadbw", Intrinsic::x86_avx2_mpsadbw) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) - return UpgradeX86IntrinsicsWith8BitMask(F, ID, NewFn); + return upgradeX86IntrinsicsWith8BitMask(F, ID, NewFn); if (Name.consume_front("avx512.mask.cmp.")) { // Added in 7.0 @@ -546,7 +546,7 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, .Case("ps.512", Intrinsic::x86_avx512_mask_cmp_ps_512) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) - return UpgradeX86MaskedFPCompare(F, ID, NewFn); + return upgradeX86MaskedFPCompare(F, ID, NewFn); return false; // No other 'x86.avx523.mask.cmp.*'. } @@ -567,7 +567,7 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, Intrinsic::x86_avx512bf16_cvtneps2bf16_512) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) - return UpgradeX86BF16Intrinsic(F, ID, NewFn); + return upgradeX86BF16Intrinsic(F, ID, NewFn); // Added in 9.0 ID = StringSwitch(Name) @@ -576,7 +576,7 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, .Case("dpbf16ps.512", Intrinsic::x86_avx512bf16_dpbf16ps_512) .Default(Intrinsic::not_intrinsic); if (ID != Intrinsic::not_intrinsic) - return UpgradeX86BF16DPIntrinsic(F, ID, NewFn); + return upgradeX86BF16DPIntrinsic(F, ID, NewFn); return false; // No other 'x86.avx512bf16.*'. } @@ -623,7 +623,7 @@ static bool UpgradeX86IntrinsicFunction(Function *F, StringRef Name, // Upgrade ARM (IsArm) or Aarch64 (!IsArm) intrinsic fns. Return true iff so. // IsArm: 'arm.*', !IsArm: 'aarch64.*'. -static bool UpgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, +static bool upgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, StringRef Name, Function *&NewFn) { if (Name.starts_with("rbit")) { @@ -899,7 +899,7 @@ static bool UpgradeArmOrAarch64IntrinsicFunction(bool IsArm, Function *F, return false; // No other 'arm.*', 'aarch64.*'. 
} -static Intrinsic::ID ShouldUpgradeNVPTXBF16Intrinsic(StringRef Name) { +static Intrinsic::ID shouldUpgradeNVPTXBF16Intrinsic(StringRef Name) { if (Name.consume_front("abs.")) return StringSwitch(Name) .Case("bf16", Intrinsic::nvvm_abs_bf16) @@ -979,7 +979,7 @@ static Intrinsic::ID ShouldUpgradeNVPTXBF16Intrinsic(StringRef Name) { return Intrinsic::not_intrinsic; } -static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { +static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn) { assert(F && "Illegal to upgrade a non-existent Function."); StringRef Name = F->getName(); @@ -993,7 +993,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { case 'a': { bool IsArm = Name.consume_front("arm."); if (IsArm || Name.consume_front("aarch64.")) { - if (UpgradeArmOrAarch64IntrinsicFunction(IsArm, F, Name, NewFn)) + if (upgradeArmOrAarch64IntrinsicFunction(IsArm, F, Name, NewFn)) return true; break; } @@ -1190,7 +1190,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { // Check for nvvm intrinsics that need a return type adjustment. if (!F->getReturnType()->getScalarType()->isBFloatTy()) { - Intrinsic::ID IID = ShouldUpgradeNVPTXBF16Intrinsic(Name); + Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name); if (IID != Intrinsic::not_intrinsic) { NewFn = nullptr; return true; @@ -1354,7 +1354,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { break; case 'x': - if (UpgradeX86IntrinsicFunction(F, Name, NewFn)) + if (upgradeX86IntrinsicFunction(F, Name, NewFn)) return true; } @@ -1399,7 +1399,7 @@ static bool UpgradeIntrinsicFunction1(Function *F, Function *&NewFn) { bool llvm::UpgradeIntrinsicFunction(Function *F, Function *&NewFn) { NewFn = nullptr; - bool Upgraded = UpgradeIntrinsicFunction1(F, NewFn); + bool Upgraded = upgradeIntrinsicFunction1(F, NewFn); assert(F != NewFn && "Intrinsic function upgraded to the same function"); // Upgrade intrinsic attributes. This does not change the function. @@ -1443,8 +1443,8 @@ GlobalVariable *llvm::UpgradeGlobalVariable(GlobalVariable *GV) { // Handles upgrading SSE2/AVX2/AVX512BW PSLLDQ intrinsics by converting them // to byte shuffles. -static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder, - Value *Op, unsigned Shift) { +static Value *upgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder, Value *Op, + unsigned Shift) { auto *ResultTy = cast(Op->getType()); unsigned NumElts = ResultTy->getNumElements() * 8; @@ -1477,7 +1477,7 @@ static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder, // Handles upgrading SSE2/AVX2/AVX512BW PSRLDQ intrinsics by converting them // to byte shuffles. -static Value *UpgradeX86PSRLDQIntrinsics(IRBuilder<> &Builder, Value *Op, +static Value *upgradeX86PSRLDQIntrinsics(IRBuilder<> &Builder, Value *Op, unsigned Shift) { auto *ResultTy = cast(Op->getType()); unsigned NumElts = ResultTy->getNumElements() * 8; @@ -1529,8 +1529,8 @@ static Value *getX86MaskVec(IRBuilder<> &Builder, Value *Mask, return Mask; } -static Value *EmitX86Select(IRBuilder<> &Builder, Value *Mask, - Value *Op0, Value *Op1) { +static Value *emitX86Select(IRBuilder<> &Builder, Value *Mask, Value *Op0, + Value *Op1) { // If the mask is all ones just emit the first operation. 
if (const auto *C = dyn_cast(Mask)) if (C->isAllOnesValue()) @@ -1541,8 +1541,8 @@ static Value *EmitX86Select(IRBuilder<> &Builder, Value *Mask, return Builder.CreateSelect(Mask, Op0, Op1); } -static Value *EmitX86ScalarSelect(IRBuilder<> &Builder, Value *Mask, - Value *Op0, Value *Op1) { +static Value *emitX86ScalarSelect(IRBuilder<> &Builder, Value *Mask, Value *Op0, + Value *Op1) { // If the mask is all ones just emit the first operation. if (const auto *C = dyn_cast(Mask)) if (C->isAllOnesValue()) @@ -1558,7 +1558,7 @@ static Value *EmitX86ScalarSelect(IRBuilder<> &Builder, Value *Mask, // Handle autoupgrade for masked PALIGNR and VALIGND/Q intrinsics. // PALIGNR handles large immediates by shifting while VALIGN masks the immediate // so we need to handle both cases. VALIGN also doesn't have 128-bit lanes. -static Value *UpgradeX86ALIGNIntrinsics(IRBuilder<> &Builder, Value *Op0, +static Value *upgradeX86ALIGNIntrinsics(IRBuilder<> &Builder, Value *Op0, Value *Op1, Value *Shift, Value *Passthru, Value *Mask, bool IsVALIGN) { @@ -1600,10 +1600,10 @@ static Value *UpgradeX86ALIGNIntrinsics(IRBuilder<> &Builder, Value *Op0, Value *Align = Builder.CreateShuffleVector( Op1, Op0, ArrayRef(Indices, NumElts), "palignr"); - return EmitX86Select(Builder, Mask, Align, Passthru); + return emitX86Select(Builder, Mask, Align, Passthru); } -static Value *UpgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI, +static Value *upgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI, bool ZeroMask, bool IndexForm) { Type *Ty = CI.getType(); unsigned VecWidth = Ty->getPrimitiveSizeInBits(); @@ -1661,10 +1661,10 @@ static Value *UpgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI, Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Builder.CreateBitCast(CI.getArgOperand(1), Ty); - return EmitX86Select(Builder, CI.getArgOperand(3), V, PassThru); + return emitX86Select(Builder, CI.getArgOperand(3), V, PassThru); } -static Value *UpgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallBase &CI, +static Value *upgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallBase &CI, Intrinsic::ID IID) { Type *Ty = CI.getType(); Value *Op0 = CI.getOperand(0); @@ -1675,7 +1675,7 @@ static Value *UpgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallBase &CI, if (CI.arg_size() == 4) { // For masked intrinsics. Value *VecSrc = CI.getOperand(2); Value *Mask = CI.getOperand(3); - Res = EmitX86Select(Builder, Mask, Res, VecSrc); + Res = emitX86Select(Builder, Mask, Res, VecSrc); } return Res; } @@ -1702,7 +1702,7 @@ static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallBase &CI, if (CI.arg_size() == 4) { // For masked intrinsics. Value *VecSrc = CI.getOperand(2); Value *Mask = CI.getOperand(3); - Res = EmitX86Select(Builder, Mask, Res, VecSrc); + Res = emitX86Select(Builder, Mask, Res, VecSrc); } return Res; } @@ -1775,14 +1775,13 @@ static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallBase &CI, ZeroMask ? ConstantAggregateZero::get(CI.getType()) : CI.getArgOperand(0); Value *Mask = CI.getOperand(NumArgs - 1); - Res = EmitX86Select(Builder, Mask, Res, VecSrc); + Res = emitX86Select(Builder, Mask, Res, VecSrc); } return Res; } -static Value *UpgradeMaskedStore(IRBuilder<> &Builder, - Value *Ptr, Value *Data, Value *Mask, - bool Aligned) { +static Value *upgradeMaskedStore(IRBuilder<> &Builder, Value *Ptr, Value *Data, + Value *Mask, bool Aligned) { // Cast the pointer to the right type. 
Ptr = Builder.CreateBitCast(Ptr, llvm::PointerType::getUnqual(Data->getType())); @@ -1802,9 +1801,8 @@ static Value *UpgradeMaskedStore(IRBuilder<> &Builder, return Builder.CreateMaskedStore(Data, Ptr, Alignment, Mask); } -static Value *UpgradeMaskedLoad(IRBuilder<> &Builder, - Value *Ptr, Value *Passthru, Value *Mask, - bool Aligned) { +static Value *upgradeMaskedLoad(IRBuilder<> &Builder, Value *Ptr, + Value *Passthru, Value *Mask, bool Aligned) { Type *ValTy = Passthru->getType(); // Cast the pointer to the right type. Ptr = Builder.CreateBitCast(Ptr, llvm::PointerType::getUnqual(ValTy)); @@ -1832,7 +1830,7 @@ static Value *upgradeAbs(IRBuilder<> &Builder, CallBase &CI) { Function *F = Intrinsic::getDeclaration(CI.getModule(), Intrinsic::abs, Ty); Value *Res = Builder.CreateCall(F, {Op0, Builder.getInt1(false)}); if (CI.arg_size() == 3) - Res = EmitX86Select(Builder, CI.getArgOperand(2), Res, CI.getArgOperand(1)); + Res = emitX86Select(Builder, CI.getArgOperand(2), Res, CI.getArgOperand(1)); return Res; } @@ -1860,13 +1858,13 @@ static Value *upgradePMULDQ(IRBuilder<> &Builder, CallBase &CI, bool IsSigned) { Value *Res = Builder.CreateMul(LHS, RHS); if (CI.arg_size() == 4) - Res = EmitX86Select(Builder, CI.getArgOperand(3), Res, CI.getArgOperand(2)); + Res = emitX86Select(Builder, CI.getArgOperand(3), Res, CI.getArgOperand(2)); return Res; } // Applying mask on vector of i1's and make sure result is at least 8 bits wide. -static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder, Value *Vec, +static Value *applyX86MaskOn1BitsVec(IRBuilder<> &Builder, Value *Vec, Value *Mask) { unsigned NumElts = cast(Vec->getType())->getNumElements(); if (Mask) { @@ -1916,19 +1914,19 @@ static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallBase &CI, Value *Mask = CI.getArgOperand(CI.arg_size() - 1); - return ApplyX86MaskOn1BitsVec(Builder, Cmp, Mask); + return applyX86MaskOn1BitsVec(Builder, Cmp, Mask); } // Replace a masked intrinsic with an older unmasked intrinsic. 
-static Value *UpgradeX86MaskedShift(IRBuilder<> &Builder, CallBase &CI, +static Value *upgradeX86MaskedShift(IRBuilder<> &Builder, CallBase &CI, Intrinsic::ID IID) { Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID); Value *Rep = Builder.CreateCall(Intrin, { CI.getArgOperand(0), CI.getArgOperand(1) }); - return EmitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2)); + return emitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2)); } -static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallBase &CI) { +static Value *upgradeMaskedMove(IRBuilder<> &Builder, CallBase &CI) { Value* A = CI.getArgOperand(0); Value* B = CI.getArgOperand(1); Value* Src = CI.getArgOperand(2); @@ -1942,8 +1940,7 @@ static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallBase &CI) { return Builder.CreateInsertElement(A, Select, (uint64_t)0); } - -static Value* UpgradeMaskToInt(IRBuilder<> &Builder, CallBase &CI) { +static Value *upgradeMaskToInt(IRBuilder<> &Builder, CallBase &CI) { Value* Op = CI.getArgOperand(0); Type* ReturnOp = CI.getType(); unsigned NumElts = cast(CI.getType())->getNumElements(); @@ -2185,7 +2182,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, Rep = Builder.CreateCall(Intrinsic::getDeclaration(CI.getModule(), IID), Args); unsigned NumArgs = CI.arg_size(); - Rep = EmitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep, + Rep = emitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep, CI.getArgOperand(NumArgs - 2)); return true; } @@ -2201,7 +2198,7 @@ void llvm::UpgradeInlineAsmString(std::string *AsmStr) { } } -static Value *UpgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, +static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, IRBuilder<> &Builder) { if (Name == "mve.vctp64.old") { // Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the @@ -2295,7 +2292,7 @@ static Value *UpgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, llvm_unreachable("Unknown function for ARM CallBase upgrade."); } -static Value *UpgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, +static Value *upgradeAMDGCNIntrinsicCall(StringRef Name, CallBase *CI, Function *F, IRBuilder<> &Builder) { const bool IsInc = Name.starts_with("atomic.inc."); if (IsInc || Name.starts_with("atomic.dec.")) { @@ -2441,7 +2438,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { if (IsX86 && Name == "avx512.mask.store.ss") { Value *Mask = Builder.CreateAnd(CI->getArgOperand(2), Builder.getInt8(1)); - UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1), + upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1), Mask, false); // Remove intrinsic. @@ -2449,10 +2446,10 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { return; } - if (IsX86 && (Name.starts_with("avx512.mask.store"))) { + if (IsX86 && Name.starts_with("avx512.mask.store")) { // "avx512.mask.storeu." or "avx512.mask.store." bool Aligned = Name[17] != 'u'; // "avx512.mask.storeu". - UpgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1), + upgradeMaskedStore(Builder, CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), Aligned); // Remove intrinsic. 
@@ -2508,7 +2505,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { CI->getType()), {CI->getArgOperand(0)}); } - Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (IsX86 && (Name.starts_with("avx512.ptestm") || Name.starts_with("avx512.ptestnm"))) { @@ -2521,12 +2518,12 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { ICmpInst::Predicate Pred = Name.starts_with("avx512.ptestm") ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ; Rep = Builder.CreateICmp(Pred, Rep, Zero); - Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, Mask); + Rep = applyX86MaskOn1BitsVec(Builder, Rep, Mask); } else if (IsX86 && (Name.starts_with("avx512.mask.pbroadcast"))){ unsigned NumElts = cast(CI->getArgOperand(1)->getType()) ->getNumElements(); Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0)); - Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (IsX86 && (Name.starts_with("avx512.kunpck"))) { unsigned NumElts = CI->getType()->getScalarSizeInBits(); @@ -2627,7 +2624,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), { CI->getOperand(0), CI->getArgOperand(1) }); - Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); + Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (IsX86 && Name.starts_with("avx512.mask.fpclass.p")) { Type *OpTy = CI->getArgOperand(0)->getType(); unsigned VecWidth = OpTy->getPrimitiveSizeInBits(); @@ -2650,7 +2647,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), { CI->getOperand(0), CI->getArgOperand(1) }); - Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); + Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (IsX86 && Name.starts_with("avx512.cmp.p")) { SmallVector Args(CI->args()); Type *OpTy = Args[0]->getType(); @@ -2693,7 +2690,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Value *Op = CI->getArgOperand(0); Value *Zero = llvm::Constant::getNullValue(Op->getType()); Rep = Builder.CreateICmp(ICmpInst::ICMP_SLT, Op, Zero); - Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, nullptr); + Rep = applyX86MaskOn1BitsVec(Builder, Rep, nullptr); } else if(IsX86 && (Name == "ssse3.pabs.b.128" || Name == "ssse3.pabs.w.128" || Name == "ssse3.pabs.d.128" || @@ -2705,25 +2702,25 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Name == "sse41.pmaxsd" || Name.starts_with("avx2.pmaxs") || Name.starts_with("avx512.mask.pmaxs"))) { - Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smax); + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smax); } else if (IsX86 && (Name == "sse2.pmaxu.b" || Name == "sse41.pmaxuw" || Name == "sse41.pmaxud" || Name.starts_with("avx2.pmaxu") || Name.starts_with("avx512.mask.pmaxu"))) { - Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umax); + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umax); } else if (IsX86 && (Name == "sse41.pminsb" || Name == "sse2.pmins.w" || Name == "sse41.pminsd" || Name.starts_with("avx2.pmins") || Name.starts_with("avx512.mask.pmins"))) { - Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smin); + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::smin); } else if (IsX86 && 
(Name == "sse2.pminu.b" || Name == "sse41.pminuw" || Name == "sse41.pminud" || Name.starts_with("avx2.pminu") || Name.starts_with("avx512.mask.pminu"))) { - Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umin); + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::umin); } else if (IsX86 && (Name == "sse2.pmulu.dq" || Name == "avx2.pmulu.dq" || Name == "avx512.pmulu.dq.512" || @@ -2798,7 +2795,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } if (CI->arg_size() >= 3) - Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (IsX86 && (Name.starts_with("avx512.mask.vcvtph2ps.") || Name.starts_with("vcvtph2ps."))) { @@ -2814,13 +2811,13 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep, FixedVectorType::get(Type::getHalfTy(C), NumDstElts)); Rep = Builder.CreateFPExt(Rep, DstTy, "cvtph2ps"); if (CI->arg_size() >= 3) - Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (IsX86 && Name.starts_with("avx512.mask.load")) { // "avx512.mask.loadu." or "avx512.mask.load." bool Aligned = Name[16] != 'u'; // "avx512.mask.loadu". Rep = - UpgradeMaskedLoad(Builder, CI->getArgOperand(0), CI->getArgOperand(1), + upgradeMaskedLoad(Builder, CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), Aligned); } else if (IsX86 && Name.starts_with("avx512.mask.expand.load.")) { auto *ResultTy = cast(CI->getType()); @@ -2966,7 +2963,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { : Builder.CreateZExt(SV, DstTy); // If there are 3 arguments, it's a masked intrinsic so we need a select. if (CI->arg_size() == 3) - Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (Name == "avx512.mask.pmov.qd.256" || Name == "avx512.mask.pmov.qd.512" || @@ -2974,7 +2971,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Name == "avx512.mask.pmov.wb.512") { Type *Ty = CI->getArgOperand(1)->getType(); Rep = Builder.CreateTrunc(CI->getArgOperand(0), Ty); - Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (IsX86 && (Name.starts_with("avx.vbroadcastf128") || Name == "avx2.vbroadcasti128")) { @@ -3010,7 +3007,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } Rep = Builder.CreateShuffleVector(CI->getArgOperand(0), CI->getArgOperand(1), ShuffleMask); - Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, CI->getArgOperand(3)); }else if (IsX86 && (Name.starts_with("avx512.mask.broadcastf") || Name.starts_with("avx512.mask.broadcasti"))) { @@ -3027,7 +3024,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateShuffleVector(CI->getArgOperand(0), CI->getArgOperand(0), ShuffleMask); - Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (IsX86 && (Name.starts_with("avx2.pbroadcast") || Name.starts_with("avx2.vbroadcast") || @@ -3042,64 +3039,60 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateShuffleVector(Op, M); if (CI->arg_size() == 3) - Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + Rep = 
emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (IsX86 && (Name.starts_with("sse2.padds.") || Name.starts_with("avx2.padds.") || Name.starts_with("avx512.padds.") || Name.starts_with("avx512.mask.padds."))) { - Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::sadd_sat); + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::sadd_sat); } else if (IsX86 && (Name.starts_with("sse2.psubs.") || Name.starts_with("avx2.psubs.") || Name.starts_with("avx512.psubs.") || Name.starts_with("avx512.mask.psubs."))) { - Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::ssub_sat); + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::ssub_sat); } else if (IsX86 && (Name.starts_with("sse2.paddus.") || Name.starts_with("avx2.paddus.") || Name.starts_with("avx512.mask.paddus."))) { - Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::uadd_sat); + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::uadd_sat); } else if (IsX86 && (Name.starts_with("sse2.psubus.") || Name.starts_with("avx2.psubus.") || Name.starts_with("avx512.mask.psubus."))) { - Rep = UpgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::usub_sat); + Rep = upgradeX86BinaryIntrinsics(Builder, *CI, Intrinsic::usub_sat); } else if (IsX86 && Name.starts_with("avx512.mask.palignr.")) { - Rep = UpgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0), - CI->getArgOperand(1), - CI->getArgOperand(2), - CI->getArgOperand(3), - CI->getArgOperand(4), - false); + Rep = upgradeX86ALIGNIntrinsics( + Builder, CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), CI->getArgOperand(3), CI->getArgOperand(4), + false); } else if (IsX86 && Name.starts_with("avx512.mask.valign.")) { - Rep = UpgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0), - CI->getArgOperand(1), - CI->getArgOperand(2), - CI->getArgOperand(3), - CI->getArgOperand(4), - true); + Rep = upgradeX86ALIGNIntrinsics( + Builder, CI->getArgOperand(0), CI->getArgOperand(1), + CI->getArgOperand(2), CI->getArgOperand(3), CI->getArgOperand(4), + true); } else if (IsX86 && (Name == "sse2.psll.dq" || Name == "avx2.psll.dq")) { // 128/256-bit shift left specified in bits. unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); - Rep = UpgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0), + Rep = upgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0), Shift / 8); // Shift is in bits. } else if (IsX86 && (Name == "sse2.psrl.dq" || Name == "avx2.psrl.dq")) { // 128/256-bit shift right specified in bits. unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); - Rep = UpgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0), + Rep = upgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0), Shift / 8); // Shift is in bits. } else if (IsX86 && (Name == "sse2.psll.dq.bs" || Name == "avx2.psll.dq.bs" || Name == "avx512.psll.dq.512")) { // 128/256/512-bit shift left specified in bytes. unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); - Rep = UpgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0), Shift); + Rep = upgradeX86PSLLDQIntrinsics(Builder, CI->getArgOperand(0), Shift); } else if (IsX86 && (Name == "sse2.psrl.dq.bs" || Name == "avx2.psrl.dq.bs" || Name == "avx512.psrl.dq.512")) { // 128/256/512-bit shift right specified in bytes. 
unsigned Shift = cast(CI->getArgOperand(1))->getZExtValue(); - Rep = UpgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0), Shift); + Rep = upgradeX86PSRLDQIntrinsics(Builder, CI->getArgOperand(0), Shift); } else if (IsX86 && (Name == "sse41.pblendw" || Name.starts_with("sse41.blendp") || Name.starts_with("avx.blend.p") || @@ -3160,7 +3153,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { // If the intrinsic has a mask operand, handle that. if (CI->arg_size() == 5) - Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, CI->getArgOperand(3)); } else if (IsX86 && (Name.starts_with("avx.vextractf128.") || Name == "avx2.vextracti128" || @@ -3185,7 +3178,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { // If the intrinsic has a mask operand, handle that. if (CI->arg_size() == 4) - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (!IsX86 && Name == "stackprotectorcheck") { Rep = nullptr; @@ -3203,7 +3196,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); if (CI->arg_size() == 4) - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && (Name.starts_with("avx.vperm2f128.") || Name == "avx2.vperm2i128")) { @@ -3263,7 +3256,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); if (CI->arg_size() == 4) - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && (Name == "sse2.pshufl.w" || Name.starts_with("avx512.mask.pshufl.w."))) { @@ -3282,7 +3275,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); if (CI->arg_size() == 4) - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && (Name == "sse2.pshufh.w" || Name.starts_with("avx512.mask.pshufh.w."))) { @@ -3301,7 +3294,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); if (CI->arg_size() == 4) - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && Name.starts_with("avx512.mask.shuf.p")) { Value *Op0 = CI->getArgOperand(0); @@ -3326,7 +3319,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs); - Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, CI->getArgOperand(3)); } else if (IsX86 && (Name.starts_with("avx512.mask.movddup") || Name.starts_with("avx512.mask.movshdup") || @@ -3348,7 +3341,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateShuffleVector(Op0, Op0, Idxs); - Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (IsX86 && (Name.starts_with("avx512.mask.punpckl") || Name.starts_with("avx512.mask.unpckl."))) { @@ -3364,7 +3357,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = 
Builder.CreateShuffleVector(Op0, Op1, Idxs); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && (Name.starts_with("avx512.mask.punpckh") || Name.starts_with("avx512.mask.unpckh."))) { @@ -3380,7 +3373,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateShuffleVector(Op0, Op1, Idxs); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && (Name.starts_with("avx512.mask.and.") || Name.starts_with("avx512.mask.pand."))) { @@ -3389,7 +3382,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateAnd(Builder.CreateBitCast(CI->getArgOperand(0), ITy), Builder.CreateBitCast(CI->getArgOperand(1), ITy)); Rep = Builder.CreateBitCast(Rep, FTy); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && (Name.starts_with("avx512.mask.andn.") || Name.starts_with("avx512.mask.pandn."))) { @@ -3399,7 +3392,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateAnd(Rep, Builder.CreateBitCast(CI->getArgOperand(1), ITy)); Rep = Builder.CreateBitCast(Rep, FTy); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && (Name.starts_with("avx512.mask.or.") || Name.starts_with("avx512.mask.por."))) { @@ -3408,7 +3401,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateOr(Builder.CreateBitCast(CI->getArgOperand(0), ITy), Builder.CreateBitCast(CI->getArgOperand(1), ITy)); Rep = Builder.CreateBitCast(Rep, FTy); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && (Name.starts_with("avx512.mask.xor.") || Name.starts_with("avx512.mask.pxor."))) { @@ -3417,19 +3410,19 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateXor(Builder.CreateBitCast(CI->getArgOperand(0), ITy), Builder.CreateBitCast(CI->getArgOperand(1), ITy)); Rep = Builder.CreateBitCast(Rep, FTy); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && Name.starts_with("avx512.mask.padd.")) { Rep = Builder.CreateAdd(CI->getArgOperand(0), CI->getArgOperand(1)); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && Name.starts_with("avx512.mask.psub.")) { Rep = Builder.CreateSub(CI->getArgOperand(0), CI->getArgOperand(1)); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && Name.starts_with("avx512.mask.pmull.")) { Rep = Builder.CreateMul(CI->getArgOperand(0), CI->getArgOperand(1)); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && Name.starts_with("avx512.mask.add.p")) { if (Name.ends_with(".512")) { @@ -3445,7 +3438,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } else { Rep = Builder.CreateFAdd(CI->getArgOperand(0), 
CI->getArgOperand(1)); } - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && Name.starts_with("avx512.mask.div.p")) { if (Name.ends_with(".512")) { @@ -3461,7 +3454,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } else { Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1)); } - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && Name.starts_with("avx512.mask.mul.p")) { if (Name.ends_with(".512")) { @@ -3477,7 +3470,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } else { Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1)); } - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && Name.starts_with("avx512.mask.sub.p")) { if (Name.ends_with(".512")) { @@ -3493,7 +3486,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } else { Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1)); } - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && (Name.starts_with("avx512.mask.max.p") || Name.starts_with("avx512.mask.min.p")) && @@ -3509,14 +3502,14 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID), { CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4) }); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (IsX86 && Name.starts_with("avx512.mask.lzcnt.")) { Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, CI->getType()), { CI->getArgOperand(0), Builder.getInt1(false) }); - Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep, + Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (IsX86 && Name.starts_with("avx512.mask.psll")) { bool IsImmediate = Name[16] == 'i' || @@ -3585,7 +3578,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { llvm_unreachable("Unexpected size"); } - Rep = UpgradeX86MaskedShift(Builder, *CI, IID); + Rep = upgradeX86MaskedShift(Builder, *CI, IID); } else if (IsX86 && Name.starts_with("avx512.mask.psrl")) { bool IsImmediate = Name[16] == 'i' || (Name.size() > 18 && Name[18] == 'i'); @@ -3653,7 +3646,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { llvm_unreachable("Unexpected size"); } - Rep = UpgradeX86MaskedShift(Builder, *CI, IID); + Rep = upgradeX86MaskedShift(Builder, *CI, IID); } else if (IsX86 && Name.starts_with("avx512.mask.psra")) { bool IsImmediate = Name[16] == 'i' || (Name.size() > 18 && Name[18] == 'i'); @@ -3719,11 +3712,11 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { llvm_unreachable("Unexpected size"); } - Rep = UpgradeX86MaskedShift(Builder, *CI, IID); + Rep = upgradeX86MaskedShift(Builder, *CI, IID); } else if (IsX86 && Name.starts_with("avx512.mask.move.s")) { Rep = upgradeMaskedMove(Builder, *CI); } else if (IsX86 && Name.starts_with("avx512.cvtmask2")) { - Rep = UpgradeMaskToInt(Builder, *CI); + Rep = upgradeMaskToInt(Builder, *CI); } else if (IsX86 && Name.ends_with(".movntdqa")) { MDNode *Node = 
MDNode::get( C, ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1))); @@ -3839,8 +3832,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { PassThru = Builder.CreateExtractElement(CI->getArgOperand(2), (uint64_t)0); - Rep = EmitX86ScalarSelect(Builder, CI->getArgOperand(3), - Rep, PassThru); + Rep = emitX86ScalarSelect(Builder, CI->getArgOperand(3), Rep, PassThru); Rep = Builder.CreateInsertElement(CI->getArgOperand(IsMask3 ? 2 : 0), Rep, (uint64_t)0); } else if (IsX86 && (Name.starts_with("avx512.mask.vfmadd.p") || @@ -3891,7 +3883,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { IsMask3 ? CI->getArgOperand(2) : CI->getArgOperand(0); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); } else if (IsX86 && Name.starts_with("fma.vfmsubadd.p")) { unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits(); unsigned EltWidth = CI->getType()->getScalarSizeInBits(); @@ -3962,7 +3954,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { IsMask3 ? CI->getArgOperand(2) : CI->getArgOperand(0); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); } else if (IsX86 && (Name.starts_with("avx512.mask.pternlog.") || Name.starts_with("avx512.maskz.pternlog."))) { bool ZeroMask = Name[11] == 'z'; @@ -3990,7 +3982,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); - Rep = EmitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); + Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); } else if (IsX86 && (Name.starts_with("avx512.mask.vpmadd52") || Name.starts_with("avx512.maskz.vpmadd52"))) { bool ZeroMask = Name[11] == 'z'; @@ -4018,13 +4010,13 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); } else if (IsX86 && (Name.starts_with("avx512.mask.vpermi2var.") || Name.starts_with("avx512.mask.vpermt2var.") || Name.starts_with("avx512.maskz.vpermt2var."))) { bool ZeroMask = Name[11] == 'z'; bool IndexForm = Name[17] == 'i'; - Rep = UpgradeX86VPERMT2Intrinsics(Builder, *CI, ZeroMask, IndexForm); + Rep = upgradeX86VPERMT2Intrinsics(Builder, *CI, ZeroMask, IndexForm); } else if (IsX86 && (Name.starts_with("avx512.mask.vpdpbusd.") || Name.starts_with("avx512.maskz.vpdpbusd.") || Name.starts_with("avx512.mask.vpdpbusds.") || @@ -4054,7 +4046,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); } else if (IsX86 && (Name.starts_with("avx512.mask.vpdpwssd.") || Name.starts_with("avx512.maskz.vpdpwssd.") || Name.starts_with("avx512.mask.vpdpwssds.") || @@ -4084,7 +4076,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Args); Value *PassThru = ZeroMask ? 
ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); - Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); + Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); } else if (IsX86 && (Name == "addcarryx.u32" || Name == "addcarryx.u64" || Name == "addcarry.u32" || Name == "addcarry.u64" || Name == "subborrow.u32" || Name == "subborrow.u64")) { @@ -4152,7 +4144,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { : Builder.CreateICmpSLE(Arg0, Arg1, "min.cond"); Rep = Builder.CreateSelect(Cmp, Arg0, Arg1, "min"); } else if (IsNVVM && Name == "clz.ll") { - // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 and returns an i64. + // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64. Value *Arg = CI->getArgOperand(0); Value *Ctlz = Builder.CreateCall( Intrinsic::getDeclaration(F->getParent(), Intrinsic::ctlz, @@ -4160,7 +4152,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { {Arg, Builder.getFalse()}, "ctlz"); Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc"); } else if (IsNVVM && Name == "popc.ll") { - // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 and returns an + // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an // i64. Value *Arg = CI->getArgOperand(0); Value *Popc = Builder.CreateCall( @@ -4176,7 +4168,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { {Builder.getFloatTy()}), CI->getArgOperand(0), "h2f"); } else { - Intrinsic::ID IID = ShouldUpgradeNVPTXBF16Intrinsic(Name); + Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name); if (IID != Intrinsic::not_intrinsic && !F->getReturnType()->getScalarType()->isBFloatTy()) { rename(F); @@ -4197,9 +4189,9 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { } } } else if (IsARM) { - Rep = UpgradeARMIntrinsicCall(Name, CI, F, Builder); + Rep = upgradeARMIntrinsicCall(Name, CI, F, Builder); } else if (IsAMDGCN) { - Rep = UpgradeAMDGCNIntrinsicCall(Name, CI, F, Builder); + Rep = upgradeAMDGCNIntrinsicCall(Name, CI, F, Builder); } else { llvm_unreachable("Unknown function for CallBase upgrade."); } @@ -4618,7 +4610,7 @@ void llvm::UpgradeIntrinsicCall(CallBase *CI, Function *NewFn) { Args[3] = getX86MaskVec(Builder, Args[3], NumElts); NewCall = Builder.CreateCall(NewFn, Args); - Value *Res = ApplyX86MaskOn1BitsVec(Builder, NewCall, nullptr); + Value *Res = applyX86MaskOn1BitsVec(Builder, NewCall, nullptr); NewCall->takeName(CI); CI->replaceAllUsesWith(Res); @@ -4827,7 +4819,7 @@ bool llvm::UpgradeDebugInfo(Module &M) { /// This checks for objc retain release marker which should be upgraded. It /// returns true if module is modified. -static bool UpgradeRetainReleaseMarker(Module &M) { +static bool upgradeRetainReleaseMarker(Module &M) { bool Changed = false; const char *MarkerKey = "clang.arc.retainAutoreleasedReturnValueMarker"; NamedMDNode *ModRetainReleaseMarker = M.getNamedMetadata(MarkerKey); @@ -4926,7 +4918,7 @@ void llvm::UpgradeARCRuntime(Module &M) { // Upgrade the retain release marker. If there is no need to upgrade // the marker, that means either the module is already new enough to contain // new intrinsics or it is not ARC. There is no need to upgrade runtime call. 
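// (editor's note, not part of the patch: upgradeRetainReleaseMarker returns
// true only when it actually rewrote the marker metadata, so the early
// return below is the common no-op path for non-ARC or already-current
// modules.)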
- if (!UpgradeRetainReleaseMarker(M)) + if (!upgradeRetainReleaseMarker(M)) return; std::pair RuntimeFuncs[] = { From ce8fcad5f4dfaecdc0c74a99508b86e8e5d1d9e8 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Sat, 20 Jan 2024 14:44:51 -0500 Subject: [PATCH 249/843] [libc] Fix float.h header to include the system float.h first and add more definitions. (#78857) --- libc/include/llvm-libc-macros/float-macros.h | 145 ++++++++++++++++++- 1 file changed, 142 insertions(+), 3 deletions(-) diff --git a/libc/include/llvm-libc-macros/float-macros.h b/libc/include/llvm-libc-macros/float-macros.h index 1db27f117a46f..8e4b91b9a0f55 100644 --- a/libc/include/llvm-libc-macros/float-macros.h +++ b/libc/include/llvm-libc-macros/float-macros.h @@ -9,13 +9,152 @@ #ifndef __LLVM_LIBC_MACROS_FLOAT_MACROS_H #define __LLVM_LIBC_MACROS_FLOAT_MACROS_H -#undef FLT_MANT_DIG +#include_next + +#ifndef FLT_RADIX +#define FLT_RADIX __FLT_RADIX__ +#endif // FLT_RADIX + +#ifndef FLT_EVAL_METHOD +#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__ +#endif // FLT_EVAL_METHOD + +#ifndef DECIMAL_DIG +#define DECIMAL_DIG __DECIMAL_DIG__ +#endif // DECIMAL_DIG + +#ifndef FLT_DECIMAL_DIG +#define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__ +#endif // FLT_DECIMAL_DIG + +#ifndef DBL_DECIMAL_DIG +#define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__ +#endif // DBL_DECIMAL_DIG + +#ifndef LDBL_DECIMAL_DIG +#define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__ +#endif // LDBL_DECIMAL_DIG + +#ifndef FLT_DIG +#define FLT_DIG __FLT_DIG__ +#endif // FLT_DIG + +#ifndef DBL_DIG +#define DBL_DIG __DBL_DIG__ +#endif // DBL_DIG + +#ifndef LDBL_DIG +#define LDBL_DIG __LDBL_DIG__ +#endif // LDBL_DIG + +#ifndef FLT_MANT_DIG #define FLT_MANT_DIG __FLT_MANT_DIG__ +#endif // FLT_MANT_DIG -#undef DBL_MANT_DIG +#ifndef DBL_MANT_DIG #define DBL_MANT_DIG __DBL_MANT_DIG__ +#endif // DBL_MANT_DIG -#undef LDBL_MANT_DIG +#ifndef LDBL_MANT_DIG #define LDBL_MANT_DIG __LDBL_MANT_DIG__ +#endif // LDBL_MANT_DIG + +#ifndef FLT_MIN +#define FLT_MIN __FLT_MIN__ +#endif // FLT_MIN + +#ifndef DBL_MIN +#define DBL_MIN __DBL_MIN__ +#endif // DBL_MIN + +#ifndef LDBL_MIN +#define LDBL_MIN __LDBL_MIN__ +#endif // LDBL_MIN + +#ifndef FLT_MAX +#define FLT_MAX __FLT_MAX__ +#endif // FLT_MAX + +#ifndef DBL_MAX +#define DBL_MAX __DBL_MAX__ +#endif // DBL_MAX + +#ifndef LDBL_MAX +#define LDBL_MAX __LDBL_MAX__ +#endif // LDBL_MAX + +#ifndef FLT_TRUE_MIN +#define FLT_TRUE_MIN __FLT_TRUE_MIN__ +#endif // FLT_TRUE_MIN + +#ifndef DBL_TRUE_MIN +#define DBL_TRUE_MIN __DBL_TRUE_MIN__ +#endif // DBL_TRUE_MIN + +#ifndef LDBL_TRUE_MIN +#define LDBL_TRUE_MIN __LDBL_TRUE_MIN__ +#endif // LDBL_TRUE_MIN + +#ifndef FLT_EPSILON +#define FLT_EPSILON __FLT_EPSILON__ +#endif // FLT_EPSILON + +#ifndef DBL_EPSILON +#define DBL_EPSILON __DBL_EPSILON__ +#endif // DBL_EPSILON + +#ifndef LDBL_EPSILON +#define LDBL_EPSILON __LDBL_EPSILON__ +#endif // LDBL_EPSILON + +#ifndef FLT_MIN_EXP +#define FLT_MIN_EXP __FLT_MIN_EXP__ +#endif // FLT_MIN_EXP + +#ifndef DBL_MIN_EXP +#define DBL_MIN_EXP __DBL_MIN_EXP__ +#endif // DBL_MIN_EXP + +#ifndef LDBL_MIN_EXP +#define LDBL_MIN_EXP __LDBL_MIN_EXP__ +#endif // LDBL_MIN_EXP + +#ifndef FLT_MIN_10_EXP +#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__ +#endif // FLT_MIN_10_EXP + +#ifndef DBL_MIN_10_EXP +#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__ +#endif // DBL_MIN_10_EXP + +#ifndef LDBL_MIN_10_EXP +#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__ +#endif // LDBL_MIN_10_EXP + +#ifndef FLT_MAX_EXP +#define FLT_MAX_EXP __FLT_MAX_EXP__ +#endif // FLT_MAX_EXP + +#ifndef DBL_MAX_EXP 
+#define DBL_MAX_EXP __DBL_MAX_EXP__ +#endif // DBL_MAX_EXP + +#ifndef LDBL_MAX_EXP +#define LDBL_MAX_EXP __LDBL_MAX_EXP__ +#endif // LDBL_MAX_EXP + +#ifndef FLT_MAX_10_EXP +#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__ +#endif // FLT_MAX_10_EXP + +#ifndef DBL_MAX_10_EXP +#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__ +#endif // DBL_MAX_10_EXP + +#ifndef LDBL_MAX_10_EXP +#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__ +#endif // LDBL_MAX_10_EXP + +// TODO: Add FLT16 and FLT128 constants. #endif // __LLVM_LIBC_MACROS_FLOAT_MACROS_H From 448b8e6162607f42948a11dd50793517198e4445 Mon Sep 17 00:00:00 2001 From: LEE KYOUNGHEON Date: Sun, 21 Jan 2024 05:06:02 +0900 Subject: [PATCH 250/843] Module documentation improvement: prebuilt module location can be directly fetched via CMake variable. (#78405) CMake officially supports binary directory variable of installed dependency using `FetchContent`. According to the current documentation, it fetches `std` module and use its binary directory as hardcoded string, `${CMAKE_BINARY_DIR}/_deps/std-build`, however it can be replaced with `${std_BINARY_DIR}`. Reference: https://cmake.org/cmake/help/latest/module/FetchContent.html --- libcxx/docs/Modules.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/docs/Modules.rst b/libcxx/docs/Modules.rst index 5099e6095582c..146a91522436d 100644 --- a/libcxx/docs/Modules.rst +++ b/libcxx/docs/Modules.rst @@ -183,8 +183,8 @@ This is a small sample program that uses the module ``std``. It consists of a # Adjust project compiler flags # - add_compile_options($<$:-fprebuilt-module-path=${CMAKE_BINARY_DIR}/_deps/std-build/CMakeFiles/std.dir/>) - add_compile_options($<$:-fprebuilt-module-path=${CMAKE_BINARY_DIR}/_deps/std-build/CMakeFiles/std.compat.dir/>) + add_compile_options($<$:-fprebuilt-module-path=${std_BINARY_DIR}/CMakeFiles/std.dir/>) + add_compile_options($<$:-fprebuilt-module-path=${std_BINARY_DIR}/CMakeFiles/std.compat.dir/>) add_compile_options($<$:-nostdinc++>) # The include path needs to be set to be able to use macros from headers. # For example from, the headers and . From 0175a1e4d33720ed7e827b3db5a36f88bdd790a3 Mon Sep 17 00:00:00 2001 From: Daniil Dudkin Date: Sat, 20 Jan 2024 23:19:17 +0300 Subject: [PATCH 251/843] new-prs-labeler: Add `clang-tools-extra` labeling (#78633) There is no automatic labeling for the Extra Clang Tools, except Clang-Tidy and ClangD. --- .github/new-prs-labeler.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index e4ae2b7c839f9..9b975d051b99e 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -308,6 +308,9 @@ clang-tidy: - clang-tools-extra/docs/clang-tidy/** - clang-tools-extra/test/clang-tidy/** +clang-tools-extra: + - clang-tools-extra/** + tools:llvm-mca: - llvm/tools/llvm-mca/** - llvm/include/llvm/MCA/** From 1cc7cd46a9158e2c254ed72274c69f36f62cf7a3 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Sat, 20 Jan 2024 16:14:17 -0500 Subject: [PATCH 252/843] [libc] Fix size_t used without including stddef.h in CPP/limit.h. (#78861) --- libc/src/__support/CPP/limits.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libc/src/__support/CPP/limits.h b/libc/src/__support/CPP/limits.h index e92ad00f80e03..d9e7090a0b7b1 100644 --- a/libc/src/__support/CPP/limits.h +++ b/libc/src/__support/CPP/limits.h @@ -20,7 +20,7 @@ namespace cpp { // Some older gcc distributions don't define these for 32 bit targets. 
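// (editor's note, not part of the patch: the change below replaces size_t
// with unsigned int because this header can be used before <stddef.h> is
// included, in which case size_t is not yet declared.)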
#ifndef LLONG_MAX -constexpr size_t LLONG_BIT_WIDTH = sizeof(long long) * 8; +constexpr unsigned int LLONG_BIT_WIDTH = sizeof(long long) * 8; constexpr long long LLONG_MAX = ~0LL ^ (1LL << (LLONG_BIT_WIDTH - 1)); constexpr long long LLONG_MIN = 1LL << (LLONG_BIT_WIDTH - 1); constexpr unsigned long long ULLONG_MAX = ~0ULL; From 85a8e5c3e0586e85a2fa3ff9cef12455bd039921 Mon Sep 17 00:00:00 2001 From: Hui Date: Sat, 20 Jan 2024 21:18:44 +0000 Subject: [PATCH 253/843] [libc++] fix condition_variable_any hangs on stop_request (#77127) When I implemented `condition_variable_any::wait`, I missed the most important paragraph in the spec: > The following wait functions will be notified when there is a stop request on the passed stop_token. > In that case the functions return immediately, returning false if the predicate evaluates to false. From https://eel.is/c++draft/thread.condition#thread.condvarany.intwait-1. Fixes #76807 --- libcxx/include/condition_variable | 93 +++++++++++++++---- .../thread.condition.condvarany/helpers.h | 45 +++++++++ .../wait_for_token_pred.pass.cpp | 66 +++++++++++++ .../wait_token_pred.pass.cpp | 50 ++++++++++ .../wait_until_token_pred.pass.cpp | 87 ++++++++++++++--- 5 files changed, 311 insertions(+), 30 deletions(-) create mode 100644 libcxx/test/std/thread/thread.condition/thread.condition.condvarany/helpers.h diff --git a/libcxx/include/condition_variable b/libcxx/include/condition_variable index cf7a570b6cb63..e375c986e7f12 100644 --- a/libcxx/include/condition_variable +++ b/libcxx/include/condition_variable @@ -126,11 +126,11 @@ public: #include <__condition_variable/condition_variable.h> #include <__config> #include <__memory/shared_ptr.h> -#include <__memory/unique_ptr.h> #include <__mutex/lock_guard.h> #include <__mutex/mutex.h> #include <__mutex/tag_types.h> #include <__mutex/unique_lock.h> +#include <__stop_token/stop_callback.h> #include <__stop_token/stop_token.h> #include <__utility/move.h> #include @@ -200,19 +200,26 @@ inline void condition_variable_any::notify_all() _NOEXCEPT { __cv_.notify_all(); } -struct __lock_external { - template - _LIBCPP_HIDE_FROM_ABI void operator()(_Lock* __m) { - __m->lock(); +template +struct __unlock_guard { + _Lock& __lock_; + + _LIBCPP_HIDE_FROM_ABI __unlock_guard(_Lock& __lock) : __lock_(__lock) { __lock_.unlock(); } + + _LIBCPP_HIDE_FROM_ABI ~__unlock_guard() _NOEXCEPT // turns exception to std::terminate + { + __lock_.lock(); } + + __unlock_guard(const __unlock_guard&) = delete; + __unlock_guard& operator=(const __unlock_guard&) = delete; }; template void condition_variable_any::wait(_Lock& __lock) { shared_ptr __mut = __mut_; unique_lock __lk(*__mut); - __lock.unlock(); - unique_ptr<_Lock, __lock_external> __lxx(&__lock); + __unlock_guard<_Lock> __unlock(__lock); lock_guard > __lx(__lk, adopt_lock_t()); __cv_.wait(__lk); } // __mut_.unlock(), __lock.lock() @@ -227,8 +234,7 @@ template cv_status condition_variable_any::wait_until(_Lock& __lock, const chrono::time_point<_Clock, _Duration>& __t) { shared_ptr __mut = __mut_; unique_lock __lk(*__mut); - __lock.unlock(); - unique_ptr<_Lock, __lock_external> __lxx(&__lock); + __unlock_guard<_Lock> __unlock(__lock); lock_guard > __lx(__lk, adopt_lock_t()); return __cv_.wait_until(__lk, __t); } // __mut_.unlock(), __lock.lock() @@ -256,24 +262,75 @@ condition_variable_any::wait_for(_Lock& __lock, const chrono::duration<_Rep, _Pe # if _LIBCPP_STD_VER >= 20 && !defined(_LIBCPP_HAS_NO_EXPERIMENTAL_STOP_TOKEN) template -bool condition_variable_any::wait(_Lock& __lock, stop_token 
__stoken, _Predicate __pred) { - while (!__stoken.stop_requested()) { +bool condition_variable_any::wait(_Lock& __user_lock, stop_token __stoken, _Predicate __pred) { + if (__stoken.stop_requested()) + return __pred(); + + // Per https://eel.is/c++draft/thread.condition.condvarany#general-note-2, + // we do need to take a copy of the shared pointer __mut_ + // This ensures that a thread can call the destructor immediately after calling + // notify_all, without waiting all the wait calls. + // A thread can also safely call the destructor immediately after calling + // request_stop, as the call to request_stop would evaluate the callback, + // which accesses the internal condition variable, immediately on the same thread. + // In this situation, it is OK even without copying a shared ownership the internal + // condition variable. However, this needs the evaluation of stop_callback to + // happen-before the destruction. + // The spec only says "Only the notification to unblock the wait needs to happen + // before destruction". To make this work, we need to copy the shared ownership of + // the internal condition variable inside this function, which is not possible + // with the current ABI. + shared_ptr __mut = __mut_; + + stop_callback __cb(__stoken, [this] { notify_all(); }); + + while (true) { if (__pred()) return true; - wait(__lock); - } + + // We need to take the internal lock before checking stop_requested, + // so that the notification cannot come in between the stop_requested + // check and entering the wait. + // Note that the stop_callback takes the same internal lock before notifying + unique_lock __internal_lock(*__mut); + if (__stoken.stop_requested()) + break; + + __unlock_guard<_Lock> __unlock(__user_lock); + unique_lock __internal_lock2( + std::move(__internal_lock)); // switch unlock order between __internal_lock and __user_lock + __cv_.wait(__internal_lock2); + } // __internal_lock2.unlock(), __user_lock.lock() return __pred(); } template bool condition_variable_any::wait_until( - _Lock& __lock, stop_token __stoken, const chrono::time_point<_Clock, _Duration>& __abs_time, _Predicate __pred) { - while (!__stoken.stop_requested()) { + _Lock& __user_lock, + stop_token __stoken, + const chrono::time_point<_Clock, _Duration>& __abs_time, + _Predicate __pred) { + if (__stoken.stop_requested()) + return __pred(); + + shared_ptr __mut = __mut_; + stop_callback __cb(__stoken, [this] { notify_all(); }); + + while (true) { if (__pred()) return true; - if (wait_until(__lock, __abs_time) == cv_status::timeout) - return __pred(); - } + + unique_lock __internal_lock(*__mut); + if (__stoken.stop_requested()) + break; + + __unlock_guard<_Lock> __unlock(__user_lock); + unique_lock __internal_lock2( + std::move(__internal_lock)); // switch unlock order between __internal_lock and __user_lock + + if (__cv_.wait_until(__internal_lock2, __abs_time) == cv_status::timeout) + break; + } // __internal_lock2.unlock(), __user_lock.lock() return __pred(); } diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/helpers.h b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/helpers.h new file mode 100644 index 0000000000000..83687a1c648c9 --- /dev/null +++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/helpers.h @@ -0,0 +1,45 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef TEST_STD_THREAD_CONDITION_CONDVARANY_HELPERS_H +#define TEST_STD_THREAD_CONDITION_CONDVARANY_HELPERS_H + +#include +#include + +#include "test_macros.h" + +#if TEST_STD_VER >= 17 + +// wait_for and wait_until function can exit via +// - predicate is true +// - timeout +// - stop_requested +// The return value only tells if the predicate is true +// when the function exits, but it does not tell whether +// it is timeout or stop_requested. +// +// ElapsedTimeCheck would fail the test if a test takes +// longer than a duration. This is useful because we can +// ensure that the wait_{for, until} function does not +// wait until the timeout +struct ElapsedTimeCheck { + ElapsedTimeCheck(std::chrono::steady_clock::duration timeoutDuration) + : timeout_(std::chrono::steady_clock::now() + timeoutDuration) {} + + ElapsedTimeCheck(ElapsedTimeCheck&&) = delete; + ElapsedTimeCheck& operator=(ElapsedTimeCheck&&) = delete; + + ~ElapsedTimeCheck() { assert(std::chrono::steady_clock::now() < timeout_); } + + std::chrono::time_point timeout_; +}; + +#endif + +#endif // TEST_STD_THREAD_CONDITION_CONDVARANY_HELPERS_H diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_token_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_token_pred.pass.cpp index fb3f0287726ee..4ea60557d9f88 100644 --- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_token_pred.pass.cpp +++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_for_token_pred.pass.cpp @@ -29,6 +29,7 @@ #include #include +#include "helpers.h" #include "make_test_thread.h" #include "test_macros.h" @@ -44,6 +45,8 @@ void test() { Lock lock{mutex}; ss.request_stop(); + ElapsedTimeCheck check(1min); + // [Note 4: The returned value indicates whether the predicate evaluated to true // regardless of whether the timeout was triggered or a stop request was made.] 
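// (editor's note, not part of the patch: ss.request_stop() has already run
// in this section, so each wait below returns immediately and its boolean
// result simply reports the predicate; the timeout value is never what
// unblocks the call.)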
std::same_as auto r1 = cv.wait_for(lock, ss.get_token(), -1h, []() { return false; }); @@ -69,6 +72,8 @@ void test() { Mutex mutex; Lock lock{mutex}; + ElapsedTimeCheck check(1min); + std::same_as auto r1 = cv.wait_for(lock, ss.get_token(), -1h, []() { return true; }); assert(r1); @@ -83,6 +88,8 @@ void test() { Mutex mutex; Lock lock{mutex}; + ElapsedTimeCheck check(1min); + std::same_as auto r1 = cv.wait_for(lock, ss.get_token(), -1h, []() { return false; }); assert(!r1); } @@ -117,6 +124,8 @@ void test() { cv.notify_all(); }); + ElapsedTimeCheck check(10min); + std::same_as auto r1 = cv.wait_for(lock, ss.get_token(), 1h, [&]() { return flag; }); assert(flag); assert(r1); @@ -143,6 +152,8 @@ void test() { } }); + ElapsedTimeCheck check(10min); + std::same_as auto r = cv.wait_for(lock, ss.get_token(), 1h, [&]() { start.store(true); start.notify_all(); @@ -155,6 +166,60 @@ void test() { assert(lock.owns_lock()); } + // #76807 Hangs in std::condition_variable_any when used with std::stop_token + { + class MyThread { + public: + MyThread() { + thread_ = support::make_test_jthread([this](std::stop_token st) { + while (!st.stop_requested()) { + std::unique_lock lock{m_}; + cv_.wait_for(lock, st, 1h, [] { return false; }); + } + }); + } + + private: + std::mutex m_; + std::condition_variable_any cv_; + std::jthread thread_; + }; + + ElapsedTimeCheck check(10min); + + [[maybe_unused]] MyThread my_thread; + } + + // request_stop potentially in-between check and wait + { + std::stop_source ss; + std::condition_variable_any cv; + Mutex mutex; + Lock lock{mutex}; + + std::atomic_bool pred_started = false; + std::atomic_bool request_stop_called = false; + auto thread = support::make_test_thread([&]() { + pred_started.wait(false); + ss.request_stop(); + request_stop_called.store(true); + request_stop_called.notify_all(); + }); + + ElapsedTimeCheck check(10min); + + std::same_as auto r = cv.wait_for(lock, ss.get_token(), 1h, [&]() { + pred_started.store(true); + pred_started.notify_all(); + request_stop_called.wait(false); + return false; + }); + assert(!r); + thread.join(); + + assert(lock.owns_lock()); + } + #if !defined(TEST_HAS_NO_EXCEPTIONS) // Throws: Any exception thrown by pred. 
{ @@ -164,6 +229,7 @@ void test() { Lock lock{mutex}; try { + ElapsedTimeCheck check(10min); cv.wait_for(lock, ss.get_token(), 1h, []() -> bool { throw 5; }); assert(false); } catch (int i) { diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_token_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_token_pred.pass.cpp index 451df9ab7ee28..e96a3e8bd1bc0 100644 --- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_token_pred.pass.cpp +++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_token_pred.pass.cpp @@ -107,6 +107,56 @@ void test() { assert(lock.owns_lock()); } + // #76807 Hangs in std::condition_variable_any when used with std::stop_token + { + class MyThread { + public: + MyThread() { + thread_ = support::make_test_jthread([this](std::stop_token st) { + while (!st.stop_requested()) { + std::unique_lock lock{m_}; + cv_.wait(lock, st, [] { return false; }); + } + }); + } + + private: + std::mutex m_; + std::condition_variable_any cv_; + std::jthread thread_; + }; + + [[maybe_unused]] MyThread my_thread; + } + + // request_stop potentially in-between check and wait + { + std::stop_source ss; + std::condition_variable_any cv; + Mutex mutex; + Lock lock{mutex}; + + std::atomic_bool pred_started = false; + std::atomic_bool request_stop_called = false; + auto thread = support::make_test_thread([&]() { + pred_started.wait(false); + ss.request_stop(); + request_stop_called.store(true); + request_stop_called.notify_all(); + }); + + std::same_as auto r = cv.wait(lock, ss.get_token(), [&]() { + pred_started.store(true); + pred_started.notify_all(); + request_stop_called.wait(false); + return false; + }); + assert(!r); + thread.join(); + + assert(lock.owns_lock()); + } + #if !defined(TEST_HAS_NO_EXCEPTIONS) // Throws: Any exception thrown by pred. { diff --git a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_token_pred.pass.cpp b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_token_pred.pass.cpp index 6cdcbe36d9859..d649db025d755 100644 --- a/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_token_pred.pass.cpp +++ b/libcxx/test/std/thread/thread.condition/thread.condition.condvarany/wait_until_token_pred.pass.cpp @@ -29,13 +29,15 @@ #include #include +#include "helpers.h" #include "make_test_thread.h" #include "test_macros.h" template void test() { - const auto past = std::chrono::steady_clock::now() - std::chrono::hours(1); - const auto future = std::chrono::steady_clock::now() + std::chrono::hours(1); + using namespace std::chrono_literals; + const auto oneHourAgo = std::chrono::steady_clock::now() - 1h; + const auto oneHourLater = std::chrono::steady_clock::now() + 1h; // stop_requested before hand { @@ -44,19 +46,20 @@ void test() { Mutex mutex; Lock lock{mutex}; ss.request_stop(); + ElapsedTimeCheck check(1min); // [Note 4: The returned value indicates whether the predicate evaluated to true // regardless of whether the timeout was triggered or a stop request was made.] 
- std::same_as auto r1 = cv.wait_until(lock, ss.get_token(), past, []() { return false; }); + std::same_as auto r1 = cv.wait_until(lock, ss.get_token(), oneHourAgo, []() { return false; }); assert(!r1); - std::same_as auto r2 = cv.wait_until(lock, ss.get_token(), future, []() { return false; }); + std::same_as auto r2 = cv.wait_until(lock, ss.get_token(), oneHourLater, []() { return false; }); assert(!r2); - std::same_as auto r3 = cv.wait_until(lock, ss.get_token(), past, []() { return true; }); + std::same_as auto r3 = cv.wait_until(lock, ss.get_token(), oneHourAgo, []() { return true; }); assert(r3); - std::same_as auto r4 = cv.wait_until(lock, ss.get_token(), future, []() { return true; }); + std::same_as auto r4 = cv.wait_until(lock, ss.get_token(), oneHourLater, []() { return true; }); assert(r4); // Postconditions: lock is locked by the calling thread. @@ -69,11 +72,12 @@ void test() { std::condition_variable_any cv; Mutex mutex; Lock lock{mutex}; + ElapsedTimeCheck check(1min); - std::same_as auto r1 = cv.wait_until(lock, ss.get_token(), past, []() { return true; }); + std::same_as auto r1 = cv.wait_until(lock, ss.get_token(), oneHourAgo, []() { return true; }); assert(r1); - std::same_as auto r2 = cv.wait_until(lock, ss.get_token(), future, []() { return true; }); + std::same_as auto r2 = cv.wait_until(lock, ss.get_token(), oneHourLater, []() { return true; }); assert(r2); } @@ -83,8 +87,9 @@ void test() { std::condition_variable_any cv; Mutex mutex; Lock lock{mutex}; + ElapsedTimeCheck check(1min); - std::same_as auto r1 = cv.wait_until(lock, ss.get_token(), past, []() { return false; }); + std::same_as auto r1 = cv.wait_until(lock, ss.get_token(), oneHourAgo, []() { return false; }); assert(!r1); } @@ -119,7 +124,9 @@ void test() { cv.notify_all(); }); - std::same_as auto r1 = cv.wait_until(lock, ss.get_token(), future, [&]() { return flag; }); + ElapsedTimeCheck check(10min); + + std::same_as auto r1 = cv.wait_until(lock, ss.get_token(), oneHourLater, [&]() { return flag; }); assert(flag); assert(r1); @@ -145,7 +152,9 @@ void test() { } }); - std::same_as auto r = cv.wait_until(lock, ss.get_token(), future, [&]() { + ElapsedTimeCheck check(10min); + + std::same_as auto r = cv.wait_until(lock, ss.get_token(), oneHourLater, [&]() { start.store(true); start.notify_all(); return false; @@ -157,6 +166,60 @@ void test() { assert(lock.owns_lock()); } + // #76807 Hangs in std::condition_variable_any when used with std::stop_token + { + class MyThread { + public: + MyThread() { + thread_ = support::make_test_jthread([this](std::stop_token st) { + while (!st.stop_requested()) { + std::unique_lock lock{m_}; + cv_.wait_until(lock, st, std::chrono::steady_clock::now() + std::chrono::hours(1), [] { return false; }); + } + }); + } + + private: + std::mutex m_; + std::condition_variable_any cv_; + std::jthread thread_; + }; + + ElapsedTimeCheck check(10min); + + [[maybe_unused]] MyThread my_thread; + } + + // request_stop potentially in-between check and wait + { + std::stop_source ss; + std::condition_variable_any cv; + Mutex mutex; + Lock lock{mutex}; + + std::atomic_bool pred_started = false; + std::atomic_bool request_stop_called = false; + auto thread = support::make_test_thread([&]() { + pred_started.wait(false); + ss.request_stop(); + request_stop_called.store(true); + request_stop_called.notify_all(); + }); + + ElapsedTimeCheck check(10min); + + std::same_as auto r = cv.wait_until(lock, ss.get_token(), oneHourLater, [&]() { + pred_started.store(true); + pred_started.notify_all(); + 
request_stop_called.wait(false);
+      return false;
+    });
+    assert(!r);
+    thread.join();
+
+    assert(lock.owns_lock());
+  }
+
 #if !defined(TEST_HAS_NO_EXCEPTIONS)
   // Throws: Any exception thrown by pred.
   {
@@ -166,7 +229,7 @@ void test() {
     Lock lock{mutex};

     try {
-      cv.wait_until(lock, ss.get_token(), future, []() -> bool { throw 5; });
+      cv.wait_until(lock, ss.get_token(), oneHourLater, []() -> bool { throw 5; });
       assert(false);
     } catch (int i) {
       assert(i == 5);

From 2fc2ee136c0183f40af4c0e7a8d27092b8ce3415 Mon Sep 17 00:00:00 2001
From: XDeme <66138117+XDeme@users.noreply.github.com>
Date: Sat, 20 Jan 2024 18:34:37 -0300
Subject: [PATCH 254/843] [clang-format] Fix poor spacing in `AlignArrayOfStructures: Left` (#77868)

Fixes llvm/llvm-project#62904
`AlignArrayOfStructures: Left` combined with `SpacesInParentheses: true`
causes the first cell of every row to have one additional space. We were
only setting the first cell of the first row to be against the left brace;
now every row is placed against the left brace.
---
 clang/lib/Format/WhitespaceManager.cpp | 11 ++++++-----
 clang/unittests/Format/FormatTest.cpp  |  8 ++++++++
 2 files changed, 14 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Format/WhitespaceManager.cpp b/clang/lib/Format/WhitespaceManager.cpp
index 8415c8d360d64..df84f97a8e8ac 100644
--- a/clang/lib/Format/WhitespaceManager.cpp
+++ b/clang/lib/Format/WhitespaceManager.cpp
@@ -1366,11 +1366,12 @@ void WhitespaceManager::alignArrayInitializersLeftJustified(
   auto &Cells = CellDescs.Cells;
   // Now go through and fixup the spaces.
   auto *CellIter = Cells.begin();
-  // The first cell needs to be against the left brace.
-  if (Changes[CellIter->Index].NewlinesBefore == 0)
-    Changes[CellIter->Index].Spaces = BracePadding;
-  else
-    Changes[CellIter->Index].Spaces = CellDescs.InitialSpaces;
+  // The first cell of every row needs to be against the left brace.
+  for (const auto *Next = CellIter; Next; Next = Next->NextColumnElement) {
+    auto &Change = Changes[Next->Index];
+    Change.Spaces =
+        Change.NewlinesBefore == 0 ? BracePadding : CellDescs.InitialSpaces;
+  }
   ++CellIter;
   for (auto i = 1U; i < CellDescs.CellCounts[0]; i++, ++CellIter) {
     auto MaxNetWidth = getMaximumNetWidth(
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index 44896c10b0b25..cab495125415a 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -21331,6 +21331,14 @@ TEST_F(FormatTest, CatchAlignArrayOfStructuresLeftAlignment) {
                "00000000000000000000000000000000000000000000000000000000\" },\n"
                "};",
                Style);
+
+  Style.SpacesInParens = FormatStyle::SIPO_Custom;
+  Style.SpacesInParensOptions.Other = true;
+  verifyFormat("Foo foo[] = {\n"
+               "  {1, 1},\n"
+               "  {1, 1},\n"
+               "};",
+               Style);
 }

 TEST_F(FormatTest, UnderstandsPragmas) {

From aaa7de1fc9255f489e760c3449efde9c07ddb1cb Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Sat, 20 Jan 2024 13:37:38 -0800
Subject: [PATCH 255/843] [libc++][hardening] XFAIL test in fast mode under HWASAN (#78862)

After #77883, `fast` mode uses TRAP, and HWASAN replaces TRAP with an abort
or an error exit code. On a quick look it should be possible to avoid doing
that in HWASAN, but historically this is the convention for all sanitizers.
Changing this behavior may break existing users.

Other sanitizers are not affected because they don't install TRAP handlers
by default. But if they do, they also replace TRAP with abort/exit.
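To make the TRAP interaction concrete, here is a minimal model (an editorial
sketch, not code from this patch; libc++'s real fast-mode check is more
involved):

```cpp
// A fast-mode hardening failure compiles down to a bare trap instruction:
// no message, no call into the verbose-abort machinery.
int main() {
  __builtin_trap(); // plain build: the process dies with SIGTRAP/SIGILL;
                    // under HWASAN, the sanitizer's signal handler catches
                    // the trap and exits through abort/an error code, which
                    // is why these death tests must be XFAILed there
}
```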
---
 .../modes/enabling_assertions_enables_extensive_mode.pass.cpp | 2 ++
 .../assertions/modes/override_with_extensive_mode.pass.cpp    | 2 ++
 .../libcxx/assertions/modes/override_with_fast_mode.pass.cpp  | 2 ++
 3 files changed, 6 insertions(+)

diff --git a/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp b/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp
index a91ba04176374..23ef20d4e0781 100644
--- a/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp
+++ b/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp
@@ -15,6 +15,8 @@
 // The ability to set a custom abort message is required to compare the assertion message (which only happens in the
 // debug mode).
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// HWASAN replaces TRAP with abort or error exit code.
+// XFAIL: libcpp-hardening-mode=fast && hwasan
 // Note that GCC doesn't support `-Wno-macro-redefined`.
 // ADDITIONAL_COMPILE_FLAGS: -U_LIBCPP_HARDENING_MODE -D_LIBCPP_ENABLE_ASSERTIONS=1

diff --git a/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp b/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp
index f78d5d70e5890..c3cdfa926c6cc 100644
--- a/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp
+++ b/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp
@@ -13,6 +13,8 @@
 // The ability to set a custom abort message is required to compare the assertion message (which only happens in the
 // debug mode).
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// HWASAN replaces TRAP with abort or error exit code.
+// XFAIL: libcpp-hardening-mode=fast && hwasan
 // ADDITIONAL_COMPILE_FLAGS: -U_LIBCPP_HARDENING_MODE -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_EXTENSIVE

 #include
diff --git a/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp b/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp
index 27542ee32bef9..854bf6c5da9cd 100644
--- a/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp
+++ b/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp
@@ -13,6 +13,8 @@
 // The ability to set a custom abort message is required to compare the assertion message (which only happens in the
 // debug mode).
 // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+// HWASAN replaces TRAP with abort or error exit code.
+// XFAIL: libcpp-hardening-mode=fast && hwasan
 // ADDITIONAL_COMPILE_FLAGS: -U_LIBCPP_HARDENING_MODE -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST

 #include

From 07b5829fcad6514ab1079027e1b4d3565f963ead Mon Sep 17 00:00:00 2001
From: Hristo Hristov
Date: Sat, 20 Jan 2024 23:48:48 +0200
Subject: [PATCH 256/843] [libc++] FreeBSD CI: Adds `<csignal>` to `check_assertion.h` (#78863)

...in an attempt to fix the FreeBSD CI.
I noticed that suddenly some tests in the latest PRs fail to compile on
FreeBSD (`SIGILL` and `SIGTRAP` not defined). This tries to resolve the
issue.
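For reference, a reduced illustration of the failure mode (an editorial
sketch; `is_trap_signal` is a hypothetical helper, not code from
`check_assertion.h`):

```cpp
#include <csignal> // on FreeBSD, SIGILL and SIGTRAP are only declared via a
                   // signal header; glibc-based systems often provide them
                   // transitively, which masked the missing include

// Hypothetical helper: merely naming these signals without the include is
// what broke the FreeBSD build.
bool is_trap_signal(int sig) { return sig == SIGILL || sig == SIGTRAP; }

int main() { return is_trap_signal(SIGTRAP) ? 0 : 1; }
```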
Co-authored-by: Zingam
---
 libcxx/test/support/check_assertion.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h
index 485f8103c7ad8..1811df5f474f5 100644
--- a/libcxx/test/support/check_assertion.h
+++ b/libcxx/test/support/check_assertion.h
@@ -24,7 +24,9 @@
 #include
 #include
+#include <csignal>
 #include
+
 #include "test_macros.h"
 #include "test_allocator.h"

From 46a9135d61f729da90b88d3d34a3905c91d194d7 Mon Sep 17 00:00:00 2001
From: OldWorldOrdr
Date: Sat, 20 Jan 2024 16:53:55 -0500
Subject: [PATCH 257/843] [lld-macho] Find objects in library search path (#78628)

Find object files in the library search path, just like Apple's linker.
This makes building with some older macOS SDKs easier, since clang runs
the link with `-lcrt1.10.6.o`.
---
 lld/MachO/Driver.cpp             |  4 ++++
 lld/test/MachO/link-csu-object.s | 14 ++++++++++++++
 2 files changed, 18 insertions(+)
 create mode 100644 lld/test/MachO/link-csu-object.s

diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp
index 401459a054394..7ac3f51cec103 100644
--- a/lld/MachO/Driver.cpp
+++ b/lld/MachO/Driver.cpp
@@ -90,6 +90,10 @@ static std::optional<StringRef> findLibrary(StringRef name) {
     return entry->second;

   auto doFind = [&] {
+    // Special case for Csu support files required for Mac OS X 10.7 and older
+    // (crt1.o)
+    if (name.ends_with(".o"))
+      return findPathCombination(name, config->librarySearchPaths, {""});
     if (config->searchDylibsFirst) {
       if (std::optional<StringRef> path = findPathCombination(
               "lib" + name, config->librarySearchPaths,
diff --git a/lld/test/MachO/link-csu-object.s b/lld/test/MachO/link-csu-object.s
new file mode 100644
index 0000000000000..e6f5ff7e52e32
--- /dev/null
+++ b/lld/test/MachO/link-csu-object.s
@@ -0,0 +1,14 @@
+# REQUIRES: x86
+# RUN: mkdir -p %t
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %p/Inputs/libhello.s -o %t/hello.o
+# RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/main.o
+# RUN: %lld -L %t %t/main.o %t/hello.o -o %t/a.out
+# RUN: llvm-nm %t/a.out | FileCheck %s
+
+# CHECK: _main
+# CHECK: _print_hello
+
+.globl _main
+_main:
+    call _print_hello
+    ret

From 49212d1601a1f0e34a8867eb1ad2e394f91cade1 Mon Sep 17 00:00:00 2001
From: David Green
Date: Sat, 20 Jan 2024 22:16:05 +0000
Subject: [PATCH 258/843] [Flang] Fix for replacing loop uses in LoopVersioning pass (#77899)

The added test case has a loop that is versioned, with a use of the loop in
an if block after the loop. The current code replaces all uses of the loop
with the new versioned If, but only if the parent blocks match. As far as I
can see, it should be safe to replace all the uses and then construct the
result for the If with op.op.
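As an illustration (an editorial toy model in plain C++, not MLIR; `Use` and
`ownerBlock` are invented stand-ins for MLIR's use-list), replacing uses only
when the consumer's parent matches leaves the nested use pointing at the
moved loop:

```cpp
#include <cassert>
#include <vector>

// Invented stand-in for MLIR's use-list: each use records the "block" its
// owner lives in and the value it currently reads.
struct Use {
  int ownerBlock;
  int value;
};

int main() {
  // Use 0 sits beside the loop; use 1 sits inside a later if block.
  std::vector<Use> uses = {{0, /*loop result*/ 1}, {1, /*loop result*/ 1}};
  const int versionedBlock = 0, versionedResult = 2;

  // Old behavior (conditional replacement): the use in the other block keeps
  // reading the original loop, which versioning has moved.
  for (Use &u : uses)
    if (u.ownerBlock == versionedBlock)
      u.value = versionedResult;
  assert(uses[1].value == 1); // stale reference

  // Fixed behavior (replace all uses first): every consumer reads the
  // versioned fir.if, whose results are then built from the original loop.
  for (Use &u : uses)
    u.value = versionedResult;
  assert(uses[1].value == 2);
  return 0;
}
```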
--- .../Optimizer/Transforms/LoopVersioning.cpp | 12 +-- flang/test/Transforms/loop-versioning.fir | 81 ++++++++++++++++++- 2 files changed, 81 insertions(+), 12 deletions(-) diff --git a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp index 48e42bdab5c0e..bca70a0a0d322 100644 --- a/flang/lib/Optimizer/Transforms/LoopVersioning.cpp +++ b/flang/lib/Optimizer/Transforms/LoopVersioning.cpp @@ -144,16 +144,6 @@ struct ArgsUsageInLoop { }; } // namespace -/// @c replaceOuterUses - replace uses outside of @c op with result of @c -/// outerOp -static void replaceOuterUses(mlir::Operation *op, mlir::Operation *outerOp) { - const mlir::Operation *outerParent = outerOp->getParentOp(); - op->replaceUsesWithIf(outerOp, [&](mlir::OpOperand &operand) { - mlir::Operation *owner = operand.getOwner(); - return outerParent == owner->getParentOp(); - }); -} - static fir::SequenceType getAsSequenceType(mlir::Value *v) { mlir::Type argTy = fir::unwrapPassByRefType(fir::unwrapRefType(v->getType())); return argTy.dyn_cast(); @@ -538,7 +528,7 @@ void LoopVersioningPass::runOnOperation() { // Add the original loop in the else-side of the if operation. builder.setInsertionPointToStart(&ifOp.getElseRegion().front()); - replaceOuterUses(op.op, ifOp); + op.op->replaceAllUsesWith(ifOp); op.op->remove(); builder.insert(op.op); // Rely on "cloned loop has results, so original loop also has results". diff --git a/flang/test/Transforms/loop-versioning.fir b/flang/test/Transforms/loop-versioning.fir index 6313bc2ac36a7..d1ad1510b0e89 100644 --- a/flang/test/Transforms/loop-versioning.fir +++ b/flang/test/Transforms/loop-versioning.fir @@ -263,7 +263,7 @@ func.func @sum1dfixed(%arg0: !fir.ref> {fir.bindc_name = "a"}, // CHECK-SAME: %[[N1:.*]]: !fir.ref {{.*}}, // CHECK-SAME: %[[M1:.*]]: !fir.ref {{.*}}) { // CHECK: fir.do_loop -// CHECL: %[[FOUR:.*]] = arith.constant 4 : index +// CHECK: %[[FOUR:.*]] = arith.constant 4 : index // CHECK: %[[COMP:.*]] = arith.cmpi {{.*}}, %[[FOUR]] // CHECK: fir.if %[[COMP]] -> {{.*}} { // CHECK: %[[CONV:.*]] = fir.convert %[[B]] : @@ -1478,4 +1478,83 @@ func.func @sum1drebox(%arg0: !fir.box> {fir.bindc_name = "a"}, // CHECK-NOT: fir.if +// Check for a use in a different block (%12 = do_loop is used inside the if %14 block) +func.func @minloc(%arg0: !fir.box> {fir.bindc_name = "x"}, %arg1: !fir.box> {fir.bindc_name = "mask"}) -> f32 { + %c2147483647_i32 = arith.constant 2147483647 : i32 + %c1_i32 = arith.constant 1 : i32 + %c0 = arith.constant 0 : index + %c0_i32 = arith.constant 0 : i32 + %c5_i32 = arith.constant 5 : i32 + %c5 = arith.constant 5 : index + %c1 = arith.constant 1 : index + %0 = fir.alloca i32 + %1 = fir.alloca !fir.array<1xi32> + %2 = fir.declare %arg1 {uniq_name = "_QFtestEmask"} : (!fir.box>) -> !fir.box> + %3 = fir.rebox %2 : (!fir.box>) -> !fir.box> + %4 = fir.alloca f32 {bindc_name = "test", uniq_name = "_QFtestEtest"} + %5 = fir.declare %4 {uniq_name = "_QFtestEtest"} : (!fir.ref) -> !fir.ref + %6 = fir.declare %arg0 {uniq_name = "_QFtestEx"} : (!fir.box>) -> !fir.box> + %7 = fir.rebox %6 : (!fir.box>) -> !fir.box> + %8 = fir.shape %c1 : (index) -> !fir.shape<1> + %9 = fir.array_coor %1(%8) %c1 : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + fir.store %c0_i32 to %9 : !fir.ref + fir.store %c0_i32 to %0 : !fir.ref + %10:3 = fir.box_dims %7, %c0 : (!fir.box>, index) -> (index, index, index) + %11 = arith.subi %10#1, %c1 : index + %12 = fir.do_loop %arg2 = %c0 to %11 step %c1 iter_args(%arg3 = %c2147483647_i32) -> (i32) 
{ + %18 = arith.addi %arg2, %c1 : index + %19 = fir.array_coor %3 %18 : (!fir.box>, index) -> !fir.ref + %20 = fir.load %19 : !fir.ref + %21 = arith.cmpi sge, %20, %c5_i32 : i32 + %22 = fir.if %21 -> (i32) { + fir.store %c1_i32 to %0 : !fir.ref + %23 = arith.subi %10#0, %c1 : index + %24 = arith.addi %18, %23 : index + %25 = fir.array_coor %7 %24 : (!fir.box>, index) -> !fir.ref + %26 = fir.load %25 : !fir.ref + %27 = arith.cmpi slt, %26, %arg3 : i32 + %28 = fir.if %27 -> (i32) { + %29 = fir.convert %18 : (index) -> i32 + fir.store %29 to %9 : !fir.ref + fir.result %26 : i32 + } else { + fir.result %arg3 : i32 + } + fir.result %28 : i32 + } else { + fir.result %arg3 : i32 + } + fir.result %22 : i32 + } + %13 = fir.load %0 : !fir.ref + %14 = arith.cmpi eq, %13, %c1_i32 : i32 + fir.if %14 { + %18 = arith.cmpi eq, %12, %c2147483647_i32 : i32 + fir.if %18 { + %19 = fir.array_coor %1(%8) %c0 : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + fir.store %c1_i32 to %19 : !fir.ref + } + } + %15 = fir.slice %c5, %c5, %c1 : (index, index, index) -> !fir.slice<1> + %16 = fir.rebox %7 [%15] : (!fir.box>, !fir.slice<1>) -> !fir.box> + fir.do_loop %arg2 = %c1 to %c1 step %c1 unordered { + %18 = fir.array_coor %1(%8) %arg2 : (!fir.ref>, !fir.shape<1>, index) -> !fir.ref + %19 = fir.load %18 : !fir.ref + %20 = fir.array_coor %16 %arg2 : (!fir.box>, index) -> !fir.ref + fir.store %19 to %20 : !fir.ref + } + %17 = fir.load %5 : !fir.ref + return %17 : f32 +} +// CHECK-LABEL: func @minloc +// CHECK: %[[V17:.*]] = fir.if %{{.*}} -> (i32) { +// CHECK: %[[V27:.*]] = fir.do_loop +// CHECK: fir.result %[[V27]] : i32 +// CHECK: } else { +// CHECK: %[[V23:.*]] = fir.do_loop +// CHECK: fir.result %[[V23]] : i32 +// CHECK: fir.if %{{.*}} { +// CHECK: {{.*}} = arith.cmpi eq, %[[V17]], %c2147483647_i32 + + } // End module From 06ca52e25226d406a3e384953abd12955f42ac84 Mon Sep 17 00:00:00 2001 From: Vincent Lee Date: Sat, 20 Jan 2024 14:22:31 -0800 Subject: [PATCH 259/843] [InlineOrder] Fix InlineOrder erase_if implementation (#78684) The InlineOrder Heap stores a CallBase ptr and InlineHistoryID pair. When running the `erase_if` method, InlineHistoryID is always returned with 0. Instead, we should be retrieving it from the `InlineHistoryMap` (similar to what is done in the `pop` implementation). This change is completely harmless because no one is using InlineHistoryID right now as part of the `erase_if` implementation which is currently only used in the ModuleInliner. 
--- llvm/lib/Analysis/InlineOrder.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Analysis/InlineOrder.cpp b/llvm/lib/Analysis/InlineOrder.cpp index d6acafdc6ab8b..09fc4f9a00f49 100644 --- a/llvm/lib/Analysis/InlineOrder.cpp +++ b/llvm/lib/Analysis/InlineOrder.cpp @@ -262,7 +262,7 @@ class PriorityInlineOrder : public InlineOrder> { void erase_if(function_ref Pred) override { auto PredWrapper = [=](CallBase *CB) -> bool { - return Pred(std::make_pair(CB, 0)); + return Pred(std::make_pair(CB, InlineHistoryMap[CB])); }; llvm::erase_if(Heap, PredWrapper); std::make_heap(Heap.begin(), Heap.end(), isLess); From 84cb8eaeeb108234be1c8498688a014b87fd431d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 20 Jan 2024 14:30:30 -0800 Subject: [PATCH 260/843] [Sema] Use llvm::is_contained (NFC) --- clang/lib/Sema/SemaDeclAttr.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index a482919356e1b..2a73567c2f051 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -9008,10 +9008,8 @@ static void handleArmNewAttr(Sema &S, Decl *D, const ParsedAttr &AL) { return; } - if (std::find(NewState.begin(), NewState.end(), StateName) == - NewState.end()) { // Avoid adding duplicates. + if (!llvm::is_contained(NewState, StateName)) // Avoid adding duplicates. NewState.push_back(StateName); - } } if (auto *FPT = dyn_cast(D->getFunctionType())) { From 81218356fde328112817075ccc0c17a6d698c664 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 20 Jan 2024 14:30:32 -0800 Subject: [PATCH 261/843] [Sparc] Use StringRef::starts_with_insensitive (NFC) --- .../Target/Sparc/AsmParser/SparcAsmParser.cpp | 20 +++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp index 54a7d92144568..7a956636831df 100644 --- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp +++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp @@ -1352,7 +1352,7 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, MCRegister &RegNo, return true; } - if (name.substr(0, 3).equals_insensitive("asr") && + if (name.starts_with_insensitive("asr") && !name.substr(3).getAsInteger(10, intVal) && intVal > 0 && intVal < 32) { RegNo = ASRRegs[intVal]; RegKind = SparcOperand::rk_Special; @@ -1421,7 +1421,7 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, MCRegister &RegNo, } // %fcc0 - %fcc3 - if (name.substr(0, 3).equals_insensitive("fcc") && + if (name.starts_with_insensitive("fcc") && !name.substr(3).getAsInteger(10, intVal) && intVal < 4) { // FIXME: check 64bit and handle %fcc1 - %fcc3 RegNo = Sparc::FCC0 + intVal; @@ -1430,40 +1430,40 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, MCRegister &RegNo, } // %g0 - %g7 - if (name.substr(0, 1).equals_insensitive("g") && + if (name.starts_with_insensitive("g") && !name.substr(1).getAsInteger(10, intVal) && intVal < 8) { RegNo = IntRegs[intVal]; RegKind = SparcOperand::rk_IntReg; return true; } // %o0 - %o7 - if (name.substr(0, 1).equals_insensitive("o") && + if (name.starts_with_insensitive("o") && !name.substr(1).getAsInteger(10, intVal) && intVal < 8) { RegNo = IntRegs[8 + intVal]; RegKind = SparcOperand::rk_IntReg; return true; } - if (name.substr(0, 1).equals_insensitive("l") && + if (name.starts_with_insensitive("l") && !name.substr(1).getAsInteger(10, intVal) && intVal < 8) { RegNo = 
IntRegs[16 + intVal]; RegKind = SparcOperand::rk_IntReg; return true; } - if (name.substr(0, 1).equals_insensitive("i") && + if (name.starts_with_insensitive("i") && !name.substr(1).getAsInteger(10, intVal) && intVal < 8) { RegNo = IntRegs[24 + intVal]; RegKind = SparcOperand::rk_IntReg; return true; } // %f0 - %f31 - if (name.substr(0, 1).equals_insensitive("f") && + if (name.starts_with_insensitive("f") && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 32) { RegNo = FloatRegs[intVal]; RegKind = SparcOperand::rk_FloatReg; return true; } // %f32 - %f62 - if (name.substr(0, 1).equals_insensitive("f") && + if (name.starts_with_insensitive("f") && !name.substr(1, 2).getAsInteger(10, intVal) && intVal >= 32 && intVal <= 62 && (intVal % 2 == 0)) { // FIXME: Check V9 @@ -1473,7 +1473,7 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, MCRegister &RegNo, } // %r0 - %r31 - if (name.substr(0, 1).equals_insensitive("r") && + if (name.starts_with_insensitive("r") && !name.substr(1, 2).getAsInteger(10, intVal) && intVal < 31) { RegNo = IntRegs[intVal]; RegKind = SparcOperand::rk_IntReg; @@ -1481,7 +1481,7 @@ bool SparcAsmParser::matchRegisterName(const AsmToken &Tok, MCRegister &RegNo, } // %c0 - %c31 - if (name.substr(0, 1).equals_insensitive("c") && + if (name.starts_with_insensitive("c") && !name.substr(1).getAsInteger(10, intVal) && intVal < 32) { RegNo = CoprocRegs[intVal]; RegKind = SparcOperand::rk_CoprocReg; From aa530c7d0091f9536485385e2c6fa7342d04afd5 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 20 Jan 2024 14:30:34 -0800 Subject: [PATCH 262/843] [Passes] Use a range-based for loop with llvm::successors (NFC) --- llvm/lib/Passes/StandardInstrumentations.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp index d467fe5c9a8e7..aea8acebcb830 100644 --- a/llvm/lib/Passes/StandardInstrumentations.cpp +++ b/llvm/lib/Passes/StandardInstrumentations.cpp @@ -2119,8 +2119,8 @@ DCData::DCData(const BasicBlock &B) { addSuccessorLabel(C.getCaseSuccessor()->getName().str(), Value); } } else - for (const_succ_iterator I = succ_begin(&B), E = succ_end(&B); I != E; ++I) - addSuccessorLabel((*I)->getName().str(), ""); + for (const BasicBlock *Succ : successors(&B)) + addSuccessorLabel(Succ->getName().str(), ""); } DotCfgChangeReporter::DotCfgChangeReporter(bool Verbose) From 851143608e2394487d849f56ea1d4b3d3d8f2ead Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 20 Jan 2024 14:30:36 -0800 Subject: [PATCH 263/843] [Frontend] Use SmallString::operator std::string (NFC) --- clang/lib/Frontend/CompilerInstance.cpp | 2 +- clang/lib/Frontend/CompilerInvocation.cpp | 4 ++-- clang/lib/Frontend/LogDiagnosticPrinter.cpp | 2 +- clang/lib/Frontend/Rewrite/FrontendActions.cpp | 2 +- clang/lib/Frontend/TextDiagnosticBuffer.cpp | 8 ++++---- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 6df5521b25cc0..c258870072613 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -543,7 +543,7 @@ std::string CompilerInstance::getSpecificModuleCachePath(StringRef ModuleHash) { SmallString<256> SpecificModuleCache(getHeaderSearchOpts().ModuleCachePath); if (!SpecificModuleCache.empty() && !getHeaderSearchOpts().DisableModuleHash) llvm::sys::path::append(SpecificModuleCache, ModuleHash); - return std::string(SpecificModuleCache.str()); + 
return std::string(SpecificModuleCache); } // ASTContext diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index b5e192d54465b..ed9cd1299eae2 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3176,7 +3176,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args, llvm::sys::fs::make_absolute(WorkingDir, P); } llvm::sys::path::remove_dots(P); - Opts.ModuleCachePath = std::string(P.str()); + Opts.ModuleCachePath = std::string(P); // Only the -fmodule-file== form. for (const auto *A : Args.filtered(OPT_fmodule_file)) { @@ -3217,7 +3217,7 @@ static bool ParseHeaderSearchArgs(HeaderSearchOptions &Opts, ArgList &Args, SmallString<32> Buffer; llvm::sys::path::append(Buffer, Opts.Sysroot, llvm::StringRef(A->getValue()).substr(1)); - Path = std::string(Buffer.str()); + Path = std::string(Buffer); } Opts.AddPath(Path, Group, IsFramework, diff --git a/clang/lib/Frontend/LogDiagnosticPrinter.cpp b/clang/lib/Frontend/LogDiagnosticPrinter.cpp index 32fc6cb2acd87..469d1c22633aa 100644 --- a/clang/lib/Frontend/LogDiagnosticPrinter.cpp +++ b/clang/lib/Frontend/LogDiagnosticPrinter.cpp @@ -134,7 +134,7 @@ void LogDiagnosticPrinter::HandleDiagnostic(DiagnosticsEngine::Level Level, // Format the message. SmallString<100> MessageStr; Info.FormatDiagnostic(MessageStr); - DE.Message = std::string(MessageStr.str()); + DE.Message = std::string(MessageStr); // Set the location information. DE.Filename = ""; diff --git a/clang/lib/Frontend/Rewrite/FrontendActions.cpp b/clang/lib/Frontend/Rewrite/FrontendActions.cpp index 92921bf6f3d7b..cf5a9437e89e6 100644 --- a/clang/lib/Frontend/Rewrite/FrontendActions.cpp +++ b/clang/lib/Frontend/Rewrite/FrontendActions.cpp @@ -88,7 +88,7 @@ class FixItRewriteToTemp : public FixItOptions { llvm::sys::fs::createTemporaryFile(llvm::sys::path::filename(Filename), llvm::sys::path::extension(Filename).drop_front(), fd, Path); - return std::string(Path.str()); + return std::string(Path); } }; } // end anonymous namespace diff --git a/clang/lib/Frontend/TextDiagnosticBuffer.cpp b/clang/lib/Frontend/TextDiagnosticBuffer.cpp index 90f273e65f88f..681bc25f46b83 100644 --- a/clang/lib/Frontend/TextDiagnosticBuffer.cpp +++ b/clang/lib/Frontend/TextDiagnosticBuffer.cpp @@ -32,20 +32,20 @@ void TextDiagnosticBuffer::HandleDiagnostic(DiagnosticsEngine::Level Level, "Diagnostic not handled during diagnostic buffering!"); case DiagnosticsEngine::Note: All.emplace_back(Level, Notes.size()); - Notes.emplace_back(Info.getLocation(), std::string(Buf.str())); + Notes.emplace_back(Info.getLocation(), std::string(Buf)); break; case DiagnosticsEngine::Warning: All.emplace_back(Level, Warnings.size()); - Warnings.emplace_back(Info.getLocation(), std::string(Buf.str())); + Warnings.emplace_back(Info.getLocation(), std::string(Buf)); break; case DiagnosticsEngine::Remark: All.emplace_back(Level, Remarks.size()); - Remarks.emplace_back(Info.getLocation(), std::string(Buf.str())); + Remarks.emplace_back(Info.getLocation(), std::string(Buf)); break; case DiagnosticsEngine::Error: case DiagnosticsEngine::Fatal: All.emplace_back(Level, Errors.size()); - Errors.emplace_back(Info.getLocation(), std::string(Buf.str())); + Errors.emplace_back(Info.getLocation(), std::string(Buf)); break; } } From 11b3b1085645f0819552eb8a17e1928f60570fce Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 20 Jan 2024 14:30:37 -0800 Subject: [PATCH 264/843] [Support] Use llvm::children and 
llvm::inverse_children (NFC) --- llvm/include/llvm/Support/GenericDomTree.h | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/Support/GenericDomTree.h b/llvm/include/llvm/Support/GenericDomTree.h index 62186a368e964..4fce9d8bfb2b6 100644 --- a/llvm/include/llvm/Support/GenericDomTree.h +++ b/llvm/include/llvm/Support/GenericDomTree.h @@ -850,17 +850,16 @@ class DominatorTreeBase { void Split(typename GraphTraits::NodeRef NewBB) { using GraphT = GraphTraits; using NodeRef = typename GraphT::NodeRef; - assert(std::distance(GraphT::child_begin(NewBB), - GraphT::child_end(NewBB)) == 1 && + assert(llvm::hasSingleElement(children(NewBB)) && "NewBB should have a single successor!"); NodeRef NewBBSucc = *GraphT::child_begin(NewBB); - SmallVector PredBlocks(children>(NewBB)); + SmallVector PredBlocks(inverse_children(NewBB)); assert(!PredBlocks.empty() && "No predblocks?"); bool NewBBDominatesNewBBSucc = true; - for (auto *Pred : children>(NewBBSucc)) { + for (auto *Pred : inverse_children(NewBBSucc)) { if (Pred != NewBB && !dominates(NewBBSucc, Pred) && isReachableFromEntry(Pred)) { NewBBDominatesNewBBSucc = false; From a464e05109088f1f3a0ca4c83d6dd900e83bdb4b Mon Sep 17 00:00:00 2001 From: XDeme <66138117+XDeme@users.noreply.github.com> Date: Sat, 20 Jan 2024 19:47:58 -0300 Subject: [PATCH 265/843] =?UTF-8?q?[clang-format]=20Handle=20templated=20e?= =?UTF-8?q?laborated=20type=20specifier=20in=20function=E2=80=A6=20(#77013?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit … return type. The behavior now is consistent with the non template version. Enabled and updated old test. --- clang/lib/Format/UnwrappedLineParser.cpp | 30 ++++++++++--------- clang/unittests/Format/FormatTest.cpp | 10 +++++-- clang/unittests/Format/TokenAnnotatorTest.cpp | 14 +++++++++ 3 files changed, 37 insertions(+), 17 deletions(-) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index c08ce86449b6e..7bc6410a78a49 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -3882,6 +3882,9 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) { const FormatToken &InitialToken = *FormatTok; nextToken(); + auto IsNonMacroIdentifier = [](const FormatToken *Tok) { + return Tok->is(tok::identifier) && Tok->TokenText != Tok->TokenText.upper(); + }; // The actual identifier can be a nested name specifier, and in macros // it is often token-pasted. // An [[attribute]] can be before the identifier. @@ -3903,27 +3906,26 @@ void UnwrappedLineParser::parseRecord(bool ParseAsExpr) { } if (FormatTok->is(tok::l_square) && handleCppAttributes()) continue; - bool IsNonMacroIdentifier = - FormatTok->is(tok::identifier) && - FormatTok->TokenText != FormatTok->TokenText.upper(); nextToken(); // We can have macros in between 'class' and the class name. - if (!IsNonMacroIdentifier && FormatTok->is(tok::l_paren)) + if (!IsNonMacroIdentifier(FormatTok->Previous) && + FormatTok->is(tok::l_paren)) { parseParens(); + } } - // Note that parsing away template declarations here leads to incorrectly - // accepting function declarations as record declarations. - // In general, we cannot solve this problem. Consider: - // class A B() {} - // which can be a function definition or a class definition when B() is a - // macro. 
If we find enough real-world cases where this is a problem, we - // can parse for the 'template' keyword in the beginning of the statement, - // and thus rule out the record production in case there is no template - // (this would still leave us with an ambiguity between template function - // and class declarations). if (FormatTok->isOneOf(tok::colon, tok::less)) { + int AngleNestingLevel = 0; do { + if (FormatTok->is(tok::less)) + ++AngleNestingLevel; + else if (FormatTok->is(tok::greater)) + --AngleNestingLevel; + + if (AngleNestingLevel == 0 && FormatTok->is(tok::l_paren) && + IsNonMacroIdentifier(FormatTok->Previous)) { + break; + } if (FormatTok->is(tok::l_brace)) { calculateBraceTypes(/*ExpectClassBody=*/true); if (!tryToParseBracedList()) diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index cab495125415a..3fb55ae2c1f41 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -14659,9 +14659,13 @@ TEST_F(FormatTest, UnderstandContextOfRecordTypeKeywords) { verifyFormat("template <> struct X < 15, i<3 && 42 < 50 && 33 < 28> {};"); verifyFormat("int i = SomeFunction(a b);"); - // FIXME: - // This now gets parsed incorrectly as class definition. - // verifyFormat("class A f() {\n}\nint n;"); + verifyFormat("class A f() {}\n" + "int n;"); + verifyFormat("template class A f() {}\n" + "int n;"); + + verifyFormat("template <> class Foo F() {\n" + "} n;"); // Elaborate types where incorrectly parsing the structural element would // break the indent. diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 64b2abac5cce5..b1e1e70a1abbf 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -2520,6 +2520,20 @@ TEST_F(TokenAnnotatorTest, BraceKind) { EXPECT_BRACE_KIND(Tokens[4], BK_Block); EXPECT_BRACE_KIND(Tokens[5], BK_Block); + Tokens = annotate("class Foo f() {}"); + ASSERT_EQ(Tokens.size(), 11u) << Tokens; + EXPECT_TOKEN(Tokens[5], tok::identifier, TT_FunctionDeclarationName); + EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_FunctionLBrace); + EXPECT_BRACE_KIND(Tokens[8], BK_Block); + EXPECT_BRACE_KIND(Tokens[9], BK_Block); + + Tokens = annotate("template class Foo f() {}"); + ASSERT_EQ(Tokens.size(), 16u) << Tokens; + EXPECT_TOKEN(Tokens[10], tok::identifier, TT_FunctionDeclarationName); + EXPECT_TOKEN(Tokens[13], tok::l_brace, TT_FunctionLBrace); + EXPECT_BRACE_KIND(Tokens[13], BK_Block); + EXPECT_BRACE_KIND(Tokens[14], BK_Block); + Tokens = annotate("void f() override {};"); ASSERT_EQ(Tokens.size(), 9u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::identifier, TT_FunctionDeclarationName); From 975deb366470e4943a5b73d8f3031ed54dec6e8b Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Sat, 20 Jan 2024 18:16:45 -0500 Subject: [PATCH 266/843] [libc] Add missing header ioctl.h on aarch64. 
(#78865) --- libc/config/linux/aarch64/headers.txt | 2 ++ libc/src/unistd/linux/CMakeLists.txt | 1 + 2 files changed, 3 insertions(+) diff --git a/libc/config/linux/aarch64/headers.txt b/libc/config/linux/aarch64/headers.txt index 6a4a4aaeb0a8d..709a8dbdedc98 100644 --- a/libc/config/linux/aarch64/headers.txt +++ b/libc/config/linux/aarch64/headers.txt @@ -21,4 +21,6 @@ set(TARGET_PUBLIC_HEADERS libc.include.threads libc.include.time libc.include.unistd + + libc.include.sys_ioctl ) diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index 2507e1e9a9387..ab9eca19e6508 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -226,6 +226,7 @@ add_entrypoint_object( ../isatty.h DEPENDS libc.include.unistd + libc.include.sys_ioctl libc.include.sys_syscall libc.src.__support.OSUtil.osutil libc.src.errno.errno From dedc7d4d362b8045c6810f8ca7f947bbdb63b7ec Mon Sep 17 00:00:00 2001 From: Jerry Wu Date: Sat, 20 Jan 2024 16:37:46 -0800 Subject: [PATCH 267/843] [mlir] Exclude masked ops in VectorDropLeadUnitDim (#76468) Don't insert cast ops for ops in `vector.mask` region in `VectorDropLeadUnitDim`. --- .../Transforms/VectorDropLeadUnitDim.cpp | 9 +++ .../vector-dropleadunitdim-transforms.mlir | 66 +++++++++++++++++++ 2 files changed, 75 insertions(+) diff --git a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp index 84294e4552a60..e1ed5d81625d8 100644 --- a/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp +++ b/mlir/lib/Dialect/Vector/Transforms/VectorDropLeadUnitDim.cpp @@ -223,6 +223,9 @@ struct CastAwayTransferReadLeadingOneDim LogicalResult matchAndRewrite(vector::TransferReadOp read, PatternRewriter &rewriter) const override { + // TODO(#78787): Not supported masked op yet. + if (cast(read.getOperation()).isMasked()) + return failure(); // TODO: support 0-d corner case. if (read.getTransferRank() == 0) return failure(); @@ -274,6 +277,9 @@ struct CastAwayTransferWriteLeadingOneDim LogicalResult matchAndRewrite(vector::TransferWriteOp write, PatternRewriter &rewriter) const override { + // TODO(#78787): Not supported masked op yet. + if (cast(write.getOperation()).isMasked()) + return failure(); // TODO: support 0-d corner case. if (write.getTransferRank() == 0) return failure(); @@ -325,6 +331,9 @@ struct CastAwayTransferWriteLeadingOneDim LogicalResult mlir::vector::castAwayContractionLeadingOneDim(vector::ContractionOp contractOp, RewriterBase &rewriter) { + // TODO(#78787): Not supported masked op yet. 
+ if (cast(contractOp.getOperation()).isMasked()) + return failure(); VectorType oldAccType = dyn_cast(contractOp.getAccType()); if (oldAccType == nullptr) return failure(); diff --git a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir index 71dffca8f14da..f601be0416814 100644 --- a/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir +++ b/mlir/test/Dialect/Vector/vector-dropleadunitdim-transforms.mlir @@ -164,6 +164,37 @@ func.func @cast_away_contraction_leading_one_dims_nonleadingunitdim_rank4_acctra return %0: vector<1x1x2x16xf32> } +// ----- + +// CHECK-DAG: #[[MAP0:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +// CHECK-DAG: #[[MAP1:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d3, d2)> +// CHECK-DAG: #[[MAP2:.*]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> + +// CHECK-LABEL: not_insert_cast_for_contraction_under_mask +// CHECK: %[[MASK:.+]] = vector.constant_mask +// CHECK: %[[CASTED_MASK:.+]] = vector.broadcast %[[MASK]] +// CHECK: %[[RET:.+]] = vector.mask %[[CASTED_MASK]] { +// CHECK-SAME: vector.contract {{.*}} : vector<1x16x8xf32>, vector<1x8x16xf32> into vector<1x16x16xf32> } +// CHECK: return %[[RET]] : vector<1x16x16xf32> + +#contraction_accesses0 = [ + affine_map<(l, i, j, k) -> (l, i, k)>, + affine_map<(l, i, j, k) -> (l, k, j)>, + affine_map<(l, i, j, k) -> (l, i, j)> +] +#contraction_trait0 = { + indexing_maps = #contraction_accesses0, + iterator_types = ["parallel", "parallel", "parallel", "reduction"] +} + +func.func @not_insert_cast_for_contraction_under_mask(%arg0: vector<1x16x8xf32>, %arg1: vector<1x8x16xf32>, %arg2: vector<1x16x16xf32>) -> vector<1x16x16xf32> { + %mask = vector.constant_mask [1, 15, 15, 8] : vector<1x16x16x8xi1> + %0 = vector.mask %mask { + vector.contract #contraction_trait0 %arg0, %arg1, %arg2 : vector<1x16x8xf32>, vector<1x8x16xf32> into vector<1x16x16xf32> + } : vector<1x16x16x8xi1> -> vector<1x16x16xf32> + return %0 : vector<1x16x16xf32> +} + // ----- // CHECK-LABEL: func @cast_away_extract_strided_slice_leading_one_dims func.func @cast_away_extract_strided_slice_leading_one_dims(%arg0: vector<1x8x8xf16>) -> vector<1x1x8xf16> { @@ -253,6 +284,24 @@ func.func @cast_away_nontrivial_map_masked_transfer_read(%arg0: memref<1x4x8xf16 // ----- +// CHECK-LABEL: func @not_insert_cast_fo4_transfer_read_under_mask +// CHECK: %[[MASK:.+]] = vector.constant_mask +// CHECK: %[[CASTED_MASK:.+]] = vector.broadcast %[[MASK]] +// CHECK: %[[RET:.+]] = vector.mask %[[CASTED_MASK]] { +// CHECK-SAME: vector.transfer_read {{.*}} : memref<1x1x4xf16>, vector<1x4xf16> } +// CHECK: return %[[RET]] : vector<1x4xf16> +func.func @not_insert_cast_fo4_transfer_read_under_mask(%arg0: memref<1x1x4xf16>) -> vector<1x4xf16> { + %c0 = arith.constant 0 : index + %f0 = arith.constant 0. 
: f16
+  %mask = vector.constant_mask [1, 3] : vector<1x4xi1>
+  %ret = vector.mask %mask {
+    vector.transfer_read %arg0[%c0, %c0, %c0], %f0 {in_bounds = [true, true]} : memref<1x1x4xf16>, vector<1x4xf16>
+  } : vector<1x4xi1> -> vector<1x4xf16>
+  return %ret: vector<1x4xf16>
+}
+
+// -----
+
 // CHECK-LABEL: func @cast_away_transfer_write_leading_one_dims
 func.func @cast_away_transfer_write_leading_one_dims(%arg0: memref<1x4x8x16xf16>, %arg1: vector<1x4xf16>) {
 // CHECK: %[[C0:.+]] = arith.constant 0 : index
@@ -286,6 +335,23 @@ func.func @cast_away_transfer_write_leading_one_dims_one_element(%arg0: memref<1

 // -----

+// CHECK-LABEL: func @not_insert_cast_for_transfer_write_under_mask
+// CHECK: %[[MASK:.+]] = vector.constant_mask
+// CHECK: %[[CASTED_MASK:.+]] = vector.broadcast %[[MASK]]
+// CHECK: vector.mask %[[CASTED_MASK]] {
+// CHECK-SAME: vector.transfer_write {{.*}} : vector<1x4xf16>, memref<1x1x4xf16> }
+// CHECK: return
+func.func @not_insert_cast_for_transfer_write_under_mask(%arg0: memref<1x1x4xf16>, %arg1: vector<1x4xf16>) {
+  %c0 = arith.constant 0 : index
+  %mask = vector.constant_mask [1, 3] : vector<1x4xi1>
+  vector.mask %mask {
+    vector.transfer_write %arg1, %arg0[%c0, %c0, %c0] {in_bounds = [true, true]} : vector<1x4xf16>, memref<1x1x4xf16>
+  } : vector<1x4xi1>
+  return
+}
+
+// -----
+
 // CHECK: #[[$MAP:.+]] = affine_map<(d0, d1, d2) -> (d1)>
 // CHECK-LABEL: func @cast_away_nontrivial_map_masked_transfer_write
 func.func @cast_away_nontrivial_map_masked_transfer_write(%arg0: memref<1x4x8xf16>, %arg1: vector<1x1x4xf16>, %arg2: vector<1x4x1xi1>) {

From 2759e47067ea286f6302adcfe93b653cfaf6f2eb Mon Sep 17 00:00:00 2001
From: Vassil Vassilev
Date: Sun, 21 Jan 2024 03:00:38 +0200
Subject: [PATCH 268/843] [clang-repl] We do not need to call new in the object allocation. (#78843)

This test demonstrates template instantiation via the interpreter code. In
order to do that, we can allocate the object on the stack and extend its
lifetime by boxing it into a clang::Value. That avoids the subtle problem
where we call the new operator on an object only known to the interpreter,
and we cannot destroy it from compiled code since there is no suitable
facility in clang::Value yet.

That should resolve the ASan issues that were reported in
llvm/llvm-project#76218.

--- clang/unittests/Interpreter/InterpreterTest.cpp | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/clang/unittests/Interpreter/InterpreterTest.cpp b/clang/unittests/Interpreter/InterpreterTest.cpp
index d6eb0684ba49d..e76c0677db5ea 100644
--- a/clang/unittests/Interpreter/InterpreterTest.cpp
+++ b/clang/unittests/Interpreter/InterpreterTest.cpp
@@ -34,12 +34,6 @@ using namespace clang;
 #define CLANG_INTERPRETER_NO_SUPPORT_EXEC
 #endif

-#if LLVM_ADDRESS_SANITIZER_BUILD || LLVM_HWADDRESS_SANITIZER_BUILD
-#include
-#else
-extern "C" void __lsan_ignore_object(const void *p) {}
-#endif
-
 int Global = 42;
 // JIT reports symbol not found on Windows without the visibility attribute.
 REPL_EXTERNAL_VISIBILITY int getGlobal() { return Global; }
@@ -257,7 +251,12 @@ TEST(IncrementalProcessing, FindMangledNameSymbol) {
 static Value AllocateObject(TypeDecl *TD, Interpreter &Interp) {
   std::string Name = TD->getQualifiedNameAsString();
   Value Addr;
-  cantFail(Interp.ParseAndExecute("new " + Name + "()", &Addr));
+  // FIXME: Consider providing an option in clang::Value to take ownership of
+  // the memory created from the interpreter.
+ // cantFail(Interp.ParseAndExecute("new " + Name + "()", &Addr)); + + // The lifetime of the temporary is extended by the clang::Value. + cantFail(Interp.ParseAndExecute(Name + "()", &Addr)); return Addr; } @@ -317,8 +316,6 @@ TEST(IncrementalProcessing, InstantiateTemplate) { auto fn = cantFail(Interp->getSymbolAddress(MangledName)).toPtr(); EXPECT_EQ(42, fn(NewA.getPtr())); - // FIXME: release the memory. - __lsan_ignore_object(NewA.getPtr()); } #ifdef CLANG_INTERPRETER_NO_SUPPORT_EXEC From 9b2c25c70466d6f081a2915e661840f965b6056a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 20 Jan 2024 18:57:29 -0800 Subject: [PATCH 269/843] [clang] Use SmallString::operator std::string (NFC) --- clang/lib/ARCMigrate/FileRemapper.cpp | 2 +- clang/lib/ARCMigrate/ObjCMT.cpp | 2 +- clang/lib/AST/Expr.cpp | 4 ++-- clang/lib/AST/Mangle.cpp | 2 +- clang/lib/Basic/FileManager.cpp | 2 +- clang/lib/CodeGen/CGCUDANV.cpp | 2 +- clang/lib/CrossTU/CrossTranslationUnit.cpp | 2 +- clang/lib/Lex/ModuleMap.cpp | 2 +- clang/lib/Tooling/ASTDiff/ASTDiff.cpp | 4 ++-- clang/lib/Tooling/CompilationDatabase.cpp | 2 +- .../Tooling/DependencyScanning/DependencyScanningWorker.cpp | 2 +- clang/lib/Tooling/Tooling.cpp | 2 +- clang/utils/TableGen/ClangAttrEmitter.cpp | 2 +- .../ClangCommentHTMLNamedCharacterReferenceEmitter.cpp | 2 +- clang/utils/TableGen/ClangDiagnosticsEmitter.cpp | 2 +- clang/utils/TableGen/MveEmitter.cpp | 2 +- 16 files changed, 18 insertions(+), 18 deletions(-) diff --git a/clang/lib/ARCMigrate/FileRemapper.cpp b/clang/lib/ARCMigrate/FileRemapper.cpp index 7abc862ceecc2..84024c3bafdca 100644 --- a/clang/lib/ARCMigrate/FileRemapper.cpp +++ b/clang/lib/ARCMigrate/FileRemapper.cpp @@ -43,7 +43,7 @@ std::string FileRemapper::getRemapInfoFile(StringRef outputDir) { assert(!outputDir.empty()); SmallString<128> InfoFile = outputDir; llvm::sys::path::append(InfoFile, "remap"); - return std::string(InfoFile.str()); + return std::string(InfoFile); } bool FileRemapper::initFromDisk(StringRef outputDir, DiagnosticsEngine &Diag, diff --git a/clang/lib/ARCMigrate/ObjCMT.cpp b/clang/lib/ARCMigrate/ObjCMT.cpp index ed363a46a2004..0786c81516b2d 100644 --- a/clang/lib/ARCMigrate/ObjCMT.cpp +++ b/clang/lib/ARCMigrate/ObjCMT.cpp @@ -2201,7 +2201,7 @@ static std::string applyEditsToTemp(FileEntryRef FE, TmpOut.write(NewText.data(), NewText.size()); TmpOut.close(); - return std::string(TempPath.str()); + return std::string(TempPath); } bool arcmt::getFileRemappingsFromFileList( diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp index ade403da65dd9..f1efa98e175ed 100644 --- a/clang/lib/AST/Expr.cpp +++ b/clang/lib/AST/Expr.cpp @@ -690,7 +690,7 @@ std::string PredefinedExpr::ComputeName(PredefinedIdentKind IK, if (!Buffer.empty() && Buffer.front() == '\01') return std::string(Buffer.substr(1)); - return std::string(Buffer.str()); + return std::string(Buffer); } return std::string(ND->getIdentifier()->getName()); } @@ -986,7 +986,7 @@ std::string FixedPointLiteral::getValueAsString(unsigned Radix) const { SmallString<64> S; FixedPointValueToString( S, llvm::APSInt::getUnsigned(getValue().getZExtValue()), Scale); - return std::string(S.str()); + return std::string(S); } void CharacterLiteral::print(unsigned Val, CharacterLiteralKind Kind, diff --git a/clang/lib/AST/Mangle.cpp b/clang/lib/AST/Mangle.cpp index d3a6b61fd2bec..30cff1ba2e6f3 100644 --- a/clang/lib/AST/Mangle.cpp +++ b/clang/lib/AST/Mangle.cpp @@ -464,7 +464,7 @@ class ASTNameGenerator::Implementation { SmallString<40> Mangled; auto Prefix = 
getClassSymbolPrefix(Kind, OCD->getASTContext()); llvm::Mangler::getNameWithPrefix(Mangled, Prefix + ClassName, DL); - return std::string(Mangled.str()); + return std::string(Mangled); }; return { diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index d16626b106521..974c8c22598f6 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -532,7 +532,7 @@ void FileManager::fillRealPathName(FileEntry *UFE, llvm::StringRef FileName) { // misleading. We need to clean up the interface here. makeAbsolutePath(AbsPath); llvm::sys::path::remove_dots(AbsPath, /*remove_dot_dot=*/true); - UFE->RealPathName = std::string(AbsPath.str()); + UFE->RealPathName = std::string(AbsPath); } llvm::ErrorOr> diff --git a/clang/lib/CodeGen/CGCUDANV.cpp b/clang/lib/CodeGen/CGCUDANV.cpp index 353370f1d761b..5b43272bfa62f 100644 --- a/clang/lib/CodeGen/CGCUDANV.cpp +++ b/clang/lib/CodeGen/CGCUDANV.cpp @@ -893,7 +893,7 @@ llvm::Function *CGNVCUDARuntime::makeModuleCtorFunction() { llvm::raw_svector_ostream OS(ModuleID); OS << ModuleIDPrefix << llvm::format("%" PRIx64, FatbinWrapper->getGUID()); llvm::Constant *ModuleIDConstant = makeConstantArray( - std::string(ModuleID.str()), "", ModuleIDSectionName, 32, /*AddNull=*/true); + std::string(ModuleID), "", ModuleIDSectionName, 32, /*AddNull=*/true); // Create an alias for the FatbinWrapper that nvcc will look for. llvm::GlobalAlias::create(llvm::GlobalValue::ExternalLinkage, diff --git a/clang/lib/CrossTU/CrossTranslationUnit.cpp b/clang/lib/CrossTU/CrossTranslationUnit.cpp index 94c10e50d7d06..986470042bd83 100644 --- a/clang/lib/CrossTU/CrossTranslationUnit.cpp +++ b/clang/lib/CrossTU/CrossTranslationUnit.cpp @@ -247,7 +247,7 @@ CrossTranslationUnitContext::getLookupName(const NamedDecl *ND) { bool Ret = index::generateUSRForDecl(ND, DeclUSR); if (Ret) return {}; - return std::string(DeclUSR.str()); + return std::string(DeclUSR); } /// Recursively visits the decls of a DeclContext, and returns one with the diff --git a/clang/lib/Lex/ModuleMap.cpp b/clang/lib/Lex/ModuleMap.cpp index 42d55d09ea5a1..afb2948f05ae5 100644 --- a/clang/lib/Lex/ModuleMap.cpp +++ b/clang/lib/Lex/ModuleMap.cpp @@ -308,7 +308,7 @@ void ModuleMap::resolveHeader(Module *Mod, setUmbrellaHeaderAsWritten(Mod, *File, Header.FileName, RelativePathName.str()); } else { - Module::Header H = {Header.FileName, std::string(RelativePathName.str()), + Module::Header H = {Header.FileName, std::string(RelativePathName), *File}; addHeader(Mod, H, headerKindToRole(Header.Kind)); } diff --git a/clang/lib/Tooling/ASTDiff/ASTDiff.cpp b/clang/lib/Tooling/ASTDiff/ASTDiff.cpp index 356b4bd5a1b85..5f7153cd53ac2 100644 --- a/clang/lib/Tooling/ASTDiff/ASTDiff.cpp +++ b/clang/lib/Tooling/ASTDiff/ASTDiff.cpp @@ -453,12 +453,12 @@ std::string SyntaxTree::Impl::getStmtValue(const Stmt *S) const { if (auto *I = dyn_cast(S)) { SmallString<256> Str; I->getValue().toString(Str, /*Radix=*/10, /*Signed=*/false); - return std::string(Str.str()); + return std::string(Str); } if (auto *F = dyn_cast(S)) { SmallString<256> Str; F->getValue().toString(Str); - return std::string(Str.str()); + return std::string(Str); } if (auto *D = dyn_cast(S)) return getRelativeName(D->getDecl(), getEnclosingDeclContext(AST, S)); diff --git a/clang/lib/Tooling/CompilationDatabase.cpp b/clang/lib/Tooling/CompilationDatabase.cpp index 9d8f0d03a7d5c..af18194ae0fe4 100644 --- a/clang/lib/Tooling/CompilationDatabase.cpp +++ b/clang/lib/Tooling/CompilationDatabase.cpp @@ -216,7 +216,7 @@ std::string 
GetClangToolCommand() { SmallString<128> ClangToolPath; ClangToolPath = llvm::sys::path::parent_path(ClangExecutable); llvm::sys::path::append(ClangToolPath, "clang-tool"); - return std::string(ClangToolPath.str()); + return std::string(ClangToolPath); } } // namespace diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp index c54e6d523800b..7ab4a699af6df 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp @@ -103,7 +103,7 @@ static void visitPrebuiltModule(StringRef PrebuiltModuleFilename, static std::string makeObjFileName(StringRef FileName) { SmallString<128> ObjFileName(FileName); llvm::sys::path::replace_extension(ObjFileName, "o"); - return std::string(ObjFileName.str()); + return std::string(ObjFileName); } /// Deduce the dependency target based on the output file and input files. diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index d82cd5e886e46..c5c3cdb47e92e 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -261,7 +261,7 @@ llvm::Expected getAbsolutePath(llvm::vfs::FileSystem &FS, if (auto EC = FS.makeAbsolute(AbsolutePath)) return llvm::errorCodeToError(EC); llvm::sys::path::native(AbsolutePath); - return std::string(AbsolutePath.str()); + return std::string(AbsolutePath); } std::string getAbsolutePath(StringRef File) { diff --git a/clang/utils/TableGen/ClangAttrEmitter.cpp b/clang/utils/TableGen/ClangAttrEmitter.cpp index 3888e6c08ab0f..89b88e386f257 100644 --- a/clang/utils/TableGen/ClangAttrEmitter.cpp +++ b/clang/utils/TableGen/ClangAttrEmitter.cpp @@ -1919,7 +1919,7 @@ struct AttributeSubjectMatchRule { } if (isAbstractRule()) Result += "_abstract"; - return std::string(Result.str()); + return std::string(Result); } std::string getEnumValue() const { return "attr::" + getEnumValueName(); } diff --git a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp index 47b871afdeebe..f1cd9af0519d1 100644 --- a/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp +++ b/clang/utils/TableGen/ClangCommentHTMLNamedCharacterReferenceEmitter.cpp @@ -66,7 +66,7 @@ void clang::EmitClangCommentHTMLNamedCharacterReferences(RecordKeeper &Records, } CLiteral.append(";"); - StringMatcher::StringPair Match(Spelling, std::string(CLiteral.str())); + StringMatcher::StringPair Match(Spelling, std::string(CLiteral)); NameToUTF8.push_back(Match); } diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index a8a80ed882bde..480c7c83f5f8e 100644 --- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -1343,7 +1343,7 @@ static std::string getDiagCategoryEnum(llvm::StringRef name) { SmallString<256> enumName = llvm::StringRef("DiagCat_"); for (llvm::StringRef::iterator I = name.begin(), E = name.end(); I != E; ++I) enumName += isalnum(*I) ? *I : '_'; - return std::string(enumName.str()); + return std::string(enumName); } /// Emit the array of diagnostic subgroups. 
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index f0bd0865c1c8b..496cb10d14f2d 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -896,7 +896,7 @@ class ACLEIntrinsic { llvm::APInt i = iOrig.trunc(64); SmallString<40> s; i.toString(s, 16, true, true); - return std::string(s.str()); + return std::string(s); } std::string genSema() const { From 24790a778822e14abcb09c696bf5113c9a23bf81 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 20 Jan 2024 18:57:31 -0800 Subject: [PATCH 270/843] [Hexagon] Use llvm::children (NFC) --- llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp index f7227dca3b603..cb820e2158992 100644 --- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp +++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp @@ -601,8 +601,6 @@ bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B, // Visit all dominated blocks from the same loop first, then process B. MachineDomTreeNode *N = MDT->getNode(B); - using GTN = GraphTraits; - // We will change CFG/DT during this traversal, so take precautions to // avoid problems related to invalidated iterators. In fact, processing // a child C of B cannot cause another child to be removed, but it can @@ -611,7 +609,7 @@ bool HexagonEarlyIfConversion::visitBlock(MachineBasicBlock *B, // prior to processing B, so there is no need to process it again. // Simply keep a list of children of B, and traverse that list. using DTNodeVectType = SmallVector; - DTNodeVectType Cn(GTN::child_begin(N), GTN::child_end(N)); + DTNodeVectType Cn(llvm::children(N)); for (auto &I : Cn) { MachineBasicBlock *SB = I->getBlock(); if (!Deleted.count(SB)) From 1ce5a80d08ad68f53e04d4b7c413d0dbb0065991 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Sat, 20 Jan 2024 18:57:33 -0800 Subject: [PATCH 271/843] [Mips] Use MachineBasicBlock::pred_size (NFC) --- llvm/lib/Target/Mips/MipsAsmPrinter.cpp | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp index 30ff82dd911cf..718844bc36ff9 100644 --- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp +++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp @@ -491,10 +491,7 @@ bool MipsAsmPrinter::isBlockOnlyReachableByFallthrough(const MachineBasicBlock* return false; // If there isn't exactly one predecessor, it can't be a fall through. - MachineBasicBlock::const_pred_iterator PI = MBB->pred_begin(), PI2 = PI; - ++PI2; - - if (PI2 != MBB->pred_end()) + if (MBB->pred_size() != 1) return false; // The predecessor has to be immediately before this block. 
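The last two cleanups above replace manual iterator bookkeeping with direct
queries. Here is a minimal standalone sketch of the Mips change, in which a
plain struct with a vector of predecessors stands in for MachineBasicBlock
(the names are illustrative only, not the LLVM API); it shows that the old
advance-and-compare idiom and a direct pred_size() check agree.

#include <cassert>
#include <cstddef>
#include <vector>

struct Block {
  std::vector<const Block *> Preds; // stand-in for the predecessor list
  auto pred_begin() const { return Preds.begin(); }
  auto pred_end() const { return Preds.end(); }
  std::size_t pred_size() const { return Preds.size(); }
};

// Old style: advance a copy of the first iterator and compare with end().
bool hasSinglePredOld(const Block &B) {
  auto PI = B.pred_begin();
  if (PI == B.pred_end()) // no predecessors at all
    return false;
  auto PI2 = PI;
  ++PI2;
  return PI2 == B.pred_end();
}

// New style: ask for the size directly.
bool hasSinglePredNew(const Block &B) { return B.pred_size() == 1; }

int main() {
  Block A, C;
  Block B{{&A}};     // exactly one predecessor
  Block D{{&A, &C}}; // two predecessors
  assert(hasSinglePredOld(B) && hasSinglePredNew(B));
  assert(!hasSinglePredOld(D) && !hasSinglePredNew(D));
  assert(!hasSinglePredNew(A)); // zero predecessors
  return 0;
}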
From f523a5522be221e4657daa111431106083e1c053 Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sat, 20 Jan 2024 18:57:35 -0800
Subject: [PATCH 272/843] [Sema] Use llvm::all_of (NFC)

--- clang/lib/Sema/Sema.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp
index 70832d1a14552..2d4e6d1d058cd 100644
--- a/clang/lib/Sema/Sema.cpp
+++ b/clang/lib/Sema/Sema.cpp
@@ -2758,7 +2758,7 @@ bool Sema::isDeclaratorFunctionLike(Declarator &D) {
     return false;
   LookupQualifiedName(LR, DC);
-  bool Result = std::all_of(LR.begin(), LR.end(), [](Decl *Dcl) {
+  bool Result = llvm::all_of(LR, [](Decl *Dcl) {
     if (NamedDecl *ND = dyn_cast(Dcl)) {
       ND = ND->getUnderlyingDecl();
       return isa(ND) || isa(ND) ||

From bb6564a1b59eaaafbce46b519a45fc9d9e4f8f8a Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Sat, 20 Jan 2024 18:57:36 -0800
Subject: [PATCH 273/843] [TableGen] Use StringRef::consume_front (NFC)

--- llvm/utils/TableGen/GlobalISel/CodeExpander.cpp | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/llvm/utils/TableGen/GlobalISel/CodeExpander.cpp b/llvm/utils/TableGen/GlobalISel/CodeExpander.cpp
index 20f98bef4887c..b0baf194bafeb 100644
--- a/llvm/utils/TableGen/GlobalISel/CodeExpander.cpp
+++ b/llvm/utils/TableGen/GlobalISel/CodeExpander.cpp
@@ -31,9 +31,8 @@ void CodeExpander::emit(raw_ostream &OS) const {
     OS << Current.substr(0, Pos);
     Current = Current.substr(Pos);

-    if (Current.starts_with("\n")) {
+    if (Current.consume_front("\n")) {
       OS << "\n" << Indent;
-      Current = Current.drop_front(1);
       continue;
     }

@@ -43,10 +42,8 @@ void CodeExpander::emit(raw_ostream &OS) const {
       continue;
     }

-    if (Current.starts_with("\\")) {
-      Current = Current.drop_front(1);
+    if (Current.consume_front("\\"))
       continue;
-    }

     if (Current.starts_with("${")) {
       StringRef StartVar = Current;

From 39f1ca522b023e77c4dca6332d8b9c6366d0eab9 Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sun, 21 Jan 2024 03:33:15 +0000
Subject: [PATCH 274/843] [MLGO] Remove absl dep from libraries

The library files in the new mlgo-utils utilities folder only depend on
absl.logging. The built-in Python logging library is a direct drop-in
replacement here, so we can just change the import and drop the test
dependency.
--- .../mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py | 3 +-- llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py | 3 +-- .../mlgo-utils/tests/corpus/combine_training_corpus_test.py | 2 +- llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py | 2 +- llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py | 2 +- 5 files changed, 5 insertions(+), 7 deletions(-) diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py index e2ae8699ec318..5d8c78aa95a9b 100644 --- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py +++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus_lib.py @@ -6,8 +6,7 @@ import os import json import glob - -from absl import logging +import logging _FILE_NAME = "corpus_description.json" diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py index 9c828ce1eb631..8e9779c6257f1 100644 --- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py +++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir_lib.py @@ -11,11 +11,10 @@ import multiprocessing import functools import json +import logging from typing import Dict, List, Optional -from absl import logging - _UNSPECIFIED_OVERRIDE = [""] diff --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py index dbfdc78a87533..4b36bf7742312 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py +++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py @@ -1,4 +1,4 @@ -# REQUIRES: python-38, absl +# REQUIRES: python-38 ## Test the functionality of combine_training_corpus_lib diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py index 3ed52ec21de95..b22960556c99e 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py +++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py @@ -1,4 +1,4 @@ -# REQUIRES: python-38, absl +# REQUIRES: python-38 ## Test the functionality of extract_ir_lib diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py index 0f970414b1aec..d8b8ba1e67c37 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py +++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py @@ -1,4 +1,4 @@ -# REQUIRES: python-38, absl +# REQUIRES: python-38 ## Test the functionality of make_corpus_lib From 8e99a63899c85972ec1ce9dbfc7e4cc01eddcdb0 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sun, 21 Jan 2024 03:37:18 +0000 Subject: [PATCH 275/843] [Github] Update paths for mlgo PR subscribers This patch updates the paths in the PR labelling config for the MLGO label. In particular, the path for the new mlgo-utils subfolder under llvm/utils has been added and two other files that were missed in the original introduction. 
--- .github/new-prs-labeler.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/new-prs-labeler.yml b/.github/new-prs-labeler.yml index 9b975d051b99e..b42bcb1887fb6 100644 --- a/.github/new-prs-labeler.yml +++ b/.github/new-prs-labeler.yml @@ -594,13 +594,16 @@ mlgo: - llvm/include/llvm/Analysis/*Runner.h - llvm/unittests/Analysis/ML* - llvm/lib/Analysis/FunctionPropertiesAnalysis.cpp + - llvm/lib/Analysis/TrainingLogger.cpp - llvm/include/llvm/Analysis/FunctionPropertiesAnalysis.h + - llvm/include/llvm/Analysis/Utils/TrainingLogger.h - llvm/test/Analysis/FunctionPropertiesAnalysis/* - llvm/unittests/Analysis/FunctionPropertiesAnalysisTest.cpp - llvm/test/Transforms/inline/ML/** - llvm/lib/CodeGen/ML* - llvm/unittests/CodeGen/ML* - llvm/test/CodeGen/MLRegAlloc/** + - llvm/utils/mlgo-utils/* tools:llvm-exegesis: - llvm/tools/llvm-exegesis/** From a70d3101ba786b76f1796c2c2ac5fe469e9a57bd Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sun, 21 Jan 2024 03:54:25 +0000 Subject: [PATCH 276/843] [MLGO] Disable mlgo-utils tests on Windows builders This patch disables the mlgo-utils tests on Windows builders. MLGO is not currently supported on Windows. These tests were failing as some of them look for specific file paths and the path conventions are different between Linux and Windows. --- .../mlgo-utils/tests/corpus/combine_training_corpus_test.py | 2 +- llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py | 2 +- llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py index 4b36bf7742312..4965a63ac2ae7 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py +++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_test.py @@ -1,4 +1,4 @@ -# REQUIRES: python-38 +# REQUIRES: python-38, system-linux ## Test the functionality of combine_training_corpus_lib diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py index b22960556c99e..5ee0762a551a5 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py +++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_test.py @@ -1,4 +1,4 @@ -# REQUIRES: python-38 +# REQUIRES: python-38, system-linux ## Test the functionality of extract_ir_lib diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py index d8b8ba1e67c37..0a2246f4572d3 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py +++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_test.py @@ -1,4 +1,4 @@ -# REQUIRES: python-38 +# REQUIRES: python-38, system-linux ## Test the functionality of make_corpus_lib From 6bb5c989bd725deb3f96d4374541b5aeec776ba2 Mon Sep 17 00:00:00 2001 From: ZhangYin Date: Sun, 21 Jan 2024 12:48:30 +0800 Subject: [PATCH 277/843] [libc++] Add load constructor for class simd/simd_mask (#76610) --- libcxx/docs/Status/ParallelismProjects.csv | 2 + libcxx/include/experimental/__simd/scalar.h | 7 ++ libcxx/include/experimental/__simd/simd.h | 6 ++ .../include/experimental/__simd/simd_mask.h | 6 ++ libcxx/include/experimental/__simd/vec_ext.h | 11 +++ .../simd/simd.class/simd_ctor_load.pass.cpp | 92 +++++++++++++++++++ .../simd_mask_ctor_load.pass.cpp | 67 ++++++++++++++ .../test/std/experimental/simd/test_utils.h | 12 +++ 8 files changed, 203 insertions(+) create mode 100644 
libcxx/test/std/experimental/simd/simd.class/simd_ctor_load.pass.cpp create mode 100644 libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_ctor_load.pass.cpp diff --git a/libcxx/docs/Status/ParallelismProjects.csv b/libcxx/docs/Status/ParallelismProjects.csv index 67203e8d75152..06da008ac5fe9 100644 --- a/libcxx/docs/Status/ParallelismProjects.csv +++ b/libcxx/docs/Status/ParallelismProjects.csv @@ -22,6 +22,7 @@ Section,Description,Dependencies,Assignee,Complete | `[parallel.simd.class] `_, "`simd broadcast constructor `_", None, Yin Zhang, |Complete| | `[parallel.simd.class] `_, "`simd implicit type conversion constructor `_", None, Yin Zhang, |Complete| | `[parallel.simd.class] `_, "`simd generate constructor `_", None, Yin Zhang, |Complete| +| `[parallel.simd.class] `_, "`simd load constructor `_", None, Yin Zhang, |Complete| | `[parallel.simd.class] `_, "`simd subscript operators `_", None, Yin Zhang, |Complete| | `[parallel.simd.class] `_, "Class template simd implementation", None, Yin Zhang, |In Progress| | `[parallel.simd.nonmembers] `_, "simd non-member operations", None, Yin Zhang, |In Progress| @@ -30,6 +31,7 @@ Section,Description,Dependencies,Assignee,Complete | `[parallel.simd.mask.class] `_, "`simd_mask default constructor `_", None, Yin Zhang, |Complete| | `[parallel.simd.mask.class] `_, "`simd_mask broadcast constructor `_", None, Yin Zhang, |Complete| | `[parallel.simd.mask.class] `_, "`simd_mask implicit type conversion constructor `_", None, Yin Zhang, |Complete| +| `[parallel.simd.mask.class] `_, "`simd_mask load constructor `_", None, Yin Zhang, |Complete| | `[parallel.simd.mask.class] `_, "`simd_mask subscript operators `_", None, Yin Zhang, |Complete| | `[parallel.simd.mask.class] `_, "Class template simd_mask implementation", None, Yin Zhang, |In Progress| | `[parallel.simd.mask.nonmembers] `_, "simd_mask non-member operations", None, Yin Zhang, |In Progress| diff --git a/libcxx/include/experimental/__simd/scalar.h b/libcxx/include/experimental/__simd/scalar.h index 5eeff4c1e82a3..717fd6cd92d71 100644 --- a/libcxx/include/experimental/__simd/scalar.h +++ b/libcxx/include/experimental/__simd/scalar.h @@ -56,6 +56,11 @@ struct __simd_operations<_Tp, simd_abi::__scalar> { static _LIBCPP_HIDE_FROM_ABI _SimdStorage __generate(_Generator&& __g) noexcept { return {__g(std::integral_constant())}; } + + template + static _LIBCPP_HIDE_FROM_ABI void __load(_SimdStorage& __s, const _Up* __mem) noexcept { + __s.__data = static_cast<_Tp>(__mem[0]); + } }; template @@ -63,6 +68,8 @@ struct __mask_operations<_Tp, simd_abi::__scalar> { using _MaskStorage = __mask_storage<_Tp, simd_abi::__scalar>; static _LIBCPP_HIDE_FROM_ABI _MaskStorage __broadcast(bool __v) noexcept { return {__v}; } + + static _LIBCPP_HIDE_FROM_ABI void __load(_MaskStorage& __s, const bool* __mem) noexcept { __s.__data = __mem[0]; } }; } // namespace parallelism_v2 diff --git a/libcxx/include/experimental/__simd/simd.h b/libcxx/include/experimental/__simd/simd.h index c345811fee7fc..db4ebb8e4a381 100644 --- a/libcxx/include/experimental/__simd/simd.h +++ b/libcxx/include/experimental/__simd/simd.h @@ -64,6 +64,12 @@ class simd { explicit _LIBCPP_HIDE_FROM_ABI simd(_Generator&& __g) noexcept : __s_(_Impl::__generate(std::forward<_Generator>(__g))) {} + // load constructor + template && is_simd_flag_type_v<_Flags>, int> = 0> + _LIBCPP_HIDE_FROM_ABI simd(const _Up* __mem, _Flags) { + _Impl::__load(__s_, _Flags::template __apply(__mem)); + } + // scalar access [simd.subscr] _LIBCPP_HIDE_FROM_ABI reference 
operator[](size_t __i) noexcept { return reference(__s_, __i); } _LIBCPP_HIDE_FROM_ABI value_type operator[](size_t __i) const noexcept { return __s_.__get(__i); } diff --git a/libcxx/include/experimental/__simd/simd_mask.h b/libcxx/include/experimental/__simd/simd_mask.h index db03843b46e3a..754db7992683b 100644 --- a/libcxx/include/experimental/__simd/simd_mask.h +++ b/libcxx/include/experimental/__simd/simd_mask.h @@ -52,6 +52,12 @@ class simd_mask { } } + // load constructor + template , int> = 0> + _LIBCPP_HIDE_FROM_ABI simd_mask(const value_type* __mem, _Flags) { + _Impl::__load(__s_, _Flags::template __apply(__mem)); + } + // scalar access [simd.mask.subscr] _LIBCPP_HIDE_FROM_ABI reference operator[](size_t __i) noexcept { return reference(__s_, __i); } _LIBCPP_HIDE_FROM_ABI value_type operator[](size_t __i) const noexcept { return __s_.__get(__i); } diff --git a/libcxx/include/experimental/__simd/vec_ext.h b/libcxx/include/experimental/__simd/vec_ext.h index 07ba032f493b1..7883132ba6c0d 100644 --- a/libcxx/include/experimental/__simd/vec_ext.h +++ b/libcxx/include/experimental/__simd/vec_ext.h @@ -73,6 +73,12 @@ struct __simd_operations<_Tp, simd_abi::__vec_ext<_Np>> { static _LIBCPP_HIDE_FROM_ABI _SimdStorage __generate(_Generator&& __g) noexcept { return __generate_init(std::forward<_Generator>(__g), std::make_index_sequence<_Np>()); } + + template + static _LIBCPP_HIDE_FROM_ABI void __load(_SimdStorage& __s, const _Up* __mem) noexcept { + for (size_t __i = 0; __i < _Np; __i++) + __s.__data[__i] = static_cast<_Tp>(__mem[__i]); + } }; template @@ -87,6 +93,11 @@ struct __mask_operations<_Tp, simd_abi::__vec_ext<_Np>> { } return __result; } + + static _LIBCPP_HIDE_FROM_ABI void __load(_MaskStorage& __s, const bool* __mem) noexcept { + for (size_t __i = 0; __i < _Np; __i++) + __s.__data[__i] = experimental::__set_all_bits<_Tp>(__mem[__i]); + } }; } // namespace parallelism_v2 diff --git a/libcxx/test/std/experimental/simd/simd.class/simd_ctor_load.pass.cpp b/libcxx/test/std/experimental/simd/simd.class/simd_ctor_load.pass.cpp new file mode 100644 index 0000000000000..3992f3e450cb2 --- /dev/null +++ b/libcxx/test/std/experimental/simd/simd.class/simd_ctor_load.pass.cpp @@ -0,0 +1,92 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// +// +// [simd.class] +// template simd(const U* mem, Flags); + +#include "../test_utils.h" + +namespace ex = std::experimental::parallelism_v2; + +template +struct ElementAlignedLoadCtorHelper { + template + void operator()() const { + U buffer[array_size]; + for (size_t i = 0; i < array_size; ++i) + buffer[i] = static_cast(i); + ex::simd origin_simd(buffer, ex::element_aligned_tag()); + assert_simd_values_equal(origin_simd, buffer); + } +}; + +template +struct VectorAlignedLoadCtorHelper { + template + void operator()() const { + alignas(ex::memory_alignment_v, U>) U buffer[array_size]; + for (size_t i = 0; i < array_size; ++i) + buffer[i] = static_cast(i); + ex::simd origin_simd(buffer, ex::vector_aligned_tag()); + assert_simd_values_equal(origin_simd, buffer); + } +}; + +template +struct OveralignedLoadCtorHelper { + template + void operator()() const { + alignas(bit_ceil(sizeof(U) + 1)) U buffer[array_size]; + for (size_t i = 0; i < array_size; ++i) + buffer[i] = static_cast(i); + ex::simd origin_simd(buffer, ex::overaligned_tag()); + assert_simd_values_equal(origin_simd, buffer); + } +}; + +template +struct CheckSimdLoadCtor { + template + void operator()() { + constexpr std::size_t array_size = ex::simd_size_v; + + types::for_each(arithmetic_no_bool_types(), ElementAlignedLoadCtorHelper()); + types::for_each(arithmetic_no_bool_types(), VectorAlignedLoadCtorHelper()); + types::for_each(arithmetic_no_bool_types(), OveralignedLoadCtorHelper()); + } +}; + +template +struct CheckLoadCtorTraits { + template + void operator()() { + // This function shall not participate in overload resolution unless + // is_simd_flag_type_v is true, and + // U is a vectorizable type. + static_assert(std::is_constructible_v, const int*, ex::element_aligned_tag>); + + // is_simd_flag_type_v is false + static_assert(!std::is_constructible_v, const int*, T>); + static_assert(!std::is_constructible_v, const int*, SimdAbi>); + + // U is not a vectorizable type. + static_assert(!std::is_constructible_v, const SimdAbi*, ex::element_aligned_tag>); + static_assert( + !std::is_constructible_v, const ex::element_aligned_tag*, ex::element_aligned_tag>); + } +}; + +int main(int, char**) { + test_all_simd_abi(); + test_all_simd_abi(); + return 0; +} diff --git a/libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_ctor_load.pass.cpp b/libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_ctor_load.pass.cpp new file mode 100644 index 0000000000000..e59c38a2e6c4b --- /dev/null +++ b/libcxx/test/std/experimental/simd/simd.mask.class/simd_mask_ctor_load.pass.cpp @@ -0,0 +1,67 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14 + +// +// +// [simd.class] +// template simd_mask(const value_type* mem, Flags); + +#include "../test_utils.h" + +namespace ex = std::experimental::parallelism_v2; + +template +struct CheckSimdMaskLoadCtor { + template + void operator()() { + constexpr std::size_t array_size = ex::simd_size_v; + + // element aligned tag + bool element_buffer[array_size]; + for (size_t i = 0; i < array_size; ++i) + element_buffer[i] = static_cast(i % 2); + ex::simd_mask element_mask(element_buffer, ex::element_aligned_tag()); + assert_simd_mask_values_equal(element_mask, element_buffer); + + // vector aligned tag + alignas(ex::memory_alignment_v>) bool vector_buffer[array_size]; + for (size_t i = 0; i < array_size; ++i) + vector_buffer[i] = static_cast(i % 2); + ex::simd_mask vector_mask(vector_buffer, ex::vector_aligned_tag()); + assert_simd_mask_values_equal(vector_mask, vector_buffer); + + // overaligned tag + alignas(bit_ceil(sizeof(bool) + 1)) bool overaligned_buffer[array_size]; + for (size_t i = 0; i < array_size; ++i) + overaligned_buffer[i] = static_cast(i % 2); + ex::simd_mask overaligned_mask(overaligned_buffer, ex::overaligned_tag()); + assert_simd_mask_values_equal(overaligned_mask, overaligned_buffer); + } +}; + +template +struct CheckMaskLoadCtorTraits { + template + void operator()() { + // This function shall not participate in overload resolution unless + // is_simd_flag_type_v is true + static_assert(std::is_constructible_v, const bool*, ex::element_aligned_tag>); + + // is_simd_flag_type_v is false + static_assert(!std::is_constructible_v, const bool*, T>); + static_assert(!std::is_constructible_v, const bool*, SimdAbi>); + } +}; + +int main(int, char**) { + test_all_simd_abi(); + test_all_simd_abi(); + return 0; +} diff --git a/libcxx/test/std/experimental/simd/test_utils.h b/libcxx/test/std/experimental/simd/test_utils.h index 30a48521aa589..b3679b51e50b5 100644 --- a/libcxx/test/std/experimental/simd/test_utils.h +++ b/libcxx/test/std/experimental/simd/test_utils.h @@ -79,4 +79,16 @@ void assert_simd_mask_values_equal(const ex::simd_mask& origin_mask, assert(origin_mask[i] == expected_value[i]); } +template +void assert_simd_values_equal(const ex::simd& origin_simd, U* expected_value) { + for (size_t i = 0; i < origin_simd.size(); ++i) + assert(origin_simd[i] == static_cast(expected_value[i])); +} + +template +void assert_simd_mask_values_equal(const ex::simd_mask& origin_mask, bool* expected_value) { + for (size_t i = 0; i < origin_mask.size(); ++i) + assert(origin_mask[i] == expected_value[i]); +} + #endif // LIBCXX_TEST_STD_EXPERIMENTAL_SIMD_TEST_UTILS_H From 3412bc765887d2b2244e4338adc2f49420cce9c8 Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Sun, 21 Jan 2024 07:16:51 +0200 Subject: [PATCH 278/843] [libc++][variant] P2637R3: Member `visit` (`std::variant`) (#76447) Implements parts of: `P2637R3` https://wg21.link/P2637R3 (https://eel.is/c++draft/variant.visit) Implements: `variant.visit()` `variant.visit()` The tests are as close as possible to the non-member function. 
To land after: https://github.com/llvm/llvm-project/pull/76268 --------- Co-authored-by: Zingam --- libcxx/docs/Status/Cxx2cPapers.csv | 2 +- libcxx/include/__config | 5 + libcxx/include/variant | 48 ++- .../robust_against_adl.pass.cpp | 49 +++ .../variant.visit.member/visit.pass.cpp | 258 +++++++++++++ .../visit_return_type.pass.cpp | 345 ++++++++++++++++++ .../generate_feature_test_macro_components.py | 6 +- 7 files changed, 706 insertions(+), 7 deletions(-) create mode 100644 libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp create mode 100644 libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp create mode 100644 libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index bdfdfdd2d53c8..f6124d9a6c4e3 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -17,7 +17,7 @@ "`P0792R14 `__","LWG","``function_ref``: a type-erased callable reference","Varna June 2023","","","" "`P2874R2 `__","LWG","Mandating Annex D Require No More","Varna June 2023","","","" "`P2757R3 `__","LWG","Type-checking format args","Varna June 2023","","","|format|" -"`P2637R3 `__","LWG","Member ``visit``","Varna June 2023","","","|format|" +"`P2637R3 `__","LWG","Member ``visit``","Varna June 2023","|Partial|","18.0","" "`P2641R4 `__","CWG, LWG","Checking if a ``union`` alternative is active","Varna June 2023","","","" "`P1759R6 `__","LWG","Native handles and file streams","Varna June 2023","|Complete|","18.0","" "`P2697R1 `__","LWG","Interfacing ``bitset`` with ``string_view``","Varna June 2023","|Complete|","18.0","" diff --git a/libcxx/include/__config b/libcxx/include/__config index 90a4585938a13..2825d03b7c205 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1513,6 +1513,11 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # define _LIBCPP_DISABLE_UBSAN_UNSIGNED_INTEGER_CHECK # endif +// Clang-18 has support for deducing this, but it does not set the FTM. +# if defined(__cpp_explicit_this_parameter) || (defined(_LIBCPP_CLANG_VER) && _LIBCPP_CLANG_VER >= 1800) +# define _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER +# endif + #endif // __cplusplus #endif // _LIBCPP___CONFIG diff --git a/libcxx/include/variant b/libcxx/include/variant index 6179b2a1a0ab6..ac69645a0fab0 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -69,6 +69,12 @@ namespace std { // 20.7.2.6, swap void swap(variant&) noexcept(see below); + + // [variant.visit], visitation + template + constexpr decltype(auto) visit(this Self&&, Visitor&&); // Since C++26 + template + constexpr R visit(this Self&&, Visitor&&); // Since C++26 }; // 20.7.3, variant helper classes @@ -235,6 +241,7 @@ namespace std { #include <__type_traits/void_t.h> #include <__utility/declval.h> #include <__utility/forward.h> +#include <__utility/forward_like.h> #include <__utility/in_place.h> #include <__utility/move.h> #include <__utility/swap.h> @@ -1130,6 +1137,19 @@ using __best_match_t = typename invoke_result_t<_MakeOverloads<_Types...>, _Tp, } // namespace __variant_detail +template ()))...>> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS constexpr decltype(auto) +visit(_Visitor&& __visitor, _Vs&&... __vs); + +# if _LIBCPP_STD_VER >= 20 +template ()))...>> +_LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS constexpr _Rp +visit(_Visitor&& __visitor, _Vs&&... 
__vs); +# endif + template class _LIBCPP_TEMPLATE_VIS _LIBCPP_DECLSPEC_EMPTY_BASES variant : private __sfinae_ctor_base< __all...>::value, @@ -1273,6 +1293,27 @@ public: __impl_.__swap(__that.__impl_); } +# if _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) + // Helper class to implement [variant.visit]/10 + // Constraints: The call to visit does not use an explicit template-argument-list + // that begins with a type template-argument. + struct __variant_visit_barrier_tag { + _LIBCPP_HIDE_FROM_ABI explicit __variant_visit_barrier_tag() = default; + }; + + template <__variant_visit_barrier_tag = __variant_visit_barrier_tag{}, class _Self, class _Visitor> + _LIBCPP_HIDE_FROM_ABI constexpr decltype(auto) visit(this _Self&& __self, _Visitor&& __visitor) { + using _VariantT = _OverrideRef<_Self&&, _CopyConst, variant>>; + return std::visit(std::forward<_Visitor>(__visitor), (_VariantT)__self); + } + + template + _LIBCPP_HIDE_FROM_ABI constexpr _Rp visit(this _Self&& __self, _Visitor&& __visitor) { + using _VariantT = _OverrideRef<_Self&&, _CopyConst, variant>>; + return std::visit<_Rp>(std::forward<_Visitor>(__visitor), (_VariantT)__self); + } +# endif + private: __variant_detail::__impl<_Types...> __impl_; @@ -1511,7 +1552,7 @@ _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS constexpr vo } } -template < class _Visitor, class... _Vs, typename = void_t()))...> > +template < class _Visitor, class... _Vs, typename> _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS constexpr decltype(auto) visit(_Visitor&& __visitor, _Vs&&... __vs) { using __variant_detail::__visitation::__variant; @@ -1520,10 +1561,7 @@ visit(_Visitor&& __visitor, _Vs&&... __vs) { } # if _LIBCPP_STD_VER >= 20 -template < class _Rp, - class _Visitor, - class... _Vs, - typename = void_t()))...> > +template < class _Rp, class _Visitor, class... _Vs, typename> _LIBCPP_HIDE_FROM_ABI _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS constexpr _Rp visit(_Visitor&& __visitor, _Vs&&... __vs) { using __variant_detail::__visitation::__variant; diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp new file mode 100644 index 0000000000000..c54f2b722d46a --- /dev/null +++ b/libcxx/test/std/utilities/variant/variant.visit.member/robust_against_adl.pass.cpp @@ -0,0 +1,49 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// The tested functionality needs deducing this. 
+// UNSUPPORTED: clang-16 || clang-17 +// XFAIL: apple-clang + +// + +// class variant; +// template +// constexpr decltype(auto) visit(this Self&&, Visitor&&); // since C++26 +// template +// constexpr R visit(this Self&&, Visitor&&); // since C++26 + +#include + +#include "test_macros.h" + +struct Incomplete; +template +struct Holder { + T t; +}; + +constexpr bool test(bool do_it) { + if (do_it) { + std::variant*, int> v = nullptr; + + v.visit([](auto) {}); + v.visit([](auto) -> Holder* { return nullptr; }); + v.visit([](auto) {}); + v.visit([](auto) -> Holder* { return nullptr; }); + } + return true; +} + +int main(int, char**) { + test(true); + static_assert(test(true)); + + return 0; +} diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp new file mode 100644 index 0000000000000..68706d6c32af4 --- /dev/null +++ b/libcxx/test/std/utilities/variant/variant.visit.member/visit.pass.cpp @@ -0,0 +1,258 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// The tested functionality needs deducing this. +// UNSUPPORTED: clang-16 || clang-17 +// XFAIL: apple-clang + +// + +// class variant; + +// template +// constexpr decltype(auto) visit(this Self&&, Visitor&&); // since C++26 + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "variant_test_helpers.h" + +void test_call_operator_forwarding() { + using Fn = ForwardingCallObject; + Fn obj{}; + const Fn& cobj = obj; + + { // test call operator forwarding - single variant, single arg + using V = std::variant; + V v(42); + + v.visit(obj); + assert(Fn::check_call(CT_NonConst | CT_LValue)); + v.visit(cobj); + assert(Fn::check_call(CT_Const | CT_LValue)); + v.visit(std::move(obj)); + assert(Fn::check_call(CT_NonConst | CT_RValue)); + v.visit(std::move(cobj)); + assert(Fn::check_call(CT_Const | CT_RValue)); + } + { // test call operator forwarding - single variant, multi arg + using V = std::variant; + V v(42L); + + v.visit(obj); + assert(Fn::check_call(CT_NonConst | CT_LValue)); + v.visit(cobj); + assert(Fn::check_call(CT_Const | CT_LValue)); + v.visit(std::move(obj)); + assert(Fn::check_call(CT_NonConst | CT_RValue)); + v.visit(std::move(cobj)); + assert(Fn::check_call(CT_Const | CT_RValue)); + } +} + +// Applies to non-member `std::visit` only. 
+void test_argument_forwarding() {
+  using Fn = ForwardingCallObject;
+  Fn obj{};
+  const auto val = CT_LValue | CT_NonConst;
+
+  { // single argument - value type
+    using V = std::variant;
+    V v(42);
+    const V& cv = v;
+
+    v.visit(obj);
+    assert(Fn::check_call(val));
+    cv.visit(obj);
+    assert(Fn::check_call(val));
+    std::move(v).visit(obj);
+    assert(Fn::check_call(val));
+    std::move(cv).visit(obj);
+    assert(Fn::check_call(val));
+  }
+#if !defined(TEST_VARIANT_HAS_NO_REFERENCES)
+  { // single argument - lvalue reference
+    using V = std::variant;
+    int x = 42;
+    V v(x);
+    const V& cv = v;
+
+    v.visit(obj);
+    assert(Fn::check_call(val));
+    cv.visit(obj);
+    assert(Fn::check_call(val));
+    std::move(v).visit(obj);
+    assert(Fn::check_call(val));
+    std::move(cv).visit(obj);
+    assert(Fn::check_call(val));
+  }
+  { // single argument - rvalue reference
+    using V = std::variant;
+    int x = 42;
+    V v(std::move(x));
+    const V& cv = v;
+
+    v.visit(obj);
+    assert(Fn::check_call(val));
+    cv.visit(obj);
+    assert(Fn::check_call(val));
+    std::move(v).visit(obj);
+    assert(Fn::check_call(val));
+    std::move(cv).visit(obj);
+    assert(Fn::check_call(val));
+  }
+#endif
+}
+
+void test_return_type() {
+  using Fn = ForwardingCallObject;
+  Fn obj{};
+  const Fn& cobj = obj;
+
+  { // test call operator forwarding - single variant, single arg
+    using V = std::variant;
+    V v(42);
+
+    static_assert(std::is_same_v);
+    static_assert(std::is_same_v);
+    static_assert(std::is_same_v);
+    static_assert(std::is_same_v);
+  }
+  { // test call operator forwarding - single variant, multi arg
+    using V = std::variant;
+    V v(42L);
+
+    static_assert(std::is_same_v);
+    static_assert(std::is_same_v);
+    static_assert(std::is_same_v);
+    static_assert(std::is_same_v);
+  }
+}
+
+void test_constexpr() {
+  constexpr ReturnFirst obj{};
+
+  {
+    using V = std::variant;
+    constexpr V v(42);
+
+    static_assert(v.visit(obj) == 42);
+  }
+  {
+    using V = std::variant;
+    constexpr V v(42L);
+
+    static_assert(v.visit(obj) == 42);
+  }
+}
+
+void test_exceptions() {
+#ifndef TEST_HAS_NO_EXCEPTIONS
+  ReturnArity obj{};
+
+  auto test = [&](auto&& v) {
+    try {
+      v.visit(obj);
+    } catch (const std::bad_variant_access&) {
+      return true;
+    } catch (...) {
+    }
+    return false;
+  };
+
+  {
+    using V = std::variant;
+    V v;
+    makeEmpty(v);
+
+    assert(test(v));
+  }
+#endif
+}
+
+// See https://llvm.org/PR31916
+void test_caller_accepts_nonconst() {
+  struct A {};
+  struct Visitor {
+    void operator()(A&) {}
+  };
+  std::variant v;
+
+  v.visit(Visitor{});
+}
+
+struct MyVariant : std::variant {};
+
+namespace std {
+template
+void get(const MyVariant&) {
+  assert(false);
+}
+} // namespace std
+
+void test_derived_from_variant() {
+  auto v1        = MyVariant{42};
+  const auto cv1 = MyVariant{142};
+
+  v1.visit([](auto x) { assert(x == 42); });
+  cv1.visit([](auto x) { assert(x == 142); });
+  MyVariant{-1.25f}.visit([](auto x) { assert(x == -1.25f); });
+  std::move(v1).visit([](auto x) { assert(x == 42); });
+  std::move(cv1).visit([](auto x) { assert(x == 142); });
+
+  // Check that visit does not take index nor valueless_by_exception members from the base class.
+  struct EvilVariantBase {
+    int index;
+    char valueless_by_exception;
+  };
+
+  struct EvilVariant1 : std::variant, std::tuple, EvilVariantBase {
+    using std::variant::variant;
+  };
+
+  EvilVariant1{12}.visit([](auto x) { assert(x == 12); });
+  EvilVariant1{12.3}.visit([](auto x) { assert(x == 12.3); });
+
+  // Check that visit unambiguously picks the variant, even if the other base has __impl member.
+ struct ImplVariantBase { + struct Callable { + bool operator()() const { + assert(false); + return false; + } + }; + + Callable __impl; + }; + + struct EvilVariant2 : std::variant, ImplVariantBase { + using std::variant::variant; + }; + + EvilVariant2{12}.visit([](auto x) { assert(x == 12); }); + EvilVariant2{12.3}.visit([](auto x) { assert(x == 12.3); }); +} + +int main(int, char**) { + test_call_operator_forwarding(); + test_argument_forwarding(); + test_return_type(); + test_constexpr(); + test_exceptions(); + test_caller_accepts_nonconst(); + test_derived_from_variant(); + + return 0; +} diff --git a/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp b/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp new file mode 100644 index 0000000000000..20472c62fc5f9 --- /dev/null +++ b/libcxx/test/std/utilities/variant/variant.visit.member/visit_return_type.pass.cpp @@ -0,0 +1,345 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// The tested functionality needs deducing this. +// UNSUPPORTED: clang-16 || clang-17 +// XFAIL: apple-clang + +// + +// class variant; + +// template +// constexpr R visit(this Self&&, Visitor&&); // since C++26 + +#include +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include "variant_test_helpers.h" + +template +struct overloaded : Ts... { + using Ts::operator()...; +}; + +void test_overload_ambiguity() { + using V = std::variant; + using namespace std::string_literals; + V v{"baba"s}; + + v.visit( + overloaded{[]([[maybe_unused]] auto x) { assert(false); }, [](const std::string& x) { assert(x == "baba"s); }}); + assert(std::get(v) == "baba"s); + + // Test the constraint. 
+ v = std::move(v).visit(overloaded{ + []([[maybe_unused]] auto x) { + assert(false); + return 0; + }, + [](const std::string& x) { + assert(x == "baba"s); + return x + " zmt"s; + }}); + assert(std::get(v) == "baba zmt"s); +} + +template +void test_call_operator_forwarding() { + using Fn = ForwardingCallObject; + Fn obj{}; + const Fn& cobj = obj; + + { // test call operator forwarding - single variant, single arg + using V = std::variant; + V v(42); + + v.visit(obj); + assert(Fn::check_call(CT_NonConst | CT_LValue)); + v.visit(cobj); + assert(Fn::check_call(CT_Const | CT_LValue)); + v.visit(std::move(obj)); + assert(Fn::check_call(CT_NonConst | CT_RValue)); + v.visit(std::move(cobj)); + assert(Fn::check_call(CT_Const | CT_RValue)); + } + { // test call operator forwarding - single variant, multi arg + using V = std::variant; + V v(42L); + + v.visit(obj); + assert(Fn::check_call(CT_NonConst | CT_LValue)); + v.visit(cobj); + assert(Fn::check_call(CT_Const | CT_LValue)); + v.visit(std::move(obj)); + assert(Fn::check_call(CT_NonConst | CT_RValue)); + v.visit(std::move(cobj)); + assert(Fn::check_call(CT_Const | CT_RValue)); + } +} + +template +void test_argument_forwarding() { + using Fn = ForwardingCallObject; + Fn obj{}; + const auto val = CT_LValue | CT_NonConst; + + { // single argument - value type + using V = std::variant; + V v(42); + const V& cv = v; + + v.visit(obj); + assert(Fn::check_call(val)); + cv.visit(obj); + assert(Fn::check_call(val)); + std::move(v).visit(obj); + assert(Fn::check_call(val)); + std::move(cv).visit(obj); + assert(Fn::check_call(val)); + } +#if !defined(TEST_VARIANT_HAS_NO_REFERENCES) + { // single argument - lvalue reference + using V = std::variant; + int x = 42; + V v(x); + const V& cv = v; + + v.visit(obj); + assert(Fn::check_call(val)); + cv.visit(obj); + assert(Fn::check_call(val)); + std::move(v).visit(obj); + assert(Fn::check_call(val)); + std::move(cv).visit(obj); + assert(Fn::check_call(val)); + } + { // single argument - rvalue reference + using V = std::variant; + int x = 42; + V v(std::move(x)); + const V& cv = v; + + v.visit(obj); + assert(Fn::check_call(val)); + cv.visit(obj); + assert(Fn::check_call(val)); + std::move(v).visit(obj); + assert(Fn::check_call(val)); + std::move(cv).visit(obj); + assert(Fn::check_call(val)); + } +#endif +} + +template +void test_return_type() { + using Fn = ForwardingCallObject; + Fn obj{}; + const Fn& cobj = obj; + + { // test call operator forwarding - no variant + // non-member + { + static_assert(std::is_same_v(obj)), ReturnType>); + static_assert(std::is_same_v(cobj)), ReturnType>); + static_assert(std::is_same_v(std::move(obj))), ReturnType>); + static_assert(std::is_same_v(std::move(cobj))), ReturnType>); + } + } + { // test call operator forwarding - single variant, single arg + using V = std::variant; + V v(42); + + static_assert(std::is_same_v(obj)), ReturnType>); + static_assert(std::is_same_v(cobj)), ReturnType>); + static_assert(std::is_same_v(std::move(obj))), ReturnType>); + static_assert(std::is_same_v(std::move(cobj))), ReturnType>); + } + { // test call operator forwarding - single variant, multi arg + using V = std::variant; + V v(42L); + + static_assert(std::is_same_v(obj)), ReturnType>); + static_assert(std::is_same_v(cobj)), ReturnType>); + static_assert(std::is_same_v(std::move(obj))), ReturnType>); + static_assert(std::is_same_v(std::move(cobj))), ReturnType>); + } +} + +void test_constexpr_void() { + constexpr ReturnFirst obj{}; + + { + using V = std::variant; + constexpr V v(42); + + 
static_assert((v.visit(obj), 42) == 42); + } + { + using V = std::variant; + constexpr V v(42L); + + static_assert((v.visit(obj), 42) == 42); + } +} + +void test_constexpr_int() { + constexpr ReturnFirst obj{}; + + { + using V = std::variant; + constexpr V v(42); + + static_assert(v.visit(obj) == 42); + } + { + using V = std::variant; + constexpr V v(42L); + + static_assert(v.visit(obj) == 42); + } +} + +template +void test_exceptions() { +#ifndef TEST_HAS_NO_EXCEPTIONS + ReturnArity obj{}; + + auto test = [&](auto&& v) { + try { + v.template visit(obj); + } catch (const std::bad_variant_access&) { + return true; + } catch (...) { + } + return false; + }; + + { + using V = std::variant; + V v; + makeEmpty(v); + + assert(test(v)); + } +#endif +} + +// See https://bugs.llvm.org/show_bug.cgi?id=31916 +template +void test_caller_accepts_nonconst() { + struct A {}; + struct Visitor { + auto operator()(A&) { + if constexpr (!std::is_void_v) { + return ReturnType{}; + } + } + }; + std::variant v; + + v.template visit(Visitor{}); +} + +void test_constexpr_explicit_side_effect() { + auto test_lambda = [](int arg) constexpr { + std::variant v = 101; + + { + v.template visit([arg](int& x) constexpr { x = arg; }); + } + + return std::get(v); + }; + + static_assert(test_lambda(202) == 202); +} + +void test_derived_from_variant() { + struct MyVariant : std::variant {}; + + MyVariant{42}.template visit([](auto x) { + assert(x == 42); + return true; + }); + MyVariant{-1.3f}.template visit([](auto x) { + assert(x == -1.3f); + return true; + }); + + // Check that visit does not take index nor valueless_by_exception members from the base class. + struct EvilVariantBase { + int index; + char valueless_by_exception; + }; + + struct EvilVariant1 : std::variant, std::tuple, EvilVariantBase { + using std::variant::variant; + }; + + EvilVariant1{12}.template visit([](auto x) { + assert(x == 12); + return true; + }); + EvilVariant1{12.3}.template visit([](auto x) { + assert(x == 12.3); + return true; + }); + + // Check that visit unambiguously picks the variant, even if the other base has __impl member. 
+ struct ImplVariantBase { + struct Callable { + bool operator()() const { + assert(false); + return false; + } + }; + + Callable __impl; + }; + + struct EvilVariant2 : std::variant, ImplVariantBase { + using std::variant::variant; + }; + + EvilVariant2{12}.template visit([](auto x) { + assert(x == 12); + return true; + }); + EvilVariant2{12.3}.template visit([](auto x) { + assert(x == 12.3); + return true; + }); +} + +int main(int, char**) { + test_overload_ambiguity(); + test_call_operator_forwarding(); + test_argument_forwarding(); + test_return_type(); + test_constexpr_void(); + test_exceptions(); + test_caller_accepts_nonconst(); + test_call_operator_forwarding(); + test_argument_forwarding(); + test_return_type(); + test_constexpr_int(); + test_exceptions(); + test_caller_accepts_nonconst(); + test_constexpr_explicit_side_effect(); + test_derived_from_variant(); + + return 0; +} diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 958c34edde005..a5df187b046f6 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -1267,7 +1267,11 @@ def add_version_header(tc): }, { "name": "__cpp_lib_variant", - "values": {"c++17": 202102}, + "values": { + "c++17": 202102, # std::visit for classes derived from std::variant + # "c++20": 202106, # Fully constexpr std::variant + # "c++26": 202306, # Member visit (implemented) + }, "headers": ["variant"], }, { From f9614b328ad502cecfeb0d923f7abda30173d4be Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 20 Jan 2024 22:16:29 -0800 Subject: [PATCH 279/843] [AArch64] Improve nonlazybind test Prepare for -fno-plt implementation. --- llvm/test/CodeGen/AArch64/nonlazybind.ll | 118 +++++++++++++++++------ 1 file changed, 90 insertions(+), 28 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/nonlazybind.ll b/llvm/test/CodeGen/AArch64/nonlazybind.ll index 7f5701495ef92..faf4a1bb84037 100644 --- a/llvm/test/CodeGen/AArch64/nonlazybind.ll +++ b/llvm/test/CodeGen/AArch64/nonlazybind.ll @@ -1,40 +1,102 @@ -; RUN: llc -mtriple=aarch64-apple-ios %s -o - -aarch64-enable-nonlazybind | FileCheck %s -; RUN: llc -mtriple=aarch64-apple-ios %s -o - | FileCheck %s --check-prefix=CHECK-NORMAL +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64-apple-ios %s -o - -aarch64-enable-nonlazybind | FileCheck %s --check-prefix=MACHO +; RUN: llc -mtriple=aarch64-apple-ios %s -o - | FileCheck %s --check-prefix=MACHO-NORMAL +; RUN: llc -mtriple=aarch64 -fast-isel %s -o - | FileCheck %s --check-prefixes=ELF,ELF-FI +; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=ELF,ELF-GI +; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=ELF,ELF-SDAG -define void @local() nonlazybind { +define dso_preemptable void @preemptable() nonlazybind { +; MACHO-LABEL: preemptable: +; MACHO: ; %bb.0: +; MACHO-NEXT: ret +; +; MACHO-NORMAL-LABEL: preemptable: +; MACHO-NORMAL: ; %bb.0: +; MACHO-NORMAL-NEXT: ret +; +; ELF-LABEL: preemptable: +; ELF: // %bb.0: +; ELF-NEXT: ret ret void } -declare void @nonlocal() nonlazybind - -define void @test_laziness() { -; CHECK-LABEL: test_laziness: - -; CHECK: bl _local - -; CHECK: adrp x[[TMP:[0-9]+]], _nonlocal@GOTPAGE -; CHECK: ldr [[FUNC:x[0-9]+]], [x[[TMP]], _nonlocal@GOTPAGEOFF] -; CHECK: blr [[FUNC]] +define dso_local void @local() nonlazybind { +; MACHO-LABEL: local: +; MACHO: ; %bb.0: 
+; MACHO-NEXT: ret +; +; MACHO-NORMAL-LABEL: local: +; MACHO-NORMAL: ; %bb.0: +; MACHO-NORMAL-NEXT: ret +; +; ELF-LABEL: local: +; ELF: // %bb.0: +; ELF-NEXT: ret + ret void +} -; CHECK-NORMAL-LABEL: test_laziness: -; CHECK-NORMAL: bl _local -; CHECK-NORMAL: bl _nonlocal +declare void @external() nonlazybind +define void @test_laziness() nounwind { +; +; MACHO-LABEL: test_laziness: +; MACHO: ; %bb.0: +; MACHO-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; MACHO-NEXT: bl _preemptable +; MACHO-NEXT: bl _local +; MACHO-NEXT: Lloh0: +; MACHO-NEXT: adrp x8, _external@GOTPAGE +; MACHO-NEXT: Lloh1: +; MACHO-NEXT: ldr x8, [x8, _external@GOTPAGEOFF] +; MACHO-NEXT: blr x8 +; MACHO-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; MACHO-NEXT: ret +; MACHO-NEXT: .loh AdrpLdrGot Lloh0, Lloh1 +; +; MACHO-NORMAL-LABEL: test_laziness: +; MACHO-NORMAL: ; %bb.0: +; MACHO-NORMAL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill +; MACHO-NORMAL-NEXT: bl _preemptable +; MACHO-NORMAL-NEXT: bl _local +; MACHO-NORMAL-NEXT: bl _external +; MACHO-NORMAL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; MACHO-NORMAL-NEXT: ret +; +; ELF-LABEL: test_laziness: +; ELF: // %bb.0: +; ELF-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; ELF-NEXT: bl preemptable +; ELF-NEXT: bl local +; ELF-NEXT: bl external +; ELF-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; ELF-NEXT: ret + call void @preemptable() call void @local() - call void @nonlocal() + call void @external() ret void } -define void @test_laziness_tail() { -; CHECK-LABEL: test_laziness_tail: - -; CHECK: adrp x[[TMP:[0-9]+]], _nonlocal@GOTPAGE -; CHECK: ldr [[FUNC:x[0-9]+]], [x[[TMP]], _nonlocal@GOTPAGEOFF] -; CHECK: br [[FUNC]] - -; CHECK-NORMAL-LABEL: test_laziness_tail: -; CHECK-NORMAL: b _nonlocal - - tail call void @nonlocal() +define void @test_laziness_tail() nounwind { +; MACHO-LABEL: test_laziness_tail: +; MACHO: ; %bb.0: +; MACHO-NEXT: Lloh2: +; MACHO-NEXT: adrp x0, _external@GOTPAGE +; MACHO-NEXT: Lloh3: +; MACHO-NEXT: ldr x0, [x0, _external@GOTPAGEOFF] +; MACHO-NEXT: br x0 +; MACHO-NEXT: .loh AdrpLdrGot Lloh2, Lloh3 +; +; MACHO-NORMAL-LABEL: test_laziness_tail: +; MACHO-NORMAL: ; %bb.0: +; MACHO-NORMAL-NEXT: b _external +; +; ELF-LABEL: test_laziness_tail: +; ELF: // %bb.0: +; ELF-NEXT: b external + tail call void @external() ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line:
+; ELF-FI: {{.*}}
+; ELF-GI: {{.*}}
+; ELF-SDAG: {{.*}}

From aa04d2b78bbbfef213d860876948b934ab18b1aa Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sun, 21 Jan 2024 06:40:05 +0000
Subject: [PATCH 280/843] Add workflow to release mlgo utils

---
 .github/workflows/release-mlgo-utils.yml | 33 ++++++++++++++++++++++++
 1 file changed, 33 insertions(+)
 create mode 100644 .github/workflows/release-mlgo-utils.yml

diff --git a/.github/workflows/release-mlgo-utils.yml b/.github/workflows/release-mlgo-utils.yml
new file mode 100644
index 0000000000000..dcbbe83e56118
--- /dev/null
+++ b/.github/workflows/release-mlgo-utils.yml
@@ -0,0 +1,33 @@
+name: Release mlgo-utils
+
+permissions:
+  contents: read
+
+on:
+  push:
+  workflow_dispatch:
+    inputs:
+      branch:
+        description: 'Branch'
+        required: true
+        type: string
+
+jobs:
+  release-mlgo-utils:
+    name: Release mlgo-utils
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout LLVM
+        uses: actions/checkout@v4.1.1
+        with:
+          ref: "${{ inputs.branch }}"
+      - name: Package mlgo-utils
+        run: |
+          cd llvm/utils/mlgo-utils
+          python3 -m build
+      - name: Upload mlgo-utils to test.pypi.org
+        uses: pypa/gh-action-pypi-publish@release/v1
+        with:
+          password: ${{ secrets.TEST_PYPI_TOKEN }}
+          repository-url: https://test.pypi.org/legacy/
+          packages-dir: llvm/utils/mlgo-utils/dist/

From 61d098962e1b304224fb5309b94162a3e43852bb Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Sun, 21 Jan 2024 06:43:01 +0000
Subject: [PATCH 281/843] Revert "Add workflow to release mlgo utils"

This reverts commit aa04d2b78bbbfef213d860876948b934ab18b1aa.

Meant to push this to my fork and did not realize I was on main and not
a separate branch.

---
 .github/workflows/release-mlgo-utils.yml | 33 ------------------------
 1 file changed, 33 deletions(-)
 delete mode 100644 .github/workflows/release-mlgo-utils.yml

diff --git a/.github/workflows/release-mlgo-utils.yml b/.github/workflows/release-mlgo-utils.yml
deleted file mode 100644
index dcbbe83e56118..0000000000000
--- a/.github/workflows/release-mlgo-utils.yml
+++ /dev/null
@@ -1,33 +0,0 @@
-name: Release mlgo-utils
-
-permissions:
-  contents: read
-
-on:
-  push:
-  workflow_dispatch:
-    inputs:
-      branch:
-        description: 'Branch'
-        required: true
-        type: string
-
-jobs:
-  release-mlgo-utils:
-    name: Release mlgo-utils
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout LLVM
-        uses: actions/checkout@v4.1.1
-        with:
-          ref: "${{ inputs.branch }}"
-      - name: Package mlgo-utils
-        run: |
-          cd llvm/utils/mlgo-utils
-          python3 -m build
-      - name: Upload mlgo-utils to test.pypi.org
-        uses: pypa/gh-action-pypi-publish@release/v1
-        with:
-          password: ${{ secrets.TEST_PYPI_TOKEN }}
-          repository-url: https://test.pypi.org/legacy/
-          packages-dir: llvm/utils/mlgo-utils/dist/

From f0c920ffb7e98adbad369c33c01e0996260c4ade Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Sat, 20 Jan 2024 23:26:48 -0800
Subject: [PATCH 282/843] [libc++][hardening] XFAIL tests with HWASAN (#78866)

Follow up to #78862

These tests control the hardening mode with `ADDITIONAL_COMPILE_FLAGS`, and
always set modes which use TRAP. So we don't need to check
`libcpp-hardening-mode=fast`, and they must always fail with the current
HWASAN implementation.
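For context, a rough sketch of the mechanism these tests depend on (the real
macros are more involved; names here are illustrative):

```cpp
// Simplified stand-in for a hardening check in the fast/extensive modes: a
// failed assertion lowers to a trap instruction rather than calling abort().
inline void hardening_check(bool ok) {
  if (!ok)
    __builtin_trap(); // the tests expect SIGTRAP/SIGILL here; under HWASAN
                      // the trap is reported as an abort/error exit instead,
                      // so trap-expecting tests always fail
}

int main() {
  hardening_check(1 + 1 == 2); // passes
  hardening_check(false);      // dies with a trap (or an HWASAN error report)
}
```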
--- .../modes/enabling_assertions_enables_extensive_mode.pass.cpp | 2 +- .../assertions/modes/override_with_extensive_mode.pass.cpp | 2 +- .../libcxx/assertions/modes/override_with_fast_mode.pass.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp b/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp index 23ef20d4e0781..11e8ae228f8c8 100644 --- a/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp +++ b/libcxx/test/libcxx/assertions/modes/enabling_assertions_enables_extensive_mode.pass.cpp @@ -16,7 +16,7 @@ // debug mode). // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // HWASAN replaces TRAP with abort or error exit code. -// XFAIL: libcpp-hardening-mode=fast && hwasan +// XFAIL: hwasan // Note that GCC doesn't support `-Wno-macro-redefined`. // ADDITIONAL_COMPILE_FLAGS: -U_LIBCPP_HARDENING_MODE -D_LIBCPP_ENABLE_ASSERTIONS=1 diff --git a/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp b/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp index c3cdfa926c6cc..cf662e9eadf7a 100644 --- a/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp +++ b/libcxx/test/libcxx/assertions/modes/override_with_extensive_mode.pass.cpp @@ -14,7 +14,7 @@ // debug mode). // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // HWASAN replaces TRAP with abort or error exit code. -// XFAIL: libcpp-hardening-mode=fast && hwasan +// XFAIL: hwasan // ADDITIONAL_COMPILE_FLAGS: -U_LIBCPP_HARDENING_MODE -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_EXTENSIVE #include diff --git a/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp b/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp index 854bf6c5da9cd..0989ad39a78ea 100644 --- a/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp +++ b/libcxx/test/libcxx/assertions/modes/override_with_fast_mode.pass.cpp @@ -14,7 +14,7 @@ // debug mode). // XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing // HWASAN replaces TRAP with abort or error exit code. -// XFAIL: libcpp-hardening-mode=fast && hwasan +// XFAIL: hwasan // ADDITIONAL_COMPILE_FLAGS: -U_LIBCPP_HARDENING_MODE -D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST #include From dc57752031fb14166dff2174b36c28d27d742382 Mon Sep 17 00:00:00 2001 From: Konstantin Varlamov Date: Sat, 20 Jan 2024 23:38:02 -0800 Subject: [PATCH 283/843] [libc++][hardening] Categorize assertions that produce incorrect results (#77183) Introduce a new `argument-within-domain` category that covers cases where the given arguments make it impossible to produce a correct result (or create a valid object in case of constructors). While the incorrect result doesn't create an immediate problem within the library (like e.g. a null pointer dereference would), it always indicates a logic error in user code and is highly likely to lead to a bug in the program once the value is used. 
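For illustration (not part of the patch), a call that falls into this
category:

```cpp
#include <algorithm>
#include <cassert>

int main() {
  // In-domain call: lo <= hi, so the result is a meaningful clamp.
  assert(std::clamp(5, 0, 3) == 3);

  // Domain violation: the bounds are swapped (hi < lo). Nothing crashes
  // inside the library, but the returned value cannot be a correct clamp;
  // the new _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN check catches this in
  // hardened builds instead of letting the bogus value escape to user code.
  int bogus = std::clamp(5, 3, 0); // precondition violation: no valid result
  (void)bogus;
}
```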
--- libcxx/include/__algorithm/clamp.h | 2 +- libcxx/include/__algorithm/ranges_clamp.h | 5 +++-- libcxx/include/__bit/bit_ceil.h | 2 +- libcxx/include/__config | 13 ++++++++++++- libcxx/include/__hash_table | 2 +- libcxx/include/__memory/assume_aligned.h | 3 ++- libcxx/include/__numeric/gcd_lcm.h | 2 +- libcxx/include/barrier | 12 ++++++------ libcxx/include/latch | 10 +++++----- libcxx/include/semaphore | 8 ++++---- libcxx/include/string_view | 7 +++++-- libcxx/src/filesystem/operations.cpp | 5 ++--- libcxx/src/include/to_chars_floating_point.h | 2 +- 13 files changed, 44 insertions(+), 29 deletions(-) diff --git a/libcxx/include/__algorithm/clamp.h b/libcxx/include/__algorithm/clamp.h index 1631b2673c3fa..003bf70dd4f01 100644 --- a/libcxx/include/__algorithm/clamp.h +++ b/libcxx/include/__algorithm/clamp.h @@ -26,7 +26,7 @@ clamp(_LIBCPP_LIFETIMEBOUND const _Tp& __v, _LIBCPP_LIFETIMEBOUND const _Tp& __lo, _LIBCPP_LIFETIMEBOUND const _Tp& __hi, _Compare __comp) { - _LIBCPP_ASSERT_UNCATEGORIZED(!__comp(__hi, __lo), "Bad bounds passed to std::clamp"); + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(!__comp(__hi, __lo), "Bad bounds passed to std::clamp"); return __comp(__v, __lo) ? __lo : __comp(__hi, __v) ? __hi : __v; } diff --git a/libcxx/include/__algorithm/ranges_clamp.h b/libcxx/include/__algorithm/ranges_clamp.h index e6c86207254a1..a1185e7278f0e 100644 --- a/libcxx/include/__algorithm/ranges_clamp.h +++ b/libcxx/include/__algorithm/ranges_clamp.h @@ -34,8 +34,9 @@ struct __fn { indirect_strict_weak_order> _Comp = ranges::less> _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr const _Type& operator()( const _Type& __value, const _Type& __low, const _Type& __high, _Comp __comp = {}, _Proj __proj = {}) const { - _LIBCPP_ASSERT_UNCATEGORIZED(!bool(std::invoke(__comp, std::invoke(__proj, __high), std::invoke(__proj, __low))), - "Bad bounds passed to std::ranges::clamp"); + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( + !bool(std::invoke(__comp, std::invoke(__proj, __high), std::invoke(__proj, __low))), + "Bad bounds passed to std::ranges::clamp"); auto&& __projected = std::invoke(__proj, __value); if (std::invoke(__comp, std::forward(__projected), std::invoke(__proj, __low))) diff --git a/libcxx/include/__bit/bit_ceil.h b/libcxx/include/__bit/bit_ceil.h index 17fe06aa41ccd..77fa739503bc5 100644 --- a/libcxx/include/__bit/bit_ceil.h +++ b/libcxx/include/__bit/bit_ceil.h @@ -28,7 +28,7 @@ _LIBCPP_NODISCARD_EXT _LIBCPP_HIDE_FROM_ABI constexpr _Tp __bit_ceil(_Tp __t) no if (__t < 2) return 1; const unsigned __n = numeric_limits<_Tp>::digits - std::__countl_zero((_Tp)(__t - 1u)); - _LIBCPP_ASSERT_UNCATEGORIZED(__n != numeric_limits<_Tp>::digits, "Bad input to bit_ceil"); + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(__n != numeric_limits<_Tp>::digits, "Bad input to bit_ceil"); if constexpr (sizeof(_Tp) >= sizeof(unsigned)) return _Tp{1} << __n; diff --git a/libcxx/include/__config b/libcxx/include/__config index 2825d03b7c205..33a79b9a79f62 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -294,6 +294,13 @@ // - `_LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR` -- checks any operations that exchange nodes between containers to make sure // the containers have compatible allocators. // +// - `_LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN` -- checks that the given argument is within the domain of valid arguments +// for the function. Violating this typically produces an incorrect result (e.g. 
the clamp algorithm returns the +// original value without clamping it due to incorrect functors) or puts an object into an invalid state (e.g. +// a string view where only a subset of elements is possible to access). This category is for assertions violating +// which doesn't cause any immediate issues in the library -- whatever the consequences are, they will happen in the +// user code. +// // - `_LIBCPP_ASSERT_PEDANTIC` -- checks prerequisites which are imposed by the Standard, but violating which happens to // be benign in our implementation. // @@ -338,6 +345,7 @@ _LIBCPP_HARDENING_MODE_DEBUG // Overlapping ranges will make algorithms produce incorrect results but don't directly lead to a security // vulnerability. # define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSUME(expression) +# define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSUME(expression) @@ -353,8 +361,9 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ASSERT_NON_NULL(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSERT(expression, message) -# define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSERT(expression, message) // Disabled checks. 
# define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSUME(expression) @@ -368,6 +377,7 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ASSERT_NON_NULL(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSERT(expression, message) @@ -382,6 +392,7 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ASSERT_NON_NULL(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSUME(expression) +# define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSUME(expression) diff --git a/libcxx/include/__hash_table b/libcxx/include/__hash_table index 4ca49fe42606c..13420012006e2 100644 --- a/libcxx/include/__hash_table +++ b/libcxx/include/__hash_table @@ -861,7 +861,7 @@ public: template _LIBCPP_HIDE_FROM_ABI size_type bucket(const _Key& __k) const { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( bucket_count() > 0, "unordered container::bucket(key) called when bucket_count() == 0"); return std::__constrain_hash(hash_function()(__k), bucket_count()); } diff --git a/libcxx/include/__memory/assume_aligned.h b/libcxx/include/__memory/assume_aligned.h index c66fb49ebb3c0..5036df7f0f5d5 100644 --- a/libcxx/include/__memory/assume_aligned.h +++ b/libcxx/include/__memory/assume_aligned.h @@ -29,7 +29,8 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __ass if (__libcpp_is_constant_evaluated()) { return __ptr; } else { - _LIBCPP_ASSERT_UNCATEGORIZED(reinterpret_cast(__ptr) % _Np == 0, "Alignment assumption is violated"); + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( + reinterpret_cast(__ptr) % _Np == 0, "Alignment assumption is violated"); return static_cast<_Tp*>(__builtin_assume_aligned(__ptr, _Np)); } } diff --git a/libcxx/include/__numeric/gcd_lcm.h b/libcxx/include/__numeric/gcd_lcm.h index 3e9c244f25c28..48df2338051e2 100644 --- a/libcxx/include/__numeric/gcd_lcm.h +++ b/libcxx/include/__numeric/gcd_lcm.h @@ -77,7 +77,7 @@ _LIBCPP_CONSTEXPR _LIBCPP_HIDE_FROM_ABI common_type_t<_Tp, _Up> lcm(_Tp __m, _Up using _Rp = common_type_t<_Tp, _Up>; _Rp __val1 = __ct_abs<_Rp, _Tp>()(__m) / std::gcd(__m, __n); _Rp __val2 = __ct_abs<_Rp, _Up>()(__n); - _LIBCPP_ASSERT_UNCATEGORIZED((numeric_limits<_Rp>::max() / __val1 > __val2), "Overflow in lcm"); + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN((numeric_limits<_Rp>::max() / __val1 > __val2), "Overflow in lcm"); return __val1 * __val2; } diff --git a/libcxx/include/barrier b/libcxx/include/barrier index 9e7150fec1480..f91452c8d0064 100644 --- a/libcxx/include/barrier +++ b/libcxx/include/barrier @@ -129,7 +129,7 @@ public: 
__completion_(std::move(__completion)), __phase_(0) {} [[__nodiscard__]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t __update) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( __update <= __expected_, "update is greater than the expected count for the current barrier phase"); auto const __old_phase = __phase_.load(memory_order_relaxed); @@ -187,7 +187,7 @@ public: auto const __result = __arrived.fetch_sub(update, memory_order_acq_rel) - update; auto const new_expected = __expected.load(memory_order_relaxed); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( update <= new_expected, "update is greater than the expected count for the current barrier phase"); if (0 == __result) { @@ -232,7 +232,7 @@ public: auto const __inc = __arrived_unit * update; auto const __old = __phase_arrived_expected.fetch_add(__inc, memory_order_acq_rel); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( update <= __old, "update is greater than the expected count for the current barrier phase"); if ((__old ^ (__old + __inc)) & __phase_bit) { @@ -268,10 +268,10 @@ public: _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI explicit barrier( ptrdiff_t __count, _CompletionF __completion = _CompletionF()) : __b_(__count, std::move(__completion)) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( __count >= 0, "barrier::barrier(ptrdiff_t, CompletionFunction): barrier cannot be initialized with a negative value"); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( __count <= max(), "barrier::barrier(ptrdiff_t, CompletionFunction): barrier cannot be initialized with " "a value greater than max()"); @@ -281,7 +281,7 @@ public: barrier& operator=(barrier const&) = delete; [[__nodiscard__]] _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI arrival_token arrive(ptrdiff_t __update = 1) { - _LIBCPP_ASSERT_UNCATEGORIZED(__update > 0, "barrier:arrive must be called with a value greater than 0"); + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(__update > 0, "barrier:arrive must be called with a value greater than 0"); return __b_.arrive(__update); } _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void wait(arrival_token&& __phase) const { diff --git a/libcxx/include/latch b/libcxx/include/latch index 742452517ddc5..ad7b35579913b 100644 --- a/libcxx/include/latch +++ b/libcxx/include/latch @@ -73,11 +73,11 @@ public: static _LIBCPP_HIDE_FROM_ABI constexpr ptrdiff_t max() noexcept { return numeric_limits::max(); } inline _LIBCPP_HIDE_FROM_ABI constexpr explicit latch(ptrdiff_t __expected) : __a_(__expected) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( __expected >= 0, "latch::latch(ptrdiff_t): latch cannot be " "initialized with a negative value"); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( __expected <= max(), "latch::latch(ptrdiff_t): latch cannot be " "initialized with a value greater than max()"); @@ -88,9 +88,9 @@ public: latch& operator=(const latch&) = delete; inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void count_down(ptrdiff_t __update = 1) { - _LIBCPP_ASSERT_UNCATEGORIZED(__update >= 0, "latch::count_down called with a negative value"); + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(__update >= 0, "latch::count_down called with a negative value"); auto const __old = __a_.fetch_sub(__update, memory_order_release); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( __update <= __old, "latch::count_down 
called with a value greater " "than the internal counter"); @@ -102,7 +102,7 @@ public: __cxx_atomic_wait(&__a_.__a_, [this]() -> bool { return try_wait(); }); } inline _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void arrive_and_wait(ptrdiff_t __update = 1) { - _LIBCPP_ASSERT_UNCATEGORIZED(__update >= 0, "latch::arrive_and_wait called with a negative value"); + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(__update >= 0, "latch::arrive_and_wait called with a negative value"); // other preconditions on __update are checked in count_down() count_down(__update); diff --git a/libcxx/include/semaphore b/libcxx/include/semaphore index ca5b2a312433c..ac3d2d7fe02e8 100644 --- a/libcxx/include/semaphore +++ b/libcxx/include/semaphore @@ -92,7 +92,7 @@ public: _LIBCPP_HIDE_FROM_ABI constexpr explicit __atomic_semaphore_base(ptrdiff_t __count) : __a_(__count) {} _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void release(ptrdiff_t __update = 1) { auto __old = __a_.fetch_add(__update, memory_order_release); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( __update <= _LIBCPP_SEMAPHORE_MAX - __old, "update is greater than the expected value"); if (__old > 0) { @@ -138,11 +138,11 @@ public: static constexpr ptrdiff_t max() noexcept { return __least_max_value; } _LIBCPP_HIDE_FROM_ABI constexpr explicit counting_semaphore(ptrdiff_t __count) : __semaphore_(__count) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( __count >= 0, "counting_semaphore::counting_semaphore(ptrdiff_t): counting_semaphore cannot be " "initialized with a negative value"); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( __count <= max(), "counting_semaphore::counting_semaphore(ptrdiff_t): counting_semaphore cannot be " "initialized with a value greater than max()"); @@ -153,7 +153,7 @@ public: counting_semaphore& operator=(const counting_semaphore&) = delete; _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void release(ptrdiff_t __update = 1) { - _LIBCPP_ASSERT_UNCATEGORIZED(__update >= 0, "counting_semaphore:release called with a negative value"); + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(__update >= 0, "counting_semaphore:release called with a negative value"); __semaphore_.release(__update); } _LIBCPP_AVAILABILITY_SYNC _LIBCPP_HIDE_FROM_ABI void acquire() { __semaphore_.acquire(); } diff --git a/libcxx/include/string_view b/libcxx/include/string_view index 909224fe7e3d0..e414507a7933b 100644 --- a/libcxx/include/string_view +++ b/libcxx/include/string_view @@ -307,8 +307,11 @@ public: : __data_(__s), __size_(__len) { #if _LIBCPP_STD_VER >= 14 - _LIBCPP_ASSERT_UNCATEGORIZED(__len <= static_cast(numeric_limits::max()), - "string_view::string_view(_CharT *, size_t): length does not fit in difference_type"); + // This will result in creating an invalid `string_view` object -- some calculations involving `size` would + // overflow, making it effectively truncated. 
+ _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( + __len <= static_cast(numeric_limits::max()), + "string_view::string_view(_CharT *, size_t): length does not fit in difference_type"); _LIBCPP_ASSERT_NON_NULL( __len == 0 || __s != nullptr, "string_view::string_view(_CharT *, size_t): received nullptr"); #endif diff --git a/libcxx/src/filesystem/operations.cpp b/libcxx/src/filesystem/operations.cpp index 6bee340e0d15c..8a7d6cc94c756 100644 --- a/libcxx/src/filesystem/operations.cpp +++ b/libcxx/src/filesystem/operations.cpp @@ -608,10 +608,9 @@ void __permissions(const path& p, perms prms, perm_options opts, error_code* ec) const bool resolve_symlinks = !has_opt(perm_options::nofollow); const bool add_perms = has_opt(perm_options::add); const bool remove_perms = has_opt(perm_options::remove); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( (add_perms + remove_perms + has_opt(perm_options::replace)) == 1, - "One and only one of the perm_options constants replace, add, or remove " - "is present in opts"); + "One and only one of the perm_options constants 'replace', 'add', or 'remove' must be present in opts"); bool set_sym_perms = false; prms &= perms::mask; diff --git a/libcxx/src/include/to_chars_floating_point.h b/libcxx/src/include/to_chars_floating_point.h index e4715d10d97da..01c26181697b7 100644 --- a/libcxx/src/include/to_chars_floating_point.h +++ b/libcxx/src/include/to_chars_floating_point.h @@ -994,7 +994,7 @@ to_chars_result _Floating_to_chars( if constexpr (_Overload == _Floating_to_chars_overload::_Plain) { _LIBCPP_ASSERT_INTERNAL(_Fmt == chars_format{}, ""); // plain overload must pass chars_format{} internally } else { - _LIBCPP_ASSERT_UNCATEGORIZED(_Fmt == chars_format::general || _Fmt == chars_format::scientific + _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(_Fmt == chars_format::general || _Fmt == chars_format::scientific || _Fmt == chars_format::fixed || _Fmt == chars_format::hex, "invalid format in to_chars()"); } From d0230446d282396795e5a55b9b70df6961fcd046 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sat, 20 Jan 2024 23:39:07 -0800 Subject: [PATCH 284/843] [AArch64] Remove non-sensible define nonlazybind test nonlazybind is for declarations, not for definitions. We could test the behavior, but the output would be misleading. 
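For reference, the attribute reaches the IR from declarations; a minimal
sketch (assuming Clang's `noplt` function attribute, which lowers to
`nonlazybind`):

```cpp
// Declaration of an external function: calls bypass the lazy PLT stub and go
// through the GOT entry instead.
__attribute__((noplt)) void external_fn();

void caller() {
  external_fn(); // emitted as an indirect call via the GOT
}

// Attaching noplt to a *definition* in the same TU is meaningless: the callee
// is resolved directly, so there is no lazy binding to avoid.
```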
--- llvm/test/CodeGen/AArch64/nonlazybind.ll | 114 ++++++++++++----------- 1 file changed, 61 insertions(+), 53 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/nonlazybind.ll b/llvm/test/CodeGen/AArch64/nonlazybind.ll index faf4a1bb84037..669a8ee04b249 100644 --- a/llvm/test/CodeGen/AArch64/nonlazybind.ll +++ b/llvm/test/CodeGen/AArch64/nonlazybind.ll @@ -5,74 +5,81 @@ ; RUN: llc -mtriple=aarch64 -global-isel %s -o - | FileCheck %s --check-prefixes=ELF,ELF-GI ; RUN: llc -mtriple=aarch64 %s -o - | FileCheck %s --check-prefixes=ELF,ELF-SDAG -define dso_preemptable void @preemptable() nonlazybind { -; MACHO-LABEL: preemptable: -; MACHO: ; %bb.0: -; MACHO-NEXT: ret -; -; MACHO-NORMAL-LABEL: preemptable: -; MACHO-NORMAL: ; %bb.0: -; MACHO-NORMAL-NEXT: ret -; -; ELF-LABEL: preemptable: -; ELF: // %bb.0: -; ELF-NEXT: ret - ret void -} - -define dso_local void @local() nonlazybind { -; MACHO-LABEL: local: -; MACHO: ; %bb.0: -; MACHO-NEXT: ret -; -; MACHO-NORMAL-LABEL: local: -; MACHO-NORMAL: ; %bb.0: -; MACHO-NORMAL-NEXT: ret -; -; ELF-LABEL: local: -; ELF: // %bb.0: -; ELF-NEXT: ret - ret void -} - declare void @external() nonlazybind -define void @test_laziness() nounwind { +define void @test_laziness(ptr %a) nounwind { ; ; MACHO-LABEL: test_laziness: ; MACHO: ; %bb.0: -; MACHO-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; MACHO-NEXT: bl _preemptable -; MACHO-NEXT: bl _local +; MACHO-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; MACHO-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill ; MACHO-NEXT: Lloh0: ; MACHO-NEXT: adrp x8, _external@GOTPAGE +; MACHO-NEXT: mov x19, x0 ; MACHO-NEXT: Lloh1: ; MACHO-NEXT: ldr x8, [x8, _external@GOTPAGEOFF] ; MACHO-NEXT: blr x8 -; MACHO-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; MACHO-NEXT: mov x0, x19 +; MACHO-NEXT: mov w1, #1 ; =0x1 +; MACHO-NEXT: mov w2, #1000 ; =0x3e8 +; MACHO-NEXT: bl _memset +; MACHO-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; MACHO-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload ; MACHO-NEXT: ret ; MACHO-NEXT: .loh AdrpLdrGot Lloh0, Lloh1 ; ; MACHO-NORMAL-LABEL: test_laziness: ; MACHO-NORMAL: ; %bb.0: -; MACHO-NORMAL-NEXT: stp x29, x30, [sp, #-16]! ; 16-byte Folded Spill -; MACHO-NORMAL-NEXT: bl _preemptable -; MACHO-NORMAL-NEXT: bl _local +; MACHO-NORMAL-NEXT: stp x20, x19, [sp, #-32]! ; 16-byte Folded Spill +; MACHO-NORMAL-NEXT: stp x29, x30, [sp, #16] ; 16-byte Folded Spill +; MACHO-NORMAL-NEXT: mov x19, x0 ; MACHO-NORMAL-NEXT: bl _external -; MACHO-NORMAL-NEXT: ldp x29, x30, [sp], #16 ; 16-byte Folded Reload +; MACHO-NORMAL-NEXT: mov x0, x19 +; MACHO-NORMAL-NEXT: mov w1, #1 ; =0x1 +; MACHO-NORMAL-NEXT: mov w2, #1000 ; =0x3e8 +; MACHO-NORMAL-NEXT: bl _memset +; MACHO-NORMAL-NEXT: ldp x29, x30, [sp, #16] ; 16-byte Folded Reload +; MACHO-NORMAL-NEXT: ldp x20, x19, [sp], #32 ; 16-byte Folded Reload ; MACHO-NORMAL-NEXT: ret ; -; ELF-LABEL: test_laziness: -; ELF: // %bb.0: -; ELF-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill -; ELF-NEXT: bl preemptable -; ELF-NEXT: bl local -; ELF-NEXT: bl external -; ELF-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload -; ELF-NEXT: ret - call void @preemptable() - call void @local() +; ELF-FI-LABEL: test_laziness: +; ELF-FI: // %bb.0: +; ELF-FI-NEXT: stp x30, x19, [sp, #-16]! 
// 16-byte Folded Spill +; ELF-FI-NEXT: mov x19, x0 +; ELF-FI-NEXT: bl external +; ELF-FI-NEXT: mov w8, #1 // =0x1 +; ELF-FI-NEXT: mov x0, x19 +; ELF-FI-NEXT: mov x2, #1000 // =0x3e8 +; ELF-FI-NEXT: uxtb w1, w8 +; ELF-FI-NEXT: bl memset +; ELF-FI-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; ELF-FI-NEXT: ret +; +; ELF-GI-LABEL: test_laziness: +; ELF-GI: // %bb.0: +; ELF-GI-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; ELF-GI-NEXT: mov x19, x0 +; ELF-GI-NEXT: bl external +; ELF-GI-NEXT: mov x0, x19 +; ELF-GI-NEXT: mov w1, #1 // =0x1 +; ELF-GI-NEXT: mov w2, #1000 // =0x3e8 +; ELF-GI-NEXT: bl memset +; ELF-GI-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; ELF-GI-NEXT: ret +; +; ELF-SDAG-LABEL: test_laziness: +; ELF-SDAG: // %bb.0: +; ELF-SDAG-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill +; ELF-SDAG-NEXT: mov x19, x0 +; ELF-SDAG-NEXT: bl external +; ELF-SDAG-NEXT: mov x0, x19 +; ELF-SDAG-NEXT: mov w1, #1 // =0x1 +; ELF-SDAG-NEXT: mov w2, #1000 // =0x3e8 +; ELF-SDAG-NEXT: bl memset +; ELF-SDAG-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload +; ELF-SDAG-NEXT: ret call void @external() + call void @llvm.memset.p0.i64(ptr align 1 %a, i8 1, i64 1000, i1 false) ret void } @@ -96,7 +103,8 @@ define void @test_laziness_tail() nounwind { tail call void @external() ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; ELF-FI: {{.*}} -; ELF-GI: {{.*}} -; ELF-SDAG: {{.*}} + +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) + +!llvm.module.flags = !{!0} +!0 = !{i32 7, !"RtLibUseGOT", i32 1} From 62bf7710ff295cc7bb0bb281c471ca0c91fd156e Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Sun, 21 Jan 2024 10:06:53 +0100 Subject: [PATCH 285/843] [mlir][IR] Add `notifyBlockRemoved` callback to listener (#78306) There is already a "block inserted" notification (in `OpBuilder::Listener`), so there should also be a "block removed" notification. The purpose of this change is to make the listener API more mature. There is currently a gap between what kind of IR changes can be made and what IR changes can be listened to. At the moment, the only way to inform listeners about "block removal" is to send a manual `notifyOperationModified` for the parent op (e.g., by wrapping the `eraseBlock(b)` method call in `updateRootInPlace(b->getParentOp())`). This tells the listener that *something* has changed, but it is somewhat of an API abuse. --- mlir/include/mlir/IR/PatternMatch.h | 8 ++++++++ mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp | 2 +- mlir/lib/IR/PatternMatch.cpp | 11 +++++++++-- .../Transforms/Utils/GreedyPatternRewriteDriver.cpp | 8 ++++++++ 4 files changed, 26 insertions(+), 3 deletions(-) diff --git a/mlir/include/mlir/IR/PatternMatch.h b/mlir/include/mlir/IR/PatternMatch.h index b065d4e8d3768..815340c918509 100644 --- a/mlir/include/mlir/IR/PatternMatch.h +++ b/mlir/include/mlir/IR/PatternMatch.h @@ -402,6 +402,10 @@ class RewriterBase : public OpBuilder { Listener() : OpBuilder::Listener(ListenerBase::Kind::RewriterBaseListener) {} + /// Notify the listener that the specified block is about to be erased. + /// At this point, the block has zero uses. + virtual void notifyBlockRemoved(Block *block) {} + /// Notify the listener that the specified operation was modified in-place. 
virtual void notifyOperationModified(Operation *op) {} @@ -452,6 +456,10 @@ class RewriterBase : public OpBuilder { void notifyBlockCreated(Block *block) override { listener->notifyBlockCreated(block); } + void notifyBlockRemoved(Block *block) override { + if (auto *rewriteListener = dyn_cast(listener)) + rewriteListener->notifyBlockRemoved(block); + } void notifyOperationModified(Operation *op) override { if (auto *rewriteListener = dyn_cast(listener)) rewriteListener->notifyOperationModified(op); diff --git a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp index abb65bc3c38f2..b63baf330c864 100644 --- a/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp +++ b/mlir/lib/Conversion/VectorToGPU/VectorToGPU.cpp @@ -1114,7 +1114,7 @@ static scf::ForOp replaceForOpWithNewSignature(RewriterBase &rewriter, scf::ForOp newLoop = rewriter.create( loop.getLoc(), loop.getLowerBound(), loop.getUpperBound(), loop.getStep(), operands); - newLoop.getBody()->erase(); + rewriter.eraseBlock(newLoop.getBody()); newLoop.getRegion().getBlocks().splice( newLoop.getRegion().getBlocks().begin(), loop.getRegion().getBlocks()); diff --git a/mlir/lib/IR/PatternMatch.cpp b/mlir/lib/IR/PatternMatch.cpp index 73f232fd0de01..ba0516e0539b6 100644 --- a/mlir/lib/IR/PatternMatch.cpp +++ b/mlir/lib/IR/PatternMatch.cpp @@ -244,7 +244,7 @@ void RewriterBase::eraseOp(Operation *op) { for (BlockArgument bbArg : b->getArguments()) bbArg.dropAllUses(); b->dropAllUses(); - b->erase(); + eraseBlock(b); } } } @@ -256,10 +256,17 @@ void RewriterBase::eraseOp(Operation *op) { } void RewriterBase::eraseBlock(Block *block) { + assert(block->use_empty() && "expected 'block' to have no uses"); + for (auto &op : llvm::make_early_inc_range(llvm::reverse(*block))) { assert(op.use_empty() && "expected 'op' to have no uses"); eraseOp(&op); } + + // Notify the listener that the block is about to be removed. + if (auto *rewriteListener = dyn_cast_if_present(listener)) + rewriteListener->notifyBlockRemoved(block); + block->erase(); } @@ -311,7 +318,7 @@ void RewriterBase::inlineBlockBefore(Block *source, Block *dest, // Move operations from the source block to the dest block and erase the // source block. dest->getOperations().splice(before, source->getOperations()); - source->erase(); + eraseBlock(source); } void RewriterBase::inlineBlockBefore(Block *source, Operation *op, diff --git a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp index 36d63d62bf10f..ac73e82bfe92a 100644 --- a/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp +++ b/mlir/lib/Transforms/Utils/GreedyPatternRewriteDriver.cpp @@ -373,6 +373,9 @@ class GreedyPatternRewriteDriver : public PatternRewriter, /// Notify the driver that the given block was created. void notifyBlockCreated(Block *block) override; + /// Notify the driver that the given block is about to be removed. + void notifyBlockRemoved(Block *block) override; + /// For debugging only: Notify the driver of a pattern match failure. 
LogicalResult notifyMatchFailure(Location loc,
@@ -633,6 +636,11 @@ void GreedyPatternRewriteDriver::notifyBlockCreated(Block *block) {
     config.listener->notifyBlockCreated(block);
 }
 
+void GreedyPatternRewriteDriver::notifyBlockRemoved(Block *block) {
+  if (config.listener)
+    config.listener->notifyBlockRemoved(block);
+}
+
 void GreedyPatternRewriteDriver::notifyOperationInserted(Operation *op) {
   LLVM_DEBUG({
     logger.startLine() << "** Insert  : '" << op->getName() << "'(" << op

From f9e2e85b07ee2c19bbef8fda50b3f664d6f5193e Mon Sep 17 00:00:00 2001
From: FantasqueX
Date: Sun, 21 Jan 2024 17:34:29 +0800
Subject: [PATCH 286/843] [Clang] Use const pointer to eliminate warning with
 libxml 2.12.0 (#76719)

Currently, if `CLANG_HAVE_LIBXML` is defined, and the version of libxml2
is above 2.12.0, there will be two warnings when building clang.

warning: initializing 'xmlErrorPtr' (aka 'struct _xmlError *') with an
expression of type 'const xmlError *' (aka 'const struct _xmlError *')
discards qualifiers

Since this commit
https://gitlab.gnome.org/GNOME/libxml2/-/commit/45470611b047db78106dcb2fdbd4164163c15ab7,
libxml2 makes xmlGetLastError return a const error. This patch follows
libxml2. Making the result a const pointer should be compatible with
versions before 2.12.0.

Tested on ArchLinux with libxml2 2.12.3 installed.

---
 clang/tools/c-index-test/c-index-test.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/tools/c-index-test/c-index-test.c b/clang/tools/c-index-test/c-index-test.c
index 25235eb7f06d6..21619888cfa5f 100644
--- a/clang/tools/c-index-test/c-index-test.c
+++ b/clang/tools/c-index-test/c-index-test.c
@@ -707,7 +707,7 @@ static void ValidateCommentXML(const char *Str, const char *CommentSchemaFile) {
 
   Doc = xmlParseDoc((const xmlChar *) Str);
   if (!Doc) {
-    xmlErrorPtr Error = xmlGetLastError();
+    const xmlError *Error = xmlGetLastError();
     printf(" CommentXMLInvalid [not well-formed XML: %s]", Error->message);
     return;
   }
@@ -717,7 +717,7 @@ static void ValidateCommentXML(const char *Str, const char *CommentSchemaFile) {
   if (!status)
     printf(" CommentXMLValid");
   else if (status > 0) {
-    xmlErrorPtr Error = xmlGetLastError();
+    const xmlError *Error = xmlGetLastError();
    printf(" CommentXMLInvalid [not valid XML: %s]", Error->message);
  } else
    printf(" libXMLError");

From fbb62d449c47bb0b49c0727c926373b41a8183c5 Mon Sep 17 00:00:00 2001
From: Matthias Springer
Date: Sun, 21 Jan 2024 11:10:09 +0100
Subject: [PATCH 287/843] [mlir][bufferization] Buffer deallocation: Make op
 preconditions stricter (#75127)

The buffer deallocation pass checks the IR ("operation preconditions") to
make sure that there is no IR that is unsupported. In such a case, the pass
signals a failure.

The pass now rejects all ops with unknown memory effects. We do not know
whether such an op allocates memory or not. Therefore, the buffer
deallocation pass does not know whether a deallocation op should be inserted
or not.

Memory effects are queried from the `MemoryEffectOpInterface` interface. Ops
that do not implement this interface but have the `RecursiveMemoryEffects`
trait do not have any side effects (apart from the ones that their nested
ops may have). Unregistered ops are now rejected by the pass because they do
not implement the `MemoryEffectOpInterface` and neither do we know if they
have `RecursiveMemoryEffects` or not.

All test cases that currently have unregistered ops are updated to use
registered ops.
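A condensed sketch of the acceptance condition described above (illustrative,
not the exact pass code):

```cpp
#include "mlir/IR/Operation.h"
#include "mlir/Interfaces/SideEffectInterfaces.h"

// An op's memory effects are "knowable" if it implements the interface, or
// if it merely aggregates the effects of its nested ops.
static bool memoryEffectsAreKnowable(mlir::Operation *op) {
  if (mlir::isa<mlir::MemoryEffectOpInterface>(op))
    return true; // effects can be queried explicitly
  return op->hasTrait<mlir::OpTrait::HasRecursiveMemoryEffects>();
}
```

Unregistered ops satisfy neither branch, which is why the pass now rejects
them.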
--- .../OwnershipBasedBufferDeallocation.cpp | 79 +++++++++++++------ .../dealloc-branchop-interface.mlir | 4 +- .../dealloc-existing-deallocs.mlir | 8 +- .../dealloc-function-boundaries.mlir | 4 +- .../dealloc-other.mlir | 13 ++- .../dealloc-region-branchop-interface.mlir | 68 +++++++--------- .../bufferization-buffer-deallocation.mlir | 2 +- mlir/test/lib/Dialect/Test/TestDialect.cpp | 15 ++++ mlir/test/lib/Dialect/Test/TestOps.td | 37 ++++++++- 9 files changed, 153 insertions(+), 77 deletions(-) diff --git a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp index 529d5a808c012..c535d6c130edb 100644 --- a/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp +++ b/mlir/lib/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation.cpp @@ -48,6 +48,32 @@ static Value buildBoolValue(OpBuilder &builder, Location loc, bool value) { static bool isMemref(Value v) { return v.getType().isa(); } +/// Return "true" if the given op is guaranteed to have neither "Allocate" nor +/// "Free" side effects. +static bool hasNeitherAllocateNorFreeSideEffect(Operation *op) { + if (isa<MemoryEffectOpInterface>(op)) + return !hasEffect<MemoryEffects::Allocate>(op) && + !hasEffect<MemoryEffects::Free>(op); + // If the op does not implement the MemoryEffectOpInterface but has + // recursive memory effects, then this op in isolation (without its body) does + // not have any side effects. All the ops inside the regions of this op will + // be processed separately. + return op->hasTrait<OpTrait::HasRecursiveMemoryEffects>(); +} + +/// Return "true" if the given op has buffer semantics. I.e., it has buffer +/// operands, buffer results and/or buffer region entry block arguments. +static bool hasBufferSemantics(Operation *op) { + if (llvm::any_of(op->getOperands(), isMemref) || + llvm::any_of(op->getResults(), isMemref)) + return true; + for (Region &region : op->getRegions()) + if (!region.empty()) + if (llvm::any_of(region.front().getArguments(), isMemref)) + return true; + return false; +} + //===----------------------------------------------------------------------===// // Backedges analysis //===----------------------------------------------------------------------===// @@ -462,21 +488,6 @@ BufferDeallocation::materializeUniqueOwnership(OpBuilder &builder, Value memref, return state.getMemrefWithUniqueOwnership(builder, memref, block); } -static bool regionOperatesOnMemrefValues(Region &region) { - WalkResult result = region.walk([](Block *block) { - if (llvm::any_of(block->getArguments(), isMemref)) - return WalkResult::interrupt(); - for (Operation &op : *block) { - if (llvm::any_of(op.getOperands(), isMemref)) - return WalkResult::interrupt(); - if (llvm::any_of(op.getResults(), isMemref)) - return WalkResult::interrupt(); - } - return WalkResult::advance(); - }); - return result.wasInterrupted(); -} - LogicalResult BufferDeallocation::verifyFunctionPreconditions(FunctionOpInterface op) { // (1) Ensure that there are supported loops only (no explicit control flow @@ -491,7 +502,32 @@ BufferDeallocation::verifyFunctionPreconditions(FunctionOpInterface op) { } LogicalResult BufferDeallocation::verifyOperationPreconditions(Operation *op) { - // (1) Check that the control flow structures are supported. + // (1) The pass does not work properly when deallocations are already present. + // Alternatively, we could also remove all deallocations as a pre-pass.
+ if (isa(op)) + return op->emitError( + "No deallocation operations must be present when running this pass!"); + + // (2) Memory side effects of unregistered ops are unknown. In particular, we + // do not know whether an unregistered op allocates memory or not. + // - Ops with recursive memory effects are allowed. All nested ops in the + // regions of `op` will be analyzed separately. + // - Call ops are allowed even though they typically do not implement the + // MemoryEffectOpInterface. They usually do not have side effects apart + // from the callee, which will be analyzed separately. (This is similar to + // "recursive memory effects".) + if (!isa(op) && + !op->hasTrait() && + !isa(op)) + return op->emitError( + "ops with unknown memory side effects are not supported"); + + // We do not care about ops that do not operate on buffers and have no + // Allocate/Free side effect. + if (!hasBufferSemantics(op) && hasNeitherAllocateNorFreeSideEffect(op)) + return success(); + + // (3) Check that the control flow structures are supported. auto regions = op->getRegions(); // Check that if the operation has at // least one region it implements the RegionBranchOpInterface. If there @@ -502,17 +538,10 @@ LogicalResult BufferDeallocation::verifyOperationPreconditions(Operation *op) { size_t size = regions.size(); if (((size == 1 && !op->getResults().empty()) || size > 1) && !dyn_cast(op)) { - if (llvm::any_of(regions, regionOperatesOnMemrefValues)) - return op->emitError("All operations with attached regions need to " - "implement the RegionBranchOpInterface."); + return op->emitError("All operations with attached regions need to " + "implement the RegionBranchOpInterface."); } - // (2) The pass does not work properly when deallocations are already present. - // Alternatively, we could also remove all deallocations as a pre-pass. - if (isa(op)) - return op->emitError( - "No deallocation operations must be present when running this pass!"); - // (3) Check that terminators with more than one successor except `cf.cond_br` // are not present and that either BranchOpInterface or // RegionBranchTerminatorOpInterface is implemented. 
diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir index 3ae0529ab7d74..5e8104f83cc4d 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-branchop-interface.mlir @@ -570,7 +570,7 @@ func.func @select_captured_in_next_block(%arg0: index, %arg1: memref, %arg func.func @blocks_not_preordered_by_dominance() { cf.br ^bb1 ^bb2: - "test.memref_user"(%alloc) : (memref<2xi32>) -> () + "test.read_buffer"(%alloc) : (memref<2xi32>) -> () return ^bb1: %alloc = memref.alloc() : memref<2xi32> @@ -581,7 +581,7 @@ func.func @blocks_not_preordered_by_dominance() { // CHECK-NEXT: [[TRUE:%.+]] = arith.constant true // CHECK-NEXT: cf.br [[BB1:\^.+]] // CHECK-NEXT: [[BB2:\^[a-zA-Z0-9_]+]]: -// CHECK-NEXT: "test.memref_user"([[ALLOC:%[a-zA-Z0-9_]+]]) +// CHECK-NEXT: "test.read_buffer"([[ALLOC:%[a-zA-Z0-9_]+]]) // CHECK-NEXT: bufferization.dealloc ([[ALLOC]] : {{.*}}) if ([[TRUE]]) // CHECK-NOT: retain // CHECK-NEXT: return diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-existing-deallocs.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-existing-deallocs.mlir index 7014746e348d2..7a41ed2057031 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-existing-deallocs.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-existing-deallocs.mlir @@ -8,7 +8,7 @@ func.func @auto_dealloc() { %c100 = arith.constant 100 : index %alloc = memref.alloc(%c10) : memref %realloc = memref.realloc %alloc(%c100) : memref to memref - "test.memref_user"(%realloc) : (memref) -> () + "test.read_buffer"(%realloc) : (memref) -> () return } @@ -17,7 +17,7 @@ func.func @auto_dealloc() { // CHECK-NOT: bufferization.dealloc // CHECK: [[V0:%.+]]:2 = scf.if // CHECK-NOT: bufferization.dealloc -// CHECK: test.memref_user +// CHECK: test.read_buffer // CHECK-NEXT: [[BASE:%[a-zA-Z0-9_]+]]{{.*}} = memref.extract_strided_metadata [[V0]]#0 // CHECK-NEXT: bufferization.dealloc ([[ALLOC]], [[BASE]] :{{.*}}) if (%true{{[0-9_]*}}, [[V0]]#1) // CHECK-NEXT: return @@ -32,14 +32,14 @@ func.func @auto_dealloc_inside_nested_region(%arg0: memref, %arg1: i1) { } else { scf.yield %arg0 : memref } - "test.memref_user"(%0) : (memref) -> () + "test.read_buffer"(%0) : (memref) -> () return } // CHECK-LABEL: func @auto_dealloc_inside_nested_region // CHECK-SAME: (%arg0: memref, %arg1: i1) // CHECK-NOT: dealloc -// CHECK: "test.memref_user"([[V0:%.+]]#0) +// CHECK: "test.read_buffer"([[V0:%.+]]#0) // CHECK-NEXT: [[BASE:%[a-zA-Z0-9_]+]],{{.*}} = memref.extract_strided_metadata [[V0]]#0 // CHECK-NEXT: bufferization.dealloc ([[BASE]] : memref) if ([[V0]]#1) // CHECK-NEXT: return diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-function-boundaries.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-function-boundaries.mlir index 13c55d0289880..de71f23cfad6f 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-function-boundaries.mlir +++ 
b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-function-boundaries.mlir @@ -12,7 +12,7 @@ func.func private @emptyUsesValue(%arg0: memref<4xf32>) { %0 = memref.alloc() : memref<4xf32> - "test.memref_user"(%0) : (memref<4xf32>) -> () + "test.read_buffer"(%0) : (memref<4xf32>) -> () return } @@ -37,7 +37,7 @@ func.func private @emptyUsesValue(%arg0: memref<4xf32>) { func.func @emptyUsesValue(%arg0: memref<4xf32>) { %0 = memref.alloc() : memref<4xf32> - "test.memref_user"(%0) : (memref<4xf32>) -> () + "test.read_buffer"(%0) : (memref<4xf32>) -> () return } diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir index be894dbe4c5fc..9bfa91589482b 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-other.mlir @@ -21,11 +21,20 @@ func.func private @no_interface_no_operands(%t : tensor) -> memref memref<5xf32> { %0 = memref.alloc() : memref<5xf32> - %1 = "test.foo"(%0) : (memref<5xf32>) -> (memref<5xf32>) + %1 = "test.forward_buffer"(%0) : (memref<5xf32>) -> (memref<5xf32>) return %1 : memref<5xf32> } + +// ----- + +func.func @no_side_effects() { + %0 = memref.alloc() : memref<5xf32> + // expected-error @below{{ops with unknown memory side effects are not supported}} + "test.unregistered_op_foo"(%0) : (memref<5xf32>) -> () + return +} diff --git a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-region-branchop-interface.mlir b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-region-branchop-interface.mlir index 1a8a930bc9002..423fc4730b137 100644 --- a/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-region-branchop-interface.mlir +++ b/mlir/test/Dialect/Bufferization/Transforms/OwnershipBasedBufferDeallocation/dealloc-region-branchop-interface.mlir @@ -71,7 +71,7 @@ func.func @nested_region_control_flow( scf.yield %1 : memref } else { %3 = memref.alloc(%arg0, %arg1) : memref - "test.memref_user"(%3) : (memref) -> () + "test.read_buffer"(%3) : (memref) -> () scf.yield %1 : memref } return %2 : memref @@ -253,7 +253,7 @@ func.func @loop_alloc( %buf: memref<2xf32>, %res: memref<2xf32>) { %0 = memref.alloc() : memref<2xf32> - "test.memref_user"(%0) : (memref<2xf32>) -> () + "test.read_buffer"(%0) : (memref<2xf32>) -> () %1 = scf.for %i = %lb to %ub step %step iter_args(%iterBuf = %buf) -> memref<2xf32> { %2 = arith.cmpi eq, %i, %ub : index @@ -385,7 +385,7 @@ func.func @loop_nested_alloc( %buf: memref<2xf32>, %res: memref<2xf32>) { %0 = memref.alloc() : memref<2xf32> - "test.memref_user"(%0) : (memref<2xf32>) -> () + "test.read_buffer"(%0) : (memref<2xf32>) -> () %1 = scf.for %i = %lb to %ub step %step iter_args(%iterBuf = %buf) -> memref<2xf32> { %2 = scf.for %i2 = %lb to %ub step %step @@ -393,7 +393,7 @@ func.func @loop_nested_alloc( %3 = scf.for %i3 = %lb to %ub step %step iter_args(%iterBuf3 = %iterBuf2) -> memref<2xf32> { %4 = memref.alloc() : memref<2xf32> - "test.memref_user"(%4) : (memref<2xf32>) -> () + "test.read_buffer"(%4) : (memref<2xf32>) -> () %5 = arith.cmpi eq, %i, %ub : index %6 = scf.if %5 -> (memref<2xf32>) { %7 = memref.alloc() : memref<2xf32> @@ -476,7 +476,7 @@ func.func @assumingOp( // Confirm the alloc will be dealloc'ed in the block. 
%1 = shape.assuming %arg0 -> memref<2xf32> { %0 = memref.alloc() : memref<2xf32> - "test.memref_user"(%0) : (memref<2xf32>) -> () + "test.read_buffer"(%0) : (memref<2xf32>) -> () shape.assuming_yield %arg2 : memref<2xf32> } // Confirm the alloc will be returned and dealloc'ed after its use. @@ -511,50 +511,40 @@ func.func @assumingOp( // ----- -// Test Case: The op "test.bar" does not implement the RegionBranchOpInterface. -// This is only allowed in buffer deallocation because the operation's region -// does not deal with any MemRef values. +// Test Case: The op "test.one_region_with_recursive_memory_effects" does not +// implement the RegionBranchOpInterface. This is allowed during buffer +// deallocation because the operation's region does not deal with any MemRef +// values. func.func @noRegionBranchOpInterface() { - %0 = "test.bar"() ({ - %1 = "test.bar"() ({ - "test.yield"() : () -> () + %0 = "test.one_region_with_recursive_memory_effects"() ({ + %1 = "test.one_region_with_recursive_memory_effects"() ({ + %2 = memref.alloc() : memref<2xi32> + "test.read_buffer"(%2) : (memref<2xi32>) -> () + "test.return"() : () -> () }) : () -> (i32) - "test.yield"() : () -> () + "test.return"() : () -> () }) : () -> (i32) - "test.terminator"() : () -> () + "test.return"() : () -> () } // ----- -// Test Case: The op "test.bar" does not implement the RegionBranchOpInterface. -// This is not allowed in buffer deallocation. +// Test Case: The second op "test.one_region_with_recursive_memory_effects" does +// not implement the RegionBranchOpInterface but has buffer semantics. This is +// not allowed during buffer deallocation. func.func @noRegionBranchOpInterface() { - %0 = "test.bar"() ({ + %0 = "test.one_region_with_recursive_memory_effects"() ({ // expected-error@+1 {{All operations with attached regions need to implement the RegionBranchOpInterface.}} - %1 = "test.bar"() ({ - %2 = "test.get_memref"() : () -> memref<2xi32> - "test.yield"(%2) : (memref<2xi32>) -> () + %1 = "test.one_region_with_recursive_memory_effects"() ({ + %2 = memref.alloc() : memref<2xi32> + "test.read_buffer"(%2) : (memref<2xi32>) -> () + "test.return"(%2) : (memref<2xi32>) -> () }) : () -> (memref<2xi32>) - "test.yield"() : () -> () + "test.return"() : () -> () }) : () -> (i32) - "test.terminator"() : () -> () -} - -// ----- - -// Test Case: The op "test.bar" does not implement the RegionBranchOpInterface. -// This is not allowed in buffer deallocation. - -func.func @noRegionBranchOpInterface() { - // expected-error@+1 {{All operations with attached regions need to implement the RegionBranchOpInterface.}} - %0 = "test.bar"() ({ - %2 = "test.get_memref"() : () -> memref<2xi32> - %3 = "test.foo"(%2) : (memref<2xi32>) -> (i32) - "test.yield"(%3) : (i32) -> () - }) : () -> (i32) - "test.terminator"() : () -> () + "test.return"() : () -> () } // ----- @@ -562,7 +552,8 @@ func.func @noRegionBranchOpInterface() { func.func @while_two_arg(%arg0: index) { %a = memref.alloc(%arg0) : memref scf.while (%arg1 = %a, %arg2 = %a) : (memref, memref) -> (memref, memref) { - %0 = "test.make_condition"() : () -> i1 + // This op has a side effect, but it's not an allocate/free side effect. 
+ %0 = "test.side_effect_op"() {effects = [{effect="read"}]} : () -> i1 scf.condition(%0) %arg1, %arg2 : memref, memref } do { ^bb0(%arg1: memref, %arg2: memref): @@ -591,7 +582,8 @@ func.func @while_two_arg(%arg0: index) { func.func @while_three_arg(%arg0: index) { %a = memref.alloc(%arg0) : memref scf.while (%arg1 = %a, %arg2 = %a, %arg3 = %a) : (memref, memref, memref) -> (memref, memref, memref) { - %0 = "test.make_condition"() : () -> i1 + // This op has a side effect, but it's not an allocate/free side effect. + %0 = "test.side_effect_op"() {effects = [{effect="read"}]} : () -> i1 scf.condition(%0) %arg1, %arg2, %arg3 : memref, memref, memref } do { ^bb0(%arg1: memref, %arg2: memref, %arg3: memref): diff --git a/mlir/test/Dialect/GPU/bufferization-buffer-deallocation.mlir b/mlir/test/Dialect/GPU/bufferization-buffer-deallocation.mlir index 25349967e61d3..57260c86efca2 100644 --- a/mlir/test/Dialect/GPU/bufferization-buffer-deallocation.mlir +++ b/mlir/test/Dialect/GPU/bufferization-buffer-deallocation.mlir @@ -5,7 +5,7 @@ func.func @gpu_launch() { gpu.launch blocks(%arg0, %arg1, %arg2) in (%arg6 = %c1, %arg7 = %c1, %arg8 = %c1) threads(%arg3, %arg4, %arg5) in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1) { %alloc = memref.alloc() : memref<2xf32> - "test.memref_user"(%alloc) : (memref<2xf32>) -> () + "test.read_buffer"(%alloc) : (memref<2xf32>) -> () gpu.terminator } return diff --git a/mlir/test/lib/Dialect/Test/TestDialect.cpp b/mlir/test/lib/Dialect/Test/TestDialect.cpp index cb5ee6014b611..1ee52fc08d77f 100644 --- a/mlir/test/lib/Dialect/Test/TestDialect.cpp +++ b/mlir/test/lib/Dialect/Test/TestDialect.cpp @@ -1321,6 +1321,21 @@ static void printSumProperty(OpAsmPrinter &printer, Operation *op, printer << second << " = " << (second + first); } +//===----------------------------------------------------------------------===// +// Tensor/Buffer Ops +//===----------------------------------------------------------------------===// + +void ReadBufferOp::getEffects( + SmallVectorImpl> + &effects) { + // The buffer operand is read. + effects.emplace_back(MemoryEffects::Read::get(), getBuffer(), + SideEffects::DefaultResource::get()); + // The buffer contents are dumped. + effects.emplace_back(MemoryEffects::Write::get(), + SideEffects::DefaultResource::get()); +} + //===----------------------------------------------------------------------===// // Test Dataflow //===----------------------------------------------------------------------===// diff --git a/mlir/test/lib/Dialect/Test/TestOps.td b/mlir/test/lib/Dialect/Test/TestOps.td index 61e181999ff2f..11e409c6072f7 100644 --- a/mlir/test/lib/Dialect/Test/TestOps.td +++ b/mlir/test/lib/Dialect/Test/TestOps.td @@ -486,6 +486,17 @@ def AllocaScopeRegionOp : TEST_Op<"alloca_scope_region", let assemblyFormat = "attr-dict-with-keyword $region"; } +def OneRegionWithRecursiveMemoryEffectsOp + : TEST_Op<"one_region_with_recursive_memory_effects", [ + RecursiveMemoryEffects]> { + let description = [{ + Op that has one region and recursive side effects. The + RegionBranchOpInterface is not implemented on this op. 
+ }]; + let results = (outs AnyType:$result); + let regions = (region SizedRegion<1>:$body); +} + //===----------------------------------------------------------------------===// // NoTerminator Operation //===----------------------------------------------------------------------===// @@ -1900,7 +1911,7 @@ def BlackHoleOp : TEST_Op<"blackhole">, //===----------------------------------------------------------------------===// def TestRegionBuilderOp : TEST_Op<"region_builder">; -def TestReturnOp : TEST_Op<"return", [ReturnLike, Terminator]> { +def TestReturnOp : TEST_Op<"return", [Pure, ReturnLike, Terminator]> { let arguments = (ins Variadic); let builders = [OpBuilder<(ins), [{ build($_builder, $_state, {}); }]> @@ -2106,8 +2117,10 @@ class BufferBasedOpBase traits> let description = [{ A buffer based operation that uses memRefs as input and output. }]; - let arguments = (ins AnyRankedOrUnrankedMemRef:$input, - AnyRankedOrUnrankedMemRef:$output); + let arguments = (ins Arg:$input, + Arg:$output); } def BufferBasedOp : BufferBasedOpBase<"buffer_based", []>{ @@ -2138,6 +2151,24 @@ def TensorBasedOp : TEST_Op<"tensor_based", []> { }]; } +def ReadBufferOp : TEST_Op<"read_buffer", [DeclareOpInterfaceMethods]> { + let description = [{ + An operation that reads the buffer operand and dumps its contents. + }]; + let arguments = (ins AnyRankedOrUnrankedMemRef:$buffer); +} + +def ForwardBufferOp : TEST_Op<"forward_buffer", [Pure]> { + let description = [{ + A pure operation that takes a buffer and returns a buffer. This op does not + have any side effects, so it cannot allocate or read a buffer from memory. + It must return the input buffer (or a view thereof). This op purposely does + not implement any interface. + }]; + let arguments = (ins AnyRankedOrUnrankedMemRef:$buffer); + let results = (outs AnyRankedOrUnrankedMemRef:$result); +} + //===----------------------------------------------------------------------===// // Test RegionBranchOpInterface //===----------------------------------------------------------------------===// From 38b5f2edfc67d2155d043519c89568bb00a9eced Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Sun, 21 Jan 2024 12:27:16 +0200 Subject: [PATCH 288/843] [libc++][spaceship][NFC] Status page update (#78894) Co-authored-by: Zingam --- libcxx/docs/Status/SpaceshipPapers.csv | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/docs/Status/SpaceshipPapers.csv b/libcxx/docs/Status/SpaceshipPapers.csv index 1a00dc2f274f3..48fe417bc7dc0 100644 --- a/libcxx/docs/Status/SpaceshipPapers.csv +++ b/libcxx/docs/Status/SpaceshipPapers.csv @@ -1,8 +1,8 @@ "Number","Name","Status","First released version" `P1614R2 `_,The Mothership has Landed,|In Progress|, `P2404R3 `_,"Relaxing ``equality_comparable_with``'s, ``totally_ordered_with``'s, and ``three_way_comparable_with``'s common reference requirements to support move-only types",, -`LWG3330 `_,Include from most library headers,"|Complete|","13.0" -`LWG3347 `_,"``std::pair`` now requires ``T`` and ``U`` to be *``less-than-comparable``*",|Nothing To Do|, +`LWG3330 `_,Include ``<compare>`` from most library headers,"|Complete|","13.0" +`LWG3347 `_,"``std::pair`` now requires ``T`` and ``U`` to be *less-than-comparable*",|Nothing To Do|, `LWG3350 `_,Simplify return type of lexicographical_compare_three_way,|Nothing To Do|, `LWG3360 `_,``three_way_comparable_with`` is inconsistent with similar concepts,|Nothing To Do|, `LWG3380 `_,common_type and comparison categories,|Nothing To Do|, From
7d9b5aa65b09126031e1c2903605a7d34aea4bc1 Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Sun, 21 Jan 2024 12:30:25 +0200 Subject: [PATCH 289/843] [libc++][format] P2637R3: Member `visit` (`std::basic_format_arg`) (#76449) Implements parts of: `P2637R3` https://wg21.link/P2637R3 (https://eel.is/c++draft/variant.visit) Implements: `basic_format_arg.visit()` `basic_format_arg.visit()` Deprecates: `std::visit_format_arg()` The tests are as close as possible to the non-member function tests. To land after: https://github.com/llvm/llvm-project/pull/76447, https://github.com/llvm/llvm-project/pull/76268 --------- Co-authored-by: Zingam --- libcxx/docs/ReleaseNotes/18.rst | 1 + libcxx/docs/Status/Cxx2cPapers.csv | 2 +- libcxx/docs/Status/FormatIssues.csv | 2 +- libcxx/include/__config | 6 + libcxx/include/__format/format_arg.h | 109 +++++- libcxx/include/__format/format_context.h | 33 +- libcxx/include/format | 2 +- .../format.arg/visit.pass.cpp | 333 ++++++++++++++++ .../format.arg/visit.return_type.pass.cpp | 369 ++++++++++++++++++ .../visit_format_arg.deprecated.verify.cpp | 38 ++ .../format.arg/visit_format_arg.pass.cpp | 6 +- .../format.arguments/format.args/get.pass.cpp | 48 ++- libcxx/test/support/test_basic_format_arg.h | 20 +- libcxx/test/support/test_macros.h | 5 + .../generate_feature_test_macro_components.py | 1 + 15 files changed, 927 insertions(+), 48 deletions(-) create mode 100644 libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp create mode 100644 libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp create mode 100644 libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index 311a06e34c49d..14e1e09907fad 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -61,6 +61,7 @@ Implemented Papers - P1759R6 - Native handles and file streams - P2868R3 - Remove Deprecated ``std::allocator`` Typedef From C++26 - P2517R1 - Add a conditional ``noexcept`` specification to ``std::apply`` +- P2637R3 - Member ``visit`` - P2447R6 - ``span`` over initializer list diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index f6124d9a6c4e3..e49290b45589a 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -17,7 +17,7 @@ "`P0792R14 `__","LWG","``function_ref``: a type-erased callable reference","Varna June 2023","","","" "`P2874R2 `__","LWG","Mandating Annex D Require No More","Varna June 2023","","","" "`P2757R3 `__","LWG","Type-checking format args","Varna June 2023","","","|format|" -"`P2637R3 `__","LWG","Member ``visit``","Varna June 2023","|Partial|","18.0","" +"`P2637R3 `__","LWG","Member ``visit``","Varna June 2023","|Complete|","18.0","" "`P2641R4 `__","CWG, LWG","Checking if a ``union`` alternative is active","Varna June 2023","","","" "`P1759R6 `__","LWG","Native handles and file streams","Varna June 2023","|Complete|","18.0","" "`P2697R1 `__","LWG","Interfacing ``bitset`` with ``string_view``","Varna June 2023","|Complete|","18.0","" diff --git a/libcxx/docs/Status/FormatIssues.csv b/libcxx/docs/Status/FormatIssues.csv index 513988d08036c..6e58e752191ea 100644 --- a/libcxx/docs/Status/FormatIssues.csv +++ b/libcxx/docs/Status/FormatIssues.csv @@ -16,7 +16,7 @@ Number,Name,Standard,Assignee,Status,First released version "`P2693R1 `__","Formatting ``thread::id`` and ``stacktrace``","C++23","Mark de 
Wever","|In Progress|" "`P2510R3 `__","Formatting pointers","C++26","Mark de Wever","|Complete|",17.0 "`P2757R3 `__","Type-checking format args","C++26","","", -"`P2637R3 `__","Member ``visit``","C++26","","", +"`P2637R3 `__","Member ``visit``","C++26","Hristo Hristov","|Complete|",18.0 "`P2905R2 `__","Runtime format strings","C++26 DR","Mark de Wever","|Complete|",18.0 "`P2918R2 `__","Runtime format strings II","C++26","Mark de Wever","|Complete|",18.0 "`P2909R4 `__","Fix formatting of code units as integers (Dude, where’s my ``char``?)","C++26 DR","Mark de Wever","|Complete|",18.0 diff --git a/libcxx/include/__config b/libcxx/include/__config index 33a79b9a79f62..39ce6d628e3bf 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -995,6 +995,12 @@ typedef __char32_t char32_t; # define _LIBCPP_DEPRECATED_IN_CXX23 # endif +# if _LIBCPP_STD_VER >= 26 +# define _LIBCPP_DEPRECATED_IN_CXX26 _LIBCPP_DEPRECATED +# else +# define _LIBCPP_DEPRECATED_IN_CXX26 +# endif + # if !defined(_LIBCPP_HAS_NO_CHAR8_T) # define _LIBCPP_DEPRECATED_WITH_CHAR8_T _LIBCPP_DEPRECATED # else diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h index 10fca15d5a7a9..02ee3cef7d402 100644 --- a/libcxx/include/__format/format_arg.h +++ b/libcxx/include/__format/format_arg.h @@ -93,7 +93,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr __arg_t __get_packed_type(uint64_t __types, size } // namespace __format -// This function is not user obervable, so it can directly use the non-standard +// This function is not user observable, so it can directly use the non-standard // types of the "variant". See __arg_t for more details. template _LIBCPP_HIDE_FROM_ABI decltype(auto) __visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { @@ -144,6 +144,59 @@ _LIBCPP_HIDE_FROM_ABI decltype(auto) __visit_format_arg(_Visitor&& __vis, basic_ __libcpp_unreachable(); } +# if _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) + +template +_LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { + switch (__arg.__type_) { + case __format::__arg_t::__none: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__monostate_); + case __format::__arg_t::__boolean: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__boolean_); + case __format::__arg_t::__char_type: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__char_type_); + case __format::__arg_t::__int: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__int_); + case __format::__arg_t::__long_long: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__long_long_); + case __format::__arg_t::__i128: +# ifndef _LIBCPP_HAS_NO_INT128 + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__i128_); +# else + __libcpp_unreachable(); +# endif + case __format::__arg_t::__unsigned: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__unsigned_); + case __format::__arg_t::__unsigned_long_long: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__unsigned_long_long_); + case __format::__arg_t::__u128: +# ifndef _LIBCPP_HAS_NO_INT128 + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__u128_); +# else + __libcpp_unreachable(); +# endif + case __format::__arg_t::__float: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__float_); + case __format::__arg_t::__double: + 
return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__double_); + case __format::__arg_t::__long_double: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__long_double_); + case __format::__arg_t::__const_char_type_ptr: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__const_char_type_ptr_); + case __format::__arg_t::__string_view: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__string_view_); + case __format::__arg_t::__ptr: + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__ptr_); + case __format::__arg_t::__handle: + return std::invoke_r<_Rp>( + std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__arg.__value_.__handle_}); + } + + __libcpp_unreachable(); +} + +# endif // _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) + /// Contains the values used in basic_format_arg. /// /// This is a separate type so it's possible to store the values and types in @@ -227,6 +280,52 @@ class _LIBCPP_TEMPLATE_VIS basic_format_arg { _LIBCPP_HIDE_FROM_ABI explicit operator bool() const noexcept { return __type_ != __format::__arg_t::__none; } +# if _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) + + // This function is user facing, so it must wrap the non-standard types of + // the "variant" in a handle to stay conforming. See __arg_t for more details. + template + _LIBCPP_HIDE_FROM_ABI decltype(auto) visit(this basic_format_arg __arg, _Visitor&& __vis) { + switch (__arg.__type_) { +# ifndef _LIBCPP_HAS_NO_INT128 + case __format::__arg_t::__i128: { + typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__i128_}; + return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h}); + } + + case __format::__arg_t::__u128: { + typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_}; + return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h}); + } +# endif + default: + return std::__visit_format_arg(std::forward<_Visitor>(__vis), __arg); + } + } + + // This function is user facing, so it must wrap the non-standard types of + // the "variant" in a handle to stay conforming. See __arg_t for more details. + template + _LIBCPP_HIDE_FROM_ABI _Rp visit(this basic_format_arg __arg, _Visitor&& __vis) { + switch (__arg.__type_) { +# ifndef _LIBCPP_HAS_NO_INT128 + case __format::__arg_t::__i128: { + typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__i128_}; + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h}); + } + + case __format::__arg_t::__u128: { + typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_}; + return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h}); + } +# endif + default: + return std::__visit_format_arg<_Rp>(std::forward<_Visitor>(__vis), __arg); + } + } + +# endif // _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) + private: using char_type = typename _Context::char_type; @@ -267,7 +366,11 @@ class _LIBCPP_TEMPLATE_VIS basic_format_arg<_Context>::handle { // This function is user facing, so it must wrap the non-standard types of // the "variant" in a handle to stay conforming. See __arg_t for more details. 
template -_LIBCPP_HIDE_FROM_ABI decltype(auto) visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { +# if _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) +_LIBCPP_DEPRECATED_IN_CXX26 +# endif + _LIBCPP_HIDE_FROM_ABI decltype(auto) + visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { switch (__arg.__type_) { # ifndef _LIBCPP_HAS_NO_INT128 case __format::__arg_t::__i128: { @@ -279,7 +382,7 @@ _LIBCPP_HIDE_FROM_ABI decltype(auto) visit_format_arg(_Visitor&& __vis, basic_fo typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_}; return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h}); } -# endif +# endif // _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) default: return std::__visit_format_arg(std::forward<_Visitor>(__vis), __arg); } diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h index 5b252b81f691b..0beaa84b028be 100644 --- a/libcxx/include/__format/format_context.h +++ b/libcxx/include/__format/format_context.h @@ -163,20 +163,25 @@ class _LIBCPP_TEMPLATE_VIS basic_format_context basic_format_arg { - if constexpr (same_as) - return {}; - else if constexpr (same_as::handle>) - // At the moment it's not possible for formatting to use a re-targeted handle. - // TODO FMT add this when support is needed. - std::__throw_format_error("Re-targeting handle not supported"); - else - return basic_format_arg{ - __format::__determine_arg_t(), - __basic_format_arg_value(__arg)}; - }, - static_cast<_Context*>(__c)->arg(__id)); + auto __visitor = [&](auto __arg) -> basic_format_arg { + if constexpr (same_as) + return {}; + else if constexpr (same_as::handle>) + // At the moment it's not possible for formatting to use a re-targeted handle. + // TODO FMT add this when support is needed. + std::__throw_format_error("Re-targeting handle not supported"); + else + return basic_format_arg{ + __format::__determine_arg_t(), + __basic_format_arg_value(__arg)}; + }; +# if _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) + return static_cast<_Context*>(__c)->arg(__id).visit(std::move(__visitor)); +# else + _LIBCPP_SUPPRESS_DEPRECATED_PUSH + return std::visit_format_arg(std::move(__visitor), static_cast<_Context*>(__c)->arg(__id)); + _LIBCPP_SUPPRESS_DEPRECATED_POP +# endif // _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) }) { } diff --git a/libcxx/include/format b/libcxx/include/format index ab9b336d0cdab..64f6ba1d25284 100644 --- a/libcxx/include/format +++ b/libcxx/include/format @@ -170,7 +170,7 @@ namespace std { template class basic_format_arg; template - see below visit_format_arg(Visitor&& vis, basic_format_arg arg); + see below visit_format_arg(Visitor&& vis, basic_format_arg arg); // Deprecated in C++26 // [format.arg.store], class template format-arg-store template struct format-arg-store; // exposition only diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp new file mode 100644 index 0000000000000..994ccc70a38da --- /dev/null +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp @@ -0,0 +1,333 @@ +//===----------------------------------------------------------------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME +// The tested functionality needs deducing this. +// UNSUPPORTED: clang-16 || clang-17 +// XFAIL: apple-clang + +// + +// class basic_format_arg; + +// template +// decltype(auto) visit(this basic_format_arg arg, Visitor&& vis); // since C++26 + +#include +#include +#include +#include + +#include "constexpr_char_traits.h" +#include "make_string.h" +#include "min_allocator.h" +#include "test_macros.h" + +template +void test(From value) { + auto store = std::make_format_args(value); + std::basic_format_args format_args{store}; + + LIBCPP_ASSERT(format_args.__size() == 1); + assert(format_args.get(0)); + + auto result = format_args.get(0).visit([v = To(value)](auto a) -> To { + if constexpr (std::is_same_v) { + assert(v == a); + return a; + } else { + assert(false); + return {}; + } + }); + + using ct = std::common_type_t; + assert(static_cast(result) == static_cast(value)); +} + +// Some types, as an extension, are stored in the variant. The Standard +// requires them to be observed as a handle. +template +void test_handle(T value) { + auto store = std::make_format_args(value); + std::basic_format_args format_args{store}; + + LIBCPP_ASSERT(format_args.__size() == 1); + assert(format_args.get(0)); + + format_args.get(0).visit([](auto a) { + assert((std::is_same_v::handle>)); + }); +} + +// Test specific for string and string_view. +// +// Since both result in a string_view there's no need to pass this as a +// template argument. +template +void test_string_view(From value) { + auto store = std::make_format_args(value); + std::basic_format_args format_args{store}; + + LIBCPP_ASSERT(format_args.__size() == 1); + assert(format_args.get(0)); + + using CharT = typename Context::char_type; + using To = std::basic_string_view; + using V = std::basic_string; + + auto result = format_args.get(0).visit([v = V(value.begin(), value.end())](auto a) -> To { + if constexpr (std::is_same_v) { + assert(v == a); + return a; + } else { + assert(false); + return {}; + } + }); + + assert(std::equal(value.begin(), value.end(), result.begin(), result.end())); +} + +template +void test() { + using Context = std::basic_format_context; + std::basic_string empty; + std::basic_string str = MAKE_STRING(CharT, "abc"); + + // Test boolean types. + + test(true); + test(false); + + // Test CharT types. + + test('a'); + test('z'); + test('0'); + test('9'); + + // Test char types. + + if (std::is_same_v) { + // char to char -> char + test('a'); + test('z'); + test('0'); + test('9'); + } else { + if (std::is_same_v) { + // char to wchar_t -> wchar_t + test('a'); + test('z'); + test('0'); + test('9'); + } else if (std::is_signed_v) { + // char to CharT -> int + // This happens when CharT is a char8_t, char16_t, or char32_t and char + // is a signed type. + // Note if sizeof(CharT) > sizeof(int) this test fails. If there are + // platforms where that occurs extra tests need to be added for char32_t + // testing it against a long long. + test('a'); + test('z'); + test('0'); + test('9'); + } else { + // char to CharT -> unsigned + // This happens when CharT is a char8_t, char16_t, or char32_t and char + // is an unsigned type. + // Note if sizeof(CharT) > sizeof(unsigned) this test fails. 
If there are + // platforms where that occurs extra tests need to be added for char32_t + // testing it against an unsigned long long. + test('a'); + test('z'); + test('0'); + test('9'); + } + } + + // Test signed integer types. + + test(std::numeric_limits::min()); + test(0); + test(std::numeric_limits::max()); + + test(std::numeric_limits::min()); + test(std::numeric_limits::min()); + test(0); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + + test(std::numeric_limits::min()); + test(std::numeric_limits::min()); + test(std::numeric_limits::min()); + test(0); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + + using LongToType = std::conditional_t; + + test(std::numeric_limits::min()); + test(std::numeric_limits::min()); + test(std::numeric_limits::min()); + test(std::numeric_limits::min()); + test(0); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + + test(std::numeric_limits::min()); + test(std::numeric_limits::min()); + test(std::numeric_limits::min()); + test(std::numeric_limits::min()); + test(std::numeric_limits::min()); + test(0); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + +#ifndef TEST_HAS_NO_INT128 + test_handle(0); +#endif // TEST_HAS_NO_INT128 + + // Test unsigned integer types. + + test(0); + test(std::numeric_limits::max()); + + test(0); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + + test(0); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + + using UnsignedLongToType = + std::conditional_t; + + test(0); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + + test(0); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + test(std::numeric_limits::max()); + +#ifndef TEST_HAS_NO_INT128 + test_handle(0); +#endif // TEST_HAS_NO_INT128 + + // Test floating point types. + + test(-std::numeric_limits::max()); + test(-std::numeric_limits::min()); + test(-0.0); + test(0.0); + test(std::numeric_limits::min()); + test(std::numeric_limits::max()); + + test(-std::numeric_limits::max()); + test(-std::numeric_limits::min()); + test(-0.0); + test(0.0); + test(std::numeric_limits::min()); + test(std::numeric_limits::max()); + + test(-std::numeric_limits::max()); + test(-std::numeric_limits::min()); + test(-0.0); + test(0.0); + test(std::numeric_limits::min()); + test(std::numeric_limits::max()); + + // Test const CharT pointer types. + + test(empty.c_str()); + test(str.c_str()); + + // Test string_view types. + + { + using From = std::basic_string_view; + + test_string_view(From()); + test_string_view(From(empty.c_str())); + test_string_view(From(str.c_str())); + } + + { + using From = std::basic_string_view>; + + test_string_view(From()); + test_string_view(From(empty.c_str())); + test_string_view(From(str.c_str())); + } + + // Test string types. 
+ + { + using From = std::basic_string; + + test_string_view(From()); + test_string_view(From(empty.c_str())); + test_string_view(From(str.c_str())); + } + + { + using From = std::basic_string, std::allocator>; + + test_string_view(From()); + test_string_view(From(empty.c_str())); + test_string_view(From(str.c_str())); + } + + { + using From = std::basic_string, min_allocator>; + + test_string_view(From()); + test_string_view(From(empty.c_str())); + test_string_view(From(str.c_str())); + } + + { + using From = std::basic_string, min_allocator>; + + test_string_view(From()); + test_string_view(From(empty.c_str())); + test_string_view(From(str.c_str())); + } + + // Test pointer types. + + test(nullptr); + int i = 0; + test(static_cast(&i)); + const int ci = 0; + test(static_cast(&ci)); +} + +int main(int, char**) { + test(); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test(); +#endif + + return 0; +} diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp new file mode 100644 index 0000000000000..473278b7b4434 --- /dev/null +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp @@ -0,0 +1,369 @@ +//===----------------------------------------------------------------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME +// The tested functionality needs deducing this. +// UNSUPPORTED: clang-16 || clang-17 +// XFAIL: apple-clang + +// + +// class basic_format_arg; + +// template +// R visit(this basic_format_arg arg, Visitor&& vis); + +#include +#include +#include +#include + +#include "constexpr_char_traits.h" +#include "make_string.h" +#include "min_allocator.h" +#include "test_macros.h" + +// The expected result type shouldn't matter, therefore we use a hardcoded value for simplicity. +using ExpectedResultType = bool; +constexpr ExpectedResultType visited{true}; + +template +ExpectedR make_expected_result() { + if constexpr (std::is_same_v) { + return true; + } else if constexpr (std::is_same_v) { + return 192812079084L; + } else { + return "visited"; + } +} + +template +void test(From value, const ExpectedR& expectedValue) { + auto store = std::make_format_args(value); + std::basic_format_args format_args{store}; + + LIBCPP_ASSERT(format_args.__size() == 1); + assert(format_args.get(0)); + + // member + { + std::same_as decltype(auto) result = + format_args.get(0).template visit([v = To(value)](auto a) -> ExpectedR { + if constexpr (std::is_same_v) { + assert(v == a); + return make_expected_result(); + } else { + assert(false); + return {}; + } + }); + + assert(result == expectedValue); + } +} + +// Some types, as an extension, are stored in the variant. The Standard +// requires them to be observed as a handle.
+template +void test_handle(T value, ExpectedR expectedValue) { + auto store = std::make_format_args(value); + std::basic_format_args format_args{store}; + + LIBCPP_ASSERT(format_args.__size() == 1); + assert(format_args.get(0)); + + std::same_as decltype(auto) result = format_args.get(0).template visit([](auto a) -> ExpectedR { + assert((std::is_same_v::handle>)); + + return make_expected_result(); + }); + + assert(result == expectedValue); +} + +// Test specific for string and string_view. +// +// Since both result in a string_view there's no need to pass this as a +// template argument. +template +void test_string_view(From value, ExpectedR expectedValue) { + auto store = std::make_format_args(value); + std::basic_format_args format_args{store}; + + LIBCPP_ASSERT(format_args.__size() == 1); + assert(format_args.get(0)); + + using CharT = typename Context::char_type; + using To = std::basic_string_view; + using V = std::basic_string; + + std::same_as decltype(auto) result = + format_args.get(0).template visit([v = V(value.begin(), value.end())](auto a) -> ExpectedR { + if constexpr (std::is_same_v) { + assert(v == a); + return make_expected_result(); + } else { + assert(false); + return {}; + } + }); + + assert(result == expectedValue); +} + +template +void test() { + using Context = std::basic_format_context; + std::basic_string empty; + std::basic_string str = MAKE_STRING(CharT, "abc"); + + // Test boolean types. + + test(true, visited); + test(false, visited); + + test(true, "visited"); + test(false, "visited"); + + test(true, 192812079084L); + test(false, 192812079084L); + + // Test CharT types. + + test('a', visited); + test('z', visited); + test('0', visited); + test('9', visited); + + // Test char types. + + if (std::is_same_v) { + // char to char -> char + test('a', visited); + test('z', visited); + test('0', visited); + test('9', visited); + } else { + if (std::is_same_v) { + // char to wchar_t -> wchar_t + test('a', visited); + test('z', visited); + test('0', visited); + test('9', visited); + } else if (std::is_signed_v) { + // char to CharT -> int + // This happens when CharT is a char8_t, char16_t, or char32_t and char + // is a signed type. + // Note if sizeof(CharT) > sizeof(int) this test fails. If there are + // platforms where that occurs extra tests need to be added for char32_t + // testing it against a long long. + test('a', visited); + test('z', visited); + test('0', visited); + test('9', visited); + } else { + // char to CharT -> unsigned + // This happens when CharT is a char8_t, char16_t, or char32_t and char + // is an unsigned type. + // Note if sizeof(CharT) > sizeof(unsigned) this test fails. If there are + // platforms where that occurs extra tests need to be added for char32_t + // testing it against an unsigned long long. + test('a', visited); + test('z', visited); + test('0', visited); + test('9', visited); + } + } + + // Test signed integer types. 
+ + test(std::numeric_limits::min(), visited); + test(0, visited); + test(std::numeric_limits::max(), visited); + + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::min(), visited); + test(0, visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::min(), visited); + test(0, visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + + using LongToType = std::conditional_t; + + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::min(), visited); + test(0, visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::min(), visited); + test(0, visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + +#ifndef TEST_HAS_NO_INT128 + test_handle(0, visited); +#endif // TEST_HAS_NO_INT128 + + // Test unsigned integer types. + + test(0, visited); + test(std::numeric_limits::max(), visited); + + test(0, visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + + test(0, visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + + using UnsignedLongToType = + std::conditional_t; + + test(0, visited); + test( + std::numeric_limits::max(), visited); + test( + std::numeric_limits::max(), visited); + test(std::numeric_limits::max(), visited); + test( + std::numeric_limits::max(), visited); + + test(0, visited); + test( + std::numeric_limits::max(), visited); + test( + std::numeric_limits::max(), visited); + test( + std::numeric_limits::max(), visited); + test( + std::numeric_limits::max(), visited); + test( + std::numeric_limits::max(), visited); + +#ifndef TEST_HAS_NO_INT128 + test_handle(0, visited); +#endif // TEST_HAS_NO_INT128 + + // Test floating point types. + + test(-std::numeric_limits::max(), visited); + test(-std::numeric_limits::min(), visited); + test(-0.0, visited); + test(0.0, visited); + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::max(), visited); + + test(-std::numeric_limits::max(), visited); + test(-std::numeric_limits::min(), visited); + test(-0.0, visited); + test(0.0, visited); + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::max(), visited); + + test(-std::numeric_limits::max(), visited); + test(-std::numeric_limits::min(), visited); + test(-0.0, visited); + test(0.0, visited); + test(std::numeric_limits::min(), visited); + test(std::numeric_limits::max(), visited); + + // Test const CharT pointer types. + + test(empty.c_str(), visited); + test(str.c_str(), visited); + + // Test string_view types. 
+ + { + using From = std::basic_string_view; + + test_string_view(From(), visited); + test_string_view(From(empty.c_str()), visited); + test_string_view(From(str.c_str()), visited); + } + { + using From = std::basic_string_view>; + + test_string_view(From(), visited); + test_string_view(From(empty.c_str()), visited); + test_string_view(From(str.c_str()), visited); + } + + // Test string types. + + { + using From = std::basic_string; + + test_string_view(From(), visited); + test_string_view(From(empty.c_str()), visited); + test_string_view(From(str.c_str()), visited); + } + + { + using From = std::basic_string, std::allocator>; + + test_string_view(From(), visited); + test_string_view(From(empty.c_str()), visited); + test_string_view(From(str.c_str()), visited); + } + + { + using From = std::basic_string, min_allocator>; + + test_string_view(From(), visited); + test_string_view(From(empty.c_str()), visited); + test_string_view(From(str.c_str()), visited); + } + + { + using From = std::basic_string, min_allocator>; + + test_string_view(From(), visited); + test_string_view(From(empty.c_str()), visited); + test_string_view(From(str.c_str()), visited); + } + + // Test pointer types. + + test(nullptr, visited); + int i = 0; + test(static_cast(&i), visited); + const int ci = 0; + test(static_cast(&ci), visited); +} + +int main(int, char**) { + test(); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test(); +#endif + + return 0; +} diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp new file mode 100644 index 0000000000000..acd9228369e60 --- /dev/null +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp @@ -0,0 +1,38 @@ +//===----------------------------------------------------------------------===// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME +// UNSUPPORTED: clang-16 || clang-17 +// XFAIL: apple-clang + +// + +// template +// see below visit_format_arg(Visitor&& vis, basic_format_arg arg); + +#include +#include + +#include "test_macros.h" + +template +void test(From value) { + using Context = std::basic_format_context; + auto store = std::make_format_args(value); + std::basic_format_args format_args{store}; + + // expected-warning-re@+1 1-2 {{std::basic_format_context{{.*}}' is deprecated}} + std::ignore = std::visit_format_arg([]([[maybe_unused]] auto a) -> To { return {}; }, format_args.get(0)); +} + +void test() { + test('a'); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test('a'); +#endif +} diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp index 3ddf2d0ff732a..3497d8935c8d6 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp @@ -11,7 +11,7 @@ // // template -// see below visit_format_arg(Visitor&& vis, basic_format_arg arg); +// see below visit_format_arg(Visitor&& vis, basic_format_arg arg); // Deprecated in C++26 #include #include @@ -23,6 +23,10 @@ #include "make_string.h" #include "min_allocator.h" +#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) +TEST_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated-declarations") +#endif + template void test(From value) { auto store = std::make_format_args(value); diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp index 0f3931adf54df..a5d3703397f4e 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp @@ -24,14 +24,17 @@ void test(From value) { auto store = std::make_format_args(value); const std::basic_format_args format_args{store}; - std::visit_format_arg( - [v = To(value)](auto a) { - if constexpr (std::is_same_v) - assert(v == a); - else - assert(false); - }, - format_args.get(0)); + auto visitor = [v = To(value)](auto a) { + if constexpr (std::is_same_v) + assert(v == a); + else + assert(false); + }; +#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) + format_args.get(0).visit(visitor); +#else + std::visit_format_arg(visitor, format_args.get(0)); +#endif } // Some types, as an extension, are stored in the variant. The Standard @@ -41,9 +44,12 @@ void test_handle(T value) { auto store = std::make_format_args(value); std::basic_format_args format_args{store}; - std::visit_format_arg( - [](auto a) { assert((std::is_same_v::handle>)); }, - format_args.get(0)); + auto visitor = [](auto a) { assert((std::is_same_v::handle>)); }; +#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) + format_args.get(0).visit(visitor); +#else + std::visit_format_arg(visitor, format_args.get(0)); +#endif } // Test specific for string and string_view. 
@@ -58,14 +64,18 @@ void test_string_view(From value) {
   using CharT = typename Context::char_type;
   using To = std::basic_string_view<CharT>;
   using V = std::basic_string<CharT>;
-  std::visit_format_arg(
-      [v = V(value.begin(), value.end())](auto a) {
-        if constexpr (std::is_same_v<To, decltype(a)>)
-          assert(v == a);
-        else
-          assert(false);
-      },
-      format_args.get(0));
+
+  auto visitor = [v = V(value.begin(), value.end())](auto a) {
+    if constexpr (std::is_same_v<To, decltype(a)>)
+      assert(v == a);
+    else
+      assert(false);
+  };
+#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER)
+  format_args.get(0).visit(visitor);
+#else
+  std::visit_format_arg(visitor, format_args.get(0));
+#endif
 }
 
 template <class CharT>
diff --git a/libcxx/test/support/test_basic_format_arg.h b/libcxx/test/support/test_basic_format_arg.h
index d8547d34b6db4..1ec719a48b6c8 100644
--- a/libcxx/test/support/test_basic_format_arg.h
+++ b/libcxx/test/support/test_basic_format_arg.h
@@ -7,18 +7,22 @@
 
 #include <concepts>
 #include <format>
+#include <utility>
 
 #include "test_macros.h"
 
 /// Returns whether the basic_format_arg contains a type T with the expected value.
 template <class Context, class T>
 bool test_basic_format_arg(std::basic_format_arg<Context> arg, T expected) {
-  return std::visit_format_arg(
-      [expected](auto a) {
-        if constexpr (std::same_as<decltype(a), T>)
-          return a == expected;
-        else
-          return false;
-      },
-      arg);
+  auto visitor = [expected](auto a) {
+    if constexpr (std::same_as<decltype(a), T>)
+      return a == expected;
+    else
+      return false;
+  };
+#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER)
+  return arg.visit(std::move(visitor));
+#else
+  return std::visit_format_arg(std::move(visitor), arg);
+#endif
 }
diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h
index fe68e13de6bcb..24f69c758f365 100644
--- a/libcxx/test/support/test_macros.h
+++ b/libcxx/test/support/test_macros.h
@@ -470,4 +470,9 @@ inline Tp const& DoNotOptimize(Tp const& value) {
 #  define TEST_IF_AIX(arg_true, arg_false) arg_false
 #endif
 
+// Clang-18 has support for deducing this, but it does not set the FTM.
+#ifdef _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER
+#  define TEST_HAS_EXPLICIT_THIS_PARAMETER
+#endif
+
 #endif // SUPPORT_TEST_MACROS_HPP
diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py
index a5df187b046f6..667dd75278c8b 100755
--- a/libcxx/utils/generate_feature_test_macro_components.py
+++ b/libcxx/utils/generate_feature_test_macro_components.py
@@ -467,6 +467,7 @@ def add_version_header(tc):
             # "c++20": 202110 Not implemented P2372R3 Fixing locale handling in chrono formatters
             "c++20": 202106,
             # "c++23": 202207, Not implemented P2419R2 Clarify handling of encodings in localized formatting of chrono types
+            # "c++26": 202306, P2637R3 Member Visit (implemented)
            # "c++26": 202311, P2918R2 Runtime format strings II (implemented)
         },
         # Note these three papers are adopted at the June 2023 meeting and have sequential numbering
From 1d6b6132ff9d59c27f033c8a2003ea9bff421e04 Mon Sep 17 00:00:00 2001
From: Mark de Wever
Date: Sun, 21 Jan 2024 12:02:39 +0100
Subject: [PATCH 290/843] [libc++] Reland CI module improvements.

Revert "Revert #76246 and #76083"

This reverts commit 5c150e7eeba9db13cc65b329b3c3537b613ae61d.

Adds a small fix that should properly disable the tests on Windows.
Unfortunately the original poster has not provided feedback and the
original patch did not fail in the LLVM CI infrastructure. Modules are
known to fail on Windows due to non-compliance of the C library.
Currently not having this patch prevents testing on other platforms.
--- .github/workflows/libcxx-build-and-test.yaml | 8 -- libcxx/CMakeLists.txt | 15 +--- libcxx/cmake/caches/Generic-cxx26.cmake | 1 - .../Generic-hardening-mode-extensive.cmake | 1 - .../cmake/caches/Generic-no-exceptions.cmake | 1 - .../caches/Generic-no-experimental.cmake | 1 - .../cmake/caches/Generic-no-filesystem.cmake | 1 - .../caches/Generic-no-localization.cmake | 1 - .../caches/Generic-no-random_device.cmake | 1 - libcxx/cmake/caches/Generic-no-threads.cmake | 1 - libcxx/cmake/caches/Generic-no-tzdb.cmake | 1 - libcxx/cmake/caches/Generic-no-unicode.cmake | 1 - .../caches/Generic-no-wide-characters.cmake | 1 - libcxx/docs/Modules.rst | 2 +- libcxx/docs/ReleaseNotes/18.rst | 4 + libcxx/docs/TestingLibcxx.rst | 9 +- libcxx/modules/CMakeLists.txt | 28 ------ libcxx/modules/CMakeLists.txt.in | 86 ------------------- libcxx/test/CMakeLists.txt | 25 ------ libcxx/test/configs/cmake-bridge.cfg.in | 7 -- libcxx/test/libcxx/module_std.gen.py | 1 + libcxx/test/libcxx/module_std_compat.gen.py | 1 + .../libcxx/selftest/modules/no-modules.sh.cpp | 14 +++ .../modules/std-and-std.compat-module.sh.cpp | 26 ++++++ .../libcxx/selftest/modules/std-module.sh.cpp | 24 ++++++ .../selftest/modules/std.compat-module.sh.cpp | 28 ++++++ libcxx/test/lit.local.cfg | 83 ------------------ libcxx/test/std/modules/std.compat.pass.cpp | 7 +- libcxx/test/std/modules/std.pass.cpp | 7 +- libcxx/utils/ci/Dockerfile | 11 --- libcxx/utils/ci/buildkite-pipeline.yml | 2 - libcxx/utils/ci/run-buildbot | 10 --- libcxx/utils/libcxx/test/config.py | 4 + libcxx/utils/libcxx/test/features.py | 13 ++- libcxx/utils/libcxx/test/format.py | 69 +++++++++++++-- libcxx/utils/libcxx/test/modules.py | 4 +- 36 files changed, 201 insertions(+), 298 deletions(-) delete mode 100644 libcxx/modules/CMakeLists.txt.in create mode 100644 libcxx/test/libcxx/selftest/modules/no-modules.sh.cpp create mode 100644 libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp create mode 100644 libcxx/test/libcxx/selftest/modules/std-module.sh.cpp create mode 100644 libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp delete mode 100644 libcxx/test/lit.local.cfg diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 1666a687aa5d0..0cab9b841e4ee 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -35,7 +35,6 @@ concurrency: env: - CMAKE: "/opt/bin/cmake" # LLVM POST-BRANCH bump version # LLVM POST-BRANCH add compiler test for ToT - 1, e.g. "Clang 17" # LLVM RELEASE bump remove compiler ToT - 3, e.g. "Clang 15" @@ -169,24 +168,18 @@ jobs: 'bootstrapping-build' ] machine: [ 'libcxx-runners-8-set' ] - std_modules: [ 'OFF' ] include: - config: 'generic-cxx26' machine: libcxx-runners-8-set - std_modules: 'ON' - config: 'generic-asan' machine: libcxx-runners-8-set - std_modules: 'OFF' - config: 'generic-tsan' machine: libcxx-runners-8-set - std_modules: 'OFF' - config: 'generic-ubsan' machine: libcxx-runners-8-set - std_modules: 'OFF' # Use a larger machine for MSAN to avoid timeout and memory allocation issues. 
- config: 'generic-msan' machine: libcxx-runners-8-set - std_modules: 'OFF' runs-on: ${{ matrix.machine }} steps: - uses: actions/checkout@v4 @@ -196,7 +189,6 @@ jobs: CC: clang-18 CXX: clang++-18 ENABLE_CLANG_TIDY: "OFF" - ENABLE_STD_MODULES: ${{ matrix.std_modules }} - uses: actions/upload-artifact@v3 if: always() with: diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index a5b5169747008..53de53480266b 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -124,12 +124,6 @@ option(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS the shared library they shipped should turn this on and see `include/__availability` for more details." OFF) option(LIBCXX_ENABLE_CLANG_TIDY "Whether to compile and run clang-tidy checks" OFF) -# TODO MODULES Remove this option and test for the requirements (CMake/Clang) instead. -option(LIBCXX_ENABLE_STD_MODULES - "Whether to enable the building the C++23 `std` module. This feature is - experimental and has additional dependencies. Only enable this when - interested in testing or developing this module. See - https://libcxx.llvm.org/Modules.html for more information." OFF) if (CMAKE_CXX_COMPILER_ID STREQUAL "GNU") set(LIBCXX_DEFAULT_TEST_CONFIG "llvm-libc++-shared-gcc.cfg.in") @@ -779,7 +773,6 @@ config_define_if_not(LIBCXX_ENABLE_RANDOM_DEVICE _LIBCPP_HAS_NO_RANDOM_DEVICE) config_define_if_not(LIBCXX_ENABLE_LOCALIZATION _LIBCPP_HAS_NO_LOCALIZATION) config_define_if_not(LIBCXX_ENABLE_UNICODE _LIBCPP_HAS_NO_UNICODE) config_define_if_not(LIBCXX_ENABLE_WIDE_CHARACTERS _LIBCPP_HAS_NO_WIDE_CHARACTERS) -config_define_if_not(LIBCXX_ENABLE_STD_MODULES _LIBCPP_HAS_NO_STD_MODULES) config_define_if_not(LIBCXX_ENABLE_TIME_ZONE_DATABASE _LIBCPP_HAS_NO_TIME_ZONE_DATABASE) config_define_if_not(LIBCXX_ENABLE_VENDOR_AVAILABILITY_ANNOTATIONS _LIBCPP_HAS_NO_VENDOR_AVAILABILITY_ANNOTATIONS) @@ -863,9 +856,7 @@ endfunction() add_subdirectory(include) add_subdirectory(src) add_subdirectory(utils) -if (LIBCXX_ENABLE_STD_MODULES) - add_subdirectory(modules) -endif() +add_subdirectory(modules) set(LIBCXX_TEST_DEPS "cxx_experimental") @@ -873,9 +864,7 @@ if (LIBCXX_ENABLE_CLANG_TIDY) list(APPEND LIBCXX_TEST_DEPS cxx-tidy) endif() -if (LIBCXX_ENABLE_STD_MODULES) - list(APPEND LIBCXX_TEST_DEPS generate-cxx-modules generate-test-module-std) -endif() +list(APPEND LIBCXX_TEST_DEPS generate-cxx-modules) if (LIBCXX_INCLUDE_BENCHMARKS) add_subdirectory(benchmarks) diff --git a/libcxx/cmake/caches/Generic-cxx26.cmake b/libcxx/cmake/caches/Generic-cxx26.cmake index f48d72d493c2f..6ba9482af5785 100644 --- a/libcxx/cmake/caches/Generic-cxx26.cmake +++ b/libcxx/cmake/caches/Generic-cxx26.cmake @@ -1,3 +1,2 @@ -set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_TEST_PARAMS "std=c++26" CACHE STRING "") set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "") diff --git a/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake b/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake index 0487377d4e9ba..72263dfd84635 100644 --- a/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake +++ b/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake @@ -1,2 +1 @@ -set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. 
set(LIBCXX_HARDENING_MODE "extensive" CACHE STRING "") diff --git a/libcxx/cmake/caches/Generic-no-exceptions.cmake b/libcxx/cmake/caches/Generic-no-exceptions.cmake index f405f7fe99375..f0dffef60dba0 100644 --- a/libcxx/cmake/caches/Generic-no-exceptions.cmake +++ b/libcxx/cmake/caches/Generic-no-exceptions.cmake @@ -1,3 +1,2 @@ -set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "") set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-experimental.cmake b/libcxx/cmake/caches/Generic-no-experimental.cmake index fe14e7afed7b9..f33ed01418990 100644 --- a/libcxx/cmake/caches/Generic-no-experimental.cmake +++ b/libcxx/cmake/caches/Generic-no-experimental.cmake @@ -1,3 +1,2 @@ -set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_TEST_PARAMS "enable_experimental=False" CACHE STRING "") set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "") diff --git a/libcxx/cmake/caches/Generic-no-filesystem.cmake b/libcxx/cmake/caches/Generic-no-filesystem.cmake index db62f86854d94..4000f3a3e8ef2 100644 --- a/libcxx/cmake/caches/Generic-no-filesystem.cmake +++ b/libcxx/cmake/caches/Generic-no-filesystem.cmake @@ -1,2 +1 @@ -set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-localization.cmake b/libcxx/cmake/caches/Generic-no-localization.cmake index 54a7ec3f1f5b3..79d6b44c7139a 100644 --- a/libcxx/cmake/caches/Generic-no-localization.cmake +++ b/libcxx/cmake/caches/Generic-no-localization.cmake @@ -1,2 +1 @@ -set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-random_device.cmake b/libcxx/cmake/caches/Generic-no-random_device.cmake index adfa2458a8edf..e9b4cc60cc80e 100644 --- a/libcxx/cmake/caches/Generic-no-random_device.cmake +++ b/libcxx/cmake/caches/Generic-no-random_device.cmake @@ -1,2 +1 @@ -set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-threads.cmake b/libcxx/cmake/caches/Generic-no-threads.cmake index 2aeab22915e00..616baef1be7be 100644 --- a/libcxx/cmake/caches/Generic-no-threads.cmake +++ b/libcxx/cmake/caches/Generic-no-threads.cmake @@ -1,4 +1,3 @@ -set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_THREADS OFF CACHE BOOL "") set(LIBCXXABI_ENABLE_THREADS OFF CACHE BOOL "") set(LIBCXX_ENABLE_MONOTONIC_CLOCK OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-tzdb.cmake b/libcxx/cmake/caches/Generic-no-tzdb.cmake index c5dc882e58442..27c826edfecff 100644 --- a/libcxx/cmake/caches/Generic-no-tzdb.cmake +++ b/libcxx/cmake/caches/Generic-no-tzdb.cmake @@ -1,2 +1 @@ -set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. 
set(LIBCXX_ENABLE_TIME_ZONE_DATABASE OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-unicode.cmake b/libcxx/cmake/caches/Generic-no-unicode.cmake index 880e2d502ad91..01160bf218981 100644 --- a/libcxx/cmake/caches/Generic-no-unicode.cmake +++ b/libcxx/cmake/caches/Generic-no-unicode.cmake @@ -1,2 +1 @@ -set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_UNICODE OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-wide-characters.cmake b/libcxx/cmake/caches/Generic-no-wide-characters.cmake index 5036f6abd52e8..728d41086a386 100644 --- a/libcxx/cmake/caches/Generic-no-wide-characters.cmake +++ b/libcxx/cmake/caches/Generic-no-wide-characters.cmake @@ -1,2 +1 @@ -set(LIBCXX_ENABLE_STD_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_WIDE_CHARACTERS OFF CACHE BOOL "") diff --git a/libcxx/docs/Modules.rst b/libcxx/docs/Modules.rst index 146a91522436d..a0735c0ed3f95 100644 --- a/libcxx/docs/Modules.rst +++ b/libcxx/docs/Modules.rst @@ -115,7 +115,7 @@ directory. First libc++ needs to be build with module support enabled. $ git clone https://github.com/llvm/llvm-project.git $ cd llvm-project $ mkdir build - $ cmake -G Ninja -S runtimes -B build -DLIBCXX_ENABLE_STD_MODULES=ON -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" + $ cmake -G Ninja -S runtimes -B build -DLLVM_ENABLE_RUNTIMES="libcxx;libcxxabi;libunwind" $ ninja -C build The above ``build`` directory will be referred to as ```` in the diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index 14e1e09907fad..77cf22398ea4c 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -95,6 +95,10 @@ Improvements and New Features - The ``_LIBCPP_ENABLE_CXX20_REMOVED_SHARED_PTR_UNIQUE`` macro has been added to make the function ``std::shared_ptr<...>::unique()`` available. +- The cmake option ``LIBCXX_ENABLE_STD_MODULES`` has been removed. The test + infrastructure no longer depends on a modern CMake, it works with the minimal + required LLVM version (3.20.0). + Deprecations and Removals ------------------------- diff --git a/libcxx/docs/TestingLibcxx.rst b/libcxx/docs/TestingLibcxx.rst index e7645cb5885f1..50ee9d4ee400b 100644 --- a/libcxx/docs/TestingLibcxx.rst +++ b/libcxx/docs/TestingLibcxx.rst @@ -394,7 +394,7 @@ Custom Directives ~~~~~~~~~~~~~~~~~ Lit has many directives built in (e.g., ``DEFINE``, ``UNSUPPORTED``). In addition to those directives, libc++ adds two additional libc++-specific directives that makes -writing tests easier. See `libc++-specific Lit Directives`_ for more information about the ``FILE_DEPENDENCIES`` and ``ADDITIONAL_COMPILE_FLAGS`` libc++-specific directives. +writing tests easier. See `libc++-specific Lit Directives`_ for more information about the ``FILE_DEPENDENCIES``, ``ADDITIONAL_COMPILE_FLAGS``, and ``MODULE_DEPENDENCIES`` libc++-specific directives. .. _libc++-specific Lit Directives: .. list-table:: libc++-specific Lit Directives @@ -417,6 +417,13 @@ writing tests easier. See `libc++-specific Lit Directives`_ for more information - The additional compiler flags specified by a space-separated list to the ``ADDITIONAL_COMPILE_FLAGS`` libc++-specific Lit directive will be added to the end of the ``%{compile_flags}`` substitution for the test that contains it. 
This libc++-specific Lit directive makes it possible to add special compilation flags without having to resort to writing a ``.sh.cpp`` test (see `Lit Meaning of libc++ Test Filenames`_), more powerful but perhaps overkill. + * - ``MODULE_DEPENDENCIES`` + - ``// MODULE_DEPENDENCIES: std std.compat`` + - This directive will build the required C++23 standard library + modules and add the additional compiler flags in + %{compile_flags}. (Libc++ offers these modules in C++20 as an + extension.) + Benchmarks ========== diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt index fae6448a7eec8..31fbadf449f77 100644 --- a/libcxx/modules/CMakeLists.txt +++ b/libcxx/modules/CMakeLists.txt @@ -1,8 +1,3 @@ -if (CMAKE_VERSION VERSION_LESS 3.26) - message(WARNING "The libc++ modules won't be available because the CMake version is too old. Update to CMake 3.26 or later.") - return() -endif() - # The headers of Table 24: C++ library headers [tab:headers.cpp] # and the headers of Table 25: C++ headers for C library facilities [tab:headers.cpp.c] set(LIBCXX_MODULE_STD_SOURCES @@ -142,28 +137,6 @@ set(LIBCXX_MODULE_STD_COMPAT_SOURCES std.compat/cwctype.inc ) -# TODO MODULES the CMakeLists.txt in the install directory is only temporary -# When that is removed the configured file can use the substitution -# LIBCXX_GENERATED_INCLUDE_TARGET_DIR avoiding this set. -# Also clean up the parts needed to generate the install version. -# - LIBCXX_GENERATED_INCLUDE_DIR contains the libc++ headers -# - LIBCXX_GENERATED_INCLUDE_TARGET_DIR contains the libc++ site config -if ("${LIBCXX_GENERATED_INCLUDE_DIR}" STREQUAL "${LIBCXX_GENERATED_INCLUDE_TARGET_DIR}") - # This typically happens when the target is not installed. - set(LIBCXX_CONFIGURED_INCLUDE_DIRS "${LIBCXX_GENERATED_INCLUDE_DIR}") -else() - # It's important that the arch directory be included first so that its header files - # which interpose on the default include dir be included instead of the default ones. - set(LIBCXX_CONFIGURED_INCLUDE_DIRS - "${LIBCXX_GENERATED_INCLUDE_TARGET_DIR};${LIBCXX_GENERATED_INCLUDE_DIR}" - ) -endif() -configure_file( - "CMakeLists.txt.in" - "${LIBCXX_GENERATED_MODULE_DIR}/CMakeLists.txt" - @ONLY -) - set(LIBCXX_MODULE_STD_INCLUDE_SOURCES) foreach(file ${LIBCXX_MODULE_STD_SOURCES}) set( @@ -193,7 +166,6 @@ configure_file( ) set(_all_modules) -list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/CMakeLists.txt") list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/std.cppm") list(APPEND _all_modules "${LIBCXX_GENERATED_MODULE_DIR}/std.compat.cppm") foreach(file ${LIBCXX_MODULE_STD_SOURCES} ${LIBCXX_MODULE_STD_COMPAT_SOURCES}) diff --git a/libcxx/modules/CMakeLists.txt.in b/libcxx/modules/CMakeLists.txt.in deleted file mode 100644 index 98168673ebfe9..0000000000000 --- a/libcxx/modules/CMakeLists.txt.in +++ /dev/null @@ -1,86 +0,0 @@ -cmake_minimum_required(VERSION 3.26) - -project(libc++-modules LANGUAGES CXX) - -# Enable CMake's module support -if(CMAKE_VERSION VERSION_LESS "3.28.0") - if(CMAKE_VERSION VERSION_LESS "3.27.0") - set(CMAKE_EXPERIMENTAL_CXX_MODULE_CMAKE_API "2182bf5c-ef0d-489a-91da-49dbc3090d2a") - else() - set(CMAKE_EXPERIMENTAL_CXX_MODULE_CMAKE_API "aa1f7df0-828a-4fcd-9afc-2dc80491aca7") - endif() - set(CMAKE_EXPERIMENTAL_CXX_MODULE_DYNDEP 1) -else() - cmake_policy(VERSION 3.28) -endif() - -# Default to C++ extensions being off. Libc++'s modules support have trouble -# with extensions right now. -set(CMAKE_CXX_EXTENSIONS OFF) - -# Propagates the CMake options to the modules. 
-# -# This uses the std module hard-coded since the std.compat module does not -# depend on these flags. -macro(compile_define_if_not condition def) - if (NOT ${condition}) - target_compile_definitions(std PRIVATE ${def}) - endif() -endmacro() -macro(compile_define_if condition def) - if (${condition}) - target_compile_definitions(std PRIVATE ${def}) - endif() -endmacro() - -### STD - -add_library(std) -target_sources(std - PUBLIC FILE_SET cxx_modules TYPE CXX_MODULES FILES - std.cppm -) - -target_include_directories(std SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@) - -if (NOT @LIBCXX_ENABLE_EXCEPTIONS@) - target_compile_options(std PUBLIC -fno-exceptions) -endif() - -target_compile_options(std - PUBLIC - -nostdinc++ - -Wno-reserved-module-identifier - -Wno-reserved-user-defined-literal - @LIBCXX_COMPILE_FLAGS@ -) -set_target_properties(std - PROPERTIES - OUTPUT_NAME "c++std" -) - -### STD.COMPAT - -add_library(std.compat) -target_sources(std.compat - PUBLIC FILE_SET cxx_modules TYPE CXX_MODULES FILES - std.compat.cppm -) - -target_include_directories(std.compat SYSTEM PRIVATE @LIBCXX_CONFIGURED_INCLUDE_DIRS@) - -if (NOT @LIBCXX_ENABLE_EXCEPTIONS@) - target_compile_options(std.compat PUBLIC -fno-exceptions) -endif() - -target_compile_options(std.compat - PUBLIC - -nostdinc++ - -Wno-reserved-module-identifier - -Wno-reserved-user-defined-literal - @LIBCXX_COMPILE_FLAGS@ -) -set_target_properties(std.compat - PROPERTIES - OUTPUT_NAME "c++std.compat" -) diff --git a/libcxx/test/CMakeLists.txt b/libcxx/test/CMakeLists.txt index 48dd233462ab3..52620fc55feeb 100644 --- a/libcxx/test/CMakeLists.txt +++ b/libcxx/test/CMakeLists.txt @@ -87,31 +87,6 @@ if (LIBCXX_INCLUDE_TESTS) ${CMAKE_CURRENT_BINARY_DIR} DEPENDS cxx-test-depends) - if(LIBCXX_ENABLE_STD_MODULES) - # Generates the modules used in the test. - # Note the test will regenerate this with the proper setting - # - the right DCMAKE_CXX_STANDARD - # - the right test compilation flags - # Since modules depend on these flags there currently is no way to - # avoid generating these for the tests. The advantage of the - # pre generation is that less build information needs to be shared - # in the bridge. - add_custom_command( - OUTPUT "${CMAKE_BINARY_DIR}/test/__config_module__/CMakeCache.txt" - COMMAND - ${CMAKE_COMMAND} - "-G${CMAKE_GENERATOR}" - "-DCMAKE_MAKE_PROGRAM=${CMAKE_MAKE_PROGRAM}" - "-B${CMAKE_BINARY_DIR}/test/__config_module__" - "-H${LIBCXX_GENERATED_MODULE_DIR}" - "-DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}" - "-DCMAKE_CXX_STANDARD=23" - "-DCMAKE_EXPORT_COMPILE_COMMANDS=ON" - ) - add_custom_target(generate-test-module-std - DEPENDS "${CMAKE_BINARY_DIR}/test/__config_module__/CMakeCache.txt" - COMMENT "Builds generic module std.") - endif() endif() if (LIBCXX_GENERATE_COVERAGE) diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in index 0e3c3040c9644..72b2ddf378bb6 100644 --- a/libcxx/test/configs/cmake-bridge.cfg.in +++ b/libcxx/test/configs/cmake-bridge.cfg.in @@ -31,10 +31,3 @@ config.substitutions.append(('%{target-include}', '@LIBCXX_GENERATED_INCLUDE_TAR config.substitutions.append(('%{lib}', '@LIBCXX_LIBRARY_DIR@')) config.substitutions.append(('%{module}', '@LIBCXX_GENERATED_MODULE_DIR@')) config.substitutions.append(('%{test-tools}', '@LIBCXX_TEST_TOOLS_PATH@')) - -# The test needs to manually rebuild the module. The compiler flags used in the -# test need to be the same as the compiler flags used to generate the module. 
-# In the future, when CMake can generated modules this may no longer be -# necessary. -# TODO MODULES whether it's possible to remove this substitution. -config.substitutions.append(('%{cmake}', '@CMAKE_COMMAND@')) diff --git a/libcxx/test/libcxx/module_std.gen.py b/libcxx/test/libcxx/module_std.gen.py index 8e03d6e5b5b52..a9a05a0cd74e6 100644 --- a/libcxx/test/libcxx/module_std.gen.py +++ b/libcxx/test/libcxx/module_std.gen.py @@ -30,6 +30,7 @@ "%{test-tools}/clang_tidy_checks/libcxx-tidy.plugin", "%{cxx}", "%{flags} %{compile_flags}", + "std", ) diff --git a/libcxx/test/libcxx/module_std_compat.gen.py b/libcxx/test/libcxx/module_std_compat.gen.py index c4792db3d71e6..2866066ccedc8 100644 --- a/libcxx/test/libcxx/module_std_compat.gen.py +++ b/libcxx/test/libcxx/module_std_compat.gen.py @@ -30,6 +30,7 @@ "%{test-tools}/clang_tidy_checks/libcxx-tidy.plugin", "%{cxx}", "%{flags} %{compile_flags}", + "std.compat", ) diff --git a/libcxx/test/libcxx/selftest/modules/no-modules.sh.cpp b/libcxx/test/libcxx/selftest/modules/no-modules.sh.cpp new file mode 100644 index 0000000000000..74e65a9072f7b --- /dev/null +++ b/libcxx/test/libcxx/selftest/modules/no-modules.sh.cpp @@ -0,0 +1,14 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Make sure that the compile flags contain no module information. + +// MODULE_DEPENDENCIES: + +// RUN: echo "%{compile_flags}" | grep -v "std.pcm" +// RUN: echo "%{compile_flags}" | grep -v "std.compat.pcm" diff --git a/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp b/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp new file mode 100644 index 0000000000000..81241d7f43f9a --- /dev/null +++ b/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp @@ -0,0 +1,26 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: clang-modules-build +// UNSUPPORTED: gcc + +// XFAIL: has-no-cxx-module-support + +// picolibc does not provide the required timespec_get function, and the +// "using-if-exists" mechanism apparently did not work here. +// XFAIL: LIBCXX-PICOLIBC-FIXME + +// Make sure that the compile flags contain the expected elements. +// The tests only look for the expected components and not the exact flags. +// Otherwise changing the location of the module would break this test. 
+ +// MODULE_DEPENDENCIES: std std.compat + +// RUN: echo "%{compile_flags}" | grep -- "-fmodule-file=std=.*/std.pcm .*/std.pcm" +// RUN: echo "%{compile_flags}" | grep -- "-fmodule-file=std.compat=.*/std.compat.pcm .*/std.compat.pcm" diff --git a/libcxx/test/libcxx/selftest/modules/std-module.sh.cpp b/libcxx/test/libcxx/selftest/modules/std-module.sh.cpp new file mode 100644 index 0000000000000..ec43994fa1ef9 --- /dev/null +++ b/libcxx/test/libcxx/selftest/modules/std-module.sh.cpp @@ -0,0 +1,24 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: clang-modules-build +// UNSUPPORTED: gcc + +// XFAIL: has-no-cxx-module-support + +// Make sure that the compile flags contain the expected elements. +// The tests only look for the expected components and not the exact flags. +// Otherwise changing the location of the module would break this test. + +// MODULE_DEPENDENCIES: std + +// RUN: echo "%{compile_flags}" | grep -- "-fmodule-file=std=.*/std.pcm .*/std.pcm" + +// The std module should not provide the std.compat module +// RUN: echo "%{compile_flags}" | grep -v "std.compat.pcm" diff --git a/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp b/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp new file mode 100644 index 0000000000000..b74c2f1a249fc --- /dev/null +++ b/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp @@ -0,0 +1,28 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// UNSUPPORTED: clang-modules-build +// UNSUPPORTED: gcc + +// XFAIL: has-no-cxx-module-support + +// picolibc does not provide the required timespec_get function, and the +// "using-if-exists" mechanism apparently did not work here. +// XFAIL: LIBCXX-PICOLIBC-FIXME + +// Make sure that the compile flags contain the expected elements. +// The tests only look for the expected components and not the exact flags. +// Otherwise changing the location of the module would break this test. + +// MODULE_DEPENDENCIES: std.compat + +// RUN: echo "%{compile_flags}" | grep -- "-fmodule-file=std.compat=.*/std.compat.pcm .*/std.compat.pcm" + +// It's unspecified whether std.compat is built on the std module. +// Therefore don't test its presence diff --git a/libcxx/test/lit.local.cfg b/libcxx/test/lit.local.cfg deleted file mode 100644 index 1ee9086ee22e3..0000000000000 --- a/libcxx/test/lit.local.cfg +++ /dev/null @@ -1,83 +0,0 @@ -# ===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. 
-# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -# ===----------------------------------------------------------------------===## - -# This configuration builds the C++23 std module. -# It is build when the current lit configuration supports modules. -# -# TODO MODULES Evaluate whether this file can be removed when CMake supports -# modules in libc++. - -import os -import site -import subprocess -import libcxx.test.params, libcxx.test.config, libcxx.test.dsl - - -def getSubstitution(substitution, config): - for orig, replacement in config.substitutions: - if orig == substitution: - return replacement - raise ValueError("Substitution {} is not in the config.".format(substitution)) - - -def appendToSubstitution(substitutions, key, value): - return [(k, v + " " + value) if k == key else (k, v) for (k, v) in substitutions] - - -std = getSubstitution("%{cxx_std}", config) -if std == "cxx26": - std = "26" -elif std == "cxx23": - std = "23" -elif std == "cxx20": - std = "20" -else: - std = "" - -if ( - std - and not "libcpp-has-no-std-modules" in config.available_features - and not "clang-modules-build" in config.available_features -): - build = os.path.join(config.test_exec_root, "__config_module__") - config.substitutions = appendToSubstitution( - config.substitutions, - "%{compile_flags}", - "-fprebuilt-module-path=" - + os.path.join(config.test_exec_root, "__config_module__/CMakeFiles/std.dir"), - ) - - cmake = getSubstitution("%{cmake}", config) - flags = getSubstitution("%{flags}", config) - if "c++experimental" in config.available_features: - flags = f"{flags} -D_LIBCPP_ENABLE_EXPERIMENTAL" - - subprocess.check_call( - [cmake, f"-DCMAKE_CXX_STANDARD={std}", f"-DCMAKE_CXX_FLAGS={flags}", build], - env={}, - ) - subprocess.check_call([cmake, "--build", build, "--", "-v"], env={}) - config.substitutions = appendToSubstitution( - config.substitutions, - "%{link_flags}", - os.path.join(build, "libc++std.a"), - ) - - config.substitutions = appendToSubstitution( - config.substitutions, - "%{compile_flags}", - "-fprebuilt-module-path=" - + os.path.join( - config.test_exec_root, "__config_module__/CMakeFiles/std.compat.dir" - ), - ) - config.substitutions = appendToSubstitution( - config.substitutions, - "%{link_flags}", - os.path.join(build, "libc++std.compat.a"), - ) diff --git a/libcxx/test/std/modules/std.compat.pass.cpp b/libcxx/test/std/modules/std.compat.pass.cpp index 9a2f330d722ed..40ea979e27346 100644 --- a/libcxx/test/std/modules/std.compat.pass.cpp +++ b/libcxx/test/std/modules/std.compat.pass.cpp @@ -7,9 +7,10 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17 - -// UNSUPPORTED: libcpp-has-no-std-modules // UNSUPPORTED: clang-modules-build +// UNSUPPORTED: gcc + +// XFAIL: has-no-cxx-module-support // picolibc does not provide the required timespec_get function, and the // "using-if-exists" mechanism apparently did not work here. @@ -17,6 +18,8 @@ // A minimal test to validate import works. 
+// MODULE_DEPENDENCIES: std.compat + import std.compat; int main(int, char**) { return !(::strlen("Hello modular world") == 19); } diff --git a/libcxx/test/std/modules/std.pass.cpp b/libcxx/test/std/modules/std.pass.cpp index 8ec3ce27322b7..ca05825b3a186 100644 --- a/libcxx/test/std/modules/std.pass.cpp +++ b/libcxx/test/std/modules/std.pass.cpp @@ -7,12 +7,15 @@ //===----------------------------------------------------------------------===// // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 - -// UNSUPPORTED: libcpp-has-no-std-modules // UNSUPPORTED: clang-modules-build +// UNSUPPORTED: gcc + +// XFAIL: has-no-cxx-module-support // A minimal test to validate import works. +// MODULE_DEPENDENCIES: std + import std; int main(int, char**) { diff --git a/libcxx/utils/ci/Dockerfile b/libcxx/utils/ci/Dockerfile index e4bda4f06742c..225de937cc869 100644 --- a/libcxx/utils/ci/Dockerfile +++ b/libcxx/utils/ci/Dockerfile @@ -152,17 +152,6 @@ RUN < {self.tmp_prefix}.all_partitions """ ) From 04757337cdc8ba2da9bc3303fb62af514d5452c5 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sun, 21 Jan 2024 12:15:15 +0100 Subject: [PATCH 291/843] [libc++][modules] Improves std.compat module. (#76330) Let the std.compat module use the std module instead of duplicating the exports. Based on @ChuanqiXu9's suggestion in #71438. --- libcxx/modules/std.compat.cppm.in | 125 +------------------- libcxx/modules/std.cppm.in | 1 + libcxx/test/libcxx/module_std_compat.gen.py | 25 +--- libcxx/utils/generate_libcxx_cppm_in.py | 6 +- libcxx/utils/libcxx/header_information.py | 25 ++++ libcxx/utils/libcxx/test/format.py | 1 + 6 files changed, 35 insertions(+), 148 deletions(-) diff --git a/libcxx/modules/std.compat.cppm.in b/libcxx/modules/std.compat.cppm.in index f199e194e60b1..651d6ec7b9fe2 100644 --- a/libcxx/modules/std.compat.cppm.in +++ b/libcxx/modules/std.compat.cppm.in @@ -17,38 +17,17 @@ module; // The headers of Table 24: C++ library headers [tab:headers.cpp] // and the headers of Table 25: C++ headers for C library facilities [tab:headers.cpp.c] -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_ATOMIC_HEADER) -# include -#endif -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif -#include -#include #include #include #include #include #include -#include -#include #include #include #if !defined(_LIBCPP_HAS_NO_LOCALIZATION) # include #endif #include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#include #include #include #include @@ -65,107 +44,6 @@ module; #if !defined(_LIBCPP_HAS_NO_WIDE_CHARACTERS) # include #endif -#include -#include -#include -#include -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#include -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include 
-#endif -#include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#include -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#if !defined(_LIBCPP_HAS_NO_LOCALIZATION) -# include -#endif -#include -#if !defined(_LIBCPP_HAS_NO_THREADS) -# include -#endif -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include // *** Headers not yet available *** #if __has_include() @@ -203,6 +81,7 @@ module; #endif // __has_include() export module std.compat; +export import std; + -@LIBCXX_MODULE_STD_INCLUDE_SOURCES@ @LIBCXX_MODULE_STD_COMPAT_INCLUDE_SOURCES@ \ No newline at end of file diff --git a/libcxx/modules/std.cppm.in b/libcxx/modules/std.cppm.in index b46c52e781f82..6ce8e287737b8 100644 --- a/libcxx/modules/std.cppm.in +++ b/libcxx/modules/std.cppm.in @@ -204,4 +204,5 @@ module; export module std; + @LIBCXX_MODULE_STD_INCLUDE_SOURCES@ diff --git a/libcxx/test/libcxx/module_std_compat.gen.py b/libcxx/test/libcxx/module_std_compat.gen.py index 2866066ccedc8..270d131779e5b 100644 --- a/libcxx/test/libcxx/module_std_compat.gen.py +++ b/libcxx/test/libcxx/module_std_compat.gen.py @@ -21,6 +21,7 @@ import sys sys.path.append(sys.argv[1]) +from libcxx.header_information import module_c_headers from libcxx.test.modules import module_test_generator generator = module_test_generator( @@ -37,27 +38,5 @@ print("//--- module_std_compat.sh.cpp") generator.write_test( "std.compat", - [ - "cassert", - "cctype", - "cerrno", - "cfenv", - "cfloat", - "cinttypes", - "climits", - "clocale", - "cmath", - "csetjmp", - "csignal", - "cstdarg", - "cstddef", - "cstdint", - "cstdio", - "cstdlib", - "cstring", - "ctime", - "cuchar", - "cwchar", - "cwctype", - ], + module_c_headers, ) diff --git a/libcxx/utils/generate_libcxx_cppm_in.py b/libcxx/utils/generate_libcxx_cppm_in.py index f957406778d39..2d3f829847fb9 100644 --- a/libcxx/utils/generate_libcxx_cppm_in.py +++ b/libcxx/utils/generate_libcxx_cppm_in.py @@ -9,6 +9,7 @@ import os.path import sys +from libcxx.header_information import module_c_headers from libcxx.header_information import module_headers from libcxx.header_information import header_restrictions from libcxx.header_information import headers_not_available @@ -44,7 +45,7 @@ def write_file(module): // and the headers of Table 25: C++ headers for C library facilities [tab:headers.cpp.c] """ ) - for header in module_headers: + for header in module_headers if module == "std" else module_c_headers: if header in header_restrictions: module_cpp_in.write( f"""\ @@ -69,8 +70,9 @@ def write_file(module): module_cpp_in.write( f""" export module {module}; +{'export import std;' if module == 'std.compat' else ''} -@LIBCXX_MODULE_STD_INCLUDE_SOURCES@ +{'@LIBCXX_MODULE_STD_INCLUDE_SOURCES@' if module == 'std' else ''} {'@LIBCXX_MODULE_STD_COMPAT_INCLUDE_SOURCES@' if module == 'std.compat' else ''}""" ) diff --git a/libcxx/utils/libcxx/header_information.py b/libcxx/utils/libcxx/header_information.py index 2268034bae204..b2aa3739ea697 100644 --- a/libcxx/utils/libcxx/header_information.py +++ b/libcxx/utils/libcxx/header_information.py @@ -215,3 +215,28 @@ def is_modulemap_header(header): # These headers have been removed in C++20 so are never part of a module. 
     and not header in ["ccomplex", "ciso646", "cstdbool", "ctgmath"]
 ]
+
+# The C headers used in the std and std.compat modules.
+module_c_headers = [
+    "cassert",
+    "cctype",
+    "cerrno",
+    "cfenv",
+    "cfloat",
+    "cinttypes",
+    "climits",
+    "clocale",
+    "cmath",
+    "csetjmp",
+    "csignal",
+    "cstdarg",
+    "cstddef",
+    "cstdint",
+    "cstdio",
+    "cstdlib",
+    "cstring",
+    "ctime",
+    "cuchar",
+    "cwchar",
+    "cwctype",
+]
diff --git a/libcxx/utils/libcxx/test/format.py b/libcxx/utils/libcxx/test/format.py
index c605e8796ac54..13175214879c4 100644
--- a/libcxx/utils/libcxx/test/format.py
+++ b/libcxx/utils/libcxx/test/format.py
@@ -171,6 +171,7 @@ def parseScript(test, preamble):
                 "%dbg(MODULE std.compat) %{cxx} %{flags} "
                 f"{compileFlags} "
                 "-Wno-reserved-module-identifier -Wno-reserved-user-defined-literal "
+                "-fmodule-file=std=%T/std.pcm "  # The std.compat module imports std.
                 "--precompile -o %T/std.compat.pcm -c %{module}/std.compat.cppm",
             )
             moduleCompileFlags.extend(
From 8b47bb657b5905d954b9041415020358802407d5 Mon Sep 17 00:00:00 2001
From: Mark de Wever
Date: Sun, 21 Jan 2024 12:16:22 +0100
Subject: [PATCH 292/843] [libc++] Install modules. (#75741)

Installs the source files of the experimental libc++ modules. These
source files (.cppm) are used by Clang to build the std and std.compat
modules.

The design of this patch is based on a discussion in SG-15 on
12.12.2023. (SG-15 is the ISO C++ Tooling study group):
- The modules are installed at a location that is not known to build
  systems and compilers.
- Next to the library there will be a module manifest json file. This
  json file contains the information to build the module from the
  library's sources. This information includes the location where the
  sources are installed. @ruoso supplied the specification of this json
  file.
- If possible, the compiler has an option to give the location of the
  module manifest file (https://github.com/llvm/llvm-project/pull/76451).

Currently there is no build system support, but it is expected to be
added in the future.

Fixes: https://github.com/llvm/llvm-project/issues/73089
---
 libcxx/CMakeLists.txt                         |  5 ++
 libcxx/cmake/caches/Generic-cxx20.cmake       |  1 +
 libcxx/cmake/caches/Generic-cxx23.cmake       |  1 +
 libcxx/cmake/caches/Generic-cxx26.cmake       |  1 +
 .../Generic-hardening-mode-extensive.cmake    |  1 +
 .../cmake/caches/Generic-no-exceptions.cmake  |  1 +
 .../caches/Generic-no-experimental.cmake      |  1 +
 .../cmake/caches/Generic-no-filesystem.cmake  |  1 +
 .../caches/Generic-no-localization.cmake      |  1 +
 .../caches/Generic-no-random_device.cmake     |  1 +
 libcxx/cmake/caches/Generic-no-threads.cmake  |  1 +
 libcxx/cmake/caches/Generic-no-unicode.cmake  |  1 +
 .../caches/Generic-no-wide-characters.cmake   |  1 +
 libcxx/docs/Modules.rst                       |  1 -
 libcxx/docs/ReleaseNotes/18.rst               |  7 +++
 libcxx/modules/CMakeLists.txt                 | 54 +++++++++++++++++++
 libcxx/modules/modules.json.in                | 26 +++++++++
 libcxx/src/CMakeLists.txt                     |  5 ++
 18 files changed, 109 insertions(+), 1 deletion(-)
 create mode 100644 libcxx/modules/modules.json.in

diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt
index 53de53480266b..d392e95077ac5 100644
--- a/libcxx/CMakeLists.txt
+++ b/libcxx/CMakeLists.txt
@@ -178,6 +178,9 @@ set(LIBCXX_LIBDIR_SUFFIX "${LLVM_LIBDIR_SUFFIX}" CACHE STRING
     "Define suffix of library directory name (32/64)")
 option(LIBCXX_INSTALL_HEADERS "Install the libc++ headers." ON)
 option(LIBCXX_INSTALL_LIBRARY "Install the libc++ library." ON)
+option(LIBCXX_INSTALL_MODULES
+  "Install the libc++ C++20 module source files (experimental)."
OFF +) cmake_dependent_option(LIBCXX_INSTALL_STATIC_LIBRARY "Install the static libc++ library." ON "LIBCXX_ENABLE_STATIC;LIBCXX_INSTALL_LIBRARY" OFF) @@ -425,6 +428,8 @@ set(LIBCXX_INSTALL_INCLUDE_DIR "${CMAKE_INSTALL_INCLUDEDIR}/c++/v1" CACHE STRING "Path where target-agnostic libc++ headers should be installed.") set(LIBCXX_INSTALL_RUNTIME_DIR "${CMAKE_INSTALL_BINDIR}" CACHE STRING "Path where built libc++ runtime libraries should be installed.") +set(LIBCXX_INSTALL_MODULES_DIR "share/libc++/v1" CACHE STRING + "Path where target-agnostic libc++ module source files should be installed.") set(LIBCXX_SHARED_OUTPUT_NAME "c++" CACHE STRING "Output name for the shared libc++ runtime library.") set(LIBCXX_STATIC_OUTPUT_NAME "c++" CACHE STRING "Output name for the static libc++ runtime library.") diff --git a/libcxx/cmake/caches/Generic-cxx20.cmake b/libcxx/cmake/caches/Generic-cxx20.cmake index 3c44fdaf0e425..641c131a737b1 100644 --- a/libcxx/cmake/caches/Generic-cxx20.cmake +++ b/libcxx/cmake/caches/Generic-cxx20.cmake @@ -1,2 +1,3 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_TEST_PARAMS "std=c++20" CACHE STRING "") set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "") diff --git a/libcxx/cmake/caches/Generic-cxx23.cmake b/libcxx/cmake/caches/Generic-cxx23.cmake index bf88abf56ca6a..f5409e4652e42 100644 --- a/libcxx/cmake/caches/Generic-cxx23.cmake +++ b/libcxx/cmake/caches/Generic-cxx23.cmake @@ -1,2 +1,3 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_TEST_PARAMS "std=c++23" CACHE STRING "") set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "") diff --git a/libcxx/cmake/caches/Generic-cxx26.cmake b/libcxx/cmake/caches/Generic-cxx26.cmake index 6ba9482af5785..2d9c018a4ff54 100644 --- a/libcxx/cmake/caches/Generic-cxx26.cmake +++ b/libcxx/cmake/caches/Generic-cxx26.cmake @@ -1,2 +1,3 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_TEST_PARAMS "std=c++26" CACHE STRING "") set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "") diff --git a/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake b/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake index 72263dfd84635..9542dcdbf7783 100644 --- a/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake +++ b/libcxx/cmake/caches/Generic-hardening-mode-extensive.cmake @@ -1 +1,2 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_HARDENING_MODE "extensive" CACHE STRING "") diff --git a/libcxx/cmake/caches/Generic-no-exceptions.cmake b/libcxx/cmake/caches/Generic-no-exceptions.cmake index f0dffef60dba0..c68adfc1276b5 100644 --- a/libcxx/cmake/caches/Generic-no-exceptions.cmake +++ b/libcxx/cmake/caches/Generic-no-exceptions.cmake @@ -1,2 +1,3 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_EXCEPTIONS OFF CACHE BOOL "") set(LIBCXXABI_ENABLE_EXCEPTIONS OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-experimental.cmake b/libcxx/cmake/caches/Generic-no-experimental.cmake index f33ed01418990..62b7d7373d447 100644 --- a/libcxx/cmake/caches/Generic-no-experimental.cmake +++ b/libcxx/cmake/caches/Generic-no-experimental.cmake @@ -1,2 +1,3 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. 
set(LIBCXX_TEST_PARAMS "enable_experimental=False" CACHE STRING "") set(LIBCXXABI_TEST_PARAMS "${LIBCXX_TEST_PARAMS}" CACHE STRING "") diff --git a/libcxx/cmake/caches/Generic-no-filesystem.cmake b/libcxx/cmake/caches/Generic-no-filesystem.cmake index 4000f3a3e8ef2..01ae7e68f12c9 100644 --- a/libcxx/cmake/caches/Generic-no-filesystem.cmake +++ b/libcxx/cmake/caches/Generic-no-filesystem.cmake @@ -1 +1,2 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_FILESYSTEM OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-localization.cmake b/libcxx/cmake/caches/Generic-no-localization.cmake index 79d6b44c7139a..fc4957b2d53a7 100644 --- a/libcxx/cmake/caches/Generic-no-localization.cmake +++ b/libcxx/cmake/caches/Generic-no-localization.cmake @@ -1 +1,2 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_LOCALIZATION OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-random_device.cmake b/libcxx/cmake/caches/Generic-no-random_device.cmake index e9b4cc60cc80e..ddf479add6269 100644 --- a/libcxx/cmake/caches/Generic-no-random_device.cmake +++ b/libcxx/cmake/caches/Generic-no-random_device.cmake @@ -1 +1,2 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_RANDOM_DEVICE OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-threads.cmake b/libcxx/cmake/caches/Generic-no-threads.cmake index 616baef1be7be..724fbc466b58a 100644 --- a/libcxx/cmake/caches/Generic-no-threads.cmake +++ b/libcxx/cmake/caches/Generic-no-threads.cmake @@ -1,3 +1,4 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_THREADS OFF CACHE BOOL "") set(LIBCXXABI_ENABLE_THREADS OFF CACHE BOOL "") set(LIBCXX_ENABLE_MONOTONIC_CLOCK OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-unicode.cmake b/libcxx/cmake/caches/Generic-no-unicode.cmake index 01160bf218981..a4cf7dd73772b 100644 --- a/libcxx/cmake/caches/Generic-no-unicode.cmake +++ b/libcxx/cmake/caches/Generic-no-unicode.cmake @@ -1 +1,2 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_UNICODE OFF CACHE BOOL "") diff --git a/libcxx/cmake/caches/Generic-no-wide-characters.cmake b/libcxx/cmake/caches/Generic-no-wide-characters.cmake index 728d41086a386..dc19389bb5ae2 100644 --- a/libcxx/cmake/caches/Generic-no-wide-characters.cmake +++ b/libcxx/cmake/caches/Generic-no-wide-characters.cmake @@ -1 +1,2 @@ +set(LIBCXX_INSTALL_MODULES ON CACHE BOOL "") # TODO MODULES Remove when enabled automatically. set(LIBCXX_ENABLE_WIDE_CHARACTERS OFF CACHE BOOL "") diff --git a/libcxx/docs/Modules.rst b/libcxx/docs/Modules.rst index a0735c0ed3f95..533c3fbd2a1ee 100644 --- a/libcxx/docs/Modules.rst +++ b/libcxx/docs/Modules.rst @@ -69,7 +69,6 @@ Some of the current limitations * The path to the compiler may not be a symlink, ``clang-scan-deps`` does not handle that case properly * Libc++ is not tested with modules instead of headers - * The module ``.cppm`` files are not installed * Clang supports modules using GNU extensions, but libc++ does not work using GNU extensions. 
* Clang: diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index 77cf22398ea4c..799347b646e63 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -99,6 +99,13 @@ Improvements and New Features infrastructure no longer depends on a modern CMake, it works with the minimal required LLVM version (3.20.0). +- The ``.cppm`` files of experimental standard library modules can now be + installed. By default, they are not installed. This can be enabled by + configuring CMake with ``-DLIBCXX_INSTALL_MODULES=ON``. The installation + directory can be configured with the CMake option + ``-DLIBCXX_INSTALL_MODULE_DIR=``. The default location is + ``${PREFIX}/share/libc++/v1``. + Deprecations and Removals ------------------------- diff --git a/libcxx/modules/CMakeLists.txt b/libcxx/modules/CMakeLists.txt index 31fbadf449f77..0388c048dacb8 100644 --- a/libcxx/modules/CMakeLists.txt +++ b/libcxx/modules/CMakeLists.txt @@ -182,3 +182,57 @@ add_custom_target(generate-cxx-modules ALL DEPENDS ${_all_modules} ) + +# Configure the modules manifest. +# Use the relative path between the installation and the module in the json +# file. This allows moving the entire installation to a different location. +file(RELATIVE_PATH LIBCXX_MODULE_RELATIVE_PATH + ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_LIBRARY_DIR} + ${CMAKE_INSTALL_PREFIX}/${LIBCXX_INSTALL_MODULES_DIR}) +configure_file( + "modules.json.in" + "${LIBCXX_LIBRARY_DIR}/libc++.modules.json" + @ONLY +) + +# Dummy library to make modules an installation component. +add_library(cxx-modules INTERFACE) +add_dependencies(cxx-modules generate-cxx-modules) + +if (LIBCXX_INSTALL_MODULES) + foreach(file ${LIBCXX_MODULE_STD_SOURCES} ${LIBCXX_MODULE_STD_COMPAT_SOURCES}) + get_filename_component(dir ${file} DIRECTORY) + install(FILES ${file} + DESTINATION "${LIBCXX_INSTALL_MODULES_DIR}/${dir}" + COMPONENT cxx-modules + PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ + ) + endforeach() + + # Install the generated module files. + install(FILES + "${LIBCXX_GENERATED_MODULE_DIR}/std.cppm" + "${LIBCXX_GENERATED_MODULE_DIR}/std.compat.cppm" + DESTINATION "${LIBCXX_INSTALL_MODULES_DIR}" + COMPONENT cxx-modules + PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ + ) + + # Install the module manifest. 
+ install(FILES + "${LIBCXX_LIBRARY_DIR}/libc++.modules.json" + DESTINATION "${LIBCXX_INSTALL_LIBRARY_DIR}" + COMPONENT cxx-modules + PERMISSIONS OWNER_READ OWNER_WRITE GROUP_READ WORLD_READ + ) + + if (NOT CMAKE_CONFIGURATION_TYPES) + add_custom_target(install-cxx-modules + DEPENDS cxx-modules + COMMAND "${CMAKE_COMMAND}" + -DCMAKE_INSTALL_COMPONENT=cxx-modules + -P "${CMAKE_BINARY_DIR}/cmake_install.cmake") + # Stripping is a no-op for modules + add_custom_target(install-cxx-modules-stripped DEPENDS install-cxx-modules) + endif() +endif() diff --git a/libcxx/modules/modules.json.in b/libcxx/modules/modules.json.in new file mode 100644 index 0000000000000..ddc377f28f919 --- /dev/null +++ b/libcxx/modules/modules.json.in @@ -0,0 +1,26 @@ +{ + "version": 1, + "revision": 1, + "modules": [ + { + "logical-name": "std", + "source-path": "@LIBCXX_MODULE_RELATIVE_PATH@/std.cppm", + "is-standard-library": true, + "local-arguments": { + "system-include-directories": [ + "@LIBCXX_MODULE_RELATIVE_PATH@" + ] + } + }, + { + "logical-name": "std.compat", + "source-path": "@LIBCXX_MODULE_RELATIVE_PATH@/std.compat.cppm", + "is-std-library": true, + "local-arguments": { + "system-include-directories": [ + "@LIBCXX_MODULE_RELATIVE_PATH@" + ] + } + } + ] +} diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index dc2c98188aacc..5481fe440e781 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -398,11 +398,15 @@ if (NOT CMAKE_CONFIGURATION_TYPES) endif() if(LIBCXX_INSTALL_HEADERS) set(header_install_target install-cxx-headers) + endif() + if(LIBCXX_INSTALL_MODULES) + set(module_install_target install-cxx-modules) endif() add_custom_target(install-cxx DEPENDS ${lib_install_target} cxx_experimental ${header_install_target} + ${module_install_target} COMMAND "${CMAKE_COMMAND}" -DCMAKE_INSTALL_COMPONENT=cxx -P "${LIBCXX_BINARY_DIR}/cmake_install.cmake") @@ -410,6 +414,7 @@ if (NOT CMAKE_CONFIGURATION_TYPES) DEPENDS ${lib_install_target} cxx_experimental ${header_install_target} + ${module_install_target} COMMAND "${CMAKE_COMMAND}" -DCMAKE_INSTALL_COMPONENT=cxx -DCMAKE_INSTALL_DO_STRIP=1 From 07e6b983cc21d7f12ee9fe0c94aefc4ed9fa67a9 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Sun, 21 Jan 2024 19:15:06 +0800 Subject: [PATCH 293/843] [X86][NFC] Remove unnecessary parameters for MaskedShiftAmountPats/MaskedRotateAmountPats and rename one_bit_patterns This patch is to extract NFC in #78853 into a separate commit. --- llvm/lib/Target/X86/X86InstrCompiler.td | 94 ++++++++++++------------- 1 file changed, 46 insertions(+), 48 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 7e5ce7c32f87c..8e412204c989c 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1787,31 +1787,31 @@ let Predicates = [HasNDD] in { } // Shift amount is implicitly masked. 
-multiclass MaskedShiftAmountPats { +multiclass MaskedShiftAmountPats { // (shift x (and y, 31)) ==> (shift x, y) def : Pat<(frag GR8:$src1, (shiftMask32 CL)), - (!cast(name # "8rCL") GR8:$src1)>; + (!cast(NAME # "8rCL") GR8:$src1)>; def : Pat<(frag GR16:$src1, (shiftMask32 CL)), - (!cast(name # "16rCL") GR16:$src1)>; + (!cast(NAME # "16rCL") GR16:$src1)>; def : Pat<(frag GR32:$src1, (shiftMask32 CL)), - (!cast(name # "32rCL") GR32:$src1)>; + (!cast(NAME # "32rCL") GR32:$src1)>; def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask32 CL)), addr:$dst), - (!cast(name # "8mCL") addr:$dst)>; + (!cast(NAME # "8mCL") addr:$dst)>; def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask32 CL)), addr:$dst), - (!cast(name # "16mCL") addr:$dst)>; + (!cast(NAME # "16mCL") addr:$dst)>; def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst), - (!cast(name # "32mCL") addr:$dst)>; + (!cast(NAME # "32mCL") addr:$dst)>; // (shift x (and y, 63)) ==> (shift x, y) def : Pat<(frag GR64:$src1, (shiftMask64 CL)), - (!cast(name # "64rCL") GR64:$src1)>; + (!cast(NAME # "64rCL") GR64:$src1)>; def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst), - (!cast(name # "64mCL") addr:$dst)>; + (!cast(NAME # "64mCL") addr:$dst)>; } -defm : MaskedShiftAmountPats; -defm : MaskedShiftAmountPats; -defm : MaskedShiftAmountPats; +defm SHL : MaskedShiftAmountPats; +defm SHR : MaskedShiftAmountPats; +defm SAR : MaskedShiftAmountPats; // ROL/ROR instructions allow a stronger mask optimization than shift for 8- and // 16-bit. We can remove a mask of any (bitwidth - 1) on the rotation amount @@ -1819,31 +1819,30 @@ defm : MaskedShiftAmountPats; // docs with: "tempCOUNT <- (COUNT & COUNTMASK) MOD SIZE". Masking the rotation // amount could affect EFLAGS results, but that does not matter because we are // not tracking flags for these nodes. -multiclass MaskedRotateAmountPats { +multiclass MaskedRotateAmountPats { // (rot x (and y, BitWidth - 1)) ==> (rot x, y) def : Pat<(frag GR8:$src1, (shiftMask8 CL)), - (!cast(name # "8rCL") GR8:$src1)>; + (!cast(NAME # "8rCL") GR8:$src1)>; def : Pat<(frag GR16:$src1, (shiftMask16 CL)), - (!cast(name # "16rCL") GR16:$src1)>; + (!cast(NAME # "16rCL") GR16:$src1)>; def : Pat<(frag GR32:$src1, (shiftMask32 CL)), - (!cast(name # "32rCL") GR32:$src1)>; + (!cast(NAME # "32rCL") GR32:$src1)>; def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask8 CL)), addr:$dst), - (!cast(name # "8mCL") addr:$dst)>; + (!cast(NAME # "8mCL") addr:$dst)>; def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask16 CL)), addr:$dst), - (!cast(name # "16mCL") addr:$dst)>; + (!cast(NAME # "16mCL") addr:$dst)>; def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst), - (!cast(name # "32mCL") addr:$dst)>; + (!cast(NAME # "32mCL") addr:$dst)>; // (rot x (and y, 63)) ==> (rot x, y) def : Pat<(frag GR64:$src1, (shiftMask64 CL)), - (!cast(name # "64rCL") GR64:$src1)>; + (!cast(NAME # "64rCL") GR64:$src1)>; def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst), - (!cast(name # "64mCL") addr:$dst)>; + (!cast(NAME # "64mCL") addr:$dst)>; } - -defm : MaskedRotateAmountPats; -defm : MaskedRotateAmountPats; +defm ROL : MaskedRotateAmountPats; +defm ROR : MaskedRotateAmountPats; // Double "funnel" shift amount is implicitly masked. 
// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) (NOTE: modulo32) @@ -1865,34 +1864,33 @@ def : Pat<(fshr GR64:$src2, GR64:$src1, (shiftMask64 CL)), (SHRD64rrCL GR64:$src1, GR64:$src2)>; // Use BTR/BTS/BTC for clearing/setting/toggling a bit in a variable location. -multiclass one_bit_patterns { - def : Pat<(and RC:$src1, (rotl -2, GR8:$src2)), - (BTR RC:$src1, - (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(or RC:$src1, (shl 1, GR8:$src2)), - (BTS RC:$src1, - (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(xor RC:$src1, (shl 1, GR8:$src2)), - (BTC RC:$src1, - (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; +multiclass OneBitPats { + def : Pat<(and rc:$src1, (rotl -2, GR8:$src2)), + (btr rc:$src1, + (INSERT_SUBREG (vt (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(or rc:$src1, (shl 1, GR8:$src2)), + (bts rc:$src1, + (INSERT_SUBREG (vt (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(xor rc:$src1, (shl 1, GR8:$src2)), + (btc rc:$src1, + (INSERT_SUBREG (vt (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; // Similar to above, but removing unneeded masking of the shift amount. - def : Pat<(and RC:$src1, (rotl -2, (ShiftMask GR8:$src2))), - (BTR RC:$src1, - (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(or RC:$src1, (shl 1, (ShiftMask GR8:$src2))), - (BTS RC:$src1, - (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; - def : Pat<(xor RC:$src1, (shl 1, (ShiftMask GR8:$src2))), - (BTC RC:$src1, - (INSERT_SUBREG (VT (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(and rc:$src1, (rotl -2, (mask GR8:$src2))), + (btr rc:$src1, + (INSERT_SUBREG (vt (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(or rc:$src1, (shl 1, (mask GR8:$src2))), + (bts rc:$src1, + (INSERT_SUBREG (vt (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; + def : Pat<(xor rc:$src1, (shl 1, (mask GR8:$src2))), + (btc rc:$src1, + (INSERT_SUBREG (vt (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>; } -defm : one_bit_patterns; -defm : one_bit_patterns; -defm : one_bit_patterns; +defm : OneBitPats; +defm : OneBitPats; +defm : OneBitPats; //===----------------------------------------------------------------------===// // EFLAGS-defining Patterns From ae8d699d6cfb9eee2331bd5cdf8e1ffbf5168cff Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sun, 21 Jan 2024 12:27:21 +0100 Subject: [PATCH 294/843] [NFC][libc++] tab -> space --- libcxx/src/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/src/CMakeLists.txt b/libcxx/src/CMakeLists.txt index 5481fe440e781..44a088663463c 100644 --- a/libcxx/src/CMakeLists.txt +++ b/libcxx/src/CMakeLists.txt @@ -399,7 +399,7 @@ if (NOT CMAKE_CONFIGURATION_TYPES) if(LIBCXX_INSTALL_HEADERS) set(header_install_target install-cxx-headers) endif() - if(LIBCXX_INSTALL_MODULES) + if(LIBCXX_INSTALL_MODULES) set(module_install_target install-cxx-modules) endif() add_custom_target(install-cxx From 5b7bb56a053702f7ed486aba1427454ec222bb36 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sun, 31 Dec 2023 14:59:46 +0100 Subject: [PATCH 295/843] [libc++] Clang-tidy enable modernize-use-nullptr. Clang-tidy 18 no longer has false positives with the spaceship operator. Note that I'm quite sure there are more occurrences in our headers that are not caught. This relands https://github.com/llvm/llvm-project/pull/76659 with fixes tested in https://github.com/llvm/llvm-project/pull/78746. 
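As a rough illustration of what the newly enabled check does (hypothetical user code, not taken from the patch below): literal-zero pointer constants are rewritten to nullptr, while comparisons synthesized from a defaulted operator<=> must be left alone; those synthesized comparisons were the source of the earlier false positives.

#include <compare>

int *lookup();

void use() {
  int *p = 0;            // modernize-use-nullptr fix-it: int *p = nullptr;
  if (lookup() == 0) {   // fix-it: if (lookup() == nullptr) {
  }
}

struct Point {
  int x;
  friend auto operator<=>(const Point &, const Point &) = default;
};

// The compiler rewrites 'a < b' to '(a <=> b) < 0'; the literal 0 in that
// synthesized expression must not be flagged or "fixed" to nullptr.
bool less(const Point &a, const Point &b) { return a < b; }
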
--- libcxx/.clang-tidy | 2 +- libcxx/include/__atomic/is_always_lock_free.h | 2 +- libcxx/include/__iterator/iterator_traits.h | 2 +- libcxx/include/locale | 12 ++++++------ 4 files changed, 9 insertions(+), 9 deletions(-) diff --git a/libcxx/.clang-tidy b/libcxx/.clang-tidy index 214661789cd51..ec7cab9b878ed 100644 --- a/libcxx/.clang-tidy +++ b/libcxx/.clang-tidy @@ -15,6 +15,7 @@ Checks: > modernize-loop-convert, modernize-redundant-void-arg, + modernize-use-nullptr, modernize-use-override, readability-duplicate-include, @@ -68,7 +69,6 @@ CheckOptions: # modernize-use-default-member-init, # modernize-use-equals-default, # modernize-use-equals-delete, -# modernize-use-nullptr, # portability-restrict-system-includes, # readability-function-cognitive-complexity, # readability-implicit-bool-conversion, diff --git a/libcxx/include/__atomic/is_always_lock_free.h b/libcxx/include/__atomic/is_always_lock_free.h index fbbd437074990..f928e79f70cea 100644 --- a/libcxx/include/__atomic/is_always_lock_free.h +++ b/libcxx/include/__atomic/is_always_lock_free.h @@ -20,7 +20,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD template struct __libcpp_is_always_lock_free { // __atomic_always_lock_free is available in all Standard modes - static const bool __value = __atomic_always_lock_free(sizeof(_Tp), 0); + static const bool __value = __atomic_always_lock_free(sizeof(_Tp), nullptr); }; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/include/__iterator/iterator_traits.h b/libcxx/include/__iterator/iterator_traits.h index dae8cc75ae349..2cd82525ba063 100644 --- a/libcxx/include/__iterator/iterator_traits.h +++ b/libcxx/include/__iterator/iterator_traits.h @@ -121,7 +121,7 @@ struct __has_iterator_typedefs { __void_t* = nullptr); public: - static const bool value = decltype(__test<_Tp>(0, 0, 0, 0, 0))::value; + static const bool value = decltype(__test<_Tp>(nullptr, nullptr, nullptr, nullptr, nullptr))::value; }; template diff --git a/libcxx/include/locale b/libcxx/include/locale index 70d22ff95e1ee..9e97eb9f33953 100644 --- a/libcxx/include/locale +++ b/libcxx/include/locale @@ -3421,7 +3421,7 @@ wbuffer_convert<_Codecvt, _Elem, _Tr>::~wbuffer_convert() { template typename wbuffer_convert<_Codecvt, _Elem, _Tr>::int_type wbuffer_convert<_Codecvt, _Elem, _Tr>::underflow() { _LIBCPP_SUPPRESS_DEPRECATED_POP - if (__cv_ == 0 || __bufptr_ == 0) + if (__cv_ == 0 || __bufptr_ == nullptr) return traits_type::eof(); bool __initial = __read_mode(); char_type __1buf; @@ -3478,7 +3478,7 @@ template typename wbuffer_convert<_Codecvt, _Elem, _Tr>::int_type wbuffer_convert<_Codecvt, _Elem, _Tr>::pbackfail(int_type __c) { _LIBCPP_SUPPRESS_DEPRECATED_POP - if (__cv_ != 0 && __bufptr_ != 0 && this->eback() < this->gptr()) { + if (__cv_ != 0 && __bufptr_ && this->eback() < this->gptr()) { if (traits_type::eq_int_type(__c, traits_type::eof())) { this->gbump(-1); return traits_type::not_eof(__c); @@ -3496,7 +3496,7 @@ _LIBCPP_SUPPRESS_DEPRECATED_PUSH template typename wbuffer_convert<_Codecvt, _Elem, _Tr>::int_type wbuffer_convert<_Codecvt, _Elem, _Tr>::overflow(int_type __c) { _LIBCPP_SUPPRESS_DEPRECATED_POP - if (__cv_ == 0 || __bufptr_ == 0) + if (__cv_ == 0 || !__bufptr_) return traits_type::eof(); __write_mode(); char_type __1buf; @@ -3588,7 +3588,7 @@ template typename wbuffer_convert<_Codecvt, _Elem, _Tr>::pos_type wbuffer_convert<_Codecvt, _Elem, _Tr>::seekoff(off_type __off, ios_base::seekdir __way, ios_base::openmode __om) { int __width = __cv_->encoding(); - if (__cv_ == 0 || __bufptr_ == 0 || (__width <= 0 && __off != 0) || sync()) + 
if (__cv_ == 0 || !__bufptr_ || (__width <= 0 && __off != 0) || sync()) return pos_type(off_type(-1)); // __width > 0 || __off == 0, now check __way if (__way != ios_base::beg && __way != ios_base::cur && __way != ios_base::end) @@ -3601,7 +3601,7 @@ wbuffer_convert<_Codecvt, _Elem, _Tr>::seekoff(off_type __off, ios_base::seekdir template typename wbuffer_convert<_Codecvt, _Elem, _Tr>::pos_type wbuffer_convert<_Codecvt, _Elem, _Tr>::seekpos(pos_type __sp, ios_base::openmode __wch) { - if (__cv_ == 0 || __bufptr_ == 0 || sync()) + if (__cv_ == 0 || !__bufptr_ || sync()) return pos_type(off_type(-1)); if (__bufptr_->pubseekpos(__sp, __wch) == pos_type(off_type(-1))) return pos_type(off_type(-1)); @@ -3611,7 +3611,7 @@ wbuffer_convert<_Codecvt, _Elem, _Tr>::seekpos(pos_type __sp, ios_base::openmode template int wbuffer_convert<_Codecvt, _Elem, _Tr>::sync() { _LIBCPP_SUPPRESS_DEPRECATED_POP - if (__cv_ == 0 || __bufptr_ == 0) + if (__cv_ == 0 || !__bufptr_) return 0; if (__cm_ & ios_base::out) { if (this->pptr() != this->pbase()) From b0b491d458962136c696366b8cf535d54511baf3 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Sun, 21 Jan 2024 13:46:42 +0100 Subject: [PATCH 296/843] [clang-tidy] Fix handling of functional cast in google-readability-casting (#71650) Fix issue with constructor call being interpreted as functional cast and considered for a replacement with static cast or being removed as redundant. Closes #57959 --- .../clang-tidy/google/AvoidCStyleCastsCheck.cpp | 12 +++++++----- .../clang-tidy/google/AvoidCStyleCastsCheck.h | 3 +++ clang-tools-extra/docs/ReleaseNotes.rst | 4 ++++ .../checkers/google/readability-casting.cpp | 15 +++++---------- 4 files changed, 19 insertions(+), 15 deletions(-) diff --git a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp index 995961b077480..3109bbb3724c7 100644 --- a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.cpp @@ -23,14 +23,16 @@ void AvoidCStyleCastsCheck::registerMatchers( // Filter out (EnumType)IntegerLiteral construct, which is generated // for non-type template arguments of enum types. // FIXME: Remove this once this is fixed in the AST. - unless(hasParent(substNonTypeTemplateParmExpr())), - // Avoid matches in template instantiations. 
-          unless(isInTemplateInstantiation()))
+          unless(hasParent(substNonTypeTemplateParmExpr())))
           .bind("cast"),
       this);
+
   Finder->addMatcher(
-      cxxFunctionalCastExpr(unless(hasDescendant(cxxConstructExpr())),
-                            unless(hasDescendant(initListExpr())))
+      cxxFunctionalCastExpr(
+          hasDestinationType(hasCanonicalType(anyOf(
+              builtinType(), references(qualType()), pointsTo(qualType())))),
+          unless(
+              hasSourceExpression(anyOf(cxxConstructExpr(), initListExpr()))))
           .bind("cast"),
       this);
 }

diff --git a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h
index 485640d230280..4267b896b6992 100644
--- a/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h
+++ b/clang-tools-extra/clang-tidy/google/AvoidCStyleCastsCheck.h
@@ -31,6 +31,9 @@ class AvoidCStyleCastsCheck : public ClangTidyCheck {
       : ClangTidyCheck(Name, Context) {}
   void registerMatchers(ast_matchers::MatchFinder *Finder) override;
   void check(const ast_matchers::MatchFinder::MatchResult &Result) override;
+  std::optional getCheckTraversalKind() const override {
+    return TK_IgnoreUnlessSpelledInSource;
+  }
 };

 } // namespace clang::tidy::google::readability

diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index bdbdee31f39d1..2822b17b43514 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -370,6 +370,10 @@ Changes in existing checks
   to ignore unused parameters when they are marked as unused and
   parameters of deleted functions and constructors.

+- Improved :doc:`google-readability-casting
+  ` check to ignore constructor
+  calls disguised as functional casts.
+
 - Improved :doc:`llvm-namespace-comment
   ` check to provide fixes for
   ``inline`` namespaces in the same format as :program:`clang-format`.

diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp
index e25463cf451b7..fdc71167cd82b 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp
@@ -322,17 +322,10 @@ void conversions() {
 }

 template
-T functional_cast_template_used_by_class(float i) {
+T functional_cast_template(float i) {
   return T(i);
 }

-template
-T functional_cast_template_used_by_int(float i) {
-  return T(i);
-  // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: C-style casts are discouraged; use static_cast
-  // CHECK-FIXES: return static_cast(i);
-}
-
 struct S2 {
   S2(float);
 };
@@ -356,8 +349,8 @@ void functional_casts() {
   auto s = S(str);

   // Functional casts in template functions
-  functional_cast_template_used_by_class(x);
-  functional_cast_template_used_by_int(x);
+  functional_cast_template(x);
+  functional_cast_template(x);

   // New expressions are not functional casts
   auto w = new int(x);
@@ -366,4 +359,6 @@ void functional_casts() {
   S2 t = T(x); // OK, constructor call
   S2 u = U(x); // NOK, it's a reinterpret_cast in disguise
   // CHECK-MESSAGES: :[[@LINE-1]]:10: warning: C-style casts are discouraged; use static_cast/const_cast/reinterpret_cast
+
+  throw S2(5.0f);
 }

From 3683852d4988e4641fb6c20ca8c3013d5c2989f1 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Sun, 21 Jan 2024 12:56:16 +0000
Subject: [PATCH 297/843] [VPlan] Use replaceUsesWithIf in replaceAllUsesWith
 and add comment (NFCI).

Follow-up to post-commit comments for b1bfe221e6.
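Reduced to a sketch (with the VPValue/VPUser machinery simplified away, so treat the types here as illustrative), the shape of the change in the diff below is:

// replaceAllUsesWith becomes the unconditional instance of
// replaceUsesWithIf; the duplicated replacement loop disappears.
void VPValue::replaceAllUsesWith(VPValue *New) {
  replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; });
}

The early exit for this == New is kept in replaceUsesWithIf, now with a comment, because the replacement loop only terminates if each substitution shrinks this value's user list, which can never happen when a value is replaced with itself.
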
--- llvm/lib/Transforms/Vectorize/VPlan.cpp | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index b6e56c47c227f..3eeb1a6948f27 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -1136,29 +1136,18 @@ void VPlanIngredient::print(raw_ostream &O) const { template void DomTreeBuilder::Calculate(VPDominatorTree &DT); void VPValue::replaceAllUsesWith(VPValue *New) { - if (this == New) - return; - for (unsigned J = 0; J < getNumUsers();) { - VPUser *User = Users[J]; - bool RemovedUser = false; - for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) - if (User->getOperand(I) == this) { - User->setOperand(I, New); - RemovedUser = true; - } - // If a user got removed after updating the current user, the next user to - // update will be moved to the current position, so we only need to - // increment the index if the number of users did not change. - if (!RemovedUser) - J++; - } + replaceUsesWithIf(New, [](VPUser &, unsigned) { return true; }); } void VPValue::replaceUsesWithIf( VPValue *New, llvm::function_ref ShouldReplace) { + // Note that this early exit is required for correctness; the implementation + // below relies on the number of users for this VPValue to decrease, which + // isn't the case if this == New. if (this == New) return; + for (unsigned J = 0; J < getNumUsers();) { VPUser *User = Users[J]; bool RemovedUser = false; From 547685d9e4c64f6f5dadd0ac979ab312b9d395e7 Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Sun, 21 Jan 2024 13:27:10 +0000 Subject: [PATCH 298/843] [clang-tidy][NFC] Enable exceptions in test for google-readability-casting Add missing -fexceptions in test. --- .../test/clang-tidy/checkers/google/readability-casting.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp index fdc71167cd82b..bc53b54264af8 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google/readability-casting.cpp @@ -1,4 +1,4 @@ -// RUN: %check_clang_tidy -std=c++11-or-later %s google-readability-casting %t +// RUN: %check_clang_tidy -std=c++11-or-later %s google-readability-casting %t -- -- -fexceptions bool g() { return false; } From f73bf45d68bd2c17602a909751da4a23138d711a Mon Sep 17 00:00:00 2001 From: bobsayshilol Date: Sun, 21 Jan 2024 15:15:46 +0000 Subject: [PATCH 299/843] [libc++] Fix typo in _LIBCPP_REMOVE_TRANSITIVE_INCLUDES (#78639) Spotted by inspection when trialling out `_LIBCPP_REMOVE_TRANSITIVE_INCLUDES`. No other instances of `_LIPCPP_` were found in the repo. 
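A minimal repro (hypothetical, not from the patch below) of why the one-letter typo mattered: the opt-out macro a user defines can never match the misspelled name the guard tests, so the guarded transitive include was always emitted.

// User translation unit opting out of libc++'s transitive includes:
#define _LIBCPP_REMOVE_TRANSITIVE_INCLUDES

// What the buggy guard effectively evaluated:
#if !defined(_LIPCPP_REMOVE_TRANSITIVE_INCLUDES)
// Always reached: the tested name is misspelled (_LIPCPP_), and no user
// define can ever suppress the transitive #include guarded here.
#endif
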
--- libcxx/include/concepts | 2 +- libcxx/include/sstream | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/include/concepts b/libcxx/include/concepts index 196fa2e0ea706..5fdf30ecfbd3f 100644 --- a/libcxx/include/concepts +++ b/libcxx/include/concepts @@ -155,7 +155,7 @@ namespace std { #include <__config> #include -#if _LIBCPP_STD_VER <= 20 && !defined(_LIPCPP_REMOVE_TRANSITIVE_INCLUDES) +#if _LIBCPP_STD_VER <= 20 && !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) # include #endif diff --git a/libcxx/include/sstream b/libcxx/include/sstream index 9f75b7e0ac9e4..6c354cf0b3973 100644 --- a/libcxx/include/sstream +++ b/libcxx/include/sstream @@ -1108,7 +1108,7 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS -#if _LIBCPP_STD_VER <= 20 && !defined(_LIPCPP_REMOVE_TRANSITIVE_INCLUDES) +#if _LIBCPP_STD_VER <= 20 && !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) # include #endif From 128d53f44cae211ad5501733ec1d9bc787b6f7a9 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Sun, 21 Jan 2024 11:13:12 -0500 Subject: [PATCH 300/843] [libc] add missing header dependencies for sched objects (#78741) This patch fixes full build problems in https://github.com/llvm/llvm-project/issues/78721 (the header problem). The `libc.a` target can be built now. As a separate issue, `check-libc` is failing because undefined symbols from `libunwind`, which I do not actually know the reason yet. I will be looking into it. --- libc/src/sched/linux/CMakeLists.txt | 8 ++++++++ libc/src/spawn/linux/CMakeLists.txt | 1 + 2 files changed, 9 insertions(+) diff --git a/libc/src/sched/linux/CMakeLists.txt b/libc/src/sched/linux/CMakeLists.txt index d53c6e5d45052..ac95bf85da534 100644 --- a/libc/src/sched/linux/CMakeLists.txt +++ b/libc/src/sched/linux/CMakeLists.txt @@ -52,6 +52,8 @@ add_entrypoint_object( ../sched_setparam.h DEPENDS libc.include.sys_syscall + libc.include.time + libc.include.sched libc.src.__support.OSUtil.osutil libc.src.errno.errno ) @@ -64,6 +66,8 @@ add_entrypoint_object( ../sched_getparam.h DEPENDS libc.include.sys_syscall + libc.include.time + libc.include.sched libc.src.__support.OSUtil.osutil libc.src.errno.errno ) @@ -76,6 +80,8 @@ add_entrypoint_object( ../sched_setscheduler.h DEPENDS libc.include.sys_syscall + libc.include.time + libc.include.sched libc.src.__support.OSUtil.osutil libc.src.errno.errno ) @@ -87,6 +93,7 @@ add_entrypoint_object( HDRS ../sched_getscheduler.h DEPENDS + libc.include.sched libc.include.sys_syscall libc.src.__support.OSUtil.osutil libc.src.errno.errno @@ -124,6 +131,7 @@ add_entrypoint_object( ../sched_rr_get_interval.h DEPENDS libc.include.sys_syscall + libc.include.sched libc.src.__support.OSUtil.osutil libc.src.errno.errno ) diff --git a/libc/src/spawn/linux/CMakeLists.txt b/libc/src/spawn/linux/CMakeLists.txt index 9bd3ac50415b1..9ef3a9d18b0c6 100644 --- a/libc/src/spawn/linux/CMakeLists.txt +++ b/libc/src/spawn/linux/CMakeLists.txt @@ -8,6 +8,7 @@ add_entrypoint_object( libc.include.fcntl libc.include.spawn libc.include.sys_syscall + libc.include.signal libc.src.__support.CPP.optional libc.src.__support.OSUtil.osutil libc.src.spawn.file_actions From bc82cfb38d83f1afeb2c290aa472c2e2e88919cb Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Sun, 21 Jan 2024 11:54:47 -0500 Subject: [PATCH 301/843] [AMDGPU] Add an asm directive to track code_object_version (#76267) Named '.amdhsa_code_object_version'. This directive sets the e_ident[ABIVERSION] in the ELF header, and should be used as the assumed COV for the rest of the asm file. 
This commit also weakens the --amdhsa-code-object-version CL flag. Previously, the CL flag took precedence over the IR flag. Now the IR flag/asm directive take precedence over the CL flag. This is implemented by merging a few COV-checking functions in AMDGPUBaseInfo.h. --- llvm/docs/AMDGPUUsage.rst | 8 ++ llvm/include/llvm/MC/MCObjectWriter.h | 3 + llvm/include/llvm/Support/AMDGPUMetadata.h | 10 +- llvm/lib/MC/ELFObjectWriter.cpp | 11 +- llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp | 14 +- llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp | 2 +- .../AMDGPU/AMDGPUHSAMetadataStreamer.cpp | 3 +- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 4 +- .../AMDGPU/AMDGPULowerKernelAttributes.cpp | 5 +- .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 2 +- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 91 ++---------- .../Disassembler/AMDGPUDisassembler.cpp | 3 +- .../AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp | 14 +- .../MCTargetDesc/AMDGPUELFObjectWriter.cpp | 19 +-- .../AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h | 4 +- .../MCTargetDesc/AMDGPUTargetStreamer.cpp | 134 ++---------------- .../MCTargetDesc/AMDGPUTargetStreamer.h | 59 +++----- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 8 +- .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 87 +++++------- llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 31 ++-- .../AMDGPU/codegen-internal-only-func.ll | 2 + .../AMDGPU/tid-mul-func-xnack-all-any.ll | 4 +- .../tid-mul-func-xnack-all-not-supported.ll | 4 +- .../AMDGPU/tid-mul-func-xnack-all-off.ll | 4 +- .../AMDGPU/tid-mul-func-xnack-all-on.ll | 4 +- .../AMDGPU/tid-mul-func-xnack-any-off-1.ll | 4 +- .../AMDGPU/tid-mul-func-xnack-any-off-2.ll | 4 +- .../AMDGPU/tid-mul-func-xnack-any-on-1.ll | 4 +- .../AMDGPU/tid-mul-func-xnack-any-on-2.ll | 4 +- .../CodeGen/AMDGPU/tid-one-func-xnack-any.ll | 4 +- .../tid-one-func-xnack-not-supported.ll | 4 +- .../CodeGen/AMDGPU/tid-one-func-xnack-off.ll | 4 +- .../CodeGen/AMDGPU/tid-one-func-xnack-on.ll | 4 +- llvm/test/MC/AMDGPU/elf-header-cov.s | 24 ++++ llvm/test/MC/AMDGPU/hsa-exp.s | 7 +- llvm/test/MC/AMDGPU/hsa-gfx12-v4.s | 7 +- llvm/test/MC/AMDGPU/hsa-v4.s | 7 +- .../MC/AMDGPU/hsa-v5-uses-dynamic-stack.s | 7 +- llvm/test/MC/AMDGPU/hsa_isa_version_attrs.s | 6 - 41 files changed, 217 insertions(+), 407 deletions(-) create mode 100644 llvm/test/MC/AMDGPU/elf-header-cov.s delete mode 100644 llvm/test/MC/AMDGPU/hsa_isa_version_attrs.s diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 768358c345f0a..d2d622ee52017 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -15428,6 +15428,14 @@ command-line options such as ``-triple``, ``-mcpu``, and The target ID syntax used for code object V2 to V3 for this directive differs from that used elsewhere. See :ref:`amdgpu-target-id-v2-v3`. +.. _amdgpu-assembler-directive-amdhsa-code-object-version: + +.amdhsa_code_object_version ++++++++++++++++++++++++++++++++++++++ + +Optional directive which declares the code object version to be generated by the +assembler. If not present, a default value will be used. + .amdhsa_kernel +++++++++++++++++++++ diff --git a/llvm/include/llvm/MC/MCObjectWriter.h b/llvm/include/llvm/MC/MCObjectWriter.h index 8c10452373933..e66bb2c932dd0 100644 --- a/llvm/include/llvm/MC/MCObjectWriter.h +++ b/llvm/include/llvm/MC/MCObjectWriter.h @@ -92,6 +92,9 @@ class MCObjectWriter { /// ELF only. Mark that we have seen GNU ABI usage (e.g. SHF_GNU_RETAIN). 
virtual void markGnuAbi() {} + /// ELF only, override the default ABIVersion in the ELF header. + virtual void setOverrideABIVersion(uint8_t ABIVersion) {} + /// Tell the object writer to emit an address-significance table during /// writeObject(). If this function is not called, all symbols are treated as /// address-significant. diff --git a/llvm/include/llvm/Support/AMDGPUMetadata.h b/llvm/include/llvm/Support/AMDGPUMetadata.h index e0838a1f425ea..2dae6feac0889 100644 --- a/llvm/include/llvm/Support/AMDGPUMetadata.h +++ b/llvm/include/llvm/Support/AMDGPUMetadata.h @@ -29,11 +29,6 @@ namespace AMDGPU { //===----------------------------------------------------------------------===// namespace HSAMD { -/// HSA metadata major version for code object V2. -constexpr uint32_t VersionMajorV2 = 1; -/// HSA metadata minor version for code object V2. -constexpr uint32_t VersionMinorV2 = 0; - /// HSA metadata major version for code object V3. constexpr uint32_t VersionMajorV3 = 1; /// HSA metadata minor version for code object V3. @@ -49,10 +44,9 @@ constexpr uint32_t VersionMajorV5 = 1; /// HSA metadata minor version for code object V5. constexpr uint32_t VersionMinorV5 = 2; -/// HSA metadata beginning assembler directive. +/// Old HSA metadata beginning assembler directive for V2. This is only used for +/// diagnostics now. constexpr char AssemblerDirectiveBegin[] = ".amd_amdgpu_hsa_metadata"; -/// HSA metadata ending assembler directive. -constexpr char AssemblerDirectiveEnd[] = ".end_amd_amdgpu_hsa_metadata"; /// Access qualifiers. enum class AccessQualifier : uint8_t { diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp index cb8af1aa99551..531d29954c382 100644 --- a/llvm/lib/MC/ELFObjectWriter.cpp +++ b/llvm/lib/MC/ELFObjectWriter.cpp @@ -226,6 +226,8 @@ class ELFObjectWriter : public MCObjectWriter { bool SeenGnuAbi = false; + std::optional OverrideABIVersion; + bool hasRelocationAddend() const; bool shouldRelocateWithSymbol(const MCAssembler &Asm, const MCValue &Val, @@ -238,6 +240,7 @@ class ELFObjectWriter : public MCObjectWriter { void reset() override { SeenGnuAbi = false; + OverrideABIVersion.reset(); Relocations.clear(); Renames.clear(); MCObjectWriter::reset(); @@ -264,6 +267,10 @@ class ELFObjectWriter : public MCObjectWriter { void markGnuAbi() override { SeenGnuAbi = true; } bool seenGnuAbi() const { return SeenGnuAbi; } + bool seenOverrideABIVersion() const { return OverrideABIVersion.has_value(); } + uint8_t getOverrideABIVersion() const { return OverrideABIVersion.value(); } + void setOverrideABIVersion(uint8_t V) override { OverrideABIVersion = V; } + friend struct ELFWriter; }; @@ -417,7 +424,9 @@ void ELFWriter::writeHeader(const MCAssembler &Asm) { ? int(ELF::ELFOSABI_GNU) : OSABI); // e_ident[EI_ABIVERSION] - W.OS << char(OWriter.TargetObjectWriter->getABIVersion()); + W.OS << char(OWriter.seenOverrideABIVersion() + ? 
OWriter.getOverrideABIVersion() + : OWriter.TargetObjectWriter->getABIVersion()); W.OS.write_zeros(ELF::EI_NIDENT - ELF::EI_PAD); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp index 10f7e7a26edb4..279ef8ca2751a 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp @@ -123,8 +123,11 @@ void AMDGPUAsmPrinter::initTargetStreamer(Module &M) { getTargetStreamer()->EmitDirectiveAMDGCNTarget(); - if (TM.getTargetTriple().getOS() == Triple::AMDHSA) + if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { + getTargetStreamer()->EmitDirectiveAMDHSACodeObjectVersion( + CodeObjectVersion); HSAMetadataStream->begin(M, *getTargetStreamer()->getTargetID()); + } if (TM.getTargetTriple().getOS() == Triple::AMDPAL) getTargetStreamer()->getPALMetadata()->readFromIR(M); @@ -230,8 +233,7 @@ void AMDGPUAsmPrinter::emitFunctionBodyEnd() { IsaInfo::getNumExtraSGPRs( &STM, CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, getTargetStreamer()->getTargetID()->isXnackOnOrAny()), - CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed, - CodeObjectVersion); + CurrentProgramInfo.VCCUsed, CurrentProgramInfo.FlatUsed); Streamer.popSection(); } @@ -323,7 +325,7 @@ void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) { } bool AMDGPUAsmPrinter::doInitialization(Module &M) { - CodeObjectVersion = AMDGPU::getCodeObjectVersion(M); + CodeObjectVersion = AMDGPU::getAMDHSACodeObjectVersion(M); if (TM.getTargetTriple().getOS() == Triple::AMDHSA) { switch (CodeObjectVersion) { @@ -631,8 +633,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) { void AMDGPUAsmPrinter::initializeTargetID(const Module &M) { // In the beginning all features are either 'Any' or 'NotSupported', // depending on global target features. This will cover empty modules. - getTargetStreamer()->initializeTargetID( - *getGlobalSTI(), getGlobalSTI()->getFeatureString(), CodeObjectVersion); + getTargetStreamer()->initializeTargetID(*getGlobalSTI(), + getGlobalSTI()->getFeatureString()); // If module is empty, we are done. 
if (M.empty()) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp index 5fd9e571282db..d7f5110427ece 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUAttributor.cpp @@ -144,7 +144,7 @@ class AMDGPUInformationCache : public InformationCache { BumpPtrAllocator &Allocator, SetVector *CGSCC, TargetMachine &TM) : InformationCache(M, AG, Allocator, CGSCC), TM(TM), - CodeObjectVersion(AMDGPU::getCodeObjectVersion(M)) {} + CodeObjectVersion(AMDGPU::getAMDHSACodeObjectVersion(M)) {} TargetMachine &TM; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp index d3b2cb1936b53..6d05c3678bf09 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp @@ -474,7 +474,7 @@ static void allocateHSAUserSGPRs(CCState &CCInfo, const Module *M = MF.getFunction().getParent(); if (UserSGPRInfo.hasQueuePtr() && - AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { + AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp index 74e9cd7d09654..186fa58524b9f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp @@ -532,7 +532,8 @@ void MetadataStreamerMsgPackV4::emitKernel(const MachineFunction &MF, Func.getCallingConv() != CallingConv::SPIR_KERNEL) return; - auto CodeObjectVersion = AMDGPU::getCodeObjectVersion(*Func.getParent()); + auto CodeObjectVersion = + AMDGPU::getAMDHSACodeObjectVersion(*Func.getParent()); auto Kern = getHSAKernelProps(MF, ProgramInfo, CodeObjectVersion); auto Kernels = diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 0040cd7c3f1a1..8e74d4c0e9459 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -2139,7 +2139,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture( LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64)); // For code object version 5, private_base and shared_base are passed through // implicit kernargs. - if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= + if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= AMDGPU::AMDHSA_COV5) { AMDGPUTargetLowering::ImplicitParameter Param = AS == AMDGPUAS::LOCAL_ADDRESS ? AMDGPUTargetLowering::SHARED_BASE @@ -6582,7 +6582,7 @@ bool AMDGPULegalizerInfo::legalizeTrapHsaQueuePtr( Register SGPR01(AMDGPU::SGPR0_SGPR1); // For code object version 5, queue_ptr is passed through implicit kernarg. - if (AMDGPU::getCodeObjectVersion(*MF.getFunction().getParent()) >= + if (AMDGPU::getAMDHSACodeObjectVersion(*MF.getFunction().getParent()) >= AMDGPU::AMDHSA_COV5) { AMDGPUTargetLowering::ImplicitParameter Param = AMDGPUTargetLowering::QUEUE_PTR; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index 097722157d410..bf7f67c086f2c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -323,7 +323,8 @@ static bool processUse(CallInst *CI, bool IsV5OrAbove) { // TargetPassConfig for subtarget. 
bool AMDGPULowerKernelAttributes::runOnModule(Module &M) { bool MadeChange = false; - bool IsV5OrAbove = AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5; + bool IsV5OrAbove = + AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5; Function *BasePtr = getBasePtrIntrinsic(M, IsV5OrAbove); if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. @@ -356,7 +357,7 @@ ModulePass *llvm::createAMDGPULowerKernelAttributesPass() { PreservedAnalyses AMDGPULowerKernelAttributesPass::run(Function &F, FunctionAnalysisManager &AM) { bool IsV5OrAbove = - AMDGPU::getCodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5; + AMDGPU::getAMDHSACodeObjectVersion(*F.getParent()) >= AMDGPU::AMDHSA_COV5; Function *BasePtr = getBasePtrIntrinsic(*F.getParent(), IsV5OrAbove); if (!BasePtr) // ImplicitArgPtr/DispatchPtr not used. diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index fc47b02c98e03..0c759e7f3b095 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -112,7 +112,7 @@ bool AMDGPUResourceUsageAnalysis::runOnModule(Module &M) { // By default, for code object v5 and later, track only the minimum scratch // size - if (AMDGPU::getCodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 || + if (AMDGPU::getAMDHSACodeObjectVersion(M) >= AMDGPU::AMDHSA_COV5 || STI.getTargetTriple().getOS() == Triple::AMDPAL) { if (!AssumedStackSizeForDynamicSizeObjects.getNumOccurrences()) AssumedStackSizeForDynamicSizeObjects = 0; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index f19c576685640..bcc7dedf32296 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -571,7 +571,7 @@ unsigned AMDGPUSubtarget::getImplicitArgNumBytes(const Function &F) const { // Assume all implicit inputs are used by default const Module *M = F.getParent(); unsigned NBytes = - AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56; + AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5 ? 256 : 56; return F.getFnAttributeAsParsedInteger("amdgpu-implicitarg-num-bytes", NBytes); } diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index bd68054589b11..66267c9255f41 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -1303,10 +1303,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { unsigned NextFreeSGPR, SMRange SGPRRange, unsigned &VGPRBlocks, unsigned &SGPRBlocks); bool ParseDirectiveAMDGCNTarget(); + bool ParseDirectiveAMDHSACodeObjectVersion(); bool ParseDirectiveAMDHSAKernel(); - bool ParseDirectiveMajorMinor(uint32_t &Major, uint32_t &Minor); - bool ParseDirectiveHSACodeObjectVersion(); - bool ParseDirectiveHSACodeObjectISA(); bool ParseAMDKernelCodeTValue(StringRef ID, amd_kernel_code_t &Header); bool ParseDirectiveAMDKernelCodeT(); // TODO: Possibly make subtargetHasRegister const. 
@@ -5133,20 +5131,6 @@ bool AMDGPUAsmParser::ParseAsAbsoluteExpression(uint32_t &Ret) { return false; } -bool AMDGPUAsmParser::ParseDirectiveMajorMinor(uint32_t &Major, - uint32_t &Minor) { - if (ParseAsAbsoluteExpression(Major)) - return TokError("invalid major version"); - - if (!trySkipToken(AsmToken::Comma)) - return TokError("minor version number required, comma expected"); - - if (ParseAsAbsoluteExpression(Minor)) - return TokError("invalid minor version"); - - return false; -} - bool AMDGPUAsmParser::ParseDirectiveAMDGCNTarget() { if (getSTI().getTargetTriple().getArch() != Triple::amdgcn) return TokError("directive only supported for amdgcn architecture"); @@ -5612,63 +5596,18 @@ bool AMDGPUAsmParser::ParseDirectiveAMDHSAKernel() { } } - getTargetStreamer().EmitAmdhsaKernelDescriptor( - getSTI(), KernelName, KD, NextFreeVGPR, NextFreeSGPR, ReserveVCC, - ReserveFlatScr, AMDGPU::getAmdhsaCodeObjectVersion()); - return false; -} - -bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectVersion() { - uint32_t Major; - uint32_t Minor; - - if (ParseDirectiveMajorMinor(Major, Minor)) - return true; - - getTargetStreamer().EmitDirectiveHSACodeObjectVersion(Major, Minor); + getTargetStreamer().EmitAmdhsaKernelDescriptor(getSTI(), KernelName, KD, + NextFreeVGPR, NextFreeSGPR, + ReserveVCC, ReserveFlatScr); return false; } -bool AMDGPUAsmParser::ParseDirectiveHSACodeObjectISA() { - uint32_t Major; - uint32_t Minor; - uint32_t Stepping; - StringRef VendorName; - StringRef ArchName; - - // If this directive has no arguments, then use the ISA version for the - // targeted GPU. - if (isToken(AsmToken::EndOfStatement)) { - AMDGPU::IsaVersion ISA = AMDGPU::getIsaVersion(getSTI().getCPU()); - getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(ISA.Major, ISA.Minor, - ISA.Stepping, - "AMD", "AMDGPU"); - return false; - } - - if (ParseDirectiveMajorMinor(Major, Minor)) - return true; - - if (!trySkipToken(AsmToken::Comma)) - return TokError("stepping version number required, comma expected"); - - if (ParseAsAbsoluteExpression(Stepping)) - return TokError("invalid stepping version"); - - if (!trySkipToken(AsmToken::Comma)) - return TokError("vendor name required, comma expected"); - - if (!parseString(VendorName, "invalid vendor name")) - return true; - - if (!trySkipToken(AsmToken::Comma)) - return TokError("arch name required, comma expected"); - - if (!parseString(ArchName, "invalid arch name")) +bool AMDGPUAsmParser::ParseDirectiveAMDHSACodeObjectVersion() { + uint32_t Version; + if (ParseAsAbsoluteExpression(Version)) return true; - getTargetStreamer().EmitDirectiveHSACodeObjectISAV2(Major, Minor, Stepping, - VendorName, ArchName); + getTargetStreamer().EmitDirectiveAMDHSACodeObjectVersion(Version); return false; } @@ -5955,16 +5894,13 @@ bool AMDGPUAsmParser::ParseDirective(AsmToken DirectiveID) { if (IDVal == ".amdhsa_kernel") return ParseDirectiveAMDHSAKernel(); + if (IDVal == ".amdhsa_code_object_version") + return ParseDirectiveAMDHSACodeObjectVersion(); + // TODO: Restructure/combine with PAL metadata directive. 
if (IDVal == AMDGPU::HSAMD::V3::AssemblerDirectiveBegin) return ParseDirectiveHSAMetadata(); } else { - if (IDVal == ".hsa_code_object_version") - return ParseDirectiveHSACodeObjectVersion(); - - if (IDVal == ".hsa_code_object_isa") - return ParseDirectiveHSACodeObjectISA(); - if (IDVal == ".amd_kernel_code_t") return ParseDirectiveAMDKernelCodeT(); @@ -8137,9 +8073,8 @@ void AMDGPUAsmParser::onBeginOfFile() { return; if (!getTargetStreamer().getTargetID()) - getTargetStreamer().initializeTargetID(getSTI(), getSTI().getFeatureString(), - // TODO: Should try to check code object version from directive??? - AMDGPU::getAmdhsaCodeObjectVersion()); + getTargetStreamer().initializeTargetID(getSTI(), + getSTI().getFeatureString()); if (isHsaAbi(getSTI())) getTargetStreamer().EmitDirectiveAMDGCNTarget(); diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 441032a37dfd9..86096b0d80b42 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -2184,7 +2184,8 @@ AMDGPUDisassembler::decodeKernelDescriptorDirective( KERNEL_CODE_PROPERTY_ENABLE_WAVEFRONT_SIZE32); } - if (AMDGPU::getAmdhsaCodeObjectVersion() >= AMDGPU::AMDHSA_COV5) + // FIXME: We should be looking at the ELF header ABI version for this. + if (AMDGPU::getDefaultAMDHSACodeObjectVersion() >= AMDGPU::AMDHSA_COV5) PRINT_DIRECTIVE(".amdhsa_uses_dynamic_stack", KERNEL_CODE_PROPERTY_USES_DYNAMIC_STACK); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp index f91f36ed851b7..8eb246ef57c95 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp @@ -232,13 +232,11 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { bool Is64Bit; bool HasRelocationAddend; uint8_t OSABI = ELF::ELFOSABI_NONE; - uint8_t ABIVersion = 0; public: - ELFAMDGPUAsmBackend(const Target &T, const Triple &TT, uint8_t ABIVersion) : - AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn), - HasRelocationAddend(TT.getOS() == Triple::AMDHSA), - ABIVersion(ABIVersion) { + ELFAMDGPUAsmBackend(const Target &T, const Triple &TT) + : AMDGPUAsmBackend(T), Is64Bit(TT.getArch() == Triple::amdgcn), + HasRelocationAddend(TT.getOS() == Triple::AMDHSA) { switch (TT.getOS()) { case Triple::AMDHSA: OSABI = ELF::ELFOSABI_AMDGPU_HSA; @@ -256,8 +254,7 @@ class ELFAMDGPUAsmBackend : public AMDGPUAsmBackend { std::unique_ptr createObjectTargetWriter() const override { - return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend, - ABIVersion); + return createAMDGPUELFObjectWriter(Is64Bit, OSABI, HasRelocationAddend); } }; @@ -267,6 +264,5 @@ MCAsmBackend *llvm::createAMDGPUAsmBackend(const Target &T, const MCSubtargetInfo &STI, const MCRegisterInfo &MRI, const MCTargetOptions &Options) { - return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple(), - getHsaAbiVersion(&STI).value_or(0)); + return new ELFAMDGPUAsmBackend(T, STI.getTargetTriple()); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp index 58eed81e07556..2d960a32339f4 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp @@ -18,8 +18,7 @@ namespace { class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { public: - 
AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend, - uint8_t ABIVersion); + AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, bool HasRelocationAddend); protected: unsigned getRelocType(MCContext &Ctx, const MCValue &Target, @@ -29,12 +28,10 @@ class AMDGPUELFObjectWriter : public MCELFObjectTargetWriter { } // end anonymous namespace -AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit, - uint8_t OSABI, - bool HasRelocationAddend, - uint8_t ABIVersion) - : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU, - HasRelocationAddend, ABIVersion) {} +AMDGPUELFObjectWriter::AMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, + bool HasRelocationAddend) + : MCELFObjectTargetWriter(Is64Bit, OSABI, ELF::EM_AMDGPU, + HasRelocationAddend) {} unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target, @@ -100,9 +97,7 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx, std::unique_ptr llvm::createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend, - uint8_t ABIVersion) { + bool HasRelocationAddend) { return std::make_unique(Is64Bit, OSABI, - HasRelocationAddend, - ABIVersion); + HasRelocationAddend); } diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h index 006115ba14fc1..3ef00f75735b0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h @@ -42,8 +42,8 @@ MCAsmBackend *createAMDGPUAsmBackend(const Target &T, std::unique_ptr createAMDGPUELFObjectWriter(bool Is64Bit, uint8_t OSABI, - bool HasRelocationAddend, uint8_t ABIVersion); -} // End llvm namespace + bool HasRelocationAddend); +} // namespace llvm #define GET_REGINFO_ENUM #include "AMDGPUGenRegisterInfo.inc" diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp index e135a4e25dd15..d7e8ab76d5ffe 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp @@ -20,6 +20,7 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFStreamer.h" +#include "llvm/MC/MCObjectWriter.h" #include "llvm/MC/MCSectionELF.h" #include "llvm/MC/MCSubtargetInfo.h" #include "llvm/Support/AMDGPUMetadata.h" @@ -35,27 +36,6 @@ using namespace llvm::AMDGPU; // AMDGPUTargetStreamer //===----------------------------------------------------------------------===// -static void convertIsaVersionV2(uint32_t &Major, uint32_t &Minor, - uint32_t &Stepping, bool Sramecc, bool Xnack) { - if (Major == 9 && Minor == 0) { - switch (Stepping) { - case 0: - case 2: - case 4: - case 6: - if (Xnack) - Stepping++; - } - } -} - -bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) { - HSAMD::Metadata HSAMetadata; - if (HSAMD::fromString(HSAMetadataString, HSAMetadata)) - return false; - return EmitHSAMetadata(HSAMetadata); -} - bool AMDGPUTargetStreamer::EmitHSAMetadataV3(StringRef HSAMetadataString) { msgpack::Document HSAMetadataDoc; if (!HSAMetadataDoc.fromYAML(HSAMetadataString)) @@ -238,21 +218,10 @@ void AMDGPUTargetAsmStreamer::EmitDirectiveAMDGCNTarget() { OS << "\t.amdgcn_target \"" << getTargetID()->toString() << "\"\n"; } -void AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectVersion( - uint32_t Major, uint32_t Minor) { - OS << "\t.hsa_code_object_version " << - Twine(Major) << "," << Twine(Minor) << '\n'; -} - 
-void -AMDGPUTargetAsmStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major, - uint32_t Minor, - uint32_t Stepping, - StringRef VendorName, - StringRef ArchName) { - convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny()); - OS << "\t.hsa_code_object_isa " << Twine(Major) << "," << Twine(Minor) << "," - << Twine(Stepping) << ",\"" << VendorName << "\",\"" << ArchName << "\"\n"; +void AMDGPUTargetAsmStreamer::EmitDirectiveAMDHSACodeObjectVersion( + unsigned COV) { + AMDGPUTargetStreamer::EmitDirectiveAMDHSACodeObjectVersion(COV); + OS << "\t.amdhsa_code_object_version " << COV << '\n'; } void @@ -283,18 +252,6 @@ bool AMDGPUTargetAsmStreamer::EmitISAVersion() { return true; } -bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( - const AMDGPU::HSAMD::Metadata &HSAMetadata) { - std::string HSAMetadataString; - if (HSAMD::toString(HSAMetadata, HSAMetadataString)) - return false; - - OS << '\t' << HSAMD::AssemblerDirectiveBegin << '\n'; - OS << HSAMetadataString << '\n'; - OS << '\t' << HSAMD::AssemblerDirectiveEnd << '\n'; - return true; -} - bool AMDGPUTargetAsmStreamer::EmitHSAMetadata( msgpack::Document &HSAMetadataDoc, bool Strict) { HSAMD::V3::MetadataVerifier Verifier(Strict); @@ -336,7 +293,7 @@ bool AMDGPUTargetAsmStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KD, uint64_t NextVGPR, uint64_t NextSGPR, - bool ReserveVCC, bool ReserveFlatScr, unsigned CodeObjectVersion) { + bool ReserveVCC, bool ReserveFlatScr) { IsaVersion IVersion = getIsaVersion(STI.getCPU()); OS << "\t.amdhsa_kernel " << KernelName << '\n'; @@ -529,6 +486,8 @@ MCELFStreamer &AMDGPUTargetELFStreamer::getStreamer() { void AMDGPUTargetELFStreamer::finish() { MCAssembler &MCA = getStreamer().getAssembler(); MCA.setELFHeaderEFlags(getEFlags()); + MCA.getWriter().setOverrideABIVersion( + getELFABIVersion(STI.getTargetTriple(), CodeObjectVersion)); std::string Blob; const char *Vendor = getPALMetadata()->getVendor(); @@ -616,17 +575,7 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsUnknownOS() { unsigned AMDGPUTargetELFStreamer::getEFlagsAMDHSA() { assert(isHsaAbi(STI)); - if (std::optional HsaAbiVer = getHsaAbiVersion(&STI)) { - switch (*HsaAbiVer) { - case ELF::ELFABIVERSION_AMDGPU_HSA_V3: - return getEFlagsV3(); - case ELF::ELFABIVERSION_AMDGPU_HSA_V4: - case ELF::ELFABIVERSION_AMDGPU_HSA_V5: - return getEFlagsV4(); - } - } - - llvm_unreachable("HSA OS ABI Version identification must be defined"); + return getEFlagsV4(); } unsigned AMDGPUTargetELFStreamer::getEFlagsAMDPAL() { @@ -699,44 +648,6 @@ unsigned AMDGPUTargetELFStreamer::getEFlagsV4() { void AMDGPUTargetELFStreamer::EmitDirectiveAMDGCNTarget() {} -void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion( - uint32_t Major, uint32_t Minor) { - - EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()), - ELF::NT_AMD_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) { - OS.emitInt32(Major); - OS.emitInt32(Minor); - }); -} - -void -AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISAV2(uint32_t Major, - uint32_t Minor, - uint32_t Stepping, - StringRef VendorName, - StringRef ArchName) { - uint16_t VendorNameSize = VendorName.size() + 1; - uint16_t ArchNameSize = ArchName.size() + 1; - - unsigned DescSZ = sizeof(VendorNameSize) + sizeof(ArchNameSize) + - sizeof(Major) + sizeof(Minor) + sizeof(Stepping) + - VendorNameSize + ArchNameSize; - - 
convertIsaVersionV2(Major, Minor, Stepping, TargetID->isSramEccOnOrAny(), TargetID->isXnackOnOrAny()); - EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()), - ELF::NT_AMD_HSA_ISA_VERSION, [&](MCELFStreamer &OS) { - OS.emitInt16(VendorNameSize); - OS.emitInt16(ArchNameSize); - OS.emitInt32(Major); - OS.emitInt32(Minor); - OS.emitInt32(Stepping); - OS.emitBytes(VendorName); - OS.emitInt8(0); // NULL terminate VendorName - OS.emitBytes(ArchName); - OS.emitInt8(0); // NULL terminate ArchName - }); -} - void AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) { @@ -818,30 +729,6 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc, return true; } -bool AMDGPUTargetELFStreamer::EmitHSAMetadata( - const AMDGPU::HSAMD::Metadata &HSAMetadata) { - std::string HSAMetadataString; - if (HSAMD::toString(HSAMetadata, HSAMetadataString)) - return false; - - // Create two labels to mark the beginning and end of the desc field - // and a MCExpr to calculate the size of the desc field. - auto &Context = getContext(); - auto *DescBegin = Context.createTempSymbol(); - auto *DescEnd = Context.createTempSymbol(); - auto *DescSZ = MCBinaryExpr::createSub( - MCSymbolRefExpr::create(DescEnd, Context), - MCSymbolRefExpr::create(DescBegin, Context), Context); - - EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_HSA_METADATA, - [&](MCELFStreamer &OS) { - OS.emitLabel(DescBegin); - OS.emitBytes(HSAMetadataString); - OS.emitLabel(DescEnd); - }); - return true; -} - bool AMDGPUTargetAsmStreamer::EmitKernargPreloadHeader( const MCSubtargetInfo &STI) { for (int i = 0; i < 64; ++i) { @@ -889,8 +776,7 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd(const MCSubtargetInfo &STI) { void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - unsigned CodeObjectVersion) { + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) { auto &Streamer = getStreamer(); auto &Context = Streamer.getContext(); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h index 55b5246c92100..7f8ddc42b2eef 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h @@ -37,23 +37,24 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { protected: // TODO: Move HSAMetadataStream to AMDGPUTargetStreamer. std::optional TargetID; + unsigned CodeObjectVersion; MCContext &getContext() const { return Streamer.getContext(); } public: - AMDGPUTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {} + AMDGPUTargetStreamer(MCStreamer &S) + : MCTargetStreamer(S), + // Assume the default COV for now, EmitDirectiveAMDHSACodeObjectVersion + // will update this if it is encountered. 
+ CodeObjectVersion(AMDGPU::getDefaultAMDHSACodeObjectVersion()) {} AMDGPUPALMetadata *getPALMetadata() { return &PALMetadata; } virtual void EmitDirectiveAMDGCNTarget(){}; - virtual void EmitDirectiveHSACodeObjectVersion(uint32_t Major, - uint32_t Minor){}; - - virtual void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor, - uint32_t Stepping, - StringRef VendorName, - StringRef ArchName){}; + virtual void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV) { + CodeObjectVersion = COV; + } virtual void EmitAMDKernelCodeT(const amd_kernel_code_t &Header){}; @@ -65,9 +66,6 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { /// \returns True on success, false on failure. virtual bool EmitISAVersion() { return true; } - /// \returns True on success, false on failure. - virtual bool EmitHSAMetadataV2(StringRef HSAMetadataString); - /// \returns True on success, false on failure. virtual bool EmitHSAMetadataV3(StringRef HSAMetadataString); @@ -98,8 +96,7 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { virtual void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - unsigned CodeObjectVersion){}; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) {} static StringRef getArchNameFromElfMach(unsigned ElfMach); static unsigned getElfMach(StringRef GPU); @@ -110,15 +107,12 @@ class AMDGPUTargetStreamer : public MCTargetStreamer { std::optional &getTargetID() { return TargetID; } - void initializeTargetID(const MCSubtargetInfo &STI, - unsigned CodeObjectVersion) { + void initializeTargetID(const MCSubtargetInfo &STI) { assert(TargetID == std::nullopt && "TargetID can only be initialized once"); TargetID.emplace(STI); - getTargetID()->setCodeObjectVersion(CodeObjectVersion); } - void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString, - unsigned CodeObjectVersion) { - initializeTargetID(STI, CodeObjectVersion); + void initializeTargetID(const MCSubtargetInfo &STI, StringRef FeatureString) { + initializeTargetID(STI); assert(getTargetID() != std::nullopt && "TargetID is None"); getTargetID()->setTargetIDFromFeaturesString(FeatureString); @@ -134,12 +128,7 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { void EmitDirectiveAMDGCNTarget() override; - void EmitDirectiveHSACodeObjectVersion(uint32_t Major, - uint32_t Minor) override; - - void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor, - uint32_t Stepping, StringRef VendorName, - StringRef ArchName) override; + void EmitDirectiveAMDHSACodeObjectVersion(unsigned COV) override; void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; @@ -153,9 +142,6 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { /// \returns True on success, false on failure. bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override; - /// \returns True on success, false on failure. - bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; - /// \returns True on success, false on failure. 
bool EmitCodeEnd(const MCSubtargetInfo &STI) override; @@ -165,8 +151,7 @@ class AMDGPUTargetAsmStreamer final : public AMDGPUTargetStreamer { void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - unsigned CodeObjectVersion) override; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override; }; class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { @@ -198,13 +183,6 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { void EmitDirectiveAMDGCNTarget() override; - void EmitDirectiveHSACodeObjectVersion(uint32_t Major, - uint32_t Minor) override; - - void EmitDirectiveHSACodeObjectISAV2(uint32_t Major, uint32_t Minor, - uint32_t Stepping, StringRef VendorName, - StringRef ArchName) override; - void EmitAMDKernelCodeT(const amd_kernel_code_t &Header) override; void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override; @@ -217,9 +195,6 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { /// \returns True on success, false on failure. bool EmitHSAMetadata(msgpack::Document &HSAMetadata, bool Strict) override; - /// \returns True on success, false on failure. - bool EmitHSAMetadata(const AMDGPU::HSAMD::Metadata &HSAMetadata) override; - /// \returns True on success, false on failure. bool EmitCodeEnd(const MCSubtargetInfo &STI) override; @@ -229,9 +204,7 @@ class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer { void EmitAmdhsaKernelDescriptor( const MCSubtargetInfo &STI, StringRef KernelName, const amdhsa::kernel_descriptor_t &KernelDescriptor, uint64_t NextVGPR, - uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr, - unsigned CodeObjectVersion) override; + uint64_t NextSGPR, bool ReserveVCC, bool ReserveFlatScr) override; }; - } #endif diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index cc0c4d4e36eaa..073c8cc721173 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2302,7 +2302,7 @@ void SITargetLowering::allocateSpecialInputSGPRs( const Module *M = MF.getFunction().getParent(); if (UserSGPRInfo.hasQueuePtr() && - AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) + AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) allocateSGPR64Input(CCInfo, ArgInfo.QueuePtr); // Implicit arg ptr takes the place of the kernarg segment pointer. This is a @@ -2355,7 +2355,7 @@ void SITargetLowering::allocateHSAUserSGPRs(CCState &CCInfo, const Module *M = MF.getFunction().getParent(); if (UserSGPRInfo.hasQueuePtr() && - AMDGPU::getCodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { + AMDGPU::getAMDHSACodeObjectVersion(*M) < AMDGPU::AMDHSA_COV5) { Register QueuePtrReg = Info.addQueuePtr(TRI); MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass); CCInfo.AllocateReg(QueuePtrReg); @@ -6438,7 +6438,7 @@ SDValue SITargetLowering::lowerTrapHsaQueuePtr( SDValue QueuePtr; // For code object version 5, QueuePtr is passed through implicit kernarg. 
const Module *M = DAG.getMachineFunction().getFunction().getParent(); - if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { + if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { QueuePtr = loadImplicitKernelArgument(DAG, MVT::i64, SL, Align(8), QUEUE_PTR); } else { @@ -6542,7 +6542,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL, // For code object version 5, private_base and shared_base are passed through // implicit kernargs. const Module *M = DAG.getMachineFunction().getFunction().getParent(); - if (AMDGPU::getCodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { + if (AMDGPU::getAMDHSACodeObjectVersion(*M) >= AMDGPU::AMDHSA_COV5) { ImplicitParameter Param = (AS == AMDGPUAS::LOCAL_ADDRESS) ? SHARED_BASE : PRIVATE_BASE; return loadImplicitKernelArgument(DAG, MVT::i32, DL, Align(4), Param); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index b4f7fc456f0bd..f1c05446bf606 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -31,10 +31,11 @@ #define GET_INSTRMAP_INFO #include "AMDGPUGenInstrInfo.inc" -static llvm::cl::opt - AmdhsaCodeObjectVersion("amdhsa-code-object-version", llvm::cl::Hidden, - llvm::cl::desc("AMDHSA Code Object Version"), - llvm::cl::init(4)); +static llvm::cl::opt DefaultAMDHSACodeObjectVersion( + "amdhsa-code-object-version", llvm::cl::Hidden, + llvm::cl::init(llvm::AMDGPU::AMDHSA_COV4), + llvm::cl::desc("Set default AMDHSA Code Object Version (module flag " + "or asm directive still take priority if present)")); namespace { @@ -161,45 +162,32 @@ bool isHsaAbi(const MCSubtargetInfo &STI) { return STI.getTargetTriple().getOS() == Triple::AMDHSA; } -std::optional getHsaAbiVersion(const MCSubtargetInfo *STI) { - if (STI && STI->getTargetTriple().getOS() != Triple::AMDHSA) - return std::nullopt; - - switch (AmdhsaCodeObjectVersion) { - case 4: - return ELF::ELFABIVERSION_AMDGPU_HSA_V4; - case 5: - return ELF::ELFABIVERSION_AMDGPU_HSA_V5; - default: - report_fatal_error(Twine("Unsupported AMDHSA Code Object Version ") + - Twine(AmdhsaCodeObjectVersion)); +unsigned getAMDHSACodeObjectVersion(const Module &M) { + if (auto Ver = mdconst::extract_or_null( + M.getModuleFlag("amdgpu_code_object_version"))) { + return (unsigned)Ver->getZExtValue() / 100; } -} -bool isHsaAbiVersion4(const MCSubtargetInfo *STI) { - if (std::optional HsaAbiVer = getHsaAbiVersion(STI)) - return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V4; - return false; + return getDefaultAMDHSACodeObjectVersion(); } -bool isHsaAbiVersion5(const MCSubtargetInfo *STI) { - if (std::optional HsaAbiVer = getHsaAbiVersion(STI)) - return *HsaAbiVer == ELF::ELFABIVERSION_AMDGPU_HSA_V5; - return false; +unsigned getDefaultAMDHSACodeObjectVersion() { + return DefaultAMDHSACodeObjectVersion; } -unsigned getAmdhsaCodeObjectVersion() { - return AmdhsaCodeObjectVersion; -} +uint8_t getELFABIVersion(const Triple &T, unsigned CodeObjectVersion) { + if (T.getOS() != Triple::AMDHSA) + return 0; -unsigned getCodeObjectVersion(const Module &M) { - if (auto Ver = mdconst::extract_or_null( - M.getModuleFlag("amdgpu_code_object_version"))) { - return (unsigned)Ver->getZExtValue() / 100; + switch (CodeObjectVersion) { + case 4: + return ELF::ELFABIVERSION_AMDGPU_HSA_V4; + case 5: + return ELF::ELFABIVERSION_AMDGPU_HSA_V5; + default: + report_fatal_error("Unsupported AMDHSA Code Object Version " + + Twine(CodeObjectVersion)); } - - // Default code object 
version. - return AMDHSA_COV4; } unsigned getMultigridSyncArgImplicitArgPosition(unsigned CodeObjectVersion) { @@ -705,7 +693,7 @@ namespace IsaInfo { AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) : STI(STI), XnackSetting(TargetIDSetting::Any), - SramEccSetting(TargetIDSetting::Any), CodeObjectVersion(0) { + SramEccSetting(TargetIDSetting::Any) { if (!STI.getFeatureBits().test(FeatureSupportsXNACK)) XnackSetting = TargetIDSetting::Unsupported; if (!STI.getFeatureBits().test(FeatureSupportsSRAMECC)) @@ -817,23 +805,16 @@ std::string AMDGPUTargetID::toString() const { std::string Features; if (STI.getTargetTriple().getOS() == Triple::AMDHSA) { - switch (CodeObjectVersion) { - case AMDGPU::AMDHSA_COV4: - case AMDGPU::AMDHSA_COV5: - // sramecc. - if (getSramEccSetting() == TargetIDSetting::Off) - Features += ":sramecc-"; - else if (getSramEccSetting() == TargetIDSetting::On) - Features += ":sramecc+"; - // xnack. - if (getXnackSetting() == TargetIDSetting::Off) - Features += ":xnack-"; - else if (getXnackSetting() == TargetIDSetting::On) - Features += ":xnack+"; - break; - default: - break; - } + // sramecc. + if (getSramEccSetting() == TargetIDSetting::Off) + Features += ":sramecc-"; + else if (getSramEccSetting() == TargetIDSetting::On) + Features += ":sramecc+"; + // xnack. + if (getXnackSetting() == TargetIDSetting::Off) + Features += ":xnack-"; + else if (getXnackSetting() == TargetIDSetting::On) + Features += ":xnack+"; } StreamRep << Processor << Features; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 351563e957f14..d3f55c7920174 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -46,14 +46,18 @@ enum { AMDHSA_COV4 = 4, AMDHSA_COV5 = 5 }; /// \returns True if \p STI is AMDHSA. bool isHsaAbi(const MCSubtargetInfo &STI); -/// \returns HSA OS ABI Version identification. -std::optional getHsaAbiVersion(const MCSubtargetInfo *STI); -/// \returns True if HSA OS ABI Version identification is 4, -/// false otherwise. -bool isHsaAbiVersion4(const MCSubtargetInfo *STI); -/// \returns True if HSA OS ABI Version identification is 5, -/// false otherwise. -bool isHsaAbiVersion5(const MCSubtargetInfo *STI); + +/// \returns Code object version from the IR module flag. +unsigned getAMDHSACodeObjectVersion(const Module &M); + +/// \returns The default HSA code object version. This should only be used when +/// we lack a more accurate CodeObjectVersion value (e.g. from the IR module +/// flag or a .amdhsa_code_object_version directive) +unsigned getDefaultAMDHSACodeObjectVersion(); + +/// \returns ABIVersion suitable for use in ELF's e_ident[ABIVERSION]. \param +/// CodeObjectVersion is a value returned by getAMDHSACodeObjectVersion(). +uint8_t getELFABIVersion(const Triple &OS, unsigned CodeObjectVersion); /// \returns The offset of the multigrid_sync_arg argument from implicitarg_ptr unsigned getMultigridSyncArgImplicitArgPosition(unsigned COV); @@ -64,12 +68,6 @@ unsigned getHostcallImplicitArgPosition(unsigned COV); unsigned getDefaultQueueImplicitArgPosition(unsigned COV); unsigned getCompletionActionImplicitArgPosition(unsigned COV); -/// \returns Code object version. -unsigned getAmdhsaCodeObjectVersion(); - -/// \returns Code object version. 
-unsigned getCodeObjectVersion(const Module &M); - struct GcnBufferFormatInfo { unsigned Format; unsigned BitsPerComp; @@ -114,7 +112,6 @@ class AMDGPUTargetID { const MCSubtargetInfo &STI; TargetIDSetting XnackSetting; TargetIDSetting SramEccSetting; - unsigned CodeObjectVersion; public: explicit AMDGPUTargetID(const MCSubtargetInfo &STI); @@ -144,10 +141,6 @@ class AMDGPUTargetID { return XnackSetting; } - void setCodeObjectVersion(unsigned COV) { - CodeObjectVersion = COV; - } - /// Sets xnack setting to \p NewXnackSetting. void setXnackSetting(TargetIDSetting NewXnackSetting) { XnackSetting = NewXnackSetting; diff --git a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll index 680fae1869600..07b230d8f974f 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-internal-only-func.ll @@ -9,6 +9,8 @@ ; OPT: .text ; OPT-NEXT: .section ".note.GNU-stack" ; OPT-NEXT: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" +; COV4-NEXT: .amdhsa_code_object_version 4 +; COV5-NEXT: .amdhsa_code_object_version 5 ; OPT-NEXT: .amdgpu_metadata ; OPT-NEXT: --- ; OPT-NEXT: amdhsa.kernels: [] diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll index b78f6412cd677..41311abb6983f 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-any.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx900 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll index a3c75e0997533..3f380a97240e5 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-not-supported.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | 
llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx700" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx700 diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll index d4d8af8c1a99b..da3f5640e6182 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-off.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll index 9ca8b055a2797..d458f34891293 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-all-on.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | 
llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll index fd3f5878469e6..5c23e1ef5b42f 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-1.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll index 34673dd5b8919..e3635ba5c2acb 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-off-2.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 
--amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll index c283ece7e8bdf..1b7c65a9151d8 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-1.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' diff --git a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll index 869254cae5258..bd74574746030 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-mul-func-xnack-any-on-2.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack+' diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-any.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-any.ll index b1bcb34c8aee4..321c20bc91de1 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-any.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-any.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx900 diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll index cc04eb0e661d6..18b118fb5739c 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-not-supported.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | 
llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx700" ; ASM: amdhsa.target: amdgcn-amd-amdhsa--gfx700 diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll index e84778b526f11..db6e8923165b4 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-off.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack-" ; ASM: amdhsa.target: 'amdgcn-amd-amdhsa--gfx900:xnack-' diff --git a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll index a1ab6ed5f082a..0725c779cc66b 100644 --- a/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll +++ b/llvm/test/CodeGen/AMDGPU/tid-one-func-xnack-on.ll @@ -3,8 +3,8 @@ ; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s -; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s -; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/500/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=4 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF5 %s +; RUN: sed 's/CODE_OBJECT_VERSION/400/g' %s | llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 --amdhsa-code-object-version=5 --filetype=obj | llvm-readobj --file-headers - | FileCheck --check-prefixes=ELF,ELF4 %s ; ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx900:xnack+" ; ASM: amdhsa.target: 
'amdgcn-amd-amdhsa--gfx900:xnack+' diff --git a/llvm/test/MC/AMDGPU/elf-header-cov.s b/llvm/test/MC/AMDGPU/elf-header-cov.s new file mode 100644 index 0000000000000..e8baad1ba533e --- /dev/null +++ b/llvm/test/MC/AMDGPU/elf-header-cov.s @@ -0,0 +1,24 @@ +// RUN: sed 's/COV/4/g' %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj | \ +// RUN: llvm-readobj --file-headers - | FileCheck %s --check-prefixes=HS4 + +// RUN: sed 's/COV/5/g' %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx802 -filetype=obj | \ +// RUN: llvm-readobj --file-headers - | FileCheck %s --check-prefixes=HS5 + +// RUN: sed 's/COV/4/g' %s | not llvm-mc -triple amdgcn-amd-amdpal -mcpu=gfx802 2>&1 | \ +// RUN: FileCheck %s --check-prefix=ERR + +// RUN: sed 's/COV/4/g' %s | not llvm-mc -triple amdgcn-amd-mesa3d -mcpu=gfx802 2>&1 | \ +// RUN: FileCheck %s --check-prefix=ERR + +// RUN: sed 's/COV/4/g' %s | not llvm-mc -triple amdgcn-amd- -mcpu=gfx802 2>&1 | \ +// RUN: FileCheck %s --check-prefix=ERR + +.amdhsa_code_object_version COV + +// ERR: error: unknown directive + +// HS4: OS/ABI: AMDGPU_HSA (0x40) +// HS4-NEXT: ABIVersion: 2 + +// HS5: OS/ABI: AMDGPU_HSA (0x40) +// HS5-NEXT: ABIVersion: 3 diff --git a/llvm/test/MC/AMDGPU/hsa-exp.s b/llvm/test/MC/AMDGPU/hsa-exp.s index 23b2b8f31a4c2..2c8dd6f8eeb48 100644 --- a/llvm/test/MC/AMDGPU/hsa-exp.s +++ b/llvm/test/MC/AMDGPU/hsa-exp.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri --amdhsa-code-object-version=4 -show-encoding %s | FileCheck %s --check-prefix=ASM -// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri --amdhsa-code-object-version=4 -show-encoding %s | llvm-readobj --symbols -S --sd - | FileCheck %s --check-prefix=ELF +// RUN: llvm-mc -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | FileCheck %s --check-prefix=ASM +// RUN: llvm-mc -filetype=obj -triple amdgcn--amdhsa -mcpu=kaveri -show-encoding %s | llvm-readobj --symbols -S --sd - | FileCheck %s --check-prefix=ELF // ELF: Section { // ELF: Name: .text @@ -19,6 +19,9 @@ .amdgcn_target "amdgcn-unknown-amdhsa--gfx700" // ASM: .amdgcn_target "amdgcn-unknown-amdhsa--gfx700" +.amdhsa_code_object_version 4 +// ASM: .amdhsa_code_object_version 4 + .set my_is_ptr64, 1 .if my_is_ptr64 == 0 diff --git a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s index 186d98f78b986..8b90e20bb87d1 100644 --- a/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-gfx12-v4.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1200 --amdhsa-code-object-version=4 < %s | FileCheck --check-prefix=ASM %s -// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1200 --amdhsa-code-object-version=4 -filetype=obj < %s > %t +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefix=ASM %s +// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx1200 -filetype=obj < %s > %t // RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s // RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s @@ -52,6 +52,9 @@ .amdgcn_target "amdgcn-amd-amdhsa--gfx1200" // ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx1200" +.amdhsa_code_object_version 4 +// ASM: .amdhsa_code_object_version 4 + .p2align 8 .type minimal,@function minimal: diff --git a/llvm/test/MC/AMDGPU/hsa-v4.s b/llvm/test/MC/AMDGPU/hsa-v4.s index 6a824b8bcc7b9..e19dba0f5fd0f 100644 --- a/llvm/test/MC/AMDGPU/hsa-v4.s +++ b/llvm/test/MC/AMDGPU/hsa-v4.s @@ -1,5 +1,5 @@ -// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=4 
-mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=4 -mattr=+xnack -filetype=obj < %s > %t
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+xnack -filetype=obj < %s > %t
 // RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
 // RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
@@ -52,6 +52,9 @@
 .amdgcn_target "amdgcn-amd-amdhsa--gfx904:xnack+"
 // ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx904:xnack+"
+.amdhsa_code_object_version 4
+// ASM: .amdhsa_code_object_version 4
+
 .p2align 8
 .type minimal,@function
 minimal:
diff --git a/llvm/test/MC/AMDGPU/hsa-v5-uses-dynamic-stack.s b/llvm/test/MC/AMDGPU/hsa-v5-uses-dynamic-stack.s
index 6edac771faa07..248890391a6b8 100644
--- a/llvm/test/MC/AMDGPU/hsa-v5-uses-dynamic-stack.s
+++ b/llvm/test/MC/AMDGPU/hsa-v5-uses-dynamic-stack.s
@@ -1,5 +1,5 @@
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=5 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
-// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 --amdhsa-code-object-version=5 -mattr=+xnack -filetype=obj < %s > %t
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+xnack < %s | FileCheck --check-prefix=ASM %s
+// RUN: llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx904 -mattr=+xnack -filetype=obj < %s > %t
 // RUN: llvm-readelf -S -r -s %t | FileCheck --check-prefix=READOBJ %s
 // RUN: llvm-objdump -s -j .rodata %t | FileCheck --check-prefix=OBJDUMP %s
@@ -52,6 +52,9 @@
 .amdgcn_target "amdgcn-amd-amdhsa--gfx904:xnack+"
 // ASM: .amdgcn_target "amdgcn-amd-amdhsa--gfx904:xnack+"
+.amdhsa_code_object_version 5
+// ASM: .amdhsa_code_object_version 5
+
 .p2align 8
 .type minimal,@function
 minimal:
diff --git a/llvm/test/MC/AMDGPU/hsa_isa_version_attrs.s b/llvm/test/MC/AMDGPU/hsa_isa_version_attrs.s
deleted file mode 100644
index aafad9bbaf4c4..0000000000000
--- a/llvm/test/MC/AMDGPU/hsa_isa_version_attrs.s
+++ /dev/null
@@ -1,6 +0,0 @@
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx801 -mattr=-fast-fmaf -show-encoding %s | FileCheck --check-prefix=GFX8 %s
-// RUN: llvm-mc -triple=amdgcn -mcpu=gfx900 -mattr=-mad-mix-insts,-xnack -show-encoding %s | FileCheck --check-prefix=GFX9 %s
-
-.hsa_code_object_isa
-// GFX8: .hsa_code_object_isa 8,0,1,"AMD","AMDGPU"
-// GFX9: .hsa_code_object_isa 9,0,0,"AMD","AMDGPU"

From 997ffce43c6d2d3f647eb091c732665049b1f47f Mon Sep 17 00:00:00 2001
From: Aaron Ballman
Date: Sun, 21 Jan 2024 13:18:51 -0500
Subject: [PATCH 302/843] [C23] Implement N2940, Remove trigraphs??!

This follows the same implementation logic as with C++ and is compatible
with the GCC behavior in C. Trigraphs are enabled by default in -std=c*
conformance modes before C23, but are disabled in GNU and Microsoft modes
as well as in C23 or later.
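As a concrete illustration of the mode matrix above (a minimal sketch; the
file name and driver invocations are illustrative, not part of this patch's
test suite): the trigraph ??= maps to '#' in translation phase 1, so the
macro definition below only exists when trigraphs are translated.

  /* trigraphs.c -- hypothetical example */
  ??=define PI 3               /* becomes "#define PI 3" when trigraphs are on */
  int pi(void) { return PI; }

  $ clang -std=c17 -c trigraphs.c              # accepted; a "trigraph converted" warning is issued
  $ clang -std=c23 -c trigraphs.c              # rejected; ??= is left untranslated, so PI is never defined
  $ clang -std=c23 -ftrigraphs -c trigraphs.c  # accepted; trigraphs explicitly re-enabled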
--- clang/docs/ReleaseNotes.rst | 7 +++++ clang/lib/Frontend/CompilerInvocation.cpp | 8 ++++-- clang/test/C/C2x/n2940.c | 26 +++++++++++++++++ clang/test/C/drs/dr3xx.c | 31 +++++++++++++-------- clang/test/Preprocessor/ucn-pp-identifier.c | 2 +- clang/www/c_status.html | 2 +- 6 files changed, 59 insertions(+), 17 deletions(-) create mode 100644 clang/test/C/C2x/n2940.c diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 8bb26fcae18d6..28fd79abbe782 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -271,6 +271,13 @@ C23 Feature Support previously implemented allowing a label at the end of a compound statement, and now we've implemented allowing a label to be followed by a declaration instead of a statement. +- Implemented + `N2940 `_ which + removes support for trigraphs in C23 and later. In earlier language modes, + trigraphs remain enabled by default in conforming modes (e.g. ``-std=c17``) + and disabled by default in GNU and Microsoft modes (e.g., ``-std=gnu17`` or + ``-fms-compatibility``). If needed, you can enable trigraphs by passing + ``-ftrigraphs``. Non-comprehensive list of changes in this release ------------------------------------------------- diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index ed9cd1299eae2..7edea7798af1e 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -3480,7 +3480,8 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, Twine(Major) + "." + Twine(Minor) + "." + Twine(Subminor)); } - if ((!Opts.GNUMode && !Opts.MSVCCompat && !Opts.CPlusPlus17) || T.isOSzOS()) { + if ((!Opts.GNUMode && !Opts.MSVCCompat && !Opts.CPlusPlus17 && !Opts.C23) || + T.isOSzOS()) { if (!Opts.Trigraphs) GenerateArg(Consumer, OPT_fno_trigraphs); } else { @@ -3876,10 +3877,11 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, // Mimicking gcc's behavior, trigraphs are only enabled if -trigraphs // is specified, or -std is set to a conforming mode. - // Trigraphs are disabled by default in c++1z onwards. + // Trigraphs are disabled by default in C++17 and C23 onwards. // For z/OS, trigraphs are enabled by default (without regard to the above). Opts.Trigraphs = - (!Opts.GNUMode && !Opts.MSVCCompat && !Opts.CPlusPlus17) || T.isOSzOS(); + (!Opts.GNUMode && !Opts.MSVCCompat && !Opts.CPlusPlus17 && !Opts.C23) || + T.isOSzOS(); Opts.Trigraphs = Args.hasFlag(OPT_ftrigraphs, OPT_fno_trigraphs, Opts.Trigraphs); diff --git a/clang/test/C/C2x/n2940.c b/clang/test/C/C2x/n2940.c new file mode 100644 index 0000000000000..ee92b6ec94a8c --- /dev/null +++ b/clang/test/C/C2x/n2940.c @@ -0,0 +1,26 @@ +// RUN: %clang_cc1 -verify=no-trigraphs -std=c23 %s +// RUN: %clang_cc1 -verify=no-trigraphs -std=gnu23 %s +// RUN: %clang_cc1 -verify=no-trigraphs -std=gnu17 %s +// RUN: %clang_cc1 -verify=no-trigraphs -std=gnu11 %s +// RUN: %clang_cc1 -verify=no-trigraphs -std=gnu99 %s +// RUN: %clang_cc1 -verify=no-trigraphs -std=gnu89 %s +// RUN: %clang_cc1 -verify=trigraphs -std=c17 %s +// RUN: %clang_cc1 -verify=trigraphs -std=c11 %s +// RUN: %clang_cc1 -verify=trigraphs -std=c99 %s +// RUN: %clang_cc1 -verify=trigraphs -std=c89 %s +// RUN: %clang_cc1 -verify=trigraphs -std=c23 -ftrigraphs %s + +/* WG14 N2940: Clang 18 + * Removing trigraphs??! + */ + +// Trigraphs are enabled by default in any conforming C mode before C23, but +// are otherwise disabled (in all GNU modes, and in C23 or later). 
+// The ??= trigraph, if supported, will become the # character, which is a null +// preprocessor directive that does nothing. + +??= +// no-trigraphs-warning@-1 {{trigraph ignored}} \ + no-trigraphs-error@-1 {{expected identifier or '('}} \ + trigraphs-warning@-1 {{trigraph converted to '#' character}} + diff --git a/clang/test/C/drs/dr3xx.c b/clang/test/C/drs/dr3xx.c index 4d6617184d9a4..0f67470b013a5 100644 --- a/clang/test/C/drs/dr3xx.c +++ b/clang/test/C/drs/dr3xx.c @@ -1,8 +1,8 @@ -/* RUN: %clang_cc1 -std=c89 -fsyntax-only -Wvla -verify=expected,c89only -pedantic -Wno-c11-extensions %s - RUN: %clang_cc1 -std=c99 -fsyntax-only -Wvla -verify=expected,c99andup -pedantic -Wno-c11-extensions %s - RUN: %clang_cc1 -std=c11 -fsyntax-only -Wvla -verify=expected,c99andup -pedantic %s - RUN: %clang_cc1 -std=c17 -fsyntax-only -Wvla -verify=expected,c99andup -pedantic %s - RUN: %clang_cc1 -std=c2x -fsyntax-only -Wvla -verify=expected,c99andup -pedantic %s +/* RUN: %clang_cc1 -std=c89 -fsyntax-only -Wvla -verify=expected,c89only,c17andearlier -pedantic -Wno-c11-extensions %s + RUN: %clang_cc1 -std=c99 -fsyntax-only -Wvla -verify=expected,c99andup,c17andearlier -pedantic -Wno-c11-extensions %s + RUN: %clang_cc1 -std=c11 -fsyntax-only -Wvla -verify=expected,c99andup,c17andearlier -pedantic %s + RUN: %clang_cc1 -std=c17 -fsyntax-only -Wvla -verify=expected,c99andup,c17andearlier -pedantic %s + RUN: %clang_cc1 -std=c2x -fsyntax-only -Wvla -verify=expected,c99andup,c23andup -pedantic %s */ /* The following are DRs which do not require tests to demonstrate @@ -70,13 +70,6 @@ #error "We definitely should not have gotten here" #endif -/* WG14 DR309: yes - * Clarifying trigraph substitution - */ -int dr309??(1??) = { 1 }; /* expected-warning {{trigraph converted to '[' character}} - expected-warning {{trigraph converted to ']' character}} - */ - /* WG14 DR311: yes * Definition of variably modified types */ @@ -292,3 +285,17 @@ void f(long double f, char (**a)[10 * sizeof f]) { _Static_assert(sizeof **a == sizeof(long double) * 10, ""); } + +/* WG14 DR309: yes + * Clarifying trigraph substitution + */ +int dr309??(1??) = { 1 }; /* c17andearlier-warning {{trigraph converted to '[' character}} + c17andearlier-warning {{trigraph converted to ']' character}} + c23andup-warning 2 {{trigraph ignored}} + c23andup-error {{expected ';' after top level declarator}} + */ + +/* NOTE: Due to interactions with the diagnostic system, dr309 should be the + * last test case in this file because subsequent diagnostics may not be + * generated as expected. 
+ */ diff --git a/clang/test/Preprocessor/ucn-pp-identifier.c b/clang/test/Preprocessor/ucn-pp-identifier.c index 19cfc2589efdd..5efcfe48f638a 100644 --- a/clang/test/Preprocessor/ucn-pp-identifier.c +++ b/clang/test/Preprocessor/ucn-pp-identifier.c @@ -1,5 +1,5 @@ // RUN: %clang_cc1 %s -fsyntax-only -std=c99 -pedantic -verify=expected,ext -Wundef -DTRIGRAPHS=1 -// RUN: %clang_cc1 %s -fsyntax-only -std=c2x -pedantic -verify=expected,ext -Wundef -DTRIGRAPHS=1 +// RUN: %clang_cc1 %s -fsyntax-only -std=c23 -pedantic -verify=expected,ext -Wundef -ftrigraphs -DTRIGRAPHS=1 // RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -fno-trigraphs // RUN: %clang_cc1 %s -fsyntax-only -x c++ -std=c++23 -pedantic -ftrigraphs -DTRIGRAPHS=1 -verify=expected,cxx23 -Wundef -Wpre-c++23-compat // RUN: %clang_cc1 %s -fsyntax-only -x c++ -pedantic -verify=expected,ext -Wundef -ftrigraphs -DTRIGRAPHS=1 diff --git a/clang/www/c_status.html b/clang/www/c_status.html index b9e0650ddca24..3955a1d796397 100644 --- a/clang/www/c_status.html +++ b/clang/www/c_status.html @@ -1156,7 +1156,7 @@

C23 implementation status

Remove trigraphs??!
N2940 - Yes + Clang 18 Improved normal enumerations From 30d6806a08f8f453d6389bdfeae4c32e937a9821 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sun, 21 Jan 2024 19:48:33 +0100 Subject: [PATCH 303/843] [libc++][doc] Update the release notes for LLVM 18 (#78324) This is a preparation for the upcoming LLVM 18 release. --- libcxx/docs/Hardening.rst | 2 + libcxx/docs/ReleaseNotes/18.rst | 88 ++++++++++++++++++++++++------ libcxx/docs/Status/Parallelism.rst | 2 +- libcxx/docs/Status/Ranges.rst | 2 +- 4 files changed, 74 insertions(+), 20 deletions(-) diff --git a/libcxx/docs/Hardening.rst b/libcxx/docs/Hardening.rst index 33a168d6e1683..0761f42368e92 100644 --- a/libcxx/docs/Hardening.rst +++ b/libcxx/docs/Hardening.rst @@ -1,3 +1,5 @@ +.. _hardening-modes: + =============== Hardening Modes =============== diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index 799347b646e63..0b43d76650a9d 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -35,9 +35,26 @@ see the `releases page `_. What's New in Libc++ 18.0.0? ============================== -- A new debug mode has been added, replacing the legacy debug mode that was - removed in the LLVM 17 release. See ``libcxx/docs/Hardening.rst`` for more - details. +The main focus of the libc++ team has been to implement new C++20, C++23, +and C++26 features. + +New hardened modes for the library have been added, replacing the legacy debug mode that was +removed in the LLVM 17 release. Unlike the legacy debug mode, some of these hardening modes are +also intended to be used in production. See :ref:`hardening-modes` for more details. + +Work on the ranges support has progressed. See +:ref:`ranges-status` for the current status. + +Work on the experimental C++23 module support has progressed. The ``std.compat`` +module is available and the feature is retroactively available in C++20. See +:ref:`ModulesInLibcxx` for more information. + +Work on the experimental C++17 Parallel STL has progressed. See +:ref:`pstl-status` for the current status. + +Work on the experimental C++17 SIMD support has progressed. See +:ref:`parallelism-status` for the current status. + Implemented Papers ------------------ @@ -68,8 +85,9 @@ Implemented Papers Improvements and New Features ----------------------------- -- ``std::ranges::count`` is now optimized for ``vector::iterator``, which - can lead up to 350x performance improvements. +- ``std::ranges::count`` and ``std::ranges::find`` are now optimized for + ``std::vector::iterator``, which can lead up to 350x performance + improvements. - ``std::for_each`` has been optimized for segmented iterators like ``std::deque::iterator`` in C++23 and later, which can lead up to 40x performance improvements. @@ -106,6 +124,11 @@ Improvements and New Features ``-DLIBCXX_INSTALL_MODULE_DIR=``. The default location is ``${PREFIX}/share/libc++/v1``. +- AddressSanitizer annotations have been added to ``std::basic_string``. + +- The libc++ source code has been formatted with ``clang-format``. This + `discourse thread `_ + contains information how to rebase downstream patches. Deprecations and Removals ------------------------- @@ -118,7 +141,7 @@ Deprecations and Removals - The non-conforming constructor ``std::future_error(std::error_code)`` has been removed. Please use the ``std::future_error(std::future_errc)`` constructor provided in C++17 instead. 
-- `P1957 ` has been implemented in Clang and libc++ removed a code path that led to
  narrowing conversions in ``std::variant`` behaving in a non-standard way. This may change how some uses of ``std::variant``'s constructor behave in user code. The ``_LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT`` macro is provided to restore the previous behavior, and it will be supported in the LLVM 18 release only.
@@ -155,6 +178,14 @@ Deprecations and Removals
  that were removed in the C++17 and C++20 standards. Instead of using these macros, please use the macros to re-enable individual features.
+- The macro ``_LIBCPP_INLINE_VISIBILITY`` has been deprecated in LLVM 18 and
+  will be removed entirely in LLVM 19. The macro ``_LIBCPP_HIDE_FROM_ABI`` is
+  the drop-in replacement.
+
+- The macro ``_VSTD`` has been deprecated in LLVM 18 and will be removed
+  entirely in LLVM 19. The code ``std`` is the drop-in replacement.
+
+
 Upcoming Deprecations and Removals
 ----------------------------------
@@ -163,6 +194,9 @@ Upcoming Deprecations and Removals
 LLVM 19
 ~~~~~~~
+- The ``LIBCXX_EXECUTOR`` CMake variable has been deprecated. LLVM 19 will
+  completely remove support for the ``*_EXECUTOR`` variables.
+
 - The ``LIBCXX_ENABLE_ASSERTIONS`` CMake variable that was used to enable the safe mode will be deprecated and setting it will trigger an error; use the ``LIBCXX_HARDENING_MODE`` variable with the value ``extensive`` instead. Similarly, the ``_LIBCPP_ENABLE_ASSERTIONS`` macro will be deprecated (setting it to ``1`` still enables the extensive mode in
@@ -187,6 +221,12 @@ LLVM 19
  ``_LIBCPP_ENABLE_CXX20_REMOVED_FEATURES`` macros have been deprecated in LLVM 18 and will be removed entirely in LLVM 19.
+- The macro ``_LIBCPP_INLINE_VISIBILITY`` has been deprecated in LLVM 18 and
+  will be removed entirely in LLVM 19.
+
+- The macro ``_VSTD`` has been deprecated in LLVM 18 and will be removed
+  entirely in LLVM 19.
+
 LLVM 20
 ~~~~~~~
@@ -203,12 +243,12 @@ ABI Affecting Changes
 - This release of libc++ added missing visibility annotations on some types in the library. Users compiling with ``-fvisibility=hidden`` may notice that additional type infos from libc++ are being exported from their ABI. This is the correct behavior in almost all cases since exporting the RTTI is required for these types to work properly with
-  dynamic_cast, exceptions and other mechanisms across binaries. However, if you intend to use libc++ purely as an
+  ``dynamic_cast``, exceptions and other mechanisms across binaries. However, if you intend to use libc++ purely as an
  internal implementation detail (i.e. you use libc++ as a static archive and never export libc++ symbols from your ABI) and you notice changes to your exported symbols list, then this means that you were not properly preventing libc++ symbols from being part of your ABI.
-- The name mangling for intantiations of ``std::projected`` has changed in order to implement P2538R1. This technically
+- The name mangling for instantiations of ``std::projected`` has changed in order to implement P2538R1. This technically
  results in an ABI break, however in practice we expect uses of ``std::projected`` in ABI-sensitive places to be extremely rare. Any error resulting from this change should result in a link-time error.
@@ -219,18 +259,30 @@ ABI Affecting Changes
  to throw a different exception when attempting allocations that are too large (``std::bad_alloc`` vs ``std::length_error``).
-- The layout of some views inside ``std::ranges`` that use the ``movable-box`` exposition-only type as an implementation - detail has changed in order to fix a bug which could result in overwriting user data following the ``movable-box`` - . - This was caused by incorrect usage of the ``[[no_unique_address]]`` attribute inside the implementation of ``movable-box``. - This only affects the layout of the following views: ``take_while_view``, ``filter_view``, ``single_view``, ``drop_while_view``, - ``repeat_view``, ``transform_view``, ``chunk_by_view``. In order to avoid silent breakage, an ABI tag has been added to - these views such that their mangled name will be different starting in this version of libc++. - As a result, attempting to call a function that expects one of these views will fail to link until the code has been rebuilt - against a matching version of libc++. In practice, we believe it is unusual for these views to appear at ABI boundaries so this - should not be a major problem for most users. However it is probably worth auditing ranges-heavy code for ABI boundaries that +- The layout of some range adaptors that use the ``movable-box`` exposition-only type as an implementation + detail has changed in order to fix a `bug `_ which could result in + overwriting user data following the ``movable-box``. + This bug was caused by incorrect usage of the ``[[no_unique_address]]`` attribute inside the implementation of ``movable-box``. + This fix affects the layout of the following views: ``take_while_view``, ``filter_view``, ``single_view``, ``drop_while_view``, + ``repeat_view``, ``transform_view``, ``chunk_by_view``. In order to avoid silent breakage as a result of this fix, an ABI tag has been added to + these views such that their mangled name will be different starting in this version of libc++. + As a result, attempting to call a function that expects one of these views will fail to link until the code has been rebuilt + against a matching version of libc++. In practice, we believe it is unusual for these views to appear at ABI boundaries so this + should not be a major problem for most users. However it is probably worth auditing ranges-heavy code for ABI boundaries that would contain these views, or for types that contain these views as members and which are passed across ABI boundaries. +- Some properties of libc++ may cause ODR-violations when mixing multiple libc++ + instances. To avoid these, often benign, ODR-violations the ODR-affecting + properties are now part of the ABI tag. The ODR-affecting properties are: + + - library version (This was part of the ABI tag prior to LLVM 18.) + - exceptions vs no-exceptions + - hardening mode + + This should not be ABI-affecting except that libc++ will be more robust + against different configurations of it being used in different translation + units. + Build System Changes -------------------- diff --git a/libcxx/docs/Status/Parallelism.rst b/libcxx/docs/Status/Parallelism.rst index 4fce429bbe6ce..9f8a326867f1c 100644 --- a/libcxx/docs/Status/Parallelism.rst +++ b/libcxx/docs/Status/Parallelism.rst @@ -1,4 +1,4 @@ -.. parallelism-status: +.. _parallelism-status: ==================================== libc++ Parallelism TS Status (N4808) diff --git a/libcxx/docs/Status/Ranges.rst b/libcxx/docs/Status/Ranges.rst index 7d36700d6545f..cdbf68393b739 100644 --- a/libcxx/docs/Status/Ranges.rst +++ b/libcxx/docs/Status/Ranges.rst @@ -1,4 +1,4 @@ -.. ranges-status: +.. 
_ranges-status:

================================
libc++ Ranges Status

From 5518a9d7673bfe55b4110bea049140316d032fbf Mon Sep 17 00:00:00 2001
From: Andrey Ali Khan Bolshakov <32954549+bolshakov-a@users.noreply.github.com>
Date: Sun, 21 Jan 2024 23:28:57 +0300
Subject: [PATCH 304/843] [c++20] P1907R1: Support for generalized non-type
 template arguments of scalar type. (#78041)

Previously committed as 9e08e51a20d0d2b1c5724bb17e969d036fced4cd, and
reverted because a dependency commit was reverted; then committed again as
4b574008aef5a7235c1f894ab065fe300d26e786 and reverted again because
"dependency commit" 5a391d38ac6c561ba908334d427f26124ed9132e was reverted.
But it doesn't seem that 5a391d38ac6c was a real dependency for this.

This commit incorporates 4b574008aef5a7235c1f894ab065fe300d26e786 and
18e093faf726d15f210ab4917142beec51848258 by Richard Smith (@zygoloid), with
some minor fixes, most notably:

- `UncommonValue` renamed to `StructuralValue`;
- `VK_PRValue` instead of `VK_RValue` as the default kind in the lvalue and
  member pointer handling branch in
  `BuildExpressionFromNonTypeTemplateArgumentValue`;
- handling of `StructuralValue` in `IsTypeDeclaredInsideVisitor`;
- filling in `SugaredConverted` along with the `CanonicalConverted`
  parameter in `Sema::CheckTemplateArgument`;
- minor cleanup in `TemplateInstantiator::transformNonTypeTemplateParmRef`;
- `TemplateArgument` constructors refactored;
- `ODRHash` calculation for `UncommonValue`;
- USR generation for `UncommonValue`;
- a more correct MS compatibility mangling algorithm (tested on MSVC
  ver. 19.35; toolset ver. 143);
- IR emission fixed when a subobject is used as a template argument and the
  corresponding template parameter is used in an lvalue context;
- `noundef` attribute and opaque pointers in the `template-arguments` test;
- analysis for C++17 mode is turned off for templates in the
  `warn-bool-conversion` test; in C++17 and C++20 mode, an array reference
  used as a template argument of pointer type produces a template argument
  of UncommonValue type, and `BuildExpressionFromNonTypeTemplateArgumentValue`
  builds an `OpaqueValueExpr` for it, which `DiagnoseAlwaysNonNullPointer`
  cannot see through; despite the "These cases should not warn" comment, I'm
  not sure about the correct behavior; I'd expect a suggestion to replace
  `if` with `if constexpr`;
- `temp.arg.nontype/p1.cpp` and `dr18xx.cpp` tests fixed.
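To give a feel for what this enables, here is a minimal sketch compiled with
-std=c++20 (the file and identifier names are illustrative and not taken from
this patch's tests; per the release note below, the feature is still
experimental):

  // nttp.cpp -- hypothetical example
  template <double Scale>              // floating-point NTTP, newly accepted
  constexpr double scale(double X) { return X * Scale; }

  struct Grid { int Origin; int Extent; };
  Grid G;                              // object with static storage duration

  template <int &Cell>                 // reference NTTP bound to a subobject
  void bump() { ++Cell; }

  int main() {
    double D = scale<2.5>(10.0);       // 2.5 is carried as a StructuralValue argument
    bump<G.Extent>();                  // binds Cell to the subobject G.Extent
    return (D == 25.0 && G.Extent == 1) ? 0 : 1;
  }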
--- clang-tools-extra/clangd/DumpAST.cpp | 1 + clang-tools-extra/clangd/FindTarget.cpp | 1 + clang/docs/ReleaseNotes.rst | 3 + clang/include/clang/AST/ODRHash.h | 3 + clang/include/clang/AST/PropertiesBase.td | 14 ++ clang/include/clang/AST/RecursiveASTVisitor.h | 2 + .../clang/AST/TemplateArgumentVisitor.h | 2 + clang/include/clang/AST/TemplateBase.h | 86 ++++--- .../clang/Basic/DiagnosticSemaKinds.td | 5 - clang/include/clang/Sema/Sema.h | 4 +- clang/lib/AST/ASTContext.cpp | 5 + clang/lib/AST/ASTImporter.cpp | 13 ++ clang/lib/AST/ASTStructuralEquivalence.cpp | 3 + clang/lib/AST/Decl.cpp | 4 + clang/lib/AST/ItaniumMangle.cpp | 36 ++- clang/lib/AST/MicrosoftMangle.cpp | 78 +++++-- clang/lib/AST/ODRHash.cpp | 67 ++++++ clang/lib/AST/StmtProfile.cpp | 6 + clang/lib/AST/TemplateBase.cpp | 113 ++++++++- clang/lib/AST/TypeLoc.cpp | 1 + clang/lib/CodeGen/CGDebugInfo.cpp | 10 + clang/lib/CodeGen/CGExpr.cpp | 12 +- clang/lib/Index/USRGeneration.cpp | 10 + clang/lib/Sema/SemaLookup.cpp | 1 + clang/lib/Sema/SemaOverload.cpp | 10 +- clang/lib/Sema/SemaTemplate.cpp | 218 +++++++++++------- clang/lib/Sema/SemaTemplateDeduction.cpp | 64 +++-- clang/lib/Sema/SemaTemplateInstantiate.cpp | 14 +- clang/lib/Sema/SemaTemplateVariadic.cpp | 2 + clang/lib/Sema/TreeTransform.h | 12 +- clang/lib/Serialization/ASTReader.cpp | 1 + clang/lib/Serialization/ASTWriter.cpp | 1 + clang/test/CXX/drs/dr18xx.cpp | 4 +- .../CXX/temp/temp.arg/temp.arg.nontype/p1.cpp | 4 +- clang/test/CodeGenCXX/mangle-ms-templates.cpp | 48 ++++ clang/test/CodeGenCXX/mangle-template.cpp | 40 +++- clang/test/CodeGenCXX/template-arguments.cpp | 113 +++++++++ .../Index/USR/structural-value-tpl-arg.cpp | 23 ++ clang/test/Modules/odr_hash.cpp | 193 +++++++++++++++- clang/test/SemaCXX/warn-bool-conversion.cpp | 2 + .../SemaTemplate/temp_arg_nontype_cxx1z.cpp | 40 ++-- .../SemaTemplate/temp_arg_nontype_cxx20.cpp | 40 ++-- clang/tools/libclang/CIndex.cpp | 5 + clang/tools/libclang/CXCursor.cpp | 3 + clang/www/cxx_status.html | 18 +- lldb/include/lldb/lldb-enumerations.h | 1 + .../TypeSystem/Clang/TypeSystemClang.cpp | 3 + 47 files changed, 1109 insertions(+), 230 deletions(-) create mode 100644 clang/test/CodeGenCXX/template-arguments.cpp create mode 100644 clang/test/Index/USR/structural-value-tpl-arg.cpp diff --git a/clang-tools-extra/clangd/DumpAST.cpp b/clang-tools-extra/clangd/DumpAST.cpp index b0cec65c39fa3..9a525efb938e8 100644 --- a/clang-tools-extra/clangd/DumpAST.cpp +++ b/clang-tools-extra/clangd/DumpAST.cpp @@ -143,6 +143,7 @@ class DumpVisitor : public RecursiveASTVisitor { TEMPLATE_ARGUMENT_KIND(Declaration); TEMPLATE_ARGUMENT_KIND(Template); TEMPLATE_ARGUMENT_KIND(TemplateExpansion); + TEMPLATE_ARGUMENT_KIND(StructuralValue); #undef TEMPLATE_ARGUMENT_KIND } llvm_unreachable("Unhandled ArgKind enum"); diff --git a/clang-tools-extra/clangd/FindTarget.cpp b/clang-tools-extra/clangd/FindTarget.cpp index 3d73e77b16aff..e702c6b3537a0 100644 --- a/clang-tools-extra/clangd/FindTarget.cpp +++ b/clang-tools-extra/clangd/FindTarget.cpp @@ -1039,6 +1039,7 @@ class ExplicitReferenceCollector case TemplateArgument::Pack: case TemplateArgument::Type: case TemplateArgument::Expression: + case TemplateArgument::StructuralValue: break; // Handled by VisitType and VisitExpression. 
}; return RecursiveASTVisitor::TraverseTemplateArgumentLoc(A); diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 28fd79abbe782..dc31594e4b040 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -183,6 +183,9 @@ C++ Language Changes C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ +- Implemented `P1907R1 ` which extends allowed non-type template argument + kinds with e.g. floating point values and pointers and references to subobjects. + This feature is still experimental. Accordingly, `__cpp_nontype_template_args` was not updated. C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/AST/ODRHash.h b/clang/include/clang/AST/ODRHash.h index cedf644520fc3..a1caa6d39a87c 100644 --- a/clang/include/clang/AST/ODRHash.h +++ b/clang/include/clang/AST/ODRHash.h @@ -25,6 +25,7 @@ namespace clang { +class APValue; class Decl; class IdentifierInfo; class NestedNameSpecifier; @@ -101,6 +102,8 @@ class ODRHash { // Save booleans until the end to lower the size of data to process. void AddBoolean(bool value); + void AddStructuralValue(const APValue &); + static bool isSubDeclToBeProcessed(const Decl *D, const DeclContext *Parent); private: diff --git a/clang/include/clang/AST/PropertiesBase.td b/clang/include/clang/AST/PropertiesBase.td index d86c4eba6a225..0270c086d06b6 100644 --- a/clang/include/clang/AST/PropertiesBase.td +++ b/clang/include/clang/AST/PropertiesBase.td @@ -808,6 +808,20 @@ let Class = PropertyTypeCase in { return TemplateArgument(ctx, value, type, isDefaulted); }]>; } +let Class = PropertyTypeCase in { + def : Property<"value", APValue> { + let Read = [{ node.getAsStructuralValue() }]; + } + def : Property<"type", QualType> { + let Read = [{ node.getStructuralValueType() }]; + } + def : Property<"isDefaulted", Bool> { + let Read = [{ node.getIsDefaulted() }]; + } + def : Creator<[{ + return TemplateArgument(ctx, type, value, isDefaulted); + }]>; +} let Class = PropertyTypeCase in { def : Property<"name", TemplateName> { let Read = [{ node.getAsTemplateOrTemplatePattern() }]; diff --git a/clang/include/clang/AST/RecursiveASTVisitor.h b/clang/include/clang/AST/RecursiveASTVisitor.h index 8f2714e142bbe..2aee6a947141b 100644 --- a/clang/include/clang/AST/RecursiveASTVisitor.h +++ b/clang/include/clang/AST/RecursiveASTVisitor.h @@ -850,6 +850,7 @@ bool RecursiveASTVisitor::TraverseTemplateArgument( case TemplateArgument::Declaration: case TemplateArgument::Integral: case TemplateArgument::NullPtr: + case TemplateArgument::StructuralValue: return true; case TemplateArgument::Type: @@ -882,6 +883,7 @@ bool RecursiveASTVisitor::TraverseTemplateArgumentLoc( case TemplateArgument::Declaration: case TemplateArgument::Integral: case TemplateArgument::NullPtr: + case TemplateArgument::StructuralValue: return true; case TemplateArgument::Type: { diff --git a/clang/include/clang/AST/TemplateArgumentVisitor.h b/clang/include/clang/AST/TemplateArgumentVisitor.h index 190aa97adf455..cf0d322015806 100644 --- a/clang/include/clang/AST/TemplateArgumentVisitor.h +++ b/clang/include/clang/AST/TemplateArgumentVisitor.h @@ -37,6 +37,7 @@ class Base { DISPATCH(Declaration); DISPATCH(NullPtr); DISPATCH(Integral); + DISPATCH(StructuralValue); DISPATCH(Template); DISPATCH(TemplateExpansion); DISPATCH(Expression); @@ -59,6 +60,7 @@ class Base { VISIT_METHOD(Declaration); VISIT_METHOD(NullPtr); VISIT_METHOD(Integral); + VISIT_METHOD(StructuralValue); VISIT_METHOD(Template); VISIT_METHOD(TemplateExpansion); VISIT_METHOD(Expression); diff --git 
a/clang/include/clang/AST/TemplateBase.h b/clang/include/clang/AST/TemplateBase.h index b7cd71f17c944..fea2c8ccfee67 100644 --- a/clang/include/clang/AST/TemplateBase.h +++ b/clang/include/clang/AST/TemplateBase.h @@ -50,6 +50,7 @@ template <> struct PointerLikeTypeTraits { namespace clang { +class APValue; class ASTContext; class Expr; struct PrintingPolicy; @@ -80,6 +81,13 @@ class TemplateArgument { /// that was provided for an integral non-type template parameter. Integral, + /// The template argument is a non-type template argument that can't be + /// represented by the special-case Declaration, NullPtr, or Integral + /// forms. These values are only ever produced by constant evaluation, + /// so cannot be dependent. + /// TODO: merge Declaration, NullPtr and Integral into this? + StructuralValue, + /// The template argument is a template name that was provided for a /// template template parameter. Template, @@ -130,6 +138,14 @@ class TemplateArgument { }; void *Type; }; + struct V { + LLVM_PREFERRED_TYPE(ArgKind) + unsigned Kind : 31; + LLVM_PREFERRED_TYPE(bool) + unsigned IsDefaulted : 1; + APValue *Value; + void *Type; + }; struct A { LLVM_PREFERRED_TYPE(ArgKind) unsigned Kind : 31; @@ -156,11 +172,19 @@ class TemplateArgument { union { struct DA DeclArg; struct I Integer; + struct V Value; struct A Args; struct TA TemplateArg; struct TV TypeOrValue; }; + void initFromType(QualType T, bool IsNullPtr, bool IsDefaulted); + void initFromDeclaration(ValueDecl *D, QualType QT, bool IsDefaulted); + void initFromIntegral(const ASTContext &Ctx, const llvm::APSInt &Value, + QualType Type, bool IsDefaulted); + void initFromStructural(const ASTContext &Ctx, QualType Type, + const APValue &V, bool IsDefaulted); + public: /// Construct an empty, invalid template argument. constexpr TemplateArgument() : TypeOrValue({Null, 0, /* IsDefaulted */ 0}) {} @@ -168,25 +192,22 @@ class TemplateArgument { /// Construct a template type argument. TemplateArgument(QualType T, bool isNullPtr = false, bool IsDefaulted = false) { - TypeOrValue.Kind = isNullPtr ? NullPtr : Type; - TypeOrValue.IsDefaulted = IsDefaulted; - TypeOrValue.V = reinterpret_cast(T.getAsOpaquePtr()); + initFromType(T, isNullPtr, IsDefaulted); } - /// Construct a template argument that refers to a - /// declaration, which is either an external declaration or a - /// template declaration. + /// Construct a template argument that refers to a (non-dependent) + /// declaration. TemplateArgument(ValueDecl *D, QualType QT, bool IsDefaulted = false) { - assert(D && "Expected decl"); - DeclArg.Kind = Declaration; - DeclArg.IsDefaulted = IsDefaulted; - DeclArg.QT = QT.getAsOpaquePtr(); - DeclArg.D = D; + initFromDeclaration(D, QT, IsDefaulted); } /// Construct an integral constant template argument. The memory to /// store the value is allocated with Ctx. - TemplateArgument(ASTContext &Ctx, const llvm::APSInt &Value, QualType Type, + TemplateArgument(const ASTContext &Ctx, const llvm::APSInt &Value, + QualType Type, bool IsDefaulted = false); + + /// Construct a template argument from an arbitrary constant value. + TemplateArgument(const ASTContext &Ctx, QualType Type, const APValue &Value, bool IsDefaulted = false); /// Construct an integral constant template argument with the same @@ -297,7 +318,7 @@ class TemplateArgument { /// Retrieve the type for a type template argument. 
QualType getAsType() const { assert(getKind() == Type && "Unexpected kind"); - return QualType::getFromOpaquePtr(reinterpret_cast(TypeOrValue.V)); + return QualType::getFromOpaquePtr(reinterpret_cast(TypeOrValue.V)); } /// Retrieve the declaration for a declaration non-type @@ -315,7 +336,7 @@ class TemplateArgument { /// Retrieve the type for null non-type template argument. QualType getNullPtrType() const { assert(getKind() == NullPtr && "Unexpected kind"); - return QualType::getFromOpaquePtr(reinterpret_cast(TypeOrValue.V)); + return QualType::getFromOpaquePtr(reinterpret_cast(TypeOrValue.V)); } /// Retrieve the template name for a template name argument. @@ -371,6 +392,14 @@ class TemplateArgument { /// default template parameter. bool getIsDefaulted() const { return (bool)TypeOrValue.IsDefaulted; } + /// Get the value of a StructuralValue. + const APValue &getAsStructuralValue() const { return *Value.Value; } + + /// Get the type of a StructuralValue. + QualType getStructuralValueType() const { + return QualType::getFromOpaquePtr(Value.Type); + } + /// If this is a non-type template argument, get its type. Otherwise, /// returns a null QualType. QualType getNonTypeTemplateArgumentType() const; @@ -516,6 +545,7 @@ class TemplateArgumentLoc { assert(Argument.getKind() == TemplateArgument::NullPtr || Argument.getKind() == TemplateArgument::Integral || Argument.getKind() == TemplateArgument::Declaration || + Argument.getKind() == TemplateArgument::StructuralValue || Argument.getKind() == TemplateArgument::Expression); } @@ -541,13 +571,9 @@ class TemplateArgumentLoc { /// - Fetches the full source range of the argument. SourceRange getSourceRange() const LLVM_READONLY; - const TemplateArgument &getArgument() const { - return Argument; - } + const TemplateArgument &getArgument() const { return Argument; } - TemplateArgumentLocInfo getLocInfo() const { - return LocInfo; - } + TemplateArgumentLocInfo getLocInfo() const { return LocInfo; } TypeSourceInfo *getTypeSourceInfo() const { if (Argument.getKind() != TemplateArgument::Type) @@ -575,6 +601,11 @@ class TemplateArgumentLoc { return LocInfo.getAsExpr(); } + Expr *getSourceStructuralValueExpression() const { + assert(Argument.getKind() == TemplateArgument::StructuralValue); + return LocInfo.getAsExpr(); + } + NestedNameSpecifierLoc getTemplateQualifierLoc() const { if (Argument.getKind() != TemplateArgument::Template && Argument.getKind() != TemplateArgument::TemplateExpansion) @@ -606,8 +637,7 @@ class TemplateArgumentListInfo { public: TemplateArgumentListInfo() = default; - TemplateArgumentListInfo(SourceLocation LAngleLoc, - SourceLocation RAngleLoc) + TemplateArgumentListInfo(SourceLocation LAngleLoc, SourceLocation RAngleLoc) : LAngleLoc(LAngleLoc), RAngleLoc(RAngleLoc) {} // This can leak if used in an AST node, use ASTTemplateArgumentListInfo @@ -626,21 +656,15 @@ class TemplateArgumentListInfo { return Arguments.data(); } - llvm::ArrayRef arguments() const { - return Arguments; - } + llvm::ArrayRef arguments() const { return Arguments; } const TemplateArgumentLoc &operator[](unsigned I) const { return Arguments[I]; } - TemplateArgumentLoc &operator[](unsigned I) { - return Arguments[I]; - } + TemplateArgumentLoc &operator[](unsigned I) { return Arguments[I]; } - void addArgument(const TemplateArgumentLoc &Loc) { - Arguments.push_back(Loc); - } + void addArgument(const TemplateArgumentLoc &Loc) { Arguments.push_back(Loc); } }; /// Represents an explicit template argument list in C++, e.g., diff --git 
a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dd24767b62f3c..501968cb7d5f9 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -5154,8 +5154,6 @@ def err_non_type_template_arg_subobject : Error< "non-type template argument refers to subobject '%0'">; def err_non_type_template_arg_addr_label_diff : Error< "template argument / label address difference / what did you expect?">; -def err_non_type_template_arg_unsupported : Error< - "non-type template argument of type %0 is not yet supported">; def err_template_arg_not_convertible : Error< "non-type template argument of type %0 cannot be converted to a value " "of type %1">; @@ -5207,9 +5205,6 @@ def err_template_arg_not_object_or_func : Error< "non-type template argument does not refer to an object or function">; def err_template_arg_not_pointer_to_member_form : Error< "non-type template argument is not a pointer to member constant">; -def err_template_arg_member_ptr_base_derived_not_supported : Error< - "non-type template argument of pointer-to-member type %1 that refers " - "to member %q0 of a different class is not supported yet">; def err_template_arg_invalid : Error< "non-type template argument '%0' is invalid">; def ext_template_arg_extra_parens : ExtWarn< diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 0db39333b0ee3..d0f62afdf7bb1 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -8595,8 +8595,8 @@ class Sema final { QualType ParamType, SourceLocation Loc); ExprResult - BuildExpressionFromIntegralTemplateArgument(const TemplateArgument &Arg, - SourceLocation Loc); + BuildExpressionFromNonTypeTemplateArgument(const TemplateArgument &Arg, + SourceLocation Loc); /// Enumeration describing how template parameter lists are compared /// for equality. diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 0fc0831b221aa..5eb7aa3664569 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -6754,6 +6754,11 @@ ASTContext::getCanonicalTemplateArgument(const TemplateArgument &Arg) const { case TemplateArgument::Integral: return TemplateArgument(Arg, getCanonicalType(Arg.getIntegralType())); + case TemplateArgument::StructuralValue: + return TemplateArgument(*this, + getCanonicalType(Arg.getStructuralValueType()), + Arg.getAsStructuralValue()); + case TemplateArgument::Type: return TemplateArgument(getCanonicalType(Arg.getAsType()), /*isNullPtr*/ false, Arg.getIsDefaulted()); diff --git a/clang/lib/AST/ASTImporter.cpp b/clang/lib/AST/ASTImporter.cpp index b364796073022..12734d62ed9fb 100644 --- a/clang/lib/AST/ASTImporter.cpp +++ b/clang/lib/AST/ASTImporter.cpp @@ -823,6 +823,17 @@ ASTNodeImporter::import(const TemplateArgument &From) { From.getIsDefaulted()); } + case TemplateArgument::StructuralValue: { + ExpectedType ToTypeOrErr = import(From.getStructuralValueType()); + if (!ToTypeOrErr) + return ToTypeOrErr.takeError(); + Expected ToValueOrErr = import(From.getAsStructuralValue()); + if (!ToValueOrErr) + return ToValueOrErr.takeError(); + return TemplateArgument(Importer.getToContext(), *ToTypeOrErr, + *ToValueOrErr); + } + case TemplateArgument::Template: { Expected ToTemplateOrErr = import(From.getAsTemplate()); if (!ToTemplateOrErr) @@ -3572,6 +3583,8 @@ class IsTypeDeclaredInsideVisitor case TemplateArgument::NullPtr: // FIXME: The type is not allowed to be in the function? 
return CheckType(Arg.getNullPtrType()); + case TemplateArgument::StructuralValue: + return CheckType(Arg.getStructuralValueType()); case TemplateArgument::Pack: for (const auto &PackArg : Arg.getPackAsArray()) if (checkTemplateArgument(PackArg)) diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 5103fc86a8005..be7a850a2982c 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -685,6 +685,9 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, return IsStructurallyEquivalent(Context, Arg1.getAsExpr(), Arg2.getAsExpr()); + case TemplateArgument::StructuralValue: + return Arg1.structurallyEquals(Arg2); + case TemplateArgument::Pack: return IsStructurallyEquivalent(Context, Arg1.pack_elements(), Arg2.pack_elements()); diff --git a/clang/lib/AST/Decl.cpp b/clang/lib/AST/Decl.cpp index 2c3a0afee8cef..26fdfa040796e 100644 --- a/clang/lib/AST/Decl.cpp +++ b/clang/lib/AST/Decl.cpp @@ -343,6 +343,10 @@ LinkageComputer::getLVForTemplateArgumentList(ArrayRef Args, LV.merge(getTypeLinkageAndVisibility(Arg.getNullPtrType())); continue; + case TemplateArgument::StructuralValue: + LV.merge(getLVForValue(Arg.getAsStructuralValue(), computation)); + continue; + case TemplateArgument::Template: case TemplateArgument::TemplateExpansion: if (TemplateDecl *Template = diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index b1678479888eb..56e8ee923e1ee 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -4833,9 +4833,23 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, E = cast(E)->getSubExpr(); goto recurse; - case Expr::SubstNonTypeTemplateParmExprClass: + case Expr::SubstNonTypeTemplateParmExprClass: { + // Mangle a substituted parameter the same way we mangle the template + // argument. + auto *SNTTPE = cast(E); + if (auto *CE = dyn_cast(SNTTPE->getReplacement())) { + // Pull out the constant value and mangle it as a template argument. + QualType ParamType = SNTTPE->getParameterType(Context.getASTContext()); + assert(CE->hasAPValueResult() && "expected the NTTP to have an APValue"); + mangleValueInTemplateArg(ParamType, CE->getAPValueResult(), false, + /*NeedExactType=*/true); + break; + } + // The remaining cases all happen to be substituted with expressions that + // mangle the same as a corresponding template argument anyway. E = cast(E)->getReplacement(); goto recurse; + } case Expr::UserDefinedLiteralClass: // We follow g++'s approach of mangling a UDL as a call to the literal @@ -6064,6 +6078,11 @@ void CXXNameMangler::mangleTemplateArg(TemplateArgument A, bool NeedExactType) { mangleNullPointer(A.getNullPtrType()); break; } + case TemplateArgument::StructuralValue: + mangleValueInTemplateArg(A.getStructuralValueType(), + A.getAsStructuralValue(), + /*TopLevel=*/true, NeedExactType); + break; case TemplateArgument::Pack: { // ::= J * E Out << 'J'; @@ -6472,7 +6491,20 @@ void CXXNameMangler::mangleValueInTemplateArg(QualType T, const APValue &V, Out << "plcvPcad"; Kind = Offset; } else { - if (!V.getLValuePath().empty() || V.isLValueOnePastTheEnd()) { + // Clang 11 and before mangled an array subject to array-to-pointer decay + // as if it were the declaration itself. 
+ bool IsArrayToPointerDecayMangledAsDecl = false; + if (TopLevel && Ctx.getLangOpts().getClangABICompat() <= + LangOptions::ClangABI::Ver11) { + QualType BType = B.getType(); + IsArrayToPointerDecayMangledAsDecl = + BType->isArrayType() && V.getLValuePath().size() == 1 && + V.getLValuePath()[0].getAsArrayIndex() == 0 && + Ctx.hasSimilarType(T, Ctx.getDecayedType(BType)); + } + + if ((!V.getLValuePath().empty() || V.isLValueOnePastTheEnd()) && + !IsArrayToPointerDecayMangledAsDecl) { NotPrimaryExpr(); // A final conversion to the template parameter's type is usually // folded into the 'so' mangling, but we can't do that for 'void*' diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 8346ad87b409b..36b5bf64f675a 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -337,6 +337,7 @@ class MicrosoftCXXNameMangler { public: enum QualifierMangleMode { QMM_Drop, QMM_Mangle, QMM_Escape, QMM_Result }; + enum class TplArgKind { ClassNTTP, StructuralValue }; MicrosoftCXXNameMangler(MicrosoftMangleContextImpl &C, raw_ostream &Out_) : Context(C), Out(Out_), Structor(nullptr), StructorType(-1), @@ -453,7 +454,7 @@ class MicrosoftCXXNameMangler { const TemplateArgumentList &TemplateArgs); void mangleTemplateArg(const TemplateDecl *TD, const TemplateArgument &TA, const NamedDecl *Parm); - void mangleTemplateArgValue(QualType T, const APValue &V, + void mangleTemplateArgValue(QualType T, const APValue &V, TplArgKind, bool WithScalarType = false); void mangleObjCProtocol(const ObjCProtocolDecl *PD); @@ -1088,7 +1089,7 @@ void MicrosoftCXXNameMangler::mangleUnqualifiedName(GlobalDecl GD, if (const auto *TPO = dyn_cast(ND)) { Out << "?__N"; mangleTemplateArgValue(TPO->getType().getUnqualifiedType(), - TPO->getValue()); + TPO->getValue(), TplArgKind::ClassNTTP); break; } @@ -1604,6 +1605,22 @@ void MicrosoftCXXNameMangler::mangleTemplateArgs( } } +/// If value V (with type T) represents a decayed pointer to the first element +/// of an array, return that array. +static ValueDecl *getAsArrayToPointerDecayedDecl(QualType T, const APValue &V) { + // Must be a pointer... + if (!T->isPointerType() || !V.isLValue() || !V.hasLValuePath() || + !V.getLValueBase()) + return nullptr; + // ... to element 0 of an array. + QualType BaseT = V.getLValueBase().getType(); + if (!BaseT->isArrayType() || V.getLValuePath().size() != 1 || + V.getLValuePath()[0].getAsArrayIndex() != 0) + return nullptr; + return const_cast( + V.getLValueBase().dyn_cast()); +} + void MicrosoftCXXNameMangler::mangleTemplateArg(const TemplateDecl *TD, const TemplateArgument &TA, const NamedDecl *Parm) { @@ -1669,7 +1686,7 @@ void MicrosoftCXXNameMangler::mangleTemplateArg(const TemplateDecl *TD, Out << "$"; auto *TPO = cast(ND); mangleTemplateArgValue(TPO->getType().getUnqualifiedType(), - TPO->getValue()); + TPO->getValue(), TplArgKind::ClassNTTP); } else { mangle(ND, "$1?"); } @@ -1712,6 +1729,27 @@ void MicrosoftCXXNameMangler::mangleTemplateArg(const TemplateDecl *TD, cast(Parm), T); break; } + case TemplateArgument::StructuralValue: + if (ValueDecl *D = getAsArrayToPointerDecayedDecl( + TA.getStructuralValueType(), TA.getAsStructuralValue())) { + // Mangle the result of array-to-pointer decay as if it were a reference + // to the original declaration, to match MSVC's behavior. This can result + // in mangling collisions in some cases! 
+ return mangleTemplateArg( + TD, TemplateArgument(D, TA.getStructuralValueType()), Parm); + } + Out << "$"; + if (cast(Parm) + ->getType() + ->getContainedDeducedType()) { + Out << "M"; + mangleType(TA.getNonTypeTemplateArgumentType(), SourceRange(), QMM_Drop); + } + mangleTemplateArgValue(TA.getStructuralValueType(), + TA.getAsStructuralValue(), + TplArgKind::StructuralValue, + /*WithScalarType=*/false); + break; case TemplateArgument::Expression: mangleExpression(TA.getAsExpr(), cast(Parm)); break; @@ -1754,6 +1792,7 @@ void MicrosoftCXXNameMangler::mangleTemplateArg(const TemplateDecl *TD, void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T, const APValue &V, + TplArgKind TAK, bool WithScalarType) { switch (V.getKind()) { case APValue::None: @@ -1806,7 +1845,7 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T, break; } } else { - if (T->isPointerType()) + if (TAK == TplArgKind::ClassNTTP && T->isPointerType()) Out << "5"; SmallVector EntryTypes; @@ -1850,12 +1889,12 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T, auto *VD = Base.dyn_cast(); if (!VD) break; - Out << "E"; + Out << (TAK == TplArgKind::ClassNTTP ? 'E' : '1'); mangle(VD); for (const std::function &Mangler : EntryManglers) Mangler(); - if (T->isPointerType()) + if (TAK == TplArgKind::ClassNTTP && T->isPointerType()) Out << '@'; } @@ -1869,11 +1908,18 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T, const CXXRecordDecl *RD = T->castAs()->getMostRecentCXXRecordDecl(); const ValueDecl *D = V.getMemberPointerDecl(); - if (T->isMemberDataPointerType()) - mangleMemberDataPointerInClassNTTP(RD, D); - else - mangleMemberFunctionPointerInClassNTTP(RD, - cast_or_null(D)); + if (TAK == TplArgKind::ClassNTTP) { + if (T->isMemberDataPointerType()) + mangleMemberDataPointerInClassNTTP(RD, D); + else + mangleMemberFunctionPointerInClassNTTP(RD, + cast_or_null(D)); + } else { + if (T->isMemberDataPointerType()) + mangleMemberDataPointer(RD, D, ""); + else + mangleMemberFunctionPointer(RD, cast_or_null(D), ""); + } return; } @@ -1885,11 +1931,11 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T, unsigned BaseIndex = 0; for (const CXXBaseSpecifier &B : RD->bases()) - mangleTemplateArgValue(B.getType(), V.getStructBase(BaseIndex++)); + mangleTemplateArgValue(B.getType(), V.getStructBase(BaseIndex++), TAK); for (const FieldDecl *FD : RD->fields()) if (!FD->isUnnamedBitfield()) mangleTemplateArgValue(FD->getType(), - V.getStructField(FD->getFieldIndex()), + V.getStructField(FD->getFieldIndex()), TAK, /*WithScalarType*/ true); Out << '@'; return; @@ -1900,7 +1946,7 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T, mangleType(T, SourceRange(), QMM_Escape); if (const FieldDecl *FD = V.getUnionField()) { mangleUnqualifiedName(FD); - mangleTemplateArgValue(FD->getType(), V.getUnionValue()); + mangleTemplateArgValue(FD->getType(), V.getUnionValue(), TAK); } Out << '@'; return; @@ -1932,7 +1978,7 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T, const APValue &ElemV = I < V.getArrayInitializedElts() ? 
V.getArrayInitializedElt(I) : V.getArrayFiller(); - mangleTemplateArgValue(ElemT, ElemV); + mangleTemplateArgValue(ElemT, ElemV, TAK); Out << '@'; } Out << '@'; @@ -1949,7 +1995,7 @@ void MicrosoftCXXNameMangler::mangleTemplateArgValue(QualType T, mangleType(ElemT, SourceRange(), QMM_Escape); for (unsigned I = 0, N = V.getVectorLength(); I != N; ++I) { const APValue &ElemV = V.getVectorElt(I); - mangleTemplateArgValue(ElemT, ElemV); + mangleTemplateArgValue(ElemT, ElemV, TAK); Out << '@'; } Out << "@@"; diff --git a/clang/lib/AST/ODRHash.cpp b/clang/lib/AST/ODRHash.cpp index c5aa91a81da05..5b98646a1e8dc 100644 --- a/clang/lib/AST/ODRHash.cpp +++ b/clang/lib/AST/ODRHash.cpp @@ -180,6 +180,10 @@ void ODRHash::AddTemplateArgument(TemplateArgument TA) { TA.getAsIntegral().Profile(ID); break; } + case TemplateArgument::StructuralValue: + AddQualType(TA.getStructuralValueType()); + AddStructuralValue(TA.getAsStructuralValue()); + break; case TemplateArgument::Template: case TemplateArgument::TemplateExpansion: AddTemplateName(TA.getAsTemplateOrTemplatePattern()); @@ -1296,3 +1300,66 @@ void ODRHash::AddQualType(QualType T) { void ODRHash::AddBoolean(bool Value) { Bools.push_back(Value); } + +void ODRHash::AddStructuralValue(const APValue &Value) { + ID.AddInteger(Value.getKind()); + + // 'APValue::Profile' uses pointer values to make hash for LValue and + // MemberPointer, but they differ from one compiler invocation to another. + // So, handle them explicitly here. + + switch (Value.getKind()) { + case APValue::LValue: { + const APValue::LValueBase &Base = Value.getLValueBase(); + if (!Base) { + ID.AddInteger(Value.getLValueOffset().getQuantity()); + break; + } + + assert(Base.is()); + AddDecl(Base.get()); + ID.AddInteger(Value.getLValueOffset().getQuantity()); + + bool OnePastTheEnd = Value.isLValueOnePastTheEnd(); + if (Value.hasLValuePath()) { + QualType TypeSoFar = Base.getType(); + for (APValue::LValuePathEntry E : Value.getLValuePath()) { + if (const auto *AT = TypeSoFar->getAsArrayTypeUnsafe()) { + if (const auto *CAT = dyn_cast(AT)) + OnePastTheEnd |= CAT->getSize() == E.getAsArrayIndex(); + TypeSoFar = AT->getElementType(); + } else { + const Decl *D = E.getAsBaseOrMember().getPointer(); + if (const auto *FD = dyn_cast(D)) { + if (FD->getParent()->isUnion()) + ID.AddInteger(FD->getFieldIndex()); + TypeSoFar = FD->getType(); + } else { + TypeSoFar = + D->getASTContext().getRecordType(cast(D)); + } + } + } + } + unsigned Val = 0; + if (Value.isNullPointer()) + Val |= 1 << 0; + if (OnePastTheEnd) + Val |= 1 << 1; + if (Value.hasLValuePath()) + Val |= 1 << 2; + ID.AddInteger(Val); + break; + } + case APValue::MemberPointer: { + const ValueDecl *D = Value.getMemberPointerDecl(); + assert(D); + AddDecl(D); + ID.AddInteger( + D->getASTContext().getMemberPointerPathAdjustment(Value).getQuantity()); + break; + } + default: + Value.Profile(ID); + } +} diff --git a/clang/lib/AST/StmtProfile.cpp b/clang/lib/AST/StmtProfile.cpp index d7b980a585702..dd0838edab7b3 100644 --- a/clang/lib/AST/StmtProfile.cpp +++ b/clang/lib/AST/StmtProfile.cpp @@ -2416,6 +2416,12 @@ void StmtProfiler::VisitTemplateArgument(const TemplateArgument &Arg) { Arg.getAsIntegral().Profile(ID); break; + case TemplateArgument::StructuralValue: + VisitType(Arg.getStructuralValueType()); + // FIXME: Do we need to recursively decompose this ourselves? 
+ Arg.getAsStructuralValue().Profile(ID); + break; + case TemplateArgument::Expression: Visit(Arg.getAsExpr()); break; diff --git a/clang/lib/AST/TemplateBase.cpp b/clang/lib/AST/TemplateBase.cpp index 2ec0b2cce08ca..2bdbeb08ef204 100644 --- a/clang/lib/AST/TemplateBase.cpp +++ b/clang/lib/AST/TemplateBase.cpp @@ -161,8 +161,25 @@ static bool needsAmpersandOnTemplateArg(QualType paramType, QualType argType) { // TemplateArgument Implementation //===----------------------------------------------------------------------===// -TemplateArgument::TemplateArgument(ASTContext &Ctx, const llvm::APSInt &Value, - QualType Type, bool IsDefaulted) { +void TemplateArgument::initFromType(QualType T, bool IsNullPtr, + bool IsDefaulted) { + TypeOrValue.Kind = IsNullPtr ? NullPtr : Type; + TypeOrValue.IsDefaulted = IsDefaulted; + TypeOrValue.V = reinterpret_cast(T.getAsOpaquePtr()); +} + +void TemplateArgument::initFromDeclaration(ValueDecl *D, QualType QT, + bool IsDefaulted) { + assert(D && "Expected decl"); + DeclArg.Kind = Declaration; + DeclArg.IsDefaulted = IsDefaulted; + DeclArg.QT = QT.getAsOpaquePtr(); + DeclArg.D = D; +} + +void TemplateArgument::initFromIntegral(const ASTContext &Ctx, + const llvm::APSInt &Value, + QualType Type, bool IsDefaulted) { Integer.Kind = Integral; Integer.IsDefaulted = IsDefaulted; // Copy the APSInt value into our decomposed form. @@ -181,6 +198,56 @@ TemplateArgument::TemplateArgument(ASTContext &Ctx, const llvm::APSInt &Value, Integer.Type = Type.getAsOpaquePtr(); } +void TemplateArgument::initFromStructural(const ASTContext &Ctx, QualType Type, + const APValue &V, bool IsDefaulted) { + Value.Kind = StructuralValue; + Value.IsDefaulted = IsDefaulted; + Value.Value = new (Ctx) APValue(V); + Ctx.addDestruction(Value.Value); + Value.Type = Type.getAsOpaquePtr(); +} + +TemplateArgument::TemplateArgument(const ASTContext &Ctx, + const llvm::APSInt &Value, QualType Type, + bool IsDefaulted) { + initFromIntegral(Ctx, Value, Type, IsDefaulted); +} + +static const ValueDecl *getAsSimpleValueDeclRef(const ASTContext &Ctx, + QualType T, const APValue &V) { + // Pointers to members are relatively easy. + if (V.isMemberPointer() && V.getMemberPointerPath().empty()) + return V.getMemberPointerDecl(); + + // We model class non-type template parameters as their template parameter + // object declaration. + if (V.isStruct() || V.isUnion()) + return Ctx.getTemplateParamObjectDecl(T, V); + + // Pointers and references with an empty path use the special 'Declaration' + // representation. + if (V.isLValue() && V.hasLValuePath() && V.getLValuePath().empty() && + !V.isLValueOnePastTheEnd()) + return V.getLValueBase().dyn_cast(); + + // Everything else uses the 'structural' representation. + return nullptr; +} + +TemplateArgument::TemplateArgument(const ASTContext &Ctx, QualType Type, + const APValue &V, bool IsDefaulted) { + if (Type->isIntegralOrEnumerationType() && V.isInt()) + initFromIntegral(Ctx, V.getInt(), Type, IsDefaulted); + else if ((V.isLValue() && V.isNullPointer()) || + (V.isMemberPointer() && !V.getMemberPointerDecl())) + initFromType(Type, /*isNullPtr=*/true, IsDefaulted); + else if (const ValueDecl *VD = getAsSimpleValueDeclRef(Ctx, Type, V)) + // FIXME: The Declaration form should expose a const ValueDecl*. 
+ initFromDeclaration(const_cast(VD), Type, IsDefaulted); + else + initFromStructural(Ctx, Type, V, IsDefaulted); +} + TemplateArgument TemplateArgument::CreatePackCopy(ASTContext &Context, ArrayRef Args) { @@ -221,6 +288,7 @@ TemplateArgumentDependence TemplateArgument::getDependence() const { case NullPtr: case Integral: + case StructuralValue: return TemplateArgumentDependence::None; case Expression: @@ -251,6 +319,7 @@ bool TemplateArgument::isPackExpansion() const { case Null: case Declaration: case Integral: + case StructuralValue: case Pack: case Template: case NullPtr: @@ -301,6 +370,9 @@ QualType TemplateArgument::getNonTypeTemplateArgumentType() const { case TemplateArgument::NullPtr: return getNullPtrType(); + + case TemplateArgument::StructuralValue: + return getStructuralValueType(); } llvm_unreachable("Invalid TemplateArgument Kind!"); @@ -334,8 +406,13 @@ void TemplateArgument::Profile(llvm::FoldingSetNodeID &ID, break; case Integral: - getAsIntegral().Profile(ID); getIntegralType().Profile(ID); + getAsIntegral().Profile(ID); + break; + + case StructuralValue: + getStructuralValueType().Profile(ID); + getAsStructuralValue().Profile(ID); break; case Expression: @@ -372,6 +449,16 @@ bool TemplateArgument::structurallyEquals(const TemplateArgument &Other) const { return getIntegralType() == Other.getIntegralType() && getAsIntegral() == Other.getAsIntegral(); + case StructuralValue: { + if (getStructuralValueType() != Other.getStructuralValueType()) + return false; + + llvm::FoldingSetNodeID A, B; + getAsStructuralValue().Profile(A); + Other.getAsStructuralValue().Profile(B); + return A == B; + } + case Pack: if (Args.NumArgs != Other.Args.NumArgs) return false; for (unsigned I = 0, E = Args.NumArgs; I != E; ++I) @@ -398,6 +485,7 @@ TemplateArgument TemplateArgument::getPackExpansionPattern() const { case Declaration: case Integral: + case StructuralValue: case Pack: case Null: case Template: @@ -440,6 +528,10 @@ void TemplateArgument::print(const PrintingPolicy &Policy, raw_ostream &Out, break; } + case StructuralValue: + getAsStructuralValue().printPretty(Out, Policy, getStructuralValueType()); + break; + case NullPtr: // FIXME: Include the type if it's not obvious from the context. Out << "nullptr"; @@ -523,6 +615,9 @@ SourceRange TemplateArgumentLoc::getSourceRange() const { case TemplateArgument::Integral: return getSourceIntegralExpression()->getSourceRange(); + case TemplateArgument::StructuralValue: + return getSourceStructuralValueExpression()->getSourceRange(); + case TemplateArgument::Pack: case TemplateArgument::Null: return SourceRange(); @@ -551,6 +646,18 @@ static const T &DiagTemplateArg(const T &DB, const TemplateArgument &Arg) { case TemplateArgument::Integral: return DB << toString(Arg.getAsIntegral(), 10); + case TemplateArgument::StructuralValue: { + // FIXME: We're guessing at LangOptions! 
+ SmallString<32> Str; + llvm::raw_svector_ostream OS(Str); + LangOptions LangOpts; + LangOpts.CPlusPlus = true; + PrintingPolicy Policy(LangOpts); + Arg.getAsStructuralValue().printPretty(OS, Policy, + Arg.getStructuralValueType()); + return DB << OS.str(); + } + case TemplateArgument::Template: return DB << Arg.getAsTemplate(); diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp index e12b9b50f6e72..66732bba18e2d 100644 --- a/clang/lib/AST/TypeLoc.cpp +++ b/clang/lib/AST/TypeLoc.cpp @@ -586,6 +586,7 @@ void TemplateSpecializationTypeLoc::initializeArgLocs( case TemplateArgument::Integral: case TemplateArgument::Declaration: case TemplateArgument::NullPtr: + case TemplateArgument::StructuralValue: ArgInfos[i] = TemplateArgumentLocInfo(); break; diff --git a/clang/lib/CodeGen/CGDebugInfo.cpp b/clang/lib/CodeGen/CGDebugInfo.cpp index c5b96f93b42c3..0f3f684d61dc9 100644 --- a/clang/lib/CodeGen/CGDebugInfo.cpp +++ b/clang/lib/CodeGen/CGDebugInfo.cpp @@ -2201,6 +2201,14 @@ CGDebugInfo::CollectTemplateParams(std::optional OArgs, TemplateParams.push_back(DBuilder.createTemplateValueParameter( TheCU, Name, TTy, defaultParameter, V)); } break; + case TemplateArgument::StructuralValue: { + QualType T = TA.getStructuralValueType(); + llvm::DIType *TTy = getOrCreateType(T, Unit); + llvm::Constant *V = ConstantEmitter(CGM).emitAbstract( + SourceLocation(), TA.getAsStructuralValue(), T); + TemplateParams.push_back(DBuilder.createTemplateValueParameter( + TheCU, Name, TTy, defaultParameter, V)); + } break; case TemplateArgument::Template: { std::string QualName; llvm::raw_string_ostream OS(QualName); @@ -5401,6 +5409,8 @@ std::string CGDebugInfo::GetName(const Decl *D, bool Qualified) const { // feasible some day. return TA.getAsIntegral().getBitWidth() <= 64 && IsReconstitutableType(TA.getIntegralType()); + case TemplateArgument::StructuralValue: + return false; case TemplateArgument::Type: return IsReconstitutableType(TA.getAsType()); default: diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index d12e85b48d0b0..c5f6b6d3a99f0 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -1516,6 +1516,14 @@ LValue CodeGenFunction::EmitLValue(const Expr *E, return LV; } +static QualType getConstantExprReferredType(const FullExpr *E, + const ASTContext &Ctx) { + const Expr *SE = E->getSubExpr()->IgnoreImplicit(); + if (isa(SE)) + return SE->getType(); + return cast(SE)->getCallReturnType(Ctx)->getPointeeType(); +} + LValue CodeGenFunction::EmitLValueHelper(const Expr *E, KnownNonNull_t IsKnownNonNull) { ApplyDebugLocation DL(*this, E); @@ -1554,9 +1562,7 @@ LValue CodeGenFunction::EmitLValueHelper(const Expr *E, case Expr::ConstantExprClass: { const ConstantExpr *CE = cast(E); if (llvm::Value *Result = ConstantEmitter(*this).tryEmitConstantExpr(CE)) { - QualType RetType = cast(CE->getSubExpr()->IgnoreImplicit()) - ->getCallReturnType(getContext()) - ->getPointeeType(); + QualType RetType = getConstantExprReferredType(CE, getContext()); return MakeNaturalAlignAddrLValue(Result, RetType); } return EmitLValue(cast(E)->getSubExpr(), IsKnownNonNull); diff --git a/clang/lib/Index/USRGeneration.cpp b/clang/lib/Index/USRGeneration.cpp index fb936d9fbf8ab..5acc86191f8f9 100644 --- a/clang/lib/Index/USRGeneration.cpp +++ b/clang/lib/Index/USRGeneration.cpp @@ -12,6 +12,7 @@ #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclTemplate.h" #include "clang/AST/DeclVisitor.h" +#include "clang/AST/ODRHash.h" #include "clang/Basic/FileManager.h" #include 
"clang/Lex/PreprocessingRecord.h" #include "llvm/Support/Path.h" @@ -1051,6 +1052,15 @@ void USRGenerator::VisitTemplateArgument(const TemplateArgument &Arg) { VisitType(Arg.getIntegralType()); Out << Arg.getAsIntegral(); break; + + case TemplateArgument::StructuralValue: { + Out << 'S'; + VisitType(Arg.getStructuralValueType()); + ODRHash Hash{}; + Hash.AddStructuralValue(Arg.getAsStructuralValue()); + Out << Hash.CalculateHash(); + break; + } } } diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index 996f8b57233ba..02b1a045df44c 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -2982,6 +2982,7 @@ addAssociatedClassesAndNamespaces(AssociatedLookup &Result, case TemplateArgument::Integral: case TemplateArgument::Expression: case TemplateArgument::NullPtr: + case TemplateArgument::StructuralValue: // [Note: non-type template arguments do not contribute to the set of // associated namespaces. ] break; diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 6ee5f26d55c3a..697a1c0cfc404 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -6217,7 +6217,15 @@ Sema::EvaluateConvertedConstantExpression(Expr *E, QualType T, APValue &Value, if (Notes.empty()) { // It's a constant expression. - Expr *E = ConstantExpr::Create(Context, Result.get(), Value); + Expr *E = Result.get(); + if (const auto *CE = dyn_cast(E)) { + // We expect a ConstantExpr to have a value associated with it + // by this point. + assert(CE->getResultStorageKind() != ConstantResultStorageKind::None && + "ConstantExpr has no value associated with it"); + } else { + E = ConstantExpr::Create(Context, Result.get(), Value); + } if (!PreNarrowingValue.isAbsent()) Value = std::move(PreNarrowingValue); return E; diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 839d508b911f0..9bfa71dc8bcf1 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -4427,6 +4427,7 @@ static bool isTemplateArgumentTemplateParameter( case TemplateArgument::NullPtr: case TemplateArgument::Integral: case TemplateArgument::Declaration: + case TemplateArgument::StructuralValue: case TemplateArgument::Pack: case TemplateArgument::TemplateExpansion: return false; @@ -5758,6 +5759,7 @@ bool Sema::CheckTemplateArgument( case TemplateArgument::Declaration: case TemplateArgument::Integral: + case TemplateArgument::StructuralValue: case TemplateArgument::NullPtr: // We've already checked this template argument, so just copy // it to the list of converted arguments. @@ -5912,11 +5914,10 @@ bool Sema::CheckTemplateArgument( return true; case TemplateArgument::Declaration: - llvm_unreachable("Declaration argument with template template parameter"); case TemplateArgument::Integral: - llvm_unreachable("Integral argument with template template parameter"); + case TemplateArgument::StructuralValue: case TemplateArgument::NullPtr: - llvm_unreachable("Null pointer argument with template template parameter"); + llvm_unreachable("non-type argument with template template parameter"); case TemplateArgument::Pack: llvm_unreachable("Caller must expand template argument packs"); @@ -7411,44 +7412,9 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, if (ArgResult.isInvalid()) return ExprError(); - // Convert the APValue to a TemplateArgument. 
- switch (Value.getKind()) { - case APValue::None: - assert(ParamType->isNullPtrType()); - SugaredConverted = TemplateArgument(ParamType, /*isNullPtr=*/true); - CanonicalConverted = TemplateArgument(CanonParamType, /*isNullPtr=*/true); - break; - case APValue::Indeterminate: - llvm_unreachable("result of constant evaluation should be initialized"); - break; - case APValue::Int: - assert(ParamType->isIntegralOrEnumerationType()); - SugaredConverted = TemplateArgument(Context, Value.getInt(), ParamType); - CanonicalConverted = - TemplateArgument(Context, Value.getInt(), CanonParamType); - break; - case APValue::MemberPointer: { - assert(ParamType->isMemberPointerType()); - - // FIXME: We need TemplateArgument representation and mangling for these. - if (!Value.getMemberPointerPath().empty()) { - Diag(Arg->getBeginLoc(), - diag::err_template_arg_member_ptr_base_derived_not_supported) - << Value.getMemberPointerDecl() << ParamType - << Arg->getSourceRange(); - return ExprError(); - } - - auto *VD = const_cast(Value.getMemberPointerDecl()); - SugaredConverted = VD ? TemplateArgument(VD, ParamType) - : TemplateArgument(ParamType, /*isNullPtr=*/true); - CanonicalConverted = - VD ? TemplateArgument(cast(VD->getCanonicalDecl()), - CanonParamType) - : TemplateArgument(CanonParamType, /*isNullPtr=*/true); - break; - } - case APValue::LValue: { + // Prior to C++20, enforce restrictions on possible template argument + // values. + if (!getLangOpts().CPlusPlus20 && Value.isLValue()) { // For a non-type template-parameter of pointer or reference type, // the value of the constant expression shall not refer to assert(ParamType->isPointerType() || ParamType->isReferenceType() || @@ -7466,8 +7432,7 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, << Arg->getSourceRange(); return ExprError(); } - // -- a subobject - // FIXME: Until C++20 + // -- a subobject [until C++20] if (Value.hasLValuePath() && Value.getLValuePath().size() == 1 && VD && VD->getType()->isArrayType() && Value.getLValuePath()[0].getAsArrayIndex() == 0 && @@ -7485,37 +7450,13 @@ ExprResult Sema::CheckTemplateArgument(NonTypeTemplateParmDecl *Param, "null reference should not be a constant expression"); assert((!VD || !ParamType->isNullPtrType()) && "non-null value of type nullptr_t?"); - - SugaredConverted = VD ? TemplateArgument(VD, ParamType) - : TemplateArgument(ParamType, /*isNullPtr=*/true); - CanonicalConverted = - VD ? TemplateArgument(cast(VD->getCanonicalDecl()), - CanonParamType) - : TemplateArgument(CanonParamType, /*isNullPtr=*/true); - break; - } - case APValue::Struct: - case APValue::Union: { - // Get or create the corresponding template parameter object. 
- TemplateParamObjectDecl *D = - Context.getTemplateParamObjectDecl(ParamType, Value); - SugaredConverted = TemplateArgument(D, ParamType); - CanonicalConverted = - TemplateArgument(D->getCanonicalDecl(), CanonParamType); - break; } - case APValue::AddrLabelDiff: + + if (Value.isAddrLabelDiff()) return Diag(StartLoc, diag::err_non_type_template_arg_addr_label_diff); - case APValue::FixedPoint: - case APValue::Float: - case APValue::ComplexInt: - case APValue::ComplexFloat: - case APValue::Vector: - case APValue::Array: - return Diag(StartLoc, diag::err_non_type_template_arg_unsupported) - << ParamType; - } + SugaredConverted = TemplateArgument(Context, ParamType, Value); + CanonicalConverted = TemplateArgument(Context, CanonParamType, Value); return ArgResult.get(); } @@ -8099,12 +8040,9 @@ Sema::BuildExpressionFromDeclTemplateArgument(const TemplateArgument &Arg, /// This routine takes care of the mapping from an integral template /// argument (which may have any integral type) to the appropriate /// literal value. -ExprResult -Sema::BuildExpressionFromIntegralTemplateArgument(const TemplateArgument &Arg, - SourceLocation Loc) { - assert(Arg.getKind() == TemplateArgument::Integral && - "Operation is only valid for integral template arguments"); - QualType OrigT = Arg.getIntegralType(); +static Expr *BuildExpressionFromIntegralTemplateArgumentValue( + Sema &S, QualType OrigT, const llvm::APSInt &Int, SourceLocation Loc) { + assert(OrigT->isIntegralOrEnumerationType()); // If this is an enum type that we're instantiating, we need to use an integer // type the same size as the enumerator. We don't want to build an @@ -8120,7 +8058,7 @@ Sema::BuildExpressionFromIntegralTemplateArgument(const TemplateArgument &Arg, CharacterLiteralKind Kind; if (T->isWideCharType()) Kind = CharacterLiteralKind::Wide; - else if (T->isChar8Type() && getLangOpts().Char8) + else if (T->isChar8Type() && S.getLangOpts().Char8) Kind = CharacterLiteralKind::UTF8; else if (T->isChar16Type()) Kind = CharacterLiteralKind::UTF16; @@ -8129,29 +8067,133 @@ Sema::BuildExpressionFromIntegralTemplateArgument(const TemplateArgument &Arg, else Kind = CharacterLiteralKind::Ascii; - E = new (Context) CharacterLiteral(Arg.getAsIntegral().getZExtValue(), - Kind, T, Loc); + E = new (S.Context) CharacterLiteral(Int.getZExtValue(), Kind, T, Loc); } else if (T->isBooleanType()) { - E = CXXBoolLiteralExpr::Create(Context, Arg.getAsIntegral().getBoolValue(), - T, Loc); - } else if (T->isNullPtrType()) { - E = new (Context) CXXNullPtrLiteralExpr(Context.NullPtrTy, Loc); + E = CXXBoolLiteralExpr::Create(S.Context, Int.getBoolValue(), T, Loc); } else { - E = IntegerLiteral::Create(Context, Arg.getAsIntegral(), T, Loc); + E = IntegerLiteral::Create(S.Context, Int, T, Loc); } if (OrigT->isEnumeralType()) { // FIXME: This is a hack. We need a better way to handle substituted // non-type template parameters. 
- E = CStyleCastExpr::Create(Context, OrigT, VK_PRValue, CK_IntegralCast, E, - nullptr, CurFPFeatureOverrides(), - Context.getTrivialTypeSourceInfo(OrigT, Loc), + E = CStyleCastExpr::Create(S.Context, OrigT, VK_PRValue, CK_IntegralCast, E, + nullptr, S.CurFPFeatureOverrides(), + S.Context.getTrivialTypeSourceInfo(OrigT, Loc), Loc, Loc); } return E; } +static Expr *BuildExpressionFromNonTypeTemplateArgumentValue( + Sema &S, QualType T, const APValue &Val, SourceLocation Loc) { + auto MakeInitList = [&](ArrayRef Elts) -> Expr * { + auto *ILE = new (S.Context) InitListExpr(S.Context, Loc, Elts, Loc); + ILE->setType(T); + return ILE; + }; + + switch (Val.getKind()) { + case APValue::AddrLabelDiff: + // This cannot occur in a template argument at all. + case APValue::Array: + case APValue::Struct: + case APValue::Union: + // These can only occur within a template parameter object, which is + // represented as a TemplateArgument::Declaration. + llvm_unreachable("unexpected template argument value"); + + case APValue::Int: + return BuildExpressionFromIntegralTemplateArgumentValue(S, T, Val.getInt(), + Loc); + + case APValue::Float: + return FloatingLiteral::Create(S.Context, Val.getFloat(), /*IsExact=*/true, + T, Loc); + + case APValue::FixedPoint: + return FixedPointLiteral::CreateFromRawInt( + S.Context, Val.getFixedPoint().getValue(), T, Loc, + Val.getFixedPoint().getScale()); + + case APValue::ComplexInt: { + QualType ElemT = T->castAs()->getElementType(); + return MakeInitList({BuildExpressionFromIntegralTemplateArgumentValue( + S, ElemT, Val.getComplexIntReal(), Loc), + BuildExpressionFromIntegralTemplateArgumentValue( + S, ElemT, Val.getComplexIntImag(), Loc)}); + } + + case APValue::ComplexFloat: { + QualType ElemT = T->castAs()->getElementType(); + return MakeInitList( + {FloatingLiteral::Create(S.Context, Val.getComplexFloatReal(), true, + ElemT, Loc), + FloatingLiteral::Create(S.Context, Val.getComplexFloatImag(), true, + ElemT, Loc)}); + } + + case APValue::Vector: { + QualType ElemT = T->castAs()->getElementType(); + llvm::SmallVector Elts; + for (unsigned I = 0, N = Val.getVectorLength(); I != N; ++I) + Elts.push_back(BuildExpressionFromNonTypeTemplateArgumentValue( + S, ElemT, Val.getVectorElt(I), Loc)); + return MakeInitList(Elts); + } + + case APValue::None: + case APValue::Indeterminate: + llvm_unreachable("Unexpected APValue kind."); + case APValue::LValue: + case APValue::MemberPointer: + // There isn't necessarily a valid equivalent source-level syntax for + // these; in particular, a naive lowering might violate access control. + // So for now we lower to a ConstantExpr holding the value, wrapped around + // an OpaqueValueExpr. + // FIXME: We should have a better representation for this. 
+ ExprValueKind VK = VK_PRValue; + if (T->isReferenceType()) { + T = T->getPointeeType(); + VK = VK_LValue; + } + auto *OVE = new (S.Context) OpaqueValueExpr(Loc, T, VK); + return ConstantExpr::Create(S.Context, OVE, Val); + } + llvm_unreachable("Unhandled APValue::ValueKind enum"); +} + +ExprResult +Sema::BuildExpressionFromNonTypeTemplateArgument(const TemplateArgument &Arg, + SourceLocation Loc) { + switch (Arg.getKind()) { + case TemplateArgument::Null: + case TemplateArgument::Type: + case TemplateArgument::Template: + case TemplateArgument::TemplateExpansion: + case TemplateArgument::Pack: + llvm_unreachable("not a non-type template argument"); + + case TemplateArgument::Expression: + return Arg.getAsExpr(); + + case TemplateArgument::NullPtr: + case TemplateArgument::Declaration: + return BuildExpressionFromDeclTemplateArgument( + Arg, Arg.getNonTypeTemplateArgumentType(), Loc); + + case TemplateArgument::Integral: + return BuildExpressionFromIntegralTemplateArgumentValue( + *this, Arg.getIntegralType(), Arg.getAsIntegral(), Loc); + + case TemplateArgument::StructuralValue: + return BuildExpressionFromNonTypeTemplateArgumentValue( + *this, Arg.getStructuralValueType(), Arg.getAsStructuralValue(), Loc); + } + llvm_unreachable("Unhandled TemplateArgument::ArgKind enum"); +} + /// Match two template parameters within template parameter lists. static bool MatchTemplateParameterKind( Sema &S, NamedDecl *New, diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 015b0abaf0e5e..e9e7ab5bb6698 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -268,6 +268,16 @@ checkDeducedTemplateArguments(ASTContext &Context, // All other combinations are incompatible. return DeducedTemplateArgument(); + case TemplateArgument::StructuralValue: + // If we deduced a value and a dependent expression, keep the value. + if (Y.getKind() == TemplateArgument::Expression || + (Y.getKind() == TemplateArgument::StructuralValue && + X.structurallyEquals(Y))) + return X; + + // All other combinations are incompatible. 
+ return DeducedTemplateArgument(); + case TemplateArgument::Template: if (Y.getKind() == TemplateArgument::Template && Context.hasSameTemplateName(X.getAsTemplate(), Y.getAsTemplate())) @@ -2300,27 +2310,45 @@ DeduceTemplateArguments(Sema &S, TemplateParameterList *TemplateParams, Info.SecondArg = A; return Sema::TDK_NonDeducedMismatch; + case TemplateArgument::StructuralValue: + if (A.getKind() == TemplateArgument::StructuralValue && + A.structurallyEquals(P)) + return Sema::TDK_Success; + + Info.FirstArg = P; + Info.SecondArg = A; + return Sema::TDK_NonDeducedMismatch; + case TemplateArgument::Expression: if (const NonTypeTemplateParmDecl *NTTP = getDeducedParameterFromExpr(Info, P.getAsExpr())) { - if (A.getKind() == TemplateArgument::Integral) + switch (A.getKind()) { + case TemplateArgument::Integral: + case TemplateArgument::Expression: + case TemplateArgument::StructuralValue: return DeduceNonTypeTemplateArgument( - S, TemplateParams, NTTP, A.getAsIntegral(), A.getIntegralType(), - /*ArrayBound=*/false, Info, Deduced); - if (A.getKind() == TemplateArgument::NullPtr) + S, TemplateParams, NTTP, DeducedTemplateArgument(A), + A.getNonTypeTemplateArgumentType(), Info, Deduced); + + case TemplateArgument::NullPtr: return DeduceNullPtrTemplateArgument(S, TemplateParams, NTTP, A.getNullPtrType(), Info, Deduced); - if (A.getKind() == TemplateArgument::Expression) - return DeduceNonTypeTemplateArgument(S, TemplateParams, NTTP, - A.getAsExpr(), Info, Deduced); - if (A.getKind() == TemplateArgument::Declaration) + + case TemplateArgument::Declaration: return DeduceNonTypeTemplateArgument( S, TemplateParams, NTTP, A.getAsDecl(), A.getParamTypeForDecl(), Info, Deduced); - Info.FirstArg = P; - Info.SecondArg = A; - return Sema::TDK_NonDeducedMismatch; + case TemplateArgument::Null: + case TemplateArgument::Type: + case TemplateArgument::Template: + case TemplateArgument::TemplateExpansion: + case TemplateArgument::Pack: + Info.FirstArg = P; + Info.SecondArg = A; + return Sema::TDK_NonDeducedMismatch; + } + llvm_unreachable("Unknown template argument kind"); } // Can't deduce anything, but that's okay. 
@@ -2505,6 +2533,9 @@ static bool isSameTemplateArg(ASTContext &Context, case TemplateArgument::Integral: return hasSameExtendedValue(X.getAsIntegral(), Y.getAsIntegral()); + case TemplateArgument::StructuralValue: + return X.structurallyEquals(Y); + case TemplateArgument::Expression: { llvm::FoldingSetNodeID XID, YID; X.getAsExpr()->Profile(XID, Context, true); @@ -2585,9 +2616,9 @@ Sema::getTrivialTemplateArgumentLoc(const TemplateArgument &Arg, E); } - case TemplateArgument::Integral: { - Expr *E = - BuildExpressionFromIntegralTemplateArgument(Arg, Loc).getAs(); + case TemplateArgument::Integral: + case TemplateArgument::StructuralValue: { + Expr *E = BuildExpressionFromNonTypeTemplateArgument(Arg, Loc).get(); return TemplateArgumentLoc(TemplateArgument(E), E); } @@ -6449,11 +6480,8 @@ MarkUsedTemplateParameters(ASTContext &Ctx, case TemplateArgument::Null: case TemplateArgument::Integral: case TemplateArgument::Declaration: - break; - case TemplateArgument::NullPtr: - MarkUsedTemplateParameters(Ctx, TemplateArg.getNullPtrType(), OnlyDeduced, - Depth, Used); + case TemplateArgument::StructuralValue: break; case TemplateArgument::Type: diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index fc80515b45e35..e12186d7d82f8 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -1975,9 +1975,8 @@ ExprResult TemplateInstantiator::transformNonTypeTemplateParmRef( } } else if (arg.getKind() == TemplateArgument::Declaration || arg.getKind() == TemplateArgument::NullPtr) { - ValueDecl *VD; if (arg.getKind() == TemplateArgument::Declaration) { - VD = arg.getAsDecl(); + ValueDecl *VD = arg.getAsDecl(); // Find the instantiation of the template argument. This is // required for nested templates. @@ -1985,21 +1984,20 @@ ExprResult TemplateInstantiator::transformNonTypeTemplateParmRef( getSema().FindInstantiatedDecl(loc, VD, TemplateArgs)); if (!VD) return ExprError(); - } else { - // Propagate NULL template argument. - VD = nullptr; } - QualType paramType = VD ? 
arg.getParamTypeForDecl() : arg.getNullPtrType(); + QualType paramType = arg.getNonTypeTemplateArgumentType(); assert(!paramType.isNull() && "type substitution failed for param type"); assert(!paramType->isDependentType() && "param type still dependent"); result = SemaRef.BuildExpressionFromDeclTemplateArgument(arg, paramType, loc); refParam = paramType->isReferenceType(); } else { - result = SemaRef.BuildExpressionFromIntegralTemplateArgument(arg, loc); + QualType paramType = arg.getNonTypeTemplateArgumentType(); + result = SemaRef.BuildExpressionFromNonTypeTemplateArgument(arg, loc); + refParam = paramType->isReferenceType(); assert(result.isInvalid() || SemaRef.Context.hasSameType(result.get()->getType(), - arg.getIntegralType())); + paramType.getNonReferenceType())); } if (result.isInvalid()) diff --git a/clang/lib/Sema/SemaTemplateVariadic.cpp b/clang/lib/Sema/SemaTemplateVariadic.cpp index 0f4e9e7f94c81..4a7872b2cc73c 100644 --- a/clang/lib/Sema/SemaTemplateVariadic.cpp +++ b/clang/lib/Sema/SemaTemplateVariadic.cpp @@ -1112,6 +1112,7 @@ TemplateArgumentLoc Sema::getTemplateArgumentPackExpansionPattern( case TemplateArgument::NullPtr: case TemplateArgument::Template: case TemplateArgument::Integral: + case TemplateArgument::StructuralValue: case TemplateArgument::Pack: case TemplateArgument::Null: return TemplateArgumentLoc(); @@ -1162,6 +1163,7 @@ std::optional Sema::getFullyPackExpandedSize(TemplateArgument Arg) { case TemplateArgument::NullPtr: case TemplateArgument::TemplateExpansion: case TemplateArgument::Integral: + case TemplateArgument::StructuralValue: case TemplateArgument::Pack: case TemplateArgument::Null: return std::nullopt; diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 4463904b07211..2deac7877fcda 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -3908,6 +3908,7 @@ class TreeTransform { case TemplateArgument::Null: case TemplateArgument::Integral: case TemplateArgument::Declaration: + case TemplateArgument::StructuralValue: case TemplateArgument::Pack: case TemplateArgument::TemplateExpansion: case TemplateArgument::NullPtr: @@ -4577,7 +4578,8 @@ bool TreeTransform::TransformTemplateArgument( case TemplateArgument::Integral: case TemplateArgument::NullPtr: - case TemplateArgument::Declaration: { + case TemplateArgument::Declaration: + case TemplateArgument::StructuralValue: { // Transform a resolved template argument straight to a resolved template // argument. We get here when substituting into an already-substituted // template type argument during concept satisfaction checking. 
@@ -4604,9 +4606,15 @@ bool TreeTransform::TransformTemplateArgument( else if (Arg.getKind() == TemplateArgument::NullPtr) Output = TemplateArgumentLoc(TemplateArgument(NewT, /*IsNullPtr=*/true), TemplateArgumentLocInfo()); - else + else if (Arg.getKind() == TemplateArgument::Declaration) Output = TemplateArgumentLoc(TemplateArgument(NewD, NewT), TemplateArgumentLocInfo()); + else if (Arg.getKind() == TemplateArgument::StructuralValue) + Output = TemplateArgumentLoc( + TemplateArgument(getSema().Context, NewT, Arg.getAsStructuralValue()), + TemplateArgumentLocInfo()); + else + llvm_unreachable("unexpected template argument kind"); return false; } diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 287f9a0300be5..fe8782a3eb9e7 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -7460,6 +7460,7 @@ ASTRecordReader::readTemplateArgumentLocInfo(TemplateArgument::ArgKind Kind) { case TemplateArgument::Integral: case TemplateArgument::Declaration: case TemplateArgument::NullPtr: + case TemplateArgument::StructuralValue: case TemplateArgument::Pack: // FIXME: Is this right? return TemplateArgumentLocInfo(); diff --git a/clang/lib/Serialization/ASTWriter.cpp b/clang/lib/Serialization/ASTWriter.cpp index a3fa99c3d7e49..03bddfe0f5047 100644 --- a/clang/lib/Serialization/ASTWriter.cpp +++ b/clang/lib/Serialization/ASTWriter.cpp @@ -5609,6 +5609,7 @@ void ASTRecordWriter::AddTemplateArgumentLocInfo( case TemplateArgument::Integral: case TemplateArgument::Declaration: case TemplateArgument::NullPtr: + case TemplateArgument::StructuralValue: case TemplateArgument::Pack: // FIXME: Is this right? break; diff --git a/clang/test/CXX/drs/dr18xx.cpp b/clang/test/CXX/drs/dr18xx.cpp index f2c056e13ecad..37ea450137ac5 100644 --- a/clang/test/CXX/drs/dr18xx.cpp +++ b/clang/test/CXX/drs/dr18xx.cpp @@ -1,7 +1,7 @@ // RUN: %clang_cc1 -std=c++98 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,cxx98 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++11 -triple x86_64-unknown-unknown %s -verify=expected,cxx98-14,cxx11-17,since-cxx11 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++14 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,cxx98-14,cxx11-17,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors -// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,cxx11-17,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors +// RUN: %clang_cc1 -std=c++17 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,cxx11-17,since-cxx11,since-cxx14,cxx17 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++20 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++23 -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors // RUN: %clang_cc1 -std=c++2c -triple x86_64-unknown-unknown %s -verify=expected,since-cxx14,since-cxx17,since-cxx20,since-cxx11,since-cxx14 -fexceptions -Wno-deprecated-builtins -fcxx-exceptions -pedantic-errors @@ -25,7 +25,7 @@ 
template struct S {}; // #dr1801-S S V; // #dr1801-S-i // cxx98-14-error@-1 {{non-type template argument does not refer to any declaration}} // cxx98-14-note@#dr1801-S {{template parameter is declared here}} -// since-cxx17-error@#dr1801-S-i {{non-type template argument refers to subobject '.i'}} +// cxx17-error@#dr1801-S-i {{non-type template argument refers to subobject '.i'}} } namespace dr1802 { // dr1802: 3.1 diff --git a/clang/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp b/clang/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp index a17720c9267cc..7fce615516924 100644 --- a/clang/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp +++ b/clang/test/CXX/temp/temp.arg/temp.arg.nontype/p1.cpp @@ -200,7 +200,9 @@ namespace bad_args { // cxx17-note@-3 {{reinterpret_cast}} X0<__builtin_constant_p(0) ? (int*)1 : (int*)1> x0d; // precxx17-error@-1 {{non-type template argument '(int *)1' is invalid}} - // cxx17-error@-2 {{non-type template argument refers to subobject '(int *)1'}} +#if __cplusplus == 201703L + // cxx17-error@-3 {{non-type template argument refers to subobject '(int *)1'}} +#endif } #endif // CPP11ONLY diff --git a/clang/test/CodeGenCXX/mangle-ms-templates.cpp b/clang/test/CodeGenCXX/mangle-ms-templates.cpp index cefe1e0cb5a55..ec4250edf314f 100644 --- a/clang/test/CodeGenCXX/mangle-ms-templates.cpp +++ b/clang/test/CodeGenCXX/mangle-ms-templates.cpp @@ -2,6 +2,21 @@ // RUN: %clang_cc1 -std=c++11 -fms-compatibility-version=19 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=x86_64-pc-win32 | FileCheck -check-prefix X64 %s // RUN: %clang_cc1 -std=c++17 -fms-compatibility-version=19 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=i386-pc-win32 | FileCheck %s // RUN: %clang_cc1 -std=c++17 -fms-compatibility-version=19 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=x86_64-pc-win32 | FileCheck -check-prefix X64 %s +// RUN: %clang_cc1 -std=c++20 -fms-compatibility-version=19 -emit-llvm %s -o - -fms-extensions -fdelayed-template-parsing -triple=x86_64-pc-win32 | FileCheck -check-prefix CXX20-X64 %s + +// Check that array-to-pointer decay is mangled as the underlying declaration. +extern const char arr[4] = "foo"; +template struct Decay1 {}; +// CHECK: "?decay1@@3U?$Decay1@$1?arr@@3QBDB@@A" +Decay1 decay1; +#if __cplusplus >= 201702L +// Note that this mangling approach can lead to collisions. 
+template struct Decay2 {}; +// CXX20-X64: "?decay2a@@3U?$Decay2@$1?arr@@3QBDB@@A" +Decay2<(const void*)arr> decay2a; +// CXX20-X64: "?decay2b@@3U?$Decay2@$1?arr@@3QBDB@@A" +Decay2<(const void*)&arr> decay2b; +#endif template class Class { @@ -327,3 +342,36 @@ void fun_uint128(UInt128<(unsigned __int128)-1>) {} // X64: define {{.*}} @"?fun_uint128@@YAXU?$UInt128@$0DPPPPPPPPPPPPPPPAAAAAAAAAAAAAAAB@@@@Z"( void fun_uint128(UInt128<(unsigned __int128)9223372036854775807 * (unsigned __int128)9223372036854775807>) {} #endif + +#if __cplusplus >= 202002L + +template struct Float {}; +// CXX20-X64: define {{.*}} @"?f@@YAXU?$Float@$ADPIAAAAA@@@@Z"( +void f(Float<1.0f>) {} +template struct Auto {}; +// CXX20-X64: define {{.*}} @"?f@@YAXU?$Auto@$MMADPIAAAAA@@@@Z"( +void f(Auto<1.0f>) {} + +struct S2 { + int arr[2][3]; + int i; + void fn(); +} s2; + +template struct TplSubobjectRef {}; +// CXX20-X64: define {{.*}} @"?f@@YAXU?$TplSubobjectRef@$CC61?s2@@3US2@@Aarr@@00@01@@@@Z"( +void f(TplSubobjectRef) {} +template struct TplSubobjectPtr {}; +// CXX20-X64: define {{.*}} @"?f@@YAXU?$TplSubobjectPtr@$CC61?s2@@3US2@@Aarr@@00@01@@@@Z"( +void f(TplSubobjectPtr<&s2.arr[1][2]>) {} + +struct Derived : S2 {}; + +template struct TplMemberPtr {}; +// CXX20-X64: define {{.*}} @"?f@@YAXU?$TplMemberPtr@$0BI@@@@Z"( +void f(TplMemberPtr<(int Derived::*)&S2::i>) {} +template struct TplMemberFnPtr {}; +// CXX20-X64: define {{.*}} @"?f@@YAXU?$TplMemberFnPtr@$1?fn@S2@@QEAAXXZ@@@Z"( +void f(TplMemberFnPtr<(void (Derived::*)())&S2::fn>) {} + +#endif diff --git a/clang/test/CodeGenCXX/mangle-template.cpp b/clang/test/CodeGenCXX/mangle-template.cpp index 8415bacbb9fc6..a4cb22739ac2a 100644 --- a/clang/test/CodeGenCXX/mangle-template.cpp +++ b/clang/test/CodeGenCXX/mangle-template.cpp @@ -225,6 +225,16 @@ namespace test16 { namespace cxx20 { template struct A {}; template struct B {}; + // CXX20: define {{.*}} @_ZN5cxx201fENS_1AILf3f800000EEE( + void f(A<1.0f>) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1AILd3ff0000000000000EEE( + void f(A<1.0>) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1AILe3fff8000000000000000EEE( + void f(A<1.0l>) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1AIXtlCiLi0ELi1EEEEE( + void f(A<1i>) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1AIXtlCdLd0000000000000000ELd3ff0000000000000EEEEE( + void f(A<1.0i>) {} int x; // CXX20: define {{.*}} @_ZN5cxx201fENS_1AIXadL_ZNS_1xEEEEE( @@ -244,7 +254,24 @@ namespace cxx20 { // CXX20: define {{.*}} @_ZN5cxx201fENS_1BIPKvXadL_ZNS_1xEEEEE( void f(B) {} - struct Q { int x; }; + struct Q { int x; } q; + + // CXX20: define {{.*}} @_ZN5cxx201fENS_1AIXadsoiL_ZNS_1qEEEEEE( + void f(A<&q.x>) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1BIPiXadsoiL_ZNS_1qEEEEEE( + void f(B) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1AIXadsoKiL_ZNS_1qEEEEEE( + void f(A<(const int*)&q.x>) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1BIPKiXadsoS1_L_ZNS_1qEEEEEE + void f(B) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1AIXcvPvadsoiL_ZNS_1qEEEEEE( + void f(A<(void*)&q.x>) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1BIPvXadsoiL_ZNS_1qEEEEEE( + void f(B) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1AIXcvPKvadsoiL_ZNS_1qEEEEEE( + void f(A<(const void*)&q.x>) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1BIPKvXadsoiL_ZNS_1qEEEEEE( + void f(B) {} // CXX20: define {{.*}} @_ZN5cxx201fENS_1AIXadL_ZNS_1Q1xEEEEE( void f(A<&Q::x>) {} @@ -254,6 +281,17 @@ namespace cxx20 { void f(A<(const int Q::*)&Q::x>) {} // CXX20: define {{.*}} @_ZN5cxx201fENS_1BIMNS_1QEKiXadL_ZNS1_1xEEEEE( void f(B) {} + + 
struct R : Q {}; + + // CXX20: define {{.*}} @_ZN5cxx201fENS_1AIXmcMNS_1REiadL_ZNS_1Q1xEEEEEE( + void f(A<(int R::*)&Q::x>) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1BIMNS_1REiXmcS2_adL_ZNS_1Q1xEEEEEE( + void f(B) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1AIXmcMNS_1REKiadL_ZNS_1Q1xEEEEEE( + void f(A<(const int R::*)&Q::x>) {} + // CXX20: define {{.*}} @_ZN5cxx201fENS_1BIMNS_1REKiXmcS3_adL_ZNS_1Q1xEEEEEE( + void f(B) {} } #endif diff --git a/clang/test/CodeGenCXX/template-arguments.cpp b/clang/test/CodeGenCXX/template-arguments.cpp new file mode 100644 index 0000000000000..14a454937122a --- /dev/null +++ b/clang/test/CodeGenCXX/template-arguments.cpp @@ -0,0 +1,113 @@ +// RUN: %clang_cc1 -std=c++20 %s -emit-llvm -o - -triple x86_64-linux -DCONSTEXPR= | FileCheck %s +// RUN: %clang_cc1 -std=c++20 %s -emit-llvm -o - -triple x86_64-linux -DCONSTEXPR=constexpr | FileCheck %s --check-prefix=CONST + +template CONSTEXPR T id(T v) { return v; } +template auto value = id(V); + +// CHECK: call {{.*}} @_Z2idIiET_S0_(i32 noundef 1) +// CONST: @_Z5valueILi1EE = weak_odr {{.*}} i32 1, +template int value<1>; + +// CHECK: call {{.*}} @_Z2idIyET_S0_(i64 noundef -1) +// CONST: @_Z5valueILy18446744073709551615EE = weak_odr {{.*}} i64 -1, +template unsigned long long value<-1ULL>; + +// CHECK: call {{.*}} @_Z2idIfET_S0_(float noundef 1.000000e+00) +// CONST: @_Z5valueILf3f800000EE = weak_odr {{.*}} float 1.000000e+00, +template float value<1.0f>; +// CHECK: call {{.*}} @_Z2idIdET_S0_(double noundef 1.000000e+00) +// CONST: @_Z5valueILd3ff0000000000000EE = weak_odr {{.*}} double 1.000000e+00, +template double value<1.0>; + +enum E{ E1, E2}; + +// CHECK: call {{.*}} @_Z2idI1EET_S1_(i32 noundef 1) +// CONST: @_Z5valueIL1E1EE = weak_odr {{.*}} i32 1, +template E value; + +int n; +// CHECK: call {{.*}} @_Z2idIPiET_S1_(ptr noundef @n) +// CONST: @_Z5valueIXadL_Z1nEEE = weak_odr {{.*}} ptr @n, +template int *value<&n>; + +struct A { int a[3]; } a; +// CHECK: call {{.*}} @_Z2idIPiET_S1_(ptr noundef @a) +// CONST: @_Z5valueIXadsoiL_Z1aEEEE = weak_odr {{.*}} ptr @a, +template int *value<&a.a[0]>; +// CHECK: call {{.*}} @_Z2idIPiET_S1_(ptr noundef getelementptr (i8, ptr @a, i64 4)) +// CONST: @_Z5valueIXadsoiL_Z1aE4EEE = weak_odr {{.*}} ptr getelementptr (i8, ptr @a, i64 4), +template int *value<&a.a[1]>; +// CHECK: call {{.*}} @_Z2idIPiET_S1_(ptr noundef getelementptr (i8, ptr @a, i64 8)) +// CONST: @_Z5valueIXadsoiL_Z1aE8EEE = weak_odr {{.*}} ptr getelementptr (i8, ptr @a, i64 8), +template int *value<&a.a[2]>; +// CHECK: call {{.*}} @_Z2idIPiET_S1_(ptr noundef getelementptr (i8, ptr @a, i64 12)) +// CONST: @_Z5valueIXadsoiL_Z1aE12pEEE = weak_odr {{.*}} ptr getelementptr (i8, ptr @a, i64 12), +template int *value<&a.a[3]>; + +union U { + int x, y; + union { + int x, y; + } internal; +} u; + +// CHECK: call {{.*}} @_Z2idIPiET_S1_(ptr noundef @u) +// CONST: @_Z5valueIXadsoiL_Z1uE_EEE = weak_odr {{.*}} ptr @u, +template int *value<&u.x>; +// CHECK: call {{.*}} @_Z2idIPiET_S1_(ptr noundef @u) +// CONST: @_Z5valueIXadsoiL_Z1uE_0EEE = weak_odr {{.*}} ptr @u, +template int *value<&u.y>; +// CHECK: call {{.*}} @_Z2idIPiET_S1_(ptr noundef @u) +// CONST: @_Z5valueIXadsoiL_Z1uE_1_0EEE = weak_odr {{.*}} ptr @u, +template int *value<&u.internal.y>; + +struct B { int x, y; }; +// CHECK: call {{.*}} @_Z2idIM1BiET_S2_(i64 0) +// CONST: @_Z5valueIXadL_ZN1B1xEEEE = weak_odr {{.*}} i64 0, +template int B::*value<&B::x>; +// CHECK: call {{.*}} @_Z2idIM1BiET_S2_(i64 4) +// CONST: @_Z5valueIXadL_ZN1B1yEEEE = weak_odr {{.*}} i64 4, 
+template int B::*value<&B::y>; + +struct C : A, B { int z; }; +// CHECK: call {{.*}} @_Z2idIM1CiET_S2_(i64 12) +// CONST: @_Z5valueIXmcM1CiadL_ZN1B1xEE12EEE = weak_odr {{.*}} i64 12, +template int C::*value<(int C::*)&B::x>; +// CHECK: call {{.*}} @_Z2idIM1BiET_S2_(i64 8) +// CONST: @_Z5valueIXmcM1BiadL_ZN1C1zEEn12EEE = weak_odr {{.*}} i64 8, +template int B::*value<(int B::*)&C::z>; + +// CHECK: store i32 1, +// CHECK: store i32 2, +// CHECK: load i64, +// CHECK: call {{.*}} @_Z2idICiET_S1_(i64 noundef % +// CONST: @_Z5valueIXtlCiLi1ELi2EEEE = weak_odr {{.*}} { i32, i32 } { i32 1, i32 2 }, +template _Complex int value<1 + 2j>; + +// CHECK: store float 1.000000e+00, +// CHECK: store float 2.000000e+00, +// CHECK: load <2 x float>, +// CHECK: call {{.*}} @_Z2idICfET_S1_(<2 x float> noundef % +// CONST: @_Z5valueIXtlCfLf3f800000ELf40000000EEEE = weak_odr {{.*}} { float, float } { float 1.000000e+00, float 2.000000e+00 }, +template _Complex float value<1.0f + 2.0fj>; + +using V3i __attribute__((ext_vector_type(3))) = int; +// CHECK: call {{.*}} @_Z2idIDv3_iET_S1_(<3 x i32> noundef ) +// CONST: @_Z5valueIXtlDv3_iLi1ELi2ELi3EEEE = weak_odr {{.*}} <3 x i32> +template V3i value; + +using V3f [[gnu::vector_size(12)]] = float; +// CHECK: call {{.*}} @_Z2idIDv3_fET_S1_(<3 x float> noundef ) +// CONST: @_Z5valueIXtlDv3_fLf3f800000ELf40000000ELf40400000EEEE = weak_odr {{.*}} <3 x float> +template V3f value; + + +template +void setByRef() { + i = 1; +} + +void callSetByRefWithSubobject() { + // CHECK: store i32 1, ptr getelementptr (i8, ptr @a, i64 4) + setByRef(); +} diff --git a/clang/test/Index/USR/structural-value-tpl-arg.cpp b/clang/test/Index/USR/structural-value-tpl-arg.cpp new file mode 100644 index 0000000000000..8130012451691 --- /dev/null +++ b/clang/test/Index/USR/structural-value-tpl-arg.cpp @@ -0,0 +1,23 @@ +// RUN: c-index-test -test-load-source-usrs local -std=c++20 -- %s | FileCheck %s + +// Check USRs of template specializations with structural NTTP values. 
+ +template struct Tpl{}; + +struct { + int n; +} s; + +void fn1(Tpl<1.5>); +// CHECK: fn1#$@S@Tpl>#Sd[[#HASH:]]# +void fn2(Tpl<1.7>); +// CHECK-NOT: [[#HASH]] +void fn1(Tpl<1.5>) {} +// CHECK: fn1#$@S@Tpl>#Sd[[#HASH]]# + +void fn(Tpl<&s.n>); +// CHECK: #S*I[[#HASH:]]# +void fn(Tpl<(void*)&s.n>); +// CHECK: #S*v[[#HASH]]# +void fn(Tpl<&s.n>) {} +// CHECK: #S*I[[#HASH]]# diff --git a/clang/test/Modules/odr_hash.cpp b/clang/test/Modules/odr_hash.cpp index 220ef767df849..fa8b2c81ab46e 100644 --- a/clang/test/Modules/odr_hash.cpp +++ b/clang/test/Modules/odr_hash.cpp @@ -13,8 +13,8 @@ // RUN: cat %s >> %t/Inputs/second.h // Test that each header can compile -// RUN: %clang_cc1 -fsyntax-only -x c++ -std=c++1z %t/Inputs/first.h -// RUN: %clang_cc1 -fsyntax-only -x c++ -std=c++1z %t/Inputs/second.h +// RUN: %clang_cc1 -fsyntax-only -x c++ -std=c++20 %t/Inputs/first.h +// RUN: %clang_cc1 -fsyntax-only -x c++ -std=c++20 %t/Inputs/second.h // Build module map file // RUN: echo "module FirstModule {" >> %t/Inputs/module.modulemap @@ -25,7 +25,7 @@ // RUN: echo "}" >> %t/Inputs/module.modulemap // Run test -// RUN: %clang_cc1 -triple x86_64-linux-gnu -x c++ -std=c++1z \ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -x c++ -std=c++20 \ // RUN: -fmodules -fimplicit-module-maps -fmodules-cache-path=%t/cache \ // RUN: -I%t/Inputs -verify %s @@ -2093,6 +2093,193 @@ struct S21 { S21 s21; #endif +#if defined(FIRST) +struct S22 { + template void f(){}; + template <> void f<1.5>(){}; +}; +#elif defined(SECOND) +struct S22 { + template void f(){}; + template <> void f<1.7>(){}; +}; +#else +S22 s22; +// expected-error@second.h:* {{'TemplateArgument::S22' has different definitions in different modules; first difference is definition in module 'SecondModule' found method 'f' with 1.700000e+00 for 1st template argument}} +// expected-note@first.h:* {{but in 'FirstModule' found method 'f' with 1.500000e+00 for 1st template argument}} +#endif + +#if defined(FIRST) +struct S23 { + template void f(){}; + template <> void f<2.7>(){}; +}; +#elif defined(SECOND) +struct S23 { + template void f(){}; + template <> void f<2.7>(){}; +}; +#else +S23 s23; +#endif + +#if defined(FIRST) || defined(SECOND) +struct Composite { + int n1[4]; + int n2[4]; +}; +extern Composite composite; +#endif + +#if defined(FIRST) +struct S24 { + template void f(){}; + template <> void f(){}; +}; +#elif defined(SECOND) +struct S24 { + template void f(){}; + template <> void f(){}; +}; +#else +S24 s24; +// expected-error@second.h:* {{'TemplateArgument::S24' has different definitions in different modules; first difference is definition in module 'SecondModule' found method 'f' with composite.n1[2] for 1st template argument}} +// expected-note@first.h:* {{but in 'FirstModule' found method 'f' with composite.n1[1] for 1st template argument}} +#endif + +#if defined(FIRST) || defined(SECOND) +struct S25 { + template void f(); + template <> void f(); +}; +#else +S25 s25; +#endif + +#if defined(FIRST) +struct S26 { + template void f(){}; + template <> void f<&composite.n1[4]>(){}; // Past-the-end pointer. 
+}; +#elif defined(SECOND) +struct S26 { + template void f(){}; + template <> void f<&composite.n2[0]>(){}; +}; +#else +S26 s26; +// expected-error@second.h:* {{'TemplateArgument::S26' has different definitions in different modules; first difference is definition in module 'SecondModule' found method 'f' with &composite.n2[0] for 1st template argument}} +// expected-note@first.h:* {{but in 'FirstModule' found method 'f' with &composite.n1[4] for 1st template argument}} +#endif + +#if defined(FIRST) || defined(SECOND) +union Union { + int i1; + int i2; +}; +extern Union u; +#endif + +#if defined(FIRST) +struct S27 { + template void f(){}; + template <> void f(){}; +}; +#elif defined(SECOND) +struct S27 { + template void f(){}; + template <> void f(){}; +}; +#else +S27 s27; +// expected-error@second.h:* {{'TemplateArgument::S27' has different definitions in different modules; first difference is definition in module 'SecondModule' found method 'f' with u.i2 for 1st template argument}} +// expected-note@first.h:* {{but in 'FirstModule' found method 'f' with u.i1 for 1st template argument}} +#endif + +#if defined(FIRST) || defined(SECOND) +struct S28 { + template void f(){}; + template <> void f(){}; +}; +#else +S28 s28; +#endif + +#if defined(FIRST) || defined(SECOND) +struct A { + int a; +}; +struct B : A {}; +struct C : A {}; +struct D : B, C {}; +#endif + +#if defined(FIRST) +struct S29 { + template void f(){}; + template <> void f<(int D::*)(int B::*)&A::a>(){}; +}; +#elif defined(SECOND) +struct S29 { + template void f(){}; + template <> void f<(int D::*)(int C::*)&A::a>(){}; +}; +#else +S29 s29; +// expected-error@second.h:* {{'TemplateArgument::S29' has different definitions in different modules; first difference is definition in module 'SecondModule' found method 'f' with &A::a for 1st template argument}} +// expected-note@first.h:* {{but in 'FirstModule' found method 'f' with &A::a for 1st template argument}} +#endif + +#if defined(FIRST) || defined(SECOND) +struct S30 { + template void f(){}; + template <> void f<(int D::*)(int B::*)&A::a>(){}; +}; +#else +S30 s30; +#endif + +#if defined(FIRST) +struct S31 { + template void f(){}; + template <> void f<&composite.n1[2]>(){}; +}; +#elif defined(SECOND) +struct S31 { + template void f(){}; + template <> void f<(void*)&composite.n1[2]>(){}; +}; +#else +S31 s31; +// expected-error@second.h:* {{'TemplateArgument::S31' has different definitions in different modules; first difference is definition in module 'SecondModule' found method 'f' with &composite.n1[2] for 1st template argument}} +// expected-note@first.h:* {{but in 'FirstModule' found method 'f' with &composite.n1[2] for 1st template argument}} +#endif + +#if defined(FIRST) +struct S32 { + template void f(){}; + template <> void f<__builtin_constant_p(0) ? (int*)1 : (int*)1>(){}; +}; +#elif defined(SECOND) +struct S32 { + template void f(){}; + template <> void f<__builtin_constant_p(0) ? (int*)2 : (int*)2>(){}; +}; +#else +S32 s32; +// expected-error@second.h:* {{'TemplateArgument::S32' has different definitions in different modules; first difference is definition in module 'SecondModule' found method 'f' with (int *)2 for 1st template argument}} +// expected-note@first.h:* {{but in 'FirstModule' found method 'f' with (int *)1 for 1st template argument}} +#endif + +#if defined(FIRST) || defined(SECOND) +struct S33 { + template void f(){}; + template <> void f<__builtin_constant_p(0) ? 
(int*)1 : (int*)1>(){}; +}; +#else +S33 s33; +#endif + #define DECLS \ OneClass a; \ OneInt<1> b; \ diff --git a/clang/test/SemaCXX/warn-bool-conversion.cpp b/clang/test/SemaCXX/warn-bool-conversion.cpp index 7b671c52b5b9c..c81d52d864f2d 100644 --- a/clang/test/SemaCXX/warn-bool-conversion.cpp +++ b/clang/test/SemaCXX/warn-bool-conversion.cpp @@ -186,6 +186,7 @@ namespace macros { } } +#if __cplusplus < 201703L namespace Template { // FIXME: These cases should not warn. template void f() { if (p) {} } // expected-warning 2{{will always evaluate to 'true'}} expected-cxx11-warning {{implicit conversion of nullptr}} @@ -205,3 +206,4 @@ namespace Template { #endif template void h(); } +#endif // __cplusplus < 201703L diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp index b54b5a8007408..ae06055bf5265 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx1z.cpp @@ -2,7 +2,7 @@ template struct A {}; -template constexpr bool is_same = false; // expected-note +{{here}} +template constexpr bool is_same = false; template constexpr bool is_same = true; namespace String { @@ -84,34 +84,32 @@ namespace PtrMem { constexpr int B::*b = &B::b; constexpr int C::*cb = b; constexpr int D::*db = b; - constexpr int E::*ecb = cb; // expected-note +{{here}} - constexpr int E::*edb = db; // expected-note +{{here}} + constexpr int E::*ecb = cb; + constexpr int E::*edb = db; constexpr int E::*e = &E::e; constexpr int D::*de = (int D::*)e; constexpr int C::*ce = (int C::*)e; - constexpr int B::*bde = (int B::*)de; // expected-note +{{here}} - constexpr int B::*bce = (int B::*)ce; // expected-note +{{here}} + constexpr int B::*bde = (int B::*)de; + constexpr int B::*bce = (int B::*)ce; - // FIXME: This should all be accepted, but we don't yet have a representation - // nor mangling for this form of template argument. 
using Ab = A; using Ab = A; - using Abce = A; // expected-error {{not supported}} - using Abde = A; // expected-error {{not supported}} - static_assert(!is_same, ""); // expected-error {{undeclared}} expected-error {{must be a type}} - static_assert(!is_same, ""); // expected-error {{undeclared}} expected-error {{must be a type}} - static_assert(!is_same, ""); // expected-error 2{{undeclared}} expected-error {{must be a type}} - static_assert(is_same>, ""); // expected-error {{undeclared}} expected-error {{not supported}} + using Abce = A; + using Abde = A; + static_assert(!is_same, ""); + static_assert(!is_same, ""); + static_assert(!is_same, ""); + static_assert(is_same>, ""); using Ae = A; using Ae = A; - using Aecb = A; // expected-error {{not supported}} - using Aedb = A; // expected-error {{not supported}} - static_assert(!is_same, ""); // expected-error {{undeclared}} expected-error {{must be a type}} - static_assert(!is_same, ""); // expected-error {{undeclared}} expected-error {{must be a type}} - static_assert(!is_same, ""); // expected-error 2{{undeclared}} expected-error {{must be a type}} - static_assert(is_same>, ""); // expected-error {{undeclared}} expected-error {{not supported}} + using Aecb = A; + using Aedb = A; + static_assert(!is_same, ""); + static_assert(!is_same, ""); + static_assert(!is_same, ""); + static_assert(is_same>, ""); using An = A; using A0 = A; @@ -205,9 +203,9 @@ namespace Auto { struct Y : X {}; void type_affects_identity(B<&X::n>) {} - void type_affects_identity(B<(int Y::*)&X::n>) {} // FIXME: expected-error {{non-type template argument of pointer-to-member type}} + void type_affects_identity(B<(int Y::*)&X::n>) {} void type_affects_identity(B<(const int X::*)&X::n>) {} - void type_affects_identity(B<(const int Y::*)&X::n>) {} // FIXME: expected-error {{non-type template argument of pointer-to-member type}} + void type_affects_identity(B<(const int Y::*)&X::n>) {} // A case where we need to do auto-deduction, and check whether the // resulting dependent types match during partial ordering. 
These diff --git a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp index e345a1f64b921..b5b8cadc909ce 100644 --- a/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp +++ b/clang/test/SemaTemplate/temp_arg_nontype_cxx20.cpp @@ -8,8 +8,8 @@ namespace std { // floating-point arguments template struct Float {}; -using F1 = Float<1.0f>; // FIXME expected-error {{non-type template argument of type}} -using F1 = Float<2.0f / 2>; // FIXME expected-error {{non-type template argument of type}} +using F1 = Float<1.0f>; +using F1 = Float<2.0f / 2>; struct S { int n[3]; } s; // expected-note 1+{{here}} union U { int a, b; } u; @@ -17,24 +17,28 @@ int n; // expected-note 1+{{here}} // pointers to subobjects template struct IntPtr {}; -using IPn = IntPtr<&n + 1>; // FIXME expected-error {{refers to subobject}} -using IPn = IntPtr<&n + 1>; // FIXME expected-error {{refers to subobject}} +using IPn = IntPtr<&n + 1>; +using IPn = IntPtr<&n + 1>; -using IP2 = IntPtr<&s.n[2]>; // FIXME expected-error {{refers to subobject}} -using IP2 = IntPtr; // FIXME expected-error {{refers to subobject}} +using IPn2 = IntPtr<&n + 2>; // expected-error {{not a constant expression}} expected-note {{cannot refer to element 2 of non-array object}} -using IP3 = IntPtr<&s.n[3]>; // FIXME expected-error {{refers to subobject}} -using IP3 = IntPtr; // FIXME expected-error {{refers to subobject}} +using IP2 = IntPtr<&s.n[2]>; +using IP2 = IntPtr; + +using IP3 = IntPtr<&s.n[3]>; +using IP3 = IntPtr; + +using IP5 = IntPtr<&s.n[5]>; // expected-error {{not a constant expression}} expected-note {{cannot refer to element 5 of array of 3 elements}} template struct IntRef {}; -using IPn = IntRef<*(&n + 1)>; // expected-error {{not a constant expression}} expected-note {{dereferenced pointer past the end of 'n'}} -using IPn = IntRef<*(&n + 1)>; // expected-error {{not a constant expression}} expected-note {{dereferenced pointer past the end of 'n'}} +using IRn = IntRef<*(&n + 1)>; // expected-error {{not a constant expression}} expected-note {{dereferenced pointer past the end of 'n'}} +using IRn = IntRef<*(&n + 1)>; // expected-error {{not a constant expression}} expected-note {{dereferenced pointer past the end of 'n'}} -using IP2 = IntRef; // FIXME expected-error {{refers to subobject}} -using IP2 = IntRef<*(s.n + 2)>; // FIXME expected-error {{refers to subobject}} +using IR2 = IntRef; +using IR2 = IntRef<*(s.n + 2)>; -using IP3 = IntRef; // expected-error {{not a constant expression}} expected-note {{dereferenced pointer past the end of subobject of 's'}} -using IP3 = IntRef<*(s.n + 3)>; // expected-error {{not a constant expression}} expected-note {{dereferenced pointer past the end of subobject of 's'}} +using IR3 = IntRef; // expected-error {{not a constant expression}} expected-note {{dereferenced pointer past the end of subobject of 's'}} +using IR3 = IntRef<*(s.n + 3)>; // expected-error {{not a constant expression}} expected-note {{dereferenced pointer past the end of subobject of 's'}} // classes template struct Struct {}; @@ -48,12 +52,12 @@ using U1 = Union; // expected-error {{different types}} // miscellaneous scalar types template<_Complex int> struct ComplexInt {}; -using CI = ComplexInt<1 + 3i>; // FIXME: expected-error {{non-type template argument of type}} -using CI = ComplexInt<1 + 3i>; // FIXME: expected-error {{non-type template argument of type}} +using CI = ComplexInt<1 + 3i>; +using CI = ComplexInt<3i + 1>; template<_Complex float> struct ComplexFloat {}; 
-using CF = ComplexFloat<1.0f + 3.0fi>; // FIXME: expected-error {{non-type template argument of type}} -using CF = ComplexFloat<1.0f + 3.0fi>; // FIXME: expected-error {{non-type template argument of type}} +using CF = ComplexFloat<1.0f + 3.0fi>; +using CF = ComplexFloat<3.0fi + 1.0f>; namespace ClassNTTP { struct A { // expected-note 2{{candidate}} diff --git a/clang/tools/libclang/CIndex.cpp b/clang/tools/libclang/CIndex.cpp index 00c0dff9cce4c..b03ad6054b9ac 100644 --- a/clang/tools/libclang/CIndex.cpp +++ b/clang/tools/libclang/CIndex.cpp @@ -1582,6 +1582,11 @@ bool CursorVisitor::VisitTemplateArgumentLoc(const TemplateArgumentLoc &TAL) { return Visit(MakeCXCursor(E, StmtParent, TU, RegionOfInterest)); return false; + case TemplateArgument::StructuralValue: + if (Expr *E = TAL.getSourceStructuralValueExpression()) + return Visit(MakeCXCursor(E, StmtParent, TU, RegionOfInterest)); + return false; + case TemplateArgument::NullPtr: if (Expr *E = TAL.getSourceNullPtrExpression()) return Visit(MakeCXCursor(E, StmtParent, TU, RegionOfInterest)); diff --git a/clang/tools/libclang/CXCursor.cpp b/clang/tools/libclang/CXCursor.cpp index fd03c48ba1a42..978adac5521aa 100644 --- a/clang/tools/libclang/CXCursor.cpp +++ b/clang/tools/libclang/CXCursor.cpp @@ -1466,6 +1466,9 @@ enum CXTemplateArgumentKind clang_Cursor_getTemplateArgumentKind(CXCursor C, return CXTemplateArgumentKind_NullPtr; case TemplateArgument::Integral: return CXTemplateArgumentKind_Integral; + case TemplateArgument::StructuralValue: + // FIXME: Expose these values. + return CXTemplateArgumentKind_Invalid; case TemplateArgument::Template: return CXTemplateArgumentKind_Template; case TemplateArgument::TemplateExpansion: diff --git a/clang/www/cxx_status.html b/clang/www/cxx_status.html index 197726f3aa3ee..a48f35eb50483 100755 --- a/clang/www/cxx_status.html +++ b/clang/www/cxx_status.html @@ -637,13 +637,21 @@

C++20 implementation status

-      Class types as non-type template parameters
+      Class types as non-type template parameters
       P0732R2
-      Partial
+      Clang 12
+
+
+      Generalized non-type template parameters of scalar type
+      P1907R1
+
+        Clang 18 (Partial)
+        Reference type template arguments referring to instantiation-dependent objects and subobjects
+        (i.e. declared inside a template but neither type- nor value-dependent) aren't fully supported.
+
+ - - P1907R1 - Destroying operator delete P0722R3 diff --git a/lldb/include/lldb/lldb-enumerations.h b/lldb/include/lldb/lldb-enumerations.h index ed1dec85d4840..392d333c23a44 100644 --- a/lldb/include/lldb/lldb-enumerations.h +++ b/lldb/include/lldb/lldb-enumerations.h @@ -875,6 +875,7 @@ enum TemplateArgumentKind { eTemplateArgumentKindExpression, eTemplateArgumentKindPack, eTemplateArgumentKindNullPtr, + eTemplateArgumentKindStructuralValue, }; /// Type of match to be performed when looking for a formatter for a data type. diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index ba37afbb619fb..6f5ff105477be 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -7168,6 +7168,9 @@ TypeSystemClang::GetTemplateArgumentKind(lldb::opaque_compiler_type_t type, case clang::TemplateArgument::Pack: return eTemplateArgumentKindPack; + + case clang::TemplateArgument::StructuralValue: + return eTemplateArgumentKindStructuralValue; } llvm_unreachable("Unhandled clang::TemplateArgument::ArgKind"); } From cc3fd1974696a792ba70ba670ed761937cd0735c Mon Sep 17 00:00:00 2001 From: Vlad Serebrennikov Date: Mon, 22 Jan 2024 00:42:50 +0400 Subject: [PATCH 305/843] [clang] Remove `CXXNewInitializationStyle::Implicit` (#78793) This is a follow up to https://github.com/llvm/llvm-project/pull/71417 , which aims to resolve concerns brought up there. Namely, this patch replaces `CXXNewInitializationStyle::Implicit` with a dedicated `HasInitializer` flag. This makes `CXXNewInitializationStyle` to model syntax again. This patch also renames `Call` and `List` to less confusing `Parens` and `Braces`. --- .../modernize/MakeSmartPtrCheck.cpp | 7 +++-- clang/include/clang/AST/ExprCXX.h | 19 +++----------- clang/include/clang/AST/Stmt.h | 8 +++--- clang/lib/AST/ExprCXX.cpp | 10 +++---- clang/lib/AST/ItaniumMangle.cpp | 4 +-- clang/lib/AST/JSONNodeDumper.cpp | 5 ++-- clang/lib/AST/StmtPrinter.cpp | 5 ++-- clang/lib/Sema/SemaExprCXX.cpp | 26 ++++++------------- clang/lib/Serialization/ASTReaderStmt.cpp | 1 + clang/lib/Serialization/ASTWriterStmt.cpp | 1 + 10 files changed, 32 insertions(+), 54 deletions(-) diff --git a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp index 616e57efa76de..d1d7e9dcfa9c0 100644 --- a/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp +++ b/clang-tools-extra/clang-tidy/modernize/MakeSmartPtrCheck.cpp @@ -323,8 +323,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, return false; }; switch (New->getInitializationStyle()) { - case CXXNewInitializationStyle::None: - case CXXNewInitializationStyle::Implicit: { + case CXXNewInitializationStyle::None: { if (ArraySizeExpr.empty()) { Diag << FixItHint::CreateRemoval(SourceRange(NewStart, NewEnd)); } else { @@ -335,7 +334,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, } break; } - case CXXNewInitializationStyle::Call: { + case CXXNewInitializationStyle::Parens: { // FIXME: Add fixes for constructors with parameters that can be created // with a C++11 braced-init-list (e.g. std::vector, std::map). 
// Unlike ordinal cases, braced list can not be deduced in @@ -372,7 +371,7 @@ bool MakeSmartPtrCheck::replaceNew(DiagnosticBuilder &Diag, } break; } - case CXXNewInitializationStyle::List: { + case CXXNewInitializationStyle::Braces: { // Range of the substring that we do not want to remove. SourceRange InitRange; if (const auto *NewConstruct = New->getConstructExpr()) { diff --git a/clang/include/clang/AST/ExprCXX.h b/clang/include/clang/AST/ExprCXX.h index 2427801643183..9a7c632c36c5e 100644 --- a/clang/include/clang/AST/ExprCXX.h +++ b/clang/include/clang/AST/ExprCXX.h @@ -2210,14 +2210,11 @@ enum class CXXNewInitializationStyle { /// New-expression has no initializer as written. None, - /// New-expression has no written initializer, but has an implicit one. - Implicit, - /// New-expression has a C++98 paren-delimited initializer. - Call, + Parens, /// New-expression has a C++11 list-initializer. - List + Braces }; /// Represents a new-expression for memory allocation and constructor @@ -2388,17 +2385,7 @@ class CXXNewExpr final bool isGlobalNew() const { return CXXNewExprBits.IsGlobalNew; } /// Whether this new-expression has any initializer at all. - bool hasInitializer() const { - switch (getInitializationStyle()) { - case CXXNewInitializationStyle::None: - return false; - case CXXNewInitializationStyle::Implicit: - case CXXNewInitializationStyle::Call: - case CXXNewInitializationStyle::List: - return true; - } - llvm_unreachable("Unknown initializer"); - } + bool hasInitializer() const { return CXXNewExprBits.HasInitializer; } /// The kind of initializer this new-expression has. CXXNewInitializationStyle getInitializationStyle() const { diff --git a/clang/include/clang/AST/Stmt.h b/clang/include/clang/AST/Stmt.h index e1fde24e64778..55eca4007d17e 100644 --- a/clang/include/clang/AST/Stmt.h +++ b/clang/include/clang/AST/Stmt.h @@ -868,9 +868,11 @@ class alignas(void *) Stmt { LLVM_PREFERRED_TYPE(bool) unsigned UsualArrayDeleteWantsSize : 1; - /// What kind of initializer do we have? Could be none, parens, or braces. - /// In storage, we distinguish between "none, and no initializer expr", and - /// "none, but an implicit initializer expr". + // Is initializer expr present? + LLVM_PREFERRED_TYPE(bool) + unsigned HasInitializer : 1; + + /// What kind of initializer syntax used? Could be none, parens, or braces. 
LLVM_PREFERRED_TYPE(CXXNewInitializationStyle) unsigned StoredInitializationStyle : 2; diff --git a/clang/lib/AST/ExprCXX.cpp b/clang/lib/AST/ExprCXX.cpp index 83af7998f6833..e61c11dffd884 100644 --- a/clang/lib/AST/ExprCXX.cpp +++ b/clang/lib/AST/ExprCXX.cpp @@ -194,14 +194,14 @@ CXXNewExpr::CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, DirectInitRange(DirectInitRange) { assert((Initializer != nullptr || - InitializationStyle == CXXNewInitializationStyle::None || - InitializationStyle == CXXNewInitializationStyle::Implicit) && - "Only NoInit can have no initializer!"); + InitializationStyle == CXXNewInitializationStyle::None) && + "Only CXXNewInitializationStyle::None can have no initializer!"); CXXNewExprBits.IsGlobalNew = IsGlobalNew; CXXNewExprBits.IsArray = ArraySize.has_value(); CXXNewExprBits.ShouldPassAlignment = ShouldPassAlignment; CXXNewExprBits.UsualArrayDeleteWantsSize = UsualArrayDeleteWantsSize; + CXXNewExprBits.HasInitializer = Initializer != nullptr; CXXNewExprBits.StoredInitializationStyle = llvm::to_underlying(InitializationStyle); bool IsParenTypeId = TypeIdParens.isValid(); @@ -219,10 +219,10 @@ CXXNewExpr::CXXNewExpr(bool IsGlobalNew, FunctionDecl *OperatorNew, getTrailingObjects()[0] = TypeIdParens; switch (getInitializationStyle()) { - case CXXNewInitializationStyle::Call: + case CXXNewInitializationStyle::Parens: this->Range.setEnd(DirectInitRange.getEnd()); break; - case CXXNewInitializationStyle::List: + case CXXNewInitializationStyle::Braces: this->Range.setEnd(getInitializer()->getSourceRange().getEnd()); break; default: diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 56e8ee923e1ee..40b1e086ddd0c 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -4897,7 +4897,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, Out << '_'; mangleType(New->getAllocatedType()); if (New->hasInitializer()) { - if (New->getInitializationStyle() == CXXNewInitializationStyle::List) + if (New->getInitializationStyle() == CXXNewInitializationStyle::Braces) Out << "il"; else Out << "pi"; @@ -4912,7 +4912,7 @@ void CXXNameMangler::mangleExpression(const Expr *E, unsigned Arity, for (unsigned i = 0, e = PLE->getNumExprs(); i != e; ++i) mangleExpression(PLE->getExpr(i)); } else if (New->getInitializationStyle() == - CXXNewInitializationStyle::List && + CXXNewInitializationStyle::Braces && isa(Init)) { // Only take InitListExprs apart for list-initialization. 
mangleInitListElements(cast(Init)); diff --git a/clang/lib/AST/JSONNodeDumper.cpp b/clang/lib/AST/JSONNodeDumper.cpp index cb3d3b0df1ee3..3daba13d0fce7 100644 --- a/clang/lib/AST/JSONNodeDumper.cpp +++ b/clang/lib/AST/JSONNodeDumper.cpp @@ -1356,12 +1356,11 @@ void JSONNodeDumper::VisitCXXNewExpr(const CXXNewExpr *NE) { attributeOnlyIfTrue("isPlacement", NE->getNumPlacementArgs() != 0); switch (NE->getInitializationStyle()) { case CXXNewInitializationStyle::None: - case CXXNewInitializationStyle::Implicit: break; - case CXXNewInitializationStyle::Call: + case CXXNewInitializationStyle::Parens: JOS.attribute("initStyle", "call"); break; - case CXXNewInitializationStyle::List: + case CXXNewInitializationStyle::Braces: JOS.attribute("initStyle", "list"); break; } diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index c04cb313c3387..9d4aa07ec4da7 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++ b/clang/lib/AST/StmtPrinter.cpp @@ -2300,9 +2300,8 @@ void StmtPrinter::VisitCXXNewExpr(CXXNewExpr *E) { OS << ")"; CXXNewInitializationStyle InitStyle = E->getInitializationStyle(); - if (InitStyle != CXXNewInitializationStyle::None && - InitStyle != CXXNewInitializationStyle::Implicit) { - bool Bare = InitStyle == CXXNewInitializationStyle::Call && + if (InitStyle != CXXNewInitializationStyle::None) { + bool Bare = InitStyle == CXXNewInitializationStyle::Parens && !isa(E->getInitializer()); if (Bare) OS << "("; diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 8b6a80b45b27b..482afb5a677ec 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -1957,7 +1957,7 @@ static bool isLegalArrayNewInitializer(CXXNewInitializationStyle Style, else if (CXXConstructExpr *CCE = dyn_cast(Init)) return !CCE->isListInitialization() && CCE->getConstructor()->isDefaultConstructor(); - else if (Style == CXXNewInitializationStyle::List) { + else if (Style == CXXNewInitializationStyle::Braces) { assert(isa(Init) && "Shouldn't create list CXXConstructExprs for arrays."); return true; @@ -2011,9 +2011,9 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, CXXNewInitializationStyle InitStyle; if (DirectInitRange.isValid()) { assert(Initializer && "Have parens but no initializer."); - InitStyle = CXXNewInitializationStyle::Call; + InitStyle = CXXNewInitializationStyle::Parens; } else if (Initializer && isa(Initializer)) - InitStyle = CXXNewInitializationStyle::List; + InitStyle = CXXNewInitializationStyle::Braces; else { assert((!Initializer || isa(Initializer) || isa(Initializer)) && @@ -2023,7 +2023,7 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, MultiExprArg Exprs(&Initializer, Initializer ? 1 : 0); if (ParenListExpr *List = dyn_cast_or_null(Initializer)) { - assert(InitStyle == CXXNewInitializationStyle::Call && + assert(InitStyle == CXXNewInitializationStyle::Parens && "paren init for non-call init"); Exprs = MultiExprArg(List->getExprs(), List->getNumExprs()); } @@ -2037,15 +2037,14 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, // initialized (8.5); if no initialization is performed, // the object has indeterminate value case CXXNewInitializationStyle::None: - case CXXNewInitializationStyle::Implicit: return InitializationKind::CreateDefault(TypeRange.getBegin()); // - Otherwise, the new-initializer is interpreted according to the // initialization rules of 8.5 for direct-initialization. 
- case CXXNewInitializationStyle::Call: + case CXXNewInitializationStyle::Parens: return InitializationKind::CreateDirect(TypeRange.getBegin(), DirectInitRange.getBegin(), DirectInitRange.getEnd()); - case CXXNewInitializationStyle::List: + case CXXNewInitializationStyle::Braces: return InitializationKind::CreateDirectList(TypeRange.getBegin(), Initializer->getBeginLoc(), Initializer->getEndLoc()); @@ -2072,14 +2071,13 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, return ExprError(); } else if (Deduced && !Deduced->isDeduced()) { MultiExprArg Inits = Exprs; - bool Braced = (InitStyle == CXXNewInitializationStyle::List); + bool Braced = (InitStyle == CXXNewInitializationStyle::Braces); if (Braced) { auto *ILE = cast(Exprs[0]); Inits = MultiExprArg(ILE->getInits(), ILE->getNumInits()); } - if (InitStyle == CXXNewInitializationStyle::None || - InitStyle == CXXNewInitializationStyle::Implicit || Inits.empty()) + if (InitStyle == CXXNewInitializationStyle::None || Inits.empty()) return ExprError(Diag(StartLoc, diag::err_auto_new_requires_ctor_arg) << AllocType << TypeRange); if (Inits.size() > 1) { @@ -2447,14 +2445,6 @@ ExprResult Sema::BuildCXXNew(SourceRange Range, bool UseGlobal, FullInit = Binder->getSubExpr(); Initializer = FullInit.get(); - // We don't know that we're generating an implicit initializer until now, so - // we have to update the initialization style as well. - // - // FIXME: it would be nice to determine the correct initialization style - // earlier so InitStyle doesn't need adjusting. - if (InitStyle == CXXNewInitializationStyle::None && Initializer) { - InitStyle = CXXNewInitializationStyle::Implicit; - } // FIXME: If we have a KnownArraySize, check that the array bound of the // initializer is no greater than that constant value. diff --git a/clang/lib/Serialization/ASTReaderStmt.cpp b/clang/lib/Serialization/ASTReaderStmt.cpp index 21aed570ba26c..85ecfa1a1a0bf 100644 --- a/clang/lib/Serialization/ASTReaderStmt.cpp +++ b/clang/lib/Serialization/ASTReaderStmt.cpp @@ -1901,6 +1901,7 @@ void ASTStmtReader::VisitCXXNewExpr(CXXNewExpr *E) { E->CXXNewExprBits.IsGlobalNew = Record.readInt(); E->CXXNewExprBits.ShouldPassAlignment = Record.readInt(); E->CXXNewExprBits.UsualArrayDeleteWantsSize = Record.readInt(); + E->CXXNewExprBits.HasInitializer = Record.readInt(); E->CXXNewExprBits.StoredInitializationStyle = Record.readInt(); assert((IsArray == E->isArray()) && "Wrong IsArray!"); diff --git a/clang/lib/Serialization/ASTWriterStmt.cpp b/clang/lib/Serialization/ASTWriterStmt.cpp index 7f888e44dde1e..e5836f5dcbe95 100644 --- a/clang/lib/Serialization/ASTWriterStmt.cpp +++ b/clang/lib/Serialization/ASTWriterStmt.cpp @@ -1901,6 +1901,7 @@ void ASTStmtWriter::VisitCXXNewExpr(CXXNewExpr *E) { Record.push_back(E->isGlobalNew()); Record.push_back(E->passAlignment()); Record.push_back(E->doesUsualArrayDeleteWantSize()); + Record.push_back(E->CXXNewExprBits.HasInitializer); Record.push_back(E->CXXNewExprBits.StoredInitializationStyle); Record.AddDeclRef(E->getOperatorNew()); From 86b6dfc619695b5bfb1880b2ed9abb4a6805fbe6 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Sun, 21 Jan 2024 16:24:04 -0500 Subject: [PATCH 306/843] [libc++] Fix Coverity warning about use-after-move (#78780) While the code is technically correct because the index is never actually moved from (and anyway that wouldn't matter since it's an integer), it's still better style not to access an object after it has been moved-from. 
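For illustration, the pattern Coverity flags has roughly this shape (a
minimal sketch with hypothetical names, not the actual libc++ code):

    template <class Visitor, class Rhs>
    void generic_construct(Ctor &lhs, Visitor vis, Rhs &&rhs) {
      // rhs is handed off to the visitation machinery here...
      visit_alt_at(rhs.index(), vis, std::forward<Rhs>(rhs));
      // ...so this read of rhs happens after it may have been moved from.
      lhs.index_ = rhs.index();
    }
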
Since this is so easy to do, just save the index in a temporary variable. rdar://120501577 --- libcxx/include/variant | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/libcxx/include/variant b/libcxx/include/variant index ac69645a0fab0..6063739e52c86 100644 --- a/libcxx/include/variant +++ b/libcxx/include/variant @@ -808,14 +808,15 @@ protected: _LIBCPP_HIDE_FROM_ABI static void __generic_construct(__ctor& __lhs, _Rhs&& __rhs) { __lhs.__destroy(); if (!__rhs.valueless_by_exception()) { + auto __rhs_index = __rhs.index(); __visitation::__base::__visit_alt_at( - __rhs.index(), + __rhs_index, [](auto& __lhs_alt, auto&& __rhs_alt) { __construct_alt(__lhs_alt, std::forward(__rhs_alt).__value); }, __lhs, std::forward<_Rhs>(__rhs)); - __lhs.__index = __rhs.index(); + __lhs.__index = __rhs_index; } } }; From c71956d760517a99d5d3d13b454873016333fcb3 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sun, 21 Jan 2024 14:49:45 -0800 Subject: [PATCH 307/843] [MLGO] Add tests for scripts (#78878) This patch adds integration tests for the script entry points. The tests don't exercise all functionality, as that case is better covered by the unit testing already checked in. This ensures that things like flag parsing work and that the scripts are syntactically valid. --- llvm/utils/mlgo-utils/CMakeLists.txt | 2 +- .../combine_training_corpus_script.test | 31 +++++++++++++ .../tests/corpus/extract_ir_script.test | 46 +++++++++++++++++++ .../tests/corpus/make_corpus_script.test | 24 ++++++++++ llvm/utils/mlgo-utils/tests/lit.cfg | 10 +++- 5 files changed, 111 insertions(+), 2 deletions(-) create mode 100644 llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test create mode 100644 llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test create mode 100644 llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test diff --git a/llvm/utils/mlgo-utils/CMakeLists.txt b/llvm/utils/mlgo-utils/CMakeLists.txt index 7b303c7639401..3129331d58c75 100644 --- a/llvm/utils/mlgo-utils/CMakeLists.txt +++ b/llvm/utils/mlgo-utils/CMakeLists.txt @@ -5,7 +5,7 @@ configure_lit_site_cfg( add_lit_testsuite(check-mlgo-utils "Running mlgo-utils tests" ${CMAKE_CURRENT_BINARY_DIR} - DEPENDS "FileCheck" "not" "count" + DEPENDS "FileCheck" "not" "count" "split-file" "yaml2obj" "llvm-objcopy" ) set_target_properties(check-mlgo-utils PROPERTIES FOLDER "Tests") diff --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test new file mode 100644 index 0000000000000..933a9c2b9f811 --- /dev/null +++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test @@ -0,0 +1,31 @@ +# REQUIRES: python-38, absl, system-linux + +## Testing that the combine_trainig_corpus script works as expected when +## invoked. + +# RUN: rm -rf %t.dir && mkdir %t.dir +# RUN: split-file %s %t.dir +# RUN: %python %scripts_dir/corpus/combine_training_corpus.py --root_dir=%t.dir +# RUN: cat %t.dir/corpus_description.json | FileCheck %s + +## Check that we end up with the same properties as the original corpora +# CHECK: "has_thinlto": false + +## Check that the modules end up in the combined corpus. Order does not matter. 
+# CHECK-DAG: "subcorpus1/test1.o" +# CHECK-DAG: "subcorpus2/test2.o" + +#--- subcorpus1/corpus_description.json +{ + "has_thinlto": false, + "modules": [ + "test1.o" + ] +} +#--- subcorpus2/corpus_description.json +{ + "has_thinlto": false, + "modules": [ + "test2.o" + ] +} diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test new file mode 100644 index 0000000000000..c20581dacdc65 --- /dev/null +++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test @@ -0,0 +1,46 @@ +# REQUIRES: python-38, absl, system-linux + +## Test that invoking the extract_ir script work as expected. + +# RUN: rm -rf %t.dir && mkdir %t.dir +# RUN: yaml2obj %s -o %t.dir/test1.o +# RUN: yaml2obj %s -o %t.dir/test2.o +# RUN: rm -rf %t.dir.out && mkdir %t.dir.out + +# RUN: %python %scripts_dir/corpus/extract_ir.py --input=%t.dir --input_type=directory --output_dir=%t.dir.out --llvm_objcopy_path=llvm-objcopy +# RUN: cat %t.dir.out/corpus_description.json | FileCheck %s + +## Check that this is not a thinLTO build +# CHECK: "has_thinlto": false +## Check that the expected modules end up in the corpus description +# CHECK-DAG: "test1.o" +# CHECK-DAG: "test2.o" + +# RUN: ls %t.dir.out | FileCheck %s --check-prefix CHECK-DIR + +# CHECK-DIR: test1.o.bc +# CHECK-DIR: test1.o.cmd +# CHECK-DIR: test2.o.bc +# CHECK-DIR: test2.o.cmd + +--- !ELF +FileHeader: + Class: ELFCLASS64 + Data: ELFDATA2LSB + Type: ET_REL + Machine: EM_X86_64 + SectionHeaderStringTable: .strtab +Sections: + - Name: .text + Type: SHT_PROGBITS + Flags: [ SHF_ALLOC, SHF_EXECINSTR ] + AddressAlign: 0x10 + Content: 55 + - Name: .llvmbc + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: 55 + - Name: .llvmcmd + Type: SHT_PROGBITS + AddressAlign: 0x1 + Content: ff diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test new file mode 100644 index 0000000000000..3c1b96523718e --- /dev/null +++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test @@ -0,0 +1,24 @@ +# REQUIRES: python-38, absl, system-linux + +## Testing that the make_corpus script works as expected when invoked. 
+ +# RUN: rm -rf %t.dir && mkdir %t.dir +# RUN: touch %t.dir/test1.bc +# RUN: touch %t.dir/test2.bc +# RUN: rm -rf %t.out.dir && mkdir %t.out.dir + +# RUN: %python %scripts_dir/corpus/make_corpus.py --input_dir=%t.dir --output_dir=%t.out.dir --default_args="-test" + +# RUN: cat %t.out.dir/corpus_description.json | FileCheck %s + +## Check that we get the expected command in the global command override +# CHECK: "-test" +# CHECK: "has_thinlto": false +## Check that the modules are in the corpus description +# CHECK: "test1" +# CHECK: "test2" + +# RUN: ls %t.out.dir | FileCheck %s --check-prefix CHECK-DIR + +# CHECK-DIR: test1.bc +# CHECK-DIR: test2.bc diff --git a/llvm/utils/mlgo-utils/tests/lit.cfg b/llvm/utils/mlgo-utils/tests/lit.cfg index 055f0945942fc..0f6137e5e9138 100644 --- a/llvm/utils/mlgo-utils/tests/lit.cfg +++ b/llvm/utils/mlgo-utils/tests/lit.cfg @@ -1,3 +1,5 @@ +import os + import lit.formats from lit.llvm import llvm_config @@ -5,7 +7,7 @@ from lit.llvm import llvm_config config.name = "mlgo-utils" config.test_format = lit.formats.ShTest(execute_external=False) -config.suffixes = [".py"] +config.suffixes = [".py", ".test"] config.test_source_root = os.path.dirname(__file__) config.test_exec_root = config.obj_root @@ -13,3 +15,9 @@ config.test_exec_root = config.obj_root config.environment["PYTHONPATH"] = os.path.join(config.src_root, "utils", "mlgo-utils") llvm_config.use_default_substitutions() +config.substitutions.append(("split-file", llvm_config.use_llvm_tool("split-file"))) +config.substitutions.append(("yaml2obj", llvm_config.use_llvm_tool("yaml2obj"))) +config.substitutions.append(("llvm-objcopy", llvm_config.use_llvm_tool("llvm-objcopy"))) + +scripts_dir = os.path.join(config.src_root, "utils/mlgo-utils/mlgo") +config.substitutions.append(("%scripts_dir", scripts_dir)) From 120e0623773dc9c43f393d43be0641c7d7ad26f2 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sun, 21 Jan 2024 15:06:58 -0800 Subject: [PATCH 308/843] [MLGO] Remove absl dependency from scripts (#78880) This patch removes the absl dependency from the mlgo-utils scripts. We were only using absl.logging, and absl.flags, so this patch just consists of mechanically converting the absl flags parsing to Python's builtin argparse as Python's logging is a drop in replacement for absl.logging. --- .../mlgo/corpus/combine_training_corpus.py | 27 ++- .../mlgo-utils/mlgo/corpus/extract_ir.py | 208 +++++++++--------- .../mlgo-utils/mlgo/corpus/make_corpus.py | 45 ++-- llvm/utils/mlgo-utils/pyproject.toml | 3 - .../combine_training_corpus_script.test | 2 +- .../tests/corpus/extract_ir_script.test | 2 +- .../tests/corpus/make_corpus_script.test | 2 +- llvm/utils/mlgo-utils/tests/lit.local.cfg | 8 - 8 files changed, 148 insertions(+), 149 deletions(-) diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py index 9aabd87b4688e..3b2077b4c0e0e 100644 --- a/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py +++ b/llvm/utils/mlgo-utils/mlgo/corpus/combine_training_corpus.py @@ -23,26 +23,25 @@ and corpus2 are combined into combinedcorpus. 
""" -from absl import app -from absl import flags +import argparse from mlgo.corpus import combine_training_corpus_lib -flags.DEFINE_string("root_dir", "", "root dir of module paths to combine.") -FLAGS = flags.FLAGS +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for combining multiple training corpora" + ) + parser.add_argument( + "--root_dir", type=str, help="The root dir of module paths to combine." + ) + args = parser.parse_args() + main(args) -def main(argv): - if len(argv) > 1: - raise app.UsageError("Too many command-line arguments.") - - combine_training_corpus_lib.combine_corpus(FLAGS.root_dir) - - -def entrypoint(): - app.run(main) +def main(args): + combine_training_corpus_lib.combine_corpus(args.root_dir) if __name__ == "__main__": - entrypoint() + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py index 9463e61dc534f..94415431ab4a3 100644 --- a/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py +++ b/llvm/utils/mlgo-utils/mlgo/corpus/extract_ir.py @@ -24,129 +24,146 @@ any output. """ +import argparse import json +import logging import multiprocessing -from absl import app -from absl import flags -from absl import logging - from mlgo.corpus import extract_ir_lib -flags.DEFINE_string( - "input", - None, - "Input file or directory - either compile_commands.json, a linker parameter" - "list, or a path to a directory containing object files.", -) -flags.DEFINE_enum( - "input_type", - "json", - ["json", "params", "directory"], - "Input file type - json, params, or directory. params latter refers to lld" - "params.", -) -flags.DEFINE_string("output_dir", None, "Output directory") -flags.DEFINE_integer( - "num_workers", - None, - "Number of parallel workers for objcopy. `None` for maximum available.", -) -flags.DEFINE_string("llvm_objcopy_path", "llvm-objcopy", "Path to llvm-objcopy") -flags.DEFINE_string( - "obj_base_dir", - "", - "Base directory for object files. Defaults to current working dir.", -) -flags.DEFINE_string( - "cmd_filter", - None, - "Include only those modules with a command line matching this regexp. " - "Setting it to None for not filtering. Note that the regexp is applied " - "independently for each separate command line option. For example, ^-Oz$ " - "will match Oz - built binaries. Does not work with thinlto_build=lld.", -) -flags.DEFINE_enum( - "thinlto_build", - None, - ["distributed", "local"], - "Set if the build was performed with either 'distributed' or " - "'local' ThinLTO. This ensures the thinlto.bc files are also copied. " - "The build is assumed to have had " - "-mllvm -lto-embed-bitcode=post-merge-pre-opt passed in the distributed " - "case, or -Wl,--save-temps=import and -Wl,--thinlto-emit-index-files " - "passed in the local case.", -) -flags.DEFINE_string( - "cmd_section_name", - ".llvmcmd", - "The section name passed to llvm-objcopy. For ELF object files, the " - "default .llvmcmd is correct. For Mach-O object files, one should use " - "something like __LLVM,__cmdline", -) -flags.DEFINE_string( - "bitcode_section_name", - ".llvmbc", - "The section name passed to llvm-objcopy. For ELF object files, the " - "default .llvmbc is correct. 
For Mach-O object files, one should use " - "__LLVM,__bitcode", -) - -flags.mark_flag_as_required("output_dir") - -FLAGS = flags.FLAGS - - -def main(argv): - if len(argv) > 1: - raise app.UsageError("Too many command-line arguments.") +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for making a corpus from build artifacts" + ) + parser.add_argument( + "--input", + type=str, + help="Input file or directory - either compile_commands.json, a linker " + "parameter list, or a path to a directory containing object files.", + ) + parser.add_argument( + "--input_type", + type=str, + help="Input file type - JSON, LLD params, or directory.", + choices=["json", "params", "directory"], + default="json", + nargs="?", + ) + parser.add_argument("--output_dir", type=str, help="Output directory") + parser.add_argument( + "--num_workers", + type=int, + help="Number of parallel works for objcopy. `None` for maximum available.", + default=None, + nargs="?", + ) + parser.add_argument( + "--llvm_objcopy_path", + type=str, + help="Path to llvm-objcopy", + default="llvm-objcopy", + nargs="?", + ) + parser.add_argument( + "--obj_base_dir", + type=str, + help="Base directory for object files. Defaults to current working dir.", + default="", + nargs="?", + ) + parser.add_argument( + "--cmd_filter", + type=str, + help="Include only those modules with a command line matching this regular " + "expression. Set it to None to not perform any filtering. Note that the " + "regular expression is applied independently for each separate command line " + "option. For example, ^-Oz$ will match Oz built binaries. This does not work " + "with thinlto_build=lld.", + default=None, + nargs="?", + ) + parser.add_argument( + "--thinlto_build", + type=str, + help="Set if the build was performed with either 'distributed' or 'local' " + "ThinLTO. This ensures the thinlto.bc files are also copied. The build is " + "assumed to have had -mllvm -lto-embed-bitcode=post-merge-pre-opt passed in " + "the distributed case or -Wl,--save-temps=import and " + "-Wl,--thinlto-emit-index-files passed in the local case", + choices=["distributed", "local"], + default=None, + nargs="?", + ) + parser.add_argument( + "--cmd_section_name", + type=str, + help="The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmcmd is correct. For Mach-O object files, one should use " + "something like __LLVM,__cmdline", + default=".llvmcmd", + nargs="?", + ) + parser.add_argument( + "--bitcode_section_name", + type=str, + help="The section name passed to llvm-objcopy. For ELF object files, the " + "default .llvmbc is correct. 
For Mach-O object files, one should use " + "__LLVM,__bitcode", + default=".llvmbc", + nargs="?", + ) + args = parser.parse_args() + main(args) + + +def main(args): objs = [] - if FLAGS.input is not None and FLAGS.thinlto_build == "local": + if args.input is not None and args.thinlto_build == "local": raise ValueError("--thinlto_build=local cannot be run with --input") - if FLAGS.input is None: - if FLAGS.thinlto_build != "local": + if args.input is None: + if args.thinlto_build != "local": raise ValueError("--input or --thinlto_build=local must be provided") - objs = extract_ir_lib.load_for_lld_thinlto(FLAGS.obj_base_dir, FLAGS.output_dir) - elif FLAGS.input_type == "json": - with open(FLAGS.input, encoding="utf-8") as f: + objs = extract_ir_lib.load_for_lld_thinlto(args.obj_base_dir, args.output_dir) + elif args.input_type == "json": + with open(args.input, encoding="utf-8") as f: objs = extract_ir_lib.load_from_compile_commands( - json.load(f), FLAGS.output_dir + json.load(f), args.output_dir ) - elif FLAGS.input_type == "params": - if not FLAGS.obj_base_dir: + elif args.input_type == "params": + if not args.obj_base_dir: logging.info( "-obj_base_dir is unspecified, assuming current directory." "If no objects are found, use this option to specify the root" "directory for the object file paths in the input file." ) - with open(FLAGS.input, encoding="utf-8") as f: + with open(args.input, encoding="utf-8") as f: objs = extract_ir_lib.load_from_lld_params( - [l.strip() for l in f.readlines()], FLAGS.obj_base_dir, FLAGS.output_dir + [l.strip() for l in f.readlines()], args.obj_base_dir, args.output_dir ) - elif FLAGS.input_type == "directory": + elif args.input_type == "directory": logging.warning( "Using the directory input is only recommended if the build system" "your project uses does not support any structured output that" "ml-compiler-opt understands. 
If your build system provides a" "structured compilation database, use that instead" ) - objs = extract_ir_lib.load_from_directory(FLAGS.input, FLAGS.output_dir) + objs = extract_ir_lib.load_from_directory(args.input, args.output_dir) else: - logging.error("Unknown input type: %s", FLAGS.input_type) + logging.error("Unknown input type: %s", args.input_type) relative_output_paths = extract_ir_lib.run_extraction( objs, - FLAGS.num_workers, - FLAGS.llvm_objcopy_path, - FLAGS.cmd_filter, - FLAGS.thinlto_build, - FLAGS.cmd_section_name, - FLAGS.bitcode_section_name, + args.num_workers, + args.llvm_objcopy_path, + args.cmd_filter, + args.thinlto_build, + args.cmd_section_name, + args.bitcode_section_name, ) extract_ir_lib.write_corpus_manifest( - FLAGS.thinlto_build, relative_output_paths, FLAGS.output_dir + args.thinlto_build, relative_output_paths, args.output_dir ) logging.info( @@ -156,10 +173,5 @@ def main(argv): ) -def entrypoint(): - multiprocessing.set_start_method("fork") - app.run(main) - - if __name__ == "__main__": - entrypoint() + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py index edb0ecd853de2..221486e16c6e0 100644 --- a/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py +++ b/llvm/utils/mlgo-utils/mlgo/corpus/make_corpus.py @@ -12,43 +12,42 @@ --default_args="" """ -from absl import app -from absl import flags -from absl import logging +import argparse +import logging from mlgo.corpus import make_corpus_lib -flags.DEFINE_string("input_dir", None, "The input directory.") -flags.DEFINE_string("output_dir", None, "The output directory.") -flags.DEFINE_string( - "default_args", - "", - "The compiler flags to compile with when using downstream tooling.", -) -flags.mark_flag_as_required("input_dir") -flags.mark_flag_as_required("output_dir") - -FLAGS = flags.FLAGS +def parse_args_and_run(): + parser = argparse.ArgumentParser( + description="A tool for making a corpus from arbitrary bitcode" + ) + parser.add_argument("--input_dir", type=str, help="The input directory.") + parser.add_argument("--output_dir", type=str, help="The output directory.") + parser.add_argument( + "--default_args", + type=str, + help="The compiler flags to compile with when using downstream tooling.", + default="", + nargs="?", + ) + args = parser.parse_args() + main(args) -def main(_): +def main(args): logging.warning( "Using this tool does not guarantee that the bitcode is taken at " "the correct stage for consumption during model training. Make " "sure to validate assumptions about where the bitcode is coming " "from before using it in production." 
) - relative_paths = make_corpus_lib.load_bitcode_from_directory(FLAGS.input_dir) - make_corpus_lib.copy_bitcode(relative_paths, FLAGS.input_dir, FLAGS.output_dir) + relative_paths = make_corpus_lib.load_bitcode_from_directory(args.input_dir) + make_corpus_lib.copy_bitcode(relative_paths, args.input_dir, args.output_dir) make_corpus_lib.write_corpus_manifest( - relative_paths, FLAGS.output_dir, FLAGS.default_args.split() + relative_paths, args.output_dir, args.default_args.split() ) -def entrypoint(): - app.run(main) - - if __name__ == "__main__": - entrypoint() + parse_args_and_run() diff --git a/llvm/utils/mlgo-utils/pyproject.toml b/llvm/utils/mlgo-utils/pyproject.toml index be2af86cd05df..dac18a785c17b 100644 --- a/llvm/utils/mlgo-utils/pyproject.toml +++ b/llvm/utils/mlgo-utils/pyproject.toml @@ -7,9 +7,6 @@ name = "mlgo" description = "Tooling for ML in LLVM" readme = "README.md" requires-python = ">=3.8,<3.11" -dependencies = [ - "absl-py>=1.0.0" -] dynamic = ["version"] license = {text = "Apache-2.0 WITH LLVM-exception"} classifiers = [ diff --git a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test index 933a9c2b9f811..51dc637347caf 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test +++ b/llvm/utils/mlgo-utils/tests/corpus/combine_training_corpus_script.test @@ -1,4 +1,4 @@ -# REQUIRES: python-38, absl, system-linux +# REQUIRES: python-38, system-linux ## Testing that the combine_trainig_corpus script works as expected when ## invoked. diff --git a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test index c20581dacdc65..107116618ce97 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test +++ b/llvm/utils/mlgo-utils/tests/corpus/extract_ir_script.test @@ -1,4 +1,4 @@ -# REQUIRES: python-38, absl, system-linux +# REQUIRES: python-38, system-linux ## Test that invoking the extract_ir script work as expected. diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test index 3c1b96523718e..a08780055f31f 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test +++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test @@ -1,4 +1,4 @@ -# REQUIRES: python-38, absl, system-linux +# REQUIRES: python-38, system-linux ## Testing that the make_corpus script works as expected when invoked. diff --git a/llvm/utils/mlgo-utils/tests/lit.local.cfg b/llvm/utils/mlgo-utils/tests/lit.local.cfg index 90cdf8ba618ed..a9088750cb58b 100644 --- a/llvm/utils/mlgo-utils/tests/lit.local.cfg +++ b/llvm/utils/mlgo-utils/tests/lit.local.cfg @@ -4,11 +4,3 @@ import sys # the entire project has been bumped to 3.8. if sys.version_info > (3,8): config.available_features.add("python-38") - -# TODO(boomanaiden154): Remove this flag once the scripts are converted to -# not use absl anymore. -try: - import absl - config.available_features.add("absl") -except: - pass From b54e919573a8ee03b4b813ad9705b36cba62232d Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Sun, 21 Jan 2024 23:20:56 +0000 Subject: [PATCH 309/843] [MLGO] Fix make_corpus_script.test This test is currently failing as the order that the files end up in the corpus description is somewhat dependent upon platform and the check is checking for a specific order. 
This patch switches to using the CHECK-DAG directive to make the checks order invariant to fix the broken bots. --- llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test index a08780055f31f..9b4d57330f010 100644 --- a/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test +++ b/llvm/utils/mlgo-utils/tests/corpus/make_corpus_script.test @@ -14,9 +14,9 @@ ## Check that we get the expected command in the global command override # CHECK: "-test" # CHECK: "has_thinlto": false -## Check that the modules are in the corpus description -# CHECK: "test1" -# CHECK: "test2" +## Check that the modules are in the corpus description (order invariant) +# CHECK-DAG: "test1" +# CHECK-DAG: "test2" # RUN: ls %t.out.dir | FileCheck %s --check-prefix CHECK-DIR From b7355ee99ec63f44a6dc4f7dad9bb5a130bcc0eb Mon Sep 17 00:00:00 2001 From: Dave Lee Date: Sun, 21 Jan 2024 15:34:02 -0800 Subject: [PATCH 310/843] [lldb] Skip ObjC timezone tests on macOS >= 14 (NFC) (#78817) Starting with macOS 14, the `NSTimeZone` and `CFTimeZone` types are backed by swift implementations. These tests won't pass on mainline lldb, since it doesn't have Swift support. --- .../TestDataFormatterObjCNSDate.py | 26 ++++++++++++------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSDate.py b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSDate.py index a1ffe84ad556f..c56b887a3f508 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSDate.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-objc/TestDataFormatterObjCNSDate.py @@ -19,6 +19,11 @@ def test_nsdate_with_run_command(self): """Test formatters for NSDate.""" self.appkit_tester_impl(self.nsdate_data_formatter_commands, False) + @skipIf(macos_version=[">=", "14.0"]) + def test_timezone_with_run_command(self): + """Test formatters for NSTimeZone and CFTimeZone.""" + self.appkit_tester_impl(self.timezone_data_formatter_commands, False) + def nsdate_data_formatter_commands(self): self.expect( "frame variable date1 date2", @@ -51,16 +56,6 @@ def nsdate_data_formatter_commands(self): self.expect_expr("date_1970_plus_05", result_summary="1970-01-01 00:00:00 UTC") self.expect_expr("date_1970_plus_04", result_summary="1970-01-01 00:00:00 UTC") - self.expect( - "frame variable cupertino home europe", - substrs=['"America/Los_Angeles"', '"Europe/Rome"', '"Europe/Paris"'], - ) - - self.expect( - "frame variable cupertino_ns home_ns europe_ns", - substrs=['"America/Los_Angeles"', '"Europe/Rome"', '"Europe/Paris"'], - ) - self.expect( "frame variable mut_bv", substrs=[ @@ -71,3 +66,14 @@ def nsdate_data_formatter_commands(self): self.expect_expr("distant_past", result_summary="0001-01-01 00:00:00 UTC") self.expect_expr("distant_future", result_summary="4001-01-01 00:00:00 UTC") + + def timezone_data_formatter_commands(self): + self.expect( + "frame variable cupertino home europe", + substrs=['"America/Los_Angeles"', '"Europe/Rome"', '"Europe/Paris"'], + ) + + self.expect( + "frame variable cupertino_ns home_ns europe_ns", + substrs=['"America/Los_Angeles"', '"Europe/Rome"', '"Europe/Paris"'], + ) From 3a4615c0fe985fe7b971bf5c067ff0f7ce5e3d10 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 
21 Jan 2024 16:42:44 -0800 Subject: [PATCH 311/843] [ELF] Clarify the first entry of .got.plt NFC Differential Revision: https://reviews.llvm.org/D47053 --- lld/ELF/Arch/X86_64.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lld/ELF/Arch/X86_64.cpp b/lld/ELF/Arch/X86_64.cpp index c28e01e481958..de459013595fe 100644 --- a/lld/ELF/Arch/X86_64.cpp +++ b/lld/ELF/Arch/X86_64.cpp @@ -403,10 +403,10 @@ RelExpr X86_64::getRelExpr(RelType type, const Symbol &s, } void X86_64::writeGotPltHeader(uint8_t *buf) const { - // The first entry holds the value of _DYNAMIC. It is not clear why that is - // required, but it is documented in the psabi and the glibc dynamic linker - // seems to use it (note that this is relevant for linking ld.so, not any - // other program). + // The first entry holds the link-time address of _DYNAMIC. It is documented + // in the psABI and glibc before Aug 2021 used the entry to compute run-time + // load address of the shared object (note that this is relevant for linking + // ld.so, not any other program). write64le(buf, mainPart->dynamic->getVA()); } From 1d9a65b220a92b59b0556ba8b3195b5346e71170 Mon Sep 17 00:00:00 2001 From: Ben Shi <2283975856@qq.com> Date: Mon, 22 Jan 2024 09:31:32 +0800 Subject: [PATCH 312/843] [clang][analyzer][NFC] Simplify ranges in StdLibraryFunctionsChecker (#78886) --- .../Checkers/StdLibraryFunctionsChecker.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp index d0eb5091444f6..fcd907a9bb0da 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StdLibraryFunctionsChecker.cpp @@ -2146,6 +2146,8 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( ConstraintSet{ReturnValueCondition(WithinRange, SingleValue(0))}; const auto ReturnsMinusOne = ConstraintSet{ReturnValueCondition(WithinRange, SingleValue(-1))}; + const auto ReturnsEOF = + ConstraintSet{ReturnValueCondition(WithinRange, SingleValue(EOFv))}; const auto ReturnsNonnegative = ConstraintSet{ReturnValueCondition(WithinRange, Range(0, IntMax))}; const auto ReturnsNonZero = @@ -2207,8 +2209,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( "fclose", Signature(ArgTypes{FilePtrTy}, RetType{IntTy}), Summary(NoEvalCall) .Case(ReturnsZero, ErrnoMustNotBeChecked, GenericSuccessMsg) - .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))}, - ErrnoNEZeroIrrelevant, GenericFailureMsg) + .Case(ReturnsEOF, ErrnoNEZeroIrrelevant, GenericFailureMsg) .ArgConstraint(NotNull(ArgNo(0)))); // int ungetc(int c, FILE *stream); @@ -2219,7 +2220,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( ArgumentCondition(0, WithinRange, {{0, UCharRangeMax}})}, ErrnoMustNotBeChecked, GenericSuccessMsg) .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv)), - ArgumentCondition(0, WithinRange, {{EOFv, EOFv}})}, + ArgumentCondition(0, WithinRange, SingleValue(EOFv))}, ErrnoNEZeroIrrelevant, "Assuming that 'ungetc' fails because EOF was passed as " "character") @@ -2287,8 +2288,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( "fflush", Signature(ArgTypes{FilePtrTy}, RetType{IntTy}), Summary(NoEvalCall) .Case(ReturnsZero, ErrnoMustNotBeChecked, GenericSuccessMsg) - .Case({ReturnValueCondition(WithinRange, SingleValue(EOFv))}, - ErrnoNEZeroIrrelevant, GenericFailureMsg)); + .Case(ReturnsEOF, ErrnoNEZeroIrrelevant, 
GenericFailureMsg)); // long ftell(FILE *stream); // From 'The Open Group Base Specifications Issue 7, 2018 edition': @@ -3002,8 +3002,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( "execv", Signature(ArgTypes{ConstCharPtrTy, CharPtrConstPtr}, RetType{IntTy}), Summary(NoEvalCall) - .Case({ReturnValueCondition(WithinRange, SingleValue(-1))}, - ErrnoIrrelevant) + .Case(ReturnsMinusOne, ErrnoIrrelevant) .ArgConstraint(NotNull(ArgNo(0)))); // int execvp(const char *file, char *const argv[]); @@ -3011,8 +3010,7 @@ void StdLibraryFunctionsChecker::initFunctionSummaries( "execvp", Signature(ArgTypes{ConstCharPtrTy, CharPtrConstPtr}, RetType{IntTy}), Summary(NoEvalCall) - .Case({ReturnValueCondition(WithinRange, SingleValue(-1))}, - ErrnoIrrelevant) + .Case(ReturnsMinusOne, ErrnoIrrelevant) .ArgConstraint(NotNull(ArgNo(0)))); // int getopt(int argc, char * const argv[], const char *optstring); From 890acf8d382d70c07a589d5ab9b83e64841b9e96 Mon Sep 17 00:00:00 2001 From: hev Date: Mon, 22 Jan 2024 09:35:21 +0800 Subject: [PATCH 313/843] [docs] Add llvm and clang release notes for the global-var code model attribute (#78664) --- clang/docs/ReleaseNotes.rst | 9 +++++++++ llvm/docs/ReleaseNotes.rst | 4 ++++ 2 files changed, 13 insertions(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index dc31594e4b040..2c7c7b8a21b8e 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1166,6 +1166,15 @@ Windows Support LoongArch Support ^^^^^^^^^^^^^^^^^ +- The ``model`` attribute is now supported for overriding the default code + model used to access global variables. The following values are supported: + ``normal``, ``medium`` and ``extreme``. + + *Example Code*: + + .. code-block:: c + + int var __attribute((model("extreme"))); RISC-V Support ^^^^^^^^^^^^^^ diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 82cf130ffd186..c17c834c8081b 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -70,6 +70,8 @@ Changes to the LLVM IR * Added `llvm.exp10` intrinsic. +* Added a ``code_model`` attribute for the `global variable `_. + Changes to LLVM infrastructure ------------------------------ @@ -130,6 +132,8 @@ Changes to the Hexagon Backend Changes to the LoongArch Backend -------------------------------- +* The code model of global variables can now be overridden by means of + the newly added LLVM IR attribute, ``code_model``. Changes to the MIPS Backend --------------------------- From 2e30e31e1e80184d9b2c8aa98f617b4d1cb56d55 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 21 Jan 2024 17:43:05 -0800 Subject: [PATCH 314/843] [ELF] Claim recognized -z options. NFC ... 
so that we can reimplement `checkZOptions` using https://reviews.llvm.org/D48433 --- lld/Common/Args.cpp | 11 ++--- lld/ELF/Driver.cpp | 116 +++++++++++++++++++++++++++----------------- 2 files changed, 76 insertions(+), 51 deletions(-) diff --git a/lld/Common/Args.cpp b/lld/Common/Args.cpp index 67d7babc96fff..5546b2aece641 100644 --- a/lld/Common/Args.cpp +++ b/lld/Common/Args.cpp @@ -60,17 +60,16 @@ SmallVector lld::args::getStrings(opt::InputArgList &args, } uint64_t lld::args::getZOptionValue(opt::InputArgList &args, int id, - StringRef key, uint64_t Default) { - for (auto *arg : args.filtered_reverse(id)) { + StringRef key, uint64_t defaultValue) { + for (auto *arg : args.filtered(id)) { std::pair kv = StringRef(arg->getValue()).split('='); if (kv.first == key) { - uint64_t result = Default; - if (!to_integer(kv.second, result)) + if (!to_integer(kv.second, defaultValue)) error("invalid " + key + ": " + kv.second); - return result; + arg->claim(); } } - return Default; + return defaultValue; } std::vector lld::args::getLines(MemoryBufferRef mb) { diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 07f4263c90e62..157f6ee5ceafc 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -473,65 +473,84 @@ static const char *getReproduceOption(opt::InputArgList &args) { } static bool hasZOption(opt::InputArgList &args, StringRef key) { + bool ret = false; for (auto *arg : args.filtered(OPT_z)) - if (key == arg->getValue()) - return true; - return false; + if (key == arg->getValue()) { + ret = true; + arg->claim(); + } + return ret; } static bool getZFlag(opt::InputArgList &args, StringRef k1, StringRef k2, - bool Default) { - for (auto *arg : args.filtered_reverse(OPT_z)) { - if (k1 == arg->getValue()) - return true; - if (k2 == arg->getValue()) - return false; + bool defaultValue) { + for (auto *arg : args.filtered(OPT_z)) { + StringRef v = arg->getValue(); + if (k1 == v) + defaultValue = true; + else if (k2 == v) + defaultValue = false; + else + continue; + arg->claim(); } - return Default; + return defaultValue; } static SeparateSegmentKind getZSeparate(opt::InputArgList &args) { - for (auto *arg : args.filtered_reverse(OPT_z)) { + auto ret = SeparateSegmentKind::None; + for (auto *arg : args.filtered(OPT_z)) { StringRef v = arg->getValue(); if (v == "noseparate-code") - return SeparateSegmentKind::None; - if (v == "separate-code") - return SeparateSegmentKind::Code; - if (v == "separate-loadable-segments") - return SeparateSegmentKind::Loadable; + ret = SeparateSegmentKind::None; + else if (v == "separate-code") + ret = SeparateSegmentKind::Code; + else if (v == "separate-loadable-segments") + ret = SeparateSegmentKind::Loadable; + else + continue; + arg->claim(); } - return SeparateSegmentKind::None; + return ret; } static GnuStackKind getZGnuStack(opt::InputArgList &args) { - for (auto *arg : args.filtered_reverse(OPT_z)) { - if (StringRef("execstack") == arg->getValue()) - return GnuStackKind::Exec; - if (StringRef("noexecstack") == arg->getValue()) - return GnuStackKind::NoExec; - if (StringRef("nognustack") == arg->getValue()) - return GnuStackKind::None; + auto ret = GnuStackKind::NoExec; + for (auto *arg : args.filtered(OPT_z)) { + StringRef v = arg->getValue(); + if (v == "execstack") + ret = GnuStackKind::Exec; + else if (v == "noexecstack") + ret = GnuStackKind::NoExec; + else if (v == "nognustack") + ret = GnuStackKind::None; + else + continue; + arg->claim(); } - - return GnuStackKind::NoExec; + return ret; } static uint8_t 
getZStartStopVisibility(opt::InputArgList &args) { - for (auto *arg : args.filtered_reverse(OPT_z)) { + uint8_t ret = STV_PROTECTED; + for (auto *arg : args.filtered(OPT_z)) { std::pair kv = StringRef(arg->getValue()).split('='); if (kv.first == "start-stop-visibility") { + arg->claim(); if (kv.second == "default") - return STV_DEFAULT; + ret = STV_DEFAULT; else if (kv.second == "internal") - return STV_INTERNAL; + ret = STV_INTERNAL; else if (kv.second == "hidden") - return STV_HIDDEN; + ret = STV_HIDDEN; else if (kv.second == "protected") - return STV_PROTECTED; - error("unknown -z start-stop-visibility= value: " + StringRef(kv.second)); + ret = STV_PROTECTED; + else + error("unknown -z start-stop-visibility= value: " + + StringRef(kv.second)); } } - return STV_PROTECTED; + return ret; } constexpr const char *knownZFlags[] = { @@ -734,6 +753,9 @@ static void setUnresolvedSymbolPolicy(opt::InputArgList &args) { diagRegular = true; else if (StringRef(arg->getValue()) == "undefs") diagRegular = false; + else + break; + arg->claim(); break; case OPT_allow_shlib_undefined: diagShlib = false; @@ -1162,20 +1184,22 @@ static SmallVector getSymbolOrderingFile(MemoryBufferRef mb) { } static bool getIsRela(opt::InputArgList &args) { + // The psABI specifies the default relocation entry format. + bool rela = is_contained({EM_AARCH64, EM_AMDGPU, EM_HEXAGON, EM_LOONGARCH, + EM_PPC, EM_PPC64, EM_RISCV, EM_X86_64}, + config->emachine); // If -z rel or -z rela is specified, use the last option. - for (auto *arg : args.filtered_reverse(OPT_z)) { + for (auto *arg : args.filtered(OPT_z)) { StringRef s(arg->getValue()); if (s == "rel") - return false; - if (s == "rela") - return true; + rela = false; + else if (s == "rela") + rela = true; + else + continue; + arg->claim(); } - - // Otherwise use the psABI defined relocation entry format. 
- uint16_t m = config->emachine; - return m == EM_AARCH64 || m == EM_AMDGPU || m == EM_HEXAGON || - m == EM_LOONGARCH || m == EM_PPC || m == EM_PPC64 || m == EM_RISCV || - m == EM_X86_64; + return rela; } static void parseClangOption(StringRef opt, const Twine &msg) { @@ -1220,9 +1244,9 @@ static void readConfigs(opt::InputArgList &args) { args.hasArg(OPT_visual_studio_diagnostics_format, false); config->allowMultipleDefinition = + hasZOption(args, "muldefs") || args.hasFlag(OPT_allow_multiple_definition, - OPT_no_allow_multiple_definition, false) || - hasZOption(args, "muldefs"); + OPT_no_allow_multiple_definition, false); config->androidMemtagHeap = args.hasFlag(OPT_android_memtag_heap, OPT_no_android_memtag_heap, false); config->androidMemtagStack = args.hasFlag(OPT_android_memtag_stack, @@ -1525,6 +1549,7 @@ static void readConfigs(opt::InputArgList &args) { for (auto reportArg : reports) { if (option.first != reportArg.first) continue; + arg->claim(); if (!isValidReportString(option.second)) { error(Twine("-z ") + reportArg.first + "= parameter " + option.second + " is not recognized"); @@ -1539,6 +1564,7 @@ static void readConfigs(opt::InputArgList &args) { StringRef(arg->getValue()).split('='); if (option.first != "dead-reloc-in-nonalloc") continue; + arg->claim(); constexpr StringRef errPrefix = "-z dead-reloc-in-nonalloc=: "; std::pair kv = option.second.split('='); if (kv.first.empty() || kv.second.empty()) { From 665f913e4509e3e4f531aa4a4ebe92ec2ea5c23f Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Sun, 21 Jan 2024 18:15:57 -0800 Subject: [PATCH 315/843] [ELF] Reimplement unknown -z options using the isClaimed bit Maintaining the long list of known -z options (https://reviews.llvm.org/D48621) turns out to be cumbersome. Go the D48433 route instead. max-page-size/common-page-size are claimed when `target` is available. Inspired by: https://reviews.llvm.org/D48433 --- lld/ELF/Driver.cpp | 63 ++++---------------------------------- lld/test/ELF/common-page.s | 2 +- lld/test/ELF/driver.test | 2 +- 3 files changed, 8 insertions(+), 59 deletions(-) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 157f6ee5ceafc..b988f4311e61b 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -553,65 +553,14 @@ static uint8_t getZStartStopVisibility(opt::InputArgList &args) { return ret; } -constexpr const char *knownZFlags[] = { - "combreloc", - "copyreloc", - "defs", - "execstack", - "force-bti", - "force-ibt", - "global", - "hazardplt", - "ifunc-noplt", - "initfirst", - "interpose", - "keep-text-section-prefix", - "lazy", - "muldefs", - "nocombreloc", - "nocopyreloc", - "nodefaultlib", - "nodelete", - "nodlopen", - "noexecstack", - "nognustack", - "nokeep-text-section-prefix", - "nopack-relative-relocs", - "norelro", - "noseparate-code", - "nostart-stop-gc", - "notext", - "now", - "origin", - "pac-plt", - "pack-relative-relocs", - "rel", - "rela", - "relro", - "retpolineplt", - "rodynamic", - "separate-code", - "separate-loadable-segments", - "shstk", - "start-stop-gc", - "text", - "undefs", - "wxneeded", -}; - -static bool isKnownZFlag(StringRef s) { - return llvm::is_contained(knownZFlags, s) || - s.starts_with("common-page-size=") || s.starts_with("bti-report=") || - s.starts_with("cet-report=") || - s.starts_with("dead-reloc-in-nonalloc=") || - s.starts_with("max-page-size=") || s.starts_with("stack-size=") || - s.starts_with("start-stop-visibility="); -} - // Report a warning for an unknown -z option. 
static void checkZOptions(opt::InputArgList &args) { + // This function is called before getTarget(), when certain options are not + // initialized yet. Claim them here. + args::getZOptionValue(args, OPT_z, "max-page-size", 0); + args::getZOptionValue(args, OPT_z, "common-page-size", 0); for (auto *arg : args.filtered(OPT_z)) - if (!isKnownZFlag(arg->getValue())) + if (!arg->isClaimed()) warn("unknown -z value: " + StringRef(arg->getValue())); } @@ -629,7 +578,6 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { args.hasFlag(OPT_fatal_warnings, OPT_no_fatal_warnings, false) && !args.hasArg(OPT_no_warnings); errorHandler().suppressWarnings = args.hasArg(OPT_no_warnings); - checkZOptions(args); // Handle -help if (args.hasArg(OPT_help)) { @@ -672,6 +620,7 @@ void LinkerDriver::linkerMain(ArrayRef argsArr) { } readConfigs(args); + checkZOptions(args); // The behavior of -v or --version is a bit strange, but this is // needed for compatibility with GNU linkers. diff --git a/lld/test/ELF/common-page.s b/lld/test/ELF/common-page.s index a11b6993d3607..fd6564cb00361 100644 --- a/lld/test/ELF/common-page.s +++ b/lld/test/ELF/common-page.s @@ -11,7 +11,7 @@ _start: # of 4k. If the last loadable segment is executable then lld aligns the next # section using the common page size. -# RUN: ld.lld -z max-page-size=0x10000 -z common-page-size=0x1000 %t -o %t2 +# RUN: ld.lld -z max-page-size=0x10000 -z common-page-size=0x1000 %t -o %t2 2>&1 | count 0 # RUN: llvm-readobj --sections -l %t2 | FileCheck --check-prefix=CHECK-MAX %s # CHECK-MAX: Sections [ diff --git a/lld/test/ELF/driver.test b/lld/test/ELF/driver.test index cb43998bf9120..260be8e8f883b 100644 --- a/lld/test/ELF/driver.test +++ b/lld/test/ELF/driver.test @@ -65,7 +65,7 @@ # ERR10-FATAL: error: unknown -z value: foo # RUN: not ld.lld %t -z max-page-size 2>&1 | FileCheck -check-prefix=ERR11 %s -# ERR11: unknown -z value: max-page-size +# ERR11: error: invalid max-page-size: ## Attempt to use -r and --export-dynamic together # RUN: not ld.lld -r -export-dynamic %t -o /dev/null 2>&1 | FileCheck -check-prefix=ERR12 %s From be0fa319f922e6ea3661640aa55f6b9092be5a99 Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Sun, 21 Jan 2024 21:37:17 -0500 Subject: [PATCH 316/843] [libc] fix unit tests in fullbuild (#78864) fixes https://github.com/llvm/llvm-project/issues/78743 - For normal objects, the patch removes `RTTI` and exceptions in `fullbuild` - For FP tests, the patch adds links to `stdc++` and `gcc_s` if `MPFR` is used. 
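
For illustration, a minimal sketch of the failure mode behind the second
point (a hypothetical translation unit; exactly which STL headers gmp.h
pulls in varies by version, so the includes below are an assumption):

    // hypothetical_fp_test.cpp: only the C API of MPFR is used, but when
    // compiled as C++, gmp.h can drag in STL headers, so the final link
    // may still need the C++ runtime (-lstdc++) and the unwinder (-lgcc_s).
    #include <mpfr.h>

    int main() {
      mpfr_t x;
      mpfr_init2(x, 64);              // 64 bits of precision
      mpfr_set_d(x, 1.0, MPFR_RNDN);  // round-to-nearest
      mpfr_clear(x);
      return 0;
    }

Linking such an object with only -lmpfr -lgmp, without the C++ runtime
libraries on the link line, can fail with undefined std:: symbols.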
---
 libc/cmake/modules/LLVMLibCTestRules.cmake |  2 +-
 libc/utils/MPFRWrapper/CMakeLists.txt      | 12 ++++++++----
 2 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake
index 24f543f6e4c13..9b7e918027541 100644
--- a/libc/cmake/modules/LLVMLibCTestRules.cmake
+++ b/libc/cmake/modules/LLVMLibCTestRules.cmake
@@ -187,7 +187,7 @@ function(create_libc_unittest fq_target_name)
   if(LLVM_LIBC_FULL_BUILD)
     target_compile_options(
       ${fq_build_target_name}
-      PRIVATE -ffreestanding
+      PRIVATE -ffreestanding -fno-exceptions -fno-rtti -fno-unwind-tables -fno-asynchronous-unwind-tables
     )
   endif()
   if(LIBC_UNITTEST_COMPILE_OPTIONS)
diff --git a/libc/utils/MPFRWrapper/CMakeLists.txt b/libc/utils/MPFRWrapper/CMakeLists.txt
index d9fa0e31df0e5..adc073c9a91f5 100644
--- a/libc/utils/MPFRWrapper/CMakeLists.txt
+++ b/libc/utils/MPFRWrapper/CMakeLists.txt
@@ -4,9 +4,13 @@ if(LIBC_TESTS_CAN_USE_MPFR)
     MPFRUtils.h
     mpfr_inc.h
   )
-  add_compile_options(
-    -O3
-  )
+  target_compile_options(libcMPFRWrapper PRIVATE -O3)
+  if (LLVM_LIBC_FULL_BUILD)
+    # It is not easy to make libcMPFRWrapper a standalone library because gmp.h may unconditionally
+    # pull in some STL headers. As a result, targets using this library will need to link against
+    # C++ and unwind libraries. Since we are using MPFR anyway, we directly specify the GNU toolchain.
+    target_link_libraries(libcMPFRWrapper PUBLIC -lstdc++ -lgcc_s)
+  endif()
   add_dependencies(
     libcMPFRWrapper
     libc.src.__support.CPP.string_view
@@ -19,7 +23,7 @@ if(LIBC_TESTS_CAN_USE_MPFR)
     target_include_directories(libcMPFRWrapper PUBLIC ${LLVM_LIBC_MPFR_INSTALL_PATH}/include)
     target_link_directories(libcMPFRWrapper PUBLIC ${LLVM_LIBC_MPFR_INSTALL_PATH}/lib)
   endif()
-  target_link_libraries(libcMPFRWrapper LibcFPTestHelpers.unit LibcTest.unit mpfr gmp)
+  target_link_libraries(libcMPFRWrapper PUBLIC LibcFPTestHelpers.unit LibcTest.unit mpfr gmp)
 elseif(NOT LIBC_TARGET_ARCHITECTURE_IS_GPU)
   message(WARNING "Math tests using MPFR will be skipped.")
 endif()

From 04c85587596ab10d885a957a00c8fa22740f15c1 Mon Sep 17 00:00:00 2001
From: Petr Hosek
Date: Mon, 22 Jan 2024 03:26:18 +0000
Subject: [PATCH 317/843] [libc] Fix issue introduced by #76449

Use correct CMake variable.
---
 libc/lib/CMakeLists.txt | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/libc/lib/CMakeLists.txt b/libc/lib/CMakeLists.txt
index 66aaa34632b39..af7ef2de93dd4 100644
--- a/libc/lib/CMakeLists.txt
+++ b/libc/lib/CMakeLists.txt
@@ -71,7 +71,7 @@ add_custom_target(install-libc
   ${header_install_target}
   COMMAND "${CMAKE_COMMAND}"
           -DCMAKE_INSTALL_COMPONENT=libc
-          -P "${LIBCXX_BINARY_DIR}/cmake_install.cmake")
+          -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")
 
 add_custom_target(install-libc-stripped
   DEPENDS ${added_archive_targets}
           ${startup_target}
@@ -79,4 +79,4 @@ add_custom_target(install-libc-stripped
   COMMAND "${CMAKE_COMMAND}"
           -DCMAKE_INSTALL_COMPONENT=libc
           -DCMAKE_INSTALL_DO_STRIP=1
-          -P "${LIBCXX_BINARY_DIR}/cmake_install.cmake")
+          -P "${CMAKE_BINARY_DIR}/cmake_install.cmake")

From 71dbefa446e60dcce86fc3f8531e50b1d12ce31f Mon Sep 17 00:00:00 2001
From: Yuxuan Chen
Date: Sun, 21 Jan 2024 19:31:40 -0800
Subject: [PATCH 318/843] [Clang] Drop workaround for old gcc versions (#78803)

This workaround existed due to https://gcc.gnu.org/PR56135. The website
says that the bug was fixed in GCC 4.8.0 and the latest host toolchain
requirement says GCC 7.4. I think it would be very safe to drop this
workaround.
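
A minimal sketch of the pattern being removed (hypothetical names; the
workaround copied `this` into a local so the lambda did not call a
member through the implicitly captured `this`, which old GCC presumably
mishandled per the PR):

    template <typename Derived> struct Transform {
      int transformOne() { return 1; }

      int runOld() {
        Transform *This = this;  // old workaround for gcc.gnu.org/PR56135
        return [&] { return This->transformOne(); }();
      }

      int runNew() {
        // Fine on all supported host compilers: call the member through
        // the implicitly captured 'this'.
        return [&] { return transformOne(); }();
      }
    };

    int main() { return Transform<int>{}.runNew() - Transform<int>{}.runOld(); }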
--- clang/lib/Sema/TreeTransform.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/clang/lib/Sema/TreeTransform.h b/clang/lib/Sema/TreeTransform.h index 2deac7877fcda..e55e752b9cc35 100644 --- a/clang/lib/Sema/TreeTransform.h +++ b/clang/lib/Sema/TreeTransform.h @@ -6089,12 +6089,11 @@ QualType TreeTransform::TransformFunctionProtoType(TypeLocBuilder &TLB, FunctionProtoTypeLoc TL) { SmallVector ExceptionStorage; - TreeTransform *This = this; // Work around gcc.gnu.org/PR56135. return getDerived().TransformFunctionProtoType( TLB, TL, nullptr, Qualifiers(), [&](FunctionProtoType::ExceptionSpecInfo &ESI, bool &Changed) { - return This->getDerived().TransformExceptionSpec( - TL.getBeginLoc(), ESI, ExceptionStorage, Changed); + return getDerived().TransformExceptionSpec(TL.getBeginLoc(), ESI, + ExceptionStorage, Changed); }); } @@ -13645,12 +13644,11 @@ TreeTransform::TransformLambdaExpr(LambdaExpr *E) { auto TransformFunctionProtoTypeLoc = [this](TypeLocBuilder &TLB, FunctionProtoTypeLoc FPTL) -> QualType { SmallVector ExceptionStorage; - TreeTransform *This = this; // Work around gcc.gnu.org/PR56135. return this->TransformFunctionProtoType( TLB, FPTL, nullptr, Qualifiers(), [&](FunctionProtoType::ExceptionSpecInfo &ESI, bool &Changed) { - return This->TransformExceptionSpec(FPTL.getBeginLoc(), ESI, - ExceptionStorage, Changed); + return TransformExceptionSpec(FPTL.getBeginLoc(), ESI, + ExceptionStorage, Changed); }); }; From d3cd1ce6ab13ae6be7842e2d905c5f3c783d3f04 Mon Sep 17 00:00:00 2001 From: XinWang10 <108658776+XinWang10@users.noreply.github.com> Date: Sun, 21 Jan 2024 19:32:03 -0800 Subject: [PATCH 319/843] [X86] Add lowering tests for promoted CMPCCXADD and update CC representation (#78685) https://github.com/llvm/llvm-project/pull/76125 supported the enc/dec for CMPCCXADD instructions, this patch 1. Add lowering test for promoted CMPCCXADD 2. 
Update the representation of condition code for promoted CMPCCXADD to align with the existing one --- .../X86/MCTargetDesc/X86InstPrinterCommon.cpp | 4 +- llvm/test/CodeGen/X86/cmpccxadd-intrinsics.ll | 193 ++++++++++++++++++ .../MC/Disassembler/X86/apx/cmpccxadd.txt | 80 ++++---- llvm/test/MC/X86/apx/cmpccxadd-att.s | 80 ++++---- llvm/test/MC/X86/apx/cmpccxadd-intel.s | 80 ++++---- 5 files changed, 316 insertions(+), 121 deletions(-) diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp index 1947313a9dfb0..e519c00a21109 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp @@ -30,7 +30,9 @@ void X86InstPrinterCommon::printCondCode(const MCInst *MI, unsigned Op, raw_ostream &O) { int64_t Imm = MI->getOperand(Op).getImm(); bool Flavor = MI->getOpcode() == X86::CMPCCXADDmr32 || - MI->getOpcode() == X86::CMPCCXADDmr64; + MI->getOpcode() == X86::CMPCCXADDmr64 || + MI->getOpcode() == X86::CMPCCXADDmr32_EVEX || + MI->getOpcode() == X86::CMPCCXADDmr64_EVEX; switch (Imm) { default: llvm_unreachable("Invalid condcode argument!"); case 0: O << "o"; break; diff --git a/llvm/test/CodeGen/X86/cmpccxadd-intrinsics.ll b/llvm/test/CodeGen/X86/cmpccxadd-intrinsics.ll index b888fb5b1ff69..f88216f95a761 100644 --- a/llvm/test/CodeGen/X86/cmpccxadd-intrinsics.ll +++ b/llvm/test/CodeGen/X86/cmpccxadd-intrinsics.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+cmpccxadd | FileCheck %s +; RUN: llc < %s -verify-machineinstrs -mtriple=x86_64-unknown-unknown --show-mc-encoding -mattr=+cmpccxadd,+egpr | FileCheck %s --check-prefix=EGPR define dso_local i32 @test_cmpbexadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-LABEL: test_cmpbexadd32: @@ -7,6 +8,12 @@ define dso_local i32 @test_cmpbexadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpoxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe0,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpbexadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpoxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe0,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 0) ret i32 %0 @@ -20,6 +27,12 @@ define dso_local i64 @test_cmpbexadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpoxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe0,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpbexadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpoxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe0,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 0) ret i64 %0 @@ -33,6 +46,12 @@ define dso_local i32 @test_cmpbxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpnoxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe1,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpbxadd32: +; EGPR: # %bb.0: # %entry +; 
EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpnoxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe1,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 1) ret i32 %0 @@ -44,6 +63,12 @@ define dso_local i64 @test_cmpbxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpnoxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe1,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpbxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpnoxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe1,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 1) ret i64 %0 @@ -55,6 +80,12 @@ define dso_local i32 @test_cmplexadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpbxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe2,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmplexadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpbxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe2,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 2) ret i32 %0 @@ -66,6 +97,12 @@ define dso_local i64 @test_cmplexadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpbxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe2,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmplexadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpbxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe2,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 2) ret i64 %0 @@ -77,6 +114,12 @@ define dso_local i32 @test_cmplxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpnbxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe3,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmplxadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpnbxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe3,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 3) ret i32 %0 @@ -88,6 +131,12 @@ define dso_local i64 @test_cmplxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpnbxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe3,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmplxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpnbxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe3,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 3) ret i64 %0 @@ -99,6 +148,12 @@ define dso_local i32 @test_cmpnbexadd32(ptr %__A, i32 %__B, i32 %__C) 
nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpzxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe4,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnbexadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpzxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe4,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 4) ret i32 %0 @@ -110,6 +165,12 @@ define dso_local i64 @test_cmpnbexadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpzxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe4,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnbexadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpzxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe4,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 4) ret i64 %0 @@ -121,6 +182,12 @@ define dso_local i32 @test_cmpnbxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpnzxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe5,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnbxadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpnzxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe5,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 5) ret i32 %0 @@ -132,6 +199,12 @@ define dso_local i64 @test_cmpnbxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpnzxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe5,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnbxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpnzxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe5,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 5) ret i64 %0 @@ -143,6 +216,12 @@ define dso_local i32 @test_cmpnlexadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpbexadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe6,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnlexadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpbexadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe6,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 6) ret i32 %0 @@ -154,6 +233,12 @@ define dso_local i64 @test_cmpnlexadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpbexadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe6,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnlexadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpbexadd %rdx, %rax, (%rdi) # EVEX TO VEX 
Compression encoding: [0xc4,0xe2,0xe9,0xe6,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 6) ret i64 %0 @@ -165,6 +250,12 @@ define dso_local i32 @test_cmpnlxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpnbexadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe7,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnlxadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpnbexadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe7,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 7) ret i32 %0 @@ -176,6 +267,12 @@ define dso_local i64 @test_cmpnlxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpnbexadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe7,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnlxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpnbexadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe7,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 7) ret i64 %0 @@ -187,6 +284,12 @@ define dso_local i32 @test_cmpnoxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpsxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe8,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnoxadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpsxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe8,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 8) ret i32 %0 @@ -198,6 +301,12 @@ define dso_local i64 @test_cmpnoxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpsxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe8,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnoxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpsxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe8,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 8) ret i64 %0 @@ -209,6 +318,12 @@ define dso_local i32 @test_cmpnpxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpnsxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xe9,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnpxadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpnsxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xe9,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 9) ret i32 %0 @@ -220,6 +335,12 @@ define dso_local i64 @test_cmpnpxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpnsxadd %rdx, 
%rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xe9,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnpxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpnsxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xe9,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 9) ret i64 %0 @@ -231,6 +352,12 @@ define dso_local i32 @test_cmpnsxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmppxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xea,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnsxadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmppxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xea,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 10) ret i32 %0 @@ -242,6 +369,12 @@ define dso_local i64 @test_cmpnsxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmppxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xea,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnsxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmppxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xea,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 10) ret i64 %0 @@ -253,6 +386,12 @@ define dso_local i32 @test_cmpnzxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpnpxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xeb,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnzxadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpnpxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xeb,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 11) ret i32 %0 @@ -264,6 +403,12 @@ define dso_local i64 @test_cmpnzxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpnpxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xeb,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpnzxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpnpxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xeb,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 11) ret i64 %0 @@ -275,6 +420,12 @@ define dso_local i32 @test_cmpoxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmplxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xec,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpoxadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmplxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xec,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call 
i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 12) ret i32 %0 @@ -286,6 +437,12 @@ define dso_local i64 @test_cmpoxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmplxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xec,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpoxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmplxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xec,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 12) ret i64 %0 @@ -297,6 +454,12 @@ define dso_local i32 @test_cmppxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpnlxadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xed,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmppxadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpnlxadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xed,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 13) ret i32 %0 @@ -308,6 +471,12 @@ define dso_local i64 @test_cmppxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpnlxadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xed,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmppxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpnlxadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xed,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 13) ret i64 %0 @@ -319,6 +488,12 @@ define dso_local i32 @test_cmpsxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmplexadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xee,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpsxadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmplexadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xee,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 14) ret i32 %0 @@ -330,6 +505,12 @@ define dso_local i64 @test_cmpsxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmplexadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xee,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpsxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmplexadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xee,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 14) ret i64 %0 @@ -341,6 +522,12 @@ define dso_local i32 @test_cmpzxadd32(ptr %__A, i32 %__B, i32 %__C) nounwind { ; CHECK-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] ; CHECK-NEXT: cmpnlexadd %edx, %eax, (%rdi) # encoding: [0xc4,0xe2,0x69,0xef,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: 
test_cmpzxadd32: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movl %esi, %eax # encoding: [0x89,0xf0] +; EGPR-NEXT: cmpnlexadd %edx, %eax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x69,0xef,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i32 @llvm.x86.cmpccxadd32(ptr %__A, i32 %__B, i32 %__C, i32 15) ret i32 %0 @@ -352,6 +539,12 @@ define dso_local i64 @test_cmpzxadd64(ptr %__A, i64 %__B, i64 %__C) nounwind { ; CHECK-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] ; CHECK-NEXT: cmpnlexadd %rdx, %rax, (%rdi) # encoding: [0xc4,0xe2,0xe9,0xef,0x07] ; CHECK-NEXT: retq # encoding: [0xc3] +; +; EGPR-LABEL: test_cmpzxadd64: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: movq %rsi, %rax # encoding: [0x48,0x89,0xf0] +; EGPR-NEXT: cmpnlexadd %rdx, %rax, (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0xe9,0xef,0x07] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %0 = tail call i64 @llvm.x86.cmpccxadd64(ptr %__A, i64 %__B, i64 %__C, i32 15) ret i64 %0 diff --git a/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt b/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt index 7a2e09af5b3db..2a54bebd5212c 100644 --- a/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt +++ b/llvm/test/MC/Disassembler/X86/apx/cmpccxadd.txt @@ -1,20 +1,20 @@ # RUN: llvm-mc --disassemble %s -triple=x86_64 | FileCheck %s --check-prefixes=ATT # RUN: llvm-mc --disassemble %s -triple=x86_64 -x86-asm-syntax=intel --output-asm-variant=1 | FileCheck %s --check-prefixes=INTEL -# ATT: cmpaxadd %ecx, %edx, 123(%rax,%rbx,4) -# INTEL: cmpaxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# ATT: cmpnbexadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpnbexadd dword ptr [rax + 4*rbx + 123], edx, ecx 0x62,0xf2,0x75,0x08,0xe7,0x54,0x98,0x7b -# ATT: cmpaxadd %r9, %r15, 123(%rax,%rbx,4) -# INTEL: cmpaxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# ATT: cmpnbexadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpnbexadd qword ptr [rax + 4*rbx + 123], r15, r9 0x62,0x72,0xb5,0x08,0xe7,0x7c,0x98,0x7b -# ATT: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# ATT: cmpnbexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpnbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpaxadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# ATT: cmpnbexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpnbexadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00 # ATT: cmpbexadd %ecx, %edx, 123(%rax,%rbx,4) @@ -49,52 +49,52 @@ # INTEL: cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpexadd %ecx, %edx, 123(%rax,%rbx,4) -# INTEL: cmpexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# ATT: cmpzxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpzxadd dword ptr [rax + 4*rbx + 123], edx, ecx 0x62,0xf2,0x75,0x08,0xe4,0x54,0x98,0x7b -# ATT: cmpexadd %r9, %r15, 123(%rax,%rbx,4) -# INTEL: cmpexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# ATT: cmpzxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpzxadd qword ptr [rax + 4*rbx + 123], r15, r9 0x62,0x72,0xb5,0x08,0xe4,0x7c,0x98,0x7b -# ATT: cmpexadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# ATT: cmpzxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpzxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpexadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpexadd qword ptr [r28 + 
4*r29 + 291], r23, r19 +# ATT: cmpzxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpgexadd %ecx, %edx, 123(%rax,%rbx,4) -# INTEL: cmpgexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# ATT: cmpnlxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpnlxadd dword ptr [rax + 4*rbx + 123], edx, ecx 0x62,0xf2,0x75,0x08,0xed,0x54,0x98,0x7b -# ATT: cmpgexadd %r9, %r15, 123(%rax,%rbx,4) -# INTEL: cmpgexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# ATT: cmpnlxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpnlxadd qword ptr [rax + 4*rbx + 123], r15, r9 0x62,0x72,0xb5,0x08,0xed,0x7c,0x98,0x7b -# ATT: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# ATT: cmpnlxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpnlxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpgexadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# ATT: cmpnlxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpnlxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpgxadd %ecx, %edx, 123(%rax,%rbx,4) -# INTEL: cmpgxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# ATT: cmpnlexadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpnlexadd dword ptr [rax + 4*rbx + 123], edx, ecx 0x62,0xf2,0x75,0x08,0xef,0x54,0x98,0x7b -# ATT: cmpgxadd %r9, %r15, 123(%rax,%rbx,4) -# INTEL: cmpgxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# ATT: cmpnlexadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpnlexadd qword ptr [rax + 4*rbx + 123], r15, r9 0x62,0x72,0xb5,0x08,0xef,0x7c,0x98,0x7b -# ATT: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# ATT: cmpnlexadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpnlexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpgxadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# ATT: cmpnlexadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpnlexadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00 # ATT: cmplexadd %ecx, %edx, 123(%rax,%rbx,4) @@ -129,20 +129,20 @@ # INTEL: cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpnexadd %ecx, %edx, 123(%rax,%rbx,4) -# INTEL: cmpnexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# ATT: cmpnzxadd %ecx, %edx, 123(%rax,%rbx,4) +# INTEL: cmpnzxadd dword ptr [rax + 4*rbx + 123], edx, ecx 0x62,0xf2,0x75,0x08,0xe5,0x54,0x98,0x7b -# ATT: cmpnexadd %r9, %r15, 123(%rax,%rbx,4) -# INTEL: cmpnexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# ATT: cmpnzxadd %r9, %r15, 123(%rax,%rbx,4) +# INTEL: cmpnzxadd qword ptr [rax + 4*rbx + 123], r15, r9 0x62,0x72,0xb5,0x08,0xe5,0x7c,0x98,0x7b -# ATT: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) -# INTEL: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# ATT: cmpnzxadd %r18d, %r22d, 291(%r28,%r29,4) +# INTEL: cmpnzxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d 0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00 -# ATT: cmpnexadd %r19, %r23, 291(%r28,%r29,4) -# INTEL: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# ATT: cmpnzxadd %r19, %r23, 291(%r28,%r29,4) +# INTEL: cmpnzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00 # ATT: cmpnoxadd %ecx, %edx, 
123(%rax,%rbx,4) diff --git a/llvm/test/MC/X86/apx/cmpccxadd-att.s b/llvm/test/MC/X86/apx/cmpccxadd-att.s index 7ff803ad79ecb..0a21abbd0ed25 100644 --- a/llvm/test/MC/X86/apx/cmpccxadd-att.s +++ b/llvm/test/MC/X86/apx/cmpccxadd-att.s @@ -3,21 +3,21 @@ # ERROR-COUNT-60: error: # ERROR-NOT: error: -# CHECK: {evex} cmpaxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: {evex} cmpnbexadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe7,0x54,0x98,0x7b] - {evex} cmpaxadd %ecx, %edx, 123(%rax,%rbx,4) + {evex} cmpnbexadd %ecx, %edx, 123(%rax,%rbx,4) -# CHECK: {evex} cmpaxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: {evex} cmpnbexadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe7,0x7c,0x98,0x7b] - {evex} cmpaxadd %r9, %r15, 123(%rax,%rbx,4) + {evex} cmpnbexadd %r9, %r15, 123(%rax,%rbx,4) -# CHECK: cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: cmpnbexadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpaxadd %r18d, %r22d, 291(%r28,%r29,4) + cmpnbexadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: cmpaxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: cmpnbexadd %r19, %r23, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpaxadd %r19, %r23, 291(%r28,%r29,4) + cmpnbexadd %r19, %r23, 291(%r28,%r29,4) # CHECK: {evex} cmpbexadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe6,0x54,0x98,0x7b] @@ -51,53 +51,53 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00] cmpbxadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: {evex} cmpexadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: {evex} cmpzxadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe4,0x54,0x98,0x7b] - {evex} cmpexadd %ecx, %edx, 123(%rax,%rbx,4) + {evex} cmpzxadd %ecx, %edx, 123(%rax,%rbx,4) -# CHECK: {evex} cmpexadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: {evex} cmpzxadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe4,0x7c,0x98,0x7b] - {evex} cmpexadd %r9, %r15, 123(%rax,%rbx,4) + {evex} cmpzxadd %r9, %r15, 123(%rax,%rbx,4) -# CHECK: cmpexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: cmpzxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpexadd %r18d, %r22d, 291(%r28,%r29,4) + cmpzxadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: cmpexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: cmpzxadd %r19, %r23, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpexadd %r19, %r23, 291(%r28,%r29,4) + cmpzxadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: {evex} cmpgexadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: {evex} cmpnlxadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xed,0x54,0x98,0x7b] - {evex} cmpgexadd %ecx, %edx, 123(%rax,%rbx,4) + {evex} cmpnlxadd %ecx, %edx, 123(%rax,%rbx,4) -# CHECK: {evex} cmpgexadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: {evex} cmpnlxadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xed,0x7c,0x98,0x7b] - {evex} cmpgexadd %r9, %r15, 123(%rax,%rbx,4) + {evex} cmpnlxadd %r9, %r15, 123(%rax,%rbx,4) -# CHECK: cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: cmpnlxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpgexadd %r18d, %r22d, 291(%r28,%r29,4) + cmpnlxadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: cmpgexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: cmpnlxadd %r19, %r23, 291(%r28,%r29,4) # CHECK: encoding: 
[0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpgexadd %r19, %r23, 291(%r28,%r29,4) + cmpnlxadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: {evex} cmpgxadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: {evex} cmpnlexadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xef,0x54,0x98,0x7b] - {evex} cmpgxadd %ecx, %edx, 123(%rax,%rbx,4) + {evex} cmpnlexadd %ecx, %edx, 123(%rax,%rbx,4) -# CHECK: {evex} cmpgxadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: {evex} cmpnlexadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xef,0x7c,0x98,0x7b] - {evex} cmpgxadd %r9, %r15, 123(%rax,%rbx,4) + {evex} cmpnlexadd %r9, %r15, 123(%rax,%rbx,4) -# CHECK: cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: cmpnlexadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpgxadd %r18d, %r22d, 291(%r28,%r29,4) + cmpnlexadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: cmpgxadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: cmpnlexadd %r19, %r23, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpgxadd %r19, %r23, 291(%r28,%r29,4) + cmpnlexadd %r19, %r23, 291(%r28,%r29,4) # CHECK: {evex} cmplexadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xee,0x54,0x98,0x7b] @@ -131,21 +131,21 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00] cmplxadd %r19, %r23, 291(%r28,%r29,4) -# CHECK: {evex} cmpnexadd %ecx, %edx, 123(%rax,%rbx,4) +# CHECK: {evex} cmpnzxadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe5,0x54,0x98,0x7b] - {evex} cmpnexadd %ecx, %edx, 123(%rax,%rbx,4) + {evex} cmpnzxadd %ecx, %edx, 123(%rax,%rbx,4) -# CHECK: {evex} cmpnexadd %r9, %r15, 123(%rax,%rbx,4) +# CHECK: {evex} cmpnzxadd %r9, %r15, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe5,0x7c,0x98,0x7b] - {evex} cmpnexadd %r9, %r15, 123(%rax,%rbx,4) + {evex} cmpnzxadd %r9, %r15, 123(%rax,%rbx,4) -# CHECK: cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) +# CHECK: cmpnzxadd %r18d, %r22d, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnexadd %r18d, %r22d, 291(%r28,%r29,4) + cmpnzxadd %r18d, %r22d, 291(%r28,%r29,4) -# CHECK: cmpnexadd %r19, %r23, 291(%r28,%r29,4) +# CHECK: cmpnzxadd %r19, %r23, 291(%r28,%r29,4) # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnexadd %r19, %r23, 291(%r28,%r29,4) + cmpnzxadd %r19, %r23, 291(%r28,%r29,4) # CHECK: {evex} cmpnoxadd %ecx, %edx, 123(%rax,%rbx,4) # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe1,0x54,0x98,0x7b] diff --git a/llvm/test/MC/X86/apx/cmpccxadd-intel.s b/llvm/test/MC/X86/apx/cmpccxadd-intel.s index cace33e59d6a7..4c44968fbf91c 100644 --- a/llvm/test/MC/X86/apx/cmpccxadd-intel.s +++ b/llvm/test/MC/X86/apx/cmpccxadd-intel.s @@ -1,20 +1,20 @@ # RUN: llvm-mc -triple x86_64 -x86-asm-syntax=intel -output-asm-variant=1 --show-encoding %s | FileCheck %s -# CHECK: {evex} cmpaxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: {evex} cmpnbexadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe7,0x54,0x98,0x7b] - {evex} cmpaxadd dword ptr [rax + 4*rbx + 123], edx, ecx + {evex} cmpnbexadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: {evex} cmpaxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: {evex} cmpnbexadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe7,0x7c,0x98,0x7b] - {evex} cmpaxadd qword ptr [rax + 4*rbx + 123], r15, r9 + {evex} cmpnbexadd qword 
ptr [rax + 4*rbx + 123], r15, r9 -# CHECK: cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: cmpnbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe7,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpaxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + cmpnbexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: cmpnbexadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe7,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpaxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + cmpnbexadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: {evex} cmpbexadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe6,0x54,0x98,0x7b] @@ -48,53 +48,53 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe2,0xbc,0xac,0x23,0x01,0x00,0x00] cmpbxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: {evex} cmpexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: {evex} cmpzxadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe4,0x54,0x98,0x7b] - {evex} cmpexadd dword ptr [rax + 4*rbx + 123], edx, ecx + {evex} cmpzxadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: {evex} cmpexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: {evex} cmpzxadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe4,0x7c,0x98,0x7b] - {evex} cmpexadd qword ptr [rax + 4*rbx + 123], r15, r9 + {evex} cmpzxadd qword ptr [rax + 4*rbx + 123], r15, r9 -# CHECK: cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: cmpzxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe4,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + cmpzxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: cmpzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe4,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpexadd qword ptr [r28 + 4*r29 + 291], r23, r19 + cmpzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: {evex} cmpgexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: {evex} cmpnlxadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xed,0x54,0x98,0x7b] - {evex} cmpgexadd dword ptr [rax + 4*rbx + 123], edx, ecx + {evex} cmpnlxadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: {evex} cmpgexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: {evex} cmpnlxadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xed,0x7c,0x98,0x7b] - {evex} cmpgexadd qword ptr [rax + 4*rbx + 123], r15, r9 + {evex} cmpnlxadd qword ptr [rax + 4*rbx + 123], r15, r9 -# CHECK: cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: cmpnlxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xed,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpgexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + cmpnlxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: cmpnlxadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xed,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpgexadd qword ptr [r28 + 4*r29 + 291], r23, r19 + cmpnlxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: {evex} cmpgxadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: {evex} cmpnlexadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xef,0x54,0x98,0x7b] - 
{evex} cmpgxadd dword ptr [rax + 4*rbx + 123], edx, ecx + {evex} cmpnlexadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: {evex} cmpgxadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: {evex} cmpnlexadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xef,0x7c,0x98,0x7b] - {evex} cmpgxadd qword ptr [rax + 4*rbx + 123], r15, r9 + {evex} cmpnlexadd qword ptr [rax + 4*rbx + 123], r15, r9 -# CHECK: cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: cmpnlexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xef,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpgxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + cmpnlexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: cmpnlexadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xef,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpgxadd qword ptr [r28 + 4*r29 + 291], r23, r19 + cmpnlexadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: {evex} cmplexadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xee,0x54,0x98,0x7b] @@ -128,21 +128,21 @@ # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xec,0xbc,0xac,0x23,0x01,0x00,0x00] cmplxadd qword ptr [r28 + 4*r29 + 291], r23, r19 -# CHECK: {evex} cmpnexadd dword ptr [rax + 4*rbx + 123], edx, ecx +# CHECK: {evex} cmpnzxadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe5,0x54,0x98,0x7b] - {evex} cmpnexadd dword ptr [rax + 4*rbx + 123], edx, ecx + {evex} cmpnzxadd dword ptr [rax + 4*rbx + 123], edx, ecx -# CHECK: {evex} cmpnexadd qword ptr [rax + 4*rbx + 123], r15, r9 +# CHECK: {evex} cmpnzxadd qword ptr [rax + 4*rbx + 123], r15, r9 # CHECK: encoding: [0x62,0x72,0xb5,0x08,0xe5,0x7c,0x98,0x7b] - {evex} cmpnexadd qword ptr [rax + 4*rbx + 123], r15, r9 + {evex} cmpnzxadd qword ptr [rax + 4*rbx + 123], r15, r9 -# CHECK: cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d +# CHECK: cmpnzxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d # CHECK: encoding: [0x62,0x8a,0x69,0x00,0xe5,0xb4,0xac,0x23,0x01,0x00,0x00] - cmpnexadd dword ptr [r28 + 4*r29 + 291], r22d, r18d + cmpnzxadd dword ptr [r28 + 4*r29 + 291], r22d, r18d -# CHECK: cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 +# CHECK: cmpnzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: encoding: [0x62,0x8a,0xe1,0x00,0xe5,0xbc,0xac,0x23,0x01,0x00,0x00] - cmpnexadd qword ptr [r28 + 4*r29 + 291], r23, r19 + cmpnzxadd qword ptr [r28 + 4*r29 + 291], r23, r19 # CHECK: {evex} cmpnoxadd dword ptr [rax + 4*rbx + 123], edx, ecx # CHECK: encoding: [0x62,0xf2,0x75,0x08,0xe1,0x54,0x98,0x7b] From dd6fec5d4fde803f8aad909be4f43ac9e4fed816 Mon Sep 17 00:00:00 2001 From: XinWang10 <108658776+XinWang10@users.noreply.github.com> Date: Sun, 21 Jan 2024 19:33:23 -0800 Subject: [PATCH 320/843] [X86][APX] Support lowering for APX promoted AMX-TILE instructions (#78689) Encoding and decoding support for the promoted AMX-TILE instructions was added in https://github.com/llvm/llvm-project/pull/76210. This patch adds lowering support for the promoted AMX-TILE instructions and integrates the new checks into the existing tests.
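For context, the lowering change reduces to one pattern applied at every site that previously hard-coded a VEX AMX-TILE opcode: when the subtarget has APX extended GPRs, select the EVEX-promoted variant instead. A condensed sketch of that pattern, lifted from the diff below (GET_EGPR_IF_ENABLED is the helper macro this patch defines at each use site):

  #define GET_EGPR_IF_ENABLED(OPC) (STI->hasEGPR() ? OPC##_EVEX : OPC)
  // Expanding the PTILELOADDV pseudo: pick TILELOADD_EVEX on APX
  // targets so extended GPRs (r16-r31) are legal base/index registers.
  unsigned Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD);
  MI.setDesc(TII->get(Opc));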
--- llvm/lib/Target/X86/X86ExpandPseudo.cpp | 11 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 18 +- llvm/lib/Target/X86/X86InstrInfo.cpp | 13 +- llvm/lib/Target/X86/X86LowerTileCopy.cpp | 6 +- .../CodeGen/X86/AMX/amx-lower-tile-copy.ll | 128 ++++++++++++++ llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll | 163 ++++++++++++++++++ .../CodeGen/X86/AMX/amx-tile-intrinsics.ll | 12 ++ 7 files changed, 337 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp index ecc7208e76072..95c4b02842ac5 100644 --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -556,16 +556,18 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case TargetOpcode::ICALL_BRANCH_FUNNEL: ExpandICallBranchFunnel(&MBB, MBBI); return true; +#define GET_EGPR_IF_ENABLED(OPC) (STI->hasEGPR() ? OPC##_EVEX : OPC) case X86::PLDTILECFGV: { - MI.setDesc(TII->get(X86::LDTILECFG)); + MI.setDesc(TII->get(GET_EGPR_IF_ENABLED(X86::LDTILECFG))); return true; } case X86::PTILELOADDV: case X86::PTILELOADDT1V: { for (unsigned i = 2; i > 0; --i) MI.removeOperand(i); - unsigned Opc = - Opcode == X86::PTILELOADDV ? X86::TILELOADD : X86::TILELOADDT1; + unsigned Opc = Opcode == X86::PTILELOADDV + ? GET_EGPR_IF_ENABLED(X86::TILELOADD) + : GET_EGPR_IF_ENABLED(X86::TILELOADDT1); MI.setDesc(TII->get(Opc)); return true; } @@ -599,9 +601,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB, case X86::PTILESTOREDV: { for (int i = 1; i >= 0; --i) MI.removeOperand(i); - MI.setDesc(TII->get(X86::TILESTORED)); + MI.setDesc(TII->get(GET_EGPR_IF_ENABLED(X86::TILESTORED))); return true; } +#undef GET_EGPR_IF_ENABLED case X86::PTILEZEROV: { for (int i = 2; i > 0; --i) // Remove row, col MI.removeOperand(i); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 68634068fee31..974b6a176f0dc 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -36374,14 +36374,22 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, unsigned Opc; switch (MI.getOpcode()) { default: llvm_unreachable("illegal opcode!"); - case X86::PTILELOADD: Opc = X86::TILELOADD; break; - case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break; - case X86::PTILESTORED: Opc = X86::TILESTORED; break; +#define GET_EGPR_IF_ENABLED(OPC) (Subtarget.hasEGPR() ? 
OPC##_EVEX : OPC) + case X86::PTILELOADD: + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); + break; + case X86::PTILELOADDT1: + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADDT1); + break; + case X86::PTILESTORED: + Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED); + break; +#undef GET_EGPR_IF_ENABLED } MachineInstrBuilder MIB = BuildMI(*BB, MI, MIMD, TII->get(Opc)); unsigned CurOp = 0; - if (Opc != X86::TILESTORED) + if (Opc != X86::TILESTORED && Opc != X86::TILESTORED_EVEX) MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), RegState::Define); @@ -36391,7 +36399,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MIB.add(MI.getOperand(CurOp++)); // displacement MIB.add(MI.getOperand(CurOp++)); // segment - if (Opc == X86::TILESTORED) + if (Opc == X86::TILESTORED || Opc == X86::TILESTORED_EVEX) MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()), RegState::Undef); diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 8b454a2cde416..36022ef35118f 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -4382,7 +4382,10 @@ static unsigned getLoadStoreRegOpcode(Register Reg, case 1024: assert(X86::TILERegClass.hasSubClassEq(RC) && "Unknown 1024-byte regclass"); assert(STI.hasAMXTILE() && "Using 8*1024-bit register requires AMX-TILE"); - return Load ? X86::TILELOADD : X86::TILESTORED; +#define GET_EGPR_IF_ENABLED(OPC) (STI.hasEGPR() ? OPC##_EVEX : OPC) + return Load ? GET_EGPR_IF_ENABLED(X86::TILELOADD) + : GET_EGPR_IF_ENABLED(X86::TILESTORED); +#undef GET_EGPR_IF_ENABLED } } @@ -4575,6 +4578,8 @@ static bool isAMXOpcode(unsigned Opc) { return false; case X86::TILELOADD: case X86::TILESTORED: + case X86::TILELOADD_EVEX: + case X86::TILESTORED_EVEX: return true; } } @@ -4586,7 +4591,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, switch (Opc) { default: llvm_unreachable("Unexpected special opcode!"); - case X86::TILESTORED: { + case X86::TILESTORED: + case X86::TILESTORED_EVEX: { // tilestored %tmm, (%sp, %idx) MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); @@ -4599,7 +4605,8 @@ void X86InstrInfo::loadStoreTileReg(MachineBasicBlock &MBB, MO.setIsKill(true); break; } - case X86::TILELOADD: { + case X86::TILELOADD: + case X86::TILELOADD_EVEX: { // tileloadd (%sp, %idx), %tmm MachineRegisterInfo &RegInfo = MBB.getParent()->getRegInfo(); Register VirtReg = RegInfo.createVirtualRegister(&X86::GR64_NOSPRegClass); diff --git a/llvm/lib/Target/X86/X86LowerTileCopy.cpp b/llvm/lib/Target/X86/X86LowerTileCopy.cpp index d6b42145859d8..e7afc49240e54 100644 --- a/llvm/lib/Target/X86/X86LowerTileCopy.cpp +++ b/llvm/lib/Target/X86/X86LowerTileCopy.cpp @@ -107,7 +107,8 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) { // mov 64 %rax BuildMI(MBB, MI, DL, TII->get(X86::MOV64ri), GR64Cand).addImm(64); // tilestored %tmm, (%sp, %idx) - unsigned Opc = X86::TILESTORED; +#define GET_EGPR_IF_ENABLED(OPC) (ST.hasEGPR() ? 
OPC##_EVEX : OPC) + unsigned Opc = GET_EGPR_IF_ENABLED(X86::TILESTORED); MachineInstr *NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc)), TileSS) .addReg(SrcReg, getKillRegState(SrcMO.isKill())); @@ -115,7 +116,8 @@ bool X86LowerTileCopy::runOnMachineFunction(MachineFunction &MF) { MO.setReg(GR64Cand); MO.setIsKill(true); // tileloadd (%sp, %idx), %tmm - Opc = X86::TILELOADD; + Opc = GET_EGPR_IF_ENABLED(X86::TILELOADD); +#undef GET_EGPR_IF_ENABLED NewMI = addFrameReference(BuildMI(MBB, MI, DL, TII->get(Opc), DstReg), TileSS); // restore %rax diff --git a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll index e3c6f039cf0be..4686361ad2fcf 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-lower-tile-copy.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f,+egpr --show-mc-encoding -verify-machineinstrs | FileCheck %s --check-prefix=EGPR define dso_local void @test1(ptr%buf) nounwind { ; CHECK-LABEL: test1: @@ -63,6 +64,79 @@ define dso_local void @test1(ptr%buf) nounwind { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test1: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: pushq %r15 # encoding: [0x41,0x57] +; EGPR-NEXT: pushq %r14 # encoding: [0x41,0x56] +; EGPR-NEXT: pushq %rbx # encoding: [0x53] +; EGPR-NEXT: subq $4056, %rsp # encoding: [0x48,0x81,0xec,0xd8,0x0f,0x00,0x00] +; EGPR-NEXT: # imm = 0xFD8 +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x44,0x24,0x0f] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xc0,0x03,0x00,0x00,0x01] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xf0,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0xd0,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xf1,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0xd2,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xf2,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0xd4,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x84,0x24,0xf3,0x03,0x00,0x00,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x84,0x24,0xd6,0x03,0x00,0x00,0x08,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movl $64, %eax # encoding: [0xb8,0x40,0x00,0x00,0x00] +; EGPR-NEXT: movw $8, %bp # encoding: [0x66,0xbd,0x08,0x00] +; EGPR-NEXT: tileloadd (%rdi,%rax), %tmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x1c,0x07] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: testb %al, %al # encoding: [0x84,0xc0] +; EGPR-NEXT: jne .LBB0_3 # encoding: [0x75,A] +; EGPR-NEXT: # fixup A - offset: 1, value: .LBB0_3-1, kind: FK_PCRel_1 +; EGPR-NEXT: # %bb.1: # %loop.header.preheader +; EGPR-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] +; EGPR-NEXT: xorl %r14d, %r14d # 
encoding: [0x45,0x31,0xf6] +; EGPR-NEXT: movl $32, %r15d # encoding: [0x41,0xbf,0x20,0x00,0x00,0x00] +; EGPR-NEXT: .p2align 4, 0x90 +; EGPR-NEXT: .LBB0_2: # %loop.header +; EGPR-NEXT: # =>This Inner Loop Header: Depth=1 +; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm3, 3024(%rsp,%rax) # 1024-byte Folded Spill +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x9c,0x04,0xd0,0x0b,0x00,0x00] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: callq foo # encoding: [0xe8,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 1, value: foo-4, kind: reloc_branch_4byte_pcrel +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x84,0x24,0xc0,0x03,0x00,0x00] +; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tileloadd 3024(%rsp,%rax), %tmm3 # 1024-byte Folded Reload +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x9c,0x04,0xd0,0x0b,0x00,0x00] +; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x04,0x3b] +; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x3b] +; EGPR-NEXT: # implicit-def: $rax +; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: # encoding: [0x48,0x89,0x84,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm3, 1024(%rsp,%rax) # 1024-byte Folded Spill +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x9c,0x04,0x00,0x04,0x00,0x00] +; EGPR-NEXT: tileloadd {{[-0-9]+}}(%r{{[sb]}}p), %tmm2 # 1024-byte Folded Reload +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x94,0x24,0x00,0x04,0x00,0x00] +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; EGPR-NEXT: # encoding: [0x48,0x8b,0x84,0x24,0xb8,0x03,0x00,0x00] +; EGPR-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x5e,0xd0] +; EGPR-NEXT: tilestored %tmm2, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x14,0x3b] +; EGPR-NEXT: incl %r14d # encoding: [0x41,0xff,0xc6] +; EGPR-NEXT: cmpw $100, %r14w # encoding: [0x66,0x41,0x83,0xfe,0x64] +; EGPR-NEXT: jl .LBB0_2 # encoding: [0x7c,A] +; EGPR-NEXT: # fixup A - offset: 1, value: .LBB0_2-1, kind: FK_PCRel_1 +; EGPR-NEXT: .LBB0_3: # %exit +; EGPR-NEXT: addq $4056, %rsp # encoding: [0x48,0x81,0xc4,0xd8,0x0f,0x00,0x00] +; EGPR-NEXT: # imm = 0xFD8 +; EGPR-NEXT: popq %rbx # encoding: [0x5b] +; EGPR-NEXT: popq %r14 # encoding: [0x41,0x5e] +; EGPR-NEXT: popq %r15 # encoding: [0x41,0x5f] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 8, ptr %buf, i64 64) br i1 undef, label %loop.header, label %exit @@ -139,6 +213,60 @@ define dso_local void @test2(ptr%buf) nounwind { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test2: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: pushq %r15 # encoding: [0x41,0x57] +; EGPR-NEXT: pushq %r14 # encoding: [0x41,0x56] +; EGPR-NEXT: pushq %rbx # encoding: [0x53] +; EGPR-NEXT: subq $72, %rsp # encoding: [0x48,0x83,0xec,0x48] +; 
EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x84,0x24,0x08,0x00,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x08,0x01] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x38,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x18,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x39,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x1a,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x3a,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x1c,0x08,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x08] +; EGPR-NEXT: movw $8, %bp # encoding: [0x66,0xbd,0x08,0x00] +; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: testb %al, %al # encoding: [0x84,0xc0] +; EGPR-NEXT: jne .LBB1_3 # encoding: [0x75,A] +; EGPR-NEXT: # fixup A - offset: 1, value: .LBB1_3-1, kind: FK_PCRel_1 +; EGPR-NEXT: # %bb.1: # %loop.header.preheader +; EGPR-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] +; EGPR-NEXT: xorl %r14d, %r14d # encoding: [0x45,0x31,0xf6] +; EGPR-NEXT: movl $32, %r15d # encoding: [0x41,0xbf,0x20,0x00,0x00,0x00] +; EGPR-NEXT: .p2align 4, 0x90 +; EGPR-NEXT: .LBB1_2: # %loop.header +; EGPR-NEXT: # =>This Inner Loop Header: Depth=1 +; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: callq foo # encoding: [0xe8,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 1, value: foo-4, kind: reloc_branch_4byte_pcrel +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x08] +; EGPR-NEXT: tilezero %tmm2 # encoding: [0xc4,0xe2,0x7b,0x49,0xd0] +; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x04,0x3b] +; EGPR-NEXT: tileloadd (%rbx,%r15), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x3b] +; EGPR-NEXT: tdpbssd %tmm1, %tmm0, %tmm2 # encoding: [0xc4,0xe2,0x73,0x5e,0xd0] +; EGPR-NEXT: tilestored %tmm2, (%rbx,%r15) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x14,0x3b] +; EGPR-NEXT: incl %r14d # encoding: [0x41,0xff,0xc6] +; EGPR-NEXT: cmpw $100, %r14w # encoding: [0x66,0x41,0x83,0xfe,0x64] +; EGPR-NEXT: jl .LBB1_2 # encoding: [0x7c,A] +; EGPR-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 +; EGPR-NEXT: .LBB1_3: # %exit +; EGPR-NEXT: addq $72, %rsp # encoding: [0x48,0x83,0xc4,0x48] +; EGPR-NEXT: popq %rbx # encoding: [0x5b] +; EGPR-NEXT: popq %r14 # encoding: [0x41,0x5e] +; EGPR-NEXT: popq %r15 # encoding: [0x41,0x5f] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) br i1 undef, label %loop.header, label %exit diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll index 9f0d8aee3c4ee..c7c919c7cbb30 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated 
by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f,+egpr --show-mc-encoding -verify-machineinstrs | FileCheck %s --check-prefix=EGPR @buf = dso_local global [3072 x i8] zeroinitializer, align 64 @@ -88,6 +89,111 @@ define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind { ; CHECK-NEXT: popq %rbp ; CHECK-NEXT: tilerelease ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_api: +; EGPR: # %bb.0: +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: pushq %r15 # encoding: [0x41,0x57] +; EGPR-NEXT: pushq %r14 # encoding: [0x41,0x56] +; EGPR-NEXT: pushq %rbx # encoding: [0x53] +; EGPR-NEXT: subq $2120, %rsp # encoding: [0x48,0x81,0xec,0x48,0x08,0x00,0x00] +; EGPR-NEXT: # imm = 0x848 +; EGPR-NEXT: movl %esi, %ebx # encoding: [0x89,0xf3] +; EGPR-NEXT: movl %edi, %ebp # encoding: [0x89,0xfd] +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, (%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x04,0x24] +; EGPR-NEXT: movb $1, (%rsp) # encoding: [0xc6,0x04,0x24,0x01] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x10,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x31,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x14,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x33,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x18,0x08,0x00] +; EGPR-NEXT: movw %bx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x5c,0x24,0x1c] +; EGPR-NEXT: movb %bpl, {{[0-9]+}}(%rsp) # encoding: [0x40,0x88,0x6c,0x24,0x36] +; EGPR-NEXT: movw %bx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x5c,0x24,0x1a] +; EGPR-NEXT: movb %bpl, {{[0-9]+}}(%rsp) # encoding: [0x40,0x88,0x6c,0x24,0x35] +; EGPR-NEXT: movb %bpl, {{[0-9]+}}(%rsp) # encoding: [0x40,0x88,0x6c,0x24,0x34] +; EGPR-NEXT: movw %bx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x5c,0x24,0x16] +; EGPR-NEXT: movb %bpl, {{[0-9]+}}(%rsp) # encoding: [0x40,0x88,0x6c,0x24,0x32] +; EGPR-NEXT: movw %bx, {{[0-9]+}}(%rsp) # encoding: [0x66,0x89,0x5c,0x24,0x12] +; EGPR-NEXT: movb %bpl, {{[0-9]+}}(%rsp) # encoding: [0x40,0x88,0x6c,0x24,0x30] +; EGPR-NEXT: ldtilecfg (%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x04,0x24] +; EGPR-NEXT: movl $32, %r14d # encoding: [0x41,0xbe,0x20,0x00,0x00,0x00] +; EGPR-NEXT: movl $buf+2048, %r15d # encoding: [0x41,0xbf,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 2, value: buf+2048, kind: FK_Data_4 +; EGPR-NEXT: tileloadd (%r15,%r14), %tmm5 # EVEX TO VEX Compression encoding: [0xc4,0x82,0x7b,0x4b,0x2c,0x37] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: testb %al, %al # encoding: [0x84,0xc0] +; EGPR-NEXT: jne .LBB0_2 # encoding: [0x75,A] +; EGPR-NEXT: # fixup A - offset: 1, value: .LBB0_2-1, kind: FK_PCRel_1 +; EGPR-NEXT: # %bb.1: # %if.true +; EGPR-NEXT: movl $buf, %eax # encoding: [0xb8,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 1, value: buf, kind: FK_Data_4 +; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] +; EGPR-NEXT: tileloadd (%rax,%r14), %tmm0 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x04,0x30] +; EGPR-NEXT: movl $buf+1024, %eax # encoding: [0xb8,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 1, value: buf+1024, kind: FK_Data_4 +; EGPR-NEXT: tileloadd (%rax,%r14), %tmm1 # EVEX TO VEX Compression encoding: 
[0xc4,0xa2,0x7b,0x4b,0x0c,0x30] +; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm5, 1088(%rsp,%rax) # 1024-byte Folded Spill +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0xac,0x04,0x40,0x04,0x00,0x00] +; EGPR-NEXT: tdpbssd %tmm1, %tmm0, %tmm5 # encoding: [0xc4,0xe2,0x73,0x5e,0xe8] +; EGPR-NEXT: tilestored %tmm5, 64(%rsp,%rax) # 1024-byte Folded Spill +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x6c,0x04,0x40] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: callq foo # encoding: [0xe8,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 1, value: foo-4, kind: reloc_branch_4byte_pcrel +; EGPR-NEXT: ldtilecfg (%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x04,0x24] +; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tileloadd 64(%rsp,%rax), %tmm6 # 1024-byte Folded Reload +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x74,0x04,0x40] +; EGPR-NEXT: jmp .LBB0_3 # encoding: [0xeb,A] +; EGPR-NEXT: # fixup A - offset: 1, value: .LBB0_3-1, kind: FK_PCRel_1 +; EGPR-NEXT: .LBB0_2: # %if.false +; EGPR-NEXT: movl $buf, %eax # encoding: [0xb8,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 1, value: buf, kind: FK_Data_4 +; EGPR-NEXT: movw $8, %cx # encoding: [0x66,0xb9,0x08,0x00] +; EGPR-NEXT: tileloadd (%rax,%r14), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x14,0x30] +; EGPR-NEXT: movl $buf+1024, %eax # encoding: [0xb8,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 1, value: buf+1024, kind: FK_Data_4 +; EGPR-NEXT: tileloadd (%rax,%r14), %tmm3 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x1c,0x30] +; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tilestored %tmm5, 1088(%rsp,%rax) # 1024-byte Folded Spill +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0xac,0x04,0x40,0x04,0x00,0x00] +; EGPR-NEXT: tdpbssd %tmm3, %tmm2, %tmm5 # encoding: [0xc4,0xe2,0x63,0x5e,0xea] +; EGPR-NEXT: tilestored %tmm5, 64(%rsp,%rax) # 1024-byte Folded Spill +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x6c,0x04,0x40] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: callq foo # encoding: [0xe8,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 1, value: foo-4, kind: reloc_branch_4byte_pcrel +; EGPR-NEXT: ldtilecfg (%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x04,0x24] +; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: tileloadd 64(%rsp,%rax), %tmm6 # 1024-byte Folded Reload +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x74,0x04,0x40] +; EGPR-NEXT: tilestored %tmm6, (%r15,%r14) # EVEX TO VEX Compression encoding: [0xc4,0x82,0x7a,0x4b,0x34,0x37] +; EGPR-NEXT: .LBB0_3: # %exit +; EGPR-NEXT: movl $buf, %eax # encoding: [0xb8,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 1, value: buf, kind: FK_Data_4 +; EGPR-NEXT: movl $32, %ecx # encoding: [0xb9,0x20,0x00,0x00,0x00] +; EGPR-NEXT: movw $8, %dx # encoding: [0x66,0xba,0x08,0x00] +; EGPR-NEXT: tileloadd (%rax,%rcx), %tmm4 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x24,0x08] +; EGPR-NEXT: movabsq $64, %rax # encoding: [0x48,0xb8,0x40,0x00,0x00,0x00,0x00,0x00,0x00,0x00] +; EGPR-NEXT: 
tileloadd 1088(%rsp,%rax), %tmm5 # 1024-byte Folded Reload +; EGPR-NEXT: # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0xac,0x04,0x40,0x04,0x00,0x00] +; EGPR-NEXT: tdpbssd %tmm4, %tmm6, %tmm5 # encoding: [0xc4,0xe2,0x5b,0x5e,0xee] +; EGPR-NEXT: movl $buf+2048, %eax # encoding: [0xb8,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 1, value: buf+2048, kind: FK_Data_4 +; EGPR-NEXT: tilestored %tmm5, (%rax,%rcx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x2c,0x08] +; EGPR-NEXT: addq $2120, %rsp # encoding: [0x48,0x81,0xc4,0x48,0x08,0x00,0x00] +; EGPR-NEXT: # imm = 0x848 +; EGPR-NEXT: popq %rbx # encoding: [0x5b] +; EGPR-NEXT: popq %r14 # encoding: [0x41,0x5e] +; EGPR-NEXT: popq %r15 # encoding: [0x41,0x5f] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: retq # encoding: [0xc3] %c = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, ptr getelementptr inbounds ([3072 x i8], ptr @buf, i64 0, i64 2048), i64 32) br i1 undef, label %if.true, label %if.false if.true: @@ -165,6 +271,63 @@ define dso_local void @test3(ptr%buf) nounwind { ; CHECK-NEXT: tilerelease ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq +; +; EGPR-LABEL: test3: +; EGPR: # %bb.0: # %entry +; EGPR-NEXT: pushq %rbp # encoding: [0x55] +; EGPR-NEXT: pushq %r15 # encoding: [0x41,0x57] +; EGPR-NEXT: pushq %r14 # encoding: [0x41,0x56] +; EGPR-NEXT: pushq %rbx # encoding: [0x53] +; EGPR-NEXT: subq $72, %rsp # encoding: [0x48,0x83,0xec,0x48] +; EGPR-NEXT: vxorps %xmm0, %xmm0, %xmm0 # encoding: [0xc5,0xf8,0x57,0xc0] +; EGPR-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp) # encoding: [0x62,0xf1,0x7c,0x48,0x11,0x84,0x24,0x08,0x00,0x00,0x00] +; EGPR-NEXT: movb $1, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x08,0x01] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x38,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x18,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x39,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x1a,0x08,0x00] +; EGPR-NEXT: movb $8, {{[0-9]+}}(%rsp) # encoding: [0xc6,0x44,0x24,0x3a,0x08] +; EGPR-NEXT: movw $8, {{[0-9]+}}(%rsp) # encoding: [0x66,0xc7,0x44,0x24,0x1c,0x08,0x00] +; EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x08] +; EGPR-NEXT: movw $8, %bp # encoding: [0x66,0xbd,0x08,0x00] +; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: testb %al, %al # encoding: [0x84,0xc0] +; EGPR-NEXT: jne .LBB1_3 # encoding: [0x75,A] +; EGPR-NEXT: # fixup A - offset: 1, value: .LBB1_3-1, kind: FK_PCRel_1 +; EGPR-NEXT: # %bb.1: # %loop.header.preheader +; EGPR-NEXT: movq %rdi, %rbx # encoding: [0x48,0x89,0xfb] +; EGPR-NEXT: movl $32, %r14d # encoding: [0x41,0xbe,0x20,0x00,0x00,0x00] +; EGPR-NEXT: xorl %r15d, %r15d # encoding: [0x45,0x31,0xff] +; EGPR-NEXT: .p2align 4, 0x90 +; EGPR-NEXT: .LBB1_2: # %loop.header +; EGPR-NEXT: # =>This Inner Loop Header: Depth=1 +; EGPR-NEXT: tilestored %tmm0, (%rbx,%r14) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x33] +; EGPR-NEXT: xorl %eax, %eax # encoding: [0x31,0xc0] +; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: callq foo # encoding: [0xe8,A,A,A,A] +; EGPR-NEXT: # fixup A - offset: 1, value: foo-4, kind: reloc_branch_4byte_pcrel +; 
EGPR-NEXT: ldtilecfg {{[0-9]+}}(%rsp) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x44,0x24,0x08] +; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0] +; EGPR-NEXT: tileloadd (%rbx,%r14), %tmm1 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x0c,0x33] +; EGPR-NEXT: tileloadd (%rbx,%r14), %tmm2 # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7b,0x4b,0x14,0x33] +; EGPR-NEXT: tdpbssd %tmm2, %tmm1, %tmm0 # encoding: [0xc4,0xe2,0x6b,0x5e,0xc1] +; EGPR-NEXT: tilestored %tmm0, (%rbx,%r14) # EVEX TO VEX Compression encoding: [0xc4,0xa2,0x7a,0x4b,0x04,0x33] +; EGPR-NEXT: tilezero %tmm0 # encoding: [0xc4,0xe2,0x7b,0x49,0xc0] +; EGPR-NEXT: incl %r15d # encoding: [0x41,0xff,0xc7] +; EGPR-NEXT: cmpw $100, %r15w # encoding: [0x66,0x41,0x83,0xff,0x64] +; EGPR-NEXT: jl .LBB1_2 # encoding: [0x7c,A] +; EGPR-NEXT: # fixup A - offset: 1, value: .LBB1_2-1, kind: FK_PCRel_1 +; EGPR-NEXT: .LBB1_3: # %exit +; EGPR-NEXT: addq $72, %rsp # encoding: [0x48,0x83,0xc4,0x48] +; EGPR-NEXT: popq %rbx # encoding: [0x5b] +; EGPR-NEXT: popq %r14 # encoding: [0x41,0x5e] +; EGPR-NEXT: popq %r15 # encoding: [0x41,0x5f] +; EGPR-NEXT: popq %rbp # encoding: [0x5d] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77] +; EGPR-NEXT: retq # encoding: [0xc3] entry: %t5 = tail call x86_amx @llvm.x86.tilezero.internal(i16 8, i16 8) br i1 undef, label %loop.header, label %exit diff --git a/llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll index 6170a04c78cc9..ce2829d5330bc 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-tile-intrinsics.ll @@ -1,6 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -verify-machineinstrs | FileCheck %s ; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -O0 -mtriple=x86_64-unknown-unknown -mattr=+amx-tile,+egpr -verify-machineinstrs --show-mc-encoding | FileCheck %s --check-prefix=EGPR define void @test_amx(ptr %pointer, ptr %base, i64 %stride) { ; CHECK-LABEL: test_amx: @@ -13,6 +14,17 @@ define void @test_amx(ptr %pointer, ptr %base, i64 %stride) { ; CHECK-NEXT: tileloaddt1 (%rsi,%rdx), %tmm3 ; CHECK-NEXT: tilestored %tmm3, (%rsi,%rdx) ; CHECK-NEXT: retq +; +; EGPR-LABEL: test_amx: +; EGPR: # %bb.0: +; EGPR-NEXT: ldtilecfg (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x78,0x49,0x07] +; EGPR-NEXT: sttilecfg (%rdi) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x49,0x07] +; EGPR-NEXT: tilerelease # encoding: [0xc4,0xe2,0x78,0x49,0xc0] +; EGPR-NEXT: tilezero %tmm3 # encoding: [0xc4,0xe2,0x7b,0x49,0xd8] +; EGPR-NEXT: tileloadd (%rsi,%rdx), %tmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7b,0x4b,0x1c,0x16] +; EGPR-NEXT: tileloaddt1 (%rsi,%rdx), %tmm3 # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x4b,0x1c,0x16] +; EGPR-NEXT: tilestored %tmm3, (%rsi,%rdx) # EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7a,0x4b,0x1c,0x16] +; EGPR-NEXT: retq # encoding: [0xc3] call void @llvm.x86.ldtilecfg(ptr %pointer) call void @llvm.x86.sttilecfg(ptr %pointer) From c2bef33c5e61557314bbea6dcced710d18bcc413 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Mon, 22 Jan 2024 11:47:00 +0800 Subject: [PATCH 321/843] [X86][NFC] Auto-generate the function to check predicate for EVEX compression --- llvm/lib/Target/X86/X86CompressEVEX.cpp 
| 40 +------------------ .../TableGen/X86CompressEVEXTablesEmitter.cpp | 34 +++++++++++++++- 2 files changed, 34 insertions(+), 40 deletions(-) diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index a2bab9793b9ce..78a7e9850abdb 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -121,44 +121,6 @@ static bool usesExtendedRegister(const MachineInstr &MI) { return false; } -static bool checkVEXInstPredicate(unsigned OldOpc, const X86Subtarget &ST) { - switch (OldOpc) { - default: - return true; - case X86::VCVTNEPS2BF16Z128rm: - case X86::VCVTNEPS2BF16Z128rr: - case X86::VCVTNEPS2BF16Z256rm: - case X86::VCVTNEPS2BF16Z256rr: - return ST.hasAVXNECONVERT(); - case X86::VPDPBUSDSZ128m: - case X86::VPDPBUSDSZ128r: - case X86::VPDPBUSDSZ256m: - case X86::VPDPBUSDSZ256r: - case X86::VPDPBUSDZ128m: - case X86::VPDPBUSDZ128r: - case X86::VPDPBUSDZ256m: - case X86::VPDPBUSDZ256r: - case X86::VPDPWSSDSZ128m: - case X86::VPDPWSSDSZ128r: - case X86::VPDPWSSDSZ256m: - case X86::VPDPWSSDSZ256r: - case X86::VPDPWSSDZ128m: - case X86::VPDPWSSDZ128r: - case X86::VPDPWSSDZ256m: - case X86::VPDPWSSDZ256r: - return ST.hasAVXVNNI(); - case X86::VPMADD52HUQZ128m: - case X86::VPMADD52HUQZ128r: - case X86::VPMADD52HUQZ256m: - case X86::VPMADD52HUQZ256r: - case X86::VPMADD52LUQZ128m: - case X86::VPMADD52LUQZ128r: - case X86::VPMADD52LUQZ256m: - case X86::VPMADD52LUQZ256r: - return ST.hasAVXIFMA(); - } -} - // Do any custom cleanup needed to finalize the conversion. static bool performCustomAdjustments(MachineInstr &MI, unsigned NewOpc) { (void)NewOpc; @@ -279,7 +241,7 @@ static bool CompressEVEXImpl(MachineInstr &MI, const X86Subtarget &ST) { } if (!IsND) { - if (usesExtendedRegister(MI) || !checkVEXInstPredicate(Opc, ST) || + if (usesExtendedRegister(MI) || !checkPredicate(I->NewOpc, &ST) || !performCustomAdjustments(MI, I->NewOpc)) return false; } diff --git a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp index aa8527e75380c..133eb9dfbf4bb 100644 --- a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp @@ -46,8 +46,12 @@ class X86CompressEVEXTablesEmitter { typedef std::pair Entry; + typedef std::map> + PredicateInstMap; std::vector Table; + // Hold all compressed instructions that need to check predicate + PredicateInstMap PredicateInsts; public: X86CompressEVEXTablesEmitter(RecordKeeper &R) : Records(R), Target(R) {} @@ -58,12 +62,15 @@ class X86CompressEVEXTablesEmitter { private: // Prints the given table as a C++ array of type X86CompressEVEXTableEntry void printTable(const std::vector &Table, raw_ostream &OS); + // Prints function which checks target feature for compressed instructions. 
+ void printCheckPredicate(const PredicateInstMap &PredicateInsts, + raw_ostream &OS); }; void X86CompressEVEXTablesEmitter::printTable(const std::vector &Table, raw_ostream &OS) { - OS << "static const X86CompressEVEXTableEntry X86CompressEVEXTable[] = { \n"; + OS << "static const X86CompressEVEXTableEntry X86CompressEVEXTable[] = {\n"; // Print all entries added to the table for (const auto &Pair : Table) @@ -73,6 +80,22 @@ void X86CompressEVEXTablesEmitter::printTable(const std::vector &Table, OS << "};\n\n"; } +void X86CompressEVEXTablesEmitter::printCheckPredicate( + const PredicateInstMap &PredicateInsts, raw_ostream &OS) { + + OS << "static bool checkPredicate(unsigned Opc, const X86Subtarget *Subtarget) {\n" + << " switch (Opc) {\n" + << " default: return true;\n"; + for (const auto &[Key, Val] : PredicateInsts) { + for (const auto &Inst : Val) + OS << " case X86::" << Inst->TheDef->getName() << ":\n"; + OS << " return " << Key->getValueAsString("CondString") << ";\n"; + } + + OS << " }\n"; + OS << "};\n\n"; +} + static uint8_t byteFromBitsInit(const BitsInit *B) { unsigned N = B->getNumBits(); assert(N <= 8 && "Field is too large for uint8_t!"); @@ -196,9 +219,18 @@ void X86CompressEVEXTablesEmitter::run(raw_ostream &OS) { continue; Table.push_back(std::make_pair(Inst, NewInst)); + auto Predicates = NewInst->TheDef->getValueAsListOfDefs("Predicates"); + auto It = llvm::find_if(Predicates, [](const Record *R) { + StringRef Name = R->getName(); + return Name == "HasAVXNECONVERT" || Name == "HasAVXVNNI" || + Name == "HasAVXIFMA"; + }); + if(It!= Predicates.end()) + PredicateInsts[*It].push_back(NewInst); } printTable(Table, OS); + printCheckPredicate(PredicateInsts, OS); } } // namespace From 381f5851988f2602ef0bbfd7547e1cfd9f2a3ead Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Mon, 22 Jan 2024 12:24:41 +0800 Subject: [PATCH 322/843] [X86] Fix Werror X86GenCompressEVEXTables.inc:1627:2: error: extra ';' outside of a function --- llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp index 133eb9dfbf4bb..fef8dc7236f57 100644 --- a/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86CompressEVEXTablesEmitter.cpp @@ -93,7 +93,7 @@ void X86CompressEVEXTablesEmitter::printCheckPredicate( } OS << " }\n"; - OS << "};\n\n"; + OS << "}\n\n"; } static uint8_t byteFromBitsInit(const BitsInit *B) { From 85337df9e36a10941faa14472b1a4ea0607bfced Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Mon, 22 Jan 2024 12:30:25 +0800 Subject: [PATCH 323/843] [X86][Driver] Enable feature ndd for -mapxf (#78901) --- clang/include/clang/Driver/Options.td | 6 +++--- clang/test/Driver/x86-target-features.c | 4 ++-- clang/test/Preprocessor/x86_target_features.c | 8 ++++---- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f1687d823f6e0..199537404d0c6 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -6053,11 +6053,11 @@ def mapx_features_EQ : CommaJoined<["-"], "mapx-features=">, Group, Values<"egpr,push2pop2,ppx,ndd,ccmp,cf">; def mno_apx_features_EQ : CommaJoined<["-"], "mno-apx-features=">, Group, HelpText<"Disable features of APX">, Values<"egpr,push2pop2,ppx,ndd,ccmp,cf">; -// Features egpr, push2pop2 and ppx are validated with llvm-test-suite && cpu2017 on 
Intel SDE. +// Features egpr, push2pop2, ppx and ndd are validated with llvm-test-suite && cpu2017 on Intel SDE. // For stability, we turn on these features only for -mapxf. After a feature pass the validation, // we will add it to -mapxf. -def mapxf : Flag<["-"], "mapxf">, Alias, AliasArgs<["egpr","push2pop2","ppx"]>; -def mno_apxf : Flag<["-"], "mno-apxf">, Alias, AliasArgs<["egpr","push2pop2","ppx"]>; +def mapxf : Flag<["-"], "mapxf">, Alias, AliasArgs<["egpr","push2pop2","ppx", "ndd"]>; +def mno_apxf : Flag<["-"], "mno-apxf">, Alias, AliasArgs<["egpr","push2pop2","ppx","ndd"]>; } // let Flags = [TargetSpecific] // VE feature flags diff --git a/clang/test/Driver/x86-target-features.c b/clang/test/Driver/x86-target-features.c index 6fb9df96e2b33..998d5f37da69b 100644 --- a/clang/test/Driver/x86-target-features.c +++ b/clang/test/Driver/x86-target-features.c @@ -428,8 +428,8 @@ // RUN: %clang -target x86_64-unknown-linux-gnu -mno-apxf -mapxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=APXF %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapxf -mno-apxf %s -### -o %t.o 2>&1 | FileCheck -check-prefix=NO-APXF %s // -// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" -// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" +// APXF: "-target-feature" "+egpr" "-target-feature" "+push2pop2" "-target-feature" "+ppx" "-target-feature" "+ndd" +// NO-APXF: "-target-feature" "-egpr" "-target-feature" "-push2pop2" "-target-feature" "-ppx" "-target-feature" "-ndd" // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=egpr %s -### -o %t.o 2>&1 | FileCheck -check-prefix=EGPR %s // RUN: %clang -target x86_64-unknown-linux-gnu -mapx-features=push2pop2 %s -### -o %t.o 2>&1 | FileCheck -check-prefix=PUSH2POP2 %s diff --git a/clang/test/Preprocessor/x86_target_features.c b/clang/test/Preprocessor/x86_target_features.c index 888eecd08d84a..a1882043910f7 100644 --- a/clang/test/Preprocessor/x86_target_features.c +++ b/clang/test/Preprocessor/x86_target_features.c @@ -803,10 +803,10 @@ // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ndd -x c -E -dM -o - %s | FileCheck --check-prefix=NDD %s // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=ccmp -x c -E -dM -o - %s | FileCheck --check-prefix=CCMP %s // RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapx-features=cf -x c -E -dM -o - %s | FileCheck --check-prefix=CF %s -// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX %s +// RUN: %clang -target x86_64-unknown-unknown -march=x86-64 -mapxf -x c -E -dM -o - %s | FileCheck --check-prefixes=EGPR,PUSH2POP2,PPX,NDD %s +// CCMP: #define __CCMP__ 1 +// CF: #define __CF__ 1 // EGPR: #define __EGPR__ 1 +// NDD: #define __NDD__ 1 // PPX: #define __PPX__ 1 // PUSH2POP2: #define __PUSH2POP2__ 1 -// NDD: #define __NDD__ 1 -// CCMP: #define __CCMP__ 1 -// CF: #define __CF__ 1 From 03c19e91e8d8cb706b58e02d69f80caeaf7eb0f4 Mon Sep 17 00:00:00 2001 From: Hristo Hristov Date: Mon, 22 Jan 2024 06:57:45 +0200 Subject: [PATCH 324/843] [libc++][numeric] P0543R3: Saturation arithmetic (#77967) Implements: https://wg21.link/P0543R3 - https://eel.is/c++draft/numeric.sat Additional references: - Division: https://eel.is/c++draft/expr.mul#4 - Arithmetic conversions: https://eel.is/c++draft/expr.arith.conv#1 - Clang builtins: https://clang.llvm.org/docs/LanguageExtensions.html#builtin-functions Depends on: 
https://github.com/llvm/llvm-project/pull/78086 --------- Co-authored-by: Zingam Co-authored-by: Mark de Wever --- libcxx/docs/FeatureTestMacroTable.rst | 2 +- libcxx/docs/ReleaseNotes/18.rst | 1 + libcxx/docs/Status/Cxx2cPapers.csv | 2 +- libcxx/include/CMakeLists.txt | 1 + .../include/__numeric/saturation_arithmetic.h | 110 +++++ libcxx/include/libcxx.imp | 1 + libcxx/include/module.modulemap.in | 1 + libcxx/include/numeric | 13 + libcxx/include/version | 2 +- libcxx/modules/std/numeric.inc | 10 + .../numeric.version.compile.pass.cpp | 16 +- .../version.version.compile.pass.cpp | 16 +- .../numeric.ops.sat/add_sat.compile.pass.cpp | 77 ++++ .../numeric.ops.sat/add_sat.pass.cpp | 173 ++++++++ .../numeric.ops.sat/div_sat.assert.pass.cpp | 58 +++ .../numeric.ops.sat/div_sat.compile.pass.cpp | 108 +++++ .../numeric.ops.sat/div_sat.pass.cpp | 158 +++++++ .../numeric.ops.sat/mul_sat.compile.pass.cpp | 77 ++++ .../numeric.ops.sat/mul_sat.pass.cpp | 180 ++++++++ .../saturate_cast.compile.pass.cpp | 79 ++++ .../numeric.ops.sat/saturate_cast.pass.cpp | 397 ++++++++++++++++++ .../numeric.ops.sat/sub_sat.compile.pass.cpp | 77 ++++ .../numeric.ops.sat/sub_sat.pass.cpp | 162 +++++++ .../generate_feature_test_macro_components.py | 5 +- 24 files changed, 1697 insertions(+), 29 deletions(-) create mode 100644 libcxx/include/__numeric/saturation_arithmetic.h create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.compile.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.assert.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.compile.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.compile.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.compile.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.compile.pass.cpp create mode 100644 libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp diff --git a/libcxx/docs/FeatureTestMacroTable.rst b/libcxx/docs/FeatureTestMacroTable.rst index 82e51d4e7eb53..d0d057e6bbaf0 100644 --- a/libcxx/docs/FeatureTestMacroTable.rst +++ b/libcxx/docs/FeatureTestMacroTable.rst @@ -432,7 +432,7 @@ Status --------------------------------------------------- ----------------- ``__cpp_lib_rcu`` *unimplemented* --------------------------------------------------- ----------------- - ``__cpp_lib_saturation_arithmetic`` *unimplemented* + ``__cpp_lib_saturation_arithmetic`` ``202311L`` --------------------------------------------------- ----------------- ``__cpp_lib_smart_ptr_owner_equality`` *unimplemented* --------------------------------------------------- ----------------- diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index 0b43d76650a9d..b868b37790f77 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -75,6 +75,7 @@ Implemented Papers - P2909R4 - Fix formatting of code units as integers (Dude, where’s my ``char``?) 
- P2821R5 - ``span.at()`` - P0521R0 - Proposed Resolution for CA 14 (``shared_ptr`` ``use_count/unique``) +- P0543R3 - Saturation arithmetic - P1759R6 - Native handles and file streams - P2868R3 - Remove Deprecated ``std::allocator`` Typedef From C++26 - P2517R1 - Add a conditional ``noexcept`` specification to ``std::apply`` diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index e49290b45589a..c45aa3c510072 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -27,7 +27,7 @@ "`P2714R1 `__","LWG","Bind front and back to NTTP callables","Varna June 2023","","","" "`P2630R4 `__","LWG","``submdspan``","Varna June 2023","","","" "","","","","","","" -"`P0543R3 `__","LWG","Saturation arithmetic","Kona November 2023","","","" +"`P0543R3 `__","LWG","Saturation arithmetic","Kona November 2023","|Complete|","18.0","" "`P2407R5 `__","LWG","Freestanding Library: Partial Classes","Kona November 2023","","","" "`P2546R5 `__","LWG","Debugging Support","Kona November 2023","","","" "`P2905R2 `__","LWG","Runtime format strings","Kona November 2023","|Complete|","18.0","|format| |DR|" diff --git a/libcxx/include/CMakeLists.txt b/libcxx/include/CMakeLists.txt index 9fdf978a89d7e..ed721d467e94f 100644 --- a/libcxx/include/CMakeLists.txt +++ b/libcxx/include/CMakeLists.txt @@ -569,6 +569,7 @@ set(files __numeric/pstl_reduce.h __numeric/pstl_transform_reduce.h __numeric/reduce.h + __numeric/saturation_arithmetic.h __numeric/transform_exclusive_scan.h __numeric/transform_inclusive_scan.h __numeric/transform_reduce.h diff --git a/libcxx/include/__numeric/saturation_arithmetic.h b/libcxx/include/__numeric/saturation_arithmetic.h new file mode 100644 index 0000000000000..50274c6bbd9f3 --- /dev/null +++ b/libcxx/include/__numeric/saturation_arithmetic.h @@ -0,0 +1,110 @@ +// -*- C++ -*- +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef _LIBCPP___NUMERIC_SATURATION_ARITHMETIC_H +#define _LIBCPP___NUMERIC_SATURATION_ARITHMETIC_H + +#include <__concepts/arithmetic.h> +#include <__config> +#include <__utility/cmp.h> +#include <limits> + +#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) +# pragma GCC system_header +#endif + +_LIBCPP_BEGIN_NAMESPACE_STD + +#if _LIBCPP_STD_VER >= 26 + +template <__libcpp_integer _Tp> +_LIBCPP_HIDE_FROM_ABI constexpr _Tp add_sat(_Tp __x, _Tp __y) noexcept { + if (_Tp __sum; !__builtin_add_overflow(__x, __y, &__sum)) + return __sum; + // Handle overflow + if constexpr (__libcpp_unsigned_integer<_Tp>) { + return std::numeric_limits<_Tp>::max(); + } else { + // Signed addition overflow + if (__x > 0) + // Overflows if (x > 0 && y > 0) + return std::numeric_limits<_Tp>::max(); + else + // Overflows if (x < 0 && y < 0) + return std::numeric_limits<_Tp>::min(); + } +} + +template <__libcpp_integer _Tp> +_LIBCPP_HIDE_FROM_ABI constexpr _Tp sub_sat(_Tp __x, _Tp __y) noexcept { + if (_Tp __sub; !__builtin_sub_overflow(__x, __y, &__sub)) + return __sub; + // Handle overflow + if constexpr (__libcpp_unsigned_integer<_Tp>) { + // Overflows if (x < y) + return std::numeric_limits<_Tp>::min(); + } else { + // Signed subtraction overflow + if (__x >= 0) + // Overflows if (x >= 0 && y < 0) + return std::numeric_limits<_Tp>::max(); + else + // Overflows if (x < 0 && y > 0) + return std::numeric_limits<_Tp>::min(); + } +} + +template <__libcpp_integer _Tp> +_LIBCPP_HIDE_FROM_ABI constexpr _Tp mul_sat(_Tp __x, _Tp __y) noexcept { + if (_Tp __mul; !__builtin_mul_overflow(__x, __y, &__mul)) + return __mul; + // Handle overflow + if constexpr (__libcpp_unsigned_integer<_Tp>) { + return std::numeric_limits<_Tp>::max(); + } else { + // Signed multiplication overflow + if ((__x > 0 && __y > 0) || (__x < 0 && __y < 0)) + return std::numeric_limits<_Tp>::max(); + // Overflows if (x < 0 && y > 0) || (x > 0 && y < 0) + return std::numeric_limits<_Tp>::min(); + } +} + +template <__libcpp_integer _Tp> +_LIBCPP_HIDE_FROM_ABI constexpr _Tp div_sat(_Tp __x, _Tp __y) noexcept { + _LIBCPP_ASSERT_UNCATEGORIZED(__y != 0, "Division by 0 is undefined"); + if constexpr (__libcpp_unsigned_integer<_Tp>) { + return __x / __y; + } else { + // Handle signed division overflow + if (__x == std::numeric_limits<_Tp>::min() && __y == _Tp{-1}) + return std::numeric_limits<_Tp>::max(); + return __x / __y; + } +} + +template <__libcpp_integer _Rp, __libcpp_integer _Tp> +_LIBCPP_HIDE_FROM_ABI constexpr _Rp saturate_cast(_Tp __x) noexcept { + // Saturation is an impossible edge case when ((min _Rp) < (min _Tp) && (max _Rp) > (max _Tp)), and it is expected to be + // optimized out by the compiler.
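  // (Illustrative aside, not part of the patch: two concrete cases under the
  //  heterogeneous comparisons used below.
  //    std::saturate_cast<signed char>(300)   // cmp_greater(300, 127) -> returns 127
  //    std::saturate_cast<unsigned int>(-1)   // cmp_less(-1, 0u)      -> returns 0
  //  std::cmp_less/std::cmp_greater compare values correctly across mixed
  //  signedness, which is why no integral-promotion pitfalls arise here.)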
+ + // Handle overflow + if (std::cmp_less(__x, std::numeric_limits<_Rp>::min())) + return std::numeric_limits<_Rp>::min(); + if (std::cmp_greater(__x, std::numeric_limits<_Rp>::max())) + return std::numeric_limits<_Rp>::max(); + // No overflow + return static_cast<_Rp>(__x); +} + +#endif // _LIBCPP_STD_VER >= 26 + +_LIBCPP_END_NAMESPACE_STD + +#endif // _LIBCPP___NUMERIC_SATURATION_ARITHMETIC_H diff --git a/libcxx/include/libcxx.imp b/libcxx/include/libcxx.imp index 8616f9639f4d9..45fa4a9541917 100644 --- a/libcxx/include/libcxx.imp +++ b/libcxx/include/libcxx.imp @@ -559,6 +559,7 @@ { include: [ "<__numeric/pstl_reduce.h>", "private", "<numeric>", "public" ] }, { include: [ "<__numeric/pstl_transform_reduce.h>", "private", "<numeric>", "public" ] }, { include: [ "<__numeric/reduce.h>", "private", "<numeric>", "public" ] }, + { include: [ "<__numeric/saturation_arithmetic.h>", "private", "<numeric>", "public" ] }, { include: [ "<__numeric/transform_exclusive_scan.h>", "private", "<numeric>", "public" ] }, { include: [ "<__numeric/transform_inclusive_scan.h>", "private", "<numeric>", "public" ] }, { include: [ "<__numeric/transform_reduce.h>", "private", "<numeric>", "public" ] }, diff --git a/libcxx/include/module.modulemap.in b/libcxx/include/module.modulemap.in index d10670d4faaff..194a74a1e07b1 100644 --- a/libcxx/include/module.modulemap.in +++ b/libcxx/include/module.modulemap.in @@ -1580,6 +1580,7 @@ module std_private_numeric_pstl_transform_reduce [system] { export * } module std_private_numeric_reduce [system] { header "__numeric/reduce.h" } +module std_private_numeric_saturation_arithmetic [system] { header "__numeric/saturation_arithmetic.h" } module std_private_numeric_transform_exclusive_scan [system] { header "__numeric/transform_exclusive_scan.h" } module std_private_numeric_transform_inclusive_scan [system] { header "__numeric/transform_inclusive_scan.h" } module std_private_numeric_transform_reduce [system] { header "__numeric/transform_reduce.h" } diff --git a/libcxx/include/numeric b/libcxx/include/numeric index d09d0a81fcc03..0fe7115f1c666 100644 --- a/libcxx/include/numeric +++ b/libcxx/include/numeric @@ -140,6 +140,18 @@ template template<class T> constexpr T* midpoint(T* a, T* b); // C++20 +// [numeric.sat], saturation arithmetic +template<class T> +constexpr T add_sat(T x, T y) noexcept; // freestanding, Since C++26 +template<class T> +constexpr T sub_sat(T x, T y) noexcept; // freestanding, Since C++26 +template<class T> +constexpr T mul_sat(T x, T y) noexcept; // freestanding, Since C++26 +template<class T> +constexpr T div_sat(T x, T y) noexcept; // freestanding, Since C++26 +template<class T, class U> +constexpr T saturate_cast(U x) noexcept; // freestanding, Since C++26 + } // std */ @@ -160,6 +172,7 @@ template #include <__numeric/pstl_reduce.h> #include <__numeric/pstl_transform_reduce.h> #include <__numeric/reduce.h> +#include <__numeric/saturation_arithmetic.h> #include <__numeric/transform_exclusive_scan.h> #include <__numeric/transform_inclusive_scan.h> #include <__numeric/transform_reduce.h> diff --git a/libcxx/include/version b/libcxx/include/version index 7b2f487fe7020..9e26da8c1b242 100644 --- a/libcxx/include/version +++ b/libcxx/include/version @@ -504,7 +504,7 @@ __cpp_lib_within_lifetime 202306L // # define __cpp_lib_out_ptr 202311L # define __cpp_lib_ratio 202306L // # define __cpp_lib_rcu 202306L -// # define __cpp_lib_saturation_arithmetic 202311L +# define __cpp_lib_saturation_arithmetic 202311L // # define __cpp_lib_smart_ptr_owner_equality 202306L # define __cpp_lib_span_at 202311L # define __cpp_lib_span_initializer_list 202311L diff --git
a/libcxx/modules/std/numeric.inc b/libcxx/modules/std/numeric.inc index d2b7688d4e5f1..3bc7b23168158 100644 --- a/libcxx/modules/std/numeric.inc +++ b/libcxx/modules/std/numeric.inc @@ -54,4 +54,14 @@ export namespace std { // [numeric.ops.midpoint], midpoint using std::midpoint; + +#if _LIBCPP_STD_VER >= 26 + // [numeric.sat], saturation arithmetic + using std::add_sat; + using std::div_sat; + using std::mul_sat; + using std::saturate_cast; + using std::sub_sat; +#endif + } // namespace std diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp index b510eefc69a5d..d132b7c7b9c4f 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/numeric.version.compile.pass.cpp @@ -263,17 +263,11 @@ # endif # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_saturation_arithmetic -# error "__cpp_lib_saturation_arithmetic should be defined in c++26" -# endif -# if __cpp_lib_saturation_arithmetic != 202311L -# error "__cpp_lib_saturation_arithmetic should have the value 202311L in c++26" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_saturation_arithmetic -# error "__cpp_lib_saturation_arithmetic should not be defined because it is unimplemented in libc++!" -# endif +# ifndef __cpp_lib_saturation_arithmetic +# error "__cpp_lib_saturation_arithmetic should be defined in c++26" +# endif +# if __cpp_lib_saturation_arithmetic != 202311L +# error "__cpp_lib_saturation_arithmetic should have the value 202311L in c++26" # endif #endif // TEST_STD_VER > 23 diff --git a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp index fa188533446b4..c319940fe6e49 100644 --- a/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp +++ b/libcxx/test/std/language.support/support.limits/support.limits.general/version.version.compile.pass.cpp @@ -7167,17 +7167,11 @@ # error "__cpp_lib_sample should have the value 201603L in c++26" # endif -# if !defined(_LIBCPP_VERSION) -# ifndef __cpp_lib_saturation_arithmetic -# error "__cpp_lib_saturation_arithmetic should be defined in c++26" -# endif -# if __cpp_lib_saturation_arithmetic != 202311L -# error "__cpp_lib_saturation_arithmetic should have the value 202311L in c++26" -# endif -# else // _LIBCPP_VERSION -# ifdef __cpp_lib_saturation_arithmetic -# error "__cpp_lib_saturation_arithmetic should not be defined because it is unimplemented in libc++!" 
-# endif +# ifndef __cpp_lib_saturation_arithmetic +# error "__cpp_lib_saturation_arithmetic should be defined in c++26" +# endif +# if __cpp_lib_saturation_arithmetic != 202311L +# error "__cpp_lib_saturation_arithmetic should have the value 202311L in c++26" # endif # ifndef __cpp_lib_scoped_lock diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.compile.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.compile.pass.cpp new file mode 100644 index 0000000000000..38b6d627869a0 --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.compile.pass.cpp @@ -0,0 +1,77 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// template +// constexpr T add_sat(T x, T y) noexcept; // freestanding + +#include +#include + +#include "test_macros.h" + +template +concept CanDo = requires(T x, U y) { + { std::add_sat(x, y) } -> std::same_as; +}; + +template +constexpr void test_constraint_success() { + static_assert(CanDo); + static_assert(!CanDo); + static_assert(!CanDo); +} + +template +constexpr void test_constraint_fail() { + using I = int; + static_assert(!CanDo); + static_assert(!CanDo); + static_assert(!CanDo); +} + +constexpr void test() { + // Contraint success - Signed + using SI = long long int; + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_success<__int128_t, SI>(); +#endif + // Contraint success - Unsigned + using UI = unsigned long long int; + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_success<__uint128_t, UI>(); +#endif + + // Contraint failure + test_constraint_fail(); + test_constraint_fail(); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test_constraint_fail(); +#endif + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); +} diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp new file mode 100644 index 0000000000000..036bf53e36dcd --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/add_sat.pass.cpp @@ -0,0 +1,173 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// template +// constexpr T add_sat(T x, T y) noexcept; // freestanding + +#include +#include +#include +#include + +#include "test_macros.h" + +template +constexpr bool test_signed() { + constexpr auto minVal = std::numeric_limits::min(); + constexpr auto maxVal = std::numeric_limits::max(); + + // TODO(LLVM-20) remove [[maybe_unused]] since all supported compilers support "Placeholder variables with no name" + [[maybe_unused]] std::same_as decltype(auto) _ = std::add_sat(minVal, maxVal); + + static_assert(noexcept(std::add_sat(minVal, maxVal))); + + // clang-format off + + // Limit values (-1, 0, 1, min, max) + + assert(std::add_sat(IntegerT{-1}, IntegerT{-1}) == IntegerT{-2}); + assert(std::add_sat(IntegerT{-1}, IntegerT{ 0}) == IntegerT{-1}); + assert(std::add_sat(IntegerT{-1}, IntegerT{ 1}) == IntegerT{ 0}); + assert(std::add_sat(IntegerT{-1}, minVal) == minVal); // saturated + assert(std::add_sat(IntegerT{-1}, maxVal) == IntegerT{-1} + maxVal); + + assert(std::add_sat(IntegerT{ 0}, IntegerT{-1}) == IntegerT{-1}); + assert(std::add_sat(IntegerT{ 0}, IntegerT{ 0}) == IntegerT{ 0}); + assert(std::add_sat(IntegerT{ 0}, IntegerT{ 1}) == IntegerT{ 1}); + assert(std::add_sat(IntegerT{ 0}, minVal) == minVal); + assert(std::add_sat(IntegerT{ 0}, maxVal) == maxVal); + + assert(std::add_sat(IntegerT{ 1}, IntegerT{-1}) == IntegerT{ 0}); + assert(std::add_sat(IntegerT{ 1}, IntegerT{ 0}) == IntegerT{ 1}); + assert(std::add_sat(IntegerT{ 1}, IntegerT{ 1}) == IntegerT{ 2}); + assert(std::add_sat(IntegerT{ 1}, minVal) == IntegerT{ 1} + minVal); + assert(std::add_sat(IntegerT{ 1}, maxVal) == maxVal); // saturated + + assert(std::add_sat( minVal, IntegerT{-1}) == minVal); // saturated + assert(std::add_sat( minVal, IntegerT{ 0}) == minVal); + assert(std::add_sat( minVal, IntegerT{ 1}) == minVal + IntegerT{ 1}); + assert(std::add_sat( minVal, minVal) == minVal); // saturated + assert(std::add_sat( minVal, maxVal) == IntegerT{-1}); + + assert(std::add_sat( maxVal, IntegerT{-1}) == maxVal + IntegerT{-1}); + assert(std::add_sat( maxVal, IntegerT{ 0}) == maxVal); + assert(std::add_sat( maxVal, IntegerT{ 1}) == maxVal); // saturated + assert(std::add_sat( maxVal, minVal) == IntegerT{-1}); + assert(std::add_sat( maxVal, maxVal) == maxVal); // saturated + + // No saturation (no limit values) + + assert(std::add_sat(IntegerT{-27}, IntegerT{28})== IntegerT{ 1}); + assert(std::add_sat(IntegerT{ 27}, IntegerT{28})== IntegerT{55}); + { + constexpr IntegerT x = maxVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT y = maxVal / IntegerT{2} + IntegerT{28}; + assert(std::add_sat(x, y) == maxVal); + } + + // Saturation (no limit values) + + { + constexpr IntegerT x = minVal / IntegerT{2} + IntegerT{-27}; + constexpr IntegerT y = minVal / IntegerT{2} + IntegerT{-28}; + assert(std::add_sat(x, y) == minVal); // saturated + } + { + constexpr IntegerT x = maxVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT y = maxVal / IntegerT{2} + IntegerT{28}; + assert(std::add_sat(x, y) == maxVal); // saturated + } + + // clang-format on + + return true; +} + +template +constexpr bool test_unsigned() { + constexpr auto minVal = std::numeric_limits::min(); + constexpr auto maxVal = std::numeric_limits::max(); + + // TODO(LLVM-20) remove [[maybe_unused]] since all supported compilers support "Placeholder variables with 
no name" + [[maybe_unused]] std::same_as decltype(auto) _ = std::add_sat(minVal, maxVal); + + static_assert(noexcept(std::add_sat(minVal, maxVal))); + + // clang-format off + + // Litmit values (0, 1, min, max) + + assert(std::add_sat(IntegerT{0}, IntegerT{0}) == IntegerT{0}); + assert(std::add_sat(IntegerT{0}, IntegerT{1}) == IntegerT{1}); + assert(std::add_sat(IntegerT{0}, minVal) == IntegerT{0}); + assert(std::add_sat(IntegerT{0}, maxVal) == maxVal); + assert(std::add_sat(IntegerT{1}, IntegerT{0}) == IntegerT{1}); + assert(std::add_sat(IntegerT{1}, IntegerT{1}) == IntegerT{2}); + assert(std::add_sat(IntegerT{1}, minVal) == IntegerT{1}); + assert(std::add_sat(IntegerT{1}, maxVal) == maxVal); // saturated + assert(std::add_sat( minVal, IntegerT{0}) == IntegerT{0}); + assert(std::add_sat( minVal, IntegerT{1}) == IntegerT{1}); + assert(std::add_sat( minVal, minVal) == minVal); + assert(std::add_sat( minVal, maxVal) == maxVal); + assert(std::add_sat( maxVal, IntegerT{0}) == maxVal); + assert(std::add_sat( maxVal, IntegerT{1}) == maxVal); // saturated + assert(std::add_sat( maxVal, minVal) == maxVal); + assert(std::add_sat( maxVal, maxVal) == maxVal); // saturated + + // No saturation (no limit values) + + assert(std::add_sat(IntegerT{27}, IntegerT{28}) == IntegerT{55}); + + // Saturation (no limit values) + + { + constexpr IntegerT x = maxVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT y = maxVal / IntegerT{2} + IntegerT{28}; + assert(std::add_sat( x, y) == maxVal); // saturated + assert(std::add_sat( x, maxVal) == maxVal); // saturated + assert(std::add_sat(maxVal, y) == maxVal); // saturated + } + + // clang-format on + + return true; +} + +constexpr bool test() { + // Signed + test_signed(); + test_signed(); + test_signed(); + test_signed(); + test_signed(); +#ifndef TEST_HAS_NO_INT128 + test_signed<__int128_t>(); +#endif + // Unsigned + test_unsigned(); + test_unsigned(); + test_unsigned(); + test_unsigned(); + test_unsigned(); +#ifndef TEST_HAS_NO_INT128 + test_unsigned<__uint128_t>(); +#endif + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.assert.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.assert.pass.cpp new file mode 100644 index 0000000000000..b1ef8345d51e6 --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.assert.pass.cpp @@ -0,0 +1,58 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// REQUIRES: has-unix-headers +// REQUIRES: libcpp-hardening-mode={{extensive|debug}} +// XFAIL: availability-verbose_abort-missing + +// + +// template +// constexpr T div_sat(T x, T y) noexcept; // freestanding + +#include +#include + +#include "check_assertion.h" +#include "test_macros.h" + +template +void test_runtime_assertion() { + TEST_LIBCPP_ASSERT_FAILURE((void)std::div_sat(IntegerT{27}, IntegerT{0}), "Division by 0 is undefined"); +} + +bool test() { + // Signed + test_runtime_assertion(); + test_runtime_assertion(); + test_runtime_assertion(); + test_runtime_assertion(); + test_runtime_assertion(); +#ifndef TEST_HAS_NO_INT128 + test_runtime_assertion<__int128_t>(); +#endif + // Unsigned + test_runtime_assertion(); + test_runtime_assertion(); + test_runtime_assertion(); + test_runtime_assertion(); + test_runtime_assertion(); +#ifndef TEST_HAS_NO_INT128 + test_runtime_assertion<__uint128_t>(); +#endif + + return true; +} + +int main(int, char**) { + test(); + + return 0; +} diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.compile.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.compile.pass.cpp new file mode 100644 index 0000000000000..32d77f710bd70 --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.compile.pass.cpp @@ -0,0 +1,108 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// template +// constexpr T div_sat(T x, T y) noexcept; // freestanding + +#include +#include +#include + +#include "test_macros.h" + +// Constraints + +template +concept CanDo = requires(T x, U y) { + { std::div_sat(x, y) } -> std::same_as; +}; + +template +constexpr void test_constraint_success() { + static_assert(CanDo); + static_assert(!CanDo); + static_assert(!CanDo); +} + +template +constexpr void test_constraint_fail() { + using I = int; + static_assert(!CanDo); + static_assert(!CanDo); + static_assert(!CanDo); +} + +constexpr void test() { + // Contraint success - Signed + using SI = long long int; + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_success<__int128_t, SI>(); +#endif + // Contraint success - Unsigned + using UI = unsigned long long int; + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_success<__uint128_t, UI>(); +#endif + + // Contraint failure + test_constraint_fail(); + test_constraint_fail(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_fail(); +#endif + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); +} + +// A function call expression that violates the precondition in the Preconditions: element is not a core constant expression (7.7 [expr.const]). + +template +using QuotT = std::integral_constant; + +template +QuotT div_by_zero(); + +template +concept CanDivByZero = requires { div_by_zero(); }; + +static_assert(!CanDivByZero(0)>); +static_assert(!CanDivByZero(0)>); +static_assert(!CanDivByZero<0>); +static_assert(!CanDivByZero<0L>); +static_assert(!CanDivByZero<0LL>); +#ifndef TEST_HAS_NO_INT128 +static_assert(!CanDivByZero(0)>); +#endif +static_assert(!CanDivByZero(0)>); +static_assert(!CanDivByZero(0)>); +static_assert(!CanDivByZero<0U>); +static_assert(!CanDivByZero<0UL>); +static_assert(!CanDivByZero<0ULL>); +#ifndef TEST_HAS_NO_INT128 +static_assert(!CanDivByZero(0)>); +#endif diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp new file mode 100644 index 0000000000000..b1cace74d8828 --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/div_sat.pass.cpp @@ -0,0 +1,158 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// The test uses "Placeholder variables with no name" + +// + +// template +// constexpr T div_sat(T x, T y) noexcept; // freestanding + +#include +#include +#include +#include + +#include "test_macros.h" + +template +constexpr bool test_signed() { + constexpr auto minVal = std::numeric_limits::min(); + constexpr auto maxVal = std::numeric_limits::max(); + + // TODO(LLVM-20) remove [[maybe_unused]] and `{}` scope since all supported compilers support "Placeholder variables with no name" + [[maybe_unused]] std::same_as decltype(auto) _ = std::div_sat(minVal, maxVal); + + static_assert(noexcept(std::div_sat(minVal, maxVal))); + + // clang-format off + + // Limit values (-1, 0, 1, min, max) + + assert(std::div_sat(IntegerT{-1}, IntegerT{-1}) == IntegerT{ 1}); + assert(std::div_sat(IntegerT{-1}, IntegerT{ 1}) == IntegerT{-1}); + assert(std::div_sat(IntegerT{-1}, minVal) == IntegerT{ 0}); + assert(std::div_sat(IntegerT{-1}, maxVal) == IntegerT{ 0}); + assert(std::div_sat(IntegerT{ 0}, IntegerT{-1}) == IntegerT{ 0}); + assert(std::div_sat(IntegerT{ 0}, IntegerT{ 1}) == IntegerT{ 0}); + assert(std::div_sat(IntegerT{ 0}, minVal) == IntegerT{ 0}); + assert(std::div_sat(IntegerT{ 0}, maxVal) == IntegerT{ 0}); + assert(std::div_sat(IntegerT{ 1}, IntegerT{-1}) == IntegerT{-1}); + assert(std::div_sat(IntegerT{ 1}, IntegerT{ 1}) == IntegerT{ 1}); + assert(std::div_sat(IntegerT{ 1}, minVal) == IntegerT{ 0}); + assert(std::div_sat(IntegerT{ 1}, maxVal) == IntegerT{ 0}); + assert(std::div_sat( minVal, IntegerT{ 1}) == minVal); + assert(std::div_sat( minVal, IntegerT{-1}) == maxVal); // saturated + assert(std::div_sat( minVal, minVal) == IntegerT{ 1}); + assert(std::div_sat( minVal, maxVal) == (minVal / maxVal)); + assert(std::div_sat( maxVal, IntegerT{-1}) == -maxVal); + assert(std::div_sat( maxVal, IntegerT{ 1}) == maxVal); + assert(std::div_sat( maxVal, minVal) == IntegerT{ 0}); + assert(std::div_sat( maxVal, maxVal) == IntegerT{ 1}); + + // No saturation (no limit values) + + assert(std::div_sat(IntegerT{27}, IntegerT{28}) == IntegerT{0}); + assert(std::div_sat(IntegerT{28}, IntegerT{27}) == IntegerT{1}); + { + constexpr IntegerT lesserVal = minVal / IntegerT{2} + IntegerT{-28}; + constexpr IntegerT biggerVal = minVal / IntegerT{2} + IntegerT{-27}; + assert(std::div_sat(lesserVal, biggerVal) == IntegerT{1}); + assert(std::div_sat(biggerVal, lesserVal) == IntegerT{0}); + } + { + constexpr IntegerT lesserVal = minVal / IntegerT{2} + IntegerT{-27}; + constexpr IntegerT biggerVal = maxVal / IntegerT{2} + IntegerT{28}; + assert(std::div_sat(lesserVal, biggerVal) == IntegerT{-1}); + assert(std::div_sat(biggerVal, lesserVal) == IntegerT{-1}); + } + { + constexpr IntegerT lesserVal = maxVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT biggerVal = maxVal / IntegerT{2} + IntegerT{28}; + assert(std::div_sat(lesserVal, biggerVal) == IntegerT{0}); + assert(std::div_sat(biggerVal, lesserVal) == IntegerT{1}); + } + + // clang-format on + + return true; +} + +template +constexpr bool test_unsigned() { + constexpr auto minVal = std::numeric_limits::min(); + constexpr auto maxVal = std::numeric_limits::max(); + + // TODO(LLVM-20) remove [[maybe_unused]] since all supported compilers support "Placeholder variables with no name" + [[maybe_unused]] std::same_as decltype(auto) _ = std::div_sat(minVal, maxVal); + 
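  // (Illustrative aside, not part of the patch: unsigned division cannot
  //  overflow, so std::div_sat is expected to behave exactly like plain
  //  x / y here; the lone saturating case, div_sat(min, -1) == max, exists
  //  only for signed types and is exercised in test_signed above.)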
static_assert(noexcept(std::div_sat(minVal, maxVal))); + + // clang-format off + + // No limit values (0, 1, min, max) + + assert(std::div_sat(IntegerT{0}, IntegerT{1}) == IntegerT{0}); + assert(std::div_sat(IntegerT{0}, maxVal) == IntegerT{0}); + + assert(std::div_sat(IntegerT{1}, IntegerT{1}) == IntegerT{1}); + assert(std::div_sat(IntegerT{1}, maxVal) == IntegerT{0}); + + assert(std::div_sat( minVal, IntegerT{1}) == minVal); + assert(std::div_sat( minVal, maxVal) == IntegerT{0}); + + assert(std::div_sat( maxVal, IntegerT{1}) == maxVal); + assert(std::div_sat( maxVal, maxVal) == IntegerT{1}); + + // No saturation (no limit values) + + assert(std::div_sat(IntegerT{27}, IntegerT{28}) == IntegerT{0}); + assert(std::div_sat(IntegerT{28}, IntegerT{27}) == IntegerT{1}); + { + constexpr IntegerT lesserVal = maxVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT biggerVal = maxVal / IntegerT{2} + IntegerT{28}; + assert(std::div_sat(lesserVal, biggerVal) == IntegerT{0}); + assert(std::div_sat(biggerVal, lesserVal) == IntegerT{1}); + } + + // Unsigned integer division never overflows + + // clang-format on + + return true; +} + +constexpr bool test() { + // Signed + test_signed(); + test_signed(); + test_signed(); + test_signed(); + test_signed(); +#ifndef TEST_HAS_NO_INT128 + test_signed<__int128_t>(); +#endif + // Unsigned + test_unsigned(); + test_unsigned(); + test_unsigned(); + test_unsigned(); + test_unsigned(); +#ifndef TEST_HAS_NO_INT128 + test_unsigned<__uint128_t>(); +#endif + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.compile.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.compile.pass.cpp new file mode 100644 index 0000000000000..f1762d2591199 --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.compile.pass.cpp @@ -0,0 +1,77 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// template +// constexpr T mul_sat(T x, T y) noexcept; // freestanding + +#include +#include + +#include "test_macros.h" + +template +concept CanDo = requires(T x, U y) { + { std::mul_sat(x, y) } -> std::same_as; +}; + +template +constexpr void test_constraint_success() { + static_assert(CanDo); + static_assert(!CanDo); + static_assert(!CanDo); +} + +template +constexpr void test_constraint_fail() { + using I = int; + static_assert(!CanDo); + static_assert(!CanDo); + static_assert(!CanDo); +} + +constexpr void test() { + // Contraint success - Signed + using SI = long long int; + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_success<__int128_t, SI>(); +#endif + // Contraint success - Unsigned + using UI = unsigned long long int; + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_success<__uint128_t, UI>(); +#endif + + // Contraint failure + test_constraint_fail(); + test_constraint_fail(); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test_constraint_fail(); +#endif + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); +} diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp new file mode 100644 index 0000000000000..2c8eec57e1204 --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/mul_sat.pass.cpp @@ -0,0 +1,180 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// The test uses "Placeholder variables with no name" + +// + +// template +// constexpr T mul_sat(T x, T y) noexcept; // freestanding + +#include +#include +#include +#include + +#include "test_macros.h" + +template +constexpr bool test_signed() { + constexpr auto minVal = std::numeric_limits::min(); + constexpr auto maxVal = std::numeric_limits::max(); + + // TODO(LLVM-20) remove [[maybe_unused]] since all supported compilers support "Placeholder variables with no name" + [[maybe_unused]] std::same_as decltype(auto) _ = std::mul_sat(minVal, maxVal); + + static_assert(noexcept(std::mul_sat(minVal, maxVal))); + + // clang-format off + + // Limit values (-1, 0, 1, min, max) + + assert(std::mul_sat(IntegerT{-1}, IntegerT{-1}) == IntegerT{ 1}); + assert(std::mul_sat(IntegerT{-1}, IntegerT{ 0}) == IntegerT{ 0}); + assert(std::mul_sat(IntegerT{-1}, IntegerT{ 1}) == IntegerT{-1}); + assert(std::mul_sat(IntegerT{-1}, minVal) == maxVal); // saturated + assert(std::mul_sat(IntegerT{-1}, maxVal) == -maxVal); + + assert(std::mul_sat(IntegerT{ 0}, IntegerT{-1}) == IntegerT{ 0}); + assert(std::mul_sat(IntegerT{ 0}, IntegerT{ 0}) == IntegerT{ 0}); + assert(std::mul_sat(IntegerT{ 0}, IntegerT{ 1}) == IntegerT{ 0}); + assert(std::mul_sat(IntegerT{ 0}, minVal) == IntegerT{ 0}); + assert(std::mul_sat(IntegerT{ 0}, maxVal) == IntegerT{ 0}); + + assert(std::mul_sat(IntegerT{ 1}, IntegerT{-1}) == IntegerT{-1}); + assert(std::mul_sat(IntegerT{ 1}, IntegerT{ 0}) == IntegerT{ 0}); + assert(std::mul_sat(IntegerT{ 1}, IntegerT{ 1}) == IntegerT{ 1}); + assert(std::mul_sat(IntegerT{ 1}, minVal) == minVal); + assert(std::mul_sat(IntegerT{ 1}, maxVal) == maxVal); + + assert(std::mul_sat( minVal, IntegerT{-1}) == maxVal); // saturated + assert(std::mul_sat( minVal, IntegerT{ 0}) == IntegerT{ 0}); + assert(std::mul_sat( minVal, IntegerT{ 1}) == minVal); + assert(std::mul_sat( minVal, minVal) == maxVal); // saturated + assert(std::mul_sat( minVal, maxVal) == minVal); // saturated + + assert(std::mul_sat( maxVal, IntegerT{-1}) == -maxVal); + assert(std::mul_sat( maxVal, IntegerT{ 0}) == IntegerT{ 0}); + assert(std::mul_sat( maxVal, IntegerT{ 1}) == maxVal); // saturated + assert(std::mul_sat( maxVal, minVal) == minVal); // saturated + assert(std::mul_sat( maxVal, maxVal) == maxVal); // saturated + + // No saturation (no limit values) + + assert(std::mul_sat(IntegerT{27}, IntegerT{ 2}) == IntegerT{54}); + assert(std::mul_sat(IntegerT{ 2}, IntegerT{28}) == IntegerT{56}); + + // Saturation (no limit values) + + { + constexpr IntegerT x = minVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT y = minVal / IntegerT{2} + IntegerT{28}; + assert(std::mul_sat(x, y) == maxVal); // saturated + } + { + constexpr IntegerT x = minVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT y = maxVal / IntegerT{2} + IntegerT{28}; + assert(std::mul_sat(x, y) == minVal); // saturated + } + { + constexpr IntegerT x = maxVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT y = minVal / IntegerT{2} + IntegerT{28}; + assert(std::mul_sat(x, y) == minVal); // saturated + } + { + constexpr IntegerT x = maxVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT y = maxVal / IntegerT{2} + IntegerT{28}; + assert(std::mul_sat(x, y) == maxVal); // saturated + } + + // clang-format on + + return true; +} + +template +constexpr bool test_unsigned() { 
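  // (Illustrative aside, not part of the patch: an unsigned product cannot go
  //  below zero, so the only reachable saturation value for std::mul_sat in
  //  this function is numeric_limits<IntegerT>::max(); the signed cases above
  //  can saturate to min() as well.)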
+ constexpr auto minVal = std::numeric_limits::min(); + constexpr auto maxVal = std::numeric_limits::max(); + + // TODO(LLVM-20) remove [[maybe_unused]] since all supported compilers support "Placeholder variables with no name" + [[maybe_unused]] std::same_as decltype(auto) _ = std::mul_sat(minVal, maxVal); + + static_assert(noexcept(std::mul_sat(minVal, maxVal))); + + // clang-format off + + // No saturation (0, 1) + + assert(std::mul_sat(IntegerT{0}, IntegerT{0}) == IntegerT{0}); + assert(std::mul_sat(IntegerT{0}, IntegerT{1}) == IntegerT{0}); + assert(std::mul_sat(IntegerT{0}, minVal) == IntegerT{0}); + assert(std::mul_sat(IntegerT{0}, maxVal) == IntegerT{0}); + + assert(std::mul_sat(IntegerT{1}, IntegerT{0}) == IntegerT{0}); + assert(std::mul_sat(IntegerT{1}, IntegerT{1}) == IntegerT{1}); + assert(std::mul_sat(IntegerT{1}, minVal) == minVal); + assert(std::mul_sat(IntegerT{1}, maxVal) == maxVal); + + assert(std::mul_sat( minVal, IntegerT{0}) == IntegerT{0}); + assert(std::mul_sat( minVal, IntegerT{1}) == minVal); + assert(std::mul_sat( minVal, maxVal) == minVal); + assert(std::mul_sat( minVal, maxVal) == minVal); + + assert(std::mul_sat( maxVal, IntegerT{0}) == IntegerT{0}); + assert(std::mul_sat( maxVal, IntegerT{1}) == maxVal); + assert(std::mul_sat( maxVal, minVal) == IntegerT{0}); + assert(std::mul_sat( maxVal, maxVal) == maxVal); // saturated + + // No saturation (no limit values) + + assert(std::mul_sat(IntegerT{28}, IntegerT{2}) == IntegerT{56}); + + // Saturation (no limit values + + { + constexpr IntegerT x = maxVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT y = maxVal / IntegerT{2} + IntegerT{28}; + assert(std::mul_sat(x, y) == maxVal); // saturated + } + + // clang-format on + + return true; +} + +constexpr bool test() { + // Signed + test_signed(); + test_signed(); + test_signed(); + test_signed(); + test_signed(); +#ifndef TEST_HAS_NO_INT128 + test_signed<__int128_t>(); +#endif + // Unsigned + test_unsigned(); + test_unsigned(); + test_unsigned(); + test_unsigned(); + test_unsigned(); +#ifndef TEST_HAS_NO_INT128 + test_unsigned<__uint128_t>(); +#endif + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.compile.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.compile.pass.cpp new file mode 100644 index 0000000000000..9b035c1e3c4e5 --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.compile.pass.cpp @@ -0,0 +1,79 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// template +// constexpr R saturate_cast(T x) noexcept; // freestanding + +#include +#include + +#include "test_macros.h" + +template +concept CanDo = requires(T x) { + { std::saturate_cast(x) } -> std::same_as; +}; + +template +constexpr void test_constraint_success() { + static_assert(CanDo); + static_assert(CanDo); + static_assert(CanDo); +} + +template +constexpr void test_constraint_fail() { + using I = int; + using R = T; + static_assert(!CanDo); + static_assert(!CanDo); + static_assert(!CanDo); + static_assert(!CanDo); +} + +constexpr void test() { + // Contraint success - Signed + using SI = long long int; + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_success<__int128_t, SI>(); +#endif + // Contraint success - Unsigned + using UI = unsigned long long int; + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_success(); +#endif + + // Contraint failure + test_constraint_fail(); + test_constraint_fail(); +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test_constraint_fail(); +#endif + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); +} diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp new file mode 100644 index 0000000000000..c06a9ed2d5cb4 --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/saturate_cast.pass.cpp @@ -0,0 +1,397 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// template +// constexpr R saturate_cast(T x) noexcept; // freestanding + +#include +#include +#include +#include +#include + +#include "test_macros.h" +#include + +// Smaller to larger +static_assert(noexcept(std::saturate_cast(std::numeric_limits::max()))); +static_assert(noexcept(std::saturate_cast(std::numeric_limits::max()))); + +static_assert(noexcept(std::saturate_cast(std::numeric_limits::max()))); +static_assert(noexcept(std::saturate_cast(std::numeric_limits::max()))); + +// Same type +static_assert(noexcept(std::saturate_cast(std::numeric_limits::max()))); +static_assert(noexcept(std::saturate_cast(std::numeric_limits::max()))); + +// Larger to smaller +static_assert(noexcept(std::saturate_cast(std::numeric_limits::max()))); +static_assert(noexcept(std::saturate_cast(std::numeric_limits::max()))); + +static_assert(noexcept(std::saturate_cast(std::numeric_limits::max()))); +static_assert(noexcept(std::saturate_cast(std::numeric_limits::max()))); + +// Tests + +constexpr bool test() { + // clang-format off + +#ifndef TEST_HAS_NO_INT128 + using SIntT = __int128_t; + using UIntT = __uint128_t; +#else + using SIntT = long long int; + using UIntT = unsigned long long int; +#endif + + // Constants the values of which depend on the context (platform) + + constexpr auto sBigMin = std::numeric_limits::min(); + constexpr auto sZero = SIntT{0}; + constexpr auto sBigMax = std::numeric_limits::max(); + + constexpr auto uZero = UIntT{0}; + constexpr auto uBigMax = std::numeric_limits::max(); + + // Constants to avoid casting in place + + constexpr auto O_C = static_cast(0); + constexpr auto O_UC = static_cast(0); + + constexpr auto O_S = static_cast(0); + constexpr auto O_US = static_cast(0); + + // signed char + + // TODO(LLVM-20) remove [[maybe_unused]] and `{}` scope since all supported compilers support "Placeholder variables with no name", + // here and below... 
+ { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SCHAR_MAX); } + assert(std::saturate_cast(SCHAR_MIN) == SCHAR_MIN); + assert(std::saturate_cast( O_C) == O_C); + assert(std::saturate_cast(SCHAR_MAX) == SCHAR_MAX); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UCHAR_MAX); } + assert(std::saturate_cast( O_UC) == O_C); + assert(std::saturate_cast(UCHAR_MAX) == SCHAR_MAX); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(sBigMax); } + assert(std::saturate_cast(sBigMin) == SCHAR_MIN); // saturated + assert(std::saturate_cast( sZero) == O_C); + assert(std::saturate_cast(sBigMax) == SCHAR_MAX); // saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(uBigMax); } + assert(std::saturate_cast( uZero) == O_C); + assert(std::saturate_cast(uBigMax) == SCHAR_MAX); // saturated + + // short + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SCHAR_MAX); } + assert(std::saturate_cast(SCHAR_MIN) == static_cast(SCHAR_MIN)); + assert(std::saturate_cast( O_C) == O_S); + assert(std::saturate_cast(SCHAR_MAX) == static_cast(SCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UCHAR_MAX); } + assert(std::saturate_cast( O_UC) == O_S); + assert(std::saturate_cast(UCHAR_MAX) == static_cast(UCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SHRT_MAX); } + assert(std::saturate_cast( SHRT_MIN) == SHRT_MIN); + assert(std::saturate_cast( O_S) == O_S); + assert(std::saturate_cast( SHRT_MAX) == SHRT_MAX); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(USHRT_MAX); } + assert(std::saturate_cast( O_US) == O_S); + assert(std::saturate_cast(USHRT_MAX) == SHRT_MAX); // saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(sBigMax); } + assert(std::saturate_cast( sBigMin) == SHRT_MIN); // saturated + assert(std::saturate_cast( sZero) == O_S); + assert(std::saturate_cast( sBigMax) == SHRT_MAX); // saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(uBigMax); } + assert(std::saturate_cast( uZero) == O_S); + assert(std::saturate_cast( uBigMax) == SHRT_MAX); // saturated + + // int + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SCHAR_MAX); } + assert(std::saturate_cast(SCHAR_MIN) == static_cast(SCHAR_MIN)); + assert(std::saturate_cast( O_C) == 0); + assert(std::saturate_cast(SCHAR_MAX) == static_cast(SCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UCHAR_MAX); } + assert(std::saturate_cast( O_UC) == 0); + assert(std::saturate_cast(UCHAR_MAX) == static_cast(UCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(INT_MAX); } + assert(std::saturate_cast( INT_MIN) == INT_MIN); + assert(std::saturate_cast( 0) == 0); + assert(std::saturate_cast( INT_MAX) == INT_MAX); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UINT_MAX); } + assert(std::saturate_cast( 0) == 0); + assert(std::saturate_cast(UINT_MAX) == INT_MAX); // saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(sBigMax); } + assert(std::saturate_cast( sBigMin) == INT_MIN); // saturated + assert(std::saturate_cast( sZero) == 0); + assert(std::saturate_cast( sBigMax) == INT_MAX); // saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(uBigMax); } + assert(std::saturate_cast( uZero) == 0); + assert(std::saturate_cast( 
uBigMax) == INT_MAX); // saturated + + // long + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SCHAR_MAX); } + assert(std::saturate_cast(SCHAR_MIN) == static_cast(SCHAR_MIN)); + assert(std::saturate_cast( O_C) == 0L); + assert(std::saturate_cast(SCHAR_MAX) == static_cast(SCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UCHAR_MAX); } + assert(std::saturate_cast( O_UC) == 0L); + assert(std::saturate_cast(UCHAR_MAX) == static_cast(UCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(LONG_MAX); } + assert(std::saturate_cast( LONG_MIN) == LONG_MIN); + assert(std::saturate_cast( 0L) == 0L); + assert(std::saturate_cast( LONG_MAX) == LONG_MAX); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(ULONG_MAX); } + assert(std::saturate_cast( 0UL) == 0L); + assert(std::saturate_cast(ULONG_MAX) == LONG_MAX); // saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(sBigMax); } + assert(std::saturate_cast( sBigMin) == LONG_MIN); // saturated + assert(std::saturate_cast( sZero) == 0L); + assert(std::saturate_cast( sBigMax) == LONG_MAX); // saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(uBigMax); } + assert(std::saturate_cast( uZero) == 0L); + assert(std::saturate_cast( uBigMax) == LONG_MAX); // saturated + + // long long + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SCHAR_MAX); } + assert(std::saturate_cast(SCHAR_MIN) == static_cast(SCHAR_MIN)); + assert(std::saturate_cast( 0LL) == 0LL); + assert(std::saturate_cast(SCHAR_MAX) == static_cast(SCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UCHAR_MAX); } + assert(std::saturate_cast( O_UC) == 0LL); + assert(std::saturate_cast(UCHAR_MAX) == static_cast(UCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(LLONG_MIN); } + assert(std::saturate_cast(LLONG_MIN) == LLONG_MIN); + assert(std::saturate_cast( 0LL) == 0LL); + assert(std::saturate_cast(LLONG_MAX) == LLONG_MAX); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(ULLONG_MAX); } + assert(std::saturate_cast( 0ULL) == 0LL); + assert(std::saturate_cast(ULLONG_MAX) == LLONG_MAX); // saturated + +#ifndef TEST_HAS_NO_INT128 + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(sBigMax); } + assert(std::saturate_cast( sBigMin) == LLONG_MIN); // (128-bit) saturated + assert(std::saturate_cast( sZero) == 0LL); + assert(std::saturate_cast( sBigMax) == LLONG_MAX); // (128-bit) saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(uBigMax); } + assert(std::saturate_cast( uZero) == 0LL); + assert(std::saturate_cast( uBigMax) == LLONG_MAX); // (128-bit) saturated + + { [[maybe_unused]] std::same_as<__int128_t> decltype(auto) _ = std::saturate_cast<__int128_t>(SCHAR_MAX); } + assert(std::saturate_cast<__int128_t>(SCHAR_MIN) == static_cast<__int128_t>(SCHAR_MIN)); + assert(std::saturate_cast<__int128_t>( O_C) == sZero); + assert(std::saturate_cast<__int128_t>(SCHAR_MAX) == static_cast<__int128_t>(SCHAR_MAX)); + + { [[maybe_unused]] std::same_as<__int128_t> decltype(auto) _ = std::saturate_cast<__int128_t>(UCHAR_MAX); } + assert(std::saturate_cast<__int128_t>( O_UC) == sZero); + assert(std::saturate_cast<__int128_t>(UCHAR_MAX) == static_cast<__int128_t>(UCHAR_MAX)); + + { [[maybe_unused]] std::same_as<__int128_t> decltype(auto) _ = std::saturate_cast<__int128_t>(sBigMax); } 
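  // (Illustrative aside, not part of the patch: the `std::same_as<R>
  //  decltype(auto)` bindings used throughout pin down the exact return type
  //  of saturate_cast, while the surrounding asserts check the value;
  //  [[maybe_unused]] placates compilers that do not yet implement
  //  "placeholder variables with no name".)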
+ assert(std::saturate_cast<__int128_t>( sBigMin) == sBigMin); + assert(std::saturate_cast<__int128_t>( sZero) == sZero); + assert(std::saturate_cast<__int128_t>( sBigMax) == sBigMax); + + { [[maybe_unused]] std::same_as<__int128_t> decltype(auto) _ = std::saturate_cast<__int128_t>(uBigMax); } + assert(std::saturate_cast<__int128_t>( uZero) == sZero); + assert(std::saturate_cast<__int128_t>( uBigMax) == sBigMax); // saturated +#endif + + // unsigned char + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SCHAR_MAX); } + assert(std::saturate_cast(SCHAR_MIN) == O_UC); + assert(std::saturate_cast( O_C) == O_UC); + assert(std::saturate_cast(SCHAR_MAX) == static_cast(SCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UCHAR_MAX); } + assert(std::saturate_cast( O_UC) == O_UC); + assert(std::saturate_cast(UCHAR_MAX) == UCHAR_MAX); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(sBigMax); } + assert(std::saturate_cast( sBigMin) == O_UC); // saturated + assert(std::saturate_cast( sZero) == O_UC); + assert(std::saturate_cast( sBigMax) == UCHAR_MAX); // saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(uBigMax); } + assert(std::saturate_cast( uZero) == O_UC); + assert(std::saturate_cast( uBigMax) == UCHAR_MAX); // saturated + + // unsigned short + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SCHAR_MAX); } + assert(std::saturate_cast(SCHAR_MIN) == O_US); + assert(std::saturate_cast( O_C) == O_US); + assert(std::saturate_cast(SCHAR_MAX) == static_cast(SCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UCHAR_MAX); } + assert(std::saturate_cast( O_UC) == O_US); + assert(std::saturate_cast(UCHAR_MAX) == static_cast(UCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SCHAR_MIN); } + assert(std::saturate_cast( SHRT_MIN) == O_US); + assert(std::saturate_cast( O_S) == O_US); + assert(std::saturate_cast( SHRT_MAX) == static_cast(SHRT_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UCHAR_MAX); } + assert(std::saturate_cast( O_US) == O_US); + assert(std::saturate_cast(USHRT_MAX) == USHRT_MAX); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(sBigMax); } + assert(std::saturate_cast( sBigMin) == O_US); // saturated + assert(std::saturate_cast( sZero) == O_US); + assert(std::saturate_cast( sBigMax) == USHRT_MAX); // saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(uBigMax); } + assert(std::saturate_cast( uZero) == O_US); + assert(std::saturate_cast( uBigMax) == USHRT_MAX); // saturated + + // unsigned int + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SCHAR_MAX); } + assert(std::saturate_cast(SCHAR_MIN) == O_US); + assert(std::saturate_cast( O_UC) == 0U); + assert(std::saturate_cast(SCHAR_MAX) == static_cast(SCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UCHAR_MAX); } + assert(std::saturate_cast( O_UC) == 0U); + assert(std::saturate_cast(UCHAR_MAX) == static_cast(UCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(INT_MAX); } + assert(std::saturate_cast( INT_MIN) == 0U); + assert(std::saturate_cast( 0) == 0U); + assert(std::saturate_cast( INT_MAX) == static_cast(INT_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UINT_MAX); } + assert(std::saturate_cast( 0U) == 0U); + 
assert(std::saturate_cast( UINT_MAX) == UINT_MAX); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(sBigMax); } + assert(std::saturate_cast( sBigMin) == 0U); // saturated + assert(std::saturate_cast( sZero) == 0U); + assert(std::saturate_cast( sBigMax) == UINT_MAX); // saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(uBigMax); } + assert(std::saturate_cast( uZero) == 0U); + assert(std::saturate_cast( uBigMax) == UINT_MAX); // saturated + + // unsigned long + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SCHAR_MAX); } + assert(std::saturate_cast(SCHAR_MIN) == 0UL); + assert(std::saturate_cast( O_C) == 0UL); + assert(std::saturate_cast(SCHAR_MAX) == static_cast(SCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UCHAR_MAX); } + assert(std::saturate_cast( O_UC) == 0UL); + assert(std::saturate_cast(UCHAR_MAX) == static_cast(UCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(LONG_MAX); } + assert(std::saturate_cast( LONG_MIN) == 0UL); + assert(std::saturate_cast( 0L) == 0UL); + assert(std::saturate_cast( LONG_MAX) == static_cast(LONG_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(ULONG_MAX); } + assert(std::saturate_cast( 0UL) == 0UL); + assert(std::saturate_cast(ULONG_MAX) == ULONG_MAX); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(sBigMax); } + assert(std::saturate_cast( sBigMin) == 0UL); // saturated + assert(std::saturate_cast( sZero) == 0UL); + assert(std::saturate_cast( sBigMax) == ULONG_MAX); // saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(uBigMax); } + assert(std::saturate_cast( uZero) == 0UL); + assert(std::saturate_cast( uBigMax) == ULONG_MAX); // saturated + + // unsigned long long + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(SCHAR_MAX); } + assert(std::saturate_cast( SCHAR_MIN) == 0ULL); + assert(std::saturate_cast( O_C) == 0ULL); + assert(std::saturate_cast( SCHAR_MAX) == static_cast(SCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(UCHAR_MAX); } + assert(std::saturate_cast( O_UC) == 0ULL); + assert(std::saturate_cast( UCHAR_MAX) == static_cast(UCHAR_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(LLONG_MAX); } + assert(std::saturate_cast( LLONG_MIN) == 0ULL); + assert(std::saturate_cast( 0LL) == 0ULL); + assert(std::saturate_cast( LLONG_MAX) == static_cast(LLONG_MAX)); + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(ULLONG_MAX); } + assert(std::saturate_cast( 0ULL) == 0ULL); + assert(std::saturate_cast(ULLONG_MAX) == ULLONG_MAX); + +#ifndef TEST_HAS_NO_INT128 + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(sBigMax); } + assert(std::saturate_cast( sBigMin) == 0ULL); // (128-bit) saturated + assert(std::saturate_cast( sZero) == 0ULL); + assert(std::saturate_cast( sBigMax) == ULLONG_MAX); // (128-bit) saturated + + { [[maybe_unused]] std::same_as decltype(auto) _ = std::saturate_cast(uBigMax); } + assert(std::saturate_cast( uZero) == 0ULL); + assert(std::saturate_cast( uBigMax) == ULLONG_MAX); // (128-bit) saturated + + { [[maybe_unused]] std::same_as<__uint128_t> decltype(auto) _ = std::saturate_cast<__uint128_t>(SCHAR_MIN); } + assert(std::saturate_cast<__uint128_t>(SCHAR_MIN) == uZero); + assert(std::saturate_cast<__uint128_t>( O_C) == uZero); + 
assert(std::saturate_cast<__uint128_t>(SCHAR_MAX) == static_cast<__uint128_t>(SCHAR_MAX)); + + { [[maybe_unused]] std::same_as<__uint128_t> decltype(auto) _ = std::saturate_cast<__uint128_t>(UCHAR_MAX); } + assert(std::saturate_cast<__uint128_t>( O_UC) == uZero); + assert(std::saturate_cast<__uint128_t>(UCHAR_MAX) == static_cast<__uint128_t>(UCHAR_MAX)); + + { [[maybe_unused]] std::same_as<__uint128_t> decltype(auto) _ = std::saturate_cast<__uint128_t>(sBigMax); } + assert(std::saturate_cast<__uint128_t>( sBigMin) == uZero); // saturated + assert(std::saturate_cast<__uint128_t>( sZero) == uZero); + assert(std::saturate_cast<__uint128_t>( sBigMax) == static_cast<__uint128_t>(sBigMax)); + + { [[maybe_unused]] std::same_as<__uint128_t> decltype(auto) _ = std::saturate_cast<__uint128_t>(uBigMax); } + assert(std::saturate_cast<__uint128_t>( uZero) == uZero); + assert(std::saturate_cast<__uint128_t>( uBigMax) == uBigMax); +#endif + + // clang-format on + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.compile.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.compile.pass.cpp new file mode 100644 index 0000000000000..9234819d9ec24 --- /dev/null +++ b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.compile.pass.cpp @@ -0,0 +1,77 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 + +// + +// template +// constexpr T sub_sat(T x, T y) noexcept; // freestanding + +#include +#include + +#include "test_macros.h" + +template +concept CanDo = requires(T x, U y) { + { std::sub_sat(x, y) } -> std::same_as; +}; + +template +constexpr void test_constraint_success() { + static_assert(CanDo); + static_assert(!CanDo); + static_assert(!CanDo); +} + +template +constexpr void test_constraint_fail() { + using I = int; + static_assert(!CanDo); + static_assert(!CanDo); + static_assert(!CanDo); +} + +constexpr void test() { + // Contraint success - Signed + using SI = long long int; + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_success<__int128_t, SI>(); +#endif + // Contraint success - Unsigned + using UI = unsigned long long int; + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); + test_constraint_success(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_success<__uint128_t, UI>(); +#endif + + // Contraint failure + test_constraint_fail(); + test_constraint_fail(); +#ifndef TEST_HAS_NO_INT128 + test_constraint_fail(); +#endif + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); + test_constraint_fail(); +} diff --git a/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp new file mode 100644 index 0000000000000..d7bdf2c0271ac --- /dev/null +++ 
b/libcxx/test/std/numerics/numeric.ops/numeric.ops.sat/sub_sat.pass.cpp @@ -0,0 +1,162 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 +// The test uses "Placeholder variables with no name" + +// + +// template +// constexpr T sub_sat(T x, T y) noexcept; // freestanding + +#include +#include +#include +#include + +#include "test_macros.h" + +template +constexpr bool test_signed() { + constexpr auto minVal = std::numeric_limits::min(); + constexpr auto maxVal = std::numeric_limits::max(); + + // TODO(LLVM-20) remove [[maybe_unused]] since all supported compilers support "Placeholder variables with no name" + [[maybe_unused]] std::same_as decltype(auto) _ = std::sub_sat(minVal, maxVal); + + static_assert(noexcept(std::sub_sat(minVal, maxVal))); + + // clang-format off + + // Limit values (-1, 0, 1, min, max) + + assert(std::sub_sat(IntegerT{-1}, IntegerT{-1}) == IntegerT{ 0}); + assert(std::sub_sat(IntegerT{-1}, IntegerT{ 0}) == IntegerT{-1}); + assert(std::sub_sat(IntegerT{-1}, IntegerT{ 1}) == IntegerT{-2}); + assert(std::sub_sat(IntegerT{-1}, minVal) == IntegerT{-1} - minVal); + assert(std::sub_sat(IntegerT{-1}, maxVal) == IntegerT{-1} - maxVal); + + assert(std::sub_sat(IntegerT{ 0}, IntegerT{-1}) == IntegerT{ 1}); + assert(std::sub_sat(IntegerT{ 0}, IntegerT{ 0}) == IntegerT{ 0}); + assert(std::sub_sat(IntegerT{ 0}, IntegerT{ 1}) == IntegerT{-1}); + assert(std::sub_sat(IntegerT{ 0}, minVal) == maxVal); // saturated + assert(std::sub_sat(IntegerT{ 0}, maxVal) == -maxVal); + + assert(std::sub_sat( minVal, IntegerT{-1}) == minVal - IntegerT{-1}); + assert(std::sub_sat( minVal, IntegerT{ 0}) == minVal); + assert(std::sub_sat( minVal, IntegerT{ 1}) == minVal); // saturated + assert(std::sub_sat( minVal, minVal) == IntegerT{0}); + assert(std::sub_sat( minVal, maxVal) == minVal); // saturated + + assert(std::sub_sat( maxVal, IntegerT{-1}) == maxVal); // saturated + assert(std::sub_sat( maxVal, IntegerT{ 0}) == maxVal); + assert(std::sub_sat( maxVal, IntegerT{ 1}) == maxVal - IntegerT{ 1}); + assert(std::sub_sat( maxVal, minVal) == maxVal); // saturated + assert(std::sub_sat( maxVal, maxVal) == IntegerT{0}); + + // No saturation (no limit values) + + assert(std::sub_sat(IntegerT{ 27}, IntegerT{-28}) == 55); + assert(std::sub_sat(IntegerT{ 27}, IntegerT{ 28}) == -1); + assert(std::sub_sat(IntegerT{-27}, IntegerT{ 28}) == -55); + assert(std::sub_sat(IntegerT{-27}, IntegerT{-28}) == 1); + + // Saturation (no limit values) + + { + constexpr IntegerT lesserVal = minVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT biggerVal = maxVal / IntegerT{2} + IntegerT{28}; + assert(std::sub_sat(lesserVal, biggerVal) == minVal); // saturated + } + { + constexpr IntegerT biggerVal = maxVal / IntegerT{2} + IntegerT{28}; + constexpr IntegerT lesserVal = minVal / IntegerT{2} + IntegerT{27}; + assert(std::sub_sat(biggerVal, lesserVal) == maxVal); // saturated + } + + // clang-format on + + return true; +} + +template +constexpr bool test_unsigned() { + constexpr auto minVal = std::numeric_limits::min(); + constexpr auto maxVal = std::numeric_limits::max(); + + // TODO(LLVM-20) remove [[maybe_unused]] since all supported 
compilers support "Placeholder variables with no name" + [[maybe_unused]] std::same_as decltype(auto) _ = std::sub_sat(minVal, maxVal); + + static_assert(noexcept(std::sub_sat(minVal, maxVal))); + + // clang-format off + + // Limit values (0, 1, min, max) + + assert(std::sub_sat(IntegerT{0}, IntegerT{0}) == IntegerT{0}); + assert(std::sub_sat(IntegerT{0}, IntegerT{1}) == minVal); // saturated + assert(std::sub_sat(IntegerT{0}, minVal) == minVal); + assert(std::sub_sat(IntegerT{0}, maxVal) == minVal); // saturated + + assert(std::sub_sat(IntegerT{1}, IntegerT{0}) == IntegerT{1}); + assert(std::sub_sat(IntegerT{1}, IntegerT{1}) == IntegerT{0}); + assert(std::sub_sat(IntegerT{1}, minVal) == IntegerT{1}); + assert(std::sub_sat(IntegerT{1}, maxVal) == minVal); // saturated + + assert(std::sub_sat( minVal, IntegerT{0}) == IntegerT{0}); + assert(std::sub_sat( minVal, IntegerT{1}) == minVal); + assert(std::sub_sat( minVal, maxVal) == minVal); + assert(std::sub_sat( minVal, maxVal) == minVal); + + assert(std::sub_sat( maxVal, IntegerT{0}) == maxVal); + assert(std::sub_sat( maxVal, IntegerT{1}) == maxVal - IntegerT{1}); + assert(std::sub_sat( maxVal, minVal) == maxVal); + assert(std::sub_sat( maxVal, maxVal) == IntegerT{0}); + + // Saturation (no limit values) + + { + constexpr IntegerT lesserVal = minVal / IntegerT{2} + IntegerT{27}; + constexpr IntegerT biggerVal = maxVal / IntegerT{2} + IntegerT{28}; + assert(std::sub_sat(lesserVal, biggerVal) == minVal); // saturated + } + + // clang-format on + + return true; +} + +constexpr bool test() { + // Signed + test_signed(); + test_signed(); + test_signed(); + test_signed(); + test_signed(); +#ifndef TEST_HAS_NO_INT128 + test_signed<__int128_t>(); +#endif + // Unsigned + test_unsigned(); + test_unsigned(); + test_unsigned(); + test_unsigned(); + test_unsigned(); +#ifndef TEST_HAS_NO_INT128 + test_unsigned<__uint128_t>(); +#endif + + return true; +} + +int main(int, char**) { + test(); + static_assert(test()); + + return 0; +} diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 667dd75278c8b..065b70620cd17 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -1011,10 +1011,7 @@ def add_version_header(tc): { "name": "__cpp_lib_saturation_arithmetic", "values": {"c++26": 202311}, # P0543R3 Saturation arithmetic - "headers": [ - "numeric" # TODO verify this entry since the paper was underspecified. - ], - "unimplemented": True, + "headers": ["numeric"], }, { "name": "__cpp_lib_scoped_lock", From 5ffe777c4acd1051c4cebc8464c7e1ae5ca2f689 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 21 Jan 2024 20:32:38 -0800 Subject: [PATCH 325/843] [RISCV] Add Zvkb test to riscv-target-features.c. 
NFC --- clang/test/Preprocessor/riscv-target-features.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 4fb3f0bab36b0..03d482d65b7e7 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -112,6 +112,7 @@ // CHECK-NOT: __riscv_zve64f {{.*$}} // CHECK-NOT: __riscv_zve64x {{.*$}} // CHECK-NOT: __riscv_zvfh {{.*$}} +// CHECK-NOT: __riscv_zvkb {{.*$}} // CHECK-NOT: __riscv_zvkg {{.*$}} // CHECK-NOT: __riscv_zvkn {{.*$}} // CHECK-NOT: __riscv_zvknc {{.*$}} @@ -1141,6 +1142,14 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVBC-EXT %s // CHECK-ZVBC-EXT: __riscv_zvbc 1000000{{$}} +// RUN: %clang --target=riscv32 \ +// RUN: -march=rv32i_zve64x_zvkb1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZVKB-EXT %s +// RUN: %clang --target=riscv64 \ +// RUN: -march=rv64i_zve64x_zvkb1p0 -x c -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-ZVKB-EXT %s +// CHECK-ZVKB-EXT: __riscv_zvkb 1000000{{$}} + // RUN: %clang --target=riscv32 \ // RUN: -march=rv32i_zve32x_zvkg1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-ZVKG-EXT %s From 25063bedb596943e546994a45710c79fdd6539e8 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Sun, 21 Jan 2024 20:36:24 -0800 Subject: [PATCH 326/843] [RISCV] Replace Zvbb with Zvkb in the Zvk* combine tests in riscv-target-features.c. NFC The tests are testing that specifying individual Zvk* extensions set the preprocessor directives for Zvk* shorthand extensions. None of the shorthands refer to Zvbb so we should use Zvkb(which is implied by Zvbb). --- .../test/Preprocessor/riscv-target-features.c | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 03d482d65b7e7..5fde5ccdbeacf 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -1167,10 +1167,10 @@ // CHECK-ZVKN-EXT: __riscv_zvkn 1000000{{$}} // RUN: %clang --target=riscv32 \ -// RUN: -march=rv32iv_zvbb1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv32iv_zvkb1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKN %s // RUN: %clang --target=riscv64 \ -// RUN: -march=rv64iv_zvbb1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv64iv_zvkb1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKN %s // CHECK-COMBINE-INTO-ZVKN: __riscv_zvkn 1000000{{$}} @@ -1183,10 +1183,10 @@ // CHECK-ZVKNC-EXT: __riscv_zvknc 1000000{{$}} // RUN: %clang --target=riscv32 \ -// RUN: -march=rv32iv_zvbb1p0_zvbc1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv32iv_zvkb1p0_zvbc1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKNC %s // RUN: %clang --target=riscv64 \ -// RUN: -march=rv64iv_zvbb1p0_zvbc1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv64iv_zvkb1p0_zvbc1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKNC %s // CHECK-COMBINE-INTO-ZVKNC: __riscv_zvkn 1000000{{$}} // CHECK-COMBINE-INTO-ZVKNC: __riscv_zvknc 1000000{{$}} @@ -1208,10 +1208,10 @@ // CHECK-ZVKNG-EXT: __riscv_zvkng 1000000{{$}} // RUN: %clang --target=riscv32 \ -// 
RUN: -march=rv32iv_zvbb1p0_zvkg1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv32iv_zvkb1p0_zvkg1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKNG %s // RUN: %clang --target=riscv64 \ -// RUN: -march=rv64iv_zvbb1p0_zvkg1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv64iv_zvkb1p0_zvkg1p0_zvkned1p0_zvknhb1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKNG %s // CHECK-COMBINE-INTO-ZVKNG: __riscv_zvkn 1000000{{$}} // CHECK-COMBINE-INTO-ZVKNG: __riscv_zvkng 1000000{{$}} @@ -1241,10 +1241,10 @@ // CHECK-ZVKS-EXT: __riscv_zvks 1000000{{$}} // RUN: %clang --target=riscv32 \ -// RUN: -march=rv32iv_zvbb1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv32iv_zvkb1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKS %s // RUN: %clang --target=riscv64 \ -// RUN: -march=rv64iv_zvbb1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv64iv_zvkb1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKS %s // CHECK-COMBINE-INTO-ZVKS: __riscv_zvks 1000000{{$}} @@ -1257,10 +1257,10 @@ // CHECK-ZVKSC-EXT: __riscv_zvksc 1000000{{$}} // RUN: %clang --target=riscv32 \ -// RUN: -march=rv32iv_zvbb1p0_zvbc1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv32iv_zvkb1p0_zvbc1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKSC %s // RUN: %clang --target=riscv64 \ -// RUN: -march=rv64iv_zvbb1p0_zvbc1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv64iv_zvkb1p0_zvbc1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKSC %s // CHECK-COMBINE-INTO-ZVKSC: __riscv_zvks 1000000{{$}} // CHECK-COMBINE-INTO-ZVKSC: __riscv_zvksc 1000000{{$}} @@ -1282,10 +1282,10 @@ // CHECK-ZVKSG-EXT: __riscv_zvksg 1000000{{$}} // RUN: %clang --target=riscv32 \ -// RUN: -march=rv32iv_zvbb1p0_zvkg1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv32iv_zvkb1p0_zvkg1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKSG %s // RUN: %clang --target=riscv64 \ -// RUN: -march=rv64iv_zvbb1p0_zvkg1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ +// RUN: -march=rv64iv_zvkb1p0_zvkg1p0_zvksed1p0_zvksh1p0_zvkt1p0 -x c -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-COMBINE-INTO-ZVKSG %s // CHECK-COMBINE-INTO-ZVKSG: __riscv_zvks 1000000{{$}} // CHECK-COMBINE-INTO-ZVKSG: __riscv_zvksg 1000000{{$}} From 8bef13ef4f59bae481583913a39e5369730effa7 Mon Sep 17 00:00:00 2001 From: Wu Yingcong Date: Sun, 21 Jan 2024 21:42:38 -0800 Subject: [PATCH 327/843] [hwasan] Fix a possible null dereference problem (#77737) This is clearly a copy-paste mistake, fix it with this patch. After checking the `local.function_name` is not null, it should check the len for `local.function_name`, not `local.name`. And this could lead to possible null dereference since the second `internal_strlen(local.name)` does not guarantee `local.name` is not null. 
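To make the hazard concrete, here is a self-contained sketch (hypothetical `Local` struct, not the hwasan code itself) of the invariant the fix restores: every `strlen`-style call must be guarded by a null check on the *same* pointer it measures.

```cpp
#include <cassert>
#include <cstring>

// Hypothetical stand-in for the report-side locals record.
struct Local {
  const char *name = nullptr;
  const char *function_name = nullptr;
};

// Returns true when at least one field is a non-null, non-empty string.
// Pairing each null check with a strlen on the same pointer is what keeps
// this safe; measuring `name` under the `function_name` check would
// dereference a null pointer for a Local like the one in main().
static bool hasUsableName(const Local &local) {
  return (local.name && std::strlen(local.name) != 0) ||
         (local.function_name && std::strlen(local.function_name) != 0);
}

int main() {
  Local l;
  l.function_name = "frame_fn";
  assert(hasUsableName(l)); // safe even though l.name is null
  return 0;
}
```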
--- compiler-rt/lib/hwasan/hwasan_report.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/hwasan/hwasan_report.cpp b/compiler-rt/lib/hwasan/hwasan_report.cpp index 784cfb904aa27..12a4fa47f2151 100644 --- a/compiler-rt/lib/hwasan/hwasan_report.cpp +++ b/compiler-rt/lib/hwasan/hwasan_report.cpp @@ -222,7 +222,7 @@ static void PrintStackAllocations(const StackAllocationsRingBuffer *sa, if (!local.has_frame_offset || !local.has_size || !local.has_tag_offset) continue; if (!(local.name && internal_strlen(local.name)) && - !(local.function_name && internal_strlen(local.name)) && + !(local.function_name && internal_strlen(local.function_name)) && !(local.decl_file && internal_strlen(local.decl_file))) continue; tag_t obj_tag = base_tag ^ local.tag_offset; From 745883bba69007f1d2c5135f3d5b0f1efcfc82cd Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Sun, 21 Jan 2024 22:03:39 -0800 Subject: [PATCH 328/843] [Clang][CMake] Support perf, LBR, and Instrument CLANG_BOLT options (#69133) Split up and refactor CLANG_BOLT_INSTRUMENT into support for BOLT instrumentation, perf no-LBR and perf with LBR profiling. Differential Revision: https://reviews.llvm.org/D143617 --- clang/CMakeLists.txt | 44 +++++++++----- clang/cmake/caches/BOLT.cmake | 2 +- clang/utils/perf-training/CMakeLists.txt | 29 +++++++++- clang/utils/perf-training/bolt.lit.cfg | 47 ++++++++++++--- .../utils/perf-training/bolt.lit.site.cfg.in | 2 + clang/utils/perf-training/perf-helper.py | 58 +++++++++++++++++++ 6 files changed, 156 insertions(+), 26 deletions(-) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 5f2b7f064da43..b9c193f360bc4 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -876,23 +876,38 @@ if (CLANG_ENABLE_BOOTSTRAP) endforeach() endif() -if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) +set(CLANG_BOLT "INSTRUMENT" CACHE STRING "Apply BOLT optimization to Clang. 
\ + May be specified as Instrument or Perf or LBR to use a particular profiling \ + mechanism.") +string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) + +if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) - set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst) + set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED}) set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata) - # Instrument clang with BOLT - add_custom_target(clang-instrumented - DEPENDS ${CLANG_INSTRUMENTED} - ) - add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} - DEPENDS clang llvm-bolt - COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} - -instrument --instrumentation-file-append-pid - --instrumentation-file=${BOLT_FDATA} - COMMENT "Instrumenting clang binary with BOLT" - VERBATIM - ) + # Pass extra flag in no-LBR mode + if (CLANG_BOLT STREQUAL "PERF") + set(BOLT_NO_LBR "-nl") + endif() + + if (CLANG_BOLT STREQUAL "INSTRUMENT") + # Instrument clang with BOLT + add_custom_target(clang-instrumented + DEPENDS ${CLANG_INSTRUMENTED} + ) + add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} + DEPENDS clang llvm-bolt + COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} + -instrument --instrumentation-file-append-pid + --instrumentation-file=${BOLT_FDATA} + COMMENT "Instrumenting clang binary with BOLT" + VERBATIM + ) + add_custom_target(clang-bolt-training-deps DEPENDS clang-instrumented) + else() # perf or LBR + add_custom_target(clang-bolt-training-deps DEPENDS clang) + endif() # Optimize original (pre-bolt) Clang using the collected profile set(CLANG_OPTIMIZED ${CMAKE_CURRENT_BINARY_DIR}/clang.bolt) @@ -906,6 +921,7 @@ if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) -data ${BOLT_FDATA} -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack + ${BOLT_NO_LBR} COMMAND ${CMAKE_COMMAND} -E rename ${CLANG_OPTIMIZED} $ COMMENT "Optimizing Clang with BOLT" VERBATIM diff --git a/clang/cmake/caches/BOLT.cmake b/clang/cmake/caches/BOLT.cmake index 0442f73e5426a..eba2346b2f4ca 100644 --- a/clang/cmake/caches/BOLT.cmake +++ b/clang/cmake/caches/BOLT.cmake @@ -1,5 +1,5 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING "") -set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "") +set(CLANG_BOLT "INSTRUMENT" CACHE STRING "") set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "") set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "") diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index c6d51863fb1b5..601f40902fa34 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -62,7 +62,9 @@ if(APPLE AND DTRACE AND NOT LLVM_TOOL_LLVM_DRIVER_BUILD) DEPENDS generate-dtrace-logs) endif() -if(CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) +if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) + set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING + "Name of BOLT-instrumented Clang binary") configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/bolt.lit.site.cfg.in ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/lit.site.cfg @@ -71,16 +73,37 @@ if(CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) add_lit_testsuite(generate-bolt-fdata "Generating BOLT profile for Clang" ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/ EXCLUDE_FROM_CHECK_ALL - DEPENDS clang-instrumented clear-bolt-fdata + DEPENDS clang-bolt-training-deps clear-bolt-fdata clear-perf-data ) add_custom_target(clear-bolt-fdata COMMAND 
"${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} fdata COMMENT "Clearing old BOLT fdata") + add_custom_target(clear-perf-data + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data + COMMENT "Clearing old perf data") + + string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) + if (CLANG_BOLT STREQUAL "LBR") + set(BOLT_LBR "--lbr") + endif() + + add_custom_target(merge-fdata-deps) + if (CLANG_BOLT STREQUAL "INSTRUMENT") + add_dependencies(merge-fdata-deps generate-bolt-fdata) + else() + # Convert perf profiles into fdata + add_custom_target(convert-perf-fdata + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py perf2bolt $ ${CMAKE_CURRENT_BINARY_DIR} $ ${BOLT_LBR} + COMMENT "Converting perf files to BOLT fdata" + DEPENDS llvm-bolt generate-bolt-fdata) + add_dependencies(merge-fdata-deps convert-perf-fdata) + endif() + # Merge profiles into one using merge-fdata add_custom_target(clang-bolt-profile COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge-fdata $ ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Merging BOLT fdata" - DEPENDS merge-fdata generate-bolt-fdata) + DEPENDS merge-fdata merge-fdata-deps) endif() diff --git a/clang/utils/perf-training/bolt.lit.cfg b/clang/utils/perf-training/bolt.lit.cfg index 234ac855bd67c..0e81a5501e9fc 100644 --- a/clang/utils/perf-training/bolt.lit.cfg +++ b/clang/utils/perf-training/bolt.lit.cfg @@ -6,15 +6,46 @@ import lit.util import os import subprocess -config.clang = os.path.realpath(lit.util.which('clang-bolt.inst', config.clang_tools_dir)).replace('\\', '/') +clang_bolt_mode = config.clang_bolt_mode.lower() +clang_binary = "clang" +perf_wrapper = f"{config.python_exe} {config.perf_helper_dir}/perf-helper.py perf " -config.name = 'Clang Perf Training' -config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test'] +if clang_bolt_mode == "instrument": + perf_wrapper = "" + clang_binary = config.clang_bolt_name +elif clang_bolt_mode == "lbr": + perf_wrapper += " --lbr -- " +elif clang_bolt_mode == "perf": + perf_wrapper += " -- " +else: + assert 0, "Unsupported CLANG_BOLT_MODE variable" + +config.clang = perf_wrapper + os.path.realpath( + lit.util.which(clang_binary, config.clang_tools_dir) +).replace("\\", "/") + +config.name = "Clang Perf Training" +config.suffixes = [ + ".c", + ".cc", + ".cpp", + ".m", + ".mm", + ".cu", + ".ll", + ".cl", + ".s", + ".S", + ".modulemap", + ".test", +] use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL") config.test_format = lit.formats.ShTest(use_lit_shell == "0") -config.substitutions.append( ('%clang_cpp_skip_driver', ' %s --driver-mode=g++ ' % (config.clang))) -config.substitutions.append( ('%clang_cpp', ' %s --driver-mode=g++ ' % (config.clang))) -config.substitutions.append( ('%clang_skip_driver', ' %s ' % (config.clang))) -config.substitutions.append( ('%clang', ' %s ' % (config.clang) ) ) -config.substitutions.append( ('%test_root', config.test_exec_root ) ) +config.substitutions.append( + ("%clang_cpp_skip_driver", f" {config.clang} --driver-mode=g++ ") +) +config.substitutions.append(("%clang_cpp", f" {config.clang} --driver-mode=g++ ")) +config.substitutions.append(("%clang_skip_driver", config.clang)) +config.substitutions.append(("%clang", config.clang)) +config.substitutions.append(("%test_root", config.test_exec_root)) diff --git 
a/clang/utils/perf-training/bolt.lit.site.cfg.in b/clang/utils/perf-training/bolt.lit.site.cfg.in index 3029319673fc2..54de12701c1ae 100644 --- a/clang/utils/perf-training/bolt.lit.site.cfg.in +++ b/clang/utils/perf-training/bolt.lit.site.cfg.in @@ -9,6 +9,8 @@ config.test_source_root = "@CLANG_PGO_TRAINING_DATA@" config.target_triple = "@LLVM_TARGET_TRIPLE@" config.python_exe = "@Python3_EXECUTABLE@" config.clang_obj_root = path(r"@CLANG_BINARY_DIR@") +config.clang_bolt_mode = "@CLANG_BOLT@" +config.clang_bolt_name = "@CLANG_BOLT_INSTRUMENTED@" # Let the main config do the real work. lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/bolt.lit.cfg") diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py index 99d6a3333b6ef..959bdba5c98cc 100644 --- a/clang/utils/perf-training/perf-helper.py +++ b/clang/utils/perf-training/perf-helper.py @@ -67,6 +67,62 @@ def merge_fdata(args): return 0 +def perf(args): + parser = argparse.ArgumentParser( + prog="perf-helper perf", description="perf wrapper for BOLT profile collection" + ) + parser.add_argument( + "--lbr", action="store_true", help="Use perf with branch stacks" + ) + parser.add_argument("cmd", nargs=argparse.REMAINDER, help="") + + opts = parser.parse_args(args) + cmd = opts.cmd[1:] + + perf_args = [ + "perf", + "record", + "--event=cycles:u", + "--freq=max", + "--output=%d.perf.data" % os.getpid(), + ] + if opts.lbr: + perf_args += ["--branch-filter=any,u"] + perf_args.extend(cmd) + + start_time = time.time() + subprocess.check_call(perf_args) + + elapsed = time.time() - start_time + print("... data collection took %.4fs" % elapsed) + return 0 + + +def perf2bolt(args): + parser = argparse.ArgumentParser( + prog="perf-helper perf2bolt", + description="perf2bolt conversion wrapper for perf.data files", + ) + parser.add_argument("bolt", help="Path to llvm-bolt") + parser.add_argument("path", help="Path containing perf.data files") + parser.add_argument("binary", help="Input binary") + parser.add_argument("--lbr", action="store_true", help="Use LBR perf2bolt mode") + opts = parser.parse_args(args) + + p2b_args = [ + opts.bolt, + opts.binary, + "--aggregate-only", + "--profile-format=yaml", + ] + if not opts.lbr: + p2b_args += ["-nl"] + p2b_args += ["-p"] + for filename in findFilesWithExtension(opts.path, "perf.data"): + subprocess.check_call(p2b_args + [filename, "-o", filename + ".fdata"]) + return 0 + + def dtrace(args): parser = argparse.ArgumentParser( prog="perf-helper dtrace", @@ -507,6 +563,8 @@ def genOrderFile(args): "cc1": cc1, "gen-order-file": genOrderFile, "merge-fdata": merge_fdata, + "perf": perf, + "perf2bolt": perf2bolt, } From a31a60074717fc40887cfe132b77eec93bedd307 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Mon, 22 Jan 2024 14:22:16 +0800 Subject: [PATCH 329/843] [docs] Update StandardCPlusPlusModules.rst with clang18 Changed Things: - Mentioning we need to specify BMIs for indirectly dependent BMIs too. - Remove the note for `delayed-template-parsing` since https://github.com/llvm/llvm-project/issues/61068 got closed. - Add a note for https://github.com/llvm/llvm-project/issues/78850 since we've seen it for a lot of times. - Add a note for https://github.com/llvm/llvm-project/issues/78173 since we've seen it for a lot of times. 
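For the first item above, a minimal sketch of the flag usage being documented. Assuming hypothetical module units `a.cppm` and `b.cppm`, where `b` imports `a` and `use.cpp` imports only `b`, the compile of `use.cpp` still has to name the indirect BMI:

```console
$ clang++ -std=c++20 a.cppm --precompile -o a.pcm
$ clang++ -std=c++20 b.cppm --precompile -fmodule-file=a=a.pcm -o b.pcm
# b.pcm alone is not enough; the indirectly dependent a.pcm must be listed too.
$ clang++ -std=c++20 use.cpp -fmodule-file=a=a.pcm -fmodule-file=b=b.pcm -c -o use.o
```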
--- clang/docs/StandardCPlusPlusModules.rst | 69 ++++++++++++++++++++++--- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst index 22d506f0da2b1..81043ff25be02 100644 --- a/clang/docs/StandardCPlusPlusModules.rst +++ b/clang/docs/StandardCPlusPlusModules.rst @@ -345,6 +345,9 @@ In case all ``-fprebuilt-module-path=``, ``-fmodule-file==`` will take the second highest precedence. +We need to specify all the dependent (directly and indirectly) BMIs. +See https://github.com/llvm/llvm-project/issues/62707 for detail. + When we compile a ``module implementation unit``, we must specify the BMI of the corresponding ``primary module interface unit``. Since the language specification says a module implementation unit implicitly imports @@ -689,14 +692,68 @@ the BMI within ``clang-cl.exe``. This is tracked in: https://github.com/llvm/llvm-project/issues/64118 -delayed template parsing is not supported/broken with C++ modules -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +false positive ODR violation diagnostic due to using inconsistent qualified but the same type +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +ODR violation is a pretty common issue when using modules. +Sometimes the program violated the One Definition Rule actually. +But sometimes it shows the compiler gives false positive diagnostics. + +One often reported example is: + +.. code-block:: c++ + + // part.cc + module; + typedef long T; + namespace ns { + inline void fun() { + (void)(T)0; + } + } + export module repro:part; + + // repro.cc + module; + typedef long T; + namespace ns { + using ::T; + } + namespace ns { + inline void fun() { + (void)(T)0; + } + } + export module repro; + export import :part; + +Currently the compiler complains about the inconsistent definition of `fun()` in +2 module units. This is incorrect. Since both definitions of `fun()` has the same +spelling and `T` refers to the same type entity finally. So the program should be +fine. + +This is tracked in https://github.com/llvm/llvm-project/issues/78850. + +Using TU-local entity in other units +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Module units are translation units. So the entities which should only be local to the +module unit itself shouldn't be used by other units in any means. + +In the language side, to address the idea formally, the language specification defines +the concept of ``TU-local`` and ``exposure`` in +`basic.link/p14 `_, +`basic.link/p15 `_, +`basic.link/p16 `_, +`basic.link/p17 `_ and +`basic.link/p18 `_. -The feature `-fdelayed-template-parsing` can't work well with C++ modules now. -Note that this is significant on Windows since the option will be enabled by default -on Windows. +However, the compiler doesn't support these 2 ideas formally. +This results in unclear and confusing diagnostic messages. +And it is worse that the compiler may import TU-local entities to other units without any +diagnostics. -This is tracked in: https://github.com/llvm/llvm-project/issues/61068 +This is tracked in https://github.com/llvm/llvm-project/issues/78173. Header Units ============ From 21830c913505b1fd2cf10e454253483180c7e10b Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Mon, 22 Jan 2024 08:32:56 +0100 Subject: [PATCH 330/843] [mlir][nvgpu] Fix 'warpgroup.mma.store' index calculation (#78413) This PR fixes the 'nvgpu.warpgroup.mma.store' index calculation. 
When the destionation memref and current accumulator matrix were small, the previous code was reaching out of range. --- .../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp | 32 +++-- .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 130 ++++++++++++++++++ 2 files changed, 152 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index 759766275de4a..ab4dea9d5618d 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -1548,12 +1548,6 @@ struct NVGPUWarpgroupMmaStoreOpLowering return b.create(lhs.getType(), lhs, rhs); }; - Value tidx = b.create(i32); - Value laneId = b.create(i32, tidx, warpSize); - Value warpId = b.create(i32, tidx, warpSize); - Value lane4Id = b.create(i32, laneId, c4); - Value lane4modId = b.create(i32, laneId, c4); - auto makeExtractAndStore = [&](int i, Value wgmmaResult, Value x, Value y, TypedValue<::mlir::MemRefType> memref) { Type it = b.getIndexType(); @@ -1566,16 +1560,34 @@ struct NVGPUWarpgroupMmaStoreOpLowering b.create(d1, memref, ValueRange{idx, idy1}); }; + Value tidx = b.create(i32); + Value laneId = b.create(i32, tidx, warpSize); + Value warpId = b.create(i32, tidx, warpSize); + Value lane4Id = b.create(i32, laneId, c4); + Value lane4modId = b.create(i32, laneId, c4); + Value tj = makeMul(lane4modId, c2); Value ti = makeAdd(lane4Id, makeMul(warpId, c16)); if (offset) ti = makeAdd(ti, makeConst(offset)); - for (int i = 0; i < 2; ++i) { + + auto structType = matrixD.getType().cast(); + + // Number of 32-bit registers owns per thread + constexpr unsigned numAdjacentRegisters = 2; + // Number of 8x8 matrices one below another per warp + constexpr unsigned numStackedMatrices = 2; + + size_t storeCount = (structType.getBody().size() / + (numStackedMatrices * numAdjacentRegisters)); + + for (size_t i = 0; i < numStackedMatrices; ++i) { Value idx = makeAdd(ti, makeMul(makeConst(i), c8)); - for (int j = 0; j < 16; ++j) { + for (size_t j = 0; j < storeCount; ++j) { Value idy = makeAdd(tj, makeMul(makeConst(j), c8)); - int sIndex = i * 2 + j * 4; - makeExtractAndStore(sIndex, matrixD, idx, idy, dstMemref); + size_t structIndex = (i * numAdjacentRegisters) + + (j * (numStackedMatrices * numAdjacentRegisters)); + makeExtractAndStore(structIndex, matrixD, idx, idy, dstMemref); } } } diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index edccd7e80603b..b495363e228d8 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -1055,6 +1055,136 @@ func.func @warpgroup_mma_store( return } +// CHECK-LABEL: @warpgroup_mma_store_multiple +func.func @warpgroup_mma_store_multiple( + %shmem_m64n8k : memref<64x8xf32>, + %shmem_m64n16k : memref<64x16xf32>, + %shmem_m64n24k : memref<64x24xf32>, + %shmem_m64n32k : memref<64x32xf32>, + %shmem_m64n40k : memref<64x40xf32>, + %shmem_m64n48k : memref<64x48xf32>, + %shmem_m64n56k : memref<64x56xf32>, + %shmem_m64n64k : memref<64x64xf32>, + %shmem_m64n72k : memref<64x72xf32>, + %shmem_m64n80k : memref<64x80xf32>, + %shmem_m64n88k : memref<64x88xf32>, + %shmem_m64n96k : memref<64x96xf32>, + %shmem_m64n104k : memref<64x104xf32>, + %shmem_m64n112k : memref<64x112xf32>, + %shmem_m64n120k : memref<64x120xf32>, + %shmem_m64n128k : memref<64x128xf32>, + %shmem_m64n136k : memref<64x136xf32>, + %shmem_m64n144k : memref<64x144xf32>, + %shmem_m64n152k : memref<64x152xf32>, + 
%shmem_m64n160k : memref<64x160xf32>, + %shmem_m64n168k : memref<64x168xf32>, + %shmem_m64n176k : memref<64x176xf32>, + %shmem_m64n184k : memref<64x184xf32>, + %shmem_m64n192k : memref<64x192xf32>, + %shmem_m64n200k : memref<64x200xf32>, + %shmem_m64n208k : memref<64x208xf32>, + %shmem_m64n216k : memref<64x216xf32>, + %shmem_m64n224k : memref<64x224xf32>, + %shmem_m64n232k : memref<64x232xf32>, + %shmem_m64n240k : memref<64x240xf32>, + %shmem_m64n248k : memref<64x248xf32>, + %shmem_m64n256k : memref<64x256xf32>, + %res_m64n16k : !nvgpu.warpgroup.accumulator>, + %res_m64n24k : !nvgpu.warpgroup.accumulator>, + %res_m64n32k : !nvgpu.warpgroup.accumulator>, + %res_m64n40k : !nvgpu.warpgroup.accumulator>, + %res_m64n48k : !nvgpu.warpgroup.accumulator>, + %res_m64n56k : !nvgpu.warpgroup.accumulator>, + %res_m64n64k : !nvgpu.warpgroup.accumulator>, + %res_m64n72k : !nvgpu.warpgroup.accumulator>, + %res_m64n80k : !nvgpu.warpgroup.accumulator>, + %res_m64n88k : !nvgpu.warpgroup.accumulator>, + %res_m64n96k : !nvgpu.warpgroup.accumulator>, + %res_m64n104k : !nvgpu.warpgroup.accumulator>, + %res_m64n112k : !nvgpu.warpgroup.accumulator>, + %res_m64n120k : !nvgpu.warpgroup.accumulator>, + %res_m64n128k : !nvgpu.warpgroup.accumulator>, + %res_m64n136k : !nvgpu.warpgroup.accumulator>, + %res_m64n144k : !nvgpu.warpgroup.accumulator>, + %res_m64n152k : !nvgpu.warpgroup.accumulator>, + %res_m64n160k : !nvgpu.warpgroup.accumulator>, + %res_m64n168k : !nvgpu.warpgroup.accumulator>, + %res_m64n176k : !nvgpu.warpgroup.accumulator>, + %res_m64n184k : !nvgpu.warpgroup.accumulator>, + %res_m64n192k : !nvgpu.warpgroup.accumulator>, + %res_m64n200k : !nvgpu.warpgroup.accumulator>, + %res_m64n208k : !nvgpu.warpgroup.accumulator>, + %res_m64n216k : !nvgpu.warpgroup.accumulator>, + %res_m64n224k : !nvgpu.warpgroup.accumulator>, + %res_m64n232k : !nvgpu.warpgroup.accumulator>, + %res_m64n240k : !nvgpu.warpgroup.accumulator>, + %res_m64n248k : !nvgpu.warpgroup.accumulator>, + %res_m64n256k : !nvgpu.warpgroup.accumulator>) { + // CHECK-COUNT-8: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x16xf32> + // CHECK-COUNT-12: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x24xf32> + // CHECK-COUNT-16: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x32xf32> + // CHECK-COUNT-20: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x40xf32> + // CHECK-COUNT-24: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x48xf32> + // CHECK-COUNT-28: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x56xf32> + // CHECK-COUNT-32: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x64xf32> + // CHECK-COUNT-36: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x72xf32> + // CHECK-COUNT-40: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x80xf32> + // CHECK-COUNT-44: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x88xf32> + // CHECK-COUNT-48: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x96xf32> + // CHECK-COUNT-52: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x104xf32> + // CHECK-COUNT-56: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x112xf32> + // CHECK-COUNT-60: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x120xf32> + // CHECK-COUNT-64: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x128xf32> + // CHECK-COUNT-68: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x136xf32> + // CHECK-COUNT-72: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x144xf32> + // 
CHECK-COUNT-76: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x152xf32> + // CHECK-COUNT-80: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x160xf32> + // CHECK-COUNT-84: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x168xf32> + // CHECK-COUNT-88: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x176xf32> + // CHECK-COUNT-92: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x184xf32> + // CHECK-COUNT-96: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x192xf32> + // CHECK-COUNT-100: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x200xf32> + // CHECK-COUNT-104: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x208xf32> + // CHECK-COUNT-108: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x216xf32> + // CHECK-COUNT-112: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x224xf32> + // CHECK-COUNT-116: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x232xf32> + // CHECK-COUNT-120: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x240xf32> + // CHECK-COUNT-124: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x248xf32> + // CHECK-COUNT-128: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<64x256xf32> + nvgpu.warpgroup.mma.store %res_m64n16k, %shmem_m64n16k : !nvgpu.warpgroup.accumulator> to memref<64x16xf32> + nvgpu.warpgroup.mma.store %res_m64n24k, %shmem_m64n24k : !nvgpu.warpgroup.accumulator> to memref<64x24xf32> + nvgpu.warpgroup.mma.store %res_m64n32k, %shmem_m64n32k : !nvgpu.warpgroup.accumulator> to memref<64x32xf32> + nvgpu.warpgroup.mma.store %res_m64n40k, %shmem_m64n40k : !nvgpu.warpgroup.accumulator> to memref<64x40xf32> + nvgpu.warpgroup.mma.store %res_m64n48k, %shmem_m64n48k : !nvgpu.warpgroup.accumulator> to memref<64x48xf32> + nvgpu.warpgroup.mma.store %res_m64n56k, %shmem_m64n56k : !nvgpu.warpgroup.accumulator> to memref<64x56xf32> + nvgpu.warpgroup.mma.store %res_m64n64k, %shmem_m64n64k : !nvgpu.warpgroup.accumulator> to memref<64x64xf32> + nvgpu.warpgroup.mma.store %res_m64n72k, %shmem_m64n72k : !nvgpu.warpgroup.accumulator> to memref<64x72xf32> + nvgpu.warpgroup.mma.store %res_m64n80k, %shmem_m64n80k : !nvgpu.warpgroup.accumulator> to memref<64x80xf32> + nvgpu.warpgroup.mma.store %res_m64n88k, %shmem_m64n88k : !nvgpu.warpgroup.accumulator> to memref<64x88xf32> + nvgpu.warpgroup.mma.store %res_m64n96k, %shmem_m64n96k : !nvgpu.warpgroup.accumulator> to memref<64x96xf32> + nvgpu.warpgroup.mma.store %res_m64n104k, %shmem_m64n104k : !nvgpu.warpgroup.accumulator> to memref<64x104xf32> + nvgpu.warpgroup.mma.store %res_m64n112k, %shmem_m64n112k : !nvgpu.warpgroup.accumulator> to memref<64x112xf32> + nvgpu.warpgroup.mma.store %res_m64n120k, %shmem_m64n120k : !nvgpu.warpgroup.accumulator> to memref<64x120xf32> + nvgpu.warpgroup.mma.store %res_m64n128k, %shmem_m64n128k : !nvgpu.warpgroup.accumulator> to memref<64x128xf32> + nvgpu.warpgroup.mma.store %res_m64n136k, %shmem_m64n136k : !nvgpu.warpgroup.accumulator> to memref<64x136xf32> + nvgpu.warpgroup.mma.store %res_m64n144k, %shmem_m64n144k : !nvgpu.warpgroup.accumulator> to memref<64x144xf32> + nvgpu.warpgroup.mma.store %res_m64n152k, %shmem_m64n152k : !nvgpu.warpgroup.accumulator> to memref<64x152xf32> + nvgpu.warpgroup.mma.store %res_m64n160k, %shmem_m64n160k : !nvgpu.warpgroup.accumulator> to memref<64x160xf32> + nvgpu.warpgroup.mma.store %res_m64n168k, %shmem_m64n168k : !nvgpu.warpgroup.accumulator> to memref<64x168xf32> + nvgpu.warpgroup.mma.store %res_m64n176k, %shmem_m64n176k : 
!nvgpu.warpgroup.accumulator> to memref<64x176xf32> + nvgpu.warpgroup.mma.store %res_m64n184k, %shmem_m64n184k : !nvgpu.warpgroup.accumulator> to memref<64x184xf32> + nvgpu.warpgroup.mma.store %res_m64n192k, %shmem_m64n192k : !nvgpu.warpgroup.accumulator> to memref<64x192xf32> + nvgpu.warpgroup.mma.store %res_m64n200k, %shmem_m64n200k : !nvgpu.warpgroup.accumulator> to memref<64x200xf32> + nvgpu.warpgroup.mma.store %res_m64n208k, %shmem_m64n208k : !nvgpu.warpgroup.accumulator> to memref<64x208xf32> + nvgpu.warpgroup.mma.store %res_m64n216k, %shmem_m64n216k : !nvgpu.warpgroup.accumulator> to memref<64x216xf32> + nvgpu.warpgroup.mma.store %res_m64n224k, %shmem_m64n224k : !nvgpu.warpgroup.accumulator> to memref<64x224xf32> + nvgpu.warpgroup.mma.store %res_m64n232k, %shmem_m64n232k : !nvgpu.warpgroup.accumulator> to memref<64x232xf32> + nvgpu.warpgroup.mma.store %res_m64n240k, %shmem_m64n240k : !nvgpu.warpgroup.accumulator> to memref<64x240xf32> + nvgpu.warpgroup.mma.store %res_m64n248k, %shmem_m64n248k : !nvgpu.warpgroup.accumulator> to memref<64x248xf32> + nvgpu.warpgroup.mma.store %res_m64n256k, %shmem_m64n256k : !nvgpu.warpgroup.accumulator> to memref<64x256xf32> + return +} + func.func @warpgroup_mma_init() { //CHECK: %[[S1:.+]] = llvm.mlir.constant(0.000000e+00 : f32) : f3 //CHECK: %[[S0:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> From 12c241b3654800ab708607dbc1998975c893fc14 Mon Sep 17 00:00:00 2001 From: Guray Ozen Date: Mon, 22 Jan 2024 08:37:20 +0100 Subject: [PATCH 331/843] [MLIR][NVVM] Explicit Data Type for Output in `wgmma.mma_async` (#78713) The current implementation of `nvvm.wgmma.mma_async` Op deduces the data type of the output matrix from the data type of struct member, which can be non-intuitive, especially in cases where types like `2xf16` are packed into `i32`. This PR addresses this issue by improving the Op to include an explicit data type for the output matrix. The modified Op now includes an explicit data type for Matrix-D (), and looks as follows: ``` %result = llvm.mlir.undef : !llvm.struct<(struct<(i32, i32, ... 
nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, D [, #nvvm.wgmma_scale_out], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] ``` --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 10 +- .../Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp | 15 ++- mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp | 117 +++++++++--------- .../Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir | 18 +-- mlir/test/Conversion/NVVMToLLVM/invalid.mlir | 42 +++---- .../Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 56 ++++----- mlir/test/python/dialects/nvvm.py | 3 +- 7 files changed, 135 insertions(+), 126 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 7140e614412f9..b1bd3a9506807 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -1833,11 +1833,14 @@ def WGMMATypeB1 : I32EnumAttrCase<"b1", 4>; def WGMMATypeBF16 : I32EnumAttrCase<"bf16", 5>; def WGMMATypeF8E4M3 : I32EnumAttrCase<"e4m3", 6>; def WGMMATypeF8E5M2 : I32EnumAttrCase<"e5m2", 7>; +def WGMMATypeF32 : I32EnumAttrCase<"f32", 8>; +def WGMMATypeS32 : I32EnumAttrCase<"s32", 9>; + def WGMMATypes : I32EnumAttr<"WGMMATypes", "NVVM WGMMA types", [WGMMATypeF16, WGMMATypeTF32, WGMMATypeU8, WGMMATypeS8, WGMMATypeB1, WGMMATypeBF16, WGMMATypeF8E4M3, - WGMMATypeF8E5M2]> { + WGMMATypeF8E5M2, WGMMATypeF32, WGMMATypeS32]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::NVVM"; } @@ -1859,6 +1862,7 @@ def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async", NVVM_MMAShapeAttr:$shape, WGMMATypesAttr:$typeA, WGMMATypesAttr:$typeB, + WGMMATypesAttr:$typeD, WGMMAScaleOutAttr:$scaleD, WGMMAScaleInAttr:$scaleA, WGMMAScaleInAttr:$scaleB, @@ -1868,8 +1872,8 @@ def NVVM_WgmmaMmaAsyncOp : NVVM_Op<"wgmma.mma_async", ); let assemblyFormat = [{ - $descriptorA `,` $descriptorB `,` $shape `,` - `D` `[` $inouts `,` $scaleD (`,` $satfinite^)? `]` `,` + $descriptorA `,` $descriptorB `,` $inouts `,` $shape `,` + `D` `[` $typeD `,` $scaleD (`,` $satfinite^)? `]` `,` `A` `[` $typeA `,` $scaleA `,` $layoutA `]` `,` `B` `[` $typeB `,` $scaleB `,` $layoutB `]` attr-dict `:` diff --git a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp index ab4dea9d5618d..43d05b872a4fb 100644 --- a/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp +++ b/mlir/lib/Conversion/NVGPUToNVVM/NVGPUToNVVM.cpp @@ -1267,10 +1267,11 @@ struct NVGPUWarpgroupMmaOpLowering } /// Generates WGMMATypesAttr from MLIR Type - NVVM::WGMMATypesAttr generateWgmmaType(Type type) const { - auto getWgmmaType = [](Type elemType) { + NVVM::WGMMATypesAttr generateWgmmaType(Type type, + bool useF32 = false) const { + auto getWgmmaType = [=](Type elemType) { if (elemType.isF32() || elemType.isTF32()) - return NVVM::WGMMATypes::tf32; + return useF32 ? 
NVVM::WGMMATypes::f32 : NVVM::WGMMATypes::tf32; if (elemType.isF16()) return NVVM::WGMMATypes::f16; if (elemType.isBF16()) @@ -1285,6 +1286,8 @@ struct NVGPUWarpgroupMmaOpLowering return NVVM::WGMMATypes::s8; if (elemType.isUnsignedInteger(8)) return NVVM::WGMMATypes::u8; + if (elemType.isInteger(32)) + return NVVM::WGMMATypes::s32; llvm_unreachable("unsupported type"); }; return NVVM::WGMMATypesAttr::get(op->getContext(), getWgmmaType(type)); @@ -1397,6 +1400,9 @@ struct NVGPUWarpgroupMmaOpLowering Type elemB = op.getDescriptorB().getType().getTensor().getElementType(); NVVM::WGMMATypesAttr itypeB = generateWgmmaType(elemB); + Type elemD = op.getMatrixC().getType().getFragmented().getElementType(); + NVVM::WGMMATypesAttr itypeD = generateWgmmaType(elemD, true); + NVVM::MMAShapeAttr shape = generateWgmmaShape(); NVVM::WGMMAScaleOutAttr scaleOut = generateScaleOut(); NVVM::WGMMAScaleInAttr scaleIn = generateScaleIn(); @@ -1408,7 +1414,8 @@ struct NVGPUWarpgroupMmaOpLowering return b.create( matrixC.getType(), matrixC, descriptorA, descriptorB, shape, itypeA, - itypeB, scaleOut, scaleIn, scaleIn, layoutA, layoutB, overflow); + itypeB, itypeD, scaleOut, scaleIn, scaleIn, layoutA, layoutB, + overflow); } /// Generates multiple wgmma instructions to complete the given GEMM shape diff --git a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp index aa49c4dc31fbc..a855e4b209ac5 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/NVVMDialect.cpp @@ -755,37 +755,44 @@ FailureOr getAllowedSizeK(NVVM::WGMMATypes typeA) { return failure(); } -LogicalResult isAllowedWGMMADataType(Type typeD, NVVM::WGMMATypes typeA, +LogicalResult isAllowedWGMMADataType(NVVM::WGMMATypes typeD, + NVVM::WGMMATypes typeA, NVVM::WGMMATypes typeB) { switch (typeA) { case NVVM::WGMMATypes::f16: - if ((typeD.isF32() || typeD.isF16()) && typeB == NVVM::WGMMATypes::f16) + if ((typeD == NVVM::WGMMATypes::f32 || typeD == NVVM::WGMMATypes::f16) && + typeB == NVVM::WGMMATypes::f16) return success(); break; case NVVM::WGMMATypes::tf32: - if (typeD.isF32() && typeB == NVVM::WGMMATypes::tf32) + if (typeD == NVVM::WGMMATypes::f32 && typeB == NVVM::WGMMATypes::tf32) return success(); break; case NVVM::WGMMATypes::u8: case NVVM::WGMMATypes::s8: - if (typeD.isInteger(32) && + if (typeD == NVVM::WGMMATypes::s32 && (typeB == NVVM::WGMMATypes::u8 || typeB == NVVM::WGMMATypes::s8)) return success(); break; case NVVM::WGMMATypes::b1: - if (typeD.isInteger(32) && typeB == NVVM::WGMMATypes::b1) + if (typeD == NVVM::WGMMATypes::s32 && typeB == NVVM::WGMMATypes::b1) return success(); break; case NVVM::WGMMATypes::bf16: - if ((typeD.isF32() || typeD.isF16()) && typeB == NVVM::WGMMATypes::bf16) + if ((typeD == NVVM::WGMMATypes::f32 || typeD == NVVM::WGMMATypes::f16) && + typeB == NVVM::WGMMATypes::bf16) return success(); break; case NVVM::WGMMATypes::e4m3: case NVVM::WGMMATypes::e5m2: - if ((typeD.isF32() || typeD.isF16()) && + if ((typeD == NVVM::WGMMATypes::f32 || typeD == NVVM::WGMMATypes::f16) && (typeB == NVVM::WGMMATypes::e5m2 || typeB == NVVM::WGMMATypes::e4m3)) return success(); break; + case WGMMATypes::f32: + case WGMMATypes::s32: + llvm_unreachable("unsupported input types"); + break; } return failure(); } @@ -799,19 +806,24 @@ LogicalResult isAllowedSizeN(int sizeN, NVVM::WGMMATypes typeA) { 80, 96, 112, 128, 144, 160, 176, 192, 208, 224, 240, 256}; switch (typeA) { - case mlir::NVVM::WGMMATypes::f16: - case mlir::NVVM::WGMMATypes::tf32: - case 
mlir::NVVM::WGMMATypes::bf16: - case mlir::NVVM::WGMMATypes::e4m3: - case mlir::NVVM::WGMMATypes::e5m2: + case WGMMATypes::f16: + case WGMMATypes::tf32: + case WGMMATypes::bf16: + case WGMMATypes::e4m3: + case WGMMATypes::e5m2: if (llvm::is_contained(allowedN, sizeN)) return success(); break; - case mlir::NVVM::WGMMATypes::u8: - case mlir::NVVM::WGMMATypes::s8: - case mlir::NVVM::WGMMATypes::b1: + case WGMMATypes::u8: + case WGMMATypes::s8: + case WGMMATypes::b1: if (llvm::is_contained(allowedNshort, sizeN)) return success(); + break; + case WGMMATypes::f32: + case WGMMATypes::s32: + llvm_unreachable("unsupported input types"); + break; } return failure(); } @@ -821,27 +833,29 @@ LogicalResult NVVM::WgmmaMmaAsyncOp::verify() { auto stype = dyn_cast(outValue.getType()); if (!stype) return emitOpError() << "expected results to be struct"; - Type outputType = stype.getBody().front(); int outputSize = stype.getBody().size(); + WGMMATypes typeD = getTypeD(); + WGMMATypes typeA = getTypeA(); + WGMMATypes typeB = getTypeB(); + for (Type t : stype.getBody()) { - if (t != outputType) + if (t != stype.getBody().front()) return emitOpError() << "all elements in struct must be same type but there is " << t; } - if (!outputType.isF32() && !outputType.isInteger(32) && !outputType.isF16()) { + if (typeD != WGMMATypes::f32 && typeD != WGMMATypes::f16 && + typeD != WGMMATypes::s32) { return emitOpError() << "does not support the given output type " - << outputType; + << NVVM::stringifyWGMMATypes(typeD); } - if (outputType.isInteger(32) && (getScaleA() == NVVM::WGMMAScaleIn::neg || - getScaleB() == NVVM::WGMMAScaleIn::neg)) { + if (typeD == WGMMATypes::s32 && + (getScaleA() == WGMMAScaleIn::neg || getScaleB() == WGMMAScaleIn::neg)) { return emitOpError() << "has s32 output, scaleA and scaleB cannot be neg"; } - mlir::NVVM::WGMMATypes typeA = getTypeA(); - mlir::NVVM::WGMMATypes typeB = getTypeB(); - if (failed(isAllowedWGMMADataType(outputType, typeA, typeB))) { - return emitOpError() << outputType + if (failed(isAllowedWGMMADataType(typeD, typeA, typeB))) { + return emitOpError() << NVVM::stringifyWGMMATypes(typeD) << " += " << NVVM::stringifyWGMMATypes(typeA) << " * " << NVVM::stringifyWGMMATypes(typeB) << ", it is not supported."; @@ -866,8 +880,7 @@ LogicalResult NVVM::WgmmaMmaAsyncOp::verify() { } // Check transpose (only available for f16/bf16) - if ((typeA != mlir::NVVM::WGMMATypes::f16 && - typeA != mlir::NVVM::WGMMATypes::bf16) && + if ((typeA != WGMMATypes::f16 && typeA != WGMMATypes::bf16) && (getLayoutA() == mlir::NVVM::MMALayout::col || getLayoutB() == mlir::NVVM::MMALayout::col)) { return emitOpError() @@ -876,29 +889,29 @@ LogicalResult NVVM::WgmmaMmaAsyncOp::verify() { << " for input types " << stringifyWGMMATypes(typeA) << " and " << stringifyWGMMATypes(typeB) << " requires transpose. 
However, this is only supported for: " - << stringifyMMATypes(mlir::NVVM::MMATypes::f16) << " and " - << stringifyMMATypes(mlir::NVVM::MMATypes::bf16); + << stringifyMMATypes(MMATypes::f16) << " and " + << stringifyMMATypes(MMATypes::bf16); } // Check result registers - int expectedOutput; - if (outputType.isF32() || outputType.isInteger(32)) + int expectedOutput = 0; + if (typeD == WGMMATypes::f32 || typeD == WGMMATypes::s32) expectedOutput = getShape().getN() / 2; - if (outputType.isF16()) + if (typeD == WGMMATypes::f16) expectedOutput = getShape().getN() / 4; if (outputSize != expectedOutput) { return emitOpError() << "results " << expectedOutput << ", however output struct has " << outputSize << " elements"; } - // Check satfinite (only availalbe for s32 accumulator) - if (!outputType.isInteger(32) && + // Check satfinite (only available for s32 accumulator) + if (typeD != WGMMATypes::s32 && getSatfinite().value_or(NVVM::MMAIntOverflow::wrapped) == NVVM::MMAIntOverflow::satfinite) { return emitOpError() << " `satfinite` can be only used with s32 accumulator, however " "the current accumulator is " - << outputType; + << NVVM::stringifyWGMMATypes(typeD); } return success(); @@ -907,27 +920,15 @@ LogicalResult NVVM::WgmmaMmaAsyncOp::verify() { std::string NVVM::WgmmaMmaAsyncOp::getPtx() { int m = getShape().getM(), n = getShape().getN(), k = getShape().getK(); - bool isF16 = getTypeA() == mlir::NVVM::WGMMATypes::f16 || - getTypeA() == mlir::NVVM::WGMMATypes::bf16; + bool isF16 = getTypeA() == WGMMATypes::f16 || getTypeA() == WGMMATypes::bf16; - Value outValue = getResults() ? getResults() : getInouts(); - auto stype = dyn_cast(outValue.getType()); - Type outputType = stype.getBody().front(); - std::string outputTypeName; - if (outputType.isF16()) - outputTypeName = "f16"; - else if (outputType.isF32()) - outputTypeName = "f32"; - else if (outputType.isInteger(32)) - outputTypeName = "s32"; - else - assert(false && "unsupported output type"); + StringRef outputTypeName = stringifyWGMMATypes(getTypeD()); - int expectedOutputRegisters; - if (outputType.isF32() || outputType.isInteger(32)) - expectedOutputRegisters = getShape().getN() / 2; - if (outputType.isF16()) + int expectedOutputRegisters = 0; + if (getTypeD() == WGMMATypes::f16) expectedOutputRegisters = getShape().getN() / 4; + else + expectedOutputRegisters = getShape().getN() / 2; std::string ptx; llvm::raw_string_ostream ss(ptx); @@ -958,7 +959,7 @@ std::string NVVM::WgmmaMmaAsyncOp::getPtx() { ss << " $" << (regCnt) << "," << " $" << (regCnt + 1) << "," << " p"; - if (!outputType.isInteger(32)) { + if (getTypeD() != WGMMATypes::s32) { ss << ", $" << (regCnt + 3) << ", $" << (regCnt + 4); } // Don't add transpose parameters unless needed. @@ -975,11 +976,7 @@ void NVVM::WgmmaMmaAsyncOp::getAsmValues( RewriterBase &rewriter, llvm::SmallVectorImpl> &asmValues) { - Value outValue = getResults() ? 
getResults() : getInouts(); - auto stype = dyn_cast(outValue.getType()); - Type outputType = stype.getBody().front(); - bool isF16 = getTypeA() == mlir::NVVM::WGMMATypes::f16 || - getTypeA() == mlir::NVVM::WGMMATypes::bf16; + bool isF16 = getTypeA() == WGMMATypes::f16 || getTypeA() == WGMMATypes::bf16; if (getResults()) asmValues.push_back({getResults(), mlir::NVVM::PTXRegisterMod::Write}); if (getInouts()) @@ -988,7 +985,7 @@ void NVVM::WgmmaMmaAsyncOp::getAsmValues( asmValues.push_back({getDescriptorB(), mlir::NVVM::PTXRegisterMod::Read}); asmValues.push_back({makeConstantI32(rewriter, static_cast(getScaleD())), mlir::NVVM::PTXRegisterMod::Read}); - if (!outputType.isInteger(32)) { + if (getTypeD() != WGMMATypes::s32) { asmValues.push_back( {makeConstantI32(rewriter, getScaleA() == NVVM::WGMMAScaleIn::neg ? -1 : 1), diff --git a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir index b495363e228d8..b25dd76bf1203 100644 --- a/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir +++ b/mlir/test/Conversion/NVGPUToNVVM/nvgpu-to-nvvm.mlir @@ -880,41 +880,41 @@ func.func @warpgroup_mma_128_128_64( // CHECK: nvvm.wgmma.fence.aligned // CHECK: %[[UD:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: %[[S2:.+]] = llvm.extractvalue %[[ARG]][0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S4:.+]] = nvvm.wgmma.mma_async %[[S0]], %[[S1]], , D[%[[S2]], , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> +// CHECK: %[[S4:.+]] = nvvm.wgmma.mma_async %[[S0]], %[[S1]], %[[S2]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: %[[S5:.+]] = llvm.mlir.constant(2 : i32) : i64 // CHECK: %[[S6:.+]] = llvm.add %[[S0]], %[[S5]] : i64 // CHECK: %[[S7:.+]] = llvm.mlir.constant(128 : i32) : i64 // CHECK: %[[S8:.+]] = llvm.add %[[S1]], %[[S7]] : i64 -// CHECK: %[[S9:.+]] = nvvm.wgmma.mma_async %[[S6]], %[[S8]], , D[%[[S4]], , ], A[, , ], B[, , ] : !llvm.struct +// CHECK: %[[S9:.+]] = nvvm.wgmma.mma_async %[[S6]], %[[S8]], %[[S4]], , D[, , ], A[, , ], B[, , ] : !llvm.struct // CHECK: %[[S10:.+]] = llvm.mlir.constant(4 : i32) : i64 // CHECK: %[[S11:.+]] = llvm.add %[[S0]], %[[S10]] : i64 // CHECK: %[[S12:.+]] = llvm.mlir.constant(256 : i32) : i64 // CHECK: %[[S13:.+]] = llvm.add %[[S1]], %[[S12]] : i64 -// CHECK: %[[S14:.+]] = nvvm.wgmma.mma_async %[[S11]], %[[S13]], , D[%[[S9]], , ], A[, , ], B[, , ] : !llvm.struct +// CHECK: %[[S14:.+]] = nvvm.wgmma.mma_async %[[S11]], %[[S13]], %[[S9]], , D[, , ], A[, , ], B[, , ] : !llvm.struct // CHECK: %[[S15:.+]] = llvm.mlir.constant(6 : i32) : i64 // CHECK: %[[S16:.+]] = llvm.add %[[S0]], %[[S15]] : i64 // CHECK: %[[S17:.+]] = llvm.mlir.constant(384 : i32) : i64 // CHECK: %[[S18:.+]] = llvm.add %[[S1]], %[[S17]] : i64 -// CHECK: %[[S19:.+]] = nvvm.wgmma.mma_async %[[S16]], %[[S18]], , D[%[[S14]], , ], A[, , ], B[, , ] : !llvm.struct +// CHECK: %[[S19:.+]] = nvvm.wgmma.mma_async %[[S16]], %[[S18]], %[[S14]], , D[, , ], A[, , ], B[, , ] : !llvm.struct // CHECK: %[[S3:.+]] = llvm.extractvalue %[[ARG]][1] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: %[[S21:.+]] = llvm.mlir.constant(512 : i32) : i64 // CHECK: %[[S22:.+]] = llvm.add %[[S0]], %[[S21]] : i64 -// CHECK: %[[S23:.+]] = nvvm.wgmma.mma_async %[[S22]], %[[S1]], , D[%[[S3]], , ], A[, , ], B[, , ] : !llvm.struct +// CHECK: %[[S23:.+]] = nvvm.wgmma.mma_async %[[S22]], %[[S1]], %[[S3]], , D[, , ], A[, , ], B[, , ] : !llvm.struct // CHECK: %[[S24:.+]] = llvm.mlir.constant(514 : i32) : i64 // CHECK: %[[S25:.+]] = llvm.add %[[S0]], %[[S24]] : i64 // CHECK: %[[S26:.+]] = llvm.mlir.constant(128 : i32) : i64 // CHECK: %[[S27:.+]] = llvm.add %[[S1]], %[[S26]] : i64 -// CHECK: %[[S28:.+]] = nvvm.wgmma.mma_async %[[S25]], %[[S27]], , D[%[[S23]], , ], A[, , ], B[, , ] : !llvm.struct +// CHECK: %[[S28:.+]] = nvvm.wgmma.mma_async %[[S25]], %[[S27]], %[[S23]], , D[, , ], A[, , 
], B[, , ] : !llvm.struct // CHECK: %[[S29:.+]] = llvm.mlir.constant(516 : i32) : i64 // CHECK: %[[S30:.+]] = llvm.add %[[S0]], %[[S29]] : i64 // CHECK: %[[S31:.+]] = llvm.mlir.constant(256 : i32) : i64 // CHECK: %[[S32:.+]] = llvm.add %[[S1]], %[[S31]] : i64 -// CHECK: %[[S33:.+]] = nvvm.wgmma.mma_async %[[S30]], %[[S32]], , D[%[[S28]], , ], A[, , ], B[, , ] : !llvm.struct +// CHECK: %[[S33:.+]] = nvvm.wgmma.mma_async %[[S30]], %[[S32]], %[[S28]], , D[, , ], A[, , ], B[, , ] : !llvm.struct // CHECK: %[[S34:.+]] = llvm.mlir.constant(518 : i32) : i64 // CHECK: %[[S35:.+]] = llvm.add %[[S0]], %[[S34]] : i64 // CHECK: %[[S36:.+]] = llvm.mlir.constant(384 : i32) : i64 // CHECK: %[[S37:.+]] = llvm.add %[[S1]], %[[S36]] : i64 -// CHECK: %[[S38:.+]] = nvvm.wgmma.mma_async %[[S35]], %[[S37]], , D[%[[S33]], , ], A[, , ], B[, , ] : !llvm.struct +// CHECK: %[[S38:.+]] = nvvm.wgmma.mma_async %[[S35]], %[[S37]], %[[S33]], , D[, , ], A[, , ], B[, , ] : !llvm.struct // CHECK: %[[S40:.+]] = llvm.insertvalue %[[S19]], %[[UD]][0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: %[[S41:.+]] = llvm.insertvalue %[[S38]], %[[S40]][1] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: nvvm.wgmma.commit.group.sync.aligned @@ -1299,7 +1299,7 @@ func.func @warpgroup_matrix_multiply_m128n128k64( // CHECK: nvvm.wgmma.fence.aligned // CHECK: %[[S137:.+]] = llvm.mlir.undef : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> // CHECK: %[[S138:.+]] = llvm.extractvalue %136[0] : !llvm.struct<(struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, 
f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>, struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)>)> -// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %0, %1, , D[%[[S138]], , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> +// CHECK: %[[S139:.+]] = nvvm.wgmma.mma_async %0, %1, %[[S138]], , D[, , ], A[, , ], B[, , ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32, f32)> // CHECK: nvvm.wgmma.mma_async // CHECK: nvvm.wgmma.mma_async // CHECK: %[[S154:.+]] = nvvm.wgmma.mma_async diff --git a/mlir/test/Conversion/NVVMToLLVM/invalid.mlir b/mlir/test/Conversion/NVVMToLLVM/invalid.mlir index 34c8de9f7ed8c..9ebe3a009adf2 100644 --- a/mlir/test/Conversion/NVVMToLLVM/invalid.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/invalid.mlir @@ -4,9 +4,9 @@ func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{ %result = llvm.mlir.undef : !mat64f32 // expected-error @+1 {{'nvvm.wgmma.mma_async' op results 64, however output struct has 7 elements}} - %res = nvvm.wgmma.mma_async %descA, %descB, + %res = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, ], + D [, ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !mat64f32 -> !mat64f32 @@ -17,10 +17,10 @@ func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{ func.func @wgmma_f32_satfinite(%descA : i64, %descB : i64) { %result = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> - // expected-error @+1 {{`satfinite` can be only used with s32 accumulator, however the current accumulator is 'f32'}} - %res = nvvm.wgmma.mma_async %descA, %descB, + // expected-error @+1 {{`satfinite` can be only used with s32 accumulator, however the current accumulator is f32}} + %res = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, , ], + D [, , ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !llvm.struct<(f32, f32, f32, f32, f32, 
f32, f32, f32)> @@ -33,9 +33,9 @@ func.func @wgmma_f32_satfinite(%descA : i64, %descB : i64) { func.func @wgmma_f32_m32(%descA : i64, %descB : i64) { %result = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> // expected-error @+1 {{shape 'm' must be 64}} - %res = nvvm.wgmma.mma_async %descA, %descB, + %res = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, ], + D [, ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> @@ -48,9 +48,9 @@ func.func @wgmma_f32_m32(%descA : i64, %descB : i64) { func.func @wgmma_f32_m32(%descA : i64, %descB : i64) { %result = llvm.mlir.undef : !llvm.struct<(f32, f32, i32, f32, f32, f32, f32, f32)> // expected-error @+1 {{op all elements in struct must be same type but there is 'i32'}} - %res = nvvm.wgmma.mma_async %descA, %descB, + %res = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, ], + D [, ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !llvm.struct<(f32, f32, i32, f32, f32, f32, f32, f32)> @@ -63,9 +63,9 @@ func.func @wgmma_f32_m32(%descA : i64, %descB : i64) { func.func @wgmma_f32_m32(%descA : i64, %descB : i64) { %result = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> // expected-error @+1 {{op shape 'k' must be 16 for input type f16}} - %res = nvvm.wgmma.mma_async %descA, %descB, + %res = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, ], + D [, ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> @@ -78,9 +78,9 @@ func.func @wgmma_f32_m32(%descA : i64, %descB : i64) { func.func @wgmma_transpose(%descA : i64, %descB : i64) { %result = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> // expected-error @+1 {{op given layouts layout_a = col and layout_b = col for input types tf32 and tf32 requires transpose. 
However, this is only supported for: f16 and bf16}} - %res = nvvm.wgmma.mma_async %descA, %descB, + %res = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, ], + D [, ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> @@ -92,10 +92,10 @@ func.func @wgmma_transpose(%descA : i64, %descB : i64) { func.func @wgmma_transpose(%descA : i64, %descB : i64) { %result = llvm.mlir.undef : !llvm.struct<(f16, f16, f16, f16)> - // expected-error @+1 {{'nvvm.wgmma.mma_async' op 'f16' += tf32 * tf32, it is not supported.}} - %res = nvvm.wgmma.mma_async %descA, %descB, + // expected-error @+1 {{'nvvm.wgmma.mma_async' op f16 += tf32 * tf32, it is not supported.}} + %res = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, ], + D [, ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] :!llvm.struct<(f16, f16, f16, f16)> @@ -108,9 +108,9 @@ func.func @wgmma_transpose(%descA : i64, %descB : i64) { func.func @wgmma_f32_m32(%descA : i64, %descB : i64) { %result = llvm.mlir.undef : !llvm.struct<(i32, i32, i32, i32)> // expected-error @+1 {{input struct and result struct must be the same type}} - %res = nvvm.wgmma.mma_async %descA, %descB, + %res = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, ], + D [, ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !llvm.struct<(i32, i32, i32, i32)> @@ -122,10 +122,10 @@ func.func @wgmma_f32_m32(%descA : i64, %descB : i64) { func.func @wgmma_f32_m32(%descA : i64, %descB : i64) { %result = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> - // expected-error @+1 {{op 'f32' += bf16 * f16, it is not supported}} - %res = nvvm.wgmma.mma_async %descA, %descB, + // expected-error @+1 {{op f32 += bf16 * f16, it is not supported}} + %res = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, ], + D [, ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)> diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index a9487bdf3bd21..9c7c27c49eb11 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -329,9 +329,9 @@ func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{ // CHECK-SAME: %[[V0_2]], %{{.*}}, %{{.*}}, %{{.*}}, %[[V4_2]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %[[V11_2]], %{{.*}}, %[[V13_2]], %{{.*}}, %{{.*}}, %[[DESCa]], %[[DESCb]], %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} %result = llvm.mlir.undef : !mat64f32 %result1 = nvvm.wgmma.mma_async - %descA, %descB, + %descA, %descB, %result, #nvvm.shape, - D [%result, #nvvm.wgmma_scale_out], + D [, #nvvm.wgmma_scale_out], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] :!mat64f32 -> !mat64f32 @@ -339,9 +339,9 @@ func.func @wgmma_f32_f16_f16(%descA : i64, %descB : i64) -> !mat64f32{ %descAnext = arith.addi %descA, %c2 : i64 %descBnext = arith.addi %descB, %c2 : i64 %result2 = nvvm.wgmma.mma_async - %descAnext, %descBnext, + %descAnext, %descBnext, %result1, #nvvm.shape, - D [%result1, #nvvm.wgmma_scale_out], + D [, #nvvm.wgmma_scale_out], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !mat64f32 -> !mat64f32 @@ -393,21 +393,21 @@ func.func @wgmma_s32_s8_s8_satfinite(%descA : i64, %descB : i64) -> !mat16i32{ // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n8k32.s32.s8.s8.satfinite // 
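For reference, a minimal sketch of the revised operand order, assuming one valid f32 += f16 * f16 configuration; the shape, type, scale, and layout attribute values below are illustrative placeholders, not values taken from these tests:

```mlir
// The accumulator now rides next to the descriptors, and its element type is
// spelled out as an explicit wgmma_type attribute in the D bracket.
%acc1 = nvvm.wgmma.mma_async %descA, %descB, %acc0,
    #nvvm.shape<m = 64, n = 8, k = 16>,
    D [#nvvm.wgmma_type<f32>, #nvvm.wgmma_scale_out<one>],
    A [#nvvm.wgmma_type<f16>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<row>],
    B [#nvvm.wgmma_type<f16>, #nvvm.wgmma_scale_in<one>, #nvvm.mma_layout<col>]
    : !llvm.struct<(f32, f32, f32, f32)> -> !llvm.struct<(f32, f32, f32, f32)>
```

With n = 8 and an f32 accumulator, the verifier expects n/2 = 4 result registers, which the four-element struct satisfies.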
CHECK-SAME: {$0, $1, $2, $3}, $8, $9, p;\0A}\0A", "=r,=r,=r,=r,0,1,2,3,l,l,n" // CHECK-SAME: %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} - %result1 = nvvm.wgmma.mma_async %descA, %descB, + %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, #nvvm.wgmma_scale_out, ], + D [, #nvvm.wgmma_scale_out, ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !mat16i32 -> !mat16i32 - %result2 = nvvm.wgmma.mma_async %descA, %descB, + %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, #nvvm.shape, - D [%result1, #nvvm.wgmma_scale_out, ], + D [, #nvvm.wgmma_scale_out, ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !mat16i32 -> !mat16i32 - %result3 = nvvm.wgmma.mma_async %descA, %descB, + %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2, #nvvm.shape, - D [%result2, #nvvm.wgmma_scale_out, ], + D [, #nvvm.wgmma_scale_out, ], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !mat16i32 -> !mat16i32 @@ -454,21 +454,21 @@ func.func @wgmma_s32_u8_u8(%descA : i64, %descB : i64) -> !mat16i32 { // CHECK-SAME:}\0A", // CHECK-SAME:"=r,=r,=r,=r,0,1,2,3,l,l,n" %[[V0_3]], %[[V1_3]], %[[V2_3]], %[[V3_3]], %[[ARG0]], %[[ARG1]], %{{.*}} %result = llvm.mlir.undef : !mat16i32 - %result1 = nvvm.wgmma.mma_async %descA, %descB, + %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, #nvvm.wgmma_scale_out], + D [, #nvvm.wgmma_scale_out], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !mat16i32 -> !mat16i32 - %result2 = nvvm.wgmma.mma_async %descA, %descB, + %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, #nvvm.shape, - D [%result1, #nvvm.wgmma_scale_out], + D [, #nvvm.wgmma_scale_out], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !mat16i32 -> !mat16i32 - %result3 = nvvm.wgmma.mma_async %descA, %descB, + %result3 = nvvm.wgmma.mma_async %descA, %descB, %result2, #nvvm.shape, - D [%result2, #nvvm.wgmma_scale_out], + D [, #nvvm.wgmma_scale_out], A [, #nvvm.wgmma_scale_in, ], B [, #nvvm.wgmma_scale_in, ] : !mat16i32 -> !mat16i32 @@ -496,15 +496,15 @@ func.func @wgmma_f32_tf32_tf32(%descA : i64, %descB : i64) -> !mat32f32 { // CHECK-SAME: setp.ne.b32 p, $66, 0; // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k8.f32.tf32.tf32 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67, $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n" %result = llvm.mlir.undef : !mat32f32 - %result1 = nvvm.wgmma.mma_async %descA, %descB, + %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, #nvvm.wgmma_scale_out], + D [#nvvm.wgmma_type, #nvvm.wgmma_scale_out], A [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout], B [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout] : !mat32f32 -> !mat32f32 - %result2 = nvvm.wgmma.mma_async %descA, %descB, + %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, #nvvm.shape, - D [%result1, #nvvm.wgmma_scale_out], + D [#nvvm.wgmma_type, #nvvm.wgmma_scale_out], A [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout], B [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout] : !mat32f32 -> !mat32f32 @@ -529,15 +529,15 @@ func.func @wgmma_f32_e4m3_e4m3(%descA : i64, %descB : i64) -> !mat32f32 { // CHECK-SAME: "{\0A.reg 
.pred p;\0Asetp.ne.b32 p, $66, 0; // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e4m3.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67, $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n" %result = llvm.mlir.undef : !mat32f32 - %result1 = nvvm.wgmma.mma_async %descA, %descB, + %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, #nvvm.wgmma_scale_out], + D [#nvvm.wgmma_type, #nvvm.wgmma_scale_out], A [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout], B [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout] : !mat32f32 -> !mat32f32 - %result2 = nvvm.wgmma.mma_async %descA, %descB, + %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, #nvvm.shape, - D [%result1, #nvvm.wgmma_scale_out], + D [#nvvm.wgmma_type, #nvvm.wgmma_scale_out], A [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout], B [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout] : !mat32f32 -> !mat32f32 @@ -561,15 +561,15 @@ func.func @wgmma_f32_e5m2_e4m3(%descA : i64, %descB : i64) -> !mat32f32 { // CHECK-SAME: "{\0A.reg .pred p;\0Asetp.ne.b32 p, $66, 0; // CHECK-SAME: wgmma.mma_async.sync.aligned.m64n64k32.f32.e5m2.e4m3 {$0, $1, $2, $3, $4, $5, $6, $7, $8, $9, $10, $11, $12, $13, $14, $15, $16, $17, $18, $19, $20, $21, $22, $23, $24, $25, $26, $27, $28, $29, $30, $31}, $64, $65, p, $67, $68;\0A}\0A", "=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,=f,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,l,l,n,n,n" %result = llvm.mlir.undef : !mat32f32 - %result1 = nvvm.wgmma.mma_async %descA, %descB, + %result1 = nvvm.wgmma.mma_async %descA, %descB, %result, #nvvm.shape, - D [%result, #nvvm.wgmma_scale_out], + D [#nvvm.wgmma_type, #nvvm.wgmma_scale_out], A [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout], B [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout] : !mat32f32 -> !mat32f32 - %result2 = nvvm.wgmma.mma_async %descA, %descB, + %result2 = nvvm.wgmma.mma_async %descA, %descB, %result1, #nvvm.shape, - D [%result1, #nvvm.wgmma_scale_out], + D [#nvvm.wgmma_type, #nvvm.wgmma_scale_out], A [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout], B [#nvvm.wgmma_type, #nvvm.wgmma_scale_in, #nvvm.mma_layout] : !mat32f32 -> !mat32f32 diff --git a/mlir/test/python/dialects/nvvm.py b/mlir/test/python/dialects/nvvm.py index 36aaaea79b186..0eef97d95479a 100644 --- a/mlir/test/python/dialects/nvvm.py +++ b/mlir/test/python/dialects/nvvm.py @@ -32,7 +32,7 @@ def wgmma_f32_f16_f16(desc_a, desc_b): nvvm.CpAsyncWaitGroupOp(5) # CHECK: %0 = llvm.mlir.undef : [[MAT_T:.*]] result = llvm.UndefOp(mat64f32_t) - # CHECK: %1 = nvvm.wgmma.mma_async %arg0, %arg1, , D[%0, ], A[, , ], B[, , ] : [[MAT_T]] -> [[MAT_T]] + # CHECK: %1 = nvvm.wgmma.mma_async %arg0, %arg1, %0, , D[, ], A[, , ], B[, , ] : [[MAT_T]] -> [[MAT_T]] result1 = nvvm.WgmmaMmaAsyncOp( results_=mat64f32_t, inouts=result, @@ -41,6 +41,7 @@ def wgmma_f32_f16_f16(desc_a, desc_b): shape=shape_attr, typeA=nvvm.WGMMATypes.f16, typeB=nvvm.WGMMATypes.f16, + typeD=nvvm.WGMMATypes.f32, scaleD=nvvm.WGMMAScaleOut.zero, scaleA=nvvm.WGMMAScaleIn.neg, scaleB=nvvm.WGMMAScaleIn.neg, From aa4547fcc8eeb9bf4f3cf48cc926f62544e58767 Mon Sep 17 00:00:00 2001 From: 
Durgadoss R Date: Mon, 22 Jan 2024 13:09:30 +0530 Subject: [PATCH 332/843] [MLIR][NVVM] Update cp.async.bulk Ops to use intrinsics (#78900) This patch updates the cp.async.bulk.{commit/wait}_group Ops to use NVVM intrinsics. * Doc updated for the commit_group Op. * Tests are added to verify the lowering to the intrinsics. While we are there, fix the FileCheck directive on the 'nvvm.setmaxregister' test. Signed-off-by: Durgadoss R --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 30 +++++++++++-------- .../Conversion/NVVMToLLVM/nvvm-to-llvm.mlir | 18 +++++------ mlir/test/Target/LLVMIR/nvvmir.mlir | 24 +++++++++++++-- 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index b1bd3a9506807..37e525a139d4a 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -1591,19 +1591,26 @@ def NVVM_MmaOp : NVVM_Op<"mma.sync", [AttrSizedOperandSegments]> { // NVVM TMA Ops //===----------------------------------------------------------------------===// -def NVVM_CpAsyncBulkCommitGroupOp : NVVM_PTXBuilder_Op<"cp.async.bulk.commit.group">, +def NVVM_CpAsyncBulkCommitGroupOp : NVVM_Op<"cp.async.bulk.commit.group">, Arguments<(ins )> { let assemblyFormat = "attr-dict"; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { return std::string("cp.async.bulk.commit_group;"); } + let description = [{ + This Op commits all prior initiated but uncommitted cp.async.bulk + instructions into a cp.async.bulk-group. + + [For more information, see PTX ISA] + (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-commit-group) + }]; + + string llvmBuilder = [{ + createIntrinsicCall(builder, llvm::Intrinsic::nvvm_cp_async_bulk_commit_group); }]; } -def NVVM_CpAsyncBulkWaitGroupOp : NVVM_PTXBuilder_Op<"cp.async.bulk.wait_group">, +def NVVM_CpAsyncBulkWaitGroupOp : NVVM_Op<"cp.async.bulk.wait_group">, Arguments<(ins ConfinedAttr]>:$group, - OptionalAttr:$read)> -{ + OptionalAttr:$read)> { let assemblyFormat = "$group attr-dict"; let description = [{ Op waits for completion of the most recent bulk async-groups. @@ -1620,15 +1627,14 @@ def NVVM_CpAsyncBulkWaitGroupOp : NVVM_PTXBuilder_Op<"cp.async.bulk.wait_group"> (https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-wait-group) }]; - let extraClassDefinition = [{ - std::string $cppClass::getPtx() { - auto ptx = std::string("cp.async.bulk.wait_group"); - if(getRead()) ptx += ".read"; - ptx += " %0;"; return ptx; } + string llvmBuilder = [{ + auto intId = op.getRead() ? 
+ llvm::Intrinsic::nvvm_cp_async_bulk_wait_group_read : + llvm::Intrinsic::nvvm_cp_async_bulk_wait_group; + createIntrinsicCall(builder, intId, builder.getInt32($group)); }]; } - def NVVM_CpAsyncBulkTensorGlobalToSharedClusterOp : NVVM_Op<"cp.async.bulk.tensor.shared.cluster.global", [DeclareOpInterfaceMethods, diff --git a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir index 9c7c27c49eb11..0ac7331e1f698 100644 --- a/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir +++ b/mlir/test/Conversion/NVVMToLLVM/nvvm-to-llvm.mlir @@ -638,23 +638,19 @@ func.func @set_max_register() { // ----- -func.func @cp_bulk_commit() { - //CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.commit_group;" +func.func @cp_async_bulk_commit() { + // CHECK: nvvm.cp.async.bulk.commit.group nvvm.cp.async.bulk.commit.group func.return } // ----- -func.func @cp_bulk_wait_group() { - // CHECK: %[[S0:.+]] = llvm.mlir.constant(1 : i32) : i32 - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.wait_group $0;", "n" %[[S0]] : (i32) -> () - // CHECK: %[[S1:.+]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.wait_group $0;", "n" %[[S1]] : (i32) -> () - // CHECK: %[[S2:.+]] = llvm.mlir.constant(5 : i32) : i32 - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.wait_group.read $0;", "n" %[[S2]] : (i32) -> () - // CHECK: %[[S3:.+]] = llvm.mlir.constant(0 : i32) : i32 - // CHECK: llvm.inline_asm has_side_effects asm_dialect = att "cp.async.bulk.wait_group.read $0;", "n" %[[S3]] : (i32) -> () +func.func @cp_async_bulk_wait_group() { + // CHECK: nvvm.cp.async.bulk.wait_group 1 + // CHECK: nvvm.cp.async.bulk.wait_group 0 + // CHECK: nvvm.cp.async.bulk.wait_group 5 {read} + // CHECK: nvvm.cp.async.bulk.wait_group 0 {read} nvvm.cp.async.bulk.wait_group 1 nvvm.cp.async.bulk.wait_group 0 nvvm.cp.async.bulk.wait_group 5 {read} diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 8c5e3524a848f..49f9426daabc2 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -398,13 +398,33 @@ llvm.func @cp_async_mbarrier_arrive(%bar_shared: !llvm.ptr<3>, %bar_gen: !llvm.p // CHECK-LABEL: @llvm_nvvm_setmaxregister llvm.func @llvm_nvvm_setmaxregister() { - // CHECK-LLVM: call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 256) + // CHECK: call void @llvm.nvvm.setmaxnreg.inc.sync.aligned.u32(i32 256) nvvm.setmaxregister increase 256 - // CHECK-LLVM: call void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 24) + // CHECK: call void @llvm.nvvm.setmaxnreg.dec.sync.aligned.u32(i32 24) nvvm.setmaxregister decrease 24 llvm.return } +// CHECK-LABEL: @llvm_nvvm_cp_async_bulk_commit_group +llvm.func @llvm_nvvm_cp_async_bulk_commit_group() { + // CHECK: call void @llvm.nvvm.cp.async.bulk.commit.group() + nvvm.cp.async.bulk.commit.group + llvm.return +} + +// CHECK-LABEL: @llvm_nvvm_cp_async_bulk_wait_group +llvm.func @llvm_nvvm_cp_async_bulk_wait_group() { + // CHECK: call void @llvm.nvvm.cp.async.bulk.wait.group(i32 0) + nvvm.cp.async.bulk.wait_group 0 + // CHECK: call void @llvm.nvvm.cp.async.bulk.wait.group(i32 3) + nvvm.cp.async.bulk.wait_group 3 + // CHECK: call void @llvm.nvvm.cp.async.bulk.wait.group.read(i32 0) + nvvm.cp.async.bulk.wait_group 0 {read} + // CHECK: call void @llvm.nvvm.cp.async.bulk.wait.group.read(i32 3) + nvvm.cp.async.bulk.wait_group 3 {read} + llvm.return +} + // 
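The two group ops are meant to be used as a pair; a usage sketch (not one of the tests added here), with the bulk-copy initiation elided:

```mlir
llvm.func @bulk_group_usage() {
  // ... one or more cp.async.bulk copies would be initiated here ...
  // Fold every prior uncommitted bulk copy into a single async-group.
  nvvm.cp.async.bulk.commit.group
  // Block until at most one bulk async-group is still pending.
  nvvm.cp.async.bulk.wait_group 1
  // The {read} form relaxes the wait: it returns once the grouped copies
  // have finished reading their source operands.
  nvvm.cp.async.bulk.wait_group 0 {read}
  llvm.return
}
```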
CHECK-LABEL: @ld_matrix llvm.func @ld_matrix(%arg0: !llvm.ptr<3>) { // CHECK: call i32 @llvm.nvvm.ldmatrix.sync.aligned.m8n8.x1.b16.p3(ptr addrspace(3) %{{.*}}) From 5ab2d9c0d9d4ea381cc2f5cbb047c1b1847d0b21 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 22 Jan 2024 00:04:46 -0800 Subject: [PATCH 333/843] [RISCV] Arrange RISCVFeatures.td into sections of related extensions. NFC (#78790) Put I and Zi* together. Put F/D/Zf* together. Put A and Za* together, etc. --- llvm/lib/Target/RISCV/RISCVFeatures.td | 578 +++++++++++++------------ 1 file changed, 304 insertions(+), 274 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 115da8c4a1a93..cbb096ba20ae6 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -10,12 +10,7 @@ // RISC-V subtarget features and instruction predicates. //===----------------------------------------------------------------------===// -def FeatureStdExtZicsr - : SubtargetFeature<"zicsr", "HasStdExtZicsr", "true", - "'zicsr' (CSRs)">; -def HasStdExtZicsr : Predicate<"Subtarget->hasStdExtZicsr()">, - AssemblerPredicate<(all_of FeatureStdExtZicsr), - "'Zicsr' (CSRs)">; +// Integer Extensions def FeatureStdExtI : SubtargetFeature<"i", "HasStdExtI", "true", @@ -23,6 +18,115 @@ def FeatureStdExtI def HasStdExtI : Predicate<"Subtarget->hasStdExtI()">, AssemblerPredicate<(all_of FeatureStdExtI), "'I' (Base Integer Instruction Set)">; +def FeatureStdExtZic64b + : SubtargetFeature<"zic64b", "HasStdExtZic64b", "true", + "'Zic64b' (Cache Block Size Is 64 Bytes)">; + +def FeatureStdExtZicbom + : SubtargetFeature<"zicbom", "HasStdExtZicbom", "true", + "'Zicbom' (Cache-Block Management Instructions)">; +def HasStdExtZicbom : Predicate<"Subtarget->hasStdExtZicbom()">, + AssemblerPredicate<(all_of FeatureStdExtZicbom), + "'Zicbom' (Cache-Block Management Instructions)">; + +def FeatureStdExtZicbop + : SubtargetFeature<"zicbop", "HasStdExtZicbop", "true", + "'Zicbop' (Cache-Block Prefetch Instructions)">; +def HasStdExtZicbop : Predicate<"Subtarget->hasStdExtZicbop()">, + AssemblerPredicate<(all_of FeatureStdExtZicbop), + "'Zicbop' (Cache-Block Prefetch Instructions)">; + +def FeatureStdExtZicboz + : SubtargetFeature<"zicboz", "HasStdExtZicboz", "true", + "'Zicboz' (Cache-Block Zero Instructions)">; +def HasStdExtZicboz : Predicate<"Subtarget->hasStdExtZicboz()">, + AssemblerPredicate<(all_of FeatureStdExtZicboz), + "'Zicboz' (Cache-Block Zero Instructions)">; + +def FeatureStdExtZiccamoa + : SubtargetFeature<"ziccamoa", "HasStdExtZiccamoa", "true", + "'Ziccamoa' (Main Memory Supports All Atomics in A)">; + +def FeatureStdExtZiccif + : SubtargetFeature<"ziccif", "HasStdExtZiccif", "true", + "'Ziccif' (Main Memory Supports Instruction Fetch with Atomicity Requirement)">; + +def FeatureStdExtZicclsm + : SubtargetFeature<"zicclsm", "HasStdExtZicclsm", "true", + "'Zicclsm' (Main Memory Supports Misaligned Loads/Stores)">; + +def FeatureStdExtZiccrse + : SubtargetFeature<"ziccrse", "HasStdExtZiccrse", "true", + "'Ziccrse' (Main Memory Supports Forward Progress on LR/SC Sequences)">; + +def FeatureStdExtZicsr + : SubtargetFeature<"zicsr", "HasStdExtZicsr", "true", + "'zicsr' (CSRs)">; +def HasStdExtZicsr : Predicate<"Subtarget->hasStdExtZicsr()">, + AssemblerPredicate<(all_of FeatureStdExtZicsr), + "'Zicsr' (CSRs)">; + +def FeatureStdExtZicntr + : SubtargetFeature<"zicntr", "HasStdExtZicntr", "true", + "'Zicntr' (Base Counters and Timers)", + [FeatureStdExtZicsr]>; + +def 
FeatureStdExtZicond + : SubtargetFeature<"experimental-zicond", "HasStdExtZicond", "true", + "'Zicond' (Integer Conditional Operations)">; +def HasStdExtZicond : Predicate<"Subtarget->hasStdExtZicond()">, + AssemblerPredicate<(all_of FeatureStdExtZicond), + "'Zicond' (Integer Conditional Operations)">; + +def FeatureStdExtZifencei + : SubtargetFeature<"zifencei", "HasStdExtZifencei", "true", + "'Zifencei' (fence.i)">; +def HasStdExtZifencei : Predicate<"Subtarget->hasStdExtZifencei()">, + AssemblerPredicate<(all_of FeatureStdExtZifencei), + "'Zifencei' (fence.i)">; + +def FeatureStdExtZihintpause + : SubtargetFeature<"zihintpause", "HasStdExtZihintpause", "true", + "'Zihintpause' (Pause Hint)">; +def HasStdExtZihintpause : Predicate<"Subtarget->hasStdExtZihintpause()">, + AssemblerPredicate<(all_of FeatureStdExtZihintpause), + "'Zihintpause' (Pause Hint)">; + +def FeatureStdExtZihintntl + : SubtargetFeature<"zihintntl", "HasStdExtZihintntl", "true", + "'Zihintntl' (Non-Temporal Locality Hints)">; +def HasStdExtZihintntl : Predicate<"Subtarget->hasStdExtZihintntl()">, + AssemblerPredicate<(all_of FeatureStdExtZihintntl), + "'Zihintntl' (Non-Temporal Locality Hints)">; + +def FeatureStdExtZihpm + : SubtargetFeature<"zihpm", "HasStdExtZihpm", "true", + "'Zihpm' (Hardware Performance Counters)", + [FeatureStdExtZicsr]>; + +def FeatureStdExtZimop : SubtargetFeature<"experimental-zimop", "HasStdExtZimop", "true", + "'Zimop' (May-Be-Operations)">; +def HasStdExtZimop : Predicate<"Subtarget->hasStdExtZimop()">, + AssemblerPredicate<(all_of FeatureStdExtZimop), + "'Zimop' (May-Be-Operations)">; + +def FeatureStdExtZicfilp + : SubtargetFeature<"experimental-zicfilp", "HasStdExtZicfilp", "true", + "'Zicfilp' (Landing pad)">; +def HasStdExtZicfilp : Predicate<"Subtarget->hasStdExtZicfilp()">, + AssemblerPredicate<(all_of FeatureStdExtZicfilp), + "'Zicfilp' (Landing pad)">; + +def FeatureStdExtZicfiss + : SubtargetFeature<"experimental-zicfiss", "HasStdExtZicfiss", "true", + "'Zicfiss' (Shadow stack)", + [FeatureStdExtZicsr, FeatureStdExtZimop]>; +def HasStdExtZicfiss : Predicate<"Subtarget->hasStdExtZicfiss()">, + AssemblerPredicate<(all_of FeatureStdExtZicfiss), + "'Zicfiss' (Shadow stack)">; +def NoHasStdExtZicfiss : Predicate<"!Subtarget->hasStdExtZicfiss()">; + +// Multiply Extensions def FeatureStdExtM : SubtargetFeature<"m", "HasStdExtM", "true", @@ -41,6 +145,8 @@ def HasStdExtMOrZmmul "'M' (Integer Multiplication and Division) or " "'Zmmul' (Integer Multiplication)">; +// Atomic Extensions + def FeatureStdExtA : SubtargetFeature<"a", "HasStdExtA", "true", "'A' (Atomic Instructions)">; @@ -48,6 +154,36 @@ def HasStdExtA : Predicate<"Subtarget->hasStdExtA()">, AssemblerPredicate<(all_of FeatureStdExtA), "'A' (Atomic Instructions)">; +def FeatureStdExtZtso + : SubtargetFeature<"experimental-ztso", "HasStdExtZtso", "true", + "'Ztso' (Memory Model - Total Store Order)">; +def HasStdExtZtso : Predicate<"Subtarget->hasStdExtZtso()">, + AssemblerPredicate<(all_of FeatureStdExtZtso), + "'Ztso' (Memory Model - Total Store Order)">; +def NotHasStdExtZtso : Predicate<"!Subtarget->hasStdExtZtso()">; + +def FeatureStdExtZa64rs : SubtargetFeature<"za64rs", "HasStdExtZa64rs", "true", + "'Za64rs' (Reservation Set Size of at Most 64 Bytes)">; + +def FeatureStdExtZa128rs : SubtargetFeature<"za128rs", "HasStdExtZa128rs", "true", + "'Za128rs' (Reservation Set Size of at Most 128 Bytes)">; + +def FeatureStdExtZacas + : SubtargetFeature<"experimental-zacas", "HasStdExtZacas", "true", + "'Zacas' (Atomic 
Compare-And-Swap Instructions)">; +def HasStdExtZacas : Predicate<"Subtarget->hasStdExtZacas()">, + AssemblerPredicate<(all_of FeatureStdExtZacas), + "'Zacas' (Atomic Compare-And-Swap Instructions)">; +def NoStdExtZacas : Predicate<"!Subtarget->hasStdExtZacas()">; + +def FeatureStdExtZawrs : SubtargetFeature<"zawrs", "HasStdExtZawrs", "true", + "'Zawrs' (Wait on Reservation Set)">; +def HasStdExtZawrs : Predicate<"Subtarget->hasStdExtZawrs()">, + AssemblerPredicate<(all_of FeatureStdExtZawrs), + "'Zawrs' (Wait on Reservation Set)">; + +// Floating Point Extensions + def FeatureStdExtF : SubtargetFeature<"f", "HasStdExtF", "true", "'F' (Single-Precision Floating-Point)", @@ -72,53 +208,6 @@ def HasStdExtH : Predicate<"Subtarget->hasStdExtH()">, AssemblerPredicate<(all_of FeatureStdExtH), "'H' (Hypervisor)">; -def FeatureStdExtZihintpause - : SubtargetFeature<"zihintpause", "HasStdExtZihintpause", "true", - "'Zihintpause' (Pause Hint)">; -def HasStdExtZihintpause : Predicate<"Subtarget->hasStdExtZihintpause()">, - AssemblerPredicate<(all_of FeatureStdExtZihintpause), - "'Zihintpause' (Pause Hint)">; - -def FeatureStdExtZihintntl - : SubtargetFeature<"zihintntl", "HasStdExtZihintntl", "true", - "'Zihintntl' (Non-Temporal Locality Hints)">; -def HasStdExtZihintntl : Predicate<"Subtarget->hasStdExtZihintntl()">, - AssemblerPredicate<(all_of FeatureStdExtZihintntl), - "'Zihintntl' (Non-Temporal Locality Hints)">; - -def FeatureStdExtZifencei - : SubtargetFeature<"zifencei", "HasStdExtZifencei", "true", - "'Zifencei' (fence.i)">; -def HasStdExtZifencei : Predicate<"Subtarget->hasStdExtZifencei()">, - AssemblerPredicate<(all_of FeatureStdExtZifencei), - "'Zifencei' (fence.i)">; - -def FeatureStdExtZiccamoa - : SubtargetFeature<"ziccamoa", "HasStdExtZiccamoa", "true", - "'Ziccamoa' (Main Memory Supports All Atomics in A)">; - -def FeatureStdExtZiccif - : SubtargetFeature<"ziccif", "HasStdExtZiccif", "true", - "'Ziccif' (Main Memory Supports Instruction Fetch with Atomicity Requirement)">; - -def FeatureStdExtZicclsm - : SubtargetFeature<"zicclsm", "HasStdExtZicclsm", "true", - "'Zicclsm' (Main Memory Supports Misaligned Loads/Stores)">; - -def FeatureStdExtZiccrse - : SubtargetFeature<"ziccrse", "HasStdExtZiccrse", "true", - "'Ziccrse' (Main Memory Supports Forward Progress on LR/SC Sequences)">; - -def FeatureStdExtZicntr - : SubtargetFeature<"zicntr", "HasStdExtZicntr", "true", - "'Zicntr' (Base Counters and Timers)", - [FeatureStdExtZicsr]>; - -def FeatureStdExtZihpm - : SubtargetFeature<"zihpm", "HasStdExtZihpm", "true", - "'Zihpm' (Hardware Performance Counters)", - [FeatureStdExtZicsr]>; - def FeatureStdExtZfhmin : SubtargetFeature<"zfhmin", "HasStdExtZfhmin", "true", "'Zfhmin' (Half-Precision Floating-Point Minimal)", @@ -136,12 +225,37 @@ def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">, "'Zfh' (Half-Precision Floating-Point)">; def NoStdExtZfh : Predicate<"!Subtarget->hasStdExtZfh()">; +// FIXME: Remove this. 
def HasStdExtZfhOrZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">, AssemblerPredicate<(all_of FeatureStdExtZfhmin), "'Zfh' (Half-Precision Floating-Point) or " "'Zfhmin' (Half-Precision Floating-Point Minimal)">; +def FeatureStdExtZfbfmin + : SubtargetFeature<"experimental-zfbfmin", "HasStdExtZfbfmin", "true", + "'Zfbfmin' (Scalar BF16 Converts)", + [FeatureStdExtF]>; +def HasStdExtZfbfmin : Predicate<"Subtarget->hasStdExtZfbfmin()">, + AssemblerPredicate<(all_of FeatureStdExtZfbfmin), + "'Zfbfmin' (Scalar BF16 Converts)">; + +def HasHalfFPLoadStoreMove + : Predicate<"Subtarget->hasHalfFPLoadStoreMove()">, + AssemblerPredicate<(any_of FeatureStdExtZfh, FeatureStdExtZfhmin, + FeatureStdExtZfbfmin), + "'Zfh' (Half-Precision Floating-Point) or " + "'Zfhmin' (Half-Precision Floating-Point Minimal) or " + "'Zfbfmin' (Scalar BF16 Converts)">; + +def FeatureStdExtZfa + : SubtargetFeature<"zfa", "HasStdExtZfa", "true", + "'Zfa' (Additional Floating-Point)", + [FeatureStdExtF]>; +def HasStdExtZfa : Predicate<"Subtarget->hasStdExtZfa()">, + AssemblerPredicate<(all_of FeatureStdExtZfa), + "'Zfa' (Additional Floating-Point)">; + def FeatureStdExtZfinx : SubtargetFeature<"zfinx", "HasStdExtZfinx", "true", "'Zfinx' (Float in Integer)", @@ -181,13 +295,7 @@ def HasStdExtZhinxOrZhinxmin "'Zhinx' (Half Float in Integer) or " "'Zhinxmin' (Half Float in Integer Minimal)">; -def FeatureStdExtZfa - : SubtargetFeature<"zfa", "HasStdExtZfa", "true", - "'Zfa' (Additional Floating-Point)", - [FeatureStdExtF]>; -def HasStdExtZfa : Predicate<"Subtarget->hasStdExtZfa()">, - AssemblerPredicate<(all_of FeatureStdExtZfa), - "'Zfa' (Additional Floating-Point)">; +// Compressed Extensions def FeatureStdExtC : SubtargetFeature<"c", "HasStdExtC", "true", @@ -196,6 +304,88 @@ def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">, AssemblerPredicate<(all_of FeatureStdExtC), "'C' (Compressed Instructions)">; +def FeatureNoRVCHints + : SubtargetFeature<"no-rvc-hints", "EnableRVCHintInstrs", "false", + "Disable RVC Hint Instructions.">; +def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">, + AssemblerPredicate<(all_of(not FeatureNoRVCHints)), + "RVC Hint Instructions">; + +def FeatureStdExtZca + : SubtargetFeature<"zca", "HasStdExtZca", "true", + "'Zca' (part of the C extension, excluding compressed " + "floating point loads/stores)">; + +def HasStdExtCOrZca + : Predicate<"Subtarget->hasStdExtCOrZca()">, + AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZca), + "'C' (Compressed Instructions) or " + "'Zca' (part of the C extension, excluding " + "compressed floating point loads/stores)">; + +def FeatureStdExtZcb + : SubtargetFeature<"zcb", "HasStdExtZcb", "true", + "'Zcb' (Compressed basic bit manipulation instructions)", + [FeatureStdExtZca]>; +def HasStdExtZcb : Predicate<"Subtarget->hasStdExtZcb()">, + AssemblerPredicate<(all_of FeatureStdExtZcb), + "'Zcb' (Compressed basic bit manipulation instructions)">; + +def FeatureStdExtZcd + : SubtargetFeature<"zcd", "HasStdExtZcd", "true", + "'Zcd' (Compressed Double-Precision Floating-Point Instructions)", + [FeatureStdExtZca]>; + +def HasStdExtCOrZcd + : Predicate<"Subtarget->hasStdExtC() || Subtarget->hasStdExtZcd()">, + AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZcd), + "'C' (Compressed Instructions) or " + "'Zcd' (Compressed Double-Precision Floating-Point Instructions)">; + +def FeatureStdExtZcf + : SubtargetFeature<"zcf", "HasStdExtZcf", "true", + "'Zcf' (Compressed Single-Precision Floating-Point Instructions)", + [FeatureStdExtZca]>; + +def 
FeatureStdExtZcmp + : SubtargetFeature<"zcmp", "HasStdExtZcmp", "true", + "'Zcmp' (sequenced instuctions for code-size reduction)", + [FeatureStdExtZca]>; +def HasStdExtZcmp : Predicate<"Subtarget->hasStdExtZcmp() && !Subtarget->hasStdExtC()">, + AssemblerPredicate<(all_of FeatureStdExtZcmp), + "'Zcmp' (sequenced instuctions for code-size reduction)">; + +def FeatureStdExtZcmt + : SubtargetFeature<"zcmt", "HasStdExtZcmt", "true", + "'Zcmt' (table jump instuctions for code-size reduction)", + [FeatureStdExtZca, FeatureStdExtZicsr]>; +def HasStdExtZcmt : Predicate<"Subtarget->hasStdExtZcmt()">, + AssemblerPredicate<(all_of FeatureStdExtZcmt), + "'Zcmt' (table jump instuctions for code-size reduction)">; + +def FeatureStdExtZce + : SubtargetFeature<"zce", "HasStdExtZce", "true", + "'Zce' (Compressed extensions for microcontrollers)", + [FeatureStdExtZca, FeatureStdExtZcb, FeatureStdExtZcmp, + FeatureStdExtZcmt]>; + +def HasStdExtCOrZcfOrZce + : Predicate<"Subtarget->hasStdExtC() || Subtarget->hasStdExtZcf() " + "Subtarget->hasStdExtZce()">, + AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZcf, + FeatureStdExtZce), + "'C' (Compressed Instructions) or " + "'Zcf' (Compressed Single-Precision Floating-Point Instructions)">; + +def FeatureStdExtZcmop : SubtargetFeature<"experimental-zcmop", "HasStdExtZcmop", "true", + "'Zcmop' (Compressed May-Be-Operations)", + [FeatureStdExtZca]>; +def HasStdExtZcmop : Predicate<"Subtarget->hasStdExtZcmop()">, + AssemblerPredicate<(all_of FeatureStdExtZcmop), + "'Zcmop' (Compressed May-Be-Operations)">; + +// Bitmanip Extensions + def FeatureStdExtZba : SubtargetFeature<"zba", "HasStdExtZba", "true", "'Zba' (Address Generation Instructions)">; @@ -225,6 +415,8 @@ def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">, AssemblerPredicate<(all_of FeatureStdExtZbs), "'Zbs' (Single-Bit Instructions)">; +// Bitmanip Extensions for Cryptography Extensions + def FeatureStdExtZbkb : SubtargetFeature<"zbkb", "HasStdExtZbkb", "true", "'Zbkb' (Bitmanip instructions for Cryptography)">; @@ -264,6 +456,8 @@ def HasStdExtZbcOrZbkc "'Zbkc' (Carry-less multiply instructions " "for Cryptography)">; +// Cryptography Extensions + def FeatureStdExtZknd : SubtargetFeature<"zknd", "HasStdExtZknd", "true", "'Zknd' (NIST Suite: AES Decryption)">; @@ -345,78 +539,7 @@ def FeatureStdExtZk FeatureStdExtZkr, FeatureStdExtZkt]>; -def FeatureStdExtZca - : SubtargetFeature<"zca", "HasStdExtZca", "true", - "'Zca' (part of the C extension, excluding compressed " - "floating point loads/stores)">; - -def HasStdExtCOrZca - : Predicate<"Subtarget->hasStdExtCOrZca()">, - AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZca), - "'C' (Compressed Instructions) or " - "'Zca' (part of the C extension, excluding " - "compressed floating point loads/stores)">; - -def FeatureStdExtZcb - : SubtargetFeature<"zcb", "HasStdExtZcb", "true", - "'Zcb' (Compressed basic bit manipulation instructions)", - [FeatureStdExtZca]>; -def HasStdExtZcb : Predicate<"Subtarget->hasStdExtZcb()">, - AssemblerPredicate<(all_of FeatureStdExtZcb), - "'Zcb' (Compressed basic bit manipulation instructions)">; - -def FeatureStdExtZcd - : SubtargetFeature<"zcd", "HasStdExtZcd", "true", - "'Zcd' (Compressed Double-Precision Floating-Point Instructions)", - [FeatureStdExtZca]>; - -def HasStdExtCOrZcd - : Predicate<"Subtarget->hasStdExtC() || Subtarget->hasStdExtZcd()">, - AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZcd), - "'C' (Compressed Instructions) or " - "'Zcd' (Compressed Double-Precision 
Floating-Point Instructions)">; - -def FeatureStdExtZcf - : SubtargetFeature<"zcf", "HasStdExtZcf", "true", - "'Zcf' (Compressed Single-Precision Floating-Point Instructions)", - [FeatureStdExtZca]>; - -def FeatureStdExtZcmp - : SubtargetFeature<"zcmp", "HasStdExtZcmp", "true", - "'Zcmp' (sequenced instuctions for code-size reduction)", - [FeatureStdExtZca]>; -def HasStdExtZcmp : Predicate<"Subtarget->hasStdExtZcmp() && !Subtarget->hasStdExtC()">, - AssemblerPredicate<(all_of FeatureStdExtZcmp), - "'Zcmp' (sequenced instuctions for code-size reduction)">; - -def FeatureStdExtZcmt - : SubtargetFeature<"zcmt", "HasStdExtZcmt", "true", - "'Zcmt' (table jump instuctions for code-size reduction)", - [FeatureStdExtZca, FeatureStdExtZicsr]>; -def HasStdExtZcmt : Predicate<"Subtarget->hasStdExtZcmt()">, - AssemblerPredicate<(all_of FeatureStdExtZcmt), - "'Zcmt' (table jump instuctions for code-size reduction)">; - -def FeatureStdExtZce - : SubtargetFeature<"zce", "HasStdExtZce", "true", - "'Zce' (Compressed extensions for microcontrollers)", - [FeatureStdExtZca, FeatureStdExtZcb, FeatureStdExtZcmp, - FeatureStdExtZcmt]>; - -def HasStdExtCOrZcfOrZce - : Predicate<"Subtarget->hasStdExtC() || Subtarget->hasStdExtZcf() " - "Subtarget->hasStdExtZce()">, - AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZcf, - FeatureStdExtZce), - "'C' (Compressed Instructions) or " - "'Zcf' (Compressed Single-Precision Floating-Point Instructions)">; - -def FeatureNoRVCHints - : SubtargetFeature<"no-rvc-hints", "EnableRVCHintInstrs", "false", - "Disable RVC Hint Instructions.">; -def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">, - AssemblerPredicate<(all_of(not FeatureNoRVCHints)), - "RVC Hint Instructions">; +// Vector Extensions def FeatureStdExtZvl32b : SubtargetFeature<"zvl32b", "ZvlLen", "32", "'Zvl' (Minimum Vector Length) 32">; @@ -464,34 +587,6 @@ def FeatureStdExtV "'V' (Vector Extension for Application Processors)", [FeatureStdExtZvl128b, FeatureStdExtZve64d]>; -def HasVInstructions : Predicate<"Subtarget->hasVInstructions()">, - AssemblerPredicate< - (any_of FeatureStdExtZve32x), - "'V' (Vector Extension for Application Processors), 'Zve32x' " - "(Vector Extensions for Embedded Processors)">; -def HasVInstructionsI64 : Predicate<"Subtarget->hasVInstructionsI64()">, - AssemblerPredicate< - (any_of FeatureStdExtZve64x), - "'V' (Vector Extension for Application Processors) or 'Zve64x' " - "(Vector Extensions for Embedded Processors)">; -def HasVInstructionsAnyF : Predicate<"Subtarget->hasVInstructionsAnyF()">, - AssemblerPredicate< - (any_of FeatureStdExtZve32f), - "'V' (Vector Extension for Application Processors), 'Zve32f' " - "(Vector Extensions for Embedded Processors)">; - -def HasVInstructionsF64 : Predicate<"Subtarget->hasVInstructionsF64()">; - -def HasVInstructionsFullMultiply : Predicate<"Subtarget->hasVInstructionsFullMultiply()">; - -def FeatureStdExtZfbfmin - : SubtargetFeature<"experimental-zfbfmin", "HasStdExtZfbfmin", "true", - "'Zfbfmin' (Scalar BF16 Converts)", - [FeatureStdExtF]>; -def HasStdExtZfbfmin : Predicate<"Subtarget->hasStdExtZfbfmin()">, - AssemblerPredicate<(all_of FeatureStdExtZfbfmin), - "'Zfbfmin' (Scalar BF16 Converts)">; - def FeatureStdExtZvfbfmin : SubtargetFeature<"experimental-zvfbfmin", "HasStdExtZvfbfmin", "true", "'Zvbfmin' (Vector BF16 Converts)", @@ -508,8 +603,6 @@ def HasStdExtZvfbfwma : Predicate<"Subtarget->hasStdExtZvfbfwma()">, AssemblerPredicate<(all_of FeatureStdExtZvfbfwma), "'Zvfbfwma' (Vector BF16 widening mul-add)">; -def 
HasVInstructionsBF16 : Predicate<"Subtarget->hasVInstructionsBF16()">; - def FeatureStdExtZvfhmin : SubtargetFeature<"zvfhmin", "HasStdExtZvfhmin", "true", "'Zvfhmin' (Vector Half-Precision Floating-Point Minimal)", @@ -520,78 +613,13 @@ def FeatureStdExtZvfh "'Zvfh' (Vector Half-Precision Floating-Point)", [FeatureStdExtZvfhmin, FeatureStdExtZfhmin]>; -def HasVInstructionsF16 : Predicate<"Subtarget->hasVInstructionsF16()">; - -def HasVInstructionsF16Minimal : Predicate<"Subtarget->hasVInstructionsF16Minimal()">, - AssemblerPredicate<(any_of FeatureStdExtZvfhmin, FeatureStdExtZvfh), - "'Zvfhmin' (Vector Half-Precision Floating-Point Minimal) or " - "'Zvfh' (Vector Half-Precision Floating-Point)">; - def HasStdExtZfhOrZvfh : Predicate<"Subtarget->hasStdExtZfh() || Subtarget->hasStdExtZvfh()">, AssemblerPredicate<(any_of FeatureStdExtZfh, FeatureStdExtZvfh), "'Zfh' (Half-Precision Floating-Point) or " "'Zvfh' (Vector Half-Precision Floating-Point)">; -def FeatureStdExtZic64b - : SubtargetFeature<"zic64b", "HasStdExtZic64b", "true", - "'Zic64b' (Cache Block Size Is 64 Bytes)">; - -def FeatureStdExtZicbom - : SubtargetFeature<"zicbom", "HasStdExtZicbom", "true", - "'Zicbom' (Cache-Block Management Instructions)">; -def HasStdExtZicbom : Predicate<"Subtarget->hasStdExtZicbom()">, - AssemblerPredicate<(all_of FeatureStdExtZicbom), - "'Zicbom' (Cache-Block Management Instructions)">; - -def FeatureStdExtZicboz - : SubtargetFeature<"zicboz", "HasStdExtZicboz", "true", - "'Zicboz' (Cache-Block Zero Instructions)">; -def HasStdExtZicboz : Predicate<"Subtarget->hasStdExtZicboz()">, - AssemblerPredicate<(all_of FeatureStdExtZicboz), - "'Zicboz' (Cache-Block Zero Instructions)">; - -def FeatureStdExtZicbop - : SubtargetFeature<"zicbop", "HasStdExtZicbop", "true", - "'Zicbop' (Cache-Block Prefetch Instructions)">; -def HasStdExtZicbop : Predicate<"Subtarget->hasStdExtZicbop()">, - AssemblerPredicate<(all_of FeatureStdExtZicbop), - "'Zicbop' (Cache-Block Prefetch Instructions)">; - -def FeatureStdExtSvnapot - : SubtargetFeature<"svnapot", "HasStdExtSvnapot", "true", - "'Svnapot' (NAPOT Translation Contiguity)">; - -def FeatureStdExtSvpbmt - : SubtargetFeature<"svpbmt", "HasStdExtSvpbmt", "true", - "'Svpbmt' (Page-Based Memory Types)">; - -def FeatureStdExtSvinval - : SubtargetFeature<"svinval", "HasStdExtSvinval", "true", - "'Svinval' (Fine-Grained Address-Translation Cache Invalidation)">; -def HasStdExtSvinval : Predicate<"Subtarget->hasStdExtSvinval()">, - AssemblerPredicate<(all_of FeatureStdExtSvinval), - "'Svinval' (Fine-Grained Address-Translation Cache Invalidation)">; - -def FeatureStdExtZtso - : SubtargetFeature<"experimental-ztso", "HasStdExtZtso", "true", - "'Ztso' (Memory Model - Total Store Order)">; -def HasStdExtZtso : Predicate<"Subtarget->hasStdExtZtso()">, - AssemblerPredicate<(all_of FeatureStdExtZtso), - "'Ztso' (Memory Model - Total Store Order)">; -def NotHasStdExtZtso : Predicate<"!Subtarget->hasStdExtZtso()">; - -def FeatureStdExtZa64rs : SubtargetFeature<"za64rs", "HasStdExtZa64rs", "true", - "'Za64rs' (Reservation Set Size of at Most 64 Bytes)">; - -def FeatureStdExtZa128rs : SubtargetFeature<"za128rs", "HasStdExtZa128rs", "true", - "'Za128rs' (Reservation Set Size of at Most 128 Bytes)">; - -def FeatureStdExtZawrs : SubtargetFeature<"zawrs", "HasStdExtZawrs", "true", - "'Zawrs' (Wait on Reservation Set)">; -def HasStdExtZawrs : Predicate<"Subtarget->hasStdExtZawrs()">, - AssemblerPredicate<(all_of FeatureStdExtZawrs), - "'Zawrs' (Wait on Reservation Set)">; +// Vector 
Cryptography and Bitmanip Extensions def FeatureStdExtZvkb : SubtargetFeature<"zvkb", "HasStdExtZvkb", "true", @@ -702,47 +730,41 @@ def FeatureStdExtZvksg "'Zvksg' (shorthand for 'Zvks' and 'Zvkg')", [FeatureStdExtZvks, FeatureStdExtZvkg]>; -def FeatureStdExtZicfilp - : SubtargetFeature<"experimental-zicfilp", "HasStdExtZicfilp", "true", - "'Zicfilp' (Landing pad)">; -def HasStdExtZicfilp : Predicate<"Subtarget->hasStdExtZicfilp()">, - AssemblerPredicate<(all_of FeatureStdExtZicfilp), - "'Zicfilp' (Landing pad)">; +// Vector instruction predicates -def FeatureStdExtZicond - : SubtargetFeature<"experimental-zicond", "HasStdExtZicond", "true", - "'Zicond' (Integer Conditional Operations)">; -def HasStdExtZicond : Predicate<"Subtarget->hasStdExtZicond()">, - AssemblerPredicate<(all_of FeatureStdExtZicond), - "'Zicond' (Integer Conditional Operations)">; +def HasVInstructions : Predicate<"Subtarget->hasVInstructions()">, + AssemblerPredicate< + (any_of FeatureStdExtZve32x), + "'V' (Vector Extension for Application Processors), 'Zve32x' " + "(Vector Extensions for Embedded Processors)">; +def HasVInstructionsI64 : Predicate<"Subtarget->hasVInstructionsI64()">, + AssemblerPredicate< + (any_of FeatureStdExtZve64x), + "'V' (Vector Extension for Application Processors) or 'Zve64x' " + "(Vector Extensions for Embedded Processors)">; +def HasVInstructionsAnyF : Predicate<"Subtarget->hasVInstructionsAnyF()">, + AssemblerPredicate< + (any_of FeatureStdExtZve32f), + "'V' (Vector Extension for Application Processors), 'Zve32f' " + "(Vector Extensions for Embedded Processors)">; -def FeatureStdExtZimop : SubtargetFeature<"experimental-zimop", "HasStdExtZimop", "true", - "'Zimop' (May-Be-Operations)">; -def HasStdExtZimop : Predicate<"Subtarget->hasStdExtZimop()">, - AssemblerPredicate<(all_of FeatureStdExtZimop), - "'Zimop' (May-Be-Operations)">; +def HasVInstructionsF16Minimal : Predicate<"Subtarget->hasVInstructionsF16Minimal()">, + AssemblerPredicate<(any_of FeatureStdExtZvfhmin, FeatureStdExtZvfh), + "'Zvfhmin' (Vector Half-Precision Floating-Point Minimal) or " + "'Zvfh' (Vector Half-Precision Floating-Point)">; -def FeatureStdExtZcmop : SubtargetFeature<"experimental-zcmop", "HasStdExtZcmop", "true", - "'Zcmop' (Compressed May-Be-Operations)", - [FeatureStdExtZca]>; -def HasStdExtZcmop : Predicate<"Subtarget->hasStdExtZcmop()">, - AssemblerPredicate<(all_of FeatureStdExtZcmop), - "'Zcmop' (Compressed May-Be-Operations)">; +def HasVInstructionsBF16 : Predicate<"Subtarget->hasVInstructionsBF16()">; +def HasVInstructionsF16 : Predicate<"Subtarget->hasVInstructionsF16()">; +def HasVInstructionsF64 : Predicate<"Subtarget->hasVInstructionsF64()">; -def FeatureStdExtZicfiss - : SubtargetFeature<"experimental-zicfiss", "HasStdExtZicfiss", "true", - "'Zicfiss' (Shadow stack)", - [FeatureStdExtZicsr, FeatureStdExtZimop]>; -def HasStdExtZicfiss : Predicate<"Subtarget->hasStdExtZicfiss()">, - AssemblerPredicate<(all_of FeatureStdExtZicfiss), - "'Zicfiss' (Shadow stack)">; -def NoHasStdExtZicfiss : Predicate<"!Subtarget->hasStdExtZicfiss()">; +def HasVInstructionsFullMultiply : Predicate<"Subtarget->hasVInstructionsFullMultiply()">; + +// Supervisor extensions def FeatureStdExtSmaia : SubtargetFeature<"smaia", "HasStdExtSmaia", "true", "'Smaia' (Advanced Interrupt Architecture Machine " "Level)", []>; - def FeatureStdExtSsaia : SubtargetFeature<"ssaia", "HasStdExtSsaia", "true", "'Ssaia' (Advanced Interrupt Architecture Supervisor " @@ -752,26 +774,27 @@ def FeatureStdExtSmepmp : SubtargetFeature<"smepmp", 
"HasStdExtSmepmp", "true", "'Smepmp' (Enhanced Physical Memory Protection)", []>; -def HasHalfFPLoadStoreMove - : Predicate<"Subtarget->hasHalfFPLoadStoreMove()">, - AssemblerPredicate<(any_of FeatureStdExtZfh, FeatureStdExtZfhmin, - FeatureStdExtZfbfmin), - "'Zfh' (Half-Precision Floating-Point) or " - "'Zfhmin' (Half-Precision Floating-Point Minimal) or " - "'Zfbfmin' (Scalar BF16 Converts)">; +def FeatureStdExtSvinval + : SubtargetFeature<"svinval", "HasStdExtSvinval", "true", + "'Svinval' (Fine-Grained Address-Translation Cache Invalidation)">; +def HasStdExtSvinval : Predicate<"Subtarget->hasStdExtSvinval()">, + AssemblerPredicate<(all_of FeatureStdExtSvinval), + "'Svinval' (Fine-Grained Address-Translation Cache Invalidation)">; -def FeatureStdExtZacas - : SubtargetFeature<"experimental-zacas", "HasStdExtZacas", "true", - "'Zacas' (Atomic Compare-And-Swap Instructions)">; -def HasStdExtZacas : Predicate<"Subtarget->hasStdExtZacas()">, - AssemblerPredicate<(all_of FeatureStdExtZacas), - "'Zacas' (Atomic Compare-And-Swap Instructions)">; -def NoStdExtZacas : Predicate<"!Subtarget->hasStdExtZacas()">; +def FeatureStdExtSvnapot + : SubtargetFeature<"svnapot", "HasStdExtSvnapot", "true", + "'Svnapot' (NAPOT Translation Contiguity)">; + +def FeatureStdExtSvpbmt + : SubtargetFeature<"svpbmt", "HasStdExtSvpbmt", "true", + "'Svpbmt' (Page-Based Memory Types)">; //===----------------------------------------------------------------------===// // Vendor extensions //===----------------------------------------------------------------------===// +// Ventana Extenions + def FeatureVendorXVentanaCondOps : SubtargetFeature<"xventanacondops", "HasVendorXVentanaCondOps", "true", "'XVentanaCondOps' (Ventana Conditional Ops)">; @@ -779,6 +802,8 @@ def HasVendorXVentanaCondOps : Predicate<"Subtarget->hasVendorXVentanaCondOps()" AssemblerPredicate<(all_of FeatureVendorXVentanaCondOps), "'XVentanaCondOps' (Ventana Conditional Ops)">; +// T-Head Extensions + def FeatureVendorXTHeadBa : SubtargetFeature<"xtheadba", "HasVendorXTHeadBa", "true", "'xtheadba' (T-Head address calculation instructions)">; @@ -858,6 +883,8 @@ def HasVendorXTHeadVdot : Predicate<"Subtarget->hasVendorXTHeadVdot()">, AssemblerPredicate<(all_of FeatureVendorXTHeadVdot), "'xtheadvdot' (T-Head Vector Extensions for Dot)">; +// SiFive Extensions + def FeatureVendorXSfvcp : SubtargetFeature<"xsfvcp", "HasVendorXSfvcp", "true", "'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions)", @@ -897,6 +924,9 @@ def FeatureVendorXSfvfnrclipxfqf def HasVendorXSfvfnrclipxfqf : Predicate<"Subtarget->hasVendorXSfvfnrclipxfqf()">, AssemblerPredicate<(all_of FeatureVendorXSfvfnrclipxfqf), "'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)">; + +// Core-V Extensions + def FeatureVendorXCVelw : SubtargetFeature<"xcvelw", "HasVendorXCVelw", "true", "'XCVelw' (CORE-V Event Load Word)">; From 7556626dcff15c8cc5160078a4d6ed2469eed81b Mon Sep 17 00:00:00 2001 From: Ryotaro KASUGA Date: Mon, 22 Jan 2024 17:06:37 +0900 Subject: [PATCH 334/843] [CodeGen][MachinePipeliner] Limit register pressure when scheduling (#74807) In software pipelining, when searching for the Initiation Interval (II), `MachinePipeliner` tries to reduce register pressure, but doesn't check how many variables can actually be alive at the same time. As a result, a lot of register spills/fills can be generated after register allocation, which might cause performance degradation. 
To prevent such cases, this patch adds a check phase that calculates the
maximum register pressure of the scheduled loop and rejects the schedule
if the pressure is too high. This check can be enabled by specifying
`pipeliner-register-pressure`. Additionally, the II search range is
currently fixed at 10, which is too small to find a schedule when the
above check is applied. Therefore, this patch also adds a new option
`pipeliner-ii-search-range` to specify the length of the range to search.
There is one more new option, `pipeliner-register-pressure-margin`, which
lowers the register pressure limit used by the check below the actual
limit so that the analysis is more conservative.

Discourse thread: https://discourse.llvm.org/t/considering-register-pressure-when-deciding-initiation-interval-in-machinepipeliner/74725
---
 llvm/include/llvm/CodeGen/MachinePipeliner.h |  18 +-
 llvm/lib/CodeGen/MachinePipeliner.cpp        | 482 ++++++++++++++++++-
 llvm/test/CodeGen/PowerPC/sms-regpress.mir   | 328 +++++++++++++
 3 files changed, 801 insertions(+), 27 deletions(-)
 create mode 100644 llvm/test/CodeGen/PowerPC/sms-regpress.mir

diff --git a/llvm/include/llvm/CodeGen/MachinePipeliner.h b/llvm/include/llvm/CodeGen/MachinePipeliner.h
index 04055ba9732dd..8f0a17cf99967 100644
--- a/llvm/include/llvm/CodeGen/MachinePipeliner.h
+++ b/llvm/include/llvm/CodeGen/MachinePipeliner.h
@@ -273,8 +273,8 @@ class SwingSchedulerDAG : public ScheduleDAGInstrs {
   /// Return the new base register that was stored away for the changed
   /// instruction.
-  unsigned getInstrBaseReg(SUnit *SU) {
-    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::iterator It =
+  unsigned getInstrBaseReg(SUnit *SU) const {
+    DenseMap<SUnit *, std::pair<unsigned, int64_t>>::const_iterator It =
         InstrChanges.find(SU);
     if (It != InstrChanges.end())
       return It->second.first;
@@ -639,16 +639,20 @@ class SMSchedule {
   computeUnpipelineableNodes(SwingSchedulerDAG *SSD,
                              TargetInstrInfo::PipelinerLoopInfo *PLI);

+  std::deque<SUnit *>
+  reorderInstructions(const SwingSchedulerDAG *SSD,
+                      const std::deque<SUnit *> &Instrs) const;
+
   bool
   normalizeNonPipelinedInstructions(SwingSchedulerDAG *SSD,
                                     TargetInstrInfo::PipelinerLoopInfo *PLI);
   bool isValidSchedule(SwingSchedulerDAG *SSD);
   void finalizeSchedule(SwingSchedulerDAG *SSD);
-  void orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
-                       std::deque<SUnit *> &Insts);
-  bool isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi);
-  bool isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, MachineInstr *Def,
-                             MachineOperand &MO);
+  void orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
+                       std::deque<SUnit *> &Insts) const;
+  bool isLoopCarried(const SwingSchedulerDAG *SSD, MachineInstr &Phi) const;
+  bool isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, MachineInstr *Def,
+                             MachineOperand &MO) const;
   void print(raw_ostream &os) const;
   void dump() const;
 };
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index 8cd7f4ebe88d9..5c9f0f1703a6e 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -35,6 +35,7 @@
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/ADT/MapVector.h"
 #include "llvm/ADT/PriorityQueue.h"
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SetOperations.h"
 #include "llvm/ADT/SetVector.h"
 #include "llvm/ADT/SmallPtrSet.h"
@@ -60,9 +61,12 @@
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/ModuloSchedule.h"
+#include "llvm/CodeGen/Register.h"
+#include "llvm/CodeGen/RegisterClassInfo.h"
 #include "llvm/CodeGen/RegisterPressure.h"
 #include "llvm/CodeGen/ScheduleDAG.h"
 #include "llvm/CodeGen/ScheduleDAGMutation.h"
"llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/CodeGen/TargetOpcodes.h" #include "llvm/CodeGen/TargetRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" @@ -174,6 +178,20 @@ static cl::opt ExperimentalCodeGen( cl::desc( "Use the experimental peeling code generator for software pipelining")); +static cl::opt SwpIISearchRange("pipeliner-ii-search-range", + cl::desc("Range to search for II"), + cl::Hidden, cl::init(10)); + +static cl::opt + LimitRegPressure("pipeliner-register-pressure", cl::Hidden, cl::init(false), + cl::desc("Limit register pressure of scheduled loop")); + +static cl::opt + RegPressureMargin("pipeliner-register-pressure-margin", cl::Hidden, + cl::init(5), + cl::desc("Margin representing the unused percentage of " + "the register pressure limit")); + namespace llvm { // A command line option to enable the CopyToPhi DAG mutation. @@ -484,7 +502,7 @@ void SwingSchedulerDAG::setMAX_II() { else if (II_setByPragma > 0) MAX_II = II_setByPragma; else - MAX_II = MII + 10; + MAX_II = MII + SwpIISearchRange; } /// We override the schedule function in ScheduleDAGInstrs to implement the @@ -695,7 +713,8 @@ static void getPhiRegs(MachineInstr &Phi, MachineBasicBlock *Loop, } /// Return the Phi register value that comes the loop block. -static unsigned getLoopPhiReg(MachineInstr &Phi, MachineBasicBlock *LoopBB) { +static unsigned getLoopPhiReg(const MachineInstr &Phi, + const MachineBasicBlock *LoopBB) { for (unsigned i = 1, e = Phi.getNumOperands(); i != e; i += 2) if (Phi.getOperand(i + 1).getMBB() == LoopBB) return Phi.getOperand(i).getReg(); @@ -996,6 +1015,41 @@ void SwingSchedulerDAG::changeDependences() { } } +/// Create an instruction stream that represents a single iteration and stage of +/// each instruction. This function differs from SMSchedule::finalizeSchedule in +/// that this doesn't have any side-effect to SwingSchedulerDAG. That is, this +/// function is an approximation of SMSchedule::finalizeSchedule with all +/// non-const operations removed. +static void computeScheduledInsts(const SwingSchedulerDAG *SSD, + SMSchedule &Schedule, + std::vector &OrderedInsts, + DenseMap &Stages) { + DenseMap> Instrs; + + // Move all instructions to the first stage from the later stages. + for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle(); + ++Cycle) { + for (int Stage = 0, LastStage = Schedule.getMaxStageCount(); + Stage <= LastStage; ++Stage) { + for (SUnit *SU : llvm::reverse(Schedule.getInstructions( + Cycle + Stage * Schedule.getInitiationInterval()))) { + Instrs[Cycle].push_front(SU); + } + } + } + + for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle(); + ++Cycle) { + std::deque &CycleInstrs = Instrs[Cycle]; + CycleInstrs = std::move(Schedule.reorderInstructions(SSD, CycleInstrs)); + for (SUnit *SU : CycleInstrs) { + MachineInstr *MI = SU->getInstr(); + OrderedInsts.push_back(MI); + Stages[MI] = Schedule.stageScheduled(SU); + } + } +} + namespace { // FuncUnitSorter - Comparison operator used to sort instructions by @@ -1102,6 +1156,375 @@ struct FuncUnitSorter { } }; +/// Calculate the maximum register pressure of the scheduled instructions stream +class HighRegisterPressureDetector { + MachineBasicBlock *OrigMBB; + const MachineFunction &MF; + const MachineRegisterInfo &MRI; + const TargetRegisterInfo *TRI; + + const unsigned PSetNum; + + // Indexed by PSet ID + // InitSetPressure takes into account the register pressure of live-in + // registers. 
+  // registers. It doesn't depend on how the loop is scheduled, so it's
+  // enough to calculate them once at the beginning.
+  std::vector<unsigned> InitSetPressure;
+
+  // Indexed by PSet ID
+  // Upper limit for each register pressure set
+  std::vector<unsigned> PressureSetLimit;
+
+  DenseMap<MachineInstr *, RegisterOperands> ROMap;
+
+  using Instr2LastUsesTy = DenseMap<MachineInstr *, SmallDenseSet<Register, 4>>;
+
+public:
+  using OrderedInstsTy = std::vector<MachineInstr *>;
+  using Instr2StageTy = DenseMap<MachineInstr *, unsigned>;
+
+private:
+  static void dumpRegisterPressures(const std::vector<unsigned> &Pressures) {
+    if (Pressures.size() == 0) {
+      dbgs() << "[]";
+    } else {
+      char Prefix = '[';
+      for (unsigned P : Pressures) {
+        dbgs() << Prefix << P;
+        Prefix = ' ';
+      }
+      dbgs() << ']';
+    }
+  }
+
+  void dumpPSet(Register Reg) const {
+    dbgs() << "Reg=" << printReg(Reg, TRI, 0, &MRI) << " PSet=";
+    for (auto PSetIter = MRI.getPressureSets(Reg); PSetIter.isValid();
+         ++PSetIter) {
+      dbgs() << *PSetIter << ' ';
+    }
+    dbgs() << '\n';
+  }
+
+  void increaseRegisterPressure(std::vector<unsigned> &Pressure,
+                                Register Reg) const {
+    auto PSetIter = MRI.getPressureSets(Reg);
+    unsigned Weight = PSetIter.getWeight();
+    for (; PSetIter.isValid(); ++PSetIter)
+      Pressure[*PSetIter] += Weight;
+  }
+
+  void decreaseRegisterPressure(std::vector<unsigned> &Pressure,
+                                Register Reg) const {
+    auto PSetIter = MRI.getPressureSets(Reg);
+    unsigned Weight = PSetIter.getWeight();
+    for (; PSetIter.isValid(); ++PSetIter) {
+      auto &P = Pressure[*PSetIter];
+      assert(P >= Weight &&
+             "register pressure must be greater than or equal to weight");
+      P -= Weight;
+    }
+  }
+
+  // Return true if Reg is a fixed register, for example, the stack pointer.
+  bool isFixedRegister(Register Reg) const {
+    return Reg.isPhysical() && TRI->isFixedRegister(MF, Reg.asMCReg());
+  }
+
+  bool isDefinedInThisLoop(Register Reg) const {
+    return Reg.isVirtual() && MRI.getVRegDef(Reg)->getParent() == OrigMBB;
+  }
+
+  // Search for live-in variables. They are factored into the register pressure
+  // from the beginning. Live-in variables used by every iteration should be
+  // considered alive throughout the loop. For example, the variable `c` in the
+  // following code. \code
+  //   int c = ...;
+  //   for (int i = 0; i < n; i++)
+  //     a[i] += b[i] + c;
+  // \endcode
+  void computeLiveIn() {
+    DenseSet<Register> Used;
+    for (auto &MI : *OrigMBB) {
+      if (MI.isDebugInstr())
+        continue;
+      for (auto Use : ROMap[&MI].Uses) {
+        auto Reg = Use.RegUnit;
+        // Ignore a variable that appears only on one side of a phi instruction
+        // because it's used only at the first iteration.
+        if (MI.isPHI() && Reg != getLoopPhiReg(MI, OrigMBB))
+          continue;
+        if (isFixedRegister(Reg))
+          continue;
+        if (isDefinedInThisLoop(Reg))
+          continue;
+        Used.insert(Reg);
+      }
+    }
+
+    for (auto LiveIn : Used)
+      increaseRegisterPressure(InitSetPressure, LiveIn);
+  }
+
+  // Calculate the upper limit of each pressure set.
+  void computePressureSetLimit(const RegisterClassInfo &RCI) {
+    for (unsigned PSet = 0; PSet < PSetNum; PSet++)
+      PressureSetLimit[PSet] = RCI.getRegPressureSetLimit(PSet);
+
+    // We assume fixed registers, such as the stack pointer, are already in
+    // use. Therefore we subtract the weight of the fixed registers from the
+    // limit of each pressure set in advance.
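+    // As an illustrative example (the numbers are assumed, not taken from
+    // any real target): if a pressure set has a limit of 32 and the stack
+    // pointer and the frame pointer each contribute a weight of 1 to it,
+    // the effective limit used by this check becomes 30.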
+    SmallDenseSet<Register, 8> FixedRegs;
+    for (const TargetRegisterClass *TRC : TRI->regclasses()) {
+      for (const MCPhysReg Reg : *TRC)
+        if (isFixedRegister(Reg))
+          FixedRegs.insert(Reg);
+    }
+
+    LLVM_DEBUG({
+      for (auto Reg : FixedRegs) {
+        dbgs() << printReg(Reg, TRI, 0, &MRI) << ": [";
+        const int *Sets = TRI->getRegUnitPressureSets(Reg);
+        for (; *Sets != -1; Sets++) {
+          dbgs() << TRI->getRegPressureSetName(*Sets) << ", ";
+        }
+        dbgs() << "]\n";
+      }
+    });
+
+    for (auto Reg : FixedRegs) {
+      LLVM_DEBUG(dbgs() << "fixed register: " << printReg(Reg, TRI, 0, &MRI)
+                        << "\n");
+      auto PSetIter = MRI.getPressureSets(Reg);
+      unsigned Weight = PSetIter.getWeight();
+      for (; PSetIter.isValid(); ++PSetIter) {
+        unsigned &Limit = PressureSetLimit[*PSetIter];
+        assert(Limit >= Weight &&
+               "register pressure limit must be greater than or equal to "
+               "weight");
+        Limit -= Weight;
+        LLVM_DEBUG(dbgs() << "PSet=" << *PSetIter << " Limit=" << Limit
+                          << " (decreased by " << Weight << ")\n");
+      }
+    }
+  }
+
+  // There are two patterns of last-use.
+  //   - by an instruction of the current iteration
+  //   - by a phi instruction of the next iteration (loop carried value)
+  //
+  // Furthermore, the following two groups of instructions are executed
+  // simultaneously:
+  //   - next iteration's phi instructions in i-th stage
+  //   - current iteration's instructions in (i+1)-th stage
+  //
+  // This function calculates the last-use of each register while taking into
+  // account the above two patterns.
+  Instr2LastUsesTy computeLastUses(const OrderedInstsTy &OrderedInsts,
+                                   Instr2StageTy &Stages) const {
+    // We treat virtual registers that are defined and used in this loop.
+    // The following virtual registers will be ignored:
+    //   - live-in ones
+    //   - defined but not used in the loop (potentially live-out)
+    DenseSet<Register> TargetRegs;
+    const auto UpdateTargetRegs = [this, &TargetRegs](Register Reg) {
+      if (isDefinedInThisLoop(Reg))
+        TargetRegs.insert(Reg);
+    };
+    for (MachineInstr *MI : OrderedInsts) {
+      if (MI->isPHI()) {
+        Register Reg = getLoopPhiReg(*MI, OrigMBB);
+        UpdateTargetRegs(Reg);
+      } else {
+        for (auto Use : ROMap.find(MI)->getSecond().Uses)
+          UpdateTargetRegs(Use.RegUnit);
+      }
+    }
+
+    const auto InstrScore = [&Stages](MachineInstr *MI) {
+      return Stages[MI] + MI->isPHI();
+    };
+
+    DenseMap<Register, MachineInstr *> LastUseMI;
+    for (MachineInstr *MI : llvm::reverse(OrderedInsts)) {
+      for (auto Use : ROMap.find(MI)->getSecond().Uses) {
+        auto Reg = Use.RegUnit;
+        if (!TargetRegs.contains(Reg))
+          continue;
+        auto Ite = LastUseMI.find(Reg);
+        if (Ite == LastUseMI.end()) {
+          LastUseMI[Reg] = MI;
+        } else {
+          MachineInstr *Orig = Ite->second;
+          MachineInstr *New = MI;
+          if (InstrScore(Orig) < InstrScore(New))
+            LastUseMI[Reg] = New;
+        }
+      }
+    }
+
+    Instr2LastUsesTy LastUses;
+    for (auto &Entry : LastUseMI)
+      LastUses[Entry.second].insert(Entry.first);
+    return LastUses;
+  }
+
+  // Compute the maximum register pressure of the kernel. We'll simulate #Stage
+  // iterations and check the register pressure at the point where all stages
+  // overlap.
+  //
+  // An example of an unrolled loop where #Stage is 4:
+  // Iter   i+0 i+1 i+2 i+3
+  // ------------------------
+  // Stage  0
+  // Stage  1   0
+  // Stage  2   1   0
+  // Stage  3   2   1   0   <- All stages overlap
+  //
+  std::vector<unsigned>
+  computeMaxSetPressure(const OrderedInstsTy &OrderedInsts,
+                        Instr2StageTy &Stages,
+                        const unsigned StageCount) const {
+    using RegSetTy = SmallDenseSet<Register, 16>;
+
+    // Indexed by #Iter. To treat "local" variables of each stage separately,
+    // we manage the liveness of the registers independently by iterations.
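+    // For example (illustrative): with StageCount == 3, LiveRegSets[0],
+    // LiveRegSets[1] and LiveRegSets[2] track the values produced by
+    // iterations i, i+1 and i+2 that overlap in the steady-state kernel,
+    // so the same virtual register can appear in several sets at once.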
+    SmallVector<RegSetTy> LiveRegSets(StageCount);
+
+    auto CurSetPressure = InitSetPressure;
+    auto MaxSetPressure = InitSetPressure;
+    auto LastUses = std::move(computeLastUses(OrderedInsts, Stages));
+
+    LLVM_DEBUG({
+      dbgs() << "Ordered instructions:\n";
+      for (MachineInstr *MI : OrderedInsts) {
+        dbgs() << "Stage " << Stages[MI] << ": ";
+        MI->dump();
+      }
+    });
+
+    const auto InsertReg = [this, &CurSetPressure](RegSetTy &RegSet,
+                                                   Register Reg) {
+      if (!Reg.isValid() || isFixedRegister(Reg))
+        return;
+
+      bool Inserted = RegSet.insert(Reg).second;
+      if (!Inserted)
+        return;
+
+      LLVM_DEBUG(dbgs() << "insert " << printReg(Reg, TRI, 0, &MRI) << "\n");
+      increaseRegisterPressure(CurSetPressure, Reg);
+      LLVM_DEBUG(dumpPSet(Reg));
+    };
+
+    const auto EraseReg = [this, &CurSetPressure](RegSetTy &RegSet,
+                                                  Register Reg) {
+      if (!Reg.isValid() || isFixedRegister(Reg))
+        return;
+
+      // live-in register
+      if (!RegSet.contains(Reg))
+        return;
+
+      LLVM_DEBUG(dbgs() << "erase " << printReg(Reg, TRI, 0, &MRI) << "\n");
+      RegSet.erase(Reg);
+      decreaseRegisterPressure(CurSetPressure, Reg);
+      LLVM_DEBUG(dumpPSet(Reg));
+    };
+
+    for (unsigned I = 0; I < StageCount; I++) {
+      for (MachineInstr *MI : OrderedInsts) {
+        const auto Stage = Stages[MI];
+        if (I < Stage)
+          continue;
+
+        const unsigned Iter = I - Stage;
+
+        for (auto Def : ROMap.find(MI)->getSecond().Defs)
+          InsertReg(LiveRegSets[Iter], Def.RegUnit);
+
+        for (auto LastUse : LastUses[MI]) {
+          if (MI->isPHI()) {
+            if (Iter != 0)
+              EraseReg(LiveRegSets[Iter - 1], LastUse);
+          } else {
+            EraseReg(LiveRegSets[Iter], LastUse);
+          }
+        }
+
+        for (unsigned PSet = 0; PSet < PSetNum; PSet++)
+          MaxSetPressure[PSet] =
+              std::max(MaxSetPressure[PSet], CurSetPressure[PSet]);
+
+        LLVM_DEBUG({
+          dbgs() << "CurSetPressure=";
+          dumpRegisterPressures(CurSetPressure);
+          dbgs() << " iter=" << Iter << " stage=" << Stage << ":";
+          MI->dump();
+        });
+      }
+    }
+
+    return MaxSetPressure;
+  }
+
+public:
+  HighRegisterPressureDetector(MachineBasicBlock *OrigMBB,
+                               const MachineFunction &MF)
+      : OrigMBB(OrigMBB), MF(MF), MRI(MF.getRegInfo()),
+        TRI(MF.getSubtarget().getRegisterInfo()),
+        PSetNum(TRI->getNumRegPressureSets()), InitSetPressure(PSetNum, 0),
+        PressureSetLimit(PSetNum, 0) {}
+
+  // Used to calculate register pressure, which is independent of loop
+  // scheduling.
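+  // Expected usage (matching SwingSchedulerDAG::schedulePipeline below):
+  // construct the detector once per loop, call init() once to compute
+  // live-ins and pressure limits, then call detect() for each candidate
+  // schedule.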
+  void init(const RegisterClassInfo &RCI) {
+    for (MachineInstr &MI : *OrigMBB) {
+      if (MI.isDebugInstr())
+        continue;
+      ROMap[&MI].collect(MI, *TRI, MRI, false, true);
+    }
+
+    computeLiveIn();
+    computePressureSetLimit(RCI);
+  }
+
+  // Calculate the maximum register pressures of the loop and check if they
+  // exceed the limit.
+  bool detect(const SwingSchedulerDAG *SSD, SMSchedule &Schedule,
+              const unsigned MaxStage) const {
+    assert(0 <= RegPressureMargin && RegPressureMargin <= 100 &&
+           "the percentage of the margin must be between 0 and 100");
+
+    OrderedInstsTy OrderedInsts;
+    Instr2StageTy Stages;
+    computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages);
+    const auto MaxSetPressure =
+        std::move(computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1));
+
+    LLVM_DEBUG({
+      dbgs() << "Dump MaxSetPressure:\n";
+      for (unsigned I = 0; I < MaxSetPressure.size(); I++) {
+        dbgs() << format("MaxSetPressure[%d]=%d\n", I, MaxSetPressure[I]);
+      }
+      dbgs() << '\n';
+    });
+
+    for (unsigned PSet = 0; PSet < PSetNum; PSet++) {
+      unsigned Limit = PressureSetLimit[PSet];
+      unsigned Margin = Limit * RegPressureMargin / 100;
+      LLVM_DEBUG(dbgs() << "PSet=" << PSet << " Limit=" << Limit
+                        << " Margin=" << Margin << "\n");
+      if (Limit < MaxSetPressure[PSet] + Margin) {
+        LLVM_DEBUG(
+            dbgs()
+            << "Rejected the schedule because of too high register pressure\n");
+        return true;
+      }
+    }
+    return false;
+  }
+};
+
 } // end anonymous namespace

 /// Calculate the resource constrained minimum initiation interval for the
@@ -1967,6 +2390,12 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
   }

   bool scheduleFound = false;
+  std::unique_ptr<HighRegisterPressureDetector> HRPDetector;
+  if (LimitRegPressure) {
+    HRPDetector =
+        std::make_unique<HighRegisterPressureDetector>(Loop.getHeader(), MF);
+    HRPDetector->init(RegClassInfo);
+  }
   // Keep increasing II until a valid schedule is found.
   for (unsigned II = MII; II <= MAX_II && !scheduleFound; ++II) {
     Schedule.reset();
@@ -2044,6 +2473,12 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
     // If a schedule is found, check if it is a valid schedule too.
     if (scheduleFound)
       scheduleFound = Schedule.isValidSchedule(this);
+
+    // If a schedule was found and the option is enabled, check if the schedule
+    // might generate additional register spills/fills.
+    if (scheduleFound && LimitRegPressure)
+      scheduleFound =
+          !HRPDetector->detect(this, Schedule, Schedule.getMaxStageCount());
   }

   LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound
@@ -2483,8 +2918,8 @@ void SMSchedule::computeStart(SUnit *SU, int *MaxEarlyStart, int *MinLateStart,
 /// Order the instructions within a cycle so that the definitions occur
 /// before the uses. Returns true if the instruction is added to the start
 /// of the list, or false if added to the end.
-void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
-                                 std::deque<SUnit *> &Insts) {
+void SMSchedule::orderDependence(const SwingSchedulerDAG *SSD, SUnit *SU,
+                                 std::deque<SUnit *> &Insts) const {
   MachineInstr *MI = SU->getInstr();
   bool OrderBeforeUse = false;
   bool OrderAfterDef = false;
@@ -2611,7 +3046,8 @@ void SMSchedule::orderDependence(SwingSchedulerDAG *SSD, SUnit *SU,
 }

 /// Return true if the scheduled Phi has a loop carried operand.
-bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) { +bool SMSchedule::isLoopCarried(const SwingSchedulerDAG *SSD, + MachineInstr &Phi) const { if (!Phi.isPHI()) return false; assert(Phi.isPHI() && "Expecting a Phi."); @@ -2639,8 +3075,9 @@ bool SMSchedule::isLoopCarried(SwingSchedulerDAG *SSD, MachineInstr &Phi) { /// (MO) = v1 /// If MO appears before Def, then v1 and v3 may get assigned to the same /// register. -bool SMSchedule::isLoopCarriedDefOfUse(SwingSchedulerDAG *SSD, - MachineInstr *Def, MachineOperand &MO) { +bool SMSchedule::isLoopCarriedDefOfUse(const SwingSchedulerDAG *SSD, + MachineInstr *Def, + MachineOperand &MO) const { if (!MO.isReg()) return false; if (Def->isPHI()) @@ -2895,6 +3332,23 @@ void SwingSchedulerDAG::fixupRegisterOverlaps(std::deque &Instrs) { } } +std::deque +SMSchedule::reorderInstructions(const SwingSchedulerDAG *SSD, + const std::deque &Instrs) const { + std::deque NewOrderPhi; + for (SUnit *SU : Instrs) { + if (SU->getInstr()->isPHI()) + NewOrderPhi.push_back(SU); + } + std::deque NewOrderI; + for (SUnit *SU : Instrs) { + if (!SU->getInstr()->isPHI()) + orderDependence(SSD, SU, NewOrderI); + } + llvm::append_range(NewOrderPhi, NewOrderI); + return NewOrderPhi; +} + /// After the schedule has been formed, call this function to combine /// the instructions from the different stages/cycles. That is, this /// function creates a schedule that represents a single iteration. @@ -2924,19 +3378,7 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) { // generated code. for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) { std::deque &cycleInstrs = ScheduledInstrs[Cycle]; - std::deque newOrderPhi; - for (SUnit *SU : cycleInstrs) { - if (SU->getInstr()->isPHI()) - newOrderPhi.push_back(SU); - } - std::deque newOrderI; - for (SUnit *SU : cycleInstrs) { - if (!SU->getInstr()->isPHI()) - orderDependence(SSD, SU, newOrderI); - } - // Replace the old order with the new order. - cycleInstrs.swap(newOrderPhi); - llvm::append_range(cycleInstrs, newOrderI); + cycleInstrs = std::move(reorderInstructions(SSD, cycleInstrs)); SSD->fixupRegisterOverlaps(cycleInstrs); } diff --git a/llvm/test/CodeGen/PowerPC/sms-regpress.mir b/llvm/test/CodeGen/PowerPC/sms-regpress.mir new file mode 100644 index 0000000000000..f523b4548eecc --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/sms-regpress.mir @@ -0,0 +1,328 @@ +# RUN: llc --verify-machineinstrs -mcpu=pwr9 -o - %s -run-pass=pipeliner -ppc-enable-pipeliner -pipeliner-register-pressure -pipeliner-max-mii=50 -pipeliner-ii-search-range=30 -pipeliner-max-stages=10 -debug-only=pipeliner 2>&1 | FileCheck %s + +# Check that if the register pressure is too high, the schedule is rejected, II is incremented, and scheduling continues. +# The specific value of II is not important. + +# CHECK: Try to schedule with 21 +# CHECK: Can't schedule +# CHECK: Try to schedule with 22 +# CHECK: Can't schedule +# CHECK: Try to schedule with 23 +# CHECK: Rejected the schedule because of too high register pressure +# CHECK: Try to schedule with 24 +# CHECK: Rejected the schedule because of too high register pressure +# CHECK: Try to schedule with 25 +# CHECK: Rejected the schedule because of too high register pressure +# CHECK: Try to schedule with 26 +# CHECK: Schedule Found? 
1 (II=26) + +--- | + ; ModuleID = 'a.ll' + source_filename = "a.c" + target datalayout = "e-m:e-Fn32-i64:64-n32:64" + target triple = "ppc64le" + + ; Function Attrs: nofree nosync nounwind memory(argmem: read) uwtable + define dso_local double @kernel(ptr nocapture noundef readonly %a, ptr nocapture noundef readonly %b, i32 noundef signext %n) local_unnamed_addr #0 { + entry: + %0 = load double, ptr %a, align 8, !tbaa !3 + %arrayidx1 = getelementptr inbounds double, ptr %a, i64 1 + %1 = load double, ptr %arrayidx1, align 8, !tbaa !3 + %cmp163 = icmp sgt i32 %n, 0 + br i1 %cmp163, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + %wide.trip.count = zext i32 %n to i64 + %scevgep1 = getelementptr i8, ptr %b, i64 -8 + call void @llvm.set.loop.iterations.i64(i64 %wide.trip.count) + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + %res.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %30, %for.body ] + ret double %res.0.lcssa + + for.body: ; preds = %for.body, %for.body.preheader + %res.0165 = phi double [ 0.000000e+00, %for.body.preheader ], [ %30, %for.body ] + %2 = phi ptr [ %scevgep1, %for.body.preheader ], [ %3, %for.body ] + %3 = getelementptr i8, ptr %2, i64 8 + %4 = load double, ptr %3, align 8, !tbaa !3 + %5 = tail call double @llvm.fmuladd.f64(double %0, double %4, double %0) + %6 = tail call double @llvm.fmuladd.f64(double %5, double %4, double %5) + %7 = tail call double @llvm.fmuladd.f64(double %6, double %4, double %6) + %8 = tail call double @llvm.fmuladd.f64(double %7, double %4, double %7) + %9 = tail call double @llvm.fmuladd.f64(double %8, double %4, double %8) + %10 = tail call double @llvm.fmuladd.f64(double %9, double %4, double %9) + %11 = tail call double @llvm.fmuladd.f64(double %10, double %4, double %10) + %12 = tail call double @llvm.fmuladd.f64(double %11, double %4, double %11) + %13 = tail call double @llvm.fmuladd.f64(double %12, double %4, double %12) + %14 = tail call double @llvm.fmuladd.f64(double %13, double %4, double %13) + %15 = tail call double @llvm.fmuladd.f64(double %14, double %4, double %14) + %16 = tail call double @llvm.fmuladd.f64(double %15, double %4, double %15) + %17 = tail call double @llvm.fmuladd.f64(double %16, double %4, double %16) + %18 = tail call double @llvm.fmuladd.f64(double %17, double %4, double %17) + %19 = tail call double @llvm.fmuladd.f64(double %18, double %4, double %18) + %20 = tail call double @llvm.fmuladd.f64(double %19, double %4, double %19) + %add = fadd double %19, %20 + %21 = tail call double @llvm.fmuladd.f64(double %20, double %4, double %add) + %add35 = fadd double %12, %21 + %22 = tail call double @llvm.fmuladd.f64(double %5, double %4, double %add35) + %add38 = fadd double %13, %22 + %23 = tail call double @llvm.fmuladd.f64(double %6, double %4, double %add38) + %mul = fmul double %4, %7 + %mul46 = fmul double %mul, %14 + %24 = tail call double @llvm.fmuladd.f64(double %mul46, double %13, double %16) + %mul50 = fmul double %4, %9 + %mul51 = fmul double %1, %mul50 + %25 = tail call double @llvm.fmuladd.f64(double %mul51, double %11, double %24) + %add53 = fadd double %5, %25 + %add54 = fadd double %6, %add53 + %mul55 = fmul double %14, %16 + %mul56 = fmul double %mul55, %17 + %mul57 = fmul double %mul56, %18 + %26 = tail call double @llvm.fmuladd.f64(double %mul57, double %19, double %add54) + %27 = tail call double @llvm.fmuladd.f64(double %10, double %1, double %26) + %28 = tail call double @llvm.fmuladd.f64(double %8, double %6, double %27) + 
%mul61 = fmul double %20, %21 + %mul62 = fmul double %mul61, %22 + %29 = tail call double @llvm.fmuladd.f64(double %mul62, double %23, double %28) + %mul64 = fmul double %26, %29 + %mul65 = fmul double %24, %mul64 + %mul66 = fmul double %12, %mul65 + %30 = tail call double @llvm.fmuladd.f64(double %mul66, double %10, double %res.0165) + %31 = call i1 @llvm.loop.decrement.i64(i64 1) + br i1 %31, label %for.body, label %for.cond.cleanup, !llvm.loop !7 + } + + ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) + declare double @llvm.fmuladd.f64(double, double, double) #1 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare void @llvm.set.loop.iterations.i64(i64) #2 + + ; Function Attrs: nocallback noduplicate nofree nosync nounwind willreturn + declare i1 @llvm.loop.decrement.i64(i64) #2 + + attributes #0 = { nofree nosync nounwind memory(argmem: read) uwtable "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="pwr9" "target-features"="+altivec,+bpermd,+crbits,+crypto,+direct-move,+extdiv,+htm,+isa-v206-instructions,+isa-v207-instructions,+isa-v30-instructions,+power8-vector,+power9-vector,+quadword-atomics,+vsx,-aix-small-local-exec-tls,-privileged,-rop-protect,-spe" } + attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + attributes #2 = { nocallback noduplicate nofree nosync nounwind willreturn } + + !llvm.module.flags = !{!0, !1} + !llvm.ident = !{!2} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 7, !"uwtable", i32 2} + !2 = !{!"clang version 18.0.0 (https://miratech-soft@dev.azure.com/miratech-soft/llvm/_git/llvm c8d01fb665fc5d9378100a6d92ebcd3be49be655)"} + !3 = !{!4, !4, i64 0} + !4 = !{!"double", !5, i64 0} + !5 = !{!"omnipotent char", !6, i64 0} + !6 = !{!"Simple C/C++ TBAA"} + !7 = distinct !{!7, !8, !9} + !8 = !{!"llvm.loop.mustprogress"} + !9 = !{!"llvm.loop.unroll.disable"} + +... 
+--- +name: kernel +alignment: 16 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +callsEHReturn: false +callsUnwindInit: false +hasEHCatchret: false +hasEHScopes: false +hasEHFunclets: false +isOutlined: false +debugInstrRef: false +failsVerification: false +tracksDebugUserValues: false +registers: + - { id: 0, class: vsfrc, preferred-register: '' } + - { id: 1, class: vsfrc, preferred-register: '' } + - { id: 2, class: g8rc, preferred-register: '' } + - { id: 3, class: vsfrc, preferred-register: '' } + - { id: 4, class: vsfrc, preferred-register: '' } + - { id: 5, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 6, class: g8rc, preferred-register: '' } + - { id: 7, class: vsfrc, preferred-register: '' } + - { id: 8, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 9, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 10, class: g8rc, preferred-register: '' } + - { id: 11, class: gprc, preferred-register: '' } + - { id: 12, class: vsfrc, preferred-register: '' } + - { id: 13, class: crrc, preferred-register: '' } + - { id: 14, class: vsfrc, preferred-register: '' } + - { id: 15, class: g8rc, preferred-register: '' } + - { id: 16, class: g8rc, preferred-register: '' } + - { id: 17, class: g8rc, preferred-register: '' } + - { id: 18, class: f8rc, preferred-register: '' } + - { id: 19, class: g8rc_and_g8rc_nox0, preferred-register: '' } + - { id: 20, class: vsfrc, preferred-register: '' } + - { id: 21, class: vsfrc, preferred-register: '' } + - { id: 22, class: vsfrc, preferred-register: '' } + - { id: 23, class: vsfrc, preferred-register: '' } + - { id: 24, class: vsfrc, preferred-register: '' } + - { id: 25, class: vsfrc, preferred-register: '' } + - { id: 26, class: vsfrc, preferred-register: '' } + - { id: 27, class: vsfrc, preferred-register: '' } + - { id: 28, class: vsfrc, preferred-register: '' } + - { id: 29, class: vsfrc, preferred-register: '' } + - { id: 30, class: vsfrc, preferred-register: '' } + - { id: 31, class: vsfrc, preferred-register: '' } + - { id: 32, class: vsfrc, preferred-register: '' } + - { id: 33, class: vsfrc, preferred-register: '' } + - { id: 34, class: vsfrc, preferred-register: '' } + - { id: 35, class: vsfrc, preferred-register: '' } + - { id: 36, class: vsfrc, preferred-register: '' } + - { id: 37, class: vsfrc, preferred-register: '' } + - { id: 38, class: vsfrc, preferred-register: '' } + - { id: 39, class: vsfrc, preferred-register: '' } + - { id: 40, class: vsfrc, preferred-register: '' } + - { id: 41, class: vsfrc, preferred-register: '' } + - { id: 42, class: vsfrc, preferred-register: '' } + - { id: 43, class: vsfrc, preferred-register: '' } + - { id: 44, class: vsfrc, preferred-register: '' } + - { id: 45, class: vsfrc, preferred-register: '' } + - { id: 46, class: vsfrc, preferred-register: '' } + - { id: 47, class: vsfrc, preferred-register: '' } + - { id: 48, class: vsfrc, preferred-register: '' } + - { id: 49, class: vsfrc, preferred-register: '' } + - { id: 50, class: vsfrc, preferred-register: '' } + - { id: 51, class: vsfrc, preferred-register: '' } + - { id: 52, class: vsfrc, preferred-register: '' } + - { id: 53, class: vsfrc, preferred-register: '' } + - { id: 54, class: vsfrc, preferred-register: '' } + - { id: 55, class: vsfrc, preferred-register: '' } + - { id: 56, class: vsfrc, preferred-register: '' } + - { id: 57, class: vsfrc, preferred-register: '' } + - { id: 58, class: vsfrc, 
preferred-register: '' } + - { id: 59, class: vsfrc, preferred-register: '' } + - { id: 60, class: vsfrc, preferred-register: '' } + - { id: 61, class: vsfrc, preferred-register: '' } + - { id: 62, class: crbitrc, preferred-register: '' } +liveins: + - { reg: '$x3', virtual-reg: '%8' } + - { reg: '$x4', virtual-reg: '%9' } + - { reg: '$x5', virtual-reg: '%10' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 1 + adjustsStack: false + hasCalls: false + stackProtector: '' + functionContext: '' + maxCallFrameSize: 4294967295 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + hasTailCall: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: [] +entry_values: [] +callSites: [] +debugValueSubstitutions: [] +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.2(0x50000000), %bb.1(0x30000000) + liveins: $x3, $x4, $x5 + + %10:g8rc = COPY killed $x5 + %9:g8rc_and_g8rc_nox0 = COPY killed $x4 + %8:g8rc_and_g8rc_nox0 = COPY killed $x3 + %11:gprc = COPY killed %10.sub_32 + %13:crrc = CMPWI %11, 0 + BCC 44, killed %13, %bb.2 + + bb.1: + successors: %bb.3(0x80000000) + + %12:vsfrc = XXLXORdpz + B %bb.3 + + bb.2.for.body.preheader: + successors: %bb.4(0x80000000) + + %0:vsfrc = DFLOADf64 0, %8 :: (load (s64) from %ir.a, !tbaa !3) + %1:vsfrc = DFLOADf64 8, killed %8 :: (load (s64) from %ir.arrayidx1, !tbaa !3) + %16:g8rc = IMPLICIT_DEF + %15:g8rc = INSERT_SUBREG killed %16, killed %11, %subreg.sub_32 + %17:g8rc = RLDICL killed %15, 0, 32 + %2:g8rc = ADDI8 killed %9, -8 + MTCTR8loop killed %17, implicit-def dead $ctr8 + %14:vsfrc = XXLXORdpz + B %bb.4 + + bb.3.for.cond.cleanup: + %3:vsfrc = PHI %12, %bb.1, %7, %bb.4 + $f1 = COPY killed %3 + BLR8 implicit $lr8, implicit $rm, implicit killed $f1 + + bb.4.for.body: + successors: %bb.4(0x7c000000), %bb.3(0x04000000) + + %4:vsfrc = PHI %14, %bb.2, %7, %bb.4 + %5:g8rc_and_g8rc_nox0 = PHI %2, %bb.2, %6, %bb.4 + %18:f8rc, %19:g8rc_and_g8rc_nox0 = LFDU 8, killed %5 :: (load (s64) from %ir.3, !tbaa !3) + %6:g8rc = COPY killed %19 + %20:vsfrc = nofpexcept XSMADDADP %0, %0, %18, implicit $rm + %21:vsfrc = nofpexcept XSMADDADP %20, %20, %18, implicit $rm + %22:vsfrc = nofpexcept XSMADDADP %21, %21, %18, implicit $rm + %23:vsfrc = nofpexcept XSMADDADP %22, %22, %18, implicit $rm + %24:vsfrc = nofpexcept XSMADDADP %23, %23, %18, implicit $rm + %25:vsfrc = nofpexcept XSMADDADP %24, %24, %18, implicit $rm + %26:vsfrc = nofpexcept XSMADDADP %25, %25, %18, implicit $rm + %27:vsfrc = nofpexcept XSMADDADP %26, %26, %18, implicit $rm + %28:vsfrc = nofpexcept XSMADDADP %27, %27, %18, implicit $rm + %29:vsfrc = nofpexcept XSMADDADP %28, %28, %18, implicit $rm + %30:vsfrc = nofpexcept XSMADDADP %29, %29, %18, implicit $rm + %31:vsfrc = nofpexcept XSMADDADP killed %30, %30, %18, implicit $rm + %32:vsfrc = nofpexcept XSMADDADP %31, %31, %18, implicit $rm + %33:vsfrc = nofpexcept XSMADDADP %32, %32, %18, implicit $rm + %34:vsfrc = nofpexcept XSMADDADP %33, %33, %18, implicit $rm + %35:vsfrc = nofpexcept XSMADDADP %34, %34, %18, implicit $rm + %36:vsfrc = nofpexcept XSADDDP %34, %35, implicit $rm + %37:vsfrc = nofpexcept XSMADDADP killed %36, %35, %18, implicit $rm + %38:vsfrc = nofpexcept XSADDDP %27, %37, implicit $rm + %39:vsfrc = nofpexcept XSMADDADP killed %38, %20, %18, implicit $rm + %40:vsfrc = nofpexcept XSADDDP %28, %39, implicit 
$rm + %41:vsfrc = nofpexcept XSMADDADP killed %40, %21, %18, implicit $rm + %42:vsfrc = nofpexcept XSMULDP %18, killed %22, implicit $rm + %43:vsfrc = nofpexcept XSMULDP killed %42, %29, implicit $rm + %44:vsfrc = nofpexcept XSMADDADP %31, killed %43, killed %28, implicit $rm + %45:vsfrc = nofpexcept XSMULDP killed %18, killed %24, implicit $rm + %46:vsfrc = nofpexcept XSMULDP %1, killed %45, implicit $rm + %47:vsfrc = nofpexcept XSMADDADP %44, killed %46, killed %26, implicit $rm + %48:vsfrc = nofpexcept XSADDDP killed %20, killed %47, implicit $rm + %49:vsfrc = nofpexcept XSADDDP %21, killed %48, implicit $rm + %50:vsfrc = nofpexcept XSMULDP killed %29, killed %31, implicit $rm + %51:vsfrc = nofpexcept XSMULDP killed %50, killed %32, implicit $rm + %52:vsfrc = nofpexcept XSMULDP killed %51, killed %33, implicit $rm + %53:vsfrc = nofpexcept XSMADDADP killed %49, killed %52, killed %34, implicit $rm + %54:vsfrc = nofpexcept XSMADDADP %53, %25, %1, implicit $rm + %55:vsfrc = nofpexcept XSMADDADP killed %54, killed %23, killed %21, implicit $rm + %56:vsfrc = nofpexcept XSMULDP killed %35, killed %37, implicit $rm + %57:vsfrc = nofpexcept XSMULDP killed %56, killed %39, implicit $rm + %58:vsfrc = nofpexcept XSMADDADP killed %55, killed %57, killed %41, implicit $rm + %59:vsfrc = nofpexcept XSMULDP killed %53, killed %58, implicit $rm + %60:vsfrc = nofpexcept XSMULDP killed %44, killed %59, implicit $rm + %61:vsfrc = nofpexcept XSMULDP killed %27, killed %60, implicit $rm + %7:vsfrc = nofpexcept XSMADDADP killed %4, killed %61, killed %25, implicit $rm + BDNZ8 %bb.4, implicit-def $ctr8, implicit $ctr8 + B %bb.3 + +... From 21730eb49b7c53abd47eff898c913b48e2f1dfc9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 22 Jan 2024 00:13:23 -0800 Subject: [PATCH 335/843] [lld] Use SmallString::operator std::string (NFC) --- lld/COFF/Driver.cpp | 6 +++--- lld/COFF/DriverUtils.cpp | 2 +- lld/COFF/InputFiles.cpp | 2 +- lld/ELF/DriverUtils.cpp | 2 +- lld/MachO/DriverUtils.cpp | 2 +- lld/wasm/Driver.cpp | 2 +- lld/wasm/WriterUtils.cpp | 2 +- 7 files changed, 9 insertions(+), 9 deletions(-) diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp index cfcf73843e978..e0afb6b18805b 100644 --- a/lld/COFF/Driver.cpp +++ b/lld/COFF/Driver.cpp @@ -855,7 +855,7 @@ static std::string createResponseFile(const opt::InputArgList &args, for (StringRef path : filePaths) os << quote(relativeToRoot(path)) << "\n"; - return std::string(data.str()); + return std::string(data); } static unsigned parseDebugTypes(const opt::InputArgList &args) { @@ -910,7 +910,7 @@ std::string LinkerDriver::getImplibPath() { return std::string(ctx.config.implib); SmallString<128> out = StringRef(ctx.config.outputFile); sys::path::replace_extension(out, ".lib"); - return std::string(out.str()); + return std::string(out); } // The import name is calculated as follows: @@ -934,7 +934,7 @@ std::string LinkerDriver::getImportName(bool asLib) { (ctx.config.dll || asLib) ? 
".dll" : ".exe"); } - return std::string(out.str()); + return std::string(out); } void LinkerDriver::createImportLibrary(bool asLib) { diff --git a/lld/COFF/DriverUtils.cpp b/lld/COFF/DriverUtils.cpp index 4678767b6a110..fc8eb327be49b 100644 --- a/lld/COFF/DriverUtils.cpp +++ b/lld/COFF/DriverUtils.cpp @@ -350,7 +350,7 @@ class TemporaryFile { SmallString<128> s; if (auto ec = sys::fs::createTemporaryFile("lld-" + prefix, extn, s)) fatal("cannot create a temporary file: " + ec.message()); - path = std::string(s.str()); + path = std::string(s); if (!contents.empty()) { std::error_code ec; diff --git a/lld/COFF/InputFiles.cpp b/lld/COFF/InputFiles.cpp index 22cc0e3e5dbaf..1ed90d74229aa 100644 --- a/lld/COFF/InputFiles.cpp +++ b/lld/COFF/InputFiles.cpp @@ -828,7 +828,7 @@ static std::string getPdbBaseName(ObjFile *file, StringRef tSPath) { // on Windows, so we can assume type server paths are Windows style. sys::path::append(path, sys::path::filename(tSPath, sys::path::Style::windows)); - return std::string(path.str()); + return std::string(path); } // The casing of the PDB path stamped in the OBJ can differ from the actual path diff --git a/lld/ELF/DriverUtils.cpp b/lld/ELF/DriverUtils.cpp index 0a27422e3b2df..de8ffab0e2124 100644 --- a/lld/ELF/DriverUtils.cpp +++ b/lld/ELF/DriverUtils.cpp @@ -204,7 +204,7 @@ std::string elf::createResponseFile(const opt::InputArgList &args) { os << toString(*arg) << "\n"; } } - return std::string(data.str()); + return std::string(data); } // Find a file by concatenating given paths. If a resulting path diff --git a/lld/MachO/DriverUtils.cpp b/lld/MachO/DriverUtils.cpp index 17499451382a5..d6f18ecb85b8a 100644 --- a/lld/MachO/DriverUtils.cpp +++ b/lld/MachO/DriverUtils.cpp @@ -180,7 +180,7 @@ std::string macho::createResponseFile(const InputArgList &args) { os << toString(*arg) << "\n"; } } - return std::string(data.str()); + return std::string(data); } static void searchedDylib(const Twine &path, bool found) { diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index edf1979c1d302..88a3db75b54a1 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -1019,7 +1019,7 @@ static std::string createResponseFile(const opt::InputArgList &args) { os << toString(*arg) << "\n"; } } - return std::string(data.str()); + return std::string(data); } // The --wrap option is a feature to rename symbols so that you can write diff --git a/lld/wasm/WriterUtils.cpp b/lld/wasm/WriterUtils.cpp index cc8ed0b1de237..418192f302375 100644 --- a/lld/wasm/WriterUtils.cpp +++ b/lld/wasm/WriterUtils.cpp @@ -51,7 +51,7 @@ std::string toString(const WasmSignature &sig) { s += "void"; else s += toString(sig.Returns[0]); - return std::string(s.str()); + return std::string(s); } std::string toString(const WasmGlobalType &type) { From 8fddf7fd148cbb9e5c09e351cd1be410a513702a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 22 Jan 2024 00:13:25 -0800 Subject: [PATCH 336/843] [llvm] Use MachineBasicBlock::succ_empty (NFC) --- llvm/lib/CodeGen/UnreachableBlockElim.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp index f17450d264ba0..1a60e9abbe2e2 100644 --- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp +++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp @@ -117,7 +117,7 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { if (MLI) MLI->removeBlock(&BB); if (MDT && MDT->getNode(&BB)) 
MDT->eraseNode(&BB); - while (BB.succ_begin() != BB.succ_end()) { + while (!BB.succ_empty()) { MachineBasicBlock* succ = *BB.succ_begin(); for (MachineInstr &Phi : succ->phis()) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp index d90fcac875408..289c35e11bebf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -1721,7 +1721,7 @@ void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB, unsigned CodeSourceRegister, bool IsUndefIfSource) { // If this is the function exit block, we don't need a phi. - if (MergeBB->succ_begin() == MergeBB->succ_end()) { + if (MergeBB->succ_empty()) { return; } LLVM_DEBUG(dbgs() << "Merge PHI (" << printMBBReference(*MergeBB) From 234da203779be31c83541d99ee28e01ee422c506 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 22 Jan 2024 00:13:27 -0800 Subject: [PATCH 337/843] [Analysis] Use StringRef::ends_with_insensitive (NFC) --- clang/lib/Analysis/RetainSummaryManager.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Analysis/RetainSummaryManager.cpp b/clang/lib/Analysis/RetainSummaryManager.cpp index 6f50d95b179f4..8d279d969b613 100644 --- a/clang/lib/Analysis/RetainSummaryManager.cpp +++ b/clang/lib/Analysis/RetainSummaryManager.cpp @@ -1098,7 +1098,7 @@ RetainSummaryManager::getStandardMethodSummary(const ObjCMethodDecl *MD, if (S.isKeywordSelector()) { for (unsigned i = 0, e = S.getNumArgs(); i != e; ++i) { StringRef Slot = S.getNameForSlot(i); - if (Slot.substr(Slot.size() - 8).equals_insensitive("delegate")) { + if (Slot.ends_with_insensitive("delegate")) { if (ResultEff == ObjCInitRetE) ResultEff = RetEffect::MakeNoRetHard(); else From a54463a4c6c32810b064e02b39e2c8f0de974006 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 22 Jan 2024 00:13:29 -0800 Subject: [PATCH 338/843] [clang-tidy] Use llvm::any_of (NFC) --- clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp index 865c88391b0b4..ff5b885aa95f8 100644 --- a/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/EmptyCatchCheck.cpp @@ -46,7 +46,7 @@ AST_MATCHER_P(CompoundStmt, hasAnyTextFromList, std::vector, StringRef Text = Lexer::getSourceText( CharSourceRange::getTokenRange(Node.getSourceRange()), SM, Context.getLangOpts()); - return std::any_of(List.begin(), List.end(), [&](const StringRef &Str) { + return llvm::any_of(List, [&](const StringRef &Str) { return Text.contains_insensitive(Str); }); } From 9f290509421b874ecf8082fa8f754850fb121655 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Mon, 22 Jan 2024 16:17:18 +0800 Subject: [PATCH 339/843] [CodeGen][MachinePipeliner] Fix -Wpessimizing-move in MachinePipeliner.cpp (NFC) /Users/jiefu/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp:1044:19: error: moving a temporary object prevents copy elision [-Werror,-Wpessimizing-move] 1044 | CycleInstrs = std::move(Schedule.reorderInstructions(SSD, CycleInstrs)); | ^ /Users/jiefu/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp:1044:19: note: remove std::move call here 1044 | CycleInstrs = std::move(Schedule.reorderInstructions(SSD, CycleInstrs)); | ^~~~~~~~~~ ~ /Users/jiefu/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp:1395:21: error: moving a 
temporary object prevents copy elision [-Werror,-Wpessimizing-move] 1395 | auto LastUses = std::move(computeLastUses(OrderedInsts, Stages)); | ^ /Users/jiefu/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp:1395:21: note: remove std::move call here 1395 | auto LastUses = std::move(computeLastUses(OrderedInsts, Stages)); | ^~~~~~~~~~ ~ /Users/jiefu/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp:1502:9: error: moving a temporary object prevents copy elision [-Werror,-Wpessimizing-move] 1502 | std::move(computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1)); | ^ /Users/jiefu/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp:1502:9: note: remove std::move call here 1502 | std::move(computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1)); | ^~~~~~~~~~ ~ /Users/jiefu/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp:3381:19: error: moving a temporary object prevents copy elision [-Werror,-Wpessimizing-move] 3381 | cycleInstrs = std::move(reorderInstructions(SSD, cycleInstrs)); | ^ /Users/jiefu/llvm-project/llvm/lib/CodeGen/MachinePipeliner.cpp:3381:19: note: remove std::move call here 3381 | cycleInstrs = std::move(reorderInstructions(SSD, cycleInstrs)); | ^~~~~~~~~~ ~ 4 errors generated. --- llvm/lib/CodeGen/MachinePipeliner.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp index 5c9f0f1703a6e..2d2d0bffe2169 100644 --- a/llvm/lib/CodeGen/MachinePipeliner.cpp +++ b/llvm/lib/CodeGen/MachinePipeliner.cpp @@ -1041,7 +1041,7 @@ static void computeScheduledInsts(const SwingSchedulerDAG *SSD, for (int Cycle = Schedule.getFirstCycle(); Cycle <= Schedule.getFinalCycle(); ++Cycle) { std::deque &CycleInstrs = Instrs[Cycle]; - CycleInstrs = std::move(Schedule.reorderInstructions(SSD, CycleInstrs)); + CycleInstrs = Schedule.reorderInstructions(SSD, CycleInstrs); for (SUnit *SU : CycleInstrs) { MachineInstr *MI = SU->getInstr(); OrderedInsts.push_back(MI); @@ -1392,7 +1392,7 @@ class HighRegisterPressureDetector { auto CurSetPressure = InitSetPressure; auto MaxSetPressure = InitSetPressure; - auto LastUses = std::move(computeLastUses(OrderedInsts, Stages)); + auto LastUses = computeLastUses(OrderedInsts, Stages); LLVM_DEBUG({ dbgs() << "Ordered instructions:\n"; @@ -1499,7 +1499,7 @@ class HighRegisterPressureDetector { Instr2StageTy Stages; computeScheduledInsts(SSD, Schedule, OrderedInsts, Stages); const auto MaxSetPressure = - std::move(computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1)); + computeMaxSetPressure(OrderedInsts, Stages, MaxStage + 1); LLVM_DEBUG({ dbgs() << "Dump MaxSetPressure:\n"; @@ -3378,7 +3378,7 @@ void SMSchedule::finalizeSchedule(SwingSchedulerDAG *SSD) { // generated code. 
for (int Cycle = getFirstCycle(), E = getFinalCycle(); Cycle <= E; ++Cycle) { std::deque &cycleInstrs = ScheduledInstrs[Cycle]; - cycleInstrs = std::move(reorderInstructions(SSD, cycleInstrs)); + cycleInstrs = reorderInstructions(SSD, cycleInstrs); SSD->fixupRegisterOverlaps(cycleInstrs); } From 3b943c0203df5c35089417567cc470d5cdbc497e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 22 Jan 2024 00:20:40 -0800 Subject: [PATCH 340/843] [Thumb,test] Improve __stack_chk_guard test --- llvm/test/CodeGen/ARM/stack-guard-elf.ll | 149 +++++++++++++++++++++++ 1 file changed, 149 insertions(+) diff --git a/llvm/test/CodeGen/ARM/stack-guard-elf.ll b/llvm/test/CodeGen/ARM/stack-guard-elf.ll index 7a342d003c72d..250f2ad9ed109 100644 --- a/llvm/test/CodeGen/ARM/stack-guard-elf.ll +++ b/llvm/test/CodeGen/ARM/stack-guard-elf.ll @@ -4,6 +4,11 @@ ;; R_ARM_GOT_ABS does not have assembler support. ; RUN: llc -relocation-model=static < %s | FileCheck %s ; RUN: llc -relocation-model=pic < %s | FileCheck %s +;; Also check Thumb1 and Thumb2. +; RUN: llc -mtriple=thumbv6-linux-gnueabi -relocation-model=static < %s | FileCheck %s --check-prefix=THUMB1 +; RUN: llc -mtriple=thumbv6-linux-gnueabi -relocation-model=pic < %s | FileCheck %s --check-prefix=THUMB1-PIC +; RUN: llc -mtriple=thumbv7-linux-gnueabi -relocation-model=static < %s | FileCheck %s --check-prefix=THUMB2 +; RUN: llc -mtriple=thumbv7-linux-gnueabi -relocation-model=pic < %s | FileCheck %s --check-prefix=THUMB2-PIC target triple = "armv7a-linux-gnueabi" @@ -42,6 +47,150 @@ define i32 @test1() #0 { ; CHECK-NEXT: .LCPI0_1: ; CHECK-NEXT: .Ltmp1: ; CHECK-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_1+8)-.Ltmp1) +; +; THUMB1-LABEL: test1: +; THUMB1: @ %bb.0: +; THUMB1-NEXT: .save {r4, r5, r6, lr} +; THUMB1-NEXT: push {r4, r5, r6, lr} +; THUMB1-NEXT: .pad #508 +; THUMB1-NEXT: sub sp, #508 +; THUMB1-NEXT: .pad #508 +; THUMB1-NEXT: sub sp, #508 +; THUMB1-NEXT: .pad #16 +; THUMB1-NEXT: sub sp, #16 +; THUMB1-NEXT: ldr r0, .LCPI0_0 +; THUMB1-NEXT: ldr r0, [r0] +; THUMB1-NEXT: ldr r0, [r0] +; THUMB1-NEXT: add r1, sp, #904 +; THUMB1-NEXT: str r0, [r1, #124] +; THUMB1-NEXT: add r0, sp, #4 +; THUMB1-NEXT: bl foo +; THUMB1-NEXT: add r0, sp, #904 +; THUMB1-NEXT: ldr r0, [r0, #124] +; THUMB1-NEXT: ldr r1, .LCPI0_0 +; THUMB1-NEXT: ldr r1, [r1] +; THUMB1-NEXT: ldr r1, [r1] +; THUMB1-NEXT: cmp r1, r0 +; THUMB1-NEXT: bne .LBB0_2 +; THUMB1-NEXT: @ %bb.1: +; THUMB1-NEXT: movs r0, #0 +; THUMB1-NEXT: add sp, #508 +; THUMB1-NEXT: add sp, #508 +; THUMB1-NEXT: add sp, #16 +; THUMB1-NEXT: pop {r4, r5, r6, pc} +; THUMB1-NEXT: .LBB0_2: +; THUMB1-NEXT: bl __stack_chk_fail +; THUMB1-NEXT: .p2align 2 +; THUMB1-NEXT: @ %bb.3: +; THUMB1-NEXT: .LCPI0_0: +; THUMB1-NEXT: .long __stack_chk_guard +; +; THUMB1-PIC-LABEL: test1: +; THUMB1-PIC: @ %bb.0: +; THUMB1-PIC-NEXT: .save {r4, r5, r6, lr} +; THUMB1-PIC-NEXT: push {r4, r5, r6, lr} +; THUMB1-PIC-NEXT: .pad #508 +; THUMB1-PIC-NEXT: sub sp, #508 +; THUMB1-PIC-NEXT: .pad #508 +; THUMB1-PIC-NEXT: sub sp, #508 +; THUMB1-PIC-NEXT: .pad #16 +; THUMB1-PIC-NEXT: sub sp, #16 +; THUMB1-PIC-NEXT: ldr r0, .LCPI0_0 +; THUMB1-PIC-NEXT: .LPC0_0: +; THUMB1-PIC-NEXT: add r0, pc +; THUMB1-PIC-NEXT: ldr r0, [r0] +; THUMB1-PIC-NEXT: ldr r0, [r0] +; THUMB1-PIC-NEXT: add r1, sp, #904 +; THUMB1-PIC-NEXT: str r0, [r1, #124] +; THUMB1-PIC-NEXT: add r0, sp, #4 +; THUMB1-PIC-NEXT: bl foo +; THUMB1-PIC-NEXT: add r0, sp, #904 +; THUMB1-PIC-NEXT: ldr r0, [r0, #124] +; THUMB1-PIC-NEXT: ldr r1, .LCPI0_1 +; THUMB1-PIC-NEXT: .LPC0_1: +; THUMB1-PIC-NEXT: add r1, pc +; 
THUMB1-PIC-NEXT: ldr r1, [r1] +; THUMB1-PIC-NEXT: ldr r1, [r1] +; THUMB1-PIC-NEXT: cmp r1, r0 +; THUMB1-PIC-NEXT: bne .LBB0_2 +; THUMB1-PIC-NEXT: @ %bb.1: +; THUMB1-PIC-NEXT: movs r0, #0 +; THUMB1-PIC-NEXT: add sp, #508 +; THUMB1-PIC-NEXT: add sp, #508 +; THUMB1-PIC-NEXT: add sp, #16 +; THUMB1-PIC-NEXT: pop {r4, r5, r6, pc} +; THUMB1-PIC-NEXT: .LBB0_2: +; THUMB1-PIC-NEXT: bl __stack_chk_fail +; THUMB1-PIC-NEXT: .p2align 2 +; THUMB1-PIC-NEXT: @ %bb.3: +; THUMB1-PIC-NEXT: .LCPI0_0: +; THUMB1-PIC-NEXT: .Ltmp0: +; THUMB1-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_0+4)-.Ltmp0) +; THUMB1-PIC-NEXT: .LCPI0_1: +; THUMB1-PIC-NEXT: .Ltmp1: +; THUMB1-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_1+4)-.Ltmp1) +; +; THUMB2-LABEL: test1: +; THUMB2: @ %bb.0: +; THUMB2-NEXT: .save {r7, lr} +; THUMB2-NEXT: push {r7, lr} +; THUMB2-NEXT: .pad #1032 +; THUMB2-NEXT: sub.w sp, sp, #1032 +; THUMB2-NEXT: movw r0, :lower16:__stack_chk_guard +; THUMB2-NEXT: movt r0, :upper16:__stack_chk_guard +; THUMB2-NEXT: ldr r0, [r0] +; THUMB2-NEXT: ldr r0, [r0] +; THUMB2-NEXT: str.w r0, [sp, #1028] +; THUMB2-NEXT: add r0, sp, #4 +; THUMB2-NEXT: bl foo +; THUMB2-NEXT: movw r1, :lower16:__stack_chk_guard +; THUMB2-NEXT: ldr.w r0, [sp, #1028] +; THUMB2-NEXT: movt r1, :upper16:__stack_chk_guard +; THUMB2-NEXT: ldr r1, [r1] +; THUMB2-NEXT: ldr r1, [r1] +; THUMB2-NEXT: cmp r1, r0 +; THUMB2-NEXT: ittt eq +; THUMB2-NEXT: moveq r0, #0 +; THUMB2-NEXT: addeq.w sp, sp, #1032 +; THUMB2-NEXT: popeq {r7, pc} +; THUMB2-NEXT: .LBB0_1: +; THUMB2-NEXT: bl __stack_chk_fail +; +; THUMB2-PIC-LABEL: test1: +; THUMB2-PIC: @ %bb.0: +; THUMB2-PIC-NEXT: .save {r7, lr} +; THUMB2-PIC-NEXT: push {r7, lr} +; THUMB2-PIC-NEXT: .pad #1032 +; THUMB2-PIC-NEXT: sub.w sp, sp, #1032 +; THUMB2-PIC-NEXT: ldr r0, .LCPI0_0 +; THUMB2-PIC-NEXT: .LPC0_0: +; THUMB2-PIC-NEXT: add r0, pc +; THUMB2-PIC-NEXT: ldr r0, [r0] +; THUMB2-PIC-NEXT: ldr r0, [r0] +; THUMB2-PIC-NEXT: str.w r0, [sp, #1028] +; THUMB2-PIC-NEXT: add r0, sp, #4 +; THUMB2-PIC-NEXT: bl foo +; THUMB2-PIC-NEXT: ldr.w r0, [sp, #1028] +; THUMB2-PIC-NEXT: ldr r1, .LCPI0_1 +; THUMB2-PIC-NEXT: .LPC0_1: +; THUMB2-PIC-NEXT: add r1, pc +; THUMB2-PIC-NEXT: ldr r1, [r1] +; THUMB2-PIC-NEXT: ldr r1, [r1] +; THUMB2-PIC-NEXT: cmp r1, r0 +; THUMB2-PIC-NEXT: ittt eq +; THUMB2-PIC-NEXT: moveq r0, #0 +; THUMB2-PIC-NEXT: addeq.w sp, sp, #1032 +; THUMB2-PIC-NEXT: popeq {r7, pc} +; THUMB2-PIC-NEXT: .LBB0_1: +; THUMB2-PIC-NEXT: bl __stack_chk_fail +; THUMB2-PIC-NEXT: .p2align 2 +; THUMB2-PIC-NEXT: @ %bb.2: +; THUMB2-PIC-NEXT: .LCPI0_0: +; THUMB2-PIC-NEXT: .Ltmp0: +; THUMB2-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_0+4)-.Ltmp0) +; THUMB2-PIC-NEXT: .LCPI0_1: +; THUMB2-PIC-NEXT: .Ltmp1: +; THUMB2-PIC-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_1+4)-.Ltmp1) %a1 = alloca [256 x i32], align 4 call void @foo(ptr %a1) #3 ret i32 0 From a2caa4929e8e8a2ffff4ee5f03ab37a9be7462a0 Mon Sep 17 00:00:00 2001 From: martinboehme Date: Mon, 22 Jan 2024 09:23:06 +0100 Subject: [PATCH 341/843] [clang][dataflow] Treat comma operator correctly in `getResultObjectLocation()`. 
(#78427)

---
 .../FlowSensitive/DataflowEnvironment.cpp     |  9 ++++++--
 .../Analysis/FlowSensitive/TransferTest.cpp   | 22 ++++++++++++++-----
 2 files changed, 23 insertions(+), 8 deletions(-)

diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 07dc3a9f76ac2..196a1360a7750 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -782,8 +782,13 @@ Environment::getResultObjectLocation(const Expr &RecordPRValue) const {
     return Val->getLoc();
   }
 
-  // Expression nodes that propagate a record prvalue should have exactly one
-  // child.
+  if (auto *Op = dyn_cast<BinaryOperator>(&RecordPRValue);
+      Op && Op->isCommaOp()) {
+    return getResultObjectLocation(*Op->getRHS());
+  }
+
+  // All other expression nodes that propagate a record prvalue should have
+  // exactly one child.
   llvm::SmallVector<const Stmt *, 1> children(RecordPRValue.child_begin(),
                                               RecordPRValue.child_end());
   assert(children.size() == 1);

diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index d0a0e6d3f5836..85ae24f0b6f16 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -2642,14 +2642,17 @@ TEST(TransferTest, ResultObjectLocation) {
     };
 
     void target() {
-      A();
+      0, A();
       (void)0; // [[p]]
     }
   )";
+  using ast_matchers::binaryOperator;
   using ast_matchers::cxxBindTemporaryExpr;
   using ast_matchers::cxxTemporaryObjectExpr;
   using ast_matchers::exprWithCleanups;
   using ast_matchers::has;
+  using ast_matchers::hasOperatorName;
+  using ast_matchers::hasRHS;
   using ast_matchers::match;
   using ast_matchers::selectFirst;
   using ast_matchers::traverse;
@@ -2659,26 +2662,33 @@ TEST(TransferTest, ResultObjectLocation) {
                  ASTContext &ASTCtx) {
         const Environment &Env = getEnvironmentAtAnnotation(Results, "p");
 
-        // The expresssion `A()` in the code above produces the following
-        // structure, consisting of three prvalues of record type.
+        // The expression `0, A()` in the code above produces the following
+        // structure, consisting of four prvalues of record type.
         // `Env.getResultObjectLocation()` should return the same location for
        // all of these.
        auto MatchResult = match(
            traverse(TK_AsIs,
                     exprWithCleanups(
-                         has(cxxBindTemporaryExpr(
-                                 has(cxxTemporaryObjectExpr().bind("toe")))
-                                 .bind("bte")))
+                         has(binaryOperator(
+                                 hasOperatorName(","),
+                                 hasRHS(cxxBindTemporaryExpr(
+                                            has(cxxTemporaryObjectExpr().bind(
+                                                "toe")))
+                                            .bind("bte")))
+                                 .bind("comma")))
                .bind("ewc")),
            ASTCtx);
        auto *TOE = selectFirst<CXXTemporaryObjectExpr>("toe", MatchResult);
        ASSERT_NE(TOE, nullptr);
+       auto *Comma = selectFirst<BinaryOperator>("comma", MatchResult);
+       ASSERT_NE(Comma, nullptr);
        auto *EWC = selectFirst<ExprWithCleanups>("ewc", MatchResult);
        ASSERT_NE(EWC, nullptr);
        auto *BTE = selectFirst<CXXBindTemporaryExpr>("bte", MatchResult);
        ASSERT_NE(BTE, nullptr);
 
        RecordStorageLocation &Loc = Env.getResultObjectLocation(*TOE);
+       EXPECT_EQ(&Loc, &Env.getResultObjectLocation(*Comma));
        EXPECT_EQ(&Loc, &Env.getResultObjectLocation(*EWC));
        EXPECT_EQ(&Loc, &Env.getResultObjectLocation(*BTE));
      });

From 21199f9842dffa4f34b38101195c6f57d1bd4630 Mon Sep 17 00:00:00 2001
From: Dominik Adamski
Date: Mon, 22 Jan 2024 09:24:45 +0100
Subject: [PATCH 342/843] [OpenMP][OMPIRBuilder] Fix LLVM IR codegen for
 collapsed device loop (#78708)

When we generate the loop body function, we need to be sure that all
original loop counters are replaced by the new counter.
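In outline, the hazard is that rewriting uses while walking the use-list
directly can skip some users, so the users are copied first. A minimal
sketch (IndVar and NewCounter are stand-ins for CLI->getIndVar() and the
new loop-counter load; it assumes llvm/ADT/SmallVector.h and
llvm/IR/Instruction.h):

    // Copy the users before mutating the use-list; calling
    // replaceUsesOfWith() while iterating user_begin()/user_end()
    // mutates the list being walked and can skip entries.
    SmallVector<User *> Users(IndVar->user_begin(), IndVar->user_end());
    for (User *U : Users)
      if (auto *Inst = dyn_cast<Instruction>(U))
        Inst->replaceUsesOfWith(IndVar, NewCounter);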
That is, we save all items which use the original loop counter and then
perform the replacement of the original loop counter in each of them.
If we don't, there is a risk that some values are not updated.
---
 llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp     |  7 +--
 .../LLVMIR/omptarget-wsloop-collapsed.mlir    | 39 ++++++++++++++++
 .../fortran/target-parallel-do-collapse.f90   | 44 +++++++++++++++++++
 3 files changed, 87 insertions(+), 3 deletions(-)
 create mode 100644 mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
 create mode 100644 openmp/libomptarget/test/offloading/fortran/target-parallel-do-collapse.f90

diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index f6cf358119fb7..211281452de22 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -2876,9 +2876,10 @@ OpenMPIRBuilder::applyWorkshareLoopTarget(DebugLoc DL, CanonicalLoopInfo *CLI,
   // We need to model loop body region as the function f(cnt, loop_arg).
   // That's why we replace loop induction variable by the new counter
   // which will be one of loop body function argument
-  for (auto Use = CLI->getIndVar()->user_begin();
-       Use != CLI->getIndVar()->user_end(); ++Use) {
-    if (Instruction *Inst = dyn_cast<Instruction>(*Use)) {
+  SmallVector<User *> Users(CLI->getIndVar()->user_begin(),
+                            CLI->getIndVar()->user_end());
+  for (auto Use : Users) {
+    if (Instruction *Inst = dyn_cast<Instruction>(Use)) {
       if (ParallelRegionBlockSet.count(Inst->getParent())) {
         Inst->replaceUsesOfWith(CLI->getIndVar(), NewLoopCntLoad);
       }

diff --git a/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
new file mode 100644
index 0000000000000..e246c551886cf
--- /dev/null
+++ b/mlir/test/Target/LLVMIR/omptarget-wsloop-collapsed.mlir
@@ -0,0 +1,39 @@
+// RUN: mlir-translate -mlir-to-llvmir %s | FileCheck %s
+
+// The aim of the test is to check the GPU LLVM IR codegen
+// for nested omp do loop with collapse clause inside omp target region
+
+module attributes {dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<"dlti.alloca_memory_space", 5 : ui32>>, llvm.data_layout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8", llvm.target_triple = "amdgcn-amd-amdhsa", omp.is_gpu = true, omp.is_target_device = true } {
+  llvm.func @target_collapsed_wsloop(%arg0: !llvm.ptr) {
+    %loop_ub = llvm.mlir.constant(99 : i32) : i32
+    %loop_lb = llvm.mlir.constant(0 : i32) : i32
+    %loop_step = llvm.mlir.constant(1 : index) : i32
+    omp.wsloop for (%arg1, %arg2) : i32 = (%loop_lb, %loop_lb) to (%loop_ub, %loop_ub) inclusive step (%loop_step, %loop_step) {
+      %1 = llvm.add %arg1, %arg2 : i32
+      %2 = llvm.mul %arg2, %loop_ub overflow<nsw> : i32
+      %3 = llvm.add %arg1, %2 :i32
+      %4 = llvm.getelementptr %arg0[%3] : (!llvm.ptr, i32) -> !llvm.ptr, i32
+      llvm.store %1, %4 : i32, !llvm.ptr
+      omp.yield
+    }
+    llvm.return
+  }
+}
+
+// CHECK: define void @[[FUNC_COLLAPSED_WSLOOP:.*]](ptr %[[ARG0:.*]])
+// CHECK:   call void @__kmpc_for_static_loop_4u(ptr addrspacecast (ptr addrspace(1) @[[GLOB2:[0-9]+]] to ptr),
+// CHECK-SAME:  ptr @[[COLLAPSED_WSLOOP_BODY_FN:.*]], ptr %[[STRUCT_ARG:.*]], i32 10000,
+// CHECK-SAME:  i32 %[[NUM_THREADS:.*]], i32 0)
+
+// CHECK: define internal void @[[COLLAPSED_WSLOOP_BODY_FN]](i32 %[[LOOP_CNT:.*]], ptr %[[LOOP_BODY_ARG:.*]])
+// CHECK:   %[[TMP0:.*]] = urem i32 %[[LOOP_CNT]], 100
+// CHECK:   %[[TMP1:.*]] = udiv i32
%[[LOOP_CNT]], 100
+// CHECK:   %[[TMP2:.*]] = mul i32 %[[TMP1]], 1
+// CHECK:   %[[TMP3:.*]] = add i32 %[[TMP2]], 0
+// CHECK:   %[[TMP4:.*]] = mul i32 %[[TMP0]], 1
+// CHECK:   %[[TMP5:.*]] = add i32 %[[TMP4]], 0
+// CHECK:   %[[TMP6:.*]] = add i32 %[[TMP3]], %[[TMP5]]
+// CHECK:   %[[TMP7:.*]] = mul nsw i32 %[[TMP5]], 99
+// CHECK:   %[[TMP8:.*]] = add i32 %[[TMP3]], %[[TMP7]]
+// CHECK:   %[[TMP9:.*]] = getelementptr i32, ptr %[[ARRAY:.*]], i32 %[[TMP8]]
+// CHECK:   store i32 %[[TMP6]], ptr %[[TMP9]], align 4

diff --git a/openmp/libomptarget/test/offloading/fortran/target-parallel-do-collapse.f90 b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-collapse.f90
new file mode 100644
index 0000000000000..e44d6a50969bb
--- /dev/null
+++ b/openmp/libomptarget/test/offloading/fortran/target-parallel-do-collapse.f90
@@ -0,0 +1,44 @@
+! Basic offloading test with a target region
+! REQUIRES: flang
+! UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+! UNSUPPORTED: aarch64-unknown-linux-gnu
+! UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+! UNSUPPORTED: x86_64-pc-linux-gnu
+! UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+! RUN: %libomptarget-compile-fortran-generic
+! RUN: env LIBOMPTARGET_INFO=16 %libomptarget-run-generic 2>&1 | %fcheck-generic
+program main
+   use omp_lib
+   implicit none
+   integer :: i,j
+   integer :: array(10,10), errors = 0
+   do i = 1, 10
+      do j = 1, 10
+         array(j, i) = 0
+      end do
+   end do
+
+   !$omp target parallel do map(from:array) collapse(2)
+   do i = 1, 10
+      do j = 1, 10
+         array( j, i) = i + j
+      end do
+   end do
+   !$omp end target parallel do
+
+   do i = 1, 10
+      do j = 1, 10
+         if ( array( j, i) .ne. (i + j) ) then
+            errors = errors + 1
+         end if
+      end do
+   end do
+
+   print *,"number of errors: ", errors
+
+end program main
+
+! CHECK:  "PluginInterface" device {{[0-9]+}} info: Launching kernel {{.*}}
+! CHECK:  number of errors: 0

From 8658d157654832fe24b4f3d2a9a62784a4d6a162 Mon Sep 17 00:00:00 2001
From: Corentin Jabot
Date: Mon, 22 Jan 2024 09:29:01 +0100
Subject: [PATCH 343/843] Revert "[Clang][Sema] Diagnose function/variable
 templates that shadow their own template parameters (#78274)"

This reverts commit fc0253264445be7f88d4cf0f9129dcb10c2fb84b.

This error is disruptive to downstream projects and should be
reintroduced as a separate on-by-default warning.

https://github.com/llvm/llvm-project/pull/78274
---
 clang/docs/ReleaseNotes.rst                   |  1 -
 clang/lib/Sema/SemaDecl.cpp                   | 12 +++++------
 .../test/CXX/temp/temp.res/temp.local/p6.cpp  | 20 +++----------------
 3 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 2c7c7b8a21b8e..7c9f9ecca727a 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -590,7 +590,6 @@ Improvements to Clang's diagnostics
 - Clang now diagnoses the requirement that non-template friend declarations with requires clauses
   and template friend declarations with a constraint that depends on a template parameter from an
   enclosing template must be a definition.
-- Clang now diagnoses function/variable templates that shadow their own template parameters, e.g. ``template<typename T> void T();``.
 - Clang now diagnoses incorrect usage of ``const`` and ``pure`` attributes, so ``-Wignored-attributes`` diagnoses more cases.
 - Clang now emits more descriptive diagnostics for 'unusual' expressions (e.g.
incomplete index expressions on matrix types or builtin functions without an argument list) as placement-args

diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp
index 8dff2cdc063df..13ca438e6a487 100644
--- a/clang/lib/Sema/SemaDecl.cpp
+++ b/clang/lib/Sema/SemaDecl.cpp
@@ -6377,6 +6377,12 @@ NamedDecl *Sema::HandleDeclarator(Scope *S, Declarator &D,
   } else if (DiagnoseUnexpandedParameterPack(NameInfo, UPPC_DeclarationType))
     return nullptr;
 
+  // The scope passed in may not be a decl scope.  Zip up the scope tree until
+  // we find one that is.
+  while ((S->getFlags() & Scope::DeclScope) == 0 ||
+         (S->getFlags() & Scope::TemplateParamScope) != 0)
+    S = S->getParent();
+
   DeclContext *DC = CurContext;
   if (D.getCXXScopeSpec().isInvalid())
     D.setInvalidType();
@@ -6529,12 +6535,6 @@ NamedDecl *Sema::HandleDeclarator(Scope *S, Declarator &D,
   if (getLangOpts().CPlusPlus)
     CheckExtraCXXDefaultArguments(D);
 
-  // The scope passed in may not be a decl scope.  Zip up the scope tree until
-  // we find one that is.
-  while ((S->getFlags() & Scope::DeclScope) == 0 ||
-         (S->getFlags() & Scope::TemplateParamScope) != 0)
-    S = S->getParent();
-
   NamedDecl *New;
 
   bool AddToScope = true;

diff --git a/clang/test/CXX/temp/temp.res/temp.local/p6.cpp b/clang/test/CXX/temp/temp.res/temp.local/p6.cpp
index 0702966e56854..e2aa0ff344291 100644
--- a/clang/test/CXX/temp/temp.res/temp.local/p6.cpp
+++ b/clang/test/CXX/temp/temp.res/temp.local/p6.cpp
@@ -127,30 +127,16 @@ template<typename T> struct Z { // expected-note 16{{declared here}}
 template<typename T> // expected-note {{declared here}}
 void f(int T) {} // expected-error {{declaration of 'T' shadows template parameter}}
 
+// FIXME: These are ill-formed: a template-parameter shall not have the same name as the template name.
 namespace A {
   template<typename T> struct T {}; // expected-error{{declaration of 'T' shadows template parameter}}
                                     // expected-note@-1{{template parameter is declared here}}
-  template<typename T> struct U {
-    template<typename V> struct V {}; // expected-error{{declaration of 'V' shadows template parameter}}
-                                      // expected-note@-1{{template parameter is declared here}}
-  };
 }
 namespace B {
-  template<typename T> void T() {} // expected-error{{declaration of 'T' shadows template parameter}}
-                                   // expected-note@-1{{template parameter is declared here}}
-
-  template<typename T> struct U {
-    template<typename V> void V(); // expected-error{{declaration of 'V' shadows template parameter}}
-                                   // expected-note@-1{{template parameter is declared here}}
-  };
+  template<typename T> void T() {}
 }
 namespace C {
-  template<typename T> int T; // expected-error{{declaration of 'T' shadows template parameter}}
-                              // expected-note@-1{{template parameter is declared here}}
-  template<typename T> struct U {
-    template<typename V> static int V; // expected-error{{declaration of 'V' shadows template parameter}}
-                                       // expected-note@-1{{template parameter is declared here}}
-  };
+  template<typename T> int T;
 }
 
 namespace PR28023 {

From 262735bbcc22f216a688b934ca9ff50b427c9dc1 Mon Sep 17 00:00:00 2001
From: Zoltán Böszörményi
Date: Mon, 22 Jan 2024 09:32:13 +0100
Subject: [PATCH 344/843] libclc: add missing AMD gfx symlinks (#78884)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Fixes #44186

---------

Signed-off-by: Zoltán Böszörményi
---
 libclc/CMakeLists.txt | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/libclc/CMakeLists.txt b/libclc/CMakeLists.txt
index 9daef8265c16f..fa1d8e4adbcc4 100644
--- a/libclc/CMakeLists.txt
+++ b/libclc/CMakeLists.txt
@@ -142,17 +142,15 @@ set( cypress_aliases
hemlock ) set( barts_aliases turks caicos ) set( cayman_aliases aruba ) set( tahiti_aliases pitcairn verde oland hainan bonaire kabini kaveri hawaii - mullins tonga iceland carrizo fiji stoney polaris10 polaris11 ) - -# Support for gfx9 was added in LLVM 5.0 (r295554) -if( ${LLVM_PACKAGE_VERSION} VERSION_GREATER "4.99.99" ) - set( tahiti_aliases ${tahiti_aliases} gfx900 gfx902 ) -endif() - -# Support for Vega12 and Vega20 was added in LLVM 7 (r331215) -if( ${LLVM_PACKAGE_VERSION} VERSION_GREATER "6.99.99" ) - set( tahiti_aliases ${tahiti_aliases} gfx904 gfx906 ) -endif() + mullins tonga tongapro iceland carrizo fiji stoney polaris10 polaris11 + gfx602 gfx705 gfx805 + gfx900 gfx902 gfx904 gfx906 gfx908 gfx909 gfx90a gfx90c gfx940 gfx941 gfx942 + gfx1010 gfx1011 gfx1012 gfx1013 + gfx1030 gfx1031 gfx1032 gfx1033 gfx1034 gfx1035 gfx1036 + gfx1100 gfx1101 gfx1102 gfx1103 + gfx1150 gfx1151 + gfx1200 gfx1201 +) # pkg-config file configure_file( libclc.pc.in libclc.pc @ONLY ) From 376f019609b3eba578723c26e1635d1be31e6057 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= Date: Mon, 22 Jan 2024 09:35:08 +0100 Subject: [PATCH 345/843] [AMDGPU][NFC] Update cache policy descriptions (#78768) --- llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 150 +++++++++++++++++++---- 1 file changed, 126 insertions(+), 24 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td index 21765cdd13a15..e6db9da5526aa 100644 --- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td +++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td @@ -860,7 +860,6 @@ class AMDGPUImageDimIntrinsic.DmaskArgIndex>>]), @@ -1088,7 +1087,8 @@ def int_amdgcn_s_buffer_load : DefaultAttrsIntrinsic < [llvm_any_ty], [llvm_v4i32_ty, // rsrc(SGPR) llvm_i32_ty, // byte offset - llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 2 = dlc) + llvm_i32_ty], // cachepolicy(imm; bit 0 = glc, bit 1 = slc, bit 2 = dlc; + // gfx12+ imm: bits [0-2] = th, bits [3-4] = scope) // Note: volatile bit is **not** permitted here. 
[IntrNoMem, ImmArg>]>, AMDGPURsrcIntrinsic<0>; @@ -1128,8 +1128,13 @@ class AMDGPURawBufferLoad : DefaultAttrsIntrinsi llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrReadMem, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1143,8 +1148,13 @@ class AMDGPURawPtrBufferLoad : DefaultAttrsIntri llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, @@ -1161,8 +1171,13 @@ class AMDGPUStructBufferLoad : DefaultAttrsIntri llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrReadMem, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1177,8 +1192,13 @@ class AMDGPUStructPtrBufferLoad : DefaultAttrsIn llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, ImmArg>], "", [SDNPMemOperand]>, @@ -1194,8 +1214,13 @@ class AMDGPURawBufferStore : DefaultAttrsIntrins llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrWriteMem, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1210,8 +1235,13 @@ class AMDGPURawPtrBufferStore : DefaultAttrsIntr llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>], "", [SDNPMemOperand]>, @@ -1228,8 +1258,13 @@ class AMDGPUStructBufferStore : DefaultAttrsIntr llvm_i32_ty, // 
soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrWriteMem, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1245,8 +1280,13 @@ class AMDGPUStructPtrBufferStore : DefaultAttrsI llvm_i32_ty, // soffset(SGPR/imm, excluded from bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>], "", [SDNPMemOperand]>, @@ -1502,8 +1542,12 @@ def int_amdgcn_raw_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz)) + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz) [IntrReadMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1516,8 +1560,12 @@ def int_amdgcn_raw_ptr_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz) // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, @@ -1532,8 +1580,13 @@ def int_amdgcn_raw_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrWriteMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, @@ -1548,8 +1601,13 @@ def int_amdgcn_raw_ptr_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, @@ -1564,8 +1622,13 @@ def int_amdgcn_struct_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = 
dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrReadMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, @@ -1580,8 +1643,13 @@ def int_amdgcn_struct_ptr_tbuffer_load : DefaultAttrsIntrinsic < llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrReadMem, ReadOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, @@ -1597,9 +1665,14 @@ def int_amdgcn_struct_ptr_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), - // volatile op (bit 31, stripped at lowering)) + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: + // volatile op (bit 31, stripped at lowering)) [IntrArgMemOnly, IntrWriteMem, WriteOnly>, NoCapture>, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<1>; @@ -1614,8 +1687,13 @@ def int_amdgcn_struct_tbuffer_store : DefaultAttrsIntrinsic < llvm_i32_ty, // format(imm; bits 3..0 = dfmt, bits 6..4 = nfmt) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+), + // bit 2 = dlc on gfx10/gfx11), // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrWriteMem, ImmArg>, ImmArg>], "", [SDNPMemOperand]>, @@ -1676,8 +1754,13 @@ class AMDGPURawBufferLoadLDS : Intrinsic < llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+)) + // bit 2 = dlc on gfx10/gfx11)) // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, NoCapture>, ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1693,8 +1776,13 @@ class AMDGPURawPtrBufferLoadLDS : Intrinsic < llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+)) + // bit 2 = dlc on gfx10/gfx11)) // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, IntrArgMemOnly, ReadOnly>, NoCapture>, @@ -1714,8 +1802,13 @@ class AMDGPUStructBufferLoadLDS : Intrinsic < llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+)) + // bit 2 = dlc on gfx10/gfx11)) // swizzled buffer (bit 
3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, NoCapture>, ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>, AMDGPURsrcIntrinsic<0>; @@ -1732,8 +1825,13 @@ class AMDGPUStructPtrBufferLoadLDS : Intrinsic < llvm_i32_ty, // imm offset(imm, included in bounds checking and swizzling) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc, // bit 1 = slc, - // bit 2 = dlc on gfx10+)) + // bit 2 = dlc on gfx10/gfx11)) // swizzled buffer (bit 3 = swz), + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), + // all: // volatile op (bit 31, stripped at lowering)) [IntrWillReturn, IntrArgMemOnly, ReadOnly>, NoCapture>, @@ -2407,8 +2505,12 @@ class AMDGPUGlobalLoadLDS : Intrinsic < llvm_i32_ty, // imm offset (applied to both global and LDS address) llvm_i32_ty], // auxiliary data (imm, cachepolicy (bit 0 = glc/sc0, // bit 1 = slc/sc1, - // bit 2 = dlc on gfx10+)) + // bit 2 = dlc on gfx10/gfx11)) // bit 4 = scc/nt on gfx90a+)) + // gfx12+: + // cachepolicy (bits [0-2] = th, + // bits [3-4] = scope) + // swizzled buffer (bit 6 = swz), [IntrWillReturn, NoCapture>, NoCapture>, ImmArg>, ImmArg>, ImmArg>, IntrNoCallback, IntrNoFree], "", [SDNPMemOperand]>; From f36845d0c696023ea97931a4201b43ddfababf9c Mon Sep 17 00:00:00 2001 From: David Chisnall Date: Mon, 22 Jan 2024 08:38:41 +0000 Subject: [PATCH 346/843] Enable direct methods and fast alloc calls for libobjc2. (#78030) These will be supported in the upcoming 2.2 release and so are gated on that version. Direct methods call `objc_send_initialize` if they are class methods that may not have called initialize. This is guarded by checking for the class flag bit that is set on initialisation in the class. This bit now forms part of the ABI, but it's been stable for 30+ years so that's fine as a contract going forwards. --- clang/include/clang/Basic/ObjCRuntime.h | 15 +- clang/lib/CodeGen/CGObjCGNU.cpp | 214 ++++++++++++++++-- .../test/CodeGenObjC/gnustep2-direct-method.m | 37 +++ 3 files changed, 239 insertions(+), 27 deletions(-) create mode 100644 clang/test/CodeGenObjC/gnustep2-direct-method.m diff --git a/clang/include/clang/Basic/ObjCRuntime.h b/clang/include/clang/Basic/ObjCRuntime.h index f05debe6fea51..1ccf60f0b7bee 100644 --- a/clang/include/clang/Basic/ObjCRuntime.h +++ b/clang/include/clang/Basic/ObjCRuntime.h @@ -211,7 +211,13 @@ class ObjCRuntime { case GCC: return false; case GNUstep: - return false; + // This could be enabled for all versions, except for the fact that the + // implementation of `objc_retain` and friends prior to 2.2 call [object + // retain] in their fall-back paths, which leads to infinite recursion if + // the runtime is built with this enabled. Since distributions typically + // build all Objective-C things with the same compiler version and flags, + // it's better to be conservative here. 
+    return (getVersion() >= VersionTuple(2, 2));
   case ObjFW:
     return false;
   }
@@ -248,7 +254,7 @@ class ObjCRuntime {
     case GCC:
       return false;
     case GNUstep:
-      return false;
+      return getVersion() >= VersionTuple(2, 2);
     case ObjFW:
       return false;
     }
@@ -266,6 +272,8 @@ class ObjCRuntime {
       return getVersion() >= VersionTuple(12, 2);
     case WatchOS:
       return getVersion() >= VersionTuple(5, 2);
+    case GNUstep:
+      return getVersion() >= VersionTuple(2, 2);
     default:
       return false;
     }
@@ -463,7 +471,8 @@ class ObjCRuntime {
     case iOS: return true;
     case WatchOS: return true;
     case GCC: return false;
-    case GNUstep: return false;
+    case GNUstep:
+      return (getVersion() >= VersionTuple(2, 2));
     case ObjFW: return false;
     }
     llvm_unreachable("bad kind");

diff --git a/clang/lib/CodeGen/CGObjCGNU.cpp b/clang/lib/CodeGen/CGObjCGNU.cpp
index 9cc7f32815f7e..a36b0cdddaf0a 100644
--- a/clang/lib/CodeGen/CGObjCGNU.cpp
+++ b/clang/lib/CodeGen/CGObjCGNU.cpp
@@ -18,6 +18,8 @@
 #include "CGObjCRuntime.h"
 #include "CodeGenFunction.h"
 #include "CodeGenModule.h"
+#include "CodeGenTypes.h"
+#include "SanitizerMetadata.h"
 #include "clang/AST/ASTContext.h"
 #include "clang/AST/Attr.h"
 #include "clang/AST/Decl.h"
@@ -597,6 +599,10 @@ class CGObjCGNU : public CGObjCRuntime {
   llvm::Function *GenerateMethod(const ObjCMethodDecl *OMD,
                                  const ObjCContainerDecl *CD) override;
+
+  // Map to unify direct method definitions.
+  llvm::DenseMap<const ObjCMethodDecl *, llvm::Function *>
+      DirectMethodDefinitions;
   void GenerateDirectMethodPrologue(CodeGenFunction &CGF, llvm::Function *Fn,
                                     const ObjCMethodDecl *OMD,
                                     const ObjCContainerDecl *CD) override;
@@ -917,6 +923,14 @@ class CGObjCGNUstep2 : public CGObjCGNUstep {
     ClassAliasSection,
     ConstantStringSection
   };
+  /// The subset of `objc_class_flags` used at compile time.
+  enum ClassFlags {
+    /// This is a metaclass
+    ClassFlagMeta = (1 << 0),
+    /// This class has been initialised by the runtime (+initialize has been
+    /// sent if necessary).
+    ClassFlagInitialized = (1 << 8),
+  };
   static const char *const SectionsBaseNames[8];
   static const char *const PECOFFSectionsBaseNames[8];
   template<SectionKind K>
@@ -932,6 +946,8 @@ class CGObjCGNUstep2 : public CGObjCGNUstep {
   /// structure describing the receiver and the class, and a selector as
   /// arguments.  Returns the IMP for the corresponding method.
   LazyRuntimeFunction MsgLookupSuperFn;
+  /// Function to ensure that +initialize is sent to a class.
+  LazyRuntimeFunction SentInitializeFn;
   /// A flag indicating if we've emitted at least one protocol.
   /// If we haven't, then we need to emit an empty protocol, to ensure that the
   /// __start__objc_protocols and __stop__objc_protocols sections exist.
@@ -1719,7 +1735,7 @@ class CGObjCGNUstep2 : public CGObjCGNUstep {
     metaclassFields.addInt(LongTy, 0);
     // unsigned long info;
     // objc_class_flag_meta
-    metaclassFields.addInt(LongTy, 1);
+    metaclassFields.addInt(LongTy, ClassFlags::ClassFlagMeta);
     // long instance_size;
     // Setting this to zero is consistent with the older ABI, but it might be
     // more sensible to set this to sizeof(struct objc_class)
@@ -1993,6 +2009,8 @@ class CGObjCGNUstep2 : public CGObjCGNUstep {
   CGObjCGNUstep2(CodeGenModule &Mod) : CGObjCGNUstep(Mod, 10, 4, 2) {
     MsgLookupSuperFn.init(&CGM, "objc_msg_lookup_super", IMPTy,
                           PtrToObjCSuperTy, SelectorTy);
+    SentInitializeFn.init(&CGM, "objc_send_initialize",
+                          llvm::Type::getVoidTy(VMContext), IdTy);
     // struct objc_property
     // {
     //   const char *name;
@@ -2006,6 +2024,106 @@ class CGObjCGNUstep2 : public CGObjCGNUstep {
         { PtrToInt8Ty, PtrToInt8Ty, PtrToInt8Ty, PtrToInt8Ty, PtrToInt8Ty });
   }
 
+  void GenerateDirectMethodPrologue(CodeGenFunction &CGF, llvm::Function *Fn,
+                                    const ObjCMethodDecl *OMD,
+                                    const ObjCContainerDecl *CD) override {
+    auto &Builder = CGF.Builder;
+    bool ReceiverCanBeNull = true;
+    auto selfAddr = CGF.GetAddrOfLocalVar(OMD->getSelfDecl());
+    auto selfValue = Builder.CreateLoad(selfAddr);
+
+    // Generate:
+    //
+    // /* unless the receiver is never NULL */
+    // if (self == nil) {
+    //     return (ReturnType){ };
+    // }
+    //
+    // /* for class methods only to force class lazy initialization */
+    // if (!__objc_{class}_initialized)
+    // {
+    //   objc_send_initialize(class);
+    //   __objc_{class}_initialized = 1;
+    // }
+    //
+    // _cmd = @selector(...)
+    // ...
+
+    if (OMD->isClassMethod()) {
+      const ObjCInterfaceDecl *OID = cast<ObjCInterfaceDecl>(CD);
+
+      // Nullable `Class` expressions cannot be messaged with a direct method
+      // so the only reason why the receiver can be null would be because
+      // of weak linking.
+      ReceiverCanBeNull = isWeakLinkedClass(OID);
+    }
+
+    llvm::MDBuilder MDHelper(CGM.getLLVMContext());
+    if (ReceiverCanBeNull) {
+      llvm::BasicBlock *SelfIsNilBlock =
+          CGF.createBasicBlock("objc_direct_method.self_is_nil");
+      llvm::BasicBlock *ContBlock =
+          CGF.createBasicBlock("objc_direct_method.cont");
+
+      // if (self == nil) {
+      auto selfTy = cast<llvm::PointerType>(selfValue->getType());
+      auto Zero = llvm::ConstantPointerNull::get(selfTy);
+
+      Builder.CreateCondBr(Builder.CreateICmpEQ(selfValue, Zero),
+                           SelfIsNilBlock, ContBlock,
+                           MDHelper.createBranchWeights(1, 1 << 20));
+
+      CGF.EmitBlock(SelfIsNilBlock);
+
+      // return (ReturnType){ };
+      auto retTy = OMD->getReturnType();
+      Builder.SetInsertPoint(SelfIsNilBlock);
+      if (!retTy->isVoidType()) {
+        CGF.EmitNullInitialization(CGF.ReturnValue, retTy);
+      }
+      CGF.EmitBranchThroughCleanup(CGF.ReturnBlock);
+      // }
+
+      // rest of the body
+      CGF.EmitBlock(ContBlock);
+      Builder.SetInsertPoint(ContBlock);
+    }
+
+    if (OMD->isClassMethod()) {
+      // Prefix of the class type.
+      auto *classStart =
+          llvm::StructType::get(PtrTy, PtrTy, PtrTy, LongTy, LongTy);
+      auto &astContext = CGM.getContext();
+      auto flags = Builder.CreateLoad(
+          Address{Builder.CreateStructGEP(classStart, selfValue, 4), LongTy,
+                  CharUnits::fromQuantity(
+                      astContext.getTypeAlign(astContext.UnsignedLongTy))});
+      auto isInitialized =
+          Builder.CreateAnd(flags, ClassFlags::ClassFlagInitialized);
+      llvm::BasicBlock *notInitializedBlock =
+          CGF.createBasicBlock("objc_direct_method.class_uninitialized");
+      llvm::BasicBlock *initializedBlock =
+          CGF.createBasicBlock("objc_direct_method.class_initialized");
+      Builder.CreateCondBr(Builder.CreateICmpEQ(isInitialized, Zeros[0]),
+                           notInitializedBlock, initializedBlock,
+                           MDHelper.createBranchWeights(1, 1 << 20));
+      CGF.EmitBlock(notInitializedBlock);
+      Builder.SetInsertPoint(notInitializedBlock);
+      CGF.EmitRuntimeCall(SentInitializeFn, selfValue);
+      Builder.CreateBr(initializedBlock);
+      CGF.EmitBlock(initializedBlock);
+      Builder.SetInsertPoint(initializedBlock);
+    }
+
+    // only synthesize _cmd if it's referenced
+    if (OMD->getCmdDecl()->isUsed()) {
+      // `_cmd` is not a parameter to direct methods, so storage must be
+      // explicitly declared for it.
+      CGF.EmitVarDecl(*OMD->getCmdDecl());
+      Builder.CreateStore(GetSelector(CGF, OMD),
+                          CGF.GetAddrOfLocalVar(OMD->getCmdDecl()));
+    }
+  }
 };
 
 const char *const CGObjCGNUstep2::SectionsBaseNames[8] =
@@ -2649,13 +2767,18 @@ CGObjCGNU::GenerateMessageSend(CodeGenFunction &CGF,
     }
   }
 
+  bool isDirect = Method && Method->isDirectMethod();
+
   IdTy = cast<llvm::PointerType>(CGM.getTypes().ConvertType(ASTIdTy));
   llvm::Value *cmd;
-  if (Method)
-    cmd = GetSelector(CGF, Method);
-  else
-    cmd = GetSelector(CGF, Sel);
-  cmd = EnforceType(Builder, cmd, SelectorTy);
+  if (!isDirect) {
+    if (Method)
+      cmd = GetSelector(CGF, Method);
+    else
+      cmd = GetSelector(CGF, Sel);
+    cmd = EnforceType(Builder, cmd, SelectorTy);
+  }
+
   Receiver = EnforceType(Builder, Receiver, IdTy);
 
   llvm::Metadata *impMD[] = {
@@ -2667,7 +2790,8 @@ CGObjCGNU::GenerateMessageSend(CodeGenFunction &CGF,
 
   CallArgList ActualArgs;
   ActualArgs.add(RValue::get(Receiver), ASTIdTy);
-  ActualArgs.add(RValue::get(cmd), CGF.getContext().getObjCSelType());
+  if (!isDirect)
+    ActualArgs.add(RValue::get(cmd), CGF.getContext().getObjCSelType());
   ActualArgs.addFrom(CallArgs);
 
   MessageSendInfo MSI = getMessageSendInfo(Method, ResultType, ActualArgs);
@@ -2686,7 +2810,7 @@ CGObjCGNU::GenerateMessageSend(CodeGenFunction &CGF,
   // Rather than doing a whole target-specific analysis, we assume it
   // only works for void, integer, and pointer types, and in all
   // other cases we do an explicit nil check is emitted code.  In
-  // addition to ensuring we produe a zero value for other types, this
+  // addition to ensuring we produce a zero value for other types, this
   // sidesteps the few outright CC incompatibilities we know about that
   // could otherwise lead to crashes, like when a method is expected to
   // return on the x87 floating point stack or adjust the stack pointer
@@ -2720,8 +2844,9 @@ CGObjCGNU::GenerateMessageSend(CodeGenFunction &CGF,
       // FIXME: we probably need a size limit here, but we've
       // never imposed one before
     } else {
-      // Otherwise, use an explicit check just to be sure.
-      requiresExplicitZeroResult = true;
+      // Otherwise, use an explicit check just to be sure, unless we're
+      // calling a direct method, where the implementation does this for us.
+ requiresExplicitZeroResult = !isDirect; } } @@ -2765,10 +2890,14 @@ CGObjCGNU::GenerateMessageSend(CodeGenFunction &CGF, // Get the IMP to call llvm::Value *imp; - // If we have non-legacy dispatch specified, we try using the objc_msgSend() - // functions. These are not supported on all platforms (or all runtimes on a - // given platform), so we - switch (CGM.getCodeGenOpts().getObjCDispatchMethod()) { + // If this is a direct method, just emit it here. + if (isDirect) + imp = GenerateMethod(Method, Method->getClassInterface()); + else + // If we have non-legacy dispatch specified, we try using the + // objc_msgSend() functions. These are not supported on all platforms + // (or all runtimes on a given platform), so we + switch (CGM.getCodeGenOpts().getObjCDispatchMethod()) { case CodeGenOptions::Legacy: imp = LookupIMP(CGF, Receiver, cmd, node, MSI); break; @@ -2791,7 +2920,7 @@ CGObjCGNU::GenerateMessageSend(CodeGenFunction &CGF, llvm::FunctionType::get(IdTy, IdTy, true), "objc_msgSend") .getCallee(); } - } + } // Reset the receiver in case the lookup modified it ActualArgs[0] = CallArg(RValue::get(Receiver), ASTIdTy); @@ -2801,7 +2930,8 @@ CGObjCGNU::GenerateMessageSend(CodeGenFunction &CGF, llvm::CallBase *call; CGCallee callee(CGCalleeInfo(), imp); RValue msgRet = CGF.EmitCall(MSI.CallInfo, callee, Return, ActualArgs, &call); - call->setMetadata(msgSendMDKind, node); + if (!isDirect) + call->setMetadata(msgSendMDKind, node); if (requiresNilReceiverCheck) { llvm::BasicBlock *nonNilPathBB = CGF.Builder.GetInsertBlock(); @@ -3924,14 +4054,50 @@ llvm::Function *CGObjCGNU::GenerateMethod(const ObjCMethodDecl *OMD, CodeGenTypes &Types = CGM.getTypes(); llvm::FunctionType *MethodTy = Types.GetFunctionType(Types.arrangeObjCMethodDeclaration(OMD)); - std::string FunctionName = getSymbolNameForMethod(OMD); - - llvm::Function *Method - = llvm::Function::Create(MethodTy, - llvm::GlobalValue::InternalLinkage, - FunctionName, - &TheModule); - return Method; + + bool isDirect = OMD->isDirectMethod(); + std::string FunctionName = + getSymbolNameForMethod(OMD, /*include category*/ !isDirect); + + if (!isDirect) + return llvm::Function::Create(MethodTy, + llvm::GlobalVariable::InternalLinkage, + FunctionName, &TheModule); + + auto *COMD = OMD->getCanonicalDecl(); + auto I = DirectMethodDefinitions.find(COMD); + llvm::Function *OldFn = nullptr, *Fn = nullptr; + + if (I == DirectMethodDefinitions.end()) { + auto *F = + llvm::Function::Create(MethodTy, llvm::GlobalVariable::ExternalLinkage, + FunctionName, &TheModule); + DirectMethodDefinitions.insert(std::make_pair(COMD, F)); + return F; + } + + // Objective-C allows for the declaration and implementation types + // to differ slightly. + // + // If we're being asked for the Function associated for a method + // implementation, a previous value might have been cached + // based on the type of the canonical declaration. + // + // If these do not match, then we'll replace this function with + // a new one that has the proper type below. + if (!OMD->getBody() || COMD->getReturnType() == OMD->getReturnType()) + return I->second; + + OldFn = I->second; + Fn = llvm::Function::Create(MethodTy, llvm::GlobalValue::ExternalLinkage, "", + &CGM.getModule()); + Fn->takeName(OldFn); + OldFn->replaceAllUsesWith(Fn); + OldFn->eraseFromParent(); + + // Replace the cached function in the map. 
+  I->second = Fn;
+  return Fn;
 }
 
 void CGObjCGNU::GenerateDirectMethodPrologue(CodeGenFunction &CGF,
diff --git a/clang/test/CodeGenObjC/gnustep2-direct-method.m b/clang/test/CodeGenObjC/gnustep2-direct-method.m
new file mode 100644
index 0000000000000..7fa2775eee39b
--- /dev/null
+++ b/clang/test/CodeGenObjC/gnustep2-direct-method.m
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -triple x86_64-unknown-freebsd -S -emit-llvm -fobjc-runtime=gnustep-2.2 -o - %s | FileCheck %s
+
+@interface X
+@end
+
+@implementation X
+//- (int)x __attribute__((objc_direct)) { return 12; }
+- (int)x __attribute__((objc_direct)) { return 12; }
+
+// Check that the name is mangled like Objective-C methods and contains a nil check
+// CHECK-LABEL: @_i_X__x
+// CHECK: icmp eq ptr %0, null
+
++ (int)clsMeth __attribute__((objc_direct)) { return 42; }
+// Check that the name is mangled like Objective-C methods and contains an initialisation check
+// CHECK-LABEL: @_c_X__clsMeth
+// CHECK: getelementptr inbounds { ptr, ptr, ptr, i64, i64 }, ptr %0, i32 0, i32 4
+// CHECK: load i64, ptr %1, align 64
+// CHECK: and i64 %2, 256
+// CHECK: objc_direct_method.class_uninitialized:
+// CHECK: call void @objc_send_initialize(ptr %0)
+
+@end
+
+// Check that the call sides are set up correctly.
+void callCls(void)
+{
+  // CHECK: call i32 @_c_X__clsMeth
+  [X clsMeth];
+}
+
+void callInstance(X *x)
+{
+  // CHECK: call i32 @_i_X__x
+  [x x];
+}

From 5fb39efe680642c6cab072560efa3bfce6646fb0 Mon Sep 17 00:00:00 2001
From: Andrei Golubev
Date: Mon, 22 Jan 2024 10:39:11 +0200
Subject: [PATCH 347/843] [LLVM][ADT] Explicitly convert size_t values to
 SmallVector's size type (#77939)

Multiple places rely on implicit conversion when assigning 'size_t'
values to the member fields (size or capacity) of SmallVector.
Depending on the platform / compiler configuration, this may result in
narrowing conversion warnings (especially given that the size type of
SmallVector's member fields is determined based on type T - in
SmallVector<T>). To avoid the problem altogether, make the conversions
explicit.

Co-authored-by: Orest Chura
---
 llvm/include/llvm/ADT/SmallVector.h | 19 ++++++++++++++-----
 llvm/lib/Support/SmallVector.cpp    |  3 +--
 2 files changed, 15 insertions(+), 7 deletions(-)

diff --git a/llvm/include/llvm/ADT/SmallVector.h b/llvm/include/llvm/ADT/SmallVector.h
index 2e6d2dc6ce90a..09676d792dfeb 100644
--- a/llvm/include/llvm/ADT/SmallVector.h
+++ b/llvm/include/llvm/ADT/SmallVector.h
@@ -61,7 +61,7 @@ template <class Size_T> class SmallVectorBase {
   SmallVectorBase() = delete;
   SmallVectorBase(void *FirstEl, size_t TotalCapacity)
-      : BeginX(FirstEl), Capacity(TotalCapacity) {}
+      : BeginX(FirstEl), Capacity(static_cast<Size_T>(TotalCapacity)) {}
 
   /// This is a helper for \a grow() that's out of line to reduce code
   /// duplication.  This function will report a fatal error if it can't grow at
@@ -99,8 +99,18 @@ template <class Size_T> class SmallVectorBase {
   ///
   /// This does not construct or destroy any elements in the vector.
   void set_size(size_t N) {
-    assert(N <= capacity());
-    Size = N;
+    assert(N <= capacity()); // implies no overflow in assignment
+    Size = static_cast<Size_T>(N);
+  }
+
+  /// Set the array data pointer to \p Begin and capacity to \p N.
+  ///
+  /// This does not construct or destroy any elements in the vector.
+  // This does not clean up any existing allocation.
+  void set_allocation_range(void *Begin, size_t N) {
+    assert(N <= SizeTypeMax());
+    BeginX = Begin;
+    Capacity = static_cast<Size_T>(N);
   }
 };
 
@@ -467,8 +477,7 @@ void SmallVectorTemplateBase<T, TriviallyCopyable>::takeAllocationForGrow(
   if (!this->isSmall())
     free(this->begin());
 
-  this->BeginX = NewElts;
-  this->Capacity = NewCapacity;
+  this->set_allocation_range(NewElts, NewCapacity);
 }
 
 /// SmallVectorTemplateBase<TriviallyCopyable = true> - This is where we put
diff --git a/llvm/lib/Support/SmallVector.cpp b/llvm/lib/Support/SmallVector.cpp
index f7e7e80332cc3..b6ce37842040b 100644
--- a/llvm/lib/Support/SmallVector.cpp
+++ b/llvm/lib/Support/SmallVector.cpp
@@ -153,8 +153,7 @@ void SmallVectorBase<Size_T>::grow_pod(void *FirstEl, size_t MinSize,
     NewElts = replaceAllocation(NewElts, TSize, NewCapacity, size());
   }
 
-  this->BeginX = NewElts;
-  this->Capacity = NewCapacity;
+  this->set_allocation_range(NewElts, NewCapacity);
 }
 
 template class llvm::SmallVectorBase<uint32_t>;

From 68a5261d260e95148755c6725f56957cb8fd23a3 Mon Sep 17 00:00:00 2001
From: Abhinav271828 <71174780+Abhinav271828@users.noreply.github.com>
Date: Mon, 22 Jan 2024 14:22:01 +0530
Subject: [PATCH 348/843] [MLIR][Presburger] Implement function to evaluate
 the number of terms in a generating function. (#78078)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

We implement `computeNumTerms()`, which counts the number of terms in a
generating function by substituting the unit vector in it.
This is the main function in Barvinok's algorithm – the number of points
in a polytope is given by the number of terms in the generating function
corresponding to it.
We also modify the GeneratingFunction class to have `const` getters and
improve the simplification of QuasiPolynomials.
---
 .../mlir/Analysis/Presburger/Barvinok.h       |   6 +
 .../Analysis/Presburger/GeneratingFunction.h  |  10 +-
 .../Analysis/Presburger/QuasiPolynomial.h     |   7 +-
 mlir/include/mlir/Analysis/Presburger/Utils.h |   5 +
 mlir/lib/Analysis/Presburger/Barvinok.cpp     | 239 ++++++++++++++++++
 .../Analysis/Presburger/QuasiPolynomial.cpp   |  47 +++-
 mlir/lib/Analysis/Presburger/Utils.cpp        |  27 ++
 .../Analysis/Presburger/BarvinokTest.cpp      | 110 ++++++++
 .../Analysis/Presburger/UtilsTest.cpp         |  19 ++
 9 files changed, 464 insertions(+), 6 deletions(-)

diff --git a/mlir/include/mlir/Analysis/Presburger/Barvinok.h b/mlir/include/mlir/Analysis/Presburger/Barvinok.h
index edee19f0e1a53..b70ec33b69323 100644
--- a/mlir/include/mlir/Analysis/Presburger/Barvinok.h
+++ b/mlir/include/mlir/Analysis/Presburger/Barvinok.h
@@ -99,6 +99,12 @@ QuasiPolynomial getCoefficientInRationalFunction(unsigned power,
                                                  ArrayRef<Fraction> num,
                                                  ArrayRef<Fraction> den);
 
+/// Find the number of terms in a generating function, as
+/// a quasipolynomial in the parameter space of the input function.
+/// The generating function must be such that for all values of the
+/// parameters, the number of terms is finite.
+QuasiPolynomial computeNumTerms(const GeneratingFunction &gf);
+
 } // namespace detail
 } // namespace presburger
 } // namespace mlir

diff --git a/mlir/include/mlir/Analysis/Presburger/GeneratingFunction.h b/mlir/include/mlir/Analysis/Presburger/GeneratingFunction.h
index eaf0449fe8a11..c38eab6efd0fc 100644
--- a/mlir/include/mlir/Analysis/Presburger/GeneratingFunction.h
+++ b/mlir/include/mlir/Analysis/Presburger/GeneratingFunction.h
@@ -62,13 +62,15 @@ class GeneratingFunction {
 #endif // NDEBUG
   }
 
-  unsigned getNumParams() { return numParam; }
+  unsigned getNumParams() const { return numParam; }
 
-  SmallVector<int> getSigns() { return signs; }
+  SmallVector<int> getSigns() const { return signs; }
 
-  std::vector<ParamPoint> getNumerators() { return numerators; }
+  std::vector<ParamPoint> getNumerators() const { return numerators; }
 
-  std::vector<std::vector<Point>> getDenominators() { return denominators; }
+  std::vector<std::vector<Point>> getDenominators() const {
+    return denominators;
+  }
 
   GeneratingFunction operator+(GeneratingFunction &gf) const {
     assert(numParam == gf.getNumParams() &&
diff --git a/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h b/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h
index d03446f50264f..aeac19e827b44 100644
--- a/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h
+++ b/mlir/include/mlir/Analysis/Presburger/QuasiPolynomial.h
@@ -59,9 +59,14 @@ class QuasiPolynomial : public PresburgerSpace {
   QuasiPolynomial operator*(const QuasiPolynomial &x) const;
   QuasiPolynomial operator/(const Fraction x) const;
 
-  // Removes terms which evaluate to zero from the expression.
+  // Removes terms which evaluate to zero from the expression
+  // and folds affine functions which are constant into the
+  // constant coefficients.
   QuasiPolynomial simplify();
 
+  // Group together like terms in the expression.
+  QuasiPolynomial collectTerms();
+
   Fraction getConstantTerm();
 
 private:
diff --git a/mlir/include/mlir/Analysis/Presburger/Utils.h b/mlir/include/mlir/Analysis/Presburger/Utils.h
index 20af0bfcd62ba..e6d29e4ef6d06 100644
--- a/mlir/include/mlir/Analysis/Presburger/Utils.h
+++ b/mlir/include/mlir/Analysis/Presburger/Utils.h
@@ -281,6 +281,11 @@ SmallVector<MPInt, 8> getComplementIneq(ArrayRef<MPInt> ineq);
 /// The vectors must have the same sizes.
 Fraction dotProduct(ArrayRef<Fraction> a, ArrayRef<Fraction> b);
 
+/// Find the product of two polynomials, each given by an array of
+/// coefficients.
+std::vector<Fraction> multiplyPolynomials(ArrayRef<Fraction> a,
+                                          ArrayRef<Fraction> b);
+
 } // namespace presburger
 } // namespace mlir
 
diff --git a/mlir/lib/Analysis/Presburger/Barvinok.cpp b/mlir/lib/Analysis/Presburger/Barvinok.cpp
index 4ba4462af0317..e0fd0dd8caa4d 100644
--- a/mlir/lib/Analysis/Presburger/Barvinok.cpp
+++ b/mlir/lib/Analysis/Presburger/Barvinok.cpp
@@ -7,6 +7,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Analysis/Presburger/Barvinok.h"
+#include "mlir/Analysis/Presburger/Utils.h"
 #include "llvm/ADT/Sequence.h"
 #include <algorithm>
 
@@ -245,3 +246,241 @@ QuasiPolynomial mlir::presburger::detail::getCoefficientInRationalFunction(
   }
   return coefficients[power].simplify();
 }
+
+/// Substitute x_i = t^μ_i in one term of a generating function, returning
+/// a quasipolynomial which represents the exponent of the numerator
+/// of the result, and a vector which represents the exponents of the
+/// denominator of the result.
+/// If the returned value is {num, dens}, it represents the function
+/// t^num / \prod_j (1 - t^dens[j]).
+/// v represents the affine functions whose floors are multiplied by the +/// generators, and ds represents the list of generators. +std::pair> +substituteMuInTerm(unsigned numParams, ParamPoint v, std::vector ds, + Point mu) { + unsigned numDims = mu.size(); + for (const Point &d : ds) + assert(d.size() == numDims && + "μ has to have the same number of dimensions as the generators!"); + + // First, the exponent in the numerator becomes + // - (μ • u_1) * (floor(first col of v)) + // - (μ • u_2) * (floor(second col of v)) - ... + // - (μ • u_d) * (floor(d'th col of v)) + // So we store the negation of the dot products. + + // We have d terms, each of whose coefficient is the negative dot product. + SmallVector coefficients; + coefficients.reserve(numDims); + for (const Point &d : ds) + coefficients.push_back(-dotProduct(mu, d)); + + // Then, the affine function is a single floor expression, given by the + // corresponding column of v. + ParamPoint vTranspose = v.transpose(); + std::vector>> affine; + affine.reserve(numDims); + for (unsigned j = 0; j < numDims; ++j) + affine.push_back({SmallVector(vTranspose.getRow(j))}); + + QuasiPolynomial num(numParams, coefficients, affine); + num = num.simplify(); + + std::vector dens; + dens.reserve(ds.size()); + // Similarly, each term in the denominator has exponent + // given by the dot product of μ with u_i. + for (const Point &d : ds) { + // This term in the denominator is + // (1 - t^dens.back()) + dens.push_back(dotProduct(d, mu)); + } + + return {num, dens}; +} + +/// Normalize all denominator exponents `dens` to their absolute values +/// by multiplying and dividing by the inverses, in a function of the form +/// sign * t^num / prod_j (1 - t^dens[j]). +/// Here, sign = ± 1, +/// num is a QuasiPolynomial, and +/// each dens[j] is a Fraction. +void normalizeDenominatorExponents(int &sign, QuasiPolynomial &num, + std::vector &dens) { + // We track the number of exponents that are negative in the + // denominator, and convert them to their absolute values. + unsigned numNegExps = 0; + Fraction sumNegExps(0, 1); + for (unsigned j = 0, e = dens.size(); j < e; ++j) { + if (dens[j] < 0) { + numNegExps += 1; + sumNegExps += dens[j]; + } + } + + // If we have (1 - t^-c) in the denominator, for positive c, + // multiply and divide by t^c. + // We convert all negative-exponent terms at once; therefore + // we multiply and divide by t^sumNegExps. + // Then we get + // -(1 - t^c) in the denominator, + // increase the numerator by c, and + // flip the sign of the function. + if (numNegExps % 2 == 1) + sign = -sign; + num = num - QuasiPolynomial(num.getNumInputs(), sumNegExps); +} + +/// Compute the binomial coefficients nCi for 0 ≤ i ≤ r, +/// where n is a QuasiPolynomial. +std::vector getBinomialCoefficients(QuasiPolynomial n, + unsigned r) { + unsigned numParams = n.getNumInputs(); + std::vector coefficients; + coefficients.reserve(r + 1); + coefficients.push_back(QuasiPolynomial(numParams, 1)); + for (unsigned j = 1; j <= r; ++j) + // We use the recursive formula for binomial coefficients here and below. + coefficients.push_back( + (coefficients[j - 1] * (n - QuasiPolynomial(numParams, j - 1)) / + Fraction(j, 1)) + .simplify()); + return coefficients; +} + +/// Compute the binomial coefficients nCi for 0 ≤ i ≤ r, +/// where n is a QuasiPolynomial. 
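+/// (Editorial note: in this overload n and r are scalar Fractions rather
+/// than quasipolynomials, despite the comment above. Illustratively,
+/// n = 5 and r = 3 yield {5C0, 5C1, 5C2, 5C3} = {1, 5, 10, 10} via the
+/// recurrence nCj = nC(j-1) * (n - j + 1) / j.)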
+std::vector getBinomialCoefficients(Fraction n, Fraction r) { + std::vector coefficients; + coefficients.reserve((int64_t)floor(r)); + coefficients.push_back(1); + for (unsigned j = 1; j <= r; ++j) + coefficients.push_back(coefficients[j - 1] * (n - (j - 1)) / (j)); + return coefficients; +} + +/// We have a generating function of the form +/// f_p(x) = \sum_i sign_i * (x^n_i(p)) / (\prod_j (1 - x^d_{ij}) +/// +/// where sign_i is ±1, +/// n_i \in Q^p -> Q^d is the sum of the vectors d_{ij}, weighted by the +/// floors of d affine functions on p parameters. +/// d_{ij} \in Q^d are vectors. +/// +/// We need to find the number of terms of the form x^t in the expansion of +/// this function. +/// However, direct substitution (x = (1, ..., 1)) causes the denominator +/// to become zero. +/// +/// We therefore use the following procedure instead: +/// 1. Substitute x_i = (s+1)^μ_i for some vector μ. This makes the generating +/// function a function of a scalar s. +/// 2. Write each term in this function as P(s)/Q(s), where P and Q are +/// polynomials. P has coefficients as quasipolynomials in d parameters, while +/// Q has coefficients as scalars. +/// 3. Find the constant term in the expansion of each term P(s)/Q(s). This is +/// equivalent to substituting s = 0. +/// +/// Verdoolaege, Sven, et al. "Counting integer points in parametric +/// polytopes using Barvinok's rational functions." Algorithmica 48 (2007): +/// 37-66. +QuasiPolynomial +mlir::presburger::detail::computeNumTerms(const GeneratingFunction &gf) { + // Step (1) We need to find a μ such that we can substitute x_i = + // (s+1)^μ_i. After this substitution, the exponent of (s+1) in the + // denominator is (μ_i • d_{ij}) in each term. Clearly, this cannot become + // zero. Hence we find a vector μ that is not orthogonal to any of the + // d_{ij} and substitute x accordingly. + std::vector allDenominators; + for (ArrayRef den : gf.getDenominators()) + allDenominators.insert(allDenominators.end(), den.begin(), den.end()); + Point mu = getNonOrthogonalVector(allDenominators); + + unsigned numParams = gf.getNumParams(); + const std::vector> &ds = gf.getDenominators(); + QuasiPolynomial totalTerm(numParams, 0); + for (unsigned i = 0, e = ds.size(); i < e; ++i) { + int sign = gf.getSigns()[i]; + + // Compute the new exponents of (s+1) for the numerator and the + // denominator after substituting μ. + auto [numExp, dens] = + substituteMuInTerm(numParams, gf.getNumerators()[i], ds[i], mu); + // Now the numerator is (s+1)^numExp + // and the denominator is \prod_j (1 - (s+1)^dens[j]). + + // Step (2) We need to express the terms in the function as quotients of + // polynomials. Each term is now of the form + // sign_i * (s+1)^numExp / (\prod_j (1 - (s+1)^dens[j])) + // For the i'th term, we first normalize the denominator to have only + // positive exponents. We convert all the dens[j] to their + // absolute values and change the sign and exponent in the numerator. + normalizeDenominatorExponents(sign, numExp, dens); + + // Then, using the formula for geometric series, we replace each (1 - + // (s+1)^(dens[j])) with + // (-s)(\sum_{0 ≤ k < dens[j]} (s+1)^k). + for (unsigned j = 0, e = dens.size(); j < e; ++j) + dens[j] = abs(dens[j]) - 1; + // Note that at this point, the semantics of `dens[j]` changes to mean + // a term (\sum_{0 ≤ k ≤ dens[j]} (s+1)^k). The denominator is, as before, + // a product of these terms. + + // Since the -s are taken out, the sign changes if there is an odd number + // of such terms. 
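+    // (Editorial sketch of the identity used above: 1 - (s+1)^3
+    // = -s((s+1)^2 + (s+1) + 1), so dens[j] = 3 becomes 3 - 1 = 2, the
+    // highest power of (s+1) in the geometric-series factor.)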
+ unsigned r = dens.size(); + if (dens.size() % 2 == 1) + sign = -sign; + + // Thus the term overall now has the form + // sign'_i * (s+1)^numExp / + // (s^r * \prod_j (\sum_{0 ≤ k < dens[j]} (s+1)^k)). + // This means that + // the numerator is a polynomial in s, with coefficients as + // quasipolynomials (given by binomial coefficients), and the denominator + // is a polynomial in s, with integral coefficients (given by taking the + // convolution over all j). + + // Step (3) We need to find the constant term in the expansion of each + // term. Since each term has s^r as a factor in the denominator, we avoid + // substituting s = 0 directly; instead, we find the coefficient of s^r in + // sign'_i * (s+1)^numExp / (\prod_j (\sum_k (s+1)^k)), + // Letting P(s) = (s+1)^numExp and Q(s) = \prod_j (...), + // we need to find the coefficient of s^r in P(s)/Q(s), + // for which we use the `getCoefficientInRationalFunction()` function. + + // First, we compute the coefficients of P(s), which are binomial + // coefficients. + // We only need the first r+1 of these, as higher-order terms do not + // contribute to the coefficient of s^r. + std::vector numeratorCoefficients = + getBinomialCoefficients(numExp, r); + + // Then we compute the coefficients of each individual term in Q(s), + // which are (dens[i]+1) C (k+1) for 0 ≤ k ≤ dens[i]. + std::vector> eachTermDenCoefficients; + std::vector singleTermDenCoefficients; + eachTermDenCoefficients.reserve(r); + for (const Fraction &den : dens) { + singleTermDenCoefficients = getBinomialCoefficients(den + 1, den + 1); + eachTermDenCoefficients.push_back( + ArrayRef(singleTermDenCoefficients).slice(1)); + } + + // Now we find the coefficients in Q(s) itself + // by taking the convolution of the coefficients + // of all the terms. + std::vector denominatorCoefficients; + denominatorCoefficients = eachTermDenCoefficients[0]; + for (unsigned j = 1, e = eachTermDenCoefficients.size(); j < e; ++j) + denominatorCoefficients = multiplyPolynomials(denominatorCoefficients, + eachTermDenCoefficients[j]); + + totalTerm = + totalTerm + getCoefficientInRationalFunction(r, numeratorCoefficients, + denominatorCoefficients) * + QuasiPolynomial(numParams, sign); + } + + return totalTerm.simplify(); +} \ No newline at end of file diff --git a/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp index 385d4c354be18..4fd4886d22536 100644 --- a/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp +++ b/mlir/lib/Analysis/Presburger/QuasiPolynomial.cpp @@ -97,10 +97,18 @@ QuasiPolynomial QuasiPolynomial::operator/(const Fraction x) const { return qp; } -// Removes terms which evaluate to zero from the expression. +// Removes terms which evaluate to zero from the expression and +// integrate affine functions which are constants into the +// coefficients. QuasiPolynomial QuasiPolynomial::simplify() { + Fraction newCoeff = 0; SmallVector newCoeffs({}); + + std::vector> newAffineTerm({}); std::vector>> newAffine({}); + + unsigned numParam = getNumInputs(); + for (unsigned i = 0, e = coefficients.size(); i < e; i++) { // A term is zero if its coefficient is zero, or if (coefficients[i] == Fraction(0, 1)) @@ -114,9 +122,46 @@ QuasiPolynomial QuasiPolynomial::simplify() { }); if (product_is_zero) continue; + + // Now, we know the term is nonzero. + + // We now eliminate the affine functions which are constant + // by merging them into the coefficients. 
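+    // (Editorial sketch: a term whose affine part is the constant function
+    // [0 ... 0 | 2], i.e. all parameter coefficients are zero, is folded in
+    // by multiplying the term's coefficient by the constant 2.)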
+ newAffineTerm = {}; + newCoeff = coefficients[i]; + for (ArrayRef term : affine[i]) { + bool allCoeffsZero = llvm::all_of( + term.slice(0, numParam), [](const Fraction c) { return c == 0; }); + if (allCoeffsZero) + newCoeff *= term[numParam]; + else + newAffineTerm.push_back(SmallVector(term)); + } + + newCoeffs.push_back(newCoeff); + newAffine.push_back(newAffineTerm); + } + return QuasiPolynomial(getNumInputs(), newCoeffs, newAffine); +} + +QuasiPolynomial QuasiPolynomial::collectTerms() { + SmallVector newCoeffs({}); + std::vector>> newAffine({}); + + for (unsigned i = 0, e = affine.size(); i < e; i++) { + bool alreadyPresent = false; + for (unsigned j = 0, f = newAffine.size(); j < f; j++) { + if (affine[i] == newAffine[j]) { + newCoeffs[j] += coefficients[i]; + alreadyPresent = true; + } + } + if (alreadyPresent) + continue; newCoeffs.push_back(coefficients[i]); newAffine.push_back(affine[i]); } + return QuasiPolynomial(getNumInputs(), newCoeffs, newAffine); } diff --git a/mlir/lib/Analysis/Presburger/Utils.cpp b/mlir/lib/Analysis/Presburger/Utils.cpp index 5e267d2045bc1..a8d860885ef10 100644 --- a/mlir/lib/Analysis/Presburger/Utils.cpp +++ b/mlir/lib/Analysis/Presburger/Utils.cpp @@ -537,4 +537,31 @@ Fraction presburger::dotProduct(ArrayRef a, ArrayRef b) { for (unsigned i = 0, e = a.size(); i < e; i++) sum += a[i] * b[i]; return sum; +} + +/// Find the product of two polynomials, each given by an array of +/// coefficients, by taking the convolution. +std::vector presburger::multiplyPolynomials(ArrayRef a, + ArrayRef b) { + // The length of the convolution is the sum of the lengths + // of the two sequences. We pad the shorter one with zeroes. + unsigned len = a.size() + b.size() - 1; + + // We define accessors to avoid out-of-bounds errors. + auto getCoeff = [](ArrayRef arr, unsigned i) -> Fraction { + if (i < arr.size()) + return arr[i]; + else + return 0; + }; + + std::vector convolution; + convolution.reserve(len); + for (unsigned k = 0; k < len; ++k) { + Fraction sum(0, 1); + for (unsigned l = 0; l <= k; ++l) + sum += getCoeff(a, l) * getCoeff(b, k - l); + convolution.push_back(sum); + } + return convolution; } \ No newline at end of file diff --git a/mlir/unittests/Analysis/Presburger/BarvinokTest.cpp b/mlir/unittests/Analysis/Presburger/BarvinokTest.cpp index e304f81de21f0..919aaa7a42859 100644 --- a/mlir/unittests/Analysis/Presburger/BarvinokTest.cpp +++ b/mlir/unittests/Analysis/Presburger/BarvinokTest.cpp @@ -124,3 +124,113 @@ TEST(BarvinokTest, getCoefficientInRationalFunction) { coeff = getCoefficientInRationalFunction(3, numerator, denominator); EXPECT_EQ(coeff.getConstantTerm(), Fraction(55, 64)); } + +TEST(BarvinokTest, computeNumTerms) { + // The following test is taken from + // Verdoolaege, Sven, et al. "Counting integer points in parametric + // polytopes using Barvinok's rational functions." Algorithmica 48 (2007): + // 37-66. + // It represents a right-angled triangle with right angle at the origin, + // with height and base lengths (p/2). + GeneratingFunction gf( + 1, {1, 1, 1}, + {makeFracMatrix(2, 2, {{0, Fraction(1, 2)}, {0, 0}}), + makeFracMatrix(2, 2, {{0, Fraction(1, 2)}, {0, 0}}), + makeFracMatrix(2, 2, {{0, 0}, {0, 0}})}, + {{{-1, 1}, {-1, 0}}, {{1, -1}, {0, -1}}, {{1, 0}, {0, 1}}}); + + QuasiPolynomial numPoints = computeNumTerms(gf).collectTerms(); + + // First, we make sure that all the affine functions are of the form ⌊p/2⌋. 
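+  // (Editorial note: a right triangle with both legs of length m = ⌊p/2⌋
+  // contains (m + 1)(m + 2)/2 = (1/2)m^2 + (3/2)m + 1 lattice points,
+  // which is the closed form checked below.)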
+ for (const std::vector> &term : numPoints.getAffine()) { + for (const SmallVector &aff : term) { + EXPECT_EQ(aff.size(), 2u); + EXPECT_EQ(aff[0], Fraction(1, 2)); + EXPECT_EQ(aff[1], Fraction(0, 1)); + } + } + + // Now, we can gather the like terms because we know there's only + // either ⌊p/2⌋^2, ⌊p/2⌋, or constants. + // The total coefficient of ⌊p/2⌋^2 is the sum of coefficients of all + // terms with 2 affine functions, and + // the coefficient of total ⌊p/2⌋ is the sum of coefficients of all + // terms with 1 affine function, + Fraction pSquaredCoeff = 0, pCoeff = 0, constantTerm = 0; + SmallVector coefficients = numPoints.getCoefficients(); + for (unsigned i = 0; i < numPoints.getCoefficients().size(); i++) + if (numPoints.getAffine()[i].size() == 2) + pSquaredCoeff = pSquaredCoeff + coefficients[i]; + else if (numPoints.getAffine()[i].size() == 1) + pCoeff = pCoeff + coefficients[i]; + + // We expect the answer to be (1/2)⌊p/2⌋^2 + (3/2)⌊p/2⌋ + 1. + EXPECT_EQ(pSquaredCoeff, Fraction(1, 2)); + EXPECT_EQ(pCoeff, Fraction(3, 2)); + EXPECT_EQ(numPoints.getConstantTerm(), Fraction(1, 1)); + + // The following generating function corresponds to a cuboid + // with length M (x-axis), width N (y-axis), and height P (z-axis). + // There are eight terms. + gf = GeneratingFunction( + 3, {1, 1, 1, 1, 1, 1, 1, 1}, + {makeFracMatrix(4, 3, {{0, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}), + makeFracMatrix(4, 3, {{1, 0, 0}, {0, 0, 0}, {0, 0, 0}, {0, 0, 0}}), + makeFracMatrix(4, 3, {{0, 0, 0}, {0, 1, 0}, {0, 0, 0}, {0, 0, 0}}), + makeFracMatrix(4, 3, {{0, 0, 0}, {0, 0, 0}, {0, 0, 1}, {0, 0, 0}}), + makeFracMatrix(4, 3, {{1, 0, 0}, {0, 1, 0}, {0, 0, 0}, {0, 0, 0}}), + makeFracMatrix(4, 3, {{1, 0, 0}, {0, 0, 0}, {0, 0, 1}, {0, 0, 0}}), + makeFracMatrix(4, 3, {{0, 0, 0}, {0, 1, 0}, {0, 0, 1}, {0, 0, 0}}), + makeFracMatrix(4, 3, {{1, 0, 0}, {0, 1, 0}, {0, 0, 1}, {0, 0, 0}})}, + {{{1, 0, 0}, {0, 1, 0}, {0, 0, 1}}, + {{-1, 0, 0}, {0, 1, 0}, {0, 0, 1}}, + {{1, 0, 0}, {0, -1, 0}, {0, 0, 1}}, + {{1, 0, 0}, {0, 1, 0}, {0, 0, -1}}, + {{-1, 0, 0}, {0, -1, 0}, {0, 0, 1}}, + {{-1, 0, 0}, {0, 1, 0}, {0, 0, -1}}, + {{1, 0, 0}, {0, -1, 0}, {0, 0, -1}}, + {{-1, 0, 0}, {0, -1, 0}, {0, 0, -1}}}); + + numPoints = computeNumTerms(gf); + numPoints = numPoints.collectTerms().simplify(); + + // First, we make sure all the affine functions are either + // M, N, P, or 1. + for (const std::vector> &term : numPoints.getAffine()) { + for (const SmallVector &aff : term) { + // First, ensure that the coefficients are all nonnegative integers. + for (const Fraction &c : aff) { + EXPECT_TRUE(c >= 0); + EXPECT_EQ(c, c.getAsInteger()); + } + // Now, if the coefficients add up to 1, we can be sure the term is + // either M, N, P, or 1. + EXPECT_EQ(aff[0] + aff[1] + aff[2] + aff[3], 1); + } + } + + // We store the coefficients of M, N and P in this array. + Fraction count[2][2][2]; + coefficients = numPoints.getCoefficients(); + for (unsigned i = 0, e = coefficients.size(); i < e; i++) { + unsigned mIndex = 0, nIndex = 0, pIndex = 0; + for (const SmallVector &aff : numPoints.getAffine()[i]) { + if (aff[0] == 1) + mIndex = 1; + if (aff[1] == 1) + nIndex = 1; + if (aff[2] == 1) + pIndex = 1; + EXPECT_EQ(aff[3], 0); + } + count[mIndex][nIndex][pIndex] += coefficients[i]; + } + + // We expect the answer to be + // (⌊M⌋ + 1)(⌊N⌋ + 1)(⌊P⌋ + 1) = + // ⌊M⌋⌊N⌋⌊P⌋ + ⌊M⌋⌊N⌋ + ⌊N⌋⌊P⌋ + ⌊M⌋⌊P⌋ + ⌊M⌋ + ⌊N⌋ + ⌊P⌋ + 1. 
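+  // (Editorial note: each of the eight monomials in this expansion has
+  // coefficient 1, so every entry of count[i][j][k] is expected to be 1.)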
+ for (unsigned i = 0; i < 2; i++) + for (unsigned j = 0; j < 2; j++) + for (unsigned k = 0; k < 2; k++) + EXPECT_EQ(count[i][j][k], 1); +} \ No newline at end of file diff --git a/mlir/unittests/Analysis/Presburger/UtilsTest.cpp b/mlir/unittests/Analysis/Presburger/UtilsTest.cpp index 7c1646aa84116..f09a1a760ce60 100644 --- a/mlir/unittests/Analysis/Presburger/UtilsTest.cpp +++ b/mlir/unittests/Analysis/Presburger/UtilsTest.cpp @@ -66,3 +66,22 @@ TEST(UtilsTest, DivisionReprNormalizeTest) { checkEqual(a, b); checkEqual(c, d); } + +TEST(UtilsTest, convolution) { + std::vector aVals({1, 2, 3, 4}); + std::vector bVals({7, 3, 1, 6}); + ArrayRef a(aVals); + ArrayRef b(bVals); + + std::vector conv = multiplyPolynomials(a, b); + + EXPECT_EQ(conv, std::vector({7, 17, 28, 45, 27, 22, 24})); + + aVals = {3, 6, 0, 2, 5}; + bVals = {2, 0, 6}; + a = aVals; + b = bVals; + + conv = multiplyPolynomials(a, b); + EXPECT_EQ(conv, std::vector({6, 12, 18, 40, 10, 12, 30})); +} From 0845514d1a78ca04ef90b775d8819a8a8f19a533 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Mon, 22 Jan 2024 09:58:09 +0100 Subject: [PATCH 349/843] [clang][analyzer] Add function 'fscanf' to StreamChecker. (#78180) --- .../StaticAnalyzer/Checkers/StreamChecker.cpp | 69 +++++++++++++++++++ clang/test/Analysis/stream-error.c | 25 +++++++ clang/test/Analysis/stream.c | 6 ++ 3 files changed, 100 insertions(+) diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index 95c7503e49e0d..84923732fe1b2 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -266,6 +266,9 @@ class StreamChecker : public Checker(Call.getOriginExpr()); + if (!CE) + return; + + const StreamState *OldSS = State->get(StreamSym); + if (!OldSS) + return; + + assertStreamStateOpened(OldSS); + + SValBuilder &SVB = C.getSValBuilder(); + ASTContext &ACtx = C.getASTContext(); + + // Add the success state. + // In this context "success" means there is not an EOF or other read error + // before any item is matched in 'fscanf'. But there may be match failure, + // therefore return value can be 0 or greater. + // It is not specified what happens if some items (not all) are matched and + // then EOF or read error happens. Now this case is handled like a "success" + // case, and no error flags are set on the stream. This is probably not + // accurate, and the POSIX documentation does not tell more. + if (OldSS->ErrorState != ErrorFEof) { + NonLoc RetVal = makeRetVal(C, CE).castAs(); + ProgramStateRef StateNotFailed = + State->BindExpr(CE, C.getLocationContext(), RetVal); + auto RetGeZero = + SVB.evalBinOp(StateNotFailed, BO_GE, RetVal, + SVB.makeZeroVal(ACtx.IntTy), SVB.getConditionType()) + .getAs(); + if (!RetGeZero) + return; + StateNotFailed = StateNotFailed->assume(*RetGeZero, true); + + C.addTransition(StateNotFailed); + } + + // Add transition for the failed state. + // Error occurs if nothing is matched yet and reading the input fails. + // Error can be EOF, or other error. At "other error" FERROR or 'errno' can + // be set but it is not further specified if all are required to be set. + // Documentation does not mention, but file position will be set to + // indeterminate similarly as at 'fread'. + ProgramStateRef StateFailed = bindInt(*EofVal, State, C, CE); + StreamErrorState NewES = (OldSS->ErrorState == ErrorFEof) + ? 
ErrorFEof + : ErrorNone | ErrorFEof | ErrorFError; + StreamState NewSS = StreamState::getOpened(Desc, NewES, !NewES.isFEof()); + StateFailed = StateFailed->set(StreamSym, NewSS); + if (OldSS->ErrorState != ErrorFEof) + C.addTransition(StateFailed, constructSetEofNoteTag(C, StreamSym)); + else + C.addTransition(StateFailed); +} + void StreamChecker::evalUngetc(const FnDescription *Desc, const CallEvent &Call, CheckerContext &C) const { ProgramStateRef State = C.getState(); diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c index 0f7fdddc0dd4c..2cf46e1d4ad51 100644 --- a/clang/test/Analysis/stream-error.c +++ b/clang/test/Analysis/stream-error.c @@ -208,6 +208,31 @@ void error_fprintf(void) { fprintf(F, "ccc"); // expected-warning {{Stream might be already closed}} } +void error_fscanf(int *A) { + FILE *F = tmpfile(); + if (!F) + return; + int Ret = fscanf(F, "a%ib", A); + if (Ret >= 0) { + clang_analyzer_eval(feof(F) || ferror(F)); // expected-warning {{FALSE}} + fscanf(F, "bbb"); // no-warning + } else { + if (ferror(F)) { + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + fscanf(F, "bbb"); // expected-warning {{might be 'indeterminate'}} + } else if (feof(F)) { + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + fscanf(F, "bbb"); // expected-warning {{is in EOF state}} + clang_analyzer_eval(feof(F)); // expected-warning {{TRUE}} + } else { + clang_analyzer_warnIfReached(); // expected-warning {{REACHABLE}} + fscanf(F, "bbb"); // expected-warning {{might be 'indeterminate'}} + } + } + fclose(F); + fscanf(F, "ccc"); // expected-warning {{Stream might be already closed}} +} + void error_ungetc() { FILE *F = tmpfile(); if (!F) diff --git a/clang/test/Analysis/stream.c b/clang/test/Analysis/stream.c index e8f06922bdb2f..36a9b4e26b07a 100644 --- a/clang/test/Analysis/stream.c +++ b/clang/test/Analysis/stream.c @@ -45,6 +45,12 @@ void check_fprintf(void) { fclose(fp); } +void check_fscanf(void) { + FILE *fp = tmpfile(); + fscanf(fp, "ABC"); // expected-warning {{Stream pointer might be NULL}} + fclose(fp); +} + void check_ungetc(void) { FILE *fp = tmpfile(); ungetc('A', fp); // expected-warning {{Stream pointer might be NULL}} From 6c47419703acfcd7dcca9e30ab9dba6a7a42f977 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 22 Jan 2024 08:57:35 +0000 Subject: [PATCH 350/843] Revert "[Clang][CMake] Support perf, LBR, and Instrument CLANG_BOLT options (#69133)" This reverts commit 745883bba69007f1d2c5135f3d5b0f1efcfc82cd. This is failing to configure on many of our bots: https://lab.llvm.org/buildbot/#/builders/245/builds/19468 This did not get caught right away because generally bots only clean the build every so often. --- clang/CMakeLists.txt | 44 +++++--------- clang/cmake/caches/BOLT.cmake | 2 +- clang/utils/perf-training/CMakeLists.txt | 29 +--------- clang/utils/perf-training/bolt.lit.cfg | 47 +++------------ .../utils/perf-training/bolt.lit.site.cfg.in | 2 - clang/utils/perf-training/perf-helper.py | 58 ------------------- 6 files changed, 26 insertions(+), 156 deletions(-) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index b9c193f360bc4..5f2b7f064da43 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -876,38 +876,23 @@ if (CLANG_ENABLE_BOOTSTRAP) endforeach() endif() -set(CLANG_BOLT "INSTRUMENT" CACHE STRING "Apply BOLT optimization to Clang. 
\ - May be specified as Instrument or Perf or LBR to use a particular profiling \ - mechanism.") -string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) - -if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) +if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) - set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED}) + set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst) set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata) - # Pass extra flag in no-LBR mode - if (CLANG_BOLT STREQUAL "PERF") - set(BOLT_NO_LBR "-nl") - endif() - - if (CLANG_BOLT STREQUAL "INSTRUMENT") - # Instrument clang with BOLT - add_custom_target(clang-instrumented - DEPENDS ${CLANG_INSTRUMENTED} - ) - add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} - DEPENDS clang llvm-bolt - COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} - -instrument --instrumentation-file-append-pid - --instrumentation-file=${BOLT_FDATA} - COMMENT "Instrumenting clang binary with BOLT" - VERBATIM - ) - add_custom_target(clang-bolt-training-deps DEPENDS clang-instrumented) - else() # perf or LBR - add_custom_target(clang-bolt-training-deps DEPENDS clang) - endif() + # Instrument clang with BOLT + add_custom_target(clang-instrumented + DEPENDS ${CLANG_INSTRUMENTED} + ) + add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} + DEPENDS clang llvm-bolt + COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} + -instrument --instrumentation-file-append-pid + --instrumentation-file=${BOLT_FDATA} + COMMENT "Instrumenting clang binary with BOLT" + VERBATIM + ) # Optimize original (pre-bolt) Clang using the collected profile set(CLANG_OPTIMIZED ${CMAKE_CURRENT_BINARY_DIR}/clang.bolt) @@ -921,7 +906,6 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) -data ${BOLT_FDATA} -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack - ${BOLT_NO_LBR} COMMAND ${CMAKE_COMMAND} -E rename ${CLANG_OPTIMIZED} $ COMMENT "Optimizing Clang with BOLT" VERBATIM diff --git a/clang/cmake/caches/BOLT.cmake b/clang/cmake/caches/BOLT.cmake index eba2346b2f4ca..0442f73e5426a 100644 --- a/clang/cmake/caches/BOLT.cmake +++ b/clang/cmake/caches/BOLT.cmake @@ -1,5 +1,5 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING "") -set(CLANG_BOLT "INSTRUMENT" CACHE STRING "") +set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "") set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "") set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "") diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index 601f40902fa34..c6d51863fb1b5 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -62,9 +62,7 @@ if(APPLE AND DTRACE AND NOT LLVM_TOOL_LLVM_DRIVER_BUILD) DEPENDS generate-dtrace-logs) endif() -if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) - set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING - "Name of BOLT-instrumented Clang binary") +if(CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/bolt.lit.site.cfg.in ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/lit.site.cfg @@ -73,37 +71,16 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) add_lit_testsuite(generate-bolt-fdata "Generating BOLT profile for Clang" ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/ EXCLUDE_FROM_CHECK_ALL - DEPENDS clang-bolt-training-deps clear-bolt-fdata clear-perf-data + DEPENDS clang-instrumented clear-bolt-fdata ) 
add_custom_target(clear-bolt-fdata COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} fdata COMMENT "Clearing old BOLT fdata") - add_custom_target(clear-perf-data - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data - COMMENT "Clearing old perf data") - - string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) - if (CLANG_BOLT STREQUAL "LBR") - set(BOLT_LBR "--lbr") - endif() - - add_custom_target(merge-fdata-deps) - if (CLANG_BOLT STREQUAL "INSTRUMENT") - add_dependencies(merge-fdata-deps generate-bolt-fdata) - else() - # Convert perf profiles into fdata - add_custom_target(convert-perf-fdata - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py perf2bolt $ ${CMAKE_CURRENT_BINARY_DIR} $ ${BOLT_LBR} - COMMENT "Converting perf files to BOLT fdata" - DEPENDS llvm-bolt generate-bolt-fdata) - add_dependencies(merge-fdata-deps convert-perf-fdata) - endif() - # Merge profiles into one using merge-fdata add_custom_target(clang-bolt-profile COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge-fdata $ ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Merging BOLT fdata" - DEPENDS merge-fdata merge-fdata-deps) + DEPENDS merge-fdata generate-bolt-fdata) endif() diff --git a/clang/utils/perf-training/bolt.lit.cfg b/clang/utils/perf-training/bolt.lit.cfg index 0e81a5501e9fc..234ac855bd67c 100644 --- a/clang/utils/perf-training/bolt.lit.cfg +++ b/clang/utils/perf-training/bolt.lit.cfg @@ -6,46 +6,15 @@ import lit.util import os import subprocess -clang_bolt_mode = config.clang_bolt_mode.lower() -clang_binary = "clang" -perf_wrapper = f"{config.python_exe} {config.perf_helper_dir}/perf-helper.py perf " +config.clang = os.path.realpath(lit.util.which('clang-bolt.inst', config.clang_tools_dir)).replace('\\', '/') -if clang_bolt_mode == "instrument": - perf_wrapper = "" - clang_binary = config.clang_bolt_name -elif clang_bolt_mode == "lbr": - perf_wrapper += " --lbr -- " -elif clang_bolt_mode == "perf": - perf_wrapper += " -- " -else: - assert 0, "Unsupported CLANG_BOLT_MODE variable" - -config.clang = perf_wrapper + os.path.realpath( - lit.util.which(clang_binary, config.clang_tools_dir) -).replace("\\", "/") - -config.name = "Clang Perf Training" -config.suffixes = [ - ".c", - ".cc", - ".cpp", - ".m", - ".mm", - ".cu", - ".ll", - ".cl", - ".s", - ".S", - ".modulemap", - ".test", -] +config.name = 'Clang Perf Training' +config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test'] use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL") config.test_format = lit.formats.ShTest(use_lit_shell == "0") -config.substitutions.append( - ("%clang_cpp_skip_driver", f" {config.clang} --driver-mode=g++ ") -) -config.substitutions.append(("%clang_cpp", f" {config.clang} --driver-mode=g++ ")) -config.substitutions.append(("%clang_skip_driver", config.clang)) -config.substitutions.append(("%clang", config.clang)) -config.substitutions.append(("%test_root", config.test_exec_root)) +config.substitutions.append( ('%clang_cpp_skip_driver', ' %s --driver-mode=g++ ' % (config.clang))) +config.substitutions.append( ('%clang_cpp', ' %s --driver-mode=g++ ' % (config.clang))) +config.substitutions.append( ('%clang_skip_driver', ' %s ' % (config.clang))) +config.substitutions.append( ('%clang', ' %s ' % (config.clang) ) ) +config.substitutions.append( ('%test_root', config.test_exec_root ) ) diff 
--git a/clang/utils/perf-training/bolt.lit.site.cfg.in b/clang/utils/perf-training/bolt.lit.site.cfg.in index 54de12701c1ae..3029319673fc2 100644 --- a/clang/utils/perf-training/bolt.lit.site.cfg.in +++ b/clang/utils/perf-training/bolt.lit.site.cfg.in @@ -9,8 +9,6 @@ config.test_source_root = "@CLANG_PGO_TRAINING_DATA@" config.target_triple = "@LLVM_TARGET_TRIPLE@" config.python_exe = "@Python3_EXECUTABLE@" config.clang_obj_root = path(r"@CLANG_BINARY_DIR@") -config.clang_bolt_mode = "@CLANG_BOLT@" -config.clang_bolt_name = "@CLANG_BOLT_INSTRUMENTED@" # Let the main config do the real work. lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/bolt.lit.cfg") diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py index 959bdba5c98cc..99d6a3333b6ef 100644 --- a/clang/utils/perf-training/perf-helper.py +++ b/clang/utils/perf-training/perf-helper.py @@ -67,62 +67,6 @@ def merge_fdata(args): return 0 -def perf(args): - parser = argparse.ArgumentParser( - prog="perf-helper perf", description="perf wrapper for BOLT profile collection" - ) - parser.add_argument( - "--lbr", action="store_true", help="Use perf with branch stacks" - ) - parser.add_argument("cmd", nargs=argparse.REMAINDER, help="") - - opts = parser.parse_args(args) - cmd = opts.cmd[1:] - - perf_args = [ - "perf", - "record", - "--event=cycles:u", - "--freq=max", - "--output=%d.perf.data" % os.getpid(), - ] - if opts.lbr: - perf_args += ["--branch-filter=any,u"] - perf_args.extend(cmd) - - start_time = time.time() - subprocess.check_call(perf_args) - - elapsed = time.time() - start_time - print("... data collection took %.4fs" % elapsed) - return 0 - - -def perf2bolt(args): - parser = argparse.ArgumentParser( - prog="perf-helper perf2bolt", - description="perf2bolt conversion wrapper for perf.data files", - ) - parser.add_argument("bolt", help="Path to llvm-bolt") - parser.add_argument("path", help="Path containing perf.data files") - parser.add_argument("binary", help="Input binary") - parser.add_argument("--lbr", action="store_true", help="Use LBR perf2bolt mode") - opts = parser.parse_args(args) - - p2b_args = [ - opts.bolt, - opts.binary, - "--aggregate-only", - "--profile-format=yaml", - ] - if not opts.lbr: - p2b_args += ["-nl"] - p2b_args += ["-p"] - for filename in findFilesWithExtension(opts.path, "perf.data"): - subprocess.check_call(p2b_args + [filename, "-o", filename + ".fdata"]) - return 0 - - def dtrace(args): parser = argparse.ArgumentParser( prog="perf-helper dtrace", @@ -563,8 +507,6 @@ def genOrderFile(args): "cc1": cc1, "gen-order-file": genOrderFile, "merge-fdata": merge_fdata, - "perf": perf, - "perf2bolt": perf2bolt, } From 11d1310b57a9f2defb4d65a35b90a69020c52e46 Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Mon, 22 Jan 2024 12:06:26 +0300 Subject: [PATCH 351/843] [clang] Fix assertion failure with deleted overloaded unary operators (#78316) When emitting notes related to wrong number of arguments do not consider object argument. 
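A minimal illustration (editorial sketch, mirroring the new test below):

  struct S {
    void operator--() = delete;
    void operator--(int) = delete;
  };
  void f(S s) { --s; } // selecting the deleted operator previously hit an
                       // assertion while the candidate notes were emitted

The note for the non-viable candidate must count only the function's own
parameters, not the object argument that CreateOverloadedUnaryOp prepends
to the argument array.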
Fixes https://github.com/llvm/llvm-project/issues/78314 --- clang/docs/ReleaseNotes.rst | 2 + clang/lib/Sema/SemaOverload.cpp | 9 +++- clang/test/SemaCXX/overloaded-operator.cpp | 57 ++++++++++++++++++++-- 3 files changed, 61 insertions(+), 7 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 7c9f9ecca727a..4888ffe6f4dfc 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -837,6 +837,8 @@ Bug Fixes in This Version - Fix an issue with missing symbol definitions when the first coroutine statement appears in a discarded ``if constexpr`` branch. Fixes (`#78290 `_) +- Fixed assertion failure with deleted overloaded unary operators. + Fixes (`#78314 `_) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 697a1c0cfc404..030878899b812 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -14332,12 +14332,17 @@ Sema::CreateOverloadedUnaryOp(SourceLocation OpLoc, UnaryOperatorKind Opc, return ExprError(); case OR_Deleted: + // CreateOverloadedUnaryOp fills the first element of ArgsArray with the + // object whose method was called. Later in NoteCandidates size of ArgsArray + // is passed further and it eventually ends up compared to number of + // function candidate parameters which never includes the object parameter, + // so slice ArgsArray to make sure apples are compared to apples. CandidateSet.NoteCandidates( PartialDiagnosticAt(OpLoc, PDiag(diag::err_ovl_deleted_oper) << UnaryOperator::getOpcodeStr(Opc) << Input->getSourceRange()), - *this, OCD_AllCandidates, ArgsArray, UnaryOperator::getOpcodeStr(Opc), - OpLoc); + *this, OCD_AllCandidates, ArgsArray.drop_front(), + UnaryOperator::getOpcodeStr(Opc), OpLoc); return ExprError(); } diff --git a/clang/test/SemaCXX/overloaded-operator.cpp b/clang/test/SemaCXX/overloaded-operator.cpp index 83a7e65b43dd0..887848c29b83c 100644 --- a/clang/test/SemaCXX/overloaded-operator.cpp +++ b/clang/test/SemaCXX/overloaded-operator.cpp @@ -1,4 +1,6 @@ -// RUN: %clang_cc1 -fsyntax-only -verify -std=c++11 %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,precxx23 -std=c++11 %s +// RUN: %clang_cc1 -fsyntax-only -verify=expected,cxx23 -std=c++23 %s + class X { }; X operator+(X, X); @@ -33,7 +35,9 @@ struct A { A make_A(); -bool operator==(A&, Z&); // expected-note 3{{candidate function}} +bool operator==(A&, Z&); // expected-note 3{{candidate function}} \ + // cxx23-note 2{{candidate function}} + void h(A a, const A ac, Z z) { make_A() == z; // expected-warning{{equality comparison result unused}} @@ -68,7 +72,9 @@ struct E2 { }; // C++ [over.match.oper]p3 - enum restriction. 
-float& operator==(E1, E2); // expected-note{{candidate function}} +float& operator==(E1, E2); // expected-note{{candidate function}} \ + // cxx23-note{{candidate function}} + void enum_test(Enum1 enum1, Enum2 enum2, E1 e1, E2 e2, Enum1 next_enum1) { float &f1 = (e1 == e2); @@ -86,7 +92,8 @@ class pr5244_foo }; bool operator==(const pr5244_foo& s1, const pr5244_foo& s2); // expected-note{{candidate function}} -bool operator==(char c, const pr5244_foo& s); // expected-note{{candidate function}} +bool operator==(char c, const pr5244_foo& s); // expected-note{{candidate function}} \ + // cxx23-note{{candidate function}} enum pr5244_bar { @@ -130,7 +137,7 @@ struct SmartPtr { }; void test_smartptr(SmartPtr ptr, const SmartPtr cptr, - const volatile SmartPtr cvptr) { + const volatile SmartPtr cvptr) { // cxx23-warning {{volatile-qualified parameter type 'const volatile SmartPtr' is deprecated}} int &ir = *ptr; long &lr = *cptr; long &lr2 = *cvptr; @@ -598,3 +605,43 @@ namespace B { } void g(B::X x) { A::f(x); } } + +namespace GH78314 { + +class a { +public: + void operator--() = delete; // expected-note {{candidate function has been explicitly deleted}} \ + // expected-note {{candidate function not viable: requires 0 arguments, but 1 was provided}} + void operator--(int) = delete; // expected-note {{candidate function has been explicitly deleted}} \ + // expected-note {{candidate function not viable: requires 1 argument, but 0 were provided}} +}; + +class c { + void operator--(this c) = delete; //precxx23-error {{explicit object parameters are incompatible with C++ standards before C++2b}} \ + // expected-note {{candidate function has been explicitly deleted}} \ + // expected-note {{candidate function not viable: requires 0 non-object arguments, but 1 was provided}} + void operator--(this c, int) = delete; //precxx23-error {{explicit object parameters are incompatible with C++ standards before C++2b}} \ + // expected-note {{candidate function has been explicitly deleted}} \ + // expected-note {{candidate function not viable: requires 1 non-object argument, but 0 were provided}} +}; + +void foo() { + a aa; + --aa; // expected-error {{overload resolution selected deleted operator '--'}} + aa--; // expected-error {{overload resolution selected deleted operator '--'}} + + c cc; + --cc; // expected-error {{overload resolution selected deleted operator '--'}} + cc--; // expected-error {{overload resolution selected deleted operator '--'}} +} + +class b { + void operator++() = delete; // expected-note {{candidate function has been explicitly deleted}} + template void operator++(int) { // expected-note {{function template not viable: requires 1 argument, but 0 were provided}} + b bb; + ++bb; // expected-error {{overload resolution selected deleted operator '++'}} + } +}; + + +} From 7b925c3edb6297df6bcf87dfcfdfd645f03b5388 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Lira=20Junior?= Date: Mon, 22 Jan 2024 06:08:42 -0300 Subject: [PATCH 352/843] [lldb] refactor highlighting function for image lookup command (#76112) Follow-up to #69422. 
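A sketch of the resulting call pattern (editorial note, with placeholder
values; the names follow the diff below):

  Stream::HighlightSettings settings(pattern, ansi_prefix, ansi_suffix);
  strm.PutCStringColorHighlighted(text, settings);

so the pattern, prefix, and suffix no longer have to be threaded through
each call as three separate arguments.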
This PR puts all the highlighting settings into a single struct for easier handling Co-authored-by: Talha Tahir --- lldb/include/lldb/Core/Address.h | 10 +++-- lldb/include/lldb/Symbol/Symbol.h | 6 ++- lldb/include/lldb/Symbol/SymbolContext.h | 16 ++++---- lldb/include/lldb/Utility/Stream.h | 18 +++++++-- lldb/source/Commands/CommandObjectTarget.cpp | 38 ++++++++++-------- lldb/source/Core/Address.cpp | 26 +++++-------- lldb/source/Symbol/Symbol.cpp | 17 +++----- lldb/source/Symbol/SymbolContext.cpp | 41 +++++++------------- lldb/source/Utility/Stream.cpp | 17 ++++---- 9 files changed, 89 insertions(+), 100 deletions(-) diff --git a/lldb/include/lldb/Core/Address.h b/lldb/include/lldb/Core/Address.h index 725b5d9f91d3d..9b5874f8b1fbe 100644 --- a/lldb/include/lldb/Core/Address.h +++ b/lldb/include/lldb/Core/Address.h @@ -9,6 +9,7 @@ #ifndef LLDB_CORE_ADDRESS_H #define LLDB_CORE_ADDRESS_H +#include "lldb/Utility/Stream.h" #include "lldb/lldb-defines.h" #include "lldb/lldb-forward.h" #include "lldb/lldb-private-enumerations.h" @@ -252,10 +253,11 @@ class Address { /// in such cases. /// /// \see Address::DumpStyle - bool Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, - DumpStyle fallback_style = DumpStyleInvalid, - uint32_t addr_byte_size = UINT32_MAX, bool all_ranges = false, - llvm::StringRef pattern = "") const; + bool + Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, + DumpStyle fallback_style = DumpStyleInvalid, + uint32_t addr_byte_size = UINT32_MAX, bool all_ranges = false, + std::optional settings = std::nullopt) const; AddressClass GetAddressClass() const; diff --git a/lldb/include/lldb/Symbol/Symbol.h b/lldb/include/lldb/Symbol/Symbol.h index e6c0b495bcf28..e05c845a69f3e 100644 --- a/lldb/include/lldb/Symbol/Symbol.h +++ b/lldb/include/lldb/Symbol/Symbol.h @@ -13,6 +13,7 @@ #include "lldb/Core/Mangled.h" #include "lldb/Core/Section.h" #include "lldb/Symbol/SymbolContextScope.h" +#include "lldb/Utility/Stream.h" #include "lldb/Utility/UserID.h" #include "lldb/lldb-private.h" #include "llvm/Support/JSON.h" @@ -174,8 +175,9 @@ class Symbol : public SymbolContextScope { void SetFlags(uint32_t flags) { m_flags = flags; } - void GetDescription(Stream *s, lldb::DescriptionLevel level, Target *target, - llvm::StringRef pattern = "") const; + void GetDescription( + Stream *s, lldb::DescriptionLevel level, Target *target, + std::optional settings = std::nullopt) const; bool IsSynthetic() const { return m_is_synthetic; } diff --git a/lldb/include/lldb/Symbol/SymbolContext.h b/lldb/include/lldb/Symbol/SymbolContext.h index 26f3bac09a962..bd33a71b46cac 100644 --- a/lldb/include/lldb/Symbol/SymbolContext.h +++ b/lldb/include/lldb/Symbol/SymbolContext.h @@ -17,6 +17,7 @@ #include "lldb/Core/Mangled.h" #include "lldb/Symbol/LineEntry.h" #include "lldb/Utility/Iterable.h" +#include "lldb/Utility/Stream.h" #include "lldb/lldb-private.h" namespace lldb_private { @@ -153,11 +154,11 @@ class SymbolContext { /// /// \return /// \b true if some text was dumped, \b false otherwise. 
- bool DumpStopContext(Stream *s, ExecutionContextScope *exe_scope, - const Address &so_addr, bool show_fullpaths, - bool show_module, bool show_inlined_frames, - bool show_function_arguments, bool show_function_name, - llvm::StringRef pattern = "") const; + bool DumpStopContext( + Stream *s, ExecutionContextScope *exe_scope, const Address &so_addr, + bool show_fullpaths, bool show_module, bool show_inlined_frames, + bool show_function_arguments, bool show_function_name, + std::optional settings = std::nullopt) const; /// Get the address range contained within a symbol context. /// @@ -223,8 +224,9 @@ class SymbolContext { /// The symbol that was found, or \b nullptr if none was found. const Symbol *FindBestGlobalDataSymbol(ConstString name, Status &error); - void GetDescription(Stream *s, lldb::DescriptionLevel level, Target *target, - llvm::StringRef pattern = "") const; + void GetDescription( + Stream *s, lldb::DescriptionLevel level, Target *target, + std::optional settings = std::nullopt) const; uint32_t GetResolvedMask() const; diff --git a/lldb/include/lldb/Utility/Stream.h b/lldb/include/lldb/Utility/Stream.h index 20c55ac4597ae..37bcdc9924171 100644 --- a/lldb/include/lldb/Utility/Stream.h +++ b/lldb/include/lldb/Utility/Stream.h @@ -33,6 +33,17 @@ class Stream { /// string mode. }; + /// Struct to store information for color highlighting in the stream. + struct HighlightSettings { + llvm::StringRef pattern; ///< Regex pattern for highlighting. + llvm::StringRef prefix; ///< ANSI color code to start colorization. + llvm::StringRef suffix; ///< ANSI color code to end colorization. + + HighlightSettings(llvm::StringRef p, llvm::StringRef pre, + llvm::StringRef suf) + : pattern(p), prefix(pre), suffix(suf) {} + }; + /// Utility class for counting the bytes that were written to a stream in a /// certain time span. /// @@ -260,10 +271,9 @@ class Stream { /// The ANSI color code to end colorization. This is /// environment-dependent. - void PutCStringColorHighlighted(llvm::StringRef text, - llvm::StringRef pattern = "", - llvm::StringRef prefix = "", - llvm::StringRef suffix = ""); + void PutCStringColorHighlighted( + llvm::StringRef text, + std::optional settings = std::nullopt); /// Output and End of Line character to the stream. 
size_t EOL(); diff --git a/lldb/source/Commands/CommandObjectTarget.cpp b/lldb/source/Commands/CommandObjectTarget.cpp index bc8bc51356c8c..c3ecdb7700c25 100644 --- a/lldb/source/Commands/CommandObjectTarget.cpp +++ b/lldb/source/Commands/CommandObjectTarget.cpp @@ -53,6 +53,7 @@ #include "lldb/Utility/FileSpec.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/State.h" +#include "lldb/Utility/Stream.h" #include "lldb/Utility/StructuredData.h" #include "lldb/Utility/Timer.h" #include "lldb/lldb-enumerations.h" @@ -1531,9 +1532,10 @@ static void DumpOsoFilesTable(Stream &strm, }); } -static void DumpAddress(ExecutionContextScope *exe_scope, - const Address &so_addr, bool verbose, bool all_ranges, - Stream &strm, llvm::StringRef pattern = "") { +static void +DumpAddress(ExecutionContextScope *exe_scope, const Address &so_addr, + bool verbose, bool all_ranges, Stream &strm, + std::optional settings = std::nullopt) { strm.IndentMore(); strm.Indent(" Address: "); so_addr.Dump(&strm, exe_scope, Address::DumpStyleModuleWithFileAddress); @@ -1544,13 +1546,13 @@ static void DumpAddress(ExecutionContextScope *exe_scope, const uint32_t save_indent = strm.GetIndentLevel(); strm.SetIndentLevel(save_indent + 13); so_addr.Dump(&strm, exe_scope, Address::DumpStyleResolvedDescription, - Address::DumpStyleInvalid, UINT32_MAX, false, pattern); + Address::DumpStyleInvalid, UINT32_MAX, false, settings); strm.SetIndentLevel(save_indent); // Print out detailed address information when verbose is enabled if (verbose) { strm.EOL(); so_addr.Dump(&strm, exe_scope, Address::DumpStyleDetailedSymbolContext, - Address::DumpStyleInvalid, UINT32_MAX, all_ranges, pattern); + Address::DumpStyleInvalid, UINT32_MAX, all_ranges, settings); } strm.IndentLess(); } @@ -1615,6 +1617,9 @@ static uint32_t LookupSymbolInModule(CommandInterpreter &interpreter, DumpFullpath(strm, &module->GetFileSpec(), 0); strm.PutCString(":\n"); strm.IndentMore(); + Stream::HighlightSettings settings( + name, interpreter.GetDebugger().GetRegexMatchAnsiPrefix(), + interpreter.GetDebugger().GetRegexMatchAnsiSuffix()); for (uint32_t i = 0; i < num_matches; ++i) { Symbol *symbol = symtab->SymbolAtIndex(match_indexes[i]); if (symbol) { @@ -1622,18 +1627,18 @@ static uint32_t LookupSymbolInModule(CommandInterpreter &interpreter, DumpAddress( interpreter.GetExecutionContext().GetBestExecutionContextScope(), symbol->GetAddressRef(), verbose, all_ranges, strm, - use_color && name_is_regex ? name : nullptr); + use_color && name_is_regex + ? std::optional{settings} + : std::nullopt); strm.EOL(); } else { strm.IndentMore(); strm.Indent(" Name: "); - llvm::StringRef ansi_prefix = - interpreter.GetDebugger().GetRegexMatchAnsiPrefix(); - llvm::StringRef ansi_suffix = - interpreter.GetDebugger().GetRegexMatchAnsiSuffix(); strm.PutCStringColorHighlighted( symbol->GetDisplayName().GetStringRef(), - use_color ? name : nullptr, ansi_prefix, ansi_suffix); + use_color && name_is_regex + ? 
std::optional{settings} + : std::nullopt); strm.EOL(); strm.Indent(" Value: "); strm.Printf("0x%16.16" PRIx64 "\n", symbol->GetRawValue()); @@ -1650,10 +1655,10 @@ static uint32_t LookupSymbolInModule(CommandInterpreter &interpreter, return num_matches; } -static void DumpSymbolContextList(ExecutionContextScope *exe_scope, - Stream &strm, - const SymbolContextList &sc_list, - bool verbose, bool all_ranges) { +static void DumpSymbolContextList( + ExecutionContextScope *exe_scope, Stream &strm, + const SymbolContextList &sc_list, bool verbose, bool all_ranges, + std::optional settings = std::nullopt) { strm.IndentMore(); bool first_module = true; for (const SymbolContext &sc : sc_list) { @@ -1664,7 +1669,8 @@ static void DumpSymbolContextList(ExecutionContextScope *exe_scope, sc.GetAddressRange(eSymbolContextEverything, 0, true, range); - DumpAddress(exe_scope, range.GetBaseAddress(), verbose, all_ranges, strm); + DumpAddress(exe_scope, range.GetBaseAddress(), verbose, all_ranges, strm, + settings); first_module = false; } strm.IndentLess(); diff --git a/lldb/source/Core/Address.cpp b/lldb/source/Core/Address.cpp index 19d34db44ea55..6f5c366ab38a3 100644 --- a/lldb/source/Core/Address.cpp +++ b/lldb/source/Core/Address.cpp @@ -407,7 +407,8 @@ bool Address::GetDescription(Stream &s, Target &target, bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, DumpStyle fallback_style, uint32_t addr_size, - bool all_ranges, llvm::StringRef pattern) const { + bool all_ranges, + std::optional settings) const { // If the section was nullptr, only load address is going to work unless we // are trying to deref a pointer SectionSP section_sp(GetSection()); @@ -516,16 +517,7 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, if (symbol) { const char *symbol_name = symbol->GetName().AsCString(); if (symbol_name) { - llvm::StringRef ansi_prefix; - llvm::StringRef ansi_suffix; - if (target) { - ansi_prefix = - target->GetDebugger().GetRegexMatchAnsiPrefix(); - ansi_suffix = - target->GetDebugger().GetRegexMatchAnsiSuffix(); - } - s->PutCStringColorHighlighted(symbol_name, pattern, - ansi_prefix, ansi_suffix); + s->PutCStringColorHighlighted(symbol_name, settings); addr_t delta = file_Addr - symbol->GetAddressRef().GetFileAddress(); if (delta) @@ -653,7 +645,7 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, pointer_sc.symbol != nullptr) { s->PutCString(": "); pointer_sc.DumpStopContext(s, exe_scope, so_addr, true, false, - false, true, true, pattern); + false, true, true, settings); } } } @@ -693,13 +685,13 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, sc.DumpStopContext(s, exe_scope, *this, show_fullpaths, show_module, show_inlined_frames, show_function_arguments, show_function_name, - pattern); + settings); } else { // We found a symbol but it was in a different section so it // isn't the symbol we should be showing, just show the section // name + offset Dump(s, exe_scope, DumpStyleSectionNameOffset, DumpStyleInvalid, - UINT32_MAX, false, pattern); + UINT32_MAX, false, settings); } } } @@ -707,7 +699,7 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, } else { if (fallback_style != DumpStyleInvalid) return Dump(s, exe_scope, fallback_style, DumpStyleInvalid, addr_size, - false, pattern); + false, settings); return false; } break; @@ -728,7 +720,7 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, 
sc.symbol->GetAddressRef().GetSection() != GetSection()) sc.symbol = nullptr; } - sc.GetDescription(s, eDescriptionLevelBrief, target, pattern); + sc.GetDescription(s, eDescriptionLevelBrief, target, settings); if (sc.block) { bool can_create = true; @@ -777,7 +769,7 @@ bool Address::Dump(Stream *s, ExecutionContextScope *exe_scope, DumpStyle style, } else { if (fallback_style != DumpStyleInvalid) return Dump(s, exe_scope, fallback_style, DumpStyleInvalid, addr_size, - false, pattern); + false, settings); return false; } break; diff --git a/lldb/source/Symbol/Symbol.cpp b/lldb/source/Symbol/Symbol.cpp index 08900a3ef3491..1895f299cc06a 100644 --- a/lldb/source/Symbol/Symbol.cpp +++ b/lldb/source/Symbol/Symbol.cpp @@ -226,8 +226,9 @@ bool Symbol::IsTrampoline() const { return m_type == eSymbolTypeTrampoline; } bool Symbol::IsIndirect() const { return m_type == eSymbolTypeResolver; } -void Symbol::GetDescription(Stream *s, lldb::DescriptionLevel level, - Target *target, llvm::StringRef pattern) const { +void Symbol::GetDescription( + Stream *s, lldb::DescriptionLevel level, Target *target, + std::optional settings) const { s->Printf("id = {0x%8.8x}", m_uid); if (m_addr_range.GetBaseAddress().GetSection()) { @@ -254,22 +255,14 @@ void Symbol::GetDescription(Stream *s, lldb::DescriptionLevel level, s->Printf(", value = 0x%16.16" PRIx64, m_addr_range.GetBaseAddress().GetOffset()); } - llvm::StringRef ansi_prefix; - llvm::StringRef ansi_suffix; - if (target) { - ansi_prefix = target->GetDebugger().GetRegexMatchAnsiPrefix(); - ansi_suffix = target->GetDebugger().GetRegexMatchAnsiSuffix(); - } if (ConstString demangled = m_mangled.GetDemangledName()) { s->PutCString(", name=\""); - s->PutCStringColorHighlighted(demangled.GetStringRef(), pattern, - ansi_prefix, ansi_suffix); + s->PutCStringColorHighlighted(demangled.GetStringRef(), settings); s->PutCString("\""); } if (ConstString mangled_name = m_mangled.GetMangledName()) { s->PutCString(", mangled=\""); - s->PutCStringColorHighlighted(mangled_name.GetStringRef(), pattern, - ansi_prefix, ansi_suffix); + s->PutCStringColorHighlighted(mangled_name.GetStringRef(), settings); s->PutCString("\""); } } diff --git a/lldb/source/Symbol/SymbolContext.cpp b/lldb/source/Symbol/SymbolContext.cpp index d581c90cede7d..3c70b8d8743cf 100644 --- a/lldb/source/Symbol/SymbolContext.cpp +++ b/lldb/source/Symbol/SymbolContext.cpp @@ -24,6 +24,7 @@ #include "lldb/Target/Target.h" #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" +#include "lldb/Utility/Stream.h" #include "lldb/Utility/StreamString.h" #include "lldb/lldb-enumerations.h" @@ -68,12 +69,11 @@ void SymbolContext::Clear(bool clear_target) { variable = nullptr; } -bool SymbolContext::DumpStopContext(Stream *s, ExecutionContextScope *exe_scope, - const Address &addr, bool show_fullpaths, - bool show_module, bool show_inlined_frames, - bool show_function_arguments, - bool show_function_name, - llvm::StringRef pattern) const { +bool SymbolContext::DumpStopContext( + Stream *s, ExecutionContextScope *exe_scope, const Address &addr, + bool show_fullpaths, bool show_module, bool show_inlined_frames, + bool show_function_arguments, bool show_function_name, + std::optional settings) const { bool dumped_something = false; if (show_module && module_sp) { if (show_fullpaths) @@ -95,16 +95,8 @@ bool SymbolContext::DumpStopContext(Stream *s, ExecutionContextScope *exe_scope, name = function->GetNameNoArguments(); if (!name) name = function->GetName(); - if (name) { - llvm::StringRef ansi_prefix; - 
llvm::StringRef ansi_suffix; - if (target_sp) { - ansi_prefix = target_sp->GetDebugger().GetRegexMatchAnsiPrefix(); - ansi_suffix = target_sp->GetDebugger().GetRegexMatchAnsiSuffix(); - } - s->PutCStringColorHighlighted(name.GetStringRef(), pattern, ansi_prefix, - ansi_suffix); - } + if (name) + s->PutCStringColorHighlighted(name.GetStringRef(), settings); } if (addr.IsValid()) { @@ -172,14 +164,7 @@ bool SymbolContext::DumpStopContext(Stream *s, ExecutionContextScope *exe_scope, dumped_something = true; if (symbol->GetType() == eSymbolTypeTrampoline) s->PutCString("symbol stub for: "); - llvm::StringRef ansi_prefix; - llvm::StringRef ansi_suffix; - if (target_sp) { - ansi_prefix = target_sp->GetDebugger().GetRegexMatchAnsiPrefix(); - ansi_suffix = target_sp->GetDebugger().GetRegexMatchAnsiSuffix(); - } - s->PutCStringColorHighlighted(symbol->GetName().GetStringRef(), pattern, - ansi_prefix, ansi_suffix); + s->PutCStringColorHighlighted(symbol->GetName().GetStringRef(), settings); } if (addr.IsValid() && symbol->ValueIsAddress()) { @@ -201,9 +186,9 @@ bool SymbolContext::DumpStopContext(Stream *s, ExecutionContextScope *exe_scope, return dumped_something; } -void SymbolContext::GetDescription(Stream *s, lldb::DescriptionLevel level, - Target *target, - llvm::StringRef pattern) const { +void SymbolContext::GetDescription( + Stream *s, lldb::DescriptionLevel level, Target *target, + std::optional settings) const { if (module_sp) { s->Indent(" Module: file = \""); module_sp->GetFileSpec().Dump(s->AsRawOstream()); @@ -263,7 +248,7 @@ void SymbolContext::GetDescription(Stream *s, lldb::DescriptionLevel level, if (symbol != nullptr) { s->Indent(" Symbol: "); - symbol->GetDescription(s, level, target, pattern); + symbol->GetDescription(s, level, target, settings); s->EOL(); } diff --git a/lldb/source/Utility/Stream.cpp b/lldb/source/Utility/Stream.cpp index 62e061e9d09c0..89dce9fb0e1f7 100644 --- a/lldb/source/Utility/Stream.cpp +++ b/lldb/source/Utility/Stream.cpp @@ -72,23 +72,20 @@ size_t Stream::PutCString(llvm::StringRef str) { return bytes_written; } -void Stream::PutCStringColorHighlighted(llvm::StringRef text, - llvm::StringRef pattern, - llvm::StringRef prefix, - llvm::StringRef suffix) { - // Only apply color formatting when a pattern is present and both prefix and - // suffix are specified. In the absence of these conditions, output the text - // without color formatting. - if (pattern.empty() || (prefix.empty() && suffix.empty())) { +void Stream::PutCStringColorHighlighted( + llvm::StringRef text, std::optional pattern_info) { + // Only apply color formatting when a pattern information is specified. + // Otherwise, output the text without color formatting. + if (!pattern_info.has_value()) { PutCString(text); return; } - llvm::Regex reg_pattern(pattern); + llvm::Regex reg_pattern(pattern_info->pattern); llvm::SmallVector matches; llvm::StringRef remaining = text; std::string format_str = lldb_private::ansi::FormatAnsiTerminalCodes( - prefix.str() + "%.*s" + suffix.str()); + pattern_info->prefix.str() + "%.*s" + pattern_info->suffix.str()); while (reg_pattern.match(remaining, &matches)) { llvm::StringRef match = matches[0]; size_t match_start_pos = match.data() - remaining.data(); From 11c0dc3d4081b7739500d31332eba0760fed174c Mon Sep 17 00:00:00 2001 From: David Spickett Date: Mon, 22 Jan 2024 09:10:00 +0000 Subject: [PATCH 353/843] [GitHub][workflows] Run automation script with python3 (#78695) This means we don't have to chmod, or change permissions any other way. 
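Illustratively (editorial note, matching the workflow diffs below), steps
of the form

  chmod a+x github-automation.py
  ./github-automation.py --token ...

become

  python3 ./github-automation.py --token ...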
--- .github/workflows/issue-subscriber.yml | 3 +--
 .github/workflows/new-prs.yml | 3 +--
 .github/workflows/pr-subscriber.yml | 3 +--
 3 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/.github/workflows/issue-subscriber.yml b/.github/workflows/issue-subscriber.yml
index 111fa6d7aa67e..ef6cd0674e808 100644
--- a/.github/workflows/issue-subscriber.yml
+++ b/.github/workflows/issue-subscriber.yml
@@ -22,7 +22,6 @@ jobs:
       - name: Setup Automation Script
         working-directory: ./llvm/utils/git/
         run: |
-          chmod a+x github-automation.py
           pip install -r requirements.txt
 
       - name: Update watchers
@@ -31,7 +30,7 @@ jobs:
         env:
           LABEL_NAME: ${{ github.event.label.name }}
         run: |
-          ./github-automation.py \
+          python3 ./github-automation.py \
            --token '${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}' \
            issue-subscriber \
            --issue-number '${{ github.event.issue.number }}' \
diff --git a/.github/workflows/new-prs.yml b/.github/workflows/new-prs.yml
index a52e4e9436263..a60f82ce35d1f 100644
--- a/.github/workflows/new-prs.yml
+++ b/.github/workflows/new-prs.yml
@@ -43,13 +43,12 @@ jobs:
       - name: Setup Automation Script
         working-directory: ./llvm/utils/git/
         run: |
-          chmod a+x github-automation.py
           pip install -r requirements.txt
 
       - name: Greet Author
         working-directory: ./llvm/utils/git/
         run: |
-          ./github-automation.py \
+          python3 ./github-automation.py \
            --token '${{ secrets.GITHUB_TOKEN }}' \
            pr-greeter \
            --issue-number "${{ github.event.pull_request.number }}"
diff --git a/.github/workflows/pr-subscriber.yml b/.github/workflows/pr-subscriber.yml
index aa36e6d502c95..3952493bb698f 100644
--- a/.github/workflows/pr-subscriber.yml
+++ b/.github/workflows/pr-subscriber.yml
@@ -22,13 +22,12 @@ jobs:
       - name: Setup Automation Script
         working-directory: ./llvm/utils/git/
         run: |
-          chmod a+x github-automation.py
           pip install -r requirements.txt
 
       - name: Update watchers
         working-directory: ./llvm/utils/git/
         run: |
-          ./github-automation.py \
+          python3 ./github-automation.py \
            --token '${{ secrets.ISSUE_SUBSCRIBER_TOKEN }}' \
            pr-subscriber \
            --issue-number "${{ github.event.number }}" \

From 9f7fff7f1391ea3bec394d8251b81cea92175cca Mon Sep 17 00:00:00 2001
From: Cullen Rhodes
Date: Mon, 22 Jan 2024 09:23:11 +0000
Subject: [PATCH 354/843] [mlir][ArmSME] Add arith-to-arm-sme conversion pass (#78197)

Existing 'arith::ConstantOp' conversion and tests are moved from
VectorToArmSME. Currently only a single op is converted, but this will
grow in the future as things like in-tile add are implemented.

Also, 'createLoopOverTileSlices' is moved to ArmSME utils since it's
relevant for both conversions.
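As a concrete illustration, here is a hedged sketch of what the new pass
does to splat constants with legal SME tile types, based on the
ConstantOpToArmSMELowering pattern added in this patch; the ArmSME
assembly syntax shown is assumed from the surrounding tests and may
differ slightly in-tree:

  // Run with: mlir-opt -convert-arith-to-arm-sme -allow-unregistered-dialect
  func.func @splat_constants() {
    // A splat-zero constant collapses to a single tile-zeroing op:
    //   %zero = arm_sme.zero : vector<[4]x[4]xf32>
    %zero = arith.constant dense<0.0> : vector<[4]x[4]xf32>

    // A non-zero splat becomes a 1-D constant that is moved into every
    // tile slice by the scf.for loop built in createLoopOverTileSlices:
    //   %splat = arith.constant dense<1.0> : vector<[4]xf32>
    //   %init  = arm_sme.get_tile : vector<[4]x[4]xf32>
    //   %tile  = scf.for %slice = %c0 to %num_tile_slices step %c1
    //       iter_args(%iter = %init) -> (vector<[4]x[4]xf32>) {
    //     %new = arm_sme.move_vector_to_tile_slice %splat, %iter, %slice
    //         : vector<[4]xf32> into vector<[4]x[4]xf32>
    //     scf.yield %new : vector<[4]x[4]xf32>
    //   }
    %ones = arith.constant dense<1.0> : vector<[4]x[4]xf32>

    "test.some_use"(%zero, %ones) : (vector<[4]x[4]xf32>, vector<[4]x[4]xf32>) -> ()
    return
  }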
--- .../Conversion/ArithToArmSME/ArithToArmSME.h | 27 ++++ mlir/include/mlir/Conversion/Passes.h | 1 + mlir/include/mlir/Conversion/Passes.td | 9 ++ .../include/mlir/Dialect/ArmSME/Utils/Utils.h | 14 ++ .../ArithToArmSME/ArithToArmSME.cpp | 125 +++++++++++++++++ .../Conversion/ArithToArmSME/CMakeLists.txt | 18 +++ mlir/lib/Conversion/CMakeLists.txt | 1 + .../VectorToArmSME/VectorToArmSME.cpp | 128 ++++-------------- mlir/lib/Dialect/ArmSME/IR/Utils.cpp | 22 +++ .../ArithToArmSME/arith-to-arm-sme.mlir} | 2 +- .../Dialect/ArmSME/vector-ops-to-llvm.mlir | 2 +- .../Dialect/Linalg/CPU/ArmSME/fill-2d.mlir | 3 +- .../Linalg/CPU/ArmSME/use-too-many-tiles.mlir | 4 +- .../CPU/ArmSME/test-outerproduct-f32.mlir | 3 +- .../CPU/ArmSME/test-outerproduct-f64.mlir | 3 +- .../CPU/ArmSME/test-transfer-write-2d.mlir | 3 +- .../Dialect/Vector/CPU/ArmSME/tile_fill.mlir | 3 +- .../Dialect/Vector/CPU/ArmSME/vector-ops.mlir | 3 +- 18 files changed, 257 insertions(+), 114 deletions(-) create mode 100644 mlir/include/mlir/Conversion/ArithToArmSME/ArithToArmSME.h create mode 100644 mlir/lib/Conversion/ArithToArmSME/ArithToArmSME.cpp create mode 100644 mlir/lib/Conversion/ArithToArmSME/CMakeLists.txt rename mlir/test/{Dialect/ArmSME/arith-ops-to-sme.mlir => Conversion/ArithToArmSME/arith-to-arm-sme.mlir} (97%) diff --git a/mlir/include/mlir/Conversion/ArithToArmSME/ArithToArmSME.h b/mlir/include/mlir/Conversion/ArithToArmSME/ArithToArmSME.h new file mode 100644 index 0000000000000..012e7fb5b0af2 --- /dev/null +++ b/mlir/include/mlir/Conversion/ArithToArmSME/ArithToArmSME.h @@ -0,0 +1,27 @@ +//===- ArithToArmSME.h - Arith to ArmSME dialect conversion -----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef MLIR_CONVERSION_ARITHTOARMSME_ARITHTOARMSME_H +#define MLIR_CONVERSION_ARITHTOARMSME_ARITHTOARMSME_H + +#include + +namespace mlir { + +class RewritePatternSet; +class Pass; + +#define GEN_PASS_DECL_ARITHTOARMSMECONVERSIONPASS +#include "mlir/Conversion/Passes.h.inc" + +namespace arith { +void populateArithToArmSMEConversionPatterns(RewritePatternSet &patterns); +} // namespace arith +} // namespace mlir + +#endif // MLIR_CONVERSION_ARITHTOARMSME_ARITHTOARMSME_H diff --git a/mlir/include/mlir/Conversion/Passes.h b/mlir/include/mlir/Conversion/Passes.h index a25fd17ea923f..0bfc5064c5dd7 100644 --- a/mlir/include/mlir/Conversion/Passes.h +++ b/mlir/include/mlir/Conversion/Passes.h @@ -12,6 +12,7 @@ #include "mlir/Conversion/AMDGPUToROCDL/AMDGPUToROCDL.h" #include "mlir/Conversion/AffineToStandard/AffineToStandard.h" #include "mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h" +#include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h" #include "mlir/Conversion/ArithToLLVM/ArithToLLVM.h" #include "mlir/Conversion/ArithToSPIRV/ArithToSPIRV.h" #include "mlir/Conversion/ArmNeon2dToIntr/ArmNeon2dToIntr.h" diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td index 71be8841ca7c0..3467e042c493e 100644 --- a/mlir/include/mlir/Conversion/Passes.td +++ b/mlir/include/mlir/Conversion/Passes.td @@ -164,6 +164,15 @@ def ConvertArithToSPIRV : Pass<"convert-arith-to-spirv"> { ]; } +//===----------------------------------------------------------------------===// +// ArithToArmSME +//===----------------------------------------------------------------------===// + +def ArithToArmSMEConversionPass : Pass<"convert-arith-to-arm-sme"> { + let summary = "Convert Arith dialect to ArmSME dialect"; + let dependentDialects = ["arm_sme::ArmSMEDialect"]; +} + //===----------------------------------------------------------------------===// // ArmNeon2dToIntr //===----------------------------------------------------------------------===// diff --git a/mlir/include/mlir/Dialect/ArmSME/Utils/Utils.h b/mlir/include/mlir/Dialect/ArmSME/Utils/Utils.h index b7d90195d49d7..41702008ee48f 100644 --- a/mlir/include/mlir/Dialect/ArmSME/Utils/Utils.h +++ b/mlir/include/mlir/Dialect/ArmSME/Utils/Utils.h @@ -16,9 +16,16 @@ #define MLIR_DIALECT_ARMSME_UTILS_UTILS_H_ #include "mlir/Dialect/ArmSME/IR/ArmSMEEnums.h" +#include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/IR/BuiltinTypes.h" #include +namespace mlir { +class Location; +class PatternRewriter; +class Value; +} // namespace mlir + namespace mlir::arm_sme { constexpr unsigned MinStreamingVectorLengthInBits = 128; @@ -42,6 +49,13 @@ std::optional getSMETileType(VectorType); /// Verifies the tile ID (if set) on this tile operation is valid. LogicalResult verifyOperationHasValidTileId(Operation *); +/// Generates a for loop over ZA tile slices where the induction variable is +/// the tile slice index and each iteration yields a new tile. Loop body is +/// built via `makeLoopBody`, which returns the next tile value. 
+scf::ForOp createLoopOverTileSlices( + PatternRewriter &rewriter, Location loc, Value initTile, + std::function makeLoopBody); + } // namespace mlir::arm_sme #endif // MLIR_DIALECT_ARMSME_UTILS_UTILS_H_ diff --git a/mlir/lib/Conversion/ArithToArmSME/ArithToArmSME.cpp b/mlir/lib/Conversion/ArithToArmSME/ArithToArmSME.cpp new file mode 100644 index 0000000000000..b12aa92001ff2 --- /dev/null +++ b/mlir/lib/Conversion/ArithToArmSME/ArithToArmSME.cpp @@ -0,0 +1,125 @@ +//===- ArithToArmSME.cpp - Arith to ArmSME dialect conversion -------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "mlir/Conversion/ArithToArmSME/ArithToArmSME.h" + +#include "mlir/Dialect/Arith/IR/Arith.h" +#include "mlir/Dialect/ArmSME/IR/ArmSME.h" +#include "mlir/Dialect/ArmSME/Utils/Utils.h" +#include "mlir/Pass/Pass.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" + +namespace mlir { +#define GEN_PASS_DEF_ARITHTOARMSMECONVERSIONPASS +#include "mlir/Conversion/Passes.h.inc" +} // namespace mlir + +#define DEBUG_TYPE "arith-to-arm-sme" + +using namespace mlir; + +//===----------------------------------------------------------------------===// +// Conversion helpers +//===----------------------------------------------------------------------===// + +/// Returns true if 'val' is a splat of zero, false otherwise. +static bool isSplatZero(Type elemType, DenseElementsAttr val) { + if (llvm::isa(elemType)) + return val && val.isSplat() && val.getSplatValue().isZero(); + if (llvm::isa(elemType)) + return val && val.isSplat() && val.getSplatValue().isZero(); + return false; +} + +namespace { + +//===----------------------------------------------------------------------===// +// ConstantOp +//===----------------------------------------------------------------------===// + +/// Conversion pattern for dense arith.constant. +struct ConstantOpToArmSMELowering : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(arith::ConstantOp constantOp, + PatternRewriter &rewriter) const final { + auto tileType = dyn_cast(constantOp.getType()); + if (!tileType || !arm_sme::isValidSMETileVectorType(tileType)) + return failure(); + + auto denseAttr = dyn_cast(constantOp.getValueAttr()); + if (!denseAttr || !denseAttr.isSplat()) + return failure(); + + auto tileElementType = tileType.getElementType(); + + // Lower 'arith.constant dense<0>' to 'arm_sme.zero' op. + if (isSplatZero(tileElementType, denseAttr)) { + rewriter.replaceOpWithNewOp(constantOp, tileType); + return success(); + } + + // Lower non-zero constants to a loop of 'arm_sme.move_vector_to_tile_slice' + // ops that broadcast the constant to each tile slice. + auto loc = constantOp.getLoc(); + + // To fill a tile with a constant, we create a 1-D splat of the constant, + // then move that into each tile slice (the largest unit we can set at once, + // outside of operations like the outerproduct). 
+ VectorType tileSliceType = VectorType::Builder(tileType).dropDim(0); + auto denseAttr1D = DenseElementsAttr::get( + tileSliceType, denseAttr.getSplatValue()); + auto constantOp1D = rewriter.create(loc, denseAttr1D); + + auto initTile = rewriter.create(loc, tileType); + auto makeLoopBody = [&](OpBuilder &b, Location loc, Value tileSliceIndex, + Value currentTile) { + // Create 'arm_sme.move_vector_to_tile_slice' to write vector to tile + // slice. + auto nextTile = b.create( + loc, tileType, constantOp1D, currentTile, tileSliceIndex); + return nextTile.getResult(); + }; + auto forOp = mlir::arm_sme::createLoopOverTileSlices( + rewriter, loc, initTile, makeLoopBody); + rewriter.replaceOp(constantOp, forOp.getResult(0)); + + return success(); + } +}; + +} // namespace + +//===----------------------------------------------------------------------===// +// Pattern population +//===----------------------------------------------------------------------===// + +void mlir::arith::populateArithToArmSMEConversionPatterns( + RewritePatternSet &patterns) { + patterns.add(patterns.getContext()); +} + +//===----------------------------------------------------------------------===// +// Pass definition +//===----------------------------------------------------------------------===// + +namespace { +struct ArithToArmSMEConversionPass final + : impl::ArithToArmSMEConversionPassBase { + using impl::ArithToArmSMEConversionPassBase< + ArithToArmSMEConversionPass>::ArithToArmSMEConversionPassBase; + + void runOnOperation() override { + RewritePatternSet patterns(&getContext()); + arith::populateArithToArmSMEConversionPatterns(patterns); + if (failed( + applyPatternsAndFoldGreedily(getOperation(), std::move(patterns)))) + return signalPassFailure(); + } +}; +} // namespace diff --git a/mlir/lib/Conversion/ArithToArmSME/CMakeLists.txt b/mlir/lib/Conversion/ArithToArmSME/CMakeLists.txt new file mode 100644 index 0000000000000..c2a6fe5398e7c --- /dev/null +++ b/mlir/lib/Conversion/ArithToArmSME/CMakeLists.txt @@ -0,0 +1,18 @@ +add_mlir_conversion_library(MLIRArithToArmSME + ArithToArmSME.cpp + + ADDITIONAL_HEADER_DIRS + ${MLIR_MAIN_INCLUDE_DIR}/mlir/Conversion/ArithToArmSME + + DEPENDS + MLIRConversionPassIncGen + + LINK_COMPONENTS + Core + + LINK_LIBS PUBLIC + MLIRArmSMEDialect + MLIRArithDialect + MLIRPass + MLIRTransforms + ) diff --git a/mlir/lib/Conversion/CMakeLists.txt b/mlir/lib/Conversion/CMakeLists.txt index c3a2481975040..3a5dbc12c23f5 100644 --- a/mlir/lib/Conversion/CMakeLists.txt +++ b/mlir/lib/Conversion/CMakeLists.txt @@ -2,6 +2,7 @@ add_subdirectory(AffineToStandard) add_subdirectory(AMDGPUToROCDL) add_subdirectory(ArithCommon) add_subdirectory(ArithToAMDGPU) +add_subdirectory(ArithToArmSME) add_subdirectory(ArithToLLVM) add_subdirectory(ArithToSPIRV) add_subdirectory(ArmNeon2dToIntr) diff --git a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp index 87d1bf9bed5a3..d8e473a562e53 100644 --- a/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp +++ b/mlir/lib/Conversion/VectorToArmSME/VectorToArmSME.cpp @@ -16,39 +16,6 @@ using namespace mlir; -/// Returns true if 'val' is a splat of zero, false otherwise. 
-static bool isSplatZero(Type elemType, DenseElementsAttr val) { - if (llvm::isa(elemType)) - return val && val.isSplat() && val.getSplatValue().isZero(); - if (llvm::isa(elemType)) - return val && val.isSplat() && val.getSplatValue().isZero(); - return false; -} - -/// Generates a for loop over ZA tile slices where the induction variable is -/// the tile slice index and each iteration yields a new tile. Loop body is -/// built via the callback, which returns the next tile value. -template -static scf::ForOp createLoopOverTileSlices(PatternRewriter &rewriter, - Location loc, Value initTile, - LoopBodyCallback callback) { - OpBuilder::InsertionGuard g(rewriter); - auto step = rewriter.create(loc, 1); - auto minTileSlices = rewriter.create( - loc, llvm::cast(initTile.getType()).getDimSize(0)); - auto vscale = - rewriter.create(loc, rewriter.getIndexType()); - auto lowerBound = rewriter.create(loc, 0); - auto numTileSlices = - rewriter.create(loc, minTileSlices, vscale); - auto forOp = rewriter.create(loc, lowerBound, numTileSlices, step, - ValueRange{initTile}); - rewriter.setInsertionPointToStart(forOp.getBody()); - auto nextTile = callback(forOp); - rewriter.create(loc, nextTile.getResult()); - return forOp; -} - namespace { /// Conversion pattern for vector.transfer_read. @@ -223,56 +190,6 @@ struct VectorStoreToArmSMELowering : public OpRewritePattern { } }; -/// Conversion pattern for dense arith.constant. -struct ConstantOpToArmSMELowering : public OpRewritePattern { - using OpRewritePattern::OpRewritePattern; - - LogicalResult matchAndRewrite(arith::ConstantOp constantOp, - PatternRewriter &rewriter) const final { - auto tileType = dyn_cast(constantOp.getType()); - if (!tileType || !arm_sme::isValidSMETileVectorType(tileType)) - return failure(); - - auto denseAttr = dyn_cast(constantOp.getValueAttr()); - if (!denseAttr || !denseAttr.isSplat()) - return failure(); - - auto tileElementType = tileType.getElementType(); - - // Lower 'arith.constant dense<0>' to 'arm_sme.zero' op. - if (isSplatZero(tileElementType, denseAttr)) { - rewriter.replaceOpWithNewOp(constantOp, tileType); - return success(); - } - - // Lower non-zero constants to a loop of 'arm_sme.move_vector_to_tile_slice' - // ops that broadcast the constant to each tile slice. - auto loc = constantOp.getLoc(); - - // To fill a tile with a constant, we create a 1-D splat of the constant, - // then move that into each tile slice (the largest unit we can set at once, - // outside of operations like the outerproduct). - VectorType tileSliceType = VectorType::Builder(tileType).dropDim(0); - auto denseAttr1D = DenseElementsAttr::get( - tileSliceType, denseAttr.getSplatValue()); - auto constantOp1D = rewriter.create(loc, denseAttr1D); - - auto initTile = rewriter.create(loc, tileType); - auto forOp = - createLoopOverTileSlices(rewriter, loc, initTile, [&](auto forOp) { - auto tileSliceIndex = forOp.getInductionVar(); - auto currentTile = forOp.getRegionIterArg(0); - // Create 'arm_sme.move_vector_to_tile_slice' to write vector to tile - // slice. - return rewriter.create( - loc, tileType, constantOp1D, currentTile, tileSliceIndex); - }); - rewriter.replaceOp(constantOp, forOp.getResult(0)); - - return success(); - } -}; - /// Conversion pattern for vector.broadcast. 
/// /// Example: @@ -322,16 +239,18 @@ struct BroadcastOpToArmSMELowering auto initTile = rewriter.create(loc, tileType); + auto makeLoopBody = [&](OpBuilder &b, Location loc, Value tileSliceIndex, + Value currentTile) { + // Create 'arm_sme.move_vector_to_tile_slice' to broadcast the value + // to each tile slice. + auto nextTile = b.create( + loc, tileType, broadcastOp1D, currentTile, tileSliceIndex); + return nextTile.getResult(); + }; + // Create a loop over ZA tile slices. auto forOp = - createLoopOverTileSlices(rewriter, loc, initTile, [&](auto forOp) { - auto tileSliceIndex = forOp.getInductionVar(); - auto currentTile = forOp.getRegionIterArg(0); - // Create 'arm_sme.move_vector_to_tile_slice' to broadcast the value - // to each tile slice. - return rewriter.create( - loc, tileType, broadcastOp1D, currentTile, tileSliceIndex); - }); + createLoopOverTileSlices(rewriter, loc, initTile, makeLoopBody); rewriter.replaceOp(broadcastOp, forOp.getResult(0)); @@ -381,15 +300,17 @@ struct SplatOpToArmSMELowering : public OpRewritePattern { auto initTile = rewriter.create(loc, tileType); + auto makeLoopBody = [&](OpBuilder &b, Location loc, Value tileSliceIndex, + Value currentTile) { + auto nextTile = b.create( + loc, tileType, broadcastOp1D, currentTile, tileSliceIndex); + return nextTile.getResult(); + }; + // Next, create a loop over ZA tile slices and "move" the generated 1-d // vector to each slice. auto forOp = - createLoopOverTileSlices(rewriter, loc, initTile, [&](auto forOp) { - auto tileSliceIndex = forOp.getInductionVar(); - auto currentTile = forOp.getRegionIterArg(0); - return rewriter.create( - loc, tileType, broadcastOp1D, currentTile, tileSliceIndex); - }); + createLoopOverTileSlices(rewriter, loc, initTile, makeLoopBody); rewriter.replaceOp(splatOp, forOp.getResult(0)); @@ -741,11 +662,10 @@ struct VectorPrintToArmSMELowering : public OpRewritePattern { void mlir::populateVectorToArmSMEPatterns(RewritePatternSet &patterns, MLIRContext &ctx) { - patterns - .add(&ctx); + patterns.add(&ctx); } diff --git a/mlir/lib/Dialect/ArmSME/IR/Utils.cpp b/mlir/lib/Dialect/ArmSME/IR/Utils.cpp index 1fa060cafc0bc..d039121055566 100644 --- a/mlir/lib/Dialect/ArmSME/IR/Utils.cpp +++ b/mlir/lib/Dialect/ArmSME/IR/Utils.cpp @@ -72,4 +72,26 @@ LogicalResult verifyOperationHasValidTileId(Operation *op) { return success(); } +scf::ForOp createLoopOverTileSlices( + PatternRewriter &rewriter, Location loc, Value initTile, + std::function makeLoopBody) { + OpBuilder::InsertionGuard g(rewriter); + auto step = rewriter.create(loc, 1); + auto minTileSlices = rewriter.create( + loc, llvm::cast(initTile.getType()).getDimSize(0)); + auto vscale = + rewriter.create(loc, rewriter.getIndexType()); + auto lowerBound = rewriter.create(loc, 0); + auto numTileSlices = + rewriter.create(loc, minTileSlices, vscale); + auto forOp = rewriter.create(loc, lowerBound, numTileSlices, step, + ValueRange{initTile}); + rewriter.setInsertionPointToStart(forOp.getBody()); + Value nextTile = + makeLoopBody(rewriter, loc, /*tileSliceIndex=*/forOp.getInductionVar(), + /*currentTile=*/forOp.getRegionIterArg(0)); + rewriter.create(loc, nextTile); + return forOp; +} + } // namespace mlir::arm_sme diff --git a/mlir/test/Dialect/ArmSME/arith-ops-to-sme.mlir b/mlir/test/Conversion/ArithToArmSME/arith-to-arm-sme.mlir similarity index 97% rename from mlir/test/Dialect/ArmSME/arith-ops-to-sme.mlir rename to mlir/test/Conversion/ArithToArmSME/arith-to-arm-sme.mlir index e51f2485dadbc..49d2e2f3c182b 100644 --- 
a/mlir/test/Dialect/ArmSME/arith-ops-to-sme.mlir +++ b/mlir/test/Conversion/ArithToArmSME/arith-to-arm-sme.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-vector-to-arm-sme -split-input-file -allow-unregistered-dialect | FileCheck %s +// RUN: mlir-opt %s -convert-arith-to-arm-sme -split-input-file -allow-unregistered-dialect | FileCheck %s // ============================================================================= // arith.constant dense<0> to arm_sme.zero diff --git a/mlir/test/Dialect/ArmSME/vector-ops-to-llvm.mlir b/mlir/test/Dialect/ArmSME/vector-ops-to-llvm.mlir index ce5bfd25cbdbc..17a070999c20a 100644 --- a/mlir/test/Dialect/ArmSME/vector-ops-to-llvm.mlir +++ b/mlir/test/Dialect/ArmSME/vector-ops-to-llvm.mlir @@ -1,4 +1,4 @@ -// RUN: mlir-opt %s -convert-vector-to-arm-sme -allocate-arm-sme-tiles -convert-arm-sme-to-scf -convert-arm-sme-to-llvm -cse -canonicalize -split-input-file -allow-unregistered-dialect -verify-diagnostics | FileCheck %s +// RUN: mlir-opt %s -convert-vector-to-arm-sme -convert-arith-to-arm-sme -allocate-arm-sme-tiles -convert-arm-sme-to-scf -convert-arm-sme-to-llvm -cse -canonicalize -split-input-file -allow-unregistered-dialect -verify-diagnostics | FileCheck %s //===----------------------------------------------------------------------===// // vector.transfer_write diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir index 6314e6f279952..44ff1afe76d38 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/fill-2d.mlir @@ -4,7 +4,8 @@ // RUN: -lower-vector-mask \ // RUN: -one-shot-bufferize="bufferize-function-boundaries" \ // RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// RUN: -convert-vector-to-arm-sme -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ +// RUN: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ +// RUN: -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ // RUN: -convert-arm-sme-to-llvm -cse -canonicalize \ // RUN: -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ diff --git a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir index dd9f280cb7509..42fe21cccd48a 100644 --- a/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir +++ b/mlir/test/Integration/Dialect/Linalg/CPU/ArmSME/use-too-many-tiles.mlir @@ -1,6 +1,6 @@ // RUN: mlir-opt %s \ -// RUN: -convert-vector-to-arm-sme -allocate-arm-sme-tiles \ -// RUN: -convert-vector-to-arm-sme -convert-arm-sme-to-scf \ +// RUN: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ +// RUN: -allocate-arm-sme-tiles -convert-arm-sme-to-scf \ // RUN: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ // RUN: -convert-vector-to-scf -cse -arm-sve-legalize-vector-storage \ // RUN: -convert-arm-sme-to-llvm -convert-vector-to-llvm=enable-arm-sve -cse \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir index 8c73c24d695cf..5f41b37560e76 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f32.mlir @@ -1,7 +1,8 @@ // DEFINE: %{entry_point} = test_outerproduct_no_accumulator_4x4xf32 // DEFINE: %{compile} = mlir-opt %s \ // DEFINE: 
-enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ +// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ +// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ // DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ // DEFINE: -test-lower-to-llvm -o %t // DEFINE: %{run} = %mcr_aarch64_cmd %t \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir index 965337c60b9ff..a1bb9b7d6f80e 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-outerproduct-f64.mlir @@ -1,7 +1,8 @@ // DEFINE: %{entry_point} = test_outerproduct_no_accumulator_2x2xf64 // DEFINE: %{compile} = mlir-opt %s \ // DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ +// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ +// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ // DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ // DEFINE: -test-lower-to-llvm -o %t // DEFINE: %{run} = %mcr_aarch64_cmd %t \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir index cb30fee4e12d7..c0c1f55d7ddd1 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/test-transfer-write-2d.mlir @@ -1,6 +1,7 @@ // DEFINE: %{entry_point} = entry // DEFINE: %{compile} = mlir-opt %s \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ +// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ +// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ // DEFINE: -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za only-if-required-by-ops" \ // DEFINE: -convert-arm-sme-to-llvm -cse -canonicalize \ // DEFINE: -test-lower-to-llvm diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir index b45f24f6c8fdd..223bc8ce74343 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/tile_fill.mlir @@ -1,5 +1,6 @@ // RUN: mlir-opt %s -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// RUN: -convert-vector-to-arm-sme -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ +// RUN: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \ +// RUN: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \ // RUN: -convert-arm-sme-to-llvm -cse -canonicalize \ // RUN: -test-lower-to-llvm | \ // RUN: %mcr_aarch64_cmd \ diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir index 073c08bff1c41..f28bf19b29993 100644 --- a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/vector-ops.mlir @@ -1,6 +1,7 @@ // DEFINE: %{entry_point} = entry // DEFINE: %{compile} = mlir-opt %s -enable-arm-streaming="streaming-mode=streaming-locally za-mode=new-za" \ -// DEFINE: -convert-vector-to-arm-sme -convert-arm-sme-to-scf 
-allocate-arm-sme-tiles \
+// DEFINE: -convert-vector-to-arm-sme -convert-arith-to-arm-sme \
+// DEFINE: -convert-arm-sme-to-scf -allocate-arm-sme-tiles \
 // DEFINE: -convert-arm-sme-to-llvm -test-lower-to-llvm
 // DEFINE: %{run} = %mcr_aarch64_cmd \
 // DEFINE: -march=aarch64 -mattr=+sve,+sme \

From e280c287e42065736d9c343306603ea07430a82b Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Mon, 22 Jan 2024 09:28:13 +0000
Subject: [PATCH 355/843] [mlir] Add `mlir_arm_runner_utils` library for use
 in integration tests (#78583)

This adds a new `mlir_arm_runner_utils` library that contains utils
specific to Arm/AArch64. This is for use in MLIR integration tests.

This initial patch adds `setArmVLBits()` and `setArmSVLBits()`. This
allows changing the vector length or streaming vector length at runtime
(or setting it to a known minimum, i.e. 128 bits).

---
 mlir/lib/ExecutionEngine/ArmRunnerUtils.cpp | 69 ++++++++++++++++
 mlir/lib/ExecutionEngine/CMakeLists.txt | 5 ++
 mlir/test/CMakeLists.txt | 4 +
 .../Vector/CPU/ArmSME/Emulated/lit.local.cfg | 5 ++
 .../ArmSME/Emulated/test-setArmSVLBits.mlir | 78 +++++++++++++++++++
 .../Vector/CPU/ArmSVE/Emulated/lit.local.cfg | 5 ++
 .../ArmSVE/Emulated/test-setArmVLBits.mlir | 51 ++++++++++++
 mlir/test/lit.cfg.py | 3 +
 8 files changed, 220 insertions(+)
 create mode 100644 mlir/lib/ExecutionEngine/ArmRunnerUtils.cpp
 create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/lit.local.cfg
 create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir
 create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/lit.local.cfg
 create mode 100644 mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/test-setArmVLBits.mlir

diff --git a/mlir/lib/ExecutionEngine/ArmRunnerUtils.cpp b/mlir/lib/ExecutionEngine/ArmRunnerUtils.cpp
new file mode 100644
index 0000000000000..13ca5aa5b046d
--- /dev/null
+++ b/mlir/lib/ExecutionEngine/ArmRunnerUtils.cpp
@@ -0,0 +1,69 @@
+//===- ArmRunnerUtils.cpp - Utilities for configuring architecture properties //
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/MathExtras.h"
+#include
+#include
+#include
+
+#if (defined(_WIN32) || defined(__CYGWIN__))
+#define MLIR_ARMRUNNERUTILS_EXPORTED __declspec(dllexport)
+#else
+#define MLIR_ARMRUNNERUTILS_EXPORTED __attribute__((visibility("default")))
+#endif
+
+#ifdef __linux__
+#include <sys/prctl.h>
+#endif
+
+extern "C" {
+
+// Defines for prctl() calls. These may not necessarily exist in the host
+// <sys/prctl.h>, but will still be usable under emulation.
+//
+// https://www.kernel.org/doc/html/v5.3/arm64/sve.html#prctl-extensions
+#ifndef PR_SVE_SET_VL
+#define PR_SVE_SET_VL 50
+#endif
+// https://docs.kernel.org/arch/arm64/sme.html#prctl-extensions
+#ifndef PR_SME_SET_VL
+#define PR_SME_SET_VL 63
+#endif
+// Note: This mask is the same as both PR_SME_VL_LEN_MASK and
+// PR_SVE_VL_LEN_MASK.
+#define PR_VL_LEN_MASK 0xffff + +static void setArmVectorLength(std::string_view helper_name, int option, + uint32_t bits) { +#if defined(__linux__) && defined(__aarch64__) + if (bits < 128 || bits > 2048 || !llvm::isPowerOf2_32(bits)) { + std::cerr << "[error] Attempted to set an invalid vector length (" << bits + << "-bit)" << std::endl; + abort(); + } + uint32_t vl = bits / 8; + if (auto ret = prctl(option, vl & PR_VL_LEN_MASK); ret < 0) { + std::cerr << "[error] prctl failed (" << ret << ")" << std::endl; + abort(); + } +#else + std::cerr << "[error] " << helper_name << " is unsupported" << std::endl; + abort(); +#endif +} + +/// Sets the SVE vector length (in bits) to `bits`. +void MLIR_ARMRUNNERUTILS_EXPORTED setArmVLBits(uint32_t bits) { + setArmVectorLength(__func__, PR_SVE_SET_VL, bits); +} + +/// Sets the SME streaming vector length (in bits) to `bits`. +void MLIR_ARMRUNNERUTILS_EXPORTED setArmSVLBits(uint32_t bits) { + setArmVectorLength(__func__, PR_SME_SET_VL, bits); +} +} diff --git a/mlir/lib/ExecutionEngine/CMakeLists.txt b/mlir/lib/ExecutionEngine/CMakeLists.txt index 2f391b7698cbb..b7e448d5417ea 100644 --- a/mlir/lib/ExecutionEngine/CMakeLists.txt +++ b/mlir/lib/ExecutionEngine/CMakeLists.txt @@ -2,6 +2,7 @@ # is a big dependency which most don't need. set(LLVM_OPTIONAL_SOURCES + ArmRunnerUtils.cpp ArmSMEStubs.cpp AsyncRuntime.cpp CRunnerUtils.cpp @@ -186,6 +187,10 @@ if(LLVM_ENABLE_PIC) ArmSMEStubs.cpp) target_compile_definitions(mlir_arm_sme_abi_stubs PRIVATE mlir_arm_sme_abi_stubs_EXPORTS) + add_mlir_library(mlir_arm_runner_utils + SHARED + ArmRunnerUtils.cpp) + if(MLIR_ENABLE_CUDA_RUNNER) # Configure CUDA support. Using check_language first allows us to give a # custom error message. diff --git a/mlir/test/CMakeLists.txt b/mlir/test/CMakeLists.txt index 8ce030feeded9..6724dd4bdd1bc 100644 --- a/mlir/test/CMakeLists.txt +++ b/mlir/test/CMakeLists.txt @@ -153,6 +153,10 @@ if (MLIR_RUN_ARM_SME_TESTS AND NOT ARM_SME_ABI_ROUTINES_SHLIB) list(APPEND MLIR_TEST_DEPENDS mlir_arm_sme_abi_stubs) endif() +if (MLIR_RUN_ARM_SVE_TESTS OR MLIR_RUN_ARM_SME_TESTS) + list(APPEND MLIR_TEST_DEPENDS mlir_arm_runner_utils) +endif() + list(APPEND MLIR_TEST_DEPENDS MLIRUnitTests) if(LLVM_BUILD_EXAMPLES) diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/lit.local.cfg b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/lit.local.cfg new file mode 100644 index 0000000000000..0d8ad605f598f --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/lit.local.cfg @@ -0,0 +1,5 @@ +# The tests in this folder assume full control of the hardware features, such as +# the vector length, so must be run under an emulator. 
+ +if not config.arm_emulator_executable: + config.unsupported = True diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir new file mode 100644 index 0000000000000..415181171e27b --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSME/Emulated/test-setArmSVLBits.mlir @@ -0,0 +1,78 @@ +// DEFINE: %{entry_point} = main +// DEFINE: %{compile} = mlir-opt %s -convert-arm-sme-to-llvm -test-lower-to-llvm +// DEFINE: %{run} = %mcr_aarch64_cmd \ +// DEFINE: -march=aarch64 -mattr=+sve,+sme \ +// DEFINE: -e %{entry_point} -entry-point-result=void \ +// DEFINE: -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%mlir_arm_runner_utils + +// RUN: %{compile} | %{run} | FileCheck %s + +func.func @checkSVL() { + %svl_b = arm_sme.streaming_vl + %svl_h = arm_sme.streaming_vl + %svl_w = arm_sme.streaming_vl + %svl_d = arm_sme.streaming_vl + vector.print str "SVL.b" + vector.print %svl_b : index + vector.print str "SVL.h" + vector.print %svl_h : index + vector.print str "SVL.w" + vector.print %svl_w : index + vector.print str "SVL.d" + vector.print %svl_d : index + return +} + +func.func @setAndCheckSVL(%bits: i32) { + func.call @setArmSVLBits(%bits) : (i32) -> () + func.call @checkSVL() : () -> () + return +} + +func.func @main() { + // CHECK: SVL.b + // CHECK-NEXT: 16 + // + // CHECK-NEXT: SVL.h + // CHECK-NEXT: 8 + // + // CHECK-NEXT: SVL.w + // CHECK-NEXT: 4 + // + // CHECK-NEXT: SVL.d + // CHECK-NEXT: 2 + %c128 = arith.constant 128 : i32 + func.call @setAndCheckSVL(%c128) : (i32) -> () + + // CHECK: SVL.b + // CHECK-NEXT: 32 + // + // CHECK-NEXT: SVL.h + // CHECK-NEXT: 16 + // + // CHECK-NEXT: SVL.w + // CHECK-NEXT: 8 + // + // CHECK-NEXT: SVL.d + // CHECK-NEXT: 4 + %c256 = arith.constant 256 : i32 + func.call @setAndCheckSVL(%c256) : (i32) -> () + + // CHECK: SVL.b + // CHECK-NEXT: 64 + // + // CHECK-NEXT: SVL.h + // CHECK-NEXT: 32 + // + // CHECK-NEXT: SVL.w + // CHECK-NEXT: 16 + // + // CHECK-NEXT: SVL.d + // CHECK-NEXT: 8 + %c512 = arith.constant 512 : i32 + func.call @setAndCheckSVL(%c512) : (i32) -> () + + return +} + +func.func private @setArmSVLBits(%bits : i32) diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/lit.local.cfg b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/lit.local.cfg new file mode 100644 index 0000000000000..0d8ad605f598f --- /dev/null +++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/lit.local.cfg @@ -0,0 +1,5 @@ +# The tests in this folder assume full control of the hardware features, such as +# the vector length, so must be run under an emulator. 
+
+if not config.arm_emulator_executable:
+    config.unsupported = True
diff --git a/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/test-setArmVLBits.mlir b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/test-setArmVLBits.mlir
new file mode 100644
index 0000000000000..4f46c6e1ebf6a
--- /dev/null
+++ b/mlir/test/Integration/Dialect/Vector/CPU/ArmSVE/Emulated/test-setArmVLBits.mlir
@@ -0,0 +1,51 @@
+// DEFINE: %{entry_point} = main
+// DEFINE: %{compile} = mlir-opt %s -test-lower-to-llvm
+// DEFINE: %{run} = %mcr_aarch64_cmd -march=aarch64 -mattr=+sve \
+// DEFINE:   -e %{entry_point} -entry-point-result=void \
+// DEFINE:   -shared-libs=%mlir_runner_utils,%mlir_c_runner_utils,%mlir_arm_runner_utils
+
+// RUN: %{compile} | %{run} | FileCheck %s
+
+func.func @checkVScale() {
+  %vscale = vector.vscale
+  vector.print str "vscale"
+  vector.print %vscale : index
+  return
+}
+
+func.func @setAndCheckVL(%bits: i32) {
+  func.call @setArmVLBits(%bits) : (i32) -> ()
+  func.call @checkVScale() : () -> ()
+  return
+}
+
+func.func @main() {
+  // CHECK: vscale
+  // CHECK-NEXT: 1
+  %c128 = arith.constant 128 : i32
+  func.call @setAndCheckVL(%c128) : (i32) -> ()
+
+  // CHECK: vscale
+  // CHECK-NEXT: 2
+  %c256 = arith.constant 256 : i32
+  func.call @setAndCheckVL(%c256) : (i32) -> ()
+
+  // CHECK: vscale
+  // CHECK-NEXT: 4
+  %c512 = arith.constant 512 : i32
+  func.call @setAndCheckVL(%c512) : (i32) -> ()
+
+  // CHECK: vscale
+  // CHECK-NEXT: 8
+  %c1024 = arith.constant 1024 : i32
+  func.call @setAndCheckVL(%c1024) : (i32) -> ()
+
+  // CHECK: vscale
+  // CHECK-NEXT: 16
+  %c2048 = arith.constant 2048 : i32
+  func.call @setAndCheckVL(%c2048) : (i32) -> ()
+
+  return
+}
+
+func.func private @setArmVLBits(%bits : i32)
diff --git a/mlir/test/lit.cfg.py b/mlir/test/lit.cfg.py
index 0a1ea1d16da45..38e65e4549c55 100644
--- a/mlir/test/lit.cfg.py
+++ b/mlir/test/lit.cfg.py
@@ -135,6 +135,9 @@ def add_runtime(name):
 if config.enable_sycl_runner:
     tools.extend([add_runtime("mlir_sycl_runtime")])
 
+if config.mlir_run_arm_sve_tests or config.mlir_run_arm_sme_tests:
+    tools.extend([add_runtime("mlir_arm_runner_utils")])
+
 if config.mlir_run_arm_sme_tests:
     config.substitutions.append(
         (

From ac296b696ccf3081b2fc920f860da894fb1d8eb0 Mon Sep 17 00:00:00 2001
From: Pierre van Houtryve
Date: Mon, 22 Jan 2024 10:31:37 +0100
Subject: [PATCH 356/843] [AMDGPU] Drop verify from SIMemoryLegalizer tests
 (#78697)

SIMemoryLegalizer tests were slow, with most of them taking 4.5 to 5.3s
to complete, and that's on a fast machine. I also recall seeing them in
the slowest tests list on build bots.

This removes the verify-machineinstrs option from these tests to speed
them up, bringing the slowest test down to roughly 2s. The verifier
still runs in EXPENSIVE_CHECKS builds.
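For local debugging, two standard LLVM knobs bring the verifier back; a
hedged sketch (both options exist in LLVM, but the build directory and
invocation below are illustrative assumptions):

  # Configure a build in which the machine verifier runs for these tests:
  cmake -G Ninja -DLLVM_ENABLE_EXPENSIVE_CHECKS=ON ../llvm
  # Or re-add the flag by hand for a single invocation:
  llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs \
    < llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll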
--- .../memory-legalizer-atomic-fence.ll | 14 +++++----- .../CodeGen/AMDGPU/memory-legalizer-fence.ll | 23 ++++++++-------- .../AMDGPU/memory-legalizer-flat-agent.ll | 21 +++++++-------- .../memory-legalizer-flat-nontemporal.ll | 20 +++++++------- .../memory-legalizer-flat-singlethread.ll | 21 +++++++-------- .../AMDGPU/memory-legalizer-flat-system.ll | 21 +++++++-------- .../AMDGPU/memory-legalizer-flat-volatile.ll | 12 ++++----- .../AMDGPU/memory-legalizer-flat-wavefront.ll | 21 +++++++-------- .../AMDGPU/memory-legalizer-flat-workgroup.ll | 21 +++++++-------- .../AMDGPU/memory-legalizer-global-agent.ll | 23 ++++++++-------- .../memory-legalizer-global-nontemporal.ll | 22 ++++++++-------- .../memory-legalizer-global-singlethread.ll | 23 ++++++++-------- .../AMDGPU/memory-legalizer-global-system.ll | 23 ++++++++-------- .../memory-legalizer-global-volatile.ll | 14 +++++----- .../memory-legalizer-global-wavefront.ll | 23 ++++++++-------- .../memory-legalizer-global-workgroup.ll | 23 ++++++++-------- .../memory-legalizer-invalid-syncscope.ll | 8 +++--- .../AMDGPU/memory-legalizer-local-agent.ll | 23 ++++++++-------- .../memory-legalizer-local-nontemporal.ll | 22 ++++++++-------- .../memory-legalizer-local-singlethread.ll | 23 ++++++++-------- .../AMDGPU/memory-legalizer-local-system.ll | 23 ++++++++-------- .../AMDGPU/memory-legalizer-local-volatile.ll | 14 +++++----- .../memory-legalizer-local-wavefront.ll | 23 ++++++++-------- .../memory-legalizer-local-workgroup.ll | 23 ++++++++-------- .../memory-legalizer-private-nontemporal.ll | 26 +++++++++---------- .../memory-legalizer-private-volatile.ll | 18 ++++++------- .../memory-legalizer-store-infinite-loop.ll | 2 +- 27 files changed, 257 insertions(+), 273 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll index 4cb83040bfe73..03cd8332bb3a0 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/memory-legalizer-atomic-fence.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10CU %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11WGP %s -; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck -check-prefix=GFX11CU %s +; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -global-isel -stop-after=si-memory-legalizer 
-mtriple=amdgcn-amd- -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10WGP %s +; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck -check-prefix=GFX10CU %s +; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GFX11WGP %s +; RUN: llc -global-isel -stop-after=si-memory-legalizer -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck -check-prefix=GFX11CU %s ; Note: we use MIR test checks + stop after legalizer to prevent ; tests from being optimized out. diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll index d950c24192f39..89ccf97c3f4b2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-fence.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck 
-check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @singlethread_acquire_fence() { ; GFX6-LABEL: singlethread_acquire_fence: @@ -2210,4 +2210,3 @@ entry: fence syncscope("one-as") seq_cst ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll index 6989c068794ee..d6c759f733b42 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @flat_agent_unordered_load( ; GFX7-LABEL: flat_agent_unordered_load: @@ -13140,4 +13140,3 @@ entry: store i32 %val0, ptr %out, align 4 ret void } - diff 
--git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll index 6d48e36d91b14..a59c0394bebe2 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll index 8720186ab02f3..c4c01968d5c39 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | 
FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @flat_singlethread_unordered_load( ; GFX7-LABEL: flat_singlethread_unordered_load: @@ -11615,4 +11615,3 @@ entry: store i32 %val0, ptr %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll index a3d73bbd9ff00..461fe8b1296a3 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @flat_system_unordered_load( ; GFX7-LABEL: flat_system_unordered_load: @@ -13392,4 +13392,3 @@ entry: store i32 %val0, ptr %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll index 3c4a7fb78b3ac..3c8b3aab6ad02 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -1,10 +1,10 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc 
-mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @flat_nontemporal_load_0( ; GFX7-LABEL: flat_nontemporal_load_0: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll index 350ab8d48da81..0881f3665f3ad 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @flat_wavefront_unordered_load( ; GFX7-LABEL: flat_wavefront_unordered_load: @@ -11472,4 +11472,3 @@ entry: store i32 %val0, ptr %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll index 6c071c6f0ba97..420819b12d125 100644 --- 
a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -1,14 +1,14 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @flat_workgroup_unordered_load( ; GFX7-LABEL: flat_workgroup_unordered_load: @@ -11714,4 +11714,3 @@ entry: store i32 %val0, ptr %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll index f344ebe0c3be6..0a7a07e361444 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | 
FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @global_agent_unordered_load( ; GFX6-LABEL: global_agent_unordered_load: @@ -13580,4 +13580,3 @@ entry: store i32 %val0, ptr addrspace(1) %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll index 16e336f772df3..07a3f85066991 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | 
FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @global_nontemporal_load_0( ; GFX6-LABEL: global_nontemporal_load_0: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll index 897c098c6b821..495449ed8eee1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @global_singlethread_unordered_load( ; GFX6-LABEL: global_singlethread_unordered_load: @@ -12440,4 +12440,3 @@ entry: store i32 %val0, ptr addrspace(1) %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll index 0f91d1301cca9..97b2f0813e5ee 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | 
FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @global_system_unordered_load( ; GFX6-LABEL: global_system_unordered_load: @@ -13226,4 +13226,3 @@ entry: store i32 %val0, ptr addrspace(1) %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll index dfc5217538dd8..5875420eac09a 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @global_volatile_load_0( ; GFX6-LABEL: global_volatile_load_0: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll index c1bcdedb95218..db1877af5541c 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @global_wavefront_unordered_load( ; GFX6-LABEL: global_wavefront_unordered_load: @@ -12440,4 +12440,3 @@ entry: store i32 %val0, ptr addrspace(1) %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll index 7e94b6a00875e..0db3cfb36bb7f 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @global_workgroup_unordered_load( ; GFX6-LABEL: global_workgroup_unordered_load: @@ -12776,4 +12776,3 @@ entry: store i32 %val0, ptr addrspace(1) %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll index 64da9e52b98aa..67ca31a2bb84e 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-invalid-syncscope.ll @@ -1,7 +1,7 @@ -; RUN: not llc 
-mtriple=amdgcn-amd- -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s 2>&1 | FileCheck %s
-; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn-amd- -mcpu=gfx803 < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx803 < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s 2>&1 | FileCheck %s
+; RUN: not llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s 2>&1 | FileCheck %s
 
 ; CHECK: error: <unknown>:0:0: in function invalid_fence void (): Unsupported atomic synchronization scope
 define amdgpu_kernel void @invalid_fence() {
diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
index 1f0cf859e0dc7..b53065cf89786 100644
--- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
+++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll
@@ -1,15 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s
-; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s
+; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck
-check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @local_agent_unordered_load( ; GFX6-LABEL: local_agent_unordered_load: @@ -11864,4 +11864,3 @@ entry: store i32 %val0, ptr addrspace(3) %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll index 7c102727da45d..70c91d2600602 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @local_nontemporal_load_0( ; GFX6-LABEL: local_nontemporal_load_0: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll index 869ba8654e14a..1f494bf3d3605 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @local_singlethread_unordered_load( ; GFX6-LABEL: local_singlethread_unordered_load: @@ -11584,4 +11584,3 @@ entry: store i32 %val0, ptr addrspace(3) %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll index e39cbf114f9bb..baec9cb601bbd 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @local_system_unordered_load( ; GFX6-LABEL: local_system_unordered_load: @@ -11864,4 +11864,3 @@ entry: store i32 %val0, ptr addrspace(3) %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll index 25ecba631ed19..1dda86e6e0987 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -1,11 +1,11 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @local_volatile_load_0( ; GFX6-LABEL: local_volatile_load_0: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll index 212aec947eb57..1e9da50285dfa 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode 
-verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @local_wavefront_unordered_load( ; GFX6-LABEL: local_wavefront_unordered_load: @@ -11584,4 +11584,3 @@ entry: store i32 %val0, ptr addrspace(3) %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll index cff9a7ea76c25..652652039b637 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -1,15 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa 
-mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s define amdgpu_kernel void @local_workgroup_unordered_load( ; GFX6-LABEL: local_workgroup_unordered_load: @@ -11864,4 +11864,3 @@ entry: store i32 %val0, ptr addrspace(3) %out, align 4 ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll index d5223b5d8d389..069443791b3e1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -1,17 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | 
FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a < %s | FileCheck -check-prefixes=GFX90A-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX90A-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 < %s | FileCheck -check-prefixes=GFX940-NOTTGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -mattr=+tgsplit < %s | FileCheck -check-prefixes=GFX940-TGSPLIT %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s define amdgpu_kernel void @private_nontemporal_load_0( ; GFX6-LABEL: private_nontemporal_load_0: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll index b42a497991e2c..216b1eccf3a69 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -1,13 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX6 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX7 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX10-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations -verify-machineinstrs < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX11-CU %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-WGP %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode -verify-machineinstrs < %s | FileCheck --check-prefixes=GFX12-CU %s +; RUN: llc -mtriple=amdgcn-amd- -mcpu=gfx600 < %s | FileCheck --check-prefixes=GFX6 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 < %s | FileCheck --check-prefixes=GFX7 %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 < %s | FileCheck --check-prefixes=GFX10-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX10-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx700 -amdgcn-skip-cache-invalidations < %s | FileCheck --check-prefixes=SKIP-CACHE-INV %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck --check-prefixes=GFX11-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+cumode < %s | 
FileCheck --check-prefixes=GFX11-CU %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck --check-prefixes=GFX12-WGP %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -mattr=+cumode < %s | FileCheck --check-prefixes=GFX12-CU %s define amdgpu_kernel void @private_volatile_load_0( ; GFX6-LABEL: private_volatile_load_0: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll index 5a3b64f35cb83..7059e80d5c3d1 100644 --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-store-infinite-loop.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -mtriple=amdgcn--amdhsa < %s | FileCheck -check-prefix=GCN %s ; Effectively, check that the compile finishes; in the case ; of an infinite loop, llc toggles between merging 2 ST4s From 5cd8d53cac00feafd739dba6215e1f6eed502e46 Mon Sep 17 00:00:00 2001 From: Wang Pengcheng Date: Mon, 22 Jan 2024 17:36:32 +0800 Subject: [PATCH 357/843] [RISCV] Teach RISCVMergeBaseOffset to handle inline asm (#78945) For inline asm with memory operands, we can merge the offset into the second operand of memory constraint operands. Differential Revision: https://reviews.llvm.org/D158062 --- llvm/include/llvm/CodeGen/MachineOperand.h | 4 + llvm/lib/CodeGen/MachineOperand.cpp | 13 + llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 5 +- .../lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 78 +++++- .../RISCV/inline-asm-mem-constraint.ll | 264 +++++++----------- 5 files changed, 193 insertions(+), 171 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineOperand.h b/llvm/include/llvm/CodeGen/MachineOperand.h index 1f3b7feedd189..63a172134538c 100644 --- a/llvm/include/llvm/CodeGen/MachineOperand.h +++ b/llvm/include/llvm/CodeGen/MachineOperand.h @@ -782,6 +782,10 @@ class MachineOperand { void ChangeToGA(const GlobalValue *GV, int64_t Offset, unsigned TargetFlags = 0); + /// ChangeToBA - Replace this operand with a new block address operand. + void ChangeToBA(const BlockAddress *BA, int64_t Offset, + unsigned TargetFlags = 0); + /// ChangeToMCSymbol - Replace this operand with a new MC symbol operand. 
void ChangeToMCSymbol(MCSymbol *Sym, unsigned TargetFlags = 0);

diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 12d6b79f735d7..c7c0a1c20d57f 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -206,6 +206,19 @@ void MachineOperand::ChangeToGA(const GlobalValue *GV, int64_t Offset,
   setTargetFlags(TargetFlags);
 }
 
+void MachineOperand::ChangeToBA(const BlockAddress *BA, int64_t Offset,
+                                unsigned TargetFlags) {
+  assert((!isReg() || !isTied()) &&
+         "Cannot change a tied operand into a block address");
+
+  removeRegFromUses();
+
+  OpKind = MO_BlockAddress;
+  Contents.OffsetedInfo.Val.BA = BA;
+  setOffset(Offset);
+  setTargetFlags(TargetFlags);
+}
+
 void MachineOperand::ChangeToMCSymbol(MCSymbol *Sym, unsigned TargetFlags) {
   assert((!isReg() || !isTied()) &&
          "Cannot change a tied operand into an MCSymbol");
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index f2bd5118fc071..aee6ec05f1f9c 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -323,7 +323,8 @@ bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
   // RISCVDAGToDAGISel::SelectInlineAsmMemoryOperand).
   if (!AddrReg.isReg())
     return true;
-  if (!Offset.isImm() && !Offset.isGlobal() && !Offset.isBlockAddress())
+  if (!Offset.isImm() && !Offset.isGlobal() && !Offset.isBlockAddress() &&
+      !Offset.isMCSymbol())
     return true;
 
   MCOperand MCO;
@@ -332,7 +333,7 @@ bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
 
   if (Offset.isImm())
     OS << MCO.getImm();
-  else if (Offset.isGlobal() || Offset.isBlockAddress())
+  else if (Offset.isGlobal() || Offset.isBlockAddress() || Offset.isMCSymbol())
     OS << *MCO.getExpr();
   OS << "(" << RISCVInstPrinter::getRegisterName(AddrReg.getReg()) << ")";
   return false;
diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
index 866b2453579cc..313f7f396d4d4 100644
--- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
+++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp
@@ -362,6 +362,8 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
   //   Tail: lw vreg3, 8(vreg2)
 
   std::optional<int64_t> CommonOffset;
+  DenseMap<const MachineInstr *, SmallVector<unsigned>>
+      InlineAsmMemoryOpIndexesMap;
   for (const MachineInstr &UseMI : MRI->use_instructions(DestReg)) {
     switch (UseMI.getOpcode()) {
     default:
@@ -396,6 +398,49 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi,
       if (CommonOffset && Offset != CommonOffset)
         return false;
       CommonOffset = Offset;
+      break;
+    }
+    case RISCV::INLINEASM:
+    case RISCV::INLINEASM_BR: {
+      SmallVector<unsigned> InlineAsmMemoryOpIndexes;
+      unsigned NumOps = 0;
+      for (unsigned I = InlineAsm::MIOp_FirstOperand;
+           I < UseMI.getNumOperands(); I += 1 + NumOps) {
+        const MachineOperand &FlagsMO = UseMI.getOperand(I);
+        // Should be an imm.
+        if (!FlagsMO.isImm())
+          continue;
+
+        const InlineAsm::Flag Flags(FlagsMO.getImm());
+        NumOps = Flags.getNumOperandRegisters();
+
+        // Memory constraints have two operands.
+        if (NumOps != 2 || !Flags.isMemKind())
+          continue;
+
+        // We can't do this for constraint A because AMO instructions don't have
+        // an immediate offset field.
+ if (Flags.getMemoryConstraintID() == InlineAsm::ConstraintCode::A) + return false; + + const MachineOperand &AddrMO = UseMI.getOperand(I + 1); + if (!AddrMO.isReg() || AddrMO.getReg() != DestReg) + continue; + + const MachineOperand &OffsetMO = UseMI.getOperand(I + 2); + if (!OffsetMO.isImm()) + continue; + + // All inline asm memory operands must use the same offset. + int64_t Offset = OffsetMO.getImm(); + if (CommonOffset && Offset != CommonOffset) + return false; + CommonOffset = Offset; + InlineAsmMemoryOpIndexes.push_back(I + 1); + } + InlineAsmMemoryOpIndexesMap.insert( + std::make_pair(&UseMI, InlineAsmMemoryOpIndexes)); + break; } } } @@ -420,13 +465,36 @@ bool RISCVMergeBaseOffsetOpt::foldIntoMemoryOps(MachineInstr &Hi, // Update the immediate in the load/store instructions to add the offset. for (MachineInstr &UseMI : llvm::make_early_inc_range(MRI->use_instructions(DestReg))) { - UseMI.removeOperand(2); - UseMI.addOperand(ImmOp); - // Update the base reg in the Tail instruction to feed from LUI. - // Output of Hi is only used in Lo, no need to use MRI->replaceRegWith(). - UseMI.getOperand(1).setReg(Hi.getOperand(0).getReg()); + if (UseMI.getOpcode() == RISCV::INLINEASM || + UseMI.getOpcode() == RISCV::INLINEASM_BR) { + auto &InlineAsmMemoryOpIndexes = InlineAsmMemoryOpIndexesMap[&UseMI]; + for (unsigned I : InlineAsmMemoryOpIndexes) { + MachineOperand &MO = UseMI.getOperand(I + 1); + switch (ImmOp.getType()) { + case MachineOperand::MO_GlobalAddress: + MO.ChangeToGA(ImmOp.getGlobal(), ImmOp.getOffset(), + ImmOp.getTargetFlags()); + break; + case MachineOperand::MO_MCSymbol: + MO.ChangeToMCSymbol(ImmOp.getMCSymbol(), ImmOp.getTargetFlags()); + MO.setOffset(ImmOp.getOffset()); + break; + case MachineOperand::MO_BlockAddress: + MO.ChangeToBA(ImmOp.getBlockAddress(), ImmOp.getOffset(), + ImmOp.getTargetFlags()); + break; + default: + report_fatal_error("unsupported machine operand type"); + break; + } + } + } else { + UseMI.removeOperand(2); + UseMI.addOperand(ImmOp); + } } + MRI->replaceRegWith(Lo.getOperand(0).getReg(), Hi.getOperand(0).getReg()); Lo.eraseFromParent(); return true; } diff --git a/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll b/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll index eb0e0287d48d7..52d0dabf18839 100644 --- a/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll +++ b/llvm/test/CodeGen/RISCV/inline-asm-mem-constraint.ll @@ -125,9 +125,8 @@ define void @constraint_m_with_global_1() nounwind { ; RV32I-MEDIUM: # %bb.0: ; RV32I-MEDIUM-NEXT: .Lpcrel_hi0: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi0) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi0)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -135,9 +134,8 @@ define void @constraint_m_with_global_1() nounwind { ; RV64I-MEDIUM: # %bb.0: ; RV64I-MEDIUM-NEXT: .Lpcrel_hi0: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi0) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi0)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret call void asm "sw zero, $0", "=*m"(ptr elementtype(i32) @eg) @@ -147,39 +145,35 @@ define void @constraint_m_with_global_1() nounwind { define void @constraint_m_with_global_2() nounwind { ; RV32I-LABEL: constraint_m_with_global_2: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a0, %hi(eg) -; RV32I-NEXT: addi a0, a0, %lo(eg) +; 
RV32I-NEXT: lui a0, %hi(eg+4) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, 4(a0) +; RV32I-NEXT: sw zero, %lo(eg+4)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; ; RV64I-LABEL: constraint_m_with_global_2: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a0, %hi(eg) -; RV64I-NEXT: addi a0, a0, %lo(eg) +; RV64I-NEXT: lui a0, %hi(eg+4) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, 4(a0) +; RV64I-NEXT: sw zero, %lo(eg+4)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; ; RV32I-MEDIUM-LABEL: constraint_m_with_global_2: ; RV32I-MEDIUM: # %bb.0: ; RV32I-MEDIUM-NEXT: .Lpcrel_hi1: -; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi1) +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+4) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 4(a0) +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi1)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; ; RV64I-MEDIUM-LABEL: constraint_m_with_global_2: ; RV64I-MEDIUM: # %bb.0: ; RV64I-MEDIUM-NEXT: .Lpcrel_hi1: -; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi1) +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+4) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 4(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi1)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret call void asm "sw zero, $0", "=*m"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @eg, i32 0, i32 1)) @@ -190,18 +184,16 @@ define void @constraint_m_with_global_3() nounwind { ; RV32I-LABEL: constraint_m_with_global_3: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, %hi(eg+8000) -; RV32I-NEXT: addi a0, a0, %lo(eg+8000) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: sw zero, %lo(eg+8000)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; ; RV64I-LABEL: constraint_m_with_global_3: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a0, %hi(eg+8000) -; RV64I-NEXT: addi a0, a0, %lo(eg+8000) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: sw zero, %lo(eg+8000)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; @@ -209,9 +201,8 @@ define void @constraint_m_with_global_3() nounwind { ; RV32I-MEDIUM: # %bb.0: ; RV32I-MEDIUM-NEXT: .Lpcrel_hi2: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+8000) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi2) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi2)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -219,9 +210,8 @@ define void @constraint_m_with_global_3() nounwind { ; RV64I-MEDIUM: # %bb.0: ; RV64I-MEDIUM-NEXT: .Lpcrel_hi2: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+8000) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi2) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi2)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret call void asm "sw zero, $0", "=*m"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @eg, i32 0, i32 2000)) @@ -271,19 +261,17 @@ define void @constraint_m_with_extern_weak_global_1() nounwind { define void @constraint_m_with_extern_weak_global_2() nounwind { ; RV32I-LABEL: constraint_m_with_extern_weak_global_2: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a0, %hi(ewg) -; RV32I-NEXT: addi a0, a0, %lo(ewg) +; RV32I-NEXT: lui a0, %hi(ewg+4) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, 4(a0) +; RV32I-NEXT: sw zero, %lo(ewg+4)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; ; RV64I-LABEL: constraint_m_with_extern_weak_global_2: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a0, 
%hi(ewg) -; RV64I-NEXT: addi a0, a0, %lo(ewg) +; RV64I-NEXT: lui a0, %hi(ewg+4) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, 4(a0) +; RV64I-NEXT: sw zero, %lo(ewg+4)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; @@ -314,18 +302,16 @@ define void @constraint_m_with_extern_weak_global_3() nounwind { ; RV32I-LABEL: constraint_m_with_extern_weak_global_3: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, %hi(ewg+8000) -; RV32I-NEXT: addi a0, a0, %lo(ewg+8000) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: sw zero, %lo(ewg+8000)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; ; RV64I-LABEL: constraint_m_with_extern_weak_global_3: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a0, %hi(ewg+8000) -; RV64I-NEXT: addi a0, a0, %lo(ewg+8000) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: sw zero, %lo(ewg+8000)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; @@ -385,9 +371,8 @@ define void @constraint_m_with_local_1() nounwind { ; RV32I-MEDIUM-NEXT: # %bb.1: # %label ; RV32I-MEDIUM-NEXT: .Lpcrel_hi6: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp0) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi6) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: lw zero, 0(a0) +; RV32I-MEDIUM-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi6)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -397,9 +382,8 @@ define void @constraint_m_with_local_1() nounwind { ; RV64I-MEDIUM-NEXT: # %bb.1: # %label ; RV64I-MEDIUM-NEXT: .Lpcrel_hi6: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp0) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi6) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: lw zero, 0(a0) +; RV64I-MEDIUM-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi6)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret entry: @@ -415,10 +399,9 @@ define void @constraint_m_with_local_2() nounwind { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: .Ltmp1: # Block address taken ; RV32I-NEXT: # %bb.1: # %label -; RV32I-NEXT: lui a0, %hi(.Ltmp1) -; RV32I-NEXT: addi a0, a0, %lo(.Ltmp1) +; RV32I-NEXT: lui a0, %hi(.Ltmp1+4) ; RV32I-NEXT: #APP -; RV32I-NEXT: lw zero, 4(a0) +; RV32I-NEXT: lw zero, %lo(.Ltmp1+4)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; @@ -426,10 +409,9 @@ define void @constraint_m_with_local_2() nounwind { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: .Ltmp1: # Block address taken ; RV64I-NEXT: # %bb.1: # %label -; RV64I-NEXT: lui a0, %hi(.Ltmp1) -; RV64I-NEXT: addi a0, a0, %lo(.Ltmp1) +; RV64I-NEXT: lui a0, %hi(.Ltmp1+4) ; RV64I-NEXT: #APP -; RV64I-NEXT: lw zero, 4(a0) +; RV64I-NEXT: lw zero, %lo(.Ltmp1+4)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; @@ -438,10 +420,9 @@ define void @constraint_m_with_local_2() nounwind { ; RV32I-MEDIUM-NEXT: .Ltmp1: # Block address taken ; RV32I-MEDIUM-NEXT: # %bb.1: # %label ; RV32I-MEDIUM-NEXT: .Lpcrel_hi7: -; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp1) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi7) +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp1+4) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: lw zero, 4(a0) +; RV32I-MEDIUM-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi7)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -450,10 +431,9 @@ define void @constraint_m_with_local_2() nounwind { ; RV64I-MEDIUM-NEXT: .Ltmp1: # Block address taken ; RV64I-MEDIUM-NEXT: # %bb.1: # %label ; RV64I-MEDIUM-NEXT: .Lpcrel_hi7: -; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp1) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi7) +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp1+4) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: lw zero, 4(a0) +; RV64I-MEDIUM-NEXT: lw zero, 
%pcrel_lo(.Lpcrel_hi7)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret entry: @@ -469,10 +449,9 @@ define void @constraint_m_with_local_3() nounwind { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: .Ltmp2: # Block address taken ; RV32I-NEXT: # %bb.1: # %label -; RV32I-NEXT: lui a0, %hi(.Ltmp2) -; RV32I-NEXT: addi a0, a0, %lo(.Ltmp2) +; RV32I-NEXT: lui a0, %hi(.Ltmp2+2000) ; RV32I-NEXT: #APP -; RV32I-NEXT: lw zero, 2000(a0) +; RV32I-NEXT: lw zero, %lo(.Ltmp2+2000)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; @@ -480,10 +459,9 @@ define void @constraint_m_with_local_3() nounwind { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: .Ltmp2: # Block address taken ; RV64I-NEXT: # %bb.1: # %label -; RV64I-NEXT: lui a0, %hi(.Ltmp2) -; RV64I-NEXT: addi a0, a0, %lo(.Ltmp2) +; RV64I-NEXT: lui a0, %hi(.Ltmp2+2000) ; RV64I-NEXT: #APP -; RV64I-NEXT: lw zero, 2000(a0) +; RV64I-NEXT: lw zero, %lo(.Ltmp2+2000)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; @@ -492,10 +470,9 @@ define void @constraint_m_with_local_3() nounwind { ; RV32I-MEDIUM-NEXT: .Ltmp2: # Block address taken ; RV32I-MEDIUM-NEXT: # %bb.1: # %label ; RV32I-MEDIUM-NEXT: .Lpcrel_hi8: -; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp2) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi8) +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp2+2000) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: lw zero, 2000(a0) +; RV32I-MEDIUM-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi8)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -504,10 +481,9 @@ define void @constraint_m_with_local_3() nounwind { ; RV64I-MEDIUM-NEXT: .Ltmp2: # Block address taken ; RV64I-MEDIUM-NEXT: # %bb.1: # %label ; RV64I-MEDIUM-NEXT: .Lpcrel_hi8: -; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp2) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi8) +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp2+2000) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: lw zero, 2000(a0) +; RV64I-MEDIUM-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi8)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret entry: @@ -539,9 +515,8 @@ define void @constraint_m_with_multi_operands() nounwind { ; RV32I-MEDIUM: # %bb.0: ; RV32I-MEDIUM-NEXT: .Lpcrel_hi9: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi9) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a0); sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi9)(a0); sw zero, %pcrel_lo(.Lpcrel_hi9)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -549,9 +524,8 @@ define void @constraint_m_with_multi_operands() nounwind { ; RV64I-MEDIUM: # %bb.0: ; RV64I-MEDIUM-NEXT: .Lpcrel_hi9: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi9) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a0); sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi9)(a0); sw zero, %pcrel_lo(.Lpcrel_hi9)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret call void asm "sw zero, $0; sw zero, $1", "=*m,=*m"(ptr elementtype(i32) @eg, ptr elementtype(i32) @eg) @@ -585,12 +559,11 @@ define void @constraint_m_with_multi_asm() nounwind { ; RV32I-MEDIUM: # %bb.0: ; RV32I-MEDIUM-NEXT: .Lpcrel_hi10: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi10) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi10)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: 
sw zero, %pcrel_lo(.Lpcrel_hi10)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -598,12 +571,11 @@ define void @constraint_m_with_multi_asm() nounwind { ; RV64I-MEDIUM: # %bb.0: ; RV64I-MEDIUM-NEXT: .Lpcrel_hi10: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi10) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi10)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi10)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret call void asm "sw zero, $0", "=*m"(ptr elementtype(i32) @eg) @@ -646,9 +618,8 @@ define i32 @constraint_m_with_callbr_multi_operands(i32 %a) { ; RV32I-MEDIUM: # %bb.0: # %entry ; RV32I-MEDIUM-NEXT: .Lpcrel_hi11: ; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi11) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB14_2 +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi11)(a1); sw zero, %pcrel_lo(.Lpcrel_hi11)(a1); beqz a0, .LBB14_2 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.1: # %normal ; RV32I-MEDIUM-NEXT: li a0, 0 @@ -663,9 +634,8 @@ define i32 @constraint_m_with_callbr_multi_operands(i32 %a) { ; RV64I-MEDIUM: # %bb.0: # %entry ; RV64I-MEDIUM-NEXT: .Lpcrel_hi11: ; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi11) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB14_2 +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi11)(a1); sw zero, %pcrel_lo(.Lpcrel_hi11)(a1); beqz a0, .LBB14_2 ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.1: # %normal ; RV64I-MEDIUM-NEXT: li a0, 0 @@ -728,13 +698,12 @@ define i32 @constraint_m_with_multi_callbr_asm(i32 %a) { ; RV32I-MEDIUM: # %bb.0: # %entry ; RV32I-MEDIUM-NEXT: .Lpcrel_hi12: ; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi12) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB15_3 +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi12)(a1); beqz a0, .LBB15_3 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.1: # %normal0 ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB15_3 +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi12)(a1); beqz a0, .LBB15_3 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.2: # %normal1 ; RV32I-MEDIUM-NEXT: li a0, 0 @@ -749,13 +718,12 @@ define i32 @constraint_m_with_multi_callbr_asm(i32 %a) { ; RV64I-MEDIUM: # %bb.0: # %entry ; RV64I-MEDIUM-NEXT: .Lpcrel_hi12: ; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi12) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB15_3 +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi12)(a1); beqz a0, .LBB15_3 ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.1: # %normal0 ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB15_3 +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi12)(a1); beqz a0, .LBB15_3 ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.2: # %normal1 ; RV64I-MEDIUM-NEXT: li a0, 0 @@ -892,9 +860,8 @@ define void @constraint_o_with_global_1() nounwind { ; RV32I-MEDIUM: # %bb.0: ; RV32I-MEDIUM-NEXT: .Lpcrel_hi13: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, 
%pcrel_lo(.Lpcrel_hi13) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi13)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -902,9 +869,8 @@ define void @constraint_o_with_global_1() nounwind { ; RV64I-MEDIUM: # %bb.0: ; RV64I-MEDIUM-NEXT: .Lpcrel_hi13: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi13) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi13)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret call void asm "sw zero, $0", "=*o"(ptr elementtype(i32) @eg) @@ -914,39 +880,35 @@ define void @constraint_o_with_global_1() nounwind { define void @constraint_o_with_global_2() nounwind { ; RV32I-LABEL: constraint_o_with_global_2: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a0, %hi(eg) -; RV32I-NEXT: addi a0, a0, %lo(eg) +; RV32I-NEXT: lui a0, %hi(eg+4) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, 4(a0) +; RV32I-NEXT: sw zero, %lo(eg+4)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; ; RV64I-LABEL: constraint_o_with_global_2: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a0, %hi(eg) -; RV64I-NEXT: addi a0, a0, %lo(eg) +; RV64I-NEXT: lui a0, %hi(eg+4) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, 4(a0) +; RV64I-NEXT: sw zero, %lo(eg+4)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; ; RV32I-MEDIUM-LABEL: constraint_o_with_global_2: ; RV32I-MEDIUM: # %bb.0: ; RV32I-MEDIUM-NEXT: .Lpcrel_hi14: -; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi14) +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+4) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 4(a0) +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi14)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; ; RV64I-MEDIUM-LABEL: constraint_o_with_global_2: ; RV64I-MEDIUM: # %bb.0: ; RV64I-MEDIUM-NEXT: .Lpcrel_hi14: -; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi14) +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+4) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 4(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi14)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret call void asm "sw zero, $0", "=*o"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @eg, i32 0, i32 1)) @@ -957,18 +919,16 @@ define void @constraint_o_with_global_3() nounwind { ; RV32I-LABEL: constraint_o_with_global_3: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, %hi(eg+8000) -; RV32I-NEXT: addi a0, a0, %lo(eg+8000) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: sw zero, %lo(eg+8000)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; ; RV64I-LABEL: constraint_o_with_global_3: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a0, %hi(eg+8000) -; RV64I-NEXT: addi a0, a0, %lo(eg+8000) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: sw zero, %lo(eg+8000)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; @@ -976,9 +936,8 @@ define void @constraint_o_with_global_3() nounwind { ; RV32I-MEDIUM: # %bb.0: ; RV32I-MEDIUM-NEXT: .Lpcrel_hi15: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+8000) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi15) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi15)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -986,9 +945,8 @@ define void @constraint_o_with_global_3() nounwind { ; RV64I-MEDIUM: # %bb.0: ; RV64I-MEDIUM-NEXT: .Lpcrel_hi15: ; 
RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg+8000) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi15) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi15)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret call void asm "sw zero, $0", "=*o"(ptr elementtype(i32) getelementptr ([400000 x i32], ptr @eg, i32 0, i32 2000)) @@ -1038,19 +996,17 @@ define void @constraint_o_with_extern_weak_global_1() nounwind { define void @constraint_o_with_extern_weak_global_2() nounwind { ; RV32I-LABEL: constraint_o_with_extern_weak_global_2: ; RV32I: # %bb.0: -; RV32I-NEXT: lui a0, %hi(ewg) -; RV32I-NEXT: addi a0, a0, %lo(ewg) +; RV32I-NEXT: lui a0, %hi(ewg+4) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, 4(a0) +; RV32I-NEXT: sw zero, %lo(ewg+4)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; ; RV64I-LABEL: constraint_o_with_extern_weak_global_2: ; RV64I: # %bb.0: -; RV64I-NEXT: lui a0, %hi(ewg) -; RV64I-NEXT: addi a0, a0, %lo(ewg) +; RV64I-NEXT: lui a0, %hi(ewg+4) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, 4(a0) +; RV64I-NEXT: sw zero, %lo(ewg+4)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; @@ -1081,18 +1037,16 @@ define void @constraint_o_with_extern_weak_global_3() nounwind { ; RV32I-LABEL: constraint_o_with_extern_weak_global_3: ; RV32I: # %bb.0: ; RV32I-NEXT: lui a0, %hi(ewg+8000) -; RV32I-NEXT: addi a0, a0, %lo(ewg+8000) ; RV32I-NEXT: #APP -; RV32I-NEXT: sw zero, 0(a0) +; RV32I-NEXT: sw zero, %lo(ewg+8000)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; ; RV64I-LABEL: constraint_o_with_extern_weak_global_3: ; RV64I: # %bb.0: ; RV64I-NEXT: lui a0, %hi(ewg+8000) -; RV64I-NEXT: addi a0, a0, %lo(ewg+8000) ; RV64I-NEXT: #APP -; RV64I-NEXT: sw zero, 0(a0) +; RV64I-NEXT: sw zero, %lo(ewg+8000)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; @@ -1146,9 +1100,8 @@ define void @constraint_o_with_multi_operands() nounwind { ; RV32I-MEDIUM: # %bb.0: ; RV32I-MEDIUM-NEXT: .Lpcrel_hi19: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi19) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a0) \n sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi19)(a0) \n sw zero, %pcrel_lo(.Lpcrel_hi19)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -1156,9 +1109,8 @@ define void @constraint_o_with_multi_operands() nounwind { ; RV64I-MEDIUM: # %bb.0: ; RV64I-MEDIUM-NEXT: .Lpcrel_hi19: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi19) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a0) \n sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi19)(a0) \n sw zero, %pcrel_lo(.Lpcrel_hi19)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret call void asm "sw zero, $0 \n sw zero, $1", "=*o,=*o"(ptr elementtype(i32) @eg, ptr elementtype(i32) @eg) @@ -1192,12 +1144,11 @@ define void @constraint_o_with_multi_asm() nounwind { ; RV32I-MEDIUM: # %bb.0: ; RV32I-MEDIUM-NEXT: .Lpcrel_hi20: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi20) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi20)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a0) +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi20)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -1205,12 +1156,11 @@ define void @constraint_o_with_multi_asm() nounwind { ; RV64I-MEDIUM: # 
%bb.0: ; RV64I-MEDIUM-NEXT: .Lpcrel_hi20: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi20) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi20)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a0) +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi20)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret call void asm "sw zero, $0", "=*o"(ptr elementtype(i32) @eg) @@ -1253,9 +1203,8 @@ define i32 @constraint_o_with_callbr_multi_operands(i32 %a) { ; RV32I-MEDIUM: # %bb.0: # %entry ; RV32I-MEDIUM-NEXT: .Lpcrel_hi21: ; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi21) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB27_2 +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi21)(a1); sw zero, %pcrel_lo(.Lpcrel_hi21)(a1); beqz a0, .LBB27_2 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.1: # %normal ; RV32I-MEDIUM-NEXT: li a0, 0 @@ -1270,9 +1219,8 @@ define i32 @constraint_o_with_callbr_multi_operands(i32 %a) { ; RV64I-MEDIUM: # %bb.0: # %entry ; RV64I-MEDIUM-NEXT: .Lpcrel_hi21: ; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi21) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); sw zero, 0(a1); beqz a0, .LBB27_2 +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi21)(a1); sw zero, %pcrel_lo(.Lpcrel_hi21)(a1); beqz a0, .LBB27_2 ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.1: # %normal ; RV64I-MEDIUM-NEXT: li a0, 0 @@ -1335,13 +1283,12 @@ define i32 @constraint_o_with_multi_callbr_asm(i32 %a) { ; RV32I-MEDIUM: # %bb.0: # %entry ; RV32I-MEDIUM-NEXT: .Lpcrel_hi22: ; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi22) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB28_3 +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi22)(a1); beqz a0, .LBB28_3 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.1: # %normal0 ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB28_3 +; RV32I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi22)(a1); beqz a0, .LBB28_3 ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: # %bb.2: # %normal1 ; RV32I-MEDIUM-NEXT: li a0, 0 @@ -1356,13 +1303,12 @@ define i32 @constraint_o_with_multi_callbr_asm(i32 %a) { ; RV64I-MEDIUM: # %bb.0: # %entry ; RV64I-MEDIUM-NEXT: .Lpcrel_hi22: ; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(eg) -; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi22) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB28_3 +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi22)(a1); beqz a0, .LBB28_3 ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.1: # %normal0 ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: sw zero, 0(a1); beqz a0, .LBB28_3 +; RV64I-MEDIUM-NEXT: sw zero, %pcrel_lo(.Lpcrel_hi22)(a1); beqz a0, .LBB28_3 ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: # %bb.2: # %normal1 ; RV64I-MEDIUM-NEXT: li a0, 0 @@ -1412,9 +1358,8 @@ define void @constraint_o_with_local_1() nounwind { ; RV32I-MEDIUM-NEXT: # %bb.1: # %label ; RV32I-MEDIUM-NEXT: .Lpcrel_hi23: ; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp3) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi23) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: lw zero, 0(a0) +; RV32I-MEDIUM-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi23)(a0) ; 
RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -1424,9 +1369,8 @@ define void @constraint_o_with_local_1() nounwind { ; RV64I-MEDIUM-NEXT: # %bb.1: # %label ; RV64I-MEDIUM-NEXT: .Lpcrel_hi23: ; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp3) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi23) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: lw zero, 0(a0) +; RV64I-MEDIUM-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi23)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret entry: @@ -1442,10 +1386,9 @@ define void @constraint_o_with_local_2() nounwind { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: .Ltmp4: # Block address taken ; RV32I-NEXT: # %bb.1: # %label -; RV32I-NEXT: lui a0, %hi(.Ltmp4) -; RV32I-NEXT: addi a0, a0, %lo(.Ltmp4) +; RV32I-NEXT: lui a0, %hi(.Ltmp4+4) ; RV32I-NEXT: #APP -; RV32I-NEXT: lw zero, 4(a0) +; RV32I-NEXT: lw zero, %lo(.Ltmp4+4)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; @@ -1453,10 +1396,9 @@ define void @constraint_o_with_local_2() nounwind { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: .Ltmp4: # Block address taken ; RV64I-NEXT: # %bb.1: # %label -; RV64I-NEXT: lui a0, %hi(.Ltmp4) -; RV64I-NEXT: addi a0, a0, %lo(.Ltmp4) +; RV64I-NEXT: lui a0, %hi(.Ltmp4+4) ; RV64I-NEXT: #APP -; RV64I-NEXT: lw zero, 4(a0) +; RV64I-NEXT: lw zero, %lo(.Ltmp4+4)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; @@ -1465,10 +1407,9 @@ define void @constraint_o_with_local_2() nounwind { ; RV32I-MEDIUM-NEXT: .Ltmp4: # Block address taken ; RV32I-MEDIUM-NEXT: # %bb.1: # %label ; RV32I-MEDIUM-NEXT: .Lpcrel_hi24: -; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp4) -; RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi24) +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp4+4) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: lw zero, 4(a0) +; RV32I-MEDIUM-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi24)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -1477,10 +1418,9 @@ define void @constraint_o_with_local_2() nounwind { ; RV64I-MEDIUM-NEXT: .Ltmp4: # Block address taken ; RV64I-MEDIUM-NEXT: # %bb.1: # %label ; RV64I-MEDIUM-NEXT: .Lpcrel_hi24: -; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp4) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi24) +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp4+4) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: lw zero, 4(a0) +; RV64I-MEDIUM-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi24)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret entry: @@ -1496,10 +1436,9 @@ define void @constraint_o_with_local_3() nounwind { ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: .Ltmp5: # Block address taken ; RV32I-NEXT: # %bb.1: # %label -; RV32I-NEXT: lui a0, %hi(.Ltmp5) -; RV32I-NEXT: addi a0, a0, %lo(.Ltmp5) +; RV32I-NEXT: lui a0, %hi(.Ltmp5+2000) ; RV32I-NEXT: #APP -; RV32I-NEXT: lw zero, 2000(a0) +; RV32I-NEXT: lw zero, %lo(.Ltmp5+2000)(a0) ; RV32I-NEXT: #NO_APP ; RV32I-NEXT: ret ; @@ -1507,10 +1446,9 @@ define void @constraint_o_with_local_3() nounwind { ; RV64I: # %bb.0: # %entry ; RV64I-NEXT: .Ltmp5: # Block address taken ; RV64I-NEXT: # %bb.1: # %label -; RV64I-NEXT: lui a0, %hi(.Ltmp5) -; RV64I-NEXT: addi a0, a0, %lo(.Ltmp5) +; RV64I-NEXT: lui a0, %hi(.Ltmp5+2000) ; RV64I-NEXT: #APP -; RV64I-NEXT: lw zero, 2000(a0) +; RV64I-NEXT: lw zero, %lo(.Ltmp5+2000)(a0) ; RV64I-NEXT: #NO_APP ; RV64I-NEXT: ret ; @@ -1519,10 +1457,9 @@ define void @constraint_o_with_local_3() nounwind { ; RV32I-MEDIUM-NEXT: .Ltmp5: # Block address taken ; RV32I-MEDIUM-NEXT: # %bb.1: # %label ; RV32I-MEDIUM-NEXT: .Lpcrel_hi25: -; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp5) -; 
RV32I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi25) +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp5+2000) ; RV32I-MEDIUM-NEXT: #APP -; RV32I-MEDIUM-NEXT: lw zero, 2000(a0) +; RV32I-MEDIUM-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi25)(a0) ; RV32I-MEDIUM-NEXT: #NO_APP ; RV32I-MEDIUM-NEXT: ret ; @@ -1531,10 +1468,9 @@ define void @constraint_o_with_local_3() nounwind { ; RV64I-MEDIUM-NEXT: .Ltmp5: # Block address taken ; RV64I-MEDIUM-NEXT: # %bb.1: # %label ; RV64I-MEDIUM-NEXT: .Lpcrel_hi25: -; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp5) -; RV64I-MEDIUM-NEXT: addi a0, a0, %pcrel_lo(.Lpcrel_hi25) +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(.Ltmp5+2000) ; RV64I-MEDIUM-NEXT: #APP -; RV64I-MEDIUM-NEXT: lw zero, 2000(a0) +; RV64I-MEDIUM-NEXT: lw zero, %pcrel_lo(.Lpcrel_hi25)(a0) ; RV64I-MEDIUM-NEXT: #NO_APP ; RV64I-MEDIUM-NEXT: ret entry: From a43c192567eb4ea2535d73b83da5c7d5ed2b6122 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 22 Jan 2024 10:33:50 +0100 Subject: [PATCH 358/843] [llvm-jitlink] Use SmallVectorImpl when referencing StubInfos (NFC) The element type is declared as SmallVector, but we assign to SmallVector &. These types are not the same on 32-bit systems, resulting in a compilation error. Fix this by using SmallVectorImpl & instead, which is independent of the small size. --- llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index d233ebdb5a3a8..7e213777a6727 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1212,7 +1212,7 @@ Error Session::FileInfo::registerStubEntry( if (!TS) return TS.takeError(); - SmallVector &Entry = StubInfos[TS->getName()]; + SmallVectorImpl &Entry = StubInfos[TS->getName()]; Entry.insert(Entry.begin(), {Sym.getSymbolContent(), Sym.getAddress().getValue(), Sym.getTargetFlags()}); @@ -1230,7 +1230,7 @@ Error Session::FileInfo::registerMultiStubEntry( if (!Target) return Target.takeError(); - SmallVector &Entry = StubInfos[Target->getName()]; + SmallVectorImpl &Entry = StubInfos[Target->getName()]; Entry.emplace_back(Sym.getSymbolContent(), Sym.getAddress().getValue(), Sym.getTargetFlags()); From 50df08cd43ec02c58067797df33ec67c128431bb Mon Sep 17 00:00:00 2001 From: chuongg3 Date: Mon, 22 Jan 2024 10:05:37 +0000 Subject: [PATCH 359/843] [GlobalISel][AArch64] Combine Vector Reduction Add Long (#76241) ADDLV(ADDLP) => ADDLV Removes unnecessary ADDLP instruction Already exists for SDAG, adding for GlobalISel --- llvm/lib/Target/AArch64/AArch64InstrGISel.td | 15 + llvm/lib/Target/AArch64/AArch64InstrInfo.td | 23 + .../AArch64/GISel/AArch64LegalizerInfo.cpp | 51 + .../AArch64/GlobalISel/legalize-ctpop.mir | 116 ++- .../AArch64/GlobalISel/legalize-cttz.mir | 12 +- .../test/CodeGen/AArch64/arm64-neon-across.ll | 32 +- llvm/test/CodeGen/AArch64/arm64-vadd.ll | 883 ++++++++++++++---- llvm/test/CodeGen/AArch64/dp1.ll | 33 +- llvm/test/CodeGen/AArch64/neon-addlv.ll | 59 +- llvm/test/CodeGen/AArch64/popcount.ll | 26 +- 10 files changed, 938 insertions(+), 312 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td index e53328d6553af..58ca52f37b63b 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrGISel.td +++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td @@ -227,6 +227,18 @@ def G_SMULL : AArch64GenericInstruction { let hasSideEffects = 0; } +def G_UADDLP : AArch64GenericInstruction { + let OutOperandList 
= (outs type0:$dst);
+  let InOperandList = (ins type0:$src1);
+  let hasSideEffects = 0;
+}
+
+def G_SADDLP : AArch64GenericInstruction {
+  let OutOperandList = (outs type0:$dst);
+  let InOperandList = (ins type0:$src1);
+  let hasSideEffects = 0;
+}
+
 def G_UADDLV : AArch64GenericInstruction {
   let OutOperandList = (outs type0:$dst);
   let InOperandList = (ins type0:$src1);
@@ -294,6 +306,9 @@ def : GINodeEquiv;
 def : GINodeEquiv;
 def : GINodeEquiv;
+def : GINodeEquiv<G_UADDLP, AArch64uaddlp>;
+def : GINodeEquiv<G_SADDLP, AArch64saddlp>;
+
 def : GINodeEquiv;
 def : GINodeEquiv;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index 6a744689b79ec..c63f23bda6805 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -6664,6 +6664,26 @@ multiclass SIMDAcrossLaneLongPairIntrinsic
 defm : SIMDAcrossLaneLongPairIntrinsic<"UADDLV", AArch64uaddlp>;
 defm : SIMDAcrossLaneLongPairIntrinsic<"SADDLV", AArch64saddlp>;
 
+// Pattern is used for GlobalISel
+multiclass SIMDAcrossLaneLongPairIntrinsicGISel<string Opc,
+                                                SDPatternOperator addlp> {
+  // Patterns for addv(addlp(x)) ==> addlv
+  def : Pat<(i16 (vecreduce_add (v4i16 (addlp (v8i8 V64:$Rn))))),
+            (!cast<Instruction>(Opc#"v8i8v") V64:$Rn)>;
+  def : Pat<(i16 (vecreduce_add (v8i16 (addlp (v16i8 V128:$Rn))))),
+            (!cast<Instruction>(Opc#"v16i8v") V128:$Rn)>;
+  def : Pat<(i32 (vecreduce_add (v4i32 (addlp (v8i16 V128:$Rn))))),
+            (!cast<Instruction>(Opc#"v8i16v") V128:$Rn)>;
+
+  // Patterns for addp(addlp(x))) ==> addlv
+  def : Pat<(i32 (vecreduce_add (v2i32 (addlp (v4i16 V64:$Rn))))),
+            (!cast<Instruction>(Opc#"v4i16v") V64:$Rn)>;
+  def : Pat<(i64 (vecreduce_add (v2i64 (addlp (v4i32 V128:$Rn))))),
+            (!cast<Instruction>(Opc#"v4i32v") V128:$Rn)>;
+}
+
+defm : SIMDAcrossLaneLongPairIntrinsicGISel<"UADDLV", AArch64uaddlp>;
+defm : SIMDAcrossLaneLongPairIntrinsicGISel<"SADDLV", AArch64saddlp>;
+
 // Patterns for uaddlv(uaddlp(x)) ==> uaddlv
 def : Pat<(i64 (int_aarch64_neon_uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
           (i64 (EXTRACT_SUBREG
@@ -6675,6 +6695,9 @@ def : Pat<(i32 (int_aarch64_neon_uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op)))
           (v8i16 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub)),
           ssub))>;
 
+def : Pat<(v2i64 (AArch64uaddlv (v4i32 (AArch64uaddlp (v8i16 V128:$op))))),
+          (v2i64 (SUBREG_TO_REG (i64 0), (UADDLVv8i16v V128:$op), ssub))>;
+
 def : Pat<(v4i32 (AArch64uaddlv (v8i16 (AArch64uaddlp (v16i8 V128:$op))))),
           (v4i32 (SUBREG_TO_REG (i64 0), (UADDLVv16i8v V128:$op), hsub))>;
 
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index b561cb12c93a1..091b70e3c1f15 100644
--- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -1451,6 +1451,57 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
     return true;
   }
+  case Intrinsic::aarch64_neon_uaddlp:
+  case Intrinsic::aarch64_neon_saddlp: {
+    MachineIRBuilder MIB(MI);
+    MachineRegisterInfo &MRI = *MIB.getMRI();
+
+    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp
+                       ? AArch64::G_UADDLP
+                       : AArch64::G_SADDLP;
+    MIB.buildInstr(Opc, {MI.getOperand(0)}, {MI.getOperand(2)});
+    MI.eraseFromParent();
+
+    return true;
+  }
+  case Intrinsic::aarch64_neon_uaddlv:
+  case Intrinsic::aarch64_neon_saddlv: {
+    MachineIRBuilder MIB(MI);
+    MachineRegisterInfo &MRI = *MIB.getMRI();
+
+    unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlv
+                       ? 
AArch64::G_UADDLV + : AArch64::G_SADDLV; + Register DstReg = MI.getOperand(0).getReg(); + Register SrcReg = MI.getOperand(2).getReg(); + LLT DstTy = MRI.getType(DstReg); + + LLT MidTy, ExtTy; + if (DstTy.isScalar() && DstTy.getScalarSizeInBits() <= 32) { + MidTy = LLT::fixed_vector(4, 32); + ExtTy = LLT::scalar(32); + } else { + MidTy = LLT::fixed_vector(2, 64); + ExtTy = LLT::scalar(64); + } + + Register MidReg = + MIB.buildInstr(Opc, {MidTy}, {SrcReg})->getOperand(0).getReg(); + Register ZeroReg = + MIB.buildConstant(LLT::scalar(64), 0)->getOperand(0).getReg(); + Register ExtReg = MIB.buildInstr(AArch64::G_EXTRACT_VECTOR_ELT, {ExtTy}, + {MidReg, ZeroReg}) + .getReg(0); + + if (DstTy.getScalarSizeInBits() < 32) + MIB.buildTrunc(DstReg, ExtReg); + else + MIB.buildCopy(DstReg, ExtReg); + + MI.eraseFromParent(); + + return true; + } case Intrinsic::aarch64_neon_smax: case Intrinsic::aarch64_neon_smin: case Intrinsic::aarch64_neon_umax: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir index ae0c29927afa6..fe28c3a47ad5e 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-ctpop.mir @@ -69,7 +69,10 @@ body: | ; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT %copy(s32) ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[ZEXT]](s64) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK-NEXT: %ctpop:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) + ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C]](s64) + ; CHECK-NEXT: %ctpop:_(s32) = COPY [[EVEC]](s32) ; CHECK-NEXT: $w0 = COPY %ctpop(s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 ; @@ -98,8 +101,11 @@ body: | ; CHECK-NEXT: %copy:_(s64) = COPY $x0 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST %copy(s64) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) - ; CHECK-NEXT: %ctpop:_(s64) = G_ZEXT [[INT]](s32) + ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C]](s64) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) + ; CHECK-NEXT: %ctpop:_(s64) = G_ZEXT [[COPY]](s32) ; CHECK-NEXT: $x0 = COPY %ctpop(s64) ; CHECK-NEXT: RET_ReallyLR implicit $x0 ; @@ -131,12 +137,14 @@ body: | ; CHECK-NEXT: [[MV:%[0-9]+]]:_(s128) = G_MERGE_VALUES [[COPY]](s64), [[COPY1]](s64) ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[MV]](s128) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<16 x s8>) - ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[INT]](s32), [[C]](s32) - ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]] + ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C]](s64) + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = 
COPY [[EVEC]](s32) + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; CHECK-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[COPY2]](s32), [[C1]](s32) ; CHECK-NEXT: $x0 = COPY [[MV1]](s64) - ; CHECK-NEXT: $x1 = COPY [[C1]](s64) + ; CHECK-NEXT: $x1 = COPY [[C]](s64) ; CHECK-NEXT: RET_ReallyLR implicit $x0, implicit $x1 ; ; CHECK-CSSC-LABEL: name: s128_lower @@ -177,9 +185,12 @@ body: | ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]] ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32) - ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) + ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 ; ; CHECK-CSSC-LABEL: name: widen_s16 @@ -216,9 +227,12 @@ body: | ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]] ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32) - ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) + ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 ; ; CHECK-CSSC-LABEL: name: widen_s8 @@ -255,9 +269,12 @@ body: | ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]] ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32) - ; CHECK-NEXT: $w0 = COPY [[COPY]](s32) + ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 ; ; CHECK-CSSC-LABEL: name: widen_s3 @@ -293,9 +310,12 @@ body: | ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s64) = G_AND [[ANYEXT]], [[C]] ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[AND]](s64) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(s32) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlv), [[CTPOP]](<8 x s8>) - ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[INT]](s32) - ; CHECK-NEXT: $w0 = 
COPY [[COPY]](s32) + ; CHECK-NEXT: [[UADDLV:%[0-9]+]]:_(<4 x s32>) = G_UADDLV [[CTPOP]] + ; CHECK-NEXT: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 0 + ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[UADDLV]](<4 x s32>), [[C1]](s64) + ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY [[EVEC]](s32) + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY [[COPY]](s32) + ; CHECK-NEXT: $w0 = COPY [[COPY1]](s32) ; CHECK-NEXT: RET_ReallyLR implicit $w0 ; ; CHECK-CSSC-LABEL: name: different_sizes @@ -329,8 +349,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<8 x s16>) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>) - ; CHECK-NEXT: $q0 = COPY [[INT]](<8 x s16>) + ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-NEXT: $q0 = COPY [[UADDLP]](<8 x s16>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 ; ; CHECK-CSSC-LABEL: name: custom_8x16 @@ -339,8 +359,8 @@ body: | ; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<8 x s16>) = COPY $q0 ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<8 x s16>) ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) - ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>) - ; CHECK-CSSC-NEXT: $q0 = COPY [[INT]](<8 x s16>) + ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP]](<8 x s16>) ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0 %0:_(<8 x s16>) = COPY $q0 %1:_(<8 x s16>) = G_CTPOP %0(<8 x s16>) @@ -361,9 +381,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<4 x s32>) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>) - ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>) - ; CHECK-NEXT: $q0 = COPY [[INT1]](<4 x s32>) + ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]] + ; CHECK-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 ; ; CHECK-CSSC-LABEL: name: custom_4x32 @@ -372,9 +392,9 @@ body: | ; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<4 x s32>) = COPY $q0 ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<4 x s32>) ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) - ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>) - ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>) - ; CHECK-CSSC-NEXT: $q0 = COPY [[INT1]](<4 x s32>) + ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]] + ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>) ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0 %0:_(<4 x s32>) = COPY $q0 %1:_(<4 x s32>) = G_CTPOP %0(<4 x s32>) @@ -395,10 +415,10 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x 
s8>) = G_BITCAST [[COPY]](<2 x s64>) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>) - ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>) - ; CHECK-NEXT: [[INT2:%[0-9]+]]:_(<2 x s64>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT1]](<4 x s32>) - ; CHECK-NEXT: $q0 = COPY [[INT2]](<2 x s64>) + ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]] + ; CHECK-NEXT: [[UADDLP2:%[0-9]+]]:_(<2 x s64>) = G_UADDLP [[UADDLP1]] + ; CHECK-NEXT: $q0 = COPY [[UADDLP2]](<2 x s64>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 ; ; CHECK-CSSC-LABEL: name: custom_2x64 @@ -407,10 +427,10 @@ body: | ; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0 ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[COPY]](<2 x s64>) ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) - ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>) - ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>) - ; CHECK-CSSC-NEXT: [[INT2:%[0-9]+]]:_(<2 x s64>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT1]](<4 x s32>) - ; CHECK-CSSC-NEXT: $q0 = COPY [[INT2]](<2 x s64>) + ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]] + ; CHECK-CSSC-NEXT: [[UADDLP2:%[0-9]+]]:_(<2 x s64>) = G_UADDLP [[UADDLP1]] + ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP2]](<2 x s64>) ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0 %0:_(<2 x s64>) = COPY $q0 %1:_(<2 x s64>) = G_CTPOP %0(<2 x s64>) @@ -431,8 +451,8 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<4 x s16>) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>) - ; CHECK-NEXT: $d0 = COPY [[INT]](<4 x s16>) + ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-NEXT: $d0 = COPY [[UADDLP]](<4 x s16>) ; CHECK-NEXT: RET_ReallyLR implicit $d0 ; ; CHECK-CSSC-LABEL: name: custom_4x16 @@ -441,8 +461,8 @@ body: | ; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0 ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<4 x s16>) ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>) - ; CHECK-CSSC-NEXT: $d0 = COPY [[INT]](<4 x s16>) + ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-CSSC-NEXT: $d0 = COPY [[UADDLP]](<4 x s16>) ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $d0 %0:_(<4 x s16>) = COPY $d0 %1:_(<4 x s16>) = G_CTPOP %0(<4 x s16>) @@ -463,9 +483,9 @@ body: | ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<2 x s32>) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>) - ; 
CHECK-NEXT: [[INT1:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<4 x s16>) - ; CHECK-NEXT: $d0 = COPY [[INT1]](<2 x s32>) + ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<2 x s32>) = G_UADDLP [[UADDLP]] + ; CHECK-NEXT: $d0 = COPY [[UADDLP1]](<2 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $d0 ; ; CHECK-CSSC-LABEL: name: custom_2x32 @@ -474,9 +494,9 @@ body: | ; CHECK-CSSC-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $d0 ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<8 x s8>) = G_BITCAST [[COPY]](<2 x s32>) ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<8 x s8>) = G_CTPOP [[BITCAST]](<8 x s8>) - ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<4 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<8 x s8>) - ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<2 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<4 x s16>) - ; CHECK-CSSC-NEXT: $d0 = COPY [[INT1]](<2 x s32>) + ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<4 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<2 x s32>) = G_UADDLP [[UADDLP]] + ; CHECK-CSSC-NEXT: $d0 = COPY [[UADDLP1]](<2 x s32>) ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $d0 %0:_(<2 x s32>) = COPY $d0 %1:_(<2 x s32>) = G_CTPOP %0(<2 x s32>) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir index 535a8d811e43a..8b39ebd986dd7 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-cttz.mir @@ -147,9 +147,9 @@ body: | ; CHECK-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[XOR]], [[ADD]] ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[AND]](<4 x s32>) ; CHECK-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) - ; CHECK-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>) - ; CHECK-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>) - ; CHECK-NEXT: $q0 = COPY [[INT1]](<4 x s32>) + ; CHECK-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]] + ; CHECK-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>) ; CHECK-NEXT: RET_ReallyLR implicit $q0 ; ; CHECK-CSSC-LABEL: name: v4s32 @@ -163,9 +163,9 @@ body: | ; CHECK-CSSC-NEXT: [[AND:%[0-9]+]]:_(<4 x s32>) = G_AND [[XOR]], [[ADD]] ; CHECK-CSSC-NEXT: [[BITCAST:%[0-9]+]]:_(<16 x s8>) = G_BITCAST [[AND]](<4 x s32>) ; CHECK-CSSC-NEXT: [[CTPOP:%[0-9]+]]:_(<16 x s8>) = G_CTPOP [[BITCAST]](<16 x s8>) - ; CHECK-CSSC-NEXT: [[INT:%[0-9]+]]:_(<8 x s16>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[CTPOP]](<16 x s8>) - ; CHECK-CSSC-NEXT: [[INT1:%[0-9]+]]:_(<4 x s32>) = G_INTRINSIC intrinsic(@llvm.aarch64.neon.uaddlp), [[INT]](<8 x s16>) - ; CHECK-CSSC-NEXT: $q0 = COPY [[INT1]](<4 x s32>) + ; CHECK-CSSC-NEXT: [[UADDLP:%[0-9]+]]:_(<8 x s16>) = G_UADDLP [[CTPOP]] + ; CHECK-CSSC-NEXT: [[UADDLP1:%[0-9]+]]:_(<4 x s32>) = G_UADDLP [[UADDLP]] + ; CHECK-CSSC-NEXT: $q0 = COPY [[UADDLP1]](<4 x s32>) ; CHECK-CSSC-NEXT: RET_ReallyLR implicit $q0 %val:_(<4 x s32>) = COPY $q0 %1:_(<4 x s32>) = G_CTTZ %val(<4 x s32>) diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll index 218f4147787d1..f7ff64228ecd3 100644 --- a/llvm/test/CodeGen/AArch64/arm64-neon-across.ll +++ b/llvm/test/CodeGen/AArch64/arm64-neon-across.ll @@ -81,11 +81,17 @@ declare i32 
@llvm.aarch64.neon.saddlv.i32.v4i16(<4 x i16>) declare i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8>) define i16 @test_vaddlv_s8(<8 x i8> %a) { -; CHECK-LABEL: test_vaddlv_s8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddlv h0, v0.8b -; CHECK-NEXT: smov w0, v0.h[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddlv_s8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddlv h0, v0.8b +; CHECK-SD-NEXT: smov w0, v0.h[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddlv_s8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: saddlv h0, v0.8b +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v8i8(<8 x i8> %a) %0 = trunc i32 %saddlvv.i to i16 @@ -127,11 +133,17 @@ entry: } define i16 @test_vaddlvq_s8(<16 x i8> %a) { -; CHECK-LABEL: test_vaddlvq_s8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: saddlv h0, v0.16b -; CHECK-NEXT: smov w0, v0.h[0] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: test_vaddlvq_s8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: saddlv h0, v0.16b +; CHECK-SD-NEXT: smov w0, v0.h[0] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: test_vaddlvq_s8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: saddlv h0, v0.16b +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %saddlvv.i = tail call i32 @llvm.aarch64.neon.saddlv.i32.v16i8(<16 x i8> %a) %0 = trunc i32 %saddlvv.i to i16 diff --git a/llvm/test/CodeGen/AArch64/arm64-vadd.ll b/llvm/test/CodeGen/AArch64/arm64-vadd.ll index ad089f38955be..38a568ac91916 100644 --- a/llvm/test/CodeGen/AArch64/arm64-vadd.ll +++ b/llvm/test/CodeGen/AArch64/arm64-vadd.ll @@ -1,8 +1,17 @@ -; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -asm-verbose=false | FileCheck %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=arm64-eabi | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc < %s -mtriple=arm64-eabi -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for saddlp1d +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for uaddlp1d define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addhn8b: -;CHECK: addhn.8b +; CHECK-LABEL: addhn8b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: addhn v0.8b, v0.8h, v1.8h +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %tmp3 = call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) @@ -10,8 +19,12 @@ define <8 x i8> @addhn8b(ptr %A, ptr %B) nounwind { } define <4 x i16> @addhn4h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addhn4h: -;CHECK: addhn.4h +; CHECK-LABEL: addhn4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %tmp3 = call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) @@ -19,8 +32,12 @@ define <4 x i16> @addhn4h(ptr %A, ptr %B) nounwind { } define <2 x i32> @addhn2s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addhn2s: -;CHECK: addhn.2s +; CHECK-LABEL: addhn2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: addhn v0.2s, v0.2d, v1.2d +; CHECK-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i64>, ptr %B %tmp3 = call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %tmp1, <2 x i64> 
%tmp2) @@ -28,9 +45,12 @@ define <2 x i32> @addhn2s(ptr %A, ptr %B) nounwind { } define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { -;CHECK-LABEL: addhn2_16b: -;CHECK: addhn.8b -;CHECK-NEXT: addhn2.16b +; CHECK-LABEL: addhn2_16b: +; CHECK: // %bb.0: +; CHECK-NEXT: addhn v2.8b, v0.8h, v1.8h +; CHECK-NEXT: addhn2 v2.16b, v0.8h, v1.8h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %vaddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind %vaddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind %res = shufflevector <8 x i8> %vaddhn2.i, <8 x i8> %vaddhn_high2.i, <16 x i32> @@ -38,9 +58,12 @@ define <16 x i8> @addhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { } define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { -;CHECK-LABEL: addhn2_8h: -;CHECK: addhn.4h -;CHECK-NEXT: addhn2.8h +; CHECK-LABEL: addhn2_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: addhn v2.4h, v0.4s, v1.4s +; CHECK-NEXT: addhn2 v2.8h, v0.4s, v1.4s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %vaddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind %vaddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.addhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind %res = shufflevector <4 x i16> %vaddhn2.i, <4 x i16> %vaddhn_high3.i, <8 x i32> @@ -48,9 +71,12 @@ define <8 x i16> @addhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { } define <4 x i32> @addhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { -;CHECK-LABEL: addhn2_4s: -;CHECK: addhn.2s -;CHECK-NEXT: addhn2.4s +; CHECK-LABEL: addhn2_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: addhn v2.2s, v0.2d, v1.2d +; CHECK-NEXT: addhn2 v2.4s, v0.2d, v1.2d +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %vaddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind %vaddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.addhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind %res = shufflevector <2 x i32> %vaddhn2.i, <2 x i32> %vaddhn_high3.i, <4 x i32> @@ -63,8 +89,12 @@ declare <8 x i8> @llvm.aarch64.neon.addhn.v8i8(<8 x i16>, <8 x i16>) nounwind re define <8 x i8> @raddhn8b(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: raddhn8b: -;CHECK: raddhn.8b +; CHECK-LABEL: raddhn8b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: raddhn v0.8b, v0.8h, v1.8h +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %tmp3 = call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %tmp1, <8 x i16> %tmp2) @@ -72,8 +102,12 @@ define <8 x i8> @raddhn8b(ptr %A, ptr %B) nounwind { } define <4 x i16> @raddhn4h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: raddhn4h: -;CHECK: raddhn.4h +; CHECK-LABEL: raddhn4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: raddhn v0.4h, v0.4s, v1.4s +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %tmp3 = call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %tmp1, <4 x i32> %tmp2) @@ -81,8 +115,12 @@ define <4 x i16> @raddhn4h(ptr %A, ptr %B) nounwind { } define <2 x i32> @raddhn2s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: raddhn2s: -;CHECK: raddhn.2s +; CHECK-LABEL: raddhn2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: raddhn v0.2s, v0.2d, v1.2d +; CHECK-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i64>, ptr %B %tmp3 = call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %tmp1, <2 x i64> 
%tmp2) @@ -90,9 +128,12 @@ define <2 x i32> @raddhn2s(ptr %A, ptr %B) nounwind { } define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { -;CHECK-LABEL: raddhn2_16b: -;CHECK: raddhn.8b -;CHECK-NEXT: raddhn2.16b +; CHECK-LABEL: raddhn2_16b: +; CHECK: // %bb.0: +; CHECK-NEXT: raddhn v2.8b, v0.8h, v1.8h +; CHECK-NEXT: raddhn2 v2.16b, v0.8h, v1.8h +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %vraddhn2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind %vraddhn_high2.i = tail call <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16> %a, <8 x i16> %b) nounwind %res = shufflevector <8 x i8> %vraddhn2.i, <8 x i8> %vraddhn_high2.i, <16 x i32> @@ -100,9 +141,12 @@ define <16 x i8> @raddhn2_16b(<8 x i16> %a, <8 x i16> %b) nounwind { } define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { -;CHECK-LABEL: raddhn2_8h: -;CHECK: raddhn.4h -;CHECK-NEXT: raddhn2.8h +; CHECK-LABEL: raddhn2_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: raddhn v2.4h, v0.4s, v1.4s +; CHECK-NEXT: raddhn2 v2.8h, v0.4s, v1.4s +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %vraddhn2.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind %vraddhn_high3.i = tail call <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32> %a, <4 x i32> %b) nounwind %res = shufflevector <4 x i16> %vraddhn2.i, <4 x i16> %vraddhn_high3.i, <8 x i32> @@ -110,9 +154,12 @@ define <8 x i16> @raddhn2_8h(<4 x i32> %a, <4 x i32> %b) nounwind { } define <4 x i32> @raddhn2_4s(<2 x i64> %a, <2 x i64> %b) nounwind { -;CHECK-LABEL: raddhn2_4s: -;CHECK: raddhn.2s -;CHECK-NEXT: raddhn2.4s +; CHECK-LABEL: raddhn2_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: raddhn v2.2s, v0.2d, v1.2d +; CHECK-NEXT: raddhn2 v2.4s, v0.2d, v1.2d +; CHECK-NEXT: mov v0.16b, v2.16b +; CHECK-NEXT: ret %vraddhn2.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind %vraddhn_high3.i = tail call <2 x i32> @llvm.aarch64.neon.raddhn.v2i32(<2 x i64> %a, <2 x i64> %b) nounwind %res = shufflevector <2 x i32> %vraddhn2.i, <2 x i32> %vraddhn_high3.i, <4 x i32> @@ -124,8 +171,12 @@ declare <4 x i16> @llvm.aarch64.neon.raddhn.v4i16(<4 x i32>, <4 x i32>) nounwind declare <8 x i8> @llvm.aarch64.neon.raddhn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone define <8 x i16> @saddl8h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: saddl8h: -;CHECK: saddl.8h +; CHECK-LABEL: saddl8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: saddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = sext <8 x i8> %tmp1 to <8 x i16> @@ -135,8 +186,12 @@ define <8 x i16> @saddl8h(ptr %A, ptr %B) nounwind { } define <4 x i32> @saddl4s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: saddl4s: -;CHECK: saddl.4s +; CHECK-LABEL: saddl4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: saddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = sext <4 x i16> %tmp1 to <4 x i32> @@ -146,8 +201,12 @@ define <4 x i32> @saddl4s(ptr %A, ptr %B) nounwind { } define <2 x i64> @saddl2d(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: saddl2d: -;CHECK: saddl.2d +; CHECK-LABEL: saddl2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: saddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = sext <2 x i32> %tmp1 to <2 x i64> @@ -158,8 
+217,9 @@ define <2 x i64> @saddl2d(ptr %A, ptr %B) nounwind { define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: saddl2_8h: -; CHECK-NEXT: saddl2.8h v0, v0, v1 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: saddl2 v0.8h, v0.16b, v1.16b +; CHECK-NEXT: ret %tmp = bitcast <16 x i8> %a to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8> @@ -174,8 +234,9 @@ define <8 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind { define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind { ; CHECK-LABEL: saddl2_4s: -; CHECK-NEXT: saddl2.4s v0, v0, v1 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: saddl2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret %tmp = bitcast <8 x i16> %a to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16> @@ -190,8 +251,9 @@ define <4 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind { define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind { ; CHECK-LABEL: saddl2_2d: -; CHECK-NEXT: saddl2.2d v0, v0, v1 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: saddl2 v0.2d, v0.4s, v1.4s +; CHECK-NEXT: ret %tmp = bitcast <4 x i32> %a to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32> @@ -205,8 +267,12 @@ define <2 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind { } define <8 x i16> @uaddl8h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uaddl8h: -;CHECK: uaddl.8h +; CHECK-LABEL: uaddl8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddl v0.8h, v0.8b, v1.8b +; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = zext <8 x i8> %tmp1 to <8 x i16> @@ -216,8 +282,12 @@ define <8 x i16> @uaddl8h(ptr %A, ptr %B) nounwind { } define <4 x i32> @uaddl4s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uaddl4s: -;CHECK: uaddl.4s +; CHECK-LABEL: uaddl4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = zext <4 x i16> %tmp1 to <4 x i32> @@ -227,8 +297,12 @@ define <4 x i32> @uaddl4s(ptr %A, ptr %B) nounwind { } define <2 x i64> @uaddl2d(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uaddl2d: -;CHECK: uaddl.2d +; CHECK-LABEL: uaddl2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = zext <2 x i32> %tmp1 to <2 x i64> @@ -240,8 +314,9 @@ define <2 x i64> @uaddl2d(ptr %A, ptr %B) nounwind { define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind { ; CHECK-LABEL: uaddl2_8h: -; CHECK-NEXT: uaddl2.8h v0, v0, v1 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: uaddl2 v0.8h, v0.16b, v1.16b +; CHECK-NEXT: ret %tmp = bitcast <16 x i8> %a to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <8 x i8> @@ -256,8 +331,9 @@ define <8 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) nounwind { define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind { ; CHECK-LABEL: uaddl2_4s: -; CHECK-NEXT: uaddl2.4s v0, v0, v1 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: uaddl2 v0.4s, v0.8h, v1.8h +; CHECK-NEXT: ret %tmp = bitcast 
<8 x i16> %a to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <4 x i16> @@ -272,8 +348,9 @@ define <4 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) nounwind { define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind { ; CHECK-LABEL: uaddl2_2d: -; CHECK-NEXT: uaddl2.2d v0, v0, v1 -; CHECK-NEXT: ret +; CHECK: // %bb.0: +; CHECK-NEXT: uaddl2 v0.2d, v0.4s, v1.4s +; CHECK-NEXT: ret %tmp = bitcast <4 x i32> %a to <2 x i64> %shuffle.i.i.i = shufflevector <2 x i64> %tmp, <2 x i64> undef, <1 x i32> %tmp1 = bitcast <1 x i64> %shuffle.i.i.i to <2 x i32> @@ -287,8 +364,12 @@ define <2 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) nounwind { } define <8 x i16> @uaddw8h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uaddw8h: -;CHECK: uaddw.8h +; CHECK-LABEL: uaddw8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = zext <8 x i8> %tmp2 to <8 x i16> @@ -297,8 +378,12 @@ define <8 x i16> @uaddw8h(ptr %A, ptr %B) nounwind { } define <4 x i32> @uaddw4s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uaddw4s: -;CHECK: uaddw.4s +; CHECK-LABEL: uaddw4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = zext <4 x i16> %tmp2 to <4 x i32> @@ -307,8 +392,12 @@ define <4 x i32> @uaddw4s(ptr %A, ptr %B) nounwind { } define <2 x i64> @uaddw2d(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uaddw2d: -;CHECK: uaddw.2d +; CHECK-LABEL: uaddw2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: uaddw v0.2d, v0.2d, v1.2s +; CHECK-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = zext <2 x i32> %tmp2 to <2 x i64> @@ -317,8 +406,19 @@ define <2 x i64> @uaddw2d(ptr %A, ptr %B) nounwind { } define <8 x i16> @uaddw2_8h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uaddw2_8h: -;CHECK: uaddw.8h +; CHECK-SD-LABEL: uaddw2_8h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1, #8] +; CHECK-SD-NEXT: uaddw v0.8h, v0.8h, v1.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uaddw2_8h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: uaddw2 v0.8h, v0.8h, v1.16b +; CHECK-GI-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <16 x i8>, ptr %B @@ -330,8 +430,19 @@ define <8 x i16> @uaddw2_8h(ptr %A, ptr %B) nounwind { } define <4 x i32> @uaddw2_4s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uaddw2_4s: -;CHECK: uaddw.4s +; CHECK-SD-LABEL: uaddw2_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1, #8] +; CHECK-SD-NEXT: uaddw v0.4s, v0.4s, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uaddw2_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: uaddw2 v0.4s, v0.4s, v1.8h +; CHECK-GI-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <8 x i16>, ptr %B @@ -343,8 +454,19 @@ define <4 x i32> @uaddw2_4s(ptr %A, ptr %B) nounwind { } define <2 x i64> @uaddw2_2d(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uaddw2_2d: -;CHECK: uaddw.2d +; CHECK-SD-LABEL: uaddw2_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1, #8] +; CHECK-SD-NEXT: uaddw v0.2d, v0.2d, v1.2s +; CHECK-SD-NEXT: ret +; +; 
CHECK-GI-LABEL: uaddw2_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: uaddw2 v0.2d, v0.2d, v1.4s +; CHECK-GI-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <4 x i32>, ptr %B @@ -356,8 +478,12 @@ define <2 x i64> @uaddw2_2d(ptr %A, ptr %B) nounwind { } define <8 x i16> @saddw8h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: saddw8h: -;CHECK: saddw.8h +; CHECK-LABEL: saddw8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: saddw v0.8h, v0.8h, v1.8b +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = sext <8 x i8> %tmp2 to <8 x i16> @@ -366,8 +492,12 @@ define <8 x i16> @saddw8h(ptr %A, ptr %B) nounwind { } define <4 x i32> @saddw4s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: saddw4s: -;CHECK: saddw.4s +; CHECK-LABEL: saddw4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: saddw v0.4s, v0.4s, v1.4h +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = sext <4 x i16> %tmp2 to <4 x i32> @@ -376,8 +506,12 @@ define <4 x i32> @saddw4s(ptr %A, ptr %B) nounwind { } define <2 x i64> @saddw2d(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: saddw2d: -;CHECK: saddw.2d +; CHECK-LABEL: saddw2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: saddw v0.2d, v0.2d, v1.2s +; CHECK-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = sext <2 x i32> %tmp2 to <2 x i64> @@ -386,8 +520,19 @@ define <2 x i64> @saddw2d(ptr %A, ptr %B) nounwind { } define <8 x i16> @saddw2_8h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: saddw2_8h: -;CHECK: saddw.8h +; CHECK-SD-LABEL: saddw2_8h: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1, #8] +; CHECK-SD-NEXT: saddw v0.8h, v0.8h, v1.8b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saddw2_8h: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: saddw2 v0.8h, v0.8h, v1.16b +; CHECK-GI-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <16 x i8>, ptr %B @@ -399,8 +544,19 @@ define <8 x i16> @saddw2_8h(ptr %A, ptr %B) nounwind { } define <4 x i32> @saddw2_4s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: saddw2_4s: -;CHECK: saddw.4s +; CHECK-SD-LABEL: saddw2_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1, #8] +; CHECK-SD-NEXT: saddw v0.4s, v0.4s, v1.4h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saddw2_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: saddw2 v0.4s, v0.4s, v1.8h +; CHECK-GI-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <8 x i16>, ptr %B @@ -412,8 +568,19 @@ define <4 x i32> @saddw2_4s(ptr %A, ptr %B) nounwind { } define <2 x i64> @saddw2_2d(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: saddw2_2d: -;CHECK: saddw.2d +; CHECK-SD-LABEL: saddw2_2d: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr d1, [x1, #8] +; CHECK-SD-NEXT: saddw v0.2d, v0.2d, v1.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saddw2_2d: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: saddw2 v0.2d, v0.2d, v1.4s +; CHECK-GI-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <4 x i32>, ptr %B @@ -425,48 +592,66 @@ define <2 x i64> @saddw2_2d(ptr %A, ptr %B) nounwind { } define <4 x i16> @saddlp4h(ptr %A) nounwind { -;CHECK-LABEL: saddlp4h: -;CHECK: saddlp.4h +; 
CHECK-LABEL: saddlp4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: saddlp v0.4h, v0.8b +; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1) ret <4 x i16> %tmp3 } define <2 x i32> @saddlp2s(ptr %A) nounwind { -;CHECK-LABEL: saddlp2s: -;CHECK: saddlp.2s +; CHECK-LABEL: saddlp2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: saddlp v0.2s, v0.4h +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1) ret <2 x i32> %tmp3 } define <1 x i64> @saddlp1d(ptr %A) nounwind { -;CHECK-LABEL: saddlp1d: -;CHECK: saddlp.1d +; CHECK-LABEL: saddlp1d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: saddlp v0.1d, v0.2s +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp3 = call <1 x i64> @llvm.aarch64.neon.saddlp.v1i64.v2i32(<2 x i32> %tmp1) ret <1 x i64> %tmp3 } define <8 x i16> @saddlp8h(ptr %A) nounwind { -;CHECK-LABEL: saddlp8h: -;CHECK: saddlp.8h +; CHECK-LABEL: saddlp8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: saddlp v0.8h, v0.16b +; CHECK-NEXT: ret %tmp1 = load <16 x i8>, ptr %A %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1) ret <8 x i16> %tmp3 } define <4 x i32> @saddlp4s(ptr %A) nounwind { -;CHECK-LABEL: saddlp4s: -;CHECK: saddlp.4s +; CHECK-LABEL: saddlp4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: saddlp v0.4s, v0.8h +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1) ret <4 x i32> %tmp3 } define <2 x i64> @saddlp2d(ptr %A) nounwind { -;CHECK-LABEL: saddlp2d: -;CHECK: saddlp.2d +; CHECK-LABEL: saddlp2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: saddlp v0.2d, v0.4s +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1) ret <2 x i64> %tmp3 @@ -481,48 +666,66 @@ declare <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16>) nounwind read declare <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32>) nounwind readnone define <4 x i16> @uaddlp4h(ptr %A) nounwind { -;CHECK-LABEL: uaddlp4h: -;CHECK: uaddlp.4h +; CHECK-LABEL: uaddlp4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: uaddlp v0.4h, v0.8b +; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp3 = call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1) ret <4 x i16> %tmp3 } define <2 x i32> @uaddlp2s(ptr %A) nounwind { -;CHECK-LABEL: uaddlp2s: -;CHECK: uaddlp.2s +; CHECK-LABEL: uaddlp2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: uaddlp v0.2s, v0.4h +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1) ret <2 x i32> %tmp3 } define <1 x i64> @uaddlp1d(ptr %A) nounwind { -;CHECK-LABEL: uaddlp1d: -;CHECK: uaddlp.1d +; CHECK-LABEL: uaddlp1d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: uaddlp v0.1d, v0.2s +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp3 = call <1 x i64> @llvm.aarch64.neon.uaddlp.v1i64.v2i32(<2 x i32> %tmp1) ret <1 x i64> %tmp3 } define <8 x i16> @uaddlp8h(ptr %A) nounwind { -;CHECK-LABEL: uaddlp8h: -;CHECK: uaddlp.8h +; CHECK-LABEL: uaddlp8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: uaddlp v0.8h, v0.16b +; CHECK-NEXT: ret %tmp1 = load <16 x i8>, ptr %A %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> 
%tmp1) ret <8 x i16> %tmp3 } define <4 x i32> @uaddlp4s(ptr %A) nounwind { -;CHECK-LABEL: uaddlp4s: -;CHECK: uaddlp.4s +; CHECK-LABEL: uaddlp4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: uaddlp v0.4s, v0.8h +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1) ret <4 x i32> %tmp3 } define <2 x i64> @uaddlp2d(ptr %A) nounwind { -;CHECK-LABEL: uaddlp2d: -;CHECK: uaddlp.2d +; CHECK-LABEL: uaddlp2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: uaddlp v0.2d, v0.4s +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1) ret <2 x i64> %tmp3 @@ -537,8 +740,12 @@ declare <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16>) nounwind read declare <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32>) nounwind readnone define <4 x i16> @sadalp4h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: sadalp4h: -;CHECK: sadalp.4h +; CHECK-LABEL: sadalp4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: sadalp v0.4h, v1.8b +; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp3 = call <4 x i16> @llvm.aarch64.neon.saddlp.v4i16.v8i8(<8 x i8> %tmp1) %tmp4 = load <4 x i16>, ptr %B @@ -547,8 +754,12 @@ define <4 x i16> @sadalp4h(ptr %A, ptr %B) nounwind { } define <2 x i32> @sadalp2s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: sadalp2s: -;CHECK: sadalp.2s +; CHECK-LABEL: sadalp2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: sadalp v0.2s, v1.4h +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp3 = call <2 x i32> @llvm.aarch64.neon.saddlp.v2i32.v4i16(<4 x i16> %tmp1) %tmp4 = load <2 x i32>, ptr %B @@ -557,8 +768,12 @@ define <2 x i32> @sadalp2s(ptr %A, ptr %B) nounwind { } define <8 x i16> @sadalp8h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: sadalp8h: -;CHECK: sadalp.8h +; CHECK-LABEL: sadalp8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: sadalp v0.8h, v1.16b +; CHECK-NEXT: ret %tmp1 = load <16 x i8>, ptr %A %tmp3 = call <8 x i16> @llvm.aarch64.neon.saddlp.v8i16.v16i8(<16 x i8> %tmp1) %tmp4 = load <8 x i16>, ptr %B @@ -567,8 +782,12 @@ define <8 x i16> @sadalp8h(ptr %A, ptr %B) nounwind { } define <4 x i32> @sadalp4s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: sadalp4s: -;CHECK: sadalp.4s +; CHECK-LABEL: sadalp4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: sadalp v0.4s, v1.8h +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp3 = call <4 x i32> @llvm.aarch64.neon.saddlp.v4i32.v8i16(<8 x i16> %tmp1) %tmp4 = load <4 x i32>, ptr %B @@ -577,8 +796,12 @@ define <4 x i32> @sadalp4s(ptr %A, ptr %B) nounwind { } define <2 x i64> @sadalp2d(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: sadalp2d: -;CHECK: sadalp.2d +; CHECK-LABEL: sadalp2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: sadalp v0.2d, v1.4s +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp3 = call <2 x i64> @llvm.aarch64.neon.saddlp.v2i64.v4i32(<4 x i32> %tmp1) %tmp4 = load <2 x i64>, ptr %B @@ -587,8 +810,12 @@ define <2 x i64> @sadalp2d(ptr %A, ptr %B) nounwind { } define <4 x i16> @uadalp4h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uadalp4h: -;CHECK: uadalp.4h +; CHECK-LABEL: uadalp4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: uadalp v0.4h, v1.8b +; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp3 
= call <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8> %tmp1) %tmp4 = load <4 x i16>, ptr %B @@ -597,8 +824,12 @@ define <4 x i16> @uadalp4h(ptr %A, ptr %B) nounwind { } define <2 x i32> @uadalp2s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uadalp2s: -;CHECK: uadalp.2s +; CHECK-LABEL: uadalp2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d1, [x0] +; CHECK-NEXT: ldr d0, [x1] +; CHECK-NEXT: uadalp v0.2s, v1.4h +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp3 = call <2 x i32> @llvm.aarch64.neon.uaddlp.v2i32.v4i16(<4 x i16> %tmp1) %tmp4 = load <2 x i32>, ptr %B @@ -607,8 +838,12 @@ define <2 x i32> @uadalp2s(ptr %A, ptr %B) nounwind { } define <8 x i16> @uadalp8h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uadalp8h: -;CHECK: uadalp.8h +; CHECK-LABEL: uadalp8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: uadalp v0.8h, v1.16b +; CHECK-NEXT: ret %tmp1 = load <16 x i8>, ptr %A %tmp3 = call <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8> %tmp1) %tmp4 = load <8 x i16>, ptr %B @@ -617,8 +852,12 @@ define <8 x i16> @uadalp8h(ptr %A, ptr %B) nounwind { } define <4 x i32> @uadalp4s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uadalp4s: -;CHECK: uadalp.4s +; CHECK-LABEL: uadalp4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: uadalp v0.4s, v1.8h +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp3 = call <4 x i32> @llvm.aarch64.neon.uaddlp.v4i32.v8i16(<8 x i16> %tmp1) %tmp4 = load <4 x i32>, ptr %B @@ -627,8 +866,12 @@ define <4 x i32> @uadalp4s(ptr %A, ptr %B) nounwind { } define <2 x i64> @uadalp2d(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: uadalp2d: -;CHECK: uadalp.2d +; CHECK-LABEL: uadalp2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q1, [x0] +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: uadalp v0.2d, v1.4s +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp3 = call <2 x i64> @llvm.aarch64.neon.uaddlp.v2i64.v4i32(<4 x i32> %tmp1) %tmp4 = load <2 x i64>, ptr %B @@ -637,8 +880,12 @@ define <2 x i64> @uadalp2d(ptr %A, ptr %B) nounwind { } define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addp_8b: -;CHECK: addp.8b +; CHECK-LABEL: addp_8b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: addp v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret %tmp1 = load <8 x i8>, ptr %A %tmp2 = load <8 x i8>, ptr %B %tmp3 = call <8 x i8> @llvm.aarch64.neon.addp.v8i8(<8 x i8> %tmp1, <8 x i8> %tmp2) @@ -646,8 +893,12 @@ define <8 x i8> @addp_8b(ptr %A, ptr %B) nounwind { } define <16 x i8> @addp_16b(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addp_16b: -;CHECK: addp.16b +; CHECK-LABEL: addp_16b: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: addp v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret %tmp1 = load <16 x i8>, ptr %A %tmp2 = load <16 x i8>, ptr %B %tmp3 = call <16 x i8> @llvm.aarch64.neon.addp.v16i8(<16 x i8> %tmp1, <16 x i8> %tmp2) @@ -655,8 +906,12 @@ define <16 x i8> @addp_16b(ptr %A, ptr %B) nounwind { } define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addp_4h: -;CHECK: addp.4h +; CHECK-LABEL: addp_4h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: addp v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret %tmp1 = load <4 x i16>, ptr %A %tmp2 = load <4 x i16>, ptr %B %tmp3 = call <4 x i16> @llvm.aarch64.neon.addp.v4i16(<4 x i16> %tmp1, <4 x i16> %tmp2) @@ -664,8 +919,12 @@ define <4 x i16> @addp_4h(ptr %A, ptr %B) nounwind { } define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind { 
-;CHECK-LABEL: addp_8h: -;CHECK: addp.8h +; CHECK-LABEL: addp_8h: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: addp v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %tmp3 = call <8 x i16> @llvm.aarch64.neon.addp.v8i16(<8 x i16> %tmp1, <8 x i16> %tmp2) @@ -673,8 +932,12 @@ define <8 x i16> @addp_8h(ptr %A, ptr %B) nounwind { } define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addp_2s: -;CHECK: addp.2s +; CHECK-LABEL: addp_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: addp v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret %tmp1 = load <2 x i32>, ptr %A %tmp2 = load <2 x i32>, ptr %B %tmp3 = call <2 x i32> @llvm.aarch64.neon.addp.v2i32(<2 x i32> %tmp1, <2 x i32> %tmp2) @@ -682,8 +945,12 @@ define <2 x i32> @addp_2s(ptr %A, ptr %B) nounwind { } define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addp_4s: -;CHECK: addp.4s +; CHECK-LABEL: addp_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: addp v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %tmp3 = call <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32> %tmp1, <4 x i32> %tmp2) @@ -691,8 +958,12 @@ define <4 x i32> @addp_4s(ptr %A, ptr %B) nounwind { } define <2 x i64> @addp_2d(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addp_2d: -;CHECK: addp.2d +; CHECK-LABEL: addp_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: addp v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i64>, ptr %B %tmp3 = call <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64> %tmp1, <2 x i64> %tmp2) @@ -708,8 +979,12 @@ declare <4 x i32> @llvm.aarch64.neon.addp.v4i32(<4 x i32>, <4 x i32>) nounwind r declare <2 x i64> @llvm.aarch64.neon.addp.v2i64(<2 x i64>, <2 x i64>) nounwind readnone define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: faddp_2s: -;CHECK: faddp.2s +; CHECK-LABEL: faddp_2s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr d0, [x0] +; CHECK-NEXT: ldr d1, [x1] +; CHECK-NEXT: faddp v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret %tmp1 = load <2 x float>, ptr %A %tmp2 = load <2 x float>, ptr %B %tmp3 = call <2 x float> @llvm.aarch64.neon.faddp.v2f32(<2 x float> %tmp1, <2 x float> %tmp2) @@ -717,8 +992,12 @@ define <2 x float> @faddp_2s(ptr %A, ptr %B) nounwind { } define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: faddp_4s: -;CHECK: faddp.4s +; CHECK-LABEL: faddp_4s: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: faddp v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret %tmp1 = load <4 x float>, ptr %A %tmp2 = load <4 x float>, ptr %B %tmp3 = call <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float> %tmp1, <4 x float> %tmp2) @@ -726,8 +1005,12 @@ define <4 x float> @faddp_4s(ptr %A, ptr %B) nounwind { } define <2 x double> @faddp_2d(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: faddp_2d: -;CHECK: faddp.2d +; CHECK-LABEL: faddp_2d: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: ldr q1, [x1] +; CHECK-NEXT: faddp v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret %tmp1 = load <2 x double>, ptr %A %tmp2 = load <2 x double>, ptr %B %tmp3 = call <2 x double> @llvm.aarch64.neon.faddp.v2f64(<2 x double> %tmp1, <2 x double> %tmp2) @@ -739,9 +1022,11 @@ declare <4 x float> @llvm.aarch64.neon.faddp.v4f32(<4 x float>, <4 x float>) nou declare <2 x double> 
@llvm.aarch64.neon.faddp.v2f64(<2 x double>, <2 x double>) nounwind readnone define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) { -; CHECK-LABEL: uaddl_duprhs -; CHECK-NOT: ext.16b -; CHECK: uaddl.2d +; CHECK-LABEL: uaddl_duprhs: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.2s, w0 +; CHECK-NEXT: uaddl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 @@ -755,9 +1040,18 @@ define <2 x i64> @uaddl_duprhs(<4 x i32> %lhs, i32 %rhs) { } define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) { -; CHECK-LABEL: uaddl2_duprhs -; CHECK-NOT: ext.16b -; CHECK: uaddl2.2d +; CHECK-SD-LABEL: uaddl2_duprhs: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: dup v1.4s, w0 +; CHECK-SD-NEXT: uaddl2 v0.2d, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uaddl2_duprhs: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v1.2s, w0 +; CHECK-GI-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-GI-NEXT: uaddw2 v0.2d, v1.2d, v0.4s +; CHECK-GI-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 @@ -771,9 +1065,11 @@ define <2 x i64> @uaddl2_duprhs(<4 x i32> %lhs, i32 %rhs) { } define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: saddl_duplhs -; CHECK-NOT: ext.16b -; CHECK: saddl.2d +; CHECK-LABEL: saddl_duplhs: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.2s, w0 +; CHECK-NEXT: saddl v0.2d, v1.2s, v0.2s +; CHECK-NEXT: ret %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1 @@ -787,9 +1083,18 @@ define <2 x i64> @saddl_duplhs(i32 %lhs, <4 x i32> %rhs) { } define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: saddl2_duplhs -; CHECK-NOT: ext.16b -; CHECK: saddl2.2d +; CHECK-SD-LABEL: saddl2_duplhs: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: dup v1.4s, w0 +; CHECK-SD-NEXT: saddl2 v0.2d, v1.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: saddl2_duplhs: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v1.2s, w0 +; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 +; CHECK-GI-NEXT: saddw2 v0.2d, v1.2d, v0.4s +; CHECK-GI-NEXT: ret %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1 @@ -803,9 +1108,11 @@ define <2 x i64> @saddl2_duplhs(i32 %lhs, <4 x i32> %rhs) { } define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) { -; CHECK-LABEL: usubl_duprhs -; CHECK-NOT: ext.16b -; CHECK: usubl.2d +; CHECK-LABEL: usubl_duprhs: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.2s, w0 +; CHECK-NEXT: usubl v0.2d, v0.2s, v1.2s +; CHECK-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 @@ -819,9 +1126,18 @@ define <2 x i64> @usubl_duprhs(<4 x i32> %lhs, i32 %rhs) { } define <2 x i64> @usubl2_duprhs(<4 x i32> %lhs, i32 %rhs) { -; CHECK-LABEL: usubl2_duprhs -; CHECK-NOT: ext.16b -; CHECK: usubl2.2d +; CHECK-SD-LABEL: usubl2_duprhs: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: dup v1.4s, w0 +; CHECK-SD-NEXT: usubl2 v0.2d, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: usubl2_duprhs: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v1.2s, w0 +; CHECK-GI-NEXT: mov d0, v0.d[1] +; CHECK-GI-NEXT: usubl v0.2d, v0.2s, v1.2s +; CHECK-GI-NEXT: ret %rhsvec.tmp = insertelement <2 x i32> undef, i32 %rhs, i32 0 %rhsvec = insertelement <2 x i32> %rhsvec.tmp, i32 %rhs, i32 1 @@ -836,8 +1152,10 @@ define <2 x i64> @usubl2_duprhs(<4 
x i32> %lhs, i32 %rhs) { define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) { ; CHECK-LABEL: ssubl_duplhs: -; CHECK-NOT: ext.16b -; CHECK: ssubl.2d +; CHECK: // %bb.0: +; CHECK-NEXT: dup v1.2s, w0 +; CHECK-NEXT: ssubl v0.2d, v1.2s, v0.2s +; CHECK-NEXT: ret %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1 @@ -851,9 +1169,18 @@ define <2 x i64> @ssubl_duplhs(i32 %lhs, <4 x i32> %rhs) { } define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) { -; CHECK-LABEL: ssubl2_duplhs: -; CHECK-NOT: ext.16b -; CHECK: ssubl2.2d +; CHECK-SD-LABEL: ssubl2_duplhs: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: dup v1.4s, w0 +; CHECK-SD-NEXT: ssubl2 v0.2d, v1.4s, v0.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ssubl2_duplhs: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: dup v1.2s, w0 +; CHECK-GI-NEXT: sshll v1.2d, v1.2s, #0 +; CHECK-GI-NEXT: ssubw2 v0.2d, v1.2d, v0.4s +; CHECK-GI-NEXT: ret %lhsvec.tmp = insertelement <2 x i32> undef, i32 %lhs, i32 0 %lhsvec = insertelement <2 x i32> %lhsvec.tmp, i32 %lhs, i32 1 @@ -867,8 +1194,20 @@ define <2 x i64> @ssubl2_duplhs(i32 %lhs, <4 x i32> %rhs) { } define <8 x i8> @addhn8b_natural(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addhn8b_natural: -;CHECK: addhn.8b +; CHECK-SD-LABEL: addhn8b_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr q1, [x1] +; CHECK-SD-NEXT: addhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addhn8b_natural: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: add v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %sum = add <8 x i16> %tmp1, %tmp2 @@ -878,8 +1217,20 @@ define <8 x i8> @addhn8b_natural(ptr %A, ptr %B) nounwind { } define <4 x i16> @addhn4h_natural(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addhn4h_natural: -;CHECK: addhn.4h +; CHECK-SD-LABEL: addhn4h_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr q1, [x1] +; CHECK-SD-NEXT: addhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addhn4h_natural: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: add v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %sum = add <4 x i32> %tmp1, %tmp2 @@ -889,8 +1240,20 @@ define <4 x i16> @addhn4h_natural(ptr %A, ptr %B) nounwind { } define <2 x i32> @addhn2s_natural(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addhn2s_natural: -;CHECK: addhn.2s +; CHECK-SD-LABEL: addhn2s_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr q1, [x1] +; CHECK-SD-NEXT: addhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addhn2s_natural: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: add v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i64>, ptr %B %sum = add <2 x i64> %tmp1, %tmp2 @@ -900,8 +1263,22 @@ define <2 x i32> @addhn2s_natural(ptr %A, ptr %B) nounwind { } define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addhn2_16b_natural: -;CHECK: addhn2.16b +; CHECK-SD-LABEL: addhn2_16b_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q1, [x0] +; CHECK-SD-NEXT: ldr q2, [x1] +; 
CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addhn2_16b_natural: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q1, [x0] +; CHECK-GI-NEXT: ldr q2, [x1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: add v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: shrn2 v0.16b, v1.8h, #8 +; CHECK-GI-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %sum = add <8 x i16> %tmp1, %tmp2 @@ -912,8 +1289,22 @@ define <16 x i8> @addhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind { } define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addhn2_8h_natural: -;CHECK: addhn2.8h +; CHECK-SD-LABEL: addhn2_8h_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q1, [x0] +; CHECK-SD-NEXT: ldr q2, [x1] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addhn2_8h_natural: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q1, [x0] +; CHECK-GI-NEXT: ldr q2, [x1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: add v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: shrn2 v0.8h, v1.4s, #16 +; CHECK-GI-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %sum = add <4 x i32> %tmp1, %tmp2 @@ -924,8 +1315,22 @@ define <8 x i16> @addhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind { } define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind { -;CHECK-LABEL: addhn2_4s_natural: -;CHECK: addhn2.4s +; CHECK-SD-LABEL: addhn2_4s_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q1, [x0] +; CHECK-SD-NEXT: ldr q2, [x1] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addhn2_4s_natural: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q1, [x0] +; CHECK-GI-NEXT: ldr q2, [x1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: add v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32 +; CHECK-GI-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i64>, ptr %B %sum = add <2 x i64> %tmp1, %tmp2 @@ -936,10 +1341,22 @@ define <4 x i32> @addhn2_4s_natural(<2 x i32> %low, ptr %A, ptr %B) nounwind { } define <4 x i32> @addhn_addhn2_4s(ptr %A, ptr %B, ptr %C, ptr %D) nounwind { -;CHECK-LABEL: addhn_addhn2_4s -;CHECK: addhn.2s -;CHECK: addhn2.4s -;CHECK-NOT: uzp2.4s +; CHECK-SD-LABEL: addhn_addhn2_4s: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q1, [x0] +; CHECK-SD-NEXT: ldr q2, [x1] +; CHECK-SD-NEXT: addhn v0.2s, v1.2d, v2.2d +; CHECK-SD-NEXT: addhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: addhn_addhn2_4s: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: add v1.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v1.2d, #32 +; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32 +; CHECK-GI-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i64>, ptr %B %sum1 = add <2 x i64> %tmp1, %tmp2 @@ -955,8 +1372,20 @@ define <4 x i32> @addhn_addhn2_4s(ptr %A, ptr %B, ptr %C, ptr %D) nounwind { } define <8 x i8> @subhn8b_natural(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: subhn8b_natural: -;CHECK: subhn.8b +; CHECK-SD-LABEL: subhn8b_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr q1, [x1] +; CHECK-SD-NEXT: subhn v0.8b, v0.8h, v1.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: subhn8b_natural: +; CHECK-GI: // %bb.0: 
+; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: sub v0.8h, v0.8h, v1.8h +; CHECK-GI-NEXT: shrn v0.8b, v0.8h, #8 +; CHECK-GI-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %diff = sub <8 x i16> %tmp1, %tmp2 @@ -966,8 +1395,20 @@ define <8 x i8> @subhn8b_natural(ptr %A, ptr %B) nounwind { } define <4 x i16> @subhn4h_natural(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: subhn4h_natural: -;CHECK: subhn.4h +; CHECK-SD-LABEL: subhn4h_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr q1, [x1] +; CHECK-SD-NEXT: subhn v0.4h, v0.4s, v1.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: subhn4h_natural: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: sub v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: shrn v0.4h, v0.4s, #16 +; CHECK-GI-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %diff = sub <4 x i32> %tmp1, %tmp2 @@ -977,8 +1418,20 @@ define <4 x i16> @subhn4h_natural(ptr %A, ptr %B) nounwind { } define <2 x i32> @subhn2s_natural(ptr %A, ptr %B) nounwind { -;CHECK-LABEL: subhn2s_natural: -;CHECK: subhn.2s +; CHECK-SD-LABEL: subhn2s_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q0, [x0] +; CHECK-SD-NEXT: ldr q1, [x1] +; CHECK-SD-NEXT: subhn v0.2s, v0.2d, v1.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: subhn2s_natural: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q0, [x0] +; CHECK-GI-NEXT: ldr q1, [x1] +; CHECK-GI-NEXT: sub v0.2d, v0.2d, v1.2d +; CHECK-GI-NEXT: shrn v0.2s, v0.2d, #32 +; CHECK-GI-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i64>, ptr %B %diff = sub <2 x i64> %tmp1, %tmp2 @@ -988,8 +1441,22 @@ define <2 x i32> @subhn2s_natural(ptr %A, ptr %B) nounwind { } define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind { -;CHECK-LABEL: subhn2_16b_natural: -;CHECK: subhn2.16b +; CHECK-SD-LABEL: subhn2_16b_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q1, [x0] +; CHECK-SD-NEXT: ldr q2, [x1] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.16b, v1.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: subhn2_16b_natural: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q1, [x0] +; CHECK-GI-NEXT: ldr q2, [x1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: sub v1.8h, v1.8h, v2.8h +; CHECK-GI-NEXT: shrn2 v0.16b, v1.8h, #8 +; CHECK-GI-NEXT: ret %tmp1 = load <8 x i16>, ptr %A %tmp2 = load <8 x i16>, ptr %B %diff = sub <8 x i16> %tmp1, %tmp2 @@ -1000,8 +1467,22 @@ define <16 x i8> @subhn2_16b_natural(<8 x i8> %low, ptr %A, ptr %B) nounwind { } define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind { -;CHECK-LABEL: subhn2_8h_natural: -;CHECK: subhn2.8h +; CHECK-SD-LABEL: subhn2_8h_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q1, [x0] +; CHECK-SD-NEXT: ldr q2, [x1] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.8h, v1.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: subhn2_8h_natural: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q1, [x0] +; CHECK-GI-NEXT: ldr q2, [x1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: sub v1.4s, v1.4s, v2.4s +; CHECK-GI-NEXT: shrn2 v0.8h, v1.4s, #16 +; CHECK-GI-NEXT: ret %tmp1 = load <4 x i32>, ptr %A %tmp2 = load <4 x i32>, ptr %B %diff = sub <4 x i32> %tmp1, %tmp2 @@ -1012,8 +1493,22 @@ define <8 x i16> @subhn2_8h_natural(<4 x i16> %low, ptr %A, ptr %B) nounwind { } define <4 x i32> @subhn2_4s_natural(<2 x i32> %low, ptr 
%A, ptr %B) nounwind { -;CHECK-LABEL: subhn2_4s_natural: -;CHECK: subhn2.4s +; CHECK-SD-LABEL: subhn2_4s_natural: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ldr q1, [x0] +; CHECK-SD-NEXT: ldr q2, [x1] +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: subhn2 v0.4s, v1.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: subhn2_4s_natural: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ldr q1, [x0] +; CHECK-GI-NEXT: ldr q2, [x1] +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-GI-NEXT: sub v1.2d, v1.2d, v2.2d +; CHECK-GI-NEXT: shrn2 v0.4s, v1.2d, #32 +; CHECK-GI-NEXT: ret %tmp1 = load <2 x i64>, ptr %A %tmp2 = load <2 x i64>, ptr %B %diff = sub <2 x i64> %tmp1, %tmp2 diff --git a/llvm/test/CodeGen/AArch64/dp1.ll b/llvm/test/CodeGen/AArch64/dp1.ll index bb5b19e51995a..949dad7798a6c 100644 --- a/llvm/test/CodeGen/AArch64/dp1.ll +++ b/llvm/test/CodeGen/AArch64/dp1.ll @@ -197,27 +197,16 @@ define void @cttz_zeroundef_i64() { } define void @ctpop_i32() { -; CHECK-SDAG-LABEL: ctpop_i32: -; CHECK-SDAG: // %bb.0: -; CHECK-SDAG-NEXT: adrp x8, :got:var32 -; CHECK-SDAG-NEXT: ldr x8, [x8, :got_lo12:var32] -; CHECK-SDAG-NEXT: ldr w9, [x8] -; CHECK-SDAG-NEXT: fmov d0, x9 -; CHECK-SDAG-NEXT: cnt v0.8b, v0.8b -; CHECK-SDAG-NEXT: uaddlv h0, v0.8b -; CHECK-SDAG-NEXT: str s0, [x8] -; CHECK-SDAG-NEXT: ret -; -; CHECK-GISEL-LABEL: ctpop_i32: -; CHECK-GISEL: // %bb.0: -; CHECK-GISEL-NEXT: adrp x8, :got:var32 -; CHECK-GISEL-NEXT: ldr x8, [x8, :got_lo12:var32] -; CHECK-GISEL-NEXT: ldr w9, [x8] -; CHECK-GISEL-NEXT: fmov d0, x9 -; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b -; CHECK-GISEL-NEXT: uaddlv h0, v0.8b -; CHECK-GISEL-NEXT: str s0, [x8] -; CHECK-GISEL-NEXT: ret +; CHECK-LABEL: ctpop_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: adrp x8, :got:var32 +; CHECK-NEXT: ldr x8, [x8, :got_lo12:var32] +; CHECK-NEXT: ldr w9, [x8] +; CHECK-NEXT: fmov d0, x9 +; CHECK-NEXT: cnt v0.8b, v0.8b +; CHECK-NEXT: uaddlv h0, v0.8b +; CHECK-NEXT: str s0, [x8] +; CHECK-NEXT: ret %val0_tmp = load i32, ptr @var32 %val4_tmp = call i32 @llvm.ctpop.i32(i32 %val0_tmp) store volatile i32 %val4_tmp, ptr @var32 @@ -244,7 +233,7 @@ define void @ctpop_i64() { ; CHECK-GISEL-NEXT: fmov d0, x9 ; CHECK-GISEL-NEXT: cnt v0.8b, v0.8b ; CHECK-GISEL-NEXT: uaddlv h0, v0.8b -; CHECK-GISEL-NEXT: fmov w9, s0 +; CHECK-GISEL-NEXT: mov w9, v0.s[0] ; CHECK-GISEL-NEXT: str x9, [x8] ; CHECK-GISEL-NEXT: ret %val0_tmp = load i64, ptr @var64 diff --git a/llvm/test/CodeGen/AArch64/neon-addlv.ll b/llvm/test/CodeGen/AArch64/neon-addlv.ll index 0241091fae025..50f555b18ff07 100644 --- a/llvm/test/CodeGen/AArch64/neon-addlv.ll +++ b/llvm/test/CodeGen/AArch64/neon-addlv.ll @@ -1,5 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu < %s | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple aarch64-none-linux-gnu -global-isel -global-isel-abort=2 2>&1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for uaddlv_v8i8_urshr declare <4 x i16> @llvm.aarch64.neon.uaddlp.v4i16.v8i8(<8 x i8>) nounwind readnone declare <8 x i16> @llvm.aarch64.neon.uaddlp.v8i16.v16i8(<16 x i8>) nounwind readnone @@ -154,11 +157,18 @@ define i32 @saddlv4h_from_v4i16(ptr %A) nounwind { declare i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8>) nounwind readnone define i32 @uaddlv_known_bits_v8i8(<8 x i8> %a) { -; CHECK-LABEL: uaddlv_known_bits_v8i8: -; CHECK: // %bb.0: 
-; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: uaddlv_known_bits_v8i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: uaddlv h0, v0.8b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uaddlv_known_bits_v8i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: uaddlv h0, v0.8b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: ret %tmp1 = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a) %tmp2 = and i32 %tmp1, 65535 ret i32 %tmp2 @@ -167,11 +177,18 @@ define i32 @uaddlv_known_bits_v8i8(<8 x i8> %a) { declare i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8>) nounwind readnone define i32 @uaddlv_known_bits_v16i8(<16 x i8> %a) { -; CHECK-LABEL: uaddlv_known_bits_v16i8: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddlv h0, v0.16b -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: uaddlv_known_bits_v16i8: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddlv h0, v0.16b +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uaddlv_known_bits_v16i8: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uaddlv h0, v0.16b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: and w0, w8, #0xffff +; CHECK-GI-NEXT: ret entry: %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v16i8(<16 x i8> %a) %0 = and i32 %vaddlv.i, 65535 @@ -179,12 +196,20 @@ entry: } define dso_local <8 x i8> @uaddlv_v8i8_dup(<8 x i8> %a) { -; CHECK-LABEL: uaddlv_v8i8_dup: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddlv h0, v0.8b -; CHECK-NEXT: dup v0.8h, v0.h[0] -; CHECK-NEXT: rshrn v0.8b, v0.8h, #3 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: uaddlv_v8i8_dup: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddlv h0, v0.8b +; CHECK-SD-NEXT: dup v0.8h, v0.h[0] +; CHECK-SD-NEXT: rshrn v0.8b, v0.8h, #3 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: uaddlv_v8i8_dup: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uaddlv h0, v0.8b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: dup v0.8h, w8 +; CHECK-GI-NEXT: rshrn v0.8b, v0.8h, #3 +; CHECK-GI-NEXT: ret entry: %vaddlv.i = tail call i32 @llvm.aarch64.neon.uaddlv.i32.v8i8(<8 x i8> %a) %0 = trunc i32 %vaddlv.i to i16 diff --git a/llvm/test/CodeGen/AArch64/popcount.ll b/llvm/test/CodeGen/AArch64/popcount.ll index 0a3ee98f843c8..b1231eeac1ea4 100644 --- a/llvm/test/CodeGen/AArch64/popcount.ll +++ b/llvm/test/CodeGen/AArch64/popcount.ll @@ -7,9 +7,8 @@ define i8 @popcount128(ptr nocapture nonnull readonly %0) { ; CHECK: // %bb.0: // %Entry ; CHECK-NEXT: ldr q0, [x0] ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlv h1, v0.16b -; CHECK-NEXT: // implicit-def: $q0 -; CHECK-NEXT: fmov s0, s1 +; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: // kill: def $q0 killed $h0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret @@ -38,19 +37,17 @@ define i16 @popcount256(ptr nocapture nonnull readonly %0) { ; CHECK-NEXT: mov v0.d[0], x9 ; CHECK-NEXT: mov v0.d[1], x8 ; CHECK-NEXT: cnt v1.16b, v1.16b -; CHECK-NEXT: uaddlv h2, v1.16b -; CHECK-NEXT: // implicit-def: $q1 -; CHECK-NEXT: fmov s1, s2 +; CHECK-NEXT: uaddlv h1, v1.16b +; CHECK-NEXT: // kill: def $q1 killed $h1 ; CHECK-NEXT: // kill: def $s1 killed $s1 killed $q1 -; CHECK-NEXT: mov w10, wzr ; CHECK-NEXT: fmov w0, s1 +; CHECK-NEXT: mov w10, wzr ; CHECK-NEXT: mov w9, w0 ; CHECK-NEXT: mov w8, w10 ; CHECK-NEXT: bfi x9, x8, #32, #32 ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlv h1, v0.16b -; CHECK-NEXT: // implicit-def: $q0 -; CHECK-NEXT: fmov s0, s1 +; 
CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: // kill: def $q0 killed $h0 ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: mov w8, w0 @@ -76,16 +73,15 @@ define <1 x i128> @popcount1x128(<1 x i128> %0) { ; CHECK-NEXT: mov v0.d[0], x0 ; CHECK-NEXT: mov v0.d[1], x1 ; CHECK-NEXT: cnt v0.16b, v0.16b -; CHECK-NEXT: uaddlv h1, v0.16b -; CHECK-NEXT: // implicit-def: $q0 -; CHECK-NEXT: fmov s0, s1 +; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: // kill: def $q0 killed $h0 +; CHECK-NEXT: mov x1, xzr ; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0 -; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: mov w8, wzr ; CHECK-NEXT: // kill: def $x0 killed $w0 ; CHECK-NEXT: // kill: def $x8 killed $w8 ; CHECK-NEXT: bfi x0, x8, #32, #32 -; CHECK-NEXT: mov x1, xzr ; CHECK-NEXT: ret Entry: %1 = tail call <1 x i128> @llvm.ctpop.v1.i128(<1 x i128> %0) From 54952e95b1ee1ab361f5523744f5a6eedaad4bd4 Mon Sep 17 00:00:00 2001 From: Haojian Wu Date: Mon, 22 Jan 2024 11:32:55 +0100 Subject: [PATCH 360/843] Fix an unused variable, NFC. --- llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 091b70e3c1f15..ae7ed08194d0d 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -1454,7 +1454,6 @@ bool AArch64LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, case Intrinsic::aarch64_neon_uaddlp: case Intrinsic::aarch64_neon_saddlp: { MachineIRBuilder MIB(MI); - MachineRegisterInfo &MRI = *MIB.getMRI(); unsigned Opc = IntrinsicID == Intrinsic::aarch64_neon_uaddlp ? AArch64::G_UADDLP From cc38cff05cfafb92bf91aadc39692ec5e12710a0 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Mon, 22 Jan 2024 11:36:27 +0100 Subject: [PATCH 361/843] [mlir][bazel] Fix BUILD after 9f7fff7f1391ea3bec394d8251b81cea92175cca. --- .../llvm-project-overlay/mlir/BUILD.bazel | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index cd3236b3f5a3d..c0c16957c5c89 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -3890,6 +3890,7 @@ cc_library( ":AMDGPUToROCDL", ":AffineToStandard", ":ArithToAMDGPU", + ":ArithToArmSME", ":ArithToLLVM", ":ArithToSPIRV", ":ArmNeon2dToIntr", @@ -8041,6 +8042,27 @@ cc_library( ], ) +cc_library( + name = "ArithToArmSME", + srcs = glob([ + "lib/Conversion/ArithToArmSME/*.cpp", + "lib/Conversion/ArithToArmSME/*.h", + ]), + hdrs = glob([ + "include/mlir/Conversion/ArithToArmSME/*.h", + ]), + includes = ["include"], + deps = [ + ":ArmSMEDialect", + ":ArithDialect", + ":ConversionPassIncGen", + ":IR", + ":Pass", + ":Transforms", + "//llvm:Support", + ], +) + cc_library( name = "ArithToLLVM", srcs = glob(["lib/Conversion/ArithToLLVM/*.cpp"]), From c9f5b5c935bd12d76d4bafff61d8116cb3229972 Mon Sep 17 00:00:00 2001 From: Mitch Phillips <31459023+hctim@users.noreply.github.com> Date: Mon, 22 Jan 2024 11:55:39 +0100 Subject: [PATCH 362/843] [MTE] Disable all MTE protection of globals in sections (#78443) Previous work in this area (#70186) disabled MTE in constructor sections. Looks like I missed one, ".preinit_array". 
Also, in the meantime, I found an exciting feature in the linker where
globals placed into an explicit section, where the section name is a
valid C identifier, get an implicit '__start_' and '__stop_' symbol as
well. This is convenient for iterating over some globals, but of course
iteration over differently-tagged globals in MTE explodes.

Thus, disable MTE globals for anything that has a section.
---
 clang/test/CodeGen/memtag-globals-asm.cpp    |  8 ++++++
 .../Target/AArch64/AArch64GlobalsTagging.cpp | 26 ++++++++++++++-----
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/clang/test/CodeGen/memtag-globals-asm.cpp b/clang/test/CodeGen/memtag-globals-asm.cpp
index 4b76b394e0c1d..186045f8f2fb5 100644
--- a/clang/test/CodeGen/memtag-globals-asm.cpp
+++ b/clang/test/CodeGen/memtag-globals-asm.cpp
@@ -271,6 +271,10 @@ CONSTRUCTOR(".ctors") func_t func_ctors = func_constructor;
 CONSTRUCTOR(".dtors") func_t func_dtors = func_constructor;
 CONSTRUCTOR(".init_array") func_t func_init_array = func_constructor;
 CONSTRUCTOR(".fini_array") func_t func_fini_array = func_constructor;
+CONSTRUCTOR(".preinit_array") func_t preinit_array = func_constructor;
+CONSTRUCTOR("array_of_globals") int global1;
+CONSTRUCTOR("array_of_globals") int global2;
+CONSTRUCTOR("array_of_globals") int global_string;

 // CHECK-NOT: .memtag func_constructor
 // CHECK-NOT: .memtag func_init
@@ -279,3 +283,7 @@ CONSTRUCTOR(".fini_array") func_t func_fini_array = func_constructor;
 // CHECK-NOT: .memtag func_dtors
 // CHECK-NOT: .memtag func_init_array
 // CHECK-NOT: .memtag func_fini_array
+// CHECK-NOT: .memtag preinit_array
+// CHECK-NOT: .memtag global1
+// CHECK-NOT: .memtag global2
+// CHECK-NOT: .memtag global_string
diff --git a/llvm/lib/Target/AArch64/AArch64GlobalsTagging.cpp b/llvm/lib/Target/AArch64/AArch64GlobalsTagging.cpp
index 8ce6f94e7341d..27959489e7dfa 100644
--- a/llvm/lib/Target/AArch64/AArch64GlobalsTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64GlobalsTagging.cpp
@@ -43,13 +43,25 @@ static bool shouldTagGlobal(GlobalVariable &G) {
     return false;
   }

-  // Don't instrument function pointers that are going into various init arrays
-  // via `__attribute__((section()))`:
-  // https://github.com/llvm/llvm-project/issues/69939
-  if (G.hasSection() &&
-      (G.getSection() == ".init" || G.getSection() == ".fini" ||
-       G.getSection() == ".init_array" || G.getSection() == ".fini_array" ||
-       G.getSection() == ".ctors" || G.getSection() == ".dtors")) {
+  // Globals can be placed implicitly or explicitly in sections. There are two
+  // different types of globals that meet these criteria and cause problems:
+  //  1. Function pointers that are going into various init arrays (either
+  //     explicitly through `__attribute__((section()))` or implicitly
+  //     through `__attribute__((constructor))`), such as ".(pre)init(_array)",
+  //     ".fini(_array)", ".ctors", and ".dtors". These function pointers end
+  //     up overaligned and overpadded, making iterating over them problematic,
+  //     and each function pointer is individually tagged (so the iteration
+  //     over them causes SIGSEGV/MTE[AS]ERR).
+  //  2. Global variables put into an explicit section, where the section's
+  //     name is a valid C-style identifier. The linker emits a `__start_` and
+  //     `__stop_` symbol for the section, so that you can iterate over
+  //     globals within this section. Unfortunately, again, these globals would
+  //     be tagged and so iteration causes SIGSEGV/MTE[AS]ERR.
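+  //     As a minimal illustration of case 2 (the section name "my_array" and
+  //     the consuming loop are hypothetical, not from a real report):
+  //       __attribute__((section("my_array"))) int a, b;
+  //       extern int __start_my_array[], __stop_my_array[];
+  //       for (int *p = __start_my_array; p != __stop_my_array; ++p)
+  //         use(*p); // tag mismatch once p crosses into the next global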
+ // + // To mitigate both these cases, and because specifying a section is rare + // outside of these two cases, disable MTE protection for globals in any + // section. + if (G.hasSection()) { Meta.Memtag = false; G.setSanitizerMetadata(Meta); return false; From c4fc563b8d41c28f3e4cbcd4ef943c26d234b5ae Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Mon, 22 Jan 2024 12:00:24 +0100 Subject: [PATCH 363/843] [JITLink][AArch32] Add GOT builder and implement R_ARM_GOT_PREL relocations for ELF (#78753) LLJIT needs this relocation for running deinitializers. Implementation and test are adapted from test arm-fpic-got.s in LLD. --- .../llvm/ExecutionEngine/JITLink/aarch32.h | 17 ++++- .../ExecutionEngine/JITLink/ELF_aarch32.cpp | 7 ++ llvm/lib/ExecutionEngine/JITLink/aarch32.cpp | 71 +++++++++++++++---- .../JITLink/AArch32/ELF_relocations_data.s | 50 ++++++++++--- 4 files changed, 124 insertions(+), 21 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h index 7765208b5e3df..0968a093279bf 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h @@ -41,7 +41,10 @@ enum EdgeKind_aarch32 : Edge::Kind { /// Absolute 32-bit value relocation Data_Pointer32, - LastDataRelocation = Data_Pointer32, + /// Create GOT entry and store offset + Data_RequestGOTAndTransformToDelta32, + + LastDataRelocation = Data_RequestGOTAndTransformToDelta32, /// /// Relocations of class Arm (covers fixed-width 4-byte instruction subset) @@ -318,6 +321,18 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, llvm_unreachable("Relocation must be of class Data, Arm or Thumb"); } +/// Populate a Global Offset Table from edges that request it. +class GOTBuilder : public TableManager { +public: + static StringRef getSectionName() { return "$__GOT"; } + + bool visitEdge(LinkGraph &G, Block *B, Edge &E); + Symbol &createEntry(LinkGraph &G, Symbol &Target); + +private: + Section *GOTSection = nullptr; +}; + /// Stubs builder for v7 emits non-position-independent Thumb stubs. 
/// /// Right now we only have one default stub kind, but we want to extend this diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp index b862a7ba2acc9..2553cd70a5769 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp @@ -35,6 +35,8 @@ Expected getJITLinkEdgeKind(uint32_t ELFType) { switch (ELFType) { case ELF::R_ARM_ABS32: return aarch32::Data_Pointer32; + case ELF::R_ARM_GOT_PREL: + return aarch32::Data_RequestGOTAndTransformToDelta32; case ELF::R_ARM_REL32: return aarch32::Data_Delta32; case ELF::R_ARM_CALL: @@ -71,6 +73,8 @@ Expected getELFRelocationType(Edge::Kind Kind) { return ELF::R_ARM_REL32; case aarch32::Data_Pointer32: return ELF::R_ARM_ABS32; + case aarch32::Data_RequestGOTAndTransformToDelta32: + return ELF::R_ARM_GOT_PREL; case aarch32::Arm_Call: return ELF::R_ARM_CALL; case aarch32::Arm_Jump24: @@ -222,6 +226,9 @@ Error buildTables_ELF_aarch32(LinkGraph &G) { StubsManagerType StubsManager; visitExistingEdges(G, StubsManager); + aarch32::GOTBuilder GOT; + visitExistingEdges(G, GOT); + return Error::success(); } diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp index 8153c97deff62..111527a39e06e 100644 --- a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp @@ -395,6 +395,7 @@ Expected readAddendData(LinkGraph &G, Block &B, Edge::OffsetT Offset, switch (Kind) { case Data_Delta32: case Data_Pointer32: + case Data_RequestGOTAndTransformToDelta32: return SignExtend64<32>(support::endian::read32(FixupPtr, Endian)); default: return make_error( @@ -464,15 +465,6 @@ Error applyFixupData(LinkGraph &G, Block &B, const Edge &E) { char *BlockWorkingMem = B.getAlreadyMutableContent().data(); char *FixupPtr = BlockWorkingMem + E.getOffset(); - auto Write32 = [FixupPtr, Endian = G.getEndianness()](int64_t Value) { - assert(isInt<32>(Value) && "Must be in signed 32-bit range"); - uint32_t Imm = static_cast(Value); - if (LLVM_LIKELY(Endian == endianness::little)) - endian::write32(FixupPtr, Imm); - else - endian::write32(FixupPtr, Imm); - }; - Edge::Kind Kind = E.getKind(); uint64_t FixupAddress = (B.getAddress() + E.getOffset()).getValue(); int64_t Addend = E.getAddend(); @@ -487,16 +479,24 @@ Error applyFixupData(LinkGraph &G, Block &B, const Edge &E) { int64_t Value = TargetAddress - FixupAddress + Addend; if (!isInt<32>(Value)) return makeTargetOutOfRangeError(G, B, E); - Write32(Value); + if (LLVM_LIKELY(G.getEndianness() == endianness::little)) + endian::write32le(FixupPtr, Value); + else + endian::write32be(FixupPtr, Value); return Error::success(); } case Data_Pointer32: { int64_t Value = TargetAddress + Addend; - if (!isInt<32>(Value)) + if (!isUInt<32>(Value)) return makeTargetOutOfRangeError(G, B, E); - Write32(Value); + if (LLVM_LIKELY(G.getEndianness() == endianness::little)) + endian::write32le(FixupPtr, Value); + else + endian::write32be(FixupPtr, Value); return Error::success(); } + case Data_RequestGOTAndTransformToDelta32: + llvm_unreachable("Should be transformed"); default: return make_error( "In graph " + G.getName() + ", section " + B.getSection().getName() + @@ -678,6 +678,52 @@ Error applyFixupThumb(LinkGraph &G, Block &B, const Edge &E, } } +const uint8_t GOTEntryInit[] = { + 0x00, + 0x00, + 0x00, + 0x00, +}; + +/// Create a new node in the link-graph for the given pointer value. 
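+///
+/// (A brief note on the mechanics, inferred from the code below: the block
+/// content starts out as four zero bytes, and for GOT entries the actual
+/// pointer value is filled in later through the Data_Pointer32 edge added in
+/// GOTBuilder::createEntry().)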
+template +static Block &allocPointer(LinkGraph &G, Section &S, + const uint8_t (&Content)[Size]) { + static_assert(Size == 4, "Pointers are 32-bit"); + constexpr uint64_t Alignment = 4; + ArrayRef Init(reinterpret_cast(Content), Size); + return G.createContentBlock(S, Init, orc::ExecutorAddr(), Alignment, 0); +} + +Symbol &GOTBuilder::createEntry(LinkGraph &G, Symbol &Target) { + if (!GOTSection) + GOTSection = &G.createSection(getSectionName(), orc::MemProt::Read); + Block &B = allocPointer(G, *GOTSection, GOTEntryInit); + constexpr int64_t GOTEntryAddend = 0; + B.addEdge(Data_Pointer32, 0, Target, GOTEntryAddend); + return G.addAnonymousSymbol(B, 0, B.getSize(), false, false); +} + +bool GOTBuilder::visitEdge(LinkGraph &G, Block *B, Edge &E) { + Edge::Kind KindToSet = Edge::Invalid; + switch (E.getKind()) { + case aarch32::Data_RequestGOTAndTransformToDelta32: { + KindToSet = aarch32::Data_Delta32; + break; + } + default: + return false; + } + LLVM_DEBUG(dbgs() << " Transforming " << G.getEdgeKindName(E.getKind()) + << " edge at " << B->getFixupAddress(E) << " (" + << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ") into " + << G.getEdgeKindName(KindToSet) << "\n"); + E.setKind(KindToSet); + E.setTarget(getEntryForTarget(G, E.getTarget())); + return true; +} + const uint8_t Thumbv7ABS[] = { 0x40, 0xf2, 0x00, 0x0c, // movw r12, #0x0000 ; lower 16-bit 0xc0, 0xf2, 0x00, 0x0c, // movt r12, #0x0000 ; upper 16-bit @@ -709,6 +755,7 @@ const char *getEdgeKindName(Edge::Kind K) { switch (K) { KIND_NAME_CASE(Data_Delta32) KIND_NAME_CASE(Data_Pointer32) + KIND_NAME_CASE(Data_RequestGOTAndTransformToDelta32) KIND_NAME_CASE(Arm_Call) KIND_NAME_CASE(Arm_Jump24) KIND_NAME_CASE(Arm_MovwAbsNC) diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s index 379d35fe4902c..f91a4733c40ee 100644 --- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s @@ -1,12 +1,13 @@ -# RUN: llvm-mc -triple=armv7-none-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_armv7.o %s -# RUN: llvm-objdump -r %t_armv7.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: rm -rf %t && mkdir -p %t/armv7 && mkdir -p %t/thumbv7 +# RUN: llvm-mc -triple=armv7-none-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t/armv7/out.o %s +# RUN: llvm-objdump -r %t/armv7/out.o | FileCheck --check-prefix=CHECK-TYPE %s # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb -slab-page-size 4096 \ -# RUN: -abs target=0x76bbe88f -check %s %t_armv7.o +# RUN: -abs target=0x76bbe88f -check %s %t/armv7/out.o -# RUN: llvm-mc -triple=thumbv7-none-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_thumbv7.o %s -# RUN: llvm-objdump -r %t_thumbv7.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-mc -triple=thumbv7-none-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t/thumbv7/out.o %s +# RUN: llvm-objdump -r %t/thumbv7/out.o | FileCheck --check-prefix=CHECK-TYPE %s # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb -slab-page-size 4096 \ -# RUN: -abs target=0x76bbe88f -check %s %t_thumbv7.o +# RUN: -abs target=0x76bbe88f -check %s %t/thumbv7/out.o .data .global target @@ -28,10 +29,43 @@ rel32: .word target - . 
   .size rel32, .-rel32

+# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_GOT_PREL target
+#
+# The GOT entry contains the absolute address of the external:
+# jitlink-check: *{4}(got_addr(out.o, target)) = target
+#
+# The embedded offset value contains the offset to the GOT entry relative to pc.
+# The +12 accounts for the ARM branch offset (8) and the .LPC offset (4), which
+# is stored as initial addend inline.
+# FIXME: We shouldn't need to subtract the 64-bit sign-extension manually.
+# jitlink-check: *{4}got_prel_offset = got_addr(out.o, target) - (got_prel + 12) - 0xffffffff00000000
+  .globl got_prel
+  .type got_prel,%function
+  .p2align 2
+  .code 32
+got_prel:
+  ldr r0, .LCPI
+.LPC:
+  ldr r0, [pc, r0]
+  ldr r0, [r0]
+  bx lr
+# Actual relocation site is on the embedded offset value:
+  .globl got_prel_offset
+got_prel_offset:
+.LCPI:
+  .long target(GOT_PREL)-((.LPC+8)-.LCPI)
+  .size got_prel_offset, .-got_prel_offset
+  .size got_prel, .-got_prel
+
+# This test is executable with any 4-byte external target:
+# > echo "unsigned target = 42;" | clang -target armv7-linux-gnueabihf -o target.o -c -xc -
+# > llvm-jitlink target.o armv7/out.o
+#
-# Empty main function for jitlink to be happy
 .globl  main
 .type   main, %function
 .p2align  2
 main:
-  bx lr
+  push {lr}
+  bl got_prel
+  pop {pc}
 .size   main, .-main

From 6aeb7a71d40faed14820523b5be24ff93a4e9bf9 Mon Sep 17 00:00:00 2001
From: Stephen Tozer
Date: Mon, 22 Jan 2024 11:05:27 +0000
Subject: [PATCH 364/843] [RemoveDIs][DebugInfo] Add interface changes for AT
 analysis (#78460)

This patch adds the preliminary changes for handling DPValues in
AssignmentTrackingAnalysis - very few functional changes are included,
but internal data structures have been changed to operate with DPValues
as well as Instructions, allowing future patches to process DPValues
correctly.
---
 llvm/include/llvm/IR/DebugInfo.h              |  14 ++-
 .../CodeGen/AssignmentTrackingAnalysis.cpp    |  91 +++++++++++----
 llvm/lib/IR/DebugInfo.cpp                     | 110 ++++++++++++++----
 llvm/lib/Transforms/Utils/Local.cpp           |  14 ---
 4 files changed, 172 insertions(+), 57 deletions(-)

diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h
index e634f4bd2f5aa..55b8ce0998413 100644
--- a/llvm/include/llvm/IR/DebugInfo.h
+++ b/llvm/include/llvm/IR/DebugInfo.h
@@ -58,6 +58,7 @@ DISubprogram *getDISubprogram(const MDNode *Scope);
 /// Produce a DebugLoc to use for each dbg.declare that is promoted to a
 /// dbg.value.
 DebugLoc getDebugValueLoc(DbgVariableIntrinsic *DII);
+DebugLoc getDebugValueLoc(DPValue *DPV);
 
 /// Strip debug info in the module if it exists.
 ///
@@ -223,6 +224,11 @@ inline AssignmentMarkerRange getAssignmentMarkers(const Instruction *Inst) {
   else
     return make_range(Value::user_iterator(), Value::user_iterator());
 }
+inline SmallVector getDPVAssignmentMarkers(const Instruction *Inst) {
+  if (auto *ID = Inst->getMetadata(LLVMContext::MD_DIAssignID))
+    return cast(ID)->getAllDPValueUsers();
+  return {};
+}
 
 /// Delete the llvm.dbg.assign intrinsics linked to \p Inst.
 void deleteAssignmentMarkers(const Instruction *Inst);
@@ -244,7 +250,11 @@ void deleteAll(Function *F);
 /// Result contains a zero-sized fragment if there's no intersect.
bool calculateFragmentIntersect( const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits, - uint64_t SliceSizeInBits, const DbgAssignIntrinsic *DAI, + uint64_t SliceSizeInBits, const DbgAssignIntrinsic *DbgAssign, + std::optional &Result); +bool calculateFragmentIntersect( + const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits, + uint64_t SliceSizeInBits, const DPValue *DPVAssign, std::optional &Result); /// Helper struct for trackAssignments, below. We don't use the similar @@ -259,6 +269,8 @@ struct VarRecord { VarRecord(DbgVariableIntrinsic *DVI) : Var(DVI->getVariable()), DL(getDebugValueLoc(DVI)) {} + VarRecord(DPValue *DPV) + : Var(DPV->getVariable()), DL(getDebugValueLoc(DPV)) {} VarRecord(DILocalVariable *Var, DILocation *DL) : Var(Var), DL(DL) {} friend bool operator<(const VarRecord &LHS, const VarRecord &RHS) { return std::tie(LHS.Var, LHS.DL) < std::tie(RHS.Var, RHS.DL); diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp index 5ed7b020862a7..52b464cc76af4 100644 --- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp +++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp @@ -20,6 +20,7 @@ #include "llvm/IR/BasicBlock.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugProgramInstruction.h" #include "llvm/IR/Function.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/IntrinsicInst.h" @@ -81,6 +82,19 @@ template <> struct llvm::DenseMapInfo { } }; +using VarLocInsertPt = PointerUnion; + +namespace std { +template <> struct hash { + using argument_type = VarLocInsertPt; + using result_type = std::size_t; + + result_type operator()(const argument_type &Arg) const { + return std::hash()(Arg.getOpaqueValue()); + } +}; +} // namespace std + /// Helper class to build FunctionVarLocs, since that class isn't easy to /// modify. TODO: There's not a great deal of value in the split, it could be /// worth merging the two classes. @@ -89,8 +103,7 @@ class FunctionVarLocsBuilder { UniqueVector Variables; // Use an unordered_map so we don't invalidate iterators after // insert/modifications. - std::unordered_map> - VarLocsBeforeInst; + std::unordered_map> VarLocsBeforeInst; SmallVector SingleLocVars; @@ -109,7 +122,7 @@ class FunctionVarLocsBuilder { /// Return ptr to wedge of defs or nullptr if no defs come just before /p /// Before. - const SmallVectorImpl *getWedge(const Instruction *Before) const { + const SmallVectorImpl *getWedge(VarLocInsertPt Before) const { auto R = VarLocsBeforeInst.find(Before); if (R == VarLocsBeforeInst.end()) return nullptr; @@ -117,7 +130,7 @@ class FunctionVarLocsBuilder { } /// Replace the defs that come just before /p Before with /p Wedge. - void setWedge(const Instruction *Before, SmallVector &&Wedge) { + void setWedge(VarLocInsertPt Before, SmallVector &&Wedge) { VarLocsBeforeInst[Before] = std::move(Wedge); } @@ -133,7 +146,7 @@ class FunctionVarLocsBuilder { } /// Add a def to the wedge of defs just before /p Before. 
- void addVarLoc(Instruction *Before, DebugVariable Var, DIExpression *Expr, + void addVarLoc(VarLocInsertPt Before, DebugVariable Var, DIExpression *Expr, DebugLoc DL, RawLocationWrapper R) { VarLocInfo VarLoc; VarLoc.VariableID = insertVariable(Var); @@ -201,15 +214,31 @@ void FunctionVarLocs::init(FunctionVarLocsBuilder &Builder) { SingleVarLocEnd = VarLocRecords.size(); // Insert a contiguous block of VarLocInfos for each instruction, mapping it - // to the start and end position in the vector with VarLocsBeforeInst. + // to the start and end position in the vector with VarLocsBeforeInst. This + // block includes VarLocs for any DPValues attached to that instruction. for (auto &P : Builder.VarLocsBeforeInst) { + // Process VarLocs attached to a DPValue alongside their marker Instruction. + if (isa(P.first)) + continue; + const Instruction *I = cast(P.first); unsigned BlockStart = VarLocRecords.size(); + // Any VarLocInfos attached to a DPValue should now be remapped to their + // marker Instruction, in order of DPValue appearance and prior to any + // VarLocInfos attached directly to that instruction. + for (const DPValue &DPV : I->getDbgValueRange()) { + // Even though DPV defines a variable location, VarLocsBeforeInst can + // still be empty if that VarLoc was redundant. + if (!Builder.VarLocsBeforeInst.count(&DPV)) + continue; + for (const VarLocInfo &VarLoc : Builder.VarLocsBeforeInst[&DPV]) + VarLocRecords.emplace_back(VarLoc); + } for (const VarLocInfo &VarLoc : P.second) VarLocRecords.emplace_back(VarLoc); unsigned BlockEnd = VarLocRecords.size(); // Record the start and end indices. if (BlockEnd != BlockStart) - VarLocsBeforeInst[P.first] = {BlockStart, BlockEnd}; + VarLocsBeforeInst[I] = {BlockStart, BlockEnd}; } // Copy the Variables vector from the builder's UniqueVector. @@ -370,7 +399,7 @@ class MemLocFragmentFill { unsigned SizeInBits; DebugLoc DL; }; - using InsertMap = MapVector>; + using InsertMap = MapVector>; /// BBInsertBeforeMap holds a description for the set of location defs to be /// inserted after the analysis is complete. It is updated during the dataflow @@ -590,7 +619,7 @@ class MemLocFragmentFill { return /*Changed=*/false; } - void insertMemLoc(BasicBlock &BB, Instruction &Before, unsigned Var, + void insertMemLoc(BasicBlock &BB, VarLocInsertPt Before, unsigned Var, unsigned StartBit, unsigned EndBit, unsigned Base, DebugLoc DL) { assert(StartBit < EndBit && "Cannot create fragment of size <= 0"); @@ -603,7 +632,7 @@ class MemLocFragmentFill { assert(Base && "Expected a non-zero ID for Base address"); Loc.Base = Base; Loc.DL = DL; - BBInsertBeforeMap[&BB][&Before].push_back(Loc); + BBInsertBeforeMap[&BB][Before].push_back(Loc); LLVM_DEBUG(dbgs() << "Add mem def for " << Aggregates[Var].first->getName() << " bits [" << StartBit << ", " << EndBit << ")\n"); } @@ -612,7 +641,7 @@ class MemLocFragmentFill { /// in \p FragMap starts before \p StartBit or ends after \p EndBit (which /// indicates - assuming StartBit->EndBit has just been inserted - that the /// slice has been coalesced in the map). 
- void coalesceFragments(BasicBlock &BB, Instruction &Before, unsigned Var, + void coalesceFragments(BasicBlock &BB, VarLocInsertPt Before, unsigned Var, unsigned StartBit, unsigned EndBit, unsigned Base, DebugLoc DL, const FragsInMemMap &FragMap) { if (!CoalesceAdjacentFragments) @@ -633,7 +662,7 @@ class MemLocFragmentFill { Base, DL); } - void addDef(const VarLocInfo &VarLoc, Instruction &Before, BasicBlock &BB, + void addDef(const VarLocInfo &VarLoc, VarLocInsertPt Before, BasicBlock &BB, VarFragMap &LiveSet) { DebugVariable DbgVar = FnVarLocs->getVariable(VarLoc.VariableID); if (skipVariable(DbgVar.getVariable())) @@ -802,7 +831,7 @@ class MemLocFragmentFill { for (auto &I : BB) { if (const auto *Locs = FnVarLocs->getWedge(&I)) { for (const VarLocInfo &Loc : *Locs) { - addDef(Loc, I, *I.getParent(), LiveSet); + addDef(Loc, &I, *I.getParent(), LiveSet); } } } @@ -923,7 +952,7 @@ class MemLocFragmentFill { for (auto &Pair : BBInsertBeforeMap) { InsertMap &Map = Pair.second; for (auto &Pair : Map) { - Instruction *InsertBefore = Pair.first; + auto InsertBefore = Pair.first; assert(InsertBefore && "should never be null"); auto FragMemLocs = Pair.second; auto &Ctx = Fn.getContext(); @@ -1056,11 +1085,12 @@ class AssignmentTrackingLowering { UntaggedStoreAssignmentMap UntaggedStoreVars; // Machinery to defer inserting dbg.values. - using InsertMap = MapVector>; - InsertMap InsertBeforeMap; + using InstInsertMap = MapVector>; + InstInsertMap InsertBeforeMap; /// Clear the location definitions currently cached for insertion after /p /// After. void resetInsertionPoint(Instruction &After); + void resetInsertionPoint(DPValue &After); void emitDbgValue(LocKind Kind, const DbgVariableIntrinsic *Source, Instruction *After); @@ -1418,6 +1448,24 @@ const char *locStr(AssignmentTrackingLowering::LocKind Loc) { } #endif +VarLocInsertPt getNextNode(const DPValue *DPV) { + auto NextIt = ++(DPV->getIterator()); + if (NextIt == DPV->getMarker()->getDbgValueRange().end()) + return DPV->getMarker()->MarkedInstr; + return &*NextIt; +} +VarLocInsertPt getNextNode(const Instruction *Inst) { + const Instruction *Next = Inst->getNextNode(); + if (!Next->hasDbgValues()) + return Next; + return &*Next->getDbgValueRange().begin(); +} +VarLocInsertPt getNextNode(VarLocInsertPt InsertPt) { + if (isa(InsertPt)) + return getNextNode(cast(InsertPt)); + return getNextNode(cast(InsertPt)); +} + void AssignmentTrackingLowering::emitDbgValue( AssignmentTrackingLowering::LocKind Kind, const DbgVariableIntrinsic *Source, Instruction *After) { @@ -1430,7 +1478,7 @@ void AssignmentTrackingLowering::emitDbgValue( PoisonValue::get(Type::getInt1Ty(Source->getContext()))); // Find a suitable insert point. - Instruction *InsertBefore = After->getNextNode(); + auto InsertBefore = getNextNode(After); assert(InsertBefore && "Shouldn't be inserting after a terminator"); VariableID Var = getVariableID(DebugVariable(Source)); @@ -1538,8 +1586,9 @@ void AssignmentTrackingLowering::processUntaggedInstruction( Ops.push_back(dwarf::DW_OP_deref); DIE = DIExpression::prependOpcodes(DIE, Ops, /*StackValue=*/false, /*EntryValue=*/false); - // Find a suitable insert point. - Instruction *InsertBefore = I.getNextNode(); + // Find a suitable insert point, before the next instruction or DPValue + // after I. + auto InsertBefore = getNextNode(&I); assert(InsertBefore && "Shouldn't be inserting after a terminator"); // Get DILocation for this unrecorded assignment. 
@@ -1710,8 +1759,8 @@ void AssignmentTrackingLowering::processDbgValue(DbgValueInst &DVI, emitDbgValue(LocKind::Val, &DVI, &DVI); } -static bool hasZeroSizedFragment(DbgVariableIntrinsic &DVI) { - if (auto F = DVI.getExpression()->getFragmentInfo()) +template static bool hasZeroSizedFragment(T &DbgValue) { + if (auto F = DbgValue.getExpression()->getFragmentInfo()) return F->SizeInBits == 0; return false; } diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 312d670d5b30e..fcd3f77f8f6e2 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -163,6 +163,18 @@ DebugLoc llvm::getDebugValueLoc(DbgVariableIntrinsic *DII) { return DILocation::get(DII->getContext(), 0, 0, Scope, InlinedAt); } +DebugLoc llvm::getDebugValueLoc(DPValue *DPV) { + // Original dbg.declare must have a location. + const DebugLoc &DeclareLoc = DPV->getDebugLoc(); + MDNode *Scope = DeclareLoc.getScope(); + DILocation *InlinedAt = DeclareLoc.getInlinedAt(); + // Because no machine insts can come from debug intrinsics, only the scope + // and inlinedAt is significant. Zero line numbers are used in case this + // DebugLoc leaks into any adjacent instructions. Produce an unknown location + // with the correct scope / inlinedAt fields. + return DILocation::get(DPV->getContext(), 0, 0, Scope, InlinedAt); +} + //===----------------------------------------------------------------------===// // DebugInfoFinder implementations. //===----------------------------------------------------------------------===// @@ -1779,11 +1791,14 @@ AssignmentMarkerRange at::getAssignmentMarkers(DIAssignID *ID) { void at::deleteAssignmentMarkers(const Instruction *Inst) { auto Range = getAssignmentMarkers(Inst); - if (Range.empty()) + SmallVector DPVAssigns = getDPVAssignmentMarkers(Inst); + if (Range.empty() && DPVAssigns.empty()) return; SmallVector ToDelete(Range.begin(), Range.end()); for (auto *DAI : ToDelete) DAI->eraseFromParent(); + for (auto *DPV : DPVAssigns) + DPV->eraseFromParent(); } void at::RAUW(DIAssignID *Old, DIAssignID *New) { @@ -1813,9 +1828,34 @@ void at::deleteAll(Function *F) { DAI->eraseFromParent(); } -bool at::calculateFragmentIntersect( +/// Get the FragmentInfo for the variable if it exists, otherwise return a +/// FragmentInfo that covers the entire variable if the variable size is +/// known, otherwise return a zero-sized fragment. +static DIExpression::FragmentInfo +getFragmentOrEntireVariable(const DPValue *DPV) { + DIExpression::FragmentInfo VariableSlice(0, 0); + // Get the fragment or variable size, or zero. + if (auto Sz = DPV->getFragmentSizeInBits()) + VariableSlice.SizeInBits = *Sz; + if (auto Frag = DPV->getExpression()->getFragmentInfo()) + VariableSlice.OffsetInBits = Frag->OffsetInBits; + return VariableSlice; +} + +static DIExpression::FragmentInfo +getFragmentOrEntireVariable(const DbgVariableIntrinsic *DVI) { + DIExpression::FragmentInfo VariableSlice(0, 0); + // Get the fragment or variable size, or zero. + if (auto Sz = DVI->getFragmentSizeInBits()) + VariableSlice.SizeInBits = *Sz; + if (auto Frag = DVI->getExpression()->getFragmentInfo()) + VariableSlice.OffsetInBits = Frag->OffsetInBits; + return VariableSlice; +} +template +bool calculateFragmentIntersectImpl( const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits, - uint64_t SliceSizeInBits, const DbgAssignIntrinsic *DAI, + uint64_t SliceSizeInBits, const T *AssignRecord, std::optional &Result) { // There are multiple offsets at play in this function, so let's break it // down. 
Starting with how variables may be stored in allocas: @@ -1844,7 +1884,7 @@ bool at::calculateFragmentIntersect( // dbg.assign, that has been killed, if any. // // calculateFragmentIntersect(..., SliceOffsetInBits=0, - // SliceSizeInBits=32, Dest=%dest, DAI=dbg.assign) + // SliceSizeInBits=32, Dest=%dest, Assign=dbg.assign) // // Drawing the store (s) in memory followed by the shortened version ($), // then the dbg.assign (d), with the fragment information on a seperate scale @@ -1857,7 +1897,8 @@ bool at::calculateFragmentIntersect( // | | // s[######] - Original stores 64 bits to Dest. // $----[##] - DSE says the lower 32 bits are dead, to be removed. - // d [##] - DAI's address-modifying expression adds 4 bytes to dest. + // d [##] - Assign's address-modifying expression adds 4 bytes to + // dest. // Variable | | // Fragment 128| // Offsets 159 @@ -1872,10 +1913,10 @@ bool at::calculateFragmentIntersect( // // 3. That offset along with the store size (32) represents the bits of the // variable that'd be affected by the store. Call it SliceOfVariable. - // Intersect that with DAI's fragment info: - // SliceOfVariable ∩ DAI_fragment = none + // Intersect that with Assign's fragment info: + // SliceOfVariable ∩ Assign_fragment = none // - // In this case: none of the dead bits of the store affect DAI. + // In this case: none of the dead bits of the store affect Assign. // // # Example 2 // Similar example with the same goal. This time the upper 16 bits @@ -1886,7 +1927,7 @@ bool at::calculateFragmentIntersect( // !DIExpression(DW_OP_plus_uconst, 4)) // // calculateFragmentIntersect(..., SliceOffsetInBits=48, - // SliceSizeInBits=16, Dest=%dest, DAI=dbg.assign) + // SliceSizeInBits=16, Dest=%dest, Assign=dbg.assign) // // Memory // offset @@ -1895,7 +1936,8 @@ bool at::calculateFragmentIntersect( // | | // s[######] - Original stores 64 bits to Dest. // $[####]-- - DSE says the upper 16 bits are dead, to be removed. - // d [##] - DAI's address-modifying expression adds 4 bytes to dest. + // d [##] - Assign's address-modifying expression adds 4 bytes to + // dest. // Variable | | // Fragment 128| // Offsets 159 @@ -1904,13 +1946,14 @@ bool at::calculateFragmentIntersect( // 1. SliceOffsetInBits:48 + VarFrag.OffsetInBits:128 = 176 // 2. 176 - (expression_offset:32 + (d.address - dest):0) = 144 // 3. SliceOfVariable offset = 144, size = 16: - // SliceOfVariable ∩ DAI_fragment = (offset: 144, size: 16) - // SliceOfVariable tells us the bits of the variable described by DAI that are - // affected by the DSE. - if (DAI->isKillAddress()) + // SliceOfVariable ∩ Assign_fragment = (offset: 144, size: 16) + // SliceOfVariable tells us the bits of the variable described by Assign that + // are affected by the DSE. + if (AssignRecord->isKillAddress()) return false; - DIExpression::FragmentInfo VarFrag = DAI->getFragmentOrEntireVariable(); + DIExpression::FragmentInfo VarFrag = + getFragmentOrEntireVariable(AssignRecord); if (VarFrag.SizeInBits == 0) return false; // Variable size is unknown. @@ -1918,12 +1961,14 @@ bool at::calculateFragmentIntersect( // address-modifying expression. int64_t PointerOffsetInBits; { - auto DestOffsetInBytes = DAI->getAddress()->getPointerOffsetFrom(Dest, DL); + auto DestOffsetInBytes = + AssignRecord->getAddress()->getPointerOffsetFrom(Dest, DL); if (!DestOffsetInBytes) return false; // Can't calculate difference in addresses. 
int64_t ExprOffsetInBytes; - if (!DAI->getAddressExpression()->extractIfOffset(ExprOffsetInBytes)) + if (!AssignRecord->getAddressExpression()->extractIfOffset( + ExprOffsetInBytes)) return false; int64_t PointerOffsetInBytes = *DestOffsetInBytes + ExprOffsetInBytes; @@ -1937,7 +1982,8 @@ bool at::calculateFragmentIntersect( if (NewOffsetInBits < 0) return false; // Fragment offsets can only be positive. DIExpression::FragmentInfo SliceOfVariable(SliceSizeInBits, NewOffsetInBits); - // Intersect the variable slice with DAI's fragment to trim it down to size. + // Intersect the variable slice with AssignRecord's fragment to trim it down + // to size. DIExpression::FragmentInfo TrimmedSliceOfVariable = DIExpression::FragmentInfo::intersect(SliceOfVariable, VarFrag); if (TrimmedSliceOfVariable == VarFrag) @@ -1946,6 +1992,20 @@ bool at::calculateFragmentIntersect( Result = TrimmedSliceOfVariable; return true; } +bool at::calculateFragmentIntersect( + const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits, + uint64_t SliceSizeInBits, const DbgAssignIntrinsic *DbgAssign, + std::optional &Result) { + return calculateFragmentIntersectImpl(DL, Dest, SliceOffsetInBits, + SliceSizeInBits, DbgAssign, Result); +} +bool at::calculateFragmentIntersect( + const DataLayout &DL, const Value *Dest, uint64_t SliceOffsetInBits, + uint64_t SliceSizeInBits, const DPValue *DPVAssign, + std::optional &Result) { + return calculateFragmentIntersectImpl(DL, Dest, SliceOffsetInBits, + SliceSizeInBits, DPVAssign, Result); +} /// Collect constant properies (base, size, offset) of \p StoreDest. /// Return std::nullopt if any properties are not constants or the @@ -2194,7 +2254,9 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { for (auto &P : DbgDeclares) { const AllocaInst *Alloca = P.first; auto Markers = at::getAssignmentMarkers(Alloca); + SmallVector DPMarkers = at::getDPVAssignmentMarkers(Alloca); (void)Markers; + (void)DPMarkers; for (DbgDeclareInst *DDI : P.second) { // Assert that the alloca that DDI uses is now linked to a dbg.assign // describing the same variable (i.e. check that this dbg.declare has @@ -2203,9 +2265,15 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { // fragment. e.g. if the alloca is smaller than the variable, then // trackAssignments will create an alloca-sized fragment for the // dbg.assign. - assert(llvm::any_of(Markers, [DDI](DbgAssignIntrinsic *DAI) { - return DebugVariableAggregate(DAI) == DebugVariableAggregate(DDI); - })); + assert(llvm::any_of(Markers, + [DDI](DbgAssignIntrinsic *DAI) { + return DebugVariableAggregate(DAI) == + DebugVariableAggregate(DDI); + }) || + llvm::any_of(DPMarkers, [DDI](DPValue *DPV) { + return DebugVariableAggregate(DPV) == + DebugVariableAggregate(DDI); + })); // Delete DDI because the variable location is now tracked using // assignment tracking. DDI->eraseFromParent(); diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index d1b42f28923f5..2a1ac85ee55bf 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1724,20 +1724,6 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, SI->getIterator()); } -namespace llvm { -// RemoveDIs: duplicate the getDebugValueLoc method using DPValues instead of -// dbg.value intrinsics. In llvm namespace so that it overloads the -// DbgVariableIntrinsic version. -static DebugLoc getDebugValueLoc(DPValue *DPV) { - // Original dbg.declare must have a location. 
- const DebugLoc &DeclareLoc = DPV->getDebugLoc(); - MDNode *Scope = DeclareLoc.getScope(); - DILocation *InlinedAt = DeclareLoc.getInlinedAt(); - // Produce an unknown location with the correct scope / inlinedAt fields. - return DILocation::get(DPV->getContext(), 0, 0, Scope, InlinedAt); -} -} // namespace llvm - /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value /// that has an associated llvm.dbg.declare intrinsic. void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, From ad01447d30ed48e127254e0c45350c938d72c966 Mon Sep 17 00:00:00 2001 From: Michael Schellenberger Costa Date: Mon, 22 Jan 2024 03:10:30 -0800 Subject: [PATCH 365/843] [libcxx] Fix typo in parallel `for_each_n` test (#78954) This fixes a trivial copy and paste error where we forgot to change `for_each` to `for_each_n` --- .../alg.nonmodifying/alg.foreach/pstl.for_each_n.pass.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.for_each_n.pass.cpp b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.for_each_n.pass.cpp index d5b42966b0000..c00eed1ead564 100644 --- a/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.for_each_n.pass.cpp +++ b/libcxx/test/std/algorithms/alg.nonmodifying/alg.foreach/pstl.for_each_n.pass.cpp @@ -25,10 +25,10 @@ #include "test_execution_policies.h" #include "test_iterators.h" -EXECUTION_POLICY_SFINAE_TEST(for_each); +EXECUTION_POLICY_SFINAE_TEST(for_each_n); -static_assert(sfinae_test_for_each); -static_assert(!sfinae_test_for_each); +static_assert(sfinae_test_for_each_n); +static_assert(!sfinae_test_for_each_n); template struct Test { From f45249f05ffcda0456dd1d5e9f9a7f0d75a20f84 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 22 Jan 2024 06:29:34 -0500 Subject: [PATCH 366/843] [gn] port 03c19e91e8d8 --- llvm/utils/gn/secondary/libcxx/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn index 30217374904a3..6ab1a7d6f4eba 100644 --- a/llvm/utils/gn/secondary/libcxx/include/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/include/BUILD.gn @@ -643,6 +643,7 @@ if (current_toolchain == default_toolchain) { "__numeric/pstl_reduce.h", "__numeric/pstl_transform_reduce.h", "__numeric/reduce.h", + "__numeric/saturation_arithmetic.h", "__numeric/transform_exclusive_scan.h", "__numeric/transform_inclusive_scan.h", "__numeric/transform_reduce.h", From a590f2315f45615920f244dcce12a7e169148da7 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Mon, 22 Jan 2024 11:38:00 +0000 Subject: [PATCH 367/843] [hwasan] Update dbg.assign intrinsics in HWAsan pass (#78606) llvm.dbg.assign intrinsics have 2 {value, expression} pairs; fix hwasan to update the second expression. 
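For reference, a dbg.assign names both pairs in a single call. A minimal
sketch of the operand layout (the SSA values and metadata numbers here
are illustrative, not taken from the tests below):

  call void @llvm.dbg.assign(metadata i32 %v,      ;; value
                  metadata !1,                     ;; variable
                  metadata !DIExpression(),        ;; value expression
                  metadata !2,                     ;; DIAssignID link
                  metadata ptr %a,                 ;; address
                  metadata !DIExpression(DW_OP_LLVM_tag_offset, 0)) ;; address expression

HWAsan already appended DW_OP_LLVM_tag_offset to the value expression;
this change makes it rewrite the address expression as well (see
setAddressExpression in the diff below).
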
Fixes #76545 --- llvm/lib/IR/DebugInfo.cpp | 4 -- .../Instrumentation/HWAddressSanitizer.cpp | 5 ++ .../Transforms/Utils/MemoryTaggingSupport.cpp | 10 ++- .../AArch64/dbg-assign-tag-offset-mix-loc.ll | 71 +++++++++++++++++++ .../CodeGen/AArch64/dbg-assign-tag-offset.ll | 71 +++++++++++++++++++ .../declare-to-assign/hwasan.ll | 2 +- .../dbg-assign-tag-offset.ll | 59 +++++++++++++++ 7 files changed, 214 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll create mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll create mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index fcd3f77f8f6e2..2e64d0db57b25 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2200,10 +2200,6 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { if (F.hasFnAttribute(Attribute::OptimizeNone)) return /*Changed*/ false; - // FIXME: https://github.com/llvm/llvm-project/issues/76545 - if (F.hasFnAttribute(Attribute::SanitizeHWAddress)) - return /*Changed*/ false; - bool Changed = false; auto *DL = &F.getParent()->getDataLayout(); // Collect a map of {backing storage : dbg.declares} (currently "backing diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index efb621cde9065..b213915dbcf62 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1435,6 +1435,11 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, if (DDI->getVariableLocationOp(LocNo) == AI) DDI->setExpression(DIExpression::appendOpsToArg(DDI->getExpression(), NewOps, LocNo)); + if (auto *DAI = dyn_cast(DDI)) { + assert(DAI->getAddress() == AI); + DAI->setAddressExpression( + DIExpression::prependOpcodes(DDI->getExpression(), NewOps)); + } } auto TagEnd = [&](Instruction *Node) { diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index f94047633022c..d2efcde5d3803 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -138,16 +138,20 @@ void StackInfoBuilder::visit(Instruction &Inst) { return; } if (auto *DVI = dyn_cast(&Inst)) { - for (Value *V : DVI->location_ops()) { + auto AddIfInteresting = [&](Value *V) { if (auto *AI = dyn_cast_or_null(V)) { if (!isInterestingAlloca(*AI)) - continue; + return; AllocaInfo &AInfo = Info.AllocasToInstrument[AI]; auto &DVIVec = AInfo.DbgVariableIntrinsics; if (DVIVec.empty() || DVIVec.back() != DVI) DVIVec.push_back(DVI); } - } + }; + for (Value *V : DVI->location_ops()) + AddIfInteresting(V); + if (auto *DAI = dyn_cast(DVI)) + AddIfInteresting(DAI->getAddress()); } Instruction *ExitUntag = getUntagLocationIfFunctionExit(Inst); if (ExitUntag) diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll new file mode 100644 index 0000000000000..ef0dd46cb45c7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll @@ -0,0 +1,71 @@ +; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s + +;; Similar to dbg-assign-tag-offset.ll except the variable 'x' has been removed +;; and 'y' has an implicit location range as well as stack location range +;; (according to the hand-modified debug info -- see the dbg.value). 
+ +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-android24" + +; CHECK: DW_TAG_variable +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_LLVM_tag_offset (0x80) +; CHECK-NEXT: DW_AT_name ("y") + +define dso_local void @f() !dbg !14 { + %1 = alloca i32, align 4, !DIAssignID !31 + %2 = alloca i32, align 4, !DIAssignID !32 + call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + call void @llvm.dbg.value(metadata i32 2, metadata !20, metadata !DIExpression()), !dbg !22 + call void @use(ptr null), !dbg !28 + store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 + call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + call void @use(ptr nonnull %1), !dbg !28 + call void @use(ptr nonnull %2), !dbg !29 + ret void, !dbg !30 +} + +declare !dbg !5 void @use(ptr) + +declare void @llvm.dbg.value(metadata, metadata, metadata) +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} +!llvm.ident = !{!13} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) +!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") +!2 = !{} +!3 = !{!4, !5} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) +!6 = !DISubroutineType(types: !7) +!7 = !{null, !4} +!8 = !{i32 7, !"Dwarf Version", i32 4} +!9 = !{i32 2, !"Debug Info Version", i32 3} +!10 = !{i32 1, !"wchar_size", i32 4} +!11 = !{i32 7, !"PIC Level", i32 2} +!12 = !{i32 7, !"PIE Level", i32 2} +!13 = !{!"clang version 10.0.0"} +!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) +!15 = !DISubroutineType(types: !16) +!16 = !{null} +!17 = !{!18, !20} +!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) +!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) +!21 = !DILocation(line: 5, column: 3, scope: !14) +!22 = !DILocation(line: 0, scope: !14) +!23 = !DILocation(line: 5, column: 10, scope: !14) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !26, i64 0} +!26 = !{!"omnipotent char", !27, i64 0} +!27 = !{!"Simple C++ TBAA"} +!28 = !DILocation(line: 6, column: 3, scope: !14) +!29 = !DILocation(line: 7, column: 3, scope: !14) +!30 = !DILocation(line: 8, column: 1, scope: !14) +!31 = distinct !DIAssignID() +!32 = distinct !DIAssignID() +!33 = distinct !DIAssignID() +!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll new file mode 100644 index 0000000000000..a587f93d14d70 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll @@ -0,0 +1,71 @@ +; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s 
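+
+;; A short note on intent (inferred from the CHECK lines below): each alloca
+;; is described by a dbg.assign whose address expression carries
+;; DW_OP_LLVM_tag_offset, and that offset must surface in DWARF as
+;; DW_AT_LLVM_tag_offset (0x00 for 'x', 0x80 for 'y').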
+ +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-android24" + +; CHECK: DW_TAG_variable +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_LLVM_tag_offset (0x00) +; CHECK-NEXT: DW_AT_name ("x") + +; CHECK: DW_TAG_variable +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_LLVM_tag_offset (0x80) +; CHECK-NEXT: DW_AT_name ("y") + +define dso_local void @f() !dbg !14 { + %1 = alloca i32, align 4, !DIAssignID !31 + call void @llvm.dbg.assign(metadata i1 undef, metadata !18, metadata !DIExpression(), metadata !31, metadata ptr %1, metadata !DIExpression(DW_OP_LLVM_tag_offset, 0)), !dbg !22 + %2 = alloca i32, align 4, !DIAssignID !32 + call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 + call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + call void @use(ptr nonnull %1), !dbg !28 + call void @use(ptr nonnull %2), !dbg !29 + ret void, !dbg !30 +} + +declare !dbg !5 void @use(ptr) + +declare void @llvm.dbg.value(metadata, metadata, metadata) +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} +!llvm.ident = !{!13} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) +!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") +!2 = !{} +!3 = !{!4, !5} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) +!6 = !DISubroutineType(types: !7) +!7 = !{null, !4} +!8 = !{i32 7, !"Dwarf Version", i32 4} +!9 = !{i32 2, !"Debug Info Version", i32 3} +!10 = !{i32 1, !"wchar_size", i32 4} +!11 = !{i32 7, !"PIC Level", i32 2} +!12 = !{i32 7, !"PIE Level", i32 2} +!13 = !{!"clang version 10.0.0"} +!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) +!15 = !DISubroutineType(types: !16) +!16 = !{null} +!17 = !{!18, !20} +!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) +!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) +!21 = !DILocation(line: 5, column: 3, scope: !14) +!22 = !DILocation(line: 0, scope: !14) +!23 = !DILocation(line: 5, column: 10, scope: !14) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !26, i64 0} +!26 = !{!"omnipotent char", !27, i64 0} +!27 = !{!"Simple C++ TBAA"} +!28 = !DILocation(line: 6, column: 3, scope: !14) +!29 = !DILocation(line: 7, column: 3, scope: !14) +!30 = !DILocation(line: 8, column: 1, scope: !14) +!31 = distinct !DIAssignID() +!32 = distinct !DIAssignID() +!33 = distinct !DIAssignID() +!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll 
b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll index c4b209de77017..6c9366609cba2 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll @@ -1,6 +1,6 @@ ; RUN: opt %s -S -passes=declare-to-assign -o - | FileCheck %s -; CHECK: call void @llvm.dbg.declare +; CHECK: call void @llvm.dbg.assign define dso_local void @f() sanitize_hwaddress !dbg !9 { entry: diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll new file mode 100644 index 0000000000000..1248c0bc586ab --- /dev/null +++ b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll @@ -0,0 +1,59 @@ +; RUN: opt -passes=hwasan -S -o - %s | FileCheck %s + +source_filename = "test.ll" +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android" + +declare void @g(ptr, ptr, ptr, ptr, ptr, ptr) + +; Function Attrs: sanitize_hwaddress +define void @f() #0 !dbg !7 { +entry: + %nodebug0 = alloca ptr, align 8 + %nodebug1 = alloca ptr, align 8 + %nodebug2 = alloca ptr, align 8 + %nodebug3 = alloca ptr, align 8 + ; CHECK: %a = alloca{{.*}} !DIAssignID ![[ID1:[0-9]+]] + %a = alloca ptr, align 8, !DIAssignID !13 + ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID1]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 32) + call void @llvm.dbg.assign(metadata i1 undef, metadata !14, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !15 + ; CHECK: %b = alloca{{.*}} !DIAssignID ![[ID2:[0-9]+]] + %b = alloca ptr, align 8, !DIAssignID !16 + ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID2]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 96) + call void @llvm.dbg.assign(metadata i1 undef, metadata !17, metadata !DIExpression(), metadata !16, metadata ptr %b, metadata !DIExpression()), !dbg !15 + call void @g(ptr %nodebug0, ptr %nodebug1, ptr %nodebug2, ptr %nodebug3, ptr %a, ptr %b) + ret void, !dbg !18 +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) #1 + +attributes #0 = { sanitize_hwaddress } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "x.c", directory: "/") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!6 = !{!"clang"} +!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !12) +!12 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) +!13 = distinct !DIAssignID() +!14 = !DILocalVariable(name: "a", scope: 
!7, file: !1, line: 1, type: !10) +!15 = !DILocation(line: 0, scope: !7) +!16 = distinct !DIAssignID() +!17 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 1, type: !10) +!18 = !DILocation(line: 1, column: 37, scope: !7) From c1729c8df2e2d0e9ef3c039df78f2711ea8fe65c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Fri, 19 Jan 2024 18:17:45 +0000 Subject: [PATCH 368/843] [X86] X86FixupVectorConstants.cpp - pull out rebuildConstant helper for future patches. NFC. Add helper to convert raw APInt bit stream into ConstantDataVector elements. This was used internally by rebuildSplatableConstant but will be reused in future patches for #73783 and #71078 --- .../Target/X86/X86FixupVectorConstants.cpp | 62 ++++++++++++------- 1 file changed, 38 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 8fcf077906493..5ae41dcb645a6 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -158,34 +158,23 @@ static std::optional getSplatableConstant(const Constant *C, return std::nullopt; } -// Attempt to rebuild a normalized splat vector constant of the requested splat -// width, built up of potentially smaller scalar values. +// Split raw bits into a constant vector of elements of a specific bit width. // NOTE: We don't always bother converting to scalars if the vector length is 1. -static Constant *rebuildSplatableConstant(const Constant *C, - unsigned SplatBitWidth) { - std::optional Splat = getSplatableConstant(C, SplatBitWidth); - if (!Splat) - return nullptr; - - // Determine scalar size to use for the constant splat vector, clamping as we - // might have found a splat smaller than the original constant data. - const Type *OriginalType = C->getType(); - Type *SclTy = OriginalType->getScalarType(); - unsigned NumSclBits = SclTy->getPrimitiveSizeInBits(); - NumSclBits = std::min(NumSclBits, SplatBitWidth); - LLVMContext &Ctx = OriginalType->getContext(); +static Constant *rebuildConstant(LLVMContext &Ctx, Type *SclTy, + const APInt &Bits, unsigned NumSclBits) { + unsigned BitWidth = Bits.getBitWidth(); if (NumSclBits == 8) { SmallVector RawBits; - for (unsigned I = 0; I != SplatBitWidth; I += 8) - RawBits.push_back(Splat->extractBits(8, I).getZExtValue()); + for (unsigned I = 0; I != BitWidth; I += 8) + RawBits.push_back(Bits.extractBits(8, I).getZExtValue()); return ConstantDataVector::get(Ctx, RawBits); } if (NumSclBits == 16) { SmallVector RawBits; - for (unsigned I = 0; I != SplatBitWidth; I += 16) - RawBits.push_back(Splat->extractBits(16, I).getZExtValue()); + for (unsigned I = 0; I != BitWidth; I += 16) + RawBits.push_back(Bits.extractBits(16, I).getZExtValue()); if (SclTy->is16bitFPTy()) return ConstantDataVector::getFP(SclTy, RawBits); return ConstantDataVector::get(Ctx, RawBits); @@ -193,22 +182,47 @@ static Constant *rebuildSplatableConstant(const Constant *C, if (NumSclBits == 32) { SmallVector RawBits; - for (unsigned I = 0; I != SplatBitWidth; I += 32) - RawBits.push_back(Splat->extractBits(32, I).getZExtValue()); + for (unsigned I = 0; I != BitWidth; I += 32) + RawBits.push_back(Bits.extractBits(32, I).getZExtValue()); if (SclTy->isFloatTy()) return ConstantDataVector::getFP(SclTy, RawBits); return ConstantDataVector::get(Ctx, RawBits); } - // Fallback to i64 / double. 
+ assert(NumSclBits == 64 && "Unhandled vector element width"); + SmallVector RawBits; - for (unsigned I = 0; I != SplatBitWidth; I += 64) - RawBits.push_back(Splat->extractBits(64, I).getZExtValue()); + for (unsigned I = 0; I != BitWidth; I += 64) + RawBits.push_back(Bits.extractBits(64, I).getZExtValue()); if (SclTy->isDoubleTy()) return ConstantDataVector::getFP(SclTy, RawBits); return ConstantDataVector::get(Ctx, RawBits); } +// Attempt to rebuild a normalized splat vector constant of the requested splat +// width, built up of potentially smaller scalar values. +static Constant *rebuildSplatableConstant(const Constant *C, + unsigned SplatBitWidth) { + std::optional Splat = getSplatableConstant(C, SplatBitWidth); + if (!Splat) + return nullptr; + + // Determine scalar size to use for the constant splat vector, clamping as we + // might have found a splat smaller than the original constant data. + const Type *OriginalType = C->getType(); + Type *SclTy = OriginalType->getScalarType(); + unsigned NumSclBits = SclTy->getPrimitiveSizeInBits(); + NumSclBits = std::min(NumSclBits, SplatBitWidth); + + // Fallback to i64 / double. + NumSclBits = (NumSclBits == 8 || NumSclBits == 16 || NumSclBits == 32) + ? NumSclBits + : 64; + + // Extract per-element bits. + return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits); +} + bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, MachineBasicBlock &MBB, MachineInstr &MI) { From 1a5eeade161beddf9c8b2fabad56af3081cd3629 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 22 Jan 2024 11:29:36 +0000 Subject: [PATCH 369/843] [X86] Add printZeroUpperMove constant/shuffle comments helper. NFC. Pull out helper instead of repeating switch cases. --- llvm/lib/Target/X86/X86MCInstLower.cpp | 136 +++++++++++-------------- 1 file changed, 58 insertions(+), 78 deletions(-) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 6e8e512336997..53a81fde12c82 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1536,6 +1536,52 @@ static void printConstant(const Constant *COp, unsigned BitWidth, } } +static void printZeroUpperMove(const MachineInstr *MI, MCStreamer &OutStreamer, + int SclWidth, int VecWidth, + const char *ShuffleComment) { + assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && + "Unexpected number of operands!"); + + std::string Comment; + raw_string_ostream CS(Comment); + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + + if (auto *C = + X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { + int CstEltSize = C->getType()->getScalarSizeInBits(); + if (SclWidth == CstEltSize) { + if (auto *CI = dyn_cast(C)) { + CS << "["; + printConstant(CI->getValue(), CS); + for (int I = 1, E = VecWidth / SclWidth; I < E; ++I) { + CS << ",0"; + } + CS << "]"; + OutStreamer.AddComment(CS.str()); + return; // early-out + } + + if (auto *CF = dyn_cast(C)) { + CS << "["; + printConstant(CF->getValue(), CS); + APFloat ZeroFP = APFloat::getZero(CF->getValue().getSemantics()); + for (int I = 1, E = VecWidth / SclWidth; I < E; ++I) { + CS << ","; + printConstant(ZeroFP, CS); + } + CS << "]"; + OutStreamer.AddComment(CS.str()); + return; // early-out + } + } + } + + // We didn't find a constant load, fallback to a shuffle mask decode. 
+ CS << ShuffleComment; + OutStreamer.AddComment(CS.str()); +} + void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); assert((getSubtarget().isOSWindows() || TM.getTargetTriple().isUEFI()) && @@ -1807,94 +1853,28 @@ static void addConstantComments(const MachineInstr *MI, } case X86::MOVSDrm: - case X86::MOVSSrm: case X86::VMOVSDrm: - case X86::VMOVSSrm: case X86::VMOVSDZrm: - case X86::VMOVSSZrm: case X86::MOVSDrm_alt: - case X86::MOVSSrm_alt: case X86::VMOVSDrm_alt: - case X86::VMOVSSrm_alt: case X86::VMOVSDZrm_alt: - case X86::VMOVSSZrm_alt: - case X86::MOVDI2PDIrm: case X86::MOVQI2PQIrm: - case X86::VMOVDI2PDIrm: case X86::VMOVQI2PQIrm: - case X86::VMOVDI2PDIZrm: - case X86::VMOVQI2PQIZrm: { - assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && - "Unexpected number of operands!"); - int SclWidth = 32; - int VecWidth = 128; - - switch (MI->getOpcode()) { - default: - llvm_unreachable("Invalid opcode"); - case X86::MOVSDrm: - case X86::VMOVSDrm: - case X86::VMOVSDZrm: - case X86::MOVSDrm_alt: - case X86::VMOVSDrm_alt: - case X86::VMOVSDZrm_alt: - case X86::MOVQI2PQIrm: - case X86::VMOVQI2PQIrm: - case X86::VMOVQI2PQIZrm: - SclWidth = 64; - VecWidth = 128; + case X86::VMOVQI2PQIZrm: + printZeroUpperMove(MI, OutStreamer, 64, 128, "mem[0],zero"); break; - case X86::MOVSSrm: - case X86::VMOVSSrm: - case X86::VMOVSSZrm: - case X86::MOVSSrm_alt: - case X86::VMOVSSrm_alt: - case X86::VMOVSSZrm_alt: - case X86::MOVDI2PDIrm: - case X86::VMOVDI2PDIrm: - case X86::VMOVDI2PDIZrm: - SclWidth = 32; - VecWidth = 128; - break; - } - std::string Comment; - raw_string_ostream CS(Comment); - const MachineOperand &DstOp = MI->getOperand(0); - CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; - if (auto *C = - X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { - if ((unsigned)SclWidth == C->getType()->getScalarSizeInBits()) { - if (auto *CI = dyn_cast(C)) { - CS << "["; - printConstant(CI->getValue(), CS); - for (int I = 1, E = VecWidth / SclWidth; I < E; ++I) { - CS << ",0"; - } - CS << "]"; - OutStreamer.AddComment(CS.str()); - break; // early-out - } - if (auto *CF = dyn_cast(C)) { - CS << "["; - printConstant(CF->getValue(), CS); - APFloat ZeroFP = APFloat::getZero(CF->getValue().getSemantics()); - for (int I = 1, E = VecWidth / SclWidth; I < E; ++I) { - CS << ","; - printConstant(ZeroFP, CS); - } - CS << "]"; - OutStreamer.AddComment(CS.str()); - break; // early-out - } - } - } - - // We didn't find a constant load, fallback to a shuffle mask decode. - CS << (SclWidth == 32 ? "mem[0],zero,zero,zero" : "mem[0],zero"); - OutStreamer.AddComment(CS.str()); + case X86::MOVSSrm: + case X86::VMOVSSrm: + case X86::VMOVSSZrm: + case X86::MOVSSrm_alt: + case X86::VMOVSSrm_alt: + case X86::VMOVSSZrm_alt: + case X86::MOVDI2PDIrm: + case X86::VMOVDI2PDIrm: + case X86::VMOVDI2PDIZrm: + printZeroUpperMove(MI, OutStreamer, 32, 128, "mem[0],zero,zero,zero"); break; - } #define MOV_CASE(Prefix, Suffix) \ case X86::Prefix##MOVAPD##Suffix##rm: \ From 865e4a1f33bd3be42ff256c6839aff0860610a5a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Hana=20Dus=C3=ADkov=C3=A1?= Date: Mon, 22 Jan 2024 12:50:20 +0100 Subject: [PATCH 370/843] [coverage] skipping code coverage for 'if constexpr' and 'if consteval' (#78033) `if constexpr` and `if consteval` conditional statements code coverage should behave more like a preprocesor `#if`-s than normal ConditionalStmt. This PR should fix that. 
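As a minimal illustration (our own sketch, modeled on the `if.cpp` tests
updated below), the discarded branch now becomes a skipped region with no
counter of its own, so its lines are no longer reported as uncovered:

  constexpr int f(int i) {
    if constexpr (false) { // whole branch emitted as a Skipped region
      i *= 3;              // previously carried its own counter
    }
    return i;              // still counted under the parent region
  }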
--------- Co-authored-by: cor3ntin --- clang/docs/ReleaseNotes.rst | 2 +- clang/lib/CodeGen/CoverageMappingGen.cpp | 214 +++++++++++++++--- .../CoverageMapping/branch-constfolded.cpp | 8 +- clang/test/CoverageMapping/if.cpp | 146 ++++++++---- .../ProfileData/Coverage/CoverageMapping.cpp | 9 +- llvm/tools/llvm-cov/SourceCoverageView.cpp | 2 + .../ProfileData/CoverageMappingTest.cpp | 24 +- 7 files changed, 325 insertions(+), 80 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 4888ffe6f4dfc..45fc8bbbb6862 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -811,7 +811,7 @@ Bug Fixes in This Version invalidation by invalid initializer Expr. Fixes (`#30908 `_) - Clang now emits correct source location for code-coverage regions in `if constexpr` - and `if consteval` branches. + and `if consteval` branches. Untaken branches are now skipped. Fixes (`#54419 `_) - Fix assertion failure when declaring a template friend function with a constrained parameter in a template class that declares a class method diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp index 4a44d113ddec9..8b5e6c4ad8272 100644 --- a/clang/lib/CodeGen/CoverageMappingGen.cpp +++ b/clang/lib/CodeGen/CoverageMappingGen.cpp @@ -119,12 +119,16 @@ class SourceMappingRegion { /// as the line execution count if there are no other regions on the line. bool GapRegion; + /// Whetever this region is skipped ('if constexpr' or 'if consteval' untaken + /// branch, or anything skipped but not empty line / comments) + bool SkippedRegion; + public: SourceMappingRegion(Counter Count, std::optional LocStart, std::optional LocEnd, bool GapRegion = false) - : Count(Count), LocStart(LocStart), LocEnd(LocEnd), GapRegion(GapRegion) { - } + : Count(Count), LocStart(LocStart), LocEnd(LocEnd), GapRegion(GapRegion), + SkippedRegion(false) {} SourceMappingRegion(Counter Count, std::optional FalseCount, MCDCParameters MCDCParams, @@ -132,13 +136,14 @@ class SourceMappingRegion { std::optional LocEnd, bool GapRegion = false) : Count(Count), FalseCount(FalseCount), MCDCParams(MCDCParams), - LocStart(LocStart), LocEnd(LocEnd), GapRegion(GapRegion) {} + LocStart(LocStart), LocEnd(LocEnd), GapRegion(GapRegion), + SkippedRegion(false) {} SourceMappingRegion(MCDCParameters MCDCParams, std::optional LocStart, std::optional LocEnd) : MCDCParams(MCDCParams), LocStart(LocStart), LocEnd(LocEnd), - GapRegion(false) {} + GapRegion(false), SkippedRegion(false) {} const Counter &getCounter() const { return Count; } @@ -174,6 +179,10 @@ class SourceMappingRegion { void setGap(bool Gap) { GapRegion = Gap; } + bool isSkipped() const { return SkippedRegion; } + + void setSkipped(bool Skipped) { SkippedRegion = Skipped; } + bool isBranch() const { return FalseCount.has_value(); } bool isMCDCDecision() const { return MCDCParams.NumConditions != 0; } @@ -468,6 +477,10 @@ class CoverageMappingBuilder { MappingRegions.push_back(CounterMappingRegion::makeGapRegion( Region.getCounter(), *CovFileID, SR.LineStart, SR.ColumnStart, SR.LineEnd, SR.ColumnEnd)); + } else if (Region.isSkipped()) { + MappingRegions.push_back(CounterMappingRegion::makeSkipped( + *CovFileID, SR.LineStart, SR.ColumnStart, SR.LineEnd, + SR.ColumnEnd)); } else if (Region.isBranch()) { MappingRegions.push_back(CounterMappingRegion::makeBranchRegion( Region.getCounter(), Region.getFalseCounter(), @@ -1251,6 +1264,69 @@ struct CounterCoverageMappingBuilder popRegions(Index); } + /// Find a valid range starting 
with \p StartingLoc and ending before \p + /// BeforeLoc. + std::optional findAreaStartingFromTo(SourceLocation StartingLoc, + SourceLocation BeforeLoc) { + // If StartingLoc is in function-like macro, use its start location. + if (StartingLoc.isMacroID()) { + FileID FID = SM.getFileID(StartingLoc); + const SrcMgr::ExpansionInfo *EI = &SM.getSLocEntry(FID).getExpansion(); + if (EI->isFunctionMacroExpansion()) + StartingLoc = EI->getExpansionLocStart(); + } + + size_t StartDepth = locationDepth(StartingLoc); + size_t EndDepth = locationDepth(BeforeLoc); + while (!SM.isWrittenInSameFile(StartingLoc, BeforeLoc)) { + bool UnnestStart = StartDepth >= EndDepth; + bool UnnestEnd = EndDepth >= StartDepth; + if (UnnestEnd) { + assert(SM.isWrittenInSameFile(getStartOfFileOrMacro(BeforeLoc), + BeforeLoc)); + + BeforeLoc = getIncludeOrExpansionLoc(BeforeLoc); + assert(BeforeLoc.isValid()); + EndDepth--; + } + if (UnnestStart) { + assert(SM.isWrittenInSameFile(StartingLoc, + getStartOfFileOrMacro(StartingLoc))); + + StartingLoc = getIncludeOrExpansionLoc(StartingLoc); + assert(StartingLoc.isValid()); + StartDepth--; + } + } + // If the start and end locations of the gap are both within the same macro + // file, the range may not be in source order. + if (StartingLoc.isMacroID() || BeforeLoc.isMacroID()) + return std::nullopt; + if (!SM.isWrittenInSameFile(StartingLoc, BeforeLoc) || + !SpellingRegion(SM, StartingLoc, BeforeLoc).isInSourceOrder()) + return std::nullopt; + return {{StartingLoc, BeforeLoc}}; + } + + void markSkipped(SourceLocation StartLoc, SourceLocation BeforeLoc) { + const auto Skipped = findAreaStartingFromTo(StartLoc, BeforeLoc); + + if (!Skipped) + return; + + const auto NewStartLoc = Skipped->getBegin(); + const auto EndLoc = Skipped->getEnd(); + + if (NewStartLoc == EndLoc) + return; + assert(SpellingRegion(SM, NewStartLoc, EndLoc).isInSourceOrder()); + handleFileExit(NewStartLoc); + size_t Index = pushRegion({}, NewStartLoc, EndLoc); + getRegion().setSkipped(true); + handleFileExit(EndLoc); + popRegions(Index); + } + /// Keep counts of breaks and continues inside loops. struct BreakContinue { Counter BreakCount; @@ -1700,43 +1776,119 @@ struct CounterCoverageMappingBuilder Visit(S->getSubStmt()); } + void coverIfConsteval(const IfStmt *S) { + assert(S->isConsteval()); + + const auto *Then = S->getThen(); + const auto *Else = S->getElse(); + + // It's better for llvm-cov to create a new region with same counter + // so line-coverage can be properly calculated for lines containing + // a skipped region (without it the line is marked uncovered) + const Counter ParentCount = getRegion().getCounter(); + + extendRegion(S); + + if (S->isNegatedConsteval()) { + // ignore 'if consteval' + markSkipped(S->getIfLoc(), getStart(Then)); + propagateCounts(ParentCount, Then); + + if (Else) { + // ignore 'else ' + markSkipped(getEnd(Then), getEnd(Else)); + } + } else { + assert(S->isNonNegatedConsteval()); + // ignore 'if consteval [else]' + markSkipped(S->getIfLoc(), Else ? getStart(Else) : getEnd(Then)); + + if (Else) + propagateCounts(ParentCount, Else); + } + } + + void coverIfConstexpr(const IfStmt *S) { + assert(S->isConstexpr()); + + // evaluate constant condition... 
+ const auto *E = cast(S->getCond()); + const bool isTrue = E->getResultAsAPSInt().getExtValue(); + + extendRegion(S); + + // I'm using 'propagateCounts' later as new region is better and allows me + // to properly calculate line coverage in llvm-cov utility + const Counter ParentCount = getRegion().getCounter(); + + // ignore 'if constexpr (' + SourceLocation startOfSkipped = S->getIfLoc(); + + if (const auto *Init = S->getInit()) { + const auto start = getStart(Init); + const auto end = getEnd(Init); + + // this check is to make sure typedef here which doesn't have valid source + // location won't crash it + if (start.isValid() && end.isValid()) { + markSkipped(startOfSkipped, start); + propagateCounts(ParentCount, Init); + startOfSkipped = getEnd(Init); + } + } + + const auto *Then = S->getThen(); + const auto *Else = S->getElse(); + + if (isTrue) { + // ignore ')' + markSkipped(startOfSkipped, getStart(Then)); + propagateCounts(ParentCount, Then); + + if (Else) + // ignore 'else ' + markSkipped(getEnd(Then), getEnd(Else)); + } else { + // ignore ') [else]' + markSkipped(startOfSkipped, Else ? getStart(Else) : getEnd(Then)); + + if (Else) + propagateCounts(ParentCount, Else); + } + } + void VisitIfStmt(const IfStmt *S) { + // "if constexpr" and "if consteval" are not normal conditional statements, + // their discarded statement should be skipped + if (S->isConsteval()) + return coverIfConsteval(S); + else if (S->isConstexpr()) + return coverIfConstexpr(S); + extendRegion(S); if (S->getInit()) Visit(S->getInit()); // Extend into the condition before we propagate through it below - this is // needed to handle macros that generate the "if" but not the condition. - if (!S->isConsteval()) - extendRegion(S->getCond()); + extendRegion(S->getCond()); Counter ParentCount = getRegion().getCounter(); + Counter ThenCount = getRegionCounter(S); - // If this is "if !consteval" the then-branch will never be taken, we don't - // need to change counter - Counter ThenCount = - S->isNegatedConsteval() ? ParentCount : getRegionCounter(S); + // Emitting a counter for the condition makes it easier to interpret the + // counter for the body when looking at the coverage. + propagateCounts(ParentCount, S->getCond()); - if (!S->isConsteval()) { - // Emitting a counter for the condition makes it easier to interpret the - // counter for the body when looking at the coverage. - propagateCounts(ParentCount, S->getCond()); - - // The 'then' count applies to the area immediately after the condition. - std::optional Gap = - findGapAreaBetween(S->getRParenLoc(), getStart(S->getThen())); - if (Gap) - fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), ThenCount); - } + // The 'then' count applies to the area immediately after the condition. + std::optional Gap = + findGapAreaBetween(S->getRParenLoc(), getStart(S->getThen())); + if (Gap) + fillGapAreaWithCount(Gap->getBegin(), Gap->getEnd(), ThenCount); extendRegion(S->getThen()); Counter OutCount = propagateCounts(ThenCount, S->getThen()); - - // If this is "if consteval" the else-branch will never be taken, we don't - // need to change counter - Counter ElseCount = S->isNonNegatedConsteval() - ? ParentCount - : subtractCounters(ParentCount, ThenCount); + Counter ElseCount = subtractCounters(ParentCount, ThenCount); if (const Stmt *Else = S->getElse()) { bool ThenHasTerminateStmt = HasTerminateStmt; @@ -1759,11 +1911,9 @@ struct CounterCoverageMappingBuilder GapRegionCounter = OutCount; } - if (!S->isConsteval()) { - // Create Branch Region around condition. 
- createBranchRegion(S->getCond(), ThenCount, - subtractCounters(ParentCount, ThenCount)); - } + // Create Branch Region around condition. + createBranchRegion(S->getCond(), ThenCount, + subtractCounters(ParentCount, ThenCount)); } void VisitCXXTryStmt(const CXXTryStmt *S) { diff --git a/clang/test/CoverageMapping/branch-constfolded.cpp b/clang/test/CoverageMapping/branch-constfolded.cpp index 4fdb640506c9d..c8755d5d752b6 100644 --- a/clang/test/CoverageMapping/branch-constfolded.cpp +++ b/clang/test/CoverageMapping/branch-constfolded.cpp @@ -90,10 +90,10 @@ bool for_7(bool a) { // MCDC: Decision,File 0, [[@LINE+1]]:10 -> [[@LINE+1 } // CHECK: Branch,File 0, [[@LINE-1]]:15 -> [[@LINE-1]]:19 = 0, 0 // CHECK-LABEL: _Z5for_8b: -bool for_8(bool a) { // MCDC: Decision,File 0, [[@LINE+3]]:17 -> [[@LINE+3]]:30 = M:0, C:2 - // CHECK: Branch,File 0, [[@LINE+2]]:17 -> [[@LINE+2]]:21 = 0, 0 - // CHECK: Branch,File 0, [[@LINE+1]]:25 -> [[@LINE+1]]:30 = 0, 0 - if constexpr (true && false) +bool for_8(bool a) { // MCDC: Decision,File 0, [[@LINE+3]]:7 -> [[@LINE+3]]:20 = M:0, C:2 + // CHECK: Branch,File 0, [[@LINE+2]]:7 -> [[@LINE+2]]:11 = 0, 0 + // CHECK: Branch,File 0, [[@LINE+1]]:15 -> [[@LINE+1]]:20 = 0, 0 + if (true && false) return true; else return false; diff --git a/clang/test/CoverageMapping/if.cpp b/clang/test/CoverageMapping/if.cpp index 92d560be01f3b..3045ffe43948c 100644 --- a/clang/test/CoverageMapping/if.cpp +++ b/clang/test/CoverageMapping/if.cpp @@ -23,47 +23,107 @@ void foo() { // CHECK-NEXT: Gap,File 0, [[@LINE+1]]:21 -> [[@ } // CHECK-NEXT: [[@LINE-2]]:9 -> [[@LINE-1]]:5 = #1 // CHECK-NEXT: [[@LINE-2]]:5 -> [[@LINE-2]]:8 = #1 -// FIXME: Do not generate coverage for discarded branches in if constexpr +// GH-57377 // CHECK-LABEL: _Z30check_constexpr_true_with_elsei: int check_constexpr_true_with_else(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 - // CHECK-NEXT: [[@LINE+2]]:16 -> [[@LINE+2]]:20 = #0 - // CHECK-NEXT: Branch,File 0, [[@LINE+1]]:16 -> [[@LINE+1]]:20 = 0, 0 - if constexpr(true) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:21 -> [[@LINE]]:22 = #1 - i *= 3; // CHECK-NEXT: [[@LINE-1]]:22 -> [[@LINE+1]]:4 = #1 - } else { // CHECK-NEXT: Gap,File 0, [[@LINE]]:4 -> [[@LINE]]:10 = (#0 - #1) - i *= 5; // CHECK-NEXT: [[@LINE-1]]:10 -> [[@LINE+1]]:4 = (#0 - #1) + if constexpr(true) { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE]]:22 = 0 + i *= 3; // CHECK-NEXT: File 0, [[@LINE-1]]:22 -> [[@LINE+1]]:4 = #0 + } else { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:4 -> [[@LINE+2]]:4 = 0 + i *= 5; } return i; } +// GH-57377 // CHECK-LABEL: _Z33check_constexpr_true_without_elsei: int check_constexpr_true_without_else(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 - // CHECK-NEXT: [[@LINE+2]]:16 -> [[@LINE+2]]:20 = #0 - // CHECK-NEXT: Branch,File 0, [[@LINE+1]]:16 -> [[@LINE+1]]:20 = 0, 0 - if constexpr(true) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:21 -> [[@LINE]]:22 = #1 - i *= 3; // CHECK-NEXT: [[@LINE-1]]:22 -> [[@LINE+1]]:4 = #1 + if constexpr(true) { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE]]:22 = 0 + i *= 3; // CHECK-NEXT: File 0, [[@LINE-1]]:22 -> [[@LINE+1]]:4 = #0 } return i; } +// GH-57377 // CHECK-LABEL: _Z31check_constexpr_false_with_elsei: int check_constexpr_false_with_else(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 - // CHECK-NEXT: [[@LINE+2]]:16 -> [[@LINE+2]]:21 = #0 - // CHECK-NEXT: Branch,File 0, [[@LINE+1]]:16 -> [[@LINE+1]]:21 = 0, 0 - if constexpr(false) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:22 
-> [[@LINE]]:23 = #1 - i *= 3; // CHECK-NEXT: File 0, [[@LINE-1]]:23 -> [[@LINE+1]]:4 = #1 - } else { // CHECK-NEXT: Gap,File 0, [[@LINE]]:4 -> [[@LINE]]:10 = (#0 - #1) - i *= 5; // CHECK-NEXT: File 0, [[@LINE-1]]:10 -> [[@LINE+1]]:4 = (#0 - #1) + if constexpr(false) { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = 0 + i *= 3; + } else { // CHECK-NEXT: File 0, [[@LINE]]:10 -> [[@LINE+2]]:4 = #0 + i *= 5; } return i; } +// GH-57377 // CHECK-LABEL: _Z34check_constexpr_false_without_elsei: int check_constexpr_false_without_else(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 - // CHECK-NEXT: [[@LINE+2]]:16 -> [[@LINE+2]]:21 = #0 - // CHECK-NEXT: Branch,File 0, [[@LINE+1]]:16 -> [[@LINE+1]]:21 = 0, 0 - if constexpr(false) { // CHECK-NEXT: Gap,File 0, [[@LINE]]:22 -> [[@LINE]]:23 = #1 - i *= 3; // CHECK-NEXT: File 0, [[@LINE-1]]:23 -> [[@LINE+1]]:4 = #1 + if constexpr(false) { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE+2]]:4 = 0 + i *= 3; + } + return i; +} + +// GH-57377 +// CHECK-LABEL: _Z35check_constexpr_init_true_with_elsei: +int check_constexpr_init_true_with_else(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 + if constexpr(int j = i; true) { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE]]:16 = 0 + // CHECK-NEXT: File 0, [[@LINE-1]]:16 -> [[@LINE-1]]:26 = #0 + // CHECK-NEXT: Skipped,File 0, [[@LINE-2]]:26 -> [[@LINE-2]]:33 = 0 + // CHECK-NEXT: File 0, [[@LINE-3]]:33 -> [[@LINE+2]]:4 = #0 + i *= j; + } else { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:4 -> [[@LINE+2]]:4 = 0 + i *= j; + } + return i; +} + +// GH-57377 +// CHECK-LABEL: _Z38check_constexpr_init_true_without_elsei: +int check_constexpr_init_true_without_else(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 + if constexpr(int j = i; true) { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE]]:16 = 0 + // CHECK-NEXT: File 0, [[@LINE-1]]:16 -> [[@LINE-1]]:26 = #0 + // CHECK-NEXT: Skipped,File 0, [[@LINE-2]]:26 -> [[@LINE-2]]:33 = 0 + // CHECK-NEXT: File 0, [[@LINE-3]]:33 -> [[@LINE+2]]:4 = #0 + i *= j; + } + return i; +} + +// GH-57377 +// CHECK-LABEL: _Z36check_constexpr_init_false_with_elsei: +int check_constexpr_init_false_with_else(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 + if constexpr(int j = i; false) { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE]]:16 = 0 + // CHECK-NEXT: File 0, [[@LINE-1]]:16 -> [[@LINE-1]]:26 = #0 + i *= j; // CHECK-NEXT: Skipped,File 0, [[@LINE-2]]:26 -> [[@LINE+1]]:10 = 0 + } else { // CHECK-NEXT: File 0, [[@LINE]]:10 -> [[@LINE+2]]:4 = #0 + i *= j; + } + return i; +} + +// GH-57377 +// CHECK-LABEL: _Z39check_constexpr_init_false_without_elsei: +int check_constexpr_init_false_without_else(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 + if constexpr(int j = i; false) { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE]]:16 = 0 + // CHECK-NEXT: File 0, [[@LINE-1]]:16 -> [[@LINE-1]]:26 = #0 + i *= j; // CHECK-NEXT: Skipped,File 0, [[@LINE-2]]:26 -> [[@LINE+1]]:4 = 0 + } + return i; +} + +// CHECK-LABEL: _Z32check_constexpr_init_with_if_defi: +int check_constexpr_init_with_if_def(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 + if constexpr(using foo = int; true) { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE]]:39 = 0 + i *= foo(42); // CHECK-NEXT: File 0, [[@LINE-1]]:39 -> [[@LINE+1]]:4 = #0 + } + return i; +} + +// CHECK-LABEL: _Z32check_macro_constexpr_if_skippedi: +int check_macro_constexpr_if_skipped(int i) { // CHECK-NEXT: 
[[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 +#define IF_CONSTEXPR if constexpr // CHECK-NEXT: Expansion,File 0, [[@LINE+1]]:3 -> [[@LINE+1]]:15 = #0 (Expanded file = 1) + IF_CONSTEXPR(false) { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE+2]]:4 = 0 + i *= 2; // CHECK-NEXT: File 1, [[@LINE-2]]:22 -> [[@LINE-2]]:34 = #0 } return i; } @@ -127,48 +187,58 @@ void ternary() { // GH-57377 // CHECK-LABEL: _Z40check_consteval_with_else_discarded_theni: -// FIXME: Do not generate coverage for discarded branch in if consteval constexpr int check_consteval_with_else_discarded_then(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 - if consteval { - i *= 3; // CHECK-NEXT: [[@LINE-1]]:16 -> [[@LINE+1]]:4 = #1 - } else { // CHECK-NEXT: Gap,File 0, [[@LINE]]:4 -> [[@LINE]]:10 = #0 - i *= 5; // CHECK-NEXT: [[@LINE-1]]:10 -> [[@LINE+1]]:4 = #0 + if consteval { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE+2]]:10 = 0 + i *= 3; + } else { // CHECK-NEXT: File 0, [[@LINE]]:10 -> [[@LINE+2]]:4 = #0 + i *= 5; } - return i; // CHECK-NEXT: [[@LINE]]:3 -> [[@LINE]]:11 = (#0 + #1) + return i; } +// GH-57377 // CHECK-LABEL: _Z43check_notconsteval_with_else_discarded_elsei: -// FIXME: Do not generate coverage for discarded branch in if consteval constexpr int check_notconsteval_with_else_discarded_else(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 - if !consteval { - i *= 3; // CHECK-NEXT: [[@LINE-1]]:17 -> [[@LINE+1]]:4 = #0 - } else { // CHECK-NEXT: Gap,File 0, [[@LINE]]:4 -> [[@LINE]]:10 = 0 - i *= 5; // CHECK-NEXT: [[@LINE-1]]:10 -> [[@LINE+1]]:4 = 0 + if !consteval { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE]]:17 = 0 + i *= 3; + } else { // CHECK-NEXT: File 0, [[@LINE-2]]:17 -> [[@LINE]]:4 = #0 + i *= 5; // CHECK-NEXT: Skipped,File 0, [[@LINE-1]]:4 -> [[@LINE+1]]:4 = 0 } return i; } +// GH-57377 // CHECK-LABEL: _Z32check_consteval_branch_discardedi: -// FIXME: Do not generate coverage for discarded branch in if consteval constexpr int check_consteval_branch_discarded(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 - if consteval { - i *= 3; // CHECK-NEXT: [[@LINE-1]]:16 -> [[@LINE+1]]:4 = #1 + if consteval { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE+2]]:4 = 0 + i *= 3; } - return i; // CHECK-NEXT: [[@LINE]]:3 -> [[@LINE]]:11 = (#0 + #1) + return i; } +// GH-57377 // CHECK-LABEL: _Z30check_notconsteval_branch_kepti: constexpr int check_notconsteval_branch_kept(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 - if !consteval { - i *= 3; // CHECK-NEXT: [[@LINE-1]]:17 -> [[@LINE+1]]:4 = #0 + if !consteval { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE]]:17 = 0 + i *= 3; // CHECK-NEXT: File 0, [[@LINE-1]]:17 -> [[@LINE+1]]:4 = #0 } return i; } +// CHECK-LABEL: _Z32check_macro_consteval_if_skippedi: +constexpr int check_macro_consteval_if_skipped(int i) { // CHECK-NEXT: [[@LINE]]:{{[0-9]+}} -> {{[0-9]+}}:2 = #0 +#define IF_RUNTIME if !consteval // CHECK-NEXT: Expansion,File 0, [[@LINE+1]]:3 -> [[@LINE+1]]:13 = #0 (Expanded file = 1) + IF_RUNTIME { // CHECK-NEXT: Skipped,File 0, [[@LINE]]:3 -> [[@LINE]]:14 = 0 + i *= 2; // CHECK-NEXT: File 0, [[@LINE-1]]:14 -> [[@LINE+1]]:4 = #0 + } // CHECK-NEXT: File 1, [[@LINE-3]]:20 -> [[@LINE-3]]:33 = #0 + return i; +} + int instantiate_consteval(int i) { i *= check_consteval_with_else_discarded_then(i); i *= check_notconsteval_with_else_discarded_else(i); i *= check_consteval_branch_discarded(i); i *= check_notconsteval_branch_kept(i); + i *= 
check_macro_consteval_if_skipped(i); return i; } diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index eece6a2cc7179..d8be7be6d7a59 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -1319,8 +1319,15 @@ LineCoverageStats::LineCoverageStats( !StartOfSkippedRegion && ((WrappedSegment && WrappedSegment->HasCount) || (MinRegionCount > 0)); - if (!Mapped) + // if there is any starting segment at this line with a counter, it must be + // mapped + Mapped |= std::any_of( + LineSegments.begin(), LineSegments.end(), + [](const auto *Seq) { return Seq->IsRegionEntry && Seq->HasCount; }); + + if (!Mapped) { return; + } // Pick the max count from the non-gap, region entry segments and the // wrapped count. diff --git a/llvm/tools/llvm-cov/SourceCoverageView.cpp b/llvm/tools/llvm-cov/SourceCoverageView.cpp index b92c62df7950a..71edd5fec4280 100644 --- a/llvm/tools/llvm-cov/SourceCoverageView.cpp +++ b/llvm/tools/llvm-cov/SourceCoverageView.cpp @@ -130,6 +130,8 @@ bool SourceCoverageView::shouldRenderRegionMarkers( const auto *CurSeg = Segments[I]; if (!CurSeg->IsRegionEntry || CurSeg->Count == LCS.getExecutionCount()) continue; + if (!CurSeg->HasCount) // don't show tooltips for SkippedRegions + continue; return true; } return false; diff --git a/llvm/unittests/ProfileData/CoverageMappingTest.cpp b/llvm/unittests/ProfileData/CoverageMappingTest.cpp index 1cf497cbdc2e6..23f66a0232ddb 100644 --- a/llvm/unittests/ProfileData/CoverageMappingTest.cpp +++ b/llvm/unittests/ProfileData/CoverageMappingTest.cpp @@ -187,6 +187,11 @@ struct CoverageMappingTest : ::testing::TestWithParam> { : CounterMappingRegion::makeRegion(C, FileID, LS, CS, LE, CE)); } + void addSkipped(StringRef File, unsigned LS, unsigned CS, unsigned LE, + unsigned CE) { + addCMR(Counter::getZero(), File, LS, CS, LE, CE, true); + } + void addMCDCDecisionCMR(unsigned Mask, unsigned NC, StringRef File, unsigned LS, unsigned CS, unsigned LE, unsigned CE) { auto &Regions = InputFunctions.back().Regions; @@ -700,22 +705,33 @@ TEST_P(CoverageMappingTest, test_line_coverage_iterator) { startFunction("func", 0x1234); addCMR(Counter::getCounter(0), "file1", 1, 1, 9, 9); + addSkipped("file1", 1, 3, 1, 8); // skipped region inside previous block addCMR(Counter::getCounter(1), "file1", 1, 1, 4, 7); + addSkipped("file1", 4, 1, 4, 20); // skipped line addCMR(Counter::getCounter(2), "file1", 5, 8, 9, 1); + addSkipped("file1", 10, 1, 12, + 20); // skipped region which contains next region addCMR(Counter::getCounter(3), "file1", 10, 10, 11, 11); EXPECT_THAT_ERROR(loadCoverageMapping(), Succeeded()); - CoverageData Data = LoadedCoverage->getCoverageForFile("file1"); unsigned Line = 0; - unsigned LineCounts[] = {20, 20, 20, 20, 30, 10, 10, 10, 10, 0, 0}; + const unsigned LineCounts[] = {20, 20, 20, 0, 30, 10, 10, 10, 10, 0, 0, 0, 0}; + const bool MappedLines[] = {1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0}; + ASSERT_EQ(std::size(LineCounts), std::size(MappedLines)); + for (const auto &LCS : getLineCoverageStats(Data)) { + ASSERT_LT(Line, std::size(LineCounts)); + ASSERT_LT(Line, std::size(MappedLines)); + ASSERT_EQ(Line + 1, LCS.getLine()); - errs() << "Line: " << Line + 1 << ", count = " << LCS.getExecutionCount() << "\n"; + errs() << "Line: " << Line + 1 << ", count = " << LCS.getExecutionCount() + << ", mapped = " << LCS.isMapped() << "\n"; ASSERT_EQ(LineCounts[Line], LCS.getExecutionCount()); + ASSERT_EQ(MappedLines[Line], 
LCS.isMapped()); ++Line; } - ASSERT_EQ(11U, Line); + ASSERT_EQ(12U, Line); // Check that operator->() works / compiles. ASSERT_EQ(1U, LineCoverageIterator(Data)->getLine()); From 365aa1574a1b4a3cdee6648227d095d00536ffde Mon Sep 17 00:00:00 2001 From: Rin Dobrescu Date: Mon, 22 Jan 2024 11:59:40 +0000 Subject: [PATCH 371/843] [AArch64] Convert UADDV(add(zext, zext)) into UADDLV(concat). (#78301) We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into UADDLV(concat), where the concat represents the 64-bit zext sources. --- .../Target/AArch64/AArch64ISelLowering.cpp | 48 +++++- .../AArch64/aarch64-combine-add-zext.ll | 54 ++++++ llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll | 12 +- llvm/test/CodeGen/AArch64/vecreduce-add.ll | 155 +++++++++--------- 4 files changed, 179 insertions(+), 90 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-combine-add-zext.ll diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 96ea692d03f56..23d37d67864a5 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -16603,8 +16603,7 @@ static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) { auto DetectAddExtract = [&](SDValue A) { // Look for add(zext(extract_lo(x)), zext(extract_hi(x))), returning // UADDLP(x) if found. - if (A.getOpcode() != ISD::ADD) - return SDValue(); + assert(A.getOpcode() == ISD::ADD); EVT VT = A.getValueType(); SDValue Op0 = A.getOperand(0); SDValue Op1 = A.getOperand(1); @@ -16647,11 +16646,54 @@ static SDValue performUADDVAddCombine(SDValue A, SelectionDAG &DAG) { return SDValue(); } +// We can convert a UADDV(add(zext(64-bit source), zext(64-bit source))) into +// UADDLV(concat), where the concat represents the 64-bit zext sources. +static SDValue performUADDVZextCombine(SDValue A, SelectionDAG &DAG) { + // Look for add(zext(64-bit source), zext(64-bit source)), returning + // UADDLV(concat(zext, zext)) if found. + assert(A.getOpcode() == ISD::ADD); + EVT VT = A.getValueType(); + if (VT != MVT::v8i16 && VT != MVT::v4i32 && VT != MVT::v2i64) + return SDValue(); + SDValue Op0 = A.getOperand(0); + SDValue Op1 = A.getOperand(1); + if (Op0.getOpcode() != ISD::ZERO_EXTEND || Op0.getOpcode() != Op1.getOpcode()) + return SDValue(); + SDValue Ext0 = Op0.getOperand(0); + SDValue Ext1 = Op1.getOperand(0); + EVT ExtVT0 = Ext0.getValueType(); + EVT ExtVT1 = Ext1.getValueType(); + // Check zext VTs are the same and 64-bit length. + if (ExtVT0 != ExtVT1 || + VT.getScalarSizeInBits() != (2 * ExtVT0.getScalarSizeInBits())) + return SDValue(); + // Get VT for concat of zext sources. 
+ EVT PairVT = ExtVT0.getDoubleNumVectorElementsVT(*DAG.getContext()); + SDValue Concat = + DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(A), PairVT, Ext0, Ext1); + + switch (VT.getSimpleVT().SimpleTy) { + case MVT::v2i64: + case MVT::v4i32: + return DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), VT, Concat); + case MVT::v8i16: { + SDValue Uaddlv = + DAG.getNode(AArch64ISD::UADDLV, SDLoc(A), MVT::v4i32, Concat); + return DAG.getNode(AArch64ISD::NVCAST, SDLoc(A), MVT::v8i16, Uaddlv); + } + default: + llvm_unreachable("Unhandled vector type"); + } +} + static SDValue performUADDVCombine(SDNode *N, SelectionDAG &DAG) { SDValue A = N->getOperand(0); - if (A.getOpcode() == ISD::ADD) + if (A.getOpcode() == ISD::ADD) { if (SDValue R = performUADDVAddCombine(A, DAG)) return DAG.getNode(N->getOpcode(), SDLoc(N), N->getValueType(0), R); + else if (SDValue R = performUADDVZextCombine(A, DAG)) + return R; + } return SDValue(); } diff --git a/llvm/test/CodeGen/AArch64/aarch64-combine-add-zext.ll b/llvm/test/CodeGen/AArch64/aarch64-combine-add-zext.ll new file mode 100644 index 0000000000000..b1b995931ac02 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-combine-add-zext.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s + +define i16 @test_add_zext_v8i16(<8 x i8> %a, <8 x i8> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_add_zext_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: uaddlv h0, v0.16b +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: ret + %z1 = zext <8 x i8> %a to <8 x i16> + %z2 = zext <8 x i8> %b to <8 x i16> + %z = add <8 x i16> %z1, %z2 + %r = call i16 @llvm.vector.reduce.add.v8i16(<8 x i16> %z) + ret i16 %r +} + +define i32 @test_add_zext_v4i32(<4 x i16> %a, <4 x i16> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_add_zext_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: uaddlv s0, v0.8h +; CHECK-NEXT: fmov w0, s0 +; CHECK-NEXT: ret + %z1 = zext <4 x i16> %a to <4 x i32> + %z2 = zext <4 x i16> %b to <4 x i32> + %z = add <4 x i32> %z1, %z2 + %r = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %z) + ret i32 %r +} + +define i64 @test_add_zext_v2i64(<2 x i32> %a, <2 x i32> %b) local_unnamed_addr #0 { +; CHECK-LABEL: test_add_zext_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: uaddlv d0, v0.4s +; CHECK-NEXT: fmov x0, d0 +; CHECK-NEXT: ret + %z1 = zext <2 x i32> %a to <2 x i64> + %z2 = zext <2 x i32> %b to <2 x i64> + %z = add <2 x i64> %z1, %z2 + %r = call i64 @llvm.vector.reduce.add.v2i64(<2 x i64> %z) + ret i64 %r +} + +declare i16 @llvm.vector.reduce.add.v8i16(<8 x i16>) +declare i32 @llvm.vector.reduce.add.v4i32(<4 x i32>) +declare i64 @llvm.vector.reduce.add.v2i64(<2 x i64>) diff --git a/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll b/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll index 1fc177f034975..24cce9a2b26b5 100644 --- a/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll +++ b/llvm/test/CodeGen/AArch64/avoid-pre-trunc.ll @@ -18,14 +18,14 @@ define i32 @lower_lshr(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d, < ; CHECK-NEXT: mov v4.s[2], v6.s[0] ; CHECK-NEXT: mov v0.s[3], v1.s[0] ; 
CHECK-NEXT: mov v4.s[3], v3.s[0] -; CHECK-NEXT: xtn v2.4h, v0.4s +; CHECK-NEXT: xtn v1.4h, v0.4s ; CHECK-NEXT: shrn v0.4h, v0.4s, #16 -; CHECK-NEXT: xtn v1.4h, v4.4s +; CHECK-NEXT: xtn v2.4h, v4.4s ; CHECK-NEXT: shrn v3.4h, v4.4s, #16 -; CHECK-NEXT: uhadd v0.4h, v2.4h, v0.4h -; CHECK-NEXT: uhadd v1.4h, v1.4h, v3.4h -; CHECK-NEXT: uaddl v0.4s, v0.4h, v1.4h -; CHECK-NEXT: addv s0, v0.4s +; CHECK-NEXT: uhadd v0.4h, v1.4h, v0.4h +; CHECK-NEXT: uhadd v1.4h, v2.4h, v3.4h +; CHECK-NEXT: mov v0.d[1], v1.d[0] +; CHECK-NEXT: uaddlv s0, v0.8h ; CHECK-NEXT: fmov w0, s0 ; CHECK-NEXT: ret %l87 = tail call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> %a) diff --git a/llvm/test/CodeGen/AArch64/vecreduce-add.ll b/llvm/test/CodeGen/AArch64/vecreduce-add.ll index 0b43e3b695a39..ad82d2e7955c2 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-add.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-add.ll @@ -1434,37 +1434,21 @@ entry: } define zeroext i8 @add_v16i8_v16i8_acc(<16 x i8> %x, i8 %a) { -; CHECK-SD-BASE-LABEL: add_v16i8_v16i8_acc: -; CHECK-SD-BASE: // %bb.0: // %entry -; CHECK-SD-BASE-NEXT: addv b0, v0.16b -; CHECK-SD-BASE-NEXT: fmov w8, s0 -; CHECK-SD-BASE-NEXT: add w8, w8, w0 -; CHECK-SD-BASE-NEXT: and w0, w8, #0xff -; CHECK-SD-BASE-NEXT: ret -; -; CHECK-SD-DOT-LABEL: add_v16i8_v16i8_acc: -; CHECK-SD-DOT: // %bb.0: // %entry -; CHECK-SD-DOT-NEXT: addv b0, v0.16b -; CHECK-SD-DOT-NEXT: fmov w8, s0 -; CHECK-SD-DOT-NEXT: add w8, w8, w0 -; CHECK-SD-DOT-NEXT: and w0, w8, #0xff -; CHECK-SD-DOT-NEXT: ret -; -; CHECK-GI-BASE-LABEL: add_v16i8_v16i8_acc: -; CHECK-GI-BASE: // %bb.0: // %entry -; CHECK-GI-BASE-NEXT: addv b0, v0.16b -; CHECK-GI-BASE-NEXT: fmov w8, s0 -; CHECK-GI-BASE-NEXT: add w8, w0, w8, uxtb -; CHECK-GI-BASE-NEXT: and w0, w8, #0xff -; CHECK-GI-BASE-NEXT: ret +; CHECK-SD-LABEL: add_v16i8_v16i8_acc: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: addv b0, v0.16b +; CHECK-SD-NEXT: fmov w8, s0 +; CHECK-SD-NEXT: add w8, w8, w0 +; CHECK-SD-NEXT: and w0, w8, #0xff +; CHECK-SD-NEXT: ret ; -; CHECK-GI-DOT-LABEL: add_v16i8_v16i8_acc: -; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: addv b0, v0.16b -; CHECK-GI-DOT-NEXT: fmov w8, s0 -; CHECK-GI-DOT-NEXT: add w8, w0, w8, uxtb -; CHECK-GI-DOT-NEXT: and w0, w8, #0xff -; CHECK-GI-DOT-NEXT: ret +; CHECK-GI-LABEL: add_v16i8_v16i8_acc: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: addv b0, v0.16b +; CHECK-GI-NEXT: fmov w8, s0 +; CHECK-GI-NEXT: add w8, w0, w8, uxtb +; CHECK-GI-NEXT: and w0, w8, #0xff +; CHECK-GI-NEXT: ret entry: %z = call i8 @llvm.vector.reduce.add.v16i8(<16 x i8> %x) %r = add i8 %z, %a @@ -1783,8 +1767,10 @@ entry: define i64 @add_pair_v2i32_v2i64_zext(<2 x i32> %x, <2 x i32> %y) { ; CHECK-SD-LABEL: add_pair_v2i32_v2i64_zext: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s -; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: uaddlv d0, v0.4s ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret ; @@ -1889,8 +1875,10 @@ entry: define i32 @add_pair_v4i16_v4i32_zext(<4 x i16> %x, <4 x i16> %y) { ; CHECK-SD-LABEL: add_pair_v4i16_v4i32_zext: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h -; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: uaddlv s0, v0.8h ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: 
ret ; @@ -3298,8 +3286,8 @@ define i64 @add_pair_v2i16_v2i64_zext(<2 x i16> %x, <2 x i16> %y) { ; CHECK-SD-NEXT: movi d2, #0x00ffff0000ffff ; CHECK-SD-NEXT: and v0.8b, v0.8b, v2.8b ; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b -; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s -; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: uaddlv d0, v0.4s ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret ; @@ -3590,10 +3578,12 @@ entry: define i32 @add_pair_v4i8_v4i32_zext(<4 x i8> %x, <4 x i8> %y) { ; CHECK-SD-LABEL: add_pair_v4i8_v4i32_zext: ; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 ; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 ; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-SD-NEXT: uaddl v0.4s, v0.4h, v1.4h -; CHECK-SD-NEXT: addv s0, v0.4s +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: uaddlv s0, v0.8h ; CHECK-SD-NEXT: fmov w0, s0 ; CHECK-SD-NEXT: ret ; @@ -3710,9 +3700,11 @@ entry: define zeroext i16 @add_pair_v8i8_v8i16_zext(<8 x i8> %x, <8 x i8> %y) { ; CHECK-SD-LABEL: add_pair_v8i8_v8i16_zext: ; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: uaddl v0.8h, v0.8b, v1.8b -; CHECK-SD-NEXT: addv h0, v0.8h -; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0 +; CHECK-SD-NEXT: // kill: def $d1 killed $d1 def $q1 +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: uaddlv h0, v0.16b +; CHECK-SD-NEXT: umov w0, v0.h[0] ; CHECK-SD-NEXT: ret ; ; CHECK-GI-LABEL: add_pair_v8i8_v8i16_zext: @@ -4047,8 +4039,8 @@ define i64 @add_pair_v2i8_v2i64_zext(<2 x i8> %x, <2 x i8> %y) { ; CHECK-SD-NEXT: movi d2, #0x0000ff000000ff ; CHECK-SD-NEXT: and v0.8b, v0.8b, v2.8b ; CHECK-SD-NEXT: and v1.8b, v1.8b, v2.8b -; CHECK-SD-NEXT: uaddl v0.2d, v0.2s, v1.2s -; CHECK-SD-NEXT: addp d0, v0.2d +; CHECK-SD-NEXT: mov v0.d[1], v1.d[0] +; CHECK-SD-NEXT: uaddlv d0, v0.4s ; CHECK-SD-NEXT: fmov x0, d0 ; CHECK-SD-NEXT: ret ; @@ -5378,33 +5370,19 @@ entry: } define i32 @extract_hi_lo(<8 x i16> %a) { -; CHECK-SD-BASE-LABEL: extract_hi_lo: -; CHECK-SD-BASE: // %bb.0: // %entry -; CHECK-SD-BASE-NEXT: uaddlv s0, v0.8h -; CHECK-SD-BASE-NEXT: fmov w0, s0 -; CHECK-SD-BASE-NEXT: ret -; -; CHECK-SD-DOT-LABEL: extract_hi_lo: -; CHECK-SD-DOT: // %bb.0: // %entry -; CHECK-SD-DOT-NEXT: uaddlv s0, v0.8h -; CHECK-SD-DOT-NEXT: fmov w0, s0 -; CHECK-SD-DOT-NEXT: ret -; -; CHECK-GI-BASE-LABEL: extract_hi_lo: -; CHECK-GI-BASE: // %bb.0: // %entry -; CHECK-GI-BASE-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-GI-BASE-NEXT: uaddw2 v0.4s, v1.4s, v0.8h -; CHECK-GI-BASE-NEXT: addv s0, v0.4s -; CHECK-GI-BASE-NEXT: fmov w0, s0 -; CHECK-GI-BASE-NEXT: ret +; CHECK-SD-LABEL: extract_hi_lo: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: uaddlv s0, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret ; -; CHECK-GI-DOT-LABEL: extract_hi_lo: -; CHECK-GI-DOT: // %bb.0: // %entry -; CHECK-GI-DOT-NEXT: ushll v1.4s, v0.4h, #0 -; CHECK-GI-DOT-NEXT: uaddw2 v0.4s, v1.4s, v0.8h -; CHECK-GI-DOT-NEXT: addv s0, v0.4s -; CHECK-GI-DOT-NEXT: fmov w0, s0 -; CHECK-GI-DOT-NEXT: ret +; CHECK-GI-LABEL: extract_hi_lo: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: ushll v1.4s, v0.4h, #0 +; CHECK-GI-NEXT: uaddw2 v0.4s, v1.4s, v0.8h +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %e1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %e2 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> @@ -5416,12 +5394,20 @@ entry: } define i32 @extract_hi_hi(<8 x i16> %a) { -; 
CHECK-LABEL: extract_hi_hi: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl2 v0.4s, v0.8h, v0.8h -; CHECK-NEXT: addv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extract_hi_hi: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; CHECK-SD-NEXT: mov v0.d[1], v0.d[0] +; CHECK-SD-NEXT: uaddlv s0, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_hi_hi: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uaddl2 v0.4s, v0.8h, v0.8h +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %e2 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %z2 = zext <4 x i16> %e2 to <4 x i32> @@ -5431,12 +5417,19 @@ entry: } define i32 @extract_lo_lo(<8 x i16> %a) { -; CHECK-LABEL: extract_lo_lo: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: uaddl v0.4s, v0.4h, v0.4h -; CHECK-NEXT: addv s0, v0.4s -; CHECK-NEXT: fmov w0, s0 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: extract_lo_lo: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: mov v0.d[1], v0.d[0] +; CHECK-SD-NEXT: uaddlv s0, v0.8h +; CHECK-SD-NEXT: fmov w0, s0 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: extract_lo_lo: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: uaddl v0.4s, v0.4h, v0.4h +; CHECK-GI-NEXT: addv s0, v0.4s +; CHECK-GI-NEXT: fmov w0, s0 +; CHECK-GI-NEXT: ret entry: %e1 = shufflevector <8 x i16> %a, <8 x i16> undef, <4 x i32> %z1 = zext <4 x i16> %e1 to <4 x i32> From 3c94154c860ea9c2fdd5775bd6ad31ac1db0c261 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Mon, 22 Jan 2024 20:12:39 +0800 Subject: [PATCH 372/843] [mlir] Fix -Wunused-variable in Barvinok.cpp (NFC) llvm-project/mlir/lib/Analysis/Presburger/Barvinok.cpp:262:21: error: unused variable 'd' [-Werror,-Wunused-variable] for (const Point &d : ds) ^ 1 error generated. --- mlir/lib/Analysis/Presburger/Barvinok.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mlir/lib/Analysis/Presburger/Barvinok.cpp b/mlir/lib/Analysis/Presburger/Barvinok.cpp index e0fd0dd8caa4d..d2752ded6b43f 100644 --- a/mlir/lib/Analysis/Presburger/Barvinok.cpp +++ b/mlir/lib/Analysis/Presburger/Barvinok.cpp @@ -259,9 +259,11 @@ std::pair> substituteMuInTerm(unsigned numParams, ParamPoint v, std::vector ds, Point mu) { unsigned numDims = mu.size(); +#ifndef NDEBUG for (const Point &d : ds) assert(d.size() == numDims && "μ has to have the same number of dimensions as the generators!"); +#endif // First, the exponent in the numerator becomes // - (μ • u_1) * (floor(first col of v)) From 09bd2cb70f4db9f6638a2e9c89d0397d051a3897 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 22 Jan 2024 12:01:28 +0000 Subject: [PATCH 373/843] [X86] Add printLaneBroadcast constant comments helper. NFC. Pull out helper instead of repeating switch cases. 
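For context, a hypothetical example of the comments this helper emits (the
register and constant values are ours, not from the patch): broadcasting a
128-bit constant <1,2,3,4> into a 256-bit register produces the comment
`ymm0 = [1,2,3,4,1,2,3,4]`, i.e. the lane contents repeated NumLanes times,
now generated by a single printLaneBroadcast call instead of duplicated
switch cases.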
--- llvm/lib/Target/X86/X86MCInstLower.cpp | 158 +++++++++++-------------- 1 file changed, 72 insertions(+), 86 deletions(-) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 53a81fde12c82..5958face0bab3 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1582,6 +1582,58 @@ static void printZeroUpperMove(const MachineInstr *MI, MCStreamer &OutStreamer, OutStreamer.AddComment(CS.str()); } +static void printLaneBroadcast(const MachineInstr *MI, MCStreamer &OutStreamer, + int NumLanes, int BitWidth) { + assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && + "Unexpected number of operands!"); + if (auto *C = + X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { + int CstEltSize = C->getType()->getScalarSizeInBits(); + + std::string Comment; + raw_string_ostream CS(Comment); + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + if (auto *CDS = dyn_cast(C)) { + int NumElements = CDS->getNumElements(); + if ((BitWidth % CstEltSize) == 0) + NumElements = std::min(NumElements, BitWidth / CstEltSize); + CS << "["; + for (int l = 0; l != NumLanes; ++l) { + for (int i = 0; i < NumElements; ++i) { + if (i != 0 || l != 0) + CS << ","; + if (CDS->getElementType()->isIntegerTy()) + printConstant(CDS->getElementAsAPInt(i), CS); + else if (CDS->getElementType()->isHalfTy() || + CDS->getElementType()->isFloatTy() || + CDS->getElementType()->isDoubleTy()) + printConstant(CDS->getElementAsAPFloat(i), CS); + else + CS << "?"; + } + } + CS << "]"; + OutStreamer.AddComment(CS.str()); + } else if (auto *CV = dyn_cast(C)) { + int NumOperands = CV->getNumOperands(); + if ((BitWidth % CstEltSize) == 0) + NumOperands = std::min(NumOperands, BitWidth / CstEltSize); + CS << "<"; + for (int l = 0; l != NumLanes; ++l) { + for (int i = 0; i < NumOperands; ++i) { + if (i != 0 || l != 0) + CS << ","; + printConstant(CV->getOperand(i), + CV->getType()->getPrimitiveSizeInBits(), CS); + } + } + CS << ">"; + OutStreamer.AddComment(CS.str()); + } + } +} + void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); assert((getSubtarget().isOSWindows() || TM.getTargetTriple().isUEFI()) && @@ -1908,102 +1960,36 @@ static void addConstantComments(const MachineInstr *MI, #define CASE_512_MOV_RM() \ MOV_AVX512_CASE(Z) -#define CASE_ALL_MOV_RM() \ - MOV_CASE(, ) /* SSE */ \ - MOV_CASE(V, ) /* AVX-128 */ \ - MOV_CASE(V, Y) /* AVX-256 */ \ - MOV_AVX512_CASE(Z) \ - MOV_AVX512_CASE(Z256) \ - MOV_AVX512_CASE(Z128) - // For loads from a constant pool to a vector register, print the constant // loaded. 
- CASE_ALL_MOV_RM() + CASE_128_MOV_RM() + printLaneBroadcast(MI, OutStreamer, 1, 128); + break; + CASE_256_MOV_RM() + printLaneBroadcast(MI, OutStreamer, 1, 256); + break; + CASE_512_MOV_RM() + printLaneBroadcast(MI, OutStreamer, 1, 512); + break; case X86::VBROADCASTF128rm: case X86::VBROADCASTI128rm: case X86::VBROADCASTF32X4Z256rm: - case X86::VBROADCASTF32X4rm: - case X86::VBROADCASTF32X8rm: case X86::VBROADCASTF64X2Z128rm: - case X86::VBROADCASTF64X2rm: - case X86::VBROADCASTF64X4rm: case X86::VBROADCASTI32X4Z256rm: - case X86::VBROADCASTI32X4rm: - case X86::VBROADCASTI32X8rm: case X86::VBROADCASTI64X2Z128rm: + printLaneBroadcast(MI, OutStreamer, 2, 128); + break; + case X86::VBROADCASTF32X4rm: + case X86::VBROADCASTF64X2rm: + case X86::VBROADCASTI32X4rm: case X86::VBROADCASTI64X2rm: + printLaneBroadcast(MI, OutStreamer, 4, 128); + break; + case X86::VBROADCASTF32X8rm: + case X86::VBROADCASTF64X4rm: + case X86::VBROADCASTI32X8rm: case X86::VBROADCASTI64X4rm: - assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && - "Unexpected number of operands!"); - if (auto *C = - X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { - int NumLanes = 1; - int BitWidth = 128; - int CstEltSize = C->getType()->getScalarSizeInBits(); - - // Get destination BitWidth + override NumLanes for the broadcasts. - switch (MI->getOpcode()) { - CASE_128_MOV_RM() NumLanes = 1; BitWidth = 128; break; - CASE_256_MOV_RM() NumLanes = 1; BitWidth = 256; break; - CASE_512_MOV_RM() NumLanes = 1; BitWidth = 512; break; - case X86::VBROADCASTF128rm: NumLanes = 2; BitWidth = 128; break; - case X86::VBROADCASTI128rm: NumLanes = 2; BitWidth = 128; break; - case X86::VBROADCASTF32X4Z256rm: NumLanes = 2; BitWidth = 128; break; - case X86::VBROADCASTF32X4rm: NumLanes = 4; BitWidth = 128; break; - case X86::VBROADCASTF32X8rm: NumLanes = 2; BitWidth = 256; break; - case X86::VBROADCASTF64X2Z128rm: NumLanes = 2; BitWidth = 128; break; - case X86::VBROADCASTF64X2rm: NumLanes = 4; BitWidth = 128; break; - case X86::VBROADCASTF64X4rm: NumLanes = 2; BitWidth = 256; break; - case X86::VBROADCASTI32X4Z256rm: NumLanes = 2; BitWidth = 128; break; - case X86::VBROADCASTI32X4rm: NumLanes = 4; BitWidth = 128; break; - case X86::VBROADCASTI32X8rm: NumLanes = 2; BitWidth = 256; break; - case X86::VBROADCASTI64X2Z128rm: NumLanes = 2; BitWidth = 128; break; - case X86::VBROADCASTI64X2rm: NumLanes = 4; BitWidth = 128; break; - case X86::VBROADCASTI64X4rm: NumLanes = 2; BitWidth = 256; break; - } - - std::string Comment; - raw_string_ostream CS(Comment); - const MachineOperand &DstOp = MI->getOperand(0); - CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; - if (auto *CDS = dyn_cast(C)) { - int NumElements = CDS->getNumElements(); - if ((BitWidth % CstEltSize) == 0) - NumElements = std::min(NumElements, BitWidth / CstEltSize); - CS << "["; - for (int l = 0; l != NumLanes; ++l) { - for (int i = 0; i < NumElements; ++i) { - if (i != 0 || l != 0) - CS << ","; - if (CDS->getElementType()->isIntegerTy()) - printConstant(CDS->getElementAsAPInt(i), CS); - else if (CDS->getElementType()->isHalfTy() || - CDS->getElementType()->isFloatTy() || - CDS->getElementType()->isDoubleTy()) - printConstant(CDS->getElementAsAPFloat(i), CS); - else - CS << "?"; - } - } - CS << "]"; - OutStreamer.AddComment(CS.str()); - } else if (auto *CV = dyn_cast(C)) { - int NumOperands = CV->getNumOperands(); - if ((BitWidth % CstEltSize) == 0) - NumOperands = std::min(NumOperands, BitWidth / CstEltSize); - CS << "<"; - for (int l = 0; l != 
NumLanes; ++l) { - for (int i = 0; i < NumOperands; ++i) { - if (i != 0 || l != 0) - CS << ","; - printConstant(CV->getOperand(i), - CV->getType()->getPrimitiveSizeInBits(), CS); - } - } - CS << ">"; - OutStreamer.AddComment(CS.str()); - } - } + printLaneBroadcast(MI, OutStreamer, 2, 256); break; case X86::MOVDDUPrm: From 60963272c5c83890910fd97f8d941180b6006fca Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 22 Jan 2024 12:16:12 +0000 Subject: [PATCH 374/843] [X86] Add printElementBroadcast constant comments helper. NFC. Pull out helper instead of repeating switch cases. --- llvm/lib/Target/X86/X86MCInstLower.cpp | 135 ++++++++++++------------- 1 file changed, 65 insertions(+), 70 deletions(-) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 5958face0bab3..c92712df3f767 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1634,6 +1634,28 @@ static void printLaneBroadcast(const MachineInstr *MI, MCStreamer &OutStreamer, } } +static void printElementBroadcast(const MachineInstr *MI, + MCStreamer &OutStreamer, int NumElts, + int EltBits) { + assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && + "Unexpected number of operands!"); + if (auto *C = + X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { + std::string Comment; + raw_string_ostream CS(Comment); + const MachineOperand &DstOp = MI->getOperand(0); + CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; + CS << "["; + for (int i = 0; i != NumElts; ++i) { + if (i != 0) + CS << ","; + printConstant(C, EltBits, CS); + } + CS << "]"; + OutStreamer.AddComment(CS.str()); + } +} + void X86AsmPrinter::EmitSEHInstruction(const MachineInstr *MI) { assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?"); assert((getSubtarget().isOSWindows() || TM.getTargetTriple().isUEFI()) && @@ -1992,90 +2014,63 @@ static void addConstantComments(const MachineInstr *MI, printLaneBroadcast(MI, OutStreamer, 2, 256); break; + // For broadcast loads from a constant pool to a vector register, repeatedly + // print the constant loaded. 
case X86::MOVDDUPrm: case X86::VMOVDDUPrm: case X86::VMOVDDUPZ128rm: - case X86::VBROADCASTSSrm: - case X86::VBROADCASTSSYrm: - case X86::VBROADCASTSSZ128rm: - case X86::VBROADCASTSSZ256rm: - case X86::VBROADCASTSSZrm: + case X86::VPBROADCASTQrm: + case X86::VPBROADCASTQZ128rm: + printElementBroadcast(MI, OutStreamer, 2, 64); + break; case X86::VBROADCASTSDYrm: case X86::VBROADCASTSDZ256rm: + case X86::VPBROADCASTQYrm: + case X86::VPBROADCASTQZ256rm: + printElementBroadcast(MI, OutStreamer, 4, 64); + break; case X86::VBROADCASTSDZrm: - case X86::VPBROADCASTBrm: - case X86::VPBROADCASTBYrm: - case X86::VPBROADCASTBZ128rm: - case X86::VPBROADCASTBZ256rm: - case X86::VPBROADCASTBZrm: + case X86::VPBROADCASTQZrm: + printElementBroadcast(MI, OutStreamer, 8, 64); + break; + case X86::VBROADCASTSSrm: + case X86::VBROADCASTSSZ128rm: case X86::VPBROADCASTDrm: - case X86::VPBROADCASTDYrm: case X86::VPBROADCASTDZ128rm: + printElementBroadcast(MI, OutStreamer, 4, 32); + break; + case X86::VBROADCASTSSYrm: + case X86::VBROADCASTSSZ256rm: + case X86::VPBROADCASTDYrm: case X86::VPBROADCASTDZ256rm: + printElementBroadcast(MI, OutStreamer, 8, 32); + break; + case X86::VBROADCASTSSZrm: case X86::VPBROADCASTDZrm: - case X86::VPBROADCASTQrm: - case X86::VPBROADCASTQYrm: - case X86::VPBROADCASTQZ128rm: - case X86::VPBROADCASTQZ256rm: - case X86::VPBROADCASTQZrm: + printElementBroadcast(MI, OutStreamer, 16, 32); + break; case X86::VPBROADCASTWrm: - case X86::VPBROADCASTWYrm: case X86::VPBROADCASTWZ128rm: + printElementBroadcast(MI, OutStreamer, 8, 16); + break; + case X86::VPBROADCASTWYrm: case X86::VPBROADCASTWZ256rm: + printElementBroadcast(MI, OutStreamer, 16, 16); + break; case X86::VPBROADCASTWZrm: - assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && - "Unexpected number of operands!"); - if (auto *C = - X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { - int NumElts, EltBits; - switch (MI->getOpcode()) { - default: llvm_unreachable("Invalid opcode"); - case X86::MOVDDUPrm: NumElts = 2; EltBits = 64; break; - case X86::VMOVDDUPrm: NumElts = 2; EltBits = 64; break; - case X86::VMOVDDUPZ128rm: NumElts = 2; EltBits = 64; break; - case X86::VBROADCASTSSrm: NumElts = 4; EltBits = 32; break; - case X86::VBROADCASTSSYrm: NumElts = 8; EltBits = 32; break; - case X86::VBROADCASTSSZ128rm: NumElts = 4; EltBits = 32; break; - case X86::VBROADCASTSSZ256rm: NumElts = 8; EltBits = 32; break; - case X86::VBROADCASTSSZrm: NumElts = 16; EltBits = 32; break; - case X86::VBROADCASTSDYrm: NumElts = 4; EltBits = 64; break; - case X86::VBROADCASTSDZ256rm: NumElts = 4; EltBits = 64; break; - case X86::VBROADCASTSDZrm: NumElts = 8; EltBits = 64; break; - case X86::VPBROADCASTBrm: NumElts = 16; EltBits = 8; break; - case X86::VPBROADCASTBYrm: NumElts = 32; EltBits = 8; break; - case X86::VPBROADCASTBZ128rm: NumElts = 16; EltBits = 8; break; - case X86::VPBROADCASTBZ256rm: NumElts = 32; EltBits = 8; break; - case X86::VPBROADCASTBZrm: NumElts = 64; EltBits = 8; break; - case X86::VPBROADCASTDrm: NumElts = 4; EltBits = 32; break; - case X86::VPBROADCASTDYrm: NumElts = 8; EltBits = 32; break; - case X86::VPBROADCASTDZ128rm: NumElts = 4; EltBits = 32; break; - case X86::VPBROADCASTDZ256rm: NumElts = 8; EltBits = 32; break; - case X86::VPBROADCASTDZrm: NumElts = 16; EltBits = 32; break; - case X86::VPBROADCASTQrm: NumElts = 2; EltBits = 64; break; - case X86::VPBROADCASTQYrm: NumElts = 4; EltBits = 64; break; - case X86::VPBROADCASTQZ128rm: NumElts = 2; EltBits = 64; break; - case X86::VPBROADCASTQZ256rm: 
NumElts = 4; EltBits = 64; break;
-    case X86::VPBROADCASTQZrm:    NumElts = 8;  EltBits = 64; break;
-    case X86::VPBROADCASTWrm:     NumElts = 8;  EltBits = 16; break;
-    case X86::VPBROADCASTWYrm:    NumElts = 16; EltBits = 16; break;
-    case X86::VPBROADCASTWZ128rm: NumElts = 8;  EltBits = 16; break;
-    case X86::VPBROADCASTWZ256rm: NumElts = 16; EltBits = 16; break;
-    case X86::VPBROADCASTWZrm:    NumElts = 32; EltBits = 16; break;
-    }
-
-    std::string Comment;
-    raw_string_ostream CS(Comment);
-    const MachineOperand &DstOp = MI->getOperand(0);
-    CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
-    CS << "[";
-    for (int i = 0; i != NumElts; ++i) {
-      if (i != 0)
-        CS << ",";
-      printConstant(C, EltBits, CS);
-    }
-    CS << "]";
-    OutStreamer.AddComment(CS.str());
-  }
+    printElementBroadcast(MI, OutStreamer, 32, 16);
+    break;
+  case X86::VPBROADCASTBrm:
+  case X86::VPBROADCASTBZ128rm:
+    printElementBroadcast(MI, OutStreamer, 16, 8);
+    break;
+  case X86::VPBROADCASTBYrm:
+  case X86::VPBROADCASTBZ256rm:
+    printElementBroadcast(MI, OutStreamer, 32, 8);
+    break;
+  case X86::VPBROADCASTBZrm:
+    printElementBroadcast(MI, OutStreamer, 64, 8);
+    break;
   }
 }

From df4ba00c7b50429fa88c3a9991e9194e2422dc76 Mon Sep 17 00:00:00 2001
From: Hirofumi Nakamura
Date: Mon, 22 Jan 2024 21:35:01 +0900
Subject: [PATCH 375/843] [clang-format] Support of TableGen statements in
 unwrapped line parser (#78846)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make TableGen's statements be parsed according to their structure.
- Avoid parsing labels
- Avoid `class` being parsed as C++'s class
- Support the if statement of the form `if <cond> then { ... }`
- Support the defset statement of the form `defset <type> <name> = { ... }`

---------

Co-authored-by: Björn Schäpers
---
 clang/lib/Format/UnwrappedLineParser.cpp      | 27 ++++++++++++++++++-
 clang/unittests/Format/TokenAnnotatorTest.cpp | 12 +++++++
 2 files changed, 38 insertions(+), 1 deletion(-)

diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp
index 7bc6410a78a49..b904e0e56d9eb 100644
--- a/clang/lib/Format/UnwrappedLineParser.cpp
+++ b/clang/lib/Format/UnwrappedLineParser.cpp
@@ -1661,7 +1661,8 @@ void UnwrappedLineParser::parseStructuralElement(
   // In Verilog labels can be any expression, so we don't do them here.
   // JS doesn't have macros, and within classes colons indicate fields, not
   // labels.
-  if (!Style.isJavaScript() && !Style.isVerilog() &&
+  // TableGen doesn't have labels.
+  if (!Style.isJavaScript() && !Style.isVerilog() && !Style.isTableGen() &&
       Tokens->peekNextToken()->is(tok::colon) && !Line->MustBeDeclaration) {
     nextToken();
     Line->Tokens.begin()->Tok->MustBreakBefore = true;
@@ -1790,6 +1791,12 @@ void UnwrappedLineParser::parseStructuralElement(
       addUnwrappedLine();
       return;
     }
+    if (Style.isTableGen()) {
+      // Do nothing special. In this case the l_brace becomes FunctionLBrace.
+      // This is the same as def and so on.
+      nextToken();
+      break;
+    }
    [[fallthrough]];
  case tok::kw_struct:
  case tok::kw_union:
@@ -2028,6 +2035,16 @@ void UnwrappedLineParser::parseStructuralElement(
       // initialisers are indented the same way.
       if (Style.isCSharp())
         FormatTok->setBlockKind(BK_BracedInit);
+      // TableGen's defset statement has syntax of the form,
+      // `defset <type> <name> = { <statement>... 
}`
+      if (Style.isTableGen() &&
+          Line->Tokens.begin()->Tok->is(Keywords.kw_defset)) {
+        FormatTok->setFinalizedType(TT_FunctionLBrace);
+        parseBlock(/*MustBeDeclaration=*/false, /*AddLevels=*/1u,
+                   /*MunchSemi=*/false);
+        addUnwrappedLine();
+        break;
+      }
       nextToken();
       parseBracedList();
     } else if (Style.Language == FormatStyle::LK_Proto &&
@@ -2743,6 +2760,14 @@ FormatToken *UnwrappedLineParser::parseIfThenElse(IfStmtKind *IfKind,
     }
   }
 
+  // TableGen's if statement has the form of `if <cond> then { ... }`.
+  if (Style.isTableGen()) {
+    while (!eof() && FormatTok->isNot(Keywords.kw_then)) {
+      // Simply skip until then. This range only contains a value.
+      nextToken();
+    }
+  }
+
   // Handle `if !consteval`.
   if (FormatTok->is(tok::exclaim))
     nextToken();
diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp
index b1e1e70a1abbf..88ca8a6cb39e3 100644
--- a/clang/unittests/Format/TokenAnnotatorTest.cpp
+++ b/clang/unittests/Format/TokenAnnotatorTest.cpp
@@ -2232,6 +2232,18 @@ TEST_F(TokenAnnotatorTest, UnderstandTableGenTokens) {
   EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
   Tokens = Annotate("01234Vector");
   EXPECT_TOKEN(Tokens[0], tok::identifier, TT_Unknown);
+
+  // Structured statements.
+  Tokens = Annotate("class Foo {}");
+  EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_FunctionLBrace);
+  Tokens = Annotate("def Def: Foo {}");
+  EXPECT_TOKEN(Tokens[2], tok::colon, TT_InheritanceColon);
+  EXPECT_TOKEN(Tokens[4], tok::l_brace, TT_FunctionLBrace);
+  Tokens = Annotate("if cond then {} else {}");
+  EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_ControlStatementLBrace);
+  EXPECT_TOKEN(Tokens[6], tok::l_brace, TT_ElseLBrace);
+  Tokens = Annotate("defset Foo Def2 = {}");
+  EXPECT_TOKEN(Tokens[4], tok::l_brace, TT_FunctionLBrace);
 }
 
 TEST_F(TokenAnnotatorTest, UnderstandConstructors) {

From 27ce26b06655cfece3d54b30e442ef93d3e78ac7 Mon Sep 17 00:00:00 2001
From: bd1976bris
Date: Mon, 22 Jan 2024 12:37:11 +0000
Subject: [PATCH 376/843] [Sema] Add `-fvisibility-global-new-delete=` option
 (#75364)

[Sema] Add `-fvisibility-global-new-delete=` option (#75364)

By default the implicitly declared replaceable global new and delete
operators are given a default visibility attribute.

Previous work, see: https://reviews.llvm.org/D53787, added
`-fvisibility-global-new-delete-hidden` to change this to a hidden
visibility attribute.

This change adds `-fvisibility-global-new-delete=` which controls
whether (or not) to add an implicit visibility attribute to the
implicit declarations for these functions, and what visibility that
attribute will specify.

The option takes 4 values: `force-hidden`, `force-protected`,
`force-default` and `source`. Option values `force-hidden`,
`force-protected` and `force-default` assign hidden, protected, and
default visibilities respectively; the use of the term force in the
value names is designed to imply to a user that the semantics of this
option differ significantly from `-fvisibility=`.

An option value of `source` implies that no implicit attribute is
added; without the attribute the replaceable global new and delete
operators behave normally (like other functions) with respect to
visibility attributes, pragmas and options. The motivation for the
`source` value is to facilitate users who intend to replace these
functions either for a single linkage unit or a limited set of linkage
units.
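
As a rough sketch of that replacement scenario (illustrative only, not
part of this patch; the GNU attribute spelling and the free-based body
are assumptions):

  // Built with -fvisibility=hidden -fvisibility-global-new-delete=source,
  // the implicit declaration carries no forced visibility, so this
  // explicit attribute takes effect and the symbol remains replaceable
  // across shared-library boundaries.
  #include <cstdlib>
  #include <new>

  __attribute__((visibility("default")))
  void operator delete(void *p) noexcept {
    std::free(p); // sketch; a real replacement pairs this with operator new
  }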
`-fvisibility-global-new-delete=source` can be applied globally to the compilations in a build where the existing `-fvisibility-global-new-delete-hidden` cannot, as it conflicts with a common pattern where these functions are dynamically imported. The existing `-fvisibility-global-new-delete-hidden` is now a deprecated spelling of `-fvisibility-global-new-delete=force-hidden` A release note has been added for these changes. `-fvisibility-global-new-delete=source` will be set by default for PS5. PS5 users that want the normal toolchain behaviour will be able to supply `-fvisibility-global-new-delete=force-default`. --- clang/docs/ReleaseNotes.rst | 15 +++++- clang/include/clang/Basic/LangOptions.def | 3 +- clang/include/clang/Basic/LangOptions.h | 31 +++++++++++ clang/include/clang/Driver/Options.td | 11 +++- clang/lib/Driver/ToolChains/Clang.cpp | 23 +++++++- clang/lib/Driver/ToolChains/PS4CPU.cpp | 6 +++ clang/lib/Sema/SemaExprCXX.cpp | 11 ++-- .../visibility-global-new-delete.cpp | 14 +++++ .../Driver/visibility-global-new-delete.cl | 52 +++++++++++++++++++ 9 files changed, 157 insertions(+), 9 deletions(-) create mode 100644 clang/test/CodeGenCXX/visibility-global-new-delete.cpp create mode 100644 clang/test/Driver/visibility-global-new-delete.cl diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 45fc8bbbb6862..5846503af3acd 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -320,7 +320,7 @@ New Compiler Flags handlers will be smaller. A throw expression of a type with a potentially-throwing destructor will lead to an error. -* ``-fopenacc`` was added as a part of the effort to support OpenACC in clang. +* ``-fopenacc`` was added as a part of the effort to support OpenACC in Clang. * ``-fcx-limited-range`` enables the naive mathematical formulas for complex division and multiplication with no NaN checking of results. The default is @@ -331,6 +331,16 @@ New Compiler Flags division. See SMITH, R. L. Algorithm 116: Complex division. Commun. ACM 5, 8 (1962). The default is ``-fno-cx-fortran-rules``. +* ``-fvisibility-global-new-delete=`` gives more freedom to users to + control how and if Clang forces a visibility for the replaceable new and + delete declarations. The option takes 4 values: ``force-hidden``, + ``force-protected``, ``force-default`` and ``source``; ``force-default`` is + the default. Option values with prefix ``force-`` assign such declarations + an implicit visibility attribute with the corresponding visibility. An option + value of ``source`` implies that no implicit attribute is added. Without the + attribute the replaceable global new and delete operators behave normally + (like other functions) with respect to visibility attributes, pragmas and + options (e.g ``--fvisibility=``). Deprecated Compiler Flags ------------------------- @@ -347,6 +357,9 @@ Modified Compiler Flags ``rtdcall``. This new default CC only works for M68k and will use the new ``m68k_rtdcc`` CC on every functions that are not variadic. The ``-mrtd`` driver/frontend flag has the same effect when targeting M68k. +* ``-fvisibility-global-new-delete-hidden`` is now a deprecated spelling of + ``-fvisibility-global-new-delete=force-hidden`` (``-fvisibility-global-new-delete=`` + is new in this release). 
Removed Compiler Flags ------------------------- diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index f181da9aea6f9..c0ab6b97dd8db 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -308,7 +308,8 @@ BENIGN_LANGOPT(IgnoreXCOFFVisibility, 1, 0, "All the visibility attributes that BENIGN_LANGOPT(VisibilityInlinesHiddenStaticLocalVar, 1, 0, "hidden visibility for static local variables in inline C++ " "methods when -fvisibility-inlines hidden is enabled") -LANGOPT(GlobalAllocationFunctionVisibilityHidden , 1, 0, "hidden visibility for global operator new and delete declaration") +ENUM_LANGOPT(GlobalAllocationFunctionVisibility, VisibilityForcedKinds, 3, VisibilityForcedKinds::ForceDefault, + "How to apply visibility to global operator new and delete declarations") LANGOPT(NewInfallible , 1, 0, "Treats throwing global C++ operator new as always returning valid memory (annotates with __attribute__((returns_nonnull)) and throw()). This is detectable in source.") BENIGN_LANGOPT(ParseUnknownAnytype, 1, 0, "__unknown_anytype") BENIGN_LANGOPT(DebuggerSupport , 1, 0, "debugger support") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 976a00d1a6fa5..79947431e4e2c 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -381,6 +381,17 @@ class LangOptions : public LangOptionsBase { All, }; + enum class VisibilityForcedKinds { + /// Force hidden visibility + ForceHidden, + /// Force protected visibility + ForceProtected, + /// Force default visibility + ForceDefault, + /// Don't alter the visibility + Source, + }; + enum class VisibilityFromDLLStorageClassKinds { /// Keep the IR-gen assigned visibility. Keep, @@ -673,6 +684,26 @@ class LangOptions : public LangOptionsBase { DefaultVisiblityExportMapping::All; } + bool hasGlobalAllocationFunctionVisibility() const { + return getGlobalAllocationFunctionVisibility() != + VisibilityForcedKinds::Source; + } + + bool hasDefaultGlobalAllocationFunctionVisibility() const { + return getGlobalAllocationFunctionVisibility() == + VisibilityForcedKinds::ForceDefault; + } + + bool hasProtectedGlobalAllocationFunctionVisibility() const { + return getGlobalAllocationFunctionVisibility() == + VisibilityForcedKinds::ForceProtected; + } + + bool hasHiddenGlobalAllocationFunctionVisibility() const { + return getGlobalAllocationFunctionVisibility() == + VisibilityForcedKinds::ForceHidden; + } + /// Remap path prefix according to -fmacro-prefix-path option. 
void remapPathPrefix(SmallVectorImpl &Path) const; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 199537404d0c6..95579d30cd293 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3910,9 +3910,16 @@ def fvisibility_ms_compat : Flag<["-"], "fvisibility-ms-compat">, Group HelpText<"Give global types 'default' visibility and global functions and " "variables 'hidden' visibility by default">; def fvisibility_global_new_delete_hidden : Flag<["-"], "fvisibility-global-new-delete-hidden">, Group, - HelpText<"Give global C++ operator new and delete declarations hidden visibility">, + HelpText<"Give global C++ operator new and delete declarations hidden visibility">; +class MarshallingInfoVisibilityGlobalNewDelete + : MarshallingInfoEnum, + Values<"force-default,force-protected,force-hidden,source">, + NormalizedValuesScope<"LangOptions::VisibilityForcedKinds">, + NormalizedValues<["ForceDefault", "ForceProtected", "ForceHidden", "Source"]> {} +def fvisibility_global_new_delete_EQ : Joined<["-"], "fvisibility-global-new-delete=">, Group, Visibility<[ClangOption, CC1Option]>, - MarshallingInfoFlag>; + HelpText<"The visibility for global C++ operator new and delete declarations. If 'source' is specified the visibility is not adjusted">, + MarshallingInfoVisibilityGlobalNewDelete, "ForceDefault">; def mdefault_visibility_export_mapping_EQ : Joined<["-"], "mdefault-visibility-export-mapping=">, Values<"none,explicit,all">, NormalizedValuesScope<"LangOptions::DefaultVisiblityExportMapping">, diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index fead2e884030e..f80f8e44de1c4 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6325,7 +6325,28 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, Args.AddLastArg(CmdArgs, options::OPT_fvisibility_inlines_hidden_static_local_var, options::OPT_fno_visibility_inlines_hidden_static_local_var); - Args.AddLastArg(CmdArgs, options::OPT_fvisibility_global_new_delete_hidden); + + // -fvisibility-global-new-delete-hidden is a deprecated spelling of + // -fvisibility-global-new-delete=force-hidden. + if (const Arg *A = + Args.getLastArg(options::OPT_fvisibility_global_new_delete_hidden)) { + D.Diag(diag::warn_drv_deprecated_arg) + << A->getAsString(Args) + << "-fvisibility-global-new-delete=force-hidden"; + } + + if (const Arg *A = + Args.getLastArg(options::OPT_fvisibility_global_new_delete_EQ, + options::OPT_fvisibility_global_new_delete_hidden)) { + if (A->getOption().matches(options::OPT_fvisibility_global_new_delete_EQ)) { + A->render(Args, CmdArgs); + } else { + assert(A->getOption().matches( + options::OPT_fvisibility_global_new_delete_hidden)); + CmdArgs.push_back("-fvisibility-global-new-delete=force-hidden"); + } + } + Args.AddLastArg(CmdArgs, options::OPT_ftlsmodel_EQ); if (Args.hasFlag(options::OPT_fnew_infallible, diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 287d2e98931da..8ba8b80cfec78 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -359,6 +359,12 @@ void toolchains::PS4PS5Base::addClangTargetOptions( CC1Args.push_back("-fno-use-init-array"); + // Default to -fvisibility-global-new-delete=source for PS5. 
+ if (getTriple().isPS5() && + !DriverArgs.hasArg(options::OPT_fvisibility_global_new_delete_EQ, + options::OPT_fvisibility_global_new_delete_hidden)) + CC1Args.push_back("-fvisibility-global-new-delete=source"); + const Arg *A = DriverArgs.getLastArg(options::OPT_fvisibility_from_dllstorageclass, options::OPT_fno_visibility_from_dllstorageclass); diff --git a/clang/lib/Sema/SemaExprCXX.cpp b/clang/lib/Sema/SemaExprCXX.cpp index 482afb5a677ec..953bfe484a528 100644 --- a/clang/lib/Sema/SemaExprCXX.cpp +++ b/clang/lib/Sema/SemaExprCXX.cpp @@ -3216,10 +3216,13 @@ void Sema::DeclareGlobalAllocationFunction(DeclarationName Name, Alloc->setLocalOwningModule(TheGlobalModuleFragment); } - Alloc->addAttr(VisibilityAttr::CreateImplicit( - Context, LangOpts.GlobalAllocationFunctionVisibilityHidden - ? VisibilityAttr::Hidden - : VisibilityAttr::Default)); + if (LangOpts.hasGlobalAllocationFunctionVisibility()) + Alloc->addAttr(VisibilityAttr::CreateImplicit( + Context, LangOpts.hasHiddenGlobalAllocationFunctionVisibility() + ? VisibilityAttr::Hidden + : LangOpts.hasProtectedGlobalAllocationFunctionVisibility() + ? VisibilityAttr::Protected + : VisibilityAttr::Default)); llvm::SmallVector ParamDecls; for (QualType T : Params) { diff --git a/clang/test/CodeGenCXX/visibility-global-new-delete.cpp b/clang/test/CodeGenCXX/visibility-global-new-delete.cpp new file mode 100644 index 0000000000000..4e9445c9a55cc --- /dev/null +++ b/clang/test/CodeGenCXX/visibility-global-new-delete.cpp @@ -0,0 +1,14 @@ +// RUN: %clang_cc1 %s -std=c++11 -triple x86_64-unknown-unknown -fvisibility=hidden -emit-llvm -o - | FileCheck %s -DLINKAGE=dso_local +// RUN: %clang_cc1 %s -std=c++11 -triple x86_64-unknown-unknown -fvisibility=default -fvisibility-global-new-delete=force-hidden -emit-llvm -o - | FileCheck %s -DLINKAGE=hidden +// RUN: %clang_cc1 %s -std=c++11 -triple x86_64-unknown-unknown -fvisibility=hidden -fvisibility-global-new-delete=force-protected -emit-llvm -o - | FileCheck %s -DLINKAGE=protected +// RUN: %clang_cc1 %s -std=c++11 -triple x86_64-unknown-unknown -fvisibility=hidden -fvisibility-global-new-delete=force-default -emit-llvm -o - | FileCheck %s -DLINKAGE=dso_local +// RUN: %clang_cc1 %s -std=c++11 -triple x86_64-unknown-unknown -fvisibility=hidden -fvisibility-global-new-delete=source -emit-llvm -o - | FileCheck %s -DLINKAGE=hidden + +namespace std { + typedef __typeof__(sizeof(0)) size_t; + struct nothrow_t {}; +} + +// Definition which inherits visibility from the implicit compiler generated declaration. +void operator delete(void*) throw() {} +// CHECK: define [[LINKAGE]] void @_ZdlPv diff --git a/clang/test/Driver/visibility-global-new-delete.cl b/clang/test/Driver/visibility-global-new-delete.cl new file mode 100644 index 0000000000000..3e7f3c8cd1acb --- /dev/null +++ b/clang/test/Driver/visibility-global-new-delete.cl @@ -0,0 +1,52 @@ +/// Check driver handling for "-fvisibility-global-new-delete-hidden" and "-fvisibility-global-new-delete=". + +/// These options are not added by default. +// RUN: %clang -### --target=x86_64-unknown-unknown -c -emit-llvm %s 2>&1 | \ +// RUN: FileCheck -check-prefix=DEFAULTS %s +// DEFAULTS-NOT: "-fvisibility-global-new-delete=" +// DEFAULTS-NOT: "-fvisibility-global-new-delete-hidden" + +// DEFINE: %{implicit-check-nots} = --implicit-check-not=-fvisibility-global-new-delete= --implicit-check-not=-fvisibility-global-new-delete-hidden + +/// "-fvisibility-global-new-delete=source" added by default for PS5. 
+// RUN: %clang -### --target=x86_64-sie-ps5 -c -emit-llvm %s 2>&1 | \
+// RUN:   FileCheck -check-prefix=PS5 %s
+// PS5: "-fvisibility-global-new-delete=source"
+
+/// -fvisibility-global-new-delete-hidden added explicitly.
+// RUN: %clang -### --target=x86_64-unknown-unknown -c -emit-llvm \
+// RUN:   -fvisibility-global-new-delete-hidden %s 2>&1 | FileCheck -check-prefixes=VGNDH,DEPRECATED %s %{implicit-check-nots}
+// RUN: %clang -### --target=x86_64-sie-ps5 -c -emit-llvm \
+// RUN:   -fvisibility-global-new-delete-hidden %s 2>&1 | FileCheck -check-prefixes=VGNDH,DEPRECATED %s %{implicit-check-nots}
+// DEPRECATED-DAG: clang: warning: argument '-fvisibility-global-new-delete-hidden' is deprecated, use '-fvisibility-global-new-delete=force-hidden' instead [-Wdeprecated]
+// VGNDH-DAG: "-fvisibility-global-new-delete=force-hidden"
+
+/// -fvisibility-global-new-delete=force-hidden added explicitly.
+// RUN: %clang -### --target=x86_64-unknown-unknown -c -emit-llvm \
+// RUN:   -fvisibility-global-new-delete=force-hidden %s 2>&1 | FileCheck -check-prefixes=VGNDH %s %{implicit-check-nots}
+// RUN: %clang -### --target=x86_64-sie-ps5 -c -emit-llvm \
+// RUN:   -fvisibility-global-new-delete=force-hidden %s 2>&1 | FileCheck -check-prefixes=VGNDH %s %{implicit-check-nots}
+
+/// -fvisibility-global-new-delete=force-protected added explicitly.
+// RUN: %clang -### --target=x86_64-unknown-unknown -c -emit-llvm \
+// RUN:   -fvisibility-global-new-delete=force-protected %s 2>&1 | FileCheck -check-prefixes=VGNDP %s %{implicit-check-nots}
+// RUN: %clang -### --target=x86_64-sie-ps5 -c -emit-llvm \
+// RUN:   -fvisibility-global-new-delete=force-protected %s 2>&1 | FileCheck -check-prefixes=VGNDP %s %{implicit-check-nots}
+// VGNDP-DAG: "-fvisibility-global-new-delete=force-protected"
+
+/// -fvisibility-global-new-delete=force-default added explicitly.
+// RUN: %clang -### --target=x86_64-unknown-unknown -c -emit-llvm \
+// RUN:   -fvisibility-global-new-delete=force-default %s 2>&1 | FileCheck -check-prefixes=VGNDD %s %{implicit-check-nots}
+// RUN: %clang -### --target=x86_64-sie-ps5 -c -emit-llvm \
+// RUN:   -fvisibility-global-new-delete=force-default %s 2>&1 | FileCheck -check-prefixes=VGNDD %s %{implicit-check-nots}
+// VGNDD-DAG: "-fvisibility-global-new-delete=force-default"
+
+/// last specified used: -fvisibility-global-new-delete-hidden.
+// RUN: %clang -### --target=x86_64-unknown-unknown -c -emit-llvm \
+// RUN:   -fvisibility-global-new-delete=force-default -fvisibility-global-new-delete=force-protected -fvisibility-global-new-delete-hidden %s 2>&1 | \
+// RUN:   FileCheck -check-prefixes=VGNDH,DEPRECATED %s %{implicit-check-nots}
+
+/// last specified used: -fvisibility-global-new-delete=.
+// RUN: %clang -### --target=x86_64-unknown-unknown -c -emit-llvm \ +// RUN: -fvisibility-global-new-delete-hidden -fvisibility-global-new-delete=force-default -fvisibility-global-new-delete=force-protected %s 2>&1 | \ +// RUN: FileCheck -check-prefixes=VGNDP,DEPRECATED %s %{implicit-check-nots} From 88d1de5ec64210686d93a90529583505635d257d Mon Sep 17 00:00:00 2001 From: Emilia Kond Date: Mon, 22 Jan 2024 14:53:47 +0200 Subject: [PATCH 377/843] [clang-format][NFC] Unify token size tests to use ASSERT_EQ --- clang/unittests/Format/TokenAnnotatorTest.cpp | 134 +++++++++--------- 1 file changed, 67 insertions(+), 67 deletions(-) diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 88ca8a6cb39e3..3dbf504c35ed5 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -50,7 +50,7 @@ class TokenAnnotatorTest : public ::testing::Test { TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { auto Tokens = annotate("auto x = [](const decltype(x) &ptr) {};"); - EXPECT_EQ(Tokens.size(), 18u) << Tokens; + ASSERT_EQ(Tokens.size(), 18u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::kw_decltype, TT_Unknown); EXPECT_TOKEN(Tokens[8], tok::l_paren, TT_TypeDeclarationParen); EXPECT_TOKEN(Tokens[9], tok::identifier, TT_Unknown); @@ -58,12 +58,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { EXPECT_TOKEN(Tokens[11], tok::amp, TT_PointerOrReference); Tokens = annotate("auto x = [](const decltype(x) *ptr) {};"); - EXPECT_EQ(Tokens.size(), 18u) << Tokens; + ASSERT_EQ(Tokens.size(), 18u) << Tokens; EXPECT_TOKEN(Tokens[10], tok::r_paren, TT_TypeDeclarationParen); EXPECT_TOKEN(Tokens[11], tok::star, TT_PointerOrReference); Tokens = annotate("#define lambda [](const decltype(x) &ptr) {}"); - EXPECT_EQ(Tokens.size(), 17u) << Tokens; + ASSERT_EQ(Tokens.size(), 17u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::kw_decltype, TT_Unknown); EXPECT_TOKEN(Tokens[8], tok::l_paren, TT_TypeDeclarationParen); EXPECT_TOKEN(Tokens[9], tok::identifier, TT_Unknown); @@ -71,7 +71,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { EXPECT_TOKEN(Tokens[11], tok::amp, TT_PointerOrReference); Tokens = annotate("#define lambda [](const decltype(x) *ptr) {}"); - EXPECT_EQ(Tokens.size(), 17u) << Tokens; + ASSERT_EQ(Tokens.size(), 17u) << Tokens; EXPECT_TOKEN(Tokens[10], tok::r_paren, TT_TypeDeclarationParen); EXPECT_TOKEN(Tokens[11], tok::star, TT_PointerOrReference); @@ -79,63 +79,63 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { " while (p < a && *p == 'a')\n" " p++;\n" "}"); - EXPECT_EQ(Tokens.size(), 21u) << Tokens; + ASSERT_EQ(Tokens.size(), 21u) << Tokens; EXPECT_TOKEN(Tokens[10], tok::ampamp, TT_BinaryOperator); EXPECT_TOKEN(Tokens[11], tok::star, TT_UnaryOperator); Tokens = annotate("case *x:"); - EXPECT_EQ(Tokens.size(), 5u) << Tokens; + ASSERT_EQ(Tokens.size(), 5u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::star, TT_UnaryOperator); Tokens = annotate("case &x:"); - EXPECT_EQ(Tokens.size(), 5u) << Tokens; + ASSERT_EQ(Tokens.size(), 5u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::amp, TT_UnaryOperator); Tokens = annotate("bool b = 3 == int{3} && true;"); - EXPECT_EQ(Tokens.size(), 13u) << Tokens; + ASSERT_EQ(Tokens.size(), 13u) << Tokens; EXPECT_TOKEN(Tokens[9], tok::ampamp, TT_BinaryOperator); Tokens = annotate("struct {\n" "} *ptr;"); - EXPECT_EQ(Tokens.size(), 7u) << Tokens; + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::star, TT_PointerOrReference); Tokens = 
annotate("union {\n" "} *ptr;"); - EXPECT_EQ(Tokens.size(), 7u) << Tokens; + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::star, TT_PointerOrReference); Tokens = annotate("class {\n" "} *ptr;"); - EXPECT_EQ(Tokens.size(), 7u) << Tokens; + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::star, TT_PointerOrReference); Tokens = annotate("struct {\n" "} &&ptr = {};"); - EXPECT_EQ(Tokens.size(), 10u) << Tokens; + ASSERT_EQ(Tokens.size(), 10u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::ampamp, TT_PointerOrReference); Tokens = annotate("union {\n" "} &&ptr = {};"); - EXPECT_EQ(Tokens.size(), 10u) << Tokens; + ASSERT_EQ(Tokens.size(), 10u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::ampamp, TT_PointerOrReference); Tokens = annotate("class {\n" "} &&ptr = {};"); - EXPECT_EQ(Tokens.size(), 10u) << Tokens; + ASSERT_EQ(Tokens.size(), 10u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::ampamp, TT_PointerOrReference); Tokens = annotate("int i = int{42} * 2;"); - EXPECT_EQ(Tokens.size(), 11u) << Tokens; + ASSERT_EQ(Tokens.size(), 11u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::star, TT_BinaryOperator); Tokens = annotate("delete[] *ptr;"); - EXPECT_EQ(Tokens.size(), 7u) << Tokens; + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::star, TT_UnaryOperator); Tokens = annotate("delete[] **ptr;"); - EXPECT_EQ(Tokens.size(), 8u) << Tokens; + ASSERT_EQ(Tokens.size(), 8u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::star, TT_UnaryOperator); EXPECT_TOKEN(Tokens[4], tok::star, TT_UnaryOperator); Tokens = annotate("delete[] *(ptr);"); - EXPECT_EQ(Tokens.size(), 9u) << Tokens; + ASSERT_EQ(Tokens.size(), 9u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::star, TT_UnaryOperator); Tokens = annotate("void f() { void (*fnptr)(char* foo); }"); - EXPECT_EQ(Tokens.size(), 18u) << Tokens; + ASSERT_EQ(Tokens.size(), 18u) << Tokens; EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_FunctionTypeLParen); // FIXME: The star of a function pointer probably makes more sense as // TT_PointerOrReference. 
@@ -143,7 +143,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { EXPECT_TOKEN(Tokens[12], tok::star, TT_PointerOrReference); Tokens = annotate("void f() { void (*fnptr)(t* foo); }"); - EXPECT_EQ(Tokens.size(), 18u) << Tokens; + ASSERT_EQ(Tokens.size(), 18u) << Tokens; EXPECT_TOKEN(Tokens[6], tok::l_paren, TT_FunctionTypeLParen); EXPECT_TOKEN(Tokens[7], tok::star, TT_UnaryOperator); EXPECT_TOKEN(Tokens[12], tok::star, TT_PointerOrReference); @@ -276,12 +276,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfStarAndAmp) { Tokens = annotate("auto foo() noexcept(noexcept(bar()) && " "trait> && noexcept(baz())) {}"); - EXPECT_EQ(Tokens.size(), 38u) << Tokens; + ASSERT_EQ(Tokens.size(), 38u) << Tokens; EXPECT_TOKEN(Tokens[12], tok::ampamp, TT_BinaryOperator); EXPECT_TOKEN(Tokens[27], tok::ampamp, TT_BinaryOperator); Tokens = annotate("foo = *i < *j && *j > *k;"); - EXPECT_EQ(Tokens.size(), 15u) << Tokens; + ASSERT_EQ(Tokens.size(), 15u) << Tokens; EXPECT_TOKEN(Tokens[4], tok::less, TT_BinaryOperator); EXPECT_TOKEN(Tokens[7], tok::ampamp, TT_BinaryOperator); EXPECT_TOKEN(Tokens[10], tok::greater, TT_BinaryOperator); @@ -402,44 +402,44 @@ TEST_F(TokenAnnotatorTest, UnderstandsUsesOfPlusAndMinus) { TEST_F(TokenAnnotatorTest, UnderstandsClasses) { auto Tokens = annotate("class C {};"); - EXPECT_EQ(Tokens.size(), 6u) << Tokens; + ASSERT_EQ(Tokens.size(), 6u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_ClassLBrace); EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_ClassRBrace); Tokens = annotate("const class C {} c;"); - EXPECT_EQ(Tokens.size(), 8u) << Tokens; + ASSERT_EQ(Tokens.size(), 8u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::l_brace, TT_ClassLBrace); EXPECT_TOKEN(Tokens[4], tok::r_brace, TT_ClassRBrace); Tokens = annotate("const class {} c;"); - EXPECT_EQ(Tokens.size(), 7u) << Tokens; + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_ClassLBrace); EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_ClassRBrace); Tokens = annotate("class [[deprecated(\"\")]] C { int i; };"); - EXPECT_EQ(Tokens.size(), 17u) << Tokens; + ASSERT_EQ(Tokens.size(), 17u) << Tokens; EXPECT_TOKEN(Tokens[10], tok::l_brace, TT_ClassLBrace); EXPECT_TOKEN(Tokens[14], tok::r_brace, TT_ClassRBrace); } TEST_F(TokenAnnotatorTest, UnderstandsStructs) { auto Tokens = annotate("struct S {};"); - EXPECT_EQ(Tokens.size(), 6u) << Tokens; + ASSERT_EQ(Tokens.size(), 6u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_StructRBrace); Tokens = annotate("struct EXPORT_MACRO [[nodiscard]] C { int i; };"); - EXPECT_EQ(Tokens.size(), 15u) << Tokens; + ASSERT_EQ(Tokens.size(), 15u) << Tokens; EXPECT_TOKEN(Tokens[8], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[12], tok::r_brace, TT_StructRBrace); Tokens = annotate("struct [[deprecated]] [[nodiscard]] C { int i; };"); - EXPECT_EQ(Tokens.size(), 19u) << Tokens; + ASSERT_EQ(Tokens.size(), 19u) << Tokens; EXPECT_TOKEN(Tokens[12], tok::l_brace, TT_StructLBrace); EXPECT_TOKEN(Tokens[16], tok::r_brace, TT_StructRBrace); Tokens = annotate("template struct S {};"); - EXPECT_EQ(Tokens.size(), 18u) << Tokens; + ASSERT_EQ(Tokens.size(), 18u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::less, TT_TemplateOpener); EXPECT_TOKEN(Tokens[10], tok::l_square, TT_ArraySubscriptLSquare); EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser); @@ -447,7 +447,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) { EXPECT_TOKEN(Tokens[15], tok::r_brace, TT_StructRBrace); Tokens = annotate("template struct S {};"); - 
EXPECT_EQ(Tokens.size(), 18u) << Tokens; + ASSERT_EQ(Tokens.size(), 18u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::less, TT_TemplateOpener); EXPECT_TOKEN(Tokens[10], tok::l_square, TT_ArraySubscriptLSquare); EXPECT_TOKEN(Tokens[13], tok::greater, TT_TemplateCloser); @@ -457,7 +457,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) { Tokens = annotate("template struct S {\n" " void f(T const (&a)[n]);\n" "};"); - EXPECT_EQ(Tokens.size(), 35u) << Tokens; + ASSERT_EQ(Tokens.size(), 35u) << Tokens; EXPECT_TOKEN(Tokens[10], tok::less, TT_TemplateOpener); EXPECT_TOKEN(Tokens[13], tok::l_square, TT_ArraySubscriptLSquare); EXPECT_TOKEN(Tokens[16], tok::greater, TT_TemplateCloser); @@ -470,12 +470,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsStructs) { TEST_F(TokenAnnotatorTest, UnderstandsUnions) { auto Tokens = annotate("union U {};"); - EXPECT_EQ(Tokens.size(), 6u) << Tokens; + ASSERT_EQ(Tokens.size(), 6u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_UnionLBrace); EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_UnionRBrace); Tokens = annotate("union U { void f() { return; } };"); - EXPECT_EQ(Tokens.size(), 14u) << Tokens; + ASSERT_EQ(Tokens.size(), 14u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_UnionLBrace); EXPECT_TOKEN(Tokens[7], tok::l_brace, TT_FunctionLBrace); EXPECT_TOKEN(Tokens[11], tok::r_brace, TT_UnionRBrace); @@ -483,25 +483,25 @@ TEST_F(TokenAnnotatorTest, UnderstandsUnions) { TEST_F(TokenAnnotatorTest, UnderstandsEnums) { auto Tokens = annotate("enum E {};"); - EXPECT_EQ(Tokens.size(), 6u) << Tokens; + ASSERT_EQ(Tokens.size(), 6u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::l_brace, TT_EnumLBrace); EXPECT_TOKEN(Tokens[3], tok::r_brace, TT_EnumRBrace); } TEST_F(TokenAnnotatorTest, UnderstandsDefaultedAndDeletedFunctions) { auto Tokens = annotate("auto operator<=>(const T &) const & = default;"); - EXPECT_EQ(Tokens.size(), 14u) << Tokens; + ASSERT_EQ(Tokens.size(), 14u) << Tokens; EXPECT_TOKEN(Tokens[9], tok::amp, TT_PointerOrReference); Tokens = annotate("template void F(T) && = delete;"); - EXPECT_EQ(Tokens.size(), 15u) << Tokens; + ASSERT_EQ(Tokens.size(), 15u) << Tokens; EXPECT_TOKEN(Tokens[10], tok::ampamp, TT_PointerOrReference); } TEST_F(TokenAnnotatorTest, UnderstandsVariables) { auto Tokens = annotate("inline bool var = is_integral_v && is_signed_v;"); - EXPECT_EQ(Tokens.size(), 15u) << Tokens; + ASSERT_EQ(Tokens.size(), 15u) << Tokens; EXPECT_TOKEN(Tokens[8], tok::ampamp, TT_BinaryOperator); } @@ -509,7 +509,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsVariableTemplates) { auto Tokens = annotate("template " "inline bool var = is_integral_v && is_signed_v;"); - EXPECT_EQ(Tokens.size(), 20u) << Tokens; + ASSERT_EQ(Tokens.size(), 20u) << Tokens; EXPECT_TOKEN(Tokens[13], tok::ampamp, TT_BinaryOperator); } @@ -554,90 +554,90 @@ TEST_F(TokenAnnotatorTest, UnderstandsWhitespaceSensitiveMacros) { Style.WhitespaceSensitiveMacros.push_back("FOO"); auto Tokens = annotate("FOO(1+2 )", Style); - EXPECT_EQ(Tokens.size(), 7u) << Tokens; + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[0], tok::identifier, TT_UntouchableMacroFunc); Tokens = annotate("FOO(a:b:c)", Style); - EXPECT_EQ(Tokens.size(), 9u) << Tokens; + ASSERT_EQ(Tokens.size(), 9u) << Tokens; EXPECT_TOKEN(Tokens[0], tok::identifier, TT_UntouchableMacroFunc); } TEST_F(TokenAnnotatorTest, UnderstandsDelete) { auto Tokens = annotate("delete (void *)p;"); - EXPECT_EQ(Tokens.size(), 8u) << Tokens; + ASSERT_EQ(Tokens.size(), 8u) << Tokens; EXPECT_TOKEN(Tokens[4], tok::r_paren, TT_CastRParen); Tokens = annotate("delete[] 
(void *)p;"); - EXPECT_EQ(Tokens.size(), 10u) << Tokens; + ASSERT_EQ(Tokens.size(), 10u) << Tokens; EXPECT_TOKEN(Tokens[6], tok::r_paren, TT_CastRParen); Tokens = annotate("delete[] /*comment*/ (void *)p;"); - EXPECT_EQ(Tokens.size(), 11u) << Tokens; + ASSERT_EQ(Tokens.size(), 11u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::r_paren, TT_CastRParen); Tokens = annotate("delete[/*comment*/] (void *)p;"); - EXPECT_EQ(Tokens.size(), 11u) << Tokens; + ASSERT_EQ(Tokens.size(), 11u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::r_paren, TT_CastRParen); Tokens = annotate("delete/*comment*/[] (void *)p;"); - EXPECT_EQ(Tokens.size(), 11u) << Tokens; + ASSERT_EQ(Tokens.size(), 11u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::r_paren, TT_CastRParen); } TEST_F(TokenAnnotatorTest, UnderstandsCasts) { auto Tokens = annotate("(void)p;"); - EXPECT_EQ(Tokens.size(), 6u) << Tokens; + ASSERT_EQ(Tokens.size(), 6u) << Tokens; EXPECT_TOKEN(Tokens[2], tok::r_paren, TT_CastRParen); Tokens = annotate("auto x = (Foo)p;"); - EXPECT_EQ(Tokens.size(), 9u) << Tokens; + ASSERT_EQ(Tokens.size(), 9u) << Tokens; EXPECT_TOKEN(Tokens[5], tok::r_paren, TT_CastRParen); Tokens = annotate("(std::vector)p;"); - EXPECT_EQ(Tokens.size(), 11u) << Tokens; + ASSERT_EQ(Tokens.size(), 11u) << Tokens; EXPECT_TOKEN(Tokens[7], tok::r_paren, TT_CastRParen); Tokens = annotate("return (Foo)p;"); - EXPECT_EQ(Tokens.size(), 7u) << Tokens; + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::r_paren, TT_CastRParen); Tokens = annotate("throw (Foo)p;"); - EXPECT_EQ(Tokens.size(), 7u) << Tokens; + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::r_paren, TT_CastRParen); Tokens = annotate("#define FOO(x) (((uint64_t)(x) * BAR) / 100)"); - EXPECT_EQ(Tokens.size(), 21u) << Tokens; + ASSERT_EQ(Tokens.size(), 21u) << Tokens; EXPECT_TOKEN(Tokens[10], tok::r_paren, TT_CastRParen); EXPECT_TOKEN(Tokens[13], tok::r_paren, TT_Unknown); EXPECT_TOKEN(Tokens[14], tok::star, TT_BinaryOperator); Tokens = annotate("return (Foo) & 10;"); - EXPECT_EQ(Tokens.size(), 8u) << Tokens; + ASSERT_EQ(Tokens.size(), 8u) << Tokens; EXPECT_TOKEN(Tokens[3], tok::r_paren, TT_Unknown); EXPECT_TOKEN(Tokens[4], tok::amp, TT_BinaryOperator); } TEST_F(TokenAnnotatorTest, UnderstandsDynamicExceptionSpecifier) { auto Tokens = annotate("void f() throw(int);"); - EXPECT_EQ(Tokens.size(), 10u) << Tokens; + ASSERT_EQ(Tokens.size(), 10u) << Tokens; EXPECT_TOKEN(Tokens[4], tok::kw_throw, TT_Unknown); } TEST_F(TokenAnnotatorTest, UnderstandsFunctionRefQualifiers) { auto Tokens = annotate("void f() &;"); - EXPECT_EQ(Tokens.size(), 7u) << Tokens; + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[4], tok::amp, TT_PointerOrReference); Tokens = annotate("void operator=(T) &&;"); - EXPECT_EQ(Tokens.size(), 9u) << Tokens; + ASSERT_EQ(Tokens.size(), 9u) << Tokens; EXPECT_TOKEN(Tokens[6], tok::ampamp, TT_PointerOrReference); Tokens = annotate("template void f() &;"); - EXPECT_EQ(Tokens.size(), 12u) << Tokens; + ASSERT_EQ(Tokens.size(), 12u) << Tokens; EXPECT_TOKEN(Tokens[9], tok::amp, TT_PointerOrReference); Tokens = annotate("template void operator=(T) &;"); - EXPECT_EQ(Tokens.size(), 14u) << Tokens; + ASSERT_EQ(Tokens.size(), 14u) << Tokens; EXPECT_TOKEN(Tokens[11], tok::amp, TT_PointerOrReference); } @@ -1058,12 +1058,12 @@ TEST_F(TokenAnnotatorTest, UnderstandsRequiresClausesAndConcepts) { EXPECT_TOKEN(Tokens[10], tok::ampamp, TT_PointerOrReference); Tokens = annotate("void f() & requires(true) {}"); - EXPECT_EQ(Tokens.size(), 12u) << Tokens; + 
ASSERT_EQ(Tokens.size(), 12u) << Tokens; EXPECT_TOKEN(Tokens[4], tok::amp, TT_PointerOrReference); EXPECT_TOKEN(Tokens[5], tok::kw_requires, TT_RequiresClause); Tokens = annotate("void f() & requires(C) {}"); - EXPECT_EQ(Tokens.size(), 17u) << Tokens; + ASSERT_EQ(Tokens.size(), 17u) << Tokens; EXPECT_TOKEN(Tokens[4], tok::amp, TT_PointerOrReference); EXPECT_TOKEN(Tokens[5], tok::kw_requires, TT_RequiresClause); @@ -1439,7 +1439,7 @@ TEST_F(TokenAnnotatorTest, UnderstandsObjCMethodExpr) { " //\n" " BOOL a = [b.c n] > 1;\n" "}"); - EXPECT_EQ(Tokens.size(), 20u) << Tokens; + ASSERT_EQ(Tokens.size(), 20u) << Tokens; EXPECT_TOKEN(Tokens[9], tok::l_square, TT_ObjCMethodExpr); EXPECT_TOKEN(Tokens[15], tok::greater, TT_BinaryOperator); } @@ -2310,27 +2310,27 @@ TEST_F(TokenAnnotatorTest, CSharpNullableTypes) { FormatStyle Style = getGoogleStyle(FormatStyle::LK_CSharp); auto Tokens = annotate("int? a;", Style); - EXPECT_EQ(Tokens.size(), 5u) << Tokens; + ASSERT_EQ(Tokens.size(), 5u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::question, TT_CSharpNullable); Tokens = annotate("int? a = 1;", Style); - EXPECT_EQ(Tokens.size(), 7u) << Tokens; + ASSERT_EQ(Tokens.size(), 7u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::question, TT_CSharpNullable); Tokens = annotate("int?)", Style); - EXPECT_EQ(Tokens.size(), 4u) << Tokens; + ASSERT_EQ(Tokens.size(), 4u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::question, TT_CSharpNullable); Tokens = annotate("int?>", Style); - EXPECT_EQ(Tokens.size(), 4u) << Tokens; + ASSERT_EQ(Tokens.size(), 4u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::question, TT_CSharpNullable); Tokens = annotate("cond? id : id2", Style); - EXPECT_EQ(Tokens.size(), 6u) << Tokens; + ASSERT_EQ(Tokens.size(), 6u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::question, TT_ConditionalExpr); Tokens = annotate("cond ? cond2 ? : id1 : id2", Style); - EXPECT_EQ(Tokens.size(), 9u) << Tokens; + ASSERT_EQ(Tokens.size(), 9u) << Tokens; EXPECT_TOKEN(Tokens[1], tok::question, TT_ConditionalExpr); } From 4821c90c24d52d4a42990fd9371caedb157bc58b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Mon, 22 Jan 2024 13:56:44 +0100 Subject: [PATCH 378/843] [clang-repl] Fix PLT offset too large linker error on ARM (#78959) I cross-compile clang-repl with GCC-10 on Ubuntu 20.04 and get this error when linking with gold: PLT offset too large, try linking with --long-plt --- clang/tools/clang-repl/CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt index 2ccbe292fd49e..b0aaf39f01154 100644 --- a/clang/tools/clang-repl/CMakeLists.txt +++ b/clang/tools/clang-repl/CMakeLists.txt @@ -22,3 +22,8 @@ clang_target_link_libraries(clang-repl PRIVATE if(CLANG_PLUGIN_SUPPORT) export_executable_symbols_for_plugins(clang-repl) endif() + +string(TOUPPER ${CMAKE_SYSTEM_PROCESSOR} system_processor) +if(${system_processor} MATCHES "ARM") + target_link_options(clang-repl PRIVATE LINKER:--long-plt) +endif() From fa6025e25b5754e8cf39169e3a7085b57ea35de5 Mon Sep 17 00:00:00 2001 From: Emilia Kond Date: Mon, 22 Jan 2024 14:57:37 +0200 Subject: [PATCH 379/843] [clang-format] Don't confuse initializer equal signs in for loops (#77712) clang-format has logic to align declarations of multiple variables of the same type, aligning them at the equals sign. This logic is applied in for loops as well. However, this alignment logic also erroneously affected the equals signs of designated initializers. 
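
For illustration, a construct of the shape affected (a hedged sketch
mirroring the new test below; `TestCase` is an invented aggregate):

  #include <initializer_list>

  struct TestCase {
    int a;
    int b;
  };

  void run() {
    for (const TestCase &test_case : {
             TestCase{.a = 1, .b = 1},
             TestCase{.a = 2, .b = 2},
         }) {
      // Previously the '=' in '.a = 1' could be treated as a variable
      // initializer inside the for loop and trigger alignment.
    }
  }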
This patch forbids alignment if the token 2 tokens back from the equals sign is a designated initializer period. Fixes https://github.com/llvm/llvm-project/issues/73902 --- clang/lib/Format/ContinuationIndenter.cpp | 4 +++- clang/unittests/Format/FormatTest.cpp | 12 ++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index e6eaaa9ab4570..ddbee15cb159f 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -703,7 +703,9 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, if (Current.is(tok::equal) && (State.Line->First->is(tok::kw_for) || Current.NestingLevel == 0) && - CurrentState.VariablePos == 0) { + CurrentState.VariablePos == 0 && + (!Previous.Previous || + Previous.Previous->isNot(TT_DesignatedInitializerPeriod))) { CurrentState.VariablePos = State.Column; // Move over * and & if they are bound to the variable name. const FormatToken *Tok = &Previous; diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 3fb55ae2c1f41..a0825dc2ad4ef 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -5008,6 +5008,18 @@ TEST_F(FormatTest, DesignatedInitializers) { " [3] = cccccccccccccccccccccccccccccccccccccc,\n" " [4] = dddddddddddddddddddddddddddddddddddddd,\n" " [5] = eeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeeee};"); + + verifyFormat("for (const TestCase &test_case : {\n" + " TestCase{\n" + " .a = 1,\n" + " .b = 1,\n" + " },\n" + " TestCase{\n" + " .a = 2,\n" + " .b = 2,\n" + " },\n" + " }) {\n" + "}\n"); } TEST_F(FormatTest, BracedInitializerIndentWidth) { From ab1b4991cfacfe04a579bf0b0ff4a704a6224d4a Mon Sep 17 00:00:00 2001 From: Tuan Chuong Goh Date: Mon, 22 Jan 2024 13:13:56 +0000 Subject: [PATCH 380/843] [AArch64] Adding tests for shifts --- llvm/test/CodeGen/AArch64/shift.ll | 1015 ++++++++++++++++++++++++++++ 1 file changed, 1015 insertions(+) create mode 100644 llvm/test/CodeGen/AArch64/shift.ll diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll new file mode 100644 index 0000000000000..55ff97563e1ef --- /dev/null +++ b/llvm/test/CodeGen/AArch64/shift.ll @@ -0,0 +1,1015 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD +; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI + +; CHECK-GI: warning: Instruction selection used fallback path for shl_v4i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v32i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v2i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v16i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v4i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v32i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v2i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v16i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v4i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v32i8 +; CHECK-GI-NEXT: warning: Instruction selection used 
fallback path for lshr_v2i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v16i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v3i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v7i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v3i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v7i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v3i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v3i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v7i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v3i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v7i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v3i32 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v3i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v7i8 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v3i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v7i16 +; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v3i32 + + +define i1 @shl_i1(i1 %0, i1 %1){ +; CHECK-SD-LABEL: shl_i1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w0, w0, #0x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_i1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w1, #0x1 +; CHECK-GI-NEXT: lsl w8, w0, w8 +; CHECK-GI-NEXT: and w0, w8, #0x1 +; CHECK-GI-NEXT: ret + %3 = shl i1 %0, %1 + ret i1 %3 +} + +define i8 @shl_i8(i8 %0, i8 %1){ +; CHECK-SD-LABEL: shl_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-SD-NEXT: lsl w0, w0, w1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w1, #0xff +; CHECK-GI-NEXT: lsl w0, w0, w8 +; CHECK-GI-NEXT: ret + %3 = shl i8 %0, %1 + ret i8 %3 +} + +define i16 @shl_i16(i16 %0, i16 %1){ +; CHECK-SD-LABEL: shl_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-SD-NEXT: lsl w0, w0, w1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w1, #0xffff +; CHECK-GI-NEXT: lsl w0, w0, w8 +; CHECK-GI-NEXT: ret + %3 = shl i16 %0, %1 + ret i16 %3 +} + +define i32 @shl_i32(i32 %0, i32 %1){ +; CHECK-LABEL: shl_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl w0, w0, w1 +; CHECK-NEXT: ret + %3 = shl i32 %0, %1 + ret i32 %3 +} + +define i64 @shl_i64(i64 %0, i64 %1){ +; CHECK-LABEL: shl_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: lsl x0, x0, x1 +; CHECK-NEXT: ret + %3 = shl i64 %0, %1 + ret i64 %3 +} + +define i128 @shl_i128(i128 %0, i128 %1){ +; CHECK-SD-LABEL: shl_i128: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsr x8, x0, #1 +; CHECK-SD-NEXT: mvn w9, w2 +; CHECK-SD-NEXT: lsl x10, x1, x2 +; CHECK-SD-NEXT: tst x2, #0x40 +; CHECK-SD-NEXT: lsr x8, x8, x9 +; CHECK-SD-NEXT: lsl x9, x0, x2 +; CHECK-SD-NEXT: orr x8, x10, x8 +; CHECK-SD-NEXT: csel x0, xzr, x9, ne +; CHECK-SD-NEXT: csel x1, x9, x8, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_i128: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; CHECK-GI-NEXT: sub x9, x2, #64 +; CHECK-GI-NEXT: lsl x10, x1, x2 +; CHECK-GI-NEXT: sub x8, x8, x2 +; CHECK-GI-NEXT: lsl x11, x0, x2 +; CHECK-GI-NEXT: lsl x9, x0, x9 +; CHECK-GI-NEXT: lsr x8, x0, x8 +; CHECK-GI-NEXT: cmp x2, 
#64 +; CHECK-GI-NEXT: csel x0, x11, xzr, lo +; CHECK-GI-NEXT: orr x8, x8, x10 +; CHECK-GI-NEXT: csel x8, x8, x9, lo +; CHECK-GI-NEXT: cmp x2, #0 +; CHECK-GI-NEXT: csel x1, x1, x8, eq +; CHECK-GI-NEXT: ret + %3 = shl i128 %0, %1 + ret i128 %3 +} + +define i1 @ashr_i1(i1 %0, i1 %1){ +; CHECK-SD-LABEL: ashr_i1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w0, w0, #0x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_i1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sbfx w8, w0, #0, #1 +; CHECK-GI-NEXT: and w9, w1, #0x1 +; CHECK-GI-NEXT: asr w8, w8, w9 +; CHECK-GI-NEXT: and w0, w8, #0x1 +; CHECK-GI-NEXT: ret + %3 = ashr i1 %0, %1 + ret i1 %3 +} + +define i8 @ashr_i8(i8 %0, i8 %1){ +; CHECK-SD-LABEL: ashr_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxtb w8, w0 +; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-SD-NEXT: asr w0, w8, w1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sxtb w8, w0 +; CHECK-GI-NEXT: and w9, w1, #0xff +; CHECK-GI-NEXT: asr w0, w8, w9 +; CHECK-GI-NEXT: ret + %3 = ashr i8 %0, %1 + ret i8 %3 +} + +define i16 @ashr_i16(i16 %0, i16 %1){ +; CHECK-SD-LABEL: ashr_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: sxth w8, w0 +; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-SD-NEXT: asr w0, w8, w1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: sxth w8, w0 +; CHECK-GI-NEXT: and w9, w1, #0xffff +; CHECK-GI-NEXT: asr w0, w8, w9 +; CHECK-GI-NEXT: ret + %3 = ashr i16 %0, %1 + ret i16 %3 +} + +define i32 @ashr_i32(i32 %0, i32 %1){ +; CHECK-LABEL: ashr_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: asr w0, w0, w1 +; CHECK-NEXT: ret + %3 = ashr i32 %0, %1 + ret i32 %3 +} + +define i64 @ashr_i64(i64 %0, i64 %1){ +; CHECK-LABEL: ashr_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: asr x0, x0, x1 +; CHECK-NEXT: ret + %3 = ashr i64 %0, %1 + ret i64 %3 +} + +define i128 @ashr_i128(i128 %0, i128 %1){ +; CHECK-SD-LABEL: ashr_i128: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl x8, x1, #1 +; CHECK-SD-NEXT: mvn w9, w2 +; CHECK-SD-NEXT: lsr x10, x0, x2 +; CHECK-SD-NEXT: asr x11, x1, #63 +; CHECK-SD-NEXT: tst x2, #0x40 +; CHECK-SD-NEXT: lsl x8, x8, x9 +; CHECK-SD-NEXT: asr x9, x1, x2 +; CHECK-SD-NEXT: orr x8, x8, x10 +; CHECK-SD-NEXT: csel x1, x11, x9, ne +; CHECK-SD-NEXT: csel x0, x9, x8, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_i128: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; CHECK-GI-NEXT: sub x9, x2, #64 +; CHECK-GI-NEXT: lsr x10, x0, x2 +; CHECK-GI-NEXT: sub x8, x8, x2 +; CHECK-GI-NEXT: asr x9, x1, x9 +; CHECK-GI-NEXT: cmp x2, #64 +; CHECK-GI-NEXT: lsl x8, x1, x8 +; CHECK-GI-NEXT: asr x11, x1, x2 +; CHECK-GI-NEXT: orr x8, x10, x8 +; CHECK-GI-NEXT: asr x10, x1, #63 +; CHECK-GI-NEXT: csel x8, x8, x9, lo +; CHECK-GI-NEXT: cmp x2, #0 +; CHECK-GI-NEXT: csel x0, x0, x8, eq +; CHECK-GI-NEXT: cmp x2, #64 +; CHECK-GI-NEXT: csel x1, x11, x10, lo +; CHECK-GI-NEXT: ret + %3 = ashr i128 %0, %1 + ret i128 %3 +} + +define i1 @lshr_i1(i1 %0, i1 %1){ +; CHECK-SD-LABEL: lshr_i1: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w0, w0, #0x1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: lshr_i1: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w1, #0x1 +; CHECK-GI-NEXT: and w9, w0, #0x1 +; CHECK-GI-NEXT: lsr w0, w9, w8 +; CHECK-GI-NEXT: ret + %3 = lshr i1 %0, %1 + ret i1 %3 +} + +define i8 @lshr_i8(i8 %0, i8 %1){ +; CHECK-SD-LABEL: lshr_i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xff +; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-SD-NEXT: lsr w0, w8, w1 +; 
CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: lshr_i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w1, #0xff +; CHECK-GI-NEXT: and w9, w0, #0xff +; CHECK-GI-NEXT: lsr w0, w9, w8 +; CHECK-GI-NEXT: ret + %3 = lshr i8 %0, %1 + ret i8 %3 +} + +define i16 @lshr_i16(i16 %0, i16 %1){ +; CHECK-SD-LABEL: lshr_i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: and w8, w0, #0xffff +; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-SD-NEXT: lsr w0, w8, w1 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: lshr_i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: and w8, w1, #0xffff +; CHECK-GI-NEXT: and w9, w0, #0xffff +; CHECK-GI-NEXT: lsr w0, w9, w8 +; CHECK-GI-NEXT: ret + %3 = lshr i16 %0, %1 + ret i16 %3 +} + +define i32 @lshr_i32(i32 %0, i32 %1){ +; CHECK-LABEL: lshr_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr w0, w0, w1 +; CHECK-NEXT: ret + %3 = lshr i32 %0, %1 + ret i32 %3 +} + +define i64 @lshr_i64(i64 %0, i64 %1){ +; CHECK-LABEL: lshr_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: lsr x0, x0, x1 +; CHECK-NEXT: ret + %3 = lshr i64 %0, %1 + ret i64 %3 +} + +define i128 @lshr_i128(i128 %0, i128 %1){ +; CHECK-SD-LABEL: lshr_i128: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: lsl x8, x1, #1 +; CHECK-SD-NEXT: mvn w9, w2 +; CHECK-SD-NEXT: lsr x10, x0, x2 +; CHECK-SD-NEXT: tst x2, #0x40 +; CHECK-SD-NEXT: lsl x8, x8, x9 +; CHECK-SD-NEXT: lsr x9, x1, x2 +; CHECK-SD-NEXT: orr x8, x8, x10 +; CHECK-SD-NEXT: csel x1, xzr, x9, ne +; CHECK-SD-NEXT: csel x0, x9, x8, ne +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: lshr_i128: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 +; CHECK-GI-NEXT: sub x9, x2, #64 +; CHECK-GI-NEXT: lsr x10, x0, x2 +; CHECK-GI-NEXT: sub x8, x8, x2 +; CHECK-GI-NEXT: lsr x9, x1, x9 +; CHECK-GI-NEXT: cmp x2, #64 +; CHECK-GI-NEXT: lsl x8, x1, x8 +; CHECK-GI-NEXT: orr x8, x10, x8 +; CHECK-GI-NEXT: lsr x10, x1, x2 +; CHECK-GI-NEXT: csel x8, x8, x9, lo +; CHECK-GI-NEXT: cmp x2, #0 +; CHECK-GI-NEXT: csel x0, x0, x8, eq +; CHECK-GI-NEXT: cmp x2, #64 +; CHECK-GI-NEXT: csel x1, x10, xzr, lo +; CHECK-GI-NEXT: ret + %3 = lshr i128 %0, %1 + ret i128 %3 +} + +; ===== Legal Vector Type ===== + +define <8 x i8> @shl_v8i8(<8 x i8> %0, <8 x i8> %1){ +; CHECK-LABEL: shl_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %3 = shl <8 x i8> %0, %1 + ret <8 x i8> %3 +} + +define <16 x i8> @shl_v16i8(<16 x i8> %0, <16 x i8> %1){ +; CHECK-LABEL: shl_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %3 = shl <16 x i8> %0, %1 + ret <16 x i8> %3 +} + +define <4 x i16> @shl_v4i16(<4 x i16> %0, <4 x i16> %1){ +; CHECK-LABEL: shl_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %3 = shl <4 x i16> %0, %1 + ret <4 x i16> %3 +} + +define <8 x i16> @shl_v8i16(<8 x i16> %0, <8 x i16> %1){ +; CHECK-LABEL: shl_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %3 = shl <8 x i16> %0, %1 + ret <8 x i16> %3 +} + +define <2 x i32> @shl_v2i32(<2 x i32> %0, <2 x i32> %1){ +; CHECK-LABEL: shl_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %3 = shl <2 x i32> %0, %1 + ret <2 x i32> %3 +} + +define <4 x i32> @shl_v4i32(<4 x i32> %0, <4 x i32> %1){ +; CHECK-LABEL: shl_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %3 = shl <4 x i32> %0, %1 + ret <4 x i32> %3 +} + +define <2 x i64> @shl_v2i64(<2 x i64> %0, <2 x i64> %1){ +; CHECK-LABEL: shl_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d +; 
CHECK-NEXT: ret + %3 = shl <2 x i64> %0, %1 + ret <2 x i64> %3 +} + +define <8 x i8> @ashr_v8i8(<8 x i8> %0, <8 x i8> %1){ +; CHECK-LABEL: ashr_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.8b, v1.8b +; CHECK-NEXT: sshl v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %3 = ashr <8 x i8> %0, %1 + ret <8 x i8> %3 +} + +define <16 x i8> @ashr_v16i8(<16 x i8> %0, <16 x i8> %1){ +; CHECK-LABEL: ashr_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.16b, v1.16b +; CHECK-NEXT: sshl v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %3 = ashr <16 x i8> %0, %1 + ret <16 x i8> %3 +} + +define <4 x i16> @ashr_v4i16(<4 x i16> %0, <4 x i16> %1){ +; CHECK-LABEL: ashr_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.4h, v1.4h +; CHECK-NEXT: sshl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %3 = ashr <4 x i16> %0, %1 + ret <4 x i16> %3 +} + +define <8 x i16> @ashr_v8i16(<8 x i16> %0, <8 x i16> %1){ +; CHECK-LABEL: ashr_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.8h, v1.8h +; CHECK-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %3 = ashr <8 x i16> %0, %1 + ret <8 x i16> %3 +} + +define <2 x i32> @ashr_v2i32(<2 x i32> %0, <2 x i32> %1){ +; CHECK-LABEL: ashr_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.2s, v1.2s +; CHECK-NEXT: sshl v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %3 = ashr <2 x i32> %0, %1 + ret <2 x i32> %3 +} + +define <4 x i32> @ashr_v4i32(<4 x i32> %0, <4 x i32> %1){ +; CHECK-LABEL: ashr_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %3 = ashr <4 x i32> %0, %1 + ret <4 x i32> %3 +} + +define <2 x i64> @ashr_v2i64(<2 x i64> %0, <2 x i64> %1){ +; CHECK-LABEL: ashr_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.2d, v1.2d +; CHECK-NEXT: sshl v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %3 = ashr <2 x i64> %0, %1 + ret <2 x i64> %3 +} + +define <8 x i8> @lshr_v8i8(<8 x i8> %0, <8 x i8> %1){ +; CHECK-LABEL: lshr_v8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.8b, v1.8b +; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %3 = lshr <8 x i8> %0, %1 + ret <8 x i8> %3 +} + +define <16 x i8> @lshr_v16i8(<16 x i8> %0, <16 x i8> %1){ +; CHECK-LABEL: lshr_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.16b, v1.16b +; CHECK-NEXT: ushl v0.16b, v0.16b, v1.16b +; CHECK-NEXT: ret + %3 = lshr <16 x i8> %0, %1 + ret <16 x i8> %3 +} + +define <4 x i16> @lshr_v4i16(<4 x i16> %0, <4 x i16> %1){ +; CHECK-LABEL: lshr_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.4h, v1.4h +; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %3 = lshr <4 x i16> %0, %1 + ret <4 x i16> %3 +} + +define <8 x i16> @lshr_v8i16(<8 x i16> %0, <8 x i16> %1){ +; CHECK-LABEL: lshr_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.8h, v1.8h +; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %3 = lshr <8 x i16> %0, %1 + ret <8 x i16> %3 +} + +define <2 x i32> @lshr_v2i32(<2 x i32> %0, <2 x i32> %1){ +; CHECK-LABEL: lshr_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.2s, v1.2s +; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %3 = lshr <2 x i32> %0, %1 + ret <2 x i32> %3 +} + +define <4 x i32> @lshr_v4i32(<4 x i32> %0, <4 x i32> %1){ +; CHECK-LABEL: lshr_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %3 = lshr <4 x i32> %0, %1 + ret <4 x i32> %3 +} + +define <2 x i64> @lshr_v2i64(<2 x i64> %0, <2 x i64> %1){ +; CHECK-LABEL: lshr_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.2d, v1.2d +; CHECK-NEXT: ushl v0.2d, v0.2d, v1.2d +; CHECK-NEXT: ret + %3 = lshr <2 x i64> %0, %1 
+ ret <2 x i64> %3 +} + +; ===== Vector Larger/Smaller than Legal ===== + +define <4 x i8> @shl_v4i8(<4 x i8> %0, <4 x i8> %1){ +; CHECK-LABEL: shl_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bic v1.4h, #255, lsl #8 +; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %3 = shl <4 x i8> %0, %1 + ret <4 x i8> %3 +} + +define <32 x i8> @shl_v32i8(<32 x i8> %0, <32 x i8> %1){ +; CHECK-LABEL: shl_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v1.16b, v1.16b, v3.16b +; CHECK-NEXT: ushl v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ret + %3 = shl <32 x i8> %0, %1 + ret <32 x i8> %3 +} + +define <2 x i16> @shl_v2i16(<2 x i16> %0, <2 x i16> %1){ +; CHECK-LABEL: shl_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: movi d2, #0x00ffff0000ffff +; CHECK-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %3 = shl <2 x i16> %0, %1 + ret <2 x i16> %3 +} + +define <16 x i16> @shl_v16i16(<16 x i16> %0, <16 x i16> %1){ +; CHECK-LABEL: shl_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v1.8h, v1.8h, v3.8h +; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ret + %3 = shl <16 x i16> %0, %1 + ret <16 x i16> %3 +} + +define <1 x i32> @shl_v1i32(<1 x i32> %0, <1 x i32> %1){ +; CHECK-SD-LABEL: shl_v1i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushl v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_v1i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: lsl w8, w8, w9 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %3 = shl <1 x i32> %0, %1 + ret <1 x i32> %3 +} + +define <8 x i32> @shl_v8i32(<8 x i32> %0, <8 x i32> %1){ +; CHECK-SD-LABEL: shl_v8i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-SD-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_v8i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-GI-NEXT: ret + %3 = shl <8 x i32> %0, %1 + ret <8 x i32> %3 +} + +define <4 x i64> @shl_v4i64(<4 x i64> %0, <4 x i64> %1){ +; CHECK-SD-LABEL: shl_v4i64: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushl v1.2d, v1.2d, v3.2d +; CHECK-SD-NEXT: ushl v0.2d, v0.2d, v2.2d +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_v4i64: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushl v0.2d, v0.2d, v2.2d +; CHECK-GI-NEXT: ushl v1.2d, v1.2d, v3.2d +; CHECK-GI-NEXT: ret + %3 = shl <4 x i64> %0, %1 + ret <4 x i64> %3 +} + +define <4 x i8> @ashr_v4i8(<4 x i8> %0, <4 x i8> %1){ +; CHECK-LABEL: ashr_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-NEXT: bic v1.4h, #255, lsl #8 +; CHECK-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-NEXT: neg v1.4h, v1.4h +; CHECK-NEXT: sshl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %3 = ashr <4 x i8> %0, %1 + ret <4 x i8> %3 +} + +define <32 x i8> @ashr_v32i8(<32 x i8> %0, <32 x i8> %1){ +; CHECK-LABEL: ashr_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v2.16b, v2.16b +; CHECK-NEXT: neg v3.16b, v3.16b +; CHECK-NEXT: sshl v0.16b, v0.16b, v2.16b +; CHECK-NEXT: sshl v1.16b, v1.16b, v3.16b +; CHECK-NEXT: ret + %3 = ashr <32 x i8> %0, %1 + ret <32 x i8> %3 +} + +define <2 x i16> @ashr_v2i16(<2 x i16> %0, <2 x i16> %1){ +; CHECK-LABEL: ashr_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: movi d2, #0x00ffff0000ffff +; CHECK-NEXT: shl v0.2s, v0.2s, #16 +; CHECK-NEXT: sshr v0.2s, v0.2s, #16 +; CHECK-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-NEXT: neg v1.2s, v1.2s +; CHECK-NEXT: sshl v0.2s, v0.2s, 
v1.2s +; CHECK-NEXT: ret + %3 = ashr <2 x i16> %0, %1 + ret <2 x i16> %3 +} + +define <16 x i16> @ashr_v16i16(<16 x i16> %0, <16 x i16> %1){ +; CHECK-LABEL: ashr_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v2.8h, v2.8h +; CHECK-NEXT: neg v3.8h, v3.8h +; CHECK-NEXT: sshl v0.8h, v0.8h, v2.8h +; CHECK-NEXT: sshl v1.8h, v1.8h, v3.8h +; CHECK-NEXT: ret + %3 = ashr <16 x i16> %0, %1 + ret <16 x i16> %3 +} + +define <1 x i32> @ashr_v1i32(<1 x i32> %0, <1 x i32> %1){ +; CHECK-SD-LABEL: ashr_v1i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: neg v1.2s, v1.2s +; CHECK-SD-NEXT: sshl v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_v1i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: asr w8, w8, w9 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %3 = ashr <1 x i32> %0, %1 + ret <1 x i32> %3 +} + +define <8 x i32> @ashr_v8i32(<8 x i32> %0, <8 x i32> %1){ +; CHECK-LABEL: ashr_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: sshl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ret + %3 = ashr <8 x i32> %0, %1 + ret <8 x i32> %3 +} + +define <4 x i64> @ashr_v4i64(<4 x i64> %0, <4 x i64> %1){ +; CHECK-LABEL: ashr_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v2.2d, v2.2d +; CHECK-NEXT: neg v3.2d, v3.2d +; CHECK-NEXT: sshl v0.2d, v0.2d, v2.2d +; CHECK-NEXT: sshl v1.2d, v1.2d, v3.2d +; CHECK-NEXT: ret + %3 = ashr <4 x i64> %0, %1 + ret <4 x i64> %3 +} + +define <4 x i8> @lshr_v4i8(<4 x i8> %0, <4 x i8> %1){ +; CHECK-LABEL: lshr_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: bic v1.4h, #255, lsl #8 +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: neg v1.4h, v1.4h +; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %3 = lshr <4 x i8> %0, %1 + ret <4 x i8> %3 +} + +define <32 x i8> @lshr_v32i8(<32 x i8> %0, <32 x i8> %1){ +; CHECK-LABEL: lshr_v32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v2.16b, v2.16b +; CHECK-NEXT: neg v3.16b, v3.16b +; CHECK-NEXT: ushl v0.16b, v0.16b, v2.16b +; CHECK-NEXT: ushl v1.16b, v1.16b, v3.16b +; CHECK-NEXT: ret + %3 = lshr <32 x i8> %0, %1 + ret <32 x i8> %3 +} + +define <2 x i16> @lshr_v2i16(<2 x i16> %0, <2 x i16> %1){ +; CHECK-LABEL: lshr_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: movi d2, #0x00ffff0000ffff +; CHECK-NEXT: and v1.8b, v1.8b, v2.8b +; CHECK-NEXT: and v0.8b, v0.8b, v2.8b +; CHECK-NEXT: neg v1.2s, v1.2s +; CHECK-NEXT: ushl v0.2s, v0.2s, v1.2s +; CHECK-NEXT: ret + %3 = lshr <2 x i16> %0, %1 + ret <2 x i16> %3 +} + +define <16 x i16> @lshr_v16i16(<16 x i16> %0, <16 x i16> %1){ +; CHECK-LABEL: lshr_v16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v2.8h, v2.8h +; CHECK-NEXT: neg v3.8h, v3.8h +; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h +; CHECK-NEXT: ushl v1.8h, v1.8h, v3.8h +; CHECK-NEXT: ret + %3 = lshr <16 x i16> %0, %1 + ret <16 x i16> %3 +} + +define <1 x i32> @lshr_v1i32(<1 x i32> %0, <1 x i32> %1){ +; CHECK-SD-LABEL: lshr_v1i32: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: neg v1.2s, v1.2s +; CHECK-SD-NEXT: ushl v0.2s, v0.2s, v1.2s +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: lshr_v1i32: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov x8, d0 +; CHECK-GI-NEXT: fmov x9, d1 +; CHECK-GI-NEXT: lsr w8, w8, w9 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0 +; CHECK-GI-NEXT: ret + %3 = lshr <1 x i32> %0, %1 + ret <1 x i32> %3 +} + +define <8 x i32> 
@lshr_v8i32(<8 x i32> %0, <8 x i32> %1){ +; CHECK-LABEL: lshr_v8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v2.4s, v2.4s +; CHECK-NEXT: neg v3.4s, v3.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v2.4s +; CHECK-NEXT: ushl v1.4s, v1.4s, v3.4s +; CHECK-NEXT: ret + %3 = lshr <8 x i32> %0, %1 + ret <8 x i32> %3 +} + +define <4 x i64> @lshr_v4i64(<4 x i64> %0, <4 x i64> %1){ +; CHECK-LABEL: lshr_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v2.2d, v2.2d +; CHECK-NEXT: neg v3.2d, v3.2d +; CHECK-NEXT: ushl v0.2d, v0.2d, v2.2d +; CHECK-NEXT: ushl v1.2d, v1.2d, v3.2d +; CHECK-NEXT: ret + %3 = lshr <4 x i64> %0, %1 + ret <4 x i64> %3 +} + +; ===== Vector with Non-Pow 2 Width ===== + +define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){ +; CHECK-LABEL: shl_v3i8: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s0, w3 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: mov v0.h[1], w4 +; CHECK-NEXT: mov v1.h[1], w1 +; CHECK-NEXT: mov v0.h[2], w5 +; CHECK-NEXT: mov v1.h[2], w2 +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: ushl v0.4h, v1.4h, v0.4h +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: umov w1, v0.h[1] +; CHECK-NEXT: umov w2, v0.h[2] +; CHECK-NEXT: ret + %3 = shl <3 x i8> %0, %1 + ret <3 x i8> %3 +} + +define <7 x i8> @shl_v7i8(<7 x i8> %0, <7 x i8> %1){ +; CHECK-LABEL: shl_v7i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %3 = shl <7 x i8> %0, %1 + ret <7 x i8> %3 +} + +define <3 x i16> @shl_v3i16(<3 x i16> %0, <3 x i16> %1){ +; CHECK-LABEL: shl_v3i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %3 = shl <3 x i16> %0, %1 + ret <3 x i16> %3 +} + +define <7 x i16> @shl_v7i16(<7 x i16> %0, <7 x i16> %1){ +; CHECK-LABEL: shl_v7i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %3 = shl <7 x i16> %0, %1 + ret <7 x i16> %3 +} + +define <3 x i32> @shl_v3i32(<3 x i32> %0, <3 x i32> %1){ +; CHECK-LABEL: shl_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %3 = shl <3 x i32> %0, %1 + ret <3 x i32> %3 +} + +define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){ +; CHECK-LABEL: ashr_v3i8: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s0, w0 +; CHECK-NEXT: fmov s1, w3 +; CHECK-NEXT: mov v0.h[1], w1 +; CHECK-NEXT: mov v1.h[1], w4 +; CHECK-NEXT: mov v0.h[2], w2 +; CHECK-NEXT: mov v1.h[2], w5 +; CHECK-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-NEXT: bic v1.4h, #255, lsl #8 +; CHECK-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-NEXT: neg v1.4h, v1.4h +; CHECK-NEXT: sshl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: umov w1, v0.h[1] +; CHECK-NEXT: umov w2, v0.h[2] +; CHECK-NEXT: ret + %3 = ashr <3 x i8> %0, %1 + ret <3 x i8> %3 +} + +define <7 x i8> @ashr_v7i8(<7 x i8> %0, <7 x i8> %1){ +; CHECK-LABEL: ashr_v7i8: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.8b, v1.8b +; CHECK-NEXT: sshl v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %3 = ashr <7 x i8> %0, %1 + ret <7 x i8> %3 +} + +define <3 x i16> @ashr_v3i16(<3 x i16> %0, <3 x i16> %1){ +; CHECK-LABEL: ashr_v3i16: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.4h, v1.4h +; CHECK-NEXT: sshl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %3 = ashr <3 x i16> %0, %1 + ret <3 x i16> %3 +} + +define <7 x i16> @ashr_v7i16(<7 x i16> %0, <7 x i16> %1){ +; CHECK-LABEL: ashr_v7i16: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.8h, v1.8h +; CHECK-NEXT: sshl v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %3 = ashr <7 x i16> %0, %1 + ret <7 x i16> %3 +} + +define <3 x i32> @ashr_v3i32(<3 x i32> %0, <3 x i32> %1){ +; CHECK-LABEL: ashr_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: neg 
v1.4s, v1.4s +; CHECK-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %3 = ashr <3 x i32> %0, %1 + ret <3 x i32> %3 +} + +define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){ +; CHECK-LABEL: lshr_v3i8: +; CHECK: // %bb.0: +; CHECK-NEXT: fmov s0, w3 +; CHECK-NEXT: fmov s1, w0 +; CHECK-NEXT: mov v0.h[1], w4 +; CHECK-NEXT: mov v1.h[1], w1 +; CHECK-NEXT: mov v0.h[2], w5 +; CHECK-NEXT: mov v1.h[2], w2 +; CHECK-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-NEXT: bic v1.4h, #255, lsl #8 +; CHECK-NEXT: neg v0.4h, v0.4h +; CHECK-NEXT: ushl v0.4h, v1.4h, v0.4h +; CHECK-NEXT: umov w0, v0.h[0] +; CHECK-NEXT: umov w1, v0.h[1] +; CHECK-NEXT: umov w2, v0.h[2] +; CHECK-NEXT: ret + %3 = lshr <3 x i8> %0, %1 + ret <3 x i8> %3 +} + +define <7 x i8> @lshr_v7i8(<7 x i8> %0, <7 x i8> %1){ +; CHECK-LABEL: lshr_v7i8: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.8b, v1.8b +; CHECK-NEXT: ushl v0.8b, v0.8b, v1.8b +; CHECK-NEXT: ret + %3 = lshr <7 x i8> %0, %1 + ret <7 x i8> %3 +} + +define <3 x i16> @lshr_v3i16(<3 x i16> %0, <3 x i16> %1){ +; CHECK-LABEL: lshr_v3i16: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.4h, v1.4h +; CHECK-NEXT: ushl v0.4h, v0.4h, v1.4h +; CHECK-NEXT: ret + %3 = lshr <3 x i16> %0, %1 + ret <3 x i16> %3 +} + +define <7 x i16> @lshr_v7i16(<7 x i16> %0, <7 x i16> %1){ +; CHECK-LABEL: lshr_v7i16: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.8h, v1.8h +; CHECK-NEXT: ushl v0.8h, v0.8h, v1.8h +; CHECK-NEXT: ret + %3 = lshr <7 x i16> %0, %1 + ret <7 x i16> %3 +} + +define <3 x i32> @lshr_v3i32(<3 x i32> %0, <3 x i32> %1){ +; CHECK-LABEL: lshr_v3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: neg v1.4s, v1.4s +; CHECK-NEXT: ushl v0.4s, v0.4s, v1.4s +; CHECK-NEXT: ret + %3 = lshr <3 x i32> %0, %1 + ret <3 x i32> %3 +} + +; TODO: +; ===== Vector with Odd Element Sizes ===== From b689b4fe55103a04eac847964e126b6048b89ae0 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Mon, 22 Jan 2024 07:27:06 -0600 Subject: [PATCH 381/843] [LLVM][CMake] Add ffi_static target for the FFI static library (#78779) Summary: This patch is an attempt to make the `find_package(FFI)` support in LLVM prefer to provide the static library version if present. This is currently an optional library for building `libffi`, and its presence implies that it should likely be used. This patch is an attempt to fix some problems observed with testing programs linked against `libffi` on many different systems that could have conflicting paths. Linking it statically prevents this. This patch adds the `ffi_static` target for this library. --- llvm/cmake/modules/FindFFI.cmake | 14 ++++++++++++-- openmp/libomptarget/plugins-nextgen/CMakeLists.txt | 13 ++++++++----- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/llvm/cmake/modules/FindFFI.cmake b/llvm/cmake/modules/FindFFI.cmake index c9ba104601872..8e67c5d8c6d17 100644 --- a/llvm/cmake/modules/FindFFI.cmake +++ b/llvm/cmake/modules/FindFFI.cmake @@ -15,6 +15,7 @@ # FFI_FOUND # FFI_INCLUDE_DIRS # FFI_LIBRARIES +# FFI_STATIC_LIBRARIES # HAVE_FFI_CALL # # HAVE_FFI_H or HAVE_FFI_FFI_H is defined depending on the ffi.h include path. 
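As a rough sketch of the intended usage (not part of this patch; `my_tool` is a placeholder target name), a consumer of this module could now prefer the static library whenever the new imported target exists:

  find_package(FFI)
  if(TARGET FFI::ffi_static)
    target_link_libraries(my_tool PRIVATE FFI::ffi_static)
  else()
    target_link_libraries(my_tool PRIVATE FFI::ffi)
  endif()

The libomptarget change further down in this patch applies exactly this fallback logic.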
@@ -34,7 +35,8 @@ else() endif() endif() -find_library(FFI_LIBRARIES ffi PATHS ${FFI_LIBRARY_DIR}) +find_library(FFI_LIBRARIES NAMES ffi PATHS ${FFI_LIBRARY_DIR}) +find_library(FFI_STATIC_LIBRARIES NAMES libffi.a PATHS ${FFI_LIBRARY_DIR}) if(FFI_LIBRARIES) include(CMakePushCheckState) @@ -76,6 +78,7 @@ find_package_handle_standard_args(FFI ${required_includes} HAVE_FFI_CALL) mark_as_advanced(FFI_LIBRARIES + FFI_STATIC_LIBRARIES FFI_INCLUDE_DIRS HAVE_FFI_CALL FFI_HEADER @@ -83,11 +86,18 @@ mark_as_advanced(FFI_LIBRARIES HAVE_FFI_FFI_H) if(FFI_FOUND) - if(NOT TARGET FFI::ffi) + if(NOT TARGET FFI::ffi AND FFI_LIBRARIES) add_library(FFI::ffi UNKNOWN IMPORTED) set_target_properties(FFI::ffi PROPERTIES IMPORTED_LOCATION "${FFI_LIBRARIES}") if(FFI_INCLUDE_DIRS) set_target_properties(FFI::ffi PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${FFI_INCLUDE_DIRS}") endif() endif() + if(NOT TARGET FFI::ffi_static AND FFI_STATIC_LIBRARIES) + add_library(FFI::ffi_static UNKNOWN IMPORTED) + set_target_properties(FFI::ffi_static PROPERTIES IMPORTED_LOCATION "${FFI_STATIC_LIBRARIES}") + if(FFI_INCLUDE_DIRS) + set_target_properties(FFI::ffi_static PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${FFI_INCLUDE_DIRS}") + endif() + endif() endif() diff --git a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt index 9b4e94550239c..f5fc3b6211109 100644 --- a/openmp/libomptarget/plugins-nextgen/CMakeLists.txt +++ b/openmp/libomptarget/plugins-nextgen/CMakeLists.txt @@ -49,11 +49,14 @@ if(CMAKE_SYSTEM_PROCESSOR MATCHES "${tmachine}$") ) if(LIBOMPTARGET_DEP_LIBFFI_FOUND) - libomptarget_say("Building ${tmachine_libname} plugin linked with libffi") - target_link_libraries("omptarget.rtl.${tmachine_libname}" PRIVATE - ${FFI_LIBRARIES}) - target_include_directories("omptarget.rtl.${tmachine_libname}" PRIVATE - ${FFI_INCLUDE_DIRS}) + libomptarget_say("Building ${tmachine_libname} plugin linked with libffi") + if(FFI_STATIC_LIBRARIES) + target_link_libraries( + "omptarget.rtl.${tmachine_libname}" PRIVATE FFI::ffi_static) + else() + target_link_libraries( + "omptarget.rtl.${tmachine_libname}" PRIVATE FFI::ffi) + endif() else() libomptarget_say("Building ${tmachine_libname} plugin for dlopened libffi") target_sources("omptarget.rtl.${tmachine_libname}" PRIVATE From 5266c1285bb9560d8d9d266977167f6c24282e95 Mon Sep 17 00:00:00 2001 From: Orlando Cazalet-Hyams Date: Mon, 22 Jan 2024 13:30:50 +0000 Subject: [PATCH 382/843] Revert "[hwasan] Update dbg.assign intrinsics in HWAsan pass" (#78971) Reverts llvm/llvm-project#78606 https://lab.llvm.org/buildbot/#/builders/77/builds/33963 --- llvm/lib/IR/DebugInfo.cpp | 4 ++ .../Instrumentation/HWAddressSanitizer.cpp | 5 -- .../Transforms/Utils/MemoryTaggingSupport.cpp | 10 +-- .../AArch64/dbg-assign-tag-offset-mix-loc.ll | 71 ------------------- .../CodeGen/AArch64/dbg-assign-tag-offset.ll | 71 ------------------- .../declare-to-assign/hwasan.ll | 2 +- .../dbg-assign-tag-offset.ll | 59 --------------- 7 files changed, 8 insertions(+), 214 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll delete mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll delete mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 2e64d0db57b25..fcd3f77f8f6e2 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2200,6 +2200,10 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { if
(F.hasFnAttribute(Attribute::OptimizeNone)) return /*Changed*/ false; + // FIXME: https://github.com/llvm/llvm-project/issues/76545 + if (F.hasFnAttribute(Attribute::SanitizeHWAddress)) + return /*Changed*/ false; + bool Changed = false; auto *DL = &F.getParent()->getDataLayout(); // Collect a map of {backing storage : dbg.declares} (currently "backing diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index b213915dbcf62..efb621cde9065 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1435,11 +1435,6 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, if (DDI->getVariableLocationOp(LocNo) == AI) DDI->setExpression(DIExpression::appendOpsToArg(DDI->getExpression(), NewOps, LocNo)); - if (auto *DAI = dyn_cast(DDI)) { - assert(DAI->getAddress() == AI); - DAI->setAddressExpression( - DIExpression::prependOpcodes(DDI->getExpression(), NewOps)); - } } auto TagEnd = [&](Instruction *Node) { diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index d2efcde5d3803..f94047633022c 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -138,20 +138,16 @@ void StackInfoBuilder::visit(Instruction &Inst) { return; } if (auto *DVI = dyn_cast(&Inst)) { - auto AddIfInteresting = [&](Value *V) { + for (Value *V : DVI->location_ops()) { if (auto *AI = dyn_cast_or_null(V)) { if (!isInterestingAlloca(*AI)) - return; + continue; AllocaInfo &AInfo = Info.AllocasToInstrument[AI]; auto &DVIVec = AInfo.DbgVariableIntrinsics; if (DVIVec.empty() || DVIVec.back() != DVI) DVIVec.push_back(DVI); } - }; - for (Value *V : DVI->location_ops()) - AddIfInteresting(V); - if (auto *DAI = dyn_cast(DVI)) - AddIfInteresting(DAI->getAddress()); + } } Instruction *ExitUntag = getUntagLocationIfFunctionExit(Inst); if (ExitUntag) diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll deleted file mode 100644 index ef0dd46cb45c7..0000000000000 --- a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s - -;; Similar to dbg-assign-tag-offset.ll except the variable 'x' has been removed -;; and 'y' has an implicit location range as well as stack location range -;; (according to the hand-modified debug info -- see the dbg.value). 
- -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-android24" - -; CHECK: DW_TAG_variable -; CHECK-NOT: DW_TAG -; CHECK: DW_AT_LLVM_tag_offset (0x80) -; CHECK-NEXT: DW_AT_name ("y") - -define dso_local void @f() !dbg !14 { - %1 = alloca i32, align 4, !DIAssignID !31 - %2 = alloca i32, align 4, !DIAssignID !32 - call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - call void @llvm.dbg.value(metadata i32 2, metadata !20, metadata !DIExpression()), !dbg !22 - call void @use(ptr null), !dbg !28 - store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 - call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - call void @use(ptr nonnull %1), !dbg !28 - call void @use(ptr nonnull %2), !dbg !29 - ret void, !dbg !30 -} - -declare !dbg !5 void @use(ptr) - -declare void @llvm.dbg.value(metadata, metadata, metadata) -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} -!llvm.ident = !{!13} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) -!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") -!2 = !{} -!3 = !{!4, !5} -!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) -!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) -!6 = !DISubroutineType(types: !7) -!7 = !{null, !4} -!8 = !{i32 7, !"Dwarf Version", i32 4} -!9 = !{i32 2, !"Debug Info Version", i32 3} -!10 = !{i32 1, !"wchar_size", i32 4} -!11 = !{i32 7, !"PIC Level", i32 2} -!12 = !{i32 7, !"PIE Level", i32 2} -!13 = !{!"clang version 10.0.0"} -!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) -!15 = !DISubroutineType(types: !16) -!16 = !{null} -!17 = !{!18, !20} -!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) -!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) -!21 = !DILocation(line: 5, column: 3, scope: !14) -!22 = !DILocation(line: 0, scope: !14) -!23 = !DILocation(line: 5, column: 10, scope: !14) -!24 = !{!25, !25, i64 0} -!25 = !{!"int", !26, i64 0} -!26 = !{!"omnipotent char", !27, i64 0} -!27 = !{!"Simple C++ TBAA"} -!28 = !DILocation(line: 6, column: 3, scope: !14) -!29 = !DILocation(line: 7, column: 3, scope: !14) -!30 = !DILocation(line: 8, column: 1, scope: !14) -!31 = distinct !DIAssignID() -!32 = distinct !DIAssignID() -!33 = distinct !DIAssignID() -!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll deleted file mode 100644 index a587f93d14d70..0000000000000 --- a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | 
FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-android24" - -; CHECK: DW_TAG_variable -; CHECK-NOT: DW_TAG -; CHECK: DW_AT_LLVM_tag_offset (0x00) -; CHECK-NEXT: DW_AT_name ("x") - -; CHECK: DW_TAG_variable -; CHECK-NOT: DW_TAG -; CHECK: DW_AT_LLVM_tag_offset (0x80) -; CHECK-NEXT: DW_AT_name ("y") - -define dso_local void @f() !dbg !14 { - %1 = alloca i32, align 4, !DIAssignID !31 - call void @llvm.dbg.assign(metadata i1 undef, metadata !18, metadata !DIExpression(), metadata !31, metadata ptr %1, metadata !DIExpression(DW_OP_LLVM_tag_offset, 0)), !dbg !22 - %2 = alloca i32, align 4, !DIAssignID !32 - call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 - call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - call void @use(ptr nonnull %1), !dbg !28 - call void @use(ptr nonnull %2), !dbg !29 - ret void, !dbg !30 -} - -declare !dbg !5 void @use(ptr) - -declare void @llvm.dbg.value(metadata, metadata, metadata) -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} -!llvm.ident = !{!13} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) -!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") -!2 = !{} -!3 = !{!4, !5} -!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) -!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) -!6 = !DISubroutineType(types: !7) -!7 = !{null, !4} -!8 = !{i32 7, !"Dwarf Version", i32 4} -!9 = !{i32 2, !"Debug Info Version", i32 3} -!10 = !{i32 1, !"wchar_size", i32 4} -!11 = !{i32 7, !"PIC Level", i32 2} -!12 = !{i32 7, !"PIE Level", i32 2} -!13 = !{!"clang version 10.0.0"} -!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) -!15 = !DISubroutineType(types: !16) -!16 = !{null} -!17 = !{!18, !20} -!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) -!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) -!21 = !DILocation(line: 5, column: 3, scope: !14) -!22 = !DILocation(line: 0, scope: !14) -!23 = !DILocation(line: 5, column: 10, scope: !14) -!24 = !{!25, !25, i64 0} -!25 = !{!"int", !26, i64 0} -!26 = !{!"omnipotent char", !27, i64 0} -!27 = !{!"Simple C++ TBAA"} -!28 = !DILocation(line: 6, column: 3, scope: !14) -!29 = !DILocation(line: 7, column: 3, scope: !14) -!30 = !DILocation(line: 8, column: 1, scope: !14) -!31 = distinct !DIAssignID() -!32 = distinct !DIAssignID() -!33 = distinct !DIAssignID() -!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll 
b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll index 6c9366609cba2..c4b209de77017 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll @@ -1,6 +1,6 @@ ; RUN: opt %s -S -passes=declare-to-assign -o - | FileCheck %s -; CHECK: call void @llvm.dbg.assign +; CHECK: call void @llvm.dbg.declare define dso_local void @f() sanitize_hwaddress !dbg !9 { entry: diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll deleted file mode 100644 index 1248c0bc586ab..0000000000000 --- a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll +++ /dev/null @@ -1,59 +0,0 @@ -; RUN: opt -passes=hwasan -S -o - %s | FileCheck %s - -source_filename = "test.ll" -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64--linux-android" - -declare void @g(ptr, ptr, ptr, ptr, ptr, ptr) - -; Function Attrs: sanitize_hwaddress -define void @f() #0 !dbg !7 { -entry: - %nodebug0 = alloca ptr, align 8 - %nodebug1 = alloca ptr, align 8 - %nodebug2 = alloca ptr, align 8 - %nodebug3 = alloca ptr, align 8 - ; CHECK: %a = alloca{{.*}} !DIAssignID ![[ID1:[0-9]+]] - %a = alloca ptr, align 8, !DIAssignID !13 - ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID1]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 32) - call void @llvm.dbg.assign(metadata i1 undef, metadata !14, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !15 - ; CHECK: %b = alloca{{.*}} !DIAssignID ![[ID2:[0-9]+]] - %b = alloca ptr, align 8, !DIAssignID !16 - ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID2]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 96) - call void @llvm.dbg.assign(metadata i1 undef, metadata !17, metadata !DIExpression(), metadata !16, metadata ptr %b, metadata !DIExpression()), !dbg !15 - call void @g(ptr %nodebug0, ptr %nodebug1, ptr %nodebug2, ptr %nodebug3, ptr %a, ptr %b) - ret void, !dbg !18 -} - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) #1 - -attributes #0 = { sanitize_hwaddress } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) -!1 = !DIFile(filename: "x.c", directory: "/") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 7, !"debug-info-assignment-tracking", i1 true} -!6 = !{!"clang"} -!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) -!8 = !DISubroutineType(types: !9) -!9 = !{null, !10} -!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) -!11 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !12) -!12 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char) -!13 = distinct !DIAssignID() -!14 = !DILocalVariable(name: "a", 
scope: !7, file: !1, line: 1, type: !10) -!15 = !DILocation(line: 0, scope: !7) -!16 = distinct !DIAssignID() -!17 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 1, type: !10) -!18 = !DILocation(line: 1, column: 37, scope: !7) From 490a09a02e81c0034aa08a800fa7a57ec6ef0767 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Pettersson?= Date: Mon, 22 Jan 2024 14:43:02 +0100 Subject: [PATCH 383/843] [UnrollAnalyzerTest] Remove dependency to pass managers (#78473) Remove use of LegacyPassManager in the UnrollAnalyzerTest unit test. Given that the goal isn't to test pass manager interfaces, and since the LoopUnrollAnalyzer isn't even implemented as a pass, we do not really need the complexity of using a pass manager. Instead we just make sure that we run LoopUnrollAnalyzer and other needed analyses standalone (without any pass manager). This was inspired by the LoopInfoTest unit test. --- .../unittests/Analysis/UnrollAnalyzerTest.cpp | 115 +++++++----------- 1 file changed, 45 insertions(+), 70 deletions(-) diff --git a/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp b/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp index 2f9135d4fb242..721d67f22f2f2 100644 --- a/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp +++ b/llvm/unittests/Analysis/UnrollAnalyzerTest.cpp @@ -6,61 +6,52 @@ // //===----------------------------------------------------------------------===// +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopUnrollAnalyzer.h" +#include "llvm/Analysis/ScalarEvolution.h" +#include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/AsmParser/Parser.h" #include "llvm/IR/Dominators.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/InitializePasses.h" #include "llvm/Support/SourceMgr.h" #include "gtest/gtest.h" using namespace llvm; -namespace llvm { -void initializeUnrollAnalyzerTestPass(PassRegistry &); -static SmallVector, 16> SimplifiedValuesVector; -static unsigned TripCount = 0; +typedef SmallVector, 16> SimplifiedValuesVectorTy; -namespace { -struct UnrollAnalyzerTest : public FunctionPass { - static char ID; - bool runOnFunction(Function &F) override { - LoopInfo *LI = &getAnalysis().getLoopInfo(); - ScalarEvolution *SE = &getAnalysis().getSE(); +/// Build loop info and scalar evolution for the function and run the analysis. +static void +runUnrollAnalyzer(Module &M, StringRef FuncName, + SimplifiedValuesVectorTy &SimplifiedValuesVector) { + auto *F = M.getFunction(FuncName); + ASSERT_NE(F, nullptr) << "Could not find " << FuncName; - Function::iterator FI = F.begin(); - FI++; // First basic block is entry - skip it. 
- BasicBlock *Header = &*FI++; - Loop *L = LI->getLoopFor(Header); - BasicBlock *Exiting = L->getExitingBlock(); + TargetLibraryInfoImpl TLII; + TargetLibraryInfo TLI(TLII); + AssumptionCache AC(*F); + DominatorTree DT(*F); + LoopInfo LI(DT); + ScalarEvolution SE(*F, TLI, AC, DT, LI); - SimplifiedValuesVector.clear(); - TripCount = SE->getSmallConstantTripCount(L, Exiting); - for (unsigned Iteration = 0; Iteration < TripCount; Iteration++) { - DenseMap SimplifiedValues; - UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, *SE, L); - for (auto *BB : L->getBlocks()) - for (Instruction &I : *BB) - Analyzer.visit(I); - SimplifiedValuesVector.push_back(SimplifiedValues); - } - return false; - } - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); - AU.setPreservesAll(); - } - UnrollAnalyzerTest() : FunctionPass(ID) { - initializeUnrollAnalyzerTestPass(*PassRegistry::getPassRegistry()); + Function::iterator FI = F->begin(); + FI++; // First basic block is entry - skip it. + BasicBlock *Header = &*FI++; + Loop *L = LI.getLoopFor(Header); + BasicBlock *Exiting = L->getExitingBlock(); + + SimplifiedValuesVector.clear(); + unsigned TripCount = SE.getSmallConstantTripCount(L, Exiting); + for (unsigned Iteration = 0; Iteration < TripCount; Iteration++) { + DenseMap SimplifiedValues; + UnrolledInstAnalyzer Analyzer(Iteration, SimplifiedValues, SE, L); + for (auto *BB : L->getBlocks()) + for (Instruction &I : *BB) + Analyzer.visit(I); + SimplifiedValuesVector.push_back(SimplifiedValues); } -}; } -char UnrollAnalyzerTest::ID = 0; - std::unique_ptr makeLLVMModule(LLVMContext &Context, const char *ModuleStr) { SMDiagnostic Err; @@ -85,12 +76,11 @@ TEST(UnrollAnalyzerTest, BasicSimplifications) { " %x.lcssa = phi i64 [ %x2, %loop ]\n" " ret i64 %x.lcssa\n" "}\n"; - UnrollAnalyzerTest *P = new UnrollAnalyzerTest(); LLVMContext Context; std::unique_ptr M = makeLLVMModule(Context, ModuleStr); - legacy::PassManager Passes; - Passes.add(P); - Passes.run(*M); + SimplifiedValuesVectorTy SimplifiedValuesVector; + runUnrollAnalyzer(*M, "propagate_loop_phis", SimplifiedValuesVector); + unsigned TripCount = SimplifiedValuesVector.size(); // Perform checks Module::iterator MI = M->begin(); @@ -148,12 +138,10 @@ TEST(UnrollAnalyzerTest, OuterLoopSimplification) { " ret void\n" "}\n"; - UnrollAnalyzerTest *P = new UnrollAnalyzerTest(); LLVMContext Context; std::unique_ptr M = makeLLVMModule(Context, ModuleStr); - legacy::PassManager Passes; - Passes.add(P); - Passes.run(*M); + SimplifiedValuesVectorTy SimplifiedValuesVector; + runUnrollAnalyzer(*M, "foo", SimplifiedValuesVector); Module::iterator MI = M->begin(); Function *F = &*MI++; @@ -177,6 +165,7 @@ TEST(UnrollAnalyzerTest, OuterLoopSimplification) { auto I2 = SimplifiedValuesVector[0].find(Y2); EXPECT_TRUE(I2 == SimplifiedValuesVector[0].end()); } + TEST(UnrollAnalyzerTest, CmpSimplifications) { const char *ModuleStr = "target datalayout = \"e-m:o-i64:64-f80:128-n8:16:32:64-S128\"\n" @@ -193,12 +182,10 @@ TEST(UnrollAnalyzerTest, CmpSimplifications) { "for.end:\n" " ret void\n" "}\n"; - UnrollAnalyzerTest *P = new UnrollAnalyzerTest(); LLVMContext Context; std::unique_ptr M = makeLLVMModule(Context, ModuleStr); - legacy::PassManager Passes; - Passes.add(P); - Passes.run(*M); + SimplifiedValuesVectorTy SimplifiedValuesVector; + runUnrollAnalyzer(*M, "branch_iv_trunc", SimplifiedValuesVector); // Perform checks Module::iterator MI = M->begin(); @@ -221,6 +208,7 @@ TEST(UnrollAnalyzerTest, 
CmpSimplifications) { EXPECT_TRUE(I2 != SimplifiedValuesVector[5].end()); EXPECT_EQ(cast((*I2).second)->getZExtValue(), 1U); } + TEST(UnrollAnalyzerTest, PtrCmpSimplifications) { const char *ModuleStr = "target datalayout = \"e-m:o-i64:64-f80:128-n8:16:32:64-S128\"\n" @@ -240,12 +228,10 @@ TEST(UnrollAnalyzerTest, PtrCmpSimplifications) { "loop.exit:\n" " ret void\n" "}\n"; - UnrollAnalyzerTest *P = new UnrollAnalyzerTest(); LLVMContext Context; std::unique_ptr M = makeLLVMModule(Context, ModuleStr); - legacy::PassManager Passes; - Passes.add(P); - Passes.run(*M); + SimplifiedValuesVectorTy SimplifiedValuesVector; + runUnrollAnalyzer(*M, "ptr_cmp", SimplifiedValuesVector); // Perform checks Module::iterator MI = M->begin(); @@ -263,6 +249,7 @@ TEST(UnrollAnalyzerTest, PtrCmpSimplifications) { EXPECT_TRUE(I1 != SimplifiedValuesVector[5].end()); EXPECT_EQ(cast((*I1).second)->getZExtValue(), 0U); } + TEST(UnrollAnalyzerTest, CastSimplifications) { const char *ModuleStr = "target datalayout = \"e-m:o-i64:64-f80:128-n8:16:32:64-S128\"\n" @@ -286,12 +273,10 @@ TEST(UnrollAnalyzerTest, CastSimplifications) { " ret void\n" "}\n"; - UnrollAnalyzerTest *P = new UnrollAnalyzerTest(); LLVMContext Context; std::unique_ptr M = makeLLVMModule(Context, ModuleStr); - legacy::PassManager Passes; - Passes.add(P); - Passes.run(*M); + SimplifiedValuesVectorTy SimplifiedValuesVector; + runUnrollAnalyzer(*M, "const_load_cast", SimplifiedValuesVector); // Perform checks Module::iterator MI = M->begin(); @@ -319,13 +304,3 @@ TEST(UnrollAnalyzerTest, CastSimplifications) { EXPECT_TRUE(I3 != SimplifiedValuesVector[5].end()); EXPECT_EQ(cast((*I3).second)->getZExtValue(), 3U); } - -} // end namespace llvm - -INITIALIZE_PASS_BEGIN(UnrollAnalyzerTest, "unrollanalyzertestpass", - "unrollanalyzertestpass", false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) -INITIALIZE_PASS_END(UnrollAnalyzerTest, "unrollanalyzertestpass", - "unrollanalyzertestpass", false, false) From 8fab16c86c9860fa78022a97553761fcdccfa03e Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 22 Jan 2024 08:48:58 -0500 Subject: [PATCH 384/843] [libc++][NFC] Remove trailing whitespace from release notes --- libcxx/docs/ReleaseNotes/18.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index b868b37790f77..7d6331a025808 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -127,7 +127,7 @@ Improvements and New Features - AddressSanitizer annotations have been added to ``std::basic_string``. -- The libc++ source code has been formatted with ``clang-format``. This +- The libc++ source code has been formatted with ``clang-format``. This `discourse thread `_ contains information how to rebase downstream patches. From 5a667bee9c983f882255732ae27c96d1d94e0893 Mon Sep 17 00:00:00 2001 From: Alexey Bataev <5361294+alexey-bataev@users.noreply.github.com> Date: Fri, 19 Jan 2024 09:29:01 -0500 Subject: [PATCH 385/843] [InstCombine] Try to fold trunc(shuffle(zext)) to just a shuffle (#78636) Tries to remove extra trunc/ext instruction for shufflevector instructions. 
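To make the transform concrete, here is a hypothetical before/after pair (value names invented; the same shape appears in the updated logical-select tests below). Before the fold:

  %sext1 = sext <4 x i1> %cond1 to <4 x i32>
  %sext2 = sext <4 x i1> %cond2 to <4 x i32>
  %wide = shufflevector <4 x i32> %sext1, <4 x i32> %sext2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
  %mask = trunc <4 x i32> %wide to <4 x i1>

After the fold, the shuffle operates directly on the narrow element type and the ext/trunc pair disappears:

  %mask = shufflevector <4 x i1> %cond1, <4 x i1> %cond2, <4 x i32> <i32 0, i32 1, i32 4, i32 5>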
Differential Review: https://github.com/llvm/llvm-project/pull/78636 --- .../lib/Transforms/InstCombine/InstCombineCasts.cpp | 13 +++++++++++++ .../InstCombine/logical-select-inseltpoison.ll | 7 ++----- llvm/test/Transforms/InstCombine/logical-select.ll | 7 ++----- 3 files changed, 17 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp index 6629ca840a67c..58f0763bb0c0c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -103,6 +103,16 @@ Value *InstCombinerImpl::EvaluateInDifferentType(Value *V, Type *Ty, } } break; + case Instruction::ShuffleVector: { + auto *ScalarTy = cast(Ty)->getElementType(); + auto *VTy = cast(I->getOperand(0)->getType()); + auto *FixedTy = VectorType::get(ScalarTy, VTy->getElementCount()); + Value *Op0 = EvaluateInDifferentType(I->getOperand(0), FixedTy, isSigned); + Value *Op1 = EvaluateInDifferentType(I->getOperand(1), FixedTy, isSigned); + Res = new ShuffleVectorInst(Op0, Op1, + cast(I)->getShuffleMask()); + break; + } default: // TODO: Can handle more cases here. llvm_unreachable("Unreachable!"); @@ -363,6 +373,9 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombinerImpl &IC, I->getOpcode() == Instruction::FPToSI); return Ty->getScalarSizeInBits() >= MinBitWidth; } + case Instruction::ShuffleVector: + return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) && + canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI); default: // TODO: Can handle more cases here. break; diff --git a/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll b/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll index 29e2cb42e1bef..b3d147621b59e 100644 --- a/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/logical-select-inseltpoison.ll @@ -671,11 +671,8 @@ define <4 x i32> @computesignbits_through_shuffles(<4 x float> %x, <4 x float> % define <4 x i32> @computesignbits_through_two_input_shuffle(<4 x i32> %x, <4 x i32> %y, <4 x i1> %cond1, <4 x i1> %cond2) { ; CHECK-LABEL: @computesignbits_through_two_input_shuffle( -; CHECK-NEXT: [[SEXT1:%.*]] = sext <4 x i1> [[COND1:%.*]] to <4 x i32> -; CHECK-NEXT: [[SEXT2:%.*]] = sext <4 x i1> [[COND2:%.*]] to <4 x i32> -; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i32> [[SEXT1]], <4 x i32> [[SEXT2]], <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[COND]] to <4 x i1> -; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] +; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i1> [[COND1:%.*]], <4 x i1> [[COND2:%.*]], <4 x i32> +; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[COND]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[SEL]] ; %sext1 = sext <4 x i1> %cond1 to <4 x i32> diff --git a/llvm/test/Transforms/InstCombine/logical-select.ll b/llvm/test/Transforms/InstCombine/logical-select.ll index fcca9588767dd..af1a3e1455e49 100644 --- a/llvm/test/Transforms/InstCombine/logical-select.ll +++ b/llvm/test/Transforms/InstCombine/logical-select.ll @@ -707,11 +707,8 @@ define <4 x i32> @computesignbits_through_shuffles(<4 x float> %x, <4 x float> % define <4 x i32> @computesignbits_through_two_input_shuffle(<4 x i32> %x, <4 x i32> %y, <4 x i1> %cond1, <4 x i1> %cond2) { ; CHECK-LABEL: @computesignbits_through_two_input_shuffle( -; CHECK-NEXT: [[SEXT1:%.*]] = sext <4 x i1> [[COND1:%.*]] to <4 x i32> -; CHECK-NEXT: 
[[SEXT2:%.*]] = sext <4 x i1> [[COND2:%.*]] to <4 x i32> -; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i32> [[SEXT1]], <4 x i32> [[SEXT2]], <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = trunc <4 x i32> [[COND]] to <4 x i1> -; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] +; CHECK-NEXT: [[COND:%.*]] = shufflevector <4 x i1> [[COND1:%.*]], <4 x i1> [[COND2:%.*]], <4 x i32> +; CHECK-NEXT: [[SEL:%.*]] = select <4 x i1> [[COND]], <4 x i32> [[Y:%.*]], <4 x i32> [[X:%.*]] ; CHECK-NEXT: ret <4 x i32> [[SEL]] ; %sext1 = sext <4 x i1> %cond1 to <4 x i32> From cabe8be6bb2120828a741217365be975c59ac7b6 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 22 Jan 2024 15:02:21 +0100 Subject: [PATCH 386/843] [libc] `FPRep` builders return `FPRep` instead of raw `StorageType` (#78588) --- libc/src/__support/FPUtil/FPBits.h | 593 +++++++++--------- .../test/src/__support/FPUtil/fpbits_test.cpp | 171 ++--- 2 files changed, 399 insertions(+), 365 deletions(-) diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index be700285de828..bc6b19b6ebf10 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -64,38 +64,46 @@ LIBC_INLINE_VAR constexpr Sign Sign::POS = Sign(false); // └─────────▲─────────┘ // │ // ┌─────────┴─────────┐ -// │ FPRepBase │ +// │ FPStorage │ // └─────────▲─────────┘ // │ // ┌────────────┴─────────────┐ // │ │ -// ┌────────┴──────┐ ┌─────────────┴──────────────┐ -// │ FPRep │ │ FPRep │ │ FPRepSem │ +// └───────────┘ +// │ +// ┌─────┴─────┐ // │ FPBits │ // └───────────┘ // -// - 'FPLayout' defines only a few constants, namely the 'StorageType' and the -// length of the sign, the exponent and significand parts. -// - 'FPRepBase' builds more constants on top of those from 'FPLayout' like -// exponent bias, shifts and masks. It also defines tools to assemble or test +// - 'FPLayout' defines only a few constants, namely the 'StorageType' and +// length of the sign, the exponent, fraction and significand parts. +// - 'FPStorage' builds more constants on top of those from 'FPLayout' like +// exponent bias and masks. It also holds the bit representation of the +// floating point as a 'StorageType' type and defines tools to assemble or test // these parts. -// - 'FPRep' defines functions to interact with the floating point -// representation. The default implementation is the one for 'IEEE754', a -// specialization is provided for X86 Extended Precision that has a different -// encoding. -// - 'FPBits' is templated on the platform floating point types. Contrary to -// 'FPRep' that is platform agnostic 'FPBits' is architecture dependent. +// - 'FPRepSem' defines functions to interact semantically with the floating +// point representation. The default implementation is the one for 'IEEE754', a +// specialization is provided for X86 Extended Precision. +// - 'FPRep' derives from 'FPRepSem' and adds functions that are common to all +// implementations. +// - 'FPBits' exposes all functions from 'FPRep' but operates on the native C++ +// floating point type instead of 'FPType'. namespace internal { // Defines the layout (sign, exponent, significand) of a floating point type in // memory. It also defines its associated StorageType, i.e., the unsigned // integer type used to manipulate its representation. +// Additionally we provide the fractional part length, i.e., the number of bits +// after the decimal dot when the number is in normal form. 
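+// For instance, for 'FPType::IEEE754_Binary32' (the C++ 'float' type) the
+// specialization below holds 1 sign bit, 8 exponent bits and 23 significand
+// bits in a 32-bit storage integer, and FRACTION_LEN equals SIG_LEN.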
template struct FPLayout {}; template <> struct FPLayout { @@ -103,6 +111,7 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 5; LIBC_INLINE_VAR static constexpr int SIG_LEN = 10; + LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -110,6 +119,7 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 8; LIBC_INLINE_VAR static constexpr int SIG_LEN = 23; + LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -117,6 +127,7 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 11; LIBC_INLINE_VAR static constexpr int SIG_LEN = 52; + LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -124,6 +135,7 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 15; LIBC_INLINE_VAR static constexpr int SIG_LEN = 112; + LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -131,23 +143,22 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 15; LIBC_INLINE_VAR static constexpr int SIG_LEN = 64; + LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN - 1; }; -} // namespace internal - -// FPRepBase derives useful constants from the FPLayout. -template -struct FPRepBase : public internal::FPLayout { -private: - using UP = internal::FPLayout; +// FPStorage derives useful constants from the FPLayout above. +template struct FPStorage : public FPLayout { + using UP = FPLayout; -public: using UP::EXP_LEN; // The number of bits for the *exponent* part using UP::SIG_LEN; // The number of bits for the *significand* part using UP::SIGN_LEN; // The number of bits for the *sign* part // For convenience, the sum of `SIG_LEN`, `EXP_LEN`, and `SIGN_LEN`. LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + EXP_LEN + SIG_LEN; + // The number of bits after the decimal dot when the number is in normal form. + using UP::FRACTION_LEN; + // An unsigned integer that is wide enough to contain all of the floating // point bits. using StorageType = typename UP::StorageType; @@ -162,41 +173,30 @@ struct FPRepBase : public internal::FPLayout { (1U << (EXP_LEN - 1U)) - 1U; static_assert(EXP_BIAS > 0); -protected: - // The shift amount to get the *significand* part to the least significant - // bit. Always `0` but kept for consistency. - LIBC_INLINE_VAR static constexpr int SIG_MASK_SHIFT = 0; - // The shift amount to get the *exponent* part to the least significant bit. - LIBC_INLINE_VAR static constexpr int EXP_MASK_SHIFT = SIG_LEN; - // The shift amount to get the *sign* part to the least significant bit. - LIBC_INLINE_VAR static constexpr int SIGN_MASK_SHIFT = SIG_LEN + EXP_LEN; - // The bit pattern that keeps only the *significand* part. LIBC_INLINE_VAR static constexpr StorageType SIG_MASK = - mask_trailing_ones() << SIG_MASK_SHIFT; - -public: + mask_trailing_ones(); // The bit pattern that keeps only the *exponent* part. LIBC_INLINE_VAR static constexpr StorageType EXP_MASK = - mask_trailing_ones() << EXP_MASK_SHIFT; + mask_trailing_ones() << SIG_LEN; // The bit pattern that keeps only the *sign* part. 
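+  // (Illustrative values: for binary32 these masks work out to
+  // SIG_MASK == 0x007FFFFF, EXP_MASK == 0x7F800000 and SIGN_MASK == 0x80000000.)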
LIBC_INLINE_VAR static constexpr StorageType SIGN_MASK = - mask_trailing_ones() << SIGN_MASK_SHIFT; + mask_trailing_ones() << (EXP_LEN + SIG_LEN); // The bit pattern that keeps only the *exponent + significand* part. LIBC_INLINE_VAR static constexpr StorageType EXP_SIG_MASK = mask_trailing_ones(); // The bit pattern that keeps only the *sign + exponent + significand* part. LIBC_INLINE_VAR static constexpr StorageType FP_MASK = mask_trailing_ones(); + // The bit pattern that keeps only the *fraction* part. + // i.e., the *significand* without the leading one. + LIBC_INLINE_VAR static constexpr StorageType FRACTION_MASK = + mask_trailing_ones(); static_assert((SIG_MASK & EXP_MASK & SIGN_MASK) == 0, "masks disjoint"); static_assert((SIG_MASK | EXP_MASK | SIGN_MASK) == FP_MASK, "masks cover"); protected: - LIBC_INLINE static constexpr StorageType bit_at(int position) { - return StorageType(1) << position; - } - // A strongly typed integer that prevents mixing and matching integers with // different semantics. template struct TypedInt { @@ -248,7 +248,7 @@ struct FPRepBase : public internal::FPLayout { // An opaque type to store a floating point significand. // We define special values but it is valid to create arbitrary values as long - // as they are in the range [BITS_ALL_ZEROES, BITS_ALL_ONES]. + // as they are in the range [ZERO, BITS_ALL_ONES]. // Note that the semantics of the Significand are implementation dependent. // Values greater than BITS_ALL_ONES are truncated. struct Significand : public TypedInt { @@ -277,10 +277,8 @@ struct FPRepBase : public internal::FPLayout { return Significand(StorageType(1)); } LIBC_INLINE static constexpr auto MSB() { - return Significand(StorageType(bit_at(SIG_LEN - 1))); + return Significand(StorageType(1) << (SIG_LEN - 1)); } - // Aliases - LIBC_INLINE static constexpr auto BITS_ALL_ZEROES() { return ZERO(); } LIBC_INLINE static constexpr auto BITS_ALL_ONES() { return Significand(SIG_MASK); } @@ -306,181 +304,95 @@ struct FPRepBase : public internal::FPLayout { return encode(exp, sig); } + // The floating point number representation as an unsigned integer. + StorageType bits{}; + + LIBC_INLINE constexpr FPStorage() : bits(0) {} + LIBC_INLINE constexpr FPStorage(StorageType value) : bits(value) {} + + // Observers LIBC_INLINE constexpr StorageType exp_bits() const { return bits & EXP_MASK; } LIBC_INLINE constexpr StorageType sig_bits() const { return bits & SIG_MASK; } LIBC_INLINE constexpr StorageType exp_sig_bits() const { return bits & EXP_SIG_MASK; } - -private: - // Merge bits from 'a' and 'b' values according to 'mask'. - // Use 'a' bits when corresponding 'mask' bits are zeroes and 'b' bits when - // corresponding bits are ones. - LIBC_INLINE static constexpr StorageType merge(StorageType a, StorageType b, - StorageType mask) { - // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge - return a ^ ((a ^ b) & mask); - } - -protected: - // The number of bits after the decimal dot when the number is in normal form. - LIBC_INLINE_VAR static constexpr int FRACTION_LEN = - fp_type == FPType::X86_Binary80 ? SIG_LEN - 1 : SIG_LEN; - LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION = - FRACTION_LEN + 1; - LIBC_INLINE_VAR static constexpr StorageType FRACTION_MASK = - mask_trailing_ones(); - - // The floating point number representation as an unsigned integer. - StorageType bits = 0; - -public: - LIBC_INLINE constexpr Sign sign() const { - return (bits & SIGN_MASK) ?
Sign::NEG : Sign::POS; - } - - LIBC_INLINE constexpr void set_sign(Sign signVal) { - if (sign() != signVal) - bits ^= SIGN_MASK; - } - - LIBC_INLINE constexpr StorageType get_mantissa() const { - return bits & FRACTION_MASK; - } - - LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) { - bits = merge(bits, mantVal, FRACTION_MASK); - } - - LIBC_INLINE constexpr uint16_t get_biased_exponent() const { - return uint16_t((bits & EXP_MASK) >> EXP_MASK_SHIFT); - } - - LIBC_INLINE constexpr void set_biased_exponent(StorageType biased) { - bits = merge(bits, biased << EXP_MASK_SHIFT, EXP_MASK); - } - - LIBC_INLINE constexpr int get_exponent() const { - return int(get_biased_exponent()) - EXP_BIAS; - } - - // If the number is subnormal, the exponent is treated as if it were the - // minimum exponent for a normal number. This is to keep continuity between - // the normal and subnormal ranges, but it causes problems for functions where - // values are calculated from the exponent, since just subtracting the bias - // will give a slightly incorrect result. Additionally, zero has an exponent - // of zero, and that should actually be treated as zero. - LIBC_INLINE constexpr int get_explicit_exponent() const { - const int biased_exp = int(get_biased_exponent()); - if (is_zero()) { - return 0; - } else if (biased_exp == 0) { - return 1 - EXP_BIAS; - } else { - return biased_exp - EXP_BIAS; - } - } - - LIBC_INLINE constexpr StorageType uintval() const { return bits & FP_MASK; } - LIBC_INLINE constexpr void set_uintval(StorageType value) { - bits = (value & FP_MASK); - } - - LIBC_INLINE constexpr bool is_zero() const { return exp_sig_bits() == 0; } - - LIBC_INLINE - constexpr bool is_subnormal() const { - return exp_bits() == encode(BiasedExponent::BITS_ALL_ZEROES()); - } - - LIBC_INLINE constexpr bool is_neg() const { return sign().is_neg(); } - LIBC_INLINE constexpr bool is_pos() const { return sign().is_pos(); } }; -namespace internal { - -// Manipulates the representation of a floating point number defined by its -// FPType. This layer is architecture agnostic and does not handle C++ floating -// point types directly ('float', 'double' and 'long double'). Use the FPBits -// below if needed. -// -// TODO: Specialize this class for FPType::X86_Binary80 and remove ad-hoc logic -// from FPRepBase. -template struct FPRep : public FPRepBase { - using UP = FPRepBase; +// This layer defines all functions that are specific to how the floating +// point type is encoded. It enables construction, modification and observation +// of values manipulated as 'StorageType'.
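+// For example, in the generic IEEE-754 implementation below, 'inf()' is
+// encoded as an all-ones biased exponent with a zero significand, and
+// 'build_quiet_nan()' additionally sets the most significant significand bit.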
+template +struct FPRepSem : public FPStorage { + using UP = FPStorage; using typename UP::StorageType; using UP::FRACTION_LEN; using UP::FRACTION_MASK; - using UP::MANTISSA_PRECISION; protected: - using typename UP::BiasedExponent; - using typename UP::Exponent; - using typename UP::Significand; + using BiasedExp = typename UP::BiasedExponent; + using Exp = typename UP::Exponent; + using Sig = typename UP::Significand; using UP::encode; using UP::exp_bits; using UP::exp_sig_bits; using UP::sig_bits; + using UP::UP; public: - LIBC_INLINE constexpr bool is_nan() const { - return exp_sig_bits() > - encode(BiasedExponent::BITS_ALL_ONES(), Significand::ZERO()); + // Builders + LIBC_INLINE static constexpr RetT one(Sign sign = Sign::POS) { + return RetT(encode(sign, Exp::ZERO(), Sig::ZERO())); } - LIBC_INLINE constexpr bool is_quiet_nan() const { - return exp_sig_bits() >= - encode(BiasedExponent::BITS_ALL_ONES(), Significand::MSB()); + LIBC_INLINE static constexpr RetT min_subnormal(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::LSB())); } - LIBC_INLINE constexpr bool is_signaling_nan() const { - return is_nan() && !is_quiet_nan(); + LIBC_INLINE static constexpr RetT max_subnormal(Sign sign = Sign::POS) { + return RetT( + encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::BITS_ALL_ONES())); } - LIBC_INLINE constexpr bool is_inf() const { - return exp_sig_bits() == - encode(BiasedExponent::BITS_ALL_ONES(), Significand::ZERO()); + LIBC_INLINE static constexpr RetT min_normal(Sign sign = Sign::POS) { + return RetT(encode(sign, Exp::MIN(), Sig::ZERO())); } - LIBC_INLINE constexpr bool is_finite() const { - return exp_bits() != encode(BiasedExponent::BITS_ALL_ONES()); + LIBC_INLINE static constexpr RetT max_normal(Sign sign = Sign::POS) { + return RetT(encode(sign, Exp::MAX(), Sig::BITS_ALL_ONES())); } - LIBC_INLINE constexpr bool is_normal() const { - return is_finite() && !UP::is_subnormal(); + LIBC_INLINE static constexpr RetT inf(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), Sig::ZERO())); } - - LIBC_INLINE static constexpr StorageType zero(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::ZERO()); + LIBC_INLINE static constexpr RetT build_nan(Sign sign = Sign::POS, + StorageType v = 0) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), + (v ? 
Sig(v) : (Sig::MSB() >> 1)))); } - LIBC_INLINE static constexpr StorageType one(Sign sign = Sign::POS) { - return encode(sign, Exponent::ZERO(), Significand::ZERO()); + LIBC_INLINE static constexpr RetT build_quiet_nan(Sign sign = Sign::POS, + StorageType v = 0) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), Sig::MSB() | Sig(v))); } - LIBC_INLINE static constexpr StorageType - min_subnormal(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::LSB()); + + // Observers + LIBC_INLINE constexpr bool is_nan() const { + return exp_sig_bits() > encode(BiasedExp::BITS_ALL_ONES(), Sig::ZERO()); } - LIBC_INLINE static constexpr StorageType - max_subnormal(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), - Significand::BITS_ALL_ONES()); + LIBC_INLINE constexpr bool is_quiet_nan() const { + return exp_sig_bits() >= encode(BiasedExp::BITS_ALL_ONES(), Sig::MSB()); } - LIBC_INLINE static constexpr StorageType min_normal(Sign sign = Sign::POS) { - return encode(sign, Exponent::MIN(), Significand::ZERO()); + LIBC_INLINE constexpr bool is_signaling_nan() const { + return is_nan() && !is_quiet_nan(); } - LIBC_INLINE static constexpr StorageType max_normal(Sign sign = Sign::POS) { - return encode(sign, Exponent::MAX(), Significand::BITS_ALL_ONES()); + LIBC_INLINE constexpr bool is_inf() const { + return exp_sig_bits() == encode(BiasedExp::BITS_ALL_ONES(), Sig::ZERO()); } - LIBC_INLINE static constexpr StorageType inf(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), Significand::ZERO()); + LIBC_INLINE constexpr bool is_finite() const { + return exp_bits() != encode(BiasedExp::BITS_ALL_ONES()); } - LIBC_INLINE static constexpr StorageType build_nan(Sign sign = Sign::POS, - StorageType v = 0) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), - (v ? Significand(v) : (Significand::MSB() >> 1))); + LIBC_INLINE + constexpr bool is_subnormal() const { + return exp_bits() == encode(BiasedExp::BITS_ALL_ZEROES()); } - LIBC_INLINE static constexpr StorageType - build_quiet_nan(Sign sign = Sign::POS, StorageType v = 0) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), - Significand::MSB() | Significand(v)); + LIBC_INLINE constexpr bool is_normal() const { + return is_finite() && !UP::is_subnormal(); } - - // The function return mantissa with the implicit bit set iff the current + // Returns the mantissa with the implicit bit set iff the current // value is a valid normal number. LIBC_INLINE constexpr StorageType get_explicit_mantissa() { if (UP::is_subnormal()) @@ -490,20 +402,14 @@ template struct FPRep : public FPRepBase { }; // Specialization for the X86 Extended Precision type. -template <> -struct FPRep : public FPRepBase { - using UP = FPRepBase; +template +struct FPRepSem + : public FPStorage { + using UP = FPStorage; using typename UP::StorageType; using UP::FRACTION_LEN; using UP::FRACTION_MASK; - using UP::MANTISSA_PRECISION; - -protected: - using typename UP::BiasedExponent; - using typename UP::Significand; - using UP::encode; -public: // The x86 80 bit float represents the leading digit of the mantissa // explicitly. This is the mask for that bit. 
static constexpr StorageType EXPLICIT_BIT_MASK = StorageType(1) @@ -515,6 +421,45 @@ struct FPRep : public FPRepBase { "the explicit bit and the fractional part should cover the " "whole significand"); +protected: + using BiasedExp = typename UP::BiasedExponent; + using Sig = typename UP::Significand; + using UP::encode; + using UP::UP; + +public: + // Builders + LIBC_INLINE static constexpr RetT one(Sign sign = Sign::POS) { + return RetT(encode(sign, Exponent::ZERO(), Sig::MSB())); + } + LIBC_INLINE static constexpr RetT min_subnormal(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::LSB())); + } + LIBC_INLINE static constexpr RetT max_subnormal(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), + Sig::BITS_ALL_ONES() ^ Sig::MSB())); + } + LIBC_INLINE static constexpr RetT min_normal(Sign sign = Sign::POS) { + return RetT(encode(sign, Exponent::MIN(), Sig::MSB())); + } + LIBC_INLINE static constexpr RetT max_normal(Sign sign = Sign::POS) { + return RetT(encode(sign, Exponent::MAX(), Sig::BITS_ALL_ONES())); + } + LIBC_INLINE static constexpr RetT inf(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), Sig::MSB())); + } + LIBC_INLINE static constexpr RetT build_nan(Sign sign = Sign::POS, + StorageType v = 0) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), + Sig::MSB() | (v ? Sig(v) : (Sig::MSB() >> 2)))); + } + LIBC_INLINE static constexpr RetT build_quiet_nan(Sign sign = Sign::POS, + StorageType v = 0) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), + Sig::MSB() | (Sig::MSB() >> 1) | Sig(v))); + } + + // Observers LIBC_INLINE constexpr bool is_nan() const { // Most encoding forms from the table found in // https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format @@ -527,85 +472,183 @@ struct FPRep : public FPRepBase { // - Quiet Not a Number // - Unnormal // This can be reduced to the following logic: - if (exp_bits() == encode(BiasedExponent::BITS_ALL_ONES())) + if (exp_bits() == encode(BiasedExp::BITS_ALL_ONES())) return !is_inf(); - if (exp_bits() != encode(BiasedExponent::BITS_ALL_ZEROES())) - return (sig_bits() & encode(Significand::MSB())) == 0; + if (exp_bits() != encode(BiasedExp::BITS_ALL_ZEROES())) + return (sig_bits() & encode(Sig::MSB())) == 0; return false; } LIBC_INLINE constexpr bool is_quiet_nan() const { return exp_sig_bits() >= - encode(BiasedExponent::BITS_ALL_ONES(), - Significand::MSB() | (Significand::MSB() >> 1)); + encode(BiasedExp::BITS_ALL_ONES(), Sig::MSB() | (Sig::MSB() >> 1)); } LIBC_INLINE constexpr bool is_signaling_nan() const { return is_nan() && !is_quiet_nan(); } LIBC_INLINE constexpr bool is_inf() const { - return exp_sig_bits() == - encode(BiasedExponent::BITS_ALL_ONES(), Significand::MSB()); + return exp_sig_bits() == encode(BiasedExp::BITS_ALL_ONES(), Sig::MSB()); } LIBC_INLINE constexpr bool is_finite() const { return !is_inf() && !is_nan(); } + LIBC_INLINE + constexpr bool is_subnormal() const { + return exp_bits() == encode(BiasedExp::BITS_ALL_ZEROES()); + } LIBC_INLINE constexpr bool is_normal() const { const auto exp = exp_bits(); - if (exp == encode(BiasedExponent::BITS_ALL_ZEROES()) || - exp == encode(BiasedExponent::BITS_ALL_ONES())) + if (exp == encode(BiasedExp::BITS_ALL_ZEROES()) || + exp == encode(BiasedExp::BITS_ALL_ONES())) return false; return get_implicit_bit(); } + LIBC_INLINE constexpr StorageType get_explicit_mantissa() const { + return sig_bits(); + } + + // This function is specific to FPRepSem.
+ // TODO: Remove if possible. + LIBC_INLINE constexpr bool get_implicit_bit() const { + return static_cast(bits & EXPLICIT_BIT_MASK); + } + + // This function is specific to FPRepSem. + // TODO: Remove if possible. + LIBC_INLINE constexpr void set_implicit_bit(bool implicitVal) { + if (get_implicit_bit() != implicitVal) + bits ^= EXPLICIT_BIT_MASK; + } +}; + +// 'FPRep' is the bottom of the class hierarchy that only deals with 'FPType'. +// The operations dealing with specific float semantics are implemented by +// 'FPRepSem' above and specialized when needed. +// +// The 'RetT' type is being propagated up to 'FPRepSem' so that the functions +// creating new values (Builders) can return the appropriate type. That is, when +// creating a value through 'FPBits' below the builder will return an 'FPBits' +// value: +// i.e., FPBits::zero() // returns an FPBits +// When we don't care about specific C++ floating point type we can use 'FPRep' +// directly and 'RetT' defaults to 'StorageType': +// i.e., FPRep::zero() // returns an 'uint32_t' +template ::StorageType> +struct FPRep : public FPRepSem { + using UP = FPRepSem; + using StorageType = typename UP::StorageType; + +protected: + using UP::bits; + using UP::encode; + using UP::exp_bits; + using UP::exp_sig_bits; + + using BiasedExp = typename UP::BiasedExponent; + using Sig = typename UP::Significand; - LIBC_INLINE static constexpr StorageType zero(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::ZERO()); + using UP::FP_MASK; + using UP::SIG_LEN; + +public: + using UP::EXP_BIAS; + using UP::EXP_MASK; + using UP::FRACTION_MASK; + using UP::SIGN_MASK; + + // Representation + LIBC_INLINE constexpr StorageType uintval() const { return bits & FP_MASK; } + LIBC_INLINE constexpr void set_uintval(StorageType value) { + bits = (value & FP_MASK); } - LIBC_INLINE static constexpr StorageType one(Sign sign = Sign::POS) { - return encode(sign, Exponent::ZERO(), Significand::MSB()); + + // Builders + LIBC_INLINE static constexpr RetT zero(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::ZERO())); } - LIBC_INLINE static constexpr StorageType - min_subnormal(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::LSB()); + using UP::build_nan; + using UP::build_quiet_nan; + using UP::inf; + using UP::max_normal; + using UP::max_subnormal; + using UP::min_normal; + using UP::min_subnormal; + using UP::one; + + // Modifiers + LIBC_INLINE constexpr RetT abs() const { + return RetT(bits & UP::EXP_SIG_MASK); } - LIBC_INLINE static constexpr StorageType - max_subnormal(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), - Significand::BITS_ALL_ONES() ^ Significand::MSB()); + + // Observers + using UP::get_explicit_mantissa; + LIBC_INLINE constexpr bool is_zero() const { return exp_sig_bits() == 0; } + LIBC_INLINE constexpr bool is_inf_or_nan() const { return !is_finite(); } + using UP::is_finite; + using UP::is_inf; + using UP::is_nan; + using UP::is_normal; + using UP::is_quiet_nan; + using UP::is_signaling_nan; + using UP::is_subnormal; + LIBC_INLINE constexpr bool is_neg() const { return sign().is_neg(); } + LIBC_INLINE constexpr bool is_pos() const { return sign().is_pos(); } + + // Parts + LIBC_INLINE constexpr Sign sign() const { + return (bits & SIGN_MASK) ?
Sign::NEG : Sign::POS; } - LIBC_INLINE static constexpr StorageType min_normal(Sign sign = Sign::POS) { - return encode(sign, Exponent::MIN(), Significand::MSB()); + + LIBC_INLINE constexpr void set_sign(Sign signVal) { + if (sign() != signVal) + bits ^= SIGN_MASK; } - LIBC_INLINE static constexpr StorageType max_normal(Sign sign = Sign::POS) { - return encode(sign, Exponent::MAX(), Significand::BITS_ALL_ONES()); + + LIBC_INLINE constexpr uint16_t get_biased_exponent() const { + return uint16_t((bits & UP::EXP_MASK) >> UP::SIG_LEN); } - LIBC_INLINE static constexpr StorageType inf(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), Significand::MSB()); + + LIBC_INLINE constexpr void set_biased_exponent(StorageType biased) { + bits = merge(bits, biased << SIG_LEN, EXP_MASK); } - LIBC_INLINE static constexpr StorageType build_nan(Sign sign = Sign::POS, - StorageType v = 0) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), - Significand::MSB() | - (v ? Significand(v) : (Significand::MSB() >> 2))); + + LIBC_INLINE constexpr int get_exponent() const { + return int(get_biased_exponent()) - EXP_BIAS; } - LIBC_INLINE static constexpr StorageType - build_quiet_nan(Sign sign = Sign::POS, StorageType v = 0) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), - Significand::MSB() | (Significand::MSB() >> 1) | - Significand(v)); + + // If the number is subnormal, the exponent is treated as if it were the + // minimum exponent for a normal number. This is to keep continuity between + // the normal and subnormal ranges, but it causes problems for functions where + // values are calculated from the exponent, since just subtracting the bias + // will give a slightly incorrect result. Additionally, zero has an exponent + // of zero, and that should actually be treated as zero. + LIBC_INLINE constexpr int get_explicit_exponent() const { + const int biased_exp = int(get_biased_exponent()); + if (is_zero()) { + return 0; + } else if (biased_exp == 0) { + return 1 - EXP_BIAS; + } else { + return biased_exp - EXP_BIAS; + } } - LIBC_INLINE constexpr StorageType get_explicit_mantissa() const { - return sig_bits(); + LIBC_INLINE constexpr StorageType get_mantissa() const { + return bits & FRACTION_MASK; } - // The following functions are specific to FPRep. - // TODO: Remove if possible. - LIBC_INLINE constexpr bool get_implicit_bit() const { - return static_cast(bits & EXPLICIT_BIT_MASK); + LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) { + bits = merge(bits, mantVal, FRACTION_MASK); } - LIBC_INLINE constexpr void set_implicit_bit(bool implicitVal) { - if (get_implicit_bit() != implicitVal) - bits ^= EXPLICIT_BIT_MASK; +private: + // Merge bits from 'a' and 'b' values according to 'mask'. + // Use 'a' bits when corresponding 'mask' bits are zeroes and 'b' bits when + // corresponding bits are ones. + LIBC_INLINE static constexpr StorageType merge(StorageType a, StorageType b, + StorageType mask) { + // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge + return a ^ ((a ^ b) & mask); } }; @@ -642,29 +685,31 @@ template LIBC_INLINE static constexpr FPType get_fp_type() { static_assert(cpp::always_false, "Unsupported type"); } -// A generic class to represent floating point formats. -// On most platforms, the 'float' type corresponds to single precision -// floating point numbers, the 'double' type corresponds to double precision -// floating point numers, and the 'long double' type corresponds to the quad -// precision floating numbers. 
On x86 platforms however, the 'long double' -// type maps to an x87 floating point format. -template struct FPBits : public internal::FPRep()> { +// A generic class to manipulate floating point formats. +// It derives most of its functionality from FPRep above. +template +struct FPBits final : public internal::FPRep(), FPBits> { static_assert(cpp::is_floating_point_v, "FPBits instantiated with invalid type."); - using UP = internal::FPRep()>; + using UP = internal::FPRep(), FPBits>; using Rep = UP; using StorageType = typename UP::StorageType; using UP::bits; - using UP::EXP_LEN; - using UP::UP; // Constants. - static constexpr int MAX_BIASED_EXPONENT = (1 << EXP_LEN) - 1; - static constexpr StorageType MIN_NORMAL = UP::min_normal(Sign::POS); - static constexpr StorageType MAX_NORMAL = UP::max_normal(Sign::POS); - static constexpr StorageType MIN_SUBNORMAL = UP::min_subnormal(Sign::POS); - static constexpr StorageType MAX_SUBNORMAL = UP::max_subnormal(Sign::POS); + LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION = + UP::FRACTION_LEN + 1; + LIBC_INLINE_VAR static constexpr StorageType MIN_NORMAL = + UP::min_normal(Sign::POS).uintval(); + LIBC_INLINE_VAR static constexpr StorageType MAX_NORMAL = + UP::max_normal(Sign::POS).uintval(); + LIBC_INLINE_VAR static constexpr StorageType MIN_SUBNORMAL = + UP::min_subnormal(Sign::POS).uintval(); + LIBC_INLINE_VAR static constexpr StorageType MAX_SUBNORMAL = + UP::max_subnormal(Sign::POS).uintval(); + LIBC_INLINE_VAR static constexpr int MAX_BIASED_EXPONENT = + (1 << UP::EXP_LEN) - 1; // Constructors. LIBC_INLINE constexpr FPBits() = default; @@ -686,49 +731,35 @@ template struct FPBits : public internal::FPRep()> { LIBC_INLINE constexpr explicit operator T() const { return get_val(); } - LIBC_INLINE constexpr bool is_inf_or_nan() const { return !UP::is_finite(); } - - LIBC_INLINE constexpr FPBits abs() const { - return FPBits(bits & UP::EXP_SIG_MASK); - } - // Methods below this are used by tests. - + // TODO: inline and remove. LIBC_INLINE static constexpr T one(Sign sign = Sign::POS) { - return FPBits(UP::one(sign)).get_val(); + return T(UP::one(sign)); } - LIBC_INLINE static constexpr T zero(Sign sign = Sign::POS) { - return FPBits(UP::zero(sign)).get_val(); + return T(UP::zero(sign)); } - LIBC_INLINE static constexpr T inf(Sign sign = Sign::POS) { - return FPBits(UP::inf(sign)).get_val(); + return T(UP::inf(sign)); } - LIBC_INLINE static constexpr T min_normal() { - return FPBits(UP::min_normal(Sign::POS)).get_val(); + return T(UP::min_normal(Sign::POS)); } - LIBC_INLINE static constexpr T max_normal() { - return FPBits(UP::max_normal(Sign::POS)).get_val(); + return T(UP::max_normal(Sign::POS)); } - LIBC_INLINE static constexpr T min_denormal() { - return FPBits(UP::min_subnormal(Sign::POS)).get_val(); + return T(UP::min_subnormal(Sign::POS)); } - LIBC_INLINE static constexpr T max_denormal() { - return FPBits(UP::max_subnormal(Sign::POS)).get_val(); + return T(UP::max_subnormal(Sign::POS)); } - LIBC_INLINE static constexpr T build_nan(StorageType v) { - return FPBits(UP::build_nan(Sign::POS, v)).get_val(); + return T(UP::build_nan(Sign::POS, v)); } - LIBC_INLINE static constexpr T build_quiet_nan(StorageType v, Sign sign = Sign::POS) { - return FPBits(UP::build_quiet_nan(sign, v)).get_val(); + return T(UP::build_quiet_nan(sign, v)); } // TODO: Use an uint32_t for 'biased_exp'.
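The merge() helper shown a few hunks up relies on the classic masked-merge identity: a ^ ((a ^ b) & mask) takes the bits of 'b' wherever 'mask' is one and the bits of 'a' elsewhere. A small self-contained check of that identity (illustrative only, not part of the patch):

    #include <cstdint>
    constexpr uint32_t merge(uint32_t a, uint32_t b, uint32_t mask) {
      return a ^ ((a ^ b) & mask); // b's bits under the mask, a's bits outside
    }
    static_assert(merge(0xAAAA0000u, 0x0000BBBBu, 0x0000FFFFu) == 0xAAAABBBBu,
                  "low half from b, high half from a");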
@@ -757,7 +788,7 @@ template struct FPBits : public internal::FPRep()> { "This function is not tested for X86 Extended Precision"); FPBits result; // offset: +1 for sign, but -1 for implicit first bit - int lz = cpp::countl_zero(number) - EXP_LEN; + int lz = cpp::countl_zero(number) - UP::EXP_LEN; number <<= lz; ep -= lz; diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp index f0b155085dabf..e6b6d7d9ec780 100644 --- a/libc/test/src/__support/FPUtil/fpbits_test.cpp +++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp @@ -17,69 +17,72 @@ TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary16) { using LIBC_NAMESPACE::fputil::FPType; using LIBC_NAMESPACE::fputil::internal::FPRep; using Rep = FPRep; - using u16 = uint16_t; - - EXPECT_EQ(u16(0b0'00000'0000000000), Rep::zero()); - EXPECT_EQ(u16(0b0'01111'0000000000), Rep::one()); - EXPECT_EQ(u16(0b0'00000'0000000001), Rep::min_subnormal()); - EXPECT_EQ(u16(0b0'00000'1111111111), Rep::max_subnormal()); - EXPECT_EQ(u16(0b0'00001'0000000000), Rep::min_normal()); - EXPECT_EQ(u16(0b0'11110'1111111111), Rep::max_normal()); - EXPECT_EQ(u16(0b0'11111'0000000000), Rep::inf()); - EXPECT_EQ(u16(0b0'11111'0100000000), Rep::build_nan()); - EXPECT_EQ(u16(0b0'11111'1000000000), Rep::build_quiet_nan()); + using u16 = typename Rep::StorageType; + + EXPECT_EQ(u16(0b0'00000'0000000000), u16(Rep::zero())); + EXPECT_EQ(u16(0b0'01111'0000000000), u16(Rep::one())); + EXPECT_EQ(u16(0b0'00000'0000000001), u16(Rep::min_subnormal())); + EXPECT_EQ(u16(0b0'00000'1111111111), u16(Rep::max_subnormal())); + EXPECT_EQ(u16(0b0'00001'0000000000), u16(Rep::min_normal())); + EXPECT_EQ(u16(0b0'11110'1111111111), u16(Rep::max_normal())); + EXPECT_EQ(u16(0b0'11111'0000000000), u16(Rep::inf())); + EXPECT_EQ(u16(0b0'11111'0100000000), u16(Rep::build_nan())); + EXPECT_EQ(u16(0b0'11111'1000000000), u16(Rep::build_quiet_nan())); } TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary32) { using LIBC_NAMESPACE::fputil::FPType; using LIBC_NAMESPACE::fputil::internal::FPRep; using Rep = FPRep; - using u32 = uint32_t; - - EXPECT_EQ(u32(0b0'00000000'00000000000000000000000), Rep::zero()); - EXPECT_EQ(u32(0b0'01111111'00000000000000000000000), Rep::one()); - EXPECT_EQ(u32(0b0'00000000'00000000000000000000001), Rep::min_subnormal()); - EXPECT_EQ(u32(0b0'00000000'11111111111111111111111), Rep::max_subnormal()); - EXPECT_EQ(u32(0b0'00000001'00000000000000000000000), Rep::min_normal()); - EXPECT_EQ(u32(0b0'11111110'11111111111111111111111), Rep::max_normal()); - EXPECT_EQ(u32(0b0'11111111'00000000000000000000000), Rep::inf()); - EXPECT_EQ(u32(0b0'11111111'01000000000000000000000), Rep::build_nan()); - EXPECT_EQ(u32(0b0'11111111'10000000000000000000000), Rep::build_quiet_nan()); + using u32 = typename Rep::StorageType; + + EXPECT_EQ(u32(0b0'00000000'00000000000000000000000), u32(Rep::zero())); + EXPECT_EQ(u32(0b0'01111111'00000000000000000000000), u32(Rep::one())); + EXPECT_EQ(u32(0b0'00000000'00000000000000000000001), + u32(Rep::min_subnormal())); + EXPECT_EQ(u32(0b0'00000000'11111111111111111111111), + u32(Rep::max_subnormal())); + EXPECT_EQ(u32(0b0'00000001'00000000000000000000000), u32(Rep::min_normal())); + EXPECT_EQ(u32(0b0'11111110'11111111111111111111111), u32(Rep::max_normal())); + EXPECT_EQ(u32(0b0'11111111'00000000000000000000000), u32(Rep::inf())); + EXPECT_EQ(u32(0b0'11111111'01000000000000000000000), u32(Rep::build_nan())); + EXPECT_EQ(u32(0b0'11111111'10000000000000000000000), + u32(Rep::build_quiet_nan())); } TEST(LlvmLibcFPBitsTest, 
FPType_IEEE754_Binary64) { using LIBC_NAMESPACE::fputil::FPType; using LIBC_NAMESPACE::fputil::internal::FPRep; using Rep = FPRep; - using u64 = uint64_t; + using u64 = typename Rep::StorageType; EXPECT_EQ( u64(0b0'00000000000'0000000000000000000000000000000000000000000000000000), - Rep::zero()); + u64(Rep::zero())); EXPECT_EQ( u64(0b0'01111111111'0000000000000000000000000000000000000000000000000000), - Rep::one()); + u64(Rep::one())); EXPECT_EQ( u64(0b0'00000000000'0000000000000000000000000000000000000000000000000001), - Rep::min_subnormal()); + u64(Rep::min_subnormal())); EXPECT_EQ( u64(0b0'00000000000'1111111111111111111111111111111111111111111111111111), - Rep::max_subnormal()); + u64(Rep::max_subnormal())); EXPECT_EQ( u64(0b0'00000000001'0000000000000000000000000000000000000000000000000000), - Rep::min_normal()); + u64(Rep::min_normal())); EXPECT_EQ( u64(0b0'11111111110'1111111111111111111111111111111111111111111111111111), - Rep::max_normal()); + u64(Rep::max_normal())); EXPECT_EQ( u64(0b0'11111111111'0000000000000000000000000000000000000000000000000000), - Rep::inf()); + u64(Rep::inf())); EXPECT_EQ( u64(0b0'11111111111'0100000000000000000000000000000000000000000000000000), - Rep::build_nan()); + u64(Rep::build_nan())); EXPECT_EQ( u64(0b0'11111111111'1000000000000000000000000000000000000000000000000000), - Rep::build_quiet_nan()); + u64(Rep::build_quiet_nan())); } static constexpr UInt128 u128(uint64_t hi, uint64_t lo) { @@ -90,6 +93,49 @@ static constexpr UInt128 u128(uint64_t hi, uint64_t lo) { #endif } +TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary128) { + using LIBC_NAMESPACE::fputil::FPType; + using LIBC_NAMESPACE::fputil::internal::FPRep; + using Rep = FPRep; + + EXPECT_EQ( + u128(0b0'000000000000000'000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::zero())); + EXPECT_EQ( + u128(0b0'011111111111111'000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::one())); + EXPECT_EQ( + u128(0b0'000000000000000'000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000001), + UInt128(Rep::min_subnormal())); + EXPECT_EQ( + u128(0b0'000000000000000'111111111111111111111111111111111111111111111111, + 0b1111111111111111111111111111111111111111111111111111111111111111), + UInt128(Rep::max_subnormal())); + EXPECT_EQ( + u128(0b0'000000000000001'000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::min_normal())); + EXPECT_EQ( + u128(0b0'111111111111110'111111111111111111111111111111111111111111111111, + 0b1111111111111111111111111111111111111111111111111111111111111111), + UInt128(Rep::max_normal())); + EXPECT_EQ( + u128(0b0'111111111111111'000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::inf())); + EXPECT_EQ( + u128(0b0'111111111111111'010000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::build_nan())); + EXPECT_EQ( + u128(0b0'111111111111111'100000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::build_quiet_nan())); +} + TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80) { using LIBC_NAMESPACE::fputil::FPType; using 
LIBC_NAMESPACE::fputil::internal::FPRep; @@ -98,39 +144,39 @@ TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80) { EXPECT_EQ( u128(0b0'000000000000000, 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::zero()); + UInt128(Rep::zero())); EXPECT_EQ( u128(0b0'011111111111111, 0b1000000000000000000000000000000000000000000000000000000000000000), - Rep::one()); + UInt128(Rep::one())); EXPECT_EQ( u128(0b0'000000000000000, 0b0000000000000000000000000000000000000000000000000000000000000001), - Rep::min_subnormal()); + UInt128(Rep::min_subnormal())); EXPECT_EQ( u128(0b0'000000000000000, 0b0111111111111111111111111111111111111111111111111111111111111111), - Rep::max_subnormal()); + UInt128(Rep::max_subnormal())); EXPECT_EQ( u128(0b0'000000000000001, 0b1000000000000000000000000000000000000000000000000000000000000000), - Rep::min_normal()); + UInt128(Rep::min_normal())); EXPECT_EQ( u128(0b0'111111111111110, 0b1111111111111111111111111111111111111111111111111111111111111111), - Rep::max_normal()); + UInt128(Rep::max_normal())); EXPECT_EQ( u128(0b0'111111111111111, 0b1000000000000000000000000000000000000000000000000000000000000000), - Rep::inf()); + UInt128(Rep::inf())); EXPECT_EQ( u128(0b0'111111111111111, 0b1010000000000000000000000000000000000000000000000000000000000000), - Rep::build_nan()); + UInt128(Rep::build_nan())); EXPECT_EQ( u128(0b0'111111111111111, 0b1100000000000000000000000000000000000000000000000000000000000000), - Rep::build_quiet_nan()); + UInt128(Rep::build_quiet_nan())); } TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80_IsNan) { @@ -183,49 +229,6 @@ TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80_IsNan) { 0b1000000000000000000000000000000000000000000000000000000000000000)); } -TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary128) { - using LIBC_NAMESPACE::fputil::FPType; - using LIBC_NAMESPACE::fputil::internal::FPRep; - using Rep = FPRep; - - EXPECT_EQ( - u128(0b0'000000000000000'000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::zero()); - EXPECT_EQ( - u128(0b0'011111111111111'000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::one()); - EXPECT_EQ( - u128(0b0'000000000000000'000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000001), - Rep::min_subnormal()); - EXPECT_EQ( - u128(0b0'000000000000000'111111111111111111111111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111111111111111), - Rep::max_subnormal()); - EXPECT_EQ( - u128(0b0'000000000000001'000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::min_normal()); - EXPECT_EQ( - u128(0b0'111111111111110'111111111111111111111111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111111111111111), - Rep::max_normal()); - EXPECT_EQ( - u128(0b0'111111111111111'000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::inf()); - EXPECT_EQ( - u128(0b0'111111111111111'010000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::build_nan()); - EXPECT_EQ( - u128(0b0'111111111111111'100000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::build_quiet_nan()); -} - 
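The x86 Extended Precision classification exercised by the IsNan test above reduces to a small decision tree. A hedged recap as a hypothetical helper (not the library's API; 'exp15' is the 15-bit biased exponent, 'int_bit' the explicit leading significand bit, 'frac' the remaining 63 fraction bits), mirroring the reduction spelled out in the is_nan() comments:

    #include <cstdint>
    constexpr bool x86_is_nan(uint16_t exp15, bool int_bit, uint64_t frac) {
      if (exp15 == 0x7FFF)              // exponent all ones:
        return !(int_bit && frac == 0); // everything but +/-inf is a NaN
      if (exp15 != 0)                   // normal exponent range:
        return !int_bit;                // "unnormals" are NaNs
      return false;                     // zero, subnormal, pseudo-denormal
    }
    static_assert(x86_is_nan(0x7FFF, true, 1), "quiet or signaling NaN");
    static_assert(!x86_is_nan(0x7FFF, true, 0), "infinity is not a NaN");
    static_assert(x86_is_nan(0x1234, false, 0), "unnormal");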
TEST(LlvmLibcFPBitsTest, FloatType) { using FloatBits = FPBits; From bf7b8dae0615884816fff54cac08bc691746b1ee Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 22 Jan 2024 15:04:50 +0100 Subject: [PATCH 387/843] Revert "[libc] `FPRep` builders return `FPRep` instead of raw `StorageType`" (#78974) Reverts llvm/llvm-project#78588 --- libc/src/__support/FPUtil/FPBits.h | 593 +++++++++--------- .../test/src/__support/FPUtil/fpbits_test.cpp | 171 +++-- 2 files changed, 365 insertions(+), 399 deletions(-) diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index bc6b19b6ebf10..be700285de828 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -64,46 +64,38 @@ LIBC_INLINE_VAR constexpr Sign Sign::POS = Sign(false); // └─────────▲─────────┘ // │ // ┌─────────┴─────────┐ -// │ FPStorage │ +// │ FPRepBase │ // └─────────▲─────────┘ // │ // ┌────────────┴─────────────┐ // │ │ -// ┌────────┴─────────┐ ┌──────────────┴──────────────────┐ -// │ FPRepSem │ │ FPRepSem │ │ FPRep │ -// └───────────┘ -// │ -// ┌─────┴─────┐ // │ FPBits │ // └───────────┘ // -// - 'FPLayout' defines only a few constants, namely the 'StorageType' and -// length of the sign, the exponent, fraction and significand parts. -// - 'FPStorage' builds more constants on top of those from 'FPLayout' like -// exponent bias and masks. It also holds the bit representation of the -// floating point as a 'StorageType' type and defines tools to assemble or test +// - 'FPLayout' defines only a few constants, namely the 'StorageType' and the +// length of the sign, the exponent and significand parts. +// - 'FPRepBase' builds more constants on top of those from 'FPLayout' like +// exponent bias, shifts and masks. It also defines tools to assemble or test // these parts. -// - 'FPRepSem' defines functions to interact semantically with the floating -// point representation. The default implementation is the one for 'IEEE754', a -// specialization is provided for X86 Extended Precision. -// - 'FPRep' derives from 'FPRepSem' and adds functions that are common to all -// implementations. -// - 'FPBits' exposes all functions from 'FPRep' but operates on the native C++ -// floating point type instead of 'FPType'. +// - 'FPRep' defines functions to interact with the floating point +// representation. The default implementation is the one for 'IEEE754', a +// specialization is provided for X86 Extended Precision that has a different +// encoding. +// - 'FPBits' is templated on the platform floating point types. Contrary to +// 'FPRep', which is platform agnostic, 'FPBits' is architecture dependent. namespace internal { // Defines the layout (sign, exponent, significand) of a floating point type in // memory. It also defines its associated StorageType, i.e., the unsigned // integer type used to manipulate its representation. -// Additionally we provide the fractional part length, i.e., the number of bits -// after the decimal dot when the number is in normal form.
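For reference, the EXP_BIAS formula defined below, (1 << (EXP_LEN - 1)) - 1, yields the familiar bias constants for each layout; a quick illustrative check against the specializations that follow:

    static_assert(((1u << (5 - 1)) - 1) == 15, "binary16 bias");
    static_assert(((1u << (8 - 1)) - 1) == 127, "binary32 bias");
    static_assert(((1u << (11 - 1)) - 1) == 1023, "binary64 bias");
    static_assert(((1u << (15 - 1)) - 1) == 16383, "binary128 and x86-80 bias");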
template struct FPLayout {}; template <> struct FPLayout { @@ -111,7 +103,6 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 5; LIBC_INLINE_VAR static constexpr int SIG_LEN = 10; - LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -119,7 +110,6 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 8; LIBC_INLINE_VAR static constexpr int SIG_LEN = 23; - LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -127,7 +117,6 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 11; LIBC_INLINE_VAR static constexpr int SIG_LEN = 52; - LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -135,7 +124,6 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 15; LIBC_INLINE_VAR static constexpr int SIG_LEN = 112; - LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -143,22 +131,23 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 15; LIBC_INLINE_VAR static constexpr int SIG_LEN = 64; - LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN - 1; }; -// FPStorage derives useful constants from the FPLayout above. -template struct FPStorage : public FPLayout { - using UP = FPLayout; +} // namespace internal + +// FPRepBase derives useful constants from the FPLayout. +template +struct FPRepBase : public internal::FPLayout { +private: + using UP = internal::FPLayout; +public: using UP::EXP_LEN; // The number of bits for the *exponent* part using UP::SIG_LEN; // The number of bits for the *significand* part using UP::SIGN_LEN; // The number of bits for the *sign* part // For convenience, the sum of `SIG_LEN`, `EXP_LEN`, and `SIGN_LEN`. LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + EXP_LEN + SIG_LEN; - // The number of bits after the decimal dot when the number is in normal form. - using UP::FRACTION_LEN; - // An unsigned integer that is wide enough to contain all of the floating // point bits. using StorageType = typename UP::StorageType; @@ -173,30 +162,41 @@ template struct FPStorage : public FPLayout { (1U << (EXP_LEN - 1U)) - 1U; static_assert(EXP_BIAS > 0); +protected: + // The shift amount to get the *significand* part to the least significant + // bit. Always `0` but kept for consistency. + LIBC_INLINE_VAR static constexpr int SIG_MASK_SHIFT = 0; + // The shift amount to get the *exponent* part to the least significant bit. + LIBC_INLINE_VAR static constexpr int EXP_MASK_SHIFT = SIG_LEN; + // The shift amount to get the *sign* part to the least significant bit. + LIBC_INLINE_VAR static constexpr int SIGN_MASK_SHIFT = SIG_LEN + EXP_LEN; + // The bit pattern that keeps only the *significand* part. LIBC_INLINE_VAR static constexpr StorageType SIG_MASK = - mask_trailing_ones(); + mask_trailing_ones() << SIG_MASK_SHIFT; + +public: // The bit pattern that keeps only the *exponent* part. LIBC_INLINE_VAR static constexpr StorageType EXP_MASK = - mask_trailing_ones() << SIG_LEN; + mask_trailing_ones() << EXP_MASK_SHIFT; // The bit pattern that keeps only the *sign* part. 
LIBC_INLINE_VAR static constexpr StorageType SIGN_MASK = - mask_trailing_ones() << (EXP_LEN + SIG_LEN); + mask_trailing_ones() << SIGN_MASK_SHIFT; // The bit pattern that keeps only the *exponent + significand* part. LIBC_INLINE_VAR static constexpr StorageType EXP_SIG_MASK = mask_trailing_ones(); // The bit pattern that keeps only the *sign + exponent + significand* part. LIBC_INLINE_VAR static constexpr StorageType FP_MASK = mask_trailing_ones(); - // The bit pattern that keeps only the *fraction* part. - // i.e., the *significand* without the leading one. - LIBC_INLINE_VAR static constexpr StorageType FRACTION_MASK = - mask_trailing_ones(); static_assert((SIG_MASK & EXP_MASK & SIGN_MASK) == 0, "masks disjoint"); static_assert((SIG_MASK | EXP_MASK | SIGN_MASK) == FP_MASK, "masks cover"); protected: + LIBC_INLINE static constexpr StorageType bit_at(int position) { + return StorageType(1) << position; + } + // A strongly typed integer that prevents mixing and matching integers with // different semantics. template struct TypedInt { @@ -248,7 +248,7 @@ template struct FPStorage : public FPLayout { // An opaque type to store a floating point significand. // We define special values but it is valid to create arbitrary values as long - // as they are in the range [ZERO, BITS_ALL_ONES]. + // as they are in the range [BITS_ALL_ZEROES, BITS_ALL_ONES]. // Note that the semantics of the Significand are implementation dependent. // Values greater than BITS_ALL_ONES are truncated. struct Significand : public TypedInt { @@ -277,8 +277,10 @@ template struct FPStorage : public FPLayout { return Significand(StorageType(1)); } LIBC_INLINE static constexpr auto MSB() { - return Significand(StorageType(1) << (SIG_LEN - 1)); + return Significand(StorageType(bit_at(SIG_LEN - 1))); } + // Aliases + LIBC_INLINE static constexpr auto BITS_ALL_ZEROES() { return ZERO(); } LIBC_INLINE static constexpr auto BITS_ALL_ONES() { return Significand(SIG_MASK); } @@ -304,95 +306,181 @@ template struct FPStorage : public FPLayout { return encode(exp, sig); } - // The floating point number representation as an unsigned integer. - StorageType bits{}; - - LIBC_INLINE constexpr FPStorage() : bits(0) {} - LIBC_INLINE constexpr FPStorage(StorageType value) : bits(value) {} - - // Observers LIBC_INLINE constexpr StorageType exp_bits() const { return bits & EXP_MASK; } LIBC_INLINE constexpr StorageType sig_bits() const { return bits & SIG_MASK; } LIBC_INLINE constexpr StorageType exp_sig_bits() const { return bits & EXP_SIG_MASK; } -}; -// This layer defines all functions that are specific to how the floating -// point type is encoded. It enables construction, modification, and observation -// of values manipulated as 'StorageType'. -template -struct FPRepSem : public FPStorage { - using UP = FPStorage; - using typename UP::StorageType; - using UP::FRACTION_LEN; - using UP::FRACTION_MASK; +private: + // Merge bits from 'a' and 'b' values according to 'mask'. + // Use 'a' bits when corresponding 'mask' bits are zeroes and 'b' bits when + // corresponding bits are ones.
+ LIBC_INLINE static constexpr StorageType merge(StorageType a, StorageType b, + StorageType mask) { + // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge + return a ^ ((a ^ b) & mask); + } protected: - using BiasedExp = typename UP::BiasedExponent; - using Exp = typename UP::Exponent; - using Sig = typename UP::Significand; - using UP::encode; - using UP::exp_bits; - using UP::exp_sig_bits; - using UP::sig_bits; - using UP::UP; + // The number of bits after the decimal dot when the number is in normal form. + LIBC_INLINE_VAR static constexpr int FRACTION_LEN = + fp_type == FPType::X86_Binary80 ? SIG_LEN - 1 : SIG_LEN; + LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION = + FRACTION_LEN + 1; + LIBC_INLINE_VAR static constexpr StorageType FRACTION_MASK = + mask_trailing_ones(); + + // The floating point number representation as an unsigned integer. + StorageType bits = 0; public: - // Builders - LIBC_INLINE static constexpr RetT one(Sign sign = Sign::POS) { - return RetT(encode(sign, Exp::ZERO(), Sig::ZERO())); + LIBC_INLINE constexpr Sign sign() const { + return (bits & SIGN_MASK) ? Sign::NEG : Sign::POS; } - LIBC_INLINE static constexpr RetT min_subnormal(Sign sign = Sign::POS) { - return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::LSB())); + + LIBC_INLINE constexpr void set_sign(Sign signVal) { + if (sign() != signVal) + bits ^= SIGN_MASK; } - LIBC_INLINE static constexpr RetT max_subnormal(Sign sign = Sign::POS) { - return RetT( - encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::BITS_ALL_ONES())); + + LIBC_INLINE constexpr StorageType get_mantissa() const { + return bits & FRACTION_MASK; + } + + LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) { + bits = merge(bits, mantVal, FRACTION_MASK); + } + + LIBC_INLINE constexpr uint16_t get_biased_exponent() const { + return uint16_t((bits & EXP_MASK) >> EXP_MASK_SHIFT); } - LIBC_INLINE static constexpr RetT min_normal(Sign sign = Sign::POS) { - return RetT(encode(sign, Exp::MIN(), Sig::ZERO())); + + LIBC_INLINE constexpr void set_biased_exponent(StorageType biased) { + bits = merge(bits, biased << EXP_MASK_SHIFT, EXP_MASK); } - LIBC_INLINE static constexpr RetT max_normal(Sign sign = Sign::POS) { - return RetT(encode(sign, Exp::MAX(), Sig::BITS_ALL_ONES())); + + LIBC_INLINE constexpr int get_exponent() const { + return int(get_biased_exponent()) - EXP_BIAS; } - LIBC_INLINE static constexpr RetT inf(Sign sign = Sign::POS) { - return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), Sig::ZERO())); + + // If the number is subnormal, the exponent is treated as if it were the + // minimum exponent for a normal number. This is to keep continuity between + // the normal and subnormal ranges, but it causes problems for functions where + // values are calculated from the exponent, since just subtracting the bias + // will give a slightly incorrect result. Additionally, zero has an exponent + // of zero, and that should actually be treated as zero. + LIBC_INLINE constexpr int get_explicit_exponent() const { + const int biased_exp = int(get_biased_exponent()); + if (is_zero()) { + return 0; + } else if (biased_exp == 0) { + return 1 - EXP_BIAS; + } else { + return biased_exp - EXP_BIAS; + } } - LIBC_INLINE static constexpr RetT build_nan(Sign sign = Sign::POS, - StorageType v = 0) { - return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), - (v ? 
Sig(v) : (Sig::MSB() >> 1)))); + + LIBC_INLINE constexpr StorageType uintval() const { return bits & FP_MASK; } + LIBC_INLINE constexpr void set_uintval(StorageType value) { + bits = (value & FP_MASK); } - LIBC_INLINE static constexpr RetT build_quiet_nan(Sign sign = Sign::POS, - StorageType v = 0) { - return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), Sig::MSB() | Sig(v))); + + LIBC_INLINE constexpr bool is_zero() const { return exp_sig_bits() == 0; } + + LIBC_INLINE + constexpr bool is_subnormal() const { + return exp_bits() == encode(BiasedExponent::BITS_ALL_ZEROES()); } - // Observers + LIBC_INLINE constexpr bool is_neg() const { return sign().is_neg(); } + LIBC_INLINE constexpr bool is_pos() const { return sign().is_pos(); } +}; + +namespace internal { + +// Manipulates the representation of a floating point number defined by its +// FPType. This layer is architecture agnostic and does not handle C++ floating +// point types directly ('float', 'double' and 'long double'). Use the FPBits +// below if needed. +// +// TODO: Specialize this class for FPType::X86_Binary80 and remove ad-hoc logic +// from FPRepBase. +template struct FPRep : public FPRepBase { + using UP = FPRepBase; + using typename UP::StorageType; + using UP::FRACTION_LEN; + using UP::FRACTION_MASK; + using UP::MANTISSA_PRECISION; + +protected: + using typename UP::BiasedExponent; + using typename UP::Exponent; + using typename UP::Significand; + using UP::encode; + using UP::exp_bits; + using UP::exp_sig_bits; + using UP::sig_bits; + +public: LIBC_INLINE constexpr bool is_nan() const { - return exp_sig_bits() > encode(BiasedExp::BITS_ALL_ONES(), Sig::ZERO()); + return exp_sig_bits() > + encode(BiasedExponent::BITS_ALL_ONES(), Significand::ZERO()); } LIBC_INLINE constexpr bool is_quiet_nan() const { - return exp_sig_bits() >= encode(BiasedExp::BITS_ALL_ONES(), Sig::MSB()); + return exp_sig_bits() >= + encode(BiasedExponent::BITS_ALL_ONES(), Significand::MSB()); } LIBC_INLINE constexpr bool is_signaling_nan() const { return is_nan() && !is_quiet_nan(); } LIBC_INLINE constexpr bool is_inf() const { - return exp_sig_bits() == encode(BiasedExp::BITS_ALL_ONES(), Sig::ZERO()); + return exp_sig_bits() == + encode(BiasedExponent::BITS_ALL_ONES(), Significand::ZERO()); } LIBC_INLINE constexpr bool is_finite() const { - return exp_bits() != encode(BiasedExp::BITS_ALL_ONES()); - } - LIBC_INLINE - constexpr bool is_subnormal() const { - return exp_bits() == encode(BiasedExp::BITS_ALL_ZEROES()); + return exp_bits() != encode(BiasedExponent::BITS_ALL_ONES()); } LIBC_INLINE constexpr bool is_normal() const { return is_finite() && !UP::is_subnormal(); } - // Returns the mantissa with the implicit bit set iff the current + + LIBC_INLINE static constexpr StorageType zero(Sign sign = Sign::POS) { + return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::ZERO()); + } + LIBC_INLINE static constexpr StorageType one(Sign sign = Sign::POS) { + return encode(sign, Exponent::ZERO(), Significand::ZERO()); + } + LIBC_INLINE static constexpr StorageType + min_subnormal(Sign sign = Sign::POS) { + return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::LSB()); + } + LIBC_INLINE static constexpr StorageType + max_subnormal(Sign sign = Sign::POS) { + return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), + Significand::BITS_ALL_ONES()); + } + LIBC_INLINE static constexpr StorageType min_normal(Sign sign = Sign::POS) { + return encode(sign, Exponent::MIN(), Significand::ZERO()); + } + LIBC_INLINE static constexpr StorageType 
max_normal(Sign sign = Sign::POS) { + return encode(sign, Exponent::MAX(), Significand::BITS_ALL_ONES()); + } + LIBC_INLINE static constexpr StorageType inf(Sign sign = Sign::POS) { + return encode(sign, BiasedExponent::BITS_ALL_ONES(), Significand::ZERO()); + } + LIBC_INLINE static constexpr StorageType build_nan(Sign sign = Sign::POS, + StorageType v = 0) { + return encode(sign, BiasedExponent::BITS_ALL_ONES(), + (v ? Significand(v) : (Significand::MSB() >> 1))); + } + LIBC_INLINE static constexpr StorageType + build_quiet_nan(Sign sign = Sign::POS, StorageType v = 0) { + return encode(sign, BiasedExponent::BITS_ALL_ONES(), + Significand::MSB() | Significand(v)); + } + + // The function returns the mantissa with the implicit bit set iff the current // value is a valid normal number. LIBC_INLINE constexpr StorageType get_explicit_mantissa() { if (UP::is_subnormal()) @@ -490,20 +402,14 @@ template struct FPRep : public FPRepBase { }; // Specialization for the X86 Extended Precision type. -template -struct FPRepSem - : public FPStorage { - using UP = FPStorage; +template <> +struct FPRep : public FPRepBase { + using UP = FPRepBase; using typename UP::StorageType; using UP::FRACTION_LEN; using UP::FRACTION_MASK; + using UP::MANTISSA_PRECISION; + +protected: + using typename UP::BiasedExponent; + using typename UP::Significand; + using UP::encode; +public: // The x86 80 bit float represents the leading digit of the mantissa // explicitly. This is the mask for that bit. static constexpr StorageType EXPLICIT_BIT_MASK = StorageType(1) @@ -421,45 +515,6 @@ struct FPRepSem "the explicit bit and the fractional part should cover the " "whole significand"); -protected: - using BiasedExp = typename UP::BiasedExponent; - using Sig = typename UP::Significand; - using UP::encode; - using UP::UP; - -public: - // Builders - LIBC_INLINE static constexpr RetT one(Sign sign = Sign::POS) { - return RetT(encode(sign, Exponent::ZERO(), Sig::MSB())); - } - LIBC_INLINE static constexpr RetT min_subnormal(Sign sign = Sign::POS) { - return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::LSB())); - } - LIBC_INLINE static constexpr RetT max_subnormal(Sign sign = Sign::POS) { - return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), - Sig::BITS_ALL_ONES() ^ Sig::MSB())); - } - LIBC_INLINE static constexpr RetT min_normal(Sign sign = Sign::POS) { - return RetT(encode(sign, Exponent::MIN(), Sig::MSB())); - } - LIBC_INLINE static constexpr RetT max_normal(Sign sign = Sign::POS) { - return RetT(encode(sign, Exponent::MAX(), Sig::BITS_ALL_ONES())); - } - LIBC_INLINE static constexpr RetT inf(Sign sign = Sign::POS) { - return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), Sig::MSB())); - } - LIBC_INLINE static constexpr RetT build_nan(Sign sign = Sign::POS, - StorageType v = 0) { - return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), - Sig::MSB() | (v ?
Sig(v) : (Sig::MSB() >> 2)))); - } - LIBC_INLINE static constexpr RetT build_quiet_nan(Sign sign = Sign::POS, - StorageType v = 0) { - return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), - Sig::MSB() | (Sig::MSB() >> 1) | Sig(v))); - } - - // Observers LIBC_INLINE constexpr bool is_nan() const { // Most encoding forms from the table found in // https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format @@ -472,183 +527,85 @@ struct FPRepSem // - Quiet Not a Number // - Unnormal // This can be reduced to the following logic: - if (exp_bits() == encode(BiasedExp::BITS_ALL_ONES())) + if (exp_bits() == encode(BiasedExponent::BITS_ALL_ONES())) return !is_inf(); - if (exp_bits() != encode(BiasedExp::BITS_ALL_ZEROES())) - return (sig_bits() & encode(Sig::MSB())) == 0; + if (exp_bits() != encode(BiasedExponent::BITS_ALL_ZEROES())) + return (sig_bits() & encode(Significand::MSB())) == 0; return false; } LIBC_INLINE constexpr bool is_quiet_nan() const { return exp_sig_bits() >= - encode(BiasedExp::BITS_ALL_ONES(), Sig::MSB() | (Sig::MSB() >> 1)); + encode(BiasedExponent::BITS_ALL_ONES(), + Significand::MSB() | (Significand::MSB() >> 1)); } LIBC_INLINE constexpr bool is_signaling_nan() const { return is_nan() && !is_quiet_nan(); } LIBC_INLINE constexpr bool is_inf() const { - return exp_sig_bits() == encode(BiasedExp::BITS_ALL_ONES(), Sig::MSB()); + return exp_sig_bits() == + encode(BiasedExponent::BITS_ALL_ONES(), Significand::MSB()); } LIBC_INLINE constexpr bool is_finite() const { return !is_inf() && !is_nan(); } - LIBC_INLINE - constexpr bool is_subnormal() const { - return exp_bits() == encode(BiasedExp::BITS_ALL_ZEROES()); - } LIBC_INLINE constexpr bool is_normal() const { const auto exp = exp_bits(); - if (exp == encode(BiasedExp::BITS_ALL_ZEROES()) || - exp == encode(BiasedExp::BITS_ALL_ONES())) + if (exp == encode(BiasedExponent::BITS_ALL_ZEROES()) || + exp == encode(BiasedExponent::BITS_ALL_ONES())) return false; return get_implicit_bit(); } - LIBC_INLINE constexpr StorageType get_explicit_mantissa() const { - return sig_bits(); - } - - // This function is specific to FPRepSem. - // TODO: Remove if possible. - LIBC_INLINE constexpr bool get_implicit_bit() const { - return static_cast(bits & EXPLICIT_BIT_MASK); - } - - // This function is specific to FPRepSem. - // TODO: Remove if possible. - LIBC_INLINE constexpr void set_implicit_bit(bool implicitVal) { - if (get_implicit_bit() != implicitVal) - bits ^= EXPLICIT_BIT_MASK; - } -}; - -// 'FPRep' is the bottom of the class hierarchy that only deals with 'FPType'. -// The operations dealing with specific float semantics are implemented by -// 'FPRepSem' above and specialized when needed. -// -// The 'RetT' type is being propagated up to 'FPRepSem' so that the functions -// creating new values (Builders) can return the appropriate type.
That is, when -// creating a value through 'FPBits' below the builder will return an 'FPBits' -// value: -// i.e., FPBits::zero() // returns an FPBits -// When we don't care about specific C++ floating point type we can use 'FPRep' -// directly and 'RetT' defaults to 'StorageType': -// i.e., FPRep::zero() // returns an 'uint32_t' -template ::StorageType> -struct FPRep : public FPRepSem { - using UP = FPRepSem; - using StorageType = typename UP::StorageType; - -protected: - using UP::bits; - using UP::encode; - using UP::exp_bits; - using UP::exp_sig_bits; - - using BiasedExp = typename UP::BiasedExponent; - using Sig = typename UP::Significand; - using UP::FP_MASK; - using UP::SIG_LEN; - -public: - using UP::EXP_BIAS; - using UP::EXP_MASK; - using UP::FRACTION_MASK; - using UP::SIGN_MASK; - - // Representation - LIBC_INLINE constexpr StorageType uintval() const { return bits & FP_MASK; } - LIBC_INLINE constexpr void set_uintval(StorageType value) { - bits = (value & FP_MASK); + LIBC_INLINE static constexpr StorageType zero(Sign sign = Sign::POS) { + return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::ZERO()); } - - // Builders - LIBC_INLINE static constexpr RetT zero(Sign sign = Sign::POS) { - return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::ZERO())); + LIBC_INLINE static constexpr StorageType one(Sign sign = Sign::POS) { + return encode(sign, Exponent::ZERO(), Significand::MSB()); } - using UP::build_nan; - using UP::build_quiet_nan; - using UP::inf; - using UP::max_normal; - using UP::max_subnormal; - using UP::min_normal; - using UP::min_subnormal; - using UP::one; - - // Modifiers - LIBC_INLINE constexpr RetT abs() const { - return RetT(bits & UP::EXP_SIG_MASK); + LIBC_INLINE static constexpr StorageType + min_subnormal(Sign sign = Sign::POS) { + return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::LSB()); } - - // Observers - using UP::get_explicit_mantissa; - LIBC_INLINE constexpr bool is_zero() const { return exp_sig_bits() == 0; } - LIBC_INLINE constexpr bool is_inf_or_nan() const { return !is_finite(); } - using UP::is_finite; - using UP::is_inf; - using UP::is_nan; - using UP::is_normal; - using UP::is_quiet_nan; - using UP::is_signaling_nan; - using UP::is_subnormal; - LIBC_INLINE constexpr bool is_neg() const { return sign().is_neg(); } - LIBC_INLINE constexpr bool is_pos() const { return sign().is_pos(); } - - // Parts - LIBC_INLINE constexpr Sign sign() const { - return (bits & SIGN_MASK) ? 
Sign::NEG : Sign::POS; + LIBC_INLINE static constexpr StorageType + max_subnormal(Sign sign = Sign::POS) { + return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), + Significand::BITS_ALL_ONES() ^ Significand::MSB()); } - - LIBC_INLINE constexpr void set_sign(Sign signVal) { - if (sign() != signVal) - bits ^= SIGN_MASK; + LIBC_INLINE static constexpr StorageType min_normal(Sign sign = Sign::POS) { + return encode(sign, Exponent::MIN(), Significand::MSB()); } - - LIBC_INLINE constexpr uint16_t get_biased_exponent() const { - return uint16_t((bits & UP::EXP_MASK) >> UP::SIG_LEN); + LIBC_INLINE static constexpr StorageType max_normal(Sign sign = Sign::POS) { + return encode(sign, Exponent::MAX(), Significand::BITS_ALL_ONES()); } - - LIBC_INLINE constexpr void set_biased_exponent(StorageType biased) { - bits = merge(bits, biased << SIG_LEN, EXP_MASK); + LIBC_INLINE static constexpr StorageType inf(Sign sign = Sign::POS) { + return encode(sign, BiasedExponent::BITS_ALL_ONES(), Significand::MSB()); } - - LIBC_INLINE constexpr int get_exponent() const { - return int(get_biased_exponent()) - EXP_BIAS; + LIBC_INLINE static constexpr StorageType build_nan(Sign sign = Sign::POS, + StorageType v = 0) { + return encode(sign, BiasedExponent::BITS_ALL_ONES(), + Significand::MSB() | + (v ? Significand(v) : (Significand::MSB() >> 2))); } - - // If the number is subnormal, the exponent is treated as if it were the - // minimum exponent for a normal number. This is to keep continuity between - // the normal and subnormal ranges, but it causes problems for functions where - // values are calculated from the exponent, since just subtracting the bias - // will give a slightly incorrect result. Additionally, zero has an exponent - // of zero, and that should actually be treated as zero. - LIBC_INLINE constexpr int get_explicit_exponent() const { - const int biased_exp = int(get_biased_exponent()); - if (is_zero()) { - return 0; - } else if (biased_exp == 0) { - return 1 - EXP_BIAS; - } else { - return biased_exp - EXP_BIAS; - } + LIBC_INLINE static constexpr StorageType + build_quiet_nan(Sign sign = Sign::POS, StorageType v = 0) { + return encode(sign, BiasedExponent::BITS_ALL_ONES(), + Significand::MSB() | (Significand::MSB() >> 1) | + Significand(v)); } - LIBC_INLINE constexpr StorageType get_mantissa() const { - return bits & FRACTION_MASK; + LIBC_INLINE constexpr StorageType get_explicit_mantissa() const { + return sig_bits(); } - LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) { - bits = merge(bits, mantVal, FRACTION_MASK); + // The following functions are specific to FPRep. + // TODO: Remove if possible. + LIBC_INLINE constexpr bool get_implicit_bit() const { + return static_cast(bits & EXPLICIT_BIT_MASK); } -private: - // Merge bits from 'a' and 'b' values according to 'mask'. - // Use 'a' bits when corresponding 'mask' bits are zeroes and 'b' bits when - // corresponding bits are ones. - LIBC_INLINE static constexpr StorageType merge(StorageType a, StorageType b, - StorageType mask) { - // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge - return a ^ ((a ^ b) & mask); + LIBC_INLINE constexpr void set_implicit_bit(bool implicitVal) { + if (get_implicit_bit() != implicitVal) + bits ^= EXPLICIT_BIT_MASK; } }; @@ -685,31 +642,29 @@ template LIBC_INLINE static constexpr FPType get_fp_type() { static_assert(cpp::always_false, "Unsupported type"); } -// A generic class to manipulate floating point formats. -// It derives most of its functionality from FPRep above.
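A small usage sketch of the test helpers involved in this hunk (assuming IEEE semantics for 'float'; both the version being removed and the one being restored return the C++ value type T, so the sketch holds either way):

    using FB = LIBC_NAMESPACE::fputil::FPBits<float>;
    static_assert(FB::inf() > FB::max_normal(), "inf dominates finite values");
    static_assert(FB::min_denormal() > FB::zero(), "smallest positive value");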
-template -struct FPBits final : public internal::FPRep(), FPBits> { +// A generic class to represent floating point formats. +// On most platforms, the 'float' type corresponds to single precision +// floating point numbers, the 'double' type corresponds to double precision +// floating point numbers, and the 'long double' type corresponds to the quad +// precision floating numbers. On x86 platforms however, the 'long double' +// type maps to an x87 floating point format. +template struct FPBits : public internal::FPRep()> { static_assert(cpp::is_floating_point_v, "FPBits instantiated with invalid type."); - using UP = internal::FPRep(), FPBits>; + using UP = internal::FPRep()>; using Rep = UP; using StorageType = typename UP::StorageType; using UP::bits; + using UP::EXP_LEN; + using UP::UP; // Constants. - LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION = - UP::FRACTION_LEN + 1; - LIBC_INLINE_VAR static constexpr StorageType MIN_NORMAL = - UP::min_normal(Sign::POS).uintval(); - LIBC_INLINE_VAR static constexpr StorageType MAX_NORMAL = - UP::max_normal(Sign::POS).uintval(); - LIBC_INLINE_VAR static constexpr StorageType MIN_SUBNORMAL = - UP::min_subnormal(Sign::POS).uintval(); - LIBC_INLINE_VAR static constexpr StorageType MAX_SUBNORMAL = - UP::max_subnormal(Sign::POS).uintval(); - LIBC_INLINE_VAR static constexpr int MAX_BIASED_EXPONENT = - (1 << UP::EXP_LEN) - 1; + static constexpr int MAX_BIASED_EXPONENT = (1 << EXP_LEN) - 1; + static constexpr StorageType MIN_NORMAL = UP::min_normal(Sign::POS); + static constexpr StorageType MAX_NORMAL = UP::max_normal(Sign::POS); + static constexpr StorageType MIN_SUBNORMAL = UP::min_subnormal(Sign::POS); + static constexpr StorageType MAX_SUBNORMAL = UP::max_subnormal(Sign::POS); // Constructors. LIBC_INLINE constexpr FPBits() = default; @@ -731,35 +686,49 @@ struct FPBits final : public internal::FPRep(), FPBits> { LIBC_INLINE constexpr explicit operator T() const { return get_val(); } + LIBC_INLINE constexpr bool is_inf_or_nan() const { return !UP::is_finite(); } + + LIBC_INLINE constexpr FPBits abs() const { + return FPBits(bits & UP::EXP_SIG_MASK); + } + // Methods below this are used by tests. - // TODO: inline and remove.
+
   LIBC_INLINE static constexpr T one(Sign sign = Sign::POS) {
-    return T(UP::one(sign));
+    return FPBits(UP::one(sign)).get_val();
   }
+
   LIBC_INLINE static constexpr T zero(Sign sign = Sign::POS) {
-    return T(UP::zero(sign));
+    return FPBits(UP::zero(sign)).get_val();
   }
+
   LIBC_INLINE static constexpr T inf(Sign sign = Sign::POS) {
-    return T(UP::inf(sign));
+    return FPBits(UP::inf(sign)).get_val();
   }
+
   LIBC_INLINE static constexpr T min_normal() {
-    return T(UP::min_normal(Sign::POS));
+    return FPBits(UP::min_normal(Sign::POS)).get_val();
  }
+
   LIBC_INLINE static constexpr T max_normal() {
-    return T(UP::max_normal(Sign::POS));
+    return FPBits(UP::max_normal(Sign::POS)).get_val();
   }
+
   LIBC_INLINE static constexpr T min_denormal() {
-    return T(UP::min_subnormal(Sign::POS));
+    return FPBits(UP::min_subnormal(Sign::POS)).get_val();
   }
+
   LIBC_INLINE static constexpr T max_denormal() {
-    return T(UP::max_subnormal(Sign::POS));
+    return FPBits(UP::max_subnormal(Sign::POS)).get_val();
   }
+
   LIBC_INLINE static constexpr T build_nan(StorageType v) {
-    return T(UP::build_nan(Sign::POS, v));
+    return FPBits(UP::build_nan(Sign::POS, v)).get_val();
   }
+
   LIBC_INLINE static constexpr T build_quiet_nan(StorageType v,
                                                  Sign sign = Sign::POS) {
-    return T(UP::build_quiet_nan(sign, v));
+    return FPBits(UP::build_quiet_nan(sign, v)).get_val();
   }
 
   // TODO: Use an uint32_t for 'biased_exp'.
@@ -788,7 +757,7 @@ struct FPBits final : public internal::FPRep<get_fp_type<T>(), FPBits<T>> {
                 "This function is not tested for X86 Extended Precision");
     FPBits result;
     // offset: +1 for sign, but -1 for implicit first bit
-    int lz = cpp::countl_zero(number) - UP::EXP_LEN;
+    int lz = cpp::countl_zero(number) - EXP_LEN;
     number <<= lz;
     ep -= lz;
 
diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp
index e6b6d7d9ec780..f0b155085dabf 100644
--- a/libc/test/src/__support/FPUtil/fpbits_test.cpp
+++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp
@@ -17,72 +17,69 @@ TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary16) {
   using LIBC_NAMESPACE::fputil::FPType;
   using LIBC_NAMESPACE::fputil::internal::FPRep;
   using Rep = FPRep<FPType::IEEE754_Binary16>;
-  using u16 = typename Rep::StorageType;
-
-  EXPECT_EQ(u16(0b0'00000'0000000000), u16(Rep::zero()));
-  EXPECT_EQ(u16(0b0'01111'0000000000), u16(Rep::one()));
-  EXPECT_EQ(u16(0b0'00000'0000000001), u16(Rep::min_subnormal()));
-  EXPECT_EQ(u16(0b0'00000'1111111111), u16(Rep::max_subnormal()));
-  EXPECT_EQ(u16(0b0'00001'0000000000), u16(Rep::min_normal()));
-  EXPECT_EQ(u16(0b0'11110'1111111111), u16(Rep::max_normal()));
-  EXPECT_EQ(u16(0b0'11111'0000000000), u16(Rep::inf()));
-  EXPECT_EQ(u16(0b0'11111'0100000000), u16(Rep::build_nan()));
-  EXPECT_EQ(u16(0b0'11111'1000000000), u16(Rep::build_quiet_nan()));
+  using u16 = uint16_t;
+
+  EXPECT_EQ(u16(0b0'00000'0000000000), Rep::zero());
+  EXPECT_EQ(u16(0b0'01111'0000000000), Rep::one());
+  EXPECT_EQ(u16(0b0'00000'0000000001), Rep::min_subnormal());
+  EXPECT_EQ(u16(0b0'00000'1111111111), Rep::max_subnormal());
+  EXPECT_EQ(u16(0b0'00001'0000000000), Rep::min_normal());
+  EXPECT_EQ(u16(0b0'11110'1111111111), Rep::max_normal());
+  EXPECT_EQ(u16(0b0'11111'0000000000), Rep::inf());
+  EXPECT_EQ(u16(0b0'11111'0100000000), Rep::build_nan());
+  EXPECT_EQ(u16(0b0'11111'1000000000), Rep::build_quiet_nan());
 }
 
 TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary32) {
   using LIBC_NAMESPACE::fputil::FPType;
   using LIBC_NAMESPACE::fputil::internal::FPRep;
   using Rep = FPRep<FPType::IEEE754_Binary32>;
-  using u32 = typename Rep::StorageType;
-
-  EXPECT_EQ(u32(0b0'00000000'00000000000000000000000), u32(Rep::zero()));
-  EXPECT_EQ(u32(0b0'01111111'00000000000000000000000), u32(Rep::one()));
-  EXPECT_EQ(u32(0b0'00000000'00000000000000000000001),
-            u32(Rep::min_subnormal()));
-  EXPECT_EQ(u32(0b0'00000000'11111111111111111111111),
-            u32(Rep::max_subnormal()));
-  EXPECT_EQ(u32(0b0'00000001'00000000000000000000000), u32(Rep::min_normal()));
-  EXPECT_EQ(u32(0b0'11111110'11111111111111111111111), u32(Rep::max_normal()));
-  EXPECT_EQ(u32(0b0'11111111'00000000000000000000000), u32(Rep::inf()));
-  EXPECT_EQ(u32(0b0'11111111'01000000000000000000000), u32(Rep::build_nan()));
-  EXPECT_EQ(u32(0b0'11111111'10000000000000000000000),
-            u32(Rep::build_quiet_nan()));
+  using u32 = uint32_t;
+
+  EXPECT_EQ(u32(0b0'00000000'00000000000000000000000), Rep::zero());
+  EXPECT_EQ(u32(0b0'01111111'00000000000000000000000), Rep::one());
+  EXPECT_EQ(u32(0b0'00000000'00000000000000000000001), Rep::min_subnormal());
+  EXPECT_EQ(u32(0b0'00000000'11111111111111111111111), Rep::max_subnormal());
+  EXPECT_EQ(u32(0b0'00000001'00000000000000000000000), Rep::min_normal());
+  EXPECT_EQ(u32(0b0'11111110'11111111111111111111111), Rep::max_normal());
+  EXPECT_EQ(u32(0b0'11111111'00000000000000000000000), Rep::inf());
+  EXPECT_EQ(u32(0b0'11111111'01000000000000000000000), Rep::build_nan());
+  EXPECT_EQ(u32(0b0'11111111'10000000000000000000000), Rep::build_quiet_nan());
 }
 
 TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary64) {
   using LIBC_NAMESPACE::fputil::FPType;
   using LIBC_NAMESPACE::fputil::internal::FPRep;
   using Rep = FPRep<FPType::IEEE754_Binary64>;
-  using u64 = typename Rep::StorageType;
+  using u64 = uint64_t;
 
   EXPECT_EQ(
       u64(0b0'00000000000'0000000000000000000000000000000000000000000000000000),
-      u64(Rep::zero()));
+      Rep::zero());
   EXPECT_EQ(
       u64(0b0'01111111111'0000000000000000000000000000000000000000000000000000),
-      u64(Rep::one()));
+      Rep::one());
   EXPECT_EQ(
       u64(0b0'00000000000'0000000000000000000000000000000000000000000000000001),
-      u64(Rep::min_subnormal()));
+      Rep::min_subnormal());
   EXPECT_EQ(
      u64(0b0'00000000000'1111111111111111111111111111111111111111111111111111),
-      u64(Rep::max_subnormal()));
+      Rep::max_subnormal());
   EXPECT_EQ(
       u64(0b0'00000000001'0000000000000000000000000000000000000000000000000000),
-      u64(Rep::min_normal()));
+      Rep::min_normal());
   EXPECT_EQ(
       u64(0b0'11111111110'1111111111111111111111111111111111111111111111111111),
-      u64(Rep::max_normal()));
+      Rep::max_normal());
   EXPECT_EQ(
       u64(0b0'11111111111'0000000000000000000000000000000000000000000000000000),
-      u64(Rep::inf()));
+      Rep::inf());
   EXPECT_EQ(
       u64(0b0'11111111111'0100000000000000000000000000000000000000000000000000),
-      u64(Rep::build_nan()));
+      Rep::build_nan());
   EXPECT_EQ(
       u64(0b0'11111111111'1000000000000000000000000000000000000000000000000000),
-      u64(Rep::build_quiet_nan()));
+      Rep::build_quiet_nan());
 }
 
 static constexpr UInt128 u128(uint64_t hi, uint64_t lo) {
@@ -93,49 +90,6 @@ static constexpr UInt128 u128(uint64_t hi, uint64_t lo) {
 #endif
 }
 
-TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary128) {
-  using LIBC_NAMESPACE::fputil::FPType;
-  using LIBC_NAMESPACE::fputil::internal::FPRep;
-  using Rep = FPRep<FPType::IEEE754_Binary128>;
-
-  EXPECT_EQ(
-      u128(0b0'000000000000000'000000000000000000000000000000000000000000000000,
-           0b0000000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::zero()));
-  EXPECT_EQ(
-      u128(0b0'011111111111111'000000000000000000000000000000000000000000000000,
-           0b0000000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::one()));
-  EXPECT_EQ(
-      u128(0b0'000000000000000'000000000000000000000000000000000000000000000000,
-           0b0000000000000000000000000000000000000000000000000000000000000001),
-      UInt128(Rep::min_subnormal()));
-  EXPECT_EQ(
-      u128(0b0'000000000000000'111111111111111111111111111111111111111111111111,
-           0b1111111111111111111111111111111111111111111111111111111111111111),
-      UInt128(Rep::max_subnormal()));
-  EXPECT_EQ(
-      u128(0b0'000000000000001'000000000000000000000000000000000000000000000000,
-           0b0000000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::min_normal()));
-  EXPECT_EQ(
-      u128(0b0'111111111111110'111111111111111111111111111111111111111111111111,
-           0b1111111111111111111111111111111111111111111111111111111111111111),
-      UInt128(Rep::max_normal()));
-  EXPECT_EQ(
-      u128(0b0'111111111111111'000000000000000000000000000000000000000000000000,
-           0b0000000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::inf()));
-  EXPECT_EQ(
-      u128(0b0'111111111111111'010000000000000000000000000000000000000000000000,
-           0b0000000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::build_nan()));
-  EXPECT_EQ(
-      u128(0b0'111111111111111'100000000000000000000000000000000000000000000000,
-           0b0000000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::build_quiet_nan()));
-}
-
 TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80) {
   using LIBC_NAMESPACE::fputil::FPType;
   using LIBC_NAMESPACE::fputil::internal::FPRep;
@@ -144,39 +98,39 @@
   EXPECT_EQ(
       u128(0b0'000000000000000,
            0b0000000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::zero()));
+      Rep::zero());
   EXPECT_EQ(
       u128(0b0'011111111111111,
            0b1000000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::one()));
+      Rep::one());
   EXPECT_EQ(
       u128(0b0'000000000000000,
           0b0000000000000000000000000000000000000000000000000000000000000001),
-      UInt128(Rep::min_subnormal()));
+      Rep::min_subnormal());
   EXPECT_EQ(
       u128(0b0'000000000000000,
            0b0111111111111111111111111111111111111111111111111111111111111111),
-      UInt128(Rep::max_subnormal()));
+      Rep::max_subnormal());
   EXPECT_EQ(
       u128(0b0'000000000000001,
            0b1000000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::min_normal()));
+      Rep::min_normal());
   EXPECT_EQ(
       u128(0b0'111111111111110,
            0b1111111111111111111111111111111111111111111111111111111111111111),
-      UInt128(Rep::max_normal()));
+      Rep::max_normal());
   EXPECT_EQ(
       u128(0b0'111111111111111,
            0b1000000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::inf()));
+      Rep::inf());
   EXPECT_EQ(
       u128(0b0'111111111111111,
           0b1010000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::build_nan()));
+      Rep::build_nan());
   EXPECT_EQ(
       u128(0b0'111111111111111,
            0b1100000000000000000000000000000000000000000000000000000000000000),
-      UInt128(Rep::build_quiet_nan()));
+      Rep::build_quiet_nan());
 }
 
 TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80_IsNan) {
@@ -229,6 +183,49 @@
            0b1000000000000000000000000000000000000000000000000000000000000000));
 }
 
+TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary128) {
+  using LIBC_NAMESPACE::fputil::FPType;
+  using LIBC_NAMESPACE::fputil::internal::FPRep;
+  using Rep = FPRep<FPType::IEEE754_Binary128>;
+
+  EXPECT_EQ(
+      u128(0b0'000000000000000'000000000000000000000000000000000000000000000000,
+           0b0000000000000000000000000000000000000000000000000000000000000000),
+      Rep::zero());
+  EXPECT_EQ(
+      u128(0b0'011111111111111'000000000000000000000000000000000000000000000000,
+           0b0000000000000000000000000000000000000000000000000000000000000000),
+      Rep::one());
+  EXPECT_EQ(
+      u128(0b0'000000000000000'000000000000000000000000000000000000000000000000,
+           0b0000000000000000000000000000000000000000000000000000000000000001),
+      Rep::min_subnormal());
+  EXPECT_EQ(
+      u128(0b0'000000000000000'111111111111111111111111111111111111111111111111,
+           0b1111111111111111111111111111111111111111111111111111111111111111),
+      Rep::max_subnormal());
+  EXPECT_EQ(
+      u128(0b0'000000000000001'000000000000000000000000000000000000000000000000,
+           0b0000000000000000000000000000000000000000000000000000000000000000),
+      Rep::min_normal());
+  EXPECT_EQ(
+      u128(0b0'111111111111110'111111111111111111111111111111111111111111111111,
+           0b1111111111111111111111111111111111111111111111111111111111111111),
+      Rep::max_normal());
+  EXPECT_EQ(
+      u128(0b0'111111111111111'000000000000000000000000000000000000000000000000,
+           0b0000000000000000000000000000000000000000000000000000000000000000),
+      Rep::inf());
+  EXPECT_EQ(
+      u128(0b0'111111111111111'010000000000000000000000000000000000000000000000,
+           0b0000000000000000000000000000000000000000000000000000000000000000),
+      Rep::build_nan());
+  EXPECT_EQ(
+      u128(0b0'111111111111111'100000000000000000000000000000000000000000000000,
+           0b0000000000000000000000000000000000000000000000000000000000000000),
+      Rep::build_quiet_nan());
+}
+
 TEST(LlvmLibcFPBitsTest, FloatType) {
   using FloatBits = FPBits<float>;
 
From 4f4690530e8b40cdf3a17c76a352b26c2fb0446c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jan=20Kokem=C3=BCller?=
Date: Mon, 22 Jan 2024 15:05:39 +0100
Subject: [PATCH 388/843] [libc++] Ensure that std::expected has no tail
 padding (#69673)

Currently std::expected can have some padding bytes in its tail due to
[[no_unique_address]]. Those padding bytes can be used by other objects.
For example, in the current implementation:

  sizeof(std::expected<std::optional<int>, bool>) ==
  sizeof(std::expected<std::expected<std::optional<int>, bool>, bool>)

As a result, the data layout of an
std::expected<std::expected<std::optional<int>, bool>, bool> can look
like this:

              +-- optional "has value" flag
              |        +-- padding
  /---int---\ |        |
  00 00 00 00 01 00 00 00
                 |  |
                 |  +- "outer" expected "has value" flag
                 +- expected "has value" flag

This is problematic because `emplace()`ing the "inner" expected can not
only overwrite the "inner" expected "has value" flag (issue #68552) but
also the tail padding where other objects might live.

This patch fixes the problem by ensuring that std::expected has no tail
padding, which is achieved by conditional usage of [[no_unique_address]]
based on the tail padding that this would create.

This is an ABI breaking change because the following property changes:

  sizeof(std::expected<std::optional<int>, bool>) <
  sizeof(std::expected<std::expected<std::optional<int>, bool>, bool>)

Before the change, this relation didn't hold. After the change, the
relation does hold, which means that the size of std::expected in these
cases increases after this patch. The data layout will change in the
following cases where tail padding can be reused by other objects:

  class foo : std::expected<std::optional<int>, bool> {
    bool b;
  };

or using [[no_unique_address]]:

  struct foo {
    [[no_unique_address]] std::expected<std::optional<int>, bool> e;
    bool b;
  };

The vendor communication is handled in #70820.
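To make the relation above concrete, the following standalone probe (not
part of this patch; the `Inner`/`Nested` aliases are illustrative only)
prints the two sizes whose relation this commit changes:

  #include <cstdio>
  #include <expected>
  #include <optional>

  using Inner  = std::expected<std::optional<int>, bool>;
  using Nested = std::expected<Inner, bool>;

  int main() {
    // With the pre-patch layout the two sizes compare equal, because
    // Nested's "has value" flag is placed into Inner's tail padding.
    // With the post-patch layout, sizeof(Inner) < sizeof(Nested).
    std::printf("sizeof(Inner)=%zu sizeof(Nested)=%zu\n",
                sizeof(Inner), sizeof(Nested));
  }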
Fixes: #70494

Co-authored-by: philnik777
Co-authored-by: Louis Dionne
---
 libcxx/docs/ReleaseNotes/18.rst               |   22 +
 libcxx/include/__expected/expected.h          | 1140 +++++++++++------
 .../no_unique_address.compile.pass.cpp        |   45 +-
 .../transform_error.mandates.verify.cpp       |    7 +-
 .../no_unique_address.compile.pass.cpp        |   38 +-
 .../transform_error.mandates.verify.cpp       |    6 +
 .../assign/assign.U.pass.cpp                  |   14 +
 .../assign/assign.copy.pass.cpp               |   23 +
 .../assign/assign.move.pass.cpp               |   23 +
 .../expected.expected/assign/emplace.pass.cpp |   14 +
 .../monadic/transform.pass.cpp                |    4 +-
 .../monadic/transform_error.pass.cpp          |    4 +-
 .../swap/member.swap.pass.cpp                 |   22 +
 .../expected.void/assign/assign.copy.pass.cpp |   22 +
 .../expected.void/assign/assign.move.pass.cpp |   22 +
 .../assign/assign.unexpected.copy.pass.cpp    |   16 +
 .../assign/assign.unexpected.move.pass.cpp    |   17 +
 .../monadic/transform_error.pass.cpp          |    4 +-
 .../expected.void/swap/member.swap.pass.cpp   |   22 +
 libcxx/test/std/utilities/expected/types.h    |  137 ++
 20 files changed, 1201 insertions(+), 401 deletions(-)

diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst
index 7d6331a025808..c619f94fd0035 100644
--- a/libcxx/docs/ReleaseNotes/18.rst
+++ b/libcxx/docs/ReleaseNotes/18.rst
@@ -284,6 +284,28 @@ ABI Affecting Changes
   against different configurations of it being used in different translation
   units.
 
+- The amount of padding bytes available for use at the end of certain ``std::expected`` instantiations has changed in this
+  release. This is an ABI break for any code that held a ``std::expected`` member with ``[[no_unique_address]]`` in an
+  ABI-facing type. In those cases, the layout of the enclosing type will change, breaking the ABI. However, the
+  ``std::expected`` member requires a few characteristics in order to be affected by this change:
+
+  - A type equivalent to ``union {T; E}`` needs to have more than one byte of padding available.
+  - The ``std::expected`` member must have been in a situation where its padding bytes were previously reused by
+    another object, which can happen in a few cases (this is probably not exhaustive):
+
+    - It is a member with ``[[no_unique_address]]`` applied to it, and it is followed by another data member, or
+    - It is a member with ``[[no_unique_address]]`` applied to it, and it is the last member of the user-defined type,
+      and that user-defined type is used in ways that its padding bytes can be reused, or
+    - It is inherited from, in which case members of the derived class may be placed in its tail padding.
+
+  We expect that this will not be a very frequent occurrence. However, there is unfortunately no technique we can use
+  in the library to catch such misuse. Indeed, even applying an ABI tag to ``std::expected`` would not help since ABI
+  tags are not propagated to containing types. As a result, if you notice very difficult to explain bugs around the
+  usage of a ``std::expected``, you should consider checking whether you are hitting this ABI break. This change was
+  done to fix `#70494 <https://github.com/llvm/llvm-project/issues/70494>`_ and the vendor communication is handled
+  in `#70820 <https://github.com/llvm/llvm-project/issues/70820>`_.
+
 
 Build System Changes
 --------------------

diff --git a/libcxx/include/__expected/expected.h b/libcxx/include/__expected/expected.h
index 9f36c41b78edf..443d9257dc598 100644
--- a/libcxx/include/__expected/expected.h
+++ b/libcxx/include/__expected/expected.h
@@ -88,8 +88,372 @@ _LIBCPP_HIDE_FROM_ABI void __throw_bad_expected_access(_Arg&& __arg) {
 #  endif
 }
 
+// If parameter type `_Tp` of `__conditional_no_unique_address` is neither
+// copyable nor movable, a constructor with this tag is provided. For that
+// constructor, the user has to provide a function and arguments. The function
+// must return an object of type `_Tp`. When the function is invoked by the
+// constructor, guaranteed copy elision kicks in and the `_Tp` is constructed
+// in place.
+struct __conditional_no_unique_address_invoke_tag {};
+
+// This class implements an object with `[[no_unique_address]]` conditionally
+// applied to it, based on the value of `_NoUnique`.
+//
+// A member of this class must always have `[[no_unique_address]]` applied to
+// it. Otherwise, the `[[no_unique_address]]` in the "`_NoUnique == true`" case
+// would not have any effect. In the `false` case, the `__v` is not
+// `[[no_unique_address]]`, so it nullifies the effects of the "outer"
+// `[[no_unique_address]]` regarding data layout.
+//
+// If we had a language feature, this class would basically be replaced by
+// `[[no_unique_address(condition)]]`.
+template <bool _NoUnique, class _Tp>
+struct __conditional_no_unique_address;
+
+template <class _Tp>
+struct __conditional_no_unique_address<true, _Tp> {
+  template <class... _Args>
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit __conditional_no_unique_address(in_place_t, _Args&&... __args)
+      : __v(std::forward<_Args>(__args)...) {}
+
+  template <class _Func, class... _Args>
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit __conditional_no_unique_address(
+      __conditional_no_unique_address_invoke_tag, _Func&& __f, _Args&&... __args)
+      : __v(std::invoke(std::forward<_Func>(__f), std::forward<_Args>(__args)...)) {}
+
+  _LIBCPP_NO_UNIQUE_ADDRESS _Tp __v;
+};
+
+template <class _Tp>
+struct __conditional_no_unique_address<false, _Tp> {
+  template <class... _Args>
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit __conditional_no_unique_address(in_place_t, _Args&&... __args)
+      : __v(std::forward<_Args>(__args)...) {}
+
+  template <class _Func, class... _Args>
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit __conditional_no_unique_address(
+      __conditional_no_unique_address_invoke_tag, _Func&& __f, _Args&&... __args)
+      : __v(std::invoke(std::forward<_Func>(__f), std::forward<_Args>(__args)...)) {}
+
+  _Tp __v;
+};
+
+// This function returns whether the type `_Second` can be stuffed into the
+// tail padding of the `_First` type if both of them are given
+// `[[no_unique_address]]`.
+template <class _First, class _Second>
+inline constexpr bool __fits_in_tail_padding = []() {
+  struct __x {
+    _LIBCPP_NO_UNIQUE_ADDRESS _First __first;
+    _LIBCPP_NO_UNIQUE_ADDRESS _Second __second;
+  };
+  return sizeof(__x) == sizeof(_First);
+}();
+
+// This class implements the storage used by `std::expected`. We have a few
+// goals for this storage:
+// 1. Whenever the underlying {_Tp | _Unex} combination has free bytes in its
+//    tail padding, we should reuse it to store the bool discriminator of the
+//    expected, so as to save space.
+// 2. Whenever the `expected<_Tp, _Unex>` as a whole has free bytes in its tail
+//    padding, we should allow an object following the expected to be stored in
+//    its tail padding.
+// 3. However, we never want a user object (say `X`) that would follow an
+//    `expected<_Tp, _Unex>` to be stored in the padding bytes of the
+//    underlying {_Tp | _Unex} union, if any. That is because we use
+//    `construct_at` on that union, which would end up overwriting the `X`
+//    member if it is stored in the tail padding of the union.
+//
+// To achieve this, `__expected_base`'s logic is implemented in an inner
+// `__repr` class. `__expected_base` holds one `__repr` member which is
+// conditionally `[[no_unique_address]]`. The `__repr` class holds the
+// underlying {_Tp | _Unex} union and a boolean "has value" flag.
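+//
+// As a rough illustration (hypothetical types, not normative): on a typical
+// 64-bit ABI, with _Tp = std::optional<int> (8 bytes, only 5 of them data)
+// and _Unex = bool, the {_Tp | _Unex} union has tail padding, so
+// __fits_in_tail_padding<__union_t, bool> is true and the "has value" flag
+// can live inside that padding. With _Tp = int and _Unex = int there is no
+// tail padding, so the flag has to be stored next to the union instead.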
+//
+// Which one of the `__repr_`/`__union_` members is `[[no_unique_address]]`
+// depends on whether the "has value" boolean fits into the tail padding of
+// the underlying {_Tp | _Unex} union:
+//
+// - In case the "has value" bool fits into the tail padding of the union, the
+//   whole `__repr_` member is _not_ `[[no_unique_address]]` as it needs to be
+//   transparently replaced on `emplace()`/`swap()` etc.
+// - In case the "has value" bool does not fit into the tail padding of the
+//   union, only the union member must be transparently replaced (therefore is
+//   _not_ `[[no_unique_address]]`) and the "has value" flag must be adjusted
+//   manually.
+//
+// This way, the member that is transparently replaced on mutating operations
+// is never `[[no_unique_address]]`, satisfying the requirements from
+// "[basic.life]" in the standard.
+//
+// Stripped away of all superfluous elements, the layout of `__expected_base`
+// then looks like this:
+//
+//     template <class Tp, class Err>
+//     class expected_base {
+//       union union_t {
+//         [[no_unique_address]] Tp val;
+//         [[no_unique_address]] Err unex;
+//       };
+//
+//       static constexpr bool put_flag_in_tail =
+//           fits_in_tail_padding<union_t, bool>;
+//       static constexpr bool allow_reusing_expected_tail_padding =
+//           !put_flag_in_tail;
+//
+//       struct repr {
+//       private:
+//         // If "has value" fits into the tail, this should be
+//         // `[[no_unique_address]]`, otherwise not.
+//         [[no_unique_address]] conditional_no_unique_address<
+//             put_flag_in_tail,
+//             union_t>::type union_;
+//         [[no_unique_address]] bool has_val_;
+//       };
+//
+//     protected:
+//       // If "has value" fits into the tail, this must _not_ be
+//       // `[[no_unique_address]]` so that we fill out the
+//       // complete `expected` object.
+//       [[no_unique_address]] conditional_no_unique_address<
+//           allow_reusing_expected_tail_padding,
+//           repr>::type repr_;
+//     };
+//
 template <class _Tp, class _Err>
-class expected {
+class __expected_base {
+  // use named union because [[no_unique_address]] cannot be applied to an unnamed union,
+  // also guaranteed elision into a potentially-overlapping subobject is unsettled (and
+  // it's not clear that it's implementable, given that the function is allowed to clobber
+  // the tail padding) - see https://github.com/itanium-cxx-abi/cxx-abi/issues/107.
+  union __union_t {
+    _LIBCPP_HIDE_FROM_ABI constexpr __union_t(const __union_t&) = delete;
+    _LIBCPP_HIDE_FROM_ABI constexpr __union_t(const __union_t&)
+      requires(is_copy_constructible_v<_Tp> && is_copy_constructible_v<_Err> &&
+               is_trivially_copy_constructible_v<_Tp> && is_trivially_copy_constructible_v<_Err>)
+    = default;
+    _LIBCPP_HIDE_FROM_ABI constexpr __union_t(__union_t&&) = delete;
+    _LIBCPP_HIDE_FROM_ABI constexpr __union_t(__union_t&&)
+      requires(is_move_constructible_v<_Tp> && is_move_constructible_v<_Err> &&
+               is_trivially_move_constructible_v<_Tp> && is_trivially_move_constructible_v<_Err>)
+    = default;
+    _LIBCPP_HIDE_FROM_ABI constexpr __union_t& operator=(const __union_t&) = delete;
+    _LIBCPP_HIDE_FROM_ABI constexpr __union_t& operator=(__union_t&&)      = delete;
+
+    template <class... _Args>
+    _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(in_place_t, _Args&&... __args)
+        : __val_(std::forward<_Args>(__args)...) {}
+
+    template <class... _Args>
+    _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(unexpect_t, _Args&&... __args)
+        : __unex_(std::forward<_Args>(__args)...) {}
+
+    template <class _Func, class... _Args>
+    _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(
+        std::__expected_construct_in_place_from_invoke_tag, _Func&& __f, _Args&&... __args)
+        : __val_(std::invoke(std::forward<_Func>(__f), std::forward<_Args>(__args)...)) {}
+
+    template <class _Func, class... _Args>
+    _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(
+        std::__expected_construct_unexpected_from_invoke_tag, _Func&& __f, _Args&&... __args)
+        : __unex_(std::invoke(std::forward<_Func>(__f), std::forward<_Args>(__args)...)) {}
+
+    _LIBCPP_HIDE_FROM_ABI constexpr ~__union_t()
+      requires(is_trivially_destructible_v<_Tp> && is_trivially_destructible_v<_Err>)
+    = default;
+
+    // __repr's destructor handles this
+    _LIBCPP_HIDE_FROM_ABI constexpr ~__union_t() {}
+
+    _LIBCPP_NO_UNIQUE_ADDRESS _Tp __val_;
+    _LIBCPP_NO_UNIQUE_ADDRESS _Err __unex_;
+  };
+
+  static constexpr bool __put_flag_in_tail                    = __fits_in_tail_padding<__union_t, bool>;
+  static constexpr bool __allow_reusing_expected_tail_padding = !__put_flag_in_tail;
+
+  struct __repr {
+    _LIBCPP_HIDE_FROM_ABI constexpr explicit __repr() = delete;
+
+    template <class... _Args>
+    _LIBCPP_HIDE_FROM_ABI constexpr explicit __repr(in_place_t __tag, _Args&&... __args)
+        : __union_(in_place, __tag, std::forward<_Args>(__args)...), __has_val_(true) {}
+
+    template <class... _Args>
+    _LIBCPP_HIDE_FROM_ABI constexpr explicit __repr(unexpect_t __tag, _Args&&... __args)
+        : __union_(in_place, __tag, std::forward<_Args>(__args)...), __has_val_(false) {}
+
+    template <class... _Args>
+    _LIBCPP_HIDE_FROM_ABI constexpr explicit __repr(std::__expected_construct_in_place_from_invoke_tag __tag,
+                                                    _Args&&... __args)
+        : __union_(in_place, __tag, std::forward<_Args>(__args)...), __has_val_(true) {}
+
+    template <class... _Args>
+    _LIBCPP_HIDE_FROM_ABI constexpr explicit __repr(std::__expected_construct_unexpected_from_invoke_tag __tag,
+                                                    _Args&&... __args)
+        : __union_(in_place, __tag, std::forward<_Args>(__args)...), __has_val_(false) {}
+
+    // The return value of `__make_union` must be constructed in place in the
+    // `__v` member of `__union_`, relying on guaranteed copy elision. To do
+    // this, the `__conditional_no_unique_address_invoke_tag` constructor is
+    // called with a lambda that is immediately called inside
+    // `__conditional_no_unique_address`'s constructor.
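+    //
+    // (Illustrative note: a plain `__union_t(__make_union(...))` would need a
+    // copy or move of the union, which is deleted unless _Tp/_Err are
+    // trivially copyable or movable; with the invoke tag, the prvalue
+    // returned by the lambda initializes `__v` directly instead.)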
+    template <class _OtherUnion>
+    _LIBCPP_HIDE_FROM_ABI constexpr explicit __repr(bool __has_val, _OtherUnion&& __other)
+      requires(__allow_reusing_expected_tail_padding)
+        : __union_(__conditional_no_unique_address_invoke_tag{},
+                   [&] { return __make_union(__has_val, std::forward<_OtherUnion>(__other)); }),
+          __has_val_(__has_val) {}
+
+    _LIBCPP_HIDE_FROM_ABI constexpr __repr(const __repr&) = delete;
+    _LIBCPP_HIDE_FROM_ABI constexpr __repr(const __repr&)
+      requires(is_copy_constructible_v<_Tp> && is_copy_constructible_v<_Err> &&
+               is_trivially_copy_constructible_v<_Tp> && is_trivially_copy_constructible_v<_Err>)
+    = default;
+    _LIBCPP_HIDE_FROM_ABI constexpr __repr(__repr&&) = delete;
+    _LIBCPP_HIDE_FROM_ABI constexpr __repr(__repr&&)
+      requires(is_move_constructible_v<_Tp> && is_move_constructible_v<_Err> &&
+               is_trivially_move_constructible_v<_Tp> && is_trivially_move_constructible_v<_Err>)
+    = default;
+
+    _LIBCPP_HIDE_FROM_ABI constexpr __repr& operator=(const __repr&) = delete;
+    _LIBCPP_HIDE_FROM_ABI constexpr __repr& operator=(__repr&&)      = delete;
+
+    _LIBCPP_HIDE_FROM_ABI constexpr ~__repr()
+      requires(is_trivially_destructible_v<_Tp> && is_trivially_destructible_v<_Err>)
+    = default;
+
+    _LIBCPP_HIDE_FROM_ABI constexpr ~__repr()
+      requires(!is_trivially_destructible_v<_Tp> || !is_trivially_destructible_v<_Err>)
+    {
+      __destroy_union_member();
+    }
+
+    _LIBCPP_HIDE_FROM_ABI constexpr void __destroy_union()
+      requires(__allow_reusing_expected_tail_padding &&
+               (is_trivially_destructible_v<_Tp> && is_trivially_destructible_v<_Err>))
+    {
+      // Note: Since the destructor of the union is trivial, this does nothing
+      // except to end the lifetime of the union.
+      std::destroy_at(&__union_.__v);
+    }
+
+    _LIBCPP_HIDE_FROM_ABI constexpr void __destroy_union()
+      requires(__allow_reusing_expected_tail_padding &&
+               (!is_trivially_destructible_v<_Tp> || !is_trivially_destructible_v<_Err>))
+    {
+      __destroy_union_member();
+      std::destroy_at(&__union_.__v);
+    }
+
+    template <class... _Args>
+    _LIBCPP_HIDE_FROM_ABI constexpr void __construct_union(in_place_t, _Args&&... __args)
+      requires(__allow_reusing_expected_tail_padding)
+    {
+      std::construct_at(&__union_.__v, in_place, std::forward<_Args>(__args)...);
+      __has_val_ = true;
+    }
+
+    template <class... _Args>
+    _LIBCPP_HIDE_FROM_ABI constexpr void __construct_union(unexpect_t, _Args&&... __args)
+      requires(__allow_reusing_expected_tail_padding)
+    {
+      std::construct_at(&__union_.__v, unexpect, std::forward<_Args>(__args)...);
+      __has_val_ = false;
+    }
+
+  private:
+    template <class, class>
+    friend class __expected_base;
+
+    _LIBCPP_HIDE_FROM_ABI constexpr void __destroy_union_member()
+      requires(!is_trivially_destructible_v<_Tp> || !is_trivially_destructible_v<_Err>)
+    {
+      if (__has_val_) {
+        std::destroy_at(std::addressof(__union_.__v.__val_));
+      } else {
+        std::destroy_at(std::addressof(__union_.__v.__unex_));
+      }
+    }
+
+    template <class _OtherUnion>
+    _LIBCPP_HIDE_FROM_ABI static constexpr __union_t __make_union(bool __has_val, _OtherUnion&& __other)
+      requires(__allow_reusing_expected_tail_padding)
+    {
+      if (__has_val)
+        return __union_t(in_place, std::forward<_OtherUnion>(__other).__val_);
+      else
+        return __union_t(unexpect, std::forward<_OtherUnion>(__other).__unex_);
+    }
+
+    _LIBCPP_NO_UNIQUE_ADDRESS __conditional_no_unique_address<__put_flag_in_tail, __union_t> __union_;
+    _LIBCPP_NO_UNIQUE_ADDRESS bool __has_val_;
+  };
+
+  template <class _OtherUnion>
+  _LIBCPP_HIDE_FROM_ABI static constexpr __repr __make_repr(bool __has_val, _OtherUnion&& __other)
+    requires(__put_flag_in_tail)
+  {
+    if (__has_val)
+      return __repr(in_place, std::forward<_OtherUnion>(__other).__val_);
+    else
+      return __repr(unexpect, std::forward<_OtherUnion>(__other).__unex_);
+  }
+
+protected:
+  template <class... _Args>
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit __expected_base(_Args&&... __args)
+      : __repr_(in_place, std::forward<_Args>(__args)...) {}
+
+  // In case we copy/move construct from another `expected` we need to create
+  // our `expected` so that it either has a value or not, depending on the "has
+  // value" flag of the other `expected`. To do this without falling back on
+  // `std::construct_at` we rely on guaranteed copy elision using two helper
+  // functions `__make_repr` and `__make_union`. There have to be two since
+  // there are two data layouts with different members being
+  // `[[no_unique_address]]`. GCC (as of version 13) does not do guaranteed
+  // copy elision when initializing `[[no_unique_address]]` members. The two
+  // cases are:
+  //
+  // - `__make_repr`: This is used when the "has value" flag lives in the tail
+  //   of the union. In this case, the `__repr` member is _not_
+  //   `[[no_unique_address]]`.
+  // - `__make_union`: When the "has value" flag does _not_ fit in the tail of
+  //   the union, the `__repr` member is `[[no_unique_address]]` and the union
+  //   is not.
+  //
+  // This constructor "catches" the first case and leaves the second case to
+  // `__union_t`, its constructors and `__make_union`.
+  template <class _OtherUnion>
+  _LIBCPP_HIDE_FROM_ABI constexpr explicit __expected_base(bool __has_val, _OtherUnion&& __other)
+    requires(__put_flag_in_tail)
+      : __repr_(__conditional_no_unique_address_invoke_tag{},
+                [&] { return __make_repr(__has_val, std::forward<_OtherUnion>(__other)); }) {}
+
+  _LIBCPP_HIDE_FROM_ABI constexpr void __destroy() {
+    if constexpr (__put_flag_in_tail)
+      std::destroy_at(&__repr_.__v);
+    else
+      __repr_.__v.__destroy_union();
+  }
+
+  template <class _Tag, class... _Args>
+  _LIBCPP_HIDE_FROM_ABI constexpr void __construct(_Tag __tag, _Args&&... __args) {
+    if constexpr (__put_flag_in_tail)
+      std::construct_at(&__repr_.__v, __tag, std::forward<_Args>(__args)...);
+    else
+      __repr_.__v.__construct_union(__tag, std::forward<_Args>(__args)...);
+  }
+
+  _LIBCPP_HIDE_FROM_ABI constexpr bool __has_val() const { return __repr_.__v.__has_val_; }
+  _LIBCPP_HIDE_FROM_ABI constexpr __union_t& __union() { return __repr_.__v.__union_.__v; }
+  _LIBCPP_HIDE_FROM_ABI constexpr const __union_t& __union() const { return __repr_.__v.__union_.__v; }
+  _LIBCPP_HIDE_FROM_ABI constexpr _Tp& __val() { return __repr_.__v.__union_.__v.__val_; }
+  _LIBCPP_HIDE_FROM_ABI constexpr const _Tp& __val() const { return __repr_.__v.__union_.__v.__val_; }
+  _LIBCPP_HIDE_FROM_ABI constexpr _Err& __unex() { return __repr_.__v.__union_.__v.__unex_; }
+  _LIBCPP_HIDE_FROM_ABI constexpr const _Err& __unex() const { return __repr_.__v.__union_.__v.__unex_; }
+
+private:
+  _LIBCPP_NO_UNIQUE_ADDRESS __conditional_no_unique_address<__allow_reusing_expected_tail_padding, __repr> __repr_;
+};
+
+template <class _Tp, class _Err>
+class expected : private __expected_base<_Tp, _Err> {
   static_assert(!is_reference_v<_Tp> && !is_function_v<_Tp> &&
                 !is_same_v<remove_cv_t<_Tp>, in_place_t> &&
                 !is_same_v<remove_cv_t<_Tp>, unexpect_t> &&
                 !__is_std_unexpected<remove_cv_t<_Tp>>::value &&
                 __valid_std_unexpected<_Err>::value,
@@ -102,6 +466,8 @@ class expected {
   template <class _Up, class _OtherErr>
   friend class expected;
 
+  using __base = __expected_base<_Tp, _Err>;
+
 public:
   using value_type      = _Tp;
   using error_type      = _Err;
   using unexpected_type = unexpected<_Err>;
@@ -113,7 +479,7 @@ class expected {
   // [expected.object.ctor], constructors
   _LIBCPP_HIDE_FROM_ABI constexpr expected() noexcept(is_nothrow_default_constructible_v<_Tp>) // strengthened
     requires is_default_constructible_v<_Tp>
-      : __union_(std::in_place), __has_val_(true) {}
+      : __base(in_place) {}
 
   _LIBCPP_HIDE_FROM_ABI constexpr expected(const expected&) = delete;
@@ -126,7 +492,7 @@ class expected {
           is_nothrow_copy_constructible_v<_Tp> && is_nothrow_copy_constructible_v<_Err>) // strengthened
     requires(is_copy_constructible_v<_Tp> && is_copy_constructible_v<_Err> &&
              !(is_trivially_copy_constructible_v<_Tp> && is_trivially_copy_constructible_v<_Err>))
-      : __union_(__other.__has_val_, __other.__union_), __has_val_(__other.__has_val_) {}
+      : __base(__other.__has_val(), __other.__union()) {}
 
   _LIBCPP_HIDE_FROM_ABI constexpr expected(expected&&)
     requires(is_move_constructible_v<_Tp> && is_move_constructible_v<_Err> && is_trivially_move_constructible_v<_Tp> &&
@@ -137,7 +503,7 @@ class expected {
           is_nothrow_move_constructible_v<_Tp> && is_nothrow_move_constructible_v<_Err>)
     requires(is_move_constructible_v<_Tp> && is_move_constructible_v<_Err> &&
             !(is_trivially_move_constructible_v<_Tp> && is_trivially_move_constructible_v<_Err>))
-      : __union_(__other.__has_val_, std::move(__other.__union_)), __has_val_(__other.__has_val_) {}
+      : __base(__other.__has_val(), std::move(__other.__union())) {}
 
 private:
   template <class _Up, class _OtherErr, class _UfQual, class _OtherErrQual>
@@ -162,12 +528,12 @@ class expected {
   template <class _Func, class... _Args>
   _LIBCPP_HIDE_FROM_ABI constexpr explicit expected(
       std::__expected_construct_in_place_from_invoke_tag __tag, _Func&& __f, _Args&&... __args)
-      : __union_(__tag, std::forward<_Func>(__f), std::forward<_Args>(__args)...), __has_val_(true) {}
+      : __base(__tag, std::forward<_Func>(__f), std::forward<_Args>(__args)...) {}
 
   template <class _Func, class... _Args>
   _LIBCPP_HIDE_FROM_ABI constexpr explicit expected(
       std::__expected_construct_unexpected_from_invoke_tag __tag, _Func&& __f, _Args&&... __args)
-      : __union_(__tag, std::forward<_Func>(__f), std::forward<_Args>(__args)...), __has_val_(false) {}
+      : __base(__tag, std::forward<_Func>(__f), std::forward<_Args>(__args)...) {}
 
 public:
   template <class _Up, class _OtherErr>
@@ -177,14 +543,14 @@ class expected {
     requires __can_convert<_Up, _OtherErr, const _Up&, const _OtherErr&>::value
   _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v<const _Up&, _Tp> || !is_convertible_v<const _OtherErr&, _Err>)
       expected(const expected<_Up, _OtherErr>& __other) noexcept(
           is_nothrow_constructible_v<_Tp, const _Up&> &&
          is_nothrow_constructible_v<_Err, const _OtherErr&>) // strengthened
-      : __union_(__other.__has_val_, __other.__union_), __has_val_(__other.__has_val_) {}
+      : __base(__other.__has_val(), __other.__union()) {}
 
   template <class _Up, class _OtherErr>
     requires __can_convert<_Up, _OtherErr, _Up, _OtherErr>::value
   _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v<_Up, _Tp> || !is_convertible_v<_OtherErr, _Err>)
       expected(expected<_Up, _OtherErr>&& __other) noexcept(
          is_nothrow_constructible_v<_Tp, _Up> && is_nothrow_constructible_v<_Err, _OtherErr>) // strengthened
-      : __union_(__other.__has_val_, std::move(__other.__union_)), __has_val_(__other.__has_val_) {}
+      : __base(__other.__has_val(), std::move(__other.__union())) {}
 
@@ -192,80 +558,67 @@ class expected {
   template <class _Up = _Tp>
     requires(!is_same_v<remove_cvref_t<_Up>, in_place_t> && !is_same_v<expected, remove_cvref_t<_Up>> &&
             is_constructible_v<_Tp, _Up> && !__is_std_unexpected<remove_cvref_t<_Up>>::value &&
             (!is_same_v<remove_cv_t<_Tp>, bool> || !__is_std_expected<remove_cvref_t<_Up>>::value))
   _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v<_Up, _Tp>)
       expected(_Up&& __u) noexcept(is_nothrow_constructible_v<_Tp, _Up>) // strengthened
-      : __union_(std::in_place, std::forward<_Up>(__u)), __has_val_(true) {}
+      : __base(in_place, std::forward<_Up>(__u)) {}
 
   template <class _OtherErr>
     requires is_constructible_v<_Err, const _OtherErr&>
   _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v<const _OtherErr&, _Err>) expected(
       const unexpected<_OtherErr>& __unex) noexcept(is_nothrow_constructible_v<_Err, const _OtherErr&>) // strengthened
-      : __union_(std::unexpect, __unex.error()), __has_val_(false) {}
+      : __base(unexpect, __unex.error()) {}
 
   template <class _OtherErr>
     requires is_constructible_v<_Err, _OtherErr>
   _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v<_OtherErr, _Err>)
      expected(unexpected<_OtherErr>&& __unex) noexcept(is_nothrow_constructible_v<_Err, _OtherErr>) // strengthened
-      : __union_(std::unexpect, std::move(__unex.error())), __has_val_(false) {}
+      : __base(unexpect, std::move(__unex.error())) {}
 
   template <class... _Args>
     requires is_constructible_v<_Tp, _Args...>
   _LIBCPP_HIDE_FROM_ABI constexpr explicit expected(in_place_t, _Args&&... __args) noexcept(
       is_nothrow_constructible_v<_Tp, _Args...>) // strengthened
-      : __union_(std::in_place, std::forward<_Args>(__args)...), __has_val_(true) {}
+      : __base(in_place, std::forward<_Args>(__args)...) {}
 
   template <class _Up, class... _Args>
     requires is_constructible_v< _Tp, initializer_list<_Up>&, _Args... >
   _LIBCPP_HIDE_FROM_ABI constexpr explicit expected(in_place_t, initializer_list<_Up> __il, _Args&&... __args) noexcept(
      is_nothrow_constructible_v<_Tp, initializer_list<_Up>&, _Args...>) // strengthened
-      : __union_(std::in_place, __il, std::forward<_Args>(__args)...), __has_val_(true) {}
+      : __base(in_place, __il, std::forward<_Args>(__args)...) {}
 
   template <class... _Args>
     requires is_constructible_v<_Err, _Args...>
   _LIBCPP_HIDE_FROM_ABI constexpr explicit expected(unexpect_t, _Args&&... __args) noexcept(
      is_nothrow_constructible_v<_Err, _Args...>) // strengthened
-      : __union_(std::unexpect, std::forward<_Args>(__args)...), __has_val_(false) {}
+      : __base(unexpect, std::forward<_Args>(__args)...) {}
 
   template <class _Up, class... _Args>
     requires is_constructible_v< _Err, initializer_list<_Up>&, _Args... >
   _LIBCPP_HIDE_FROM_ABI constexpr explicit expected(unexpect_t, initializer_list<_Up> __il, _Args&&... __args) noexcept(
       is_nothrow_constructible_v<_Err, initializer_list<_Up>&, _Args...>) // strengthened
-      : __union_(std::unexpect, __il, std::forward<_Args>(__args)...), __has_val_(false) {}
+      : __base(unexpect, __il, std::forward<_Args>(__args)...) {}
 
   // [expected.object.dtor], destructor
-  _LIBCPP_HIDE_FROM_ABI constexpr ~expected()
-    requires(is_trivially_destructible_v<_Tp> && is_trivially_destructible_v<_Err>)
-  = default;
-
-  _LIBCPP_HIDE_FROM_ABI constexpr ~expected()
-    requires(!is_trivially_destructible_v<_Tp> || !is_trivially_destructible_v<_Err>)
-  {
-    if (__has_val_) {
-      std::destroy_at(std::addressof(__union_.__val_));
-    } else {
-      std::destroy_at(std::addressof(__union_.__unex_));
-    }
-  }
+  _LIBCPP_HIDE_FROM_ABI constexpr ~expected() = default;
 
 private:
-  template <class _T1, class _T2, class... _Args>
-  _LIBCPP_HIDE_FROM_ABI static constexpr void __reinit_expected(_T1& __newval, _T2& __oldval, _Args&&... __args) {
+  template <class _Tag, class _OtherTag, class _T1, class _T2, class... _Args>
+  _LIBCPP_HIDE_FROM_ABI constexpr void __reinit_expected(_T2& __oldval, _Args&&... __args) {
     if constexpr (is_nothrow_constructible_v<_T1, _Args...>) {
-      std::destroy_at(std::addressof(__oldval));
-      std::construct_at(std::addressof(__newval), std::forward<_Args>(__args)...);
+      this->__destroy();
+      this->__construct(_Tag{}, std::forward<_Args>(__args)...);
    } else if constexpr (is_nothrow_move_constructible_v<_T1>) {
       _T1 __tmp(std::forward<_Args>(__args)...);
-      std::destroy_at(std::addressof(__oldval));
-      std::construct_at(std::addressof(__newval), std::move(__tmp));
+      this->__destroy();
+      this->__construct(_Tag{}, std::move(__tmp));
     } else {
       static_assert(
           is_nothrow_move_constructible_v<_T2>,
          "To provide strong exception guarantee, T2 has to satisfy `is_nothrow_move_constructible_v` so that it can "
          "be reverted to the previous state in case an exception is thrown during the assignment.");
       _T2 __tmp(std::move(__oldval));
-      std::destroy_at(std::addressof(__oldval));
-      auto __trans =
-          std::__make_exception_guard([&] { std::construct_at(std::addressof(__oldval), std::move(__tmp)); });
-      std::construct_at(std::addressof(__newval), std::forward<_Args>(__args)...);
+      this->__destroy();
+      auto __trans = std::__make_exception_guard([&] { this->__construct(_OtherTag{}, std::move(__tmp)); });
+      this->__construct(_Tag{}, std::forward<_Args>(__args)...);
       __trans.__complete();
     }
   }
@@ -281,17 +634,15 @@ class expected {
           is_copy_constructible_v<_Err> &&
           (is_nothrow_move_constructible_v<_Tp> || is_nothrow_move_constructible_v<_Err>))
   {
-    if (__has_val_ && __rhs.__has_val_) {
-      __union_.__val_ = __rhs.__union_.__val_;
-    } else if (__has_val_) {
-      __reinit_expected(__union_.__unex_, __union_.__val_, __rhs.__union_.__unex_);
-    } else if (__rhs.__has_val_) {
-      __reinit_expected(__union_.__val_, __union_.__unex_, __rhs.__union_.__val_);
+    if (this->__has_val() && __rhs.__has_val()) {
+      this->__val() = __rhs.__val();
+    } else if (this->__has_val()) {
+      __reinit_expected<unexpect_t, in_place_t, _Err, _Tp>(this->__val(), __rhs.__unex());
+    } else if (__rhs.__has_val()) {
+      __reinit_expected<in_place_t, unexpect_t, _Tp, _Err>(this->__unex(), __rhs.__val());
     } else {
-      __union_.__unex_ = __rhs.__union_.__unex_;
+      this->__unex() = __rhs.__unex();
    }
-    // note: only reached if no exception+rollback was done inside __reinit_expected
-    __has_val_ = __rhs.__has_val_;
     return *this;
   }
 
@@ -302,17 +653,15 @@ class expected {
         is_move_assignable_v<_Err> &&
         (is_nothrow_move_constructible_v<_Tp> || is_nothrow_move_constructible_v<_Err>))
  {
-    if (__has_val_ && __rhs.__has_val_) {
-      __union_.__val_ = std::move(__rhs.__union_.__val_);
-    } else if (__has_val_) {
-      __reinit_expected(__union_.__unex_, __union_.__val_, std::move(__rhs.__union_.__unex_));
-    } else if (__rhs.__has_val_) {
-      __reinit_expected(__union_.__val_, __union_.__unex_, std::move(__rhs.__union_.__val_));
+    if (this->__has_val() && __rhs.__has_val()) {
+      this->__val() = std::move(__rhs.__val());
+    } else if (this->__has_val()) {
+      __reinit_expected<unexpect_t, in_place_t, _Err, _Tp>(this->__val(), std::move(__rhs.__unex()));
+    } else if (__rhs.__has_val()) {
+      __reinit_expected<in_place_t, unexpect_t, _Tp, _Err>(this->__unex(), std::move(__rhs.__val()));
     } else {
-      __union_.__unex_ = std::move(__rhs.__union_.__unex_);
+      this->__unex() = std::move(__rhs.__unex());
     }
-    // note: only reached if no exception+rollback was done inside __reinit_expected
-    __has_val_ = __rhs.__has_val_;
     return *this;
   }
 
@@ -323,11 +672,10 @@ class expected {
          (is_nothrow_constructible_v<_Tp, _Up> || is_nothrow_move_constructible_v<_Tp> ||
           is_nothrow_move_constructible_v<_Err>))
  {
-    if (__has_val_) {
-      __union_.__val_ = std::forward<_Up>(__v);
+    if (this->__has_val()) {
+      this->__val() = std::forward<_Up>(__v);
     } else {
-      __reinit_expected(__union_.__val_, __union_.__unex_, std::forward<_Up>(__v));
-      __has_val_ = true;
+      __reinit_expected<in_place_t, unexpect_t, _Tp, _Err>(this->__unex(), std::forward<_Up>(__v));
     }
     return *this;
   }
@@ -346,11 +694,10 @@ class expected {
   template <class _OtherErr>
     requires(__can_assign_from_unexpected<const _OtherErr&>)
   _LIBCPP_HIDE_FROM_ABI constexpr expected& operator=(const unexpected<_OtherErr>& __un) {
-    if (__has_val_) {
-      __reinit_expected(__union_.__unex_, __union_.__val_, __un.error());
-      __has_val_ = false;
+    if (this->__has_val()) {
+      __reinit_expected<unexpect_t, in_place_t, _Err, _Tp>(this->__val(), __un.error());
     } else {
-      __union_.__unex_ = __un.error();
+      this->__unex() = __un.error();
     }
     return *this;
   }
@@ -358,11 +705,10 @@ class expected {
   template <class _OtherErr>
     requires(__can_assign_from_unexpected<_OtherErr>)
  _LIBCPP_HIDE_FROM_ABI constexpr expected& operator=(unexpected<_OtherErr>&& __un) {
-    if (__has_val_) {
-      __reinit_expected(__union_.__unex_, __union_.__val_, std::move(__un.error()));
-      __has_val_ = false;
+    if (this->__has_val()) {
+      __reinit_expected<unexpect_t, in_place_t, _Err, _Tp>(this->__val(), std::move(__un.error()));
     } else {
-      __union_.__unex_ = std::move(__un.error());
+      this->__unex() = std::move(__un.error());
     }
     return *this;
   }
@@ -370,27 +716,17 @@ class expected {
   template <class... _Args>
    requires is_nothrow_constructible_v<_Tp, _Args...>
   _LIBCPP_HIDE_FROM_ABI constexpr _Tp& emplace(_Args&&... __args) noexcept {
-    if (__has_val_) {
-      std::destroy_at(std::addressof(__union_.__val_));
-    } else {
-      std::destroy_at(std::addressof(__union_.__unex_));
-    }
-    std::construct_at(std::addressof(__union_.__val_), std::forward<_Args>(__args)...);
-    __has_val_ = true;
-    return __union_.__val_;
+    this->__destroy();
+    this->__construct(in_place, std::forward<_Args>(__args)...);
+    return this->__val();
   }
 
   template <class _Up, class... _Args>
-    requires is_nothrow_constructible_v< _Tp, initializer_list<_Up>&, _Args... >
+    requires is_nothrow_constructible_v<_Tp, initializer_list<_Up>&, _Args...>
   _LIBCPP_HIDE_FROM_ABI constexpr _Tp& emplace(initializer_list<_Up> __il, _Args&&... __args) noexcept {
-    if (__has_val_) {
-      std::destroy_at(std::addressof(__union_.__val_));
-    } else {
-      std::destroy_at(std::addressof(__union_.__unex_));
-    }
-    std::construct_at(std::addressof(__union_.__val_), __il, std::forward<_Args>(__args)...);
-    __has_val_ = true;
-    return __union_.__val_;
+    this->__destroy();
+    this->__construct(in_place, __il, std::forward<_Args>(__args)...);
+    return this->__val();
   }
 
 public:
@@ -402,48 +738,42 @@ class expected {
          is_move_constructible_v<_Err> &&
          (is_nothrow_move_constructible_v<_Tp> || is_nothrow_move_constructible_v<_Err>))
  {
-    auto __swap_val_unex_impl = [&](expected& __with_val, expected& __with_err) {
+    auto __swap_val_unex_impl = [](expected& __with_val, expected& __with_err) {
       if constexpr (is_nothrow_move_constructible_v<_Err>) {
-        _Err __tmp(std::move(__with_err.__union_.__unex_));
-        std::destroy_at(std::addressof(__with_err.__union_.__unex_));
-        auto __trans = std::__make_exception_guard([&] {
-          std::construct_at(std::addressof(__with_err.__union_.__unex_), std::move(__tmp));
-        });
-        std::construct_at(std::addressof(__with_err.__union_.__val_), std::move(__with_val.__union_.__val_));
+        _Err __tmp(std::move(__with_err.__unex()));
+        __with_err.__destroy();
+        auto __trans = std::__make_exception_guard([&] { __with_err.__construct(unexpect, std::move(__tmp)); });
+        __with_err.__construct(in_place, std::move(__with_val.__val()));
         __trans.__complete();
-        std::destroy_at(std::addressof(__with_val.__union_.__val_));
-        std::construct_at(std::addressof(__with_val.__union_.__unex_), std::move(__tmp));
+        __with_val.__destroy();
+        __with_val.__construct(unexpect, std::move(__tmp));
       } else {
         static_assert(is_nothrow_move_constructible_v<_Tp>,
                       "To provide strong exception guarantee, Tp has to satisfy `is_nothrow_move_constructible_v` so "
                      "that it can be reverted to the previous state in case an exception is thrown during swap.");
-        _Tp __tmp(std::move(__with_val.__union_.__val_));
-        std::destroy_at(std::addressof(__with_val.__union_.__val_));
-        auto __trans = std::__make_exception_guard([&] {
-          std::construct_at(std::addressof(__with_val.__union_.__val_), std::move(__tmp));
-        });
-        std::construct_at(std::addressof(__with_val.__union_.__unex_), std::move(__with_err.__union_.__unex_));
+        _Tp __tmp(std::move(__with_val.__val()));
+        __with_val.__destroy();
+        auto __trans = std::__make_exception_guard([&] { __with_val.__construct(in_place, std::move(__tmp)); });
+        __with_val.__construct(unexpect, std::move(__with_err.__unex()));
         __trans.__complete();
-        std::destroy_at(std::addressof(__with_err.__union_.__unex_));
-        std::construct_at(std::addressof(__with_err.__union_.__val_), std::move(__tmp));
+        __with_err.__destroy();
+        __with_err.__construct(in_place, std::move(__tmp));
       }
-      __with_val.__has_val_ = false;
-      __with_err.__has_val_ = true;
     };
 
-    if (__has_val_) {
-      if (__rhs.__has_val_) {
+    if (this->__has_val()) {
+      if (__rhs.__has_val()) {
         using std::swap;
-        swap(__union_.__val_, __rhs.__union_.__val_);
+        swap(this->__val(), __rhs.__val());
       } else {
         __swap_val_unex_impl(*this, __rhs);
       }
     } else {
-      if (__rhs.__has_val_) {
+      if (__rhs.__has_val()) {
         __swap_val_unex_impl(__rhs, *this);
       } else {
         using std::swap;
-        swap(__union_.__unex_, __rhs.__union_.__unex_);
+        swap(this->__unex(), __rhs.__unex());
       }
     }
   }
@@ -456,105 +786,115 @@ class expected {
 
   // [expected.object.obs], observers
   _LIBCPP_HIDE_FROM_ABI constexpr const _Tp* operator->() const noexcept {
value"); - return std::addressof(__union_.__val_); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + this->__has_val(), "expected::operator-> requires the expected to contain a value"); + return std::addressof(this->__val()); } _LIBCPP_HIDE_FROM_ABI constexpr _Tp* operator->() noexcept { - _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__has_val_, "expected::operator-> requires the expected to contain a value"); - return std::addressof(__union_.__val_); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + this->__has_val(), "expected::operator-> requires the expected to contain a value"); + return std::addressof(this->__val()); } _LIBCPP_HIDE_FROM_ABI constexpr const _Tp& operator*() const& noexcept { - _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__has_val_, "expected::operator* requires the expected to contain a value"); - return __union_.__val_; + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + this->__has_val(), "expected::operator* requires the expected to contain a value"); + return this->__val(); } _LIBCPP_HIDE_FROM_ABI constexpr _Tp& operator*() & noexcept { - _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__has_val_, "expected::operator* requires the expected to contain a value"); - return __union_.__val_; + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + this->__has_val(), "expected::operator* requires the expected to contain a value"); + return this->__val(); } _LIBCPP_HIDE_FROM_ABI constexpr const _Tp&& operator*() const&& noexcept { - _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__has_val_, "expected::operator* requires the expected to contain a value"); - return std::move(__union_.__val_); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + this->__has_val(), "expected::operator* requires the expected to contain a value"); + return std::move(this->__val()); } _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& operator*() && noexcept { - _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__has_val_, "expected::operator* requires the expected to contain a value"); - return std::move(__union_.__val_); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + this->__has_val(), "expected::operator* requires the expected to contain a value"); + return std::move(this->__val()); } - _LIBCPP_HIDE_FROM_ABI constexpr explicit operator bool() const noexcept { return __has_val_; } + _LIBCPP_HIDE_FROM_ABI constexpr explicit operator bool() const noexcept { return this->__has_val(); } - _LIBCPP_HIDE_FROM_ABI constexpr bool has_value() const noexcept { return __has_val_; } + _LIBCPP_HIDE_FROM_ABI constexpr bool has_value() const noexcept { return this->__has_val(); } _LIBCPP_HIDE_FROM_ABI constexpr const _Tp& value() const& { static_assert(is_copy_constructible_v<_Err>, "error_type has to be copy constructible"); - if (!__has_val_) { + if (!this->__has_val()) { std::__throw_bad_expected_access<_Err>(std::as_const(error())); } - return __union_.__val_; + return this->__val(); } _LIBCPP_HIDE_FROM_ABI constexpr _Tp& value() & { static_assert(is_copy_constructible_v<_Err>, "error_type has to be copy constructible"); - if (!__has_val_) { + if (!this->__has_val()) { std::__throw_bad_expected_access<_Err>(std::as_const(error())); } - return __union_.__val_; + return this->__val(); } _LIBCPP_HIDE_FROM_ABI constexpr const _Tp&& value() const&& { static_assert(is_copy_constructible_v<_Err> && is_constructible_v<_Err, decltype(std::move(error()))>, "error_type has to be both copy constructible and constructible from decltype(std::move(error()))"); - if (!__has_val_) { + if (!this->__has_val()) { std::__throw_bad_expected_access<_Err>(std::move(error())); } - return std::move(__union_.__val_); + return std::move(this->__val()); } 
   _LIBCPP_HIDE_FROM_ABI constexpr _Tp&& value() && {
     static_assert(is_copy_constructible_v<_Err> && is_constructible_v<_Err, decltype(std::move(error()))>,
                  "error_type has to be both copy constructible and constructible from decltype(std::move(error()))");
-    if (!__has_val_) {
+    if (!this->__has_val()) {
       std::__throw_bad_expected_access<_Err>(std::move(error()));
     }
-    return std::move(__union_.__val_);
+    return std::move(this->__val());
   }
 
   _LIBCPP_HIDE_FROM_ABI constexpr const _Err& error() const& noexcept {
-    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!__has_val_, "expected::error requires the expected to contain an error");
-    return __union_.__unex_;
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        !this->__has_val(), "expected::error requires the expected to contain an error");
+    return this->__unex();
   }
 
   _LIBCPP_HIDE_FROM_ABI constexpr _Err& error() & noexcept {
-    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!__has_val_, "expected::error requires the expected to contain an error");
-    return __union_.__unex_;
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        !this->__has_val(), "expected::error requires the expected to contain an error");
+    return this->__unex();
   }
 
   _LIBCPP_HIDE_FROM_ABI constexpr const _Err&& error() const&& noexcept {
-    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!__has_val_, "expected::error requires the expected to contain an error");
-    return std::move(__union_.__unex_);
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        !this->__has_val(), "expected::error requires the expected to contain an error");
+    return std::move(this->__unex());
  }
 
   _LIBCPP_HIDE_FROM_ABI constexpr _Err&& error() && noexcept {
-    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!__has_val_, "expected::error requires the expected to contain an error");
-    return std::move(__union_.__unex_);
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
+        !this->__has_val(), "expected::error requires the expected to contain an error");
+    return std::move(this->__unex());
   }
 
   template <class _Up>
   _LIBCPP_HIDE_FROM_ABI constexpr _Tp value_or(_Up&& __v) const& {
     static_assert(is_copy_constructible_v<_Tp>, "value_type has to be copy constructible");
     static_assert(is_convertible_v<_Up, _Tp>, "argument has to be convertible to value_type");
-    return __has_val_ ? __union_.__val_ : static_cast<_Tp>(std::forward<_Up>(__v));
+    return this->__has_val() ? this->__val() : static_cast<_Tp>(std::forward<_Up>(__v));
   }
 
   template <class _Up>
   _LIBCPP_HIDE_FROM_ABI constexpr _Tp value_or(_Up&& __v) && {
     static_assert(is_move_constructible_v<_Tp>, "value_type has to be move constructible");
     static_assert(is_convertible_v<_Up, _Tp>, "argument has to be convertible to value_type");
-    return __has_val_ ? std::move(__union_.__val_) : static_cast<_Tp>(std::forward<_Up>(__v));
+    return this->__has_val() ? std::move(this->__val()) : static_cast<_Tp>(std::forward<_Up>(__v));
   }
 
   template <class _Func>
@@ -584,7 +924,7 @@ class expected {
     static_assert(is_same_v<typename _Up::error_type, _Err>,
                  "The result of f(**this) must have the same error_type as this expected");
     if (has_value()) {
-      return std::invoke(std::forward<_Func>(__f), __union_.__val_);
+      return std::invoke(std::forward<_Func>(__f), this->__val());
     }
     return _Up(unexpect, error());
   }
@@ -597,7 +937,7 @@ class expected {
     static_assert(is_same_v<typename _Up::error_type, _Err>,
                  "The result of f(**this) must have the same error_type as this expected");
     if (has_value()) {
-      return std::invoke(std::forward<_Func>(__f), __union_.__val_);
+      return std::invoke(std::forward<_Func>(__f), this->__val());
     }
     return _Up(unexpect, error());
   }
@@ -611,7 +951,7 @@ class expected {
     static_assert(is_same_v<typename _Up::error_type, _Err>,
                  "The result of f(std::move(**this)) must have the same error_type as this expected");
     if (has_value()) {
-      return std::invoke(std::forward<_Func>(__f), std::move(__union_.__val_));
+      return std::invoke(std::forward<_Func>(__f), std::move(this->__val()));
     }
     return _Up(unexpect, std::move(error()));
   }
@@ -625,7 +965,7 @@ class expected {
     static_assert(is_same_v<typename _Up::error_type, _Err>,
                  "The result of f(std::move(**this)) must have the same error_type as this expected");
     if (has_value()) {
-      return std::invoke(std::forward<_Func>(__f), std::move(__union_.__val_));
+      return std::invoke(std::forward<_Func>(__f), std::move(this->__val()));
     }
     return _Up(unexpect, std::move(error()));
   }
@@ -638,7 +978,7 @@ class expected {
     static_assert(is_same_v<typename _Gp::value_type, _Tp>,
                  "The result of f(error()) must have the same value_type as this expected");
     if (has_value()) {
-      return _Gp(in_place, __union_.__val_);
+      return _Gp(in_place, this->__val());
     }
     return std::invoke(std::forward<_Func>(__f), error());
   }
@@ -651,7 +991,7 @@ class expected {
     static_assert(is_same_v<typename _Gp::value_type, _Tp>,
                  "The result of f(error()) must have the same value_type as this expected");
     if (has_value()) {
-      return _Gp(in_place, __union_.__val_);
+      return _Gp(in_place, this->__val());
     }
     return std::invoke(std::forward<_Func>(__f), error());
   }
@@ -665,7 +1005,7 @@ class expected {
     static_assert(is_same_v<typename _Gp::value_type, _Tp>,
                  "The result of f(std::move(error())) must have the same value_type as this expected");
     if (has_value()) {
-      return _Gp(in_place, std::move(__union_.__val_));
+      return _Gp(in_place, std::move(this->__val()));
     }
     return std::invoke(std::forward<_Func>(__f), std::move(error()));
   }
@@ -679,7 +1019,7 @@ class expected {
     static_assert(is_same_v<typename _Gp::value_type, _Tp>,
                  "The result of f(std::move(error())) must have the same value_type as this expected");
     if (has_value()) {
-      return _Gp(in_place, std::move(__union_.__val_));
+      return _Gp(in_place, std::move(this->__val()));
     }
     return std::invoke(std::forward<_Func>(__f), std::move(error()));
   }
@@ -693,9 +1033,9 @@ class expected {
     }
     if constexpr (!is_void_v<_Up>) {
       return expected<_Up, _Err>(
-          __expected_construct_in_place_from_invoke_tag{}, std::forward<_Func>(__f), __union_.__val_);
+          __expected_construct_in_place_from_invoke_tag{}, std::forward<_Func>(__f), this->__val());
     } else {
-      std::invoke(std::forward<_Func>(__f), __union_.__val_);
+      std::invoke(std::forward<_Func>(__f), this->__val());
       return expected<_Up, _Err>();
     }
   }
@@ -709,9 +1049,9 @@ class expected {
     }
     if constexpr (!is_void_v<_Up>) {
       return expected<_Up, _Err>(
-          __expected_construct_in_place_from_invoke_tag{}, std::forward<_Func>(__f), __union_.__val_);
+          __expected_construct_in_place_from_invoke_tag{}, std::forward<_Func>(__f), this->__val());
     } else {
-      std::invoke(std::forward<_Func>(__f), __union_.__val_);
std::invoke(std::forward<_Func>(__f), this->__val()); return expected<_Up, _Err>(); } } @@ -725,9 +1065,9 @@ class expected { } if constexpr (!is_void_v<_Up>) { return expected<_Up, _Err>( - __expected_construct_in_place_from_invoke_tag{}, std::forward<_Func>(__f), std::move(__union_.__val_)); + __expected_construct_in_place_from_invoke_tag{}, std::forward<_Func>(__f), std::move(this->__val())); } else { - std::invoke(std::forward<_Func>(__f), std::move(__union_.__val_)); + std::invoke(std::forward<_Func>(__f), std::move(this->__val())); return expected<_Up, _Err>(); } } @@ -741,9 +1081,9 @@ class expected { } if constexpr (!is_void_v<_Up>) { return expected<_Up, _Err>( - __expected_construct_in_place_from_invoke_tag{}, std::forward<_Func>(__f), std::move(__union_.__val_)); + __expected_construct_in_place_from_invoke_tag{}, std::forward<_Func>(__f), std::move(this->__val())); } else { - std::invoke(std::forward<_Func>(__f), std::move(__union_.__val_)); + std::invoke(std::forward<_Func>(__f), std::move(this->__val())); return expected<_Up, _Err>(); } } @@ -755,7 +1095,7 @@ class expected { static_assert(__valid_std_unexpected<_Gp>::value, "The result of f(error()) must be a valid template argument for unexpected"); if (has_value()) { - return expected<_Tp, _Gp>(in_place, __union_.__val_); + return expected<_Tp, _Gp>(in_place, this->__val()); } return expected<_Tp, _Gp>(__expected_construct_unexpected_from_invoke_tag{}, std::forward<_Func>(__f), error()); } @@ -767,7 +1107,7 @@ class expected { static_assert(__valid_std_unexpected<_Gp>::value, "The result of f(error()) must be a valid template argument for unexpected"); if (has_value()) { - return expected<_Tp, _Gp>(in_place, __union_.__val_); + return expected<_Tp, _Gp>(in_place, this->__val()); } return expected<_Tp, _Gp>(__expected_construct_unexpected_from_invoke_tag{}, std::forward<_Func>(__f), error()); } @@ -779,7 +1119,7 @@ class expected { static_assert(__valid_std_unexpected<_Gp>::value, "The result of f(std::move(error())) must be a valid template argument for unexpected"); if (has_value()) { - return expected<_Tp, _Gp>(in_place, std::move(__union_.__val_)); + return expected<_Tp, _Gp>(in_place, std::move(this->__val())); } return expected<_Tp, _Gp>( __expected_construct_unexpected_from_invoke_tag{}, std::forward<_Func>(__f), std::move(error())); @@ -792,7 +1132,7 @@ class expected { static_assert(__valid_std_unexpected<_Gp>::value, "The result of f(std::move(error())) must be a valid template argument for unexpected"); if (has_value()) { - return expected<_Tp, _Gp>(in_place, std::move(__union_.__val_)); + return expected<_Tp, _Gp>(in_place, std::move(this->__val())); } return expected<_Tp, _Gp>( __expected_construct_unexpected_from_invoke_tag{}, std::forward<_Func>(__f), std::move(error())); @@ -802,123 +1142,220 @@ class expected { template requires(!is_void_v<_T2>) _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const expected<_T2, _E2>& __y) { - if (__x.__has_val_ != __y.__has_val_) { + if (__x.__has_val() != __y.__has_val()) { return false; } else { - if (__x.__has_val_) { - return __x.__union_.__val_ == __y.__union_.__val_; + if (__x.__has_val()) { + return __x.__val() == __y.__val(); } else { - return __x.__union_.__unex_ == __y.__union_.__unex_; + return __x.__unex() == __y.__unex(); } } } template _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const _T2& __v) { - return __x.__has_val_ && static_cast(__x.__union_.__val_ == __v); + return __x.__has_val() && 
static_cast(__x.__val() == __v); } template _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const unexpected<_E2>& __e) { - return !__x.__has_val_ && static_cast(__x.__union_.__unex_ == __e.error()); + return !__x.__has_val() && static_cast(__x.__unex() == __e.error()); } +}; -private: - template +template +class __expected_void_base { + struct __empty_t {}; + // use named union because [[no_unique_address]] cannot be applied to an unnamed union, + // also guaranteed elision into a potentially-overlapping subobject is unsettled (and + // it's not clear that it's implementable, given that the function is allowed to clobber + // the tail padding) - see https://github.com/itanium-cxx-abi/cxx-abi/issues/107. union __union_t { - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(std::in_place_t, _Args&&... __args) - : __val_(std::forward<_Args>(__args)...) {} + _LIBCPP_HIDE_FROM_ABI constexpr __union_t(const __union_t&) = delete; + _LIBCPP_HIDE_FROM_ABI constexpr __union_t(const __union_t&) + requires(is_copy_constructible_v<_Err> && is_trivially_copy_constructible_v<_Err>) + = default; + _LIBCPP_HIDE_FROM_ABI constexpr __union_t(__union_t&&) = delete; + _LIBCPP_HIDE_FROM_ABI constexpr __union_t(__union_t&&) + requires(is_move_constructible_v<_Err> && is_trivially_move_constructible_v<_Err>) + = default; + _LIBCPP_HIDE_FROM_ABI constexpr __union_t& operator=(const __union_t&) = delete; + _LIBCPP_HIDE_FROM_ABI constexpr __union_t& operator=(__union_t&&) = delete; + + _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(in_place_t) : __empty_() {} template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(std::unexpect_t, _Args&&... __args) + _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(unexpect_t, _Args&&... __args) : __unex_(std::forward<_Args>(__args)...) {} template _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t( - std::__expected_construct_in_place_from_invoke_tag, _Func&& __f, _Args&&... __args) - : __val_(std::invoke(std::forward<_Func>(__f), std::forward<_Args>(__args)...)) {} - - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t( - std::__expected_construct_unexpected_from_invoke_tag, _Func&& __f, _Args&&... __args) + __expected_construct_unexpected_from_invoke_tag, _Func&& __f, _Args&&... 
__args) : __unex_(std::invoke(std::forward<_Func>(__f), std::forward<_Args>(__args)...)) {} - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(bool __has_val, _Union&& __other) { - if (__has_val) - std::construct_at(std::addressof(__val_), std::forward<_Union>(__other).__val_); - else - std::construct_at(std::addressof(__unex_), std::forward<_Union>(__other).__unex_); - } - _LIBCPP_HIDE_FROM_ABI constexpr ~__union_t() - requires(is_trivially_destructible_v<_ValueType> && is_trivially_destructible_v<_ErrorType>) + requires(is_trivially_destructible_v<_Err>) = default; - // the expected's destructor handles this - _LIBCPP_HIDE_FROM_ABI constexpr ~__union_t() {} + // __repr's destructor handles this + _LIBCPP_HIDE_FROM_ABI constexpr ~__union_t() + requires(!is_trivially_destructible_v<_Err>) + {} - _ValueType __val_; - _ErrorType __unex_; + _LIBCPP_NO_UNIQUE_ADDRESS __empty_t __empty_; + _LIBCPP_NO_UNIQUE_ADDRESS _Err __unex_; }; - // use named union because [[no_unique_address]] cannot be applied to an unnamed union, - // also guaranteed elision into a potentially-overlapping subobject is unsettled (and - // it's not clear that it's implementable, given that the function is allowed to clobber - // the tail padding) - see https://github.com/itanium-cxx-abi/cxx-abi/issues/107. - template - requires(is_trivially_move_constructible_v<_ValueType> && is_trivially_move_constructible_v<_ErrorType>) - union __union_t<_ValueType, _ErrorType> { - _LIBCPP_HIDE_FROM_ABI constexpr __union_t(const __union_t&) = default; - _LIBCPP_HIDE_FROM_ABI constexpr __union_t& operator=(const __union_t&) = default; + static constexpr bool __put_flag_in_tail = __fits_in_tail_padding<__union_t, bool>; + static constexpr bool __allow_reusing_expected_tail_padding = !__put_flag_in_tail; + + struct __repr { + _LIBCPP_HIDE_FROM_ABI constexpr explicit __repr() = delete; template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(std::in_place_t, _Args&&... __args) - : __val_(std::forward<_Args>(__args)...) {} + _LIBCPP_HIDE_FROM_ABI constexpr explicit __repr(in_place_t __tag) : __union_(in_place, __tag), __has_val_(true) {} template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(std::unexpect_t, _Args&&... __args) - : __unex_(std::forward<_Args>(__args)...) {} + _LIBCPP_HIDE_FROM_ABI constexpr explicit __repr(unexpect_t __tag, _Args&&... __args) + : __union_(in_place, __tag, std::forward<_Args>(__args)...), __has_val_(false) {} - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t( - std::__expected_construct_in_place_from_invoke_tag, _Func&& __f, _Args&&... __args) - : __val_(std::invoke(std::forward<_Func>(__f), std::forward<_Args>(__args)...)) {} + template + _LIBCPP_HIDE_FROM_ABI constexpr explicit __repr(std::__expected_construct_unexpected_from_invoke_tag __tag, + _Args&&... 
__args) + : __union_(in_place, __tag, std::forward<_Args>(__args)...), __has_val_(false) {} + + template + _LIBCPP_HIDE_FROM_ABI constexpr explicit __repr(bool __has_val, _OtherUnion&& __other) + requires(__allow_reusing_expected_tail_padding) + : __union_(__conditional_no_unique_address_invoke_tag{}, + [&] { return __make_union(__has_val, std::forward<_OtherUnion>(__other)); }), + __has_val_(__has_val) {} + + _LIBCPP_HIDE_FROM_ABI constexpr __repr(const __repr&) = delete; + _LIBCPP_HIDE_FROM_ABI constexpr __repr(const __repr&) + requires(is_copy_constructible_v<_Err> && is_trivially_copy_constructible_v<_Err>) + = default; + _LIBCPP_HIDE_FROM_ABI constexpr __repr(__repr&&) = delete; + _LIBCPP_HIDE_FROM_ABI constexpr __repr(__repr&&) + requires(is_move_constructible_v<_Err> && is_trivially_move_constructible_v<_Err>) + = default; - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t( - std::__expected_construct_unexpected_from_invoke_tag, _Func&& __f, _Args&&... __args) - : __unex_(std::invoke(std::forward<_Func>(__f), std::forward<_Args>(__args)...)) {} + _LIBCPP_HIDE_FROM_ABI constexpr __repr& operator=(const __repr&) = delete; + _LIBCPP_HIDE_FROM_ABI constexpr __repr& operator=(__repr&&) = delete; + + _LIBCPP_HIDE_FROM_ABI constexpr ~__repr() + requires(is_trivially_destructible_v<_Err>) + = default; - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(bool __has_val, _Union&& __other) { + _LIBCPP_HIDE_FROM_ABI constexpr ~__repr() + requires(!is_trivially_destructible_v<_Err>) + { + __destroy_union_member(); + } + + _LIBCPP_HIDE_FROM_ABI constexpr void __destroy_union() + requires(__allow_reusing_expected_tail_padding && is_trivially_destructible_v<_Err>) + { + std::destroy_at(&__union_.__v); + } + + _LIBCPP_HIDE_FROM_ABI constexpr void __destroy_union() + requires(__allow_reusing_expected_tail_padding && !is_trivially_destructible_v<_Err>) + { + __destroy_union_member(); + std::destroy_at(&__union_.__v); + } + + _LIBCPP_HIDE_FROM_ABI constexpr void __construct_union(in_place_t) + requires(__allow_reusing_expected_tail_padding) + { + std::construct_at(&__union_.__v, in_place); + __has_val_ = true; + } + + template + _LIBCPP_HIDE_FROM_ABI constexpr void __construct_union(unexpect_t, _Args&&... 
__args) + requires(__allow_reusing_expected_tail_padding) + { + std::construct_at(&__union_.__v, unexpect, std::forward<_Args>(__args)...); + __has_val_ = false; + } + + private: + template + friend class __expected_void_base; + + _LIBCPP_HIDE_FROM_ABI constexpr void __destroy_union_member() + requires(!is_trivially_destructible_v<_Err>) + { + if (!__has_val_) + std::destroy_at(std::addressof(__union_.__v.__unex_)); + } + + template + _LIBCPP_HIDE_FROM_ABI static constexpr __union_t __make_union(bool __has_val, _OtherUnion&& __other) + requires(__allow_reusing_expected_tail_padding) + { if (__has_val) - std::construct_at(std::addressof(__val_), std::forward<_Union>(__other).__val_); + return __union_t(in_place); else - std::construct_at(std::addressof(__unex_), std::forward<_Union>(__other).__unex_); + return __union_t(unexpect, std::forward<_OtherUnion>(__other).__unex_); } - _LIBCPP_HIDE_FROM_ABI constexpr ~__union_t() - requires(is_trivially_destructible_v<_ValueType> && is_trivially_destructible_v<_ErrorType>) - = default; + _LIBCPP_NO_UNIQUE_ADDRESS __conditional_no_unique_address<__put_flag_in_tail, __union_t> __union_; + _LIBCPP_NO_UNIQUE_ADDRESS bool __has_val_; + }; - // the expected's destructor handles this - _LIBCPP_HIDE_FROM_ABI constexpr ~__union_t() - requires(!is_trivially_destructible_v<_ValueType> || !is_trivially_destructible_v<_ErrorType>) - {} + template + _LIBCPP_HIDE_FROM_ABI static constexpr __repr __make_repr(bool __has_val, _OtherUnion&& __other) + requires(__put_flag_in_tail) + { + if (__has_val) + return __repr(in_place); + else + return __repr(unexpect, std::forward<_OtherUnion>(__other).__unex_); + } - _LIBCPP_NO_UNIQUE_ADDRESS _ValueType __val_; - _LIBCPP_NO_UNIQUE_ADDRESS _ErrorType __unex_; - }; +protected: + template + _LIBCPP_HIDE_FROM_ABI constexpr explicit __expected_void_base(_Args&&... __args) + : __repr_(in_place, std::forward<_Args>(__args)...) {} + + template + _LIBCPP_HIDE_FROM_ABI constexpr explicit __expected_void_base(bool __has_val, _OtherUnion&& __other) + requires(__put_flag_in_tail) + : __repr_(__conditional_no_unique_address_invoke_tag{}, + [&] { return __make_repr(__has_val, std::forward<_OtherUnion>(__other)); }) {} - _LIBCPP_NO_UNIQUE_ADDRESS __union_t<_Tp, _Err> __union_; - bool __has_val_; + _LIBCPP_HIDE_FROM_ABI constexpr void __destroy() { + if constexpr (__put_flag_in_tail) + std::destroy_at(&__repr_.__v); + else + __repr_.__v.__destroy_union(); + } + + template + _LIBCPP_HIDE_FROM_ABI constexpr void __construct(_Tag __tag, _Args&&... 
__args) { + if constexpr (__put_flag_in_tail) + std::construct_at(&__repr_.__v, __tag, std::forward<_Args>(__args)...); + else + __repr_.__v.__construct_union(__tag, std::forward<_Args>(__args)...); + } + + _LIBCPP_HIDE_FROM_ABI constexpr bool __has_val() const { return __repr_.__v.__has_val_; } + _LIBCPP_HIDE_FROM_ABI constexpr __union_t& __union() { return __repr_.__v.__union_.__v; } + _LIBCPP_HIDE_FROM_ABI constexpr const __union_t& __union() const { return __repr_.__v.__union_.__v; } + _LIBCPP_HIDE_FROM_ABI constexpr _Err& __unex() { return __repr_.__v.__union_.__v.__unex_; } + _LIBCPP_HIDE_FROM_ABI constexpr const _Err& __unex() const { return __repr_.__v.__union_.__v.__unex_; } + +private: + _LIBCPP_NO_UNIQUE_ADDRESS __conditional_no_unique_address<__allow_reusing_expected_tail_padding, __repr> __repr_; }; template requires is_void_v<_Tp> -class expected<_Tp, _Err> { +class expected<_Tp, _Err> : private __expected_void_base<_Err> { static_assert(__valid_std_unexpected<_Err>::value, "[expected.void.general] A program that instantiates expected with a E that is not a " "valid argument for unexpected is ill-formed"); @@ -935,6 +1372,8 @@ class expected<_Tp, _Err> { _Not, const expected<_Up, _OtherErr>&>>, _Not, const expected<_Up, _OtherErr>>>>; + using __base = __expected_void_base<_Err>; + public: using value_type = _Tp; using error_type = _Err; @@ -944,7 +1383,7 @@ class expected<_Tp, _Err> { using rebind = expected<_Up, error_type>; // [expected.void.ctor], constructors - _LIBCPP_HIDE_FROM_ABI constexpr expected() noexcept : __has_val_(true) {} + _LIBCPP_HIDE_FROM_ABI constexpr expected() noexcept : __base(in_place) {} _LIBCPP_HIDE_FROM_ABI constexpr expected(const expected&) = delete; @@ -955,7 +1394,7 @@ class expected<_Tp, _Err> { _LIBCPP_HIDE_FROM_ABI constexpr expected(const expected& __rhs) noexcept( is_nothrow_copy_constructible_v<_Err>) // strengthened requires(is_copy_constructible_v<_Err> && !is_trivially_copy_constructible_v<_Err>) - : __union_(__rhs.__has_val_, __rhs.__union_), __has_val_(__rhs.__has_val_) {} + : __base(__rhs.__has_val(), __rhs.__union()) {} _LIBCPP_HIDE_FROM_ABI constexpr expected(expected&&) requires(is_move_constructible_v<_Err> && is_trivially_move_constructible_v<_Err>) @@ -963,93 +1402,93 @@ class expected<_Tp, _Err> { _LIBCPP_HIDE_FROM_ABI constexpr expected(expected&& __rhs) noexcept(is_nothrow_move_constructible_v<_Err>) requires(is_move_constructible_v<_Err> && !is_trivially_move_constructible_v<_Err>) - : __union_(__rhs.__has_val_, std::move(__rhs.__union_)), __has_val_(__rhs.__has_val_) {} + : __base(__rhs.__has_val(), std::move(__rhs.__union())) {} template requires __can_convert<_Up, _OtherErr, const _OtherErr&>::value _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v) expected(const expected<_Up, _OtherErr>& __rhs) noexcept( is_nothrow_constructible_v<_Err, const _OtherErr&>) // strengthened - : __union_(__rhs.__has_val_, __rhs.__union_), __has_val_(__rhs.__has_val_) {} + : __base(__rhs.__has_val(), __rhs.__union()) {} template requires __can_convert<_Up, _OtherErr, _OtherErr>::value _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v<_OtherErr, _Err>) expected(expected<_Up, _OtherErr>&& __rhs) noexcept(is_nothrow_constructible_v<_Err, _OtherErr>) // strengthened - : __union_(__rhs.__has_val_, std::move(__rhs.__union_)), __has_val_(__rhs.__has_val_) {} + : __base(__rhs.__has_val(), std::move(__rhs.__union())) {} template requires is_constructible_v<_Err, const _OtherErr&> _LIBCPP_HIDE_FROM_ABI constexpr 
explicit(!is_convertible_v) expected( const unexpected<_OtherErr>& __unex) noexcept(is_nothrow_constructible_v<_Err, const _OtherErr&>) // strengthened - : __union_(std::unexpect, __unex.error()), __has_val_(false) {} + : __base(unexpect, __unex.error()) {} template requires is_constructible_v<_Err, _OtherErr> _LIBCPP_HIDE_FROM_ABI constexpr explicit(!is_convertible_v<_OtherErr, _Err>) expected(unexpected<_OtherErr>&& __unex) noexcept(is_nothrow_constructible_v<_Err, _OtherErr>) // strengthened - : __union_(std::unexpect, std::move(__unex.error())), __has_val_(false) {} + : __base(unexpect, std::move(__unex.error())) {} - _LIBCPP_HIDE_FROM_ABI constexpr explicit expected(in_place_t) noexcept : __has_val_(true) {} + _LIBCPP_HIDE_FROM_ABI constexpr explicit expected(in_place_t) noexcept : __base(in_place) {} template requires is_constructible_v<_Err, _Args...> _LIBCPP_HIDE_FROM_ABI constexpr explicit expected(unexpect_t, _Args&&... __args) noexcept( is_nothrow_constructible_v<_Err, _Args...>) // strengthened - : __union_(std::unexpect, std::forward<_Args>(__args)...), __has_val_(false) {} + : __base(unexpect, std::forward<_Args>(__args)...) {} template requires is_constructible_v< _Err, initializer_list<_Up>&, _Args... > _LIBCPP_HIDE_FROM_ABI constexpr explicit expected(unexpect_t, initializer_list<_Up> __il, _Args&&... __args) noexcept( is_nothrow_constructible_v<_Err, initializer_list<_Up>&, _Args...>) // strengthened - : __union_(std::unexpect, __il, std::forward<_Args>(__args)...), __has_val_(false) {} + : __base(unexpect, __il, std::forward<_Args>(__args)...) {} private: - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit expected(__expected_construct_in_place_from_invoke_tag, _Func&& __f) - : __has_val_(true) { - std::invoke(std::forward<_Func>(__f)); - } - template _LIBCPP_HIDE_FROM_ABI constexpr explicit expected( __expected_construct_unexpected_from_invoke_tag __tag, _Func&& __f, _Args&&... __args) - : __union_(__tag, std::forward<_Func>(__f), std::forward<_Args>(__args)...), __has_val_(false) {} + : __base(__tag, std::forward<_Func>(__f), std::forward<_Args>(__args)...) {} public: // [expected.void.dtor], destructor - _LIBCPP_HIDE_FROM_ABI constexpr ~expected() - requires is_trivially_destructible_v<_Err> - = default; + _LIBCPP_HIDE_FROM_ABI constexpr ~expected() = default; - _LIBCPP_HIDE_FROM_ABI constexpr ~expected() - requires(!is_trivially_destructible_v<_Err>) - { - if (!__has_val_) { - std::destroy_at(std::addressof(__union_.__unex_)); - } +private: + template + _LIBCPP_HIDE_FROM_ABI constexpr void __reinit_expected(unexpect_t, _Args&&... __args) { + _LIBCPP_ASSERT_INTERNAL(this->__has_val(), "__reinit_expected(unexpect_t, ...) needs value to be set"); + + this->__destroy(); + auto __trans = std::__make_exception_guard([&] { this->__construct(in_place); }); + this->__construct(unexpect, std::forward<_Args>(__args)...); + __trans.__complete(); } - // [expected.void.assign], assignment + _LIBCPP_HIDE_FROM_ABI constexpr void __reinit_expected(in_place_t) { + _LIBCPP_ASSERT_INTERNAL(!this->__has_val(), "__reinit_expected(in_place_t, ...) 
needs value to be unset"); + this->__destroy(); + this->__construct(in_place); + } + +public: + // [expected.void.assign], assignment _LIBCPP_HIDE_FROM_ABI constexpr expected& operator=(const expected&) = delete; _LIBCPP_HIDE_FROM_ABI constexpr expected& operator=(const expected& __rhs) noexcept( is_nothrow_copy_assignable_v<_Err> && is_nothrow_copy_constructible_v<_Err>) // strengthened requires(is_copy_assignable_v<_Err> && is_copy_constructible_v<_Err>) { - if (__has_val_) { - if (!__rhs.__has_val_) { - std::construct_at(std::addressof(__union_.__unex_), __rhs.__union_.__unex_); - __has_val_ = false; + if (this->__has_val()) { + if (!__rhs.__has_val()) { + __reinit_expected(unexpect, __rhs.__unex()); } } else { - if (__rhs.__has_val_) { - std::destroy_at(std::addressof(__union_.__unex_)); - __has_val_ = true; + if (__rhs.__has_val()) { + __reinit_expected(in_place); } else { - __union_.__unex_ = __rhs.__union_.__unex_; + this->__unex() = __rhs.__unex(); } } return *this; @@ -1061,17 +1500,15 @@ class expected<_Tp, _Err> { operator=(expected&& __rhs) noexcept(is_nothrow_move_assignable_v<_Err> && is_nothrow_move_constructible_v<_Err>) requires(is_move_assignable_v<_Err> && is_move_constructible_v<_Err>) { - if (__has_val_) { - if (!__rhs.__has_val_) { - std::construct_at(std::addressof(__union_.__unex_), std::move(__rhs.__union_.__unex_)); - __has_val_ = false; + if (this->__has_val()) { + if (!__rhs.__has_val()) { + __reinit_expected(unexpect, std::move(__rhs.__unex())); } } else { - if (__rhs.__has_val_) { - std::destroy_at(std::addressof(__union_.__unex_)); - __has_val_ = true; + if (__rhs.__has_val()) { + __reinit_expected(in_place); } else { - __union_.__unex_ = std::move(__rhs.__union_.__unex_); + this->__unex() = std::move(__rhs.__unex()); } } return *this; @@ -1080,11 +1517,10 @@ class expected<_Tp, _Err> { template requires(is_constructible_v<_Err, const _OtherErr&> && is_assignable_v<_Err&, const _OtherErr&>) _LIBCPP_HIDE_FROM_ABI constexpr expected& operator=(const unexpected<_OtherErr>& __un) { - if (__has_val_) { - std::construct_at(std::addressof(__union_.__unex_), __un.error()); - __has_val_ = false; + if (this->__has_val()) { + __reinit_expected(unexpect, __un.error()); } else { - __union_.__unex_ = __un.error(); + this->__unex() = __un.error(); } return *this; } @@ -1092,19 +1528,17 @@ class expected<_Tp, _Err> { template requires(is_constructible_v<_Err, _OtherErr> && is_assignable_v<_Err&, _OtherErr>) _LIBCPP_HIDE_FROM_ABI constexpr expected& operator=(unexpected<_OtherErr>&& __un) { - if (__has_val_) { - std::construct_at(std::addressof(__union_.__unex_), std::move(__un.error())); - __has_val_ = false; + if (this->__has_val()) { + __reinit_expected(unexpect, std::move(__un.error())); } else { - __union_.__unex_ = std::move(__un.error()); + this->__unex() = std::move(__un.error()); } return *this; } _LIBCPP_HIDE_FROM_ABI constexpr void emplace() noexcept { - if (!__has_val_) { - std::destroy_at(std::addressof(__union_.__unex_)); - __has_val_ = true; + if (!this->__has_val()) { + __reinit_expected(in_place); } } @@ -1113,23 +1547,23 @@ class expected<_Tp, _Err> { swap(expected& __rhs) noexcept(is_nothrow_move_constructible_v<_Err> && is_nothrow_swappable_v<_Err>) requires(is_swappable_v<_Err> && is_move_constructible_v<_Err>) { - auto __swap_val_unex_impl = [&](expected& __with_val, expected& __with_err) { - std::construct_at(std::addressof(__with_val.__union_.__unex_), std::move(__with_err.__union_.__unex_)); - 
std::destroy_at(std::addressof(__with_err.__union_.__unex_)); - __with_val.__has_val_ = false; - __with_err.__has_val_ = true; + auto __swap_val_unex_impl = [](expected& __with_val, expected& __with_err) { + // May throw, but will re-engage `__with_val` in that case. + __with_val.__reinit_expected(unexpect, std::move(__with_err.__unex())); + // Will not throw. + __with_err.__reinit_expected(in_place); }; - if (__has_val_) { - if (!__rhs.__has_val_) { + if (this->__has_val()) { + if (!__rhs.__has_val()) { __swap_val_unex_impl(*this, __rhs); } } else { - if (__rhs.__has_val_) { + if (__rhs.__has_val()) { __swap_val_unex_impl(__rhs, *this); } else { using std::swap; - swap(__union_.__unex_, __rhs.__union_.__unex_); + swap(this->__unex(), __rhs.__unex()); } } } @@ -1141,46 +1575,51 @@ class expected<_Tp, _Err> { } // [expected.void.obs], observers - _LIBCPP_HIDE_FROM_ABI constexpr explicit operator bool() const noexcept { return __has_val_; } + _LIBCPP_HIDE_FROM_ABI constexpr explicit operator bool() const noexcept { return this->__has_val(); } - _LIBCPP_HIDE_FROM_ABI constexpr bool has_value() const noexcept { return __has_val_; } + _LIBCPP_HIDE_FROM_ABI constexpr bool has_value() const noexcept { return this->__has_val(); } _LIBCPP_HIDE_FROM_ABI constexpr void operator*() const noexcept { - _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(__has_val_, "expected::operator* requires the expected to contain a value"); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + this->__has_val(), "expected::operator* requires the expected to contain a value"); } _LIBCPP_HIDE_FROM_ABI constexpr void value() const& { static_assert(is_copy_constructible_v<_Err>); - if (!__has_val_) { - std::__throw_bad_expected_access<_Err>(__union_.__unex_); + if (!this->__has_val()) { + std::__throw_bad_expected_access<_Err>(this->__unex()); } } _LIBCPP_HIDE_FROM_ABI constexpr void value() && { static_assert(is_copy_constructible_v<_Err> && is_move_constructible_v<_Err>); - if (!__has_val_) { - std::__throw_bad_expected_access<_Err>(std::move(__union_.__unex_)); + if (!this->__has_val()) { + std::__throw_bad_expected_access<_Err>(std::move(this->__unex())); } } _LIBCPP_HIDE_FROM_ABI constexpr const _Err& error() const& noexcept { - _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!__has_val_, "expected::error requires the expected to contain an error"); - return __union_.__unex_; + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + !this->__has_val(), "expected::error requires the expected to contain an error"); + return this->__unex(); } _LIBCPP_HIDE_FROM_ABI constexpr _Err& error() & noexcept { - _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!__has_val_, "expected::error requires the expected to contain an error"); - return __union_.__unex_; + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + !this->__has_val(), "expected::error requires the expected to contain an error"); + return this->__unex(); } _LIBCPP_HIDE_FROM_ABI constexpr const _Err&& error() const&& noexcept { - _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!__has_val_, "expected::error requires the expected to contain an error"); - return std::move(__union_.__unex_); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + !this->__has_val(), "expected::error requires the expected to contain an error"); + return std::move(this->__unex()); } _LIBCPP_HIDE_FROM_ABI constexpr _Err&& error() && noexcept { - _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(!__has_val_, "expected::error requires the expected to contain an error"); - return std::move(__union_.__unex_); + _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS( + !this->__has_val(), "expected::error requires the expected to 
contain an error"); + return std::move(this->__unex()); } template @@ -1416,96 +1855,17 @@ class expected<_Tp, _Err> { template requires is_void_v<_T2> _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const expected<_T2, _E2>& __y) { - if (__x.__has_val_ != __y.__has_val_) { + if (__x.__has_val() != __y.__has_val()) { return false; } else { - return __x.__has_val_ || static_cast(__x.__union_.__unex_ == __y.__union_.__unex_); + return __x.__has_val() || static_cast(__x.__unex() == __y.__unex()); } } template _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator==(const expected& __x, const unexpected<_E2>& __y) { - return !__x.__has_val_ && static_cast(__x.__union_.__unex_ == __y.error()); + return !__x.__has_val() && static_cast(__x.__unex() == __y.error()); } - -private: - struct __empty_t {}; - - template - union __union_t { - _LIBCPP_HIDE_FROM_ABI constexpr __union_t() : __empty_() {} - - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(std::unexpect_t, _Args&&... __args) - : __unex_(std::forward<_Args>(__args)...) {} - - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t( - __expected_construct_unexpected_from_invoke_tag, _Func&& __f, _Args&&... __args) - : __unex_(std::invoke(std::forward<_Func>(__f), std::forward<_Args>(__args)...)) {} - - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(bool __has_val, _Union&& __other) { - if (__has_val) - std::construct_at(std::addressof(__empty_)); - else - std::construct_at(std::addressof(__unex_), std::forward<_Union>(__other).__unex_); - } - - _LIBCPP_HIDE_FROM_ABI constexpr ~__union_t() - requires(is_trivially_destructible_v<_ErrorType>) - = default; - - // the expected's destructor handles this - _LIBCPP_HIDE_FROM_ABI constexpr ~__union_t() {} - - __empty_t __empty_; - _ErrorType __unex_; - }; - - // use named union because [[no_unique_address]] cannot be applied to an unnamed union, - // also guaranteed elision into a potentially-overlapping subobject is unsettled (and - // it's not clear that it's implementable, given that the function is allowed to clobber - // the tail padding) - see https://github.com/itanium-cxx-abi/cxx-abi/issues/107. - template - requires is_trivially_move_constructible_v<_ErrorType> - union __union_t<_ErrorType> { - _LIBCPP_HIDE_FROM_ABI constexpr __union_t() : __empty_() {} - _LIBCPP_HIDE_FROM_ABI constexpr __union_t(const __union_t&) = default; - _LIBCPP_HIDE_FROM_ABI constexpr __union_t& operator=(const __union_t&) = default; - - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(std::unexpect_t, _Args&&... __args) - : __unex_(std::forward<_Args>(__args)...) {} - - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t( - __expected_construct_unexpected_from_invoke_tag, _Func&& __f, _Args&&... 
__args) - : __unex_(std::invoke(std::forward<_Func>(__f), std::forward<_Args>(__args)...)) {} - - template - _LIBCPP_HIDE_FROM_ABI constexpr explicit __union_t(bool __has_val, _Union&& __other) { - if (__has_val) - std::construct_at(std::addressof(__empty_)); - else - std::construct_at(std::addressof(__unex_), std::forward<_Union>(__other).__unex_); - } - - _LIBCPP_HIDE_FROM_ABI constexpr ~__union_t() - requires(is_trivially_destructible_v<_ErrorType>) - = default; - - // the expected's destructor handles this - _LIBCPP_HIDE_FROM_ABI constexpr ~__union_t() - requires(!is_trivially_destructible_v<_ErrorType>) - {} - - _LIBCPP_NO_UNIQUE_ADDRESS __empty_t __empty_; - _LIBCPP_NO_UNIQUE_ADDRESS _ErrorType __unex_; - }; - - _LIBCPP_NO_UNIQUE_ADDRESS __union_t<_Err> __union_; - bool __has_val_; }; _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp index c8c217054ae6c..cf1909b928732 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.expected/no_unique_address.compile.pass.cpp @@ -12,7 +12,10 @@ // test [[no_unique_address]] is applied to the union +#include <__type_traits/datasizeof.h> #include +#include +#include struct Empty {}; @@ -23,13 +26,49 @@ struct A { struct B : public A { int z_; + short z2_; virtual ~B() = default; }; +struct BoolWithPadding { + explicit operator bool() { return b; } + +private: + alignas(1024) bool b = false; +}; + static_assert(sizeof(std::expected) == sizeof(bool)); static_assert(sizeof(std::expected) == 2 * sizeof(int) + alignof(std::expected)); -static_assert(sizeof(std::expected) == sizeof(B) + alignof(std::expected)); +static_assert(sizeof(std::expected) == sizeof(B)); static_assert(sizeof(std::expected) == 2 * sizeof(int) + alignof(std::expected)); static_assert(sizeof(std::expected) == 2 * sizeof(int) + alignof(std::expected)); -static_assert(sizeof(std::expected) == sizeof(B) + alignof(std::expected)); -static_assert(sizeof(std::expected) == sizeof(B) + alignof(std::expected)); +static_assert(sizeof(std::expected) == sizeof(B)); +static_assert(sizeof(std::expected) == sizeof(B)); + +// Check that `expected`'s datasize is large enough for the parameter type(s). +static_assert(sizeof(std::expected) == + std::__libcpp_datasizeof>::value); +static_assert(sizeof(std::expected) == + std::__libcpp_datasizeof>::value); + +// In this case, there should be tail padding in the `expected` because `A` +// itself does _not_ have tail padding. +static_assert(sizeof(std::expected) > std::__libcpp_datasizeof>::value); + +// Test with some real types. 
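The static_asserts in this test rely on std::__libcpp_datasizeof, an internal trait that reports the offset of a type's first tail-padding byte. As a rough sketch of how such a trait can be written (assuming an Itanium-ABI compiler that honors [[no_unique_address]]; MSVC parses the standard attribute but ignores it, which is why these tests carry an msvc XFAIL), with FirstPaddingByte and datasize as invented, illustrative names rather than the library's:

    #include <cstddef>

    // Place T as a potentially-overlapping member and append a char. If T has
    // tail padding, the char is laid out inside that padding, so its offset
    // equals T's data size rather than sizeof(T).
    template <class T>
    struct FirstPaddingByte {
      [[no_unique_address]] T v;
      char first_padding_byte;
    };

    template <class T>
    constexpr std::size_t datasize = offsetof(FirstPaddingByte<T>, first_padding_byte);

    struct IntBool {
      int i;   // bytes 0-3
      bool b;  // byte 4, followed by 3 bytes of tail padding
    };
    static_assert(datasize<IntBool> == 5);  // the char landed in the padding
    static_assert(datasize<int> == 4);      // no tail padding: offset == sizeof(int)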
+static_assert(sizeof(std::expected, int>) == 8); +static_assert(std::__libcpp_datasizeof, int>>::value == 8); + +static_assert(sizeof(std::expected>) == 8); +static_assert(std::__libcpp_datasizeof>>::value == 8); + +static_assert(sizeof(std::expected) == 8); +static_assert(std::__libcpp_datasizeof>::value == 5); + +// clang-format off +static_assert(std::__libcpp_datasizeof::value == 4); +static_assert(std::__libcpp_datasizeof>::value == 5); +static_assert(std::__libcpp_datasizeof, int>>::value == 8); +static_assert(std::__libcpp_datasizeof, int>, int>>::value == 9); +static_assert(std::__libcpp_datasizeof, int>, int>, int>>::value == 12); +// clang-format on diff --git a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp index 82024a068f00d..5bf094bf37709 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.expected/transform_error.mandates.verify.cpp @@ -7,11 +7,15 @@ //===----------------------------------------------------------------------===// // Clang-18 fixed some spurious clang diagnostics. Once clang-18 is the -// minumum required version these obsolete tests can be removed. +// minimum required version these obsolete tests can be removed. // TODO(LLVM-20) remove spurious clang diagnostic tests. // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// With clang-cl, some warnings have a 'which is a Microsoft extension' suffix +// which break the tests. +// XFAIL: msvc + // Test the mandates // template constexpr auto transform_error(F&& f) &; @@ -52,6 +56,7 @@ void test() { e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}[expected.object.general] A program that instantiates the definition of template expected for {{.*}} is ill-formed.}} + // expected-error-re@*:* {{union member {{.*}} has reference type {{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp index dc59a6228386b..fdee8b71e5d98 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.void/no_unique_address.compile.pass.cpp @@ -11,8 +11,13 @@ // XFAIL: msvc // test [[no_unique_address]] is applied to the union +// In particular, we ensure that we reuse tail padding in the T +// to store the discriminator whenever we can. 
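A minimal sketch of the layout that comment describes, using invented stand-ins (Err for the error type, ExpectedVoidSketch for the library's internal representation) and again assuming a compiler that honors [[no_unique_address]]:

    struct Empty {};

    struct Err {   // 4-byte int plus 1-byte bool: 3 bytes of tail padding
      int code;
      bool fatal;
    };

    // Both union members and the union itself are potentially-overlapping
    // subobjects, so the discriminator placed after the union can be stored
    // in Err's tail padding instead of growing the object.
    struct ExpectedVoidSketch {
      union U {
        [[no_unique_address]] Empty empty;
        [[no_unique_address]] Err unex;
      };
      [[no_unique_address]] U union_;
      bool has_val_;
    };

    // 8 bytes rather than 12: has_val_ sits at offset 5, inside the padding.
    static_assert(sizeof(ExpectedVoidSketch) == sizeof(Err));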
+#include <__type_traits/datasizeof.h> #include +#include +#include struct Empty {}; @@ -23,9 +28,40 @@ struct A { struct B : public A { int z_; + short z2_; virtual ~B() = default; }; +struct BoolWithPadding { + explicit operator bool() { return b; } + +private: + alignas(1024) bool b = false; +}; + static_assert(sizeof(std::expected) == sizeof(bool)); static_assert(sizeof(std::expected) == 2 * sizeof(int) + alignof(std::expected)); -static_assert(sizeof(std::expected) == sizeof(B) + alignof(std::expected)); +static_assert(sizeof(std::expected) == sizeof(B)); + +// Check that `expected`'s datasize is large enough for the parameter type(s). +static_assert(sizeof(std::expected) == + std::__libcpp_datasizeof>::value); + +// In this case, there should be tail padding in the `expected` because `A` +// itself does _not_ have tail padding. +static_assert(sizeof(std::expected) > std::__libcpp_datasizeof>::value); + +// Test with some real types. +static_assert(sizeof(std::expected>) == 8); +static_assert(std::__libcpp_datasizeof>>::value == 8); + +static_assert(sizeof(std::expected) == 8); +static_assert(std::__libcpp_datasizeof>::value == 5); + +// clang-format off +static_assert(std::__libcpp_datasizeof::value == 4); +static_assert(std::__libcpp_datasizeof>::value == 5); +static_assert(std::__libcpp_datasizeof>>::value == 8); +static_assert(std::__libcpp_datasizeof>>>::value == 9); +static_assert(std::__libcpp_datasizeof>>>>::value == 12); +// clang-format on diff --git a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp index 0f2fb581ccd70..4f4f5839361e2 100644 --- a/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp +++ b/libcxx/test/libcxx/utilities/expected/expected.void/transform_error.mandates.verify.cpp @@ -12,6 +12,10 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 +// With clang-cl, some warnings have a 'which is a Microsoft extension' suffix +// which break the tests. 
+// XFAIL: msvc + // Test the mandates // template constexpr auto transform_error(F&& f) &; @@ -52,6 +56,8 @@ void test() { e.transform_error(return_unexpected); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} // expected-error-re@*:* {{static assertion failed {{.*}}A program that instantiates expected with a E that is not a valid argument for unexpected is ill-formed}} + // expected-error-re@*:* {{call to deleted constructor of {{.*}}}} + // expected-error-re@*:* {{union member {{.*}} has reference type {{.*}}}} e.transform_error(return_no_object); // expected-error-re@*:* {{static assertion failed {{.*}}The result of {{.*}} must be a valid template argument for unexpected}} // expected-error-re@*:* 0-1 {{{{(excess elements in struct initializer|no matching constructor for initialization of)}}{{.*}}}} diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp index 1c8750b7acd10..2d3b03689b1d9 100644 --- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.U.pass.cpp @@ -310,6 +310,20 @@ constexpr bool test() { assert(e.value().j == 8); } + // CheckForInvalidWrites + { + { + CheckForInvalidWrites e1(std::unexpect); + e1 = 42; + assert(e1.check()); + } + { + CheckForInvalidWrites e1(std::unexpect); + e1 = true; + assert(e1.check()); + } + } + return true; } diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp index 03fe888b0a5e7..2f5291332355c 100644 --- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.copy.pass.cpp @@ -240,6 +240,29 @@ constexpr bool test() { assert(e1.error().data_ == 10); assert(oldState.copyAssignCalled); } + + // CheckForInvalidWrites + { + { + CheckForInvalidWrites e1(std::unexpect); + CheckForInvalidWrites e2; + + e1 = e2; + + assert(e1.check()); + assert(e2.check()); + } + { + CheckForInvalidWrites e1(std::unexpect); + CheckForInvalidWrites e2; + + e1 = e2; + + assert(e1.check()); + assert(e2.check()); + } + } + return true; } diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp index 8c419afd10729..065827ae652c9 100644 --- a/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.expected/assign/assign.move.pass.cpp @@ -258,6 +258,29 @@ constexpr bool test() { assert(e1.error().data_ == 10); assert(oldState.moveAssignCalled); } + + // CheckForInvalidWrites + { + { + CheckForInvalidWrites e1(std::unexpect); + CheckForInvalidWrites e2; + + e1 = std::move(e2); + + assert(e1.check()); + assert(e2.check()); + } + { + CheckForInvalidWrites e1(std::unexpect); + CheckForInvalidWrites e2; + + e1 = std::move(e2); + + assert(e1.check()); + assert(e2.check()); + } + } + return true; } diff --git a/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp 
b/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp
index 491de2dff0331..7e37f8bdf4abc 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/assign/emplace.pass.cpp
@@ -80,6 +80,20 @@ constexpr bool test() {
     assert(e.has_value());
   }
 
+  // CheckForInvalidWrites
+  {
+    {
+      CheckForInvalidWrites<true> e;
+      e.emplace();
+      assert(e.check());
+    }
+    {
+      CheckForInvalidWrites<false> e;
+      e.emplace();
+      assert(e.check());
+    }
+  }
+
   return true;
 }
 
diff --git a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp
index f443613fe7f34..d38a46f045248 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform.pass.cpp
@@ -9,8 +9,8 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // GCC has an issue with `Guaranteed copy elision for potentially-overlapping non-static data members`,
-// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333, but we have a workaround to
-// avoid this issue.
+// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333
+// XFAIL: gcc-13
 
 // <expected>
 
diff --git a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp
index 84b57aea3cd23..ec55f637f0202 100644
--- a/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp
+++ b/libcxx/test/std/utilities/expected/expected.expected/monadic/transform_error.pass.cpp
@@ -9,8 +9,8 @@
 // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20
 
 // GCC has an issue with `Guaranteed copy elision for potentially-overlapping non-static data members`,
-// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333, but we have a workaround to
-// avoid this issue.
+// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333.
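Seen in isolation, the elision problem behind that bug report looks like the following sketch (names invented; the behavior description follows the itanium-cxx-abi discussion cited in the header comment earlier in this patch):

    struct Small {
      int i;
      bool b;  // 3 bytes of tail padding follow
    };

    Small make() { return Small{42, true}; }

    struct Holder {
      [[no_unique_address]] Small s;  // potentially-overlapping subobject
      bool flag;                      // may be stored in s's tail padding
      // Constructing make()'s return value directly into s (guaranteed
      // elision) would let make() write s's tail padding and thereby clobber
      // flag, which is why elision here is unsettled; implementations fall
      // back to materializing a temporary and copying. GCC 13 has trouble
      // with this pattern (the bug linked above), hence the XFAIL.
      Holder() : s(make()), flag(true) {}
    };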
+// XFAIL: gcc-13 // diff --git a/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp b/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp index 34782303909e2..f19599d43b32d 100644 --- a/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.expected/swap/member.swap.pass.cpp @@ -227,6 +227,28 @@ constexpr bool test() { } } + // CheckForInvalidWrites + { + { + CheckForInvalidWrites x(std::unexpect); + CheckForInvalidWrites y; + + x.swap(y); + + assert(x.check()); + assert(y.check()); + } + { + CheckForInvalidWrites x(std::unexpect); + CheckForInvalidWrites y; + + x.swap(y); + + assert(x.check()); + assert(y.check()); + } + } + return true; } diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp index b1968bcc24240..a51916f874eea 100644 --- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.copy.pass.cpp @@ -99,6 +99,28 @@ constexpr bool test() { assert(state.copyAssignCalled); } + // CheckForInvalidWrites + { + { + CheckForInvalidWrites e1; + CheckForInvalidWrites e2(std::unexpect); + + e1 = e2; + + assert(e1.check()); + assert(e2.check()); + } + { + CheckForInvalidWrites e1; + CheckForInvalidWrites e2(std::unexpect); + + e1 = e2; + + assert(e1.check()); + assert(e2.check()); + } + } + return true; } diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp index e6a29cd62583b..60ae0346f7af1 100644 --- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.move.pass.cpp @@ -125,6 +125,28 @@ constexpr bool test() { assert(state.moveAssignCalled); } + // CheckForInvalidWrites + { + { + CheckForInvalidWrites e1; + CheckForInvalidWrites e2(std::unexpect); + + e1 = std::move(e2); + + assert(e1.check()); + assert(e2.check()); + } + { + CheckForInvalidWrites e1; + CheckForInvalidWrites e2(std::unexpect); + + e1 = std::move(e2); + + assert(e1.check()); + assert(e2.check()); + } + } + return true; } diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp index 1ae96530e528e..699597d243c19 100644 --- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.copy.pass.cpp @@ -91,6 +91,22 @@ constexpr bool test() { assert(state1.copyAssignCalled); } + // CheckForInvalidWrites + { + { + CheckForInvalidWrites e; + std::unexpected un(std::in_place, 42); + e = un; + assert(e.check()); + } + { + CheckForInvalidWrites e; + std::unexpected un(std::in_place, true); + e = un; + assert(e.check()); + } + } + return true; } diff --git a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp index ea947715af28f..641eb4927a724 100644 --- a/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.void/assign/assign.unexpected.move.pass.cpp 
@@ -172,6 +172,23 @@ constexpr bool test() { assert(e1.error().data_ == 10); assert(oldState.moveAssignCalled); } + + // CheckForInvalidWrites + { + { + CheckForInvalidWrites e; + std::unexpected un(std::in_place, 42); + e = std::move(un); + assert(e.check()); + } + { + CheckForInvalidWrites e; + std::unexpected un(std::in_place, true); + e = std::move(un); + assert(e.check()); + } + } + return true; } diff --git a/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp index f0e19ac3c3982..cd6e5a5038d28 100644 --- a/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.void/monadic/transform_error.pass.cpp @@ -9,8 +9,8 @@ // UNSUPPORTED: c++03, c++11, c++14, c++17, c++20 // GCC has a issue for `Guaranteed copy elision for potentially-overlapping non-static data members`, -// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333, but we have a workaround to -// avoid this issue. +// please refer to: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=108333 +// XFAIL: gcc-13 // diff --git a/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp b/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp index 07980dee13c1e..25601af39a316 100644 --- a/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp +++ b/libcxx/test/std/utilities/expected/expected.void/swap/member.swap.pass.cpp @@ -139,6 +139,28 @@ constexpr bool test() { assert(y.has_value()); } + // CheckForInvalidWrites + { + { + CheckForInvalidWrites x(std::unexpect); + CheckForInvalidWrites y; + + x.swap(y); + + assert(x.check()); + assert(y.check()); + } + { + CheckForInvalidWrites x(std::unexpect); + CheckForInvalidWrites y; + + x.swap(y); + + assert(x.check()); + assert(y.check()); + } + } + return true; } diff --git a/libcxx/test/std/utilities/expected/types.h b/libcxx/test/std/utilities/expected/types.h index 90768ed8e7520..2b6983fb399c6 100644 --- a/libcxx/test/std/utilities/expected/types.h +++ b/libcxx/test/std/utilities/expected/types.h @@ -199,4 +199,141 @@ static_assert(!std::is_trivially_move_constructible_v>); static_assert(!std::is_nothrow_move_constructible_v>); +// The `CheckForInvalidWrites` class recreates situations where other objects +// may be placed into a `std::expected`'s tail padding (see +// https://github.com/llvm/llvm-project/issues/70494). With a template +// parameter `WithPaddedExpected` two cases can be tested: +// +// 1. The `std::expected` itself has padding, because `T`/`E` _don't_ +// have tail padding. This is modelled by `CheckForInvalidWrites` +// which has a (potential) data layout like this: +// +// +- `expected`'s "has value" flag +// | +// | +- `please_dont_overwrite_me` +// | | +// /---int---\ | /----------^-------\ // +// 00 00 00 00 01 01 01 01 01 01 01 01 +// \--v---/ +// | +// | +// +- `expected`'s tail padding which +// gets repurposed by `please_dont_overwrite_me` +// +// 2. There is tail padding in the union of `T` and `E` which means the +// "has value" flag can be put into this tail padding. In this case, the +// `std::expected` itself _must not_ have any tail padding as it may get +// overwritten on mutating operations such as `emplace()`. 
This case is +// modelled by `CheckForInvalidWrites` with a (potential) data +// layout like this: +// +// +- bool +// | +- please_dont_overwrite_me +// | +- "has value" flag | +// | | /--------^---------\ // +// 00 00 00 00 00 00 00 00 01 01 01 01 01 01 01 00 +// \---padding-----/ | +// +- `CheckForInvalidWrites` +// padding +// +// Note that other implementation strategies are viable, including one that +// doesn't make use of `[[no_unique_address]]`. But if an implementation uses +// the strategy above, it must make sure that those tail padding bytes are not +// overwritten improperly on operations such as `emplace()`. + +struct BoolWithPadding { + constexpr explicit BoolWithPadding() noexcept : BoolWithPadding(false) {} + constexpr BoolWithPadding(bool val) noexcept { + if (!std::is_constant_evaluated()) { + std::memset(this, 0, sizeof(*this)); + } + val_ = val; + } + constexpr BoolWithPadding(const BoolWithPadding& other) noexcept : BoolWithPadding(other.val_) {} + constexpr BoolWithPadding& operator=(const BoolWithPadding& other) noexcept { + val_ = other.val_; + return *this; + } + // The previous data layout of libc++'s `expected` required `T` to be + // trivially move constructible to employ the `[[no_unique_address]]` + // optimization. To trigger bugs with the old implementation, make + // `BoolWithPadding` trivially move constructible. + constexpr BoolWithPadding(BoolWithPadding&&) = default; + +private: + alignas(8) bool val_; +}; + +struct IntWithoutPadding { + constexpr explicit IntWithoutPadding() noexcept : IntWithoutPadding(0) {} + constexpr IntWithoutPadding(int val) noexcept { + if (!std::is_constant_evaluated()) { + std::memset(this, 0, sizeof(*this)); + } + val_ = val; + } + constexpr IntWithoutPadding(const IntWithoutPadding& other) noexcept : IntWithoutPadding(other.val_) {} + constexpr IntWithoutPadding& operator=(const IntWithoutPadding& other) noexcept { + val_ = other.val_; + return *this; + } + // See comment on `BoolWithPadding`. 
+ constexpr IntWithoutPadding(IntWithoutPadding&&) = default; + +private: + int val_; +}; + +template +struct CheckForInvalidWritesBaseImpl; +template <> +struct CheckForInvalidWritesBaseImpl { + using type = std::expected; +}; +template <> +struct CheckForInvalidWritesBaseImpl { + using type = std::expected; +}; +template <> +struct CheckForInvalidWritesBaseImpl { + using type = std::expected; +}; +template <> +struct CheckForInvalidWritesBaseImpl { + using type = std::expected; +}; + +template +using CheckForInvalidWritesBase = typename CheckForInvalidWritesBaseImpl::type; + +template +struct CheckForInvalidWrites : public CheckForInvalidWritesBase { + constexpr CheckForInvalidWrites() = default; + constexpr CheckForInvalidWrites(std::unexpect_t) + : CheckForInvalidWritesBase(std::unexpect) {} + + constexpr CheckForInvalidWrites& operator=(const CheckForInvalidWrites& other) { + CheckForInvalidWritesBase::operator=(other); + return *this; + } + + constexpr CheckForInvalidWrites& operator=(CheckForInvalidWrites&& other) { + CheckForInvalidWritesBase::operator=(std::move(other)); + return *this; + } + + using CheckForInvalidWritesBase::operator=; + + const bool please_dont_overwrite_me[7] = {true, true, true, true, true, true, true}; + + constexpr bool check() { + for (bool i : please_dont_overwrite_me) { + if (!i) { + return false; + } + } + return true; + } +}; + #endif // TEST_STD_UTILITIES_EXPECTED_TYPES_H From bfef161a80b62723bedad996aa7a697f99e6802a Mon Sep 17 00:00:00 2001 From: chuongg3 Date: Mon, 22 Jan 2024 14:08:26 +0000 Subject: [PATCH 389/843] [AArch64][GlobalISel] Legalize Shifts for Smaller/Larger Vectors (#78750) Legalize shl/lshr/ashr for smaller/larger vector widths with legal element sizes Smaller than legal vector types does not work at the moment as it relies on G_ANYEXT to work with smaller than legal vector types --- .../CodeGen/GlobalISel/LegalizerHelper.cpp | 5 +- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 2 + llvm/test/CodeGen/AArch64/fcmp.ll | 303 ++++++++++-------- llvm/test/CodeGen/AArch64/icmp.ll | 34 +- llvm/test/CodeGen/AArch64/sext.ll | 71 ++-- llvm/test/CodeGen/AArch64/shift.ll | 234 +++++++++----- 6 files changed, 412 insertions(+), 237 deletions(-) diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp index a868860f343ba..3b2cf31910927 100644 --- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp @@ -5198,7 +5198,10 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx, case TargetOpcode::G_FMAXIMUM: case TargetOpcode::G_STRICT_FADD: case TargetOpcode::G_STRICT_FSUB: - case TargetOpcode::G_STRICT_FMUL: { + case TargetOpcode::G_STRICT_FMUL: + case TargetOpcode::G_SHL: + case TargetOpcode::G_ASHR: + case TargetOpcode::G_LSHR: { Observer.changingInstr(MI); moreElementsVectorSrc(MI, MoreTy, 1); moreElementsVectorSrc(MI, MoreTy, 2); diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index ae7ed08194d0d..fd69a7d6c33d0 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -169,6 +169,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(0) .clampScalar(1, s32, s64) .clampScalar(0, s32, s64) + .clampNumElements(0, v8s8, v16s8) + .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v2s32, v4s32) .clampNumElements(0, v2s64, v2s64) 
.moreElementsToNextPow2(0) diff --git a/llvm/test/CodeGen/AArch64/fcmp.ll b/llvm/test/CodeGen/AArch64/fcmp.ll index 82e29d0f8a194..d3bc7fc6dc063 100644 --- a/llvm/test/CodeGen/AArch64/fcmp.ll +++ b/llvm/test/CodeGen/AArch64/fcmp.ll @@ -8,9 +8,9 @@ ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f64_i32 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f32_float ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v3f32_i32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v7f16_half +; CHECK-GI-FP16-NEXT: warning: Instruction selection used fallback path for v7f16_half ; CHECK-GI-FP16-NEXT: warning: Instruction selection used fallback path for v16f16_half -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for v7f16_i32 +; CHECK-GI-FP16-NEXT: warning: Instruction selection used fallback path for v7f16_i32 ; CHECK-GI-FP16-NEXT: warning: Instruction selection used fallback path for v16f16_i32 define double @f64_double(double %a, double %b, double %d, double %e) { @@ -437,62 +437,87 @@ define <7 x half> @v7f16_half(<7 x half> %a, <7 x half> %b, <7 x half> %d, <7 x ; ; CHECK-GI-NOFP16-LABEL: v7f16_half: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[1] -; CHECK-GI-NOFP16-NEXT: fcvt s6, h1 -; CHECK-GI-NOFP16-NEXT: fcvt s7, h0 -; CHECK-GI-NOFP16-NEXT: mov h16, v1.h[2] -; CHECK-GI-NOFP16-NEXT: fcvt s4, h4 +; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov w9, #15 // =0xf +; CHECK-GI-NOFP16-NEXT: fcvt s6, h0 +; CHECK-GI-NOFP16-NEXT: fcvt s7, h1 +; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[2] +; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[2] +; CHECK-GI-NOFP16-NEXT: mov h19, v1.h[4] +; CHECK-GI-NOFP16-NEXT: mov h20, v1.h[5] +; CHECK-GI-NOFP16-NEXT: fcvt s18, h4 ; CHECK-GI-NOFP16-NEXT: fcvt s5, h5 -; CHECK-GI-NOFP16-NEXT: fcmp s5, s4 -; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[2] -; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[3] -; CHECK-GI-NOFP16-NEXT: csetm w8, mi -; CHECK-GI-NOFP16-NEXT: fcmp s7, s6 +; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] +; CHECK-GI-NOFP16-NEXT: fcmp s6, s7 ; CHECK-GI-NOFP16-NEXT: fcvt s7, h16 -; CHECK-GI-NOFP16-NEXT: fcvt s4, h4 -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[3] -; CHECK-GI-NOFP16-NEXT: fcvt s5, h5 -; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[4] -; CHECK-GI-NOFP16-NEXT: csetm w9, mi -; CHECK-GI-NOFP16-NEXT: fcmp s4, s7 -; CHECK-GI-NOFP16-NEXT: fmov s4, w9 -; CHECK-GI-NOFP16-NEXT: fcvt s6, h6 -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[4] -; CHECK-GI-NOFP16-NEXT: fcvt s16, h16 -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], w8 -; CHECK-GI-NOFP16-NEXT: csetm w8, mi -; CHECK-GI-NOFP16-NEXT: fcmp s6, s5 -; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5] -; CHECK-GI-NOFP16-NEXT: fcvt s7, h7 -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], w8 -; CHECK-GI-NOFP16-NEXT: csetm w8, mi -; CHECK-GI-NOFP16-NEXT: fcvt s5, h5 -; CHECK-GI-NOFP16-NEXT: fcvt s6, h6 -; CHECK-GI-NOFP16-NEXT: fcmp s16, s7 -; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[6] -; CHECK-GI-NOFP16-NEXT: mov h16, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[7] -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[7] -; CHECK-GI-NOFP16-NEXT: mov v4.h[3], w8 -; CHECK-GI-NOFP16-NEXT: csetm w8, mi -; CHECK-GI-NOFP16-NEXT: fcmp s6, s5 -; CHECK-GI-NOFP16-NEXT: fcvt s5, h7 -; CHECK-GI-NOFP16-NEXT: fcvt s6, h16 +; CHECK-GI-NOFP16-NEXT: fmov s6, w9 +; CHECK-GI-NOFP16-NEXT: fcvt s16, h17 +; CHECK-GI-NOFP16-NEXT: mov h17, v1.h[3] +; 
CHECK-GI-NOFP16-NEXT: fcvt s19, h19 +; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: cset w8, mi +; CHECK-GI-NOFP16-NEXT: fcmp s18, s5 +; CHECK-GI-NOFP16-NEXT: mov h18, v0.h[4] +; CHECK-GI-NOFP16-NEXT: fmov s5, w8 +; CHECK-GI-NOFP16-NEXT: cset w9, mi +; CHECK-GI-NOFP16-NEXT: fcmp s7, s16 +; CHECK-GI-NOFP16-NEXT: fcvt s7, h4 +; CHECK-GI-NOFP16-NEXT: fcvt s16, h17 +; CHECK-GI-NOFP16-NEXT: fmov s17, w9 +; CHECK-GI-NOFP16-NEXT: mov v4.16b, v6.16b +; CHECK-GI-NOFP16-NEXT: fcvt s18, h18 ; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 +; CHECK-GI-NOFP16-NEXT: cset w8, mi +; CHECK-GI-NOFP16-NEXT: mov v5.h[1], v17.h[0] +; CHECK-GI-NOFP16-NEXT: mov v4.h[1], v6.h[0] +; CHECK-GI-NOFP16-NEXT: mov h17, v0.h[5] +; CHECK-GI-NOFP16-NEXT: fcmp s7, s16 +; CHECK-GI-NOFP16-NEXT: fmov s7, w8 +; CHECK-GI-NOFP16-NEXT: mov w8, #65535 // =0xffff +; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6] +; CHECK-GI-NOFP16-NEXT: mov v5.h[2], v7.h[0] +; CHECK-GI-NOFP16-NEXT: fmov s7, w8 +; CHECK-GI-NOFP16-NEXT: mov v4.h[2], v6.h[0] +; CHECK-GI-NOFP16-NEXT: cset w8, mi +; CHECK-GI-NOFP16-NEXT: fcmp s18, s19 +; CHECK-GI-NOFP16-NEXT: fcvt s17, h17 +; CHECK-GI-NOFP16-NEXT: fcvt s18, h20 +; CHECK-GI-NOFP16-NEXT: fmov s19, w8 ; CHECK-GI-NOFP16-NEXT: fcvt s0, h0 -; CHECK-GI-NOFP16-NEXT: mov v4.h[4], w8 -; CHECK-GI-NOFP16-NEXT: csetm w8, mi -; CHECK-GI-NOFP16-NEXT: fcmp s6, s5 -; CHECK-GI-NOFP16-NEXT: mov v4.h[5], w8 -; CHECK-GI-NOFP16-NEXT: csetm w8, mi +; CHECK-GI-NOFP16-NEXT: mov v16.16b, v7.16b +; CHECK-GI-NOFP16-NEXT: mov v4.h[3], v6.h[0] +; CHECK-GI-NOFP16-NEXT: cset w8, mi +; CHECK-GI-NOFP16-NEXT: mov v5.h[3], v19.h[0] +; CHECK-GI-NOFP16-NEXT: mov v16.h[1], v7.h[0] +; CHECK-GI-NOFP16-NEXT: fcmp s17, s18 +; CHECK-GI-NOFP16-NEXT: fmov s17, w8 +; CHECK-GI-NOFP16-NEXT: mov v4.h[4], v6.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[4], v17.h[0] +; CHECK-GI-NOFP16-NEXT: cset w8, mi ; CHECK-GI-NOFP16-NEXT: fcmp s0, s1 -; CHECK-GI-NOFP16-NEXT: mov v4.h[6], w8 -; CHECK-GI-NOFP16-NEXT: csetm w8, mi -; CHECK-GI-NOFP16-NEXT: mov v4.h[7], w8 -; CHECK-GI-NOFP16-NEXT: mov v0.16b, v4.16b -; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-GI-NOFP16-NEXT: mov v16.h[2], v7.h[0] +; CHECK-GI-NOFP16-NEXT: fmov s0, w8 +; CHECK-GI-NOFP16-NEXT: mov v4.h[5], v6.h[0] +; CHECK-GI-NOFP16-NEXT: cset w8, mi +; CHECK-GI-NOFP16-NEXT: mov v5.h[5], v0.h[0] +; CHECK-GI-NOFP16-NEXT: fmov s0, w8 +; CHECK-GI-NOFP16-NEXT: mov v16.h[3], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v4.h[6], v6.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[6], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v16.h[4], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v4.h[7], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v5.h[7], v0.h[0] +; CHECK-GI-NOFP16-NEXT: mov v16.h[5], v7.h[0] +; CHECK-GI-NOFP16-NEXT: neg v0.8h, v4.8h +; CHECK-GI-NOFP16-NEXT: ushl v1.8h, v5.8h, v4.8h +; CHECK-GI-NOFP16-NEXT: mov v16.h[6], v7.h[0] +; CHECK-GI-NOFP16-NEXT: mov v16.h[7], v0.h[0] +; CHECK-GI-NOFP16-NEXT: sshl v0.8h, v1.8h, v0.8h +; CHECK-GI-NOFP16-NEXT: eor v1.16b, v0.16b, v16.16b +; CHECK-GI-NOFP16-NEXT: and v0.16b, v2.16b, v0.16b +; CHECK-GI-NOFP16-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-GI-NOFP16-NEXT: orr v0.16b, v0.16b, v1.16b ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: v7f16_half: @@ -1112,90 +1137,110 @@ define <7 x i32> @v7f16_i32(<7 x half> %a, <7 x half> %b, <7 x i32> %d, <7 x i32 ; ; CHECK-GI-NOFP16-LABEL: v7f16_i32: ; CHECK-GI-NOFP16: // %bb.0: // %entry -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[1] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[1] -; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[2] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[2] -; 
CHECK-GI-NOFP16-NEXT: fcvt s6, h1 -; CHECK-GI-NOFP16-NEXT: fcvt s7, h0 +; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[1] +; CHECK-GI-NOFP16-NEXT: mov h3, v1.h[1] +; CHECK-GI-NOFP16-NEXT: mov w13, #31 // =0x1f +; CHECK-GI-NOFP16-NEXT: fcvt s4, h0 +; CHECK-GI-NOFP16-NEXT: fcvt s5, h1 +; CHECK-GI-NOFP16-NEXT: ldr s17, [sp, #40] +; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[2] +; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[2] +; CHECK-GI-NOFP16-NEXT: fmov s16, w0 ; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 ; CHECK-GI-NOFP16-NEXT: fcvt s3, h3 +; CHECK-GI-NOFP16-NEXT: fcmp s4, s5 +; CHECK-GI-NOFP16-NEXT: mov h4, v0.h[3] +; CHECK-GI-NOFP16-NEXT: mov h5, v1.h[3] +; CHECK-GI-NOFP16-NEXT: fcvt s6, h6 +; CHECK-GI-NOFP16-NEXT: fcvt s7, h7 +; CHECK-GI-NOFP16-NEXT: mov v16.s[1], w1 +; CHECK-GI-NOFP16-NEXT: cset w9, mi +; CHECK-GI-NOFP16-NEXT: fcmp s2, s3 +; CHECK-GI-NOFP16-NEXT: mov h2, v0.h[4] +; CHECK-GI-NOFP16-NEXT: mov h3, v1.h[4] ; CHECK-GI-NOFP16-NEXT: fcvt s4, h4 ; CHECK-GI-NOFP16-NEXT: fcvt s5, h5 -; CHECK-GI-NOFP16-NEXT: fcmp s3, s2 -; CHECK-GI-NOFP16-NEXT: mov h2, v1.h[3] -; CHECK-GI-NOFP16-NEXT: mov h3, v0.h[3] -; CHECK-GI-NOFP16-NEXT: csetm w8, mi -; CHECK-GI-NOFP16-NEXT: fcmp s7, s6 -; CHECK-GI-NOFP16-NEXT: mov h6, v1.h[5] -; CHECK-GI-NOFP16-NEXT: mov h7, v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov v16.s[2], w2 +; CHECK-GI-NOFP16-NEXT: cset w10, mi +; CHECK-GI-NOFP16-NEXT: fcmp s6, s7 +; CHECK-GI-NOFP16-NEXT: mov h6, v0.h[5] +; CHECK-GI-NOFP16-NEXT: mov h7, v1.h[5] ; CHECK-GI-NOFP16-NEXT: fcvt s2, h2 +; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[6] ; CHECK-GI-NOFP16-NEXT: fcvt s3, h3 -; CHECK-GI-NOFP16-NEXT: csetm w9, mi -; CHECK-GI-NOFP16-NEXT: fcmp s5, s4 -; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[4] -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[4] -; CHECK-GI-NOFP16-NEXT: fcvt s6, h6 -; CHECK-GI-NOFP16-NEXT: fcvt s7, h7 -; CHECK-GI-NOFP16-NEXT: csetm w10, mi -; CHECK-GI-NOFP16-NEXT: fcmp s3, s2 -; CHECK-GI-NOFP16-NEXT: fcvt s2, h4 -; CHECK-GI-NOFP16-NEXT: mov h4, v1.h[6] -; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[7] -; CHECK-GI-NOFP16-NEXT: fcvt s3, h5 -; CHECK-GI-NOFP16-NEXT: mov h5, v0.h[6] -; CHECK-GI-NOFP16-NEXT: mov h0, v0.h[7] -; CHECK-GI-NOFP16-NEXT: csetm w11, mi -; CHECK-GI-NOFP16-NEXT: fcmp s7, s6 -; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 -; CHECK-GI-NOFP16-NEXT: csetm w12, mi -; CHECK-GI-NOFP16-NEXT: fcmp s3, s2 -; CHECK-GI-NOFP16-NEXT: fcvt s2, h4 -; CHECK-GI-NOFP16-NEXT: fcvt s3, h5 -; CHECK-GI-NOFP16-NEXT: fmov s4, w9 +; CHECK-GI-NOFP16-NEXT: mov h1, v1.h[6] +; CHECK-GI-NOFP16-NEXT: cset w11, mi +; CHECK-GI-NOFP16-NEXT: fcmp s4, s5 +; CHECK-GI-NOFP16-NEXT: mov v16.s[3], w3 +; CHECK-GI-NOFP16-NEXT: fcvt s4, h6 +; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #32] +; CHECK-GI-NOFP16-NEXT: fcvt s5, h7 ; CHECK-GI-NOFP16-NEXT: fcvt s0, h0 -; CHECK-GI-NOFP16-NEXT: add x9, sp, #8 -; CHECK-GI-NOFP16-NEXT: csetm w13, mi -; CHECK-GI-NOFP16-NEXT: fmov s5, w13 -; CHECK-GI-NOFP16-NEXT: mov v4.h[1], w8 -; CHECK-GI-NOFP16-NEXT: mov x8, sp -; CHECK-GI-NOFP16-NEXT: fcmp s3, s2 -; CHECK-GI-NOFP16-NEXT: fmov s2, w7 -; CHECK-GI-NOFP16-NEXT: fmov s3, w0 -; CHECK-GI-NOFP16-NEXT: mov v5.h[1], w12 -; CHECK-GI-NOFP16-NEXT: ld1 { v2.s }[1], [x8] -; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w1 -; CHECK-GI-NOFP16-NEXT: csetm w8, mi -; CHECK-GI-NOFP16-NEXT: mov v4.h[2], w10 +; CHECK-GI-NOFP16-NEXT: fmov s7, w4 +; CHECK-GI-NOFP16-NEXT: cset w8, mi +; CHECK-GI-NOFP16-NEXT: fcmp s2, s3 +; CHECK-GI-NOFP16-NEXT: fcvt s1, h1 +; CHECK-GI-NOFP16-NEXT: fmov s2, w13 +; CHECK-GI-NOFP16-NEXT: mov v7.s[1], w5 +; CHECK-GI-NOFP16-NEXT: cset w12, mi +; CHECK-GI-NOFP16-NEXT: fcmp s4, s5 +; 
CHECK-GI-NOFP16-NEXT: ldr s5, [sp] +; CHECK-GI-NOFP16-NEXT: fmov s3, w12 +; CHECK-GI-NOFP16-NEXT: mov v2.s[1], w13 +; CHECK-GI-NOFP16-NEXT: cset w14, mi ; CHECK-GI-NOFP16-NEXT: fcmp s0, s1 -; CHECK-GI-NOFP16-NEXT: fmov s1, w4 -; CHECK-GI-NOFP16-NEXT: ldr s0, [sp, #24] -; CHECK-GI-NOFP16-NEXT: mov v5.h[2], w8 -; CHECK-GI-NOFP16-NEXT: ld1 { v2.s }[2], [x9] -; CHECK-GI-NOFP16-NEXT: add x9, sp, #32 -; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w2 -; CHECK-GI-NOFP16-NEXT: mov v1.s[1], w5 -; CHECK-GI-NOFP16-NEXT: csetm w8, mi -; CHECK-GI-NOFP16-NEXT: mov v4.h[3], w11 -; CHECK-GI-NOFP16-NEXT: ld1 { v0.s }[1], [x9] -; CHECK-GI-NOFP16-NEXT: mov v5.h[3], w8 -; CHECK-GI-NOFP16-NEXT: add x8, sp, #16 -; CHECK-GI-NOFP16-NEXT: ld1 { v2.s }[3], [x8] -; CHECK-GI-NOFP16-NEXT: mov v3.s[3], w3 -; CHECK-GI-NOFP16-NEXT: add x8, sp, #40 -; CHECK-GI-NOFP16-NEXT: mov v1.s[2], w6 -; CHECK-GI-NOFP16-NEXT: sshll v4.4s, v4.4h, #0 -; CHECK-GI-NOFP16-NEXT: ld1 { v0.s }[2], [x8] -; CHECK-GI-NOFP16-NEXT: sshll v5.4s, v5.4h, #0 -; CHECK-GI-NOFP16-NEXT: bit v2.16b, v3.16b, v4.16b -; CHECK-GI-NOFP16-NEXT: bit v0.16b, v1.16b, v5.16b -; CHECK-GI-NOFP16-NEXT: mov w1, v2.s[1] -; CHECK-GI-NOFP16-NEXT: mov w2, v2.s[2] -; CHECK-GI-NOFP16-NEXT: mov w3, v2.s[3] -; CHECK-GI-NOFP16-NEXT: fmov w0, s2 -; CHECK-GI-NOFP16-NEXT: mov w5, v0.s[1] -; CHECK-GI-NOFP16-NEXT: mov w6, v0.s[2] -; CHECK-GI-NOFP16-NEXT: fmov w4, s0 +; CHECK-GI-NOFP16-NEXT: fmov s0, w9 +; CHECK-GI-NOFP16-NEXT: mov v3.s[1], w14 +; CHECK-GI-NOFP16-NEXT: ldr s1, [sp, #24] +; CHECK-GI-NOFP16-NEXT: mov v7.s[2], w6 +; CHECK-GI-NOFP16-NEXT: mov v2.s[2], w13 +; CHECK-GI-NOFP16-NEXT: cset w9, mi +; CHECK-GI-NOFP16-NEXT: mov v0.s[1], w10 +; CHECK-GI-NOFP16-NEXT: fmov w10, s5 +; CHECK-GI-NOFP16-NEXT: fmov s5, w7 +; CHECK-GI-NOFP16-NEXT: mov v1.s[1], v6.s[0] +; CHECK-GI-NOFP16-NEXT: ldr s6, [sp, #8] +; CHECK-GI-NOFP16-NEXT: mov v3.s[2], w9 +; CHECK-GI-NOFP16-NEXT: mov w9, #-1 // =0xffffffff +; CHECK-GI-NOFP16-NEXT: fmov s4, w9 +; CHECK-GI-NOFP16-NEXT: mov v2.s[3], w8 +; CHECK-GI-NOFP16-NEXT: mov v0.s[2], w11 +; CHECK-GI-NOFP16-NEXT: mov v5.s[1], w10 +; CHECK-GI-NOFP16-NEXT: mov v1.s[2], v17.s[0] +; CHECK-GI-NOFP16-NEXT: mov v4.s[1], w9 +; CHECK-GI-NOFP16-NEXT: mov v3.s[3], w8 +; CHECK-GI-NOFP16-NEXT: neg v18.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: mov v0.s[3], w8 +; CHECK-GI-NOFP16-NEXT: fmov w8, s6 +; CHECK-GI-NOFP16-NEXT: mov v4.s[2], w9 +; CHECK-GI-NOFP16-NEXT: ushl v2.4s, v3.4s, v2.4s +; CHECK-GI-NOFP16-NEXT: ldr s3, [sp, #16] +; CHECK-GI-NOFP16-NEXT: mov v5.s[2], w8 +; CHECK-GI-NOFP16-NEXT: mov v7.s[3], w8 +; CHECK-GI-NOFP16-NEXT: shl v0.4s, v0.4s, #31 +; CHECK-GI-NOFP16-NEXT: sshl v2.4s, v2.4s, v18.4s +; CHECK-GI-NOFP16-NEXT: mov v4.s[3], w8 +; CHECK-GI-NOFP16-NEXT: fmov w8, s3 +; CHECK-GI-NOFP16-NEXT: mov v1.s[3], v0.s[0] +; CHECK-GI-NOFP16-NEXT: sshr v0.4s, v0.4s, #31 +; CHECK-GI-NOFP16-NEXT: mov v5.s[3], w8 +; CHECK-GI-NOFP16-NEXT: eor v3.16b, v2.16b, v4.16b +; CHECK-GI-NOFP16-NEXT: and v2.16b, v7.16b, v2.16b +; CHECK-GI-NOFP16-NEXT: and v1.16b, v1.16b, v3.16b +; CHECK-GI-NOFP16-NEXT: bsl v0.16b, v16.16b, v5.16b +; CHECK-GI-NOFP16-NEXT: orr v1.16b, v2.16b, v1.16b +; CHECK-GI-NOFP16-NEXT: mov s2, v0.s[1] +; CHECK-GI-NOFP16-NEXT: mov s3, v0.s[2] +; CHECK-GI-NOFP16-NEXT: mov s4, v0.s[3] +; CHECK-GI-NOFP16-NEXT: fmov w0, s0 +; CHECK-GI-NOFP16-NEXT: mov s5, v1.s[1] +; CHECK-GI-NOFP16-NEXT: mov s6, v1.s[2] +; CHECK-GI-NOFP16-NEXT: fmov w4, s1 +; CHECK-GI-NOFP16-NEXT: fmov w1, s2 +; CHECK-GI-NOFP16-NEXT: fmov w2, s3 +; CHECK-GI-NOFP16-NEXT: fmov w3, s4 +; CHECK-GI-NOFP16-NEXT: fmov w5, s5 +; 
CHECK-GI-NOFP16-NEXT: fmov w6, s6 ; CHECK-GI-NOFP16-NEXT: ret ; ; CHECK-GI-FP16-LABEL: v7f16_i32: diff --git a/llvm/test/CodeGen/AArch64/icmp.ll b/llvm/test/CodeGen/AArch64/icmp.ll index 26711ea584c97..789bc99810ae8 100644 --- a/llvm/test/CodeGen/AArch64/icmp.ll +++ b/llvm/test/CodeGen/AArch64/icmp.ll @@ -2,8 +2,6 @@ ; RUN: llc -mtriple=aarch64-none-eabi -verify-machineinstrs %s -o - | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc -mtriple=aarch64-none-eabi -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; CHECK-GI: warning: Instruction selection used fallback path for v3i32_i32 - define i64 @i64_i64(i64 %a, i64 %b, i64 %d, i64 %e) { ; CHECK-LABEL: i64_i64: ; CHECK: // %bb.0: // %entry @@ -165,11 +163,33 @@ entry: } define <3 x i32> @v3i32_i32(<3 x i32> %a, <3 x i32> %b, <3 x i32> %d, <3 x i32> %e) { -; CHECK-LABEL: v3i32_i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: cmgt v0.4s, v1.4s, v0.4s -; CHECK-NEXT: bsl v0.16b, v2.16b, v3.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: v3i32_i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-SD-NEXT: bsl v0.16b, v2.16b, v3.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: v3i32_i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #31 // =0x1f +; CHECK-GI-NEXT: cmgt v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: fmov s4, w8 +; CHECK-GI-NEXT: mov v4.s[1], w8 +; CHECK-GI-NEXT: mov v4.s[2], w8 +; CHECK-GI-NEXT: mov w8, #-1 // =0xffffffff +; CHECK-GI-NEXT: fmov s5, w8 +; CHECK-GI-NEXT: mov v5.s[1], w8 +; CHECK-GI-NEXT: mov v4.s[3], w8 +; CHECK-GI-NEXT: mov v5.s[2], w8 +; CHECK-GI-NEXT: neg v1.4s, v4.4s +; CHECK-GI-NEXT: ushl v0.4s, v0.4s, v4.4s +; CHECK-GI-NEXT: mov v5.s[3], w8 +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v1.4s +; CHECK-GI-NEXT: eor v1.16b, v0.16b, v5.16b +; CHECK-GI-NEXT: and v0.16b, v2.16b, v0.16b +; CHECK-GI-NEXT: and v1.16b, v3.16b, v1.16b +; CHECK-GI-NEXT: orr v0.16b, v0.16b, v1.16b +; CHECK-GI-NEXT: ret entry: %c = icmp slt <3 x i32> %a, %b %s = select <3 x i1> %c, <3 x i32> %d, <3 x i32> %e diff --git a/llvm/test/CodeGen/AArch64/sext.ll b/llvm/test/CodeGen/AArch64/sext.ll index dd53780be14c1..f319721e0f2f0 100644 --- a/llvm/test/CodeGen/AArch64/sext.ll +++ b/llvm/test/CodeGen/AArch64/sext.ll @@ -2,9 +2,6 @@ ; RUN: llc -mtriple=aarch64 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-SD ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI -; CHECK-GI: warning: Instruction selection used fallback path for sext_v3i8_v3i32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for sext_v3i10_v3i32 - define i16 @sext_i8_to_i16(i8 %a) { ; CHECK-LABEL: sext_i8_to_i16: ; CHECK: // %bb.0: // %entry @@ -236,15 +233,31 @@ entry: } define <3 x i32> @sext_v3i8_v3i32(<3 x i8> %a) { -; CHECK-LABEL: sext_v3i8_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: mov v0.h[1], w1 -; CHECK-NEXT: mov v0.h[2], w2 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: shl v0.4s, v0.4s, #24 -; CHECK-NEXT: sshr v0.4s, v0.4s, #24 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sext_v3i8_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov v0.h[1], w1 +; CHECK-SD-NEXT: mov v0.h[2], w2 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: shl v0.4s, v0.4s, #24 +; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #24 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sext_v3i8_v3i32: +; 
CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #24 // =0x18 +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v1.s[1], w1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v1.s[2], w2 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[3], w8 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: neg v2.4s, v0.4s +; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: ret entry: %c = sext <3 x i8> %a to <3 x i32> ret <3 x i32> %c @@ -388,15 +401,31 @@ entry: } define <3 x i32> @sext_v3i10_v3i32(<3 x i10> %a) { -; CHECK-LABEL: sext_v3i10_v3i32: -; CHECK: // %bb.0: // %entry -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: mov v0.h[1], w1 -; CHECK-NEXT: mov v0.h[2], w2 -; CHECK-NEXT: ushll v0.4s, v0.4h, #0 -; CHECK-NEXT: shl v0.4s, v0.4s, #22 -; CHECK-NEXT: sshr v0.4s, v0.4s, #22 -; CHECK-NEXT: ret +; CHECK-SD-LABEL: sext_v3i10_v3i32: +; CHECK-SD: // %bb.0: // %entry +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: mov v0.h[1], w1 +; CHECK-SD-NEXT: mov v0.h[2], w2 +; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0 +; CHECK-SD-NEXT: shl v0.4s, v0.4s, #22 +; CHECK-SD-NEXT: sshr v0.4s, v0.4s, #22 +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: sext_v3i10_v3i32: +; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #22 // =0x16 +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: fmov s0, w8 +; CHECK-GI-NEXT: mov v1.s[1], w1 +; CHECK-GI-NEXT: mov v0.s[1], w8 +; CHECK-GI-NEXT: mov v1.s[2], w2 +; CHECK-GI-NEXT: mov v0.s[2], w8 +; CHECK-GI-NEXT: mov v1.s[3], w8 +; CHECK-GI-NEXT: mov v0.s[3], w8 +; CHECK-GI-NEXT: neg v2.4s, v0.4s +; CHECK-GI-NEXT: ushl v0.4s, v1.4s, v0.4s +; CHECK-GI-NEXT: sshl v0.4s, v0.4s, v2.4s +; CHECK-GI-NEXT: ret entry: %c = sext <3 x i10> %a to <3 x i32> ret <3 x i32> %c diff --git a/llvm/test/CodeGen/AArch64/shift.ll b/llvm/test/CodeGen/AArch64/shift.ll index 55ff97563e1ef..15c8e1792f3d3 100644 --- a/llvm/test/CodeGen/AArch64/shift.ll +++ b/llvm/test/CodeGen/AArch64/shift.ll @@ -3,33 +3,11 @@ ; RUN: llc -mtriple=aarch64 -global-isel -global-isel-abort=2 -verify-machineinstrs %s -o - 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI ; CHECK-GI: warning: Instruction selection used fallback path for shl_v4i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v32i8 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v2i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v16i16 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v4i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v32i8 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v2i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v16i16 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v4i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v32i8 ; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v2i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v16i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v3i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v7i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v3i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v7i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for shl_v3i32 
-; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v3i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v7i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v3i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v7i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for ashr_v3i32 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v3i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v7i8 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v3i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v7i16 -; CHECK-GI-NEXT: warning: Instruction selection used fallback path for lshr_v3i32 - define i1 @shl_i1(i1 %0, i1 %1){ ; CHECK-SD-LABEL: shl_i1: @@ -562,11 +540,17 @@ define <4 x i8> @shl_v4i8(<4 x i8> %0, <4 x i8> %1){ } define <32 x i8> @shl_v32i8(<32 x i8> %0, <32 x i8> %1){ -; CHECK-LABEL: shl_v32i8: -; CHECK: // %bb.0: -; CHECK-NEXT: ushl v1.16b, v1.16b, v3.16b -; CHECK-NEXT: ushl v0.16b, v0.16b, v2.16b -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shl_v32i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushl v1.16b, v1.16b, v3.16b +; CHECK-SD-NEXT: ushl v0.16b, v0.16b, v2.16b +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_v32i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushl v0.16b, v0.16b, v2.16b +; CHECK-GI-NEXT: ushl v1.16b, v1.16b, v3.16b +; CHECK-GI-NEXT: ret %3 = shl <32 x i8> %0, %1 ret <32 x i8> %3 } @@ -583,11 +567,17 @@ define <2 x i16> @shl_v2i16(<2 x i16> %0, <2 x i16> %1){ } define <16 x i16> @shl_v16i16(<16 x i16> %0, <16 x i16> %1){ -; CHECK-LABEL: shl_v16i16: -; CHECK: // %bb.0: -; CHECK-NEXT: ushl v1.8h, v1.8h, v3.8h -; CHECK-NEXT: ushl v0.8h, v0.8h, v2.8h -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shl_v16i16: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: ushl v1.8h, v1.8h, v3.8h +; CHECK-SD-NEXT: ushl v0.8h, v0.8h, v2.8h +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_v16i16: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: ushl v0.8h, v0.8h, v2.8h +; CHECK-GI-NEXT: ushl v1.8h, v1.8h, v3.8h +; CHECK-GI-NEXT: ret %3 = shl <16 x i16> %0, %1 ret <16 x i16> %3 } @@ -834,20 +824,48 @@ define <4 x i64> @lshr_v4i64(<4 x i64> %0, <4 x i64> %1){ ; ===== Vector with Non-Pow 2 Width ===== define <3 x i8> @shl_v3i8(<3 x i8> %0, <3 x i8> %1){ -; CHECK-LABEL: shl_v3i8: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov s0, w3 -; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: mov v0.h[1], w4 -; CHECK-NEXT: mov v1.h[1], w1 -; CHECK-NEXT: mov v0.h[2], w5 -; CHECK-NEXT: mov v1.h[2], w2 -; CHECK-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-NEXT: ushl v0.4h, v1.4h, v0.4h -; CHECK-NEXT: umov w0, v0.h[0] -; CHECK-NEXT: umov w1, v0.h[1] -; CHECK-NEXT: umov w2, v0.h[2] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: shl_v3i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w3 +; CHECK-SD-NEXT: fmov s1, w0 +; CHECK-SD-NEXT: mov v0.h[1], w4 +; CHECK-SD-NEXT: mov v1.h[1], w1 +; CHECK-SD-NEXT: mov v0.h[2], w5 +; CHECK-SD-NEXT: mov v1.h[2], w2 +; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8 +; CHECK-SD-NEXT: ushl v0.4h, v1.4h, v0.4h +; CHECK-SD-NEXT: umov w0, v0.h[0] +; CHECK-SD-NEXT: umov w1, v0.h[1] +; CHECK-SD-NEXT: umov w2, v0.h[2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: shl_v3i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s0, w0 +; CHECK-GI-NEXT: fmov s1, w1 +; CHECK-GI-NEXT: fmov s2, w3 +; CHECK-GI-NEXT: fmov s3, w4 +; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w2 +; CHECK-GI-NEXT: mov v2.b[1], v3.b[0] 
+; CHECK-GI-NEXT: fmov s3, w5 +; CHECK-GI-NEXT: mov v0.b[2], v1.b[0] +; CHECK-GI-NEXT: mov v2.b[2], v3.b[0] +; CHECK-GI-NEXT: mov v0.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v2.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[4], v0.b[0] +; CHECK-GI-NEXT: mov v2.b[4], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[5], v0.b[0] +; CHECK-GI-NEXT: mov v2.b[5], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[6], v0.b[0] +; CHECK-GI-NEXT: mov v2.b[6], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[7], v0.b[0] +; CHECK-GI-NEXT: mov v2.b[7], v0.b[0] +; CHECK-GI-NEXT: ushl v0.8b, v0.8b, v2.8b +; CHECK-GI-NEXT: umov w0, v0.b[0] +; CHECK-GI-NEXT: umov w1, v0.b[1] +; CHECK-GI-NEXT: umov w2, v0.b[2] +; CHECK-GI-NEXT: ret %3 = shl <3 x i8> %0, %1 ret <3 x i8> %3 } @@ -889,23 +907,52 @@ define <3 x i32> @shl_v3i32(<3 x i32> %0, <3 x i32> %1){ } define <3 x i8> @ashr_v3i8(<3 x i8> %0, <3 x i8> %1){ -; CHECK-LABEL: ashr_v3i8: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov s0, w0 -; CHECK-NEXT: fmov s1, w3 -; CHECK-NEXT: mov v0.h[1], w1 -; CHECK-NEXT: mov v1.h[1], w4 -; CHECK-NEXT: mov v0.h[2], w2 -; CHECK-NEXT: mov v1.h[2], w5 -; CHECK-NEXT: shl v0.4h, v0.4h, #8 -; CHECK-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-NEXT: sshr v0.4h, v0.4h, #8 -; CHECK-NEXT: neg v1.4h, v1.4h -; CHECK-NEXT: sshl v0.4h, v0.4h, v1.4h -; CHECK-NEXT: umov w0, v0.h[0] -; CHECK-NEXT: umov w1, v0.h[1] -; CHECK-NEXT: umov w2, v0.h[2] -; CHECK-NEXT: ret +; CHECK-SD-LABEL: ashr_v3i8: +; CHECK-SD: // %bb.0: +; CHECK-SD-NEXT: fmov s0, w0 +; CHECK-SD-NEXT: fmov s1, w3 +; CHECK-SD-NEXT: mov v0.h[1], w1 +; CHECK-SD-NEXT: mov v1.h[1], w4 +; CHECK-SD-NEXT: mov v0.h[2], w2 +; CHECK-SD-NEXT: mov v1.h[2], w5 +; CHECK-SD-NEXT: shl v0.4h, v0.4h, #8 +; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8 +; CHECK-SD-NEXT: sshr v0.4h, v0.4h, #8 +; CHECK-SD-NEXT: neg v1.4h, v1.4h +; CHECK-SD-NEXT: sshl v0.4h, v0.4h, v1.4h +; CHECK-SD-NEXT: umov w0, v0.h[0] +; CHECK-SD-NEXT: umov w1, v0.h[1] +; CHECK-SD-NEXT: umov w2, v0.h[2] +; CHECK-SD-NEXT: ret +; +; CHECK-GI-LABEL: ashr_v3i8: +; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: fmov s0, w3 +; CHECK-GI-NEXT: fmov s1, w4 +; CHECK-GI-NEXT: fmov s2, w1 +; CHECK-GI-NEXT: mov v0.b[1], v1.b[0] +; CHECK-GI-NEXT: fmov s1, w0 +; CHECK-GI-NEXT: mov v1.b[1], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w5 +; CHECK-GI-NEXT: mov v0.b[2], v2.b[0] +; CHECK-GI-NEXT: fmov s2, w2 +; CHECK-GI-NEXT: mov v1.b[2], v2.b[0] +; CHECK-GI-NEXT: mov v0.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v1.b[3], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[4], v0.b[0] +; CHECK-GI-NEXT: mov v1.b[4], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[5], v0.b[0] +; CHECK-GI-NEXT: mov v1.b[5], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[6], v0.b[0] +; CHECK-GI-NEXT: mov v1.b[6], v0.b[0] +; CHECK-GI-NEXT: mov v0.b[7], v0.b[0] +; CHECK-GI-NEXT: mov v1.b[7], v0.b[0] +; CHECK-GI-NEXT: neg v0.8b, v0.8b +; CHECK-GI-NEXT: sshl v0.8b, v1.8b, v0.8b +; CHECK-GI-NEXT: umov w0, v0.b[0] +; CHECK-GI-NEXT: umov w1, v0.b[1] +; CHECK-GI-NEXT: umov w2, v0.b[2] +; CHECK-GI-NEXT: ret %3 = ashr <3 x i8> %0, %1 ret <3 x i8> %3 } @@ -951,22 +998,51 @@ define <3 x i32> @ashr_v3i32(<3 x i32> %0, <3 x i32> %1){ } define <3 x i8> @lshr_v3i8(<3 x i8> %0, <3 x i8> %1){ -; CHECK-LABEL: lshr_v3i8: -; CHECK: // %bb.0: -; CHECK-NEXT: fmov s0, w3 -; CHECK-NEXT: fmov s1, w0 -; CHECK-NEXT: mov v0.h[1], w4 -; CHECK-NEXT: mov v1.h[1], w1 -; CHECK-NEXT: mov v0.h[2], w5 -; CHECK-NEXT: mov v1.h[2], w2 -; CHECK-NEXT: bic v0.4h, #255, lsl #8 -; CHECK-NEXT: bic v1.4h, #255, lsl #8 -; CHECK-NEXT: neg v0.4h, v0.4h -; CHECK-NEXT: ushl v0.4h, v1.4h, v0.4h -; CHECK-NEXT: umov w0, v0.h[0] -; CHECK-NEXT: umov w1, v0.h[1] -; 
CHECK-NEXT: umov w2, v0.h[2]
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: lshr_v3i8:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, w3
+; CHECK-SD-NEXT: fmov s1, w0
+; CHECK-SD-NEXT: mov v0.h[1], w4
+; CHECK-SD-NEXT: mov v1.h[1], w1
+; CHECK-SD-NEXT: mov v0.h[2], w5
+; CHECK-SD-NEXT: mov v1.h[2], w2
+; CHECK-SD-NEXT: bic v0.4h, #255, lsl #8
+; CHECK-SD-NEXT: bic v1.4h, #255, lsl #8
+; CHECK-SD-NEXT: neg v0.4h, v0.4h
+; CHECK-SD-NEXT: ushl v0.4h, v1.4h, v0.4h
+; CHECK-SD-NEXT: umov w0, v0.h[0]
+; CHECK-SD-NEXT: umov w1, v0.h[1]
+; CHECK-SD-NEXT: umov w2, v0.h[2]
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: lshr_v3i8:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov s0, w3
+; CHECK-GI-NEXT: fmov s1, w4
+; CHECK-GI-NEXT: fmov s2, w1
+; CHECK-GI-NEXT: mov v0.b[1], v1.b[0]
+; CHECK-GI-NEXT: fmov s1, w0
+; CHECK-GI-NEXT: mov v1.b[1], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w5
+; CHECK-GI-NEXT: mov v0.b[2], v2.b[0]
+; CHECK-GI-NEXT: fmov s2, w2
+; CHECK-GI-NEXT: mov v1.b[2], v2.b[0]
+; CHECK-GI-NEXT: mov v0.b[3], v0.b[0]
+; CHECK-GI-NEXT: mov v1.b[3], v0.b[0]
+; CHECK-GI-NEXT: mov v0.b[4], v0.b[0]
+; CHECK-GI-NEXT: mov v1.b[4], v0.b[0]
+; CHECK-GI-NEXT: mov v0.b[5], v0.b[0]
+; CHECK-GI-NEXT: mov v1.b[5], v0.b[0]
+; CHECK-GI-NEXT: mov v0.b[6], v0.b[0]
+; CHECK-GI-NEXT: mov v1.b[6], v0.b[0]
+; CHECK-GI-NEXT: mov v0.b[7], v0.b[0]
+; CHECK-GI-NEXT: mov v1.b[7], v0.b[0]
+; CHECK-GI-NEXT: neg v0.8b, v0.8b
+; CHECK-GI-NEXT: ushl v0.8b, v1.8b, v0.8b
+; CHECK-GI-NEXT: umov w0, v0.b[0]
+; CHECK-GI-NEXT: umov w1, v0.b[1]
+; CHECK-GI-NEXT: umov w2, v0.b[2]
+; CHECK-GI-NEXT: ret
 %3 = lshr <3 x i8> %0, %1
 ret <3 x i8> %3
 }

From 3c246efd04210af56ab6ce960b98283ec5bc7c30 Mon Sep 17 00:00:00 2001
From: Petr Maj <53400784+zduka@users.noreply.github.com>
Date: Mon, 22 Jan 2024 15:10:04 +0100
Subject: [PATCH 390/843] True fixpoint algorithm in RS4GC (#75826)

Fixes a problem where the explicit marking of various instructions as
conflicts did not propagate to their users. An example of this:

```
%getelementptr = getelementptr i8, <2 x ptr addrspace(1)> zeroinitializer, <2 x i64>
%shufflevector = shufflevector <2 x ptr addrspace(1)> %getelementptr, <2 x ptr addrspace(1)> zeroinitializer, <4 x i32>
%shufflevector1 = shufflevector <2 x ptr addrspace(1)> %getelementptr, <2 x ptr addrspace(1)> zeroinitializer, <4 x i32>
%select = select i1 false, <4 x ptr addrspace(1)> %shufflevector1, <4 x ptr addrspace(1)> %shufflevector
```

Here the vector shuffles get a single base (the gep) during the
fixpoint, and the select therefore gets a known base (the gep). We later
mark the shuffles as conflicts, but this does not change the base of the
select. This is caught later on by an assert, because the select's type
then differs from the type of its (wrong) base. The fix in this patch is
to move the explicit conflict marking into the fixpoint phase.
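To make the change concrete, here is a condensed sketch of the reworked
iteration (illustrative only, not the pass's literal code; `meetOverOperands`
is a hypothetical stand-in for the operand-state meet that the pass performs):

```
bool Progress = true;
while (Progress) {
  Progress = false;
  for (auto &Pair : States) {
    Instruction *BDV = cast<Instruction>(Pair.first);
    BDVState NewState = meetOverOperands(BDV, States);
    // Demote a "known base" to a conflict *inside* the fixpoint when its
    // type is incompatible with the instruction, so the users of BDV see
    // the change on a later iteration instead of keeping a stale base.
    Value *Base = NewState.getBaseValue();
    if (Base && MarkConflict(BDV, Base))
      NewState = BDVState(BDV, BDVState::Conflict);
    if (Pair.second != NewState) {
      Pair.second = NewState;
      Progress = true;
    }
  }
}
```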
---------

Co-authored-by: Petr Maj
---
 .../Scalar/RewriteStatepointsForGC.cpp | 94 ++++++++++---------
 .../RewriteStatepointsForGC/base-pointers.ll | 34 ++++++-
 2 files changed, 79 insertions(+), 49 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index b98f823ab00b1..c09983702d6e2 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -967,6 +967,44 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache,
 return BDVState(BaseValue, BDVState::Base, BaseValue);
 };
 
+ // Even though we have identified a concrete base (or a conflict) for all live
+ // pointers at this point, there are cases where the base is of an
+ // incompatible type compared to the original instruction. We conservatively
+ // mark those as conflicts to ensure that corresponding BDVs will be generated
+ // in the next steps.
+
+ // this is a rather explicit check for all cases where we should mark the
+ // state as a conflict to force the later stages of the algorithm to emit
+ // the BDVs.
+ // TODO: in many cases the instructions emitted for the conflicting states
+ // will be identical to the I itself (if the I's operate on their BDVs
+ // themselves). We should exploit this, but can't do it here since it would
+ // break the invariant about the BDVs not being known to be a base.
+ // TODO: the code also does not handle constants at all - the algorithm relies
+ // on all constants having the same BDV and therefore constant-only insns
+ // will never be in conflict, but this check is ignored here. If the
+ // constant conflicts will be to BDVs themselves, they will be identical
+ // instructions and will get optimized away (as in the above TODO)
+ auto MarkConflict = [&](Instruction *I, Value *BaseValue) {
+ // II and EE mix vector & scalar, so they are always a conflict
+ if (isa(I) || isa(I))
+ return true;
+ // Shuffle vector is always a conflict as it creates a new vector from
+ // existing ones.
+ if (isa(I))
+ return true;
+ // Any instructions where the computed base type differs from the
+ // instruction type. An example is where an extract instruction is used by a
+ // select. Here the select's BDV is a vector (because of extract's BDV),
+ // while the select itself is a scalar type. Note that the IE and EE
+ // instruction check is not fully subsumed by the vector<->scalar check at
+ // the end; this is due to the BDV algorithm being ignorant of BDV types at
+ // this junction.
+ if (!areBothVectorOrScalar(BaseValue, I))
+ return true;
+ return false;
+ };
+
 bool Progress = true;
 while (Progress) {
 #ifndef NDEBUG
@@ -993,6 +1031,14 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache,
 NewState.meet(OpState);
 });
 
+ // if the instruction has a known base, but should in fact be marked as
+ // a conflict because of incompatible in/out types, we mark it as such,
+ // ensuring that the change will propagate through the fixpoint iteration
+ auto I = cast(BDV);
+ auto BV = NewState.getBaseValue();
+ if (BV && MarkConflict(I, BV))
+ NewState = BDVState(I, BDVState::Conflict);
+
 BDVState OldState = Pair.second;
 if (OldState != NewState) {
 Progress = true;
@@ -1012,44 +1058,8 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache,
 }
 #endif
 
- // Even though we have identified a concrete base (or a conflict) for all live
- // pointers at this point, there are cases where the base is of an
- // incompatible type compared to the original instruction. We conservatively
- // mark those as conflicts to ensure that corresponding BDVs will be generated
- // in the next steps.
-
- // this is a rather explicit check for all cases where we should mark the
- // state as a conflict to force the latter stages of the algorithm to emit
- // the BDVs.
- // TODO: in many cases the instructions emited for the conflicting states
- // will be identical to the I itself (if the I's operate on their BDVs
- // themselves). We should expoit this, but can't do it here since it would
- // break the invariant about the BDVs not being known to be a base.
- // TODO: the code also does not handle constants at all - the algorithm relies
- // on all constants having the same BDV and therefore constant-only insns
- // will never be in conflict, but this check is ignored here. If the
- // constant conflicts will be to BDVs themselves, they will be identical
- // instructions and will get optimized away (as in the above TODO)
- auto MarkConflict = [&](Instruction *I, Value *BaseValue) {
- // II and EE mixes vector & scalar so is always a conflict
- if (isa(I) || isa(I))
- return true;
- // Shuffle vector is always a conflict as it creates new vector from
- // existing ones.
- if (isa(I))
- return true;
- // Any instructions where the computed base type differs from the
- // instruction type. An example is where an extract instruction is used by a
- // select. Here the select's BDV is a vector (because of extract's BDV),
- // while the select itself is a scalar type. Note that the IE and EE
- // instruction check is not fully subsumed by the vector<->scalar check at
- // the end, this is due to the BDV algorithm being ignorant of BDV types at
- // this junction.
- if (!areBothVectorOrScalar(BaseValue, I)) - return true; - return false; - }; - + // since we do the conflict marking as part of the fixpoint iteration this + // loop only asserts that invariants are met for (auto Pair : States) { Instruction *I = cast(Pair.first); BDVState State = Pair.second; @@ -1061,14 +1071,6 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache, (!isKnownBase(I, KnownBases) || !areBothVectorOrScalar(I, BaseValue)) && "why did it get added?"); assert(!State.isUnknown() && "Optimistic algorithm didn't complete!"); - - // since we only mark vec-scalar insns as conflicts in the pass, our work is - // done if the instruction already conflicts - if (State.isConflict()) - continue; - - if (MarkConflict(I, BaseValue)) - States[I] = BDVState(I, BDVState::Conflict); } #ifndef NDEBUG diff --git a/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers.ll b/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers.ll index d85e236591770..3718b4ead2f81 100644 --- a/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers.ll +++ b/llvm/test/Transforms/RewriteStatepointsForGC/base-pointers.ll @@ -39,9 +39,9 @@ define ptr addrspace(1) @test1(i32 %caller, ptr addrspace(1) %a, ptr addrspace(1 ; CHECK-NEXT: br i1 [[C:%.*]], label [[LEFT:%.*]], label [[RIGHT:%.*]] ; CHECK: left: ; CHECK-NEXT: switch i32 [[UNKNOWN:%.*]], label [[RIGHT]] [ -; CHECK-NEXT: i32 0, label [[MERGE:%.*]] -; CHECK-NEXT: i32 1, label [[MERGE]] -; CHECK-NEXT: i32 5, label [[MERGE]] +; CHECK-NEXT: i32 0, label [[MERGE:%.*]] +; CHECK-NEXT: i32 1, label [[MERGE]] +; CHECK-NEXT: i32 5, label [[MERGE]] ; CHECK-NEXT: ] ; CHECK: right: ; CHECK-NEXT: br label [[MERGE]] @@ -221,4 +221,32 @@ next: ret ptr addrspace(1) %bdv } +declare i32 @snork() + +define void @test7() gc "statepoint-example" { +; CHECK-LABEL: @test7( +; CHECK-NEXT: bb: +; CHECK-NEXT: [[GETELEMENTPTR:%.*]] = getelementptr i8, <2 x ptr addrspace(1)> zeroinitializer, <2 x i64> +; CHECK-NEXT: [[SHUFFLEVECTOR_BASE:%.*]] = shufflevector <2 x ptr addrspace(1)> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer, <4 x i32> , !is_base_value !0 +; CHECK-NEXT: [[SHUFFLEVECTOR:%.*]] = shufflevector <2 x ptr addrspace(1)> [[GETELEMENTPTR]], <2 x ptr addrspace(1)> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[SHUFFLEVECTOR1_BASE:%.*]] = shufflevector <2 x ptr addrspace(1)> zeroinitializer, <2 x ptr addrspace(1)> zeroinitializer, <4 x i32> , !is_base_value !0 +; CHECK-NEXT: [[SHUFFLEVECTOR1:%.*]] = shufflevector <2 x ptr addrspace(1)> [[GETELEMENTPTR]], <2 x ptr addrspace(1)> zeroinitializer, <4 x i32> +; CHECK-NEXT: [[SELECT_BASE:%.*]] = select i1 false, <4 x ptr addrspace(1)> [[SHUFFLEVECTOR1_BASE]], <4 x ptr addrspace(1)> [[SHUFFLEVECTOR_BASE]], !is_base_value !0 +; CHECK-NEXT: [[SELECT:%.*]] = select i1 false, <4 x ptr addrspace(1)> [[SHUFFLEVECTOR1]], <4 x ptr addrspace(1)> [[SHUFFLEVECTOR]] +; CHECK-NEXT: [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) 
@llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr elementtype(i32 ()) @snork, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(<4 x ptr addrspace(1)> [[SELECT]], <4 x ptr addrspace(1)> [[SELECT_BASE]]) ] +; CHECK-NEXT: [[SELECT_RELOCATED:%.*]] = call coldcc <4 x ptr addrspace(1)> @llvm.experimental.gc.relocate.v4p1(token [[STATEPOINT_TOKEN]], i32 1, i32 0) +; CHECK-NEXT: [[SELECT_BASE_RELOCATED:%.*]] = call coldcc <4 x ptr addrspace(1)> @llvm.experimental.gc.relocate.v4p1(token [[STATEPOINT_TOKEN]], i32 1, i32 1) +; CHECK-NEXT: [[EXTRACTELEMENT:%.*]] = extractelement <4 x ptr addrspace(1)> [[SELECT_RELOCATED]], i32 0 +; CHECK-NEXT: ret void +; +bb: + %getelementptr = getelementptr i8, <2 x ptr addrspace(1)> zeroinitializer, <2 x i64> + %shufflevector = shufflevector <2 x ptr addrspace(1)> %getelementptr, <2 x ptr addrspace(1)> zeroinitializer, <4 x i32> + %shufflevector1 = shufflevector <2 x ptr addrspace(1)> %getelementptr, <2 x ptr addrspace(1)> zeroinitializer, <4 x i32> + %select = select i1 false, <4 x ptr addrspace(1)> %shufflevector1, <4 x ptr addrspace(1)> %shufflevector + %call = call i32 @snork() + %extractelement = extractelement <4 x ptr addrspace(1)> %select, i32 0 + ret void +} + declare void @foo() From 1edb43f62ca408774b4392cbe641598e1c4725be Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Mon, 22 Jan 2024 15:18:16 +0100 Subject: [PATCH 391/843] [reland][libc] `FPRep` builders return `FPRep` instead of raw `StorageType` (#78978) Reland #78588 --- libc/src/__support/FPUtil/FPBits.h | 595 +++++++++--------- .../test/src/__support/FPUtil/fpbits_test.cpp | 171 ++--- 2 files changed, 400 insertions(+), 366 deletions(-) diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index be700285de828..2465158bb2cdf 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -64,38 +64,46 @@ LIBC_INLINE_VAR constexpr Sign Sign::POS = Sign(false); // └─────────▲─────────┘ // │ // ┌─────────┴─────────┐ -// │ FPRepBase │ +// │ FPStorage │ // └─────────▲─────────┘ // │ // ┌────────────┴─────────────┐ // │ │ -// ┌────────┴──────┐ ┌─────────────┴──────────────┐ -// │ FPRep │ │ FPRep │ │ FPRepSem │ +// └───────────┘ +// │ +// ┌─────┴─────┐ // │ FPBits │ // └───────────┘ // -// - 'FPLayout' defines only a few constants, namely the 'StorageType' and the -// length of the sign, the exponent and significand parts. -// - 'FPRepBase' builds more constants on top of those from 'FPLayout' like -// exponent bias, shifts and masks. It also defines tools to assemble or test +// - 'FPLayout' defines only a few constants, namely the 'StorageType' and +// length of the sign, the exponent, fraction and significand parts. +// - 'FPStorage' builds more constants on top of those from 'FPLayout' like +// exponent bias and masks. It also holds the bit representation of the +// floating point as a 'StorageType' type and defines tools to assemble or test // these parts. -// - 'FPRep' defines functions to interact with the floating point -// representation. The default implementation is the one for 'IEEE754', a -// specialization is provided for X86 Extended Precision that has a different -// encoding. -// - 'FPBits' is templated on the platform floating point types. Contrary to -// 'FPRep' that is platform agnostic 'FPBits' is architecture dependent. +// - 'FPRepSem' defines functions to interact semantically with the floating +// point representation. 
The default implementation is the one for 'IEEE754', a +// specialization is provided for X86 Extended Precision. +// - 'FPRep' derives from 'FPRepSem' and adds functions that are common to all +// implementations. +// - 'FPBits' exposes all functions from 'FPRep' but operates on the native C++ +// floating point type instead of 'FPType'. namespace internal { // Defines the layout (sign, exponent, significand) of a floating point type in // memory. It also defines its associated StorageType, i.e., the unsigned // integer type used to manipulate its representation. +// Additionally we provide the fractional part length, i.e., the number of bits +// after the decimal dot when the number is in normal form. template struct FPLayout {}; template <> struct FPLayout { @@ -103,6 +111,7 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 5; LIBC_INLINE_VAR static constexpr int SIG_LEN = 10; + LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -110,6 +119,7 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 8; LIBC_INLINE_VAR static constexpr int SIG_LEN = 23; + LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -117,6 +127,7 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 11; LIBC_INLINE_VAR static constexpr int SIG_LEN = 52; + LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -124,6 +135,7 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 15; LIBC_INLINE_VAR static constexpr int SIG_LEN = 112; + LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN; }; template <> struct FPLayout { @@ -131,23 +143,22 @@ template <> struct FPLayout { LIBC_INLINE_VAR static constexpr int SIGN_LEN = 1; LIBC_INLINE_VAR static constexpr int EXP_LEN = 15; LIBC_INLINE_VAR static constexpr int SIG_LEN = 64; + LIBC_INLINE_VAR static constexpr int FRACTION_LEN = SIG_LEN - 1; }; -} // namespace internal - -// FPRepBase derives useful constants from the FPLayout. -template -struct FPRepBase : public internal::FPLayout { -private: - using UP = internal::FPLayout; +// FPStorage derives useful constants from the FPLayout above. +template struct FPStorage : public FPLayout { + using UP = FPLayout; -public: using UP::EXP_LEN; // The number of bits for the *exponent* part using UP::SIG_LEN; // The number of bits for the *significand* part using UP::SIGN_LEN; // The number of bits for the *sign* part // For convenience, the sum of `SIG_LEN`, `EXP_LEN`, and `SIGN_LEN`. LIBC_INLINE_VAR static constexpr int TOTAL_LEN = SIGN_LEN + EXP_LEN + SIG_LEN; + // The number of bits after the decimal dot when the number is in normal form. + using UP::FRACTION_LEN; + // An unsigned integer that is wide enough to contain all of the floating // point bits. using StorageType = typename UP::StorageType; @@ -162,41 +173,30 @@ struct FPRepBase : public internal::FPLayout { (1U << (EXP_LEN - 1U)) - 1U; static_assert(EXP_BIAS > 0); -protected: - // The shift amount to get the *significand* part to the least significant - // bit. Always `0` but kept for consistency. 
- LIBC_INLINE_VAR static constexpr int SIG_MASK_SHIFT = 0; - // The shift amount to get the *exponent* part to the least significant bit. - LIBC_INLINE_VAR static constexpr int EXP_MASK_SHIFT = SIG_LEN; - // The shift amount to get the *sign* part to the least significant bit. - LIBC_INLINE_VAR static constexpr int SIGN_MASK_SHIFT = SIG_LEN + EXP_LEN; - // The bit pattern that keeps only the *significand* part. LIBC_INLINE_VAR static constexpr StorageType SIG_MASK = - mask_trailing_ones() << SIG_MASK_SHIFT; - -public: + mask_trailing_ones(); // The bit pattern that keeps only the *exponent* part. LIBC_INLINE_VAR static constexpr StorageType EXP_MASK = - mask_trailing_ones() << EXP_MASK_SHIFT; + mask_trailing_ones() << SIG_LEN; // The bit pattern that keeps only the *sign* part. LIBC_INLINE_VAR static constexpr StorageType SIGN_MASK = - mask_trailing_ones() << SIGN_MASK_SHIFT; + mask_trailing_ones() << (EXP_LEN + SIG_LEN); // The bit pattern that keeps only the *exponent + significand* part. LIBC_INLINE_VAR static constexpr StorageType EXP_SIG_MASK = mask_trailing_ones(); // The bit pattern that keeps only the *sign + exponent + significand* part. LIBC_INLINE_VAR static constexpr StorageType FP_MASK = mask_trailing_ones(); + // The bit pattern that keeps only the *fraction* part. + // i.e., the *significand* without the leading one. + LIBC_INLINE_VAR static constexpr StorageType FRACTION_MASK = + mask_trailing_ones(); static_assert((SIG_MASK & EXP_MASK & SIGN_MASK) == 0, "masks disjoint"); static_assert((SIG_MASK | EXP_MASK | SIGN_MASK) == FP_MASK, "masks cover"); protected: - LIBC_INLINE static constexpr StorageType bit_at(int position) { - return StorageType(1) << position; - } - // A stongly typed integer that prevents mixing and matching integers with // different semantics. template struct TypedInt { @@ -248,7 +248,7 @@ struct FPRepBase : public internal::FPLayout { // An opaque type to store a floating point significand. // We define special values but it is valid to create arbitrary values as long - // as they are in the range [BITS_ALL_ZEROES, BITS_ALL_ONES]. + // as they are in the range [ZERO, BITS_ALL_ONES]. // Note that the semantics of the Significand are implementation dependent. // Values greater than BITS_ALL_ONES are truncated. struct Significand : public TypedInt { @@ -277,10 +277,8 @@ struct FPRepBase : public internal::FPLayout { return Significand(StorageType(1)); } LIBC_INLINE static constexpr auto MSB() { - return Significand(StorageType(bit_at(SIG_LEN - 1))); + return Significand(StorageType(1) << (SIG_LEN - 1)); } - // Aliases - LIBC_INLINE static constexpr auto BITS_ALL_ZEROES() { return ZERO(); } LIBC_INLINE static constexpr auto BITS_ALL_ONES() { return Significand(SIG_MASK); } @@ -306,204 +304,112 @@ struct FPRepBase : public internal::FPLayout { return encode(exp, sig); } + // The floating point number representation as an unsigned integer. + StorageType bits{}; + + LIBC_INLINE constexpr FPStorage() : bits(0) {} + LIBC_INLINE constexpr FPStorage(StorageType value) : bits(value) {} + + // Observers LIBC_INLINE constexpr StorageType exp_bits() const { return bits & EXP_MASK; } LIBC_INLINE constexpr StorageType sig_bits() const { return bits & SIG_MASK; } LIBC_INLINE constexpr StorageType exp_sig_bits() const { return bits & EXP_SIG_MASK; } - -private: - // Merge bits from 'a' and 'b' values according to 'mask'. - // Use 'a' bits when corresponding 'mask' bits are zeroes and 'b' bits when - // corresponding bits are ones. 
- LIBC_INLINE static constexpr StorageType merge(StorageType a, StorageType b,
- StorageType mask) {
- // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge
- return a ^ ((a ^ b) & mask);
- }
-
-protected:
- // The number of bits after the decimal dot when the number is in normal form.
- LIBC_INLINE_VAR static constexpr int FRACTION_LEN =
- fp_type == FPType::X86_Binary80 ? SIG_LEN - 1 : SIG_LEN;
- LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION =
- FRACTION_LEN + 1;
- LIBC_INLINE_VAR static constexpr StorageType FRACTION_MASK =
- mask_trailing_ones();
-
- // The floating point number representation as an unsigned integer.
- StorageType bits = 0;
-
-public:
- LIBC_INLINE constexpr Sign sign() const {
- return (bits & SIGN_MASK) ? Sign::NEG : Sign::POS;
- }
-
- LIBC_INLINE constexpr void set_sign(Sign signVal) {
- if (sign() != signVal)
- bits ^= SIGN_MASK;
- }
-
- LIBC_INLINE constexpr StorageType get_mantissa() const {
- return bits & FRACTION_MASK;
- }
-
- LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) {
- bits = merge(bits, mantVal, FRACTION_MASK);
- }
-
- LIBC_INLINE constexpr uint16_t get_biased_exponent() const {
- return uint16_t((bits & EXP_MASK) >> EXP_MASK_SHIFT);
- }
-
- LIBC_INLINE constexpr void set_biased_exponent(StorageType biased) {
- bits = merge(bits, biased << EXP_MASK_SHIFT, EXP_MASK);
- }
-
- LIBC_INLINE constexpr int get_exponent() const {
- return int(get_biased_exponent()) - EXP_BIAS;
- }
-
- // If the number is subnormal, the exponent is treated as if it were the
- // minimum exponent for a normal number. This is to keep continuity between
- // the normal and subnormal ranges, but it causes problems for functions where
- // values are calculated from the exponent, since just subtracting the bias
- // will give a slightly incorrect result. Additionally, zero has an exponent
- // of zero, and that should actually be treated as zero.
- LIBC_INLINE constexpr int get_explicit_exponent() const {
- const int biased_exp = int(get_biased_exponent());
- if (is_zero()) {
- return 0;
- } else if (biased_exp == 0) {
- return 1 - EXP_BIAS;
- } else {
- return biased_exp - EXP_BIAS;
- }
- }
-
- LIBC_INLINE constexpr StorageType uintval() const { return bits & FP_MASK; }
- LIBC_INLINE constexpr void set_uintval(StorageType value) {
- bits = (value & FP_MASK);
- }
-
- LIBC_INLINE constexpr bool is_zero() const { return exp_sig_bits() == 0; }
-
- LIBC_INLINE
- constexpr bool is_subnormal() const {
- return exp_bits() == encode(BiasedExponent::BITS_ALL_ZEROES());
- }
-
- LIBC_INLINE constexpr bool is_neg() const { return sign().is_neg(); }
- LIBC_INLINE constexpr bool is_pos() const { return sign().is_pos(); }
 };
 
-namespace internal {
-
-// Manipulates the representation of a floating point number defined by its
-// FPType. This layer is architecture agnostic and does not handle C++ floating
-// point types directly ('float', 'double' and 'long double'). Use the FPBits
-// below if needed.
-//
-// TODO: Specialize this class for FPType::X86_Binary80 and remove ad-hoc logic
-// from FPRepBase.
-template struct FPRep : public FPRepBase {
- using UP = FPRepBase;
+// This layer defines all functions that are specific to how the floating
+// point type is encoded. It enables construction, modification and observation
+// of values manipulated as 'StorageType'.
+template +struct FPRepSem : public FPStorage { + using UP = FPStorage; using typename UP::StorageType; using UP::FRACTION_LEN; using UP::FRACTION_MASK; - using UP::MANTISSA_PRECISION; protected: - using typename UP::BiasedExponent; - using typename UP::Exponent; - using typename UP::Significand; + using BiasedExp = typename UP::BiasedExponent; + using Exp = typename UP::Exponent; + using Sig = typename UP::Significand; using UP::encode; using UP::exp_bits; using UP::exp_sig_bits; using UP::sig_bits; + using UP::UP; public: - LIBC_INLINE constexpr bool is_nan() const { - return exp_sig_bits() > - encode(BiasedExponent::BITS_ALL_ONES(), Significand::ZERO()); + // Builders + LIBC_INLINE static constexpr RetT one(Sign sign = Sign::POS) { + return RetT(encode(sign, Exp::ZERO(), Sig::ZERO())); } - LIBC_INLINE constexpr bool is_quiet_nan() const { - return exp_sig_bits() >= - encode(BiasedExponent::BITS_ALL_ONES(), Significand::MSB()); + LIBC_INLINE static constexpr RetT min_subnormal(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::LSB())); } - LIBC_INLINE constexpr bool is_signaling_nan() const { - return is_nan() && !is_quiet_nan(); + LIBC_INLINE static constexpr RetT max_subnormal(Sign sign = Sign::POS) { + return RetT( + encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::BITS_ALL_ONES())); } - LIBC_INLINE constexpr bool is_inf() const { - return exp_sig_bits() == - encode(BiasedExponent::BITS_ALL_ONES(), Significand::ZERO()); + LIBC_INLINE static constexpr RetT min_normal(Sign sign = Sign::POS) { + return RetT(encode(sign, Exp::MIN(), Sig::ZERO())); } - LIBC_INLINE constexpr bool is_finite() const { - return exp_bits() != encode(BiasedExponent::BITS_ALL_ONES()); + LIBC_INLINE static constexpr RetT max_normal(Sign sign = Sign::POS) { + return RetT(encode(sign, Exp::MAX(), Sig::BITS_ALL_ONES())); } - LIBC_INLINE constexpr bool is_normal() const { - return is_finite() && !UP::is_subnormal(); + LIBC_INLINE static constexpr RetT inf(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), Sig::ZERO())); } - - LIBC_INLINE static constexpr StorageType zero(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::ZERO()); + LIBC_INLINE static constexpr RetT build_nan(Sign sign = Sign::POS, + StorageType v = 0) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), + (v ? 
Sig(v) : (Sig::MSB() >> 1)))); } - LIBC_INLINE static constexpr StorageType one(Sign sign = Sign::POS) { - return encode(sign, Exponent::ZERO(), Significand::ZERO()); + LIBC_INLINE static constexpr RetT build_quiet_nan(Sign sign = Sign::POS, + StorageType v = 0) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), Sig::MSB() | Sig(v))); } - LIBC_INLINE static constexpr StorageType - min_subnormal(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::LSB()); + + // Observers + LIBC_INLINE constexpr bool is_nan() const { + return exp_sig_bits() > encode(BiasedExp::BITS_ALL_ONES(), Sig::ZERO()); } - LIBC_INLINE static constexpr StorageType - max_subnormal(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), - Significand::BITS_ALL_ONES()); + LIBC_INLINE constexpr bool is_quiet_nan() const { + return exp_sig_bits() >= encode(BiasedExp::BITS_ALL_ONES(), Sig::MSB()); } - LIBC_INLINE static constexpr StorageType min_normal(Sign sign = Sign::POS) { - return encode(sign, Exponent::MIN(), Significand::ZERO()); + LIBC_INLINE constexpr bool is_signaling_nan() const { + return is_nan() && !is_quiet_nan(); } - LIBC_INLINE static constexpr StorageType max_normal(Sign sign = Sign::POS) { - return encode(sign, Exponent::MAX(), Significand::BITS_ALL_ONES()); + LIBC_INLINE constexpr bool is_inf() const { + return exp_sig_bits() == encode(BiasedExp::BITS_ALL_ONES(), Sig::ZERO()); } - LIBC_INLINE static constexpr StorageType inf(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), Significand::ZERO()); + LIBC_INLINE constexpr bool is_finite() const { + return exp_bits() != encode(BiasedExp::BITS_ALL_ONES()); } - LIBC_INLINE static constexpr StorageType build_nan(Sign sign = Sign::POS, - StorageType v = 0) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), - (v ? Significand(v) : (Significand::MSB() >> 1))); + LIBC_INLINE + constexpr bool is_subnormal() const { + return exp_bits() == encode(BiasedExp::BITS_ALL_ZEROES()); } - LIBC_INLINE static constexpr StorageType - build_quiet_nan(Sign sign = Sign::POS, StorageType v = 0) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), - Significand::MSB() | Significand(v)); + LIBC_INLINE constexpr bool is_normal() const { + return is_finite() && !UP::is_subnormal(); } - - // The function return mantissa with the implicit bit set iff the current + // Returns the mantissa with the implicit bit set iff the current // value is a valid normal number. LIBC_INLINE constexpr StorageType get_explicit_mantissa() { - if (UP::is_subnormal()) + if (is_subnormal()) return sig_bits(); return (StorageType(1) << UP::SIG_LEN) | sig_bits(); } }; // Specialization for the X86 Extended Precision type. -template <> -struct FPRep : public FPRepBase { - using UP = FPRepBase; +template +struct FPRepSem + : public FPStorage { + using UP = FPStorage; using typename UP::StorageType; using UP::FRACTION_LEN; using UP::FRACTION_MASK; - using UP::MANTISSA_PRECISION; - -protected: - using typename UP::BiasedExponent; - using typename UP::Significand; - using UP::encode; -public: // The x86 80 bit float represents the leading digit of the mantissa // explicitly. This is the mask for that bit. 
static constexpr StorageType EXPLICIT_BIT_MASK = StorageType(1) @@ -515,6 +421,45 @@ struct FPRep : public FPRepBase { "the explicit bit and the fractional part should cover the " "whole significand"); +protected: + using BiasedExp = typename UP::BiasedExponent; + using Sig = typename UP::Significand; + using UP::encode; + using UP::UP; + +public: + // Builders + LIBC_INLINE static constexpr RetT one(Sign sign = Sign::POS) { + return RetT(encode(sign, Exponent::ZERO(), Sig::MSB())); + } + LIBC_INLINE static constexpr RetT min_subnormal(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::LSB())); + } + LIBC_INLINE static constexpr RetT max_subnormal(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), + Sig::BITS_ALL_ONES() ^ Sig::MSB())); + } + LIBC_INLINE static constexpr RetT min_normal(Sign sign = Sign::POS) { + return RetT(encode(sign, Exponent::MIN(), Sig::MSB())); + } + LIBC_INLINE static constexpr RetT max_normal(Sign sign = Sign::POS) { + return RetT(encode(sign, Exponent::MAX(), Sig::BITS_ALL_ONES())); + } + LIBC_INLINE static constexpr RetT inf(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), Sig::MSB())); + } + LIBC_INLINE static constexpr RetT build_nan(Sign sign = Sign::POS, + StorageType v = 0) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), + Sig::MSB() | (v ? Sig(v) : (Sig::MSB() >> 2)))); + } + LIBC_INLINE static constexpr RetT build_quiet_nan(Sign sign = Sign::POS, + StorageType v = 0) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ONES(), + Sig::MSB() | (Sig::MSB() >> 1) | Sig(v))); + } + + // Observers LIBC_INLINE constexpr bool is_nan() const { // Most encoding forms from the table found in // https://en.wikipedia.org/wiki/Extended_precision#x86_extended_precision_format @@ -527,85 +472,183 @@ struct FPRep : public FPRepBase { // - Quiet Not a Number // - Unnormal // This can be reduced to the following logic: - if (exp_bits() == encode(BiasedExponent::BITS_ALL_ONES())) + if (exp_bits() == encode(BiasedExp::BITS_ALL_ONES())) return !is_inf(); - if (exp_bits() != encode(BiasedExponent::BITS_ALL_ZEROES())) - return (sig_bits() & encode(Significand::MSB())) == 0; + if (exp_bits() != encode(BiasedExp::BITS_ALL_ZEROES())) + return (sig_bits() & encode(Sig::MSB())) == 0; return false; } LIBC_INLINE constexpr bool is_quiet_nan() const { return exp_sig_bits() >= - encode(BiasedExponent::BITS_ALL_ONES(), - Significand::MSB() | (Significand::MSB() >> 1)); + encode(BiasedExp::BITS_ALL_ONES(), Sig::MSB() | (Sig::MSB() >> 1)); } LIBC_INLINE constexpr bool is_signaling_nan() const { return is_nan() && !is_quiet_nan(); } LIBC_INLINE constexpr bool is_inf() const { - return exp_sig_bits() == - encode(BiasedExponent::BITS_ALL_ONES(), Significand::MSB()); + return exp_sig_bits() == encode(BiasedExp::BITS_ALL_ONES(), Sig::MSB()); } LIBC_INLINE constexpr bool is_finite() const { return !is_inf() && !is_nan(); } + LIBC_INLINE + constexpr bool is_subnormal() const { + return exp_bits() == encode(BiasedExp::BITS_ALL_ZEROES()); + } LIBC_INLINE constexpr bool is_normal() const { const auto exp = exp_bits(); - if (exp == encode(BiasedExponent::BITS_ALL_ZEROES()) || - exp == encode(BiasedExponent::BITS_ALL_ONES())) + if (exp == encode(BiasedExp::BITS_ALL_ZEROES()) || + exp == encode(BiasedExp::BITS_ALL_ONES())) return false; return get_implicit_bit(); } + LIBC_INLINE constexpr StorageType get_explicit_mantissa() const { + return sig_bits(); + } + + // This functions is specific to FPRepSem. 
+ // TODO: Remove if possible. + LIBC_INLINE constexpr bool get_implicit_bit() const { + return static_cast(bits & EXPLICIT_BIT_MASK); + } + + // This functions is specific to FPRepSem. + // TODO: Remove if possible. + LIBC_INLINE constexpr void set_implicit_bit(bool implicitVal) { + if (get_implicit_bit() != implicitVal) + bits ^= EXPLICIT_BIT_MASK; + } +}; + +// 'FPRep' is the bottom of the class hierarchy that only deals with 'FPType'. +// The operations dealing with specific float semantics are implemented by +// 'FPRepSem' above and specialized when needed. +// +// The 'RetT' type is being propagated up to 'FPRepSem' so that the functions +// creating new values (Builders) can return the appropriate type. That is, when +// creating a value through 'FPBits' below the builder will return an 'FPBits' +// value: +// i.e., FPBits::zero() // returns an FPBits +// When we don't care about specific C++ floating point type we can use 'FPRep' +// directly and 'RetT' defaults to 'StorageType': +// i.e., FPRep::zero() // returns an 'uint32_t' +template ::StorageType> +struct FPRep : public FPRepSem { + using UP = FPRepSem; + using StorageType = typename UP::StorageType; + +protected: + using UP::bits; + using UP::encode; + using UP::exp_bits; + using UP::exp_sig_bits; + + using BiasedExp = typename UP::BiasedExponent; + using Sig = typename UP::Significand; - LIBC_INLINE static constexpr StorageType zero(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::ZERO()); + using UP::FP_MASK; + using UP::SIG_LEN; + +public: + using UP::EXP_BIAS; + using UP::EXP_MASK; + using UP::FRACTION_MASK; + using UP::SIGN_MASK; + + // Representation + LIBC_INLINE constexpr StorageType uintval() const { return bits & FP_MASK; } + LIBC_INLINE constexpr void set_uintval(StorageType value) { + bits = (value & FP_MASK); } - LIBC_INLINE static constexpr StorageType one(Sign sign = Sign::POS) { - return encode(sign, Exponent::ZERO(), Significand::MSB()); + + // Builders + LIBC_INLINE static constexpr RetT zero(Sign sign = Sign::POS) { + return RetT(encode(sign, BiasedExp::BITS_ALL_ZEROES(), Sig::ZERO())); } - LIBC_INLINE static constexpr StorageType - min_subnormal(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), Significand::LSB()); + using UP::build_nan; + using UP::build_quiet_nan; + using UP::inf; + using UP::max_normal; + using UP::max_subnormal; + using UP::min_normal; + using UP::min_subnormal; + using UP::one; + + // Modifiers + LIBC_INLINE constexpr RetT abs() const { + return RetT(bits & UP::EXP_SIG_MASK); } - LIBC_INLINE static constexpr StorageType - max_subnormal(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ZEROES(), - Significand::BITS_ALL_ONES() ^ Significand::MSB()); + + // Observers + using UP::get_explicit_mantissa; + LIBC_INLINE constexpr bool is_zero() const { return exp_sig_bits() == 0; } + LIBC_INLINE constexpr bool is_inf_or_nan() const { return !is_finite(); } + using UP::is_finite; + using UP::is_inf; + using UP::is_nan; + using UP::is_normal; + using UP::is_quiet_nan; + using UP::is_signaling_nan; + using UP::is_subnormal; + LIBC_INLINE constexpr bool is_neg() const { return sign().is_neg(); } + LIBC_INLINE constexpr bool is_pos() const { return sign().is_pos(); } + + // Parts + LIBC_INLINE constexpr Sign sign() const { + return (bits & SIGN_MASK) ? 
Sign::NEG : Sign::POS; } - LIBC_INLINE static constexpr StorageType min_normal(Sign sign = Sign::POS) { - return encode(sign, Exponent::MIN(), Significand::MSB()); + + LIBC_INLINE constexpr void set_sign(Sign signVal) { + if (sign() != signVal) + bits ^= SIGN_MASK; } - LIBC_INLINE static constexpr StorageType max_normal(Sign sign = Sign::POS) { - return encode(sign, Exponent::MAX(), Significand::BITS_ALL_ONES()); + + LIBC_INLINE constexpr uint16_t get_biased_exponent() const { + return uint16_t((bits & UP::EXP_MASK) >> UP::SIG_LEN); } - LIBC_INLINE static constexpr StorageType inf(Sign sign = Sign::POS) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), Significand::MSB()); + + LIBC_INLINE constexpr void set_biased_exponent(StorageType biased) { + bits = merge(bits, biased << SIG_LEN, EXP_MASK); } - LIBC_INLINE static constexpr StorageType build_nan(Sign sign = Sign::POS, - StorageType v = 0) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), - Significand::MSB() | - (v ? Significand(v) : (Significand::MSB() >> 2))); + + LIBC_INLINE constexpr int get_exponent() const { + return int(get_biased_exponent()) - EXP_BIAS; } - LIBC_INLINE static constexpr StorageType - build_quiet_nan(Sign sign = Sign::POS, StorageType v = 0) { - return encode(sign, BiasedExponent::BITS_ALL_ONES(), - Significand::MSB() | (Significand::MSB() >> 1) | - Significand(v)); + + // If the number is subnormal, the exponent is treated as if it were the + // minimum exponent for a normal number. This is to keep continuity between + // the normal and subnormal ranges, but it causes problems for functions where + // values are calculated from the exponent, since just subtracting the bias + // will give a slightly incorrect result. Additionally, zero has an exponent + // of zero, and that should actually be treated as zero. + LIBC_INLINE constexpr int get_explicit_exponent() const { + const int biased_exp = int(get_biased_exponent()); + if (is_zero()) { + return 0; + } else if (biased_exp == 0) { + return 1 - EXP_BIAS; + } else { + return biased_exp - EXP_BIAS; + } } - LIBC_INLINE constexpr StorageType get_explicit_mantissa() const { - return sig_bits(); + LIBC_INLINE constexpr StorageType get_mantissa() const { + return bits & FRACTION_MASK; } - // The following functions are specific to FPRep. - // TODO: Remove if possible. - LIBC_INLINE constexpr bool get_implicit_bit() const { - return static_cast(bits & EXPLICIT_BIT_MASK); + LIBC_INLINE constexpr void set_mantissa(StorageType mantVal) { + bits = merge(bits, mantVal, FRACTION_MASK); } - LIBC_INLINE constexpr void set_implicit_bit(bool implicitVal) { - if (get_implicit_bit() != implicitVal) - bits ^= EXPLICIT_BIT_MASK; +private: + // Merge bits from 'a' and 'b' values according to 'mask'. + // Use 'a' bits when corresponding 'mask' bits are zeroes and 'b' bits when + // corresponding bits are ones. + LIBC_INLINE static constexpr StorageType merge(StorageType a, StorageType b, + StorageType mask) { + // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge + return a ^ ((a ^ b) & mask); } }; @@ -642,29 +685,31 @@ template LIBC_INLINE static constexpr FPType get_fp_type() { static_assert(cpp::always_false, "Unsupported type"); } -// A generic class to represent floating point formats. -// On most platforms, the 'float' type corresponds to single precision -// floating point numbers, the 'double' type corresponds to double precision -// floating point numers, and the 'long double' type corresponds to the quad -// precision floating numbers. 
On x86 platforms however, the 'long double' -// type maps to an x87 floating point format. -template struct FPBits : public internal::FPRep()> { +// A generic class to manipulate floating point formats. +// It derives most of its functionality to FPRep above. +template +struct FPBits final : public internal::FPRep(), FPBits> { static_assert(cpp::is_floating_point_v, "FPBits instantiated with invalid type."); - using UP = internal::FPRep()>; + using UP = internal::FPRep(), FPBits>; using Rep = UP; using StorageType = typename UP::StorageType; using UP::bits; - using UP::EXP_LEN; - using UP::UP; // Constants. - static constexpr int MAX_BIASED_EXPONENT = (1 << EXP_LEN) - 1; - static constexpr StorageType MIN_NORMAL = UP::min_normal(Sign::POS); - static constexpr StorageType MAX_NORMAL = UP::max_normal(Sign::POS); - static constexpr StorageType MIN_SUBNORMAL = UP::min_subnormal(Sign::POS); - static constexpr StorageType MAX_SUBNORMAL = UP::max_subnormal(Sign::POS); + LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION = + UP::FRACTION_LEN + 1; + LIBC_INLINE_VAR static constexpr StorageType MIN_NORMAL = + UP::min_normal(Sign::POS).uintval(); + LIBC_INLINE_VAR static constexpr StorageType MAX_NORMAL = + UP::max_normal(Sign::POS).uintval(); + LIBC_INLINE_VAR static constexpr StorageType MIN_SUBNORMAL = + UP::min_subnormal(Sign::POS).uintval(); + LIBC_INLINE_VAR static constexpr StorageType MAX_SUBNORMAL = + UP::max_subnormal(Sign::POS).uintval(); + LIBC_INLINE_VAR static constexpr int MAX_BIASED_EXPONENT = + (1 << UP::EXP_LEN) - 1; // Constructors. LIBC_INLINE constexpr FPBits() = default; @@ -686,49 +731,35 @@ template struct FPBits : public internal::FPRep()> { LIBC_INLINE constexpr explicit operator T() const { return get_val(); } - LIBC_INLINE constexpr bool is_inf_or_nan() const { return !UP::is_finite(); } - - LIBC_INLINE constexpr FPBits abs() const { - return FPBits(bits & UP::EXP_SIG_MASK); - } - // Methods below this are used by tests. - + // TODO: inline and remove. LIBC_INLINE static constexpr T one(Sign sign = Sign::POS) { - return FPBits(UP::one(sign)).get_val(); + return T(UP::one(sign)); } - LIBC_INLINE static constexpr T zero(Sign sign = Sign::POS) { - return FPBits(UP::zero(sign)).get_val(); + return T(UP::zero(sign)); } - LIBC_INLINE static constexpr T inf(Sign sign = Sign::POS) { - return FPBits(UP::inf(sign)).get_val(); + return T(UP::inf(sign)); } - LIBC_INLINE static constexpr T min_normal() { - return FPBits(UP::min_normal(Sign::POS)).get_val(); + return T(UP::min_normal(Sign::POS)); } - LIBC_INLINE static constexpr T max_normal() { - return FPBits(UP::max_normal(Sign::POS)).get_val(); + return T(UP::max_normal(Sign::POS)); } - LIBC_INLINE static constexpr T min_denormal() { - return FPBits(UP::min_subnormal(Sign::POS)).get_val(); + return T(UP::min_subnormal(Sign::POS)); } - LIBC_INLINE static constexpr T max_denormal() { - return FPBits(UP::max_subnormal(Sign::POS)).get_val(); + return T(UP::max_subnormal(Sign::POS)); } - LIBC_INLINE static constexpr T build_nan(StorageType v) { - return FPBits(UP::build_nan(Sign::POS, v)).get_val(); + return T(UP::build_nan(Sign::POS, v)); } - LIBC_INLINE static constexpr T build_quiet_nan(StorageType v, Sign sign = Sign::POS) { - return FPBits(UP::build_quiet_nan(sign, v)).get_val(); + return T(UP::build_quiet_nan(sign, v)); } // TODO: Use an uint32_t for 'biased_exp'. 
@@ -757,7 +788,7 @@ template struct FPBits : public internal::FPRep()> { "This function is not tested for X86 Extended Precision"); FPBits result; // offset: +1 for sign, but -1 for implicit first bit - int lz = cpp::countl_zero(number) - EXP_LEN; + int lz = cpp::countl_zero(number) - UP::EXP_LEN; number <<= lz; ep -= lz; diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp index f0b155085dabf..e6b6d7d9ec780 100644 --- a/libc/test/src/__support/FPUtil/fpbits_test.cpp +++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp @@ -17,69 +17,72 @@ TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary16) { using LIBC_NAMESPACE::fputil::FPType; using LIBC_NAMESPACE::fputil::internal::FPRep; using Rep = FPRep; - using u16 = uint16_t; - - EXPECT_EQ(u16(0b0'00000'0000000000), Rep::zero()); - EXPECT_EQ(u16(0b0'01111'0000000000), Rep::one()); - EXPECT_EQ(u16(0b0'00000'0000000001), Rep::min_subnormal()); - EXPECT_EQ(u16(0b0'00000'1111111111), Rep::max_subnormal()); - EXPECT_EQ(u16(0b0'00001'0000000000), Rep::min_normal()); - EXPECT_EQ(u16(0b0'11110'1111111111), Rep::max_normal()); - EXPECT_EQ(u16(0b0'11111'0000000000), Rep::inf()); - EXPECT_EQ(u16(0b0'11111'0100000000), Rep::build_nan()); - EXPECT_EQ(u16(0b0'11111'1000000000), Rep::build_quiet_nan()); + using u16 = typename Rep::StorageType; + + EXPECT_EQ(u16(0b0'00000'0000000000), u16(Rep::zero())); + EXPECT_EQ(u16(0b0'01111'0000000000), u16(Rep::one())); + EXPECT_EQ(u16(0b0'00000'0000000001), u16(Rep::min_subnormal())); + EXPECT_EQ(u16(0b0'00000'1111111111), u16(Rep::max_subnormal())); + EXPECT_EQ(u16(0b0'00001'0000000000), u16(Rep::min_normal())); + EXPECT_EQ(u16(0b0'11110'1111111111), u16(Rep::max_normal())); + EXPECT_EQ(u16(0b0'11111'0000000000), u16(Rep::inf())); + EXPECT_EQ(u16(0b0'11111'0100000000), u16(Rep::build_nan())); + EXPECT_EQ(u16(0b0'11111'1000000000), u16(Rep::build_quiet_nan())); } TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary32) { using LIBC_NAMESPACE::fputil::FPType; using LIBC_NAMESPACE::fputil::internal::FPRep; using Rep = FPRep; - using u32 = uint32_t; - - EXPECT_EQ(u32(0b0'00000000'00000000000000000000000), Rep::zero()); - EXPECT_EQ(u32(0b0'01111111'00000000000000000000000), Rep::one()); - EXPECT_EQ(u32(0b0'00000000'00000000000000000000001), Rep::min_subnormal()); - EXPECT_EQ(u32(0b0'00000000'11111111111111111111111), Rep::max_subnormal()); - EXPECT_EQ(u32(0b0'00000001'00000000000000000000000), Rep::min_normal()); - EXPECT_EQ(u32(0b0'11111110'11111111111111111111111), Rep::max_normal()); - EXPECT_EQ(u32(0b0'11111111'00000000000000000000000), Rep::inf()); - EXPECT_EQ(u32(0b0'11111111'01000000000000000000000), Rep::build_nan()); - EXPECT_EQ(u32(0b0'11111111'10000000000000000000000), Rep::build_quiet_nan()); + using u32 = typename Rep::StorageType; + + EXPECT_EQ(u32(0b0'00000000'00000000000000000000000), u32(Rep::zero())); + EXPECT_EQ(u32(0b0'01111111'00000000000000000000000), u32(Rep::one())); + EXPECT_EQ(u32(0b0'00000000'00000000000000000000001), + u32(Rep::min_subnormal())); + EXPECT_EQ(u32(0b0'00000000'11111111111111111111111), + u32(Rep::max_subnormal())); + EXPECT_EQ(u32(0b0'00000001'00000000000000000000000), u32(Rep::min_normal())); + EXPECT_EQ(u32(0b0'11111110'11111111111111111111111), u32(Rep::max_normal())); + EXPECT_EQ(u32(0b0'11111111'00000000000000000000000), u32(Rep::inf())); + EXPECT_EQ(u32(0b0'11111111'01000000000000000000000), u32(Rep::build_nan())); + EXPECT_EQ(u32(0b0'11111111'10000000000000000000000), + u32(Rep::build_quiet_nan())); } TEST(LlvmLibcFPBitsTest, 
FPType_IEEE754_Binary64) { using LIBC_NAMESPACE::fputil::FPType; using LIBC_NAMESPACE::fputil::internal::FPRep; using Rep = FPRep; - using u64 = uint64_t; + using u64 = typename Rep::StorageType; EXPECT_EQ( u64(0b0'00000000000'0000000000000000000000000000000000000000000000000000), - Rep::zero()); + u64(Rep::zero())); EXPECT_EQ( u64(0b0'01111111111'0000000000000000000000000000000000000000000000000000), - Rep::one()); + u64(Rep::one())); EXPECT_EQ( u64(0b0'00000000000'0000000000000000000000000000000000000000000000000001), - Rep::min_subnormal()); + u64(Rep::min_subnormal())); EXPECT_EQ( u64(0b0'00000000000'1111111111111111111111111111111111111111111111111111), - Rep::max_subnormal()); + u64(Rep::max_subnormal())); EXPECT_EQ( u64(0b0'00000000001'0000000000000000000000000000000000000000000000000000), - Rep::min_normal()); + u64(Rep::min_normal())); EXPECT_EQ( u64(0b0'11111111110'1111111111111111111111111111111111111111111111111111), - Rep::max_normal()); + u64(Rep::max_normal())); EXPECT_EQ( u64(0b0'11111111111'0000000000000000000000000000000000000000000000000000), - Rep::inf()); + u64(Rep::inf())); EXPECT_EQ( u64(0b0'11111111111'0100000000000000000000000000000000000000000000000000), - Rep::build_nan()); + u64(Rep::build_nan())); EXPECT_EQ( u64(0b0'11111111111'1000000000000000000000000000000000000000000000000000), - Rep::build_quiet_nan()); + u64(Rep::build_quiet_nan())); } static constexpr UInt128 u128(uint64_t hi, uint64_t lo) { @@ -90,6 +93,49 @@ static constexpr UInt128 u128(uint64_t hi, uint64_t lo) { #endif } +TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary128) { + using LIBC_NAMESPACE::fputil::FPType; + using LIBC_NAMESPACE::fputil::internal::FPRep; + using Rep = FPRep; + + EXPECT_EQ( + u128(0b0'000000000000000'000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::zero())); + EXPECT_EQ( + u128(0b0'011111111111111'000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::one())); + EXPECT_EQ( + u128(0b0'000000000000000'000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000001), + UInt128(Rep::min_subnormal())); + EXPECT_EQ( + u128(0b0'000000000000000'111111111111111111111111111111111111111111111111, + 0b1111111111111111111111111111111111111111111111111111111111111111), + UInt128(Rep::max_subnormal())); + EXPECT_EQ( + u128(0b0'000000000000001'000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::min_normal())); + EXPECT_EQ( + u128(0b0'111111111111110'111111111111111111111111111111111111111111111111, + 0b1111111111111111111111111111111111111111111111111111111111111111), + UInt128(Rep::max_normal())); + EXPECT_EQ( + u128(0b0'111111111111111'000000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::inf())); + EXPECT_EQ( + u128(0b0'111111111111111'010000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::build_nan())); + EXPECT_EQ( + u128(0b0'111111111111111'100000000000000000000000000000000000000000000000, + 0b0000000000000000000000000000000000000000000000000000000000000000), + UInt128(Rep::build_quiet_nan())); +} + TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80) { using LIBC_NAMESPACE::fputil::FPType; using 
LIBC_NAMESPACE::fputil::internal::FPRep; @@ -98,39 +144,39 @@ TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80) { EXPECT_EQ( u128(0b0'000000000000000, 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::zero()); + UInt128(Rep::zero())); EXPECT_EQ( u128(0b0'011111111111111, 0b1000000000000000000000000000000000000000000000000000000000000000), - Rep::one()); + UInt128(Rep::one())); EXPECT_EQ( u128(0b0'000000000000000, 0b0000000000000000000000000000000000000000000000000000000000000001), - Rep::min_subnormal()); + UInt128(Rep::min_subnormal())); EXPECT_EQ( u128(0b0'000000000000000, 0b0111111111111111111111111111111111111111111111111111111111111111), - Rep::max_subnormal()); + UInt128(Rep::max_subnormal())); EXPECT_EQ( u128(0b0'000000000000001, 0b1000000000000000000000000000000000000000000000000000000000000000), - Rep::min_normal()); + UInt128(Rep::min_normal())); EXPECT_EQ( u128(0b0'111111111111110, 0b1111111111111111111111111111111111111111111111111111111111111111), - Rep::max_normal()); + UInt128(Rep::max_normal())); EXPECT_EQ( u128(0b0'111111111111111, 0b1000000000000000000000000000000000000000000000000000000000000000), - Rep::inf()); + UInt128(Rep::inf())); EXPECT_EQ( u128(0b0'111111111111111, 0b1010000000000000000000000000000000000000000000000000000000000000), - Rep::build_nan()); + UInt128(Rep::build_nan())); EXPECT_EQ( u128(0b0'111111111111111, 0b1100000000000000000000000000000000000000000000000000000000000000), - Rep::build_quiet_nan()); + UInt128(Rep::build_quiet_nan())); } TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80_IsNan) { @@ -183,49 +229,6 @@ TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80_IsNan) { 0b1000000000000000000000000000000000000000000000000000000000000000)); } -TEST(LlvmLibcFPBitsTest, FPType_IEEE754_Binary128) { - using LIBC_NAMESPACE::fputil::FPType; - using LIBC_NAMESPACE::fputil::internal::FPRep; - using Rep = FPRep; - - EXPECT_EQ( - u128(0b0'000000000000000'000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::zero()); - EXPECT_EQ( - u128(0b0'011111111111111'000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::one()); - EXPECT_EQ( - u128(0b0'000000000000000'000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000001), - Rep::min_subnormal()); - EXPECT_EQ( - u128(0b0'000000000000000'111111111111111111111111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111111111111111), - Rep::max_subnormal()); - EXPECT_EQ( - u128(0b0'000000000000001'000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::min_normal()); - EXPECT_EQ( - u128(0b0'111111111111110'111111111111111111111111111111111111111111111111, - 0b1111111111111111111111111111111111111111111111111111111111111111), - Rep::max_normal()); - EXPECT_EQ( - u128(0b0'111111111111111'000000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::inf()); - EXPECT_EQ( - u128(0b0'111111111111111'010000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::build_nan()); - EXPECT_EQ( - u128(0b0'111111111111111'100000000000000000000000000000000000000000000000, - 0b0000000000000000000000000000000000000000000000000000000000000000), - Rep::build_quiet_nan()); -} - 
TEST(LlvmLibcFPBitsTest, FloatType) { using FloatBits = FPBits; From 70823fe4871eb6ca3fe8dc8264ac005c8edbfe70 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 22 Jan 2024 09:22:08 -0500 Subject: [PATCH 392/843] [libc++][NFC] Fix incorrect formatting of release notes --- libcxx/docs/ReleaseNotes/18.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index c619f94fd0035..237a63022d55f 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -148,11 +148,11 @@ Deprecations and Removals macro is provided to restore the previous behavior, and it will be supported in the LLVM 18 release only. In LLVM 19 and beyond, ``_LIBCPP_ENABLE_NARROWING_CONVERSIONS_IN_VARIANT`` will not be honored anymore. -- Overriding `__libcpp_verbose_abort` no longer has any effect on library assertions. The only supported way +- Overriding ``__libcpp_verbose_abort`` no longer has any effect on library assertions. The only supported way to customize the assertion handler that gets invoked when a hardening assertion fails is now by setting the ``LIBCXX_ASSERTION_HANDLER_FILE`` CMake variable and providing a custom header. See the documentation on - overriding the default assertion handler for details. The ability to override `__libcpp_verbose_abort` will - be removed in an upcoming release in favor of the new overriding mechanism. + overriding the default assertion handler for details. The ability to override ``__libcpp_verbose_abort`` + will be removed in an upcoming release in favor of the new overriding mechanism. - In safe mode (which is now equivalent to the ``extensive`` hardening mode), a failed assertion will now generate a trap rather than a call to verbose abort. From 52a8bed426e59d10b41a12208d3ba55cd2c76a1f Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Mon, 22 Jan 2024 14:25:08 +0000 Subject: [PATCH 393/843] [DebugInfo][RemoveDIs] Adjust AMDGPU passes to work with DPValues (#78736) This patch tweaks two AMDGPU passes to use iterators rather than instruction pointers for expressing an insertion point. This is needed to accurately support DPValues, the non-instruction storage object for debug-info. Two tests were sensitive to this change (variable assignments were being put in the wrong place), and I've added extra run-lines with the "try new debug-info..." flag. These get tested on our public buildbot to ensure they continue to work accurately. 
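For illustration, here is a minimal, hedged sketch of the idiom this patch
applies; it is not code from the patch and the function name is invented. The
key point is that a BasicBlock::iterator carries the extra "insert before or
after attached debug records" state used by the RemoveDIs/DPValue
representation, which is lost as soon as the iterator is dereferenced to a raw
Instruction*.

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Sketch only: position an IRBuilder with an iterator, not a pointer.
    void positionBuilder(BasicBlock &BB) {
      // Old style -- dereferencing drops the debug-record placement bit:
      //   IRBuilder<> Builder(&*BB.getFirstInsertionPt());
      // New style -- the iterator keeps DPValue placement intact:
      BasicBlock::iterator IP = BB.getFirstInsertionPt();
      IRBuilder<> Builder(&BB, IP);
      // Instructions created through Builder now land at IP on the
      // correct side of any attached debug records.
      Builder.CreateAlloca(Builder.getInt32Ty());
    }

The same pattern appears in the AMDGPULowerKernelArguments and
SIAnnotateControlFlow changes below.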
--- llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 2 +- llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 7 ++++--- llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll | 3 +++ llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll | 1 + 4 files changed, 9 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index 1c514ffa76c9d..015c71080d670 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -106,7 +106,7 @@ static bool lowerKernelArguments(Function &F, const TargetMachine &TM) { LLVMContext &Ctx = F.getParent()->getContext(); const DataLayout &DL = F.getParent()->getDataLayout(); BasicBlock &EntryBlock = *F.begin(); - IRBuilder<> Builder(&*getInsertPt(EntryBlock)); + IRBuilder<> Builder(&EntryBlock, getInsertPt(EntryBlock)); const Align KernArgBaseAlign(16); // FIXME: Increase if necessary const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(); diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index 932c0d6216ced..c921e5a35d2da 100644 --- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -329,15 +329,16 @@ bool SIAnnotateControlFlow::closeControlFlow(BasicBlock *BB) { } Value *Exec = popSaved(); - Instruction *FirstInsertionPt = &*BB->getFirstInsertionPt(); + BasicBlock::iterator FirstInsertionPt = BB->getFirstInsertionPt(); if (!isa(Exec) && !isa(FirstInsertionPt)) { Instruction *ExecDef = cast(Exec); BasicBlock *DefBB = ExecDef->getParent(); if (!DT->dominates(DefBB, BB)) { // Split edge to make Def dominate Use - FirstInsertionPt = &*SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt(); + FirstInsertionPt = SplitEdge(DefBB, BB, DT, LI)->getFirstInsertionPt(); } - IRBuilder<>(FirstInsertionPt).CreateCall(EndCf, {Exec}); + IRBuilder<>(FirstInsertionPt->getParent(), FirstInsertionPt) + .CreateCall(EndCf, {Exec}); } return true; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll index 7414ec9e254b2..a496e1c3ae6f7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.dbg.value.ll @@ -1,6 +1,9 @@ ; RUN: llc -O0 -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,NOOPT %s ; RUN: llc -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,OPT %s +; RUN: llc -O0 -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s --try-experimental-debuginfo-iterators | FileCheck -check-prefixes=GCN,NOOPT %s +; RUN: llc -mtriple=amdgcn-unknown-amdhsa -verify-machineinstrs < %s --try-experimental-debuginfo-iterators | FileCheck -check-prefixes=GCN,OPT %s + ; GCN-LABEL: {{^}}test_debug_value: ; NOOPT: .loc 1 1 42 prologue_end ; /tmp/test_debug_value.cl:1:42 ; NOOPT-NEXT: s_load_dwordx2 s[4:5], s[6:7], 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll index 215c324f1623a..a7af02017001f 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-dbg-info.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow %s | FileCheck -check-prefix=OPT %s +; RUN: opt -mtriple=amdgcn-- -S -structurizecfg -si-annotate-control-flow 
%s --try-experimental-debuginfo-iterators | FileCheck -check-prefix=OPT %s
 
 define amdgpu_ps i32 @if_else(i32 %0) !dbg !5 {
 ; OPT-LABEL: define amdgpu_ps i32 @if_else(

From 75b0c913a5fa3895818a7c7a066c0a8e1a214329 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski
Date: Mon, 22 Jan 2024 14:21:21 +0000
Subject: [PATCH 394/843] [mlir][nfc] Update comments

1. Updates and clarifies a few comments related to hooks for
vector.{insert|extract}_strided_slice.

2. For consistency with vector.insert_strided_slice, removes a TODO from
vector.extract_strided_slice Op def. It's self-explanatory that adding
support for non-unit strides is a "TODO".

---
 mlir/include/mlir/Dialect/Vector/IR/VectorOps.td |  1 -
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp         | 16 +++++++++-------
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index 8e333def33869..fdf51f0173511 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -1121,7 +1121,6 @@ def Vector_ExtractStridedSliceOp :
     attribute and extracts the n-D subvector at the proper offset.
 
     At the moment strides must contain only 1s.
-    // TODO: support non-1 strides.
 
     Returns an n-D vector where the first k-D dimensions match the `sizes`
     attribute. The returned subvector contains the elements starting at offset
diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 749eb56b3d3be..791924f92e8ad 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -2789,9 +2789,11 @@ isIntegerArrayAttrConfinedToShape(OpType op, ArrayAttr arrayAttr,
   return success();
 }
 
-// Returns true if all integers in `arrayAttr` are in the interval [min, max}.
-// interval. If `halfOpen` is true then the admissible interval is [min, max).
-// Otherwise, the admissible interval is [min, max].
+// Returns true if, for all indices i = 0..shape.size()-1, val is in the
+// [min, max} interval:
+//   val = `arrayAttr1[i]` + `arrayAttr2[i]`,
+// If `halfOpen` is true then the admissible interval is [min, max). Otherwise,
+// the admissible interval is [min, max].
template static LogicalResult isSumOfIntegerArrayAttrConfinedToShape( OpType op, ArrayAttr arrayAttr1, ArrayAttr arrayAttr2, @@ -2845,8 +2847,8 @@ LogicalResult InsertStridedSliceOp::verify() { auto stridesName = InsertStridedSliceOp::getStridesAttrName(); if (failed(isIntegerArrayAttrConfinedToShape(*this, offsets, destShape, offName)) || - failed(isIntegerArrayAttrConfinedToRange(*this, strides, 1, 1, - stridesName, + failed(isIntegerArrayAttrConfinedToRange(*this, strides, /*min=*/1, + /*max=*/1, stridesName, /*halfOpen=*/false)) || failed(isSumOfIntegerArrayAttrConfinedToShape( *this, offsets, @@ -3250,8 +3252,8 @@ LogicalResult ExtractStridedSliceOp::verify() { failed(isIntegerArrayAttrConfinedToShape(*this, sizes, shape, sizesName, /*halfOpen=*/false, /*min=*/1)) || - failed(isIntegerArrayAttrConfinedToRange(*this, strides, 1, 1, - stridesName, + failed(isIntegerArrayAttrConfinedToRange(*this, strides, /*min=*/1, + /*max=*/1, stridesName, /*halfOpen=*/false)) || failed(isSumOfIntegerArrayAttrConfinedToShape(*this, offsets, sizes, shape, offName, sizesName, From 90af11ea97cf9ab34677720c5b7848eb5e9d4efe Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 22 Jan 2024 09:34:47 -0500 Subject: [PATCH 395/843] [libc++] Make sure to publish release notes at ReleaseNotes.html In LLVM 17, we switched to numbered RST files for release notes, which makes it easier to deal with cherry-picks around release points. However, we stopped publishing `libcxx/docs/ReleaseNotes.html`, which was referenced by external sites. This patch ensures that we keep publishing `ReleaseNotes.html` by simply including the versioned RST file in the unversioned RST file. Fixes #77955 --- libcxx/docs/ReleaseNotes.rst | 4 ++++ libcxx/docs/index.rst | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) create mode 100644 libcxx/docs/ReleaseNotes.rst diff --git a/libcxx/docs/ReleaseNotes.rst b/libcxx/docs/ReleaseNotes.rst new file mode 100644 index 0000000000000..5c2c6114beb56 --- /dev/null +++ b/libcxx/docs/ReleaseNotes.rst @@ -0,0 +1,4 @@ +.. include:: ReleaseNotes/18.rst + +.. The release notes are in versioned files, but we make sure to keep publishing +.. them in an unversioned ReleaseNotes.html page for external sites to reference. diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index 066d8d87e5526..ab7472071ebb9 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -34,7 +34,7 @@ Getting Started with libc++ .. toctree:: :maxdepth: 1 - ReleaseNotes/18 + ReleaseNotes UsingLibcxx BuildingLibcxx TestingLibcxx From 55cb52bbc539777aeb176f22fef0b4bfc70883fc Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Mon, 22 Jan 2024 08:41:07 -0600 Subject: [PATCH 396/843] [Flang][OpenMP] Restructure recursive lowering in `createBodyOfOp` (#77761) This brings `createBodyOfOp` to its final intended form. First, input privatization is performed, then the recursive lowering takes place, and finally the output privatization (lastprivate) is done. This enables fixing a known issue with infinite loops inside of an OpenMP region, and the fix is included in this patch. Fixes https://github.com/llvm/llvm-project/issues/74348. 
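The structurally interesting step is the construction of a unique exit block
for the op's region: every unterminated block is funneled into one shared
block, and when no such block exists (an infinite loop), the lastprivate
copy-out is simply skipped. Below is a hedged LLVM-IR analogue of that helper;
the patch itself operates on MLIR regions with fir/omp terminators, so the
types and names here are assumptions made only for the sketch.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Sketch: funnel every open (unterminated) block into one shared exit,
    // giving finalization code such as lastprivate stores a single home.
    // Returns nullptr when the body never falls through (infinite loop).
    BasicBlock *getUniqueExit(Function &F) {
      SmallVector<BasicBlock *> Exits;
      for (BasicBlock &BB : F)
        if (BB.empty() || !BB.back().isTerminator())
          Exits.push_back(&BB);
      if (Exits.empty())
        return nullptr;  // no fall-through: nowhere to run finalization
      if (Exits.size() == 1)
        return Exits[0]; // already unique; do not create another block
      BasicBlock *Exit = BasicBlock::Create(F.getContext(), "exit", &F);
      IRBuilder<> Builder(F.getContext());
      for (BasicBlock *BB : Exits) {
        Builder.SetInsertPoint(BB); // appends at the end of the open block
        Builder.CreateBr(Exit);
      }
      return Exit;
    }

With such an exit block in hand, the terminator and the privatization
epilogue can be emitted in one place, matching the ordering described above.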
Recursive lowering [5/5] --------- Co-authored-by: Kiran Chandramohan --- flang/include/flang/Lower/OpenMP.h | 4 +- flang/lib/Lower/OpenMP.cpp | 144 +++++++++++++----- flang/test/Lower/OpenMP/FIR/sections.f90 | 6 +- .../OpenMP/infinite-loop-in-construct.f90 | 26 ++++ flang/test/Lower/OpenMP/sections.f90 | 6 +- .../test/Lower/OpenMP/wsloop-unstructured.f90 | 61 ++++++++ 6 files changed, 198 insertions(+), 49 deletions(-) create mode 100644 flang/test/Lower/OpenMP/infinite-loop-in-construct.f90 create mode 100644 flang/test/Lower/OpenMP/wsloop-unstructured.f90 diff --git a/flang/include/flang/Lower/OpenMP.h b/flang/include/flang/Lower/OpenMP.h index 872b7d50c3e7a..d2b7a423d7781 100644 --- a/flang/include/flang/Lower/OpenMP.h +++ b/flang/include/flang/Lower/OpenMP.h @@ -50,8 +50,8 @@ struct Variable; } // namespace pft // Generate the OpenMP terminator for Operation at Location. -void genOpenMPTerminator(fir::FirOpBuilder &, mlir::Operation *, - mlir::Location); +mlir::Operation *genOpenMPTerminator(fir::FirOpBuilder &, mlir::Operation *, + mlir::Location); void genOpenMPConstruct(AbstractConverter &, Fortran::lower::SymMap &, semantics::SemanticsContext &, pft::Evaluation &, diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index 7dd25f75d9eb7..3deeb6019b19b 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -27,6 +27,7 @@ #include "flang/Parser/parse-tree.h" #include "flang/Semantics/openmp-directive-sets.h" #include "flang/Semantics/tools.h" +#include "mlir/Dialect/ControlFlow/IR/ControlFlowOps.h" #include "mlir/Dialect/OpenMP/OpenMPDialect.h" #include "mlir/Dialect/SCF/IR/SCF.h" #include "mlir/Transforms/RegionUtils.h" @@ -385,7 +386,8 @@ void DataSharingProcessor::insertLastPrivateCompare(mlir::Operation *op) { // construct mlir::OpBuilder::InsertPoint unstructuredSectionsIP = firOpBuilder.saveInsertionPoint(); - firOpBuilder.setInsertionPointToStart(&op->getRegion(0).back()); + mlir::Operation *lastOper = op->getRegion(0).back().getTerminator(); + firOpBuilder.setInsertionPoint(lastOper); lastPrivIP = firOpBuilder.saveInsertionPoint(); firOpBuilder.restoreInsertionPoint(unstructuredSectionsIP); } @@ -2206,15 +2208,6 @@ static mlir::Type getLoopVarType(Fortran::lower::AbstractConverter &converter, return converter.getFirOpBuilder().getIntegerType(loopVarTypeSize); } -static void resetBeforeTerminator(fir::FirOpBuilder &firOpBuilder, - mlir::Operation *storeOp, - mlir::Block &block) { - if (storeOp) - firOpBuilder.setInsertionPointAfter(storeOp); - else - firOpBuilder.setInsertionPointToStart(&block); -} - static mlir::Operation * createAndSetPrivatizedLoopVar(Fortran::lower::AbstractConverter &converter, mlir::Location loc, mlir::Value indexVal, @@ -2257,11 +2250,17 @@ static void createBodyOfOp( const llvm::SmallVector &args = {}, bool outerCombined = false, DataSharingProcessor *dsp = nullptr) { fir::FirOpBuilder &firOpBuilder = converter.getFirOpBuilder(); + + auto insertMarker = [](fir::FirOpBuilder &builder) { + mlir::Value undef = builder.create(builder.getUnknownLoc(), + builder.getIndexType()); + return undef.getDefiningOp(); + }; + // If an argument for the region is provided then create the block with that // argument. Also update the symbol's address with the mlir argument value. // e.g. For loops the argument is the induction variable. And all further // uses of the induction variable should use this mlir value. 
- mlir::Operation *storeOp = nullptr; if (args.size()) { std::size_t loopVarTypeSize = 0; for (const Fortran::semantics::Symbol *arg : args) @@ -2272,20 +2271,20 @@ static void createBodyOfOp( firOpBuilder.createBlock(&op.getRegion(), {}, tiv, locs); // The argument is not currently in memory, so make a temporary for the // argument, and store it there, then bind that location to the argument. + mlir::Operation *storeOp = nullptr; for (auto [argIndex, argSymbol] : llvm::enumerate(args)) { mlir::Value indexVal = fir::getBase(op.getRegion().front().getArgument(argIndex)); storeOp = createAndSetPrivatizedLoopVar(converter, loc, indexVal, argSymbol); } + firOpBuilder.setInsertionPointAfter(storeOp); } else { firOpBuilder.createBlock(&op.getRegion()); } - // Set the insert for the terminator operation to go at the end of the - // block - this is either empty or the block with the stores above, - // the end of the block works for both. - mlir::Block &block = op.getRegion().back(); - firOpBuilder.setInsertionPointToEnd(&block); + + // Mark the earliest insertion point. + mlir::Operation *marker = insertMarker(firOpBuilder); // If it is an unstructured region and is not the outer region of a combined // construct, create empty blocks for all evaluations. @@ -2294,37 +2293,100 @@ static void createBodyOfOp( mlir::omp::YieldOp>( firOpBuilder, eval.getNestedEvaluations()); - // Insert the terminator. - Fortran::lower::genOpenMPTerminator(firOpBuilder, op.getOperation(), loc); - // Reset the insert point to before the terminator. - resetBeforeTerminator(firOpBuilder, storeOp, block); + // Start with privatization, so that the lowering of the nested + // code will use the right symbols. + constexpr bool isLoop = std::is_same_v || + std::is_same_v; + bool privatize = clauses && !outerCombined; - // Handle privatization. Do not privatize if this is the outer operation. - if (clauses && !outerCombined) { - constexpr bool isLoop = std::is_same_v || - std::is_same_v; + firOpBuilder.setInsertionPoint(marker); + std::optional tempDsp; + if (privatize) { if (!dsp) { - DataSharingProcessor proc(converter, *clauses, eval); - proc.processStep1(); - proc.processStep2(op, isLoop); - } else { - if (isLoop && args.size() > 0) - dsp->setLoopIV(converter.getSymbolAddress(*args[0])); - dsp->processStep2(op, isLoop); + tempDsp.emplace(converter, *clauses, eval); + tempDsp->processStep1(); } - - if (storeOp) - firOpBuilder.setInsertionPointAfter(storeOp); } if constexpr (std::is_same_v) { threadPrivatizeVars(converter, eval); - if (clauses) + if (clauses) { + firOpBuilder.setInsertionPoint(marker); ClauseProcessor(converter, *clauses).processCopyin(); + } } - if (genNested) + if (genNested) { + // genFIR(Evaluation&) tries to patch up unterminated blocks, causing + // a lot of trouble if the terminator generation is delayed past this + // point. Insert a temporary terminator here, then delete it. + firOpBuilder.setInsertionPointToEnd(&op.getRegion().back()); + auto *temp = Fortran::lower::genOpenMPTerminator(firOpBuilder, + op.getOperation(), loc); + firOpBuilder.setInsertionPointAfter(marker); genNestedEvaluations(converter, eval); + temp->erase(); + } + + // Get or create a unique exiting block from the given region, or + // return nullptr if there is no exiting block. + auto getUniqueExit = [&](mlir::Region ®ion) -> mlir::Block * { + // Find the blocks where the OMP terminator should go. In simple cases + // it is the single block in the operation's region. 
When the region + // is more complicated, especially with unstructured control flow, there + // may be multiple blocks, and some of them may have non-OMP terminators + // resulting from lowering of the code contained within the operation. + // All the remaining blocks are potential exit points from the op's region. + // + // Explicit control flow cannot exit any OpenMP region (other than via + // STOP), and that is enforced by semantic checks prior to lowering. STOP + // statements are lowered to a function call. + + // Collect unterminated blocks. + llvm::SmallVector exits; + for (mlir::Block &b : region) { + if (b.empty() || !b.back().hasTrait()) + exits.push_back(&b); + } + + if (exits.empty()) + return nullptr; + // If there already is a unique exiting block, do not create another one. + // Additionally, some ops (e.g. omp.sections) require only 1 block in + // its region. + if (exits.size() == 1) + return exits[0]; + mlir::Block *exit = firOpBuilder.createBlock(®ion); + for (mlir::Block *b : exits) { + firOpBuilder.setInsertionPointToEnd(b); + firOpBuilder.create(loc, exit); + } + return exit; + }; + + if (auto *exitBlock = getUniqueExit(op.getRegion())) { + firOpBuilder.setInsertionPointToEnd(exitBlock); + auto *term = Fortran::lower::genOpenMPTerminator(firOpBuilder, + op.getOperation(), loc); + // Only insert lastprivate code when there actually is an exit block. + // Such a block may not exist if the nested code produced an infinite + // loop (this may not make sense in production code, but a user could + // write that and we should handle it). + firOpBuilder.setInsertionPoint(term); + if (privatize) { + if (!dsp) { + assert(tempDsp.has_value()); + tempDsp->processStep2(op, isLoop); + } else { + if (isLoop && args.size() > 0) + dsp->setLoopIV(converter.getSymbolAddress(*args[0])); + dsp->processStep2(op, isLoop); + } + } + } + + firOpBuilder.setInsertionPointAfter(marker); + marker->erase(); } static void genBodyOfTargetDataOp( @@ -3756,14 +3818,14 @@ genOMP(Fortran::lower::AbstractConverter &converter, // Public functions //===----------------------------------------------------------------------===// -void Fortran::lower::genOpenMPTerminator(fir::FirOpBuilder &builder, - mlir::Operation *op, - mlir::Location loc) { +mlir::Operation *Fortran::lower::genOpenMPTerminator(fir::FirOpBuilder &builder, + mlir::Operation *op, + mlir::Location loc) { if (mlir::isa(op)) - builder.create(loc); + return builder.create(loc); else - builder.create(loc); + return builder.create(loc); } void Fortran::lower::genOpenMPConstruct( diff --git a/flang/test/Lower/OpenMP/FIR/sections.f90 b/flang/test/Lower/OpenMP/FIR/sections.f90 index 87d34e58321be..640ec34a05bc2 100644 --- a/flang/test/Lower/OpenMP/FIR/sections.f90 +++ b/flang/test/Lower/OpenMP/FIR/sections.f90 @@ -126,11 +126,11 @@ subroutine lastprivate() x = x * 10 !CHECK: omp.section { !CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"} -!CHECK: %[[true:.*]] = arith.constant true !CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref !CHECK: %[[const:.*]] = arith.constant 1 : i32 !CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32 !CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref +!CHECK: %[[true:.*]] = arith.constant true !CHECK: fir.if %[[true]] { !CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref !CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref @@ -163,11 +163,11 @@ subroutine lastprivate() !CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref !CHECK: fir.store 
%[[temp]] to %[[PRIVATE_X]] : !fir.ref !CHECK: omp.barrier -!CHECK: %[[true:.*]] = arith.constant true !CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref !CHECK: %[[const:.*]] = arith.constant 1 : i32 !CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32 !CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref +!CHECK: %[[true:.*]] = arith.constant true !CHECK: fir.if %true { !CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref !CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref @@ -200,11 +200,11 @@ subroutine lastprivate() !CHECK: %[[temp:.*]] = fir.load %[[X]] : !fir.ref !CHECK: fir.store %[[temp]] to %[[PRIVATE_X]] : !fir.ref !CHECK: omp.barrier -!CHECK: %[[true:.*]] = arith.constant true !CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref !CHECK: %[[const:.*]] = arith.constant 1 : i32 !CHECK: %[[result:.*]] = arith.addi %[[temp]], %[[const]] : i32 !CHECK: fir.store %[[result]] to %[[PRIVATE_X]] : !fir.ref +!CHECK: %[[true:.*]] = arith.constant true !CHECK: fir.if %true { !CHECK: %[[temp:.*]] = fir.load %[[PRIVATE_X]] : !fir.ref !CHECK: fir.store %[[temp]] to %[[X]] : !fir.ref diff --git a/flang/test/Lower/OpenMP/infinite-loop-in-construct.f90 b/flang/test/Lower/OpenMP/infinite-loop-in-construct.f90 new file mode 100644 index 0000000000000..16b400a231860 --- /dev/null +++ b/flang/test/Lower/OpenMP/infinite-loop-in-construct.f90 @@ -0,0 +1,26 @@ +! RUN: bbc -fopenmp -o - %s | FileCheck %s + +! Check that this test can be lowered successfully. +! See https://github.com/llvm/llvm-project/issues/74348 + +! CHECK-LABEL: func.func @_QPsb +! CHECK: omp.parallel +! CHECK: cf.cond_br %{{[0-9]+}}, ^bb1, ^bb2 +! CHECK-NEXT: ^bb1: // pred: ^bb0 +! CHECK: cf.br ^bb2 +! CHECK-NEXT: ^bb2: // 3 preds: ^bb0, ^bb1, ^bb2 +! CHECK-NEXT: cf.br ^bb2 +! 
CHECK-NEXT: } + +subroutine sb(ninter, numnod) + integer :: ninter, numnod + integer, dimension(:), allocatable :: indx_nm + + !$omp parallel + if (ninter>0) then + allocate(indx_nm(numnod)) + endif + 220 continue + goto 220 + !$omp end parallel +end subroutine diff --git a/flang/test/Lower/OpenMP/sections.f90 b/flang/test/Lower/OpenMP/sections.f90 index 16426d070d9a9..6bad688058d28 100644 --- a/flang/test/Lower/OpenMP/sections.f90 +++ b/flang/test/Lower/OpenMP/sections.f90 @@ -141,11 +141,11 @@ subroutine lastprivate() !CHECK: omp.section { !CHECK: %[[PRIVATE_X:.*]] = fir.alloca i32 {bindc_name = "x", pinned, uniq_name = "_QFlastprivateEx"} !CHECK: %[[PRIVATE_X_DECL:.*]]:2 = hlfir.declare %[[PRIVATE_X]] {uniq_name = "_QFlastprivateEx"} : (!fir.ref) -> (!fir.ref, !fir.ref) -!CHECK: %[[TRUE:.*]] = arith.constant true !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref !CHECK: %[[CONST:.*]] = arith.constant 1 : i32 !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref +!CHECK: %[[TRUE:.*]] = arith.constant true !CHECK: fir.if %[[TRUE]] { !CHECK: %[[TEMP1:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref !CHECK: hlfir.assign %[[TEMP1]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref @@ -180,11 +180,11 @@ subroutine lastprivate() !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref !CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref !CHECK: omp.barrier -!CHECK: %[[TRUE:.*]] = arith.constant true !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref !CHECK: %[[CONST:.*]] = arith.constant 1 : i32 !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref +!CHECK: %[[TRUE:.*]] = arith.constant true !CHECK: fir.if %[[TRUE]] { !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref !CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref @@ -219,11 +219,11 @@ subroutine lastprivate() !CHECK: %[[TEMP:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref !CHECK: hlfir.assign %[[TEMP]] to %[[PRIVATE_X_DECL]]#0 temporary_lhs : i32, !fir.ref !CHECK: omp.barrier -!CHECK: %[[TRUE:.*]] = arith.constant true !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref !CHECK: %[[CONST:.*]] = arith.constant 1 : i32 !CHECK: %[[RESULT:.*]] = arith.addi %[[TEMP]], %[[CONST]] : i32 !CHECK: hlfir.assign %[[RESULT]] to %[[PRIVATE_X_DECL]]#0 : i32, !fir.ref +!CHECK: %[[TRUE:.*]] = arith.constant true !CHECK: fir.if %[[TRUE]] { !CHECK: %[[TEMP:.*]] = fir.load %[[PRIVATE_X_DECL]]#0 : !fir.ref !CHECK: hlfir.assign %[[TEMP]] to %[[X_DECL]]#0 temporary_lhs : i32, !fir.ref diff --git a/flang/test/Lower/OpenMP/wsloop-unstructured.f90 b/flang/test/Lower/OpenMP/wsloop-unstructured.f90 new file mode 100644 index 0000000000000..7fe63a1fe607c --- /dev/null +++ b/flang/test/Lower/OpenMP/wsloop-unstructured.f90 @@ -0,0 +1,61 @@ +! RUN: bbc -emit-hlfir -fopenmp -o - %s | FileCheck %s + +subroutine sub(imax, jmax, x, y) + integer, intent(in) :: imax, jmax + real, intent(in), dimension(1:imax, 1:jmax) :: x, y + + integer :: i, j, ii + + ! collapse(2) is needed to reproduce the issue + !$omp parallel do collapse(2) + do j = 1, jmax + do i = 1, imax + do ii = 1, imax ! note that this loop is not collapsed + if (x(i,j) < y(ii,j)) then + ! exit needed to force unstructured control flow + exit + endif + enddo + enddo + enddo +end subroutine sub + +! 
this is testing that we don't crash generating code for this: in particular +! that all blocks are terminated + +! CHECK-LABEL: func.func @_QPsub( +! CHECK-SAME: %[[VAL_0:.*]]: !fir.ref {fir.bindc_name = "imax"}, +! CHECK-SAME: %[[VAL_1:.*]]: !fir.ref {fir.bindc_name = "jmax"}, +! CHECK-SAME: %[[VAL_2:.*]]: !fir.ref> {fir.bindc_name = "x"}, +! CHECK-SAME: %[[VAL_3:.*]]: !fir.ref> {fir.bindc_name = "y"}) { +! [...] +! CHECK: omp.wsloop for (%[[VAL_53:.*]], %[[VAL_54:.*]]) : i32 = ({{.*}}) to ({{.*}}) inclusive step ({{.*}}) { +! [...] +! CHECK: cf.br ^bb1 +! CHECK: ^bb1: +! CHECK: cf.br ^bb2 +! CHECK: ^bb2: +! [...] +! CHECK: cf.br ^bb3 +! CHECK: ^bb3: +! [...] +! CHECK: %[[VAL_63:.*]] = arith.cmpi sgt, %{{.*}}, %{{.*}} : i32 +! CHECK: cf.cond_br %[[VAL_63]], ^bb4, ^bb7 +! CHECK: ^bb4: +! [...] +! CHECK: %[[VAL_76:.*]] = arith.cmpf olt, %{{.*}}, %{{.*}} fastmath : f32 +! CHECK: cf.cond_br %[[VAL_76]], ^bb5, ^bb6 +! CHECK: ^bb5: +! CHECK: cf.br ^bb7 +! CHECK: ^bb6: +! [...] +! CHECK: cf.br ^bb3 +! CHECK: ^bb7: +! CHECK: omp.yield +! CHECK: } +! CHECK: omp.terminator +! CHECK: } +! CHECK: cf.br ^bb1 +! CHECK: ^bb1: +! CHECK: return +! CHECK: } From ce519b59b77551d1e9698de17367385a3dbf1352 Mon Sep 17 00:00:00 2001 From: Natalie Chouinard Date: Mon, 22 Jan 2024 09:54:54 -0500 Subject: [PATCH 397/843] [HLSL][SPIR-V] Add support -fspv-target-env opt (#78611) Add the -fspv-target-env option to the clang-dxc compatibility driver to specify the SPIR-V target environment, which is propagated to the target Triple. --- clang/include/clang/Driver/Options.td | 3 +++ clang/lib/Driver/Driver.cpp | 15 ++++++++++++++- clang/test/Driver/dxc_spirv.hlsl | 11 ++++++++++- 3 files changed, 27 insertions(+), 2 deletions(-) diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 95579d30cd293..f9e883e3e22de 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -8479,3 +8479,6 @@ def : Option<["/", "-"], "Qembed_debug", KIND_FLAG>, Group, HelpText<"Embed PDB in shader container (ignored)">; def spirv : DXCFlag<"spirv">, HelpText<"Generate SPIR-V code">; +def fspv_target_env_EQ : Joined<["-"], "fspv-target-env=">, Group, + HelpText<"Specify the target environment">, + Values<"vulkan1.2, vulkan1.3">; diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 10f7b99a59bd6..da27ca2d28e91 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -1310,10 +1310,23 @@ Compilation *Driver::BuildCompilation(ArrayRef ArgList) { A->claim(); - // TODO: Specify Vulkan target environment somewhere in the triple. if (Args.hasArg(options::OPT_spirv)) { llvm::Triple T(TargetTriple); T.setArch(llvm::Triple::spirv); + T.setOS(llvm::Triple::Vulkan); + + // Set specific Vulkan version if applicable. 
+ if (const Arg *A = Args.getLastArg(options::OPT_fspv_target_env_EQ)) { + const llvm::StringSet<> ValidValues = {"vulkan1.2", "vulkan1.3"}; + if (ValidValues.contains(A->getValue())) { + T.setOSName(A->getValue()); + } else { + Diag(diag::err_drv_invalid_value) + << A->getAsString(Args) << A->getValue(); + } + A->claim(); + } + TargetTriple = T.str(); } } else { diff --git a/clang/test/Driver/dxc_spirv.hlsl b/clang/test/Driver/dxc_spirv.hlsl index a3c5c30af06e3..c087ea4b0d709 100644 --- a/clang/test/Driver/dxc_spirv.hlsl +++ b/clang/test/Driver/dxc_spirv.hlsl @@ -1,4 +1,13 @@ // RUN: %clang_dxc -T cs_6_0 -spirv -### %s 2>&1 | FileCheck %s +// RUN: %clang_dxc -T cs_6_0 -spirv -fspv-target-env=vulkan1.2 -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-VULKAN12 +// RUN: %clang_dxc -T cs_6_0 -spirv -fspv-target-env=vulkan1.3 -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-VULKAN13 +// RUN: not %clang_dxc -T cs_6_0 -spirv -fspv-target-env=vulkan1.0 -### %s 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR -// CHECK: "-triple" "spirv-unknown-shadermodel6.0-compute" +// CHECK: "-triple" "spirv-unknown-vulkan-compute" // CHECK-SAME: "-x" "hlsl" + +// CHECK-VULKAN12: "-triple" "spirv-unknown-vulkan1.2-compute" + +// CHECK-VULKAN13: "-triple" "spirv-unknown-vulkan1.3-compute" + +// CHECK-ERROR: error: invalid value 'vulkan1.0' in '-fspv-target-env=vulkan1.0' From ebb853fbe5f67ab42d78de45bbcb7b32b6323fb1 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Mon, 22 Jan 2024 15:31:00 +0100 Subject: [PATCH 398/843] [ConstraintElim] Remove unused checkCondition() parameters (NFC) --- .../lib/Transforms/Scalar/ConstraintElimination.cpp | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 6fec54ac79224..8f09569d0d9cc 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -1314,9 +1314,7 @@ static void generateReproducer(CmpInst *Cond, Module *M, static std::optional checkCondition(CmpInst::Predicate Pred, Value *A, Value *B, Instruction *CheckInst, - ConstraintInfo &Info, unsigned NumIn, - unsigned NumOut, - Instruction *ContextInst) { + ConstraintInfo &Info) { LLVM_DEBUG(dbgs() << "Checking " << *CheckInst << "\n"); auto R = Info.getConstraintForSolving(Pred, A, B); @@ -1385,9 +1383,9 @@ static bool checkAndReplaceCondition( return true; }; - if (auto ImpliedCondition = checkCondition( - Cmp->getPredicate(), Cmp->getOperand(0), Cmp->getOperand(1), Cmp, - Info, NumIn, NumOut, ContextInst)) + if (auto ImpliedCondition = + checkCondition(Cmp->getPredicate(), Cmp->getOperand(0), + Cmp->getOperand(1), Cmp, Info)) return ReplaceCmpWithConstant(Cmp, *ImpliedCondition); return false; } @@ -1446,8 +1444,7 @@ static bool checkOrAndOpImpliedByOther( // Check if the second condition can be simplified now. if (auto ImpliedCondition = checkCondition(CmpToCheck->getPredicate(), CmpToCheck->getOperand(0), - CmpToCheck->getOperand(1), CmpToCheck, Info, CB.NumIn, - CB.NumOut, CB.getContextInst())) { + CmpToCheck->getOperand(1), CmpToCheck, Info)) { if (IsOr && isa(JoinOp)) { JoinOp->setOperand( OtherOpIdx == 0 ? 
2 : 0,

From ac3ee1b1aec424c60660fd245f5b53aaffa2f5b1 Mon Sep 17 00:00:00 2001
From: Jie Fu
Date: Mon, 22 Jan 2024 22:54:27 +0800
Subject: [PATCH 399/843] [Transforms] Fix -Wunused-variable and remove
 redundant VerifyStates after #75826 (NFC)

llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp:1064:18:
error: unused variable 'I' [-Werror,-Wunused-variable]
    Instruction *I = cast<Instruction>(Pair.first);
                 ^
llvm-project/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp:1066:11:
error: unused variable 'BaseValue' [-Werror,-Wunused-variable]
    auto *BaseValue = State.getBaseValue();
          ^
2 errors generated.
---
 llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index c09983702d6e2..45ce3bf3ceae2 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -1056,7 +1056,6 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache,
   for (const auto &Pair : States) {
     LLVM_DEBUG(dbgs() << " " << Pair.second << " for " << *Pair.first << "\n");
   }
-#endif
 
   // since we do the conflict marking as part of the fixpoint iteration this
   // loop only asserts that invariants are met
@@ -1072,9 +1071,6 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache,
       "why did it get added?");
     assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
   }
-
-#ifndef NDEBUG
-  VerifyStates();
 #endif
 
   // Insert Phis for all conflicts

From 3440466536c97ced20e366301a60f12f4fd01e30 Mon Sep 17 00:00:00 2001
From: carlobertolli
Date: Mon, 22 Jan 2024 08:56:51 -0600
Subject: [PATCH 400/843] [OpenMP] Fix two usm tests for amdgpus. (#78824)

Some tests were missing the HSA_XNACK=1 environment variable, which is
used to enable unified memory support on AMD GPUs when it has not been
set at kernel boot time. Other tests needed to be marked as supporting
unified_shared_memory in the lit test harness.
Extend the lit test harness to enable the unified_shared_memory
requirement for AMD GPUs.

Reland: #77851
---
 openmp/libomptarget/test/lit.cfg                          | 9 +++++++--
 openmp/libomptarget/test/lit.site.cfg.in                  | 1 +
 openmp/libomptarget/test/unified_shared_memory/api.c      | 7 ++++---
 .../test/unified_shared_memory/close_enter_exit.c         | 6 +++---
 4 files changed, 15 insertions(+), 8 deletions(-)

diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg
index 9078561e3198e..a0df7314fe974 100644
--- a/openmp/libomptarget/test/lit.cfg
+++ b/openmp/libomptarget/test/lit.cfg
@@ -116,8 +116,13 @@ if config.libomptarget_current_target.startswith('nvptx'):
     except ValueError:
         # If the architecture is invalid, assume it is supported.
         supports_unified_shared_memory = True
-if config.libomptarget_current_target.startswith('amdgcn'):
-    supports_unified_shared_memory = False
+elif config.libomptarget_current_target.startswith('amdgcn'):
+    # amdgpu_test_arch contains a list of AMD GPUs in the system
+    # only check the first one assuming that we will run the test on it.
+ if not (config.amdgpu_test_arch.startswith("gfx90a") or + config.amdgpu_test_arch.startswith("gfx940") or + config.amdgpu_test_arch.startswith("gfx942")): + supports_unified_shared_memory = False if supports_unified_shared_memory: config.available_features.add('unified_shared_memory') diff --git a/openmp/libomptarget/test/lit.site.cfg.in b/openmp/libomptarget/test/lit.site.cfg.in index 2d63811883872..7c75aaa18fa77 100644 --- a/openmp/libomptarget/test/lit.site.cfg.in +++ b/openmp/libomptarget/test/lit.site.cfg.in @@ -10,6 +10,7 @@ config.test_extra_flags = "@OPENMP_TEST_FLAGS@" config.cuda_path = "@CUDA_TOOLKIT_ROOT_DIR@" config.cuda_libdir = "@CUDA_LIBDIR@" config.cuda_test_arch = "@LIBOMPTARGET_DEP_CUDA_ARCH@" +config.amdgpu_test_arch = "@LIBOMPTARGET_AMDGPU_DETECTED_ARCH_LIST@" config.libomptarget_obj_root = "@CMAKE_CURRENT_BINARY_DIR@/@CURRENT_TARGET@" config.library_dir = "@LIBOMPTARGET_LIBRARY_DIR@" config.llvm_library_dir = "@LIBOMPTARGET_LLVM_LIBRARY_DIR@" diff --git a/openmp/libomptarget/test/unified_shared_memory/api.c b/openmp/libomptarget/test/unified_shared_memory/api.c index 4f0f93b9bf509..c7ab055abb515 100644 --- a/openmp/libomptarget/test/unified_shared_memory/api.c +++ b/openmp/libomptarget/test/unified_shared_memory/api.c @@ -1,9 +1,10 @@ -// RUN: %libomptarget-compile-run-and-check-generic +// RUN: %libomptarget-compile-generic +// RUN: env HSA_XNACK=1 \ +// RUN: %libomptarget-run-generic | %fcheck-generic // XFAIL: nvptx64-nvidia-cuda // XFAIL: nvptx64-nvidia-cuda-LTO -// Fails on amdgpu with error: GPU Memory Error -// UNSUPPORTED: amdgcn-amd-amdhsa +// REQUIRES: unified_shared_memory #include #include diff --git a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c index 71be4c9177b06..b116c652f47e9 100644 --- a/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c +++ b/openmp/libomptarget/test/unified_shared_memory/close_enter_exit.c @@ -1,11 +1,11 @@ -// RUN: %libomptarget-compile-run-and-check-generic +// RUN: %libomptarget-compile-generic +// RUN: env HSA_XNACK=1 \ +// RUN: %libomptarget-run-generic | %fcheck-generic // REQUIRES: unified_shared_memory // UNSUPPORTED: clang-6, clang-7, clang-8, clang-9 -// Fails on amdgpu with error: GPU Memory Error // Fails on nvptx with error: an illegal memory access was encountered -// XFAIL: amdgcn-amd-amdhsa // XFAIL: nvptx64-nvidia-cuda // XFAIL: nvptx64-nvidia-cuda-LTO From ff1cde5ba2da8b227e1ed4a4c47b636ca4fbe59e Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Mon, 22 Jan 2024 15:02:41 +0000 Subject: [PATCH 401/843] [AArch64] Add vec3 load/store tests with GEPs with const offsets. 
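The pattern under test is a <3 x i8> load or store whose address is a GEP
with a small constant byte offset. A minimal sketch of the shape the new
tests cover (distilled from the tests themselves; offsets 1 and 3 are used):

    %src.1 = getelementptr inbounds i8, ptr %src, i64 1
    %l = load <3 x i8>, ptr %src.1, align 1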
Extra tests for https://github.com/llvm/llvm-project/pull/78637 https://github.com/llvm/llvm-project/pull/78632 --- .../AArch64/vec3-loads-ext-trunc-stores.ll | 172 +++++++++++++++++- 1 file changed, 169 insertions(+), 3 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 9eeb194409df6..9040e9007c084 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -2,7 +2,7 @@ ; RUN: llc -mtriple=arm64-apple-macosx -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64_be -o - %s | FileCheck --check-prefix BE %s -define <16 x i8> @load_v3i8(ptr %src, ptr %dst) { +define <16 x i8> @load_v3i8(ptr %src) { ; CHECK-LABEL: load_v3i8: ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #16 @@ -44,7 +44,7 @@ define <16 x i8> @load_v3i8(ptr %src, ptr %dst) { ret <16 x i8> %s } -define <4 x i32> @load_v3i8_to_4xi32(ptr %src, ptr %dst) { +define <4 x i32> @load_v3i8_to_4xi32(ptr %src) { ; CHECK-LABEL: load_v3i8_to_4xi32: ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #16 @@ -87,7 +87,95 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src, ptr %dst) { ret <4 x i32> %e } -define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src, ptr %dst) { +define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) { +; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldurh w8, [x0, #1] +; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldrsb w8, [x0, #3] +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: mov.h v0[1], v0[1] +; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: and.16b v0, v0, v1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; BE-LABEL: load_v3i8_to_4xi32_const_offset_1: +; BE: // %bb.0: +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ldurh w8, [x0, #1] +; BE-NEXT: movi v1.2d, #0x0000ff000000ff +; BE-NEXT: strh w8, [sp, #12] +; BE-NEXT: ldr s0, [sp, #12] +; BE-NEXT: ldrsb w8, [x0, #3] +; BE-NEXT: rev32 v0.8b, v0.8b +; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: mov v0.h[1], v0.h[1] +; BE-NEXT: mov v0.h[2], w8 +; BE-NEXT: ushll v0.4s, v0.4h, #0 +; BE-NEXT: and v0.16b, v0.16b, v1.16b +; BE-NEXT: rev64 v0.4s, v0.4s +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret + %src.1 = getelementptr inbounds i8, ptr %src, i64 1 + %l = load <3 x i8>, ptr %src.1, align 1 + %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32> + %e = zext <4 x i8> %s to <4 x i32> + ret <4 x i32> %e +} + +define <4 x i32> @load_v3i8_to_4xi32_const_offset_3(ptr %src) { +; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_3: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldurh w8, [x0, #3] +; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff +; CHECK-NEXT: strh w8, [sp, #12] +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldrsb w8, [x0, #5] +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: mov.h v0[1], v0[1] +; CHECK-NEXT: mov.h v0[2], w8 +; CHECK-NEXT: ushll.4s v0, v0, #0 +; CHECK-NEXT: and.16b v0, v0, v1 +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; BE-LABEL: load_v3i8_to_4xi32_const_offset_3: +; BE: // %bb.0: +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ldurh w8, [x0, #3] +; BE-NEXT: movi v1.2d, #0x0000ff000000ff +; BE-NEXT: strh 
w8, [sp, #12] +; BE-NEXT: ldr s0, [sp, #12] +; BE-NEXT: ldrsb w8, [x0, #5] +; BE-NEXT: rev32 v0.8b, v0.8b +; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: mov v0.h[1], v0.h[1] +; BE-NEXT: mov v0.h[2], w8 +; BE-NEXT: ushll v0.4s, v0.4h, #0 +; BE-NEXT: and v0.16b, v0.16b, v1.16b +; BE-NEXT: rev64 v0.4s, v0.4s +; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8 +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret + %src.3 = getelementptr inbounds i8, ptr %src, i64 3 + %l = load <3 x i8>, ptr %src.3, align 1 + %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32> + %e = zext <4 x i8> %s to <4 x i32> + ret <4 x i32> %e +} + +define <4 x i32> @volatile_load_v3i8_to_4xi32(ptr %src) { ; CHECK-LABEL: volatile_load_v3i8_to_4xi32: ; CHECK: ; %bb.0: ; CHECK-NEXT: sub sp, sp, #16 @@ -271,6 +359,84 @@ define void @shift_trunc_store(ptr %src, ptr %dst) { ret void } +define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) { +; CHECK-LABEL: shift_trunc_store_const_offset_1: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: shrn.4h v0, v0, #16 +; CHECK-NEXT: xtn.8b v1, v0 +; CHECK-NEXT: umov.h w8, v0[2] +; CHECK-NEXT: str s1, [sp, #12] +; CHECK-NEXT: ldrh w9, [sp, #12] +; CHECK-NEXT: strb w8, [x1, #3] +; CHECK-NEXT: sturh w9, [x1, #1] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; BE-LABEL: shift_trunc_store_const_offset_1: +; BE: // %bb.0: +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ld1 { v0.4s }, [x0] +; BE-NEXT: shrn v0.4h, v0.4s, #16 +; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: rev32 v1.16b, v1.16b +; BE-NEXT: str s1, [sp, #12] +; BE-NEXT: ldrh w9, [sp, #12] +; BE-NEXT: strb w8, [x1, #3] +; BE-NEXT: sturh w9, [x1, #1] +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret + %l = load <3 x i32>, ptr %src + %s = lshr <3 x i32> %l, + %t = trunc <3 x i32> %s to <3 x i8> + %dst.1 = getelementptr inbounds i8, ptr %dst, i64 1 + store <3 x i8> %t, ptr %dst.1, align 1 + ret void +} + +define void @shift_trunc_store_const_offset_3(ptr %src, ptr %dst) { +; CHECK-LABEL: shift_trunc_store_const_offset_3: +; CHECK: ; %bb.0: +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0] +; CHECK-NEXT: shrn.4h v0, v0, #16 +; CHECK-NEXT: xtn.8b v1, v0 +; CHECK-NEXT: umov.h w8, v0[2] +; CHECK-NEXT: str s1, [sp, #12] +; CHECK-NEXT: ldrh w9, [sp, #12] +; CHECK-NEXT: strb w8, [x1, #5] +; CHECK-NEXT: sturh w9, [x1, #3] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; +; BE-LABEL: shift_trunc_store_const_offset_3: +; BE: // %bb.0: +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ld1 { v0.4s }, [x0] +; BE-NEXT: shrn v0.4h, v0.4s, #16 +; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: rev32 v1.16b, v1.16b +; BE-NEXT: str s1, [sp, #12] +; BE-NEXT: ldrh w9, [sp, #12] +; BE-NEXT: strb w8, [x1, #5] +; BE-NEXT: sturh w9, [x1, #3] +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret + %l = load <3 x i32>, ptr %src + %s = lshr <3 x i32> %l, + %t = trunc <3 x i32> %s to <3 x i8> + %dst.3 = getelementptr inbounds i8, ptr %dst, i64 3 + store <3 x i8> %t, ptr %dst.3, align 1 + ret void +} + define void @shift_trunc_volatile_store(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_volatile_store: ; CHECK: ; %bb.0: From 3de5d8e1254977c8812412a159da4185c9853fd0 Mon Sep 17 00:00:00 2001 From: Sam McCall Date: Mon, 22 Jan 2024 16:03:37 +0100 Subject: [PATCH 402/843] [include-cleaner] Add --only-headers flag, opposite of --ignore-headers (#78714) 
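A hedged usage sketch (the file name, include path, and regex here are
illustrative; the new lit test below exercises the same form):

    # Analyze only headers whose suffix matches one of the given regexes;
    # all other headers are neither reported nor edited.
    clang-include-cleaner -print=changes foo.cpp --only-headers="foo\.h" -- -Iinclude

The flag composes with --ignore-headers: when both are given, a header is
analyzed only if it matches the only-list and does not match the ignore-list.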
---
 .../include-cleaner/test/tool.cpp           |  4 +++
 .../include-cleaner/tool/IncludeCleaner.cpp | 30 +++++++++++++++++--
 2 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/clang-tools-extra/include-cleaner/test/tool.cpp b/clang-tools-extra/include-cleaner/test/tool.cpp
index bd90679875acb..de47b2a3f3778 100644
--- a/clang-tools-extra/include-cleaner/test/tool.cpp
+++ b/clang-tools-extra/include-cleaner/test/tool.cpp
@@ -22,6 +22,10 @@ int x = foo();
 // IGNORE2-NOT: - "foobar.h"
 // IGNORE2: + "foo.h"

+// RUN: clang-include-cleaner -print=changes %s --only-headers="foo\.h" -- -I%S/Inputs/ | FileCheck --match-full-lines --allow-empty --check-prefix=ONLY %s
+// ONLY-NOT: - "foobar.h"
+// ONLY: + "foo.h"
+
 // RUN: clang-include-cleaner -print %s -- -I%S/Inputs/ | FileCheck --match-full-lines --check-prefix=PRINT %s
 // PRINT: #include "foo.h"
 // PRINT-NOT: {{^}}#include "foobar.h"{{$}}
diff --git a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
index e078bfae66c3b..132ad25411509 100644
--- a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
+++ b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp
@@ -57,6 +57,14 @@ cl::opt<std::string> HTMLReportPath{
   cl::cat(IncludeCleaner),
 };

+cl::opt<std::string> OnlyHeaders{
+    "only-headers",
+    cl::desc("A comma-separated list of regexes to match against suffix of a "
+             "header. Only headers that match will be analyzed."),
+    cl::init(""),
+    cl::cat(IncludeCleaner),
+};
+
 cl::opt<std::string> IgnoreHeaders{
   "ignore-headers",
   cl::desc("A comma-separated list of regexes to match against suffix of a "
@@ -221,11 +229,12 @@ class ActionFactory : public tooling::FrontendActionFactory {
   llvm::StringMap<std::string> EditedFiles;
 };

-std::function<bool(llvm::StringRef)> headerFilter() {
+// Compiles a regex list into a function that returns true if any match a header.
+// Prints and returns nullptr if any regexes are invalid.
+std::function<bool(llvm::StringRef)> matchesAny(llvm::StringRef RegexFlag) {
   auto FilterRegs = std::make_shared<std::vector<llvm::Regex>>();
-
   llvm::SmallVector<llvm::StringRef> Headers;
-  llvm::StringRef(IgnoreHeaders).split(Headers, ',', -1, /*KeepEmpty=*/false);
+  RegexFlag.split(Headers, ',', -1, /*KeepEmpty=*/false);
   for (auto HeaderPattern : Headers) {
     std::string AnchoredPattern = "(" + HeaderPattern.str() + ")$";
     llvm::Regex CompiledRegex(AnchoredPattern);
@@ -246,6 +255,21 @@
   };
 }

+std::function<bool(llvm::StringRef)> headerFilter() {
+  auto OnlyMatches = matchesAny(OnlyHeaders);
+  auto IgnoreMatches = matchesAny(IgnoreHeaders);
+  if (!OnlyMatches || !IgnoreMatches)
+    return nullptr;
+
+  return [OnlyMatches, IgnoreMatches](llvm::StringRef Header) {
+    if (OnlyHeaders.getNumOccurrences() && !OnlyMatches(Header))
+      return true;
+    if (IgnoreHeaders.getNumOccurrences() && IgnoreMatches(Header))
+      return true;
+    return false;
+  };
+}
+
 } // namespace
 } // namespace include_cleaner
 } // namespace clang

From af1741ed40ec3a2664c50effe0d0c9bd89e60cdc Mon Sep 17 00:00:00 2001
From: Krzysztof Parzyszek
Date: Mon, 22 Jan 2024 09:01:02 -0600
Subject: [PATCH 403/843] [Flang][OpenMP] Reword comment for clarification,
 NFC

---
 flang/lib/Lower/OpenMP.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp
index 3deeb6019b19b..bebae92f2f9f0 100644
--- a/flang/lib/Lower/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP.cpp
@@ -2318,8 +2318,9 @@ static void createBodyOfOp(

   if (genNested) {
     // genFIR(Evaluation&) tries to patch up unterminated blocks, causing
-    // a lot of trouble if the terminator generation is delayed past this
-    // point. Insert a temporary terminator here, then delete it.
+    // a lot of complications for our approach if the terminator generation
+    // is delayed past this point. Insert a temporary terminator here, then
+    // delete it.
     firOpBuilder.setInsertionPointToEnd(&op.getRegion().back());
     auto *temp =
         Fortran::lower::genOpenMPTerminator(firOpBuilder, op.getOperation(), loc);

From 0d8e333a7e756fc6ab07305ba1bbce916f510b30 Mon Sep 17 00:00:00 2001
From: erichkeane
Date: Fri, 19 Jan 2024 07:25:07 -0800
Subject: [PATCH 404/843] [OpenACC] Implement 'vector_length' clause parsing.

The 'vector_length' clause is the first of the 'int-expr' clauses that
I've implemented. Currently this is just being parsed as an
assignment-expr, since it needs to be usable in a list. The Sema
implementation will enforce its integral nature.
---
 clang/include/clang/Basic/OpenACCKinds.h   |  6 +++++
 clang/include/clang/Parse/Parser.h         |  4 +++-
 clang/lib/Parse/ParseOpenACC.cpp           | 16 ++++++++++++-
 clang/test/ParserOpenACC/parse-clauses.c   | 27 ++++++++++++++++++++++
 clang/test/ParserOpenACC/parse-clauses.cpp |  8 +++++++
 5 files changed, 59 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/OpenACCKinds.h b/clang/include/clang/Basic/OpenACCKinds.h
index a27b78d2e6da8..a33daea2331aa 100644
--- a/clang/include/clang/Basic/OpenACCKinds.h
+++ b/clang/include/clang/Basic/OpenACCKinds.h
@@ -221,6 +221,9 @@ enum class OpenACCClauseKind {
   Collapse,
   /// 'bind' clause, allowed on routine constructs.
   Bind,
+  /// 'vector_length' clause, allowed on 'parallel', 'kernels', 'parallel loop',
+  /// and 'kernels loop' constructs.
+  VectorLength,
   /// Represents an invalid clause, for the purposes of parsing.
Invalid, @@ -322,6 +325,9 @@ inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &Out, case OpenACCClauseKind::Bind: return Out << "bind"; + case OpenACCClauseKind::VectorLength: + return Out << "vector_length"; + case OpenACCClauseKind::Invalid: return Out << ""; } diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 9e2b452cbe051..6bfcd3a0d8412 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3584,7 +3584,9 @@ class Parser : public CodeCompletionHandler { /// Parses the clause of the 'bind' argument, which can be a string literal or /// an ID expression. ExprResult ParseOpenACCBindClauseArgument(); - + /// Parses the clause kind of 'int-expr', which can be any integral + /// expression. + ExprResult ParseOpenACCIntExpr(); private: //===--------------------------------------------------------------------===// // C++ 14: Templates [temp] diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 1f6d111f3aa57..9b0d7077670a7 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -119,6 +119,7 @@ OpenACCClauseKind getOpenACCClauseKind(Token Tok) { .Case("seq", OpenACCClauseKind::Seq) .Case("use_device", OpenACCClauseKind::UseDevice) .Case("vector", OpenACCClauseKind::Vector) + .Case("vector_length", OpenACCClauseKind::VectorLength) .Case("worker", OpenACCClauseKind::Worker) .Default(OpenACCClauseKind::Invalid); } @@ -469,6 +470,7 @@ ClauseParensKind getClauseParensKind(OpenACCDirectiveKind DirKind, case OpenACCClauseKind::Reduction: case OpenACCClauseKind::Collapse: case OpenACCClauseKind::Bind: + case OpenACCClauseKind::VectorLength: return ClauseParensKind::Required; case OpenACCClauseKind::Auto: @@ -536,6 +538,12 @@ void Parser::ParseOpenACCClauseList(OpenACCDirectiveKind DirKind) { } } +ExprResult Parser::ParseOpenACCIntExpr() { + // FIXME: this is required to be an integer expression (or dependent), so we + // should ensure that is the case by passing this to SEMA here. 
+ return getActions().CorrectDelayedTyposInExpr(ParseAssignmentExpression()); +} + bool Parser::ParseOpenACCClauseVarList(OpenACCClauseKind Kind) { // FIXME: Future clauses will require 'special word' parsing, check for one, // then parse it based on whether it is a clause that requires a 'special @@ -665,7 +673,7 @@ bool Parser::ParseOpenACCClauseParams(OpenACCDirectiveKind DirKind, tryParseAndConsumeSpecialTokenKind(*this, OpenACCSpecialTokenKind::Force, Kind); ExprResult NumLoops = - getActions().CorrectDelayedTyposInExpr(ParseAssignmentExpression()); + getActions().CorrectDelayedTyposInExpr(ParseConstantExpression()); if (NumLoops.isInvalid()) return true; break; @@ -676,6 +684,12 @@ bool Parser::ParseOpenACCClauseParams(OpenACCDirectiveKind DirKind, return true; break; } + case OpenACCClauseKind::VectorLength: { + ExprResult IntExpr = ParseOpenACCIntExpr(); + if (IntExpr.isInvalid()) + return true; + break; + } default: llvm_unreachable("Not a required parens type?"); } diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c index 22dce4c4d8cd9..31643073eb76a 100644 --- a/clang/test/ParserOpenACC/parse-clauses.c +++ b/clang/test/ParserOpenACC/parse-clauses.c @@ -698,6 +698,33 @@ void ReductionClauseParsing() { #pragma acc serial seq, reduction(&&: Begin, End) // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} #pragma acc serial reduction(||: Begin, End), seq +} + +int returns_int(); + +void IntExprParsing() { + // expected-error@+2{{expected '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel vector_length + + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel vector_length() + + // expected-error@+2{{use of undeclared identifier 'invalid'}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel vector_length(invalid) + + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel vector_length(5, 4) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel vector_length(5) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel vector_length(returns_int()) } // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} diff --git a/clang/test/ParserOpenACC/parse-clauses.cpp b/clang/test/ParserOpenACC/parse-clauses.cpp index 1b3196094103c..07164b5e33794 100644 --- a/clang/test/ParserOpenACC/parse-clauses.cpp +++ b/clang/test/ParserOpenACC/parse-clauses.cpp @@ -9,6 +9,14 @@ void templ() { // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} #pragma acc loop collapse(T::value) for(;;){} + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel vector_length(T::value) + for(;;){} + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel vector_length(I) + for(;;){} } struct S { From 6ba62f4f251763745d39194ecd33ebaf7a739059 Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Mon, 22 Jan 2024 15:11:49 +0000 Subject: [PATCH 405/843] [AArch64][SME2] Refine fcvtu/fcvts/scvtf/ucvtf (#77947) Rename intrinsics for fcvtu to fcvtzu and fcvts to fcvtzs. 
Use llvm_anyvector_ty for both multi vector returns and operands, therefore the return and operands can be specified in the intrinsic call, e.g. @llvm.aarch64.sve.scvtf.x4.nxv4f32.nxv4i32 --- clang/include/clang/Basic/TargetBuiltins.h | 4 +- clang/include/clang/Basic/arm_sve.td | 48 ++++++------- clang/include/clang/Basic/arm_sve_sme_incl.td | 70 +++++++++---------- clang/lib/CodeGen/CGBuiltin.cpp | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_cvt.c | 32 ++++----- llvm/include/llvm/IR/IntrinsicsAArch64.td | 36 ++++------ .../Target/AArch64/AArch64ISelDAGToDAG.cpp | 8 +-- .../CodeGen/AArch64/sme2-intrinsics-cvt.ll | 32 ++++----- 8 files changed, 111 insertions(+), 121 deletions(-) diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h index c9f9cbec7493b..c31834fb52a90 100644 --- a/clang/include/clang/Basic/TargetBuiltins.h +++ b/clang/include/clang/Basic/TargetBuiltins.h @@ -291,7 +291,9 @@ namespace clang { bool isZExtReturn() const { return Flags & IsZExtReturn; } bool isByteIndexed() const { return Flags & IsByteIndexed; } bool isOverloadNone() const { return Flags & IsOverloadNone; } - bool isOverloadWhile() const { return Flags & IsOverloadWhile; } + bool isOverloadWhileOrMultiVecCvt() const { + return Flags & IsOverloadWhileOrMultiVecCvt; + } bool isOverloadDefault() const { return !(Flags & OverloadKindMask); } bool isOverloadWhileRW() const { return Flags & IsOverloadWhileRW; } bool isOverloadCvt() const { return Flags & IsOverloadCvt; } diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 51cf8e29129c6..4c5c1b5603f18 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -749,14 +749,14 @@ def SVCMPLS_WIDE_N : SInst<"svcmple_wide[_n_{d}]", "PPdj", "UcUsUi", MergeNone, //////////////////////////////////////////////////////////////////////////////// // While comparisons -def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILELE_S32 : SInst<"svwhilele_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILELE_S64 : SInst<"svwhilele_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILELO_U32 : SInst<"svwhilelt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", 
[IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILELO_U64 : SInst<"svwhilelt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilelo", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILELS_U32 : SInst<"svwhilele_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILELS_U64 : SInst<"svwhilele_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilels", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILELT_S32 : SInst<"svwhilelt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILELT_S64 : SInst<"svwhilelt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// // Counting bit @@ -1336,14 +1336,14 @@ let TargetGuard = "sve2p1|sme2" in { //////////////////////////////////////////////////////////////////////////////// // SVE2 WhileGE/GT let TargetGuard = "sve2" in { -def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, IsStreamingCompatible]>; -def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhile, IsStreamingCompatible]>; +def SVWHILEGE_S32 : SInst<"svwhilege_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILEGE_S64 : SInst<"svwhilege_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILEGT_S32 : SInst<"svwhilegt_{d}[_{1}]", "Pkk", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILEGT_S64 : SInst<"svwhilegt_{d}[_{1}]", "Pll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILEHI_U32 : SInst<"svwhilegt_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILEHI_U64 : SInst<"svwhilegt_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehi", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILEHS_U32 : SInst<"svwhilege_{d}[_{1}]", "Pmm", "PUcPUsPUiPUl", MergeNone, "aarch64_sve_whilehs", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; +def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNone, 
"aarch64_sve_whilehs", [IsOverloadWhileOrMultiVecCvt, IsStreamingCompatible]>; } let TargetGuard = "sve2p1|sme2" in { @@ -2244,15 +2244,15 @@ let TargetGuard = "sme2" in { def SVCVT_F16_X2 : SInst<"svcvt_f16[_f32_x2]", "e2", "f", MergeNone, "aarch64_sve_fcvt_x2", [IsStreaming],[]>; def SVCVT_BF16_X2 : SInst<"svcvt_bf16[_f32_x2]", "$2", "f", MergeNone, "aarch64_sve_bfcvt_x2", [IsOverloadNone, IsStreaming],[]>; - def SVCVT_F32_U32_X2 : SInst<"svcvt_{d}[_u32_x2]", "2.d2.u", "f", MergeNone, "aarch64_sve_ucvtf_x2", [IsStreaming], []>; - def SVCVT_U32_F32_X2 : SInst<"svcvt_u32[_{d}_x2]", "2.u2.d", "f", MergeNone, "aarch64_sve_fcvtu_x2", [IsStreaming], []>; - def SVCVT_F32_S32_X2 : SInst<"svcvt_{d}[_s32_x2]", "2.d2.x", "f", MergeNone, "aarch64_sve_scvtf_x2", [IsStreaming], []>; - def SVCVT_S32_F32_X2 : SInst<"svcvt_s32[_{d}_x2]", "2.x2.d", "f", MergeNone, "aarch64_sve_fcvts_x2", [IsStreaming], []>; + def SVCVT_F32_U32_X2 : SInst<"svcvt_{d}[_u32_x2]", "2.d2.u", "f", MergeNone, "aarch64_sve_ucvtf_x2", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>; + def SVCVT_U32_F32_X2 : SInst<"svcvt_{d}[_f32_x2]", "2.d2.M", "Ui", MergeNone, "aarch64_sve_fcvtzu_x2", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>; + def SVCVT_F32_S32_X2 : SInst<"svcvt_{d}[_s32_x2]", "2.d2.x", "f", MergeNone, "aarch64_sve_scvtf_x2", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>; + def SVCVT_S32_F32_X2 : SInst<"svcvt_{d}[_f32_x2]", "2.d2.M", "i", MergeNone, "aarch64_sve_fcvtzs_x2", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>; - def SVCVT_F32_U32_X4 : SInst<"svcvt_{d}[_u32_x4]", "4.d4.u", "f", MergeNone, "aarch64_sve_ucvtf_x4", [IsStreaming], []>; - def SVCVT_U32_F32_X4 : SInst<"svcvt_u32[_{d}_x4]", "4.u4.d", "f", MergeNone, "aarch64_sve_fcvtu_x4", [IsStreaming], []>; - def SVCVT_F32_S32_X4 : SInst<"svcvt_{d}[_s32_x4]", "4.d4.x", "f", MergeNone, "aarch64_sve_scvtf_x4", [IsStreaming], []>; - def SVCVT_S32_F32_X4 : SInst<"svcvt_s32[_{d}_x4]", "4.x4.d", "f", MergeNone, "aarch64_sve_fcvts_x4", [IsStreaming], []>; + def SVCVT_F32_U32_X4 : SInst<"svcvt_{d}[_u32_x4]", "4.d4.u", "f", MergeNone, "aarch64_sve_ucvtf_x4", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>; + def SVCVT_U32_F32_X4 : SInst<"svcvt_{d}[_f32_x4]", "4.d4.M", "Ui", MergeNone, "aarch64_sve_fcvtzu_x4", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>; + def SVCVT_F32_S32_X4 : SInst<"svcvt_{d}[_s32_x4]", "4.d4.x", "f", MergeNone, "aarch64_sve_scvtf_x4", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>; + def SVCVT_S32_F32_X4 : SInst<"svcvt_{d}[_f32_x4]", "4.d4.M", "i", MergeNone, "aarch64_sve_fcvtzs_x4", [IsStreaming, IsOverloadWhileOrMultiVecCvt], []>; } // diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index c05684c0b248a..e6ad84676053b 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -194,41 +194,41 @@ def FirstSplatOperand : FlagType<0x00000400>; // These flags are used to specify which scalar operand // needs to be duplicated/splatted into a vector. // : : -def SplatOperandMask : FlagType<0x00001C00>; -def IsLoad : FlagType<0x00002000>; -def IsStore : FlagType<0x00004000>; -def IsGatherLoad : FlagType<0x00008000>; -def IsScatterStore : FlagType<0x00010000>; -def IsStructLoad : FlagType<0x00020000>; -def IsStructStore : FlagType<0x00040000>; -def IsZExtReturn : FlagType<0x00080000>; // Return value is sign-extend by default -def IsOverloadNone : FlagType<0x00100000>; // Intrinsic does not take any overloaded types. 
-def IsOverloadWhile : FlagType<0x00200000>; // Use {default type, typeof(operand1)} as overloaded types. -def IsOverloadWhileRW : FlagType<0x00400000>; // Use {pred(default type), typeof(operand0)} as overloaded types. -def IsOverloadCvt : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types. -def OverloadKindMask : FlagType<0x00E00000>; // When the masked values are all '0', the default type is used as overload type. -def IsByteIndexed : FlagType<0x01000000>; -def IsAppendSVALL : FlagType<0x02000000>; // Appends SV_ALL as the last operand. -def IsInsertOp1SVALL : FlagType<0x04000000>; // Inserts SV_ALL as the second operand. -def IsPrefetch : FlagType<0x08000000>; // Contiguous prefetches. -def IsGatherPrefetch : FlagType<0x10000000>; -def ReverseCompare : FlagType<0x20000000>; // Compare operands must be swapped. -def ReverseUSDOT : FlagType<0x40000000>; // Unsigned/signed operands must be swapped. -def IsUndef : FlagType<0x80000000>; // Codegen `undef` of given type. -def IsTupleCreate : FlagType<0x100000000>; -def IsTupleGet : FlagType<0x200000000>; -def IsTupleSet : FlagType<0x400000000>; -def ReverseMergeAnyBinOp : FlagType<0x800000000>; // e.g. Implement SUBR_X using SUB_X. -def ReverseMergeAnyAccOp : FlagType<0x1000000000>; // e.g. Implement MSB_X using MLS_X. -def IsStreaming : FlagType<0x2000000000>; -def IsStreamingCompatible : FlagType<0x4000000000>; -def IsReadZA : FlagType<0x8000000000>; -def IsWriteZA : FlagType<0x10000000000>; -def IsReductionQV : FlagType<0x20000000000>; -def IsStreamingOrSVE2p1 : FlagType<0x40000000000>; // Use for intrinsics that are common between sme/sme2 and sve2p1. -def IsInZA : FlagType<0x80000000000>; -def IsOutZA : FlagType<0x100000000000>; -def IsInOutZA : FlagType<0x200000000000>; +def SplatOperandMask : FlagType<0x00001C00>; +def IsLoad : FlagType<0x00002000>; +def IsStore : FlagType<0x00004000>; +def IsGatherLoad : FlagType<0x00008000>; +def IsScatterStore : FlagType<0x00010000>; +def IsStructLoad : FlagType<0x00020000>; +def IsStructStore : FlagType<0x00040000>; +def IsZExtReturn : FlagType<0x00080000>; // Return value is sign-extend by default +def IsOverloadNone : FlagType<0x00100000>; // Intrinsic does not take any overloaded types. +def IsOverloadWhileOrMultiVecCvt : FlagType<0x00200000>; // Use {default type, typeof(operand1)} as overloaded types. +def IsOverloadWhileRW : FlagType<0x00400000>; // Use {pred(default type), typeof(operand0)} as overloaded types. +def IsOverloadCvt : FlagType<0x00800000>; // Use {typeof(operand0), typeof(last operand)} as overloaded types. +def OverloadKindMask : FlagType<0x00E00000>; // When the masked values are all '0', the default type is used as overload type. +def IsByteIndexed : FlagType<0x01000000>; +def IsAppendSVALL : FlagType<0x02000000>; // Appends SV_ALL as the last operand. +def IsInsertOp1SVALL : FlagType<0x04000000>; // Inserts SV_ALL as the second operand. +def IsPrefetch : FlagType<0x08000000>; // Contiguous prefetches. +def IsGatherPrefetch : FlagType<0x10000000>; +def ReverseCompare : FlagType<0x20000000>; // Compare operands must be swapped. +def ReverseUSDOT : FlagType<0x40000000>; // Unsigned/signed operands must be swapped. +def IsUndef : FlagType<0x80000000>; // Codegen `undef` of given type. +def IsTupleCreate : FlagType<0x100000000>; +def IsTupleGet : FlagType<0x200000000>; +def IsTupleSet : FlagType<0x400000000>; +def ReverseMergeAnyBinOp : FlagType<0x800000000>; // e.g. Implement SUBR_X using SUB_X. 
+def ReverseMergeAnyAccOp : FlagType<0x1000000000>; // e.g. Implement MSB_X using MLS_X. +def IsStreaming : FlagType<0x2000000000>; +def IsStreamingCompatible : FlagType<0x4000000000>; +def IsReadZA : FlagType<0x8000000000>; +def IsWriteZA : FlagType<0x10000000000>; +def IsReductionQV : FlagType<0x20000000000>; +def IsStreamingOrSVE2p1 : FlagType<0x40000000000>; // Use for intrinsics that are common between sme/sme2 and sve2p1. +def IsInZA : FlagType<0x80000000000>; +def IsOutZA : FlagType<0x100000000000>; +def IsInOutZA : FlagType<0x200000000000>; // These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h class ImmCheckType { diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index cb65c7373a34c..7ef764b8e1ac8 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -10056,7 +10056,7 @@ CodeGenFunction::getSVEOverloadTypes(const SVETypeFlags &TypeFlags, llvm::Type *DefaultType = getSVEType(TypeFlags); - if (TypeFlags.isOverloadWhile()) + if (TypeFlags.isOverloadWhileOrMultiVecCvt()) return {DefaultType, Ops[1]->getType()}; if (TypeFlags.isOverloadWhileRW()) diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c index f596b3167f2b8..b292e54ea457f 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c @@ -59,7 +59,7 @@ svbfloat16_t test_cvt_bf16_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ucvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ucvtf.x2.nxv4f32.nxv4i32( [[TMP0]], [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 @@ -70,7 +70,7 @@ svbfloat16_t test_cvt_bf16_x2(svfloat32x2_t zn) __arm_streaming { // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ucvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.ucvtf.x2.nxv4f32.nxv4i32( [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 @@ -85,7 +85,7 @@ svfloat32x2_t test_svcvt_f32_u32_x2(svuint32x2_t zn) __arm_streaming { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.scvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.scvtf.x2.nxv4f32.nxv4i32( [[TMP0]], [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 // CHECK-NEXT: 
[[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 @@ -96,7 +96,7 @@ svfloat32x2_t test_svcvt_f32_u32_x2(svuint32x2_t zn) __arm_streaming { // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv8i32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.scvtf.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.scvtf.x2.nxv4f32.nxv4i32( [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( poison, [[TMP3]], i64 0) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 @@ -111,7 +111,7 @@ svfloat32x2_t test_svcvt_f32_s32_x2(svint32x2_t zn) __arm_streaming { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtu.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtzu.x2.nxv4i32.nxv4f32( [[TMP0]], [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 @@ -122,7 +122,7 @@ svfloat32x2_t test_svcvt_f32_s32_x2(svint32x2_t zn) __arm_streaming { // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtu.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtzu.x2.nxv4i32.nxv4f32( [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 @@ -137,7 +137,7 @@ svuint32x2_t test_svcvt_u32_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) -// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvts.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32( [[TMP0]], [[TMP1]]) // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 // CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 @@ -148,7 +148,7 @@ svuint32x2_t test_svcvt_u32_f32_x2(svfloat32x2_t zn) __arm_streaming { // CPP-CHECK-NEXT: entry: // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN:%.*]], i64 0) // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv8f32( [[ZN]], i64 4) -// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } 
@llvm.aarch64.sve.fcvts.x2.nxv4f32( [[TMP0]], [[TMP1]]) +// CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call { , } @llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32( [[TMP0]], [[TMP1]]) // CPP-CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[TMP2]], 0 // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( poison, [[TMP3]], i64 0) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[TMP2]], 1 @@ -166,7 +166,7 @@ svint32x2_t test_svcvt_s32_f32_x2(svfloat32x2_t zn) __arm_streaming { // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 // CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) // CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 @@ -183,7 +183,7 @@ svint32x2_t test_svcvt_s32_f32_x2(svfloat32x2_t zn) __arm_streaming { // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) // CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 @@ -204,7 +204,7 @@ svfloat32x4_t test_svcvt_f32_u32_x4(svuint32x4_t zn) __arm_streaming { // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 // CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) // CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 @@ -221,7 +221,7 @@ svfloat32x4_t test_svcvt_f32_u32_x4(svuint32x4_t zn) __arm_streaming { // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 4) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 8) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4i32.nxv16i32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], 
[[TMP3]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32.nxv4i32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( poison, [[TMP5]], i64 0) // CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 @@ -242,7 +242,7 @@ svfloat32x4_t test_svcvt_f32_s32_x4(svint32x4_t zn) __arm_streaming { // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtu.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtzu.x4.nxv4i32.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 // CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) // CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 @@ -259,7 +259,7 @@ svfloat32x4_t test_svcvt_f32_s32_x4(svint32x4_t zn) __arm_streaming { // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtu.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtzu.x4.nxv4i32.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) // CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 @@ -280,7 +280,7 @@ svuint32x4_t test_svcvt_u32_f32_x4(svfloat32x4_t zn) __arm_streaming { // CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) // CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) // CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) -// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvts.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtzs.x4.nxv4i32.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 // CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) // CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 @@ -297,7 +297,7 @@ svuint32x4_t test_svcvt_u32_f32_x4(svfloat32x4_t zn) __arm_streaming { // CPP-CHECK-NEXT: [[TMP1:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 4) // CPP-CHECK-NEXT: [[TMP2:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 8) // CPP-CHECK-NEXT: [[TMP3:%.*]] = tail call @llvm.vector.extract.nxv4f32.nxv16f32( [[ZN]], i64 12) -// CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvts.x4.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) +// 
CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call { , , , } @llvm.aarch64.sve.fcvtzs.x4.nxv4i32.nxv4f32( [[TMP0]], [[TMP1]], [[TMP2]], [[TMP3]]) // CPP-CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[TMP4]], 0 // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP5]], i64 0) // CPP-CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , , , } [[TMP4]], 1 diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td index 9955e78ef3ce8..921e5b95ae03e 100644 --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -3095,26 +3095,14 @@ let TargetPrefix = "aarch64" in { [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>; - class SME2_CVT_ItoF_X2_Intrinsic + class SME2_CVT_X2_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>], - [LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], + [llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; - class SME2_CVT_FtoI_X2_Intrinsic - : DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], - [llvm_anyvector_ty, LLVMMatchType<0>], - [IntrNoMem]>; - - class SME2_CVT_ItoF_X4_Intrinsic + class SME2_CVT_X4_Intrinsic : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], - [LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, - LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], - [IntrNoMem]>; - - class SME2_CVT_FtoI_X4_Intrinsic - : DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>, - LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>], - [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>], + [llvm_anyvector_ty, LLVMMatchType<1>, LLVMMatchType<1>, LLVMMatchType<1>], [IntrNoMem]>; class SME2_BFMLS_Intrinsic @@ -3403,14 +3391,14 @@ let TargetPrefix = "aarch64" in { // def int_aarch64_sve_fcvt_x2 : SME2_CVT_VG2_SINGLE_Intrinsic; def int_aarch64_sve_bfcvt_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic; - def int_aarch64_sve_fcvts_x2 : SME2_CVT_FtoI_X2_Intrinsic; - def int_aarch64_sve_fcvtu_x2 : SME2_CVT_FtoI_X2_Intrinsic; - def int_aarch64_sve_scvtf_x2 : SME2_CVT_ItoF_X2_Intrinsic; - def int_aarch64_sve_ucvtf_x2 : SME2_CVT_ItoF_X2_Intrinsic; - def int_aarch64_sve_fcvts_x4 : SME2_CVT_FtoI_X4_Intrinsic; - def int_aarch64_sve_fcvtu_x4 : SME2_CVT_FtoI_X4_Intrinsic; - def int_aarch64_sve_scvtf_x4 : SME2_CVT_ItoF_X4_Intrinsic; - def int_aarch64_sve_ucvtf_x4 : SME2_CVT_ItoF_X4_Intrinsic; + def int_aarch64_sve_fcvtzs_x2 : SME2_CVT_X2_Intrinsic; + def int_aarch64_sve_fcvtzu_x2 : SME2_CVT_X2_Intrinsic; + def int_aarch64_sve_scvtf_x2 : SME2_CVT_X2_Intrinsic; + def int_aarch64_sve_ucvtf_x2 : SME2_CVT_X2_Intrinsic; + def int_aarch64_sve_fcvtzs_x4 : SME2_CVT_X4_Intrinsic; + def int_aarch64_sve_fcvtzu_x4 : SME2_CVT_X4_Intrinsic; + def int_aarch64_sve_scvtf_x4 : SME2_CVT_X4_Intrinsic; + def int_aarch64_sve_ucvtf_x4 : SME2_CVT_X4_Intrinsic; // // Multi-vector saturating extract narrow diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp index 1437a2b5acbcf..163ed520a8a67 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -5627,25 +5627,25 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) { AArch64::FMINNM_VG4_4Z4Z_D})) SelectDestructiveMultiIntrinsic(Node, 4, true, Op); return; - case Intrinsic::aarch64_sve_fcvts_x2: + case 
Intrinsic::aarch64_sve_fcvtzs_x2: SelectCVTIntrinsic(Node, 2, AArch64::FCVTZS_2Z2Z_StoS); return; case Intrinsic::aarch64_sve_scvtf_x2: SelectCVTIntrinsic(Node, 2, AArch64::SCVTF_2Z2Z_StoS); return; - case Intrinsic::aarch64_sve_fcvtu_x2: + case Intrinsic::aarch64_sve_fcvtzu_x2: SelectCVTIntrinsic(Node, 2, AArch64::FCVTZU_2Z2Z_StoS); return; case Intrinsic::aarch64_sve_ucvtf_x2: SelectCVTIntrinsic(Node, 2, AArch64::UCVTF_2Z2Z_StoS); return; - case Intrinsic::aarch64_sve_fcvts_x4: + case Intrinsic::aarch64_sve_fcvtzs_x4: SelectCVTIntrinsic(Node, 4, AArch64::FCVTZS_4Z4Z_StoS); return; case Intrinsic::aarch64_sve_scvtf_x4: SelectCVTIntrinsic(Node, 4, AArch64::SCVTF_4Z4Z_StoS); return; - case Intrinsic::aarch64_sve_fcvtu_x4: + case Intrinsic::aarch64_sve_fcvtzu_x4: SelectCVTIntrinsic(Node, 4, AArch64::FCVTZU_4Z4Z_StoS); return; case Intrinsic::aarch64_sve_ucvtf_x4: diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll index 2f7e2ba5bede5..bc1db878cbd31 100644 --- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll @@ -41,7 +41,7 @@ define {, } @multi_vector_cvt_x2_s32_f32(, } @llvm.aarch64.sve.fcvts.x2.nxv4f32( %zn0, %zn1) + %res = call {, } @llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32( %zn0, %zn1) ret {, } %res } @@ -54,7 +54,7 @@ define {, ,, , ,, } @llvm.aarch64.sve.fcvts.x4.nxv4f32( %zn0, %zn1, %zn2, %zn3) + %res = call {, ,, } @llvm.aarch64.sve.fcvtzs.x4.nxv4i32.nxv4f32( %zn0, %zn1, %zn2, %zn3) ret {, , , } %res } @@ -68,7 +68,7 @@ define {, } @multi_vector_cvt_x2_u32_f32(, } @llvm.aarch64.sve.fcvtu.x2.nxv4f32( %zn0, %zn1) + %res = call {, } @llvm.aarch64.sve.fcvtzu.x2.nxv4i32.nxv4f32( %zn0, %zn1) ret {, } %res } @@ -81,7 +81,7 @@ define {, , , , ,, } @llvm.aarch64.sve.fcvtu.x4.nxv4f32( %zn0, %zn1, %zn2, %zn3) + %res = call {, ,, } @llvm.aarch64.sve.fcvtzu.x4.nxv4i32.nxv4f32( %zn0, %zn1, %zn2, %zn3) ret {, , , } %res } @@ -95,7 +95,7 @@ define {, } @multi_vector_cvt_x2_f32_s3 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: scvtf { z0.s, z1.s }, { z2.s, z3.s } ; CHECK-NEXT: ret - %res = call {, } @llvm.aarch64.sve.scvtf.x2.nxv4f32( %zn0, %zn1) + %res = call {, } @llvm.aarch64.sve.scvtf.x2.nxv4f32.nxv4i32( %zn0, %zn1) ret {, } %res } @@ -108,7 +108,7 @@ define {, ,, , , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32( %zn0, %zn1, %zn2, %zn3) + %res = call {, , , } @llvm.aarch64.sve.scvtf.x4.nxv4f32.nxv4i32( %zn0, %zn1, %zn2, %zn3) ret {, ,, } %res } @@ -122,7 +122,7 @@ define {, } @multi_vector_cvt_x2_f32_u3 ; CHECK-NEXT: mov z2.d, z1.d ; CHECK-NEXT: ucvtf { z0.s, z1.s }, { z2.s, z3.s } ; CHECK-NEXT: ret - %res = call {, } @llvm.aarch64.sve.ucvtf.x2.nxv4f32( %zn0, %zn1) + %res = call {, } @llvm.aarch64.sve.ucvtf.x2.nxv4f32.nxv4i32( %zn0, %zn1) ret {, } %res } @@ -135,17 +135,17 @@ define {, ,, , , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32( %zn0, %zn1, %zn2, %zn3) + %res = call {, , , } @llvm.aarch64.sve.ucvtf.x4.nxv4f32.nxv4i32( %zn0, %zn1, %zn2, %zn3) ret {, ,, } %res } declare @llvm.aarch64.sve.fcvt.x2.nxv4f32(, ) declare @llvm.aarch64.sve.bfcvt.x2(, ) -declare {, } @llvm.aarch64.sve.fcvts.x2.nxv4f32(,) -declare {, } @llvm.aarch64.sve.fcvtu.x2.nxv4f32(,) -declare {, } @llvm.aarch64.sve.scvtf.x2.nxv4f32(,) -declare {, } @llvm.aarch64.sve.ucvtf.x2.nxv4f32(,) -declare {, ,, } @llvm.aarch64.sve.fcvts.x4.nxv4f32(,,,) -declare {, ,, } @llvm.aarch64.sve.fcvtu.x4.nxv4f32(,,,) -declare {, ,, } @llvm.aarch64.sve.scvtf.x4.nxv4f32(,,,) -declare {, ,, } @llvm.aarch64.sve.ucvtf.x4.nxv4f32(,,,) +declare {, } 
@llvm.aarch64.sve.fcvtzs.x2.nxv4i32.nxv4f32(,) +declare {, } @llvm.aarch64.sve.fcvtzu.x2.nxv4i32.nxv4f32(,) +declare {, } @llvm.aarch64.sve.scvtf.x2.nxv4f32.nxv4i32(,) +declare {, } @llvm.aarch64.sve.ucvtf.x2.nxv4f32.nxv4i32(,) +declare {, ,, } @llvm.aarch64.sve.fcvtzs.x4.nxv4i32.nxv4f32(,,,) +declare {, ,, } @llvm.aarch64.sve.fcvtzu.x4.nxv4i32.nxv4f32(,,,) +declare {, ,, } @llvm.aarch64.sve.scvtf.x4.nxv4f32.nxv4i32(,,,) +declare {, ,, } @llvm.aarch64.sve.ucvtf.x4.nxv4f32.nxv4i32(,,,) From 51e91b64d0deb6a743861c2a0fba84865250036e Mon Sep 17 00:00:00 2001 From: itrofimow Date: Mon, 22 Jan 2024 19:12:41 +0400 Subject: [PATCH 406/843] [libc++abi] Implement __cxa_init_primary_exception and use it to optimize std::make_exception_ptr (#65534) This patch implements __cxa_init_primary_exception, an extension to the Itanium C++ ABI. This extension is already present in both libsupc++ and libcxxrt. This patch also starts making use of this function in std::make_exception_ptr: instead of going through a full throw/catch cycle, we are now able to initialize an exception directly, thus making std::make_exception_ptr around 30x faster. --- libcxx/benchmarks/CMakeLists.txt | 1 + libcxx/benchmarks/exception_ptr.bench.cpp | 19 +++++++ libcxx/docs/BuildingLibcxx.rst | 3 ++ libcxx/include/__availability | 25 ++++++++-- libcxx/include/__exception/exception_ptr.h | 50 +++++++++++++++++++ libcxx/include/new | 1 - libcxx/include/typeinfo | 1 - ...bcxxabi.v1.stable.exceptions.nonew.abilist | 3 ++ ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 3 ++ ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...bcxxabi.v1.stable.exceptions.nonew.abilist | 2 + ...xxabi.v1.stable.noexceptions.nonew.abilist | 3 +- .../runtime/exception_pointer_cxxabi.ipp | 8 +++ .../runtime/exception_pointer_glibcxx.ipp | 8 +++ .../test/libcxx/transitive_includes/cxx03.csv | 5 +- .../test/libcxx/transitive_includes/cxx11.csv | 5 +- .../test/libcxx/transitive_includes/cxx14.csv | 5 +- .../test/libcxx/transitive_includes/cxx17.csv | 5 +- .../test/libcxx/transitive_includes/cxx20.csv | 5 +- .../test/libcxx/transitive_includes/cxx23.csv | 2 + .../test/libcxx/transitive_includes/cxx26.csv | 2 + libcxxabi/include/cxxabi.h | 6 +++ libcxxabi/lib/exceptions.exp | 1 + libcxxabi/src/cxa_exception.cpp | 32 +++++++----- 28 files changed, 171 insertions(+), 34 deletions(-) create mode 100644 libcxx/benchmarks/exception_ptr.bench.cpp diff --git a/libcxx/benchmarks/CMakeLists.txt b/libcxx/benchmarks/CMakeLists.txt index ce4f5fde47b77..2434d82c6fd6b 100644 --- a/libcxx/benchmarks/CMakeLists.txt +++ b/libcxx/benchmarks/CMakeLists.txt @@ -202,6 +202,7 @@ set(BENCHMARK_TESTS allocation.bench.cpp deque.bench.cpp deque_iterator.bench.cpp + exception_ptr.bench.cpp filesystem.bench.cpp format_to_n.bench.cpp format_to.bench.cpp diff --git a/libcxx/benchmarks/exception_ptr.bench.cpp b/libcxx/benchmarks/exception_ptr.bench.cpp new file mode 100644 index 0000000000000..1292ad7935e37 --- /dev/null +++ b/libcxx/benchmarks/exception_ptr.bench.cpp @@ -0,0 +1,19 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <benchmark/benchmark.h>
+#include <exception>
+
+void bm_make_exception_ptr(benchmark::State& state) {
+  for (auto _ : state) {
+    benchmark::DoNotOptimize(std::make_exception_ptr(42));
+  }
+}
+BENCHMARK(bm_make_exception_ptr)->ThreadRange(1, 8);
+
+BENCHMARK_MAIN();
diff --git a/libcxx/docs/BuildingLibcxx.rst b/libcxx/docs/BuildingLibcxx.rst
index dcd05a28f36d1..28145ed1049e0 100644
--- a/libcxx/docs/BuildingLibcxx.rst
+++ b/libcxx/docs/BuildingLibcxx.rst
@@ -563,6 +563,7 @@ We can now run CMake:
   $ cmake -G Ninja -S runtimes -B build \
     -DLLVM_ENABLE_RUNTIMES="libcxx" \
     -DLIBCXX_CXX_ABI=libstdc++ \
+    -DLIBCXXABI_USE_LLVM_UNWINDER=OFF \
     -DLIBCXX_CXX_ABI_INCLUDE_PATHS="/usr/include/c++/4.7/;/usr/include/c++/4.7/x86_64-linux-gnu/"
   $ ninja -C build install-cxx

@@ -589,6 +590,8 @@ We can now run CMake like:
   $ cmake -G Ninja -S runtimes -B build \
     -DLLVM_ENABLE_RUNTIMES="libcxx" \
     -DLIBCXX_CXX_ABI=libcxxrt \
+    -DLIBCXX_ENABLE_NEW_DELETE_DEFINITIONS=ON \
+    -DLIBCXXABI_USE_LLVM_UNWINDER=OFF \
     -DLIBCXX_CXX_ABI_INCLUDE_PATHS=path/to/libcxxrt-sources/src
   $ ninja -C build install-cxx

diff --git a/libcxx/include/__availability b/libcxx/include/__availability
index 4f0746ffffc07..c5069a027750e 100644
--- a/libcxx/include/__availability
+++ b/libcxx/include/__availability
@@ -101,6 +101,12 @@
 # define _LIBCPP_AVAILABILITY_HAS_BAD_ANY_CAST 1
 # define _LIBCPP_AVAILABILITY_BAD_ANY_CAST

+// These macros control the availability of __cxa_init_primary_exception
+// in the built library, which std::make_exception_ptr might use
+// (see libcxx/include/__exception/exception_ptr.h).
+# define _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION 1
+# define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION
+
 // These macros control the availability of all parts of <filesystem> that
 // depend on something in the dylib.
 # define _LIBCPP_AVAILABILITY_HAS_FILESYSTEM_LIBRARY 1
@@ -136,9 +142,9 @@
 # define _LIBCPP_AVAILABILITY_HAS_TZDB 1
 # define _LIBCPP_AVAILABILITY_TZDB

-  // This controls the availability of C++23 <print>, which
-  // has a dependency on the built library (it needs access to
-  // the underlying buffer types of std::cout, std::cerr, and std::clog.
+// This controls the availability of C++23 <print>, which
+// has a dependency on the built library (it needs access to
+// the underlying buffer types of std::cout, std::cerr, and std::clog).
 # define _LIBCPP_AVAILABILITY_HAS_PRINT 1
 # define _LIBCPP_AVAILABILITY_PRINT

@@ -167,6 +173,10 @@
 # define _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS
 # define _LIBCPP_AVAILABILITY_BAD_ANY_CAST _LIBCPP_AVAILABILITY_BAD_OPTIONAL_ACCESS

+// TODO: Update once this is released
+# define _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION 0
+# define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION __attribute__((unavailable))
+
 //   // clang-format off
 # if (defined(__ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__) && __ENVIRONMENT_MAC_OS_X_VERSION_MIN_REQUIRED__ < 101500) || \
@@ -303,4 +313,13 @@
 # define _LIBCPP_AVAILABILITY_THROW_BAD_VARIANT_ACCESS _LIBCPP_AVAILABILITY_BAD_VARIANT_ACCESS
 #endif

+// Define availability attributes that depend on both
+// _LIBCPP_HAS_NO_EXCEPTIONS and _LIBCPP_HAS_NO_RTTI.
+#if defined(_LIBCPP_HAS_NO_EXCEPTIONS) || defined(_LIBCPP_HAS_NO_RTTI)
+# undef _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION
+# undef _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION
+# define _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION 0
+# define _LIBCPP_AVAILABILITY_INIT_PRIMARY_EXCEPTION
+#endif
+
 #endif // _LIBCPP___AVAILABILITY
diff --git a/libcxx/include/__exception/exception_ptr.h b/libcxx/include/__exception/exception_ptr.h
index 970d8196724b7..53e2f718bc1b3 100644
--- a/libcxx/include/__exception/exception_ptr.h
+++ b/libcxx/include/__exception/exception_ptr.h
@@ -9,16 +9,44 @@
 #ifndef _LIBCPP___EXCEPTION_EXCEPTION_PTR_H
 #define _LIBCPP___EXCEPTION_EXCEPTION_PTR_H

+#include <__availability>
 #include <__config>
 #include <__exception/operations.h>
 #include <__memory/addressof.h>
+#include <__memory/construct_at.h>
+#include <__type_traits/decay.h>
 #include <cstddef>
 #include <cstdlib>
+#include <new>
+#include <typeinfo>

 #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
 #  pragma GCC system_header
 #endif

+#ifndef _LIBCPP_ABI_MICROSOFT
+
+namespace __cxxabiv1 {
+
+extern "C" {
+_LIBCPP_OVERRIDABLE_FUNC_VIS void* __cxa_allocate_exception(size_t) throw();
+_LIBCPP_OVERRIDABLE_FUNC_VIS void __cxa_free_exception(void*) throw();
+
+struct __cxa_exception;
+_LIBCPP_OVERRIDABLE_FUNC_VIS __cxa_exception* __cxa_init_primary_exception(
+    void*,
+    std::type_info*,
+    void(
+# if defined(_WIN32)
+        __thiscall
+# endif
+            *)(void*)) throw();
+}
+
+} // namespace __cxxabiv1
+
+#endif
+
 namespace std { // purposefully not using versioning namespace

 #ifndef _LIBCPP_ABI_MICROSOFT
@@ -26,6 +54,11 @@ namespace std { // purposefully not using versioning namespace
 class _LIBCPP_EXPORTED_FROM_ABI exception_ptr {
   void* __ptr_;

+  static exception_ptr __from_native_exception_pointer(void*) _NOEXCEPT;
+
+  template <class _Ep>
+  friend _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep) _NOEXCEPT;
+
 public:
   _LIBCPP_HIDE_FROM_ABI exception_ptr() _NOEXCEPT : __ptr_() {}
   _LIBCPP_HIDE_FROM_ABI exception_ptr(nullptr_t) _NOEXCEPT : __ptr_() {}
@@ -51,11 +84,28 @@ class _LIBCPP_EXPORTED_FROM_ABI exception_ptr {
 template <class _Ep>
 _LIBCPP_HIDE_FROM_ABI exception_ptr make_exception_ptr(_Ep __e) _NOEXCEPT {
 # ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+# if _LIBCPP_AVAILABILITY_HAS_INIT_PRIMARY_EXCEPTION && __cplusplus >= 201103L
+  using _Ep2 = __decay_t<_Ep>;
+
+  void* __ex = __cxxabiv1::__cxa_allocate_exception(sizeof(_Ep));
+  (void)__cxxabiv1::__cxa_init_primary_exception(__ex, const_cast<std::type_info*>(&typeid(_Ep)), [](void* __p) {
+    std::__destroy_at(static_cast<_Ep2*>(__p));
+  });
+
+  try {
+    ::new (__ex) _Ep2(__e);
+    return exception_ptr::__from_native_exception_pointer(__ex);
+  } catch (...) {
+    __cxxabiv1::__cxa_free_exception(__ex);
+    return current_exception();
+  }
+# else
   try {
     throw __e;
   } catch (...)
{ return current_exception(); } +# endif # else ((void)__e); std::abort(); diff --git a/libcxx/include/new b/libcxx/include/new index 136adc41c24be..86fbcb524b66d 100644 --- a/libcxx/include/new +++ b/libcxx/include/new @@ -362,7 +362,6 @@ _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include -# include # include #endif diff --git a/libcxx/include/typeinfo b/libcxx/include/typeinfo index 39a90676cc440..1144b5b12913e 100644 --- a/libcxx/include/typeinfo +++ b/libcxx/include/typeinfo @@ -372,7 +372,6 @@ _LIBCPP_END_NAMESPACE_STD #if !defined(_LIBCPP_REMOVE_TRANSITIVE_INCLUDES) && _LIBCPP_STD_VER <= 20 # include -# include # include #endif diff --git a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index b51af1bb0f9ef..c2fea4d8adb42 100644 --- a/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/arm64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -300,6 +300,7 @@ {'is_defined': False, 'name': '___cxa_guard_acquire', 'type': 'U'} {'is_defined': False, 'name': '___cxa_guard_release', 'type': 'U'} {'is_defined': False, 'name': '___cxa_increment_exception_refcount', 'type': 'U'} +{'is_defined': False, 'name': '___cxa_init_primary_exception', 'type': 'U'} {'is_defined': False, 'name': '___cxa_pure_virtual', 'type': 'U'} {'is_defined': False, 'name': '___cxa_rethrow', 'type': 'U'} {'is_defined': False, 'name': '___cxa_rethrow_primary_exception', 'type': 'U'} @@ -811,6 +812,7 @@ {'is_defined': True, 'name': '__ZNSt13bad_exceptionD0Ev', 'type': 'I'} {'is_defined': True, 'name': '__ZNSt13bad_exceptionD1Ev', 'type': 'I'} {'is_defined': True, 'name': '__ZNSt13bad_exceptionD2Ev', 'type': 'I'} +{'is_defined': True, 'name': '__ZNSt13exception_ptr31__from_native_exception_pointerEPv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt13exception_ptrC1ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt13exception_ptrC2ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt13exception_ptrD1Ev', 'type': 'FUNC'} @@ -2509,6 +2511,7 @@ {'is_defined': True, 'name': '___cxa_guard_abort', 'type': 'I'} {'is_defined': True, 'name': '___cxa_guard_acquire', 'type': 'I'} {'is_defined': True, 'name': '___cxa_guard_release', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_init_primary_exception', 'type': 'I'} {'is_defined': True, 'name': '___cxa_pure_virtual', 'type': 'I'} {'is_defined': True, 'name': '___cxa_rethrow', 'type': 'I'} {'is_defined': True, 'name': '___cxa_throw', 'type': 'I'} diff --git a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index c78fa0e7c68f7..a60f099b53205 100644 --- a/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/i686-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -493,6 +493,7 @@ {'is_defined': True, 'name': '_ZNSt13bad_exceptionD0Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13bad_exceptionD1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13bad_exceptionD2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt13exception_ptr31__from_native_exception_pointerEPv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrC1ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrC2ERKS_', 'type': 'FUNC'} {'is_defined': 
True, 'name': '_ZNSt13exception_ptrD1Ev', 'type': 'FUNC'} @@ -2311,6 +2312,7 @@ {'is_defined': True, 'name': '__cxa_guard_acquire', 'type': 'FUNC'} {'is_defined': True, 'name': '__cxa_guard_release', 'type': 'FUNC'} {'is_defined': True, 'name': '__cxa_increment_exception_refcount', 'type': 'FUNC'} +{'is_defined': True, 'name': '__cxa_init_primary_exception', 'type': 'FUNC'} {'is_defined': True, 'name': '__cxa_new_handler', 'size': 4, 'type': 'OBJECT'} {'is_defined': True, 'name': '__cxa_pure_virtual', 'type': 'FUNC'} {'is_defined': True, 'name': '__cxa_rethrow', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index 55987d4c913bb..a159ff5221866 100644 --- a/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -249,6 +249,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD0Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt13exception_ptr31__from_native_exception_pointerEPv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt13exception_ptrC1ERKS_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt13exception_ptrC2ERKS_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt13exception_ptrD1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} @@ -1111,6 +1112,7 @@ {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_guard_acquire', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_guard_release', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_increment_exception_refcount', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_init_primary_exception', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_pure_virtual', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_rethrow', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_rethrow_primary_exception', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist index d861d890861bf..5749a7520f9ba 100644 --- a/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/powerpc64-ibm-aix.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -249,6 +249,7 @@ {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD0Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD1Ev', 
'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD2Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt13exception_ptr31__from_native_exception_pointerEPv', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt13exception_ptrC1ERKS_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt13exception_ptrC2ERKS_', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'EXP', 'is_defined': True, 'name': '_ZNSt13exception_ptrD1Ev', 'storage_mapping_class': 'DS', 'type': 'FUNC'} @@ -1111,6 +1112,7 @@ {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_guard_acquire', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_guard_release', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_increment_exception_refcount', 'storage_mapping_class': 'DS', 'type': 'FUNC'} +{'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_init_primary_exception', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_pure_virtual', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_rethrow', 'storage_mapping_class': 'DS', 'type': 'FUNC'} {'import_export': 'IMP', 'is_defined': False, 'name': '__cxa_rethrow_primary_exception', 'storage_mapping_class': 'DS', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist index ce0a2e354d6d2..e827114f16919 100644 --- a/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-apple-darwin.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -300,6 +300,7 @@ {'is_defined': False, 'name': '___cxa_guard_acquire', 'type': 'U'} {'is_defined': False, 'name': '___cxa_guard_release', 'type': 'U'} {'is_defined': False, 'name': '___cxa_increment_exception_refcount', 'type': 'U'} +{'is_defined': False, 'name': '___cxa_init_primary_exception', 'type': 'U'} {'is_defined': False, 'name': '___cxa_pure_virtual', 'type': 'U'} {'is_defined': False, 'name': '___cxa_rethrow', 'type': 'U'} {'is_defined': False, 'name': '___cxa_rethrow_primary_exception', 'type': 'U'} @@ -811,6 +812,7 @@ {'is_defined': True, 'name': '__ZNSt13bad_exceptionD0Ev', 'type': 'I'} {'is_defined': True, 'name': '__ZNSt13bad_exceptionD1Ev', 'type': 'I'} {'is_defined': True, 'name': '__ZNSt13bad_exceptionD2Ev', 'type': 'I'} +{'is_defined': True, 'name': '__ZNSt13exception_ptr31__from_native_exception_pointerEPv', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt13exception_ptrC1ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt13exception_ptrC2ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '__ZNSt13exception_ptrD1Ev', 'type': 'FUNC'} @@ -2543,6 +2545,7 @@ {'is_defined': True, 'name': '___cxa_guard_abort', 'type': 'I'} {'is_defined': True, 'name': '___cxa_guard_acquire', 'type': 'I'} {'is_defined': True, 'name': '___cxa_guard_release', 'type': 'I'} +{'is_defined': True, 'name': '___cxa_init_primary_exception', 'type': 'I'} {'is_defined': True, 'name': '___cxa_pure_virtual', 'type': 'I'} {'is_defined': True, 'name': '___cxa_rethrow', 
'type': 'I'} {'is_defined': True, 'name': '___cxa_throw', 'type': 'I'} diff --git a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist index ae88d4453e3b9..f4077adc074e0 100644 --- a/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-linux-android21.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -493,6 +493,7 @@ {'is_defined': True, 'name': '_ZNSt13bad_exceptionD0Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13bad_exceptionD1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13bad_exceptionD2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt13exception_ptr31__from_native_exception_pointerEPv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrC1ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrC2ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrD1Ev', 'type': 'FUNC'} @@ -2305,6 +2306,7 @@ {'is_defined': True, 'name': '__cxa_guard_acquire', 'type': 'FUNC'} {'is_defined': True, 'name': '__cxa_guard_release', 'type': 'FUNC'} {'is_defined': True, 'name': '__cxa_increment_exception_refcount', 'type': 'FUNC'} +{'is_defined': True, 'name': '__cxa_init_primary_exception', 'type': 'FUNC'} {'is_defined': True, 'name': '__cxa_new_handler', 'size': 8, 'type': 'OBJECT'} {'is_defined': True, 'name': '__cxa_pure_virtual', 'type': 'FUNC'} {'is_defined': True, 'name': '__cxa_rethrow', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist index f55dffc312090..e3d3fcb35d840 100644 --- a/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-freebsd.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -56,6 +56,7 @@ {'is_defined': False, 'name': '__cxa_guard_acquire', 'type': 'FUNC'} {'is_defined': False, 'name': '__cxa_guard_release', 'type': 'FUNC'} {'is_defined': False, 'name': '__cxa_increment_exception_refcount', 'type': 'FUNC'} +{'is_defined': False, 'name': '__cxa_init_primary_exception', 'type': 'FUNC'} {'is_defined': False, 'name': '__cxa_pure_virtual', 'type': 'FUNC'} {'is_defined': False, 'name': '__cxa_rethrow', 'type': 'FUNC'} {'is_defined': False, 'name': '__cxa_rethrow_primary_exception', 'type': 'FUNC'} @@ -523,6 +524,7 @@ {'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD0Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt13exception_ptr31__from_native_exception_pointerEPv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrC1ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrC2ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrD1Ev', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist index 74408ca930c90..16923301d2548 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.exceptions.nonew.abilist @@ -54,6 +54,7 
@@ {'is_defined': False, 'name': '__cxa_guard_acquire', 'type': 'FUNC'} {'is_defined': False, 'name': '__cxa_guard_release', 'type': 'FUNC'} {'is_defined': False, 'name': '__cxa_increment_exception_refcount', 'type': 'FUNC'} +{'is_defined': False, 'name': '__cxa_init_primary_exception', 'type': 'FUNC'} {'is_defined': False, 'name': '__cxa_pure_virtual', 'type': 'FUNC'} {'is_defined': False, 'name': '__cxa_rethrow', 'type': 'FUNC'} {'is_defined': False, 'name': '__cxa_rethrow_primary_exception', 'type': 'FUNC'} @@ -521,6 +522,7 @@ {'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD0Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt13exception_ptr31__from_native_exception_pointerEPv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrC1ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrC2ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrD1Ev', 'type': 'FUNC'} diff --git a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist index 618d2968d1a64..2380ffb100de9 100644 --- a/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist +++ b/libcxx/lib/abi/x86_64-unknown-linux-gnu.libcxxabi.v1.stable.noexceptions.nonew.abilist @@ -493,6 +493,7 @@ {'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD0Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt12experimental19bad_optional_accessD2Ev', 'type': 'FUNC'} +{'is_defined': True, 'name': '_ZNSt13exception_ptr31__from_native_exception_pointerEPv', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrC1ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrC2ERKS_', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZNSt13exception_ptrD1Ev', 'type': 'FUNC'} @@ -1999,4 +2000,4 @@ {'is_defined': True, 'name': '_ZTv0_n24_NSt3__114basic_iostreamIcNS_11char_traitsIcEEED0Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZTv0_n24_NSt3__114basic_iostreamIcNS_11char_traitsIcEEED1Ev', 'type': 'FUNC'} {'is_defined': True, 'name': '_ZTv0_n24_NSt3__19strstreamD0Ev', 'type': 'FUNC'} -{'is_defined': True, 'name': '_ZTv0_n24_NSt3__19strstreamD1Ev', 'type': 'FUNC'} \ No newline at end of file +{'is_defined': True, 'name': '_ZTv0_n24_NSt3__19strstreamD1Ev', 'type': 'FUNC'} diff --git a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp index c07de5838b14a..bdb17b9996b7e 100644 --- a/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp +++ b/libcxx/src/support/runtime/exception_pointer_cxxabi.ipp @@ -28,6 +28,14 @@ exception_ptr& exception_ptr::operator=(const exception_ptr& other) noexcept { return *this; } +exception_ptr exception_ptr::__from_native_exception_pointer(void* __e) noexcept { + exception_ptr ptr; + ptr.__ptr_ = __e; + __cxa_increment_exception_refcount(ptr.__ptr_); + + return ptr; +} + nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {} nested_exception::~nested_exception() noexcept {} diff --git a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp 
b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp index 8e0e63cd4d497..6dad248f9e1fd 100644 --- a/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp +++ b/libcxx/src/support/runtime/exception_pointer_glibcxx.ipp @@ -23,6 +23,7 @@ namespace __exception_ptr { struct exception_ptr { void* __ptr_; + explicit exception_ptr(void*) noexcept; exception_ptr(const exception_ptr&) noexcept; exception_ptr& operator=(const exception_ptr&) noexcept; ~exception_ptr() noexcept; @@ -45,6 +46,13 @@ exception_ptr& exception_ptr::operator=(const exception_ptr& other) noexcept { return *this; } +exception_ptr exception_ptr::__from_native_exception_pointer(void* __e) noexcept { + exception_ptr ptr{}; + new (reinterpret_cast(&ptr)) __exception_ptr::exception_ptr(__e); + + return ptr; +} + nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {} _LIBCPP_NORETURN void nested_exception::rethrow_nested() const { diff --git a/libcxx/test/libcxx/transitive_includes/cxx03.csv b/libcxx/test/libcxx/transitive_includes/cxx03.csv index 5769fc70d8a75..7737551fa08b1 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx03.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx03.csv @@ -221,7 +221,9 @@ deque typeinfo deque version exception cstddef exception cstdlib +exception new exception type_traits +exception typeinfo exception version execution cstddef execution version @@ -536,7 +538,6 @@ mutex typeinfo mutex version new cstddef new cstdlib -new exception new type_traits new version numbers concepts @@ -643,7 +644,6 @@ ranges initializer_list ranges iosfwd ranges iterator ranges limits -ranges new ranges optional ranges span ranges tuple @@ -875,7 +875,6 @@ typeindex version typeinfo cstddef typeinfo cstdint typeinfo cstdlib -typeinfo exception typeinfo type_traits unordered_map algorithm unordered_map bit diff --git a/libcxx/test/libcxx/transitive_includes/cxx11.csv b/libcxx/test/libcxx/transitive_includes/cxx11.csv index 863b674a13d0a..21f0138b1b4bb 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx11.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx11.csv @@ -222,7 +222,9 @@ deque typeinfo deque version exception cstddef exception cstdlib +exception new exception type_traits +exception typeinfo exception version execution cstddef execution version @@ -541,7 +543,6 @@ mutex typeinfo mutex version new cstddef new cstdlib -new exception new type_traits new version numbers concepts @@ -648,7 +649,6 @@ ranges initializer_list ranges iosfwd ranges iterator ranges limits -ranges new ranges optional ranges span ranges tuple @@ -881,7 +881,6 @@ typeindex version typeinfo cstddef typeinfo cstdint typeinfo cstdlib -typeinfo exception typeinfo type_traits unordered_map algorithm unordered_map bit diff --git a/libcxx/test/libcxx/transitive_includes/cxx14.csv b/libcxx/test/libcxx/transitive_includes/cxx14.csv index bbaf1ed89fac6..71d1735d79487 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx14.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx14.csv @@ -222,7 +222,9 @@ deque typeinfo deque version exception cstddef exception cstdlib +exception new exception type_traits +exception typeinfo exception version execution cstddef execution version @@ -543,7 +545,6 @@ mutex typeinfo mutex version new cstddef new cstdlib -new exception new type_traits new version numbers concepts @@ -650,7 +651,6 @@ ranges initializer_list ranges iosfwd ranges iterator ranges limits -ranges new ranges optional ranges span ranges tuple @@ -883,7 +883,6 @@ typeindex version typeinfo cstddef typeinfo 
cstdint typeinfo cstdlib -typeinfo exception typeinfo type_traits unordered_map algorithm unordered_map bit diff --git a/libcxx/test/libcxx/transitive_includes/cxx17.csv b/libcxx/test/libcxx/transitive_includes/cxx17.csv index bbaf1ed89fac6..71d1735d79487 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx17.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx17.csv @@ -222,7 +222,9 @@ deque typeinfo deque version exception cstddef exception cstdlib +exception new exception type_traits +exception typeinfo exception version execution cstddef execution version @@ -543,7 +545,6 @@ mutex typeinfo mutex version new cstddef new cstdlib -new exception new type_traits new version numbers concepts @@ -650,7 +651,6 @@ ranges initializer_list ranges iosfwd ranges iterator ranges limits -ranges new ranges optional ranges span ranges tuple @@ -883,7 +883,6 @@ typeindex version typeinfo cstddef typeinfo cstdint typeinfo cstdlib -typeinfo exception typeinfo type_traits unordered_map algorithm unordered_map bit diff --git a/libcxx/test/libcxx/transitive_includes/cxx20.csv b/libcxx/test/libcxx/transitive_includes/cxx20.csv index 07e23053e9558..4d4372275eb82 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx20.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx20.csv @@ -228,7 +228,9 @@ deque typeinfo deque version exception cstddef exception cstdlib +exception new exception type_traits +exception typeinfo exception version execution cstddef execution version @@ -548,7 +550,6 @@ mutex typeinfo mutex version new cstddef new cstdlib -new exception new type_traits new version numbers concepts @@ -655,7 +656,6 @@ ranges initializer_list ranges iosfwd ranges iterator ranges limits -ranges new ranges optional ranges span ranges tuple @@ -887,7 +887,6 @@ typeindex version typeinfo cstddef typeinfo cstdint typeinfo cstdlib -typeinfo exception typeinfo type_traits unordered_map algorithm unordered_map bit diff --git a/libcxx/test/libcxx/transitive_includes/cxx23.csv b/libcxx/test/libcxx/transitive_includes/cxx23.csv index 23a1f42556f22..94c67307cc2f1 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx23.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx23.csv @@ -153,6 +153,8 @@ deque tuple deque version exception cstddef exception cstdlib +exception new +exception typeinfo exception version execution cstddef execution version diff --git a/libcxx/test/libcxx/transitive_includes/cxx26.csv b/libcxx/test/libcxx/transitive_includes/cxx26.csv index 23a1f42556f22..94c67307cc2f1 100644 --- a/libcxx/test/libcxx/transitive_includes/cxx26.csv +++ b/libcxx/test/libcxx/transitive_includes/cxx26.csv @@ -153,6 +153,8 @@ deque tuple deque version exception cstddef exception cstdlib +exception new +exception typeinfo exception version execution cstddef execution version diff --git a/libcxxabi/include/cxxabi.h b/libcxxabi/include/cxxabi.h index 6f4f823bc273f..d0701181751c5 100644 --- a/libcxxabi/include/cxxabi.h +++ b/libcxxabi/include/cxxabi.h @@ -36,6 +36,9 @@ class type_info; // forward declaration // runtime routines use C calling conventions, but are in __cxxabiv1 namespace namespace __cxxabiv1 { + +struct __cxa_exception; + extern "C" { // 2.4.2 Allocating the Exception Object @@ -43,6 +46,9 @@ extern _LIBCXXABI_FUNC_VIS void * __cxa_allocate_exception(size_t thrown_size) throw(); extern _LIBCXXABI_FUNC_VIS void __cxa_free_exception(void *thrown_exception) throw(); +// This function is an LLVM extension, which mirrors the same extension in libsupc++ and libcxxrt +extern _LIBCXXABI_FUNC_VIS __cxa_exception* 
+__cxa_init_primary_exception(void* object, std::type_info* tinfo, void(_LIBCXXABI_DTOR_FUNC* dest)(void*)) throw();

 // 2.4.3 Throwing the Exception Object
 extern _LIBCXXABI_FUNC_VIS _LIBCXXABI_NORETURN void
diff --git a/libcxxabi/lib/exceptions.exp b/libcxxabi/lib/exceptions.exp
index c3780d2974903..9dcfbdbd3598e 100644
--- a/libcxxabi/lib/exceptions.exp
+++ b/libcxxabi/lib/exceptions.exp
@@ -7,5 +7,6 @@ ___cxa_end_catch
 ___cxa_free_dependent_exception
 ___cxa_free_exception
 ___cxa_get_exception_ptr
+___cxa_init_primary_exception
 ___cxa_rethrow
 ___cxa_throw
diff --git a/libcxxabi/src/cxa_exception.cpp b/libcxxabi/src/cxa_exception.cpp
index f723ececfe332..65e9f4504ddad 100644
--- a/libcxxabi/src/cxa_exception.cpp
+++ b/libcxxabi/src/cxa_exception.cpp
@@ -206,6 +206,19 @@ void __cxa_free_exception(void *thrown_object) throw() {
     __aligned_free_with_fallback((void *)raw_buffer);
 }

+__cxa_exception* __cxa_init_primary_exception(void* object, std::type_info* tinfo,
+                                              void(_LIBCXXABI_DTOR_FUNC* dest)(void*)) throw() {
+    __cxa_exception* exception_header = cxa_exception_from_thrown_object(object);
+    exception_header->referenceCount = 0;
+    exception_header->unexpectedHandler = std::get_unexpected();
+    exception_header->terminateHandler = std::get_terminate();
+    exception_header->exceptionType = tinfo;
+    exception_header->exceptionDestructor = dest;
+    setOurExceptionClass(&exception_header->unwindHeader);
+    exception_header->unwindHeader.exception_cleanup = exception_cleanup_func;
+
+    return exception_header;
+}

 // This function shall allocate a __cxa_dependent_exception and
 // return a pointer to it. (Really to the object, not past its end.)
@@ -260,22 +273,15 @@ __cxa_throw(void *thrown_object, std::type_info *tinfo, void *(_LIBCXXABI_DTOR_F
 #else
 __cxa_throw(void *thrown_object, std::type_info *tinfo, void (_LIBCXXABI_DTOR_FUNC *dest)(void *)) {
 #endif
-    __cxa_eh_globals *globals = __cxa_get_globals();
-    __cxa_exception* exception_header = cxa_exception_from_thrown_object(thrown_object);
-
-    exception_header->unexpectedHandler = std::get_unexpected();
-    exception_header->terminateHandler = std::get_terminate();
-    exception_header->exceptionType = tinfo;
-    exception_header->exceptionDestructor = dest;
-    setOurExceptionClass(&exception_header->unwindHeader);
-    exception_header->referenceCount = 1; // This is a newly allocated exception, no need for thread safety.
-    globals->uncaughtExceptions += 1; // Not atomically, since globals are thread-local
+    __cxa_eh_globals* globals = __cxa_get_globals();
+    globals->uncaughtExceptions += 1; // Not atomically, since globals are thread-local

-    exception_header->unwindHeader.exception_cleanup = exception_cleanup_func;
+    __cxa_exception* exception_header = __cxa_init_primary_exception(thrown_object, tinfo, dest);
+    exception_header->referenceCount = 1; // This is a newly allocated exception, no need for thread safety.

 #if __has_feature(address_sanitizer)
-    // Inform the ASan runtime that now might be a good time to clean stuff up.
-    __asan_handle_no_return();
+  // Inform the ASan runtime that now might be a good time to clean stuff up.
+  __asan_handle_no_return();
 #endif
 #ifdef __USING_SJLJ_EXCEPTIONS__

From 6a80e56ad0c7ae0adb8c4fb3f88eab7643566f41 Mon Sep 17 00:00:00 2001
From: Piotr Zegar
Date: Mon, 22 Jan 2024 16:17:33 +0100
Subject: [PATCH 407/843] [clang-tidy] Fix macros handling in
 cppcoreguidelines-prefer-member-initializer (#72037)

It now produces valid fixes for member variables initialized with macros.
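For illustration, a minimal sketch of the pattern involved (distilled from
the regression test added further down; the RGB macro here is a shortened
stand-in for the test's own definition, not a real API):

  #define RGB(r, g, b) (((unsigned long)(r)) | (((unsigned long)(g)) << 8) | (((unsigned long)(b)) << 16))

  class Foo {
  public:
    Foo() {
      // warning: 'm_color' should be initialized in a member initializer
      // of the constructor [cppcoreguidelines-prefer-member-initializer]
      m_color = RGB(255, 128, 0);
    }
  private:
    unsigned long m_color;
  };

  // The fix-it used to be built from locations inside the macro body and
  // could produce invalid code; it now inserts the macro invocation as
  // spelled at the expansion site:
  //   Foo() : m_color(RGB(255, 128, 0)) {}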
The check now correctly uses the expansion location instead of the location
inside the macro when extracting the initializer code.

Closes #70189

---
 .../PreferMemberInitializerCheck.cpp | 8 ++---
 clang-tools-extra/docs/ReleaseNotes.rst | 3 +-
 .../prefer-member-initializer.cpp | 30 +++++++++++++++++++
 3 files changed, 36 insertions(+), 5 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp
index 23d7303371529..f79a67819bb85 100644
--- a/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp
+++ b/clang-tools-extra/clang-tidy/cppcoreguidelines/PreferMemberInitializerCheck.cpp
@@ -261,9 +261,9 @@ void PreferMemberInitializerCheck::check(

       SmallString<128> Insertion(
           {UseAssignment ? " = " : "{",
-           Lexer::getSourceText(
-               CharSourceRange(InitValue->getSourceRange(), true),
-               *Result.SourceManager, getLangOpts()),
+           Lexer::getSourceText(Result.SourceManager->getExpansionRange(
+                                    InitValue->getSourceRange()),
+                                *Result.SourceManager, getLangOpts()),
            UseAssignment ? "" : "}"});

       Diag << FixItHint::CreateInsertion(FieldEnd, Insertion)
@@ -346,7 +346,7 @@ void PreferMemberInitializerCheck::check(
       if (InvalidFix)
         continue;
       StringRef NewInit = Lexer::getSourceText(
-          CharSourceRange(InitValue->getSourceRange(), true),
+          Result.SourceManager->getExpansionRange(InitValue->getSourceRange()),
           *Result.SourceManager, getLangOpts());
       if (HasInitAlready) {
         if (InsertPos.isValid())
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 2822b17b43514..7ab091e10231c 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -337,7 +337,8 @@ Changes in existing checks

 - Improved :doc:`cppcoreguidelines-prefer-member-initializer
   <clang-tidy/checks/cppcoreguidelines/prefer-member-initializer>` check to
   ignore delegate constructors and ignore re-assignment for reference or when
-  initialization depend on field that is initialized before.
+  initialization depend on field that is initialized before. Additionally, it
+  now provides valid fixes for member variables initialized with macros.
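For reference, the "delegate constructor" case that the note above says is
ignored looks roughly like this (an illustrative sketch, not code taken from
the check's own tests):

  struct Widget {
    Widget(int v) : m_value(v) {}
    // Delegating constructor: the check stays silent about the assignment,
    // because a constructor that delegates cannot also have member
    // initializers for individual fields.
    Widget() : Widget(0) { m_value = 7; }
    int m_value;
  };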
- Improved :doc:`cppcoreguidelines-pro-bounds-array-to-pointer-decay ` check diff --git a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp index b5603dea316d5..8086caa2aa6f2 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/cppcoreguidelines/prefer-member-initializer.cpp @@ -586,3 +586,33 @@ struct ReassignmentAfterUnsafetyAssignment { } int m_i; }; + +namespace PR70189 { +#define RGB(r,g,b) ((unsigned long)(((unsigned char)(r)|((unsigned short)((unsigned char)(g))<<8))|(((unsigned long)(unsigned char)(b))<<16))) +#define INVALID_HANDLE_VALUE ((void*)(unsigned long long)-1) +#define SIMPLE 12 + +class Foo { +public: + Foo() { +// CHECK-FIXES: Foo() : m_color(RGB(255, 128, 0)), m_handle(INVALID_HANDLE_VALUE), m_myval(SIMPLE) { + m_color = RGB(255, 128, 0); +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'm_color' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] +// CHECK-FIXES: {{^\ *$}} + m_handle = INVALID_HANDLE_VALUE; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'm_handle' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] +// CHECK-FIXES: {{^\ *$}} + m_myval = SIMPLE; +// CHECK-MESSAGES: :[[@LINE-1]]:5: warning: 'm_myval' should be initialized in a member initializer of the constructor [cppcoreguidelines-prefer-member-initializer] +// CHECK-FIXES: {{^\ *$}} + } +private: + unsigned long m_color; + void* m_handle; + int m_myval; +}; + +#undef SIMPLE +#undef INVALID_HANDLE_VALUE +#undef RGB +} From 726d940586d7018ef03e87cb86601f4885f66001 Mon Sep 17 00:00:00 2001 From: Konstantin Zhuravlyov Date: Mon, 22 Jan 2024 10:32:35 -0500 Subject: [PATCH 408/843] AMDGPU/Docs: Add link to MI300 Instruction Set Architecture (#78777) --- llvm/docs/AMDGPUUsage.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index d2d622ee52017..548d677afdecb 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -323,7 +323,7 @@ Every processor supports every OS ABI (see :ref:`amdgpu-os`) with the following Add product names. - **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ + **GCN GFX9 (Vega)** [AMD-GCN-GFX900-GFX904-VEGA]_ [AMD-GCN-GFX906-VEGA7NM]_ [AMD-GCN-GFX908-CDNA1]_ [AMD-GCN-GFX90A-CDNA2]_ [AMD-GCN-GFX940-GFX942-CDNA3]_ ----------------------------------------------------------------------------------------------------------------------- ``gfx900`` ``amdgcn`` dGPU - xnack - Absolute - *rocm-amdhsa* - Radeon Vega flat - *pal-amdhsa* Frontier Edition @@ -14958,8 +14958,9 @@ For more information about instructions, their semantics and supported combinations of operands, refer to one of instruction set architecture manuals [AMD-GCN-GFX6]_, [AMD-GCN-GFX7]_, [AMD-GCN-GFX8]_, [AMD-GCN-GFX900-GFX904-VEGA]_, [AMD-GCN-GFX906-VEGA7NM]_, -[AMD-GCN-GFX908-CDNA1]_, [AMD-GCN-GFX90A-CDNA2]_, [AMD-GCN-GFX10-RDNA1]_, -[AMD-GCN-GFX10-RDNA2]_ and [AMD-GCN-GFX11-RDNA3]_. +[AMD-GCN-GFX908-CDNA1]_, [AMD-GCN-GFX90A-CDNA2]_, +[AMD-GCN-GFX940-GFX942-CDNA3]_, [AMD-GCN-GFX10-RDNA1]_, [AMD-GCN-GFX10-RDNA2]_ +and [AMD-GCN-GFX11-RDNA3]_. Operands ~~~~~~~~ @@ -15753,6 +15754,7 @@ Additional Documentation .. 
[AMD-GCN-GFX906-VEGA7NM] `AMD Vega 7nm Instruction Set Architecture `__ .. [AMD-GCN-GFX908-CDNA1] `AMD Instinct MI100 Instruction Set Architecture `__ .. [AMD-GCN-GFX90A-CDNA2] `AMD Instinct MI200 Instruction Set Architecture `__ +.. [AMD-GCN-GFX940-GFX942-CDNA3] `AMD Instinct MI300 Instruction Set Architecture `__ .. [AMD-GCN-GFX10-RDNA1] `AMD RDNA 1.0 Instruction Set Architecture `__ .. [AMD-GCN-GFX10-RDNA2] `AMD RDNA 2 Instruction Set Architecture `__ .. [AMD-GCN-GFX11-RDNA3] `AMD RDNA 3 Instruction Set Architecture `__ From 69fedaf830f8a2df4751a3c20189b7299daf88ae Mon Sep 17 00:00:00 2001 From: Daniel Grumberg Date: Mon, 22 Jan 2024 15:32:57 +0000 Subject: [PATCH 409/843] [clang][ExtractAPI] Add support C unions in non C++ parsing mode (#77451) Ensure that we generate correct symbol kinds and declaration fragments for unions in C and Objective-C parsing modes. rdar://120544091 --- clang/include/clang/ExtractAPI/API.h | 48 +-- .../clang/ExtractAPI/DeclarationFragments.h | 5 +- .../clang/ExtractAPI/ExtractAPIVisitor.h | 42 ++- .../ExtractAPI/Serialization/SerializerBase.h | 12 +- .../Serialization/SymbolGraphSerializer.h | 4 +- clang/lib/ExtractAPI/API.cpp | 35 ++- clang/lib/ExtractAPI/DeclarationFragments.cpp | 9 +- .../Serialization/SymbolGraphSerializer.cpp | 18 +- clang/test/ExtractAPI/union.c | 281 ++++++++++++++++++ 9 files changed, 380 insertions(+), 74 deletions(-) create mode 100644 clang/test/ExtractAPI/union.c diff --git a/clang/include/clang/ExtractAPI/API.h b/clang/include/clang/ExtractAPI/API.h index f4a6374161685..0a0f1bd1e95f7 100644 --- a/clang/include/clang/ExtractAPI/API.h +++ b/clang/include/clang/ExtractAPI/API.h @@ -169,6 +169,7 @@ struct APIRecord { RK_Enum, RK_StructField, RK_Struct, + RK_UnionField, RK_Union, RK_StaticField, RK_CXXField, @@ -478,17 +479,19 @@ struct EnumRecord : APIRecord { }; /// This holds information associated with struct fields. -struct StructFieldRecord : APIRecord { - StructFieldRecord(StringRef USR, StringRef Name, PresumedLoc Loc, +struct RecordFieldRecord : APIRecord { + RecordFieldRecord(StringRef USR, StringRef Name, PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, - DeclarationFragments SubHeading, bool IsFromSystemHeader) - : APIRecord(RK_StructField, USR, Name, Loc, std::move(Availability), + DeclarationFragments SubHeading, RecordKind Kind, + bool IsFromSystemHeader) + : APIRecord(Kind, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader) {} static bool classof(const APIRecord *Record) { - return Record->getKind() == RK_StructField; + return Record->getKind() == RK_StructField || + Record->getKind() == RK_UnionField; } private: @@ -496,19 +499,20 @@ struct StructFieldRecord : APIRecord { }; /// This holds information associated with structs. 
-struct StructRecord : APIRecord { - SmallVector> Fields; +struct RecordRecord : APIRecord { + SmallVector> Fields; - StructRecord(StringRef USR, StringRef Name, PresumedLoc Loc, + RecordRecord(StringRef USR, StringRef Name, PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, - DeclarationFragments SubHeading, bool IsFromSystemHeader) - : APIRecord(RK_Struct, USR, Name, Loc, std::move(Availability), + DeclarationFragments SubHeading, RecordKind Kind, + bool IsFromSystemHeader) + : APIRecord(Kind, USR, Name, Loc, std::move(Availability), LinkageInfo::none(), Comment, Declaration, SubHeading, IsFromSystemHeader) {} static bool classof(const APIRecord *Record) { - return Record->getKind() == RK_Struct; + return Record->getKind() == RK_Struct || Record->getKind() == RK_Union; } private: @@ -1266,30 +1270,31 @@ class APISet { DeclarationFragments Declaration, DeclarationFragments SubHeading, bool IsFromSystemHeader); - /// Create and add a struct field record into the API set. + /// Create and add a record field record into the API set. /// /// Note: the caller is responsible for keeping the StringRef \p Name and /// \p USR alive. APISet::copyString provides a way to copy strings into /// APISet itself, and APISet::recordUSR(const Decl *D) is a helper method /// to generate the USR for \c D and keep it alive in APISet. - StructFieldRecord * - addStructField(StructRecord *Struct, StringRef Name, StringRef USR, + RecordFieldRecord * + addRecordField(RecordRecord *Record, StringRef Name, StringRef USR, PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, - DeclarationFragments SubHeading, bool IsFromSystemHeader); + DeclarationFragments SubHeading, APIRecord::RecordKind Kind, + bool IsFromSystemHeader); - /// Create and add a struct record into the API set. + /// Create and add a record record into the API set. /// /// Note: the caller is responsible for keeping the StringRef \p Name and /// \p USR alive. APISet::copyString provides a way to copy strings into /// APISet itself, and APISet::recordUSR(const Decl *D) is a helper method /// to generate the USR for \c D and keep it alive in APISet. 
- StructRecord *addStruct(StringRef Name, StringRef USR, PresumedLoc Loc, + RecordRecord *addRecord(StringRef Name, StringRef USR, PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, - bool IsFromSystemHeader); + APIRecord::RecordKind Kind, bool IsFromSystemHeader); StaticFieldRecord * addStaticField(StringRef Name, StringRef USR, PresumedLoc Loc, @@ -1544,7 +1549,7 @@ class APISet { return GlobalVariableTemplatePartialSpecializations; } const RecordMap &getEnums() const { return Enums; } - const RecordMap &getStructs() const { return Structs; } + const RecordMap &getRecords() const { return Records; } const RecordMap &getCXXClasses() const { return CXXClasses; } const RecordMap &getCXXMethodTemplates() const { return CXXMethodTemplates; @@ -1563,7 +1568,6 @@ class APISet { const RecordMap &getCXXFieldTemplates() const { return CXXFieldTemplates; } - const RecordMap &getConcepts() const { return Concepts; } const RecordMap &getClassTemplates() const { return ClassTemplates; } @@ -1575,7 +1579,7 @@ class APISet { getClassTemplatePartialSpecializations() const { return ClassTemplatePartialSpecializations; } - const RecordMap &getRecords() const { return Concepts; } + const RecordMap &getConcepts() const { return Concepts; } const RecordMap &getObjCCategories() const { return ObjCCategories; } @@ -1641,7 +1645,7 @@ class APISet { RecordMap Concepts; RecordMap StaticFields; RecordMap Enums; - RecordMap Structs; + RecordMap Records; RecordMap CXXClasses; RecordMap CXXFields; RecordMap CXXMethods; diff --git a/clang/include/clang/ExtractAPI/DeclarationFragments.h b/clang/include/clang/ExtractAPI/DeclarationFragments.h index d719196b9a43e..1b78c8b5931e4 100644 --- a/clang/include/clang/ExtractAPI/DeclarationFragments.h +++ b/clang/include/clang/ExtractAPI/DeclarationFragments.h @@ -295,8 +295,9 @@ class DeclarationFragmentsBuilder { /// Build DeclarationFragments for a field declaration FieldDecl. static DeclarationFragments getFragmentsForField(const FieldDecl *); - /// Build DeclarationFragments for a struct record declaration RecordDecl. - static DeclarationFragments getFragmentsForStruct(const RecordDecl *); + /// Build DeclarationFragments for a struct/union record declaration + /// RecordDecl. + static DeclarationFragments getFragmentsForRecordDecl(const RecordDecl *); static DeclarationFragments getFragmentsForCXXClass(const CXXRecordDecl *); diff --git a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h index 1f76add1faae8..ac6f4e313540c 100644 --- a/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h +++ b/clang/include/clang/ExtractAPI/ExtractAPIVisitor.h @@ -14,6 +14,7 @@ #ifndef LLVM_CLANG_EXTRACTAPI_EXTRACT_API_VISITOR_H #define LLVM_CLANG_EXTRACTAPI_EXTRACT_API_VISITOR_H +#include "clang/AST/Decl.h" #include "clang/AST/DeclCXX.h" #include "clang/AST/DeclTemplate.h" #include "clang/Basic/OperatorKinds.h" @@ -129,9 +130,10 @@ class ExtractAPIVisitorBase : public RecursiveASTVisitor { void recordEnumConstants(EnumRecord *EnumRecord, const EnumDecl::enumerator_range Constants); - /// Collect API information for the struct fields and associate with the + /// Collect API information for the record fields and associate with the /// parent struct. 
- void recordStructFields(StructRecord *StructRecord, + void recordRecordFields(RecordRecord *RecordRecord, + APIRecord::RecordKind FieldKind, const RecordDecl::field_range Fields); /// Collect API information for the Objective-C methods and associate with the @@ -525,16 +527,25 @@ bool ExtractAPIVisitorBase::VisitRecordDecl(const RecordDecl *Decl) { // Build declaration fragments and sub-heading for the struct. DeclarationFragments Declaration = - DeclarationFragmentsBuilder::getFragmentsForStruct(Decl); + DeclarationFragmentsBuilder::getFragmentsForRecordDecl(Decl); DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Decl); - StructRecord *StructRecord = - API.addStruct(Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), - Comment, Declaration, SubHeading, isInSystemHeader(Decl)); + + auto RecordKind = APIRecord::RK_Struct; + auto FieldRecordKind = APIRecord::RK_StructField; + + if (Decl->isUnion()) { + RecordKind = APIRecord::RK_Union; + FieldRecordKind = APIRecord::RK_UnionField; + } + + RecordRecord *RecordRecord = API.addRecord( + Name, USR, Loc, AvailabilityInfo::createFromDecl(Decl), Comment, + Declaration, SubHeading, RecordKind, isInSystemHeader(Decl)); // Now collect information about the fields in this struct. - getDerivedExtractAPIVisitor().recordStructFields(StructRecord, - Decl->fields()); + getDerivedExtractAPIVisitor().recordRecordFields( + RecordRecord, FieldRecordKind, Decl->fields()); return true; } @@ -1055,8 +1066,8 @@ bool ExtractAPIVisitorBase::VisitTypedefNameDecl( dyn_cast(Decl->getUnderlyingType())) { if (const TagType *TagTy = dyn_cast(ET->desugar())) { if (Decl->getName() == TagTy->getDecl()->getName()) { - if (TagTy->getDecl()->isStruct()) { - modifyRecords(API.getStructs(), Decl->getName()); + if (isa(TagTy->getDecl())) { + modifyRecords(API.getRecords(), Decl->getName()); } if (TagTy->getDecl()->isEnum()) { modifyRecords(API.getEnums(), Decl->getName()); @@ -1171,8 +1182,9 @@ void ExtractAPIVisitorBase::recordEnumConstants( /// Collect API information for the struct fields and associate with the /// parent struct. template -void ExtractAPIVisitorBase::recordStructFields( - StructRecord *StructRecord, const RecordDecl::field_range Fields) { +void ExtractAPIVisitorBase::recordRecordFields( + RecordRecord *RecordRecord, APIRecord::RecordKind FieldKind, + const RecordDecl::field_range Fields) { for (const auto *Field : Fields) { // Collect symbol information. 
StringRef Name = Field->getName(); @@ -1191,9 +1203,9 @@ void ExtractAPIVisitorBase::recordStructFields( DeclarationFragments SubHeading = DeclarationFragmentsBuilder::getSubHeading(Field); - API.addStructField(StructRecord, Name, USR, Loc, - AvailabilityInfo::createFromDecl(Field), Comment, - Declaration, SubHeading, isInSystemHeader(Field)); + API.addRecordField( + RecordRecord, Name, USR, Loc, AvailabilityInfo::createFromDecl(Field), + Comment, Declaration, SubHeading, FieldKind, isInSystemHeader(Field)); } } diff --git a/clang/include/clang/ExtractAPI/Serialization/SerializerBase.h b/clang/include/clang/ExtractAPI/Serialization/SerializerBase.h index a188400a74d55..f0629a9ad56b0 100644 --- a/clang/include/clang/ExtractAPI/Serialization/SerializerBase.h +++ b/clang/include/clang/ExtractAPI/Serialization/SerializerBase.h @@ -65,7 +65,7 @@ template class APISetVisitor { getDerived()->traverseGlobalFunctionTemplateSpecializationRecords(); - getDerived()->traverseStructRecords(); + getDerived()->traverseRecordRecords(); getDerived()->traverseObjCInterfaces(); @@ -98,9 +98,9 @@ template class APISetVisitor { getDerived()->visitEnumRecord(*Enum.second); } - void traverseStructRecords() { - for (const auto &Struct : API.getStructs()) - getDerived()->visitStructRecord(*Struct.second); + void traverseRecordRecords() { + for (const auto &Record : API.getRecords()) + getDerived()->visitRecordRecord(*Record.second); } void traverseStaticFieldRecords() { @@ -238,8 +238,8 @@ template class APISetVisitor { /// Visit an enum record. void visitEnumRecord(const EnumRecord &Record){}; - /// Visit a struct record. - void visitStructRecord(const StructRecord &Record){}; + /// Visit a record record. + void visitRecordRecord(const RecordRecord &Record){}; void visitStaticFieldRecord(const StaticFieldRecord &Record){}; diff --git a/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h b/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h index a9b714dc48465..4249ac405fd26 100644 --- a/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h +++ b/clang/include/clang/ExtractAPI/Serialization/SymbolGraphSerializer.h @@ -170,8 +170,8 @@ class SymbolGraphSerializer : public APISetVisitor { /// Visit an enum record. void visitEnumRecord(const EnumRecord &Record); - /// Visit a struct record. - void visitStructRecord(const StructRecord &Record); + /// Visit a record record. 
+ void visitRecordRecord(const RecordRecord &Record); void visitStaticFieldRecord(const StaticFieldRecord &Record); diff --git a/clang/lib/ExtractAPI/API.cpp b/clang/lib/ExtractAPI/API.cpp index a7709fff85ffe..aa7a1e9360f47 100644 --- a/clang/lib/ExtractAPI/API.cpp +++ b/clang/lib/ExtractAPI/API.cpp @@ -144,31 +144,30 @@ EnumRecord *APISet::addEnum(StringRef Name, StringRef USR, PresumedLoc Loc, SubHeading, IsFromSystemHeader); } -StructFieldRecord *APISet::addStructField(StructRecord *Struct, StringRef Name, - StringRef USR, PresumedLoc Loc, - AvailabilityInfo Availability, - const DocComment &Comment, - DeclarationFragments Declaration, - DeclarationFragments SubHeading, - bool IsFromSystemHeader) { - auto Record = std::make_unique( +RecordFieldRecord *APISet::addRecordField( + RecordRecord *Record, StringRef Name, StringRef USR, PresumedLoc Loc, + AvailabilityInfo Availability, const DocComment &Comment, + DeclarationFragments Declaration, DeclarationFragments SubHeading, + APIRecord::RecordKind Kind, bool IsFromSystemHeader) { + auto RecordField = std::make_unique( USR, Name, Loc, std::move(Availability), Comment, Declaration, SubHeading, - IsFromSystemHeader); - Record->ParentInformation = APIRecord::HierarchyInformation( - Struct->USR, Struct->Name, Struct->getKind(), Struct); - USRBasedLookupTable.insert({USR, Record.get()}); - return Struct->Fields.emplace_back(std::move(Record)).get(); + Kind, IsFromSystemHeader); + RecordField->ParentInformation = APIRecord::HierarchyInformation( + Record->USR, Record->Name, Record->getKind(), Record); + USRBasedLookupTable.insert({USR, RecordField.get()}); + return Record->Fields.emplace_back(std::move(RecordField)).get(); } -StructRecord *APISet::addStruct(StringRef Name, StringRef USR, PresumedLoc Loc, +RecordRecord *APISet::addRecord(StringRef Name, StringRef USR, PresumedLoc Loc, AvailabilityInfo Availability, const DocComment &Comment, DeclarationFragments Declaration, DeclarationFragments SubHeading, + APIRecord::RecordKind Kind, bool IsFromSystemHeader) { - return addTopLevelRecord(USRBasedLookupTable, Structs, USR, Name, Loc, + return addTopLevelRecord(USRBasedLookupTable, Records, USR, Name, Loc, std::move(Availability), Comment, Declaration, - SubHeading, IsFromSystemHeader); + SubHeading, Kind, IsFromSystemHeader); } StaticFieldRecord * @@ -547,8 +546,8 @@ void GlobalFunctionRecord::anchor() {} void GlobalVariableRecord::anchor() {} void EnumConstantRecord::anchor() {} void EnumRecord::anchor() {} -void StructFieldRecord::anchor() {} -void StructRecord::anchor() {} +void RecordFieldRecord::anchor() {} +void RecordRecord::anchor() {} void CXXFieldRecord::anchor() {} void CXXClassRecord::anchor() {} void CXXConstructorRecord::anchor() {} diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index eb6eea0aaf546..044ccf5dea095 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -760,13 +760,16 @@ DeclarationFragmentsBuilder::getFragmentsForField(const FieldDecl *Field) { .append(";", DeclarationFragments::FragmentKind::Text); } -DeclarationFragments -DeclarationFragmentsBuilder::getFragmentsForStruct(const RecordDecl *Record) { +DeclarationFragments DeclarationFragmentsBuilder::getFragmentsForRecordDecl( + const RecordDecl *Record) { if (const auto *TypedefNameDecl = Record->getTypedefNameForAnonDecl()) return getFragmentsForTypedef(TypedefNameDecl); DeclarationFragments Fragments; - Fragments.append("struct", 
DeclarationFragments::FragmentKind::Keyword); + if (Record->isUnion()) + Fragments.append("union", DeclarationFragments::FragmentKind::Keyword); + else + Fragments.append("struct", DeclarationFragments::FragmentKind::Keyword); if (!Record->getName().empty()) Fragments.appendSpace().append( diff --git a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp index ee424a16fc1cf..349b93e2a2326 100644 --- a/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp +++ b/clang/lib/ExtractAPI/Serialization/SymbolGraphSerializer.cpp @@ -400,7 +400,7 @@ Object serializeSymbolKind(APIRecord::RecordKind RK, Language Lang) { Kind["identifier"] = AddLangPrefix("struct"); Kind["displayName"] = "Structure"; break; - case APIRecord::RK_CXXField: + case APIRecord::RK_UnionField: Kind["identifier"] = AddLangPrefix("property"); Kind["displayName"] = "Instance Property"; break; @@ -408,6 +408,10 @@ Object serializeSymbolKind(APIRecord::RecordKind RK, Language Lang) { Kind["identifier"] = AddLangPrefix("union"); Kind["displayName"] = "Union"; break; + case APIRecord::RK_CXXField: + Kind["identifier"] = AddLangPrefix("property"); + Kind["displayName"] = "Instance Property"; + break; case APIRecord::RK_StaticField: Kind["identifier"] = AddLangPrefix("type.property"); Kind["displayName"] = "Type Property"; @@ -871,12 +875,12 @@ void SymbolGraphSerializer::visitEnumRecord(const EnumRecord &Record) { serializeMembers(Record, Record.Constants); } -void SymbolGraphSerializer::visitStructRecord(const StructRecord &Record) { - auto Struct = serializeAPIRecord(Record); - if (!Struct) +void SymbolGraphSerializer::visitRecordRecord(const RecordRecord &Record) { + auto SerializedRecord = serializeAPIRecord(Record); + if (!SerializedRecord) return; - Symbols.emplace_back(std::move(*Struct)); + Symbols.emplace_back(std::move(*SerializedRecord)); serializeMembers(Record, Record.Fields); } @@ -1167,7 +1171,9 @@ void SymbolGraphSerializer::serializeSingleRecord(const APIRecord *Record) { visitEnumRecord(*cast(Record)); break; case APIRecord::RK_Struct: - visitStructRecord(*cast(Record)); + LLVM_FALLTHROUGH; + case APIRecord::RK_Union: + visitRecordRecord(*cast(Record)); break; case APIRecord::RK_StaticField: visitStaticFieldRecord(*cast(Record)); diff --git a/clang/test/ExtractAPI/union.c b/clang/test/ExtractAPI/union.c new file mode 100644 index 0000000000000..6ec9fd3ddf6e9 --- /dev/null +++ b/clang/test/ExtractAPI/union.c @@ -0,0 +1,281 @@ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: sed -e "s@INPUT_DIR@%{/t:regex_replacement}@g" \ +// RUN: %t/reference.output.json.in >> %t/reference.output.json +// RUN: %clang_cc1 -extract-api -triple arm64-apple-macosx -x c-header\ +// RUN: %t/input.h -o %t/output.json -verify + +// Generator version is not consistent across test runs, normalize it. +// RUN: sed -e "s@\"generator\": \".*\"@\"generator\": \"?\"@g" \ +// RUN: %t/output.json >> %t/output-normalized.json +// RUN: diff %t/reference.output.json %t/output-normalized.json + +//--- input.h +/// My Union +union Union{ + /// the a option + int a; + /// the b option + char b; +}; +// expected-no-diagnostics + +//--- reference.output.json.in +{ + "metadata": { + "formatVersion": { + "major": 0, + "minor": 5, + "patch": 3 + }, + "generator": "?" 
+ }, + "module": { + "name": "", + "platform": { + "architecture": "arm64", + "operatingSystem": { + "minimumVersion": { + "major": 11, + "minor": 0, + "patch": 0 + }, + "name": "macosx" + }, + "vendor": "apple" + } + }, + "relationships": [ + { + "kind": "memberOf", + "source": "c:@U@Union@FI@a", + "target": "c:@U@Union", + "targetFallback": "Union" + }, + { + "kind": "memberOf", + "source": "c:@U@Union@FI@b", + "target": "c:@U@Union", + "targetFallback": "Union" + } + ], + "symbols": [ + { + "accessLevel": "public", + "declarationFragments": [ + { + "kind": "keyword", + "spelling": "union" + }, + { + "kind": "text", + "spelling": " " + }, + { + "kind": "identifier", + "spelling": "Union" + }, + { + "kind": "text", + "spelling": ";" + } + ], + "docComment": { + "lines": [ + { + "range": { + "end": { + "character": 12, + "line": 0 + }, + "start": { + "character": 4, + "line": 0 + } + }, + "text": "My Union" + } + ] + }, + "identifier": { + "interfaceLanguage": "c", + "precise": "c:@U@Union" + }, + "kind": { + "displayName": "Union", + "identifier": "c.union" + }, + "location": { + "position": { + "character": 6, + "line": 1 + }, + "uri": "file://INPUT_DIR/input.h" + }, + "names": { + "navigator": [ + { + "kind": "identifier", + "spelling": "Union" + } + ], + "subHeading": [ + { + "kind": "identifier", + "spelling": "Union" + } + ], + "title": "Union" + }, + "pathComponents": [ + "Union" + ] + }, + { + "accessLevel": "public", + "declarationFragments": [ + { + "kind": "typeIdentifier", + "preciseIdentifier": "c:I", + "spelling": "int" + }, + { + "kind": "text", + "spelling": " " + }, + { + "kind": "identifier", + "spelling": "a" + }, + { + "kind": "text", + "spelling": ";" + } + ], + "docComment": { + "lines": [ + { + "range": { + "end": { + "character": 20, + "line": 2 + }, + "start": { + "character": 8, + "line": 2 + } + }, + "text": "the a option" + } + ] + }, + "identifier": { + "interfaceLanguage": "c", + "precise": "c:@U@Union@FI@a" + }, + "kind": { + "displayName": "Instance Property", + "identifier": "c.property" + }, + "location": { + "position": { + "character": 8, + "line": 3 + }, + "uri": "file://INPUT_DIR/input.h" + }, + "names": { + "navigator": [ + { + "kind": "identifier", + "spelling": "a" + } + ], + "subHeading": [ + { + "kind": "identifier", + "spelling": "a" + } + ], + "title": "a" + }, + "pathComponents": [ + "Union", + "a" + ] + }, + { + "accessLevel": "public", + "declarationFragments": [ + { + "kind": "typeIdentifier", + "preciseIdentifier": "c:C", + "spelling": "char" + }, + { + "kind": "text", + "spelling": " " + }, + { + "kind": "identifier", + "spelling": "b" + }, + { + "kind": "text", + "spelling": ";" + } + ], + "docComment": { + "lines": [ + { + "range": { + "end": { + "character": 20, + "line": 4 + }, + "start": { + "character": 8, + "line": 4 + } + }, + "text": "the b option" + } + ] + }, + "identifier": { + "interfaceLanguage": "c", + "precise": "c:@U@Union@FI@b" + }, + "kind": { + "displayName": "Instance Property", + "identifier": "c.property" + }, + "location": { + "position": { + "character": 9, + "line": 5 + }, + "uri": "file://INPUT_DIR/input.h" + }, + "names": { + "navigator": [ + { + "kind": "identifier", + "spelling": "b" + } + ], + "subHeading": [ + { + "kind": "identifier", + "spelling": "b" + } + ], + "title": "b" + }, + "pathComponents": [ + "Union", + "b" + ] + } + ] +} From d9cb37c9045bb0b3692d6faaeac43150a26e42e4 Mon Sep 17 00:00:00 2001 From: Paul T Robinson Date: Mon, 22 Jan 2024 07:38:33 -0800 Subject: [PATCH 410/843] [Headers][X86] Add 
macro descriptions to ia32intrin.h (#78613) These are largely copy-pasted from the corresponding function descriptions. Updated _rdtsc definition because it was just plain wrong. --- clang/lib/Headers/ia32intrin.h | 272 ++++++++++++++++++++++++++++++++- 1 file changed, 269 insertions(+), 3 deletions(-) diff --git a/clang/lib/Headers/ia32intrin.h b/clang/lib/Headers/ia32intrin.h index 7d5fede61ce85..1b979770e1962 100644 --- a/clang/lib/Headers/ia32intrin.h +++ b/clang/lib/Headers/ia32intrin.h @@ -37,6 +37,7 @@ /// \param __A /// A 32-bit integer operand. /// \returns A 32-bit integer containing the bit number. +/// \see _bit_scan_forward static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bsfd(int __A) { return __builtin_ctz((unsigned int)__A); @@ -53,6 +54,7 @@ __bsfd(int __A) { /// \param __A /// A 32-bit integer operand. /// \returns A 32-bit integer containing the bit number. +/// \see _bit_scan_reverse static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __bsrd(int __A) { return 31 - __builtin_clz((unsigned int)__A); @@ -88,7 +90,40 @@ _bswap(int __A) { return (int)__builtin_bswap32((unsigned int)__A); } +/// Find the first set bit starting from the lsb. Result is undefined if +/// input is 0. +/// +/// \headerfile +/// +/// \code +/// int _bit_scan_forward(int A); +/// \endcode +/// +/// This intrinsic corresponds to the \c BSF instruction or the +/// \c TZCNT instruction. +/// +/// \param A +/// A 32-bit integer operand. +/// \returns A 32-bit integer containing the bit number. +/// \see __bsfd #define _bit_scan_forward(A) __bsfd((A)) + +/// Find the first set bit starting from the msb. Result is undefined if +/// input is 0. +/// +/// \headerfile +/// +/// \code +/// int _bit_scan_reverse(int A); +/// \endcode +/// +/// This intrinsic corresponds to the \c BSR instruction or the +/// \c LZCNT instruction and an \c XOR. +/// +/// \param A +/// A 32-bit integer operand. +/// \returns A 32-bit integer containing the bit number. +/// \see __bsrd #define _bit_scan_reverse(A) __bsrd((A)) #ifdef __x86_64__ @@ -134,13 +169,29 @@ __bsrq(long long __A) { /// \param __A /// A 64-bit integer operand. /// \returns A 64-bit integer containing the swapped bytes. +/// \see _bswap64 static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR __bswapq(long long __A) { return (long long)__builtin_bswap64((unsigned long long)__A); } +/// Swaps the bytes in the input. Converting little endian to big endian or +/// vice versa. +/// +/// \headerfile +/// +/// \code +/// long long _bswap64(long long A); +/// \endcode +/// +/// This intrinsic corresponds to the \c BSWAP instruction. +/// +/// \param A +/// A 64-bit integer operand. +/// \returns A 64-bit integer containing the swapped bytes. +/// \see __bswapq #define _bswap64(A) __bswapq((A)) -#endif +#endif /* __x86_64__ */ /// Counts the number of bits in the source operand having a value of 1. /// @@ -153,12 +204,29 @@ __bswapq(long long __A) { /// An unsigned 32-bit integer operand. /// \returns A 32-bit integer containing the number of bits with value 1 in the /// source operand. +/// \see _popcnt32 static __inline__ int __DEFAULT_FN_ATTRS_CONSTEXPR __popcntd(unsigned int __A) { return __builtin_popcount(__A); } +/// Counts the number of bits in the source operand having a value of 1. +/// +/// \headerfile +/// +/// \code +/// int _popcnt32(int A); +/// \endcode +/// +/// This intrinsic corresponds to the \c POPCNT instruction or a +/// a sequence of arithmetic and logic ops to calculate it. +/// +/// \param A +/// An unsigned 32-bit integer operand. 
+/// \returns A 32-bit integer containing the number of bits with value 1 in the +/// source operand. +/// \see __popcntd #define _popcnt32(A) __popcntd((A)) #ifdef __x86_64__ @@ -173,12 +241,29 @@ __popcntd(unsigned int __A) /// An unsigned 64-bit integer operand. /// \returns A 64-bit integer containing the number of bits with value 1 in the /// source operand. +/// \see _popcnt64 static __inline__ long long __DEFAULT_FN_ATTRS_CONSTEXPR __popcntq(unsigned long long __A) { return __builtin_popcountll(__A); } +/// Counts the number of bits in the source operand having a value of 1. +/// +/// \headerfile +/// +/// \code +/// long long _popcnt64(unsigned long long A); +/// \endcode +/// +/// This intrinsic corresponds to the \c POPCNT instruction or a +/// a sequence of arithmetic and logic ops to calculate it. +/// +/// \param A +/// An unsigned 64-bit integer operand. +/// \returns A 64-bit integer containing the number of bits with value 1 in the +/// source operand. +/// \see __popcntq #define _popcnt64(A) __popcntq((A)) #endif /* __x86_64__ */ @@ -396,6 +481,7 @@ __crc32q(unsigned long long __C, unsigned long long __D) /// \param __A /// The performance counter to read. /// \returns The 64-bit value read from the performance counter. +/// \see _rdpmc static __inline__ unsigned long long __DEFAULT_FN_ATTRS __rdpmc(int __A) { return __builtin_ia32_rdpmc(__A); @@ -416,8 +502,35 @@ __rdtscp(unsigned int *__A) { return __builtin_ia32_rdtscp(__A); } +/// Reads the processor's time stamp counter. +/// +/// \headerfile +/// +/// \code +/// unsigned long long _rdtsc(); +/// \endcode +/// +/// This intrinsic corresponds to the \c RDTSC instruction. +/// +/// \returns The 64-bit value of the time stamp counter. #define _rdtsc() __rdtsc() +/// Reads the specified performance monitoring counter. Refer to your +/// processor's documentation to determine which performance counters are +/// supported. +/// +/// \headerfile +/// +/// \code +/// unsigned long long _rdpmc(int A); +/// \endcode +/// +/// This intrinsic corresponds to the \c RDPMC instruction. +/// +/// \param A +/// The performance counter to read. +/// \returns The 64-bit value read from the performance counter. +/// \see __rdpmc #define _rdpmc(A) __rdpmc(A) static __inline__ void __DEFAULT_FN_ATTRS @@ -474,6 +587,7 @@ __rorb(unsigned char __X, int __C) { /// \param __C /// The number of bits to rotate the value. /// \returns The rotated value. +/// \see _rotwl static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR __rolw(unsigned short __X, int __C) { return __builtin_rotateleft16(__X, __C); @@ -492,6 +606,7 @@ __rolw(unsigned short __X, int __C) { /// \param __C /// The number of bits to rotate the value. /// \returns The rotated value. +/// \see _rotwr static __inline__ unsigned short __DEFAULT_FN_ATTRS_CONSTEXPR __rorw(unsigned short __X, int __C) { return __builtin_rotateright16(__X, __C); @@ -510,6 +625,7 @@ __rorw(unsigned short __X, int __C) { /// \param __C /// The number of bits to rotate the value. /// \returns The rotated value. +/// \see _rotl static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR __rold(unsigned int __X, int __C) { return __builtin_rotateleft32(__X, (unsigned int)__C); @@ -528,6 +644,7 @@ __rold(unsigned int __X, int __C) { /// \param __C /// The number of bits to rotate the value. /// \returns The rotated value. 
+/// \see _rotr static __inline__ unsigned int __DEFAULT_FN_ATTRS_CONSTEXPR __rord(unsigned int __X, int __C) { return __builtin_rotateright32(__X, (unsigned int)__C); @@ -575,18 +692,167 @@ __rorq(unsigned long long __X, int __C) { /* These are already provided as builtins for MSVC. */ /* Select the correct function based on the size of long. */ #ifdef __LP64__ +/// Rotates a 64-bit value to the left by the specified number of bits. +/// This operation is undefined if the number of bits exceeds the size of +/// the value. +/// +/// \headerfile +/// +/// \code +/// unsigned long long _lrotl(unsigned long long a, int b); +/// \endcode +/// +/// This intrinsic corresponds to the \c ROL instruction. +/// +/// \param a +/// The unsigned 64-bit value to be rotated. +/// \param b +/// The number of bits to rotate the value. +/// \returns The rotated value. +/// \see __rolq #define _lrotl(a,b) __rolq((a), (b)) + +/// Rotates a 64-bit value to the right by the specified number of bits. +/// This operation is undefined if the number of bits exceeds the size of +/// the value. +/// +/// \headerfile +/// +/// \code +/// unsigned long long _lrotr(unsigned long long a, int b); +/// \endcode +/// +/// This intrinsic corresponds to the \c ROR instruction. +/// +/// \param a +/// The unsigned 64-bit value to be rotated. +/// \param b +/// The number of bits to rotate the value. +/// \returns The rotated value. +/// \see __rorq #define _lrotr(a,b) __rorq((a), (b)) -#else +#else // __LP64__ +/// Rotates a 32-bit value to the left by the specified number of bits. +/// This operation is undefined if the number of bits exceeds the size of +/// the value. +/// +/// \headerfile +/// +/// \code +/// unsigned int _lrotl(unsigned int a, int b); +/// \endcode +/// +/// This intrinsic corresponds to the \c ROL instruction. +/// +/// \param a +/// The unsigned 32-bit value to be rotated. +/// \param b +/// The number of bits to rotate the value. +/// \returns The rotated value. +/// \see __rold #define _lrotl(a,b) __rold((a), (b)) + +/// Rotates a 32-bit value to the right by the specified number of bits. +/// This operation is undefined if the number of bits exceeds the size of +/// the value. +/// +/// \headerfile +/// +/// \code +/// unsigned int _lrotr(unsigned int a, int b); +/// \endcode +/// +/// This intrinsic corresponds to the \c ROR instruction. +/// +/// \param a +/// The unsigned 32-bit value to be rotated. +/// \param b +/// The number of bits to rotate the value. +/// \returns The rotated value. +/// \see __rord #define _lrotr(a,b) __rord((a), (b)) -#endif +#endif // __LP64__ + +/// Rotates a 32-bit value to the left by the specified number of bits. +/// This operation is undefined if the number of bits exceeds the size of +/// the value. +/// +/// \headerfile +/// +/// \code +/// unsigned int _rotl(unsigned int a, int b); +/// \endcode +/// +/// This intrinsic corresponds to the \c ROL instruction. +/// +/// \param a +/// The unsigned 32-bit value to be rotated. +/// \param b +/// The number of bits to rotate the value. +/// \returns The rotated value. +/// \see __rold #define _rotl(a,b) __rold((a), (b)) + +/// Rotates a 32-bit value to the right by the specified number of bits. +/// This operation is undefined if the number of bits exceeds the size of +/// the value. +/// +/// \headerfile +/// +/// \code +/// unsigned int _rotr(unsigned int a, int b); +/// \endcode +/// +/// This intrinsic corresponds to the \c ROR instruction. +/// +/// \param a +/// The unsigned 32-bit value to be rotated. 
+/// \param b
+///    The number of bits to rotate the value.
+/// \returns The rotated value.
+/// \see __rord
 #define _rotr(a,b) __rord((a), (b))
 #endif // _MSC_VER

 /* These are not builtins so need to be provided in all modes. */
+/// Rotates a 16-bit value to the left by the specified number of bits.
+/// This operation is undefined if the number of bits exceeds the size of
+/// the value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned short _rotwl(unsigned short a, int b);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c ROL instruction.
+///
+/// \param a
+///    The unsigned 16-bit value to be rotated.
+/// \param b
+///    The number of bits to rotate the value.
+/// \returns The rotated value.
+/// \see __rolw
 #define _rotwl(a,b) __rolw((a), (b))
+
+/// Rotates a 16-bit value to the right by the specified number of bits.
+/// This operation is undefined if the number of bits exceeds the size of
+/// the value.
+///
+/// \headerfile <x86intrin.h>
+///
+/// \code
+/// unsigned short _rotwr(unsigned short a, int b);
+/// \endcode
+///
+/// This intrinsic corresponds to the \c ROR instruction.
+///
+/// \param a
+///    The unsigned 16-bit value to be rotated.
+/// \param b
+///    The number of bits to rotate the value.
+/// \returns The rotated value.
+/// \see __rorw
 #define _rotwr(a,b) __rorw((a), (b))

 #undef __DEFAULT_FN_ATTRS

From 8c1b7fba1fbe9729d6258127b633bd05339e766b Mon Sep 17 00:00:00 2001
From: Jeremy Morse
Date: Mon, 22 Jan 2024 15:39:35 +0000
Subject: [PATCH 411/843] [SelectionDAG][DebugInfo][RemoveDIs] Handle entry
 value variables in DPValues too (#78726)

This patch abstracts visitEntryValueDbgValue to deal with the substance
of variable locations (Value, Var, Expr, DebugLoc) rather than how
they're stored. That allows us to call it from handleDebugValue, which
is similarly abstracted. This allows the entry-value behaviour (see the
test) to be supported with non-instruction debug-info too!

---
 .../SelectionDAG/SelectionDAGBuilder.cpp      | 23 ++++++++++---------
 .../SelectionDAG/SelectionDAGBuilder.h        |  4 +++-
 .../CodeGen/AArch64/dbg-value-swift-async.ll  |  4 ++++
 3 files changed, 19 insertions(+), 12 deletions(-)
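The value of the new signature is that visitEntryValueDbgValue no longer
cares how a variable location is stored. As a sketch (not part of the
patch; names follow the DPValue API this series targets), a
non-instruction caller could forward the same four components that a
dbg.value intrinsic would have carried:

    SmallVector<Value *, 4> Vals(DPV.location_ops());
    handleDebugValue(Vals, DPV.getVariable(), DPV.getExpression(),
                     DPV.getDebugLoc(), SDNodeOrder, DPV.isVariadic());

handleDebugValue then filters entry-value locations out early, as the
first hunk below shows.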
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 44c78a006656e..a7dd23f78993f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1549,6 +1549,11 @@ bool SelectionDAGBuilder::handleDebugValue(ArrayRef<const Value *> Values,
                                            unsigned Order, bool IsVariadic) {
   if (Values.empty())
     return true;
+
+  // Filter EntryValue locations out early.
+  if (visitEntryValueDbgValue(Values, Var, Expr, DbgLoc))
+    return true;
+
   SmallVector<SDDbgOperand> LocationOps;
   SmallVector<SDNode *> Dependencies;
   for (const Value *V : Values) {
@@ -6023,14 +6028,14 @@ static const CallBase *FindPreallocatedCall(const Value *PreallocatedSetup) {
 /// If DI is a debug value with an EntryValue expression, lower it using the
 /// corresponding physical register of the associated Argument value
 /// (guaranteed to exist by the verifier).
-bool SelectionDAGBuilder::visitEntryValueDbgValue(const DbgValueInst &DI) {
-  DILocalVariable *Variable = DI.getVariable();
-  DIExpression *Expr = DI.getExpression();
-  if (!Expr->isEntryValue() || !hasSingleElement(DI.getValues()))
+bool SelectionDAGBuilder::visitEntryValueDbgValue(
+    ArrayRef<const Value *> Values, DILocalVariable *Variable,
+    DIExpression *Expr, DebugLoc DbgLoc) {
+  if (!Expr->isEntryValue() || !hasSingleElement(Values))
     return false;

   // These properties are guaranteed by the verifier.
-  Argument *Arg = cast<Argument>(DI.getValue(0));
+  const Argument *Arg = cast<Argument>(Values[0]);
   assert(Arg->hasAttribute(Attribute::AttrKind::SwiftAsync));

   auto ArgIt = FuncInfo.ValueMap.find(Arg);
@@ -6044,9 +6049,8 @@ bool SelectionDAGBuilder::visitEntryValueDbgValue(const DbgValueInst &DI) {

   for (auto [PhysReg, VirtReg] : FuncInfo.RegInfo->liveins())
     if (ArgVReg == VirtReg || ArgVReg == PhysReg) {
-      SDDbgValue *SDV =
-          DAG.getVRegDbgValue(Variable, Expr, PhysReg, false /*IsIndidrect*/,
-                              DI.getDebugLoc(), SDNodeOrder);
+      SDDbgValue *SDV = DAG.getVRegDbgValue(
+          Variable, Expr, PhysReg, false /*IsIndidrect*/, DbgLoc, SDNodeOrder);
       DAG.AddDbgValue(SDV, false /*treat as dbg.declare byval parameter*/);
       return true;
     }
@@ -6338,9 +6342,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
     DIExpression *Expression = DI.getExpression();
     dropDanglingDebugInfo(Variable, Expression);

-    if (visitEntryValueDbgValue(DI))
-      return;
-
     if (DI.isKillLocation()) {
       handleKillDebugValue(Variable, Expression, DI.getDebugLoc(), SDNodeOrder);
       return;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 6dcb8c816ad08..40e2f791f59e3 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -612,7 +612,9 @@ class SelectionDAGBuilder {
   void visitInlineAsm(const CallBase &Call,
                       const BasicBlock *EHPadBB = nullptr);

-  bool visitEntryValueDbgValue(const DbgValueInst &I);
+  bool visitEntryValueDbgValue(ArrayRef<const Value *> Values,
+                               DILocalVariable *Variable, DIExpression *Expr,
+                               DebugLoc DbgLoc);
   void visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
   void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
   void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
diff --git a/llvm/test/CodeGen/AArch64/dbg-value-swift-async.ll b/llvm/test/CodeGen/AArch64/dbg-value-swift-async.ll
index e65f48825e8e3..4649fad5b2c2a 100644
--- a/llvm/test/CodeGen/AArch64/dbg-value-swift-async.ll
+++ b/llvm/test/CodeGen/AArch64/dbg-value-swift-async.ll
@@ -2,6 +2,10 @@
 ; RUN: llc --mtriple="aarch64-" -O0 -fast-isel=false -global-isel=false -stop-after=finalize-isel %s -o - | FileCheck %s --check-prefix=AARCH
 ; RUN: llc --mtriple="aarch64-" -O0 -fast-isel -stop-after=finalize-isel %s -o - | FileCheck %s --check-prefix=AARCH

+; RUN: llc --mtriple="aarch64-" -O0 -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - --try-experimental-debuginfo-iterators | FileCheck %s --check-prefix=AARCH
+; RUN: llc --mtriple="aarch64-" -O0 -fast-isel=false -global-isel=false -stop-after=finalize-isel %s -o - --try-experimental-debuginfo-iterators | FileCheck %s --check-prefix=AARCH
+; RUN: llc --mtriple="aarch64-" -O0 -fast-isel -stop-after=finalize-isel %s -o - --try-experimental-debuginfo-iterators | FileCheck %s --check-prefix=AARCH
+
 ; AARCH-NOT: DBG_VALUE
 ; AARCH: DBG_VALUE $x22, $noreg, !{{.*}}, !DIExpression(DW_OP_LLVM_entry_value, 1)
 ; AARCH-NEXT: DBG_VALUE $x22, $noreg, !{{.*}},
!DIExpression(DW_OP_LLVM_entry_value, 1) From 4e64ed97804ab4146d9cc24db16a624a265aa690 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 22 Jan 2024 13:33:10 +0000 Subject: [PATCH 412/843] [X86] Update X86::getConstantFromPool to take base OperandNo instead of Displacement MachineOperand This allows us to check the entire constant address calculation, and ensure we're not performing any runtime address math into the constant pool (noticed in an upcoming patch). --- .../Target/X86/X86FixupVectorConstants.cpp | 5 +-- llvm/lib/Target/X86/X86InstrInfo.cpp | 14 ++++-- llvm/lib/Target/X86/X86InstrInfo.h | 3 +- llvm/lib/Target/X86/X86MCInstLower.cpp | 44 ++++--------------- 4 files changed, 22 insertions(+), 44 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 5ae41dcb645a6..9da8d7338ea36 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -240,8 +240,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) && "Unexpected number of operands!"); - MachineOperand &CstOp = MI.getOperand(OperandNo + X86::AddrDisp); - if (auto *C = X86::getConstantFromPool(MI, CstOp)) { + if (auto *C = X86::getConstantFromPool(MI, OperandNo)) { // Attempt to detect a suitable splat from increasing splat widths. std::pair Broadcasts[] = { {8, OpBcst8}, {16, OpBcst16}, {32, OpBcst32}, @@ -255,7 +254,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, unsigned NewCPI = CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8)); MI.setDesc(TII->get(OpBcst)); - CstOp.setIndex(NewCPI); + MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI); return true; } } diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 36022ef35118f..6082a569cd49a 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3515,13 +3515,21 @@ int X86::getFirstAddrOperandIdx(const MachineInstr &MI) { } const Constant *X86::getConstantFromPool(const MachineInstr &MI, - const MachineOperand &Op) { - if (!Op.isCPI() || Op.getOffset() != 0) + unsigned OpNo) { + assert(MI.getNumOperands() >= (OpNo + X86::AddrNumOperands) && + "Unexpected number of operands!"); + + const MachineOperand &Index = MI.getOperand(OpNo + X86::AddrIndexReg); + if (!Index.isReg() || Index.getReg() != X86::NoRegister) + return nullptr; + + const MachineOperand &Disp = MI.getOperand(OpNo + X86::AddrDisp); + if (!Disp.isCPI() || Disp.getOffset() != 0) return nullptr; ArrayRef Constants = MI.getParent()->getParent()->getConstantPool()->getConstants(); - const MachineConstantPoolEntry &ConstantEntry = Constants[Op.getIndex()]; + const MachineConstantPoolEntry &ConstantEntry = Constants[Disp.getIndex()]; // Bail if this is a machine constant pool entry, we won't be able to dig out // anything useful. diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h index 407bb87267408..939bc7daf1c7d 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.h +++ b/llvm/lib/Target/X86/X86InstrInfo.h @@ -87,8 +87,7 @@ bool isX87Instruction(MachineInstr &MI); int getFirstAddrOperandIdx(const MachineInstr &MI); /// Find any constant pool entry associated with a specific instruction operand. 
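// Illustrative use (a sketch, not part of this patch): callers pass the
// index of the first address operand and let the helper validate the whole
// address, i.e. that there is no index register and that the displacement
// is a plain constant-pool reference with zero offset. This assumes
// getFirstAddrOperandIdx (declared just above) returns a negative value
// when MI has no memory operand.
//
//   int MemIdx = X86::getFirstAddrOperandIdx(MI);
//   if (MemIdx >= 0)
//     if (const Constant *C = X86::getConstantFromPool(MI, MemIdx)) {
//       // MI reads `C` directly from the constant pool.
//     }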
-const Constant *getConstantFromPool(const MachineInstr &MI, - const MachineOperand &Op); +const Constant *getConstantFromPool(const MachineInstr &MI, unsigned OpNo); } // namespace X86 diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index c92712df3f767..1c251e7972d56 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1539,16 +1539,12 @@ static void printConstant(const Constant *COp, unsigned BitWidth, static void printZeroUpperMove(const MachineInstr *MI, MCStreamer &OutStreamer, int SclWidth, int VecWidth, const char *ShuffleComment) { - assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && - "Unexpected number of operands!"); - std::string Comment; raw_string_ostream CS(Comment); const MachineOperand &DstOp = MI->getOperand(0); CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; - if (auto *C = - X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { + if (auto *C = X86::getConstantFromPool(*MI, 1)) { int CstEltSize = C->getType()->getScalarSizeInBits(); if (SclWidth == CstEltSize) { if (auto *CI = dyn_cast(C)) { @@ -1584,10 +1580,7 @@ static void printZeroUpperMove(const MachineInstr *MI, MCStreamer &OutStreamer, static void printLaneBroadcast(const MachineInstr *MI, MCStreamer &OutStreamer, int NumLanes, int BitWidth) { - assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && - "Unexpected number of operands!"); - if (auto *C = - X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { + if (auto *C = X86::getConstantFromPool(*MI, 1)) { int CstEltSize = C->getType()->getScalarSizeInBits(); std::string Comment; @@ -1637,10 +1630,7 @@ static void printLaneBroadcast(const MachineInstr *MI, MCStreamer &OutStreamer, static void printElementBroadcast(const MachineInstr *MI, MCStreamer &OutStreamer, int NumElts, int EltBits) { - assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) && - "Unexpected number of operands!"); - if (auto *C = - X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { + if (auto *C = X86::getConstantFromPool(*MI, 1)) { std::string Comment; raw_string_ostream CS(Comment); const MachineOperand &DstOp = MI->getOperand(0); @@ -1771,13 +1761,8 @@ static void addConstantComments(const MachineInstr *MI, ++SrcIdx; } } - unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp; - - assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) && - "Unexpected number of operands!"); - const MachineOperand &MaskOp = MI->getOperand(MaskIdx); - if (auto *C = X86::getConstantFromPool(*MI, MaskOp)) { + if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodePSHUFBMask(C, Width, Mask); @@ -1849,13 +1834,8 @@ static void addConstantComments(const MachineInstr *MI, ++SrcIdx; } } - unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp; - - assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) && - "Unexpected number of operands!"); - const MachineOperand &MaskOp = MI->getOperand(MaskIdx); - if (auto *C = X86::getConstantFromPool(*MI, MaskOp)) { + if (auto *C = X86::getConstantFromPool(*MI, SrcIdx + 1)) { unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodeVPERMILPMask(C, ElSize, Width, Mask); @@ -1883,8 +1863,7 @@ static void addConstantComments(const MachineInstr *MI, case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break; } - const MachineOperand &MaskOp = MI->getOperand(3 + 
X86::AddrDisp); - if (auto *C = X86::getConstantFromPool(*MI, MaskOp)) { + if (auto *C = X86::getConstantFromPool(*MI, 3)) { unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask); @@ -1895,11 +1874,7 @@ static void addConstantComments(const MachineInstr *MI, } case X86::VPPERMrrm: { - assert(MI->getNumOperands() >= (3 + X86::AddrNumOperands) && - "Unexpected number of operands!"); - - const MachineOperand &MaskOp = MI->getOperand(3 + X86::AddrDisp); - if (auto *C = X86::getConstantFromPool(*MI, MaskOp)) { + if (auto *C = X86::getConstantFromPool(*MI, 3)) { unsigned Width = getRegisterWidth(MI->getDesc().operands()[0]); SmallVector Mask; DecodeVPPERMMask(C, Width, Mask); @@ -1910,10 +1885,7 @@ static void addConstantComments(const MachineInstr *MI, } case X86::MMX_MOVQ64rm: { - assert(MI->getNumOperands() == (1 + X86::AddrNumOperands) && - "Unexpected number of operands!"); - if (auto *C = - X86::getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) { + if (auto *C = X86::getConstantFromPool(*MI, 1)) { std::string Comment; raw_string_ostream CS(Comment); const MachineOperand &DstOp = MI->getOperand(0); From 74ab7958bdda4d10b082129151f434176f99320e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 22 Jan 2024 15:17:39 +0000 Subject: [PATCH 413/843] [X86] printZeroUpperMove - add support for constant vectors. Allows cases where movss/movsd etc. are loading constant (ConstantDataSequential) sub-vectors, ensuring we pad with the correct number of zero upper elements by making repeated printConstant calls to print zeroes in a matching int/fp format. --- llvm/lib/Target/X86/X86MCInstLower.cpp | 61 ++++++++---------- .../test/CodeGen/X86/combine-concatvectors.ll | 2 +- llvm/test/CodeGen/X86/nontemporal-4.ll | 64 +++++++++---------- llvm/test/CodeGen/X86/pr13577.ll | 2 +- llvm/test/CodeGen/X86/ret-mmx.ll | 2 +- 5 files changed, 61 insertions(+), 70 deletions(-) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 1c251e7972d56..5b261522de329 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1484,50 +1484,55 @@ static std::string getShuffleComment(const MachineInstr *MI, unsigned SrcOp1Idx, return Comment; } -static void printConstant(const APInt &Val, raw_ostream &CS) { +static void printConstant(const APInt &Val, raw_ostream &CS, + bool PrintZero = false) { if (Val.getBitWidth() <= 64) { - CS << Val.getZExtValue(); + CS << (PrintZero ? 0ULL : Val.getZExtValue()); } else { // print multi-word constant as (w0,w1) CS << "("; for (int i = 0, N = Val.getNumWords(); i < N; ++i) { if (i > 0) CS << ","; - CS << Val.getRawData()[i]; + CS << (PrintZero ? 0ULL : Val.getRawData()[i]); } CS << ")"; } } -static void printConstant(const APFloat &Flt, raw_ostream &CS) { +static void printConstant(const APFloat &Flt, raw_ostream &CS, + bool PrintZero = false) { SmallString<32> Str; // Force scientific notation to distinguish from integers. 
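// The resulting assembly comments (illustration; see the test updates
// further below): a scalar load that implicitly zeroes the upper vector
// lanes now prints every element, padding with zeroes in the same notation
// as the scalar itself, e.g.
//
//   movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
//
// The padding comes from repeated printConstant calls with PrintZero set,
// so integer elements pad with "0" and floating-point elements with
// "0.0E+0".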
- Flt.toString(Str, 0, 0); + if (PrintZero) + APFloat::getZero(Flt.getSemantics()).toString(Str, 0, 0); + else + Flt.toString(Str, 0, 0); CS << Str; } static void printConstant(const Constant *COp, unsigned BitWidth, - raw_ostream &CS) { + raw_ostream &CS, bool PrintZero = false) { if (isa(COp)) { CS << "u"; } else if (auto *CI = dyn_cast(COp)) { - printConstant(CI->getValue(), CS); + printConstant(CI->getValue(), CS, PrintZero); } else if (auto *CF = dyn_cast(COp)) { - printConstant(CF->getValueAPF(), CS); + printConstant(CF->getValueAPF(), CS, PrintZero); } else if (auto *CDS = dyn_cast(COp)) { Type *EltTy = CDS->getElementType(); bool IsInteger = EltTy->isIntegerTy(); bool IsFP = EltTy->isHalfTy() || EltTy->isFloatTy() || EltTy->isDoubleTy(); unsigned EltBits = EltTy->getPrimitiveSizeInBits(); unsigned E = std::min(BitWidth / EltBits, CDS->getNumElements()); - assert((BitWidth % EltBits) == 0 && "Broadcast element size mismatch"); + assert((BitWidth % EltBits) == 0 && "Element size mismatch"); for (unsigned I = 0; I != E; ++I) { if (I != 0) CS << ","; if (IsInteger) - printConstant(CDS->getElementAsAPInt(I), CS); + printConstant(CDS->getElementAsAPInt(I), CS, PrintZero); else if (IsFP) - printConstant(CDS->getElementAsAPFloat(I), CS); + printConstant(CDS->getElementAsAPFloat(I), CS, PrintZero); else CS << "?"; } @@ -1545,31 +1550,17 @@ static void printZeroUpperMove(const MachineInstr *MI, MCStreamer &OutStreamer, CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; if (auto *C = X86::getConstantFromPool(*MI, 1)) { - int CstEltSize = C->getType()->getScalarSizeInBits(); - if (SclWidth == CstEltSize) { - if (auto *CI = dyn_cast(C)) { - CS << "["; - printConstant(CI->getValue(), CS); - for (int I = 1, E = VecWidth / SclWidth; I < E; ++I) { - CS << ",0"; - } - CS << "]"; - OutStreamer.AddComment(CS.str()); - return; // early-out - } - - if (auto *CF = dyn_cast(C)) { - CS << "["; - printConstant(CF->getValue(), CS); - APFloat ZeroFP = APFloat::getZero(CF->getValue().getSemantics()); - for (int I = 1, E = VecWidth / SclWidth; I < E; ++I) { - CS << ","; - printConstant(ZeroFP, CS); - } - CS << "]"; - OutStreamer.AddComment(CS.str()); - return; // early-out + if (isa(C) || isa(C) || + isa(C)) { + CS << "["; + printConstant(C, SclWidth, CS); + for (int I = 1, E = VecWidth / SclWidth; I < E; ++I) { + CS << ","; + printConstant(C, SclWidth, CS, true); } + CS << "]"; + OutStreamer.AddComment(CS.str()); + return; // early-out } } diff --git a/llvm/test/CodeGen/X86/combine-concatvectors.ll b/llvm/test/CodeGen/X86/combine-concatvectors.ll index 8a0e39c57b787..17d22607cfea8 100644 --- a/llvm/test/CodeGen/X86/combine-concatvectors.ll +++ b/llvm/test/CodeGen/X86/combine-concatvectors.ll @@ -50,7 +50,7 @@ define void @concat_of_broadcast_v2f64_v4f64() { ; AVX1-NEXT: movq %rcx, 46348(%rax) ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm0 = [1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216,1065353216] ; AVX1-NEXT: vmovups %ymm0, 48296(%rax) -; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [7.812501848093234E-3,0.0E+0] ; AVX1-NEXT: vmovsd %xmm0, 47372(%rax) ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/nontemporal-4.ll b/llvm/test/CodeGen/X86/nontemporal-4.ll index 6a8df2445690a..c1eff891a9487 100644 --- a/llvm/test/CodeGen/X86/nontemporal-4.ll +++ b/llvm/test/CodeGen/X86/nontemporal-4.ll @@ -36,7 +36,7 @@ define void @test_constant_v4f32_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd 
{{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [2.0000004731118679E+0,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: retq ; @@ -124,7 +124,7 @@ define void @test_constant_v4i32_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [2.1219957909652723E-314,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: retq ; @@ -168,7 +168,7 @@ define void @test_constant_v8i16_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [4.1720559249406128E-309,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: retq ; @@ -212,7 +212,7 @@ define void @test_constant_v16i8_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [7.9499288951273625E-275,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: retq ; @@ -278,11 +278,11 @@ define void @test_constant_v8f32_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-5.1200036668777466E+2,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: retq ; @@ -346,7 +346,7 @@ define void @test_constant_v4i64_align1(ptr %dst) nounwind { ; SSE4A-NEXT: movntiq %rax, (%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: retq ; @@ -406,11 +406,11 @@ define void @test_constant_v8i32_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: retq ; @@ -470,11 +470,11 @@ define void @test_constant_v16i16_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: retq ; @@ -534,11 +534,11 @@ define void @test_constant_v32i8_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; 
SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-3.826728214441238E+279,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-1.6485712323024388E+202,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: retq ; @@ -803,19 +803,19 @@ define void @test_constant_v16f32_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-0.0E+0,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-5.1200036668777466E+2,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-1.3107209417724609E+5,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-2.0971535092773438E+6,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; @@ -911,15 +911,15 @@ define void @test_constant_v8i64_align1(ptr %dst) nounwind { ; SSE4A-NEXT: movntiq %rax, (%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; @@ -1011,19 +1011,19 @@ define void @test_constant_v16i32_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; @@ -1115,19 +1115,19 @@ define void @test_constant_v32i16_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: 
movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [NaN,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-1.6853227412070812E+308,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-1.2358925997317751E+308,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; @@ -1219,19 +1219,19 @@ define void @test_constant_v64i8_align1(ptr %dst) nounwind { ; SSE4A: # %bb.0: ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 8(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-3.826728214441238E+279,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, (%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 24(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-1.6485712323024388E+202,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 16(%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 40(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-7.1020783099933495E+124,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 32(%rdi) ; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; SSE4A-NEXT: movntsd %xmm0, 56(%rdi) -; SSE4A-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE4A-NEXT: movsd {{.*#+}} xmm0 = [-3.0595730451167367E+47,0.0E+0] ; SSE4A-NEXT: movntsd %xmm0, 48(%rdi) ; SSE4A-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pr13577.ll b/llvm/test/CodeGen/X86/pr13577.ll index 7511560d85f51..3b8a05ef30f81 100644 --- a/llvm/test/CodeGen/X86/pr13577.ll +++ b/llvm/test/CodeGen/X86/pr13577.ll @@ -29,7 +29,7 @@ declare x86_fp80 @copysignl(x86_fp80, x86_fp80) nounwind readnone define float @pr26070() { ; CHECK-LABEL: pr26070: ; CHECK: ## %bb.0: -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: retq %c = call float @copysignf(float 1.0, float undef) readnone ret float %c diff --git a/llvm/test/CodeGen/X86/ret-mmx.ll b/llvm/test/CodeGen/X86/ret-mmx.ll index f0d4d5bdb6cee..815f95e64496b 100644 --- a/llvm/test/CodeGen/X86/ret-mmx.ll +++ b/llvm/test/CodeGen/X86/ret-mmx.ll @@ -40,7 +40,7 @@ define <2 x i32> @t3() nounwind { define double @t4() nounwind { ; CHECK-LABEL: t4: ; CHECK: ## %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1,0,0,0] ; CHECK-NEXT: retq ret double bitcast (<2 x i32> to double) } From c5532124dc0ffc892ded0b093e92244977e748f8 Mon Sep 17 00:00:00 2001 From: Daniel Grumberg Date: Mon, 22 Jan 2024 15:41:29 +0000 Subject: [PATCH 414/843] [clang][ExtractAPI] Ensure typedef to pointer types are preserved (#78584) When generating declaration fragments for types that use typedefs to pointer types ensure that we keep the user-defined typedef form instead of desugaring the typedef. 
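For illustration (taken from the new test added below): given

    typedef struct Bar *BarPtr;

    void foo(BarPtr value);

the parameter of `foo` is now serialized as a `typeIdentifier` fragment
spelling `BarPtr` with its own USR (`c:input.h@T@BarPtr`) instead of the
desugared `struct Bar *`, and a pointer to the typedef (`BarPtr *` in
`baz`) likewise keeps the typedef spelling.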
rdar://102137655 --- clang/lib/ExtractAPI/DeclarationFragments.cpp | 80 ++--- clang/test/ExtractAPI/typedef.c | 280 ++++++++++++++++++ 2 files changed, 320 insertions(+), 40 deletions(-) diff --git a/clang/lib/ExtractAPI/DeclarationFragments.cpp b/clang/lib/ExtractAPI/DeclarationFragments.cpp index 044ccf5dea095..56c1f5bf5eab0 100644 --- a/clang/lib/ExtractAPI/DeclarationFragments.cpp +++ b/clang/lib/ExtractAPI/DeclarationFragments.cpp @@ -252,6 +252,46 @@ DeclarationFragments DeclarationFragmentsBuilder::getFragmentsForType( DeclarationFragments Fragments; + // An ElaboratedType is a sugar for types that are referred to using an + // elaborated keyword, e.g., `struct S`, `enum E`, or (in C++) via a + // qualified name, e.g., `N::M::type`, or both. + if (const ElaboratedType *ET = dyn_cast(T)) { + ElaboratedTypeKeyword Keyword = ET->getKeyword(); + if (Keyword != ElaboratedTypeKeyword::None) { + Fragments + .append(ElaboratedType::getKeywordName(Keyword), + DeclarationFragments::FragmentKind::Keyword) + .appendSpace(); + } + + if (const NestedNameSpecifier *NNS = ET->getQualifier()) + Fragments.append(getFragmentsForNNS(NNS, Context, After)); + + // After handling the elaborated keyword or qualified name, build + // declaration fragments for the desugared underlying type. + return Fragments.append(getFragmentsForType(ET->desugar(), Context, After)); + } + + // If the type is a typedefed type, get the underlying TypedefNameDecl for a + // direct reference to the typedef instead of the wrapped type. + + // 'id' type is a typedef for an ObjCObjectPointerType + // we treat it as a typedef + if (const TypedefType *TypedefTy = dyn_cast(T)) { + const TypedefNameDecl *Decl = TypedefTy->getDecl(); + TypedefUnderlyingTypeResolver TypedefResolver(Context); + std::string USR = TypedefResolver.getUSRForType(QualType(T, 0)); + + if (T->isObjCIdType()) { + return Fragments.append(Decl->getName(), + DeclarationFragments::FragmentKind::Keyword); + } + + return Fragments.append( + Decl->getName(), DeclarationFragments::FragmentKind::TypeIdentifier, + USR, TypedefResolver.getUnderlyingTypeDecl(QualType(T, 0))); + } + // Declaration fragments of a pointer type is the declaration fragments of // the pointee type followed by a `*`, if (T->isPointerType() && !T->isFunctionPointerType()) @@ -328,46 +368,6 @@ DeclarationFragments DeclarationFragmentsBuilder::getFragmentsForType( getFragmentsForType(AT->getElementType(), Context, After)); } - // An ElaboratedType is a sugar for types that are referred to using an - // elaborated keyword, e.g., `struct S`, `enum E`, or (in C++) via a - // qualified name, e.g., `N::M::type`, or both. - if (const ElaboratedType *ET = dyn_cast(T)) { - ElaboratedTypeKeyword Keyword = ET->getKeyword(); - if (Keyword != ElaboratedTypeKeyword::None) { - Fragments - .append(ElaboratedType::getKeywordName(Keyword), - DeclarationFragments::FragmentKind::Keyword) - .appendSpace(); - } - - if (const NestedNameSpecifier *NNS = ET->getQualifier()) - Fragments.append(getFragmentsForNNS(NNS, Context, After)); - - // After handling the elaborated keyword or qualified name, build - // declaration fragments for the desugared underlying type. - return Fragments.append(getFragmentsForType(ET->desugar(), Context, After)); - } - - // If the type is a typedefed type, get the underlying TypedefNameDecl for a - // direct reference to the typedef instead of the wrapped type. 
- - // 'id' type is a typedef for an ObjCObjectPointerType - // we treat it as a typedef - if (const TypedefType *TypedefTy = dyn_cast(T)) { - const TypedefNameDecl *Decl = TypedefTy->getDecl(); - TypedefUnderlyingTypeResolver TypedefResolver(Context); - std::string USR = TypedefResolver.getUSRForType(QualType(T, 0)); - - if (T->isObjCIdType()) { - return Fragments.append(Decl->getName(), - DeclarationFragments::FragmentKind::Keyword); - } - - return Fragments.append( - Decl->getName(), DeclarationFragments::FragmentKind::TypeIdentifier, - USR, TypedefResolver.getUnderlyingTypeDecl(QualType(T, 0))); - } - // Everything we care about has been handled now, reduce to the canonical // unqualified base type. QualType Base = T->getCanonicalTypeUnqualified(); diff --git a/clang/test/ExtractAPI/typedef.c b/clang/test/ExtractAPI/typedef.c index 7f30b4bc9bb0c..c30e65549f2b7 100644 --- a/clang/test/ExtractAPI/typedef.c +++ b/clang/test/ExtractAPI/typedef.c @@ -16,6 +16,12 @@ //--- input.h typedef int MyInt; +typedef struct Bar *BarPtr; + +void foo(BarPtr value); + +void baz(BarPtr *value); + //--- reference.output.json.in { "metadata": { @@ -43,6 +49,208 @@ typedef int MyInt; }, "relationships": [], "symbols": [ + { + "accessLevel": "public", + "declarationFragments": [ + { + "kind": "typeIdentifier", + "preciseIdentifier": "c:v", + "spelling": "void" + }, + { + "kind": "text", + "spelling": " " + }, + { + "kind": "identifier", + "spelling": "foo" + }, + { + "kind": "text", + "spelling": "(" + }, + { + "kind": "typeIdentifier", + "preciseIdentifier": "c:input.h@T@BarPtr", + "spelling": "BarPtr" + }, + { + "kind": "text", + "spelling": " " + }, + { + "kind": "internalParam", + "spelling": "value" + }, + { + "kind": "text", + "spelling": ");" + } + ], + "functionSignature": { + "parameters": [ + { + "declarationFragments": [ + { + "kind": "typeIdentifier", + "preciseIdentifier": "c:input.h@T@BarPtr", + "spelling": "BarPtr" + }, + { + "kind": "text", + "spelling": " " + }, + { + "kind": "internalParam", + "spelling": "value" + } + ], + "name": "value" + } + ], + "returns": [ + { + "kind": "typeIdentifier", + "preciseIdentifier": "c:v", + "spelling": "void" + } + ] + }, + "identifier": { + "interfaceLanguage": "objective-c", + "precise": "c:@F@foo" + }, + "kind": { + "displayName": "Function", + "identifier": "objective-c.func" + }, + "location": { + "position": { + "character": 5, + "line": 4 + }, + "uri": "file://INPUT_DIR/input.h" + }, + "names": { + "navigator": [ + { + "kind": "identifier", + "spelling": "foo" + } + ], + "subHeading": [ + { + "kind": "identifier", + "spelling": "foo" + } + ], + "title": "foo" + }, + "pathComponents": [ + "foo" + ] + }, + { + "accessLevel": "public", + "declarationFragments": [ + { + "kind": "typeIdentifier", + "preciseIdentifier": "c:v", + "spelling": "void" + }, + { + "kind": "text", + "spelling": " " + }, + { + "kind": "identifier", + "spelling": "baz" + }, + { + "kind": "text", + "spelling": "(" + }, + { + "kind": "typeIdentifier", + "preciseIdentifier": "c:input.h@T@BarPtr", + "spelling": "BarPtr" + }, + { + "kind": "text", + "spelling": " * " + }, + { + "kind": "internalParam", + "spelling": "value" + }, + { + "kind": "text", + "spelling": ");" + } + ], + "functionSignature": { + "parameters": [ + { + "declarationFragments": [ + { + "kind": "typeIdentifier", + "preciseIdentifier": "c:input.h@T@BarPtr", + "spelling": "BarPtr" + }, + { + "kind": "text", + "spelling": " * " + }, + { + "kind": "internalParam", + "spelling": "value" + } + ], + "name": "value" + } + 
], + "returns": [ + { + "kind": "typeIdentifier", + "preciseIdentifier": "c:v", + "spelling": "void" + } + ] + }, + "identifier": { + "interfaceLanguage": "objective-c", + "precise": "c:@F@baz" + }, + "kind": { + "displayName": "Function", + "identifier": "objective-c.func" + }, + "location": { + "position": { + "character": 5, + "line": 6 + }, + "uri": "file://INPUT_DIR/input.h" + }, + "names": { + "navigator": [ + { + "kind": "identifier", + "spelling": "baz" + } + ], + "subHeading": [ + { + "kind": "identifier", + "spelling": "baz" + } + ], + "title": "baz" + }, + "pathComponents": [ + "baz" + ] + }, { "accessLevel": "public", "declarationFragments": [ @@ -106,6 +314,78 @@ typedef int MyInt; "MyInt" ], "type": "c:I" + }, + { + "accessLevel": "public", + "declarationFragments": [ + { + "kind": "keyword", + "spelling": "typedef" + }, + { + "kind": "text", + "spelling": " " + }, + { + "kind": "keyword", + "spelling": "struct" + }, + { + "kind": "text", + "spelling": " " + }, + { + "kind": "typeIdentifier", + "preciseIdentifier": "c:@S@Bar", + "spelling": "Bar" + }, + { + "kind": "text", + "spelling": " * " + }, + { + "kind": "identifier", + "spelling": "BarPtr" + }, + { + "kind": "text", + "spelling": ";" + } + ], + "identifier": { + "interfaceLanguage": "objective-c", + "precise": "c:input.h@T@BarPtr" + }, + "kind": { + "displayName": "Type Alias", + "identifier": "objective-c.typealias" + }, + "location": { + "position": { + "character": 20, + "line": 2 + }, + "uri": "file://INPUT_DIR/input.h" + }, + "names": { + "navigator": [ + { + "kind": "identifier", + "spelling": "BarPtr" + } + ], + "subHeading": [ + { + "kind": "identifier", + "spelling": "BarPtr" + } + ], + "title": "BarPtr" + }, + "pathComponents": [ + "BarPtr" + ], + "type": "c:*$@S@Bar" } ] } From bfb09326be2281bdbf4c1f3cea532fc17a28ad15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Sat, 20 Jan 2024 16:12:46 +0100 Subject: [PATCH 415/843] [JITLink][AArch32] Implement ELF relocation R_ARM_TARGET1 Prepare a configuration switch and default to R_ARM_ABS32 --- llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h | 2 ++ llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp | 8 ++++++-- .../JITLink/AArch32/ELF_relocations_data.s | 7 +++++++ llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp | 8 +++++--- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h index 0968a093279bf..938d25dc15bf2 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h @@ -135,6 +135,8 @@ enum class StubsFlavor { struct ArmConfig { bool J1J2BranchEncoding = false; StubsFlavor Stubs = StubsFlavor::Unsupported; + // In the long term, we might want a linker switch like --target1-rel + bool Target1Rel = false; }; /// Obtain the sub-arch configuration for a given Arm CPU model. diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp index 2553cd70a5769..ed7d58da2dc11 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp @@ -31,7 +31,8 @@ namespace llvm { namespace jitlink { /// Translate from ELF relocation type to JITLink-internal edge kind. 
-Expected getJITLinkEdgeKind(uint32_t ELFType) { +Expected +getJITLinkEdgeKind(uint32_t ELFType, const aarch32::ArmConfig &ArmCfg) { switch (ELFType) { case ELF::R_ARM_ABS32: return aarch32::Data_Pointer32; @@ -47,6 +48,9 @@ Expected getJITLinkEdgeKind(uint32_t ELFType) { return aarch32::Arm_MovwAbsNC; case ELF::R_ARM_MOVT_ABS: return aarch32::Arm_MovtAbs; + case ELF::R_ARM_TARGET1: + return (ArmCfg.Target1Rel) ? aarch32::Data_Delta32 + : aarch32::Data_Pointer32; case ELF::R_ARM_THM_CALL: return aarch32::Thumb_Call; case ELF::R_ARM_THM_JUMP24: @@ -171,7 +175,7 @@ class ELFLinkGraphBuilder_aarch32 inconvertibleErrorCode()); uint32_t Type = Rel.getType(false); - Expected Kind = getJITLinkEdgeKind(Type); + Expected Kind = getJITLinkEdgeKind(Type, ArmCfg); if (!Kind) return Kind.takeError(); diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s index f91a4733c40ee..de25a82aff16b 100644 --- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s @@ -29,6 +29,13 @@ rel32: .word target - . .size rel32, .-rel32 +# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_TARGET1 target +# jitlink-check: *{4}(target1_abs32) = target + .global target1_abs32 +target1_abs32: + .word target(target1) + .size target1_abs32, .-target1_abs32 + # CHECK-TYPE: {{[0-9a-f]+}} R_ARM_GOT_PREL target # # The GOT entry contains the absolute address of the external: diff --git a/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp b/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp index 26c773b8dc3a7..a08884e1c7f7f 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp @@ -39,7 +39,8 @@ struct MutableWord { namespace llvm { namespace jitlink { -Expected getJITLinkEdgeKind(uint32_t ELFType); +Expected +getJITLinkEdgeKind(uint32_t ELFType, const aarch32::ArmConfig &Cfg); Expected getELFRelocationType(Edge::Kind Kind); } // namespace jitlink @@ -47,7 +48,8 @@ Expected getELFRelocationType(Edge::Kind Kind); TEST(AArch32_ELF, EdgeKinds) { // Fails: Invalid ELF type -> JITLink kind - Expected ErrKind = getJITLinkEdgeKind(ELF::R_ARM_NONE); + aarch32::ArmConfig Cfg; + Expected ErrKind = getJITLinkEdgeKind(ELF::R_ARM_NONE, Cfg); EXPECT_TRUE(errorToBool(ErrKind.takeError())); // Fails: Invalid JITLink kind -> ELF type @@ -59,7 +61,7 @@ TEST(AArch32_ELF, EdgeKinds) { EXPECT_FALSE(errorToBool(ELFType.takeError())) << "Failed to translate JITLink kind -> ELF type"; - Expected JITLinkKind = getJITLinkEdgeKind(*ELFType); + Expected JITLinkKind = getJITLinkEdgeKind(*ELFType, Cfg); EXPECT_FALSE(errorToBool(JITLinkKind.takeError())) << "Failed to translate ELF type -> JITLink kind"; From 565470ed27131b2550e71cf334bf3f33f9d82c62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Sat, 20 Jan 2024 18:06:14 +0100 Subject: [PATCH 416/843] [JITLink][AArch32] Implement ELF relocation R_ARM_NONE --- .../llvm/ExecutionEngine/JITLink/aarch32.h | 12 +++++++--- .../ExecutionEngine/JITLink/ELF_aarch32.cpp | 4 ++++ llvm/lib/ExecutionEngine/JITLink/aarch32.cpp | 1 + .../JITLink/AArch32/ELF_relocations_data.s | 24 +++++++++++++++++++ .../ExecutionEngine/JITLink/AArch32Tests.cpp | 2 +- 5 files changed, 39 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h index 938d25dc15bf2..081f77a85e182 100644 --- 
a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h @@ -101,7 +101,11 @@ enum EdgeKind_aarch32 : Edge::Kind { Thumb_MovtPrel, LastThumbRelocation = Thumb_MovtPrel, - LastRelocation = LastThumbRelocation, + + /// No-op relocation + None, + + LastRelocation = None, }; /// Flags enum for AArch32-specific symbol properties @@ -293,7 +297,8 @@ inline Expected readAddend(LinkGraph &G, Block &B, if (Kind <= LastThumbRelocation) return readAddendThumb(G, B, Offset, Kind, ArmCfg); - llvm_unreachable("Relocation must be of class Data, Arm or Thumb"); + assert(Kind == None && "Not associated with a relocation class"); + return 0; } /// Helper function to apply the fixup for Data-class relocations. @@ -320,7 +325,8 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, if (Kind <= LastThumbRelocation) return applyFixupThumb(G, B, E, ArmCfg); - llvm_unreachable("Relocation must be of class Data, Arm or Thumb"); + assert(Kind == None && "Not associated with a relocation class"); + return Error::success(); } /// Populate a Global Offset Table from edges that request it. diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp index ed7d58da2dc11..15c209e1ebe5b 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp @@ -48,6 +48,8 @@ getJITLinkEdgeKind(uint32_t ELFType, const aarch32::ArmConfig &ArmCfg) { return aarch32::Arm_MovwAbsNC; case ELF::R_ARM_MOVT_ABS: return aarch32::Arm_MovtAbs; + case ELF::R_ARM_NONE: + return aarch32::None; case ELF::R_ARM_TARGET1: return (ArmCfg.Target1Rel) ? aarch32::Data_Delta32 : aarch32::Data_Pointer32; @@ -99,6 +101,8 @@ Expected getELFRelocationType(Edge::Kind Kind) { return ELF::R_ARM_THM_MOVW_PREL_NC; case aarch32::Thumb_MovtPrel: return ELF::R_ARM_THM_MOVT_PREL; + case aarch32::None: + return ELF::R_ARM_NONE; } return make_error(formatv("Invalid aarch32 edge {0:d}: ", diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp index 111527a39e06e..1797a0068cd7c 100644 --- a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp @@ -766,6 +766,7 @@ const char *getEdgeKindName(Edge::Kind K) { KIND_NAME_CASE(Thumb_MovtAbs) KIND_NAME_CASE(Thumb_MovwPrelNC) KIND_NAME_CASE(Thumb_MovtPrel) + KIND_NAME_CASE(None) default: return getGenericEdgeKindName(K); } diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s index de25a82aff16b..7bd59f8a52de6 100644 --- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s @@ -64,6 +64,30 @@ got_prel_offset: .size got_prel_offset, .-got_prel_offset .size got_prel, .-got_prel +# EH personality routine +# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_NONE __aeabi_unwind_cpp_pr0 + .globl __aeabi_unwind_cpp_pr0 + .type __aeabi_unwind_cpp_pr0,%function + .align 2 +__aeabi_unwind_cpp_pr0: + bx lr + +# Generate reference to EH personality (right now we ignore the resulting +# R_ARM_PREL31 relocation since it's in .ARM.exidx) + .globl prel31 + .type prel31,%function + .align 2 +prel31: + .fnstart + .save {r11, lr} + push {r11, lr} + .setfp r11, sp + mov r11, sp + pop {r11, lr} + mov pc, lr + .size prel31,.-prel31 + .fnend + # This test is executable with any 4-byte external target: # > echo "unsigned target = 42;" | 
clang -target armv7-linux-gnueabihf -o target.o -c -xc -
# > llvm-jitlink target.o armv7/out.o

diff --git a/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp b/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp
index a08884e1c7f7f..b1890d884d173 100644
--- a/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp
+++ b/llvm/unittests/ExecutionEngine/JITLink/AArch32Tests.cpp
@@ -49,7 +49,7 @@ Expected getELFRelocationType(Edge::Kind Kind);
 TEST(AArch32_ELF, EdgeKinds) {
   // Fails: Invalid ELF type -> JITLink kind
   aarch32::ArmConfig Cfg;
-  Expected ErrKind = getJITLinkEdgeKind(ELF::R_ARM_NONE, Cfg);
+  Expected ErrKind = getJITLinkEdgeKind(ELF::R_ARM_ME_TOO, Cfg);
   EXPECT_TRUE(errorToBool(ErrKind.takeError()));

   // Fails: Invalid JITLink kind -> ELF type

From 160ddf71147637b4b806dde2a0c686496aae5f14 Mon Sep 17 00:00:00 2001
From: Andrzej Warzynski
Date: Mon, 22 Jan 2024 15:50:37 +0000
Subject: [PATCH 417/843] [mlir] Remove duplicate test

The removed test is identical to the one directly above.
---
 mlir/test/Dialect/Vector/invalid.mlir | 7 -------
 1 file changed, 7 deletions(-)

diff --git a/mlir/test/Dialect/Vector/invalid.mlir b/mlir/test/Dialect/Vector/invalid.mlir
index 9c73f5a8f74a8..5fa8ac245ce97 100644
--- a/mlir/test/Dialect/Vector/invalid.mlir
+++ b/mlir/test/Dialect/Vector/invalid.mlir
@@ -666,13 +666,6 @@ func.func @extract_strided_slice(%arg0: vector<4x8x16xf32>) {

 // -----

-func.func @extract_strided_slice(%arg0: vector<4x8x16xf32>) {
-  // expected-error@+1 {{expected offsets attribute of rank no greater than vector rank}}
-  %1 = vector.extract_strided_slice %arg0 {offsets = [2, 2, 2, 2], sizes = [2, 2, 2, 2], strides = [1, 1, 1, 1]} : vector<4x8x16xf32> to vector<2x2x16xf32>
-}
-
-// -----
-
 func.func @extract_strided_slice(%arg0: vector<4x8x16xf32>) {
   // expected-error@+1 {{op expected offsets dimension 0 to be confined to [0, 4)}}
   %1 = vector.extract_strided_slice %arg0 {offsets = [100], sizes = [100], strides = [100]} : vector<4x8x16xf32> to vector<100x8x16xf32>

From e302950023cd99251371c5dc8a1e3b609dd5a8fe Mon Sep 17 00:00:00 2001
From: David Spickett
Date: Mon, 22 Jan 2024 15:55:05 +0000
Subject: [PATCH 418/843] [flang] Handle -S assemble only flag in flang-to-external-fc (#78979)

Flang was recently updated on Compiler Explorer, and by default it is in
assemble-only mode; you have to enable linking and executing explicitly.

This means that the default output for flang-to-external-fc is nothing,
as it doesn't know what `-S` means. You'd have to know to enable the
link-to-binary option to see any output.

Handle `-S` so that users of Compiler Explorer don't have to wonder why
the "compiler" is broken.
--- flang/tools/f18/flang-to-external-fc.in | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/flang/tools/f18/flang-to-external-fc.in b/flang/tools/f18/flang-to-external-fc.in index 24ceda3b7812c..bd16c030121cb 100755 --- a/flang/tools/f18/flang-to-external-fc.in +++ b/flang/tools/f18/flang-to-external-fc.in @@ -108,6 +108,10 @@ parse_args() COMPILE_ONLY="True" fi + if [[ $1 == "-S" ]]; then + COMPILE_ONLY="True" + fi + if [[ $1 == "-E" ]]; then PREPROCESS_ONLY="True" fi From 27eb8d53ae44e2f5a6259744446ea389afaf68a2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Mon, 22 Jan 2024 15:46:24 +0000 Subject: [PATCH 419/843] [X86] printConstant - add ConstantVector handling --- llvm/lib/Target/X86/X86MCInstLower.cpp | 28 ++++++++++++++---------- llvm/test/CodeGen/X86/widen_shuffle-1.ll | 2 +- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 5b261522de329..8bf099a165816 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1536,6 +1536,15 @@ static void printConstant(const Constant *COp, unsigned BitWidth, else CS << "?"; } + } else if (auto *CV = dyn_cast(COp)) { + unsigned EltBits = CV->getType()->getScalarSizeInBits(); + unsigned E = std::min(BitWidth / EltBits, CV->getNumOperands()); + assert((BitWidth % EltBits) == 0 && "Element size mismatch"); + for (unsigned I = 0; I != E; ++I) { + if (I != 0) + CS << ","; + printConstant(CV->getOperand(I), EltBits, CS, PrintZero); + } } else { CS << "?"; } @@ -1550,18 +1559,15 @@ static void printZeroUpperMove(const MachineInstr *MI, MCStreamer &OutStreamer, CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = "; if (auto *C = X86::getConstantFromPool(*MI, 1)) { - if (isa(C) || isa(C) || - isa(C)) { - CS << "["; - printConstant(C, SclWidth, CS); - for (int I = 1, E = VecWidth / SclWidth; I < E; ++I) { - CS << ","; - printConstant(C, SclWidth, CS, true); - } - CS << "]"; - OutStreamer.AddComment(CS.str()); - return; // early-out + CS << "["; + printConstant(C, SclWidth, CS); + for (int I = 1, E = VecWidth / SclWidth; I < E; ++I) { + CS << ","; + printConstant(C, SclWidth, CS, true); } + CS << "]"; + OutStreamer.AddComment(CS.str()); + return; // early-out } // We didn't find a constant load, fallback to a shuffle mask decode. diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll index f55bc06bc3c45..925b96f5e346c 100644 --- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll +++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll @@ -105,7 +105,7 @@ define void @shuf5(ptr %p) nounwind { ; X86-LABEL: shuf5: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: movsd {{.*#+}} xmm0 = [33,33,33,33,33,33,33,33,0,0,0,0,0,0,0,0] ; X86-NEXT: movsd %xmm0, (%eax) ; X86-NEXT: retl ; From 06c3c3b67cb0287856145806cfb0179def3214bd Mon Sep 17 00:00:00 2001 From: Piotr Zegar Date: Mon, 22 Jan 2024 17:11:02 +0100 Subject: [PATCH 420/843] [clang-tidy] Add bugprone-chained-comparison check (#76365) Check that flags chained comparison expressions, such as a < b < c or a == b == c, which may have unintended behavior due to implicit operator associativity. Moved from Phabricator (D144429). 
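To illustrate the pitfall this check targets, here is a minimal
standalone C++ example (not part of this patch; the values mirror the
ones used in the check's documentation):

  #include <cstdio>

  int main() {
    int a = 2, b = 6, c = 4;
    // Relational operators associate left-to-right, so this parses as
    // (a < b) < c, i.e. 1 < 4, which is true.
    std::printf("%d\n", a < b < c);       // prints 1
    // The comparison the developer almost certainly intended:
    std::printf("%d\n", a < b && b < c);  // prints 0, since b < c is false
    return 0;
  }
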
--- .../bugprone/BugproneTidyModule.cpp | 3 + .../clang-tidy/bugprone/CMakeLists.txt | 1 + .../bugprone/ChainedComparisonCheck.cpp | 150 ++++++++++++++++++ .../bugprone/ChainedComparisonCheck.h | 34 ++++ clang-tools-extra/docs/ReleaseNotes.rst | 6 + .../checks/bugprone/chained-comparison.rst | 73 +++++++++ .../docs/clang-tidy/checks/list.rst | 1 + .../checkers/bugprone/chained-comparison.c | 76 +++++++++ .../checkers/bugprone/chained-comparison.cpp | 91 +++++++++++ 9 files changed, 435 insertions(+) create mode 100644 clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp create mode 100644 clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h create mode 100644 clang-tools-extra/docs/clang-tidy/checks/bugprone/chained-comparison.rst create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/chained-comparison.c create mode 100644 clang-tools-extra/test/clang-tidy/checkers/bugprone/chained-comparison.cpp diff --git a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp index 435cb1e3fbcff..a8a23b045f80b 100644 --- a/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/BugproneTidyModule.cpp @@ -17,6 +17,7 @@ #include "BoolPointerImplicitConversionCheck.h" #include "BranchCloneCheck.h" #include "CastingThroughVoidCheck.h" +#include "ChainedComparisonCheck.h" #include "ComparePointerToMemberVirtualFunctionCheck.h" #include "CopyConstructorInitCheck.h" #include "DanglingHandleCheck.h" @@ -108,6 +109,8 @@ class BugproneModule : public ClangTidyModule { CheckFactories.registerCheck("bugprone-branch-clone"); CheckFactories.registerCheck( "bugprone-casting-through-void"); + CheckFactories.registerCheck( + "bugprone-chained-comparison"); CheckFactories.registerCheck( "bugprone-compare-pointer-to-member-virtual-function"); CheckFactories.registerCheck( diff --git a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt index 70e7fbc7ec0c1..1cd6fb207d762 100644 --- a/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt +++ b/clang-tools-extra/clang-tidy/bugprone/CMakeLists.txt @@ -12,6 +12,7 @@ add_clang_library(clangTidyBugproneModule BranchCloneCheck.cpp BugproneTidyModule.cpp CastingThroughVoidCheck.cpp + ChainedComparisonCheck.cpp ComparePointerToMemberVirtualFunctionCheck.cpp CopyConstructorInitCheck.cpp DanglingHandleCheck.cpp diff --git a/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp new file mode 100644 index 0000000000000..c6bb5bdf5ce91 --- /dev/null +++ b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.cpp @@ -0,0 +1,150 @@ +//===--- ChainedComparisonCheck.cpp - clang-tidy --------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ChainedComparisonCheck.h" +#include "clang/AST/ASTContext.h" +#include "clang/ASTMatchers/ASTMatchFinder.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include + +using namespace clang::ast_matchers; + +namespace clang::tidy::bugprone { +static bool isExprAComparisonOperator(const Expr *E) { + if (const auto *Op = dyn_cast_or_null(E->IgnoreImplicit())) + return Op->isComparisonOp(); + if (const auto *Op = + dyn_cast_or_null(E->IgnoreImplicit())) + return Op->isComparisonOp(); + return false; +} + +namespace { +AST_MATCHER(BinaryOperator, + hasBinaryOperatorAChildComparisonOperatorWithoutParen) { + return isExprAComparisonOperator(Node.getLHS()) || + isExprAComparisonOperator(Node.getRHS()); +} + +AST_MATCHER(CXXOperatorCallExpr, + hasCppOperatorAChildComparisonOperatorWithoutParen) { + return std::any_of(Node.arg_begin(), Node.arg_end(), + isExprAComparisonOperator); +} + +struct ChainedComparisonData { + llvm::SmallString<256U> Name; + llvm::SmallVector Operands; + + explicit ChainedComparisonData(const Expr *Op) { extract(Op); } + +private: + void add(const Expr *Operand); + void add(llvm::StringRef Opcode); + void extract(const Expr *Op); + void extract(const BinaryOperator *Op); + void extract(const CXXOperatorCallExpr *Op); +}; + +void ChainedComparisonData::add(const Expr *Operand) { + if (!Name.empty()) + Name += ' '; + Name += 'v'; + Name += std::to_string(Operands.size()); + Operands.push_back(Operand); +} + +void ChainedComparisonData::add(llvm::StringRef Opcode) { + Name += ' '; + Name += Opcode; +} + +void ChainedComparisonData::extract(const BinaryOperator *Op) { + const Expr *LHS = Op->getLHS()->IgnoreImplicit(); + if (isExprAComparisonOperator(LHS)) + extract(LHS); + else + add(LHS); + + add(Op->getOpcodeStr()); + + const Expr *RHS = Op->getRHS()->IgnoreImplicit(); + if (isExprAComparisonOperator(RHS)) + extract(RHS); + else + add(RHS); +} + +void ChainedComparisonData::extract(const CXXOperatorCallExpr *Op) { + const Expr *FirstArg = Op->getArg(0U)->IgnoreImplicit(); + if (isExprAComparisonOperator(FirstArg)) + extract(FirstArg); + else + add(FirstArg); + + add(getOperatorSpelling(Op->getOperator())); + + const Expr *SecondArg = Op->getArg(1U)->IgnoreImplicit(); + if (isExprAComparisonOperator(SecondArg)) + extract(SecondArg); + else + add(SecondArg); +} + +void ChainedComparisonData::extract(const Expr *Op) { + if (!Op) + return; + + if (const auto *BinaryOp = dyn_cast(Op)) { + extract(BinaryOp); + return; + } + + if (const auto *OverloadedOp = dyn_cast(Op)) { + if (OverloadedOp->getNumArgs() == 2U) + extract(OverloadedOp); + } +} + +} // namespace + +void ChainedComparisonCheck::registerMatchers(MatchFinder *Finder) { + const auto OperatorMatcher = expr(anyOf( + binaryOperator(isComparisonOperator(), + hasBinaryOperatorAChildComparisonOperatorWithoutParen()), + cxxOperatorCallExpr( + isComparisonOperator(), + hasCppOperatorAChildComparisonOperatorWithoutParen()))); + + Finder->addMatcher( + expr(OperatorMatcher, unless(hasParent(OperatorMatcher))).bind("op"), + this); +} + +void ChainedComparisonCheck::check(const MatchFinder::MatchResult &Result) { + const auto *MatchedOperator = Result.Nodes.getNodeAs("op"); + + ChainedComparisonData Data(MatchedOperator); + if (Data.Operands.empty()) + return; + + diag(MatchedOperator->getBeginLoc(), + "chained comparison '%0' may generate unintended 
results, use " + "parentheses to specify order of evaluation or a logical operator to " + "separate comparison expressions") + << llvm::StringRef(Data.Name).trim() << MatchedOperator->getSourceRange(); + + for (std::size_t Index = 0U; Index < Data.Operands.size(); ++Index) { + diag(Data.Operands[Index]->getBeginLoc(), "operand 'v%0' is here", + DiagnosticIDs::Note) + << Index << Data.Operands[Index]->getSourceRange(); + } +} + +} // namespace clang::tidy::bugprone diff --git a/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h new file mode 100644 index 0000000000000..a914149a42e69 --- /dev/null +++ b/clang-tools-extra/clang-tidy/bugprone/ChainedComparisonCheck.h @@ -0,0 +1,34 @@ +//===--- ChainedComparisonCheck.h - clang-tidy ------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_CHAINEDCOMPARISONCHECK_H +#define LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_CHAINEDCOMPARISONCHECK_H + +#include "../ClangTidyCheck.h" + +namespace clang::tidy::bugprone { + +/// Check detects chained comparison operators that can lead to unintended +/// behavior or logical errors. +/// +/// For the user-facing documentation see: +/// http://clang.llvm.org/extra/clang-tidy/checks/bugprone/chained-comparison.html +class ChainedComparisonCheck : public ClangTidyCheck { +public: + ChainedComparisonCheck(StringRef Name, ClangTidyContext *Context) + : ClangTidyCheck(Name, Context) {} + void registerMatchers(ast_matchers::MatchFinder *Finder) override; + void check(const ast_matchers::MatchFinder::MatchResult &Result) override; + std::optional getCheckTraversalKind() const override { + return TK_IgnoreUnlessSpelledInSource; + } +}; + +} // namespace clang::tidy::bugprone + +#endif // LLVM_CLANG_TOOLS_EXTRA_CLANG_TIDY_BUGPRONE_CHAINEDCOMPARISONCHECK_H diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 7ab091e10231c..1015b6bd188d8 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -143,6 +143,12 @@ New checks Detects unsafe or redundant two-step casting operations involving ``void*``. +- New :doc:`bugprone-chained-comparison + ` check. + + Check detects chained comparison operators that can lead to unintended + behavior or logical errors. + - New :doc:`bugprone-compare-pointer-to-member-virtual-function ` check. diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/chained-comparison.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/chained-comparison.rst new file mode 100644 index 0000000000000..45b069a8e29de --- /dev/null +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/chained-comparison.rst @@ -0,0 +1,73 @@ +.. title:: clang-tidy - bugprone-chained-comparison + +bugprone-chained-comparison +=========================== + +Check detects chained comparison operators that can lead to unintended +behavior or logical errors. + +Chained comparisons are expressions that use multiple comparison operators +to compare three or more values. For example, the expression ``a < b < c`` +compares the values of ``a``, ``b``, and ``c``. 
However, this expression does
+not evaluate as ``(a < b) && (b < c)``, which is probably what the developer
+intended. Instead, it evaluates as ``(a < b) < c``, which may produce
+unintended results, especially when the types of ``a``, ``b``, and ``c`` are
+different.
+
+To avoid such errors, the check will issue a warning when a chained
+comparison operator is detected, suggesting parentheses to specify the
+order of evaluation or a logical operator to separate the comparison
+expressions.
+
+Consider the following examples:
+
+.. code-block:: c++
+
+    int a = 2, b = 6, c = 4;
+    if (a < b < c) {
+        // This block will be executed
+    }
+
+
+In this example, the developer intended to check if ``a`` is less than ``b``
+and ``b`` is less than ``c``. However, the expression ``a < b < c`` is
+equivalent to ``(a < b) < c``. Since ``a < b`` is ``true``, the expression
+``(a < b) < c`` is evaluated as ``1 < c``, which is equivalent to ``true < c``
+and therefore holds, even though ``b < c`` is ``false``.
+
+Even though the above issue could be detected as a comparison of ``int`` to
+``bool``, there is a more dangerous example:
+
+.. code-block:: c++
+
+    bool a = false, b = false, c = true;
+    if (a == b == c) {
+        // This block will be executed
+    }
+
+In this example, the developer intended to check if ``a``, ``b``, and ``c`` are
+all equal. However, the expression ``a == b == c`` is evaluated as
+``(a == b) == c``. Since ``a == b`` is ``true``, the expression ``(a == b) == c``
+is evaluated as ``true == c``, which is equivalent to ``true == true``.
+This comparison yields ``true``, even though ``a`` and ``b`` are ``false``
+and are not equal to ``c``.
+
+To avoid this issue, the developer can use a logical operator to separate the
+comparison expressions, like this:
+
+.. code-block:: c++
+
+    if (a == b && b == c) {
+        // This block will not be executed
+    }
+
+
+Alternatively, use of parentheses in the comparison expressions can make the
+developer's intention more explicit and help avoid misunderstanding.
+
+.. 
code-block:: c++ + + if ((a == b) == c) { + // This block will be executed + } + diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst index e972e376ebc5a..1fac3874b3c32 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/list.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst @@ -83,6 +83,7 @@ Clang-Tidy Checks :doc:`bugprone-bool-pointer-implicit-conversion `, "Yes" :doc:`bugprone-branch-clone `, :doc:`bugprone-casting-through-void `, + :doc:`bugprone-chained-comparison `, :doc:`bugprone-compare-pointer-to-member-virtual-function `, :doc:`bugprone-copy-constructor-init `, "Yes" :doc:`bugprone-dangling-handle `, diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/chained-comparison.c b/clang-tools-extra/test/clang-tidy/checkers/bugprone/chained-comparison.c new file mode 100644 index 0000000000000..53c6139dcfbc2 --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/chained-comparison.c @@ -0,0 +1,76 @@ +// RUN: %check_clang_tidy %s bugprone-chained-comparison %t + +void badly_chained_1(int x, int y, int z) +{ + int result = x < y < z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:18: warning: chained comparison 'v0 < v1 < v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_2(int x, int y, int z) +{ + int result = x <= y <= z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:18: warning: chained comparison 'v0 <= v1 <= v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_3(int x, int y, int z) +{ + int result = x > y > z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:18: warning: chained comparison 'v0 > v1 > v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_4(int x, int y, int z) +{ + int result = x >= y >= z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:18: warning: chained comparison 'v0 >= v1 >= v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_5(int x, int y, int z) +{ + int result = x == y != z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:18: warning: chained comparison 'v0 == v1 != v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_6(int x, int y, int z) +{ + int result = x != y == z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:18: warning: chained comparison 'v0 != v1 == v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_multiple(int a, int b, int c, int d, int e, int f, int g, int h) +{ + int result = a == b == c == d == e == f == g == h; +} +// CHECK-MESSAGES: :[[@LINE-2]]:18: warning: chained comparison 'v0 == v1 == v2 == v3 == v4 == v5 == v6 == v7' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_limit(int v[29]) +{ +// CHECK-MESSAGES: :[[@LINE+1]]:18: warning: 
chained comparison 'v0 == v1 == v2 == v3 == v4 == v5 == v6 == v7 == v8 == v9 == v10 == v11 == v12 == v13 == v14 == v15 == v16 == v17 == v18 == v19 == v20 == v21 == v22 == v23 == v24 == v25 == v26 == v27 == v28' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + int result = v[0] == v[1] == v[2] == v[3] == v[4] == v[5] == v[6] == v[7] == + v[8] == v[9] == v[10] == v[11] == v[12] == v[13] == v[14] == + v[15] == v[16] == v[17] == v[18] == v[19] == v[20] == v[21] == + v[22] == v[23] == v[24] == v[25] == v[26] == v[27] == v[28]; + +} + +void badly_chained_parens2(int x, int y, int z, int t, int a, int b) +{ + int result = (x < y) < (z && t) > (a == b); +} +// CHECK-MESSAGES: :[[@LINE-2]]:18: warning: chained comparison 'v0 < v1 > v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_inner(int x, int y, int z, int t, int u) +{ + int result = x && y < z < t && u; +} +// CHECK-MESSAGES: :[[@LINE-2]]:23: warning: chained comparison 'v0 < v1 < v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void properly_chained_1(int x, int y, int z) +{ + int result = x < y && y < z; +} + +void properly_chained_2(int x, int y, int z) +{ + int result = (x < y) == z; +} + diff --git a/clang-tools-extra/test/clang-tidy/checkers/bugprone/chained-comparison.cpp b/clang-tools-extra/test/clang-tidy/checkers/bugprone/chained-comparison.cpp new file mode 100644 index 0000000000000..8a664aa205acf --- /dev/null +++ b/clang-tools-extra/test/clang-tidy/checkers/bugprone/chained-comparison.cpp @@ -0,0 +1,91 @@ +// RUN: %check_clang_tidy -std=c++98-or-later %s bugprone-chained-comparison %t + +void badly_chained_1(int x, int y, int z) +{ + bool result = x < y < z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: warning: chained comparison 'v0 < v1 < v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_2(int x, int y, int z) +{ + bool result = x <= y <= z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: warning: chained comparison 'v0 <= v1 <= v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_3(int x, int y, int z) +{ + bool result = x > y > z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: warning: chained comparison 'v0 > v1 > v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_4(int x, int y, int z) +{ + bool result = x >= y >= z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: warning: chained comparison 'v0 >= v1 >= v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_5(int x, int y, int z) +{ + bool result = x == y != z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: warning: chained comparison 'v0 == v1 != v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison 
expressions [bugprone-chained-comparison] + +void badly_chained_6(bool x, bool y, bool z) +{ + bool result = x != y == z; +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: warning: chained comparison 'v0 != v1 == v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_multiple(bool a, bool b, bool c, bool d, bool e, bool f, bool g, bool h) +{ + bool result = a == b == c == d == e == f == g == h; +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: warning: chained comparison 'v0 == v1 == v2 == v3 == v4 == v5 == v6 == v7' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_limit(bool v[29]) +{ +// CHECK-MESSAGES: :[[@LINE+1]]:19: warning: chained comparison 'v0 == v1 == v2 == v3 == v4 == v5 == v6 == v7 == v8 == v9 == v10 == v11 == v12 == v13 == v14 == v15 == v16 == v17 == v18 == v19 == v20 == v21 == v22 == v23 == v24 == v25 == v26 == v27 == v28' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + bool result = v[0] == v[1] == v[2] == v[3] == v[4] == v[5] == v[6] == v[7] == + v[8] == v[9] == v[10] == v[11] == v[12] == v[13] == v[14] == + v[15] == v[16] == v[17] == v[18] == v[19] == v[20] == v[21] == + v[22] == v[23] == v[24] == v[25] == v[26] == v[27] == v[28]; + +} + +void badly_chained_parens2(int x, int y, int z, int t, int a, int b) +{ + bool result = (x < y) < (z && t) > (a == b); +} +// CHECK-MESSAGES: :[[@LINE-2]]:19: warning: chained comparison 'v0 < v1 > v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void badly_chained_inner(int x, int y, int z, int t, int u) +{ + bool result = x && y < z < t && u; +} +// CHECK-MESSAGES: :[[@LINE-2]]:24: warning: chained comparison 'v0 < v1 < v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +void properly_chained_1(int x, int y, int z) +{ + bool result = x < y && y < z; +} + +void properly_chained_2(int x, int y, bool z) +{ + bool result = (x < y) == z; +} + +struct Value { + bool operator<(const Value&) const; +}; + +bool operator==(bool, const Value&); + +bool badWithCppOperator(Value a, Value b, Value c) { + return a < b == c; +} +// CHECK-MESSAGES: :[[@LINE-2]]:12: warning: chained comparison 'v0 < v1 == v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] + +bool mixedBinaryAndCpp(Value a, Value b, bool c) { + return a < b == c; +} +// CHECK-MESSAGES: :[[@LINE-2]]:12: warning: chained comparison 'v0 < v1 == v2' may generate unintended results, use parentheses to specify order of evaluation or a logical operator to separate comparison expressions [bugprone-chained-comparison] From 2bb6d7b8a4d1e31d5c856174c163e77d7d3a0b94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Mon, 22 Jan 2024 17:04:07 +0100 Subject: [PATCH 421/843] [clang-repl] Limit use of PLT offset flag to linkers that support it Follow-up fix from https://github.com/llvm/llvm-project/pull/78959 --- clang/tools/clang-repl/CMakeLists.txt | 
9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt index b0aaf39f01154..2a0f617a2c0ff 100644 --- a/clang/tools/clang-repl/CMakeLists.txt +++ b/clang/tools/clang-repl/CMakeLists.txt @@ -24,6 +24,11 @@ if(CLANG_PLUGIN_SUPPORT) endif() string(TOUPPER ${CMAKE_SYSTEM_PROCESSOR} system_processor) -if(${system_processor} MATCHES "ARM") - target_link_options(clang-repl PRIVATE LINKER:--long-plt) +if(system_processor MATCHES "ARM") + set(FLAG_LONG_PLT "-Wl,--long-plt") + llvm_check_linker_flag(CXX ${FLAG_LONG_PLT} LINKER_HAS_FLAG_LONG_PLT) + # Linkers without this flag are assumed to have long PLTs by default + if(LINKER_HAS_FLAG_LONG_PLT) + target_link_options(clang-repl PRIVATE ${FLAG_LONG_PLT}) + endif() endif() From 4897b9888f11023bde363fb7dcebea440a0a13e9 Mon Sep 17 00:00:00 2001 From: Emma Pilkington Date: Mon, 22 Jan 2024 11:22:57 -0500 Subject: [PATCH 422/843] [AMDGPU] Make a few more tests default COV agnostic (#78926) --- lld/test/ELF/lto/amdgcn-oses.ll | 5 ++++- .../GlobalISel/irtranslator-function-args.ll | 3 +++ .../GlobalISel/legalize-addrspacecast.mir | 10 ++++----- .../AMDGPU/remove-no-kernel-id-attribute.ll | 21 +++++++++++-------- 4 files changed, 24 insertions(+), 15 deletions(-) diff --git a/lld/test/ELF/lto/amdgcn-oses.ll b/lld/test/ELF/lto/amdgcn-oses.ll index a2f25cdd57d87..0fd0ce4b94775 100644 --- a/lld/test/ELF/lto/amdgcn-oses.ll +++ b/lld/test/ELF/lto/amdgcn-oses.ll @@ -15,7 +15,7 @@ ; RUN: llvm-readobj --file-headers %t/mesa3d.so | FileCheck %s --check-prefixes=GCN,NON-AMDHSA,MESA3D ; AMDHSA: OS/ABI: AMDGPU_HSA (0x40) -; AMDHSA: ABIVersion: 2 +; AMDHSA: ABIVersion: 3 ; AMDPAL: OS/ABI: AMDGPU_PAL (0x41) ; MESA3D: OS/ABI: AMDGPU_MESA3D (0x42) @@ -27,6 +27,9 @@ target triple = "amdgcn-amd-amdhsa" target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5" +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 500} + define void @_start() { ret void } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll index 186c9dbcaa98c..2f0156d67bdfe 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/irtranslator-function-args.ll @@ -3234,3 +3234,6 @@ define void @void_func_v2p3_inreg(<2 x ptr addrspace(3)> inreg %arg0) #0 { } attributes #0 = { nounwind } + +!llvm.module.flags = !{!0} +!0 = !{i32 1, !"amdgpu_code_object_version", i32 400} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir index 6f2e74e5947e3..24c4d668885f4 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-addrspacecast.mir @@ -1,9 +1,9 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,SIVI %s -# RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,SIVI %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -O0 -run-pass=legalizer -o - %s | FileCheck 
-check-prefixes=GCN,GFX9 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -O0 -run-pass=legalizer -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=fiji -O0 -run-pass=legalizer --amdhsa-code-object-version=4 -o - %s | FileCheck -check-prefixes=GCN,SIVI %s +# RUN: llc -mtriple=amdgcn -mcpu=tahiti -O0 -run-pass=legalizer --amdhsa-code-object-version=4 -o - %s | FileCheck -check-prefixes=GCN,SIVI %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -O0 -run-pass=legalizer --amdhsa-code-object-version=4 -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -O0 -run-pass=legalizer --amdhsa-code-object-version=4 -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -O0 -run-pass=legalizer --amdhsa-code-object-version=4 -o - %s | FileCheck -check-prefixes=GCN,GFX9 %s --- name: test_addrspacecast_p0_to_p1 diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll index 80eb52726826b..2c1a6b43c847f 100644 --- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll +++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll @@ -105,7 +105,7 @@ define internal void @f0_transitive() { define amdgpu_kernel void @k0_f0() { ; CHECK-LABEL: define amdgpu_kernel void @k0_f0( -; CHECK-SAME: ) #[[ATTR2:[0-9]+]] !llvm.amdgcn.lds.kernel.id !1 { +; CHECK-SAME: ) #[[ATTR2:[0-9]+]] !llvm.amdgcn.lds.kernel.id !2 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k0_f0.lds) ] ; CHECK-NEXT: call void @f0_transitive() ; CHECK-NEXT: ret void @@ -116,8 +116,8 @@ define amdgpu_kernel void @k0_f0() { define amdgpu_kernel void @k1_f0() { ; CHECK-LABEL: define amdgpu_kernel void @k1_f0( -; CHECK-SAME: ) #[[ATTR3:[0-9]+]] !llvm.amdgcn.lds.kernel.id !2 { -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1_f0.lds) ], !alias.scope !3, !noalias !6 +; CHECK-SAME: ) #[[ATTR3:[0-9]+]] !llvm.amdgcn.lds.kernel.id !3 { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.k1_f0.lds) ], !alias.scope !4, !noalias !7 ; CHECK-NEXT: call void @f0_transitive() ; CHECK-NEXT: [[FPTR:%.*]] = load volatile ptr, ptr addrspace(1) null, align 8 ; CHECK-NEXT: call void [[FPTR]]() @@ -178,7 +178,7 @@ define internal void @mutual_recursion_1(i16 %arg) { define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK-LABEL: define amdgpu_kernel void @kernel_lds_recursion( -; CHECK-SAME: ) #[[ATTR2]] !llvm.amdgcn.lds.kernel.id !8 { +; CHECK-SAME: ) #[[ATTR2]] !llvm.amdgcn.lds.kernel.id !9 { ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kernel_lds_recursion.lds) ] ; CHECK-NEXT: call void @mutual_recursion_0(i16 0) ; CHECK-NEXT: ret void @@ -187,6 +187,9 @@ define amdgpu_kernel void @kernel_lds_recursion() { ret void } +!llvm.module.flags = !{!1} +!1 = !{i32 1, !"amdgpu_code_object_version", i32 400} + ;. 
; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } ; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" } @@ -199,11 +202,11 @@ define amdgpu_kernel void @kernel_lds_recursion() { ; CHECK: [[META0:![0-9]+]] = !{i32 0, i32 1} ; CHECK: [[META1:![0-9]+]] = !{i32 0} ; CHECK: [[META2:![0-9]+]] = !{i32 1} -; CHECK: [[META3:![0-9]+]] = !{!4} -; CHECK: [[META4:![0-9]+]] = distinct !{!4, !5} -; CHECK: [[META5:![0-9]+]] = distinct !{!5} -; CHECK: [[META6:![0-9]+]] = !{!7} -; CHECK: [[META7:![0-9]+]] = distinct !{!7, !5} +; CHECK: [[META3:![0-9]+]] = !{!5} +; CHECK: [[META4:![0-9]+]] = distinct !{!5, !6} +; CHECK: [[META5:![0-9]+]] = distinct !{!6} +; CHECK: [[META6:![0-9]+]] = !{!8} +; CHECK: [[META7:![0-9]+]] = distinct !{!8, !6} ; CHECK: [[META8:![0-9]+]] = !{i32 2} ;. ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: From ae99966a279601022d2b4d61dfbec349f7d65c12 Mon Sep 17 00:00:00 2001 From: carlobertolli Date: Mon, 22 Jan 2024 10:30:22 -0600 Subject: [PATCH 423/843] [OpenMP] Enable automatic unified shared memory on MI300A. (#77512) This patch enables applications that did not request OpenMP unified_shared_memory to run with the same zero-copy behavior, where mapped memory does not result in extra memory allocations and memory copies, but CPU-allocated memory is accessed from the device. The name for this behavior is "automatic zero-copy" and it relies on detecting: that the runtime is running on a MI300A, that the user did not select unified_shared_memory in their program, and that XNACK (unified memory support) is enabled in the current GPU configuration. If all these conditions are met, then automatic zero-copy is triggered. This patch also introduces an environment variable OMPX_APU_MAPS that, if set, triggers automatic zero-copy also on non APU GPUs (e.g., on discrete GPUs). This patch is still missing support for global variables, which will be provided in a subsequent patch. 
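As a rough illustration of the intended behavior, consider a minimal
sketch of the kind of program exercised by the new test (on an MI300A
with XNACK enabled, or on a discrete GPU with the OMPX_APU_MAPS=1
override, the map clause below performs no device allocation or copy
and the kernel accesses the host buffer directly):

  #include <cstdio>

  int main() {
    int n = 1024;
    int *a = new int[n];
  #pragma omp target teams distribute parallel for map(tofrom : a[ : n])
    for (int i = 0; i < n; i++)
      a[i] = i;
    std::printf("%d\n", a[n - 1]); // 1023, computed on the device
    delete[] a;
    return 0;
  }
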
Co-authored-by: Thorsten Blass --- .../libomptarget/include/Shared/PluginAPI.h | 3 + .../libomptarget/include/Shared/PluginAPI.inc | 1 + .../include/Shared/Requirements.h | 15 ++- openmp/libomptarget/include/device.h | 3 + .../amdgpu/dynamic_hsa/hsa_ext_amd.h | 1 + .../plugins-nextgen/amdgpu/src/rtl.cpp | 122 ++++++++++++++---- .../common/include/PluginInterface.h | 5 + .../common/src/PluginInterface.cpp | 10 ++ openmp/libomptarget/src/OpenMP/Mapping.cpp | 13 +- openmp/libomptarget/src/PluginManager.cpp | 11 ++ openmp/libomptarget/src/device.cpp | 6 + .../test/mapping/auto_zero_copy.cpp | 57 ++++++++ 12 files changed, 219 insertions(+), 28 deletions(-) create mode 100644 openmp/libomptarget/test/mapping/auto_zero_copy.cpp diff --git a/openmp/libomptarget/include/Shared/PluginAPI.h b/openmp/libomptarget/include/Shared/PluginAPI.h index c6aacf4ce2124..aece53d7ee1ca 100644 --- a/openmp/libomptarget/include/Shared/PluginAPI.h +++ b/openmp/libomptarget/include/Shared/PluginAPI.h @@ -219,6 +219,9 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize, void *VAddr, bool isRecord, bool SaveOutput, uint64_t &ReqPtrArgOffset); + +// Returns true if the device \p DeviceId suggests to use auto zero-copy. +int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId); } #endif // OMPTARGET_SHARED_PLUGIN_API_H diff --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc index 25ebe7d437f9d..b842c6eef1d4f 100644 --- a/openmp/libomptarget/include/Shared/PluginAPI.inc +++ b/openmp/libomptarget/include/Shared/PluginAPI.inc @@ -47,3 +47,4 @@ PLUGIN_API_HANDLE(data_notify_mapped, false); PLUGIN_API_HANDLE(data_notify_unmapped, false); PLUGIN_API_HANDLE(set_device_offset, false); PLUGIN_API_HANDLE(initialize_record_replay, false); +PLUGIN_API_HANDLE(use_auto_zero_copy, false); diff --git a/openmp/libomptarget/include/Shared/Requirements.h b/openmp/libomptarget/include/Shared/Requirements.h index 19d6b8ffca495..b16a1650f0c40 100644 --- a/openmp/libomptarget/include/Shared/Requirements.h +++ b/openmp/libomptarget/include/Shared/Requirements.h @@ -33,7 +33,12 @@ enum OpenMPOffloadingRequiresDirFlags : int64_t { /// unified_shared_memory clause. OMP_REQ_UNIFIED_SHARED_MEMORY = 0x008, /// dynamic_allocators clause. - OMP_REQ_DYNAMIC_ALLOCATORS = 0x010 + OMP_REQ_DYNAMIC_ALLOCATORS = 0x010, + /// Auto zero-copy extension: + /// when running on an APU, the GPU plugin may decide to + /// run in zero-copy even though the user did not program + /// their application with unified_shared_memory requirement. + OMPX_REQ_AUTO_ZERO_COPY = 0x020 }; class RequirementCollection { @@ -65,6 +70,14 @@ class RequirementCollection { return; } + // Auto zero-copy is only valid when no other requirement has been set + // and it is computed at device initialization time, after the requirement + // flag has already been set to OMP_REQ_NONE. + if (SetFlags == OMP_REQ_NONE && NewFlags == OMPX_REQ_AUTO_ZERO_COPY) { + SetFlags = NewFlags; + return; + } + // If multiple compilation units are present enforce // consistency across all of them for require clauses: // - reverse_offload diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h index e94f48891dab3..3023fba6cc64d 100644 --- a/openmp/libomptarget/include/device.h +++ b/openmp/libomptarget/include/device.h @@ -164,6 +164,9 @@ struct DeviceTy { /// Print all offload entries to stderr. void dumpOffloadEntries(); + /// Ask the device whether the runtime should use auto zero-copy. 
+  bool useAutoZeroCopy();
+
 private:
   /// Deinitialize the device (and plugin).
   void deinit();
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
index 9c59d3bf824de..3117763e35896 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h
@@ -63,6 +63,7 @@ typedef enum {
 } hsa_amd_memory_pool_access_t;

 typedef enum hsa_amd_agent_info_s {
+  HSA_AMD_AGENT_INFO_CHIP_ID = 0xA000,
   HSA_AMD_AGENT_INFO_CACHELINE_SIZE = 0xA001,
   HSA_AMD_AGENT_INFO_COMPUTE_UNIT_COUNT = 0xA002,
   HSA_AMD_AGENT_INFO_MAX_CLOCK_FREQUENCY = 0xA003,
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 1038a29a18cb8..8066a231ef93f 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -184,6 +184,29 @@ Error asyncMemCopy(bool UseMultipleSdmaEngines, void *Dst, hsa_agent_t DstAgent,
 #endif
 }

+Expected<std::string> getTargetTripleAndFeatures(hsa_agent_t Agent) {
+  std::string Target;
+  auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
+    uint32_t Length;
+    hsa_status_t Status;
+    Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
+    if (Status != HSA_STATUS_SUCCESS)
+      return Status;
+
+    llvm::SmallVector<char> ISAName(Length);
+    Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
+    if (Status != HSA_STATUS_SUCCESS)
+      return Status;
+
+    llvm::StringRef TripleTarget(ISAName.begin(), Length);
+    if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
+      Target = TripleTarget.ltrim('-').rtrim('\0').str();
+    return HSA_STATUS_SUCCESS;
+  });
+  if (Err)
+    return Err;
+  return Target;
+}
 } // namespace utils

 /// Utility class representing generic resource references to AMDGPU resources.
@@ -1849,8 +1872,9 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
         OMPX_StreamBusyWait("LIBOMPTARGET_AMDGPU_STREAM_BUSYWAIT", 2000000),
         OMPX_UseMultipleSdmaEngines(
             "LIBOMPTARGET_AMDGPU_USE_MULTIPLE_SDMA_ENGINES", false),
-        AMDGPUStreamManager(*this, Agent), AMDGPUEventManager(*this),
-        AMDGPUSignalManager(*this), Agent(Agent), HostDevice(HostDevice) {}
+        OMPX_ApuMaps("OMPX_APU_MAPS", false), AMDGPUStreamManager(*this, Agent),
+        AMDGPUEventManager(*this), AMDGPUSignalManager(*this), Agent(Agent),
+        HostDevice(HostDevice) {}

   ~AMDGPUDeviceTy() {}

@@ -1941,6 +1965,19 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     if (auto Err = AMDGPUSignalManager.init(OMPX_InitialNumSignals))
       return Err;

+    // Detect if XNACK is enabled.
+    auto TargeTripleAndFeaturesOrError =
+        utils::getTargetTripleAndFeatures(Agent);
+    if (!TargeTripleAndFeaturesOrError)
+      return TargeTripleAndFeaturesOrError.takeError();
+    if (static_cast<StringRef>(*TargeTripleAndFeaturesOrError)
+            .contains("xnack+"))
+      IsXnackEnabled = true;
+
+    // Detect if the device is an APU.
+    if (auto Err = checkIfAPU())
+      return Err;
+
     return Plugin::success();
   }

@@ -2650,6 +2687,21 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Plugin::success();
   }

+  /// Returns true if auto zero-copy is the best configuration for the
+  /// current arch.
+  /// On AMDGPUs, automatic zero-copy is turned on
+  /// when running on an APU with XNACK (unified memory) support
+  /// enabled. 
On discrete GPUs, automatic zero-copy is triggered
+  /// if the user sets the environment variable OMPX_APU_MAPS=1
+  /// and if XNACK is enabled. The rationale is that zero-copy
+  /// is the best configuration (performance, memory footprint) on APUs,
+  /// while it is often not the best on discrete GPUs.
+  /// XNACK can be enabled with a kernel boot parameter or with
+  /// the HSA_XNACK environment variable.
+  bool useAutoZeroCopyImpl() override {
+    return ((IsAPU || OMPX_ApuMaps) && IsXnackEnabled);
+  }
+
   /// Getters and setters for stack and heap sizes.
   Error getDeviceStackSize(uint64_t &Value) override {
     Value = StackSize;
@@ -2749,6 +2801,34 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
     return Err;
   }

+  /// Detect if current architecture is an APU.
+  Error checkIfAPU() {
+    // TODO: replace with ROCr API once it becomes available.
+    llvm::StringRef StrGfxName(ComputeUnitKind);
+    IsAPU = llvm::StringSwitch<bool>(StrGfxName)
+                .Case("gfx940", true)
+                .Default(false);
+    if (IsAPU)
+      return Plugin::success();
+
+    bool MayBeAPU = llvm::StringSwitch<bool>(StrGfxName)
+                        .Case("gfx942", true)
+                        .Default(false);
+    if (!MayBeAPU)
+      return Plugin::success();
+
+    // Can be MI300A or MI300X.
+    uint32_t ChipID = 0;
+    if (auto Err = getDeviceAttr(HSA_AMD_AGENT_INFO_CHIP_ID, ChipID))
+      return Err;
+
+    if (!(ChipID & 0x1)) {
+      IsAPU = true;
+      return Plugin::success();
+    }
+    return Plugin::success();
+  }
+
   /// Envar for controlling the number of HSA queues per device. High number of
   /// queues may degrade performance.
   UInt32Envar OMPX_NumQueues;
@@ -2785,6 +2865,10 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// Use ROCm 5.7 interface for multiple SDMA engines
   BoolEnvar OMPX_UseMultipleSdmaEngines;

+  /// Value of OMPX_APU_MAPS env var used to force
+  /// automatic zero-copy behavior on non-APU GPUs.
+  BoolEnvar OMPX_ApuMaps;
+
   /// Stream manager for AMDGPU streams.
   AMDGPUStreamManagerTy AMDGPUStreamManager;
@@ -2815,6 +2899,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   /// The current size of the stack that will be used in cases where it could
   /// not be statically determined.
   uint64_t StackSize = 16 * 1024 /* 16 KB */;
+
+  /// Is the plugin associated with an APU?
+  bool IsAPU = false;
+
+  /// True if the system is configured with XNACK enabled,
+  /// false otherwise.
+  bool IsXnackEnabled = false;
 };

 Error AMDGPUDeviceImageTy::loadExecutable(const AMDGPUDeviceTy &Device) {
@@ -3059,30 +3150,13 @@ struct AMDGPUPluginTy final : public GenericPluginTy {
     std::optional<StringRef> Processor = ElfOrErr->tryGetCPUName();

     for (hsa_agent_t Agent : KernelAgents) {
-      std::string Target;
-      auto Err = utils::iterateAgentISAs(Agent, [&](hsa_isa_t ISA) {
-        uint32_t Length;
-        hsa_status_t Status;
-        Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME_LENGTH, &Length);
-        if (Status != HSA_STATUS_SUCCESS)
-          return Status;
-
-        llvm::SmallVector<char> ISAName(Length);
-        Status = hsa_isa_get_info_alt(ISA, HSA_ISA_INFO_NAME, ISAName.begin());
-        if (Status != HSA_STATUS_SUCCESS)
-          return Status;
-
-        llvm::StringRef TripleTarget(ISAName.begin(), Length);
-        if (TripleTarget.consume_front("amdgcn-amd-amdhsa"))
-          Target = TripleTarget.ltrim('-').rtrim('\0').str();
-        return HSA_STATUS_SUCCESS;
-      });
-      if (Err)
-        return std::move(Err);
-
+      auto TargeTripleAndFeaturesOrError =
+          utils::getTargetTripleAndFeatures(Agent);
+      if (!TargeTripleAndFeaturesOrError)
+        return TargeTripleAndFeaturesOrError.takeError();
       if (!utils::isImageCompatibleWithEnv(Processor ? 
*Processor : "", ElfOrErr->getPlatformFlags(), - Target)) + *TargeTripleAndFeaturesOrError)) return false; } return true; diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h index f7f723610dd05..d55dfbdd9e4c1 100644 --- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h +++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h @@ -883,6 +883,11 @@ struct GenericDeviceTy : public DeviceAllocatorTy { virtual Error getDeviceStackSize(uint64_t &V) = 0; + /// Returns true if current plugin architecture is an APU + /// and unified_shared_memory was not requested by the program. + bool useAutoZeroCopy(); + virtual bool useAutoZeroCopyImpl() { return false; } + private: /// Register offload entry for global variable. Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage, diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp index 1bd70b85da341..6ae30e78ce8c2 100644 --- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp +++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp @@ -1555,6 +1555,8 @@ Error GenericDeviceTy::syncEvent(void *EventPtr) { return syncEventImpl(EventPtr); } +bool GenericDeviceTy::useAutoZeroCopy() { return useAutoZeroCopyImpl(); } + Error GenericPluginTy::init() { auto NumDevicesOrErr = initImpl(); if (!NumDevicesOrErr) @@ -2067,6 +2069,14 @@ int32_t __tgt_rtl_set_device_offset(int32_t DeviceIdOffset) { return OFFLOAD_SUCCESS; } +int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) { + // Automatic zero-copy only applies to programs that did + // not request unified_shared_memory and are deployed on an + // APU with XNACK enabled. + if (Plugin::get().getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) + return false; + return Plugin::get().getDevice(DeviceId).useAutoZeroCopy(); +} #ifdef __cplusplus } #endif diff --git a/openmp/libomptarget/src/OpenMP/Mapping.cpp b/openmp/libomptarget/src/OpenMP/Mapping.cpp index 833856f2abf29..9c0b219b6f15f 100644 --- a/openmp/libomptarget/src/OpenMP/Mapping.cpp +++ b/openmp/libomptarget/src/OpenMP/Mapping.cpp @@ -252,8 +252,10 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer( MESSAGE("device mapping required by 'present' map type modifier does not " "exist for host address " DPxMOD " (%" PRId64 " bytes)", DPxPTR(HstPtrBegin), Size); - } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY && - !HasCloseModifier) { + } else if ((PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY && + !HasCloseModifier) || + (PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY)) { + // If unified shared memory is active, implicitly mapped variables that are // not privatized use host address. Any explicitly mapped variables also use // host address where correctness is not impeded. In all other cases maps @@ -261,6 +263,10 @@ TargetPointerResultTy MappingInfoTy::getTargetPointer( // In addition to the mapping rules above, the close map modifier forces the // mapping of the variable to the device. 
     if (Size) {
+      INFO(OMP_INFOTYPE_MAPPING_CHANGED, Device.DeviceID,
+           "Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
+           "memory\n",
+           DPxPTR((uintptr_t)HstPtrBegin), Size);
       DP("Return HstPtrBegin " DPxMOD " Size=%" PRId64 " for unified shared "
          "memory\n",
          DPxPTR((uintptr_t)HstPtrBegin), Size);
@@ -415,7 +421,8 @@ TargetPointerResultTy MappingInfoTy::getTgtPtrBegin(
          LR.TPR.getEntry()->dynRefCountToStr().c_str(), DynRefCountAction,
          LR.TPR.getEntry()->holdRefCountToStr().c_str(), HoldRefCountAction);
     LR.TPR.TargetPointer = (void *)TP;
-  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
+  } else if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY ||
+             PM->getRequirements() & OMPX_REQ_AUTO_ZERO_COPY) {
     // If the value isn't found in the mapping and unified shared memory
     // is on then it means we have stumbled upon a value which we need to
     // use directly from the host.
diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp
index 83bf65f0f0de5..50059ba23b1a7 100644
--- a/openmp/libomptarget/src/PluginManager.cpp
+++ b/openmp/libomptarget/src/PluginManager.cpp
@@ -144,6 +144,9 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
   int32_t NumPD = getNumberOfPluginDevices();
   ExclusiveDevicesAccessor->reserve(DeviceOffset + NumPD);
+  // Auto zero-copy is a per-device property. We need to ensure
+  // that all devices suggest using it.
+  bool UseAutoZeroCopy = !(NumPD == 0);
   for (int32_t PDevI = 0, UserDevId = DeviceOffset; PDevI < NumPD; PDevI++) {
     auto Device = std::make_unique<DeviceTy>(this, UserDevId, PDevI);
     if (auto Err = Device->init()) {
@@ -151,12 +154,20 @@ void PluginAdaptorTy::initDevices(PluginManager &PM) {
              toString(std::move(Err)).c_str());
       continue;
     }
+    UseAutoZeroCopy = UseAutoZeroCopy && Device->useAutoZeroCopy();
     ExclusiveDevicesAccessor->push_back(std::move(Device));
     ++NumberOfUserDevices;
     ++UserDevId;
   }
+  // Auto Zero-Copy can currently only be triggered when the system is a
+  // homogeneous APU architecture without attached discrete GPUs.
+  // If all devices suggest using it, change the requirement flags to trigger
+  // zero-copy behavior when mapping memory.
+  if (UseAutoZeroCopy)
+    PM.addRequirements(OMPX_REQ_AUTO_ZERO_COPY);
+
   DP("Plugin adaptor " DPxMOD " has index %d, exposes %d out of %d devices!\n",
      DPxPTR(LibraryHandler.get()), DeviceOffset, NumberOfUserDevices,
      NumberOfPluginDevices);
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 654efd524024a..404d7b6174e4a 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -339,3 +339,9 @@ void DeviceTy::dumpOffloadEntries() {
     fprintf(stderr, "  %11s: %s\n", Kind, It.second.getNameAsCStr());
   }
 }
+
+bool DeviceTy::useAutoZeroCopy() {
+  if (RTL->use_auto_zero_copy)
+    return RTL->use_auto_zero_copy(RTLDeviceID);
+  return false;
+}
diff --git a/openmp/libomptarget/test/mapping/auto_zero_copy.cpp b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
new file mode 100644
index 0000000000000..6f9d8c2b128c9
--- /dev/null
+++ b/openmp/libomptarget/test/mapping/auto_zero_copy.cpp
@@ -0,0 +1,57 @@
+// clang-format off
+// RUN: %libomptarget-compilexx-generic
+// RUN: env OMPX_APU_MAPS=1 HSA_XNACK=1 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic -check-prefix=INFO_ZERO -check-prefix=CHECK
+
+// RUN: %libomptarget-compilexx-generic
+// RUN: env HSA_XNACK=0 LIBOMPTARGET_INFO=30 %libomptarget-run-generic 2>&1 \
+// RUN: | %fcheck-generic -check-prefix=INFO_COPY -check-prefix=CHECK
+
+// UNSUPPORTED: aarch64-unknown-linux-gnu
+// UNSUPPORTED: aarch64-unknown-linux-gnu-LTO
+// UNSUPPORTED: nvptx64-nvidia-cuda
+// UNSUPPORTED: nvptx64-nvidia-cuda-LTO
+// UNSUPPORTED: x86_64-pc-linux-gnu
+// UNSUPPORTED: x86_64-pc-linux-gnu-LTO
+
+// REQUIRES: unified_shared_memory
+
+// clang-format on
+
+#include <cstdio>
+
+int main() {
+  int n = 1024;
+
+  // test various mapping types
+  int *a = new int[n];
+  int k = 3;
+  int b[n];
+
+  for (int i = 0; i < n; i++)
+    b[i] = i;
+
+  // clang-format off
+  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+  // INFO_ZERO: Return HstPtrBegin 0x{{.*}} Size=4096 for unified shared memory
+
+  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
+  // INFO_COPY: Creating new map entry with HstPtrBase=0x{{.*}}, HstPtrBegin=0x{{.*}}, TgtAllocBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096,
+  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
+  // INFO_COPY: Mapping exists with HstPtrBegin=0x{{.*}}, TgtPtrBegin=0x{{.*}}, Size=4096, DynRefCount=1 (update suppressed)
+// clang-format on
+#pragma omp target teams distribute parallel for map(tofrom : a[ : n])        \
+    map(to : b[ : n])
+  for (int i = 0; i < n; i++)
+    a[i] = i + b[i] + k;
+
+  int err = 0;
+  for (int i = 0; i < n; i++)
+    if (a[i] != i + b[i] + k)
+      err++;
+
+  // CHECK: PASS
+  if (err == 0)
+    printf("PASS\n");
+  return err;
+}

From d4d81acb52bd44681210001c148ac86df8a344f0 Mon Sep 17 00:00:00 2001
From: Kerry McLaughlin
Date: Mon, 22 Jan 2024 16:30:43 +0000
Subject: [PATCH 424/843] [AArch64][SME2] Extend SMEABIPass to handle functions
 with new ZT0 state (#78848)

updateNewZAFunctions is extended to generate the following on entry to a
function with either the "aarch64_pstate_za_new" or "aarch64_new_zt0"
attribute:
- Private-ZA interface: commit any active lazy-saves & enable PSTATE.ZA.
- "aarch64_pstate_za_new": zero ZA.
- "aarch64_new_zt0": zero ZT0.

Additionally, PSTATE.ZA should be disabled before returning if the
function has a private-ZA interface.
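[Editor's note] As a rough illustration (not part of the patch), the entry/exit
sequence described above can be modeled in plain C++. The struct and helper
names below are hypothetical; they only mirror the SMEAttrs queries the pass
relies on:

  #include <cstdio>

  // Hypothetical mirror of the SMEAttrs queries used by the pass.
  struct Attrs {
    bool NewZA;     // "aarch64_pstate_za_new"
    bool NewZT0;    // "aarch64_new_zt0"
    bool SharesZA;  // "aarch64_pstate_za_shared"
    bool SharesZT0; // "aarch64_in_zt0"
    // A function has a private-ZA interface if it shares neither ZA nor ZT0.
    bool privateZAInterface() const { return !SharesZA && !SharesZT0; }
  };

  // Prints the conceptual order of operations the pass emits.
  void emitSequence(const Attrs &A) {
    if (A.privateZAInterface()) {
      std::puts("entry:  commit lazy-save if TPIDR2_EL0 != 0");
      std::puts("entry:  smstart za");
    }
    if (A.NewZA)
      std::puts("entry:  zero {za}");
    if (A.NewZT0)
      std::puts("entry:  zero {zt0}");
    if (A.privateZAInterface())
      std::puts("return: smstop za");
  }

  int main() {
    // A "new ZT0" function that shares nothing: full private-ZA treatment.
    emitSequence({/*NewZA=*/false, /*NewZT0=*/true,
                  /*SharesZA=*/false, /*SharesZT0=*/false});
  }

Note how a function that already shares ZA or ZT0 skips the lazy-save commit
and the final smstop, which matches the shared_za_new_zt0 test added below.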
---
 llvm/lib/Target/AArch64/SMEABIPass.cpp        | 132 +++++++++++-------
 .../AArch64/Utils/AArch64SMEAttributes.cpp    |  11 +-
 .../AArch64/Utils/AArch64SMEAttributes.h      |  19 +--
 llvm/test/CodeGen/AArch64/sme-zt0-state.ll    | 115 +++++++++++++++
 4 files changed, 211 insertions(+), 66 deletions(-)

diff --git a/llvm/lib/Target/AArch64/SMEABIPass.cpp b/llvm/lib/Target/AArch64/SMEABIPass.cpp
index 3315171798d9f..0247488ce93f1 100644
--- a/llvm/lib/Target/AArch64/SMEABIPass.cpp
+++ b/llvm/lib/Target/AArch64/SMEABIPass.cpp
@@ -40,7 +40,8 @@ struct SMEABI : public FunctionPass {
   bool runOnFunction(Function &F) override;
 
 private:
-  bool updateNewZAFunctions(Module *M, Function *F, IRBuilder<> &Builder);
+  bool updateNewStateFunctions(Module *M, Function *F, IRBuilder<> &Builder,
+                               SMEAttrs FnAttrs);
 };
 } // end anonymous namespace
 
@@ -76,56 +77,87 @@ void emitTPIDR2Save(Module *M, IRBuilder<> &Builder) {
                      Builder.getInt64(0));
 }
 
-/// This function generates code to commit a lazy save at the beginning of a
-/// function marked with `aarch64_pstate_za_new`. If the value read from
-/// TPIDR2_EL0 is not null on entry to the function then the lazy-saving scheme
-/// is active and we should call __arm_tpidr2_save to commit the lazy save.
-/// Additionally, PSTATE.ZA should be enabled at the beginning of the function
-/// and disabled before returning.
-bool SMEABI::updateNewZAFunctions(Module *M, Function *F,
-                                  IRBuilder<> &Builder) {
+/// This function generates code at the beginning and end of a function marked
+/// with either `aarch64_pstate_za_new` or `aarch64_new_zt0`.
+/// At the beginning of the function, the following code is generated:
+///  - Commit lazy-save if active   [Private-ZA Interface*]
+///  - Enable PSTATE.ZA             [Private-ZA Interface]
+///  - Zero ZA                      [Has New ZA State]
+///  - Zero ZT0                     [Has New ZT0 State]
+///
+/// * A function with new ZT0 state will not change ZA, so committing the
+/// lazy-save is not strictly necessary. However, the lazy-save mechanism
+/// may be active on entry to the function, with PSTATE.ZA set to 1. If
+/// the new ZT0 function calls a function that does not share ZT0, we will
+/// need to conditionally SMSTOP ZA before the call, setting PSTATE.ZA to 0.
+/// For this reason, it's easier to always commit the lazy-save at the
+/// beginning of the function regardless of whether it has ZA state.
+///
+/// At the end of the function, PSTATE.ZA is disabled if the function has a
+/// Private-ZA Interface. A function is considered to have a Private-ZA
+/// interface if it does not share ZA or ZT0.
+///
+bool SMEABI::updateNewStateFunctions(Module *M, Function *F,
+                                     IRBuilder<> &Builder, SMEAttrs FnAttrs) {
   LLVMContext &Context = F->getContext();
   BasicBlock *OrigBB = &F->getEntryBlock();
-
-  // Create the new blocks for reading TPIDR2_EL0 & enabling ZA state.
-  auto *SaveBB = OrigBB->splitBasicBlock(OrigBB->begin(), "save.za", true);
-  auto *PreludeBB = BasicBlock::Create(Context, "prelude", F, SaveBB);
-
-  // Read TPIDR2_EL0 in PreludeBB & branch to SaveBB if not 0.
-  Builder.SetInsertPoint(PreludeBB);
-  Function *TPIDR2Intr =
-      Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2);
-  auto *TPIDR2 = Builder.CreateCall(TPIDR2Intr->getFunctionType(), TPIDR2Intr,
-                                    {}, "tpidr2");
-  auto *Cmp =
-      Builder.CreateCmp(ICmpInst::ICMP_NE, TPIDR2, Builder.getInt64(0), "cmp");
-  Builder.CreateCondBr(Cmp, SaveBB, OrigBB);
-
-  // Create a call __arm_tpidr2_save, which commits the lazy save.
-  Builder.SetInsertPoint(&SaveBB->back());
-  emitTPIDR2Save(M, Builder);
-
-  // Enable pstate.za at the start of the function.
   Builder.SetInsertPoint(&OrigBB->front());
-  Function *EnableZAIntr =
-      Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable);
-  Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr);
-
-  // ZA state must be zeroed upon entry to a function with NewZA
-  Function *ZeroIntr =
-      Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero);
-  Builder.CreateCall(ZeroIntr->getFunctionType(), ZeroIntr,
-                     Builder.getInt32(0xff));
-
-  // Before returning, disable pstate.za
-  for (BasicBlock &BB : *F) {
-    Instruction *T = BB.getTerminator();
-    if (!T || !isa<ReturnInst>(T))
-      continue;
-    Builder.SetInsertPoint(T);
-    Function *DisableZAIntr =
-        Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_disable);
-    Builder.CreateCall(DisableZAIntr->getFunctionType(), DisableZAIntr);
+
+  // Commit any active lazy-saves if this is a Private-ZA function. If the
+  // value read from TPIDR2_EL0 is not null on entry to the function then
+  // the lazy-saving scheme is active and we should call __arm_tpidr2_save
+  // to commit the lazy save.
+  if (FnAttrs.hasPrivateZAInterface()) {
+    // Create the new blocks for reading TPIDR2_EL0 & enabling ZA state.
+    auto *SaveBB = OrigBB->splitBasicBlock(OrigBB->begin(), "save.za", true);
+    auto *PreludeBB = BasicBlock::Create(Context, "prelude", F, SaveBB);
+
+    // Read TPIDR2_EL0 in PreludeBB & branch to SaveBB if not 0.
+    Builder.SetInsertPoint(PreludeBB);
+    Function *TPIDR2Intr =
+        Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_get_tpidr2);
+    auto *TPIDR2 = Builder.CreateCall(TPIDR2Intr->getFunctionType(), TPIDR2Intr,
+                                      {}, "tpidr2");
+    auto *Cmp = Builder.CreateCmp(ICmpInst::ICMP_NE, TPIDR2,
+                                  Builder.getInt64(0), "cmp");
+    Builder.CreateCondBr(Cmp, SaveBB, OrigBB);
+
+    // Create a call __arm_tpidr2_save, which commits the lazy save.
+    Builder.SetInsertPoint(&SaveBB->back());
+    emitTPIDR2Save(M, Builder);
+
+    // Enable pstate.za at the start of the function.
+    Builder.SetInsertPoint(&OrigBB->front());
+    Function *EnableZAIntr =
+        Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_enable);
+    Builder.CreateCall(EnableZAIntr->getFunctionType(), EnableZAIntr);
+  }
+
+  if (FnAttrs.hasNewZABody()) {
+    Function *ZeroIntr =
+        Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero);
+    Builder.CreateCall(ZeroIntr->getFunctionType(), ZeroIntr,
+                       Builder.getInt32(0xff));
+  }
+
+  if (FnAttrs.isNewZT0()) {
+    Function *ClearZT0Intr =
+        Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_zero_zt);
+    Builder.CreateCall(ClearZT0Intr->getFunctionType(), ClearZT0Intr,
+                       {Builder.getInt32(0)});
+  }
+
+  if (FnAttrs.hasPrivateZAInterface()) {
+    // Before returning, disable pstate.za
+    for (BasicBlock &BB : *F) {
+      Instruction *T = BB.getTerminator();
+      if (!T || !isa<ReturnInst>(T))
+        continue;
+      Builder.SetInsertPoint(T);
+      Function *DisableZAIntr =
+          Intrinsic::getDeclaration(M, Intrinsic::aarch64_sme_za_disable);
+      Builder.CreateCall(DisableZAIntr->getFunctionType(), DisableZAIntr);
+    }
   }
 
   F->addFnAttr("aarch64_expanded_pstate_za");
@@ -142,8 +174,8 @@ bool SMEABI::runOnFunction(Function &F) {
   bool Changed = false;
   SMEAttrs FnAttrs(F);
-  if (FnAttrs.hasNewZABody())
-    Changed |= updateNewZAFunctions(M, &F, Builder);
+  if (FnAttrs.hasNewZABody() || FnAttrs.isNewZT0())
+    Changed |= updateNewStateFunctions(M, &F, Builder, FnAttrs);
 
   return Changed;
 }
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
index 9693b6a664be2..3ee54e5df0a13 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.cpp
@@ -27,10 +27,8 @@ void SMEAttrs::set(unsigned M, bool Enable) {
          "ZA_New and ZA_Shared are mutually exclusive");
   assert(!(hasNewZABody() && preservesZA()) &&
          "ZA_New and ZA_Preserved are mutually exclusive");
-  assert(!(hasNewZABody() && (Bitmask & ZA_NoLazySave)) &&
-         "ZA_New and ZA_NoLazySave are mutually exclusive");
-  assert(!(sharesZA() && (Bitmask & ZA_NoLazySave)) &&
-         "ZA_Shared and ZA_NoLazySave are mutually exclusive");
+  assert(!(hasNewZABody() && (Bitmask & SME_ABI_Routine)) &&
+         "ZA_New and SME_ABI_Routine are mutually exclusive");
 
   // ZT0 Attrs
   assert(
@@ -49,11 +47,10 @@ SMEAttrs::SMEAttrs(const CallBase &CB) {
 
 SMEAttrs::SMEAttrs(StringRef FuncName) : Bitmask(0) {
   if (FuncName == "__arm_tpidr2_save" || FuncName == "__arm_sme_state")
-    Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::ZA_Preserved |
-                SMEAttrs::ZA_NoLazySave);
+    Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::SME_ABI_Routine);
   if (FuncName == "__arm_tpidr2_restore")
     Bitmask |= (SMEAttrs::SM_Compatible | SMEAttrs::ZA_Shared |
-                SMEAttrs::ZA_NoLazySave);
+                SMEAttrs::SME_ABI_Routine);
 }
 
 SMEAttrs::SMEAttrs(const AttributeList &Attrs) {
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
index 8af219bb361fd..27b7075a0944f 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64SMEAttributes.h
@@ -38,13 +38,13 @@ class SMEAttrs {
   // Enum with bitmasks for each individual SME feature.
   enum Mask {
     Normal = 0,
-    SM_Enabled = 1 << 0,    // aarch64_pstate_sm_enabled
-    SM_Compatible = 1 << 1, // aarch64_pstate_sm_compatible
-    SM_Body = 1 << 2,       // aarch64_pstate_sm_body
-    ZA_Shared = 1 << 3,     // aarch64_pstate_sm_shared
-    ZA_New = 1 << 4,        // aarch64_pstate_sm_new
-    ZA_Preserved = 1 << 5,  // aarch64_pstate_sm_preserved
-    ZA_NoLazySave = 1 << 6, // Used for SME ABI routines to avoid lazy saves
+    SM_Enabled = 1 << 0,      // aarch64_pstate_sm_enabled
+    SM_Compatible = 1 << 1,   // aarch64_pstate_sm_compatible
+    SM_Body = 1 << 2,         // aarch64_pstate_sm_body
+    ZA_Shared = 1 << 3,       // aarch64_pstate_sm_shared
+    ZA_New = 1 << 4,          // aarch64_pstate_sm_new
+    ZA_Preserved = 1 << 5,    // aarch64_pstate_sm_preserved
+    SME_ABI_Routine = 1 << 6, // Used for SME ABI routines to avoid lazy saves
     ZT0_Shift = 7,
     ZT0_Mask = 0b111 << ZT0_Shift
   };
@@ -86,7 +86,7 @@ class SMEAttrs {
   bool hasZAState() const { return hasNewZABody() || sharesZA(); }
   bool requiresLazySave(const SMEAttrs &Callee) const {
     return hasZAState() && Callee.hasPrivateZAInterface() &&
-           !(Callee.Bitmask & ZA_NoLazySave);
+           !(Callee.Bitmask & SME_ABI_Routine);
   }
 
   // Interfaces to query ZT0 State
@@ -116,7 +116,8 @@ class SMEAttrs {
     return hasZT0State() && !Callee.sharesZT0();
   }
   bool requiresDisablingZABeforeCall(const SMEAttrs &Callee) const {
-    return hasZT0State() && !hasZAState() && Callee.hasPrivateZAInterface();
+    return hasZT0State() && !hasZAState() && Callee.hasPrivateZAInterface() &&
+           !(Callee.Bitmask & SME_ABI_Routine);
   }
   bool requiresEnablingZAAfterCall(const SMEAttrs &Callee) const {
     return requiresLazySave(Callee) || requiresDisablingZABeforeCall(Callee);
diff --git a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
index 88eaf19ec488f..18d1e40bf4d0f 100644
--- a/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
+++ b/llvm/test/CodeGen/AArch64/sme-zt0-state.ll
@@ -153,3 +153,118 @@ define void @zt0_in_caller_zt0_new_callee() "aarch64_in_zt0" nounwind {
   call void @callee() "aarch64_new_zt0";
   ret void;
 }
+
+;
+; New-ZA Caller
+;
+
+; Expect commit of lazy-save if ZA is dormant
+; Expect smstart ZA & clear ZT0
+; Before return, expect smstop ZA
+define void @zt0_new_caller() "aarch64_new_zt0" nounwind {
+; CHECK-LABEL: zt0_new_caller:
+; CHECK:       // %bb.0: // %prelude
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    str x30, [sp, #64] // 8-byte Folded Spill
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    cbz x8, .LBB6_2
+; CHECK-NEXT:  // %bb.1: // %save.za
+; CHECK-NEXT:    mov x8, sp
+; CHECK-NEXT:    str zt0, [x8]
+; CHECK-NEXT:    bl __arm_tpidr2_save
+; CHECK-NEXT:    ldr zt0, [x8]
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:  .LBB6_2:
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    zero { zt0 }
+; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    smstop za
+; CHECK-NEXT:    ldr x30, [sp, #64] // 8-byte Folded Reload
+; CHECK-NEXT:    add sp, sp, #80
+; CHECK-NEXT:    ret
+  call void @callee() "aarch64_in_zt0";
+  ret void;
+}
+
+; Expect commit of lazy-save if ZA is dormant
+; Expect smstart ZA, clear ZA & clear ZT0
+; Before return, expect smstop ZA
+define void @new_za_zt0_caller() "aarch64_pstate_za_new" "aarch64_new_zt0" nounwind {
+; CHECK-LABEL: new_za_zt0_caller:
+; CHECK:       // %bb.0: // %prelude
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    sub sp, sp, #80
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    mrs x8, TPIDR2_EL0
+; CHECK-NEXT:    cbz x8, .LBB7_2
+; CHECK-NEXT:  // %bb.1: // %save.za
+; CHECK-NEXT:    sub x8, x29, #80
+; CHECK-NEXT:    str zt0, [x8]
+; CHECK-NEXT:    bl __arm_tpidr2_save
+; CHECK-NEXT:    ldr zt0, [x8]
+; CHECK-NEXT:    msr TPIDR2_EL0, xzr
+; CHECK-NEXT:  .LBB7_2:
+; CHECK-NEXT:    smstart za
+; CHECK-NEXT:    zero {za}
+; CHECK-NEXT:    zero { zt0 }
+; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    smstop za
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @callee() "aarch64_pstate_za_shared" "aarch64_in_zt0";
+  ret void;
+}
+
+; Expect clear ZA on entry
+define void @new_za_shared_zt0_caller() "aarch64_pstate_za_new" "aarch64_in_zt0" nounwind {
+; CHECK-LABEL: new_za_shared_zt0_caller:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    zero {za}
+; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @callee() "aarch64_pstate_za_shared" "aarch64_in_zt0";
+  ret void;
+}
+
+; Expect clear ZT0 on entry
+define void @shared_za_new_zt0() "aarch64_pstate_za_shared" "aarch64_new_zt0" nounwind {
+; CHECK-LABEL: shared_za_new_zt0:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT:    mov x29, sp
+; CHECK-NEXT:    sub sp, sp, #16
+; CHECK-NEXT:    rdsvl x8, #1
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    msub x8, x8, x8, x9
+; CHECK-NEXT:    mov sp, x8
+; CHECK-NEXT:    stur wzr, [x29, #-4]
+; CHECK-NEXT:    sturh wzr, [x29, #-6]
+; CHECK-NEXT:    stur x8, [x29, #-16]
+; CHECK-NEXT:    zero { zt0 }
+; CHECK-NEXT:    bl callee
+; CHECK-NEXT:    mov sp, x29
+; CHECK-NEXT:    ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT:    ret
+  call void @callee() "aarch64_pstate_za_shared" "aarch64_in_zt0";
+  ret void;
+}

From f188f4589cc8f690779c62996520663718df106d Mon Sep 17 00:00:00 2001
From: Jeremy Morse
Date: Mon, 22 Jan 2024 16:32:18 +0000
Subject: [PATCH 425/843] [DebugInfo] Disable a test runline temporarily

This is a follow-up to 8c1b7fba1fb -- GlobalISel currently doesn't handle
RemoveDIs mode debug-info, but will (see #75228). Disable this runline
until then.
(This is a patch-landing ordering problem)
---
 llvm/test/CodeGen/AArch64/dbg-value-swift-async.ll | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/AArch64/dbg-value-swift-async.ll b/llvm/test/CodeGen/AArch64/dbg-value-swift-async.ll
index 4649fad5b2c2a..57153d04b8bca 100644
--- a/llvm/test/CodeGen/AArch64/dbg-value-swift-async.ll
+++ b/llvm/test/CodeGen/AArch64/dbg-value-swift-async.ll
@@ -2,7 +2,8 @@
 ; RUN: llc --mtriple="aarch64-" -O0 -fast-isel=false -global-isel=false -stop-after=finalize-isel %s -o - | FileCheck %s --check-prefix=AARCH
 ; RUN: llc --mtriple="aarch64-" -O0 -fast-isel -stop-after=finalize-isel %s -o - | FileCheck %s --check-prefix=AARCH
-; RUN: llc --mtriple="aarch64-" -O0 -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - --try-experimental-debuginfo-iterators | FileCheck %s --check-prefix=AARCH
+;; run: llc --mtriple="aarch64-" -O0 -global-isel -stop-after=irtranslator -verify-machineinstrs %s -o - --try-experimental-debuginfo-iterators | FileCheck %s --check-prefix=AARCH
+;; ^^ Disabled while the global-isel patch for RemoveDIs lands
 ; RUN: llc --mtriple="aarch64-" -O0 -fast-isel=false -global-isel=false -stop-after=finalize-isel %s -o - --try-experimental-debuginfo-iterators | FileCheck %s --check-prefix=AARCH
 ; RUN: llc --mtriple="aarch64-" -O0 -fast-isel -stop-after=finalize-isel %s -o - --try-experimental-debuginfo-iterators | FileCheck %s --check-prefix=AARCH

From b5df6a90f5365e61d2dfa1583d36cbc79ab5775b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Mon, 22 Jan 2024 08:40:33 -0800
Subject: [PATCH 426/843] [mlir][openacc] Fix num_gang parser (#78792)

The number of operands per segment was not being computed correctly.
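[Editor's note] To make the off-by-segment behavior concrete, here is a small
standalone C++ sketch (an illustration, not part of the patch; the vector
names and sizes are invented for the example). It contrasts the old
cumulative bookkeeping with the fixed per-segment count:

  #include <cstdint>
  #include <cstdio>
  #include <vector>

  int main() {
    // Two parsed num_gangs segments, e.g. {%c2} and {%c1, %c1, %c1}.
    std::vector<int> segmentSizes = {1, 3};

    std::vector<int32_t> operands, oldSeg, newSeg;
    for (int size : segmentSizes) {
      int32_t before = operands.size(); // crtOperandsSize in the fix
      for (int i = 0; i < size; ++i)
        operands.push_back(0); // stand-in for a parsed operand
      oldSeg.push_back(operands.size());          // buggy: running total
      newSeg.push_back(operands.size() - before); // fixed: this segment only
    }
    std::printf("old: %d %d (wrong)\n", oldSeg[0], oldSeg[1]); // old: 1 4
    std::printf("new: %d %d\n", newSeg[0], newSeg[1]);         // new: 1 3
  }

With the old code the second segment claimed four operands instead of three,
which is exactly what the `operands.size() - crtOperandsSize` change repairs.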
---
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 4 ++--
 mlir/test/Dialect/OpenACC/ops.mlir      | 9 +++++++++
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index bc03adbcae64d..80f0529c87460 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -818,6 +818,7 @@ static ParseResult parseNumGangs(
     if (failed(parser.parseLBrace()))
       return failure();
 
+    int32_t crtOperandsSize = operands.size();
     if (failed(parser.parseCommaSeparatedList(
             mlir::AsmParser::Delimiter::None, [&]() {
               if (parser.parseOperand(operands.emplace_back()) ||
@@ -826,8 +827,7 @@ static ParseResult parseNumGangs(
               return success();
             })))
       return failure();
-
-    seg.push_back(operands.size());
+    seg.push_back(operands.size() - crtOperandsSize);
 
     if (failed(parser.parseRBrace()))
       return failure();
diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir
index 99b44183758d9..bda31a19cf5cd 100644
--- a/mlir/test/Dialect/OpenACC/ops.mlir
+++ b/mlir/test/Dialect/OpenACC/ops.mlir
@@ -1878,3 +1878,12 @@ func.func @acc_atomic_capture(%v: memref<i32>, %x: memref<i32>, %expr: i32) {
   return
 }
+
+// -----
+
+%c2 = arith.constant 2 : i32
+%c1 = arith.constant 1 : i32
+acc.parallel num_gangs({%c2 : i32} [#acc.device_type], {%c1 : i32, %c1 : i32, %c1 : i32} [#acc.device_type]) {
+}
+
+// CHECK: acc.parallel num_gangs({%c2{{.*}} : i32} [#acc.device_type], {%c1{{.*}} : i32, %c1{{.*}} : i32, %c1{{.*}} : i32} [#acc.device_type])

From ee6199ca3cf101c764788ebf8df5b0e3e00f5538 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Mon, 22 Jan 2024 08:40:52 -0800
Subject: [PATCH 427/843] [mlir][openacc][NFC] Cleanup hasOnly functions for
 device_type support (#78800)

Just a cleanup for all the `has.*Only()` functions to avoid code
duplication
---
 mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 150 ++++++++----------------
 1 file changed, 49 insertions(+), 101 deletions(-)

diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
index 80f0529c87460..bdc9c345341b2 100644
--- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
+++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp
@@ -69,6 +69,41 @@ void OpenACCDialect::initialize() {
       *getContext());
 }
 
+//===----------------------------------------------------------------------===//
+// device_type support helpers
+//===----------------------------------------------------------------------===//
+
+static bool hasDeviceTypeValues(std::optional<mlir::ArrayAttr> arrayAttr) {
+  if (arrayAttr && *arrayAttr && arrayAttr->size() > 0)
+    return true;
+  return false;
+}
+
+static bool hasDeviceType(std::optional<mlir::ArrayAttr> arrayAttr,
+                          mlir::acc::DeviceType deviceType) {
+  if (!hasDeviceTypeValues(arrayAttr))
+    return false;
+
+  for (auto attr : *arrayAttr) {
+    auto deviceTypeAttr = mlir::dyn_cast<mlir::acc::DeviceTypeAttr>(attr);
+    if (deviceTypeAttr.getValue() == deviceType)
+      return true;
+  }
+
+  return false;
+}
+
+static void printDeviceTypes(mlir::OpAsmPrinter &p,
+                             std::optional<mlir::ArrayAttr> deviceTypes) {
+  if (!hasDeviceTypeValues(deviceTypes))
+    return;
+
+  p << "[";
+  llvm::interleaveComma(*deviceTypes, p,
+                        [&](mlir::Attribute attr) { p << attr; });
+  p << "]";
+}
+
 //===----------------------------------------------------------------------===//
 // DataBoundsOp
 //===----------------------------------------------------------------------===//
@@ -722,11 +757,7 @@ bool acc::ParallelOp::hasAsyncOnly() {
 }
 
 bool acc::ParallelOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getAsyncOnly()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getAsyncOnly(), deviceType);
 }
 
 mlir::Value acc::ParallelOp::getAsyncValue() {
@@ -789,11 +820,7 @@ bool acc::ParallelOp::hasWaitOnly() {
 }
 
 bool acc::ParallelOp::hasWaitOnly(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getWaitOnly()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getWaitOnly(), deviceType);
 }
 
 mlir::Operation::operand_range ParallelOp::getWaitValues() {
@@ -1033,23 +1060,6 @@ static ParseResult parseDeviceTypeOperandsWithKeywordOnly(
   return success();
 }
 
-static bool hasDeviceTypeValues(std::optional<mlir::ArrayAttr> arrayAttr) {
-  if (arrayAttr && *arrayAttr && arrayAttr->size() > 0)
-    return true;
-  return false;
-}
-
-static void printDeviceTypes(mlir::OpAsmPrinter &p,
-                             std::optional<mlir::ArrayAttr> deviceTypes) {
-  if (!hasDeviceTypeValues(deviceTypes))
-    return;
-
-  p << "[";
-  llvm::interleaveComma(*deviceTypes, p,
-                        [&](mlir::Attribute attr) { p << attr; });
-  p << "]";
-}
-
 static void printDeviceTypeOperandsWithKeywordOnly(
     mlir::OpAsmPrinter &p, mlir::Operation *op, mlir::OperandRange operands,
     mlir::TypeRange types, std::optional<mlir::ArrayAttr> deviceTypes,
@@ -1093,11 +1103,7 @@ bool acc::SerialOp::hasAsyncOnly() {
 }
 
 bool acc::SerialOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getAsyncOnly()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getAsyncOnly(), deviceType);
 }
 
 mlir::Value acc::SerialOp::getAsyncValue() {
@@ -1114,11 +1120,7 @@ bool acc::SerialOp::hasWaitOnly() {
 }
 
 bool acc::SerialOp::hasWaitOnly(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getWaitOnly()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getWaitOnly(), deviceType);
 }
 
 mlir::Operation::operand_range SerialOp::getWaitValues() {
@@ -1177,11 +1179,7 @@ bool acc::KernelsOp::hasAsyncOnly() {
 }
 
 bool acc::KernelsOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getAsyncOnly()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getAsyncOnly(), deviceType);
 }
 
 mlir::Value acc::KernelsOp::getAsyncValue() {
@@ -1228,11 +1226,7 @@ bool acc::KernelsOp::hasWaitOnly() {
 }
 
 bool acc::KernelsOp::hasWaitOnly(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getWaitOnly()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getWaitOnly(), deviceType);
 }
 
 mlir::Operation::operand_range KernelsOp::getWaitValues() {
@@ -1646,11 +1640,7 @@ Value LoopOp::getDataOperand(unsigned i) {
 bool LoopOp::hasAuto() { return hasAuto(mlir::acc::DeviceType::None); }
 
 bool LoopOp::hasAuto(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getAuto_()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getAuto_(), deviceType);
 }
 
 bool LoopOp::hasIndependent() {
@@ -1658,21 +1648,13 @@ bool LoopOp::hasIndependent() {
 }
 
 bool LoopOp::hasIndependent(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getIndependent()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getIndependent(), deviceType);
 }
 
 bool LoopOp::hasSeq() { return hasSeq(mlir::acc::DeviceType::None); }
 bool LoopOp::hasSeq(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getSeq()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getSeq(), deviceType);
 }
 
 mlir::Value LoopOp::getVectorValue() {
@@ -1687,11 +1669,7 @@ mlir::Value LoopOp::getVectorValue(mlir::acc::DeviceType deviceType) {
 bool LoopOp::hasVector() { return hasVector(mlir::acc::DeviceType::None); }
 
 bool LoopOp::hasVector(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getVector()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getVector(), deviceType);
 }
 
 mlir::Value LoopOp::getWorkerValue() {
@@ -1706,11 +1684,7 @@ mlir::Value LoopOp::getWorkerValue(mlir::acc::DeviceType deviceType) {
 bool LoopOp::hasWorker() { return hasWorker(mlir::acc::DeviceType::None); }
 
 bool LoopOp::hasWorker(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getWorker()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getWorker(), deviceType);
 }
 
 mlir::Operation::operand_range LoopOp::getTileValues() {
@@ -1771,11 +1745,7 @@ mlir::Value LoopOp::getGangValue(mlir::acc::GangArgType gangArgType,
 bool LoopOp::hasGang() { return hasGang(mlir::acc::DeviceType::None); }
 
 bool LoopOp::hasGang(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getGang()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getGang(), deviceType);
 }
 
 //===----------------------------------------------------------------------===//
@@ -1815,11 +1785,7 @@ bool acc::DataOp::hasAsyncOnly() {
 }
 
 bool acc::DataOp::hasAsyncOnly(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getAsyncOnly()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getAsyncOnly(), deviceType);
 }
 
 mlir::Value DataOp::getAsyncValue() {
@@ -1834,11 +1800,7 @@ mlir::Value DataOp::getAsyncValue(mlir::acc::DeviceType deviceType) {
 
 bool DataOp::hasWaitOnly() { return hasWaitOnly(mlir::acc::DeviceType::None); }
 
 bool DataOp::hasWaitOnly(mlir::acc::DeviceType deviceType) {
-  if (auto arrayAttr = getWaitOnly()) {
-    if (findSegment(*arrayAttr, deviceType))
-      return true;
-  }
-  return false;
+  return hasDeviceType(getWaitOnly(), deviceType);
 }
 
 mlir::Operation::operand_range DataOp::getWaitValues() {
@@ -2091,20 +2053,6 @@ LogicalResult acc::DeclareOp::verify() {
 // RoutineOp
 //===----------------------------------------------------------------------===//
 
-static bool hasDeviceType(std::optional<mlir::ArrayAttr> arrayAttr,
-                          mlir::acc::DeviceType deviceType) {
-  if (!hasDeviceTypeValues(arrayAttr))
-    return false;
-
-  for (auto attr : *arrayAttr) {
-    auto deviceTypeAttr = mlir::dyn_cast<mlir::acc::DeviceTypeAttr>(attr);
-    if (deviceTypeAttr.getValue() == deviceType)
-      return true;
-  }
-
-  return false;
-}
-
 static unsigned getParallelismForDeviceType(acc::RoutineOp op,
                                             acc::DeviceType dtype) {
   unsigned parallelism = 0;

From ebd4dc42630e201c15894b48c3ea890eaa8c3b18 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Mon, 22 Jan 2024 08:56:00 -0800
Subject: [PATCH 428/843] [asan,test] Make alloca_loop_unpoisoning.cpp robust
 and fix s390x failure (#78774)

In the test from https://reviews.llvm.org/D7098, `char array[len];` is
32-byte aligned on most targets whether it is instrumented or not
(optimized by StackSafetyAnalysis), due to the used `*FrameLowering`
being `StackRealignable`.
However, when using `SystemZELFFrameLowering`, an un-instrumented
`char array[len];` is only 8-byte aligned. Ensure `char array[len];` gets
instrumented, as we did for `alloca_vla_interact.cpp`, to make the test
pass on s390x.
---
 .../test/asan/TestCases/alloca_loop_unpoisoning.cpp | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/test/asan/TestCases/alloca_loop_unpoisoning.cpp b/compiler-rt/test/asan/TestCases/alloca_loop_unpoisoning.cpp
index ac25a4faa2dc1..0967b34dc7dbf 100644
--- a/compiler-rt/test/asan/TestCases/alloca_loop_unpoisoning.cpp
+++ b/compiler-rt/test/asan/TestCases/alloca_loop_unpoisoning.cpp
@@ -2,7 +2,6 @@
 // RUN: %env_asan_opts=detect_stack_use_after_return=0 %run %t 2>&1
 //
 // REQUIRES: stable-runtime
-// UNSUPPORTED: target=s390{{.*}}
 
 // This testcase checks that allocas and VLAs inside loop are correctly unpoisoned.
 
@@ -25,11 +24,15 @@ void *top, *bot;
 __attribute__((noinline)) void foo(int len) {
   char x;
   top = &x;
-  char array[len];
+  volatile char array[len];
+  if (len)
+    array[0] = 0;
   assert(!(reinterpret_cast<uintptr_t>(array) & 31L));
   alloca(len);
   for (int i = 0; i < 32; ++i) {
-    char array[i];
+    volatile char array[i];
+    if (i)
+      array[0] = 0;
     bot = alloca(i);
     assert(!(reinterpret_cast<uintptr_t>(bot) & 31L));
   }

From 621bafd5c14cc324612e32c8123ac1ebf1c0530b Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Mon, 22 Jan 2024 11:06:47 -0600
Subject: [PATCH 429/843] [Libomptarget] Move target table handling out of the
 plugins (#77150)

Summary:
This patch moves the bulk of the `__tgt_offload_entries` handling out of
the plugins themselves; the plugins should not be handling this
implementation detail of the OpenMP runtime. Instead, we expose two new
plugin API functions to look up the device pointer for a global as well
as for a kernel. This required introducing a new type to represent a
binary image that has been loaded on a device, which we can then use to
load the addresses as needed. The creation of the mapping table is then
handled entirely in `libomptarget`, where we simply look up each address
individually. This should allow us to expose these operations more
generically when we provide a separate API.
---
 openmp/libomptarget/include/Shared/APITypes.h |   5 +
 .../libomptarget/include/Shared/PluginAPI.h   |  14 +-
 .../libomptarget/include/Shared/PluginAPI.inc |   2 +
 openmp/libomptarget/include/device.h          |   2 +-
 openmp/libomptarget/include/rtl.h             |   5 +
 .../plugins-nextgen/amdgpu/src/rtl.cpp        |  12 +-
 .../common/include/GlobalHandler.h            |   3 -
 .../common/include/PluginInterface.h          |  68 ++----
 .../common/src/PluginInterface.cpp            | 208 ++++++++----------
 .../plugins-nextgen/cuda/src/rtl.cpp          |  12 +-
 .../generic-elf-64bit/src/rtl.cpp             |  12 +-
 openmp/libomptarget/src/PluginManager.cpp     |   2 +
 openmp/libomptarget/src/device.cpp            |  10 +-
 openmp/libomptarget/src/omptarget.cpp         |  56 ++++-
 14 files changed, 201 insertions(+), 210 deletions(-)

diff --git a/openmp/libomptarget/include/Shared/APITypes.h b/openmp/libomptarget/include/Shared/APITypes.h
index 763a22f0a5e86..94521b4fbb577 100644
--- a/openmp/libomptarget/include/Shared/APITypes.h
+++ b/openmp/libomptarget/include/Shared/APITypes.h
@@ -62,6 +62,11 @@ struct __tgt_target_table {
       *EntriesEnd; // End of the table with all the entries (non inclusive)
 };
 
+/// This struct contains a handle to a loaded binary in the plugin device.
+struct __tgt_device_binary {
+  uintptr_t handle;
+};
+
 // clang-format on
 
 /// This struct contains information exchanged between different asynchronous
diff --git a/openmp/libomptarget/include/Shared/PluginAPI.h b/openmp/libomptarget/include/Shared/PluginAPI.h
index aece53d7ee1ca..5de5f106045b5 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.h
+++ b/openmp/libomptarget/include/Shared/PluginAPI.h
@@ -57,8 +57,18 @@ int32_t __tgt_rtl_init_device(int32_t ID);
 // return NULL. Otherwise, return a pointer to the built address table.
 // Individual entries in the table may also be NULL, when the corresponding
 // offload region is not supported on the target device.
-__tgt_target_table *__tgt_rtl_load_binary(int32_t ID,
-                                          __tgt_device_image *Image);
+int32_t __tgt_rtl_load_binary(int32_t ID, __tgt_device_image *Image,
+                              __tgt_device_binary *Binary);
+
+// Look up the device address of the named symbol in the given binary. Returns
+// non-zero on failure.
+int32_t __tgt_rtl_get_global(__tgt_device_binary Binary, uint64_t Size,
+                             const char *Name, void **DevicePtr);
+
+// Look up the device address of the named kernel in the given binary. Returns
+// non-zero on failure.
+int32_t __tgt_rtl_get_function(__tgt_device_binary Binary, const char *Name,
+                               void **DevicePtr);
 
 // Allocate data on the particular target device, of the specified size.
 // HostPtr is a address of the host data the allocated target data
diff --git a/openmp/libomptarget/include/Shared/PluginAPI.inc b/openmp/libomptarget/include/Shared/PluginAPI.inc
index b842c6eef1d4f..5f8a9dd11fdce 100644
--- a/openmp/libomptarget/include/Shared/PluginAPI.inc
+++ b/openmp/libomptarget/include/Shared/PluginAPI.inc
@@ -19,6 +19,8 @@ PLUGIN_API_HANDLE(is_data_exchangable, false);
 PLUGIN_API_HANDLE(number_of_devices, true);
 PLUGIN_API_HANDLE(init_device, true);
 PLUGIN_API_HANDLE(load_binary, true);
+PLUGIN_API_HANDLE(get_global, true);
+PLUGIN_API_HANDLE(get_function, true);
 PLUGIN_API_HANDLE(data_alloc, true);
 PLUGIN_API_HANDLE(data_submit, true);
 PLUGIN_API_HANDLE(data_submit_async, false);
diff --git a/openmp/libomptarget/include/device.h b/openmp/libomptarget/include/device.h
index 3023fba6cc64d..1dc82e36f6813 100644
--- a/openmp/libomptarget/include/device.h
+++ b/openmp/libomptarget/include/device.h
@@ -70,7 +70,7 @@ struct DeviceTy {
   /// Provide access to the mapping handler.
   MappingInfoTy &getMappingInfo() { return MappingInfo; }
 
-  __tgt_target_table *loadBinary(__tgt_device_image *Img);
+  llvm::Expected<__tgt_device_binary> loadBinary(__tgt_device_image *Img);
 
   // device memory allocation/deallocation routines
   /// Allocates \p Size bytes on the device, host or shared memory space
diff --git a/openmp/libomptarget/include/rtl.h b/openmp/libomptarget/include/rtl.h
index d110e89de5f14..5e198bdad4364 100644
--- a/openmp/libomptarget/include/rtl.h
+++ b/openmp/libomptarget/include/rtl.h
@@ -26,11 +26,16 @@
 /// are trying to (re)register an existing lib or really have a new one.
 struct TranslationTable {
   __tgt_target_table HostTable;
+  llvm::SmallVector<__tgt_target_table> DeviceTables;
 
   // Image assigned to a given device.
   llvm::SmallVector<__tgt_device_image *>
      TargetsImages; // One image per device ID.
 
+  // Arrays of entries active on the device.
+  llvm::SmallVector<llvm::SmallVector<__tgt_offload_entry>>
+      TargetsEntries; // One table per device ID.
+
   // Table of entry points or NULL if it was not already computed.
   llvm::SmallVector<__tgt_target_table *>
      TargetsTable; // One table per device ID.
diff --git a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
index 8066a231ef93f..81634ae1edc49 100644
--- a/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/amdgpu/src/rtl.cpp
@@ -439,8 +439,9 @@ struct AMDGPUMemoryManagerTy : public DeviceAllocatorTy {
 /// Class implementing the AMDGPU device images' properties.
 struct AMDGPUDeviceImageTy : public DeviceImageTy {
   /// Create the AMDGPU image with the id and the target image pointer.
-  AMDGPUDeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage)
-      : DeviceImageTy(ImageId, TgtImage) {}
+  AMDGPUDeviceImageTy(int32_t ImageId, GenericDeviceTy &Device,
+                      const __tgt_device_image *TgtImage)
+      : DeviceImageTy(ImageId, Device, TgtImage) {}
 
   /// Prepare and load the executable corresponding to the image.
   Error loadExecutable(const AMDGPUDeviceTy &Device);
@@ -2105,14 +2106,13 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
   uint64_t getClockFrequency() const override { return ClockFrequency; }
 
   /// Allocate and construct an AMDGPU kernel.
-  Expected<GenericKernelTy &>
-  constructKernel(const __tgt_offload_entry &KernelEntry) override {
+  Expected<GenericKernelTy &> constructKernel(const char *Name) override {
     // Allocate and construct the AMDGPU kernel.
     AMDGPUKernelTy *AMDGPUKernel = Plugin::get().allocate<AMDGPUKernelTy>();
     if (!AMDGPUKernel)
       return Plugin::error("Failed to allocate memory for AMDGPU kernel");
 
-    new (AMDGPUKernel) AMDGPUKernelTy(KernelEntry.name);
+    new (AMDGPUKernel) AMDGPUKernelTy(Name);
 
     return *AMDGPUKernel;
   }
@@ -2160,7 +2160,7 @@ struct AMDGPUDeviceTy : public GenericDeviceTy, AMDGenericDeviceTy {
 
     // Allocate and initialize the image object.
     AMDGPUDeviceImageTy *AMDImage =
        Plugin::get().allocate<AMDGPUDeviceImageTy>();
-    new (AMDImage) AMDGPUDeviceImageTy(ImageId, TgtImage);
+    new (AMDImage) AMDGPUDeviceImageTy(ImageId, *this, TgtImage);
 
     // Load the HSA executable.
     if (Error Err = AMDImage->loadExecutable(*this))
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
index 8707e7b4c504e..5c767995126b7 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/GlobalHandler.h
@@ -47,9 +47,6 @@ class GlobalTy {
   GlobalTy(const std::string &Name, uint32_t Size, void *Ptr = nullptr)
       : Name(Name), Size(Size), Ptr(Ptr) {}
 
-  GlobalTy(const __tgt_offload_entry &Entry)
-      : Name(Entry.name), Size(Entry.size), Ptr(Entry.addr) {}
-
   const std::string &getName() const { return Name; }
   uint32_t getSize() const { return Size; }
   void *getPtr() const { return Ptr; }
diff --git a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
index d55dfbdd9e4c1..3c2a4d7e6c0e7 100644
--- a/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
+++ b/openmp/libomptarget/plugins-nextgen/common/include/PluginInterface.h
@@ -182,34 +182,6 @@ class InfoQueueTy {
 /// specific device. This class is responsible for storing and managing
 /// the offload entries for an image on a device.
 class DeviceImageTy {
-
-  /// Class representing the offload entry table. The class stores the
-  /// __tgt_target_table and a map to search in the table faster.
-  struct OffloadEntryTableTy {
-    /// Add new entry to the table.
-    void addEntry(const __tgt_offload_entry &Entry) {
-      Entries.push_back(Entry);
-      TTTablePtr.EntriesBegin = &Entries[0];
-      TTTablePtr.EntriesEnd = TTTablePtr.EntriesBegin + Entries.size();
-    }
-
-    /// Get the raw pointer to the __tgt_target_table.
-    operator __tgt_target_table *() {
-      if (Entries.empty())
-        return nullptr;
-      return &TTTablePtr;
-    }
-
-  private:
-    __tgt_target_table TTTablePtr;
-    llvm::SmallVector<__tgt_offload_entry> Entries;
-
-  public:
-    using const_iterator = decltype(Entries)::const_iterator;
-    const_iterator begin() const { return Entries.begin(); }
-    const_iterator end() const { return Entries.end(); }
-  };
-
   /// Image identifier within the corresponding device. Notice that this id is
   /// not unique between different device; they may overlap.
   int32_t ImageId;
@@ -218,18 +190,19 @@ class DeviceImageTy {
   const __tgt_device_image *TgtImage;
   const __tgt_device_image *TgtImageBitcode;
 
+  /// Reference to the device this image is loaded on.
+  GenericDeviceTy &Device;
+
   /// If this image has any global destructors that must be called.
   /// FIXME: This is only required because we currently have no invariants
   /// towards the lifetime of the underlying image. We should either copy
   /// the image into memory locally or erase the pointers after init.
   bool PendingGlobalDtors;
 
-  /// Table of offload entries.
-  OffloadEntryTableTy OffloadEntryTable;
-
 public:
-  DeviceImageTy(int32_t Id, const __tgt_device_image *Image)
-      : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr),
+  DeviceImageTy(int32_t Id, GenericDeviceTy &Device,
+                const __tgt_device_image *Image)
+      : ImageId(Id), TgtImage(Image), TgtImageBitcode(nullptr), Device(Device),
        PendingGlobalDtors(false) {
     assert(TgtImage && "Invalid target image");
   }
@@ -237,6 +210,9 @@ class DeviceImageTy {
   /// Get the image identifier within the device.
   int32_t getId() const { return ImageId; }
 
+  /// Get the device that this image is loaded onto.
+  GenericDeviceTy &getDevice() const { return Device; }
+
   /// Get the pointer to the raw __tgt_device_image.
   const __tgt_device_image *getTgtImage() const { return TgtImage; }
 
@@ -261,13 +237,9 @@ class DeviceImageTy {
     return MemoryBufferRef(StringRef((const char *)getStart(), getSize()),
                            "Image");
   }
-
   /// Accessors to the boolean value
   bool setPendingGlobalDtors() { return PendingGlobalDtors = true; }
   bool hasPendingGlobalDtors() const { return PendingGlobalDtors; }
-
-  /// Get a reference to the offload entry table for the image.
-  OffloadEntryTableTy &getOffloadEntryTable() { return OffloadEntryTable; }
 };
 
 /// Class implementing common functionalities of offload kernels. Each plugin
@@ -661,8 +633,8 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   virtual Error deinitImpl() = 0;
 
   /// Load the binary image into the device and return the target table.
-  Expected<__tgt_target_table *> loadBinary(GenericPluginTy &Plugin,
-                                            const __tgt_device_image *TgtImage);
+  Expected<DeviceImageTy *> loadBinary(GenericPluginTy &Plugin,
+                                       const __tgt_device_image *TgtImage);
   virtual Expected<DeviceImageTy *>
      loadBinaryImpl(const __tgt_device_image *TgtImage, int32_t ImageId) = 0;
 
@@ -680,9 +652,6 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   // up to the target to override this using the shouldSetupRPCServer function.
   Error setupRPCServer(GenericPluginTy &Plugin, DeviceImageTy &Image);
 
-  /// Register the offload entries for a specific image on the device.
-  Error registerOffloadEntries(DeviceImageTy &Image);
-
   /// Synchronize the current thread with the pending operations on the
   /// __tgt_async_info structure.
   Error synchronize(__tgt_async_info *AsyncInfo);
@@ -888,21 +857,10 @@ struct GenericDeviceTy : public DeviceAllocatorTy {
   bool useAutoZeroCopy();
   virtual bool useAutoZeroCopyImpl() { return false; }
 
-private:
-  /// Register offload entry for global variable.
-  Error registerGlobalOffloadEntry(DeviceImageTy &DeviceImage,
-                                   const __tgt_offload_entry &GlobalEntry,
-                                   __tgt_offload_entry &DeviceEntry);
-
-  /// Register offload entry for kernel function.
-  Error registerKernelOffloadEntry(DeviceImageTy &DeviceImage,
-                                   const __tgt_offload_entry &KernelEntry,
-                                   __tgt_offload_entry &DeviceEntry);
-
   /// Allocate and construct a kernel object.
-  virtual Expected<GenericKernelTy &>
-  constructKernel(const __tgt_offload_entry &KernelEntry) = 0;
+  virtual Expected<GenericKernelTy &> constructKernel(const char *Name) = 0;
 
+private:
   /// Get and set the stack size and heap size for the device. If not used, the
   /// plugin can implement the setters as no-op and setting the output
   /// value to zero for the getters.
diff --git a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
index 6ae30e78ce8c2..def9c14fa53f8 100644
--- a/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
+++ b/openmp/libomptarget/plugins-nextgen/common/src/PluginInterface.cpp
@@ -61,6 +61,14 @@ struct RecordReplayTy {
   bool UsedVAMap = false;
   uintptr_t MemoryOffset = 0;
 
+  // A list of all globals mapped to the device.
+  struct GlobalEntry {
+    const char *Name;
+    uint64_t Size;
+    void *Addr;
+  };
+  llvm::SmallVector<GlobalEntry> GlobalEntries{};
+
   void *suggestAddress(uint64_t MaxMemoryAllocation) {
     // Get a valid pointer address for this system
     void *Addr =
@@ -189,6 +197,9 @@ struct RecordReplayTy {
   }
   void setStatus(RRStatusTy Status) { this->Status = Status; }
   bool isSaveOutputEnabled() const { return ReplaySaveOutput; }
+  void addEntry(const char *Name, uint64_t Size, void *Addr) {
+    GlobalEntries.emplace_back(GlobalEntry{Name, Size, Addr});
+  }
 
   void saveImage(const char *Name, const DeviceImageTy &Image) {
     SmallString<128> ImageName = {Name, ".image"};
@@ -211,12 +222,12 @@ struct RecordReplayTy {
 
   void dumpGlobals(StringRef Filename, DeviceImageTy &Image) {
     int32_t Size = 0;
 
-    for (auto &OffloadEntry : Image.getOffloadEntryTable()) {
-      if (!OffloadEntry.size)
+    for (auto &OffloadEntry : GlobalEntries) {
+      if (!OffloadEntry.Size)
         continue;
       // Get the total size of the string and entry including the null byte.
-      Size += std::strlen(OffloadEntry.name) + 1 + sizeof(uint32_t) +
-              OffloadEntry.size;
+      Size += std::strlen(OffloadEntry.Name) + 1 + sizeof(uint32_t) +
+              OffloadEntry.Size;
     }
 
     ErrorOr<std::unique_ptr<WritableMemoryBuffer>> GlobalsMB =
        WritableMemoryBuffer::getNewUninitMemBuffer(Size);
     if (!GlobalsMB)
       report_fatal_error("Error creating MemoryBuffer for globals memory");
 
     void *BufferPtr = GlobalsMB.get()->getBufferStart();
-    for (auto &OffloadEntry : Image.getOffloadEntryTable()) {
-      if (!OffloadEntry.size)
+    for (auto &OffloadEntry : GlobalEntries) {
+      if (!OffloadEntry.Size)
         continue;
 
-      int32_t NameLength = std::strlen(OffloadEntry.name) + 1;
-      memcpy(BufferPtr, OffloadEntry.name, NameLength);
+      int32_t NameLength = std::strlen(OffloadEntry.Name) + 1;
+      memcpy(BufferPtr, OffloadEntry.Name, NameLength);
       BufferPtr = advanceVoidPtr(BufferPtr, NameLength);
 
-      *((uint32_t *)(BufferPtr)) = OffloadEntry.size;
+      *((uint32_t *)(BufferPtr)) = OffloadEntry.Size;
       BufferPtr = advanceVoidPtr(BufferPtr, sizeof(uint32_t));
 
       auto Err = Plugin::success();
       {
-        if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.addr,
-                                            OffloadEntry.size, nullptr))
+        if (auto Err = Device->dataRetrieve(BufferPtr, OffloadEntry.Addr,
+                                            OffloadEntry.Size, nullptr))
          report_fatal_error("Error retrieving data for global");
       }
       if (Err)
         report_fatal_error("Error retrieving data for global");
 
-      BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.size);
+      BufferPtr = advanceVoidPtr(BufferPtr, OffloadEntry.Size);
     }
     assert(BufferPtr == GlobalsMB->get()->getBufferEnd() &&
            "Buffer over/under-filled.");
@@ -841,7 +852,7 @@ Error GenericDeviceTy::deinit(GenericPluginTy &Plugin) {
   return deinitImpl();
 }
 
-Expected<__tgt_target_table *>
+Expected<DeviceImageTy *>
 GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
                             const __tgt_device_image *InputTgtImage) {
   assert(InputTgtImage && "Expected non-null target image");
@@ -885,10 +896,6 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
     return std::move(Err);
   }
 
-  // Register all offload entries of the image.
-  if (auto Err = registerOffloadEntries(*Image))
-    return std::move(Err);
-
   if (auto Err = setupRPCServer(Plugin, *Image))
     return std::move(Err);
 
@@ -909,7 +916,7 @@ GenericDeviceTy::loadBinary(GenericPluginTy &Plugin,
     return std::move(Err);
 
   // Return the pointer to the table of entries.
-  return Image->getOffloadEntryTable();
+  return Image;
 }
 
 Error GenericDeviceTy::setupDeviceEnvironment(GenericPluginTy &Plugin,
@@ -1018,99 +1025,6 @@ Error GenericDeviceTy::setupRPCServer(GenericPluginTy &Plugin,
   return Plugin::success();
 }
 
-Error GenericDeviceTy::registerOffloadEntries(DeviceImageTy &Image) {
-  const __tgt_offload_entry *Begin = Image.getTgtImage()->EntriesBegin;
-  const __tgt_offload_entry *End = Image.getTgtImage()->EntriesEnd;
-  for (const __tgt_offload_entry *Entry = Begin; Entry != End; ++Entry) {
-    // The host should have always something in the address to uniquely
-    // identify the entry.
-    if (!Entry->addr)
-      return Plugin::error("Failure to register entry without address");
-
-    __tgt_offload_entry DeviceEntry = {0};
-
-    if (Entry->size) {
-      if (auto Err = registerGlobalOffloadEntry(Image, *Entry, DeviceEntry))
-        return Err;
-    } else {
-      if (auto Err = registerKernelOffloadEntry(Image, *Entry, DeviceEntry))
-        return Err;
-    }
-
-    assert(DeviceEntry.addr && "Device addr of offload entry cannot be null");
-
-    DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n",
-       DPxPTR(Entry - Begin), (Entry->size) ? " global" : "", Entry->name,
-       DPxPTR(DeviceEntry.addr));
-  }
-  return Plugin::success();
-}
-
-Error GenericDeviceTy::registerGlobalOffloadEntry(
-    DeviceImageTy &Image, const __tgt_offload_entry &GlobalEntry,
-    __tgt_offload_entry &DeviceEntry) {
-
-  GenericPluginTy &Plugin = Plugin::get();
-
-  DeviceEntry = GlobalEntry;
-
-  // Create a metadata object for the device global.
-  GlobalTy DeviceGlobal(GlobalEntry.name, GlobalEntry.size);
-
-  // Get the address of the device of the global.
-  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
-  if (auto Err =
-          GHandler.getGlobalMetadataFromDevice(*this, Image, DeviceGlobal))
-    return Err;
-
-  // Store the device address on the device entry.
-  DeviceEntry.addr = DeviceGlobal.getPtr();
-  assert(DeviceEntry.addr && "Invalid device global's address");
-
-  // Note: In the current implementation declare target variables
-  // can either be link or to. This means that once unified
-  // memory is activated via the requires directive, the variable
-  // can be used directly from the host in both cases.
-  if (Plugin.getRequiresFlags() & OMP_REQ_UNIFIED_SHARED_MEMORY) {
-    // If unified memory is present any target link or to variables
-    // can access host addresses directly. There is no longer a
-    // need for device copies.
-    GlobalTy HostGlobal(GlobalEntry);
-    if (auto Err =
-            GHandler.writeGlobalToDevice(*this, HostGlobal, DeviceGlobal))
-      return Err;
-  }
-
-  // Add the device entry on the entry table.
-  Image.getOffloadEntryTable().addEntry(DeviceEntry);
-
-  return Plugin::success();
-}
-
-Error GenericDeviceTy::registerKernelOffloadEntry(
-    DeviceImageTy &Image, const __tgt_offload_entry &KernelEntry,
-    __tgt_offload_entry &DeviceEntry) {
-  DeviceEntry = KernelEntry;
-
-  // Create a kernel object.
-  auto KernelOrErr = constructKernel(KernelEntry);
-  if (!KernelOrErr)
-    return KernelOrErr.takeError();
-
-  GenericKernelTy &Kernel = *KernelOrErr;
-
-  // Initialize the kernel.
-  if (auto Err = Kernel.init(*this, Image))
-    return Err;
-
-  // Set the device entry address to the kernel address and store the entry on
-  // the entry table.
-  DeviceEntry.addr = (void *)&Kernel;
-  Image.getOffloadEntryTable().addEntry(DeviceEntry);
-
-  return Plugin::success();
-}
-
 Error PinnedAllocationMapTy::insertEntry(void *HstPtr, void *DevAccessiblePtr,
                                          size_t Size, bool ExternallyLocked) {
   // Insert the new entry into the map.
@@ -1757,23 +1671,25 @@ int32_t __tgt_rtl_initialize_record_replay(int32_t DeviceId, int64_t MemorySize,
   return OFFLOAD_SUCCESS;
 }
 
-__tgt_target_table *__tgt_rtl_load_binary(int32_t DeviceId,
-                                          __tgt_device_image *TgtImage) {
+int32_t __tgt_rtl_load_binary(int32_t DeviceId, __tgt_device_image *TgtImage,
+                              __tgt_device_binary *Binary) {
   GenericPluginTy &Plugin = Plugin::get();
   GenericDeviceTy &Device = Plugin.getDevice(DeviceId);
 
-  auto TableOrErr = Device.loadBinary(Plugin, TgtImage);
-  if (!TableOrErr) {
-    auto Err = TableOrErr.takeError();
+  auto ImageOrErr = Device.loadBinary(Plugin, TgtImage);
+  if (!ImageOrErr) {
+    auto Err = ImageOrErr.takeError();
     REPORT("Failure to load binary image %p on device %d: %s\n", TgtImage,
            DeviceId, toString(std::move(Err)).data());
-    return nullptr;
+    return OFFLOAD_FAIL;
   }
 
-  __tgt_target_table *Table = *TableOrErr;
-  assert(Table != nullptr && "Invalid table");
+  DeviceImageTy *Image = *ImageOrErr;
+  assert(Image != nullptr && "Invalid Image");
+
+  *Binary = __tgt_device_binary{reinterpret_cast<uintptr_t>(Image)};
 
-  return Table;
+  return OFFLOAD_SUCCESS;
 }
 
 void *__tgt_rtl_data_alloc(int32_t DeviceId, int64_t Size, void *HostPtr,
@@ -2077,6 +1993,58 @@ int32_t __tgt_rtl_use_auto_zero_copy(int32_t DeviceId) {
     return false;
   return Plugin::get().getDevice(DeviceId).useAutoZeroCopy();
 }
+
+int32_t __tgt_rtl_get_global(__tgt_device_binary Binary, uint64_t Size,
+                             const char *Name, void **DevicePtr) {
+  assert(Binary.handle && "Invalid device binary handle");
+  DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
+
+  GenericPluginTy &Plugin = Plugin::get();
+  GenericDeviceTy &Device = Image.getDevice();
+
+  GlobalTy DeviceGlobal(Name, Size);
+  GenericGlobalHandlerTy &GHandler = Plugin.getGlobalHandler();
+  if (auto Err =
+          GHandler.getGlobalMetadataFromDevice(Device, Image, DeviceGlobal)) {
+    REPORT("Failure to look up global address: %s\n",
+           toString(std::move(Err)).data());
+    return OFFLOAD_FAIL;
+  }
+
+  *DevicePtr = DeviceGlobal.getPtr();
+  assert(DevicePtr && "Invalid device global's address");
+
+  // Save the loaded globals if we are recording.
+  if (RecordReplay.isRecording())
+    RecordReplay.addEntry(Name, Size, *DevicePtr);
+
+  return OFFLOAD_SUCCESS;
+}
+
+int32_t __tgt_rtl_get_function(__tgt_device_binary Binary, const char *Name,
+                               void **KernelPtr) {
+  assert(Binary.handle && "Invalid device binary handle");
+  DeviceImageTy &Image = *reinterpret_cast<DeviceImageTy *>(Binary.handle);
+
+  GenericDeviceTy &Device = Image.getDevice();
+
+  auto KernelOrErr = Device.constructKernel(Name);
+  if (Error Err = KernelOrErr.takeError()) {
+    REPORT("Failure to look up kernel: %s\n", toString(std::move(Err)).data());
+    return OFFLOAD_FAIL;
+  }
+
+  GenericKernelTy &Kernel = *KernelOrErr;
+  if (auto Err = Kernel.init(Device, Image)) {
+    REPORT("Failure to init kernel: %s\n", toString(std::move(Err)).data());
+    return OFFLOAD_FAIL;
+  }
+
+  // Note that this is not the kernel's device address.
+  *KernelPtr = &Kernel;
+  return OFFLOAD_SUCCESS;
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
index ce6b39898ae95..5ed73d103584d 100644
--- a/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
+++ b/openmp/libomptarget/plugins-nextgen/cuda/src/rtl.cpp
@@ -77,8 +77,9 @@ CUresult cuMemFreeAsync(CUdeviceptr dptr, CUstream hStream) {}
 /// Class implementing the CUDA device images properties.
struct CUDADeviceImageTy : public DeviceImageTy { /// Create the CUDA image with the id and the target image pointer. - CUDADeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) - : DeviceImageTy(ImageId, TgtImage), Module(nullptr) {} + CUDADeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, + const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, Device, TgtImage), Module(nullptr) {} /// Load the image as a CUDA module. Error loadModule() { @@ -468,14 +469,13 @@ struct CUDADeviceTy : public GenericDeviceTy { } /// Allocate and construct a CUDA kernel. - Expected - constructKernel(const __tgt_offload_entry &KernelEntry) override { + Expected constructKernel(const char *Name) override { // Allocate and construct the CUDA kernel. CUDAKernelTy *CUDAKernel = Plugin::get().allocate(); if (!CUDAKernel) return Plugin::error("Failed to allocate memory for CUDA kernel"); - new (CUDAKernel) CUDAKernelTy(KernelEntry.name); + new (CUDAKernel) CUDAKernelTy(Name); return *CUDAKernel; } @@ -530,7 +530,7 @@ struct CUDADeviceTy : public GenericDeviceTy { // Allocate and initialize the image object. CUDADeviceImageTy *CUDAImage = Plugin::get().allocate(); - new (CUDAImage) CUDADeviceImageTy(ImageId, TgtImage); + new (CUDAImage) CUDADeviceImageTy(ImageId, *this, TgtImage); // Load the CUDA module. if (auto Err = CUDAImage->loadModule()) diff --git a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp index 6466afc543b56..38fc275804faf 100644 --- a/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp +++ b/openmp/libomptarget/plugins-nextgen/generic-elf-64bit/src/rtl.cpp @@ -111,8 +111,9 @@ struct GenELF64KernelTy : public GenericKernelTy { /// Class implementing the GenELF64 device images properties. struct GenELF64DeviceImageTy : public DeviceImageTy { /// Create the GenELF64 image with the id and the target image pointer. - GenELF64DeviceImageTy(int32_t ImageId, const __tgt_device_image *TgtImage) - : DeviceImageTy(ImageId, TgtImage), DynLib() {} + GenELF64DeviceImageTy(int32_t ImageId, GenericDeviceTy &Device, + const __tgt_device_image *TgtImage) + : DeviceImageTy(ImageId, Device, TgtImage), DynLib() {} /// Getter and setter for the dynamic library. DynamicLibrary &getDynamicLibrary() { return DynLib; } @@ -141,15 +142,14 @@ struct GenELF64DeviceTy : public GenericDeviceTy { std::string getComputeUnitKind() const override { return "generic-64bit"; } /// Construct the kernel for a specific image on the device. - Expected - constructKernel(const __tgt_offload_entry &KernelEntry) override { + Expected constructKernel(const char *Name) override { // Allocate and construct the kernel. GenELF64KernelTy *GenELF64Kernel = Plugin::get().allocate(); if (!GenELF64Kernel) return Plugin::error("Failed to allocate memory for GenELF64 kernel"); - new (GenELF64Kernel) GenELF64KernelTy(KernelEntry.name); + new (GenELF64Kernel) GenELF64KernelTy(Name); return *GenELF64Kernel; } @@ -163,7 +163,7 @@ struct GenELF64DeviceTy : public GenericDeviceTy { // Allocate and initialize the image object. GenELF64DeviceImageTy *Image = Plugin::get().allocate(); - new (Image) GenELF64DeviceImageTy(ImageId, TgtImage); + new (Image) GenELF64DeviceImageTy(ImageId, *this, TgtImage); // Create a temporary file. 
 char TmpFileName[] = "/tmp/tmpfile_XXXXXX";
diff --git a/openmp/libomptarget/src/PluginManager.cpp b/openmp/libomptarget/src/PluginManager.cpp
index 50059ba23b1a7..f65ffc47d89a1 100644
--- a/openmp/libomptarget/src/PluginManager.cpp
+++ b/openmp/libomptarget/src/PluginManager.cpp
@@ -192,7 +192,9 @@ static void registerImageIntoTranslationTable(TranslationTable &TT,
       RTL.DeviceOffset + RTL.getNumberOfUserDevices();
 
   if (TT.TargetsTable.size() < TargetsTableMinimumSize) {
+    TT.DeviceTables.resize(TargetsTableMinimumSize, {});
     TT.TargetsImages.resize(TargetsTableMinimumSize, 0);
+    TT.TargetsEntries.resize(TargetsTableMinimumSize, {});
     TT.TargetsTable.resize(TargetsTableMinimumSize, 0);
   }
 
diff --git a/openmp/libomptarget/src/device.cpp b/openmp/libomptarget/src/device.cpp
index 404d7b6174e4a..9bdc6b7cd8c9a 100644
--- a/openmp/libomptarget/src/device.cpp
+++ b/openmp/libomptarget/src/device.cpp
@@ -107,8 +107,14 @@ llvm::Error DeviceTy::init() {
 }
 
 // Load binary to device.
-__tgt_target_table *DeviceTy::loadBinary(__tgt_device_image *Img) {
-  return RTL->load_binary(RTLDeviceID, Img);
+llvm::Expected<__tgt_device_binary>
+DeviceTy::loadBinary(__tgt_device_image *Img) {
+  __tgt_device_binary Binary;
+
+  if (RTL->load_binary(RTLDeviceID, Img, &Binary) != OFFLOAD_SUCCESS)
+    return llvm::createStringError(llvm::inconvertibleErrorCode(),
+                                   "Failed to load binary %p", Img);
+  return Binary;
 }
 
 void *DeviceTy::allocData(int64_t Size, void *HstPtr, int32_t Kind) {
diff --git a/openmp/libomptarget/src/omptarget.cpp b/openmp/libomptarget/src/omptarget.cpp
index eb2ecfc2bc56b..04490ab076b65 100644
--- a/openmp/libomptarget/src/omptarget.cpp
+++ b/openmp/libomptarget/src/omptarget.cpp
@@ -168,19 +168,57 @@ static int initLibrary(DeviceTy &Device) {
         Rc = OFFLOAD_FAIL;
         break;
       }
-      // 2) load image into the target table.
-      __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] =
-          Device.loadBinary(Img);
-      // Unable to get table for this image: invalidate image and fail.
-      if (!TargetTable) {
-        REPORT("Unable to generate entries table for device id %d.\n",
-               DeviceId);
-        TransTable->TargetsImages[DeviceId] = 0;
+
+      // 2) Load the image onto the given device.
+      auto BinaryOrErr = Device.loadBinary(Img);
+      if (llvm::Error Err = BinaryOrErr.takeError()) {
+        REPORT("Failed to load image %s\n",
+               llvm::toString(std::move(Err)).c_str());
         Rc = OFFLOAD_FAIL;
         break;
       }
-      // Verify whether the two table sizes match.
+      // 3) Create the translation table.
+      llvm::SmallVector<__tgt_offload_entry> &DeviceEntries =
+          TransTable->TargetsEntries[DeviceId];
+      for (__tgt_offload_entry &Entry :
+           llvm::make_range(Img->EntriesBegin, Img->EntriesEnd)) {
+        __tgt_device_binary &Binary = *BinaryOrErr;
+
+        __tgt_offload_entry DeviceEntry = Entry;
+        if (Entry.size) {
+          if (Device.RTL->get_global(Binary, Entry.size, Entry.name,
+                                     &DeviceEntry.addr) != OFFLOAD_SUCCESS)
+            REPORT("Failed to load symbol %s\n", Entry.name);
+
+          // If unified memory is active, the corresponding global is a device
+          // reference to the host global. We need to initialize the pointer on
+          // the device to point to the memory on the host.
+ if (PM->getRequirements() & OMP_REQ_UNIFIED_SHARED_MEMORY) { + if (Device.RTL->data_submit(DeviceId, DeviceEntry.addr, Entry.addr, + Entry.size) != OFFLOAD_SUCCESS) + REPORT("Failed to write symbol for USM %s\n", Entry.name); + } + } else { + if (Device.RTL->get_function(Binary, Entry.name, &DeviceEntry.addr) != + OFFLOAD_SUCCESS) + REPORT("Failed to load kernel %s\n", Entry.name); + } + DP("Entry point " DPxMOD " maps to%s %s (" DPxMOD ")\n", + DPxPTR(Entry.addr), (Entry.size) ? " global" : "", Entry.name, + DPxPTR(DeviceEntry.addr)); + + DeviceEntries.emplace_back(DeviceEntry); + } + + // Set the storage for the table and get a pointer to it. + __tgt_target_table DeviceTable{&DeviceEntries[0], + &DeviceEntries[0] + DeviceEntries.size()}; + TransTable->DeviceTables[DeviceId] = DeviceTable; + __tgt_target_table *TargetTable = TransTable->TargetsTable[DeviceId] = + &TransTable->DeviceTables[DeviceId]; + + // 4) Verify whether the two table sizes match. size_t Hsize = TransTable->HostTable.EntriesEnd - TransTable->HostTable.EntriesBegin; size_t Tsize = TargetTable->EntriesEnd - TargetTable->EntriesBegin; From 76160718df7c1f31ff50a4964d749c2b9d83f9cf Mon Sep 17 00:00:00 2001 From: OCHyams Date: Thu, 18 Jan 2024 14:28:18 +0000 Subject: [PATCH 430/843] Reapply [hwasan] Update dbg.assign intrinsics in HWAsan pass #78606 llvm.dbg.assign intrinsics have 2 {value, expression} pairs; fix hwasan to update the second expression. Fixes #76545 --- llvm/lib/IR/DebugInfo.cpp | 4 -- .../Instrumentation/HWAddressSanitizer.cpp | 5 ++ .../Transforms/Utils/MemoryTaggingSupport.cpp | 10 ++- .../AArch64/dbg-assign-tag-offset-mix-loc.ll | 71 +++++++++++++++++++ .../CodeGen/AArch64/dbg-assign-tag-offset.ll | 71 +++++++++++++++++++ .../declare-to-assign/hwasan.ll | 2 +- .../dbg-assign-tag-offset.ll | 59 +++++++++++++++ 7 files changed, 214 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll create mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll create mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index fcd3f77f8f6e2..2e64d0db57b25 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2200,10 +2200,6 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { if (F.hasFnAttribute(Attribute::OptimizeNone)) return /*Changed*/ false; - // FIXME: https://github.com/llvm/llvm-project/issues/76545 - if (F.hasFnAttribute(Attribute::SanitizeHWAddress)) - return /*Changed*/ false; - bool Changed = false; auto *DL = &F.getParent()->getDataLayout(); // Collect a map of {backing storage : dbg.declares} (currently "backing diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index efb621cde9065..633183dd8eac8 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1435,6 +1435,11 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, if (DDI->getVariableLocationOp(LocNo) == AI) DDI->setExpression(DIExpression::appendOpsToArg(DDI->getExpression(), NewOps, LocNo)); + if (auto *DAI = dyn_cast(DDI)) { + if (DAI->getAddress() == AI) + DAI->setAddressExpression( + DIExpression::prependOpcodes(DDI->getExpression(), NewOps)); + } } auto TagEnd = [&](Instruction *Node) { diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp 
b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index f94047633022c..d2efcde5d3803 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -138,16 +138,20 @@ void StackInfoBuilder::visit(Instruction &Inst) { return; } if (auto *DVI = dyn_cast(&Inst)) { - for (Value *V : DVI->location_ops()) { + auto AddIfInteresting = [&](Value *V) { if (auto *AI = dyn_cast_or_null(V)) { if (!isInterestingAlloca(*AI)) - continue; + return; AllocaInfo &AInfo = Info.AllocasToInstrument[AI]; auto &DVIVec = AInfo.DbgVariableIntrinsics; if (DVIVec.empty() || DVIVec.back() != DVI) DVIVec.push_back(DVI); } - } + }; + for (Value *V : DVI->location_ops()) + AddIfInteresting(V); + if (auto *DAI = dyn_cast(DVI)) + AddIfInteresting(DAI->getAddress()); } Instruction *ExitUntag = getUntagLocationIfFunctionExit(Inst); if (ExitUntag) diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll new file mode 100644 index 0000000000000..ef0dd46cb45c7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll @@ -0,0 +1,71 @@ +; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s + +;; Similar to dbg-assign-tag-offset.ll except the variable 'x' has been removed +;; and 'y' has an implicit location range as well as stack location range +;; (according to the hand-modified debug info -- see the dbg.value). + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-android24" + +; CHECK: DW_TAG_variable +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_LLVM_tag_offset (0x80) +; CHECK-NEXT: DW_AT_name ("y") + +define dso_local void @f() !dbg !14 { + %1 = alloca i32, align 4, !DIAssignID !31 + %2 = alloca i32, align 4, !DIAssignID !32 + call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + call void @llvm.dbg.value(metadata i32 2, metadata !20, metadata !DIExpression()), !dbg !22 + call void @use(ptr null), !dbg !28 + store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 + call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + call void @use(ptr nonnull %1), !dbg !28 + call void @use(ptr nonnull %2), !dbg !29 + ret void, !dbg !30 +} + +declare !dbg !5 void @use(ptr) + +declare void @llvm.dbg.value(metadata, metadata, metadata) +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} +!llvm.ident = !{!13} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) +!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") +!2 = !{} +!3 = !{!4, !5} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) +!6 = !DISubroutineType(types: !7) +!7 = !{null, !4} +!8 = !{i32 7, !"Dwarf Version", i32 4} +!9 = !{i32 2, !"Debug Info Version", i32 3} +!10 = !{i32 1, !"wchar_size", i32 4} +!11 = !{i32 7, !"PIC Level", i32 2} +!12 = !{i32 7, !"PIE 
Level", i32 2} +!13 = !{!"clang version 10.0.0"} +!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) +!15 = !DISubroutineType(types: !16) +!16 = !{null} +!17 = !{!18, !20} +!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) +!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) +!21 = !DILocation(line: 5, column: 3, scope: !14) +!22 = !DILocation(line: 0, scope: !14) +!23 = !DILocation(line: 5, column: 10, scope: !14) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !26, i64 0} +!26 = !{!"omnipotent char", !27, i64 0} +!27 = !{!"Simple C++ TBAA"} +!28 = !DILocation(line: 6, column: 3, scope: !14) +!29 = !DILocation(line: 7, column: 3, scope: !14) +!30 = !DILocation(line: 8, column: 1, scope: !14) +!31 = distinct !DIAssignID() +!32 = distinct !DIAssignID() +!33 = distinct !DIAssignID() +!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll new file mode 100644 index 0000000000000..a587f93d14d70 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll @@ -0,0 +1,71 @@ +; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-android24" + +; CHECK: DW_TAG_variable +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_LLVM_tag_offset (0x00) +; CHECK-NEXT: DW_AT_name ("x") + +; CHECK: DW_TAG_variable +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_LLVM_tag_offset (0x80) +; CHECK-NEXT: DW_AT_name ("y") + +define dso_local void @f() !dbg !14 { + %1 = alloca i32, align 4, !DIAssignID !31 + call void @llvm.dbg.assign(metadata i1 undef, metadata !18, metadata !DIExpression(), metadata !31, metadata ptr %1, metadata !DIExpression(DW_OP_LLVM_tag_offset, 0)), !dbg !22 + %2 = alloca i32, align 4, !DIAssignID !32 + call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 + call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + call void @use(ptr nonnull %1), !dbg !28 + call void @use(ptr nonnull %2), !dbg !29 + ret void, !dbg !30 +} + +declare !dbg !5 void @use(ptr) + +declare void @llvm.dbg.value(metadata, metadata, metadata) +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} +!llvm.ident = !{!13} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) +!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") +!2 = !{} +!3 = !{!4, !5} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) +!6 = !DISubroutineType(types: !7) +!7 = !{null, !4} +!8 = !{i32 
7, !"Dwarf Version", i32 4} +!9 = !{i32 2, !"Debug Info Version", i32 3} +!10 = !{i32 1, !"wchar_size", i32 4} +!11 = !{i32 7, !"PIC Level", i32 2} +!12 = !{i32 7, !"PIE Level", i32 2} +!13 = !{!"clang version 10.0.0"} +!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) +!15 = !DISubroutineType(types: !16) +!16 = !{null} +!17 = !{!18, !20} +!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) +!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) +!21 = !DILocation(line: 5, column: 3, scope: !14) +!22 = !DILocation(line: 0, scope: !14) +!23 = !DILocation(line: 5, column: 10, scope: !14) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !26, i64 0} +!26 = !{!"omnipotent char", !27, i64 0} +!27 = !{!"Simple C++ TBAA"} +!28 = !DILocation(line: 6, column: 3, scope: !14) +!29 = !DILocation(line: 7, column: 3, scope: !14) +!30 = !DILocation(line: 8, column: 1, scope: !14) +!31 = distinct !DIAssignID() +!32 = distinct !DIAssignID() +!33 = distinct !DIAssignID() +!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll index c4b209de77017..6c9366609cba2 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll @@ -1,6 +1,6 @@ ; RUN: opt %s -S -passes=declare-to-assign -o - | FileCheck %s -; CHECK: call void @llvm.dbg.declare +; CHECK: call void @llvm.dbg.assign define dso_local void @f() sanitize_hwaddress !dbg !9 { entry: diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll new file mode 100644 index 0000000000000..1248c0bc586ab --- /dev/null +++ b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll @@ -0,0 +1,59 @@ +; RUN: opt -passes=hwasan -S -o - %s | FileCheck %s + +source_filename = "test.ll" +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android" + +declare void @g(ptr, ptr, ptr, ptr, ptr, ptr) + +; Function Attrs: sanitize_hwaddress +define void @f() #0 !dbg !7 { +entry: + %nodebug0 = alloca ptr, align 8 + %nodebug1 = alloca ptr, align 8 + %nodebug2 = alloca ptr, align 8 + %nodebug3 = alloca ptr, align 8 + ; CHECK: %a = alloca{{.*}} !DIAssignID ![[ID1:[0-9]+]] + %a = alloca ptr, align 8, !DIAssignID !13 + ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID1]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 32) + call void @llvm.dbg.assign(metadata i1 undef, metadata !14, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !15 + ; CHECK: %b = alloca{{.*}} !DIAssignID ![[ID2:[0-9]+]] + %b = alloca ptr, align 8, !DIAssignID !16 + ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID2]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 96) + call void @llvm.dbg.assign(metadata i1 undef, metadata !17, metadata !DIExpression(), metadata !16, metadata ptr %b, metadata !DIExpression()), !dbg !15 + call void @g(ptr %nodebug0, ptr %nodebug1, ptr %nodebug2, ptr %nodebug3, ptr %a, ptr %b) + ret void, !dbg !18 +} + +; Function Attrs: nocallback nofree nosync 
nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.declare(metadata, metadata, metadata) #1
+
+; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none)
+declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) #1
+
+attributes #0 = { sanitize_hwaddress }
+attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
+
+!llvm.dbg.cu = !{!0}
+!llvm.module.flags = !{!3, !4, !5}
+!llvm.ident = !{!6}
+
+!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2)
+!1 = !DIFile(filename: "x.c", directory: "/")
+!2 = !{}
+!3 = !{i32 2, !"Dwarf Version", i32 4}
+!4 = !{i32 2, !"Debug Info Version", i32 3}
+!5 = !{i32 7, !"debug-info-assignment-tracking", i1 true}
+!6 = !{!"clang"}
+!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2)
+!8 = !DISubroutineType(types: !9)
+!9 = !{null, !10}
+!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64)
+!11 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !12)
+!12 = !DIBasicType(name: "char", size: 8, encoding: DW_ATE_signed_char)
+!13 = distinct !DIAssignID()
+!14 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 1, type: !10)
+!15 = !DILocation(line: 0, scope: !7)
+!16 = distinct !DIAssignID()
+!17 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 1, type: !10)
+!18 = !DILocation(line: 1, column: 37, scope: !7)

From a859df3b0a099648ec4cd305f22c87ea12ebaac9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Dominik=20W=C3=B3jt?=
Date: Mon, 22 Jan 2024 18:09:14 +0100
Subject: [PATCH 431/843] [libc++] Add "using-if-exists" to timespec_get in
 modules (#78686)

Picolibc does not provide the timespec_get function. Adding the
"using-if-exists" attribute fixes the module build.
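For illustration, the `_LIBCPP_USING_IF_EXISTS` macro in the diffs below
stands for Clang's `using_if_exists` attribute, which turns a
using-declaration for a name the C library never declares into a no-op
instead of a hard error. A minimal C++ sketch, spelling the attribute
directly rather than through the libc++ macro:

    // If the C library declares ::timespec_get, this imports it as usual.
    // If it does not (as on picolibc), the declaration is silently dropped,
    // and only an actual call to timespec_get would be diagnosed.
    using ::timespec_get __attribute__((__using_if_exists__));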
This is a follow-up to https://github.com/llvm/llvm-project/pull/78580.
---
 libcxx/modules/std.compat/ctime.inc                          | 2 +-
 libcxx/modules/std/ctime.inc                                 | 2 +-
 .../libcxx/selftest/modules/std-and-std.compat-module.sh.cpp | 4 ----
 libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp | 4 ----
 libcxx/test/std/modules/std.compat.pass.cpp                  | 4 ----
 5 files changed, 2 insertions(+), 14 deletions(-)

diff --git a/libcxx/modules/std.compat/ctime.inc b/libcxx/modules/std.compat/ctime.inc
index 92e3403a5e58e..6e621f494348d 100644
--- a/libcxx/modules/std.compat/ctime.inc
+++ b/libcxx/modules/std.compat/ctime.inc
@@ -24,5 +24,5 @@ export {
   using ::mktime;
   using ::strftime;
   using ::time;
-  using ::timespec_get;
+  using ::timespec_get _LIBCPP_USING_IF_EXISTS;
 } // export
diff --git a/libcxx/modules/std/ctime.inc b/libcxx/modules/std/ctime.inc
index c407ffc35e3fe..c98cb28e649b8 100644
--- a/libcxx/modules/std/ctime.inc
+++ b/libcxx/modules/std/ctime.inc
@@ -24,5 +24,5 @@ export namespace std {
   using std::mktime;
   using std::strftime;
   using std::time;
-  using std::timespec_get;
+  using std::timespec_get _LIBCPP_USING_IF_EXISTS;
 } // namespace std
diff --git a/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp b/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp
index 81241d7f43f9a..d56ebb2961d4a 100644
--- a/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp
+++ b/libcxx/test/libcxx/selftest/modules/std-and-std.compat-module.sh.cpp
@@ -12,10 +12,6 @@
 
 // XFAIL: has-no-cxx-module-support
 
-// picolibc does not provide the required timespec_get function, and the
-// "using-if-exists" mechanism apparently did not work here.
-// XFAIL: LIBCXX-PICOLIBC-FIXME
-
 // Make sure that the compile flags contain the expected elements.
 // The tests only look for the expected components and not the exact flags.
 // Otherwise changing the location of the module would break this test.
diff --git a/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp b/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp
index b74c2f1a249fc..e84709853fbca 100644
--- a/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp
+++ b/libcxx/test/libcxx/selftest/modules/std.compat-module.sh.cpp
@@ -12,10 +12,6 @@
 
 // XFAIL: has-no-cxx-module-support
 
-// picolibc does not provide the required timespec_get function, and the
-// "using-if-exists" mechanism apparently did not work here.
-// XFAIL: LIBCXX-PICOLIBC-FIXME
-
 // Make sure that the compile flags contain the expected elements.
 // The tests only look for the expected components and not the exact flags.
 // Otherwise changing the location of the module would break this test.
diff --git a/libcxx/test/std/modules/std.compat.pass.cpp b/libcxx/test/std/modules/std.compat.pass.cpp
index 40ea979e27346..e840f3c6b629c 100644
--- a/libcxx/test/std/modules/std.compat.pass.cpp
+++ b/libcxx/test/std/modules/std.compat.pass.cpp
@@ -12,10 +12,6 @@
 
 // XFAIL: has-no-cxx-module-support
 
-// picolibc does not provide the required timespec_get function, and the
-// "using-if-exists" mechanism apparently did not work here.
-// XFAIL: LIBCXX-PICOLIBC-FIXME
-
 // A minimal test to validate import works.
 
 // MODULE_DEPENDENCIES: std.compat
 
From 43b13341fbbb718223484a79a539a3c13062f39f Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Mon, 22 Jan 2024 09:09:46 -0800
Subject: [PATCH 432/843] [ELF] Add internal InputFile (#78944)

Based on https://reviews.llvm.org/D45375.
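As a sketch of the user-visible effect (abridged from the cref.s test
updated below, with the temporary paths shortened), absolute symbols
defined by a linker script or by --defsym now get a descriptive "File"
column in the --cref report instead of a blank entry:

    abs1    t.lds:1
            t3.o
    abs2    --defsym
            t3.o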
Introduce a new InputFile kind `InternalKind` and use it for:

* `ctx.internalFile`: for linker-defined symbols and some synthesized
  `Undefined`
* `createInternalFile`: for symbol assignments and --defsym

I picked "internal" instead of "synthetic" to avoid confusion with
SyntheticSection.

Currently a symbol's file is one of: nullptr, ObjKind, SharedKind,
BitcodeKind, BinaryKind. Now it's non-null (I plan to add an
`assert(file)` to Symbol::Symbol and change `toString(const InputFile *)`
separately). Debugging and error reporting are improved, and the
immediate user-facing difference is a more descriptive "File" column in
the --cref output.

This patch may unlock further simplification. Currently each symbol
assignment gets its own `createInternalFile(cmd->location)`, so two
symbol assignments in a linker script do not share the same file. Making
the file the same would be nice, but would require non-trivial code.
---
 lld/ELF/Arch/ARM.cpp                          |  6 ++--
 lld/ELF/Arch/Mips.cpp                         |  7 ++---
 lld/ELF/Arch/PPC64.cpp                        |  2 +-
 lld/ELF/Config.h                              |  2 ++
 lld/ELF/Driver.cpp                            |  6 +++-
 lld/ELF/InputFiles.cpp                        |  4 +++
 lld/ELF/InputFiles.h                          |  7 +++--
 lld/ELF/InputSection.cpp                      |  4 ++-
 lld/ELF/InputSection.h                        |  3 +-
 lld/ELF/LTO.cpp                               |  6 ++--
 lld/ELF/LinkerScript.cpp                      |  8 ++---
 lld/ELF/Relocations.cpp                       |  2 +-
 lld/ELF/ScriptLexer.h                         |  4 +--
 lld/ELF/ScriptParser.cpp                      |  3 +-
 lld/ELF/SyntheticSections.cpp                 |  4 +--
 lld/ELF/Target.cpp                            |  2 +-
 lld/ELF/Writer.cpp                            | 31 ++++++++++---------
 lld/test/ELF/cref.s                           | 11 ++++++-
 .../ELF/linkerscript/symbol-ordering-file2.s  |  5 +--
 lld/test/ELF/x86-64-gotpc-no-relax-err.s      |  3 ++
 20 files changed, 75 insertions(+), 45 deletions(-)

diff --git a/lld/ELF/Arch/ARM.cpp b/lld/ELF/Arch/ARM.cpp
index d34e74a11c6d8..687f9499009d5 100644
--- a/lld/ELF/Arch/ARM.cpp
+++ b/lld/ELF/Arch/ARM.cpp
@@ -1381,9 +1381,9 @@ template <class ELFT> void elf::writeARMCmseImportLib() {
   // Copy the secure gateway entry symbols to the import library symbol table.
for (auto &p : symtab.cmseSymMap) { Defined *d = cast(p.second.sym); - impSymTab->addSymbol(makeDefined(nullptr, d->getName(), d->computeBinding(), - /*stOther=*/0, STT_FUNC, d->getVA(), - d->getSize(), nullptr)); + impSymTab->addSymbol(makeDefined( + ctx.internalFile, d->getName(), d->computeBinding(), + /*stOther=*/0, STT_FUNC, d->getVA(), d->getSize(), nullptr)); } size_t idx = 0; diff --git a/lld/ELF/Arch/Mips.cpp b/lld/ELF/Arch/Mips.cpp index d6c70aeba95dd..b02ad10649d90 100644 --- a/lld/ELF/Arch/Mips.cpp +++ b/lld/ELF/Arch/Mips.cpp @@ -771,12 +771,11 @@ template bool elf::isMipsPIC(const Defined *sym) { if (!sym->section) return false; - ObjFile *file = - cast(sym->section)->template getFile(); - if (!file) + InputFile *file = cast(sym->section)->file; + if (!file || file->isInternal()) return false; - return file->getObj().getHeader().e_flags & EF_MIPS_PIC; + return cast>(file)->getObj().getHeader().e_flags & EF_MIPS_PIC; } template TargetInfo *elf::getMipsTargetInfo() { diff --git a/lld/ELF/Arch/PPC64.cpp b/lld/ELF/Arch/PPC64.cpp index 097a57514770a..de52f6a79a40b 100644 --- a/lld/ELF/Arch/PPC64.cpp +++ b/lld/ELF/Arch/PPC64.cpp @@ -253,7 +253,7 @@ static bool addOptional(StringRef name, uint64_t value, Symbol *sym = symtab.find(name); if (!sym || sym->isDefined()) return false; - sym->resolve(Defined{/*file=*/nullptr, StringRef(), STB_GLOBAL, STV_HIDDEN, + sym->resolve(Defined{ctx.internalFile, StringRef(), STB_GLOBAL, STV_HIDDEN, STT_FUNC, value, /*size=*/0, /*section=*/nullptr}); defined.push_back(cast(sym)); diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 56229334f9a44..3a4a47d900fe7 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -473,6 +473,8 @@ struct Ctx { std::pair> backwardReferences; llvm::SmallSet auxiliaryFiles; + // InputFile for linker created symbols with no source location. + InputFile *internalFile; // True if SHT_LLVM_SYMPART is used. std::atomic hasSympart{false}; // True if there are TLS IE relocations. Set DF_STATIC_TLS if -shared. diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index b988f4311e61b..62e1c29504ba2 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -105,6 +105,7 @@ void Ctx::reset() { whyExtractRecords.clear(); backwardReferences.clear(); auxiliaryFiles.clear(); + internalFile = nullptr; hasSympart.store(false, std::memory_order_relaxed); hasTlsIe.store(false, std::memory_order_relaxed); needsTlsLd.store(false, std::memory_order_relaxed); @@ -2337,7 +2338,8 @@ static void readSymbolPartitionSection(InputSectionBase *s) { static Symbol *addUnusedUndefined(StringRef name, uint8_t binding = STB_GLOBAL) { - return symtab.addSymbol(Undefined{nullptr, name, binding, STV_DEFAULT, 0}); + return symtab.addSymbol( + Undefined{ctx.internalFile, name, binding, STV_DEFAULT, 0}); } static void markBuffersAsDontNeed(bool skipLinkedOutput) { @@ -2696,6 +2698,8 @@ void LinkerDriver::link(opt::InputArgList &args) { for (auto *arg : args.filtered(OPT_trace_symbol)) symtab.insert(arg->getValue())->traced = true; + ctx.internalFile = createInternalFile(""); + // Handle -u/--undefined before input files. If both a.a and b.so define foo, // -u foo a.a b.so will extract a.a. 
for (StringRef name : config->undefined) diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index 147afcf306980..af42b0381e991 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1785,6 +1785,10 @@ void BinaryFile::parse() { nullptr}); } +InputFile *elf::createInternalFile(StringRef name) { + return make(InputFile::InternalKind, MemoryBufferRef("", name)); +} + ELFFileBase *elf::createObjFile(MemoryBufferRef mb, StringRef archiveName, bool lazy) { ELFFileBase *f; diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h index ab98d78fcf145..833124275b2e1 100644 --- a/lld/ELF/InputFiles.h +++ b/lld/ELF/InputFiles.h @@ -63,14 +63,17 @@ class InputFile { SharedKind, BitcodeKind, BinaryKind, + InternalKind, }; + InputFile(Kind k, MemoryBufferRef m); Kind kind() const { return fileKind; } bool isElf() const { Kind k = kind(); return k == ObjKind || k == SharedKind; } + bool isInternal() const { return kind() == InternalKind; } StringRef getName() const { return mb.getBufferIdentifier(); } MemoryBufferRef mb; @@ -151,9 +154,6 @@ class InputFile { // R_PPC64_TLSLD. Disable TLS relaxation to avoid bad code generation. bool ppc64DisableTLSRelax = false; -protected: - InputFile(Kind k, MemoryBufferRef m); - public: // If not empty, this stores the name of the archive containing this file. // We use this string for creating error messages. @@ -380,6 +380,7 @@ class BinaryFile : public InputFile { void parse(); }; +InputFile *createInternalFile(StringRef name); ELFFileBase *createObjFile(MemoryBufferRef mb, StringRef archiveName = "", bool lazy = false); diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index 586404643cc1f..c728dd6c6306a 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -245,6 +245,8 @@ InputSection *InputSectionBase::getLinkOrderDep() const { // Find a symbol that encloses a given location. Defined *InputSectionBase::getEnclosingSymbol(uint64_t offset, uint8_t type) const { + if (file->isInternal()) + return nullptr; for (Symbol *b : file->getSymbols()) if (Defined *d = dyn_cast(b)) if (d->section == this && d->value <= offset && @@ -344,7 +346,7 @@ template void InputSection::copyShtGroup(uint8_t *buf) { } InputSectionBase *InputSection::getRelocatedSection() const { - if (!file || (type != SHT_RELA && type != SHT_REL)) + if (!file || file->isInternal() || (type != SHT_RELA && type != SHT_REL)) return nullptr; ArrayRef sections = file->getSections(); return sections[info]; diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index fbaea57bd586b..dda4242d8be1c 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -9,6 +9,7 @@ #ifndef LLD_ELF_INPUT_SECTION_H #define LLD_ELF_INPUT_SECTION_H +#include "Config.h" #include "Relocations.h" #include "lld/Common/CommonLinkerContext.h" #include "lld/Common/LLVM.h" @@ -413,7 +414,7 @@ class SyntheticSection : public InputSection { public: SyntheticSection(uint64_t flags, uint32_t type, uint32_t addralign, StringRef name) - : InputSection(nullptr, flags, type, addralign, {}, name, + : InputSection(ctx.internalFile, flags, type, addralign, {}, name, InputSectionBase::Synthetic) {} virtual ~SyntheticSection() = default; diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp index 504c12aac6c56..728e15a6976ab 100644 --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -256,10 +256,12 @@ void BitcodeCompiler::add(BitcodeFile &f) { // Symbol section is always null for bitcode symbols, hence the check // for isElf(). 
Skip linker script defined symbols as well: they have // no File defined. - !(dr->section == nullptr && (!sym->file || sym->file->isElf())); + !(dr->section == nullptr && + (sym->file->isInternal() || sym->file->isElf())); if (r.Prevailing) - Undefined(nullptr, StringRef(), STB_GLOBAL, STV_DEFAULT, sym->type) + Undefined(ctx.internalFile, StringRef(), STB_GLOBAL, STV_DEFAULT, + sym->type) .overwrite(*sym); // We tell LTO to not apply interprocedural optimization for wrapped diff --git a/lld/ELF/LinkerScript.cpp b/lld/ELF/LinkerScript.cpp index 03aec187668a4..9e7647f63ca5a 100644 --- a/lld/ELF/LinkerScript.cpp +++ b/lld/ELF/LinkerScript.cpp @@ -233,8 +233,8 @@ void LinkerScript::addSymbol(SymbolAssignment *cmd) { // write expressions like this: `alignment = 16; . = ALIGN(., alignment)`. uint64_t symValue = value.sec ? 0 : value.getValue(); - Defined newSym(nullptr, cmd->name, STB_GLOBAL, visibility, value.type, - symValue, 0, sec); + Defined newSym(createInternalFile(cmd->location), cmd->name, STB_GLOBAL, + visibility, value.type, symValue, 0, sec); Symbol *sym = symtab.insert(cmd->name); sym->mergeProperties(newSym); @@ -250,8 +250,8 @@ static void declareSymbol(SymbolAssignment *cmd) { return; uint8_t visibility = cmd->hidden ? STV_HIDDEN : STV_DEFAULT; - Defined newSym(nullptr, cmd->name, STB_GLOBAL, visibility, STT_NOTYPE, 0, 0, - nullptr); + Defined newSym(ctx.internalFile, cmd->name, STB_GLOBAL, visibility, + STT_NOTYPE, 0, 0, nullptr); // If the symbol is already defined, its order is 0 (with absence indicating // 0); otherwise it's assigned the order of the SymbolAssignment. diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 59b0220795871..b6a317bc3b6d6 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -1757,7 +1757,7 @@ void elf::postScanRelocations() { GotSection *got = in.got.get(); if (ctx.needsTlsLd.load(std::memory_order_relaxed) && got->addTlsIndex()) { - static Undefined dummy(nullptr, "", STB_LOCAL, 0, 0); + static Undefined dummy(ctx.internalFile, "", STB_LOCAL, 0, 0); if (config->shared) mainPart->relaDyn->addReloc( {target->tlsModuleIndexRel, got, got->getTlsIndexOff()}); diff --git a/lld/ELF/ScriptLexer.h b/lld/ELF/ScriptLexer.h index 17c0b529fa176..7919e493fa28b 100644 --- a/lld/ELF/ScriptLexer.h +++ b/lld/ELF/ScriptLexer.h @@ -32,6 +32,7 @@ class ScriptLexer { void expect(StringRef expect); bool consumeLabel(StringRef tok); std::string getCurrentLocation(); + MemoryBufferRef getCurrentMB(); std::vector mbs; std::vector tokens; @@ -41,9 +42,6 @@ class ScriptLexer { size_t lastLineNumber = 0; size_t lastLineNumberOffset = 0; -protected: - MemoryBufferRef getCurrentMB(); - private: void maybeSplitExpr(); StringRef getLine(); diff --git a/lld/ELF/ScriptParser.cpp b/lld/ELF/ScriptParser.cpp index 4fdb8c7075a61..dd69916d6b05e 100644 --- a/lld/ELF/ScriptParser.cpp +++ b/lld/ELF/ScriptParser.cpp @@ -288,7 +288,8 @@ void ScriptParser::readDefsym(StringRef name) { Expr e = readExpr(); if (!atEOF()) setError("EOF expected, but got " + next()); - auto *cmd = make(name, e, 0, getCurrentLocation()); + auto *cmd = make( + name, e, 0, getCurrentMB().getBufferIdentifier().str()); script->sectionCommands.push_back(cmd); } diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index 1c1b0ee2f9c8c..4b413163314b2 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -261,8 +261,8 @@ InputSection *elf::createInterpSection() { StringRef s = saver().save(config->dynamicLinker); ArrayRef contents = {(const 
uint8_t *)s.data(), s.size() + 1}; - return make(nullptr, SHF_ALLOC, SHT_PROGBITS, 1, contents, - ".interp"); + return make(ctx.internalFile, SHF_ALLOC, SHT_PROGBITS, 1, + contents, ".interp"); } Defined *elf::addSyntheticLocal(StringRef name, uint8_t type, uint64_t value, diff --git a/lld/ELF/Target.cpp b/lld/ELF/Target.cpp index 41990f40f68b8..671d22cc66a0e 100644 --- a/lld/ELF/Target.cpp +++ b/lld/ELF/Target.cpp @@ -112,7 +112,7 @@ ErrorPlace elf::getErrorPlace(const uint8_t *loc) { std::string objLoc = isec->getLocation(loc - isecLoc); // Return object file location and source file location. // TODO: Refactor getSrcMsg not to take a variable. - Undefined dummy(nullptr, "", STB_LOCAL, 0, 0); + Undefined dummy(ctx.internalFile, "", STB_LOCAL, 0, 0); return {isec, objLoc + ": ", isec->file ? isec->getSrcMsg(dummy, loc - isecLoc) : ""}; } diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index dfec5e07301a7..b18d2239dc58f 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -148,23 +148,24 @@ static Defined *addOptionalRegular(StringRef name, SectionBase *sec, if (!s || s->isDefined() || s->isCommon()) return nullptr; - s->resolve(Defined{nullptr, StringRef(), STB_GLOBAL, stOther, STT_NOTYPE, val, + s->resolve(Defined{ctx.internalFile, StringRef(), STB_GLOBAL, stOther, + STT_NOTYPE, val, /*size=*/0, sec}); s->isUsedInRegularObj = true; return cast(s); } -static Defined *addAbsolute(StringRef name) { - Symbol *sym = symtab.addSymbol(Defined{nullptr, name, STB_GLOBAL, STV_HIDDEN, - STT_NOTYPE, 0, 0, nullptr}); - sym->isUsedInRegularObj = true; - return cast(sym); -} - // The linker is expected to define some symbols depending on // the linking result. This function defines such symbols. void elf::addReservedSymbols() { if (config->emachine == EM_MIPS) { + auto addAbsolute = [](StringRef name) { + Symbol *sym = + symtab.addSymbol(Defined{ctx.internalFile, name, STB_GLOBAL, + STV_HIDDEN, STT_NOTYPE, 0, 0, nullptr}); + sym->isUsedInRegularObj = true; + return cast(sym); + }; // Define _gp for MIPS. st_value of _gp symbol will be updated by Writer // so that it points to an absolute address which by default is relative // to GOT. Default offset is 0x7ff0. @@ -213,7 +214,7 @@ void elf::addReservedSymbols() { if (config->emachine == EM_PPC64) gotOff = 0x8000; - s->resolve(Defined{/*file=*/nullptr, StringRef(), STB_GLOBAL, STV_HIDDEN, + s->resolve(Defined{ctx.internalFile, StringRef(), STB_GLOBAL, STV_HIDDEN, STT_NOTYPE, gotOff, /*size=*/0, Out::elfHeader}); ElfSym::globalOffsetTable = cast(s); } @@ -280,7 +281,8 @@ static void demoteSymbolsAndComputeIsPreemptible() { auto *s = dyn_cast(sym); if (sym->isLazy() || (s && !cast(s->file)->isNeeded)) { uint8_t binding = sym->isLazy() ? sym->binding : uint8_t(STB_WEAK); - Undefined(nullptr, sym->getName(), binding, sym->stOther, sym->type) + Undefined(ctx.internalFile, sym->getName(), binding, sym->stOther, + sym->type) .overwrite(*sym); sym->versionId = VER_NDX_GLOBAL; } @@ -1922,7 +1924,7 @@ template void Writer::finalizeSections() { // https://sourceware.org/ml/binutils/2002-03/msg00360.html if (mainPart->dynamic->parent) { Symbol *s = symtab.addSymbol(Defined{ - /*file=*/nullptr, "_DYNAMIC", STB_WEAK, STV_HIDDEN, STT_NOTYPE, + ctx.internalFile, "_DYNAMIC", STB_WEAK, STV_HIDDEN, STT_NOTYPE, /*value=*/0, /*size=*/0, mainPart->dynamic.get()}); s->isUsedInRegularObj = true; } @@ -1964,7 +1966,7 @@ template void Writer::finalizeSections() { // define _TLS_MODULE_BASE_ relative to the first TLS section. 
Symbol *s = symtab.find("_TLS_MODULE_BASE_"); if (s && s->isUndefined()) { - s->resolve(Defined{/*file=*/nullptr, StringRef(), STB_GLOBAL, + s->resolve(Defined{ctx.internalFile, StringRef(), STB_GLOBAL, STV_HIDDEN, STT_TLS, /*value=*/0, 0, /*section=*/nullptr}); ElfSym::tlsModuleBase = cast(s); @@ -2106,8 +2108,9 @@ template void Writer::finalizeSections() { // With the outputSections available check for GDPLT relocations // and add __tls_get_addr symbol if needed. if (config->emachine == EM_HEXAGON && hexagonNeedsTLSSymbol(outputSections)) { - Symbol *sym = symtab.addSymbol(Undefined{ - nullptr, "__tls_get_addr", STB_GLOBAL, STV_DEFAULT, STT_NOTYPE}); + Symbol *sym = + symtab.addSymbol(Undefined{ctx.internalFile, "__tls_get_addr", + STB_GLOBAL, STV_DEFAULT, STT_NOTYPE}); sym->isPreemptible = true; partitions[0].dynSymTab->addSymbol(sym); } diff --git a/lld/test/ELF/cref.s b/lld/test/ELF/cref.s index 662a2ce339c33..ad87a419a55f8 100644 --- a/lld/test/ELF/cref.s +++ b/lld/test/ELF/cref.s @@ -3,11 +3,12 @@ // RUN: echo '.global foo; foo:' | llvm-mc -filetype=obj -triple=x86_64-pc-linux - -o %t1.o // RUN: echo '.global foo, bar; bar:' | llvm-mc -filetype=obj -triple=x86_64-pc-linux - -o %t2.o // RUN: echo '.global zed; zed:' | llvm-mc -filetype=obj -triple=x86_64-pc-linux - -o %ta.o +// RUN: echo 'abs1 = 42;' > %t.lds // RUN: rm -f %t.a // RUN: llvm-ar rcs %t.a %ta.o // RUN: llvm-mc -filetype=obj -triple=x86_64-pc-linux %s -o %t3.o // RUN: ld.lld -shared -o %t1.so %t1.o -// RUN: ld.lld -o /dev/null %t1.so %t2.o %t3.o %t.a --gc-sections --cref | FileCheck -strict-whitespace %s +// RUN: ld.lld -o /dev/null %t1.so %t2.o %t3.o %t.a %t.a %t.lds --defsym abs2=43 --gc-sections --cref | FileCheck -strict-whitespace %s /// If -Map is specified, print to the map file. // RUN: ld.lld -o /dev/null %t1.so %t2.o %t3.o %t.a --gc-sections -Map=%t.map --cref @@ -26,6 +27,10 @@ // CHECK-NEXT: baz {{.*}}3.o // CHECK-NEXT: zed {{.*}}.a({{.*}}a.o) // CHECK-NEXT: {{.*}}3.o +// CHECK-NEXT: abs1 {{.*}}.lds:1 +// CHECK-NEXT: {{.*}}3.o +// CHECK-NEXT: abs2 --defsym{{$}} +// CHECK-NEXT: {{.*}}3.o // CHECK-NOT: discarded // CHECK2: VMA LMA Size Align Out In Symbol @@ -46,3 +51,7 @@ baz: .section .text.a,"ax",@progbits discarded: + +.data +.quad abs1 +.quad abs2 diff --git a/lld/test/ELF/linkerscript/symbol-ordering-file2.s b/lld/test/ELF/linkerscript/symbol-ordering-file2.s index 31746ae0a333c..e441aa2afaa3a 100644 --- a/lld/test/ELF/linkerscript/symbol-ordering-file2.s +++ b/lld/test/ELF/linkerscript/symbol-ordering-file2.s @@ -7,10 +7,11 @@ # RUN: echo "SECTIONS { bar = 1; }" > %t.script # RUN: ld.lld --symbol-ordering-file %t.ord %t.o --script %t.script \ # RUN: -o %t.out 2>&1 | FileCheck %s -# CHECK: warning: : unable to order absolute symbol: bar +# CHECK: warning: {{.*}}.script:1: unable to order absolute symbol: bar ## Check we do not crash when trying to order --defsym symbol. # RUN: echo "bar" > %t.ord # RUN: ld.lld --symbol-ordering-file %t.ord %t.o -defsym=bar=1 \ -# RUN: -o %t.out 2>&1 | FileCheck %s +# RUN: -o %t.out 2>&1 | FileCheck %s --check-prefix=DEFSYM +# DEFSYM: warning: --defsym: unable to order absolute symbol: bar diff --git a/lld/test/ELF/x86-64-gotpc-no-relax-err.s b/lld/test/ELF/x86-64-gotpc-no-relax-err.s index 84b0d304df4ac..618dca47755f4 100644 --- a/lld/test/ELF/x86-64-gotpc-no-relax-err.s +++ b/lld/test/ELF/x86-64-gotpc-no-relax-err.s @@ -8,7 +8,10 @@ ## associated file or linker script line number). 
# CHECK: error: {{.*}}:(.text+0x2): relocation R_X86_64_GOTPCRELX out of range: 2147483658 is not in [-2147483648, 2147483647]; references '__stop_data' +# CHECK-NEXT: >>> defined in +# CHECK-EMPTY: # CHECK-NEXT: error: {{.*}}:(.text+0x9): relocation R_X86_64_REX_GOTPCRELX out of range: 2147483651 is not in [-2147483648, 2147483647]; references '__stop_data' +# CHECK-NEXT: >>> defined in #--- a.s movl __stop_data@GOTPCREL(%rip), %eax # out of range From c083b38007c4895cee8b175bd54d24be9fdb5fc0 Mon Sep 17 00:00:00 2001 From: Jon Roelofs Date: Mon, 22 Jan 2024 09:10:16 -0800 Subject: [PATCH 433/843] [builtins][FMV][Apple] Use builtin atomic load/store, instead of libdispatch (#78807) --- .../builtins/cpu_model/aarch64/fmv/apple.inc | 28 ++++++++++--------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc index 589c3ca72e7b2..0bb755f4b305c 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/apple.inc @@ -14,19 +14,17 @@ static bool isKnownAndSupported(const char *name) { void __init_cpu_features_resolver(void) { // On Darwin platforms, this may be called concurrently by multiple threads // because the resolvers that use it are called lazily at runtime (unlike on - // ELF platforms, where IFuncs are resolved serially at load time). This - // function's effect on __aarch64_cpu_features should be idempotent, but even - // so we need dispatch_once to resolve the race condition. Dispatch is - // available through libSystem, which we need anyway for the sysctl, so this - // does not add a new dependency. + // ELF platforms, where IFuncs are resolved serially at load time). This + // function's effect on __aarch64_cpu_features must be idempotent. + + if (!__atomic_load_n(&__aarch64_cpu_features.features, __ATOMIC_RELAXED)) { + uint64_t features = 0; - static dispatch_once_t onceToken = 0; - dispatch_once(&onceToken, ^{ // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics static const struct { const char *sysctl_name; enum CPUFeatures feature; - } features[] = { + } feature_checks[] = { {"hw.optional.arm.FEAT_FlagM", FEAT_FLAGM}, {"hw.optional.arm.FEAT_FlagM2", FEAT_FLAGM2}, {"hw.optional.arm.FEAT_FHM", FEAT_FP16FML}, @@ -58,12 +56,16 @@ void __init_cpu_features_resolver(void) { {"hw.optional.arm.FEAT_BTI", FEAT_BTI}, }; - for (size_t I = 0, E = sizeof(features) / sizeof(features[0]); I != E; ++I) - if (isKnownAndSupported(features[I].sysctl_name)) - __aarch64_cpu_features.features |= (1ULL << features[I].feature); + for (size_t I = 0, E = sizeof(feature_checks) / sizeof(feature_checks[0]); + I != E; ++I) + if (isKnownAndSupported(feature_checks[I].sysctl_name)) + features |= (1ULL << feature_checks[I].feature); + + features |= (1ULL << FEAT_INIT); - __aarch64_cpu_features.features |= (1ULL << FEAT_INIT); - }); + __atomic_store(&__aarch64_cpu_features.features, &features, + __ATOMIC_RELAXED); + } } #endif // TARGET_OS_OSX || TARGET_OS_IPHONE From 9a90aa01edfc5a56d975658a0cbbfa980a0ef7ed Mon Sep 17 00:00:00 2001 From: Gareth Williamson Date: Mon, 22 Jan 2024 18:11:45 +0100 Subject: [PATCH 434/843] [libc++][modules] Add using_if_exists attribute (#77559) (#78909) libc on macOS does not provide at_quick_exit or quick_exit. This allows modules to build on macOS and defer any errors to usage site of these symbols. 
Fixes: https://github.com/llvm/llvm-project/issues/77559 --- libcxx/modules/std.compat/cstdlib.inc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/modules/std.compat/cstdlib.inc b/libcxx/modules/std.compat/cstdlib.inc index 9333d84870710..a45a0a1caf8ba 100644 --- a/libcxx/modules/std.compat/cstdlib.inc +++ b/libcxx/modules/std.compat/cstdlib.inc @@ -16,10 +16,10 @@ export { // [support.start.term], start and termination using ::_Exit; using ::abort; - using ::at_quick_exit; + using ::at_quick_exit _LIBCPP_USING_IF_EXISTS; using ::atexit; using ::exit; - using ::quick_exit; + using ::quick_exit _LIBCPP_USING_IF_EXISTS; using ::getenv; using ::system; From 312acdfae1409bee29a33f71306e3ae3e1ea7d66 Mon Sep 17 00:00:00 2001 From: Matthew Devereau Date: Mon, 22 Jan 2024 17:12:16 +0000 Subject: [PATCH 435/843] [AArch64][SME] Take arm_sme.h out of draft (#78961) --- clang/lib/Headers/CMakeLists.txt | 6 +++--- .../CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c | 2 +- .../CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c | 2 +- clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c | 2 +- clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c | 2 +- .../CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c | 2 +- clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c | 2 +- .../CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c | 2 +- .../CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c | 2 +- .../CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c | 2 +- .../CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c | 2 +- clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c | 2 +- clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c | 2 +- .../CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c | 2 +- .../CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c | 2 +- clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c | 2 +- .../test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c | 2 +- clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c | 2 +- 
.../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c | 2 +- .../acle_sme2_reinterpret_svcount_svbool.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c | 2 +- .../test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_vector_add.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c | 2 +- .../aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c | 2 +- .../CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c | 2 +- clang/test/Sema/aarch64-incompat-sm-builtin-calls.c | 2 +- clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp | 2 +- clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c | 2 +- clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 2 +- clang/utils/TableGen/SveEmitter.cpp | 4 ++-- llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn | 8 ++++---- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 6 +++--- 68 files changed, 76 insertions(+), 76 deletions(-) diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 735e4e4e3be89..b9a966be73ee9 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -380,8 +380,8 @@ if(ARM IN_LIST LLVM_TARGETS_TO_BUILD OR AArch64 IN_LIST LLVM_TARGETS_TO_BUILD) clang_generate_header(-gen-arm-fp16 arm_fp16.td arm_fp16.h) # Generate arm_sve.h clang_generate_header(-gen-arm-sve-header arm_sve.td arm_sve.h) - # Generate arm_sme_draft_spec_subject_to_change.h - clang_generate_header(-gen-arm-sme-header arm_sme.td arm_sme_draft_spec_subject_to_change.h) + # Generate arm_sme.h + clang_generate_header(-gen-arm-sme-header arm_sme.td arm_sme.h) # Generate arm_bf16.h clang_generate_header(-gen-arm-bf16 arm_bf16.td arm_bf16.h) # Generate arm_mve.h @@ -404,7 +404,7 @@ if(ARM IN_LIST LLVM_TARGETS_TO_BUILD OR AArch64 IN_LIST LLVM_TARGETS_TO_BUILD) list(APPEND aarch64_only_generated_files "${CMAKE_CURRENT_BINARY_DIR}/arm_sve.h" - "${CMAKE_CURRENT_BINARY_DIR}/arm_sme_draft_spec_subject_to_change.h" + "${CMAKE_CURRENT_BINARY_DIR}/arm_sme.h" "${CMAKE_CURRENT_BINARY_DIR}/arm_bf16.h" "${CMAKE_CURRENT_BINARY_DIR}/arm_vector_types.h" ) diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c index 2ee14f6a7e882..695e0afa3d0de 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i32.c @@ -6,7 +6,7 @@ // RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS 
#define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c index a0fb9bdb4e25a..a3c3d8bf13db6 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_add-i64.c @@ -6,7 +6,7 @@ // RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-i16i64 -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include +#include #ifdef SME_OVERLOADED_FORMS #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3 diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c index 09b183dac3e26..2c2f100ac7f8c 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_cnt.c @@ -4,7 +4,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include +#include // CHECK-C-LABEL: define dso_local i64 @test_svcntsb( // CHECK-C-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] { diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c index 183a3986212bc..0502073097d54 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c @@ -4,7 +4,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include +#include // CHECK-C-LABEL: define dso_local void @test_svld1_hor_za8( // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c index 68a294b1a237a..60feebced32d2 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1_vnum.c @@ -4,7 +4,7 @@ // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s -#include +#include // CHECK-C-LABEL: define dso_local void @test_svld1_hor_vnum_za8( // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c index 56eb7f45f6d9a..b0c1dd904284f 100644 --- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ldr.c +++ 
@@ -4,7 +4,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-C-LABEL: @test_svldr_vnum_za(
 // CHECK-CXX-LABEL: @_Z18test_svldr_vnum_zajPKv(
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
index 5015f9bc9f41e..3dcc2c70d3cfc 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za32.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SME_OVERLOADED_FORMS
 #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
index 80457359e2a4e..06a6a19ca3f7a 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mopa-za64.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SME_OVERLOADED_FORMS
 #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
index 7dad7289d98ad..69641df8a80fb 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za32.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SME_OVERLOADED_FORMS
 #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
index e95a5ea2e6941..d726855c9743f 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_mops-za64.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme-f64f64 -target-feature +sme-i16i64 -target-feature +sve -target-feature +bf16 -S -O1 -Werror -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SME_OVERLOADED_FORMS
 #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
index 700564c1532e0..ed1e70a3a469f 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_read.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SME_OVERLOADED_FORMS
 #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
index 97e20ab262902..f93f3e5138b24 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c
@@ -4,7 +4,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-C-LABEL: define dso_local void @test_svst1_hor_za8(
 // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
index 7566ad8889e05..b4bc041a21d58 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1_vnum.c
@@ -4,7 +4,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-C-LABEL: define dso_local void @test_svst1_hor_vnum_za8(
 // CHECK-C-SAME: i32 noundef [[SLICE_BASE:%.*]], <vscale x 16 x i1> [[PG:%.*]], ptr noundef [[PTR:%.*]], i64 noundef [[VNUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
index c3e4967bfe9b1..dc07efbb81603 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_state_funs.c
@@ -4,7 +4,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-LABEL: @test_in_streaming_mode(
 // CHECK-NEXT: entry:
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
index d21c1ce7a8cd9..2ad7e36425723 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_str.c
@@ -4,7 +4,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-C-LABEL: @test_svstr_vnum_za(
 // CHECK-CXX-LABEL: @_Z18test_svstr_vnum_zajPv(
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
index 5ddd90b95ce89..a00a48ceafccb 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_write.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSME_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SME_OVERLOADED_FORMS
 #define SME_ACLE_FUNC(A1,A2_UNUSED,A3) A1##A3
diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
index 658092bc33ff9..7f56941108828 100644
--- a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
+++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_zero.c
@@ -4,7 +4,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefixes=CHECK,CHECK-CXX
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -S -O1 -Werror -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-C-LABEL: define dso_local void @test_svzero_mask_za(
 // CHECK-C-SAME: ) local_unnamed_addr #[[ATTR0:[0-9]+]] {
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
index 311f965786137..cfc2ee0f77be6 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_add.c
@@ -8,7 +8,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
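Every test hunk in this patch makes the same mechanical one-line change, so a single usage sketch covers them all. The snippet below is illustrative rather than part of the patch: the function name bytes_per_vector is invented for this example, while svcntsb and the __arm_streaming keyword are taken from the tests above (acle_sme_cnt.c, acle_sme_state_funs.c). After this commit, user code simply includes the header under its final name:

#include <arm_sme.h> /* previously <arm_sme_draft_spec_subject_to_change.h> */

/* svcntsb() returns the number of bytes in a streaming vector register,
   as exercised by test_svcntsb() in acle_sme_cnt.c above. */
uint64_t bytes_per_vector(void) __arm_streaming {
  return svcntsb();
}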
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c
index e6415c66fbb48..720254c842120 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_bmop.c
@@ -8,7 +8,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
index 4dce1699c264f..a5dec263bff2c 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_clamp.c
@@ -11,7 +11,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve \
 // RUN: -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
index b292e54ea457f..a8d881b1e2ed3 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvt.c
@@ -8,7 +8,7 @@
 // RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c
index a865fd1bd5f5f..f03e998131df5 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_cvtn.c
@@ -8,7 +8,7 @@
 // RUN: %clang_cc1 -D__SVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +bf16 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
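The recurring "A simple used,unused... macro" comment in these files refers to a helper that lets one test source exercise both the overloaded and the explicitly suffixed intrinsic names. The reconstruction below is an illustrative sketch only; the exact arity varies per test (compare the three-argument SME_ACLE_FUNC above with the four-argument SVE_ACLE_FUNC in acle_sme_imm.cpp further down), and the svadd name in the comment is just a plausible instance:

#ifdef SVE_OVERLOADED_FORMS
/* Overloaded form: keep the base name and final suffix, drop the types. */
#define SVE_ACLE_FUNC(A1, A2_UNUSED, A3, A4_UNUSED) A1##A3
#else
/* Explicit form: paste all pieces, e.g. svadd + _za32 + _f32 + _vg1x2. */
#define SVE_ACLE_FUNC(A1, A2, A3, A4) A1##A2##A3##A4
#endif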
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
index c4651f4d83e1f..3093eaf745861 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_fp_dots.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c
index 2a34b0e2878ef..b90358112e47f 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_frint.c
@@ -8,7 +8,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
index 6e0e9433bae2f..51c07f3cf38a1 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_int_dots.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
index 2e3cbd33d1e2c..1a495d3b117ec 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // LDR ZT0
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c
index a31c6d982100b..d656178fdf7a1 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-LABEL: @test_svluti2_lane_zt_u8(
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c
index db44f52a37bf0..60cd24fdf4630 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-LABEL: @test_svluti2_lane_zt_u8(
 // CHECK-NEXT: entry:
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c
index 23b2c6cc51283..e05748c8a6424 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-LABEL: @test_svluti2_lane_zt_u8(
 // CHECK-NEXT: entry:
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c
index c949cee1124ec..1e303a9a661d7 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-LABEL: @test_svluti4_lane_zt_u8(
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c
index 61affd86d9119..3544f62527aca 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-LABEL: @test_svluti4_lane_zt_u8(
 // CHECK-NEXT: entry:
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c
index 7b478aa2b6ad4..e7151e8256039 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-LABEL: define dso_local @test_svluti4_lane_zt_u16
 // CHECK-SAME: (<vscale x 16 x i8> [[ZN:%.*]]) #[[ATTR0:[0-9]+]] {
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c
index 18bda583f0501..4b9b4363ec629 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_max.c
@@ -5,7 +5,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // REQUIRES: aarch64-registered-target
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c
index 3a14112c0a2c1..5e499573304c8 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_maxnm.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c
index 3cab50c13d6ce..26c175c76532b 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_min.c
@@ -5,7 +5,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // REQUIRES: aarch64-registered-target
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c
index f102a2e278ffc..8c5e7eb401991 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_minnm.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
index 18c4b77ac3292..597efff0eab99 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mla.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
index 4b785cb86df64..252da9af33bee 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlal.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
index 50cbe3e4bac5b..a4da4e5410ae1 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlall.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -target-feature +sme-i16i64 -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
index 98eaa929cfb77..74511d971400b 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mls.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
index ea685079ebbca..ce74f5d307b45 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mlsl.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
index a2904dc1a6025..b552bbb66a259 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_mop.c
@@ -8,7 +8,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
index 028a3d7b155dd..cd87ee0099950 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_read.c
@@ -5,7 +5,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-LABEL: @test_svread_ver_za8_u8_vg2(
 // CHECK-NEXT: entry:
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
index 82b30e8bbe9b9..6847eb99af2df 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_reinterpret_svcount_svbool.c
@@ -7,7 +7,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c
index 6bbd23ccd32a5..7cf53e9a14527 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sqdmulh.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
index 68913c5da4f75..c7da703ddb271 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_sub.c
@@ -8,7 +8,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c
index 2f427689323b4..6f20b37f87897 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx2.c
@@ -8,7 +8,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c
index cdcc62d5405e6..781b699c882bb 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_unpkx4.c
@@ -8,7 +8,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
index 9aa993f68b163..08431b3ceabed 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vdot.c
@@ -5,7 +5,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c
index fa48e63efe3b0..2b24f65efc266 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_add.c
@@ -8,7 +8,7 @@
 // RUN: %clang_cc1 -fclang-abi-compat=latest -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -fclang-abi-compat=latest -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sme-i16i64 -target-feature +sme-f64f64 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c
index 50cac48887894..32f9bc3ab8808 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_qrshr.c
@@ -7,7 +7,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature +sme-f64f64 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c
index ab4594b71293a..8735016ae728c 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_rshl.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -target-feature +sve -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c
index 45781f5c27821..32eb9a4b40c40 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx2.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c
index f0327790af687..6e9d5cf4492f5 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_selx4.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -target-feature -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c
index 118b73972575f..f4dadef8fa578 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx2.c
@@ -7,7 +7,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c
index 9dcd995445c25..ff7f0c501f550 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_uzpx4.c
@@ -7,7 +7,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c
index 17555dc7683f4..3f13b3a0db73d 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx2.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c
index ef88f724011bf..b8408ea31ed0b 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_vector_zipx4.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
index bf8e12b6b4969..33733356f3078 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_write.c
@@ -7,7 +7,7 @@
 // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sme2 -S -target-feature +sve -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -target-feature +sve -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 #ifdef SVE_OVERLOADED_FORMS
 // A simple used,unused... macro, long enough to represent any SVE builtin.
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
index 081b1a1d2627c..4a038ce61e3bf 100644
--- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
+++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c
@@ -6,7 +6,7 @@
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -p mem2reg,instcombine,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK
 // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -o /dev/null %s
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 // CHECK-LABEL: @test_svzero_zt(
 // CHECK-NEXT: entry:
diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
index 9e4b3920e543b..079cff5a5bbae 100644
--- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
+++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c
@@ -5,7 +5,7 @@
 // REQUIRES: aarch64-registered-target
 
 #include "arm_neon.h"
-#include "arm_sme_draft_spec_subject_to_change.h"
+#include "arm_sme.h"
 #include "arm_sve.h"
 
 int16x8_t incompat_neon_sm(int16x8_t splat) __arm_streaming {
diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
index 40254a5a0eafa..8f3f98394492d 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_imm.cpp
@@ -10,7 +10,7 @@
 #define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
 #endif
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 void test_range_0_0(uint32_t slice, svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
   // expected-error@+1 {{argument value 18446744073709551615 is outside the valid range [0, 0]}}
diff --git a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
index f1e858f819602..6af115beba8e4 100644
--- a/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
+++ b/clang/test/Sema/aarch64-sme-intrinsics/acle_sme_target.c
@@ -3,7 +3,7 @@
 
 // Test that functions with the correct target attributes can use the correct SME intrinsics.
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 __attribute__((target("sme")))
 void test_sme(svbool_t pg, void *ptr) __arm_streaming __arm_inout("za") {
diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
index 36103480861c3..0d9f090637eec 100644
--- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
+++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp
@@ -3,7 +3,7 @@
 
 // REQUIRES: aarch64-registered-target
 
-#include <arm_sme_draft_spec_subject_to_change.h>
+#include <arm_sme.h>
 
 void test_multivector_read(uint32_t base) __arm_streaming __arm_in("za") {
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
index b67c8e524287f..fbedd27bf998d 100644
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -1558,7 +1558,7 @@ void SVEEmitter::createTypeFlags(raw_ostream &OS) {
 }
 
 void SVEEmitter::createSMEHeader(raw_ostream &OS) {
-  OS << "/*===---- arm_sme_draft_spec_subject_to_change.h - ARM SME intrinsics "
+  OS << "/*===---- arm_sme.h - ARM SME intrinsics "
        "------===\n"
        " *\n"
        " *\n"
@@ -1575,7 +1575,7 @@ void SVEEmitter::createSMEHeader(raw_ostream &OS) {
   OS << "#define __ARM_SME_H\n\n";
 
   OS << "#if !defined(__LITTLE_ENDIAN__)\n";
-  OS << "#error \"Big endian is currently not supported for arm_sme_draft_spec_subject_to_change.h\"\n";
+  OS << "#error \"Big endian is currently not supported for arm_sme.h\"\n";
   OS << "#endif\n";
 
   OS << "#include <arm_sve.h>\n\n";
diff --git a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
index 6059074dfa27b..7c00aaffdc1b9 100644
--- a/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/lib/Headers/BUILD.gn
@@ -22,11 +22,11 @@ clang_tablegen("arm_sve") {
   output_name = "arm_sve.h"
 }
 
-# Generate arm_sme_draft_spec_subject_to_change.h
-clang_tablegen("arm_sme_draft_spec_subject_to_change") {
+# Generate arm_sme.h
+clang_tablegen("arm_sme") {
   args = [ "-gen-arm-sme-header" ]
   td_file = "//clang/include/clang/Basic/arm_sme.td"
-  output_name = "arm_sme_draft_spec_subject_to_change.h"
+  output_name = "arm_sme.h"
 }
 
 # Generate arm_bf16.h
@@ -72,7 +72,7 @@ copy("tablegen_headers") {
     ":arm_fp16",
     ":arm_mve",
     ":arm_neon",
-    ":arm_sme_draft_spec_subject_to_change",
+    ":arm_sme",
     ":arm_sve",
     ":arm_vector_types",
    ":riscv_vector",
diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
index f7ae7f077d514..a0c547b558448 100644
--- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel
@@ -1681,14 +1681,14 @@ gentbl(
 )
 
 gentbl(
-    name = "headers_arm_sme_draft_spec_subject_to_change_gen",
+    name = "headers_arm_sme_gen",
     copts = [
        "-Wno-implicit-fallthrough",
        "-Wno-error=frame-larger-than=",
    ],
    tbl_outs = [(
        "-gen-arm-sme-header",
-        "lib/Headers/arm_sme_draft_spec_subject_to_change.h",
+        "lib/Headers/arm_sme.h",
    )],
    tblgen = ":clang-tblgen",
    td_file = "include/clang/Basic/arm_sme.td",
@@ -1743,7 +1743,7 @@ builtin_headers = glob(
     "lib/Headers/arm_mve.h",
     "lib/Headers/arm_neon.h",
     "lib/Headers/arm_sve.h",
-    "lib/Headers/arm_sme_draft_spec_subject_to_change.h",
+    "lib/Headers/arm_sme.h",
     "lib/Headers/arm_vector_types.h",
     "lib/Headers/arm_bf16.h",
     "lib/Headers/module.modulemap",

From 530c72b498e571f2de4850e7ca4b1558a83742da Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas
Date: Mon, 22 Jan 2024 17:15:17 +0000
Subject: [PATCH 436/843] [TLI] Add missing ArmPL mappings (#78474)

Adds TLI mappings for fixed and scalable vector variants of cospi(f), fmax(f), ilogb(f) and ldexp(f).
---
 llvm/include/llvm/Analysis/VecFuncs.def | 20 ++++++++++++
 .../AArch64/veclib-function-calls.ll | 32 +++++++++----------
 2 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/llvm/include/llvm/Analysis/VecFuncs.def b/llvm/include/llvm/Analysis/VecFuncs.def
index 5d2900a858c91..f09e12f3038ca 100644
--- a/llvm/include/llvm/Analysis/VecFuncs.def
+++ b/llvm/include/llvm/Analysis/VecFuncs.def
@@ -873,6 +873,11 @@ TLI_DEFINE_VECFUNC("coshf", "armpl_vcoshq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v
 TLI_DEFINE_VECFUNC("cosh", "armpl_svcosh_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
 TLI_DEFINE_VECFUNC("coshf", "armpl_svcosh_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
 
+TLI_DEFINE_VECFUNC("cospi", "armpl_vcospiq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("cospif", "armpl_vcospiq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("cospi", "armpl_svcospi_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("cospif", "armpl_svcospi_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
 TLI_DEFINE_VECFUNC("erf", "armpl_verfq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("erff", "armpl_verfq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("erf", "armpl_sverf_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
@@ -928,6 +933,11 @@ TLI_DEFINE_VECFUNC("fmaf", "armpl_vfmaq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vvv
 TLI_DEFINE_VECFUNC("fma", "armpl_svfma_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvvv")
 TLI_DEFINE_VECFUNC("fmaf", "armpl_svfma_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvvv")
 
+TLI_DEFINE_VECFUNC("fmax", "armpl_vfmaxq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("fmaxf", "armpl_vfmaxq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("fmax", "armpl_svfmax_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("fmaxf", "armpl_svfmax_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
+
 TLI_DEFINE_VECFUNC("fmin", "armpl_vfminq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
 TLI_DEFINE_VECFUNC("fminf", "armpl_vfminq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
 TLI_DEFINE_VECFUNC("fmin", "armpl_svfmin_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvv")
@@ -943,6 +953,16 @@ TLI_DEFINE_VECFUNC("hypotf", "armpl_vhypotq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N
 TLI_DEFINE_VECFUNC("hypot", "armpl_svhypot_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvv")
 TLI_DEFINE_VECFUNC("hypotf", "armpl_svhypot_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
 
+TLI_DEFINE_VECFUNC("ilogb", "armpl_vilogbq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
+TLI_DEFINE_VECFUNC("ilogbf", "armpl_vilogbq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
+TLI_DEFINE_VECFUNC("ilogb", "armpl_svilogb_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
+TLI_DEFINE_VECFUNC("ilogbf", "armpl_svilogb_f32_x", SCALABLE(4), MASKED, "_ZGVsMxv")
+
+TLI_DEFINE_VECFUNC("ldexp", "armpl_vldexpq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2vv")
+TLI_DEFINE_VECFUNC("ldexpf", "armpl_vldexpq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4vv")
+TLI_DEFINE_VECFUNC("ldexp", "armpl_svldexp_f64_x", SCALABLE(2), MASKED, "_ZGVsMxvv")
+TLI_DEFINE_VECFUNC("ldexpf", "armpl_svldexp_f32_x", SCALABLE(4), MASKED, "_ZGVsMxvv")
+
 TLI_DEFINE_VECFUNC("lgamma", "armpl_vlgammaq_f64", FIXED(2), NOMASK, "_ZGV_LLVM_N2v")
 TLI_DEFINE_VECFUNC("lgammaf", "armpl_vlgammaq_f32", FIXED(4), NOMASK, "_ZGV_LLVM_N4v")
 TLI_DEFINE_VECFUNC("lgamma", "armpl_svlgamma_f64_x", SCALABLE(2), MASKED, "_ZGVsMxv")
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll
b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll index c656099838f62..d07f72792e6b9 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/veclib-function-calls.ll @@ -828,11 +828,11 @@ define void @cospi_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ; ARMPL-NEON-LABEL: define void @cospi_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]]) +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vcospiq_f64(<2 x double> [[WIDE_LOAD:%.*]]) ; ; ARMPL-SVE-LABEL: define void @cospi_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[CALL:%.*]] = tail call double @cospi(double [[IN:%.*]]) +; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcospi_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; entry: br label %for.body @@ -863,11 +863,11 @@ define void @cospi_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ; ARMPL-NEON-LABEL: define void @cospi_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]]) +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vcospiq_f32(<4 x float> [[WIDE_LOAD:%.*]]) ; ; ARMPL-SVE-LABEL: define void @cospi_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[CALL:%.*]] = tail call float @cospif(float [[IN:%.*]]) +; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svcospi_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; entry: br label %for.body @@ -1485,11 +1485,11 @@ define void @fmax_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ; ARMPL-NEON-LABEL: define void @fmax_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]]) +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x double> @armpl_vfmaxq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x double> [[WIDE_LOAD]]) ; ; ARMPL-SVE-LABEL: define void @fmax_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[CALL:%.*]] = tail call double @fmax(double [[IN:%.*]], double [[IN]]) +; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmax_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; entry: br label %for.body @@ -1520,11 +1520,11 @@ define void @fmax_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ; ARMPL-NEON-LABEL: define void @fmax_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]]) +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x float> @armpl_vfmaxq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x float> [[WIDE_LOAD]]) ; ; ARMPL-SVE-LABEL: define void @fmax_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[CALL:%.*]] = tail call float @fmaxf(float [[IN:%.*]], float [[IN]]) +; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svfmax_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD]], [[ACTIVE_LANE_MASK:%.*]]) ; entry: br label %for.body @@ -1777,11 +1777,11 @@ define void @ilogb_f64(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ; ARMPL-NEON-LABEL: define void @ilogb_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias 
[[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]]) +; ARMPL-NEON: [[TMP3:%.*]] = call <2 x i32> @armpl_vilogbq_f64(<2 x double> [[WIDE_LOAD:%.*]]) ; ; ARMPL-SVE-LABEL: define void @ilogb_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[CALL:%.*]] = tail call i32 @ilogb(double [[IN:%.*]]) +; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svilogb_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; entry: br label %for.body @@ -1812,11 +1812,11 @@ define void @ilogb_f32(ptr noalias %in.ptr, ptr noalias %out.ptr) { ; ; ARMPL-NEON-LABEL: define void @ilogb_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]]) +; ARMPL-NEON: [[TMP3:%.*]] = call <4 x i32> @armpl_vilogbq_f32(<4 x float> [[WIDE_LOAD:%.*]]) ; ; ARMPL-SVE-LABEL: define void @ilogb_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[CALL:%.*]] = tail call i32 @ilogbf(float [[IN:%.*]]) +; ARMPL-SVE: [[TMP15:%.*]] = call @armpl_svilogb_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; entry: br label %for.body @@ -1850,11 +1850,11 @@ define void @ldexp_f64(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias % ; ; ARMPL-NEON-LABEL: define void @ldexp_f64 ; ARMPL-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]]) +; ARMPL-NEON: [[TMP5:%.*]] = call <2 x double> @armpl_vldexpq_f64(<2 x double> [[WIDE_LOAD:%.*]], <2 x i32> [[WIDE_LOAD1:%.*]]) ; ; ARMPL-SVE-LABEL: define void @ldexp_f64 ; ARMPL-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[CALL:%.*]] = tail call double @ldexp(double [[IN1:%.*]], i32 [[IN2:%.*]]) +; ARMPL-SVE: [[TMP17:%.*]] = call @armpl_svldexp_f64_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD1:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; entry: br label %for.body @@ -1887,11 +1887,11 @@ define void @ldexp_f32(ptr noalias %in1.ptr, ptr noalias %in2.ptr, ptr noalias % ; ; ARMPL-NEON-LABEL: define void @ldexp_f32 ; ARMPL-NEON-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-NEON: [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]]) +; ARMPL-NEON: [[TMP5:%.*]] = call <4 x float> @armpl_vldexpq_f32(<4 x float> [[WIDE_LOAD:%.*]], <4 x i32> [[WIDE_LOAD1:%.*]]) ; ; ARMPL-SVE-LABEL: define void @ldexp_f32 ; ARMPL-SVE-SAME: (ptr noalias [[IN1_PTR:%.*]], ptr noalias [[IN2_PTR:%.*]], ptr noalias [[OUT_PTR:%.*]]) #[[ATTR0]] { -; ARMPL-SVE: [[CALL:%.*]] = tail call float @ldexpf(float [[IN1:%.*]], i32 [[IN2:%.*]]) +; ARMPL-SVE: [[TMP17:%.*]] = call @armpl_svldexp_f32_x( [[WIDE_MASKED_LOAD:%.*]], [[WIDE_MASKED_LOAD1:%.*]], [[ACTIVE_LANE_MASK:%.*]]) ; entry: br label %for.body From ed760d170f18b3ca9f53f4e4a2e31d82bf0ba3e7 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Mon, 22 Jan 2024 12:18:39 -0500 Subject: [PATCH 437/843] [libc++] Mention __cxa_init_primary_exception in the ABI changelog --- libcxx/lib/abi/CHANGELOG.TXT | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/libcxx/lib/abi/CHANGELOG.TXT b/libcxx/lib/abi/CHANGELOG.TXT index df4e5aa385954..1179c253f18c8 100644 --- a/libcxx/lib/abi/CHANGELOG.TXT +++ 
b/libcxx/lib/abi/CHANGELOG.TXT @@ -16,6 +16,18 @@ New entries should be added directly below the "Version" header. Version 18.0 ------------ +* [libc++abi] Implement __cxa_init_primary_exception and use it to optimize std::make_exception_ptr (#65534) + + This patch implements __cxa_init_primary_exception, an extension to the Itanium C++ ABI. + This extension is already present in both libsupc++ and libcxxrt. This patch also starts + making use of this function in std::make_exception_ptr: instead of going through a full + throw/catch cycle, we are now able to initialize an exception directly, thus making + std::make_exception_ptr around 30x faster. Adding a new symbol is not an ABI break. + + All platforms + ------------- + Symbol added: __cxa_init_primary_exception + * [libc++] Simplify the implementation of locale::id This patch removes a symbol defined in the library for std::locale::id::__init(). From 02f95b77515fe18ed1076b94cbb850ea0cf3c77e Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Mon, 22 Jan 2024 16:55:12 +0000 Subject: [PATCH 438/843] Revert "[libc++][format] P2637R3: Member `visit` (`std::basic_format_arg`) (#76449)" This reverts commit 7d9b5aa65b09126031e1c2903605a7d34aea4bc1 since std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp is failing on Windows when building with Clang-cl. --- libcxx/docs/ReleaseNotes/18.rst | 1 - libcxx/docs/Status/Cxx2cPapers.csv | 2 +- libcxx/docs/Status/FormatIssues.csv | 2 +- libcxx/include/__config | 6 - libcxx/include/__format/format_arg.h | 109 +----- libcxx/include/__format/format_context.h | 33 +- libcxx/include/format | 2 +- .../format.arg/visit.pass.cpp | 333 ---------------- .../format.arg/visit.return_type.pass.cpp | 369 ------------------ .../visit_format_arg.deprecated.verify.cpp | 38 -- .../format.arg/visit_format_arg.pass.cpp | 6 +- .../format.arguments/format.args/get.pass.cpp | 48 +-- libcxx/test/support/test_basic_format_arg.h | 20 +- libcxx/test/support/test_macros.h | 5 - .../generate_feature_test_macro_components.py | 1 - 15 files changed, 48 insertions(+), 927 deletions(-) delete mode 100644 libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp delete mode 100644 libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp delete mode 100644 libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index 237a63022d55f..fd882bafe19a5 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -79,7 +79,6 @@ Implemented Papers - P1759R6 - Native handles and file streams - P2868R3 - Remove Deprecated ``std::allocator`` Typedef From C++26 - P2517R1 - Add a conditional ``noexcept`` specification to ``std::apply`` -- P2637R3 - Member ``visit`` - P2447R6 - ``span`` over initializer list diff --git a/libcxx/docs/Status/Cxx2cPapers.csv b/libcxx/docs/Status/Cxx2cPapers.csv index c45aa3c510072..f80b1f6b663f0 100644 --- a/libcxx/docs/Status/Cxx2cPapers.csv +++ b/libcxx/docs/Status/Cxx2cPapers.csv @@ -17,7 +17,7 @@ "`P0792R14 `__","LWG","``function_ref``: a type-erased callable reference","Varna June 2023","","","" "`P2874R2 `__","LWG","Mandating Annex D Require No More","Varna June 2023","","","" "`P2757R3 `__","LWG","Type-checking format args","Varna June 2023","","","|format|" -"`P2637R3 `__","LWG","Member ``visit``","Varna June 2023","|Complete|","18.0","" +"`P2637R3 `__","LWG","Member ``visit``","Varna June 
2023","|Partial|","18.0","" "`P2641R4 `__","CWG, LWG","Checking if a ``union`` alternative is active","Varna June 2023","","","" "`P1759R6 `__","LWG","Native handles and file streams","Varna June 2023","|Complete|","18.0","" "`P2697R1 `__","LWG","Interfacing ``bitset`` with ``string_view``","Varna June 2023","|Complete|","18.0","" diff --git a/libcxx/docs/Status/FormatIssues.csv b/libcxx/docs/Status/FormatIssues.csv index 6e58e752191ea..513988d08036c 100644 --- a/libcxx/docs/Status/FormatIssues.csv +++ b/libcxx/docs/Status/FormatIssues.csv @@ -16,7 +16,7 @@ Number,Name,Standard,Assignee,Status,First released version "`P2693R1 `__","Formatting ``thread::id`` and ``stacktrace``","C++23","Mark de Wever","|In Progress|" "`P2510R3 `__","Formatting pointers","C++26","Mark de Wever","|Complete|",17.0 "`P2757R3 `__","Type-checking format args","C++26","","", -"`P2637R3 `__","Member ``visit``","C++26","Hristo Hristov","|Complete|",18.0 +"`P2637R3 `__","Member ``visit``","C++26","","", "`P2905R2 `__","Runtime format strings","C++26 DR","Mark de Wever","|Complete|",18.0 "`P2918R2 `__","Runtime format strings II","C++26","Mark de Wever","|Complete|",18.0 "`P2909R4 `__","Fix formatting of code units as integers (Dude, where’s my ``char``?)","C++26 DR","Mark de Wever","|Complete|",18.0 diff --git a/libcxx/include/__config b/libcxx/include/__config index 39ce6d628e3bf..33a79b9a79f62 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -995,12 +995,6 @@ typedef __char32_t char32_t; # define _LIBCPP_DEPRECATED_IN_CXX23 # endif -# if _LIBCPP_STD_VER >= 26 -# define _LIBCPP_DEPRECATED_IN_CXX26 _LIBCPP_DEPRECATED -# else -# define _LIBCPP_DEPRECATED_IN_CXX26 -# endif - # if !defined(_LIBCPP_HAS_NO_CHAR8_T) # define _LIBCPP_DEPRECATED_WITH_CHAR8_T _LIBCPP_DEPRECATED # else diff --git a/libcxx/include/__format/format_arg.h b/libcxx/include/__format/format_arg.h index 02ee3cef7d402..10fca15d5a7a9 100644 --- a/libcxx/include/__format/format_arg.h +++ b/libcxx/include/__format/format_arg.h @@ -93,7 +93,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr __arg_t __get_packed_type(uint64_t __types, size } // namespace __format -// This function is not user observable, so it can directly use the non-standard +// This function is not user obervable, so it can directly use the non-standard // types of the "variant". See __arg_t for more details. 
template _LIBCPP_HIDE_FROM_ABI decltype(auto) __visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { @@ -144,59 +144,6 @@ _LIBCPP_HIDE_FROM_ABI decltype(auto) __visit_format_arg(_Visitor&& __vis, basic_ __libcpp_unreachable(); } -# if _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) - -template -_LIBCPP_HIDE_FROM_ABI _Rp __visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { - switch (__arg.__type_) { - case __format::__arg_t::__none: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__monostate_); - case __format::__arg_t::__boolean: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__boolean_); - case __format::__arg_t::__char_type: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__char_type_); - case __format::__arg_t::__int: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__int_); - case __format::__arg_t::__long_long: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__long_long_); - case __format::__arg_t::__i128: -# ifndef _LIBCPP_HAS_NO_INT128 - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__i128_); -# else - __libcpp_unreachable(); -# endif - case __format::__arg_t::__unsigned: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__unsigned_); - case __format::__arg_t::__unsigned_long_long: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__unsigned_long_long_); - case __format::__arg_t::__u128: -# ifndef _LIBCPP_HAS_NO_INT128 - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__u128_); -# else - __libcpp_unreachable(); -# endif - case __format::__arg_t::__float: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__float_); - case __format::__arg_t::__double: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__double_); - case __format::__arg_t::__long_double: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__long_double_); - case __format::__arg_t::__const_char_type_ptr: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__const_char_type_ptr_); - case __format::__arg_t::__string_view: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__string_view_); - case __format::__arg_t::__ptr: - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), __arg.__value_.__ptr_); - case __format::__arg_t::__handle: - return std::invoke_r<_Rp>( - std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__arg.__value_.__handle_}); - } - - __libcpp_unreachable(); -} - -# endif // _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) - /// Contains the values used in basic_format_arg. /// /// This is a separate type so it's possible to store the values and types in @@ -280,52 +227,6 @@ class _LIBCPP_TEMPLATE_VIS basic_format_arg { _LIBCPP_HIDE_FROM_ABI explicit operator bool() const noexcept { return __type_ != __format::__arg_t::__none; } -# if _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) - - // This function is user facing, so it must wrap the non-standard types of - // the "variant" in a handle to stay conforming. See __arg_t for more details. 
- template - _LIBCPP_HIDE_FROM_ABI decltype(auto) visit(this basic_format_arg __arg, _Visitor&& __vis) { - switch (__arg.__type_) { -# ifndef _LIBCPP_HAS_NO_INT128 - case __format::__arg_t::__i128: { - typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__i128_}; - return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h}); - } - - case __format::__arg_t::__u128: { - typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_}; - return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h}); - } -# endif - default: - return std::__visit_format_arg(std::forward<_Visitor>(__vis), __arg); - } - } - - // This function is user facing, so it must wrap the non-standard types of - // the "variant" in a handle to stay conforming. See __arg_t for more details. - template - _LIBCPP_HIDE_FROM_ABI _Rp visit(this basic_format_arg __arg, _Visitor&& __vis) { - switch (__arg.__type_) { -# ifndef _LIBCPP_HAS_NO_INT128 - case __format::__arg_t::__i128: { - typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__i128_}; - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h}); - } - - case __format::__arg_t::__u128: { - typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_}; - return std::invoke_r<_Rp>(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h}); - } -# endif - default: - return std::__visit_format_arg<_Rp>(std::forward<_Visitor>(__vis), __arg); - } - } - -# endif // _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) - private: using char_type = typename _Context::char_type; @@ -366,11 +267,7 @@ class _LIBCPP_TEMPLATE_VIS basic_format_arg<_Context>::handle { // This function is user facing, so it must wrap the non-standard types of // the "variant" in a handle to stay conforming. See __arg_t for more details. template -# if _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) -_LIBCPP_DEPRECATED_IN_CXX26 -# endif - _LIBCPP_HIDE_FROM_ABI decltype(auto) - visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { +_LIBCPP_HIDE_FROM_ABI decltype(auto) visit_format_arg(_Visitor&& __vis, basic_format_arg<_Context> __arg) { switch (__arg.__type_) { # ifndef _LIBCPP_HAS_NO_INT128 case __format::__arg_t::__i128: { @@ -382,7 +279,7 @@ _LIBCPP_DEPRECATED_IN_CXX26 typename __basic_format_arg_value<_Context>::__handle __h{__arg.__value_.__u128_}; return std::invoke(std::forward<_Visitor>(__vis), typename basic_format_arg<_Context>::handle{__h}); } -# endif // _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) +# endif default: return std::__visit_format_arg(std::forward<_Visitor>(__vis), __arg); } diff --git a/libcxx/include/__format/format_context.h b/libcxx/include/__format/format_context.h index 0beaa84b028be..5b252b81f691b 100644 --- a/libcxx/include/__format/format_context.h +++ b/libcxx/include/__format/format_context.h @@ -163,25 +163,20 @@ class _LIBCPP_TEMPLATE_VIS basic_format_context basic_format_arg { - if constexpr (same_as) - return {}; - else if constexpr (same_as::handle>) - // At the moment it's not possible for formatting to use a re-targeted handle. - // TODO FMT add this when support is needed. 
- std::__throw_format_error("Re-targeting handle not supported"); - else - return basic_format_arg{ - __format::__determine_arg_t(), - __basic_format_arg_value(__arg)}; - }; -# if _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) - return static_cast<_Context*>(__c)->arg(__id).visit(std::move(__visitor)); -# else - _LIBCPP_SUPPRESS_DEPRECATED_PUSH - return std::visit_format_arg(std::move(__visitor), static_cast<_Context*>(__c)->arg(__id)); - _LIBCPP_SUPPRESS_DEPRECATED_POP -# endif // _LIBCPP_STD_VER >= 26 && defined(_LIBCPP_HAS_EXPLICIT_THIS_PARAMETER) + return std::visit_format_arg( + [&](auto __arg) -> basic_format_arg { + if constexpr (same_as) + return {}; + else if constexpr (same_as::handle>) + // At the moment it's not possible for formatting to use a re-targeted handle. + // TODO FMT add this when support is needed. + std::__throw_format_error("Re-targeting handle not supported"); + else + return basic_format_arg{ + __format::__determine_arg_t(), + __basic_format_arg_value(__arg)}; + }, + static_cast<_Context*>(__c)->arg(__id)); }) { } diff --git a/libcxx/include/format b/libcxx/include/format index 64f6ba1d25284..ab9b336d0cdab 100644 --- a/libcxx/include/format +++ b/libcxx/include/format @@ -170,7 +170,7 @@ namespace std { template class basic_format_arg; template - see below visit_format_arg(Visitor&& vis, basic_format_arg arg); // Deprecated in C++26 + see below visit_format_arg(Visitor&& vis, basic_format_arg arg); // [format.arg.store], class template format-arg-store template struct format-arg-store; // exposition only diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp deleted file mode 100644 index 994ccc70a38da..0000000000000 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.pass.cpp +++ /dev/null @@ -1,333 +0,0 @@ -//===----------------------------------------------------------------------===// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 -// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME -// The tested functionality needs deducing this. -// UNSUPPORTED: clang-16 || clang-17 -// XFAIL: apple-clang - -// - -// class basic_format_arg; - -// template -// decltype(auto) visit(this basic_format_arg arg, Visitor&& vis); // since C++26 - -#include -#include -#include -#include - -#include "constexpr_char_traits.h" -#include "make_string.h" -#include "min_allocator.h" -#include "test_macros.h" - -template -void test(From value) { - auto store = std::make_format_args(value); - std::basic_format_args format_args{store}; - - LIBCPP_ASSERT(format_args.__size() == 1); - assert(format_args.get(0)); - - auto result = format_args.get(0).visit([v = To(value)](auto a) -> To { - if constexpr (std::is_same_v) { - assert(v == a); - return a; - } else { - assert(false); - return {}; - } - }); - - using ct = std::common_type_t; - assert(static_cast(result) == static_cast(value)); -} - -// Some types, as an extension, are stored in the variant. The Standard -// requires them to be observed as a handle. 
-template -void test_handle(T value) { - auto store = std::make_format_args(value); - std::basic_format_args format_args{store}; - - LIBCPP_ASSERT(format_args.__size() == 1); - assert(format_args.get(0)); - - format_args.get(0).visit([](auto a) { - assert((std::is_same_v::handle>)); - }); -} - -// Test specific for string and string_view. -// -// Since both result in a string_view there's no need to pass this as a -// template argument. -template -void test_string_view(From value) { - auto store = std::make_format_args(value); - std::basic_format_args format_args{store}; - - LIBCPP_ASSERT(format_args.__size() == 1); - assert(format_args.get(0)); - - using CharT = typename Context::char_type; - using To = std::basic_string_view; - using V = std::basic_string; - - auto result = format_args.get(0).visit([v = V(value.begin(), value.end())](auto a) -> To { - if constexpr (std::is_same_v) { - assert(v == a); - return a; - } else { - assert(false); - return {}; - } - }); - - assert(std::equal(value.begin(), value.end(), result.begin(), result.end())); -} - -template -void test() { - using Context = std::basic_format_context; - std::basic_string empty; - std::basic_string str = MAKE_STRING(CharT, "abc"); - - // Test boolean types. - - test(true); - test(false); - - // Test CharT types. - - test('a'); - test('z'); - test('0'); - test('9'); - - // Test char types. - - if (std::is_same_v) { - // char to char -> char - test('a'); - test('z'); - test('0'); - test('9'); - } else { - if (std::is_same_v) { - // char to wchar_t -> wchar_t - test('a'); - test('z'); - test('0'); - test('9'); - } else if (std::is_signed_v) { - // char to CharT -> int - // This happens when CharT is a char8_t, char16_t, or char32_t and char - // is a signed type. - // Note if sizeof(CharT) > sizeof(int) this test fails. If there are - // platforms where that occurs extra tests need to be added for char32_t - // testing it against a long long. - test('a'); - test('z'); - test('0'); - test('9'); - } else { - // char to CharT -> unsigned - // This happens when CharT is a char8_t, char16_t, or char32_t and char - // is an unsigned type. - // Note if sizeof(CharT) > sizeof(unsigned) this test fails. If there are - // platforms where that occurs extra tests need to be added for char32_t - // testing it against an unsigned long long. - test('a'); - test('z'); - test('0'); - test('9'); - } - } - - // Test signed integer types. 
- - test(std::numeric_limits::min()); - test(0); - test(std::numeric_limits::max()); - - test(std::numeric_limits::min()); - test(std::numeric_limits::min()); - test(0); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - - test(std::numeric_limits::min()); - test(std::numeric_limits::min()); - test(std::numeric_limits::min()); - test(0); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - - using LongToType = std::conditional_t; - - test(std::numeric_limits::min()); - test(std::numeric_limits::min()); - test(std::numeric_limits::min()); - test(std::numeric_limits::min()); - test(0); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - - test(std::numeric_limits::min()); - test(std::numeric_limits::min()); - test(std::numeric_limits::min()); - test(std::numeric_limits::min()); - test(std::numeric_limits::min()); - test(0); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - -#ifndef TEST_HAS_NO_INT128 - test_handle(0); -#endif // TEST_HAS_NO_INT128 - - // Test unsigned integer types. - - test(0); - test(std::numeric_limits::max()); - - test(0); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - - test(0); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - - using UnsignedLongToType = - std::conditional_t; - - test(0); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - - test(0); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - test(std::numeric_limits::max()); - -#ifndef TEST_HAS_NO_INT128 - test_handle(0); -#endif // TEST_HAS_NO_INT128 - - // Test floating point types. - - test(-std::numeric_limits::max()); - test(-std::numeric_limits::min()); - test(-0.0); - test(0.0); - test(std::numeric_limits::min()); - test(std::numeric_limits::max()); - - test(-std::numeric_limits::max()); - test(-std::numeric_limits::min()); - test(-0.0); - test(0.0); - test(std::numeric_limits::min()); - test(std::numeric_limits::max()); - - test(-std::numeric_limits::max()); - test(-std::numeric_limits::min()); - test(-0.0); - test(0.0); - test(std::numeric_limits::min()); - test(std::numeric_limits::max()); - - // Test const CharT pointer types. - - test(empty.c_str()); - test(str.c_str()); - - // Test string_view types. - - { - using From = std::basic_string_view; - - test_string_view(From()); - test_string_view(From(empty.c_str())); - test_string_view(From(str.c_str())); - } - - { - using From = std::basic_string_view>; - - test_string_view(From()); - test_string_view(From(empty.c_str())); - test_string_view(From(str.c_str())); - } - - // Test string types. 
- - { - using From = std::basic_string; - - test_string_view(From()); - test_string_view(From(empty.c_str())); - test_string_view(From(str.c_str())); - } - - { - using From = std::basic_string, std::allocator>; - - test_string_view(From()); - test_string_view(From(empty.c_str())); - test_string_view(From(str.c_str())); - } - - { - using From = std::basic_string, min_allocator>; - - test_string_view(From()); - test_string_view(From(empty.c_str())); - test_string_view(From(str.c_str())); - } - - { - using From = std::basic_string, min_allocator>; - - test_string_view(From()); - test_string_view(From(empty.c_str())); - test_string_view(From(str.c_str())); - } - - // Test pointer types. - - test(nullptr); - int i = 0; - test(static_cast(&i)); - const int ci = 0; - test(static_cast(&ci)); -} - -int main(int, char**) { - test(); -#ifndef TEST_HAS_NO_WIDE_CHARACTERS - test(); -#endif - - return 0; -} diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp deleted file mode 100644 index 473278b7b4434..0000000000000 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit.return_type.pass.cpp +++ /dev/null @@ -1,369 +0,0 @@ -//===----------------------------------------------------------------------===// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 -// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME -// The tested functionality needs deducing this. -// UNSUPPORTED: clang-16 || clang-17 -// XFAIL: apple-clang - -// - -// class basic_format_arg; - -// template -// R visit(this basic_format_arg arg, Visitor&& vis); - -#include -#include -#include -#include - -#include "constexpr_char_traits.h" -#include "make_string.h" -#include "min_allocator.h" -#include "test_macros.h" - -// The expected result type shouldn't matter,therefore use a hardcoded value for simplicity. -using ExpectedResultType = bool; -constexpr ExpectedResultType visited{true}; - -template -ExpectedR make_expected_result() { - if constexpr (std::is_same_v) { - return true; - } else if constexpr (std::is_same_v) { - return 192812079084L; - } else { - return "visited"; - } -} - -template -void test(From value, const ExpectedR& expectedValue) { - auto store = std::make_format_args(value); - std::basic_format_args format_args{store}; - - LIBCPP_ASSERT(format_args.__size() == 1); - assert(format_args.get(0)); - - // member - { - std::same_as decltype(auto) result = - format_args.get(0).template visit([v = To(value)](auto a) -> ExpectedR { - if constexpr (std::is_same_v) { - assert(v == a); - return make_expected_result(); - } else { - assert(false); - return {}; - } - }); - - assert(result == expectedValue); - } -} - -// Some types, as an extension, are stored in the variant. The Standard -// requires them to be observed as a handle. 
-template -void test_handle(T value, ExpectedR expectedValue) { - auto store = std::make_format_args(value); - std::basic_format_args format_args{store}; - - LIBCPP_ASSERT(format_args.__size() == 1); - assert(format_args.get(0)); - - std::same_as decltype(auto) result = format_args.get(0).template visit([](auto a) -> ExpectedR { - assert((std::is_same_v::handle>)); - - return make_expected_result(); - }); - - assert(result == expectedValue); -} - -// Test specific for string and string_view. -// -// Since both result in a string_view there's no need to pass this as a -// template argument. -template -void test_string_view(From value, ExpectedR expectedValue) { - auto store = std::make_format_args(value); - std::basic_format_args format_args{store}; - - LIBCPP_ASSERT(format_args.__size() == 1); - assert(format_args.get(0)); - - using CharT = typename Context::char_type; - using To = std::basic_string_view; - using V = std::basic_string; - - std::same_as decltype(auto) result = - format_args.get(0).template visit([v = V(value.begin(), value.end())](auto a) -> ExpectedR { - if constexpr (std::is_same_v) { - assert(v == a); - return make_expected_result(); - } else { - assert(false); - return {}; - } - }); - - assert(result == expectedValue); -} - -template -void test() { - using Context = std::basic_format_context; - std::basic_string empty; - std::basic_string str = MAKE_STRING(CharT, "abc"); - - // Test boolean types. - - test(true, visited); - test(false, visited); - - test(true, "visited"); - test(false, "visited"); - - test(true, 192812079084L); - test(false, 192812079084L); - - // Test CharT types. - - test('a', visited); - test('z', visited); - test('0', visited); - test('9', visited); - - // Test char types. - - if (std::is_same_v) { - // char to char -> char - test('a', visited); - test('z', visited); - test('0', visited); - test('9', visited); - } else { - if (std::is_same_v) { - // char to wchar_t -> wchar_t - test('a', visited); - test('z', visited); - test('0', visited); - test('9', visited); - } else if (std::is_signed_v) { - // char to CharT -> int - // This happens when CharT is a char8_t, char16_t, or char32_t and char - // is a signed type. - // Note if sizeof(CharT) > sizeof(int) this test fails. If there are - // platforms where that occurs extra tests need to be added for char32_t - // testing it against a long long. - test('a', visited); - test('z', visited); - test('0', visited); - test('9', visited); - } else { - // char to CharT -> unsigned - // This happens when CharT is a char8_t, char16_t, or char32_t and char - // is an unsigned type. - // Note if sizeof(CharT) > sizeof(unsigned) this test fails. If there are - // platforms where that occurs extra tests need to be added for char32_t - // testing it against an unsigned long long. - test('a', visited); - test('z', visited); - test('0', visited); - test('9', visited); - } - } - - // Test signed integer types. 
- - test(std::numeric_limits::min(), visited); - test(0, visited); - test(std::numeric_limits::max(), visited); - - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::min(), visited); - test(0, visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::min(), visited); - test(0, visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - - using LongToType = std::conditional_t; - - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::min(), visited); - test(0, visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::min(), visited); - test(0, visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - -#ifndef TEST_HAS_NO_INT128 - test_handle(0, visited); -#endif // TEST_HAS_NO_INT128 - - // Test unsigned integer types. - - test(0, visited); - test(std::numeric_limits::max(), visited); - - test(0, visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - - test(0, visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - - using UnsignedLongToType = - std::conditional_t; - - test(0, visited); - test( - std::numeric_limits::max(), visited); - test( - std::numeric_limits::max(), visited); - test(std::numeric_limits::max(), visited); - test( - std::numeric_limits::max(), visited); - - test(0, visited); - test( - std::numeric_limits::max(), visited); - test( - std::numeric_limits::max(), visited); - test( - std::numeric_limits::max(), visited); - test( - std::numeric_limits::max(), visited); - test( - std::numeric_limits::max(), visited); - -#ifndef TEST_HAS_NO_INT128 - test_handle(0, visited); -#endif // TEST_HAS_NO_INT128 - - // Test floating point types. - - test(-std::numeric_limits::max(), visited); - test(-std::numeric_limits::min(), visited); - test(-0.0, visited); - test(0.0, visited); - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::max(), visited); - - test(-std::numeric_limits::max(), visited); - test(-std::numeric_limits::min(), visited); - test(-0.0, visited); - test(0.0, visited); - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::max(), visited); - - test(-std::numeric_limits::max(), visited); - test(-std::numeric_limits::min(), visited); - test(-0.0, visited); - test(0.0, visited); - test(std::numeric_limits::min(), visited); - test(std::numeric_limits::max(), visited); - - // Test const CharT pointer types. - - test(empty.c_str(), visited); - test(str.c_str(), visited); - - // Test string_view types. 
- - { - using From = std::basic_string_view; - - test_string_view(From(), visited); - test_string_view(From(empty.c_str()), visited); - test_string_view(From(str.c_str()), visited); - } - { - using From = std::basic_string_view>; - - test_string_view(From(), visited); - test_string_view(From(empty.c_str()), visited); - test_string_view(From(str.c_str()), visited); - } - - // Test string types. - - { - using From = std::basic_string; - - test_string_view(From(), visited); - test_string_view(From(empty.c_str()), visited); - test_string_view(From(str.c_str()), visited); - } - - { - using From = std::basic_string, std::allocator>; - - test_string_view(From(), visited); - test_string_view(From(empty.c_str()), visited); - test_string_view(From(str.c_str()), visited); - } - - { - using From = std::basic_string, min_allocator>; - - test_string_view(From(), visited); - test_string_view(From(empty.c_str()), visited); - test_string_view(From(str.c_str()), visited); - } - - { - using From = std::basic_string, min_allocator>; - - test_string_view(From(), visited); - test_string_view(From(empty.c_str()), visited); - test_string_view(From(str.c_str()), visited); - } - - // Test pointer types. - - test(nullptr, visited); - int i = 0; - test(static_cast(&i), visited); - const int ci = 0; - test(static_cast(&ci), visited); -} - -int main(int, char**) { - test(); -#ifndef TEST_HAS_NO_WIDE_CHARACTERS - test(); -#endif - - return 0; -} diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp deleted file mode 100644 index acd9228369e60..0000000000000 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.deprecated.verify.cpp +++ /dev/null @@ -1,38 +0,0 @@ -//===----------------------------------------------------------------------===// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// UNSUPPORTED: c++03, c++11, c++14, c++17, c++20, c++23 -// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME -// UNSUPPORTED: clang-16 || clang-17 -// XFAIL: apple-clang - -// - -// template -// see below visit_format_arg(Visitor&& vis, basic_format_arg arg); - -#include -#include - -#include "test_macros.h" - -template -void test(From value) { - using Context = std::basic_format_context; - auto store = std::make_format_args(value); - std::basic_format_args format_args{store}; - - // expected-warning-re@+1 1-2 {{std::basic_format_context{{.*}}' is deprecated}} - std::ignore = std::visit_format_arg([]([[maybe_unused]] auto a) -> To { return {}; }, format_args.get(0)); -} - -void test() { - test('a'); -#ifndef TEST_HAS_NO_WIDE_CHARACTERS - test('a'); -#endif -} diff --git a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp index 3497d8935c8d6..3ddf2d0ff732a 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.arg/visit_format_arg.pass.cpp @@ -11,7 +11,7 @@ // // template -// see below visit_format_arg(Visitor&& vis, basic_format_arg arg); // Deprecated in C++26 +// see below visit_format_arg(Visitor&& vis, basic_format_arg arg); #include #include @@ -23,10 +23,6 @@ #include "make_string.h" #include "min_allocator.h" -#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) -TEST_CLANG_DIAGNOSTIC_IGNORED("-Wdeprecated-declarations") -#endif - template void test(From value) { auto store = std::make_format_args(value); diff --git a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp index a5d3703397f4e..0f3931adf54df 100644 --- a/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp +++ b/libcxx/test/std/utilities/format/format.arguments/format.args/get.pass.cpp @@ -24,17 +24,14 @@ void test(From value) { auto store = std::make_format_args(value); const std::basic_format_args format_args{store}; - auto visitor = [v = To(value)](auto a) { - if constexpr (std::is_same_v) - assert(v == a); - else - assert(false); - }; -#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) - format_args.get(0).visit(visitor); -#else - std::visit_format_arg(visitor, format_args.get(0)); -#endif + std::visit_format_arg( + [v = To(value)](auto a) { + if constexpr (std::is_same_v) + assert(v == a); + else + assert(false); + }, + format_args.get(0)); } // Some types, as an extension, are stored in the variant. The Standard @@ -44,12 +41,9 @@ void test_handle(T value) { auto store = std::make_format_args(value); std::basic_format_args format_args{store}; - auto visitor = [](auto a) { assert((std::is_same_v::handle>)); }; -#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) - format_args.get(0).visit(visitor); -#else - std::visit_format_arg(visitor, format_args.get(0)); -#endif + std::visit_format_arg( + [](auto a) { assert((std::is_same_v::handle>)); }, + format_args.get(0)); } // Test specific for string and string_view. 
@@ -64,18 +58,14 @@ void test_string_view(From value) { using CharT = typename Context::char_type; using To = std::basic_string_view; using V = std::basic_string; - - auto visitor = [v = V(value.begin(), value.end())](auto a) { - if constexpr (std::is_same_v) - assert(v == a); - else - assert(false); - }; -#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) - format_args.get(0).visit(visitor); -#else - std::visit_format_arg(visitor, format_args.get(0)); -#endif + std::visit_format_arg( + [v = V(value.begin(), value.end())](auto a) { + if constexpr (std::is_same_v) + assert(v == a); + else + assert(false); + }, + format_args.get(0)); } template diff --git a/libcxx/test/support/test_basic_format_arg.h b/libcxx/test/support/test_basic_format_arg.h index 1ec719a48b6c8..d8547d34b6db4 100644 --- a/libcxx/test/support/test_basic_format_arg.h +++ b/libcxx/test/support/test_basic_format_arg.h @@ -7,22 +7,18 @@ #include #include -#include #include "test_macros.h" /// Returns whether the basic_format_arg contains a type T with the expected value. template bool test_basic_format_arg(std::basic_format_arg arg, T expected) { - auto visitor = [expected](auto a) { - if constexpr (std::same_as) - return a == expected; - else - return false; - }; -#if TEST_STD_VER >= 26 && defined(TEST_HAS_EXPLICIT_THIS_PARAMETER) - return arg.visit(std::move(visitor)); -#else - return std::visit_format_arg(std::move(visitor), arg); -#endif + return std::visit_format_arg( + [expected](auto a) { + if constexpr (std::same_as) + return a == expected; + else + return false; + }, + arg); } diff --git a/libcxx/test/support/test_macros.h b/libcxx/test/support/test_macros.h index 24f69c758f365..fe68e13de6bcb 100644 --- a/libcxx/test/support/test_macros.h +++ b/libcxx/test/support/test_macros.h @@ -470,9 +470,4 @@ inline Tp const& DoNotOptimize(Tp const& value) { # define TEST_IF_AIX(arg_true, arg_false) arg_false #endif -// Clang-18 has support for deducing this, but it does not set the FTM. -#ifdef _LIBCPP_HAS_EXPLICIT_THIS_PARAMETER -# define TEST_HAS_EXPLICIT_THIS_PARAMETER -#endif - #endif // SUPPORT_TEST_MACROS_HPP diff --git a/libcxx/utils/generate_feature_test_macro_components.py b/libcxx/utils/generate_feature_test_macro_components.py index 065b70620cd17..b7d95d451f213 100755 --- a/libcxx/utils/generate_feature_test_macro_components.py +++ b/libcxx/utils/generate_feature_test_macro_components.py @@ -467,7 +467,6 @@ def add_version_header(tc): # "c++20": 202110 Not implemented P2372R3 Fixing locale handling in chrono formatters "c++20": 202106, # "c++23": 202207, Not implemented P2419R2 Clarify handling of encodings in localized formatting of chrono types - # "c++26": 202306, P2637R3 Member Visit (implemented) # "c++26": 202311, P2918R2 Runtime format strings II (implemented) }, # Note these three papers are adopted at the June 2023 meeting and have sequential numbering From 0fe20aa45eab5f66ac88776f068b7f41b8204aaf Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Mon, 22 Jan 2024 12:45:35 -0500 Subject: [PATCH 439/843] [libc] support PIE relocations (#78993) For some reason, we are using `-fpie` (libc/cmake/modules/LLVMLibCObjectRules.cmake:31) without supporting it. According to @lntue, some of the hermetic tests are broken without proper PIE support. This patch implements basic relocation support for PIE.
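To make "basic relocation support" concrete: for a PIE binary, the startup code must first recover the load base (the delta between run-time and link-time addresses) before any link-time `p_vaddr` can be used. A minimal sketch of the idea follows; the helper name is hypothetical and this is a simplification of the approach in this patch, not the verbatim libc code:

    #include <cstddef> // ptrdiff_t, size_t
    #include <elf.h>   // Elf64_Phdr, PT_PHDR

    // Sketch: recover the run-time slide of a PIE executable. PT_PHDR holds
    // the link-time virtual address of the program header table itself, so
    // subtracting it from the table's run-time address yields the base that
    // must be added to every link-time p_vaddr (e.g. the TLS segment below).
    static ptrdiff_t compute_base(const Elf64_Phdr *phdrs, size_t count) {
      for (size_t i = 0; i < count; ++i)
        if (phdrs[i].p_type == PT_PHDR)
          return reinterpret_cast<ptrdiff_t>(phdrs) - phdrs[i].p_vaddr;
      return 0; // No PT_PHDR: assume the binary runs at its link address.
    }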
--- libc/startup/linux/do_start.cpp | 29 +++++++++++++++++++++-------- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/libc/startup/linux/do_start.cpp b/libc/startup/linux/do_start.cpp index 05dbd4488f588..10b215cdc27b7 100644 --- a/libc/startup/linux/do_start.cpp +++ b/libc/startup/linux/do_start.cpp @@ -29,6 +29,11 @@ extern uintptr_t __init_array_start[]; extern uintptr_t __init_array_end[]; extern uintptr_t __fini_array_start[]; extern uintptr_t __fini_array_end[]; +// https://refspecs.linuxbase.org/elf/gabi4+/ch5.dynamic.html#dynamic_section +// This symbol is provided by the dynamic linker. It can be undefined depending +// on how the program is loaded exactly. +[[gnu::weak, + gnu::visibility("hidden")]] extern const Elf64_Dyn _DYNAMIC[]; // NOLINT } namespace LIBC_NAMESPACE { @@ -94,18 +99,26 @@ static ThreadAttributes main_thread_attrib; } } + ptrdiff_t base = 0; app.tls.size = 0; + Elf64_Phdr *tls_phdr = nullptr; + for (uintptr_t i = 0; i < program_hdr_count; ++i) { - Elf64_Phdr *phdr = program_hdr_table + i; - if (phdr->p_type != PT_TLS) - continue; - // TODO: p_vaddr value has to be adjusted for static-pie executables. - app.tls.address = phdr->p_vaddr; - app.tls.size = phdr->p_memsz; - app.tls.init_size = phdr->p_filesz; - app.tls.align = phdr->p_align; + Elf64_Phdr &phdr = program_hdr_table[i]; + if (phdr.p_type == PT_PHDR) + base = reinterpret_cast(program_hdr_table) - phdr.p_vaddr; + if (phdr.p_type == PT_DYNAMIC && _DYNAMIC) + base = reinterpret_cast(_DYNAMIC) - phdr.p_vaddr; + if (phdr.p_type == PT_TLS) + tls_phdr = &phdr; + // TODO: adjust PT_GNU_STACK } + app.tls.address = tls_phdr->p_vaddr + base; + app.tls.size = tls_phdr->p_memsz; + app.tls.init_size = tls_phdr->p_filesz; + app.tls.align = tls_phdr->p_align; + // This descriptor has to be static since its cleanup function cannot // capture the context. static TLSDescriptor tls; From 89aa3355e2366c6cf6ad9ac1f4693ff950d86c30 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Mon, 22 Jan 2024 18:04:07 +0000 Subject: [PATCH 440/843] [RemoveDIs][DebugInfo] Remove redundant DPVAssigns (#78574) DPValues are already supported by most of the utilities that remove redundant debug info after certain passes; the exception to this is `removeUndefDbgAssignsFromEntryBlock`, which applies only to llvm.dbg.assign intrinsics, which were previously unimplemented for DPValues. Now that DPVAssigns exist, we have to support removing redundant instances in the same way, which this patch implements. --- llvm/include/llvm/IR/DebugInfo.h | 6 ++ llvm/lib/Transforms/Utils/BasicBlockUtils.cpp | 71 +++++++++++++++++-- .../instcombine/remove-redundant-dbg.ll | 2 + .../assignment-tracking/remove-redundant.ll | 2 + .../sroa/remove-redundant-dbg.ll | 2 + 5 files changed, 76 insertions(+), 7 deletions(-) diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h index 55b8ce0998413..8f9f2594541a0 100644 --- a/llvm/include/llvm/IR/DebugInfo.h +++ b/llvm/include/llvm/IR/DebugInfo.h @@ -193,6 +193,12 @@ inline AssignmentInstRange getAssignmentInsts(const DbgAssignIntrinsic *DAI) { return getAssignmentInsts(DAI->getAssignID()); } +inline AssignmentInstRange getAssignmentInsts(const DPValue *DPV) { + assert(DPV->isDbgAssign() && + "Can't get assignment instructions for non-assign DPV!"); + return getAssignmentInsts(DPV->getAssignID()); +} + // // Utilities for enumerating llvm.dbg.assign intrinsic from an assignment ID.
// diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp index e019f76c8cacc..ec0482ac2cdeb 100644 --- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp +++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp @@ -405,10 +405,17 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingBackwardScan(BasicBlock *BB) { // If the same variable fragment is described more than once it is enough // to keep the last one (i.e. the first found since we for reverse // iteration). - // FIXME: add assignment tracking support (see parallel implementation - // below). - if (!R.second) - ToBeRemoved.push_back(&DPV); + if (R.second) + continue; + + if (DPV.isDbgAssign()) { + // Don't delete dbg.assign intrinsics that are linked to instructions. + if (!at::getAssignmentInsts(&DPV).empty()) + continue; + // Unlinked dbg.assign intrinsics can be treated like dbg.values. + } + + ToBeRemoved.push_back(&DPV); continue; } // Sequence with consecutive dbg.value instrs ended. Clear the map to @@ -495,14 +502,25 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) { DebugVariable Key(DPV.getVariable(), std::nullopt, DPV.getDebugLoc()->getInlinedAt()); auto VMI = VariableMap.find(Key); + // A dbg.assign with no linked instructions can be treated like a + // dbg.value (i.e. can be deleted). + bool IsDbgValueKind = + (!DPV.isDbgAssign() || at::getAssignmentInsts(&DPV).empty()); + // Update the map if we found a new value/expression describing the // variable, or if the variable wasn't mapped already. SmallVector Values(DPV.location_ops()); if (VMI == VariableMap.end() || VMI->second.first != Values || VMI->second.second != DPV.getExpression()) { - VariableMap[Key] = {Values, DPV.getExpression()}; + if (IsDbgValueKind) + VariableMap[Key] = {Values, DPV.getExpression()}; + else + VariableMap[Key] = {Values, nullptr}; continue; } + // Don't delete dbg.assign intrinsics that are linked to instructions. + if (!IsDbgValueKind) + continue; // Found an identical mapping. Remember the instruction for later removal. ToBeRemoved.push_back(&DPV); } @@ -514,6 +532,42 @@ static bool DPValuesRemoveRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) { return !ToBeRemoved.empty(); } +static bool DPValuesRemoveUndefDbgAssignsFromEntryBlock(BasicBlock *BB) { + assert(BB->isEntryBlock() && "expected entry block"); + SmallVector ToBeRemoved; + DenseSet SeenDefForAggregate; + // Returns the DebugVariable for DVI with no fragment info. + auto GetAggregateVariable = [](const DPValue &DPV) { + return DebugVariable(DPV.getVariable(), std::nullopt, + DPV.getDebugLoc().getInlinedAt()); + }; + + // Remove undef dbg.assign intrinsics that are encountered before + // any non-undef intrinsics from the entry block. 
+ for (auto &I : *BB) { + for (DPValue &DPV : I.getDbgValueRange()) { + if (!DPV.isDbgValue() && !DPV.isDbgAssign()) + continue; + bool IsDbgValueKind = + (DPV.isDbgValue() || at::getAssignmentInsts(&DPV).empty()); + DebugVariable Aggregate = GetAggregateVariable(DPV); + if (!SeenDefForAggregate.contains(Aggregate)) { + bool IsKill = DPV.isKillLocation() && IsDbgValueKind; + if (!IsKill) { + SeenDefForAggregate.insert(Aggregate); + } else if (DPV.isDbgAssign()) { + ToBeRemoved.push_back(&DPV); + } + } + } + } + + for (DPValue *DPV : ToBeRemoved) + DPV->eraseFromParent(); + + return !ToBeRemoved.empty(); +} + static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) { if (BB->IsNewDbgInfoFormat) return DPValuesRemoveRedundantDbgInstrsUsingForwardScan(BB); @@ -578,7 +632,10 @@ static bool removeRedundantDbgInstrsUsingForwardScan(BasicBlock *BB) { /// then (only) the instruction marked with (*) can be removed. /// Possible improvements: /// - Keep track of non-overlapping fragments. -static bool remomveUndefDbgAssignsFromEntryBlock(BasicBlock *BB) { +static bool removeUndefDbgAssignsFromEntryBlock(BasicBlock *BB) { + if (BB->IsNewDbgInfoFormat) + return DPValuesRemoveUndefDbgAssignsFromEntryBlock(BB); + assert(BB->isEntryBlock() && "expected entry block"); SmallVector ToBeRemoved; DenseSet SeenDefForAggregate; @@ -629,7 +686,7 @@ bool llvm::RemoveRedundantDbgInstrs(BasicBlock *BB) { MadeChanges |= removeRedundantDbgInstrsUsingBackwardScan(BB); if (BB->isEntryBlock() && isAssignmentTrackingEnabled(*BB->getParent()->getParent())) - MadeChanges |= remomveUndefDbgAssignsFromEntryBlock(BB); + MadeChanges |= removeUndefDbgAssignsFromEntryBlock(BB); MadeChanges |= removeRedundantDbgInstrsUsingForwardScan(BB); if (MadeChanges) diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll index 11895098179eb..cffac06f8e545 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that sroa removes redundant debug intrinsics after it makes a ;; change. This has a significant positive impact on peak memory and compiler diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll index efb20b6edee2d..24ec3e94ed275 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=redundant-dbg-inst-elim -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=redundant-dbg-inst-elim -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Hand-written. Test how RemoveRedundantDbgInstrs interacts with dbg.assign ;; intrinsics. FileCehck directives are inline. 
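The rule the changes above implement can be stated compactly. As a sketch (a hypothetical free function; the real code inlines this as the `IsDbgValueKind` locals visible in the diff):

    // A dbg.assign that is no longer linked to any instruction carries no
    // assignment-tracking information, so it may be treated exactly like a
    // plain dbg.value for redundancy elimination; a linked one must be kept.
    static bool canTreatAsDbgValue(const DPValue &DPV) {
      return !DPV.isDbgAssign() || at::getAssignmentInsts(&DPV).empty();
    }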
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll index 11895098179eb..cffac06f8e545 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that sroa removes redundant debug intrinsics after it makes a ;; change. This has a significant positive impact on peak memory and compiler From 19261390cc3d0e58e0130ee4078fd1049889f510 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Mon, 22 Jan 2024 10:04:26 -0800 Subject: [PATCH 441/843] [lld][WebAssembly] Implement `--start-lib`/`--end-lib` (#78821) Fixes: #77960 --- lld/ELF/InputFiles.h | 4 ++-- lld/test/wasm/Inputs/start-lib1.s | 7 +++++++ lld/test/wasm/Inputs/start-lib2.s | 4 ++++ lld/test/wasm/start-lib.s | 33 +++++++++++++++++++++++++++++++ lld/wasm/Driver.cpp | 15 +++++++++++++- lld/wasm/InputFiles.h | 5 ++--- lld/wasm/Options.td | 6 ++++++ lld/wasm/Symbols.h | 6 +++--- 8 files changed, 71 insertions(+), 9 deletions(-) create mode 100644 lld/test/wasm/Inputs/start-lib1.s create mode 100644 lld/test/wasm/Inputs/start-lib2.s create mode 100644 lld/test/wasm/start-lib.s diff --git a/lld/ELF/InputFiles.h b/lld/ELF/InputFiles.h index 833124275b2e1..0cbe00aa396ac 100644 --- a/lld/ELF/InputFiles.h +++ b/lld/ELF/InputFiles.h @@ -129,8 +129,8 @@ class InputFile { uint8_t osabi = 0; uint8_t abiVersion = 0; - // True if this is a relocatable object file/bitcode file between --start-lib - // and --end-lib. + // True if this is a relocatable object file/bitcode file in an ar archive + // or between --start-lib and --end-lib. bool lazy = false; // True if this is an argument for --just-symbols. Usually false. 
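For reference, the semantics mirror the ELF linker: objects between the two flags are treated as lazy, exactly like archive members, and participate in the link only when they resolve an otherwise-undefined symbol. An illustrative invocation (hypothetical file names, not taken from the tests below):

    # foo.o and bar.o behave like members of an archive: each is pulled in
    # only if it defines a symbol that main.o (or another extracted object)
    # leaves undefined.
    wasm-ld main.o --start-lib foo.o bar.o --end-lib -o app.wasm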
diff --git a/lld/test/wasm/Inputs/start-lib1.s b/lld/test/wasm/Inputs/start-lib1.s new file mode 100644 index 0000000000000..229f67a4bd897 --- /dev/null +++ b/lld/test/wasm/Inputs/start-lib1.s @@ -0,0 +1,7 @@ +.functype bar () -> () + +.globl foo +foo: + .functype foo () -> () + call bar + end_function diff --git a/lld/test/wasm/Inputs/start-lib2.s b/lld/test/wasm/Inputs/start-lib2.s new file mode 100644 index 0000000000000..28cfa6f637c58 --- /dev/null +++ b/lld/test/wasm/Inputs/start-lib2.s @@ -0,0 +1,4 @@ +.globl bar +bar: + .functype bar () -> () + end_function diff --git a/lld/test/wasm/start-lib.s b/lld/test/wasm/start-lib.s new file mode 100644 index 0000000000000..8bed63cdcf494 --- /dev/null +++ b/lld/test/wasm/start-lib.s @@ -0,0 +1,33 @@ +// Based on lld/test/ELF/start-lib.s + +// RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown %s -o %t1.o +// RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown \ +// RUN: %p/Inputs/start-lib1.s -o %t2.o +// RUN: llvm-mc -filetype=obj -triple=wasm32-unknown-unknown \ +// RUN: %p/Inputs/start-lib2.s -o %t3.o + +// RUN: wasm-ld --no-gc-sections -o %t3 %t1.o %t2.o %t3.o +// RUN: obj2yaml %t3 | FileCheck --check-prefix=TEST1 %s +// TEST1: Name: foo +// TEST1: Name: bar + +// RUN: wasm-ld --no-gc-sections -o %t3 %t1.o -u bar --start-lib %t2.o %t3.o +// RUN: obj2yaml %t3 | FileCheck --check-prefix=TEST2 %s +// TEST2-NOT: Name: foo +// TEST2: Name: bar + +// RUN: wasm-ld --no-gc-sections -o %t3 %t1.o --start-lib %t2.o %t3.o +// RUN: obj2yaml %t3 | FileCheck --check-prefix=TEST3 %s +// TEST3-NOT: Name: foo +// TEST3-NOT: Name: bar + +// RUN: not wasm-ld %t1.o --start-lib --start-lib 2>&1 | FileCheck -check-prefix=NESTED-LIB %s +// NESTED-LIB: nested --start-lib + +// RUN: not wasm-ld --end-lib 2>&1 | FileCheck -check-prefix=END %s +// END: stray --end-lib + +.globl _start +_start: + .functype _start () -> () + end_function diff --git a/lld/wasm/Driver.cpp b/lld/wasm/Driver.cpp index 88a3db75b54a1..635f19f78b15e 100644 --- a/lld/wasm/Driver.cpp +++ b/lld/wasm/Driver.cpp @@ -94,6 +94,9 @@ class LinkerDriver { // True if we are in --whole-archive and --no-whole-archive. bool inWholeArchive = false; + // True if we are in --start-lib and --end-lib. + bool inLib = false; + std::vector files; }; } // anonymous namespace @@ -304,7 +307,7 @@ void LinkerDriver::addFile(StringRef path) { } case file_magic::bitcode: case file_magic::wasm_object: - files.push_back(createObjectFile(mbref)); + files.push_back(createObjectFile(mbref, "", 0, inLib)); break; case file_magic::unknown: if (mbref.getBuffer().starts_with("#STUB")) { @@ -375,6 +378,16 @@ void LinkerDriver::createFiles(opt::InputArgList &args) { case OPT_no_whole_archive: inWholeArchive = false; break; + case OPT_start_lib: + if (inLib) + error("nested --start-lib"); + inLib = true; + break; + case OPT_end_lib: + if (!inLib) + error("stray --end-lib"); + inLib = false; + break; } } if (files.empty() && errorCount() == 0) diff --git a/lld/wasm/InputFiles.h b/lld/wasm/InputFiles.h index fd3d5e5ef4796..a129be36515d0 100644 --- a/lld/wasm/InputFiles.h +++ b/lld/wasm/InputFiles.h @@ -67,9 +67,8 @@ class InputFile { void markLive() { live = true; } bool isLive() const { return live; } - // True if this file is exists as in an archive file and has not yet been - // extracted. - // TODO(sbc): Use this to implement --start-lib/--end-lib. + // True if this is a relocatable object file/bitcode file in an ar archive + // or between --start-lib and --end-lib. 
bool lazy = false; protected: diff --git a/lld/wasm/Options.td b/lld/wasm/Options.td index 95ebc202a4518..8190717cef63b 100644 --- a/lld/wasm/Options.td +++ b/lld/wasm/Options.td @@ -70,6 +70,9 @@ defm export_dynamic: B<"export-dynamic", "Put symbols in the dynamic symbol table", "Do not put symbols in the dynamic symbol table (default)">; +def end_lib: F<"end-lib">, + HelpText<"End a grouping of objects that should be treated as if they were together in an archive">; + def entry: S<"entry">, MetaVarName<"">, HelpText<"Name of entry point symbol">; @@ -128,6 +131,9 @@ defm rsp_quoting: Eq<"rsp-quoting", "Quoting style for response files">, def shared: F<"shared">, HelpText<"Build a shared object">; +def start_lib: F<"start-lib">, + HelpText<"Start a grouping of objects that should be treated as if they were together in an archive">; + def strip_all: F<"strip-all">, HelpText<"Strip all symbols">; def strip_debug: F<"strip-debug">, HelpText<"Strip debugging information">; diff --git a/lld/wasm/Symbols.h b/lld/wasm/Symbols.h index de52c92d34e78..38586bbd13236 100644 --- a/lld/wasm/Symbols.h +++ b/lld/wasm/Symbols.h @@ -486,9 +486,9 @@ class UndefinedTag : public TagSymbol { static bool classof(const Symbol *s) { return s->kind() == UndefinedTagKind; } }; -// LazySymbol represents a symbol that is not yet in the link, but we know where -// to find it if needed. If the resolver finds both Undefined and Lazy for the -// same name, it will ask the Lazy to load a file. +// LazySymbol symbols represent symbols in object files between --start-lib and +// --end-lib options. LLD also handles traditional archives as if all the files +// in the archive are surrounded by --start-lib and --end-lib. // // A special complication is the handling of weak undefined symbols. They should // not load a file, but we have to remember we have seen both the weak undefined From 042a6a1349d512edaaa225380771c64a8d92810a Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Mon, 22 Jan 2024 19:06:15 +0100 Subject: [PATCH 442/843] [libc++][chrono] Fixes (sys|local)_time formatters. (#76456) - The sys_time formatter is constrained, which was not implemented. - There is a sys_days formatter which was not implemented. - The local_time formatter uses the sys_time formatter in its implementation so "inherited" the same issues. 
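
A minimal sketch of the resulting behavior (not part of the patch; the expected output strings are taken from the tests updated below):

#include <chrono>
#include <iostream>

int main() {
  using namespace std::chrono;

  // sys_days now streams through its own dedicated overload and prints
  // only the date.
  sys_days day = year{2009} / 2 / 13;
  std::cout << day << '\n'; // 2009-02-13

  // A sys_time with a duration finer than a day still prints date and time.
  sys_time<minutes> tp = day + 23h + 31min;
  std::cout << tp << '\n'; // 2009-02-13 23:31:00

  // Floating-point durations no longer satisfy the sys_time constraint,
  // so the following would fail to compile:
  // std::cout << sys_time<duration<double>>{duration<double>{60.0}};
}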
Fixes: https://github.com/llvm/llvm-project/issues/73849 Fixes: https://github.com/llvm/llvm-project/issues/67983 --- libcxx/include/__chrono/ostream.h | 9 +- libcxx/include/chrono | 4 + .../time.clock.local/ostream.pass.cpp | 39 ++-- .../time.clock.local/ostream.verify.cpp | 89 +++++++++ .../sys_date.ostream.pass.cpp | 175 ++++++++++++++++++ ...eam.pass.cpp => sys_time.ostream.pass.cpp} | 56 +++--- 6 files changed, 314 insertions(+), 58 deletions(-) create mode 100644 libcxx/test/std/time/time.clock/time.clock.local/ostream.verify.cpp create mode 100644 libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp rename libcxx/test/std/time/time.clock/time.clock.system/{ostream.pass.cpp => sys_time.ostream.pass.cpp} (74%) diff --git a/libcxx/include/__chrono/ostream.h b/libcxx/include/__chrono/ostream.h index f171944b5cab3..b687ef8059d5f 100644 --- a/libcxx/include/__chrono/ostream.h +++ b/libcxx/include/__chrono/ostream.h @@ -42,11 +42,18 @@ _LIBCPP_BEGIN_NAMESPACE_STD namespace chrono { template + requires(!treat_as_floating_point_v && _Duration{1} < days{1}) _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& -operator<<(basic_ostream<_CharT, _Traits>& __os, const sys_time<_Duration> __tp) { +operator<<(basic_ostream<_CharT, _Traits>& __os, const sys_time<_Duration>& __tp) { return __os << std::format(__os.getloc(), _LIBCPP_STATICALLY_WIDEN(_CharT, "{:L%F %T}"), __tp); } +template +_LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& +operator<<(basic_ostream<_CharT, _Traits>& __os, const sys_days& __dp) { + return __os << year_month_day{__dp}; +} + template _LIBCPP_HIDE_FROM_ABI basic_ostream<_CharT, _Traits>& operator<<(basic_ostream<_CharT, _Traits>& __os, const file_time<_Duration> __tp) { diff --git a/libcxx/include/chrono b/libcxx/include/chrono index b3ed9acc5e5de..c80fa78a56ba1 100644 --- a/libcxx/include/chrono +++ b/libcxx/include/chrono @@ -296,6 +296,10 @@ template // C++20 basic_ostream& operator<<(basic_ostream& os, const sys_time& tp); +template // C++20 + basic_ostream& + operator<<(basic_ostream& os, const sys_days& dp); + class file_clock // C++20 { public: diff --git a/libcxx/test/std/time/time.clock/time.clock.local/ostream.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.local/ostream.pass.cpp index 4f4fd3f40e23b..9fdef8d5adc78 100644 --- a/libcxx/test/std/time/time.clock/time.clock.local/ostream.pass.cpp +++ b/libcxx/test/std/time/time.clock/time.clock.local/ostream.pass.cpp @@ -75,9 +75,11 @@ static void test_c() { assert(stream_c_locale(std::chrono::local_time{20'576'131min}) == SV("2009-02-13 23:31:00")); assert(stream_c_locale(std::chrono::local_time{342'935h}) == SV("2009-02-13 23:00:00")); - assert(stream_c_locale(std::chrono::local_days{std::chrono::days{14'288}}) == SV("2009-02-13 00:00:00")); + + // These switch to the sys_days formatter, which omits the time.
+ assert(stream_c_locale(std::chrono::local_days{std::chrono::days{14'288}}) == SV("2009-02-13")); assert(stream_c_locale(std::chrono::local_time{std::chrono::weeks{2041}}) == - SV("2009-02-12 00:00:00")); + SV("2009-02-12")); assert(stream_c_locale(std::chrono::local_time>>{ std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); @@ -89,13 +91,6 @@ static void test_c() { std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01.1")); assert(stream_c_locale(std::chrono::local_time>>{ std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:30.10")); - - assert(stream_c_locale(std::chrono::local_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:02:03")); - assert(stream_c_locale(std::chrono::local_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:12.3")); - assert(stream_c_locale(std::chrono::local_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:01.23")); } template @@ -114,9 +109,11 @@ static void test_fr_FR() { SV("2009-02-13 23:31:00")); assert(stream_fr_FR_locale(std::chrono::local_time{342'935h}) == SV("2009-02-13 23:00:00")); - assert(stream_fr_FR_locale(std::chrono::local_days{std::chrono::days{14'288}}) == SV("2009-02-13 00:00:00")); + + // These switch to the sys_days formatter, which omits the time. + assert(stream_fr_FR_locale(std::chrono::local_days{std::chrono::days{14'288}}) == SV("2009-02-13")); assert(stream_fr_FR_locale(std::chrono::local_time{std::chrono::weeks{2041}}) == - SV("2009-02-12 00:00:00")); + SV("2009-02-12")); assert(stream_fr_FR_locale(std::chrono::local_time>>{ std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); @@ -128,13 +125,6 @@ static void test_fr_FR() { std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01,1")); assert(stream_fr_FR_locale(std::chrono::local_time>>{ std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:30,10")); - - assert(stream_fr_FR_locale(std::chrono::local_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:02:03")); - assert(stream_fr_FR_locale(std::chrono::local_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:12,3")); - assert(stream_fr_FR_locale(std::chrono::local_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:01,23")); } template @@ -153,9 +143,11 @@ static void test_ja_JP() { SV("2009-02-13 23:31:00")); assert(stream_ja_JP_locale(std::chrono::local_time{342'935h}) == SV("2009-02-13 23:00:00")); - assert(stream_ja_JP_locale(std::chrono::local_days{std::chrono::days{14'288}}) == SV("2009-02-13 00:00:00")); + + // These switch to the sys_days formatter, which omits the time.
+ assert(stream_ja_JP_locale(std::chrono::local_days{std::chrono::days{14'288}}) == SV("2009-02-13")); assert(stream_ja_JP_locale(std::chrono::local_time{std::chrono::weeks{2041}}) == - SV("2009-02-12 00:00:00")); + SV("2009-02-12")); assert(stream_ja_JP_locale(std::chrono::local_time>>{ std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); @@ -167,13 +159,6 @@ static void test_ja_JP() { std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01.1")); assert(stream_ja_JP_locale(std::chrono::local_time>>{ std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:30.10")); - - assert(stream_ja_JP_locale(std::chrono::local_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:02:03")); - assert(stream_ja_JP_locale(std::chrono::local_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:12.3")); - assert(stream_ja_JP_locale(std::chrono::local_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:01.23")); } template diff --git a/libcxx/test/std/time/time.clock/time.clock.local/ostream.verify.cpp b/libcxx/test/std/time/time.clock/time.clock.local/ostream.verify.cpp new file mode 100644 index 0000000000000..aa348b06947aa --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.local/ostream.verify.cpp @@ -0,0 +1,89 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-localization +// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME + +// TODO FMT This test should not require std::to_chars(floating-point) +// XFAIL: availability-fp_to_chars-missing + +// + +// class system_clock; + +// template +// basic_ostream& +// operator<<(basic_ostream& os, const local_time& tp); + +// The function uses the system_clock which has two overloads + +// template +// basic_ostream& +// operator<<(basic_ostream& os, const sys_time& tp); +// Constraints: treat_as_floating_point_v is false, and Duration{1} < days{1} is true. + +// template +// basic_ostream& +// operator<<(basic_ostream& os, const sys_days& dp); + +// Note local_time's operator<< is specified as +// Effects: os << sys_time{lt.time_since_epoch()}; +// since it uses "Effects" and not "Effects: Equivalent to", the constraints of +// sys_time do not apply to this operator. This means it's not possible to use +// a SFINAE test.
+ +#include +#include +#include +#include + +void test() { + std::stringstream sstr; + + // floating-point values + + sstr << // expected-error@*:* {{invalid operands to binary expression}} + std::chrono::local_time>>{ + std::chrono::duration>{0}}; + + sstr << // expected-error@*:* {{invalid operands to binary expression}} + std::chrono::local_time>>{ + std::chrono::duration>{0}}; + + sstr << // expected-error@*:* {{invalid operands to binary expression}} + std::chrono::local_time>>{ + std::chrono::duration>{0}}; + + // duration >= day + + sstr << // valid since day has its own formatter + std::chrono::local_days{std::chrono::days{0}}; + + using rep = std::conditional_t, long, int>; + sstr << // a different rep does not matter, + std::chrono::local_time>>{ + std::chrono::duration>{0}}; + + sstr << // expected-error@*:* {{invalid operands to binary expression}} + std::chrono::local_time>>{ + std::chrono::duration>{0}}; + + sstr << // These are considered days. + std::chrono::local_time{std::chrono::weeks{3}}; + + sstr << // These too. + std::chrono::local_time>>{ + std::chrono::duration>{0}}; + + sstr << // expected-error@*:* {{invalid operands to binary expression}} + std::chrono::local_time{std::chrono::months{0}}; + + sstr << // expected-error@*:* {{invalid operands to binary expression}} + std::chrono::local_time{std::chrono::years{0}}; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp new file mode 100644 index 0000000000000..7af3ebf776807 --- /dev/null +++ b/libcxx/test/std/time/time.clock/time.clock.system/sys_date.ostream.pass.cpp @@ -0,0 +1,175 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 +// UNSUPPORTED: no-localization +// UNSUPPORTED: GCC-ALWAYS_INLINE-FIXME + +// TODO FMT This test should not require std::to_chars(floating-point) +// XFAIL: availability-fp_to_chars-missing + +// TODO FMT Investigate Windows issues. 
+// XFAIL: msvc + +// REQUIRES: locale.fr_FR.UTF-8 +// REQUIRES: locale.ja_JP.UTF-8 + +// + +// class system_clock; + +// template +// basic_ostream& +// operator<<(basic_ostream& os, const sys_days& dp); + +#include +#include +#include +#include + +#include "make_string.h" +#include "platform_support.h" // locale name macros +#include "test_macros.h" +#include "assert_macros.h" +#include "concat_macros.h" + +#define SV(S) MAKE_STRING_VIEW(CharT, S) + +#define TEST_EQUAL(OUT, EXPECTED) \ + TEST_REQUIRE(OUT == EXPECTED, \ + TEST_WRITE_CONCATENATED( \ + "\nExpression ", #OUT, "\nExpected output ", EXPECTED, "\nActual output ", OUT, '\n')); + +template +static std::basic_string stream_c_locale(const std::chrono::sys_days& dp) { + std::basic_stringstream sstr; + sstr << dp; + return sstr.str(); +} + +template +static std::basic_string stream_fr_FR_locale(const std::chrono::sys_days& dp) { + std::basic_stringstream sstr; + const std::locale locale(LOCALE_fr_FR_UTF_8); + sstr.imbue(locale); + sstr << dp; + return sstr.str(); +} + +template +static std::basic_string stream_ja_JP_locale(const std::chrono::sys_days& dp) { + std::basic_stringstream sstr; + const std::locale locale(LOCALE_ja_JP_UTF_8); + sstr.imbue(locale); + sstr << dp; + return sstr.str(); +} + +template +static void test() { + TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{-32'768}, std::chrono::month{1}, std::chrono::day{1}}}), + SV("-32768-01-01 is not a valid date")); + TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{1}, std::chrono::day{1}}}), + SV("1970-01-01")); + TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}}), + SV("2000-02-29")); + +#if defined(_AIX) + TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), + SV("+32767-12-31")); +#elif defined(_WIN32) // defined(_AIX) + TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), + SV("")); +#else // defined(_AIX) + TEST_EQUAL(stream_c_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), + SV("32767-12-31")); +#endif // defined(_AIX) + + // multiples of days are considered days.
+ TEST_EQUAL(stream_c_locale(std::chrono::sys_time{std::chrono::weeks{3}}), + SV("1970-01-22")); + TEST_EQUAL(stream_c_locale(std::chrono::sys_time>>{ + std::chrono::duration>{1}}), + SV("1970-01-31")); + + TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{-32'768}, std::chrono::month{1}, std::chrono::day{1}}}), + SV("-32768-01-01 is not a valid date")); + TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{1}, std::chrono::day{1}}}), + SV("1970-01-01")); + TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}}), + SV("2000-02-29")); +#if defined(_AIX) + TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), + SV("+32767-12-31")); +#elif defined(_WIN32) // defined(_AIX) + TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), + SV("")); +#else // defined(_AIX) + TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), + SV("32767-12-31")); +#endif // defined(_AIX) + + // multiples of days are considered days. + TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_time{std::chrono::weeks{3}}), + SV("1970-01-22")); + TEST_EQUAL(stream_fr_FR_locale(std::chrono::sys_time>>{ + std::chrono::duration>{1}}), + SV("1970-01-31")); + + TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{-32'768}, std::chrono::month{1}, std::chrono::day{1}}}), + SV("-32768-01-01 is not a valid date")); + TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{1970}, std::chrono::month{1}, std::chrono::day{1}}}), + SV("1970-01-01")); + TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{2000}, std::chrono::month{2}, std::chrono::day{29}}}), + SV("2000-02-29")); +#if defined(_AIX) + TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), + SV("+32767-12-31")); +#elif defined(_WIN32) // defined(_AIX) + TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), + SV("")); +#else // defined(_AIX) + TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_days{ + std::chrono::year_month_day{std::chrono::year{32'767}, std::chrono::month{12}, std::chrono::day{31}}}), + SV("32767-12-31")); +#endif // defined(_AIX) + + // multiples of days are considered days. 
+ TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_time{std::chrono::weeks{3}}), + SV("1970-01-22")); + TEST_EQUAL(stream_ja_JP_locale(std::chrono::sys_time>>{ + std::chrono::duration>{1}}), + SV("1970-01-31")); +} + +int main(int, char**) { + test(); + +#ifndef TEST_HAS_NO_WIDE_CHARACTERS + test(); +#endif + + return 0; +} diff --git a/libcxx/test/std/time/time.clock/time.clock.system/ostream.pass.cpp b/libcxx/test/std/time/time.clock/time.clock.system/sys_time.ostream.pass.cpp similarity index 74% rename from libcxx/test/std/time/time.clock/time.clock.system/ostream.pass.cpp rename to libcxx/test/std/time/time.clock/time.clock.system/sys_time.ostream.pass.cpp index 553b744879619..78d8da57c150a 100644 --- a/libcxx/test/std/time/time.clock/time.clock.system/ostream.pass.cpp +++ b/libcxx/test/std/time/time.clock/time.clock.system/sys_time.ostream.pass.cpp @@ -75,9 +75,6 @@ static void test_c() { assert(stream_c_locale(std::chrono::sys_time{20'576'131min}) == SV("2009-02-13 23:31:00")); assert(stream_c_locale(std::chrono::sys_time{342'935h}) == SV("2009-02-13 23:00:00")); - assert(stream_c_locale(std::chrono::sys_days{std::chrono::days{14'288}}) == SV("2009-02-13 00:00:00")); - assert(stream_c_locale(std::chrono::sys_time{std::chrono::weeks{2041}}) == - SV("2009-02-12 00:00:00")); assert(stream_c_locale(std::chrono::sys_time>>{ std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); @@ -89,13 +86,6 @@ static void test_c() { std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01.1")); assert(stream_c_locale(std::chrono::sys_time>>{ std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:30.10")); - - assert(stream_c_locale(std::chrono::sys_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:02:03")); - assert(stream_c_locale(std::chrono::sys_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:12.3")); - assert(stream_c_locale(std::chrono::sys_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:01.23")); } template @@ -113,9 +103,6 @@ static void test_fr_FR() { assert(stream_fr_FR_locale(std::chrono::sys_time{20'576'131min}) == SV("2009-02-13 23:31:00")); assert(stream_fr_FR_locale(std::chrono::sys_time{342'935h}) == SV("2009-02-13 23:00:00")); - assert(stream_fr_FR_locale(std::chrono::sys_days{std::chrono::days{14'288}}) == SV("2009-02-13 00:00:00")); - assert(stream_fr_FR_locale(std::chrono::sys_time{std::chrono::weeks{2041}}) == - SV("2009-02-12 00:00:00")); assert(stream_fr_FR_locale(std::chrono::sys_time>>{ std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); @@ -127,13 +114,6 @@ static void test_fr_FR() { std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01,1")); assert(stream_fr_FR_locale(std::chrono::sys_time>>{ std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:30,10")); - - assert(stream_fr_FR_locale(std::chrono::sys_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:02:03")); - assert(stream_fr_FR_locale(std::chrono::sys_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:12,3")); - assert(stream_fr_FR_locale(std::chrono::sys_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:01,23")); } template @@ -151,9 +131,6 @@ static void test_ja_JP() { assert(stream_ja_JP_locale(std::chrono::sys_time{20'576'131min}) == SV("2009-02-13 23:31:00")); assert(stream_ja_JP_locale(std::chrono::sys_time{342'935h}) == SV("2009-02-13 23:00:00")); - assert(stream_ja_JP_locale(std::chrono::sys_days{std::chrono::days{14'288}}) == SV("2009-02-13 00:00:00")); - 
assert(stream_ja_JP_locale(std::chrono::sys_time{std::chrono::weeks{2041}}) == - SV("2009-02-12 00:00:00")); assert(stream_ja_JP_locale(std::chrono::sys_time>>{ std::chrono::duration>{60}}) == SV("1970-01-01 00:02:00")); @@ -165,17 +142,36 @@ static void test_ja_JP() { std::chrono::duration>{36611}}) == SV("1970-01-01 01:01:01.1")); assert(stream_ja_JP_locale(std::chrono::sys_time>>{ std::chrono::duration>{12'345'678'9010}}) == SV("2009-02-13 23:31:30.10")); - - assert(stream_ja_JP_locale(std::chrono::sys_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:02:03")); - assert(stream_ja_JP_locale(std::chrono::sys_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:12.3")); - assert(stream_ja_JP_locale(std::chrono::sys_time>>{ - std::chrono::duration>{123.456}}) == SV("1970-01-01 00:00:01.23")); } +template +concept is_ostreamable = requires(std::basic_ostream& os, T const& val) { + { os << val }; +}; + template static void test() { + // Test sys_time's constraints: + // treat_as_floating_point_v is false, and Duration{1} < days{1} is true. + static_assert(is_ostreamable); + static_assert(is_ostreamable>>); + // floating-point types + static_assert(!is_ostreamable>>); + static_assert(!is_ostreamable>>); + static_assert(!is_ostreamable>>); + + static_assert(is_ostreamable); + static_assert(is_ostreamable>>>); + + // duration > days{1} + static_assert(!is_ostreamable>>>); + static_assert(!is_ostreamable>); + static_assert(!is_ostreamable>); + + // multiples of days are considered days. + static_assert(is_ostreamable>>>); + static_assert(is_ostreamable>); + test_c(); test_fr_FR(); test_ja_JP(); From 806f43e3cb9ca2bff7c2ae6f1324a062ddb83cac Mon Sep 17 00:00:00 2001 From: Rajveer Singh Bharadwaj Date: Mon, 22 Jan 2024 23:36:37 +0530 Subject: [PATCH 443/843] [libc++] Diagnoses insufficiently aligned pointers for std::assume_aligned during constant evaluation (#73775) This is a `libc++` enhancement that diagnoses violations of the alignment assumption passed to `__builtin_assume_aligned` during constant evaluation. Fixes #64078 --- libcxx/include/__memory/assume_aligned.h | 1 + .../assume_aligned.const_eval.verify.cpp | 48 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 libcxx/test/libcxx/utilities/memory/ptr.align/assume_aligned.const_eval.verify.cpp diff --git a/libcxx/include/__memory/assume_aligned.h b/libcxx/include/__memory/assume_aligned.h index 5036df7f0f5d5..526eb3334f958 100644 --- a/libcxx/include/__memory/assume_aligned.h +++ b/libcxx/include/__memory/assume_aligned.h @@ -27,6 +27,7 @@ _LIBCPP_NODISCARD _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 _Tp* __ass static_assert(_Np != 0 && (_Np & (_Np - 1)) == 0, "std::assume_aligned(p) requires N to be a power of two"); if (__libcpp_is_constant_evaluated()) { + (void)__builtin_assume_aligned(__ptr, _Np); return __ptr; } else { _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN( diff --git a/libcxx/test/libcxx/utilities/memory/ptr.align/assume_aligned.const_eval.verify.cpp b/libcxx/test/libcxx/utilities/memory/ptr.align/assume_aligned.const_eval.verify.cpp new file mode 100644 index 0000000000000..d63018da7408a --- /dev/null +++ b/libcxx/test/libcxx/utilities/memory/ptr.align/assume_aligned.const_eval.verify.cpp @@ -0,0 +1,48 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: c++03, c++11, c++14, c++17 + +// template +// [[nodiscard]] constexpr T* assume_aligned(T* ptr); + +#include +#include + +template +constexpr bool test() { + char data[1]; + + [[maybe_unused]] auto data1 = std::assume_aligned(data); + + return true; +} + +static_assert(test<2>()); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@* {{alignment of the base pointee object (1 byte) is less than the asserted 2 bytes}} + +static_assert(test<4>()); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@* {{alignment of the base pointee object (1 byte) is less than the asserted 4 bytes}} + +static_assert(test<8>()); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@* {{alignment of the base pointee object (1 byte) is less than the asserted 8 bytes}} + +static_assert(test<16>()); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@* {{alignment of the base pointee object (1 byte) is less than the asserted 16 bytes}} + +static_assert(test<32>()); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@* {{alignment of the base pointee object (1 byte) is less than the asserted 32 bytes}} + +static_assert(test<64>()); +// expected-error@-1 {{static assertion expression is not an integral constant expression}} +// expected-note@* {{alignment of the base pointee object (1 byte) is less than the asserted 64 bytes}} From e390bda9782b461f10433aa6728acf87521e22a5 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 22 Jan 2024 10:09:35 -0800 Subject: [PATCH 444/843] [ELF] Suppress --no-allow-shlib-undefined diagnostic when a SharedSymbol is overridden by a hidden visibility Defined which is later discarded Commit 1981b1b6b92f7579a30c9ed32dbdf3bc749c1b40 unexpectedly strengthened --no-allow-shlib-undefined to catch a kind of ODR violation. More precisely, when all three conditions are met, the new `--no-allow-shlib-undefined` code reports an error. * There is a DSO undef that has been satisfied by a definition from another DSO. * The `SharedSymbol` is overridden by a non-exported (usually of hidden visibility) definition in a relocatable object file (`Defined`). * The section containing the `Defined` is garbage-collected (it is not part of `.dynsym` and is not marked as live). Technically, the hidden Defined in the executable can be intentional: it can be meant to remain non-exported and not interact with any dynamic symbols of the same name that might exist in other DSOs. To allow for such use cases, allocate a new bit in Symbol and relax the --no-allow-shlib-undefined check to before commit 1981b1b6b92f7579a30c9ed32dbdf3bc749c1b40. 
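
A hedged sketch of the now-tolerated scenario, mirroring the updated test below (symbol and file names follow the test; the C++ translation units are illustrative, since the test itself uses assembly inputs):

// a.cpp -> a.so: leaves x1 as an undefined reference in the DSO.
extern int x1;
int f() { return x1; }

// def.cpp -> def.so: the dynamic (exported) definition of x1.
int x1 = 1;

// def-hidden.cpp -> def-hidden.o, linked into the executable: a hidden,
// non-exported definition whose section --gc-sections may discard.
__attribute__((visibility("hidden"))) int x1 = 2;

// With a link such as
//   ld.lld --gc-sections main.o a.so def.so def-hidden.o
// the DSO reference from a.so is satisfied by def.so (dsoDefined), so
// --no-allow-shlib-undefined no longer reports the discarded hidden Defined.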
--- lld/ELF/InputFiles.cpp | 2 ++ lld/ELF/Symbols.h | 10 +++++++--- lld/ELF/Writer.cpp | 7 +++++++ lld/test/ELF/allow-shlib-undefined.s | 2 +- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/lld/ELF/InputFiles.cpp b/lld/ELF/InputFiles.cpp index af42b0381e991..75e5ee1d0da4f 100644 --- a/lld/ELF/InputFiles.cpp +++ b/lld/ELF/InputFiles.cpp @@ -1546,6 +1546,7 @@ template void SharedFile::parse() { auto *s = symtab.addSymbol( SharedSymbol{*this, name, sym.getBinding(), sym.st_other, sym.getType(), sym.st_value, sym.st_size, alignment}); + s->dsoDefined = true; if (s->file == this) s->versionId = ver; } @@ -1563,6 +1564,7 @@ template void SharedFile::parse() { auto *s = symtab.addSymbol( SharedSymbol{*this, saver().save(name), sym.getBinding(), sym.st_other, sym.getType(), sym.st_value, sym.st_size, alignment}); + s->dsoDefined = true; if (s->file == this) s->versionId = idx; } diff --git a/lld/ELF/Symbols.h b/lld/ELF/Symbols.h index ec541de5c4613..c65c5d6cd0dca 100644 --- a/lld/ELF/Symbols.h +++ b/lld/ELF/Symbols.h @@ -298,12 +298,13 @@ class Symbol { // of the symbol. uint8_t scriptDefined : 1; + // True if defined in a DSO. There may also be a definition in a relocatable + // object file. + uint8_t dsoDefined : 1; + // True if defined in a DSO as protected visibility. uint8_t dsoProtected : 1; - // True if targeted by a range extension thunk. - uint8_t thunkAccessed : 1; - // Temporary flags used to communicate which symbol entries need PLT and GOT // entries during postScanRelocations(); std::atomic flags; @@ -320,6 +321,9 @@ class Symbol { uint16_t versionId; uint8_t versionScriptAssigned : 1; + // True if targeted by a range extension thunk. + uint8_t thunkAccessed : 1; + void setFlags(uint16_t bits) { flags.fetch_or(bits, std::memory_order_relaxed); } diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index b18d2239dc58f..6f66f3615fa4a 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2025,6 +2025,11 @@ template void Writer::finalizeSections() { // to catch more cases. That is too much for us. Our approach resembles // the one used in ld.gold, achieves a good balance to be useful but not // too smart. + // + // If a DSO reference is resolved by a SharedSymbol, but the SharedSymbol + // is overridden by a hidden visibility Defined (which is later discarded + // due to GC), don't report the diagnostic. However, this may indicate an + // unintended SharedSymbol. for (SharedFile *file : ctx.sharedFiles) { bool allNeededIsKnown = llvm::all_of(file->dtNeeded, [&](StringRef needed) { @@ -2033,6 +2038,8 @@ template void Writer::finalizeSections() { if (!allNeededIsKnown) continue; for (Symbol *sym : file->requiredSymbols) { + if (sym->dsoDefined) + continue; if (sym->isUndefined() && !sym->isWeak()) { diagnose("undefined reference due to --no-allow-shlib-undefined: " + toString(*sym) + "\n>>> referenced by " + toString(file)); diff --git a/lld/test/ELF/allow-shlib-undefined.s b/lld/test/ELF/allow-shlib-undefined.s index 56b44e144661c..969e87b69eb85 100644 --- a/lld/test/ELF/allow-shlib-undefined.s +++ b/lld/test/ELF/allow-shlib-undefined.s @@ -40,7 +40,7 @@ # RUN: not ld.lld --gc-sections main.o a.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s ## The definition def.so is ignored. 
# RUN: ld.lld -shared def.o -o def.so -# RUN: not ld.lld --gc-sections main.o a.so def.so def-hidden.o -o /dev/null 2>&1 | FileCheck %s +# RUN: ld.lld --gc-sections main.o a.so def.so def-hidden.o --fatal-warnings -o /dev/null # CHECK-NOT: error: # CHECK: error: undefined reference due to --no-allow-shlib-undefined: x1{{$}} From 7378fb30645ad5398491acea3960a8115d1b171c Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Mon, 22 Jan 2024 19:12:05 +0100 Subject: [PATCH 445/843] [libc++] Protect the libc++ implementation from CUDA SDK's __noinline__ macro (#73838) The CUDA SDK contains an unfortunate definition for the `__noinline__` macro. This patch works around it by using `__attribute__((noinline))` instead of `__attribute__((__noinline__))` on CUDA. We are still waiting for a long-term resolution to this issue in NVIDIA/cccl#1235. --- libcxx/include/__config | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/libcxx/include/__config b/libcxx/include/__config index 33a79b9a79f62..9a64cdb489119 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1216,6 +1216,23 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # define _LIBCPP_NOINLINE # endif +# if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__) +// The CUDA SDK contains an unfortunate definition for the __noinline__ macro, +// which breaks the regular __attribute__((__noinline__)) syntax. Therefore, +// when compiling for CUDA we use the non-underscored version of the noinline +// attribute. +// +// This is a temporary workaround and we still expect the CUDA SDK team to solve +// this issue properly in the SDK headers. +// +// See https://github.com/llvm/llvm-project/pull/73838 for more details. +# define _LIBCPP_NOINLINE __attribute__((noinline)) +# elif __has_attribute(__noinline__) +# define _LIBCPP_NOINLINE __attribute__((__noinline__)) +# else +# define _LIBCPP_NOINLINE +# endif + // We often repeat things just for handling wide characters in the library. // When wide characters are disabled, it can be useful to have a quick way of // disabling it without having to resort to #if-#endif, which has a larger From d7fb9eb818d22085c7dae0ce9a8be7ade963d7e5 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Mon, 22 Jan 2024 18:12:24 +0000 Subject: [PATCH 446/843] [DebugInfo][RemoveDIs] Handle DPValues in SelectOptimize (#79005) When there are debug intrinsics in-between groups of select instructions, select-optimise sinks them into the "end" block. This needs to be replicated for DPValues, the non-instruction variable assignment object. Implement that and add a RUN line to a test that was sensitive to this to ensure it gets tested. (The exact range of instructions being transformed here is a little fiddly, hence I've gone with a helper lambda). --- llvm/lib/CodeGen/SelectOptimize.cpp | 17 +++++++++++++++++ llvm/test/CodeGen/X86/select-optimize.ll | 3 +++ 2 files changed, 20 insertions(+) diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index 1316919e65dac..d14283c521554 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -491,6 +491,23 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { DI->moveBeforePreserving(&*EndBlock->getFirstInsertionPt()); } + // Duplicate implementation for DPValues, the non-instruction debug-info + // record. Helper lambda for moving DPValues to the end block. 
+ auto TransferDPValues = [&](Instruction &I) { + for (auto &DPValue : llvm::make_early_inc_range(I.getDbgValueRange())) { + DPValue.removeFromParent(); + EndBlock->insertDPValueBefore(&DPValue, + EndBlock->getFirstInsertionPt()); + } + }; + + // Iterate over all instructions in between SI and LastSI, not including + // SI itself. These are all the variable assignments that happen "in the + // middle" of the select group. + auto R = make_range(std::next(SI->getIterator()), + std::next(LastSI->getIterator())); + llvm::for_each(R, TransferDPValues); + // These are the new basic blocks for the conditional branch. // At least one will become an actual new basic block. BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr; diff --git a/llvm/test/CodeGen/X86/select-optimize.ll b/llvm/test/CodeGen/X86/select-optimize.ll index 6e44065440875..aa04db882f5d4 100644 --- a/llvm/test/CodeGen/X86/select-optimize.ll +++ b/llvm/test/CodeGen/X86/select-optimize.ll @@ -2,6 +2,9 @@ ; RUN: opt -mtriple=x86_64-unknown-unknown -select-optimize -S < %s | FileCheck %s ; RUN: opt -mtriple=x86_64-unknown-unknown -passes='require,function(select-optimize)' -S < %s | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-unknown -select-optimize -S < %s --try-experimental-debuginfo-iterators | FileCheck %s +; RUN: opt -mtriple=x86_64-unknown-unknown -passes='require,function(select-optimize)' -S < %s --try-experimental-debuginfo-iterators | FileCheck %s + ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; ;; Test base heuristic 1: ;; highly-biased selects assumed to be highly predictable, converted to branches From c0a74ad90c34de5e17f713cd2a97223a425dfeb7 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Mon, 22 Jan 2024 10:25:06 -0800 Subject: [PATCH 447/843] [libc] Use QUEUE_TYPEOF in STAILQ_LAST (#79011) This is to ensure this macro is compatible with both C and C++. --- libc/include/llvm-libc-macros/sys-queue-macros.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/libc/include/llvm-libc-macros/sys-queue-macros.h b/libc/include/llvm-libc-macros/sys-queue-macros.h index f41e7363e418b..59e6a9a392c97 100644 --- a/libc/include/llvm-libc-macros/sys-queue-macros.h +++ b/libc/include/llvm-libc-macros/sys-queue-macros.h @@ -160,7 +160,9 @@ #define STAILQ_EMPTY(head) ((head)->first == NULL) #define STAILQ_FIRST(head) ((head)->first) #define STAILQ_LAST(head, type, field) \ - (STAILQ_EMPTY(head) ? NULL : __containerof((head)->last, type, field.next)) + (STAILQ_EMPTY(head) \ + ? NULL \ + : __containerof((head)->last, QUEUE_TYPEOF(type), field.next)) #define STAILQ_NEXT(elem, field) ((elem)->field.next) #define STAILQ_FOREACH(var, head, field) \ From 3eb4178b9cc321da5b74b9bba1a5ad825531e474 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 22 Jan 2024 10:31:29 -0800 Subject: [PATCH 448/843] [mlir][openacc] Update acc.loop to be a proper loop like operation (#67355) The `acc.loop` operation was initially designed as a wrapper that encapsulates a loop-like operation. We now want to change that early design so that `acc.loop` becomes a real loop-like operation itself, implementing the LoopLikeInterface.
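
As a before/after sketch of the surface syntax (distilled from the test updates below; the bounds, types, and attribute values are illustrative):

// Before: acc.loop merely wrapped a loop-like op such as scf.for.
acc.loop {
  scf.for %i = %lb to %ub step %st {
    // loop body
  }
  acc.yield
}

// After: acc.loop carries its own induction variable, bounds, and step.
acc.loop (%iv : i32) = (%lb : i32) to (%ub : i32) step (%st : i32) {
  // loop body
  acc.yield
} attributes { inclusiveUpperbound = array<i1: true> }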
Differential Revision: https://reviews.llvm.org/D159229 This patch is just moved from Phabricator to github --- mlir/include/mlir/Dialect/OpenACC/OpenACC.h | 1 + .../mlir/Dialect/OpenACC/OpenACCOps.td | 33 +- mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp | 110 ++++-- mlir/test/Dialect/OpenACC/canonicalize.mlir | 10 +- mlir/test/Dialect/OpenACC/invalid.mlir | 58 +++- mlir/test/Dialect/OpenACC/ops.mlir | 319 ++++++++---------- 6 files changed, 301 insertions(+), 230 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h index 36daf8de235f3..bb3b9617c24ed 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACC.h +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACC.h @@ -25,6 +25,7 @@ #include "mlir/Dialect/OpenACC/OpenACCTypeInterfaces.h.inc" #include "mlir/Dialect/OpenACCMPCommon/Interfaces/AtomicInterfaces.h" #include "mlir/Interfaces/ControlFlowInterfaces.h" +#include "mlir/Interfaces/LoopLikeInterface.h" #include "mlir/Interfaces/SideEffectInterfaces.h" #define GET_TYPEDEF_CLASSES diff --git a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td index 7344ab2852b9c..992f2809644a6 100644 --- a/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td +++ b/mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td @@ -14,6 +14,7 @@ #define OPENACC_OPS include "mlir/Interfaces/ControlFlowInterfaces.td" +include "mlir/Interfaces/LoopLikeInterface.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/BuiltinTypes.td" include "mlir/IR/EnumAttr.td" @@ -1474,29 +1475,34 @@ def OpenACC_HostDataOp : OpenACC_Op<"host_data", def OpenACC_LoopOp : OpenACC_Op<"loop", [AttrSizedOperandSegments, RecursiveMemoryEffects, - MemoryEffects<[MemWrite]>]> { + MemoryEffects<[MemWrite]>, + DeclareOpInterfaceMethods]> { let summary = "loop construct"; let description = [{ - The "acc.loop" operation represents the OpenACC loop construct. + The "acc.loop" operation represents the OpenACC loop construct. The lower + and upper bounds specify a half-open range: the range includes the lower + bound but does not include the upper bound. If the `inclusive` attribute is + set then the upper bound is included. Example: ```mlir - acc.loop gang vector { - scf.for %arg3 = %c0 to %c10 step %c1 { - scf.for %arg4 = %c0 to %c10 step %c1 { - scf.for %arg5 = %c0 to %c10 step %c1 { - // ... body - } - } - } + acc.loop gang() vector() (%arg3 : index, %arg4 : index, %arg5 : index) = + (%c0, %c0, %c0 : index, index, index) to + (%c10, %c10, %c10 : index, index, index) step + (%c1, %c1, %c1 : index, index, index) { + // Loop body acc.yield } attributes { collapse = [3] } ``` }]; let arguments = (ins + Variadic:$lowerbound, + Variadic:$upperbound, + Variadic:$step, + OptionalAttr:$inclusiveUpperbound, OptionalAttr:$collapse, OptionalAttr:$collapseDeviceType, Variadic:$gangOperands, @@ -1521,7 +1527,7 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", OptionalAttr:$privatizations, Variadic:$reductionOperands, OptionalAttr:$reductionRecipes - ); + ); let results = (outs Variadic:$results); @@ -1539,6 +1545,8 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", /// The i-th data operand passed. Value getDataOperand(unsigned i); + Block &getBody() { return getLoopRegions().front()->front(); } + /// Return true if the op has the auto attribute for the /// mlir::acc::DeviceType::None device_type. 
bool hasAuto(); @@ -1628,7 +1636,8 @@ def OpenACC_LoopOp : OpenACC_Op<"loop", `)` | `cache` `(` $cacheOperands `:` type($cacheOperands) `)` ) - $region + custom($region, $lowerbound, type($lowerbound), $upperbound, + type($upperbound), $step, type($step)) ( `(` type($results)^ `)` )? attr-dict-with-keyword }]; diff --git a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp index bdc9c345341b2..f6229e5192a0a 100644 --- a/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp +++ b/mlir/lib/Dialect/OpenACC/IR/OpenACC.cpp @@ -1008,17 +1008,12 @@ static ParseResult parseDeviceTypeOperandsWithKeywordOnly( llvm::SmallVector keywordOnlyDeviceTypeAttributes; bool needCommaBeforeOperands = false; - // Keyword only - if (failed(parser.parseOptionalLParen())) { - keywordOnlyDeviceTypeAttributes.push_back(mlir::acc::DeviceTypeAttr::get( - parser.getContext(), mlir::acc::DeviceType::None)); - keywordOnlyDeviceType = - ArrayAttr::get(parser.getContext(), keywordOnlyDeviceTypeAttributes); - return success(); - } + if (failed(parser.parseOptionalLParen())) + return failure(); // Parse keyword only attributes if (succeeded(parser.parseOptionalLSquare())) { + // Parse keyword only attributes if (failed(parser.parseCommaSeparatedList([&]() { if (parser.parseAttribute( keywordOnlyDeviceTypeAttributes.emplace_back())) @@ -1029,6 +1024,13 @@ static ParseResult parseDeviceTypeOperandsWithKeywordOnly( if (parser.parseRSquare()) return failure(); needCommaBeforeOperands = true; + } else if (succeeded(parser.parseOptionalRParen())) { + // Keyword only + keywordOnlyDeviceTypeAttributes.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + keywordOnlyDeviceType = + ArrayAttr::get(parser.getContext(), keywordOnlyDeviceTypeAttributes); + return success(); } if (needCommaBeforeOperands && failed(parser.parseComma())) @@ -1065,15 +1067,18 @@ static void printDeviceTypeOperandsWithKeywordOnly( mlir::TypeRange types, std::optional deviceTypes, std::optional keywordOnlyDeviceTypes) { + p << "("; + if (operands.begin() == operands.end() && keywordOnlyDeviceTypes && keywordOnlyDeviceTypes->size() == 1) { auto deviceTypeAttr = mlir::dyn_cast((*keywordOnlyDeviceTypes)[0]); - if (deviceTypeAttr.getValue() == mlir::acc::DeviceType::None) + if (deviceTypeAttr.getValue() == mlir::acc::DeviceType::None) { + p << ")"; return; + } } - p << "("; printDeviceTypes(p, keywordOnlyDeviceTypes); if (hasDeviceTypeValues(keywordOnlyDeviceTypes) && hasDeviceTypeValues(deviceTypes)) @@ -1323,17 +1328,12 @@ static ParseResult parseGangClause( bool needCommaBetweenValues = false; bool needCommaBeforeOperands = false; - // Gang only keyword - if (failed(parser.parseOptionalLParen())) { - gangOnlyDeviceTypeAttributes.push_back(mlir::acc::DeviceTypeAttr::get( - parser.getContext(), mlir::acc::DeviceType::None)); - gangOnlyDeviceType = - ArrayAttr::get(parser.getContext(), gangOnlyDeviceTypeAttributes); - return success(); - } + if (failed(parser.parseOptionalLParen())) + return failure(); // Parse gang only attributes if (succeeded(parser.parseOptionalLSquare())) { + // Parse gang only attributes if (failed(parser.parseCommaSeparatedList([&]() { if (parser.parseAttribute( gangOnlyDeviceTypeAttributes.emplace_back())) @@ -1344,6 +1344,13 @@ static ParseResult parseGangClause( if (parser.parseRSquare()) return failure(); needCommaBeforeOperands = true; + } else if (succeeded(parser.parseOptionalRParen())) { + // Gang only keyword + 
gangOnlyDeviceTypeAttributes.push_back(mlir::acc::DeviceTypeAttr::get( + parser.getContext(), mlir::acc::DeviceType::None)); + gangOnlyDeviceType = + ArrayAttr::get(parser.getContext(), gangOnlyDeviceTypeAttributes); + return success(); } auto argNum = mlir::acc::GangArgTypeAttr::get(parser.getContext(), @@ -1443,16 +1450,18 @@ void printGangClause(OpAsmPrinter &p, Operation *op, std::optional segments, std::optional gangOnlyDeviceTypes) { + p << "("; if (operands.begin() == operands.end() && hasDeviceTypeValues(gangOnlyDeviceTypes) && gangOnlyDeviceTypes->size() == 1) { auto deviceTypeAttr = mlir::dyn_cast((*gangOnlyDeviceTypes)[0]); - if (deviceTypeAttr.getValue() == mlir::acc::DeviceType::None) + if (deviceTypeAttr.getValue() == mlir::acc::DeviceType::None) { + p << ")"; return; + } } - p << "("; printDeviceTypes(p, gangOnlyDeviceTypes); if (hasDeviceTypeValues(gangOnlyDeviceTypes) && @@ -1516,6 +1525,11 @@ LogicalResult checkDeviceTypes(mlir::ArrayAttr deviceTypes) { } LogicalResult acc::LoopOp::verify() { + if (!getUpperbound().empty() && getInclusiveUpperbound() && + (getUpperbound().size() != getInclusiveUpperbound()->size())) + return emitError() << "inclusiveUpperbound size is expected to be the same" + << " as upperbound size"; + // Check collapse if (getCollapseAttr() && !getCollapseDeviceTypeAttr()) return emitOpError() << "collapse device_type attr must be define when" @@ -1629,7 +1643,9 @@ unsigned LoopOp::getNumDataOperands() { } Value LoopOp::getDataOperand(unsigned i) { - unsigned numOptional = getGangOperands().size(); + unsigned numOptional = + getLowerbound().size() + getUpperbound().size() + getStep().size(); + numOptional += getGangOperands().size(); numOptional += getVectorOperands().size(); numOptional += getWorkerNumOperands().size(); numOptional += getTileOperands().size(); @@ -1748,6 +1764,58 @@ bool LoopOp::hasGang(mlir::acc::DeviceType deviceType) { return hasDeviceType(getGang(), deviceType); } +llvm::SmallVector acc::LoopOp::getLoopRegions() { + return {&getRegion()}; +} + +/// loop-control ::= `(` ssa-id-and-type-list `)` `=` `(` ssa-id-and-type-list +/// `)` `to` `(` ssa-id-and-type-list `)` `step` `(` ssa-id-and-type-list `)` +ParseResult +parseLoopControl(OpAsmParser &parser, Region ®ion, + SmallVectorImpl &lowerbound, + SmallVectorImpl &lowerboundType, + SmallVectorImpl &upperbound, + SmallVectorImpl &upperboundType, + SmallVectorImpl &step, + SmallVectorImpl &stepType) { + + SmallVector inductionVars; + if (succeeded(parser.parseOptionalLParen())) { + if (parser.parseArgumentList(inductionVars, OpAsmParser::Delimiter::None, + /*allowType=*/true) || + parser.parseRParen() || parser.parseEqual() || parser.parseLParen() || + parser.parseOperandList(lowerbound, inductionVars.size(), + OpAsmParser::Delimiter::None) || + parser.parseColonTypeList(lowerboundType) || parser.parseRParen() || + parser.parseKeyword("to") || parser.parseLParen() || + parser.parseOperandList(upperbound, inductionVars.size(), + OpAsmParser::Delimiter::None) || + parser.parseColonTypeList(upperboundType) || parser.parseRParen() || + parser.parseKeyword("step") || parser.parseLParen() || + parser.parseOperandList(step, inductionVars.size(), + OpAsmParser::Delimiter::None) || + parser.parseColonTypeList(stepType) || parser.parseRParen()) + return failure(); + } + return parser.parseRegion(region, inductionVars); +} + +void printLoopControl(OpAsmPrinter &p, Operation *op, Region ®ion, + ValueRange lowerbound, TypeRange lowerboundType, + ValueRange upperbound, TypeRange upperboundType, + 
ValueRange steps, TypeRange stepType) { + ValueRange regionArgs = region.front().getArguments(); + if (!regionArgs.empty()) { + p << "("; + llvm::interleaveComma(regionArgs, p, + [&p](Value v) { p << v << " : " << v.getType(); }); + p << ") = (" << lowerbound << " : " << lowerboundType << ") to (" + << upperbound << " : " << upperboundType << ") " + << " step (" << steps << " : " << stepType << ") "; + } + p.printRegion(region, /*printEntryBlockArgs=*/false); +} + //===----------------------------------------------------------------------===// // DataOp //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/OpenACC/canonicalize.mlir b/mlir/test/Dialect/OpenACC/canonicalize.mlir index 6173ab6699c6c..4522ffb252a62 100644 --- a/mlir/test/Dialect/OpenACC/canonicalize.mlir +++ b/mlir/test/Dialect/OpenACC/canonicalize.mlir @@ -110,14 +110,16 @@ func.func @testupdateop(%a: memref, %ifCond: i1) -> () { func.func @testhostdataop(%a: memref, %ifCond: i1) -> () { %0 = acc.use_device varPtr(%a : memref) -> memref + %1 = arith.constant 1 : i32 + %2 = arith.constant 10 : i32 %false = arith.constant false acc.host_data dataOperands(%0 : memref) if(%false) { - acc.loop { + acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { acc.yield - } - acc.loop { + } attributes { inclusiveUpperbound = array } + acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { acc.yield - } + } attributes { inclusiveUpperbound = array } acc.terminator } return diff --git a/mlir/test/Dialect/OpenACC/invalid.mlir b/mlir/test/Dialect/OpenACC/invalid.mlir index 5dcdb3a37e4e3..57ae5856149d1 100644 --- a/mlir/test/Dialect/OpenACC/invalid.mlir +++ b/mlir/test/Dialect/OpenACC/invalid.mlir @@ -1,53 +1,67 @@ // RUN: mlir-opt -split-input-file -verify-diagnostics %s +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop { +acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield } attributes {seq = [#acc.device_type], gang = [#acc.device_type]} // ----- +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop { +acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield } attributes {seq = [#acc.device_type], worker = [#acc.device_type]} // ----- +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop { +acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield } attributes {seq = [#acc.device_type], vector = [#acc.device_type]} // ----- +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop { +acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield } attributes {seq = [#acc.device_type], worker = [#acc.device_type], gang = [#acc.device_type]} // ----- +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop { +acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield } attributes {seq = [#acc.device_type], vector = 
[#acc.device_type], gang = [#acc.device_type]} // ----- +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} -acc.loop { +acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield } attributes {seq = [#acc.device_type], vector = [#acc.device_type], worker = [#acc.device_type]} // ----- +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 // expected-error@+1 {{gang, worker or vector cannot appear with the seq attr}} acc.loop { "test.openacc_dummy_op"() : () -> () @@ -83,10 +97,12 @@ acc.loop { // ----- +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 // expected-error@+1 {{only one of "auto", "independent", "seq" can be present at the same time}} -acc.loop { +acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { acc.yield -} attributes {auto_ = [#acc.device_type], seq = [#acc.device_type]} +} attributes {auto_ = [#acc.device_type], seq = [#acc.device_type], inclusiveUpperbound = array} // ----- @@ -154,11 +170,13 @@ acc.parallel { // ----- -acc.loop { +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 +acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32){ // expected-error@+1 {{'acc.init' op cannot be nested in a compute operation}} acc.init acc.yield -} +} attributes {inclusiveUpperbound = array} // ----- @@ -170,21 +188,25 @@ acc.parallel { // ----- -acc.loop { +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 +acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { // expected-error@+1 {{'acc.shutdown' op cannot be nested in a compute operation}} acc.shutdown acc.yield -} +} attributes {inclusiveUpperbound = array} // ----- -acc.loop { +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 +acc.loop (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() ({ // expected-error@+1 {{'acc.shutdown' op cannot be nested in a compute operation}} acc.shutdown }) : () -> () acc.yield -} +} attributes {inclusiveUpperbound = array} // ----- @@ -388,8 +410,10 @@ acc.firstprivate.recipe @privatization_i32 : i32 init { // ----- +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 // expected-error@+1 {{expected ')'}} -acc.loop gang({static=%i64Value: i64, num=%i64Value: i64} { +acc.loop gang({static=%i64Value: i64, num=%i64Value: i64} (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield } @@ -457,8 +481,10 @@ acc.reduction.recipe @reduction_i64 : i64 reduction_operator init { // ----- +%1 = arith.constant 1 : i32 +%2 = arith.constant 10 : i32 // expected-error@+1 {{new value expected after comma}} -acc.loop gang({static=%i64Value: i64, ) { +acc.loop gang({static=%i64Value: i64, ) (%iv : i32) = (%1 : i32) to (%2 : i32) step (%1 : i32) { "test.openacc_dummy_op"() : () -> () acc.yield } diff --git a/mlir/test/Dialect/OpenACC/ops.mlir b/mlir/test/Dialect/OpenACC/ops.mlir index bda31a19cf5cd..d4c884a837f87 100644 --- a/mlir/test/Dialect/OpenACC/ops.mlir +++ b/mlir/test/Dialect/OpenACC/ops.mlir @@ -11,52 +11,40 @@ func.func @compute1(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x %async = arith.constant 1 : i64 acc.parallel async(%async: i64) { - acc.loop gang vector { - scf.for %arg3 = %c0 to %c10 step %c1 { - scf.for %arg4 = %c0 to %c10 step %c1 { - scf.for %arg5 = %c0 to %c10 step %c1 { - %a = memref.load %A[%arg3, %arg5] : memref<10x10xf32> - %b = memref.load %B[%arg5, %arg4] : memref<10x10xf32> - 
%cij = memref.load %C[%arg3, %arg4] : memref<10x10xf32> - %p = arith.mulf %a, %b : f32 - %co = arith.addf %cij, %p : f32 - memref.store %co, %C[%arg3, %arg4] : memref<10x10xf32> - } - } - } + acc.loop gang() vector() (%arg3 : index, %arg4 : index, %arg5 : index) = (%c0, %c0, %c0 : index, index, index) to (%c10, %c10, %c10 : index, index, index) step (%c1, %c1, %c1 : index, index, index) { + %a = memref.load %A[%arg3, %arg5] : memref<10x10xf32> + %b = memref.load %B[%arg5, %arg4] : memref<10x10xf32> + %cij = memref.load %C[%arg3, %arg4] : memref<10x10xf32> + %p = arith.mulf %a, %b : f32 + %co = arith.addf %cij, %p : f32 + memref.store %co, %C[%arg3, %arg4] : memref<10x10xf32> acc.yield - } attributes { collapse = [3], collapseDeviceType = [#acc.device_type]} + } attributes { collapse = [3], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} acc.yield } return %C : memref<10x10xf32> } -// CHECK-LABEL: func @compute1( +// CHECK-LABEL: func @compute1 // CHECK-NEXT: %{{.*}} = arith.constant 0 : index // CHECK-NEXT: %{{.*}} = arith.constant 10 : index // CHECK-NEXT: %{{.*}} = arith.constant 1 : index // CHECK-NEXT: [[ASYNC:%.*]] = arith.constant 1 : i64 // CHECK-NEXT: acc.parallel async([[ASYNC]] : i64) { -// CHECK-NEXT: acc.loop gang vector { -// CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { -// CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { -// CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { -// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> -// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> -// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> -// CHECK-NEXT: %{{.*}} = arith.mulf %{{.*}}, %{{.*}} : f32 -// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 -// CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> -// CHECK-NEXT: } -// CHECK-NEXT: } -// CHECK-NEXT: } +// CHECK-NEXT: acc.loop gang() vector() (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { +// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> +// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> +// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> +// CHECK-NEXT: %{{.*}} = arith.mulf %{{.*}}, %{{.*}} : f32 +// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 +// CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type]} +// CHECK-NEXT: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: return %{{.*}} : memref<10x10xf32> -// CHECK-NEXT: } + // ----- @@ -66,21 +54,19 @@ func.func @compute2(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x %c1 = arith.constant 1 : index acc.parallel { - acc.loop { - scf.for %arg3 = %c0 to %c10 step %c1 { - scf.for %arg4 = %c0 to %c10 step %c1 { - scf.for %arg5 = %c0 to %c10 step %c1 { - %a = memref.load %A[%arg3, %arg5] : memref<10x10xf32> - %b = memref.load %B[%arg5, %arg4] : memref<10x10xf32> - %cij = memref.load %C[%arg3, %arg4] : memref<10x10xf32> - %p = arith.mulf %a, %b : f32 - %co = arith.addf %cij, %p : f32 - memref.store %co, %C[%arg3, %arg4] : memref<10x10xf32> - } + acc.loop (%arg3 : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { + 
scf.for %arg4 = %c0 to %c10 step %c1 { + scf.for %arg5 = %c0 to %c10 step %c1 { + %a = memref.load %A[%arg3, %arg5] : memref<10x10xf32> + %b = memref.load %B[%arg5, %arg4] : memref<10x10xf32> + %cij = memref.load %C[%arg3, %arg4] : memref<10x10xf32> + %p = arith.mulf %a, %b : f32 + %co = arith.addf %cij, %p : f32 + memref.store %co, %C[%arg3, %arg4] : memref<10x10xf32> } } acc.yield - } attributes {seq = [#acc.device_type]} + } attributes {seq = [#acc.device_type], inclusiveUpperbound = array} acc.yield } @@ -92,21 +78,19 @@ func.func @compute2(%A: memref<10x10xf32>, %B: memref<10x10xf32>, %C: memref<10x // CHECK-NEXT: %{{.*}} = arith.constant 10 : index // CHECK-NEXT: %{{.*}} = arith.constant 1 : index // CHECK-NEXT: acc.parallel { -// CHECK-NEXT: acc.loop { +// CHECK-NEXT: acc.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) // CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { // CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { -// CHECK-NEXT: scf.for %{{.*}} = %{{.*}} to %{{.*}} step %{{.*}} { -// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> -// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> -// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> -// CHECK-NEXT: %{{.*}} = arith.mulf %{{.*}}, %{{.*}} : f32 -// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 -// CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> -// CHECK-NEXT: } +// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> +// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> +// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> +// CHECK-NEXT: %{{.*}} = arith.mulf %{{.*}}, %{{.*}} : f32 +// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 +// CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> // CHECK-NEXT: } // CHECK-NEXT: } // CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {seq = [#acc.device_type]} +// CHECK-NEXT: } attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: return %{{.*}} : memref<10x10xf32> @@ -138,32 +122,26 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x acc.data dataOperands(%pa, %pb, %pc, %pd: memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) { %private = acc.private varPtr(%c : memref<10xf32>) -> memref<10xf32> acc.parallel num_gangs({%numGangs: i64}) num_workers(%numWorkers: i64 [#acc.device_type]) private(@privatization_memref_10_f32 -> %private : memref<10xf32>) { - acc.loop gang { - scf.for %x = %lb to %c10 step %st { - acc.loop worker { - scf.for %y = %lb to %c10 step %st { - %axy = memref.load %a[%x, %y] : memref<10x10xf32> - %bxy = memref.load %b[%x, %y] : memref<10x10xf32> - %tmp = arith.addf %axy, %bxy : f32 - memref.store %tmp, %c[%y] : memref<10xf32> - } - acc.yield - } - - acc.loop { - // for i = 0 to 10 step 1 - // d[x] += c[i] - scf.for %i = %lb to %c10 step %st { - %ci = memref.load %c[%i] : memref<10xf32> - %dx = memref.load %d[%x] : memref<10xf32> - %z = arith.addf %ci, %dx : f32 - memref.store %z, %d[%x] : memref<10xf32> - } - acc.yield - } attributes {seq = [#acc.device_type]} - } + acc.loop gang() (%x : index) = (%lb : index) to (%c10 : index) step (%st : index) { + acc.loop worker() (%y : index) = (%lb : index) to (%c10 : index) step (%st : index) { + %axy = memref.load 
%a[%x, %y] : memref<10x10xf32> + %bxy = memref.load %b[%x, %y] : memref<10x10xf32> + %tmp = arith.addf %axy, %bxy : f32 + memref.store %tmp, %c[%y] : memref<10xf32> + acc.yield + } attributes {inclusiveUpperbound = array} + + acc.loop (%i : index) = (%lb : index) to (%c10 : index) step (%st : index) { + // for i = 0 to 10 step 1 + // d[x] += c[i] + %ci = memref.load %c[%i] : memref<10xf32> + %dx = memref.load %d[%x] : memref<10xf32> + %z = arith.addf %ci, %dx : f32 + memref.store %z, %d[%x] : memref<10xf32> + acc.yield + } attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} acc.yield - } + } attributes {inclusiveUpperbound = array} acc.yield } acc.terminator @@ -181,29 +159,23 @@ func.func @compute3(%a: memref<10x10xf32>, %b: memref<10x10xf32>, %c: memref<10x // CHECK: acc.data dataOperands(%{{.*}}, %{{.*}}, %{{.*}}, %{{.*}} : memref<10x10xf32>, memref<10x10xf32>, memref<10xf32>, memref<10xf32>) { // CHECK-NEXT: %[[P_ARG2:.*]] = acc.private varPtr([[ARG2]] : memref<10xf32>) -> memref<10xf32> // CHECK-NEXT: acc.parallel num_gangs({[[NUMGANG]] : i64}) num_workers([[NUMWORKERS]] : i64 [#acc.device_type]) private(@privatization_memref_10_f32 -> %[[P_ARG2]] : memref<10xf32>) { -// CHECK-NEXT: acc.loop gang { -// CHECK-NEXT: scf.for %{{.*}} = [[C0]] to [[C10]] step [[C1]] { -// CHECK-NEXT: acc.loop worker { -// CHECK-NEXT: scf.for %{{.*}} = [[C0]] to [[C10]] step [[C1]] { -// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> -// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> -// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 -// CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> -// CHECK-NEXT: } -// CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK-NEXT: acc.loop { -// CHECK-NEXT: scf.for %{{.*}} = [[C0]] to [[C10]] step [[C1]] { -// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32> -// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32> -// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 -// CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> -// CHECK-NEXT: } -// CHECK-NEXT: acc.yield -// CHECK-NEXT: } attributes {seq = [#acc.device_type]} -// CHECK-NEXT: } +// CHECK-NEXT: acc.loop gang() (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { +// CHECK-NEXT: acc.loop worker() (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { +// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> +// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}, %{{.*}}] : memref<10x10xf32> +// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 +// CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } attributes {inclusiveUpperbound = array} +// CHECK-NEXT: acc.loop (%{{.*}}) = (%{{.*}}) to (%{{.*}}) step (%{{.*}}) { +// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32> +// CHECK-NEXT: %{{.*}} = memref.load %{{.*}}[%{{.*}}] : memref<10xf32> +// CHECK-NEXT: %{{.*}} = arith.addf %{{.*}}, %{{.*}} : f32 +// CHECK-NEXT: memref.store %{{.*}}, %{{.*}}[%{{.*}}] : memref<10xf32> +// CHECK-NEXT: acc.yield +// CHECK-NEXT: } attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} // CHECK-NEXT: acc.yield -// CHECK-NEXT: } +// CHECK-NEXT: } attributes {inclusiveUpperbound = array} // CHECK-NEXT: acc.yield // CHECK-NEXT: } // CHECK-NEXT: acc.terminator @@ -217,171 +189,161 @@ func.func @testloopop(%a : memref<10xf32>) -> () { %i64Value = arith.constant 1 : 
i64 %i32Value = arith.constant 128 : i32 %idxValue = arith.constant 8 : index + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index - acc.loop gang vector worker { + acc.loop gang() vector() worker() (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop gang({num=%i64Value: i64}) { + } attributes {inclusiveUpperbound = array} + acc.loop gang({num=%i64Value: i64}) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop gang({static=%i64Value: i64}) { + } attributes {inclusiveUpperbound = array} + acc.loop gang({static=%i64Value: i64}) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop worker(%i64Value: i64) { + } attributes {inclusiveUpperbound = array} + acc.loop worker(%i64Value: i64) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop worker(%i32Value: i32) { + } attributes {inclusiveUpperbound = array} + acc.loop worker(%i32Value: i32) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop worker(%idxValue: index) { + } attributes {inclusiveUpperbound = array} + acc.loop worker(%idxValue: index) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop vector(%i64Value: i64) { + } attributes {inclusiveUpperbound = array} + acc.loop vector(%i64Value: i64) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop vector(%i32Value: i32) { + } attributes {inclusiveUpperbound = array} + acc.loop vector(%i32Value: i32) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop vector(%idxValue: index) { + } attributes {inclusiveUpperbound = array} + acc.loop vector(%idxValue: index) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop gang({num=%i64Value: i64}) worker vector { + } attributes {inclusiveUpperbound = array} + acc.loop gang({num=%i64Value: i64}) worker() vector() (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop gang({num=%i64Value: i64, static=%i64Value: i64}) worker(%i64Value: i64) vector(%i64Value: i64) { + } attributes {inclusiveUpperbound = array} + acc.loop gang({num=%i64Value: i64, static=%i64Value: i64}) worker(%i64Value: i64) vector(%i64Value: i64) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop gang({num=%i32Value: i32, static=%idxValue: index}) { + } attributes {inclusiveUpperbound = array} + acc.loop gang({num=%i32Value: i32, static=%idxValue: index}) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop tile({%i64Value : i64, %i64Value : i64}) { + } attributes {inclusiveUpperbound = array} + acc.loop tile({%i64Value : i64, %i64Value : i64}) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop tile({%i32Value : i32, %i32Value : i32}) { + } 
attributes {inclusiveUpperbound = array} + acc.loop tile({%i32Value : i32, %i32Value : i32}) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop gang({static=%i64Value: i64, num=%i64Value: i64}) { + } attributes {inclusiveUpperbound = array} + acc.loop gang({static=%i64Value: i64, num=%i64Value: i64}) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } - acc.loop gang({dim=%i64Value : i64, static=%i64Value: i64}) { + } attributes {inclusiveUpperbound = array} + acc.loop gang({dim=%i64Value : i64, static=%i64Value: i64}) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } + } attributes {inclusiveUpperbound = array} %b = acc.cache varPtr(%a : memref<10xf32>) -> memref<10xf32> - acc.loop cache(%b : memref<10xf32>) { + acc.loop cache(%b : memref<10xf32>) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { "test.openacc_dummy_op"() : () -> () acc.yield - } + } attributes {inclusiveUpperbound = array} return } // CHECK: [[I64VALUE:%.*]] = arith.constant 1 : i64 // CHECK-NEXT: [[I32VALUE:%.*]] = arith.constant 128 : i32 // CHECK-NEXT: [[IDXVALUE:%.*]] = arith.constant 8 : index -// CHECK: acc.loop gang worker vector { +// CHECK: acc.loop // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop gang({num=[[I64VALUE]] : i64}) { +// CHECK-NEXT: attributes {inclusiveUpperbound = array} +// CHECK: acc.loop gang({num=[[I64VALUE]] : i64}) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop gang({static=[[I64VALUE]] : i64}) { +// CHECK: acc.loop gang({static=[[I64VALUE]] : i64}) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop worker([[I64VALUE]] : i64) { +// CHECK: acc.loop worker([[I64VALUE]] : i64) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop worker([[I32VALUE]] : i32) { +// CHECK: acc.loop worker([[I32VALUE]] : i32) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop worker([[IDXVALUE]] : index) { +// CHECK: acc.loop worker([[IDXVALUE]] : index) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop vector([[I64VALUE]] : i64) { +// CHECK: acc.loop vector([[I64VALUE]] : i64) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop vector([[I32VALUE]] : i32) { +// CHECK: acc.loop vector([[I32VALUE]] : i32) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop vector([[IDXVALUE]] : index) { +// CHECK: acc.loop vector([[IDXVALUE]] : index) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop gang({num=[[I64VALUE]] : i64}) worker vector { +// CHECK: acc.loop gang({num=[[I64VALUE]] : i64}) worker() vector() // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop gang({num=[[I64VALUE]] : i64, static=[[I64VALUE]] : i64}) worker([[I64VALUE]] : i64) vector([[I64VALUE]] : i64) { +// CHECK: acc.loop gang({num=[[I64VALUE]] : i64, static=[[I64VALUE]] : i64}) worker([[I64VALUE]] 
: i64) vector([[I64VALUE]] : i64) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop gang({num=[[I32VALUE]] : i32, static=[[IDXVALUE]] : index}) { +// CHECK: acc.loop gang({num=[[I32VALUE]] : i32, static=[[IDXVALUE]] : index}) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop tile({[[I64VALUE]] : i64, [[I64VALUE]] : i64}) { +// CHECK: acc.loop tile({[[I64VALUE]] : i64, [[I64VALUE]] : i64}) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop tile({[[I32VALUE]] : i32, [[I32VALUE]] : i32}) { +// CHECK: acc.loop tile({[[I32VALUE]] : i32, [[I32VALUE]] : i32}) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop gang({static=[[I64VALUE]] : i64, num=[[I64VALUE]] : i64}) { +// CHECK: acc.loop gang({static=[[I64VALUE]] : i64, num=[[I64VALUE]] : i64}) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } -// CHECK: acc.loop gang({dim=[[I64VALUE]] : i64, static=[[I64VALUE]] : i64}) { +// CHECK: acc.loop gang({dim=[[I64VALUE]] : i64, static=[[I64VALUE]] : i64}) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } // CHECK: %{{.*}} = acc.cache varPtr(%{{.*}} : memref<10xf32>) -> memref<10xf32> -// CHECK-NEXT: acc.loop cache(%{{.*}} : memref<10xf32>) { +// CHECK-NEXT: acc.loop cache(%{{.*}} : memref<10xf32>) // CHECK-NEXT: "test.openacc_dummy_op"() : () -> () // CHECK-NEXT: acc.yield -// CHECK-NEXT: } // ----- func.func @acc_loop_multiple_block() { + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index acc.parallel { - acc.loop { - %c1 = arith.constant 1 : index - cf.br ^bb1(%c1 : index) + acc.loop (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { + %c1_1 = arith.constant 1 : index + cf.br ^bb1(%c1_1 : index) ^bb1(%9: index): - %c0 = arith.constant 0 : index - %12 = arith.cmpi sgt, %9, %c0 : index + %c0_1 = arith.constant 0 : index + %12 = arith.cmpi sgt, %9, %c0_1 : index cf.cond_br %12, ^bb2, ^bb3 ^bb2: %c1_0 = arith.constant 1 : index - %c10 = arith.constant 10 : index - %22 = arith.subi %c10, %c1_0 : index + %c10_1 = arith.constant 10 : index + %22 = arith.subi %c10_1, %c1_0 : index cf.br ^bb1(%22 : index) ^bb3: acc.yield - } + } attributes {inclusiveUpperbound = array} acc.yield } return @@ -1509,10 +1471,13 @@ acc.reduction.recipe @reduction_add_i64 : i64 reduction_operator init { // CHECK: } func.func @acc_reduc_test(%a : i64) -> () { + %c0 = arith.constant 0 : index + %c10 = arith.constant 10 : index + %c1 = arith.constant 1 : index acc.parallel reduction(@reduction_add_i64 -> %a : i64) { - acc.loop reduction(@reduction_add_i64 -> %a : i64) { + acc.loop reduction(@reduction_add_i64 -> %a : i64) (%iv : index) = (%c0 : index) to (%c10 : index) step (%c1 : index) { acc.yield - } + } attributes { inclusiveUpperbound = array } acc.yield } return From 5062a178bf9dd46008b8f7a182facb6152c46889 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Mon, 22 Jan 2024 10:31:37 -0800 Subject: [PATCH 449/843] [flang][openacc] Lower loop directive to the new acc.loop op design (#65417) acc.loop was redesigned in https://reviews.llvm.org/D159229. 
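For reference, the redesigned form carries the loop control on the op itself;
a minimal sketch in the style of the MLIR tests above (the bounds, step, and
dummy body here are illustrative only, not taken from this patch):

  %lb = arith.constant 0 : index
  %ub = arith.constant 10 : index
  %step = arith.constant 1 : index
  // The induction variable is now a block argument of the loop region.
  acc.loop (%iv : index) = (%lb : index) to (%ub : index) step (%step : index) {
    "test.openacc_dummy_op"() : () -> ()
    acc.yield
  } attributes {inclusiveUpperbound = array<i1: true>}
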
This patch updates the lowering to match the new op. The DO CONCURRENT
construct will be added in a follow-up patch.

Note that the pre-commit CI will fail until D159229 is merged.

Depends on #67355
---
 flang/include/flang/Lower/OpenACC.h           |   5 +
 flang/lib/Lower/Bridge.cpp                    |  36 ++-
 flang/lib/Lower/OpenACC.cpp                   | 189 ++++++++++++----
 flang/test/Lower/OpenACC/acc-kernels-loop.f90 | 162 +++++--------
 flang/test/Lower/OpenACC/acc-loop-exit.f90    |  10 +-
 flang/test/Lower/OpenACC/acc-loop.f90         | 213 ++++++++----------
 .../test/Lower/OpenACC/acc-parallel-loop.f90  | 172 ++++++--------
 flang/test/Lower/OpenACC/acc-private.f90      |  14 +-
 flang/test/Lower/OpenACC/acc-reduction.f90    |  42 ++--
 flang/test/Lower/OpenACC/acc-serial-loop.f90  | 150 +++++-------
 flang/test/Lower/OpenACC/locations.f90        |  32 ++-
 11 files changed, 511 insertions(+), 514 deletions(-)

diff --git a/flang/include/flang/Lower/OpenACC.h b/flang/include/flang/Lower/OpenACC.h
index f23e4726f33e0..fbf61e7184ae2 100644
--- a/flang/include/flang/Lower/OpenACC.h
+++ b/flang/include/flang/Lower/OpenACC.h
@@ -35,6 +35,7 @@ class FirOpBuilder;
 namespace Fortran {
 namespace parser {
+struct AccClauseList;
 struct OpenACCConstruct;
 struct OpenACCDeclarativeConstruct;
 struct OpenACCRoutineConstruct;
@@ -64,6 +65,8 @@ static constexpr llvm::StringRef declarePreDeallocSuffix =
 static constexpr llvm::StringRef declarePostDeallocSuffix =
     "_acc_declare_update_desc_post_dealloc";

+static constexpr llvm::StringRef privatizationRecipePrefix = "privatization";
+
 mlir::Value genOpenACCConstruct(AbstractConverter &,
                                 Fortran::semantics::SemanticsContext &,
                                 pft::Evaluation &,
@@ -113,6 +116,8 @@ void attachDeclarePostDeallocAction(AbstractConverter &, fir::FirOpBuilder &,
 void genOpenACCTerminator(fir::FirOpBuilder &, mlir::Operation *,
                           mlir::Location);

+int64_t getCollapseValue(const Fortran::parser::AccClauseList &);
+
 bool isInOpenACCLoop(fir::FirOpBuilder &);

 void setInsertionPointAfterOpenACCLoopIfInside(fir::FirOpBuilder &);
diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp
index 8006b9b426f4d..b3aeb99fc5d57 100644
--- a/flang/lib/Lower/Bridge.cpp
+++ b/flang/lib/Lower/Bridge.cpp
@@ -2391,13 +2391,43 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     localSymbols.pushScope();
     mlir::Value exitCond = genOpenACCConstruct(
         *this, bridge.getSemanticsContext(), getEval(), acc);
-    for (Fortran::lower::pft::Evaluation &e : getEval().getNestedEvaluations())
+
+    const Fortran::parser::OpenACCLoopConstruct *accLoop =
+        std::get_if<Fortran::parser::OpenACCLoopConstruct>(&acc.u);
+    const Fortran::parser::OpenACCCombinedConstruct *accCombined =
+        std::get_if<Fortran::parser::OpenACCCombinedConstruct>(&acc.u);
+
+    Fortran::lower::pft::Evaluation *curEval = &getEval();
+
+    if (accLoop || accCombined) {
+      int64_t collapseValue;
+      if (accLoop) {
+        const Fortran::parser::AccBeginLoopDirective &beginLoopDir =
+            std::get<Fortran::parser::AccBeginLoopDirective>(accLoop->t);
+        const Fortran::parser::AccClauseList &clauseList =
+            std::get<Fortran::parser::AccClauseList>(beginLoopDir.t);
+        collapseValue = Fortran::lower::getCollapseValue(clauseList);
+      } else if (accCombined) {
+        const Fortran::parser::AccBeginCombinedDirective &beginCombinedDir =
+            std::get<Fortran::parser::AccBeginCombinedDirective>(
+                accCombined->t);
+        const Fortran::parser::AccClauseList &clauseList =
+            std::get<Fortran::parser::AccClauseList>(beginCombinedDir.t);
+        collapseValue = Fortran::lower::getCollapseValue(clauseList);
+      }
+
+      if (curEval->lowerAsStructured()) {
+        curEval = &curEval->getFirstNestedEvaluation();
+        for (int64_t i = 1; i < collapseValue; i++)
+          curEval = &*std::next(curEval->getNestedEvaluations().begin());
+      }
+    }
+
+    for (Fortran::lower::pft::Evaluation &e :
+         curEval->getNestedEvaluations())
       genFIR(e);
     localSymbols.popScope();
     builder->restoreInsertionPoint(insertPt);

-    const Fortran::parser::OpenACCLoopConstruct *accLoop =
-        std::get_if<Fortran::parser::OpenACCLoopConstruct>(&acc.u);
     if (accLoop && exitCond) {
       Fortran::lower::pft::FunctionLikeUnit *funit =
           getEval().getOwningProcedure();
diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp
index 682ca06cabd6f..474d4e686c135 100644
--- a/flang/lib/Lower/OpenACC.cpp
+++ b/flang/lib/Lower/OpenACC.cpp
@@ -787,7 +787,8 @@ genPrivatizations(const Fortran::parser::AccObjectList &objectList,
     mlir::Type retTy = getTypeFromBounds(bounds, info.addr.getType());
     if constexpr (std::is_same_v<RecipeOp, mlir::acc::PrivateRecipeOp>) {
       std::string recipeName =
-          fir::getTypeAsString(retTy, converter.getKindMap(), "privatization");
+          fir::getTypeAsString(retTy, converter.getKindMap(),
+                               Fortran::lower::privatizationRecipePrefix);
       recipe = Fortran::lower::createOrGetPrivateRecipe(builder, recipeName,
                                                         operandLocation, retTy);
       auto op = createDataEntryOp<EntryOp>(
@@ -1412,15 +1413,17 @@ static void addOperand(llvm::SmallVectorImpl<mlir::Value> &operands,
 }

 template <typename Op, typename Terminator>
-static Op createRegionOp(fir::FirOpBuilder &builder, mlir::Location loc,
-                         Fortran::lower::pft::Evaluation &eval,
-                         const llvm::SmallVectorImpl<mlir::Value> &operands,
-                         const llvm::SmallVectorImpl<int32_t> &operandSegments,
-                         bool outerCombined = false,
-                         llvm::SmallVector<mlir::Type> retTy = {},
-                         mlir::Value yieldValue = {}) {
+static Op
+createRegionOp(fir::FirOpBuilder &builder, mlir::Location loc,
+               mlir::Location returnLoc, Fortran::lower::pft::Evaluation &eval,
+               const llvm::SmallVectorImpl<mlir::Value> &operands,
+               const llvm::SmallVectorImpl<int32_t> &operandSegments,
+               bool outerCombined = false,
+               llvm::SmallVector<mlir::Type> retTy = {},
+               mlir::Value yieldValue = {}, mlir::TypeRange argsTy = {},
+               llvm::SmallVector<mlir::Location> locs = {}) {
   Op op = builder.create<Op>(loc, retTy, operands);
-  builder.createBlock(&op.getRegion());
+  builder.createBlock(&op.getRegion(), op.getRegion().end(), argsTy, locs);
   mlir::Block &block = op.getRegion().back();
   builder.setInsertionPointToStart(&block);

@@ -1439,13 +1442,13 @@ static Op createRegionOp(fir::FirOpBuilder &builder, mlir::Location loc,

   if (yieldValue) {
     if constexpr (std::is_same_v<Terminator, mlir::acc::YieldOp>) {
-      Terminator yieldOp = builder.create<Terminator>(loc, yieldValue);
+      Terminator yieldOp = builder.create<Terminator>(returnLoc, yieldValue);
       yieldValue.getDefiningOp()->moveBefore(yieldOp);
     } else {
-      builder.create<Terminator>(loc);
+      builder.create<Terminator>(returnLoc);
     }
   } else {
-    builder.create<Terminator>(loc);
+    builder.create<Terminator>(returnLoc);
   }
   builder.setInsertionPointToStart(&block);
   return op;
@@ -1595,18 +1598,28 @@ genWaitClause(Fortran::lower::AbstractConverter &converter,
   }
 }

+mlir::Type getTypeFromIvTypeSize(fir::FirOpBuilder &builder,
+                                 const Fortran::semantics::Symbol &ivSym) {
+  std::size_t ivTypeSize = ivSym.size();
+  if (ivTypeSize == 0)
+    llvm::report_fatal_error("unexpected induction variable size");
+  // ivTypeSize is in bytes and IntegerType needs to be in bits.
+  return builder.getIntegerType(ivTypeSize * 8);
+}
+
 static mlir::acc::LoopOp
 createLoopOp(Fortran::lower::AbstractConverter &converter,
              mlir::Location currentLocation,
-             Fortran::lower::pft::Evaluation &eval,
              Fortran::semantics::SemanticsContext &semanticsContext,
              Fortran::lower::StatementContext &stmtCtx,
+             const Fortran::parser::DoConstruct &outerDoConstruct,
+             Fortran::lower::pft::Evaluation &eval,
              const Fortran::parser::AccClauseList &accClauseList,
              bool needEarlyReturnHandling = false) {
   fir::FirOpBuilder &builder = converter.getFirOpBuilder();

-  llvm::SmallVector<mlir::Value> tileOperands, privateOperands,
+  llvm::SmallVector<mlir::Value> tileOperands, privateOperands, ivPrivate,
       reductionOperands, cacheOperands, vectorOperands, workerNumOperands,
-      gangOperands;
+      gangOperands, lowerbounds, upperbounds, steps;
   llvm::SmallVector<mlir::Attribute> privatizations, reductionRecipes;
   llvm::SmallVector<int32_t> tileOperandsSegments, gangOperandsSegments;
   llvm::SmallVector<int64_t> collapseValues;
@@ -1623,6 +1636,74 @@ createLoopOp(Fortran::lower::AbstractConverter &converter,
   crtDeviceTypes.push_back(mlir::acc::DeviceTypeAttr::get(
       builder.getContext(), mlir::acc::DeviceType::None));

+  llvm::SmallVector<mlir::Type> ivTypes;
+  llvm::SmallVector<mlir::Location> ivLocs;
+  llvm::SmallVector<bool> inclusiveBounds;
+
+  if (outerDoConstruct.IsDoConcurrent())
+    TODO(currentLocation, "OpenACC loop with DO CONCURRENT");
+
+  llvm::SmallVector<mlir::Location> locs;
+  locs.push_back(currentLocation); // Location of the directive
+
+  int64_t collapseValue = Fortran::lower::getCollapseValue(accClauseList);
+  Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation();
+  for (unsigned i = 0; i < collapseValue; ++i) {
+    const Fortran::parser::LoopControl *loopControl;
+    if (i == 0) {
+      loopControl = &*outerDoConstruct.GetLoopControl();
+      locs.push_back(converter.genLocation(
+          Fortran::parser::FindSourceLocation(outerDoConstruct)));
+    } else {
+      auto *doCons = crtEval->getIf<Fortran::parser::DoConstruct>();
+      assert(doCons && "expect do construct");
+      loopControl = &*doCons->GetLoopControl();
+      locs.push_back(
+          converter.genLocation(Fortran::parser::FindSourceLocation(*doCons)));
+    }
+
+    const Fortran::parser::LoopControl::Bounds *bounds =
+        std::get_if<Fortran::parser::LoopControl::Bounds>(&loopControl->u);
+    assert(bounds && "Expected bounds on the loop construct");
+    lowerbounds.push_back(fir::getBase(converter.genExprValue(
+        *Fortran::semantics::GetExpr(bounds->lower), stmtCtx)));
+    upperbounds.push_back(fir::getBase(converter.genExprValue(
+        *Fortran::semantics::GetExpr(bounds->upper), stmtCtx)));
+    if (bounds->step)
+      steps.push_back(fir::getBase(converter.genExprValue(
+          *Fortran::semantics::GetExpr(bounds->step), stmtCtx)));
+    else // If `step` is not present, assume it is `1`.
+ steps.push_back(builder.createIntegerConstant( + currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); + + Fortran::semantics::Symbol &ivSym = + bounds->name.thing.symbol->GetUltimate(); + + mlir::Type ivTy = getTypeFromIvTypeSize(builder, ivSym); + mlir::Value ivValue = converter.getSymbolAddress(ivSym); + ivTypes.push_back(ivTy); + ivLocs.push_back(currentLocation); + std::string recipeName = + fir::getTypeAsString(ivValue.getType(), converter.getKindMap(), + Fortran::lower::privatizationRecipePrefix); + auto recipe = Fortran::lower::createOrGetPrivateRecipe( + builder, recipeName, currentLocation, ivValue.getType()); + std::stringstream asFortran; + auto op = createDataEntryOp( + builder, currentLocation, ivValue, asFortran, {}, true, + /*implicit=*/true, mlir::acc::DataClause::acc_private, + ivValue.getType()); + + privateOperands.push_back(op.getAccPtr()); + ivPrivate.push_back(op.getAccPtr()); + privatizations.push_back(mlir::SymbolRefAttr::get( + builder.getContext(), recipe.getSymName().str())); + inclusiveBounds.push_back(true); + converter.bindSymbol(ivSym, op.getAccPtr()); + if (i < collapseValue - 1) + crtEval = &*std::next(crtEval->getNestedEvaluations().begin()); + } + for (const Fortran::parser::AccClause &clause : accClauseList.v) { mlir::Location clauseLocation = converter.genLocation(clause.source); if (const auto *gangClause = @@ -1776,6 +1857,9 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, // Prepare the operand segment size attribute and the operands value range. llvm::SmallVector operands; llvm::SmallVector operandSegments; + addOperands(operands, operandSegments, lowerbounds); + addOperands(operands, operandSegments, upperbounds); + addOperands(operands, operandSegments, steps); addOperands(operands, operandSegments, gangOperands); addOperands(operands, operandSegments, workerNumOperands); addOperands(operands, operandSegments, vectorOperands); @@ -1793,8 +1877,15 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, } auto loopOp = createRegionOp( - builder, currentLocation, eval, operands, operandSegments, - /*outerCombined=*/false, retTy, yieldValue); + builder, builder.getFusedLoc(locs), currentLocation, eval, operands, + operandSegments, /*outerCombined=*/false, retTy, yieldValue, ivTypes, + ivLocs); + + for (auto [arg, value] : llvm::zip( + loopOp.getLoopRegions().front()->front().getArguments(), ivPrivate)) + builder.create(currentLocation, arg, value); + + loopOp.setInclusiveUpperbound(inclusiveBounds); if (!gangDeviceTypes.empty()) loopOp.setGangAttr(builder.getArrayAttr(gangDeviceTypes)); @@ -1881,15 +1972,19 @@ genACC(Fortran::lower::AbstractConverter &converter, converter.genLocation(beginLoopDirective.source); Fortran::lower::StatementContext stmtCtx; - if (loopDirective.v == llvm::acc::ACCD_loop) { - const auto &accClauseList = - std::get(beginLoopDirective.t); - auto loopOp = - createLoopOp(converter, currentLocation, eval, semanticsContext, - stmtCtx, accClauseList, needEarlyExitHandling); - if (needEarlyExitHandling) - return loopOp.getResult(0); - } + assert(loopDirective.v == llvm::acc::ACCD_loop && + "Unsupported OpenACC loop construct"); + + const auto &accClauseList = + std::get(beginLoopDirective.t); + const auto &outerDoConstruct = + std::get>(loopConstruct.t); + auto loopOp = createLoopOp(converter, currentLocation, semanticsContext, + stmtCtx, *outerDoConstruct, eval, accClauseList, + needEarlyExitHandling); + if (needEarlyExitHandling) + return loopOp.getResult(0); + return mlir::Value{}; } @@ 
-2186,12 +2281,12 @@ createComputeOp(Fortran::lower::AbstractConverter &converter, Op computeOp; if constexpr (std::is_same_v) computeOp = createRegionOp( - builder, currentLocation, eval, operands, operandSegments, - outerCombined); + builder, currentLocation, currentLocation, eval, operands, + operandSegments, outerCombined); else computeOp = createRegionOp( - builder, currentLocation, eval, operands, operandSegments, - outerCombined); + builder, currentLocation, currentLocation, eval, operands, + operandSegments, outerCombined); if (addSelfAttr) computeOp.setSelfAttrAttr(builder.getUnitAttr()); @@ -2397,7 +2492,8 @@ static void genACCDataOp(Fortran::lower::AbstractConverter &converter, return; auto dataOp = createRegionOp( - builder, currentLocation, eval, operands, operandSegments); + builder, currentLocation, currentLocation, eval, operands, + operandSegments); if (!asyncDeviceTypes.empty()) dataOp.setAsyncDeviceTypeAttr(builder.getArrayAttr(asyncDeviceTypes)); @@ -2486,7 +2582,8 @@ genACCHostDataOp(Fortran::lower::AbstractConverter &converter, auto hostDataOp = createRegionOp( - builder, currentLocation, eval, operands, operandSegments); + builder, currentLocation, currentLocation, eval, operands, + operandSegments); if (addIfPresentAttr) hostDataOp.setIfPresentAttr(builder.getUnitAttr()); @@ -2539,6 +2636,9 @@ genACC(Fortran::lower::AbstractConverter &converter, std::get(beginCombinedDirective.t); const auto &accClauseList = std::get(beginCombinedDirective.t); + const auto &outerDoConstruct = + std::get>( + combinedConstruct.t); mlir::Location currentLocation = converter.genLocation(beginCombinedDirective.source); @@ -2548,20 +2648,20 @@ genACC(Fortran::lower::AbstractConverter &converter, createComputeOp( converter, currentLocation, eval, semanticsContext, stmtCtx, accClauseList, /*outerCombined=*/true); - createLoopOp(converter, currentLocation, eval, semanticsContext, stmtCtx, - accClauseList); + createLoopOp(converter, currentLocation, semanticsContext, stmtCtx, + *outerDoConstruct, eval, accClauseList); } else if (combinedDirective.v == llvm::acc::ACCD_parallel_loop) { createComputeOp( converter, currentLocation, eval, semanticsContext, stmtCtx, accClauseList, /*outerCombined=*/true); - createLoopOp(converter, currentLocation, eval, semanticsContext, stmtCtx, - accClauseList); + createLoopOp(converter, currentLocation, semanticsContext, stmtCtx, + *outerDoConstruct, eval, accClauseList); } else if (combinedDirective.v == llvm::acc::ACCD_serial_loop) { createComputeOp(converter, currentLocation, eval, semanticsContext, stmtCtx, accClauseList, /*outerCombined=*/true); - createLoopOp(converter, currentLocation, eval, semanticsContext, stmtCtx, - accClauseList); + createLoopOp(converter, currentLocation, semanticsContext, stmtCtx, + *outerDoConstruct, eval, accClauseList); } else { llvm::report_fatal_error("Unknown combined construct encountered"); } @@ -3985,3 +4085,16 @@ void Fortran::lower::genEarlyReturnInOpenACCLoop(fir::FirOpBuilder &builder, builder.createIntegerConstant(loc, builder.getI1Type(), 1); builder.create(loc, yieldValue); } + +int64_t Fortran::lower::getCollapseValue( + const Fortran::parser::AccClauseList &clauseList) { + for (const Fortran::parser::AccClause &clause : clauseList.v) { + if (const auto *collapseClause = + std::get_if(&clause.u)) { + const parser::AccCollapseArg &arg = collapseClause->v; + const auto &collapseValue{std::get(arg.t)}; + return *Fortran::semantics::GetIntValue(collapseValue); + } + } + return 1; +} diff --git 
a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 index 755111a69467b..21660d5c3a131 100644 --- a/flang/test/Lower/OpenACC/acc-kernels-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-kernels-loop.f90 @@ -43,8 +43,7 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -57,8 +56,7 @@ subroutine acc_kernels_loop !$acc end kernels loop ! CHECK: acc.kernels { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -71,8 +69,7 @@ subroutine acc_kernels_loop ! CHECK: [[ASYNC1:%.*]] = arith.constant 1 : i32 ! CHECK: acc.kernels async([[ASYNC1]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -85,8 +82,7 @@ subroutine acc_kernels_loop ! CHECK: [[ASYNC2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.kernels async([[ASYNC2]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -98,8 +94,7 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -112,8 +107,7 @@ subroutine acc_kernels_loop ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 ! CHECK: acc.kernels wait({[[WAIT1]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -127,8 +121,7 @@ subroutine acc_kernels_loop ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 ! CHECK: acc.kernels wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -142,8 +135,7 @@ subroutine acc_kernels_loop ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.kernels wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -156,8 +148,7 @@ subroutine acc_kernels_loop ! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32 ! CHECK: acc.kernels num_gangs({[[NUMGANGS1]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -170,8 +161,7 @@ subroutine acc_kernels_loop ! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.kernels num_gangs({[[NUMGANGS2]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -184,8 +174,7 @@ subroutine acc_kernels_loop ! CHECK: [[NUMWORKERS1:%.*]] = arith.constant 10 : i32 ! CHECK: acc.kernels num_workers([[NUMWORKERS1]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -198,8 +187,7 @@ subroutine acc_kernels_loop ! CHECK: [[NUMWORKERS2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.kernels num_workers([[NUMWORKERS2]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! 
CHECK: acc.terminator @@ -212,8 +200,7 @@ subroutine acc_kernels_loop ! CHECK: [[VECTORLENGTH1:%.*]] = arith.constant 128 : i32 ! CHECK: acc.kernels vector_length([[VECTORLENGTH1]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -226,8 +213,7 @@ subroutine acc_kernels_loop ! CHECK: [[VECTORLENGTH2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.kernels vector_length([[VECTORLENGTH2]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -240,8 +226,7 @@ subroutine acc_kernels_loop ! CHECK: [[IF1:%.*]] = arith.constant true ! CHECK: acc.kernels if([[IF1]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -255,8 +240,7 @@ subroutine acc_kernels_loop ! CHECK: [[IFCOND:%.*]] = fir.load %{{.*}} : !fir.ref> ! CHECK: [[IF2:%.*]] = fir.convert [[IFCOND]] : (!fir.logical<4>) -> i1 ! CHECK: acc.kernels if([[IF2]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -269,8 +253,7 @@ subroutine acc_kernels_loop ! CHECK: [[SELF1:%.*]] = arith.constant true ! CHECK: acc.kernels self([[SELF1]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -282,8 +265,7 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -297,8 +279,7 @@ subroutine acc_kernels_loop ! CHECK: %[[SELF2:.*]] = fir.convert %[[DECLIFCONDITION]]#1 : (!fir.ref>) -> i1 ! CHECK: acc.kernels self(%[[SELF2]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -313,8 +294,7 @@ subroutine acc_kernels_loop ! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.kernels dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -330,8 +310,7 @@ subroutine acc_kernels_loop ! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.kernels dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -347,8 +326,7 @@ subroutine acc_kernels_loop ! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.kernels dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! 
CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -362,8 +340,7 @@ subroutine acc_kernels_loop ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.kernels dataOperands(%[[CREATE_A]], %[[CREATE_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -379,8 +356,7 @@ subroutine acc_kernels_loop ! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: acc.kernels dataOperands(%[[CREATE_B]], %[[CREATE_A]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -396,8 +372,7 @@ subroutine acc_kernels_loop ! CHECK: %[[NOCREATE_A:.*]] = acc.nocreate varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[NOCREATE_B:.*]] = acc.nocreate varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.kernels dataOperands(%[[NOCREATE_A]], %[[NOCREATE_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -411,8 +386,7 @@ subroutine acc_kernels_loop ! CHECK: %[[PRESENT_A:.*]] = acc.present varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[PRESENT_B:.*]] = acc.present varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.kernels dataOperands(%[[PRESENT_A]], %[[PRESENT_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -426,8 +400,7 @@ subroutine acc_kernels_loop ! CHECK: %[[DEVICEPTR_A:.*]] = acc.deviceptr varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[DEVICEPTR_B:.*]] = acc.deviceptr varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.kernels dataOperands(%[[DEVICEPTR_A]], %[[DEVICEPTR_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -445,8 +418,7 @@ subroutine acc_kernels_loop ! CHECK: %[[BOX_ADDR_G:.*]] = fir.box_addr %[[BOX_G]] : (!fir.box>) -> !fir.ptr ! CHECK: %[[ATTACH_G:.*]] = acc.attach varPtr(%[[BOX_ADDR_G]] : !fir.ptr) -> !fir.ptr {name = "g"} ! CHECK: acc.kernels dataOperands(%[[ATTACH_F]], %[[ATTACH_G]] : !fir.ptr, !fir.ptr) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -458,10 +430,9 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {seq = [#acc.device_type]} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -471,10 +442,9 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! 
CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -484,10 +454,9 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {independent = [#acc.device_type]} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -497,10 +466,9 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop gang { -! CHECK: fir.do_loop +! CHECK: acc.loop gang() {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -511,8 +479,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 -! CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) { -! CHECK: fir.do_loop +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -525,8 +492,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref -! CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) { -! CHECK: fir.do_loop +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -538,8 +504,7 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -550,10 +515,9 @@ subroutine acc_kernels_loop a(i) = b(i) END DO ! CHECK: acc.kernels { -! CHECK: acc.loop vector { -! CHECK: fir.do_loop +! CHECK: acc.loop vector() {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -564,8 +528,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[CONSTANT128:%.*]] = arith.constant 128 : i32 -! CHECK: acc.loop vector([[CONSTANT128]] : i32) { -! CHECK: fir.do_loop +! CHECK: acc.loop vector([[CONSTANT128]] : i32) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -578,8 +541,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[VECTORLENGTH:%.*]] = fir.load %{{.*}} : !fir.ref -! CHECK: acc.loop vector([[VECTORLENGTH]] : i32) { -! CHECK: fir.do_loop +! CHECK: acc.loop vector([[VECTORLENGTH]] : i32) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -591,10 +553,9 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop worker { -! CHECK: fir.do_loop +! CHECK: acc.loop worker() {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -605,8 +566,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[WORKER128:%.*]] = arith.constant 128 : i32 -! CHECK: acc.loop worker([[WORKER128]] : i32) { -! CHECK: fir.do_loop +! CHECK: acc.loop worker([[WORKER128]] : i32) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! 
CHECK: acc.terminator @@ -620,11 +580,9 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop { -! CHECK: fir.do_loop -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} +! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.terminator ! CHECK-NEXT: }{{$}} @@ -637,10 +595,8 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop { -! CHECK: fir.do_loop -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -655,8 +611,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[TILESIZE:%.*]] = arith.constant 2 : i32 -! CHECK: acc.loop tile({[[TILESIZE]] : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({[[TILESIZE]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -669,8 +624,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[TILESIZEM1:%.*]] = arith.constant -1 : i32 -! CHECK: acc.loop tile({[[TILESIZEM1]] : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({[[TILESIZEM1]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -686,8 +640,7 @@ subroutine acc_kernels_loop ! CHECK: acc.kernels { ! CHECK: [[TILESIZE1:%.*]] = arith.constant 2 : i32 ! CHECK: [[TILESIZE2:%.*]] = arith.constant 2 : i32 -! CHECK: acc.loop tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -699,8 +652,7 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop tile({%{{.*}} : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({%{{.*}} : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -714,8 +666,7 @@ subroutine acc_kernels_loop END DO ! CHECK: acc.kernels { -! CHECK: acc.loop tile({%{{.*}} : i32, %{{.*}} : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({%{{.*}} : i32, %{{.*}} : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator @@ -730,8 +681,7 @@ subroutine acc_kernels_loop ! CHECK: %[[COPYINREDR:.*]] = acc.copyin varPtr(%{{.*}} : !fir.ref) -> !fir.ref {dataClause = #acc, implicit = true, name = "reduction_r"} ! CHECK: %[[COPYINREDI:.*]] = acc.copyin varPtr(%{{.*}} : !fir.ref) -> !fir.ref {dataClause = #acc, implicit = true, name = "reduction_i"} ! CHECK: acc.kernels dataOperands(%[[COPYINREDR]], %[[COPYINREDI]] : !fir.ref, !fir.ref) { -! CHECK: acc.loop reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.terminator diff --git a/flang/test/Lower/OpenACC/acc-loop-exit.f90 b/flang/test/Lower/OpenACC/acc-loop-exit.f90 index 75f1c30733272..c1ea057af6673 100644 --- a/flang/test/Lower/OpenACC/acc-loop-exit.f90 +++ b/flang/test/Lower/OpenACC/acc-loop-exit.f90 @@ -15,11 +15,15 @@ subroutine sub1(x, a) ! CHECK-LABEL: func.func @_QPsub1 ! CHECK: %[[A:.*]]:2 = hlfir.declare %arg1 {uniq_name = "_QFsub1Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) -! 
CHECK: %[[EXIT_COND:.*]] = acc.loop { +! CHECK: %[[I:.*]]:2 = hlfir.declare %2 {uniq_name = "_QFsub1Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[I:.*]]:2 = hlfir.declare %6 {uniq_name = "_QFsub1Ei"} : (!fir.ref) -> (!fir.ref, !fir.ref) +! CHECK: %[[EXIT_COND:.*]] = acc.loop ! CHECK: ^bb{{.*}}: ! CHECK: ^bb{{.*}}: -! CHECK: %[[LOAD_A:.*]] = fir.load %[[A]]#0 : !fir.ref -! CHECK: %[[CMP:.*]] = arith.cmpi eq, %15, %[[LOAD_A]] : i32 +! CHECK: %[[LOAD_I:.*]] = fir.load %[[I]]#0 : !fir.ref +! CHECK: %[[LOAD_I:.*]] = fir.load %[[I]]#0 : !fir.ref +! CHECK: %[[LOAD_A:.*]] = fir.load %[[A]]#0 : !fir.ref +! CHECK: %[[CMP:.*]] = arith.cmpi eq, %[[LOAD_I]], %[[LOAD_A]] : i32 ! CHECK: cf.cond_br %[[CMP]], ^[[EARLY_RET:.*]], ^[[NO_RET:.*]] ! CHECK: ^[[EARLY_RET]]: ! CHECK: acc.yield %true : i1 diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90 index 59c2513332a97..334d8767a727a 100644 --- a/flang/test/Lower/OpenACC/acc-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-loop.f90 @@ -27,185 +27,168 @@ program acc_loop a(i) = b(i) END DO -!CHECK: acc.loop { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} !$acc loop seq DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: } attributes {seq = [#acc.device_type]} +! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} !$acc loop auto DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} +! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} !$acc loop independent DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: } attributes {independent = [#acc.device_type]} +! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} !$acc loop gang DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop gang { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop gang() private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop gang(num: 8) DO i = 1, n a(i) = b(i) END DO -!CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 -!CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! 
CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop gang(num: gangNum) DO i = 1, n a(i) = b(i) END DO -!CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref -!CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop gang(num: gangNum, static: gangStatic) DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop vector DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop vector { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop vector() private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop vector(128) DO i = 1, n a(i) = b(i) END DO -!CHECK: [[CONSTANT128:%.*]] = arith.constant 128 : i32 -!CHECK: acc.loop vector([[CONSTANT128]] : i32) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: [[CONSTANT128:%.*]] = arith.constant 128 : i32 +! CHECK: acc.loop vector([[CONSTANT128]] : i32) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: }{{$}} !$acc loop vector(vectorLength) DO i = 1, n a(i) = b(i) END DO -!CHECK: [[VECTORLENGTH:%.*]] = fir.load %{{.*}} : !fir.ref -!CHECK: acc.loop vector([[VECTORLENGTH]] : i32) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: [[VECTORLENGTH:%.*]] = fir.load %{{.*}} : !fir.ref +! CHECK: acc.loop vector([[VECTORLENGTH]] : i32) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop worker DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop worker { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop worker() private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop worker(128) DO i = 1, n a(i) = b(i) END DO -!CHECK: [[WORKER128:%.*]] = arith.constant 128 : i32 -!CHECK: acc.loop worker([[WORKER128]] : i32) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: [[WORKER128:%.*]] = arith.constant 128 : i32 +! CHECK: acc.loop worker([[WORKER128]] : i32) private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! 
CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop private(c) DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop private(@privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref, @privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop private(c, d) DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop private(@privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>, @privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref, @privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>, @privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop private(c) private(d) DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop private(@privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>, @privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref, @privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>, @privatization_ref_10x10xf32 -> %{{.*}} : !fir.ref>) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop tile(2) DO i = 1, n a(i) = b(i) END DO -!CHECK: [[TILESIZE:%.*]] = arith.constant 2 : i32 -!CHECK: acc.loop tile({[[TILESIZE]] : i32}) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} + +! CHECK: [[TILESIZE:%.*]] = arith.constant 2 : i32 +! CHECK: acc.loop {{.*}} tile({[[TILESIZE]] : i32}) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop tile(*) DO i = 1, n a(i) = b(i) END DO -!CHECK: [[TILESIZEM1:%.*]] = arith.constant -1 : i32 -!CHECK: acc.loop tile({[[TILESIZEM1]] : i32}) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: [[TILESIZEM1:%.*]] = arith.constant -1 : i32 +! CHECK: acc.loop {{.*}} tile({[[TILESIZEM1]] : i32}) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop tile(2, 2) DO i = 1, n @@ -214,22 +197,20 @@ program acc_loop END DO END DO -!CHECK: [[TILESIZE1:%.*]] = arith.constant 2 : i32 -!CHECK: [[TILESIZE2:%.*]] = arith.constant 2 : i32 -!CHECK: acc.loop tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: [[TILESIZE1:%.*]] = arith.constant 2 : i32 +! CHECK: [[TILESIZE2:%.*]] = arith.constant 2 : i32 +! CHECK: acc.loop {{.*}} tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop tile(tileSize) DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop tile({%{{.*}} : i32}) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! 
CHECK: acc.loop {{.*}} tile({%{{.*}} : i32}) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop tile(tileSize, tileSize) DO i = 1, n @@ -238,10 +219,9 @@ program acc_loop END DO END DO -!CHECK: acc.loop tile({%{{.*}} : i32, %{{.*}} : i32}) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop {{.*}} tile({%{{.*}} : i32, %{{.*}} : i32}) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop collapse(2) DO i = 1, n @@ -250,11 +230,11 @@ program acc_loop END DO END DO -!CHECK: acc.loop { -!CHECK: fir.do_loop -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} +! CHECK: acc.loop {{.*}} (%arg0 : i32, %arg1 : i32) = (%{{.*}} : i32, i32) to (%{{.*}} : i32, i32) step (%{{.*}} : i32, i32) { +! CHECK: fir.store %arg0 to %{{.*}} : !fir.ref +! CHECK: fir.store %arg1 to %{{.*}} : !fir.ref +! CHECK: acc.yield +! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} !$acc loop DO i = 1, n @@ -264,14 +244,12 @@ program acc_loop END DO END DO -!CHECK: acc.loop { -!CHECK: fir.do_loop -!CHECK: acc.loop { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop {{.*}} (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.loop {{.*}} (%arg1 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop reduction(+:reduction_r) reduction(*:reduction_i) do i = 1, n @@ -279,31 +257,27 @@ program acc_loop reduction_i = 1 end do -! CHECK: acc.loop reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) { -! CHECK: fir.do_loop +! CHECK: acc.loop private(@privatization_ref_i32 -> %{{.*}} : !fir.ref) reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop gang(dim: gangDim, static: gangStatic) DO i = 1, n a(i) = b(i) END DO -!CHECK: acc.loop gang({dim=%{{.*}}, static=%{{.*}} : i32}) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop gang({dim=%{{.*}}, static=%{{.*}} : i32}) {{.*}} (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop gang(dim: 1) DO i = 1, n a(i) = b(i) END DO -!CHECK: [[GANGDIM1:%.*]] = arith.constant 1 : i32 -!CHECK-NEXT: acc.loop gang({dim=[[GANGDIM1]] : i32}) { -!CHECK: fir.do_loop -!CHECK: acc.yield -!CHECK-NEXT: }{{$}} +! CHECK: acc.loop gang({dim={{.*}} : i32}) {{.*}} (%arg0 : i32) = (%{{.*}} : i32) to (%{{.*}} : i32) step (%{{.*}} : i32) { +! CHECK: acc.yield +! CHECK-NEXT: } attributes {inclusiveUpperbound = array} !$acc loop DO i = 1, n @@ -312,13 +286,12 @@ program acc_loop END DO ! CHECK: %[[CACHE:.*]] = acc.cache varPtr(%{{.*}} : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.loop cache(%[[CACHE]] : !fir.ref>) +! 
CHECK: acc.loop {{.*}} cache(%[[CACHE]] : !fir.ref>) !$acc loop do 100 i=0, n 100 continue ! CHECK: acc.loop -! CHECK: fir.do_loop !$acc loop gang device_type(nvidia) gang(8) DO i = 1, n diff --git a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 index faef8517850e0..614d201f98e26 100644 --- a/flang/test/Lower/OpenACC/acc-parallel-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-parallel-loop.f90 @@ -45,8 +45,7 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -59,8 +58,7 @@ subroutine acc_parallel_loop !$acc end parallel loop ! CHECK: acc.parallel { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -73,8 +71,7 @@ subroutine acc_parallel_loop ! CHECK: [[ASYNC1:%.*]] = arith.constant 1 : i32 ! CHECK: acc.parallel async([[ASYNC1]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -87,8 +84,7 @@ subroutine acc_parallel_loop ! CHECK: [[ASYNC2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.parallel async([[ASYNC2]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -100,8 +96,7 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -114,8 +109,7 @@ subroutine acc_parallel_loop ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 ! CHECK: acc.parallel wait({[[WAIT1]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -129,8 +123,7 @@ subroutine acc_parallel_loop ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 ! CHECK: acc.parallel wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -144,8 +137,7 @@ subroutine acc_parallel_loop ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.parallel wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -158,8 +150,7 @@ subroutine acc_parallel_loop ! CHECK: [[NUMGANGS1:%.*]] = arith.constant 1 : i32 ! CHECK: acc.parallel num_gangs({[[NUMGANGS1]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -172,8 +163,7 @@ subroutine acc_parallel_loop ! CHECK: [[NUMGANGS2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.parallel num_gangs({[[NUMGANGS2]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -186,8 +176,7 @@ subroutine acc_parallel_loop ! CHECK: [[NUMWORKERS1:%.*]] = arith.constant 10 : i32 ! CHECK: acc.parallel num_workers([[NUMWORKERS1]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -200,8 +189,7 @@ subroutine acc_parallel_loop ! CHECK: [[NUMWORKERS2:%.*]] = fir.load %{{.*}} : !fir.ref ! 
CHECK: acc.parallel num_workers([[NUMWORKERS2]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -214,8 +202,7 @@ subroutine acc_parallel_loop ! CHECK: [[VECTORLENGTH1:%.*]] = arith.constant 128 : i32 ! CHECK: acc.parallel vector_length([[VECTORLENGTH1]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -228,8 +215,7 @@ subroutine acc_parallel_loop ! CHECK: [[VECTORLENGTH2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.parallel vector_length([[VECTORLENGTH2]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -242,8 +228,7 @@ subroutine acc_parallel_loop ! CHECK: [[IF1:%.*]] = arith.constant true ! CHECK: acc.parallel if([[IF1]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -257,8 +242,7 @@ subroutine acc_parallel_loop ! CHECK: [[IFCOND:%.*]] = fir.load %{{.*}} : !fir.ref> ! CHECK: [[IF2:%.*]] = fir.convert [[IFCOND]] : (!fir.logical<4>) -> i1 ! CHECK: acc.parallel if([[IF2]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -271,8 +255,7 @@ subroutine acc_parallel_loop ! CHECK: [[SELF1:%.*]] = arith.constant true ! CHECK: acc.parallel self([[SELF1]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -284,8 +267,7 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -298,8 +280,7 @@ subroutine acc_parallel_loop ! CHECK: %[[SELF2:.*]] = fir.convert %[[DECLIFCONDITION]]#1 : (!fir.ref>) -> i1 ! CHECK: acc.parallel self(%[[SELF2]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -313,8 +294,7 @@ subroutine acc_parallel_loop ! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.parallel dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -330,8 +310,7 @@ subroutine acc_parallel_loop ! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.parallel dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -347,8 +326,7 @@ subroutine acc_parallel_loop ! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! 
CHECK: acc.parallel dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -362,8 +340,7 @@ subroutine acc_parallel_loop ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.parallel dataOperands(%[[CREATE_A]], %[[CREATE_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -379,8 +356,7 @@ subroutine acc_parallel_loop ! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: acc.parallel dataOperands(%[[CREATE_B]], %[[CREATE_A]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -396,8 +372,7 @@ subroutine acc_parallel_loop ! CHECK: %[[NOCREATE_A:.*]] = acc.nocreate varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[NOCREATE_B:.*]] = acc.nocreate varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.parallel dataOperands(%[[NOCREATE_A]], %[[NOCREATE_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -411,8 +386,7 @@ subroutine acc_parallel_loop ! CHECK: %[[PRESENT_A:.*]] = acc.present varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[PRESENT_B:.*]] = acc.present varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.parallel dataOperands(%[[PRESENT_A]], %[[PRESENT_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -426,8 +400,7 @@ subroutine acc_parallel_loop ! CHECK: %[[DEVICEPTR_A:.*]] = acc.deviceptr varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[DEVICEPTR_B:.*]] = acc.deviceptr varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.parallel dataOperands(%[[DEVICEPTR_A]], %[[DEVICEPTR_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -445,8 +418,7 @@ subroutine acc_parallel_loop ! CHECK: %[[BOX_ADDR_G:.*]] = fir.box_addr %[[BOX_G]] : (!fir.box>) -> !fir.ptr ! CHECK: %[[ATTACH_G:.*]] = acc.attach varPtr(%[[BOX_ADDR_G]] : !fir.ptr) -> !fir.ptr {name = "g"} ! CHECK: acc.parallel dataOperands(%[[ATTACH_F]], %[[ATTACH_G]] : !fir.ptr, !fir.ptr) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -460,8 +432,8 @@ subroutine acc_parallel_loop ! CHECK: %[[ACC_PRIVATE_B:.*]] = acc.firstprivate varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.parallel firstprivate(@firstprivatization_section_ext10_ref_10xf32 -> %[[ACC_PRIVATE_B]] : !fir.ref>) { ! 
CHECK: %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} -! CHECK: acc.loop private(@privatization_ref_10xf32 -> %[[ACC_PRIVATE_A]] : !fir.ref>) { -! CHECK: fir.do_loop +! CHECK: acc.loop private({{.*}}@privatization_ref_10xf32 -> %[[ACC_PRIVATE_A]] : !fir.ref>) +! CHECK-NOT: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -473,10 +445,9 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {seq = [#acc.device_type]} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -486,10 +457,9 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -499,10 +469,9 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {independent = [#acc.device_type]} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -512,10 +481,9 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop gang { -! CHECK: fir.do_loop +! CHECK: acc.loop gang() ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -526,8 +494,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 -! CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) { -! CHECK: fir.do_loop +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -540,8 +507,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref -! CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) { -! CHECK: fir.do_loop +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -553,8 +519,7 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -565,10 +530,9 @@ subroutine acc_parallel_loop a(i) = b(i) END DO ! CHECK: acc.parallel { -! CHECK: acc.loop vector { -! CHECK: fir.do_loop +! CHECK: acc.loop vector() ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -579,8 +543,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[CONSTANT128:%.*]] = arith.constant 128 : i32 -! CHECK: acc.loop vector([[CONSTANT128]] : i32) { -! CHECK: fir.do_loop +! CHECK: acc.loop vector([[CONSTANT128]] : i32) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -593,8 +556,8 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[VECTORLENGTH:%.*]] = fir.load %{{.*}} : !fir.ref -! CHECK: acc.loop vector([[VECTORLENGTH]] : i32) { -! 
CHECK: fir.do_loop +! CHECK: acc.loop vector([[VECTORLENGTH]] : i32) {{.*}} { +! CHECK-NOT: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -606,10 +569,10 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop worker { -! CHECK: fir.do_loop +! CHECK: acc.loop worker() {{.*}} { +! CHECK-NOT: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -620,8 +583,8 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[WORKER128:%.*]] = arith.constant 128 : i32 -! CHECK: acc.loop worker([[WORKER128]] : i32) { -! CHECK: fir.do_loop +! CHECK: acc.loop worker([[WORKER128]] : i32) {{.*}} { +! CHECK-NOT: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -635,11 +598,9 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop { -! CHECK: fir.do_loop -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} +! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -652,10 +613,8 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop { -! CHECK: fir.do_loop -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -670,8 +629,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[TILESIZE:%.*]] = arith.constant 2 : i32 -! CHECK: acc.loop tile({[[TILESIZE]] : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({[[TILESIZE]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -684,8 +642,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[TILESIZEM1:%.*]] = arith.constant -1 : i32 -! CHECK: acc.loop tile({[[TILESIZEM1]] : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({[[TILESIZEM1]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -701,8 +658,7 @@ subroutine acc_parallel_loop ! CHECK: acc.parallel { ! CHECK: [[TILESIZE1:%.*]] = arith.constant 2 : i32 ! CHECK: [[TILESIZE2:%.*]] = arith.constant 2 : i32 -! CHECK: acc.loop tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -714,8 +670,7 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop tile({%{{.*}} : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({%{{.*}} : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -729,8 +684,7 @@ subroutine acc_parallel_loop END DO ! CHECK: acc.parallel { -! CHECK: acc.loop tile({%{{.*}} : i32, %{{.*}} : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({%{{.*}} : i32, %{{.*}} : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -745,8 +699,7 @@ subroutine acc_parallel_loop ! CHECK: %[[COPYINREDR:.*]] = acc.copyin varPtr(%{{.*}} : !fir.ref) -> !fir.ref {dataClause = #acc, implicit = true, name = "reduction_r"} ! CHECK: %[[COPYINREDI:.*]] = acc.copyin varPtr(%{{.*}} : !fir.ref) -> !fir.ref {dataClause = #acc, implicit = true, name = "reduction_i"} ! 
CHECK: acc.parallel dataOperands(%[[COPYINREDR]], %[[COPYINREDI]] : !fir.ref, !fir.ref) { -! CHECK: acc.loop reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) {{.*}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -754,11 +707,12 @@ subroutine acc_parallel_loop ! CHECK: acc.copyout accPtr(%[[COPYINREDR]] : !fir.ref) to varPtr(%{{.*}} : !fir.ref) {dataClause = #acc, implicit = true, name = "reduction_r"} ! CHECK: acc.copyout accPtr(%[[COPYINREDI]] : !fir.ref) to varPtr(%{{.*}} : !fir.ref) {dataClause = #acc, implicit = true, name = "reduction_i"} + !$acc parallel loop do 10 i=0, n 10 continue ! CHECK: acc.parallel ! CHECK: acc.loop -! CHECK: fir.do_loop +! CHECK-NOT: fir.do_loop end subroutine acc_parallel_loop diff --git a/flang/test/Lower/OpenACC/acc-private.f90 b/flang/test/Lower/OpenACC/acc-private.f90 index 042596568bd77..c0bcb5e2ffd56 100644 --- a/flang/test/Lower/OpenACC/acc-private.f90 +++ b/flang/test/Lower/OpenACC/acc-private.f90 @@ -189,7 +189,7 @@ program acc_private END DO ! CHECK: %[[C_PRIVATE:.*]] = acc.private varPtr(%[[DECLC]]#1 : !fir.ref) -> !fir.ref {name = "c"} -! CHECK: acc.loop private(@privatization_ref_i32 -> %[[C_PRIVATE]] : !fir.ref) +! CHECK: acc.loop private({{.*}}@privatization_ref_i32 -> %[[C_PRIVATE]] : !fir.ref) ! CHECK: acc.yield !$acc loop private(b) @@ -203,7 +203,7 @@ program acc_private ! CHECK: %[[UB:.*]] = arith.subi %{{.*}}, %[[C1]] : index ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%{{.*}} : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index) ! CHECK: %[[B_PRIVATE:.*]] = acc.private varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "b"} -! CHECK: acc.loop private(@privatization_ref_100xf32 -> %[[B_PRIVATE]] : !fir.ref>) { +! CHECK: acc.loop private({{.*}}@privatization_ref_100xf32 -> %[[B_PRIVATE]] : !fir.ref>) ! CHECK: acc.yield !$acc loop private(b(1:50)) @@ -217,7 +217,7 @@ program acc_private ! CHECK: %[[UB:.*]] = arith.constant 49 : index ! CHECK: %[[BOUND:.*]] = acc.bounds lowerbound(%[[LB]] : index) upperbound(%[[UB]] : index) extent(%{{.*}} : index) stride(%[[C1]] : index) startIdx(%[[C1]] : index) ! CHECK: %[[B_PRIVATE:.*]] = acc.private varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%[[BOUND]]) -> !fir.ref> {name = "b(1:50)"} -! CHECK: acc.loop private(@privatization_ref_50xf32 -> %[[B_PRIVATE]] : !fir.ref>) +! CHECK: acc.loop private({{.*}}@privatization_ref_50xf32 -> %[[B_PRIVATE]] : !fir.ref>) !$acc parallel loop firstprivate(c) DO i = 1, n @@ -273,7 +273,7 @@ subroutine acc_private_assumed_shape(a, n) ! CHECK: acc.parallel { ! CHECK: %[[ADDR:.*]] = fir.box_addr %[[DECL_A]]#1 : (!fir.box>) -> !fir.ref> ! CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[ADDR]] : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} -! CHECK: acc.loop private(@privatization_box_Uxi32 -> %[[PRIVATE]] : !fir.ref>) { +! CHECK: acc.loop private({{.*}}@privatization_box_Uxi32 -> %[[PRIVATE]] : !fir.ref>) subroutine acc_private_allocatable_array(a, n) integer, allocatable :: a(:) @@ -296,7 +296,7 @@ subroutine acc_private_allocatable_array(a, n) ! CHECK: %[[BOX:.*]] = fir.load %[[DECLA_A]]#1 : !fir.ref>>> ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box>>) -> !fir.heap> ! 
CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[BOX_ADDR]] : !fir.heap>) bounds(%{{.*}}) -> !fir.heap> {name = "a"} -! CHECK: acc.loop private(@privatization_box_heap_Uxi32 -> %[[PRIVATE]] : !fir.heap>) +! CHECK: acc.loop private({{.*}}@privatization_box_heap_Uxi32 -> %[[PRIVATE]] : !fir.heap>) ! CHECK: acc.serial private(@privatization_box_heap_Uxi32 -> %{{.*}} : !fir.heap>) subroutine acc_private_pointer_array(a, n) @@ -316,7 +316,7 @@ subroutine acc_private_pointer_array(a, n) ! CHECK: %[[BOX:.*]] = fir.load %[[DECLA_A]]#1 : !fir.ref>>> ! CHECK: %[[BOX_ADDR:.*]] = fir.box_addr %[[BOX]] : (!fir.box>>) -> !fir.ptr> ! CHECK: %[[PRIVATE:.*]] = acc.private varPtr(%[[BOX_ADDR]] : !fir.ptr>) bounds(%{{.*}}) -> !fir.ptr> {name = "a"} -! CHECK: acc.loop private(@privatization_box_ptr_Uxi32 -> %[[PRIVATE]] : !fir.ptr>) +! CHECK: acc.loop private({{.*}}@privatization_box_ptr_Uxi32 -> %[[PRIVATE]] : !fir.ptr>) subroutine acc_private_dynamic_extent(a, n) integer :: n, i @@ -334,7 +334,7 @@ subroutine acc_private_dynamic_extent(a, n) ! CHECK: %[[DECL_A:.*]]:2 = hlfir.declare %[[ARG0]](%16) {uniq_name = "_QFacc_private_dynamic_extentEa"} : (!fir.ref>, !fir.shape<3>) -> (!fir.box>, !fir.ref>) ! CHECK: acc.parallel { ! CHECK: %[[PRIV:.*]] = acc.private varPtr(%[[DECL_A]]#1 : !fir.ref>) bounds(%{{.*}}, %{{.*}}, %{{.*}}) -> !fir.ref> {name = "a"} -! CHECK: acc.loop private(@privatization_ref_UxUx2xi32 -> %[[PRIV]] : !fir.ref>) +! CHECK: acc.loop private({{.*}}@privatization_ref_UxUx2xi32 -> %[[PRIV]] : !fir.ref>) subroutine acc_firstprivate_assumed_shape(a, n) integer :: a(:), i, n diff --git a/flang/test/Lower/OpenACC/acc-reduction.f90 b/flang/test/Lower/OpenACC/acc-reduction.f90 index fe04c3114b7e6..66a3153191723 100644 --- a/flang/test/Lower/OpenACC/acc-reduction.f90 +++ b/flang/test/Lower/OpenACC/acc-reduction.f90 @@ -704,7 +704,7 @@ subroutine acc_reduction_add_int(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref {fir.bindc_name = "b"}) ! CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref) -> !fir.ref {name = "b"} -! CHECK: acc.loop reduction(@reduction_add_ref_i32 -> %[[RED_B]] : !fir.ref) +! CHECK: acc.loop {{.*}} reduction(@reduction_add_ref_i32 -> %[[RED_B]] : !fir.ref) subroutine acc_reduction_add_int_array_1d(a, b) integer :: a(100) @@ -720,7 +720,7 @@ subroutine acc_reduction_add_int_array_1d(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref> {fir.bindc_name = "b"}) ! CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.loop reduction(@reduction_add_section_ext100_ref_100xi32 -> %[[RED_B]] : !fir.ref>) +! CHECK: acc.loop {{.*}} reduction(@reduction_add_section_ext100_ref_100xi32 -> %[[RED_B]] : !fir.ref>) subroutine acc_reduction_add_int_array_2d(a, b) integer :: a(100, 10), b(100, 10) @@ -738,8 +738,8 @@ subroutine acc_reduction_add_int_array_2d(a, b) ! CHECK-SAME: %[[ARG0:.*]]: !fir.ref> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "b"}) { ! CHECK: %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]] ! CHECK: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.loop reduction(@reduction_add_section_ext100xext10_ref_100x10xi32 -> %[[RED_ARG1]] : !fir.ref>) { -! CHECK: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} +! 
CHECK: acc.loop {{.*}} reduction(@reduction_add_section_ext100xext10_ref_100x10xi32 -> %[[RED_ARG1]] : !fir.ref>) +! CHECK: } attributes {collapse = [2]{{.*}} subroutine acc_reduction_add_int_array_3d(a, b) integer :: a(100, 10, 2), b(100, 10, 2) @@ -759,8 +759,8 @@ subroutine acc_reduction_add_int_array_3d(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "b"}) ! CHECK: %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]] ! CHECK: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref>) bounds(%{{.*}}, %{{.*}}, %{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.loop reduction(@reduction_add_section_ext100xext10xext2_ref_100x10x2xi32 -> %[[RED_ARG1]] : !fir.ref>) -! CHECK: } attributes {collapse = [3], collapseDeviceType = [#acc.device_type]} +! CHECK: acc.loop {{.*}} reduction(@reduction_add_section_ext100xext10xext2_ref_100x10x2xi32 -> %[[RED_ARG1]] : !fir.ref>) +! CHECK: } attributes {collapse = [3]{{.*}} subroutine acc_reduction_add_float(a, b) real :: a(100), b @@ -776,7 +776,7 @@ subroutine acc_reduction_add_float(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref {fir.bindc_name = "b"}) ! CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref) -> !fir.ref {name = "b"} -! CHECK: acc.loop reduction(@reduction_add_ref_f32 -> %[[RED_B]] : !fir.ref) +! CHECK: acc.loop {{.*}} reduction(@reduction_add_ref_f32 -> %[[RED_B]] : !fir.ref) subroutine acc_reduction_add_float_array_1d(a, b) real :: a(100), b(100) @@ -792,7 +792,7 @@ subroutine acc_reduction_add_float_array_1d(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref> {fir.bindc_name = "b"}) ! CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.loop reduction(@reduction_add_section_ext100_ref_100xf32 -> %[[RED_B]] : !fir.ref>) +! CHECK: acc.loop {{.*}} reduction(@reduction_add_section_ext100_ref_100xf32 -> %[[RED_B]] : !fir.ref>) subroutine acc_reduction_mul_int(a, b) integer :: a(100) @@ -808,7 +808,7 @@ subroutine acc_reduction_mul_int(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref {fir.bindc_name = "b"}) ! CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref) -> !fir.ref {name = "b"} -! CHECK: acc.loop reduction(@reduction_mul_ref_i32 -> %[[RED_B]] : !fir.ref) +! CHECK: acc.loop {{.*}} reduction(@reduction_mul_ref_i32 -> %[[RED_B]] : !fir.ref) subroutine acc_reduction_mul_int_array_1d(a, b) integer :: a(100) @@ -824,7 +824,7 @@ subroutine acc_reduction_mul_int_array_1d(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref> {fir.bindc_name = "b"}) ! CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.loop reduction(@reduction_mul_section_ext100_ref_100xi32 -> %[[RED_B]] : !fir.ref>) +! CHECK: acc.loop {{.*}} reduction(@reduction_mul_section_ext100_ref_100xi32 -> %[[RED_B]] : !fir.ref>) subroutine acc_reduction_mul_float(a, b) real :: a(100), b @@ -840,7 +840,7 @@ subroutine acc_reduction_mul_float(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref {fir.bindc_name = "b"}) ! CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! 
CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref) -> !fir.ref {name = "b"} -! CHECK: acc.loop reduction(@reduction_mul_ref_f32 -> %[[RED_B]] : !fir.ref) +! CHECK: acc.loop {{.*}} reduction(@reduction_mul_ref_f32 -> %[[RED_B]] : !fir.ref) subroutine acc_reduction_mul_float_array_1d(a, b) real :: a(100), b(100) @@ -856,7 +856,7 @@ subroutine acc_reduction_mul_float_array_1d(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref> {fir.bindc_name = "b"}) ! CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.loop reduction(@reduction_mul_section_ext100_ref_100xf32 -> %[[RED_B]] : !fir.ref>) +! CHECK: acc.loop {{.*}} reduction(@reduction_mul_section_ext100_ref_100xf32 -> %[[RED_B]] : !fir.ref>) subroutine acc_reduction_min_int(a, b) integer :: a(100) @@ -872,7 +872,7 @@ subroutine acc_reduction_min_int(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref {fir.bindc_name = "b"}) ! CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref) -> !fir.ref {name = "b"} -! CHECK: acc.loop reduction(@reduction_min_ref_i32 -> %[[RED_B]] : !fir.ref) +! CHECK: acc.loop {{.*}} reduction(@reduction_min_ref_i32 -> %[[RED_B]] : !fir.ref) subroutine acc_reduction_min_int_array_1d(a, b) integer :: a(100), b(100) @@ -888,7 +888,7 @@ subroutine acc_reduction_min_int_array_1d(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "b"}) ! CHECK: %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]] ! CHECK: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.loop reduction(@reduction_min_section_ext100_ref_100xi32 -> %[[RED_ARG1]] : !fir.ref>) +! CHECK: acc.loop {{.*}} reduction(@reduction_min_section_ext100_ref_100xi32 -> %[[RED_ARG1]] : !fir.ref>) subroutine acc_reduction_min_float(a, b) real :: a(100), b @@ -904,7 +904,7 @@ subroutine acc_reduction_min_float(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref {fir.bindc_name = "b"}) ! CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref) -> !fir.ref {name = "b"} -! CHECK: acc.loop reduction(@reduction_min_ref_f32 -> %[[RED_B]] : !fir.ref) +! CHECK: acc.loop {{.*}} reduction(@reduction_min_ref_f32 -> %[[RED_B]] : !fir.ref) subroutine acc_reduction_min_float_array2d(a, b) real :: a(100, 10), b(100, 10) @@ -922,8 +922,8 @@ subroutine acc_reduction_min_float_array2d(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "b"}) ! CHECK: %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]] ! CHECK: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.loop reduction(@reduction_min_section_ext100xext10_ref_100x10xf32 -> %[[RED_ARG1]] : !fir.ref>) -! CHECK: attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} +! CHECK: acc.loop {{.*}} reduction(@reduction_min_section_ext100xext10_ref_100x10xf32 -> %[[RED_ARG1]] : !fir.ref>) +! CHECK: attributes {collapse = [2]{{.*}} subroutine acc_reduction_max_int(a, b) integer :: a(100) @@ -939,7 +939,7 @@ subroutine acc_reduction_max_int(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref {fir.bindc_name = "b"}) ! 
CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref) -> !fir.ref {name = "b"} -! CHECK: acc.loop reduction(@reduction_max_ref_i32 -> %[[RED_B]] : !fir.ref) +! CHECK: acc.loop {{.*}} reduction(@reduction_max_ref_i32 -> %[[RED_B]] : !fir.ref) subroutine acc_reduction_max_int_array2d(a, b) integer :: a(100, 10), b(100, 10) @@ -957,7 +957,7 @@ subroutine acc_reduction_max_int_array2d(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "b"}) ! CHECK: %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]] ! CHECK: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref>) bounds(%{{.*}}, %{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.loop reduction(@reduction_max_section_ext100xext10_ref_100x10xi32 -> %[[RED_ARG1]] : !fir.ref>) +! CHECK: acc.loop {{.*}} reduction(@reduction_max_section_ext100xext10_ref_100x10xi32 -> %[[RED_ARG1]] : !fir.ref>) subroutine acc_reduction_max_float(a, b) real :: a(100), b @@ -973,7 +973,7 @@ subroutine acc_reduction_max_float(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[B:.*]]: !fir.ref {fir.bindc_name = "b"}) ! CHECK: %[[DECLB:.*]]:2 = hlfir.declare %[[B]] ! CHECK: %[[RED_B:.*]] = acc.reduction varPtr(%[[DECLB]]#1 : !fir.ref) -> !fir.ref {name = "b"} -! CHECK: acc.loop reduction(@reduction_max_ref_f32 -> %[[RED_B]] : !fir.ref) +! CHECK: acc.loop {{.*}} reduction(@reduction_max_ref_f32 -> %[[RED_B]] : !fir.ref) subroutine acc_reduction_max_float_array1d(a, b) real :: a(100), b(100) @@ -989,7 +989,7 @@ subroutine acc_reduction_max_float_array1d(a, b) ! CHECK-SAME: %{{.*}}: !fir.ref> {fir.bindc_name = "a"}, %[[ARG1:.*]]: !fir.ref> {fir.bindc_name = "b"}) ! CHECK: %[[DECLARG1:.*]]:2 = hlfir.declare %[[ARG1]] ! CHECK: %[[RED_ARG1:.*]] = acc.reduction varPtr(%[[DECLARG1]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} -! CHECK: acc.loop reduction(@reduction_max_section_ext100_ref_100xf32 -> %[[RED_ARG1]] : !fir.ref>) { +! CHECK: acc.loop {{.*}} reduction(@reduction_max_section_ext100_ref_100xf32 -> %[[RED_ARG1]] : !fir.ref>) subroutine acc_reduction_iand() integer :: i diff --git a/flang/test/Lower/OpenACC/acc-serial-loop.f90 b/flang/test/Lower/OpenACC/acc-serial-loop.f90 index 9333761e4c296..4134f9ff0ccf5 100644 --- a/flang/test/Lower/OpenACC/acc-serial-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-serial-loop.f90 @@ -64,8 +64,7 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -78,8 +77,7 @@ subroutine acc_serial_loop !$acc end serial loop ! CHECK: acc.serial { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -92,8 +90,7 @@ subroutine acc_serial_loop ! CHECK: [[ASYNC1:%.*]] = arith.constant 1 : i32 ! CHECK: acc.serial async([[ASYNC1]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -106,8 +103,7 @@ subroutine acc_serial_loop ! CHECK: [[ASYNC2:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.serial async([[ASYNC2]] : i32) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -119,8 +115,7 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! 
CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -133,8 +128,7 @@ subroutine acc_serial_loop ! CHECK: [[WAIT1:%.*]] = arith.constant 1 : i32 ! CHECK: acc.serial wait({[[WAIT1]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -148,8 +142,7 @@ subroutine acc_serial_loop ! CHECK: [[WAIT2:%.*]] = arith.constant 1 : i32 ! CHECK: [[WAIT3:%.*]] = arith.constant 2 : i32 ! CHECK: acc.serial wait({[[WAIT2]] : i32, [[WAIT3]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -163,8 +156,7 @@ subroutine acc_serial_loop ! CHECK: [[WAIT4:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: [[WAIT5:%.*]] = fir.load %{{.*}} : !fir.ref ! CHECK: acc.serial wait({[[WAIT4]] : i32, [[WAIT5]] : i32}) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -177,8 +169,7 @@ subroutine acc_serial_loop ! CHECK: [[IF1:%.*]] = arith.constant true ! CHECK: acc.serial if([[IF1]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -192,8 +183,7 @@ subroutine acc_serial_loop ! CHECK: [[IFCOND:%.*]] = fir.load %{{.*}} : !fir.ref> ! CHECK: [[IF2:%.*]] = fir.convert [[IFCOND]] : (!fir.logical<4>) -> i1 ! CHECK: acc.serial if([[IF2]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -206,8 +196,7 @@ subroutine acc_serial_loop ! CHECK: [[SELF1:%.*]] = arith.constant true ! CHECK: acc.serial self([[SELF1]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -219,8 +208,7 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -233,8 +221,7 @@ subroutine acc_serial_loop ! CHECK: %[[SELF2:.*]] = fir.convert %[[DECLIFCONDITION]]#1 : (!fir.ref>) -> i1 ! CHECK: acc.serial self(%[[SELF2]]) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -248,8 +235,7 @@ subroutine acc_serial_loop ! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.serial dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -265,8 +251,7 @@ subroutine acc_serial_loop ! CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.serial dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -282,8 +267,7 @@ subroutine acc_serial_loop ! 
CHECK: %[[COPYIN_A:.*]] = acc.copyin varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[COPYIN_B:.*]] = acc.copyin varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.serial dataOperands(%[[COPYIN_A]], %[[COPYIN_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -297,8 +281,7 @@ subroutine acc_serial_loop ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "b"} ! CHECK: acc.serial dataOperands(%[[CREATE_A]], %[[CREATE_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -314,8 +297,7 @@ subroutine acc_serial_loop ! CHECK: %[[CREATE_B:.*]] = acc.create varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: %[[CREATE_A:.*]] = acc.create varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {dataClause = #acc, name = "a"} ! CHECK: acc.serial dataOperands(%[[CREATE_B]], %[[CREATE_A]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -331,8 +313,7 @@ subroutine acc_serial_loop ! CHECK: %[[NOCREATE_A:.*]] = acc.nocreate varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[NOCREATE_B:.*]] = acc.nocreate varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.serial dataOperands(%[[NOCREATE_A]], %[[NOCREATE_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -346,8 +327,7 @@ subroutine acc_serial_loop ! CHECK: %[[PRESENT_A:.*]] = acc.present varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[PRESENT_B:.*]] = acc.present varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.serial dataOperands(%[[PRESENT_A]], %[[PRESENT_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -361,8 +341,7 @@ subroutine acc_serial_loop ! CHECK: %[[DEVICEPTR_A:.*]] = acc.deviceptr varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} ! CHECK: %[[DEVICEPTR_B:.*]] = acc.deviceptr varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.serial dataOperands(%[[DEVICEPTR_A]], %[[DEVICEPTR_B]] : !fir.ref>, !fir.ref>) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -380,8 +359,7 @@ subroutine acc_serial_loop ! CHECK: %[[BOX_ADDR_G:.*]] = fir.box_addr %[[BOX_G]] : (!fir.box>) -> !fir.ptr ! CHECK: %[[ATTACH_G:.*]] = acc.attach varPtr(%[[BOX_ADDR_G]] : !fir.ptr) -> !fir.ptr {name = "g"} ! CHECK: acc.serial dataOperands(%[[ATTACH_F]], %[[ATTACH_G]] : !fir.ptr, !fir.ptr) { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -395,8 +373,8 @@ subroutine acc_serial_loop ! 
CHECK: %[[ACC_FPRIVATE_B:.*]] = acc.firstprivate varPtr(%[[DECLB]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "b"} ! CHECK: acc.serial firstprivate(@firstprivatization_section_ext10_ref_10xf32 -> %[[ACC_FPRIVATE_B]] : !fir.ref>) { ! CHECK: %[[ACC_PRIVATE_A:.*]] = acc.private varPtr(%[[DECLA]]#1 : !fir.ref>) bounds(%{{.*}}) -> !fir.ref> {name = "a"} -! CHECK: acc.loop private(@privatization_ref_10xf32 -> %[[ACC_PRIVATE_A]] : !fir.ref>) { -! CHECK: fir.do_loop +! CHECK: acc.loop private({{.*}}@privatization_ref_10xf32 -> %[[ACC_PRIVATE_A]] : !fir.ref>) +! CHECK-NOT: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -408,10 +386,9 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {seq = [#acc.device_type]} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, seq = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -421,10 +398,9 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {auto_ = [#acc.device_type]} +! CHECK-NEXT: } attributes {auto_ = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -434,10 +410,9 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: } attributes {independent = [#acc.device_type]} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array, independent = [#acc.device_type]} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -447,10 +422,9 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop gang { -! CHECK: fir.do_loop +! CHECK: acc.loop gang() {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -461,8 +435,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[GANGNUM1:%.*]] = arith.constant 8 : i32 -! CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) { -! CHECK: fir.do_loop +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM1]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -475,8 +448,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[GANGNUM2:%.*]] = fir.load %{{.*}} : !fir.ref -! CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) { -! CHECK: fir.do_loop +! CHECK-NEXT: acc.loop gang({num=[[GANGNUM2]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -488,8 +460,7 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop gang({num=%{{.*}} : i32, static=%{{.*}} : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -500,10 +471,9 @@ subroutine acc_serial_loop a(i) = b(i) END DO ! CHECK: acc.serial { -! CHECK: acc.loop vector { -! CHECK: fir.do_loop +! CHECK: acc.loop vector() {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -514,8 +484,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[CONSTANT128:%.*]] = arith.constant 128 : i32 -! CHECK: acc.loop vector([[CONSTANT128]] : i32) { -! CHECK: fir.do_loop +! CHECK: acc.loop vector([[CONSTANT128]] : i32) {{.*}} { ! 
CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -528,8 +497,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[VECTORLENGTH:%.*]] = fir.load %{{.*}} : !fir.ref -! CHECK: acc.loop vector([[VECTORLENGTH]] : i32) { -! CHECK: fir.do_loop +! CHECK: acc.loop vector([[VECTORLENGTH]] : i32) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -541,10 +509,9 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop worker { -! CHECK: fir.do_loop +! CHECK: acc.loop worker() {{.*}} { ! CHECK: acc.yield -! CHECK-NEXT: }{{$}} +! CHECK-NEXT: } attributes {inclusiveUpperbound = array}{{$}} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -555,8 +522,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[WORKER128:%.*]] = arith.constant 128 : i32 -! CHECK: acc.loop worker([[WORKER128]] : i32) { -! CHECK: fir.do_loop +! CHECK: acc.loop worker([[WORKER128]] : i32) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -570,11 +536,10 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop { -! CHECK: fir.do_loop -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { +! CHECK-NOT: fir.do_loop ! CHECK: acc.yield -! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type]} +! CHECK-NEXT: } attributes {collapse = [2], collapseDeviceType = [#acc.device_type], inclusiveUpperbound = array} ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} @@ -587,10 +552,8 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop { -! CHECK: fir.do_loop -! CHECK: acc.loop { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} { +! CHECK: acc.loop {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -605,8 +568,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[TILESIZE:%.*]] = arith.constant 2 : i32 -! CHECK: acc.loop tile({[[TILESIZE]] : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({[[TILESIZE]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -619,8 +581,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[TILESIZEM1:%.*]] = arith.constant -1 : i32 -! CHECK: acc.loop tile({[[TILESIZEM1]] : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({[[TILESIZEM1]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -636,8 +597,7 @@ subroutine acc_serial_loop ! CHECK: acc.serial { ! CHECK: [[TILESIZE1:%.*]] = arith.constant 2 : i32 ! CHECK: [[TILESIZE2:%.*]] = arith.constant 2 : i32 -! CHECK: acc.loop tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({[[TILESIZE1]] : i32, [[TILESIZE2]] : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -649,8 +609,7 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop tile({%{{.*}} : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({%{{.*}} : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -664,8 +623,7 @@ subroutine acc_serial_loop END DO ! CHECK: acc.serial { -! CHECK: acc.loop tile({%{{.*}} : i32, %{{.*}} : i32}) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} tile({%{{.*}} : i32, %{{.*}} : i32}) {{.*}} { ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield @@ -680,8 +638,8 @@ subroutine acc_serial_loop ! CHECK: %[[COPYINREDR:.*]] = acc.copyin varPtr(%{{.*}} : !fir.ref) -> !fir.ref {dataClause = #acc, implicit = true, name = "reduction_r"} ! 
CHECK: %[[COPYINREDI:.*]] = acc.copyin varPtr(%{{.*}} : !fir.ref) -> !fir.ref {dataClause = #acc, implicit = true, name = "reduction_i"} ! CHECK: acc.serial dataOperands(%[[COPYINREDR]], %[[COPYINREDI]] : !fir.ref, !fir.ref) { -! CHECK: acc.loop reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) { -! CHECK: fir.do_loop +! CHECK: acc.loop {{.*}} reduction(@reduction_add_ref_f32 -> %{{.*}} : !fir.ref, @reduction_mul_ref_i32 -> %{{.*}} : !fir.ref) +! CHECK-NOT: fir.do_loop ! CHECK: acc.yield ! CHECK-NEXT: }{{$}} ! CHECK: acc.yield diff --git a/flang/test/Lower/OpenACC/locations.f90 b/flang/test/Lower/OpenACC/locations.f90 index c0114476b57ee..84dd512a5d43f 100644 --- a/flang/test/Lower/OpenACC/locations.f90 +++ b/flang/test/Lower/OpenACC/locations.f90 @@ -53,7 +53,7 @@ subroutine nested_acc_locations(arr1d) !CHECK: acc.loop !CHECK: acc.yield loc("{{.*}}locations.f90":44:11) - !CHECK-NEXT: } loc("{{.*}}locations.f90":44:11) + !CHECK-NEXT: } attributes {{.*}} loc(fused["{{.*}}locations.f90":44:11, "{{.*}}locations.f90":45:5]) !CHECK: acc.yield loc("{{.*}}locations.f90":43:11) !CHECK-NEXT: } loc("{{.*}}locations.f90":43:11) @@ -87,7 +87,7 @@ subroutine combined_directive_locations(arr) !CHECK: acc.parallel !CHECK: acc.loop !CHECK: acc.yield loc("{{.*}}locations.f90":82:11) - !CHECK-NEXT: } loc("{{.*}}locations.f90":82:11) + !CHECK-NEXT: } {{.*}} loc(fused["{{.*}}locations.f90":82:11, "{{.*}}locations.f90":83:5]) !CHECK: acc.yield loc("{{.*}}locations.f90":82:11) !CHECK-NEXT: } loc("{{.*}}locations.f90":82:11) end subroutine @@ -106,7 +106,7 @@ subroutine if_clause_expr_location(arr) !CHECK: acc.parallel !CHECK: acc.loop !CHECK: acc.yield loc("{{.*}}locations.f90":99:11) - !CHECK-NEXT: } loc("{{.*}}locations.f90":99:11) + !CHECK-NEXT: } {{.*}} loc(fused["{{.*}}locations.f90":99:11, "{{.*}}locations.f90":100:5]) !CHECK: acc.yield loc("{{.*}}locations.f90":99:11) !CHECK-NEXT: } loc("{{.*}}locations.f90":99:11) end subroutine @@ -150,16 +150,26 @@ subroutine atomic_update_loc() ! CHECK: } loc("{{.*}}locations.f90":142:3) !$acc atomic update - z = x * z - - ! %3 = fir.load %0 : !fir.ref loc("/local/home/vclement/llvm-project/flang/test/Lower/OpenACC/locations.f90":142:3) - ! acc.atomic.update %2 : !fir.ref { - ! ^bb0(%arg0: i32 loc("/local/home/vclement/llvm-project/flang/test/Lower/OpenACC/locations.f90":142:3)): - ! %4 = arith.muli %3, %arg0 : i32 loc("/local/home/vclement/llvm-project/flang/test/Lower/OpenACC/locations.f90":142:3) - ! acc.yield %4 : i32 loc("/local/home/vclement/llvm-project/flang/test/Lower/OpenACC/locations.f90":142:3) - ! } loc("/local/home/vclement/llvm-project/flang/test/Lower/OpenACC/locations.f90":142:3) + z = x * z + end subroutine + + subroutine acc_loop_fused_locations(arr) + real, dimension(10,10,10) :: arr + integer :: i, j, k + + !$acc loop collapse(3) + do i = 1, 10 + do j = 1, 10 + do k = 1, 10 + arr(i,j,k) = arr(i,j,k) * 2 + end do + end do + end do end subroutine +! CHECK-LABEL: func.func @_QMacc_locationsPacc_loop_fused_locations +! CHECK: acc.loop +! 
CHECK: } attributes {collapse = [3]{{.*}}} loc(fused["{{.*}}locations.f90":160:11, "{{.*}}locations.f90":161:5, "{{.*}}locations.f90":162:7, "{{.*}}locations.f90":163:9]) end module From f4c2ee12684a6b4857c9e6777ba53e02555978ba Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Mon, 22 Jan 2024 18:30:27 +0000 Subject: [PATCH 450/843] [RemoveDIs] Remove tests for redundant DPVAssigns until DPVAssigns are enabled This patch fixes commit 89aa3355, which added tests for the removal of redundant DPVAssigns; unlike other cases where adding tests for DPVAssigns before they are enabled is harmless, these tests require them to be enabled, so must be deleted until we enable them. Fixes failures on llvm-new-debug-iterators buildbot: https://lab.llvm.org/buildbot/#/builders/275/builds/3581 --- .../assignment-tracking/instcombine/remove-redundant-dbg.ll | 2 -- .../DebugInfo/Generic/assignment-tracking/remove-redundant.ll | 2 -- .../Generic/assignment-tracking/sroa/remove-redundant-dbg.ll | 2 -- 3 files changed, 6 deletions(-) diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll index cffac06f8e545..11895098179eb 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll @@ -1,7 +1,5 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" -; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ -; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that sroa removes redundant debug intrinsics after it makes a ;; change. This has a significant positive impact on peak memory and compiler diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll index 24ec3e94ed275..efb20b6edee2d 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll @@ -1,7 +1,5 @@ ; RUN: opt -passes=redundant-dbg-inst-elim -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" -; RUN: opt --try-experimental-debuginfo-iterators -passes=redundant-dbg-inst-elim -S %s -o - \ -; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Hand-written. Test how RemoveRedundantDbgInstrs interacts with dbg.assign ;; intrinsics. FileCehck directives are inline. diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll index cffac06f8e545..11895098179eb 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll @@ -1,7 +1,5 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" -; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ -; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that sroa removes redundant debug intrinsics after it makes a ;; change. 
This has a significant positive impact on peak memory and compiler From 5ce286849a0cc605210031411265c2a84fc6c633 Mon Sep 17 00:00:00 2001 From: Mingming Liu Date: Mon, 22 Jan 2024 10:36:03 -0800 Subject: [PATCH 451/843] [CGProfile] Use callee's PGO name when caller->callee is an indirect call. (#78610) - With PGO, indirect call edges are constructed using value profiles, and the profile address is mapped to a function's PGO name. The PGO name is computed using a functions linkage before LTO internalization or global promotion. - With ThinLTO, local functions [could be promoted](https://github.com/llvm/llvm-project/blob/2663d2cb9c9361f0b234c40a0f50c7ba0748eb26/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp#L288) to have external linkage; and with [full](https://github.com/llvm/llvm-project/blob/2663d2cb9c9361f0b234c40a0f50c7ba0748eb26/llvm/lib/LTO/LTO.cpp#L1328) or [thin](https://github.com/llvm/llvm-project/blob/2663d2cb9c9361f0b234c40a0f50c7ba0748eb26/llvm/lib/LTO/LTO.cpp#L448) LTO, global functions could be internalized. Edge construction should use a function's PGO name before its linkage is updated. --- .../llvm/Transforms/Instrumentation/CGProfile.h | 4 ++++ llvm/lib/Passes/PassBuilder.cpp | 4 ++++ llvm/lib/Passes/PassBuilderPipelines.cpp | 5 +++-- llvm/lib/Passes/PassRegistry.def | 5 ++++- .../Transforms/Instrumentation/CGProfile.cpp | 8 ++++---- llvm/test/Instrumentation/cgprofile.ll | 17 ++++++++++++----- 6 files changed, 31 insertions(+), 12 deletions(-) diff --git a/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h b/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h index 9f9ce42277a0c..0011b7b8b36c7 100644 --- a/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h +++ b/llvm/include/llvm/Transforms/Instrumentation/CGProfile.h @@ -18,7 +18,11 @@ namespace llvm { class Module; class CGProfilePass : public PassInfoMixin { public: + CGProfilePass(bool InLTO) : InLTO(InLTO) {} PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM); + +private: + bool InLTO = false; }; } // end namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 8d3f69be50383..19fb136f37563 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -661,6 +661,10 @@ Expected parseGlobalDCEPassOptions(StringRef Params) { return parseSinglePassOption(Params, "vfe-linkage-unit-visibility", "GlobalDCE"); } +Expected parseCGProfilePassOptions(StringRef Params) { + return parseSinglePassOption(Params, "in-lto-post-link", "CGProfile"); +} + Expected parseInlinerPassOptions(StringRef Params) { return parseSinglePassOption(Params, "only-mandatory", "InlinerPass"); } diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp index 5c6c391049a7b..525b83dee79b0 100644 --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -1477,7 +1477,8 @@ PassBuilder::buildModuleOptimizationPipeline(OptimizationLevel Level, MPM.addPass(ConstantMergePass()); if (PTO.CallGraphProfile && !LTOPreLink) - MPM.addPass(CGProfilePass()); + MPM.addPass(CGProfilePass(LTOPhase == ThinOrFullLTOPhase::FullLTOPostLink || + LTOPhase == ThinOrFullLTOPhase::ThinLTOPostLink)); // TODO: Relative look table converter pass caused an issue when full lto is // enabled. See https://reviews.llvm.org/D94355 for more details. 
@@ -1972,7 +1973,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level,
     MPM.addPass(MergeFunctionsPass());

   if (PTO.CallGraphProfile)
-    MPM.addPass(CGProfilePass());
+    MPM.addPass(CGProfilePass(/*InLTOPostLink=*/true));

   invokeFullLinkTimeOptimizationLastEPCallbacks(MPM, Level);

diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 756f4ab23ab66..5f25302091124 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -47,7 +47,6 @@ MODULE_PASS("attributor", AttributorPass())
 MODULE_PASS("attributor-light", AttributorLightPass())
 MODULE_PASS("called-value-propagation", CalledValuePropagationPass())
 MODULE_PASS("canonicalize-aliases", CanonicalizeAliasesPass())
-MODULE_PASS("cg-profile", CGProfilePass())
 MODULE_PASS("check-debugify", NewPMCheckDebugifyPass())
 MODULE_PASS("constmerge", ConstantMergePass())
 MODULE_PASS("coro-cleanup", CoroCleanupPass())
@@ -151,6 +150,10 @@ MODULE_PASS_WITH_PARAMS(
     "asan", "AddressSanitizerPass",
     [](AddressSanitizerOptions Opts) { return AddressSanitizerPass(Opts); },
     parseASanPassOptions, "kernel")
+MODULE_PASS_WITH_PARAMS(
+    "cg-profile", "CGProfilePass",
+    [](bool InLTOPostLink) { return CGProfilePass(InLTOPostLink); },
+    parseCGProfilePassOptions, "in-lto-post-link")
 MODULE_PASS_WITH_PARAMS(
     "global-merge", "GlobalMergePass",
     [TM = TM](GlobalMergeOptions Opts) { return GlobalMergePass(TM, Opts); },

diff --git a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
index e2e5f21b376b0..c322d0abd6bc1 100644
--- a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
+++ b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
@@ -44,8 +44,8 @@ addModuleFlags(Module &M,
   return true;
 }

-static bool runCGProfilePass(
-    Module &M, FunctionAnalysisManager &FAM) {
+static bool runCGProfilePass(Module &M, FunctionAnalysisManager &FAM,
+                             bool InLTO) {
   MapVector<std::pair<Function *, Function *>, uint64_t> Counts;
   InstrProfSymtab Symtab;
   auto UpdateCounts = [&](TargetTransformInfo &TTI, Function *F,
@@ -59,7 +59,7 @@ static bool runCGProfilePass(
     Count = SaturatingAdd(Count, NewCount);
   };
   // Ignore error here. Indirect calls are ignored if this fails.
-  (void)(bool) Symtab.create(M);
+  (void)(bool)Symtab.create(M, InLTO);

   for (auto &F : M) {
     // Avoid extra cost of running passes for BFI when the function doesn't have
     // entry count.
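(Editor's aside, not part of the patch: the MODULE_PASS_WITH_PARAMS registration above is what lets opt spell the post-link variant explicitly, e.g. -passes='cg-profile<in-lto-post-link>', while plain -passes='cg-profile' keeps the default non-LTO behaviour; the two updated RUN lines in cgprofile.ll below exercise exactly these two spellings.)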
@@ -101,7 +101,7 @@ static bool runCGProfilePass(
 PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
   FunctionAnalysisManager &FAM =
       MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-  runCGProfilePass(M, FAM);
+  runCGProfilePass(M, FAM, InLTO);
   return PreservedAnalyses::all();
 }

diff --git a/llvm/test/Instrumentation/cgprofile.ll b/llvm/test/Instrumentation/cgprofile.ll
index 2a523d33ae077..72d10f6754c48 100644
--- a/llvm/test/Instrumentation/cgprofile.ll
+++ b/llvm/test/Instrumentation/cgprofile.ll
@@ -1,4 +1,5 @@
-; RUN: opt < %s -passes cg-profile -S | FileCheck %s
+; RUN: opt < %s -passes='cg-profile<in-lto-post-link>' -S | FileCheck %s --check-prefixes=CHECK,LTO
+; RUN: opt < %s -passes='cg-profile' -S | FileCheck %s --check-prefixes=CHECK,NOLTO --implicit-check-not="!{ptr @freq, ptr @func3.llvm.12345"

 declare void @b()

 define void @a() !prof !1 {
   ret void
 }

+define void @func3.llvm.12345() !PGOFuncName !4 {
+  ret void
+}
+
 @foo = common global ptr null, align 8
 declare i32 @func1()
 declare i32 @func2()
-declare i32 @func3()
+
 declare i32 @func4()
 declare dllimport i32 @func5()
 declare i32 @func6()
@@ -29,15 +34,17 @@ B:

 !1 = !{!"function_entry_count", i64 32}
 !2 = !{!"branch_weights", i32 5, i32 10}
-!3 = !{!"VP", i32 0, i64 1600, i64 7651369219802541373, i64 1030, i64 -4377547752858689819, i64 410, i64 -6929281286627296573, i64 150, i64 -2545542355363006406, i64 10, i64 3667884930908592509, i64 1, i64 15435711456043681792, i64 0}
+!3 = !{!"VP", i32 0, i64 1600, i64 7651369219802541373, i64 1030, i64 -4377547752858689819, i64 410, i64 5415368997850289431, i64 150, i64 -2545542355363006406, i64 10, i64 3667884930908592509, i64 1, i64 15435711456043681792, i64 0}
+!4 = !{!"cgprofile.ll;func3"}

 ; CHECK: !llvm.module.flags = !{![[cgprof:[0-9]+]]}
 ; CHECK: ![[cgprof]] = !{i32 5, !"CG Profile", ![[prof:[0-9]+]]}
-; CHECK: ![[prof]] = distinct !{![[e0:[0-9]+]], ![[e1:[0-9]+]], ![[e2:[0-9]+]], ![[e3:[0-9]+]], ![[e4:[0-9]+]], ![[e5:[0-9]+]], ![[e6:[0-9]+]]}
+; LTO: ![[prof]] = distinct !{![[e0:[0-9]+]], ![[e1:[0-9]+]], ![[e2:[0-9]+]], ![[e3:[0-9]+]], ![[e4:[0-9]+]], ![[e5:[0-9]+]], ![[e6:[0-9]+]]}
+; NOLTO: ![[prof]] = distinct !{![[e0:[0-9]+]], ![[e1:[0-9]+]], ![[e2:[0-9]+]], ![[e4:[0-9]+]], ![[e5:[0-9]+]], ![[e6:[0-9]+]]}
 ; CHECK: ![[e0]] = !{ptr @a, ptr @b, i64 32}
 ; CHECK: ![[e1]] = !{ptr @freq, ptr @func4, i64 1030}
 ; CHECK: ![[e2]] = !{ptr @freq, ptr @func2, i64 410}
-; CHECK: ![[e3]] = !{ptr @freq, ptr @func3, i64 150}
+; LTO: ![[e3]] = !{ptr @freq, ptr @func3.llvm.12345, i64 150}
 ; CHECK: ![[e4]] = !{ptr @freq, ptr @func1, i64 10}
 ; CHECK: ![[e5]] = !{ptr @freq, ptr @a, i64 11}
 ; CHECK: ![[e6]] = !{ptr @freq, ptr @b, i64 21}

From 4207ad57707f07208dfb1d7c79889e1372c396ab Mon Sep 17 00:00:00 2001
From: azhan92
Date: Mon, 22 Jan 2024 13:38:54 -0500
Subject: [PATCH 452/843] [libc++] Fix noexcept behaviour of operator new helper functions (#74337)

This patch removes the noexcept specifier introduced in #69407 since the
Standard allows a new handler to throw an exception of type bad_alloc (or
derived from it). With the noexcept specifier on the helper functions, we
would immediately terminate the program. The patch also adds tests for the
case that had regressed.
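Editor's aside (a minimal, self-contained sketch of the regression being fixed, not one of the new test files): the Standard permits a new handler to throw std::bad_alloc instead of freeing memory or terminating, and operator new must let that exception propagate to the caller. If the helper that operator new calls were noexcept, the throw below would hit the noexcept boundary and std::terminate the program instead of reaching the catch.

#include <cstddef>
#include <cstdio>
#include <limits>
#include <new>

void throwing_handler() {
  // A handler may legitimately throw bad_alloc (or a type derived from it).
  throw std::bad_alloc();
}

int main() {
  std::set_new_handler(throwing_handler);
  try {
    // An impossible request forces allocation to fail and the handler to run.
    void* p = operator new(std::numeric_limits<std::size_t>::max());
    (void)p;
  } catch (const std::bad_alloc&) {
    std::puts("bad_alloc propagated through operator new");
  }
}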
Co-authored-by: Alison Zhang --- libcxx/src/new.cpp | 4 +- .../new.delete.array/new.size.except.pass.cpp | 44 +++++++++++++++++++ .../new.size_align.except.pass.cpp | 44 +++++++++++++++++++ .../new.size_align_nothrow.except.pass.cpp | 44 +++++++++++++++++++ .../new.size_nothrow.except.pass.cpp | 44 +++++++++++++++++++ .../new.size.except.pass.cpp | 44 +++++++++++++++++++ .../new.size_align.except.pass.cpp | 44 +++++++++++++++++++ .../new.size_align_nothrow.except.pass.cpp | 44 +++++++++++++++++++ .../new.size_nothrow.except.pass.cpp | 44 +++++++++++++++++++ libcxxabi/src/stdlib_new_delete.cpp | 4 +- 10 files changed, 356 insertions(+), 4 deletions(-) create mode 100644 libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp create mode 100644 libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp create mode 100644 libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.except.pass.cpp create mode 100644 libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_nothrow.except.pass.cpp create mode 100644 libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp create mode 100644 libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp create mode 100644 libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.except.pass.cpp create mode 100644 libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_nothrow.except.pass.cpp diff --git a/libcxx/src/new.cpp b/libcxx/src/new.cpp index 033bba5c1fc95..cb8b4aae8d5f9 100644 --- a/libcxx/src/new.cpp +++ b/libcxx/src/new.cpp @@ -20,7 +20,7 @@ // in this shared library, so that they can be overridden by programs // that define non-weak copies of the functions. -static void* operator_new_impl(std::size_t size) noexcept { +static void* operator_new_impl(std::size_t size) { if (size == 0) size = 1; void* p; @@ -87,7 +87,7 @@ _LIBCPP_WEAK void operator delete[](void* ptr, size_t) noexcept { ::operator del # if !defined(_LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION) -static void* operator_new_aligned_impl(std::size_t size, std::align_val_t alignment) noexcept { +static void* operator_new_aligned_impl(std::size_t size, std::align_val_t alignment) { if (size == 0) size = 1; if (static_cast(alignment) < sizeof(void*)) diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp new file mode 100644 index 0000000000000..6a2b098c1b573 --- /dev/null +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size.except.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions +// UNSUPPORTED: sanitizer-new-delete + +#include +#include +#include +#include + +struct construction_key {}; +struct my_bad_alloc : std::bad_alloc { + my_bad_alloc(const my_bad_alloc&) : self(this) { std::abort(); } + my_bad_alloc(construction_key) : self(this) {} + const my_bad_alloc* const self; +}; + +int new_handler_called = 0; + +void my_new_handler() { + ++new_handler_called; + throw my_bad_alloc(construction_key()); +} + +int main(int, char**) { + std::set_new_handler(my_new_handler); + try { + void* x = operator new[](std::numeric_limits::max()); + (void)x; + assert(false); + } catch (my_bad_alloc const& e) { + assert(new_handler_called == 1); + assert(e.self == &e); + } catch (...) { + assert(false); + } + return 0; +} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp new file mode 100644 index 0000000000000..cde7dc53be5bd --- /dev/null +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align.except.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions +// UNSUPPORTED: sanitizer-new-delete + +#include +#include +#include +#include + +struct construction_key {}; +struct my_bad_alloc : std::bad_alloc { + my_bad_alloc(const my_bad_alloc&) : self(this) { std::abort(); } + my_bad_alloc(construction_key) : self(this) {} + const my_bad_alloc* const self; +}; + +int new_handler_called = 0; + +void my_new_handler() { + ++new_handler_called; + throw my_bad_alloc(construction_key()); +} + +int main(int, char**) { + std::set_new_handler(my_new_handler); + try { + void* x = operator new[](std::numeric_limits::max(), static_cast(32)); + (void)x; + assert(false); + } catch (my_bad_alloc const& e) { + assert(new_handler_called == 1); + assert(e.self == &e); + } catch (...) { + assert(false); + } + return 0; +} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.except.pass.cpp new file mode 100644 index 0000000000000..251ba0f495991 --- /dev/null +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.except.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions +// UNSUPPORTED: sanitizer-new-delete + +#include +#include +#include +#include + +struct construction_key {}; +struct my_bad_alloc : std::bad_alloc { + my_bad_alloc(const my_bad_alloc&) : self(this) { std::abort(); } + my_bad_alloc(construction_key) : self(this) {} + const my_bad_alloc* const self; +}; + +int new_handler_called = 0; + +void my_new_handler() { + ++new_handler_called; + throw my_bad_alloc(construction_key()); +} + +int main(int, char**) { + std::set_new_handler(my_new_handler); + try { + void* x = operator new[](std::numeric_limits::max(), static_cast(32), std::nothrow); + (void)x; + assert(x == NULL); + } catch (my_bad_alloc const& e) { + assert(new_handler_called == 1); + assert(e.self == &e); + } catch (...) { + assert(false); + } + return 0; +} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_nothrow.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_nothrow.except.pass.cpp new file mode 100644 index 0000000000000..a68a980bb4f7a --- /dev/null +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_nothrow.except.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions +// UNSUPPORTED: sanitizer-new-delete + +#include +#include +#include +#include + +struct construction_key {}; +struct my_bad_alloc : std::bad_alloc { + my_bad_alloc(const my_bad_alloc&) : self(this) { std::abort(); } + my_bad_alloc(construction_key) : self(this) {} + const my_bad_alloc* const self; +}; + +int new_handler_called = 0; + +void my_new_handler() { + ++new_handler_called; + throw my_bad_alloc(construction_key()); +} + +int main(int, char**) { + std::set_new_handler(my_new_handler); + try { + void* x = operator new[](std::numeric_limits::max(), std::nothrow); + (void)x; + assert(x == NULL); + } catch (my_bad_alloc const& e) { + assert(new_handler_called == 1); + assert(e.self == &e); + } catch (...) { + assert(false); + } + return 0; +} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp new file mode 100644 index 0000000000000..6a515555e6dbd --- /dev/null +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size.except.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions +// UNSUPPORTED: sanitizer-new-delete + +#include +#include +#include +#include + +struct construction_key {}; +struct my_bad_alloc : std::bad_alloc { + my_bad_alloc(const my_bad_alloc&) : self(this) { std::abort(); } + my_bad_alloc(construction_key) : self(this) {} + const my_bad_alloc* const self; +}; + +int new_handler_called = 0; + +void my_new_handler() { + ++new_handler_called; + throw my_bad_alloc(construction_key()); +} + +int main(int, char**) { + std::set_new_handler(my_new_handler); + try { + void* x = operator new(std::numeric_limits::max()); + (void)x; + assert(false); + } catch (my_bad_alloc const& e) { + assert(new_handler_called == 1); + assert(e.self == &e); + } catch (...) { + assert(false); + } + return 0; +} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp new file mode 100644 index 0000000000000..cb83fb26ab1e6 --- /dev/null +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align.except.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions +// UNSUPPORTED: sanitizer-new-delete + +#include +#include +#include +#include + +struct construction_key {}; +struct my_bad_alloc : std::bad_alloc { + my_bad_alloc(const my_bad_alloc&) : self(this) { std::abort(); } + my_bad_alloc(construction_key) : self(this) {} + const my_bad_alloc* const self; +}; + +int new_handler_called = 0; + +void my_new_handler() { + ++new_handler_called; + throw my_bad_alloc(construction_key()); +} + +int main(int, char**) { + std::set_new_handler(my_new_handler); + try { + void* x = operator new(std::numeric_limits::max(), static_cast(32)); + (void)x; + assert(false); + } catch (my_bad_alloc const& e) { + assert(new_handler_called == 1); + assert(e.self == &e); + } catch (...) { + assert(false); + } + return 0; +} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.except.pass.cpp new file mode 100644 index 0000000000000..d95e78e24e1a9 --- /dev/null +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.except.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions +// UNSUPPORTED: sanitizer-new-delete + +#include +#include +#include +#include + +struct construction_key {}; +struct my_bad_alloc : std::bad_alloc { + my_bad_alloc(const my_bad_alloc&) : self(this) { std::abort(); } + my_bad_alloc(construction_key) : self(this) {} + const my_bad_alloc* const self; +}; + +int new_handler_called = 0; + +void my_new_handler() { + ++new_handler_called; + throw my_bad_alloc(construction_key()); +} + +int main(int, char**) { + std::set_new_handler(my_new_handler); + try { + void* x = operator new(std::numeric_limits::max(), static_cast(32), std::nothrow); + (void)x; + assert(x == NULL); + } catch (my_bad_alloc const& e) { + assert(new_handler_called == 1); + assert(e.self == &e); + } catch (...) { + assert(false); + } + return 0; +} diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_nothrow.except.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_nothrow.except.pass.cpp new file mode 100644 index 0000000000000..db9a90709b505 --- /dev/null +++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_nothrow.except.pass.cpp @@ -0,0 +1,44 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// UNSUPPORTED: no-exceptions +// UNSUPPORTED: sanitizer-new-delete + +#include +#include +#include +#include + +struct construction_key {}; +struct my_bad_alloc : std::bad_alloc { + my_bad_alloc(const my_bad_alloc&) : self(this) { std::abort(); } + my_bad_alloc(construction_key) : self(this) {} + const my_bad_alloc* const self; +}; + +int new_handler_called = 0; + +void my_new_handler() { + ++new_handler_called; + throw my_bad_alloc(construction_key()); +} + +int main(int, char**) { + std::set_new_handler(my_new_handler); + try { + void* x = operator new(std::numeric_limits::max(), std::nothrow); + (void)x; + assert(x == NULL); + } catch (my_bad_alloc const& e) { + assert(new_handler_called == 1); + assert(e.self == &e); + } catch (...) { + assert(false); + } + return 0; +} diff --git a/libcxxabi/src/stdlib_new_delete.cpp b/libcxxabi/src/stdlib_new_delete.cpp index 6c9990f063dde..f8a00ec584256 100644 --- a/libcxxabi/src/stdlib_new_delete.cpp +++ b/libcxxabi/src/stdlib_new_delete.cpp @@ -30,7 +30,7 @@ // in this shared library, so that they can be overridden by programs // that define non-weak copies of the functions. 
-static void* operator_new_impl(std::size_t size) noexcept { +static void* operator_new_impl(std::size_t size) { if (size == 0) size = 1; void* p; @@ -107,7 +107,7 @@ void operator delete[](void* ptr, size_t) noexcept { ::operator delete[](ptr); } #if !defined(_LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION) -static void* operator_new_aligned_impl(std::size_t size, std::align_val_t alignment) noexcept { +static void* operator_new_aligned_impl(std::size_t size, std::align_val_t alignment) { if (size == 0) size = 1; if (static_cast(alignment) < sizeof(void*)) From a301fb11014f9cfdf4ee8cada173c46a7677d9d3 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Mon, 22 Jan 2024 19:42:32 +0100 Subject: [PATCH 453/843] [clang][modules] Print library module manifest path. (#76451) This implements a way for the compiler to find the modules.json associated with the C++23 Standard library modules. This is based on a discussion in SG15. At the moment no Standard library installs this manifest. #75741 adds this feature in libc++. --- clang/include/clang/Driver/Driver.h | 10 +++++ clang/include/clang/Driver/Options.td | 3 ++ clang/lib/Driver/Driver.cpp | 44 +++++++++++++++++++ ...les-print-library-module-manifest-path.cpp | 36 +++++++++++++++ 4 files changed, 93 insertions(+) create mode 100644 clang/test/Driver/modules-print-library-module-manifest-path.cpp diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 3ee1bcf2a69c9..595f104e406d3 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -602,6 +602,16 @@ class Driver { // FIXME: This should be in CompilationInfo. std::string GetProgramPath(StringRef Name, const ToolChain &TC) const; + /// Lookup the path to the Standard library module manifest. + /// + /// \param C - The compilation. + /// \param TC - The tool chain for additional information on + /// directories to search. + // + // FIXME: This should be in CompilationInfo. + std::string GetStdModuleManifestPath(const Compilation &C, + const ToolChain &TC) const; + /// HandleAutocompletions - Handle --autocomplete by searching and printing /// possible flags, descriptions, and its arguments. 
void HandleAutocompletions(StringRef PassedFlags) const; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f9e883e3e22de..637b10652fcd9 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5306,6 +5306,9 @@ def print_resource_dir : Flag<["-", "--"], "print-resource-dir">, def print_search_dirs : Flag<["-", "--"], "print-search-dirs">, HelpText<"Print the paths used for finding libraries and programs">, Visibility<[ClangOption, CLOption]>; +def print_std_module_manifest_path : Flag<["-", "--"], "print-library-module-manifest-path">, + HelpText<"Print the path for the C++ Standard library module manifest">, + Visibility<[ClangOption, CLOption]>; def print_targets : Flag<["-", "--"], "print-targets">, HelpText<"Print the registered targets">, Visibility<[ClangOption, CLOption]>; diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index da27ca2d28e91..1c32370208230 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -2194,6 +2194,12 @@ bool Driver::HandleImmediateArgs(const Compilation &C) { return false; } + if (C.getArgs().hasArg(options::OPT_print_std_module_manifest_path)) { + llvm::outs() << GetStdModuleManifestPath(C, C.getDefaultToolChain()) + << '\n'; + return false; + } + if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) { if (std::optional RuntimePath = TC.getRuntimePath()) llvm::outs() << *RuntimePath << '\n'; @@ -6165,6 +6171,44 @@ std::string Driver::GetProgramPath(StringRef Name, const ToolChain &TC) const { return std::string(Name); } +std::string Driver::GetStdModuleManifestPath(const Compilation &C, + const ToolChain &TC) const { + std::string error = ""; + + switch (TC.GetCXXStdlibType(C.getArgs())) { + case ToolChain::CST_Libcxx: { + std::string lib = GetFilePath("libc++.so", TC); + + // Note when there are multiple flavours of libc++ the module json needs to + // look at the command-line arguments for the proper json. + // These flavours do not exist at the moment, but there are plans to + // provide a variant that is built with sanitizer instrumentation enabled. + + // For example + // StringRef modules = [&] { + // const SanitizerArgs &Sanitize = TC.getSanitizerArgs(C.getArgs()); + // if (Sanitize.needsAsanRt()) + // return "modules-asan.json"; + // return "modules.json"; + // }(); + + SmallString<128> path(lib.begin(), lib.end()); + llvm::sys::path::remove_filename(path); + llvm::sys::path::append(path, "modules.json"); + if (TC.getVFS().exists(path)) + return static_cast(path); + + return error; + } + + case ToolChain::CST_Libstdcxx: + // libstdc++ does not provide Standard library modules yet. + return error; + } + + return error; +} + std::string Driver::GetTemporaryPath(StringRef Prefix, StringRef Suffix) const { SmallString<128> Path; std::error_code EC = llvm::sys::fs::createTemporaryFile(Prefix, Suffix, Path); diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp new file mode 100644 index 0000000000000..693a992876019 --- /dev/null +++ b/clang/test/Driver/modules-print-library-module-manifest-path.cpp @@ -0,0 +1,36 @@ +// Test that -print-library-module-manifest-path finds the correct file. 
+ +// RUN: rm -rf %t && split-file %s %t && cd %t +// RUN: mkdir -p %t/Inputs/usr/lib/x86_64-linux-gnu +// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.so + +// RUN: %clang -print-library-module-manifest-path \ +// RUN: -stdlib=libc++ \ +// RUN: --sysroot=%t/Inputs \ +// RUN: --target=x86_64-linux-gnu 2>&1 \ +// RUN: | FileCheck libcxx-no-module-json.cpp + +// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/modules.json +// RUN: %clang -print-library-module-manifest-path \ +// RUN: -stdlib=libc++ \ +// RUN: --sysroot=%t/Inputs \ +// RUN: --target=x86_64-linux-gnu 2>&1 \ +// RUN: | FileCheck libcxx.cpp + +// RUN: %clang -print-library-module-manifest-path \ +// RUN: -stdlib=libstdc++ \ +// RUN: --sysroot=%t/Inputs \ +// RUN: --target=x86_64-linux-gnu 2>&1 \ +// RUN: | FileCheck libstdcxx.cpp + +//--- libcxx-no-module-json.cpp + +// CHECK: + +//--- libcxx.cpp + +// CHECK: {{.*}}/Inputs/usr/lib/x86_64-linux-gnu{{/|\\}}modules.json + +//--- libstdcxx.cpp + +// CHECK: From 04952c5bec69c197ea6ffb718ae80fcf78c7828a Mon Sep 17 00:00:00 2001 From: erichkeane Date: Mon, 22 Jan 2024 08:19:19 -0800 Subject: [PATCH 454/843] [OpenACC] Implement remaining 'simple' int-expr clauses. 'num_gangs', 'num_workers', 'device_num', and 'default_async' are all exactly the same (for the purposes of parsing) as 'vector_length', so implement these the same way. --- clang/include/clang/Basic/OpenACCKinds.h | 22 ++++++ clang/lib/Parse/ParseOpenACC.cpp | 12 ++++ clang/test/ParserOpenACC/parse-clauses.c | 92 ++++++++++++++++++++++++ 3 files changed, 126 insertions(+) diff --git a/clang/include/clang/Basic/OpenACCKinds.h b/clang/include/clang/Basic/OpenACCKinds.h index a33daea2331aa..3d3f128f7784f 100644 --- a/clang/include/clang/Basic/OpenACCKinds.h +++ b/clang/include/clang/Basic/OpenACCKinds.h @@ -224,6 +224,16 @@ enum class OpenACCClauseKind { /// 'vector_length' clause, allowed on 'parallel', 'kernels', 'parallel loop', /// and 'kernels loop' constructs. VectorLength, + /// 'num_gangs' clause, allowed on 'parallel', 'kernels', parallel loop', and + /// 'kernels loop' constructs. + NumGangs, + /// 'num_workers' clause, allowed on 'parallel', 'kernels', parallel loop', + /// and 'kernels loop' constructs. + NumWorkers, + /// 'device_num' clause, allowed on 'init', 'shutdown', and 'set' constructs. + DeviceNum, + /// 'default_async' clause, allowed on 'set' construct. + DefaultAsync, /// Represents an invalid clause, for the purposes of parsing. 
Invalid, @@ -328,6 +338,18 @@ inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &Out, case OpenACCClauseKind::VectorLength: return Out << "vector_length"; + case OpenACCClauseKind::NumGangs: + return Out << "num_gangs"; + + case OpenACCClauseKind::NumWorkers: + return Out << "num_workers"; + + case OpenACCClauseKind::DeviceNum: + return Out << "device_num"; + + case OpenACCClauseKind::DefaultAsync: + return Out << "default_async"; + case OpenACCClauseKind::Invalid: return Out << ""; } diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 9b0d7077670a7..612ee5ef214c1 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -98,9 +98,11 @@ OpenACCClauseKind getOpenACCClauseKind(Token Tok) { .Case("copyin", OpenACCClauseKind::CopyIn) .Case("copyout", OpenACCClauseKind::CopyOut) .Case("default", OpenACCClauseKind::Default) + .Case("default_async", OpenACCClauseKind::DefaultAsync) .Case("delete", OpenACCClauseKind::Delete) .Case("detach", OpenACCClauseKind::Detach) .Case("device", OpenACCClauseKind::Device) + .Case("device_num", OpenACCClauseKind::DeviceNum) .Case("device_resident", OpenACCClauseKind::DeviceResident) .Case("deviceptr", OpenACCClauseKind::DevicePtr) .Case("finalize", OpenACCClauseKind::Finalize) @@ -111,6 +113,8 @@ OpenACCClauseKind getOpenACCClauseKind(Token Tok) { .Case("independent", OpenACCClauseKind::Independent) .Case("link", OpenACCClauseKind::Link) .Case("no_create", OpenACCClauseKind::NoCreate) + .Case("num_gangs", OpenACCClauseKind::NumGangs) + .Case("num_workers", OpenACCClauseKind::NumWorkers) .Case("nohost", OpenACCClauseKind::NoHost) .Case("present", OpenACCClauseKind::Present) .Case("private", OpenACCClauseKind::Private) @@ -471,6 +475,10 @@ ClauseParensKind getClauseParensKind(OpenACCDirectiveKind DirKind, case OpenACCClauseKind::Collapse: case OpenACCClauseKind::Bind: case OpenACCClauseKind::VectorLength: + case OpenACCClauseKind::NumGangs: + case OpenACCClauseKind::NumWorkers: + case OpenACCClauseKind::DeviceNum: + case OpenACCClauseKind::DefaultAsync: return ClauseParensKind::Required; case OpenACCClauseKind::Auto: @@ -684,6 +692,10 @@ bool Parser::ParseOpenACCClauseParams(OpenACCDirectiveKind DirKind, return true; break; } + case OpenACCClauseKind::NumGangs: + case OpenACCClauseKind::NumWorkers: + case OpenACCClauseKind::DeviceNum: + case OpenACCClauseKind::DefaultAsync: case OpenACCClauseKind::VectorLength: { ExprResult IntExpr = ParseOpenACCIntExpr(); if (IntExpr.isInvalid()) diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c index 31643073eb76a..b6114c73f1b19 100644 --- a/clang/test/ParserOpenACC/parse-clauses.c +++ b/clang/test/ParserOpenACC/parse-clauses.c @@ -725,6 +725,98 @@ void IntExprParsing() { // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} #pragma acc parallel vector_length(returns_int()) + + // expected-error@+2{{expected '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_gangs + + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_gangs() + + // expected-error@+2{{use of undeclared identifier 'invalid'}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_gangs(invalid) + + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // 
expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_gangs(5, 4) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_gangs(5) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_gangs(returns_int()) + + // expected-error@+2{{expected '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_workers + + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_workers() + + // expected-error@+2{{use of undeclared identifier 'invalid'}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_workers(invalid) + + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_workers(5, 4) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_workers(5) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel num_workers(returns_int()) + + // expected-error@+2{{expected '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc init device_num + + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc init device_num() + + // expected-error@+2{{use of undeclared identifier 'invalid'}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc init device_num(invalid) + + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc init device_num(5, 4) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc init device_num(5) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc init device_num(returns_int()) + + // expected-error@+2{{expected '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc set default_async + + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc set default_async() + + // expected-error@+2{{use of undeclared identifier 'invalid'}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc set default_async(invalid) + + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc set default_async(5, 4) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc set default_async(5) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc set default_async(returns_int()) } // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} From 0cea54a382f3187acbe3e81bd0fd7cf2cb1077b8 Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Mon, 22 Jan 2024 10:46:20 -0800 Subject: [PATCH 455/843] [lldb][NFCI] Remove EventData* param from 
BroadcastEvent (#78773) BroadcastEvent currently takes its EventData* param and shoves it into an Event object, which takes ownership of the pointer and places it into a shared_ptr to manage the lifetime. Instead of relying on `new` and passing raw pointers around, I think it would make more sense to create the shared_ptr up front. --- lldb/include/lldb/Breakpoint/Watchpoint.h | 2 -- lldb/include/lldb/Utility/Broadcaster.h | 6 ++--- lldb/source/Breakpoint/BreakpointList.cpp | 7 ++++-- lldb/source/Breakpoint/BreakpointLocation.cpp | 6 ++--- lldb/source/Breakpoint/Watchpoint.cpp | 22 ++++-------------- lldb/source/Breakpoint/WatchpointList.cpp | 23 +++++++++++-------- .../Process/gdb-remote/ProcessGDBRemote.cpp | 17 +++++++------- lldb/source/Target/Process.cpp | 6 ++--- lldb/source/Target/Target.cpp | 15 +++++++----- lldb/source/Target/Thread.cpp | 15 +++++++----- lldb/source/Target/ThreadList.cpp | 10 ++++---- lldb/source/Utility/Broadcaster.cpp | 5 ++-- 12 files changed, 66 insertions(+), 68 deletions(-) diff --git a/lldb/include/lldb/Breakpoint/Watchpoint.h b/lldb/include/lldb/Breakpoint/Watchpoint.h index 851162af24c74..22fdfd686c3f1 100644 --- a/lldb/include/lldb/Breakpoint/Watchpoint.h +++ b/lldb/include/lldb/Breakpoint/Watchpoint.h @@ -235,8 +235,6 @@ class Watchpoint : public std::enable_shared_from_this, void SendWatchpointChangedEvent(lldb::WatchpointEventType eventKind); - void SendWatchpointChangedEvent(WatchpointEventData *data); - Watchpoint(const Watchpoint &) = delete; const Watchpoint &operator=(const Watchpoint &) = delete; }; diff --git a/lldb/include/lldb/Utility/Broadcaster.h b/lldb/include/lldb/Utility/Broadcaster.h index 8444c38f6ecc6..c8127f0a921d8 100644 --- a/lldb/include/lldb/Utility/Broadcaster.h +++ b/lldb/include/lldb/Utility/Broadcaster.h @@ -177,8 +177,8 @@ class Broadcaster { m_broadcaster_sp->BroadcastEvent(event_type, event_data_sp); } - void BroadcastEvent(uint32_t event_type, EventData *event_data = nullptr) { - m_broadcaster_sp->BroadcastEvent(event_type, event_data); + void BroadcastEvent(uint32_t event_type) { + m_broadcaster_sp->BroadcastEvent(event_type); } void BroadcastEventIfUnique(uint32_t event_type, @@ -346,7 +346,7 @@ class Broadcaster { void BroadcastEventIfUnique(lldb::EventSP &event_sp); - void BroadcastEvent(uint32_t event_type, EventData *event_data = nullptr); + void BroadcastEvent(uint32_t event_type); void BroadcastEvent(uint32_t event_type, const lldb::EventDataSP &event_data_sp); diff --git a/lldb/source/Breakpoint/BreakpointList.cpp b/lldb/source/Breakpoint/BreakpointList.cpp index f7c2cdb938e62..2c47b3b1263c6 100644 --- a/lldb/source/Breakpoint/BreakpointList.cpp +++ b/lldb/source/Breakpoint/BreakpointList.cpp @@ -17,9 +17,12 @@ using namespace lldb_private; static void NotifyChange(const BreakpointSP &bp, BreakpointEventType event) { Target &target = bp->GetTarget(); - if (target.EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) + if (target.EventTypeHasListeners(Target::eBroadcastBitBreakpointChanged)) { + auto event_data_sp = + std::make_shared(event, bp); target.BroadcastEvent(Target::eBroadcastBitBreakpointChanged, - new Breakpoint::BreakpointEventData(event, bp)); + event_data_sp); + } } BreakpointList::BreakpointList(bool is_internal) diff --git a/lldb/source/Breakpoint/BreakpointLocation.cpp b/lldb/source/Breakpoint/BreakpointLocation.cpp index 99f94d04bb318..98de059c2e296 100644 --- a/lldb/source/Breakpoint/BreakpointLocation.cpp +++ b/lldb/source/Breakpoint/BreakpointLocation.cpp @@ -649,11 +649,11 @@ void 
BreakpointLocation::SendBreakpointLocationChangedEvent( if (!m_being_created && !m_owner.IsInternal() && m_owner.GetTarget().EventTypeHasListeners( Target::eBroadcastBitBreakpointChanged)) { - Breakpoint::BreakpointEventData *data = new Breakpoint::BreakpointEventData( + auto data_sp = std::make_shared( eventKind, m_owner.shared_from_this()); - data->GetBreakpointLocationCollection().Add(shared_from_this()); + data_sp->GetBreakpointLocationCollection().Add(shared_from_this()); m_owner.GetTarget().BroadcastEvent(Target::eBroadcastBitBreakpointChanged, - data); + data_sp); } } diff --git a/lldb/source/Breakpoint/Watchpoint.cpp b/lldb/source/Breakpoint/Watchpoint.cpp index 4602ce4213b9c..1a8ef87c1e67a 100644 --- a/lldb/source/Breakpoint/Watchpoint.cpp +++ b/lldb/source/Breakpoint/Watchpoint.cpp @@ -462,26 +462,14 @@ const char *Watchpoint::GetConditionText() const { void Watchpoint::SendWatchpointChangedEvent( lldb::WatchpointEventType eventKind) { - if (!m_being_created && - GetTarget().EventTypeHasListeners( - Target::eBroadcastBitWatchpointChanged)) { - WatchpointEventData *data = - new Watchpoint::WatchpointEventData(eventKind, shared_from_this()); - GetTarget().BroadcastEvent(Target::eBroadcastBitWatchpointChanged, data); + if (!m_being_created && GetTarget().EventTypeHasListeners( + Target::eBroadcastBitWatchpointChanged)) { + auto data_sp = + std::make_shared(eventKind, shared_from_this()); + GetTarget().BroadcastEvent(Target::eBroadcastBitWatchpointChanged, data_sp); } } -void Watchpoint::SendWatchpointChangedEvent(WatchpointEventData *data) { - if (data == nullptr) - return; - - if (!m_being_created && - GetTarget().EventTypeHasListeners(Target::eBroadcastBitWatchpointChanged)) - GetTarget().BroadcastEvent(Target::eBroadcastBitWatchpointChanged, data); - else - delete data; -} - Watchpoint::WatchpointEventData::WatchpointEventData( WatchpointEventType sub_type, const WatchpointSP &new_watchpoint_sp) : m_watchpoint_event(sub_type), m_new_watchpoint_sp(new_watchpoint_sp) {} diff --git a/lldb/source/Breakpoint/WatchpointList.cpp b/lldb/source/Breakpoint/WatchpointList.cpp index fc0cc9126d602..f7564483e6f1f 100644 --- a/lldb/source/Breakpoint/WatchpointList.cpp +++ b/lldb/source/Breakpoint/WatchpointList.cpp @@ -23,10 +23,12 @@ lldb::watch_id_t WatchpointList::Add(const WatchpointSP &wp_sp, bool notify) { m_watchpoints.push_back(wp_sp); if (notify) { if (wp_sp->GetTarget().EventTypeHasListeners( - Target::eBroadcastBitWatchpointChanged)) + Target::eBroadcastBitWatchpointChanged)) { + auto data_sp = std::make_shared( + eWatchpointEventTypeAdded, wp_sp); wp_sp->GetTarget().BroadcastEvent(Target::eBroadcastBitWatchpointChanged, - new Watchpoint::WatchpointEventData( - eWatchpointEventTypeAdded, wp_sp)); + data_sp); + } } return wp_sp->GetID(); } @@ -171,11 +173,12 @@ bool WatchpointList::Remove(lldb::watch_id_t watch_id, bool notify) { WatchpointSP wp_sp = *pos; if (notify) { if (wp_sp->GetTarget().EventTypeHasListeners( - Target::eBroadcastBitWatchpointChanged)) + Target::eBroadcastBitWatchpointChanged)) { + auto data_sp = std::make_shared( + eWatchpointEventTypeRemoved, wp_sp); wp_sp->GetTarget().BroadcastEvent( - Target::eBroadcastBitWatchpointChanged, - new Watchpoint::WatchpointEventData(eWatchpointEventTypeRemoved, - wp_sp)); + Target::eBroadcastBitWatchpointChanged, data_sp); + } } m_watchpoints.erase(pos); return true; @@ -234,10 +237,10 @@ void WatchpointList::RemoveAll(bool notify) { for (pos = m_watchpoints.begin(); pos != end; ++pos) { if 
((*pos)->GetTarget().EventTypeHasListeners( Target::eBroadcastBitBreakpointChanged)) { + auto data_sp = std::make_shared( + eWatchpointEventTypeRemoved, *pos); (*pos)->GetTarget().BroadcastEvent( - Target::eBroadcastBitWatchpointChanged, - new Watchpoint::WatchpointEventData(eWatchpointEventTypeRemoved, - *pos)); + Target::eBroadcastBitWatchpointChanged, data_sp); } } } diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 23834d403579c..eb42b9eb6cb6a 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -1089,8 +1089,8 @@ Status ProcessGDBRemote::DoAttachToProcessWithID( const int packet_len = ::snprintf(packet, sizeof(packet), "vAttach;%" PRIx64, attach_pid); SetID(attach_pid); - m_async_broadcaster.BroadcastEvent( - eBroadcastBitAsyncContinue, new EventDataBytes(packet, packet_len)); + auto data_sp = std::make_shared(packet, packet_len); + m_async_broadcaster.BroadcastEvent(eBroadcastBitAsyncContinue, data_sp); } else SetExitStatus(-1, error.AsCString()); } @@ -1127,9 +1127,9 @@ Status ProcessGDBRemote::DoAttachToProcessWithName( endian::InlHostByteOrder(), endian::InlHostByteOrder()); - m_async_broadcaster.BroadcastEvent( - eBroadcastBitAsyncContinue, - new EventDataBytes(packet.GetString().data(), packet.GetSize())); + auto data_sp = std::make_shared(packet.GetString().data(), + packet.GetSize()); + m_async_broadcaster.BroadcastEvent(eBroadcastBitAsyncContinue, data_sp); } else SetExitStatus(-1, error.AsCString()); @@ -1374,10 +1374,9 @@ Status ProcessGDBRemote::DoResume() { return error; } - m_async_broadcaster.BroadcastEvent( - eBroadcastBitAsyncContinue, - new EventDataBytes(continue_packet.GetString().data(), - continue_packet.GetSize())); + auto data_sp = std::make_shared( + continue_packet.GetString().data(), continue_packet.GetSize()); + m_async_broadcaster.BroadcastEvent(eBroadcastBitAsyncContinue, data_sp); if (!listener_sp->GetEvent(event_sp, std::chrono::seconds(5))) { error.SetErrorString("Resume timed out."); diff --git a/lldb/source/Target/Process.cpp b/lldb/source/Target/Process.cpp index 8158bf6125837..e1c16ca216432 100644 --- a/lldb/source/Target/Process.cpp +++ b/lldb/source/Target/Process.cpp @@ -4304,9 +4304,9 @@ void Process::BroadcastAsyncProfileData(const std::string &one_profile_data) { void Process::BroadcastStructuredData(const StructuredData::ObjectSP &object_sp, const StructuredDataPluginSP &plugin_sp) { - BroadcastEvent( - eBroadcastBitStructuredData, - new EventDataStructuredData(shared_from_this(), object_sp, plugin_sp)); + auto data_sp = std::make_shared( + shared_from_this(), object_sp, plugin_sp); + BroadcastEvent(eBroadcastBitStructuredData, data_sp); } StructuredDataPluginSP diff --git a/lldb/source/Target/Target.cpp b/lldb/source/Target/Target.cpp index 302c2bad7021b..e969340fdf1eb 100644 --- a/lldb/source/Target/Target.cpp +++ b/lldb/source/Target/Target.cpp @@ -1704,8 +1704,9 @@ void Target::ModulesDidLoad(ModuleList &module_list) { if (m_process_sp) { m_process_sp->ModulesDidLoad(module_list); } - BroadcastEvent(eBroadcastBitModulesLoaded, - new TargetEventData(this->shared_from_this(), module_list)); + auto data_sp = + std::make_shared(shared_from_this(), module_list); + BroadcastEvent(eBroadcastBitModulesLoaded, data_sp); } } @@ -1719,16 +1720,18 @@ void Target::SymbolsDidLoad(ModuleList &module_list) { m_breakpoint_list.UpdateBreakpoints(module_list, true, false); 
m_internal_breakpoint_list.UpdateBreakpoints(module_list, true, false); - BroadcastEvent(eBroadcastBitSymbolsLoaded, - new TargetEventData(this->shared_from_this(), module_list)); + auto data_sp = + std::make_shared(shared_from_this(), module_list); + BroadcastEvent(eBroadcastBitSymbolsLoaded, data_sp); } } void Target::ModulesDidUnload(ModuleList &module_list, bool delete_locations) { if (m_valid && module_list.GetSize()) { UnloadModuleSections(module_list); - BroadcastEvent(eBroadcastBitModulesUnloaded, - new TargetEventData(this->shared_from_this(), module_list)); + auto data_sp = + std::make_shared(shared_from_this(), module_list); + BroadcastEvent(eBroadcastBitModulesUnloaded, data_sp); m_breakpoint_list.UpdateBreakpoints(module_list, false, delete_locations); m_internal_breakpoint_list.UpdateBreakpoints(module_list, false, delete_locations); diff --git a/lldb/source/Target/Thread.cpp b/lldb/source/Target/Thread.cpp index 865cee97e6d87..8ae2179c1281d 100644 --- a/lldb/source/Target/Thread.cpp +++ b/lldb/source/Target/Thread.cpp @@ -253,9 +253,11 @@ void Thread::DestroyThread() { } void Thread::BroadcastSelectedFrameChange(StackID &new_frame_id) { - if (EventTypeHasListeners(eBroadcastBitSelectedFrameChanged)) - BroadcastEvent(eBroadcastBitSelectedFrameChanged, - new ThreadEventData(this->shared_from_this(), new_frame_id)); + if (EventTypeHasListeners(eBroadcastBitSelectedFrameChanged)) { + auto data_sp = + std::make_shared(shared_from_this(), new_frame_id); + BroadcastEvent(eBroadcastBitSelectedFrameChanged, data_sp); + } } lldb::StackFrameSP @@ -1507,9 +1509,10 @@ Status Thread::ReturnFromFrame(lldb::StackFrameSP frame_sp, if (copy_success) { thread->DiscardThreadPlans(true); thread->ClearStackFrames(); - if (broadcast && EventTypeHasListeners(eBroadcastBitStackChanged)) - BroadcastEvent(eBroadcastBitStackChanged, - new ThreadEventData(this->shared_from_this())); + if (broadcast && EventTypeHasListeners(eBroadcastBitStackChanged)) { + auto data_sp = std::make_shared(shared_from_this()); + BroadcastEvent(eBroadcastBitStackChanged, data_sp); + } } else { return_error.SetErrorString("Could not reset register values."); } diff --git a/lldb/source/Target/ThreadList.cpp b/lldb/source/Target/ThreadList.cpp index 1ba0c435b993d..03e8daedff129 100644 --- a/lldb/source/Target/ThreadList.cpp +++ b/lldb/source/Target/ThreadList.cpp @@ -726,10 +726,12 @@ bool ThreadList::SetSelectedThreadByIndexID(uint32_t index_id, bool notify) { void ThreadList::NotifySelectedThreadChanged(lldb::tid_t tid) { ThreadSP selected_thread_sp(FindThreadByID(tid)); if (selected_thread_sp->EventTypeHasListeners( - Thread::eBroadcastBitThreadSelected)) - selected_thread_sp->BroadcastEvent( - Thread::eBroadcastBitThreadSelected, - new Thread::ThreadEventData(selected_thread_sp)); + Thread::eBroadcastBitThreadSelected)) { + auto data_sp = + std::make_shared(selected_thread_sp); + selected_thread_sp->BroadcastEvent(Thread::eBroadcastBitThreadSelected, + data_sp); + } } void ThreadList::Update(ThreadList &rhs) { diff --git a/lldb/source/Utility/Broadcaster.cpp b/lldb/source/Utility/Broadcaster.cpp index 914812d785777..33cd49963e7c7 100644 --- a/lldb/source/Utility/Broadcaster.cpp +++ b/lldb/source/Utility/Broadcaster.cpp @@ -300,9 +300,8 @@ void Broadcaster::BroadcasterImpl::PrivateBroadcastEvent(EventSP &event_sp, } } -void Broadcaster::BroadcasterImpl::BroadcastEvent(uint32_t event_type, - EventData *event_data) { - auto event_sp = std::make_shared(event_type, event_data); +void 
+  auto event_sp = std::make_shared<Event>(event_type, /*data = */ nullptr);
   PrivateBroadcastEvent(event_sp, false);
 }

From 414df7051ac90f186fac9d3d8968aa827eceb697 Mon Sep 17 00:00:00 2001
From: Malavika Samak
Date: Mon, 22 Jan 2024 10:46:59 -0800
Subject: [PATCH 456/843] [-Wunsafe-buffer-usage] Fix the crash introduced by the unsafe invocation of span::data warning (#78815)

The patch fixes the crash introduced by the DataInvocation warning
gadget, which is designed to warn against unsafe invocations of the
span::data method. The gadget now also considers invocations of
span::data inside parentheses.

Radar: 121223051

---------

Co-authored-by: MalavikaSamak
---
 clang/lib/Analysis/UnsafeBufferUsage.cpp      |  5 ++--
 clang/lib/Sema/AnalysisBasedWarnings.cpp      | 19 +++++++++------
 ...e-buffer-usage-warning-data-invocation.cpp | 23 ++++++++++++-------
 3 files changed, 30 insertions(+), 17 deletions(-)

diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp
index 724c4304a0724..7df706beb2266 100644
--- a/clang/lib/Analysis/UnsafeBufferUsage.cpp
+++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp
@@ -739,9 +739,10 @@ class DataInvocationGadget : public WarningGadget {
   }

   static Matcher matcher() {
+    Matcher callExpr = cxxMemberCallExpr(
+        callee(cxxMethodDecl(hasName("data"), ofClass(hasName("std::span")))));
     return stmt(
-        explicitCastExpr(has(cxxMemberCallExpr(callee(cxxMethodDecl(
-            hasName("data"), ofClass(hasName("std::span")))))))
+        explicitCastExpr(anyOf(has(callExpr), has(parenExpr(has(callExpr)))))
             .bind(OpTag));
   }
   const Stmt *getBaseStmt() const override { return Op; }
diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp
index 9eb1df5f02405..649c3f533b820 100644
--- a/clang/lib/Sema/AnalysisBasedWarnings.cpp
+++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp
@@ -2263,15 +2263,20 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler {
       MsgParam = 3;
     } else if (const auto *ECE = dyn_cast<ExplicitCastExpr>(Operation)) {
       QualType destType = ECE->getType();
+      if (!isa<PointerType>(destType))
+        return;
+
+      const Expr *subExpr = ECE->getSubExpr();
+
       const uint64_t dSize =
           Ctx.getTypeSize(destType.getTypePtr()->getPointeeType());
-      if (const auto *CE = dyn_cast<CXXMemberCallExpr>(ECE->getSubExpr())) {
-        QualType srcType = CE->getType();
-        const uint64_t sSize =
-            Ctx.getTypeSize(srcType.getTypePtr()->getPointeeType());
-        if (sSize >= dSize)
-          return;
-      }
+
+      QualType srcType = ECE->getSubExpr()->getType();
+      const uint64_t sSize =
+          Ctx.getTypeSize(srcType.getTypePtr()->getPointeeType());
+      if (sSize >= dSize)
+        return;
+
       MsgParam = 4;
     }
     Loc = Operation->getBeginLoc();
diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp
index 79eb3bb4bacc6..574afcd0eb6dc 100644
--- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp
@@ -7,6 +7,7 @@
 // RUN: %clang_cc1 -std=c++20 -fblocks -include %s %s 2>&1 | FileCheck --allow-empty %s
 // CHECK-NOT: [-Wunsafe-buffer-usage]

+#include <stdint.h>
 #ifndef INCLUDED
 #define INCLUDED
 #pragma clang system_header
@@ -90,15 +91,18 @@ void cast_without_data(int *ptr) {
 void warned_patterns(std::span<int> span_ptr, std::span<Base> base_span, span<int> span_without_qual) {
   A *a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of span::data}}
   a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of span::data}}
-
-  A *a2 = (A*) span_without_qual.data(); // expected-warning{{unsafe invocation of span::data}}
-
-  // TODO:: Should we warn when we cast from base to derived type?
-  Derived *b = dynamic_cast<Derived*> (base_span.data());// expected-warning{{unsafe invocation of span::data}}
-  // TODO:: This pattern is safe. We can add special handling for it, if we decide this
-  // is the recommended fixit for the unsafe invocations.
-  A *a3 = (A*)span_ptr.subspan(0, sizeof(A)).data(); // expected-warning{{unsafe invocation of span::data}}
+  a1 = (A*)(span_ptr.data()); // expected-warning{{unsafe invocation of span::data}}
+  A *a2 = (A*) (span_without_qual.data()); // expected-warning{{unsafe invocation of span::data}}
+
+  a2 = (A*) span_without_qual.data(); // expected-warning{{unsafe invocation of span::data}}
+
+  // TODO:: Should we warn when we cast from base to derived type?
+  Derived *b = dynamic_cast<Derived*> (base_span.data());// expected-warning{{unsafe invocation of span::data}}
+
+  // TODO:: This pattern is safe. We can add special handling for it, if we decide this
+  // is the recommended fixit for the unsafe invocations.
+  A *a3 = (A*)span_ptr.subspan(0, sizeof(A)).data(); // expected-warning{{unsafe invocation of span::data}}
 }

 void not_warned_patterns(std::span<int> span_ptr, std::span<Base> base_span) {
@@ -108,6 +112,9 @@ void not_warned_patterns(std::span<int> span_ptr, std::span<Base> base_span) {
   p = (int*) span_ptr.data();

   A *a = (A*) span_ptr.hello(); // Invoking other methods.
+
+  intptr_t k = (intptr_t) span_ptr.data();
+  k = (intptr_t) (span_ptr.data());
 }

 // We do not want to warn about other types

From 16d2583254284611843b2517c8e9f9d6efe8c627 Mon Sep 17 00:00:00 2001
From: Valentin Clement
Date: Mon, 22 Jan 2024 10:48:17 -0800
Subject: [PATCH 457/843] [flang][openacc] Fix test with new loop design

---
 flang/test/Lower/OpenACC/acc-loop.f90 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90
index 334d8767a727a..6bd4dd61bd893 100644
--- a/flang/test/Lower/OpenACC/acc-loop.f90
+++ b/flang/test/Lower/OpenACC/acc-loop.f90
@@ -303,6 +303,6 @@ program acc_loop
   DO i = 1, n
   END DO

-! CHECK: acc.loop gang([#acc.device_type, #acc.device_type]) {
+! CHECK: acc.loop gang([#acc.device_type, #acc.device_type])

 end program

From 1be0d9d7d88a9bdabe6ef4d81720ddf4cf6f71c1 Mon Sep 17 00:00:00 2001
From: Dani
Date: Mon, 22 Jan 2024 19:55:16 +0100
Subject: [PATCH 458/843] [AArch64][Clang] Fix linker error for function multiversioning (#74358)

AArch64 part of https://github.com/llvm/llvm-project/pull/71706.

The default version is now mangled with .default. The resolver for the
TargetVersion attribute needs to be emitted from
CodeGenModule::EmitMultiVersionFunctionDefinition.
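As a rough illustration of the new behavior (this sketch is not part of the
patch; the function name `ver`, the caller, and the chosen feature are made
up), a multiversioned function now gets a suffix on its default version too:

  /* Hypothetical C example; `ver` and `caller` are illustrative names. */
  __attribute__((target_version("sve2"))) int ver(void) { return 1; }
  __attribute__((target_version("default"))) int ver(void) { return 0; }

  /* Calls are routed through the generated resolver at load time. */
  int caller(void) { return ver(); }

Previously the default version kept the plain assembly name `ver`; with this
change the emitted symbols should look roughly like `ver._Msve2`,
`ver.default`, and an ifunc whose resolver picks a version based on
`__aarch64_cpu_features`, matching the updated CHECK lines below.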
---
 clang/lib/CodeGen/CodeGenModule.cpp           |  28 +-
 .../test/CodeGen/attr-target-clones-aarch64.c | 187 ++++--
 clang/test/CodeGen/attr-target-version.c      | 549 +++++++++++++-----
 .../CodeGenCXX/attr-target-clones-aarch64.cpp |  26 +-
 clang/test/CodeGenCXX/attr-target-version.cpp | 168 +++---
 5 files changed, 656 insertions(+), 302 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp
index 77c931842dd21..78b2f2ba29fdf 100644
--- a/clang/lib/CodeGen/CodeGenModule.cpp
+++ b/clang/lib/CodeGen/CodeGenModule.cpp
@@ -1716,8 +1716,10 @@ static void AppendCPUSpecificCPUDispatchMangling(const CodeGenModule &CGM,
 static void AppendTargetVersionMangling(const CodeGenModule &CGM,
                                         const TargetVersionAttr *Attr,
                                         raw_ostream &Out) {
-  if (Attr->isDefaultVersion())
+  if (Attr->isDefaultVersion()) {
+    Out << ".default";
     return;
+  }
   Out << "._";
   const TargetInfo &TI = CGM.getTarget();
   llvm::SmallVector<StringRef, 8> Feats;
@@ -1780,8 +1782,10 @@ static void AppendTargetClonesMangling(const CodeGenModule &CGM,
   const TargetInfo &TI = CGM.getTarget();
   if (TI.getTriple().isAArch64()) {
     StringRef FeatureStr = Attr->getFeatureStr(VersionIndex);
-    if (FeatureStr == "default")
+    if (FeatureStr == "default") {
+      Out << ".default";
       return;
+    }
     Out << "._";
     SmallVector<StringRef, 8> Features;
     FeatureStr.split(Features, "+");
@@ -4029,6 +4033,8 @@ void CodeGenModule::EmitMultiVersionFunctionDefinition(GlobalDecl GD,
       EmitGlobalFunctionDefinition(GD.getWithMultiVersionIndex(I), nullptr);
     // Ensure that the resolver function is also emitted.
     GetOrCreateMultiVersionResolver(GD);
+  } else if (FD->hasAttr<TargetVersionAttr>()) {
+    GetOrCreateMultiVersionResolver(GD);
   } else
     EmitGlobalFunctionDefinition(GD, GV);
 }
@@ -4210,14 +4216,7 @@ void CodeGenModule::emitMultiVersionFunctions() {
     llvm::Constant *ResolverConstant = GetOrCreateMultiVersionResolver(GD);
     if (auto *IFunc = dyn_cast<llvm::GlobalIFunc>(ResolverConstant)) {
       ResolverConstant = IFunc->getResolver();
-      // In Aarch64, default versions of multiversioned functions are mangled to
-      // their 'normal' assembly name. This deviates from other targets which
-      // append a '.default' string. As a result we need to continue appending
-      // .ifunc in Aarch64.
-      // FIXME: Should Aarch64 mangling for 'default' multiversion function and
-      // in turn ifunc function match that of other targets?
-      if (FD->isTargetClonesMultiVersion() &&
-          !getTarget().getTriple().isAArch64()) {
+      if (FD->isTargetClonesMultiVersion()) {
         const CGFunctionInfo &FI = getTypes().arrangeGlobalDeclaration(GD);
         llvm::FunctionType *DeclTy = getTypes().GetFunctionType(FI);
         std::string MangledName = getMangledNameImpl(
@@ -4398,14 +4397,7 @@ llvm::Constant *CodeGenModule::GetOrCreateMultiVersionResolver(GlobalDecl GD) {
   // a separate resolver).
   std::string ResolverName = MangledName;
   if (getTarget().supportsIFunc()) {
-    // In Aarch64, default versions of multiversioned functions are mangled to
-    // their 'normal' assembly name. This deviates from other targets which
-    // append a '.default' string. As a result we need to continue appending
-    // .ifunc in Aarch64.
-    // FIXME: Should Aarch64 mangling for 'default' multiversion function and
-    // in turn ifunc function match that of other targets?
- if (!FD->isTargetClonesMultiVersion() || - getTarget().getTriple().isAArch64()) + if (!FD->isTargetClonesMultiVersion()) ResolverName += ".ifunc"; } else if (FD->isTargetMultiVersion()) { ResolverName += ".resolver"; diff --git a/clang/test/CodeGen/attr-target-clones-aarch64.c b/clang/test/CodeGen/attr-target-clones-aarch64.c index 3f2f2fdd24e8a..5ea3f4a9b0b11 100644 --- a/clang/test/CodeGen/attr-target-clones-aarch64.c +++ b/clang/test/CodeGen/attr-target-clones-aarch64.c @@ -22,27 +22,44 @@ int __attribute__((target_clones("default"))) main() { inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default"))) ftc_inline2(void) { return 2; }; -// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @ftc.ifunc = weak_odr ifunc i32 (), ptr @ftc.resolver -// CHECK: @ftc_def.ifunc = weak_odr ifunc i32 (), ptr @ftc_def.resolver -// CHECK: @ftc_dup1.ifunc = weak_odr ifunc i32 (), ptr @ftc_dup1.resolver -// CHECK: @ftc_dup2.ifunc = weak_odr ifunc i32 (), ptr @ftc_dup2.resolver -// CHECK: @ftc_inline1.ifunc = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver -// CHECK: @ftc_inline2.ifunc = weak_odr ifunc i32 (), ptr @ftc_inline2.resolver -// CHECK: @ftc_inline3.ifunc = weak_odr ifunc i32 (), ptr @ftc_inline3.resolver + + +//. +// CHECK: @__aarch64_cpu_features = external dso_local global { i64 } +// CHECK: @ftc.ifunc = weak_odr alias i32 (), ptr @ftc +// CHECK: @ftc_def.ifunc = weak_odr alias i32 (), ptr @ftc_def +// CHECK: @ftc_dup1.ifunc = weak_odr alias i32 (), ptr @ftc_dup1 +// CHECK: @ftc_dup2.ifunc = weak_odr alias i32 (), ptr @ftc_dup2 +// CHECK: @ftc_inline1.ifunc = weak_odr alias i32 (), ptr @ftc_inline1 +// CHECK: @ftc_inline2.ifunc = weak_odr alias i32 (), ptr @ftc_inline2 +// CHECK: @ftc_inline3.ifunc = weak_odr alias i32 (), ptr @ftc_inline3 +// CHECK: @ftc = weak_odr ifunc i32 (), ptr @ftc.resolver +// CHECK: @ftc_def = weak_odr ifunc i32 (), ptr @ftc_def.resolver +// CHECK: @ftc_dup1 = weak_odr ifunc i32 (), ptr @ftc_dup1.resolver +// CHECK: @ftc_dup2 = weak_odr ifunc i32 (), ptr @ftc_dup2.resolver +// CHECK: @ftc_inline1 = weak_odr ifunc i32 (), ptr @ftc_inline1.resolver +// CHECK: @ftc_inline2 = weak_odr ifunc i32 (), ptr @ftc_inline2.resolver +// CHECK: @ftc_inline3 = weak_odr ifunc i32 (), ptr @ftc_inline3.resolver +//. 
// CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc._MlseMaes( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc._Msve2( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 +// +// // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc( +// CHECK-LABEL: @ftc.default( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 +// +// // CHECK-LABEL: @ftc.resolver( // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -62,19 +79,27 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK: resolver_return1: // CHECK-NEXT: ret ptr @ftc._Msve2 // CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @ftc +// CHECK-NEXT: ret ptr @ftc.default +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_def._Msha2( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_def._Msha2Mmemtag2( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 +// +// // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_def( +// CHECK-LABEL: @ftc_def.default( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 +// +// // CHECK-LABEL: @ftc_def.resolver( // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -94,15 +119,21 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK: resolver_return1: // CHECK-NEXT: ret ptr @ftc_def._Msha2 // CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @ftc_def +// CHECK-NEXT: ret ptr @ftc_def.default +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_dup1._Msha2( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 +// +// // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_dup1( +// CHECK-LABEL: @ftc_dup1.default( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 +// +// // CHECK-LABEL: @ftc_dup1.resolver( // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -114,19 +145,27 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK: resolver_return: // CHECK-NEXT: ret ptr @ftc_dup1._Msha2 // CHECK: resolver_else: -// CHECK-NEXT: ret ptr @ftc_dup1 +// CHECK-NEXT: ret ptr @ftc_dup1.default +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_dup2._Mfp( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_dup2._MdotprodMcrc( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 +// +// // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_dup2( +// CHECK-LABEL: @ftc_dup2.default( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 +// +// // CHECK-LABEL: @ftc_dup2.resolver( // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -146,35 +185,43 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK: resolver_return1: // CHECK-NEXT: ret ptr @ftc_dup2._Mfp // CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @ftc_dup2 +// CHECK-NEXT: ret ptr @ftc_dup2.default +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @foo( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = call i32 @ftc.ifunc() -// CHECK-NEXT: [[CALL1:%.*]] = call i32 @ftc_def.ifunc() +// CHECK-NEXT: [[CALL:%.*]] = call i32 @ftc() +// CHECK-NEXT: [[CALL1:%.*]] = call i32 
@ftc_def() // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] -// CHECK-NEXT: [[CALL2:%.*]] = call i32 @ftc_dup1.ifunc() +// CHECK-NEXT: [[CALL2:%.*]] = call i32 @ftc_dup1() // CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] -// CHECK-NEXT: [[CALL4:%.*]] = call i32 @ftc_dup2.ifunc() +// CHECK-NEXT: [[CALL4:%.*]] = call i32 @ftc_dup2() // CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[ADD3]], [[CALL4]] // CHECK-NEXT: ret i32 [[ADD5]] +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_direct( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @main( // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 -// CHECK-NEXT: [[CALL:%.*]] = call i32 @ftc_inline1.ifunc() -// CHECK-NEXT: [[CALL1:%.*]] = call i32 @ftc_inline2.ifunc() +// CHECK-NEXT: [[CALL:%.*]] = call i32 @ftc_inline1() +// CHECK-NEXT: [[CALL1:%.*]] = call i32 @ftc_inline2() // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] -// CHECK-NEXT: [[CALL2:%.*]] = call i32 @ftc_inline3.ifunc() +// CHECK-NEXT: [[CALL2:%.*]] = call i32 @ftc_inline3() // CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] // CHECK-NEXT: [[CALL4:%.*]] = call i32 @ftc_direct() // CHECK-NEXT: [[ADD5:%.*]] = add nsw i32 [[ADD3]], [[CALL4]] // CHECK-NEXT: ret i32 [[ADD5]] +// +// // CHECK-LABEL: @ftc_inline1.resolver( // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -202,7 +249,9 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK: resolver_return3: // CHECK-NEXT: ret ptr @ftc_inline1._MrngMsimd // CHECK: resolver_else4: -// CHECK-NEXT: ret ptr @ftc_inline1 +// CHECK-NEXT: ret ptr @ftc_inline1.default +// +// // CHECK-LABEL: @ftc_inline2.resolver( // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -222,7 +271,9 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK: resolver_return1: // CHECK-NEXT: ret ptr @ftc_inline2._Mfp16 // CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @ftc_inline2 +// CHECK-NEXT: ret ptr @ftc_inline2.default +// +// // CHECK-LABEL: @ftc_inline3.resolver( // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() @@ -242,63 +293,93 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK: resolver_return1: // CHECK-NEXT: ret ptr @ftc_inline3._Mbti // CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @ftc_inline3 +// CHECK-NEXT: ret ptr @ftc_inline3.default +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_inline1._MrngMsimd( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_inline1._MrcpcMpredres( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_inline1._Msve2-aesMwfxt( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 +// +// // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_inline1( +// CHECK-LABEL: @ftc_inline1.default( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_inline2._Mfp16( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_inline2._MfcmaMsve2-bitperm( // 
CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 +// +// // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_inline2( +// CHECK-LABEL: @ftc_inline2.default( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_inline3._Mbti( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 +// +// // CHECK: Function Attrs: noinline nounwind optnone // CHECK-LABEL: @ftc_inline3._MsveMsb( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 +// +// // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: @ftc_inline3( +// CHECK-LABEL: @ftc_inline3.default( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 +// +// // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: @ftc( // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 0 +// +// // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: @ftc_def( // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 1 +// +// // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: @ftc_dup1( // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 2 +// +// // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: @ftc_dup2( // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 3 +// +// // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: @foo( // CHECK-NOFMV-NEXT: entry: @@ -310,10 +391,14 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NOFMV-NEXT: [[CALL4:%.*]] = call i32 @ftc_dup2() // CHECK-NOFMV-NEXT: [[ADD5:%.*]] = add nsw i32 [[ADD3]], [[CALL4]] // CHECK-NOFMV-NEXT: ret i32 [[ADD5]] +// +// // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: @ftc_direct( // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 4 +// +// // CHECK-NOFMV: Function Attrs: noinline nounwind optnone // CHECK-NOFMV-LABEL: @main( // CHECK-NOFMV-NEXT: entry: @@ -327,21 +412,29 @@ inline int __attribute__((target_clones("fp16", "sve2-bitperm+fcma", "default")) // CHECK-NOFMV-NEXT: [[CALL4:%.*]] = call i32 @ftc_direct() // CHECK-NOFMV-NEXT: [[ADD5:%.*]] = add nsw i32 [[ADD3]], [[CALL4]] // CHECK-NOFMV-NEXT: ret i32 [[ADD5]] - -// CHECK: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon" } -// CHECK: attributes #1 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2" } -// CHECK: attributes #2 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// CHECK: attributes #3 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sha2" } -// CHECK: attributes #4 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+mte,+neon,+sha2" } -// CHECK: attributes #5 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } -// CHECK: attributes #6 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+dotprod,+fp-armv8,+neon" } -// CHECK: attributes #7 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rand" } -// CHECK: attributes #8 = { noinline nounwind 
optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+predres,+rcpc" } -// CHECK: attributes #9 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+wfxt" } -// CHECK: attributes #10 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } -// CHECK: attributes #11 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-bitperm" } -// CHECK: attributes #12 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti" } -// CHECK: attributes #13 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sb,+sve" } - -// CHECK-NOFMV: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } -// CHECK-NOFMV: attributes #1 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } +// +//. +// CHECK: attributes #[[ATTR0:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+lse,+neon" } +// CHECK: attributes #[[ATTR1:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2" } +// CHECK: attributes #[[ATTR2:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #[[ATTR3:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+sha2" } +// CHECK: attributes #[[ATTR4:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+mte,+neon,+sha2" } +// CHECK: attributes #[[ATTR5:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR6:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+dotprod,+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR7:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+neon,+rand" } +// CHECK: attributes #[[ATTR8:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+predres,+rcpc" } +// CHECK: attributes #[[ATTR9:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-aes,+wfxt" } +// CHECK: attributes #[[ATTR10:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon" } +// CHECK: attributes #[[ATTR11:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+complxnum,+fp-armv8,+fullfp16,+neon,+sve,+sve2,+sve2-bitperm" } +// CHECK: attributes #[[ATTR12:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti" } +// 
CHECK: attributes #[[ATTR13:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+neon,+sb,+sve" } +//. +// CHECK-NOFMV: attributes #[[ATTR0:[0-9]+]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } +// CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// CHECK-NOFMV: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK-NOFMV: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index a7cd6f7bf802c..13b895ad8e9b8 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -1,4 +1,4 @@ -// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --check-globals --include-generated-funcs +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --function-signature --check-attributes --check-globals --include-generated-funcs // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +ls64 -target-feature +fullfp16 -S -emit-llvm -o - %s | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature -fmv -S -emit-llvm -o - %s | FileCheck %s -check-prefix=CHECK-NOFMV @@ -80,78 +80,28 @@ int hoo(void) { return fp1() + fp2(); } + + + + +//. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } // CHECK: @fmv.ifunc = weak_odr ifunc i32 (), ptr @fmv.resolver // CHECK: @fmv_one.ifunc = weak_odr ifunc i32 (), ptr @fmv_one.resolver // CHECK: @fmv_two.ifunc = weak_odr ifunc i32 (), ptr @fmv_two.resolver -// CHECK: @fmv_inline.ifunc = weak_odr ifunc i32 (), ptr @fmv_inline.resolver // CHECK: @fmv_e.ifunc = weak_odr ifunc i32 (), ptr @fmv_e.resolver -// CHECK: @fmv_d.ifunc = internal ifunc i32 (), ptr @fmv_d.resolver // CHECK: @fmv_c.ifunc = weak_odr ifunc void (), ptr @fmv_c.resolver - -// CHECK-LABEL: @fmv._MrngMflagmMfp16fml( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// CHECK-LABEL: @fmv._Mflagm2Msme-i16i64( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// CHECK-LABEL: @fmv._MlseMsha2( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 3 -// CHECK-LABEL: @fmv._MdotprodMls64_accdata( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 4 -// CHECK-LABEL: @fmv._Mfp16fmlMmemtag( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 5 -// CHECK-LABEL: @fmv._MfpMaes( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 6 -// CHECK-LABEL: @fmv._McrcMls64_v( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 7 -// CHECK-LABEL: @fmv._Mbti( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 8 -// CHECK-LABEL: @fmv._Msme2( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 9 -// CHECK-LABEL: @fmv( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 0 -// CHECK-LABEL: @fmv_one._MsimdMls64( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 1 -// CHECK-LABEL: @fmv_one._Mdpb( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// CHECK-LABEL: @fmv_one( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 0 -// CHECK-LABEL: @fmv_two._Mfp( +// CHECK: @fmv_inline.ifunc = weak_odr ifunc i32 (), ptr @fmv_inline.resolver +// CHECK: @fmv_d.ifunc = internal ifunc i32 (), ptr @fmv_d.resolver +//. 
+// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv._MrngMflagmMfp16fml +// CHECK-SAME: () #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 -// CHECK-LABEL: @fmv_two._Msimd( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 2 -// CHECK-LABEL: @fmv_two._Mdgh( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 3 -// CHECK-LABEL: @fmv_two._MsimdMfp16( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 4 -// CHECK-LABEL: @fmv_two( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 0 -// CHECK-LABEL: @foo( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv.ifunc() -// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one.ifunc() -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] -// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_two.ifunc() -// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] -// CHECK-NEXT: ret i32 [[ADD3]] -// CHECK-LABEL: @fmv.resolver( +// +// +// CHECK-LABEL: define {{[^@]+}}@fmv.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 @@ -226,26 +176,81 @@ int hoo(void) { // CHECK: resolver_return15: // CHECK-NEXT: ret ptr @fmv._Mbti // CHECK: resolver_else16: -// CHECK-NEXT: ret ptr @fmv -// CHECK-LABEL: @fmv_one.resolver( +// CHECK-NEXT: ret ptr @fmv.default +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_one._MsimdMls64 +// CHECK-SAME: () #[[ATTR1:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 1 +// +// +// CHECK-LABEL: define {{[^@]+}}@fmv_one.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: ret ptr @fmv_one._MsimdMls64 -// CHECK-LABEL: @fmv_two.resolver( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mfp +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 1 +// +// +// CHECK-LABEL: define {{[^@]+}}@fmv_two.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: ret ptr @fmv_two._MsimdMfp16 -// CHECK-LABEL: @fmv_e( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@foo +// CHECK-SAME: () #[[ATTR2:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 20 -// CHECK-LABEL: @fmv_default( +// CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv.ifunc() +// CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_one.ifunc() +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] +// CHECK-NEXT: [[CALL2:%.*]] = call i32 @fmv_two.ifunc() +// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] +// CHECK-NEXT: ret i32 [[ADD3]] +// +// +// CHECK-LABEL: define {{[^@]+}}@fmv_e.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: ret ptr @fmv_e._Mls64 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_default +// CHECK-SAME: () #[[ATTR2]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 111 -// CHECK-LABEL: @fmv_c._Mssbs( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret void -// CHECK-LABEL: @fmv_c( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_c._Mssbs +// CHECK-SAME: () #[[ATTR2]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret void -// CHECK-LABEL: @goo( +// +// +// CHECK-LABEL: define {{[^@]+}}@fmv_c.resolver() comdat { +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 
+// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @fmv_c._Mssbs +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @fmv_c.default +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@goo +// CHECK-SAME: () #[[ATTR2]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[CALL:%.*]] = call i32 @fmv_inline.ifunc() // CHECK-NEXT: [[CALL1:%.*]] = call i32 @fmv_e.ifunc() @@ -253,7 +258,9 @@ int hoo(void) { // CHECK-NEXT: call void @fmv_c.ifunc() // CHECK-NEXT: [[CALL3:%.*]] = call i32 @fmv_default() // CHECK-NEXT: ret i32 [[CALL3]] -// CHECK-LABEL: @fmv_inline.resolver( +// +// +// CHECK-LABEL: define {{[^@]+}}@fmv_inline.resolver() comdat { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 @@ -352,11 +359,10 @@ int hoo(void) { // CHECK: resolver_return21: // CHECK-NEXT: ret ptr @fmv_inline._Mdpb2Mjscvt // CHECK: resolver_else22: -// CHECK-NEXT: ret ptr @fmv_inline -// CHECK-LABEL: @fmv_e.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: ret ptr @fmv_e._Mls64 -// CHECK-LABEL: @fmv_d.resolver( +// CHECK-NEXT: ret ptr @fmv_inline.default +// +// +// CHECK-LABEL: define {{[^@]+}}@fmv_d.resolver() { // CHECK-NEXT: resolver_entry: // CHECK-NEXT: call void @__init_cpu_features_resolver() // CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 @@ -367,31 +373,31 @@ int hoo(void) { // CHECK: resolver_return: // CHECK-NEXT: ret ptr @fmv_d._Msb // CHECK: resolver_else: -// CHECK-NEXT: ret ptr @fmv_d -// CHECK-LABEL: @fmv_c.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 281474976710656 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 281474976710656 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @fmv_c._Mssbs -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @fmv_c -// CHECK-LABEL: @recur( +// CHECK-NEXT: ret ptr @fmv_d.default +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@recur +// CHECK-SAME: () #[[ATTR2]] { // CHECK-NEXT: entry: // CHECK-NEXT: call void @reca() // CHECK-NEXT: ret void -// CHECK-LABEL: @main( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@main +// CHECK-SAME: () #[[ATTR2]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK-NEXT: call void @recur() // CHECK-NEXT: [[CALL:%.*]] = call i32 @goo() // CHECK-NEXT: ret i32 [[CALL]] -// CHECK-LABEL: @hoo( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@hoo +// CHECK-SAME: () #[[ATTR2]] { // CHECK-NEXT: entry: // CHECK-NEXT: [[FP1:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[FP2:%.*]] = alloca ptr, align 8 @@ -404,61 +410,256 @@ int hoo(void) { // CHECK-NEXT: [[CALL1:%.*]] = call i32 [[TMP1]]() // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], 
[[CALL1]] // CHECK-NEXT: ret i32 [[ADD]] -// CHECK-LABEL: @fmv_inline._Msha1MpmullMf64mm( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv._Mflagm2Msme-i16i64 +// CHECK-SAME: () #[[ATTR4:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 2 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv._MlseMsha2 +// CHECK-SAME: () #[[ATTR5:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 3 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv._MdotprodMls64_accdata +// CHECK-SAME: () #[[ATTR6:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 4 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv._Mfp16fmlMmemtag +// CHECK-SAME: () #[[ATTR7:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 5 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv._MfpMaes +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 6 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv._McrcMls64_v +// CHECK-SAME: () #[[ATTR8:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 7 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv._Mbti +// CHECK-SAME: () #[[ATTR9:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 8 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv._Msme2 +// CHECK-SAME: () #[[ATTR10:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 9 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv.default +// CHECK-SAME: () #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_one._Mdpb +// CHECK-SAME: () #[[ATTR11:[0-9]+]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 2 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_one.default +// CHECK-SAME: () #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_two._Msimd +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 2 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_two._Mdgh +// CHECK-SAME: () #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 3 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_two._MsimdMfp16 +// CHECK-SAME: () #[[ATTR1]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 4 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_two.default +// CHECK-SAME: () #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 0 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_e.default +// CHECK-SAME: () #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 20 +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_c.default +// CHECK-SAME: () #[[ATTR2]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: ret void +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msha1MpmullMf64mm +// CHECK-SAME: 
() #[[ATTR12:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 -// CHECK-LABEL: @fmv_inline._Mfp16Mfp16MfcmaMsme( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mfp16Mfp16MfcmaMsme +// CHECK-SAME: () #[[ATTR13:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 -// CHECK-LABEL: @fmv_inline._Msha3Mi8mmMf32mm( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msha3Mi8mmMf32mm +// CHECK-SAME: () #[[ATTR14:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 12 -// CHECK-LABEL: @fmv_inline._MditMsve-ebf16( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MditMsve-ebf16 +// CHECK-SAME: () #[[ATTR15:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 8 -// CHECK-LABEL: @fmv_inline._MdpbMrcpc2( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MdpbMrcpc2 +// CHECK-SAME: () #[[ATTR16:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 6 -// CHECK-LABEL: @fmv_inline._Mdpb2Mjscvt( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mdpb2Mjscvt +// CHECK-SAME: () #[[ATTR17:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 7 -// CHECK-LABEL: @fmv_inline._MrcpcMfrintts( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MrcpcMfrintts +// CHECK-SAME: () #[[ATTR18:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 -// CHECK-LABEL: @fmv_inline._MsveMsve-bf16( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._MsveMsve-bf16 +// CHECK-SAME: () #[[ATTR19:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 -// CHECK-LABEL: @fmv_inline._Msve2-aesMsve2-sha3( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2-aesMsve2-sha3 +// CHECK-SAME: () #[[ATTR20:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 5 -// CHECK-LABEL: @fmv_inline._Msve2Msve2-pmull128Msve2-bitperm( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2Msve2-pmull128Msve2-bitperm +// CHECK-SAME: () #[[ATTR21:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 9 -// CHECK-LABEL: @fmv_inline._Msve2-sm4Mmemtag2( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Msve2-sm4Mmemtag2 +// CHECK-SAME: () #[[ATTR22:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 10 -// CHECK-LABEL: @fmv_inline._Mrcpc3Mmemtag3( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mrcpc3Mmemtag3 +// CHECK-SAME: () #[[ATTR23:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 11 -// CHECK-LABEL: @fmv_inline( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_inline.default +// CHECK-SAME: () #[[ATTR2]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 3 -// CHECK-LABEL: @fmv_d._Msb( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_d._Msb +// CHECK-SAME: () #[[ATTR24:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 0 -// CHECK-LABEL: define internal i32 @fmv_d( +// +// +// CHECK: Function Attrs: noinline nounwind optnone +// CHECK-LABEL: define {{[^@]+}}@fmv_d.default +// CHECK-SAME: () 
#[[ATTR2]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 1 -// CHECK-NOFMV-LABEL: @fmv( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv +// CHECK-NOFMV-SAME: () #[[ATTR0:[0-9]+]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 0 -// CHECK-NOFMV-LABEL: @fmv_one( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_one +// CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 0 -// CHECK-NOFMV-LABEL: @fmv_two( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_two +// CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 0 -// CHECK-NOFMV-LABEL: @foo( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@foo +// CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: [[CALL:%.*]] = call i32 @fmv() // CHECK-NOFMV-NEXT: [[CALL1:%.*]] = call i32 @fmv_one() @@ -466,16 +667,32 @@ int hoo(void) { // CHECK-NOFMV-NEXT: [[CALL2:%.*]] = call i32 @fmv_two() // CHECK-NOFMV-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] // CHECK-NOFMV-NEXT: ret i32 [[ADD3]] -// CHECK-NOFMV-LABEL: @fmv_e( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_e +// CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 20 -// CHECK-NOFMV-LABEL: @fmv_default( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_default +// CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 111 -// CHECK-NOFMV-LABEL: @fmv_c( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_c +// CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret void -// CHECK-NOFMV-LABEL: @goo( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@goo +// CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: [[CALL:%.*]] = call i32 @fmv_inline() // CHECK-NOFMV-NEXT: [[CALL1:%.*]] = call i32 @fmv_e() @@ -483,21 +700,37 @@ int hoo(void) { // CHECK-NOFMV-NEXT: call void @fmv_c() // CHECK-NOFMV-NEXT: [[CALL3:%.*]] = call i32 @fmv_default() // CHECK-NOFMV-NEXT: ret i32 [[CALL3]] -// CHECK-NOFMV-LABEL: define internal i32 @fmv_d( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@fmv_d +// CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: ret i32 1 -// CHECK-NOFMV-LABEL: @recur( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@recur +// CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: call void @reca() // CHECK-NOFMV-NEXT: ret void -// CHECK-NOFMV-LABEL: @main( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@main +// CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: [[RETVAL:%.*]] = alloca i32, align 4 // CHECK-NOFMV-NEXT: store i32 0, ptr [[RETVAL]], align 4 // CHECK-NOFMV-NEXT: call void @recur() // CHECK-NOFMV-NEXT: [[CALL:%.*]] = call i32 @goo() // CHECK-NOFMV-NEXT: ret i32 [[CALL]] -// 
CHECK-NOFMV-LABEL: @hoo( +// +// +// CHECK-NOFMV: Function Attrs: noinline nounwind optnone +// CHECK-NOFMV-LABEL: define {{[^@]+}}@hoo +// CHECK-NOFMV-SAME: () #[[ATTR0]] { // CHECK-NOFMV-NEXT: entry: // CHECK-NOFMV-NEXT: [[FP1:%.*]] = alloca ptr, align 8 // CHECK-NOFMV-NEXT: [[FP2:%.*]] = alloca ptr, align 8 @@ -510,32 +743,40 @@ int hoo(void) { // CHECK-NOFMV-NEXT: [[CALL1:%.*]] = call i32 [[TMP1]]() // CHECK-NOFMV-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] // CHECK-NOFMV-NEXT: ret i32 [[ADD]] - -// CHECK: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp-armv8,+fp16fml,+fullfp16,+ls64,+neon,+rand" } -// CHECK: attributes #1 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+altnzcv,+bf16,+flagm,+fullfp16,+ls64,+sme,+sme-i16i64" } -// CHECK: attributes #2 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+lse,+neon,+sha2" } -// CHECK: attributes #3 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+fullfp16,+ls64,+neon" } -// CHECK: attributes #4 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fp16fml,+fullfp16,+ls64,+neon" } -// CHECK: attributes #5 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+neon" } -// CHECK: attributes #6 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fullfp16,+ls64" } -// CHECK: attributes #7 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+fullfp16,+ls64" } -// CHECK: attributes #8 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fullfp16,+ls64,+sme,+sme2" } -// CHECK: attributes #9 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64" } -// CHECK: attributes #10 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+fullfp16,+ls64" } -// CHECK: attributes #11 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64" } -// CHECK: attributes #12 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fp-armv8,+fullfp16,+ls64,+neon,+sve" } -// CHECK: attributes #13 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fp-armv8,+fullfp16,+ls64,+neon,+sme" } -// CHECK: attributes #14 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fp-armv8,+fullfp16,+i8mm,+ls64,+neon,+sha2,+sha3,+sve" } -// CHECK: attributes #15 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fp-armv8,+fullfp16,+ls64,+neon,+sve" } -// CHECK: attributes #16 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+fullfp16,+ls64,+rcpc" } -// CHECK: attributes #17 = { noinline nounwind optnone "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+fp-armv8,+fullfp16,+jsconv,+ls64,+neon" } -// CHECK: attributes #18 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+fullfp16,+ls64,+rcpc" } -// CHECK: attributes #19 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+ls64,+neon,+sve" } -// CHECK: attributes #20 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3" } -// CHECK: attributes #21 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm" } -// CHECK: attributes #22 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+mte,+neon,+sve,+sve2,+sve2-sm4" } -// CHECK: attributes #23 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64,+mte,+rcpc,+rcpc3" } -// CHECK: attributes #24 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64,+sb" } - -// CHECK-NOFMV: attributes #0 = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } -// CHECK-NOFMV: attributes #1 = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } +// +//. +// CHECK: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+flagm,+fp-armv8,+fp16fml,+fullfp16,+ls64,+neon,+rand" } +// CHECK: attributes #[[ATTR1]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+neon" } +// CHECK: attributes #[[ATTR2]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64" } +// CHECK: attributes #[[ATTR3:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64" } +// CHECK: attributes #[[ATTR4]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+altnzcv,+bf16,+flagm,+fullfp16,+ls64,+sme,+sme-i16i64" } +// CHECK: attributes #[[ATTR5]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+lse,+neon,+sha2" } +// CHECK: attributes #[[ATTR6]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+fullfp16,+ls64,+neon" } +// CHECK: attributes #[[ATTR7]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fp16fml,+fullfp16,+ls64,+neon" } +// CHECK: attributes #[[ATTR8]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc,+fullfp16,+ls64" } +// CHECK: attributes #[[ATTR9]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bti,+fullfp16,+ls64" } +// CHECK: attributes #[[ATTR10]] = { noinline nounwind optnone "no-trapping-math"="true" 
"stack-protector-buffer-size"="8" "target-features"="+bf16,+fullfp16,+ls64,+sme,+sme2" } +// CHECK: attributes #[[ATTR11]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+fullfp16,+ls64" } +// CHECK: attributes #[[ATTR12]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+aes,+f64mm,+fp-armv8,+fullfp16,+ls64,+neon,+sve" } +// CHECK: attributes #[[ATTR13]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+complxnum,+fp-armv8,+fullfp16,+ls64,+neon,+sme" } +// CHECK: attributes #[[ATTR14]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+f32mm,+fp-armv8,+fullfp16,+i8mm,+ls64,+neon,+sha2,+sha3,+sve" } +// CHECK: attributes #[[ATTR15]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+dit,+fp-armv8,+fullfp16,+ls64,+neon,+sve" } +// CHECK: attributes #[[ATTR16]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccpp,+fullfp16,+ls64,+rcpc" } +// CHECK: attributes #[[ATTR17]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+ccdp,+ccpp,+fp-armv8,+fullfp16,+jsconv,+ls64,+neon" } +// CHECK: attributes #[[ATTR18]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fptoint,+fullfp16,+ls64,+rcpc" } +// CHECK: attributes #[[ATTR19]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+fullfp16,+ls64,+neon,+sve" } +// CHECK: attributes #[[ATTR20]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3" } +// CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm" } +// CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+mte,+neon,+sve,+sve2,+sve2-sm4" } +// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64,+mte,+rcpc,+rcpc3" } +// CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64,+sb" } +//. +// CHECK-NOFMV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } +// CHECK-NOFMV: attributes #[[ATTR1:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. +// CHECK-NOFMV: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK-NOFMV: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. 
diff --git a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp index 7cc9a9e03630a..2d3f448947991 100644 --- a/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp +++ b/clang/test/CodeGenCXX/attr-target-clones-aarch64.cpp @@ -37,17 +37,17 @@ void run_foo_tml() { // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @_Z7foo_ovli.ifunc = weak_odr ifunc i32 (i32), ptr @_Z7foo_ovli.resolver -// CHECK: @_Z7foo_ovlv.ifunc = weak_odr ifunc i32 (), ptr @_Z7foo_ovlv.resolver -// CHECK: @_ZN7MyClassIssE7foo_tmlEv.ifunc = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIssE7foo_tmlEv.resolver -// CHECK: @_ZN7MyClassIisE7foo_tmlEv.ifunc = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIisE7foo_tmlEv.resolver +// CHECK: @_Z7foo_ovli = weak_odr ifunc i32 (i32), ptr @_Z7foo_ovli.resolver +// CHECK: @_Z7foo_ovlv = weak_odr ifunc i32 (), ptr @_Z7foo_ovlv.resolver +// CHECK: @_ZN7MyClassIssE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIssE7foo_tmlEv.resolver +// CHECK: @_ZN7MyClassIisE7foo_tmlEv = weak_odr ifunc i32 (ptr), ptr @_ZN7MyClassIisE7foo_tmlEv.resolver // CHECK-LABEL: @_Z7foo_ovli._Mfp16Mls64_v( // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: ret i32 1 -// CHECK-LABEL: @_Z7foo_ovli( +// CHECK-LABEL: @_Z7foo_ovli.default( // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 @@ -67,7 +67,7 @@ void run_foo_tml() { // CHECK-LABEL: @_Z7foo_ovlv._Mls64Mls64_accdata( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 -// CHECK-LABEL: @_Z7foo_ovlv( +// CHECK-LABEL: @_Z7foo_ovlv.default( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 2 // CHECK-LABEL: @_Z7foo_ovlv.resolver( @@ -81,11 +81,11 @@ void run_foo_tml() { // CHECK: resolver_return: // CHECK-NEXT: ret ptr @_Z7foo_ovlv._Mls64Mls64_accdata // CHECK: resolver_else: -// CHECK-NEXT: ret ptr @_Z7foo_ovlv +// CHECK-NEXT: ret ptr @_Z7foo_ovlv.default // CHECK-LABEL: @_Z3barv( // CHECK-NEXT: entry: -// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_Z7foo_ovli.ifunc(i32 noundef 1) -// CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_Z7foo_ovlv.ifunc() +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_Z7foo_ovli(i32 noundef 1) +// CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_Z7foo_ovlv() // CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] // CHECK-NEXT: ret i32 [[ADD]] // CHECK-LABEL: @_Z11run_foo_tmlv( @@ -94,8 +94,8 @@ void run_foo_tml() { // CHECK-NEXT: [[MC2:%.*]] = alloca [[STRUCT_MYCLASS_0:%.*]], align 1 // CHECK-NEXT: [[MC3:%.*]] = alloca [[STRUCT_MYCLASS_1:%.*]], align 1 // CHECK-NEXT: [[MC4:%.*]] = alloca [[STRUCT_MYCLASS_2:%.*]], align 1 -// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN7MyClassIssE7foo_tmlEv.ifunc(ptr noundef nonnull align 1 dereferenceable(1) [[MC1]]) -// CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZN7MyClassIisE7foo_tmlEv.ifunc(ptr noundef nonnull align 1 dereferenceable(1) [[MC2]]) +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN7MyClassIssE7foo_tmlEv(ptr noundef nonnull align 1 dereferenceable(1) [[MC1]]) +// CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_ZN7MyClassIisE7foo_tmlEv(ptr noundef nonnull align 1 dereferenceable(1) [[MC2]]) // CHECK-NEXT: [[CALL2:%.*]] = call noundef i32 @_ZN7MyClassIfsE7foo_tmlEv(ptr noundef nonnull align 1 dereferenceable(1) [[MC3]]) // CHECK-NEXT: [[CALL3:%.*]] = call noundef i32 @_ZN7MyClassIdfE7foo_tmlEv(ptr 
noundef nonnull align 1 dereferenceable(1) [[MC4]]) // CHECK-NEXT: ret void @@ -163,7 +163,7 @@ void run_foo_tml() { // CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 1 -// CHECK-LABEL: @_ZN7MyClassIssE7foo_tmlEv( +// CHECK-LABEL: @_ZN7MyClassIssE7foo_tmlEv.default( // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 @@ -181,7 +181,7 @@ void run_foo_tml() { // CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 2 -// CHECK-LABEL: @_ZN7MyClassIisE7foo_tmlEv( +// CHECK-LABEL: @_ZN7MyClassIisE7foo_tmlEv.default( // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 diff --git a/clang/test/CodeGenCXX/attr-target-version.cpp b/clang/test/CodeGenCXX/attr-target-version.cpp index f8a4419f249f3..68dd7be1180b4 100644 --- a/clang/test/CodeGenCXX/attr-target-version.cpp +++ b/clang/test/CodeGenCXX/attr-target-version.cpp @@ -22,35 +22,109 @@ int bar() { } + + +//. // CHECK: @__aarch64_cpu_features = external dso_local global { i64 } -// CHECK: @_ZN7MyClass3gooEi.ifunc = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver // CHECK: @_Z3fooi.ifunc = weak_odr ifunc i32 (i32), ptr @_Z3fooi.resolver // CHECK: @_Z3foov.ifunc = weak_odr ifunc i32 (), ptr @_Z3foov.resolver - +// CHECK: @_ZN7MyClass3gooEi.ifunc = weak_odr ifunc i32 (ptr, i32), ptr @_ZN7MyClass3gooEi.resolver +//. // CHECK-LABEL: @_Z3fooi._Mbf16Msme-f64f64( // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: ret i32 1 -// CHECK-LABEL: @_Z3fooi( +// +// +// CHECK-LABEL: @_Z3fooi.resolver( +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @_Z3fooi._Mbf16Msme-f64f64 +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @_Z3fooi.default +// +// +// CHECK-LABEL: @_Z3foov._Msm4Mebf16( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret i32 3 +// +// +// CHECK-LABEL: @_Z3foov.resolver( +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 268435488 +// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @_Z3foov._Msm4Mebf16 +// CHECK: resolver_else: +// CHECK-NEXT: ret ptr @_Z3foov.default +// +// +// CHECK-LABEL: @_ZN7MyClass3gooEi.resolver( +// CHECK-NEXT: resolver_entry: +// CHECK-NEXT: call void @__init_cpu_features_resolver() +// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1024 +// 
CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1024 +// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] +// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] +// CHECK: resolver_return: +// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi._Mcrc +// CHECK: resolver_else: +// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16 +// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] +// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] +// CHECK: resolver_return1: +// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi._Mdotprod +// CHECK: resolver_else2: +// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi.default +// +// +// CHECK-LABEL: @_Z3barv( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[M:%.*]] = alloca [[STRUCT_MYCLASS:%.*]], align 1 +// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN7MyClass3gooEi.ifunc(ptr noundef nonnull align 1 dereferenceable(1) [[M]], i32 noundef 1) +// CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_Z3fooi.ifunc(i32 noundef 1) +// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] +// CHECK-NEXT: [[CALL2:%.*]] = call noundef i32 @_Z3foov.ifunc() +// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] +// CHECK-NEXT: ret i32 [[ADD3]] +// +// +// CHECK-LABEL: @_Z3fooi.default( // CHECK-NEXT: entry: // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: ret i32 2 -// CHECK-LABEL: @_Z3foov._Msm4Mebf16( -// CHECK-NEXT: entry: -// CHECK-NEXT: ret i32 3 -// CHECK-LABEL: @_Z3foov( +// +// +// CHECK-LABEL: @_Z3foov.default( // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 4 -// CHECK-LABEL: @_ZN7MyClass3gooEi( +// +// +// CHECK-LABEL: @_ZN7MyClass3gooEi._Mdotprod( // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK-NEXT: ret i32 1 +// CHECK-NEXT: ret i32 3 +// +// // CHECK-LABEL: @_ZN7MyClass3gooEi._Mcrc( // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 @@ -59,70 +133,24 @@ int bar() { // CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: ret i32 2 -// CHECK-LABEL: @_ZN7MyClass3gooEi._Mdotprod( +// +// +// CHECK-LABEL: @_ZN7MyClass3gooEi.default( // CHECK-NEXT: entry: // CHECK-NEXT: [[THIS_ADDR:%.*]] = alloca ptr, align 8 // CHECK-NEXT: [[DOTADDR:%.*]] = alloca i32, align 4 // CHECK-NEXT: store ptr [[THIS:%.*]], ptr [[THIS_ADDR]], align 8 // CHECK-NEXT: store i32 [[TMP0:%.*]], ptr [[DOTADDR]], align 4 // CHECK-NEXT: [[THIS1:%.*]] = load ptr, ptr [[THIS_ADDR]], align 8 -// CHECK-NEXT: ret i32 3 -// CHECK-LABEL: @_Z3barv( -// CHECK-NEXT: entry: -// CHECK-NEXT: [[M:%.*]] = alloca [[STRUCT_MYCLASS:%.*]], align 1 -// CHECK-NEXT: [[CALL:%.*]] = call noundef i32 @_ZN7MyClass3gooEi.ifunc(ptr noundef nonnull align 1 dereferenceable(1) [[M]], i32 noundef 1) -// CHECK-NEXT: [[CALL1:%.*]] = call noundef i32 @_Z3fooi.ifunc(i32 noundef 1) -// CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[CALL]], [[CALL1]] -// CHECK-NEXT: [[CALL2:%.*]] = call noundef i32 @_Z3foov.ifunc() -// CHECK-NEXT: [[ADD3:%.*]] = add nsw i32 [[ADD]], [[CALL2]] -// 
CHECK-NEXT: ret i32 [[ADD3]] -// CHECK-LABEL: @_ZN7MyClass3gooEi.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 1024 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 1024 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi._Mcrc -// CHECK: resolver_else: -// CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 16 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 16 -// CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] -// CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] -// CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi._Mdotprod -// CHECK: resolver_else2: -// CHECK-NEXT: ret ptr @_ZN7MyClass3gooEi -// CHECK-LABEL: @_Z3fooi.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 36028797153181696 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 36028797153181696 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z3fooi._Mbf16Msme-f64f64 -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @_Z3fooi -// CHECK-LABEL: @_Z3foov.resolver( -// CHECK-NEXT: resolver_entry: -// CHECK-NEXT: call void @__init_cpu_features_resolver() -// CHECK-NEXT: [[TMP0:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP1:%.*]] = and i64 [[TMP0]], 268435488 -// CHECK-NEXT: [[TMP2:%.*]] = icmp eq i64 [[TMP1]], 268435488 -// CHECK-NEXT: [[TMP3:%.*]] = and i1 true, [[TMP2]] -// CHECK-NEXT: br i1 [[TMP3]], label [[RESOLVER_RETURN:%.*]], label [[RESOLVER_ELSE:%.*]] -// CHECK: resolver_return: -// CHECK-NEXT: ret ptr @_Z3foov._Msm4Mebf16 -// CHECK: resolver_else: -// CHECK-NEXT: ret ptr @_Z3foov - -// CHECK: attributes #0 = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" } -// CHECK: attributes #1 = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } -// CHECK: attributes #2 = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+neon,+sm4" } -// CHECK: attributes #3 = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc" } -// CHECK: attributes #4 = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon" } +// CHECK-NEXT: ret i32 1 +// +//. 
+// CHECK: attributes #[[ATTR0:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+sme,+sme-f64f64" } +// CHECK: attributes #[[ATTR1:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+bf16,+fp-armv8,+neon,+sm4" } +// CHECK: attributes #[[ATTR2:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" } +// CHECK: attributes #[[ATTR3:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+dotprod,+fp-armv8,+neon" } +// CHECK: attributes #[[ATTR4:[0-9]+]] = { mustprogress noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+crc" } +//. +// CHECK: [[META0:![0-9]+]] = !{i32 1, !"wchar_size", i32 4} +// CHECK: [[META1:![0-9]+]] = !{!"{{.*}}clang version {{.*}}"} +//. From 7c53e9f6671b44ffd2c36d8c6f0f05456a6222ea Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Mon, 22 Jan 2024 18:59:19 +0000 Subject: [PATCH 459/843] [RemoveDIs][DebugInfo] Add support for DPValues to LoopStrengthReduce (#78706) This patch trivially extends support for DbgValueInst recovery to DPValues in LoopStrengthReduce; they are handled identically, so this is mostly done by reusing the DbgValueInst code (using templates or auto-parameter lambdas to reduce actual code duplication). --- .../Transforms/Scalar/LoopStrengthReduce.cpp | 240 ++++++++++-------- .../LoopStrengthReduce/X86/lsr-cond-dbg.ll | 1 + .../LoopStrengthReduce/dbg-preserve-0.ll | 1 + .../LoopStrengthReduce/dbg-preserve-1.ll | 1 + .../LoopStrengthReduce/dbg-preserve-2.ll | 1 + .../debuginfo-scev-salvage-0.ll | 1 + .../debuginfo-scev-salvage-1.ll | 1 + .../debuginfo-scev-salvage-2.ll | 1 + .../debuginfo-scev-salvage-3.ll | 1 + .../debuginfo-scev-salvage-4.ll | 1 + .../debuginfo-scev-salvage-5.ll | 1 + .../optimizemax_debugloc.ll | 1 + .../Transforms/LoopStrengthReduce/pr12018.ll | 1 + .../Transforms/LoopStrengthReduce/pr51329.ll | 1 + .../Transforms/LoopStrengthReduce/pr51656.ll | 1 + .../Transforms/LoopStrengthReduce/pr52161.ll | 1 + 16 files changed, 151 insertions(+), 104 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp index e496f79a8f8ce..9346da4384f9e 100644 --- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -6366,10 +6366,12 @@ struct SCEVDbgValueBuilder { /// and DIExpression. struct DVIRecoveryRec { DVIRecoveryRec(DbgValueInst *DbgValue) - : DVI(DbgValue), Expr(DbgValue->getExpression()), + : DbgRef(DbgValue), Expr(DbgValue->getExpression()), HadLocationArgList(false) {} + DVIRecoveryRec(DPValue *DPV) + : DbgRef(DPV), Expr(DPV->getExpression()), HadLocationArgList(false) {} - DbgValueInst *DVI; + PointerUnion DbgRef; DIExpression *Expr; bool HadLocationArgList; SmallVector LocationOps; @@ -6401,17 +6403,19 @@ static unsigned numLLVMArgOps(SmallVectorImpl &Expr) { /// Overwrites DVI with the location and Ops as the DIExpression. This will /// create an invalid expression if Ops has any dwarf::DW_OP_llvm_arg operands, /// because a DIArglist is not created for the first argument of the dbg.value. 
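A note on the mechanism, since the hunks below apply it repeatedly: the record's `DbgValueInst *` member becomes a `PointerUnion<DbgValueInst *, DPValue *>`, and each helper funnels both pointee types through one auto-parameter lambda. The following is a minimal, self-contained sketch of that dispatch idiom using stand-in types, not the real LLVM classes:

#include <cassert>

// Stand-ins for DbgValueInst and DPValue: unrelated types that expose the
// same member, so a single generic lambda can serve both.
struct DbgValueInst { int kind() const { return 1; } };
struct DPValue      { int kind() const { return 2; } };

// Simplified stand-in for llvm::PointerUnion<DbgValueInst *, DPValue *>.
struct DbgRef {
  void *Ptr;
  bool IsIntrinsic; // true => DbgValueInst *, false => DPValue *
};

int kindOf(DbgRef Ref) {
  // The shared logic is written once in an auto-parameter lambda; only the
  // cast at the two call sites differs, mirroring the patch's
  // UpdateDbgValueInstImpl / RestorePreTransformStateImpl helpers.
  auto Impl = [](auto *DbgVal) { return DbgVal->kind(); };
  if (Ref.IsIntrinsic)
    return Impl(static_cast<DbgValueInst *>(Ref.Ptr));
  return Impl(static_cast<DPValue *>(Ref.Ptr));
}

int main() {
  DbgValueInst DVI;
  DPValue DPV;
  assert(kindOf({&DVI, true}) == 1);
  assert(kindOf({&DPV, false}) == 2);
  return 0;
}

The alternative, a pair of near-identical overloads, is exactly the duplication the commit message says it is avoiding.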
-static void updateDVIWithLocation(DbgValueInst &DVI, Value *Location,
+template <typename T>
+static void updateDVIWithLocation(T &DbgVal, Value *Location,
                                   SmallVectorImpl<uint64_t> &Ops) {
-  assert(
-      numLLVMArgOps(Ops) == 0 &&
-      "Expected expression that does not contain any DW_OP_llvm_arg operands.");
-  DVI.setRawLocation(ValueAsMetadata::get(Location));
-  DVI.setExpression(DIExpression::get(DVI.getContext(), Ops));
+  assert(numLLVMArgOps(Ops) == 0 && "Expected expression that does not "
+                                    "contain any DW_OP_llvm_arg operands.");
+  DbgVal.setRawLocation(ValueAsMetadata::get(Location));
+  DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
 }
 
 /// Overwrite DVI with locations placed into a DIArglist.
-static void updateDVIWithLocations(DbgValueInst &DVI,
+template <typename T>
+static void updateDVIWithLocations(T &DbgVal,
                                    SmallVectorImpl<Value *> &Locations,
                                    SmallVectorImpl<uint64_t> &Ops) {
   assert(numLLVMArgOps(Ops) != 0 &&
@@ -6421,8 +6425,8 @@ static void updateDVIWithLocations(DbgValueInst &DVI,
   for (Value *V : Locations)
     MetadataLocs.push_back(ValueAsMetadata::get(V));
   auto ValArrayRef = llvm::ArrayRef(MetadataLocs);
-  DVI.setRawLocation(llvm::DIArgList::get(DVI.getContext(), ValArrayRef));
-  DVI.setExpression(DIExpression::get(DVI.getContext(), Ops));
+  DbgVal.setRawLocation(llvm::DIArgList::get(DbgVal.getContext(), ValArrayRef));
+  DbgVal.setExpression(DIExpression::get(DbgVal.getContext(), Ops));
 }
 
 /// Write the new expression and new location ops for the dbg.value. If possible
@@ -6433,30 +6437,37 @@ static void updateDVIWithLocations(DbgValueInst &DVI,
 static void UpdateDbgValueInst(DVIRecoveryRec &DVIRec,
                                SmallVectorImpl<Value *> &NewLocationOps,
                                SmallVectorImpl<uint64_t> &NewExpr) {
-  unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
-  if (NumLLVMArgs == 0) {
-    // Location assumed to be on the stack.
-    updateDVIWithLocation(*DVIRec.DVI, NewLocationOps[0], NewExpr);
-  } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
-    // There is only a single DW_OP_llvm_arg at the start of the expression,
-    // so it can be omitted along with DIArglist.
-    assert(NewExpr[1] == 0 &&
-           "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
-    llvm::SmallVector<uint64_t> ShortenedOps(llvm::drop_begin(NewExpr, 2));
-    updateDVIWithLocation(*DVIRec.DVI, NewLocationOps[0], ShortenedOps);
-  } else {
-    // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
-    updateDVIWithLocations(*DVIRec.DVI, NewLocationOps, NewExpr);
-  }
+  auto UpdateDbgValueInstImpl = [&](auto *DbgVal) {
+    unsigned NumLLVMArgs = numLLVMArgOps(NewExpr);
+    if (NumLLVMArgs == 0) {
+      // Location assumed to be on the stack.
+      updateDVIWithLocation(*DbgVal, NewLocationOps[0], NewExpr);
+    } else if (NumLLVMArgs == 1 && NewExpr[0] == dwarf::DW_OP_LLVM_arg) {
+      // There is only a single DW_OP_llvm_arg at the start of the expression,
+      // so it can be omitted along with DIArglist.
+      assert(NewExpr[1] == 0 &&
+             "Lone LLVM_arg in a DIExpression should refer to location-op 0.");
+      llvm::SmallVector<uint64_t> ShortenedOps(llvm::drop_begin(NewExpr, 2));
+      updateDVIWithLocation(*DbgVal, NewLocationOps[0], ShortenedOps);
+    } else {
+      // Multiple DW_OP_llvm_arg, so DIArgList is strictly necessary.
+      updateDVIWithLocations(*DbgVal, NewLocationOps, NewExpr);
+    }
 
-  // If the DIExpression was previously empty then add the stack terminator.
-  // Non-empty expressions have only had elements inserted into them and so the
-  // terminator should already be present e.g. stack_value or fragment.
-  DIExpression *SalvageExpr = DVIRec.DVI->getExpression();
-  if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
-    SalvageExpr = DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
-    DVIRec.DVI->setExpression(SalvageExpr);
-  }
+    // If the DIExpression was previously empty then add the stack terminator.
+    // Non-empty expressions have only had elements inserted into them and so
+    // the terminator should already be present e.g. stack_value or fragment.
+    DIExpression *SalvageExpr = DbgVal->getExpression();
+    if (!DVIRec.Expr->isComplex() && SalvageExpr->isComplex()) {
+      SalvageExpr =
+          DIExpression::append(SalvageExpr, {dwarf::DW_OP_stack_value});
+      DbgVal->setExpression(SalvageExpr);
+    }
+  };
+  if (isa<DbgValueInst *>(DVIRec.DbgRef))
+    UpdateDbgValueInstImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
+  else
+    UpdateDbgValueInstImpl(cast<DPValue *>(DVIRec.DbgRef));
 }
 
 /// Cached location ops may be erased during LSR, in which case a poison is
@@ -6470,40 +6481,49 @@ static Value *getValueOrPoison(WeakVH &VH, LLVMContext &C) {
 /// Restore the DVI's pre-LSR arguments. Substitute undef for any erased values.
 static void restorePreTransformState(DVIRecoveryRec &DVIRec) {
-  LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
-                    << "scev-salvage: post-LSR: " << *DVIRec.DVI << '\n');
-  assert(DVIRec.Expr && "Expected an expression");
-  DVIRec.DVI->setExpression(DVIRec.Expr);
-
-  // Even a single location-op may be inside a DIArgList and referenced with
-  // DW_OP_LLVM_arg, which is valid only with a DIArgList.
-  if (!DVIRec.HadLocationArgList) {
-    assert(DVIRec.LocationOps.size() == 1 &&
-           "Unexpected number of location ops.");
-    // LSR's unsuccessful salvage attempt may have added DIArgList, which in
-    // this case was not present before, so force the location back to a single
-    // uncontained Value.
-    Value *CachedValue =
-        getValueOrPoison(DVIRec.LocationOps[0], DVIRec.DVI->getContext());
-    DVIRec.DVI->setRawLocation(ValueAsMetadata::get(CachedValue));
-  } else {
-    SmallVector<ValueAsMetadata *, 3> MetadataLocs;
-    for (WeakVH VH : DVIRec.LocationOps) {
-      Value *CachedValue = getValueOrPoison(VH, DVIRec.DVI->getContext());
-      MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
+  auto RestorePreTransformStateImpl = [&](auto *DbgVal) {
+    LLVM_DEBUG(dbgs() << "scev-salvage: restore dbg.value to pre-LSR state\n"
+                      << "scev-salvage: post-LSR: " << *DbgVal << '\n');
+    assert(DVIRec.Expr && "Expected an expression");
+    DbgVal->setExpression(DVIRec.Expr);
+
+    // Even a single location-op may be inside a DIArgList and referenced with
+    // DW_OP_LLVM_arg, which is valid only with a DIArgList.
+    if (!DVIRec.HadLocationArgList) {
+      assert(DVIRec.LocationOps.size() == 1 &&
+             "Unexpected number of location ops.");
+      // LSR's unsuccessful salvage attempt may have added DIArgList, which in
+      // this case was not present before, so force the location back to a
+      // single uncontained Value.
+      Value *CachedValue =
+          getValueOrPoison(DVIRec.LocationOps[0], DbgVal->getContext());
+      DbgVal->setRawLocation(ValueAsMetadata::get(CachedValue));
+    } else {
+      SmallVector<ValueAsMetadata *, 3> MetadataLocs;
+      for (WeakVH VH : DVIRec.LocationOps) {
+        Value *CachedValue = getValueOrPoison(VH, DbgVal->getContext());
+        MetadataLocs.push_back(ValueAsMetadata::get(CachedValue));
+      }
+      auto ValArrayRef = llvm::ArrayRef(MetadataLocs);
+      DbgVal->setRawLocation(
+          llvm::DIArgList::get(DbgVal->getContext(), ValArrayRef));
     }
-    auto ValArrayRef = llvm::ArrayRef(MetadataLocs);
-    DVIRec.DVI->setRawLocation(
-        llvm::DIArgList::get(DVIRec.DVI->getContext(), ValArrayRef));
-  }
-  LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DVIRec.DVI << '\n');
+    LLVM_DEBUG(dbgs() << "scev-salvage: pre-LSR: " << *DbgVal << '\n');
+  };
+  if (isa<DbgValueInst *>(DVIRec.DbgRef))
+    RestorePreTransformStateImpl(cast<DbgValueInst *>(DVIRec.DbgRef));
+  else
+    RestorePreTransformStateImpl(cast<DPValue *>(DVIRec.DbgRef));
 }
 
 static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
                        llvm::PHINode *LSRInductionVar, DVIRecoveryRec &DVIRec,
                        const SCEV *SCEVInductionVar,
                        SCEVDbgValueBuilder IterCountExpr) {
-  if (!DVIRec.DVI->isKillLocation())
+
+  if (isa<DbgValueInst *>(DVIRec.DbgRef)
+          ? !cast<DbgValueInst *>(DVIRec.DbgRef)->isKillLocation()
+          : !cast<DPValue *>(DVIRec.DbgRef)->isKillLocation())
     return false;
 
   // LSR may have caused several changes to the dbg.value in the failed salvage
@@ -6596,16 +6616,20 @@ static bool SalvageDVI(llvm::Loop *L, ScalarEvolution &SE,
   }
 
   UpdateDbgValueInst(DVIRec, NewLocationOps, NewExpr);
-  LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: " << *DVIRec.DVI << "\n");
+  if (isa<DbgValueInst *>(DVIRec.DbgRef))
+    LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
+                      << *cast<DbgValueInst *>(DVIRec.DbgRef) << "\n");
+  else
+    LLVM_DEBUG(dbgs() << "scev-salvage: Updated DVI: "
+                      << *cast<DPValue *>(DVIRec.DbgRef) << "\n");
   return true;
 }
 
 /// Obtain an expression for the iteration count, then attempt to salvage the
 /// dbg.value intrinsics.
-static void
-DbgRewriteSalvageableDVIs(llvm::Loop *L, ScalarEvolution &SE,
-                          llvm::PHINode *LSRInductionVar,
-                          SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
+static void DbgRewriteSalvageableDVIs(
+    llvm::Loop *L, ScalarEvolution &SE, llvm::PHINode *LSRInductionVar,
+    SmallVector<std::unique_ptr<DVIRecoveryRec>, 2> &DVIToUpdate) {
   if (DVIToUpdate.empty())
     return;
 
@@ -6647,48 +6671,56 @@ static void DbgGatherSalvagableDVI(
     SmallSet<AssertingVH<DbgValueInst>, 2> &DVIHandles) {
   for (const auto &B : L->getBlocks()) {
     for (auto &I : *B) {
-      auto DVI = dyn_cast<DbgValueInst>(&I);
-      if (!DVI)
-        continue;
-      // Ensure that if any location op is undef that the dbg.value is not
-      // cached.
-      if (DVI->isKillLocation())
-        continue;
-
-      // Check that the location op SCEVs are suitable for translation to
-      // DIExpression.
-      const auto &HasTranslatableLocationOps =
-          [&](const DbgValueInst *DVI) -> bool {
-        for (const auto LocOp : DVI->location_ops()) {
-          if (!LocOp)
-            return false;
-
-          if (!SE.isSCEVable(LocOp->getType()))
-            return false;
-
-          const SCEV *S = SE.getSCEV(LocOp);
-          if (SE.containsUndefs(S))
-            return false;
+      auto ProcessDbgValue = [&](auto *DbgVal) -> bool {
+        // Ensure that if any location op is undef that the dbg.value is not
+        // cached.
+        if (DbgVal->isKillLocation())
+          return false;
+
+        // Check that the location op SCEVs are suitable for translation to
+        // DIExpression.
+        const auto &HasTranslatableLocationOps =
+            [&](const auto *DbgValToTranslate) -> bool {
+          for (const auto LocOp : DbgValToTranslate->location_ops()) {
+            if (!LocOp)
+              return false;
+
+            if (!SE.isSCEVable(LocOp->getType()))
+              return false;
+
+            const SCEV *S = SE.getSCEV(LocOp);
+            if (SE.containsUndefs(S))
+              return false;
+          }
+          return true;
+        };
+
+        if (!HasTranslatableLocationOps(DbgVal))
+          return false;
+
+        std::unique_ptr<DVIRecoveryRec> NewRec =
+            std::make_unique<DVIRecoveryRec>(DbgVal);
+        // Each location Op may need a SCEVDbgValueBuilder in order to recover
+        // it. Pre-allocating a vector will enable quick lookups of the builder
+        // later during the salvage.
+        NewRec->RecoveryExprs.resize(DbgVal->getNumVariableLocationOps());
+        for (const auto LocOp : DbgVal->location_ops()) {
+          NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
+          NewRec->LocationOps.push_back(LocOp);
+          NewRec->HadLocationArgList = DbgVal->hasArgList();
         }
+        SalvageableDVISCEVs.push_back(std::move(NewRec));
         return true;
       };
-
-      if (!HasTranslatableLocationOps(DVI))
-        continue;
-
-      std::unique_ptr<DVIRecoveryRec> NewRec =
-          std::make_unique<DVIRecoveryRec>(DVI);
-      // Each location Op may need a SCEVDbgValueBuilder in order to recover it.
-      // Pre-allocating a vector will enable quick lookups of the builder later
-      // during the salvage.
-      NewRec->RecoveryExprs.resize(DVI->getNumVariableLocationOps());
-      for (const auto LocOp : DVI->location_ops()) {
-        NewRec->SCEVs.push_back(SE.getSCEV(LocOp));
-        NewRec->LocationOps.push_back(LocOp);
-        NewRec->HadLocationArgList = DVI->hasArgList();
+      for (auto &DPV : I.getDbgValueRange()) {
+        if (DPV.isDbgValue() || DPV.isDbgAssign())
+          ProcessDbgValue(&DPV);
       }
-      SalvageableDVISCEVs.push_back(std::move(NewRec));
-      DVIHandles.insert(DVI);
+      auto DVI = dyn_cast<DbgValueInst>(&I);
+      if (!DVI)
+        continue;
+      if (ProcessDbgValue(DVI))
+        DVIHandles.insert(DVI);
     }
   }
 }
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-cond-dbg.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-cond-dbg.ll
index 7be45db67a0f9..01c7950ab702d 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-cond-dbg.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/lsr-cond-dbg.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -S -loop-reduce < %s | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators -S -loop-reduce < %s | FileCheck %s
 
 ; During Loop Strength Reduce, if the terminating condition for the loop is not
 ; immediately adjacent to the terminating branch and it has more than one use,
diff --git a/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-0.ll b/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-0.ll
index 5e983524e408d..204ea5d320f53 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-0.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-0.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -loop-reduce -S %s | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators -loop-reduce -S %s | FileCheck %s
 
 ;; Test that LSR preserves debug-info for induction variables and scev-based
 ;; salvaging produces short DIExpressions that use a constant offset from the
diff --git a/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-1.ll b/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-1.ll
index 7aa6d97d935dd..8b857e1bb6845 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-1.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-1.ll
@@ -1,4 +1,5 @@
 ; RUN: opt < %s -loop-reduce -S | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators < %s -loop-reduce -S | FileCheck %s
 ;
 ; Test that LSR avoids crashing on very large integer inputs.
It should ; discard the variable location by creating an undef dbg.value. diff --git a/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-2.ll b/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-2.ll index 8bbcc4d3b3a55..ec9c48628179a 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-2.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/dbg-preserve-2.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -loop-reduce -S | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators < %s -loop-reduce -S | FileCheck %s ; Test that LSR does not produce invalid debug info when a debug value is ; salvaged during LSR by adding additional location operands, then becomes diff --git a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-0.ll b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-0.ll index 1767017232342..836f525422173 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-0.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-0.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-reduce %s -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S -loop-reduce %s -o - | FileCheck %s ; REQUIRES: x86-registered-target ;; Ensure that we retain debuginfo for the induction variable and dependant diff --git a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-1.ll b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-1.ll index 57ec5b2214233..d20b99283fbbc 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-1.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-1.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-reduce %s -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S -loop-reduce %s -o - | FileCheck %s ; REQUIRES: x86-registered-target ;; Ensure that we retain debuginfo for the induction variable and dependant diff --git a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-2.ll b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-2.ll index ececeb7cc575e..789f47c87d174 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-2.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-2.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-reduce %s -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S -loop-reduce %s -o - | FileCheck %s ; REQUIRES: x86-registered-target ;; Ensure that we retain debuginfo for the induction variable and dependant diff --git a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-3.ll b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-3.ll index 5fe00e7d1cafc..1c5dff9481693 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-3.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-3.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-reduce %s -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S -loop-reduce %s -o - | FileCheck %s ; REQUIRES: x86-registered-target ;; Ensure that we retain debuginfo for the induction variable and dependant diff --git a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-4.ll b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-4.ll index a3dd7a74edbf7..6456ed43aee64 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-4.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-4.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-reduce %s -o - | FileCheck %s +; RUN: opt 
--try-experimental-debuginfo-iterators -S -loop-reduce %s -o - | FileCheck %s ; REQUIRES: x86-registered-target ;; Ensure that we retain debuginfo for the induction variable and dependant diff --git a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-5.ll b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-5.ll index 5656acf8f60e8..b06757ccf9158 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-5.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/debuginfo-scev-salvage-5.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-reduce %s | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S -loop-reduce %s | FileCheck %s ; REQUIRES: x86-registered-target ;; Ensure that SCEV-based salvaging in Loop Strength Reduction can salvage diff --git a/llvm/test/Transforms/LoopStrengthReduce/optimizemax_debugloc.ll b/llvm/test/Transforms/LoopStrengthReduce/optimizemax_debugloc.ll index d2d5550c3d68f..60ea02c1d2a7b 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/optimizemax_debugloc.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/optimizemax_debugloc.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -loop-reduce -S 2>&1 | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators < %s -loop-reduce -S 2>&1 | FileCheck %s ;; This test case checks that whether the new icmp instruction preserves ;; the debug location of the original instruction for %exitcond ; CHECK: icmp uge i32 %indvar.next, %n, !dbg ![[DBGLOC:[0-9]+]] diff --git a/llvm/test/Transforms/LoopStrengthReduce/pr12018.ll b/llvm/test/Transforms/LoopStrengthReduce/pr12018.ll index 78fef12d28750..62b2fb7fca8d9 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/pr12018.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/pr12018.ll @@ -1,4 +1,5 @@ ; RUN: opt < %s -loop-reduce +; RUN: opt --try-experimental-debuginfo-iterators < %s -loop-reduce target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128" diff --git a/llvm/test/Transforms/LoopStrengthReduce/pr51329.ll b/llvm/test/Transforms/LoopStrengthReduce/pr51329.ll index ffdb785b8ef8f..52a8f1921dae9 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/pr51329.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/pr51329.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-reduce %s | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S -loop-reduce %s | FileCheck %s ; ; Test that LSR SCEV-based salvaging does not crash when translating SCEVs ; that contain integers with binary representations greater than 64-bits. diff --git a/llvm/test/Transforms/LoopStrengthReduce/pr51656.ll b/llvm/test/Transforms/LoopStrengthReduce/pr51656.ll index da9ce16b6d860..1d5886c01c737 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/pr51656.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/pr51656.ll @@ -1,4 +1,5 @@ ; RUN: opt -loop-reduce -S %s | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -loop-reduce -S %s | FileCheck %s ;; This test ensures that no attempt is made to translate long SCEVs into ;; DIExpressions. 
Attempting the translation can use excessive resources and diff --git a/llvm/test/Transforms/LoopStrengthReduce/pr52161.ll b/llvm/test/Transforms/LoopStrengthReduce/pr52161.ll index 47b8fda174982..1a679392a0600 100644 --- a/llvm/test/Transforms/LoopStrengthReduce/pr52161.ll +++ b/llvm/test/Transforms/LoopStrengthReduce/pr52161.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -loop-reduce %s | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S -loop-reduce %s | FileCheck %s ;; Ensure that scev-based salvaging in LSR does not select an IV containing ;; an 'undef' element. From 6d5f8d3e6e473a6890f232dba8b14b7b46140558 Mon Sep 17 00:00:00 2001 From: Kevin Frei Date: Mon, 22 Jan 2024 11:09:48 -0800 Subject: [PATCH 460/843] Added settings for DEBUGINFOD cache location and timeout (#78605) I've been working on more/better configuration for improving DEBUGINFOD support. This is the first (and easiest) slice of the work. I've added `timeout` and `cache-path` settings that can override the DEBUGINFOD library defaults (and environment variables.) I also renamed the `plugin.symbol-locator.debuginfod.server_urls` setting to `server-urls` to be more consistent with the rest of LLDB's settings (the underscore switch is switched to a hyphen) I've got a few tests that validate the cache-path setting (as a side-effect), but they've exposed a few bugs that I'll be putting up a separate PR for (which will include the tests). --------- Co-authored-by: Kevin Frei --- .../Debuginfod/SymbolLocatorDebuginfod.cpp | 83 +++++++++++++++---- .../SymbolLocatorDebuginfodProperties.td | 8 +- llvm/include/llvm/Debuginfod/Debuginfod.h | 14 ++++ llvm/lib/Debuginfod/Debuginfod.cpp | 36 +++++--- 4 files changed, 113 insertions(+), 28 deletions(-) diff --git a/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp b/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp index 111be6be36524..2cd7bbbb24490 100644 --- a/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp +++ b/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfod.cpp @@ -9,7 +9,10 @@ #include "SymbolLocatorDebuginfod.h" #include "lldb/Core/PluginManager.h" +#include "lldb/Interpreter/OptionValueString.h" #include "lldb/Utility/Args.h" +#include "lldb/Utility/LLDBLog.h" +#include "lldb/Utility/Log.h" #include "llvm/Debuginfod/Debuginfod.h" #include "llvm/Debuginfod/HTTPClient.h" @@ -54,6 +57,32 @@ class PluginProperties : public Properties { return urls; } + llvm::Expected GetCachePath() { + OptionValueString *s = + m_collection_sp->GetPropertyAtIndexAsOptionValueString( + ePropertySymbolCachePath); + // If we don't have a valid cache location, use the default one. 
+ if (!s || !s->GetCurrentValueAsRef().size()) { + llvm::Expected maybeCachePath = + llvm::getDefaultDebuginfodCacheDirectory(); + if (!maybeCachePath) + return maybeCachePath; + return *maybeCachePath; + } + return s->GetCurrentValue(); + } + + std::chrono::milliseconds GetTimeout() const { + std::optional seconds = + m_collection_sp->GetPropertyAtIndexAs(ePropertyTimeout); + if (seconds && *seconds != 0) { + return std::chrono::duration_cast( + std::chrono::seconds(*seconds)); + } else { + return llvm::getDefaultDebuginfodTimeout(); + } + } + private: void ServerURLsChangedCallback() { m_server_urls = GetDebugInfoDURLs(); @@ -112,31 +141,51 @@ SymbolLocator *SymbolLocatorDebuginfod::CreateInstance() { return new SymbolLocatorDebuginfod(); } -static std::optional GetFileForModule( - const ModuleSpec &module_spec, - std::function(llvm::object::BuildIDRef)> - PullFromServer) { - if (!ModuleList::GetGlobalModuleListProperties().GetEnableExternalLookup()) - return {}; +static std::optional +GetFileForModule(const ModuleSpec &module_spec, + std::function UrlBuilder) { const UUID &module_uuid = module_spec.GetUUID(); - if (module_uuid.IsValid() && llvm::canUseDebuginfod()) { - llvm::object::BuildID build_id(module_uuid.GetBytes()); - llvm::Expected result = PullFromServer(build_id); - if (result) - return FileSpec(*result); - // An error here should be logged as a failure in the Debuginfod library, - // so just consume it here - consumeError(result.takeError()); - } + // Don't bother if we don't have a valid UUID, Debuginfod isn't available, + // or if the 'symbols.enable-external-lookup' setting is false. + if (!module_uuid.IsValid() || !llvm::canUseDebuginfod() || + !ModuleList::GetGlobalModuleListProperties().GetEnableExternalLookup()) + return {}; + + // Grab LLDB's Debuginfod overrides from the + // plugin.symbol-locator.debuginfod.* settings. + PluginProperties &plugin_props = GetGlobalPluginProperties(); + llvm::Expected cache_path_or_err = plugin_props.GetCachePath(); + // A cache location is *required*. + if (!cache_path_or_err) + return {}; + std::string cache_path = *cache_path_or_err; + llvm::SmallVector debuginfod_urls = + llvm::getDefaultDebuginfodUrls(); + std::chrono::milliseconds timeout = plugin_props.GetTimeout(); + + // We're ready to ask the Debuginfod library to find our file. 
+ llvm::object::BuildID build_id(module_uuid.GetBytes()); + std::string url_path = UrlBuilder(build_id); + std::string cache_key = llvm::getDebuginfodCacheKey(url_path); + llvm::Expected result = llvm::getCachedOrDownloadArtifact( + cache_key, url_path, cache_path, debuginfod_urls, timeout); + if (result) + return FileSpec(*result); + + Log *log = GetLog(LLDBLog::Symbols); + auto err_message = llvm::toString(result.takeError()); + LLDB_LOGV(log, + "Debuginfod failed to download symbol artifact {0} with error {1}", + url_path, err_message); return {}; } std::optional SymbolLocatorDebuginfod::LocateExecutableObjectFile( const ModuleSpec &module_spec) { - return GetFileForModule(module_spec, llvm::getCachedOrDownloadExecutable); + return GetFileForModule(module_spec, llvm::getDebuginfodExecutableUrlPath); } std::optional SymbolLocatorDebuginfod::LocateExecutableSymbolFile( const ModuleSpec &module_spec, const FileSpecList &default_search_paths) { - return GetFileForModule(module_spec, llvm::getCachedOrDownloadDebuginfo); + return GetFileForModule(module_spec, llvm::getDebuginfodDebuginfoUrlPath); } diff --git a/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfodProperties.td b/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfodProperties.td index 1c668b001a165..0ff02674b8ea3 100644 --- a/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfodProperties.td +++ b/lldb/source/Plugins/SymbolLocator/Debuginfod/SymbolLocatorDebuginfodProperties.td @@ -1,7 +1,13 @@ include "../../../../include/lldb/Core/PropertiesBase.td" let Definition = "symbollocatordebuginfod" in { - def ServerURLs : Property<"server_urls", "Array">, + def ServerURLs : Property<"server-urls", "Array">, ElementType<"String">, Desc<"An ordered list of Debuginfod server URLs to query for symbols. This defaults to the contents of the DEBUGINFOD_URLS environment variable.">; + def SymbolCachePath: Property<"cache-path", "String">, + DefaultStringValue<"">, + Desc<"The path where symbol files should be cached. This defaults to LLDB's system cache location.">; + def Timeout : Property<"timeout", "UInt64">, + DefaultUnsignedValue<0>, + Desc<"Timeout (in seconds) for requests made to a DEBUGINFOD server. A value of zero means we use the debuginfod default timeout: DEBUGINFOD_TIMEOUT if the environment variable is set and 90 seconds otherwise.">; } diff --git a/llvm/include/llvm/Debuginfod/Debuginfod.h b/llvm/include/llvm/Debuginfod/Debuginfod.h index 251fd7005305e..ef03948a706c0 100644 --- a/llvm/include/llvm/Debuginfod/Debuginfod.h +++ b/llvm/include/llvm/Debuginfod/Debuginfod.h @@ -46,6 +46,9 @@ bool canUseDebuginfod(); /// environment variable. SmallVector getDefaultDebuginfodUrls(); +/// Returns the cache key for a given debuginfod URL path. +std::string getDebuginfodCacheKey(StringRef UrlPath); + /// Sets the list of debuginfod server URLs to query. This overrides the /// environment variable DEBUGINFOD_URLS. void setDefaultDebuginfodUrls(const SmallVector &URLs); @@ -58,15 +61,26 @@ Expected getDefaultDebuginfodCacheDirectory(); /// DEBUGINFOD_TIMEOUT environment variable, default is 90 seconds (90000 ms). std::chrono::milliseconds getDefaultDebuginfodTimeout(); +/// Get the full URL path for a source request of a given BuildID and file +/// path. +std::string getDebuginfodSourceUrlPath(object::BuildIDRef ID, + StringRef SourceFilePath); + /// Fetches a specified source file by searching the default local cache /// directory and server URLs. 
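For illustration only (the URL and cache directory below are placeholders, not defaults), the renamed and newly added properties defined above can be exercised from the LLDB command line:

(lldb) settings set plugin.symbol-locator.debuginfod.server-urls "http://127.0.0.1:8002"
(lldb) settings set plugin.symbol-locator.debuginfod.cache-path /tmp/lldb-debuginfod-cache
(lldb) settings set plugin.symbol-locator.debuginfod.timeout 15

Per the property descriptions, an empty cache-path falls back to the default Debuginfod cache directory and a timeout of 0 defers to the library (DEBUGINFOD_TIMEOUT if set, otherwise 90 seconds).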
Expected getCachedOrDownloadSource(object::BuildIDRef ID, StringRef SourceFilePath); +/// Get the full URL path for an executable request of a given BuildID. +std::string getDebuginfodExecutableUrlPath(object::BuildIDRef ID); + /// Fetches an executable by searching the default local cache directory and /// server URLs. Expected getCachedOrDownloadExecutable(object::BuildIDRef ID); +/// Get the full URL path for a debug binary request of a given BuildID. +std::string getDebuginfodDebuginfoUrlPath(object::BuildIDRef ID); + /// Fetches a debug binary by searching the default local cache directory and /// server URLs. Expected getCachedOrDownloadDebuginfo(object::BuildIDRef ID); diff --git a/llvm/lib/Debuginfod/Debuginfod.cpp b/llvm/lib/Debuginfod/Debuginfod.cpp index 9df30ab55cbad..4928fcb3b3f87 100644 --- a/llvm/lib/Debuginfod/Debuginfod.cpp +++ b/llvm/lib/Debuginfod/Debuginfod.cpp @@ -54,7 +54,7 @@ std::optional> DebuginfodUrls; llvm::sys::RWMutex UrlsMutex; } // namespace -static std::string uniqueKey(llvm::StringRef S) { +std::string getDebuginfodCacheKey(llvm::StringRef S) { return utostr(xxh3_64bits(S)); } @@ -72,7 +72,7 @@ SmallVector getDefaultDebuginfodUrls() { std::shared_lock ReadGuard(UrlsMutex); if (!DebuginfodUrls) { // Only read from the environment variable if the user hasn't already - // set the value + // set the value. ReadGuard.unlock(); std::unique_lock WriteGuard(UrlsMutex); DebuginfodUrls = SmallVector(); @@ -86,7 +86,7 @@ SmallVector getDefaultDebuginfodUrls() { return DebuginfodUrls.value(); } -// Set the default debuginfod URL list, override the environment variable +// Set the default debuginfod URL list, override the environment variable. void setDefaultDebuginfodUrls(const SmallVector &URLs) { std::unique_lock WriteGuard(UrlsMutex); DebuginfodUrls = URLs; @@ -120,27 +120,43 @@ std::chrono::milliseconds getDefaultDebuginfodTimeout() { /// cache and return the cached file path. They first search the local cache, /// followed by the debuginfod servers. 
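Splitting "compute the URL path" out of the fetch functions lets a client derive the cache key itself, which is what LLDB's GetFileForModule above now does. A rough sketch composing the new helpers follows; it mirrors what getCachedOrDownloadDebuginfo does internally and is not code from this patch:

#include "llvm/Debuginfod/Debuginfod.h"

using namespace llvm;

// Build the artifact's URL path ("buildid/<ID>/debuginfo"), hash it into a
// cache key, then let the library search the local cache before querying
// the configured servers.
static Expected<std::string> fetchDebuginfoArtifact(object::BuildIDRef ID) {
  std::string UrlPath = getDebuginfodDebuginfoUrlPath(ID);
  std::string CacheKey = getDebuginfodCacheKey(UrlPath); // xxh3 of the path
  return getCachedOrDownloadArtifact(CacheKey, UrlPath);
}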
-Expected getCachedOrDownloadSource(BuildIDRef ID, - StringRef SourceFilePath) { +std::string getDebuginfodSourceUrlPath(BuildIDRef ID, + StringRef SourceFilePath) { SmallString<64> UrlPath; sys::path::append(UrlPath, sys::path::Style::posix, "buildid", buildIDToString(ID), "source", sys::path::convert_to_slash(SourceFilePath)); - return getCachedOrDownloadArtifact(uniqueKey(UrlPath), UrlPath); + return std::string(UrlPath); } -Expected getCachedOrDownloadExecutable(BuildIDRef ID) { +Expected getCachedOrDownloadSource(BuildIDRef ID, + StringRef SourceFilePath) { + std::string UrlPath = getDebuginfodSourceUrlPath(ID, SourceFilePath); + return getCachedOrDownloadArtifact(getDebuginfodCacheKey(UrlPath), UrlPath); +} + +std::string getDebuginfodExecutableUrlPath(BuildIDRef ID) { SmallString<64> UrlPath; sys::path::append(UrlPath, sys::path::Style::posix, "buildid", buildIDToString(ID), "executable"); - return getCachedOrDownloadArtifact(uniqueKey(UrlPath), UrlPath); + return std::string(UrlPath); } -Expected getCachedOrDownloadDebuginfo(BuildIDRef ID) { +Expected getCachedOrDownloadExecutable(BuildIDRef ID) { + std::string UrlPath = getDebuginfodExecutableUrlPath(ID); + return getCachedOrDownloadArtifact(getDebuginfodCacheKey(UrlPath), UrlPath); +} + +std::string getDebuginfodDebuginfoUrlPath(BuildIDRef ID) { SmallString<64> UrlPath; sys::path::append(UrlPath, sys::path::Style::posix, "buildid", buildIDToString(ID), "debuginfo"); - return getCachedOrDownloadArtifact(uniqueKey(UrlPath), UrlPath); + return std::string(UrlPath); +} + +Expected getCachedOrDownloadDebuginfo(BuildIDRef ID) { + std::string UrlPath = getDebuginfodDebuginfoUrlPath(ID); + return getCachedOrDownloadArtifact(getDebuginfodCacheKey(UrlPath), UrlPath); } // General fetching function. From d83a3ea529575f17ea6ea608a43bca0b3065dcfe Mon Sep 17 00:00:00 2001 From: Sanjay Marreddi Date: Mon, 22 Jan 2024 19:15:05 +0000 Subject: [PATCH 461/843] [libc++] Fix std::regex_search to match $ alone with match_default flag (#78845) Using std::regex_search with the regex_constant match_default and a simple regex pattern `$` is expected to match general strings such as _"a", "ab", "abc"..._ at `[last, last)` positions. But, the current implementation fails to do so. 
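As a self-contained illustration of the fixed behavior (it mirrors the new test cases below):

#include <cassert>
#include <regex>
#include <string>

int main() {
  std::string target = "abc";
  std::regex re("$");
  std::smatch m;
  // With the fix, "$" matches the empty sequence at [last, last):
  // position 3 and length 0 for a three-character input.
  assert(std::regex_search(target, m, re));
  assert(m.position(0) == 3 && m.length(0) == 0);
  // match_not_eol declares that 'last' is not an end-of-line, so the
  // anchor must not match there.
  assert(!std::regex_search(target, re, std::regex_constants::match_not_eol));
  return 0;
}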
Fixes #75042 --- libcxx/include/regex | 8 ++++ .../re.matchflag/match_not_eol.pass.cpp | 39 +++++++++++++++++++ 2 files changed, 47 insertions(+) diff --git a/libcxx/include/regex b/libcxx/include/regex index b575a267583b5..48af5b8b57fd6 100644 --- a/libcxx/include/regex +++ b/libcxx/include/regex @@ -5124,6 +5124,14 @@ bool basic_regex<_CharT, _Traits>::__search( } __m.__matches_.assign(__m.size(), __m.__unmatched_); } + __m.__matches_.assign(__m.size(), __m.__unmatched_); + if (__match_at_start(__first, __last, __m, __flags, false)) { + __m.__prefix_.second = __m[0].first; + __m.__prefix_.matched = __m.__prefix_.first != __m.__prefix_.second; + __m.__suffix_.first = __m[0].second; + __m.__suffix_.matched = __m.__suffix_.first != __m.__suffix_.second; + return true; + } } __m.__matches_.clear(); return false; diff --git a/libcxx/test/std/re/re.const/re.matchflag/match_not_eol.pass.cpp b/libcxx/test/std/re/re.const/re.matchflag/match_not_eol.pass.cpp index edeea517d2537..89d939eaf809f 100644 --- a/libcxx/test/std/re/re.const/re.matchflag/match_not_eol.pass.cpp +++ b/libcxx/test/std/re/re.const/re.matchflag/match_not_eol.pass.cpp @@ -47,5 +47,44 @@ int main(int, char**) assert( std::regex_search(target, re, std::regex_constants::match_not_eol)); } + { + std::string target = "foo"; + std::regex re("$"); + assert(std::regex_search(target, re)); + assert(!std::regex_search(target, re, std::regex_constants::match_not_eol)); + + std::smatch match; + assert(std::regex_search(target, match, re)); + assert(match.position(0) == 3); + assert(match.length(0) == 0); + assert(!std::regex_search(target, match, re, std::regex_constants::match_not_eol)); + assert(match.length(0) == 0); + } + + { + std::string target = "foo"; + std::regex re("$", std::regex::multiline); + std::smatch match; + assert(std::regex_search(target, match, re)); + assert(match.position(0) == 3); + assert(match.length(0) == 0); + assert(!std::regex_search(target, match, re, std::regex_constants::match_not_eol)); + assert(match.length(0) == 0); + } + + { + std::string target = "foo"; + std::regex re("$"); + assert(!std::regex_match(target, re)); + assert(!std::regex_match(target, re, std::regex_constants::match_not_eol)); + } + + { + std::string target = "a"; + std::regex re("^b*$"); + assert(!std::regex_search(target, re)); + assert(!std::regex_search(target, re, std::regex_constants::match_not_eol)); + } + return 0; } From 73ff017c9b39d8a1ecc350f0572e3274dd4c9146 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 22 Jan 2024 11:17:21 -0800 Subject: [PATCH 462/843] [Sema] Fix a warning This patch fixes: clang/lib/Sema/AnalysisBasedWarnings.cpp:2269:21: error: unused variable 'subExpr' [-Werror,-Wunused-variable] --- clang/lib/Sema/AnalysisBasedWarnings.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 649c3f533b820..9e9294572df91 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2266,8 +2266,6 @@ class UnsafeBufferUsageReporter : public UnsafeBufferUsageHandler { if (!isa(destType)) return; - const Expr *subExpr = ECE->getSubExpr(); - const uint64_t dSize = Ctx.getTypeSize(destType.getTypePtr()->getPointeeType()); From bb28442c0b9790473363cbca2867630262358018 Mon Sep 17 00:00:00 2001 From: Alexandre Ganea <37383324+aganea@users.noreply.github.com> Date: Mon, 22 Jan 2024 14:19:08 -0500 Subject: [PATCH 463/843] [CodeGen][X86] Fix lowering of tailcalls when `-ms-hotpatch` is used 
(#77245) Previously, tail jump pseudo-opcodes were skipped by the `encodeInstruction()` call inside `X86AsmPrinter::LowerPATCHABLE_OP`. This caused emission of a 2-byte NOP and dropping of the tail jump. With this PR, we change `PATCHABLE_OP` to not wrap the first `MachineInstr` anymore, but inserting itself before, leaving the instruction unaltered. At lowering time in `X86AsmPrinter`, we now "look ahead" for the next non-pseudo `MachineInstr` and lower+encode it, to inspect its size. If the size is below what `PATCHABLE_OP` expects, it inserts NOPs; otherwise it does nothing. That way, now the first `MachineInstr` is always lowered as usual even if `"patchable-function"="prologue-short-redirect"` is used. Fixes https://github.com/llvm/llvm-project/issues/76879, https://github.com/llvm/llvm-project/issues/76958 and https://github.com/llvm/llvm-project/issues/59039 --- llvm/include/llvm/Support/TargetOpcodes.def | 11 ++-- llvm/lib/CodeGen/PatchableFunction.cpp | 52 ++++--------------- llvm/lib/Target/X86/X86MCInstLower.cpp | 33 ++++-------- .../X86/patchable-prologue-tailcall.ll | 34 ++++++++++++ llvm/test/CodeGen/X86/patchable-prologue.ll | 4 +- 5 files changed, 62 insertions(+), 72 deletions(-) create mode 100644 llvm/test/CodeGen/X86/patchable-prologue-tailcall.ll diff --git a/llvm/include/llvm/Support/TargetOpcodes.def b/llvm/include/llvm/Support/TargetOpcodes.def index c005218c80f44..abb237083d254 100644 --- a/llvm/include/llvm/Support/TargetOpcodes.def +++ b/llvm/include/llvm/Support/TargetOpcodes.def @@ -170,15 +170,10 @@ HANDLE_TARGET_OPCODE(LOCAL_ESCAPE) /// comparisons into existing memory operations. HANDLE_TARGET_OPCODE(FAULTING_OP) -/// Wraps a machine instruction to add patchability constraints. An -/// instruction wrapped in PATCHABLE_OP has to either have a minimum +/// Precedes a machine instruction to add patchability constraints. An +/// instruction after PATCHABLE_OP has to either have a minimum /// size or be preceded with a nop of that size. The first operand is -/// an immediate denoting the minimum size of the instruction, the -/// second operand is an immediate denoting the opcode of the original -/// instruction. The rest of the operands are the operands of the -/// original instruction. -/// PATCHABLE_OP can be used as second operand to only insert a nop of -/// required size. +/// an immediate denoting the minimum size of the following instruction. HANDLE_TARGET_OPCODE(PATCHABLE_OP) /// This is a marker instruction which gets translated into a nop sled, useful diff --git a/llvm/lib/CodeGen/PatchableFunction.cpp b/llvm/lib/CodeGen/PatchableFunction.cpp index 9449f143366f0..75c2dfeca58d5 100644 --- a/llvm/lib/CodeGen/PatchableFunction.cpp +++ b/llvm/lib/CodeGen/PatchableFunction.cpp @@ -38,58 +38,28 @@ struct PatchableFunction : public MachineFunctionPass { } bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) { + MachineBasicBlock &FirstMBB = *MF.begin(); + if (MF.getFunction().hasFnAttribute("patchable-function-entry")) { - MachineBasicBlock &FirstMBB = *MF.begin(); const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo(); // The initial .loc covers PATCHABLE_FUNCTION_ENTER. 
BuildMI(FirstMBB, FirstMBB.begin(), DebugLoc(), TII->get(TargetOpcode::PATCHABLE_FUNCTION_ENTER)); return true; - } - - if (!MF.getFunction().hasFnAttribute("patchable-function")) - return false; - + } else if (MF.getFunction().hasFnAttribute("patchable-function")) { #ifndef NDEBUG - Attribute PatchAttr = MF.getFunction().getFnAttribute("patchable-function"); - StringRef PatchType = PatchAttr.getValueAsString(); - assert(PatchType == "prologue-short-redirect" && "Only possibility today!"); + Attribute PatchAttr = MF.getFunction().getFnAttribute("patchable-function"); + StringRef PatchType = PatchAttr.getValueAsString(); + assert(PatchType == "prologue-short-redirect" && "Only possibility today!"); #endif - - auto &FirstMBB = *MF.begin(); - auto *TII = MF.getSubtarget().getInstrInfo(); - - MachineBasicBlock::iterator FirstActualI = llvm::find_if( - FirstMBB, [](const MachineInstr &MI) { return !MI.isMetaInstruction(); }); - - if (FirstActualI == FirstMBB.end()) { - // As of Microsoft documentation on /hotpatch feature, we must ensure that - // "the first instruction of each function is at least two bytes, and no - // jump within the function goes to the first instruction" - - // When the first MBB is empty, insert a patchable no-op. This ensures the - // first instruction is patchable in two special cases: - // - the function is empty (e.g. unreachable) - // - the function jumps back to the first instruction, which is in a - // successor MBB. - BuildMI(&FirstMBB, DebugLoc(), TII->get(TargetOpcode::PATCHABLE_OP)) - .addImm(2) - .addImm(TargetOpcode::PATCHABLE_OP); + auto *TII = MF.getSubtarget().getInstrInfo(); + BuildMI(FirstMBB, FirstMBB.begin(), DebugLoc(), + TII->get(TargetOpcode::PATCHABLE_OP)) + .addImm(2); MF.ensureAlignment(Align(16)); return true; } - - auto MIB = BuildMI(FirstMBB, FirstActualI, FirstActualI->getDebugLoc(), - TII->get(TargetOpcode::PATCHABLE_OP)) - .addImm(2) - .addImm(FirstActualI->getOpcode()); - - for (auto &MO : FirstActualI->operands()) - MIB.add(MO); - - FirstActualI->eraseFromParent(); - MF.ensureAlignment(Align(16)); - return true; + return false; } char PatchableFunction::ID = 0; diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 8bf099a165816..6cf93dc51238c 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -948,24 +948,22 @@ void X86AsmPrinter::LowerASAN_CHECK_MEMACCESS(const MachineInstr &MI) { void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, X86MCInstLower &MCIL) { - // PATCHABLE_OP minsize, opcode, operands + // PATCHABLE_OP minsize NoAutoPaddingScope NoPadScope(*OutStreamer); - unsigned MinSize = MI.getOperand(0).getImm(); - unsigned Opcode = MI.getOperand(1).getImm(); - // Opcode PATCHABLE_OP is a special case: there is no instruction to wrap, - // simply emit a nop of size MinSize. - bool EmptyInst = (Opcode == TargetOpcode::PATCHABLE_OP); - - MCInst MCI; - MCI.setOpcode(Opcode); - for (auto &MO : drop_begin(MI.operands(), 2)) - if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO)) - MCI.addOperand(*MaybeOperand); + auto NextMI = std::find_if(std::next(MI.getIterator()), + MI.getParent()->end().getInstrIterator(), + [](auto &II) { return !II.isMetaInstruction(); }); SmallString<256> Code; - if (!EmptyInst) { + unsigned MinSize = MI.getOperand(0).getImm(); + + if (NextMI != MI.getParent()->end()) { + // Lower the next MachineInstr to find its byte size. 
+ MCInst MCI; + MCIL.Lower(&*NextMI, MCI); + SmallVector Fixups; CodeEmitter->encodeInstruction(MCI, Code, Fixups, getSubtargetInfo()); } @@ -981,21 +979,12 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI, OutStreamer->emitInstruction( MCInstBuilder(X86::MOV32rr_REV).addReg(X86::EDI).addReg(X86::EDI), *Subtarget); - } else if (MinSize == 2 && Opcode == X86::PUSH64r) { - // This is an optimization that lets us get away without emitting a nop in - // many cases. - // - // NB! In some cases the encoding for PUSH64r (e.g. PUSH64r %r9) takes two - // bytes too, so the check on MinSize is important. - MCI.setOpcode(X86::PUSH64rmr); } else { unsigned NopSize = emitNop(*OutStreamer, MinSize, Subtarget); assert(NopSize == MinSize && "Could not implement MinSize!"); (void)NopSize; } } - if (!EmptyInst) - OutStreamer->emitInstruction(MCI, getSubtargetInfo()); } // Lower a stackmap of the form: diff --git a/llvm/test/CodeGen/X86/patchable-prologue-tailcall.ll b/llvm/test/CodeGen/X86/patchable-prologue-tailcall.ll new file mode 100644 index 0000000000000..5e55524fecc9a --- /dev/null +++ b/llvm/test/CodeGen/X86/patchable-prologue-tailcall.ll @@ -0,0 +1,34 @@ +; RUN: llc -verify-machineinstrs -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=CHECK + +; CHECK: f1: +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: jmp f0 # TAILCALL + +; CHECK: f2: +; CHECK-NEXT: # %bb.0: +; CHECK-NEXT: jmp malloc # TAILCALL + +define ptr @f1(i64 %count) "patchable-function"="prologue-short-redirect" { +entry: + %call = tail call ptr @f0(i64 %count) + ret ptr %call +} + +declare ptr @f0(i64) + +define noalias ptr @f2(i64 %count) "patchable-function"="prologue-short-redirect" { +entry: + %call = tail call ptr @malloc(i64 %count) + ret ptr %call +} + +declare noalias ptr @malloc(i64) #0 + +attributes #0 = { allockind("alloc,uninitialized") allocsize(0) memory(inaccessiblemem: readwrite) "alloc-family"="malloc" } + +!llvm.module.flags = !{!0, !1, !2, !3} + +!0 = !{i32 1, !"wchar_size", i32 2} +!1 = !{i32 8, !"PIC Level", i32 2} +!2 = !{i32 7, !"uwtable", i32 2} +!3 = !{i32 1, !"MaxTLSAlign", i32 65536} diff --git a/llvm/test/CodeGen/X86/patchable-prologue.ll b/llvm/test/CodeGen/X86/patchable-prologue.ll index f1a39777a40bc..71a392845fdea 100644 --- a/llvm/test/CodeGen/X86/patchable-prologue.ll +++ b/llvm/test/CodeGen/X86/patchable-prologue.ll @@ -32,7 +32,8 @@ define void @f0() "patchable-function"="prologue-short-redirect" { define void @f1() "patchable-function"="prologue-short-redirect" "frame-pointer"="all" { ; CHECK-LABEL: _f1 -; CHECK-NEXT: ff f5 pushq %rbp +; CHECK-NEXT: 66 90 nop +; CHECK-NEXT: 55 pushq %rbp ; CHECK-ALIGN: .p2align 4, 0x90 ; CHECK-ALIGN: _f1: @@ -47,6 +48,7 @@ define void @f1() "patchable-function"="prologue-short-redirect" "frame-pointer" ; 64: f1: ; 64-NEXT: .seh_proc f1 ; 64-NEXT: # %bb.0: +; 64-NEXT: xchgw %ax, %ax ; 64-NEXT: pushq %rbp ret void From 309dcc112b1d3f0a24ea67e5b0377c649637260f Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 22 Jan 2024 14:18:55 -0500 Subject: [PATCH 464/843] [gn] port a80e65e00ada7 See here for context: https://github.com/llvm/llvm-project/pull/65534#issuecomment-1904633461 --- llvm/utils/gn/secondary/libcxx/src/BUILD.gn | 3 +++ 1 file changed, 3 insertions(+) diff --git a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn index 095cd5bb5b7cc..79504775ea21f 100644 --- a/llvm/utils/gn/secondary/libcxx/src/BUILD.gn +++ b/llvm/utils/gn/secondary/libcxx/src/BUILD.gn @@ -56,6 +56,9 @@ config("cxx_config") { 
"_LIBCPP_BUILDING_LIBRARY", "_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER", ] + if (current_os != "win") { + defines += [ "LIBCXX_BUILDING_LIBCXXABI" ] + } if (target_os == "win") { cflags += [ "/Zl" ] defines += [ From bcfdab87052116ce7bf2a9d9b2b1cc2e40a43618 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 22 Jan 2024 11:24:55 -0800 Subject: [PATCH 465/843] [flang] Fix a warning This patch fixes: flang/lib/Lower/OpenACC.cpp:1964:15: error: unused variable 'loopDirective' [-Werror,-Wunused-variable] --- flang/lib/Lower/OpenACC.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index 474d4e686c135..d619d47fc2359 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -1974,6 +1974,7 @@ genACC(Fortran::lower::AbstractConverter &converter, assert(loopDirective.v == llvm::acc::ACCD_loop && "Unsupported OpenACC loop construct"); + (void)loopDirective; const auto &accClauseList = std::get(beginLoopDirective.t); From bfd12f39b16e7ba2c342e120b925499f8504f5b3 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Mon, 22 Jan 2024 20:25:14 +0100 Subject: [PATCH 466/843] [lldb][libc++] Adds system_clock data formatters. (#78609) --- .../Language/CPlusPlus/CPlusPlusLanguage.cpp | 25 +++++ .../Plugins/Language/CPlusPlus/LibCxx.cpp | 73 +++++++++++++- .../Plugins/Language/CPlusPlus/LibCxx.h | 8 ++ .../chrono/TestDataFormatterLibcxxChrono.py | 96 +++++++++++++++++++ .../data-formatter-stl/libcxx/chrono/main.cpp | 52 ++++++++++ 5 files changed, 252 insertions(+), 2 deletions(-) diff --git a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp index 21c73e6361904..f0fe6c9e06d9d 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/CPlusPlusLanguage.cpp @@ -1032,6 +1032,31 @@ static void LoadLibCxxFormatters(lldb::TypeCategoryImplSP cpp_category_sp) { TypeSummaryImplSP(new StringSummaryFormat( eTypeOptionHideChildren | eTypeOptionHideValue, "${var.__rep_} s"))); + // Chrono time point types + + AddCXXSummary(cpp_category_sp, + lldb_private::formatters::LibcxxChronoSysSecondsSummaryProvider, + "libc++ std::chrono::sys_seconds summary provider", + "^std::__[[:alnum:]]+::chrono::time_point<" + "std::__[[:alnum:]]+::chrono::system_clock, " + "std::__[[:alnum:]]+::chrono::duration " + "> >$", + eTypeOptionHideChildren | eTypeOptionHideValue | + eTypeOptionCascade, + true); + AddCXXSummary(cpp_category_sp, + lldb_private::formatters::LibcxxChronoSysDaysSummaryProvider, + "libc++ std::chrono::sys_seconds summary provider", + "^std::__[[:alnum:]]+::chrono::time_point<" + "std::__[[:alnum:]]+::chrono::system_clock, " + "std::__[[:alnum:]]+::chrono::duration " + "> >$", + eTypeOptionHideChildren | eTypeOptionHideValue | + eTypeOptionCascade, + true); + // Chrono calendar types cpp_category_sp->AddTypeSummary( diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp index d232a38adc029..060324e2fcfe2 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.cpp @@ -1073,18 +1073,87 @@ bool lldb_private::formatters::LibcxxWStringViewSummaryProvider( bool success; ValueObjectSP dataobj; size_t size; - std::tie( success, dataobj, size ) = LibcxxExtractStringViewData(valobj); + std::tie(success, dataobj, size) = LibcxxExtractStringViewData(valobj); if (!success) { stream << "Summary Unavailable"; 
return true; } - return ::LibcxxWStringSummaryProvider(valobj, stream, summary_options, dataobj, size); } +bool lldb_private::formatters::LibcxxChronoSysSecondsSummaryProvider( + ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { + ValueObjectSP ptr_sp = valobj.GetChildMemberWithName("__d_"); + if (!ptr_sp) + return false; + ptr_sp = ptr_sp->GetChildMemberWithName("__rep_"); + if (!ptr_sp) + return false; + + // The date time in the chrono library is valid in the range + // [-32767-01-01T00:00:00Z, 32767-12-31T23:59:59Z]. A 64-bit time_t has a + // larger range, the function strftime is not able to format the entire range + // of time_t. The exact point has not been investigated; it's limited to + // chrono's range. + const std::time_t chrono_timestamp_min = + -1'096'193'779'200; // -32767-01-01T00:00:00Z + const std::time_t chrono_timestamp_max = + 971'890'963'199; // 32767-12-31T23:59:59Z + + const std::time_t seconds = ptr_sp->GetValueAsSigned(0); + if (seconds < chrono_timestamp_min || seconds > chrono_timestamp_max) + stream.Printf("timestamp=%ld s", seconds); + else { + std::array str; + std::size_t size = + std::strftime(str.data(), str.size(), "%FT%H:%M:%SZ", gmtime(&seconds)); + if (size == 0) + return false; + + stream.Printf("date/time=%s timestamp=%ld s", str.data(), seconds); + } + + return true; +} + +bool lldb_private::formatters::LibcxxChronoSysDaysSummaryProvider( + ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { + ValueObjectSP ptr_sp = valobj.GetChildMemberWithName("__d_"); + if (!ptr_sp) + return false; + ptr_sp = ptr_sp->GetChildMemberWithName("__rep_"); + if (!ptr_sp) + return false; + + // The date time in the chrono library is valid in the range + // [-32767-01-01Z, 32767-12-31Z]. A 32-bit time_t has a larger range, the + // function strftime is not able to format the entire range of time_t. The + // exact point has not been investigated; it's limited to chrono's range. + const int chrono_timestamp_min = -12'687'428; // -32767-01-01Z + const int chrono_timestamp_max = 11'248'737; // 32767-12-31Z + + const int days = ptr_sp->GetValueAsSigned(0); + if (days < chrono_timestamp_min || days > chrono_timestamp_max) + stream.Printf("timestamp=%d days", days); + + else { + const std::time_t seconds = std::time_t(86400) * days; + + std::array str; + std::size_t size = + std::strftime(str.data(), str.size(), "%FZ", gmtime(&seconds)); + if (size == 0) + return false; + + stream.Printf("date=%s timestamp=%d days", str.data(), days); + } + + return true; +} + bool lldb_private::formatters::LibcxxChronoMonthSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options) { // FIXME: These are the names used in the C++20 ostream operator. 
Since LLVM diff --git a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h index 532d185b18543..72da6b2426efe 100644 --- a/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h +++ b/lldb/source/Plugins/Language/CPlusPlus/LibCxx.h @@ -261,6 +261,14 @@ SyntheticChildrenFrontEnd * LibcxxStdRangesRefViewSyntheticFrontEndCreator(CXXSyntheticChildren *, lldb::ValueObjectSP); +bool LibcxxChronoSysSecondsSummaryProvider( + ValueObject &valobj, Stream &stream, + const TypeSummaryOptions &options); // libc++ std::chrono::sys_seconds + +bool LibcxxChronoSysDaysSummaryProvider( + ValueObject &valobj, Stream &stream, + const TypeSummaryOptions &options); // libc++ std::chrono::sys_days + bool LibcxxChronoMonthSummaryProvider( ValueObject &valobj, Stream &stream, const TypeSummaryOptions &options); // libc++ std::chrono::month diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py index d4bc140015fbb..a90fb828d121a 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/TestDataFormatterLibcxxChrono.py @@ -32,6 +32,102 @@ def test_with_run_command(self): self.expect("frame variable m", substrs=["m = 4321 months"]) self.expect("frame variable y", substrs=["y = 321 years"]) + self.expect( + "frame variable ss_tp", + substrs=["ss_tp = date/time=1970-01-01T00:00:00Z timestamp=0 s"], + ) + self.expect( + "frame variable ss_tp_d", + substrs=["ss_tp_d = date/time=1970-01-01T00:00:00Z timestamp=0 s"], + ) + self.expect( + "frame variable ss_tp_d_r", + substrs=["ss_tp_d_r = date/time=1970-01-01T00:00:00Z timestamp=0 s"], + ) + self.expect( + "frame variable ss_tp_d_r2", + substrs=["ss_tp_d_r2 = date/time=1970-01-01T00:00:00Z timestamp=0 s"], + ) + + self.expect( + "frame variable ss_0", + substrs=["ss_0 = date/time=1970-01-01T00:00:00Z timestamp=0 s"], + ) + + self.expect( + "frame variable ss_neg_date_time", + substrs=[ + "ss_neg_date_time = date/time=-32767-01-01T00:00:00Z timestamp=-1096193779200 s" + ], + ) + self.expect( + "frame variable ss_neg_seconds", + substrs=["ss_neg_seconds = timestamp=-1096193779201 s"], + ) + + self.expect( + "frame variable ss_pos_date_time", + substrs=[ + "ss_pos_date_time = date/time=32767-12-31T23:59:59Z timestamp=971890963199 s" + ], + ) + self.expect( + "frame variable ss_pos_seconds", + substrs=["ss_pos_seconds = timestamp=971890963200 s"], + ) + + self.expect( + "frame variable ss_min", + substrs=["ss_min = timestamp=-9223372036854775808 s"], + ) + self.expect( + "frame variable ss_max", + substrs=["ss_max = timestamp=9223372036854775807 s"], + ) + + self.expect( + "frame variable sd_tp", + substrs=["sd_tp = date=1970-01-01Z timestamp=0 days"], + ) + self.expect( + "frame variable sd_tp_d_r", + substrs=["sd_tp_d_r = date=1970-01-01Z timestamp=0 days"], + ) + self.expect( + "frame variable sd_tp_d_r2", + substrs=["sd_tp_d_r2 = date=1970-01-01Z timestamp=0 days"], + ) + + self.expect( + "frame variable sd_0", substrs=["sd_0 = date=1970-01-01Z timestamp=0 days"] + ) + self.expect( + "frame variable sd_neg_date", + substrs=["sd_neg_date = date=-32767-01-01Z timestamp=-12687428 days"], + ) + self.expect( + "frame variable sd_neg_days", + substrs=["sd_neg_days = timestamp=-12687429 days"], + ) + + self.expect( 
+ "frame variable sd_pos_date", + substrs=["sd_pos_date = date=32767-12-31Z timestamp=11248737 days"], + ) + self.expect( + "frame variable sd_pos_days", + substrs=["sd_pos_days = timestamp=11248738 days"], + ) + + self.expect( + "frame variable sd_min", + substrs=["sd_min = timestamp=-2147483648 days"], + ) + self.expect( + "frame variable sd_max", + substrs=["sd_max = timestamp=2147483647 days"], + ) + self.expect("frame variable d_0", substrs=["d_0 = day=0"]) self.expect("frame variable d_1", substrs=["d_1 = day=1"]) self.expect("frame variable d_31", substrs=["d_31 = day=31"]) diff --git a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/main.cpp b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/main.cpp index 57215aaf343f6..315c88a787d82 100644 --- a/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/main.cpp +++ b/lldb/test/API/functionalities/data-formatter/data-formatter-stl/libcxx/chrono/main.cpp @@ -15,6 +15,58 @@ int main() { std::chrono::months m{4321}; std::chrono::years y{321}; + // sys_seconds aliasses + std::chrono::time_point + ss_tp{std::chrono::seconds{0}}; + std::chrono::time_point> + ss_tp_d{std::chrono::seconds{0}}; + std::chrono::time_point>> + ss_tp_d_r{std::chrono::seconds{0}}; + std::chrono::time_point>> + ss_tp_d_r2{std::chrono::seconds{0}}; + + // sys_seconds + std::chrono::sys_seconds ss_0{std::chrono::seconds{0}}; + std::chrono::sys_seconds ss_neg_date_time{ + std::chrono::seconds{-1'096'193'779'200}}; + std::chrono::sys_seconds ss_neg_seconds{ + std::chrono::seconds{-1'096'193'779'201}}; + std::chrono::sys_seconds ss_pos_date_time{ + std::chrono::seconds{971'890'963'199}}; + std::chrono::sys_seconds ss_pos_seconds{ + std::chrono::seconds{971'890'963'200}}; + std::chrono::sys_seconds ss_min{ + std::chrono::seconds{std::numeric_limits::min()}}; + std::chrono::sys_seconds ss_max{ + std::chrono::seconds{std::numeric_limits::max()}}; + + // sys_days aliasses + std::chrono::time_point sd_tp{ + std::chrono::days{0}}; + std::chrono::time_point>> + sd_tp_d_r{std::chrono::days{0}}; + std::chrono::time_point>> + sd_tp_d_r2{std::chrono::days{0}}; + + // sys_days + std::chrono::sys_days sd_0{std::chrono::days{0}}; + + std::chrono::sys_days sd_neg_date{std::chrono::days{-12'687'428}}; + std::chrono::sys_days sd_neg_days{std::chrono::days{-12'687'429}}; + + std::chrono::sys_days sd_pos_date{std::chrono::days{11'248'737}}; + std::chrono::sys_days sd_pos_days{std::chrono::days{11'248'738}}; + + std::chrono::sys_days sd_min{ + std::chrono::days{std::numeric_limits::min()}}; + std::chrono::sys_days sd_max{ + std::chrono::days{std::numeric_limits::max()}}; + std::chrono::day d_0{0}; std::chrono::day d_1{1}; std::chrono::day d_31{31}; From f10ec8c77403fd722b3f50710cb73757da4383e4 Mon Sep 17 00:00:00 2001 From: Petr Hosek Date: Mon, 22 Jan 2024 11:33:13 -0800 Subject: [PATCH 467/843] [libc][riscv] Check if we have F or D extension before using them (#79036) We shouldn't be using instructions that require F or D extensions unconditionally before checking if those instructions are available. 
---
 libc/src/__support/FPUtil/riscv/FMA.h  | 4 ++++
 libc/src/__support/FPUtil/riscv/sqrt.h | 4 ++++
 2 files changed, 8 insertions(+)

diff --git a/libc/src/__support/FPUtil/riscv/FMA.h b/libc/src/__support/FPUtil/riscv/FMA.h
index 32bef1ea8843e..f01962174f16f 100644
--- a/libc/src/__support/FPUtil/riscv/FMA.h
+++ b/libc/src/__support/FPUtil/riscv/FMA.h
@@ -26,6 +26,7 @@ namespace LIBC_NAMESPACE {
 namespace fputil {
 
+#ifdef __riscv_flen
 template <typename T>
 LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, float>, T> fma(T x, T y, T z) {
   float result;
@@ -35,6 +36,7 @@ LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, float>, T> fma(T x, T y, T z) {
   return result;
 }
 
+#if __riscv_flen >= 64
 template <typename T>
 LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, double>, T> fma(T x, T y, T z) {
   double result;
@@ -43,6 +45,8 @@ LIBC_INLINE cpp::enable_if_t<cpp::is_same_v<T, double>, T> fma(T x, T y, T z) {
                        : "f"(x), "f"(y), "f"(z));
   return result;
 }
+#endif // __riscv_flen >= 64
+#endif // __riscv_flen
 
 } // namespace fputil
 } // namespace LIBC_NAMESPACE
diff --git a/libc/src/__support/FPUtil/riscv/sqrt.h b/libc/src/__support/FPUtil/riscv/sqrt.h
index a42687004639b..a1c436d0ebb1d 100644
--- a/libc/src/__support/FPUtil/riscv/sqrt.h
+++ b/libc/src/__support/FPUtil/riscv/sqrt.h
@@ -21,17 +21,21 @@ namespace LIBC_NAMESPACE {
 namespace fputil {
 
+#ifdef __riscv_flen
 template <> LIBC_INLINE float sqrt<float>(float x) {
   float result;
   __asm__ __volatile__("fsqrt.s %0, %1\n\t" : "=f"(result) : "f"(x));
   return result;
 }
 
+#if __riscv_flen >= 64
 template <> LIBC_INLINE double sqrt<double>(double x) {
   double result;
   __asm__ __volatile__("fsqrt.d %0, %1\n\t" : "=f"(result) : "f"(x));
   return result;
 }
+#endif // __riscv_flen >= 64
+#endif // __riscv_flen
 
 } // namespace fputil
 } // namespace LIBC_NAMESPACE

From 0dd72eb269dd485b98afba2c58a1d747db53dcad Mon Sep 17 00:00:00 2001
From: erichkeane
Date: Mon, 22 Jan 2024 11:21:53 -0800
Subject: [PATCH 468/843] [OpenACC] Implement 'vector' and 'worker' clause
 argument parsing

Both of the clauses 'vector' and 'worker' take an optional 'special' word
followed by an int-expr. The argument list is optional, as is the special
word, but if the parens are included, an int-expr is required. This patch
implements parsing for both.

---
 clang/include/clang/Basic/OpenACCKinds.h |  6 +-
 clang/lib/Parse/ParseOpenACC.cpp         | 23 +++++
 clang/test/ParserOpenACC/parse-clauses.c | 90 ++++++++++++++++++++++
 3 files changed, 113 insertions(+), 6 deletions(-)

diff --git a/clang/include/clang/Basic/OpenACCKinds.h b/clang/include/clang/Basic/OpenACCKinds.h
index 3d3f128f7784f..872faec0deaeb 100644
--- a/clang/include/clang/Basic/OpenACCKinds.h
+++ b/clang/include/clang/Basic/OpenACCKinds.h
@@ -154,11 +154,9 @@ enum class OpenACCClauseKind {
   Independent,
   /// 'auto' clause, allowed on 'loop' directives.
   Auto,
-  /// 'worker' clause, allowed on 'loop' and 'routine' directives.
+  /// 'worker' clause, allowed on 'loop', Combined, and 'routine' directives.
   Worker,
-  /// 'vector' clause, allowed on 'loop' and 'routine' directives. Takes no
-  /// arguments for 'routine', so the 'loop' version is not yet implemented
-  /// completely.
+  /// 'vector' clause, allowed on 'loop', Combined, and 'routine' directives.
   Vector,
   /// 'nohost' clause, allowed on 'routine' directives.
NoHost, diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 612ee5ef214c1..8020c455feecb 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -159,6 +159,8 @@ enum class OpenACCSpecialTokenKind { Queues, Zero, Force, + Num, + Length, }; bool isOpenACCSpecialToken(OpenACCSpecialTokenKind Kind, Token Tok) { @@ -176,6 +178,10 @@ bool isOpenACCSpecialToken(OpenACCSpecialTokenKind Kind, Token Tok) { return Tok.getIdentifierInfo()->isStr("zero"); case OpenACCSpecialTokenKind::Force: return Tok.getIdentifierInfo()->isStr("force"); + case OpenACCSpecialTokenKind::Num: + return Tok.getIdentifierInfo()->isStr("num"); + case OpenACCSpecialTokenKind::Length: + return Tok.getIdentifierInfo()->isStr("length"); } llvm_unreachable("Unknown 'Kind' Passed"); } @@ -451,6 +457,9 @@ ClauseParensKind getClauseParensKind(OpenACCDirectiveKind DirKind, case OpenACCClauseKind::Self: return DirKind == OpenACCDirectiveKind::Update ? ClauseParensKind::Required : ClauseParensKind::Optional; + case OpenACCClauseKind::Worker: + case OpenACCClauseKind::Vector: + return ClauseParensKind::Optional; case OpenACCClauseKind::Default: case OpenACCClauseKind::If: @@ -488,8 +497,6 @@ ClauseParensKind getClauseParensKind(OpenACCDirectiveKind DirKind, case OpenACCClauseKind::Invalid: case OpenACCClauseKind::NoHost: case OpenACCClauseKind::Seq: - case OpenACCClauseKind::Worker: - case OpenACCClauseKind::Vector: return ClauseParensKind::None; } llvm_unreachable("Unhandled clause kind"); @@ -719,6 +726,18 @@ bool Parser::ParseOpenACCClauseParams(OpenACCDirectiveKind DirKind, return true; break; } + case OpenACCClauseKind::Vector: + case OpenACCClauseKind::Worker: { + tryParseAndConsumeSpecialTokenKind(*this, + Kind == OpenACCClauseKind::Vector + ? 
OpenACCSpecialTokenKind::Length + : OpenACCSpecialTokenKind::Num, + Kind); + ExprResult IntExpr = ParseOpenACCIntExpr(); + if (IntExpr.isInvalid()) + return true; + break; + } default: llvm_unreachable("Not an optional parens type?"); } diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c index b6114c73f1b19..a80173c6eca3d 100644 --- a/clang/test/ParserOpenACC/parse-clauses.c +++ b/clang/test/ParserOpenACC/parse-clauses.c @@ -817,6 +817,96 @@ void IntExprParsing() { // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} #pragma acc set default_async(returns_int()) + + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector() + // expected-error@+3{{invalid tag 'invalid' on 'vector' clause}} + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(invalid:) + // expected-error@+2{{invalid tag 'invalid' on 'vector' clause}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(invalid:5) + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(length:) + // expected-error@+3{{invalid tag 'num' on 'vector' clause}} + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(num:) + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(5, 4) + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(length:6,4) + // expected-error@+4{{invalid tag 'num' on 'vector' clause}} + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(num:6,4) + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(5) + // expected-error@+2{{invalid tag 'num' on 'vector' clause}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(num:5) + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(length:5) + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(returns_int()) + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop vector(length:returns_int()) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker() + // expected-error@+3{{invalid tag 'invalid' on 'worker' clause}} + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(invalid:) + // 
expected-error@+2{{invalid tag 'invalid' on 'worker' clause}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(invalid:5) + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(num:) + // expected-error@+3{{invalid tag 'length' on 'worker' clause}} + // expected-error@+2{{expected expression}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(length:) + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(5, 4) + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(num:6,4) + // expected-error@+4{{invalid tag 'length' on 'worker' clause}} + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(length:6,4) + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(5) + // expected-error@+2{{invalid tag 'length' on 'worker' clause}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(length:5) + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(num:5) + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(returns_int()) + // expected-error@+2{{invalid tag 'length' on 'worker' clause}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc loop worker(length:returns_int()) } // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} From 1e2a4ccb6237b6db9db7dc3fd11a94410ec9b0df Mon Sep 17 00:00:00 2001 From: Zahira Ammarguellat Date: Mon, 22 Jan 2024 11:38:30 -0800 Subject: [PATCH 469/843] [CLANG] Add warning when INF or NAN are used in a binary operation or as function argument in fast math mode. (#76873) Check for operations using INF or NaN when in ffast-math mode and generate a warning. --- clang/docs/ReleaseNotes.rst | 5 + .../clang/Basic/DiagnosticCommonKinds.td | 4 + clang/include/clang/Lex/Preprocessor.h | 9 + clang/include/clang/Sema/Sema.h | 7 +- clang/lib/Lex/Preprocessor.cpp | 5 + clang/lib/Sema/SemaChecking.cpp | 48 ++++- .../Sema/warn-infinity-nan-disabled-lnx.cpp | 176 +++++++++++++++++ .../Sema/warn-infinity-nan-disabled-win.cpp | 179 ++++++++++++++++++ 8 files changed, 425 insertions(+), 8 deletions(-) create mode 100644 clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp create mode 100644 clang/test/Sema/warn-infinity-nan-disabled-win.cpp diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 5846503af3acd..13e6c5e8db3ea 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -596,6 +596,11 @@ Improvements to Clang's diagnostics - Clang now diagnoses narrowing conversions involving const references. (`#63151: `_). - Clang now diagnoses unexpanded packs within the template argument lists of function template specializations. 
+- The warning `-Wnan-infinity-disabled` is now emitted when ``INFINITY`` + or ``NAN`` are used in arithmetic operations or function arguments in + floating-point mode where ``INFINITY`` or ``NAN`` don't have the expected + values. + - Clang now diagnoses attempts to bind a bitfield to an NTTP of a reference type as erroneous converted constant expression and not as a reference to subobject. - Clang now diagnoses ``auto`` and ``decltype(auto)`` in declarations of conversion function template diff --git a/clang/include/clang/Basic/DiagnosticCommonKinds.td b/clang/include/clang/Basic/DiagnosticCommonKinds.td index 5544dc88004d9..b1bada65cb6b2 100644 --- a/clang/include/clang/Basic/DiagnosticCommonKinds.td +++ b/clang/include/clang/Basic/DiagnosticCommonKinds.td @@ -70,6 +70,10 @@ def warn_pragma_debug_missing_argument : Warning< def warn_pragma_debug_unexpected_argument : Warning< "unexpected argument to debug command">, InGroup; +def warn_fp_nan_inf_when_disabled : Warning< + "use of %select{infinity|NaN}0%select{| via a macro}1 is undefined behavior " + "due to the currently enabled floating-point options">, + InGroup>; } // Parse && Sema diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 4ec21a8b6be2c..fed3c1b770383 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -2835,6 +2835,13 @@ class Preprocessor { if (Identifier.getIdentifierInfo()->isRestrictExpansion() && !SourceMgr.isInMainFile(Identifier.getLocation())) emitRestrictExpansionWarning(Identifier); + + if (Identifier.getIdentifierInfo()->getName() == "INFINITY") + if (getLangOpts().NoHonorInfs) + emitRestrictInfNaNWarning(Identifier, 0); + if (Identifier.getIdentifierInfo()->getName() == "NAN") + if (getLangOpts().NoHonorNaNs) + emitRestrictInfNaNWarning(Identifier, 1); } static void processPathForFileMacro(SmallVectorImpl &Path, @@ -2850,6 +2857,8 @@ class Preprocessor { void emitMacroDeprecationWarning(const Token &Identifier) const; void emitRestrictExpansionWarning(const Token &Identifier) const; void emitFinalMacroWarning(const Token &Identifier, bool IsUndef) const; + void emitRestrictInfNaNWarning(const Token &Identifier, + unsigned DiagSelection) const; /// This boolean state keeps track if the current scanned token (by this PP) /// is in an "-Wunsafe-buffer-usage" opt-out region. 
Assuming PP scans a diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index d0f62afdf7bb1..1f1cbd11ff735 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -13932,8 +13932,9 @@ class Sema final { bool SemaBuiltinVAStart(unsigned BuiltinID, CallExpr *TheCall); bool SemaBuiltinVAStartARMMicrosoft(CallExpr *Call); - bool SemaBuiltinUnorderedCompare(CallExpr *TheCall); - bool SemaBuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs); + bool SemaBuiltinUnorderedCompare(CallExpr *TheCall, unsigned BuiltinID); + bool SemaBuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs, + unsigned BuiltinID); bool SemaBuiltinComplex(CallExpr *TheCall); bool SemaBuiltinVSX(CallExpr *TheCall); bool SemaBuiltinOSLogFormat(CallExpr *TheCall); @@ -14037,6 +14038,8 @@ class Sema final { SourceRange range, llvm::SmallBitVector &CheckedVarArgs); + void CheckInfNaNFunction(const CallExpr *Call, const FunctionDecl *FDecl); + void CheckAbsoluteValueFunction(const CallExpr *Call, const FunctionDecl *FDecl); diff --git a/clang/lib/Lex/Preprocessor.cpp b/clang/lib/Lex/Preprocessor.cpp index 64f54c6fc6382..7fdb5d4c0d7b8 100644 --- a/clang/lib/Lex/Preprocessor.cpp +++ b/clang/lib/Lex/Preprocessor.cpp @@ -1457,6 +1457,11 @@ void Preprocessor::emitRestrictExpansionWarning(const Token &Identifier) const { Diag(Info.Location, diag::note_pp_macro_annotation) << 1; } +void Preprocessor::emitRestrictInfNaNWarning(const Token &Identifier, + unsigned DiagSelection) const { + Diag(Identifier, diag::warn_fp_nan_inf_when_disabled) << DiagSelection << 1; +} + void Preprocessor::emitFinalMacroWarning(const Token &Identifier, bool IsUndef) const { const MacroAnnotations &A = diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 3e0d94e817634..28f5667a1b6ba 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2169,6 +2169,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, ICEArguments &= ~(1 << ArgNo); } + FPOptions FPO; switch (BuiltinID) { case Builtin::BI__builtin___CFStringMakeConstantString: // CFStringMakeConstantString is currently not implemented for GOFF (i.e., @@ -2245,15 +2246,15 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, case Builtin::BI__builtin_islessequal: case Builtin::BI__builtin_islessgreater: case Builtin::BI__builtin_isunordered: - if (SemaBuiltinUnorderedCompare(TheCall)) + if (SemaBuiltinUnorderedCompare(TheCall, BuiltinID)) return ExprError(); break; case Builtin::BI__builtin_fpclassify: - if (SemaBuiltinFPClassification(TheCall, 6)) + if (SemaBuiltinFPClassification(TheCall, 6, BuiltinID)) return ExprError(); break; case Builtin::BI__builtin_isfpclass: - if (SemaBuiltinFPClassification(TheCall, 2)) + if (SemaBuiltinFPClassification(TheCall, 2, BuiltinID)) return ExprError(); break; case Builtin::BI__builtin_isfinite: @@ -2267,7 +2268,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, case Builtin::BI__builtin_signbit: case Builtin::BI__builtin_signbitf: case Builtin::BI__builtin_signbitl: - if (SemaBuiltinFPClassification(TheCall, 1)) + if (SemaBuiltinFPClassification(TheCall, 1, BuiltinID)) return ExprError(); break; case Builtin::BI__builtin_shufflevector: @@ -7648,6 +7649,7 @@ bool Sema::CheckFunctionCall(FunctionDecl *FDecl, CallExpr *TheCall, CheckAbsoluteValueFunction(TheCall, FDecl); CheckMaxUnsignedZero(TheCall, FDecl); + CheckInfNaNFunction(TheCall, FDecl); if (getLangOpts().ObjC) 
DiagnoseCStringFormatDirectiveInCFAPI(*this, FDecl, Args, NumArgs); @@ -9117,10 +9119,15 @@ bool Sema::SemaBuiltinVAStartARMMicrosoft(CallExpr *Call) { /// SemaBuiltinUnorderedCompare - Handle functions like __builtin_isgreater and /// friends. This is declared to take (...), so we have to check everything. -bool Sema::SemaBuiltinUnorderedCompare(CallExpr *TheCall) { +bool Sema::SemaBuiltinUnorderedCompare(CallExpr *TheCall, unsigned BuiltinID) { if (checkArgCount(*this, TheCall, 2)) return true; + if (BuiltinID == Builtin::BI__builtin_isunordered && + TheCall->getFPFeaturesInEffect(getLangOpts()).getNoHonorNaNs()) + Diag(TheCall->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled) + << 1 << 0 << TheCall->getSourceRange(); + ExprResult OrigArg0 = TheCall->getArg(0); ExprResult OrigArg1 = TheCall->getArg(1); @@ -9155,10 +9162,23 @@ bool Sema::SemaBuiltinUnorderedCompare(CallExpr *TheCall) { /// SemaBuiltinSemaBuiltinFPClassification - Handle functions like /// __builtin_isnan and friends. This is declared to take (...), so we have /// to check everything. -bool Sema::SemaBuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs) { +bool Sema::SemaBuiltinFPClassification(CallExpr *TheCall, unsigned NumArgs, + unsigned BuiltinID) { if (checkArgCount(*this, TheCall, NumArgs)) return true; + FPOptions FPO = TheCall->getFPFeaturesInEffect(getLangOpts()); + if (FPO.getNoHonorInfs() && (BuiltinID == Builtin::BI__builtin_isfinite || + BuiltinID == Builtin::BI__builtin_isinf || + BuiltinID == Builtin::BI__builtin_isinf_sign)) + Diag(TheCall->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled) + << 0 << 0 << TheCall->getSourceRange(); + + if (FPO.getNoHonorNaNs() && (BuiltinID == Builtin::BI__builtin_isnan || + BuiltinID == Builtin::BI__builtin_isunordered)) + Diag(TheCall->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled) + << 1 << 0 << TheCall->getSourceRange(); + bool IsFPClass = NumArgs == 2; // Find out position of floating-point argument. @@ -12905,6 +12925,22 @@ static bool IsStdFunction(const FunctionDecl *FDecl, return true; } +void Sema::CheckInfNaNFunction(const CallExpr *Call, + const FunctionDecl *FDecl) { + FPOptions FPO = Call->getFPFeaturesInEffect(getLangOpts()); + if ((IsStdFunction(FDecl, "isnan") || IsStdFunction(FDecl, "isunordered") || + (Call->getBuiltinCallee() == Builtin::BI__builtin_nanf)) && + FPO.getNoHonorNaNs()) + Diag(Call->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled) + << 1 << 0 << Call->getSourceRange(); + else if ((IsStdFunction(FDecl, "isinf") || + (IsStdFunction(FDecl, "isfinite") || + (FDecl->getIdentifier() && FDecl->getName() == "infinity"))) && + FPO.getNoHonorInfs()) + Diag(Call->getBeginLoc(), diag::warn_fp_nan_inf_when_disabled) + << 0 << 0 << Call->getSourceRange(); +} + // Warn when using the wrong abs() function. 
void Sema::CheckAbsoluteValueFunction(const CallExpr *Call, const FunctionDecl *FDecl) { diff --git a/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp new file mode 100644 index 0000000000000..8a610fa0e737e --- /dev/null +++ b/clang/test/Sema/warn-infinity-nan-disabled-lnx.cpp @@ -0,0 +1,176 @@ +// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-infs -menable-no-nans + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown %s + +// RUN: %clang_cc1 -x c++ -verify=no-inf -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-infs + +// RUN: %clang_cc1 -x c++ -verify=no-nan -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-nans + +// no-fast-no-diagnostics + +int isunorderedf (float x, float y); +extern "C++" { +namespace std __attribute__((__visibility__("default"))) { + bool + isinf(float __x); + bool + isinf(double __x); + bool + isinf(long double __x); + bool + isnan(float __x); + bool + isnan(double __x); + bool + isnan(long double __x); +bool + isfinite(float __x); + bool + isfinite(double __x); + bool + isfinte(long double __x); + bool + isunordered(float __x, float __y); + bool + isunordered(double __x, double __y); + bool + isunordered(long double __x, long double __y); +} // namespace ) +} + +#define NAN (__builtin_nanf("")) +#define INFINITY (__builtin_inff()) + +template +class numeric_limits { +public: + [[nodiscard]] static constexpr _Ty infinity() noexcept { + return _Ty(); + } +}; + +template <> +class numeric_limits { +public: + [[nodiscard]] static constexpr float infinity() noexcept { + return __builtin_huge_val(); + } +}; +template <> +class numeric_limits { +public: + [[nodiscard]] static constexpr double infinity() noexcept { + return __builtin_huge_val(); + } +}; + +int compareit(float a, float b) { + volatile int i, j, k, l, m, n, o, p; +// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} + i = a == INFINITY; + +// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} + j = INFINITY == a; + +// no-inf-no-nan-warning@+4 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+3 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+2 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} + i = a == NAN; + +// no-inf-no-nan-warning@+4 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+3 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+2 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} + j = NAN == a; + +// no-inf-no-nan-warning@+2 {{use of 
infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} + j = INFINITY <= a; + +// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} + j = INFINITY < a; + +// no-inf-no-nan-warning@+4 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+3 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+2 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} + j = a > NAN; + +// no-inf-no-nan-warning@+4 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+3 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+2 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} + j = a >= NAN; + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + k = std::isinf(a); + +// no-inf-no-nan-warning@+2 {{use of NaN is undefined behavior due to the currently enabled floating-point option}} +// no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} + l = std::isnan(a); + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + o = std::isfinite(a); + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + m = __builtin_isinf(a); + +// no-inf-no-nan-warning@+2 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} + n = __builtin_isnan(a); + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + p = __builtin_isfinite(a); + + // These should NOT warn, since they are not using NaN or infinity. 
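+  // (isunorderedf is the local declaration above, not std::isunordered, so
+  // calls to it only warn when their arguments use the NAN or INFINITY
+  // macros, as exercised further below.)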
+ j = a > 1.1; + j = b < 1.1; + j = a >= 1.1; + j = b <= 1.1; + j = isunorderedf(a, b); + +// no-inf-no-nan-warning@+4 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+3 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+2 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} + j = isunorderedf(a, NAN); + +// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} + j = isunorderedf(a, INFINITY); + +// no-inf-no-nan-warning@+6 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+5 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+4 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+3 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+2 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} + i = std::isunordered(a, NAN); + +// no-inf-no-nan-warning@+4 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+3 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} + i = std::isunordered(a, INFINITY); + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + double y = i * numeric_limits::infinity(); + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + j = numeric_limits::infinity(); + return 0; + +} diff --git a/clang/test/Sema/warn-infinity-nan-disabled-win.cpp b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp new file mode 100644 index 0000000000000..19a575386e329 --- /dev/null +++ b/clang/test/Sema/warn-infinity-nan-disabled-win.cpp @@ -0,0 +1,179 @@ +// Use of NAN macro will trigger a warning "infinity defined in macro" because +// on Windows the NAN macro is defined using INFINITY. See below. 
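+// Consequently, under -menable-no-infs every NAN use in this file is also
+// flagged as 'use of infinity via a macro', unlike the Linux variant.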
+ +// RUN: %clang_cc1 -x c++ -verify=no-inf-no-nan -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-infs -menable-no-nans + +// RUN: %clang_cc1 -x c++ -verify=no-fast -triple powerpc64le-unknown-unknown %s + +// RUN: %clang_cc1 -x c++ -verify=no-inf -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-infs + +// RUN: %clang_cc1 -x c++ -verify=no-nan -triple powerpc64le-unknown-unknown %s \ +// RUN: -menable-no-nans + +// no-fast-no-diagnostics + +int isunorderedf (float x, float y); +extern "C++" { +namespace std __attribute__((__visibility__("default"))) { + bool + isinf(float __x); + bool + isinf(double __x); + bool + isinf(long double __x); + bool + isnan(float __x); + bool + isnan(double __x); + bool + isnan(long double __x); +bool + isfinite(float __x); + bool + isfinite(double __x); + bool + isfinte(long double __x); + bool + isunordered(float __x, float __y); + bool + isunordered(double __x, double __y); + bool + isunordered(long double __x, long double __y); +} // namespace ) +} + +#define INFINITY ((float)(1e+300 * 1e+300)) +#define NAN (-(float)(INFINITY * 0.0F)) + +template +class numeric_limits { +public: + [[nodiscard]] static constexpr _Ty infinity() noexcept { + return _Ty(); + } +}; + +template <> +class numeric_limits { +public: + [[nodiscard]] static constexpr float infinity() noexcept { + return __builtin_huge_val(); + } +}; +template <> +class numeric_limits { +public: + [[nodiscard]] static constexpr double infinity() noexcept { + return __builtin_huge_val(); + } +}; + +int compareit(float a, float b) { + volatile int i, j, k, l, m, n, o, p; +// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} + i = a == INFINITY; + +// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} + j = INFINITY == a; + +// no-inf-no-nan-warning@+4 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+3 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} + i = a == NAN; + +// no-inf-no-nan-warning@+4 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} + j = NAN == a; + +// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} + j = INFINITY <= a; + +// 
no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} + j = INFINITY < a; + +// no-inf-no-nan-warning@+4 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} + j = a > NAN; + +// no-inf-no-nan-warning@+4 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+3 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} + j = a >= NAN; + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + k = std::isinf(a); + +// no-inf-no-nan-warning@+2 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} + l = std::isnan(a); + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + o = std::isfinite(a); + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + m = __builtin_isinf(a); + +// no-inf-no-nan-warning@+2 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} + n = __builtin_isnan(a); + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + p = __builtin_isfinite(a); + + // These should NOT warn, since they are not using NaN or infinity. 
+ j = a > 1.1; + j = b < 1.1; + j = a >= 1.1; + j = b <= 1.1; + j = isunorderedf(a, b); + +// no-inf-no-nan-warning@+4 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point option}} +// no-inf-no-nan-warning@+3 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} + j = isunorderedf(a, NAN); + +// no-inf-no-nan-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} + j = isunorderedf(a, INFINITY); + +// no-inf-no-nan-warning@+6 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+5 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+4 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+3 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+2 {{use of NaN via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} + i = std::isunordered(a, NAN); + +// no-inf-no-nan-warning@+4 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-inf-no-nan-warning@+3 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+2 {{use of infinity via a macro is undefined behavior due to the currently enabled floating-point options}} +// no-nan-warning@+1 {{use of NaN is undefined behavior due to the currently enabled floating-point options}} + i = std::isunordered(a, INFINITY); + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + double y = i * numeric_limits::infinity(); + +// no-inf-no-nan-warning@+2 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} +// no-inf-warning@+1 {{use of infinity is undefined behavior due to the currently enabled floating-point options}} + j = numeric_limits::infinity(); + return 0; + +} From 02aa6956b5b6c92f3d4aea0aa466ed0d33343335 Mon Sep 17 00:00:00 2001 From: Jake Egan <5326451+jakeegan@users.noreply.github.com> Date: Mon, 22 Jan 2024 14:53:35 -0500 Subject: [PATCH 470/843] [clang][AIX] Only export libclang.map symbols from libclang (#78748) This will prevent unnecessary symbols being exported in libclang, which could cause issues with non-unique objects. 
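One way such issues can arise (hypothetical sketch, not code from this
patch): vague-linkage C++ objects are emitted into every library whose
sources include their header, and when each library also exports its copy,
clients may bind to different instances, so address-based identity silently
breaks.

    struct Registry {};
    inline Registry &getRegistry() {
      static Registry instance; // meant to be unique program-wide
      return instance;          // exporting every library's copy can
    }                           // yield several "unique" instances
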
--- clang/test/Index/Core/designated-inits.c | 1 - clang/test/Index/Core/external-source-symbol-attr-cxx.cpp | 1 - clang/test/Index/Core/external-source-symbol-attr.m | 1 - clang/test/Index/Core/index-dependent-source.cpp | 1 - clang/test/Index/Core/index-instantiated-source.cpp | 1 - clang/test/Index/Core/index-source-invalid-name.cpp | 1 - clang/test/Index/Core/index-source.cpp | 1 - clang/test/Index/Core/index-source.m | 1 - clang/test/Index/Core/index-source.mm | 1 - clang/test/Index/Core/index-subkinds.m | 1 - clang/test/Index/Core/index-with-module.m | 1 - clang/test/Index/ms-property.cpp | 1 - clang/test/Index/using_if_exists.cpp | 1 - clang/tools/libclang/CMakeLists.txt | 5 +++-- 14 files changed, 3 insertions(+), 15 deletions(-) diff --git a/clang/test/Index/Core/designated-inits.c b/clang/test/Index/Core/designated-inits.c index 10f48c68915fe..a31cb95a4b82d 100644 --- a/clang/test/Index/Core/designated-inits.c +++ b/clang/test/Index/Core/designated-inits.c @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- %s -target x86_64-apple-macosx10.7 | FileCheck %s struct MyStruct { diff --git a/clang/test/Index/Core/external-source-symbol-attr-cxx.cpp b/clang/test/Index/Core/external-source-symbol-attr-cxx.cpp index b3fb31c54a742..8ccf3c7d69496 100644 --- a/clang/test/Index/Core/external-source-symbol-attr-cxx.cpp +++ b/clang/test/Index/Core/external-source-symbol-attr-cxx.cpp @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- %s -target x86_64-apple-macosx10.7 | FileCheck %s #define GEN_DECL_USR(mod_name, usr) __attribute__((external_source_symbol(language="Swift", defined_in=mod_name, USR=usr, generated_declaration))) diff --git a/clang/test/Index/Core/external-source-symbol-attr.m b/clang/test/Index/Core/external-source-symbol-attr.m index fee80d3b0fdf6..1f907912737b0 100644 --- a/clang/test/Index/Core/external-source-symbol-attr.m +++ b/clang/test/Index/Core/external-source-symbol-attr.m @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- %s -target x86_64-apple-macosx10.7 | FileCheck %s #define EXT_DECL(mod_name) __attribute__((external_source_symbol(language="Swift", defined_in=mod_name))) diff --git a/clang/test/Index/Core/index-dependent-source.cpp b/clang/test/Index/Core/index-dependent-source.cpp index eabec29a1fd7b..8fec9abd1e926 100644 --- a/clang/test/Index/Core/index-dependent-source.cpp +++ b/clang/test/Index/Core/index-dependent-source.cpp @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- %s -std=c++14 -target x86_64-apple-macosx10.7 | FileCheck %s int invalid; diff --git a/clang/test/Index/Core/index-instantiated-source.cpp b/clang/test/Index/Core/index-instantiated-source.cpp index ccb39019ef067..2a67a3a3c7938 100644 --- a/clang/test/Index/Core/index-instantiated-source.cpp +++ b/clang/test/Index/Core/index-instantiated-source.cpp @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- %s -std=c++14 -target x86_64-apple-macosx10.7 | FileCheck %s // References to declarations in instantiations should be canonicalized: diff --git a/clang/test/Index/Core/index-source-invalid-name.cpp b/clang/test/Index/Core/index-source-invalid-name.cpp index 034fd6b7009b6..1b4b059cd1b36 100644 --- a/clang/test/Index/Core/index-source-invalid-name.cpp +++ b/clang/test/Index/Core/index-source-invalid-name.cpp @@ -1,4 +1,3 @@ -// XFAIL: 
target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- %s -std=c++1z -target x86_64-apple-macosx10.7 | FileCheck %s namespace rdar32474406 { diff --git a/clang/test/Index/Core/index-source.cpp b/clang/test/Index/Core/index-source.cpp index ed7474499dc72..8f9fbc4c8d29c 100644 --- a/clang/test/Index/Core/index-source.cpp +++ b/clang/test/Index/Core/index-source.cpp @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- %s -std=c++1z -target x86_64-apple-macosx10.7 | FileCheck %s // RUN: c-index-test core -print-source-symbols -include-locals -- %s -std=c++1z -target x86_64-apple-macosx10.7 | FileCheck -check-prefix=LOCAL %s diff --git a/clang/test/Index/Core/index-source.m b/clang/test/Index/Core/index-source.m index a551fe965bb0b..07c7d207607cb 100644 --- a/clang/test/Index/Core/index-source.m +++ b/clang/test/Index/Core/index-source.m @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- %s -target x86_64-apple-macosx10.7 | FileCheck %s // RUN: c-index-test core -print-source-symbols -include-locals -- %s -target x86_64-apple-macosx10.7 | FileCheck -check-prefix=LOCAL %s diff --git a/clang/test/Index/Core/index-source.mm b/clang/test/Index/Core/index-source.mm index 659e16f013290..049a0bdaf6474 100644 --- a/clang/test/Index/Core/index-source.mm +++ b/clang/test/Index/Core/index-source.mm @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- %s -target x86_64-apple-macosx10.7 | FileCheck %s @interface MyCls diff --git a/clang/test/Index/Core/index-subkinds.m b/clang/test/Index/Core/index-subkinds.m index d95dfe746b603..5eea046721b6c 100644 --- a/clang/test/Index/Core/index-subkinds.m +++ b/clang/test/Index/Core/index-subkinds.m @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- %s -target x86_64-apple-macosx10.7 | FileCheck %s // CHECK: [[@LINE+1]]:12 | class/ObjC | XCTestCase | c:objc(cs)XCTestCase | _OBJC_CLASS_$_XCTestCase | Decl | rel: 0 diff --git a/clang/test/Index/Core/index-with-module.m b/clang/test/Index/Core/index-with-module.m index 5258e64c1941e..b116768fdb557 100644 --- a/clang/test/Index/Core/index-with-module.m +++ b/clang/test/Index/Core/index-with-module.m @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: rm -rf %t.mcp // RUN: c-index-test core -print-source-symbols -dump-imported-module-files -- %s -I %S/Inputs/module -fmodules -fmodules-cache-path=%t.mcp | FileCheck %s diff --git a/clang/test/Index/ms-property.cpp b/clang/test/Index/ms-property.cpp index d30d30d033d88..74b5b1399cf50 100644 --- a/clang/test/Index/ms-property.cpp +++ b/clang/test/Index/ms-property.cpp @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- -target x86_64-apple-darwin10 -fms-extensions -fno-ms-compatibility %s | FileCheck %s // CHECK: [[@LINE+1]]:8 | struct/C++ | Simple | [[Simple_USR:.*]] | | Def | rel: 0 diff --git a/clang/test/Index/using_if_exists.cpp b/clang/test/Index/using_if_exists.cpp index 73d1be739e428..ed13dad9b1f74 100644 --- a/clang/test/Index/using_if_exists.cpp +++ b/clang/test/Index/using_if_exists.cpp @@ -1,4 +1,3 @@ -// XFAIL: target={{.*}}-aix{{.*}} // RUN: c-index-test core -print-source-symbols -- %s -target x86_64-unknown-unknown 2>&1 | FileCheck %s namespace ns { diff --git a/clang/tools/libclang/CMakeLists.txt b/clang/tools/libclang/CMakeLists.txt index 1cfc46eb1a52f..b5b6d2807d714 100644 --- 
a/clang/tools/libclang/CMakeLists.txt +++ b/clang/tools/libclang/CMakeLists.txt @@ -97,7 +97,7 @@ if(MSVC) set(LLVM_EXPORTED_SYMBOL_FILE) endif() -if (UNIX AND NOT APPLE) +if (UNIX AND NOT APPLE AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "AIX") set(LLVM_EXPORTED_SYMBOL_FILE) set(USE_VERSION_SCRIPT ${LLVM_HAVE_LINK_VERSION_SCRIPT}) endif() @@ -129,8 +129,9 @@ else() set(output_name "clang") endif() -# libclang requires headers which need _ALL_SOURCE to build on AIX if (UNIX AND ${CMAKE_SYSTEM_NAME} MATCHES "AIX") + set(CMAKE_AIX_EXPORT_ALL_SYMBOLS OFF) + # libclang requires headers which need _ALL_SOURCE to build on AIX remove_definitions("-D_XOPEN_SOURCE=700") endif() From 4f21fb84479286ddc781d73f8df152f81a8264e7 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu <274595+qiongsiwu@users.noreply.github.com> Date: Mon, 22 Jan 2024 14:54:58 -0500 Subject: [PATCH 471/843] [PGO] Reland PGO's Counter Reset and File Dumping APIs #76471 (#78285) https://github.com/llvm/llvm-project/pull/76471 caused buildbot failures on Windows. For more details, see https://github.com/llvm/llvm-project/issues/77546. This PR revises the test and relands https://github.com/llvm/llvm-project/pull/76471. --- .../ExpandModularHeadersPPCallbacks.cpp | 2 +- clang/docs/UsersManual.rst | 104 ++++++++++++++++++ clang/include/clang/Basic/CodeGenOptions.h | 3 + clang/include/clang/Frontend/Utils.h | 4 +- clang/lib/Frontend/CompilerInstance.cpp | 2 +- clang/lib/Frontend/InitPreprocessor.cpp | 23 +++- clang/test/Profile/c-general.c | 10 ++ compiler-rt/include/CMakeLists.txt | 1 + .../include/profile/instr_prof_interface.h | 92 ++++++++++++++++ compiler-rt/lib/profile/InstrProfiling.h | 61 ++-------- .../profile/Linux/instrprof-weak-symbol.c | 16 +++ compiler-rt/test/profile/instrprof-api.c | 49 +++++++++ 12 files changed, 310 insertions(+), 57 deletions(-) create mode 100644 compiler-rt/include/profile/instr_prof_interface.h create mode 100644 compiler-rt/test/profile/Linux/instrprof-weak-symbol.c create mode 100644 compiler-rt/test/profile/instrprof-api.c diff --git a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp index e414ac8c77050..5ecd4fb19131e 100644 --- a/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp +++ b/clang-tools-extra/clang-tidy/ExpandModularHeadersPPCallbacks.cpp @@ -100,7 +100,7 @@ ExpandModularHeadersPPCallbacks::ExpandModularHeadersPPCallbacks( /*OwnsHeaderSearch=*/false); PP->Initialize(Compiler.getTarget(), Compiler.getAuxTarget()); InitializePreprocessor(*PP, *PO, Compiler.getPCHContainerReader(), - Compiler.getFrontendOpts()); + Compiler.getFrontendOpts(), Compiler.getCodeGenOpts()); ApplyHeaderSearchOptions(*HeaderInfo, *HSO, LangOpts, Compiler.getTarget().getTriple()); } diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index 881d903d91a7e..ff2d4a68b8e55 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -2809,6 +2809,110 @@ indexed format, regardeless whether it is produced by frontend or the IR pass. overhead. ``prefer-atomic`` will be transformed to ``atomic`` when supported by the target, or ``single`` otherwise. +Fine Tuning Profile Collection +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The PGO infrastructure provides user program knobs to fine tune profile +collection. Specifically, the PGO runtime provides the following functions +that can be used to control the regions in the program where profiles should +be collected. 
+
+  * ``void __llvm_profile_set_filename(const char *Name)``: changes the name of
+    the profile file to ``Name``.
+  * ``void __llvm_profile_reset_counters(void)``: resets all counters to zero.
+  * ``int __llvm_profile_dump(void)``: writes the profile data to disk.
+  * ``int __llvm_orderfile_dump(void)``: writes the order file to disk.
+
+For example, the following pattern can be used to skip profiling program
+initialization, profile two specific hot regions, and skip profiling program
+cleanup:
+
+.. code-block:: c
+
+  int main() {
+    initialize();
+
+    // Reset all profile counters to 0 to omit profile collected during
+    // initialize()'s execution.
+    __llvm_profile_reset_counters();
+    ... hot region 1
+    // Dump the profile for hot region 1.
+    __llvm_profile_set_filename("region1.profraw");
+    __llvm_profile_dump();
+
+    // Reset counters before proceeding to hot region 2.
+    __llvm_profile_reset_counters();
+    ... hot region 2
+    // Dump the profile for hot region 2.
+    __llvm_profile_set_filename("region2.profraw");
+    __llvm_profile_dump();
+
+    // Since the profile has been dumped, no further profile data
+    // will be collected beyond the above __llvm_profile_dump().
+    cleanup();
+    return 0;
+  }
+
+These APIs' names can be introduced to user programs in two ways.
+They can be declared as weak symbols on platforms that support
+treating weak symbols as ``null`` during linking. For example, the user can
+have
+
+.. code-block:: c
+
+  __attribute__((weak)) int __llvm_profile_dump(void);
+
+  // Then later in the same source file
+  if (__llvm_profile_dump)
+    if (__llvm_profile_dump() != 0) { ... }
+  // The first if condition tests if the symbol is actually defined.
+  // Profile dumping only happens if the symbol is defined. Hence,
+  // the user program works correctly during normal (not profile-generate)
+  // executions.
+
+Alternatively, the user program can include the header
+``profile/instr_prof_interface.h``, which contains the API names. For example,
+
+.. code-block:: c
+
+  #include "profile/instr_prof_interface.h"
+
+  // Then later in the same source file
+  if (__llvm_profile_dump() != 0) { ... }
+
+The user code does not need to check if the API names are defined, because
+these names are automatically replaced by ``(0)`` or the equivalent of a no-op
+if ``clang`` is not compiling for profile generation.
+
+Such replacement can happen because ``clang`` adds one of two macros depending
+on the ``-fprofile-generate`` and the ``-fprofile-use`` flags.
+
+ * ``__LLVM_INSTR_PROFILE_GENERATE``: defined when one of
+   ``-fprofile[-instr]-generate``/``-fcs-profile-generate`` is in effect.
+ * ``__LLVM_INSTR_PROFILE_USE``: defined when one of
+   ``-fprofile-use``/``-fprofile-instr-use`` is in effect.
+
+The two macros can be used to provide more flexibility so a user program
+can execute code specifically intended for profile generation or profile use.
+For example, a user program can have special logging during profile generation:
+
+.. code-block:: c
+
+  #if __LLVM_INSTR_PROFILE_GENERATE
+  expensive_logging_of_full_program_state();
+  #endif
+
+The logging is automatically excluded during a normal build of the program,
+hence it does not impact performance during a normal execution.
+
+It is advised to use such fine tuning only in a program's cold regions.
The weak +symbols can introduce extra control flow (the ``if`` checks), while the macros +(hence declarations they guard in ``profile/instr_prof_interface.h``) +can change the control flow of the functions that use them between profile +generation and profile use (which can lead to discarded counters in such +functions). Using these APIs in the program's cold regions introduces less +overhead and leads to more optimized code. + Disabling Instrumentation ^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index 6952b48e898a8..3f8fe385fef3d 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -494,6 +494,9 @@ class CodeGenOptions : public CodeGenOptionsBase { return getProfileInstr() == ProfileCSIRInstr; } + /// Check if any form of instrumentation is on. + bool hasProfileInstr() const { return getProfileInstr() != ProfileNone; } + /// Check if Clang profile use is on. bool hasProfileClangUse() const { return getProfileUse() == ProfileClangInstr; diff --git a/clang/include/clang/Frontend/Utils.h b/clang/include/clang/Frontend/Utils.h index 143cf4359f00b..604e42067a3f1 100644 --- a/clang/include/clang/Frontend/Utils.h +++ b/clang/include/clang/Frontend/Utils.h @@ -43,12 +43,14 @@ class PCHContainerReader; class Preprocessor; class PreprocessorOptions; class PreprocessorOutputOptions; +class CodeGenOptions; /// InitializePreprocessor - Initialize the preprocessor getting it and the /// environment ready to process a single file. void InitializePreprocessor(Preprocessor &PP, const PreprocessorOptions &PPOpts, const PCHContainerReader &PCHContainerRdr, - const FrontendOptions &FEOpts); + const FrontendOptions &FEOpts, + const CodeGenOptions &CodeGenOpts); /// DoPrintPreprocessedInput - Implement -E mode. void DoPrintPreprocessedInput(Preprocessor &PP, raw_ostream *OS, diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index c258870072613..a25aa88bd85ef 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -470,7 +470,7 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { // Predefine macros and configure the preprocessor. InitializePreprocessor(*PP, PPOpts, getPCHContainerReader(), - getFrontendOpts()); + getFrontendOpts(), getCodeGenOpts()); // Initialize the header search object. In CUDA compilations, we use the aux // triple (the host triple) to initialize our header search, since we need to diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index d83128adb511e..fe0fd3614113c 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -1364,12 +1364,22 @@ static void InitializePredefinedMacros(const TargetInfo &TI, TI.getTargetDefines(LangOpts, Builder); } +static void InitializePGOProfileMacros(const CodeGenOptions &CodeGenOpts, + MacroBuilder &Builder) { + if (CodeGenOpts.hasProfileInstr()) + Builder.defineMacro("__LLVM_INSTR_PROFILE_GENERATE"); + + if (CodeGenOpts.hasProfileIRUse() || CodeGenOpts.hasProfileClangUse()) + Builder.defineMacro("__LLVM_INSTR_PROFILE_USE"); +} + /// InitializePreprocessor - Initialize the preprocessor getting it and the /// environment ready to process a single file. 
-void clang::InitializePreprocessor( - Preprocessor &PP, const PreprocessorOptions &InitOpts, - const PCHContainerReader &PCHContainerRdr, - const FrontendOptions &FEOpts) { +void clang::InitializePreprocessor(Preprocessor &PP, + const PreprocessorOptions &InitOpts, + const PCHContainerReader &PCHContainerRdr, + const FrontendOptions &FEOpts, + const CodeGenOptions &CodeGenOpts) { const LangOptions &LangOpts = PP.getLangOpts(); std::string PredefineBuffer; PredefineBuffer.reserve(4080); @@ -1416,6 +1426,11 @@ void clang::InitializePreprocessor( InitializeStandardPredefinedMacros(PP.getTargetInfo(), PP.getLangOpts(), FEOpts, Builder); + // The PGO instrumentation profile macros are driven by options + // -fprofile[-instr]-generate/-fcs-profile-generate/-fprofile[-instr]-use, + // hence they are not guarded by InitOpts.UsePredefines. + InitializePGOProfileMacros(CodeGenOpts, Builder); + // Add on the predefines from the driver. Wrap in a #line directive to report // that they come from the command line. Builder.append("# 1 \"\" 1"); diff --git a/clang/test/Profile/c-general.c b/clang/test/Profile/c-general.c index b841f9c3d2a1d..2f621ec9b0bf9 100644 --- a/clang/test/Profile/c-general.c +++ b/clang/test/Profile/c-general.c @@ -9,6 +9,16 @@ // Also check compatibility with older profiles. // RUN: %clang_cc1 -triple x86_64-apple-macosx10.9 -main-file-name c-general.c %s -o - -emit-llvm -fprofile-instrument-use-path=%S/Inputs/c-general.profdata.v1 | FileCheck -allow-deprecated-dag-overlap -check-prefix=PGOUSE %s +// RUN: %clang -fprofile-generate -E -dM %s | FileCheck -match-full-lines -check-prefix=PROFGENMACRO %s +// RUN: %clang -fprofile-instr-generate -E -dM %s | FileCheck -match-full-lines -check-prefix=PROFGENMACRO %s +// RUN: %clang -fcs-profile-generate -E -dM %s | FileCheck -match-full-lines -check-prefix=PROFGENMACRO %s +// +// RUN: %clang -fprofile-use=%t.profdata -E -dM %s | FileCheck -match-full-lines -check-prefix=PROFUSEMACRO %s +// RUN: %clang -fprofile-instr-use=%t.profdata -E -dM %s | FileCheck -match-full-lines -check-prefix=PROFUSEMACRO %s + +// PROFGENMACRO:#define __LLVM_INSTR_PROFILE_GENERATE 1 +// PROFUSEMACRO:#define __LLVM_INSTR_PROFILE_USE 1 + // PGOGEN: @[[SLC:__profc_simple_loops]] = private global [4 x i64] zeroinitializer // PGOGEN: @[[IFC:__profc_conditionals]] = private global [13 x i64] zeroinitializer // PGOGEN: @[[EEC:__profc_early_exits]] = private global [9 x i64] zeroinitializer diff --git a/compiler-rt/include/CMakeLists.txt b/compiler-rt/include/CMakeLists.txt index 78427beedb3cc..7a100c66bbcfd 100644 --- a/compiler-rt/include/CMakeLists.txt +++ b/compiler-rt/include/CMakeLists.txt @@ -44,6 +44,7 @@ endif(COMPILER_RT_BUILD_ORC) if (COMPILER_RT_BUILD_PROFILE) set(PROFILE_HEADERS profile/InstrProfData.inc + profile/instr_prof_interface.h ) endif(COMPILER_RT_BUILD_PROFILE) diff --git a/compiler-rt/include/profile/instr_prof_interface.h b/compiler-rt/include/profile/instr_prof_interface.h new file mode 100644 index 0000000000000..be40f2685934b --- /dev/null +++ b/compiler-rt/include/profile/instr_prof_interface.h @@ -0,0 +1,92 @@ +/*===---- instr_prof_interface.h - Instrumentation PGO User Program API ----=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. 
+ * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + * + * This header provides a public interface for fine-grained control of counter + * reset and profile dumping. These interface functions can be directly called + * in user programs. + * +\*===---------------------------------------------------------------------===*/ + +#ifndef COMPILER_RT_INSTR_PROFILING +#define COMPILER_RT_INSTR_PROFILING + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __LLVM_INSTR_PROFILE_GENERATE +// Profile file reset and dump interfaces. +// When `-fprofile[-instr]-generate`/`-fcs-profile-generate` is in effect, +// clang defines __LLVM_INSTR_PROFILE_GENERATE to pick up the API calls. + +/*! + * \brief Set the filename for writing instrumentation data. + * + * Sets the filename to be used for subsequent calls to + * \a __llvm_profile_write_file(). + * + * \c Name is not copied, so it must remain valid. Passing NULL resets the + * filename logic to the default behaviour. + * + * Note: There may be multiple copies of the profile runtime (one for each + * instrumented image/DSO). This API only modifies the filename within the + * copy of the runtime available to the calling image. + * + * Warning: This is a no-op if continuous mode (\ref + * __llvm_profile_is_continuous_mode_enabled) is on. The reason for this is + * that in continuous mode, profile counters are mmap()'d to the profile at + * program initialization time. Support for transferring the mmap'd profile + * counts to a new file has not been implemented. + */ +void __llvm_profile_set_filename(const char *Name); + +/*! + * \brief Interface to set all PGO counters to zero for the current process. + * + */ +void __llvm_profile_reset_counters(void); + +/*! + * \brief this is a wrapper interface to \c __llvm_profile_write_file. + * After this interface is invoked, an already dumped flag will be set + * so that profile won't be dumped again during program exit. + * Invocation of interface __llvm_profile_reset_counters will clear + * the flag. This interface is designed to be used to collect profile + * data from user selected hot regions. The use model is + * __llvm_profile_reset_counters(); + * ... hot region 1 + * __llvm_profile_dump(); + * .. some other code + * __llvm_profile_reset_counters(); + * ... hot region 2 + * __llvm_profile_dump(); + * + * It is expected that on-line profile merging is on with \c %m specifier + * used in profile filename . If merging is not turned on, user is expected + * to invoke __llvm_profile_set_filename to specify different profile names + * for different regions before dumping to avoid profile write clobbering. + */ +int __llvm_profile_dump(void); + +// Interface to dump the current process' order file to disk. 
+int __llvm_orderfile_dump(void); + +#else + +#define __llvm_profile_set_filename(Name) +#define __llvm_profile_reset_counters() +#define __llvm_profile_dump() (0) +#define __llvm_orderfile_dump() (0) + +#endif + +#ifdef __cplusplus +} // extern "C" +#endif + +#endif diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 137115996748c..0123908336918 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -12,6 +12,17 @@ #include "InstrProfilingPort.h" #include +// Make sure __LLVM_INSTR_PROFILE_GENERATE is always defined before +// including instr_prof_interface.h so the interface functions are +// declared correctly for the runtime. +// __LLVM_INSTR_PROFILE_GENERATE is always `#undef`ed after the header, +// because compiler-rt does not support profiling the profiling runtime itself. +#ifndef __LLVM_INSTR_PROFILE_GENERATE +#define __LLVM_INSTR_PROFILE_GENERATE +#endif +#include "profile/instr_prof_interface.h" +#undef __LLVM_INSTR_PROFILE_GENERATE + #define INSTR_PROF_VISIBILITY COMPILER_RT_VISIBILITY #include "profile/InstrProfData.inc" @@ -100,12 +111,6 @@ ValueProfNode *__llvm_profile_begin_vnodes(); ValueProfNode *__llvm_profile_end_vnodes(); uint32_t *__llvm_profile_begin_orderfile(); -/*! - * \brief Clear profile counters to zero. - * - */ -void __llvm_profile_reset_counters(void); - /*! * \brief Merge profile data from buffer. * @@ -156,50 +161,6 @@ void __llvm_profile_instrument_target_value(uint64_t TargetValue, void *Data, int __llvm_profile_write_file(void); int __llvm_orderfile_write_file(void); -/*! - * \brief this is a wrapper interface to \c __llvm_profile_write_file. - * After this interface is invoked, an already dumped flag will be set - * so that profile won't be dumped again during program exit. - * Invocation of interface __llvm_profile_reset_counters will clear - * the flag. This interface is designed to be used to collect profile - * data from user selected hot regions. The use model is - * __llvm_profile_reset_counters(); - * ... hot region 1 - * __llvm_profile_dump(); - * .. some other code - * __llvm_profile_reset_counters(); - * ... hot region 2 - * __llvm_profile_dump(); - * - * It is expected that on-line profile merging is on with \c %m specifier - * used in profile filename . If merging is not turned on, user is expected - * to invoke __llvm_profile_set_filename to specify different profile names - * for different regions before dumping to avoid profile write clobbering. - */ -int __llvm_profile_dump(void); - -int __llvm_orderfile_dump(void); - -/*! - * \brief Set the filename for writing instrumentation data. - * - * Sets the filename to be used for subsequent calls to - * \a __llvm_profile_write_file(). - * - * \c Name is not copied, so it must remain valid. Passing NULL resets the - * filename logic to the default behaviour. - * - * Note: There may be multiple copies of the profile runtime (one for each - * instrumented image/DSO). This API only modifies the filename within the - * copy of the runtime available to the calling image. - * - * Warning: This is a no-op if continuous mode (\ref - * __llvm_profile_is_continuous_mode_enabled) is on. The reason for this is - * that in continuous mode, profile counters are mmap()'d to the profile at - * program initialization time. Support for transferring the mmap'd profile - * counts to a new file has not been implemented. - */ -void __llvm_profile_set_filename(const char *Name); /*! 
* \brief Set the FILE object for writing instrumentation data. Return 0 if set diff --git a/compiler-rt/test/profile/Linux/instrprof-weak-symbol.c b/compiler-rt/test/profile/Linux/instrprof-weak-symbol.c new file mode 100644 index 0000000000000..eda299cb6610e --- /dev/null +++ b/compiler-rt/test/profile/Linux/instrprof-weak-symbol.c @@ -0,0 +1,16 @@ +// Test the linker feature that treats undefined weak symbols as null values. + +// RUN: %clang_pgogen -o %t %s +// RUN: not %t +// RUN: %clang -o %t %s +// RUN: %t + +__attribute__((weak)) void __llvm_profile_reset_counters(void); + +int main() { + if (__llvm_profile_reset_counters) { + __llvm_profile_reset_counters(); + return 1; + } + return 0; +} diff --git a/compiler-rt/test/profile/instrprof-api.c b/compiler-rt/test/profile/instrprof-api.c new file mode 100644 index 0000000000000..fedec2d1afc48 --- /dev/null +++ b/compiler-rt/test/profile/instrprof-api.c @@ -0,0 +1,49 @@ +// UNSUPPORTED: target={{.*windows.*}} +// __llvm_orderfile_dump() is not supported on Windows. + +// Testing profile generate. +// RUN: %clang_profgen %s -S -emit-llvm -o - | FileCheck %s --check-prefix=PROFGEN +// RUN: %clang_pgogen %s -S -emit-llvm -o - | FileCheck %s --check-prefix=PROFGEN + +// Testing profile use. Generate some profile file first. +// RUN: rm -rf rawprof.profraw +// RUN: %clang_profgen -o %t1 %s +// RUN: %run %t1 +// RUN: llvm-profdata merge -o %t1.profdata rawprof.profraw +// RUN: %clang_profuse=%t1.profdata %s -S -emit-llvm -o - | FileCheck %s --check-prefix=PROFUSE +// RUN: rm -rf rawprof.profraw +// RUN: %clang_pgogen -o %t2 %s +// RUN: %run %t2 +// RUN: llvm-profdata merge -o %t2.profdata rawprof.profraw +// RUN: %clang_pgouse=%t2.profdata %s -S -emit-llvm -o - | FileCheck %s --check-prefix=PROFUSE +#include "profile/instr_prof_interface.h" + +__attribute__((noinline)) int bar() { return 4; } + +int foo() { + __llvm_profile_reset_counters(); + // PROFGEN: call void @__llvm_profile_reset_counters() + // PROFUSE-NOT: call void @__llvm_profile_reset_counters() + return bar(); +} + +// PROFUSE-NOT: declare void @__llvm_profile_reset_counters() + +int main() { + int z = foo() + 3; + __llvm_profile_set_filename("rawprof.profraw"); + // PROFGEN: call void @__llvm_profile_set_filename(ptr noundef @{{.*}}) + // PROFUSE-NOT: call void @__llvm_profile_set_filename(ptr noundef @{{.*}}) + if (__llvm_profile_dump()) + return 2; + // PROFGEN: %{{.*}} = call {{(signext )*}}i32 @__llvm_profile_dump() + // PROFUSE-NOT: %{{.*}} = call {{(signext )*}}i32 @__llvm_profile_dump() + __llvm_orderfile_dump(); + // PROFGEN: %{{.*}} = call {{(signext )*}}i32 @__llvm_orderfile_dump() + // PROFUSE-NOT: %{{.*}} = call {{(signext )*}}i32 @__llvm_orderfile_dump() + return z + bar() - 11; +} + +// PROFUSE-NOT: declare void @__llvm_profile_set_filename(ptr noundef) +// PROFUSE-NOT: declare signext i32 @__llvm_profile_dump() +// PROFUSE-NOT: declare signext i32 @__llvm_orderfile_dump() From e3172e841899779caf461cee97826940dca806e7 Mon Sep 17 00:00:00 2001 From: Andrzej Warzynski Date: Mon, 22 Jan 2024 20:01:45 +0000 Subject: [PATCH 472/843] [mlir] Update "UNSUPPORTED" directive in a test Add missing "arm64" target to the list of unsupported targets ("arm64" is used on Darwin). 
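For context: lit publishes the configured triple as a `target=<triple>`
feature, and an `UNSUPPORTED:` expression matches it with a regex, so the same
CPU needs both spellings. A Linux host reports a triple like
`aarch64-unknown-linux-gnu` while a Darwin host reports one like
`arm64-apple-darwin{{.*}}` (both triples here are illustrative). The resulting
directive looks like:

    // UNSUPPORTED: target=aarch64{{.*}}, target=arm64{{.*}}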
--- mlir/test/mlir-cpu-runner/global-constructors.mlir | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/test/mlir-cpu-runner/global-constructors.mlir b/mlir/test/mlir-cpu-runner/global-constructors.mlir index fe1fe06574648..0443e4d51e9ad 100644 --- a/mlir/test/mlir-cpu-runner/global-constructors.mlir +++ b/mlir/test/mlir-cpu-runner/global-constructors.mlir @@ -1,4 +1,4 @@ -// UNSUPPORTED: target=aarch64{{.*}} +// UNSUPPORTED: target=aarch64{{.*}}, target=arm64{{.*}} // RUN: mlir-cpu-runner %s -e entry -entry-point-result=void \ // RUN: -shared-libs=%mlir_c_runner_utils | \ // RUN: FileCheck %s From 83f82a9efdb4492659396b9a0bcdf64371752cd9 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 22 Jan 2024 15:06:30 -0500 Subject: [PATCH 473/843] [gn] port 4f21fb844792 --- llvm/utils/gn/secondary/compiler-rt/include/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/compiler-rt/include/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/include/BUILD.gn index bd88978c105ce..485beabd477a1 100644 --- a/llvm/utils/gn/secondary/compiler-rt/include/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/include/BUILD.gn @@ -6,6 +6,7 @@ copy("include") { "orc_rt/c_api.h", "profile/InstrProfData.inc", "profile/MemProfData.inc", + "profile/instr_prof_interface.h", "sanitizer/allocator_interface.h", "sanitizer/asan_interface.h", "sanitizer/common_interface_defs.h", From 0ab9c382d4ca070cf2335030a689b505bbd967c6 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 22 Jan 2024 20:06:43 +0000 Subject: [PATCH 474/843] [gn build] Port 06c3c3b67cb0 --- .../gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn index 7f9302e06f8b7..1863aca2bd3ab 100644 --- a/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn +++ b/llvm/utils/gn/secondary/clang-tools-extra/clang-tidy/bugprone/BUILD.gn @@ -24,6 +24,7 @@ static_library("bugprone") { "BranchCloneCheck.cpp", "BugproneTidyModule.cpp", "CastingThroughVoidCheck.cpp", + "ChainedComparisonCheck.cpp", "ComparePointerToMemberVirtualFunctionCheck.cpp", "CopyConstructorInitCheck.cpp", "DanglingHandleCheck.cpp", From 02a28ee8d981b2a8416c50e41c74cb21e73b8340 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Mon, 22 Jan 2024 21:19:29 +0100 Subject: [PATCH 475/843] [Clang] Update feature test macros for Clang 18 (#78991) * Set `__cpp_auto_cast`, as per https://github.com/cplusplus/CWG/issues/281 * Support `__has_extension(cxx_generalized_nttp)` in C++20 as the feature isn't stable enough for a feature test macro * Support `__has_extension(cxx_explicit_this_parameter)` in c++23 as the feature isn't stable enough for a feature test macro --- clang/docs/ReleaseNotes.rst | 6 +++++- clang/include/clang/Basic/Features.def | 4 ++++ clang/lib/Frontend/InitPreprocessor.cpp | 1 + clang/test/Lexer/cxx-features.cpp | 5 +++++ clang/test/Lexer/has_extension_cxx.cpp | 16 ++++++++++++++++ 5 files changed, 31 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 13e6c5e8db3ea..0b37b5fc29260 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -185,7 +185,8 @@ C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ - Implemented `P1907R1 ` which extends allowed non-type template argument kinds with e.g. floating point values and pointers and references to subobjects. 
- This feature is still experimental. Accordingly, `__cpp_nontype_template_args` was not updated. + This feature is still experimental. Accordingly, ``__cpp_nontype_template_args`` was not updated. + However, its support can be tested with ``__has_extension(cxx_generalized_nttp)``. C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ @@ -194,6 +195,7 @@ C++23 Feature Support `CWG2653 `_, `CWG2687 `_). Because the support for this feature is still experimental, the feature test macro ``__cpp_explicit_this_parameter`` was not set in this version. + However, its support can be tested with ``__has_extension(cxx_explicit_this_parameter)``. - Added a separate warning to warn the use of attributes on lambdas as a C++23 extension in previous language versions: ``-Wc++23-lambda-attributes``. @@ -1045,6 +1047,8 @@ Bug Fixes to C++ Support - Remove recorded `#pragma once` state for headers included in named modules. Fixes (`#77995 `_) +- Set the ``__cpp_auto_cast`` feature test macro in C++23 mode. + Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ - Fixed an import failure of recursive friend class template. diff --git a/clang/include/clang/Basic/Features.def b/clang/include/clang/Basic/Features.def index 06efac0cf1abd..5fad5fc3623cb 100644 --- a/clang/include/clang/Basic/Features.def +++ b/clang/include/clang/Basic/Features.def @@ -269,6 +269,10 @@ EXTENSION(cxx_fixed_enum, true) EXTENSION(cxx_binary_literals, true) EXTENSION(cxx_init_captures, LangOpts.CPlusPlus11) EXTENSION(cxx_variable_templates, LangOpts.CPlusPlus) +//C++20 +EXTENSION(cxx_generalized_nttp, LangOpts.CPlusPlus20) +//C++23 +EXTENSION(cxx_explicit_this_parameter, LangOpts.CPlusPlus23) // Miscellaneous language extensions EXTENSION(overloadable_unmarked, true) EXTENSION(pragma_clang_attribute_namespaces, true) diff --git a/clang/lib/Frontend/InitPreprocessor.cpp b/clang/lib/Frontend/InitPreprocessor.cpp index fe0fd3614113c..1b91c86f91398 100644 --- a/clang/lib/Frontend/InitPreprocessor.cpp +++ b/clang/lib/Frontend/InitPreprocessor.cpp @@ -728,6 +728,7 @@ static void InitializeCPlusPlusFeatureTestMacros(const LangOptions &LangOpts, Builder.defineMacro("__cpp_size_t_suffix", "202011L"); Builder.defineMacro("__cpp_if_consteval", "202106L"); Builder.defineMacro("__cpp_multidimensional_subscript", "202211L"); + Builder.defineMacro("__cpp_auto_cast", "202110L"); } // We provide those C++23 features as extensions in earlier language modes, so diff --git a/clang/test/Lexer/cxx-features.cpp b/clang/test/Lexer/cxx-features.cpp index 6120b299039e5..eb0e615f9470b 100644 --- a/clang/test/Lexer/cxx-features.cpp +++ b/clang/test/Lexer/cxx-features.cpp @@ -40,6 +40,11 @@ // --- C++23 features --- +#if check(auto_cast, 0, 0, 0, 0, 0, 202110, 202110) +#error "wrong value for __cpp_auto_cast" +#endif + + #if check(implicit_move, 0, 0, 0, 0, 0, 202011, 202011) #error "wrong value for __cpp_implicit_move" #endif diff --git a/clang/test/Lexer/has_extension_cxx.cpp b/clang/test/Lexer/has_extension_cxx.cpp index 1ae6a446b300a..7941997428aca 100644 --- a/clang/test/Lexer/has_extension_cxx.cpp +++ b/clang/test/Lexer/has_extension_cxx.cpp @@ -1,5 +1,7 @@ // RUN: %clang_cc1 -std=c++98 -E %s -o - | FileCheck %s // RUN: %clang_cc1 -std=c++11 -E %s -o - | FileCheck %s --check-prefix=CHECK11 +// RUN: %clang_cc1 -std=c++20 -E %s -o - | FileCheck %s --check-prefix=CHECK20 +// RUN: %clang_cc1 -std=c++23 -E %s -o - | FileCheck %s --check-prefix=CHECK23 // CHECK: c_static_assert #if __has_extension(c_static_assert) @@ -76,3 +78,17 @@ int has_variable_templates(); #if 
__has_extension(cxx_init_captures)
int has_init_captures();
#endif
+
+
+// CHECK11-NOT: has_generalized_nttp
+// CHECK20: has_generalized_nttp
+#if __has_extension(cxx_generalized_nttp)
+int has_generalized_nttp();
+#endif
+
+
+// CHECK20-NOT: has_explicit_this_parameter
+// CHECK23: has_explicit_this_parameter
+#if __has_extension(cxx_explicit_this_parameter)
+int has_explicit_this_parameter();
+#endif

From 263efb044add93b946ca4a96a8f1699b3eb9ae73 Mon Sep 17 00:00:00 2001
From: Stefan Gränitz
Date: Mon, 22 Jan 2024 21:24:57 +0100
Subject: [PATCH 476/843] [lli] Revisit Orc debug output tests (#76822)

Integrate in-memory debug-info dumps into the `--orc-lazy-debug`
command-line option instead of exposing built-in functions to be called
from JITed code. This reduces the overall amount of code (removing
`ExecutionUtils.cpp`) and seems cleaner anyway.

All existing items of `OrcDumpKind` work at the IR level and run in the
IR-transform step of the JIT. The newly added `DumpDebugDescriptor` and
`DumpDebugObjects` must run after debug registration and thus are
deferred to the Object-transform step of the JIT. This separation is the
major side effect of the patch.

---
 ...tor-elf-minimal.ll => debug-descriptor.ll} | 30 ++--
 .../OrcLazy/debug-objects-elf-minimal.ll | 82 -----------
 .../ExecutionEngine/OrcLazy/debug-objects.ll | 62 ++++++++
 llvm/tools/lli/CMakeLists.txt | 1 -
 llvm/tools/lli/ExecutionUtils.cpp | 122 ----------------
 llvm/tools/lli/ExecutionUtils.h | 60 --------
 llvm/tools/lli/lli.cpp | 135 +++++++++++++-----
 7 files changed, 170 insertions(+), 322 deletions(-)
 rename llvm/test/ExecutionEngine/OrcLazy/{debug-descriptor-elf-minimal.ll => debug-descriptor.ll} (53%)
 delete mode 100644 llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll
 create mode 100644 llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll
 delete mode 100644 llvm/tools/lli/ExecutionUtils.cpp
 delete mode 100644 llvm/tools/lli/ExecutionUtils.h

diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor.ll
similarity index 53%
rename from llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll
rename to llvm/test/ExecutionEngine/OrcLazy/debug-descriptor.ll
index 9bac3897864f7..bf7f13fb5439f 100644
--- a/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll
+++ b/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor.ll
@@ -1,29 +1,17 @@
-; REQUIRES: native && x86_64-linux
+; REQUIRES: native && target-x86_64

-; RUN: lli --jit-linker=rtdyld \
-; RUN: --generate=__dump_jit_debug_descriptor %s | FileCheck %s
+; RUN: lli --jit-linker=rtdyld --orc-lazy-debug=jit-debug-descriptor %s 2>&1 | FileCheck %s
+; RUN: lli --jit-linker=jitlink --orc-lazy-debug=jit-debug-descriptor %s 2>&1 | FileCheck %s
 ;
-; RUN: lli --jit-linker=jitlink \
-; RUN: --generate=__dump_jit_debug_descriptor %s | FileCheck %s
+; Initial entry should be empty:
+; CHECK: jit_debug_descriptor 0x0000000000000000
 ;
-; CHECK: Reading __jit_debug_descriptor at 0x{{.*}}
-; CHECK: Version: 1
-; CHECK: Action: JIT_REGISTER_FN
-; CHECK: Entry Symbol File Size Previous Entry
-; CHECK: [ 0] 0x{{.*}} 0x{{.*}} {{.*}} 0x0000000000000000
-
-target triple = "x86_64-unknown-unknown-elf"
-
-; Built-in symbol provided by the JIT
-declare void @__dump_jit_debug_descriptor(ptr)
-
-; Host-process symbol from the GDB JIT interface
-@__jit_debug_descriptor = external global i8, align 1
+; After adding the module it must not be empty anymore:
+; CHECK: jit_debug_descriptor 0x
+;
CHECK-NOT: 000000000000000 +; CHECK-SAME: {{[048c]}} define i32 @main() !dbg !9 { - %1 = alloca i32, align 4 - store i32 0, ptr %1, align 4 - call void @__dump_jit_debug_descriptor(ptr @__jit_debug_descriptor), !dbg !13 ret i32 0, !dbg !14 } diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll deleted file mode 100644 index 31fe730b74036..0000000000000 --- a/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll +++ /dev/null @@ -1,82 +0,0 @@ -; REQUIRES: native && x86_64-linux - -; In-memory debug-object contains some basic DWARF -; -; RUN: lli --jit-linker=rtdyld \ -; RUN: --generate=__dump_jit_debug_objects %s | llvm-dwarfdump --diff - | FileCheck %s -; -; RUN: lli --jit-linker=jitlink \ -; RUN: --generate=__dump_jit_debug_objects %s | llvm-dwarfdump --diff - | FileCheck %s -; -; CHECK: -: file format elf64-x86-64 -; CHECK: .debug_info contents: -; CHECK: 0x00000000: Compile Unit: length = 0x00000047, format = DWARF32, version = 0x0004, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x0000004b) -; CHECK: DW_TAG_compile_unit -; CHECK: DW_AT_producer ("compiler version") -; CHECK: DW_AT_language (DW_LANG_C99) -; CHECK: DW_AT_name ("source-file.c") -; CHECK: DW_AT_stmt_list () -; CHECK: DW_AT_comp_dir ("/workspace") -; CHECK: DW_AT_low_pc () -; CHECK: DW_AT_high_pc () -; CHECK: DW_TAG_subprogram -; CHECK: DW_AT_low_pc () -; CHECK: DW_AT_high_pc () -; CHECK: DW_AT_frame_base (DW_OP_reg7 RSP) -; CHECK: DW_AT_name ("main") -; CHECK: DW_AT_decl_file ("/workspace/source-file.c") -; CHECK: DW_AT_decl_line (4) -; CHECK: DW_AT_type ("int") -; CHECK: DW_AT_external (true) -; CHECK: DW_TAG_base_type -; CHECK: DW_AT_name ("int") -; CHECK: DW_AT_encoding (DW_ATE_signed) -; CHECK: DW_AT_byte_size (0x04) -; CHECK: NULL - -; Text section of the in-memory debug-object has a non-null load-address -; -; RUN: lli --jit-linker=rtdyld \ -; RUN: --generate=__dump_jit_debug_objects %s | llvm-objdump --section-headers - | \ -; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s -; -; RUN: lli --jit-linker=jitlink \ -; RUN: --generate=__dump_jit_debug_objects %s | llvm-objdump --section-headers - | \ -; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s -; -; CHECK_LOAD_ADDR-NOT: {{[0-9]*}} .ltext {{.*}} 0000000000000000 TEXT - -target triple = "x86_64-unknown-unknown-elf" - -; Built-in symbol provided by the JIT -declare void @__dump_jit_debug_objects(ptr) - -; Host-process symbol from the GDB JIT interface -@__jit_debug_descriptor = external global i8, align 1 - -define i32 @main() !dbg !9 { - %1 = alloca i32, align 4 - store i32 0, ptr %1, align 4 - call void @__dump_jit_debug_objects(ptr @__jit_debug_descriptor), !dbg !13 - ret i32 0, !dbg !14 -} - -!llvm.module.flags = !{!0, !1, !2, !3, !4} -!llvm.dbg.cu = !{!5} -!llvm.ident = !{!8} - -!0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 6]} -!1 = !{i32 7, !"Dwarf Version", i32 4} -!2 = !{i32 2, !"Debug Info Version", i32 3} -!3 = !{i32 1, !"wchar_size", i32 4} -!4 = !{i32 7, !"PIC Level", i32 2} -!5 = distinct !DICompileUnit(language: DW_LANG_C99, file: !6, producer: "compiler version", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !7, nameTableKind: None) -!6 = !DIFile(filename: "source-file.c", directory: "/workspace") -!7 = !{} -!8 = !{!"compiler version"} -!9 = distinct !DISubprogram(name: "main", scope: !6, file: !6, line: 4, type: !10, scopeLine: 4, spFlags: DISPFlagDefinition, unit: !5, retainedNodes: !7) -!10 = 
!DISubroutineType(types: !11) -!11 = !{!12} -!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!13 = !DILocation(line: 5, column: 3, scope: !9) -!14 = !DILocation(line: 6, column: 3, scope: !9) diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll new file mode 100644 index 0000000000000..1f48bd9a2318a --- /dev/null +++ b/llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll @@ -0,0 +1,62 @@ +; REQUIRES: native && x86_64-linux + +; In-memory debug-objects contain DWARF +; +; RUN: lli --jit-linker=rtdyld --orc-lazy-debug=jit-debug-objects %s | llvm-dwarfdump --diff - | FileCheck %s +; RUN: lli --jit-linker=jitlink --orc-lazy-debug=jit-debug-objects %s | llvm-dwarfdump --diff - | FileCheck %s +; +; CHECK: -: file format elf64-x86-64 +; TODO: Synthesized Mach-O objects error out with: +; truncated or malformed object (offset field of section 8 in +; LC_SEGMENT_64 command 0 extends past the end of the file) +; +; CHECK: .debug_info contents: +; CHECK: format = DWARF32 +; CHECK: DW_TAG_compile_unit +; CHECK: DW_AT_producer ("clang version 18.0.0git") +; CHECK: DW_AT_language (DW_LANG_C11) +; CHECK: DW_AT_name ("source-file.c") +; CHECK: DW_AT_comp_dir ("/workspace") +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_frame_base (DW_OP_reg7 RSP) +; CHECK: DW_AT_name ("main") +; CHECK: DW_AT_decl_file ("/workspace/source-file.c") +; CHECK: DW_AT_decl_line (1) +; CHECK: DW_AT_type ("int") +; CHECK: DW_AT_external (true) +; CHECK: DW_TAG_base_type +; CHECK: DW_AT_name ("int") +; CHECK: DW_AT_encoding (DW_ATE_signed) +; CHECK: DW_AT_byte_size (0x04) +; CHECK: NULL + +; Text section of the in-memory debug-objects have non-null load-address +; +; RUN: lli --jit-linker=rtdyld --orc-lazy-debug=jit-debug-objects %s | \ +; RUN: llvm-objdump --section-headers - | \ +; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s +; RUN: lli --jit-linker=jitlink --orc-lazy-debug=jit-debug-objects %s | \ +; RUN: llvm-objdump --section-headers - | \ +; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s +; +; CHECK_LOAD_ADDR: .text +; CHECK_LOAD_ADDR-NOT: 0000000000000000 +; CHECK_LOAD_ADDR-SAME: TEXT + +define i32 @main() !dbg !3 { +entry: + ret i32 0, !dbg !8 +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!2} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !DIFile(filename: "source-file.c", directory: "/workspace") +!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 18.0.0git", emissionKind: FullDebug) +!3 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !4, scopeLine: 1, unit: !2, retainedNodes: !7) +!4 = !DISubroutineType(types: !5) +!5 = !{!6} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{} +!8 = !DILocation(line: 1, column: 14, scope: !3) diff --git a/llvm/tools/lli/CMakeLists.txt b/llvm/tools/lli/CMakeLists.txt index 315de28e12b57..e3fca225a2275 100644 --- a/llvm/tools/lli/CMakeLists.txt +++ b/llvm/tools/lli/CMakeLists.txt @@ -53,7 +53,6 @@ endif( LLVM_USE_PERF ) add_llvm_tool(lli lli.cpp - ExecutionUtils.cpp DEPENDS intrinsics_gen diff --git a/llvm/tools/lli/ExecutionUtils.cpp b/llvm/tools/lli/ExecutionUtils.cpp deleted file mode 100644 index b6cc3bb174d3c..0000000000000 --- a/llvm/tools/lli/ExecutionUtils.cpp +++ /dev/null @@ -1,122 +0,0 @@ -//===---- ExecutionUtils.cpp - Utilities for executing functions in lli ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
-// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ExecutionUtils.h" - -#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/raw_ostream.h" - -#include -#include - -namespace llvm { - -template static void outsv(const char *Fmt, Ts &&...Vals) { - outs() << formatv(Fmt, Vals...); -} - -static const char *actionFlagToStr(uint32_t ActionFlag) { - switch (ActionFlag) { - case JIT_NOACTION: - return "JIT_NOACTION"; - case JIT_REGISTER_FN: - return "JIT_REGISTER_FN"; - case JIT_UNREGISTER_FN: - return "JIT_UNREGISTER_FN"; - } - return ""; -} - -// Declarations follow the GDB JIT interface (version 1, 2009) and must match -// those of the DYLD used for testing. -// -// Sample output: -// -// Reading __jit_debug_descriptor at 0x0000000000404048 -// -// Version: 0 -// Action: JIT_REGISTER_FN -// -// Entry Symbol File Size Previous Entry -// [ 0] 0x0000000000451290 0x0000000000002000 200 0x0000000000000000 -// [ 1] 0x0000000000451260 0x0000000000001000 100 0x0000000000451290 -// ... -// -static void dumpDebugDescriptor(void *Addr) { - outsv("Reading __jit_debug_descriptor at {0}\n\n", Addr); - - jit_descriptor *Descriptor = reinterpret_cast(Addr); - outsv("Version: {0}\n", Descriptor->version); - outsv("Action: {0}\n\n", actionFlagToStr(Descriptor->action_flag)); - outsv("{0,11} {1,24} {2,15} {3,14}\n", "Entry", "Symbol File", "Size", - "Previous Entry"); - - unsigned Idx = 0; - for (auto *Entry = Descriptor->first_entry; Entry; Entry = Entry->next_entry) - outsv("[{0,2}] {1:X16} {2:X16} {3,8:D} {4}\n", Idx++, Entry, - reinterpret_cast(Entry->symfile_addr), - Entry->symfile_size, Entry->prev_entry); -} - -static LLIBuiltinFunctionGenerator *Generator = nullptr; - -static void dumpDebugObjects(void *Addr) { - jit_descriptor *Descriptor = reinterpret_cast(Addr); - for (auto *Entry = Descriptor->first_entry; Entry; Entry = Entry->next_entry) - Generator->appendDebugObject(Entry->symfile_addr, Entry->symfile_size); -} - -LLIBuiltinFunctionGenerator::LLIBuiltinFunctionGenerator( - std::vector Enabled, orc::MangleAndInterner &Mangle) - : TestOut(nullptr) { - Generator = this; - for (BuiltinFunctionKind F : Enabled) { - switch (F) { - case BuiltinFunctionKind::DumpDebugDescriptor: - expose(Mangle("__dump_jit_debug_descriptor"), &dumpDebugDescriptor); - break; - case BuiltinFunctionKind::DumpDebugObjects: - expose(Mangle("__dump_jit_debug_objects"), &dumpDebugObjects); - TestOut = createToolOutput(); - break; - } - } -} - -Error LLIBuiltinFunctionGenerator::tryToGenerate( - orc::LookupState &LS, orc::LookupKind K, orc::JITDylib &JD, - orc::JITDylibLookupFlags JDLookupFlags, - const orc::SymbolLookupSet &Symbols) { - orc::SymbolMap NewSymbols; - for (const auto &NameFlags : Symbols) { - auto It = BuiltinFunctions.find(NameFlags.first); - if (It != BuiltinFunctions.end()) - NewSymbols.insert(*It); - } - - if (NewSymbols.empty()) - return Error::success(); - - return JD.define(absoluteSymbols(std::move(NewSymbols))); -} - -// static -std::unique_ptr -LLIBuiltinFunctionGenerator::createToolOutput() { - std::error_code EC; - auto TestOut = std::make_unique("-", EC, sys::fs::OF_None); - if (EC) { - errs() << "Error creating tool output file: " << EC.message() << '\n'; - exit(1); - } - return TestOut; -} - -} // namespace llvm diff 
--git a/llvm/tools/lli/ExecutionUtils.h b/llvm/tools/lli/ExecutionUtils.h deleted file mode 100644 index 6bf9cd58e031b..0000000000000 --- a/llvm/tools/lli/ExecutionUtils.h +++ /dev/null @@ -1,60 +0,0 @@ -//===- ExecutionUtils.h - Utilities for executing code in lli ---*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Contains utilities for executing code in lli. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLI_EXECUTIONUTILS_H -#define LLVM_TOOLS_LLI_EXECUTIONUTILS_H - -#include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/Orc/Core.h" -#include "llvm/ExecutionEngine/Orc/Mangling.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/ToolOutputFile.h" - -#include -#include - -namespace llvm { - -enum class BuiltinFunctionKind { - DumpDebugDescriptor, - DumpDebugObjects, -}; - -// Utility class to expose symbols for special-purpose functions to the JIT. -class LLIBuiltinFunctionGenerator : public orc::DefinitionGenerator { -public: - LLIBuiltinFunctionGenerator(std::vector Enabled, - orc::MangleAndInterner &Mangle); - - Error tryToGenerate(orc::LookupState &LS, orc::LookupKind K, - orc::JITDylib &JD, orc::JITDylibLookupFlags JDLookupFlags, - const orc::SymbolLookupSet &Symbols) override; - - void appendDebugObject(const char *Addr, size_t Size) { - TestOut->os().write(Addr, Size); - } - -private: - orc::SymbolMap BuiltinFunctions; - std::unique_ptr TestOut; - - template void expose(orc::SymbolStringPtr Name, T *Handler) { - BuiltinFunctions[Name] = {orc::ExecutorAddr::fromPtr(Handler), - JITSymbolFlags::Exported}; - } - - static std::unique_ptr createToolOutput(); -}; - -} // end namespace llvm - -#endif // LLVM_TOOLS_LLI_EXECUTIONUTILS_H diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp index 8a7ea2d3d0c58..15acf6d38dba6 100644 --- a/llvm/tools/lli/lli.cpp +++ b/llvm/tools/lli/lli.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#include "ExecutionUtils.h" #include "ForwardingMemoryManager.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -33,6 +32,7 @@ #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h" #include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" @@ -62,6 +62,7 @@ #include "llvm/Support/Program.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" @@ -254,35 +255,29 @@ namespace { NoDump, DumpFuncsToStdOut, DumpModsToStdOut, - DumpModsToDisk + DumpModsToDisk, + DumpDebugDescriptor, + DumpDebugObjects, }; cl::opt OrcDumpKind( "orc-lazy-debug", cl::desc("Debug dumping for the orc-lazy JIT."), cl::init(DumpKind::NoDump), - cl::values(clEnumValN(DumpKind::NoDump, "no-dump", - "Don't dump anything."), - clEnumValN(DumpKind::DumpFuncsToStdOut, "funcs-to-stdout", - "Dump function 
names to stdout."), - clEnumValN(DumpKind::DumpModsToStdOut, "mods-to-stdout", - "Dump modules to stdout."), - clEnumValN(DumpKind::DumpModsToDisk, "mods-to-disk", - "Dump modules to the current " - "working directory. (WARNING: " - "will overwrite existing files).")), - cl::Hidden); - - cl::list GenerateBuiltinFunctions( - "generate", - cl::desc("Provide built-in functions for access by JITed code " - "(jit-kind=orc-lazy only)"), - cl::values(clEnumValN(BuiltinFunctionKind::DumpDebugDescriptor, - "__dump_jit_debug_descriptor", - "Dump __jit_debug_descriptor contents to stdout"), - clEnumValN(BuiltinFunctionKind::DumpDebugObjects, - "__dump_jit_debug_objects", - "Dump __jit_debug_descriptor in-memory debug " - "objects as tool output")), + cl::values( + clEnumValN(DumpKind::NoDump, "no-dump", "Don't dump anything."), + clEnumValN(DumpKind::DumpFuncsToStdOut, "funcs-to-stdout", + "Dump function names to stdout."), + clEnumValN(DumpKind::DumpModsToStdOut, "mods-to-stdout", + "Dump modules to stdout."), + clEnumValN(DumpKind::DumpModsToDisk, "mods-to-disk", + "Dump modules to the current " + "working directory. (WARNING: " + "will overwrite existing files)."), + clEnumValN(DumpKind::DumpDebugDescriptor, "jit-debug-descriptor", + "Dump __jit_debug_descriptor contents to stdout"), + clEnumValN(DumpKind::DumpDebugObjects, "jit-debug-objects", + "Dump __jit_debug_descriptor in-memory debug " + "objects as tool output")), cl::Hidden); ExitOnError ExitOnErr; @@ -756,9 +751,41 @@ int main(int argc, char **argv, char * const *envp) { return Result; } -static std::function createDebugDumper() { +// JITLink debug support plugins put information about JITed code in this GDB +// JIT Interface global from OrcTargetProcess. +extern "C" struct jit_descriptor __jit_debug_descriptor; + +static struct jit_code_entry * +findNextDebugDescriptorEntry(struct jit_code_entry *Latest) { + if (Latest == nullptr) + return __jit_debug_descriptor.first_entry; + if (Latest->next_entry) + return Latest->next_entry; + return nullptr; +} + +static ToolOutputFile &claimToolOutput() { + static std::unique_ptr ToolOutput = nullptr; + if (ToolOutput) { + WithColor::error(errs(), "lli") + << "Can not claim stdout for tool output twice\n"; + exit(1); + } + std::error_code EC; + ToolOutput = std::make_unique("-", EC, sys::fs::OF_None); + if (EC) { + WithColor::error(errs(), "lli") + << "Failed to create tool output file: " << EC.message() << "\n"; + exit(1); + } + return *ToolOutput; +} + +static std::function createIRDebugDumper() { switch (OrcDumpKind) { case DumpKind::NoDump: + case DumpKind::DumpDebugDescriptor: + case DumpKind::DumpDebugObjects: return [](Module &M) {}; case DumpKind::DumpFuncsToStdOut: @@ -800,6 +827,43 @@ static std::function createDebugDumper() { llvm_unreachable("Unknown DumpKind"); } +static std::function createObjDebugDumper() { + switch (OrcDumpKind) { + case DumpKind::NoDump: + case DumpKind::DumpFuncsToStdOut: + case DumpKind::DumpModsToStdOut: + case DumpKind::DumpModsToDisk: + return [](MemoryBuffer &) {}; + + case DumpKind::DumpDebugDescriptor: { + // Dump the empty descriptor at startup once + fprintf(stderr, "jit_debug_descriptor 0x%016" PRIx64 "\n", + pointerToJITTargetAddress(__jit_debug_descriptor.first_entry)); + return [](MemoryBuffer &) { + // Dump new entries as they appear + static struct jit_code_entry *Latest = nullptr; + while (auto *NewEntry = findNextDebugDescriptorEntry(Latest)) { + fprintf(stderr, "jit_debug_descriptor 0x%016" PRIx64 "\n", + pointerToJITTargetAddress(NewEntry)); + 
Latest = NewEntry; + } + }; + } + + case DumpKind::DumpDebugObjects: { + return [](MemoryBuffer &Obj) { + static struct jit_code_entry *Latest = nullptr; + static ToolOutputFile &ToolOutput = claimToolOutput(); + while (auto *NewEntry = findNextDebugDescriptorEntry(Latest)) { + ToolOutput.os().write(NewEntry->symfile_addr, NewEntry->symfile_size); + Latest = NewEntry; + } + }; + } + } + llvm_unreachable("Unknown DumpKind"); +} + Error loadDylibs() { for (const auto &Dylib : Dylibs) { std::string ErrMsg; @@ -1001,8 +1065,7 @@ int runOrcJIT(const char *ProgName) { if (PerModuleLazy) J->setPartitionFunction(orc::CompileOnDemandLayer::compileWholeModule); - auto Dump = createDebugDumper(); - + auto IRDump = createIRDebugDumper(); J->getIRTransformLayer().setTransform( [&](orc::ThreadSafeModule TSM, const orc::MaterializationResponsibility &R) { @@ -1011,18 +1074,18 @@ int runOrcJIT(const char *ProgName) { dbgs() << "Bad module: " << &M << "\n"; exit(1); } - Dump(M); + IRDump(M); }); return TSM; }); - if (GenerateBuiltinFunctions.size() > 0) { - // Add LLI builtins. - orc::MangleAndInterner Mangle(J->getExecutionSession(), J->getDataLayout()); - J->getMainJITDylib().addGenerator( - std::make_unique(GenerateBuiltinFunctions, - Mangle)); - } + auto ObjDump = createObjDebugDumper(); + J->getObjTransformLayer().setTransform( + [&](std::unique_ptr Obj) + -> Expected> { + ObjDump(*Obj); + return Obj; + }); // If this is a Mingw or Cygwin executor then we need to alias __main to // orc_rt_int_void_return_0. From 2d373143ad69910c56bbc7161224d365813a95b0 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Mon, 22 Jan 2024 20:26:19 +0000 Subject: [PATCH 477/843] [gn build] Port 263efb044add --- llvm/utils/gn/secondary/llvm/tools/lli/BUILD.gn | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/llvm/utils/gn/secondary/llvm/tools/lli/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/lli/BUILD.gn index 19b02d0d526cb..f351ae4fb0cb5 100644 --- a/llvm/utils/gn/secondary/llvm/tools/lli/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/lli/BUILD.gn @@ -23,10 +23,7 @@ executable("lli") { "//llvm/lib/Transforms/Instrumentation", "//llvm/lib/Transforms/Utils", ] - sources = [ - "ExecutionUtils.cpp", - "lli.cpp", - ] + sources = [ "lli.cpp" ] if (host_os != "mac" && host_os != "win") { # Corresponds to export_executable_symbols() in cmake. ldflags = [ "-rdynamic" ] From b00aa1c77b0aade04adf89336c52d415a71c9477 Mon Sep 17 00:00:00 2001 From: gulfemsavrun Date: Mon, 22 Jan 2024 12:44:46 -0800 Subject: [PATCH 478/843] =?UTF-8?q?Revert=20"Reapply=20[hwasan]=20Update?= =?UTF-8?q?=20dbg.assign=20intrinsics=20in=20HWAsan=20pass=20=E2=80=A6=20(?= =?UTF-8?q?#79053)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …#78606" This reverts commit 76160718df7c1f31ff50a4964d749c2b9d83f9cf because it caused an assertion failure in emitDbgValue function in Codegen in Clang Linux toolchain builders for Fuchsia. 
https://logs.chromium.org/logs/fuchsia/buildbucket/cr-buildbucket/8758181086086431185/+/u/clang/build/stdout --- llvm/lib/IR/DebugInfo.cpp | 4 ++ .../Instrumentation/HWAddressSanitizer.cpp | 5 -- .../Transforms/Utils/MemoryTaggingSupport.cpp | 10 +-- .../AArch64/dbg-assign-tag-offset-mix-loc.ll | 71 ------------------- .../CodeGen/AArch64/dbg-assign-tag-offset.ll | 71 ------------------- .../declare-to-assign/hwasan.ll | 2 +- .../dbg-assign-tag-offset.ll | 59 --------------- 7 files changed, 8 insertions(+), 214 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll delete mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll delete mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 2e64d0db57b25..fcd3f77f8f6e2 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2200,6 +2200,10 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { if (F.hasFnAttribute(Attribute::OptimizeNone)) return /*Changed*/ false; + // FIXME: https://github.com/llvm/llvm-project/issues/76545 + if (F.hasFnAttribute(Attribute::SanitizeHWAddress)) + return /*Changed*/ false; + bool Changed = false; auto *DL = &F.getParent()->getDataLayout(); // Collect a map of {backing storage : dbg.declares} (currently "backing diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 633183dd8eac8..efb621cde9065 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1435,11 +1435,6 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, if (DDI->getVariableLocationOp(LocNo) == AI) DDI->setExpression(DIExpression::appendOpsToArg(DDI->getExpression(), NewOps, LocNo)); - if (auto *DAI = dyn_cast(DDI)) { - if (DAI->getAddress() == AI) - DAI->setAddressExpression( - DIExpression::prependOpcodes(DDI->getExpression(), NewOps)); - } } auto TagEnd = [&](Instruction *Node) { diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index d2efcde5d3803..f94047633022c 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -138,20 +138,16 @@ void StackInfoBuilder::visit(Instruction &Inst) { return; } if (auto *DVI = dyn_cast(&Inst)) { - auto AddIfInteresting = [&](Value *V) { + for (Value *V : DVI->location_ops()) { if (auto *AI = dyn_cast_or_null(V)) { if (!isInterestingAlloca(*AI)) - return; + continue; AllocaInfo &AInfo = Info.AllocasToInstrument[AI]; auto &DVIVec = AInfo.DbgVariableIntrinsics; if (DVIVec.empty() || DVIVec.back() != DVI) DVIVec.push_back(DVI); } - }; - for (Value *V : DVI->location_ops()) - AddIfInteresting(V); - if (auto *DAI = dyn_cast(DVI)) - AddIfInteresting(DAI->getAddress()); + } } Instruction *ExitUntag = getUntagLocationIfFunctionExit(Inst); if (ExitUntag) diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll deleted file mode 100644 index ef0dd46cb45c7..0000000000000 --- a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s - -;; Similar to dbg-assign-tag-offset.ll except the variable 'x' has been removed -;; and 'y' has an implicit location range as well as 
stack location range -;; (according to the hand-modified debug info -- see the dbg.value). - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-android24" - -; CHECK: DW_TAG_variable -; CHECK-NOT: DW_TAG -; CHECK: DW_AT_LLVM_tag_offset (0x80) -; CHECK-NEXT: DW_AT_name ("y") - -define dso_local void @f() !dbg !14 { - %1 = alloca i32, align 4, !DIAssignID !31 - %2 = alloca i32, align 4, !DIAssignID !32 - call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - call void @llvm.dbg.value(metadata i32 2, metadata !20, metadata !DIExpression()), !dbg !22 - call void @use(ptr null), !dbg !28 - store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 - call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - call void @use(ptr nonnull %1), !dbg !28 - call void @use(ptr nonnull %2), !dbg !29 - ret void, !dbg !30 -} - -declare !dbg !5 void @use(ptr) - -declare void @llvm.dbg.value(metadata, metadata, metadata) -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} -!llvm.ident = !{!13} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) -!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") -!2 = !{} -!3 = !{!4, !5} -!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) -!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) -!6 = !DISubroutineType(types: !7) -!7 = !{null, !4} -!8 = !{i32 7, !"Dwarf Version", i32 4} -!9 = !{i32 2, !"Debug Info Version", i32 3} -!10 = !{i32 1, !"wchar_size", i32 4} -!11 = !{i32 7, !"PIC Level", i32 2} -!12 = !{i32 7, !"PIE Level", i32 2} -!13 = !{!"clang version 10.0.0"} -!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) -!15 = !DISubroutineType(types: !16) -!16 = !{null} -!17 = !{!18, !20} -!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) -!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) -!21 = !DILocation(line: 5, column: 3, scope: !14) -!22 = !DILocation(line: 0, scope: !14) -!23 = !DILocation(line: 5, column: 10, scope: !14) -!24 = !{!25, !25, i64 0} -!25 = !{!"int", !26, i64 0} -!26 = !{!"omnipotent char", !27, i64 0} -!27 = !{!"Simple C++ TBAA"} -!28 = !DILocation(line: 6, column: 3, scope: !14) -!29 = !DILocation(line: 7, column: 3, scope: !14) -!30 = !DILocation(line: 8, column: 1, scope: !14) -!31 = distinct !DIAssignID() -!32 = distinct !DIAssignID() -!33 = distinct !DIAssignID() -!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll deleted file mode 100644 index a587f93d14d70..0000000000000 --- a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll +++ 
/dev/null @@ -1,71 +0,0 @@ -; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-android24" - -; CHECK: DW_TAG_variable -; CHECK-NOT: DW_TAG -; CHECK: DW_AT_LLVM_tag_offset (0x00) -; CHECK-NEXT: DW_AT_name ("x") - -; CHECK: DW_TAG_variable -; CHECK-NOT: DW_TAG -; CHECK: DW_AT_LLVM_tag_offset (0x80) -; CHECK-NEXT: DW_AT_name ("y") - -define dso_local void @f() !dbg !14 { - %1 = alloca i32, align 4, !DIAssignID !31 - call void @llvm.dbg.assign(metadata i1 undef, metadata !18, metadata !DIExpression(), metadata !31, metadata ptr %1, metadata !DIExpression(DW_OP_LLVM_tag_offset, 0)), !dbg !22 - %2 = alloca i32, align 4, !DIAssignID !32 - call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 - call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - call void @use(ptr nonnull %1), !dbg !28 - call void @use(ptr nonnull %2), !dbg !29 - ret void, !dbg !30 -} - -declare !dbg !5 void @use(ptr) - -declare void @llvm.dbg.value(metadata, metadata, metadata) -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} -!llvm.ident = !{!13} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) -!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") -!2 = !{} -!3 = !{!4, !5} -!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) -!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) -!6 = !DISubroutineType(types: !7) -!7 = !{null, !4} -!8 = !{i32 7, !"Dwarf Version", i32 4} -!9 = !{i32 2, !"Debug Info Version", i32 3} -!10 = !{i32 1, !"wchar_size", i32 4} -!11 = !{i32 7, !"PIC Level", i32 2} -!12 = !{i32 7, !"PIE Level", i32 2} -!13 = !{!"clang version 10.0.0"} -!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) -!15 = !DISubroutineType(types: !16) -!16 = !{null} -!17 = !{!18, !20} -!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) -!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) -!21 = !DILocation(line: 5, column: 3, scope: !14) -!22 = !DILocation(line: 0, scope: !14) -!23 = !DILocation(line: 5, column: 10, scope: !14) -!24 = !{!25, !25, i64 0} -!25 = !{!"int", !26, i64 0} -!26 = !{!"omnipotent char", !27, i64 0} -!27 = !{!"Simple C++ TBAA"} -!28 = !DILocation(line: 6, column: 3, scope: !14) -!29 = !DILocation(line: 7, column: 3, scope: !14) -!30 = !DILocation(line: 8, column: 1, scope: !14) -!31 = distinct !DIAssignID() -!32 = distinct !DIAssignID() -!33 = distinct !DIAssignID() -!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff --git 
a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll index 6c9366609cba2..c4b209de77017 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll @@ -1,6 +1,6 @@ ; RUN: opt %s -S -passes=declare-to-assign -o - | FileCheck %s -; CHECK: call void @llvm.dbg.assign +; CHECK: call void @llvm.dbg.declare define dso_local void @f() sanitize_hwaddress !dbg !9 { entry: diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll deleted file mode 100644 index 1248c0bc586ab..0000000000000 --- a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll +++ /dev/null @@ -1,59 +0,0 @@ -; RUN: opt -passes=hwasan -S -o - %s | FileCheck %s - -source_filename = "test.ll" -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64--linux-android" - -declare void @g(ptr, ptr, ptr, ptr, ptr, ptr) - -; Function Attrs: sanitize_hwaddress -define void @f() #0 !dbg !7 { -entry: - %nodebug0 = alloca ptr, align 8 - %nodebug1 = alloca ptr, align 8 - %nodebug2 = alloca ptr, align 8 - %nodebug3 = alloca ptr, align 8 - ; CHECK: %a = alloca{{.*}} !DIAssignID ![[ID1:[0-9]+]] - %a = alloca ptr, align 8, !DIAssignID !13 - ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID1]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 32) - call void @llvm.dbg.assign(metadata i1 undef, metadata !14, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !15 - ; CHECK: %b = alloca{{.*}} !DIAssignID ![[ID2:[0-9]+]] - %b = alloca ptr, align 8, !DIAssignID !16 - ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID2]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 96) - call void @llvm.dbg.assign(metadata i1 undef, metadata !17, metadata !DIExpression(), metadata !16, metadata ptr %b, metadata !DIExpression()), !dbg !15 - call void @g(ptr %nodebug0, ptr %nodebug1, ptr %nodebug2, ptr %nodebug3, ptr %a, ptr %b) - ret void, !dbg !18 -} - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) #1 - -attributes #0 = { sanitize_hwaddress } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) -!1 = !DIFile(filename: "x.c", directory: "/") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 7, !"debug-info-assignment-tracking", i1 true} -!6 = !{!"clang"} -!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) -!8 = !DISubroutineType(types: !9) -!9 = !{null, !10} -!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) -!11 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !12) -!12 = !DIBasicType(name: "char", size: 8, encoding: 
DW_ATE_signed_char) -!13 = distinct !DIAssignID() -!14 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 1, type: !10) -!15 = !DILocation(line: 0, scope: !7) -!16 = distinct !DIAssignID() -!17 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 1, type: !10) -!18 = !DILocation(line: 1, column: 37, scope: !7) From 364a5b5b850a4f3e564da4bde080567a5d1e14d2 Mon Sep 17 00:00:00 2001 From: Zahira Ammarguellat Date: Mon, 22 Jan 2024 12:50:24 -0800 Subject: [PATCH 479/843] Fix a bug in implementation of Smith's algorithm used in complex div. (#78330) This patch fixes a bug in Smith's algorithm (thanks to @andykaylor who detected it) and makes sure that last option in command line rules. --- clang/include/clang/Basic/LangOptions.def | 2 +- clang/include/clang/Basic/LangOptions.h | 2 +- clang/lib/CodeGen/CGExprComplex.cpp | 10 +++- clang/lib/Driver/ToolChains/Clang.cpp | 36 +++++++++----- clang/test/CodeGen/cx-complex-range.c | 16 ++++++ clang/test/CodeGen/smiths-complex-div.c | 59 +++++++++++++++++++++++ clang/test/Driver/range.c | 7 +++ 7 files changed, 116 insertions(+), 16 deletions(-) create mode 100644 clang/test/CodeGen/smiths-complex-div.c diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index c0ab6b97dd8db..47cbd3d30f847 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -220,7 +220,7 @@ BENIGN_LANGOPT(NoSignedZero , 1, 0, "Permit Floating Point optimization wit BENIGN_LANGOPT(AllowRecip , 1, 0, "Permit Floating Point reciprocal") BENIGN_LANGOPT(ApproxFunc , 1, 0, "Permit Floating Point approximation") -ENUM_LANGOPT(ComplexRange, ComplexRangeKind, 2, CX_Full, "Enable use of range reduction for complex arithmetics.") +ENUM_LANGOPT(ComplexRange, ComplexRangeKind, 2, CX_None, "Enable use of range reduction for complex arithmetics.") BENIGN_LANGOPT(ObjCGCBitmapPrint , 1, 0, "printing of GC's bitmap layout for __weak/__strong ivars") diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 79947431e4e2c..c1cc5548ef10c 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -414,7 +414,7 @@ class LangOptions : public LangOptionsBase { IncompleteOnly = 3, }; - enum ComplexRangeKind { CX_Full, CX_Limited, CX_Fortran }; + enum ComplexRangeKind { CX_Full, CX_Limited, CX_Fortran, CX_None }; public: /// The used language standard. diff --git a/clang/lib/CodeGen/CGExprComplex.cpp b/clang/lib/CodeGen/CGExprComplex.cpp index e532794b71bdb..839fe16cd7725 100644 --- a/clang/lib/CodeGen/CGExprComplex.cpp +++ b/clang/lib/CodeGen/CGExprComplex.cpp @@ -892,6 +892,9 @@ ComplexPairTy ComplexExprEmitter::EmitRangeReductionDiv(llvm::Value *LHSr, llvm::Value *LHSi, llvm::Value *RHSr, llvm::Value *RHSi) { + // FIXME: This could eventually be replaced by an LLVM intrinsic to + // avoid this long IR sequence. 
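+  //
+  // For reference, the sequence below is Smith's algorithm: writing the
+  // division as in the next comment, it branches on whether |c| or |d| is
+  // larger so that the ratio r stays bounded by 1 in magnitude:
+  //   if |c| >= |d|:  r = d/c, tmp = c + d*r,
+  //                   e = (a + b*r) / tmp, f = (b - a*r) / tmp
+  //   otherwise:      r = c/d, tmp = d + c*r,
+  //                   e = (a*r + b) / tmp, f = (b*r - a) / tmp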
+ // (a + ib) / (c + id) = (e + if) llvm::Value *FAbsRHSr = EmitllvmFAbs(CGF, RHSr); // |c| llvm::Value *FAbsRHSi = EmitllvmFAbs(CGF, RHSi); // |d| @@ -936,7 +939,7 @@ ComplexPairTy ComplexExprEmitter::EmitRangeReductionDiv(llvm::Value *LHSr, llvm::Value *RC = Builder.CreateFMul(CdD, RHSr); // rc llvm::Value *DpRC = Builder.CreateFAdd(RHSi, RC); // tmp=d+rc - llvm::Value *T7 = Builder.CreateFMul(LHSr, RC); // ar + llvm::Value *T7 = Builder.CreateFMul(LHSr, CdD); // ar llvm::Value *T8 = Builder.CreateFAdd(T7, LHSi); // ar+b llvm::Value *DSTFr = Builder.CreateFDiv(T8, DpRC); // (ar+b)/tmp @@ -978,7 +981,10 @@ ComplexPairTy ComplexExprEmitter::EmitBinDiv(const BinOpInfo &Op) { return EmitRangeReductionDiv(LHSr, LHSi, RHSr, RHSi); else if (Op.FPFeatures.getComplexRange() == LangOptions::CX_Limited) return EmitAlgebraicDiv(LHSr, LHSi, RHSr, RHSi); - else if (!CGF.getLangOpts().FastMath) { + else if (!CGF.getLangOpts().FastMath || + // '-ffast-math' is used in the command line but followed by an + // '-fno-cx-limited-range'. + Op.FPFeatures.getComplexRange() == LangOptions::CX_Full) { LHSi = OrigLHSi; // If we have a complex operand on the RHS and FastMath is not allowed, we // delegate to a libcall to handle all of the complexities and minimize diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index f80f8e44de1c4..776a13b958893 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2712,9 +2712,22 @@ static void EmitComplexRangeDiag(const Driver &D, << EnumComplexRangeToStr(Range1) << EnumComplexRangeToStr(Range2); } -static std::string RenderComplexRangeOption(std::string Range) { +static std::string +RenderComplexRangeOption(LangOptions::ComplexRangeKind Range) { std::string ComplexRangeStr = "-complex-range="; - ComplexRangeStr += Range; + switch (Range) { + case LangOptions::ComplexRangeKind::CX_Full: + ComplexRangeStr += "full"; + break; + case LangOptions::ComplexRangeKind::CX_Limited: + ComplexRangeStr += "limited"; + break; + case LangOptions::ComplexRangeKind::CX_Fortran: + ComplexRangeStr += "fortran"; + break; + default: + assert("Unexpected range option"); + } return ComplexRangeStr; } @@ -2764,7 +2777,8 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, bool StrictFPModel = false; StringRef Float16ExcessPrecision = ""; StringRef BFloat16ExcessPrecision = ""; - LangOptions::ComplexRangeKind Range = LangOptions::ComplexRangeKind::CX_Full; + LangOptions::ComplexRangeKind Range = LangOptions::ComplexRangeKind::CX_None; + std::string ComplexRangeStr = ""; if (const Arg *A = Args.getLastArg(options::OPT_flimited_precision_EQ)) { CmdArgs.push_back("-mlimit-float-precision"); @@ -2780,23 +2794,19 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, case options::OPT_fcx_limited_range: { EmitComplexRangeDiag(D, Range, LangOptions::ComplexRangeKind::CX_Limited); Range = LangOptions::ComplexRangeKind::CX_Limited; - std::string ComplexRangeStr = RenderComplexRangeOption("limited"); - if (!ComplexRangeStr.empty()) - CmdArgs.push_back(Args.MakeArgString(ComplexRangeStr)); break; } case options::OPT_fno_cx_limited_range: + EmitComplexRangeDiag(D, Range, LangOptions::ComplexRangeKind::CX_Full); Range = LangOptions::ComplexRangeKind::CX_Full; break; case options::OPT_fcx_fortran_rules: { EmitComplexRangeDiag(D, Range, LangOptions::ComplexRangeKind::CX_Fortran); Range = LangOptions::ComplexRangeKind::CX_Fortran; - std::string ComplexRangeStr = 
RenderComplexRangeOption("fortran"); - if (!ComplexRangeStr.empty()) - CmdArgs.push_back(Args.MakeArgString(ComplexRangeStr)); break; } case options::OPT_fno_cx_fortran_rules: + EmitComplexRangeDiag(D, Range, LangOptions::ComplexRangeKind::CX_Full); Range = LangOptions::ComplexRangeKind::CX_Full; break; case options::OPT_ffp_model_EQ: { @@ -3068,9 +3078,7 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, SeenUnsafeMathModeOption = true; // ffast-math enables fortran rules for complex multiplication and // division. - std::string ComplexRangeStr = RenderComplexRangeOption("limited"); - if (!ComplexRangeStr.empty()) - CmdArgs.push_back(Args.MakeArgString(ComplexRangeStr)); + Range = LangOptions::ComplexRangeKind::CX_Limited; break; } case options::OPT_fno_fast_math: @@ -3227,6 +3235,10 @@ static void RenderFloatingPointOptions(const ToolChain &TC, const Driver &D, options::OPT_fstrict_float_cast_overflow, false)) CmdArgs.push_back("-fno-strict-float-cast-overflow"); + if (Range != LangOptions::ComplexRangeKind::CX_None) + ComplexRangeStr = RenderComplexRangeOption(Range); + if (!ComplexRangeStr.empty()) + CmdArgs.push_back(Args.MakeArgString(ComplexRangeStr)); if (Args.hasArg(options::OPT_fcx_limited_range)) CmdArgs.push_back("-fcx-limited-range"); if (Args.hasArg(options::OPT_fcx_fortran_rules)) diff --git a/clang/test/CodeGen/cx-complex-range.c b/clang/test/CodeGen/cx-complex-range.c index 8368fa611335c..2d8507c710f20 100644 --- a/clang/test/CodeGen/cx-complex-range.c +++ b/clang/test/CodeGen/cx-complex-range.c @@ -15,9 +15,25 @@ // RUN: -ffast-math -complex-range=limited -emit-llvm -o - %s \ // RUN: | FileCheck %s --check-prefix=LMTD-FAST +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu \ +// RUN: -ffast-math -complex-range=full -emit-llvm -o - %s \ +// RUN: | FileCheck %s --check-prefix=FULL + // RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown \ // RUN: -fno-cx-fortran-rules -o - | FileCheck %s --check-prefix=FULL +// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown \ +// RUN: -fcx-limited-range -fno-cx-limited-range -o - \ +// RUN: | FileCheck %s --check-prefix=FULL + +// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown \ +// RUN: -fno-cx-limited-range -fcx-limited-range -o - \ +// RUN: | FileCheck %s --check-prefix=FULL + +// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown \ +// RUN: -fno-cx-fortran-rules -fcx-fortran-rules -o - \ +// RUN: | FileCheck %s --check-prefix=FULL + _Complex float div(_Complex float a, _Complex float b) { // LABEL: define {{.*}} @div( // FULL: call {{.*}} @__divsc3 diff --git a/clang/test/CodeGen/smiths-complex-div.c b/clang/test/CodeGen/smiths-complex-div.c new file mode 100644 index 0000000000000..75775675c9238 --- /dev/null +++ b/clang/test/CodeGen/smiths-complex-div.c @@ -0,0 +1,59 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// RUN: %clang_cc1 %s -O0 -emit-llvm -triple x86_64-unknown-unknown \ +// RUN: -complex-range=fortran -o - | FileCheck %s --check-prefix=FRTRN + +// FRTRN-LABEL: define dso_local <2 x float> @div( +// FRTRN-SAME: <2 x float> noundef [[A_COERCE:%.*]], <2 x float> noundef [[B_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// FRTRN-NEXT: entry: +// FRTRN-NEXT: [[RETVAL:%.*]] = alloca { float, float }, align 4 +// FRTRN-NEXT: [[A:%.*]] = alloca { float, float }, align 4 +// FRTRN-NEXT: [[B:%.*]] = alloca { float, float }, align 4 +// FRTRN-NEXT: store <2 x float> [[A_COERCE]], ptr [[A]], 
align 4 +// FRTRN-NEXT: store <2 x float> [[B_COERCE]], ptr [[B]], align 4 +// FRTRN-NEXT: [[A_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 0 +// FRTRN-NEXT: [[A_REAL:%.*]] = load float, ptr [[A_REALP]], align 4 +// FRTRN-NEXT: [[A_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[A]], i32 0, i32 1 +// FRTRN-NEXT: [[A_IMAG:%.*]] = load float, ptr [[A_IMAGP]], align 4 +// FRTRN-NEXT: [[B_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 0 +// FRTRN-NEXT: [[B_REAL:%.*]] = load float, ptr [[B_REALP]], align 4 +// FRTRN-NEXT: [[B_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[B]], i32 0, i32 1 +// FRTRN-NEXT: [[B_IMAG:%.*]] = load float, ptr [[B_IMAGP]], align 4 +// FRTRN-NEXT: [[TMP0:%.*]] = call float @llvm.fabs.f32(float [[B_REAL]]) +// FRTRN-NEXT: [[TMP1:%.*]] = call float @llvm.fabs.f32(float [[B_IMAG]]) +// FRTRN-NEXT: [[ABS_CMP:%.*]] = fcmp ugt float [[TMP0]], [[TMP1]] +// FRTRN-NEXT: br i1 [[ABS_CMP]], label [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI:%.*]], label [[ABS_RHSR_LESS_THAN_ABS_RHSI:%.*]] +// FRTRN: abs_rhsr_greater_or_equal_abs_rhsi: +// FRTRN-NEXT: [[TMP2:%.*]] = fdiv float [[B_IMAG]], [[B_REAL]] +// FRTRN-NEXT: [[TMP3:%.*]] = fmul float [[TMP2]], [[B_IMAG]] +// FRTRN-NEXT: [[TMP4:%.*]] = fadd float [[B_REAL]], [[TMP3]] +// FRTRN-NEXT: [[TMP5:%.*]] = fmul float [[A_IMAG]], [[TMP2]] +// FRTRN-NEXT: [[TMP6:%.*]] = fadd float [[A_REAL]], [[TMP5]] +// FRTRN-NEXT: [[TMP7:%.*]] = fdiv float [[TMP6]], [[TMP4]] +// FRTRN-NEXT: [[TMP8:%.*]] = fmul float [[A_REAL]], [[TMP2]] +// FRTRN-NEXT: [[TMP9:%.*]] = fsub float [[A_IMAG]], [[TMP8]] +// FRTRN-NEXT: [[TMP10:%.*]] = fdiv float [[TMP9]], [[TMP4]] +// FRTRN-NEXT: br label [[COMPLEX_DIV:%.*]] +// FRTRN: abs_rhsr_less_than_abs_rhsi: +// FRTRN-NEXT: [[TMP11:%.*]] = fdiv float [[B_REAL]], [[B_IMAG]] +// FRTRN-NEXT: [[TMP12:%.*]] = fmul float [[TMP11]], [[B_REAL]] +// FRTRN-NEXT: [[TMP13:%.*]] = fadd float [[B_IMAG]], [[TMP12]] +// FRTRN-NEXT: [[TMP14:%.*]] = fmul float [[A_REAL]], [[TMP11]] +// FRTRN-NEXT: [[TMP15:%.*]] = fadd float [[TMP14]], [[A_IMAG]] +// FRTRN-NEXT: [[TMP16:%.*]] = fdiv float [[TMP15]], [[TMP13]] +// FRTRN-NEXT: [[TMP17:%.*]] = fmul float [[A_IMAG]], [[TMP11]] +// FRTRN-NEXT: [[TMP18:%.*]] = fsub float [[TMP17]], [[A_REAL]] +// FRTRN-NEXT: [[TMP19:%.*]] = fdiv float [[TMP18]], [[TMP13]] +// FRTRN-NEXT: br label [[COMPLEX_DIV]] +// FRTRN: complex_div: +// FRTRN-NEXT: [[TMP20:%.*]] = phi float [ [[TMP7]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP16]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// FRTRN-NEXT: [[TMP21:%.*]] = phi float [ [[TMP10]], [[ABS_RHSR_GREATER_OR_EQUAL_ABS_RHSI]] ], [ [[TMP19]], [[ABS_RHSR_LESS_THAN_ABS_RHSI]] ] +// FRTRN-NEXT: [[RETVAL_REALP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 0 +// FRTRN-NEXT: [[RETVAL_IMAGP:%.*]] = getelementptr inbounds { float, float }, ptr [[RETVAL]], i32 0, i32 1 +// FRTRN-NEXT: store float [[TMP20]], ptr [[RETVAL_REALP]], align 4 +// FRTRN-NEXT: store float [[TMP21]], ptr [[RETVAL_IMAGP]], align 4 +// FRTRN-NEXT: [[TMP22:%.*]] = load <2 x float>, ptr [[RETVAL]], align 4 +// FRTRN-NEXT: ret <2 x float> [[TMP22]] +// +_Complex float div(_Complex float a, _Complex float b) { + return a / b; +} diff --git a/clang/test/Driver/range.c b/clang/test/Driver/range.c index 8d456a997d696..045d9c7d3d802 100644 --- a/clang/test/Driver/range.c +++ b/clang/test/Driver/range.c @@ -6,6 +6,9 @@ // RUN: %clang -### -target x86_64 -fno-cx-limited-range -c %s 2>&1 \ // RUN: | 
FileCheck %s +// RUN: %clang -### -target x86_64 -fcx-limited-range -fno-cx-limited-range \ +// RUN: -c %s 2>&1 | FileCheck --check-prefix=FULL %s + // RUN: %clang -### -target x86_64 -fcx-fortran-rules -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=FRTRN %s @@ -29,7 +32,11 @@ // RUN: %clang -### -target x86_64 -fcx-limited-range -ffast-math -c %s 2>&1 \ // RUN: | FileCheck --check-prefix=LMTD %s +// RUN: %clang -### -target x86_64 -ffast-math -fno-cx-limited-range -c %s 2>&1 \ +// RUN: | FileCheck --check-prefix=FULL %s + // LMTD: -complex-range=limited +// FULL: -complex-range=full // LMTD-NOT: -complex-range=fortran // CHECK-NOT: -complex-range=limited // FRTRN: -complex-range=fortran From 9f8ccf50dde17adae1368a0cf41edadc8327aaf4 Mon Sep 17 00:00:00 2001 From: Julian Schmidt <44101708+5chmidti@users.noreply.github.com> Date: Mon, 22 Jan 2024 21:51:39 +0100 Subject: [PATCH 480/843] [clang-tidy] fix misc-const-correctnes false-positive for fold expressions (#78320) The check no longer emits a diagnostic for non-parameter-pack variables in C++17 fold expressions. The operator used is type-dependent because of the parameter pack and can therefore not be guaranteed to not mutate the variable. Fixes: #70323 --- clang-tools-extra/docs/ReleaseNotes.rst | 3 +- .../misc/const-correctness-templates.cpp | 28 +++++++++++++++++ clang/lib/Analysis/ExprMutationAnalyzer.cpp | 4 +++ .../Analysis/ExprMutationAnalyzerTest.cpp | 31 +++++++++++++++++++ 4 files changed, 65 insertions(+), 1 deletion(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 1015b6bd188d8..27f098a54327d 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -404,7 +404,8 @@ Changes in existing checks using pointer to member function. Additionally, the check no longer emits a diagnostic when a variable that is not type-dependent is an operand of a type-dependent binary operator. Improved performance of the check through - optimizations. + optimizations. The check no longer emits a diagnostic for non-parameter-pack + variables in C++17 fold expressions. - Improved :doc:`misc-include-cleaner ` check by adding option diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp index 794578ceeeba8..9da468128743e 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/misc/const-correctness-templates.cpp @@ -30,3 +30,31 @@ namespace gh57297{ struct Stream { }; template void f() { T t; Stream x; x << t; } } // namespace gh57297 + +namespace gh70323{ +// A fold expression may contain the checked variable as it's initializer. +// We don't know if the operator modifies that variable because the +// operator is type dependent due to the parameter pack. + +struct Stream {}; +template +void concatenate1(Args... args) +{ + Stream stream; + (stream << ... << args); +} + +template +void concatenate2(Args... args) +{ + Stream stream; + (args << ... << stream); +} + +template +void concatenate3(Args... 
args) +{ + Stream stream; + (..., (stream << args)); +} +} // namespace gh70323 diff --git a/clang/lib/Analysis/ExprMutationAnalyzer.cpp b/clang/lib/Analysis/ExprMutationAnalyzer.cpp index 9af818be0415f..bb042760d297a 100644 --- a/clang/lib/Analysis/ExprMutationAnalyzer.cpp +++ b/clang/lib/Analysis/ExprMutationAnalyzer.cpp @@ -343,6 +343,10 @@ const Stmt *ExprMutationAnalyzer::findDirectMutation(const Expr *Exp) { // in different instantiations of the template. binaryOperator(isTypeDependent(), hasEitherOperand(ignoringImpCasts(canResolveToExpr(Exp)))), + // A fold expression may contain `Exp` as it's initializer. + // We don't know if the operator modifies `Exp` because the + // operator is type dependent due to the parameter pack. + cxxFoldExpr(hasFoldInit(ignoringImpCasts(canResolveToExpr(Exp)))), // Within class templates and member functions the member expression might // not be resolved. In that case, the `callExpr` is considered to be a // modification. diff --git a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp index a94f857720b03..f58ce4aebcbfc 100644 --- a/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp +++ b/clang/unittests/Analysis/ExprMutationAnalyzerTest.cpp @@ -359,6 +359,37 @@ TEST(ExprMutationAnalyzerTest, DependentOperatorWithNonDependentOperand) { EXPECT_THAT(mutatedBy(Results, AST.get()), ElementsAre("x << t")); } +TEST(ExprMutationAnalyzerTest, FoldExpression) { + // gh70323 + // A fold expression may contain `Exp` as it's initializer. + // We don't know if the operator modifies `Exp` because the + // operator is type dependent due to the parameter pack. + auto AST = buildASTFromCodeWithArgs( + "struct Stream {};" + "template void concatenate(Args... args) " + "{ Stream x; (x << ... << args); }", + {"-fno-delayed-template-parsing"}); + auto Results = + match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_THAT(mutatedBy(Results, AST.get()), ElementsAre("(x << ... << args)")); + + AST = buildASTFromCodeWithArgs( + "struct Stream {};" + "template void concatenate(Args... args) " + "{ Stream x; (args << ... << x); }", + {"-fno-delayed-template-parsing"}); + Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_THAT(mutatedBy(Results, AST.get()), ElementsAre("(args << ... << x)")); + + AST = buildASTFromCodeWithArgs( + "struct Stream {};" + "template void concatenate(Args... 
args) " + "{ Stream x; (..., (x << args)); }", + {"-fno-delayed-template-parsing"}); + Results = match(withEnclosingCompound(declRefTo("x")), AST->getASTContext()); + EXPECT_THAT(mutatedBy(Results, AST.get()), ElementsAre("x << args")); +} + // Section: expression as call argument TEST(ExprMutationAnalyzerTest, ByValueArgument) { From 3f3a3e873a684e924d9d873f9e094e217e323f2a Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Mon, 22 Jan 2024 12:54:30 -0800 Subject: [PATCH 481/843] Revert "[lli] Revisit Orc debug output tests" (#79055) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reverts llvm/llvm-project#76822 This fails to build with gcc 7.5: ``` llvm/tools/lli/lli.cpp:1087:16: error: could not convert ‘Obj’ from ‘std::unique_ptr’ to ‘llvm::Expected >’ return Obj; ^~~ ``` --- ...tor.ll => debug-descriptor-elf-minimal.ll} | 30 ++-- .../OrcLazy/debug-objects-elf-minimal.ll | 82 +++++++++++ .../ExecutionEngine/OrcLazy/debug-objects.ll | 62 -------- llvm/tools/lli/CMakeLists.txt | 1 + llvm/tools/lli/ExecutionUtils.cpp | 122 ++++++++++++++++ llvm/tools/lli/ExecutionUtils.h | 60 ++++++++ llvm/tools/lli/lli.cpp | 135 +++++------------- 7 files changed, 322 insertions(+), 170 deletions(-) rename llvm/test/ExecutionEngine/OrcLazy/{debug-descriptor.ll => debug-descriptor-elf-minimal.ll} (53%) create mode 100644 llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll delete mode 100644 llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll create mode 100644 llvm/tools/lli/ExecutionUtils.cpp create mode 100644 llvm/tools/lli/ExecutionUtils.h diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll similarity index 53% rename from llvm/test/ExecutionEngine/OrcLazy/debug-descriptor.ll rename to llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll index bf7f13fb5439f..9bac3897864f7 100644 --- a/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor.ll +++ b/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll @@ -1,17 +1,29 @@ -; REQUIRES: native && target-x86_64 +; REQUIRES: native && x86_64-linux -; RUN: lli --jit-linker=rtdyld --orc-lazy-debug=jit-debug-descriptor %s 2>&1 | FileCheck %s -; RUN: lli --jit-linker=jitlink --orc-lazy-debug=jit-debug-descriptor %s 2>&1 | FileCheck %s +; RUN: lli --jit-linker=rtdyld \ +; RUN: --generate=__dump_jit_debug_descriptor %s | FileCheck %s ; -; Initial entry should be empty: -; CHECK: jit_debug_descriptor 0x0000000000000000 +; RUN: lli --jit-linker=jitlink \ +; RUN: --generate=__dump_jit_debug_descriptor %s | FileCheck %s ; -; After adding the module it must not be empty anymore: -; CHECK: jit_debug_descriptor 0x -; CHECK-NOT: 000000000000000 -; CHECK-SAME: {{[048c]}} +; CHECK: Reading __jit_debug_descriptor at 0x{{.*}} +; CHECK: Version: 1 +; CHECK: Action: JIT_REGISTER_FN +; CHECK: Entry Symbol File Size Previous Entry +; CHECK: [ 0] 0x{{.*}} 0x{{.*}} {{.*}} 0x0000000000000000 + +target triple = "x86_64-unknown-unknown-elf" + +; Built-in symbol provided by the JIT +declare void @__dump_jit_debug_descriptor(ptr) + +; Host-process symbol from the GDB JIT interface +@__jit_debug_descriptor = external global i8, align 1 define i32 @main() !dbg !9 { + %1 = alloca i32, align 4 + store i32 0, ptr %1, align 4 + call void @__dump_jit_debug_descriptor(ptr @__jit_debug_descriptor), !dbg !13 ret i32 0, !dbg !14 } diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll 
b/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll new file mode 100644 index 0000000000000..31fe730b74036 --- /dev/null +++ b/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll @@ -0,0 +1,82 @@ +; REQUIRES: native && x86_64-linux + +; In-memory debug-object contains some basic DWARF +; +; RUN: lli --jit-linker=rtdyld \ +; RUN: --generate=__dump_jit_debug_objects %s | llvm-dwarfdump --diff - | FileCheck %s +; +; RUN: lli --jit-linker=jitlink \ +; RUN: --generate=__dump_jit_debug_objects %s | llvm-dwarfdump --diff - | FileCheck %s +; +; CHECK: -: file format elf64-x86-64 +; CHECK: .debug_info contents: +; CHECK: 0x00000000: Compile Unit: length = 0x00000047, format = DWARF32, version = 0x0004, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x0000004b) +; CHECK: DW_TAG_compile_unit +; CHECK: DW_AT_producer ("compiler version") +; CHECK: DW_AT_language (DW_LANG_C99) +; CHECK: DW_AT_name ("source-file.c") +; CHECK: DW_AT_stmt_list () +; CHECK: DW_AT_comp_dir ("/workspace") +; CHECK: DW_AT_low_pc () +; CHECK: DW_AT_high_pc () +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_low_pc () +; CHECK: DW_AT_high_pc () +; CHECK: DW_AT_frame_base (DW_OP_reg7 RSP) +; CHECK: DW_AT_name ("main") +; CHECK: DW_AT_decl_file ("/workspace/source-file.c") +; CHECK: DW_AT_decl_line (4) +; CHECK: DW_AT_type ("int") +; CHECK: DW_AT_external (true) +; CHECK: DW_TAG_base_type +; CHECK: DW_AT_name ("int") +; CHECK: DW_AT_encoding (DW_ATE_signed) +; CHECK: DW_AT_byte_size (0x04) +; CHECK: NULL + +; Text section of the in-memory debug-object has a non-null load-address +; +; RUN: lli --jit-linker=rtdyld \ +; RUN: --generate=__dump_jit_debug_objects %s | llvm-objdump --section-headers - | \ +; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s +; +; RUN: lli --jit-linker=jitlink \ +; RUN: --generate=__dump_jit_debug_objects %s | llvm-objdump --section-headers - | \ +; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s +; +; CHECK_LOAD_ADDR-NOT: {{[0-9]*}} .ltext {{.*}} 0000000000000000 TEXT + +target triple = "x86_64-unknown-unknown-elf" + +; Built-in symbol provided by the JIT +declare void @__dump_jit_debug_objects(ptr) + +; Host-process symbol from the GDB JIT interface +@__jit_debug_descriptor = external global i8, align 1 + +define i32 @main() !dbg !9 { + %1 = alloca i32, align 4 + store i32 0, ptr %1, align 4 + call void @__dump_jit_debug_objects(ptr @__jit_debug_descriptor), !dbg !13 + ret i32 0, !dbg !14 +} + +!llvm.module.flags = !{!0, !1, !2, !3, !4} +!llvm.dbg.cu = !{!5} +!llvm.ident = !{!8} + +!0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 6]} +!1 = !{i32 7, !"Dwarf Version", i32 4} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !{i32 1, !"wchar_size", i32 4} +!4 = !{i32 7, !"PIC Level", i32 2} +!5 = distinct !DICompileUnit(language: DW_LANG_C99, file: !6, producer: "compiler version", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !7, nameTableKind: None) +!6 = !DIFile(filename: "source-file.c", directory: "/workspace") +!7 = !{} +!8 = !{!"compiler version"} +!9 = distinct !DISubprogram(name: "main", scope: !6, file: !6, line: 4, type: !10, scopeLine: 4, spFlags: DISPFlagDefinition, unit: !5, retainedNodes: !7) +!10 = !DISubroutineType(types: !11) +!11 = !{!12} +!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!13 = !DILocation(line: 5, column: 3, scope: !9) +!14 = !DILocation(line: 6, column: 3, scope: !9) diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll 
deleted file mode 100644 index 1f48bd9a2318a..0000000000000 --- a/llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll +++ /dev/null @@ -1,62 +0,0 @@ -; REQUIRES: native && x86_64-linux - -; In-memory debug-objects contain DWARF -; -; RUN: lli --jit-linker=rtdyld --orc-lazy-debug=jit-debug-objects %s | llvm-dwarfdump --diff - | FileCheck %s -; RUN: lli --jit-linker=jitlink --orc-lazy-debug=jit-debug-objects %s | llvm-dwarfdump --diff - | FileCheck %s -; -; CHECK: -: file format elf64-x86-64 -; TODO: Synthesized Mach-O objects error out with: -; truncated or malformed object (offset field of section 8 in -; LC_SEGMENT_64 command 0 extends past the end of the file) -; -; CHECK: .debug_info contents: -; CHECK: format = DWARF32 -; CHECK: DW_TAG_compile_unit -; CHECK: DW_AT_producer ("clang version 18.0.0git") -; CHECK: DW_AT_language (DW_LANG_C11) -; CHECK: DW_AT_name ("source-file.c") -; CHECK: DW_AT_comp_dir ("/workspace") -; CHECK: DW_TAG_subprogram -; CHECK: DW_AT_frame_base (DW_OP_reg7 RSP) -; CHECK: DW_AT_name ("main") -; CHECK: DW_AT_decl_file ("/workspace/source-file.c") -; CHECK: DW_AT_decl_line (1) -; CHECK: DW_AT_type ("int") -; CHECK: DW_AT_external (true) -; CHECK: DW_TAG_base_type -; CHECK: DW_AT_name ("int") -; CHECK: DW_AT_encoding (DW_ATE_signed) -; CHECK: DW_AT_byte_size (0x04) -; CHECK: NULL - -; Text section of the in-memory debug-objects have non-null load-address -; -; RUN: lli --jit-linker=rtdyld --orc-lazy-debug=jit-debug-objects %s | \ -; RUN: llvm-objdump --section-headers - | \ -; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s -; RUN: lli --jit-linker=jitlink --orc-lazy-debug=jit-debug-objects %s | \ -; RUN: llvm-objdump --section-headers - | \ -; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s -; -; CHECK_LOAD_ADDR: .text -; CHECK_LOAD_ADDR-NOT: 0000000000000000 -; CHECK_LOAD_ADDR-SAME: TEXT - -define i32 @main() !dbg !3 { -entry: - ret i32 0, !dbg !8 -} - -!llvm.module.flags = !{!0} -!llvm.dbg.cu = !{!2} - -!0 = !{i32 2, !"Debug Info Version", i32 3} -!1 = !DIFile(filename: "source-file.c", directory: "/workspace") -!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 18.0.0git", emissionKind: FullDebug) -!3 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !4, scopeLine: 1, unit: !2, retainedNodes: !7) -!4 = !DISubroutineType(types: !5) -!5 = !{!6} -!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!7 = !{} -!8 = !DILocation(line: 1, column: 14, scope: !3) diff --git a/llvm/tools/lli/CMakeLists.txt b/llvm/tools/lli/CMakeLists.txt index e3fca225a2275..315de28e12b57 100644 --- a/llvm/tools/lli/CMakeLists.txt +++ b/llvm/tools/lli/CMakeLists.txt @@ -53,6 +53,7 @@ endif( LLVM_USE_PERF ) add_llvm_tool(lli lli.cpp + ExecutionUtils.cpp DEPENDS intrinsics_gen diff --git a/llvm/tools/lli/ExecutionUtils.cpp b/llvm/tools/lli/ExecutionUtils.cpp new file mode 100644 index 0000000000000..b6cc3bb174d3c --- /dev/null +++ b/llvm/tools/lli/ExecutionUtils.cpp @@ -0,0 +1,122 @@ +//===---- ExecutionUtils.cpp - Utilities for executing functions in lli ---===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "ExecutionUtils.h" + +#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/FormatVariadic.h" +#include "llvm/Support/raw_ostream.h" + +#include +#include + +namespace llvm { + +template static void outsv(const char *Fmt, Ts &&...Vals) { + outs() << formatv(Fmt, Vals...); +} + +static const char *actionFlagToStr(uint32_t ActionFlag) { + switch (ActionFlag) { + case JIT_NOACTION: + return "JIT_NOACTION"; + case JIT_REGISTER_FN: + return "JIT_REGISTER_FN"; + case JIT_UNREGISTER_FN: + return "JIT_UNREGISTER_FN"; + } + return ""; +} + +// Declarations follow the GDB JIT interface (version 1, 2009) and must match +// those of the DYLD used for testing. +// +// Sample output: +// +// Reading __jit_debug_descriptor at 0x0000000000404048 +// +// Version: 0 +// Action: JIT_REGISTER_FN +// +// Entry Symbol File Size Previous Entry +// [ 0] 0x0000000000451290 0x0000000000002000 200 0x0000000000000000 +// [ 1] 0x0000000000451260 0x0000000000001000 100 0x0000000000451290 +// ... +// +static void dumpDebugDescriptor(void *Addr) { + outsv("Reading __jit_debug_descriptor at {0}\n\n", Addr); + + jit_descriptor *Descriptor = reinterpret_cast(Addr); + outsv("Version: {0}\n", Descriptor->version); + outsv("Action: {0}\n\n", actionFlagToStr(Descriptor->action_flag)); + outsv("{0,11} {1,24} {2,15} {3,14}\n", "Entry", "Symbol File", "Size", + "Previous Entry"); + + unsigned Idx = 0; + for (auto *Entry = Descriptor->first_entry; Entry; Entry = Entry->next_entry) + outsv("[{0,2}] {1:X16} {2:X16} {3,8:D} {4}\n", Idx++, Entry, + reinterpret_cast(Entry->symfile_addr), + Entry->symfile_size, Entry->prev_entry); +} + +static LLIBuiltinFunctionGenerator *Generator = nullptr; + +static void dumpDebugObjects(void *Addr) { + jit_descriptor *Descriptor = reinterpret_cast(Addr); + for (auto *Entry = Descriptor->first_entry; Entry; Entry = Entry->next_entry) + Generator->appendDebugObject(Entry->symfile_addr, Entry->symfile_size); +} + +LLIBuiltinFunctionGenerator::LLIBuiltinFunctionGenerator( + std::vector Enabled, orc::MangleAndInterner &Mangle) + : TestOut(nullptr) { + Generator = this; + for (BuiltinFunctionKind F : Enabled) { + switch (F) { + case BuiltinFunctionKind::DumpDebugDescriptor: + expose(Mangle("__dump_jit_debug_descriptor"), &dumpDebugDescriptor); + break; + case BuiltinFunctionKind::DumpDebugObjects: + expose(Mangle("__dump_jit_debug_objects"), &dumpDebugObjects); + TestOut = createToolOutput(); + break; + } + } +} + +Error LLIBuiltinFunctionGenerator::tryToGenerate( + orc::LookupState &LS, orc::LookupKind K, orc::JITDylib &JD, + orc::JITDylibLookupFlags JDLookupFlags, + const orc::SymbolLookupSet &Symbols) { + orc::SymbolMap NewSymbols; + for (const auto &NameFlags : Symbols) { + auto It = BuiltinFunctions.find(NameFlags.first); + if (It != BuiltinFunctions.end()) + NewSymbols.insert(*It); + } + + if (NewSymbols.empty()) + return Error::success(); + + return JD.define(absoluteSymbols(std::move(NewSymbols))); +} + +// static +std::unique_ptr +LLIBuiltinFunctionGenerator::createToolOutput() { + std::error_code EC; + auto TestOut = std::make_unique("-", EC, sys::fs::OF_None); + if (EC) { + errs() << "Error creating tool output file: " << EC.message() << '\n'; + exit(1); + } + return TestOut; +} + +} // namespace llvm diff --git a/llvm/tools/lli/ExecutionUtils.h 
b/llvm/tools/lli/ExecutionUtils.h new file mode 100644 index 0000000000000..6bf9cd58e031b --- /dev/null +++ b/llvm/tools/lli/ExecutionUtils.h @@ -0,0 +1,60 @@ +//===- ExecutionUtils.h - Utilities for executing code in lli ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Contains utilities for executing code in lli. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLI_EXECUTIONUTILS_H +#define LLVM_TOOLS_LLI_EXECUTIONUTILS_H + +#include "llvm/ExecutionEngine/JITSymbol.h" +#include "llvm/ExecutionEngine/Orc/Core.h" +#include "llvm/ExecutionEngine/Orc/Mangling.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ToolOutputFile.h" + +#include +#include + +namespace llvm { + +enum class BuiltinFunctionKind { + DumpDebugDescriptor, + DumpDebugObjects, +}; + +// Utility class to expose symbols for special-purpose functions to the JIT. +class LLIBuiltinFunctionGenerator : public orc::DefinitionGenerator { +public: + LLIBuiltinFunctionGenerator(std::vector Enabled, + orc::MangleAndInterner &Mangle); + + Error tryToGenerate(orc::LookupState &LS, orc::LookupKind K, + orc::JITDylib &JD, orc::JITDylibLookupFlags JDLookupFlags, + const orc::SymbolLookupSet &Symbols) override; + + void appendDebugObject(const char *Addr, size_t Size) { + TestOut->os().write(Addr, Size); + } + +private: + orc::SymbolMap BuiltinFunctions; + std::unique_ptr TestOut; + + template void expose(orc::SymbolStringPtr Name, T *Handler) { + BuiltinFunctions[Name] = {orc::ExecutorAddr::fromPtr(Handler), + JITSymbolFlags::Exported}; + } + + static std::unique_ptr createToolOutput(); +}; + +} // end namespace llvm + +#endif // LLVM_TOOLS_LLI_EXECUTIONUTILS_H diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp index 15acf6d38dba6..8a7ea2d3d0c58 100644 --- a/llvm/tools/lli/lli.cpp +++ b/llvm/tools/lli/lli.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#include "ExecutionUtils.h" #include "ForwardingMemoryManager.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -32,7 +33,6 @@ #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" -#include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h" #include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" @@ -62,7 +62,6 @@ #include "llvm/Support/Program.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" -#include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" @@ -255,29 +254,35 @@ namespace { NoDump, DumpFuncsToStdOut, DumpModsToStdOut, - DumpModsToDisk, - DumpDebugDescriptor, - DumpDebugObjects, + DumpModsToDisk }; cl::opt OrcDumpKind( "orc-lazy-debug", cl::desc("Debug dumping for the orc-lazy JIT."), cl::init(DumpKind::NoDump), - cl::values( - clEnumValN(DumpKind::NoDump, "no-dump", "Don't dump anything."), - clEnumValN(DumpKind::DumpFuncsToStdOut, "funcs-to-stdout", - "Dump function names to stdout."), - 
clEnumValN(DumpKind::DumpModsToStdOut, "mods-to-stdout", - "Dump modules to stdout."), - clEnumValN(DumpKind::DumpModsToDisk, "mods-to-disk", - "Dump modules to the current " - "working directory. (WARNING: " - "will overwrite existing files)."), - clEnumValN(DumpKind::DumpDebugDescriptor, "jit-debug-descriptor", - "Dump __jit_debug_descriptor contents to stdout"), - clEnumValN(DumpKind::DumpDebugObjects, "jit-debug-objects", - "Dump __jit_debug_descriptor in-memory debug " - "objects as tool output")), + cl::values(clEnumValN(DumpKind::NoDump, "no-dump", + "Don't dump anything."), + clEnumValN(DumpKind::DumpFuncsToStdOut, "funcs-to-stdout", + "Dump function names to stdout."), + clEnumValN(DumpKind::DumpModsToStdOut, "mods-to-stdout", + "Dump modules to stdout."), + clEnumValN(DumpKind::DumpModsToDisk, "mods-to-disk", + "Dump modules to the current " + "working directory. (WARNING: " + "will overwrite existing files).")), + cl::Hidden); + + cl::list GenerateBuiltinFunctions( + "generate", + cl::desc("Provide built-in functions for access by JITed code " + "(jit-kind=orc-lazy only)"), + cl::values(clEnumValN(BuiltinFunctionKind::DumpDebugDescriptor, + "__dump_jit_debug_descriptor", + "Dump __jit_debug_descriptor contents to stdout"), + clEnumValN(BuiltinFunctionKind::DumpDebugObjects, + "__dump_jit_debug_objects", + "Dump __jit_debug_descriptor in-memory debug " + "objects as tool output")), cl::Hidden); ExitOnError ExitOnErr; @@ -751,41 +756,9 @@ int main(int argc, char **argv, char * const *envp) { return Result; } -// JITLink debug support plugins put information about JITed code in this GDB -// JIT Interface global from OrcTargetProcess. -extern "C" struct jit_descriptor __jit_debug_descriptor; - -static struct jit_code_entry * -findNextDebugDescriptorEntry(struct jit_code_entry *Latest) { - if (Latest == nullptr) - return __jit_debug_descriptor.first_entry; - if (Latest->next_entry) - return Latest->next_entry; - return nullptr; -} - -static ToolOutputFile &claimToolOutput() { - static std::unique_ptr ToolOutput = nullptr; - if (ToolOutput) { - WithColor::error(errs(), "lli") - << "Can not claim stdout for tool output twice\n"; - exit(1); - } - std::error_code EC; - ToolOutput = std::make_unique("-", EC, sys::fs::OF_None); - if (EC) { - WithColor::error(errs(), "lli") - << "Failed to create tool output file: " << EC.message() << "\n"; - exit(1); - } - return *ToolOutput; -} - -static std::function createIRDebugDumper() { +static std::function createDebugDumper() { switch (OrcDumpKind) { case DumpKind::NoDump: - case DumpKind::DumpDebugDescriptor: - case DumpKind::DumpDebugObjects: return [](Module &M) {}; case DumpKind::DumpFuncsToStdOut: @@ -827,43 +800,6 @@ static std::function createIRDebugDumper() { llvm_unreachable("Unknown DumpKind"); } -static std::function createObjDebugDumper() { - switch (OrcDumpKind) { - case DumpKind::NoDump: - case DumpKind::DumpFuncsToStdOut: - case DumpKind::DumpModsToStdOut: - case DumpKind::DumpModsToDisk: - return [](MemoryBuffer &) {}; - - case DumpKind::DumpDebugDescriptor: { - // Dump the empty descriptor at startup once - fprintf(stderr, "jit_debug_descriptor 0x%016" PRIx64 "\n", - pointerToJITTargetAddress(__jit_debug_descriptor.first_entry)); - return [](MemoryBuffer &) { - // Dump new entries as they appear - static struct jit_code_entry *Latest = nullptr; - while (auto *NewEntry = findNextDebugDescriptorEntry(Latest)) { - fprintf(stderr, "jit_debug_descriptor 0x%016" PRIx64 "\n", - pointerToJITTargetAddress(NewEntry)); - Latest = NewEntry; - 
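As a concrete sketch of what the flag implies (the file name, values, and
offload architecture below are placeholders, not taken from the tests this
patch adds), the following translation unit behaves the same whether the
requires directive is spelled out or dropped in favor of `-fopenmp-force-usm`:

    // usm-sketch.c -- hypothetical example; with -fopenmp-force-usm the
    // directive on the next line may be omitted.
    #pragma omp requires unified_shared_memory

    #include <stdlib.h>

    #pragma omp declare target
    int *Ptr; // declare target global, reached through the host allocation
    #pragma omp end declare target

    int main(void) {
      Ptr = (int *)malloc(sizeof(int));
      *Ptr = 41;
      // No map clause is needed: host and device share the allocation, so
      // no host-to-device data movement should be reported for *Ptr.
    #pragma omp target
      *Ptr += 1;
      return *Ptr == 42 ? 0 : 1;
    }

A hypothetical invocation would be
`clang -fopenmp --offload-arch=gfx90a -fopenmp-force-usm usm-sketch.c`;
keeping the directive in the source as well is harmless, matching the note
above about using the flag in conjunction with the clause.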
} - }; - } - - case DumpKind::DumpDebugObjects: { - return [](MemoryBuffer &Obj) { - static struct jit_code_entry *Latest = nullptr; - static ToolOutputFile &ToolOutput = claimToolOutput(); - while (auto *NewEntry = findNextDebugDescriptorEntry(Latest)) { - ToolOutput.os().write(NewEntry->symfile_addr, NewEntry->symfile_size); - Latest = NewEntry; - } - }; - } - } - llvm_unreachable("Unknown DumpKind"); -} - Error loadDylibs() { for (const auto &Dylib : Dylibs) { std::string ErrMsg; @@ -1065,7 +1001,8 @@ int runOrcJIT(const char *ProgName) { if (PerModuleLazy) J->setPartitionFunction(orc::CompileOnDemandLayer::compileWholeModule); - auto IRDump = createIRDebugDumper(); + auto Dump = createDebugDumper(); + J->getIRTransformLayer().setTransform( [&](orc::ThreadSafeModule TSM, const orc::MaterializationResponsibility &R) { @@ -1074,18 +1011,18 @@ int runOrcJIT(const char *ProgName) { dbgs() << "Bad module: " << &M << "\n"; exit(1); } - IRDump(M); + Dump(M); }); return TSM; }); - auto ObjDump = createObjDebugDumper(); - J->getObjTransformLayer().setTransform( - [&](std::unique_ptr Obj) - -> Expected> { - ObjDump(*Obj); - return Obj; - }); + if (GenerateBuiltinFunctions.size() > 0) { + // Add LLI builtins. + orc::MangleAndInterner Mangle(J->getExecutionSession(), J->getDataLayout()); + J->getMainJITDylib().addGenerator( + std::make_unique(GenerateBuiltinFunctions, + Mangle)); + } // If this is a Mingw or Cygwin executor then we need to alias __main to // orc_rt_int_void_return_0. From fa4780fa6cc36188b84b2a977ac15351c39d45dd Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Mon, 22 Jan 2024 21:59:26 +0100 Subject: [PATCH 482/843] [OpenMP][USM] Introduces -fopenmp-force-usm flag (#76571) This flag forces the compiler to generate code for OpenMP target regions as if the user specified the #pragma omp requires unified_shared_memory in each source file. The option does not have a -fno-* friend since OpenMP requires the unified_shared_memory clause to be present in all source files. Since this flag does no harm if the clause is present, it can be used in conjunction. My understanding is that USM should not be turned off selectively, hence, no -fno- version. This adds a basic test to check the correct generation of double indirect access to declare target globals in USM mode vs non-USM mode. Which I think is the only difference observable in code generation. This runtime test checks for the (non-)occurence of data movement between host and device. It does one run without the flag and one with the flag to also see that both versions behave as expected. In the case w/o the new flag data movement between host and device is expected. In the case with the flag such data movement should not be present / reported. 
--- clang/include/clang/Basic/LangOptions.def | 1 + clang/include/clang/Driver/Options.td | 4 + clang/lib/CodeGen/CGOpenMPRuntime.cpp | 7 ++ clang/lib/Driver/ToolChains/Clang.cpp | 2 + clang/test/OpenMP/force-usm.c | 79 +++++++++++++++++++ openmp/libomptarget/test/lit.cfg | 8 ++ .../test/offloading/force-usm.cpp | 59 ++++++++++++++ 7 files changed, 160 insertions(+) create mode 100644 clang/test/OpenMP/force-usm.c create mode 100644 openmp/libomptarget/test/offloading/force-usm.cpp diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def index 47cbd3d30f847..8fc75e1cca039 100644 --- a/clang/include/clang/Basic/LangOptions.def +++ b/clang/include/clang/Basic/LangOptions.def @@ -260,6 +260,7 @@ LANGOPT(OpenMPTeamSubscription , 1, 0, "Assume distributed loops do not have mo LANGOPT(OpenMPNoThreadState , 1, 0, "Assume that no thread in a parallel region will modify an ICV.") LANGOPT(OpenMPNoNestedParallelism , 1, 0, "Assume that no thread in a parallel region will encounter a parallel region") LANGOPT(OpenMPOffloadMandatory , 1, 0, "Assert that offloading is mandatory and do not create a host fallback.") +LANGOPT(OpenMPForceUSM , 1, 0, "Enable OpenMP unified shared memory mode via compiler.") LANGOPT(NoGPULib , 1, 0, "Indicate a build without the standard GPU libraries.") LANGOPT(RenderScript , 1, 0, "RenderScript") diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 637b10652fcd9..819f6f1a15c3f 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -3459,6 +3459,10 @@ def fopenmp_offload_mandatory : Flag<["-"], "fopenmp-offload-mandatory">, Group< Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>, HelpText<"Do not create a host fallback if offloading to the device fails.">, MarshallingInfoFlag>; +def fopenmp_force_usm : Flag<["-"], "fopenmp-force-usm">, Group, + Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CC1Option]>, + HelpText<"Force behavior as if the user specified #pragma omp requires unified_shared_memory.">, + MarshallingInfoFlag>; def fopenmp_target_jit : Flag<["-"], "fopenmp-target-jit">, Group, Flags<[NoArgumentUnused]>, Visibility<[ClangOption, CLOption]>, HelpText<"Emit code that can be JIT compiled for OpenMP offloading. Implies -foffload-lto=full">; diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index ea6645a39e832..4855e7410a015 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -1044,6 +1044,13 @@ CGOpenMPRuntime::CGOpenMPRuntime(CodeGenModule &CGM) ? CGM.getLangOpts().OMPHostIRFile : StringRef{}); OMPBuilder.setConfig(Config); + + // The user forces the compiler to behave as if omp requires + // unified_shared_memory was given.
+ if (CGM.getLangOpts().OpenMPForceUSM) { + HasRequiresUnifiedSharedMemory = true; + OMPBuilder.Config.setHasRequiresUnifiedSharedMemory(true); + } } void CGOpenMPRuntime::clear() { diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 776a13b958893..2f33943de45c5 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -6460,6 +6460,8 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back("-fopenmp-assume-no-nested-parallelism"); if (Args.hasArg(options::OPT_fopenmp_offload_mandatory)) CmdArgs.push_back("-fopenmp-offload-mandatory"); + if (Args.hasArg(options::OPT_fopenmp_force_usm)) + CmdArgs.push_back("-fopenmp-force-usm"); break; default: // By default, if Clang doesn't know how to generate useful OpenMP code diff --git a/clang/test/OpenMP/force-usm.c b/clang/test/OpenMP/force-usm.c new file mode 100644 index 0000000000000..5c63a9a5e7004 --- /dev/null +++ b/clang/test/OpenMP/force-usm.c @@ -0,0 +1,79 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --include-generated-funcs --replace-value-regex "__omp_offloading_[0-9a-z]+_[0-9a-z]+" "pl_cond[.].+[.|,]" --prefix-filecheck-ir-name _ --version 3 +// REQUIRES: amdgpu-registered-target + +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -fopenmp-force-usm -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-force-usm -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-USM %s + +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple x86_64-unknown-unknown -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm-bc %s -o %t-ppc-host.bc +// RUN: %clang_cc1 -fopenmp -x c++ -std=c++11 -triple amdgcn-amd-amdhsa -fopenmp-targets=amdgcn-amd-amdhsa -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck -check-prefix=CHECK-DEFAULT %s +// expected-no-diagnostics + +extern "C" void *malloc(unsigned int b); + +int GI; +#pragma omp declare target +int *pGI; +#pragma omp end declare target + +int main(void) { + + GI = 0; + + pGI = (int *) malloc(sizeof(int)); + *pGI = 42; + +#pragma omp target map(pGI[:1], GI) + { + GI = 1; + *pGI = 2; + } + + return 0; +} + +// CHECK-USM-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25( +// CHECK-USM-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-USM-NEXT: entry: +// CHECK-USM-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-USM-NEXT: [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-USM-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-USM-NEXT: [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr +// CHECK-USM-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-USM-NEXT: store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8 +// CHECK-USM-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8 +// CHECK-USM-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_kernel_environment to ptr), ptr [[DYN_PTR]]) +// CHECK-USM-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 
+// CHECK-USM-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-USM: user_code.entry: +// CHECK-USM-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-USM-NEXT: [[TMP2:%.*]] = load ptr, ptr @pGI_decl_tgt_ref_ptr, align 8 +// CHECK-USM-NEXT: [[TMP3:%.*]] = load ptr, ptr [[TMP2]], align 8 +// CHECK-USM-NEXT: store i32 2, ptr [[TMP3]], align 4 +// CHECK-USM-NEXT: call void @__kmpc_target_deinit() +// CHECK-USM-NEXT: ret void +// CHECK-USM: worker.exit: +// CHECK-USM-NEXT: ret void +// +// +// CHECK-DEFAULT-LABEL: define weak_odr protected amdgpu_kernel void @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25( +// CHECK-DEFAULT-SAME: ptr noalias noundef [[DYN_PTR:%.*]], ptr noundef nonnull align 4 dereferenceable(4) [[GI:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-DEFAULT-NEXT: entry: +// CHECK-DEFAULT-NEXT: [[DYN_PTR_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-DEFAULT-NEXT: [[GI_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CHECK-DEFAULT-NEXT: [[DYN_PTR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DYN_PTR_ADDR]] to ptr +// CHECK-DEFAULT-NEXT: [[GI_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[GI_ADDR]] to ptr +// CHECK-DEFAULT-NEXT: store ptr [[DYN_PTR]], ptr [[DYN_PTR_ADDR_ASCAST]], align 8 +// CHECK-DEFAULT-NEXT: store ptr [[GI]], ptr [[GI_ADDR_ASCAST]], align 8 +// CHECK-DEFAULT-NEXT: [[TMP0:%.*]] = load ptr, ptr [[GI_ADDR_ASCAST]], align 8 +// CHECK-DEFAULT-NEXT: [[TMP1:%.*]] = call i32 @__kmpc_target_init(ptr addrspacecast (ptr addrspace(1) @{{__omp_offloading_[0-9a-z]+_[0-9a-z]+}}_main_l25_kernel_environment to ptr), ptr [[DYN_PTR]]) +// CHECK-DEFAULT-NEXT: [[EXEC_USER_CODE:%.*]] = icmp eq i32 [[TMP1]], -1 +// CHECK-DEFAULT-NEXT: br i1 [[EXEC_USER_CODE]], label [[USER_CODE_ENTRY:%.*]], label [[WORKER_EXIT:%.*]] +// CHECK-DEFAULT: user_code.entry: +// CHECK-DEFAULT-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CHECK-DEFAULT-NEXT: [[TMP2:%.*]] = load ptr, ptr addrspacecast (ptr addrspace(1) @pGI to ptr), align 8 +// CHECK-DEFAULT-NEXT: store i32 2, ptr [[TMP2]], align 4 +// CHECK-DEFAULT-NEXT: call void @__kmpc_target_deinit() +// CHECK-DEFAULT-NEXT: ret void +// CHECK-DEFAULT: worker.exit: +// CHECK-DEFAULT-NEXT: ret void +// diff --git a/openmp/libomptarget/test/lit.cfg b/openmp/libomptarget/test/lit.cfg index a0df7314fe974..d912b622c05ba 100644 --- a/openmp/libomptarget/test/lit.cfg +++ b/openmp/libomptarget/test/lit.cfg @@ -190,6 +190,8 @@ for libomptarget_target in config.libomptarget_all_targets: "%libomptarget-compile-and-run-" + libomptarget_target)) config.substitutions.append(("%libomptarget-compilexx-generic", "%libomptarget-compilexx-" + libomptarget_target)) + config.substitutions.append(("%libomptarget-compilexxx-generic-force-usm", + "%libomptarget-compilexxx-force-usm-" + libomptarget_target)) config.substitutions.append(("%libomptarget-compile-generic", "%libomptarget-compile-" + libomptarget_target)) config.substitutions.append(("%libomptarget-compile-fortran-generic", @@ -247,6 +249,9 @@ for libomptarget_target in config.libomptarget_all_targets: config.substitutions.append(("%libomptarget-compilexx-" + \ libomptarget_target, \ "%clangxx-" + libomptarget_target + add_libraries(" %s -o %t"))) + config.substitutions.append(("%libomptarget-compilexxx-force-usm-" + + libomptarget_target, "%clangxxx-force-usm-" + libomptarget_target + \ + add_libraries(" %s -o %t"))) config.substitutions.append(("%libomptarget-compile-" + \ libomptarget_target, \ "%clang-" + libomptarget_target + add_libraries(" %s -o 
%t"))) @@ -284,6 +289,9 @@ for libomptarget_target in config.libomptarget_all_targets: config.substitutions.append(("%clangxx-" + libomptarget_target, \ "%clangxx %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\ remove_suffix_if_present(libomptarget_target))) + config.substitutions.append(("%clangxxx-force-usm-" + libomptarget_target, \ + "%clangxx %openmp_flags -fopenmp-force-usm %cuda_flags %flags %flags_clang -fopenmp-targets=" +\ + remove_suffix_if_present(libomptarget_target))) config.substitutions.append(("%clang-" + libomptarget_target, \ "%clang %openmp_flags %cuda_flags %flags %flags_clang -fopenmp-targets=" +\ remove_suffix_if_present(libomptarget_target))) diff --git a/openmp/libomptarget/test/offloading/force-usm.cpp b/openmp/libomptarget/test/offloading/force-usm.cpp new file mode 100644 index 0000000000000..5bddecd5b4675 --- /dev/null +++ b/openmp/libomptarget/test/offloading/force-usm.cpp @@ -0,0 +1,59 @@ +// clang-format off +// RUN: %libomptarget-compilexx-generic +// RUN: env LIBOMPTARGET_INFO=32 %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=NO-USM +// +// RUN: %libomptarget-compilexxx-generic-force-usm +// RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=32 \ +// RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=FORCE-USM +// +// UNSUPPORTED: nvptx64-nvidia-cuda +// UNSUPPORTED: nvptx64-nvidia-cuda-LTO +// clang-format on + +#include +#include +#include + +int GI; +#pragma omp declare target +int *pGI; +#pragma omp end declare target + +int main(void) { + + GI = 0; + // Implicit mappings + int alpha = 1; + int beta[3] = {2, 5, 8}; + + // Require map clauses for non-USM execution + pGI = (int *)malloc(sizeof(int)); + *pGI = 42; + +#pragma omp target map(pGI[ : 1], GI) + { + GI = 1 * alpha; + *pGI = 2 * beta[1]; + } + + assert(GI == 1); + assert(*pGI == 10); + + printf("SUCCESS\n"); + + return 0; +} + +// clang-format off +// NO-USM: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4 +// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=12 +// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=4 +// NO-USM-NEXT: omptarget device 0 info: Copying data from host to device, HstPtr={{.*}}, TgtPtr={{.*}}, Size=8, Name=pGI +// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4 +// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=12 +// NO-USM-NEXT: omptarget device 0 info: Copying data from device to host, TgtPtr={{.*}}, HstPtr={{.*}}, Size=4 +// NO-USM-NEXT: SUCCESS + +// FORCE-USM: SUCCESS +// +// clang-format on From 80b67eebd2ca55c571031ee8a6d23ce3c24a86db Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 22 Jan 2024 13:05:47 -0800 Subject: [PATCH 483/843] [RISCV] Add Zic64b, Ziccamoa, Ziccif, Zicclsm, Ziccrse, and Za64rs to sifive-p450. 
(#79030) --- clang/test/Driver/riscv-cpus.c | 6 ++++++ llvm/lib/Target/RISCV/RISCVProcessors.td | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c index d181755bb5850..015df83e77800 100644 --- a/clang/test/Driver/riscv-cpus.c +++ b/clang/test/Driver/riscv-cpus.c @@ -228,14 +228,20 @@ // MCPU-SIFIVE-P450-SAME: "-target-feature" "+f" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+d" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+c" +// MCPU-SIFIVE-P450-SAME: "-target-feature" "+zic64b" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zicbom" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zicbop" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zicboz" +// MCPU-SIFIVE-P450-SAME: "-target-feature" "+ziccamoa" +// MCPU-SIFIVE-P450-SAME: "-target-feature" "+ziccif" +// MCPU-SIFIVE-P450-SAME: "-target-feature" "+zicclsm" +// MCPU-SIFIVE-P450-SAME: "-target-feature" "+ziccrse" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zicsr" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zifencei" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zihintntl" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zihintpause" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zihpm" +// MCPU-SIFIVE-P450-SAME: "-target-feature" "+za64rs" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zfhmin" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zba" // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zbb" diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index d1cd9ba1dd84d..c8cf3644b21fb 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -223,9 +223,15 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, FeatureStdExtF, FeatureStdExtD, FeatureStdExtC, + FeatureStdExtZa64rs, + FeatureStdExtZic64b, FeatureStdExtZicbop, FeatureStdExtZicbom, FeatureStdExtZicboz, + FeatureStdExtZiccamoa, + FeatureStdExtZiccif, + FeatureStdExtZicclsm, + FeatureStdExtZiccrse, FeatureStdExtZihintntl, FeatureStdExtZihintpause, FeatureStdExtZihpm, From 7c8030e9bbad913a6a87ddf9ea4d69ab33ffb6c4 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 22 Jan 2024 13:06:37 -0800 Subject: [PATCH 484/843] [RISCV] Combine HasStdExtZfhOrZfhmin and HasStdExtZfhmin. NFC (#78826) I kept the AssemblerPredicate and diagnostic from HasStdExtZfhOrZfhmin that mentions both extensions, but replaced all uses with HasStdExtZfhmin. Same for the Zhinxmin equivalent. --- llvm/lib/Target/RISCV/RISCVFeatures.td | 15 +------ llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td | 48 +++++++++++----------- 2 files changed, 26 insertions(+), 37 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index cbb096ba20ae6..c84436ca5d109 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -214,6 +214,7 @@ def FeatureStdExtZfhmin [FeatureStdExtF]>; def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">, AssemblerPredicate<(all_of FeatureStdExtZfhmin), + "'Zfh' (Half-Precision Floating-Point) or " "'Zfhmin' (Half-Precision Floating-Point Minimal)">; def FeatureStdExtZfh @@ -225,13 +226,6 @@ def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">, "'Zfh' (Half-Precision Floating-Point)">; def NoStdExtZfh : Predicate<"!Subtarget->hasStdExtZfh()">; -// FIXME: Remove this. 
-def HasStdExtZfhOrZfhmin - : Predicate<"Subtarget->hasStdExtZfhmin()">, - AssemblerPredicate<(all_of FeatureStdExtZfhmin), - "'Zfh' (Half-Precision Floating-Point) or " - "'Zfhmin' (Half-Precision Floating-Point Minimal)">; - def FeatureStdExtZfbfmin : SubtargetFeature<"experimental-zfbfmin", "HasStdExtZfbfmin", "true", "'Zfbfmin' (Scalar BF16 Converts)", @@ -278,6 +272,7 @@ def FeatureStdExtZhinxmin [FeatureStdExtZfinx]>; def HasStdExtZhinxmin : Predicate<"Subtarget->hasStdExtZhinxmin()">, AssemblerPredicate<(all_of FeatureStdExtZhinxmin), + "'Zhinx' (Half Float in Integer) or " "'Zhinxmin' (Half Float in Integer Minimal)">; def FeatureStdExtZhinx @@ -289,12 +284,6 @@ def HasStdExtZhinx : Predicate<"Subtarget->hasStdExtZhinx()">, "'Zhinx' (Half Float in Integer)">; def NoStdExtZhinx : Predicate<"!Subtarget->hasStdExtZhinx()">; -def HasStdExtZhinxOrZhinxmin - : Predicate<"Subtarget->hasStdExtZhinxmin()">, - AssemblerPredicate<(all_of FeatureStdExtZhinxmin), - "'Zhinx' (Half Float in Integer) or " - "'Zhinxmin' (Half Float in Integer Minimal)">; - // Compressed Extensions def FeatureStdExtC diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td index 055f130327886..2e0f754cdf7c8 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoZfh.td @@ -40,30 +40,30 @@ def FPR16INX : RegisterOperand { def ZfhExt : ExtInfo<"", "", [HasStdExtZfh], f16, FPR16, FPR32, ?, FPR16>; -def ZfhminExt : ExtInfo<"", "", [HasStdExtZfhOrZfhmin], +def ZfhminExt : ExtInfo<"", "", [HasStdExtZfhmin], f16, FPR16, FPR32, ?, FPR16>; def ZfhDExt : ExtInfo<"", "", [HasStdExtZfh, HasStdExtD], ?, ?, FPR32, FPR64, FPR16>; -def ZfhminDExt : ExtInfo<"", "", [HasStdExtZfhOrZfhmin, HasStdExtD], +def ZfhminDExt : ExtInfo<"", "", [HasStdExtZfhmin, HasStdExtD], ?, ?, FPR32, FPR64, FPR16>; def ZhinxExt : ExtInfo<"_INX", "RVZfinx", [HasStdExtZhinx], f16, FPR16INX, FPR32INX, ?, FPR16INX>; def ZhinxminExt : ExtInfo<"_INX", "RVZfinx", - [HasStdExtZhinxOrZhinxmin], + [HasStdExtZhinxmin], f16, FPR16INX, FPR32INX, ?, FPR16INX>; def ZhinxZdinxExt : ExtInfo<"_INX", "RVZfinx", [HasStdExtZhinx, HasStdExtZdinx, IsRV64], ?, ?, FPR32INX, FPR64INX, FPR16INX>; def ZhinxminZdinxExt : ExtInfo<"_INX", "RVZfinx", - [HasStdExtZhinxOrZhinxmin, HasStdExtZdinx, IsRV64], + [HasStdExtZhinxmin, HasStdExtZdinx, IsRV64], ?, ?, FPR32INX, FPR64INX, FPR16INX>; def ZhinxZdinx32Ext : ExtInfo<"_IN32X", "RV32Zdinx", [HasStdExtZhinx, HasStdExtZdinx, IsRV32], ?, ?, FPR32INX, FPR64IN32X, FPR16INX >; def ZhinxminZdinx32Ext : ExtInfo<"_IN32X", "RV32Zdinx", - [HasStdExtZhinxOrZhinxmin, HasStdExtZdinx, IsRV32], + [HasStdExtZhinxmin, HasStdExtZdinx, IsRV32], ?, ?, FPR32INX, FPR64IN32X, FPR16INX>; defvar ZfhExts = [ZfhExt, ZhinxExt]; @@ -200,10 +200,10 @@ foreach Ext = ZfhminDExts in { // Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20) //===----------------------------------------------------------------------===// -let Predicates = [HasStdExtZfhOrZfhmin] in { +let Predicates = [HasStdExtZfhmin] in { def : InstAlias<"flh $rd, (${rs1})", (FLH FPR16:$rd, GPR:$rs1, 0), 0>; def : InstAlias<"fsh $rs2, (${rs1})", (FSH FPR16:$rs2, GPR:$rs1, 0), 0>; -} // Predicates = [HasStdExtZfhOrZfhmin] +} // Predicates = [HasStdExtZfhmin] let Predicates = [HasStdExtZfh] in { def : InstAlias<"fmv.h $rd, $rs", (FSGNJ_H FPR16:$rd, FPR16:$rs, FPR16:$rs)>; @@ -223,10 +223,10 @@ def PseudoQuietFLT_H : PseudoQuietFCMP; } } // Predicates = [HasStdExtZfh] -let Predicates = [HasStdExtZfhOrZfhmin] 
in { +let Predicates = [HasStdExtZfhmin] in { def PseudoFLH : PseudoFloatLoad<"flh", FPR16>; def PseudoFSH : PseudoStore<"fsh", FPR16>; -} // Predicates = [HasStdExtZfhOrZfhmin] +} // Predicates = [HasStdExtZfhmin] let Predicates = [HasStdExtZhinx] in { def : InstAlias<"fmv.h $rd, $rs", (FSGNJ_H_INX FPR16INX:$rd, FPR16INX:$rs, FPR16INX:$rs)>; @@ -242,7 +242,7 @@ let usesCustomInserter = 1 in { def PseudoQuietFLE_H_INX : PseudoQuietFCMP; def PseudoQuietFLT_H_INX : PseudoQuietFCMP; } -} // Predicates = [HasStdExtZhinxOrZhinxmin] +} // Predicates = [HasStdExtZhinxmin] //===----------------------------------------------------------------------===// // Pseudo-instructions and codegen patterns @@ -412,15 +412,15 @@ defm Select_FPR16INX : SelectCC_GPR_rrirr; def PseudoFROUND_H_INX : PseudoFROUND; } // Predicates = [HasStdExtZhinx] -let Predicates = [HasStdExtZfhOrZfhmin] in { +let Predicates = [HasStdExtZfhmin] in { /// Loads def : LdPat; /// Stores def : StPat; -} // Predicates = [HasStdExtZfhOrZfhmin] +} // Predicates = [HasStdExtZfhmin] -let Predicates = [HasStdExtZhinxOrZhinxmin] in { +let Predicates = [HasStdExtZhinxmin] in { /// Loads def : Pat<(f16 (load (AddrRegImm (XLenVT GPR:$rs1), simm12:$imm12))), (COPY_TO_REGCLASS (LH GPR:$rs1, simm12:$imm12), GPRF16)>; @@ -429,9 +429,9 @@ def : Pat<(f16 (load (AddrRegImm (XLenVT GPR:$rs1), simm12:$imm12))), def : Pat<(store (f16 FPR16INX:$rs2), (AddrRegImm (XLenVT GPR:$rs1), simm12:$imm12)), (SH (COPY_TO_REGCLASS FPR16INX:$rs2, GPR), GPR:$rs1, simm12:$imm12)>; -} // Predicates = [HasStdExtZhinxOrZhinxmin] +} // Predicates = [HasStdExtZhinxmin] -let Predicates = [HasStdExtZfhOrZfhmin] in { +let Predicates = [HasStdExtZfhmin] in { /// Float conversion operations // f32 -> f16, f16 -> f32 @@ -444,9 +444,9 @@ def : Pat<(riscv_fmv_x_anyexth (f16 FPR16:$src)), (FMV_X_H FPR16:$src)>; def : Pat<(riscv_fmv_x_signexth (f16 FPR16:$src)), (FMV_X_H FPR16:$src)>; def : Pat<(fcopysign FPR32:$rs1, (f16 FPR16:$rs2)), (FSGNJ_S $rs1, (FCVT_S_H $rs2, FRM_RNE))>; -} // Predicates = [HasStdExtZfhOrZfhmin] +} // Predicates = [HasStdExtZfhmin] -let Predicates = [HasStdExtZhinxOrZhinxmin] in { +let Predicates = [HasStdExtZhinxmin] in { /// Float conversion operations // f32 -> f16, f16 -> f32 @@ -459,7 +459,7 @@ def : Pat<(riscv_fmv_x_anyexth FPR16INX:$src), (COPY_TO_REGCLASS FPR16INX:$src, def : Pat<(riscv_fmv_x_signexth FPR16INX:$src), (COPY_TO_REGCLASS FPR16INX:$src, GPR)>; def : Pat<(fcopysign FPR32INX:$rs1, FPR16INX:$rs2), (FSGNJ_S_INX $rs1, (FCVT_S_H_INX $rs2, FRM_RNE))>; -} // Predicates = [HasStdExtZhinxOrZhinxmin] +} // Predicates = [HasStdExtZhinxmin] let Predicates = [HasStdExtZfh] in { // half->[u]int. Round-to-zero must be used. 
@@ -561,7 +561,7 @@ def : Pat<(any_sint_to_fp (i64 GPR:$rs1)), (FCVT_H_L_INX $rs1, FRM_DYN)>; def : Pat<(any_uint_to_fp (i64 GPR:$rs1)), (FCVT_H_LU_INX $rs1, FRM_DYN)>; } // Predicates = [HasStdExtZhinx, IsRV64] -let Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] in { +let Predicates = [HasStdExtZfhmin, HasStdExtD] in { /// Float conversion operations // f64 -> f16, f16 -> f64 def : Pat<(f16 (any_fpround FPR64:$rs1)), (FCVT_H_D FPR64:$rs1, FRM_DYN)>; @@ -571,9 +571,9 @@ def : Pat<(any_fpextend (f16 FPR16:$rs1)), (FCVT_D_H FPR16:$rs1, FRM_RNE)>; def : Pat<(f16 (fcopysign FPR16:$rs1, FPR64:$rs2)), (FSGNJ_H $rs1, (FCVT_H_D $rs2, FRM_DYN))>; def : Pat<(fcopysign FPR64:$rs1, (f16 FPR16:$rs2)), (FSGNJ_D $rs1, (FCVT_D_H $rs2, FRM_RNE))>; -} // Predicates = [HasStdExtZfhOrZfhmin, HasStdExtD] +} // Predicates = [HasStdExtZfhmin, HasStdExtD] -let Predicates = [HasStdExtZhinxOrZhinxmin, HasStdExtZdinx, IsRV32] in { +let Predicates = [HasStdExtZhinxmin, HasStdExtZdinx, IsRV32] in { /// Float conversion operations // f64 -> f16, f16 -> f64 def : Pat<(any_fpround FPR64IN32X:$rs1), (FCVT_H_D_IN32X FPR64IN32X:$rs1, FRM_DYN)>; @@ -583,9 +583,9 @@ def : Pat<(any_fpextend FPR16INX:$rs1), (FCVT_D_H_IN32X FPR16INX:$rs1, FRM_RNE)> def : Pat<(fcopysign FPR16INX:$rs1, FPR64IN32X:$rs2), (FSGNJ_H_INX $rs1, (FCVT_H_D_IN32X $rs2, 0b111))>; def : Pat<(fcopysign FPR64IN32X:$rs1, FPR16INX:$rs2), (FSGNJ_D_IN32X $rs1, (FCVT_D_H_IN32X $rs2, FRM_RNE))>; -} // Predicates = [HasStdExtZhinxOrZhinxmin, HasStdExtZdinx, IsRV32] +} // Predicates = [HasStdExtZhinxmin, HasStdExtZdinx, IsRV32] -let Predicates = [HasStdExtZhinxOrZhinxmin, HasStdExtZdinx, IsRV64] in { +let Predicates = [HasStdExtZhinxmin, HasStdExtZdinx, IsRV64] in { /// Float conversion operations // f64 -> f16, f16 -> f64 def : Pat<(any_fpround FPR64INX:$rs1), (FCVT_H_D_INX FPR64INX:$rs1, FRM_DYN)>; @@ -595,7 +595,7 @@ def : Pat<(any_fpextend FPR16INX:$rs1), (FCVT_D_H_INX FPR16INX:$rs1, FRM_RNE)>; def : Pat<(fcopysign FPR16INX:$rs1, FPR64INX:$rs2), (FSGNJ_H_INX $rs1, (FCVT_H_D_INX $rs2, 0b111))>; def : Pat<(fcopysign FPR64INX:$rs1, FPR16INX:$rs2), (FSGNJ_D_INX $rs1, (FCVT_D_H_INX $rs2, FRM_RNE))>; -} // Predicates = [HasStdExtZhinxOrZhinxmin, HasStdExtZdinx, IsRV64] +} // Predicates = [HasStdExtZhinxmin, HasStdExtZdinx, IsRV64] let Predicates = [HasStdExtZfhmin, NoStdExtZfh] in { // half->[u]int. Round-to-zero must be used. From 4cb90ca8f8bfbe8dc938a1b8b821d98640cbab4c Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 22 Jan 2024 13:16:31 -0800 Subject: [PATCH 485/843] [Thumb,ELF] Fix access to dso_preemptable __stack_chk_guard with static relocation model (#78950) PR #70014 fixes A32 to use GOT for dso_preemptable `__stack_chk_guard` with static relocation model (e.g. -fPIE/-fPIC LTO compiles with -no-pie linking). This patch fixes such `__stack_chk_guard` access for Thumb1 and Thumb2. Note: `t2LDRLIT_ga_pcrel` is only for ELF. mingw needs `.refptr.__stack_chk_guard` (https://reviews.llvm.org/D92738). 
Fix #64999 --- llvm/lib/Target/ARM/Thumb1InstrInfo.cpp | 4 ++-- llvm/lib/Target/ARM/Thumb2InstrInfo.cpp | 6 ++--- llvm/test/CodeGen/ARM/stack-guard-elf.ll | 30 +++++++++++++++++++----- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp index e2f3fad200790..85eabdb17ad19 100644 --- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -135,14 +135,14 @@ void Thumb1InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, void Thumb1InstrInfo::expandLoadStackGuard( MachineBasicBlock::iterator MI) const { MachineFunction &MF = *MI->getParent()->getParent(); - const TargetMachine &TM = MF.getTarget(); const ARMSubtarget &ST = MF.getSubtarget(); + const auto *GV = cast((*MI->memoperands_begin())->getValue()); assert(MF.getFunction().getParent()->getStackProtectorGuard() != "tls" && "TLS stack protector not supported for Thumb1 targets"); unsigned Instr; - if (TM.isPositionIndependent()) + if (!GV->isDSOLocal()) Instr = ARM::tLDRLIT_ga_pcrel; else if (ST.genExecuteOnly() && ST.hasV8MBaselineOps()) Instr = ARM::t2MOVi32imm; diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp index 2ea0eaa0aad8f..083f25f49dec4 100644 --- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -261,10 +261,8 @@ void Thumb2InstrInfo::expandLoadStackGuard( return; } - const GlobalValue *GV = - cast((*MI->memoperands_begin())->getValue()); - - if (MF.getSubtarget().isGVInGOT(GV)) + const auto *GV = cast((*MI->memoperands_begin())->getValue()); + if (MF.getSubtarget().isTargetELF() && !GV->isDSOLocal()) expandLoadStackGuardBase(MI, ARM::t2LDRLIT_ga_pcrel, ARM::t2LDRi12); else if (MF.getTarget().isPositionIndependent()) expandLoadStackGuardBase(MI, ARM::t2MOV_ga_pcrel, ARM::t2LDRi12); diff --git a/llvm/test/CodeGen/ARM/stack-guard-elf.ll b/llvm/test/CodeGen/ARM/stack-guard-elf.ll index 250f2ad9ed109..d0e5db7e5711b 100644 --- a/llvm/test/CodeGen/ARM/stack-guard-elf.ll +++ b/llvm/test/CodeGen/ARM/stack-guard-elf.ll @@ -59,6 +59,8 @@ define i32 @test1() #0 { ; THUMB1-NEXT: .pad #16 ; THUMB1-NEXT: sub sp, #16 ; THUMB1-NEXT: ldr r0, .LCPI0_0 +; THUMB1-NEXT: .LPC0_0: +; THUMB1-NEXT: add r0, pc ; THUMB1-NEXT: ldr r0, [r0] ; THUMB1-NEXT: ldr r0, [r0] ; THUMB1-NEXT: add r1, sp, #904 @@ -67,7 +69,9 @@ define i32 @test1() #0 { ; THUMB1-NEXT: bl foo ; THUMB1-NEXT: add r0, sp, #904 ; THUMB1-NEXT: ldr r0, [r0, #124] -; THUMB1-NEXT: ldr r1, .LCPI0_0 +; THUMB1-NEXT: ldr r1, .LCPI0_1 +; THUMB1-NEXT: .LPC0_1: +; THUMB1-NEXT: add r1, pc ; THUMB1-NEXT: ldr r1, [r1] ; THUMB1-NEXT: ldr r1, [r1] ; THUMB1-NEXT: cmp r1, r0 @@ -83,7 +87,11 @@ define i32 @test1() #0 { ; THUMB1-NEXT: .p2align 2 ; THUMB1-NEXT: @ %bb.3: ; THUMB1-NEXT: .LCPI0_0: -; THUMB1-NEXT: .long __stack_chk_guard +; THUMB1-NEXT: .Ltmp0: +; THUMB1-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_0+4)-.Ltmp0) +; THUMB1-NEXT: .LCPI0_1: +; THUMB1-NEXT: .Ltmp1: +; THUMB1-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_1+4)-.Ltmp1) ; ; THUMB1-PIC-LABEL: test1: ; THUMB1-PIC: @ %bb.0: @@ -136,16 +144,18 @@ define i32 @test1() #0 { ; THUMB2-NEXT: push {r7, lr} ; THUMB2-NEXT: .pad #1032 ; THUMB2-NEXT: sub.w sp, sp, #1032 -; THUMB2-NEXT: movw r0, :lower16:__stack_chk_guard -; THUMB2-NEXT: movt r0, :upper16:__stack_chk_guard +; THUMB2-NEXT: ldr r0, .LCPI0_0 +; THUMB2-NEXT: .LPC0_0: +; THUMB2-NEXT: add r0, pc ; THUMB2-NEXT: ldr r0, [r0] ; THUMB2-NEXT: ldr r0, [r0] ; THUMB2-NEXT: str.w r0, 
[sp, #1028] ; THUMB2-NEXT: add r0, sp, #4 ; THUMB2-NEXT: bl foo -; THUMB2-NEXT: movw r1, :lower16:__stack_chk_guard ; THUMB2-NEXT: ldr.w r0, [sp, #1028] -; THUMB2-NEXT: movt r1, :upper16:__stack_chk_guard +; THUMB2-NEXT: ldr r1, .LCPI0_1 +; THUMB2-NEXT: .LPC0_1: +; THUMB2-NEXT: add r1, pc ; THUMB2-NEXT: ldr r1, [r1] ; THUMB2-NEXT: ldr r1, [r1] ; THUMB2-NEXT: cmp r1, r0 @@ -155,6 +165,14 @@ define i32 @test1() #0 { ; THUMB2-NEXT: popeq {r7, pc} ; THUMB2-NEXT: .LBB0_1: ; THUMB2-NEXT: bl __stack_chk_fail +; THUMB2-NEXT: .p2align 2 +; THUMB2-NEXT: @ %bb.2: +; THUMB2-NEXT: .LCPI0_0: +; THUMB2-NEXT: .Ltmp0: +; THUMB2-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_0+4)-.Ltmp0) +; THUMB2-NEXT: .LCPI0_1: +; THUMB2-NEXT: .Ltmp1: +; THUMB2-NEXT: .long __stack_chk_guard(GOT_PREL)-((.LPC0_1+4)-.Ltmp1) ; ; THUMB2-PIC-LABEL: test1: ; THUMB2-PIC: @ %bb.0: From 181c4c331a0c661fe90d5e3803049232022aa582 Mon Sep 17 00:00:00 2001 From: Jan Patrick Lehr Date: Mon, 22 Jan 2024 22:21:31 +0100 Subject: [PATCH 486/843] [OpenMP][Fix] Require USM capability in force-usm test (#79059) This should fix the AMDGPU buildbot breakage from #76571 --- openmp/libomptarget/test/offloading/force-usm.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/openmp/libomptarget/test/offloading/force-usm.cpp b/openmp/libomptarget/test/offloading/force-usm.cpp index 5bddecd5b4675..a043ba47f54ad 100644 --- a/openmp/libomptarget/test/offloading/force-usm.cpp +++ b/openmp/libomptarget/test/offloading/force-usm.cpp @@ -6,6 +6,8 @@ // RUN: env HSA_XNACK=1 LIBOMPTARGET_INFO=32 \ // RUN: %libomptarget-run-generic 2>&1 | %fcheck-generic --check-prefix=FORCE-USM // +// REQUIRES: unified_shared_memory +// // UNSUPPORTED: nvptx64-nvidia-cuda // UNSUPPORTED: nvptx64-nvidia-cuda-LTO // clang-format on From 95c1039eca1fd32bd7f48db2e82c4294564d75d7 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 22 Jan 2024 13:26:18 -0800 Subject: [PATCH 487/843] [RISCV] Add TuneNoDefaultUnroll to sifive-p450. --- llvm/lib/Target/RISCV/RISCVProcessors.td | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index c8cf3644b21fb..db60a5a1dab00 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -239,7 +239,8 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, FeatureStdExtZbb, FeatureStdExtZbs, FeatureStdExtZfhmin], - [TuneConditionalCompressedMoveFusion, + [TuneNoDefaultUnroll, + TuneConditionalCompressedMoveFusion, TuneLUIADDIFusion, TuneAUIPCADDIFusion]>; From 8789b7e5559c0c419d5247f60866e29ab3b651a8 Mon Sep 17 00:00:00 2001 From: Alan Phipps Date: Mon, 22 Jan 2024 14:27:16 -0600 Subject: [PATCH 488/843] [clang][NFC] Update top-level Code Coverage documentation to include MC/DC. --- clang/docs/SourceBasedCodeCoverage.rst | 52 +++++++++++++++++++++++--- llvm/docs/CoverageMappingFormat.rst | 13 +++++++ 2 files changed, 60 insertions(+), 5 deletions(-) diff --git a/clang/docs/SourceBasedCodeCoverage.rst b/clang/docs/SourceBasedCodeCoverage.rst index 2be6fa3d717a4..cee706289284d 100644 --- a/clang/docs/SourceBasedCodeCoverage.rst +++ b/clang/docs/SourceBasedCodeCoverage.rst @@ -64,6 +64,11 @@ To compile code with coverage enabled, pass ``-fprofile-instr-generate Note that linking together code with and without coverage instrumentation is supported. Uninstrumented code simply won't be accounted for in reports. 
+To compile code with Modified Condition/Decision Coverage (MC/DC) enabled, +pass ``-fcoverage-mcdc`` in addition to the clang options specified above. +MC/DC is an advanced form of code coverage most applicable in the embedded +space. + Running the instrumented program ================================ @@ -211,6 +216,10 @@ region counts: | 4| 1|} ------------------ +If the application was instrumented for Modified Condition/Decision Coverage +(MC/DC) using the clang option ``-fcoverage-mcdc``, an MC/DC subview can be +enabled using ``--show-mcdc`` that will show detailed MC/DC information for +each complex condition boolean expression containing at most six conditions. To generate a file-level summary of coverage statistics instead of a line-oriented report, try: @@ -259,7 +268,7 @@ the exported data at a high level in the llvm-cov source code. Interpreting reports ==================== -There are five statistics tracked in a coverage summary: +There are six statistics tracked in a coverage summary: * Function coverage is the percentage of functions which have been executed at least once. A function is considered to be executed if any of its @@ -288,10 +297,28 @@ There are five statistics tracked in a coverage summary: that is comprised of two individual conditions, each of which evaluates to either true or false, producing four total branch outcomes. -Of these five statistics, function coverage is usually the least granular while -branch coverage is the most granular. 100% branch coverage for a function -implies 100% region coverage for a function. The project-wide totals for each -statistic are listed in the summary. +* Modified Condition/Decision Coverage (MC/DC) is the percentage of individual + branch conditions that have been shown to independently affect the decision + outcome of the boolean expression they comprise. This is accomplished using + the analysis of executed control flow through the expression (i.e. test + vectors) to show that as a condition's outcome is varied between "true" and + "false", the decision's outcome also varies between "true" and "false", while + the outcome of all other conditions is held fixed (or they are masked out as + unevaluatable, as happens in languages whose logical operators have + short-circuit semantics). MC/DC builds on top of branch coverage and + requires that all code blocks and all execution paths have been tested. This + statistic is hidden by default in reports, but it can be enabled via the + ``-show-mcdc-summary`` option as long as code was also compiled using the + clang option ``-fcoverage-mcdc``. + + * Boolean expressions that are only comprised of one condition (and therefore + have no logical operators) are not included in MC/DC analysis and are + trivially deducible using branch coverage. + +Of these six statistics, function coverage is usually the least granular while +branch coverage (with MC/DC) is the most granular. 100% branch coverage for a +function implies 100% region coverage for a function. The project-wide totals +for each statistic are listed in the summary. Format compatibility guarantees =============================== @@ -453,6 +480,21 @@ Branch coverage is tied directly to branch-generating conditions in the source code. Users should not see hidden branches that aren't actually tied to the source code.
+MC/DC Instrumentation +--------------------- + +When instrumenting for Modified Condition/Decision Coverage (MC/DC) using the +clang option ``-fcoverage-mcdc``, users are limited to at most **six** leaf-level +conditions in a boolean expression. A warning will be generated for boolean +expressions that contain more than six, and they will not be instrumented for +MC/DC. + +Also, if a boolean expression is embedded in the nest of another boolean +expression but separated by a non-logical operator, this is also not supported. +For example, in ``x = (a && b && c && func(d && f))``, the ``d && f`` case +starts a new boolean expression that is separated from the other conditions by +the operator ``func()``. When this is encountered, a warning will be generated +and the boolean expression will not be instrumented. Switch statements ----------------- diff --git a/llvm/docs/CoverageMappingFormat.rst b/llvm/docs/CoverageMappingFormat.rst index c5cd1f795aab1..f2ae8df5ad7f8 100644 --- a/llvm/docs/CoverageMappingFormat.rst +++ b/llvm/docs/CoverageMappingFormat.rst @@ -146,6 +146,19 @@ There are several kinds of mapping regions: } ` +* Decision regions associate multiple branch regions with a boolean + expression in the source code. This information also includes the number of + bitmap bytes needed to represent the expression's executed test vectors as + well as the total number of instrumentable branch conditions that comprise + the expression. Decision regions are used to visualize Modified + Condition/Decision Coverage (MC/DC) in *llvm-cov* for each boolean + expression. When decision regions are used, control flow IDs are assigned to + each associated branch region. One ID represents the current branch + condition, and two additional IDs represent the next branch condition in the + control flow given a true or false evaluation, respectively. This allows + *llvm-cov* to reconstruct the control flow around the conditions in order to + comprehend the full list of potential executable test vectors. + .. _source code range: Source Range: From c532033e0fe7aa7f88127fcee72f67e1af399136 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Mon, 22 Jan 2024 16:40:02 -0500 Subject: [PATCH 489/843] [NFC][Clang] Fix compile warning caused by #78330 --- clang/lib/Driver/ToolChains/Clang.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 2f33943de45c5..bcba7cbbdb58c 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -2726,7 +2726,7 @@ RenderComplexRangeOption(LangOptions::ComplexRangeKind Range) { ComplexRangeStr += "fortran"; break; default: - assert("Unexpected range option"); + assert(0 && "Unexpected range option"); } return ComplexRangeStr; } From cd6ed95e48354d14e59871f3c3f3a8665def22c8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Mon, 22 Jan 2024 22:23:26 +0100 Subject: [PATCH 490/843] Reland "[lli] Revisit Orc debug output tests (#79055)" Integrate in-memory debug-info dumps into the `--orc-lazy-debug` command-line option instead of exposing built-in functions to be called from JITed code. This reduces overall amount of code (removing `ExecutionUtils.cpp`) and seems cleaner anyway. All existing items of `OrcDumpKind` work on IR level and run in the IR-transform step of the JIT. The newly added `DumpDebugDescriptor` and `DumpDebugObjects` must run after debug-registration and thus are deferred to the Object-transform step of the JIT. 
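For illustration, the relanded dump modes can be exercised straight from a shell; `input.ll` is a placeholder, and both invocations mirror the RUN lines of the updated tests:

    $ lli --jit-linker=jitlink --orc-lazy-debug=jit-debug-descriptor input.ll
    # Prints "jit_debug_descriptor 0x0000000000000000" once at startup, then a
    # non-null descriptor address after the module's debug object is registered.

    $ lli --jit-linker=rtdyld --orc-lazy-debug=jit-debug-objects input.ll | llvm-dwarfdump --diff -
    # The in-memory debug objects are written as tool output, so the emitted
    # DWARF can be inspected with llvm-dwarfdump.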
This separation is the major side-effect of the patch. The original commit 263efb044add93b9 was reverted in #79055, because the gcc 7.5 bot had found a missing std::move(). --- ...tor-elf-minimal.ll => debug-descriptor.ll} | 30 ++-- .../OrcLazy/debug-objects-elf-minimal.ll | 82 ----------- .../ExecutionEngine/OrcLazy/debug-objects.ll | 62 ++++++++ llvm/tools/lli/CMakeLists.txt | 1 - llvm/tools/lli/ExecutionUtils.cpp | 122 ---------------- llvm/tools/lli/ExecutionUtils.h | 60 -------- llvm/tools/lli/lli.cpp | 135 +++++++++++++----- 7 files changed, 170 insertions(+), 322 deletions(-) rename llvm/test/ExecutionEngine/OrcLazy/{debug-descriptor-elf-minimal.ll => debug-descriptor.ll} (53%) delete mode 100644 llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll create mode 100644 llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll delete mode 100644 llvm/tools/lli/ExecutionUtils.cpp delete mode 100644 llvm/tools/lli/ExecutionUtils.h diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor.ll similarity index 53% rename from llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll rename to llvm/test/ExecutionEngine/OrcLazy/debug-descriptor.ll index 9bac3897864f7..bf7f13fb5439f 100644 --- a/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor-elf-minimal.ll +++ b/llvm/test/ExecutionEngine/OrcLazy/debug-descriptor.ll @@ -1,29 +1,17 @@ -; REQUIRES: native && x86_64-linux +; REQUIRES: native && target-x86_64 -; RUN: lli --jit-linker=rtdyld \ -; RUN: --generate=__dump_jit_debug_descriptor %s | FileCheck %s +; RUN: lli --jit-linker=rtdyld --orc-lazy-debug=jit-debug-descriptor %s 2>&1 | FileCheck %s +; RUN: lli --jit-linker=jitlink --orc-lazy-debug=jit-debug-descriptor %s 2>&1 | FileCheck %s ; -; RUN: lli --jit-linker=jitlink \ -; RUN: --generate=__dump_jit_debug_descriptor %s | FileCheck %s +; Initial entry should be empty: +; CHECK: jit_debug_descriptor 0x0000000000000000 ; -; CHECK: Reading __jit_debug_descriptor at 0x{{.*}} -; CHECK: Version: 1 -; CHECK: Action: JIT_REGISTER_FN -; CHECK: Entry Symbol File Size Previous Entry -; CHECK: [ 0] 0x{{.*}} 0x{{.*}} {{.*}} 0x0000000000000000 - -target triple = "x86_64-unknown-unknown-elf" - -; Built-in symbol provided by the JIT -declare void @__dump_jit_debug_descriptor(ptr) - -; Host-process symbol from the GDB JIT interface -@__jit_debug_descriptor = external global i8, align 1 +; After adding the module it must not be empty anymore: +; CHECK: jit_debug_descriptor 0x +; CHECK-NOT: 000000000000000 +; CHECK-SAME: {{[048c]}} define i32 @main() !dbg !9 { - %1 = alloca i32, align 4 - store i32 0, ptr %1, align 4 - call void @__dump_jit_debug_descriptor(ptr @__jit_debug_descriptor), !dbg !13 ret i32 0, !dbg !14 } diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll deleted file mode 100644 index 31fe730b74036..0000000000000 --- a/llvm/test/ExecutionEngine/OrcLazy/debug-objects-elf-minimal.ll +++ /dev/null @@ -1,82 +0,0 @@ -; REQUIRES: native && x86_64-linux - -; In-memory debug-object contains some basic DWARF -; -; RUN: lli --jit-linker=rtdyld \ -; RUN: --generate=__dump_jit_debug_objects %s | llvm-dwarfdump --diff - | FileCheck %s -; -; RUN: lli --jit-linker=jitlink \ -; RUN: --generate=__dump_jit_debug_objects %s | llvm-dwarfdump --diff - | FileCheck %s -; -; CHECK: -: file format elf64-x86-64 -; CHECK: .debug_info contents: -; CHECK: 0x00000000: Compile Unit: length = 0x00000047, 
format = DWARF32, version = 0x0004, abbr_offset = 0x0000, addr_size = 0x08 (next unit at 0x0000004b) -; CHECK: DW_TAG_compile_unit -; CHECK: DW_AT_producer ("compiler version") -; CHECK: DW_AT_language (DW_LANG_C99) -; CHECK: DW_AT_name ("source-file.c") -; CHECK: DW_AT_stmt_list () -; CHECK: DW_AT_comp_dir ("/workspace") -; CHECK: DW_AT_low_pc () -; CHECK: DW_AT_high_pc () -; CHECK: DW_TAG_subprogram -; CHECK: DW_AT_low_pc () -; CHECK: DW_AT_high_pc () -; CHECK: DW_AT_frame_base (DW_OP_reg7 RSP) -; CHECK: DW_AT_name ("main") -; CHECK: DW_AT_decl_file ("/workspace/source-file.c") -; CHECK: DW_AT_decl_line (4) -; CHECK: DW_AT_type ("int") -; CHECK: DW_AT_external (true) -; CHECK: DW_TAG_base_type -; CHECK: DW_AT_name ("int") -; CHECK: DW_AT_encoding (DW_ATE_signed) -; CHECK: DW_AT_byte_size (0x04) -; CHECK: NULL - -; Text section of the in-memory debug-object has a non-null load-address -; -; RUN: lli --jit-linker=rtdyld \ -; RUN: --generate=__dump_jit_debug_objects %s | llvm-objdump --section-headers - | \ -; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s -; -; RUN: lli --jit-linker=jitlink \ -; RUN: --generate=__dump_jit_debug_objects %s | llvm-objdump --section-headers - | \ -; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s -; -; CHECK_LOAD_ADDR-NOT: {{[0-9]*}} .ltext {{.*}} 0000000000000000 TEXT - -target triple = "x86_64-unknown-unknown-elf" - -; Built-in symbol provided by the JIT -declare void @__dump_jit_debug_objects(ptr) - -; Host-process symbol from the GDB JIT interface -@__jit_debug_descriptor = external global i8, align 1 - -define i32 @main() !dbg !9 { - %1 = alloca i32, align 4 - store i32 0, ptr %1, align 4 - call void @__dump_jit_debug_objects(ptr @__jit_debug_descriptor), !dbg !13 - ret i32 0, !dbg !14 -} - -!llvm.module.flags = !{!0, !1, !2, !3, !4} -!llvm.dbg.cu = !{!5} -!llvm.ident = !{!8} - -!0 = !{i32 2, !"SDK Version", [3 x i32] [i32 10, i32 15, i32 6]} -!1 = !{i32 7, !"Dwarf Version", i32 4} -!2 = !{i32 2, !"Debug Info Version", i32 3} -!3 = !{i32 1, !"wchar_size", i32 4} -!4 = !{i32 7, !"PIC Level", i32 2} -!5 = distinct !DICompileUnit(language: DW_LANG_C99, file: !6, producer: "compiler version", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !7, nameTableKind: None) -!6 = !DIFile(filename: "source-file.c", directory: "/workspace") -!7 = !{} -!8 = !{!"compiler version"} -!9 = distinct !DISubprogram(name: "main", scope: !6, file: !6, line: 4, type: !10, scopeLine: 4, spFlags: DISPFlagDefinition, unit: !5, retainedNodes: !7) -!10 = !DISubroutineType(types: !11) -!11 = !{!12} -!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!13 = !DILocation(line: 5, column: 3, scope: !9) -!14 = !DILocation(line: 6, column: 3, scope: !9) diff --git a/llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll b/llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll new file mode 100644 index 0000000000000..1f48bd9a2318a --- /dev/null +++ b/llvm/test/ExecutionEngine/OrcLazy/debug-objects.ll @@ -0,0 +1,62 @@ +; REQUIRES: native && x86_64-linux + +; In-memory debug-objects contain DWARF +; +; RUN: lli --jit-linker=rtdyld --orc-lazy-debug=jit-debug-objects %s | llvm-dwarfdump --diff - | FileCheck %s +; RUN: lli --jit-linker=jitlink --orc-lazy-debug=jit-debug-objects %s | llvm-dwarfdump --diff - | FileCheck %s +; +; CHECK: -: file format elf64-x86-64 +; TODO: Synthesized Mach-O objects error out with: +; truncated or malformed object (offset field of section 8 in +; LC_SEGMENT_64 command 0 extends past the end of the file) +; +; CHECK: .debug_info 
contents: +; CHECK: format = DWARF32 +; CHECK: DW_TAG_compile_unit +; CHECK: DW_AT_producer ("clang version 18.0.0git") +; CHECK: DW_AT_language (DW_LANG_C11) +; CHECK: DW_AT_name ("source-file.c") +; CHECK: DW_AT_comp_dir ("/workspace") +; CHECK: DW_TAG_subprogram +; CHECK: DW_AT_frame_base (DW_OP_reg7 RSP) +; CHECK: DW_AT_name ("main") +; CHECK: DW_AT_decl_file ("/workspace/source-file.c") +; CHECK: DW_AT_decl_line (1) +; CHECK: DW_AT_type ("int") +; CHECK: DW_AT_external (true) +; CHECK: DW_TAG_base_type +; CHECK: DW_AT_name ("int") +; CHECK: DW_AT_encoding (DW_ATE_signed) +; CHECK: DW_AT_byte_size (0x04) +; CHECK: NULL + +; Text section of the in-memory debug-objects have non-null load-address +; +; RUN: lli --jit-linker=rtdyld --orc-lazy-debug=jit-debug-objects %s | \ +; RUN: llvm-objdump --section-headers - | \ +; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s +; RUN: lli --jit-linker=jitlink --orc-lazy-debug=jit-debug-objects %s | \ +; RUN: llvm-objdump --section-headers - | \ +; RUN: FileCheck --check-prefix=CHECK_LOAD_ADDR %s +; +; CHECK_LOAD_ADDR: .text +; CHECK_LOAD_ADDR-NOT: 0000000000000000 +; CHECK_LOAD_ADDR-SAME: TEXT + +define i32 @main() !dbg !3 { +entry: + ret i32 0, !dbg !8 +} + +!llvm.module.flags = !{!0} +!llvm.dbg.cu = !{!2} + +!0 = !{i32 2, !"Debug Info Version", i32 3} +!1 = !DIFile(filename: "source-file.c", directory: "/workspace") +!2 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 18.0.0git", emissionKind: FullDebug) +!3 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 1, type: !4, scopeLine: 1, unit: !2, retainedNodes: !7) +!4 = !DISubroutineType(types: !5) +!5 = !{!6} +!6 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!7 = !{} +!8 = !DILocation(line: 1, column: 14, scope: !3) diff --git a/llvm/tools/lli/CMakeLists.txt b/llvm/tools/lli/CMakeLists.txt index 315de28e12b57..e3fca225a2275 100644 --- a/llvm/tools/lli/CMakeLists.txt +++ b/llvm/tools/lli/CMakeLists.txt @@ -53,7 +53,6 @@ endif( LLVM_USE_PERF ) add_llvm_tool(lli lli.cpp - ExecutionUtils.cpp DEPENDS intrinsics_gen diff --git a/llvm/tools/lli/ExecutionUtils.cpp b/llvm/tools/lli/ExecutionUtils.cpp deleted file mode 100644 index b6cc3bb174d3c..0000000000000 --- a/llvm/tools/lli/ExecutionUtils.cpp +++ /dev/null @@ -1,122 +0,0 @@ -//===---- ExecutionUtils.cpp - Utilities for executing functions in lli ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "ExecutionUtils.h" - -#include "llvm/ExecutionEngine/Orc/TargetProcess/JITLoaderGDB.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/FormatVariadic.h" -#include "llvm/Support/raw_ostream.h" - -#include -#include - -namespace llvm { - -template static void outsv(const char *Fmt, Ts &&...Vals) { - outs() << formatv(Fmt, Vals...); -} - -static const char *actionFlagToStr(uint32_t ActionFlag) { - switch (ActionFlag) { - case JIT_NOACTION: - return "JIT_NOACTION"; - case JIT_REGISTER_FN: - return "JIT_REGISTER_FN"; - case JIT_UNREGISTER_FN: - return "JIT_UNREGISTER_FN"; - } - return ""; -} - -// Declarations follow the GDB JIT interface (version 1, 2009) and must match -// those of the DYLD used for testing. 
-// -// Sample output: -// -// Reading __jit_debug_descriptor at 0x0000000000404048 -// -// Version: 0 -// Action: JIT_REGISTER_FN -// -// Entry Symbol File Size Previous Entry -// [ 0] 0x0000000000451290 0x0000000000002000 200 0x0000000000000000 -// [ 1] 0x0000000000451260 0x0000000000001000 100 0x0000000000451290 -// ... -// -static void dumpDebugDescriptor(void *Addr) { - outsv("Reading __jit_debug_descriptor at {0}\n\n", Addr); - - jit_descriptor *Descriptor = reinterpret_cast(Addr); - outsv("Version: {0}\n", Descriptor->version); - outsv("Action: {0}\n\n", actionFlagToStr(Descriptor->action_flag)); - outsv("{0,11} {1,24} {2,15} {3,14}\n", "Entry", "Symbol File", "Size", - "Previous Entry"); - - unsigned Idx = 0; - for (auto *Entry = Descriptor->first_entry; Entry; Entry = Entry->next_entry) - outsv("[{0,2}] {1:X16} {2:X16} {3,8:D} {4}\n", Idx++, Entry, - reinterpret_cast(Entry->symfile_addr), - Entry->symfile_size, Entry->prev_entry); -} - -static LLIBuiltinFunctionGenerator *Generator = nullptr; - -static void dumpDebugObjects(void *Addr) { - jit_descriptor *Descriptor = reinterpret_cast(Addr); - for (auto *Entry = Descriptor->first_entry; Entry; Entry = Entry->next_entry) - Generator->appendDebugObject(Entry->symfile_addr, Entry->symfile_size); -} - -LLIBuiltinFunctionGenerator::LLIBuiltinFunctionGenerator( - std::vector Enabled, orc::MangleAndInterner &Mangle) - : TestOut(nullptr) { - Generator = this; - for (BuiltinFunctionKind F : Enabled) { - switch (F) { - case BuiltinFunctionKind::DumpDebugDescriptor: - expose(Mangle("__dump_jit_debug_descriptor"), &dumpDebugDescriptor); - break; - case BuiltinFunctionKind::DumpDebugObjects: - expose(Mangle("__dump_jit_debug_objects"), &dumpDebugObjects); - TestOut = createToolOutput(); - break; - } - } -} - -Error LLIBuiltinFunctionGenerator::tryToGenerate( - orc::LookupState &LS, orc::LookupKind K, orc::JITDylib &JD, - orc::JITDylibLookupFlags JDLookupFlags, - const orc::SymbolLookupSet &Symbols) { - orc::SymbolMap NewSymbols; - for (const auto &NameFlags : Symbols) { - auto It = BuiltinFunctions.find(NameFlags.first); - if (It != BuiltinFunctions.end()) - NewSymbols.insert(*It); - } - - if (NewSymbols.empty()) - return Error::success(); - - return JD.define(absoluteSymbols(std::move(NewSymbols))); -} - -// static -std::unique_ptr -LLIBuiltinFunctionGenerator::createToolOutput() { - std::error_code EC; - auto TestOut = std::make_unique("-", EC, sys::fs::OF_None); - if (EC) { - errs() << "Error creating tool output file: " << EC.message() << '\n'; - exit(1); - } - return TestOut; -} - -} // namespace llvm diff --git a/llvm/tools/lli/ExecutionUtils.h b/llvm/tools/lli/ExecutionUtils.h deleted file mode 100644 index 6bf9cd58e031b..0000000000000 --- a/llvm/tools/lli/ExecutionUtils.h +++ /dev/null @@ -1,60 +0,0 @@ -//===- ExecutionUtils.h - Utilities for executing code in lli ---*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Contains utilities for executing code in lli. 
-// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_TOOLS_LLI_EXECUTIONUTILS_H -#define LLVM_TOOLS_LLI_EXECUTIONUTILS_H - -#include "llvm/ExecutionEngine/JITSymbol.h" -#include "llvm/ExecutionEngine/Orc/Core.h" -#include "llvm/ExecutionEngine/Orc/Mangling.h" -#include "llvm/Support/Error.h" -#include "llvm/Support/ToolOutputFile.h" - -#include -#include - -namespace llvm { - -enum class BuiltinFunctionKind { - DumpDebugDescriptor, - DumpDebugObjects, -}; - -// Utility class to expose symbols for special-purpose functions to the JIT. -class LLIBuiltinFunctionGenerator : public orc::DefinitionGenerator { -public: - LLIBuiltinFunctionGenerator(std::vector Enabled, - orc::MangleAndInterner &Mangle); - - Error tryToGenerate(orc::LookupState &LS, orc::LookupKind K, - orc::JITDylib &JD, orc::JITDylibLookupFlags JDLookupFlags, - const orc::SymbolLookupSet &Symbols) override; - - void appendDebugObject(const char *Addr, size_t Size) { - TestOut->os().write(Addr, Size); - } - -private: - orc::SymbolMap BuiltinFunctions; - std::unique_ptr TestOut; - - template void expose(orc::SymbolStringPtr Name, T *Handler) { - BuiltinFunctions[Name] = {orc::ExecutorAddr::fromPtr(Handler), - JITSymbolFlags::Exported}; - } - - static std::unique_ptr createToolOutput(); -}; - -} // end namespace llvm - -#endif // LLVM_TOOLS_LLI_EXECUTIONUTILS_H diff --git a/llvm/tools/lli/lli.cpp b/llvm/tools/lli/lli.cpp index 8a7ea2d3d0c58..905ec23de45b9 100644 --- a/llvm/tools/lli/lli.cpp +++ b/llvm/tools/lli/lli.cpp @@ -12,7 +12,6 @@ // //===----------------------------------------------------------------------===// -#include "ExecutionUtils.h" #include "ForwardingMemoryManager.h" #include "llvm/ADT/StringExtras.h" #include "llvm/Bitcode/BitcodeReader.h" @@ -33,6 +32,7 @@ #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" +#include "llvm/ExecutionEngine/Orc/ObjectTransformLayer.h" #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" #include "llvm/ExecutionEngine/Orc/SimpleRemoteEPC.h" #include "llvm/ExecutionEngine/Orc/SymbolStringPool.h" @@ -62,6 +62,7 @@ #include "llvm/Support/Program.h" #include "llvm/Support/SourceMgr.h" #include "llvm/Support/TargetSelect.h" +#include "llvm/Support/ToolOutputFile.h" #include "llvm/Support/WithColor.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TargetParser/Triple.h" @@ -254,35 +255,29 @@ namespace { NoDump, DumpFuncsToStdOut, DumpModsToStdOut, - DumpModsToDisk + DumpModsToDisk, + DumpDebugDescriptor, + DumpDebugObjects, }; cl::opt OrcDumpKind( "orc-lazy-debug", cl::desc("Debug dumping for the orc-lazy JIT."), cl::init(DumpKind::NoDump), - cl::values(clEnumValN(DumpKind::NoDump, "no-dump", - "Don't dump anything."), - clEnumValN(DumpKind::DumpFuncsToStdOut, "funcs-to-stdout", - "Dump function names to stdout."), - clEnumValN(DumpKind::DumpModsToStdOut, "mods-to-stdout", - "Dump modules to stdout."), - clEnumValN(DumpKind::DumpModsToDisk, "mods-to-disk", - "Dump modules to the current " - "working directory. 
(WARNING: " - "will overwrite existing files).")), - cl::Hidden); - - cl::list GenerateBuiltinFunctions( - "generate", - cl::desc("Provide built-in functions for access by JITed code " - "(jit-kind=orc-lazy only)"), - cl::values(clEnumValN(BuiltinFunctionKind::DumpDebugDescriptor, - "__dump_jit_debug_descriptor", - "Dump __jit_debug_descriptor contents to stdout"), - clEnumValN(BuiltinFunctionKind::DumpDebugObjects, - "__dump_jit_debug_objects", - "Dump __jit_debug_descriptor in-memory debug " - "objects as tool output")), + cl::values( + clEnumValN(DumpKind::NoDump, "no-dump", "Don't dump anything."), + clEnumValN(DumpKind::DumpFuncsToStdOut, "funcs-to-stdout", + "Dump function names to stdout."), + clEnumValN(DumpKind::DumpModsToStdOut, "mods-to-stdout", + "Dump modules to stdout."), + clEnumValN(DumpKind::DumpModsToDisk, "mods-to-disk", + "Dump modules to the current " + "working directory. (WARNING: " + "will overwrite existing files)."), + clEnumValN(DumpKind::DumpDebugDescriptor, "jit-debug-descriptor", + "Dump __jit_debug_descriptor contents to stdout"), + clEnumValN(DumpKind::DumpDebugObjects, "jit-debug-objects", + "Dump __jit_debug_descriptor in-memory debug " + "objects as tool output")), cl::Hidden); ExitOnError ExitOnErr; @@ -756,9 +751,41 @@ int main(int argc, char **argv, char * const *envp) { return Result; } -static std::function createDebugDumper() { +// JITLink debug support plugins put information about JITed code in this GDB +// JIT Interface global from OrcTargetProcess. +extern "C" struct jit_descriptor __jit_debug_descriptor; + +static struct jit_code_entry * +findNextDebugDescriptorEntry(struct jit_code_entry *Latest) { + if (Latest == nullptr) + return __jit_debug_descriptor.first_entry; + if (Latest->next_entry) + return Latest->next_entry; + return nullptr; +} + +static ToolOutputFile &claimToolOutput() { + static std::unique_ptr ToolOutput = nullptr; + if (ToolOutput) { + WithColor::error(errs(), "lli") + << "Can not claim stdout for tool output twice\n"; + exit(1); + } + std::error_code EC; + ToolOutput = std::make_unique("-", EC, sys::fs::OF_None); + if (EC) { + WithColor::error(errs(), "lli") + << "Failed to create tool output file: " << EC.message() << "\n"; + exit(1); + } + return *ToolOutput; +} + +static std::function createIRDebugDumper() { switch (OrcDumpKind) { case DumpKind::NoDump: + case DumpKind::DumpDebugDescriptor: + case DumpKind::DumpDebugObjects: return [](Module &M) {}; case DumpKind::DumpFuncsToStdOut: @@ -800,6 +827,43 @@ static std::function createDebugDumper() { llvm_unreachable("Unknown DumpKind"); } +static std::function createObjDebugDumper() { + switch (OrcDumpKind) { + case DumpKind::NoDump: + case DumpKind::DumpFuncsToStdOut: + case DumpKind::DumpModsToStdOut: + case DumpKind::DumpModsToDisk: + return [](MemoryBuffer &) {}; + + case DumpKind::DumpDebugDescriptor: { + // Dump the empty descriptor at startup once + fprintf(stderr, "jit_debug_descriptor 0x%016" PRIx64 "\n", + pointerToJITTargetAddress(__jit_debug_descriptor.first_entry)); + return [](MemoryBuffer &) { + // Dump new entries as they appear + static struct jit_code_entry *Latest = nullptr; + while (auto *NewEntry = findNextDebugDescriptorEntry(Latest)) { + fprintf(stderr, "jit_debug_descriptor 0x%016" PRIx64 "\n", + pointerToJITTargetAddress(NewEntry)); + Latest = NewEntry; + } + }; + } + + case DumpKind::DumpDebugObjects: { + return [](MemoryBuffer &Obj) { + static struct jit_code_entry *Latest = nullptr; + static ToolOutputFile &ToolOutput = claimToolOutput(); + while 
(auto *NewEntry = findNextDebugDescriptorEntry(Latest)) { + ToolOutput.os().write(NewEntry->symfile_addr, NewEntry->symfile_size); + Latest = NewEntry; + } + }; + } + } + llvm_unreachable("Unknown DumpKind"); +} + Error loadDylibs() { for (const auto &Dylib : Dylibs) { std::string ErrMsg; @@ -1001,8 +1065,7 @@ int runOrcJIT(const char *ProgName) { if (PerModuleLazy) J->setPartitionFunction(orc::CompileOnDemandLayer::compileWholeModule); - auto Dump = createDebugDumper(); - + auto IRDump = createIRDebugDumper(); J->getIRTransformLayer().setTransform( [&](orc::ThreadSafeModule TSM, const orc::MaterializationResponsibility &R) { @@ -1011,18 +1074,18 @@ int runOrcJIT(const char *ProgName) { dbgs() << "Bad module: " << &M << "\n"; exit(1); } - Dump(M); + IRDump(M); }); return TSM; }); - if (GenerateBuiltinFunctions.size() > 0) { - // Add LLI builtins. - orc::MangleAndInterner Mangle(J->getExecutionSession(), J->getDataLayout()); - J->getMainJITDylib().addGenerator( - std::make_unique(GenerateBuiltinFunctions, - Mangle)); - } + auto ObjDump = createObjDebugDumper(); + J->getObjTransformLayer().setTransform( + [&](std::unique_ptr Obj) + -> Expected> { + ObjDump(*Obj); + return std::move(Obj); + }); // If this is a Mingw or Cygwin executor then we need to alias __main to // orc_rt_int_void_return_0. From cc2c8ab21fd9b831799bff9ca99be2a2243d23b9 Mon Sep 17 00:00:00 2001 From: Douglas Yung Date: Mon, 22 Jan 2024 13:51:03 -0800 Subject: [PATCH 491/843] Require asserts for llvm/test/CodeGen/PowerPC/sms-regpress.mir. --- llvm/test/CodeGen/PowerPC/sms-regpress.mir | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/test/CodeGen/PowerPC/sms-regpress.mir b/llvm/test/CodeGen/PowerPC/sms-regpress.mir index f523b4548eecc..cebd78af882df 100644 --- a/llvm/test/CodeGen/PowerPC/sms-regpress.mir +++ b/llvm/test/CodeGen/PowerPC/sms-regpress.mir @@ -1,5 +1,7 @@ # RUN: llc --verify-machineinstrs -mcpu=pwr9 -o - %s -run-pass=pipeliner -ppc-enable-pipeliner -pipeliner-register-pressure -pipeliner-max-mii=50 -pipeliner-ii-search-range=30 -pipeliner-max-stages=10 -debug-only=pipeliner 2>&1 | FileCheck %s +# REQUIRES: asserts + # Check that if the register pressure is too high, the schedule is rejected, II is incremented, and scheduling continues. # The specific value of II is not important. From 8675952583b1c639e6bcbe2869aecda1d01320f2 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Mon, 22 Jan 2024 13:00:38 -0800 Subject: [PATCH 492/843] [RISCV] Add coverage for shuffles splitable using exact VLEN Test coverage for an upcoming transform. 
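For reference, "exact VLEN" here means functions that pin vscale with a
vscale_range(2,2) attribute; on RISC-V that fixes VLEN at 128 bits, so a
fixed-length type such as <4 x i64> is known to occupy exactly two whole
vector registers (an m2 group), and a shuffle can be reasoned about one
register at a time. A minimal sketch of the simplest case the new file
tests, adapted from it (the all-zeroes shuffle mask is an illustrative
reconstruction, inferred from the `vrgather.vi ..., 0` CHECK lines; the
test file below is authoritative):

  define <4 x i64> @m2_splat_0(<4 x i64> %v1) vscale_range(2,2) {
    ; splat element 0 across both registers of the m2 group
    %res = shufflevector <4 x i64> %v1, <4 x i64> poison,
                         <4 x i32> zeroinitializer
    ret <4 x i64> %res
  }

With the register break known exactly, the upcoming transform can split
such shuffles into independent single-register operations.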
--- .../rvv/fixed-vectors-shuffle-exact-vlen.ll | 200 ++++++++++++++++++ 1 file changed, 200 insertions(+) create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll new file mode 100644 index 0000000000000..b922ecdb8a2c2 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -0,0 +1,200 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s | FileCheck %s + +define <4 x i64> @m2_splat_0(<4 x i64> %v1) vscale_range(2,2) { +; CHECK-LABEL: m2_splat_0: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> + ret <4 x i64> %res +} + +define <4 x i64> @m2_splat_in_chunks(<4 x i64> %v1) vscale_range(2,2) { +; CHECK-LABEL: m2_splat_in_chunks: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8224 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> + ret <4 x i64> %res +} + +define <8 x i64> @m4_splat_in_chunks(<8 x i64> %v1) vscale_range(2,2) { +; CHECK-LABEL: m4_splat_in_chunks: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, %hi(.LCPI2_0) +; CHECK-NEXT: addi a0, a0, %lo(.LCPI2_0) +; CHECK-NEXT: vl1re16.v v16, (a0) +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = shufflevector <8 x i64> %v1, <8 x i64> poison, <8 x i32> + ret <8 x i64> %res +} + + +define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) { +; CHECK-LABEL: m2_splat_with_tail: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 12320 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> + ret <4 x i64> %res +} + +define <4 x i64> @m2_pair_swap_vl4(<4 x i64> %v1) vscale_range(2,2) { +; CHECK-LABEL: m2_pair_swap_vl4: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 8240 +; CHECK-NEXT: addi a0, a0, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> + ret <4 x i64> %res +} + +define <8 x i32> @m2_pair_swap_vl8(<8 x i32> %v1) vscale_range(2,2) { +; RV32-LABEL: m2_pair_swap_vl8: +; RV32: # %bb.0: +; RV32-NEXT: li a0, 32 +; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-NEXT: vmv.v.x v10, a0 +; RV32-NEXT: li a0, 63 +; RV32-NEXT: vand.vx v12, v10, a0 +; RV32-NEXT: 
vsll.vv v12, v8, v12 +; RV32-NEXT: vrsub.vi v10, v10, 0 +; RV32-NEXT: vand.vx v10, v10, a0 +; RV32-NEXT: vsrl.vv v8, v8, v10 +; RV32-NEXT: vor.vv v8, v12, v8 +; RV32-NEXT: ret +; +; RV64-LABEL: m2_pair_swap_vl8: +; RV64: # %bb.0: +; RV64-NEXT: li a0, 32 +; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-NEXT: vsrl.vx v10, v8, a0 +; RV64-NEXT: vsll.vx v8, v8, a0 +; RV64-NEXT: vor.vv v8, v8, v10 +; RV64-NEXT: ret + %res = shufflevector <8 x i32> %v1, <8 x i32> poison, <8 x i32> + ret <8 x i32> %res +} + +define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) { +; CHECK-LABEL: m2_splat_into_identity: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 12320 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.s.x v10, a0 +; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma +; CHECK-NEXT: vsext.vf2 v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> + ret <4 x i64> %res +} + +define <4 x i64> @m2_broadcast_i128(<4 x i64> %v1) vscale_range(2,2) { +; CHECK-LABEL: m2_broadcast_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v12, a0 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 +; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: ret + %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> + ret <4 x i64> %res +} + +define <8 x i64> @m4_broadcast_i128(<8 x i64> %v1) vscale_range(2,2) { +; CHECK-LABEL: m4_broadcast_i128: +; CHECK: # %bb.0: +; CHECK-NEXT: lui a0, 16 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; CHECK-NEXT: vmv.v.x v16, a0 +; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = shufflevector <8 x i64> %v1, <8 x i64> poison, <8 x i32> + ret <8 x i64> %res +} + + +define <4 x i64> @m2_splat_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { +; CHECK-LABEL: m2_splat_two_source: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vrgather.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vrgather.vi v12, v10, 3, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> + ret <4 x i64> %res +} + +define <4 x i64> @m2_splat_into_identity_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { +; CHECK-LABEL: m2_splat_into_identity_two_source: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vrgather.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vid.v v8 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu +; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> + ret <4 x i64> %res +} + +define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { +; CHECK-LABEL: m2_splat_into_slide_two_source: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vid.v v12 +; CHECK-NEXT: vadd.vi v14, v12, -1 +; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; 
CHECK-NEXT: vrgather.vi v12, v8, 0 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma +; CHECK-NEXT: vmv.v.i v0, 12 +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu +; CHECK-NEXT: vrgatherei16.vv v12, v10, v14, v0.t +; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: ret + %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> + ret <4 x i64> %res +} From eaef645a5836df56b53800383022a3a384305c6b Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 22 Jan 2024 14:18:22 -0800 Subject: [PATCH 493/843] [test] Update stack_guard_remat.ll --- llvm/test/CodeGen/Thumb/stack_guard_remat.ll | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll index 748cc8744d669..b9c993332a698 100644 --- a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll +++ b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll @@ -1,4 +1,6 @@ -; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC +; RUN: cp %s %t.pic.ll +; RUN: echo -e '!llvm.module.flags = !{!0}\n!0 = !{i32 7, !"PIC Level", i32 2}' >> %t.pic.ll +; RUN: llc < %t.pic.ll -mtriple=thumb-apple-darwin -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC ; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=NO-PIC -check-prefix=STATIC ; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s -check-prefix=NO-PIC -check-prefix=DYNAMIC-NO-PIC @@ -19,7 +21,7 @@ ;NO-PIC: add [[SAVED_GUARD:r[0-9]+]], sp, #904 ;NO-PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]], #124] ;NO-PIC-NEXT: ldr [[ORIGINAL_GUARD:r[0-9]+]], [[ORIGINAL_GUARD_LABEL:LCPI[0-9_]+]] -;NO-PIC-NOT: LPC +;DYNAMIC-NO-PIC: add [[ORIGINAL_GUARD]], pc ;NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]] ;DYNAMIC-NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]] ;NO-PIC-NEXT: cmp [[ORIGINAL_GUARD]], [[SAVED_GUARD]] @@ -49,6 +51,3 @@ declare void @foo3(ptr) declare void @llvm.lifetime.end.p0(i64, ptr nocapture) attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "frame-pointer"="all" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" } - -!llvm.module.flags = !{!0} -!0 = !{i32 7, !"PIC Level", i32 2} From fe0ec2c91cfbf2aad2a61e402f21b771db685b90 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 23 Jan 2024 07:28:10 +0900 Subject: [PATCH 494/843] [Coverage] Const-ize `MCDCRecordProcessor` stuff (#78918) The life of `MCDCRecordProcessor`'s instance is short. It may accept `const` objects to process. On the other hand, the life of `MCDCBranches` is shorter than `Record`. It may be rewritten with reference, rather than copying. --- .../ProfileData/Coverage/CoverageMapping.h | 5 +-- .../ProfileData/Coverage/CoverageMapping.cpp | 32 ++++++++++--------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h index 2a857917136a8..33fa17ba811fa 100644 --- a/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h +++ b/llvm/include/llvm/ProfileData/Coverage/CoverageMapping.h @@ -587,8 +587,9 @@ class CounterMappingContext { /// Return an MCDC record that indicates executed test vectors and condition /// pairs. 
Expected - evaluateMCDCRegion(CounterMappingRegion Region, BitVector Bitmap, - ArrayRef Branches); + evaluateMCDCRegion(const CounterMappingRegion &Region, + const BitVector &Bitmap, + ArrayRef Branches); unsigned getMaxCounterID(const Counter &C) const; }; diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp index d8be7be6d7a59..9c95cd5d76d21 100644 --- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp +++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp @@ -247,14 +247,14 @@ class MCDCRecordProcessor { /// Each index of the bitmap corresponds to a possible test vector. An index /// with a bit value of '1' indicates that the corresponding Test Vector /// identified by that index was executed. - BitVector &ExecutedTestVectorBitmap; + const BitVector &ExecutedTestVectorBitmap; /// Decision Region to which the ExecutedTestVectorBitmap applies. - CounterMappingRegion &Region; + const CounterMappingRegion &Region; /// Array of branch regions corresponding each conditions in the boolean /// expression. - ArrayRef Branches; + ArrayRef Branches; /// Total number of conditions in the boolean expression. unsigned NumConditions; @@ -276,8 +276,9 @@ class MCDCRecordProcessor { MCDCRecord::TestVectors ExecVectors; public: - MCDCRecordProcessor(BitVector &Bitmap, CounterMappingRegion &Region, - ArrayRef Branches) + MCDCRecordProcessor(const BitVector &Bitmap, + const CounterMappingRegion &Region, + ArrayRef Branches) : ExecutedTestVectorBitmap(Bitmap), Region(Region), Branches(Branches), NumConditions(Region.MCDCParams.NumConditions), Folded(NumConditions, false), IndependencePairs(NumConditions), @@ -342,7 +343,7 @@ class MCDCRecordProcessor { /// Walk the bits in the bitmap. A bit set to '1' indicates that the test /// vector at the corresponding index was executed during a test run. - void findExecutedTestVectors(BitVector &ExecutedTestVectorBitmap) { + void findExecutedTestVectors(const BitVector &ExecutedTestVectorBitmap) { for (unsigned Idx = 0; Idx < ExecutedTestVectorBitmap.size(); ++Idx) { if (ExecutedTestVectorBitmap[Idx] == 0) continue; @@ -445,11 +446,11 @@ class MCDCRecordProcessor { // visualize where the condition is. // - Record whether the condition is constant folded so that we exclude it // from being measured. - for (const auto &B : Branches) { - Map[B.MCDCParams.ID] = &B; - PosToID[I] = B.MCDCParams.ID - 1; - CondLoc[I] = B.startLoc(); - Folded[I++] = (B.Count.isZero() && B.FalseCount.isZero()); + for (const auto *B : Branches) { + Map[B->MCDCParams.ID] = B; + PosToID[I] = B->MCDCParams.ID - 1; + CondLoc[I] = B->startLoc(); + Folded[I++] = (B->Count.isZero() && B->FalseCount.isZero()); } // Initialize a base test vector as 'DontCare'. 
@@ -473,8 +474,9 @@ class MCDCRecordProcessor { }; Expected CounterMappingContext::evaluateMCDCRegion( - CounterMappingRegion Region, BitVector ExecutedTestVectorBitmap, - ArrayRef Branches) { + const CounterMappingRegion &Region, + const BitVector &ExecutedTestVectorBitmap, + ArrayRef Branches) { MCDCRecordProcessor MCDCProcessor(ExecutedTestVectorBitmap, Region, Branches); return MCDCProcessor.processMCDCRecord(); @@ -638,7 +640,7 @@ Error CoverageMapping::loadFunctionRecord( unsigned NumConds = 0; const CounterMappingRegion *MCDCDecision; - std::vector MCDCBranches; + std::vector MCDCBranches; FunctionRecord Function(OrigFuncName, Record.Filenames); for (const auto &Region : Record.MappingRegions) { @@ -666,7 +668,7 @@ Error CoverageMapping::loadFunctionRecord( // correspond to it in a vector, according to the number of conditions // recorded for the region (tracked by NumConds). if (NumConds > 0 && Region.Kind == CounterMappingRegion::MCDCBranchRegion) { - MCDCBranches.push_back(Region); + MCDCBranches.push_back(&Region); // As we move through all of the MCDCBranchRegions that follow the // MCDCDecisionRegion, decrement NumConds to make sure we account for From 672fb5892eb1dd977fd36149fd45d794103b0408 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 22 Jan 2024 14:30:30 -0800 Subject: [PATCH 495/843] [RISCV] Remove extra semicolons. NFC --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b41e2f40dc72f..66285a6bf26e3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -12769,14 +12769,14 @@ static bool narrowIndex(SDValue &N, ISD::MemIndexType IndexType, SelectionDAG &D SDValue N0 = N.getOperand(0); if (N0.getOpcode() != ISD::ZERO_EXTEND && N0.getOpcode() != RISCVISD::VZEXT_VL) - return false;; + return false; if (!N0->hasOneUse()) - return false;; + return false; APInt ShAmt; SDValue N1 = N.getOperand(1); if (!ISD::isConstantSplatVector(N1.getNode(), ShAmt)) - return false;; + return false; SDValue Src = N0.getOperand(0); EVT SrcVT = Src.getValueType(); From 1000cefc049e313a532fc9df2a48d80826fc80c7 Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Mon, 22 Jan 2024 14:31:17 -0800 Subject: [PATCH 496/843] [AMDGPU] Remove s_set_inst_prefetch_distance support from GFX12 (#78786) This instruction is not supported by GFX12. --- llvm/lib/Target/AMDGPU/GCNSubtarget.h | 2 -- llvm/lib/Target/AMDGPU/SOPInstructions.td | 2 +- llvm/test/MC/AMDGPU/gfx12_asm_sopp.s | 9 --------- llvm/test/MC/AMDGPU/gfx12_unsupported.s | 3 +++ llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt | 9 --------- 5 files changed, 4 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index 4886730c05c50..8019b98b1c68d 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -856,8 +856,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, } bool hasInstPrefetch() const { - // GFX12 can still encode the s_set_inst_prefetch_distance instruction but - // it has no effect. 
return getGeneration() == GFX10 || getGeneration() == GFX11; } diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index d9c6b1a35e272..ae5ef0541929b 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -2625,7 +2625,7 @@ multiclass SOPP_Real_With_Relaxation_gfx11_gfx12op> : defm S_SETKILL : SOPP_Real_32_gfx11_gfx12<0x001>; defm S_SETHALT : SOPP_Real_32_gfx11_gfx12<0x002>; defm S_SLEEP : SOPP_Real_32_gfx11_gfx12<0x003>; -defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11_gfx12<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">; +defm S_SET_INST_PREFETCH_DISTANCE : SOPP_Real_32_Renamed_gfx11<0x004, S_INST_PREFETCH, "s_set_inst_prefetch_distance">; defm S_CLAUSE : SOPP_Real_32_gfx11_gfx12<0x005>; defm S_DELAY_ALU : SOPP_Real_32_gfx11_gfx12<0x007>; defm S_WAITCNT_DEPCTR : SOPP_Real_32_gfx11<0x008>; diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s index 41ed4de6be8a7..f6c7c99847d66 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s @@ -69,15 +69,6 @@ s_wait_alu depctr_va_sdst(3) s_wait_alu depctr_va_vdst(14) depctr_va_sdst(6) depctr_vm_vsrc(6) // GFX12: encoding: [0x9b,0xed,0x88,0xbf] -s_inst_prefetch 0x1234 -// GFX12: s_set_inst_prefetch_distance 0x1234 ; encoding: [0x34,0x12,0x84,0xbf] - -s_set_inst_prefetch_distance 0x1234 -// GFX12: s_set_inst_prefetch_distance 0x1234 ; encoding: [0x34,0x12,0x84,0xbf] - -s_set_inst_prefetch_distance 0xc1d1 -// GFX12: s_set_inst_prefetch_distance 0xc1d1 ; encoding: [0xd1,0xc1,0x84,0xbf] - s_singleuse_vdst 0x0000 // GFX12: encoding: [0x00,0x00,0x93,0xbf] diff --git a/llvm/test/MC/AMDGPU/gfx12_unsupported.s b/llvm/test/MC/AMDGPU/gfx12_unsupported.s index bf8f7437c0420..9d52a5dcb1a0a 100644 --- a/llvm/test/MC/AMDGPU/gfx12_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx12_unsupported.s @@ -103,6 +103,9 @@ s_cmpk_lt_u32 s0, 0 s_cmpk_le_u32 s0, 0 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU +s_inst_prefetch 1 +// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU + buffer_atomic_cmpswap_f32 v[5:6], off, s[96:99], s3 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt index 6f4dc2423487e..ea547fcd5d0ec 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt @@ -264,15 +264,6 @@ # GFX12: s_sethalt 0xc1d1 ; encoding: [0xd1,0xc1,0x82,0xbf] 0xd1,0xc1,0x82,0xbf -# GFX12: s_set_inst_prefetch_distance 0x0 ; encoding: [0x00,0x00,0x84,0xbf] -0x00,0x00,0x84,0xbf - -# GFX12: s_set_inst_prefetch_distance 0x1234 ; encoding: [0x34,0x12,0x84,0xbf] -0x34,0x12,0x84,0xbf - -# GFX12: s_set_inst_prefetch_distance 0xc1d1 ; encoding: [0xd1,0xc1,0x84,0xbf] -0xd1,0xc1,0x84,0xbf - # GFX12: s_setkill 0 ; encoding: [0x00,0x00,0x81,0xbf] 0x00,0x00,0x81,0xbf From 424b9cf41abf376cc7a34640f5f451c91714f77b Mon Sep 17 00:00:00 2001 From: Alan Phipps Date: Mon, 22 Jan 2024 16:33:20 -0600 Subject: [PATCH 497/843] [Coverage][clang] Ensure bitmap for ternary condition is updated before visiting children (#78814) This is a fix for MC/DC issue https://github.com/llvm/llvm-project/issues/78453 in which a ConditionalOperator that evaluates a complex condition was incorrectly updating its global bitmap after visiting its LHS and RHS 
children. This was wrong because if the LHS or RHS also evaluate a complex condition, the MCDC temporary bitmap value will get corrupted. The fix is to ensure that the bitmap is updated prior to visiting the LHS and RHS. --- clang/lib/CodeGen/CGExprScalar.cpp | 18 ++++- clang/test/Profile/c-mcdc-logicalop-ternary.c | 81 +++++++++++++++++++ 2 files changed, 95 insertions(+), 4 deletions(-) create mode 100644 clang/test/Profile/c-mcdc-logicalop-ternary.c diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp index 9ec185153d12b..181b15e9c7d0a 100644 --- a/clang/lib/CodeGen/CGExprScalar.cpp +++ b/clang/lib/CodeGen/CGExprScalar.cpp @@ -4960,6 +4960,13 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { CGF.getProfileCount(lhsExpr)); CGF.EmitBlock(LHSBlock); + + // If the top of the logical operator nest, update the MCDC bitmap for the + // ConditionalOperator prior to visiting its LHS and RHS blocks, since they + // may also contain a boolean expression. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeUpdateMCDCTestVectorBitmap(condExpr); + CGF.incrementProfileCounter(E); eval.begin(CGF); Value *LHS = Visit(lhsExpr); @@ -4969,6 +4976,13 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { Builder.CreateBr(ContBlock); CGF.EmitBlock(RHSBlock); + + // If the top of the logical operator nest, update the MCDC bitmap for the + // ConditionalOperator prior to visiting its LHS and RHS blocks, since they + // may also contain a boolean expression. + if (CGF.MCDCLogOpStack.empty()) + CGF.maybeUpdateMCDCTestVectorBitmap(condExpr); + eval.begin(CGF); Value *RHS = Visit(rhsExpr); eval.end(CGF); @@ -4987,10 +5001,6 @@ VisitAbstractConditionalOperator(const AbstractConditionalOperator *E) { PN->addIncoming(LHS, LHSBlock); PN->addIncoming(RHS, RHSBlock); - // If the top of the logical operator nest, update the MCDC bitmap. - if (CGF.MCDCLogOpStack.empty()) - CGF.maybeUpdateMCDCTestVectorBitmap(condExpr); - return PN; } diff --git a/clang/test/Profile/c-mcdc-logicalop-ternary.c b/clang/test/Profile/c-mcdc-logicalop-ternary.c new file mode 100644 index 0000000000000..558643f422021 --- /dev/null +++ b/clang/test/Profile/c-mcdc-logicalop-ternary.c @@ -0,0 +1,81 @@ +// RUN: %clang_cc1 -triple %itanium_abi_triple %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping -fcoverage-mcdc | FileCheck %s -check-prefix=MCDC +// RUN: %clang_cc1 -triple %itanium_abi_triple %s -o - -emit-llvm -fprofile-instrument=clang -fcoverage-mapping | FileCheck %s -check-prefix=NOMCDC + +int test(int a, int b, int c, int d, int e, int f) { + return ((a || b) ? (c && d) : (e || f)); +} + +// NOMCDC-NOT: %mcdc.addr +// NOMCDC-NOT: __profbm_test + +// MCDC BOOKKEEPING. +// MCDC: @__profbm_test = private global [3 x i8] zeroinitializer + +// ALLOCATE MCDC TEMP AND ZERO IT. +// MCDC-LABEL: @test( +// MCDC: %mcdc.addr = alloca i32, align 4 +// MCDC: store i32 0, ptr %mcdc.addr, align 4 + +// TERNARY TRUE SHOULD UPDATE THE BITMAP WITH RESULT AT ELEMENT 0. 
+// MCDC-LABEL: cond.true: +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 +// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 +// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB2]] +// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 +// MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 +// MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] +// MCDC: %[[LAB8:mcdc.bits[0-9]*]] = load i8, ptr %[[LAB4]], align 1 +// MCDC: %[[LAB9:[0-9]+]] = or i8 %[[LAB8]], %[[LAB7]] +// MCDC: store i8 %[[LAB9]], ptr %[[LAB4]], align 1 + +// CHECK FOR ZERO OF MCDC TEMP +// MCDC: store i32 0, ptr %mcdc.addr, align 4 + +// TERNARY TRUE YIELDS TERNARY LHS LOGICAL-AND. +// TERNARY LHS LOGICAL-AND SHOULD UPDATE THE BITMAP WITH RESULT AT ELEMENT 1. +// MCDC-LABEL: land.end: +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 +// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 +// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr getelementptr inbounds ([3 x i8], ptr @__profbm_test, i32 0, i32 1) to i64), %[[LAB2]] +// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 +// MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 +// MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] +// MCDC: %[[LAB8:mcdc.bits[0-9]*]] = load i8, ptr %[[LAB4]], align 1 +// MCDC: %[[LAB9:[0-9]+]] = or i8 %[[LAB8]], %[[LAB7]] +// MCDC: store i8 %[[LAB9]], ptr %[[LAB4]], align 1 + +// TERNARY FALSE SHOULD UPDATE THE BITMAP WITH RESULT AT ELEMENT 0. +// MCDC-LABEL: cond.false: +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 +// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 +// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr @__profbm_test to i64), %[[LAB2]] +// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 +// MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 +// MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] +// MCDC: %[[LAB8:mcdc.bits[0-9]*]] = load i8, ptr %[[LAB4]], align 1 +// MCDC: %[[LAB9:[0-9]+]] = or i8 %[[LAB8]], %[[LAB7]] +// MCDC: store i8 %[[LAB9]], ptr %[[LAB4]], align 1 + +// CHECK FOR ZERO OF MCDC TEMP +// MCDC: store i32 0, ptr %mcdc.addr, align 4 + +// TERNARY FALSE YIELDS TERNARY RHS LOGICAL-OR. +// TERNARY RHS LOGICAL-OR SHOULD UPDATE THE BITMAP WITH RESULT AT ELEMENT 2. 
+// MCDC-LABEL: lor.end: +// MCDC-DAG: %[[TEMP:mcdc.temp[0-9]*]] = load i32, ptr %mcdc.addr, align 4 +// MCDC: %[[LAB1:[0-9]+]] = lshr i32 %[[TEMP]], 3 +// MCDC: %[[LAB2:[0-9]+]] = zext i32 %[[LAB1]] to i64 +// MCDC: %[[LAB3:[0-9]+]] = add i64 ptrtoint (ptr getelementptr inbounds ([3 x i8], ptr @__profbm_test, i32 0, i32 2) to i64), %[[LAB2]] +// MCDC: %[[LAB4:[0-9]+]] = inttoptr i64 %[[LAB3]] to ptr +// MCDC: %[[LAB5:[0-9]+]] = and i32 %[[TEMP]], 7 +// MCDC: %[[LAB6:[0-9]+]] = trunc i32 %[[LAB5]] to i8 +// MCDC: %[[LAB7:[0-9]+]] = shl i8 1, %[[LAB6]] +// MCDC: %[[LAB8:mcdc.bits[0-9]*]] = load i8, ptr %[[LAB4]], align 1 +// MCDC: %[[LAB9:[0-9]+]] = or i8 %[[LAB8]], %[[LAB7]] +// MCDC: store i8 %[[LAB9]], ptr %[[LAB4]], align 1 From b83b8d3fd17885438b0ea154e07088d877d293a8 Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 22 Jan 2024 07:41:51 -0800 Subject: [PATCH 498/843] Reland [Clang][CMake] Support perf, LBR, and Instrument CLANG_BOLT options (#69133) This reverts commit 6c47419703acfcd7dcca9e30ab9dba6a7a42f977. Default to CLANG_BOLT=OFF Test Plan: Build a regular Clang build. --- clang/CMakeLists.txt | 44 +++++++++----- clang/cmake/caches/BOLT.cmake | 2 +- clang/utils/perf-training/CMakeLists.txt | 29 +++++++++- clang/utils/perf-training/bolt.lit.cfg | 47 ++++++++++++--- .../utils/perf-training/bolt.lit.site.cfg.in | 2 + clang/utils/perf-training/perf-helper.py | 58 +++++++++++++++++++ 6 files changed, 156 insertions(+), 26 deletions(-) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 5f2b7f064da43..47fc2e4886cfc 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -876,23 +876,38 @@ if (CLANG_ENABLE_BOOTSTRAP) endforeach() endif() -if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) +set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. 
\ + May be specified as Instrument or Perf or LBR to use a particular profiling \ + mechanism.") +string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) + +if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) - set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst) + set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED}) set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata) - # Instrument clang with BOLT - add_custom_target(clang-instrumented - DEPENDS ${CLANG_INSTRUMENTED} - ) - add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} - DEPENDS clang llvm-bolt - COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} - -instrument --instrumentation-file-append-pid - --instrumentation-file=${BOLT_FDATA} - COMMENT "Instrumenting clang binary with BOLT" - VERBATIM - ) + # Pass extra flag in no-LBR mode + if (CLANG_BOLT STREQUAL "PERF") + set(BOLT_NO_LBR "-nl") + endif() + + if (CLANG_BOLT STREQUAL "INSTRUMENT") + # Instrument clang with BOLT + add_custom_target(clang-instrumented + DEPENDS ${CLANG_INSTRUMENTED} + ) + add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} + DEPENDS clang llvm-bolt + COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} + -instrument --instrumentation-file-append-pid + --instrumentation-file=${BOLT_FDATA} + COMMENT "Instrumenting clang binary with BOLT" + VERBATIM + ) + add_custom_target(clang-bolt-training-deps DEPENDS clang-instrumented) + else() # perf or LBR + add_custom_target(clang-bolt-training-deps DEPENDS clang) + endif() # Optimize original (pre-bolt) Clang using the collected profile set(CLANG_OPTIMIZED ${CMAKE_CURRENT_BINARY_DIR}/clang.bolt) @@ -906,6 +921,7 @@ if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) -data ${BOLT_FDATA} -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack + ${BOLT_NO_LBR} COMMAND ${CMAKE_COMMAND} -E rename ${CLANG_OPTIMIZED} $ COMMENT "Optimizing Clang with BOLT" VERBATIM diff --git a/clang/cmake/caches/BOLT.cmake b/clang/cmake/caches/BOLT.cmake index 0442f73e5426a..eba2346b2f4ca 100644 --- a/clang/cmake/caches/BOLT.cmake +++ b/clang/cmake/caches/BOLT.cmake @@ -1,5 +1,5 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING "") -set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "") +set(CLANG_BOLT "INSTRUMENT" CACHE STRING "") set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "") set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "") diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index c6d51863fb1b5..601f40902fa34 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -62,7 +62,9 @@ if(APPLE AND DTRACE AND NOT LLVM_TOOL_LLVM_DRIVER_BUILD) DEPENDS generate-dtrace-logs) endif() -if(CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) +if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) + set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING + "Name of BOLT-instrumented Clang binary") configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/bolt.lit.site.cfg.in ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/lit.site.cfg @@ -71,16 +73,37 @@ if(CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) add_lit_testsuite(generate-bolt-fdata "Generating BOLT profile for Clang" ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/ EXCLUDE_FROM_CHECK_ALL - DEPENDS clang-instrumented clear-bolt-fdata + DEPENDS clang-bolt-training-deps clear-bolt-fdata clear-perf-data ) add_custom_target(clear-bolt-fdata COMMAND 
"${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} fdata COMMENT "Clearing old BOLT fdata") + add_custom_target(clear-perf-data + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data + COMMENT "Clearing old perf data") + + string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) + if (CLANG_BOLT STREQUAL "LBR") + set(BOLT_LBR "--lbr") + endif() + + add_custom_target(merge-fdata-deps) + if (CLANG_BOLT STREQUAL "INSTRUMENT") + add_dependencies(merge-fdata-deps generate-bolt-fdata) + else() + # Convert perf profiles into fdata + add_custom_target(convert-perf-fdata + COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py perf2bolt $ ${CMAKE_CURRENT_BINARY_DIR} $ ${BOLT_LBR} + COMMENT "Converting perf files to BOLT fdata" + DEPENDS llvm-bolt generate-bolt-fdata) + add_dependencies(merge-fdata-deps convert-perf-fdata) + endif() + # Merge profiles into one using merge-fdata add_custom_target(clang-bolt-profile COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge-fdata $ ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Merging BOLT fdata" - DEPENDS merge-fdata generate-bolt-fdata) + DEPENDS merge-fdata merge-fdata-deps) endif() diff --git a/clang/utils/perf-training/bolt.lit.cfg b/clang/utils/perf-training/bolt.lit.cfg index 234ac855bd67c..0e81a5501e9fc 100644 --- a/clang/utils/perf-training/bolt.lit.cfg +++ b/clang/utils/perf-training/bolt.lit.cfg @@ -6,15 +6,46 @@ import lit.util import os import subprocess -config.clang = os.path.realpath(lit.util.which('clang-bolt.inst', config.clang_tools_dir)).replace('\\', '/') +clang_bolt_mode = config.clang_bolt_mode.lower() +clang_binary = "clang" +perf_wrapper = f"{config.python_exe} {config.perf_helper_dir}/perf-helper.py perf " -config.name = 'Clang Perf Training' -config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test'] +if clang_bolt_mode == "instrument": + perf_wrapper = "" + clang_binary = config.clang_bolt_name +elif clang_bolt_mode == "lbr": + perf_wrapper += " --lbr -- " +elif clang_bolt_mode == "perf": + perf_wrapper += " -- " +else: + assert 0, "Unsupported CLANG_BOLT_MODE variable" + +config.clang = perf_wrapper + os.path.realpath( + lit.util.which(clang_binary, config.clang_tools_dir) +).replace("\\", "/") + +config.name = "Clang Perf Training" +config.suffixes = [ + ".c", + ".cc", + ".cpp", + ".m", + ".mm", + ".cu", + ".ll", + ".cl", + ".s", + ".S", + ".modulemap", + ".test", +] use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL") config.test_format = lit.formats.ShTest(use_lit_shell == "0") -config.substitutions.append( ('%clang_cpp_skip_driver', ' %s --driver-mode=g++ ' % (config.clang))) -config.substitutions.append( ('%clang_cpp', ' %s --driver-mode=g++ ' % (config.clang))) -config.substitutions.append( ('%clang_skip_driver', ' %s ' % (config.clang))) -config.substitutions.append( ('%clang', ' %s ' % (config.clang) ) ) -config.substitutions.append( ('%test_root', config.test_exec_root ) ) +config.substitutions.append( + ("%clang_cpp_skip_driver", f" {config.clang} --driver-mode=g++ ") +) +config.substitutions.append(("%clang_cpp", f" {config.clang} --driver-mode=g++ ")) +config.substitutions.append(("%clang_skip_driver", config.clang)) +config.substitutions.append(("%clang", config.clang)) +config.substitutions.append(("%test_root", config.test_exec_root)) diff --git 
a/clang/utils/perf-training/bolt.lit.site.cfg.in b/clang/utils/perf-training/bolt.lit.site.cfg.in
index 3029319673fc2..54de12701c1ae 100644
--- a/clang/utils/perf-training/bolt.lit.site.cfg.in
+++ b/clang/utils/perf-training/bolt.lit.site.cfg.in
@@ -9,6 +9,8 @@ config.test_source_root = "@CLANG_PGO_TRAINING_DATA@"
 config.target_triple = "@LLVM_TARGET_TRIPLE@"
 config.python_exe = "@Python3_EXECUTABLE@"
 config.clang_obj_root = path(r"@CLANG_BINARY_DIR@")
+config.clang_bolt_mode = "@CLANG_BOLT@"
+config.clang_bolt_name = "@CLANG_BOLT_INSTRUMENTED@"
 
 # Let the main config do the real work.
 lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/bolt.lit.cfg")
diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py
index 99d6a3333b6ef..959bdba5c98cc 100644
--- a/clang/utils/perf-training/perf-helper.py
+++ b/clang/utils/perf-training/perf-helper.py
@@ -67,6 +67,62 @@ def merge_fdata(args):
     return 0
 
 
+def perf(args):
+    parser = argparse.ArgumentParser(
+        prog="perf-helper perf", description="perf wrapper for BOLT profile collection"
+    )
+    parser.add_argument(
+        "--lbr", action="store_true", help="Use perf with branch stacks"
+    )
+    parser.add_argument("cmd", nargs=argparse.REMAINDER, help="")
+
+    opts = parser.parse_args(args)
+    cmd = opts.cmd[1:]
+
+    perf_args = [
+        "perf",
+        "record",
+        "--event=cycles:u",
+        "--freq=max",
+        "--output=%d.perf.data" % os.getpid(),
+    ]
+    if opts.lbr:
+        perf_args += ["--branch-filter=any,u"]
+    perf_args.extend(cmd)
+
+    start_time = time.time()
+    subprocess.check_call(perf_args)
+
+    elapsed = time.time() - start_time
+    print("... data collection took %.4fs" % elapsed)
+    return 0
+
+
+def perf2bolt(args):
+    parser = argparse.ArgumentParser(
+        prog="perf-helper perf2bolt",
+        description="perf2bolt conversion wrapper for perf.data files",
+    )
+    parser.add_argument("bolt", help="Path to llvm-bolt")
+    parser.add_argument("path", help="Path containing perf.data files")
+    parser.add_argument("binary", help="Input binary")
+    parser.add_argument("--lbr", action="store_true", help="Use LBR perf2bolt mode")
+    opts = parser.parse_args(args)
+
+    p2b_args = [
+        opts.bolt,
+        opts.binary,
+        "--aggregate-only",
+        "--profile-format=yaml",
+    ]
+    if not opts.lbr:
+        p2b_args += ["-nl"]
+    p2b_args += ["-p"]
+    for filename in findFilesWithExtension(opts.path, "perf.data"):
+        subprocess.check_call(p2b_args + [filename, "-o", filename + ".fdata"])
+    return 0
+
+
 def dtrace(args):
     parser = argparse.ArgumentParser(
         prog="perf-helper dtrace",
@@ -507,6 +563,8 @@ def genOrderFile(args):
     "cc1": cc1,
     "gen-order-file": genOrderFile,
     "merge-fdata": merge_fdata,
+    "perf": perf,
+    "perf2bolt": perf2bolt,
 }
 

From 1d5c16d7805806e920feee2b4ec93d5800293837 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers
Date: Mon, 22 Jan 2024 14:55:51 -0800
Subject: [PATCH 499/843] [libc] default enable -ftrivial-auto-var-init=pattern (#78776)

Usage of uninitialized memory is a top memory safety issue in C++
codebases. Help mitigate this somewhat by initializing stack allocations
to a pattern (0xAA repeating) by default.

Clang has received optimizations to sink these initializations into the
control flow paths that actually access such values, minimizing their
overhead. If there's a measurable slowdown, we can add
-ftrivial-auto-var-init-max-size= for some value of N bytes if we have
any large stack allocations, or add attribute uninitialized to specific
variable declarations.

Unsupported until GCC 12.1 / Clang 8.

Increases file size of libc.a from a full build by +8.79Ki (+0.2%).
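To illustrate the effect (a hedged sketch only; the function and variable
names here are invented for this note, not part of the patch):

  int sketch(bool cond) {
    int x;        // pre-filled with 0xAAAAAAAA under
    char buf[8];  // -ftrivial-auto-var-init=pattern, instead of holding
                  // whatever stale bytes were left on the stack
    if (cond)
      x = 1;
    return x + buf[0]; // a read-before-write bug now observes the
                       // deterministic pattern rather than leaking data
  }

  void hot_path() {
    // The opt-out mentioned above, for an allocation proven too costly
    // to pre-fill:
    __attribute__((uninitialized)) char scratch[4096];
    (void)scratch;
  }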
--- libc/cmake/modules/CheckCompilerFeatures.cmake | 5 +++++ libc/cmake/modules/LLVMLibCObjectRules.cmake | 3 +++ utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl | 7 ++++++- 3 files changed, 14 insertions(+), 1 deletion(-) diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake index 1ad81bbb5f666..c99a98d1567d3 100644 --- a/libc/cmake/modules/CheckCompilerFeatures.cmake +++ b/libc/cmake/modules/CheckCompilerFeatures.cmake @@ -57,3 +57,8 @@ foreach(feature IN LISTS ALL_COMPILER_FEATURES) endforeach() message(STATUS "Compiler features available: ${AVAILABLE_COMPILER_FEATURES}") + +### Compiler Feature Detection ### + +# clang-8+, gcc-12+ +check_cxx_compiler_flag("-ftrivial-auto-var-init=pattern" LIBC_CC_SUPPORTS_PATTERN_INIT) diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake index 6eba17ae9201c..b6c2f2d53e88d 100644 --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -41,6 +41,9 @@ function(_get_common_compile_options output_var flags) list(APPEND compile_options "-fno-unwind-tables") list(APPEND compile_options "-fno-asynchronous-unwind-tables") list(APPEND compile_options "-fno-rtti") + if (LIBC_CC_SUPPORTS_PATTERN_INIT) + list(APPEND compile_options "-ftrivial-auto-var-init=pattern") + endif() list(APPEND compile_options "-Wall") list(APPEND compile_options "-Wextra") # -DLIBC_WNO_ERROR=ON if you can't build cleanly with -Werror. diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl index cf27001be9dfe..a28fa51a39f99 100644 --- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl @@ -81,7 +81,12 @@ def libc_function( # We use the explicit equals pattern here because append and += mutate the # original list, where this creates a new list and stores it in deps. copts = copts or [] - copts = copts + ["-O3", "-fno-builtin", "-fno-lax-vector-conversions"] + copts = copts + [ + "-O3", + "-fno-builtin", + "-fno-lax-vector-conversions", + "-ftrivial-auto-var-init=pattern" + ] # We compile the code twice, the first target is suffixed with ".__internal__" and contains the # C++ functions in the "LIBC_NAMESPACE" namespace. This allows us to test the function in the From 6953b367027e4234607a6718a0a1d57eb52ef57e Mon Sep 17 00:00:00 2001 From: Amir Ayupov Date: Mon, 22 Jan 2024 14:56:17 -0800 Subject: [PATCH 500/843] Revert "Reland [Clang][CMake] Support perf, LBR, and Instrument CLANG_BOLT options (#69133)" This reverts commit b83b8d3fd17885438b0ea154e07088d877d293a8. Breaks buildbots e.g. https://lab.llvm.org/buildbot/#/builders/225/builds/29950 --- clang/CMakeLists.txt | 44 +++++--------- clang/cmake/caches/BOLT.cmake | 2 +- clang/utils/perf-training/CMakeLists.txt | 29 +--------- clang/utils/perf-training/bolt.lit.cfg | 47 +++------------ .../utils/perf-training/bolt.lit.site.cfg.in | 2 - clang/utils/perf-training/perf-helper.py | 58 ------------------- 6 files changed, 26 insertions(+), 156 deletions(-) diff --git a/clang/CMakeLists.txt b/clang/CMakeLists.txt index 47fc2e4886cfc..5f2b7f064da43 100644 --- a/clang/CMakeLists.txt +++ b/clang/CMakeLists.txt @@ -876,38 +876,23 @@ if (CLANG_ENABLE_BOOTSTRAP) endforeach() endif() -set(CLANG_BOLT OFF CACHE STRING "Apply BOLT optimization to Clang. 
\ - May be specified as Instrument or Perf or LBR to use a particular profiling \ - mechanism.") -string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) - -if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) +if (CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) set(CLANG_PATH ${LLVM_RUNTIME_OUTPUT_INTDIR}/clang) - set(CLANG_INSTRUMENTED ${LLVM_RUNTIME_OUTPUT_INTDIR}/${CLANG_BOLT_INSTRUMENTED}) + set(CLANG_INSTRUMENTED ${CLANG_PATH}-bolt.inst) set(BOLT_FDATA ${CMAKE_CURRENT_BINARY_DIR}/utils/perf-training/prof.fdata) - # Pass extra flag in no-LBR mode - if (CLANG_BOLT STREQUAL "PERF") - set(BOLT_NO_LBR "-nl") - endif() - - if (CLANG_BOLT STREQUAL "INSTRUMENT") - # Instrument clang with BOLT - add_custom_target(clang-instrumented - DEPENDS ${CLANG_INSTRUMENTED} - ) - add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} - DEPENDS clang llvm-bolt - COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} - -instrument --instrumentation-file-append-pid - --instrumentation-file=${BOLT_FDATA} - COMMENT "Instrumenting clang binary with BOLT" - VERBATIM - ) - add_custom_target(clang-bolt-training-deps DEPENDS clang-instrumented) - else() # perf or LBR - add_custom_target(clang-bolt-training-deps DEPENDS clang) - endif() + # Instrument clang with BOLT + add_custom_target(clang-instrumented + DEPENDS ${CLANG_INSTRUMENTED} + ) + add_custom_command(OUTPUT ${CLANG_INSTRUMENTED} + DEPENDS clang llvm-bolt + COMMAND llvm-bolt ${CLANG_PATH} -o ${CLANG_INSTRUMENTED} + -instrument --instrumentation-file-append-pid + --instrumentation-file=${BOLT_FDATA} + COMMENT "Instrumenting clang binary with BOLT" + VERBATIM + ) # Optimize original (pre-bolt) Clang using the collected profile set(CLANG_OPTIMIZED ${CMAKE_CURRENT_BINARY_DIR}/clang.bolt) @@ -921,7 +906,6 @@ if (CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) -data ${BOLT_FDATA} -reorder-blocks=ext-tsp -reorder-functions=hfsort+ -split-functions -split-all-cold -split-eh -dyno-stats -icf=1 -use-gnu-stack - ${BOLT_NO_LBR} COMMAND ${CMAKE_COMMAND} -E rename ${CLANG_OPTIMIZED} $ COMMENT "Optimizing Clang with BOLT" VERBATIM diff --git a/clang/cmake/caches/BOLT.cmake b/clang/cmake/caches/BOLT.cmake index eba2346b2f4ca..0442f73e5426a 100644 --- a/clang/cmake/caches/BOLT.cmake +++ b/clang/cmake/caches/BOLT.cmake @@ -1,5 +1,5 @@ set(CMAKE_BUILD_TYPE Release CACHE STRING "") -set(CLANG_BOLT "INSTRUMENT" CACHE STRING "") +set(CLANG_BOLT_INSTRUMENT ON CACHE BOOL "") set(CMAKE_EXE_LINKER_FLAGS "-Wl,--emit-relocs,-znow" CACHE STRING "") set(LLVM_ENABLE_PROJECTS "bolt;clang" CACHE STRING "") diff --git a/clang/utils/perf-training/CMakeLists.txt b/clang/utils/perf-training/CMakeLists.txt index 601f40902fa34..c6d51863fb1b5 100644 --- a/clang/utils/perf-training/CMakeLists.txt +++ b/clang/utils/perf-training/CMakeLists.txt @@ -62,9 +62,7 @@ if(APPLE AND DTRACE AND NOT LLVM_TOOL_LLVM_DRIVER_BUILD) DEPENDS generate-dtrace-logs) endif() -if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) - set(CLANG_BOLT_INSTRUMENTED "clang-bolt.inst" CACHE STRING - "Name of BOLT-instrumented Clang binary") +if(CLANG_BOLT_INSTRUMENT AND NOT LLVM_BUILD_INSTRUMENTED) configure_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/bolt.lit.site.cfg.in ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/lit.site.cfg @@ -73,37 +71,16 @@ if(CLANG_BOLT AND NOT LLVM_BUILD_INSTRUMENTED) add_lit_testsuite(generate-bolt-fdata "Generating BOLT profile for Clang" ${CMAKE_CURRENT_BINARY_DIR}/bolt-fdata/ EXCLUDE_FROM_CHECK_ALL - DEPENDS clang-bolt-training-deps clear-bolt-fdata clear-perf-data + DEPENDS clang-instrumented clear-bolt-fdata ) 
add_custom_target(clear-bolt-fdata COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} fdata COMMENT "Clearing old BOLT fdata") - add_custom_target(clear-perf-data - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py clean ${CMAKE_CURRENT_BINARY_DIR} perf.data - COMMENT "Clearing old perf data") - - string(TOUPPER "${CLANG_BOLT}" CLANG_BOLT) - if (CLANG_BOLT STREQUAL "LBR") - set(BOLT_LBR "--lbr") - endif() - - add_custom_target(merge-fdata-deps) - if (CLANG_BOLT STREQUAL "INSTRUMENT") - add_dependencies(merge-fdata-deps generate-bolt-fdata) - else() - # Convert perf profiles into fdata - add_custom_target(convert-perf-fdata - COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py perf2bolt $ ${CMAKE_CURRENT_BINARY_DIR} $ ${BOLT_LBR} - COMMENT "Converting perf files to BOLT fdata" - DEPENDS llvm-bolt generate-bolt-fdata) - add_dependencies(merge-fdata-deps convert-perf-fdata) - endif() - # Merge profiles into one using merge-fdata add_custom_target(clang-bolt-profile COMMAND "${Python3_EXECUTABLE}" ${CMAKE_CURRENT_SOURCE_DIR}/perf-helper.py merge-fdata $ ${CMAKE_CURRENT_BINARY_DIR}/prof.fdata ${CMAKE_CURRENT_BINARY_DIR} COMMENT "Merging BOLT fdata" - DEPENDS merge-fdata merge-fdata-deps) + DEPENDS merge-fdata generate-bolt-fdata) endif() diff --git a/clang/utils/perf-training/bolt.lit.cfg b/clang/utils/perf-training/bolt.lit.cfg index 0e81a5501e9fc..234ac855bd67c 100644 --- a/clang/utils/perf-training/bolt.lit.cfg +++ b/clang/utils/perf-training/bolt.lit.cfg @@ -6,46 +6,15 @@ import lit.util import os import subprocess -clang_bolt_mode = config.clang_bolt_mode.lower() -clang_binary = "clang" -perf_wrapper = f"{config.python_exe} {config.perf_helper_dir}/perf-helper.py perf " +config.clang = os.path.realpath(lit.util.which('clang-bolt.inst', config.clang_tools_dir)).replace('\\', '/') -if clang_bolt_mode == "instrument": - perf_wrapper = "" - clang_binary = config.clang_bolt_name -elif clang_bolt_mode == "lbr": - perf_wrapper += " --lbr -- " -elif clang_bolt_mode == "perf": - perf_wrapper += " -- " -else: - assert 0, "Unsupported CLANG_BOLT_MODE variable" - -config.clang = perf_wrapper + os.path.realpath( - lit.util.which(clang_binary, config.clang_tools_dir) -).replace("\\", "/") - -config.name = "Clang Perf Training" -config.suffixes = [ - ".c", - ".cc", - ".cpp", - ".m", - ".mm", - ".cu", - ".ll", - ".cl", - ".s", - ".S", - ".modulemap", - ".test", -] +config.name = 'Clang Perf Training' +config.suffixes = ['.c', '.cc', '.cpp', '.m', '.mm', '.cu', '.ll', '.cl', '.s', '.S', '.modulemap', '.test'] use_lit_shell = os.environ.get("LIT_USE_INTERNAL_SHELL") config.test_format = lit.formats.ShTest(use_lit_shell == "0") -config.substitutions.append( - ("%clang_cpp_skip_driver", f" {config.clang} --driver-mode=g++ ") -) -config.substitutions.append(("%clang_cpp", f" {config.clang} --driver-mode=g++ ")) -config.substitutions.append(("%clang_skip_driver", config.clang)) -config.substitutions.append(("%clang", config.clang)) -config.substitutions.append(("%test_root", config.test_exec_root)) +config.substitutions.append( ('%clang_cpp_skip_driver', ' %s --driver-mode=g++ ' % (config.clang))) +config.substitutions.append( ('%clang_cpp', ' %s --driver-mode=g++ ' % (config.clang))) +config.substitutions.append( ('%clang_skip_driver', ' %s ' % (config.clang))) +config.substitutions.append( ('%clang', ' %s ' % (config.clang) ) ) +config.substitutions.append( ('%test_root', config.test_exec_root ) ) diff 
--git a/clang/utils/perf-training/bolt.lit.site.cfg.in b/clang/utils/perf-training/bolt.lit.site.cfg.in index 54de12701c1ae..3029319673fc2 100644 --- a/clang/utils/perf-training/bolt.lit.site.cfg.in +++ b/clang/utils/perf-training/bolt.lit.site.cfg.in @@ -9,8 +9,6 @@ config.test_source_root = "@CLANG_PGO_TRAINING_DATA@" config.target_triple = "@LLVM_TARGET_TRIPLE@" config.python_exe = "@Python3_EXECUTABLE@" config.clang_obj_root = path(r"@CLANG_BINARY_DIR@") -config.clang_bolt_mode = "@CLANG_BOLT@" -config.clang_bolt_name = "@CLANG_BOLT_INSTRUMENTED@" # Let the main config do the real work. lit_config.load_config(config, "@CLANG_SOURCE_DIR@/utils/perf-training/bolt.lit.cfg") diff --git a/clang/utils/perf-training/perf-helper.py b/clang/utils/perf-training/perf-helper.py index 959bdba5c98cc..99d6a3333b6ef 100644 --- a/clang/utils/perf-training/perf-helper.py +++ b/clang/utils/perf-training/perf-helper.py @@ -67,62 +67,6 @@ def merge_fdata(args): return 0 -def perf(args): - parser = argparse.ArgumentParser( - prog="perf-helper perf", description="perf wrapper for BOLT profile collection" - ) - parser.add_argument( - "--lbr", action="store_true", help="Use perf with branch stacks" - ) - parser.add_argument("cmd", nargs=argparse.REMAINDER, help="") - - opts = parser.parse_args(args) - cmd = opts.cmd[1:] - - perf_args = [ - "perf", - "record", - "--event=cycles:u", - "--freq=max", - "--output=%d.perf.data" % os.getpid(), - ] - if opts.lbr: - perf_args += ["--branch-filter=any,u"] - perf_args.extend(cmd) - - start_time = time.time() - subprocess.check_call(perf_args) - - elapsed = time.time() - start_time - print("... data collection took %.4fs" % elapsed) - return 0 - - -def perf2bolt(args): - parser = argparse.ArgumentParser( - prog="perf-helper perf2bolt", - description="perf2bolt conversion wrapper for perf.data files", - ) - parser.add_argument("bolt", help="Path to llvm-bolt") - parser.add_argument("path", help="Path containing perf.data files") - parser.add_argument("binary", help="Input binary") - parser.add_argument("--lbr", action="store_true", help="Use LBR perf2bolt mode") - opts = parser.parse_args(args) - - p2b_args = [ - opts.bolt, - opts.binary, - "--aggregate-only", - "--profile-format=yaml", - ] - if not opts.lbr: - p2b_args += ["-nl"] - p2b_args += ["-p"] - for filename in findFilesWithExtension(opts.path, "perf.data"): - subprocess.check_call(p2b_args + [filename, "-o", filename + ".fdata"]) - return 0 - - def dtrace(args): parser = argparse.ArgumentParser( prog="perf-helper dtrace", @@ -563,8 +507,6 @@ def genOrderFile(args): "cc1": cc1, "gen-order-file": genOrderFile, "merge-fdata": merge_fdata, - "perf": perf, - "perf2bolt": perf2bolt, } From 042bb2850dcdd0b8a519f51678f2e40d0a97f548 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Tue, 23 Jan 2024 00:06:07 +0100 Subject: [PATCH 501/843] [tsan] Fix build for FreeBSD and NetBSD after 0784b1eefa36 (#79019) In 0784b1eefa36 some code for re-execution was moved to `ReExecIfNeeded()`, but also extended with a few Linux-only features. 
This leads to compile errors on FreeBSD, or other non-Linux platforms: compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp:247:25: error: use of undeclared identifier 'personality' 247 | int old_personality = personality(0xffffffff); | ^ compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp:249:54: error: use of undeclared identifier 'ADDR_NO_RANDOMIZE' 249 | (old_personality != -1) && ((old_personality & ADDR_NO_RANDOMIZE) == 0); | ^ compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp:281:46: error: use of undeclared identifier 'ADDR_NO_RANDOMIZE' 281 | CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1); | ^ Surround the affected part with a `#if SANITIZER_LINUX` block for now. --- compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp index 9a66a7feb5b39..0d0b1aba1f852 100644 --- a/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp +++ b/compiler-rt/lib/tsan/rtl/tsan_platform_linux.cpp @@ -243,12 +243,13 @@ static void ReExecIfNeeded() { reexec = true; } +# if SANITIZER_LINUX // ASLR personality check. int old_personality = personality(0xffffffff); bool aslr_on = (old_personality != -1) && ((old_personality & ADDR_NO_RANDOMIZE) == 0); -# if SANITIZER_ANDROID && (defined(__aarch64__) || defined(__x86_64__)) +# if SANITIZER_ANDROID && (defined(__aarch64__) || defined(__x86_64__)) // After patch "arm64: mm: support ARCH_MMAP_RND_BITS." is introduced in // linux kernel, the random gap between stack and mapped area is increased // from 128M to 36G on 39-bit aarch64. As it is almost impossible to cover @@ -261,7 +262,7 @@ static void ReExecIfNeeded() { CHECK_NE(personality(old_personality | ADDR_NO_RANDOMIZE), -1); reexec = true; } -# endif +# endif if (reexec) { // Don't check the address space since we're going to re-exec anyway. @@ -288,6 +289,7 @@ static void ReExecIfNeeded() { Die(); } } +# endif // SANITIZER_LINUX if (reexec) ReExec(); From be0c8098364b4bc9b210db65f5799bf37aefb605 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Mon, 22 Jan 2024 22:36:05 +0000 Subject: [PATCH 502/843] [NFC][Debuginfo][RemoveDIs] Switch an insertion to use iterators With the soon-to-land new-debug-info storage model, it's going to be important to use iterators for instruction insertion rather than instruction pointers. This (single line in instcombine) is the last place that trips up our internal testing for debug-info, where we insert a PHI and it should be using an iterator. 
---
 llvm/lib/Transforms/InstCombine/InstructionCombining.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 45f06e3045599..249f4a7710e04 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -4522,7 +4522,7 @@ bool InstCombinerImpl::run() {
         if (isa<PHINode>(I)) // PHI -> Non-PHI
           InsertPos = InstParent->getFirstInsertionPt();
         else // Non-PHI -> PHI
-          InsertPos = InstParent->getFirstNonPHI()->getIterator();
+          InsertPos = InstParent->getFirstNonPHIIt();
       }
 
       Result->insertInto(InstParent, InsertPos);

From d37d1c8deaa34216248ad51e549a701852e29fa3 Mon Sep 17 00:00:00 2001
From: Tom Eccles
Date: Mon, 22 Jan 2024 23:14:09 +0000
Subject: [PATCH 503/843] [flang][driver] deprecate manual usage of
 -lFortran_main (#79016)

Intended to warn users of the 18.x release not to do this. A better
solution should be found for the 19.x release. See discussion in
https://github.com/llvm/llvm-project/pull/78152.

Unfortunately there is no warning on Windows currently. I am rushing to
get this landed before 18.x branches.
---
 .../include/clang/Basic/DiagnosticDriverKinds.td |  2 ++
 clang/lib/Driver/ToolChains/CommonArgs.cpp       | 16 ++++++++++++++--
 flang/test/Driver/linker-flags.f90               |  4 ++++
 3 files changed, 20 insertions(+), 2 deletions(-)

diff --git a/clang/include/clang/Basic/DiagnosticDriverKinds.td b/clang/include/clang/Basic/DiagnosticDriverKinds.td
index 090b169a0e724..094fe19509412 100644
--- a/clang/include/clang/Basic/DiagnosticDriverKinds.td
+++ b/clang/include/clang/Basic/DiagnosticDriverKinds.td
@@ -427,6 +427,8 @@ def warn_drv_clang_unsupported : Warning<
   "the clang compiler does not support '%0'">;
 def warn_drv_deprecated_arg : Warning<
   "argument '%0' is deprecated, use '%1' instead">, InGroup<Deprecated>;
+def warn_drv_deprecated_custom : Warning<
+  "argument '%0' is deprecated, %1">, InGroup<Deprecated>;
 def warn_drv_assuming_mfloat_abi_is : Warning<
   "unknown platform, assuming -mfloat-abi=%0">;
 def warn_drv_unsupported_float_abi_by_lib : Warning<
diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp
index 385f66f3782bc..fadaf3e60c661 100644
--- a/clang/lib/Driver/ToolChains/CommonArgs.cpp
+++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp
@@ -1194,6 +1194,18 @@ static void addFortranMain(const ToolChain &TC, const ArgList &Args,
   }
 
   // 2. GNU and similar
+  const Driver &D = TC.getDriver();
+  const char *FortranMainLinkFlag = "-lFortran_main";
+
+  // Warn if the user added `-lFortran_main` - this library is an implementation
+  // detail of Flang and should be handled automatically by the driver.
+  for (const char *arg : CmdArgs) {
+    if (strncmp(arg, FortranMainLinkFlag, strlen(FortranMainLinkFlag)) == 0)
+      D.Diag(diag::warn_drv_deprecated_custom)
+          << FortranMainLinkFlag
+          << "see the Flang driver documentation for correct usage";
+  }
+
   // The --whole-archive option needs to be part of the link line to make
   // sure that the main() function from Fortran_main.a is pulled in by the
   // linker. However, it shouldn't be used if it's already active.
@@ -1201,12 +1213,12 @@ static void addFortranMain(const ToolChain &TC, const ArgList &Args, if (!isWholeArchivePresent(Args) && !TC.getTriple().isMacOSX() && !TC.getTriple().isOSAIX()) { CmdArgs.push_back("--whole-archive"); - CmdArgs.push_back("-lFortran_main"); + CmdArgs.push_back(FortranMainLinkFlag); CmdArgs.push_back("--no-whole-archive"); return; } - CmdArgs.push_back("-lFortran_main"); + CmdArgs.push_back(FortranMainLinkFlag); } /// Add Fortran runtime libs diff --git a/flang/test/Driver/linker-flags.f90 b/flang/test/Driver/linker-flags.f90 index ea91946316cfa..5e00520fcc098 100644 --- a/flang/test/Driver/linker-flags.f90 +++ b/flang/test/Driver/linker-flags.f90 @@ -11,6 +11,7 @@ ! RUN: %flang -### --target=x86_64-unknown-dragonfly %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,UNIX ! RUN: %flang -### --target=x86_64-unknown-haiku %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,HAIKU ! RUN: %flang -### --target=x86_64-windows-gnu %S/Inputs/hello.f90 2>&1 | FileCheck %s --check-prefixes=CHECK,MINGW +! RUN: %flang -### --target=aarch64-unknown-linux-gnu %S/Inputs/hello.f90 -lFortran_main 2>&1 | FileCheck %s --check-prefixes=DEPRECATED ! NOTE: Clang's driver library, clangDriver, usually adds 'oldnames' on Windows, ! but it is not needed when compiling Fortran code and they might bring in @@ -53,3 +54,6 @@ ! MSVC-LABEL: link ! MSVC-SAME: /subsystem:console ! MSVC-SAME: "[[object_file]]" + +! Check that we warn when using -lFortran_main +! DEPRECATED: warning: argument '-lFortran_main' is deprecated, see the Flang driver documentation for correct usage [-Wdeprecated] From ee0b4d9681047f77d1f7bf53dc185fe140700464 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A9lix-Antoine=20Constantin?= <60141446+felix642@users.noreply.github.com> Date: Mon, 22 Jan 2024 18:14:30 -0500 Subject: [PATCH 504/843] [clang-tidy] Ignore user-defined literals in google-runtime-int (#78859) User-defined literals do not accept u?intXX(_t)? variables. So the check should not emit a warning. Fixes #54546 #25214 --- .../clang-tidy/google/IntegerTypesCheck.cpp | 19 ++++++++++++++----- clang-tools-extra/docs/ReleaseNotes.rst | 3 +++ .../checkers/google/runtime-int.cpp | 10 ++++++---- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp index bb4e1de8cc4b1..ef511e9108f2e 100644 --- a/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp +++ b/clang-tools-extra/clang-tidy/google/IntegerTypesCheck.cpp @@ -36,6 +36,12 @@ static Token getTokenAtLoc(SourceLocation Loc, return Tok; } +namespace { +AST_MATCHER(FunctionDecl, isUserDefineLiteral) { + return Node.getLiteralIdentifier() != nullptr; +} +} // namespace + namespace tidy::google::runtime { IntegerTypesCheck::IntegerTypesCheck(StringRef Name, ClangTidyContext *Context) @@ -56,11 +62,14 @@ void IntegerTypesCheck::registerMatchers(MatchFinder *Finder) { // http://google.github.io/styleguide/cppguide.html#64-bit_Portability // "Where possible, avoid passing arguments of types specified by // bitwidth typedefs to printf-based APIs." 
- Finder->addMatcher(typeLoc(loc(isInteger()), - unless(hasAncestor(callExpr( - callee(functionDecl(hasAttr(attr::Format))))))) - .bind("tl"), - this); + Finder->addMatcher( + typeLoc(loc(isInteger()), + unless(anyOf(hasAncestor(callExpr( + callee(functionDecl(hasAttr(attr::Format))))), + hasParent(parmVarDecl(hasAncestor( + functionDecl(isUserDefineLiteral()))))))) + .bind("tl"), + this); IdentTable = std::make_unique(getLangOpts()); } diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 27f098a54327d..5758b5acbc0b5 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -381,6 +381,9 @@ Changes in existing checks ` check to ignore constructor calls disguised as functional casts. +- Improved :doc:`google-runtime-int ` + check to ignore false positives on user defined-literals. + - Improved :doc:`llvm-namespace-comment ` check to provide fixes for ``inline`` namespaces in the same format as :program:`clang-format`. diff --git a/clang-tools-extra/test/clang-tidy/checkers/google/runtime-int.cpp b/clang-tools-extra/test/clang-tidy/checkers/google/runtime-int.cpp index 4bb739876b7f4..88f0b519e9cbe 100644 --- a/clang-tools-extra/test/clang-tidy/checkers/google/runtime-int.cpp +++ b/clang-tools-extra/test/clang-tidy/checkers/google/runtime-int.cpp @@ -59,11 +59,13 @@ void qux() { // CHECK-MESSAGES: [[@LINE-1]]:3: warning: consider replacing 'short' with 'int16' } -// FIXME: This shouldn't warn, as UD-literal operators require one of a handful -// of types as an argument. struct some_value {}; -constexpr some_value operator"" _some_literal(unsigned long long int i); -// CHECK-MESSAGES: [[@LINE-1]]:47: warning: consider replacing 'unsigned long long' +constexpr some_value operator"" _some_literal(unsigned long long int i) +{ + short j; + // CHECK-MESSAGES: [[@LINE-1]]:3: warning: consider replacing 'short' with 'int16' + return some_value(); +} struct A { A& operator=(const A&); }; class B { A a[0]; }; From 50e2581a199da9702841f103eefca8dca2fc7b4f Mon Sep 17 00:00:00 2001 From: Tom Eccles Date: Mon, 22 Jan 2024 23:16:22 +0000 Subject: [PATCH 505/843] [flang] Allow assumed-shape element pass to dummy arg with ignore_tkr (#78196) This is allowed by gfortran and ifort with `![GCC|DEC]$ ATTRIBUTES NO_ARG_CHECK` --- flang/docs/Directives.md | 7 ++++--- flang/include/flang/Common/Fortran.h | 4 ++-- flang/lib/Semantics/check-call.cpp | 6 ++++-- flang/lib/Semantics/check-declarations.cpp | 5 ----- flang/test/Semantics/ignore_tkr01.f90 | 6 ------ flang/test/Semantics/ignore_tkr03.f90 | 21 +++++++++++++++++++++ 6 files changed, 31 insertions(+), 18 deletions(-) create mode 100644 flang/test/Semantics/ignore_tkr03.f90 diff --git a/flang/docs/Directives.md b/flang/docs/Directives.md index c8a2c087dfad1..134de36f884d7 100644 --- a/flang/docs/Directives.md +++ b/flang/docs/Directives.md @@ -18,9 +18,10 @@ A list of non-standard directives supported by Flang The directive allow actual arguments that would otherwise be diagnosed as incompatible in type (T), kind (K), rank (R), CUDA device (D), or managed (M) status. The letter (A) is a shorthand for all of these, - and is the default when no letters appear. The letter (C) is a legacy - no-op. For example, if one wanted to call a "set all bytes to zero" - utility that could be applied to arrays of any type or rank: + and is the default when no letters appear. 
The letter (C) checks for + contiguity for example allowing an element of an assumed-shape array to be + passed as a dummy argument. For example, if one wanted to call a "set all + bytes to zero" utility that could be applied to arrays of any type or rank: ``` interface subroutine clear(arr,bytes) diff --git a/flang/include/flang/Common/Fortran.h b/flang/include/flang/Common/Fortran.h index 1d3a85e250073..ac1973fdff667 100644 --- a/flang/include/flang/Common/Fortran.h +++ b/flang/include/flang/Common/Fortran.h @@ -105,8 +105,8 @@ ENUM_CLASS(IgnoreTKR, Rank, // R - don't check ranks Device, // D - don't check host/device residence Managed, // M - don't check managed storage - Contiguous) // C - legacy; disabled NVFORTRAN's convention that leading - // dimension of assumed-shape was contiguous + Contiguous) // C - don't check for storage sequence association with a + // potentially non-contiguous object using IgnoreTKRSet = EnumSet; // IGNORE_TKR(A) = IGNORE_TKR(TKRDM) static constexpr IgnoreTKRSet ignoreTKRAll{IgnoreTKR::Type, IgnoreTKR::Kind, diff --git a/flang/lib/Semantics/check-call.cpp b/flang/lib/Semantics/check-call.cpp index a8927e94481d4..c924a817ec7e1 100644 --- a/flang/lib/Semantics/check-call.cpp +++ b/flang/lib/Semantics/check-call.cpp @@ -529,13 +529,15 @@ static void CheckExplicitDataArg(const characteristics::DummyDataObject &dummy, dummyName); } if (actualIsArrayElement && actualLastSymbol && - !evaluate::IsContiguous(*actualLastSymbol, foldingContext)) { + !evaluate::IsContiguous(*actualLastSymbol, foldingContext) && + !dummy.ignoreTKR.test(common::IgnoreTKR::Contiguous)) { if (IsPointer(*actualLastSymbol)) { basicError = true; messages.Say( "Element of pointer array may not be associated with a %s array"_err_en_US, dummyName); - } else if (IsAssumedShape(*actualLastSymbol)) { + } else if (IsAssumedShape(*actualLastSymbol) && + !dummy.ignoreTKR.test(common::IgnoreTKR::Contiguous)) { basicError = true; messages.Say( "Element of assumed-shape array may not be associated with a %s array"_err_en_US, diff --git a/flang/lib/Semantics/check-declarations.cpp b/flang/lib/Semantics/check-declarations.cpp index 8315864bcb718..31ccc77d0993f 100644 --- a/flang/lib/Semantics/check-declarations.cpp +++ b/flang/lib/Semantics/check-declarations.cpp @@ -746,11 +746,6 @@ void CheckHelper::CheckObjectEntity( messages_.Say( "!DIR$ IGNORE_TKR may apply only in an interface or a module procedure"_err_en_US); } - if (ignoreTKR.test(common::IgnoreTKR::Contiguous) && - !IsAssumedShape(symbol)) { - messages_.Say( - "!DIR$ IGNORE_TKR(C) may apply only to an assumed-shape array"_err_en_US); - } if (ownerSymbol && ownerSymbol->attrs().test(Attr::ELEMENTAL) && details.ignoreTKR().test(common::IgnoreTKR::Rank)) { messages_.Say( diff --git a/flang/test/Semantics/ignore_tkr01.f90 b/flang/test/Semantics/ignore_tkr01.f90 index a8fc9dadc1d83..5d1ce32cf81d0 100644 --- a/flang/test/Semantics/ignore_tkr01.f90 +++ b/flang/test/Semantics/ignore_tkr01.f90 @@ -138,12 +138,6 @@ subroutine t20(x) end block end - subroutine t21(x) -!dir$ ignore_tkr(c) x -!ERROR: !DIR$ IGNORE_TKR(C) may apply only to an assumed-shape array - real x(1) - end - subroutine t22(x) !dir$ ignore_tkr(r) x !WARNING: !DIR$ IGNORE_TKR(R) is not meaningful for an assumed-rank array diff --git a/flang/test/Semantics/ignore_tkr03.f90 b/flang/test/Semantics/ignore_tkr03.f90 new file mode 100644 index 0000000000000..4c48308a39964 --- /dev/null +++ b/flang/test/Semantics/ignore_tkr03.f90 @@ -0,0 +1,21 @@ +! 
RUN: %flang_fc1 -fdebug-unparse %s 2>&1 | FileCheck %s +module library +contains + subroutine lib_sub(buf) +!dir$ ignore_tkr(c) buf + real :: buf(1:*) + end subroutine +end module + +module user + use library +contains + subroutine sub(var, ptr) + real :: var(:,:,:) + real, pointer :: ptr(:) +! CHECK: CALL lib_sub + call lib_sub(var(1, 2, 3)) +! CHECK: CALL lib_sub + call lib_sub(ptr(1)) + end subroutine +end module From ff9627348e89b9e1a56714448f33dd42cf847777 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Mon, 22 Jan 2024 23:28:03 +0000 Subject: [PATCH 506/843] [NFC][DebugInfo] Set a testing flag to be hidden This flag (--try-experimental-debuginfo-iterators) only exists for testing purposes, to get some RUNlines running in new-debug-info mode before it's properly supported. The flag isn't something that's going to be useful to people using llvm 18, so hide it from the options list. --- llvm/tools/llc/llc.cpp | 2 +- llvm/tools/opt/opt.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp index 4a1957588a224..02187f7048a10 100644 --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -189,7 +189,7 @@ static cl::opt RemarksFormat( static cl::opt TryUseNewDbgInfoFormat( "try-experimental-debuginfo-iterators", cl::desc("Enable debuginfo iterator positions, if they're built in"), - cl::init(false)); + cl::init(false), cl::Hidden); extern cl::opt UseNewDbgInfoFormat; diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index c649e6ecddc08..8c0e98976e602 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -282,7 +282,7 @@ static cl::list static cl::opt TryUseNewDbgInfoFormat( "try-experimental-debuginfo-iterators", cl::desc("Enable debuginfo iterator positions, if they're built in"), - cl::init(false)); + cl::init(false), cl::Hidden); extern cl::opt UseNewDbgInfoFormat; From 3ab8d2aac7bc2dd45dda3db0b8a71fd27eefb749 Mon Sep 17 00:00:00 2001 From: Dinar Temirbulatov Date: Mon, 22 Jan 2024 23:40:53 +0000 Subject: [PATCH 507/843] [AArch64][compiler-rt] Add memcpy, memset, memmove, memchr builtins. (#77496) Add naive implementation of memcpy, memset, memmove, memchr for SME targets. 
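As a rough usage sketch (not part of this patch), streaming-compatible
code is expected to go through the __arm_sc_* names instead of the plain
libc entry points, which may be implemented with vector code that is
invalid in streaming mode:

  #include <stddef.h>

  void *__arm_sc_memcpy(void *, const void *, size_t);

  // Hypothetical caller: safe to run while PSTATE.SM may be set, where
  // a plain call to memcpy might not be.
  void copy_row(void *dst, const void *src, size_t n)
      __arm_streaming_compatible {
    __arm_sc_memcpy(dst, src, n);
  }
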
Co-authored-by: David Sherwood --- compiler-rt/cmake/builtin-config-ix.cmake | 8 +- compiler-rt/lib/builtins/CMakeLists.txt | 5 +- .../lib/builtins/aarch64/sme-libc-routines.c | 87 +++++++++++++ .../test/builtins/Unit/sme-string-test.cpp | 120 ++++++++++++++++++ compiler-rt/test/lit.common.cfg.py | 3 + compiler-rt/test/lit.common.configured.in | 1 + .../unittests/lit.common.unit.configured.in | 1 + 7 files changed, 220 insertions(+), 5 deletions(-) create mode 100644 compiler-rt/lib/builtins/aarch64/sme-libc-routines.c create mode 100644 compiler-rt/test/builtins/Unit/sme-string-test.cpp diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake index b40138aa011f8..b17c43bf6a68b 100644 --- a/compiler-rt/cmake/builtin-config-ix.cmake +++ b/compiler-rt/cmake/builtin-config-ix.cmake @@ -35,10 +35,12 @@ asm(\".arch armv8-a+lse\"); asm(\"cas w0, w1, [x2]\"); ") -builtin_check_c_compiler_source(COMPILER_RT_HAS_ASM_SME +builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME " -asm(\".arch armv9-a+sme\"); -asm(\"smstart\"); +void foo(void) __arm_streaming_compatible { + asm(\".arch armv9-a+sme\"); + asm(\"smstart\"); +} ") if(ANDROID) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 378884bcaf2e5..28ded8766f253 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -560,9 +560,10 @@ set(aarch64_SOURCES aarch64/fp_mode.c ) -if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD)) - list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c) +if(COMPILER_RT_HAS_AARCH64_SME AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD)) + list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c) message(STATUS "AArch64 SME ABI routines enabled") + set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin") else() message(STATUS "AArch64 SME ABI routines disabled") endif() diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c new file mode 100644 index 0000000000000..cd73025a19cc1 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c @@ -0,0 +1,87 @@ +#include + +// WARNING: When building the scalar versions of these functions you need to +// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang +// from recognising a loop idiom and planting calls to memcpy! + +static void *__arm_sc_memcpy_fwd(void *dest, const void *src, + size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + const unsigned char *srcp = (const unsigned char *)src; + for (size_t i = 0; i < n; ++i) + destp[i] = srcp[i]; + + return dest; +} + +// If dest and src overlap then behaviour is undefined, hence we can add the +// restrict keywords here. This also matches the definition of the libc memcpy +// according to the man page. 
+void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src, + size_t n) __arm_streaming_compatible { + return __arm_sc_memcpy_fwd(dest, src, n); +} + +void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + unsigned char c8 = (unsigned char)c; + for (size_t i = 0; i < n; ++i) + destp[i] = c8; + + return dest; +} + +static void *__arm_sc_memcpy_rev(void *dest, const void *src, + size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + const unsigned char *srcp = (const unsigned char *)src; + // TODO: Improve performance by copying larger chunks in reverse, or by + // using SVE. + while (n > 0) { + --n; + destp[n] = srcp[n]; + } + return dest; +} + +// Semantically a memmove is equivalent to the following: +// 1. Copy the entire contents of src to a temporary array that does not +// overlap with src or dest. +// 2. Copy the contents of the temporary array into dest. +void *__arm_sc_memmove(void *dest, const void *src, + size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + const unsigned char *srcp = (const unsigned char *)src; + + // If src and dest don't overlap then just invoke memcpy + if ((srcp > (destp + n)) || (destp > (srcp + n))) + return __arm_sc_memcpy_fwd(dest, src, n); + + // Overlap case 1: + // src: Low | -> | High + // dest: Low | -> | High + // Here src is always ahead of dest at a higher addres. If we first read a + // chunk of data from src we can safely write the same chunk to dest without + // corrupting future reads of src. + if (srcp > destp) + return __arm_sc_memcpy_fwd(dest, src, n); + + // Overlap case 2: + // src: Low | -> | High + // dest: Low | -> | High + // While we're in the overlap region we're always corrupting future reads of + // src when writing to dest. An efficient way to do this is to copy the data + // in reverse by starting at the highest address. 
+ return __arm_sc_memcpy_rev(dest, src, n); +} + +const void *__arm_sc_memchr(const void *src, int c, + size_t n) __arm_streaming_compatible { + const unsigned char *srcp = (const unsigned char *)src; + unsigned char c8 = (unsigned char)c; + for (size_t i = 0; i < n; ++i) + if (srcp[i] == c8) + return &srcp[i]; + + return NULL; +} diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.cpp b/compiler-rt/test/builtins/Unit/sme-string-test.cpp new file mode 100644 index 0000000000000..3bc4559f9ae04 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/sme-string-test.cpp @@ -0,0 +1,120 @@ +// REQUIRES: aarch64-target-arch, aarch64-sme-available +// RUN: %clangxx_builtins %s %librt -o %t && %run %t + +#include +#include +#include +#include +#include + +extern "C" { +void *__arm_sc_memcpy(void *, const void *, size_t); +void *__arm_sc_memset(void *, int, size_t); +void *__arm_sc_memmove(void *, const void *, size_t); +void *__arm_sc_memchr(const void *, int, size_t); +} + +template class Memory { +public: + uint8_t ptr[N]; + unsigned size; + + Memory(unsigned stride = 0) { + size = N; + if (stride == 0) + return; + for (unsigned i = 0; i < N; i++) + ptr[i] = i * stride; + } + + void assert_equal(const Memory &other) { + assert(N == other.size); + assert(memcmp(ptr, other.ptr, N) == 0); + } + + void assert_equal(std::initializer_list s) { + assert(N == s.size()); + auto it = s.begin(); + for (unsigned i = 0; i < N; ++i) + assert(ptr[i] == *it++); + } + + void assert_elemt_equal_at(unsigned I, uint8_t elem) { + assert(ptr[I] == elem); + } +}; + +int main() { + + // Testing memcpy from src to dst. + { + Memory<8> src(1); + Memory<8> dst; + if (!__arm_sc_memcpy(dst.ptr, src.ptr, 8)) + abort(); + dst.assert_equal(src); + dst.assert_equal({0, 1, 2, 3, 4, 5, 6, 7}); + } + + // Testing memcpy from src to dst with pointer offset. + { + Memory<8> src(1); + Memory<8> dst(1); + if (!__arm_sc_memcpy(dst.ptr + 1, src.ptr, 6)) + abort(); + dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); + } + + // Testing memchr. + { + Memory<8> src(4); + for (unsigned i = 0; i < 8; ++i) { + uint8_t e = src.ptr[i]; + uint8_t *elem = (uint8_t *)memchr(src.ptr, e, 8); + if (!elem) + abort(); + src.assert_elemt_equal_at(elem - src.ptr, *elem); + for (unsigned i = 0; i < 8; ++i) + assert(__arm_sc_memchr(src.ptr, src.ptr[i], 8) == + memchr(src.ptr, src.ptr[i], 8)); + } + } + + // Testing memset. + { + Memory<8> array; + if (!__arm_sc_memset(array.ptr, 2, 8)) + abort(); + array.assert_equal({2, 2, 2, 2, 2, 2, 2, 2}); + } + + // Testing memset with pointer offset. + { + Memory<8> array(1); + if (!__arm_sc_memset(array.ptr + 1, 2, 6)) + abort(); + array.assert_equal({0, 2, 2, 2, 2, 2, 2, 7}); + } + + // Testing memmove with a simple non-overlap case. + { + Memory<8> src(1); + Memory<8> dst(1); + if (!__arm_sc_memmove(dst.ptr + 1, src.ptr, 6)) + abort(); + dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); + } + + // Testing memove with overlap pointers dst > src, dst < src. 
+ { + Memory<8> srcdst(1); + if (!__arm_sc_memmove(srcdst.ptr + 1, srcdst.ptr, 6)) + abort(); + srcdst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); + if (!__arm_sc_memmove(srcdst.ptr, srcdst.ptr + 1, 6)) + abort(); + srcdst.assert_equal({0, 1, 2, 3, 4, 5, 5, 7}); + } + + return 0; +} diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 1753a55508c7c..113777b0ea8a1 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -454,6 +454,9 @@ def get_ios_commands_dir(): if config.has_lld: config.available_features.add("lld-available") +if config.aarch64_sme: + config.available_features.add("aarch64-sme-available") + if config.use_lld: config.available_features.add("lld") diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in index 7c2d53520099a..b93e20e80a6ed 100644 --- a/compiler-rt/test/lit.common.configured.in +++ b/compiler-rt/test/lit.common.configured.in @@ -50,6 +50,7 @@ set_default("gwp_asan", @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@) set_default("expensive_checks", @LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL@) set_default("test_standalone_build_libs", @COMPILER_RT_TEST_STANDALONE_BUILD_LIBS_PYBOOL@) set_default("has_compiler_rt_libatomic", @COMPILER_RT_BUILD_STANDALONE_LIBATOMIC_PYBOOL@) +set_default("aarch64_sme", @COMPILER_RT_HAS_AARCH64_SME@) # True iff the test suite supports ignoring the test compiler's runtime library path # and using `config.compiler_rt_libdir` instead. This only matters when the runtime # library paths differ. diff --git a/compiler-rt/unittests/lit.common.unit.configured.in b/compiler-rt/unittests/lit.common.unit.configured.in index 3e42e83c9e70a..0d4785e0f0387 100644 --- a/compiler-rt/unittests/lit.common.unit.configured.in +++ b/compiler-rt/unittests/lit.common.unit.configured.in @@ -7,6 +7,7 @@ config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") config.compiler_rt_src_root = "@COMPILER_RT_SOURCE_DIR@" config.compiler_rt_libdir = lit_config.substitute("@COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR@") +config.aarch64_sme = "@COMPILER_RT_HAS_AARCH64_SME@" config.enable_per_target_runtime_dir = @LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_PYBOOL@ config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@") config.host_arch = "@HOST_ARCH@" From d922c82447e51921ee0545c398f60eb3a6ee24a3 Mon Sep 17 00:00:00 2001 From: Anlun Xu Date: Mon, 22 Jan 2024 15:41:44 -0800 Subject: [PATCH 508/843] [Blaze] Fix build file --- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index c0c16957c5c89..632450f666587 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -9597,6 +9597,7 @@ td_library( includes = ["include"], deps = [ ":AtomicInterfacesTdFiles", + ":LoopLikeInterfaceTdFiles", ":OpBaseTdFiles", ], ) @@ -9747,6 +9748,7 @@ cc_library( ":ControlFlowInterfaces", ":IR", ":LLVMDialect", + ":LoopLikeInterface", ":MemRefDialect", ":OpenACCOpsIncGen", ":OpenACCOpsInterfacesIncGen", From a2d68b4bece54bcfa7bfde1e80058ab19b6d1775 Mon Sep 17 00:00:00 2001 From: David Green Date: Mon, 22 Jan 2024 23:46:58 +0000 Subject: [PATCH 509/843] [SelectOpt] Add handling for Select-like operations. (#77284) Some operations behave like selects. 
For example `or(zext(c), y)` is the same as select(c, y|1, y)` and instcombine can canonicalize the select to the or form. These operations can still be worthwhile converting to branch as opposed to keeping as a select or or instruction. This patch attempts to add some basic handling for them, creating a SelectLike abstraction in the select optimization pass. The backend can opt into handling `or(zext(c),x)` as a select if it could be profitable, and the select optimization pass attempts to handle them in much the same way as a `select(c, x|1, x)`. The Or(x, 1) may need to be added as a new instruction, generated as the or is converted to branches. This helps fix a regression from selects being converted to or's recently. --- .../llvm/Analysis/TargetTransformInfo.h | 10 + .../llvm/Analysis/TargetTransformInfoImpl.h | 9 + llvm/lib/Analysis/TargetTransformInfo.cpp | 5 + llvm/lib/CodeGen/SelectOptimize.cpp | 395 ++++++++++++------ .../AArch64/AArch64TargetTransformInfo.cpp | 15 + .../AArch64/AArch64TargetTransformInfo.h | 2 + llvm/test/CodeGen/AArch64/selectopt.ll | 31 +- 7 files changed, 339 insertions(+), 128 deletions(-) diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 9697278eaeaee..3b615bc700bbb 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -934,6 +934,12 @@ class TargetTransformInfo { /// Should the Select Optimization pass be enabled and ran. bool enableSelectOptimize() const; + /// Should the Select Optimization pass treat the given instruction like a + /// select, potentially converting it to a conditional branch. This can + /// include select-like instructions like or(zext(c), x) that can be converted + /// to selects. + bool shouldTreatInstructionLikeSelect(const Instruction *I) const; + /// Enable matching of interleaved access groups. bool enableInterleavedAccessVectorization() const; @@ -1878,6 +1884,7 @@ class TargetTransformInfo::Concept { virtual MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const = 0; virtual bool enableSelectOptimize() = 0; + virtual bool shouldTreatInstructionLikeSelect(const Instruction *I) = 0; virtual bool enableInterleavedAccessVectorization() = 0; virtual bool enableMaskedInterleavedAccessVectorization() = 0; virtual bool isFPVectorizationPotentiallyUnsafe() = 0; @@ -2415,6 +2422,9 @@ class TargetTransformInfo::Model final : public TargetTransformInfo::Concept { bool enableSelectOptimize() override { return Impl.enableSelectOptimize(); } + bool shouldTreatInstructionLikeSelect(const Instruction *I) override { + return Impl.shouldTreatInstructionLikeSelect(I); + } bool enableInterleavedAccessVectorization() override { return Impl.enableInterleavedAccessVectorization(); } diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 60eab53fa2f60..9958b4daa6ed8 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -378,6 +378,15 @@ class TargetTransformInfoImplBase { bool enableSelectOptimize() const { return true; } + bool shouldTreatInstructionLikeSelect(const Instruction *I) { + // If the select is a logical-and/logical-or then it is better treated as a + // and/or by the backend. 
+ using namespace llvm::PatternMatch; + return isa(I) && + !match(I, m_CombineOr(m_LogicalAnd(m_Value(), m_Value()), + m_LogicalOr(m_Value(), m_Value()))); + } + bool enableInterleavedAccessVectorization() const { return false; } bool enableMaskedInterleavedAccessVectorization() const { return false; } diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index a5a18a538d768..8902dde37cbca 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -604,6 +604,11 @@ bool TargetTransformInfo::enableSelectOptimize() const { return TTIImpl->enableSelectOptimize(); } +bool TargetTransformInfo::shouldTreatInstructionLikeSelect( + const Instruction *I) const { + return TTIImpl->shouldTreatInstructionLikeSelect(I); +} + bool TargetTransformInfo::enableInterleavedAccessVectorization() const { return TTIImpl->enableInterleavedAccessVectorization(); } diff --git a/llvm/lib/CodeGen/SelectOptimize.cpp b/llvm/lib/CodeGen/SelectOptimize.cpp index d14283c521554..9c720864358e0 100644 --- a/llvm/lib/CodeGen/SelectOptimize.cpp +++ b/llvm/lib/CodeGen/SelectOptimize.cpp @@ -42,6 +42,7 @@ #include using namespace llvm; +using namespace llvm::PatternMatch; #define DEBUG_TYPE "select-optimize" @@ -114,12 +115,6 @@ class SelectOptimizeImpl { PreservedAnalyses run(Function &F, FunctionAnalysisManager &FAM); bool runOnFunction(Function &F, Pass &P); -private: - // Select groups consist of consecutive select instructions with the same - // condition. - using SelectGroup = SmallVector; - using SelectGroups = SmallVector; - using Scaled64 = ScaledNumber; struct CostInfo { @@ -129,6 +124,145 @@ class SelectOptimizeImpl { Scaled64 NonPredCost; }; + /// SelectLike is an abstraction over SelectInst and other operations that can + /// act like selects. For example Or(Zext(icmp), X) can be treated like + /// select(icmp, X|1, X). + class SelectLike { + SelectLike(Instruction *I) : I(I) {} + + Instruction *I; + + public: + /// Match a select or select-like instruction, returning a SelectLike. + static SelectLike match(Instruction *I) { + // Select instruction are what we are usually looking for. + if (isa(I)) + return SelectLike(I); + + // An Or(zext(i1 X), Y) can also be treated like a select, with condition + // C and values Y|1 and Y. + Value *X; + if (PatternMatch::match( + I, m_c_Or(m_OneUse(m_ZExt(m_Value(X))), m_Value())) && + X->getType()->isIntegerTy(1)) + return SelectLike(I); + + return SelectLike(nullptr); + } + + bool isValid() { return I; } + operator bool() { return isValid(); } + + Instruction *getI() { return I; } + const Instruction *getI() const { return I; } + + Type *getType() const { return I->getType(); } + + /// Return the condition for the SelectLike instruction. For example the + /// condition of a select or c in `or(zext(c), x)` + Value *getCondition() const { + if (auto *Sel = dyn_cast(I)) + return Sel->getCondition(); + // Or(zext) case + if (auto *BO = dyn_cast(I)) { + Value *X; + if (PatternMatch::match(BO->getOperand(0), + m_OneUse(m_ZExt(m_Value(X))))) + return X; + if (PatternMatch::match(BO->getOperand(1), + m_OneUse(m_ZExt(m_Value(X))))) + return X; + } + + llvm_unreachable("Unhandled case in getCondition"); + } + + /// Return the true value for the SelectLike instruction. Note this may not + /// exist for all SelectLike instructions. For example, for `or(zext(c), x)` + /// the true value would be `or(x,1)`. As this value does not exist, nullptr + /// is returned. 
+ Value *getTrueValue() const { + if (auto *Sel = dyn_cast(I)) + return Sel->getTrueValue(); + // Or(zext) case - The true value is Or(X), so return nullptr as the value + // does not yet exist. + if (isa(I)) + return nullptr; + + llvm_unreachable("Unhandled case in getTrueValue"); + } + + /// Return the false value for the SelectLike instruction. For example the + /// getFalseValue of a select or `x` in `or(zext(c), x)` (which is + /// `select(c, x|1, x)`) + Value *getFalseValue() const { + if (auto *Sel = dyn_cast(I)) + return Sel->getFalseValue(); + // Or(zext) case - return the operand which is not the zext. + if (auto *BO = dyn_cast(I)) { + Value *X; + if (PatternMatch::match(BO->getOperand(0), + m_OneUse(m_ZExt(m_Value(X))))) + return BO->getOperand(1); + if (PatternMatch::match(BO->getOperand(1), + m_OneUse(m_ZExt(m_Value(X))))) + return BO->getOperand(0); + } + + llvm_unreachable("Unhandled case in getFalseValue"); + } + + /// Return the NonPredCost cost of the true op, given the costs in + /// InstCostMap. This may need to be generated for select-like instructions. + Scaled64 getTrueOpCost(DenseMap &InstCostMap, + const TargetTransformInfo *TTI) { + if (auto *Sel = dyn_cast(I)) + if (auto *I = dyn_cast(Sel->getTrueValue())) + return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost + : Scaled64::getZero(); + + // Or case - add the cost of an extra Or to the cost of the False case. + if (isa(I)) + if (auto I = dyn_cast(getFalseValue())) + if (InstCostMap.contains(I)) { + InstructionCost OrCost = TTI->getArithmeticInstrCost( + Instruction::Or, I->getType(), TargetTransformInfo::TCK_Latency, + {TargetTransformInfo::OK_AnyValue, + TargetTransformInfo::OP_None}, + {TTI::OK_UniformConstantValue, TTI::OP_PowerOf2}); + return InstCostMap[I].NonPredCost + + Scaled64::get(*OrCost.getValue()); + } + + return Scaled64::getZero(); + } + + /// Return the NonPredCost cost of the false op, given the costs in + /// InstCostMap. This may need to be generated for select-like instructions. + Scaled64 + getFalseOpCost(DenseMap &InstCostMap, + const TargetTransformInfo *TTI) { + if (auto *Sel = dyn_cast(I)) + if (auto *I = dyn_cast(Sel->getFalseValue())) + return InstCostMap.contains(I) ? InstCostMap[I].NonPredCost + : Scaled64::getZero(); + + // Or case - return the cost of the false case + if (isa(I)) + if (auto I = dyn_cast(getFalseValue())) + if (InstCostMap.contains(I)) + return InstCostMap[I].NonPredCost; + + return Scaled64::getZero(); + } + }; + +private: + // Select groups consist of consecutive select instructions with the same + // condition. + using SelectGroup = SmallVector; + using SelectGroups = SmallVector; + // Converts select instructions of a function to conditional jumps when deemed // profitable. Returns true if at least one select was converted. bool optimizeSelects(Function &F); @@ -156,12 +290,12 @@ class SelectOptimizeImpl { // Determines if a select group should be converted to a branch (base // heuristics). - bool isConvertToBranchProfitableBase(const SmallVector &ASI); + bool isConvertToBranchProfitableBase(const SelectGroup &ASI); // Returns true if there are expensive instructions in the cold value // operand's (if any) dependence slice of any of the selects of the given // group. 
- bool hasExpensiveColdOperand(const SmallVector &ASI); + bool hasExpensiveColdOperand(const SelectGroup &ASI); // For a given source instruction, collect its backwards dependence slice // consisting of instructions exclusively computed for producing the operands @@ -170,7 +304,7 @@ class SelectOptimizeImpl { Instruction *SI, bool ForSinking = false); // Returns true if the condition of the select is highly predictable. - bool isSelectHighlyPredictable(const SelectInst *SI); + bool isSelectHighlyPredictable(const SelectLike SI); // Loop-level checks to determine if a non-predicated version (with branches) // of the given loop is more profitable than its predicated version. @@ -183,20 +317,21 @@ class SelectOptimizeImpl { CostInfo *LoopCost); // Returns a set of all the select instructions in the given select groups. - SmallPtrSet getSIset(const SelectGroups &SIGroups); + SmallDenseMap + getSImap(const SelectGroups &SIGroups); // Returns the latency cost of a given instruction. std::optional computeInstCost(const Instruction *I); // Returns the misprediction cost of a given select when converted to branch. - Scaled64 getMispredictionCost(const SelectInst *SI, const Scaled64 CondCost); + Scaled64 getMispredictionCost(const SelectLike SI, const Scaled64 CondCost); // Returns the cost of a branch when the prediction is correct. Scaled64 getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, - const SelectInst *SI); + const SelectLike SI); // Returns true if the target architecture supports lowering a given select. - bool isSelectKindSupported(SelectInst *SI); + bool isSelectKindSupported(const SelectLike SI); }; class SelectOptimize : public FunctionPass { @@ -368,15 +503,26 @@ void SelectOptimizeImpl::optimizeSelectsInnerLoops(Function &F, /// select instructions in \p Selects, look through the defining select /// instruction until the true/false value is not defined in \p Selects. static Value * -getTrueOrFalseValue(SelectInst *SI, bool isTrue, - const SmallPtrSet &Selects) { +getTrueOrFalseValue(SelectOptimizeImpl::SelectLike SI, bool isTrue, + const SmallPtrSet &Selects, + IRBuilder<> &IB) { Value *V = nullptr; - for (SelectInst *DefSI = SI; DefSI != nullptr && Selects.count(DefSI); + for (SelectInst *DefSI = dyn_cast(SI.getI()); + DefSI != nullptr && Selects.count(DefSI); DefSI = dyn_cast(V)) { - assert(DefSI->getCondition() == SI->getCondition() && + assert(DefSI->getCondition() == SI.getCondition() && "The condition of DefSI does not match with SI"); V = (isTrue ? DefSI->getTrueValue() : DefSI->getFalseValue()); } + + if (isa(SI.getI())) { + assert(SI.getI()->getOpcode() == Instruction::Or && + "Only currently handling Or instructions."); + V = SI.getFalseValue(); + if (isTrue) + V = IB.CreateOr(V, ConstantInt::get(V->getType(), 1)); + } + assert(V && "Failed to get select true/false value"); return V; } @@ -424,20 +570,22 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { SmallVector, 2> TrueSlices, FalseSlices; typedef std::stack::size_type StackSizeType; StackSizeType maxTrueSliceLen = 0, maxFalseSliceLen = 0; - for (SelectInst *SI : ASI) { + for (SelectLike SI : ASI) { // For each select, compute the sinkable dependence chains of the true and // false operands. 
- if (auto *TI = dyn_cast(SI->getTrueValue())) { + if (auto *TI = dyn_cast_or_null(SI.getTrueValue())) { std::stack TrueSlice; - getExclBackwardsSlice(TI, TrueSlice, SI, true); + getExclBackwardsSlice(TI, TrueSlice, SI.getI(), true); maxTrueSliceLen = std::max(maxTrueSliceLen, TrueSlice.size()); TrueSlices.push_back(TrueSlice); } - if (auto *FI = dyn_cast(SI->getFalseValue())) { - std::stack FalseSlice; - getExclBackwardsSlice(FI, FalseSlice, SI, true); - maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size()); - FalseSlices.push_back(FalseSlice); + if (auto *FI = dyn_cast_or_null(SI.getFalseValue())) { + if (isa(SI.getI()) || !FI->hasOneUse()) { + std::stack FalseSlice; + getExclBackwardsSlice(FI, FalseSlice, SI.getI(), true); + maxFalseSliceLen = std::max(maxFalseSliceLen, FalseSlice.size()); + FalseSlices.push_back(FalseSlice); + } } } // In the case of multiple select instructions in the same group, the order @@ -469,10 +617,10 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { } // We split the block containing the select(s) into two blocks. - SelectInst *SI = ASI.front(); - SelectInst *LastSI = ASI.back(); - BasicBlock *StartBlock = SI->getParent(); - BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI)); + SelectLike SI = ASI.front(); + SelectLike LastSI = ASI.back(); + BasicBlock *StartBlock = SI.getI()->getParent(); + BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI.getI())); BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end"); BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock)); // Delete the unconditional branch that was just created by the split. @@ -481,8 +629,8 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { // Move any debug/pseudo instructions that were in-between the select // group to the newly-created end block. SmallVector DebugPseudoINS; - auto DIt = SI->getIterator(); - while (&*DIt != LastSI) { + auto DIt = SI.getI()->getIterator(); + while (&*DIt != LastSI.getI()) { if (DIt->isDebugOrPseudoInst()) DebugPseudoINS.push_back(&*DIt); DIt++; @@ -504,8 +652,8 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { // Iterate over all instructions in between SI and LastSI, not including // SI itself. These are all the variable assignments that happen "in the // middle" of the select group. - auto R = make_range(std::next(SI->getIterator()), - std::next(LastSI->getIterator())); + auto R = make_range(std::next(SI.getI()->getIterator()), + std::next(LastSI.getI()->getIterator())); llvm::for_each(R, TransferDPValues); // These are the new basic blocks for the conditional branch. 
@@ -513,18 +661,19 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { BasicBlock *TrueBlock = nullptr, *FalseBlock = nullptr; BranchInst *TrueBranch = nullptr, *FalseBranch = nullptr; if (!TrueSlicesInterleaved.empty()) { - TrueBlock = BasicBlock::Create(LastSI->getContext(), "select.true.sink", + TrueBlock = BasicBlock::Create(EndBlock->getContext(), "select.true.sink", EndBlock->getParent(), EndBlock); TrueBranch = BranchInst::Create(EndBlock, TrueBlock); - TrueBranch->setDebugLoc(LastSI->getDebugLoc()); + TrueBranch->setDebugLoc(LastSI.getI()->getDebugLoc()); for (Instruction *TrueInst : TrueSlicesInterleaved) TrueInst->moveBefore(TrueBranch); } if (!FalseSlicesInterleaved.empty()) { - FalseBlock = BasicBlock::Create(LastSI->getContext(), "select.false.sink", - EndBlock->getParent(), EndBlock); + FalseBlock = + BasicBlock::Create(EndBlock->getContext(), "select.false.sink", + EndBlock->getParent(), EndBlock); FalseBranch = BranchInst::Create(EndBlock, FalseBlock); - FalseBranch->setDebugLoc(LastSI->getDebugLoc()); + FalseBranch->setDebugLoc(LastSI.getI()->getDebugLoc()); for (Instruction *FalseInst : FalseSlicesInterleaved) FalseInst->moveBefore(FalseBranch); } @@ -534,10 +683,10 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { assert(TrueBlock == nullptr && "Unexpected basic block transform while optimizing select"); - FalseBlock = BasicBlock::Create(SI->getContext(), "select.false", + FalseBlock = BasicBlock::Create(StartBlock->getContext(), "select.false", EndBlock->getParent(), EndBlock); auto *FalseBranch = BranchInst::Create(EndBlock, FalseBlock); - FalseBranch->setDebugLoc(SI->getDebugLoc()); + FalseBranch->setDebugLoc(SI.getI()->getDebugLoc()); } // Insert the real conditional branch based on the original condition. @@ -558,44 +707,36 @@ void SelectOptimizeImpl::convertProfitableSIGroups(SelectGroups &ProfSIGroups) { TT = TrueBlock; FT = FalseBlock; } - IRBuilder<> IB(SI); - auto *CondFr = - IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen"); - IB.CreateCondBr(CondFr, TT, FT, SI); + IRBuilder<> IB(SI.getI()); + auto *CondFr = IB.CreateFreeze(SI.getCondition(), + SI.getCondition()->getName() + ".frozen"); SmallPtrSet INS; - INS.insert(ASI.begin(), ASI.end()); + for (auto SI : ASI) + INS.insert(SI.getI()); + // Use reverse iterator because later select may use the value of the // earlier select, and we need to propagate value through earlier select // to get the PHI operand. for (auto It = ASI.rbegin(); It != ASI.rend(); ++It) { - SelectInst *SI = *It; + SelectLike SI = *It; // The select itself is replaced with a PHI Node. 
- PHINode *PN = PHINode::Create(SI->getType(), 2, ""); + PHINode *PN = PHINode::Create(SI.getType(), 2, ""); PN->insertBefore(EndBlock->begin()); - PN->takeName(SI); - PN->addIncoming(getTrueOrFalseValue(SI, true, INS), TrueBlock); - PN->addIncoming(getTrueOrFalseValue(SI, false, INS), FalseBlock); - PN->setDebugLoc(SI->getDebugLoc()); - - SI->replaceAllUsesWith(PN); - SI->eraseFromParent(); - INS.erase(SI); + PN->takeName(SI.getI()); + PN->addIncoming(getTrueOrFalseValue(SI, true, INS, IB), TrueBlock); + PN->addIncoming(getTrueOrFalseValue(SI, false, INS, IB), FalseBlock); + PN->setDebugLoc(SI.getI()->getDebugLoc()); + SI.getI()->replaceAllUsesWith(PN); + INS.erase(SI.getI()); ++NumSelectsConverted; } - } -} - -static bool isSpecialSelect(SelectInst *SI) { - using namespace llvm::PatternMatch; + IB.CreateCondBr(CondFr, TT, FT, SI.getI()); - // If the select is a logical-and/logical-or then it is better treated as a - // and/or by the backend. - if (match(SI, m_CombineOr(m_LogicalAnd(m_Value(), m_Value()), - m_LogicalOr(m_Value(), m_Value())))) - return true; - - return false; + // Remove the old select instructions, now that they are not longer used. + for (auto SI : ASI) + SI.getI()->eraseFromParent(); + } } void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB, @@ -603,22 +744,30 @@ void SelectOptimizeImpl::collectSelectGroups(BasicBlock &BB, BasicBlock::iterator BBIt = BB.begin(); while (BBIt != BB.end()) { Instruction *I = &*BBIt++; - if (SelectInst *SI = dyn_cast(I)) { - if (isSpecialSelect(SI)) + if (SelectLike SI = SelectLike::match(I)) { + if (!TTI->shouldTreatInstructionLikeSelect(I)) continue; SelectGroup SIGroup; SIGroup.push_back(SI); while (BBIt != BB.end()) { Instruction *NI = &*BBIt; - SelectInst *NSI = dyn_cast(NI); - if (NSI && SI->getCondition() == NSI->getCondition()) { + // Debug/pseudo instructions should be skipped and not prevent the + // formation of a select group. + if (NI->isDebugOrPseudoInst()) { + ++BBIt; + continue; + } + // We only allow selects in the same group, not other select-like + // instructions. + if (!isa(NI)) + break; + + SelectLike NSI = SelectLike::match(NI); + if (NSI && SI.getCondition() == NSI.getCondition()) { SIGroup.push_back(NSI); - } else if (!NI->isDebugOrPseudoInst()) { - // Debug/pseudo instructions should be skipped and not prevent the - // formation of a select group. + } else break; - } ++BBIt; } @@ -672,12 +821,12 @@ void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops( // Assuming infinite resources, the cost of a group of instructions is the // cost of the most expensive instruction of the group. Scaled64 SelectCost = Scaled64::getZero(), BranchCost = Scaled64::getZero(); - for (SelectInst *SI : ASI) { - SelectCost = std::max(SelectCost, InstCostMap[SI].PredCost); - BranchCost = std::max(BranchCost, InstCostMap[SI].NonPredCost); + for (SelectLike SI : ASI) { + SelectCost = std::max(SelectCost, InstCostMap[SI.getI()].PredCost); + BranchCost = std::max(BranchCost, InstCostMap[SI.getI()].NonPredCost); } if (BranchCost < SelectCost) { - OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front()); + OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", ASI.front().getI()); OR << "Profitable to convert to branch (loop analysis). BranchCost=" << BranchCost.toString() << ", SelectCost=" << SelectCost.toString() << ". 
"; @@ -685,7 +834,8 @@ void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops( ++NumSelectConvertedLoop; ProfSIGroups.push_back(ASI); } else { - OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front()); + OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", + ASI.front().getI()); ORmiss << "Select is more profitable (loop analysis). BranchCost=" << BranchCost.toString() << ", SelectCost=" << SelectCost.toString() << ". "; @@ -695,14 +845,15 @@ void SelectOptimizeImpl::findProfitableSIGroupsInnerLoops( } bool SelectOptimizeImpl::isConvertToBranchProfitableBase( - const SmallVector &ASI) { - SelectInst *SI = ASI.front(); - LLVM_DEBUG(dbgs() << "Analyzing select group containing " << *SI << "\n"); - OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI); - OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI); + const SelectGroup &ASI) { + SelectLike SI = ASI.front(); + LLVM_DEBUG(dbgs() << "Analyzing select group containing " << SI.getI() + << "\n"); + OptimizationRemark OR(DEBUG_TYPE, "SelectOpti", SI.getI()); + OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", SI.getI()); // Skip cold basic blocks. Better to optimize for size for cold blocks. - if (PSI->isColdBlock(SI->getParent(), BFI)) { + if (PSI->isColdBlock(SI.getI()->getParent(), BFI)) { ++NumSelectColdBB; ORmiss << "Not converted to branch because of cold basic block. "; EmitAndPrintRemark(ORE, ORmiss); @@ -710,7 +861,7 @@ bool SelectOptimizeImpl::isConvertToBranchProfitableBase( } // If unpredictable, branch form is less profitable. - if (SI->getMetadata(LLVMContext::MD_unpredictable)) { + if (SI.getI()->getMetadata(LLVMContext::MD_unpredictable)) { ++NumSelectUnPred; ORmiss << "Not converted to branch because of unpredictable branch. "; EmitAndPrintRemark(ORE, ORmiss); @@ -745,17 +896,24 @@ static InstructionCost divideNearest(InstructionCost Numerator, return (Numerator + (Denominator / 2)) / Denominator; } -bool SelectOptimizeImpl::hasExpensiveColdOperand( - const SmallVector &ASI) { +static bool extractBranchWeights(const SelectOptimizeImpl::SelectLike SI, + uint64_t &TrueVal, uint64_t &FalseVal) { + if (isa(SI.getI())) + return extractBranchWeights(*SI.getI(), TrueVal, FalseVal); + return false; +} + +bool SelectOptimizeImpl::hasExpensiveColdOperand(const SelectGroup &ASI) { bool ColdOperand = false; uint64_t TrueWeight, FalseWeight, TotalWeight; - if (extractBranchWeights(*ASI.front(), TrueWeight, FalseWeight)) { + if (extractBranchWeights(ASI.front(), TrueWeight, FalseWeight)) { uint64_t MinWeight = std::min(TrueWeight, FalseWeight); TotalWeight = TrueWeight + FalseWeight; // Is there a path with frequency 100 * MinWeight; } else if (PSI->hasProfileSummary()) { - OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", ASI.front()); + OptimizationRemarkMissed ORmiss(DEBUG_TYPE, "SelectOpti", + ASI.front().getI()); ORmiss << "Profile data available but missing branch-weights metadata for " "select instruction. "; EmitAndPrintRemark(ORE, ORmiss); @@ -764,19 +922,19 @@ bool SelectOptimizeImpl::hasExpensiveColdOperand( return false; // Check if the cold path's dependence slice is expensive for any of the // selects of the group. 
- for (SelectInst *SI : ASI) { + for (SelectLike SI : ASI) { Instruction *ColdI = nullptr; uint64_t HotWeight; if (TrueWeight < FalseWeight) { - ColdI = dyn_cast(SI->getTrueValue()); + ColdI = dyn_cast_or_null(SI.getTrueValue()); HotWeight = FalseWeight; } else { - ColdI = dyn_cast(SI->getFalseValue()); + ColdI = dyn_cast_or_null(SI.getFalseValue()); HotWeight = TrueWeight; } if (ColdI) { std::stack ColdSlice; - getExclBackwardsSlice(ColdI, ColdSlice, SI); + getExclBackwardsSlice(ColdI, ColdSlice, SI.getI()); InstructionCost SliceCost = 0; while (!ColdSlice.empty()) { SliceCost += TTI->getInstructionCost(ColdSlice.top(), @@ -866,9 +1024,9 @@ void SelectOptimizeImpl::getExclBackwardsSlice(Instruction *I, } } -bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectInst *SI) { +bool SelectOptimizeImpl::isSelectHighlyPredictable(const SelectLike SI) { uint64_t TrueWeight, FalseWeight; - if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) { + if (extractBranchWeights(SI, TrueWeight, FalseWeight)) { uint64_t Max = std::max(TrueWeight, FalseWeight); uint64_t Sum = TrueWeight + FalseWeight; if (Sum != 0) { @@ -954,7 +1112,7 @@ bool SelectOptimizeImpl::computeLoopCosts( DenseMap &InstCostMap, CostInfo *LoopCost) { LLVM_DEBUG(dbgs() << "Calculating Latency / IPredCost / INonPredCost of loop " << L->getHeader()->getName() << "\n"); - const auto &SIset = getSIset(SIGroups); + const auto &SImap = getSImap(SIGroups); // Compute instruction and loop-critical-path costs across two iterations for // both predicated and non-predicated version. const unsigned Iterations = 2; @@ -999,22 +1157,15 @@ bool SelectOptimizeImpl::computeLoopCosts( // BranchCost = PredictedPathCost + MispredictCost // PredictedPathCost = TrueOpCost * TrueProb + FalseOpCost * FalseProb // MispredictCost = max(MispredictPenalty, CondCost) * MispredictRate - if (SIset.contains(&I)) { - auto SI = cast(&I); - - Scaled64 TrueOpCost = Scaled64::getZero(), - FalseOpCost = Scaled64::getZero(); - if (auto *TI = dyn_cast(SI->getTrueValue())) - if (InstCostMap.count(TI)) - TrueOpCost = InstCostMap[TI].NonPredCost; - if (auto *FI = dyn_cast(SI->getFalseValue())) - if (InstCostMap.count(FI)) - FalseOpCost = InstCostMap[FI].NonPredCost; + if (SImap.contains(&I)) { + auto SI = SImap.at(&I); + Scaled64 TrueOpCost = SI.getTrueOpCost(InstCostMap, TTI); + Scaled64 FalseOpCost = SI.getFalseOpCost(InstCostMap, TTI); Scaled64 PredictedPathCost = getPredictedPathCost(TrueOpCost, FalseOpCost, SI); Scaled64 CondCost = Scaled64::getZero(); - if (auto *CI = dyn_cast(SI->getCondition())) + if (auto *CI = dyn_cast(SI.getCondition())) if (InstCostMap.count(CI)) CondCost = InstCostMap[CI].NonPredCost; Scaled64 MispredictCost = getMispredictionCost(SI, CondCost); @@ -1036,13 +1187,13 @@ bool SelectOptimizeImpl::computeLoopCosts( return true; } -SmallPtrSet -SelectOptimizeImpl::getSIset(const SelectGroups &SIGroups) { - SmallPtrSet SIset; +SmallDenseMap +SelectOptimizeImpl::getSImap(const SelectGroups &SIGroups) { + SmallDenseMap SImap; for (const SelectGroup &ASI : SIGroups) - for (const SelectInst *SI : ASI) - SIset.insert(SI); - return SIset; + for (SelectLike SI : ASI) + SImap.try_emplace(SI.getI(), SI); + return SImap; } std::optional @@ -1055,7 +1206,7 @@ SelectOptimizeImpl::computeInstCost(const Instruction *I) { } ScaledNumber -SelectOptimizeImpl::getMispredictionCost(const SelectInst *SI, +SelectOptimizeImpl::getMispredictionCost(const SelectLike SI, const Scaled64 CondCost) { uint64_t MispredictPenalty = 
TSchedModel.getMCSchedModel()->MispredictPenalty; @@ -1082,10 +1233,10 @@ SelectOptimizeImpl::getMispredictionCost(const SelectInst *SI, // TrueCost * TrueProbability + FalseCost * FalseProbability. ScaledNumber SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, - const SelectInst *SI) { + const SelectLike SI) { Scaled64 PredPathCost; uint64_t TrueWeight, FalseWeight; - if (extractBranchWeights(*SI, TrueWeight, FalseWeight)) { + if (extractBranchWeights(SI, TrueWeight, FalseWeight)) { uint64_t SumWeight = TrueWeight + FalseWeight; if (SumWeight != 0) { PredPathCost = TrueCost * Scaled64::get(TrueWeight) + @@ -1102,12 +1253,12 @@ SelectOptimizeImpl::getPredictedPathCost(Scaled64 TrueCost, Scaled64 FalseCost, return PredPathCost; } -bool SelectOptimizeImpl::isSelectKindSupported(SelectInst *SI) { - bool VectorCond = !SI->getCondition()->getType()->isIntegerTy(1); +bool SelectOptimizeImpl::isSelectKindSupported(const SelectLike SI) { + bool VectorCond = !SI.getCondition()->getType()->isIntegerTy(1); if (VectorCond) return false; TargetLowering::SelectSupportKind SelectKind; - if (SI->getType()->isVectorTy()) + if (SI.getType()->isVectorTy()) SelectKind = TargetLowering::ScalarCondVectorVal; else SelectKind = TargetLowering::ScalarValSelect; diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 08ae536fe9bbf..d611338fc268f 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -55,6 +55,9 @@ static cl::opt InlineCallPenaltyChangeSM( "inline-call-penalty-sm-change", cl::init(10), cl::Hidden, cl::desc("Penalty of inlining a call that requires a change to PSTATE.SM")); +static cl::opt EnableOrLikeSelectOpt("enable-aarch64-or-like-select", + cl::init(true), cl::Hidden); + namespace { class TailFoldingOption { // These bitfields will only ever be set to something non-zero in operator=, @@ -4048,3 +4051,15 @@ AArch64TTIImpl::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV, return AM.Scale != 0 && AM.Scale != 1; return -1; } + +bool AArch64TTIImpl::shouldTreatInstructionLikeSelect(const Instruction *I) { + // For the binary operators (e.g. or) we need to be more careful than + // selects, here we only transform them if they are already at a natural + // break point in the code - the end of a block with an unconditional + // terminator. + if (EnableOrLikeSelectOpt && I->getOpcode() == Instruction::Or && + isa(I->getNextNode()) && + cast(I->getNextNode())->isUnconditional()) + return true; + return BaseT::shouldTreatInstructionLikeSelect(I); +} \ No newline at end of file diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h index f471294ffc252..de39dea2be43e 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -412,6 +412,8 @@ class AArch64TTIImpl : public BasicTTIImplBase { bool enableSelectOptimize() { return ST->enableSelectOptimize(); } + bool shouldTreatInstructionLikeSelect(const Instruction *I); + unsigned getStoreMinimumVF(unsigned VF, Type *ScalarMemTy, Type *ScalarValTy) const { // We can vectorize store v4i8. 
diff --git a/llvm/test/CodeGen/AArch64/selectopt.ll b/llvm/test/CodeGen/AArch64/selectopt.ll
index f2b86dcb92111..876d9e68d538a 100644
--- a/llvm/test/CodeGen/AArch64/selectopt.ll
+++ b/llvm/test/CodeGen/AArch64/selectopt.ll
@@ -6,6 +6,7 @@
 ; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-n1 -S < %s | FileCheck %s --check-prefix=CHECKOO
 ; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=cortex-a710 -S < %s | FileCheck %s --check-prefix=CHECKOO
 ; RUN: opt -select-optimize -mtriple=aarch64-linux-gnu -mcpu=neoverse-v2 -S < %s | FileCheck %s --check-prefix=CHECKOO
+; RUN: opt -debugify-and-strip-all-safe -select-optimize -mtriple=aarch64-linux-gnu -mcpu=generic -S < %s | FileCheck %s --check-prefix=CHECKOO
 ; RUN: opt -passes='require<profile-summary>,function(select-optimize)' -mtriple=aarch64-linux-gnu -mcpu=generic -S < %s | FileCheck %s --check-prefix=CHECKOO
 ; RUN: opt -passes='require<profile-summary>,function(select-optimize)' -mtriple=aarch64-linux-gnu -mcpu=cortex-a55 -S < %s | FileCheck %s --check-prefix=CHECKII
 ; RUN: opt -passes='require<profile-summary>,function(select-optimize)' -mtriple=aarch64-linux-gnu -mcpu=cortex-a510 -S < %s | FileCheck %s --check-prefix=CHECKII
@@ -341,10 +342,16 @@ define void @replace_or(ptr nocapture noundef %newst, ptr noundef %t, ptr nounde
 ; CHECKOO-NEXT:    [[TMP8:%.*]] = load i64, ptr [[FLOW83]], align 8
 ; CHECKOO-NEXT:    [[CMP84:%.*]] = icmp slt i64 [[TMP7]], [[TMP8]]
 ; CHECKOO-NEXT:    [[ADD:%.*]] = zext i1 [[CMP84]] to i64
-; CHECKOO-NEXT:    [[SPEC_SELECT:%.*]] = or disjoint i64 [[MUL]], [[ADD]]
+; CHECKOO-NEXT:    [[CMP84_FROZEN:%.*]] = freeze i1 [[CMP84]]
+; CHECKOO-NEXT:    [[TMP9:%.*]] = or i64 [[MUL]], 1
+; CHECKOO-NEXT:    br i1 [[CMP84_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]]
+; CHECKOO:       select.false:
+; CHECKOO-NEXT:    br label [[SELECT_END]]
+; CHECKOO:       select.end:
+; CHECKOO-NEXT:    [[SPEC_SELECT:%.*]] = phi i64 [ [[TMP9]], [[IF_THEN]] ], [ [[MUL]], [[SELECT_FALSE]] ]
 ; CHECKOO-NEXT:    br label [[IF_END87]]
 ; CHECKOO:       if.end87:
-; CHECKOO-NEXT:    [[CMP_1]] = phi i64 [ [[MUL]], [[WHILE_BODY]] ], [ [[SPEC_SELECT]], [[IF_THEN]] ]
+; CHECKOO-NEXT:    [[CMP_1]] = phi i64 [ [[MUL]], [[WHILE_BODY]] ], [ [[SPEC_SELECT]], [[SELECT_END]] ]
 ; CHECKOO-NEXT:    [[CMP16_NOT:%.*]] = icmp sgt i64 [[CMP_1]], [[MA]]
 ; CHECKOO-NEXT:    br i1 [[CMP16_NOT]], label [[WHILE_END]], label [[LAND_RHS]]
 ; CHECKOO:       while.end:
@@ -663,10 +670,16 @@ define i32 @or_samegroup(ptr nocapture noundef %x, i32 noundef %n, ptr nocapture
 ; CHECKOO-NEXT:    br label [[SELECT_END]]
 ; CHECKOO:       select.end:
 ; CHECKOO-NEXT:    [[SEL:%.*]] = phi i32 [ [[ADD]], [[IF_THEN]] ], [ 1, [[SELECT_FALSE]] ]
-; CHECKOO-NEXT:    [[OR:%.*]] = or i32 [[CONV]], [[SEL]]
+; CHECKOO-NEXT:    [[CMP5_FROZEN3:%.*]] = freeze i1 [[CMP5]]
+; CHECKOO-NEXT:    [[TMP2:%.*]] = or i32 [[SEL]], 1
+; CHECKOO-NEXT:    br i1 [[CMP5_FROZEN3]], label [[SELECT_END1:%.*]], label [[SELECT_FALSE2:%.*]]
+; CHECKOO:       select.false2:
+; CHECKOO-NEXT:    br label [[SELECT_END1]]
+; CHECKOO:       select.end1:
+; CHECKOO-NEXT:    [[OR:%.*]] = phi i32 [ [[TMP2]], [[SELECT_END]] ], [ [[SEL]], [[SELECT_FALSE2]] ]
 ; CHECKOO-NEXT:    br label [[IF_END]]
 ; CHECKOO:       if.end:
-; CHECKOO-NEXT:    [[Y_1]] = phi i32 [ [[SEL]], [[SELECT_END]] ], [ 0, [[FOR_BODY]] ]
+; CHECKOO-NEXT:    [[Y_1]] = phi i32 [ [[SEL]], [[SELECT_END1]] ], [ 0, [[FOR_BODY]] ]
 ; CHECKOO-NEXT:    store i32 [[Y_1]], ptr [[ARRAYIDX]], align 4
 ; CHECKOO-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
 ; CHECKOO-NEXT:    [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]]
@@ -776,10 +789,16 @@ define i32
@or_oneusevalues(ptr nocapture noundef %x, i32 noundef %n, ptr nocapt ; CHECKOO-NEXT: [[CONV:%.*]] = zext i1 [[CMP5]] to i32 ; CHECKOO-NEXT: [[ADD1:%.*]] = add i32 [[ADD]], 1 ; CHECKOO-NEXT: [[ADD2:%.*]] = or i32 [[ADD1]], 1 -; CHECKOO-NEXT: [[OR:%.*]] = or i32 [[CONV]], [[ADD2]] +; CHECKOO-NEXT: [[CMP5_FROZEN:%.*]] = freeze i1 [[CMP5]] +; CHECKOO-NEXT: [[TMP2:%.*]] = or i32 [[ADD2]], 1 +; CHECKOO-NEXT: br i1 [[CMP5_FROZEN]], label [[SELECT_END:%.*]], label [[SELECT_FALSE:%.*]] +; CHECKOO: select.false: +; CHECKOO-NEXT: br label [[SELECT_END]] +; CHECKOO: select.end: +; CHECKOO-NEXT: [[OR:%.*]] = phi i32 [ [[TMP2]], [[IF_THEN]] ], [ [[ADD2]], [[SELECT_FALSE]] ] ; CHECKOO-NEXT: br label [[IF_END]] ; CHECKOO: if.end: -; CHECKOO-NEXT: [[Y_1]] = phi i32 [ [[OR]], [[IF_THEN]] ], [ 0, [[FOR_BODY]] ] +; CHECKOO-NEXT: [[Y_1]] = phi i32 [ [[OR]], [[SELECT_END]] ], [ 0, [[FOR_BODY]] ] ; CHECKOO-NEXT: store i32 [[Y_1]], ptr [[ARRAYIDX]], align 4 ; CHECKOO-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 ; CHECKOO-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] From 7117a4e8409018f31c00b186ac4627586d839e59 Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Tue, 23 Jan 2024 00:24:41 +0000 Subject: [PATCH 510/843] [BitcodeWriter] Remove ThinLTO-specific bits from legacy pass Since we shouldn't be doing any LTO logic from the legacy pass manager. --- llvm/include/llvm/Bitcode/BitcodeWriterPass.h | 10 +------- llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp | 24 +++++-------------- 2 files changed, 7 insertions(+), 27 deletions(-) diff --git a/llvm/include/llvm/Bitcode/BitcodeWriterPass.h b/llvm/include/llvm/Bitcode/BitcodeWriterPass.h index 3c2471237532b..4035c00b6793f 100644 --- a/llvm/include/llvm/Bitcode/BitcodeWriterPass.h +++ b/llvm/include/llvm/Bitcode/BitcodeWriterPass.h @@ -28,16 +28,8 @@ class raw_ostream; /// /// If \c ShouldPreserveUseListOrder, encode use-list order so it can be /// reproduced when deserialized. -/// -/// If \c EmitSummaryIndex, emit the summary index (currently for use in ThinLTO -/// optimization). -/// -/// If \c EmitModuleHash, compute and emit the module hash in the bitcode -/// (currently for use in ThinLTO incremental build). ModulePass *createBitcodeWriterPass(raw_ostream &Str, - bool ShouldPreserveUseListOrder = false, - bool EmitSummaryIndex = false, - bool EmitModuleHash = false); + bool ShouldPreserveUseListOrder = false); /// Check whether a pass is a BitcodeWriterPass. 
 bool isBitcodeWriterPass(Pass *P);
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
index 28941d6c41cfe..93fb2a821dee7 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriterPass.cpp
@@ -40,8 +40,6 @@ namespace {
   class WriteBitcodePass : public ModulePass {
     raw_ostream &OS; // raw_ostream to print on
     bool ShouldPreserveUseListOrder;
-    bool EmitSummaryIndex;
-    bool EmitModuleHash;
 
   public:
     static char ID; // Pass identification, replacement for typeid
@@ -49,29 +47,23 @@ namespace {
       initializeWriteBitcodePassPass(*PassRegistry::getPassRegistry());
     }
 
-    explicit WriteBitcodePass(raw_ostream &o, bool ShouldPreserveUseListOrder,
-                              bool EmitSummaryIndex, bool EmitModuleHash)
+    explicit WriteBitcodePass(raw_ostream &o, bool ShouldPreserveUseListOrder)
         : ModulePass(ID), OS(o),
-          ShouldPreserveUseListOrder(ShouldPreserveUseListOrder),
-          EmitSummaryIndex(EmitSummaryIndex), EmitModuleHash(EmitModuleHash) {
+          ShouldPreserveUseListOrder(ShouldPreserveUseListOrder) {
       initializeWriteBitcodePassPass(*PassRegistry::getPassRegistry());
     }
 
     StringRef getPassName() const override { return "Bitcode Writer"; }
 
     bool runOnModule(Module &M) override {
-      const ModuleSummaryIndex *Index =
-          EmitSummaryIndex
-              ? &(getAnalysis<ModuleSummaryIndexWrapperPass>().getIndex())
-              : nullptr;
       // RemoveDIs: there's no bitcode representation of the DPValue debug-info,
       // convert to dbg.values before writing out.
       bool IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
       if (IsNewDbgInfoFormat)
         M.convertFromNewDbgValues();
 
-      WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, Index,
-                         EmitModuleHash);
+      WriteBitcodeToFile(M, OS, ShouldPreserveUseListOrder, /*Index=*/nullptr,
+                         /*EmitModuleHash=*/false);
 
       if (IsNewDbgInfoFormat)
         M.convertToNewDbgValues();
@@ -79,8 +71,6 @@ namespace {
     }
 
     void getAnalysisUsage(AnalysisUsage &AU) const override {
       AU.setPreservesAll();
-      if (EmitSummaryIndex)
-        AU.addRequired<ModuleSummaryIndexWrapperPass>();
     }
   };
 }
@@ -93,10 +83,8 @@ INITIALIZE_PASS_END(WriteBitcodePass, "write-bitcode", "Write Bitcode", false,
                     true)
 
 ModulePass *llvm::createBitcodeWriterPass(raw_ostream &Str,
-                                          bool ShouldPreserveUseListOrder,
-                                          bool EmitSummaryIndex, bool EmitModuleHash) {
-  return new WriteBitcodePass(Str, ShouldPreserveUseListOrder,
-                              EmitSummaryIndex, EmitModuleHash);
+                                          bool ShouldPreserveUseListOrder) {
+  return new WriteBitcodePass(Str, ShouldPreserveUseListOrder);
 }
 
 bool llvm::isBitcodeWriterPass(Pass *P) {

From 029bfd6329aee0b60761998986dadba38e1de6f3 Mon Sep 17 00:00:00 2001
From: lntue <35648136+lntue@users.noreply.github.com>
Date: Mon, 22 Jan 2024 19:27:27 -0500
Subject: [PATCH 511/843] [libc] Replace -nostdlib++ flag when building with gcc and add placement new operator to HermeticTestUtils.cpp. (#78906)

`-nostdlib++` is a clang-only flag. Replace it with `-nostdlib` when
building with gcc.
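
Hermetic tests are linked without the C++ runtime, so the declarations that
normally come from <new> are unavailable and the test utilities must supply
placement new themselves. A minimal sketch of the kind of test code that
relies on this (illustrative only; Counter and Buf are invented names, and it
assumes a 64-bit target where size_t is unsigned long, matching the patch):

    #include <stddef.h>

    // Stand-in for an object a hermetic test constructs in place.
    struct Counter {
      int Value;
      Counter(int V) : Value(V) {}
    };

    // With no <new> available, the placement form has to be defined
    // locally, as this patch does in HermeticTestUtils.cpp.
    void *operator new(unsigned long, void *Ptr) { return Ptr; }

    int main() {
      alignas(Counter) unsigned char Buf[sizeof(Counter)];
      Counter *C = new (Buf) Counter(42); // resolves to the local operator
      return C->Value == 42 ? 0 : 1;
    }
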
--- .../cmake/modules/CheckCompilerFeatures.cmake | 3 +++ libc/cmake/modules/LLVMLibCTestRules.cmake | 21 +++++++++++++++++-- libc/test/UnitTest/HermeticTestUtils.cpp | 4 ++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/libc/cmake/modules/CheckCompilerFeatures.cmake b/libc/cmake/modules/CheckCompilerFeatures.cmake index c99a98d1567d3..983ce86ab1b25 100644 --- a/libc/cmake/modules/CheckCompilerFeatures.cmake +++ b/libc/cmake/modules/CheckCompilerFeatures.cmake @@ -62,3 +62,6 @@ message(STATUS "Compiler features available: ${AVAILABLE_COMPILER_FEATURES}") # clang-8+, gcc-12+ check_cxx_compiler_flag("-ftrivial-auto-var-init=pattern" LIBC_CC_SUPPORTS_PATTERN_INIT) + +# clang-6+, gcc-13+ +check_cxx_compiler_flag("-nostdlib++" LIBC_CC_SUPPORTS_NOSTDLIBPP) diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 9b7e918027541..46ffc8316fc82 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -435,6 +435,13 @@ function(add_libc_fuzzer target_name) endfunction(add_libc_fuzzer) +# Get libgcc_s to be used in hermetic and integration tests. +if(NOT LIBC_CC_SUPPORTS_NOSTDLIBPP) + execute_process(COMMAND ${CMAKE_CXX_COMPILER} -print-file-name=libgcc_s.so.1 + OUTPUT_VARIABLE LIBGCC_S_LOCATION) + string(STRIP ${LIBGCC_S_LOCATION} LIBGCC_S_LOCATION) +endif() + # DEPRECATED: Use add_hermetic_test instead. # # Rule to add an integration test. An integration test is like a unit test @@ -564,8 +571,13 @@ function(add_integration_test test_name) if(LIBC_TARGET_ARCHITECTURE_IS_GPU) target_link_options(${fq_build_target_name} PRIVATE -nostdlib -static) - else() + elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static) + else() + # Older version of gcc does not support `nostdlib++` flag. We use + # `nostdlib` and link against libgcc_s, which cannot be linked statically. + target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib) + list(APPEND link_libraries ${LIBGCC_S_LOCATION}) endif() target_link_libraries( ${fq_build_target_name} @@ -741,8 +753,13 @@ function(add_libc_hermetic_test test_name) if(LIBC_TARGET_ARCHITECTURE_IS_GPU) target_link_options(${fq_build_target_name} PRIVATE -nostdlib -static) - else() + elseif(LIBC_CC_SUPPORTS_NOSTDLIBPP) target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib++ -static) + else() + # Older version of gcc does not support `nostdlib++` flag. We use + # `nostdlib` and link against libgcc_s, which cannot be linked statically. + target_link_options(${fq_build_target_name} PRIVATE -nolibc -nostartfiles -nostdlib) + list(APPEND link_libraries ${LIBGCC_S_LOCATION}) endif() target_link_libraries( ${fq_build_target_name} diff --git a/libc/test/UnitTest/HermeticTestUtils.cpp b/libc/test/UnitTest/HermeticTestUtils.cpp index 68e31f478d79a..349c182ff2379 100644 --- a/libc/test/UnitTest/HermeticTestUtils.cpp +++ b/libc/test/UnitTest/HermeticTestUtils.cpp @@ -104,6 +104,8 @@ void *__dso_handle = nullptr; } // extern "C" +void *operator new(unsigned long size, void *ptr) { return ptr; } + void *operator new(size_t size) { return malloc(size); } void *operator new[](size_t size) { return malloc(size); } @@ -113,3 +115,5 @@ void operator delete(void *) { // we just trap here to catch any such accidental usages. 
   __builtin_trap();
 }
+
+void operator delete(void *ptr, size_t size) { __builtin_trap(); }

From 58cfd56356ed96690150245ca5bcf646d12edc7a Mon Sep 17 00:00:00 2001
From: Simeon K <5235180+simeonkr@users.noreply.github.com>
Date: Tue, 23 Jan 2024 00:46:39 +0000
Subject: [PATCH 512/843] [VP][RISCV] Introduce llvm.vp.minimum/maximum intrinsics (#74840)

Although there are predicated versions of minnum/maxnum, the ones for
minimum/maximum are currently missing. This patch introduces these
intrinsics and implements their lowering to RISC-V.
---
 llvm/docs/LangRef.rst                         |  100 ++
 llvm/include/llvm/IR/Intrinsics.td            |   10 +
 llvm/include/llvm/IR/VPIntrinsics.def         |   18 +-
 llvm/lib/CodeGen/ExpandVectorPredication.cpp  |    2 +
 .../SelectionDAG/LegalizeVectorTypes.cpp      |    4 +
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp   |   28 +-
 .../RISCV/rvv/fixed-vectors-fmaximum-vp.ll    |  815 +++++++++++
 .../RISCV/rvv/fixed-vectors-fminimum-vp.ll    |  815 +++++++++++
 llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll    | 1245 +++++++++++++++++
 llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll    | 1245 +++++++++++++++++
 llvm/unittests/IR/VPIntrinsicTest.cpp         |    5 +-
 11 files changed, 4279 insertions(+), 8 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll
 create mode 100644 llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll

diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
index bc9cfb557c540..d58375c173545 100644
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -20394,6 +20394,106 @@ Examples:
       %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> poison
 
 
+.. _int_vp_minimum:
+
+'``llvm.vp.minimum.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x float> @llvm.vp.minimum.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x float> @llvm.vp.minimum.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x double> @llvm.vp.minimum.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point minimum of two vectors of floating-point values,
+propagating NaNs and treating -0.0 as less than +0.0.
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of floating-point type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.minimum``' intrinsic performs floating-point minimum (:ref:`minimum <i_minimum>`)
+of the first and second vector operand on each enabled lane, the result being
+NaN if either operand is a NaN. -0.0 is considered to be less than +0.0 for this
+intrinsic. The result on disabled lanes is a :ref:`poison value <poisonvalues>`.
+The operation is performed in the default floating-point environment.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x float> @llvm.vp.minimum.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = call <4 x float> @llvm.minimum.v4f32(<4 x float> %a, <4 x float> %b)
+      %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> poison
+
+
+.. _int_vp_maximum:
+
+'``llvm.vp.maximum.*``' Intrinsics
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare <16 x float> @llvm.vp.maximum.v16f32 (<16 x float> <left_op>, <16 x float> <right_op>, <16 x i1> <mask>, i32 <vector_length>)
+      declare <vscale x 4 x float> @llvm.vp.maximum.nxv4f32 (<vscale x 4 x float> <left_op>, <vscale x 4 x float> <right_op>, <vscale x 4 x i1> <mask>, i32 <vector_length>)
+      declare <256 x double> @llvm.vp.maximum.v256f64 (<256 x double> <left_op>, <256 x double> <right_op>, <256 x i1> <mask>, i32 <vector_length>)
+
+Overview:
+"""""""""
+
+Predicated floating-point maximum of two vectors of floating-point values,
+propagating NaNs and treating -0.0 as less than +0.0.
+
+Arguments:
+""""""""""
+
+The first two operands and the result have the same vector of floating-point type. The
+third operand is the vector mask and has the same number of elements as the
+result vector type. The fourth operand is the explicit vector length of the
+operation.
+
+Semantics:
+""""""""""
+
+The '``llvm.vp.maximum``' intrinsic performs floating-point maximum (:ref:`maximum <i_maximum>`)
+of the first and second vector operand on each enabled lane, the result being
+NaN if either operand is a NaN. -0.0 is considered to be less than +0.0 for this
+intrinsic. The result on disabled lanes is a :ref:`poison value <poisonvalues>`.
+The operation is performed in the default floating-point environment.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %r = call <4 x float> @llvm.vp.maximum.v4f32(<4 x float> %a, <4 x float> %b, <4 x i1> %mask, i32 %evl)
+      ;; For all lanes below %evl, %r is lane-wise equivalent to %also.r
+
+      %t = call <4 x float> @llvm.maximum.v4f32(<4 x float> %a, <4 x float> %b)
+      %also.r = select <4 x i1> %mask, <4 x float> %t, <4 x float> poison
+
+
 .. _int_vp_fadd:
 
 '``llvm.vp.fadd.*``' Intrinsics
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
index b54c697296b20..3c19c7b063652 100644
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1991,6 +1991,16 @@ let IntrProperties = [IntrNoMem, IntrNoSync, IntrWillReturn] in {
                                LLVMMatchType<0>,
                                LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
                                llvm_i32_ty]>;
+  def int_vp_minimum : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
+  def int_vp_maximum : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
+                             [ LLVMMatchType<0>,
+                               LLVMMatchType<0>,
+                               LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               llvm_i32_ty]>;
   def int_vp_copysign : DefaultAttrsIntrinsic<[ llvm_anyvector_ty ],
                              [ LLVMMatchType<0>,
                                LLVMMatchType<0>,
diff --git a/llvm/include/llvm/IR/VPIntrinsics.def b/llvm/include/llvm/IR/VPIntrinsics.def
index 671dc39db26df..3b32b60609f53 100644
--- a/llvm/include/llvm/IR/VPIntrinsics.def
+++ b/llvm/include/llvm/IR/VPIntrinsics.def
@@ -367,20 +367,34 @@ VP_PROPERTY_FUNCTIONAL_SDOPC(FCOPYSIGN)
 VP_PROPERTY_FUNCTIONAL_INTRINSIC(copysign)
 END_REGISTER_VP(vp_copysign, VP_FCOPYSIGN)
 
-// llvm.vp.minnum(x, y, mask,vlen)
+// llvm.vp.minnum(x,y,mask,vlen)
 BEGIN_REGISTER_VP(vp_minnum, 2, 3, VP_FMINNUM, -1)
 VP_PROPERTY_BINARYOP
 VP_PROPERTY_FUNCTIONAL_SDOPC(FMINNUM)
 VP_PROPERTY_FUNCTIONAL_INTRINSIC(minnum)
 END_REGISTER_VP(vp_minnum, VP_FMINNUM)
 
-// llvm.vp.maxnum(x, y, mask,vlen)
+// llvm.vp.maxnum(x,y,mask,vlen)
 BEGIN_REGISTER_VP(vp_maxnum, 2, 3, VP_FMAXNUM, -1)
 VP_PROPERTY_BINARYOP
 VP_PROPERTY_FUNCTIONAL_SDOPC(FMAXNUM)
 VP_PROPERTY_FUNCTIONAL_INTRINSIC(maxnum)
 END_REGISTER_VP(vp_maxnum, VP_FMAXNUM)
 
+// llvm.vp.minimum(x,y,mask,vlen)
+BEGIN_REGISTER_VP(vp_minimum, 2, 3, VP_FMINIMUM, -1)
+VP_PROPERTY_BINARYOP
+VP_PROPERTY_FUNCTIONAL_SDOPC(FMINIMUM)
+VP_PROPERTY_FUNCTIONAL_INTRINSIC(minimum)
+END_REGISTER_VP(vp_minimum, VP_FMINIMUM)
+
+// llvm.vp.maximum(x,y,mask,vlen)
+BEGIN_REGISTER_VP(vp_maximum, 2, 3, VP_FMAXIMUM, -1) +VP_PROPERTY_BINARYOP +VP_PROPERTY_FUNCTIONAL_SDOPC(FMAXIMUM) +VP_PROPERTY_FUNCTIONAL_INTRINSIC(maximum) +END_REGISTER_VP(vp_maximum, VP_FMAXIMUM) + // llvm.vp.ceil(x,mask,vlen) BEGIN_REGISTER_VP(vp_ceil, 1, 2, VP_FCEIL, -1) VP_PROPERTY_FUNCTIONAL_INTRINSIC(ceil) diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 6c873a9aee27f..0fe4cfefdb160 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -729,6 +729,8 @@ Value *CachingVPExpander::expandPredication(VPIntrinsic &VPI) { case Intrinsic::vp_sqrt: case Intrinsic::vp_maxnum: case Intrinsic::vp_minnum: + case Intrinsic::vp_maximum: + case Intrinsic::vp_minimum: return expandPredicationToFPCall(Builder, VPI, VPI.getFunctionalIntrinsicID().value()); case Intrinsic::vp_load: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index c278bdc073607..7fc252600534f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1143,7 +1143,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::FMINNUM: case ISD::VP_FMINNUM: case ISD::FMAXNUM: case ISD::VP_FMAXNUM: case ISD::FMINIMUM: + case ISD::VP_FMINIMUM: case ISD::FMAXIMUM: + case ISD::VP_FMAXIMUM: case ISD::SDIV: case ISD::VP_SDIV: case ISD::UDIV: case ISD::VP_UDIV: case ISD::FDIV: case ISD::VP_FDIV: @@ -4131,7 +4133,9 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { case ISD::FMINNUM: case ISD::VP_FMINNUM: case ISD::FMAXNUM: case ISD::VP_FMAXNUM: case ISD::FMINIMUM: + case ISD::VP_FMINIMUM: case ISD::FMAXIMUM: + case ISD::VP_FMAXIMUM: case ISD::SMIN: case ISD::VP_SMIN: case ISD::SMAX: case ISD::VP_SMAX: case ISD::UMIN: case ISD::VP_UMIN: diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 66285a6bf26e3..76e8d21b818b2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -687,7 +687,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FCEIL, ISD::VP_FFLOOR, ISD::VP_FROUND, ISD::VP_FROUNDEVEN, ISD::VP_FCOPYSIGN, ISD::VP_FROUNDTOZERO, ISD::VP_FRINT, ISD::VP_FNEARBYINT, ISD::VP_IS_FPCLASS, - ISD::EXPERIMENTAL_VP_REVERSE, ISD::EXPERIMENTAL_VP_SPLICE}; + ISD::VP_FMINIMUM, ISD::VP_FMAXIMUM, ISD::EXPERIMENTAL_VP_REVERSE, + ISD::EXPERIMENTAL_VP_SPLICE}; static const unsigned IntegerVecReduceOps[] = { ISD::VECREDUCE_ADD, ISD::VECREDUCE_AND, ISD::VECREDUCE_OR, @@ -927,7 +928,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VP_FMINNUM, ISD::VP_FMAXNUM, ISD::VP_FCEIL, ISD::VP_FFLOOR, ISD::VP_FROUND, ISD::VP_FROUNDEVEN, ISD::VP_FCOPYSIGN, ISD::VP_FROUNDTOZERO, ISD::VP_FRINT, - ISD::VP_FNEARBYINT, ISD::VP_SETCC}; + ISD::VP_FNEARBYINT, ISD::VP_SETCC, ISD::VP_FMINIMUM, + ISD::VP_FMAXIMUM}; // Sets common operation actions on RVV floating-point vector types. 
const auto SetCommonVFPActions = [&](MVT VT) { @@ -5405,7 +5407,16 @@ static SDValue lowerFMAXIMUM_FMINIMUM(SDValue Op, SelectionDAG &DAG, Y = convertToScalableVector(ContainerVT, Y, DAG, Subtarget); } - auto [Mask, VL] = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); + SDValue Mask, VL; + if (Op->isVPOpcode()) { + Mask = Op.getOperand(2); + if (VT.isFixedLengthVector()) + Mask = convertToScalableVector(getMaskTypeFor(ContainerVT), Mask, DAG, + Subtarget); + VL = Op.getOperand(3); + } else { + std::tie(Mask, VL) = getDefaultVLOps(VT, ContainerVT, DL, DAG, Subtarget); + } SDValue NewY = Y; if (!XIsNeverNan) { @@ -5426,7 +5437,9 @@ static SDValue lowerFMAXIMUM_FMINIMUM(SDValue Op, SelectionDAG &DAG, } unsigned Opc = - Op.getOpcode() == ISD::FMAXIMUM ? RISCVISD::VFMAX_VL : RISCVISD::VFMIN_VL; + Op.getOpcode() == ISD::FMAXIMUM || Op->getOpcode() == ISD::VP_FMAXIMUM + ? RISCVISD::VFMAX_VL + : RISCVISD::VFMIN_VL; SDValue Res = DAG.getNode(Opc, DL, ContainerVT, NewX, NewY, DAG.getUNDEF(ContainerVT), Mask, VL); if (VT.isFixedLengthVector()) @@ -6655,6 +6668,13 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op, !Subtarget.hasVInstructionsF16())) return SplitVPOp(Op, DAG); return lowerVectorFTRUNC_FCEIL_FFLOOR_FROUND(Op, DAG, Subtarget); + case ISD::VP_FMAXIMUM: + case ISD::VP_FMINIMUM: + if (Op.getValueType() == MVT::nxv32f16 && + (Subtarget.hasVInstructionsF16Minimal() && + !Subtarget.hasVInstructionsF16())) + return SplitVPOp(Op, DAG); + return lowerFMAXIMUM_FMINIMUM(Op, DAG, Subtarget); case ISD::EXPERIMENTAL_VP_SPLICE: return lowerVPSpliceExperimental(Op, DAG); case ISD::EXPERIMENTAL_VP_REVERSE: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll new file mode 100644 index 0000000000000..4a5ef21efdb96 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fmaximum-vp.ll @@ -0,0 +1,815 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN + +declare <2 x half> @llvm.vp.maximum.v2f16(<2 x half>, <2 x half>, <2 x i1>, i32) + +define <2 x half> @vfmax_vv_v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmax_vv_v2f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vfmax.vv v8, v8, v11, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmax_vv_v2f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v10, v0 +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; 
ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v8, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v11, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %v = call <2 x half> @llvm.vp.maximum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i32 %evl) + ret <2 x half> %v +} + +define <2 x half> @vfmax_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmax_vv_v2f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v10, v9, v9 +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vfmax.vv v8, v8, v11 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmax_vv_v2f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 +; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x half> @llvm.vp.maximum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i32 %evl) + ret <2 x half> %v +} + +declare <4 x half> @llvm.vp.maximum.v4f16(<4 x half>, <4 x half>, <4 x i1>, i32) + +define <4 x half> @vfmax_vv_v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmax_vv_v4f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vfmax.vv v8, v8, v11, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmax_vv_v4f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v10, v0 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v8, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v11, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %v = call <4 x half> @llvm.vp.maximum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i32 %evl) + ret <4 x half> %v +} + +define <4 x half> @vfmax_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: 
vfmax_vv_v4f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v10, v9, v9 +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vfmax.vv v8, v8, v11 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmax_vv_v4f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 +; ZVFHMIN-NEXT: vmv.v.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 +; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x half> @llvm.vp.maximum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i32 %evl) + ret <4 x half> %v +} + +declare <8 x half> @llvm.vp.maximum.v8f16(<8 x half>, <8 x half>, <8 x i1>, i32) + +define <8 x half> @vfmax_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmax_vv_v8f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vfmax.vv v8, v8, v11, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmax_vv_v8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v10, v0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v16, v12, v14, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vmfeq.vv v8, v14, v14, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v14, v12, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vfmax.vv v10, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret + %v = call <8 x half> @llvm.vp.maximum.v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 %evl) + ret <8 x half> %v +} + +define <8 x half> @vfmax_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmax_vv_v8f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v10, v9, v9 +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv.v.v v0, v10 +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vfmax.vv v8, v8, v11 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmax_vv_v8f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vsetivli zero, 8, 
e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 +; ZVFHMIN-NEXT: vfmax.vv v10, v8, v14 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x half> @llvm.vp.maximum.v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 %evl) + ret <8 x half> %v +} + +declare <16 x half> @llvm.vp.maximum.v16f16(<16 x half>, <16 x half>, <16 x i1>, i32) + +define <16 x half> @vfmax_vv_v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmax_vv_v16f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v12, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vmfeq.vv v13, v8, v8, v0.t +; ZVFH-NEXT: vmv1r.v v0, v13 +; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmv1r.v v0, v12 +; ZVFH-NEXT: vmfeq.vv v13, v10, v10, v0.t +; ZVFH-NEXT: vmv1r.v v0, v13 +; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v12 +; ZVFH-NEXT: vfmax.vv v8, v8, v14, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmax_vv_v16f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v12, v0 +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v20, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmfeq.vv v8, v20, v20, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v20, v16, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vfmax.vv v12, v8, v24, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret + %v = call <16 x half> @llvm.vp.maximum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> %m, i32 %evl) + ret <16 x half> %v +} + +define <16 x half> @vfmax_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmax_vv_v16f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v12, v10, v10 +; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmv1r.v v0, v12 +; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 +; ZVFH-NEXT: vfmax.vv v8, v8, v14 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmax_vv_v16f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 +; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 +; ZVFHMIN-NEXT: vfmax.vv v12, v8, v20 +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> 
%head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x half> @llvm.vp.maximum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> %m, i32 %evl) + ret <16 x half> %v +} + +declare <2 x float> @llvm.vp.maximum.v2f32(<2 x float>, <2 x float>, <2 x i1>, i32) + +define <2 x float> @vfmax_vv_v2f32(<2 x float> %va, <2 x float> %vb, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmax.vv v8, v8, v11, v0.t +; CHECK-NEXT: ret + %v = call <2 x float> @llvm.vp.maximum.v2f32(<2 x float> %va, <2 x float> %vb, <2 x i1> %m, i32 %evl) + ret <2 x float> %v +} + +define <2 x float> @vfmax_vv_v2f32_unmasked(<2 x float> %va, <2 x float> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v2f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v10, v9, v9 +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v11 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x float> @llvm.vp.maximum.v2f32(<2 x float> %va, <2 x float> %vb, <2 x i1> %m, i32 %evl) + ret <2 x float> %v +} + +declare <4 x float> @llvm.vp.maximum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfmax_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmax.vv v8, v8, v11, v0.t +; CHECK-NEXT: ret + %v = call <4 x float> @llvm.vp.maximum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl) + ret <4 x float> %v +} + +define <4 x float> @vfmax_vv_v4f32_unmasked(<4 x float> %va, <4 x float> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v4f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v10, v9, v9 +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v11 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x float> @llvm.vp.maximum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl) + ret <4 x float> %v +} + +declare <8 x float> @llvm.vp.maximum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) + +define <8 x float> @vfmax_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v13, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v 
v0, v12 +; CHECK-NEXT: vmfeq.vv v13, v10, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vfmax.vv v8, v8, v14, v0.t +; CHECK-NEXT: ret + %v = call <8 x float> @llvm.vp.maximum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl) + ret <8 x float> %v +} + +define <8 x float> @vfmax_vv_v8f32_unmasked(<8 x float> %va, <8 x float> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v8f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v14 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x float> @llvm.vp.maximum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl) + ret <8 x float> %v +} + +declare <16 x float> @llvm.vp.maximum.v16f32(<16 x float>, <16 x float>, <16 x i1>, i32) + +define <16 x float> @vfmax_vv_v16f32(<16 x float> %va, <16 x float> %vb, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v17, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmfeq.vv v17, v12, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vfmax.vv v8, v8, v20, v0.t +; CHECK-NEXT: ret + %v = call <16 x float> @llvm.vp.maximum.v16f32(<16 x float> %va, <16 x float> %vb, <16 x i1> %m, i32 %evl) + ret <16 x float> %v +} + +define <16 x float> @vfmax_vv_v16f32_unmasked(<16 x float> %va, <16 x float> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v16f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v20 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x float> @llvm.vp.maximum.v16f32(<16 x float> %va, <16 x float> %vb, <16 x i1> %m, i32 %evl) + ret <16 x float> %v +} + +declare <2 x double> @llvm.vp.maximum.v2f64(<2 x double>, <2 x double>, <2 x i1>, i32) + +define <2 x double> @vfmax_vv_v2f64(<2 x double> %va, <2 x double> %vb, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmax.vv v8, v8, v11, v0.t +; CHECK-NEXT: ret + %v = call <2 x double> @llvm.vp.maximum.v2f64(<2 x double> %va, <2 x double> %vb, <2 x i1> %m, i32 %evl) + ret <2 x double> %v +} + +define <2 x double> @vfmax_vv_v2f64_unmasked(<2 x double> %va, <2 x double> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v2f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: 
vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v10, v9, v9 +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v11 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x double> @llvm.vp.maximum.v2f64(<2 x double> %va, <2 x double> %vb, <2 x i1> %m, i32 %evl) + ret <2 x double> %v +} + +declare <4 x double> @llvm.vp.maximum.v4f64(<4 x double>, <4 x double>, <4 x i1>, i32) + +define <4 x double> @vfmax_vv_v4f64(<4 x double> %va, <4 x double> %vb, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v13, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmfeq.vv v13, v10, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vfmax.vv v8, v8, v14, v0.t +; CHECK-NEXT: ret + %v = call <4 x double> @llvm.vp.maximum.v4f64(<4 x double> %va, <4 x double> %vb, <4 x i1> %m, i32 %evl) + ret <4 x double> %v +} + +define <4 x double> @vfmax_vv_v4f64_unmasked(<4 x double> %va, <4 x double> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v4f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v14 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x double> @llvm.vp.maximum.v4f64(<4 x double> %va, <4 x double> %vb, <4 x i1> %m, i32 %evl) + ret <4 x double> %v +} + +declare <8 x double> @llvm.vp.maximum.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) + +define <8 x double> @vfmax_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v17, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmfeq.vv v17, v12, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vfmax.vv v8, v8, v20, v0.t +; CHECK-NEXT: ret + %v = call <8 x double> @llvm.vp.maximum.v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 %evl) + ret <8 x double> %v +} + +define <8 x double> @vfmax_vv_v8f64_unmasked(<8 x double> %va, <8 x double> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v8f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v20 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x double> @llvm.vp.maximum.v8f64(<8 x double> 
%va, <8 x double> %vb, <8 x i1> %m, i32 %evl) + ret <8 x double> %v +} + +declare <16 x double> @llvm.vp.maximum.v16f64(<16 x double>, <16 x double>, <16 x i1>, i32) + +define <16 x double> @vfmax_vv_v16f64(<16 x double> %va, <16 x double> %vb, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v16f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmfeq.vv v25, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call <16 x double> @llvm.vp.maximum.v16f64(<16 x double> %va, <16 x double> %vb, <16 x i1> %m, i32 %evl) + ret <16 x double> %v +} + +define <16 x double> @vfmax_vv_v16f64_unmasked(<16 x double> %va, <16 x double> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v16f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v1, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v24 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x double> @llvm.vp.maximum.v16f64(<16 x double> %va, <16 x double> %vb, <16 x i1> %m, i32 %evl) + ret <16 x double> %v +} + +declare <32 x double> @llvm.vp.maximum.v32f64(<32 x double>, <32 x double>, <32 x i1>, i32) + +define <32 x double> @vfmax_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v32f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: vmv1r.v v2, v0 +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, 
a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: bltu a2, a1, .LBB24_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: .LBB24_2: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v26 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vmfeq.vv v26, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v26 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, a2, -16 +; CHECK-NEXT: sltu a1, a2, a0 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v25, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v16, v16, v8, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call <32 x double> @llvm.vp.maximum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vfmax_vv_v32f64_unmasked(<32 x double> %va, <32 x double> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_v32f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v24, (a0) +; 
CHECK-NEXT: li a1, 16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: bltu a2, a1, .LBB25_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: .LBB25_2: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v1, v24, v24 +; CHECK-NEXT: vmv8r.v v16, v24 +; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v8, v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, a2, -16 +; CHECK-NEXT: sltu a1, a2, a0 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v1, v8, v8 +; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: vfmax.vv v16, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %head = insertelement <32 x i1> poison, i1 true, i32 0 + %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer + %v = call <32 x double> @llvm.vp.maximum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll new file mode 100644 index 0000000000000..8ea08bdedd6d3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fminimum-vp.ll @@ -0,0 +1,815 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN + +declare <2 x half> @llvm.vp.minimum.v2f16(<2 x half>, <2 x half>, <2 x i1>, i32) + +define <2 x half> @vfmin_vv_v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i32 
zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_v2f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vfmin.vv v8, v8, v11, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_v2f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v10, v0 +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v8, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v11, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %v = call <2 x half> @llvm.vp.minimum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i32 %evl) + ret <2 x half> %v +} + +define <2 x half> @vfmin_vv_v2f16_unmasked(<2 x half> %va, <2 x half> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_v2f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v10, v9, v9 +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v11 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_v2f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x half> @llvm.vp.minimum.v2f16(<2 x half> %va, <2 x half> %vb, <2 x i1> %m, i32 %evl) + ret <2 x half> %v +} + +declare <4 x half> @llvm.vp.minimum.v4f16(<4 x half>, <4 x half>, <4 x i1>, i32) + +define <4 x half> @vfmin_vv_v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_v4f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vfmin.vv v8, v8, v11, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_v4f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v10, v0 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; 
ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v8, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v11, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %v = call <4 x half> @llvm.vp.minimum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i32 %evl) + ret <4 x half> %v +} + +define <4 x half> @vfmin_vv_v4f16_unmasked(<4 x half> %va, <4 x half> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_v4f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v10, v9, v9 +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v11 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_v4f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 +; ZVFHMIN-NEXT: vmv.v.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x half> @llvm.vp.minimum.v4f16(<4 x half> %va, <4 x half> %vb, <4 x i1> %m, i32 %evl) + ret <4 x half> %v +} + +declare <8 x half> @llvm.vp.minimum.v8f16(<8 x half>, <8 x half>, <8 x i1>, i32) + +define <8 x half> @vfmin_vv_v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_v8f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vfmin.vv v8, v8, v11, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_v8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v10, v0 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v16, v12, v14, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vmfeq.vv v8, v14, v14, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v14, v12, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vfmin.vv v10, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret + %v = call <8 x half> @llvm.vp.minimum.v8f16(<8 x half> %va, <8 x half> 
%vb, <8 x i1> %m, i32 %evl) + ret <8 x half> %v +} + +define <8 x half> @vfmin_vv_v8f16_unmasked(<8 x half> %va, <8 x half> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_v8f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v10, v9, v9 +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv.v.v v0, v10 +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v11 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_v8f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 +; ZVFHMIN-NEXT: vfmin.vv v10, v8, v14 +; ZVFHMIN-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x half> @llvm.vp.minimum.v8f16(<8 x half> %va, <8 x half> %vb, <8 x i1> %m, i32 %evl) + ret <8 x half> %v +} + +declare <16 x half> @llvm.vp.minimum.v16f16(<16 x half>, <16 x half>, <16 x i1>, i32) + +define <16 x half> @vfmin_vv_v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_v16f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v12, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vmfeq.vv v13, v8, v8, v0.t +; ZVFH-NEXT: vmv1r.v v0, v13 +; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmv1r.v v0, v12 +; ZVFH-NEXT: vmfeq.vv v13, v10, v10, v0.t +; ZVFH-NEXT: vmv1r.v v0, v13 +; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v12 +; ZVFH-NEXT: vfmin.vv v8, v8, v14, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_v16f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v12, v0 +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v20, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmfeq.vv v8, v20, v20, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v20, v16, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vfmin.vv v12, v8, v24, v0.t +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret + %v = call <16 x half> @llvm.vp.minimum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> %m, i32 %evl) + ret <16 x half> %v +} + +define <16 x half> @vfmin_vv_v16f16_unmasked(<16 x half> %va, <16 x half> %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_v16f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v12, v10, v10 +; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmv1r.v v0, v12 +; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v14 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: 
vfmin_vv_v16f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 +; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 +; ZVFHMIN-NEXT: vfmin.vv v12, v8, v20 +; ZVFHMIN-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x half> @llvm.vp.minimum.v16f16(<16 x half> %va, <16 x half> %vb, <16 x i1> %m, i32 %evl) + ret <16 x half> %v +} + +declare <2 x float> @llvm.vp.minimum.v2f32(<2 x float>, <2 x float>, <2 x i1>, i32) + +define <2 x float> @vfmin_vv_v2f32(<2 x float> %va, <2 x float> %vb, <2 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmin.vv v8, v8, v11, v0.t +; CHECK-NEXT: ret + %v = call <2 x float> @llvm.vp.minimum.v2f32(<2 x float> %va, <2 x float> %vb, <2 x i1> %m, i32 %evl) + ret <2 x float> %v +} + +define <2 x float> @vfmin_vv_v2f32_unmasked(<2 x float> %va, <2 x float> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v2f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v10, v9, v9 +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v11 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x float> @llvm.vp.minimum.v2f32(<2 x float> %va, <2 x float> %vb, <2 x i1> %m, i32 %evl) + ret <2 x float> %v +} + +declare <4 x float> @llvm.vp.minimum.v4f32(<4 x float>, <4 x float>, <4 x i1>, i32) + +define <4 x float> @vfmin_vv_v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmin.vv v8, v8, v11, v0.t +; CHECK-NEXT: ret + %v = call <4 x float> @llvm.vp.minimum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl) + ret <4 x float> %v +} + +define <4 x float> @vfmin_vv_v4f32_unmasked(<4 x float> %va, <4 x float> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v4f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v10, v9, v9 +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: 
vfmin.vv v8, v8, v11 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x float> @llvm.vp.minimum.v4f32(<4 x float> %va, <4 x float> %vb, <4 x i1> %m, i32 %evl) + ret <4 x float> %v +} + +declare <8 x float> @llvm.vp.minimum.v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) + +define <8 x float> @vfmin_vv_v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v13, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmfeq.vv v13, v10, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vfmin.vv v8, v8, v14, v0.t +; CHECK-NEXT: ret + %v = call <8 x float> @llvm.vp.minimum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl) + ret <8 x float> %v +} + +define <8 x float> @vfmin_vv_v8f32_unmasked(<8 x float> %va, <8 x float> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v8f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v14 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x float> @llvm.vp.minimum.v8f32(<8 x float> %va, <8 x float> %vb, <8 x i1> %m, i32 %evl) + ret <8 x float> %v +} + +declare <16 x float> @llvm.vp.minimum.v16f32(<16 x float>, <16 x float>, <16 x i1>, i32) + +define <16 x float> @vfmin_vv_v16f32(<16 x float> %va, <16 x float> %vb, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v16f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v17, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmfeq.vv v17, v12, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vfmin.vv v8, v8, v20, v0.t +; CHECK-NEXT: ret + %v = call <16 x float> @llvm.vp.minimum.v16f32(<16 x float> %va, <16 x float> %vb, <16 x i1> %m, i32 %evl) + ret <16 x float> %v +} + +define <16 x float> @vfmin_vv_v16f32_unmasked(<16 x float> %va, <16 x float> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v16f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v20 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x float> @llvm.vp.minimum.v16f32(<16 x float> %va, <16 x float> %vb, <16 x i1> %m, i32 %evl) + ret <16 x float> %v +} + +declare <2 x double> @llvm.vp.minimum.v2f64(<2 x double>, <2 x double>, <2 x i1>, i32) + +define <2 x double> @vfmin_vv_v2f64(<2 x double> %va, <2 x double> %vb, <2 
x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmin.vv v8, v8, v11, v0.t +; CHECK-NEXT: ret + %v = call <2 x double> @llvm.vp.minimum.v2f64(<2 x double> %va, <2 x double> %vb, <2 x i1> %m, i32 %evl) + ret <2 x double> %v +} + +define <2 x double> @vfmin_vv_v2f64_unmasked(<2 x double> %va, <2 x double> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v2f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v10, v9, v9 +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v11 +; CHECK-NEXT: ret + %head = insertelement <2 x i1> poison, i1 true, i32 0 + %m = shufflevector <2 x i1> %head, <2 x i1> poison, <2 x i32> zeroinitializer + %v = call <2 x double> @llvm.vp.minimum.v2f64(<2 x double> %va, <2 x double> %vb, <2 x i1> %m, i32 %evl) + ret <2 x double> %v +} + +declare <4 x double> @llvm.vp.minimum.v4f64(<4 x double>, <4 x double>, <4 x i1>, i32) + +define <4 x double> @vfmin_vv_v4f64(<4 x double> %va, <4 x double> %vb, <4 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v13, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmfeq.vv v13, v10, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vfmin.vv v8, v8, v14, v0.t +; CHECK-NEXT: ret + %v = call <4 x double> @llvm.vp.minimum.v4f64(<4 x double> %va, <4 x double> %vb, <4 x i1> %m, i32 %evl) + ret <4 x double> %v +} + +define <4 x double> @vfmin_vv_v4f64_unmasked(<4 x double> %va, <4 x double> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v4f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v14 +; CHECK-NEXT: ret + %head = insertelement <4 x i1> poison, i1 true, i32 0 + %m = shufflevector <4 x i1> %head, <4 x i1> poison, <4 x i32> zeroinitializer + %v = call <4 x double> @llvm.vp.minimum.v4f64(<4 x double> %va, <4 x double> %vb, <4 x i1> %m, i32 %evl) + ret <4 x double> %v +} + +declare <8 x double> @llvm.vp.minimum.v8f64(<8 x double>, <8 x double>, <8 x i1>, i32) + +define <8 x double> @vfmin_vv_v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v17, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmfeq.vv v17, v12, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vfmin.vv v8, v8, v20, v0.t +; CHECK-NEXT: ret + %v = call <8 x double> 
@llvm.vp.minimum.v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 %evl) + ret <8 x double> %v +} + +define <8 x double> @vfmin_vv_v8f64_unmasked(<8 x double> %va, <8 x double> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v8f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v20 +; CHECK-NEXT: ret + %head = insertelement <8 x i1> poison, i1 true, i32 0 + %m = shufflevector <8 x i1> %head, <8 x i1> poison, <8 x i32> zeroinitializer + %v = call <8 x double> @llvm.vp.minimum.v8f64(<8 x double> %va, <8 x double> %vb, <8 x i1> %m, i32 %evl) + ret <8 x double> %v +} + +declare <16 x double> @llvm.vp.minimum.v16f64(<16 x double>, <16 x double>, <16 x i1>, i32) + +define <16 x double> @vfmin_vv_v16f64(<16 x double> %va, <16 x double> %vb, <16 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v16f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmfeq.vv v25, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call <16 x double> @llvm.vp.minimum.v16f64(<16 x double> %va, <16 x double> %vb, <16 x i1> %m, i32 %evl) + ret <16 x double> %v +} + +define <16 x double> @vfmin_vv_v16f64_unmasked(<16 x double> %va, <16 x double> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v16f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v1, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v24 +; CHECK-NEXT: ret + %head = insertelement <16 x i1> poison, i1 true, i32 0 + %m = shufflevector <16 x i1> %head, <16 x i1> poison, <16 x i32> zeroinitializer + %v = call <16 x double> @llvm.vp.minimum.v16f64(<16 x double> %va, <16 x double> %vb, <16 x i1> %m, i32 %evl) + ret <16 x double> %v +} + +declare <32 x double> @llvm.vp.minimum.v32f64(<32 x double>, <32 x double>, <32 x i1>, i32) + +define <32 x double> @vfmin_vv_v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v32f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 5 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; CHECK-NEXT: vmv1r.v 
v2, v0 +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vsetivli zero, 2, e8, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v1, v0, 2 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: bltu a2, a1, .LBB24_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: .LBB24_2: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vmfeq.vv v26, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v26 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vmfeq.vv v26, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v26 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v2 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, a2, -16 +; CHECK-NEXT: sltu a1, a2, a0 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v25, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v16, v16, v8, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call <32 x double> @llvm.vp.minimum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} + +define <32 x double> @vfmin_vv_v32f64_unmasked(<32 x double> 
%va, <32 x double> %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_v32f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: addi a1, a0, 128 +; CHECK-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; CHECK-NEXT: vle64.v v24, (a1) +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: vle64.v v24, (a0) +; CHECK-NEXT: li a1, 16 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: mv a0, a2 +; CHECK-NEXT: bltu a2, a1, .LBB25_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a0, 16 +; CHECK-NEXT: .LBB25_2: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v1, v24, v24 +; CHECK-NEXT: vmv8r.v v16, v24 +; CHECK-NEXT: vmerge.vvm v24, v8, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v8, v8, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: addi a0, a2, -16 +; CHECK-NEXT: sltu a1, a2, a0 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: and a0, a1, a0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v1, v8, v8 +; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v16, v8, v16, v0 +; CHECK-NEXT: vfmin.vv v16, v16, v24 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %head = insertelement <32 x i1> poison, i1 true, i32 0 + %m = shufflevector <32 x i1> %head, <32 x i1> poison, <32 x i32> zeroinitializer + %v = call <32 x double> @llvm.vp.minimum.v32f64(<32 x double> %va, <32 x double> %vb, <32 x i1> %m, i32 %evl) + ret <32 x double> %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll new file mode 100644 index 0000000000000..f4350a1e1ced3 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fmaximum-vp.ll @@ -0,0 +1,1245 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; 
RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \
+; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN
+
+declare <vscale x 1 x half> @llvm.vp.maximum.nxv1f16(<vscale x 1 x half>, <vscale x 1 x half>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x half> @vfmax_vv_nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv1f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vmv1r.v v10, v0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
+; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t
+; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vfmax.vv v8, v8, v11, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv1f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vmv1r.v v10, v0
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v8, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v10
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v11, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v10
+; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+ %v = call <vscale x 1 x half> @llvm.vp.maximum.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x half> %v
+}
+
+define <vscale x 1 x half> @vfmax_vv_nxv1f16_unmasked(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv1f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmfeq.vv v10, v9, v9
+; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
+; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv1f16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x half> @llvm.vp.maximum.nxv1f16(<vscale x 1 x half> %va, <vscale x 1 x half> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x half> %v
+}
+
+declare <vscale x 2 x half> @llvm.vp.maximum.nxv2f16(<vscale x 2 x half>, <vscale x 2 x half>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x half> @vfmax_vv_nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv2f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vmv1r.v v10, v0
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
+; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t
+; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vfmax.vv v8, v8, v11, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv2f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vmv1r.v v10, v0
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v8, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v10
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v11, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v10
+; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9, v0.t
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+ %v = call <vscale x 2 x half> @llvm.vp.maximum.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x half> %v
+}
+
+define <vscale x 2 x half> @vfmax_vv_nxv2f16_unmasked(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv2f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmfeq.vv v10, v9, v9
+; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
+; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv2f16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11
+; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0
+; ZVFHMIN-NEXT: vmv.v.v v0, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v9, v8, v9
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9
+; ZVFHMIN-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x half> @llvm.vp.maximum.nxv2f16(<vscale x 2 x half> %va, <vscale x 2 x half> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x half> %v
+}
+
+declare <vscale x 4 x half> @llvm.vp.maximum.nxv4f16(<vscale x 4 x half>, <vscale x 4 x half>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x half> @vfmax_vv_nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv4f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vmv1r.v v10, v0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
+; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t
+; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFH-NEXT: vmv1r.v v0, v10
+; ZVFH-NEXT: vfmax.vv v8, v8, v11, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv4f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vmv1r.v v10, v0
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: vmerge.vvm v16, v12, v14, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v10
+; ZVFHMIN-NEXT: vmfeq.vv v8, v14, v14, v0.t
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v14, v12, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v10
+; ZVFHMIN-NEXT: vfmax.vv v10, v8, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+ %v = call <vscale x 4 x half> @llvm.vp.maximum.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x half> %v
+}
+
+define <vscale x 4 x half> @vfmax_vv_nxv4f16_unmasked(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv4f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmfeq.vv v10, v9, v9
+; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0
+; ZVFH-NEXT: vmv.v.v v0, v10
+; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0
+; ZVFH-NEXT: vfmax.vv v8, v8, v11
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv4f16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12
+; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0
+; ZVFHMIN-NEXT: vfmax.vv v10, v8, v14
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10
+; ZVFHMIN-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x half> @llvm.vp.maximum.nxv4f16(<vscale x 4 x half> %va, <vscale x 4 x half> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x half> %v
+}
+
+declare <vscale x 8 x half> @llvm.vp.maximum.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x half> @vfmax_vv_nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv8f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vmv1r.v v12, v0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vmfeq.vv v13, v8, v8, v0.t
+; ZVFH-NEXT: vmv1r.v v0, v13
+; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0
+; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vmfeq.vv v13, v10, v10, v0.t
+; ZVFH-NEXT: vmv1r.v v0, v13
+; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
+; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vfmax.vv v8, v8, v14, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv8f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vmv1r.v v12, v0
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v20, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v12
+; ZVFHMIN-NEXT: vmfeq.vv v8, v20, v20, v0.t
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v20, v16, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v12
+; ZVFHMIN-NEXT: vfmax.vv v12, v8, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+ %v = call <vscale x 8 x half> @llvm.vp.maximum.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x half> %v
+}
+
+define <vscale x 8 x half> @vfmax_vv_nxv8f16_unmasked(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv8f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmfeq.vv v12, v10, v10
+; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0
+; ZVFH-NEXT: vmv1r.v v0, v12
+; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0
+; ZVFH-NEXT: vfmax.vv v8, v8, v14
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv8f16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0
+; ZVFHMIN-NEXT: vfmax.vv v12, v8, v20
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12
+; ZVFHMIN-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x half> @llvm.vp.maximum.nxv8f16(<vscale x 8 x half> %va, <vscale x 8 x half> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x half> %v
+}
+
+declare <vscale x 16 x half> @llvm.vp.maximum.nxv16f16(<vscale x 16 x half>, <vscale x 16 x half>, <vscale x 16 x i1>, i32)
+
+define <vscale x 16 x half> @vfmax_vv_nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv16f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vmv1r.v v16, v0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vmfeq.vv v17, v8, v8, v0.t
+; ZVFH-NEXT: vmv1r.v v0, v17
+; ZVFH-NEXT: vmerge.vvm v20, v8, v12, v0
+; ZVFH-NEXT: vmv1r.v v0, v16
+; ZVFH-NEXT: vmfeq.vv v17, v12, v12, v0.t
+; ZVFH-NEXT: vmv1r.v v0, v17
+; ZVFH-NEXT: vmerge.vvm v8, v12, v8, v0
+; ZVFH-NEXT: vmv1r.v v0, v16
+; ZVFH-NEXT: vfmax.vv v8, v8, v20, v0.t
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv16f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: vmv1r.v v1, v0
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v8, v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv1r.v v0, v1
+; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v1
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16, v0.t
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: ret
+ %v = call <vscale x 16 x half> @llvm.vp.maximum.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x half> %v
+}
+
+define <vscale x 16 x half> @vfmax_vv_nxv16f16_unmasked(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv16f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmfeq.vv v16, v12, v12
+; ZVFH-NEXT: vmerge.vvm v20, v8, v12, v0
+; ZVFH-NEXT: vmv1r.v v0, v16
+; ZVFH-NEXT: vmerge.vvm v8, v12, v8, v0
+; ZVFH-NEXT: vfmax.vv v8, v8, v20
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv16f16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 3
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v1, v24, v24
+; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv1r.v v0, v1
+; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v16, v8, v16
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: ret
+ %head = insertelement <vscale x 16 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 16 x i1> %head, <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer
+ %v = call <vscale x 16 x half> @llvm.vp.maximum.nxv16f16(<vscale x 16 x half> %va, <vscale x 16 x half> %vb, <vscale x 16 x i1> %m, i32 %evl)
+ ret <vscale x 16 x half> %v
+}
+
+declare <vscale x 32 x half> @llvm.vp.maximum.nxv32f16(<vscale x 32 x half>, <vscale x 32 x half>, <vscale x 32 x i1>, i32)
+
+define <vscale x 32 x half> @vfmax_vv_nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv32f16:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: addi sp, sp, -16
+; ZVFH-NEXT: .cfi_def_cfa_offset 16
+; ZVFH-NEXT: csrr a1, vlenb
+; ZVFH-NEXT: slli a1, a1, 3
+; ZVFH-NEXT: sub sp, sp, a1
+; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb
+; ZVFH-NEXT: vmv1r.v v1, v0
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vmfeq.vv v25, v8, v8, v0.t
+; ZVFH-NEXT: vmv1r.v v0, v25
+; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
+; ZVFH-NEXT: addi a0, sp, 16
+; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFH-NEXT: vmv1r.v v0, v1
+; ZVFH-NEXT: vmfeq.vv v25, v16, v16, v0.t
+; ZVFH-NEXT: vmv1r.v v0, v25
+; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
+; ZVFH-NEXT: vmv1r.v v0, v1
+; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload
+; ZVFH-NEXT: vfmax.vv v8, v8, v16, v0.t
+; ZVFH-NEXT: csrr a0, vlenb
+; ZVFH-NEXT: slli a0, a0, 3
+; ZVFH-NEXT: add sp, sp, a0
+; ZVFH-NEXT: addi sp, sp, 16
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv32f16:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: li a2, 34
+; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: li a2, 25
+; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv8r.v v16, v8
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a1, a2, 1
+; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: srli a2, a2, 2
+; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: csrr a4, vlenb
+; ZVFHMIN-NEXT: li a5, 24
+; ZVFHMIN-NEXT: mul a4, a4, a5
+; ZVFHMIN-NEXT: add a4, sp, a4
+; ZVFHMIN-NEXT: addi a4, a4, 16
+; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 4
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: li a4, 25
+; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v4
+; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v0, v12
+; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: vmfeq.vv v12, v16, v16, v0.t
+; ZVFHMIN-NEXT: vmv1r.v v0, v12
+; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2
+; ZVFHMIN-NEXT: # %bb.1:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB10_2:
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: li a2, 24
+; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmfeq.vv v8, v24, v24, v0.t
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: addi a1, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: li a2, 25
+; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 4
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmerge.vvm v24, v24, v16, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 25
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 24
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmv1r.v v0, v9
+; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t
+; ZVFHMIN-NEXT: vmv1r.v v0, v8
+; ZVFHMIN-NEXT: addi a0, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v9
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 25
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 3
+; ZVFHMIN-NEXT: add a0, sp, a0
+; ZVFHMIN-NEXT: addi a0, a0, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: li a1, 34
+; ZVFHMIN-NEXT: mul a0, a0, a1
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: ret
+ %v = call <vscale x 32 x half> @llvm.vp.maximum.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+define <vscale x 32 x half> @vfmax_vv_nxv32f16_unmasked(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, i32 zeroext %evl) {
+; ZVFH-LABEL: vfmax_vv_nxv32f16_unmasked:
+; ZVFH: # %bb.0:
+; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma
+; ZVFH-NEXT: vmfeq.vv v0, v8, v8
+; ZVFH-NEXT: vmfeq.vv v1, v16, v16
+; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0
+; ZVFH-NEXT: vmv1r.v v0, v1
+; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0
+; ZVFH-NEXT: vfmax.vv v8, v8, v24
+; ZVFH-NEXT: ret
+;
+; ZVFHMIN-LABEL: vfmax_vv_nxv32f16_unmasked:
+; ZVFHMIN: # %bb.0:
+; ZVFHMIN-NEXT: addi sp, sp, -16
+; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 5
+; ZVFHMIN-NEXT: sub sp, sp, a1
+; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb
+; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma
+; ZVFHMIN-NEXT: vmset.m v24
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a1, a2, 1
+; ZVFHMIN-NEXT: sub a3, a0, a1
+; ZVFHMIN-NEXT: sltu a4, a0, a3
+; ZVFHMIN-NEXT: addi a4, a4, -1
+; ZVFHMIN-NEXT: and a3, a4, a3
+; ZVFHMIN-NEXT: srli a2, a2, 2
+; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma
+; ZVFHMIN-NEXT: vslidedown.vx v1, v24, a2
+; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 4
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v0, v1
+; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vmv4r.v v8, v16
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: li a4, 24
+; ZVFHMIN-NEXT: mul a2, a2, a4
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20
+; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmv1r.v v0, v12
+; ZVFHMIN-NEXT: addi a2, sp, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill
+; ZVFHMIN-NEXT: vmv1r.v v0, v1
+; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t
+; ZVFHMIN-NEXT: vmv1r.v v0, v12
+; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v1
+; ZVFHMIN-NEXT: csrr a2, vlenb
+; ZVFHMIN-NEXT: slli a2, a2, 3
+; ZVFHMIN-NEXT: add a2, sp, a2
+; ZVFHMIN-NEXT: addi a2, a2, 16
+; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfmax.vv v16, v16, v8, v0.t
+; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16
+; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2
+; ZVFHMIN-NEXT: # %bb.1:
+; ZVFHMIN-NEXT: mv a0, a1
+; ZVFHMIN-NEXT: .LBB11_2:
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: slli a1, a1, 4
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8
+; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: csrr a1, vlenb
+; ZVFHMIN-NEXT: li a2, 24
+; ZVFHMIN-NEXT: mul a1, a1, a2
+; ZVFHMIN-NEXT: add a1, sp, a1
+; ZVFHMIN-NEXT: addi a1, a1, 16
+; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload
+; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24
+; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma
+; ZVFHMIN-NEXT: vmfeq.vv v1, v16, v16
+; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0
+; ZVFHMIN-NEXT: vmv1r.v v0, v1
+; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0
+; ZVFHMIN-NEXT: vfmax.vv v16, v16, v24
+; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma
+; ZVFHMIN-NEXT: vfncvt.f.f.w v0, v16
+; ZVFHMIN-NEXT: vmv8r.v v8, v0
+; ZVFHMIN-NEXT: csrr a0, vlenb
+; ZVFHMIN-NEXT: slli a0, a0, 5
+; ZVFHMIN-NEXT: add sp, sp, a0
+; ZVFHMIN-NEXT: addi sp, sp, 16
+; ZVFHMIN-NEXT: ret
+ %head = insertelement <vscale x 32 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 32 x i1> %head, <vscale x 32 x i1> poison, <vscale x 32 x i32> zeroinitializer
+ %v = call <vscale x 32 x half> @llvm.vp.maximum.nxv32f16(<vscale x 32 x half> %va, <vscale x 32 x half> %vb, <vscale x 32 x i1> %m, i32 %evl)
+ ret <vscale x 32 x half> %v
+}
+
+declare <vscale x 1 x float> @llvm.vp.maximum.nxv1f32(<vscale x 1 x float>, <vscale x 1 x float>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x float> @vfmax_vv_nxv1f32(<vscale x 1 x float> %va, <vscale x 1 x float> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv1f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfmax.vv v8, v8, v11, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x float> @llvm.vp.maximum.nxv1f32(<vscale x 1 x float> %va, <vscale x 1 x float> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x float> %v
+}
+
+define <vscale x 1 x float> @vfmax_vv_nxv1f32_unmasked(<vscale x 1 x float> %va, <vscale x 1 x float> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv1f32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v10, v9, v9
+; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x float> @llvm.vp.maximum.nxv1f32(<vscale x 1 x float> %va, <vscale x 1 x float> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x float> %v
+}
+
+declare <vscale x 2 x float> @llvm.vp.maximum.nxv2f32(<vscale x 2 x float>, <vscale x 2 x float>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x float> @vfmax_vv_nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv2f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfmax.vv v8, v8, v11, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x float> @llvm.vp.maximum.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x float> %v
+}
+
+define <vscale x 2 x float> @vfmax_vv_nxv2f32_unmasked(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv2f32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v10, v9, v9
+; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x float> @llvm.vp.maximum.nxv2f32(<vscale x 2 x float> %va, <vscale x 2 x float> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x float> %v
+}
+
+declare <vscale x 4 x float> @llvm.vp.maximum.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x float> @vfmax_vv_nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv4f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v13, v8, v8, v0.t
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmfeq.vv v13, v10, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfmax.vv v8, v8, v14, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x float> @llvm.vp.maximum.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x float> %v
+}
+
+define <vscale x 4 x float> @vfmax_vv_nxv4f32_unmasked(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv4f32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v12, v10, v10
+; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 4 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 4 x i1> %head, <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer
+ %v = call <vscale x 4 x float> @llvm.vp.maximum.nxv4f32(<vscale x 4 x float> %va, <vscale x 4 x float> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x float> %v
+}
+
+declare <vscale x 8 x float> @llvm.vp.maximum.nxv8f32(<vscale x 8 x float>, <vscale x 8 x float>, <vscale x 8 x i1>, i32)
+
+define <vscale x 8 x float> @vfmax_vv_nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, <vscale x 8 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv8f32:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v16, v0
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v17, v8, v8, v0.t
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmfeq.vv v17, v12, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfmax.vv v8, v8, v20, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 8 x float> @llvm.vp.maximum.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x float> %v
+}
+
+define <vscale x 8 x float> @vfmax_vv_nxv8f32_unmasked(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv8f32_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v16, v12, v12
+; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 8 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 8 x i1> %head, <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer
+ %v = call <vscale x 8 x float> @llvm.vp.maximum.nxv8f32(<vscale x 8 x float> %va, <vscale x 8 x float> %vb, <vscale x 8 x i1> %m, i32 %evl)
+ ret <vscale x 8 x float> %v
+}
+
+declare <vscale x 1 x double> @llvm.vp.maximum.nxv1f64(<vscale x 1 x double>, <vscale x 1 x double>, <vscale x 1 x i1>, i32)
+
+define <vscale x 1 x double> @vfmax_vv_nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, <vscale x 1 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv1f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v10, v0
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t
+; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v10
+; CHECK-NEXT: vfmax.vv v8, v8, v11, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 1 x double> @llvm.vp.maximum.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x double> %v
+}
+
+define <vscale x 1 x double> @vfmax_vv_nxv1f64_unmasked(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv1f64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v10, v9, v9
+; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0
+; CHECK-NEXT: vmv.v.v v0, v10
+; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v11
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 1 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 1 x i1> %head, <vscale x 1 x i1> poison, <vscale x 1 x i32> zeroinitializer
+ %v = call <vscale x 1 x double> @llvm.vp.maximum.nxv1f64(<vscale x 1 x double> %va, <vscale x 1 x double> %vb, <vscale x 1 x i1> %m, i32 %evl)
+ ret <vscale x 1 x double> %v
+}
+
+declare <vscale x 2 x double> @llvm.vp.maximum.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, i32)
+
+define <vscale x 2 x double> @vfmax_vv_nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, <vscale x 2 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv2f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v12, v0
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v13, v8, v8, v0.t
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmfeq.vv v13, v10, v10, v0.t
+; CHECK-NEXT: vmv1r.v v0, v13
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vfmax.vv v8, v8, v14, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 2 x double> @llvm.vp.maximum.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x double> %v
+}
+
+define <vscale x 2 x double> @vfmax_vv_nxv2f64_unmasked(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv2f64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v12, v10, v10
+; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0
+; CHECK-NEXT: vmv1r.v v0, v12
+; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v14
+; CHECK-NEXT: ret
+ %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
+ %m = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
+ %v = call <vscale x 2 x double> @llvm.vp.maximum.nxv2f64(<vscale x 2 x double> %va, <vscale x 2 x double> %vb, <vscale x 2 x i1> %m, i32 %evl)
+ ret <vscale x 2 x double> %v
+}
+
+declare <vscale x 4 x double> @llvm.vp.maximum.nxv4f64(<vscale x 4 x double>, <vscale x 4 x double>, <vscale x 4 x i1>, i32)
+
+define <vscale x 4 x double> @vfmax_vv_nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x double> %vb, <vscale x 4 x i1> %m, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv4f64:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vmv1r.v v16, v0
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v17, v8, v8, v0.t
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmfeq.vv v17, v12, v12, v0.t
+; CHECK-NEXT: vmv1r.v v0, v17
+; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vfmax.vv v8, v8, v20, v0.t
+; CHECK-NEXT: ret
+ %v = call <vscale x 4 x double> @llvm.vp.maximum.nxv4f64(<vscale x 4 x double> %va, <vscale x 4 x double> %vb, <vscale x 4 x i1> %m, i32 %evl)
+ ret <vscale x 4 x double> %v
+}
+
+define <vscale x 4 x double> @vfmax_vv_nxv4f64_unmasked(<vscale x 4 x double> %va, <vscale x 4 x double> %vb, i32 zeroext %evl) {
+; CHECK-LABEL: vfmax_vv_nxv4f64_unmasked:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma
+; CHECK-NEXT: vmfeq.vv v0, v8, v8
+; CHECK-NEXT: vmfeq.vv v16, v12, v12
+; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0
+; CHECK-NEXT: vmv1r.v v0, v16
+; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0
+; CHECK-NEXT: vfmax.vv v8, v8, v20
+; CHECK-NEXT: ret
+
%head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.maximum.nxv4f64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.maximum.nxv8f64(, , , i32) + +define @vfmax_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v1, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmfeq.vv v25, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv8f64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv8f64_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv8f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v1, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v24 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.maximum.nxv8f64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.maximum.nxv16f64(, , , i32) + +define @vfmax_vv_nxv16f64( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv16f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a3, 42 +; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a3, a1, 3 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vl8re64.v v24, (a3) +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: srli a3, a1, 3 +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vx v24, v0, a3 +; CHECK-NEXT: sub a3, a2, a1 +; CHECK-NEXT: sltu a4, a2, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: vl8re64.v v0, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a4, a0, 5 +; CHECK-NEXT: add a0, a4, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-NEXT: vmv1r.v v1, v24 +; 
CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vmfeq.vv v26, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v26 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmfeq.vv v17, v24, v24, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a0, a0, a3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a2, a1, .LBB28_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: .LBB28_2: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v1, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v25, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 5 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmax.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 42 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.maximum.nxv16f64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmax_vv_nxv16f64_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmax_vv_nxv16f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; 
CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a3, a1, 3 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vl8re64.v v24, (a3) +; CHECK-NEXT: vl8re64.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: sub a0, a2, a1 +; CHECK-NEXT: sltu a3, a2, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a0, a3, a0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v1, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vfmax.vv v8, v16, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a2, a1, .LBB29_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: .LBB29_2: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v1, v8, v8 +; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vfmax.vv v8, v8, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.maximum.nxv16f64( %va, %vb, %m, i32 %evl) + ret %v +} diff --git a/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll new file mode 100644 index 0000000000000..1f8af732e3f99 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/fminimum-vp.ll @@ -0,0 +1,1245 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfh,+v,+m -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=ilp32d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -mattr=+d,+zfh,+zvfhmin,+v,+m -target-abi=lp64d \ +; RUN: -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN + +declare @llvm.vp.minimum.nxv1f16(, , , i32) + +define @vfmin_vv_nxv1f16( %va, %vb, %m, i32 zeroext %evl) { +; ZVFH-LABEL: 
vfmin_vv_nxv1f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vfmin.vv v8, v8, v11, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv1f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v10, v0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v8, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v11, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %v = call @llvm.vp.minimum.nxv1f16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv1f16_unmasked( %va, %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_nxv1f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v10, v9, v9 +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v11 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv1f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv1f16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv2f16(, , , i32) + +define @vfmin_vv_nxv2f16( %va, %vb, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_nxv2f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vfmin.vv v8, v8, v11, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv2f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v10, v0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v11, v11, v0.t +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmerge.vvm v9, v11, v8, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; 
ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFHMIN-NEXT: vmerge.vvm v8, v8, v11, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9, v0.t +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %v = call @llvm.vp.minimum.nxv2f16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv2f16_unmasked( %va, %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_nxv2f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v10, v9, v9 +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v11 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv2f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v11, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v11, v11 +; ZVFHMIN-NEXT: vmerge.vvm v9, v10, v11, v0 +; ZVFHMIN-NEXT: vmv.v.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v11, v10, v0 +; ZVFHMIN-NEXT: vfmin.vv v9, v8, v9 +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v9 +; ZVFHMIN-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv2f16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv4f16(, , , i32) + +define @vfmin_vv_nxv4f16( %va, %vb, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_nxv4f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v10, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8, v0.t +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vmfeq.vv v0, v9, v9, v0.t +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v10 +; ZVFH-NEXT: vfmin.vv v8, v8, v11, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv4f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v10, v0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12, v0.t +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v14, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v16, v12, v14, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vmfeq.vv v8, v14, v14, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v14, v12, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v10 +; ZVFHMIN-NEXT: vfmin.vv v10, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret + %v = call @llvm.vp.minimum.nxv4f16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv4f16_unmasked( %va, %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_nxv4f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v10, v9, v9 +; ZVFH-NEXT: vmerge.vvm v11, v8, v9, v0 +; ZVFH-NEXT: vmv.v.v v0, v10 +; ZVFH-NEXT: vmerge.vvm v8, v9, v8, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v11 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv4f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: 
vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v10, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v10, v10 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v9 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v12, v12 +; ZVFHMIN-NEXT: vmerge.vvm v14, v10, v12, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v12, v10, v0 +; ZVFHMIN-NEXT: vfmin.vv v10, v8, v14 +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v10 +; ZVFHMIN-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv4f16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv8f16(, , , i32) + +define @vfmin_vv_nxv8f16( %va, %vb, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_nxv8f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v12, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vmfeq.vv v13, v8, v8, v0.t +; ZVFH-NEXT: vmv1r.v v0, v13 +; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmv1r.v v0, v12 +; ZVFH-NEXT: vmfeq.vv v13, v10, v10, v0.t +; ZVFH-NEXT: vmv1r.v v0, v13 +; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v12 +; ZVFH-NEXT: vfmin.vv v8, v8, v14, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv8f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vmv1r.v v12, v0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v20, v10 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v24, v16, v20, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmfeq.vv v8, v20, v20, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v20, v16, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vfmin.vv v12, v8, v24, v0.t +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret + %v = call @llvm.vp.minimum.nxv8f16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv8f16_unmasked( %va, %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_nxv8f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v12, v10, v10 +; ZVFH-NEXT: vmerge.vvm v14, v8, v10, v0 +; ZVFH-NEXT: vmv1r.v v0, v12 +; ZVFH-NEXT: vmerge.vvm v8, v10, v8, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v14 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv8f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v12, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v12, v12 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v10 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16 +; ZVFHMIN-NEXT: vmerge.vvm v20, v12, v16, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v12, v0 +; ZVFHMIN-NEXT: vfmin.vv v12, v8, v20 +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v12 +; ZVFHMIN-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv8f16( %va, %vb, %m, i32 
%evl) + ret %v +} + +declare @llvm.vp.minimum.nxv16f16(, , , i32) + +define @vfmin_vv_nxv16f16( %va, %vb, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_nxv16f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vmv1r.v v16, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vmfeq.vv v17, v8, v8, v0.t +; ZVFH-NEXT: vmv1r.v v0, v17 +; ZVFH-NEXT: vmerge.vvm v20, v8, v12, v0 +; ZVFH-NEXT: vmv1r.v v0, v16 +; ZVFH-NEXT: vmfeq.vv v17, v12, v12, v0.t +; ZVFH-NEXT: vmv1r.v v0, v17 +; ZVFH-NEXT: vmerge.vvm v8, v12, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v16 +; ZVFH-NEXT: vfmin.vv v8, v8, v20, v0.t +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv16f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFHMIN-NEXT: vmv1r.v v1, v0 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v8, v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v12 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv1r.v v0, v1 +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v1 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmin.vv v16, v8, v16, v0.t +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret + %v = call @llvm.vp.minimum.nxv16f16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv16f16_unmasked( %va, %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_nxv16f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v16, v12, v12 +; ZVFH-NEXT: vmerge.vvm v20, v8, v12, v0 +; ZVFH-NEXT: vmv1r.v v0, v16 +; ZVFH-NEXT: vmerge.vvm v8, v12, v8, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v20 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv16f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 3 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v8 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v16, v16 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v1, v24, v24 +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv1r.v v0, v1 +; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0 +; ZVFHMIN-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; 
ZVFHMIN-NEXT: vfmin.vv v16, v8, v16 +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv16f16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv32f16(, , , i32) + +define @vfmin_vv_nxv32f16( %va, %vb, %m, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_nxv32f16: +; ZVFH: # %bb.0: +; ZVFH-NEXT: addi sp, sp, -16 +; ZVFH-NEXT: .cfi_def_cfa_offset 16 +; ZVFH-NEXT: csrr a1, vlenb +; ZVFH-NEXT: slli a1, a1, 3 +; ZVFH-NEXT: sub sp, sp, a1 +; ZVFH-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; ZVFH-NEXT: vmv1r.v v1, v0 +; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vmfeq.vv v25, v8, v8, v0.t +; ZVFH-NEXT: vmv1r.v v0, v25 +; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFH-NEXT: addi a0, sp, 16 +; ZVFH-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFH-NEXT: vmv1r.v v0, v1 +; ZVFH-NEXT: vmfeq.vv v25, v16, v16, v0.t +; ZVFH-NEXT: vmv1r.v v0, v25 +; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0 +; ZVFH-NEXT: vmv1r.v v0, v1 +; ZVFH-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; ZVFH-NEXT: vfmin.vv v8, v8, v16, v0.t +; ZVFH-NEXT: csrr a0, vlenb +; ZVFH-NEXT: slli a0, a0, 3 +; ZVFH-NEXT: add sp, sp, a0 +; ZVFH-NEXT: addi sp, sp, 16 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv32f16: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: li a2, 34 +; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x22, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 34 * vlenb +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: li a2, 25 +; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv8r.v v16, v8 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a1, a2, 1 +; ZVFHMIN-NEXT: sub a3, a0, a1 +; ZVFHMIN-NEXT: sltu a4, a0, a3 +; ZVFHMIN-NEXT: addi a4, a4, -1 +; ZVFHMIN-NEXT: and a3, a4, a3 +; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: csrr a4, vlenb +; ZVFHMIN-NEXT: li a5, 24 +; ZVFHMIN-NEXT: mul a4, a4, a5 +; ZVFHMIN-NEXT: add a4, sp, a4 +; ZVFHMIN-NEXT: addi a4, a4, 16 +; ZVFHMIN-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vslidedown.vx v0, v0, a2 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs1r.v v0, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: li a4, 25 +; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, 
v4 +; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmerge.vvm v8, v24, v16, v0 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vl1r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: vmfeq.vv v12, v16, v16, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v12, v16 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: bltu a0, a1, .LBB10_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB10_2: +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v16 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: li a2, 24 +; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vmfeq.vv v8, v24, v24, v0.t +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vs1r.v v8, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: addi a1, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: li a2, 25 +; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v0 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 4 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl1r.v v0, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vmerge.vvm v24, v24, v16, v0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 25 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 24 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl1r.v v9, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: vmfeq.vv v8, v16, v16, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v8 +; ZVFHMIN-NEXT: addi a0, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v24, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v9 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 25 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; 
ZVFHMIN-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24, v0.t +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 3 +; ZVFHMIN-NEXT: add a0, sp, a0 +; ZVFHMIN-NEXT: addi a0, a0, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfncvt.f.f.w v8, v16 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: li a1, 34 +; ZVFHMIN-NEXT: mul a0, a0, a1 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret + %v = call @llvm.vp.minimum.nxv32f16( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv32f16_unmasked( %va, %vb, i32 zeroext %evl) { +; ZVFH-LABEL: vfmin_vv_nxv32f16_unmasked: +; ZVFH: # %bb.0: +; ZVFH-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; ZVFH-NEXT: vmfeq.vv v0, v8, v8 +; ZVFH-NEXT: vmfeq.vv v1, v16, v16 +; ZVFH-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFH-NEXT: vmv1r.v v0, v1 +; ZVFH-NEXT: vmerge.vvm v8, v16, v8, v0 +; ZVFH-NEXT: vfmin.vv v8, v8, v24 +; ZVFH-NEXT: ret +; +; ZVFHMIN-LABEL: vfmin_vv_nxv32f16_unmasked: +; ZVFHMIN: # %bb.0: +; ZVFHMIN-NEXT: addi sp, sp, -16 +; ZVFHMIN-NEXT: .cfi_def_cfa_offset 16 +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 5 +; ZVFHMIN-NEXT: sub sp, sp, a1 +; ZVFHMIN-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 32 * vlenb +; ZVFHMIN-NEXT: vsetvli a1, zero, e8, m4, ta, ma +; ZVFHMIN-NEXT: vmset.m v24 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a1, a2, 1 +; ZVFHMIN-NEXT: sub a3, a0, a1 +; ZVFHMIN-NEXT: sltu a4, a0, a3 +; ZVFHMIN-NEXT: addi a4, a4, -1 +; ZVFHMIN-NEXT: and a3, a4, a3 +; ZVFHMIN-NEXT: srli a2, a2, 2 +; ZVFHMIN-NEXT: vsetvli a4, zero, e8, mf2, ta, ma +; ZVFHMIN-NEXT: vslidedown.vx v1, v24, a2 +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v12 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vs8r.v v24, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v0, v1 +; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vmv4r.v v8, v16 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: li a4, 24 +; ZVFHMIN-NEXT: mul a2, a2, a4 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vfwcvt.f.f.v v24, v20 +; ZVFHMIN-NEXT: vsetvli zero, a3, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: addi a2, sp, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vmerge.vvm v8, v16, v24, v0 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vs8r.v v8, (a2) # Unknown-size Folded Spill +; ZVFHMIN-NEXT: vmv1r.v v0, v1 +; ZVFHMIN-NEXT: vmfeq.vv v12, v24, v24, v0.t +; ZVFHMIN-NEXT: vmv1r.v v0, v12 +; ZVFHMIN-NEXT: vmerge.vvm v16, v24, v16, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v1 +; ZVFHMIN-NEXT: csrr a2, vlenb +; ZVFHMIN-NEXT: slli a2, a2, 3 +; ZVFHMIN-NEXT: add a2, sp, a2 +; ZVFHMIN-NEXT: addi a2, a2, 16 +; ZVFHMIN-NEXT: vl8r.v v8, (a2) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfmin.vv v16, v16, v8, v0.t +; ZVFHMIN-NEXT: vsetvli a2, zero, e16, 
m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v4, v16 +; ZVFHMIN-NEXT: bltu a0, a1, .LBB11_2 +; ZVFHMIN-NEXT: # %bb.1: +; ZVFHMIN-NEXT: mv a0, a1 +; ZVFHMIN-NEXT: .LBB11_2: +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: slli a1, a1, 4 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v8, v16 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v0, v8, v8 +; ZVFHMIN-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: csrr a1, vlenb +; ZVFHMIN-NEXT: li a2, 24 +; ZVFHMIN-NEXT: mul a1, a1, a2 +; ZVFHMIN-NEXT: add a1, sp, a1 +; ZVFHMIN-NEXT: addi a1, a1, 16 +; ZVFHMIN-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; ZVFHMIN-NEXT: vfwcvt.f.f.v v16, v24 +; ZVFHMIN-NEXT: vsetvli zero, a0, e32, m8, ta, ma +; ZVFHMIN-NEXT: vmfeq.vv v1, v16, v16 +; ZVFHMIN-NEXT: vmerge.vvm v24, v8, v16, v0 +; ZVFHMIN-NEXT: vmv1r.v v0, v1 +; ZVFHMIN-NEXT: vmerge.vvm v16, v16, v8, v0 +; ZVFHMIN-NEXT: vfmin.vv v16, v16, v24 +; ZVFHMIN-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; ZVFHMIN-NEXT: vfncvt.f.f.w v0, v16 +; ZVFHMIN-NEXT: vmv8r.v v8, v0 +; ZVFHMIN-NEXT: csrr a0, vlenb +; ZVFHMIN-NEXT: slli a0, a0, 5 +; ZVFHMIN-NEXT: add sp, sp, a0 +; ZVFHMIN-NEXT: addi sp, sp, 16 +; ZVFHMIN-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv32f16( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv1f32(, , , i32) + +define @vfmin_vv_nxv1f32( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv1f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmin.vv v8, v8, v11, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv1f32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv1f32_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv1f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, mf2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v10, v9, v9 +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v11 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv1f32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv2f32(, , , i32) + +define @vfmin_vv_nxv2f32( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmin.vv v8, v8, v11, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv2f32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv2f32_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv2f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v10, v9, v9 +; 
CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v11 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv2f32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv4f32(, , , i32) + +define @vfmin_vv_nxv4f32( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v13, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmfeq.vv v13, v10, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vfmin.vv v8, v8, v14, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv4f32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv4f32_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv4f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v14 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv4f32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv8f32(, , , i32) + +define @vfmin_vv_nxv8f32( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v17, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmfeq.vv v17, v12, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vfmin.vv v8, v8, v20, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv8f32( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv8f32_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv8f32_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e32, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v20 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv8f32( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv1f64(, , , i32) + +define @vfmin_vv_nxv1f64( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv1f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v10, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8, v0.t +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vmfeq.vv v0, v9, v9, v0.t +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v10 +; CHECK-NEXT: vfmin.vv v8, v8, v11, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv1f64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv1f64_unmasked( %va, %vb, i32 zeroext %evl) { +; 
CHECK-LABEL: vfmin_vv_nxv1f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m1, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v10, v9, v9 +; CHECK-NEXT: vmerge.vvm v11, v8, v9, v0 +; CHECK-NEXT: vmv.v.v v0, v10 +; CHECK-NEXT: vmerge.vvm v8, v9, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v11 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv1f64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv2f64(, , , i32) + +define @vfmin_vv_nxv2f64( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v12, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v13, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmfeq.vv v13, v10, v10, v0.t +; CHECK-NEXT: vmv1r.v v0, v13 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vfmin.vv v8, v8, v14, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv2f64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv2f64_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv2f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m2, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v12, v10, v10 +; CHECK-NEXT: vmerge.vvm v14, v8, v10, v0 +; CHECK-NEXT: vmv1r.v v0, v12 +; CHECK-NEXT: vmerge.vvm v8, v10, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v14 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv2f64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv4f64(, , , i32) + +define @vfmin_vv_nxv4f64( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vmv1r.v v16, v0 +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v17, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmfeq.vv v17, v12, v12, v0.t +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vfmin.vv v8, v8, v20, v0.t +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv4f64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv4f64_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv4f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v16, v12, v12 +; CHECK-NEXT: vmerge.vvm v20, v8, v12, v0 +; CHECK-NEXT: vmv1r.v v0, v16 +; CHECK-NEXT: vmerge.vvm v8, v12, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v20 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv4f64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv8f64(, , , i32) + +define @vfmin_vv_nxv8f64( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv8f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; CHECK-NEXT: vmv1r.v v1, v0 +; 
CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmfeq.vv v25, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv8f64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv8f64_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv8f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v8, v8 +; CHECK-NEXT: vmfeq.vv v1, v16, v16 +; CHECK-NEXT: vmerge.vvm v24, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v8, v16, v8, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v24 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv8f64( %va, %vb, %m, i32 %evl) + ret %v +} + +declare @llvm.vp.minimum.nxv16f64(, , , i32) + +define @vfmin_vv_nxv16f64( %va, %vb, %m, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv16f64: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a3, 42 +; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x2a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 42 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a3, a1, 3 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vl8re64.v v24, (a3) +; CHECK-NEXT: csrr a3, vlenb +; CHECK-NEXT: slli a3, a3, 4 +; CHECK-NEXT: add a3, sp, a3 +; CHECK-NEXT: addi a3, a3, 16 +; CHECK-NEXT: vs8r.v v24, (a3) # Unknown-size Folded Spill +; CHECK-NEXT: srli a3, a1, 3 +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill +; CHECK-NEXT: vslidedown.vx v24, v0, a3 +; CHECK-NEXT: sub a3, a2, a1 +; CHECK-NEXT: sltu a4, a2, a3 +; CHECK-NEXT: addi a4, a4, -1 +; CHECK-NEXT: and a3, a4, a3 +; CHECK-NEXT: vl8re64.v v0, (a0) +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a4, a0, 5 +; CHECK-NEXT: add a0, a4, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v0, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vsetvli zero, a3, e64, m8, ta, ma +; CHECK-NEXT: vmv1r.v v1, v24 +; CHECK-NEXT: vmv1r.v v0, v24 +; CHECK-NEXT: vmfeq.vv v26, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v26 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v24, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmerge.vvm v16, v16, v24, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmfeq.vv v17, v24, 
v24, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a0, a0, a3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v17 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v8, v24, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a2, a1, .LBB28_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: .LBB28_2: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 5 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl1r.v v1, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v25, v16, v16, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a1, a0, 5 +; CHECK-NEXT: add a0, a1, a0 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v24, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmfeq.vv v25, v8, v8, v0.t +; CHECK-NEXT: vmv1r.v v0, v25 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vfmin.vv v8, v8, v16, v0.t +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 42 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %v = call @llvm.vp.minimum.nxv16f64( %va, %vb, %m, i32 %evl) + ret %v +} + +define @vfmin_vv_nxv16f64_unmasked( %va, %vb, i32 zeroext %evl) { +; CHECK-LABEL: vfmin_vv_nxv16f64_unmasked: +; CHECK: # %bb.0: +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: li a3, 24 +; CHECK-NEXT: mul a1, a1, a3 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 24 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 4 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # Unknown-size Folded Spill +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a3, a1, 3 +; CHECK-NEXT: add a3, a0, a3 +; CHECK-NEXT: vl8re64.v v24, (a3) +; CHECK-NEXT: vl8re64.v v8, (a0) +; CHECK-NEXT: csrr a0, vlenb +; 
CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: sub a0, a2, a1 +; CHECK-NEXT: sltu a3, a2, a0 +; CHECK-NEXT: addi a3, a3, -1 +; CHECK-NEXT: and a0, a3, a0 +; CHECK-NEXT: vsetvli zero, a0, e64, m8, ta, ma +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: vmfeq.vv v1, v24, v24 +; CHECK-NEXT: vmerge.vvm v8, v16, v24, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v16, v24, v16, v0 +; CHECK-NEXT: vfmin.vv v8, v16, v8 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a0) # Unknown-size Folded Spill +; CHECK-NEXT: bltu a2, a1, .LBB29_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a2, a1 +; CHECK-NEXT: .LBB29_2: +; CHECK-NEXT: vsetvli zero, a2, e64, m8, ta, ma +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 4 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v0, v16, v16 +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 3 +; CHECK-NEXT: add a0, sp, a0 +; CHECK-NEXT: addi a0, a0, 16 +; CHECK-NEXT: vl8r.v v8, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: vmfeq.vv v1, v8, v8 +; CHECK-NEXT: vmerge.vvm v24, v16, v8, v0 +; CHECK-NEXT: vmv1r.v v0, v1 +; CHECK-NEXT: vmerge.vvm v8, v8, v16, v0 +; CHECK-NEXT: vfmin.vv v8, v8, v24 +; CHECK-NEXT: addi a0, sp, 16 +; CHECK-NEXT: vl8r.v v16, (a0) # Unknown-size Folded Reload +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: li a1, 24 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: add sp, sp, a0 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: ret + %head = insertelement poison, i1 true, i32 0 + %m = shufflevector %head, poison, zeroinitializer + %v = call @llvm.vp.minimum.nxv16f64( %va, %vb, %m, i32 %evl) + ret %v +} diff --git a/llvm/unittests/IR/VPIntrinsicTest.cpp b/llvm/unittests/IR/VPIntrinsicTest.cpp index a3bef3d42adb0..7a9d91cc76b34 100644 --- a/llvm/unittests/IR/VPIntrinsicTest.cpp +++ b/llvm/unittests/IR/VPIntrinsicTest.cpp @@ -48,8 +48,9 @@ class VPIntrinsicTest : public testing::Test { Str << " declare <8 x i32> @llvm.vp." << BinaryIntOpcode << ".v8i32(<8 x i32>, <8 x i32>, <8 x i1>, i32) "; - const char *BinaryFPOpcodes[] = {"fadd", "fsub", "fmul", "fdiv", - "frem", "minnum", "maxnum", "copysign"}; + const char *BinaryFPOpcodes[] = {"fadd", "fsub", "fmul", "fdiv", + "frem", "minnum", "maxnum", "minimum", + "maximum", "copysign"}; for (const char *BinaryFPOpcode : BinaryFPOpcodes) Str << " declare <8 x float> @llvm.vp." << BinaryFPOpcode << ".v8f32(<8 x float>, <8 x float>, <8 x i1>, i32) "; From 907f2a0927d94d72d59f27e411c595b95aa673e9 Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Tue, 23 Jan 2024 00:55:59 +0000 Subject: [PATCH 513/843] [HIP][Driver] Automatically include `hipstdpar` forwarding header (#78915) The forwarding header used by `hipstdpar` on AMDGPU targets is now packaged with `rocThrust`. This change augments the ROCm Driver component so that it can automatically pick up the packaged header iff the user hasn't overridden it via the dedicated flag.
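As a rough illustration, the new lookup order boils down to the following standalone sketch (the helper name, its signature, and the plain-string path handling are assumptions made for this example; the in-tree logic lives in RocmInstallationDetector::AddHIPIncludeArgs and operates on the clang driver's argument types):

    #include <filesystem>
    #include <optional>
    #include <string>

    // Pick the include path for the hipstdpar forwarding header. An explicit
    // --hipstdpar-path always wins; otherwise fall back to the copy that
    // rocThrust packages under <thrust include dir>/system/hip/hipstdpar.
    std::optional<std::string>
    pickHipStdParPath(const std::string &UserPathArg,
                      const std::string &ThrustIncludePath) {
      if (!UserPathArg.empty())
        return UserPathArg; // user override: behavior unchanged
      std::string Packaged = ThrustIncludePath + "/system/hip/hipstdpar";
      if (std::filesystem::exists(Packaged + "/hipstdpar_lib.hpp"))
        return Packaged; // packaged fallback, added by this patch
      return std::nullopt; // caller emits err_drv_no_hipstdpar_lib
    }

The explicit flag keeps working exactly as before; the packaged header is only consulted as a fallback.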
--- clang/lib/Driver/ToolChains/AMDGPU.cpp | 33 ++++++++++++++++---------- clang/test/Driver/hipstdpar.c | 3 ++- 2 files changed, 23 insertions(+), 13 deletions(-) diff --git a/clang/lib/Driver/ToolChains/AMDGPU.cpp b/clang/lib/Driver/ToolChains/AMDGPU.cpp index 56f06fc5fccb7..b3c9d5908654f 100644 --- a/clang/lib/Driver/ToolChains/AMDGPU.cpp +++ b/clang/lib/Driver/ToolChains/AMDGPU.cpp @@ -545,26 +545,35 @@ void RocmInstallationDetector::AddHIPIncludeArgs(const ArgList &DriverArgs, } const auto HandleHipStdPar = [=, &DriverArgs, &CC1Args]() { - if (!hasHIPStdParLibrary()) { - D.Diag(diag::err_drv_no_hipstdpar_lib); - return; - } - if (!HasRocThrustLibrary && - !D.getVFS().exists(getIncludePath() + "/thrust")) { + StringRef Inc = getIncludePath(); + auto &FS = D.getVFS(); + + if (!hasHIPStdParLibrary()) + if (!HIPStdParPathArg.empty() || + !FS.exists(Inc + "/thrust/system/hip/hipstdpar/hipstdpar_lib.hpp")) { + D.Diag(diag::err_drv_no_hipstdpar_lib); + return; + } + if (!HasRocThrustLibrary && !FS.exists(Inc + "/thrust")) { D.Diag(diag::err_drv_no_hipstdpar_thrust_lib); return; } - if (!HasRocPrimLibrary && - !D.getVFS().exists(getIncludePath() + "/rocprim")) { + if (!HasRocPrimLibrary && !FS.exists(Inc + "/rocprim")) { D.Diag(diag::err_drv_no_hipstdpar_prim_lib); return; } - const char *ThrustPath; if (HasRocThrustLibrary) ThrustPath = DriverArgs.MakeArgString(HIPRocThrustPathArg); else - ThrustPath = DriverArgs.MakeArgString(getIncludePath() + "/thrust"); + ThrustPath = DriverArgs.MakeArgString(Inc + "/thrust"); + + const char *HIPStdParPath; + if (hasHIPStdParLibrary()) + HIPStdParPath = DriverArgs.MakeArgString(HIPStdParPathArg); + else + HIPStdParPath = DriverArgs.MakeArgString(StringRef(ThrustPath) + + "/system/hip/hipstdpar"); const char *PrimPath; if (HasRocPrimLibrary) @@ -573,8 +582,8 @@ void RocmInstallationDetector::AddHIPIncludeArgs(const ArgList &DriverArgs, PrimPath = DriverArgs.MakeArgString(getIncludePath() + "/rocprim"); CC1Args.append({"-idirafter", ThrustPath, "-idirafter", PrimPath, - "-idirafter", DriverArgs.MakeArgString(HIPStdParPathArg), - "-include", "hipstdpar_lib.hpp"}); + "-idirafter", HIPStdParPath, "-include", + "hipstdpar_lib.hpp"}); }; if (DriverArgs.hasArg(options::OPT_nogpuinc)) { diff --git a/clang/test/Driver/hipstdpar.c b/clang/test/Driver/hipstdpar.c index 69c5b177d170c..2f48bf6b5cf1e 100644 --- a/clang/test/Driver/hipstdpar.c +++ b/clang/test/Driver/hipstdpar.c @@ -5,7 +5,8 @@ // XFAIL: target={{.*}}-scei{{.*}} // XFAIL: target={{.*}}-sie{{.*}} -// RUN: not %clang -### --hipstdpar -nogpulib -nogpuinc --compile %s 2>&1 | \ +// RUN: not %clang -### --hipstdpar --hipstdpar-path=/does/not/exist -nogpulib \ +// RUN: -nogpuinc --compile %s 2>&1 | \ // RUN: FileCheck --check-prefix=HIPSTDPAR-MISSING-LIB %s // RUN: %clang -### --hipstdpar --hipstdpar-path=%S/Inputs/hipstdpar \ // RUN: --hipstdpar-thrust-path=%S/Inputs/hipstdpar/thrust \ From fcff4582f01db2f5a99e3acf452aec9f2d8a126a Mon Sep 17 00:00:00 2001 From: wanglei Date: Tue, 23 Jan 2024 09:06:35 +0800 Subject: [PATCH 514/843] [LoongArch] Permit auto-vectorization using LSX/LASX with `auto-vec` feature (#78943) With enough codegen complete, we can now correctly report the size of vector registers for LSX/LASX, allowing auto vectorization (The `auto-vec` feature needs to be enabled simultaneously). As described, the `auto-vec` feature is an experimental one. 
This ensures that automatic vectorization is not enabled by default, since the information provided by the current `TTI` cannot yet yield additional benefits for automatic vectorization. --- llvm/lib/Target/LoongArch/LoongArch.td | 5 ++ .../lib/Target/LoongArch/LoongArchSubtarget.h | 2 + .../LoongArchTargetTransformInfo.cpp | 18 +++++ .../LoongArch/LoongArchTargetTransformInfo.h | 2 + .../LoopVectorize/LoongArch/defaults.ll | 65 +++++++++++++++++++ .../LoopVectorize/LoongArch/lit.local.cfg | 4 ++ 6 files changed, 96 insertions(+) create mode 100644 llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll create mode 100644 llvm/test/Transforms/LoopVectorize/LoongArch/lit.local.cfg diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td index 75b65fe69f262..4cffaf573b918 100644 --- a/llvm/lib/Target/LoongArch/LoongArch.td +++ b/llvm/lib/Target/LoongArch/LoongArch.td @@ -106,6 +106,11 @@ def FeatureRelax : SubtargetFeature<"relax", "HasLinkerRelax", "true", "Enable Linker relaxation">; +// Experimental auto vectorization +def FeatureAutoVec + : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true", + "Experimental auto vectorization">; + //===----------------------------------------------------------------------===// // Registers, instruction descriptions ... //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h index 5c173675cca4c..174e4cba83263 100644 --- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h +++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h @@ -44,6 +44,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { bool HasLaLocalWithAbs = false; bool HasUAL = false; bool HasLinkerRelax = false; + bool HasExpAutoVec = false; unsigned GRLen = 32; MVT GRLenVT = MVT::i32; LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown; @@ -102,6 +103,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo { bool hasLaLocalWithAbs() const { return HasLaLocalWithAbs; } bool hasUAL() const { return HasUAL; } bool hasLinkerRelax() const { return HasLinkerRelax; } + bool hasExpAutoVec() const { return HasExpAutoVec; } MVT getGRLenVT() const { return GRLenVT; } unsigned getGRLen() const { return GRLen; } LoongArchABI::ABI getTargetABI() const { return TargetABI; } diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp index a6de86eea1166..04349aa52b540 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp @@ -19,4 +19,22 @@ using namespace llvm; #define DEBUG_TYPE "loongarchtti" +TypeSize LoongArchTTIImpl::getRegisterBitWidth( + TargetTransformInfo::RegisterKind K) const { + switch (K) { + case TargetTransformInfo::RGK_Scalar: + return TypeSize::getFixed(ST->is64Bit() ? 64 : 32); + case TargetTransformInfo::RGK_FixedWidthVector: + if (ST->hasExtLASX() && ST->hasExpAutoVec()) + return TypeSize::getFixed(256); + if (ST->hasExtLSX() && ST->hasExpAutoVec()) + return TypeSize::getFixed(128); + return TypeSize::getFixed(0); + case TargetTransformInfo::RGK_ScalableVector: + return TypeSize::getScalable(0); + } + + llvm_unreachable("Unsupported register kind"); +} + // TODO: Implement more hooks to provide TTI machinery for LoongArch.
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h index 9e02f793ba8a9..d296c9ed576fb 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.h @@ -39,6 +39,8 @@ class LoongArchTTIImpl : public BasicTTIImplBase { : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), TLI(ST->getTargetLowering()) {} + TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const; + // TODO: Implement more hooks to provide TTI machinery for LoongArch. }; diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll new file mode 100644 index 0000000000000..7172f0907e77e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/LoongArch/defaults.ll @@ -0,0 +1,65 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt < %s -passes=loop-vectorize -mtriple loongarch64-linux-gnu -mattr=+lasx,+auto-vec -S | FileCheck %s + +;; This is a collection of tests whose only purpose is to show changes in the +;; default configuration. Please keep these tests minimal - if you're testing +;; functionality of some specific configuration, please place that in a +;; seperate test file with a hard coded configuration (even if that +;; configuration is the current default). + +target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n64-S128" +target triple = "loongarch64" + +define void @vector_add(ptr noalias nocapture %a, i64 %v) { +; CHECK-LABEL: define void @vector_add( +; CHECK-SAME: ptr noalias nocapture [[A:%.*]], i64 [[V:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[V]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[TMP1]], i32 0 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP2]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i64> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: store <4 x i64> [[TMP3]], ptr [[TMP2]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 1024, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[A]], i64 [[IV]] +; CHECK-NEXT: [[ELEM:%.*]] = load i64, ptr [[ARRAYIDX]], align 8 +; CHECK-NEXT: [[ADD:%.*]] = add i64 [[ELEM]], [[V]] +; CHECK-NEXT: store i64 [[ADD]], ptr [[ARRAYIDX]], align 
8 +; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[IV_NEXT]], 1024 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.body + +for.body: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %arrayidx = getelementptr inbounds i64, ptr %a, i64 %iv + %elem = load i64, ptr %arrayidx + %add = add i64 %elem, %v + store i64 %add, ptr %arrayidx + %iv.next = add nuw nsw i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.end, label %for.body + +for.end: + ret void +} diff --git a/llvm/test/Transforms/LoopVectorize/LoongArch/lit.local.cfg b/llvm/test/Transforms/LoopVectorize/LoongArch/lit.local.cfg new file mode 100644 index 0000000000000..9570af17fe5f1 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/LoongArch/lit.local.cfg @@ -0,0 +1,4 @@ +config.suffixes = [".ll"] + +if not "LoongArch" in config.root.targets: + config.unsupported = True From ea75542db99136a4612d8624ecff8581128cac21 Mon Sep 17 00:00:00 2001 From: Ben Shi <2283975856@qq.com> Date: Tue, 23 Jan 2024 09:19:38 +0800 Subject: [PATCH 515/843] [clang][analyzer] Support 'getdelim' and 'getline' in StreamChecker (#78693) --- clang/docs/ReleaseNotes.rst | 8 ++- .../StaticAnalyzer/Checkers/StreamChecker.cpp | 66 +++++++++++++++++++ .../Analysis/Inputs/system-header-simulator.h | 3 + clang/test/Analysis/stream-error.c | 44 +++++++++++++ clang/test/Analysis/taint-tester.c | 1 - 5 files changed, 118 insertions(+), 4 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 0b37b5fc29260..cb15dfb99bec9 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1400,9 +1400,11 @@ Improvements `0954dc3fb921 `_) - Improved the ``alpha.unix.Stream`` checker by modeling more functions - ``fputs``, ``fputc``, ``fgets``, ``fgetc``, ``fdopen``, ``ungetc``, ``fflush`` - and no not recognize alternative ``fopen`` and ``tmpfile`` implementations. - (`#76776 `_, + ``fputs``, ``fputc``, ``fgets``, ``fgetc``, ``fdopen``, ``ungetc``, ``fflush``, + ``getdelim``, ``getline`` and no not recognize alternative + ``fopen`` and ``tmpfile`` implementations. + (`#78693 `_, + `#76776 `_, `#74296 `_, `#73335 `_, `#72627 `_, diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index 84923732fe1b2..2afb9ece9f9fa 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -272,6 +272,12 @@ class StreamChecker : public Checker(Call.getOriginExpr()); + if (!CE) + return; + + const StreamState *OldSS = State->get(StreamSym); + if (!OldSS) + return; + + assertStreamStateOpened(OldSS); + + // Upon successful completion, the getline() and getdelim() functions shall + // return the number of bytes written into the buffer. + // If the end-of-file indicator for the stream is set, the function shall + // return -1. + // If an error occurs, the function shall return -1 and set 'errno'. + + // Add transition for the successful state. 
+ if (OldSS->ErrorState != ErrorFEof) { + NonLoc RetVal = makeRetVal(C, CE).castAs(); + ProgramStateRef StateNotFailed = + State->BindExpr(CE, C.getLocationContext(), RetVal); + SValBuilder &SVB = C.getSValBuilder(); + ASTContext &ASTC = C.getASTContext(); + auto Cond = + SVB.evalBinOp(State, BO_GE, RetVal, SVB.makeZeroVal(CE->getType()), + SVB.getConditionType()) + .getAs(); + if (!Cond) + return; + StateNotFailed = StateNotFailed->assume(*Cond, true); + if (!StateNotFailed) + return; + C.addTransition(StateNotFailed); + } + + // Add transition for the failed state. + // If a (non-EOF) error occurs, the resulting value of the file position + // indicator for the stream is indeterminate. + ProgramStateRef StateFailed = bindInt(-1, State, C, CE); + StreamErrorState NewES = + OldSS->ErrorState == ErrorFEof ? ErrorFEof : ErrorFEof | ErrorFError; + StreamState NewSS = StreamState::getOpened(Desc, NewES, !NewES.isFEof()); + StateFailed = StateFailed->set(StreamSym, NewSS); + if (OldSS->ErrorState != ErrorFEof) + C.addTransition(StateFailed, constructSetEofNoteTag(C, StreamSym)); + else + C.addTransition(StateFailed); +} + void StreamChecker::preFseek(const FnDescription *Desc, const CallEvent &Call, CheckerContext &C) const { ProgramStateRef State = C.getState(); diff --git a/clang/test/Analysis/Inputs/system-header-simulator.h b/clang/test/Analysis/Inputs/system-header-simulator.h index f8e3e546a7aed..96072741a8abc 100644 --- a/clang/test/Analysis/Inputs/system-header-simulator.h +++ b/clang/test/Analysis/Inputs/system-header-simulator.h @@ -14,6 +14,7 @@ typedef long long __int64_t; typedef __int64_t __darwin_off_t; typedef __darwin_off_t fpos_t; typedef int off_t; +typedef long ssize_t; typedef struct _FILE FILE; #define SEEK_SET 0 /* Seek from beginning of file. 
*/ @@ -55,6 +56,8 @@ char *fgets(char *restrict str, int count, FILE *restrict stream); int fputc(int ch, FILE *stream); int fputs(const char *restrict s, FILE *restrict stream); int ungetc(int c, FILE *stream); +ssize_t getdelim(char **restrict lineptr, size_t *restrict n, int delimiter, FILE *restrict stream); +ssize_t getline(char **restrict lineptr, size_t *restrict n, FILE *restrict stream); int fseek(FILE *__stream, long int __off, int __whence); int fseeko(FILE *__stream, off_t __off, int __whence); long int ftell(FILE *__stream); diff --git a/clang/test/Analysis/stream-error.c b/clang/test/Analysis/stream-error.c index 2cf46e1d4ad51..cd4b0093cfcb2 100644 --- a/clang/test/Analysis/stream-error.c +++ b/clang/test/Analysis/stream-error.c @@ -249,6 +249,50 @@ void error_ungetc() { ungetc('A', F); // expected-warning {{Stream might be already closed}} } +void error_getdelim(char *P, size_t Sz) { + FILE *F = tmpfile(); + if (!F) + return; + ssize_t Ret = getdelim(&P, &Sz, '\t', F); + if (Ret >= 0) { + clang_analyzer_eval(feof(F) || ferror(F)); // expected-warning {{FALSE}} + } else { + clang_analyzer_eval(Ret == -1); // expected-warning {{TRUE}} + clang_analyzer_eval(feof(F) || ferror(F)); // expected-warning {{TRUE}} + if (feof(F)) { + clang_analyzer_eval(ferror(F)); // expected-warning {{FALSE}} + getdelim(&P, &Sz, '\n', F); // expected-warning {{Read function called when stream is in EOF state}} + } else { + clang_analyzer_eval(ferror(F)); // expected-warning {{TRUE}} + getdelim(&P, &Sz, '\n', F); // expected-warning {{might be 'indeterminate'}} + } + } + fclose(F); + getdelim(&P, &Sz, '\n', F); // expected-warning {{Stream might be already closed}} +} + +void error_getline(char *P, size_t Sz) { + FILE *F = tmpfile(); + if (!F) + return; + ssize_t Ret = getline(&P, &Sz, F); + if (Ret >= 0) { + clang_analyzer_eval(feof(F) || ferror(F)); // expected-warning {{FALSE}} + } else { + clang_analyzer_eval(Ret == -1); // expected-warning {{TRUE}} + clang_analyzer_eval(feof(F) || ferror(F)); // expected-warning {{TRUE}} + if (feof(F)) { + clang_analyzer_eval(ferror(F)); // expected-warning {{FALSE}} + getline(&P, &Sz, F); // expected-warning {{Read function called when stream is in EOF state}} + } else { + clang_analyzer_eval(ferror(F)); // expected-warning {{TRUE}} + getline(&P, &Sz, F); // expected-warning {{might be 'indeterminate'}} + } + } + fclose(F); + getline(&P, &Sz, F); // expected-warning {{Stream might be already closed}} +} + void write_after_eof_is_allowed(void) { FILE *F = tmpfile(); if (!F) diff --git a/clang/test/Analysis/taint-tester.c b/clang/test/Analysis/taint-tester.c index ddfa91021825f..302349fb662dd 100644 --- a/clang/test/Analysis/taint-tester.c +++ b/clang/test/Analysis/taint-tester.c @@ -154,7 +154,6 @@ void getwTest(void) { int i = getw(stdin); // expected-warning + {{tainted}} } -typedef long ssize_t; ssize_t getline(char ** __restrict, size_t * __restrict, FILE * __restrict); int printf(const char * __restrict, ...); void free(void *ptr); From 45c84f8011d2be9a2378487607ef811f75b4bfca Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Tue, 23 Jan 2024 09:24:22 +0800 Subject: [PATCH 516/843] [clang][analyzer] Remove unused variable in StreamChecker.cpp (NFC) llvm-project/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp:1125:17: error: unused variable 'ASTC' [-Werror,-Wunused-variable] 1125 | ASTContext &ASTC = C.getASTContext(); | ^~~~ 1 error generated. 
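For reference, -Wunused-variable fires for any local variable that is declared but never referenced, even when its initializer has side effects, so a minimal reproducer (hypothetical, for illustration only) is:

    // Build with: clang++ -c -Werror -Wunused-variable repro.cpp
    int getValue();

    void f() {
      int V = getValue(); // error: unused variable 'V' [-Werror,-Wunused-variable]
    }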
--- clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp index 2afb9ece9f9fa..07727b339d967 100644 --- a/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/StreamChecker.cpp @@ -1122,7 +1122,6 @@ void StreamChecker::evalGetdelim(const FnDescription *Desc, ProgramStateRef StateNotFailed = State->BindExpr(CE, C.getLocationContext(), RetVal); SValBuilder &SVB = C.getSValBuilder(); - ASTContext &ASTC = C.getASTContext(); auto Cond = SVB.evalBinOp(State, BO_GE, RetVal, SVB.makeZeroVal(CE->getType()), SVB.getConditionType()) From 9a03d94a4db6b6d3b7cb1582e9e9cc239e737a30 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 22 Jan 2024 20:28:43 -0500 Subject: [PATCH 517/843] [gn] port 3ab8d2aac7bc Might want to set this to True (and add a few source files to builtins) at some point, but for now heal the bots. --- llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn index bfe98652d0baf..e82ec494993d0 100644 --- a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn @@ -64,6 +64,7 @@ write_cmake_config("lit_common_configured") { "SANITIZER_CAN_USE_CXXABI_PYBOOL=True", "SANITIZER_USE_STATIC_CXX_ABI_PYBOOL=False", "SANITIZER_USE_STATIC_LLVM_UNWINDER_PYBOOL=False", + "COMPILER_RT_HAS_AARCH64_SME=False", "COMPILER_RT_HAS_LLD_PYBOOL=True", "COMPILER_RT_HAS_GWP_ASAN_PYBOOL=False", "HAVE_RPC_XDR_H=0", From a0a16884ac678564787d3a939ad046b9b66bc3b9 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Mon, 22 Jan 2024 17:31:48 -0800 Subject: [PATCH 518/843] [Docs] Add anchors for llvm.minimum/maximum in LangRef.rst. NFC This was missed in 58cfd56356ed96690150245ca5bcf646d12edc7a --- llvm/docs/LangRef.rst | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index d58375c173545..178029aca98a9 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15492,6 +15492,8 @@ NaN, the intrinsic lowering is responsible for quieting the inputs to correctly return the non-NaN input (e.g. by using the equivalent of ``llvm.canonicalize``). +.. _i_minimum: + '``llvm.minimum.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -15530,6 +15532,8 @@ of the two arguments. -0.0 is considered to be less than +0.0 for this intrinsic. Note that these are the semantics specified in the draft of IEEE 754-2018. +.. 
_i_maximum: + '``llvm.maximum.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ From c0fe2b8963aad8ad247eeceffb8eb833ee63a0e7 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 05:55:28 -0700 Subject: [PATCH 519/843] Apply clang-tidy fixes for modernize-loop-convert in Transforms.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index 51be5c9d5c573..e4d2da7a1200e 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -482,10 +482,10 @@ FailureOr linalg::lowerUnPack(RewriterBase &rewriter, SmallVector PackedOperandsDimList::extractPackedDimsForOperand(int64_t operandPos) { SmallVector res; - for (int64_t i = 0, e = spec.size(); i < e; ++i) { - if (!spec[i].packedDimForEachOperand[operandPos].has_value()) + for (auto &i : spec) { + if (!i.packedDimForEachOperand[operandPos].has_value()) continue; - res.push_back(spec[i].packedDimForEachOperand[operandPos].value()); + res.push_back(i.packedDimForEachOperand[operandPos].value()); } return res; } @@ -493,10 +493,10 @@ PackedOperandsDimList::extractPackedDimsForOperand(int64_t operandPos) { SmallVector PackedOperandsDimList::extractPackSizesForOperand(int64_t operandPos) { SmallVector res; - for (int64_t i = 0, e = spec.size(); i < e; ++i) { - if (!spec[i].packedDimForEachOperand[operandPos].has_value()) + for (auto &i : spec) { + if (!i.packedDimForEachOperand[operandPos].has_value()) continue; - res.push_back(spec[i].packedSize); + res.push_back(i.packedSize); } return res; } From 3af5ab21b8da9015a5e336eb711e696781defcc9 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 05:56:14 -0700 Subject: [PATCH 520/843] Apply clang-tidy fixes for readability-identifier-naming in Transforms.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index e4d2da7a1200e..4df105af5bcd6 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -202,7 +202,7 @@ struct PackedOperandsDim { /// Helper struct to encode packing along all dimensions of a LinalgOp. struct PackedOperandsDimList { - void push_back(PackedOperandsDim &&packedOperandsDims) { + void pushBack(PackedOperandsDim &&packedOperandsDims) { spec.emplace_back(packedOperandsDims); } /// Return all the dims that have been packed for operand @ `operandPos`. 
@@ -539,7 +539,7 @@ FailureOr linalg::pack(RewriterBase &rewriter, if (failed(maybePackedDimForEachOperand)) return failure(); packedOperandsDims.packedDimForEachOperand = *maybePackedDimForEachOperand; - listOfPackedOperandsDim.push_back(std::move(packedOperandsDims)); + listOfPackedOperandsDim.pushBack(std::move(packedOperandsDims)); LLVM_DEBUG( DBGS() << "++++ After pack size #" << i << ": " << packedSizes[i] From 2e0909025ec010981f582d4e13b129e17329ee1b Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 05:58:12 -0700 Subject: [PATCH 521/843] Apply clang-tidy fixes for readability-simplify-boolean-expr in Vectorization.cpp (NFC) --- mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 0610f24ddaf47..0707625819d1a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2499,8 +2499,8 @@ struct Conv1DGenerator return; // (LHS has dimension NCW/NWC and RES has dimension NFW/NCW/NWF/NWC) OR // (non-channeled convolution -> LHS and RHS both have single dimensions). - if (!((lhsShapedType.getRank() == 3 && resShapedType.getRank() == 3) || - (lhsShapedType.getRank() == 1 && resShapedType.getRank() == 1))) + if ((lhsShapedType.getRank() != 3 || resShapedType.getRank() != 3) && + (lhsShapedType.getRank() != 1 || resShapedType.getRank() != 1)) return; Operation *reduceOp = matchLinalgReduction(linalgOp.getDpsInitOperand(0)); From acf2f24ac3288c1342edbeaba0fdd4e432b4ccb8 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 06:03:45 -0700 Subject: [PATCH 522/843] Apply clang-tidy fixes for llvm-else-after-return in LLVMDialect.cpp (NFC) --- mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index c2336f44b33e3..f4042a60541a6 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -1248,8 +1248,7 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) { LLVMFunctionType CallOp::getCalleeFunctionType() { if (getCalleeType()) return *getCalleeType(); - else - return getLLVMFuncType(getContext(), getResultTypes(), getArgOperands()); + return getLLVMFuncType(getContext(), getResultTypes(), getArgOperands()); } ///===---------------------------------------------------------------------===// @@ -1444,8 +1443,7 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) { LLVMFunctionType InvokeOp::getCalleeFunctionType() { if (getCalleeType()) return *getCalleeType(); - else - return getLLVMFuncType(getContext(), getResultTypes(), getArgOperands()); + return getLLVMFuncType(getContext(), getResultTypes(), getArgOperands()); } ///===----------------------------------------------------------------------===// From d4933b3241f463871cae55bbeec8563e7ffe03a2 Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Fri, 20 Oct 2023 06:19:31 -0700 Subject: [PATCH 523/843] Apply clang-tidy fixes for readability-identifier-naming in PolynomialApproximation.cpp (NFC) --- .../Transforms/PolynomialApproximation.cpp | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp 
b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp index 533b14e061e93..71e4e13103f51 100644 --- a/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp +++ b/mlir/lib/Dialect/Math/Transforms/PolynomialApproximation.cpp @@ -1016,20 +1016,20 @@ ExpApproximation::matchAndRewrite(math::ExpOp op, // 2^-126. // Constants. - Value cst_half = bcast(f32Cst(builder, 0.5f)); - Value cst_one = bcast(f32Cst(builder, 1.0f)); + Value cstHalf = bcast(f32Cst(builder, 0.5f)); + Value cstOne = bcast(f32Cst(builder, 1.0f)); // 1/log(2) - Value cst_log2ef = bcast(f32Cst(builder, 1.44269504088896341f)); + Value cstLog2ef = bcast(f32Cst(builder, 1.44269504088896341f)); - Value cst_exp_c1 = bcast(f32Cst(builder, -0.693359375f)); - Value cst_exp_c2 = bcast(f32Cst(builder, 2.12194440e-4f)); - Value cst_exp_p0 = bcast(f32Cst(builder, 1.9875691500E-4f)); - Value cst_exp_p1 = bcast(f32Cst(builder, 1.3981999507E-3f)); - Value cst_exp_p2 = bcast(f32Cst(builder, 8.3334519073E-3f)); - Value cst_exp_p3 = bcast(f32Cst(builder, 4.1665795894E-2f)); - Value cst_exp_p4 = bcast(f32Cst(builder, 1.6666665459E-1f)); - Value cst_exp_p5 = bcast(f32Cst(builder, 5.0000001201E-1f)); + Value cstExpC1 = bcast(f32Cst(builder, -0.693359375f)); + Value cstExpC2 = bcast(f32Cst(builder, 2.12194440e-4f)); + Value cstExpP0 = bcast(f32Cst(builder, 1.9875691500E-4f)); + Value cstExpP1 = bcast(f32Cst(builder, 1.3981999507E-3f)); + Value cstExpP2 = bcast(f32Cst(builder, 8.3334519073E-3f)); + Value cstExpP3 = bcast(f32Cst(builder, 4.1665795894E-2f)); + Value cstExpP4 = bcast(f32Cst(builder, 1.6666665459E-1f)); + Value cstExpP5 = bcast(f32Cst(builder, 5.0000001201E-1f)); // Our computations below aren't particularly sensitive to the exact choices // here, so we choose values a bit larger/smaller than @@ -1038,7 +1038,7 @@ ExpApproximation::matchAndRewrite(math::ExpOp op, // log(2^-126) = -87.337... Value x = op.getOperand(); x = clampWithNormals(builder, shape, x, -87.8f, 88.8f); - Value n = floor(fmla(x, cst_log2ef, cst_half)); + Value n = floor(fmla(x, cstLog2ef, cstHalf)); // When we eventually do the multiplication in e^a * 2^n, we need to handle // the case when n > 127, the max fp32 exponent (so 2^n == inf) but e^a < 1 @@ -1082,24 +1082,24 @@ ExpApproximation::matchAndRewrite(math::ExpOp op, n = clampWithNormals(builder, shape, n, -127.0f, 127.0f); // Computes x = x - n' * log(2), the value for `a` - x = fmla(cst_exp_c1, n, x); - x = fmla(cst_exp_c2, n, x); + x = fmla(cstExpC1, n, x); + x = fmla(cstExpC2, n, x); // Polynomial to compute z = e^a, accurate for a in (-0.5, 0.5). - Value z = fmla(x, cst_exp_p0, cst_exp_p1); - z = fmla(z, x, cst_exp_p2); - z = fmla(z, x, cst_exp_p3); - z = fmla(z, x, cst_exp_p4); - z = fmla(z, x, cst_exp_p5); + Value z = fmla(x, cstExpP0, cstExpP1); + z = fmla(z, x, cstExpP2); + z = fmla(z, x, cstExpP3); + z = fmla(z, x, cstExpP4); + z = fmla(z, x, cstExpP5); z = fmla(z, mul(x, x), x); - z = add(cst_one, z); + z = add(cstOne, z); // Convert n' to an i32. This is safe because we clamped it above. - auto i32_vec = broadcast(builder.getI32Type(), shape); - Value n_i32 = builder.create(i32_vec, n); + auto i32Vec = broadcast(builder.getI32Type(), shape); + Value nI32 = builder.create(i32Vec, n); // Creates the value 2^n' if -126 <= n' <= 127 and 0 if n' = -127. - Value pow2 = exp2I32(builder, n_i32); + Value pow2 = exp2I32(builder, nI32); // Return z * 2^n' if -126 <= n' <= 127 and 0 if n = -127. 
Value ret = mul(z, pow2); From 4db4d7f282d17634966cdfda8ccecee1bfcfe1ff Mon Sep 17 00:00:00 2001 From: Carl Ritson Date: Tue, 23 Jan 2024 10:49:26 +0900 Subject: [PATCH 524/843] [AMDGPU] SILowerSGPRSpills: do not update MRI reserve registers (#77888) VGPRs used for spilling do not require explicit reservation with MRI. freezeReservedRegs() executed before register allocation ensures these are placed in the reserve set. The only pass after SILowerSGPRSpills is SIPreAllocateWWMRegs, which explicitly tests for interference before register allocation and so should not reuse a WWM VGPR holding spill data. reserveReg also prevents calculation of correct liveness for physical registers, which could otherwise be used to extend SIPreAllocateWWMRegs. --- llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 9 --------- .../CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir | 2 +- 2 files changed, 1 insertion(+), 10 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 0ba7792ac436d..70ffb8ea0a622 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -332,7 +332,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { } bool MadeChange = false; - bool NewReservedRegs = false; bool SpilledToVirtVGPRLanes = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be @@ -370,7 +369,6 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { // intermediate spills is implemented. There is no such support // currently exist in the LLVM compiler. if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) { - NewReservedRegs = true; bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( MI, FI, nullptr, Indexes, LIS, true); if (!Spilled) @@ -442,12 +440,5 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { SaveBlocks.clear(); RestoreBlocks.clear(); - // Updated the reserved registers with any physical VGPRs added for SGPR - // spills. - if (NewReservedRegs) { - for (Register Reg : FuncInfo->getWWMReservedRegs()) - MRI.reserveReg(Reg, TRI); - } - return MadeChange; } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 1473e667f894c..4544b177351ee 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -mtriple=amdgcn-unknown-amdpal -mcpu=gfx1030 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs --stress-regalloc=2 -o - %s | FileCheck -check-prefix GCN %s +# RUN: llc -mtriple=amdgcn-unknown-amdpal -mcpu=gfx1030 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs --stress-regalloc=5 -o - %s | FileCheck -check-prefix GCN %s --- | define amdgpu_gfx [13 x i32] @test_main() #0 { From b8e708b9d39862c2b7595c02e7bdc4878a2d7186 Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Tue, 23 Jan 2024 09:57:05 +0800 Subject: [PATCH 525/843] [RISCV] Merge ADDI with X0 into base offset (#78940) If the offset is defined by `addi rd, x0, imm`, merge `imm` into the base offset.
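In other words, an ADDI whose rs1 is X0 simply materializes a constant, so its immediate can be folded into the symbol's offset; the updated crash() test below shows `li a0, 1` plus a 400-byte displacement collapsing into %hi(g+401)/%lo(g+401). A compact sketch of the fold, using plain integers in place of MachineInstr/MachineOperand (the function name and signature are illustrative only):

    #include <cstdint>

    // If the offset register was produced by `addi rd, x0, imm`, fold `imm`
    // straight into the global's base offset; otherwise callers fall back to
    // the existing LUI+ADDI(W) handling.
    bool foldAddiFromX0(int64_t &SymbolOffset, bool AddiSrcIsX0,
                        int64_t AddiImm) {
      if (!AddiSrcIsX0)
        return false;
      SymbolOffset += AddiImm; // e.g. %hi(g)/%lo(g) + 401 -> %hi(g+401)
      return true;
    }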
--- .../lib/Target/RISCV/RISCVMergeBaseOffset.cpp | 13 +++++++-- .../test/CodeGen/RISCV/fold-addi-loadstore.ll | 28 ++++++------------- 2 files changed, 18 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp index 313f7f396d4d4..410989177a8b9 100644 --- a/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp +++ b/llvm/lib/Target/RISCV/RISCVMergeBaseOffset.cpp @@ -193,9 +193,16 @@ bool RISCVMergeBaseOffsetOpt::foldLargeOffset(MachineInstr &Hi, if (AddiImmOp.getTargetFlags() != RISCVII::MO_None) return false; Register AddiReg = OffsetTail.getOperand(1).getReg(); - if (!AddiReg.isVirtual()) - return false; int64_t OffLo = AddiImmOp.getImm(); + + // Handle rs1 of ADDI is X0. + if (AddiReg == RISCV::X0) { + LLVM_DEBUG(dbgs() << " Offset Instrs: " << OffsetTail); + foldOffset(Hi, Lo, TailAdd, OffLo); + OffsetTail.eraseFromParent(); + return true; + } + MachineInstr &OffsetLui = *MRI->getVRegDef(AddiReg); MachineOperand &LuiImmOp = OffsetLui.getOperand(1); if (OffsetLui.getOpcode() != RISCV::LUI || @@ -206,7 +213,7 @@ bool RISCVMergeBaseOffsetOpt::foldLargeOffset(MachineInstr &Hi, Offset += OffLo; // RV32 ignores the upper 32 bits. ADDIW sign extends the result. if (!ST->is64Bit() || OffsetTail.getOpcode() == RISCV::ADDIW) - Offset = SignExtend64<32>(Offset); + Offset = SignExtend64<32>(Offset); // We can only fold simm32 offsets. if (!isInt<32>(Offset)) return false; diff --git a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll index 74652cbc73b60..91e73992bdfa3 100644 --- a/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll +++ b/llvm/test/CodeGen/RISCV/fold-addi-loadstore.ll @@ -971,11 +971,8 @@ declare void @f(ptr) define i32 @crash() { ; RV32I-LABEL: crash: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: li a0, 1 -; RV32I-NEXT: lui a1, %hi(g) -; RV32I-NEXT: addi a1, a1, %lo(g) -; RV32I-NEXT: add a0, a1, a0 -; RV32I-NEXT: lbu a0, 400(a0) +; RV32I-NEXT: lui a0, %hi(g+401) +; RV32I-NEXT: lbu a0, %lo(g+401)(a0) ; RV32I-NEXT: seqz a0, a0 ; RV32I-NEXT: sw a0, 0(zero) ; RV32I-NEXT: li a0, 0 @@ -983,12 +980,9 @@ define i32 @crash() { ; ; RV32I-MEDIUM-LABEL: crash: ; RV32I-MEDIUM: # %bb.0: # %entry -; RV32I-MEDIUM-NEXT: li a0, 1 ; RV32I-MEDIUM-NEXT: .Lpcrel_hi14: -; RV32I-MEDIUM-NEXT: auipc a1, %pcrel_hi(g) -; RV32I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi14) -; RV32I-MEDIUM-NEXT: add a0, a1, a0 -; RV32I-MEDIUM-NEXT: lbu a0, 400(a0) +; RV32I-MEDIUM-NEXT: auipc a0, %pcrel_hi(g+401) +; RV32I-MEDIUM-NEXT: lbu a0, %pcrel_lo(.Lpcrel_hi14)(a0) ; RV32I-MEDIUM-NEXT: seqz a0, a0 ; RV32I-MEDIUM-NEXT: sw a0, 0(zero) ; RV32I-MEDIUM-NEXT: li a0, 0 @@ -996,11 +990,8 @@ define i32 @crash() { ; ; RV64I-LABEL: crash: ; RV64I: # %bb.0: # %entry -; RV64I-NEXT: li a0, 1 -; RV64I-NEXT: lui a1, %hi(g) -; RV64I-NEXT: addi a1, a1, %lo(g) -; RV64I-NEXT: add a0, a1, a0 -; RV64I-NEXT: lbu a0, 400(a0) +; RV64I-NEXT: lui a0, %hi(g+401) +; RV64I-NEXT: lbu a0, %lo(g+401)(a0) ; RV64I-NEXT: seqz a0, a0 ; RV64I-NEXT: sw a0, 0(zero) ; RV64I-NEXT: li a0, 0 @@ -1008,12 +999,9 @@ define i32 @crash() { ; ; RV64I-MEDIUM-LABEL: crash: ; RV64I-MEDIUM: # %bb.0: # %entry -; RV64I-MEDIUM-NEXT: li a0, 1 ; RV64I-MEDIUM-NEXT: .Lpcrel_hi14: -; RV64I-MEDIUM-NEXT: auipc a1, %pcrel_hi(g) -; RV64I-MEDIUM-NEXT: addi a1, a1, %pcrel_lo(.Lpcrel_hi14) -; RV64I-MEDIUM-NEXT: add a0, a1, a0 -; RV64I-MEDIUM-NEXT: lbu a0, 400(a0) +; RV64I-MEDIUM-NEXT: auipc a0, %pcrel_hi(g+401) +; RV64I-MEDIUM-NEXT: lbu a0, %pcrel_lo(.Lpcrel_hi14)(a0) ; 
RV64I-MEDIUM-NEXT: seqz a0, a0 ; RV64I-MEDIUM-NEXT: sw a0, 0(zero) ; RV64I-MEDIUM-NEXT: li a0, 0 From e5ca202ef8870f3b46cc19a7a62624e6908be9a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 23 Jan 2024 02:59:30 +0100 Subject: [PATCH 526/843] [JITLink][AArch32] Multi-stub support for armv7/thumbv7 (#78371) We want to emit stubs that match the instruction set state of the relocation site. This is important for branches that have no built-in switch for the instruction set state. It's the case for Jump24 relocations. Relocations on instructions that support switching on the fly will be rewritten in a relaxation step in the future. This affects Call relocations on `BL`/`BLX` instructions. In this patch, the StubManager gains a second stub symbol slot for each target and selects which one to use based on the relocation type. For testing, we select the appropriate slot with a stub-kind filter, i.e. `arm` or `thumb`. With that we can implement Armv7 stubs and test that we can have both kinds of stubs for a single external symbol. --- .../llvm/ExecutionEngine/JITLink/aarch32.h | 59 ++------- llvm/lib/ExecutionEngine/JITLink/aarch32.cpp | 125 ++++++++++++++++-- .../JITLink/AArch32/ELF_stubs_arm.s | 53 ++++++++ .../JITLink/AArch32/ELF_stubs_multi.s | 50 +++++++ llvm/tools/llvm-jitlink/llvm-jitlink.cpp | 46 ++++++- 5 files changed, 274 insertions(+), 59 deletions(-) create mode 100644 llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_arm.s create mode 100644 llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_multi.s diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h index 081f77a85e182..ed53fa409ade8 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h @@ -341,64 +341,31 @@ class GOTBuilder : public TableManager { Section *GOTSection = nullptr; }; -/// Stubs builder for v7 emits non-position-independent Thumb stubs. -/// -/// Right now we only have one default stub kind, but we want to extend this -/// and allow creation of specific kinds in the future (e.g. branch range -/// extension or interworking). -/// -/// Let's keep it simple for the moment and not wire this through a GOT. -/// -class StubsManager_v7 : public TableManager { +/// Stubs builder for v7 emits non-position-independent Arm and Thumb stubs. +class StubsManager_v7 { public: StubsManager_v7() = default; /// Name of the object file section that will contain all our stubs. static StringRef getSectionName() { - return "__llvm_jitlink_aarch32_STUBS_Thumbv7"; + return "__llvm_jitlink_aarch32_STUBS_v7"; } /// Implements link-graph traversal via visitExistingEdges(). - bool visitEdge(LinkGraph &G, Block *B, Edge &E) { - if (E.getTarget().isDefined()) - return false; - - switch (E.getKind()) { - case Thumb_Call: - case Thumb_Jump24: { - DEBUG_WITH_TYPE("jitlink", { - dbgs() << " Fixing " << G.getEdgeKindName(E.getKind()) << " edge at " - << B->getFixupAddress(E) << " (" << B->getAddress() << " + " - << formatv("{0:x}", E.getOffset()) << ")\n"; - }); - E.setTarget(this->getEntryForTarget(G, E.getTarget())); - return true; - } - } - return false; - } - - /// Create a branch range extension stub with Thumb encoding for v7 CPUs. - Symbol &createEntry(LinkGraph &G, Symbol &Target); + bool visitEdge(LinkGraph &G, Block *B, Edge &E); private: - /// Create a new node in the link-graph for the given stub template. 
- template - Block &addStub(LinkGraph &G, const uint8_t (&Code)[Size], - uint64_t Alignment) { - ArrayRef Template(reinterpret_cast(Code), Size); - return G.createContentBlock(getStubsSection(G), Template, - orc::ExecutorAddr(), Alignment, 0); - } - - /// Get or create the object file section that will contain all our stubs. - Section &getStubsSection(LinkGraph &G) { - if (!StubsSection) - StubsSection = &G.createSection(getSectionName(), - orc::MemProt::Read | orc::MemProt::Exec); - return *StubsSection; + // Two slots per external: Arm and Thumb + using StubMapEntry = std::tuple; + + Symbol *&getStubSymbolSlot(StringRef Name, bool Thumb) { + StubMapEntry &Stubs = StubMap.try_emplace(Name).first->second; + if (Thumb) + return std::get<1>(Stubs); + return std::get<0>(Stubs); } + DenseMap StubMap; Section *StubsSection = nullptr; }; diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp index 1797a0068cd7c..9508cde07b42a 100644 --- a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp @@ -15,6 +15,7 @@ #include "llvm/ADT/StringExtras.h" #include "llvm/BinaryFormat/ELF.h" #include "llvm/ExecutionEngine/JITLink/JITLink.h" +#include "llvm/ExecutionEngine/Orc/Shared/MemoryFlags.h" #include "llvm/Object/ELFObjectFile.h" #include "llvm/Support/Endian.h" #include "llvm/Support/ManagedStatic.h" @@ -724,27 +725,127 @@ bool GOTBuilder::visitEdge(LinkGraph &G, Block *B, Edge &E) { return true; } +const uint8_t Armv7ABS[] = { + 0x00, 0xc0, 0x00, 0xe3, // movw r12, #0x0000 ; lower 16-bit + 0x00, 0xc0, 0x40, 0xe3, // movt r12, #0x0000 ; upper 16-bit + 0x1c, 0xff, 0x2f, 0xe1 // bx r12 +}; + const uint8_t Thumbv7ABS[] = { 0x40, 0xf2, 0x00, 0x0c, // movw r12, #0x0000 ; lower 16-bit 0xc0, 0xf2, 0x00, 0x0c, // movt r12, #0x0000 ; upper 16-bit 0x60, 0x47 // bx r12 }; -Symbol &StubsManager_v7::createEntry(LinkGraph &G, Symbol &Target) { +/// Create a new node in the link-graph for the given stub template. 
+template +static Block &allocStub(LinkGraph &G, Section &S, const uint8_t (&Code)[Size]) { constexpr uint64_t Alignment = 4; - Block &B = addStub(G, Thumbv7ABS, Alignment); - LLVM_DEBUG({ - const char *StubPtr = B.getContent().data(); - HalfWords Reg12 = encodeRegMovtT1MovwT3(12); - assert(checkRegister(StubPtr, Reg12) && - checkRegister(StubPtr + 4, Reg12) && - "Linker generated stubs may only corrupt register r12 (IP)"); - }); + ArrayRef Template(reinterpret_cast(Code), Size); + return G.createContentBlock(S, Template, orc::ExecutorAddr(), Alignment, 0); +} + +static Block &createStubThumbv7(LinkGraph &G, Section &S, Symbol &Target) { + Block &B = allocStub(G, S, Thumbv7ABS); B.addEdge(Thumb_MovwAbsNC, 0, Target, 0); B.addEdge(Thumb_MovtAbs, 4, Target, 0); - Symbol &Stub = G.addAnonymousSymbol(B, 0, B.getSize(), true, false); - Stub.setTargetFlags(ThumbSymbol); - return Stub; + + [[maybe_unused]] const char *StubPtr = B.getContent().data(); + [[maybe_unused]] HalfWords Reg12 = encodeRegMovtT1MovwT3(12); + assert(checkRegister(StubPtr, Reg12) && + checkRegister(StubPtr + 4, Reg12) && + "Linker generated stubs may only corrupt register r12 (IP)"); + return B; +} + +static Block &createStubArmv7(LinkGraph &G, Section &S, Symbol &Target) { + Block &B = allocStub(G, S, Armv7ABS); + B.addEdge(Arm_MovwAbsNC, 0, Target, 0); + B.addEdge(Arm_MovtAbs, 4, Target, 0); + + [[maybe_unused]] const char *StubPtr = B.getContent().data(); + [[maybe_unused]] uint32_t Reg12 = encodeRegMovtA1MovwA2(12); + assert(checkRegister(StubPtr, Reg12) && + checkRegister(StubPtr + 4, Reg12) && + "Linker generated stubs may only corrupt register r12 (IP)"); + return B; +} + +static bool needsStub(const Edge &E) { + Symbol &Target = E.getTarget(); + + // Create stubs for external branch targets. + if (!Target.isDefined()) { + switch (E.getKind()) { + case Arm_Call: + case Arm_Jump24: + case Thumb_Call: + case Thumb_Jump24: + return true; + default: + return false; + } + } + + // For local targets, create interworking stubs if we switch Arm/Thumb with an + // instruction that cannot switch the instruction set state natively. + bool TargetIsThumb = Target.getTargetFlags() & ThumbSymbol; + switch (E.getKind()) { + case Arm_Jump24: + return TargetIsThumb; // Branch to Thumb needs interworking stub + case Thumb_Jump24: + return !TargetIsThumb; // Branch to Arm needs interworking stub + default: + break; + } + + return false; +} + +bool StubsManager_v7::visitEdge(LinkGraph &G, Block *B, Edge &E) { + if (!needsStub(E)) + return false; + + // Stub Arm/Thumb follows instruction set state at relocation site. + // TODO: We may reduce them at relaxation time and reuse freed slots. + bool MakeThumb = (E.getKind() > LastArmRelocation); + LLVM_DEBUG(dbgs() << " Preparing " << (MakeThumb ? "Thumb" : "Arm") + << " stub for " << G.getEdgeKindName(E.getKind()) + << " edge at " << B->getFixupAddress(E) << " (" + << B->getAddress() << " + " + << formatv("{0:x}", E.getOffset()) << ")\n"); + + Symbol &Target = E.getTarget(); + assert(Target.hasName() && "Edge cannot point to anonymous target"); + Symbol *&StubSymbol = getStubSymbolSlot(Target.getName(), MakeThumb); + + if (!StubSymbol) { + if (!StubsSection) + StubsSection = &G.createSection(getSectionName(), + orc::MemProt::Read | orc::MemProt::Exec); + Block &B = MakeThumb ? 
createStubThumbv7(G, *StubsSection, Target) + : createStubArmv7(G, *StubsSection, Target); + StubSymbol = &G.addAnonymousSymbol(B, 0, B.getSize(), true, false); + if (MakeThumb) + StubSymbol->setTargetFlags(ThumbSymbol); + + LLVM_DEBUG({ + dbgs() << " Created " << (MakeThumb ? "Thumb" : "Arm") << " entry for " + << Target.getName() << " in " << StubsSection->getName() << ": " + << *StubSymbol << "\n"; + }); + } + + assert(MakeThumb == (StubSymbol->getTargetFlags() & ThumbSymbol) && + "Instruction set states of stub and relocation site should be equal"); + LLVM_DEBUG({ + dbgs() << " Using " << (MakeThumb ? "Thumb" : "Arm") << " entry " + << *StubSymbol << " in " + << StubSymbol->getBlock().getSection().getName() << "\n"; + }); + + E.setTarget(*StubSymbol); + return true; } const char *getEdgeKindName(Edge::Kind K) { diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_arm.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_arm.s new file mode 100644 index 0000000000000..fb2e0eb2c0bf2 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_arm.s @@ -0,0 +1,53 @@ +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=armv7-linux-gnueabi -arm-add-build-attributes \ +# RUN: -filetype=obj -o %t/out.o %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 \ +# RUN: -slab-allocate 10Kb -slab-page-size 4096 \ +# RUN: -abs ext=0x76bbe880 \ +# RUN: -check %s %t/out.o + + .text + .syntax unified + +# Check that calls/jumps to external functions trigger the generation of +# branch-range extension stubs. These stubs don't follow the default PLT model +# where the branch-target address is loaded from a GOT entry. Instead, they +# hard-code it in the immediate field. + +# The external function ext will return to the caller directly. +# jitlink-check: decode_operand(test_arm_jump, 0) = stub_addr(out.o, ext) - (test_arm_jump + 8) + .globl test_arm_jump + .type test_arm_jump,%function + .p2align 2 +test_arm_jump: + b ext + .size test_arm_jump, .-test_arm_jump + +# The branch-with-link sets the LR register so that the external function ext +# returns to us. We have to save the register (push) and return to main manually +# (pop). This adds the +4 offset for the bl instruction we decode: +# jitlink-check: decode_operand(test_arm_call + 4, 0) = stub_addr(out.o, ext) - (test_arm_call + 8) - 4 + .globl test_arm_call + .type test_arm_call,%function + .p2align 2 +test_arm_call: + push {lr} + bl ext + pop {pc} + .size test_arm_call, .-test_arm_call + +# This test is executable with both, Arm and Thumb `ext` functions. It only has +# to return with `bx lr`. 
For example: +# > echo "void ext() {}" | clang -target armv7-linux-gnueabihf -o ext-arm.o -c -xc - +# > llvm-jitlink ext-arm.o out.o +# + .globl main + .type main,%function + .p2align 2 +main: + push {lr} + bl test_arm_call + bl test_arm_jump + movw r0, #0 + pop {pc} + .size main, .-main diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_multi.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_multi.s new file mode 100644 index 0000000000000..d575f114dcba1 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_multi.s @@ -0,0 +1,50 @@ +# RUN: rm -rf %t && mkdir -p %t +# RUN: llvm-mc -triple=armv7-linux-gnueabi -arm-add-build-attributes \ +# RUN: -filetype=obj -o %t/out.o %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 \ +# RUN: -slab-allocate=10Kb -slab-page-size=4096 \ +# RUN: -abs ext=0x76bbe880 -check %s %t/out.o + + .text + .syntax unified + +# Check that a single external symbol can have multiple stubs. We access them +# with the extra stub-index argument to stub_addr(). Stubs are sorted by +# ascending size (because the default memory manager lays out blocks by size). + +# Thumb relocation site emits thumb stub +# jitlink-check: decode_operand(test_stub_thumb, 0) = stub_addr(out.o, ext, thumb) - (test_stub_thumb + 4) + .globl test_stub_thumb + .type test_stub_thumb,%function + .p2align 1 + .code 16 + .thumb_func +test_stub_thumb: + b ext + .size test_stub_thumb, .-test_stub_thumb + +# Arm relocation site emits arm stub +# jitlink-check: decode_operand(test_stub_arm, 0) = stub_addr(out.o, ext, arm) - (test_stub_arm + 8) + .globl test_stub_arm + .type test_stub_arm,%function + .p2align 2 + .code 32 +test_stub_arm: + b ext + .size test_stub_arm, .-test_stub_arm + +# This test is executable with both, Arm and Thumb `ext` functions. It only has +# to return (directly to main) with `bx lr`. 
For example: +# > echo "void ext() {}" | clang -target armv7-linux-gnueabihf -o ext-arm.o -c -xc - +# > llvm-jitlink ext-arm.o out.o +# + .globl main + .type main,%function + .p2align 2 +main: + push {lr} + bl test_stub_arm + bl test_stub_thumb + movw r0, #0 + pop {pc} + .size main, .-main diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 7e213777a6727..b2a133860197d 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1265,8 +1265,52 @@ Session::findSectionInfo(StringRef FileName, StringRef SectionName) { return SecInfoItr->second; } +class MemoryMatcher { +public: + MemoryMatcher(ArrayRef Content) + : Pos(Content.data()), End(Pos + Content.size()) {} + + template bool matchMask(MaskType Mask) { + if (Mask == (Mask & *reinterpret_cast(Pos))) { + Pos += sizeof(MaskType); + return true; + } + return false; + } + + template bool matchEqual(ValueType Value) { + if (Value == *reinterpret_cast(Pos)) { + Pos += sizeof(ValueType); + return true; + } + return false; + } + + bool done() const { return Pos == End; } + +private: + const char *Pos; + const char *End; +}; + static StringRef detectStubKind(const Session::MemoryRegionInfo &Stub) { - // Implement acutal stub kind detection + constexpr uint32_t Armv7MovWTle = 0xe300c000; + constexpr uint32_t Armv7BxR12le = 0xe12fff1c; + constexpr uint32_t Thumbv7MovWTle = 0x0c00f240; + constexpr uint16_t Thumbv7BxR12le = 0x4760; + + MemoryMatcher M(Stub.getContent()); + if (M.matchMask(Thumbv7MovWTle)) { + if (M.matchMask(Thumbv7MovWTle)) + if (M.matchEqual(Thumbv7BxR12le)) + if (M.done()) + return "thumbv7_abs_le"; + } else if (M.matchMask(Armv7MovWTle)) { + if (M.matchMask(Armv7MovWTle)) + if (M.matchEqual(Armv7BxR12le)) + if (M.done()) + return "armv7_abs_le"; + } return ""; } From 8c680451a5eff02dd7e5004b69e05b3744f4e1aa Mon Sep 17 00:00:00 2001 From: itrofimow Date: Tue, 23 Jan 2024 06:03:01 +0400 Subject: [PATCH 527/843] [libc++] Fix linking for platforms that don't implement std::exception_ptr (#79040) This patch fixes linkage for platforms that don't implement std::exception_ptr, as such setup was overlooked in #65534. --- .../src/support/runtime/exception_pointer_unimplemented.ipp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp index de0605d7774b8..e12b0caf419d2 100644 --- a/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp +++ b/libcxx/src/support/runtime/exception_pointer_unimplemented.ipp @@ -30,6 +30,12 @@ exception_ptr& exception_ptr::operator=(const exception_ptr& other) noexcept { ::abort(); } +exception_ptr exception_ptr::__from_native_exception_pointer(void *__e) noexcept { +#warning exception_ptr not yet implemented + fprintf(stderr, "exception_ptr not yet implemented\n"); + ::abort(); +} + nested_exception::nested_exception() noexcept : __ptr_(current_exception()) {} #if !defined(__GLIBCXX__) From 6082478e1a5bde8ac03ac7220b733313a5417a13 Mon Sep 17 00:00:00 2001 From: Konstantin Varlamov Date: Mon, 22 Jan 2024 18:12:58 -0800 Subject: [PATCH 528/843] [libc++][hardening] Classify assertions related to leaks and syscalls. (#77164) Introduce two new categories: - `_LIBCPP_ASSERT_VALID_DEALLOCATION`; - `_LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL`. 
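As a rough sketch of what the first category is meant to classify, consider the following deliberately broken program (hypothetical example; whether it actually traps depends on the hardening mode the library is built and used with):

    #include <cstddef>
    #include <memory_resource>

    int main() {
      std::pmr::polymorphic_allocator<int> Alloc; // uses the default resource
      int *P = Alloc.allocate(1);
      // Deallocating with a size that cannot match any allocation is an
      // invalid deallocation; a hardened libc++ can now report it under
      // _LIBCPP_ASSERT_VALID_DEALLOCATION rather than an uncategorized assert.
      Alloc.deallocate(P, static_cast<std::size_t>(-1));
    }

The second category covers unexpected failures reported by APIs external to the library, e.g. the pthreads calls underlying std::mutex, as the src/mutex.cpp changes below illustrate.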
---
 libcxx/include/__config                       | 19 ++++++++++++++++-
 libcxx/include/__coroutine/coroutine_handle.h | 16 +++++++-------
 .../__memory_resource/polymorphic_allocator.h |  5 ++++-
 libcxx/src/filesystem/operations.cpp          | 21 ++++++++++++++++---
 libcxx/src/memory_resource.cpp                |  3 ++-
 libcxx/src/mutex.cpp                          |  8 ++++---
 ...te.pass.cpp => assert.deallocate.pass.cpp} |  5 ++++-
 7 files changed, 59 insertions(+), 18 deletions(-)
 rename libcxx/test/libcxx/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/{debug.deallocate.pass.cpp => assert.deallocate.pass.cpp} (80%)

diff --git a/libcxx/include/__config b/libcxx/include/__config
index 9a64cdb489119..f0275d0256d27 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -291,6 +291,15 @@
// - `_LIBCPP_ASSERT_NON_OVERLAPPING_RANGES` -- for functions that take several ranges as arguments, checks that the
// given ranges do not overlap.
//
+// - `_LIBCPP_ASSERT_VALID_DEALLOCATION` -- checks that an attempt to deallocate memory is valid (e.g. the given object
+// was allocated by the given allocator). Violating this category typically results in a memory leak.
+//
+// - `_LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL` -- checks that a call to an external API doesn't fail in
+// an unexpected manner. This includes triggering documented cases of undefined behavior in an external library (like
+// attempting to unlock an unlocked mutex in pthreads). Any API external to the library falls under this category
+// (from system calls to compiler intrinsics). We generally don't expect these failures to compromise memory safety or
+// otherwise create an immediate security issue.
+//
// - `_LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR` -- checks any operations that exchange nodes between containers to make sure
// the containers have compatible allocators.
//
@@ -345,8 +354,10 @@ _LIBCPP_HARDENING_MODE_DEBUG
// Overlapping ranges will make algorithms produce incorrect results but don't directly lead to a security
// vulnerability.
# define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSUME(expression) -# define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSUME(expression) +# define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message) _LIBCPP_ASSUME(expression) +# define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSUME(expression) +# define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSUME(expression) @@ -360,6 +371,8 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_NON_NULL(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSERT(expression, message) @@ -376,6 +389,8 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_NON_NULL(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message) _LIBCPP_ASSERT(expression, message) +# define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSERT(expression, message) # define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSERT(expression, message) @@ -391,6 +406,8 @@ _LIBCPP_HARDENING_MODE_DEBUG # define _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_NON_NULL(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_NON_OVERLAPPING_RANGES(expression, message) _LIBCPP_ASSUME(expression) +# define _LIBCPP_ASSERT_VALID_DEALLOCATION(expression, message) _LIBCPP_ASSUME(expression) +# define _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSUME(expression) # define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSUME(expression) diff --git a/libcxx/include/__coroutine/coroutine_handle.h b/libcxx/include/__coroutine/coroutine_handle.h index 54bfe5b44f4c6..4557a6643c239 100644 --- a/libcxx/include/__coroutine/coroutine_handle.h +++ b/libcxx/include/__coroutine/coroutine_handle.h @@ -55,7 
+55,7 @@ struct _LIBCPP_TEMPLATE_VIS coroutine_handle { _LIBCPP_HIDE_FROM_ABI constexpr explicit operator bool() const noexcept { return __handle_ != nullptr; } _LIBCPP_HIDE_FROM_ABI bool done() const { - _LIBCPP_ASSERT_UNCATEGORIZED(__is_suspended(), "done() can be called only on suspended coroutines"); + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(__is_suspended(), "done() can be called only on suspended coroutines"); return __builtin_coro_done(__handle_); } @@ -63,13 +63,13 @@ struct _LIBCPP_TEMPLATE_VIS coroutine_handle { _LIBCPP_HIDE_FROM_ABI void operator()() const { resume(); } _LIBCPP_HIDE_FROM_ABI void resume() const { - _LIBCPP_ASSERT_UNCATEGORIZED(__is_suspended(), "resume() can be called only on suspended coroutines"); - _LIBCPP_ASSERT_UNCATEGORIZED(!done(), "resume() has undefined behavior when the coroutine is done"); + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(__is_suspended(), "resume() can be called only on suspended coroutines"); + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(!done(), "resume() has undefined behavior when the coroutine is done"); __builtin_coro_resume(__handle_); } _LIBCPP_HIDE_FROM_ABI void destroy() const { - _LIBCPP_ASSERT_UNCATEGORIZED(__is_suspended(), "destroy() can be called only on suspended coroutines"); + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(__is_suspended(), "destroy() can be called only on suspended coroutines"); __builtin_coro_destroy(__handle_); } @@ -130,7 +130,7 @@ struct _LIBCPP_TEMPLATE_VIS coroutine_handle { _LIBCPP_HIDE_FROM_ABI constexpr explicit operator bool() const noexcept { return __handle_ != nullptr; } _LIBCPP_HIDE_FROM_ABI bool done() const { - _LIBCPP_ASSERT_UNCATEGORIZED(__is_suspended(), "done() can be called only on suspended coroutines"); + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(__is_suspended(), "done() can be called only on suspended coroutines"); return __builtin_coro_done(__handle_); } @@ -138,13 +138,13 @@ struct _LIBCPP_TEMPLATE_VIS coroutine_handle { _LIBCPP_HIDE_FROM_ABI void operator()() const { resume(); } _LIBCPP_HIDE_FROM_ABI void resume() const { - _LIBCPP_ASSERT_UNCATEGORIZED(__is_suspended(), "resume() can be called only on suspended coroutines"); - _LIBCPP_ASSERT_UNCATEGORIZED(!done(), "resume() has undefined behavior when the coroutine is done"); + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(__is_suspended(), "resume() can be called only on suspended coroutines"); + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(!done(), "resume() has undefined behavior when the coroutine is done"); __builtin_coro_resume(__handle_); } _LIBCPP_HIDE_FROM_ABI void destroy() const { - _LIBCPP_ASSERT_UNCATEGORIZED(__is_suspended(), "destroy() can be called only on suspended coroutines"); + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(__is_suspended(), "destroy() can be called only on suspended coroutines"); __builtin_coro_destroy(__handle_); } diff --git a/libcxx/include/__memory_resource/polymorphic_allocator.h b/libcxx/include/__memory_resource/polymorphic_allocator.h index 3a2794931767b..cfd07bc84fe8a 100644 --- a/libcxx/include/__memory_resource/polymorphic_allocator.h +++ b/libcxx/include/__memory_resource/polymorphic_allocator.h @@ -68,7 +68,10 @@ class _LIBCPP_AVAILABILITY_PMR _LIBCPP_TEMPLATE_VIS polymorphic_allocator { } _LIBCPP_HIDE_FROM_ABI void deallocate(_ValueType* __p, size_t __n) { - _LIBCPP_ASSERT_UNCATEGORIZED(__n <= __max_size(), "deallocate called for size which exceeds max_size()"); + _LIBCPP_ASSERT_VALID_DEALLOCATION( + __n <= __max_size(), + "deallocate() called for a size which exceeds max_size(), leading to a memory leak " + "(the 
argument will overflow and result in too few objects being deleted)"); __res_->deallocate(__p, __n * sizeof(_ValueType), alignof(_ValueType)); } diff --git a/libcxx/src/filesystem/operations.cpp b/libcxx/src/filesystem/operations.cpp index 8a7d6cc94c756..6253d1551b062 100644 --- a/libcxx/src/filesystem/operations.cpp +++ b/libcxx/src/filesystem/operations.cpp @@ -460,8 +460,21 @@ path __current_path(error_code* ec) { typedef decltype(&::free) Deleter; Deleter deleter = &::free; #else + errno = 0; // Note: POSIX mandates that modifying `errno` is thread-safe. auto size = ::pathconf(".", _PC_PATH_MAX); - _LIBCPP_ASSERT_UNCATEGORIZED(size >= 0, "pathconf returned a 0 as max size"); + if (size == -1) { + if (errno != 0) { + return err.report(capture_errno(), "call to pathconf failed"); + + // `pathconf` returns `-1` without an error to indicate no limit. + } else { +# if defined(__MVS__) && !defined(PATH_MAX) + size = _XOPEN_PATH_MAX + 1; +# else + size = PATH_MAX + 1; +# endif + } + } auto buff = unique_ptr(new path::value_type[size + 1]); path::value_type* ptr = buff.get(); @@ -620,7 +633,9 @@ void __permissions(const path& p, perms prms, perm_options opts, error_code* ec) set_sym_perms = is_symlink(st); if (m_ec) return err.report(m_ec); - _LIBCPP_ASSERT_UNCATEGORIZED(st.permissions() != perms::unknown, "Permissions unexpectedly unknown"); + // TODO(hardening): double-check this assertion -- it might be a valid (if rare) case when the permissions are + // unknown. + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(st.permissions() != perms::unknown, "Permissions unexpectedly unknown"); if (add_perms) prms |= st.permissions(); else if (remove_perms) @@ -667,7 +682,7 @@ path __read_symlink(const path& p, error_code* ec) { detail::SSizeT ret; if ((ret = detail::readlink(p.c_str(), buff.get(), size)) == -1) return err.report(capture_errno()); - _LIBCPP_ASSERT_UNCATEGORIZED(ret > 0, "TODO"); + // Note that `ret` returning `0` would work, resulting in a valid empty string being returned. if (static_cast(ret) >= size) return err.report(errc::value_too_large); buff[ret] = 0; diff --git a/libcxx/src/memory_resource.cpp b/libcxx/src/memory_resource.cpp index 42c366893f736..2117238e63487 100644 --- a/libcxx/src/memory_resource.cpp +++ b/libcxx/src/memory_resource.cpp @@ -189,7 +189,8 @@ void unsynchronized_pool_resource::__adhoc_pool::__do_deallocate( return; } } - _LIBCPP_ASSERT_UNCATEGORIZED(false, "deallocating a block that was not allocated with this allocator"); + // The request to deallocate memory ends up being a no-op, likely resulting in a memory leak. + _LIBCPP_ASSERT_VALID_DEALLOCATION(false, "deallocating a block that was not allocated with this allocator"); } } diff --git a/libcxx/src/mutex.cpp b/libcxx/src/mutex.cpp index ce854757ac08d..2f8504d602dc9 100644 --- a/libcxx/src/mutex.cpp +++ b/libcxx/src/mutex.cpp @@ -36,7 +36,8 @@ bool mutex::try_lock() noexcept { return __libcpp_mutex_trylock(&__m_); } void mutex::unlock() noexcept { int ec = __libcpp_mutex_unlock(&__m_); (void)ec; - _LIBCPP_ASSERT_UNCATEGORIZED(ec == 0, "call to mutex::unlock failed"); + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL( + ec == 0, "call to mutex::unlock failed. 
A possible reason is that the mutex wasn't locked"); } // recursive_mutex @@ -50,7 +51,7 @@ recursive_mutex::recursive_mutex() { recursive_mutex::~recursive_mutex() { int e = __libcpp_recursive_mutex_destroy(&__m_); (void)e; - _LIBCPP_ASSERT_UNCATEGORIZED(e == 0, "call to ~recursive_mutex() failed"); + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL(e == 0, "call to ~recursive_mutex() failed"); } void recursive_mutex::lock() { @@ -62,7 +63,8 @@ void recursive_mutex::lock() { void recursive_mutex::unlock() noexcept { int e = __libcpp_recursive_mutex_unlock(&__m_); (void)e; - _LIBCPP_ASSERT_UNCATEGORIZED(e == 0, "call to recursive_mutex::unlock() failed"); + _LIBCPP_ASSERT_VALID_EXTERNAL_API_CALL( + e == 0, "call to recursive_mutex::unlock() failed. A possible reason is that the mutex wasn't locked"); } bool recursive_mutex::try_lock() noexcept { return __libcpp_recursive_mutex_trylock(&__m_); } diff --git a/libcxx/test/libcxx/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/debug.deallocate.pass.cpp b/libcxx/test/libcxx/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/assert.deallocate.pass.cpp similarity index 80% rename from libcxx/test/libcxx/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/debug.deallocate.pass.cpp rename to libcxx/test/libcxx/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/assert.deallocate.pass.cpp index 305b06295bc44..5a9813a232b85 100644 --- a/libcxx/test/libcxx/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/debug.deallocate.pass.cpp +++ b/libcxx/test/libcxx/utilities/utility/mem.res/mem.poly.allocator.class/mem.poly.allocator.mem/assert.deallocate.pass.cpp @@ -32,7 +32,10 @@ int main(int, char**) { const std::size_t maxSize = Traits::max_size(a); a.deallocate(nullptr, maxSize); // no assertion - TEST_LIBCPP_ASSERT_FAILURE(a.deallocate(nullptr, maxSize + 1), "deallocate called for size which exceeds max_size()"); + TEST_LIBCPP_ASSERT_FAILURE( + a.deallocate(nullptr, maxSize + 1), + "deallocate() called for a size which exceeds max_size(), leading to a memory leak " + "(the argument will overflow and result in too few objects being deleted)"); return 0; } From 37efa7038d30ea13f62ad919fec1e8c8ce35b3ef Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 22 Jan 2024 18:18:20 -0800 Subject: [PATCH 529/843] [ELF] Fix spurious warning for -z rel && -z rela Fixes: 665f913e4509e3e4f531aa4a4ebe92ec2ea5c23f --- lld/ELF/Driver.cpp | 1 + lld/test/ELF/x86-64-zrel-zrela.s | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 62e1c29504ba2..f4b7d1c9d5b97 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -560,6 +560,7 @@ static void checkZOptions(opt::InputArgList &args) { // initialized yet. Claim them here. 
args::getZOptionValue(args, OPT_z, "max-page-size", 0); args::getZOptionValue(args, OPT_z, "common-page-size", 0); + getZFlag(args, "rel", "rela", false); for (auto *arg : args.filtered(OPT_z)) if (!arg->isClaimed()) warn("unknown -z value: " + StringRef(arg->getValue())); diff --git a/lld/test/ELF/x86-64-zrel-zrela.s b/lld/test/ELF/x86-64-zrel-zrela.s index a6d9f1b52408b..98ec45b49c881 100644 --- a/lld/test/ELF/x86-64-zrel-zrela.s +++ b/lld/test/ELF/x86-64-zrel-zrela.s @@ -25,7 +25,7 @@ # RELA-NEXT: R_X86_64_JUMP_SLOT func 0x0 # RELA-NEXT: } -# RUN: ld.lld -shared -z rela -z rel %t.o -o %t2.so +# RUN: ld.lld -shared -z rela -z rel %t.o -o %t2.so --fatal-warnings # RUN: llvm-readobj -d -r -x .data %t2.so | FileCheck --check-prefix=REL %s # REL: REL {{.*}} From bffd80d6df6195c8069b655b1f2463ad8622a707 Mon Sep 17 00:00:00 2001 From: Douglas Yung Date: Mon, 22 Jan 2024 18:18:41 -0800 Subject: [PATCH 530/843] Revert "[AArch64][compiler-rt] Add memcpy, memset, memmove, memchr builtins. (#77496)" This reverts commit 3ab8d2aac7bc2dd45dda3db0b8a71fd27eefb749. This change is causing issues running lit tests on many bots including: - https://lab.llvm.org/buildbot/#/builders/197/builds/12119 - https://lab.llvm.org/buildbot/#/builders/184/builds/9792 - https://lab.llvm.org/buildbot/#/builders/93/builds/18455 - https://lab.llvm.org/buildbot/#/builders/231/builds/19858 - https://lab.llvm.org/buildbot/#/builders/121/builds/38426 - https://lab.llvm.org/buildbot/#/builders/230/builds/23990 - https://lab.llvm.org/buildbot/#/builders/57/builds/32391 - https://lab.llvm.org/buildbot/#/builders/247/builds/13502 - https://lab.llvm.org/buildbot/#/builders/275/builds/3601 - https://lab.llvm.org/buildbot/#/builders/269/builds/4211 - https://lab.llvm.org/buildbot/#/builders/18/builds/14161 - https://lab.llvm.org/buildbot/#/builders/19/builds/23893 - https://lab.llvm.org/buildbot/#/builders/37/builds/30295 - https://lab.llvm.org/buildbot/#/builders/77/builds/33979 --- compiler-rt/cmake/builtin-config-ix.cmake | 8 +- compiler-rt/lib/builtins/CMakeLists.txt | 5 +- .../lib/builtins/aarch64/sme-libc-routines.c | 87 ------------- .../test/builtins/Unit/sme-string-test.cpp | 120 ------------------ compiler-rt/test/lit.common.cfg.py | 3 - compiler-rt/test/lit.common.configured.in | 1 - .../unittests/lit.common.unit.configured.in | 1 - 7 files changed, 5 insertions(+), 220 deletions(-) delete mode 100644 compiler-rt/lib/builtins/aarch64/sme-libc-routines.c delete mode 100644 compiler-rt/test/builtins/Unit/sme-string-test.cpp diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake index b17c43bf6a68b..b40138aa011f8 100644 --- a/compiler-rt/cmake/builtin-config-ix.cmake +++ b/compiler-rt/cmake/builtin-config-ix.cmake @@ -35,12 +35,10 @@ asm(\".arch armv8-a+lse\"); asm(\"cas w0, w1, [x2]\"); ") -builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME +builtin_check_c_compiler_source(COMPILER_RT_HAS_ASM_SME " -void foo(void) __arm_streaming_compatible { - asm(\".arch armv9-a+sme\"); - asm(\"smstart\"); -} +asm(\".arch armv9-a+sme\"); +asm(\"smstart\"); ") if(ANDROID) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 28ded8766f253..378884bcaf2e5 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -560,10 +560,9 @@ set(aarch64_SOURCES aarch64/fp_mode.c ) -if(COMPILER_RT_HAS_AARCH64_SME AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD)) - 
list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c) +if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD)) + list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c) message(STATUS "AArch64 SME ABI routines enabled") - set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin") else() message(STATUS "AArch64 SME ABI routines disabled") endif() diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c deleted file mode 100644 index cd73025a19cc1..0000000000000 --- a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c +++ /dev/null @@ -1,87 +0,0 @@ -#include - -// WARNING: When building the scalar versions of these functions you need to -// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang -// from recognising a loop idiom and planting calls to memcpy! - -static void *__arm_sc_memcpy_fwd(void *dest, const void *src, - size_t n) __arm_streaming_compatible { - unsigned char *destp = (unsigned char *)dest; - const unsigned char *srcp = (const unsigned char *)src; - for (size_t i = 0; i < n; ++i) - destp[i] = srcp[i]; - - return dest; -} - -// If dest and src overlap then behaviour is undefined, hence we can add the -// restrict keywords here. This also matches the definition of the libc memcpy -// according to the man page. -void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src, - size_t n) __arm_streaming_compatible { - return __arm_sc_memcpy_fwd(dest, src, n); -} - -void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible { - unsigned char *destp = (unsigned char *)dest; - unsigned char c8 = (unsigned char)c; - for (size_t i = 0; i < n; ++i) - destp[i] = c8; - - return dest; -} - -static void *__arm_sc_memcpy_rev(void *dest, const void *src, - size_t n) __arm_streaming_compatible { - unsigned char *destp = (unsigned char *)dest; - const unsigned char *srcp = (const unsigned char *)src; - // TODO: Improve performance by copying larger chunks in reverse, or by - // using SVE. - while (n > 0) { - --n; - destp[n] = srcp[n]; - } - return dest; -} - -// Semantically a memmove is equivalent to the following: -// 1. Copy the entire contents of src to a temporary array that does not -// overlap with src or dest. -// 2. Copy the contents of the temporary array into dest. -void *__arm_sc_memmove(void *dest, const void *src, - size_t n) __arm_streaming_compatible { - unsigned char *destp = (unsigned char *)dest; - const unsigned char *srcp = (const unsigned char *)src; - - // If src and dest don't overlap then just invoke memcpy - if ((srcp > (destp + n)) || (destp > (srcp + n))) - return __arm_sc_memcpy_fwd(dest, src, n); - - // Overlap case 1: - // src: Low | -> | High - // dest: Low | -> | High - // Here src is always ahead of dest at a higher addres. If we first read a - // chunk of data from src we can safely write the same chunk to dest without - // corrupting future reads of src. - if (srcp > destp) - return __arm_sc_memcpy_fwd(dest, src, n); - - // Overlap case 2: - // src: Low | -> | High - // dest: Low | -> | High - // While we're in the overlap region we're always corrupting future reads of - // src when writing to dest. An efficient way to do this is to copy the data - // in reverse by starting at the highest address. 
- return __arm_sc_memcpy_rev(dest, src, n); -} - -const void *__arm_sc_memchr(const void *src, int c, - size_t n) __arm_streaming_compatible { - const unsigned char *srcp = (const unsigned char *)src; - unsigned char c8 = (unsigned char)c; - for (size_t i = 0; i < n; ++i) - if (srcp[i] == c8) - return &srcp[i]; - - return NULL; -} diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.cpp b/compiler-rt/test/builtins/Unit/sme-string-test.cpp deleted file mode 100644 index 3bc4559f9ae04..0000000000000 --- a/compiler-rt/test/builtins/Unit/sme-string-test.cpp +++ /dev/null @@ -1,120 +0,0 @@ -// REQUIRES: aarch64-target-arch, aarch64-sme-available -// RUN: %clangxx_builtins %s %librt -o %t && %run %t - -#include -#include -#include -#include -#include - -extern "C" { -void *__arm_sc_memcpy(void *, const void *, size_t); -void *__arm_sc_memset(void *, int, size_t); -void *__arm_sc_memmove(void *, const void *, size_t); -void *__arm_sc_memchr(const void *, int, size_t); -} - -template class Memory { -public: - uint8_t ptr[N]; - unsigned size; - - Memory(unsigned stride = 0) { - size = N; - if (stride == 0) - return; - for (unsigned i = 0; i < N; i++) - ptr[i] = i * stride; - } - - void assert_equal(const Memory &other) { - assert(N == other.size); - assert(memcmp(ptr, other.ptr, N) == 0); - } - - void assert_equal(std::initializer_list s) { - assert(N == s.size()); - auto it = s.begin(); - for (unsigned i = 0; i < N; ++i) - assert(ptr[i] == *it++); - } - - void assert_elemt_equal_at(unsigned I, uint8_t elem) { - assert(ptr[I] == elem); - } -}; - -int main() { - - // Testing memcpy from src to dst. - { - Memory<8> src(1); - Memory<8> dst; - if (!__arm_sc_memcpy(dst.ptr, src.ptr, 8)) - abort(); - dst.assert_equal(src); - dst.assert_equal({0, 1, 2, 3, 4, 5, 6, 7}); - } - - // Testing memcpy from src to dst with pointer offset. - { - Memory<8> src(1); - Memory<8> dst(1); - if (!__arm_sc_memcpy(dst.ptr + 1, src.ptr, 6)) - abort(); - dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); - } - - // Testing memchr. - { - Memory<8> src(4); - for (unsigned i = 0; i < 8; ++i) { - uint8_t e = src.ptr[i]; - uint8_t *elem = (uint8_t *)memchr(src.ptr, e, 8); - if (!elem) - abort(); - src.assert_elemt_equal_at(elem - src.ptr, *elem); - for (unsigned i = 0; i < 8; ++i) - assert(__arm_sc_memchr(src.ptr, src.ptr[i], 8) == - memchr(src.ptr, src.ptr[i], 8)); - } - } - - // Testing memset. - { - Memory<8> array; - if (!__arm_sc_memset(array.ptr, 2, 8)) - abort(); - array.assert_equal({2, 2, 2, 2, 2, 2, 2, 2}); - } - - // Testing memset with pointer offset. - { - Memory<8> array(1); - if (!__arm_sc_memset(array.ptr + 1, 2, 6)) - abort(); - array.assert_equal({0, 2, 2, 2, 2, 2, 2, 7}); - } - - // Testing memmove with a simple non-overlap case. - { - Memory<8> src(1); - Memory<8> dst(1); - if (!__arm_sc_memmove(dst.ptr + 1, src.ptr, 6)) - abort(); - dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); - } - - // Testing memove with overlap pointers dst > src, dst < src. 
- { - Memory<8> srcdst(1); - if (!__arm_sc_memmove(srcdst.ptr + 1, srcdst.ptr, 6)) - abort(); - srcdst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); - if (!__arm_sc_memmove(srcdst.ptr, srcdst.ptr + 1, 6)) - abort(); - srcdst.assert_equal({0, 1, 2, 3, 4, 5, 5, 7}); - } - - return 0; -} diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 113777b0ea8a1..1753a55508c7c 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -454,9 +454,6 @@ def get_ios_commands_dir(): if config.has_lld: config.available_features.add("lld-available") -if config.aarch64_sme: - config.available_features.add("aarch64-sme-available") - if config.use_lld: config.available_features.add("lld") diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in index b93e20e80a6ed..7c2d53520099a 100644 --- a/compiler-rt/test/lit.common.configured.in +++ b/compiler-rt/test/lit.common.configured.in @@ -50,7 +50,6 @@ set_default("gwp_asan", @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@) set_default("expensive_checks", @LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL@) set_default("test_standalone_build_libs", @COMPILER_RT_TEST_STANDALONE_BUILD_LIBS_PYBOOL@) set_default("has_compiler_rt_libatomic", @COMPILER_RT_BUILD_STANDALONE_LIBATOMIC_PYBOOL@) -set_default("aarch64_sme", @COMPILER_RT_HAS_AARCH64_SME@) # True iff the test suite supports ignoring the test compiler's runtime library path # and using `config.compiler_rt_libdir` instead. This only matters when the runtime # library paths differ. diff --git a/compiler-rt/unittests/lit.common.unit.configured.in b/compiler-rt/unittests/lit.common.unit.configured.in index 0d4785e0f0387..3e42e83c9e70a 100644 --- a/compiler-rt/unittests/lit.common.unit.configured.in +++ b/compiler-rt/unittests/lit.common.unit.configured.in @@ -7,7 +7,6 @@ config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") config.compiler_rt_src_root = "@COMPILER_RT_SOURCE_DIR@" config.compiler_rt_libdir = lit_config.substitute("@COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR@") -config.aarch64_sme = "@COMPILER_RT_HAS_AARCH64_SME@" config.enable_per_target_runtime_dir = @LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_PYBOOL@ config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@") config.host_arch = "@HOST_ARCH@" From 5c68c6d70fc204b0efdb2af95dfb328d616129e3 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Tue, 23 Jan 2024 10:23:27 +0800 Subject: [PATCH 531/843] [X86] Support encoding/decoding and lowering for APX variant SHL/SHR/SAR/ROL/ROR/RCL/RCR/SHLD/SHRD (#78853) Four variants: promoted legacy, ND (new data destination), NF (no flags update) and NF_ND (NF + ND). The syntax of NF instructions is aligned with GNU binutils. 
https://sourceware.org/pipermail/binutils/2023-September/129545.html --- llvm/lib/Target/X86/X86InstrCompiler.td | 128 +++-- llvm/lib/Target/X86/X86InstrShiftRotate.td | 439 ++++++++++++-- llvm/lib/Target/X86/X86InstrUtils.td | 64 ++- llvm/test/CodeGen/X86/apx/rol.ll | 582 +++++++++++++++++++ llvm/test/CodeGen/X86/apx/ror.ll | 638 +++++++++++++++++++++ llvm/test/CodeGen/X86/apx/sar.ll | 546 ++++++++++++++++++ llvm/test/CodeGen/X86/apx/shl.ll | 545 ++++++++++++++++++ llvm/test/CodeGen/X86/apx/shld.ll | 265 +++++++++ llvm/test/CodeGen/X86/apx/shr.ll | 546 ++++++++++++++++++ llvm/test/CodeGen/X86/apx/shrd.ll | 277 +++++++++ llvm/test/MC/Disassembler/X86/apx/rcl.txt | 194 +++++++ llvm/test/MC/Disassembler/X86/apx/rcr.txt | 194 +++++++ llvm/test/MC/Disassembler/X86/apx/rol.txt | 386 +++++++++++++ llvm/test/MC/Disassembler/X86/apx/ror.txt | 386 +++++++++++++ llvm/test/MC/Disassembler/X86/apx/sar.txt | 386 +++++++++++++ llvm/test/MC/Disassembler/X86/apx/shl.txt | 386 +++++++++++++ llvm/test/MC/Disassembler/X86/apx/shld.txt | 194 +++++++ llvm/test/MC/Disassembler/X86/apx/shr.txt | 386 +++++++++++++ llvm/test/MC/Disassembler/X86/apx/shrd.txt | 194 +++++++ llvm/test/MC/X86/apx/rcl-att.s | 146 +++++ llvm/test/MC/X86/apx/rcl-intel.s | 143 +++++ llvm/test/MC/X86/apx/rcr-att.s | 146 +++++ llvm/test/MC/X86/apx/rcr-intel.s | 143 +++++ llvm/test/MC/X86/apx/rol-att.s | 287 +++++++++ llvm/test/MC/X86/apx/rol-intel.s | 284 +++++++++ llvm/test/MC/X86/apx/ror-att.s | 287 +++++++++ llvm/test/MC/X86/apx/ror-intel.s | 284 +++++++++ llvm/test/MC/X86/apx/sar-att.s | 287 +++++++++ llvm/test/MC/X86/apx/sar-intel.s | 284 +++++++++ llvm/test/MC/X86/apx/shl-att.s | 287 +++++++++ llvm/test/MC/X86/apx/shl-intel.s | 284 +++++++++ llvm/test/MC/X86/apx/shld-att.s | 149 +++++ llvm/test/MC/X86/apx/shld-intel.s | 146 +++++ llvm/test/MC/X86/apx/shr-att.s | 287 +++++++++ llvm/test/MC/X86/apx/shr-intel.s | 284 +++++++++ llvm/test/MC/X86/apx/shrd-att.s | 149 +++++ llvm/test/MC/X86/apx/shrd-intel.s | 146 +++++ llvm/test/TableGen/x86-fold-tables.inc | 240 ++++++++ 38 files changed, 10877 insertions(+), 122 deletions(-) create mode 100644 llvm/test/CodeGen/X86/apx/rol.ll create mode 100644 llvm/test/CodeGen/X86/apx/ror.ll create mode 100644 llvm/test/CodeGen/X86/apx/sar.ll create mode 100644 llvm/test/CodeGen/X86/apx/shl.ll create mode 100644 llvm/test/CodeGen/X86/apx/shld.ll create mode 100644 llvm/test/CodeGen/X86/apx/shr.ll create mode 100644 llvm/test/CodeGen/X86/apx/shrd.ll create mode 100644 llvm/test/MC/Disassembler/X86/apx/rcl.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/rcr.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/rol.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/ror.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/sar.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/shl.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/shld.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/shr.txt create mode 100644 llvm/test/MC/Disassembler/X86/apx/shrd.txt create mode 100644 llvm/test/MC/X86/apx/rcl-att.s create mode 100644 llvm/test/MC/X86/apx/rcl-intel.s create mode 100644 llvm/test/MC/X86/apx/rcr-att.s create mode 100644 llvm/test/MC/X86/apx/rcr-intel.s create mode 100644 llvm/test/MC/X86/apx/rol-att.s create mode 100644 llvm/test/MC/X86/apx/rol-intel.s create mode 100644 llvm/test/MC/X86/apx/ror-att.s create mode 100644 llvm/test/MC/X86/apx/ror-intel.s create mode 100644 llvm/test/MC/X86/apx/sar-att.s create mode 100644 llvm/test/MC/X86/apx/sar-intel.s create mode 
100644 llvm/test/MC/X86/apx/shl-att.s create mode 100644 llvm/test/MC/X86/apx/shl-intel.s create mode 100644 llvm/test/MC/X86/apx/shld-att.s create mode 100644 llvm/test/MC/X86/apx/shld-intel.s create mode 100644 llvm/test/MC/X86/apx/shr-att.s create mode 100644 llvm/test/MC/X86/apx/shr-intel.s create mode 100644 llvm/test/MC/X86/apx/shrd-att.s create mode 100644 llvm/test/MC/X86/apx/shrd-intel.s diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td index 8e412204c989c..9f1712274bc30 100644 --- a/llvm/lib/Target/X86/X86InstrCompiler.td +++ b/llvm/lib/Target/X86/X86InstrCompiler.td @@ -1789,24 +1789,47 @@ let Predicates = [HasNDD] in { // Shift amount is implicitly masked. multiclass MaskedShiftAmountPats { // (shift x (and y, 31)) ==> (shift x, y) - def : Pat<(frag GR8:$src1, (shiftMask32 CL)), - (!cast(NAME # "8rCL") GR8:$src1)>; - def : Pat<(frag GR16:$src1, (shiftMask32 CL)), - (!cast(NAME # "16rCL") GR16:$src1)>; - def : Pat<(frag GR32:$src1, (shiftMask32 CL)), - (!cast(NAME # "32rCL") GR32:$src1)>; + // (shift x (and y, 63)) ==> (shift x, y) + let Predicates = [NoNDD] in { + def : Pat<(frag GR8:$src1, (shiftMask32 CL)), + (!cast(NAME # "8rCL") GR8:$src1)>; + def : Pat<(frag GR16:$src1, (shiftMask32 CL)), + (!cast(NAME # "16rCL") GR16:$src1)>; + def : Pat<(frag GR32:$src1, (shiftMask32 CL)), + (!cast(NAME # "32rCL") GR32:$src1)>; + def : Pat<(frag GR64:$src1, (shiftMask64 CL)), + (!cast(NAME # "64rCL") GR64:$src1)>; + } + let Predicates = [HasNDD] in { + def : Pat<(frag GR8:$src1, (shiftMask32 CL)), + (!cast(NAME # "8rCL_ND") GR8:$src1)>; + def : Pat<(frag GR16:$src1, (shiftMask32 CL)), + (!cast(NAME # "16rCL_ND") GR16:$src1)>; + def : Pat<(frag GR32:$src1, (shiftMask32 CL)), + (!cast(NAME # "32rCL_ND") GR32:$src1)>; + def : Pat<(frag GR64:$src1, (shiftMask64 CL)), + (!cast(NAME # "64rCL_ND") GR64:$src1)>; + } + def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast(NAME # "8mCL") addr:$dst)>; def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast(NAME # "16mCL") addr:$dst)>; def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast(NAME # "32mCL") addr:$dst)>; - - // (shift x (and y, 63)) ==> (shift x, y) - def : Pat<(frag GR64:$src1, (shiftMask64 CL)), - (!cast(NAME # "64rCL") GR64:$src1)>; def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst), (!cast(NAME # "64mCL") addr:$dst)>; + + let Predicates = [HasNDD] in { + def : Pat<(frag (loadi8 addr:$src), (shiftMask32 CL)), + (!cast(NAME # "8mCL_ND") addr:$src)>; + def : Pat<(frag (loadi16 addr:$src), (shiftMask32 CL)), + (!cast(NAME # "16mCL_ND") addr:$src)>; + def : Pat<(frag (loadi32 addr:$src), (shiftMask32 CL)), + (!cast(NAME # "32mCL_ND") addr:$src)>; + def : Pat<(frag (loadi64 addr:$src), (shiftMask64 CL)), + (!cast(NAME # "64mCL_ND") addr:$src)>; + } } defm SHL : MaskedShiftAmountPats; @@ -1821,47 +1844,76 @@ defm SAR : MaskedShiftAmountPats; // not tracking flags for these nodes. 
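The key fact behind the Masked*AmountPats folds used throughout this patch, as a standalone C++ sketch (illustrative, not part of the patch): x86 variable shifts consume only the low 5 bits of the count register for 8/16/32-bit operands (low 6 bits for 64-bit ones), so an explicit mask of the shift amount is redundant and the patterns can drop the AND.

  #include <cassert>
  #include <cstdint>

  // x86 masks a variable shift count to its low 5 bits for 32-bit operands
  // (6 bits for 64-bit), so the explicit mask below matches what the shift
  // instruction does anyway; the patterns here fold the AND away and emit a
  // bare shift.
  uint32_t shl_masked(uint32_t x, uint32_t count) { return x << (count & 31); }

  int main() {
    // A count of 35 behaves exactly like a count of 3, since 35 & 31 == 3.
    assert(shl_masked(1u, 35) == (1u << 3));
    return 0;
  }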
multiclass MaskedRotateAmountPats { // (rot x (and y, BitWidth - 1)) ==> (rot x, y) - def : Pat<(frag GR8:$src1, (shiftMask8 CL)), - (!cast(NAME # "8rCL") GR8:$src1)>; - def : Pat<(frag GR16:$src1, (shiftMask16 CL)), - (!cast(NAME # "16rCL") GR16:$src1)>; - def : Pat<(frag GR32:$src1, (shiftMask32 CL)), - (!cast(NAME # "32rCL") GR32:$src1)>; + let Predicates = [NoNDD] in { + def : Pat<(frag GR8:$src1, (shiftMask8 CL)), + (!cast(NAME # "8rCL") GR8:$src1)>; + def : Pat<(frag GR16:$src1, (shiftMask16 CL)), + (!cast(NAME # "16rCL") GR16:$src1)>; + def : Pat<(frag GR32:$src1, (shiftMask32 CL)), + (!cast(NAME # "32rCL") GR32:$src1)>; + def : Pat<(frag GR64:$src1, (shiftMask64 CL)), + (!cast(NAME # "64rCL") GR64:$src1)>; + } + let Predicates = [HasNDD] in { + def : Pat<(frag GR8:$src1, (shiftMask8 CL)), + (!cast(NAME # "8rCL_ND") GR8:$src1)>; + def : Pat<(frag GR16:$src1, (shiftMask16 CL)), + (!cast(NAME # "16rCL_ND") GR16:$src1)>; + def : Pat<(frag GR32:$src1, (shiftMask32 CL)), + (!cast(NAME # "32rCL_ND") GR32:$src1)>; + def : Pat<(frag GR64:$src1, (shiftMask64 CL)), + (!cast(NAME # "64rCL_ND") GR64:$src1)>; + } + def : Pat<(store (frag (loadi8 addr:$dst), (shiftMask8 CL)), addr:$dst), (!cast(NAME # "8mCL") addr:$dst)>; def : Pat<(store (frag (loadi16 addr:$dst), (shiftMask16 CL)), addr:$dst), (!cast(NAME # "16mCL") addr:$dst)>; def : Pat<(store (frag (loadi32 addr:$dst), (shiftMask32 CL)), addr:$dst), (!cast(NAME # "32mCL") addr:$dst)>; - - // (rot x (and y, 63)) ==> (rot x, y) - def : Pat<(frag GR64:$src1, (shiftMask64 CL)), - (!cast(NAME # "64rCL") GR64:$src1)>; def : Pat<(store (frag (loadi64 addr:$dst), (shiftMask64 CL)), addr:$dst), (!cast(NAME # "64mCL") addr:$dst)>; + + let Predicates = [HasNDD] in { + def : Pat<(frag (loadi8 addr:$src), (shiftMask8 CL)), + (!cast(NAME # "8mCL_ND") addr:$src)>; + def : Pat<(frag (loadi16 addr:$src), (shiftMask16 CL)), + (!cast(NAME # "16mCL_ND") addr:$src)>; + def : Pat<(frag (loadi32 addr:$src), (shiftMask32 CL)), + (!cast(NAME # "32mCL_ND") addr:$src)>; + def : Pat<(frag (loadi64 addr:$src), (shiftMask64 CL)), + (!cast(NAME # "64mCL_ND") addr:$src)>; + } } defm ROL : MaskedRotateAmountPats; defm ROR : MaskedRotateAmountPats; -// Double "funnel" shift amount is implicitly masked. -// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) (NOTE: modulo32) -def : Pat<(X86fshl GR16:$src1, GR16:$src2, (shiftMask32 CL)), - (SHLD16rrCL GR16:$src1, GR16:$src2)>; -def : Pat<(X86fshr GR16:$src2, GR16:$src1, (shiftMask32 CL)), - (SHRD16rrCL GR16:$src1, GR16:$src2)>; - -// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) -def : Pat<(fshl GR32:$src1, GR32:$src2, (shiftMask32 CL)), - (SHLD32rrCL GR32:$src1, GR32:$src2)>; -def : Pat<(fshr GR32:$src2, GR32:$src1, (shiftMask32 CL)), - (SHRD32rrCL GR32:$src1, GR32:$src2)>; - -// (fshl/fshr x (and y, 63)) ==> (fshl/fshr x, y) -def : Pat<(fshl GR64:$src1, GR64:$src2, (shiftMask64 CL)), - (SHLD64rrCL GR64:$src1, GR64:$src2)>; -def : Pat<(fshr GR64:$src2, GR64:$src1, (shiftMask64 CL)), - (SHRD64rrCL GR64:$src1, GR64:$src2)>; +multiclass MaskedShlrdAmountPats { + let Predicates = [p] in { + // Double "funnel" shift amount is implicitly masked. 
+ // (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) (NOTE: modulo32) + def : Pat<(X86fshl GR16:$src1, GR16:$src2, (shiftMask32 CL)), + (!cast(SHLD16rrCL#suffix) GR16:$src1, GR16:$src2)>; + def : Pat<(X86fshr GR16:$src2, GR16:$src1, (shiftMask32 CL)), + (!cast(SHRD16rrCL#suffix) GR16:$src1, GR16:$src2)>; + + // (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) + def : Pat<(fshl GR32:$src1, GR32:$src2, (shiftMask32 CL)), + (!cast(SHLD32rrCL#suffix) GR32:$src1, GR32:$src2)>; + def : Pat<(fshr GR32:$src2, GR32:$src1, (shiftMask32 CL)), + (!cast(SHRD32rrCL#suffix) GR32:$src1, GR32:$src2)>; + + // (fshl/fshr x (and y, 63)) ==> (fshl/fshr x, y) + def : Pat<(fshl GR64:$src1, GR64:$src2, (shiftMask64 CL)), + (!cast(SHLD64rrCL#suffix) GR64:$src1, GR64:$src2)>; + def : Pat<(fshr GR64:$src2, GR64:$src1, (shiftMask64 CL)), + (!cast(SHRD64rrCL#suffix) GR64:$src1, GR64:$src2)>; + } +} + +defm : MaskedShlrdAmountPats<"", NoNDD>; +defm : MaskedShlrdAmountPats<"_ND", HasNDD>; // Use BTR/BTS/BTC for clearing/setting/toggling a bit in a variable location. multiclass OneBitPats, Sched<[ri]>, DefEFLAGS; - def 16ri : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS, OpSize16; - def 32ri : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS, OpSize32; - def 64ri : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS; + let Predicates = [NoNDD] in { + def 8ri : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS; + def 16ri : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS, OpSize16; + def 32ri : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS, OpSize32; + def 64ri : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS; + } + let Predicates = [HasNDD, In64BitMode] in { + def 8ri_ND : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS; + def 16ri_ND : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS, PD; + def 32ri_ND : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS; + def 64ri_ND : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS; + } + let Predicates = [In64BitMode] in { + def 8ri_EVEX : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS, PL; + def 16ri_EVEX : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS, PL, PD; + def 32ri_EVEX : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS, PL; + def 64ri_EVEX : BinOpRI8U_R, Sched<[ri]>, DefEFLAGS, PL; + } } def 8mi : BinOpMI8U_M, Sched<[mi, WriteRMW]>, DefEFLAGS; def 16mi : BinOpMI8U_M, Sched<[mi, WriteRMW]>, DefEFLAGS, OpSize16; def 32mi : BinOpMI8U_M, Sched<[mi, WriteRMW]>, DefEFLAGS, OpSize32; def 64mi : BinOpMI8U_M, Sched<[mi, WriteRMW]>, DefEFLAGS, Requires<[In64BitMode]>; + let Predicates = [HasNDD, In64BitMode] in { + def 8mi_ND : BinOpMI8U_R, Sched<[mi, ri]>, DefEFLAGS; + def 16mi_ND : BinOpMI8U_R, Sched<[mi, ri]>, DefEFLAGS, PD; + def 32mi_ND : BinOpMI8U_R, Sched<[mi, ri]>, DefEFLAGS; + def 64mi_ND : BinOpMI8U_R, Sched<[mi, ri]>, DefEFLAGS; + } + let Predicates = [In64BitMode] in { + def 8mi_EVEX : BinOpMI8U_M, Sched<[mi, WriteRMW]>, DefEFLAGS, PL; + def 16mi_EVEX : BinOpMI8U_M, Sched<[mi, WriteRMW]>, DefEFLAGS, PL, PD; + def 32mi_EVEX : BinOpMI8U_M, Sched<[mi, WriteRMW]>, DefEFLAGS, PL; + def 64mi_EVEX : BinOpMI8U_M, Sched<[mi, WriteRMW]>, DefEFLAGS, PL; + } let SchedRW = [ri] in { - def 8r1 : UnaryOpR_RF<0xD1, RegMRM, m, Xi8, null_frag>; - def 16r1 : UnaryOpR_RF<0xD1, RegMRM, m, Xi16, null_frag>, OpSize16; - def 32r1 : UnaryOpR_RF<0xD1, RegMRM, m, Xi32, null_frag>, OpSize32; - def 64r1 : UnaryOpR_RF<0xD1, RegMRM, m, Xi64, null_frag>; + def 8r1 : UnaryOpR_RF<0xD1, RegMRM, m, Xi8>; + def 16r1 : UnaryOpR_RF<0xD1, RegMRM, m, Xi16>, OpSize16; + def 32r1 : UnaryOpR_RF<0xD1, RegMRM, m, Xi32>, OpSize32; + def 64r1 : UnaryOpR_RF<0xD1, RegMRM, m, Xi64>; + + // FIXME: Assembler can't tell whether it's 8r1_ND or 8rCL when the source register is cl, e.g. 
+ // + // shlb %cl, %al + // + // GNU binutils distinguish them by adding an explicit $1 to asm string of 8r1_ND. But we haven't support + // constant immediate in asm string for X86 in TD. So we add DisassembleOnly for 8r1_ND for the time being. + let Predicates = [In64BitMode] in { + def 8r1_ND : UnaryOpR_RF<0xD1, RegMRM, m, Xi8, null_frag, 1>, DisassembleOnly; + def 16r1_ND : UnaryOpR_RF<0xD1, RegMRM, m, Xi16, null_frag, 1>, PD; + def 32r1_ND : UnaryOpR_RF<0xD1, RegMRM, m, Xi32, null_frag, 1>; + def 64r1_ND : UnaryOpR_RF<0xD1, RegMRM, m, Xi64, null_frag, 1>; + + def 8r1_EVEX : UnaryOpR_RF<0xD1, RegMRM, m, Xi8>, PL; + def 16r1_EVEX : UnaryOpR_RF<0xD1, RegMRM, m, Xi16>, PL, PD; + def 32r1_EVEX : UnaryOpR_RF<0xD1, RegMRM, m, Xi32>, PL; + def 64r1_EVEX : UnaryOpR_RF<0xD1, RegMRM, m, Xi64>, PL; + } } let SchedRW = [mi, WriteRMW] in { - def 8m1 : UnaryOpM_MF<0xD1, MemMRM, m, Xi8, null_frag>; - def 16m1 : UnaryOpM_MF<0xD1, MemMRM, m, Xi16, null_frag>, OpSize16; - def 32m1 : UnaryOpM_MF<0xD1, MemMRM, m, Xi32, null_frag>, OpSize32; - def 64m1 : UnaryOpM_MF<0xD1, MemMRM, m, Xi64, null_frag>, Requires<[In64BitMode]>; + def 8m1 : UnaryOpM_MF<0xD1, MemMRM, m, Xi8>; + def 16m1 : UnaryOpM_MF<0xD1, MemMRM, m, Xi16>, OpSize16; + def 32m1 : UnaryOpM_MF<0xD1, MemMRM, m, Xi32>, OpSize32; + def 64m1 : UnaryOpM_MF<0xD1, MemMRM, m, Xi64>, Requires<[In64BitMode]>; + + let Predicates = [In64BitMode] in { + def 8m1_EVEX : UnaryOpM_MF<0xD1, MemMRM, m, Xi8>, PL; + def 16m1_EVEX : UnaryOpM_MF<0xD1, MemMRM, m, Xi16>, PL, PD; + def 32m1_EVEX : UnaryOpM_MF<0xD1, MemMRM, m, Xi32>, PL; + def 64m1_EVEX : UnaryOpM_MF<0xD1, MemMRM, m, Xi64>, PL; + } + } + let SchedRW = [mi, ri], Predicates = [In64BitMode] in { + def 8m1_ND : UnaryOpM_RF<0xD1, MemMRM, m, Xi8>; + def 16m1_ND : UnaryOpM_RF<0xD1, MemMRM, m, Xi16>, PD; + def 32m1_ND : UnaryOpM_RF<0xD1, MemMRM, m, Xi32>; + def 64m1_ND : UnaryOpM_RF<0xD1, MemMRM, m, Xi64>; } } - let Uses = !listconcat([CL], uses) in { - def 8rCL : BinOpRC_R, Sched<[rCL]>, DefEFLAGS; - def 16rCL : BinOpRC_R, Sched<[rCL]>, DefEFLAGS, OpSize16; - def 32rCL : BinOpRC_R, Sched<[rCL]>, DefEFLAGS, OpSize32; - def 64rCL : BinOpRC_R, Sched<[rCL]>, DefEFLAGS; + let Uses = !listconcat([CL], uses), Defs = [EFLAGS] in { + let Predicates = [NoNDD] in { + def 8rCL : BinOpRC_R, Sched<[rCL]>; + def 16rCL : BinOpRC_R, Sched<[rCL]>, OpSize16; + def 32rCL : BinOpRC_R, Sched<[rCL]>, OpSize32; + def 64rCL : BinOpRC_R, Sched<[rCL]>; + } + let Predicates = [HasNDD, In64BitMode] in { + def 8rCL_ND : BinOpRC_R, Sched<[rCL]>; + def 16rCL_ND : BinOpRC_R, Sched<[rCL]>, PD; + def 32rCL_ND : BinOpRC_R, Sched<[rCL]>; + def 64rCL_ND : BinOpRC_R, Sched<[rCL]>; + } + let Predicates = [In64BitMode] in { + def 8rCL_EVEX : BinOpRC_R, Sched<[rCL]>, PL; + def 16rCL_EVEX : BinOpRC_R, Sched<[rCL]>, PL, PD; + def 32rCL_EVEX : BinOpRC_R, Sched<[rCL]>, PL; + def 64rCL_EVEX : BinOpRC_R, Sched<[rCL]>, PL; + } + + def 8mCL : BinOpMC_M, Sched<[mCL, WriteRMW]>; + def 16mCL : BinOpMC_M, Sched<[mCL, WriteRMW]>, OpSize16; + def 32mCL : BinOpMC_M, Sched<[mCL, WriteRMW]>, OpSize32; + def 64mCL : BinOpMC_M, Sched<[mCL, WriteRMW]>, Requires<[In64BitMode]>; + + let Predicates = [HasNDD, In64BitMode] in { + def 8mCL_ND : BinOpMC_R, Sched<[mCL, rCL]>; + def 16mCL_ND : BinOpMC_R, Sched<[mCL, rCL]>, PD; + def 32mCL_ND : BinOpMC_R, Sched<[mCL, rCL]>; + def 64mCL_ND : BinOpMC_R, Sched<[mCL, rCL]>; + } - def 8mCL : BinOpMC_M, Sched<[mCL, WriteRMW]>, DefEFLAGS; - def 16mCL : BinOpMC_M, Sched<[mCL, WriteRMW]>, DefEFLAGS, OpSize16; - def 32mCL : BinOpMC_M, 
Sched<[mCL, WriteRMW]>, DefEFLAGS, OpSize32; - def 64mCL : BinOpMC_M, Sched<[mCL, WriteRMW]>, DefEFLAGS, Requires<[In64BitMode]>; + let Predicates = [In64BitMode] in { + def 8mCL_EVEX : BinOpMC_M, Sched<[mCL, WriteRMW]>, PL; + def 16mCL_EVEX : BinOpMC_M, Sched<[mCL, WriteRMW]>, PL, PD; + def 32mCL_EVEX : BinOpMC_M, Sched<[mCL, WriteRMW]>, PL; + def 64mCL_EVEX : BinOpMC_M, Sched<[mCL, WriteRMW]>, PL; + } } } +multiclass ShiftRotate_NF { + let Predicates = [In64BitMode] in { + let isConvertibleToThreeAddress = !if(!eq(m, "shl"), 1, 0) in { + def 8ri_NF : BinOpRI8U_R, Sched<[ri]>, NF; + def 16ri_NF : BinOpRI8U_R, Sched<[ri]>, NF, PD; + def 32ri_NF : BinOpRI8U_R, Sched<[ri]>, NF; + def 64ri_NF : BinOpRI8U_R, Sched<[ri]>, NF; + + def 8ri_NF_ND : BinOpRI8U_R, Sched<[ri]>, EVEX_NF; + def 16ri_NF_ND : BinOpRI8U_R, Sched<[ri]>, EVEX_NF, PD; + def 32ri_NF_ND : BinOpRI8U_R, Sched<[ri]>, EVEX_NF; + def 64ri_NF_ND : BinOpRI8U_R, Sched<[ri]>, EVEX_NF; + } + + def 8mi_NF : BinOpMI8U_M, Sched<[mi, WriteRMW]>, NF; + def 16mi_NF : BinOpMI8U_M, Sched<[mi, WriteRMW]>, NF, PD; + def 32mi_NF : BinOpMI8U_M, Sched<[mi, WriteRMW]>, NF; + def 64mi_NF : BinOpMI8U_M, Sched<[mi, WriteRMW]>, NF; + + def 8mi_NF_ND : BinOpMI8U_R, Sched<[mi, ri]>, EVEX_NF; + def 16mi_NF_ND : BinOpMI8U_R, Sched<[mi, ri]>, EVEX_NF, PD; + def 32mi_NF_ND : BinOpMI8U_R, Sched<[mi, ri]>, EVEX_NF; + def 64mi_NF_ND : BinOpMI8U_R, Sched<[mi, ri]>, EVEX_NF; + + let SchedRW = [ri] in { + // FIXME: Assembler can't tell whether it's 8r1_NF_ND or 8rCL_NF when the source register is cl, e.g. + // + // {nf} shlb %cl, %al + // + // GNU binutils distinguish them by adding an explicit $1 to asm string of 8r1_NF_ND. But we haven't support + // constant immediate in asm string for X86 in TD. So we add DisassembleOnly for 8r1_NF_ND for the time being. 
+ def 8r1_NF : UnaryOpR_R<0xD1, RegMRM, m, Xi8>, NF; + def 16r1_NF : UnaryOpR_R<0xD1, RegMRM, m, Xi16>, NF, PD; + def 32r1_NF : UnaryOpR_R<0xD1, RegMRM, m, Xi32>, NF; + def 64r1_NF : UnaryOpR_R<0xD1, RegMRM, m, Xi64>, NF; + + def 8r1_NF_ND : UnaryOpR_R<0xD1, RegMRM, m, Xi8, null_frag, 1>, EVEX_NF, DisassembleOnly; + def 16r1_NF_ND : UnaryOpR_R<0xD1, RegMRM, m, Xi16, null_frag, 1>, EVEX_NF, PD; + def 32r1_NF_ND : UnaryOpR_R<0xD1, RegMRM, m, Xi32, null_frag, 1>, EVEX_NF; + def 64r1_NF_ND : UnaryOpR_R<0xD1, RegMRM, m, Xi64, null_frag, 1>, EVEX_NF; + } + + let SchedRW = [mi, WriteRMW] in { + def 8m1_NF : UnaryOpM_M<0xD1, MemMRM, m, Xi8>, NF; + def 16m1_NF : UnaryOpM_M<0xD1, MemMRM, m, Xi16>, NF, PD; + def 32m1_NF : UnaryOpM_M<0xD1, MemMRM, m, Xi32>, NF; + def 64m1_NF : UnaryOpM_M<0xD1, MemMRM, m, Xi64>, NF; + } + let SchedRW = [mi, ri] in { + def 8m1_NF_ND : UnaryOpM_R<0xD1, MemMRM, m, Xi8>, EVEX_NF; + def 16m1_NF_ND : UnaryOpM_R<0xD1, MemMRM, m, Xi16>, EVEX_NF, PD; + def 32m1_NF_ND : UnaryOpM_R<0xD1, MemMRM, m, Xi32>, EVEX_NF; + def 64m1_NF_ND : UnaryOpM_R<0xD1, MemMRM, m, Xi64>, EVEX_NF; + } + + let Uses = [CL] in { + def 8rCL_NF : BinOpRC_R, Sched<[rCL]>, NF; + def 16rCL_NF : BinOpRC_R, Sched<[rCL]>, NF, PD; + def 32rCL_NF : BinOpRC_R, Sched<[rCL]>, NF; + def 64rCL_NF : BinOpRC_R, Sched<[rCL]>, NF; + + def 8rCL_NF_ND : BinOpRC_R, Sched<[rCL]>, EVEX_NF; + def 16rCL_NF_ND : BinOpRC_R, Sched<[rCL]>, EVEX_NF, PD; + def 32rCL_NF_ND : BinOpRC_R, Sched<[rCL]>, EVEX_NF; + def 64rCL_NF_ND : BinOpRC_R, Sched<[rCL]>, EVEX_NF; + + def 8mCL_NF : BinOpMC_M, Sched<[mCL, WriteRMW]>, NF; + def 16mCL_NF : BinOpMC_M, Sched<[mCL, WriteRMW]>, NF, PD; + def 32mCL_NF : BinOpMC_M, Sched<[mCL, WriteRMW]>, NF; + def 64mCL_NF : BinOpMC_M, Sched<[mCL, WriteRMW]>, NF; + + def 8mCL_NF_ND : BinOpMC_R, Sched<[mCL, rCL]>, EVEX_NF; + def 16mCL_NF_ND : BinOpMC_R, Sched<[mCL, rCL]>, EVEX_NF, PD; + def 32mCL_NF_ND : BinOpMC_R, Sched<[mCL, rCL]>, EVEX_NF; + def 64mCL_NF_ND : BinOpMC_R, Sched<[mCL, rCL]>, EVEX_NF; + } + } +} defm SHL: ShiftRotate<"shl", MRM4r, MRM4m, shl, WriteShiftCL, WriteShift, WriteShiftCLLd, WriteShiftLd>; defm SHR: ShiftRotate<"shr", MRM5r, MRM5m, srl, WriteShiftCL, WriteShift, WriteShiftCLLd, WriteShiftLd>; defm SAR: ShiftRotate<"sar", MRM7r, MRM7m, sra, WriteShiftCL, WriteShift, WriteShiftCLLd, WriteShiftLd>; @@ -68,15 +232,34 @@ defm ROR: ShiftRotate<"ror", MRM1r, MRM1m, rotr, WriteRotateCL, WriteRotate, Wri defm RCL: ShiftRotate<"rcl", MRM2r, MRM2m, null_frag, WriteRotateCL, WriteRotate, WriteRotateCLLd, WriteRotateLd, [EFLAGS]>; defm RCR: ShiftRotate<"rcr", MRM3r, MRM3m, null_frag, WriteRotateCL, WriteRotate, WriteRotateCLLd, WriteRotateLd, [EFLAGS]>; +defm SHL: ShiftRotate_NF<"shl", MRM4r, MRM4m, WriteShiftCL, WriteShift, WriteShiftCLLd, WriteShiftLd>; +defm SHR: ShiftRotate_NF<"shr", MRM5r, MRM5m, WriteShiftCL, WriteShift, WriteShiftCLLd, WriteShiftLd>; +defm SAR: ShiftRotate_NF<"sar", MRM7r, MRM7m, WriteShiftCL, WriteShift, WriteShiftCLLd, WriteShiftLd>; + +defm ROL: ShiftRotate_NF<"rol", MRM0r, MRM0m, WriteRotateCL, WriteRotate, WriteRotateCLLd, WriteRotateLd>; +defm ROR: ShiftRotate_NF<"ror", MRM1r, MRM1m, WriteRotateCL, WriteRotate, WriteRotateCLLd, WriteRotateLd>; + // Use the opposite rotate if allows us to use the rotate by 1 instruction. 
-def : Pat<(rotl GR8:$src1, (i8 7)), (ROR8r1 GR8:$src1)>; -def : Pat<(rotl GR16:$src1, (i8 15)), (ROR16r1 GR16:$src1)>; -def : Pat<(rotl GR32:$src1, (i8 31)), (ROR32r1 GR32:$src1)>; -def : Pat<(rotl GR64:$src1, (i8 63)), (ROR64r1 GR64:$src1)>; -def : Pat<(rotr GR8:$src1, (i8 7)), (ROL8r1 GR8:$src1)>; -def : Pat<(rotr GR16:$src1, (i8 15)), (ROL16r1 GR16:$src1)>; -def : Pat<(rotr GR32:$src1, (i8 31)), (ROL32r1 GR32:$src1)>; -def : Pat<(rotr GR64:$src1, (i8 63)), (ROL64r1 GR64:$src1)>; +let Predicates = [NoNDD] in { + def : Pat<(rotl GR8:$src1, (i8 7)), (ROR8r1 GR8:$src1)>; + def : Pat<(rotl GR16:$src1, (i8 15)), (ROR16r1 GR16:$src1)>; + def : Pat<(rotl GR32:$src1, (i8 31)), (ROR32r1 GR32:$src1)>; + def : Pat<(rotl GR64:$src1, (i8 63)), (ROR64r1 GR64:$src1)>; + def : Pat<(rotr GR8:$src1, (i8 7)), (ROL8r1 GR8:$src1)>; + def : Pat<(rotr GR16:$src1, (i8 15)), (ROL16r1 GR16:$src1)>; + def : Pat<(rotr GR32:$src1, (i8 31)), (ROL32r1 GR32:$src1)>; + def : Pat<(rotr GR64:$src1, (i8 63)), (ROL64r1 GR64:$src1)>; +} +let Predicates = [HasNDD] in { + def : Pat<(rotl GR8:$src1, (i8 7)), (ROR8r1_ND GR8:$src1)>; + def : Pat<(rotl GR16:$src1, (i8 15)), (ROR16r1_ND GR16:$src1)>; + def : Pat<(rotl GR32:$src1, (i8 31)), (ROR32r1_ND GR32:$src1)>; + def : Pat<(rotl GR64:$src1, (i8 63)), (ROR64r1_ND GR64:$src1)>; + def : Pat<(rotr GR8:$src1, (i8 7)), (ROL8r1_ND GR8:$src1)>; + def : Pat<(rotr GR16:$src1, (i8 15)), (ROL16r1_ND GR16:$src1)>; + def : Pat<(rotr GR32:$src1, (i8 31)), (ROL32r1_ND GR32:$src1)>; + def : Pat<(rotr GR64:$src1, (i8 63)), (ROL64r1_ND GR64:$src1)>; +} def : Pat<(store (rotl (loadi8 addr:$dst), (i8 7)), addr:$dst), (ROR8m1 addr:$dst)>; @@ -96,34 +279,74 @@ def : Pat<(store (rotr (loadi32 addr:$dst), (i8 31)), addr:$dst), def : Pat<(store (rotr (loadi64 addr:$dst), (i8 63)), addr:$dst), (ROL64m1 addr:$dst)>, Requires<[In64BitMode]>; +let Predicates = [HasNDD] in { +def : Pat<(rotl (loadi8 addr:$src), (i8 7)), + (ROR8m1_ND addr:$src)>; +def : Pat<(rotl (loadi16 addr:$src), (i8 15)), + (ROR16m1_ND addr:$src)>; +def : Pat<(rotl (loadi32 addr:$src), (i8 31)), + (ROR32m1_ND addr:$src)>; +def : Pat<(rotl (loadi64 addr:$src), (i8 63)), + (ROR64m1_ND addr:$src)>; + +def : Pat<(rotr (loadi8 addr:$src), (i8 7)), + (ROL8m1_ND addr:$src)>; +def : Pat<(rotr (loadi16 addr:$src), (i8 15)), + (ROL16m1_ND addr:$src)>; +def : Pat<(rotr (loadi32 addr:$src), (i8 31)), + (ROL32m1_ND addr:$src)>; +def : Pat<(rotr (loadi64 addr:$src), (i8 63)), + (ROL64m1_ND addr:$src)>; +} // Patterns for rotate with relocImm for the immediate field. 
-def : Pat<(rotl GR8:$src1, (i8 relocImm:$src2)), - (ROL8ri GR8:$src1, relocImm:$src2)>; -def : Pat<(rotl GR16:$src1, (i8 relocImm:$src2)), - (ROL16ri GR16:$src1, relocImm:$src2)>; -def : Pat<(rotl GR32:$src1, (i8 relocImm:$src2)), - (ROL32ri GR32:$src1, relocImm:$src2)>; -def : Pat<(rotl GR64:$src1, (i8 relocImm:$src2)), - (ROL64ri GR64:$src1, relocImm:$src2)>; - -def : Pat<(rotr GR8:$src1, (i8 relocImm:$src2)), - (ROR8ri GR8:$src1, relocImm:$src2)>; -def : Pat<(rotr GR16:$src1, (i8 relocImm:$src2)), - (ROR16ri GR16:$src1, relocImm:$src2)>; -def : Pat<(rotr GR32:$src1, (i8 relocImm:$src2)), - (ROR32ri GR32:$src1, relocImm:$src2)>; -def : Pat<(rotr GR64:$src1, (i8 relocImm:$src2)), - (ROR64ri GR64:$src1, relocImm:$src2)>; +let Predicates = [NoNDD] in { + def : Pat<(rotl GR8:$src1, (i8 relocImm:$src2)), + (ROL8ri GR8:$src1, relocImm:$src2)>; + def : Pat<(rotl GR16:$src1, (i8 relocImm:$src2)), + (ROL16ri GR16:$src1, relocImm:$src2)>; + def : Pat<(rotl GR32:$src1, (i8 relocImm:$src2)), + (ROL32ri GR32:$src1, relocImm:$src2)>; + def : Pat<(rotl GR64:$src1, (i8 relocImm:$src2)), + (ROL64ri GR64:$src1, relocImm:$src2)>; + + def : Pat<(rotr GR8:$src1, (i8 relocImm:$src2)), + (ROR8ri GR8:$src1, relocImm:$src2)>; + def : Pat<(rotr GR16:$src1, (i8 relocImm:$src2)), + (ROR16ri GR16:$src1, relocImm:$src2)>; + def : Pat<(rotr GR32:$src1, (i8 relocImm:$src2)), + (ROR32ri GR32:$src1, relocImm:$src2)>; + def : Pat<(rotr GR64:$src1, (i8 relocImm:$src2)), + (ROR64ri GR64:$src1, relocImm:$src2)>; +} +let Predicates = [HasNDD] in { + def : Pat<(rotl GR8:$src1, (i8 relocImm:$src2)), + (ROL8ri_ND GR8:$src1, relocImm:$src2)>; + def : Pat<(rotl GR16:$src1, (i8 relocImm:$src2)), + (ROL16ri_ND GR16:$src1, relocImm:$src2)>; + def : Pat<(rotl GR32:$src1, (i8 relocImm:$src2)), + (ROL32ri_ND GR32:$src1, relocImm:$src2)>; + def : Pat<(rotl GR64:$src1, (i8 relocImm:$src2)), + (ROL64ri_ND GR64:$src1, relocImm:$src2)>; + + def : Pat<(rotr GR8:$src1, (i8 relocImm:$src2)), + (ROR8ri_ND GR8:$src1, relocImm:$src2)>; + def : Pat<(rotr GR16:$src1, (i8 relocImm:$src2)), + (ROR16ri_ND GR16:$src1, relocImm:$src2)>; + def : Pat<(rotr GR32:$src1, (i8 relocImm:$src2)), + (ROR32ri_ND GR32:$src1, relocImm:$src2)>; + def : Pat<(rotr GR64:$src1, (i8 relocImm:$src2)), + (ROR64ri_ND GR64:$src1, relocImm:$src2)>; +} //===----------------------------------------------------------------------===// // Double precision shift instructions (generalizations of rotate) //===----------------------------------------------------------------------===// -class ShlrdOpRRI8U_R o, string m, X86TypeInfo t, SDPatternOperator node> +class ShlrdOpRRI8U_R o, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0> : ITy, NDD<0, TB> { + (ins t.RegClass:$src1, t.RegClass:$src2, u8imm:$src3), m, !if(!eq(ndd, 0), triop_args, triop_ndd_args), + []>, NDD { let isCommutable = 1; let ImmT = Imm8; let SchedRW = [WriteSHDrri]; @@ -132,8 +355,8 @@ class ShlrdOpRRI8U_R o, string m, X86TypeInfo t, SDPatternOperator node> [(set t.RegClass:$dst, (node t.RegClass:$src2, t.RegClass:$src1, (i8 imm:$src3)))]); } -class ShlrdOpRRC_R o, string m, X86TypeInfo t, SDPatternOperator node> - : BinOpRR, NDD<0, TB> { +class ShlrdOpRRC_R o, string m, X86TypeInfo t, SDPatternOperator node = null_frag, bit ndd = 0> + : BinOpRR, NDD { let Uses = [CL]; let SchedRW = [WriteSHDrrcl]; let Pattern = !if(!eq(m, "shld"), @@ -141,7 +364,7 @@ class ShlrdOpRRC_R o, string m, X86TypeInfo t, SDPatternOperator node> [(set t.RegClass:$dst, (node t.RegClass:$src2, t.RegClass:$src1, CL))]); } 
-class ShlrdOpMRI8U_M o, string m, X86TypeInfo t, SDPatternOperator node> +class ShlrdOpMRI8U_M o, string m, X86TypeInfo t, SDPatternOperator node = null_frag> : ITy, TB { let ImmT = Imm8; @@ -153,7 +376,7 @@ class ShlrdOpMRI8U_M o, string m, X86TypeInfo t, SDPatternOperator node> [(store (node t.RegClass:$src2, (t.LoadNode addr:$src1), (i8 imm:$src3)), addr:$src1)]); } -class ShlrdOpMRC_M o, string m, X86TypeInfo t, SDPatternOperator node> +class ShlrdOpMRC_M o, string m, X86TypeInfo t, SDPatternOperator node = null_frag> : BinOpMR, TB { let Uses = [CL]; let SchedRW = [WriteSHDmrcl]; @@ -163,15 +386,71 @@ class ShlrdOpMRC_M o, string m, X86TypeInfo t, SDPatternOperator node> [(store (node t.RegClass:$src2, (t.LoadNode addr:$src1), CL), addr:$src1)]); } -multiclass Shlrd o1, bits<8> o2, string m, SDPatternOperator node, SDPatternOperator t_node> { +class ShlrdOpMRI8U_R o, string m, X86TypeInfo t, SDPatternOperator node = null_frag> + : ITy, NDD<1> { + let ImmT = Imm8; + let SchedRW = [WriteSHDmri]; + let mayLoad = 1; + let Pattern = !if(!eq(m, "shld"), + [(set t.RegClass:$dst, (node (t.LoadNode addr:$src1), t.RegClass:$src2, (i8 imm:$src3)))], + [(set t.RegClass:$dst, (node t.RegClass:$src2, (t.LoadNode addr:$src1), (i8 imm:$src3)))]); +} + +class ShlrdOpMRC_R o, string m, X86TypeInfo t, SDPatternOperator node = null_frag> + : BinOpMR, NDD<1> { + let Uses = [CL]; + let SchedRW = [WriteSHDmrcl]; + let Pattern = !if(!eq(m, "shld"), + [(set t.RegClass:$dst, (node (t.LoadNode addr:$src1), t.RegClass:$src2, CL))], + [(set t.RegClass:$dst, (node t.RegClass:$src2, (t.LoadNode addr:$src1), CL))]); +} + +multiclass Shlrd o1, bits<8> o2, bits<8> o3, string m, SDPatternOperator node, SDPatternOperator t_node> { + let Predicates = [NoNDD] in { + def 16rri8 : ShlrdOpRRI8U_R, TB, DefEFLAGS, OpSize16; + def 32rri8 : ShlrdOpRRI8U_R, TB, DefEFLAGS, OpSize32; + def 64rri8 : ShlrdOpRRI8U_R, TB, DefEFLAGS; + + def 16rrCL : ShlrdOpRRC_R, TB, DefEFLAGS, OpSize16; + def 32rrCL : ShlrdOpRRC_R, TB, DefEFLAGS, OpSize32; + def 64rrCL : ShlrdOpRRC_R, TB, DefEFLAGS; + } + let Predicates = [HasNDD, In64BitMode] in { + def 16rri8_ND : ShlrdOpRRI8U_R, DefEFLAGS, PD; + def 32rri8_ND : ShlrdOpRRI8U_R, DefEFLAGS; + def 64rri8_ND : ShlrdOpRRI8U_R, DefEFLAGS; + + def 16rrCL_ND : ShlrdOpRRC_R, DefEFLAGS, PD; + def 32rrCL_ND : ShlrdOpRRC_R, DefEFLAGS; + def 64rrCL_ND : ShlrdOpRRC_R, DefEFLAGS; + } + + let Predicates = [In64BitMode] in { + def 16rri8_NF : ShlrdOpRRI8U_R, NF, PD; + def 32rri8_NF : ShlrdOpRRI8U_R, NF; + def 64rri8_NF : ShlrdOpRRI8U_R, NF; + + def 16rrCL_NF : ShlrdOpRRC_R, NF, PD; + def 32rrCL_NF : ShlrdOpRRC_R, NF; + def 64rrCL_NF : ShlrdOpRRC_R, NF; + + def 16rri8_NF_ND : ShlrdOpRRI8U_R, EVEX_NF, PD; + def 32rri8_NF_ND : ShlrdOpRRI8U_R, EVEX_NF; + def 64rri8_NF_ND : ShlrdOpRRI8U_R, EVEX_NF; + + def 16rrCL_NF_ND : ShlrdOpRRC_R, EVEX_NF, PD; + def 32rrCL_NF_ND : ShlrdOpRRC_R, EVEX_NF; + def 64rrCL_NF_ND : ShlrdOpRRC_R, EVEX_NF; - def 16rri8 : ShlrdOpRRI8U_R, DefEFLAGS, OpSize16; - def 32rri8 : ShlrdOpRRI8U_R, DefEFLAGS, OpSize32; - def 64rri8 : ShlrdOpRRI8U_R, DefEFLAGS; + def 16rri8_EVEX : ShlrdOpRRI8U_R, DefEFLAGS, PL, PD; + def 32rri8_EVEX : ShlrdOpRRI8U_R, DefEFLAGS, PL; + def 64rri8_EVEX : ShlrdOpRRI8U_R, DefEFLAGS, PL; - def 16rrCL : ShlrdOpRRC_R, DefEFLAGS, OpSize16; - def 32rrCL : ShlrdOpRRC_R, DefEFLAGS, OpSize32; - def 64rrCL : ShlrdOpRRC_R, DefEFLAGS; + def 16rrCL_EVEX : ShlrdOpRRC_R, DefEFLAGS, PL, PD; + def 32rrCL_EVEX : ShlrdOpRRC_R, DefEFLAGS, PL; + def 64rrCL_EVEX : ShlrdOpRRC_R, DefEFLAGS, PL; + } 
   def 16mri8 : ShlrdOpMRI8U_M, DefEFLAGS, OpSize16;
   def 32mri8 : ShlrdOpMRI8U_M, DefEFLAGS, OpSize32;
@@ -180,10 +459,46 @@ multiclass Shlrd<bits<8> o1, bits<8> o2, string m, SDPatternOperator node, SDPat
   def 16mrCL : ShlrdOpMRC_M, DefEFLAGS, OpSize16;
   def 32mrCL : ShlrdOpMRC_M, DefEFLAGS, OpSize32;
   def 64mrCL : ShlrdOpMRC_M, DefEFLAGS;
+
+  let Predicates = [HasNDD, In64BitMode] in {
+    def 16mri8_ND : ShlrdOpMRI8U_R, DefEFLAGS, PD;
+    def 32mri8_ND : ShlrdOpMRI8U_R, DefEFLAGS;
+    def 64mri8_ND : ShlrdOpMRI8U_R, DefEFLAGS;
+
+    def 16mrCL_ND : ShlrdOpMRC_R, DefEFLAGS, PD;
+    def 32mrCL_ND : ShlrdOpMRC_R, DefEFLAGS;
+    def 64mrCL_ND : ShlrdOpMRC_R, DefEFLAGS;
+  }
+
+  let Predicates = [In64BitMode] in {
+    def 16mri8_NF : ShlrdOpMRI8U_M, NF, PD;
+    def 32mri8_NF : ShlrdOpMRI8U_M, NF;
+    def 64mri8_NF : ShlrdOpMRI8U_M, NF;
+
+    def 16mrCL_NF : ShlrdOpMRC_M, NF, PD;
+    def 32mrCL_NF : ShlrdOpMRC_M, NF;
+    def 64mrCL_NF : ShlrdOpMRC_M, NF;
+
+    def 16mri8_NF_ND : ShlrdOpMRI8U_R, EVEX_NF, PD;
+    def 32mri8_NF_ND : ShlrdOpMRI8U_R, EVEX_NF;
+    def 64mri8_NF_ND : ShlrdOpMRI8U_R, EVEX_NF;
+
+    def 16mrCL_NF_ND : ShlrdOpMRC_R, EVEX_NF, PD;
+    def 32mrCL_NF_ND : ShlrdOpMRC_R, EVEX_NF;
+    def 64mrCL_NF_ND : ShlrdOpMRC_R, EVEX_NF;
+
+    def 16mri8_EVEX : ShlrdOpMRI8U_M, DefEFLAGS, PL, PD;
+    def 32mri8_EVEX : ShlrdOpMRI8U_M, DefEFLAGS, PL;
+    def 64mri8_EVEX : ShlrdOpMRI8U_M, DefEFLAGS, PL;
+
+    def 16mrCL_EVEX : ShlrdOpMRC_M, DefEFLAGS, PL, PD;
+    def 32mrCL_EVEX : ShlrdOpMRC_M, DefEFLAGS, PL;
+    def 64mrCL_EVEX : ShlrdOpMRC_M, DefEFLAGS, PL;
+  }
 }

-defm SHLD : Shlrd<0xA4, 0xA5, "shld", fshl, X86fshl>;
-defm SHRD : Shlrd<0xAC, 0xAD, "shrd", fshr, X86fshr>;
+defm SHLD : Shlrd<0xA4, 0xA5, 0x24, "shld", fshl, X86fshl>;
+defm SHRD : Shlrd<0xAC, 0xAD, 0x2C, "shrd", fshr, X86fshr>;

 // Sandy Bridge and newer Intel processors support faster rotates using
 // SHLD to avoid a partial flag update on the normal rotate instructions.
diff --git a/llvm/lib/Target/X86/X86InstrUtils.td b/llvm/lib/Target/X86/X86InstrUtils.td
index 27aeff1cd3ae2..05ddcfbf2726d 100644
--- a/llvm/lib/Target/X86/X86InstrUtils.td
+++ b/llvm/lib/Target/X86/X86InstrUtils.td
@@ -100,17 +100,20 @@ defvar unaryop_ndd_args = "{$src1, $dst|$dst, $src1}";
 defvar binop_args = "{$src2, $src1|$src1, $src2}";
 defvar binop_ndd_args = "{$src2, $src1, $dst|$dst, $src1, $src2}";
 defvar binop_cl_args = "{%cl, $src1|$src1, cl}";
+defvar binop_cl_ndd_args = "{%cl, $src1, $dst|$dst, $src1, cl}";
 defvar triop_args = "{$src3, $src2, $src1|$src1, $src2, $src3}";
+defvar triop_ndd_args = "{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}";
 defvar triop_cl_args = "{%cl, $src2, $src1|$src1, $src2, cl}";
+defvar triop_cl_ndd_args = "{%cl, $src2, $src1, $dst|$dst, $src1, $src2, cl}";
 defvar tie_dst_src1 = "$src1 = $dst";

 // NDD - Helper for new data destination instructions
-class NDD<bit ndd, Map map = OB> {
+class NDD<bit ndd> {
   string Constraints = !if(!eq(ndd, 0), tie_dst_src1, "");
   Encoding OpEnc = !if(!eq(ndd, 0), EncNormal, EncEVEX);
   bit hasEVEX_B = ndd;
   bit hasVEX_4V = ndd;
-  Map OpMap = !if(!eq(ndd, 0), map, T_MAP4);
+  Map OpMap = !if(!eq(ndd, 0), OB, T_MAP4);
 }
 // NF - Helper for NF (no flags update) instructions
 class NF: T_MAP4, EVEX, EVEX_NF;
@@ -1067,9 +1070,10 @@ class BinOpRI_R<bits<8> o, string m, X86TypeInfo t, Format f, bit ndd = 0>
   : BinOpRI, NDD<ndd>;
 // BinOpRI8U_R - Instructions that read "reg, u8imm" and write "reg".
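 // (In the ndd form below, !if(!eq(ndd, 0), binop_args, binop_ndd_args)
 // switches the assembly string to carry a separate $dst operand.)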
-class BinOpRI8U_R
+class BinOpRI8U_R
   : ITy<0xC1, f, t, (outs t.RegClass:$dst), (ins t.RegClass:$src1, u8imm:$src2), m,
-        binop_args, [(set t.RegClass:$dst, (node t.RegClass:$src1, (i8 imm:$src2)))]>, NDD<0> {
+        !if(!eq(ndd, 0), binop_args, binop_ndd_args),
+        [(set t.RegClass:$dst, (node t.RegClass:$src1, (i8 imm:$src2)))]>, NDD<ndd> {
   let ImmT = Imm8;
 }
 // BinOpRI_RF - Instructions that read "reg, imm" and write "reg", EFLAGS.
@@ -1232,20 +1236,22 @@ class BinOpMI8
   let ImmT = Imm8;
   let mayLoad = 1;
 }
+// BinOpMI8U - Instructions that read "[mem], u8imm".
+class BinOpMI8U p>
+  : ITy<0xC1, f, t, out, (ins t.MemOperand:$src1, u8imm:$src2), m, args, p> {
+  let ImmT = Imm8;
+  let mayLoad = 1;
+}
 // BinOpMI8_F - Instructions that read "[mem], imm8" and write EFLAGS only.
 class BinOpMI8_F
   : BinOpMI8, Sched<[WriteALU.Folded]>, DefEFLAGS;
 // BinOpMI8_R - Instructions that read "[mem], imm8" and write "reg".
 class BinOpMI8_R
   : BinOpMI8, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, NDD<1>;
-// BinOpMI8U_M - Instructions that read "[mem], u8imm" and write "[mem]".
-class BinOpMI8U_M
-  : ITy<0xC1, f, t, (outs), (ins t.MemOperand:$src1, u8imm:$src2), m,
-        binop_args, [(store (node (t.LoadNode addr:$src1), (i8 imm:$src2)), addr:$src1)]> {
-  let ImmT = Imm8;
-  let mayLoad = 1;
-  let mayStore = 1;
-}
+// BinOpMI8U_R - Instructions that read "[mem], u8imm" and write "reg".
+class BinOpMI8U_R
+  : BinOpMI8U, NDD<1>;
 // BinOpMI8_RF - Instructions that read "[mem], imm8" and write "reg"/EFLAGS.
 class BinOpMI8_RF
   : BinOpMI8, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, DefEFLAGS, NDD<1>;
@@ -1254,6 +1260,12 @@ class BinOpMI8_M
   : BinOpMI8, Sched<[WriteALURMW]> {
   let mayStore = 1;
 }
+// BinOpMI8U_M - Instructions that read "[mem], u8imm" and write "[mem]".
+class BinOpMI8U_M
+  : BinOpMI8U {
+  let mayStore = 1;
+}
 // BinOpMI8_MF - Instructions that read "[mem], imm8" and write "[mem]", EFLAGS.
 class BinOpMI8_MF
   : BinOpMI8, Sched<[WriteALURMW]>, DefEFLAGS {
@@ -1296,19 +1308,27 @@ class BinOpAIF_AF<bits<8> o, string m, X86TypeInfo t, Register areg,
   let SchedRW = [WriteADC];
 }
 // BinOpRC_R - Instructions that read "reg, cl" and write reg.
-class BinOpRC_R
-  : ITy<0xD3, f, t, (outs t.RegClass:$dst), (ins t.RegClass:$src1), m, binop_cl_args,
-        [(set t.RegClass:$dst, (node t.RegClass:$src1, CL))]>, NDD<0> {
+class BinOpRC_R
+  : ITy<0xD3, f, t, (outs t.RegClass:$dst), (ins t.RegClass:$src1), m,
+        !if(!eq(ndd, 0), binop_cl_args, binop_cl_ndd_args),
+        [(set t.RegClass:$dst, (node t.RegClass:$src1, CL))]>, NDD<ndd> {
   let Uses = [CL];
 }
 // BinOpMC_M - Instructions that read "[mem], cl" and write [mem].
-class BinOpMC_M
+class BinOpMC_M
   : ITy<0xD3, f, t, (outs), (ins t.MemOperand:$src1), m, binop_cl_args,
         [(store (node (t.LoadNode addr:$src1), CL), addr:$src1)]> {
   let Uses = [CL];
   let mayLoad = 1;
   let mayStore = 1;
 }
+// BinOpMC_R - Instructions that read "[mem], cl" and write reg.
+class BinOpMC_R
+  : ITy<0xD3, f, t, (outs t.RegClass:$dst), (ins t.MemOperand:$src1), m, binop_cl_ndd_args,
+        [(set t.RegClass:$dst, (node (t.LoadNode addr:$src1), CL))]>, NDD<1> {
+  let Uses = [CL];
+  let mayLoad = 1;
+}
 // UnaryOpR - Instructions that read "reg".
 class UnaryOpR<bits<8> o, Format f, string m, string args, X86TypeInfo t,
@@ -1316,13 +1336,13 @@ class UnaryOpR<bits<8> o, Format f, string m, string args, X86TypeInfo t,
   : ITy, Sched<[WriteALU]>;
 // UnaryOpR_R - Instructions that read "reg" and write "reg".
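 // (With nonzero ndd, NDD<ndd> unties $dst from $src1 and selects the EVEX
 // encoding, so the result can go to a different destination register.)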
 class UnaryOpR_R<bits<8> o, Format f, string m, X86TypeInfo t,
-                 SDPatternOperator node, bit ndd = 0>
+                 SDPatternOperator node = null_frag, bit ndd = 0>
   : UnaryOpR, NDD<ndd>;
 // UnaryOpR_RF - Instructions that read "reg" and write "reg"/EFLAGS.
 class UnaryOpR_RF<bits<8> o, Format f, string m, X86TypeInfo t,
-                  SDPatternOperator node, bit ndd = 0>
+                  SDPatternOperator node = null_frag, bit ndd = 0>
   : UnaryOpR
@@ class UnaryOpM<bits<8> o, Format f, string m, string args, X86TypeInfo t,
 }
 // UnaryOpM_R - Instructions that read "[mem]" and writes "reg".
 class UnaryOpM_R<bits<8> o, Format f, string m, X86TypeInfo t,
-                 SDPatternOperator node>
+                 SDPatternOperator node = null_frag>
   : UnaryOpM, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, NDD<1>;
 // UnaryOpM_RF - Instructions that read "[mem]" and writes "reg"/EFLAGS.
 class UnaryOpM_RF<bits<8> o, Format f, string m, X86TypeInfo t,
-                  SDPatternOperator node>
+                  SDPatternOperator node = null_frag>
   : UnaryOpM, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>, DefEFLAGS, NDD<1>;
 // UnaryOpM_M - Instructions that read "[mem]" and writes "[mem]".
 class UnaryOpM_M<bits<8> o, Format f, string m, X86TypeInfo t,
-                 SDPatternOperator node>
+                 SDPatternOperator node = null_frag>
   : UnaryOpM, Sched<[WriteALURMW]>{
@@ -1356,7 +1376,7 @@ class UnaryOpM_M<bits<8> o, Format f, string m, X86TypeInfo t,
 }
 // UnaryOpM_MF - Instructions that read "[mem]" and writes "[mem]"/EFLAGS.
 class UnaryOpM_MF<bits<8> o, Format f, string m, X86TypeInfo t,
-                  SDPatternOperator node>
+                  SDPatternOperator node = null_frag>
   : UnaryOpM, Sched<[WriteALURMW]>, DefEFLAGS {
diff --git a/llvm/test/CodeGen/X86/apx/rol.ll b/llvm/test/CodeGen/X86/apx/rol.ll
new file mode 100644
index 0000000000000..ebc08ef7f325b
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/rol.ll
@@ -0,0 +1,582 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s
+
+define i8 @rol8m1(ptr %ptr) {
+; CHECK-LABEL: rol8m1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rolb $1, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x07,0x01]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %a = load i8, ptr %ptr
+  %0 = shl i8 %a, 1
+  %1 = lshr i8 %a, 7
+  %rol = or i8 %0, %1
+  ret i8 %rol
+}
+
+define i8 @rol8m1_intrinsic(ptr %ptr) {
+; CHECK-LABEL: rol8m1_intrinsic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rolb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x07]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %a = load i8, ptr %ptr
+  %f = call i8 @llvm.fshr.i8(i8 %a, i8 %a, i8 7)
+  ret i8 %f
+}
+
+define i16 @rol16m1(ptr %ptr) {
+; CHECK-LABEL: rol16m1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    rolw $1, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x07,0x01]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %a = load i16, ptr %ptr
+  %0 = shl i16 %a, 1
+  %1 = lshr i16 %a, 15
+  %rol = or i16 %0, %1
+  ret i16 %rol
+}
+
+define i16 @rol16m1_intrinsic(ptr %ptr) {
+; CHECK-LABEL: rol16m1_intrinsic:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    rolw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x07]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+  %a = load i16, ptr %ptr
+  %f = call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 15)
+  ret i16 %f
+}
+
+define i32 @rol32m1(ptr %ptr) {
+; CHECK-LABEL: rol32m1:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    roll $1, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x07,0x01]
+; CHECK-NEXT:    retq # encoding: [0xc3]
+entry:
+  %a = load i32, ptr %ptr
+  %0 = shl i32 %a, 1
+  %1 = lshr i32 %a, 31
+  %rol = or i32 %0, %1
+  ret i32 %rol
+}
+
+define i32 @rol32m1_intrinsic(ptr %ptr) {
+;
CHECK-LABEL: rol32m1_intrinsic: +; CHECK: # %bb.0: +; CHECK-NEXT: roll (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] + %a = load i32, ptr %ptr + %f = call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 31) + ret i32 %f +} + +define i64 @rol64m1(ptr %ptr) { +; CHECK-LABEL: rol64m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolq $1, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x07,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = shl i64 %a, 1 + %1 = lshr i64 %a, 63 + %rol = or i64 %0, %1 + ret i64 %rol +} + +define i64 @rol64m1_intrinsic(ptr %ptr) { +; CHECK-LABEL: rol64m1_intrinsic: +; CHECK: # %bb.0: +; CHECK-NEXT: rolq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] + %a = load i64, ptr %ptr + %f = call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 63) + ret i64 %f +} + +define i8 @rol8mcl(ptr %ptr, i8 %cl) { +; CHECK-LABEL: rol8mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rolb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %0 = shl i8 %a, %cl + %1 = sub i8 8, %cl + %2 = lshr i8 %a, %1 + %rol = or i8 %0, %2 + ret i8 %rol +} + +define i16 @rol16mcl(ptr %ptr, i16 %cl) { +; CHECK-LABEL: rol16mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rolw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %0 = shl i16 %a, %cl + %1 = sub i16 16, %cl + %2 = lshr i16 %a, %1 + %rol = or i16 %0, %2 + ret i16 %rol +} + +define i32 @rol32mcl(ptr %ptr, i32 %cl) { +; CHECK-LABEL: rol32mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: roll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %0 = shl i32 %a, %cl + %1 = sub i32 32, %cl + %2 = lshr i32 %a, %1 + %rol = or i32 %0, %2 + ret i32 %rol +} + +define i64 @rol64mcl(ptr %ptr, i64 %cl) { +; CHECK-LABEL: rol64mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: rolq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = shl i64 %a, %cl + %1 = sub i64 64, %cl + %2 = lshr i64 %a, %1 + %rol = or i64 %0, %2 + ret i64 %rol +} + +define i8 @rol8mi(ptr %ptr) { +; CHECK-LABEL: rol8mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolb $3, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x07,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %0 = shl i8 %a, 3 + %1 = lshr i8 %a, 5 + %rol = or i8 %0, %1 + ret i8 %rol +} + +define i16 @rol16mi(ptr %ptr) { +; CHECK-LABEL: rol16mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolw $3, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x07,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %0 = shl i16 %a, 3 + %1 = lshr i16 %a, 13 + %rol = or i16 %0, %1 + ret i16 %rol +} + +define i32 @rol32mi(ptr %ptr) { +; CHECK-LABEL: rol32mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: roll 
$3, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x07,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %0 = shl i32 %a, 3 + %1 = lshr i32 %a, 29 + %rol = or i32 %0, %1 + ret i32 %rol +} + +define i64 @rol64mi(ptr %ptr) { +; CHECK-LABEL: rol64mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolq $3, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x07,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = shl i64 %a, 3 + %1 = lshr i64 %a, 61 + %rol = or i64 %0, %1 + ret i64 %rol +} + +define i8 @rol8r1(i8 noundef %a) { +; CHECK-LABEL: rol8r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolb $1, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xc7,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i8 %a, 1 + %1 = lshr i8 %a, 7 + %rol = or i8 %0, %1 + ret i8 %rol +} + +define i8 @rol8r1_intrinsic(i8 noundef %a) { +; CHECK-LABEL: rol8r1_intrinsic: +; CHECK: # %bb.0: +; CHECK-NEXT: rolb %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0xc7] +; CHECK-NEXT: retq # encoding: [0xc3] + %f = call i8 @llvm.fshr.i8(i8 %a, i8 %a, i8 7) + ret i8 %f +} + +define i16 @rol16r1(i16 noundef %a) { +; CHECK-LABEL: rol16r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolw $1, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0xc7,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i16 %a, 1 + %1 = lshr i16 %a, 15 + %rol = or i16 %0, %1 + ret i16 %rol +} + +define i16 @rol16r1_intrinsic(i16 noundef %a) { +; CHECK-LABEL: rol16r1_intrinsic: +; CHECK: # %bb.0: +; CHECK-NEXT: rolw %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0xc7] +; CHECK-NEXT: retq # encoding: [0xc3] + %f = call i16 @llvm.fshr.i16(i16 %a, i16 %a, i16 15) + ret i16 %f +} + +define i32 @rol32r1(i32 noundef %a) { +; CHECK-LABEL: rol32r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: roll $1, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xc7,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i32 %a, 1 + %1 = lshr i32 %a, 31 + %rol = or i32 %0, %1 + ret i32 %rol +} + +define i32 @rol32r1_intrinsic(i32 noundef %a) { +; CHECK-LABEL: rol32r1_intrinsic: +; CHECK: # %bb.0: +; CHECK-NEXT: roll %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0xc7] +; CHECK-NEXT: retq # encoding: [0xc3] + %f = call i32 @llvm.fshr.i32(i32 %a, i32 %a, i32 31) + ret i32 %f +} + +define i64 @rol64r1(i64 noundef %a) { +; CHECK-LABEL: rol64r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolq $1, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xc7,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i64 %a, 1 + %1 = lshr i64 %a, 63 + %rol = or i64 %0, %1 + ret i64 %rol +} + +define i64 @rol64r1_intrinsic(i64 noundef %a) { +; CHECK-LABEL: rol64r1_intrinsic: +; CHECK: # %bb.0: +; CHECK-NEXT: rolq %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0xc7] +; CHECK-NEXT: retq # encoding: [0xc3] + %f = call i64 @llvm.fshr.i64(i64 %a, i64 %a, i64 63) + ret i64 %f +} + +define i8 @rol8rcl(i8 noundef %a, i8 %cl) { +; CHECK-LABEL: rol8rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rolb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xc7] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i8 %a, %cl + %1 = sub i8 8, %cl + %2 = lshr i8 %a, %1 + %rol = or i8 %0, %2 + ret i8 %rol +} + +define i16 @rol16rcl(i16 noundef %a, i16 %cl) { +; CHECK-LABEL: rol16rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: 
def $cl killed $cl killed $ecx +; CHECK-NEXT: rolw %cl, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0xc7] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i16 %a, %cl + %1 = sub i16 16, %cl + %2 = lshr i16 %a, %1 + %rol = or i16 %0, %2 + ret i16 %rol +} + +define i32 @rol32rcl(i32 noundef %a, i32 %cl) { +; CHECK-LABEL: rol32rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: roll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xc7] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i32 %a, %cl + %1 = sub i32 32, %cl + %2 = lshr i32 %a, %1 + %rol = or i32 %0, %2 + ret i32 %rol +} + +define i64 @rol64rcl(i64 noundef %a, i64 %cl) { +; CHECK-LABEL: rol64rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: rolq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xc7] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i64 %a, %cl + %1 = sub i64 64, %cl + %2 = lshr i64 %a, %1 + %rol = or i64 %0, %2 + ret i64 %rol +} + +define i8 @rol8ri(i8 noundef %a) { +; CHECK-LABEL: rol8ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolb $3, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xc7,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i8 %a, 3 + %1 = lshr i8 %a, 5 + %rol = or i8 %0, %1 + ret i8 %rol +} + +define i16 @rol16ri(i16 noundef %a) { +; CHECK-LABEL: rol16ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolw $3, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0xc7,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i16 %a, 3 + %1 = lshr i16 %a, 13 + %rol = or i16 %0, %1 + ret i16 %rol +} + +define i32 @rol32ri(i32 noundef %a) { +; CHECK-LABEL: rol32ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: roll $3, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xc7,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i32 %a, 3 + %1 = lshr i32 %a, 29 + %rol = or i32 %0, %1 + ret i32 %rol +} + +define i64 @rol64ri(i64 noundef %a) { +; CHECK-LABEL: rol64ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolq $3, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xc7,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = shl i64 %a, 3 + %1 = lshr i64 %a, 61 + %rol = or i64 %0, %1 + ret i64 %rol +} + +define void @rol8m1_legacy(ptr %ptr) { +; CHECK-LABEL: rol8m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolb (%rdi) # encoding: [0xd0,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %0 = shl i8 %a, 1 + %1 = lshr i8 %a, 7 + %rol = or i8 %0, %1 + store i8 %rol, ptr %ptr + ret void +} + +define void @rol16m1_legacy(ptr %ptr) { +; CHECK-LABEL: rol16m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolw (%rdi) # encoding: [0x66,0xd1,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %0 = shl i16 %a, 1 + %1 = lshr i16 %a, 15 + %rol = or i16 %0, %1 + store i16 %rol, ptr %ptr + ret void +} + +define void @rol32m1_legacy(ptr %ptr) { +; CHECK-LABEL: rol32m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: roll (%rdi) # encoding: [0xd1,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %0 = shl i32 %a, 1 + %1 = lshr i32 %a, 31 + %rol = or i32 %0, %1 + store i32 %rol, ptr %ptr + ret void +} + +define void @rol64m1_legacy(ptr %ptr) { +; CHECK-LABEL: rol64m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolq (%rdi) # encoding: 
[0x48,0xd1,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = shl i64 %a, 1 + %1 = lshr i64 %a, 63 + %rol = or i64 %0, %1 + store i64 %rol, ptr %ptr + ret void +} + +define void @rol8mcl_legacy(ptr %ptr, i8 %cl) { +; CHECK-LABEL: rol8mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rolb %cl, (%rdi) # encoding: [0xd2,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %0 = shl i8 %a, %cl + %1 = sub i8 8, %cl + %2 = lshr i8 %a, %1 + %rol = or i8 %0, %2 + store i8 %rol, ptr %ptr + ret void +} + +define void @rol16mcl_legacy(ptr %ptr, i16 %cl) { +; CHECK-LABEL: rol16mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rolw %cl, (%rdi) # encoding: [0x66,0xd3,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %0 = shl i16 %a, %cl + %1 = sub i16 16, %cl + %2 = lshr i16 %a, %1 + %rol = or i16 %0, %2 + store i16 %rol, ptr %ptr + ret void +} + +define void @rol32mcl_legacy(ptr %ptr, i32 %cl) { +; CHECK-LABEL: rol32mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: roll %cl, (%rdi) # encoding: [0xd3,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %0 = shl i32 %a, %cl + %1 = sub i32 32, %cl + %2 = lshr i32 %a, %1 + %rol = or i32 %0, %2 + store i32 %rol, ptr %ptr + ret void +} + +define void @rol64mcl_legacy(ptr %ptr, i64 %cl) { +; CHECK-LABEL: rol64mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: rolq %cl, (%rdi) # encoding: [0x48,0xd3,0x07] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = shl i64 %a, %cl + %1 = sub i64 64, %cl + %2 = lshr i64 %a, %1 + %rol = or i64 %0, %2 + store i64 %rol, ptr %ptr + ret void +} + +define void @rol8mi_legacy(ptr %ptr) { +; CHECK-LABEL: rol8mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolb $3, (%rdi) # encoding: [0xc0,0x07,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %0 = shl i8 %a, 3 + %1 = lshr i8 %a, 5 + %rol = or i8 %0, %1 + store i8 %rol, ptr %ptr + ret void +} + +define void @rol16mi_legacy(ptr %ptr) { +; CHECK-LABEL: rol16mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolw $3, (%rdi) # encoding: [0x66,0xc1,0x07,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %0 = shl i16 %a, 3 + %1 = lshr i16 %a, 13 + %rol = or i16 %0, %1 + store i16 %rol, ptr %ptr + ret void +} + +define void @rol32mi_legacy(ptr %ptr) { +; CHECK-LABEL: rol32mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: roll $3, (%rdi) # encoding: [0xc1,0x07,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %0 = shl i32 %a, 3 + %1 = lshr i32 %a, 29 + %rol = or i32 %0, %1 + store i32 %rol, ptr %ptr + ret void +} + +define void @rol64mi_legacy(ptr %ptr) { +; CHECK-LABEL: rol64mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolq $3, (%rdi) # encoding: [0x48,0xc1,0x07,0x03] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = shl i64 %a, 3 + %1 = lshr i64 %a, 61 + %rol = or i64 %0, %1 + store i64 %rol, ptr %ptr + ret void +} diff --git 
a/llvm/test/CodeGen/X86/apx/ror.ll b/llvm/test/CodeGen/X86/apx/ror.ll new file mode 100644 index 0000000000000..e2b65e776ed57 --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/ror.ll @@ -0,0 +1,638 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s + +define i8 @ror8m1(ptr %ptr) { +; CHECK-LABEL: ror8m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rorb (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %0 = lshr i8 %a, 1 + %1 = shl i8 %a, 7 + %ror = or i8 %0, %1 + ret i8 %ror +} + +define i16 @ror16m1(ptr %ptr) { +; CHECK-LABEL: ror16m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rorw (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %0 = lshr i16 %a, 1 + %1 = shl i16 %a, 15 + %ror = or i16 %0, %1 + ret i16 %ror +} + +define i32 @ror32m1(ptr %ptr) { +; CHECK-LABEL: ror32m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rorl (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %0 = lshr i32 %a, 1 + %1 = shl i32 %a, 31 + %ror = or i32 %0, %1 + ret i32 %ror +} + +define i64 @ror64m1(ptr %ptr) { +; CHECK-LABEL: ror64m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rorq (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = lshr i64 %a, 1 + %1 = shl i64 %a, 63 + %ror = or i64 %0, %1 + ret i64 %ror +} + +define i8 @ror8mcl(ptr %ptr, i8 %cl) { +; CHECK-LABEL: ror8mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %0 = lshr i8 %a, %cl + %1 = sub i8 8, %cl + %2 = shl i8 %a, %1 + %ror = or i8 %0, %2 + ret i8 %ror +} + +define i8 @ror8mcl_mask(ptr %ptr, i8 %cl) { +; CHECK-LABEL: ror8mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i8 %cl, 31 + %a = load i8, ptr %ptr + %0 = lshr i8 %a, %shamt + %1 = sub i8 8, %shamt + %2 = shl i8 %a, %1 + %ror = or i8 %0, %2 + ret i8 %ror +} + +define i16 @ror16mcl(ptr %ptr, i16 %cl) { +; CHECK-LABEL: ror16mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %0 = lshr i16 %a, %cl + %1 = sub i16 16, %cl + %2 = shl i16 %a, %1 + %ror = or i16 %0, %2 + ret i16 %ror +} + +define i16 @ror16mcl_mask(ptr %ptr, i16 %cl) { +; CHECK-LABEL: ror16mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorw %cl, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i16 %cl, 31 + %a = load i16, ptr %ptr + %0 = lshr i16 %a, %shamt + %1 = sub i16 16, %shamt 
+ %2 = shl i16 %a, %1 + %ror = or i16 %0, %2 + ret i16 %ror +} + +define i32 @ror32mcl(ptr %ptr, i32 %cl) { +; CHECK-LABEL: ror32mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %0 = lshr i32 %a, %cl + %1 = sub i32 32, %cl + %2 = shl i32 %a, %1 + %ror = or i32 %0, %2 + ret i32 %ror +} + +define i32 @ror32mcl_mask(ptr %ptr, i32 %cl) { +; CHECK-LABEL: ror32mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i32 %cl, 31 + %a = load i32, ptr %ptr + %0 = lshr i32 %a, %shamt + %1 = sub i32 32, %shamt + %2 = shl i32 %a, %1 + %ror = or i32 %0, %2 + ret i32 %ror +} + +define i64 @ror64mcl(ptr %ptr, i64 %cl) { +; CHECK-LABEL: ror64mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: rorq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = lshr i64 %a, %cl + %1 = sub i64 64, %cl + %2 = shl i64 %a, %1 + %ror = or i64 %0, %2 + ret i64 %ror +} + +define i64 @ror64mcl_mask(ptr %ptr, i64 %cl) { +; CHECK-LABEL: ror64mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: rorq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i64 %cl, 63 + %a = load i64, ptr %ptr + %0 = lshr i64 %a, %shamt + %1 = sub i64 64, %shamt + %2 = shl i64 %a, %1 + %ror = or i64 %0, %2 + ret i64 %ror +} + +define i8 @ror8mi(ptr %ptr) { +; CHECK-LABEL: ror8mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolb $5, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x07,0x05] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %0 = lshr i8 %a, 3 + %1 = shl i8 %a, 5 + %ror = or i8 %0, %1 + ret i8 %ror +} + +define i16 @ror16mi(ptr %ptr) { +; CHECK-LABEL: ror16mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolw $13, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0x07,0x0d] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %0 = lshr i16 %a, 3 + %1 = shl i16 %a, 13 + %ror = or i16 %0, %1 + ret i16 %ror +} + +define i32 @ror32mi(ptr %ptr) { +; CHECK-LABEL: ror32mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: roll $29, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x07,0x1d] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %0 = lshr i32 %a, 3 + %1 = shl i32 %a, 29 + %ror = or i32 %0, %1 + ret i32 %ror +} + +define i64 @ror64mi(ptr %ptr) { +; CHECK-LABEL: ror64mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolq $61, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x07,0x3d] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = lshr i64 %a, 3 + %1 = shl i64 %a, 61 + %ror = or i64 %0, %1 + ret i64 %ror +} + +define i8 @ror8r1(i8 noundef %a) { +; CHECK-LABEL: ror8r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rorb %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd0,0xcf] +; CHECK-NEXT: retq 
# encoding: [0xc3] +entry: + %0 = lshr i8 %a, 1 + %1 = shl i8 %a, 7 + %ror = or i8 %0, %1 + ret i8 %ror +} + +define i16 @ror16r1(i16 noundef %a) { +; CHECK-LABEL: ror16r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rorw %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd1,0xcf] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = lshr i16 %a, 1 + %1 = shl i16 %a, 15 + %ror = or i16 %0, %1 + ret i16 %ror +} + +define i32 @ror32r1(i32 noundef %a) { +; CHECK-LABEL: ror32r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rorl %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd1,0xcf] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = lshr i32 %a, 1 + %1 = shl i32 %a, 31 + %ror = or i32 %0, %1 + ret i32 %ror +} + +define i64 @ror64r1(i64 noundef %a) { +; CHECK-LABEL: ror64r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rorq %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd1,0xcf] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = lshr i64 %a, 1 + %1 = shl i64 %a, 63 + %ror = or i64 %0, %1 + ret i64 %ror +} + +define i8 @ror8rcl(i8 noundef %a, i8 %cl) { +; CHECK-LABEL: ror8rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xcf] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = lshr i8 %a, %cl + %1 = sub i8 8, %cl + %2 = shl i8 %a, %1 + %ror = or i8 %0, %2 + ret i8 %ror +} + +define i8 @ror8rcl_mask(i8 noundef %a, i8 %cl) { +; CHECK-LABEL: ror8rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xcf] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i8 %cl, 31 + %0 = lshr i8 %a, %shamt + %1 = sub i8 8, %shamt + %2 = shl i8 %a, %1 + %ror = or i8 %0, %2 + ret i8 %ror +} + +define i16 @ror16rcl(i16 noundef %a, i16 %cl) { +; CHECK-LABEL: ror16rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorw %cl, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0xcf] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = lshr i16 %a, %cl + %1 = sub i16 16, %cl + %2 = shl i16 %a, %1 + %ror = or i16 %0, %2 + ret i16 %ror +} + +define i16 @ror16rcl_mask(i16 noundef %a, i16 %cl) { +; CHECK-LABEL: ror16rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorw %cl, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xd3,0xcf] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i16 %cl, 31 + %0 = lshr i16 %a, %shamt + %1 = sub i16 16, %shamt + %2 = shl i16 %a, %1 + %ror = or i16 %0, %2 + ret i16 %ror +} + +define i32 @ror32rcl(i32 noundef %a, i32 %cl) { +; CHECK-LABEL: ror32rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xcf] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = lshr i32 %a, %cl + %1 = sub i32 32, %cl + %2 = shl i32 %a, %1 + %ror = or i32 %0, %2 + ret i32 %ror +} + +define i32 @ror32rcl_mask(i32 noundef %a, i32 %cl) { +; CHECK-LABEL: ror32rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; 
CHECK-NEXT: rorl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xcf] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i32 %cl, 31 + %0 = lshr i32 %a, %shamt + %1 = sub i32 32, %shamt + %2 = shl i32 %a, %1 + %ror = or i32 %0, %2 + ret i32 %ror +} + +define i64 @ror64rcl(i64 noundef %a, i64 %cl) { +; CHECK-LABEL: ror64rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: rorq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xcf] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = lshr i64 %a, %cl + %1 = sub i64 64, %cl + %2 = shl i64 %a, %1 + %ror = or i64 %0, %2 + ret i64 %ror +} + +define i64 @ror64rcl_mask(i64 noundef %a, i64 %cl) { +; CHECK-LABEL: ror64rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: rorq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xcf] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i64 %cl, 63 + %0 = lshr i64 %a, %shamt + %1 = sub i64 64, %shamt + %2 = shl i64 %a, %1 + %ror = or i64 %0, %2 + ret i64 %ror +} + +define i8 @ror8ri(i8 noundef %a) { +; CHECK-LABEL: ror8ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolb $5, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xc7,0x05] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = lshr i8 %a, 3 + %1 = shl i8 %a, 5 + %ror = or i8 %0, %1 + ret i8 %ror +} + +define i16 @ror16ri(i16 noundef %a) { +; CHECK-LABEL: ror16ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolw $13, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0xc1,0xc7,0x0d] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = lshr i16 %a, 3 + %1 = shl i16 %a, 13 + %ror = or i16 %0, %1 + ret i16 %ror +} + +define i32 @ror32ri(i32 noundef %a) { +; CHECK-LABEL: ror32ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: roll $29, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xc7,0x1d] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = lshr i32 %a, 3 + %1 = shl i32 %a, 29 + %ror = or i32 %0, %1 + ret i32 %ror +} + +define i64 @ror64ri(i64 noundef %a) { +; CHECK-LABEL: ror64ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolq $61, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xc7,0x3d] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %0 = lshr i64 %a, 3 + %1 = shl i64 %a, 61 + %ror = or i64 %0, %1 + ret i64 %ror +} + +define void @ror8m1_legacy(ptr %ptr) { +; CHECK-LABEL: ror8m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rorb (%rdi) # encoding: [0xd0,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %0 = lshr i8 %a, 1 + %1 = shl i8 %a, 7 + %ror = or i8 %0, %1 + store i8 %ror, ptr %ptr + ret void +} + +define void @ror16m1_legacy(ptr %ptr) { +; CHECK-LABEL: ror16m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rorw (%rdi) # encoding: [0x66,0xd1,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %0 = lshr i16 %a, 1 + %1 = shl i16 %a, 15 + %ror = or i16 %0, %1 + store i16 %ror, ptr %ptr + ret void +} + +define void @ror32m1_legacy(ptr %ptr) { +; CHECK-LABEL: ror32m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rorl (%rdi) # encoding: [0xd1,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %0 = lshr i32 %a, 1 + %1 = shl i32 %a, 31 + %ror = or i32 %0, %1 + store i32 %ror, ptr %ptr + ret void +} + +define void @ror64m1_legacy(ptr %ptr) { +; CHECK-LABEL: ror64m1_legacy: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: rorq (%rdi) # encoding: [0x48,0xd1,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = lshr i64 %a, 1 + %1 = shl i64 %a, 63 + %ror = or i64 %0, %1 + store i64 %ror, ptr %ptr + ret void +} + +define void @ror8mcl_legacy(ptr %ptr, i8 %cl) { +; CHECK-LABEL: ror8mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorb %cl, (%rdi) # encoding: [0xd2,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %0 = lshr i8 %a, %cl + %1 = sub i8 8, %cl + %2 = shl i8 %a, %1 + %ror = or i8 %0, %2 + store i8 %ror, ptr %ptr + ret void +} + +define void @ror16mcl_legacy(ptr %ptr, i16 %cl) { +; CHECK-LABEL: ror16mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorw %cl, (%rdi) # encoding: [0x66,0xd3,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %0 = lshr i16 %a, %cl + %1 = sub i16 16, %cl + %2 = shl i16 %a, %1 + %ror = or i16 %0, %2 + store i16 %ror, ptr %ptr + ret void +} + +define void @ror32mcl_legacy(ptr %ptr, i32 %cl) { +; CHECK-LABEL: ror32mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: rorl %cl, (%rdi) # encoding: [0xd3,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %0 = lshr i32 %a, %cl + %1 = sub i32 32, %cl + %2 = shl i32 %a, %1 + %ror = or i32 %0, %2 + store i32 %ror, ptr %ptr + ret void +} + +define void @ror64mcl_legacy(ptr %ptr, i64 %cl) { +; CHECK-LABEL: ror64mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: rorq %cl, (%rdi) # encoding: [0x48,0xd3,0x0f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = lshr i64 %a, %cl + %1 = sub i64 64, %cl + %2 = shl i64 %a, %1 + %ror = or i64 %0, %2 + store i64 %ror, ptr %ptr + ret void +} + +define void @ror8mi_legacy(ptr %ptr) { +; CHECK-LABEL: ror8mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolb $5, (%rdi) # encoding: [0xc0,0x07,0x05] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %0 = lshr i8 %a, 3 + %1 = shl i8 %a, 5 + %ror = or i8 %0, %1 + store i8 %ror, ptr %ptr + ret void +} + +define void @ror16mi_legacy(ptr %ptr) { +; CHECK-LABEL: ror16mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolw $13, (%rdi) # encoding: [0x66,0xc1,0x07,0x0d] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %0 = lshr i16 %a, 3 + %1 = shl i16 %a, 13 + %ror = or i16 %0, %1 + store i16 %ror, ptr %ptr + ret void +} + +define void @ror32mi_legacy(ptr %ptr) { +; CHECK-LABEL: ror32mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: roll $29, (%rdi) # encoding: [0xc1,0x07,0x1d] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %0 = lshr i32 %a, 3 + %1 = shl i32 %a, 29 + %ror = or i32 %0, %1 + store i32 %ror, ptr %ptr + ret void +} + +define void @ror64mi_legacy(ptr %ptr) { +; CHECK-LABEL: ror64mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: rolq $61, (%rdi) # encoding: [0x48,0xc1,0x07,0x3d] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %0 = lshr i64 %a, 3 + %1 = shl i64 %a, 61 + %ror = or i64 %0, %1 + store i64 %ror, ptr 
%ptr + ret void +} diff --git a/llvm/test/CodeGen/X86/apx/sar.ll b/llvm/test/CodeGen/X86/apx/sar.ll new file mode 100644 index 0000000000000..091a0eec321cf --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/sar.ll @@ -0,0 +1,546 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s + +define i8 @sar8m1(ptr %ptr) { +; CHECK-LABEL: sar8m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarb $1, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x3f,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %sar = ashr i8 %a, 1 + ret i8 %sar +} + +define i16 @sar16m1(ptr %ptr) { +; CHECK-LABEL: sar16m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movswl (%rdi), %eax # encoding: [0x0f,0xbf,0x07] +; CHECK-NEXT: shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %sar = ashr i16 %a, 1 + ret i16 %sar +} + +define i32 @sar32m1(ptr %ptr) { +; CHECK-LABEL: sar32m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarl $1, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x3f,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %sar = ashr i32 %a, 1 + ret i32 %sar +} + +define i64 @sar64m1(ptr %ptr) { +; CHECK-LABEL: sar64m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarq $1, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x3f,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %sar = ashr i64 %a, 1 + ret i64 %sar +} + +define i8 @sar8mcl(ptr %ptr, i8 %cl) { +; CHECK-LABEL: sar8mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %sar = ashr i8 %a, %cl + ret i8 %sar +} + +define i8 @sar8mcl_mask(ptr %ptr, i8 %cl) { +; CHECK-LABEL: sar8mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shamt = and i8 %cl, 31 + %sar = ashr i8 %a, %shamt + ret i8 %sar +} + +define i16 @sar16mcl(ptr %ptr, i16 %cl) { +; CHECK-LABEL: sar16mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: movswl (%rdi), %eax # encoding: [0x0f,0xbf,0x07] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xf8] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %sar = ashr i16 %a, %cl + ret i16 %sar +} + +define i16 @sar16mcl_mask(ptr %ptr, i16 %cl) { +; CHECK-LABEL: sar16mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: movswl (%rdi), %eax # encoding: [0x0f,0xbf,0x07] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xf8] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shamt = and i16 %cl, 31 + 
%sar = ashr i16 %a, %shamt + ret i16 %sar +} + +define i32 @sar32mcl(ptr %ptr, i32 %cl) { +; CHECK-LABEL: sar32mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %sar = ashr i32 %a, %cl + ret i32 %sar +} + +define i32 @sar32mcl_mask(ptr %ptr, i32 %cl) { +; CHECK-LABEL: sar32mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shamt = and i32 %cl, 31 + %sar = ashr i32 %a, %shamt + ret i32 %sar +} + +define i64 @sar64mcl(ptr %ptr, i64 %cl) { +; CHECK-LABEL: sar64mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: sarq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %sar = ashr i64 %a, %cl + ret i64 %sar +} + +define i64 @sar64mcl_mask(ptr %ptr, i64 %cl) { +; CHECK-LABEL: sar64mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: sarq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shamt = and i64 %cl, 63 + %sar = ashr i64 %a, %shamt + ret i64 %sar +} + +define i8 @sar8mi(ptr %ptr) { +; CHECK-LABEL: sar8mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x3f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %sar = ashr i8 %a, 4 + ret i8 %sar +} + +define i16 @sar16mi(ptr %ptr) { +; CHECK-LABEL: sar16mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movswl (%rdi), %eax # encoding: [0x0f,0xbf,0x07] +; CHECK-NEXT: shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %sar = ashr i16 %a, 4 + ret i16 %sar +} + +define i32 @sar32mi(ptr %ptr) { +; CHECK-LABEL: sar32mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarl $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x3f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %sar = ashr i32 %a, 4 + ret i32 %sar +} + +define i64 @sar64mi(ptr %ptr) { +; CHECK-LABEL: sar64mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x3f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %sar = ashr i64 %a, 4 + ret i64 %sar +} + +define i8 @sar8r1(i8 noundef %a) { +; CHECK-LABEL: sar8r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarb $1, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xff,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i8 %a, 1 + ret i8 %sar +} + +define i16 @sar16r1(i16 noundef %a) { +; CHECK-LABEL: sar16r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movswl %di, %eax # encoding: [0x0f,0xbf,0xc7] +; CHECK-NEXT: shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8] +; CHECK-NEXT: # kill: def $ax 
killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i16 %a, 1 + ret i16 %sar +} + +define i32 @sar32r1(i32 noundef %a) { +; CHECK-LABEL: sar32r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarl $1, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xff,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i32 %a, 1 + ret i32 %sar +} + +define i64 @sar64r1(i64 noundef %a) { +; CHECK-LABEL: sar64r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarq $1, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xff,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i64 %a, 1 + ret i64 %sar +} + +define i8 @sar8rcl(i8 noundef %a, i8 %cl) { +; CHECK-LABEL: sar8rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xff] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i8 %a, %cl + ret i8 %sar +} + +define i8 @sar8rcl_mask(i8 noundef %a, i8 %cl) { +; CHECK-LABEL: sar8rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xff] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i8 %cl, 31 + %sar = ashr i8 %a, %shamt + ret i8 %sar +} + +define i16 @sar16rcl(i16 noundef %a, i16 %cl) { +; CHECK-LABEL: sar16rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: movswl %di, %eax # encoding: [0x0f,0xbf,0xc7] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xf8] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i16 %a, %cl + ret i16 %sar +} + +define i16 @sar16rcl_mask(i16 noundef %a, i16 %cl) { +; CHECK-LABEL: sar16rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: movswl %di, %eax # encoding: [0x0f,0xbf,0xc7] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xf8] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i16 %cl, 31 + %sar = ashr i16 %a, %shamt + ret i16 %sar +} + +define i32 @sar32rcl(i32 noundef %a, i32 %cl) { +; CHECK-LABEL: sar32rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xff] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i32 %a, %cl + ret i32 %sar +} + +define i32 @sar32rcl_mask(i32 noundef %a, i32 %cl) { +; CHECK-LABEL: sar32rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xff] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i32 %cl, 31 + %sar = ashr i32 %a, %shamt + ret i32 %sar +} + +define i64 @sar64rcl(i64 noundef %a, i64 %cl) { +; CHECK-LABEL: sar64rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: sarq %cl, %rdi, %rax # 
encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xff] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i64 %a, %cl + ret i64 %sar +} + +define i64 @sar64rcl_mask(i64 noundef %a, i64 %cl) { +; CHECK-LABEL: sar64rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: sarq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xff] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i64 %cl, 63 + %sar = ashr i64 %a, %shamt + ret i64 %sar +} + +define i8 @sar8ri(i8 noundef %a) { +; CHECK-LABEL: sar8ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarb $4, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xff,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i8 %a, 4 + ret i8 %sar +} + +define i16 @sar16ri(i16 noundef %a) { +; CHECK-LABEL: sar16ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movswl %di, %eax # encoding: [0x0f,0xbf,0xc7] +; CHECK-NEXT: shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i16 %a, 4 + ret i16 %sar +} + +define i32 @sar32ri(i32 noundef %a) { +; CHECK-LABEL: sar32ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarl $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xff,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i32 %a, 4 + ret i32 %sar +} + +define i64 @sar64ri(i64 noundef %a) { +; CHECK-LABEL: sar64ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarq $4, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xff,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %sar = ashr i64 %a, 4 + ret i64 %sar +} + +define void @sar8m1_legacy(ptr %ptr) { +; CHECK-LABEL: sar8m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarb (%rdi) # encoding: [0xd0,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %sar = ashr i8 %a, 1 + store i8 %sar, ptr %ptr + ret void +} + +define void @sar16m1_legacy(ptr %ptr) { +; CHECK-LABEL: sar16m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarw (%rdi) # encoding: [0x66,0xd1,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %sar = ashr i16 %a, 1 + store i16 %sar, ptr %ptr + ret void +} + +define void @sar32m1_legacy(ptr %ptr) { +; CHECK-LABEL: sar32m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarl (%rdi) # encoding: [0xd1,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %sar = ashr i32 %a, 1 + store i32 %sar, ptr %ptr + ret void +} + +define void @sar64m1_legacy(ptr %ptr) { +; CHECK-LABEL: sar64m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarq (%rdi) # encoding: [0x48,0xd1,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %sar = ashr i64 %a, 1 + store i64 %sar, ptr %ptr + ret void +} + +define void @sar8mcl_legacy(ptr %ptr, i8 %cl) { +; CHECK-LABEL: sar8mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarb %cl, (%rdi) # encoding: [0xd2,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %sar = ashr i8 %a, %cl + store i8 %sar, ptr %ptr + ret void +} + +define void @sar16mcl_legacy(ptr %ptr, i16 %cl) { +; CHECK-LABEL: sar16mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; 
CHECK-NEXT: sarw %cl, (%rdi) # encoding: [0x66,0xd3,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %sar = ashr i16 %a, %cl + store i16 %sar, ptr %ptr + ret void +} + +define void @sar32mcl_legacy(ptr %ptr, i32 %cl) { +; CHECK-LABEL: sar32mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: sarl %cl, (%rdi) # encoding: [0xd3,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %sar = ashr i32 %a, %cl + store i32 %sar, ptr %ptr + ret void +} + +define void @sar64mcl_legacy(ptr %ptr, i64 %cl) { +; CHECK-LABEL: sar64mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: sarq %cl, (%rdi) # encoding: [0x48,0xd3,0x3f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %sar = ashr i64 %a, %cl + store i64 %sar, ptr %ptr + ret void +} + +define void @sar8mi_legacy(ptr %ptr) { +; CHECK-LABEL: sar8mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarb $4, (%rdi) # encoding: [0xc0,0x3f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %sar = ashr i8 %a, 4 + store i8 %sar, ptr %ptr + ret void +} + +define void @sar16mi_legacy(ptr %ptr) { +; CHECK-LABEL: sar16mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarw $4, (%rdi) # encoding: [0x66,0xc1,0x3f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %sar = ashr i16 %a, 4 + store i16 %sar, ptr %ptr + ret void +} + +define void @sar32mi_legacy(ptr %ptr) { +; CHECK-LABEL: sar32mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarl $4, (%rdi) # encoding: [0xc1,0x3f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %sar = ashr i32 %a, 4 + store i32 %sar, ptr %ptr + ret void +} + +define void @sar64mi_legacy(ptr %ptr) { +; CHECK-LABEL: sar64mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: sarq $4, (%rdi) # encoding: [0x48,0xc1,0x3f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %sar = ashr i64 %a, 4 + store i64 %sar, ptr %ptr + ret void +} diff --git a/llvm/test/CodeGen/X86/apx/shl.ll b/llvm/test/CodeGen/X86/apx/shl.ll new file mode 100644 index 0000000000000..869caf932ff92 --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/shl.ll @@ -0,0 +1,545 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s + +define i8 @shl8ri(i8 noundef %a) { +; CHECK-LABEL: shl8ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shlb $4, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xe7,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i8 %a, 4 + ret i8 %shl +} + +define i16 @shl16ri(i16 noundef %a) { +; CHECK-LABEL: shl16ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shll $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xe7,0x04] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i16 %a, 4 + ret i16 %shl +} + +define i32 @shl32ri(i32 noundef %a) { +; CHECK-LABEL: shl32ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shll $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xe7,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i32 %a, 4 + ret i32 %shl +} + +define i64 @shl64ri(i64 noundef %a) { +; CHECK-LABEL: 
shl64ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shlq $4, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xe7,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i64 %a, 4 + ret i64 %shl +} + +define i8 @shl8m1(ptr %ptr) { +; CHECK-LABEL: shl8m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzbl (%rdi), %eax # encoding: [0x0f,0xb6,0x07] +; CHECK-NEXT: addb %al, %al # EVEX TO LEGACY Compression encoding: [0x00,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shl = shl i8 %a, 1 + ret i8 %shl +} + +define i16 @shl16m1(ptr %ptr) { +; CHECK-LABEL: shl16m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; CHECK-NEXT: addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shl = shl i16 %a, 1 + ret i16 %shl +} + +define i32 @shl32m1(ptr %ptr) { +; CHECK-LABEL: shl32m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl (%rdi), %eax # encoding: [0x8b,0x07] +; CHECK-NEXT: addl %eax, %eax # EVEX TO LEGACY Compression encoding: [0x01,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shl = shl i32 %a, 1 + ret i32 %shl +} + +define i64 @shl64m1(ptr %ptr) { +; CHECK-LABEL: shl64m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq (%rdi), %rax # encoding: [0x48,0x8b,0x07] +; CHECK-NEXT: addq %rax, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x01,0xc0] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shl = shl i64 %a, 1 + ret i64 %shl +} + +define i8 @shl8mcl(ptr %ptr, i8 %cl) { +; CHECK-LABEL: shl8mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shlb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shl = shl i8 %a, %cl + ret i8 %shl +} + +define i8 @shl8mcl_mask(ptr %ptr, i8 %cl) { +; CHECK-LABEL: shl8mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shlb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shamt = and i8 %cl, 31 + %shl = shl i8 %a, %shamt + ret i8 %shl +} + +define i16 @shl16mcl(ptr %ptr, i16 %cl) { +; CHECK-LABEL: shl16mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shl = shl i16 %a, %cl + ret i16 %shl +} + +define i16 @shl16mcl_mask(ptr %ptr, i16 %cl) { +; CHECK-LABEL: shl16mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe0] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shamt = and i16 %cl, 31 + %shl = shl i16 %a, %shamt + ret 
i16 %shl +} + +define i32 @shl32mcl(ptr %ptr, i32 %cl) { +; CHECK-LABEL: shl32mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shl = shl i32 %a, %cl + ret i32 %shl +} + +define i32 @shl32mcl_mask(ptr %ptr, i32 %cl) { +; CHECK-LABEL: shl32mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shamt = and i32 %cl, 31 + %shl = shl i32 %a, %shamt + ret i32 %shl +} + +define i64 @shl64mcl(ptr %ptr, i64 %cl) { +; CHECK-LABEL: shl64mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shl = shl i64 %a, %cl + ret i64 %shl +} + +define i64 @shl64mcl_mask(ptr %ptr, i64 %cl) { +; CHECK-LABEL: shl64mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shlq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shamt = and i64 %cl, 63 + %shl = shl i64 %a, %shamt + ret i64 %shl +} + +define i8 @shl8mi(ptr %ptr) { +; CHECK-LABEL: shl8mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shlb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x27,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shl = shl i8 %a, 4 + ret i8 %shl +} + +define i16 @shl16mi(ptr %ptr) { +; CHECK-LABEL: shl16mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; CHECK-NEXT: shll $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe0,0x04] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shl = shl i16 %a, 4 + ret i16 %shl +} + +define i32 @shl32mi(ptr %ptr) { +; CHECK-LABEL: shl32mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shll $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x27,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shl = shl i32 %a, 4 + ret i32 %shl +} + +define i64 @shl64mi(ptr %ptr) { +; CHECK-LABEL: shl64mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shlq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x27,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shl = shl i64 %a, 4 + ret i64 %shl +} + +define i8 @shl8r1(i8 noundef %a) { +; CHECK-LABEL: shl8r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addb %dil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x00,0xff] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i8 %a, 1 + ret i8 %shl +} + +define i16 @shl16r1(i16 noundef %a) { +; CHECK-LABEL: shl16r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addl %edi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xff] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i16 %a, 1 + ret i16 
%shl +} + +define i32 @shl32r1(i32 noundef %a) { +; CHECK-LABEL: shl32r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addl %edi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x01,0xff] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i32 %a, 1 + ret i32 %shl +} + +define i64 @shl64r1(i64 noundef %a) { +; CHECK-LABEL: shl64r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addq %rdi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x01,0xff] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i64 %a, 1 + ret i64 %shl +} + +define i8 @shl8rcl(i8 noundef %a, i8 %cl) { +; CHECK-LABEL: shl8rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shlb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xe7] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i8 %a, %cl + ret i8 %shl +} + +define i8 @shl8rcl_mask(i8 noundef %a, i8 %cl) { +; CHECK-LABEL: shl8rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shlb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xe7] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i8 %cl, 31 + %shl = shl i8 %a, %shamt + ret i8 %shl +} + +define i16 @shl16rcl(i16 noundef %a, i16 %cl) { +; CHECK-LABEL: shl16rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i16 %a, %cl + ret i16 %shl +} + +define i16 @shl16rcl_mask(i16 noundef %a, i16 %cl) { +; CHECK-LABEL: shl16rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i16 %cl, 31 + %shl = shl i16 %a, %shamt + ret i16 %shl +} + +define i32 @shl32rcl(i32 noundef %a, i32 %cl) { +; CHECK-LABEL: shl32rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i32 %a, %cl + ret i32 %shl +} + +define i32 @shl32rcl_mask(i32 noundef %a, i32 %cl) { +; CHECK-LABEL: shl32rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xe7] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i32 %cl, 31 + %shl = shl i32 %a, %shamt + ret i32 %shl +} + +define i64 @shl64rcl(i64 noundef %a, i64 %cl) { +; CHECK-LABEL: shl64rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shlq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xe7] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shl = shl i64 %a, %cl + ret i64 %shl +} + +define i64 @shl64rcl_mask(i64 noundef %a, i64 %cl) { +; CHECK-LABEL: shl64rcl_mask: +; CHECK: # %bb.0: # %entry 
+; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shlq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xe7] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i64 %cl, 63 + %shl = shl i64 %a, %shamt + ret i64 %shl +} + +define void @shl8m1_legacy(ptr %ptr) { +; CHECK-LABEL: shl8m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shlb (%rdi) # encoding: [0xd0,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shl = shl i8 %a, 1 + store i8 %shl, ptr %ptr + ret void +} + +define void @shl16m1_legacy(ptr %ptr) { +; CHECK-LABEL: shl16m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shlw (%rdi) # encoding: [0x66,0xd1,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shl = shl i16 %a, 1 + store i16 %shl, ptr %ptr + ret void +} + +define void @shl32m1_legacy(ptr %ptr) { +; CHECK-LABEL: shl32m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shll (%rdi) # encoding: [0xd1,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shl = shl i32 %a, 1 + store i32 %shl, ptr %ptr + ret void +} + +define void @shl64m1_legacy(ptr %ptr) { +; CHECK-LABEL: shl64m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shlq (%rdi) # encoding: [0x48,0xd1,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shl = shl i64 %a, 1 + store i64 %shl, ptr %ptr + ret void +} + +define void @shl8mi_legacy(ptr %ptr) { +; CHECK-LABEL: shl8mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shlb $4, (%rdi) # encoding: [0xc0,0x27,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shl = shl i8 %a, 4 + store i8 %shl, ptr %ptr + ret void +} + +define void @shl16mi_legacy(ptr %ptr) { +; CHECK-LABEL: shl16mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shlw $4, (%rdi) # encoding: [0x66,0xc1,0x27,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shl = shl i16 %a, 4 + store i16 %shl, ptr %ptr + ret void +} + +define void @shl32mi_legacy(ptr %ptr) { +; CHECK-LABEL: shl32mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shll $4, (%rdi) # encoding: [0xc1,0x27,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shl = shl i32 %a, 4 + store i32 %shl, ptr %ptr + ret void +} + +define void @shl64mi_legacy(ptr %ptr) { +; CHECK-LABEL: shl64mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shlq $4, (%rdi) # encoding: [0x48,0xc1,0x27,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shl = shl i64 %a, 4 + store i64 %shl, ptr %ptr + ret void +} + +define void @shl8mcl_legacy(ptr %ptr, i8 %cl) { +; CHECK-LABEL: shl8mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shlb %cl, (%rdi) # encoding: [0xd2,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shl = shl i8 %a, %cl + store i8 %shl, ptr %ptr + ret void +} + +define void @shl16mcl_legacy(ptr %ptr, i16 %cl) { +; CHECK-LABEL: shl16mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shlw %cl, (%rdi) # encoding: [0x66,0xd3,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shl = shl i16 %a, %cl + store i16 %shl, ptr %ptr + ret void +} + +define void 
@shl32mcl_legacy(ptr %ptr, i32 %cl) { +; CHECK-LABEL: shl32mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shll %cl, (%rdi) # encoding: [0xd3,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shl = shl i32 %a, %cl + store i32 %shl, ptr %ptr + ret void +} + +define void @shl64mcl_legacy(ptr %ptr, i64 %cl) { +; CHECK-LABEL: shl64mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shlq %cl, (%rdi) # encoding: [0x48,0xd3,0x27] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shl = shl i64 %a, %cl + store i64 %shl, ptr %ptr + ret void +} diff --git a/llvm/test/CodeGen/X86/apx/shld.ll b/llvm/test/CodeGen/X86/apx/shld.ll new file mode 100644 index 0000000000000..5b99719ba537c --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/shld.ll @@ -0,0 +1,265 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s + +define i16 @shld16rrcl(i16 noundef %a, i16 noundef %b, i8 %cl) { +; CHECK-LABEL: shld16rrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andb $15, %dl, %cl +; CHECK-NEXT: shldw %cl, %si, %di, %ax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i16 + %shld = call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 %clin) + ret i16 %shld +} + +define i16 @shld16rrcl_mask(i16 noundef %a, i16 noundef %b, i8 %cl) { +; CHECK-LABEL: shld16rrcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andb $15, %dl, %cl +; CHECK-NEXT: shldw %cl, %si, %di, %ax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i16 + %shamt = and i16 %clin, 31 + %shld = call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 %shamt) + ret i16 %shld +} + +define i32 @shld32rrcl(i32 noundef %a, i32 noundef %b, i8 %cl) { +; CHECK-LABEL: shld32rrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shldl %cl, %esi, %edi, %eax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i32 + %shld = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %clin) + ret i32 %shld +} + +define i32 @shld32rrcl_mask(i32 noundef %a, i32 noundef %b, i8 %cl) { +; CHECK-LABEL: shld32rrcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shldl %cl, %esi, %edi, %eax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i32 + %shamt = and i32 %clin, 31 + %shld = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %shamt) + ret i32 %shld +} + +define i64 @shld64rrcl(i64 noundef %a, i64 noundef %b, i8 %cl) { +; CHECK-LABEL: shld64rrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shldq %cl, %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i64 + %shld = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %clin) + ret i64 %shld +} + +define i64 @shld64rrcl_mask(i64 noundef %a, i64 noundef %b, i8 %cl) { +; CHECK-LABEL: shld64rrcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shldq %cl, %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i64 + %shamt = and i64 %clin, 63 + %shld = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %shamt) + ret i64 %shld 
+} + +define i16 @shld16rri8(i16 noundef %a, i16 noundef %b) { +; CHECK-LABEL: shld16rri8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shldw $12, %si, %di, %ax +; CHECK-NEXT: retq +entry: + %shld = call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 12) + ret i16 %shld +} + +define i32 @shld32rri8(i32 noundef %a, i32 noundef %b) { +; CHECK-LABEL: shld32rri8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shldl $12, %esi, %edi, %eax +; CHECK-NEXT: retq +entry: + %shld = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 12) + ret i32 %shld +} + +define i64 @shld64rri8(i64 noundef %a, i64 noundef %b) { +; CHECK-LABEL: shld64rri8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shldq $12, %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %shld = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 12) + ret i64 %shld +} + +define i16 @shld16mrcl(ptr %ptr, i16 noundef %b, i8 %cl) { +; CHECK-LABEL: shld16mrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andb $15, %dl, %cl +; CHECK-NEXT: shldw %cl, %si, (%rdi), %ax +; CHECK-NEXT: retq +entry: + %a = load i16, ptr %ptr + %clin = sext i8 %cl to i16 + %shld = call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 %clin) + ret i16 %shld +} + +define i32 @shld32mrcl(ptr %ptr, i32 noundef %b, i8 %cl) { +; CHECK-LABEL: shld32mrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shldl %cl, %esi, (%rdi), %eax +; CHECK-NEXT: retq +entry: + %a = load i32, ptr %ptr + %clin = sext i8 %cl to i32 + %shld = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %clin) + ret i32 %shld +} + +define i64 @shld64mrcl(ptr %ptr, i64 noundef %b, i8 %cl) { +; CHECK-LABEL: shld64mrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shldq %cl, %rsi, (%rdi), %rax +; CHECK-NEXT: retq +entry: + %a = load i64, ptr %ptr + %clin = sext i8 %cl to i64 + %shld = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %clin) + ret i64 %shld +} + +define i16 @shld16mri8(ptr %ptr, i16 noundef %b) { +; CHECK-LABEL: shld16mri8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shldw $12, %si, (%rdi), %ax +; CHECK-NEXT: retq +entry: + %a = load i16, ptr %ptr + %shld = call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 12) + ret i16 %shld +} + +define i32 @shld32mri8(ptr %ptr, i32 noundef %b) { +; CHECK-LABEL: shld32mri8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shldl $12, %esi, (%rdi), %eax +; CHECK-NEXT: retq +entry: + %a = load i32, ptr %ptr + %shld = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 12) + ret i32 %shld +} + +define i64 @shld64mri8(ptr %ptr, i64 noundef %b) { +; CHECK-LABEL: shld64mri8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shldq $12, %rsi, (%rdi), %rax +; CHECK-NEXT: retq +entry: + %a = load i64, ptr %ptr + %shld = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 12) + ret i64 %shld +} + +define void @shld16mrcl_legacy(ptr %ptr, i16 noundef %b, i8 %cl) { +; CHECK-LABEL: shld16mrcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andb $15, %dl, %cl +; CHECK-NEXT: shldw %cl, %si, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i16, ptr %ptr + %clin = sext i8 %cl to i16 + %shld = call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 %clin) + store i16 %shld, ptr %ptr + ret void +} + +define void @shld32mrcl_legacy(ptr %ptr, i32 noundef %b, i8 %cl) { +; CHECK-LABEL: shld32mrcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shldl %cl, %esi, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i32, ptr %ptr + %clin = sext i8 %cl to 
i32 + %shld = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 %clin) + store i32 %shld, ptr %ptr + ret void +} + +define void @shld64mrcl_legacy(ptr %ptr, i64 noundef %b, i8 %cl) { +; CHECK-LABEL: shld64mrcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shldq %cl, %rsi, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i64, ptr %ptr + %clin = sext i8 %cl to i64 + %shld = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 %clin) + store i64 %shld, ptr %ptr + ret void +} + +define void @shld16mri8_legacy(ptr %ptr, i16 noundef %b) { +; CHECK-LABEL: shld16mri8_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shldw $12, %si, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i16, ptr %ptr + %shld = call i16 @llvm.fshl.i16(i16 %a, i16 %b, i16 12) + store i16 %shld, ptr %ptr + ret void +} + +define void @shld32mri8_legacy(ptr %ptr, i32 noundef %b) { +; CHECK-LABEL: shld32mri8_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shldl $12, %esi, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i32, ptr %ptr + %shld = call i32 @llvm.fshl.i32(i32 %a, i32 %b, i32 12) + store i32 %shld, ptr %ptr + ret void +} + +define void @shld64mri8_legacy(ptr %ptr, i64 noundef %b) { +; CHECK-LABEL: shld64mri8_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shldq $12, %rsi, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i64, ptr %ptr + %shld = call i64 @llvm.fshl.i64(i64 %a, i64 %b, i64 12) + store i64 %shld, ptr %ptr + ret void +} diff --git a/llvm/test/CodeGen/X86/apx/shr.ll b/llvm/test/CodeGen/X86/apx/shr.ll new file mode 100644 index 0000000000000..d633251fc6a25 --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/shr.ll @@ -0,0 +1,546 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs --show-mc-encoding | FileCheck %s + +define i8 @shr8m1(ptr %ptr) { +; CHECK-LABEL: shr8m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrb $1, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x2f,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shr = lshr i8 %a, 1 + ret i8 %shr +} + +define i16 @shr16m1(ptr %ptr) { +; CHECK-LABEL: shr16m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; CHECK-NEXT: shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shr = lshr i16 %a, 1 + ret i16 %shr +} + +define i32 @shr32m1(ptr %ptr) { +; CHECK-LABEL: shr32m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrl $1, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x2f,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shr = lshr i32 %a, 1 + ret i32 %shr +} + +define i64 @shr64m1(ptr %ptr) { +; CHECK-LABEL: shr64m1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrq $1, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x2f,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shr = lshr i64 %a, 1 + ret i64 %shr +} + +define i8 @shr8mcl(ptr %ptr, i8 %cl) { +; CHECK-LABEL: shr8mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shr = lshr i8 %a, %cl + ret i8 %shr +} 
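+ +; The *_mask variants exercise shift amounts that are pre-masked with an +; 'and' in IR. The mask is redundant because x86 shift instructions only use +; the low 5 bits of the count for 8/16/32-bit operands (6 bits for 64-bit), +; so these tests emit exactly the same code as their unmasked counterparts.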
+ +define i8 @shr8mcl_mask(ptr %ptr, i8 %cl) { +; CHECK-LABEL: shr8mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrb %cl, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shamt = and i8 %cl, 31 + %shr = lshr i8 %a, %shamt + ret i8 %shr +} + +define i16 @shr16mcl(ptr %ptr, i16 %cl) { +; CHECK-LABEL: shr16mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shr = lshr i16 %a, %cl + ret i16 %shr +} + +define i16 @shr16mcl_mask(ptr %ptr, i16 %cl) { +; CHECK-LABEL: shr16mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shamt = and i16 %cl, 31 + %shr = lshr i16 %a, %shamt + ret i16 %shr +} + +define i32 @shr32mcl(ptr %ptr, i32 %cl) { +; CHECK-LABEL: shr32mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shr = lshr i32 %a, %cl + ret i32 %shr +} + +define i32 @shr32mcl_mask(ptr %ptr, i32 %cl) { +; CHECK-LABEL: shr32mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shamt = and i32 %cl, 31 + %shr = lshr i32 %a, %shamt + ret i32 %shr +} + +define i64 @shr64mcl(ptr %ptr, i64 %cl) { +; CHECK-LABEL: shr64mcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shr = lshr i64 %a, %cl + ret i64 %shr +} + +define i64 @shr64mcl_mask(ptr %ptr, i64 %cl) { +; CHECK-LABEL: shr64mcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shrq %cl, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shamt = and i64 %cl, 63 + %shr = lshr i64 %a, %shamt + ret i64 %shr +} + +define i8 @shr8mi(ptr %ptr) { +; CHECK-LABEL: shr8mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrb $4, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0x2f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shr = lshr i8 %a, 4 + ret i8 %shr +} + +define i16 @shr16mi(ptr 
%ptr) { +; CHECK-LABEL: shr16mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzwl (%rdi), %eax # encoding: [0x0f,0xb7,0x07] +; CHECK-NEXT: shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shr = lshr i16 %a, 4 + ret i16 %shr +} + +define i32 @shr32mi(ptr %ptr) { +; CHECK-LABEL: shr32mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrl $4, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0x2f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shr = lshr i32 %a, 4 + ret i32 %shr +} + +define i64 @shr64mi(ptr %ptr) { +; CHECK-LABEL: shr64mi: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrq $4, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0x2f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shr = lshr i64 %a, 4 + ret i64 %shr +} + +define i8 @shr8r1(i8 noundef %a) { +; CHECK-LABEL: shr8r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrb $1, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xef,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i8 %a, 1 + ret i8 %shr +} + +define i16 @shr16r1(i16 noundef %a) { +; CHECK-LABEL: shr16r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7] +; CHECK-NEXT: shrl %eax # EVEX TO LEGACY Compression encoding: [0xd1,0xe8] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i16 %a, 1 + ret i16 %shr +} + +define i32 @shr32r1(i32 noundef %a) { +; CHECK-LABEL: shr32r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrl $1, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xef,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i32 %a, 1 + ret i32 %shr +} + +define i64 @shr64r1(i64 noundef %a) { +; CHECK-LABEL: shr64r1: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrq $1, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xef,0x01] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i64 %a, 1 + ret i64 %shr +} + +define i8 @shr8rcl(i8 noundef %a, i8 %cl) { +; CHECK-LABEL: shr8rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xef] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i8 %a, %cl + ret i8 %shr +} + +define i8 @shr8rcl_mask(i8 noundef %a, i8 %cl) { +; CHECK-LABEL: shr8rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrb %cl, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xd2,0xef] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i8 %cl, 31 + %shr = lshr i8 %a, %shamt + ret i8 %shr +} + +define i16 @shr16rcl(i16 noundef %a, i16 %cl) { +; CHECK-LABEL: shr16rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i16 %a, %cl + ret i16 %shr +} + +define i16 @shr16rcl_mask(i16 noundef %a, i16 %cl) { +; CHECK-LABEL: shr16rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl 
%esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %eax # EVEX TO LEGACY Compression encoding: [0xd3,0xe8] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i16 %cl, 31 + %shr = lshr i16 %a, %shamt + ret i16 %shr +} + +define i32 @shr32rcl(i32 noundef %a, i32 %cl) { +; CHECK-LABEL: shr32rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xef] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i32 %a, %cl + ret i32 %shr +} + +define i32 @shr32rcl_mask(i32 noundef %a, i32 %cl) { +; CHECK-LABEL: shr32rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xd3,0xef] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i32 %cl, 31 + %shr = lshr i32 %a, %shamt + ret i32 %shr +} + +define i64 @shr64rcl(i64 noundef %a, i64 %cl) { +; CHECK-LABEL: shr64rcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shrq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xef] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i64 %a, %cl + ret i64 %shr +} + +define i64 @shr64rcl_mask(i64 noundef %a, i64 %cl) { +; CHECK-LABEL: shr64rcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shrq %cl, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xd3,0xef] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shamt = and i64 %cl, 63 + %shr = lshr i64 %a, %shamt + ret i64 %shr +} + +define i8 @shr8ri(i8 noundef %a) { +; CHECK-LABEL: shr8ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrb $4, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0xc0,0xef,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i8 %a, 4 + ret i8 %shr +} + +define i16 @shr16ri(i16 noundef %a) { +; CHECK-LABEL: shr16ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzwl %di, %eax # encoding: [0x0f,0xb7,0xc7] +; CHECK-NEXT: shrl $4, %eax # EVEX TO LEGACY Compression encoding: [0xc1,0xe8,0x04] +; CHECK-NEXT: # kill: def $ax killed $ax killed $eax +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i16 %a, 4 + ret i16 %shr +} + +define i32 @shr32ri(i32 noundef %a) { +; CHECK-LABEL: shr32ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrl $4, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0xc1,0xef,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i32 %a, 4 + ret i32 %shr +} + +define i64 @shr64ri(i64 noundef %a) { +; CHECK-LABEL: shr64ri: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrq $4, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0xc1,0xef,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %shr = lshr i64 %a, 4 + ret i64 %shr +} + +define void @shr8m1_legacy(ptr %ptr) { +; CHECK-LABEL: shr8m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrb (%rdi) # encoding: [0xd0,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shr = lshr i8 %a, 1 + store i8 %shr, ptr %ptr + ret void +} + +define void @shr16m1_legacy(ptr %ptr) { +; 
CHECK-LABEL: shr16m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrw (%rdi) # encoding: [0x66,0xd1,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shr = lshr i16 %a, 1 + store i16 %shr, ptr %ptr + ret void +} + +define void @shr32m1_legacy(ptr %ptr) { +; CHECK-LABEL: shr32m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrl (%rdi) # encoding: [0xd1,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shr = lshr i32 %a, 1 + store i32 %shr, ptr %ptr + ret void +} + +define void @shr64m1_legacy(ptr %ptr) { +; CHECK-LABEL: shr64m1_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrq (%rdi) # encoding: [0x48,0xd1,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shr = lshr i64 %a, 1 + store i64 %shr, ptr %ptr + ret void +} + +define void @shr8mi_legacy(ptr %ptr) { +; CHECK-LABEL: shr8mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrb $4, (%rdi) # encoding: [0xc0,0x2f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shr = lshr i8 %a, 4 + store i8 %shr, ptr %ptr + ret void +} + +define void @shr16mi_legacy(ptr %ptr) { +; CHECK-LABEL: shr16mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrw $4, (%rdi) # encoding: [0x66,0xc1,0x2f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shr = lshr i16 %a, 4 + store i16 %shr, ptr %ptr + ret void +} + +define void @shr32mi_legacy(ptr %ptr) { +; CHECK-LABEL: shr32mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrl $4, (%rdi) # encoding: [0xc1,0x2f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shr = lshr i32 %a, 4 + store i32 %shr, ptr %ptr + ret void +} + +define void @shr64mi_legacy(ptr %ptr) { +; CHECK-LABEL: shr64mi_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrq $4, (%rdi) # encoding: [0x48,0xc1,0x2f,0x04] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shr = lshr i64 %a, 4 + store i64 %shr, ptr %ptr + ret void +} + +define void @shr8mcl_legacy(ptr %ptr, i8 %cl) { +; CHECK-LABEL: shr8mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrb %cl, (%rdi) # encoding: [0xd2,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i8, ptr %ptr + %shr = lshr i8 %a, %cl + store i8 %shr, ptr %ptr + ret void +} + +define void @shr16mcl_legacy(ptr %ptr, i16 %cl) { +; CHECK-LABEL: shr16mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrw %cl, (%rdi) # encoding: [0x66,0xd3,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i16, ptr %ptr + %shr = lshr i16 %a, %cl + store i16 %shr, ptr %ptr + ret void +} + +define void @shr32mcl_legacy(ptr %ptr, i32 %cl) { +; CHECK-LABEL: shr32mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %esi, %ecx # encoding: [0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrl %cl, (%rdi) # encoding: [0xd3,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i32, ptr %ptr + %shr = lshr i32 %a, %cl + store i32 %shr, ptr %ptr + ret void +} + +define void @shr64mcl_legacy(ptr %ptr, i64 %cl) { +; CHECK-LABEL: shr64mcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movq %rsi, %rcx # encoding: [0x48,0x89,0xf1] +; CHECK-NEXT: # kill: def $cl killed $cl killed $rcx +; CHECK-NEXT: shrq 
%cl, (%rdi) # encoding: [0x48,0xd3,0x2f] +; CHECK-NEXT: retq # encoding: [0xc3] +entry: + %a = load i64, ptr %ptr + %shr = lshr i64 %a, %cl + store i64 %shr, ptr %ptr + ret void +} diff --git a/llvm/test/CodeGen/X86/apx/shrd.ll b/llvm/test/CodeGen/X86/apx/shrd.ll new file mode 100644 index 0000000000000..453d7a5d17e05 --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/shrd.ll @@ -0,0 +1,277 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ndd -verify-machineinstrs | FileCheck %s + +define i16 @shrd16rrcl(i16 noundef %a, i16 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd16rrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andb $15, %dl, %cl +; CHECK-NEXT: shrdw %cl, %di, %si, %ax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i16 + %shrd = call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 %clin) + ret i16 %shrd +} + +define i16 @shrd16rrcl_mask(i16 noundef %a, i16 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd16rrcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: andb $15, %dl, %cl +; CHECK-NEXT: shrdw %cl, %di, %si, %ax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i16 + %shamt = and i16 %clin, 31 + %shrd = call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 %shamt) + ret i16 %shrd +} + +define i32 @shrd32rrcl(i32 noundef %a, i32 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd32rrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrdl %cl, %edi, %esi, %eax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i32 + %shrd = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %clin) + ret i32 %shrd +} + +define i32 @shrd32rrcl_mask(i32 noundef %a, i32 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd32rrcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrdl %cl, %edi, %esi, %eax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i32 + %shamt = and i32 %clin, 31 + %shrd = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %shamt) + ret i32 %shrd +} + +define i64 @shrd64rrcl(i64 noundef %a, i64 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd64rrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrdq %cl, %rdi, %rsi, %rax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i64 + %shrd = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %clin) + ret i64 %shrd +} + +define i64 @shrd64rrcl_mask(i64 noundef %a, i64 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd64rrcl_mask: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrdq %cl, %rdi, %rsi, %rax +; CHECK-NEXT: retq +entry: + %clin = sext i8 %cl to i64 + %shamt = and i64 %clin, 63 + %shrd = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %shamt) + ret i64 %shrd +} + +define i16 @shrd16rri8(i16 noundef %a, i16 noundef %b) { +; CHECK-LABEL: shrd16rri8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrdw $12, %di, %si, %ax +; CHECK-NEXT: retq +entry: + %shrd = call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 12) + ret i16 %shrd +} + +define i32 @shrd32rri8(i32 noundef %a, i32 noundef %b) { +; CHECK-LABEL: shrd32rri8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrdl $12, %edi, %esi, %eax +; CHECK-NEXT: retq +entry: + %shrd = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 12) + ret i32 %shrd +} + +define i64 @shrd64rri8(i64 noundef %a, i64 noundef %b) { +; CHECK-LABEL: shrd64rri8: +; CHECK: # %bb.0: # %entry +; 
CHECK-NEXT: shrdq $12, %rdi, %rsi, %rax +; CHECK-NEXT: retq +entry: + %shrd = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 12) + ret i64 %shrd +} + +define i16 @shrd16mrcl(ptr %ptr, i16 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd16mrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: andb $15, %dl, %cl +; CHECK-NEXT: shrdw %cl, %ax, %si, %ax +; CHECK-NEXT: retq +entry: + %a = load i16, ptr %ptr + %clin = sext i8 %cl to i16 + %shrd = call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 %clin) + ret i16 %shrd +} + +define i32 @shrd32mrcl(ptr %ptr, i32 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd32mrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrdl %cl, %eax, %esi, %eax +; CHECK-NEXT: retq +entry: + %a = load i32, ptr %ptr + %clin = sext i8 %cl to i32 + %shrd = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %clin) + ret i32 %shrd +} + +define i64 @shrd64mrcl(ptr %ptr, i64 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd64mrcl: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrdq %cl, %rax, %rsi, %rax +; CHECK-NEXT: retq +entry: + %a = load i64, ptr %ptr + %clin = sext i8 %cl to i64 + %shrd = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %clin) + ret i64 %shrd +} + +define i16 @shrd16mri8(ptr %ptr, i16 noundef %b) { +; CHECK-LABEL: shrd16mri8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrdw $12, %si, (%rdi), %ax +; CHECK-NEXT: retq +entry: + %a = load i16, ptr %ptr + %shrd = call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 12) + ret i16 %shrd +} + +define i32 @shrd32mri8(ptr %ptr, i32 noundef %b) { +; CHECK-LABEL: shrd32mri8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrdl $12, %esi, (%rdi), %eax +; CHECK-NEXT: retq +entry: + %a = load i32, ptr %ptr + %shrd = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 12) + ret i32 %shrd +} + +define i64 @shrd64mri8(ptr %ptr, i64 noundef %b) { +; CHECK-LABEL: shrd64mri8: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrdq $12, %rsi, (%rdi), %rax +; CHECK-NEXT: retq +entry: + %a = load i64, ptr %ptr + %shrd = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 12) + ret i64 %shrd +} + +define void @shrd16mrcl_legacy(ptr %ptr, i16 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd16mrcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movzwl (%rdi), %eax +; CHECK-NEXT: andb $15, %dl, %cl +; CHECK-NEXT: shrdw %cl, %ax, %si, %ax +; CHECK-NEXT: movw %ax, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i16, ptr %ptr + %clin = sext i8 %cl to i16 + %shrd = call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 %clin) + store i16 %shrd, ptr %ptr + ret void +} + +define void @shrd32mrcl_legacy(ptr %ptr, i32 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd32mrcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: movl (%rdi), %eax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrdl %cl, %eax, %esi, %eax +; CHECK-NEXT: movl %eax, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i32, ptr %ptr + %clin = sext i8 %cl to i32 + %shrd = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %clin) + store i32 %shrd, ptr %ptr + ret void +} + +define void @shrd64mrcl_legacy(ptr %ptr, i64 noundef %b, i8 %cl) { +; CHECK-LABEL: shrd64mrcl_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: movl %edx, %ecx +; CHECK-NEXT: movq (%rdi), %rax +; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-NEXT: shrdq %cl, %rax, %rsi, %rax +; 
CHECK-NEXT: movq %rax, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i64, ptr %ptr + %clin = sext i8 %cl to i64 + %shrd = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %clin) + store i64 %shrd, ptr %ptr + ret void +} + +define void @shrd16mri8_legacy(ptr %ptr, i16 noundef %b) { +; CHECK-LABEL: shrd16mri8_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrdw $12, %si, (%rdi), %ax +; CHECK-NEXT: movw %ax, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i16, ptr %ptr + %shrd = call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 12) + store i16 %shrd, ptr %ptr + ret void +} + +define void @shrd32mri8_legacy(ptr %ptr, i32 noundef %b) { +; CHECK-LABEL: shrd32mri8_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrdl $12, %esi, (%rdi), %eax +; CHECK-NEXT: movl %eax, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i32, ptr %ptr + %shrd = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 12) + store i32 %shrd, ptr %ptr + ret void +} + +define void @shrd64mri8_legacy(ptr %ptr, i64 noundef %b) { +; CHECK-LABEL: shrd64mri8_legacy: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: shrdq $12, %rsi, (%rdi), %rax +; CHECK-NEXT: movq %rax, (%rdi) +; CHECK-NEXT: retq +entry: + %a = load i64, ptr %ptr + %shrd = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 12) + store i64 %shrd, ptr %ptr + ret void +} diff --git a/llvm/test/MC/Disassembler/X86/apx/rcl.txt b/llvm/test/MC/Disassembler/X86/apx/rcl.txt new file mode 100644 index 0000000000000..3dc9cb466d38f --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/rcl.txt @@ -0,0 +1,194 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: {evex} rclb $123, %bl +# INTEL: {evex} rcl bl, 123 +0x62,0xf4,0x7c,0x08,0xc0,0xd3,0x7b + +# ATT: rclb $123, %bl, %bl +# INTEL: rcl bl, bl, 123 +0x62,0xf4,0x64,0x18,0xc0,0xd3,0x7b + +# ATT: {evex} rclw $123, %dx +# INTEL: {evex} rcl dx, 123 +0x62,0xf4,0x7d,0x08,0xc1,0xd2,0x7b + +# ATT: rclw $123, %dx, %dx +# INTEL: rcl dx, dx, 123 +0x62,0xf4,0x6d,0x18,0xc1,0xd2,0x7b + +# ATT: {evex} rcll $123, %ecx +# INTEL: {evex} rcl ecx, 123 +0x62,0xf4,0x7c,0x08,0xc1,0xd1,0x7b + +# ATT: rcll $123, %ecx, %ecx +# INTEL: rcl ecx, ecx, 123 +0x62,0xf4,0x74,0x18,0xc1,0xd1,0x7b + +# ATT: {evex} rclq $123, %r9 +# INTEL: {evex} rcl r9, 123 +0x62,0xd4,0xfc,0x08,0xc1,0xd1,0x7b + +# ATT: rclq $123, %r9, %r9 +# INTEL: rcl r9, r9, 123 +0x62,0xd4,0xb4,0x18,0xc1,0xd1,0x7b + +# ATT: {evex} rclb $123, 291(%r8,%rax,4) +# INTEL: {evex} rcl byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc0,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rclb $123, 291(%r8,%rax,4), %bl +# INTEL: rcl bl, byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x64,0x18,0xc0,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rclw $123, 291(%r8,%rax,4) +# INTEL: {evex} rcl word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x08,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rclw $123, 291(%r8,%rax,4), %dx +# INTEL: rcl dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x18,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rcll $123, 291(%r8,%rax,4) +# INTEL: {evex} rcl dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rcll $123, 291(%r8,%rax,4), %ecx +# INTEL: rcl ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x18,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rclq $123, 291(%r8,%rax,4) +# INTEL: {evex} rcl qword ptr [r8 + 4*rax + 291], 123 
+0x62,0xd4,0xfc,0x08,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rclq $123, 291(%r8,%rax,4), %r9 +# INTEL: rcl r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x18,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rclb %bl +# INTEL: {evex} rcl bl +0x62,0xf4,0x7c,0x08,0xd0,0xd3 + +# ATT: rclb %bl, %bl +# INTEL: rcl bl, bl +0x62,0xf4,0x64,0x18,0xd0,0xd3 + +# ATT: {evex} rclb %cl, %bl +# INTEL: {evex} rcl bl, cl +0x62,0xf4,0x7c,0x08,0xd2,0xd3 + +# ATT: rclb %cl, %bl, %bl +# INTEL: rcl bl, bl, cl +0x62,0xf4,0x64,0x18,0xd2,0xd3 + +# ATT: {evex} rclw %cl, %dx +# INTEL: {evex} rcl dx, cl +0x62,0xf4,0x7d,0x08,0xd3,0xd2 + +# ATT: rclw %cl, %dx, %dx +# INTEL: rcl dx, dx, cl +0x62,0xf4,0x6d,0x18,0xd3,0xd2 + +# ATT: {evex} rcll %cl, %ecx +# INTEL: {evex} rcl ecx, cl +0x62,0xf4,0x7c,0x08,0xd3,0xd1 + +# ATT: rcll %cl, %ecx, %ecx +# INTEL: rcl ecx, ecx, cl +0x62,0xf4,0x74,0x18,0xd3,0xd1 + +# ATT: {evex} rclq %cl, %r9 +# INTEL: {evex} rcl r9, cl +0x62,0xd4,0xfc,0x08,0xd3,0xd1 + +# ATT: rclq %cl, %r9, %r9 +# INTEL: rcl r9, r9, cl +0x62,0xd4,0xb4,0x18,0xd3,0xd1 + +# ATT: {evex} rclb %cl, 291(%r8,%rax,4) +# INTEL: {evex} rcl byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd2,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: rclb %cl, 291(%r8,%rax,4), %bl +# INTEL: rcl bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x18,0xd2,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rclw %cl, 291(%r8,%rax,4) +# INTEL: {evex} rcl word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x08,0xd3,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: rclw %cl, 291(%r8,%rax,4), %dx +# INTEL: rcl dx, word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x6d,0x18,0xd3,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rcll %cl, 291(%r8,%rax,4) +# INTEL: {evex} rcl dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd3,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: rcll %cl, 291(%r8,%rax,4), %ecx +# INTEL: rcl ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x18,0xd3,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rclq %cl, 291(%r8,%rax,4) +# INTEL: {evex} rcl qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x08,0xd3,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: rclq %cl, 291(%r8,%rax,4), %r9 +# INTEL: rcl r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x18,0xd3,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rclw %dx +# INTEL: {evex} rcl dx +0x62,0xf4,0x7d,0x08,0xd1,0xd2 + +# ATT: rclw %dx, %dx +# INTEL: rcl dx, dx +0x62,0xf4,0x6d,0x18,0xd1,0xd2 + +# ATT: {evex} rcll %ecx +# INTEL: {evex} rcl ecx +0x62,0xf4,0x7c,0x08,0xd1,0xd1 + +# ATT: rcll %ecx, %ecx +# INTEL: rcl ecx, ecx +0x62,0xf4,0x74,0x18,0xd1,0xd1 + +# ATT: {evex} rclq %r9 +# INTEL: {evex} rcl r9 +0x62,0xd4,0xfc,0x08,0xd1,0xd1 + +# ATT: rclq %r9, %r9 +# INTEL: rcl r9, r9 +0x62,0xd4,0xb4,0x18,0xd1,0xd1 + +# ATT: {evex} rclb 291(%r8,%rax,4) +# INTEL: {evex} rcl byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd0,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: rclb 291(%r8,%rax,4), %bl +# INTEL: rcl bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x18,0xd0,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rclw 291(%r8,%rax,4) +# INTEL: {evex} rcl word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x08,0xd1,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: rclw 291(%r8,%rax,4), %dx +# INTEL: rcl dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x18,0xd1,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rcll 291(%r8,%rax,4) +# INTEL: {evex} rcl dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd1,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: rcll 291(%r8,%rax,4), %ecx +# INTEL: rcl ecx, dword ptr [r8 + 4*rax + 291] 
+0x62,0xd4,0x74,0x18,0xd1,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rclq 291(%r8,%rax,4) +# INTEL: {evex} rcl qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x08,0xd1,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: rclq 291(%r8,%rax,4), %r9 +# INTEL: rcl r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x18,0xd1,0x94,0x80,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/rcr.txt b/llvm/test/MC/Disassembler/X86/apx/rcr.txt new file mode 100644 index 0000000000000..42b9849a9e16f --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/rcr.txt @@ -0,0 +1,194 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: {evex} rcrb $123, %bl +# INTEL: {evex} rcr bl, 123 +0x62,0xf4,0x7c,0x08,0xc0,0xdb,0x7b + +# ATT: rcrb $123, %bl, %bl +# INTEL: rcr bl, bl, 123 +0x62,0xf4,0x64,0x18,0xc0,0xdb,0x7b + +# ATT: {evex} rcrw $123, %dx +# INTEL: {evex} rcr dx, 123 +0x62,0xf4,0x7d,0x08,0xc1,0xda,0x7b + +# ATT: rcrw $123, %dx, %dx +# INTEL: rcr dx, dx, 123 +0x62,0xf4,0x6d,0x18,0xc1,0xda,0x7b + +# ATT: {evex} rcrl $123, %ecx +# INTEL: {evex} rcr ecx, 123 +0x62,0xf4,0x7c,0x08,0xc1,0xd9,0x7b + +# ATT: rcrl $123, %ecx, %ecx +# INTEL: rcr ecx, ecx, 123 +0x62,0xf4,0x74,0x18,0xc1,0xd9,0x7b + +# ATT: {evex} rcrq $123, %r9 +# INTEL: {evex} rcr r9, 123 +0x62,0xd4,0xfc,0x08,0xc1,0xd9,0x7b + +# ATT: rcrq $123, %r9, %r9 +# INTEL: rcr r9, r9, 123 +0x62,0xd4,0xb4,0x18,0xc1,0xd9,0x7b + +# ATT: {evex} rcrb $123, 291(%r8,%rax,4) +# INTEL: {evex} rcr byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc0,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rcrb $123, 291(%r8,%rax,4), %bl +# INTEL: rcr bl, byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x64,0x18,0xc0,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rcrw $123, 291(%r8,%rax,4) +# INTEL: {evex} rcr word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x08,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rcrw $123, 291(%r8,%rax,4), %dx +# INTEL: rcr dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x18,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rcrl $123, 291(%r8,%rax,4) +# INTEL: {evex} rcr dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rcrl $123, 291(%r8,%rax,4), %ecx +# INTEL: rcr ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x18,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rcrq $123, 291(%r8,%rax,4) +# INTEL: {evex} rcr qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xfc,0x08,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rcrq $123, 291(%r8,%rax,4), %r9 +# INTEL: rcr r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x18,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rcrb %bl +# INTEL: {evex} rcr bl +0x62,0xf4,0x7c,0x08,0xd0,0xdb + +# ATT: rcrb %bl, %bl +# INTEL: rcr bl, bl +0x62,0xf4,0x64,0x18,0xd0,0xdb + +# ATT: {evex} rcrb %cl, %bl +# INTEL: {evex} rcr bl, cl +0x62,0xf4,0x7c,0x08,0xd2,0xdb + +# ATT: rcrb %cl, %bl, %bl +# INTEL: rcr bl, bl, cl +0x62,0xf4,0x64,0x18,0xd2,0xdb + +# ATT: {evex} rcrw %cl, %dx +# INTEL: {evex} rcr dx, cl +0x62,0xf4,0x7d,0x08,0xd3,0xda + +# ATT: rcrw %cl, %dx, %dx +# INTEL: rcr dx, dx, cl +0x62,0xf4,0x6d,0x18,0xd3,0xda + +# ATT: {evex} rcrl %cl, %ecx +# INTEL: {evex} rcr ecx, cl +0x62,0xf4,0x7c,0x08,0xd3,0xd9 + +# ATT: rcrl %cl, %ecx, %ecx +# INTEL: rcr ecx, ecx, cl +0x62,0xf4,0x74,0x18,0xd3,0xd9 + +# ATT: {evex} rcrq %cl, %r9 +# INTEL: {evex} rcr r9, cl 
+0x62,0xd4,0xfc,0x08,0xd3,0xd9 + +# ATT: rcrq %cl, %r9, %r9 +# INTEL: rcr r9, r9, cl +0x62,0xd4,0xb4,0x18,0xd3,0xd9 + +# ATT: {evex} rcrb %cl, 291(%r8,%rax,4) +# INTEL: {evex} rcr byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd2,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rcrb %cl, 291(%r8,%rax,4), %bl +# INTEL: rcr bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x18,0xd2,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rcrw %cl, 291(%r8,%rax,4) +# INTEL: {evex} rcr word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x08,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rcrw %cl, 291(%r8,%rax,4), %dx +# INTEL: rcr dx, word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x6d,0x18,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rcrl %cl, 291(%r8,%rax,4) +# INTEL: {evex} rcr dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rcrl %cl, 291(%r8,%rax,4), %ecx +# INTEL: rcr ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x18,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rcrq %cl, 291(%r8,%rax,4) +# INTEL: {evex} rcr qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x08,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rcrq %cl, 291(%r8,%rax,4), %r9 +# INTEL: rcr r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x18,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rcrw %dx +# INTEL: {evex} rcr dx +0x62,0xf4,0x7d,0x08,0xd1,0xda + +# ATT: rcrw %dx, %dx +# INTEL: rcr dx, dx +0x62,0xf4,0x6d,0x18,0xd1,0xda + +# ATT: {evex} rcrl %ecx +# INTEL: {evex} rcr ecx +0x62,0xf4,0x7c,0x08,0xd1,0xd9 + +# ATT: rcrl %ecx, %ecx +# INTEL: rcr ecx, ecx +0x62,0xf4,0x74,0x18,0xd1,0xd9 + +# ATT: {evex} rcrq %r9 +# INTEL: {evex} rcr r9 +0x62,0xd4,0xfc,0x08,0xd1,0xd9 + +# ATT: rcrq %r9, %r9 +# INTEL: rcr r9, r9 +0x62,0xd4,0xb4,0x18,0xd1,0xd9 + +# ATT: {evex} rcrb 291(%r8,%rax,4) +# INTEL: {evex} rcr byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd0,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rcrb 291(%r8,%rax,4), %bl +# INTEL: rcr bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x18,0xd0,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rcrw 291(%r8,%rax,4) +# INTEL: {evex} rcr word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x08,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rcrw 291(%r8,%rax,4), %dx +# INTEL: rcr dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x18,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rcrl 291(%r8,%rax,4) +# INTEL: {evex} rcr dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rcrl 291(%r8,%rax,4), %ecx +# INTEL: rcr ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x18,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rcrq 291(%r8,%rax,4) +# INTEL: {evex} rcr qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x08,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rcrq 291(%r8,%rax,4), %r9 +# INTEL: rcr r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x18,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/rol.txt b/llvm/test/MC/Disassembler/X86/apx/rol.txt new file mode 100644 index 0000000000000..bb713d1c88566 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/rol.txt @@ -0,0 +1,386 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: {evex} rolb $123, %bl +# INTEL: {evex} rol bl, 123 +0x62,0xf4,0x7c,0x08,0xc0,0xc3,0x7b + +# ATT: {nf} rolb $123, %bl +# INTEL: {nf} rol bl, 123 +0x62,0xf4,0x7c,0x0c,0xc0,0xc3,0x7b + +# ATT: rolb 
$123, %bl, %bl +# INTEL: rol bl, bl, 123 +0x62,0xf4,0x64,0x18,0xc0,0xc3,0x7b + +# ATT: {nf} rolb $123, %bl, %bl +# INTEL: {nf} rol bl, bl, 123 +0x62,0xf4,0x64,0x1c,0xc0,0xc3,0x7b + +# ATT: {evex} rolw $123, %dx +# INTEL: {evex} rol dx, 123 +0x62,0xf4,0x7d,0x08,0xc1,0xc2,0x7b + +# ATT: {nf} rolw $123, %dx +# INTEL: {nf} rol dx, 123 +0x62,0xf4,0x7d,0x0c,0xc1,0xc2,0x7b + +# ATT: rolw $123, %dx, %dx +# INTEL: rol dx, dx, 123 +0x62,0xf4,0x6d,0x18,0xc1,0xc2,0x7b + +# ATT: {nf} rolw $123, %dx, %dx +# INTEL: {nf} rol dx, dx, 123 +0x62,0xf4,0x6d,0x1c,0xc1,0xc2,0x7b + +# ATT: {evex} roll $123, %ecx +# INTEL: {evex} rol ecx, 123 +0x62,0xf4,0x7c,0x08,0xc1,0xc1,0x7b + +# ATT: {nf} roll $123, %ecx +# INTEL: {nf} rol ecx, 123 +0x62,0xf4,0x7c,0x0c,0xc1,0xc1,0x7b + +# ATT: roll $123, %ecx, %ecx +# INTEL: rol ecx, ecx, 123 +0x62,0xf4,0x74,0x18,0xc1,0xc1,0x7b + +# ATT: {nf} roll $123, %ecx, %ecx +# INTEL: {nf} rol ecx, ecx, 123 +0x62,0xf4,0x74,0x1c,0xc1,0xc1,0x7b + +# ATT: {evex} rolq $123, %r9 +# INTEL: {evex} rol r9, 123 +0x62,0xd4,0xfc,0x08,0xc1,0xc1,0x7b + +# ATT: {nf} rolq $123, %r9 +# INTEL: {nf} rol r9, 123 +0x62,0xd4,0xfc,0x0c,0xc1,0xc1,0x7b + +# ATT: rolq $123, %r9, %r9 +# INTEL: rol r9, r9, 123 +0x62,0xd4,0xb4,0x18,0xc1,0xc1,0x7b + +# ATT: {nf} rolq $123, %r9, %r9 +# INTEL: {nf} rol r9, r9, 123 +0x62,0xd4,0xb4,0x1c,0xc1,0xc1,0x7b + +# ATT: {evex} rolb $123, 291(%r8,%rax,4) +# INTEL: {evex} rol byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rolb $123, 291(%r8,%rax,4) +# INTEL: {nf} rol byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x0c,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rolb $123, 291(%r8,%rax,4), %bl +# INTEL: rol bl, byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x64,0x18,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rolb $123, 291(%r8,%rax,4), %bl +# INTEL: {nf} rol bl, byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x64,0x1c,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rolw $123, 291(%r8,%rax,4) +# INTEL: {evex} rol word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x08,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rolw $123, 291(%r8,%rax,4) +# INTEL: {nf} rol word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x0c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rolw $123, 291(%r8,%rax,4), %dx +# INTEL: rol dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x18,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rolw $123, 291(%r8,%rax,4), %dx +# INTEL: {nf} rol dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x1c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} roll $123, 291(%r8,%rax,4) +# INTEL: {evex} rol dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} roll $123, 291(%r8,%rax,4) +# INTEL: {nf} rol dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x0c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: roll $123, 291(%r8,%rax,4), %ecx +# INTEL: rol ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x18,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} roll $123, 291(%r8,%rax,4), %ecx +# INTEL: {nf} rol ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x1c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rolq $123, 291(%r8,%rax,4) +# INTEL: {evex} rol qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xfc,0x08,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rolq $123, 291(%r8,%rax,4) +# INTEL: {nf} rol qword ptr [r8 + 4*rax + 291], 123 
+0x62,0xd4,0xfc,0x0c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rolq $123, 291(%r8,%rax,4), %r9 +# INTEL: rol r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x18,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rolq $123, 291(%r8,%rax,4), %r9 +# INTEL: {nf} rol r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x1c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rolb %bl +# INTEL: {evex} rol bl +0x62,0xf4,0x7c,0x08,0xd0,0xc3 + +# ATT: {nf} rolb %bl +# INTEL: {nf} rol bl +0x62,0xf4,0x7c,0x0c,0xd0,0xc3 + +# ATT: rolb %bl, %bl +# INTEL: rol bl, bl +0x62,0xf4,0x64,0x18,0xd0,0xc3 + +# ATT: {nf} rolb %bl, %bl +# INTEL: {nf} rol bl, bl +0x62,0xf4,0x64,0x1c,0xd0,0xc3 + +# ATT: {evex} rolb %cl, %bl +# INTEL: {evex} rol bl, cl +0x62,0xf4,0x7c,0x08,0xd2,0xc3 + +# ATT: {nf} rolb %cl, %bl +# INTEL: {nf} rol bl, cl +0x62,0xf4,0x7c,0x0c,0xd2,0xc3 + +# ATT: rolb %cl, %bl, %bl +# INTEL: rol bl, bl, cl +0x62,0xf4,0x64,0x18,0xd2,0xc3 + +# ATT: {nf} rolb %cl, %bl, %bl +# INTEL: {nf} rol bl, bl, cl +0x62,0xf4,0x64,0x1c,0xd2,0xc3 + +# ATT: {evex} rolw %cl, %dx +# INTEL: {evex} rol dx, cl +0x62,0xf4,0x7d,0x08,0xd3,0xc2 + +# ATT: {nf} rolw %cl, %dx +# INTEL: {nf} rol dx, cl +0x62,0xf4,0x7d,0x0c,0xd3,0xc2 + +# ATT: rolw %cl, %dx, %dx +# INTEL: rol dx, dx, cl +0x62,0xf4,0x6d,0x18,0xd3,0xc2 + +# ATT: {nf} rolw %cl, %dx, %dx +# INTEL: {nf} rol dx, dx, cl +0x62,0xf4,0x6d,0x1c,0xd3,0xc2 + +# ATT: {evex} roll %cl, %ecx +# INTEL: {evex} rol ecx, cl +0x62,0xf4,0x7c,0x08,0xd3,0xc1 + +# ATT: {nf} roll %cl, %ecx +# INTEL: {nf} rol ecx, cl +0x62,0xf4,0x7c,0x0c,0xd3,0xc1 + +# ATT: roll %cl, %ecx, %ecx +# INTEL: rol ecx, ecx, cl +0x62,0xf4,0x74,0x18,0xd3,0xc1 + +# ATT: {nf} roll %cl, %ecx, %ecx +# INTEL: {nf} rol ecx, ecx, cl +0x62,0xf4,0x74,0x1c,0xd3,0xc1 + +# ATT: {evex} rolq %cl, %r9 +# INTEL: {evex} rol r9, cl +0x62,0xd4,0xfc,0x08,0xd3,0xc1 + +# ATT: {nf} rolq %cl, %r9 +# INTEL: {nf} rol r9, cl +0x62,0xd4,0xfc,0x0c,0xd3,0xc1 + +# ATT: rolq %cl, %r9, %r9 +# INTEL: rol r9, r9, cl +0x62,0xd4,0xb4,0x18,0xd3,0xc1 + +# ATT: {nf} rolq %cl, %r9, %r9 +# INTEL: {nf} rol r9, r9, cl +0x62,0xd4,0xb4,0x1c,0xd3,0xc1 + +# ATT: {evex} rolb %cl, 291(%r8,%rax,4) +# INTEL: {evex} rol byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd2,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolb %cl, 291(%r8,%rax,4) +# INTEL: {nf} rol byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x0c,0xd2,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: rolb %cl, 291(%r8,%rax,4), %bl +# INTEL: rol bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x18,0xd2,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolb %cl, 291(%r8,%rax,4), %bl +# INTEL: {nf} rol bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x1c,0xd2,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rolw %cl, 291(%r8,%rax,4) +# INTEL: {evex} rol word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x08,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolw %cl, 291(%r8,%rax,4) +# INTEL: {nf} rol word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x0c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: rolw %cl, 291(%r8,%rax,4), %dx +# INTEL: rol dx, word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x6d,0x18,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolw %cl, 291(%r8,%rax,4), %dx +# INTEL: {nf} rol dx, word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x6d,0x1c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} roll %cl, 291(%r8,%rax,4) +# INTEL: {evex} rol dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} roll %cl, 291(%r8,%rax,4) +# INTEL: {nf} 
rol dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x0c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: roll %cl, 291(%r8,%rax,4), %ecx +# INTEL: rol ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x18,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} roll %cl, 291(%r8,%rax,4), %ecx +# INTEL: {nf} rol ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x1c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rolq %cl, 291(%r8,%rax,4) +# INTEL: {evex} rol qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x08,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolq %cl, 291(%r8,%rax,4) +# INTEL: {nf} rol qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x0c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: rolq %cl, 291(%r8,%rax,4), %r9 +# INTEL: rol r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x18,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolq %cl, 291(%r8,%rax,4), %r9 +# INTEL: {nf} rol r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x1c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rolw %dx +# INTEL: {evex} rol dx +0x62,0xf4,0x7d,0x08,0xd1,0xc2 + +# ATT: {nf} rolw %dx +# INTEL: {nf} rol dx +0x62,0xf4,0x7d,0x0c,0xd1,0xc2 + +# ATT: rolw %dx, %dx +# INTEL: rol dx, dx +0x62,0xf4,0x6d,0x18,0xd1,0xc2 + +# ATT: {nf} rolw %dx, %dx +# INTEL: {nf} rol dx, dx +0x62,0xf4,0x6d,0x1c,0xd1,0xc2 + +# ATT: {evex} roll %ecx +# INTEL: {evex} rol ecx +0x62,0xf4,0x7c,0x08,0xd1,0xc1 + +# ATT: {nf} roll %ecx +# INTEL: {nf} rol ecx +0x62,0xf4,0x7c,0x0c,0xd1,0xc1 + +# ATT: roll %ecx, %ecx +# INTEL: rol ecx, ecx +0x62,0xf4,0x74,0x18,0xd1,0xc1 + +# ATT: {nf} roll %ecx, %ecx +# INTEL: {nf} rol ecx, ecx +0x62,0xf4,0x74,0x1c,0xd1,0xc1 + +# ATT: {evex} rolq %r9 +# INTEL: {evex} rol r9 +0x62,0xd4,0xfc,0x08,0xd1,0xc1 + +# ATT: {nf} rolq %r9 +# INTEL: {nf} rol r9 +0x62,0xd4,0xfc,0x0c,0xd1,0xc1 + +# ATT: rolq %r9, %r9 +# INTEL: rol r9, r9 +0x62,0xd4,0xb4,0x18,0xd1,0xc1 + +# ATT: {nf} rolq %r9, %r9 +# INTEL: {nf} rol r9, r9 +0x62,0xd4,0xb4,0x1c,0xd1,0xc1 + +# ATT: {evex} rolb 291(%r8,%rax,4) +# INTEL: {evex} rol byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd0,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolb 291(%r8,%rax,4) +# INTEL: {nf} rol byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xd0,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: rolb 291(%r8,%rax,4), %bl +# INTEL: rol bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x18,0xd0,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolb 291(%r8,%rax,4), %bl +# INTEL: {nf} rol bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x1c,0xd0,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rolw 291(%r8,%rax,4) +# INTEL: {evex} rol word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x08,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolw 291(%r8,%rax,4) +# INTEL: {nf} rol word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x0c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: rolw 291(%r8,%rax,4), %dx +# INTEL: rol dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x18,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolw 291(%r8,%rax,4), %dx +# INTEL: {nf} rol dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x1c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} roll 291(%r8,%rax,4) +# INTEL: {evex} rol dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} roll 291(%r8,%rax,4) +# INTEL: {nf} rol dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: roll 291(%r8,%rax,4), %ecx +# INTEL: rol ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x18,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: 
{nf} roll 291(%r8,%rax,4), %ecx +# INTEL: {nf} rol ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x1c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rolq 291(%r8,%rax,4) +# INTEL: {evex} rol qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x08,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolq 291(%r8,%rax,4) +# INTEL: {nf} rol qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x0c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: rolq 291(%r8,%rax,4), %r9 +# INTEL: rol r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x18,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rolq 291(%r8,%rax,4), %r9 +# INTEL: {nf} rol r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x1c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/ror.txt b/llvm/test/MC/Disassembler/X86/apx/ror.txt new file mode 100644 index 0000000000000..484a3e143fdac --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/ror.txt @@ -0,0 +1,386 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: {evex} rorb $123, %bl +# INTEL: {evex} ror bl, 123 +0x62,0xf4,0x7c,0x08,0xc0,0xcb,0x7b + +# ATT: {nf} rorb $123, %bl +# INTEL: {nf} ror bl, 123 +0x62,0xf4,0x7c,0x0c,0xc0,0xcb,0x7b + +# ATT: rorb $123, %bl, %bl +# INTEL: ror bl, bl, 123 +0x62,0xf4,0x64,0x18,0xc0,0xcb,0x7b + +# ATT: {nf} rorb $123, %bl, %bl +# INTEL: {nf} ror bl, bl, 123 +0x62,0xf4,0x64,0x1c,0xc0,0xcb,0x7b + +# ATT: {evex} rorw $123, %dx +# INTEL: {evex} ror dx, 123 +0x62,0xf4,0x7d,0x08,0xc1,0xca,0x7b + +# ATT: {nf} rorw $123, %dx +# INTEL: {nf} ror dx, 123 +0x62,0xf4,0x7d,0x0c,0xc1,0xca,0x7b + +# ATT: rorw $123, %dx, %dx +# INTEL: ror dx, dx, 123 +0x62,0xf4,0x6d,0x18,0xc1,0xca,0x7b + +# ATT: {nf} rorw $123, %dx, %dx +# INTEL: {nf} ror dx, dx, 123 +0x62,0xf4,0x6d,0x1c,0xc1,0xca,0x7b + +# ATT: {evex} rorl $123, %ecx +# INTEL: {evex} ror ecx, 123 +0x62,0xf4,0x7c,0x08,0xc1,0xc9,0x7b + +# ATT: {nf} rorl $123, %ecx +# INTEL: {nf} ror ecx, 123 +0x62,0xf4,0x7c,0x0c,0xc1,0xc9,0x7b + +# ATT: rorl $123, %ecx, %ecx +# INTEL: ror ecx, ecx, 123 +0x62,0xf4,0x74,0x18,0xc1,0xc9,0x7b + +# ATT: {nf} rorl $123, %ecx, %ecx +# INTEL: {nf} ror ecx, ecx, 123 +0x62,0xf4,0x74,0x1c,0xc1,0xc9,0x7b + +# ATT: {evex} rorq $123, %r9 +# INTEL: {evex} ror r9, 123 +0x62,0xd4,0xfc,0x08,0xc1,0xc9,0x7b + +# ATT: {nf} rorq $123, %r9 +# INTEL: {nf} ror r9, 123 +0x62,0xd4,0xfc,0x0c,0xc1,0xc9,0x7b + +# ATT: rorq $123, %r9, %r9 +# INTEL: ror r9, r9, 123 +0x62,0xd4,0xb4,0x18,0xc1,0xc9,0x7b + +# ATT: {nf} rorq $123, %r9, %r9 +# INTEL: {nf} ror r9, r9, 123 +0x62,0xd4,0xb4,0x1c,0xc1,0xc9,0x7b + +# ATT: {evex} rorb $123, 291(%r8,%rax,4) +# INTEL: {evex} ror byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rorb $123, 291(%r8,%rax,4) +# INTEL: {nf} ror byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x0c,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rorb $123, 291(%r8,%rax,4), %bl +# INTEL: ror bl, byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x64,0x18,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rorb $123, 291(%r8,%rax,4), %bl +# INTEL: {nf} ror bl, byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x64,0x1c,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rorw $123, 291(%r8,%rax,4) +# INTEL: {evex} ror word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x08,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rorw $123, 291(%r8,%rax,4) +# INTEL: {nf} ror word ptr [r8 + 4*rax 
+ 291], 123 +0x62,0xd4,0x7d,0x0c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rorw $123, 291(%r8,%rax,4), %dx +# INTEL: ror dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x18,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rorw $123, 291(%r8,%rax,4), %dx +# INTEL: {nf} ror dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x1c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rorl $123, 291(%r8,%rax,4) +# INTEL: {evex} ror dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rorl $123, 291(%r8,%rax,4) +# INTEL: {nf} ror dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x0c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rorl $123, 291(%r8,%rax,4), %ecx +# INTEL: ror ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x18,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rorl $123, 291(%r8,%rax,4), %ecx +# INTEL: {nf} ror ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x1c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rorq $123, 291(%r8,%rax,4) +# INTEL: {evex} ror qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xfc,0x08,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rorq $123, 291(%r8,%rax,4) +# INTEL: {nf} ror qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xfc,0x0c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: rorq $123, 291(%r8,%rax,4), %r9 +# INTEL: ror r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x18,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} rorq $123, 291(%r8,%rax,4), %r9 +# INTEL: {nf} ror r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x1c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} rorb %bl +# INTEL: {evex} ror bl +0x62,0xf4,0x7c,0x08,0xd0,0xcb + +# ATT: {nf} rorb %bl +# INTEL: {nf} ror bl +0x62,0xf4,0x7c,0x0c,0xd0,0xcb + +# ATT: rorb %bl, %bl +# INTEL: ror bl, bl +0x62,0xf4,0x64,0x18,0xd0,0xcb + +# ATT: {nf} rorb %bl, %bl +# INTEL: {nf} ror bl, bl +0x62,0xf4,0x64,0x1c,0xd0,0xcb + +# ATT: {evex} rorb %cl, %bl +# INTEL: {evex} ror bl, cl +0x62,0xf4,0x7c,0x08,0xd2,0xcb + +# ATT: {nf} rorb %cl, %bl +# INTEL: {nf} ror bl, cl +0x62,0xf4,0x7c,0x0c,0xd2,0xcb + +# ATT: rorb %cl, %bl, %bl +# INTEL: ror bl, bl, cl +0x62,0xf4,0x64,0x18,0xd2,0xcb + +# ATT: {nf} rorb %cl, %bl, %bl +# INTEL: {nf} ror bl, bl, cl +0x62,0xf4,0x64,0x1c,0xd2,0xcb + +# ATT: {evex} rorw %cl, %dx +# INTEL: {evex} ror dx, cl +0x62,0xf4,0x7d,0x08,0xd3,0xca + +# ATT: {nf} rorw %cl, %dx +# INTEL: {nf} ror dx, cl +0x62,0xf4,0x7d,0x0c,0xd3,0xca + +# ATT: rorw %cl, %dx, %dx +# INTEL: ror dx, dx, cl +0x62,0xf4,0x6d,0x18,0xd3,0xca + +# ATT: {nf} rorw %cl, %dx, %dx +# INTEL: {nf} ror dx, dx, cl +0x62,0xf4,0x6d,0x1c,0xd3,0xca + +# ATT: {evex} rorl %cl, %ecx +# INTEL: {evex} ror ecx, cl +0x62,0xf4,0x7c,0x08,0xd3,0xc9 + +# ATT: {nf} rorl %cl, %ecx +# INTEL: {nf} ror ecx, cl +0x62,0xf4,0x7c,0x0c,0xd3,0xc9 + +# ATT: rorl %cl, %ecx, %ecx +# INTEL: ror ecx, ecx, cl +0x62,0xf4,0x74,0x18,0xd3,0xc9 + +# ATT: {nf} rorl %cl, %ecx, %ecx +# INTEL: {nf} ror ecx, ecx, cl +0x62,0xf4,0x74,0x1c,0xd3,0xc9 + +# ATT: {evex} rorq %cl, %r9 +# INTEL: {evex} ror r9, cl +0x62,0xd4,0xfc,0x08,0xd3,0xc9 + +# ATT: {nf} rorq %cl, %r9 +# INTEL: {nf} ror r9, cl +0x62,0xd4,0xfc,0x0c,0xd3,0xc9 + +# ATT: rorq %cl, %r9, %r9 +# INTEL: ror r9, r9, cl +0x62,0xd4,0xb4,0x18,0xd3,0xc9 + +# ATT: {nf} rorq %cl, %r9, %r9 +# INTEL: {nf} ror r9, r9, cl +0x62,0xd4,0xb4,0x1c,0xd3,0xc9 + +# ATT: {evex} rorb %cl, 291(%r8,%rax,4) +# INTEL: {evex} ror byte ptr [r8 + 4*rax + 291], cl 
+0x62,0xd4,0x7c,0x08,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorb %cl, 291(%r8,%rax,4) +# INTEL: {nf} ror byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x0c,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rorb %cl, 291(%r8,%rax,4), %bl +# INTEL: ror bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x18,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorb %cl, 291(%r8,%rax,4), %bl +# INTEL: {nf} ror bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x1c,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rorw %cl, 291(%r8,%rax,4) +# INTEL: {evex} ror word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x08,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorw %cl, 291(%r8,%rax,4) +# INTEL: {nf} ror word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x0c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rorw %cl, 291(%r8,%rax,4), %dx +# INTEL: ror dx, word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x6d,0x18,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorw %cl, 291(%r8,%rax,4), %dx +# INTEL: {nf} ror dx, word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x6d,0x1c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rorl %cl, 291(%r8,%rax,4) +# INTEL: {evex} ror dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorl %cl, 291(%r8,%rax,4) +# INTEL: {nf} ror dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x0c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rorl %cl, 291(%r8,%rax,4), %ecx +# INTEL: ror ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x18,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorl %cl, 291(%r8,%rax,4), %ecx +# INTEL: {nf} ror ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x1c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rorq %cl, 291(%r8,%rax,4) +# INTEL: {evex} ror qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x08,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorq %cl, 291(%r8,%rax,4) +# INTEL: {nf} ror qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x0c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rorq %cl, 291(%r8,%rax,4), %r9 +# INTEL: ror r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x18,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorq %cl, 291(%r8,%rax,4), %r9 +# INTEL: {nf} ror r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x1c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rorw %dx +# INTEL: {evex} ror dx +0x62,0xf4,0x7d,0x08,0xd1,0xca + +# ATT: {nf} rorw %dx +# INTEL: {nf} ror dx +0x62,0xf4,0x7d,0x0c,0xd1,0xca + +# ATT: rorw %dx, %dx +# INTEL: ror dx, dx +0x62,0xf4,0x6d,0x18,0xd1,0xca + +# ATT: {nf} rorw %dx, %dx +# INTEL: {nf} ror dx, dx +0x62,0xf4,0x6d,0x1c,0xd1,0xca + +# ATT: {evex} rorl %ecx +# INTEL: {evex} ror ecx +0x62,0xf4,0x7c,0x08,0xd1,0xc9 + +# ATT: {nf} rorl %ecx +# INTEL: {nf} ror ecx +0x62,0xf4,0x7c,0x0c,0xd1,0xc9 + +# ATT: rorl %ecx, %ecx +# INTEL: ror ecx, ecx +0x62,0xf4,0x74,0x18,0xd1,0xc9 + +# ATT: {nf} rorl %ecx, %ecx +# INTEL: {nf} ror ecx, ecx +0x62,0xf4,0x74,0x1c,0xd1,0xc9 + +# ATT: {evex} rorq %r9 +# INTEL: {evex} ror r9 +0x62,0xd4,0xfc,0x08,0xd1,0xc9 + +# ATT: {nf} rorq %r9 +# INTEL: {nf} ror r9 +0x62,0xd4,0xfc,0x0c,0xd1,0xc9 + +# ATT: rorq %r9, %r9 +# INTEL: ror r9, r9 +0x62,0xd4,0xb4,0x18,0xd1,0xc9 + +# ATT: {nf} rorq %r9, %r9 +# INTEL: {nf} ror r9, r9 +0x62,0xd4,0xb4,0x1c,0xd1,0xc9 + +# ATT: {evex} rorb 291(%r8,%rax,4) +# INTEL: {evex} ror byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorb 291(%r8,%rax,4) +# INTEL: {nf} ror byte ptr [r8 + 4*rax + 291] 
+0x62,0xd4,0x7c,0x0c,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rorb 291(%r8,%rax,4), %bl +# INTEL: ror bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x18,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorb 291(%r8,%rax,4), %bl +# INTEL: {nf} ror bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x1c,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rorw 291(%r8,%rax,4) +# INTEL: {evex} ror word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x08,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorw 291(%r8,%rax,4) +# INTEL: {nf} ror word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x0c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rorw 291(%r8,%rax,4), %dx +# INTEL: ror dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x18,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorw 291(%r8,%rax,4), %dx +# INTEL: {nf} ror dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x1c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rorl 291(%r8,%rax,4) +# INTEL: {evex} ror dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorl 291(%r8,%rax,4) +# INTEL: {nf} ror dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rorl 291(%r8,%rax,4), %ecx +# INTEL: ror ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x18,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorl 291(%r8,%rax,4), %ecx +# INTEL: {nf} ror ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x1c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} rorq 291(%r8,%rax,4) +# INTEL: {evex} ror qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x08,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorq 291(%r8,%rax,4) +# INTEL: {nf} ror qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x0c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: rorq 291(%r8,%rax,4), %r9 +# INTEL: ror r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x18,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} rorq 291(%r8,%rax,4), %r9 +# INTEL: {nf} ror r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x1c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/sar.txt b/llvm/test/MC/Disassembler/X86/apx/sar.txt new file mode 100644 index 0000000000000..b5e41ee956ccd --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/sar.txt @@ -0,0 +1,386 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: {evex} sarb $123, %bl +# INTEL: {evex} sar bl, 123 +0x62,0xf4,0x7c,0x08,0xc0,0xfb,0x7b + +# ATT: {nf} sarb $123, %bl +# INTEL: {nf} sar bl, 123 +0x62,0xf4,0x7c,0x0c,0xc0,0xfb,0x7b + +# ATT: sarb $123, %bl, %bl +# INTEL: sar bl, bl, 123 +0x62,0xf4,0x64,0x18,0xc0,0xfb,0x7b + +# ATT: {nf} sarb $123, %bl, %bl +# INTEL: {nf} sar bl, bl, 123 +0x62,0xf4,0x64,0x1c,0xc0,0xfb,0x7b + +# ATT: {evex} sarw $123, %dx +# INTEL: {evex} sar dx, 123 +0x62,0xf4,0x7d,0x08,0xc1,0xfa,0x7b + +# ATT: {nf} sarw $123, %dx +# INTEL: {nf} sar dx, 123 +0x62,0xf4,0x7d,0x0c,0xc1,0xfa,0x7b + +# ATT: sarw $123, %dx, %dx +# INTEL: sar dx, dx, 123 +0x62,0xf4,0x6d,0x18,0xc1,0xfa,0x7b + +# ATT: {nf} sarw $123, %dx, %dx +# INTEL: {nf} sar dx, dx, 123 +0x62,0xf4,0x6d,0x1c,0xc1,0xfa,0x7b + +# ATT: {evex} sarl $123, %ecx +# INTEL: {evex} sar ecx, 123 +0x62,0xf4,0x7c,0x08,0xc1,0xf9,0x7b + +# ATT: {nf} sarl $123, %ecx +# INTEL: {nf} sar ecx, 123 +0x62,0xf4,0x7c,0x0c,0xc1,0xf9,0x7b + +# ATT: sarl $123, %ecx, %ecx +# INTEL: sar ecx, ecx, 123 +0x62,0xf4,0x74,0x18,0xc1,0xf9,0x7b + +# ATT: {nf} 
sarl $123, %ecx, %ecx +# INTEL: {nf} sar ecx, ecx, 123 +0x62,0xf4,0x74,0x1c,0xc1,0xf9,0x7b + +# ATT: {evex} sarq $123, %r9 +# INTEL: {evex} sar r9, 123 +0x62,0xd4,0xfc,0x08,0xc1,0xf9,0x7b + +# ATT: {nf} sarq $123, %r9 +# INTEL: {nf} sar r9, 123 +0x62,0xd4,0xfc,0x0c,0xc1,0xf9,0x7b + +# ATT: sarq $123, %r9, %r9 +# INTEL: sar r9, r9, 123 +0x62,0xd4,0xb4,0x18,0xc1,0xf9,0x7b + +# ATT: {nf} sarq $123, %r9, %r9 +# INTEL: {nf} sar r9, r9, 123 +0x62,0xd4,0xb4,0x1c,0xc1,0xf9,0x7b + +# ATT: {evex} sarb $123, 291(%r8,%rax,4) +# INTEL: {evex} sar byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} sarb $123, 291(%r8,%rax,4) +# INTEL: {nf} sar byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x0c,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: sarb $123, 291(%r8,%rax,4), %bl +# INTEL: sar bl, byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x64,0x18,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} sarb $123, 291(%r8,%rax,4), %bl +# INTEL: {nf} sar bl, byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x64,0x1c,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} sarw $123, 291(%r8,%rax,4) +# INTEL: {evex} sar word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x08,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} sarw $123, 291(%r8,%rax,4) +# INTEL: {nf} sar word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x0c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: sarw $123, 291(%r8,%rax,4), %dx +# INTEL: sar dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x18,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} sarw $123, 291(%r8,%rax,4), %dx +# INTEL: {nf} sar dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x1c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} sarl $123, 291(%r8,%rax,4) +# INTEL: {evex} sar dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} sarl $123, 291(%r8,%rax,4) +# INTEL: {nf} sar dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x0c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: sarl $123, 291(%r8,%rax,4), %ecx +# INTEL: sar ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x18,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} sarl $123, 291(%r8,%rax,4), %ecx +# INTEL: {nf} sar ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x1c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} sarq $123, 291(%r8,%rax,4) +# INTEL: {evex} sar qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xfc,0x08,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} sarq $123, 291(%r8,%rax,4) +# INTEL: {nf} sar qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xfc,0x0c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: sarq $123, 291(%r8,%rax,4), %r9 +# INTEL: sar r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x18,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} sarq $123, 291(%r8,%rax,4), %r9 +# INTEL: {nf} sar r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x1c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} sarb %bl +# INTEL: {evex} sar bl +0x62,0xf4,0x7c,0x08,0xd0,0xfb + +# ATT: {nf} sarb %bl +# INTEL: {nf} sar bl +0x62,0xf4,0x7c,0x0c,0xd0,0xfb + +# ATT: sarb %bl, %bl +# INTEL: sar bl, bl +0x62,0xf4,0x64,0x18,0xd0,0xfb + +# ATT: {nf} sarb %bl, %bl +# INTEL: {nf} sar bl, bl +0x62,0xf4,0x64,0x1c,0xd0,0xfb + +# ATT: {evex} sarb %cl, %bl +# INTEL: {evex} sar bl, cl +0x62,0xf4,0x7c,0x08,0xd2,0xfb + +# ATT: {nf} sarb %cl, %bl +# INTEL: {nf} sar bl, cl +0x62,0xf4,0x7c,0x0c,0xd2,0xfb + +# ATT: sarb %cl, %bl, %bl +# INTEL: sar 
bl, bl, cl +0x62,0xf4,0x64,0x18,0xd2,0xfb + +# ATT: {nf} sarb %cl, %bl, %bl +# INTEL: {nf} sar bl, bl, cl +0x62,0xf4,0x64,0x1c,0xd2,0xfb + +# ATT: {evex} sarw %cl, %dx +# INTEL: {evex} sar dx, cl +0x62,0xf4,0x7d,0x08,0xd3,0xfa + +# ATT: {nf} sarw %cl, %dx +# INTEL: {nf} sar dx, cl +0x62,0xf4,0x7d,0x0c,0xd3,0xfa + +# ATT: sarw %cl, %dx, %dx +# INTEL: sar dx, dx, cl +0x62,0xf4,0x6d,0x18,0xd3,0xfa + +# ATT: {nf} sarw %cl, %dx, %dx +# INTEL: {nf} sar dx, dx, cl +0x62,0xf4,0x6d,0x1c,0xd3,0xfa + +# ATT: {evex} sarl %cl, %ecx +# INTEL: {evex} sar ecx, cl +0x62,0xf4,0x7c,0x08,0xd3,0xf9 + +# ATT: {nf} sarl %cl, %ecx +# INTEL: {nf} sar ecx, cl +0x62,0xf4,0x7c,0x0c,0xd3,0xf9 + +# ATT: sarl %cl, %ecx, %ecx +# INTEL: sar ecx, ecx, cl +0x62,0xf4,0x74,0x18,0xd3,0xf9 + +# ATT: {nf} sarl %cl, %ecx, %ecx +# INTEL: {nf} sar ecx, ecx, cl +0x62,0xf4,0x74,0x1c,0xd3,0xf9 + +# ATT: {evex} sarq %cl, %r9 +# INTEL: {evex} sar r9, cl +0x62,0xd4,0xfc,0x08,0xd3,0xf9 + +# ATT: {nf} sarq %cl, %r9 +# INTEL: {nf} sar r9, cl +0x62,0xd4,0xfc,0x0c,0xd3,0xf9 + +# ATT: sarq %cl, %r9, %r9 +# INTEL: sar r9, r9, cl +0x62,0xd4,0xb4,0x18,0xd3,0xf9 + +# ATT: {nf} sarq %cl, %r9, %r9 +# INTEL: {nf} sar r9, r9, cl +0x62,0xd4,0xb4,0x1c,0xd3,0xf9 + +# ATT: {evex} sarb %cl, 291(%r8,%rax,4) +# INTEL: {evex} sar byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarb %cl, 291(%r8,%rax,4) +# INTEL: {nf} sar byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x0c,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: sarb %cl, 291(%r8,%rax,4), %bl +# INTEL: sar bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x18,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarb %cl, 291(%r8,%rax,4), %bl +# INTEL: {nf} sar bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x1c,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} sarw %cl, 291(%r8,%rax,4) +# INTEL: {evex} sar word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x08,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarw %cl, 291(%r8,%rax,4) +# INTEL: {nf} sar word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x0c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: sarw %cl, 291(%r8,%rax,4), %dx +# INTEL: sar dx, word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x6d,0x18,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarw %cl, 291(%r8,%rax,4), %dx +# INTEL: {nf} sar dx, word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x6d,0x1c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} sarl %cl, 291(%r8,%rax,4) +# INTEL: {evex} sar dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarl %cl, 291(%r8,%rax,4) +# INTEL: {nf} sar dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x0c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: sarl %cl, 291(%r8,%rax,4), %ecx +# INTEL: sar ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x18,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarl %cl, 291(%r8,%rax,4), %ecx +# INTEL: {nf} sar ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x1c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} sarq %cl, 291(%r8,%rax,4) +# INTEL: {evex} sar qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x08,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarq %cl, 291(%r8,%rax,4) +# INTEL: {nf} sar qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x0c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: sarq %cl, 291(%r8,%rax,4), %r9 +# INTEL: sar r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x18,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarq %cl, 291(%r8,%rax,4), %r9 +# INTEL: {nf} sar 
r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x1c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} sarw %dx +# INTEL: {evex} sar dx +0x62,0xf4,0x7d,0x08,0xd1,0xfa + +# ATT: {nf} sarw %dx +# INTEL: {nf} sar dx +0x62,0xf4,0x7d,0x0c,0xd1,0xfa + +# ATT: sarw %dx, %dx +# INTEL: sar dx, dx +0x62,0xf4,0x6d,0x18,0xd1,0xfa + +# ATT: {nf} sarw %dx, %dx +# INTEL: {nf} sar dx, dx +0x62,0xf4,0x6d,0x1c,0xd1,0xfa + +# ATT: {evex} sarl %ecx +# INTEL: {evex} sar ecx +0x62,0xf4,0x7c,0x08,0xd1,0xf9 + +# ATT: {nf} sarl %ecx +# INTEL: {nf} sar ecx +0x62,0xf4,0x7c,0x0c,0xd1,0xf9 + +# ATT: sarl %ecx, %ecx +# INTEL: sar ecx, ecx +0x62,0xf4,0x74,0x18,0xd1,0xf9 + +# ATT: {nf} sarl %ecx, %ecx +# INTEL: {nf} sar ecx, ecx +0x62,0xf4,0x74,0x1c,0xd1,0xf9 + +# ATT: {evex} sarq %r9 +# INTEL: {evex} sar r9 +0x62,0xd4,0xfc,0x08,0xd1,0xf9 + +# ATT: {nf} sarq %r9 +# INTEL: {nf} sar r9 +0x62,0xd4,0xfc,0x0c,0xd1,0xf9 + +# ATT: sarq %r9, %r9 +# INTEL: sar r9, r9 +0x62,0xd4,0xb4,0x18,0xd1,0xf9 + +# ATT: {nf} sarq %r9, %r9 +# INTEL: {nf} sar r9, r9 +0x62,0xd4,0xb4,0x1c,0xd1,0xf9 + +# ATT: {evex} sarb 291(%r8,%rax,4) +# INTEL: {evex} sar byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarb 291(%r8,%rax,4) +# INTEL: {nf} sar byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: sarb 291(%r8,%rax,4), %bl +# INTEL: sar bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x18,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarb 291(%r8,%rax,4), %bl +# INTEL: {nf} sar bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x1c,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} sarw 291(%r8,%rax,4) +# INTEL: {evex} sar word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x08,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarw 291(%r8,%rax,4) +# INTEL: {nf} sar word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x0c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: sarw 291(%r8,%rax,4), %dx +# INTEL: sar dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x18,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarw 291(%r8,%rax,4), %dx +# INTEL: {nf} sar dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x1c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} sarl 291(%r8,%rax,4) +# INTEL: {evex} sar dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarl 291(%r8,%rax,4) +# INTEL: {nf} sar dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: sarl 291(%r8,%rax,4), %ecx +# INTEL: sar ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x18,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarl 291(%r8,%rax,4), %ecx +# INTEL: {nf} sar ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x1c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} sarq 291(%r8,%rax,4) +# INTEL: {evex} sar qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x08,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarq 291(%r8,%rax,4) +# INTEL: {nf} sar qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x0c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: sarq 291(%r8,%rax,4), %r9 +# INTEL: sar r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x18,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} sarq 291(%r8,%rax,4), %r9 +# INTEL: {nf} sar r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x1c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/shl.txt b/llvm/test/MC/Disassembler/X86/apx/shl.txt new file mode 100644 index 0000000000000..6f0b1b9a2af2e --- /dev/null +++ 
b/llvm/test/MC/Disassembler/X86/apx/shl.txt @@ -0,0 +1,386 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: {evex} shlb $123, %bl +# INTEL: {evex} shl bl, 123 +0x62,0xf4,0x7c,0x08,0xc0,0xe3,0x7b + +# ATT: {nf} shlb $123, %bl +# INTEL: {nf} shl bl, 123 +0x62,0xf4,0x7c,0x0c,0xc0,0xe3,0x7b + +# ATT: shlb $123, %bl, %bl +# INTEL: shl bl, bl, 123 +0x62,0xf4,0x64,0x18,0xc0,0xe3,0x7b + +# ATT: {nf} shlb $123, %bl, %bl +# INTEL: {nf} shl bl, bl, 123 +0x62,0xf4,0x64,0x1c,0xc0,0xe3,0x7b + +# ATT: {evex} shlw $123, %dx +# INTEL: {evex} shl dx, 123 +0x62,0xf4,0x7d,0x08,0xc1,0xe2,0x7b + +# ATT: {nf} shlw $123, %dx +# INTEL: {nf} shl dx, 123 +0x62,0xf4,0x7d,0x0c,0xc1,0xe2,0x7b + +# ATT: shlw $123, %dx, %dx +# INTEL: shl dx, dx, 123 +0x62,0xf4,0x6d,0x18,0xc1,0xe2,0x7b + +# ATT: {nf} shlw $123, %dx, %dx +# INTEL: {nf} shl dx, dx, 123 +0x62,0xf4,0x6d,0x1c,0xc1,0xe2,0x7b + +# ATT: {evex} shll $123, %ecx +# INTEL: {evex} shl ecx, 123 +0x62,0xf4,0x7c,0x08,0xc1,0xe1,0x7b + +# ATT: {nf} shll $123, %ecx +# INTEL: {nf} shl ecx, 123 +0x62,0xf4,0x7c,0x0c,0xc1,0xe1,0x7b + +# ATT: shll $123, %ecx, %ecx +# INTEL: shl ecx, ecx, 123 +0x62,0xf4,0x74,0x18,0xc1,0xe1,0x7b + +# ATT: {nf} shll $123, %ecx, %ecx +# INTEL: {nf} shl ecx, ecx, 123 +0x62,0xf4,0x74,0x1c,0xc1,0xe1,0x7b + +# ATT: {evex} shlq $123, %r9 +# INTEL: {evex} shl r9, 123 +0x62,0xd4,0xfc,0x08,0xc1,0xe1,0x7b + +# ATT: {nf} shlq $123, %r9 +# INTEL: {nf} shl r9, 123 +0x62,0xd4,0xfc,0x0c,0xc1,0xe1,0x7b + +# ATT: shlq $123, %r9, %r9 +# INTEL: shl r9, r9, 123 +0x62,0xd4,0xb4,0x18,0xc1,0xe1,0x7b + +# ATT: {nf} shlq $123, %r9, %r9 +# INTEL: {nf} shl r9, r9, 123 +0x62,0xd4,0xb4,0x1c,0xc1,0xe1,0x7b + +# ATT: {evex} shlb $123, 291(%r8,%rax,4) +# INTEL: {evex} shl byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shlb $123, 291(%r8,%rax,4) +# INTEL: {nf} shl byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x0c,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shlb $123, 291(%r8,%rax,4), %bl +# INTEL: shl bl, byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x64,0x18,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shlb $123, 291(%r8,%rax,4), %bl +# INTEL: {nf} shl bl, byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x64,0x1c,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shlw $123, 291(%r8,%rax,4) +# INTEL: {evex} shl word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x08,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shlw $123, 291(%r8,%rax,4) +# INTEL: {nf} shl word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x0c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shlw $123, 291(%r8,%rax,4), %dx +# INTEL: shl dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x18,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shlw $123, 291(%r8,%rax,4), %dx +# INTEL: {nf} shl dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x1c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shll $123, 291(%r8,%rax,4) +# INTEL: {evex} shl dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shll $123, 291(%r8,%rax,4) +# INTEL: {nf} shl dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x0c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shll $123, 291(%r8,%rax,4), %ecx +# INTEL: shl ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x18,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shll 
$123, 291(%r8,%rax,4), %ecx +# INTEL: {nf} shl ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x1c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shlq $123, 291(%r8,%rax,4) +# INTEL: {evex} shl qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xfc,0x08,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shlq $123, 291(%r8,%rax,4) +# INTEL: {nf} shl qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xfc,0x0c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shlq $123, 291(%r8,%rax,4), %r9 +# INTEL: shl r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x18,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shlq $123, 291(%r8,%rax,4), %r9 +# INTEL: {nf} shl r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x1c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shlb %bl +# INTEL: {evex} shl bl +0x62,0xf4,0x7c,0x08,0xd0,0xe3 + +# ATT: {nf} shlb %bl +# INTEL: {nf} shl bl +0x62,0xf4,0x7c,0x0c,0xd0,0xe3 + +# ATT: shlb %bl, %bl +# INTEL: shl bl, bl +0x62,0xf4,0x64,0x18,0xd0,0xe3 + +# ATT: {nf} shlb %bl, %bl +# INTEL: {nf} shl bl, bl +0x62,0xf4,0x64,0x1c,0xd0,0xe3 + +# ATT: {evex} shlb %cl, %bl +# INTEL: {evex} shl bl, cl +0x62,0xf4,0x7c,0x08,0xd2,0xe3 + +# ATT: {nf} shlb %cl, %bl +# INTEL: {nf} shl bl, cl +0x62,0xf4,0x7c,0x0c,0xd2,0xe3 + +# ATT: shlb %cl, %bl, %bl +# INTEL: shl bl, bl, cl +0x62,0xf4,0x64,0x18,0xd2,0xe3 + +# ATT: {nf} shlb %cl, %bl, %bl +# INTEL: {nf} shl bl, bl, cl +0x62,0xf4,0x64,0x1c,0xd2,0xe3 + +# ATT: {evex} shlw %cl, %dx +# INTEL: {evex} shl dx, cl +0x62,0xf4,0x7d,0x08,0xd3,0xe2 + +# ATT: {nf} shlw %cl, %dx +# INTEL: {nf} shl dx, cl +0x62,0xf4,0x7d,0x0c,0xd3,0xe2 + +# ATT: shlw %cl, %dx, %dx +# INTEL: shl dx, dx, cl +0x62,0xf4,0x6d,0x18,0xd3,0xe2 + +# ATT: {nf} shlw %cl, %dx, %dx +# INTEL: {nf} shl dx, dx, cl +0x62,0xf4,0x6d,0x1c,0xd3,0xe2 + +# ATT: {evex} shll %cl, %ecx +# INTEL: {evex} shl ecx, cl +0x62,0xf4,0x7c,0x08,0xd3,0xe1 + +# ATT: {nf} shll %cl, %ecx +# INTEL: {nf} shl ecx, cl +0x62,0xf4,0x7c,0x0c,0xd3,0xe1 + +# ATT: shll %cl, %ecx, %ecx +# INTEL: shl ecx, ecx, cl +0x62,0xf4,0x74,0x18,0xd3,0xe1 + +# ATT: {nf} shll %cl, %ecx, %ecx +# INTEL: {nf} shl ecx, ecx, cl +0x62,0xf4,0x74,0x1c,0xd3,0xe1 + +# ATT: {evex} shlq %cl, %r9 +# INTEL: {evex} shl r9, cl +0x62,0xd4,0xfc,0x08,0xd3,0xe1 + +# ATT: {nf} shlq %cl, %r9 +# INTEL: {nf} shl r9, cl +0x62,0xd4,0xfc,0x0c,0xd3,0xe1 + +# ATT: shlq %cl, %r9, %r9 +# INTEL: shl r9, r9, cl +0x62,0xd4,0xb4,0x18,0xd3,0xe1 + +# ATT: {nf} shlq %cl, %r9, %r9 +# INTEL: {nf} shl r9, r9, cl +0x62,0xd4,0xb4,0x1c,0xd3,0xe1 + +# ATT: {evex} shlb %cl, 291(%r8,%rax,4) +# INTEL: {evex} shl byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlb %cl, 291(%r8,%rax,4) +# INTEL: {nf} shl byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x0c,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: shlb %cl, 291(%r8,%rax,4), %bl +# INTEL: shl bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x18,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlb %cl, 291(%r8,%rax,4), %bl +# INTEL: {nf} shl bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x1c,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shlw %cl, 291(%r8,%rax,4) +# INTEL: {evex} shl word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x08,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlw %cl, 291(%r8,%rax,4) +# INTEL: {nf} shl word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x0c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: shlw %cl, 291(%r8,%rax,4), %dx +# INTEL: shl dx, word ptr [r8 + 4*rax + 291], cl 
+0x62,0xd4,0x6d,0x18,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlw %cl, 291(%r8,%rax,4), %dx +# INTEL: {nf} shl dx, word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x6d,0x1c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shll %cl, 291(%r8,%rax,4) +# INTEL: {evex} shl dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shll %cl, 291(%r8,%rax,4) +# INTEL: {nf} shl dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x0c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: shll %cl, 291(%r8,%rax,4), %ecx +# INTEL: shl ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x18,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shll %cl, 291(%r8,%rax,4), %ecx +# INTEL: {nf} shl ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x1c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shlq %cl, 291(%r8,%rax,4) +# INTEL: {evex} shl qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x08,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlq %cl, 291(%r8,%rax,4) +# INTEL: {nf} shl qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x0c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: shlq %cl, 291(%r8,%rax,4), %r9 +# INTEL: shl r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x18,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlq %cl, 291(%r8,%rax,4), %r9 +# INTEL: {nf} shl r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x1c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shlw %dx +# INTEL: {evex} shl dx +0x62,0xf4,0x7d,0x08,0xd1,0xe2 + +# ATT: {nf} shlw %dx +# INTEL: {nf} shl dx +0x62,0xf4,0x7d,0x0c,0xd1,0xe2 + +# ATT: shlw %dx, %dx +# INTEL: shl dx, dx +0x62,0xf4,0x6d,0x18,0xd1,0xe2 + +# ATT: {nf} shlw %dx, %dx +# INTEL: {nf} shl dx, dx +0x62,0xf4,0x6d,0x1c,0xd1,0xe2 + +# ATT: {evex} shll %ecx +# INTEL: {evex} shl ecx +0x62,0xf4,0x7c,0x08,0xd1,0xe1 + +# ATT: {nf} shll %ecx +# INTEL: {nf} shl ecx +0x62,0xf4,0x7c,0x0c,0xd1,0xe1 + +# ATT: shll %ecx, %ecx +# INTEL: shl ecx, ecx +0x62,0xf4,0x74,0x18,0xd1,0xe1 + +# ATT: {nf} shll %ecx, %ecx +# INTEL: {nf} shl ecx, ecx +0x62,0xf4,0x74,0x1c,0xd1,0xe1 + +# ATT: {evex} shlq %r9 +# INTEL: {evex} shl r9 +0x62,0xd4,0xfc,0x08,0xd1,0xe1 + +# ATT: {nf} shlq %r9 +# INTEL: {nf} shl r9 +0x62,0xd4,0xfc,0x0c,0xd1,0xe1 + +# ATT: shlq %r9, %r9 +# INTEL: shl r9, r9 +0x62,0xd4,0xb4,0x18,0xd1,0xe1 + +# ATT: {nf} shlq %r9, %r9 +# INTEL: {nf} shl r9, r9 +0x62,0xd4,0xb4,0x1c,0xd1,0xe1 + +# ATT: {evex} shlb 291(%r8,%rax,4) +# INTEL: {evex} shl byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlb 291(%r8,%rax,4) +# INTEL: {nf} shl byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: shlb 291(%r8,%rax,4), %bl +# INTEL: shl bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x18,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlb 291(%r8,%rax,4), %bl +# INTEL: {nf} shl bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x1c,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shlw 291(%r8,%rax,4) +# INTEL: {evex} shl word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x08,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlw 291(%r8,%rax,4) +# INTEL: {nf} shl word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x0c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: shlw 291(%r8,%rax,4), %dx +# INTEL: shl dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x18,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlw 291(%r8,%rax,4), %dx +# INTEL: {nf} shl dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x1c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: 
{evex} shll 291(%r8,%rax,4) +# INTEL: {evex} shl dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shll 291(%r8,%rax,4) +# INTEL: {nf} shl dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: shll 291(%r8,%rax,4), %ecx +# INTEL: shl ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x18,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shll 291(%r8,%rax,4), %ecx +# INTEL: {nf} shl ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x1c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shlq 291(%r8,%rax,4) +# INTEL: {evex} shl qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x08,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlq 291(%r8,%rax,4) +# INTEL: {nf} shl qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x0c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: shlq 291(%r8,%rax,4), %r9 +# INTEL: shl r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x18,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shlq 291(%r8,%rax,4), %r9 +# INTEL: {nf} shl r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x1c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/shld.txt b/llvm/test/MC/Disassembler/X86/apx/shld.txt new file mode 100644 index 0000000000000..2a633b9396040 --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/shld.txt @@ -0,0 +1,194 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: {evex} shldw $123, %dx, %dx +# INTEL: {evex} shld dx, dx, 123 +0x62,0xf4,0x7d,0x08,0x24,0xd2,0x7b + +# ATT: {nf} shldw $123, %dx, %dx +# INTEL: {nf} shld dx, dx, 123 +0x62,0xf4,0x7d,0x0c,0x24,0xd2,0x7b + +# ATT: shldw $123, %dx, %dx, %dx +# INTEL: shld dx, dx, dx, 123 +0x62,0xf4,0x6d,0x18,0x24,0xd2,0x7b + +# ATT: {nf} shldw $123, %dx, %dx, %dx +# INTEL: {nf} shld dx, dx, dx, 123 +0x62,0xf4,0x6d,0x1c,0x24,0xd2,0x7b + +# ATT: {evex} shldw $123, %dx, 291(%r8,%rax,4) +# INTEL: {evex} shld word ptr [r8 + 4*rax + 291], dx, 123 +0x62,0xd4,0x7d,0x08,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shldw $123, %dx, 291(%r8,%rax,4) +# INTEL: {nf} shld word ptr [r8 + 4*rax + 291], dx, 123 +0x62,0xd4,0x7d,0x0c,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shldw $123, %dx, 291(%r8,%rax,4), %dx +# INTEL: shld dx, word ptr [r8 + 4*rax + 291], dx, 123 +0x62,0xd4,0x6d,0x18,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shldw $123, %dx, 291(%r8,%rax,4), %dx +# INTEL: {nf} shld dx, word ptr [r8 + 4*rax + 291], dx, 123 +0x62,0xd4,0x6d,0x1c,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shldl $123, %ecx, %ecx +# INTEL: {evex} shld ecx, ecx, 123 +0x62,0xf4,0x7c,0x08,0x24,0xc9,0x7b + +# ATT: {nf} shldl $123, %ecx, %ecx +# INTEL: {nf} shld ecx, ecx, 123 +0x62,0xf4,0x7c,0x0c,0x24,0xc9,0x7b + +# ATT: shldl $123, %ecx, %ecx, %ecx +# INTEL: shld ecx, ecx, ecx, 123 +0x62,0xf4,0x74,0x18,0x24,0xc9,0x7b + +# ATT: {nf} shldl $123, %ecx, %ecx, %ecx +# INTEL: {nf} shld ecx, ecx, ecx, 123 +0x62,0xf4,0x74,0x1c,0x24,0xc9,0x7b + +# ATT: {evex} shldl $123, %ecx, 291(%r8,%rax,4) +# INTEL: {evex} shld dword ptr [r8 + 4*rax + 291], ecx, 123 +0x62,0xd4,0x7c,0x08,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shldl $123, %ecx, 291(%r8,%rax,4) +# INTEL: {nf} shld dword ptr [r8 + 4*rax + 291], ecx, 123 +0x62,0xd4,0x7c,0x0c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shldl $123, %ecx, 291(%r8,%rax,4), %ecx +# INTEL: shld ecx, dword ptr [r8 + 4*rax + 
291], ecx, 123 +0x62,0xd4,0x74,0x18,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shldl $123, %ecx, 291(%r8,%rax,4), %ecx +# INTEL: {nf} shld ecx, dword ptr [r8 + 4*rax + 291], ecx, 123 +0x62,0xd4,0x74,0x1c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shldq $123, %r9, %r9 +# INTEL: {evex} shld r9, r9, 123 +0x62,0x54,0xfc,0x08,0x24,0xc9,0x7b + +# ATT: {nf} shldq $123, %r9, %r9 +# INTEL: {nf} shld r9, r9, 123 +0x62,0x54,0xfc,0x0c,0x24,0xc9,0x7b + +# ATT: shldq $123, %r9, %r9, %r9 +# INTEL: shld r9, r9, r9, 123 +0x62,0x54,0xb4,0x18,0x24,0xc9,0x7b + +# ATT: {nf} shldq $123, %r9, %r9, %r9 +# INTEL: {nf} shld r9, r9, r9, 123 +0x62,0x54,0xb4,0x1c,0x24,0xc9,0x7b + +# ATT: {evex} shldq $123, %r9, 291(%r8,%rax,4) +# INTEL: {evex} shld qword ptr [r8 + 4*rax + 291], r9, 123 +0x62,0x54,0xfc,0x08,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shldq $123, %r9, 291(%r8,%rax,4) +# INTEL: {nf} shld qword ptr [r8 + 4*rax + 291], r9, 123 +0x62,0x54,0xfc,0x0c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shldq $123, %r9, 291(%r8,%rax,4), %r9 +# INTEL: shld r9, qword ptr [r8 + 4*rax + 291], r9, 123 +0x62,0x54,0xb4,0x18,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shldq $123, %r9, 291(%r8,%rax,4), %r9 +# INTEL: {nf} shld r9, qword ptr [r8 + 4*rax + 291], r9, 123 +0x62,0x54,0xb4,0x1c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shldw %cl, %dx, %dx +# INTEL: {evex} shld dx, dx, cl +0x62,0xf4,0x7d,0x08,0xa5,0xd2 + +# ATT: {nf} shldw %cl, %dx, %dx +# INTEL: {nf} shld dx, dx, cl +0x62,0xf4,0x7d,0x0c,0xa5,0xd2 + +# ATT: shldw %cl, %dx, %dx, %dx +# INTEL: shld dx, dx, dx, cl +0x62,0xf4,0x6d,0x18,0xa5,0xd2 + +# ATT: {nf} shldw %cl, %dx, %dx, %dx +# INTEL: {nf} shld dx, dx, dx, cl +0x62,0xf4,0x6d,0x1c,0xa5,0xd2 + +# ATT: {evex} shldw %cl, %dx, 291(%r8,%rax,4) +# INTEL: {evex} shld word ptr [r8 + 4*rax + 291], dx, cl +0x62,0xd4,0x7d,0x08,0xa5,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shldw %cl, %dx, 291(%r8,%rax,4) +# INTEL: {nf} shld word ptr [r8 + 4*rax + 291], dx, cl +0x62,0xd4,0x7d,0x0c,0xa5,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: shldw %cl, %dx, 291(%r8,%rax,4), %dx +# INTEL: shld dx, word ptr [r8 + 4*rax + 291], dx, cl +0x62,0xd4,0x6d,0x18,0xa5,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shldw %cl, %dx, 291(%r8,%rax,4), %dx +# INTEL: {nf} shld dx, word ptr [r8 + 4*rax + 291], dx, cl +0x62,0xd4,0x6d,0x1c,0xa5,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shldl %cl, %ecx, %ecx +# INTEL: {evex} shld ecx, ecx, cl +0x62,0xf4,0x7c,0x08,0xa5,0xc9 + +# ATT: {nf} shldl %cl, %ecx, %ecx +# INTEL: {nf} shld ecx, ecx, cl +0x62,0xf4,0x7c,0x0c,0xa5,0xc9 + +# ATT: shldl %cl, %ecx, %ecx, %ecx +# INTEL: shld ecx, ecx, ecx, cl +0x62,0xf4,0x74,0x18,0xa5,0xc9 + +# ATT: {nf} shldl %cl, %ecx, %ecx, %ecx +# INTEL: {nf} shld ecx, ecx, ecx, cl +0x62,0xf4,0x74,0x1c,0xa5,0xc9 + +# ATT: {evex} shldl %cl, %ecx, 291(%r8,%rax,4) +# INTEL: {evex} shld dword ptr [r8 + 4*rax + 291], ecx, cl +0x62,0xd4,0x7c,0x08,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shldl %cl, %ecx, 291(%r8,%rax,4) +# INTEL: {nf} shld dword ptr [r8 + 4*rax + 291], ecx, cl +0x62,0xd4,0x7c,0x0c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: shldl %cl, %ecx, 291(%r8,%rax,4), %ecx +# INTEL: shld ecx, dword ptr [r8 + 4*rax + 291], ecx, cl +0x62,0xd4,0x74,0x18,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shldl %cl, %ecx, 291(%r8,%rax,4), %ecx +# INTEL: {nf} shld ecx, dword ptr [r8 + 4*rax + 291], ecx, cl +0x62,0xd4,0x74,0x1c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shldq %cl, %r9, %r9 +# INTEL: {evex} shld 
r9, r9, cl +0x62,0x54,0xfc,0x08,0xa5,0xc9 + +# ATT: {nf} shldq %cl, %r9, %r9 +# INTEL: {nf} shld r9, r9, cl +0x62,0x54,0xfc,0x0c,0xa5,0xc9 + +# ATT: shldq %cl, %r9, %r9, %r9 +# INTEL: shld r9, r9, r9, cl +0x62,0x54,0xb4,0x18,0xa5,0xc9 + +# ATT: {nf} shldq %cl, %r9, %r9, %r9 +# INTEL: {nf} shld r9, r9, r9, cl +0x62,0x54,0xb4,0x1c,0xa5,0xc9 + +# ATT: {evex} shldq %cl, %r9, 291(%r8,%rax,4) +# INTEL: {evex} shld qword ptr [r8 + 4*rax + 291], r9, cl +0x62,0x54,0xfc,0x08,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shldq %cl, %r9, 291(%r8,%rax,4) +# INTEL: {nf} shld qword ptr [r8 + 4*rax + 291], r9, cl +0x62,0x54,0xfc,0x0c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: shldq %cl, %r9, 291(%r8,%rax,4), %r9 +# INTEL: shld r9, qword ptr [r8 + 4*rax + 291], r9, cl +0x62,0x54,0xb4,0x18,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shldq %cl, %r9, 291(%r8,%rax,4), %r9 +# INTEL: {nf} shld r9, qword ptr [r8 + 4*rax + 291], r9, cl +0x62,0x54,0xb4,0x1c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/shr.txt b/llvm/test/MC/Disassembler/X86/apx/shr.txt new file mode 100644 index 0000000000000..1e7e1732b56fb --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/shr.txt @@ -0,0 +1,386 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: {evex} shrb $123, %bl +# INTEL: {evex} shr bl, 123 +0x62,0xf4,0x7c,0x08,0xc0,0xeb,0x7b + +# ATT: {nf} shrb $123, %bl +# INTEL: {nf} shr bl, 123 +0x62,0xf4,0x7c,0x0c,0xc0,0xeb,0x7b + +# ATT: shrb $123, %bl, %bl +# INTEL: shr bl, bl, 123 +0x62,0xf4,0x64,0x18,0xc0,0xeb,0x7b + +# ATT: {nf} shrb $123, %bl, %bl +# INTEL: {nf} shr bl, bl, 123 +0x62,0xf4,0x64,0x1c,0xc0,0xeb,0x7b + +# ATT: {evex} shrw $123, %dx +# INTEL: {evex} shr dx, 123 +0x62,0xf4,0x7d,0x08,0xc1,0xea,0x7b + +# ATT: {nf} shrw $123, %dx +# INTEL: {nf} shr dx, 123 +0x62,0xf4,0x7d,0x0c,0xc1,0xea,0x7b + +# ATT: shrw $123, %dx, %dx +# INTEL: shr dx, dx, 123 +0x62,0xf4,0x6d,0x18,0xc1,0xea,0x7b + +# ATT: {nf} shrw $123, %dx, %dx +# INTEL: {nf} shr dx, dx, 123 +0x62,0xf4,0x6d,0x1c,0xc1,0xea,0x7b + +# ATT: {evex} shrl $123, %ecx +# INTEL: {evex} shr ecx, 123 +0x62,0xf4,0x7c,0x08,0xc1,0xe9,0x7b + +# ATT: {nf} shrl $123, %ecx +# INTEL: {nf} shr ecx, 123 +0x62,0xf4,0x7c,0x0c,0xc1,0xe9,0x7b + +# ATT: shrl $123, %ecx, %ecx +# INTEL: shr ecx, ecx, 123 +0x62,0xf4,0x74,0x18,0xc1,0xe9,0x7b + +# ATT: {nf} shrl $123, %ecx, %ecx +# INTEL: {nf} shr ecx, ecx, 123 +0x62,0xf4,0x74,0x1c,0xc1,0xe9,0x7b + +# ATT: {evex} shrq $123, %r9 +# INTEL: {evex} shr r9, 123 +0x62,0xd4,0xfc,0x08,0xc1,0xe9,0x7b + +# ATT: {nf} shrq $123, %r9 +# INTEL: {nf} shr r9, 123 +0x62,0xd4,0xfc,0x0c,0xc1,0xe9,0x7b + +# ATT: shrq $123, %r9, %r9 +# INTEL: shr r9, r9, 123 +0x62,0xd4,0xb4,0x18,0xc1,0xe9,0x7b + +# ATT: {nf} shrq $123, %r9, %r9 +# INTEL: {nf} shr r9, r9, 123 +0x62,0xd4,0xb4,0x1c,0xc1,0xe9,0x7b + +# ATT: {evex} shrb $123, 291(%r8,%rax,4) +# INTEL: {evex} shr byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrb $123, 291(%r8,%rax,4) +# INTEL: {nf} shr byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x0c,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shrb $123, 291(%r8,%rax,4), %bl +# INTEL: shr bl, byte ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x64,0x18,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrb $123, 291(%r8,%rax,4), %bl +# INTEL: {nf} shr bl, byte ptr [r8 + 4*rax + 291], 123 
+0x62,0xd4,0x64,0x1c,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shrw $123, 291(%r8,%rax,4) +# INTEL: {evex} shr word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x08,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrw $123, 291(%r8,%rax,4) +# INTEL: {nf} shr word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7d,0x0c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shrw $123, 291(%r8,%rax,4), %dx +# INTEL: shr dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x18,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrw $123, 291(%r8,%rax,4), %dx +# INTEL: {nf} shr dx, word ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x6d,0x1c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shrl $123, 291(%r8,%rax,4) +# INTEL: {evex} shr dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x08,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrl $123, 291(%r8,%rax,4) +# INTEL: {nf} shr dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x7c,0x0c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shrl $123, 291(%r8,%rax,4), %ecx +# INTEL: shr ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x18,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrl $123, 291(%r8,%rax,4), %ecx +# INTEL: {nf} shr ecx, dword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0x74,0x1c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shrq $123, 291(%r8,%rax,4) +# INTEL: {evex} shr qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xfc,0x08,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrq $123, 291(%r8,%rax,4) +# INTEL: {nf} shr qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xfc,0x0c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shrq $123, 291(%r8,%rax,4), %r9 +# INTEL: shr r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x18,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrq $123, 291(%r8,%rax,4), %r9 +# INTEL: {nf} shr r9, qword ptr [r8 + 4*rax + 291], 123 +0x62,0xd4,0xb4,0x1c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shrb %bl +# INTEL: {evex} shr bl +0x62,0xf4,0x7c,0x08,0xd0,0xeb + +# ATT: {nf} shrb %bl +# INTEL: {nf} shr bl +0x62,0xf4,0x7c,0x0c,0xd0,0xeb + +# ATT: shrb %bl, %bl +# INTEL: shr bl, bl +0x62,0xf4,0x64,0x18,0xd0,0xeb + +# ATT: {nf} shrb %bl, %bl +# INTEL: {nf} shr bl, bl +0x62,0xf4,0x64,0x1c,0xd0,0xeb + +# ATT: {evex} shrb %cl, %bl +# INTEL: {evex} shr bl, cl +0x62,0xf4,0x7c,0x08,0xd2,0xeb + +# ATT: {nf} shrb %cl, %bl +# INTEL: {nf} shr bl, cl +0x62,0xf4,0x7c,0x0c,0xd2,0xeb + +# ATT: shrb %cl, %bl, %bl +# INTEL: shr bl, bl, cl +0x62,0xf4,0x64,0x18,0xd2,0xeb + +# ATT: {nf} shrb %cl, %bl, %bl +# INTEL: {nf} shr bl, bl, cl +0x62,0xf4,0x64,0x1c,0xd2,0xeb + +# ATT: {evex} shrw %cl, %dx +# INTEL: {evex} shr dx, cl +0x62,0xf4,0x7d,0x08,0xd3,0xea + +# ATT: {nf} shrw %cl, %dx +# INTEL: {nf} shr dx, cl +0x62,0xf4,0x7d,0x0c,0xd3,0xea + +# ATT: shrw %cl, %dx, %dx +# INTEL: shr dx, dx, cl +0x62,0xf4,0x6d,0x18,0xd3,0xea + +# ATT: {nf} shrw %cl, %dx, %dx +# INTEL: {nf} shr dx, dx, cl +0x62,0xf4,0x6d,0x1c,0xd3,0xea + +# ATT: {evex} shrl %cl, %ecx +# INTEL: {evex} shr ecx, cl +0x62,0xf4,0x7c,0x08,0xd3,0xe9 + +# ATT: {nf} shrl %cl, %ecx +# INTEL: {nf} shr ecx, cl +0x62,0xf4,0x7c,0x0c,0xd3,0xe9 + +# ATT: shrl %cl, %ecx, %ecx +# INTEL: shr ecx, ecx, cl +0x62,0xf4,0x74,0x18,0xd3,0xe9 + +# ATT: {nf} shrl %cl, %ecx, %ecx +# INTEL: {nf} shr ecx, ecx, cl +0x62,0xf4,0x74,0x1c,0xd3,0xe9 + +# ATT: {evex} shrq %cl, %r9 +# INTEL: {evex} shr r9, cl +0x62,0xd4,0xfc,0x08,0xd3,0xe9 + +# ATT: {nf} shrq %cl, %r9 +# INTEL: {nf} shr r9, cl +0x62,0xd4,0xfc,0x0c,0xd3,0xe9 + +# ATT: 
shrq %cl, %r9, %r9 +# INTEL: shr r9, r9, cl +0x62,0xd4,0xb4,0x18,0xd3,0xe9 + +# ATT: {nf} shrq %cl, %r9, %r9 +# INTEL: {nf} shr r9, r9, cl +0x62,0xd4,0xb4,0x1c,0xd3,0xe9 + +# ATT: {evex} shrb %cl, 291(%r8,%rax,4) +# INTEL: {evex} shr byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd2,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrb %cl, 291(%r8,%rax,4) +# INTEL: {nf} shr byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x0c,0xd2,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: shrb %cl, 291(%r8,%rax,4), %bl +# INTEL: shr bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x18,0xd2,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrb %cl, 291(%r8,%rax,4), %bl +# INTEL: {nf} shr bl, byte ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x64,0x1c,0xd2,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shrw %cl, 291(%r8,%rax,4) +# INTEL: {evex} shr word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x08,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrw %cl, 291(%r8,%rax,4) +# INTEL: {nf} shr word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7d,0x0c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: shrw %cl, 291(%r8,%rax,4), %dx +# INTEL: shr dx, word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x6d,0x18,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrw %cl, 291(%r8,%rax,4), %dx +# INTEL: {nf} shr dx, word ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x6d,0x1c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shrl %cl, 291(%r8,%rax,4) +# INTEL: {evex} shr dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x08,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrl %cl, 291(%r8,%rax,4) +# INTEL: {nf} shr dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x7c,0x0c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: shrl %cl, 291(%r8,%rax,4), %ecx +# INTEL: shr ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x18,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrl %cl, 291(%r8,%rax,4), %ecx +# INTEL: {nf} shr ecx, dword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0x74,0x1c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shrq %cl, 291(%r8,%rax,4) +# INTEL: {evex} shr qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x08,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrq %cl, 291(%r8,%rax,4) +# INTEL: {nf} shr qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xfc,0x0c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: shrq %cl, 291(%r8,%rax,4), %r9 +# INTEL: shr r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x18,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrq %cl, 291(%r8,%rax,4), %r9 +# INTEL: {nf} shr r9, qword ptr [r8 + 4*rax + 291], cl +0x62,0xd4,0xb4,0x1c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shrw %dx +# INTEL: {evex} shr dx +0x62,0xf4,0x7d,0x08,0xd1,0xea + +# ATT: {nf} shrw %dx +# INTEL: {nf} shr dx +0x62,0xf4,0x7d,0x0c,0xd1,0xea + +# ATT: shrw %dx, %dx +# INTEL: shr dx, dx +0x62,0xf4,0x6d,0x18,0xd1,0xea + +# ATT: {nf} shrw %dx, %dx +# INTEL: {nf} shr dx, dx +0x62,0xf4,0x6d,0x1c,0xd1,0xea + +# ATT: {evex} shrl %ecx +# INTEL: {evex} shr ecx +0x62,0xf4,0x7c,0x08,0xd1,0xe9 + +# ATT: {nf} shrl %ecx +# INTEL: {nf} shr ecx +0x62,0xf4,0x7c,0x0c,0xd1,0xe9 + +# ATT: shrl %ecx, %ecx +# INTEL: shr ecx, ecx +0x62,0xf4,0x74,0x18,0xd1,0xe9 + +# ATT: {nf} shrl %ecx, %ecx +# INTEL: {nf} shr ecx, ecx +0x62,0xf4,0x74,0x1c,0xd1,0xe9 + +# ATT: {evex} shrq %r9 +# INTEL: {evex} shr r9 +0x62,0xd4,0xfc,0x08,0xd1,0xe9 + +# ATT: {nf} shrq %r9 +# INTEL: {nf} shr r9 +0x62,0xd4,0xfc,0x0c,0xd1,0xe9 + +# ATT: shrq %r9, %r9 +# INTEL: shr r9, r9 +0x62,0xd4,0xb4,0x18,0xd1,0xe9 + +# ATT: {nf} shrq %r9, %r9 +# INTEL: {nf} shr r9, r9 
+0x62,0xd4,0xb4,0x1c,0xd1,0xe9 + +# ATT: {evex} shrb 291(%r8,%rax,4) +# INTEL: {evex} shr byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd0,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrb 291(%r8,%rax,4) +# INTEL: {nf} shr byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xd0,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: shrb 291(%r8,%rax,4), %bl +# INTEL: shr bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x18,0xd0,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrb 291(%r8,%rax,4), %bl +# INTEL: {nf} shr bl, byte ptr [r8 + 4*rax + 291] +0x62,0xd4,0x64,0x1c,0xd0,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shrw 291(%r8,%rax,4) +# INTEL: {evex} shr word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrw 291(%r8,%rax,4) +# INTEL: {nf} shr word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7d,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: shrw 291(%r8,%rax,4), %dx +# INTEL: shr dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrw 291(%r8,%rax,4), %dx +# INTEL: {nf} shr dx, word ptr [r8 + 4*rax + 291] +0x62,0xd4,0x6d,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shrl 291(%r8,%rax,4) +# INTEL: {evex} shr dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrl 291(%r8,%rax,4) +# INTEL: {nf} shr dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x7c,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: shrl 291(%r8,%rax,4), %ecx +# INTEL: shr ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrl 291(%r8,%rax,4), %ecx +# INTEL: {nf} shr ecx, dword ptr [r8 + 4*rax + 291] +0x62,0xd4,0x74,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shrq 291(%r8,%rax,4) +# INTEL: {evex} shr qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrq 291(%r8,%rax,4) +# INTEL: {nf} shr qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xfc,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: shrq 291(%r8,%rax,4), %r9 +# INTEL: shr r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrq 291(%r8,%rax,4), %r9 +# INTEL: {nf} shr r9, qword ptr [r8 + 4*rax + 291] +0x62,0xd4,0xb4,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/X86/apx/shrd.txt b/llvm/test/MC/Disassembler/X86/apx/shrd.txt new file mode 100644 index 0000000000000..4f71bfda77f7d --- /dev/null +++ b/llvm/test/MC/Disassembler/X86/apx/shrd.txt @@ -0,0 +1,194 @@ +# RUN: llvm-mc -triple x86_64 -disassemble %s | FileCheck %s --check-prefix=ATT +# RUN: llvm-mc -triple x86_64 -disassemble -output-asm-variant=1 %s | FileCheck %s --check-prefix=INTEL + +# ATT: {evex} shrdw $123, %dx, %dx +# INTEL: {evex} shrd dx, dx, 123 +0x62,0xf4,0x7d,0x08,0x2c,0xd2,0x7b + +# ATT: {nf} shrdw $123, %dx, %dx +# INTEL: {nf} shrd dx, dx, 123 +0x62,0xf4,0x7d,0x0c,0x2c,0xd2,0x7b + +# ATT: shrdw $123, %dx, %dx, %dx +# INTEL: shrd dx, dx, dx, 123 +0x62,0xf4,0x6d,0x18,0x2c,0xd2,0x7b + +# ATT: {nf} shrdw $123, %dx, %dx, %dx +# INTEL: {nf} shrd dx, dx, dx, 123 +0x62,0xf4,0x6d,0x1c,0x2c,0xd2,0x7b + +# ATT: {evex} shrdw $123, %dx, 291(%r8,%rax,4) +# INTEL: {evex} shrd word ptr [r8 + 4*rax + 291], dx, 123 +0x62,0xd4,0x7d,0x08,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrdw $123, %dx, 291(%r8,%rax,4) +# INTEL: {nf} shrd word ptr [r8 + 4*rax + 291], dx, 123 +0x62,0xd4,0x7d,0x0c,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shrdw $123, %dx, 291(%r8,%rax,4), %dx +# 
INTEL: shrd dx, word ptr [r8 + 4*rax + 291], dx, 123 +0x62,0xd4,0x6d,0x18,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrdw $123, %dx, 291(%r8,%rax,4), %dx +# INTEL: {nf} shrd dx, word ptr [r8 + 4*rax + 291], dx, 123 +0x62,0xd4,0x6d,0x1c,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shrdl $123, %ecx, %ecx +# INTEL: {evex} shrd ecx, ecx, 123 +0x62,0xf4,0x7c,0x08,0x2c,0xc9,0x7b + +# ATT: {nf} shrdl $123, %ecx, %ecx +# INTEL: {nf} shrd ecx, ecx, 123 +0x62,0xf4,0x7c,0x0c,0x2c,0xc9,0x7b + +# ATT: shrdl $123, %ecx, %ecx, %ecx +# INTEL: shrd ecx, ecx, ecx, 123 +0x62,0xf4,0x74,0x18,0x2c,0xc9,0x7b + +# ATT: {nf} shrdl $123, %ecx, %ecx, %ecx +# INTEL: {nf} shrd ecx, ecx, ecx, 123 +0x62,0xf4,0x74,0x1c,0x2c,0xc9,0x7b + +# ATT: {evex} shrdl $123, %ecx, 291(%r8,%rax,4) +# INTEL: {evex} shrd dword ptr [r8 + 4*rax + 291], ecx, 123 +0x62,0xd4,0x7c,0x08,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrdl $123, %ecx, 291(%r8,%rax,4) +# INTEL: {nf} shrd dword ptr [r8 + 4*rax + 291], ecx, 123 +0x62,0xd4,0x7c,0x0c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shrdl $123, %ecx, 291(%r8,%rax,4), %ecx +# INTEL: shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, 123 +0x62,0xd4,0x74,0x18,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrdl $123, %ecx, 291(%r8,%rax,4), %ecx +# INTEL: {nf} shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, 123 +0x62,0xd4,0x74,0x1c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shrdq $123, %r9, %r9 +# INTEL: {evex} shrd r9, r9, 123 +0x62,0x54,0xfc,0x08,0x2c,0xc9,0x7b + +# ATT: {nf} shrdq $123, %r9, %r9 +# INTEL: {nf} shrd r9, r9, 123 +0x62,0x54,0xfc,0x0c,0x2c,0xc9,0x7b + +# ATT: shrdq $123, %r9, %r9, %r9 +# INTEL: shrd r9, r9, r9, 123 +0x62,0x54,0xb4,0x18,0x2c,0xc9,0x7b + +# ATT: {nf} shrdq $123, %r9, %r9, %r9 +# INTEL: {nf} shrd r9, r9, r9, 123 +0x62,0x54,0xb4,0x1c,0x2c,0xc9,0x7b + +# ATT: {evex} shrdq $123, %r9, 291(%r8,%rax,4) +# INTEL: {evex} shrd qword ptr [r8 + 4*rax + 291], r9, 123 +0x62,0x54,0xfc,0x08,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrdq $123, %r9, 291(%r8,%rax,4) +# INTEL: {nf} shrd qword ptr [r8 + 4*rax + 291], r9, 123 +0x62,0x54,0xfc,0x0c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: shrdq $123, %r9, 291(%r8,%rax,4), %r9 +# INTEL: shrd r9, qword ptr [r8 + 4*rax + 291], r9, 123 +0x62,0x54,0xb4,0x18,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {nf} shrdq $123, %r9, 291(%r8,%rax,4), %r9 +# INTEL: {nf} shrd r9, qword ptr [r8 + 4*rax + 291], r9, 123 +0x62,0x54,0xb4,0x1c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b + +# ATT: {evex} shrdw %cl, %dx, %dx +# INTEL: {evex} shrd dx, dx, cl +0x62,0xf4,0x7d,0x08,0xad,0xd2 + +# ATT: {nf} shrdw %cl, %dx, %dx +# INTEL: {nf} shrd dx, dx, cl +0x62,0xf4,0x7d,0x0c,0xad,0xd2 + +# ATT: shrdw %cl, %dx, %dx, %dx +# INTEL: shrd dx, dx, dx, cl +0x62,0xf4,0x6d,0x18,0xad,0xd2 + +# ATT: {nf} shrdw %cl, %dx, %dx, %dx +# INTEL: {nf} shrd dx, dx, dx, cl +0x62,0xf4,0x6d,0x1c,0xad,0xd2 + +# ATT: {evex} shrdw %cl, %dx, 291(%r8,%rax,4) +# INTEL: {evex} shrd word ptr [r8 + 4*rax + 291], dx, cl +0x62,0xd4,0x7d,0x08,0xad,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrdw %cl, %dx, 291(%r8,%rax,4) +# INTEL: {nf} shrd word ptr [r8 + 4*rax + 291], dx, cl +0x62,0xd4,0x7d,0x0c,0xad,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: shrdw %cl, %dx, 291(%r8,%rax,4), %dx +# INTEL: shrd dx, word ptr [r8 + 4*rax + 291], dx, cl +0x62,0xd4,0x6d,0x18,0xad,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrdw %cl, %dx, 291(%r8,%rax,4), %dx +# INTEL: {nf} shrd dx, word ptr [r8 + 4*rax + 291], dx, cl 
+0x62,0xd4,0x6d,0x1c,0xad,0x94,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shrdl %cl, %ecx, %ecx +# INTEL: {evex} shrd ecx, ecx, cl +0x62,0xf4,0x7c,0x08,0xad,0xc9 + +# ATT: {nf} shrdl %cl, %ecx, %ecx +# INTEL: {nf} shrd ecx, ecx, cl +0x62,0xf4,0x7c,0x0c,0xad,0xc9 + +# ATT: shrdl %cl, %ecx, %ecx, %ecx +# INTEL: shrd ecx, ecx, ecx, cl +0x62,0xf4,0x74,0x18,0xad,0xc9 + +# ATT: {nf} shrdl %cl, %ecx, %ecx, %ecx +# INTEL: {nf} shrd ecx, ecx, ecx, cl +0x62,0xf4,0x74,0x1c,0xad,0xc9 + +# ATT: {evex} shrdl %cl, %ecx, 291(%r8,%rax,4) +# INTEL: {evex} shrd dword ptr [r8 + 4*rax + 291], ecx, cl +0x62,0xd4,0x7c,0x08,0xad,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrdl %cl, %ecx, 291(%r8,%rax,4) +# INTEL: {nf} shrd dword ptr [r8 + 4*rax + 291], ecx, cl +0x62,0xd4,0x7c,0x0c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: shrdl %cl, %ecx, 291(%r8,%rax,4), %ecx +# INTEL: shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, cl +0x62,0xd4,0x74,0x18,0xad,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrdl %cl, %ecx, 291(%r8,%rax,4), %ecx +# INTEL: {nf} shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, cl +0x62,0xd4,0x74,0x1c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {evex} shrdq %cl, %r9, %r9 +# INTEL: {evex} shrd r9, r9, cl +0x62,0x54,0xfc,0x08,0xad,0xc9 + +# ATT: {nf} shrdq %cl, %r9, %r9 +# INTEL: {nf} shrd r9, r9, cl +0x62,0x54,0xfc,0x0c,0xad,0xc9 + +# ATT: shrdq %cl, %r9, %r9, %r9 +# INTEL: shrd r9, r9, r9, cl +0x62,0x54,0xb4,0x18,0xad,0xc9 + +# ATT: {nf} shrdq %cl, %r9, %r9, %r9 +# INTEL: {nf} shrd r9, r9, r9, cl +0x62,0x54,0xb4,0x1c,0xad,0xc9 + +# ATT: {evex} shrdq %cl, %r9, 291(%r8,%rax,4) +# INTEL: {evex} shrd qword ptr [r8 + 4*rax + 291], r9, cl +0x62,0x54,0xfc,0x08,0xad,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrdq %cl, %r9, 291(%r8,%rax,4) +# INTEL: {nf} shrd qword ptr [r8 + 4*rax + 291], r9, cl +0x62,0x54,0xfc,0x0c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: shrdq %cl, %r9, 291(%r8,%rax,4), %r9 +# INTEL: shrd r9, qword ptr [r8 + 4*rax + 291], r9, cl +0x62,0x54,0xb4,0x18,0xad,0x8c,0x80,0x23,0x01,0x00,0x00 + +# ATT: {nf} shrdq %cl, %r9, 291(%r8,%rax,4), %r9 +# INTEL: {nf} shrd r9, qword ptr [r8 + 4*rax + 291], r9, cl +0x62,0x54,0xb4,0x1c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00 diff --git a/llvm/test/MC/X86/apx/rcl-att.s b/llvm/test/MC/X86/apx/rcl-att.s new file mode 100644 index 0000000000000..68bbb970a1ec4 --- /dev/null +++ b/llvm/test/MC/X86/apx/rcl-att.s @@ -0,0 +1,146 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-47: error: +# ERROR-NOT: error: +# CHECK: {evex} rclb $123, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xd3,0x7b] + {evex} rclb $123, %bl +# CHECK: rclb $123, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xd3,0x7b] + rclb $123, %bl, %bl +# CHECK: {evex} rclw $123, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xd2,0x7b] + {evex} rclw $123, %dx +# CHECK: rclw $123, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xd2,0x7b] + rclw $123, %dx, %dx +# CHECK: {evex} rcll $123, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xd1,0x7b] + {evex} rcll $123, %ecx +# CHECK: rcll $123, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xd1,0x7b] + rcll $123, %ecx, %ecx +# CHECK: {evex} rclq $123, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xd1,0x7b] + {evex} rclq $123, %r9 +# CHECK: rclq $123, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xd1,0x7b] + rclq $123, %r9, %r9 +# CHECK: {evex} rclb $123, 291(%r8,%rax,4) +# CHECK: 
encoding: [0x62,0xd4,0x7c,0x08,0xc0,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rclb $123, 291(%r8,%rax,4) +# CHECK: rclb $123, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + rclb $123, 291(%r8,%rax,4), %bl +# CHECK: {evex} rclw $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rclw $123, 291(%r8,%rax,4) +# CHECK: rclw $123, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + rclw $123, 291(%r8,%rax,4), %dx +# CHECK: {evex} rcll $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcll $123, 291(%r8,%rax,4) +# CHECK: rcll $123, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + rcll $123, 291(%r8,%rax,4), %ecx +# CHECK: {evex} rclq $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rclq $123, 291(%r8,%rax,4) +# CHECK: rclq $123, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + rclq $123, 291(%r8,%rax,4), %r9 +# CHECK: {evex} rclb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xd3] + {evex} rclb %bl +# CHECK: {evex} rclb %cl, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xd3] + {evex} rclb %cl, %bl +# CHECK: rclb %cl, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xd3] + rclb %cl, %bl, %bl +# CHECK: {evex} rclw %cl, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xd2] + {evex} rclw %cl, %dx +# CHECK: rclw %cl, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xd2] + rclw %cl, %dx, %dx +# CHECK: {evex} rcll %cl, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xd1] + {evex} rcll %cl, %ecx +# CHECK: rcll %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xd1] + rcll %cl, %ecx, %ecx +# CHECK: {evex} rclq %cl, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xd1] + {evex} rclq %cl, %r9 +# CHECK: rclq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xd1] + rclq %cl, %r9, %r9 +# CHECK: {evex} rclb %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rclb %cl, 291(%r8,%rax,4) +# CHECK: rclb %cl, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0x94,0x80,0x23,0x01,0x00,0x00] + rclb %cl, 291(%r8,%rax,4), %bl +# CHECK: {evex} rclw %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rclw %cl, 291(%r8,%rax,4) +# CHECK: rclw %cl, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + rclw %cl, 291(%r8,%rax,4), %dx +# CHECK: {evex} rcll %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rcll %cl, 291(%r8,%rax,4) +# CHECK: rcll %cl, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + rcll %cl, 291(%r8,%rax,4), %ecx +# CHECK: {evex} rclq %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rclq %cl, 291(%r8,%rax,4) +# CHECK: rclq %cl, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + rclq %cl, 291(%r8,%rax,4), %r9 +# CHECK: {evex} rclw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xd2] + {evex} rclw %dx +# CHECK: rclw %dx, %dx +# CHECK: encoding: 
[0x62,0xf4,0x6d,0x18,0xd1,0xd2] + rclw %dx, %dx +# CHECK: {evex} rcll %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xd1] + {evex} rcll %ecx +# CHECK: rcll %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xd1] + rcll %ecx, %ecx +# CHECK: {evex} rclq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xd1] + {evex} rclq %r9 +# CHECK: rclq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xd1] + rclq %r9, %r9 +# CHECK: {evex} rclb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rclb 291(%r8,%rax,4) +# CHECK: rclb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0x94,0x80,0x23,0x01,0x00,0x00] + rclb 291(%r8,%rax,4), %bl +# CHECK: {evex} rclw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rclw 291(%r8,%rax,4) +# CHECK: rclw 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + rclw 291(%r8,%rax,4), %dx +# CHECK: {evex} rcll 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rcll 291(%r8,%rax,4) +# CHECK: rcll 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + rcll 291(%r8,%rax,4), %ecx +# CHECK: {evex} rclq 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rclq 291(%r8,%rax,4) +# CHECK: rclq 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + rclq 291(%r8,%rax,4), %r9 diff --git a/llvm/test/MC/X86/apx/rcl-intel.s b/llvm/test/MC/X86/apx/rcl-intel.s new file mode 100644 index 0000000000000..8477cb6461a36 --- /dev/null +++ b/llvm/test/MC/X86/apx/rcl-intel.s @@ -0,0 +1,143 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s + +# CHECK: {evex} rcl bl, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xd3,0x7b] + {evex} rcl bl, 123 +# CHECK: rcl bl, bl, 123 +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xd3,0x7b] + rcl bl, bl, 123 +# CHECK: {evex} rcl dx, 123 +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xd2,0x7b] + {evex} rcl dx, 123 +# CHECK: rcl dx, dx, 123 +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xd2,0x7b] + rcl dx, dx, 123 +# CHECK: {evex} rcl ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xd1,0x7b] + {evex} rcl ecx, 123 +# CHECK: rcl ecx, ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xd1,0x7b] + rcl ecx, ecx, 123 +# CHECK: {evex} rcl r9, 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xd1,0x7b] + {evex} rcl r9, 123 +# CHECK: rcl r9, r9, 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xd1,0x7b] + rcl r9, r9, 123 +# CHECK: {evex} rcl byte ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcl byte ptr [r8 + 4*rax + 291], 123 +# CHECK: rcl bl, byte ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + rcl bl, byte ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rcl word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcl word ptr [r8 + 4*rax + 291], 123 +# CHECK: rcl dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + rcl dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rcl dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: 
[0x62,0xd4,0x7c,0x08,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcl dword ptr [r8 + 4*rax + 291], 123 +# CHECK: rcl ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + rcl ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rcl qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcl qword ptr [r8 + 4*rax + 291], 123 +# CHECK: rcl r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + rcl r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rcl bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xd3] + {evex} rcl bl +# CHECK: {evex} rcl bl, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xd3] + {evex} rcl bl, cl +# CHECK: rcl bl, bl, cl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xd3] + rcl bl, bl, cl +# CHECK: {evex} rcl dx, cl +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xd2] + {evex} rcl dx, cl +# CHECK: rcl dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xd2] + rcl dx, dx, cl +# CHECK: {evex} rcl ecx, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xd1] + {evex} rcl ecx, cl +# CHECK: rcl ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xd1] + rcl ecx, ecx, cl +# CHECK: {evex} rcl r9, cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xd1] + {evex} rcl r9, cl +# CHECK: rcl r9, r9, cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xd1] + rcl r9, r9, cl +# CHECK: {evex} rcl byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rcl byte ptr [r8 + 4*rax + 291], cl +# CHECK: rcl bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0x94,0x80,0x23,0x01,0x00,0x00] + rcl bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rcl word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rcl word ptr [r8 + 4*rax + 291], cl +# CHECK: rcl dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + rcl dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rcl dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rcl dword ptr [r8 + 4*rax + 291], cl +# CHECK: rcl ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + rcl ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rcl qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rcl qword ptr [r8 + 4*rax + 291], cl +# CHECK: rcl r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0x94,0x80,0x23,0x01,0x00,0x00] + rcl r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rcl dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xd2] + {evex} rcl dx +# CHECK: rcl dx, dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xd2] + rcl dx, dx +# CHECK: {evex} rcl ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xd1] + {evex} rcl ecx +# CHECK: rcl ecx, ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xd1] + rcl ecx, ecx +# CHECK: {evex} rcl r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xd1] + {evex} rcl r9 +# CHECK: rcl r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xd1] + rcl r9, r9 +# CHECK: {evex} rcl byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: 
[0x62,0xd4,0x7c,0x08,0xd0,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rcl byte ptr [r8 + 4*rax + 291] +# CHECK: rcl bl, byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0x94,0x80,0x23,0x01,0x00,0x00] + rcl bl, byte ptr [r8 + 4*rax + 291] +# CHECK: {evex} rcl word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rcl word ptr [r8 + 4*rax + 291] +# CHECK: rcl dx, word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + rcl dx, word ptr [r8 + 4*rax + 291] +# CHECK: {evex} rcl dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rcl dword ptr [r8 + 4*rax + 291] +# CHECK: rcl ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + rcl ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: {evex} rcl qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} rcl qword ptr [r8 + 4*rax + 291] +# CHECK: rcl r9, qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0x94,0x80,0x23,0x01,0x00,0x00] + rcl r9, qword ptr [r8 + 4*rax + 291] diff --git a/llvm/test/MC/X86/apx/rcr-att.s b/llvm/test/MC/X86/apx/rcr-att.s new file mode 100644 index 0000000000000..50136bca94447 --- /dev/null +++ b/llvm/test/MC/X86/apx/rcr-att.s @@ -0,0 +1,146 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-47: error: +# ERROR-NOT: error: +# CHECK: {evex} rcrb $123, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xdb,0x7b] + {evex} rcrb $123, %bl +# CHECK: rcrb $123, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xdb,0x7b] + rcrb $123, %bl, %bl +# CHECK: {evex} rcrw $123, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xda,0x7b] + {evex} rcrw $123, %dx +# CHECK: rcrw $123, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xda,0x7b] + rcrw $123, %dx, %dx +# CHECK: {evex} rcrl $123, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xd9,0x7b] + {evex} rcrl $123, %ecx +# CHECK: rcrl $123, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xd9,0x7b] + rcrl $123, %ecx, %ecx +# CHECK: {evex} rcrq $123, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xd9,0x7b] + {evex} rcrq $123, %r9 +# CHECK: rcrq $123, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xd9,0x7b] + rcrq $123, %r9, %r9 +# CHECK: {evex} rcrb $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcrb $123, 291(%r8,%rax,4) +# CHECK: rcrb $123, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + rcrb $123, 291(%r8,%rax,4), %bl +# CHECK: {evex} rcrw $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcrw $123, 291(%r8,%rax,4) +# CHECK: rcrw $123, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + rcrw $123, 291(%r8,%rax,4), %dx +# CHECK: {evex} rcrl $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcrl $123, 291(%r8,%rax,4) +# CHECK: rcrl $123, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + rcrl $123, 291(%r8,%rax,4), %ecx +# CHECK: {evex} rcrq $123, 
291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcrq $123, 291(%r8,%rax,4) +# CHECK: rcrq $123, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + rcrq $123, 291(%r8,%rax,4), %r9 +# CHECK: {evex} rcrb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xdb] + {evex} rcrb %bl +# CHECK: {evex} rcrb %cl, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xdb] + {evex} rcrb %cl, %bl +# CHECK: rcrb %cl, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xdb] + rcrb %cl, %bl, %bl +# CHECK: {evex} rcrw %cl, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xda] + {evex} rcrw %cl, %dx +# CHECK: rcrw %cl, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xda] + rcrw %cl, %dx, %dx +# CHECK: {evex} rcrl %cl, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xd9] + {evex} rcrl %cl, %ecx +# CHECK: rcrl %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xd9] + rcrl %cl, %ecx, %ecx +# CHECK: {evex} rcrq %cl, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xd9] + {evex} rcrq %cl, %r9 +# CHECK: rcrq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xd9] + rcrq %cl, %r9, %r9 +# CHECK: {evex} rcrb %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcrb %cl, 291(%r8,%rax,4) +# CHECK: rcrb %cl, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0x9c,0x80,0x23,0x01,0x00,0x00] + rcrb %cl, 291(%r8,%rax,4), %bl +# CHECK: {evex} rcrw %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcrw %cl, 291(%r8,%rax,4) +# CHECK: rcrw %cl, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + rcrw %cl, 291(%r8,%rax,4), %dx +# CHECK: {evex} rcrl %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcrl %cl, 291(%r8,%rax,4) +# CHECK: rcrl %cl, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + rcrl %cl, 291(%r8,%rax,4), %ecx +# CHECK: {evex} rcrq %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcrq %cl, 291(%r8,%rax,4) +# CHECK: rcrq %cl, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + rcrq %cl, 291(%r8,%rax,4), %r9 +# CHECK: {evex} rcrw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xda] + {evex} rcrw %dx +# CHECK: rcrw %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xda] + rcrw %dx, %dx +# CHECK: {evex} rcrl %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xd9] + {evex} rcrl %ecx +# CHECK: rcrl %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xd9] + rcrl %ecx, %ecx +# CHECK: {evex} rcrq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xd9] + {evex} rcrq %r9 +# CHECK: rcrq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xd9] + rcrq %r9, %r9 +# CHECK: {evex} rcrb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcrb 291(%r8,%rax,4) +# CHECK: rcrb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0x9c,0x80,0x23,0x01,0x00,0x00] + rcrb 291(%r8,%rax,4), %bl +# CHECK: {evex} rcrw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcrw 291(%r8,%rax,4) +# CHECK: rcrw 291(%r8,%rax,4), %dx +# CHECK: encoding: 
[0x62,0xd4,0x6d,0x18,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + rcrw 291(%r8,%rax,4), %dx +# CHECK: {evex} rcrl 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcrl 291(%r8,%rax,4) +# CHECK: rcrl 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + rcrl 291(%r8,%rax,4), %ecx +# CHECK: {evex} rcrq 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcrq 291(%r8,%rax,4) +# CHECK: rcrq 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + rcrq 291(%r8,%rax,4), %r9 diff --git a/llvm/test/MC/X86/apx/rcr-intel.s b/llvm/test/MC/X86/apx/rcr-intel.s new file mode 100644 index 0000000000000..91303ec6ce2c2 --- /dev/null +++ b/llvm/test/MC/X86/apx/rcr-intel.s @@ -0,0 +1,143 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s + +# CHECK: {evex} rcr bl, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xdb,0x7b] + {evex} rcr bl, 123 +# CHECK: rcr bl, bl, 123 +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xdb,0x7b] + rcr bl, bl, 123 +# CHECK: {evex} rcr dx, 123 +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xda,0x7b] + {evex} rcr dx, 123 +# CHECK: rcr dx, dx, 123 +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xda,0x7b] + rcr dx, dx, 123 +# CHECK: {evex} rcr ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xd9,0x7b] + {evex} rcr ecx, 123 +# CHECK: rcr ecx, ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xd9,0x7b] + rcr ecx, ecx, 123 +# CHECK: {evex} rcr r9, 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xd9,0x7b] + {evex} rcr r9, 123 +# CHECK: rcr r9, r9, 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xd9,0x7b] + rcr r9, r9, 123 +# CHECK: {evex} rcr byte ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcr byte ptr [r8 + 4*rax + 291], 123 +# CHECK: rcr bl, byte ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + rcr bl, byte ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rcr word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcr word ptr [r8 + 4*rax + 291], 123 +# CHECK: rcr dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + rcr dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rcr dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcr dword ptr [r8 + 4*rax + 291], 123 +# CHECK: rcr ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + rcr ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rcr qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rcr qword ptr [r8 + 4*rax + 291], 123 +# CHECK: rcr r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0x9c,0x80,0x23,0x01,0x00,0x00,0x7b] + rcr r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rcr bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xdb] + {evex} rcr bl +# CHECK: {evex} rcr bl, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xdb] + {evex} rcr bl, cl +# CHECK: rcr bl, bl, cl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xdb] + rcr bl, bl, cl +# CHECK: 
{evex} rcr dx, cl +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xda] + {evex} rcr dx, cl +# CHECK: rcr dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xda] + rcr dx, dx, cl +# CHECK: {evex} rcr ecx, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xd9] + {evex} rcr ecx, cl +# CHECK: rcr ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xd9] + rcr ecx, ecx, cl +# CHECK: {evex} rcr r9, cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xd9] + {evex} rcr r9, cl +# CHECK: rcr r9, r9, cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xd9] + rcr r9, r9, cl +# CHECK: {evex} rcr byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcr byte ptr [r8 + 4*rax + 291], cl +# CHECK: rcr bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0x9c,0x80,0x23,0x01,0x00,0x00] + rcr bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rcr word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcr word ptr [r8 + 4*rax + 291], cl +# CHECK: rcr dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + rcr dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rcr dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcr dword ptr [r8 + 4*rax + 291], cl +# CHECK: rcr ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + rcr ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rcr qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcr qword ptr [r8 + 4*rax + 291], cl +# CHECK: rcr r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0x9c,0x80,0x23,0x01,0x00,0x00] + rcr r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rcr dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xda] + {evex} rcr dx +# CHECK: rcr dx, dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xda] + rcr dx, dx +# CHECK: {evex} rcr ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xd9] + {evex} rcr ecx +# CHECK: rcr ecx, ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xd9] + rcr ecx, ecx +# CHECK: {evex} rcr r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xd9] + {evex} rcr r9 +# CHECK: rcr r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xd9] + rcr r9, r9 +# CHECK: {evex} rcr byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcr byte ptr [r8 + 4*rax + 291] +# CHECK: rcr bl, byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0x9c,0x80,0x23,0x01,0x00,0x00] + rcr bl, byte ptr [r8 + 4*rax + 291] +# CHECK: {evex} rcr word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcr word ptr [r8 + 4*rax + 291] +# CHECK: rcr dx, word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + rcr dx, word ptr [r8 + 4*rax + 291] +# CHECK: {evex} rcr dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcr dword ptr [r8 + 4*rax + 291] +# CHECK: rcr ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + rcr ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: {evex} rcr qword ptr [r8 + 4*rax + 291] +# CHECK: 
encoding: [0x62,0xd4,0xfc,0x08,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + {evex} rcr qword ptr [r8 + 4*rax + 291] +# CHECK: rcr r9, qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0x9c,0x80,0x23,0x01,0x00,0x00] + rcr r9, qword ptr [r8 + 4*rax + 291] diff --git a/llvm/test/MC/X86/apx/rol-att.s b/llvm/test/MC/X86/apx/rol-att.s new file mode 100644 index 0000000000000..30c1c3a9ffe4c --- /dev/null +++ b/llvm/test/MC/X86/apx/rol-att.s @@ -0,0 +1,287 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-94: error: +# ERROR-NOT: error: +# CHECK: {evex} rolb $123, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xc3,0x7b] + {evex} rolb $123, %bl +# CHECK: {nf} rolb $123, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc0,0xc3,0x7b] + {nf} rolb $123, %bl +# CHECK: rolb $123, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xc3,0x7b] + rolb $123, %bl, %bl +# CHECK: {nf} rolb $123, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xc0,0xc3,0x7b] + {nf} rolb $123, %bl, %bl +# CHECK: {evex} rolw $123, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xc2,0x7b] + {evex} rolw $123, %dx +# CHECK: {nf} rolw $123, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xc1,0xc2,0x7b] + {nf} rolw $123, %dx +# CHECK: rolw $123, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xc2,0x7b] + rolw $123, %dx, %dx +# CHECK: {nf} rolw $123, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xc1,0xc2,0x7b] + {nf} rolw $123, %dx, %dx +# CHECK: {evex} roll $123, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xc1,0x7b] + {evex} roll $123, %ecx +# CHECK: {nf} roll $123, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc1,0xc1,0x7b] + {nf} roll $123, %ecx +# CHECK: roll $123, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xc1,0x7b] + roll $123, %ecx, %ecx +# CHECK: {nf} roll $123, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xc1,0xc1,0x7b] + {nf} roll $123, %ecx, %ecx +# CHECK: {evex} rolq $123, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xc1,0x7b] + {evex} rolq $123, %r9 +# CHECK: {nf} rolq $123, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xc1,0x7b] + {nf} rolq $123, %r9 +# CHECK: rolq $123, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xc1,0x7b] + rolq $123, %r9, %r9 +# CHECK: {nf} rolq $123, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xc1,0x7b] + {nf} rolq $123, %r9, %r9 +# CHECK: {evex} rolb $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rolb $123, 291(%r8,%rax,4) +# CHECK: {nf} rolb $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rolb $123, 291(%r8,%rax,4) +# CHECK: rolb $123, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + rolb $123, 291(%r8,%rax,4), %bl +# CHECK: {nf} rolb $123, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rolb $123, 291(%r8,%rax,4), %bl +# CHECK: {evex} rolw $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rolw $123, 291(%r8,%rax,4) +# CHECK: {nf} rolw $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rolw $123, 291(%r8,%rax,4) +# CHECK: rolw $123, 291(%r8,%rax,4), %dx +# CHECK: encoding: 
[0x62,0xd4,0x6d,0x18,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + rolw $123, 291(%r8,%rax,4), %dx +# CHECK: {nf} rolw $123, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rolw $123, 291(%r8,%rax,4), %dx +# CHECK: {evex} roll $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} roll $123, 291(%r8,%rax,4) +# CHECK: {nf} roll $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} roll $123, 291(%r8,%rax,4) +# CHECK: roll $123, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + roll $123, 291(%r8,%rax,4), %ecx +# CHECK: {nf} roll $123, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} roll $123, 291(%r8,%rax,4), %ecx +# CHECK: {evex} rolq $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rolq $123, 291(%r8,%rax,4) +# CHECK: {nf} rolq $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rolq $123, 291(%r8,%rax,4) +# CHECK: rolq $123, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + rolq $123, 291(%r8,%rax,4), %r9 +# CHECK: {nf} rolq $123, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rolq $123, 291(%r8,%rax,4), %r9 +# CHECK: {evex} rolb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xc3] + {evex} rolb %bl +# CHECK: {nf} rolb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd0,0xc3] + {nf} rolb %bl +# CHECK: {evex} rolb %cl, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xc3] + {evex} rolb %cl, %bl +# CHECK: {nf} rolb %cl, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd2,0xc3] + {nf} rolb %cl, %bl +# CHECK: rolb %cl, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xc3] + rolb %cl, %bl, %bl +# CHECK: {nf} rolb %cl, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xd2,0xc3] + {nf} rolb %cl, %bl, %bl +# CHECK: {evex} rolw %cl, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xc2] + {evex} rolw %cl, %dx +# CHECK: {nf} rolw %cl, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd3,0xc2] + {nf} rolw %cl, %dx +# CHECK: rolw %cl, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xc2] + rolw %cl, %dx, %dx +# CHECK: {nf} rolw %cl, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd3,0xc2] + {nf} rolw %cl, %dx, %dx +# CHECK: {evex} roll %cl, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xc1] + {evex} roll %cl, %ecx +# CHECK: {nf} roll %cl, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd3,0xc1] + {nf} roll %cl, %ecx +# CHECK: roll %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xc1] + roll %cl, %ecx, %ecx +# CHECK: {nf} roll %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd3,0xc1] + {nf} roll %cl, %ecx, %ecx +# CHECK: {evex} rolq %cl, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xc1] + {evex} rolq %cl, %r9 +# CHECK: {nf} rolq %cl, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xc1] + {nf} rolq %cl, %r9 +# CHECK: rolq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xc1] + rolq %cl, %r9, %r9 +# CHECK: {nf} rolq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xc1] + {nf} rolq %cl, %r9, %r9 +# CHECK: {evex} rolb %cl, 291(%r8,%rax,4) +# CHECK: encoding: 
[0x62,0xd4,0x7c,0x08,0xd2,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rolb %cl, 291(%r8,%rax,4) +# CHECK: {nf} rolb %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd2,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolb %cl, 291(%r8,%rax,4) +# CHECK: rolb %cl, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0x84,0x80,0x23,0x01,0x00,0x00] + rolb %cl, 291(%r8,%rax,4), %bl +# CHECK: {nf} rolb %cl, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd2,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolb %cl, 291(%r8,%rax,4), %bl +# CHECK: {evex} rolw %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rolw %cl, 291(%r8,%rax,4) +# CHECK: {nf} rolw %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolw %cl, 291(%r8,%rax,4) +# CHECK: rolw %cl, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + rolw %cl, 291(%r8,%rax,4), %dx +# CHECK: {nf} rolw %cl, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolw %cl, 291(%r8,%rax,4), %dx +# CHECK: {evex} roll %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} roll %cl, 291(%r8,%rax,4) +# CHECK: {nf} roll %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} roll %cl, 291(%r8,%rax,4) +# CHECK: roll %cl, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + roll %cl, 291(%r8,%rax,4), %ecx +# CHECK: {nf} roll %cl, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} roll %cl, 291(%r8,%rax,4), %ecx +# CHECK: {evex} rolq %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rolq %cl, 291(%r8,%rax,4) +# CHECK: {nf} rolq %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolq %cl, 291(%r8,%rax,4) +# CHECK: rolq %cl, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + rolq %cl, 291(%r8,%rax,4), %r9 +# CHECK: {nf} rolq %cl, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolq %cl, 291(%r8,%rax,4), %r9 +# CHECK: {evex} rolw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xc2] + {evex} rolw %dx +# CHECK: {nf} rolw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd1,0xc2] + {nf} rolw %dx +# CHECK: rolw %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xc2] + rolw %dx, %dx +# CHECK: {nf} rolw %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd1,0xc2] + {nf} rolw %dx, %dx +# CHECK: {evex} roll %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xc1] + {evex} roll %ecx +# CHECK: {nf} roll %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd1,0xc1] + {nf} roll %ecx +# CHECK: roll %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xc1] + roll %ecx, %ecx +# CHECK: {nf} roll %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd1,0xc1] + {nf} roll %ecx, %ecx +# CHECK: {evex} rolq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xc1] + {evex} rolq %r9 +# CHECK: {nf} rolq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xc1] + {nf} rolq %r9 +# CHECK: rolq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xc1] + rolq %r9, %r9 +# CHECK: {nf} rolq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xc1] + 
{nf} rolq %r9, %r9 +# CHECK: {evex} rolb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rolb 291(%r8,%rax,4) +# CHECK: {nf} rolb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd0,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolb 291(%r8,%rax,4) +# CHECK: rolb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0x84,0x80,0x23,0x01,0x00,0x00] + rolb 291(%r8,%rax,4), %bl +# CHECK: {nf} rolb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd0,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolb 291(%r8,%rax,4), %bl +# CHECK: {evex} rolw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rolw 291(%r8,%rax,4) +# CHECK: {nf} rolw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolw 291(%r8,%rax,4) +# CHECK: rolw 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + rolw 291(%r8,%rax,4), %dx +# CHECK: {nf} rolw 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolw 291(%r8,%rax,4), %dx +# CHECK: {evex} roll 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} roll 291(%r8,%rax,4) +# CHECK: {nf} roll 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} roll 291(%r8,%rax,4) +# CHECK: roll 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + roll 291(%r8,%rax,4), %ecx +# CHECK: {nf} roll 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} roll 291(%r8,%rax,4), %ecx +# CHECK: {evex} rolq 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rolq 291(%r8,%rax,4) +# CHECK: {nf} rolq 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolq 291(%r8,%rax,4) +# CHECK: rolq 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + rolq 291(%r8,%rax,4), %r9 +# CHECK: {nf} rolq 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rolq 291(%r8,%rax,4), %r9 diff --git a/llvm/test/MC/X86/apx/rol-intel.s b/llvm/test/MC/X86/apx/rol-intel.s new file mode 100644 index 0000000000000..87d2914c9d545 --- /dev/null +++ b/llvm/test/MC/X86/apx/rol-intel.s @@ -0,0 +1,284 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s + +# CHECK: {evex} rol bl, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xc3,0x7b] + {evex} rol bl, 123 +# CHECK: {nf} rol bl, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc0,0xc3,0x7b] + {nf} rol bl, 123 +# CHECK: rol bl, bl, 123 +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xc3,0x7b] + rol bl, bl, 123 +# CHECK: {nf} rol bl, bl, 123 +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xc0,0xc3,0x7b] + {nf} rol bl, bl, 123 +# CHECK: {evex} rol dx, 123 +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xc2,0x7b] + {evex} rol dx, 123 +# CHECK: {nf} rol dx, 123 +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xc1,0xc2,0x7b] + {nf} rol dx, 123 +# CHECK: rol dx, dx, 123 +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xc2,0x7b] + rol dx, dx, 123 +# CHECK: {nf} rol dx, dx, 123 +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xc1,0xc2,0x7b] + {nf} rol dx, dx, 123 +# CHECK: {evex} rol ecx, 123 +# CHECK: encoding: 
[0x62,0xf4,0x7c,0x08,0xc1,0xc1,0x7b] + {evex} rol ecx, 123 +# CHECK: {nf} rol ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc1,0xc1,0x7b] + {nf} rol ecx, 123 +# CHECK: rol ecx, ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xc1,0x7b] + rol ecx, ecx, 123 +# CHECK: {nf} rol ecx, ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xc1,0xc1,0x7b] + {nf} rol ecx, ecx, 123 +# CHECK: {evex} rol r9, 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xc1,0x7b] + {evex} rol r9, 123 +# CHECK: {nf} rol r9, 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xc1,0x7b] + {nf} rol r9, 123 +# CHECK: rol r9, r9, 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xc1,0x7b] + rol r9, r9, 123 +# CHECK: {nf} rol r9, r9, 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xc1,0x7b] + {nf} rol r9, r9, 123 +# CHECK: {evex} rol byte ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rol byte ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} rol byte ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rol byte ptr [r8 + 4*rax + 291], 123 +# CHECK: rol bl, byte ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + rol bl, byte ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} rol bl, byte ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xc0,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rol bl, byte ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rol word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rol word ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} rol word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rol word ptr [r8 + 4*rax + 291], 123 +# CHECK: rol dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + rol dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} rol dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rol dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rol dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rol dword ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} rol dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rol dword ptr [r8 + 4*rax + 291], 123 +# CHECK: rol ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + rol ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} rol ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rol ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rol qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rol qword ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} rol qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rol qword ptr [r8 + 4*rax + 291], 123 +# CHECK: rol r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + rol r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} rol r9, qword 
ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0x84,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rol r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} rol bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xc3] + {evex} rol bl +# CHECK: {nf} rol bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd0,0xc3] + {nf} rol bl +# CHECK: {evex} rol bl, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xc3] + {evex} rol bl, cl +# CHECK: {nf} rol bl, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd2,0xc3] + {nf} rol bl, cl +# CHECK: rol bl, bl, cl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xc3] + rol bl, bl, cl +# CHECK: {nf} rol bl, bl, cl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xd2,0xc3] + {nf} rol bl, bl, cl +# CHECK: {evex} rol dx, cl +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xc2] + {evex} rol dx, cl +# CHECK: {nf} rol dx, cl +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd3,0xc2] + {nf} rol dx, cl +# CHECK: rol dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xc2] + rol dx, dx, cl +# CHECK: {nf} rol dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd3,0xc2] + {nf} rol dx, dx, cl +# CHECK: {evex} rol ecx, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xc1] + {evex} rol ecx, cl +# CHECK: {nf} rol ecx, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd3,0xc1] + {nf} rol ecx, cl +# CHECK: rol ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xc1] + rol ecx, ecx, cl +# CHECK: {nf} rol ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd3,0xc1] + {nf} rol ecx, ecx, cl +# CHECK: {evex} rol r9, cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xc1] + {evex} rol r9, cl +# CHECK: {nf} rol r9, cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xc1] + {nf} rol r9, cl +# CHECK: rol r9, r9, cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xc1] + rol r9, r9, cl +# CHECK: {nf} rol r9, r9, cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xc1] + {nf} rol r9, r9, cl +# CHECK: {evex} rol byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rol byte ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} rol byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd2,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rol byte ptr [r8 + 4*rax + 291], cl +# CHECK: rol bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0x84,0x80,0x23,0x01,0x00,0x00] + rol bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} rol bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd2,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rol bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rol word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rol word ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} rol word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rol word ptr [r8 + 4*rax + 291], cl +# CHECK: rol dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + rol dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} rol dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rol dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rol dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rol dword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} rol dword ptr [r8 + 4*rax + 291], cl +# CHECK: 
encoding: [0x62,0xd4,0x7c,0x0c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rol dword ptr [r8 + 4*rax + 291], cl +# CHECK: rol ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + rol ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} rol ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rol ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rol qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rol qword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} rol qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rol qword ptr [r8 + 4*rax + 291], cl +# CHECK: rol r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + rol r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} rol r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rol r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} rol dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xc2] + {evex} rol dx +# CHECK: {nf} rol dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd1,0xc2] + {nf} rol dx +# CHECK: rol dx, dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xc2] + rol dx, dx +# CHECK: {nf} rol dx, dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd1,0xc2] + {nf} rol dx, dx +# CHECK: {evex} rol ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xc1] + {evex} rol ecx +# CHECK: {nf} rol ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd1,0xc1] + {nf} rol ecx +# CHECK: rol ecx, ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xc1] + rol ecx, ecx +# CHECK: {nf} rol ecx, ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd1,0xc1] + {nf} rol ecx, ecx +# CHECK: {evex} rol r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xc1] + {evex} rol r9 +# CHECK: {nf} rol r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xc1] + {nf} rol r9 +# CHECK: rol r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xc1] + rol r9, r9 +# CHECK: {nf} rol r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xc1] + {nf} rol r9, r9 +# CHECK: {evex} rol byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rol byte ptr [r8 + 4*rax + 291] +# CHECK: {nf} rol byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd0,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rol byte ptr [r8 + 4*rax + 291] +# CHECK: rol bl, byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0x84,0x80,0x23,0x01,0x00,0x00] + rol bl, byte ptr [r8 + 4*rax + 291] +# CHECK: {nf} rol bl, byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd0,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rol bl, byte ptr [r8 + 4*rax + 291] +# CHECK: {evex} rol word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {evex} rol word ptr [r8 + 4*rax + 291] +# CHECK: {nf} rol word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} rol word ptr [r8 + 4*rax + 291] +# CHECK: rol dx, word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + rol dx, word ptr [r8 + 4*rax + 291] +# CHECK: {nf} rol dx, word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00] + {nf} 
rol dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} rol dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0x84,0x80,0x23,0x01,0x00,0x00]
+ {evex} rol dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} rol dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00]
+ {nf} rol dword ptr [r8 + 4*rax + 291]
+# CHECK: rol ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0x84,0x80,0x23,0x01,0x00,0x00]
+ rol ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} rol ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00]
+ {nf} rol ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} rol qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0x84,0x80,0x23,0x01,0x00,0x00]
+ {evex} rol qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} rol qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00]
+ {nf} rol qword ptr [r8 + 4*rax + 291]
+# CHECK: rol r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0x84,0x80,0x23,0x01,0x00,0x00]
+ rol r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} rol r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0x84,0x80,0x23,0x01,0x00,0x00]
+ {nf} rol r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/apx/ror-att.s b/llvm/test/MC/X86/apx/ror-att.s
new file mode 100644
index 0000000000000..aa877f20e4e5d
--- /dev/null
+++ b/llvm/test/MC/X86/apx/ror-att.s
@@ -0,0 +1,287 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-94: error:
+# ERROR-NOT: error:
+# CHECK: {evex} rorb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xcb,0x7b]
+ {evex} rorb $123, %bl
+# CHECK: {nf} rorb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc0,0xcb,0x7b]
+ {nf} rorb $123, %bl
+# CHECK: rorb $123, %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xcb,0x7b]
+ rorb $123, %bl, %bl
+# CHECK: {nf} rorb $123, %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xc0,0xcb,0x7b]
+ {nf} rorb $123, %bl, %bl
+# CHECK: {evex} rorw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xca,0x7b]
+ {evex} rorw $123, %dx
+# CHECK: {nf} rorw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xc1,0xca,0x7b]
+ {nf} rorw $123, %dx
+# CHECK: rorw $123, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xca,0x7b]
+ rorw $123, %dx, %dx
+# CHECK: {nf} rorw $123, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xc1,0xca,0x7b]
+ {nf} rorw $123, %dx, %dx
+# CHECK: {evex} rorl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xc9,0x7b]
+ {evex} rorl $123, %ecx
+# CHECK: {nf} rorl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc1,0xc9,0x7b]
+ {nf} rorl $123, %ecx
+# CHECK: rorl $123, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xc9,0x7b]
+ rorl $123, %ecx, %ecx
+# CHECK: {nf} rorl $123, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xc1,0xc9,0x7b]
+ {nf} rorl $123, %ecx, %ecx
+# CHECK: {evex} rorq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xc9,0x7b]
+ {evex} rorq $123, %r9
+# CHECK: {nf} rorq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xc9,0x7b]
+ {nf} rorq $123, %r9
+# CHECK: rorq $123, %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xc9,0x7b]
+ rorq $123, %r9, %r9
+# CHECK: {nf} rorq $123, %r9, %r9
+# CHECK: encoding:
[0x62,0xd4,0xb4,0x1c,0xc1,0xc9,0x7b] + {nf} rorq $123, %r9, %r9 +# CHECK: {evex} rorb $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rorb $123, 291(%r8,%rax,4) +# CHECK: {nf} rorb $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rorb $123, 291(%r8,%rax,4) +# CHECK: rorb $123, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + rorb $123, 291(%r8,%rax,4), %bl +# CHECK: {nf} rorb $123, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rorb $123, 291(%r8,%rax,4), %bl +# CHECK: {evex} rorw $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rorw $123, 291(%r8,%rax,4) +# CHECK: {nf} rorw $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rorw $123, 291(%r8,%rax,4) +# CHECK: rorw $123, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + rorw $123, 291(%r8,%rax,4), %dx +# CHECK: {nf} rorw $123, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rorw $123, 291(%r8,%rax,4), %dx +# CHECK: {evex} rorl $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rorl $123, 291(%r8,%rax,4) +# CHECK: {nf} rorl $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rorl $123, 291(%r8,%rax,4) +# CHECK: rorl $123, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + rorl $123, 291(%r8,%rax,4), %ecx +# CHECK: {nf} rorl $123, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rorl $123, 291(%r8,%rax,4), %ecx +# CHECK: {evex} rorq $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} rorq $123, 291(%r8,%rax,4) +# CHECK: {nf} rorq $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rorq $123, 291(%r8,%rax,4) +# CHECK: rorq $123, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + rorq $123, 291(%r8,%rax,4), %r9 +# CHECK: {nf} rorq $123, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} rorq $123, 291(%r8,%rax,4), %r9 +# CHECK: {evex} rorb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xcb] + {evex} rorb %bl +# CHECK: {nf} rorb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd0,0xcb] + {nf} rorb %bl +# CHECK: {evex} rorb %cl, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xcb] + {evex} rorb %cl, %bl +# CHECK: {nf} rorb %cl, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd2,0xcb] + {nf} rorb %cl, %bl +# CHECK: rorb %cl, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xcb] + rorb %cl, %bl, %bl +# CHECK: {nf} rorb %cl, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xd2,0xcb] + {nf} rorb %cl, %bl, %bl +# CHECK: {evex} rorw %cl, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xca] + {evex} rorw %cl, %dx +# CHECK: {nf} rorw %cl, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd3,0xca] + {nf} rorw %cl, %dx +# CHECK: rorw %cl, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xca] + 
rorw %cl, %dx, %dx +# CHECK: {nf} rorw %cl, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd3,0xca] + {nf} rorw %cl, %dx, %dx +# CHECK: {evex} rorl %cl, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xc9] + {evex} rorl %cl, %ecx +# CHECK: {nf} rorl %cl, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd3,0xc9] + {nf} rorl %cl, %ecx +# CHECK: rorl %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xc9] + rorl %cl, %ecx, %ecx +# CHECK: {nf} rorl %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd3,0xc9] + {nf} rorl %cl, %ecx, %ecx +# CHECK: {evex} rorq %cl, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xc9] + {evex} rorq %cl, %r9 +# CHECK: {nf} rorq %cl, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xc9] + {nf} rorq %cl, %r9 +# CHECK: rorq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xc9] + rorq %cl, %r9, %r9 +# CHECK: {nf} rorq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xc9] + {nf} rorq %cl, %r9, %r9 +# CHECK: {evex} rorb %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} rorb %cl, 291(%r8,%rax,4) +# CHECK: {nf} rorb %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} rorb %cl, 291(%r8,%rax,4) +# CHECK: rorb %cl, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00] + rorb %cl, 291(%r8,%rax,4), %bl +# CHECK: {nf} rorb %cl, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} rorb %cl, 291(%r8,%rax,4), %bl +# CHECK: {evex} rorw %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} rorw %cl, 291(%r8,%rax,4) +# CHECK: {nf} rorw %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} rorw %cl, 291(%r8,%rax,4) +# CHECK: rorw %cl, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + rorw %cl, 291(%r8,%rax,4), %dx +# CHECK: {nf} rorw %cl, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} rorw %cl, 291(%r8,%rax,4), %dx +# CHECK: {evex} rorl %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} rorl %cl, 291(%r8,%rax,4) +# CHECK: {nf} rorl %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} rorl %cl, 291(%r8,%rax,4) +# CHECK: rorl %cl, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + rorl %cl, 291(%r8,%rax,4), %ecx +# CHECK: {nf} rorl %cl, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} rorl %cl, 291(%r8,%rax,4), %ecx +# CHECK: {evex} rorq %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} rorq %cl, 291(%r8,%rax,4) +# CHECK: {nf} rorq %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} rorq %cl, 291(%r8,%rax,4) +# CHECK: rorq %cl, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + rorq %cl, 291(%r8,%rax,4), %r9 +# CHECK: {nf} rorq %cl, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} rorq %cl, 291(%r8,%rax,4), %r9 +# CHECK: {evex} rorw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xca] + {evex} rorw 
%dx
+# CHECK: {nf} rorw %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd1,0xca]
+ {nf} rorw %dx
+# CHECK: rorw %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xca]
+ rorw %dx, %dx
+# CHECK: {nf} rorw %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd1,0xca]
+ {nf} rorw %dx, %dx
+# CHECK: {evex} rorl %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xc9]
+ {evex} rorl %ecx
+# CHECK: {nf} rorl %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd1,0xc9]
+ {nf} rorl %ecx
+# CHECK: rorl %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xc9]
+ rorl %ecx, %ecx
+# CHECK: {nf} rorl %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd1,0xc9]
+ {nf} rorl %ecx, %ecx
+# CHECK: {evex} rorq %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xc9]
+ {evex} rorq %r9
+# CHECK: {nf} rorq %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xc9]
+ {nf} rorq %r9
+# CHECK: rorq %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xc9]
+ rorq %r9, %r9
+# CHECK: {nf} rorq %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xc9]
+ {nf} rorq %r9, %r9
+# CHECK: {evex} rorb 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} rorb 291(%r8,%rax,4)
+# CHECK: {nf} rorb 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} rorb 291(%r8,%rax,4)
+# CHECK: rorb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00]
+ rorb 291(%r8,%rax,4), %bl
+# CHECK: {nf} rorb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} rorb 291(%r8,%rax,4), %bl
+# CHECK: {evex} rorw 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} rorw 291(%r8,%rax,4)
+# CHECK: {nf} rorw 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} rorw 291(%r8,%rax,4)
+# CHECK: rorw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ rorw 291(%r8,%rax,4), %dx
+# CHECK: {nf} rorw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} rorw 291(%r8,%rax,4), %dx
+# CHECK: {evex} rorl 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} rorl 291(%r8,%rax,4)
+# CHECK: {nf} rorl 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} rorl 291(%r8,%rax,4)
+# CHECK: rorl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ rorl 291(%r8,%rax,4), %ecx
+# CHECK: {nf} rorl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} rorl 291(%r8,%rax,4), %ecx
+# CHECK: {evex} rorq 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} rorq 291(%r8,%rax,4)
+# CHECK: {nf} rorq 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} rorq 291(%r8,%rax,4)
+# CHECK: rorq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ rorq 291(%r8,%rax,4), %r9
+# CHECK: {nf} rorq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} rorq 291(%r8,%rax,4), %r9
diff --git a/llvm/test/MC/X86/apx/ror-intel.s b/llvm/test/MC/X86/apx/ror-intel.s
new file mode 100644
index 0000000000000..27e2d6b6d24e7
--- /dev/null
+++ b/llvm/test/MC/X86/apx/ror-intel.s
@@ -0,0 +1,284 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} ror bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xcb,0x7b]
+ {evex} ror bl, 123
+# CHECK: {nf} ror bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc0,0xcb,0x7b]
+ {nf} ror bl, 123
+# CHECK: ror bl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xcb,0x7b]
+ ror bl, bl, 123
+# CHECK: {nf} ror bl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xc0,0xcb,0x7b]
+ {nf} ror bl, bl, 123
+# CHECK: {evex} ror dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xca,0x7b]
+ {evex} ror dx, 123
+# CHECK: {nf} ror dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xc1,0xca,0x7b]
+ {nf} ror dx, 123
+# CHECK: ror dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xca,0x7b]
+ ror dx, dx, 123
+# CHECK: {nf} ror dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xc1,0xca,0x7b]
+ {nf} ror dx, dx, 123
+# CHECK: {evex} ror ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xc9,0x7b]
+ {evex} ror ecx, 123
+# CHECK: {nf} ror ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc1,0xc9,0x7b]
+ {nf} ror ecx, 123
+# CHECK: ror ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xc9,0x7b]
+ ror ecx, ecx, 123
+# CHECK: {nf} ror ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xc1,0xc9,0x7b]
+ {nf} ror ecx, ecx, 123
+# CHECK: {evex} ror r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xc9,0x7b]
+ {evex} ror r9, 123
+# CHECK: {nf} ror r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xc9,0x7b]
+ {nf} ror r9, 123
+# CHECK: ror r9, r9, 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xc9,0x7b]
+ ror r9, r9, 123
+# CHECK: {nf} ror r9, r9, 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xc9,0x7b]
+ {nf} ror r9, r9, 123
+# CHECK: {evex} ror byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} ror byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} ror byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} ror byte ptr [r8 + 4*rax + 291], 123
+# CHECK: ror bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ ror bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} ror bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xc0,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} ror bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} ror word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} ror word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} ror word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} ror word ptr [r8 + 4*rax + 291], 123
+# CHECK: ror dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ ror dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} ror dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} ror dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} ror dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} ror dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} ror dword ptr [r8 + 4*rax + 291], 123
+# CHECK:
encoding: [0x62,0xd4,0x7c,0x0c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} ror dword ptr [r8 + 4*rax + 291], 123 +# CHECK: ror ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + ror ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} ror ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} ror ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} ror qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} ror qword ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} ror qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} ror qword ptr [r8 + 4*rax + 291], 123 +# CHECK: ror r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + ror r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} ror r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} ror r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} ror bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xcb] + {evex} ror bl +# CHECK: {nf} ror bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd0,0xcb] + {nf} ror bl +# CHECK: {evex} ror bl, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xcb] + {evex} ror bl, cl +# CHECK: {nf} ror bl, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd2,0xcb] + {nf} ror bl, cl +# CHECK: ror bl, bl, cl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xcb] + ror bl, bl, cl +# CHECK: {nf} ror bl, bl, cl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xd2,0xcb] + {nf} ror bl, bl, cl +# CHECK: {evex} ror dx, cl +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xca] + {evex} ror dx, cl +# CHECK: {nf} ror dx, cl +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd3,0xca] + {nf} ror dx, cl +# CHECK: ror dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xca] + ror dx, dx, cl +# CHECK: {nf} ror dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd3,0xca] + {nf} ror dx, dx, cl +# CHECK: {evex} ror ecx, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xc9] + {evex} ror ecx, cl +# CHECK: {nf} ror ecx, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd3,0xc9] + {nf} ror ecx, cl +# CHECK: ror ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xc9] + ror ecx, ecx, cl +# CHECK: {nf} ror ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd3,0xc9] + {nf} ror ecx, ecx, cl +# CHECK: {evex} ror r9, cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xc9] + {evex} ror r9, cl +# CHECK: {nf} ror r9, cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xc9] + {nf} ror r9, cl +# CHECK: ror r9, r9, cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xc9] + ror r9, r9, cl +# CHECK: {nf} ror r9, r9, cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xc9] + {nf} ror r9, r9, cl +# CHECK: {evex} ror byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} ror byte ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} ror byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} ror byte ptr [r8 + 4*rax + 291], cl +# CHECK: ror bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00] + ror bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} ror bl, byte ptr [r8 + 4*rax + 291], cl 
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd2,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} ror bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} ror word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} ror word ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} ror word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} ror word ptr [r8 + 4*rax + 291], cl +# CHECK: ror dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + ror dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} ror dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} ror dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} ror dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} ror dword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} ror dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} ror dword ptr [r8 + 4*rax + 291], cl +# CHECK: ror ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + ror ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} ror ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} ror ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} ror qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} ror qword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} ror qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} ror qword ptr [r8 + 4*rax + 291], cl +# CHECK: ror r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + ror r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} ror r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} ror r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} ror dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xca] + {evex} ror dx +# CHECK: {nf} ror dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd1,0xca] + {nf} ror dx +# CHECK: ror dx, dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xca] + ror dx, dx +# CHECK: {nf} ror dx, dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd1,0xca] + {nf} ror dx, dx +# CHECK: {evex} ror ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xc9] + {evex} ror ecx +# CHECK: {nf} ror ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd1,0xc9] + {nf} ror ecx +# CHECK: ror ecx, ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xc9] + ror ecx, ecx +# CHECK: {nf} ror ecx, ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd1,0xc9] + {nf} ror ecx, ecx +# CHECK: {evex} ror r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xc9] + {evex} ror r9 +# CHECK: {nf} ror r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xc9] + {nf} ror r9 +# CHECK: ror r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xc9] + ror r9, r9 +# CHECK: {nf} ror r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xc9] + {nf} ror r9, r9 +# CHECK: {evex} ror byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} ror byte ptr [r8 + 4*rax + 291] +# CHECK: {nf} ror byte ptr [r8 + 4*rax + 291] +# CHECK: 
encoding: [0x62,0xd4,0x7c,0x0c,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} ror byte ptr [r8 + 4*rax + 291]
+# CHECK: ror bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00]
+ ror bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} ror bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd0,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} ror bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} ror word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} ror word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} ror word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} ror word ptr [r8 + 4*rax + 291]
+# CHECK: ror dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ ror dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} ror dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} ror dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} ror dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} ror dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} ror dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} ror dword ptr [r8 + 4*rax + 291]
+# CHECK: ror ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ ror ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} ror ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} ror ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} ror qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} ror qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} ror qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} ror qword ptr [r8 + 4*rax + 291]
+# CHECK: ror r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ ror r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} ror r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} ror r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/apx/sar-att.s b/llvm/test/MC/X86/apx/sar-att.s
new file mode 100644
index 0000000000000..9ab96f277bc70
--- /dev/null
+++ b/llvm/test/MC/X86/apx/sar-att.s
@@ -0,0 +1,287 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-94: error:
+# ERROR-NOT: error:
+# CHECK: {evex} sarb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xfb,0x7b]
+ {evex} sarb $123, %bl
+# CHECK: {nf} sarb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc0,0xfb,0x7b]
+ {nf} sarb $123, %bl
+# CHECK: sarb $123, %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xfb,0x7b]
+ sarb $123, %bl, %bl
+# CHECK: {nf} sarb $123, %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xc0,0xfb,0x7b]
+ {nf} sarb $123, %bl, %bl
+# CHECK: {evex} sarw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xfa,0x7b]
+ {evex} sarw $123, %dx
+# CHECK: {nf} sarw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xc1,0xfa,0x7b]
+ {nf} sarw $123, %dx
+# CHECK: sarw $123, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xfa,0x7b] + sarw $123, %dx, %dx +# CHECK: {nf} sarw $123, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xc1,0xfa,0x7b] + {nf} sarw $123, %dx, %dx +# CHECK: {evex} sarl $123, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xf9,0x7b] + {evex} sarl $123, %ecx +# CHECK: {nf} sarl $123, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc1,0xf9,0x7b] + {nf} sarl $123, %ecx +# CHECK: sarl $123, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xf9,0x7b] + sarl $123, %ecx, %ecx +# CHECK: {nf} sarl $123, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xc1,0xf9,0x7b] + {nf} sarl $123, %ecx, %ecx +# CHECK: {evex} sarq $123, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xf9,0x7b] + {evex} sarq $123, %r9 +# CHECK: {nf} sarq $123, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xf9,0x7b] + {nf} sarq $123, %r9 +# CHECK: sarq $123, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xf9,0x7b] + sarq $123, %r9, %r9 +# CHECK: {nf} sarq $123, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xf9,0x7b] + {nf} sarq $123, %r9, %r9 +# CHECK: {evex} sarb $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} sarb $123, 291(%r8,%rax,4) +# CHECK: {nf} sarb $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sarb $123, 291(%r8,%rax,4) +# CHECK: sarb $123, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + sarb $123, 291(%r8,%rax,4), %bl +# CHECK: {nf} sarb $123, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sarb $123, 291(%r8,%rax,4), %bl +# CHECK: {evex} sarw $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} sarw $123, 291(%r8,%rax,4) +# CHECK: {nf} sarw $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sarw $123, 291(%r8,%rax,4) +# CHECK: sarw $123, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + sarw $123, 291(%r8,%rax,4), %dx +# CHECK: {nf} sarw $123, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sarw $123, 291(%r8,%rax,4), %dx +# CHECK: {evex} sarl $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} sarl $123, 291(%r8,%rax,4) +# CHECK: {nf} sarl $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sarl $123, 291(%r8,%rax,4) +# CHECK: sarl $123, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + sarl $123, 291(%r8,%rax,4), %ecx +# CHECK: {nf} sarl $123, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sarl $123, 291(%r8,%rax,4), %ecx +# CHECK: {evex} sarq $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} sarq $123, 291(%r8,%rax,4) +# CHECK: {nf} sarq $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sarq $123, 291(%r8,%rax,4) +# CHECK: sarq $123, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + sarq $123, 291(%r8,%rax,4), %r9 +# CHECK: {nf} sarq 
$123, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sarq $123, 291(%r8,%rax,4), %r9 +# CHECK: {evex} sarb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xfb] + {evex} sarb %bl +# CHECK: {nf} sarb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd0,0xfb] + {nf} sarb %bl +# CHECK: {evex} sarb %cl, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xfb] + {evex} sarb %cl, %bl +# CHECK: {nf} sarb %cl, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd2,0xfb] + {nf} sarb %cl, %bl +# CHECK: sarb %cl, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xfb] + sarb %cl, %bl, %bl +# CHECK: {nf} sarb %cl, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xd2,0xfb] + {nf} sarb %cl, %bl, %bl +# CHECK: {evex} sarw %cl, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xfa] + {evex} sarw %cl, %dx +# CHECK: {nf} sarw %cl, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd3,0xfa] + {nf} sarw %cl, %dx +# CHECK: sarw %cl, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xfa] + sarw %cl, %dx, %dx +# CHECK: {nf} sarw %cl, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd3,0xfa] + {nf} sarw %cl, %dx, %dx +# CHECK: {evex} sarl %cl, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xf9] + {evex} sarl %cl, %ecx +# CHECK: {nf} sarl %cl, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd3,0xf9] + {nf} sarl %cl, %ecx +# CHECK: sarl %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xf9] + sarl %cl, %ecx, %ecx +# CHECK: {nf} sarl %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd3,0xf9] + {nf} sarl %cl, %ecx, %ecx +# CHECK: {evex} sarq %cl, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xf9] + {evex} sarq %cl, %r9 +# CHECK: {nf} sarq %cl, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xf9] + {nf} sarq %cl, %r9 +# CHECK: sarq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xf9] + sarq %cl, %r9, %r9 +# CHECK: {nf} sarq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xf9] + {nf} sarq %cl, %r9, %r9 +# CHECK: {evex} sarb %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00] + {evex} sarb %cl, 291(%r8,%rax,4) +# CHECK: {nf} sarb %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarb %cl, 291(%r8,%rax,4) +# CHECK: sarb %cl, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00] + sarb %cl, 291(%r8,%rax,4), %bl +# CHECK: {nf} sarb %cl, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarb %cl, 291(%r8,%rax,4), %bl +# CHECK: {evex} sarw %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {evex} sarw %cl, 291(%r8,%rax,4) +# CHECK: {nf} sarw %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarw %cl, 291(%r8,%rax,4) +# CHECK: sarw %cl, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + sarw %cl, 291(%r8,%rax,4), %dx +# CHECK: {nf} sarw %cl, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarw %cl, 291(%r8,%rax,4), %dx +# CHECK: {evex} sarl %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {evex} sarl %cl, 291(%r8,%rax,4) +# CHECK: {nf} sarl %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} 
sarl %cl, 291(%r8,%rax,4) +# CHECK: sarl %cl, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + sarl %cl, 291(%r8,%rax,4), %ecx +# CHECK: {nf} sarl %cl, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarl %cl, 291(%r8,%rax,4), %ecx +# CHECK: {evex} sarq %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {evex} sarq %cl, 291(%r8,%rax,4) +# CHECK: {nf} sarq %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarq %cl, 291(%r8,%rax,4) +# CHECK: sarq %cl, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + sarq %cl, 291(%r8,%rax,4), %r9 +# CHECK: {nf} sarq %cl, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarq %cl, 291(%r8,%rax,4), %r9 +# CHECK: {evex} sarw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xfa] + {evex} sarw %dx +# CHECK: {nf} sarw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd1,0xfa] + {nf} sarw %dx +# CHECK: sarw %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xfa] + sarw %dx, %dx +# CHECK: {nf} sarw %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd1,0xfa] + {nf} sarw %dx, %dx +# CHECK: {evex} sarl %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xf9] + {evex} sarl %ecx +# CHECK: {nf} sarl %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd1,0xf9] + {nf} sarl %ecx +# CHECK: sarl %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xf9] + sarl %ecx, %ecx +# CHECK: {nf} sarl %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd1,0xf9] + {nf} sarl %ecx, %ecx +# CHECK: {evex} sarq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xf9] + {evex} sarq %r9 +# CHECK: {nf} sarq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xf9] + {nf} sarq %r9 +# CHECK: sarq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xf9] + sarq %r9, %r9 +# CHECK: {nf} sarq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xf9] + {nf} sarq %r9, %r9 +# CHECK: {evex} sarb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00] + {evex} sarb 291(%r8,%rax,4) +# CHECK: {nf} sarb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarb 291(%r8,%rax,4) +# CHECK: sarb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00] + sarb 291(%r8,%rax,4), %bl +# CHECK: {nf} sarb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarb 291(%r8,%rax,4), %bl +# CHECK: {evex} sarw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00] + {evex} sarw 291(%r8,%rax,4) +# CHECK: {nf} sarw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarw 291(%r8,%rax,4) +# CHECK: sarw 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00] + sarw 291(%r8,%rax,4), %dx +# CHECK: {nf} sarw 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarw 291(%r8,%rax,4), %dx +# CHECK: {evex} sarl 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00] + {evex} sarl 291(%r8,%rax,4) +# CHECK: {nf} sarl 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sarl 
291(%r8,%rax,4)
+# CHECK: sarl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ sarl 291(%r8,%rax,4), %ecx
+# CHECK: {nf} sarl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {nf} sarl 291(%r8,%rax,4), %ecx
+# CHECK: {evex} sarq 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {evex} sarq 291(%r8,%rax,4)
+# CHECK: {nf} sarq 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {nf} sarq 291(%r8,%rax,4)
+# CHECK: sarq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ sarq 291(%r8,%rax,4), %r9
+# CHECK: {nf} sarq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {nf} sarq 291(%r8,%rax,4), %r9
diff --git a/llvm/test/MC/X86/apx/sar-intel.s b/llvm/test/MC/X86/apx/sar-intel.s
new file mode 100644
index 0000000000000..946758d4118c4
--- /dev/null
+++ b/llvm/test/MC/X86/apx/sar-intel.s
@@ -0,0 +1,284 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} sar bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xfb,0x7b]
+ {evex} sar bl, 123
+# CHECK: {nf} sar bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc0,0xfb,0x7b]
+ {nf} sar bl, 123
+# CHECK: sar bl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xfb,0x7b]
+ sar bl, bl, 123
+# CHECK: {nf} sar bl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xc0,0xfb,0x7b]
+ {nf} sar bl, bl, 123
+# CHECK: {evex} sar dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xfa,0x7b]
+ {evex} sar dx, 123
+# CHECK: {nf} sar dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xc1,0xfa,0x7b]
+ {nf} sar dx, 123
+# CHECK: sar dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xfa,0x7b]
+ sar dx, dx, 123
+# CHECK: {nf} sar dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xc1,0xfa,0x7b]
+ {nf} sar dx, dx, 123
+# CHECK: {evex} sar ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xf9,0x7b]
+ {evex} sar ecx, 123
+# CHECK: {nf} sar ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc1,0xf9,0x7b]
+ {nf} sar ecx, 123
+# CHECK: sar ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xf9,0x7b]
+ sar ecx, ecx, 123
+# CHECK: {nf} sar ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xc1,0xf9,0x7b]
+ {nf} sar ecx, ecx, 123
+# CHECK: {evex} sar r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xf9,0x7b]
+ {evex} sar r9, 123
+# CHECK: {nf} sar r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xf9,0x7b]
+ {nf} sar r9, 123
+# CHECK: sar r9, r9, 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xf9,0x7b]
+ sar r9, r9, 123
+# CHECK: {nf} sar r9, r9, 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xf9,0x7b]
+ {nf} sar r9, r9, 123
+# CHECK: {evex} sar byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} sar byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} sar byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} sar byte ptr [r8 + 4*rax + 291], 123
+# CHECK: sar bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b]
+ sar bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} sar bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding:
[0x62,0xd4,0x64,0x1c,0xc0,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sar bl, byte ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} sar word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} sar word ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} sar word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sar word ptr [r8 + 4*rax + 291], 123 +# CHECK: sar dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + sar dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} sar dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sar dx, word ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} sar dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} sar dword ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} sar dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sar dword ptr [r8 + 4*rax + 291], 123 +# CHECK: sar ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + sar ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} sar ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sar ecx, dword ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} sar qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} sar qword ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} sar qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sar qword ptr [r8 + 4*rax + 291], 123 +# CHECK: sar r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + sar r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: {nf} sar r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xbc,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} sar r9, qword ptr [r8 + 4*rax + 291], 123 +# CHECK: {evex} sar bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xfb] + {evex} sar bl +# CHECK: {nf} sar bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd0,0xfb] + {nf} sar bl +# CHECK: {evex} sar bl, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xfb] + {evex} sar bl, cl +# CHECK: {nf} sar bl, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd2,0xfb] + {nf} sar bl, cl +# CHECK: sar bl, bl, cl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xfb] + sar bl, bl, cl +# CHECK: {nf} sar bl, bl, cl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xd2,0xfb] + {nf} sar bl, bl, cl +# CHECK: {evex} sar dx, cl +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xfa] + {evex} sar dx, cl +# CHECK: {nf} sar dx, cl +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd3,0xfa] + {nf} sar dx, cl +# CHECK: sar dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xfa] + sar dx, dx, cl +# CHECK: {nf} sar dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd3,0xfa] + {nf} sar dx, dx, cl +# CHECK: {evex} sar ecx, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xf9] + {evex} sar ecx, cl +# CHECK: {nf} sar ecx, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd3,0xf9] + {nf} sar ecx, cl +# CHECK: sar ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xf9] + sar ecx, 
ecx, cl +# CHECK: {nf} sar ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd3,0xf9] + {nf} sar ecx, ecx, cl +# CHECK: {evex} sar r9, cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xf9] + {evex} sar r9, cl +# CHECK: {nf} sar r9, cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xf9] + {nf} sar r9, cl +# CHECK: sar r9, r9, cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xf9] + sar r9, r9, cl +# CHECK: {nf} sar r9, r9, cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xf9] + {nf} sar r9, r9, cl +# CHECK: {evex} sar byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00] + {evex} sar byte ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} sar byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sar byte ptr [r8 + 4*rax + 291], cl +# CHECK: sar bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00] + sar bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} sar bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd2,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sar bl, byte ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} sar word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {evex} sar word ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} sar word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sar word ptr [r8 + 4*rax + 291], cl +# CHECK: sar dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + sar dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} sar dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sar dx, word ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} sar dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {evex} sar dword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} sar dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sar dword ptr [r8 + 4*rax + 291], cl +# CHECK: sar ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + sar ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} sar ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sar ecx, dword ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} sar qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {evex} sar qword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} sar qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sar qword ptr [r8 + 4*rax + 291], cl +# CHECK: sar r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + sar r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: {nf} sar r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xbc,0x80,0x23,0x01,0x00,0x00] + {nf} sar r9, qword ptr [r8 + 4*rax + 291], cl +# CHECK: {evex} sar dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xfa] + {evex} sar dx +# CHECK: {nf} sar dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd1,0xfa] + {nf} sar dx +# CHECK: sar dx, dx +# CHECK: encoding: 
[0x62,0xf4,0x6d,0x18,0xd1,0xfa]
+ sar dx, dx
+# CHECK: {nf} sar dx, dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd1,0xfa]
+ {nf} sar dx, dx
+# CHECK: {evex} sar ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xf9]
+ {evex} sar ecx
+# CHECK: {nf} sar ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd1,0xf9]
+ {nf} sar ecx
+# CHECK: sar ecx, ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xf9]
+ sar ecx, ecx
+# CHECK: {nf} sar ecx, ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd1,0xf9]
+ {nf} sar ecx, ecx
+# CHECK: {evex} sar r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xf9]
+ {evex} sar r9
+# CHECK: {nf} sar r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xf9]
+ {nf} sar r9
+# CHECK: sar r9, r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xf9]
+ sar r9, r9
+# CHECK: {nf} sar r9, r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xf9]
+ {nf} sar r9, r9
+# CHECK: {evex} sar byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {evex} sar byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sar byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {nf} sar byte ptr [r8 + 4*rax + 291]
+# CHECK: sar bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00]
+ sar bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sar bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd0,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {nf} sar bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} sar word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {evex} sar word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sar word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {nf} sar word ptr [r8 + 4*rax + 291]
+# CHECK: sar dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ sar dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sar dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {nf} sar dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} sar dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {evex} sar dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sar dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {nf} sar dword ptr [r8 + 4*rax + 291]
+# CHECK: sar ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ sar ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sar ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {nf} sar ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} sar qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {evex} sar qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sar qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {nf} sar qword ptr [r8 + 4*rax + 291]
+# CHECK: sar r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ sar r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} sar r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xbc,0x80,0x23,0x01,0x00,0x00]
+ {nf} sar r9, qword ptr [r8 + 4*rax + 291]
diff --git a/llvm/test/MC/X86/apx/shl-att.s b/llvm/test/MC/X86/apx/shl-att.s
new file mode 100644
index 0000000000000..d86f6cf05a7db
--- /dev/null
+++ b/llvm/test/MC/X86/apx/shl-att.s
@@ -0,0 +1,287 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-94: error:
+# ERROR-NOT: error:
+# CHECK: {evex} shlb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xe3,0x7b]
+ {evex} shlb $123, %bl
+# CHECK: {nf} shlb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc0,0xe3,0x7b]
+ {nf} shlb $123, %bl
+# CHECK: shlb $123, %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xe3,0x7b]
+ shlb $123, %bl, %bl
+# CHECK: {nf} shlb $123, %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xc0,0xe3,0x7b]
+ {nf} shlb $123, %bl, %bl
+# CHECK: {evex} shlw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xe2,0x7b]
+ {evex} shlw $123, %dx
+# CHECK: {nf} shlw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xc1,0xe2,0x7b]
+ {nf} shlw $123, %dx
+# CHECK: shlw $123, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xe2,0x7b]
+ shlw $123, %dx, %dx
+# CHECK: {nf} shlw $123, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xc1,0xe2,0x7b]
+ {nf} shlw $123, %dx, %dx
+# CHECK: {evex} shll $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xe1,0x7b]
+ {evex} shll $123, %ecx
+# CHECK: {nf} shll $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc1,0xe1,0x7b]
+ {nf} shll $123, %ecx
+# CHECK: shll $123, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xe1,0x7b]
+ shll $123, %ecx, %ecx
+# CHECK: {nf} shll $123, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xc1,0xe1,0x7b]
+ {nf} shll $123, %ecx, %ecx
+# CHECK: {evex} shlq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xe1,0x7b]
+ {evex} shlq $123, %r9
+# CHECK: {nf} shlq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xe1,0x7b]
+ {nf} shlq $123, %r9
+# CHECK: shlq $123, %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xe1,0x7b]
+ shlq $123, %r9, %r9
+# CHECK: {nf} shlq $123, %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xe1,0x7b]
+ {nf} shlq $123, %r9, %r9
+# CHECK: {evex} shlb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shlb $123, 291(%r8,%rax,4)
+# CHECK: {nf} shlb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shlb $123, 291(%r8,%rax,4)
+# CHECK: shlb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shlb $123, 291(%r8,%rax,4), %bl
+# CHECK: {nf} shlb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shlb $123, 291(%r8,%rax,4), %bl
+# CHECK: {evex} shlw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shlw $123, 291(%r8,%rax,4)
+# CHECK: {nf} shlw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shlw $123, 291(%r8,%rax,4)
+# CHECK: shlw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shlw $123, 291(%r8,%rax,4), %dx
+# CHECK: {nf} shlw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shlw $123, 291(%r8,%rax,4), %dx
+# CHECK: {evex} shll $123, 291(%r8,%rax,4)
+#
CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} shll $123, 291(%r8,%rax,4) +# CHECK: {nf} shll $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shll $123, 291(%r8,%rax,4) +# CHECK: shll $123, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b] + shll $123, 291(%r8,%rax,4), %ecx +# CHECK: {nf} shll $123, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shll $123, 291(%r8,%rax,4), %ecx +# CHECK: {evex} shlq $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} shlq $123, 291(%r8,%rax,4) +# CHECK: {nf} shlq $123, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shlq $123, 291(%r8,%rax,4) +# CHECK: shlq $123, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b] + shlq $123, 291(%r8,%rax,4), %r9 +# CHECK: {nf} shlq $123, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shlq $123, 291(%r8,%rax,4), %r9 +# CHECK: {evex} shlb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xe3] + {evex} shlb %bl +# CHECK: {nf} shlb %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd0,0xe3] + {nf} shlb %bl +# CHECK: {evex} shlb %cl, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xe3] + {evex} shlb %cl, %bl +# CHECK: {nf} shlb %cl, %bl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd2,0xe3] + {nf} shlb %cl, %bl +# CHECK: shlb %cl, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xe3] + shlb %cl, %bl, %bl +# CHECK: {nf} shlb %cl, %bl, %bl +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xd2,0xe3] + {nf} shlb %cl, %bl, %bl +# CHECK: {evex} shlw %cl, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xe2] + {evex} shlw %cl, %dx +# CHECK: {nf} shlw %cl, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd3,0xe2] + {nf} shlw %cl, %dx +# CHECK: shlw %cl, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xe2] + shlw %cl, %dx, %dx +# CHECK: {nf} shlw %cl, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd3,0xe2] + {nf} shlw %cl, %dx, %dx +# CHECK: {evex} shll %cl, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xe1] + {evex} shll %cl, %ecx +# CHECK: {nf} shll %cl, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd3,0xe1] + {nf} shll %cl, %ecx +# CHECK: shll %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xe1] + shll %cl, %ecx, %ecx +# CHECK: {nf} shll %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd3,0xe1] + {nf} shll %cl, %ecx, %ecx +# CHECK: {evex} shlq %cl, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xe1] + {evex} shlq %cl, %r9 +# CHECK: {nf} shlq %cl, %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xe1] + {nf} shlq %cl, %r9 +# CHECK: shlq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xe1] + shlq %cl, %r9, %r9 +# CHECK: {nf} shlq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xe1] + {nf} shlq %cl, %r9, %r9 +# CHECK: {evex} shlb %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + {evex} shlb %cl, 291(%r8,%rax,4) +# CHECK: {nf} shlb %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlb %cl, 291(%r8,%rax,4) +# CHECK: shlb %cl, 291(%r8,%rax,4), %bl +# CHECK: encoding: 
[0x62,0xd4,0x64,0x18,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + shlb %cl, 291(%r8,%rax,4), %bl +# CHECK: {nf} shlb %cl, 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlb %cl, 291(%r8,%rax,4), %bl +# CHECK: {evex} shlw %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + {evex} shlw %cl, 291(%r8,%rax,4) +# CHECK: {nf} shlw %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlw %cl, 291(%r8,%rax,4) +# CHECK: shlw %cl, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + shlw %cl, 291(%r8,%rax,4), %dx +# CHECK: {nf} shlw %cl, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlw %cl, 291(%r8,%rax,4), %dx +# CHECK: {evex} shll %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + {evex} shll %cl, 291(%r8,%rax,4) +# CHECK: {nf} shll %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shll %cl, 291(%r8,%rax,4) +# CHECK: shll %cl, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + shll %cl, 291(%r8,%rax,4), %ecx +# CHECK: {nf} shll %cl, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shll %cl, 291(%r8,%rax,4), %ecx +# CHECK: {evex} shlq %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + {evex} shlq %cl, 291(%r8,%rax,4) +# CHECK: {nf} shlq %cl, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlq %cl, 291(%r8,%rax,4) +# CHECK: shlq %cl, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + shlq %cl, 291(%r8,%rax,4), %r9 +# CHECK: {nf} shlq %cl, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlq %cl, 291(%r8,%rax,4), %r9 +# CHECK: {evex} shlw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xe2] + {evex} shlw %dx +# CHECK: {nf} shlw %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd1,0xe2] + {nf} shlw %dx +# CHECK: shlw %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xe2] + shlw %dx, %dx +# CHECK: {nf} shlw %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd1,0xe2] + {nf} shlw %dx, %dx +# CHECK: {evex} shll %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xe1] + {evex} shll %ecx +# CHECK: {nf} shll %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd1,0xe1] + {nf} shll %ecx +# CHECK: shll %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xe1] + shll %ecx, %ecx +# CHECK: {nf} shll %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd1,0xe1] + {nf} shll %ecx, %ecx +# CHECK: {evex} shlq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xe1] + {evex} shlq %r9 +# CHECK: {nf} shlq %r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xe1] + {nf} shlq %r9 +# CHECK: shlq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xe1] + shlq %r9, %r9 +# CHECK: {nf} shlq %r9, %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xe1] + {nf} shlq %r9, %r9 +# CHECK: {evex} shlb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00] + {evex} shlb 291(%r8,%rax,4) +# CHECK: {nf} shlb 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlb 291(%r8,%rax,4) 
+# CHECK: shlb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00] + shlb 291(%r8,%rax,4), %bl +# CHECK: {nf} shlb 291(%r8,%rax,4), %bl +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlb 291(%r8,%rax,4), %bl +# CHECK: {evex} shlw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + {evex} shlw 291(%r8,%rax,4) +# CHECK: {nf} shlw 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlw 291(%r8,%rax,4) +# CHECK: shlw 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + shlw 291(%r8,%rax,4), %dx +# CHECK: {nf} shlw 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlw 291(%r8,%rax,4), %dx +# CHECK: {evex} shll 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + {evex} shll 291(%r8,%rax,4) +# CHECK: {nf} shll 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shll 291(%r8,%rax,4) +# CHECK: shll 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + shll 291(%r8,%rax,4), %ecx +# CHECK: {nf} shll 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shll 291(%r8,%rax,4), %ecx +# CHECK: {evex} shlq 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + {evex} shlq 291(%r8,%rax,4) +# CHECK: {nf} shlq 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlq 291(%r8,%rax,4) +# CHECK: shlq 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + shlq 291(%r8,%rax,4), %r9 +# CHECK: {nf} shlq 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00] + {nf} shlq 291(%r8,%rax,4), %r9 diff --git a/llvm/test/MC/X86/apx/shl-intel.s b/llvm/test/MC/X86/apx/shl-intel.s new file mode 100644 index 0000000000000..2db5203e77c4f --- /dev/null +++ b/llvm/test/MC/X86/apx/shl-intel.s @@ -0,0 +1,284 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s + +# CHECK: {evex} shl bl, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xe3,0x7b] + {evex} shl bl, 123 +# CHECK: {nf} shl bl, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc0,0xe3,0x7b] + {nf} shl bl, 123 +# CHECK: shl bl, bl, 123 +# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xe3,0x7b] + shl bl, bl, 123 +# CHECK: {nf} shl bl, bl, 123 +# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xc0,0xe3,0x7b] + {nf} shl bl, bl, 123 +# CHECK: {evex} shl dx, 123 +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xe2,0x7b] + {evex} shl dx, 123 +# CHECK: {nf} shl dx, 123 +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xc1,0xe2,0x7b] + {nf} shl dx, 123 +# CHECK: shl dx, dx, 123 +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xe2,0x7b] + shl dx, dx, 123 +# CHECK: {nf} shl dx, dx, 123 +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xc1,0xe2,0x7b] + {nf} shl dx, dx, 123 +# CHECK: {evex} shl ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xe1,0x7b] + {evex} shl ecx, 123 +# CHECK: {nf} shl ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc1,0xe1,0x7b] + {nf} shl ecx, 123 +# CHECK: shl ecx, ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xe1,0x7b] + shl ecx, ecx, 123 +# CHECK: {nf} shl ecx, ecx, 123 +# CHECK: 
diff --git a/llvm/test/MC/X86/apx/shl-intel.s b/llvm/test/MC/X86/apx/shl-intel.s
new file mode 100644
index 0000000000000..2db5203e77c4f
--- /dev/null
+++ b/llvm/test/MC/X86/apx/shl-intel.s
@@ -0,0 +1,284 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} shl bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xe3,0x7b]
+ {evex} shl bl, 123
+# CHECK: {nf} shl bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc0,0xe3,0x7b]
+ {nf} shl bl, 123
+# CHECK: shl bl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xe3,0x7b]
+ shl bl, bl, 123
+# CHECK: {nf} shl bl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xc0,0xe3,0x7b]
+ {nf} shl bl, bl, 123
+# CHECK: {evex} shl dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xe2,0x7b]
+ {evex} shl dx, 123
+# CHECK: {nf} shl dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xc1,0xe2,0x7b]
+ {nf} shl dx, 123
+# CHECK: shl dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xe2,0x7b]
+ shl dx, dx, 123
+# CHECK: {nf} shl dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xc1,0xe2,0x7b]
+ {nf} shl dx, dx, 123
+# CHECK: {evex} shl ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xe1,0x7b]
+ {evex} shl ecx, 123
+# CHECK: {nf} shl ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc1,0xe1,0x7b]
+ {nf} shl ecx, 123
+# CHECK: shl ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xe1,0x7b]
+ shl ecx, ecx, 123
+# CHECK: {nf} shl ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xc1,0xe1,0x7b]
+ {nf} shl ecx, ecx, 123
+# CHECK: {evex} shl r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xe1,0x7b]
+ {evex} shl r9, 123
+# CHECK: {nf} shl r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xe1,0x7b]
+ {nf} shl r9, 123
+# CHECK: shl r9, r9, 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xe1,0x7b]
+ shl r9, r9, 123
+# CHECK: {nf} shl r9, r9, 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xe1,0x7b]
+ {nf} shl r9, r9, 123
+# CHECK: {evex} shl byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shl byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shl byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shl byte ptr [r8 + 4*rax + 291], 123
+# CHECK: shl bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shl bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shl bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xc0,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shl bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} shl word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shl word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shl word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shl word ptr [r8 + 4*rax + 291], 123
+# CHECK: shl dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shl dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shl dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shl dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} shl dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shl dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shl dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shl dword ptr [r8 + 4*rax + 291], 123
+# CHECK: shl ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shl ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shl ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shl ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} shl qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shl qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shl qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shl qword ptr [r8 + 4*rax + 291], 123
+# CHECK: shl r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shl r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shl r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xa4,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shl r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} shl bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xe3]
+ {evex} shl bl
+# CHECK: {nf} shl bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd0,0xe3]
+ {nf} shl bl
+# CHECK: {evex} shl bl, cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xe3]
+ {evex} shl bl, cl
+# CHECK: {nf} shl bl, cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd2,0xe3]
+ {nf} shl bl, cl
+# CHECK: shl bl, bl, cl
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xe3]
+ shl bl, bl, cl
+# CHECK: {nf} shl bl, bl, cl
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xd2,0xe3]
+ {nf} shl bl, bl, cl
+# CHECK: {evex} shl dx, cl
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xe2]
+ {evex} shl dx, cl
+# CHECK: {nf} shl dx, cl
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd3,0xe2]
+ {nf} shl dx, cl
+# CHECK: shl dx, dx, cl
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xe2]
+ shl dx, dx, cl
+# CHECK: {nf} shl dx, dx, cl
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd3,0xe2]
+ {nf} shl dx, dx, cl
+# CHECK: {evex} shl ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xe1]
+ {evex} shl ecx, cl
+# CHECK: {nf} shl ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd3,0xe1]
+ {nf} shl ecx, cl
+# CHECK: shl ecx, ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xe1]
+ shl ecx, ecx, cl
+# CHECK: {nf} shl ecx, ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd3,0xe1]
+ {nf} shl ecx, ecx, cl
+# CHECK: {evex} shl r9, cl
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xe1]
+ {evex} shl r9, cl
+# CHECK: {nf} shl r9, cl
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xe1]
+ {nf} shl r9, cl
+# CHECK: shl r9, r9, cl
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xe1]
+ shl r9, r9, cl
+# CHECK: {nf} shl r9, r9, cl
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xe1]
+ {nf} shl r9, r9, cl
+# CHECK: {evex} shl byte ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {evex} shl byte ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shl byte ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl byte ptr [r8 + 4*rax + 291], cl
+# CHECK: shl bl, byte ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ shl bl, byte ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shl bl, byte ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd2,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl bl, byte ptr [r8 + 4*rax + 291], cl
+# CHECK: {evex} shl word ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {evex} shl word ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shl word ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl word ptr [r8 + 4*rax + 291], cl
+# CHECK: shl dx, word ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ shl dx, word ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shl dx, word ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl dx, word ptr [r8 + 4*rax + 291], cl
+# CHECK: {evex} shl dword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {evex} shl dword ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shl dword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl dword ptr [r8 + 4*rax + 291], cl
+# CHECK: shl ecx, dword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ shl ecx, dword ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shl ecx, dword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl ecx, dword ptr [r8 + 4*rax + 291], cl
+# CHECK: {evex} shl qword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {evex} shl qword ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shl qword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl qword ptr [r8 + 4*rax + 291], cl
+# CHECK: shl r9, qword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ shl r9, qword ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shl r9, qword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl r9, qword ptr [r8 + 4*rax + 291], cl
+# CHECK: {evex} shl dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xe2]
+ {evex} shl dx
+# CHECK: {nf} shl dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd1,0xe2]
+ {nf} shl dx
+# CHECK: shl dx, dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xe2]
+ shl dx, dx
+# CHECK: {nf} shl dx, dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd1,0xe2]
+ {nf} shl dx, dx
+# CHECK: {evex} shl ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xe1]
+ {evex} shl ecx
+# CHECK: {nf} shl ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd1,0xe1]
+ {nf} shl ecx
+# CHECK: shl ecx, ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xe1]
+ shl ecx, ecx
+# CHECK: {nf} shl ecx, ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd1,0xe1]
+ {nf} shl ecx, ecx
+# CHECK: {evex} shl r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xe1]
+ {evex} shl r9
+# CHECK: {nf} shl r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xe1]
+ {nf} shl r9
+# CHECK: shl r9, r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xe1]
+ shl r9, r9
+# CHECK: {nf} shl r9, r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xe1]
+ {nf} shl r9, r9
+# CHECK: {evex} shl byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {evex} shl byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shl byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl byte ptr [r8 + 4*rax + 291]
+# CHECK: shl bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00]
+ shl bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shl bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd0,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} shl word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {evex} shl word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shl word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl word ptr [r8 + 4*rax + 291]
+# CHECK: shl dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ shl dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shl dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} shl dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {evex} shl dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shl dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl dword ptr [r8 + 4*rax + 291]
+# CHECK: shl ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ shl ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shl ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} shl qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {evex} shl qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shl qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl qword ptr [r8 + 4*rax + 291]
+# CHECK: shl r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ shl r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shl r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xa4,0x80,0x23,0x01,0x00,0x00]
+ {nf} shl r9, qword ptr [r8 + 4*rax + 291]
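The SHLD/SHRD files that follow are smaller (ERROR-COUNT-48 rather than 94) because the double-precision shifts have no byte-sized or implicit shift-by-one forms, only word/dword/qword sizes with immediate or %cl counts; their new-destination variant takes four operands. A minimal sketch of the shape being tested, reusing an encoding checked below and assuming a local llvm-mc build:

  $ echo 'shldw $123, %dx, %dx, %dx' | llvm-mc -triple x86_64 -show-encoding
  # should report encoding [0x62,0xf4,0x6d,0x18,0x24,0xd2,0x7b]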
diff --git a/llvm/test/MC/X86/apx/shld-att.s b/llvm/test/MC/X86/apx/shld-att.s
new file mode 100644
index 0000000000000..a279398ee6e60
--- /dev/null
+++ b/llvm/test/MC/X86/apx/shld-att.s
@@ -0,0 +1,149 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-48: error:
+# ERROR-NOT: error:
+# CHECK: {evex} shldw $123, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x24,0xd2,0x7b]
+ {evex} shldw $123, %dx, %dx
+# CHECK: {nf} shldw $123, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x24,0xd2,0x7b]
+ {nf} shldw $123, %dx, %dx
+# CHECK: shldw $123, %dx, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0x24,0xd2,0x7b]
+ shldw $123, %dx, %dx, %dx
+# CHECK: {nf} shldw $123, %dx, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0x24,0xd2,0x7b]
+ {nf} shldw $123, %dx, %dx, %dx
+# CHECK: {evex} shldw $123, %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shldw $123, %dx, 291(%r8,%rax,4)
+# CHECK: {nf} shldw $123, %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shldw $123, %dx, 291(%r8,%rax,4)
+# CHECK: shldw $123, %dx, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shldw $123, %dx, 291(%r8,%rax,4), %dx
+# CHECK: {nf} shldw $123, %dx, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shldw $123, %dx, 291(%r8,%rax,4), %dx
+# CHECK: {evex} shldl $123, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x24,0xc9,0x7b]
+ {evex} shldl $123, %ecx, %ecx
+# CHECK: {nf} shldl $123, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x24,0xc9,0x7b]
+ {nf} shldl $123, %ecx, %ecx
+# CHECK: shldl $123, %ecx, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x24,0xc9,0x7b]
+ shldl $123, %ecx, %ecx, %ecx
+# CHECK: {nf} shldl $123, %ecx, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x24,0xc9,0x7b]
+ {nf} shldl $123, %ecx, %ecx, %ecx
+# CHECK: {evex} shldl $123, %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shldl $123, %ecx, 291(%r8,%rax,4)
+# CHECK: {nf} shldl $123, %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shldl $123, %ecx, 291(%r8,%rax,4)
+# CHECK: shldl $123, %ecx, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shldl $123, %ecx, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} shldl $123, %ecx, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shldl $123, %ecx, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} shldq $123, %r9, %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x24,0xc9,0x7b]
+ {evex} shldq $123, %r9, %r9
+# CHECK: {nf} shldq $123, %r9, %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x24,0xc9,0x7b]
+ {nf} shldq $123, %r9, %r9
+# CHECK: shldq $123, %r9, %r9, %r9
+# CHECK: encoding: [0x62,0x54,0xb4,0x18,0x24,0xc9,0x7b]
+ shldq $123, %r9, %r9, %r9
+# CHECK: {nf} shldq $123, %r9, %r9, %r9
+# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0x24,0xc9,0x7b]
+ {nf} shldq $123, %r9, %r9, %r9
+# CHECK: {evex} shldq $123, %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shldq $123, %r9, 291(%r8,%rax,4)
+# CHECK: {nf} shldq $123, %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shldq $123, %r9, 291(%r8,%rax,4)
+# CHECK: shldq $123, %r9, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xb4,0x18,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shldq $123, %r9, 291(%r8,%rax,4), %r9
+# CHECK: {nf} shldq $123, %r9, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shldq $123, %r9, 291(%r8,%rax,4), %r9
+# CHECK: {evex} shldw %cl, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xa5,0xd2]
+ {evex} shldw %cl, %dx, %dx
+# CHECK: {nf} shldw %cl, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xa5,0xd2]
+ {nf} shldw %cl, %dx, %dx
+# CHECK: shldw %cl, %dx, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xa5,0xd2]
+ shldw %cl, %dx, %dx, %dx
+# CHECK: {nf} shldw %cl, %dx, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xa5,0xd2]
+ {nf} shldw %cl, %dx, %dx, %dx
+# CHECK: {evex} shldw %cl, %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xa5,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} shldw %cl, %dx, 291(%r8,%rax,4)
+# CHECK: {nf} shldw %cl, %dx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xa5,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} shldw %cl, %dx, 291(%r8,%rax,4)
+# CHECK: shldw %cl, %dx, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xa5,0x94,0x80,0x23,0x01,0x00,0x00]
+ shldw %cl, %dx, 291(%r8,%rax,4), %dx
+# CHECK: {nf} shldw %cl, %dx, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xa5,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} shldw %cl, %dx, 291(%r8,%rax,4), %dx
+# CHECK: {evex} shldl %cl, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xa5,0xc9]
+ {evex} shldl %cl, %ecx, %ecx
+# CHECK: {nf} shldl %cl, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xa5,0xc9]
+ {nf} shldl %cl, %ecx, %ecx
+# CHECK: shldl %cl, %ecx, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xa5,0xc9]
+ shldl %cl, %ecx, %ecx, %ecx
+# CHECK: {nf} shldl %cl, %ecx, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xa5,0xc9]
+ {nf} shldl %cl, %ecx, %ecx, %ecx
+# CHECK: {evex} shldl %cl, %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} shldl %cl, %ecx, 291(%r8,%rax,4)
+# CHECK: {nf} shldl %cl, %ecx, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} shldl %cl, %ecx, 291(%r8,%rax,4)
+# CHECK: shldl %cl, %ecx, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ shldl %cl, %ecx, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} shldl %cl, %ecx, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} shldl %cl, %ecx, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} shldq %cl, %r9, %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0xa5,0xc9]
+ {evex} shldq %cl, %r9, %r9
+# CHECK: {nf} shldq %cl, %r9, %r9
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0xa5,0xc9]
+ {nf} shldq %cl, %r9, %r9
+# CHECK: shldq %cl, %r9, %r9, %r9
+# CHECK: encoding: [0x62,0x54,0xb4,0x18,0xa5,0xc9]
+ shldq %cl, %r9, %r9, %r9
+# CHECK: {nf} shldq %cl, %r9, %r9, %r9
+# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0xa5,0xc9]
+ {nf} shldq %cl, %r9, %r9, %r9
+# CHECK: {evex} shldq %cl, %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} shldq %cl, %r9, 291(%r8,%rax,4)
+# CHECK: {nf} shldq %cl, %r9, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} shldq %cl, %r9, 291(%r8,%rax,4)
+# CHECK: shldq %cl, %r9, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xb4,0x18,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ shldq %cl, %r9, 291(%r8,%rax,4), %r9
+# CHECK: {nf} shldq %cl, %r9, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} shldq %cl, %r9, 291(%r8,%rax,4), %r9
diff --git a/llvm/test/MC/X86/apx/shld-intel.s b/llvm/test/MC/X86/apx/shld-intel.s
new file mode 100644
index 0000000000000..4bd7c30cc2eaf
--- /dev/null
+++ b/llvm/test/MC/X86/apx/shld-intel.s
@@ -0,0 +1,146 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} shld dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x24,0xd2,0x7b]
+ {evex} shld dx, dx, 123
+# CHECK: {nf} shld dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x24,0xd2,0x7b]
+ {nf} shld dx, dx, 123
+# CHECK: shld dx, dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0x24,0xd2,0x7b]
+ shld dx, dx, dx, 123
+# CHECK: {nf} shld dx, dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0x24,0xd2,0x7b]
+ {nf} shld dx, dx, dx, 123
+# CHECK: {evex} shld word ptr [r8 + 4*rax + 291], dx, 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shld word ptr [r8 + 4*rax + 291], dx, 123
+# CHECK: {nf} shld word ptr [r8 + 4*rax + 291], dx, 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shld word ptr [r8 + 4*rax + 291], dx, 123
+# CHECK: shld dx, word ptr [r8 + 4*rax + 291], dx, 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shld dx, word ptr [r8 + 4*rax + 291], dx, 123
+# CHECK: {nf} shld dx, word ptr [r8 + 4*rax + 291], dx, 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x24,0x94,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shld dx, word ptr [r8 + 4*rax + 291], dx, 123
+# CHECK: {evex} shld ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x24,0xc9,0x7b]
+ {evex} shld ecx, ecx, 123
+# CHECK: {nf} shld ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x24,0xc9,0x7b]
+ {nf} shld ecx, ecx, 123
+# CHECK: shld ecx, ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x24,0xc9,0x7b]
+ shld ecx, ecx, ecx, 123
+# CHECK: {nf} shld ecx, ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x24,0xc9,0x7b]
+ {nf} shld ecx, ecx, ecx, 123
+# CHECK: {evex} shld dword ptr [r8 + 4*rax + 291], ecx, 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shld dword ptr [r8 + 4*rax + 291], ecx, 123
+# CHECK: {nf} shld dword ptr [r8 + 4*rax + 291], ecx, 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shld dword ptr [r8 + 4*rax + 291], ecx, 123
+# CHECK: shld ecx, dword ptr [r8 + 4*rax + 291], ecx, 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shld ecx, dword ptr [r8 + 4*rax + 291], ecx, 123
+# CHECK: {nf} shld ecx, dword ptr [r8 + 4*rax + 291], ecx, 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shld ecx, dword ptr [r8 + 4*rax + 291], ecx, 123
+# CHECK: {evex} shld r9, r9, 123
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x24,0xc9,0x7b]
+ {evex} shld r9, r9, 123
+# CHECK: {nf} shld r9, r9, 123
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x24,0xc9,0x7b]
+ {nf} shld r9, r9, 123
+# CHECK: shld r9, r9, r9, 123
+# CHECK: encoding: [0x62,0x54,0xb4,0x18,0x24,0xc9,0x7b]
+ shld r9, r9, r9, 123
+# CHECK: {nf} shld r9, r9, r9, 123
+# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0x24,0xc9,0x7b]
+ {nf} shld r9, r9, r9, 123
+# CHECK: {evex} shld qword ptr [r8 + 4*rax + 291], r9, 123
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shld qword ptr [r8 + 4*rax + 291], r9, 123
+# CHECK: {nf} shld qword ptr [r8 + 4*rax + 291], r9, 123
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shld qword ptr [r8 + 4*rax + 291], r9, 123
+# CHECK: shld r9, qword ptr [r8 + 4*rax + 291], r9, 123
+# CHECK: encoding: [0x62,0x54,0xb4,0x18,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shld r9, qword ptr [r8 + 4*rax + 291], r9, 123
+# CHECK: {nf} shld r9, qword ptr [r8 + 4*rax + 291], r9, 123
+# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0x24,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shld r9, qword ptr [r8 + 4*rax + 291], r9, 123
+# CHECK: {evex} shld dx, dx, cl
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xa5,0xd2]
+ {evex} shld dx, dx, cl
+# CHECK: {nf} shld dx, dx, cl
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xa5,0xd2]
+ {nf} shld dx, dx, cl
+# CHECK: shld dx, dx, dx, cl
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xa5,0xd2]
+ shld dx, dx, dx, cl
+# CHECK: {nf} shld dx, dx, dx, cl
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xa5,0xd2]
+ {nf} shld dx, dx, dx, cl
+# CHECK: {evex} shld word ptr [r8 + 4*rax + 291], dx, cl
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xa5,0x94,0x80,0x23,0x01,0x00,0x00]
+ {evex} shld word ptr [r8 + 4*rax + 291], dx, cl
+# CHECK: {nf} shld word ptr [r8 + 4*rax + 291], dx, cl
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xa5,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} shld word ptr [r8 + 4*rax + 291], dx, cl
+# CHECK: shld dx, word ptr [r8 + 4*rax + 291], dx, cl
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xa5,0x94,0x80,0x23,0x01,0x00,0x00]
+ shld dx, word ptr [r8 + 4*rax + 291], dx, cl
+# CHECK: {nf} shld dx, word ptr [r8 + 4*rax + 291], dx, cl
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xa5,0x94,0x80,0x23,0x01,0x00,0x00]
+ {nf} shld dx, word ptr [r8 + 4*rax + 291], dx, cl
+# CHECK: {evex} shld ecx, ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xa5,0xc9]
+ {evex} shld ecx, ecx, cl
+# CHECK: {nf} shld ecx, ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xa5,0xc9]
+ {nf} shld ecx, ecx, cl
+# CHECK: shld ecx, ecx, ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xa5,0xc9]
+ shld ecx, ecx, ecx, cl
+# CHECK: {nf} shld ecx, ecx, ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xa5,0xc9]
+ {nf} shld ecx, ecx, ecx, cl
+# CHECK: {evex} shld dword ptr [r8 + 4*rax + 291], ecx, cl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} shld dword ptr [r8 + 4*rax + 291], ecx, cl
+# CHECK: {nf} shld dword ptr [r8 + 4*rax + 291], ecx, cl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} shld dword ptr [r8 + 4*rax + 291], ecx, cl
+# CHECK: shld ecx, dword ptr [r8 + 4*rax + 291], ecx, cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ shld ecx, dword ptr [r8 + 4*rax + 291], ecx, cl
+# CHECK: {nf} shld ecx, dword ptr [r8 + 4*rax + 291], ecx, cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} shld ecx, dword ptr [r8 + 4*rax + 291], ecx, cl
+# CHECK: {evex} shld r9, r9, cl
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0xa5,0xc9]
+ {evex} shld r9, r9, cl
+# CHECK: {nf} shld r9, r9, cl
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0xa5,0xc9]
+ {nf} shld r9, r9, cl
+# CHECK: shld r9, r9, r9, cl
+# CHECK: encoding: [0x62,0x54,0xb4,0x18,0xa5,0xc9]
+ shld r9, r9, r9, cl
+# CHECK: {nf} shld r9, r9, r9, cl
+# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0xa5,0xc9]
+ {nf} shld r9, r9, r9, cl
+# CHECK: {evex} shld qword ptr [r8 + 4*rax + 291], r9, cl
+# CHECK: encoding: [0x62,0x54,0xfc,0x08,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {evex} shld qword ptr [r8 + 4*rax + 291], r9, cl
+# CHECK: {nf} shld qword ptr [r8 + 4*rax + 291], r9, cl
+# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} shld qword ptr [r8 + 4*rax + 291], r9, cl
+# CHECK: shld r9, qword ptr [r8 + 4*rax + 291], r9, cl
+# CHECK: encoding: [0x62,0x54,0xb4,0x18,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ shld r9, qword ptr [r8 + 4*rax + 291], r9, cl
+# CHECK: {nf} shld r9, qword ptr [r8 + 4*rax + 291], r9, cl
+# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0xa5,0x8c,0x80,0x23,0x01,0x00,0x00]
+ {nf} shld r9, qword ptr [r8 + 4*rax + 291], r9, cl
diff --git a/llvm/test/MC/X86/apx/shr-att.s b/llvm/test/MC/X86/apx/shr-att.s
new file mode 100644
index 0000000000000..86656c325de2a
--- /dev/null
+++ b/llvm/test/MC/X86/apx/shr-att.s
@@ -0,0 +1,287 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s
+# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR
+
+# ERROR-COUNT-94: error:
+# ERROR-NOT: error:
+# CHECK: {evex} shrb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xeb,0x7b]
+ {evex} shrb $123, %bl
+# CHECK: {nf} shrb $123, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc0,0xeb,0x7b]
+ {nf} shrb $123, %bl
+# CHECK: shrb $123, %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xeb,0x7b]
+ shrb $123, %bl, %bl
+# CHECK: {nf} shrb $123, %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xc0,0xeb,0x7b]
+ {nf} shrb $123, %bl, %bl
+# CHECK: {evex} shrw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xea,0x7b]
+ {evex} shrw $123, %dx
+# CHECK: {nf} shrw $123, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xc1,0xea,0x7b]
+ {nf} shrw $123, %dx
+# CHECK: shrw $123, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xea,0x7b]
+ shrw $123, %dx, %dx
+# CHECK: {nf} shrw $123, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xc1,0xea,0x7b]
+ {nf} shrw $123, %dx, %dx
+# CHECK: {evex} shrl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xe9,0x7b]
+ {evex} shrl $123, %ecx
+# CHECK: {nf} shrl $123, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc1,0xe9,0x7b]
+ {nf} shrl $123, %ecx
+# CHECK: shrl $123, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xe9,0x7b]
+ shrl $123, %ecx, %ecx
+# CHECK: {nf} shrl $123, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xc1,0xe9,0x7b]
+ {nf} shrl $123, %ecx, %ecx
+# CHECK: {evex} shrq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xe9,0x7b]
+ {evex} shrq $123, %r9
+# CHECK: {nf} shrq $123, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xe9,0x7b]
+ {nf} shrq $123, %r9
+# CHECK: shrq $123, %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xe9,0x7b]
+ shrq $123, %r9, %r9
+# CHECK: {nf} shrq $123, %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xe9,0x7b]
+ {nf} shrq $123, %r9, %r9
+# CHECK: {evex} shrb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shrb $123, 291(%r8,%rax,4)
+# CHECK: {nf} shrb $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shrb $123, 291(%r8,%rax,4)
+# CHECK: shrb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shrb $123, 291(%r8,%rax,4), %bl
+# CHECK: {nf} shrb $123, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shrb $123, 291(%r8,%rax,4), %bl
+# CHECK: {evex} shrw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shrw $123, 291(%r8,%rax,4)
+# CHECK: {nf} shrw $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shrw $123, 291(%r8,%rax,4)
+# CHECK: shrw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shrw $123, 291(%r8,%rax,4), %dx
+# CHECK: {nf} shrw $123, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shrw $123, 291(%r8,%rax,4), %dx
+# CHECK: {evex} shrl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shrl $123, 291(%r8,%rax,4)
+# CHECK: {nf} shrl $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shrl $123, 291(%r8,%rax,4)
+# CHECK: shrl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shrl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} shrl $123, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shrl $123, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} shrq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shrq $123, 291(%r8,%rax,4)
+# CHECK: {nf} shrq $123, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shrq $123, 291(%r8,%rax,4)
+# CHECK: shrq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shrq $123, 291(%r8,%rax,4), %r9
+# CHECK: {nf} shrq $123, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shrq $123, 291(%r8,%rax,4), %r9
+# CHECK: {evex} shrb %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xeb]
+ {evex} shrb %bl
+# CHECK: {nf} shrb %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd0,0xeb]
+ {nf} shrb %bl
+# CHECK: {evex} shrb %cl, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xeb]
+ {evex} shrb %cl, %bl
+# CHECK: {nf} shrb %cl, %bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd2,0xeb]
+ {nf} shrb %cl, %bl
+# CHECK: shrb %cl, %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xeb]
+ shrb %cl, %bl, %bl
+# CHECK: {nf} shrb %cl, %bl, %bl
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xd2,0xeb]
+ {nf} shrb %cl, %bl, %bl
+# CHECK: {evex} shrw %cl, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xea]
+ {evex} shrw %cl, %dx
+# CHECK: {nf} shrw %cl, %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd3,0xea]
+ {nf} shrw %cl, %dx
+# CHECK: shrw %cl, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xea]
+ shrw %cl, %dx, %dx
+# CHECK: {nf} shrw %cl, %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd3,0xea]
+ {nf} shrw %cl, %dx, %dx
+# CHECK: {evex} shrl %cl, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xe9]
+ {evex} shrl %cl, %ecx
+# CHECK: {nf} shrl %cl, %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd3,0xe9]
+ {nf} shrl %cl, %ecx
+# CHECK: shrl %cl, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xe9]
+ shrl %cl, %ecx, %ecx
+# CHECK: {nf} shrl %cl, %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd3,0xe9]
+ {nf} shrl %cl, %ecx, %ecx
+# CHECK: {evex} shrq %cl, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xe9]
+ {evex} shrq %cl, %r9
+# CHECK: {nf} shrq %cl, %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xe9]
+ {nf} shrq %cl, %r9
+# CHECK: shrq %cl, %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xe9]
+ shrq %cl, %r9, %r9
+# CHECK: {nf} shrq %cl, %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xe9]
+ {nf} shrq %cl, %r9, %r9
+# CHECK: {evex} shrb %cl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shrb %cl, 291(%r8,%rax,4)
+# CHECK: {nf} shrb %cl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd2,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrb %cl, 291(%r8,%rax,4)
+# CHECK: shrb %cl, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0xac,0x80,0x23,0x01,0x00,0x00]
+ shrb %cl, 291(%r8,%rax,4), %bl
+# CHECK: {nf} shrb %cl, 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd2,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrb %cl, 291(%r8,%rax,4), %bl
+# CHECK: {evex} shrw %cl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shrw %cl, 291(%r8,%rax,4)
+# CHECK: {nf} shrw %cl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrw %cl, 291(%r8,%rax,4)
+# CHECK: shrw %cl, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ shrw %cl, 291(%r8,%rax,4), %dx
+# CHECK: {nf} shrw %cl, 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrw %cl, 291(%r8,%rax,4), %dx
+# CHECK: {evex} shrl %cl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shrl %cl, 291(%r8,%rax,4)
+# CHECK: {nf} shrl %cl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrl %cl, 291(%r8,%rax,4)
+# CHECK: shrl %cl, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ shrl %cl, 291(%r8,%rax,4), %ecx
+# CHECK: {nf} shrl %cl, 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrl %cl, 291(%r8,%rax,4), %ecx
+# CHECK: {evex} shrq %cl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shrq %cl, 291(%r8,%rax,4)
+# CHECK: {nf} shrq %cl, 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrq %cl, 291(%r8,%rax,4)
+# CHECK: shrq %cl, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ shrq %cl, 291(%r8,%rax,4), %r9
+# CHECK: {nf} shrq %cl, 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrq %cl, 291(%r8,%rax,4), %r9
+# CHECK: {evex} shrw %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xea]
+ {evex} shrw %dx
+# CHECK: {nf} shrw %dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd1,0xea]
+ {nf} shrw %dx
+# CHECK: shrw %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xea]
+ shrw %dx, %dx
+# CHECK: {nf} shrw %dx, %dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd1,0xea]
+ {nf} shrw %dx, %dx
+# CHECK: {evex} shrl %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xe9]
+ {evex} shrl %ecx
+# CHECK: {nf} shrl %ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd1,0xe9]
+ {nf} shrl %ecx
+# CHECK: shrl %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xe9]
+ shrl %ecx, %ecx
+# CHECK: {nf} shrl %ecx, %ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd1,0xe9]
+ {nf} shrl %ecx, %ecx
+# CHECK: {evex} shrq %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xe9]
+ {evex} shrq %r9
+# CHECK: {nf} shrq %r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xe9]
+ {nf} shrq %r9
+# CHECK: shrq %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xe9]
+ shrq %r9, %r9
+# CHECK: {nf} shrq %r9, %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xe9]
+ {nf} shrq %r9, %r9
+# CHECK: {evex} shrb 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shrb 291(%r8,%rax,4)
+# CHECK: {nf} shrb 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd0,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrb 291(%r8,%rax,4)
+# CHECK: shrb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0xac,0x80,0x23,0x01,0x00,0x00]
+ shrb 291(%r8,%rax,4), %bl
+# CHECK: {nf} shrb 291(%r8,%rax,4), %bl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd0,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrb 291(%r8,%rax,4), %bl
+# CHECK: {evex} shrw 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shrw 291(%r8,%rax,4)
+# CHECK: {nf} shrw 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrw 291(%r8,%rax,4)
+# CHECK: shrw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ shrw 291(%r8,%rax,4), %dx
+# CHECK: {nf} shrw 291(%r8,%rax,4), %dx
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrw 291(%r8,%rax,4), %dx
+# CHECK: {evex} shrl 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shrl 291(%r8,%rax,4)
+# CHECK: {nf} shrl 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrl 291(%r8,%rax,4)
+# CHECK: shrl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ shrl 291(%r8,%rax,4), %ecx
+# CHECK: {nf} shrl 291(%r8,%rax,4), %ecx
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrl 291(%r8,%rax,4), %ecx
+# CHECK: {evex} shrq 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shrq 291(%r8,%rax,4)
+# CHECK: {nf} shrq 291(%r8,%rax,4)
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrq 291(%r8,%rax,4)
+# CHECK: shrq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ shrq 291(%r8,%rax,4), %r9
+# CHECK: {nf} shrq 291(%r8,%rax,4), %r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shrq 291(%r8,%rax,4), %r9
diff --git a/llvm/test/MC/X86/apx/shr-intel.s b/llvm/test/MC/X86/apx/shr-intel.s
new file mode 100644
index 0000000000000..2a44177a43810
--- /dev/null
+++ b/llvm/test/MC/X86/apx/shr-intel.s
@@ -0,0 +1,284 @@
+# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s
+
+# CHECK: {evex} shr bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc0,0xeb,0x7b]
+ {evex} shr bl, 123
+# CHECK: {nf} shr bl, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc0,0xeb,0x7b]
+ {nf} shr bl, 123
+# CHECK: shr bl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xc0,0xeb,0x7b]
+ shr bl, bl, 123
+# CHECK: {nf} shr bl, bl, 123
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xc0,0xeb,0x7b]
+ {nf} shr bl, bl, 123
+# CHECK: {evex} shr dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xc1,0xea,0x7b]
+ {evex} shr dx, 123
+# CHECK: {nf} shr dx, 123
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xc1,0xea,0x7b]
+ {nf} shr dx, 123
+# CHECK: shr dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xc1,0xea,0x7b]
+ shr dx, dx, 123
+# CHECK: {nf} shr dx, dx, 123
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xc1,0xea,0x7b]
+ {nf} shr dx, dx, 123
+# CHECK: {evex} shr ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xc1,0xe9,0x7b]
+ {evex} shr ecx, 123
+# CHECK: {nf} shr ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xc1,0xe9,0x7b]
+ {nf} shr ecx, 123
+# CHECK: shr ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xc1,0xe9,0x7b]
+ shr ecx, ecx, 123
+# CHECK: {nf} shr ecx, ecx, 123
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xc1,0xe9,0x7b]
+ {nf} shr ecx, ecx, 123
+# CHECK: {evex} shr r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xe9,0x7b]
+ {evex} shr r9, 123
+# CHECK: {nf} shr r9, 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xe9,0x7b]
+ {nf} shr r9, 123
+# CHECK: shr r9, r9, 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xe9,0x7b]
+ shr r9, r9, 123
+# CHECK: {nf} shr r9, r9, 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xe9,0x7b]
+ {nf} shr r9, r9, 123
+# CHECK: {evex} shr byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shr byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shr byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shr byte ptr [r8 + 4*rax + 291], 123
+# CHECK: shr bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shr bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shr bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xc0,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shr bl, byte ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} shr word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shr word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shr word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shr word ptr [r8 + 4*rax + 291], 123
+# CHECK: shr dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shr dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shr dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shr dx, word ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} shr dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shr dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shr dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shr dword ptr [r8 + 4*rax + 291], 123
+# CHECK: shr ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shr ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shr ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shr ecx, dword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} shr qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {evex} shr qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shr qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shr qword ptr [r8 + 4*rax + 291], 123
+# CHECK: shr r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ shr r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {nf} shr r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xc1,0xac,0x80,0x23,0x01,0x00,0x00,0x7b]
+ {nf} shr r9, qword ptr [r8 + 4*rax + 291], 123
+# CHECK: {evex} shr bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd0,0xeb]
+ {evex} shr bl
+# CHECK: {nf} shr bl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd0,0xeb]
+ {nf} shr bl
+# CHECK: {evex} shr bl, cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd2,0xeb]
+ {evex} shr bl, cl
+# CHECK: {nf} shr bl, cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd2,0xeb]
+ {nf} shr bl, cl
+# CHECK: shr bl, bl, cl
+# CHECK: encoding: [0x62,0xf4,0x64,0x18,0xd2,0xeb]
+ shr bl, bl, cl
+# CHECK: {nf} shr bl, bl, cl
+# CHECK: encoding: [0x62,0xf4,0x64,0x1c,0xd2,0xeb]
+ {nf} shr bl, bl, cl
+# CHECK: {evex} shr dx, cl
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd3,0xea]
+ {evex} shr dx, cl
+# CHECK: {nf} shr dx, cl
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd3,0xea]
+ {nf} shr dx, cl
+# CHECK: shr dx, dx, cl
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd3,0xea]
+ shr dx, dx, cl
+# CHECK: {nf} shr dx, dx, cl
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd3,0xea]
+ {nf} shr dx, dx, cl
+# CHECK: {evex} shr ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd3,0xe9]
+ {evex} shr ecx, cl
+# CHECK: {nf} shr ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd3,0xe9]
+ {nf} shr ecx, cl
+# CHECK: shr ecx, ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd3,0xe9]
+ shr ecx, ecx, cl
+# CHECK: {nf} shr ecx, ecx, cl
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd3,0xe9]
+ {nf} shr ecx, ecx, cl
+# CHECK: {evex} shr r9, cl
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xe9]
+ {evex} shr r9, cl
+# CHECK: {nf} shr r9, cl
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xe9]
+ {nf} shr r9, cl
+# CHECK: shr r9, r9, cl
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xe9]
+ shr r9, r9, cl
+# CHECK: {nf} shr r9, r9, cl
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xe9]
+ {nf} shr r9, r9, cl
+# CHECK: {evex} shr byte ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd2,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shr byte ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shr byte ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd2,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr byte ptr [r8 + 4*rax + 291], cl
+# CHECK: shr bl, byte ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd2,0xac,0x80,0x23,0x01,0x00,0x00]
+ shr bl, byte ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shr bl, byte ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd2,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr bl, byte ptr [r8 + 4*rax + 291], cl
+# CHECK: {evex} shr word ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shr word ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shr word ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr word ptr [r8 + 4*rax + 291], cl
+# CHECK: shr dx, word ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ shr dx, word ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shr dx, word ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr dx, word ptr [r8 + 4*rax + 291], cl
+# CHECK: {evex} shr dword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shr dword ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shr dword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr dword ptr [r8 + 4*rax + 291], cl
+# CHECK: shr ecx, dword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ shr ecx, dword ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shr ecx, dword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr ecx, dword ptr [r8 + 4*rax + 291], cl
+# CHECK: {evex} shr qword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shr qword ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shr qword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr qword ptr [r8 + 4*rax + 291], cl
+# CHECK: shr r9, qword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ shr r9, qword ptr [r8 + 4*rax + 291], cl
+# CHECK: {nf} shr r9, qword ptr [r8 + 4*rax + 291], cl
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd3,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr r9, qword ptr [r8 + 4*rax + 291], cl
+# CHECK: {evex} shr dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xd1,0xea]
+ {evex} shr dx
+# CHECK: {nf} shr dx
+# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xd1,0xea]
+ {nf} shr dx
+# CHECK: shr dx, dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xd1,0xea]
+ shr dx, dx
+# CHECK: {nf} shr dx, dx
+# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xd1,0xea]
+ {nf} shr dx, dx
+# CHECK: {evex} shr ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xd1,0xe9]
+ {evex} shr ecx
+# CHECK: {nf} shr ecx
+# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xd1,0xe9]
+ {nf} shr ecx
+# CHECK: shr ecx, ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xd1,0xe9]
+ shr ecx, ecx
+# CHECK: {nf} shr ecx, ecx
+# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xd1,0xe9]
+ {nf} shr ecx, ecx
+# CHECK: {evex} shr r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xe9]
+ {evex} shr r9
+# CHECK: {nf} shr r9
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xe9]
+ {nf} shr r9
+# CHECK: shr r9, r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xe9]
+ shr r9, r9
+# CHECK: {nf} shr r9, r9
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xe9]
+ {nf} shr r9, r9
+# CHECK: {evex} shr byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shr byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shr byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd0,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr byte ptr [r8 + 4*rax + 291]
+# CHECK: shr bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0xac,0x80,0x23,0x01,0x00,0x00]
+ shr bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shr bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd0,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr bl, byte ptr [r8 + 4*rax + 291]
+# CHECK: {evex} shr word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shr word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shr word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr word ptr [r8 + 4*rax + 291]
+# CHECK: shr dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ shr dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shr dx, word ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr dx, word ptr [r8 + 4*rax + 291]
+# CHECK: {evex} shr dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shr dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shr dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr dword ptr [r8 + 4*rax + 291]
+# CHECK: shr ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ shr ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shr ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr ecx, dword ptr [r8 + 4*rax + 291]
+# CHECK: {evex} shr qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {evex} shr qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shr qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr qword ptr [r8 + 4*rax + 291]
+# CHECK: shr r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ shr r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: {nf} shr r9, qword ptr [r8 + 4*rax + 291]
+# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00]
+ {nf} shr r9, qword ptr [r8 + 4*rax + 291]
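The shrd-att.s tests that close out this excerpt mirror shld-att.s line for line; judging from the first check, only the opcode byte changes for the immediate forms (0x2c in place of 0x24). The corresponding %cl forms and the Intel-syntax counterpart fall outside this excerpt, so their encodings are not shown here.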
r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xe9] + {evex} shr r9 +# CHECK: {nf} shr r9 +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xe9] + {nf} shr r9 +# CHECK: shr r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xe9] + shr r9, r9 +# CHECK: {nf} shr r9, r9 +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xe9] + {nf} shr r9, r9 +# CHECK: {evex} shr byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd0,0xac,0x80,0x23,0x01,0x00,0x00] + {evex} shr byte ptr [r8 + 4*rax + 291] +# CHECK: {nf} shr byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd0,0xac,0x80,0x23,0x01,0x00,0x00] + {nf} shr byte ptr [r8 + 4*rax + 291] +# CHECK: shr bl, byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x64,0x18,0xd0,0xac,0x80,0x23,0x01,0x00,0x00] + shr bl, byte ptr [r8 + 4*rax + 291] +# CHECK: {nf} shr bl, byte ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x64,0x1c,0xd0,0xac,0x80,0x23,0x01,0x00,0x00] + {nf} shr bl, byte ptr [r8 + 4*rax + 291] +# CHECK: {evex} shr word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + {evex} shr word ptr [r8 + 4*rax + 291] +# CHECK: {nf} shr word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + {nf} shr word ptr [r8 + 4*rax + 291] +# CHECK: shr dx, word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + shr dx, word ptr [r8 + 4*rax + 291] +# CHECK: {nf} shr dx, word ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + {nf} shr dx, word ptr [r8 + 4*rax + 291] +# CHECK: {evex} shr dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + {evex} shr dword ptr [r8 + 4*rax + 291] +# CHECK: {nf} shr dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + {nf} shr dword ptr [r8 + 4*rax + 291] +# CHECK: shr ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + shr ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: {nf} shr ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + {nf} shr ecx, dword ptr [r8 + 4*rax + 291] +# CHECK: {evex} shr qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xfc,0x08,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + {evex} shr qword ptr [r8 + 4*rax + 291] +# CHECK: {nf} shr qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xfc,0x0c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + {nf} shr qword ptr [r8 + 4*rax + 291] +# CHECK: shr r9, qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xb4,0x18,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + shr r9, qword ptr [r8 + 4*rax + 291] +# CHECK: {nf} shr r9, qword ptr [r8 + 4*rax + 291] +# CHECK: encoding: [0x62,0xd4,0xb4,0x1c,0xd1,0xac,0x80,0x23,0x01,0x00,0x00] + {nf} shr r9, qword ptr [r8 + 4*rax + 291] diff --git a/llvm/test/MC/X86/apx/shrd-att.s b/llvm/test/MC/X86/apx/shrd-att.s new file mode 100644 index 0000000000000..b3684a9299ab8 --- /dev/null +++ b/llvm/test/MC/X86/apx/shrd-att.s @@ -0,0 +1,149 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding %s | FileCheck %s +# RUN: not llvm-mc -triple i386 -show-encoding %s 2>&1 | FileCheck %s --check-prefix=ERROR + +# ERROR-COUNT-48: error: +# ERROR-NOT: error: +# CHECK: {evex} shrdw $123, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x2c,0xd2,0x7b] + {evex} shrdw $123, %dx, %dx +# CHECK: 
{nf} shrdw $123, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x2c,0xd2,0x7b] + {nf} shrdw $123, %dx, %dx +# CHECK: shrdw $123, %dx, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0x2c,0xd2,0x7b] + shrdw $123, %dx, %dx, %dx +# CHECK: {nf} shrdw $123, %dx, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0x2c,0xd2,0x7b] + {nf} shrdw $123, %dx, %dx, %dx +# CHECK: {evex} shrdw $123, %dx, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} shrdw $123, %dx, 291(%r8,%rax,4) +# CHECK: {nf} shrdw $123, %dx, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrdw $123, %dx, 291(%r8,%rax,4) +# CHECK: shrdw $123, %dx, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + shrdw $123, %dx, 291(%r8,%rax,4), %dx +# CHECK: {nf} shrdw $123, %dx, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrdw $123, %dx, 291(%r8,%rax,4), %dx +# CHECK: {evex} shrdl $123, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x2c,0xc9,0x7b] + {evex} shrdl $123, %ecx, %ecx +# CHECK: {nf} shrdl $123, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x2c,0xc9,0x7b] + {nf} shrdl $123, %ecx, %ecx +# CHECK: shrdl $123, %ecx, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x2c,0xc9,0x7b] + shrdl $123, %ecx, %ecx, %ecx +# CHECK: {nf} shrdl $123, %ecx, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x2c,0xc9,0x7b] + {nf} shrdl $123, %ecx, %ecx, %ecx +# CHECK: {evex} shrdl $123, %ecx, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} shrdl $123, %ecx, 291(%r8,%rax,4) +# CHECK: {nf} shrdl $123, %ecx, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrdl $123, %ecx, 291(%r8,%rax,4) +# CHECK: shrdl $123, %ecx, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + shrdl $123, %ecx, 291(%r8,%rax,4), %ecx +# CHECK: {nf} shrdl $123, %ecx, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrdl $123, %ecx, 291(%r8,%rax,4), %ecx +# CHECK: {evex} shrdq $123, %r9, %r9 +# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x2c,0xc9,0x7b] + {evex} shrdq $123, %r9, %r9 +# CHECK: {nf} shrdq $123, %r9, %r9 +# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x2c,0xc9,0x7b] + {nf} shrdq $123, %r9, %r9 +# CHECK: shrdq $123, %r9, %r9, %r9 +# CHECK: encoding: [0x62,0x54,0xb4,0x18,0x2c,0xc9,0x7b] + shrdq $123, %r9, %r9, %r9 +# CHECK: {nf} shrdq $123, %r9, %r9, %r9 +# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0x2c,0xc9,0x7b] + {nf} shrdq $123, %r9, %r9, %r9 +# CHECK: {evex} shrdq $123, %r9, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} shrdq $123, %r9, 291(%r8,%rax,4) +# CHECK: {nf} shrdq $123, %r9, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrdq $123, %r9, 291(%r8,%rax,4) +# CHECK: shrdq $123, %r9, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0x54,0xb4,0x18,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + shrdq $123, %r9, 291(%r8,%rax,4), %r9 +# CHECK: {nf} shrdq $123, %r9, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrdq $123, %r9, 291(%r8,%rax,4), %r9 +# CHECK: {evex} shrdw %cl, %dx, %dx +# CHECK: encoding: 
[0x62,0xf4,0x7d,0x08,0xad,0xd2] + {evex} shrdw %cl, %dx, %dx +# CHECK: {nf} shrdw %cl, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xad,0xd2] + {nf} shrdw %cl, %dx, %dx +# CHECK: shrdw %cl, %dx, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xad,0xd2] + shrdw %cl, %dx, %dx, %dx +# CHECK: {nf} shrdw %cl, %dx, %dx, %dx +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xad,0xd2] + {nf} shrdw %cl, %dx, %dx, %dx +# CHECK: {evex} shrdw %cl, %dx, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xad,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} shrdw %cl, %dx, 291(%r8,%rax,4) +# CHECK: {nf} shrdw %cl, %dx, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xad,0x94,0x80,0x23,0x01,0x00,0x00] + {nf} shrdw %cl, %dx, 291(%r8,%rax,4) +# CHECK: shrdw %cl, %dx, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xad,0x94,0x80,0x23,0x01,0x00,0x00] + shrdw %cl, %dx, 291(%r8,%rax,4), %dx +# CHECK: {nf} shrdw %cl, %dx, 291(%r8,%rax,4), %dx +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xad,0x94,0x80,0x23,0x01,0x00,0x00] + {nf} shrdw %cl, %dx, 291(%r8,%rax,4), %dx +# CHECK: {evex} shrdl %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xad,0xc9] + {evex} shrdl %cl, %ecx, %ecx +# CHECK: {nf} shrdl %cl, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xad,0xc9] + {nf} shrdl %cl, %ecx, %ecx +# CHECK: shrdl %cl, %ecx, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xad,0xc9] + shrdl %cl, %ecx, %ecx, %ecx +# CHECK: {nf} shrdl %cl, %ecx, %ecx, %ecx +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xad,0xc9] + {nf} shrdl %cl, %ecx, %ecx, %ecx +# CHECK: {evex} shrdl %cl, %ecx, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} shrdl %cl, %ecx, 291(%r8,%rax,4) +# CHECK: {nf} shrdl %cl, %ecx, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} shrdl %cl, %ecx, 291(%r8,%rax,4) +# CHECK: shrdl %cl, %ecx, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + shrdl %cl, %ecx, 291(%r8,%rax,4), %ecx +# CHECK: {nf} shrdl %cl, %ecx, 291(%r8,%rax,4), %ecx +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} shrdl %cl, %ecx, 291(%r8,%rax,4), %ecx +# CHECK: {evex} shrdq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0x54,0xfc,0x08,0xad,0xc9] + {evex} shrdq %cl, %r9, %r9 +# CHECK: {nf} shrdq %cl, %r9, %r9 +# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0xad,0xc9] + {nf} shrdq %cl, %r9, %r9 +# CHECK: shrdq %cl, %r9, %r9, %r9 +# CHECK: encoding: [0x62,0x54,0xb4,0x18,0xad,0xc9] + shrdq %cl, %r9, %r9, %r9 +# CHECK: {nf} shrdq %cl, %r9, %r9, %r9 +# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0xad,0xc9] + {nf} shrdq %cl, %r9, %r9, %r9 +# CHECK: {evex} shrdq %cl, %r9, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0x54,0xfc,0x08,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} shrdq %cl, %r9, 291(%r8,%rax,4) +# CHECK: {nf} shrdq %cl, %r9, 291(%r8,%rax,4) +# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} shrdq %cl, %r9, 291(%r8,%rax,4) +# CHECK: shrdq %cl, %r9, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0x54,0xb4,0x18,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + shrdq %cl, %r9, 291(%r8,%rax,4), %r9 +# CHECK: {nf} shrdq %cl, %r9, 291(%r8,%rax,4), %r9 +# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} shrdq %cl, %r9, 291(%r8,%rax,4), %r9 diff --git a/llvm/test/MC/X86/apx/shrd-intel.s b/llvm/test/MC/X86/apx/shrd-intel.s new file mode 100644 index 0000000000000..d0f14eb78ee89 --- /dev/null +++ 
b/llvm/test/MC/X86/apx/shrd-intel.s @@ -0,0 +1,146 @@ +# RUN: llvm-mc -triple x86_64 -show-encoding -x86-asm-syntax=intel -output-asm-variant=1 %s | FileCheck %s + +# CHECK: {evex} shrd dx, dx, 123 +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0x2c,0xd2,0x7b] + {evex} shrd dx, dx, 123 +# CHECK: {nf} shrd dx, dx, 123 +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0x2c,0xd2,0x7b] + {nf} shrd dx, dx, 123 +# CHECK: shrd dx, dx, dx, 123 +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0x2c,0xd2,0x7b] + shrd dx, dx, dx, 123 +# CHECK: {nf} shrd dx, dx, dx, 123 +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0x2c,0xd2,0x7b] + {nf} shrd dx, dx, dx, 123 +# CHECK: {evex} shrd word ptr [r8 + 4*rax + 291], dx, 123 +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} shrd word ptr [r8 + 4*rax + 291], dx, 123 +# CHECK: {nf} shrd word ptr [r8 + 4*rax + 291], dx, 123 +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrd word ptr [r8 + 4*rax + 291], dx, 123 +# CHECK: shrd dx, word ptr [r8 + 4*rax + 291], dx, 123 +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + shrd dx, word ptr [r8 + 4*rax + 291], dx, 123 +# CHECK: {nf} shrd dx, word ptr [r8 + 4*rax + 291], dx, 123 +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0x2c,0x94,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrd dx, word ptr [r8 + 4*rax + 291], dx, 123 +# CHECK: {evex} shrd ecx, ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0x2c,0xc9,0x7b] + {evex} shrd ecx, ecx, 123 +# CHECK: {nf} shrd ecx, ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0x2c,0xc9,0x7b] + {nf} shrd ecx, ecx, 123 +# CHECK: shrd ecx, ecx, ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0x2c,0xc9,0x7b] + shrd ecx, ecx, ecx, 123 +# CHECK: {nf} shrd ecx, ecx, ecx, 123 +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0x2c,0xc9,0x7b] + {nf} shrd ecx, ecx, ecx, 123 +# CHECK: {evex} shrd dword ptr [r8 + 4*rax + 291], ecx, 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} shrd dword ptr [r8 + 4*rax + 291], ecx, 123 +# CHECK: {nf} shrd dword ptr [r8 + 4*rax + 291], ecx, 123 +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrd dword ptr [r8 + 4*rax + 291], ecx, 123 +# CHECK: shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, 123 +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, 123 +# CHECK: {nf} shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, 123 +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, 123 +# CHECK: {evex} shrd r9, r9, 123 +# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x2c,0xc9,0x7b] + {evex} shrd r9, r9, 123 +# CHECK: {nf} shrd r9, r9, 123 +# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x2c,0xc9,0x7b] + {nf} shrd r9, r9, 123 +# CHECK: shrd r9, r9, r9, 123 +# CHECK: encoding: [0x62,0x54,0xb4,0x18,0x2c,0xc9,0x7b] + shrd r9, r9, r9, 123 +# CHECK: {nf} shrd r9, r9, r9, 123 +# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0x2c,0xc9,0x7b] + {nf} shrd r9, r9, r9, 123 +# CHECK: {evex} shrd qword ptr [r8 + 4*rax + 291], r9, 123 +# CHECK: encoding: [0x62,0x54,0xfc,0x08,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {evex} shrd qword ptr [r8 + 4*rax + 291], r9, 123 +# CHECK: {nf} shrd qword ptr [r8 + 4*rax + 291], r9, 123 +# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrd qword ptr [r8 + 4*rax + 291], r9, 123 +# CHECK: shrd r9, qword ptr [r8 + 4*rax + 291], r9, 123 
+# CHECK: encoding: [0x62,0x54,0xb4,0x18,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + shrd r9, qword ptr [r8 + 4*rax + 291], r9, 123 +# CHECK: {nf} shrd r9, qword ptr [r8 + 4*rax + 291], r9, 123 +# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0x2c,0x8c,0x80,0x23,0x01,0x00,0x00,0x7b] + {nf} shrd r9, qword ptr [r8 + 4*rax + 291], r9, 123 +# CHECK: {evex} shrd dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x7d,0x08,0xad,0xd2] + {evex} shrd dx, dx, cl +# CHECK: {nf} shrd dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x7d,0x0c,0xad,0xd2] + {nf} shrd dx, dx, cl +# CHECK: shrd dx, dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x6d,0x18,0xad,0xd2] + shrd dx, dx, dx, cl +# CHECK: {nf} shrd dx, dx, dx, cl +# CHECK: encoding: [0x62,0xf4,0x6d,0x1c,0xad,0xd2] + {nf} shrd dx, dx, dx, cl +# CHECK: {evex} shrd word ptr [r8 + 4*rax + 291], dx, cl +# CHECK: encoding: [0x62,0xd4,0x7d,0x08,0xad,0x94,0x80,0x23,0x01,0x00,0x00] + {evex} shrd word ptr [r8 + 4*rax + 291], dx, cl +# CHECK: {nf} shrd word ptr [r8 + 4*rax + 291], dx, cl +# CHECK: encoding: [0x62,0xd4,0x7d,0x0c,0xad,0x94,0x80,0x23,0x01,0x00,0x00] + {nf} shrd word ptr [r8 + 4*rax + 291], dx, cl +# CHECK: shrd dx, word ptr [r8 + 4*rax + 291], dx, cl +# CHECK: encoding: [0x62,0xd4,0x6d,0x18,0xad,0x94,0x80,0x23,0x01,0x00,0x00] + shrd dx, word ptr [r8 + 4*rax + 291], dx, cl +# CHECK: {nf} shrd dx, word ptr [r8 + 4*rax + 291], dx, cl +# CHECK: encoding: [0x62,0xd4,0x6d,0x1c,0xad,0x94,0x80,0x23,0x01,0x00,0x00] + {nf} shrd dx, word ptr [r8 + 4*rax + 291], dx, cl +# CHECK: {evex} shrd ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x08,0xad,0xc9] + {evex} shrd ecx, ecx, cl +# CHECK: {nf} shrd ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x7c,0x0c,0xad,0xc9] + {nf} shrd ecx, ecx, cl +# CHECK: shrd ecx, ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x74,0x18,0xad,0xc9] + shrd ecx, ecx, ecx, cl +# CHECK: {nf} shrd ecx, ecx, ecx, cl +# CHECK: encoding: [0x62,0xf4,0x74,0x1c,0xad,0xc9] + {nf} shrd ecx, ecx, ecx, cl +# CHECK: {evex} shrd dword ptr [r8 + 4*rax + 291], ecx, cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x08,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} shrd dword ptr [r8 + 4*rax + 291], ecx, cl +# CHECK: {nf} shrd dword ptr [r8 + 4*rax + 291], ecx, cl +# CHECK: encoding: [0x62,0xd4,0x7c,0x0c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} shrd dword ptr [r8 + 4*rax + 291], ecx, cl +# CHECK: shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, cl +# CHECK: encoding: [0x62,0xd4,0x74,0x18,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, cl +# CHECK: {nf} shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, cl +# CHECK: encoding: [0x62,0xd4,0x74,0x1c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} shrd ecx, dword ptr [r8 + 4*rax + 291], ecx, cl +# CHECK: {evex} shrd r9, r9, cl +# CHECK: encoding: [0x62,0x54,0xfc,0x08,0xad,0xc9] + {evex} shrd r9, r9, cl +# CHECK: {nf} shrd r9, r9, cl +# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0xad,0xc9] + {nf} shrd r9, r9, cl +# CHECK: shrd r9, r9, r9, cl +# CHECK: encoding: [0x62,0x54,0xb4,0x18,0xad,0xc9] + shrd r9, r9, r9, cl +# CHECK: {nf} shrd r9, r9, r9, cl +# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0xad,0xc9] + {nf} shrd r9, r9, r9, cl +# CHECK: {evex} shrd qword ptr [r8 + 4*rax + 291], r9, cl +# CHECK: encoding: [0x62,0x54,0xfc,0x08,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {evex} shrd qword ptr [r8 + 4*rax + 291], r9, cl +# CHECK: {nf} shrd qword ptr [r8 + 4*rax + 291], r9, cl +# CHECK: encoding: [0x62,0x54,0xfc,0x0c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} shrd qword ptr [r8 + 4*rax + 291], r9, cl +# CHECK: shrd r9, qword ptr [r8 + 4*rax + 
291], r9, cl +# CHECK: encoding: [0x62,0x54,0xb4,0x18,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + shrd r9, qword ptr [r8 + 4*rax + 291], r9, cl +# CHECK: {nf} shrd r9, qword ptr [r8 + 4*rax + 291], r9, cl +# CHECK: encoding: [0x62,0x54,0xb4,0x1c,0xad,0x8c,0x80,0x23,0x01,0x00,0x00] + {nf} shrd r9, qword ptr [r8 + 4*rax + 291], r9, cl diff --git a/llvm/test/TableGen/x86-fold-tables.inc b/llvm/test/TableGen/x86-fold-tables.inc index 02e4ae52cc914..9b12e4af00bf7 100644 --- a/llvm/test/TableGen/x86-fold-tables.inc +++ b/llvm/test/TableGen/x86-fold-tables.inc @@ -150,41 +150,77 @@ static const X86FoldTableEntry Table2Addr[] = { {X86::RCR8rCL, X86::RCR8mCL, TB_NO_REVERSE}, {X86::RCR8ri, X86::RCR8mi, TB_NO_REVERSE}, {X86::ROL16r1, X86::ROL16m1, TB_NO_REVERSE}, + {X86::ROL16r1_NF, X86::ROL16m1_NF, TB_NO_REVERSE}, {X86::ROL16rCL, X86::ROL16mCL, TB_NO_REVERSE}, + {X86::ROL16rCL_NF, X86::ROL16mCL_NF, TB_NO_REVERSE}, {X86::ROL16ri, X86::ROL16mi, TB_NO_REVERSE}, + {X86::ROL16ri_NF, X86::ROL16mi_NF, TB_NO_REVERSE}, {X86::ROL32r1, X86::ROL32m1, TB_NO_REVERSE}, + {X86::ROL32r1_NF, X86::ROL32m1_NF, TB_NO_REVERSE}, {X86::ROL32rCL, X86::ROL32mCL, TB_NO_REVERSE}, + {X86::ROL32rCL_NF, X86::ROL32mCL_NF, TB_NO_REVERSE}, {X86::ROL32ri, X86::ROL32mi, TB_NO_REVERSE}, + {X86::ROL32ri_NF, X86::ROL32mi_NF, TB_NO_REVERSE}, {X86::ROL64r1, X86::ROL64m1, TB_NO_REVERSE}, + {X86::ROL64r1_NF, X86::ROL64m1_NF, TB_NO_REVERSE}, {X86::ROL64rCL, X86::ROL64mCL, TB_NO_REVERSE}, + {X86::ROL64rCL_NF, X86::ROL64mCL_NF, TB_NO_REVERSE}, {X86::ROL64ri, X86::ROL64mi, TB_NO_REVERSE}, + {X86::ROL64ri_NF, X86::ROL64mi_NF, TB_NO_REVERSE}, {X86::ROL8r1, X86::ROL8m1, TB_NO_REVERSE}, + {X86::ROL8r1_NF, X86::ROL8m1_NF, TB_NO_REVERSE}, {X86::ROL8rCL, X86::ROL8mCL, TB_NO_REVERSE}, + {X86::ROL8rCL_NF, X86::ROL8mCL_NF, TB_NO_REVERSE}, {X86::ROL8ri, X86::ROL8mi, TB_NO_REVERSE}, + {X86::ROL8ri_NF, X86::ROL8mi_NF, TB_NO_REVERSE}, {X86::ROR16r1, X86::ROR16m1, TB_NO_REVERSE}, + {X86::ROR16r1_NF, X86::ROR16m1_NF, TB_NO_REVERSE}, {X86::ROR16rCL, X86::ROR16mCL, TB_NO_REVERSE}, + {X86::ROR16rCL_NF, X86::ROR16mCL_NF, TB_NO_REVERSE}, {X86::ROR16ri, X86::ROR16mi, TB_NO_REVERSE}, + {X86::ROR16ri_NF, X86::ROR16mi_NF, TB_NO_REVERSE}, {X86::ROR32r1, X86::ROR32m1, TB_NO_REVERSE}, + {X86::ROR32r1_NF, X86::ROR32m1_NF, TB_NO_REVERSE}, {X86::ROR32rCL, X86::ROR32mCL, TB_NO_REVERSE}, + {X86::ROR32rCL_NF, X86::ROR32mCL_NF, TB_NO_REVERSE}, {X86::ROR32ri, X86::ROR32mi, TB_NO_REVERSE}, + {X86::ROR32ri_NF, X86::ROR32mi_NF, TB_NO_REVERSE}, {X86::ROR64r1, X86::ROR64m1, TB_NO_REVERSE}, + {X86::ROR64r1_NF, X86::ROR64m1_NF, TB_NO_REVERSE}, {X86::ROR64rCL, X86::ROR64mCL, TB_NO_REVERSE}, + {X86::ROR64rCL_NF, X86::ROR64mCL_NF, TB_NO_REVERSE}, {X86::ROR64ri, X86::ROR64mi, TB_NO_REVERSE}, + {X86::ROR64ri_NF, X86::ROR64mi_NF, TB_NO_REVERSE}, {X86::ROR8r1, X86::ROR8m1, TB_NO_REVERSE}, + {X86::ROR8r1_NF, X86::ROR8m1_NF, TB_NO_REVERSE}, {X86::ROR8rCL, X86::ROR8mCL, TB_NO_REVERSE}, + {X86::ROR8rCL_NF, X86::ROR8mCL_NF, TB_NO_REVERSE}, {X86::ROR8ri, X86::ROR8mi, TB_NO_REVERSE}, + {X86::ROR8ri_NF, X86::ROR8mi_NF, TB_NO_REVERSE}, {X86::SAR16r1, X86::SAR16m1, TB_NO_REVERSE}, + {X86::SAR16r1_NF, X86::SAR16m1_NF, TB_NO_REVERSE}, {X86::SAR16rCL, X86::SAR16mCL, TB_NO_REVERSE}, + {X86::SAR16rCL_NF, X86::SAR16mCL_NF, TB_NO_REVERSE}, {X86::SAR16ri, X86::SAR16mi, TB_NO_REVERSE}, + {X86::SAR16ri_NF, X86::SAR16mi_NF, TB_NO_REVERSE}, {X86::SAR32r1, X86::SAR32m1, TB_NO_REVERSE}, + {X86::SAR32r1_NF, X86::SAR32m1_NF, TB_NO_REVERSE}, {X86::SAR32rCL, X86::SAR32mCL, TB_NO_REVERSE}, + {X86::SAR32rCL_NF, X86::SAR32mCL_NF, 
TB_NO_REVERSE}, {X86::SAR32ri, X86::SAR32mi, TB_NO_REVERSE}, + {X86::SAR32ri_NF, X86::SAR32mi_NF, TB_NO_REVERSE}, {X86::SAR64r1, X86::SAR64m1, TB_NO_REVERSE}, + {X86::SAR64r1_NF, X86::SAR64m1_NF, TB_NO_REVERSE}, {X86::SAR64rCL, X86::SAR64mCL, TB_NO_REVERSE}, + {X86::SAR64rCL_NF, X86::SAR64mCL_NF, TB_NO_REVERSE}, {X86::SAR64ri, X86::SAR64mi, TB_NO_REVERSE}, + {X86::SAR64ri_NF, X86::SAR64mi_NF, TB_NO_REVERSE}, {X86::SAR8r1, X86::SAR8m1, TB_NO_REVERSE}, + {X86::SAR8r1_NF, X86::SAR8m1_NF, TB_NO_REVERSE}, {X86::SAR8rCL, X86::SAR8mCL, TB_NO_REVERSE}, + {X86::SAR8rCL_NF, X86::SAR8mCL_NF, TB_NO_REVERSE}, {X86::SAR8ri, X86::SAR8mi, TB_NO_REVERSE}, + {X86::SAR8ri_NF, X86::SAR8mi_NF, TB_NO_REVERSE}, {X86::SBB16ri, X86::SBB16mi, TB_NO_REVERSE}, {X86::SBB16ri8, X86::SBB16mi8, TB_NO_REVERSE}, {X86::SBB16rr, X86::SBB16mr, TB_NO_REVERSE}, @@ -198,41 +234,77 @@ static const X86FoldTableEntry Table2Addr[] = { {X86::SBB8ri8, X86::SBB8mi8, TB_NO_REVERSE}, {X86::SBB8rr, X86::SBB8mr, TB_NO_REVERSE}, {X86::SHL16r1, X86::SHL16m1, TB_NO_REVERSE}, + {X86::SHL16r1_NF, X86::SHL16m1_NF, TB_NO_REVERSE}, {X86::SHL16rCL, X86::SHL16mCL, TB_NO_REVERSE}, + {X86::SHL16rCL_NF, X86::SHL16mCL_NF, TB_NO_REVERSE}, {X86::SHL16ri, X86::SHL16mi, TB_NO_REVERSE}, + {X86::SHL16ri_NF, X86::SHL16mi_NF, TB_NO_REVERSE}, {X86::SHL32r1, X86::SHL32m1, TB_NO_REVERSE}, + {X86::SHL32r1_NF, X86::SHL32m1_NF, TB_NO_REVERSE}, {X86::SHL32rCL, X86::SHL32mCL, TB_NO_REVERSE}, + {X86::SHL32rCL_NF, X86::SHL32mCL_NF, TB_NO_REVERSE}, {X86::SHL32ri, X86::SHL32mi, TB_NO_REVERSE}, + {X86::SHL32ri_NF, X86::SHL32mi_NF, TB_NO_REVERSE}, {X86::SHL64r1, X86::SHL64m1, TB_NO_REVERSE}, + {X86::SHL64r1_NF, X86::SHL64m1_NF, TB_NO_REVERSE}, {X86::SHL64rCL, X86::SHL64mCL, TB_NO_REVERSE}, + {X86::SHL64rCL_NF, X86::SHL64mCL_NF, TB_NO_REVERSE}, {X86::SHL64ri, X86::SHL64mi, TB_NO_REVERSE}, + {X86::SHL64ri_NF, X86::SHL64mi_NF, TB_NO_REVERSE}, {X86::SHL8r1, X86::SHL8m1, TB_NO_REVERSE}, + {X86::SHL8r1_NF, X86::SHL8m1_NF, TB_NO_REVERSE}, {X86::SHL8rCL, X86::SHL8mCL, TB_NO_REVERSE}, + {X86::SHL8rCL_NF, X86::SHL8mCL_NF, TB_NO_REVERSE}, {X86::SHL8ri, X86::SHL8mi, TB_NO_REVERSE}, + {X86::SHL8ri_NF, X86::SHL8mi_NF, TB_NO_REVERSE}, {X86::SHLD16rrCL, X86::SHLD16mrCL, TB_NO_REVERSE}, + {X86::SHLD16rrCL_NF, X86::SHLD16mrCL_NF, TB_NO_REVERSE}, {X86::SHLD16rri8, X86::SHLD16mri8, TB_NO_REVERSE}, + {X86::SHLD16rri8_NF, X86::SHLD16mri8_NF, TB_NO_REVERSE}, {X86::SHLD32rrCL, X86::SHLD32mrCL, TB_NO_REVERSE}, + {X86::SHLD32rrCL_NF, X86::SHLD32mrCL_NF, TB_NO_REVERSE}, {X86::SHLD32rri8, X86::SHLD32mri8, TB_NO_REVERSE}, + {X86::SHLD32rri8_NF, X86::SHLD32mri8_NF, TB_NO_REVERSE}, {X86::SHLD64rrCL, X86::SHLD64mrCL, TB_NO_REVERSE}, + {X86::SHLD64rrCL_NF, X86::SHLD64mrCL_NF, TB_NO_REVERSE}, {X86::SHLD64rri8, X86::SHLD64mri8, TB_NO_REVERSE}, + {X86::SHLD64rri8_NF, X86::SHLD64mri8_NF, TB_NO_REVERSE}, {X86::SHR16r1, X86::SHR16m1, TB_NO_REVERSE}, + {X86::SHR16r1_NF, X86::SHR16m1_NF, TB_NO_REVERSE}, {X86::SHR16rCL, X86::SHR16mCL, TB_NO_REVERSE}, + {X86::SHR16rCL_NF, X86::SHR16mCL_NF, TB_NO_REVERSE}, {X86::SHR16ri, X86::SHR16mi, TB_NO_REVERSE}, + {X86::SHR16ri_NF, X86::SHR16mi_NF, TB_NO_REVERSE}, {X86::SHR32r1, X86::SHR32m1, TB_NO_REVERSE}, + {X86::SHR32r1_NF, X86::SHR32m1_NF, TB_NO_REVERSE}, {X86::SHR32rCL, X86::SHR32mCL, TB_NO_REVERSE}, + {X86::SHR32rCL_NF, X86::SHR32mCL_NF, TB_NO_REVERSE}, {X86::SHR32ri, X86::SHR32mi, TB_NO_REVERSE}, + {X86::SHR32ri_NF, X86::SHR32mi_NF, TB_NO_REVERSE}, {X86::SHR64r1, X86::SHR64m1, TB_NO_REVERSE}, + {X86::SHR64r1_NF, X86::SHR64m1_NF, TB_NO_REVERSE}, {X86::SHR64rCL, 
X86::SHR64mCL, TB_NO_REVERSE}, + {X86::SHR64rCL_NF, X86::SHR64mCL_NF, TB_NO_REVERSE}, {X86::SHR64ri, X86::SHR64mi, TB_NO_REVERSE}, + {X86::SHR64ri_NF, X86::SHR64mi_NF, TB_NO_REVERSE}, {X86::SHR8r1, X86::SHR8m1, TB_NO_REVERSE}, + {X86::SHR8r1_NF, X86::SHR8m1_NF, TB_NO_REVERSE}, {X86::SHR8rCL, X86::SHR8mCL, TB_NO_REVERSE}, + {X86::SHR8rCL_NF, X86::SHR8mCL_NF, TB_NO_REVERSE}, {X86::SHR8ri, X86::SHR8mi, TB_NO_REVERSE}, + {X86::SHR8ri_NF, X86::SHR8mi_NF, TB_NO_REVERSE}, {X86::SHRD16rrCL, X86::SHRD16mrCL, TB_NO_REVERSE}, + {X86::SHRD16rrCL_NF, X86::SHRD16mrCL_NF, TB_NO_REVERSE}, {X86::SHRD16rri8, X86::SHRD16mri8, TB_NO_REVERSE}, + {X86::SHRD16rri8_NF, X86::SHRD16mri8_NF, TB_NO_REVERSE}, {X86::SHRD32rrCL, X86::SHRD32mrCL, TB_NO_REVERSE}, + {X86::SHRD32rrCL_NF, X86::SHRD32mrCL_NF, TB_NO_REVERSE}, {X86::SHRD32rri8, X86::SHRD32mri8, TB_NO_REVERSE}, + {X86::SHRD32rri8_NF, X86::SHRD32mri8_NF, TB_NO_REVERSE}, {X86::SHRD64rrCL, X86::SHRD64mrCL, TB_NO_REVERSE}, + {X86::SHRD64rrCL_NF, X86::SHRD64mrCL_NF, TB_NO_REVERSE}, {X86::SHRD64rri8, X86::SHRD64mri8, TB_NO_REVERSE}, + {X86::SHRD64rri8_NF, X86::SHRD64mri8_NF, TB_NO_REVERSE}, {X86::SUB16ri, X86::SUB16mi, TB_NO_REVERSE}, {X86::SUB16ri8, X86::SUB16mi8, TB_NO_REVERSE}, {X86::SUB16ri8_NF, X86::SUB16mi8_NF, TB_NO_REVERSE}, @@ -783,8 +855,80 @@ static const X86FoldTableEntry Table1[] = { {X86::PSHUFLWri, X86::PSHUFLWmi, TB_ALIGN_16}, {X86::PSWAPDrr, X86::PSWAPDrm, 0}, {X86::PTESTrr, X86::PTESTrm, TB_ALIGN_16}, + {X86::RCL16r1_ND, X86::RCL16m1_ND, 0}, + {X86::RCL16rCL_ND, X86::RCL16mCL_ND, 0}, + {X86::RCL16ri_ND, X86::RCL16mi_ND, 0}, + {X86::RCL32r1_ND, X86::RCL32m1_ND, 0}, + {X86::RCL32rCL_ND, X86::RCL32mCL_ND, 0}, + {X86::RCL32ri_ND, X86::RCL32mi_ND, 0}, + {X86::RCL64r1_ND, X86::RCL64m1_ND, 0}, + {X86::RCL64rCL_ND, X86::RCL64mCL_ND, 0}, + {X86::RCL64ri_ND, X86::RCL64mi_ND, 0}, + {X86::RCL8r1_ND, X86::RCL8m1_ND, 0}, + {X86::RCL8rCL_ND, X86::RCL8mCL_ND, 0}, + {X86::RCL8ri_ND, X86::RCL8mi_ND, 0}, {X86::RCPPSr, X86::RCPPSm, TB_ALIGN_16}, {X86::RCPSSr, X86::RCPSSm, 0}, + {X86::RCR16r1_ND, X86::RCR16m1_ND, 0}, + {X86::RCR16rCL_ND, X86::RCR16mCL_ND, 0}, + {X86::RCR16ri_ND, X86::RCR16mi_ND, 0}, + {X86::RCR32r1_ND, X86::RCR32m1_ND, 0}, + {X86::RCR32rCL_ND, X86::RCR32mCL_ND, 0}, + {X86::RCR32ri_ND, X86::RCR32mi_ND, 0}, + {X86::RCR64r1_ND, X86::RCR64m1_ND, 0}, + {X86::RCR64rCL_ND, X86::RCR64mCL_ND, 0}, + {X86::RCR64ri_ND, X86::RCR64mi_ND, 0}, + {X86::RCR8r1_ND, X86::RCR8m1_ND, 0}, + {X86::RCR8rCL_ND, X86::RCR8mCL_ND, 0}, + {X86::RCR8ri_ND, X86::RCR8mi_ND, 0}, + {X86::ROL16r1_ND, X86::ROL16m1_ND, 0}, + {X86::ROL16r1_NF_ND, X86::ROL16m1_NF_ND, 0}, + {X86::ROL16rCL_ND, X86::ROL16mCL_ND, 0}, + {X86::ROL16rCL_NF_ND, X86::ROL16mCL_NF_ND, 0}, + {X86::ROL16ri_ND, X86::ROL16mi_ND, 0}, + {X86::ROL16ri_NF_ND, X86::ROL16mi_NF_ND, 0}, + {X86::ROL32r1_ND, X86::ROL32m1_ND, 0}, + {X86::ROL32r1_NF_ND, X86::ROL32m1_NF_ND, 0}, + {X86::ROL32rCL_ND, X86::ROL32mCL_ND, 0}, + {X86::ROL32rCL_NF_ND, X86::ROL32mCL_NF_ND, 0}, + {X86::ROL32ri_ND, X86::ROL32mi_ND, 0}, + {X86::ROL32ri_NF_ND, X86::ROL32mi_NF_ND, 0}, + {X86::ROL64r1_ND, X86::ROL64m1_ND, 0}, + {X86::ROL64r1_NF_ND, X86::ROL64m1_NF_ND, 0}, + {X86::ROL64rCL_ND, X86::ROL64mCL_ND, 0}, + {X86::ROL64rCL_NF_ND, X86::ROL64mCL_NF_ND, 0}, + {X86::ROL64ri_ND, X86::ROL64mi_ND, 0}, + {X86::ROL64ri_NF_ND, X86::ROL64mi_NF_ND, 0}, + {X86::ROL8r1_ND, X86::ROL8m1_ND, 0}, + {X86::ROL8r1_NF_ND, X86::ROL8m1_NF_ND, 0}, + {X86::ROL8rCL_ND, X86::ROL8mCL_ND, 0}, + {X86::ROL8rCL_NF_ND, X86::ROL8mCL_NF_ND, 0}, + {X86::ROL8ri_ND, X86::ROL8mi_ND, 0}, + 
{X86::ROL8ri_NF_ND, X86::ROL8mi_NF_ND, 0}, + {X86::ROR16r1_ND, X86::ROR16m1_ND, 0}, + {X86::ROR16r1_NF_ND, X86::ROR16m1_NF_ND, 0}, + {X86::ROR16rCL_ND, X86::ROR16mCL_ND, 0}, + {X86::ROR16rCL_NF_ND, X86::ROR16mCL_NF_ND, 0}, + {X86::ROR16ri_ND, X86::ROR16mi_ND, 0}, + {X86::ROR16ri_NF_ND, X86::ROR16mi_NF_ND, 0}, + {X86::ROR32r1_ND, X86::ROR32m1_ND, 0}, + {X86::ROR32r1_NF_ND, X86::ROR32m1_NF_ND, 0}, + {X86::ROR32rCL_ND, X86::ROR32mCL_ND, 0}, + {X86::ROR32rCL_NF_ND, X86::ROR32mCL_NF_ND, 0}, + {X86::ROR32ri_ND, X86::ROR32mi_ND, 0}, + {X86::ROR32ri_NF_ND, X86::ROR32mi_NF_ND, 0}, + {X86::ROR64r1_ND, X86::ROR64m1_ND, 0}, + {X86::ROR64r1_NF_ND, X86::ROR64m1_NF_ND, 0}, + {X86::ROR64rCL_ND, X86::ROR64mCL_ND, 0}, + {X86::ROR64rCL_NF_ND, X86::ROR64mCL_NF_ND, 0}, + {X86::ROR64ri_ND, X86::ROR64mi_ND, 0}, + {X86::ROR64ri_NF_ND, X86::ROR64mi_NF_ND, 0}, + {X86::ROR8r1_ND, X86::ROR8m1_ND, 0}, + {X86::ROR8r1_NF_ND, X86::ROR8m1_NF_ND, 0}, + {X86::ROR8rCL_ND, X86::ROR8mCL_ND, 0}, + {X86::ROR8rCL_NF_ND, X86::ROR8mCL_NF_ND, 0}, + {X86::ROR8ri_ND, X86::ROR8mi_ND, 0}, + {X86::ROR8ri_NF_ND, X86::ROR8mi_NF_ND, 0}, {X86::RORX32ri, X86::RORX32mi, 0}, {X86::RORX32ri_EVEX, X86::RORX32mi_EVEX, 0}, {X86::RORX64ri, X86::RORX64mi, 0}, @@ -795,6 +939,30 @@ static const X86FoldTableEntry Table1[] = { {X86::ROUNDSSr, X86::ROUNDSSm, 0}, {X86::RSQRTPSr, X86::RSQRTPSm, TB_ALIGN_16}, {X86::RSQRTSSr, X86::RSQRTSSm, 0}, + {X86::SAR16r1_ND, X86::SAR16m1_ND, 0}, + {X86::SAR16r1_NF_ND, X86::SAR16m1_NF_ND, 0}, + {X86::SAR16rCL_ND, X86::SAR16mCL_ND, 0}, + {X86::SAR16rCL_NF_ND, X86::SAR16mCL_NF_ND, 0}, + {X86::SAR16ri_ND, X86::SAR16mi_ND, 0}, + {X86::SAR16ri_NF_ND, X86::SAR16mi_NF_ND, 0}, + {X86::SAR32r1_ND, X86::SAR32m1_ND, 0}, + {X86::SAR32r1_NF_ND, X86::SAR32m1_NF_ND, 0}, + {X86::SAR32rCL_ND, X86::SAR32mCL_ND, 0}, + {X86::SAR32rCL_NF_ND, X86::SAR32mCL_NF_ND, 0}, + {X86::SAR32ri_ND, X86::SAR32mi_ND, 0}, + {X86::SAR32ri_NF_ND, X86::SAR32mi_NF_ND, 0}, + {X86::SAR64r1_ND, X86::SAR64m1_ND, 0}, + {X86::SAR64r1_NF_ND, X86::SAR64m1_NF_ND, 0}, + {X86::SAR64rCL_ND, X86::SAR64mCL_ND, 0}, + {X86::SAR64rCL_NF_ND, X86::SAR64mCL_NF_ND, 0}, + {X86::SAR64ri_ND, X86::SAR64mi_ND, 0}, + {X86::SAR64ri_NF_ND, X86::SAR64mi_NF_ND, 0}, + {X86::SAR8r1_ND, X86::SAR8m1_ND, 0}, + {X86::SAR8r1_NF_ND, X86::SAR8m1_NF_ND, 0}, + {X86::SAR8rCL_ND, X86::SAR8mCL_ND, 0}, + {X86::SAR8rCL_NF_ND, X86::SAR8mCL_NF_ND, 0}, + {X86::SAR8ri_ND, X86::SAR8mi_ND, 0}, + {X86::SAR8ri_NF_ND, X86::SAR8mi_NF_ND, 0}, {X86::SARX32rr, X86::SARX32rm, 0}, {X86::SARX32rr_EVEX, X86::SARX32rm_EVEX, 0}, {X86::SARX64rr, X86::SARX64rm, 0}, @@ -810,10 +978,82 @@ static const X86FoldTableEntry Table1[] = { {X86::SBB64rr_ND, X86::SBB64mr_ND, 0}, {X86::SBB8ri_ND, X86::SBB8mi_ND, 0}, {X86::SBB8rr_ND, X86::SBB8mr_ND, 0}, + {X86::SHL16r1_ND, X86::SHL16m1_ND, 0}, + {X86::SHL16r1_NF_ND, X86::SHL16m1_NF_ND, 0}, + {X86::SHL16rCL_ND, X86::SHL16mCL_ND, 0}, + {X86::SHL16rCL_NF_ND, X86::SHL16mCL_NF_ND, 0}, + {X86::SHL16ri_ND, X86::SHL16mi_ND, 0}, + {X86::SHL16ri_NF_ND, X86::SHL16mi_NF_ND, 0}, + {X86::SHL32r1_ND, X86::SHL32m1_ND, 0}, + {X86::SHL32r1_NF_ND, X86::SHL32m1_NF_ND, 0}, + {X86::SHL32rCL_ND, X86::SHL32mCL_ND, 0}, + {X86::SHL32rCL_NF_ND, X86::SHL32mCL_NF_ND, 0}, + {X86::SHL32ri_ND, X86::SHL32mi_ND, 0}, + {X86::SHL32ri_NF_ND, X86::SHL32mi_NF_ND, 0}, + {X86::SHL64r1_ND, X86::SHL64m1_ND, 0}, + {X86::SHL64r1_NF_ND, X86::SHL64m1_NF_ND, 0}, + {X86::SHL64rCL_ND, X86::SHL64mCL_ND, 0}, + {X86::SHL64rCL_NF_ND, X86::SHL64mCL_NF_ND, 0}, + {X86::SHL64ri_ND, X86::SHL64mi_ND, 0}, + {X86::SHL64ri_NF_ND, X86::SHL64mi_NF_ND, 
0}, + {X86::SHL8r1_ND, X86::SHL8m1_ND, 0}, + {X86::SHL8r1_NF_ND, X86::SHL8m1_NF_ND, 0}, + {X86::SHL8rCL_ND, X86::SHL8mCL_ND, 0}, + {X86::SHL8rCL_NF_ND, X86::SHL8mCL_NF_ND, 0}, + {X86::SHL8ri_ND, X86::SHL8mi_ND, 0}, + {X86::SHL8ri_NF_ND, X86::SHL8mi_NF_ND, 0}, + {X86::SHLD16rrCL_ND, X86::SHLD16mrCL_ND, 0}, + {X86::SHLD16rrCL_NF_ND, X86::SHLD16mrCL_NF_ND, 0}, + {X86::SHLD16rri8_ND, X86::SHLD16mri8_ND, 0}, + {X86::SHLD16rri8_NF_ND, X86::SHLD16mri8_NF_ND, 0}, + {X86::SHLD32rrCL_ND, X86::SHLD32mrCL_ND, 0}, + {X86::SHLD32rrCL_NF_ND, X86::SHLD32mrCL_NF_ND, 0}, + {X86::SHLD32rri8_ND, X86::SHLD32mri8_ND, 0}, + {X86::SHLD32rri8_NF_ND, X86::SHLD32mri8_NF_ND, 0}, + {X86::SHLD64rrCL_ND, X86::SHLD64mrCL_ND, 0}, + {X86::SHLD64rrCL_NF_ND, X86::SHLD64mrCL_NF_ND, 0}, + {X86::SHLD64rri8_ND, X86::SHLD64mri8_ND, 0}, + {X86::SHLD64rri8_NF_ND, X86::SHLD64mri8_NF_ND, 0}, {X86::SHLX32rr, X86::SHLX32rm, 0}, {X86::SHLX32rr_EVEX, X86::SHLX32rm_EVEX, 0}, {X86::SHLX64rr, X86::SHLX64rm, 0}, {X86::SHLX64rr_EVEX, X86::SHLX64rm_EVEX, 0}, + {X86::SHR16r1_ND, X86::SHR16m1_ND, 0}, + {X86::SHR16r1_NF_ND, X86::SHR16m1_NF_ND, 0}, + {X86::SHR16rCL_ND, X86::SHR16mCL_ND, 0}, + {X86::SHR16rCL_NF_ND, X86::SHR16mCL_NF_ND, 0}, + {X86::SHR16ri_ND, X86::SHR16mi_ND, 0}, + {X86::SHR16ri_NF_ND, X86::SHR16mi_NF_ND, 0}, + {X86::SHR32r1_ND, X86::SHR32m1_ND, 0}, + {X86::SHR32r1_NF_ND, X86::SHR32m1_NF_ND, 0}, + {X86::SHR32rCL_ND, X86::SHR32mCL_ND, 0}, + {X86::SHR32rCL_NF_ND, X86::SHR32mCL_NF_ND, 0}, + {X86::SHR32ri_ND, X86::SHR32mi_ND, 0}, + {X86::SHR32ri_NF_ND, X86::SHR32mi_NF_ND, 0}, + {X86::SHR64r1_ND, X86::SHR64m1_ND, 0}, + {X86::SHR64r1_NF_ND, X86::SHR64m1_NF_ND, 0}, + {X86::SHR64rCL_ND, X86::SHR64mCL_ND, 0}, + {X86::SHR64rCL_NF_ND, X86::SHR64mCL_NF_ND, 0}, + {X86::SHR64ri_ND, X86::SHR64mi_ND, 0}, + {X86::SHR64ri_NF_ND, X86::SHR64mi_NF_ND, 0}, + {X86::SHR8r1_ND, X86::SHR8m1_ND, 0}, + {X86::SHR8r1_NF_ND, X86::SHR8m1_NF_ND, 0}, + {X86::SHR8rCL_ND, X86::SHR8mCL_ND, 0}, + {X86::SHR8rCL_NF_ND, X86::SHR8mCL_NF_ND, 0}, + {X86::SHR8ri_ND, X86::SHR8mi_ND, 0}, + {X86::SHR8ri_NF_ND, X86::SHR8mi_NF_ND, 0}, + {X86::SHRD16rrCL_ND, X86::SHRD16mrCL_ND, 0}, + {X86::SHRD16rrCL_NF_ND, X86::SHRD16mrCL_NF_ND, 0}, + {X86::SHRD16rri8_ND, X86::SHRD16mri8_ND, 0}, + {X86::SHRD16rri8_NF_ND, X86::SHRD16mri8_NF_ND, 0}, + {X86::SHRD32rrCL_ND, X86::SHRD32mrCL_ND, 0}, + {X86::SHRD32rrCL_NF_ND, X86::SHRD32mrCL_NF_ND, 0}, + {X86::SHRD32rri8_ND, X86::SHRD32mri8_ND, 0}, + {X86::SHRD32rri8_NF_ND, X86::SHRD32mri8_NF_ND, 0}, + {X86::SHRD64rrCL_ND, X86::SHRD64mrCL_ND, 0}, + {X86::SHRD64rrCL_NF_ND, X86::SHRD64mrCL_NF_ND, 0}, + {X86::SHRD64rri8_ND, X86::SHRD64mri8_ND, 0}, + {X86::SHRD64rri8_NF_ND, X86::SHRD64mri8_NF_ND, 0}, {X86::SHRX32rr, X86::SHRX32rm, 0}, {X86::SHRX32rr_EVEX, X86::SHRX32rm_EVEX, 0}, {X86::SHRX64rr, X86::SHRX64rm, 0}, From afc229b217f499160b3228a0d7d1783613ac9b58 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 22 Jan 2024 21:31:36 -0500 Subject: [PATCH 532/843] Revert "[gn] port 3ab8d2aac7bc" This reverts commit 9a03d94a4db6b6d3b7cb1582e9e9cc239e737a30. 3ab8d2aac7bc was reverted in bffd80d6df61. 
---
 llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
index e82ec494993d0..bfe98652d0baf 100644
--- a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
@@ -64,7 +64,6 @@ write_cmake_config("lit_common_configured") {
     "SANITIZER_CAN_USE_CXXABI_PYBOOL=True",
     "SANITIZER_USE_STATIC_CXX_ABI_PYBOOL=False",
     "SANITIZER_USE_STATIC_LLVM_UNWINDER_PYBOOL=False",
-    "COMPILER_RT_HAS_AARCH64_SME=False",
     "COMPILER_RT_HAS_LLD_PYBOOL=True",
     "COMPILER_RT_HAS_GWP_ASAN_PYBOOL=False",
     "HAVE_RPC_XDR_H=0",

From 9577806b1e6ce06bece48d96b39035f4cee9137d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?=
Date: Tue, 23 Jan 2024 03:37:32 +0100
Subject: [PATCH 533/843] [JITLink][AArch32] Implement R_ARM_PREL31 and
 process .ARM.exidx sections (#79044)

`R_ARM_PREL31` is a 31-bit relative data relocation whose most-significant
bit is preserved. It is used primarily in `.ARM.exidx` sections, which we
skipped processing until now because we didn't support the relocation type.
This was implemented in RuntimeDyld in https://reviews.llvm.org/D25069, and
I implemented it in a similar way in JITLink in order to reach feature
parity.

---
 .../llvm/ExecutionEngine/JITLink/aarch32.h    |  3 +++
 .../JITLink/ELFLinkGraphBuilder.h             |  6 ++++++
 .../ExecutionEngine/JITLink/ELF_aarch32.cpp   | 14 ++++---------
 llvm/lib/ExecutionEngine/JITLink/aarch32.cpp  | 21 ++++++++++++++++---
 .../JITLink/AArch32/ELF_relocations_data.s    | 20 ++++++++++++++----
 5 files changed, 47 insertions(+), 17 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h
index ed53fa409ade8..ef28ba88431e4 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h
@@ -41,6 +41,9 @@ enum EdgeKind_aarch32 : Edge::Kind {
   /// Absolute 32-bit value relocation
   Data_Pointer32,

+  /// Relative 31-bit value relocation that preserves the most-significant bit
+  Data_PRel31,
+
   /// Create GOT entry and store offset
   Data_RequestGOTAndTransformToDelta32,

diff --git a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
index 56d1efa4bdef7..e1b11dfcfc211 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/ELFLinkGraphBuilder.h
@@ -397,6 +397,12 @@ template Error ELFLinkGraphBuilder::graphifySections() {
                                  orc::ExecutorAddr(Sec.sh_addr),
                                  Sec.sh_addralign, 0);

+    if (Sec.sh_type == ELF::SHT_ARM_EXIDX) {
+      // Add live symbol to avoid dead-stripping for .ARM.exidx sections
+      G->addAnonymousSymbol(*B, orc::ExecutorAddrDiff(),
+                            orc::ExecutorAddrDiff(), false, true);
+    }
+
     setGraphBlock(SecIndex, B);
   }

diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp
index 15c209e1ebe5b..dfe6ab14adc27 100644
--- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp
@@ -50,6 +50,8 @@ getJITLinkEdgeKind(uint32_t ELFType, const aarch32::ArmConfig &ArmCfg) {
     return aarch32::Arm_MovtAbs;
   case ELF::R_ARM_NONE:
     return aarch32::None;
+  case ELF::R_ARM_PREL31:
+    return aarch32::Data_PRel31;
   case ELF::R_ARM_TARGET1:
     return (ArmCfg.Target1Rel) ? aarch32::Data_Delta32
                                : aarch32::Data_Pointer32;
@@ -79,6 +81,8 @@ Expected getELFRelocationType(Edge::Kind Kind) {
     return ELF::R_ARM_REL32;
   case aarch32::Data_Pointer32:
     return ELF::R_ARM_ABS32;
+  case aarch32::Data_PRel31:
+    return ELF::R_ARM_PREL31;
   case aarch32::Data_RequestGOTAndTransformToDelta32:
     return ELF::R_ARM_GOT_PREL;
   case aarch32::Arm_Call:
@@ -140,16 +144,6 @@ class ELFLinkGraphBuilder_aarch32
   using ELFT = ELFType;
   using Base = ELFLinkGraphBuilder;

-  bool excludeSection(const typename ELFT::Shdr &Sect) const override {
-    // TODO: An .ARM.exidx (Exception Index table) entry is 8-bytes in size and
-    // consists of 2 words. It might be sufficient to process only relocations
-    // in the the second word (offset 4). Please find more details in: Exception
-    // Handling ABI for the Arm® Architecture -> Index table entries
-    if (Sect.sh_type == ELF::SHT_ARM_EXIDX)
-      return true;
-    return false;
-  }
-
   Error addRelocations() override {
     LLVM_DEBUG(dbgs() << "Processing relocations:\n");
     using Self = ELFLinkGraphBuilder_aarch32;
diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
index 9508cde07b42a..96dff7656f17d 100644
--- a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp
@@ -398,6 +398,8 @@ Expected readAddendData(LinkGraph &G, Block &B, Edge::OffsetT Offset,
   case Data_Pointer32:
   case Data_RequestGOTAndTransformToDelta32:
     return SignExtend64<32>(support::endian::read32(FixupPtr, Endian));
+  case Data_PRel31:
+    return SignExtend64<31>(support::endian::read32(FixupPtr, Endian));
   default:
     return make_error(
         "In graph " + G.getName() + ", section " + B.getSection().getName() +
@@ -472,9 +474,8 @@ Error applyFixupData(LinkGraph &G, Block &B, const Edge &E) {
   Symbol &TargetSymbol = E.getTarget();
   uint64_t TargetAddress = TargetSymbol.getAddress().getValue();

-  // Regular data relocations have size 4, alignment 1 and write the full 32-bit
-  // result to the place; no need for overflow checking. There are three
-  // exceptions: R_ARM_ABS8, R_ARM_ABS16, R_ARM_PREL31
+  // Data relocations have alignment 1, size 4 (except R_ARM_ABS8 and
+  // R_ARM_ABS16) and write the full 32-bit result (except R_ARM_PREL31).
   switch (Kind) {
   case Data_Delta32: {
     int64_t Value = TargetAddress - FixupAddress + Addend;
@@ -496,6 +497,19 @@ Error applyFixupData(LinkGraph &G, Block &B, const Edge &E) {
     endian::write32be(FixupPtr, Value);
     return Error::success();
   }
+  case Data_PRel31: {
+    int64_t Value = TargetAddress - FixupAddress + Addend;
+    if (!isInt<31>(Value))
+      return makeTargetOutOfRangeError(G, B, E);
+    if (LLVM_LIKELY(G.getEndianness() == endianness::little)) {
+      uint32_t MSB = endian::read32le(FixupPtr) & 0x80000000;
+      endian::write32le(FixupPtr, MSB | (Value & ~0x80000000));
+    } else {
+      uint32_t MSB = endian::read32be(FixupPtr) & 0x80000000;
+      endian::write32be(FixupPtr, MSB | (Value & ~0x80000000));
+    }
+    return Error::success();
+  }
   case Data_RequestGOTAndTransformToDelta32:
     llvm_unreachable("Should be transformed");
   default:
@@ -856,6 +870,7 @@ const char *getEdgeKindName(Edge::Kind K) {
   switch (K) {
     KIND_NAME_CASE(Data_Delta32)
     KIND_NAME_CASE(Data_Pointer32)
+    KIND_NAME_CASE(Data_PRel31)
     KIND_NAME_CASE(Data_RequestGOTAndTransformToDelta32)
     KIND_NAME_CASE(Arm_Call)
     KIND_NAME_CASE(Arm_Jump24)
diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s
index 7bd59f8a52de6..544e8c259d286 100644
--- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s
+++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s
@@ -64,7 +64,7 @@ got_prel_offset:
 .size got_prel_offset, .-got_prel_offset
 .size got_prel, .-got_prel

-# EH personality routine
+# EH personality routine in .ARM.exidx
 # CHECK-TYPE: {{[0-9a-f]+}} R_ARM_NONE __aeabi_unwind_cpp_pr0
 .globl __aeabi_unwind_cpp_pr0
 .type __aeabi_unwind_cpp_pr0,%function
@@ -72,8 +72,19 @@ got_prel_offset:
 __aeabi_unwind_cpp_pr0:
   bx lr

-# Generate reference to EH personality (right now we ignore the resulting
-# R_ARM_PREL31 relocation since it's in .ARM.exidx)
+# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_PREL31 .text
+#
+# An .ARM.exidx table entry is 8-bytes in size, made up of 2 4-byte entries.
+# First word contains offset to function for this entry:
+# jitlink-check: (*{4}section_addr(out.o, .ARM.exidx))[31:0] = prel31 - section_addr(out.o, .ARM.exidx)
+#
+# Most-significant bit in second word denotes inline entry when set (and
+# relocation to .ARM.extab otherwise). Inline entry with compact model index 0:
+#   0x9b      vsp = r11
+#   0x84 0x80 pop {r11, r14}
+#
+# jitlink-check: *{4}(section_addr(out.o, .ARM.exidx) + 4) = 0x809b8480
+#
 .globl prel31
 .type prel31,%function
 .align 2
@@ -85,8 +96,8 @@ prel31:
   mov r11, sp
   pop {r11, lr}
   mov pc, lr
-  .size prel31,.-prel31
 .fnend
+  .size prel31,.-prel31

 # This test is executable with any 4-byte external target:
 # > echo "unsigned target = 42;" | clang -target armv7-linux-gnueabihf -o target.o -c -xc -
@@ -98,5 +109,6 @@ prel31:
 main:
   push {lr}
   bl got_prel
+  bl prel31
   pop {pc}
 .size main, .-main

From 62eb65b36f8e8fa121de047c89fb7dbe5d4c96be Mon Sep 17 00:00:00 2001
From: Sean Fertile <35576261+mandlebug@users.noreply.github.com>
Date: Mon, 22 Jan 2024 21:41:19 -0500
Subject: [PATCH 534/843] [FatLTO] output of -ffat-lto-objects -S should be
 assembly. (#79041)

Fat LTO with -c compiles to an object file with the IR embedded in a
section of the object; the combination of fat LTO with -S should then
produce an assembly file equivalent of that. The IR output can still be
generated by using both -S and -emit-llvm.
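To make the resulting behavior concrete, here is a minimal standalone sketch
of the output-type selection this change introduces for the Backend phase. It
restates the Driver.cpp hunk below under simplifying assumptions: OutputKind
and pickBackendOutput are illustrative names, not clang APIs, and the real
logic also depends on the LTO mode and the offload kind.

    #include <cassert>

    // Stand-ins for the clang types::ID values used in the hunk below.
    enum class OutputKind { LTO_IR, PP_Asm, LTO_BC };

    // Fat LTO is decided first: with -emit-llvm it yields LLVM IR, otherwise
    // assembly (which the driver assembles into an object unless -S is
    // given). Plain LTO with -S emits LTO IR; the default is LTO bitcode.
    OutputKind pickBackendOutput(bool FatLTOObjects, bool EmitLLVM, bool S) {
      if (FatLTOObjects)
        return EmitLLVM ? OutputKind::LTO_IR : OutputKind::PP_Asm;
      if (S)
        return OutputKind::LTO_IR;
      return OutputKind::LTO_BC;
    }

    int main() {
      // -flto -ffat-lto-objects -S            -> assembly
      assert(pickBackendOutput(true, false, true) == OutputKind::PP_Asm);
      // -flto -ffat-lto-objects -S -emit-llvm -> LLVM IR
      assert(pickBackendOutput(true, true, true) == OutputKind::LTO_IR);
      // -flto -S                              -> LLVM IR
      assert(pickBackendOutput(false, false, true) == OutputKind::LTO_IR);
      return 0;
    }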
---
 clang/lib/Driver/Driver.cpp         |  7 ++++---
 clang/test/Driver/fat-lto-objects.c | 19 ++++++++++++++++---
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp
index 1c32370208230..5c170915d58d5 100644
--- a/clang/lib/Driver/Driver.cpp
+++ b/clang/lib/Driver/Driver.cpp
@@ -4770,10 +4770,11 @@ Action *Driver::ConstructPhaseAction(
   case phases::Backend: {
     if (isUsingLTO() && TargetDeviceOffloadKind == Action::OFK_None) {
       types::ID Output;
-      if (Args.hasArg(options::OPT_S))
+      if (Args.hasArg(options::OPT_ffat_lto_objects))
+        Output = Args.hasArg(options::OPT_emit_llvm) ? types::TY_LTO_IR
+                                                     : types::TY_PP_Asm;
+      else if (Args.hasArg(options::OPT_S))
         Output = types::TY_LTO_IR;
-      else if (Args.hasArg(options::OPT_ffat_lto_objects))
-        Output = types::TY_PP_Asm;
       else
         Output = types::TY_LTO_BC;
       return C.MakeAction(Input, Output);
diff --git a/clang/test/Driver/fat-lto-objects.c b/clang/test/Driver/fat-lto-objects.c
index e02359db3f0ae..203175d61b73d 100644
--- a/clang/test/Driver/fat-lto-objects.c
+++ b/clang/test/Driver/fat-lto-objects.c
@@ -12,14 +12,27 @@
 // CHECK-CC-S-NOT: -emit-llvm
 // CHECK-CC-S-NOT: -ffat-lto-objects

-/// When LTO is enabled, we expect LLVM IR output and -ffat-lto-objects to be passed to cc1.
+/// When fat LTO is enabled with -S, we expect asm output and -ffat-lto-objects to be passed to cc1.
 // RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -S 2>&1 | FileCheck %s -check-prefix=CHECK-CC-S-LTO
-// RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -S -emit-llvm 2>&1 | FileCheck %s -check-prefix=CHECK-CC-S-LTO
 // CHECK-CC-S-LTO: -cc1
 // CHECK-CC-S-LTO-SAME: -funified-lto
-// CHECK-CC-S-LTO-SAME: -emit-llvm
+// CHECK-CC-S-LTO-NOT: -emit-llvm
 // CHECK-CC-S-LTO-SAME: -ffat-lto-objects

+/// When fat LTO is enabled with -S and -emit-llvm, we expect IR output and -ffat-lto-objects to be passed to cc1.
+// RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -S -emit-llvm 2>&1 | FileCheck %s -check-prefix=CHECK-CC-S-EL-LTO
+// CHECK-CC-S-EL-LTO: -cc1
+// CHECK-CC-S-EL-LTO-SAME: -funified-lto
+// CHECK-CC-S-EL-LTO-SAME: -emit-llvm
+// CHECK-CC-S-EL-LTO-SAME: -ffat-lto-objects
+
+/// When fat LTO is enabled without -S, we expect native object output and -ffat-lto-objects to be passed to cc1.
+// RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -c 2>&1 | FileCheck %s -check-prefix=CHECK-CC-C-LTO
+// CHECK-CC-C-LTO: -cc1
+// CHECK-CC-C-LTO: -funified-lto
+// CHECK-CC-C-LTO: -emit-obj
+// CHECK-CC-C-LTO: -ffat-lto-objects
+
 /// Make sure we don't have a warning for -ffat-lto-objects being unused
 // RUN: %clang --target=x86_64-unknown-linux-gnu -ffat-lto-objects -fdriver-only -Werror -v %s -c 2>&1 | FileCheck %s -check-prefix=CHECK-CC-NOLTO
 // CHECK-CC-NOLTO: -cc1

From 23edf782a268e750b18f720fa082a6891ae9d304 Mon Sep 17 00:00:00 2001
From: Petr Hosek
Date: Mon, 22 Jan 2024 18:46:48 -0800
Subject: [PATCH 535/843] [libc] Include missing RISC-V stdlib.h and math.h
 entrypoints (#79034)

This matches the entrypoints for baremetal ARM.
--- libc/config/baremetal/riscv/entrypoints.txt | 110 +++++++++ libc/docs/math/index.rst | 236 ++++++++++---------- 2 files changed, 228 insertions(+), 118 deletions(-) diff --git a/libc/config/baremetal/riscv/entrypoints.txt b/libc/config/baremetal/riscv/entrypoints.txt index 5da755170fda9..f725b1c2394c6 100644 --- a/libc/config/baremetal/riscv/entrypoints.txt +++ b/libc/config/baremetal/riscv/entrypoints.txt @@ -77,6 +77,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.abort libc.src.stdlib.abs libc.src.stdlib.atoi + libc.src.stdlib.atof libc.src.stdlib.atol libc.src.stdlib.atoll libc.src.stdlib.bsearch @@ -86,7 +87,10 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdlib.llabs libc.src.stdlib.lldiv libc.src.stdlib.qsort + libc.src.stdlib.strtod + libc.src.stdlib.strtof libc.src.stdlib.strtol + libc.src.stdlib.strtold libc.src.stdlib.strtoll libc.src.stdlib.strtoul libc.src.stdlib.strtoull @@ -110,18 +114,124 @@ set(TARGET_LIBM_ENTRYPOINTS libc.src.fenv.feupdateenv # math.h entrypoints + libc.src.math.acosf + libc.src.math.acoshf + libc.src.math.asinf + libc.src.math.asinhf + libc.src.math.atanf + libc.src.math.atanhf + libc.src.math.ceil + libc.src.math.ceilf + libc.src.math.ceill + libc.src.math.copysign + libc.src.math.copysignf + libc.src.math.copysignl + libc.src.math.cosf + libc.src.math.coshf + libc.src.math.erff + libc.src.math.exp + libc.src.math.exp10 + libc.src.math.exp10f + libc.src.math.exp2 + libc.src.math.exp2f + libc.src.math.expf + libc.src.math.expm1 + libc.src.math.expm1f libc.src.math.fabs libc.src.math.fabsf libc.src.math.fabsl libc.src.math.fdim libc.src.math.fdimf libc.src.math.fdiml + libc.src.math.floor + libc.src.math.floorf + libc.src.math.floorl + libc.src.math.fma + libc.src.math.fmaf libc.src.math.fmax libc.src.math.fmaxf libc.src.math.fmaxl libc.src.math.fmin libc.src.math.fminf libc.src.math.fminl + libc.src.math.fmod + libc.src.math.fmodf + libc.src.math.frexp + libc.src.math.frexpf + libc.src.math.frexpl + libc.src.math.hypot + libc.src.math.hypotf + libc.src.math.ilogb + libc.src.math.ilogbf + libc.src.math.ilogbl + libc.src.math.ldexp + libc.src.math.ldexpf + libc.src.math.ldexpl + libc.src.math.llrint + libc.src.math.llrintf + libc.src.math.llrintl + libc.src.math.llround + libc.src.math.llroundf + libc.src.math.llroundl + libc.src.math.log + libc.src.math.log10 + libc.src.math.log10f + libc.src.math.log1p + libc.src.math.log1pf + libc.src.math.log2 + libc.src.math.log2f + libc.src.math.logb + libc.src.math.logbf + libc.src.math.logbl + libc.src.math.logf + libc.src.math.lrint + libc.src.math.lrintf + libc.src.math.lrintl + libc.src.math.lround + libc.src.math.lroundf + libc.src.math.lroundl + libc.src.math.modf + libc.src.math.modff + libc.src.math.modfl + libc.src.math.nan + libc.src.math.nanf + libc.src.math.nanl + libc.src.math.nearbyint + libc.src.math.nearbyintf + libc.src.math.nearbyintl + libc.src.math.nextafter + libc.src.math.nextafterf + libc.src.math.nextafterl + libc.src.math.nexttoward + libc.src.math.nexttowardf + libc.src.math.nexttowardl + libc.src.math.powf + libc.src.math.remainder + libc.src.math.remainderf + libc.src.math.remainderl + libc.src.math.remquo + libc.src.math.remquof + libc.src.math.remquol + libc.src.math.rint + libc.src.math.rintf + libc.src.math.rintl + libc.src.math.round + libc.src.math.roundf + libc.src.math.roundl + libc.src.math.scalbn + libc.src.math.scalbnf + libc.src.math.scalbnl + libc.src.math.sincosf + libc.src.math.sinf + libc.src.math.sinhf + libc.src.math.sqrt + libc.src.math.sqrtf + libc.src.math.sqrtl + 
libc.src.math.tanf + libc.src.math.tanhf + libc.src.math.trunc + libc.src.math.truncf + libc.src.math.truncl ) set(TARGET_LLVMLIBC_ENTRYPOINTS diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 13cd1dafe01f5..49593dfe01c81 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -108,171 +108,171 @@ Basic Operations | +---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | | x86_64 | aarch64 | aarch32 | riscv64 | x86_64 | aarch64 | x86_64 | aarch64 | aarch32 | riscv32 | AMD | nVidia | +==============+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+ -| ceil | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| ceil | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ceilf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| ceilf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ceill | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| ceill | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| copysign | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| copysign | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| copysignf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| copysignf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| copysignl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| copysignl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | copysignf128 | |check| | |check| | | | | | | | | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fabs | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fabs | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fabsf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fabsf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | 
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fabsl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fabsl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | fabsf128 | |check| | |check| | | | | | | | | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fdim | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fdim | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fdimf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fdimf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fdiml | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fdiml | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| floor | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| floor | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| floorf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| floorf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| floorl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| floorl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fmax | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fmax | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fmaxf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fmaxf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fmaxl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fmaxl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | 
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fmin | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fmin | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fminf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fminf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fminl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fminl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fmod | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fmod | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fmodf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fmodf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | fmodl | | | | | | | | | | | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| frexp | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| frexp | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| frexpf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| frexpf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| frexpl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| frexpl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ilogb | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| ilogb | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ilogbf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| ilogbf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | 
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ilogbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| ilogbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ldexp | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| ldexp | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ldexpf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| ldexpf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| ldexpl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| ldexpl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llrint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| llrint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llrintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| llrintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llrintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| llrintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llround | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| llround | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llroundf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| llroundf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| llroundl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| llroundl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| logb | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| logb | |check| | 
|check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| logbf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| logbf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| logbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| logbl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lrint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| lrint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lrintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| lrintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lrintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| lrintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lround | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| lround | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lroundf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| lroundf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| lroundl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| lroundl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| modf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| modf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| modff | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| modff | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| modfl | |check| | |check| | |check| | |check| | 
|check| | | | |check| | |check| | | | | +| modfl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nan | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nan | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nanf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nanf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nanl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nanl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nearbyint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nearbyint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nearbyintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nearbyintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nearbyintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nearbyintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nextafter | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nextafter | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nextafterf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nextafterf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nextafterl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nextafterl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nexttoward | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nexttoward | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | 
+--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nexttowardf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nexttowardf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| nexttowardl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| nexttowardl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remainder | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| remainder | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remainderf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| remainderf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remainderl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| remainderl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remquo | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| remquo | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remquof | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| remquof | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| remquol | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| remquol | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| rint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| rint | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| rintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| rintf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| rintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | 
| | +| rintl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| round | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| round | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| roundf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| roundf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| roundl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| roundl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| scalbn | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| scalbn | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| scalbnf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| scalbnf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| scalbnl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| scalbnl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| trunc | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| trunc | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| truncf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| truncf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| truncl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| truncl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +--------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -286,31 +286,31 @@ Higher Math Functions +============+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+=========+ | acos | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| 
acosf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| acosf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | acosl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | acosh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| acoshf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| acoshf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | acoshl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | asin | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| asinf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| asinf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | asinl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | asinh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| asinhf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| asinhf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | asinhl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | atan | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| atanf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| atanf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | atanl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -322,7 +322,7 @@ Higher Math Functions +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | atanh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| atanhf | |check| | |check| | |check| | |check| | |check| | 
| | |check| | |check| | | | | +| atanhf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | atanhl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -334,19 +334,19 @@ Higher Math Functions +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | cos | |check| | | | | |check| | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| cosf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| cosf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | cosl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | cosh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| coshf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| coshf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | coshl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | erf | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| erff | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| erff | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | erfl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -356,39 +356,39 @@ Higher Math Functions +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | erfcl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| exp | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| exp | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| expf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| expf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | 
+------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | expl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| exp10 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| exp10 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| exp10f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| exp10f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | exp10l | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| exp2 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| exp2 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| exp2f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| exp2f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | exp2l | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| expm1 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| expm1 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| expm1f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| expm1f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | expm1l | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fma | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fma | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| fmaf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| fmaf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | fmal | | | | | | | | | | | | | 
+------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| hypot | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| hypot | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| hypotf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| hypotf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | hypotl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ @@ -398,69 +398,69 @@ Higher Math Functions +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | lgammal | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| log | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| logf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| logf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | logl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log10 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| log10 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log10f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| log10f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | log10l | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log1p | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| log1p | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log1pf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| log1pf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | 
+------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | log1pl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log2 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| log2 | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| log2f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| log2f | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | log2l | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | pow | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| powf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| powf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | powl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sin | |check| | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sinf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| sinf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sinl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sincos | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sincosf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| sincosf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sincosl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sinh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sinhf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| sinhf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | 
+------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sinhl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sqrt | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| sqrt | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sqrtf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| sqrtf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| sqrtl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| sqrtl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | tan | |check| | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| tanf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| tanf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | tanl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | tanh | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ -| tanhf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | | | | +| tanhf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | tanhl | | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ From 4beb7237103a66e90b90a43b1ca0c77bca872d5b Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Mon, 22 Jan 2024 21:47:31 -0500 Subject: [PATCH 536/843] [libc] implement sys/getauxval (#78493) This PR implements `sys/getauxval` that can be used in both overlay builds and full builds. 
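As a usage sketch (illustrative only, not part of this patch), the entrypoint
follows the usual `<sys/auxv.h>` contract:

```c++
#include <stdio.h>
#include <sys/auxv.h>

int main() {
  // AT_PAGESZ is a standard auxiliary-vector tag under which the kernel
  // reports the system page size.
  unsigned long pagesz = getauxval(AT_PAGESZ);
  if (pagesz == 0)
    return 1; // tag not found; this implementation sets errno to ENOENT
  printf("page size: %lu\n", pagesz);
  return 0;
}
```

In a full build the lookup is served from the auxv saved at startup in
`app.auxv_ptr`; in an overlay build it falls back to `PR_GET_AUXV` or to
reading `/proc/self/auxv`.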
--- libc/config/linux/aarch64/entrypoints.txt | 3 + libc/config/linux/app.h | 2 +- libc/config/linux/arm/entrypoints.txt | 3 + libc/config/linux/riscv/entrypoints.txt | 3 + libc/config/linux/x86_64/entrypoints.txt | 3 + libc/src/sys/CMakeLists.txt | 1 + libc/src/sys/auxv/CMakeLists.txt | 10 + libc/src/sys/auxv/getauxval.h | 20 ++ libc/src/sys/auxv/linux/CMakeLists.txt | 18 ++ libc/src/sys/auxv/linux/getauxval.cpp | 217 ++++++++++++++++++ libc/test/src/sys/CMakeLists.txt | 1 + libc/test/src/sys/auxv/CMakeLists.txt | 3 + libc/test/src/sys/auxv/linux/CMakeLists.txt | 14 ++ .../src/sys/auxv/linux/getauxval_test.cpp | 28 +++ 14 files changed, 325 insertions(+), 1 deletion(-) create mode 100644 libc/src/sys/auxv/CMakeLists.txt create mode 100644 libc/src/sys/auxv/getauxval.h create mode 100644 libc/src/sys/auxv/linux/CMakeLists.txt create mode 100644 libc/src/sys/auxv/linux/getauxval.cpp create mode 100644 libc/test/src/sys/auxv/CMakeLists.txt create mode 100644 libc/test/src/sys/auxv/linux/CMakeLists.txt create mode 100644 libc/test/src/sys/auxv/linux/getauxval_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 625fa6bffe63c..3f66a582f5e3e 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -168,6 +168,9 @@ set(TARGET_LIBC_ENTRYPOINTS # sys/prctl.h entrypoints libc.src.sys.prctl.prctl + # sys/auxv.h entrypoints + libc.src.sys.auxv.getauxval + # termios.h entrypoints libc.src.termios.cfgetispeed libc.src.termios.cfgetospeed diff --git a/libc/config/linux/app.h b/libc/config/linux/app.h index 1b3523deb1b23..766cd49e88f6f 100644 --- a/libc/config/linux/app.h +++ b/libc/config/linux/app.h @@ -93,7 +93,7 @@ struct AppProperties { AuxEntry *auxv_ptr; }; -extern AppProperties app; +[[gnu::weak]] extern AppProperties app; // The descriptor of a thread's TLS area. 
struct TLSDescriptor { diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt index c75ac2302d4ac..301870d337ca0 100644 --- a/libc/config/linux/arm/entrypoints.txt +++ b/libc/config/linux/arm/entrypoints.txt @@ -95,6 +95,9 @@ set(TARGET_LIBC_ENTRYPOINTS # sys/prctl.h entrypoints libc.src.sys.prctl.prctl + + # sys/auxv.h entrypoints + libc.src.sys.auxv.getauxval ) set(TARGET_LIBM_ENTRYPOINTS diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt index ec2a16f5cf473..0331ef782cf74 100644 --- a/libc/config/linux/riscv/entrypoints.txt +++ b/libc/config/linux/riscv/entrypoints.txt @@ -174,6 +174,9 @@ set(TARGET_LIBC_ENTRYPOINTS # sys/prctl.h entrypoints libc.src.sys.prctl.prctl + # sys/auxv.h entrypoints + libc.src.sys.auxv.getauxval + # termios.h entrypoints libc.src.termios.cfgetispeed libc.src.termios.cfgetospeed diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 094bdde2e1589..d5ab891674a2d 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -174,6 +174,9 @@ set(TARGET_LIBC_ENTRYPOINTS # sys/prctl.h entrypoints libc.src.sys.prctl.prctl + # sys/auxv.h entrypoints + libc.src.sys.auxv.getauxval + # termios.h entrypoints libc.src.termios.cfgetispeed libc.src.termios.cfgetospeed diff --git a/libc/src/sys/CMakeLists.txt b/libc/src/sys/CMakeLists.txt index 12e2020f013ab..81098294176ad 100644 --- a/libc/src/sys/CMakeLists.txt +++ b/libc/src/sys/CMakeLists.txt @@ -1,3 +1,4 @@ +add_subdirectory(auxv) add_subdirectory(mman) add_subdirectory(random) add_subdirectory(resource) diff --git a/libc/src/sys/auxv/CMakeLists.txt b/libc/src/sys/auxv/CMakeLists.txt new file mode 100644 index 0000000000000..4065761064b12 --- /dev/null +++ b/libc/src/sys/auxv/CMakeLists.txt @@ -0,0 +1,10 @@ +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) + add_subdirectory(${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS}) +endif() + +add_entrypoint_object( + getauxval + ALIAS + DEPENDS + .${LIBC_TARGET_OS}.getauxval +) diff --git a/libc/src/sys/auxv/getauxval.h b/libc/src/sys/auxv/getauxval.h new file mode 100644 index 0000000000000..7c9fb846e9198 --- /dev/null +++ b/libc/src/sys/auxv/getauxval.h @@ -0,0 +1,20 @@ +//===-- Implementation header for getauxval function ------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SYS_AUXV_GETAUXVAL_H
+#define LLVM_LIBC_SRC_SYS_AUXV_GETAUXVAL_H
+
+#include <sys/auxv.h>
+
+namespace LIBC_NAMESPACE {
+
+unsigned long getauxval(unsigned long id);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_SYS_AUXV_GETAUXVAL_H
diff --git a/libc/src/sys/auxv/linux/CMakeLists.txt b/libc/src/sys/auxv/linux/CMakeLists.txt
new file mode 100644
index 0000000000000..b38d63ee0329c
--- /dev/null
+++ b/libc/src/sys/auxv/linux/CMakeLists.txt
@@ -0,0 +1,18 @@
+add_entrypoint_object(
+  getauxval
+  SRCS
+    getauxval.cpp
+  HDRS
+    ../getauxval.h
+  DEPENDS
+    libc.src.sys.prctl.prctl
+    libc.src.sys.mman.mmap
+    libc.src.sys.mman.munmap
+    libc.src.__support.threads.callonce
+    libc.src.__support.common
+    libc.src.errno.errno
+    libc.config.linux.app_h
+    libc.src.fcntl.open
+    libc.src.unistd.read
+    libc.src.unistd.close
+)
diff --git a/libc/src/sys/auxv/linux/getauxval.cpp b/libc/src/sys/auxv/linux/getauxval.cpp
new file mode 100644
index 0000000000000..b0db36732c06e
--- /dev/null
+++ b/libc/src/sys/auxv/linux/getauxval.cpp
@@ -0,0 +1,217 @@
+//===-- Implementation file for getauxval function --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/sys/auxv/getauxval.h"
+#include "config/linux/app.h"
+#include "src/__support/common.h"
+#include "src/errno/libc_errno.h"
+#include <linux/auxvec.h>
+
+// for guarded initialization
+#include "src/__support/threads/callonce.h"
+#include "src/__support/threads/linux/futex_word.h"
+
+// for allocating the global auxv via mmap
+#include "src/sys/mman/mmap.h"
+#include "src/sys/mman/munmap.h"
+
+// for reading /proc/self/auxv
+#include "src/fcntl/open.h"
+#include "src/sys/prctl/prctl.h"
+#include "src/unistd/close.h"
+#include "src/unistd/read.h"
+
+// getauxval will work either with or without __cxa_atexit support.
+// In order to detect if __cxa_atexit is supported, we define a weak symbol.
+// We prefer __cxa_atexit as it is always defined as a C symbol, whereas atexit
+// may not be created via objcopy yet. Also, for glibc, atexit is provided via
+// libc_nonshared.a rather than libc.so, so it may not be ready for overlay
+// builds.
+extern "C" [[gnu::weak]] int __cxa_atexit(void (*callback)(void *),
+                                          void *payload, void *);
+
+namespace LIBC_NAMESPACE {
+
+constexpr static size_t MAX_AUXV_ENTRIES = 64;
+
+// Helper to recover or set errno
+class AuxvErrnoGuard {
+public:
+  AuxvErrnoGuard() : saved(libc_errno), failure(false) {}
+  ~AuxvErrnoGuard() { libc_errno = failure ? ENOENT : saved; }
+  void mark_failure() { failure = true; }
+
+private:
+  int saved;
+  bool failure;
+};
+
+// Helper to manage the memory
+static AuxEntry *auxv = nullptr;
+
+class AuxvMMapGuard {
+public:
+  constexpr static size_t AUXV_MMAP_SIZE = sizeof(AuxEntry) * MAX_AUXV_ENTRIES;
+
+  AuxvMMapGuard()
+      : ptr(mmap(nullptr, AUXV_MMAP_SIZE, PROT_READ | PROT_WRITE,
+                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0)) {}
+  ~AuxvMMapGuard() {
+    if (ptr != MAP_FAILED)
+      munmap(ptr, AUXV_MMAP_SIZE);
+  }
+  void submit_to_global() {
+    // atexit may fail; we do not set the global in that case.
+    int ret = __cxa_atexit(
+        [](void *) {
+          munmap(auxv, AUXV_MMAP_SIZE);
+          auxv = nullptr;
+        },
+        nullptr, nullptr);
+
+    if (ret != 0)
+      return;
+
+    auxv = reinterpret_cast<AuxEntry *>(ptr);
+    ptr = MAP_FAILED;
+  }
+  bool allocated() const { return ptr != MAP_FAILED; }
+  void *get() const { return ptr; }
+
+private:
+  void *ptr;
+};
+
+class AuxvFdGuard {
+public:
+  AuxvFdGuard() : fd(open("/proc/self/auxv", O_RDONLY | O_CLOEXEC)) {}
+  ~AuxvFdGuard() {
+    if (fd != -1)
+      close(fd);
+  }
+  bool valid() const { return fd != -1; }
+  int get() const { return fd; }
+
+private:
+  int fd;
+};
+
+static void initialize_auxv_once(void) {
+  // If we cannot get atexit, we cannot register the cleanup function.
+  if (&__cxa_atexit == nullptr)
+    return;
+
+  AuxvMMapGuard mmap_guard;
+  if (!mmap_guard.allocated())
+    return;
+  auto *ptr = reinterpret_cast<AuxEntry *>(mmap_guard.get());
+
+  // We get one less than the max size to make sure the search always
+  // terminates. MMAP private pages are zeroed out already.
+  size_t available_size = AuxvMMapGuard::AUXV_MMAP_SIZE - sizeof(AuxEntryType);
+  // PR_GET_AUXV is only available on Linux kernel 6.1 and above. If this is
+  // not defined, we directly fall back to reading /proc/self/auxv. In case the
+  // libc is compiled and run on separate kernels, we also check the return
+  // value of prctl.
+#ifdef PR_GET_AUXV
+  int ret = prctl(PR_GET_AUXV, reinterpret_cast<unsigned long>(ptr),
+                  available_size, 0, 0);
+  if (ret >= 0) {
+    mmap_guard.submit_to_global();
+    return;
+  }
+#endif
+  AuxvFdGuard fd_guard;
+  if (!fd_guard.valid())
+    return;
+  auto *buf = reinterpret_cast<uint8_t *>(ptr);
+  libc_errno = 0;
+  bool error_detected = false;
+  // Read until we use up all the available space or we finish reading the
+  // file.
+  while (available_size != 0) {
+    ssize_t bytes_read = read(fd_guard.get(), buf, available_size);
+    if (bytes_read <= 0) {
+      if (libc_errno == EINTR)
+        continue;
+      // Now, we either have a non-recoverable error or we have reached the end
+      // of the file. Mark `error_detected` accordingly.
+      if (bytes_read == -1)
+        error_detected = true;
+      break;
+    }
+    buf += bytes_read;
+    available_size -= bytes_read;
+  }
+  // If we get out of the loop without an error, the auxv is ready.
+  if (!error_detected)
+    mmap_guard.submit_to_global();
+}
+
+static AuxEntry read_entry(int fd) {
+  AuxEntry buf;
+  ssize_t size = sizeof(AuxEntry);
+  char *ptr = reinterpret_cast<char *>(&buf);
+  while (size > 0) {
+    ssize_t ret = read(fd, ptr, size);
+    if (ret < 0) {
+      if (libc_errno == EINTR)
+        continue;
+      // Error detected; return AT_NULL
+      buf.id = AT_NULL;
+      buf.value = AT_NULL;
+      break;
+    }
+    ptr += ret;
+    size -= ret;
+  }
+  return buf;
+}
+
+LLVM_LIBC_FUNCTION(unsigned long, getauxval, (unsigned long id)) {
+  // Fast path when libc is loaded by its own initialization code. In this
+  // case, app.auxv_ptr is already set to the auxv passed on the initial stack
+  // of the process.
+  AuxvErrnoGuard errno_guard;
+
+  auto search_auxv = [&errno_guard](AuxEntry *auxv,
+                                    unsigned long id) -> AuxEntryType {
+    for (auto *ptr = auxv; ptr->id != AT_NULL; ptr++)
+      if (ptr->id == id)
+        return ptr->value;
+
+    errno_guard.mark_failure();
+    return AT_NULL;
+  };
+
+  // App is a weak symbol that is only defined if libc is linked to its own
+  // initialization routine. We need to check if it is null.
+  if (&app != nullptr)
+    return search_auxv(app.auxv_ptr, id);
+
+  static FutexWordType once_flag;
+  callonce(reinterpret_cast<CallOnceFlag *>(&once_flag), initialize_auxv_once);
+  if (auxv != nullptr)
+    return search_auxv(auxv, id);
+
+  // Fall back to using read without mmap
+  AuxvFdGuard fd_guard;
+  if (fd_guard.valid()) {
+    while (true) {
+      AuxEntry buf = read_entry(fd_guard.get());
+      if (buf.id == AT_NULL)
+        break;
+      if (buf.id == id)
+        return buf.value;
+    }
+  }
+
+  // The entry could not be found by any method; mark failure and return 0.
+  errno_guard.mark_failure();
+  return AT_NULL;
+}
+} // namespace LIBC_NAMESPACE
diff --git a/libc/test/src/sys/CMakeLists.txt b/libc/test/src/sys/CMakeLists.txt
index a87e77da7d2cd..c7095456383b3 100644
--- a/libc/test/src/sys/CMakeLists.txt
+++ b/libc/test/src/sys/CMakeLists.txt
@@ -8,3 +8,4 @@ add_subdirectory(stat)
 add_subdirectory(utsname)
 add_subdirectory(wait)
 add_subdirectory(prctl)
+add_subdirectory(auxv)
diff --git a/libc/test/src/sys/auxv/CMakeLists.txt b/libc/test/src/sys/auxv/CMakeLists.txt
new file mode 100644
index 0000000000000..b4bbe81c92ff2
--- /dev/null
+++ b/libc/test/src/sys/auxv/CMakeLists.txt
@@ -0,0 +1,3 @@
+if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${LIBC_TARGET_OS})
+  add_subdirectory(${LIBC_TARGET_OS})
+endif()
diff --git a/libc/test/src/sys/auxv/linux/CMakeLists.txt b/libc/test/src/sys/auxv/linux/CMakeLists.txt
new file mode 100644
index 0000000000000..c1e82a1f0a46c
--- /dev/null
+++ b/libc/test/src/sys/auxv/linux/CMakeLists.txt
@@ -0,0 +1,14 @@
+add_custom_target(libc_sys_auxv_unittests)
+add_libc_unittest(
+  getauxval_test
+  SUITE
+    libc_sys_auxv_unittests
+  SRCS
+    getauxval_test.cpp
+  DEPENDS
+    libc.include.sys_auxv
+    libc.src.errno.errno
+    libc.src.sys.auxv.getauxval
+    libc.test.UnitTest.ErrnoSetterMatcher
+    libc.src.string.strstr
+)
diff --git a/libc/test/src/sys/auxv/linux/getauxval_test.cpp b/libc/test/src/sys/auxv/linux/getauxval_test.cpp
new file mode 100644
index 0000000000000..8811fd8dfbc3a
--- /dev/null
+++ b/libc/test/src/sys/auxv/linux/getauxval_test.cpp
@@ -0,0 +1,28 @@
+//===-- Unittests for getauxval -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#include "src/errno/libc_errno.h"
+#include "src/sys/auxv/getauxval.h"
+#include "test/UnitTest/ErrnoSetterMatcher.h"
+#include "test/UnitTest/Test.h"
+#include <src/string/strstr.h>
+#include <sys/auxv.h>
+
+using namespace LIBC_NAMESPACE::testing::ErrnoSetterMatcher;
+
+TEST(LlvmLibcGetauxvalTest, Basic) {
+  EXPECT_THAT(LIBC_NAMESPACE::getauxval(AT_PAGESZ),
+              returns(GT(0ul)).with_errno(EQ(0)));
+  const char *filename;
+  auto getfilename = [&filename]() {
+    auto value = LIBC_NAMESPACE::getauxval(AT_EXECFN);
+    filename = reinterpret_cast<const char *>(value);
+    return value;
+  };
+  EXPECT_THAT(getfilename(), returns(NE(0ul)).with_errno(EQ(0)));
+  ASSERT_TRUE(LIBC_NAMESPACE::strstr(filename, "getauxval_test") != nullptr);
+}
From 3b171cb968b3f8495b096139fc57ff6263727e40 Mon Sep 17 00:00:00 2001
From: Owen Pan
Date: Mon, 22 Jan 2024 18:59:08 -0800
Subject: [PATCH 537/843] [clang-format] Fix a bug in ContinuationIndenter (#78921)

Fixes #76991.
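The regression is easiest to see with the new test case added below: with
`AlignArrayOfStructures: Left` and `AlignEscapedNewlines: DontAlign`, a braced
array initializer inside a macro body should format as

```c++
#define FOO \
  int foo[][2] = { \
      {0, 1} \
  };
```

The fix threads `State.Line->InMacroBody` through to
`WhitespaceManager::replaceWhitespace` so the whitespace replacement knows the
token is inside a macro body, and renames the `isAligned` parameter to
`IsAligned` to match the surrounding naming convention.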
---
 clang/lib/Format/ContinuationIndenter.cpp | 3 ++-
 clang/lib/Format/WhitespaceManager.h      | 2 +-
 clang/unittests/Format/FormatTest.cpp     | 7 +++++++
 3 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp
index ddbee15cb159f..a099813c9100b 100644
--- a/clang/lib/Format/ContinuationIndenter.cpp
+++ b/clang/lib/Format/ContinuationIndenter.cpp
@@ -740,7 +740,8 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun,

   if (!DryRun) {
     Whitespaces.replaceWhitespace(Current, /*Newlines=*/0, Spaces,
-                                  State.Column + Spaces + PPColumnCorrection);
+                                  State.Column + Spaces + PPColumnCorrection,
+                                  /*IsAligned=*/false, State.Line->InMacroBody);
   }

   // If "BreakBeforeInheritanceComma" mode, don't break within the inheritance
diff --git a/clang/lib/Format/WhitespaceManager.h b/clang/lib/Format/WhitespaceManager.h
index dc6f60e5deeed..8ac73305871ae 100644
--- a/clang/lib/Format/WhitespaceManager.h
+++ b/clang/lib/Format/WhitespaceManager.h
@@ -55,7 +55,7 @@ class WhitespaceManager {
   /// this replacement. It is needed for determining how \p Spaces is turned
   /// into tabs and spaces for some format styles.
   void replaceWhitespace(FormatToken &Tok, unsigned Newlines, unsigned Spaces,
-                         unsigned StartOfTokenColumn, bool isAligned = false,
+                         unsigned StartOfTokenColumn, bool IsAligned = false,
                          bool InPPDirective = false);

   /// Adds information about an unchangeable token's whitespace.
diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp
index a0825dc2ad4ef..fa3bea5ae5d75 100644
--- a/clang/unittests/Format/FormatTest.cpp
+++ b/clang/unittests/Format/FormatTest.cpp
@@ -21215,6 +21215,13 @@ TEST_F(FormatTest, CatchAlignArrayOfStructuresLeftAlignment) {
                "});",
                Style);

+  Style.AlignEscapedNewlines = FormatStyle::ENAS_DontAlign;
+  verifyFormat("#define FOO \\\n"
+               "  int foo[][2] = { \\\n"
+               "      {0, 1} \\\n"
+               "  };",
+               Style);
+
   Style.Cpp11BracedListStyle = false;
   verifyFormat("struct test demo[] = {\n"
               "    { 56, 23, \"hello\" },\n"
From 7e63940f69d99c12ccc18c76e1fc6b861fd459ec Mon Sep 17 00:00:00 2001
From: Shih-Po Hung
Date: Tue, 23 Jan 2024 11:00:19 +0800
Subject: [PATCH 538/843] [RISCV][CostModel] Make VMV_S_X and VMV_X_S cost independent of LMUL (#78739)

Following #77963, instructions like VMV_S_X/VMV_X_S handle a single
element, so the cost doesn't scale with LMUL.

---
 llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp | 11 +++++++----
 llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll  |  4 ++--
 .../Analysis/CostModel/RISCV/shuffle-broadcast.ll  | 12 ++++++------
 3 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 4ea3a51930899..866d5cf340e68 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -84,6 +84,10 @@ RISCVTTIImpl::getRISCVInstructionCost(ArrayRef<unsigned> OpCodes, MVT VT,
       Cost += VL;
       break;
     }
+    case RISCV::VMV_X_S:
+    case RISCV::VMV_S_X:
+      Cost += 1;
+      break;
     default:
       Cost += LMULCost;
     }
@@ -446,7 +450,7 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
     // should be a very small constant for the constant pool load. As such,
    // we may bias towards large selects slightly more than truly warranted.
return LT.first * - (2 + getRISCVInstructionCost({RISCV::VMERGE_VVM}, + (1 + getRISCVInstructionCost({RISCV::VMV_S_X, RISCV::VMERGE_VVM}, LT.second, CostKind)); } case TTI::SK_Broadcast: { @@ -475,10 +479,9 @@ InstructionCost RISCVTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, return LT.first * (TLI->getLMULCost(LT.second) + // FIXME: this should be 1 for andi - TLI->getLMULCost( - LT.second) + // FIXME: vmv.x.s is the same as extractelement getRISCVInstructionCost({RISCV::VMV_V_I, RISCV::VMERGE_VIM, - RISCV::VMV_V_X, RISCV::VMSNE_VI}, + RISCV::VMV_X_S, RISCV::VMV_V_X, + RISCV::VMSNE_VI}, LT.second, CostKind)); } diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll index bd9f6af89a5cd..4f3c7e2f90c65 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-shuffle.ll @@ -14,7 +14,7 @@ define void @vector_broadcast() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %5 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %7 = shufflevector undef, undef, zeroinitializer -; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %8 = shufflevector undef, undef, zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %8 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %9 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %10 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %11 = shufflevector undef, undef, zeroinitializer @@ -29,7 +29,7 @@ define void @vector_broadcast() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = shufflevector undef, undef, zeroinitializer -; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %8 = shufflevector undef, undef, zeroinitializer +; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %8 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %9 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %10 = shufflevector undef, undef, zeroinitializer ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %11 = shufflevector undef, undef, zeroinitializer diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll index 432b90d9305af..fc4a6b17d3f82 100644 --- a/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-broadcast.ll @@ -45,9 +45,9 @@ define void @broadcast_scalable() #0{ ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %38 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %39 = shufflevector undef, undef, zeroinitializer ; CHECK-NEXT: Cost Model: 
Found an estimated cost of 6 for instruction: %40 = shufflevector undef, undef, zeroinitializer
-; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %41 = shufflevector undef, undef, zeroinitializer
-; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %42 = shufflevector undef, undef, zeroinitializer
-; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %43 = shufflevector undef, undef, zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %41 = shufflevector undef, undef, zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 21 for instruction: %42 = shufflevector undef, undef, zeroinitializer
+; CHECK-NEXT: Cost Model: Found an estimated cost of 41 for instruction: %43 = shufflevector undef, undef, zeroinitializer
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'broadcast_scalable'
@@ -92,9 +92,9 @@ define void @broadcast_scalable() #0{
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %38 = shufflevector undef, undef, zeroinitializer
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %39 = shufflevector undef, undef, zeroinitializer
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %40 = shufflevector undef, undef, zeroinitializer
-; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %41 = shufflevector undef, undef, zeroinitializer
-; SIZE-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %42 = shufflevector undef, undef, zeroinitializer
-; SIZE-NEXT: Cost Model: Found an estimated cost of 20 for instruction: %43 = shufflevector undef, undef, zeroinitializer
+; SIZE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %41 = shufflevector undef, undef, zeroinitializer
+; SIZE-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %42 = shufflevector undef, undef, zeroinitializer
+; SIZE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %43 = shufflevector undef, undef, zeroinitializer
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %zero = shufflevector undef, undef, zeroinitializer

From 4ef646eab3023b52ce8ee529d16605c92caba8ba Mon Sep 17 00:00:00 2001
From: Gedare Bloom
Date: Mon, 22 Jan 2024 20:01:16 -0700
Subject: [PATCH 539/843] [clang-format]: Fix formatting of if statements with
 BlockIndent (#77699)

A bug with BlockIndent prevented line breaks within if (and else if)
clauses. While fixing this bug, it became apparent that
AlignAfterOpenBracket is not designed to work with loop and if
statements, although AlwaysBreak does work on if clauses. The
documentation and tests are not clear on whether this behavior is
intended. This PR preserves the `AlwaysBreak` behavior on `if` clauses
without extending `BlockIndent` to `if` clauses, to avoid regressions
while fixing the bug; the resulting behavior is sketched below. It may
be reasonable to add an explicit option for the alignment of if (and
loop) clauses for both `AlwaysBreak` and `BlockIndent`.

Fixes #54663.

Migrated from Differential Revision: https://reviews.llvm.org/D154755.
See more discussion there. This also addresses the last open review
comment about refactoring the complex conditional logic involved in the
`AlignAfterOpenBracket` line break behavior.
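To make the fixed behavior concrete, here is a minimal sketch adapted from
the new regression tests (the identifiers are illustrative only). With
`AlignAfterOpenBracket: BlockIndent`, line breaks are now permitted inside an
`if` clause, with continuation lines aligned to the column after the opening
parenthesis:

  // .clang-format: AlignAfterOpenBracket: BlockIndent
  if (camelCaseName < alsoLongName ||
      anotherEvenLongerName <= thisReallyReallyReallyLongName ||
      otherName < thisLastName) {
    return;
  }

The `AlwaysBreak`-specific behavior on `if` clauses (forcing a break
immediately after the opening parenthesis) is deliberately not extended to
`BlockIndent` here, to avoid regressions.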
--- clang/lib/Format/ContinuationIndenter.cpp | 24 ++++++++++---- clang/unittests/Format/FormatTest.cpp | 40 ++++++++++++++++++++--- 2 files changed, 53 insertions(+), 11 deletions(-) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index a099813c9100b..a3eb9138b2183 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -771,15 +771,25 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, // parenthesis by disallowing any further line breaks if there is no line // break after the opening parenthesis. Don't break if it doesn't conserve // columns. + auto IsOpeningBracket = [&](const FormatToken &Tok) { + auto IsStartOfBracedList = [&]() { + return Tok.is(tok::l_brace) && Tok.isNot(BK_Block) && + Style.Cpp11BracedListStyle; + }; + if (!Tok.isOneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) && + !IsStartOfBracedList()) { + return false; + } + if (!Tok.Previous) + return true; + if (Tok.Previous->isIf()) + return Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak; + return !Tok.Previous->isOneOf(TT_CastRParen, tok::kw_for, tok::kw_while, + tok::kw_switch); + }; if ((Style.AlignAfterOpenBracket == FormatStyle::BAS_AlwaysBreak || Style.AlignAfterOpenBracket == FormatStyle::BAS_BlockIndent) && - (Previous.isOneOf(tok::l_paren, TT_TemplateOpener, tok::l_square) || - (Previous.is(tok::l_brace) && Previous.isNot(BK_Block) && - Style.Cpp11BracedListStyle)) && - State.Column > getNewLineColumn(State) && - (!Previous.Previous || - !Previous.Previous->isOneOf(TT_CastRParen, tok::kw_for, tok::kw_while, - tok::kw_switch)) && + IsOpeningBracket(Previous) && State.Column > getNewLineColumn(State) && // Don't do this for simple (no expressions) one-argument function calls // as that feels like needlessly wasting whitespace, e.g.: // diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index fa3bea5ae5d75..e5e763edf5b5b 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -26172,8 +26172,8 @@ TEST_F(FormatTest, AlignAfterOpenBracketBlockIndentIfStatement) { "}", Style); - verifyFormat("if (quitelongarg !=\n" - " (alsolongarg - 1)) { // ABC is a very longgggggggggggg " + verifyFormat("if (quiteLongArg !=\n" + " (alsoLongArg - 1)) { // ABC is a very longgggggggggggg " "comment\n" " return;\n" "}", @@ -26186,12 +26186,44 @@ TEST_F(FormatTest, AlignAfterOpenBracketBlockIndentIfStatement) { "}", Style); - verifyFormat("if (quitelongarg !=\n" - " (alsolongarg - 1)) { // ABC is a very longgggggggggggg " + verifyFormat("if (quiteLongArg !=\n" + " (alsoLongArg - 1)) { // ABC is a very longgggggggggggg " "comment\n" " return;\n" "}", Style); + + verifyFormat("void foo() {\n" + " if (camelCaseName < alsoLongName ||\n" + " anotherEvenLongerName <=\n" + " thisReallyReallyReallyReallyReallyReallyLongerName ||" + "\n" + " otherName < thisLastName) {\n" + " return;\n" + " } else if (quiteLongName < alsoLongName ||\n" + " anotherEvenLongerName <=\n" + " thisReallyReallyReallyReallyReallyReallyLonger" + "Name ||\n" + " otherName < thisLastName) {\n" + " return;\n" + " }\n" + "}", + Style); + + Style.ContinuationIndentWidth = 2; + verifyFormat("void foo() {\n" + " if (ThisIsRatherALongIfClause && thatIExpectToBeBroken ||\n" + " ontoMultipleLines && whenFormattedCorrectly) {\n" + " if (false) {\n" + " return;\n" + " } else if (thisIsRatherALongIfClause && " + "thatIExpectToBeBroken ||\n" + " 
ontoMultipleLines && whenFormattedCorrectly) {\n"
+               "      return;\n"
+               "    }\n"
+               "  }\n"
+               "}",
+               Style);
 }
 
 TEST_F(FormatTest, AlignAfterOpenBracketBlockIndentForStatement) {

From 097a40ac899d0b88b4b62b85e6faa43802e87010 Mon Sep 17 00:00:00 2001
From: Chen Zheng
Date: Mon, 22 Jan 2024 21:27:38 -0500
Subject: [PATCH 540/843] [NFC] Add test cases for PowerPC vector instruction
 cost analysis

---
 .../CostModel/PowerPC/insert_extract.ll | 235 ++++++++++++++++++
 1 file changed, 235 insertions(+)

diff --git a/llvm/test/Analysis/CostModel/PowerPC/insert_extract.ll b/llvm/test/Analysis/CostModel/PowerPC/insert_extract.ll
index 11ec7d355f04a..607d15790b5f3 100644
--- a/llvm/test/Analysis/CostModel/PowerPC/insert_extract.ll
+++ b/llvm/test/Analysis/CostModel/PowerPC/insert_extract.ll
@@ -3,6 +3,7 @@
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr8 -mattr=+vsx | FileCheck --check-prefix=CHECK-P8LE %s
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=powerpc64-unknown-linux-gnu -mcpu=pwr9 -mattr=+vsx | FileCheck --check-prefix=CHECK-P9BE %s
 ; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr9 -mattr=+vsx | FileCheck --check-prefix=CHECK-P9LE %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -mtriple=powerpc64le-unknown-linux-gnu -mcpu=pwr10 -mattr=+vsx | FileCheck --check-prefix=CHECK-P10 %s
 
 target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f128:128:128-v128:128:128-n32:64"
 target triple = "powerpc64-unknown-linux-gnu"
@@ -24,6 +25,10 @@ define i32 @insert(i32 %arg) {
 ; CHECK-P9LE-LABEL: 'insert'
 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %x = insertelement <4 x i32> undef, i32 %arg, i32 0
 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
+;
+; CHECK-P10-LABEL: 'insert'
+; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %x = insertelement <4 x i32> undef, i32 %arg, i32 0
+; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 undef
 ;
   %x = insertelement <4 x i32> undef, i32 %arg, i32 0
   ret i32 undef
@@ -46,6 +51,10 @@ define i32 @extract(<4 x i32> %arg) {
 ; CHECK-P9LE-LABEL: 'extract'
 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %x = extractelement <4 x i32> %arg, i32 0
 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %x
+;
+; CHECK-P10-LABEL: 'extract'
+; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %x = extractelement <4 x i32> %arg, i32 0
+; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %x
 ;
   %x = extractelement <4 x i32> %arg, i32 0
   ret i32 %x
@@ -71,6 +80,11 @@ define void @test2xdouble(<2 x double> %arg1) {
 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v1 = extractelement <2 x double> %arg1, i32 0
 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement <2 x double> %arg1, i32 1
 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; CHECK-P10-LABEL: 'test2xdouble'
+; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v1 = extractelement <2 x double> %arg1, i32 0
+; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %v2 = extractelement <2 x double> %arg1, i32 1
+; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for
instruction: ret void ; %v1 = extractelement <2 x double> %arg1, i32 0 %v2 = extractelement <2 x double> %arg1, i32 1 @@ -93,6 +107,10 @@ define void @test4xi32(<4 x i32> %v1, i32 %x1) { ; CHECK-P9LE-LABEL: 'test4xi32' ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2 = insertelement <4 x i32> %v1, i32 %x1, i32 2 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-P10-LABEL: 'test4xi32' +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2 = insertelement <4 x i32> %v1, i32 %x1, i32 2 +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %v2 = insertelement <4 x i32> %v1, i32 %x1, i32 2 ret void @@ -126,6 +144,13 @@ define void @vexti32(<4 x i32> %p1) { ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i3 = extractelement <4 x i32> %p1, i32 2 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i4 = extractelement <4 x i32> %p1, i32 3 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-P10-LABEL: 'vexti32' +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i1 = extractelement <4 x i32> %p1, i32 0 +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = extractelement <4 x i32> %p1, i32 1 +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i3 = extractelement <4 x i32> %p1, i32 2 +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i4 = extractelement <4 x i32> %p1, i32 3 +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %i1 = extractelement <4 x i32> %p1, i32 0 %i2 = extractelement <4 x i32> %p1, i32 1 @@ -154,6 +179,11 @@ define void @vexti64(<2 x i64> %p1) { ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i1 = extractelement <2 x i64> %p1, i32 0 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = extractelement <2 x i64> %p1, i32 1 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-P10-LABEL: 'vexti64' +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i1 = extractelement <2 x i64> %p1, i32 0 +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = extractelement <2 x i64> %p1, i32 1 +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %i1 = extractelement <2 x i64> %p1, i32 0 %i2 = extractelement <2 x i64> %p1, i32 1 @@ -180,8 +210,213 @@ define void @vext(<8 x i16> %p1, <16 x i8> %p2) { ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i1 = extractelement <8 x i16> %p1, i32 0 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %i2 = extractelement <16 x i8> %p2, i32 0 ; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; CHECK-P10-LABEL: 'vext' +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i1 = extractelement <8 x i16> %p1, i32 0 +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %i2 = extractelement <16 x i8> %p2, i32 0 +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %i1 = extractelement <8 x i16> %p1, i32 0 %i2 = extractelement <16 x i8> %p2, i32 0 ret void } + +define <2 x i64> @insert_i64_x(<2 x i64> %dest, i64 %arg, i32 %idx) { +; 
CHECK-P7-LABEL: 'insert_i64_x' +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <2 x i64> %dest, i64 %arg, i32 %idx +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %x +; +; CHECK-P8LE-LABEL: 'insert_i64_x' +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <2 x i64> %dest, i64 %arg, i32 %idx +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %x +; +; CHECK-P9BE-LABEL: 'insert_i64_x' +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <2 x i64> %dest, i64 %arg, i32 %idx +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %x +; +; CHECK-P9LE-LABEL: 'insert_i64_x' +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <2 x i64> %dest, i64 %arg, i32 %idx +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %x +; +; CHECK-P10-LABEL: 'insert_i64_x' +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <2 x i64> %dest, i64 %arg, i32 %idx +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %x +; + %x = insertelement <2 x i64> %dest, i64 %arg, i32 %idx + ret <2 x i64> %x +} + +define <4 x i32> @insert_i32_x(<4 x i32> %dest, i32 %arg, i32 %idx) { +; CHECK-P7-LABEL: 'insert_i32_x' +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <4 x i32> %dest, i32 %arg, i32 %idx +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %x +; +; CHECK-P8LE-LABEL: 'insert_i32_x' +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <4 x i32> %dest, i32 %arg, i32 %idx +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %x +; +; CHECK-P9BE-LABEL: 'insert_i32_x' +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %x = insertelement <4 x i32> %dest, i32 %arg, i32 %idx +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %x +; +; CHECK-P9LE-LABEL: 'insert_i32_x' +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %x = insertelement <4 x i32> %dest, i32 %arg, i32 %idx +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %x +; +; CHECK-P10-LABEL: 'insert_i32_x' +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <4 x i32> %dest, i32 %arg, i32 %idx +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %x +; + %x = insertelement <4 x i32> %dest, i32 %arg, i32 %idx + ret <4 x i32> %x +} + +define <8 x i16> @insert_i16_x(<8 x i16> %dest, i16 %arg, i32 %idx) { +; CHECK-P7-LABEL: 'insert_i16_x' +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <8 x i16> %dest, i16 %arg, i32 %idx +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %x +; +; CHECK-P8LE-LABEL: 'insert_i16_x' +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <8 x i16> %dest, i16 %arg, i32 %idx +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %x +; +; CHECK-P9BE-LABEL: 'insert_i16_x' +; CHECK-P9BE-NEXT: Cost 
Model: Found an estimated cost of 11 for instruction: %x = insertelement <8 x i16> %dest, i16 %arg, i32 %idx +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %x +; +; CHECK-P9LE-LABEL: 'insert_i16_x' +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %x = insertelement <8 x i16> %dest, i16 %arg, i32 %idx +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %x +; +; CHECK-P10-LABEL: 'insert_i16_x' +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <8 x i16> %dest, i16 %arg, i32 %idx +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %x +; + %x = insertelement <8 x i16> %dest, i16 %arg, i32 %idx + ret <8 x i16> %x +} + +define <16 x i8> @insert_i8_x(<16 x i8> %dest, i8 %arg, i32 %idx) { +; CHECK-P7-LABEL: 'insert_i8_x' +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <16 x i8> %dest, i8 %arg, i32 %idx +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %x +; +; CHECK-P8LE-LABEL: 'insert_i8_x' +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <16 x i8> %dest, i8 %arg, i32 %idx +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %x +; +; CHECK-P9BE-LABEL: 'insert_i8_x' +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %x = insertelement <16 x i8> %dest, i8 %arg, i32 %idx +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %x +; +; CHECK-P9LE-LABEL: 'insert_i8_x' +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 11 for instruction: %x = insertelement <16 x i8> %dest, i8 %arg, i32 %idx +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %x +; +; CHECK-P10-LABEL: 'insert_i8_x' +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %x = insertelement <16 x i8> %dest, i8 %arg, i32 %idx +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %x +; + %x = insertelement <16 x i8> %dest, i8 %arg, i32 %idx + ret <16 x i8> %x +} + +define i64 @extract_i64_x(<2 x i64> %arg, i32 %idx) { +; CHECK-P7-LABEL: 'extract_i64_x' +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <2 x i64> %arg, i32 %idx +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %x +; +; CHECK-P8LE-LABEL: 'extract_i64_x' +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <2 x i64> %arg, i32 %idx +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %x +; +; CHECK-P9BE-LABEL: 'extract_i64_x' +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %x = extractelement <2 x i64> %arg, i32 %idx +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %x +; +; CHECK-P9LE-LABEL: 'extract_i64_x' +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %x = extractelement <2 x i64> %arg, i32 %idx +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i64 %x +; +; CHECK-P10-LABEL: 'extract_i64_x' +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <2 x i64> %arg, i32 %idx +; CHECK-P10-NEXT: Cost Model: Found an 
estimated cost of 0 for instruction: ret i64 %x +; + %x = extractelement <2 x i64> %arg, i32 %idx + ret i64 %x +} + +define i32 @extract_i32_x(<4 x i32> %arg, i32 %idx) { +; CHECK-P7-LABEL: 'extract_i32_x' +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <4 x i32> %arg, i32 %idx +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %x +; +; CHECK-P8LE-LABEL: 'extract_i32_x' +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <4 x i32> %arg, i32 %idx +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %x +; +; CHECK-P9BE-LABEL: 'extract_i32_x' +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %x = extractelement <4 x i32> %arg, i32 %idx +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %x +; +; CHECK-P9LE-LABEL: 'extract_i32_x' +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %x = extractelement <4 x i32> %arg, i32 %idx +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %x +; +; CHECK-P10-LABEL: 'extract_i32_x' +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <4 x i32> %arg, i32 %idx +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i32 %x +; + %x = extractelement <4 x i32> %arg, i32 %idx + ret i32 %x +} + +define i16 @extract_i16_x(<8 x i16> %arg, i32 %idx) { +; CHECK-P7-LABEL: 'extract_i16_x' +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <8 x i16> %arg, i32 %idx +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %x +; +; CHECK-P8LE-LABEL: 'extract_i16_x' +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <8 x i16> %arg, i32 %idx +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %x +; +; CHECK-P9BE-LABEL: 'extract_i16_x' +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %x = extractelement <8 x i16> %arg, i32 %idx +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %x +; +; CHECK-P9LE-LABEL: 'extract_i16_x' +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %x = extractelement <8 x i16> %arg, i32 %idx +; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %x +; +; CHECK-P10-LABEL: 'extract_i16_x' +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <8 x i16> %arg, i32 %idx +; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i16 %x +; + %x = extractelement <8 x i16> %arg, i32 %idx + ret i16 %x +} + +define i8 @extract_i8_x(<16 x i8> %arg, i32 %idx) { +; CHECK-P7-LABEL: 'extract_i8_x' +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <16 x i8> %arg, i32 %idx +; CHECK-P7-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %x +; +; CHECK-P8LE-LABEL: 'extract_i8_x' +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <16 x i8> %arg, i32 %idx +; CHECK-P8LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %x +; +; CHECK-P9BE-LABEL: 'extract_i8_x' +; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %x = extractelement <16 x i8> %arg, 
i32 %idx
+; CHECK-P9BE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %x
+;
+; CHECK-P9LE-LABEL: 'extract_i8_x'
+; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %x = extractelement <16 x i8> %arg, i32 %idx
+; CHECK-P9LE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %x
+;
+; CHECK-P10-LABEL: 'extract_i8_x'
+; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %x = extractelement <16 x i8> %arg, i32 %idx
+; CHECK-P10-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret i8 %x
+;
+  %x = extractelement <16 x i8> %arg, i32 %idx
+  ret i8 %x
+}

From 314526557ec66ee627ae8a1c03f6ccc610668fdb Mon Sep 17 00:00:00 2001
From: Louis Dionne
Date: Mon, 22 Jan 2024 22:33:04 -0500
Subject: [PATCH 541/843] [libc++] Fix the behavior of throwing `operator new`
 under -fno-exceptions (#69498)

In D144319, Clang tried to land a change that would cause some functions
that are not supposed to return nullptr to optimize better. As reported in
https://reviews.llvm.org/D144319#4203982, libc++ started seeing failures
in its CI shortly after this change was landed.

As explained in D146379, the reason for these failures is that libc++'s
throwing `operator new` can in fact return nullptr when compiled with
exceptions disabled. However, this contradicts the Standard, which clearly
says that the throwing version of `operator new(size_t)` should never
return nullptr. This is actually a long-standing issue. I've previously
seen a case where LTO would optimize incorrectly based on the assumption
that `operator new` doesn't return nullptr, an assumption that was
violated in that case because libc++.dylib was compiled with
-fno-exceptions.

Unfortunately, fixing this is kind of tricky. The Standard has a few
requirements for the allocation functions, some of which are impossible
to satisfy under -fno-exceptions:
1. `operator new(size_t)` must never return nullptr
2. `operator new(size_t, nothrow_t)` must call the throwing version and
   return nullptr on failure to allocate
3. We can't throw exceptions when compiled with -fno-exceptions

In the case where exceptions are enabled, things work nicely.
`new(size_t)` throws and `new(size_t, nothrow_t)` uses a try-catch to
return nullptr. However, when compiling the library with -fno-exceptions,
we can't throw an exception from `new(size_t)`, and we can't catch
anything from `new(size_t, nothrow_t)`. The only thing we can do from
`new(size_t)` is actually abort the program, which does not make it
possible for `new(size_t, nothrow_t)` to catch something and return
nullptr.

This patch makes the following changes:

1. When compiled with -fno-exceptions, the throwing version of
   `operator new` will now abort on failure instead of returning nullptr
   on failure. This resolves the issue that the compiler could
   mis-compile based on the assumption that nullptr is never returned.
   This constitutes an API and ABI breaking change for folks compiling
   the library with -fno-exceptions (which is not the general public,
   who merely use the libc++ headers and link against a shared library
   that has already been compiled). This should mostly impact vendors
   and other folks who compile libc++.dylib themselves.

2. When the library is compiled with -fexceptions, the nothrow version
   of `operator new` has no change. When the library is compiled with
   -fno-exceptions, the nothrow version of `operator new` will now check
   whether the throwing version of `operator new` has been overridden.
If it has not been overridden, then it will use an implementation equivalent to that of the throwing `operator new`, except it will return nullptr on failure to allocate (instead of terminating). However, if the throwing `operator new` has been overridden, it is now an error NOT to also override the nothrow `operator new`. Indeed, there is no way for us to implement a valid nothrow `operator new` without knowing the exact implementation of the throwing version. In summary, this change will impact people who fall into the following intersection of conditions: - They use the libc++ shared/static library built with `-fno-exceptions` - They do not override `operator new(..., std::nothrow_t)` - They override `operator new(...)` (the throwing version) - They use `operator new(..., std::nothrow_t)` We believe this represents a small number of people. Fixes #60129 rdar://103958777 Differential Revision: https://reviews.llvm.org/D150610 --- libcxx/docs/ReleaseNotes/18.rst | 23 +++ libcxx/src/include/overridable_function.h | 119 ++++++++++++ libcxx/src/new.cpp | 103 +++++++--- ...new_not_overridden_fno_exceptions.pass.cpp | 58 ++++++ .../new_dont_return_nullptr.pass.cpp | 37 ++++ ...ze_align_nothrow.replace.indirect.pass.cpp | 4 + ...new.size_nothrow.replace.indirect.pass.cpp | 4 + ...ze_align_nothrow.replace.indirect.pass.cpp | 4 + ...new.size_nothrow.replace.indirect.pass.cpp | 4 + libcxx/test/support/check_assertion.h | 55 +++++- libcxx/test/support/count_new.h | 181 +++++++++++++----- .../test_check_assertion.pass.cpp | 15 +- libcxxabi/src/stdlib_new_delete.cpp | 172 +++++++++++------ 13 files changed, 627 insertions(+), 152 deletions(-) create mode 100644 libcxx/src/include/overridable_function.h create mode 100644 libcxx/test/libcxx/language.support/support.dynamic/assert.nothrow_new_not_overridden_fno_exceptions.pass.cpp create mode 100644 libcxx/test/libcxx/language.support/support.dynamic/new_dont_return_nullptr.pass.cpp diff --git a/libcxx/docs/ReleaseNotes/18.rst b/libcxx/docs/ReleaseNotes/18.rst index fd882bafe19a5..fb3d2af544c28 100644 --- a/libcxx/docs/ReleaseNotes/18.rst +++ b/libcxx/docs/ReleaseNotes/18.rst @@ -237,6 +237,29 @@ LLVM 20 ABI Affecting Changes --------------------- +- When the shared/static library is built with ``-fno-exceptions``, the behavior of ``operator new`` was changed + to make it standards-conforming. In LLVM 17 and before, the throwing versions of ``operator new`` would return + ``nullptr`` upon failure to allocate, when the shared/static library was built with exceptions disabled. This + was non-conforming, since the throwing versions of ``operator new`` are never expected to return ``nullptr``, and + this non-conformance could actually lead to miscompiles in subtle cases. + + Starting in LLVM 18, the throwing versions of ``operator new`` will abort the program when they fail to allocate + if the shared/static library has been built with ``-fno-exceptions``. This is consistent with the behavior of all + other potentially-throwing functions in the library, which abort the program instead of throwing when ``-fno-exceptions`` + is used. + + Furthermore, when the shared/static library is built with ``-fno-exceptions``, users who override the throwing + version of ``operator new`` will now need to also override the ``std::nothrow_t`` version of ``operator new`` if + they want to use it. 
Indeed, this is because there is no way to implement a conforming ``operator new(nothrow)``
+  from a conforming potentially-throwing ``operator new`` when compiled with ``-fno-exceptions``. In that case, using
+  ``operator new(nothrow)`` without overriding it explicitly but after overriding the throwing ``operator new`` will
+  result in an error.
+
+  Note that this change only impacts vendors/users that build the shared/static library themselves and pass
+  ``-DLIBCXX_ENABLE_EXCEPTIONS=OFF``, which is not the default configuration. If you are using the default
+  configuration of the library, the libc++ shared/static library will be built with exceptions enabled, and
+  there is no change between LLVM 17 and LLVM 18, even for users who build their own code using ``-fno-exceptions``.
+
 - The symbol of a non-visible function part of ``std::system_error`` was removed. This is not a breaking
   change as the private function ``__init`` was never referenced internally outside of the dylib.
diff --git a/libcxx/src/include/overridable_function.h b/libcxx/src/include/overridable_function.h
new file mode 100644
index 0000000000000..7b0fba10f47d4
--- /dev/null
+++ b/libcxx/src/include/overridable_function.h
@@ -0,0 +1,119 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _LIBCPP_SRC_INCLUDE_OVERRIDABLE_FUNCTION_H
+#define _LIBCPP_SRC_INCLUDE_OVERRIDABLE_FUNCTION_H
+
+#include <__config>
+#include <cstdint>
+
+#if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER)
+#  pragma GCC system_header
+#endif
+
+//
+// This file provides the std::__is_function_overridden utility, which allows checking
+// whether an overridable function (typically a weak symbol) like `operator new`
+// has been overridden by a user or not.
+//
+// This is a low-level utility which does not work on all platforms, since it needs
+// to make assumptions about the object file format in use. Furthermore, it requires
+// the "base definition" of the function (the one we want to check whether it has been
+// overridden) to be annotated with the _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro.
+//
+// This currently works with Mach-O files (used on Darwin) and with ELF files (used on Linux
+// and others). On platforms where we know how to implement this detection, the macro
+// _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION is defined to 1, and it is defined to 0 on
+// other platforms. The _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro is defined to
+// nothing on unsupported platforms so that it can be used to decorate functions regardless
+// of whether detection is actually supported.
+//
+// How does this work?
+// -------------------
+//
+// Let's say we want to check whether a weak function `f` has been overridden by the user.
+// The general mechanism works by placing `f`'s definition (in the libc++ built library)
+// inside a special section, which we do using the `__section__` attribute via the
+// _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE macro.
+//
+// Then, when the time comes to check whether the function has been overridden, we take
+// the address of the function and we check whether it falls inside the special section
+// we created.
+// This can be done by finding pointers to the start and the end of the section
+// (which is done differently for ELF and Mach-O), and then checking whether `f` falls
+// within those bounds. If it falls within those bounds, then `f` is still inside the
+// special section and so it is the version we defined in the libc++ built library, i.e.
+// it was not overridden. Otherwise, it was overridden by the user because it falls
+// outside of the section.
+//
+// Important note
+// --------------
+//
+// This mechanism should never be used outside of the libc++ built library. In particular,
+// attempting to use this within the libc++ headers will not work at all because we don't
+// want to be defining special sections inside users' executables which use our headers.
+// This is provided inside libc++'s include tree solely to make it easier to share with
+// libc++abi, which needs the same mechanism.
+//
+
+#if defined(_LIBCPP_OBJECT_FORMAT_MACHO)
+
+#  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 1
+#  define _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE                                                                \
+    __attribute__((__section__("__TEXT,__lcxx_override,regular,pure_instructions")))
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+template <class _Ret, class... _Args>
+_LIBCPP_HIDE_FROM_ABI bool __is_function_overridden(_Ret (*__fptr)(_Args...)) noexcept {
+  // Declare two dummy bytes and give them these special `__asm` values. These values are
+  // defined by the linker, which means that referring to `&__lcxx_override_start` will
+  // effectively refer to the address where the section starts (and same for the end).
+  extern char __lcxx_override_start __asm("section$start$__TEXT$__lcxx_override");
+  extern char __lcxx_override_end __asm("section$end$__TEXT$__lcxx_override");
+
+  // Now get a uintptr_t out of these locations, and out of the function pointer.
+  uintptr_t __start = reinterpret_cast<uintptr_t>(&__lcxx_override_start);
+  uintptr_t __end = reinterpret_cast<uintptr_t>(&__lcxx_override_end);
+  uintptr_t __ptr = reinterpret_cast<uintptr_t>(__fptr);
+
+  // Finally, the function was overridden if it falls outside of the section's bounds.
+  return __ptr < __start || __ptr > __end;
+}
+_LIBCPP_END_NAMESPACE_STD
+
+#elif defined(_LIBCPP_OBJECT_FORMAT_ELF)
+
+#  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 1
+#  define _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE __attribute__((__section__("__lcxx_override")))
+
+// This is very similar to what we do for Mach-O above. The ELF linker will implicitly define
+// variables with those names corresponding to the start and the end of the section.
+//
+// See https://stackoverflow.com/questions/16552710/how-do-you-get-the-start-and-end-addresses-of-a-custom-elf-section
+extern char __start___lcxx_override;
+extern char __stop___lcxx_override;
+
+_LIBCPP_BEGIN_NAMESPACE_STD
+template <class _Ret, class... _Args>
+_LIBCPP_HIDE_FROM_ABI bool __is_function_overridden(_Ret (*__fptr)(_Args...)) noexcept {
+  uintptr_t __start = reinterpret_cast<uintptr_t>(&__start___lcxx_override);
+  uintptr_t __end = reinterpret_cast<uintptr_t>(&__stop___lcxx_override);
+  uintptr_t __ptr = reinterpret_cast<uintptr_t>(__fptr);
+
+  return __ptr < __start || __ptr > __end;
+}
+_LIBCPP_END_NAMESPACE_STD
+
+#else
+
+#  define _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION 0
+#  define _LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE /* nothing */
+
+#endif
+
+#endif // _LIBCPP_SRC_INCLUDE_OVERRIDABLE_FUNCTION_H
diff --git a/libcxx/src/new.cpp b/libcxx/src/new.cpp
index cb8b4aae8d5f9..0869d90661dd5 100644
--- a/libcxx/src/new.cpp
+++ b/libcxx/src/new.cpp
@@ -6,7 +6,9 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "include/overridable_function.h"
 #include <__memory/aligned_alloc.h>
+#include <cstddef>
 #include <cstdlib>
 #include <new>
 
@@ -15,6 +17,10 @@
 // The code below is copied as-is into libc++abi's libcxxabi/src/stdlib_new_delete.cpp
 // file. The version in this file is the canonical one.
 
+inline void __throw_bad_alloc_shim() { std::__throw_bad_alloc(); }
+
+#  define _LIBCPP_ASSERT_SHIM(expr, str) _LIBCPP_ASSERT(expr, str)
+
 // ------------------ BEGIN COPY ------------------
 // Implement all new and delete operators as weak definitions
 // in this shared library, so that they can be overridden by programs
@@ -36,41 +42,63 @@ static void* operator_new_impl(std::size_t size) {
   return p;
 }
 
-_LIBCPP_WEAK void* operator new(std::size_t size) _THROW_BAD_ALLOC {
+_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new(std::size_t size) _THROW_BAD_ALLOC {
   void* p = operator_new_impl(size);
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   if (p == nullptr)
-    throw std::bad_alloc();
-#  endif
+    __throw_bad_alloc_shim();
   return p;
 }
 
 _LIBCPP_WEAK void* operator new(size_t size, const std::nothrow_t&) noexcept {
+#  ifdef _LIBCPP_HAS_NO_EXCEPTIONS
+#    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
+  _LIBCPP_ASSERT_SHIM(
+      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new)),
+      "libc++ was configured with exceptions disabled and `operator new(size_t)` has been overridden, "
+      "but `operator new(size_t, nothrow_t)` has not been overridden. This is problematic because "
+      "`operator new(size_t, nothrow_t)` must call `operator new(size_t)`, which will terminate in case "
+      "it fails to allocate, making it impossible for `operator new(size_t, nothrow_t)` to fulfill its "
+      "contract (since it should return nullptr upon failure). Please make sure you override "
+      "`operator new(size_t, nothrow_t)` as well.");
+#    endif
+
+  return operator_new_impl(size);
+#  else
   void* p = nullptr;
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   try {
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
     p = ::operator new(size);
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   } catch (...) {
   }
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
   return p;
+#  endif
 }
 
-_LIBCPP_WEAK void* operator new[](size_t size) _THROW_BAD_ALLOC { return ::operator new(size); }
+_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new[](size_t size) _THROW_BAD_ALLOC {
+  return ::operator new(size);
+}
 
 _LIBCPP_WEAK void* operator new[](size_t size, const std::nothrow_t&) noexcept {
+#  ifdef _LIBCPP_HAS_NO_EXCEPTIONS
+#    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
+  _LIBCPP_ASSERT_SHIM(
+      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new[])),
+      "libc++ was configured with exceptions disabled and `operator new[](size_t)` has been overridden, "
+      "but `operator new[](size_t, nothrow_t)` has not been overridden. This is problematic because "
+      "`operator new[](size_t, nothrow_t)` must call `operator new[](size_t)`, which will terminate in case "
+      "it fails to allocate, making it impossible for `operator new[](size_t, nothrow_t)` to fulfill its "
+      "contract (since it should return nullptr upon failure). Please make sure you override "
+      "`operator new[](size_t, nothrow_t)` as well.");
+#    endif
+
+  return operator_new_impl(size);
+#  else
   void* p = nullptr;
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   try {
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
     p = ::operator new[](size);
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   } catch (...) {
   }
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
   return p;
+#  endif
 }
 
 _LIBCPP_WEAK void operator delete(void* ptr) noexcept { std::free(ptr); }
@@ -107,43 +135,66 @@ static void* operator_new_aligned_impl(std::size_t size, std::align_val_t alignm
   return p;
 }
 
-_LIBCPP_WEAK void* operator new(std::size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
+_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
+operator new(std::size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
   void* p = operator_new_aligned_impl(size, alignment);
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   if (p == nullptr)
-    throw std::bad_alloc();
-#  endif
+    __throw_bad_alloc_shim();
   return p;
 }
 
 _LIBCPP_WEAK void* operator new(size_t size, std::align_val_t alignment, const std::nothrow_t&) noexcept {
+#  ifdef _LIBCPP_HAS_NO_EXCEPTIONS
+#    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
+  _LIBCPP_ASSERT_SHIM(
+      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new)),
+      "libc++ was configured with exceptions disabled and `operator new(size_t, align_val_t)` has been overridden, "
+      "but `operator new(size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
+      "`operator new(size_t, align_val_t, nothrow_t)` must call `operator new(size_t, align_val_t)`, which will "
+      "terminate in case it fails to allocate, making it impossible for `operator new(size_t, align_val_t, nothrow_t)` "
+      "to fulfill its contract (since it should return nullptr upon failure). Please make sure you override "
+      "`operator new(size_t, align_val_t, nothrow_t)` as well.");
+#    endif
+
+  return operator_new_aligned_impl(size, alignment);
+#  else
   void* p = nullptr;
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   try {
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
     p = ::operator new(size, alignment);
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   } catch (...) {
   }
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
   return p;
+#  endif
 }
 
-_LIBCPP_WEAK void* operator new[](size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
+_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
+operator new[](size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
   return ::operator new(size, alignment);
 }
 
 _LIBCPP_WEAK void* operator new[](size_t size, std::align_val_t alignment, const std::nothrow_t&) noexcept {
+#  ifdef _LIBCPP_HAS_NO_EXCEPTIONS
+#    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
+  _LIBCPP_ASSERT_SHIM(
+      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new[])),
+      "libc++ was configured with exceptions disabled and `operator new[](size_t, align_val_t)` has been overridden, "
+      "but `operator new[](size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
+      "`operator new[](size_t, align_val_t, nothrow_t)` must call `operator new[](size_t, align_val_t)`, which will "
+      "terminate in case it fails to allocate, making it impossible for `operator new[](size_t, align_val_t, "
+      "nothrow_t)` to fulfill its contract (since it should return nullptr upon failure). Please make sure you "
+      "override "
+      "`operator new[](size_t, align_val_t, nothrow_t)` as well.");
+#    endif
+
+  return operator_new_aligned_impl(size, alignment);
+#  else
   void* p = nullptr;
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   try {
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
     p = ::operator new[](size, alignment);
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   } catch (...) {
   }
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
   return p;
+#  endif
 }
 
 _LIBCPP_WEAK void operator delete(void* ptr, std::align_val_t) noexcept { std::__libcpp_aligned_free(ptr); }
diff --git a/libcxx/test/libcxx/language.support/support.dynamic/assert.nothrow_new_not_overridden_fno_exceptions.pass.cpp b/libcxx/test/libcxx/language.support/support.dynamic/assert.nothrow_new_not_overridden_fno_exceptions.pass.cpp
new file mode 100644
index 0000000000000..fb26bdf49cc3c
--- /dev/null
+++ b/libcxx/test/libcxx/language.support/support.dynamic/assert.nothrow_new_not_overridden_fno_exceptions.pass.cpp
@@ -0,0 +1,58 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// void* operator new(std::size_t, const std::nothrow_t&);
+// void* operator new(std::size_t, std::align_val_t, const std::nothrow_t&);
+// void* operator new[](std::size_t, const std::nothrow_t&);
+// void* operator new[](std::size_t, std::align_val_t, const std::nothrow_t&);
+
+// This test ensures that we catch the case where `new` has been overridden but `new(nothrow)`
+// has not been overridden, and the library is compiled with -fno-exceptions.
+//
+// In that case, it is impossible for libc++ to provide a standards-conforming implementation
+// of `new(nothrow)`, so the only viable option is to terminate the program.
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03
+
+// We only know how to diagnose this on platforms that use the ELF or Mach-O object file formats.
+// XFAIL: target={{.+}}-windows-{{.+}}
+
+// TODO: We currently don't have a way to express that the built library was
+//       compiled with -fno-exceptions, so if the library was built with support
+//       for exceptions but we run the test suite without exceptions, this will
+//       spuriously fail.
+// REQUIRES: no-exceptions
+
+#include <cstddef>
+#include <new>
+
+#include "check_assertion.h"
+
+// Override the throwing versions of operator new, but not the nothrow versions.
+alignas(32) char DummyData[32 * 3];
+void* operator new(std::size_t) { return DummyData; }
+void* operator new(std::size_t, std::align_val_t) { return DummyData; }
+void* operator new[](std::size_t) { return DummyData; }
+void* operator new[](std::size_t, std::align_val_t) { return DummyData; }
+
+void operator delete(void*) noexcept {}
+void operator delete(void*, std::align_val_t) noexcept {}
+void operator delete[](void*) noexcept {}
+void operator delete[](void*, std::align_val_t) noexcept {}
+
+int main(int, char**) {
+  std::size_t size = 3;
+  std::align_val_t align = static_cast<std::align_val_t>(32);
+  EXPECT_ANY_DEATH((void)operator new(size, std::nothrow));
+  EXPECT_ANY_DEATH((void)operator new(size, align, std::nothrow));
+  EXPECT_ANY_DEATH((void)operator new[](size, std::nothrow));
+  EXPECT_ANY_DEATH((void)operator new[](size, align, std::nothrow));
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/language.support/support.dynamic/new_dont_return_nullptr.pass.cpp b/libcxx/test/libcxx/language.support/support.dynamic/new_dont_return_nullptr.pass.cpp
new file mode 100644
index 0000000000000..02b20a8c98b21
--- /dev/null
+++ b/libcxx/test/libcxx/language.support/support.dynamic/new_dont_return_nullptr.pass.cpp
@@ -0,0 +1,37 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// void* operator new(std::size_t);
+// void* operator new(std::size_t, std::align_val_t);
+// void* operator new[](std::size_t);
+// void* operator new[](std::size_t, std::align_val_t);
+
+// This test ensures that we abort the program instead of returning nullptr
+// when we fail to satisfy the allocation request. The throwing versions of
+// `operator new` must never return nullptr on failure to allocate (per the
+// Standard) and the compiler actually relies on that for optimizations.
+// Returning nullptr from the throwing `operator new` can basically result
+// in miscompiles.
+
+// REQUIRES: has-unix-headers
+// REQUIRES: no-exceptions
+// UNSUPPORTED: c++03, c++11, c++14
+
+#include <cstddef>
+#include <limits>
+#include <new>
+
+#include "check_assertion.h"
+
+int main(int, char**) {
+  EXPECT_ANY_DEATH((void)operator new(std::numeric_limits<std::size_t>::max()));
+  EXPECT_ANY_DEATH((void)operator new(std::numeric_limits<std::size_t>::max(), static_cast<std::align_val_t>(32)));
+  EXPECT_ANY_DEATH((void)operator new[](std::numeric_limits<std::size_t>::max()));
+  EXPECT_ANY_DEATH((void)operator new[](std::numeric_limits<std::size_t>::max(), static_cast<std::align_val_t>(32)));
+  return 0;
+}
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp
index df8a651932cef..f6959172ea24e 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_align_nothrow.replace.indirect.pass.cpp
@@ -11,6 +11,10 @@
 // Test that we can replace the operator by replacing `operator new[](std::size_t, std::align_val_t)`
 // (the throwing version).
 
+// This doesn't work when the shared library was built with exceptions disabled, because
+// we can't implement the non-throwing new from the throwing new in that case.
+// XFAIL: no-exceptions
+
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: sanitizer-new-delete
 
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_nothrow.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_nothrow.replace.indirect.pass.cpp
index 70d891b2a82c0..84bfe8205b600 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_nothrow.replace.indirect.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.array/new.size_nothrow.replace.indirect.pass.cpp
@@ -10,6 +10,10 @@
 
 // Test that we can replace the operator by replacing `operator new[](std::size_t)` (the throwing version).
 
+// This doesn't work when the shared library was built with exceptions disabled, because
+// we can't implement the non-throwing new from the throwing new in that case.
+// XFAIL: no-exceptions
+
 // UNSUPPORTED: sanitizer-new-delete
 // XFAIL: libcpp-no-vcruntime
 // XFAIL: LIBCXX-AIX-FIXME
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp
index a68cdab54528c..2e7fa132890b8 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_align_nothrow.replace.indirect.pass.cpp
@@ -10,6 +10,10 @@
 
 // Test that we can replace the operator by replacing `operator new(std::size_t, std::align_val_t)` (the throwing version).
 
+// This doesn't work when the shared library was built with exceptions disabled, because
+// we can't implement the non-throwing new from the throwing new in that case.
+// XFAIL: no-exceptions
+
 // UNSUPPORTED: c++03, c++11, c++14
 // UNSUPPORTED: sanitizer-new-delete
 
diff --git a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_nothrow.replace.indirect.pass.cpp b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_nothrow.replace.indirect.pass.cpp
index 64edbfd7e9af9..8b5019cf7eb63 100644
--- a/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_nothrow.replace.indirect.pass.cpp
+++ b/libcxx/test/std/language.support/support.dynamic/new.delete/new.delete.single/new.size_nothrow.replace.indirect.pass.cpp
@@ -10,6 +10,10 @@
 
 // Test that we can replace the operator by replacing `operator new(std::size_t)` (the throwing version).
 
+// This doesn't work when the shared library was built with exceptions disabled, because
+// we can't implement the non-throwing new from the throwing new in that case.
+// XFAIL: no-exceptions
+
 // UNSUPPORTED: sanitizer-new-delete
 // XFAIL: libcpp-no-vcruntime
 // XFAIL: LIBCXX-AIX-FIXME
diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h
index 1811df5f474f5..c1d0bdcb33ec6 100644
--- a/libcxx/test/support/check_assertion.h
+++ b/libcxx/test/support/check_assertion.h
@@ -9,7 +9,9 @@
 #ifndef TEST_SUPPORT_CHECK_ASSERTION_H
 #define TEST_SUPPORT_CHECK_ASSERTION_H
 
+#include
 #include
+#include
 #include
 #include
 #include
@@ -85,6 +87,7 @@ Matcher MakeAnyMatcher() {
 enum class DeathCause {
   // Valid causes
   VerboseAbort = 1,
+  StdAbort,
   StdTerminate,
   Trap,
   // Invalid causes
@@ -96,6 +99,7 @@ enum class DeathCause {
 bool IsValidCause(DeathCause cause) {
   switch (cause) {
   case DeathCause::VerboseAbort:
+  case DeathCause::StdAbort:
  case DeathCause::StdTerminate:
   case DeathCause::Trap:
     return true;
@@ -108,6 +112,8 @@ std::string ToString(DeathCause cause) {
  switch (cause) {
   case DeathCause::VerboseAbort:
     return "verbose abort";
+  case DeathCause::StdAbort:
+    return "`std::abort`";
   case DeathCause::StdTerminate:
     return "`std::terminate`";
   case DeathCause::Trap:
@@ -123,6 +129,19 @@ std::string ToString(DeathCause cause) {
   assert(false && "Unreachable");
 }
 
+template <std::size_t N>
+std::string ToString(std::array<DeathCause, N> const& causes) {
+  std::stringstream ss;
+  ss << "{";
+  for (std::size_t i = 0; i != N; ++i) {
+    ss << ToString(causes[i]);
+    if (i + 1 != N)
+      ss << ", ";
+  }
+  ss << "}";
+  return ss.str();
+}
+
 TEST_NORETURN void StopChildProcess(DeathCause cause) { std::exit(static_cast<int>(cause)); }
 
 DeathCause ConvertToDeathCause(int val) {
@@ -177,8 +196,9 @@ class DeathTest {
   DeathTest(DeathTest const&)            = delete;
   DeathTest& operator=(DeathTest const&) = delete;
 
-  template <class Func>
-  DeathTestResult Run(DeathCause expected_cause, Func&& func, const Matcher& matcher) {
+  template <std::size_t N, class Func>
+  DeathTestResult Run(const std::array<DeathCause, N>& expected_causes, Func&& func, const Matcher& matcher) {
+    std::signal(SIGABRT, [](int) { StopChildProcess(DeathCause::StdAbort); });
     std::set_terminate([] { StopChildProcess(DeathCause::StdTerminate); });
 
     DeathCause cause = Run(func);
@@ -187,12 +207,12 @@ class DeathTest {
       return DeathTestResult(Outcome::InvalidCause, cause, ToString(cause));
     }
 
-    if (expected_cause != cause) {
+    if (std::find(expected_causes.begin(), expected_causes.end(), cause) == expected_causes.end()) {
      std::stringstream failure_description;
       failure_description                                           //
          << "Child died, but with a different death cause\n"       //
-         << "Expected cause: " << ToString(expected_cause) << "\n" //
-         << "Actual cause: " << ToString(cause) << "\n";
+         << "Expected cause(s): " << ToString(expected_causes) << "\n" //
cause(s): " << ToString(expected_causes) << "\n" // + << "Actual cause: " << ToString(cause) << "\n"; return DeathTestResult(Outcome::UnexpectedCause, cause, failure_description.str()); } @@ -328,12 +348,13 @@ void std::__libcpp_verbose_abort(char const* format, ...) { } #endif // _LIBCPP_VERSION -template -bool ExpectDeath(DeathCause expected_cause, const char* stmt, Func&& func, const Matcher& matcher) { - assert(IsValidCause(expected_cause)); +template +bool ExpectDeath(const std::array& expected_causes, const char* stmt, Func&& func, const Matcher& matcher) { + for (auto cause : expected_causes) + assert(IsValidCause(cause)); DeathTest test_case; - DeathTestResult test_result = test_case.Run(expected_cause, func, matcher); + DeathTestResult test_result = test_case.Run(expected_causes, func, matcher); if (!test_result.success()) { test_case.PrintFailureDetails(test_result.failure_description(), stmt, test_result.cause()); } @@ -341,18 +362,32 @@ bool ExpectDeath(DeathCause expected_cause, const char* stmt, Func&& func, const return test_result.success(); } +template +bool ExpectDeath(DeathCause expected_cause, const char* stmt, Func&& func, const Matcher& matcher) { + return ExpectDeath(std::array{expected_cause}, stmt, func, matcher); +} + +template +bool ExpectDeath(const std::array& expected_causes, const char* stmt, Func&& func) { + return ExpectDeath(expected_causes, stmt, func, MakeAnyMatcher()); +} + template bool ExpectDeath(DeathCause expected_cause, const char* stmt, Func&& func) { - return ExpectDeath(expected_cause, stmt, func, MakeAnyMatcher()); + return ExpectDeath(std::array{expected_cause}, stmt, func, MakeAnyMatcher()); } // clang-format off /// Assert that the specified expression aborts with the expected cause and, optionally, error message. +#define EXPECT_ANY_DEATH(...) \ + assert(( ExpectDeath(std::array{DeathCause::VerboseAbort, DeathCause::StdAbort, DeathCause::StdTerminate, DeathCause::Trap}, #__VA_ARGS__, [&]() { __VA_ARGS__; } ) )) #define EXPECT_DEATH(...) \ assert(( ExpectDeath(DeathCause::VerboseAbort, #__VA_ARGS__, [&]() { __VA_ARGS__; } ) )) #define EXPECT_DEATH_MATCHES(matcher, ...) \ assert(( ExpectDeath(DeathCause::VerboseAbort, #__VA_ARGS__, [&]() { __VA_ARGS__; }, matcher) )) +#define EXPECT_STD_ABORT(...) \ + assert( ExpectDeath(DeathCause::StdAbort, #__VA_ARGS__, [&]() { __VA_ARGS__; }) ) #define EXPECT_STD_TERMINATE(...) 
diff --git a/libcxx/test/support/count_new.h b/libcxx/test/support/count_new.h
index ef4306e520b19..0d17e394d0312 100644
--- a/libcxx/test/support/count_new.h
+++ b/libcxx/test/support/count_new.h
@@ -379,78 +379,161 @@ TEST_DIAGNOSTIC_POP
 MemCounter &globalMemCounter = *getGlobalMemCounter();
 
 #ifndef DISABLE_NEW_COUNT
-void* operator new(std::size_t s) TEST_THROW_SPEC(std::bad_alloc)
-{
+// operator new(size_t[, nothrow_t]) and operator delete(size_t[, nothrow_t])
+void* operator new(std::size_t s) TEST_THROW_SPEC(std::bad_alloc) {
+  getGlobalMemCounter()->newCalled(s);
+  void* p = std::malloc(s);
+  if (p == nullptr)
+    detail::throw_bad_alloc_helper();
+  return p;
+}
+
+void* operator new(std::size_t s, std::nothrow_t const&) TEST_NOEXCEPT {
+#  ifdef TEST_HAS_NO_EXCEPTIONS
+  getGlobalMemCounter()->newCalled(s);
+#  else
+  try {
     getGlobalMemCounter()->newCalled(s);
-    void* ret = std::malloc(s);
-    if (ret == nullptr)
-        detail::throw_bad_alloc_helper();
-    return ret;
+  } catch (std::bad_alloc const&) {
+    return nullptr;
+  }
+#  endif
+  return std::malloc(s);
 }
 
-void operator delete(void* p) TEST_NOEXCEPT
-{
-    getGlobalMemCounter()->deleteCalled(p);
-    std::free(p);
+void operator delete(void* p) TEST_NOEXCEPT {
+  getGlobalMemCounter()->deleteCalled(p);
+  std::free(p);
 }
 
-void* operator new[](std::size_t s) TEST_THROW_SPEC(std::bad_alloc)
-{
+void operator delete(void* p, std::nothrow_t const&) TEST_NOEXCEPT {
+  getGlobalMemCounter()->deleteCalled(p);
+  std::free(p);
+}
+
+// operator new[](size_t[, nothrow_t]) and operator delete[](size_t[, nothrow_t])
+void* operator new[](std::size_t s) TEST_THROW_SPEC(std::bad_alloc) {
+  getGlobalMemCounter()->newArrayCalled(s);
+  void* p = std::malloc(s);
+  if (p == nullptr)
+    detail::throw_bad_alloc_helper();
+  return p;
+}
+
+void* operator new[](std::size_t s, std::nothrow_t const&) TEST_NOEXCEPT {
+#  ifdef TEST_HAS_NO_EXCEPTIONS
+  getGlobalMemCounter()->newArrayCalled(s);
+#  else
+  try {
     getGlobalMemCounter()->newArrayCalled(s);
-    return operator new(s);
+  } catch (std::bad_alloc const&) {
+    return nullptr;
+  }
+#  endif
+  return std::malloc(s);
 }
 
-void operator delete[](void* p) TEST_NOEXCEPT
-{
-    getGlobalMemCounter()->deleteArrayCalled(p);
-    operator delete(p);
+void operator delete[](void* p) TEST_NOEXCEPT {
+  getGlobalMemCounter()->deleteArrayCalled(p);
+  std::free(p);
 }
 
-#ifndef TEST_HAS_NO_ALIGNED_ALLOCATION
-#if defined(_LIBCPP_MSVCRT_LIKE) || \
-    (!defined(_LIBCPP_VERSION) && defined(_WIN32))
-#define USE_ALIGNED_ALLOC
-#endif
+void operator delete[](void* p, std::nothrow_t const&) TEST_NOEXCEPT {
+  getGlobalMemCounter()->deleteArrayCalled(p);
+  std::free(p);
+}
+
+#  ifndef TEST_HAS_NO_ALIGNED_ALLOCATION
+#    if defined(_LIBCPP_MSVCRT_LIKE) || (!defined(_LIBCPP_VERSION) && defined(_WIN32))
+#      define USE_ALIGNED_ALLOC
+#    endif
+
+inline void* alocate_aligned_impl(std::size_t size, std::align_val_t align) {
+  const std::size_t alignment = static_cast<std::size_t>(align);
+  void* ret = nullptr;
+#  ifdef USE_ALIGNED_ALLOC
+  ret = _aligned_malloc(size, alignment);
+#  else
+  assert(posix_memalign(&ret, std::max(alignment, sizeof(void*)), size) != EINVAL);
+#  endif
+  return ret;
+}
+
+inline void free_aligned_impl(void* ptr, std::align_val_t) {
+  if (ptr) {
+#  ifdef USE_ALIGNED_ALLOC
+    ::_aligned_free(ptr);
+#  else
+    ::free(ptr);
+#  endif
+  }
+}
+
+// operator new(size_t, align_val_t[, nothrow_t]) and operator delete(size_t, align_val_t[, nothrow_t])
 void* operator new(std::size_t s, std::align_val_t av) TEST_THROW_SPEC(std::bad_alloc) {
-    const std::size_t a = static_cast<std::size_t>(av);
-    getGlobalMemCounter()->alignedNewCalled(s, a);
-    void *ret = nullptr;
-#ifdef USE_ALIGNED_ALLOC
-    ret = _aligned_malloc(s, a);
-#else
-    assert(posix_memalign(&ret, std::max(a, sizeof(void*)), s) != EINVAL);
-#endif
-    if (ret == nullptr)
+  getGlobalMemCounter()->alignedNewCalled(s, static_cast<std::size_t>(av));
+  void* p = alocate_aligned_impl(s, av);
+  if (p == nullptr)
     detail::throw_bad_alloc_helper();
-    return ret;
+  return p;
 }
 
-void operator delete(void *p, std::align_val_t av) TEST_NOEXCEPT {
-    const std::size_t a = static_cast<std::size_t>(av);
-    getGlobalMemCounter()->alignedDeleteCalled(p, a);
-    if (p) {
-#ifdef USE_ALIGNED_ALLOC
-        ::_aligned_free(p);
-#else
-        ::free(p);
-#endif
+void* operator new(std::size_t s, std::align_val_t av, std::nothrow_t const&) TEST_NOEXCEPT {
+#  ifdef TEST_HAS_NO_EXCEPTIONS
+  getGlobalMemCounter()->alignedNewCalled(s, static_cast<std::size_t>(av));
+#  else
+  try {
+    getGlobalMemCounter()->alignedNewCalled(s, static_cast<std::size_t>(av));
+  } catch (std::bad_alloc const&) {
+    return nullptr;
   }
+#  endif
+  return alocate_aligned_impl(s, av);
+}
+
+void operator delete(void* p, std::align_val_t av) TEST_NOEXCEPT {
+  getGlobalMemCounter()->alignedDeleteCalled(p, static_cast<std::size_t>(av));
+  free_aligned_impl(p, av);
+}
+
+void operator delete(void* p, std::align_val_t av, std::nothrow_t const&) TEST_NOEXCEPT {
+  getGlobalMemCounter()->alignedDeleteCalled(p, static_cast<std::size_t>(av));
+  free_aligned_impl(p, av);
 }
 
+// operator new[](size_t, align_val_t[, nothrow_t]) and operator delete[](size_t, align_val_t[, nothrow_t])
 void* operator new[](std::size_t s, std::align_val_t av) TEST_THROW_SPEC(std::bad_alloc) {
-    const std::size_t a = static_cast<std::size_t>(av);
-    getGlobalMemCounter()->alignedNewArrayCalled(s, a);
-    return operator new(s, av);
+  getGlobalMemCounter()->alignedNewArrayCalled(s, static_cast<std::size_t>(av));
+  void* p = alocate_aligned_impl(s, av);
+  if (p == nullptr)
+    detail::throw_bad_alloc_helper();
+  return p;
+}
+
+void* operator new[](std::size_t s, std::align_val_t av, std::nothrow_t const&) TEST_NOEXCEPT {
+#  ifdef TEST_HAS_NO_EXCEPTIONS
+  getGlobalMemCounter()->alignedNewArrayCalled(s, static_cast<std::size_t>(av));
+#  else
+  try {
+    getGlobalMemCounter()->alignedNewArrayCalled(s, static_cast<std::size_t>(av));
+  } catch (std::bad_alloc const&) {
+    return nullptr;
+  }
+#  endif
+  return alocate_aligned_impl(s, av);
+}
+
+void operator delete[](void* p, std::align_val_t av) TEST_NOEXCEPT {
+  getGlobalMemCounter()->alignedDeleteArrayCalled(p, static_cast<std::size_t>(av));
+  free_aligned_impl(p, av);
 }
 
-void operator delete[](void *p, std::align_val_t av) TEST_NOEXCEPT {
-    const std::size_t a = static_cast<std::size_t>(av);
-    getGlobalMemCounter()->alignedDeleteArrayCalled(p, a);
-    return operator delete(p, av);
+void operator delete[](void* p, std::align_val_t av, std::nothrow_t const&) TEST_NOEXCEPT {
+  getGlobalMemCounter()->alignedDeleteArrayCalled(p, static_cast<std::size_t>(av));
+  free_aligned_impl(p, av);
 }
 
-#endif // TEST_HAS_NO_ALIGNED_ALLOCATION
+#  endif // TEST_HAS_NO_ALIGNED_ALLOCATION
 #endif // DISABLE_NEW_COUNT
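A small usage sketch of the counters defined above (hypothetical test body; `new_called` is assumed to be the public counter member that `newCalled` increments, following count_new.h conventions):

#include "count_new.h"
#include <cassert>
#include <new>

int main(int, char**) {
  const int before = globalMemCounter.new_called;
  // The nothrow overload added above records the call through the same counter
  // before allocating, so replacement-detection tests see both forms.
  void* p = ::operator new(16, std::nothrow);
  assert(p != nullptr);
  assert(globalMemCounter.new_called == before + 1);
  ::operator delete(p, std::nothrow);
  return 0;
}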
diff --git a/libcxx/test/support/test.support/test_check_assertion.pass.cpp b/libcxx/test/support/test.support/test_check_assertion.pass.cpp
index d1ac6717267f3..4dfc5319aaf97 100644
--- a/libcxx/test/support/test.support/test_check_assertion.pass.cpp
+++ b/libcxx/test/support/test.support/test_check_assertion.pass.cpp
@@ -30,7 +30,7 @@ bool TestDeathTest(
   };
 
   DeathTest test_case;
-  DeathTestResult test_result = test_case.Run(expected_cause, func, get_matcher());
+  DeathTestResult test_result = test_case.Run(std::array{expected_cause}, func, get_matcher());
 
   std::string maybe_failure_description;
   Outcome outcome = test_result.outcome();
@@ -109,16 +109,21 @@ int main(int, char**) {
 
   // Test the `EXPECT_DEATH` macros themselves. Since they assert success, we can only test successful cases.
   {
-    auto invoke_abort = [] { _LIBCPP_VERBOSE_ABORT("contains some message"); };
+    auto invoke_verbose_abort = [] { _LIBCPP_VERBOSE_ABORT("contains some message"); };
+    auto invoke_abort         = [] { std::abort(); };
 
     auto simple_matcher = [](const std::string& text) {
       bool success = text.find("some") != std::string::npos;
       return MatchResult(success, "");
     };
 
-    EXPECT_DEATH(invoke_abort());
-    EXPECT_DEATH_MATCHES(MakeAnyMatcher(), invoke_abort());
-    EXPECT_DEATH_MATCHES(simple_matcher, invoke_abort());
+    EXPECT_ANY_DEATH(_LIBCPP_VERBOSE_ABORT(""));
+    EXPECT_ANY_DEATH(std::abort());
+    EXPECT_ANY_DEATH(std::terminate());
+    EXPECT_DEATH(invoke_verbose_abort());
+    EXPECT_DEATH_MATCHES(MakeAnyMatcher(), invoke_verbose_abort());
+    EXPECT_DEATH_MATCHES(simple_matcher, invoke_verbose_abort());
+    EXPECT_STD_ABORT(invoke_abort());
     EXPECT_STD_TERMINATE([] { std::terminate(); });
     TEST_LIBCPP_ASSERT_FAILURE(fail_assert(), "Some message");
   }
diff --git a/libcxxabi/src/stdlib_new_delete.cpp b/libcxxabi/src/stdlib_new_delete.cpp
index f8a00ec584256..b802559d479e2 100644
--- a/libcxxabi/src/stdlib_new_delete.cpp
+++ b/libcxxabi/src/stdlib_new_delete.cpp
@@ -7,7 +7,10 @@
 //===----------------------------------------------------------------------===//
 
 #include "__cxxabi_config.h"
+#include "abort_message.h"
+#include "include/overridable_function.h" // from libc++
 #include <__memory/aligned_alloc.h>
+#include <cstddef>
 #include <new>
 #include <cstdlib>
 
@@ -25,6 +28,20 @@
 # error libc++ and libc++abi seem to disagree on whether exceptions are enabled
 #endif
 
+inline void __throw_bad_alloc_shim() {
+#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
+  throw std::bad_alloc();
+#else
+  abort_message("bad_alloc was thrown in -fno-exceptions mode");
+#endif
+}
+
+#define _LIBCPP_ASSERT_SHIM(expr, str)                                                                                 \
+  do {                                                                                                                 \
+    if (!expr)                                                                                                         \
+      abort_message(str);                                                                                              \
+  } while (false)
+
 // ------------------ BEGIN COPY ------------------
 // Implement all new and delete operators as weak definitions
 // in this shared library, so that they can be overridden by programs
@@ -46,64 +63,76 @@ static void* operator_new_impl(std::size_t size) {
   return p;
 }
 
-_LIBCPP_WEAK
-void* operator new(std::size_t size) _THROW_BAD_ALLOC {
+_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new(std::size_t size) _THROW_BAD_ALLOC {
   void* p = operator_new_impl(size);
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   if (p == nullptr)
-    throw std::bad_alloc();
-#endif
+    __throw_bad_alloc_shim();
   return p;
 }
 
-_LIBCPP_WEAK
-void* operator new(size_t size, const std::nothrow_t&) noexcept {
+_LIBCPP_WEAK void* operator new(size_t size, const std::nothrow_t&) noexcept {
+#ifdef _LIBCPP_HAS_NO_EXCEPTIONS
+#  if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
+  _LIBCPP_ASSERT_SHIM(
+      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new)),
+      "libc++ was configured with exceptions disabled and `operator new(size_t)` has been overridden, "
+      "but `operator new(size_t, nothrow_t)` has not been overridden. This is problematic because "
+      "`operator new(size_t, nothrow_t)` must call `operator new(size_t)`, which will terminate in case "
+      "it fails to allocate, making it impossible for `operator new(size_t, nothrow_t)` to fulfill its "
+      "contract (since it should return nullptr upon failure). Please make sure you override "
+      "`operator new(size_t, nothrow_t)` as well.");
+#  endif
+
+  return operator_new_impl(size);
+#else
   void* p = nullptr;
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
     p = ::operator new(size);
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   } catch (...) {
   }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
   return p;
+#endif
 }
 
-_LIBCPP_WEAK
-void* operator new[](size_t size) _THROW_BAD_ALLOC { return ::operator new(size); }
+_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void* operator new[](size_t size) _THROW_BAD_ALLOC {
+  return ::operator new(size);
+}
 
-_LIBCPP_WEAK
-void* operator new[](size_t size, const std::nothrow_t&) noexcept {
+_LIBCPP_WEAK void* operator new[](size_t size, const std::nothrow_t&) noexcept {
+#ifdef _LIBCPP_HAS_NO_EXCEPTIONS
+#  if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
+  _LIBCPP_ASSERT_SHIM(
+      !std::__is_function_overridden(static_cast<void* (*)(std::size_t)>(&operator new[])),
+      "libc++ was configured with exceptions disabled and `operator new[](size_t)` has been overridden, "
+      "but `operator new[](size_t, nothrow_t)` has not been overridden. This is problematic because "
+      "`operator new[](size_t, nothrow_t)` must call `operator new[](size_t)`, which will terminate in case "
+      "it fails to allocate, making it impossible for `operator new[](size_t, nothrow_t)` to fulfill its "
+      "contract (since it should return nullptr upon failure). Please make sure you override "
+      "`operator new[](size_t, nothrow_t)` as well.");
+#  endif
+
+  return operator_new_impl(size);
+#else
   void* p = nullptr;
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   try {
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
     p = ::operator new[](size);
-#ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   } catch (...) {
   }
-#endif // _LIBCPP_HAS_NO_EXCEPTIONS
   return p;
+#endif
 }
 
-_LIBCPP_WEAK
-void operator delete(void* ptr) noexcept { std::free(ptr); }
+_LIBCPP_WEAK void operator delete(void* ptr) noexcept { std::free(ptr); }
 
-_LIBCPP_WEAK
-void operator delete(void* ptr, const std::nothrow_t&) noexcept { ::operator delete(ptr); }
+_LIBCPP_WEAK void operator delete(void* ptr, const std::nothrow_t&) noexcept { ::operator delete(ptr); }
 
-_LIBCPP_WEAK
-void operator delete(void* ptr, size_t) noexcept { ::operator delete(ptr); }
+_LIBCPP_WEAK void operator delete(void* ptr, size_t) noexcept { ::operator delete(ptr); }
 
-_LIBCPP_WEAK
-void operator delete[](void* ptr) noexcept { ::operator delete(ptr); }
+_LIBCPP_WEAK void operator delete[](void* ptr) noexcept { ::operator delete(ptr); }
 
-_LIBCPP_WEAK
-void operator delete[](void* ptr, const std::nothrow_t&) noexcept { ::operator delete[](ptr); }
+_LIBCPP_WEAK void operator delete[](void* ptr, const std::nothrow_t&) noexcept { ::operator delete[](ptr); }
 
-_LIBCPP_WEAK
-void operator delete[](void* ptr, size_t) noexcept { ::operator delete[](ptr); }
+_LIBCPP_WEAK void operator delete[](void* ptr, size_t) noexcept { ::operator delete[](ptr); }
 
 #if !defined(_LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION)
 
@@ -127,70 +156,89 @@ static void* operator_new_aligned_impl(std::size_t size, std::align_val_t alignm
   return p;
 }
 
-_LIBCPP_WEAK
-void* operator new(std::size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
+_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
+operator new(std::size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
   void* p = operator_new_aligned_impl(size, alignment);
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   if (p == nullptr)
-    throw std::bad_alloc();
-#  endif
+    __throw_bad_alloc_shim();
   return p;
 }
 
-_LIBCPP_WEAK
-void* operator new(size_t size, std::align_val_t alignment, const std::nothrow_t&) noexcept {
+_LIBCPP_WEAK void* operator new(size_t size, std::align_val_t alignment, const std::nothrow_t&) noexcept {
+#  ifdef _LIBCPP_HAS_NO_EXCEPTIONS
+#    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
+  _LIBCPP_ASSERT_SHIM(
+      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new)),
+      "libc++ was configured with exceptions disabled and `operator new(size_t, align_val_t)` has been overridden, "
+      "but `operator new(size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
+      "`operator new(size_t, align_val_t, nothrow_t)` must call `operator new(size_t, align_val_t)`, which will "
+      "terminate in case it fails to allocate, making it impossible for `operator new(size_t, align_val_t, nothrow_t)` "
+      "to fulfill its contract (since it should return nullptr upon failure). Please make sure you override "
+      "`operator new(size_t, align_val_t, nothrow_t)` as well.");
+#    endif
+
+  return operator_new_aligned_impl(size, alignment);
+#  else
   void* p = nullptr;
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   try {
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
     p = ::operator new(size, alignment);
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   } catch (...) {
   }
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
   return p;
+#  endif
 }
 
-_LIBCPP_WEAK
-void* operator new[](size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
+_LIBCPP_MAKE_OVERRIDABLE_FUNCTION_DETECTABLE _LIBCPP_WEAK void*
+operator new[](size_t size, std::align_val_t alignment) _THROW_BAD_ALLOC {
   return ::operator new(size, alignment);
 }
 
-_LIBCPP_WEAK
-void* operator new[](size_t size, std::align_val_t alignment, const std::nothrow_t&) noexcept {
+_LIBCPP_WEAK void* operator new[](size_t size, std::align_val_t alignment, const std::nothrow_t&) noexcept {
+#  ifdef _LIBCPP_HAS_NO_EXCEPTIONS
+#    if _LIBCPP_CAN_DETECT_OVERRIDDEN_FUNCTION
+  _LIBCPP_ASSERT_SHIM(
+      !std::__is_function_overridden(static_cast<void* (*)(std::size_t, std::align_val_t)>(&operator new[])),
+      "libc++ was configured with exceptions disabled and `operator new[](size_t, align_val_t)` has been overridden, "
+      "but `operator new[](size_t, align_val_t, nothrow_t)` has not been overridden. This is problematic because "
+      "`operator new[](size_t, align_val_t, nothrow_t)` must call `operator new[](size_t, align_val_t)`, which will "
+      "terminate in case it fails to allocate, making it impossible for `operator new[](size_t, align_val_t, "
+      "nothrow_t)` to fulfill its contract (since it should return nullptr upon failure). Please make sure you "
+      "override "
+      "`operator new[](size_t, align_val_t, nothrow_t)` as well.");
+#    endif
+
+  return operator_new_aligned_impl(size, alignment);
+#  else
   void* p = nullptr;
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   try {
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
     p = ::operator new[](size, alignment);
-#  ifndef _LIBCPP_HAS_NO_EXCEPTIONS
   } catch (...) {
   }
-#  endif // _LIBCPP_HAS_NO_EXCEPTIONS
   return p;
+#  endif
 }
 
-_LIBCPP_WEAK
-void operator delete(void* ptr, std::align_val_t) noexcept { std::__libcpp_aligned_free(ptr); }
+_LIBCPP_WEAK void operator delete(void* ptr, std::align_val_t) noexcept { std::__libcpp_aligned_free(ptr); }
 
-_LIBCPP_WEAK
-void operator delete(void* ptr, std::align_val_t alignment, const std::nothrow_t&) noexcept {
+_LIBCPP_WEAK void operator delete(void* ptr, std::align_val_t alignment, const std::nothrow_t&) noexcept {
   ::operator delete(ptr, alignment);
 }
 
-_LIBCPP_WEAK
-void operator delete(void* ptr, size_t, std::align_val_t alignment) noexcept { ::operator delete(ptr, alignment); }
+_LIBCPP_WEAK void operator delete(void* ptr, size_t, std::align_val_t alignment) noexcept {
+  ::operator delete(ptr, alignment);
+}
 
-_LIBCPP_WEAK
-void operator delete[](void* ptr, std::align_val_t alignment) noexcept { ::operator delete(ptr, alignment); }
+_LIBCPP_WEAK void operator delete[](void* ptr, std::align_val_t alignment) noexcept {
  ::operator delete(ptr, alignment);
+}
 
-_LIBCPP_WEAK
-void operator delete[](void* ptr, std::align_val_t alignment, const std::nothrow_t&) noexcept {
+_LIBCPP_WEAK void operator delete[](void* ptr, std::align_val_t alignment, const std::nothrow_t&) noexcept {
   ::operator delete[](ptr, alignment);
 }
 
-_LIBCPP_WEAK
-void operator delete[](void* ptr, size_t, std::align_val_t alignment) noexcept { ::operator delete[](ptr, alignment); }
+_LIBCPP_WEAK void operator delete[](void* ptr, size_t, std::align_val_t alignment) noexcept {
  ::operator delete[](ptr, alignment);
+}
 
 #endif // !_LIBCPP_HAS_NO_LIBRARY_ALIGNED_ALLOCATION
 // ------------------ END COPY ------------------
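The assertion messages above all describe the same dilemma; schematically (a minimal sketch, not the shipped implementation), the nothrow form can only be derived from the throwing form when exceptions are available:

#include <cstddef>
#include <cstdlib>
#include <new>

void* nothrow_new_sketch(std::size_t size) noexcept {
#if defined(__cpp_exceptions)
  // Exceptions available: route through the (possibly user-overridden)
  // throwing form and translate failure into nullptr.
  try {
    return ::operator new(size);
  } catch (...) {
    return nullptr;
  }
#else
  // -fno-exceptions: failure of ::operator new cannot be observed, so the
  // library must allocate directly -- and a user override of the throwing
  // form is silently bypassed, which is what the assertion diagnoses.
  return std::malloc(size);
#endif
}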
From 888f2a58a9d441e29a6769ff0d96ec5306262336 Mon Sep 17 00:00:00 2001
From: Louis Dionne
Date: Mon, 22 Jan 2024 22:34:03 -0500
Subject: [PATCH 542/843] [libc++][NFC] Fix formatting in check_assertion.h

---
 libcxx/test/support/check_assertion.h | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h
index c1d0bdcb33ec6..cc841ceb237d8 100644
--- a/libcxx/test/support/check_assertion.h
+++ b/libcxx/test/support/check_assertion.h
@@ -135,7 +135,7 @@ std::string ToString(std::array<DeathCause, N> const& causes) {
   ss << "{";
   for (std::size_t i = 0; i != N; ++i) {
     ss << ToString(causes[i]);
-    if (i+1 != N)
+    if (i + 1 != N)
       ss << ", ";
   }
   ss << "}";
@@ -209,8 +209,8 @@ class DeathTest {
 
     if (std::find(expected_causes.begin(), expected_causes.end(), cause) == expected_causes.end()) {
       std::stringstream failure_description;
-      failure_description                                               //
-          << "Child died, but with a different death cause\n"           //
+      failure_description                                                 //
+          << "Child died, but with a different death cause\n"             //
          << "Expected cause(s): " << ToString(expected_causes) << "\n"    //
          << "Actual cause:      " << ToString(cause) << "\n";
       return DeathTestResult(Outcome::UnexpectedCause, cause, failure_description.str());
@@ -349,7 +349,8 @@ void std::__libcpp_verbose_abort(char const* format, ...) {
 #endif // _LIBCPP_VERSION
 
 template <std::size_t N, class Func>
-bool ExpectDeath(const std::array<DeathCause, N>& expected_causes, const char* stmt, Func&& func, const Matcher& matcher) {
+bool ExpectDeath(
+    const std::array<DeathCause, N>& expected_causes, const char* stmt, Func&& func, const Matcher& matcher) {
   for (auto cause : expected_causes)
     assert(IsValidCause(cause));

From f3dd8f10c77f29ff2563482c96c33f2c2bbc847c Mon Sep 17 00:00:00 2001
From: Yitzhak Mandelbaum
Date: Mon, 22 Jan 2024 22:41:48 -0500
Subject: [PATCH 543/843] [clang][dataflow] Make cap on block visits
 configurable by caller. (#77481)

Previously, we hard-coded the cap on block visits inside the framework. This
patch enables the caller to specify the cap in the APIs for running an
analysis.
---
 .../Analysis/FlowSensitive/DataflowAnalysis.h | 21 +++++++++++---
 .../TypeErasedDataflowAnalysis.h              |  9 +++++-
 .../TypeErasedDataflowAnalysis.cpp            | 28 ++++++++-----------
 .../Analysis/FlowSensitive/TestingSupport.cpp |  2 +-
 .../Analysis/FlowSensitive/TestingSupport.h   | 13 +++++++--
 .../Analysis/FlowSensitive/TransferTest.cpp   |  3 +-
 .../TypeErasedDataflowAnalysisTest.cpp        |  2 +-
 7 files changed, 49 insertions(+), 29 deletions(-)

diff --git a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
index 1dffbe8a09c36..b95095d2184c0 100644
--- a/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
+++ b/clang/include/clang/Analysis/FlowSensitive/DataflowAnalysis.h
@@ -186,6 +186,12 @@ template <typename LatticeT> struct DataflowAnalysisState {
 /// the dataflow analysis cannot be performed successfully. Otherwise, calls
 /// `PostVisitCFG` on each CFG element with the final analysis results at that
 /// program point.
+///
+/// `MaxBlockVisits` caps the number of block visits during analysis. See
+/// `runTypeErasedDataflowAnalysis` for a full description. The default value is
+/// essentially arbitrary -- large enough to accommodate what seems like any
+/// reasonable CFG, but still small enough to limit the cost of hitting the
+/// limit.
 template <typename AnalysisT>
 llvm::Expected<std::vector<
     std::optional<DataflowAnalysisState<typename AnalysisT::Lattice>>>>
 runDataflowAnalysis(
@@ -194,7 +200,8 @@ runDataflowAnalysis(
     const Environment &InitEnv,
     std::function<void(const CFGElement &, const DataflowAnalysisState<
                                                typename AnalysisT::Lattice> &)>
-        PostVisitCFG = nullptr) {
+        PostVisitCFG = nullptr,
+    std::int32_t MaxBlockVisits = 20'000) {
   std::function<void(const CFGElement &,
                      const TypeErasedDataflowAnalysisState &)>
       PostVisitCFGClosure = nullptr;
@@ -212,7 +219,7 @@ runDataflowAnalysis(
   }
 
   auto TypeErasedBlockStates = runTypeErasedDataflowAnalysis(
-      CFCtx, Analysis, InitEnv, PostVisitCFGClosure);
+      CFCtx, Analysis, InitEnv, PostVisitCFGClosure, MaxBlockVisits);
   if (!TypeErasedBlockStates)
     return TypeErasedBlockStates.takeError();
 
@@ -261,6 +268,10 @@ auto createAnalysis(ASTContext &ASTCtx, Environment &Env)
 /// iterations.
 /// - This limit is still low enough to keep runtimes acceptable (on typical
 ///   machines) in cases where we hit the limit.
+///
+/// `MaxBlockVisits` caps the number of block visits during analysis. See
+/// `runDataflowAnalysis` for a full description and explanation of the default
+/// value.
 template <typename AnalysisT, typename Diagnostic>
 llvm::Expected<llvm::SmallVector<Diagnostic>> diagnoseFunction(
     const FunctionDecl &FuncDecl, ASTContext &ASTCtx,
     llvm::function_ref<llvm::SmallVector<Diagnostic>(
         const CFGElement &, ASTContext &,
         const TransferStateForDiagnostics<typename AnalysisT::Lattice> &)>
         Diagnoser,
-    std::int64_t MaxSATIterations = 1'000'000'000) {
+    std::int64_t MaxSATIterations = 1'000'000'000,
+    std::int32_t MaxBlockVisits = 20'000) {
   llvm::Expected<ControlFlowContext> Context =
       ControlFlowContext::build(FuncDecl);
   if (!Context)
@@ -293,7 +305,8 @@ llvm::Expected<llvm::SmallVector<Diagnostic>> diagnoseFunction(
                     State.Lattice.Value),
                 State.Env));
             llvm::move(EltDiagnostics, std::back_inserter(Diagnostics));
-          })
+          },
+          MaxBlockVisits)
           .takeError())
     return std::move(Err);

diff --git a/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h b/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h
index 67c323dbf45e1..a0ca7440230b0 100644
--- a/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h
+++ b/clang/include/clang/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.h
@@ -138,13 +138,20 @@ struct TypeErasedDataflowAnalysisState {
 /// dataflow analysis cannot be performed successfully. Otherwise, calls
 /// `PostVisitCFG` on each CFG element with the final analysis results at that
 /// program point.
+///
+/// `MaxBlockVisits` caps the number of block visits during analysis. It doesn't
+/// distinguish between repeat visits to the same block and visits to distinct
+/// blocks. This parameter is a backstop to prevent infinite loops, in the case
+/// of bugs in the lattice and/or transfer functions that prevent the analysis
+/// from converging.
 llvm::Expected<std::vector<std::optional<TypeErasedDataflowAnalysisState>>>
 runTypeErasedDataflowAnalysis(
     const ControlFlowContext &CFCtx, TypeErasedDataflowAnalysis &Analysis,
     const Environment &InitEnv,
     std::function<void(const CFGElement &,
                        const TypeErasedDataflowAnalysisState &)>
-        PostVisitCFG = nullptr);
+        PostVisitCFG,
+    std::int32_t MaxBlockVisits);
 
 } // namespace dataflow
 } // namespace clang
diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
index faf83a8920d4e..f1899590f4125 100644
--- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
+++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp
@@ -498,7 +498,8 @@ runTypeErasedDataflowAnalysis(
     const Environment &InitEnv,
     std::function<void(const CFGElement &,
                        const TypeErasedDataflowAnalysisState &)>
-        PostVisitCFG) {
+        PostVisitCFG,
+    std::int32_t MaxBlockVisits) {
   PrettyStackTraceAnalysis CrashInfo(CFCtx, "runTypeErasedDataflowAnalysis");
 
   std::optional<Environment> MaybeStartingEnv;
@@ -524,27 +525,20 @@ runTypeErasedDataflowAnalysis(
 
   AnalysisContext AC(CFCtx, Analysis, StartingEnv, BlockStates);
 
-  // Bugs in lattices and transfer functions can prevent the analysis from
-  // converging. To limit the damage (infinite loops) that these bugs can cause,
-  // limit the number of iterations.
-  // FIXME: Consider making the maximum number of iterations configurable.
-  // FIXME: Consider restricting the number of backedges followed, rather than
-  // iterations.
-  // FIXME: Set up statistics (see llvm/ADT/Statistic.h) to count average number
-  // of iterations, number of functions that time out, etc.
-  static constexpr uint32_t MaxAverageVisitsPerBlock = 4;
-  static constexpr uint32_t AbsoluteMaxIterations = 1 << 16;
-  const uint32_t RelativeMaxIterations =
+  // FIXME: remove relative cap. There isn't really any good setting for
+  // `MaxAverageVisitsPerBlock`, so it has no clear value over using
+  // `MaxBlockVisits` directly.
+  static constexpr std::int32_t MaxAverageVisitsPerBlock = 4;
+  const std::int32_t RelativeMaxBlockVisits =
       MaxAverageVisitsPerBlock * BlockStates.size();
-  const uint32_t MaxIterations =
-      std::min(RelativeMaxIterations, AbsoluteMaxIterations);
-  uint32_t Iterations = 0;
+  MaxBlockVisits = std::min(RelativeMaxBlockVisits, MaxBlockVisits);
+  std::int32_t BlockVisits = 0;
   while (const CFGBlock *Block = Worklist.dequeue()) {
     LLVM_DEBUG(llvm::dbgs()
                << "Processing Block " << Block->getBlockID() << "\n");
-    if (++Iterations > MaxIterations) {
+    if (++BlockVisits > MaxBlockVisits) {
       return llvm::createStringError(std::errc::timed_out,
-                                     "maximum number of iterations reached");
+                                     "maximum number of blocks processed");
     }
 
     const std::optional<TypeErasedDataflowAnalysisState> &OldBlockState =
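As a usage sketch of the new knob (a fragment, not part of the patch; `CFCtx`, `MyAnalysis`, and `InitEnv` stand in for an existing analysis setup built with the APIs above):

auto BlockStates = clang::dataflow::runDataflowAnalysis(
    CFCtx, MyAnalysis, InitEnv,
    /*PostVisitCFG=*/nullptr,
    /*MaxBlockVisits=*/5'000); // tighter than the 20'000 default
if (!BlockStates)
  // A non-converging analysis now fails with "maximum number of blocks processed".
  llvm::logAllUnhandledErrors(BlockStates.takeError(), llvm::errs());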
diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp b/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp
index 3726f56fc824d..09f5524e152c9 100644
--- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.cpp
@@ -4,6 +4,7 @@
 #include "clang/AST/Stmt.h"
 #include "clang/ASTMatchers/ASTMatchFinder.h"
 #include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Analysis/FlowSensitive/NoopAnalysis.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Basic/LangOptions.h"
 #include "clang/Basic/SourceLocation.h"
@@ -18,7 +19,6 @@
 #include "gtest/gtest.h"
 #include <cassert>
 #include <functional>
-#include <memory>
 #include <string>
 #include <utility>
 #include <vector>
diff --git a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
index 95ffcbd6f322e..0d36d2802897f 100644
--- a/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
+++ b/clang/unittests/Analysis/FlowSensitive/TestingSupport.h
@@ -33,7 +33,7 @@
 #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
 #include "clang/Analysis/FlowSensitive/MatchSwitch.h"
-#include "clang/Analysis/FlowSensitive/NoopAnalysis.h"
+#include "clang/Analysis/FlowSensitive/NoopLattice.h"
 #include "clang/Analysis/FlowSensitive/WatchedLiteralsSolver.h"
 #include "clang/Basic/LLVM.h"
 #include "clang/Serialization/PCHContainerOperations.h"
@@ -62,6 +62,11 @@ std::ostream &operator<<(std::ostream &OS,
 
 namespace test {
 
+// Caps the number of block visits in any individual analysis. Given that test
+// code is typically quite small, we set a low number to help catch any problems
+// early. But, the choice is arbitrary.
+constexpr std::int32_t MaxBlockVisitsInAnalysis = 2'000;
+
 /// Returns the environment at the program point marked with `Annotation` from
 /// the mapping of annotated program points to analysis state.
 ///
@@ -277,8 +282,10 @@ checkDataflow(AnalysisInputs<AnalysisT> AI,
   // If successful, the dataflow analysis returns a mapping from block IDs to
   // the post-analysis states for the CFG blocks that have been evaluated.
   llvm::Expected<std::vector<std::optional<TypeErasedDataflowAnalysisState>>>
-      MaybeBlockStates = runTypeErasedDataflowAnalysis(
-          CFCtx, Analysis, InitEnv, TypeErasedPostVisitCFG);
+      MaybeBlockStates =
+          runTypeErasedDataflowAnalysis(CFCtx, Analysis, InitEnv,
+                                        TypeErasedPostVisitCFG,
+                                        MaxBlockVisitsInAnalysis);
   if (!MaybeBlockStates)
     return MaybeBlockStates.takeError();
   AO.BlockStates = *MaybeBlockStates;
diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
index 85ae24f0b6f16..c777aca78992d 100644
--- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp
@@ -12,14 +12,13 @@
 #include "clang/ASTMatchers/ASTMatchers.h"
 #include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
 #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
+#include "clang/Analysis/FlowSensitive/NoopAnalysis.h"
 #include "clang/Analysis/FlowSensitive/RecordOps.h"
 #include "clang/Analysis/FlowSensitive/StorageLocation.h"
 #include "clang/Analysis/FlowSensitive/Value.h"
 #include "clang/Basic/LangStandard.h"
-#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/Support/Casting.h"
 #include "llvm/Testing/Support/Error.h"
 #include "gmock/gmock.h"
 #include "gtest/gtest.h"
diff --git a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
index fe5ba5ab5426f..466d33358fd38 100644
--- a/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
+++ b/clang/unittests/Analysis/FlowSensitive/TypeErasedDataflowAnalysisTest.cpp
@@ -263,7 +263,7 @@ TEST_F(DataflowAnalysisTest, NonConvergingAnalysis) {
   auto Res = runAnalysis<NonConvergingAnalysis>(
       Code, [](ASTContext &C) { return NonConvergingAnalysis(C); });
   EXPECT_EQ(llvm::toString(Res.takeError()),
-            "maximum number of iterations reached");
+            "maximum number of blocks processed");
 }
 
 // Regression test for joins of bool-typed lvalue expressions. The first loop

From 904b0901ef2d33e6232b7a51fa2525c30fd117e8 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Mon, 22 Jan 2024 20:17:36 -0800
Subject: [PATCH 544/843] [RISCV] Add FeatureFastUnalignedAccess to
 sifive-p450.
(#79075) --- clang/test/Preprocessor/riscv-target-features.c | 2 ++ llvm/lib/Target/RISCV/RISCVProcessors.td | 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/clang/test/Preprocessor/riscv-target-features.c b/clang/test/Preprocessor/riscv-target-features.c index 5fde5ccdbeacf..39d2c66f14b23 100644 --- a/clang/test/Preprocessor/riscv-target-features.c +++ b/clang/test/Preprocessor/riscv-target-features.c @@ -1400,4 +1400,6 @@ // RUN: -munaligned-access -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST // RUN: %clang --target=riscv64-unknown-linux-gnu -march=rv64i -E -dM %s \ // RUN: -munaligned-access -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST +// RUN: %clang --target=riscv64-unknown-linux-gnu -mcpu=sifive-p450 -E -dM %s \ +// RUN: -o - | FileCheck %s --check-prefix=CHECK-MISALIGNED-FAST // CHECK-MISALIGNED-FAST: __riscv_misaligned_fast 1 diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td index db60a5a1dab00..af621de9f802a 100644 --- a/llvm/lib/Target/RISCV/RISCVProcessors.td +++ b/llvm/lib/Target/RISCV/RISCVProcessors.td @@ -238,7 +238,8 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model, FeatureStdExtZba, FeatureStdExtZbb, FeatureStdExtZbs, - FeatureStdExtZfhmin], + FeatureStdExtZfhmin, + FeatureFastUnalignedAccess], [TuneNoDefaultUnroll, TuneConditionalCompressedMoveFusion, TuneLUIADDIFusion, From 35ab0c78cf5f2294e251a6fb1e0d6e58dc80d955 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 23 Jan 2024 10:48:10 +0700 Subject: [PATCH 545/843] ValueTracking: Add tests fcmpToClassTest for fcmp true/false --- .../Attributor/nofpclass-implied-by-fcmp.ll | 286 ++++++++++++++++++ 1 file changed, 286 insertions(+) diff --git a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll index ea594398c5801..8c7d58bf43d6c 100644 --- a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll +++ b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll @@ -2395,5 +2395,291 @@ define float @assume_oeq_smallest_normal_known_pos(float nofpclass(ninf nsub nno ret float %arg } +;--------------------------------------------------------------------- +; fcmp false +;--------------------------------------------------------------------- + +define float @assume_fabs_false_pinf(float %arg) { +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_false_pinf( +; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fabs = call float @llvm.fabs.f32(float %arg) + %fcmp = fcmp false float %fabs, 0x7FF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_fabs_false_ninf(float %arg) { +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_false_ninf( +; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fabs = call float @llvm.fabs.f32(float %arg) + %fcmp = fcmp false float %fabs, 0xFFF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_false_pinf(float %arg) { +; CHECK-LABEL: define float @assume_false_pinf( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] +; CHECK-NEXT: 
ret float [[ARG]] +; + %fcmp = fcmp false float %arg, 0x7FF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_false_ninf(float %arg) { +; CHECK-LABEL: define float @assume_false_ninf( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp false float %arg, 0xFFF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @clamp_false_pinf_0.0(float %arg) { +; CHECK-LABEL: define float @clamp_false_pinf_0.0( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp false float %arg, 0x7FF0000000000000 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + +define float @clamp_false_ninf_0.0(float %arg) { +; CHECK-LABEL: define float @clamp_false_ninf_0.0( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp false float %arg, 0xFFF0000000000000 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + +define float @clamp_false_smallest_normal_0.0(float %arg) { +; CHECK-LABEL: define float @clamp_false_smallest_normal_0.0( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp false float %arg, 0x3810000000000000 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + +define float @assume_false_p0(float %arg) { +; CHECK-LABEL: define float @assume_false_p0( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp false float %arg, 0.0 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_false_n0(float %arg) { +; CHECK-LABEL: define float @assume_false_n0( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp false float %arg, -0.0 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_false_smallest_normal(float %arg) { +; CHECK-LABEL: define float @assume_false_smallest_normal( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp false float %arg, 0x3810000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @clamp_false_nan(float %arg) { +; CHECK-LABEL: define float @clamp_false_nan( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp false float %arg, 0x7FF8000000000000 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + +define float @clamp_false_p0(float %arg) { +; CHECK-LABEL: define float @clamp_false_p0( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp false float %arg, 0.0 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + +define float @clamp_false_n0(float %arg) { +; CHECK-LABEL: define float @clamp_false_n0( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp false float %arg, -0.0 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + +;--------------------------------------------------------------------- +; fcmp true 
+;--------------------------------------------------------------------- + +define float @assume_fabs_true_pinf(float %arg) { +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_true_pinf( +; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fabs = call float @llvm.fabs.f32(float %arg) + %fcmp = fcmp true float %fabs, 0x7FF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_fabs_true_ninf(float %arg) { +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_true_ninf( +; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fabs = call float @llvm.fabs.f32(float %arg) + %fcmp = fcmp true float %fabs, 0xFFF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_true_pinf(float %arg) { +; CHECK-LABEL: define float @assume_true_pinf( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp true float %arg, 0x7FF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_true_ninf(float %arg) { +; CHECK-LABEL: define float @assume_true_ninf( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp true float %arg, 0xFFF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_true_p0(float %arg) { +; CHECK-LABEL: define float @assume_true_p0( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp true float %arg, 0.0 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_true_n0(float %arg) { +; CHECK-LABEL: define float @assume_true_n0( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp true float %arg, -0.0 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_true_smallest_normal(float %arg) { +; CHECK-LABEL: define float @assume_true_smallest_normal( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp true float %arg, 0x3810000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @clamp_true_pinf_0.0(float %arg) { +; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_pinf_0.0( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float 0.000000e+00 +; + %fcmp = fcmp true float %arg, 0x7FF0000000000000 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + +define float @clamp_true_ninf_0.0(float %arg) { +; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_ninf_0.0( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float 0.000000e+00 +; + %fcmp = fcmp true float %arg, 0xFFF0000000000000 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + +define 
float @clamp_true_smallest_normal_0.0(float %arg) { +; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_smallest_normal_0.0( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float 0.000000e+00 +; + %fcmp = fcmp true float %arg, 0x3810000000000000 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + +define float @clamp_true_nan(float %arg) { +; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_nan( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float 0.000000e+00 +; + %fcmp = fcmp true float %arg, 0x7FF8000000000000 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + +define float @clamp_true_p0(float %arg) { +; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_p0( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float 0.000000e+00 +; + %fcmp = fcmp true float %arg, 0.0 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + +define float @clamp_true_n0(float %arg) { +; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_n0( +; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-NEXT: ret float 0.000000e+00 +; + %fcmp = fcmp true float %arg, -0.0 + %select = select i1 %fcmp, float 0.0, float %arg + ret float %select +} + ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; TUNIT: {{.*}} From 1a99df9f3dfdfd6ff3a3d46113c9e0e9d55ad892 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 23 Jan 2024 11:52:18 +0700 Subject: [PATCH 546/843] ValueTracking: Add tests for fcmpToClassTest for fcmp ole/ugt inf This catches an assertion in a recommit of dc3faf0ed0e3f1ea9e435a006167d9649f865da1 --- .../Attributor/nofpclass-implied-by-fcmp.ll | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) diff --git a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll index 8c7d58bf43d6c..3984dd9a25c8e 100644 --- a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll +++ b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll @@ -2395,6 +2395,110 @@ define float @assume_oeq_smallest_normal_known_pos(float nofpclass(ninf nsub nno ret float %arg } +;--------------------------------------------------------------------- +; compare to inf +;--------------------------------------------------------------------- + +define float @assume_ole_pinf(float %arg) { +; CHECK-LABEL: define float @assume_ole_pinf( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[FCMP:%.*]] = fcmp ole float [[ARG]], 0x7FF0000000000000 +; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp ole float %arg, 0x7FF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_ole_ninf(float %arg) { +; CHECK-LABEL: define float @assume_ole_ninf( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[FCMP:%.*]] = fcmp ole float [[ARG]], 0xFFF0000000000000 +; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp ole float %arg, 0xFFF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_ugt_pinf(float %arg) { +; CHECK-LABEL: define float @assume_ugt_pinf( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[FCMP:%.*]] = 
fcmp ugt float [[ARG]], 0x7FF0000000000000 +; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp ugt float %arg, 0x7FF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_ugt_ninf(float %arg) { +; CHECK-LABEL: define float @assume_ugt_ninf( +; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[FCMP:%.*]] = fcmp ugt float [[ARG]], 0xFFF0000000000000 +; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fcmp = fcmp ugt float %arg, 0xFFF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_fabs_ole_pinf(float %arg) { +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_ole_pinf( +; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]] +; CHECK-NEXT: [[FCMP:%.*]] = fcmp ole float [[FABS]], 0x7FF0000000000000 +; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fabs = call float @llvm.fabs.f32(float %arg) + %fcmp = fcmp ole float %fabs, 0x7FF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_fabs_ole_ninf(float %arg) { +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_ole_ninf( +; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fabs = call float @llvm.fabs.f32(float %arg) + %fcmp = fcmp ole float %fabs, 0xFFF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_fabs_ugt_pinf(float %arg) { +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_ugt_pinf( +; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]] +; CHECK-NEXT: [[FCMP:%.*]] = fcmp ugt float [[FABS]], 0x7FF0000000000000 +; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fabs = call float @llvm.fabs.f32(float %arg) + %fcmp = fcmp ugt float %fabs, 0x7FF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + +define float @assume_fabs_ugt_ninf(float %arg) { +; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_ugt_ninf( +; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-NEXT: call void @llvm.assume(i1 noundef true) #[[ATTR5]] +; CHECK-NEXT: ret float [[ARG]] +; + %fabs = call float @llvm.fabs.f32(float %arg) + %fcmp = fcmp ugt float %fabs, 0xFFF0000000000000 + call void @llvm.assume(i1 %fcmp) + ret float %arg +} + ;--------------------------------------------------------------------- ; fcmp false ;--------------------------------------------------------------------- From 8076b896955e91cc2d16283cf8206f3354702022 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 23 Jan 2024 11:43:24 +0700 Subject: [PATCH 547/843] ValueTracking: Handle fcmp true/false in fcmpToClassTest This ensures full compare coverage for certain special constants. 
--- llvm/lib/Analysis/ValueTracking.cpp | 6 ++++ .../Attributor/nofpclass-implied-by-fcmp.ll | 32 +++++++++---------- 2 files changed, 22 insertions(+), 16 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 940ae9eb7ee29..34d5010320988 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4019,6 +4019,12 @@ llvm::fcmpToClassTest(FCmpInst::Predicate Pred, const Function &F, Value *LHS, if (Pred == FCmpInst::FCMP_UNO && !ConstRHS->isNaN()) return {LHS, fcNan}; + if (Pred == FCmpInst::FCMP_TRUE) + return {LHS, fcAllFlags}; + + if (Pred == FCmpInst::FCMP_FALSE) + return {LHS, fcNone}; + if (ConstRHS->isZero()) { // Compares with fcNone are only exactly equal to fcZero if input denormals // are not flushed. diff --git a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll index 3984dd9a25c8e..ccd01de458c3c 100644 --- a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll +++ b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll @@ -2528,8 +2528,8 @@ define float @assume_fabs_false_ninf(float %arg) { } define float @assume_false_pinf(float %arg) { -; CHECK-LABEL: define float @assume_false_pinf( -; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(all) float @assume_false_pinf( +; CHECK-SAME: float returned nofpclass(all) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] ; CHECK-NEXT: ret float [[ARG]] ; @@ -2539,8 +2539,8 @@ define float @assume_false_pinf(float %arg) { } define float @assume_false_ninf(float %arg) { -; CHECK-LABEL: define float @assume_false_ninf( -; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(all) float @assume_false_ninf( +; CHECK-SAME: float returned nofpclass(all) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] ; CHECK-NEXT: ret float [[ARG]] ; @@ -2580,8 +2580,8 @@ define float @clamp_false_smallest_normal_0.0(float %arg) { } define float @assume_false_p0(float %arg) { -; CHECK-LABEL: define float @assume_false_p0( -; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(all) float @assume_false_p0( +; CHECK-SAME: float returned nofpclass(all) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] ; CHECK-NEXT: ret float [[ARG]] ; @@ -2591,8 +2591,8 @@ define float @assume_false_p0(float %arg) { } define float @assume_false_n0(float %arg) { -; CHECK-LABEL: define float @assume_false_n0( -; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(all) float @assume_false_n0( +; CHECK-SAME: float returned nofpclass(all) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] ; CHECK-NEXT: ret float [[ARG]] ; @@ -2602,8 +2602,8 @@ define float @assume_false_n0(float %arg) { } define float @assume_false_smallest_normal(float %arg) { -; CHECK-LABEL: define float @assume_false_smallest_normal( -; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(all) float @assume_false_smallest_normal( +; CHECK-SAME: float returned nofpclass(all) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] ; CHECK-NEXT: ret float [[ARG]] ; @@ -2727,7 +2727,7 @@ define float @assume_true_smallest_normal(float %arg) { define float @clamp_true_pinf_0.0(float 
%arg) { ; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_pinf_0.0( -; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-SAME: float nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %fcmp = fcmp true float %arg, 0x7FF0000000000000 @@ -2737,7 +2737,7 @@ define float @clamp_true_pinf_0.0(float %arg) { define float @clamp_true_ninf_0.0(float %arg) { ; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_ninf_0.0( -; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-SAME: float nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %fcmp = fcmp true float %arg, 0xFFF0000000000000 @@ -2747,7 +2747,7 @@ define float @clamp_true_ninf_0.0(float %arg) { define float @clamp_true_smallest_normal_0.0(float %arg) { ; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_smallest_normal_0.0( -; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-SAME: float nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %fcmp = fcmp true float %arg, 0x3810000000000000 @@ -2757,7 +2757,7 @@ define float @clamp_true_smallest_normal_0.0(float %arg) { define float @clamp_true_nan(float %arg) { ; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_nan( -; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-SAME: float nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %fcmp = fcmp true float %arg, 0x7FF8000000000000 @@ -2767,7 +2767,7 @@ define float @clamp_true_nan(float %arg) { define float @clamp_true_p0(float %arg) { ; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_p0( -; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-SAME: float nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %fcmp = fcmp true float %arg, 0.0 @@ -2777,7 +2777,7 @@ define float @clamp_true_p0(float %arg) { define float @clamp_true_n0(float %arg) { ; CHECK-LABEL: define noundef nofpclass(nan inf nzero sub norm) float @clamp_true_n0( -; CHECK-SAME: float [[ARG:%.*]]) #[[ATTR2]] { +; CHECK-SAME: float nofpclass(nan inf nzero sub norm) [[ARG:%.*]]) #[[ATTR2]] { ; CHECK-NEXT: ret float 0.000000e+00 ; %fcmp = fcmp true float %arg, -0.0 From 28f9041879620b44505d54670f5ccb2ed48b24d9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 22 Jan 2024 21:19:11 -0800 Subject: [PATCH 548/843] [DebugInfo] Use DenseMap::lookup (NFC) --- llvm/lib/DebugInfo/DWARF/DWARFContext.cpp | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp index e6df590b8dd7a..792df53d304aa 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -1353,11 +1353,7 @@ DWARFTypeUnit *DWARFContext::getTypeUnitForHash(uint16_t Version, uint64_t Hash, DWOUnits.getUnitForIndexEntry(*R)); return nullptr; } - const DenseMap &Map = State->getTypeUnitMap(IsDWO); - auto Iter = Map.find(Hash); - if (Iter != Map.end()) - return Iter->second; - return nullptr; + return State->getTypeUnitMap(IsDWO).lookup(Hash); } DWARFCompileUnit *DWARFContext::getDWOCompileUnitForHash(uint64_t Hash) { From a0b459df256e8d16cade7f38a6fdaf2e763c847f Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 22 Jan 2024 21:19:13 -0800 Subject: [PATCH 549/843] [IR] Use 
StringRef::consume_front (NFC) --- llvm/include/llvm/IR/GlobalValue.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/include/llvm/IR/GlobalValue.h b/llvm/include/llvm/IR/GlobalValue.h index e97a7f2b96360..aa8188cd99fee 100644 --- a/llvm/include/llvm/IR/GlobalValue.h +++ b/llvm/include/llvm/IR/GlobalValue.h @@ -564,8 +564,7 @@ class GlobalValue : public Constant { /// arbitrary GlobalValue, this is not the function you're looking for; see /// Mangler.h. static StringRef dropLLVMManglingEscape(StringRef Name) { - if (!Name.empty() && Name[0] == '\1') - return Name.substr(1); + Name.consume_front("\1"); return Name; } From 47c76e7aba0cf823f7fa883e0d374ad1a160067a Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 22 Jan 2024 21:19:14 -0800 Subject: [PATCH 550/843] [SPIRV] Use llvm::find (NFC) --- llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp index 7222a93357b79..370da046984f9 100644 --- a/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVModuleAnalysis.cpp @@ -410,7 +410,7 @@ void SPIRV::RequirementHandler::pruneCapabilities( const CapabilityList &ToPrune) { for (const auto &Cap : ToPrune) { AllCaps.insert(Cap); - auto FoundIndex = std::find(MinimalCaps.begin(), MinimalCaps.end(), Cap); + auto FoundIndex = llvm::find(MinimalCaps, Cap); if (FoundIndex != MinimalCaps.end()) MinimalCaps.erase(FoundIndex); CapabilityList ImplicitDecls = From 8c3304453c22ad1b5a914e64a7f6435f58f4099c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Mon, 22 Jan 2024 21:19:16 -0800 Subject: [PATCH 551/843] [llvm-diff] Use llvm::predecessors (NFC) --- llvm/tools/llvm-diff/lib/DifferenceEngine.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/tools/llvm-diff/lib/DifferenceEngine.cpp b/llvm/tools/llvm-diff/lib/DifferenceEngine.cpp index 0d4d82e5cadd3..64b5051af1489 100644 --- a/llvm/tools/llvm-diff/lib/DifferenceEngine.cpp +++ b/llvm/tools/llvm-diff/lib/DifferenceEngine.cpp @@ -214,11 +214,9 @@ class FunctionDifferenceEngine { }; unsigned getUnprocPredCount(const BasicBlock *Block) const { - unsigned Count = 0; - for (const_pred_iterator I = pred_begin(Block), E = pred_end(Block); I != E; - ++I) - if (!Blocks.count(*I)) Count++; - return Count; + return llvm::count_if(predecessors(Block), [&](const BasicBlock *Pred) { + return !Blocks.contains(Pred); + }); } typedef std::pair BlockPair; From f1d3ebc9918aa6d1d04cda98da1ecef2dd51978c Mon Sep 17 00:00:00 2001 From: "S. B. 
Tam" Date: Tue, 23 Jan 2024 13:25:13 +0800 Subject: [PATCH 552/843] [libc++][test] Use LIBCPP_ASSERT in some `system_category`-related tests (#78834) --- .../eq_error_code_error_code.pass.cpp | 147 +++++++++--------- .../syserr.errcat.derived/message.pass.cpp | 23 ++- .../system_category.pass.cpp | 68 ++++---- 3 files changed, 119 insertions(+), 119 deletions(-) diff --git a/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp index c63bfcd955a69..f1f49733280b1 100644 --- a/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/syserr.compare/eq_error_code_error_code.pass.cpp @@ -22,88 +22,87 @@ #include "test_macros.h" -int main(int, char**) -{ - std::error_code e_code1(5, std::generic_category()); - std::error_code e_code2(5, std::system_category()); - std::error_code e_code3(6, std::generic_category()); - std::error_code e_code4(6, std::system_category()); - std::error_condition e_condition1(5, std::generic_category()); - std::error_condition e_condition2(5, std::system_category()); - std::error_condition e_condition3(6, std::generic_category()); - std::error_condition e_condition4(6, std::system_category()); +int main(int, char**) { + std::error_code e_code1(5, std::generic_category()); + std::error_code e_code2(5, std::system_category()); + std::error_code e_code3(6, std::generic_category()); + std::error_code e_code4(6, std::system_category()); + std::error_condition e_condition1(5, std::generic_category()); + std::error_condition e_condition2(5, std::system_category()); + std::error_condition e_condition3(6, std::generic_category()); + std::error_condition e_condition4(6, std::system_category()); - assert(e_code1 == e_code1); - assert(e_code1 != e_code2); - assert(e_code1 != e_code3); - assert(e_code1 != e_code4); - assert(e_code1 == e_condition1); - assert(e_code1 != e_condition2); - assert(e_code1 != e_condition3); - assert(e_code1 != e_condition4); + assert(e_code1 == e_code1); + assert(e_code1 != e_code2); + assert(e_code1 != e_code3); + assert(e_code1 != e_code4); + assert(e_code1 == e_condition1); + assert(e_code1 != e_condition2); + assert(e_code1 != e_condition3); + assert(e_code1 != e_condition4); - assert(e_code2 != e_code1); - assert(e_code2 == e_code2); - assert(e_code2 != e_code3); - assert(e_code2 != e_code4); - assert(e_code2 == e_condition1); // ? 
- assert(e_code2 == e_condition2); - assert(e_code2 != e_condition3); - assert(e_code2 != e_condition4); + assert(e_code2 != e_code1); + assert(e_code2 == e_code2); + assert(e_code2 != e_code3); + assert(e_code2 != e_code4); + LIBCPP_ASSERT(e_code2 == e_condition1); + assert(e_code2 == e_condition2); + LIBCPP_ASSERT(e_code2 != e_condition3); + assert(e_code2 != e_condition4); - assert(e_code3 != e_code1); - assert(e_code3 != e_code2); - assert(e_code3 == e_code3); - assert(e_code3 != e_code4); - assert(e_code3 != e_condition1); - assert(e_code3 != e_condition2); - assert(e_code3 == e_condition3); - assert(e_code3 != e_condition4); + assert(e_code3 != e_code1); + assert(e_code3 != e_code2); + assert(e_code3 == e_code3); + assert(e_code3 != e_code4); + assert(e_code3 != e_condition1); + assert(e_code3 != e_condition2); + assert(e_code3 == e_condition3); + assert(e_code3 != e_condition4); - assert(e_code4 != e_code1); - assert(e_code4 != e_code2); - assert(e_code4 != e_code3); - assert(e_code4 == e_code4); - assert(e_code4 != e_condition1); - assert(e_code4 != e_condition2); - assert(e_code4 == e_condition3); // ? - assert(e_code4 == e_condition4); + assert(e_code4 != e_code1); + assert(e_code4 != e_code2); + assert(e_code4 != e_code3); + assert(e_code4 == e_code4); + LIBCPP_ASSERT(e_code4 != e_condition1); + assert(e_code4 != e_condition2); + LIBCPP_ASSERT(e_code4 == e_condition3); + assert(e_code4 == e_condition4); - assert(e_condition1 == e_code1); - assert(e_condition1 == e_code2); // ? - assert(e_condition1 != e_code3); - assert(e_condition1 != e_code4); - assert(e_condition1 == e_condition1); - assert(e_condition1 != e_condition2); - assert(e_condition1 != e_condition3); - assert(e_condition1 != e_condition4); + assert(e_condition1 == e_code1); + LIBCPP_ASSERT(e_condition1 == e_code2); + assert(e_condition1 != e_code3); + LIBCPP_ASSERT(e_condition1 != e_code4); + assert(e_condition1 == e_condition1); + assert(e_condition1 != e_condition2); + assert(e_condition1 != e_condition3); + assert(e_condition1 != e_condition4); - assert(e_condition2 != e_code1); - assert(e_condition2 == e_code2); - assert(e_condition2 != e_code3); - assert(e_condition2 != e_code4); - assert(e_condition2 != e_condition1); - assert(e_condition2 == e_condition2); - assert(e_condition2 != e_condition3); - assert(e_condition2 != e_condition4); + assert(e_condition2 != e_code1); + assert(e_condition2 == e_code2); + assert(e_condition2 != e_code3); + assert(e_condition2 != e_code4); + assert(e_condition2 != e_condition1); + assert(e_condition2 == e_condition2); + assert(e_condition2 != e_condition3); + assert(e_condition2 != e_condition4); - assert(e_condition3 != e_code1); - assert(e_condition3 != e_code2); - assert(e_condition3 == e_code3); - assert(e_condition3 == e_code4); // ? 
- assert(e_condition3 != e_condition1); - assert(e_condition3 != e_condition2); - assert(e_condition3 == e_condition3); - assert(e_condition3 != e_condition4); + assert(e_condition3 != e_code1); + LIBCPP_ASSERT(e_condition3 != e_code2); + assert(e_condition3 == e_code3); + LIBCPP_ASSERT(e_condition3 == e_code4); + assert(e_condition3 != e_condition1); + assert(e_condition3 != e_condition2); + assert(e_condition3 == e_condition3); + assert(e_condition3 != e_condition4); - assert(e_condition4 != e_code1); - assert(e_condition4 != e_code2); - assert(e_condition4 != e_code3); - assert(e_condition4 == e_code4); - assert(e_condition4 != e_condition1); - assert(e_condition4 != e_condition2); - assert(e_condition4 != e_condition3); - assert(e_condition4 == e_condition4); + assert(e_condition4 != e_code1); + assert(e_condition4 != e_code2); + assert(e_condition4 != e_code3); + assert(e_condition4 == e_code4); + assert(e_condition4 != e_condition1); + assert(e_condition4 != e_condition2); + assert(e_condition4 != e_condition3); + assert(e_condition4 == e_condition4); return 0; } diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp index a899638ce169a..9f7eb42bc78d9 100644 --- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.derived/message.pass.cpp @@ -20,18 +20,17 @@ #include "test_macros.h" -int main(int, char**) -{ - const std::error_category& e_cat1 = std::generic_category(); - const std::error_category& e_cat2 = std::system_category(); - std::string m1 = e_cat1.message(5); - std::string m2 = e_cat2.message(5); - std::string m3 = e_cat2.message(6); - assert(!m1.empty()); - assert(!m2.empty()); - assert(!m3.empty()); - assert(m1 == m2); - assert(m1 != m3); +int main(int, char**) { + const std::error_category& e_cat1 = std::generic_category(); + const std::error_category& e_cat2 = std::system_category(); + std::string m1 = e_cat1.message(5); + std::string m2 = e_cat2.message(5); + std::string m3 = e_cat2.message(6); + assert(!m1.empty()); + assert(!m2.empty()); + assert(!m3.empty()); + LIBCPP_ASSERT(m1 == m2); + assert(m1 != m3); return 0; } diff --git a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp index 7c98a42b52010..42fdd1cb3b91b 100644 --- a/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp +++ b/libcxx/test/std/diagnostics/syserr/syserr.errcat/syserr.errcat.objects/system_category.pass.cpp @@ -23,47 +23,49 @@ // See https://llvm.org/D65667 struct StaticInit { - const std::error_category* ec; - ~StaticInit() { - std::string str = ec->name(); - assert(str == "system") ; - } + const std::error_category* ec; + ~StaticInit() { + std::string str = ec->name(); + assert(str == "system"); + } }; static StaticInit foo; -int main(int, char**) -{ - { - const std::error_category& e_cat1 = std::system_category(); - std::error_condition e_cond = e_cat1.default_error_condition(5); - assert(e_cond.value() == 5); - assert(e_cond.category() == std::generic_category()); - e_cond = e_cat1.default_error_condition(5000); - assert(e_cond.value() == 5000); - assert(e_cond.category() == std::system_category()); - } +int main(int, char**) { + { + const std::error_category& e_cat1 = 
std::system_category(); + std::error_condition e_cond = e_cat1.default_error_condition(5); + LIBCPP_ASSERT(e_cond.value() == 5); + LIBCPP_ASSERT(e_cond.category() == std::generic_category()); + assert(e_cat1.equivalent(5, e_cond)); - // Test the result of message(int cond) when given a bad error condition - { - errno = E2BIG; // something that message will never generate - const std::error_category& e_cat1 = std::system_category(); - const std::string msg = e_cat1.message(-1); - // Exact message format varies by platform. + e_cond = e_cat1.default_error_condition(5000); + LIBCPP_ASSERT(e_cond.value() == 5000); + LIBCPP_ASSERT(e_cond.category() == std::system_category()); + assert(e_cat1.equivalent(5000, e_cond)); + } + + // Test the result of message(int cond) when given a bad error condition + { + errno = E2BIG; // something that message will never generate + const std::error_category& e_cat1 = std::system_category(); + const std::string msg = e_cat1.message(-1); + // Exact message format varies by platform. #if defined(_AIX) - LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0); + LIBCPP_ASSERT(msg.rfind("Error -1 occurred", 0) == 0); #elif defined(_NEWLIB_VERSION) - LIBCPP_ASSERT(msg.empty()); + LIBCPP_ASSERT(msg.empty()); #else - LIBCPP_ASSERT(msg.rfind("Unknown error", 0) == 0); + LIBCPP_ASSERT(msg.rfind("Unknown error", 0) == 0); #endif - assert(errno == E2BIG); - } + assert(errno == E2BIG); + } - { - foo.ec = &std::system_category(); - std::string m = foo.ec->name(); - assert(m == "system"); - } + { + foo.ec = &std::system_category(); + std::string m = foo.ec->name(); + assert(m == "system"); + } - return 0; + return 0; } From a6065f0fa55aaf694b5f85ecad7badec5cc02425 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Mon, 22 Jan 2024 21:28:07 -0800 Subject: [PATCH 553/843] Arm64EC entry/exit thunks, consolidated. (#79067) This combines the previously posted patches with some additional work I've done to more closely match MSVC output. Most of the important logic here is implemented in AArch64Arm64ECCallLowering. The purpose of the AArch64Arm64ECCallLowering is to take "normal" IR we'd generate for other targets, and generate most of the Arm64EC-specific bits: generating thunks, mangling symbols, generating aliases, and generating the .hybmp$x table. This is all done late for a few reasons: to consolidate the logic as much as possible, and to ensure the IR exposed to optimization passes doesn't contain complex arm64ec-specific constructs. The other changes are supporting changes, to handle the new constructs generated by that pass. There's a global llvm.arm64ec.symbolmap representing the .hybmp$x entries for the thunks. This gets handled directly by the AsmPrinter because it needs symbol indexes that aren't available before that. There are two new calling conventions used to represent calls to and from thunks: ARM64EC_Thunk_X64 and ARM64EC_Thunk_Native. There are a few changes to handle the associated exception-handling info, SEH_SaveAnyRegQP and SEH_SaveAnyRegQPX. I've intentionally left out handling for structs with small non-power-of-two sizes, because that's easily separated out. The rest of my current work is here. I squashed my current patches because they were split in ways that didn't really make sense. Maybe I could split out some bits, but it's hard to meaningfully test most of the parts independently. Thanks to @dpaoliello for extensive testing and suggestions. (Originally posted as https://reviews.llvm.org/D157547 .) 
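For reference, a rough sketch of what this lowering produces for a simple
exported function `void f(void)`; the shape follows the mangling and
symbol-map logic in this patch, but the exact names here are illustrative,
not authoritative:

    ; The definition is renamed to its ARM64EC-mangled form "#f", and an
    ; x64-callable entry thunk is emitted alongside it:
    define void @"#f"() { ... }
    define linkonce_odr void @"$ientry_thunk$cdecl$v$v"(ptr %callee) { ... }

    ; The pairing is recorded so the AsmPrinter can build the .hybmp$x table
    ; (kind 1 = entry thunk, 4 = exit thunk, 0 = guest exit thunk):
    @llvm.arm64ec.symbolmap = constant [1 x { ptr, ptr, i32 }]
        [{ ptr, ptr, i32 } { ptr @"#f", ptr @"$ientry_thunk$cdecl$v$v", i32 1 }]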
--- clang/lib/CodeGen/CGCXX.cpp | 5 + llvm/include/llvm/IR/CallingConv.h | 10 + llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 33 + llvm/lib/Target/AArch64/AArch64.h | 2 + .../AArch64/AArch64Arm64ECCallLowering.cpp | 769 ++++++++++++++++++ llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp | 77 ++ .../Target/AArch64/AArch64CallingConvention.h | 16 + .../AArch64/AArch64CallingConvention.td | 128 +++ llvm/lib/Target/AArch64/AArch64FastISel.cpp | 3 + .../Target/AArch64/AArch64FrameLowering.cpp | 26 + .../Target/AArch64/AArch64ISelLowering.cpp | 111 ++- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 5 + llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 7 +- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 11 + .../lib/Target/AArch64/AArch64MCInstLower.cpp | 55 +- llvm/lib/Target/AArch64/AArch64MCInstLower.h | 3 + .../Target/AArch64/AArch64RegisterInfo.cpp | 3 + llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 19 +- llvm/lib/Target/AArch64/AArch64Subtarget.h | 4 +- .../Target/AArch64/AArch64TargetMachine.cpp | 8 +- llvm/lib/Target/AArch64/CMakeLists.txt | 1 + .../AArch64/GISel/AArch64CallLowering.cpp | 24 +- .../Target/AArch64/Utils/AArch64BaseInfo.h | 39 +- .../test/CodeGen/AArch64/arm64ec-dllimport.ll | 2 +- .../arm64ec-entry-thunks-local-linkage.ll | 20 + .../CodeGen/AArch64/arm64ec-entry-thunks.ll | 470 +++++++++++ .../CodeGen/AArch64/arm64ec-exit-thunks.ll | 533 ++++++++++++ .../CodeGen/AArch64/arm64ec-reservedregs.ll | 2 +- llvm/test/CodeGen/AArch64/arm64ec-varargs.ll | 16 +- .../CodeGen/AArch64/stack-protector-target.ll | 4 +- llvm/test/CodeGen/AArch64/win-alloca.ll | 2 +- 31 files changed, 2362 insertions(+), 46 deletions(-) create mode 100644 llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp create mode 100644 llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll create mode 100644 llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll create mode 100644 llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll diff --git a/clang/lib/CodeGen/CGCXX.cpp b/clang/lib/CodeGen/CGCXX.cpp index 110e21f7cb6d1..e95a735f92f74 100644 --- a/clang/lib/CodeGen/CGCXX.cpp +++ b/clang/lib/CodeGen/CGCXX.cpp @@ -40,6 +40,11 @@ bool CodeGenModule::TryEmitBaseDestructorAsAlias(const CXXDestructorDecl *D) { if (getCodeGenOpts().OptimizationLevel == 0) return true; + // Disable this optimization for ARM64EC. FIXME: This probably should work, + // but getting the symbol table correct is complicated. + if (getTarget().getTriple().isWindowsArm64EC()) + return true; + // If sanitizing memory to check for use-after-dtor, do not emit as // an alias, unless this class owns no members. if (getCodeGenOpts().SanitizeMemoryUseAfterDtor && diff --git a/llvm/include/llvm/IR/CallingConv.h b/llvm/include/llvm/IR/CallingConv.h index 3a522c239ad59..bca31b2572eb4 100644 --- a/llvm/include/llvm/IR/CallingConv.h +++ b/llvm/include/llvm/IR/CallingConv.h @@ -251,6 +251,16 @@ namespace CallingConv { /// Used by GraalVM. Two additional registers are reserved. GRAAL = 107, + /// Calling convention used in the ARM64EC ABI to implement calls between + /// x64 code and thunks. This is basically the x64 calling convention using + /// ARM64 register names. The first parameter is mapped to x9. + ARM64EC_Thunk_X64 = 108, + + /// Calling convention used in the ARM64EC ABI to implement calls between + /// ARM64 code and thunks. This is just the ARM64 calling convention, + /// except that the first parameter is mapped to x9. + ARM64EC_Thunk_Native = 109, + /// The highest possible ID. Must be some 2^k - 1. 
   MaxID = 1023
 };
 
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 7df1c82bf357f..29da2b1c29f83 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -2892,6 +2892,39 @@ bool AsmPrinter::emitSpecialLLVMGlobal(const GlobalVariable *GV) {
       GV->hasAvailableExternallyLinkage())
     return true;
 
+  if (GV->getName() == "llvm.arm64ec.symbolmap") {
+    // For ARM64EC, print the table that maps between symbols and the
+    // corresponding thunks to translate between x64 and AArch64 code.
+    // This table is generated by AArch64Arm64ECCallLowering.
+    OutStreamer->switchSection(OutContext.getCOFFSection(
+        ".hybmp$x", COFF::IMAGE_SCN_LNK_INFO, SectionKind::getMetadata()));
+    auto *Arr = cast<ConstantArray>(GV->getInitializer());
+    for (auto &U : Arr->operands()) {
+      auto *C = cast<ConstantStruct>(U);
+      auto *Src = cast<GlobalValue>(C->getOperand(0)->stripPointerCasts());
+      auto *Dst = cast<GlobalValue>(C->getOperand(1)->stripPointerCasts());
+      int Kind = cast<ConstantInt>(C->getOperand(2))->getZExtValue();
+
+      if (Src->hasDLLImportStorageClass()) {
+        // For now, we assume dllimport functions aren't directly called.
+        // (We might change this later to match MSVC.)
+        OutStreamer->emitCOFFSymbolIndex(
+            OutContext.getOrCreateSymbol("__imp_" + Src->getName()));
+        OutStreamer->emitCOFFSymbolIndex(getSymbol(Dst));
+        OutStreamer->emitInt32(Kind);
+      } else {
+        // FIXME: For non-dllimport functions, MSVC emits the same entry
+        // twice, for reasons I don't understand. I have to assume the linker
+        // ignores the redundant entry; there aren't any reasonable semantics
+        // to attach to it.
+        OutStreamer->emitCOFFSymbolIndex(getSymbol(Src));
+        OutStreamer->emitCOFFSymbolIndex(getSymbol(Dst));
+        OutStreamer->emitInt32(Kind);
+      }
+    }
+    return true;
+  }
+
   if (!GV->hasAppendingLinkage())
     return false;
 
   assert(GV->hasInitializer() && "Not a special LLVM global!");
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index d20ef63a72e8f..f7d81f42ef5d8 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -71,6 +71,7 @@ FunctionPass *createAArch64PostSelectOptimize();
 FunctionPass *createAArch64StackTaggingPass(bool IsOptNone);
 FunctionPass *createAArch64StackTaggingPreRAPass();
 ModulePass *createAArch64GlobalsTaggingPass();
+ModulePass *createAArch64Arm64ECCallLoweringPass();
 
 void initializeAArch64A53Fix835769Pass(PassRegistry&);
 void initializeAArch64A57FPLoadBalancingPass(PassRegistry&);
@@ -109,6 +110,7 @@ void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
 void initializeSMEABIPass(PassRegistry &);
 void initializeSVEIntrinsicOptsPass(PassRegistry &);
+void initializeAArch64Arm64ECCallLoweringPass(PassRegistry &);
 
 } // end namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
new file mode 100644
index 0000000000000..11248bb7aef31
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64Arm64ECCallLowering.cpp
@@ -0,0 +1,769 @@
+//===-- AArch64Arm64ECCallLowering.cpp - Lower Arm64EC calls ----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file contains the IR transform to lower external or indirect calls for
+/// the ARM64EC calling convention. Such calls must go through the runtime, so
+/// we can translate the calling convention for calls into the emulator.
+///
+/// This subsumes Control Flow Guard handling.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/IR/CallingConv.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/TargetParser/Triple.h"
+
+using namespace llvm;
+
+using OperandBundleDef = OperandBundleDefT<Value *>;
+
+#define DEBUG_TYPE "arm64eccalllowering"
+
+STATISTIC(Arm64ECCallsLowered, "Number of Arm64EC calls lowered");
+
+static cl::opt<bool> LowerDirectToIndirect("arm64ec-lower-direct-to-indirect",
+                                           cl::Hidden, cl::init(true));
+static cl::opt<bool> GenerateThunks("arm64ec-generate-thunks", cl::Hidden,
+                                    cl::init(true));
+
+namespace {
+
+class AArch64Arm64ECCallLowering : public ModulePass {
+public:
+  static char ID;
+  AArch64Arm64ECCallLowering() : ModulePass(ID) {
+    initializeAArch64Arm64ECCallLoweringPass(*PassRegistry::getPassRegistry());
+  }
+
+  Function *buildExitThunk(FunctionType *FnTy, AttributeList Attrs);
+  Function *buildEntryThunk(Function *F);
+  void lowerCall(CallBase *CB);
+  Function *buildGuestExitThunk(Function *F);
+  bool processFunction(Function &F, SetVector<Function *> &DirectCalledFns);
+  bool runOnModule(Module &M) override;
+
+private:
+  int cfguard_module_flag = 0;
+  FunctionType *GuardFnType = nullptr;
+  PointerType *GuardFnPtrType = nullptr;
+  Constant *GuardFnCFGlobal = nullptr;
+  Constant *GuardFnGlobal = nullptr;
+  Module *M = nullptr;
+
+  Type *PtrTy;
+  Type *I64Ty;
+  Type *VoidTy;
+
+  void getThunkType(FunctionType *FT, AttributeList AttrList, bool EntryThunk,
+                    raw_ostream &Out, FunctionType *&Arm64Ty,
+                    FunctionType *&X64Ty);
+  void getThunkRetType(FunctionType *FT, AttributeList AttrList,
+                       raw_ostream &Out, Type *&Arm64RetTy, Type *&X64RetTy,
+                       SmallVectorImpl<Type *> &Arm64ArgTypes,
+                       SmallVectorImpl<Type *> &X64ArgTypes, bool &HasSretPtr);
+  void getThunkArgTypes(FunctionType *FT, AttributeList AttrList,
+                        raw_ostream &Out,
+                        SmallVectorImpl<Type *> &Arm64ArgTypes,
+                        SmallVectorImpl<Type *> &X64ArgTypes, bool HasSretPtr);
+  void canonicalizeThunkType(Type *T, Align Alignment, bool Ret,
+                             uint64_t ArgSizeBytes, raw_ostream &Out,
+                             Type *&Arm64Ty, Type *&X64Ty);
+};
+
+} // end anonymous namespace
+
+void AArch64Arm64ECCallLowering::getThunkType(FunctionType *FT,
+                                              AttributeList AttrList,
+                                              bool EntryThunk, raw_ostream &Out,
+                                              FunctionType *&Arm64Ty,
+                                              FunctionType *&X64Ty) {
+  Out << (EntryThunk ? "$ientry_thunk$cdecl$" : "$iexit_thunk$cdecl$");
+
+  Type *Arm64RetTy;
+  Type *X64RetTy;
+
+  SmallVector<Type *> Arm64ArgTypes;
+  SmallVector<Type *> X64ArgTypes;
+
+  // The first argument to a thunk is the called function, stored in x9.
+  // For exit thunks, we pass the called function down to the emulator;
+  // for entry thunks, we just call the Arm64 function directly.
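+  // For example, `int f(int)` produces the entry thunk name
+  // "$ientry_thunk$cdecl$i8$i8": the return type, then "$", then each
+  // parameter, where integers of 64 bits or less canonicalize to "i8"
+  // (8 bytes).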
+  if (!EntryThunk)
+    Arm64ArgTypes.push_back(PtrTy);
+  X64ArgTypes.push_back(PtrTy);
+
+  bool HasSretPtr = false;
+  getThunkRetType(FT, AttrList, Out, Arm64RetTy, X64RetTy, Arm64ArgTypes,
+                  X64ArgTypes, HasSretPtr);
+
+  getThunkArgTypes(FT, AttrList, Out, Arm64ArgTypes, X64ArgTypes, HasSretPtr);
+
+  Arm64Ty = FunctionType::get(Arm64RetTy, Arm64ArgTypes, false);
+  X64Ty = FunctionType::get(X64RetTy, X64ArgTypes, false);
+}
+
+void AArch64Arm64ECCallLowering::getThunkArgTypes(
+    FunctionType *FT, AttributeList AttrList, raw_ostream &Out,
+    SmallVectorImpl<Type *> &Arm64ArgTypes,
+    SmallVectorImpl<Type *> &X64ArgTypes, bool HasSretPtr) {
+
+  Out << "$";
+  if (FT->isVarArg()) {
+    // We treat the variadic function's thunk as a normal function
+    // with the following type on the ARM side:
+    //   rettype exitthunk(
+    //       ptr x9, ptr x0, i64 x1, i64 x2, i64 x3, ptr x4, i64 x5)
+    //
+    // that can cover all types of variadic function.
+    // x9, as in a normal exit thunk, holds the called function.
+    // x0-x3 are the arguments stored in registers.
+    // x4 is the address of the arguments on the stack.
+    // x5 is the size of the arguments on the stack.
+    //
+    // On the x64 side, it's the same except that x5 isn't set.
+    //
+    // If both the ARM and X64 sides are sret, there are only three
+    // arguments in registers.
+    //
+    // If the X64 side is sret, but the ARM side isn't, we pass an extra value
+    // to/from the X64 side, and let SelectionDAG transform it into a memory
+    // location.
+    Out << "varargs";
+
+    // x0-x3
+    for (int i = HasSretPtr ? 1 : 0; i < 4; i++) {
+      Arm64ArgTypes.push_back(I64Ty);
+      X64ArgTypes.push_back(I64Ty);
+    }
+
+    // x4
+    Arm64ArgTypes.push_back(PtrTy);
+    X64ArgTypes.push_back(PtrTy);
+    // x5
+    Arm64ArgTypes.push_back(I64Ty);
+    // FIXME: x5 isn't actually passed/used by the x64 side; revisit once we
+    // have proper isel for varargs
+    X64ArgTypes.push_back(I64Ty);
+    return;
+  }
+
+  unsigned I = 0;
+  if (HasSretPtr)
+    I++;
+
+  if (I == FT->getNumParams()) {
+    Out << "v";
+    return;
+  }
+
+  for (unsigned E = FT->getNumParams(); I != E; ++I) {
+    Align ParamAlign = AttrList.getParamAlignment(I).valueOrOne();
+#if 0
+    // FIXME: Need more information about argument size; see
+    // https://reviews.llvm.org/D132926
+    uint64_t ArgSizeBytes = AttrList.getParamArm64ECArgSizeBytes(I);
+#else
+    uint64_t ArgSizeBytes = 0;
+#endif
+    Type *Arm64Ty, *X64Ty;
+    canonicalizeThunkType(FT->getParamType(I), ParamAlign,
+                          /*Ret*/ false, ArgSizeBytes, Out, Arm64Ty, X64Ty);
+    Arm64ArgTypes.push_back(Arm64Ty);
+    X64ArgTypes.push_back(X64Ty);
+  }
+}
+
+void AArch64Arm64ECCallLowering::getThunkRetType(
+    FunctionType *FT, AttributeList AttrList, raw_ostream &Out,
+    Type *&Arm64RetTy, Type *&X64RetTy, SmallVectorImpl<Type *> &Arm64ArgTypes,
+    SmallVectorImpl<Type *> &X64ArgTypes, bool &HasSretPtr) {
+  Type *T = FT->getReturnType();
+#if 0
+  // FIXME: Need more information about argument size; see
+  // https://reviews.llvm.org/D132926
+  uint64_t ArgSizeBytes = AttrList.getRetArm64ECArgSizeBytes();
+#else
+  int64_t ArgSizeBytes = 0;
+#endif
+  if (T->isVoidTy()) {
+    if (FT->getNumParams()) {
+      auto SRetAttr = AttrList.getParamAttr(0, Attribute::StructRet);
+      auto InRegAttr = AttrList.getParamAttr(0, Attribute::InReg);
+      if (SRetAttr.isValid() && InRegAttr.isValid()) {
+        // sret+inreg indicates a call that returns a C++ class value. This is
+        // actually equivalent to just passing and returning a void* pointer
+        // as the first argument. Translate it that way, instead of trying
+        // to model "inreg" in the thunk's calling convention, to simplify
+        // the rest of the code.
+        Out << "i8";
+        Arm64RetTy = I64Ty;
+        X64RetTy = I64Ty;
+        return;
+      }
+      if (SRetAttr.isValid()) {
+        // FIXME: Sanity-check the sret type; if it's an integer or pointer,
+        // we'll get screwy mangling/codegen.
+        // FIXME: For large struct types, mangle as an integer argument and
+        // integer return, so we can reuse more thunks, instead of "m" syntax.
+        // (MSVC mangles this case as an integer return with no argument, but
+        // that's a miscompile.)
+        Type *SRetType = SRetAttr.getValueAsType();
+        Align SRetAlign = AttrList.getParamAlignment(0).valueOrOne();
+        Type *Arm64Ty, *X64Ty;
+        canonicalizeThunkType(SRetType, SRetAlign, /*Ret*/ true, ArgSizeBytes,
+                              Out, Arm64Ty, X64Ty);
+        Arm64RetTy = VoidTy;
+        X64RetTy = VoidTy;
+        Arm64ArgTypes.push_back(FT->getParamType(0));
+        X64ArgTypes.push_back(FT->getParamType(0));
+        HasSretPtr = true;
+        return;
+      }
+    }
+
+    Out << "v";
+    Arm64RetTy = VoidTy;
+    X64RetTy = VoidTy;
+    return;
+  }
+
+  canonicalizeThunkType(T, Align(), /*Ret*/ true, ArgSizeBytes, Out, Arm64RetTy,
+                        X64RetTy);
+  if (X64RetTy->isPointerTy()) {
+    // If the X64 type is canonicalized to a pointer, that means it's
+    // passed/returned indirectly. For a return value, that means it's an
+    // sret pointer.
+    X64ArgTypes.push_back(X64RetTy);
+    X64RetTy = VoidTy;
+  }
+}
+
+void AArch64Arm64ECCallLowering::canonicalizeThunkType(
+    Type *T, Align Alignment, bool Ret, uint64_t ArgSizeBytes, raw_ostream &Out,
+    Type *&Arm64Ty, Type *&X64Ty) {
+  if (T->isFloatTy()) {
+    Out << "f";
+    Arm64Ty = T;
+    X64Ty = T;
+    return;
+  }
+
+  if (T->isDoubleTy()) {
+    Out << "d";
+    Arm64Ty = T;
+    X64Ty = T;
+    return;
+  }
+
+  if (T->isFloatingPointTy()) {
+    report_fatal_error(
+        "Only 32 and 64 bit floating points are supported for ARM64EC thunks");
+  }
+
+  auto &DL = M->getDataLayout();
+
+  if (auto *StructTy = dyn_cast<StructType>(T))
+    if (StructTy->getNumElements() == 1)
+      T = StructTy->getElementType(0);
+
+  if (T->isArrayTy()) {
+    Type *ElementTy = T->getArrayElementType();
+    uint64_t ElementCnt = T->getArrayNumElements();
+    uint64_t ElementSizePerBytes = DL.getTypeSizeInBits(ElementTy) / 8;
+    uint64_t TotalSizeBytes = ElementCnt * ElementSizePerBytes;
+    if (ElementTy->isFloatTy() || ElementTy->isDoubleTy()) {
+      Out << (ElementTy->isFloatTy() ? "F" : "D") << TotalSizeBytes;
+      if (Alignment.value() >= 8 && !T->isPointerTy())
+        Out << "a" << Alignment.value();
+      Arm64Ty = T;
+      if (TotalSizeBytes <= 8) {
+        // Arm64 returns small structs of float/double in float registers;
+        // X64 uses RAX.
+        X64Ty = llvm::Type::getIntNTy(M->getContext(), TotalSizeBytes * 8);
+      } else {
+        // Struct is passed directly on Arm64, but indirectly on X64.
+        X64Ty = PtrTy;
+      }
+      return;
+    } else if (T->isFloatingPointTy()) {
+      report_fatal_error("Only 32 and 64 bit floating points are supported for "
+                         "ARM64EC thunks");
+    }
+  }
+
+  if ((T->isIntegerTy() || T->isPointerTy()) && DL.getTypeSizeInBits(T) <= 64) {
+    Out << "i8";
+    Arm64Ty = I64Ty;
+    X64Ty = I64Ty;
+    return;
+  }
+
+  unsigned TypeSize = ArgSizeBytes;
+  if (TypeSize == 0)
+    TypeSize = DL.getTypeSizeInBits(T) / 8;
+  Out << "m";
+  if (TypeSize != 4)
+    Out << TypeSize;
+  if (Alignment.value() >= 8 && !T->isPointerTy())
+    Out << "a" << Alignment.value();
+  // FIXME: Try to canonicalize Arm64Ty more thoroughly?
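+  // For example, a 12-byte struct mangles as "m12" here; since 12 bytes is
+  // not a power-of-two size, it stays by-value on the Arm64 side but becomes
+  // a pointer on the X64 side below.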
+  Arm64Ty = T;
+  if (TypeSize == 1 || TypeSize == 2 || TypeSize == 4 || TypeSize == 8) {
+    // Pass directly in an integer register
+    X64Ty = llvm::Type::getIntNTy(M->getContext(), TypeSize * 8);
+  } else {
+    // Passed directly on Arm64, but indirectly on X64.
+    X64Ty = PtrTy;
+  }
+}
+
+// This function builds the "exit thunk", a function which translates
+// arguments and return values when calling x64 code from AArch64 code.
+Function *AArch64Arm64ECCallLowering::buildExitThunk(FunctionType *FT,
+                                                     AttributeList Attrs) {
+  SmallString<256> ExitThunkName;
+  llvm::raw_svector_ostream ExitThunkStream(ExitThunkName);
+  FunctionType *Arm64Ty, *X64Ty;
+  getThunkType(FT, Attrs, /*EntryThunk*/ false, ExitThunkStream, Arm64Ty,
+               X64Ty);
+  if (Function *F = M->getFunction(ExitThunkName))
+    return F;
+
+  Function *F = Function::Create(Arm64Ty, GlobalValue::LinkOnceODRLinkage, 0,
+                                 ExitThunkName, M);
+  F->setCallingConv(CallingConv::ARM64EC_Thunk_Native);
+  F->setSection(".wowthk$aa");
+  F->setComdat(M->getOrInsertComdat(ExitThunkName));
+  // Copy MSVC, and always set up a frame pointer. (Maybe this isn't necessary.)
+  F->addFnAttr("frame-pointer", "all");
+  // Only copy sret from the first argument. For C++ instance methods, clang can
+  // stick an sret marking on a later argument, but it doesn't actually affect
+  // the ABI, so we can omit it. This avoids triggering a verifier assertion.
+  if (FT->getNumParams()) {
+    auto SRet = Attrs.getParamAttr(0, Attribute::StructRet);
+    auto InReg = Attrs.getParamAttr(0, Attribute::InReg);
+    if (SRet.isValid() && !InReg.isValid())
+      F->addParamAttr(1, SRet);
+  }
+  // FIXME: Copy anything other than sret? Shouldn't be necessary for normal
+  // C ABI, but might show up in other cases.
+  BasicBlock *BB = BasicBlock::Create(M->getContext(), "", F);
+  IRBuilder<> IRB(BB);
+  Value *CalleePtr =
+      M->getOrInsertGlobal("__os_arm64x_dispatch_call_no_redirect", PtrTy);
+  Value *Callee = IRB.CreateLoad(PtrTy, CalleePtr);
+  auto &DL = M->getDataLayout();
+  SmallVector<Value *> Args;
+
+  // Pass the called function in x9.
+  Args.push_back(F->arg_begin());
+
+  Type *RetTy = Arm64Ty->getReturnType();
+  if (RetTy != X64Ty->getReturnType()) {
+    // If the return type is an array or struct, translate it. Values of size
+    // 8 or less go into RAX; bigger values go into memory, and we pass a
+    // pointer.
+    if (DL.getTypeStoreSize(RetTy) > 8) {
+      Args.push_back(IRB.CreateAlloca(RetTy));
+    }
+  }
+
+  for (auto &Arg : make_range(F->arg_begin() + 1, F->arg_end())) {
+    // Translate arguments from AArch64 calling convention to x86 calling
+    // convention.
+    //
+    // For simple types, we don't need to do any translation: they're
+    // represented the same way. (Implicit sign extension is not part of
+    // either convention.)
+    //
+    // The big thing we have to worry about is struct types... but
+    // fortunately AArch64 clang is pretty friendly here: the cases that need
+    // translation are always passed as a struct or array. (If we run into
+    // some cases where this doesn't work, we can teach clang to mark it up
+    // with an attribute.)
+    //
+    // The first argument is the called function, stored in x9.
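+    // For example, a 16-byte struct that AArch64 passes in a register pair
+    // is spilled to a temporary alloca here and handed to the x64 side by
+    // pointer, while a value of 8 bytes or less is reloaded as a plain
+    // integer.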
+    if (Arg.getType()->isArrayTy() || Arg.getType()->isStructTy() ||
+        DL.getTypeStoreSize(Arg.getType()) > 8) {
+      Value *Mem = IRB.CreateAlloca(Arg.getType());
+      IRB.CreateStore(&Arg, Mem);
+      if (DL.getTypeStoreSize(Arg.getType()) <= 8) {
+        Type *IntTy = IRB.getIntNTy(DL.getTypeStoreSizeInBits(Arg.getType()));
+        Args.push_back(IRB.CreateLoad(IntTy, IRB.CreateBitCast(Mem, PtrTy)));
+      } else
+        Args.push_back(Mem);
+    } else {
+      Args.push_back(&Arg);
+    }
+  }
+  // FIXME: Transfer necessary attributes? sret? anything else?
+
+  Callee = IRB.CreateBitCast(Callee, PtrTy);
+  CallInst *Call = IRB.CreateCall(X64Ty, Callee, Args);
+  Call->setCallingConv(CallingConv::ARM64EC_Thunk_X64);
+
+  Value *RetVal = Call;
+  if (RetTy != X64Ty->getReturnType()) {
+    // If we rewrote the return type earlier, convert the return value to
+    // the proper type.
+    if (DL.getTypeStoreSize(RetTy) > 8) {
+      RetVal = IRB.CreateLoad(RetTy, Args[1]);
+    } else {
+      Value *CastAlloca = IRB.CreateAlloca(RetTy);
+      IRB.CreateStore(Call, IRB.CreateBitCast(CastAlloca, PtrTy));
+      RetVal = IRB.CreateLoad(RetTy, CastAlloca);
+    }
+  }
+
+  if (RetTy->isVoidTy())
+    IRB.CreateRetVoid();
+  else
+    IRB.CreateRet(RetVal);
+  return F;
+}
+
+// This function builds the "entry thunk", a function which translates
+// arguments and return values when calling AArch64 code from x64 code.
+Function *AArch64Arm64ECCallLowering::buildEntryThunk(Function *F) {
+  SmallString<256> EntryThunkName;
+  llvm::raw_svector_ostream EntryThunkStream(EntryThunkName);
+  FunctionType *Arm64Ty, *X64Ty;
+  getThunkType(F->getFunctionType(), F->getAttributes(), /*EntryThunk*/ true,
+               EntryThunkStream, Arm64Ty, X64Ty);
+  if (Function *F = M->getFunction(EntryThunkName))
+    return F;
+
+  Function *Thunk = Function::Create(X64Ty, GlobalValue::LinkOnceODRLinkage, 0,
+                                     EntryThunkName, M);
+  Thunk->setCallingConv(CallingConv::ARM64EC_Thunk_X64);
+  Thunk->setSection(".wowthk$aa");
+  Thunk->setComdat(M->getOrInsertComdat(EntryThunkName));
+  // Copy MSVC, and always set up a frame pointer. (Maybe this isn't necessary.)
+  Thunk->addFnAttr("frame-pointer", "all");
+
+  auto &DL = M->getDataLayout();
+  BasicBlock *BB = BasicBlock::Create(M->getContext(), "", Thunk);
+  IRBuilder<> IRB(BB);
+
+  Type *RetTy = Arm64Ty->getReturnType();
+  Type *X64RetType = X64Ty->getReturnType();
+
+  bool TransformDirectToSRet = X64RetType->isVoidTy() && !RetTy->isVoidTy();
+  unsigned ThunkArgOffset = TransformDirectToSRet ? 2 : 1;
+
+  // Translate arguments to call.
+  SmallVector<Value *> Args;
+  for (unsigned i = ThunkArgOffset, e = Thunk->arg_size(); i != e; ++i) {
+    Value *Arg = Thunk->getArg(i);
+    Type *ArgTy = Arm64Ty->getParamType(i - ThunkArgOffset);
+    if (ArgTy->isArrayTy() || ArgTy->isStructTy() ||
+        DL.getTypeStoreSize(ArgTy) > 8) {
+      // Translate array/struct arguments to the expected type.
+      if (DL.getTypeStoreSize(ArgTy) <= 8) {
+        Value *CastAlloca = IRB.CreateAlloca(ArgTy);
+        IRB.CreateStore(Arg, IRB.CreateBitCast(CastAlloca, PtrTy));
+        Arg = IRB.CreateLoad(ArgTy, CastAlloca);
+      } else {
+        Arg = IRB.CreateLoad(ArgTy, IRB.CreateBitCast(Arg, PtrTy));
+      }
+    }
+    Args.push_back(Arg);
+  }
+
+  // Call the function passed to the thunk.
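+  // Per CC_AArch64_Arm64EC_Thunk, the x64 caller places the target address
+  // in x9, which reaches us as the thunk's first argument.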
+  Value *Callee = Thunk->getArg(0);
+  Callee = IRB.CreateBitCast(Callee, PtrTy);
+  Value *Call = IRB.CreateCall(Arm64Ty, Callee, Args);
+
+  Value *RetVal = Call;
+  if (TransformDirectToSRet) {
+    IRB.CreateStore(RetVal, IRB.CreateBitCast(Thunk->getArg(1), PtrTy));
+  } else if (X64RetType != RetTy) {
+    Value *CastAlloca = IRB.CreateAlloca(X64RetType);
+    IRB.CreateStore(Call, IRB.CreateBitCast(CastAlloca, PtrTy));
+    RetVal = IRB.CreateLoad(X64RetType, CastAlloca);
+  }
+
+  // Return to the caller. Note that the isel has code to translate this
+  // "ret" to a tail call to __os_arm64x_dispatch_ret. (Alternatively, we
+  // could emit a tail call here, but that would require a dedicated calling
+  // convention, which seems more complicated overall.)
+  if (X64RetType->isVoidTy())
+    IRB.CreateRetVoid();
+  else
+    IRB.CreateRet(RetVal);
+
+  return Thunk;
+}
+
+// Builds the "guest exit thunk", a helper to call a function which may or may
+// not be an exit thunk. (We optimistically assume non-dllimport function
+// declarations refer to functions defined in AArch64 code; if the linker
+// can't prove that, we use this routine instead.)
+Function *AArch64Arm64ECCallLowering::buildGuestExitThunk(Function *F) {
+  llvm::raw_null_ostream NullThunkName;
+  FunctionType *Arm64Ty, *X64Ty;
+  getThunkType(F->getFunctionType(), F->getAttributes(), /*EntryThunk*/ true,
+               NullThunkName, Arm64Ty, X64Ty);
+  auto MangledName = getArm64ECMangledFunctionName(F->getName().str());
+  assert(MangledName && "Can't guest exit to function that's already native");
+  std::string ThunkName = *MangledName;
+  if (ThunkName[0] == '?' && ThunkName.find("@") != std::string::npos) {
+    ThunkName.insert(ThunkName.find("@"), "$exit_thunk");
+  } else {
+    ThunkName.append("$exit_thunk");
+  }
+  Function *GuestExit =
+      Function::Create(Arm64Ty, GlobalValue::WeakODRLinkage, 0, ThunkName, M);
+  GuestExit->setComdat(M->getOrInsertComdat(ThunkName));
+  GuestExit->setSection(".wowthk$aa");
+  GuestExit->setMetadata(
+      "arm64ec_unmangled_name",
+      MDNode::get(M->getContext(),
+                  MDString::get(M->getContext(), F->getName())));
+  GuestExit->setMetadata(
+      "arm64ec_ecmangled_name",
+      MDNode::get(M->getContext(),
+                  MDString::get(M->getContext(), *MangledName)));
+  F->setMetadata("arm64ec_hasguestexit", MDNode::get(M->getContext(), {}));
+  BasicBlock *BB = BasicBlock::Create(M->getContext(), "", GuestExit);
+  IRBuilder<> B(BB);
+
+  // Load the global symbol as a pointer to the check function.
+  Value *GuardFn;
+  if (cfguard_module_flag == 2 && !F->hasFnAttribute("guard_nocf"))
+    GuardFn = GuardFnCFGlobal;
+  else
+    GuardFn = GuardFnGlobal;
+  LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFn);
+
+  // Create new call instruction. The CFGuard check should always be a call,
+  // even if the original CallBase is an Invoke or CallBr instruction.
+  Function *Thunk = buildExitThunk(F->getFunctionType(), F->getAttributes());
+  CallInst *GuardCheck = B.CreateCall(
+      GuardFnType, GuardCheckLoad,
+      {B.CreateBitCast(F, B.getPtrTy()), B.CreateBitCast(Thunk, B.getPtrTy())});
+
+  // Ensure that the first argument is passed in the correct register.
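+  // (On Arm64EC this resolves to CC_AArch64_Arm64EC_CFGuard_Check: the
+  // target goes in x11, the exit thunk in x10, and the checked target comes
+  // back in x11.)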
+  GuardCheck->setCallingConv(CallingConv::CFGuard_Check);
+
+  Value *GuardRetVal = B.CreateBitCast(GuardCheck, PtrTy);
+  SmallVector<Value *> Args;
+  for (Argument &Arg : GuestExit->args())
+    Args.push_back(&Arg);
+  CallInst *Call = B.CreateCall(Arm64Ty, GuardRetVal, Args);
+  Call->setTailCallKind(llvm::CallInst::TCK_MustTail);
+
+  if (Call->getType()->isVoidTy())
+    B.CreateRetVoid();
+  else
+    B.CreateRet(Call);
+
+  auto SRetAttr = F->getAttributes().getParamAttr(0, Attribute::StructRet);
+  auto InRegAttr = F->getAttributes().getParamAttr(0, Attribute::InReg);
+  if (SRetAttr.isValid() && !InRegAttr.isValid()) {
+    GuestExit->addParamAttr(0, SRetAttr);
+    Call->addParamAttr(0, SRetAttr);
+  }
+
+  return GuestExit;
+}
+
+// Lower an indirect call with inline code.
+void AArch64Arm64ECCallLowering::lowerCall(CallBase *CB) {
+  assert(Triple(CB->getModule()->getTargetTriple()).isOSWindows() &&
+         "Only applicable for Windows targets");
+
+  IRBuilder<> B(CB);
+  Value *CalledOperand = CB->getCalledOperand();
+
+  // If the indirect call is called within catchpad or cleanuppad,
+  // we need to copy "funclet" bundle of the call.
+  SmallVector<llvm::OperandBundleDef, 1> Bundles;
+  if (auto Bundle = CB->getOperandBundle(LLVMContext::OB_funclet))
+    Bundles.push_back(OperandBundleDef(*Bundle));
+
+  // Load the global symbol as a pointer to the check function.
+  Value *GuardFn;
+  if (cfguard_module_flag == 2 && !CB->hasFnAttr("guard_nocf"))
+    GuardFn = GuardFnCFGlobal;
+  else
+    GuardFn = GuardFnGlobal;
+  LoadInst *GuardCheckLoad = B.CreateLoad(GuardFnPtrType, GuardFn);
+
+  // Create new call instruction. The CFGuard check should always be a call,
+  // even if the original CallBase is an Invoke or CallBr instruction.
+  Function *Thunk = buildExitThunk(CB->getFunctionType(), CB->getAttributes());
+  CallInst *GuardCheck =
+      B.CreateCall(GuardFnType, GuardCheckLoad,
+                   {B.CreateBitCast(CalledOperand, B.getPtrTy()),
+                    B.CreateBitCast(Thunk, B.getPtrTy())},
+                   Bundles);
+
+  // Ensure that the first argument is passed in the correct register.
+  GuardCheck->setCallingConv(CallingConv::CFGuard_Check);
+
+  Value *GuardRetVal = B.CreateBitCast(GuardCheck, CalledOperand->getType());
+  CB->setCalledOperand(GuardRetVal);
+}
+
+bool AArch64Arm64ECCallLowering::runOnModule(Module &Mod) {
+  if (!GenerateThunks)
+    return false;
+
+  M = &Mod;
+
+  // Check if this module has the cfguard flag and read its value.
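+  // (A value of 2 means CFGuard checks are enabled; in that case lowerCall
+  // and buildGuestExitThunk dispatch through __os_arm64x_check_icall_cfg
+  // instead of __os_arm64x_check_icall.)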
+  if (auto *MD =
+          mdconst::extract_or_null<ConstantInt>(M->getModuleFlag("cfguard")))
+    cfguard_module_flag = MD->getZExtValue();
+
+  PtrTy = PointerType::getUnqual(M->getContext());
+  I64Ty = Type::getInt64Ty(M->getContext());
+  VoidTy = Type::getVoidTy(M->getContext());
+
+  GuardFnType = FunctionType::get(PtrTy, {PtrTy, PtrTy}, false);
+  GuardFnPtrType = PointerType::get(GuardFnType, 0);
+  GuardFnCFGlobal =
+      M->getOrInsertGlobal("__os_arm64x_check_icall_cfg", GuardFnPtrType);
+  GuardFnGlobal =
+      M->getOrInsertGlobal("__os_arm64x_check_icall", GuardFnPtrType);
+
+  SetVector<Function *> DirectCalledFns;
+  for (Function &F : Mod)
+    if (!F.isDeclaration() &&
+        F.getCallingConv() != CallingConv::ARM64EC_Thunk_Native &&
+        F.getCallingConv() != CallingConv::ARM64EC_Thunk_X64)
+      processFunction(F, DirectCalledFns);
+
+  struct ThunkInfo {
+    Constant *Src;
+    Constant *Dst;
+    unsigned Kind;
+  };
+  SmallVector<ThunkInfo> ThunkMapping;
+  for (Function &F : Mod) {
+    if (!F.isDeclaration() && (!F.hasLocalLinkage() || F.hasAddressTaken()) &&
+        F.getCallingConv() != CallingConv::ARM64EC_Thunk_Native &&
+        F.getCallingConv() != CallingConv::ARM64EC_Thunk_X64) {
+      if (!F.hasComdat())
+        F.setComdat(Mod.getOrInsertComdat(F.getName()));
+      ThunkMapping.push_back({&F, buildEntryThunk(&F), 1});
+    }
+  }
+  for (Function *F : DirectCalledFns) {
+    ThunkMapping.push_back(
+        {F, buildExitThunk(F->getFunctionType(), F->getAttributes()), 4});
+    if (!F->hasDLLImportStorageClass())
+      ThunkMapping.push_back({buildGuestExitThunk(F), F, 0});
+  }
+
+  if (!ThunkMapping.empty()) {
+    SmallVector<Constant *> ThunkMappingArrayElems;
+    for (ThunkInfo &Thunk : ThunkMapping) {
+      ThunkMappingArrayElems.push_back(ConstantStruct::getAnon(
+          {ConstantExpr::getBitCast(Thunk.Src, PtrTy),
+           ConstantExpr::getBitCast(Thunk.Dst, PtrTy),
+           ConstantInt::get(M->getContext(), APInt(32, Thunk.Kind))}));
+    }
+    Constant *ThunkMappingArray = ConstantArray::get(
+        llvm::ArrayType::get(ThunkMappingArrayElems[0]->getType(),
+                             ThunkMappingArrayElems.size()),
+        ThunkMappingArrayElems);
+    new GlobalVariable(Mod, ThunkMappingArray->getType(), /*isConstant*/ false,
+                       GlobalValue::ExternalLinkage, ThunkMappingArray,
+                       "llvm.arm64ec.symbolmap");
+  }
+
+  return true;
+}
+
+bool AArch64Arm64ECCallLowering::processFunction(
+    Function &F, SetVector<Function *> &DirectCalledFns) {
+  SmallVector<CallBase *> IndirectCalls;
+
+  // For ARM64EC targets, a function definition's name is mangled differently
+  // from the normal symbol. We currently have no representation of this sort
+  // of symbol in IR, so we change the name to the mangled name, then store
+  // the unmangled name as metadata. Later passes that need the unmangled
+  // name (emitting the definition) can grab it from the metadata.
+  //
+  // FIXME: Handle functions with weak linkage?
+  if (F.hasExternalLinkage() || F.hasWeakLinkage() || F.hasLinkOnceLinkage()) {
+    if (std::optional<std::string> MangledName =
+            getArm64ECMangledFunctionName(F.getName().str())) {
+      F.setMetadata("arm64ec_unmangled_name",
+                    MDNode::get(M->getContext(),
+                                MDString::get(M->getContext(), F.getName())));
+      if (F.hasComdat() && F.getComdat()->getName() == F.getName()) {
+        Comdat *MangledComdat = M->getOrInsertComdat(MangledName.value());
+        SmallVector<GlobalObject *> ComdatUsers =
+            to_vector(F.getComdat()->getUsers());
+        for (GlobalObject *User : ComdatUsers)
+          User->setComdat(MangledComdat);
+      }
+      F.setName(MangledName.value());
+    }
+  }
+
+  // Iterate over the instructions to find all indirect call/invoke/callbr
+  // instructions. Make a separate list of pointers to indirect
+  // call/invoke/callbr instructions because the original instructions will be
+  // deleted as the checks are added.
+  for (BasicBlock &BB : F) {
+    for (Instruction &I : BB) {
+      auto *CB = dyn_cast<CallBase>(&I);
+      if (!CB || CB->getCallingConv() == CallingConv::ARM64EC_Thunk_X64 ||
+          CB->isInlineAsm())
+        continue;
+
+      // We need to instrument any call that isn't directly calling an
+      // ARM64 function.
+      //
+      // FIXME: getCalledFunction() fails if there's a bitcast (e.g.
+      // unprototyped functions in C)
+      if (Function *F = CB->getCalledFunction()) {
+        if (!LowerDirectToIndirect || F->hasLocalLinkage() ||
+            F->isIntrinsic() || !F->isDeclaration())
+          continue;
+
+        DirectCalledFns.insert(F);
+        continue;
+      }
+
+      IndirectCalls.push_back(CB);
+      ++Arm64ECCallsLowered;
+    }
+  }
+
+  if (IndirectCalls.empty())
+    return false;
+
+  for (CallBase *CB : IndirectCalls)
+    lowerCall(CB);
+
+  return true;
+}
+
+char AArch64Arm64ECCallLowering::ID = 0;
+INITIALIZE_PASS(AArch64Arm64ECCallLowering, "Arm64ECCallLowering",
+                "AArch64Arm64ECCallLowering", false, false)
+
+ModulePass *llvm::createAArch64Arm64ECCallLoweringPass() {
+  return new AArch64Arm64ECCallLowering;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 7d2ff146a340b..de247253eb18a 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -167,6 +167,8 @@ class AArch64AsmPrinter : public AsmPrinter {
     return false;
   }
 
+  const MCExpr *lowerConstant(const Constant *CV) override;
+
 private:
   void printOperand(const MachineInstr *MI, unsigned OpNum, raw_ostream &O);
   bool printAsmMRegister(const MachineOperand &MO, char Mode, raw_ostream &O);
@@ -1119,6 +1121,50 @@ void AArch64AsmPrinter::emitFunctionEntryLabel() {
     TS->emitDirectiveVariantPCS(CurrentFnSym);
   }
 
+  if (TM.getTargetTriple().isWindowsArm64EC()) {
+    // For ARM64EC targets, a function definition's name is mangled differently
+    // from the normal symbol. We emit the alias from the unmangled symbol to
+    // mangled symbol name here.
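+    // For example, a plain C function "foo" is defined under its mangled
+    // name "#foo", and "foo" is emitted as a weak anti-dependency alias
+    // pointing at it.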
+    if (MDNode *Unmangled =
+            MF->getFunction().getMetadata("arm64ec_unmangled_name")) {
+      AsmPrinter::emitFunctionEntryLabel();
+
+      if (MDNode *ECMangled =
+              MF->getFunction().getMetadata("arm64ec_ecmangled_name")) {
+        StringRef UnmangledStr =
+            cast<MDString>(Unmangled->getOperand(0))->getString();
+        MCSymbol *UnmangledSym =
+            MMI->getContext().getOrCreateSymbol(UnmangledStr);
+        StringRef ECMangledStr =
+            cast<MDString>(ECMangled->getOperand(0))->getString();
+        MCSymbol *ECMangledSym =
+            MMI->getContext().getOrCreateSymbol(ECMangledStr);
+        OutStreamer->emitSymbolAttribute(UnmangledSym, MCSA_WeakAntiDep);
+        OutStreamer->emitAssignment(
+            UnmangledSym,
+            MCSymbolRefExpr::create(ECMangledSym, MCSymbolRefExpr::VK_WEAKREF,
+                                    MMI->getContext()));
+        OutStreamer->emitSymbolAttribute(ECMangledSym, MCSA_WeakAntiDep);
+        OutStreamer->emitAssignment(
+            ECMangledSym,
+            MCSymbolRefExpr::create(CurrentFnSym, MCSymbolRefExpr::VK_WEAKREF,
+                                    MMI->getContext()));
+        return;
+      } else {
+        StringRef UnmangledStr =
+            cast<MDString>(Unmangled->getOperand(0))->getString();
+        MCSymbol *UnmangledSym =
+            MMI->getContext().getOrCreateSymbol(UnmangledStr);
+        OutStreamer->emitSymbolAttribute(UnmangledSym, MCSA_WeakAntiDep);
+        OutStreamer->emitAssignment(
+            UnmangledSym,
+            MCSymbolRefExpr::create(CurrentFnSym, MCSymbolRefExpr::VK_WEAKREF,
+                                    MMI->getContext()));
+        return;
+      }
+    }
+  }
+
   return AsmPrinter::emitFunctionEntryLabel();
 }
@@ -1818,6 +1864,28 @@ void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
   case AArch64::SEH_PACSignLR:
     TS->emitARM64WinCFIPACSignLR();
     return;
+
+  case AArch64::SEH_SaveAnyRegQP:
+    assert(MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1 &&
+           "Non-consecutive registers not allowed for save_any_reg");
+    assert(MI->getOperand(2).getImm() >= 0 &&
+           "SaveAnyRegQP SEH opcode offset must be non-negative");
+    assert(MI->getOperand(2).getImm() <= 1008 &&
+           "SaveAnyRegQP SEH opcode offset must fit into 6 bits");
+    TS->emitARM64WinCFISaveAnyRegQP(MI->getOperand(0).getImm(),
+                                    MI->getOperand(2).getImm());
+    return;
+
+  case AArch64::SEH_SaveAnyRegQPX:
+    assert(MI->getOperand(1).getImm() - MI->getOperand(0).getImm() == 1 &&
+           "Non-consecutive registers not allowed for save_any_reg");
+    assert(MI->getOperand(2).getImm() < 0 &&
+           "SaveAnyRegQPX SEH opcode offset must be negative");
+    assert(MI->getOperand(2).getImm() >= -1008 &&
+           "SaveAnyRegQPX SEH opcode offset must fit into 6 bits");
+    TS->emitARM64WinCFISaveAnyRegQPX(MI->getOperand(0).getImm(),
+                                     -MI->getOperand(2).getImm());
+    return;
   }
 
   // Finally, do the automated lowerings for everything else.
@@ -2021,6 +2089,15 @@ void AArch64AsmPrinter::emitMachOIFuncStubHelperBody(Module &M,
                                                      *STI);
 }
 
+const MCExpr *AArch64AsmPrinter::lowerConstant(const Constant *CV) {
+  if (const GlobalValue *GV = dyn_cast<GlobalValue>(CV)) {
+    return MCSymbolRefExpr::create(MCInstLowering.GetGlobalValueSymbol(GV, 0),
+                                   OutContext);
+  }
+
+  return AsmPrinter::lowerConstant(CV);
+}
+
 // Force static initialization.
 extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64AsmPrinter() {
   RegisterAsmPrinter<AArch64AsmPrinter> X(getTheAArch64leTarget());
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.h b/llvm/lib/Target/AArch64/AArch64CallingConvention.h
index ec46f62d065f7..3b51ee12b7477 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.h
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.h
@@ -22,6 +22,12 @@ bool CC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
 bool CC_AArch64_Arm64EC_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                                CCValAssign::LocInfo LocInfo,
                                ISD::ArgFlagsTy ArgFlags, CCState &State);
+bool CC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT,
+                              CCValAssign::LocInfo LocInfo,
+                              ISD::ArgFlagsTy ArgFlags, CCState &State);
+bool CC_AArch64_Arm64EC_Thunk_Native(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                     CCValAssign::LocInfo LocInfo,
+                                     ISD::ArgFlagsTy ArgFlags, CCState &State);
 bool CC_AArch64_DarwinPCS_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
                                  CCValAssign::LocInfo LocInfo,
                                  ISD::ArgFlagsTy ArgFlags, CCState &State);
@@ -40,12 +46,22 @@ bool CC_AArch64_Win64_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
 bool CC_AArch64_Win64_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT,
                                     CCValAssign::LocInfo LocInfo,
                                     ISD::ArgFlagsTy ArgFlags, CCState &State);
+bool CC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                      CCValAssign::LocInfo LocInfo,
+                                      ISD::ArgFlagsTy ArgFlags, CCState &State);
 bool CC_AArch64_GHC(unsigned ValNo, MVT ValVT, MVT LocVT,
                     CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
                     CCState &State);
 bool RetCC_AArch64_AAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
                          CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
                          CCState &State);
+bool RetCC_AArch64_Arm64EC_Thunk(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                 CCValAssign::LocInfo LocInfo,
+                                 ISD::ArgFlagsTy ArgFlags, CCState &State);
+bool RetCC_AArch64_Arm64EC_CFGuard_Check(unsigned ValNo, MVT ValVT, MVT LocVT,
+                                         CCValAssign::LocInfo LocInfo,
+                                         ISD::ArgFlagsTy ArgFlags,
+                                         CCState &State);
 } // namespace llvm
 
 #endif
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index e47996bf38d45..78ea4a5180f70 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -208,6 +208,119 @@ def CC_AArch64_Arm64EC_VarArg : CallingConv<[
   CCIfType<[i32, i64], CCAssignToStack<8, 8>>,
 ]>;
 
+// Arm64EC thunks use a calling convention that's precisely the x64 calling
+// convention, except that the registers have different names, and the callee
+// address is passed in X9.
+let Entry = 1 in
+def CC_AArch64_Arm64EC_Thunk : CallingConv<[
+  // Byval aggregates are passed by pointer
+  CCIfByVal<CCPassIndirect<i64>>,
+
+  // ARM64EC-specific: promote small integers to i32. (x86 only promotes i1,
+  // but that would confuse ARM64 lowering code.)
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+  // The 'nest' parameter, if any, is passed in R10 (X4).
+  CCIfNest<CCAssignToReg<[X4]>>,
+
+  // A SwiftError is passed in R12 (X19).
+  CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[X19]>>>,
+
+  // Pass SwiftSelf in R13 (X20).
+  CCIfSwiftSelf<CCIfType<[i64], CCAssignToReg<[X20]>>>,
+
+  // Pass SwiftAsync in an otherwise callee saved register so that calls to
+  // normal functions don't need to save it somewhere.
+  CCIfSwiftAsync<CCIfType<[i64], CCAssignToReg<[X21]>>>,
+
+  // The 'CFGuardTarget' parameter, if any, is passed in RAX (R8).
+  CCIfCFGuardTarget<CCAssignToReg<[X8]>>,
+
+  // 128 bit vectors are passed by pointer
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
+           CCPassIndirect<i64>>,
+
+  // 256 bit vectors are passed by pointer
+  CCIfType<[v32i8, v16i16, v8i32, v4i64, v16f16, v8f32, v4f64],
+           CCPassIndirect<i64>>,
+
+  // 512 bit vectors are passed by pointer
+  CCIfType<[v64i8, v32i16, v16i32, v32f16, v16f32, v8f64, v8i64],
+           CCPassIndirect<i64>>,
+
+  // Long doubles are passed by pointer
+  CCIfType<[f80], CCPassIndirect<i64>>,
+
+  // The first 4 MMX vector arguments are passed in GPRs.
+  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+  // The first 4 FP/Vector arguments are passed in XMM registers.
+  CCIfType<[f16],
+           CCAssignToRegWithShadow<[H0, H1, H2, H3],
+                                   [X0, X1, X2, X3]>>,
+  CCIfType<[f32],
+           CCAssignToRegWithShadow<[S0, S1, S2, S3],
+                                   [X0, X1, X2, X3]>>,
+  CCIfType<[f64],
+           CCAssignToRegWithShadow<[D0, D1, D2, D3],
+                                   [X0, X1, X2, X3]>>,
+
+  // The first 4 integer arguments are passed in integer registers.
+  CCIfType<[i32], CCAssignToRegWithShadow<[W0, W1, W2, W3],
+                                          [Q0, Q1, Q2, Q3]>>,
+
+  // Arm64EC thunks: the first argument is always a pointer to the destination
+  // address, stored in x9.
+  CCIfType<[i64], CCAssignToReg<[X9]>>,
+
+  CCIfType<[i64], CCAssignToRegWithShadow<[X0, X1, X2, X3],
+                                          [Q0, Q1, Q2, Q3]>>,
+
+  // Integer/FP values get stored in stack slots that are 8 bytes in size and
+  // 8-byte aligned if there are no more registers to hold them.
+  CCIfType<[i8, i16, i32, i64, f16, f32, f64], CCAssignToStack<8, 8>>
+]>;
+
+// The native side of ARM64EC thunks
+let Entry = 1 in
+def CC_AArch64_Arm64EC_Thunk_Native : CallingConv<[
+  CCIfType<[i64], CCAssignToReg<[X9]>>,
+  CCDelegateTo<CC_AArch64_AAPCS>
+]>;
+
+let Entry = 1 in
+def RetCC_AArch64_Arm64EC_Thunk : CallingConv<[
+  // The X86-Win64 calling convention always returns __m64 values in RAX.
+  CCIfType<[x86mmx], CCBitConvertToType<i64>>,
+
+  // Otherwise, everything is the same as 'normal' X86-64 C CC.
+
+  // The X86-64 calling convention always returns FP values in XMM0.
+  CCIfType<[f16], CCAssignToReg<[H0, H1]>>,
+  CCIfType<[f32], CCAssignToReg<[S0, S1]>>,
+  CCIfType<[f64], CCAssignToReg<[D0, D1]>>,
+  CCIfType<[f128], CCAssignToReg<[Q0, Q1]>>,
+
+  CCIfSwiftError<CCIfType<[i64], CCAssignToReg<[X19]>>>,
+
+  // Scalar values are returned in AX first, then DX. For i8, the ABI
+  // requires the values to be in AL and AH, however this code uses AL and DL
+  // instead. This is because using AH for the second register conflicts with
+  // the way LLVM does multiple return values -- a return of {i16,i8} would end
+  // up in AX and AH, which overlap. Front-ends wishing to conform to the ABI
+  // for functions that return two i8 values are currently expected to pack the
+  // values into an i16 (which uses AX, and thus AL:AH).
+  //
+  // For code that doesn't care about the ABI, we allow returning more than two
+  // integer values in registers.
+  CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+  CCIfType<[i32], CCAssignToReg<[W8, W1, W0]>>,
+  CCIfType<[i64], CCAssignToReg<[X8, X1, X0]>>,
+
+  // Vector types are returned in XMM0 and XMM1, when they fit. XMM2 and XMM3
+  // can only be used by ABI non-compliant code. If the target doesn't have XMM
+  // registers, it won't have vector types.
+  CCIfType<[v16i8, v8i16, v4i32, v2i64, v8f16, v4f32, v2f64],
+      CCAssignToReg<[Q0, Q1, Q2, Q3]>>
+]>;
+
 // Windows Control Flow Guard checks take a single argument (the target function
 // address) and have no return value.
let Entry = 1 in @@ -215,6 +328,16 @@ def CC_AArch64_Win64_CFGuard_Check : CallingConv<[ CCIfType<[i64], CCAssignToReg<[X15]>> ]>; +let Entry = 1 in +def CC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[ + CCIfType<[i64], CCAssignToReg<[X11, X10]>> +]>; + +let Entry = 1 in +def RetCC_AArch64_Arm64EC_CFGuard_Check : CallingConv<[ + CCIfType<[i64], CCAssignToReg<[X11]>> +]>; + // Darwin uses a calling convention which differs in only two ways // from the standard one at this level: @@ -411,6 +534,11 @@ def CSR_Win_AArch64_CFGuard_Check : CalleeSavedRegs<(add CSR_Win_AArch64_AAPCS, (sequence "X%u", 0, 8), (sequence "Q%u", 0, 7))>; +// To match the x64 calling convention, Arm64EC thunks preserve q6-q15. +def CSR_Win_AArch64_Arm64EC_Thunk : CalleeSavedRegs<(add (sequence "Q%u", 6, 15), + X19, X20, X21, X22, X23, X24, + X25, X26, X27, X28, FP, LR)>; + // AArch64 PCS for vector functions (VPCS) // must (additionally) preserve full Q8-Q23 registers def CSR_AArch64_AAVPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24, diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp index e98f6c4984a75..b00235d73d6d1 100644 --- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp +++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp @@ -3176,6 +3176,9 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) { if (IsVarArg) return false; + if (Subtarget->isWindowsArm64EC()) + return false; + for (auto Flag : CLI.OutFlags) if (Flag.isInReg() || Flag.isSRet() || Flag.isNest() || Flag.isByVal() || Flag.isSwiftSelf() || Flag.isSwiftAsync() || Flag.isSwiftError()) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index caab59201a8d6..d55deec976009 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -1281,6 +1281,30 @@ static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI, .setMIFlag(Flag); break; } + case AArch64::STPQi: + case AArch64::LDPQi: { + unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(0).getReg()); + unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); + MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegQP)) + .addImm(Reg0) + .addImm(Reg1) + .addImm(Imm * 16) + .setMIFlag(Flag); + break; + } + case AArch64::LDPQpost: + Imm = -Imm; + LLVM_FALLTHROUGH; + case AArch64::STPQpre: { + unsigned Reg0 = RegInfo->getSEHRegNum(MBBI->getOperand(1).getReg()); + unsigned Reg1 = RegInfo->getSEHRegNum(MBBI->getOperand(2).getReg()); + MIB = BuildMI(MF, DL, TII.get(AArch64::SEH_SaveAnyRegQPX)) + .addImm(Reg0) + .addImm(Reg1) + .addImm(Imm * 16) + .setMIFlag(Flag); + break; + } } auto I = MBB->insertAfter(MBBI, MIB); return I; @@ -1299,6 +1323,8 @@ static void fixupSEHOpcode(MachineBasicBlock::iterator MBBI, case AArch64::SEH_SaveReg: case AArch64::SEH_SaveFRegP: case AArch64::SEH_SaveFReg: + case AArch64::SEH_SaveAnyRegQP: + case AArch64::SEH_SaveAnyRegQPX: ImmOpnd = &MBBI->getOperand(ImmIdx); break; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 23d37d67864a5..332fb37655288 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1656,6 +1656,43 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, IsStrictFPEnabled = true; setMaxAtomicSizeInBitsSupported(128); + + if (Subtarget->isWindowsArm64EC()) { + // FIXME: are there other intrinsics we need 
to add here? + setLibcallName(RTLIB::MEMCPY, "#memcpy"); + setLibcallName(RTLIB::MEMSET, "#memset"); + setLibcallName(RTLIB::MEMMOVE, "#memmove"); + setLibcallName(RTLIB::REM_F32, "#fmodf"); + setLibcallName(RTLIB::REM_F64, "#fmod"); + setLibcallName(RTLIB::FMA_F32, "#fmaf"); + setLibcallName(RTLIB::FMA_F64, "#fma"); + setLibcallName(RTLIB::SQRT_F32, "#sqrtf"); + setLibcallName(RTLIB::SQRT_F64, "#sqrt"); + setLibcallName(RTLIB::CBRT_F32, "#cbrtf"); + setLibcallName(RTLIB::CBRT_F64, "#cbrt"); + setLibcallName(RTLIB::LOG_F32, "#logf"); + setLibcallName(RTLIB::LOG_F64, "#log"); + setLibcallName(RTLIB::LOG2_F32, "#log2f"); + setLibcallName(RTLIB::LOG2_F64, "#log2"); + setLibcallName(RTLIB::LOG10_F32, "#log10f"); + setLibcallName(RTLIB::LOG10_F64, "#log10"); + setLibcallName(RTLIB::EXP_F32, "#expf"); + setLibcallName(RTLIB::EXP_F64, "#exp"); + setLibcallName(RTLIB::EXP2_F32, "#exp2f"); + setLibcallName(RTLIB::EXP2_F64, "#exp2"); + setLibcallName(RTLIB::EXP10_F32, "#exp10f"); + setLibcallName(RTLIB::EXP10_F64, "#exp10"); + setLibcallName(RTLIB::SIN_F32, "#sinf"); + setLibcallName(RTLIB::SIN_F64, "#sin"); + setLibcallName(RTLIB::COS_F32, "#cosf"); + setLibcallName(RTLIB::COS_F64, "#cos"); + setLibcallName(RTLIB::POW_F32, "#powf"); + setLibcallName(RTLIB::POW_F64, "#pow"); + setLibcallName(RTLIB::LDEXP_F32, "#ldexpf"); + setLibcallName(RTLIB::LDEXP_F64, "#ldexp"); + setLibcallName(RTLIB::FREXP_F32, "#frexpf"); + setLibcallName(RTLIB::FREXP_F64, "#frexp"); + } } void AArch64TargetLowering::addTypeForNEON(MVT VT) { @@ -2651,6 +2688,7 @@ const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(AArch64ISD::MSRR) MAKE_CASE(AArch64ISD::RSHRNB_I) MAKE_CASE(AArch64ISD::CTTZ_ELTS) + MAKE_CASE(AArch64ISD::CALL_ARM64EC_TO_X64) } #undef MAKE_CASE return nullptr; @@ -6539,18 +6577,33 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC, } return CC_AArch64_Win64PCS; case CallingConv::CFGuard_Check: + if (Subtarget->isWindowsArm64EC()) + return CC_AArch64_Arm64EC_CFGuard_Check; return CC_AArch64_Win64_CFGuard_Check; case CallingConv::AArch64_VectorCall: case CallingConv::AArch64_SVE_VectorCall: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X0: case CallingConv::AArch64_SME_ABI_Support_Routines_PreserveMost_From_X2: return CC_AArch64_AAPCS; + case CallingConv::ARM64EC_Thunk_X64: + return CC_AArch64_Arm64EC_Thunk; + case CallingConv::ARM64EC_Thunk_Native: + return CC_AArch64_Arm64EC_Thunk_Native; } } CCAssignFn * AArch64TargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const { - return RetCC_AArch64_AAPCS; + switch (CC) { + default: + return RetCC_AArch64_AAPCS; + case CallingConv::ARM64EC_Thunk_X64: + return RetCC_AArch64_Arm64EC_Thunk; + case CallingConv::CFGuard_Check: + if (Subtarget->isWindowsArm64EC()) + return RetCC_AArch64_Arm64EC_CFGuard_Check; + return RetCC_AArch64_AAPCS; + } } @@ -6602,6 +6655,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments( const Function &F = MF.getFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); bool IsWin64 = Subtarget->isCallingConvWin64(F.getCallingConv()); + bool StackViaX4 = CallConv == CallingConv::ARM64EC_Thunk_X64 || + (isVarArg && Subtarget->isWindowsArm64EC()); AArch64FunctionInfo *FuncInfo = MF.getInfo(); SmallVector Outs; @@ -6771,10 +6826,14 @@ SDValue AArch64TargetLowering::LowerFormalArguments( SDValue FIN; MachinePointerInfo PtrInfo; - if (isVarArg && Subtarget->isWindowsArm64EC()) { - // In the ARM64EC varargs convention, fixed arguments on the stack are - // accessed 
relative to x4, not sp. + if (StackViaX4) { + // In both the ARM64EC varargs convention and the thunk convention, + // arguments on the stack are accessed relative to x4, not sp. In + // the thunk convention, there's an additional offset of 32 bytes + // to account for the shadow store. unsigned ObjOffset = ArgOffset + BEAlign; + if (CallConv == CallingConv::ARM64EC_Thunk_X64) + ObjOffset += 32; Register VReg = MF.addLiveIn(AArch64::X4, &AArch64::GPR64RegClass); SDValue Val = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64); FIN = DAG.getNode(ISD::ADD, DL, MVT::i64, Val, @@ -6950,9 +7009,11 @@ SDValue AArch64TargetLowering::LowerFormalArguments( // On Windows, InReg pointers must be returned, so record the pointer in a // virtual register at the start of the function so it can be returned in the // epilogue. - if (IsWin64) { + if (IsWin64 || F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) { for (unsigned I = 0, E = Ins.size(); I != E; ++I) { - if (Ins[I].Flags.isInReg() && Ins[I].Flags.isSRet()) { + if ((F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64 || + Ins[I].Flags.isInReg()) && + Ins[I].Flags.isSRet()) { assert(!FuncInfo->getSRetReturnReg()); MVT PtrTy = getPointerTy(DAG.getDataLayout()); @@ -7183,6 +7244,11 @@ static void analyzeCallOperands(const AArch64TargetLowering &TLI, const SmallVector &Outs = CLI.Outs; bool IsCalleeWin64 = Subtarget->isCallingConvWin64(CalleeCC); + // For Arm64EC thunks, allocate 32 extra bytes at the bottom of the stack + // for the shadow store. + if (CalleeCC == CallingConv::ARM64EC_Thunk_X64) + CCInfo.AllocateStack(32, Align(16)); + unsigned NumArgs = Outs.size(); for (unsigned i = 0; i != NumArgs; ++i) { MVT ArgVT = Outs[i].VT; @@ -7966,7 +8032,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Callee = DAG.getNode(AArch64ISD::LOADgot, DL, PtrVT, Callee); } else { const GlobalValue *GV = G->getGlobal(); - Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0); + Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, OpFlags); } } else if (auto *S = dyn_cast(Callee)) { if (getTargetMachine().getCodeModel() == CodeModel::Large && @@ -8061,8 +8127,11 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, Function *ARCFn = *objcarc::getAttachedARCFunction(CLI.CB); auto GA = DAG.getTargetGlobalAddress(ARCFn, DL, PtrVT); Ops.insert(Ops.begin() + 1, GA); - } else if (GuardWithBTI) + } else if (CallConv == CallingConv::ARM64EC_Thunk_X64) { + CallOpc = AArch64ISD::CALL_ARM64EC_TO_X64; + } else if (GuardWithBTI) { CallOpc = AArch64ISD::CALL_BTI; + } // Returns a chain and a flag for retval copy to use. Chain = DAG.getNode(CallOpc, DL, NodeTys, Ops); @@ -8262,6 +8331,8 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, getPointerTy(MF.getDataLayout())); unsigned RetValReg = AArch64::X0; + if (CallConv == CallingConv::ARM64EC_Thunk_X64) + RetValReg = AArch64::X8; Chain = DAG.getCopyToReg(Chain, DL, RetValReg, Val, Glue); Glue = Chain.getValue(1); @@ -8287,6 +8358,21 @@ AArch64TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, if (Glue.getNode()) RetOps.push_back(Glue); + if (CallConv == CallingConv::ARM64EC_Thunk_X64) { + // ARM64EC entry thunks use a special return sequence: instead of a regular + // "ret" instruction, they need to explicitly call the emulator. 
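+    // The emulator publishes the return helper through the
+    // __os_arm64x_dispatch_ret global; the code below loads it and emits a
+    // tail call through it in place of the usual RET_GLUE return.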
+    EVT PtrVT = getPointerTy(DAG.getDataLayout());
+    SDValue Arm64ECRetDest =
+        DAG.getExternalSymbol("__os_arm64x_dispatch_ret", PtrVT);
+    Arm64ECRetDest =
+        getAddr(cast<ExternalSymbolSDNode>(Arm64ECRetDest), DAG, 0);
+    Arm64ECRetDest = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Arm64ECRetDest,
+                                 MachinePointerInfo());
+    RetOps.insert(RetOps.begin() + 1, Arm64ECRetDest);
+    RetOps.insert(RetOps.begin() + 2, DAG.getTargetConstant(0, DL, MVT::i32));
+    return DAG.getNode(AArch64ISD::TC_RETURN, DL, MVT::Other, RetOps);
+  }
+
   return DAG.getNode(AArch64ISD::RET_GLUE, DL, MVT::Other, RetOps);
 }
 
@@ -8320,6 +8406,12 @@ SDValue AArch64TargetLowering::getTargetNode(BlockAddressSDNode* N, EVT Ty,
   return DAG.getTargetBlockAddress(N->getBlockAddress(), Ty, 0, Flag);
 }
 
+SDValue AArch64TargetLowering::getTargetNode(ExternalSymbolSDNode *N, EVT Ty,
+                                             SelectionDAG &DAG,
+                                             unsigned Flag) const {
+  return DAG.getTargetExternalSymbol(N->getSymbol(), Ty, Flag);
+}
+
 // (loadGOT sym)
 template <class NodeTy>
 SDValue AArch64TargetLowering::getGOT(NodeTy *N, SelectionDAG &DAG,
@@ -8401,8 +8493,7 @@ SDValue AArch64TargetLowering::LowerGlobalAddress(SDValue Op,
   }
   EVT PtrVT = getPointerTy(DAG.getDataLayout());
   SDLoc DL(GN);
-  if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_DLLIMPORTAUX |
-                 AArch64II::MO_COFFSTUB))
+  if (OpFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB))
     Result = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Result,
                          MachinePointerInfo::getGOT(DAG.getMachineFunction()));
   return Result;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index abecc3560ccbb..6505931e17e18 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -64,6 +64,9 @@ enum NodeType : unsigned {
   RESTORE_ZT,
   SAVE_ZT,
 
+  // A call with the callee in x16, i.e. "blr x16".
+  CALL_ARM64EC_TO_X64,
+
   // Produces the full sequence of instructions for getting the thread pointer
   // offset of a variable into X0, using the TLSDesc model.
TLSDESC_CALLSEQ, @@ -1071,6 +1074,8 @@ class AArch64TargetLowering : public TargetLowering { unsigned Flag) const; SDValue getTargetNode(BlockAddressSDNode *N, EVT Ty, SelectionDAG &DAG, unsigned Flag) const; + SDValue getTargetNode(ExternalSymbolSDNode *N, EVT Ty, SelectionDAG &DAG, + unsigned Flag) const; template SDValue getGOT(NodeTy *N, SelectionDAG &DAG, unsigned Flags = 0) const; template diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 42b7a6418032a..656259727c124 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -1068,6 +1068,8 @@ bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) { case AArch64::SEH_EpilogStart: case AArch64::SEH_EpilogEnd: case AArch64::SEH_PACSignLR: + case AArch64::SEH_SaveAnyRegQP: + case AArch64::SEH_SaveAnyRegQPX: return true; } } @@ -8067,9 +8069,10 @@ AArch64InstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const { {MO_S, "aarch64-s"}, {MO_TLS, "aarch64-tls"}, {MO_DLLIMPORT, "aarch64-dllimport"}, - {MO_DLLIMPORTAUX, "aarch64-dllimportaux"}, {MO_PREL, "aarch64-prel"}, - {MO_TAGGED, "aarch64-tagged"}}; + {MO_TAGGED, "aarch64-tagged"}, + {MO_ARM64EC_CALLMANGLE, "aarch64-arm64ec-callmangle"}, + }; return ArrayRef(TargetFlags); } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index c63f23bda6805..03baa7497615e 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -642,6 +642,11 @@ def AArch64call_rvmarker: SDNode<"AArch64ISD::CALL_RVMARKER", [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, SDNPVariadic]>; +def AArch64call_arm64ec_to_x64 : SDNode<"AArch64ISD::CALL_ARM64EC_TO_X64", + SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>, + [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue, + SDNPVariadic]>; + def AArch64brcond : SDNode<"AArch64ISD::BRCOND", SDT_AArch64Brcond, [SDNPHasChain]>; def AArch64cbz : SDNode<"AArch64ISD::CBZ", SDT_AArch64cbz, @@ -2801,6 +2806,10 @@ let isCall = 1, Defs = [LR], Uses = [SP] in { Sched<[WriteBrReg]>; def BLR_BTI : Pseudo<(outs), (ins variable_ops), []>, Sched<[WriteBrReg]>; + let Uses = [X16, SP] in + def BLR_X16 : Pseudo<(outs), (ins), [(AArch64call_arm64ec_to_x64 X16)]>, + Sched<[WriteBrReg]>, + PseudoInstExpansion<(BLR X16)>; } // isCall def : Pat<(AArch64call GPR64:$Rn), @@ -4831,6 +4840,8 @@ let isPseudo = 1 in { def SEH_EpilogStart : Pseudo<(outs), (ins), []>, Sched<[]>; def SEH_EpilogEnd : Pseudo<(outs), (ins), []>, Sched<[]>; def SEH_PACSignLR : Pseudo<(outs), (ins), []>, Sched<[]>; + def SEH_SaveAnyRegQP : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; + def SEH_SaveAnyRegQPX : Pseudo<(outs), (ins i32imm:$reg0, i32imm:$reg1, i32imm:$offs), []>, Sched<[]>; } // Pseudo instructions for Windows EH diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp index 59969f9dc31ce..1e12cf545fa77 100644 --- a/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp +++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.cpp @@ -36,8 +36,11 @@ AArch64MCInstLower::AArch64MCInstLower(MCContext &ctx, AsmPrinter &printer) MCSymbol * AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { - const GlobalValue *GV = MO.getGlobal(); - unsigned TargetFlags = MO.getTargetFlags(); + return GetGlobalValueSymbol(MO.getGlobal(), MO.getTargetFlags()); +} + +MCSymbol *AArch64MCInstLower::GetGlobalValueSymbol(const GlobalValue *GV, 
+ unsigned TargetFlags) const { const Triple &TheTriple = Printer.TM.getTargetTriple(); if (!TheTriple.isOSBinFormatCOFF()) return Printer.getSymbolPreferLocal(*GV); @@ -46,14 +49,54 @@ AArch64MCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const { "Windows is the only supported COFF target"); bool IsIndirect = - (TargetFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_DLLIMPORTAUX | - AArch64II::MO_COFFSTUB)); - if (!IsIndirect) + (TargetFlags & (AArch64II::MO_DLLIMPORT | AArch64II::MO_COFFSTUB)); + if (!IsIndirect) { + // For ARM64EC, symbol lookup in the MSVC linker has limited awareness + // of ARM64EC mangling ("#"/"$$h"). So object files need to refer to both + // the mangled and unmangled names of ARM64EC symbols, even if they aren't + // actually used by any relocations. Emit the necessary references here. + if (!TheTriple.isWindowsArm64EC() || !isa(GV) || + !GV->hasExternalLinkage()) + return Printer.getSymbol(GV); + + StringRef Name = Printer.getSymbol(GV)->getName(); + // Don't mangle ARM64EC runtime functions. + static constexpr StringLiteral ExcludedFns[] = { + "__os_arm64x_check_icall_cfg", "__os_arm64x_dispatch_call_no_redirect", + "__os_arm64x_check_icall"}; + if (is_contained(ExcludedFns, Name)) + return Printer.getSymbol(GV); + + if (std::optional MangledName = + getArm64ECMangledFunctionName(Name.str())) { + MCSymbol *MangledSym = Ctx.getOrCreateSymbol(MangledName.value()); + if (!cast(GV)->hasMetadata("arm64ec_hasguestexit")) { + Printer.OutStreamer->emitSymbolAttribute(Printer.getSymbol(GV), + MCSA_WeakAntiDep); + Printer.OutStreamer->emitAssignment( + Printer.getSymbol(GV), + MCSymbolRefExpr::create(MangledSym, MCSymbolRefExpr::VK_WEAKREF, + Ctx)); + Printer.OutStreamer->emitSymbolAttribute(MangledSym, MCSA_WeakAntiDep); + Printer.OutStreamer->emitAssignment( + MangledSym, + MCSymbolRefExpr::create(Printer.getSymbol(GV), + MCSymbolRefExpr::VK_WEAKREF, Ctx)); + } + + if (TargetFlags & AArch64II::MO_ARM64EC_CALLMANGLE) + return MangledSym; + } + return Printer.getSymbol(GV); + } SmallString<128> Name; - if (TargetFlags & AArch64II::MO_DLLIMPORTAUX) { + if ((TargetFlags & AArch64II::MO_DLLIMPORT) && + TheTriple.isWindowsArm64EC() && + !(TargetFlags & AArch64II::MO_ARM64EC_CALLMANGLE) && + isa(GV)) { // __imp_aux is specific to arm64EC; it represents the actual address of // an imported function without any thunks. 
// diff --git a/llvm/lib/Target/AArch64/AArch64MCInstLower.h b/llvm/lib/Target/AArch64/AArch64MCInstLower.h index e4f8a1bc1a317..474ccff7d65fd 100644 --- a/llvm/lib/Target/AArch64/AArch64MCInstLower.h +++ b/llvm/lib/Target/AArch64/AArch64MCInstLower.h @@ -9,6 +9,7 @@ #ifndef LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H #define LLVM_LIB_TARGET_AARCH64_AARCH64MCINSTLOWER_H +#include "llvm/IR/GlobalValue.h" #include "llvm/Support/Compiler.h" #include "llvm/TargetParser/Triple.h" @@ -42,6 +43,8 @@ class LLVM_LIBRARY_VISIBILITY AArch64MCInstLower { MCSymbol *Sym) const; MCOperand LowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym) const; + MCSymbol *GetGlobalValueSymbol(const GlobalValue *GV, + unsigned TargetFlags) const; MCSymbol *GetGlobalAddressSymbol(const MachineOperand &MO) const; MCSymbol *GetExternalSymbolSymbol(const MachineOperand &MO) const; }; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp index 24ba9dd95004c..ea9882160d6fb 100644 --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp @@ -78,6 +78,9 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) return CSR_AArch64_AllRegs_SaveList; + if (MF->getFunction().getCallingConv() == CallingConv::ARM64EC_Thunk_X64) + return CSR_Win_AArch64_Arm64EC_Thunk_SaveList; + // Darwin has its own CSR_AArch64_AAPCS_SaveList, which means most CSR save // lists depending on that will need to have their Darwin variant as well. if (MF->getSubtarget().isTargetDarwin()) diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index cf57d950ae8d7..e3a0606331db1 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -398,8 +398,6 @@ AArch64Subtarget::ClassifyGlobalReference(const GlobalValue *GV, if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV)) { if (GV->hasDLLImportStorageClass()) { - if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy()) - return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORTAUX; return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT; } if (getTargetTriple().isOSWindows()) @@ -439,11 +437,18 @@ unsigned AArch64Subtarget::classifyGlobalFunctionReference( return AArch64II::MO_GOT; if (getTargetTriple().isOSWindows()) { - if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy() && - GV->hasDLLImportStorageClass()) { - // On Arm64EC, if we're calling a function directly, use MO_DLLIMPORT, - // not MO_DLLIMPORTAUX. - return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT; + if (isWindowsArm64EC() && GV->getValueType()->isFunctionTy()) { + if (GV->hasDLLImportStorageClass()) { + // On Arm64EC, if we're calling a symbol from the import table + // directly, use MO_ARM64EC_CALLMANGLE. + return AArch64II::MO_GOT | AArch64II::MO_DLLIMPORT | + AArch64II::MO_ARM64EC_CALLMANGLE; + } + if (GV->hasExternalLinkage()) { + // If we're calling a symbol directly, use the mangled form in the + // call instruction. + return AArch64II::MO_ARM64EC_CALLMANGLE; + } } // Use ClassifyGlobalReference for setting MO_DLLIMPORT/MO_COFFSTUB. 
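Note: the MO_ARM64EC_CALLMANGLE paths above rely on the Arm64EC name-mangling
scheme implemented by getArm64ECMangledFunctionName(), added to
AArch64BaseInfo.h further down in this patch: plain C symbols gain a "#"
prefix, while MSVC C++ mangled names (which start with "?") have "$$h" spliced
in after the "@@" that separates the function name from its type encoding.
The following is a minimal standalone C++ sketch of the common cases only;
arm64ECMangle is an illustrative name, not the in-tree helper, and the real
helper additionally handles "@@@" and lone-"@" names:

#include <iostream>
#include <optional>
#include <string>

// Sketch of the Arm64EC mangling rule; returns nullopt for names that are
// already mangled (or that this simplified version does not cover).
static std::optional<std::string> arm64ECMangle(std::string Name) {
  bool IsCppFn = !Name.empty() && Name[0] == '?';
  if (IsCppFn && Name.find("$$h") != std::string::npos)
    return std::nullopt;                 // already mangled
  if (!IsCppFn && !Name.empty() && Name[0] == '#')
    return std::nullopt;                 // already mangled
  if (!IsCppFn)
    return "#" + Name;                   // C symbol: prepend '#'
  size_t Idx = Name.find("@@");
  // NOTE: the real helper treats "@@@" and bare-"@" names specially;
  // this sketch does not.
  if (Idx == std::string::npos)
    return std::nullopt;
  Name.insert(Idx + 2, "$$h");           // C++ symbol: splice in "$$h"
  return Name;
}

int main() {
  std::cout << arm64ECMangle("memcpy").value() << '\n';    // "#memcpy"
  std::cout << arm64ECMangle("?f@@YAXXZ").value() << '\n'; // "?f@@$$hYAXXZ"
}

The C-symbol half of this rule is visible elsewhere in the patch: the runtime
helpers renamed above ("#memcpy", "#fmodf", ...) and the "#"-prefixed entries
in the .hybmp$x section of the entry-thunk test below all carry the '#'
prefix.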
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h index a131cf8a6f540..16864102df59b 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -434,13 +434,13 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo { const char* getChkStkName() const { if (isWindowsArm64EC()) - return "__chkstk_arm64ec"; + return "#__chkstk_arm64ec"; return "__chkstk"; } const char* getSecurityCheckCookieName() const { if (isWindowsArm64EC()) - return "__security_check_cookie_arm64ec"; + return "#__security_check_cookie_arm64ec"; return "__security_check_cookie"; } diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 144610e021c58..6fbc13d8904f2 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -632,8 +632,12 @@ void AArch64PassConfig::addIRPasses() { addPass(createSMEABIPass()); // Add Control Flow Guard checks. - if (TM->getTargetTriple().isOSWindows()) - addPass(createCFGuardCheckPass()); + if (TM->getTargetTriple().isOSWindows()) { + if (TM->getTargetTriple().isWindowsArm64EC()) + addPass(createAArch64Arm64ECCallLoweringPass()); + else + addPass(createCFGuardCheckPass()); + } if (TM->Options.JMCInstrument) addPass(createJMCInstrumenterPass()); diff --git a/llvm/lib/Target/AArch64/CMakeLists.txt b/llvm/lib/Target/AArch64/CMakeLists.txt index cb5f85801c65b..95b228f293204 100644 --- a/llvm/lib/Target/AArch64/CMakeLists.txt +++ b/llvm/lib/Target/AArch64/CMakeLists.txt @@ -42,6 +42,7 @@ add_llvm_target(AArch64CodeGen GISel/AArch64RegisterBankInfo.cpp AArch64A57FPLoadBalancing.cpp AArch64AdvSIMDScalarPass.cpp + AArch64Arm64ECCallLowering.cpp AArch64AsmPrinter.cpp AArch64BranchTargets.cpp AArch64CallingConvention.cpp diff --git a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp index 84057ea8d2214..ba70187994ad0 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp @@ -637,7 +637,18 @@ bool AArch64CallLowering::lowerFormalArguments( MachineRegisterInfo &MRI = MF.getRegInfo(); auto &DL = F.getParent()->getDataLayout(); auto &Subtarget = MF.getSubtarget(); - // TODO: Support Arm64EC + + // Arm64EC has extra requirements for varargs calls which are only implemented + // in SelectionDAG; bail out for now. + if (F.isVarArg() && Subtarget.isWindowsArm64EC()) + return false; + + // Arm64EC thunks have a special calling convention which is only implemented + // in SelectionDAG; bail out for now. + if (F.getCallingConv() == CallingConv::ARM64EC_Thunk_Native || + F.getCallingConv() == CallingConv::ARM64EC_Thunk_X64) + return false; + bool IsWin64 = Subtarget.isCallingConvWin64(F.getCallingConv()) && !Subtarget.isWindowsArm64EC(); SmallVector SplitArgs; @@ -1205,7 +1216,16 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const AArch64Subtarget &Subtarget = MF.getSubtarget(); // Arm64EC has extra requirements for varargs calls; bail out for now. - if (Info.IsVarArg && Subtarget.isWindowsArm64EC()) + // + // Arm64EC has special mangling rules for calls; bail out on all calls for + // now. + if (Subtarget.isWindowsArm64EC()) + return false; + + // Arm64EC thunks have a special calling convention which is only implemented + // in SelectionDAG; bail out for now. 
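+  // (On Arm64EC targets the blanket bail-out above already returns before
+  // this point is reached; the check below documents the constraint
+  // independently of it.)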
+ if (Info.CallConv == CallingConv::ARM64EC_Thunk_Native || + Info.CallConv == CallingConv::ARM64EC_Thunk_X64) return false; SmallVector OutArgs; diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h index 976e72e8aae47..10e69655f77e1 100644 --- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -248,6 +248,34 @@ static inline bool atomicBarrierDroppedOnZero(unsigned Opcode) { return false; } +static inline std::optional +getArm64ECMangledFunctionName(std::string Name) { + bool IsCppFn = Name[0] == '?'; + if (IsCppFn && Name.find("$$h") != std::string::npos) + return std::nullopt; + if (!IsCppFn && Name[0] == '#') + return std::nullopt; + + StringRef Prefix = "$$h"; + size_t InsertIdx = 0; + if (IsCppFn) { + InsertIdx = Name.find("@@"); + size_t ThreeAtSignsIdx = Name.find("@@@"); + if (InsertIdx != std::string::npos && InsertIdx != ThreeAtSignsIdx) { + InsertIdx += 2; + } else { + InsertIdx = Name.find("@"); + if (InsertIdx != std::string::npos) + InsertIdx++; + } + } else { + Prefix = "#"; + } + + Name.insert(Name.begin() + InsertIdx, Prefix.begin(), Prefix.end()); + return std::optional(Name); +} + namespace AArch64CC { // The CondCodes constants map directly to the 4-bit encoding of the condition @@ -795,12 +823,11 @@ namespace AArch64II { /// an LDG instruction to obtain the tag value. MO_TAGGED = 0x400, - /// MO_DLLIMPORTAUX - Symbol refers to "auxilliary" import stub. On - /// Arm64EC, there are two kinds of import stubs used for DLL import of - /// functions: MO_DLLIMPORT refers to natively callable Arm64 code, and - /// MO_DLLIMPORTAUX refers to the original address which can be compared - /// for equality. - MO_DLLIMPORTAUX = 0x800, + /// MO_ARM64EC_CALLMANGLE - Operand refers to the Arm64EC-mangled version + /// of a symbol, not the original. For dllimport symbols, this means it + /// uses "__imp_aux". For other symbols, this means it uses the mangled + /// ("#" prefix for C) name. + MO_ARM64EC_CALLMANGLE = 0x800, }; } // end namespace AArch64II diff --git a/llvm/test/CodeGen/AArch64/arm64ec-dllimport.ll b/llvm/test/CodeGen/AArch64/arm64ec-dllimport.ll index c1350d55f72e7..b51a291876b9d 100644 --- a/llvm/test/CodeGen/AArch64/arm64ec-dllimport.ll +++ b/llvm/test/CodeGen/AArch64/arm64ec-dllimport.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=arm64ec-pc-windows-msvc < %s | FileCheck %s +; RUN: llc -mtriple=arm64ec-pc-windows-msvc -arm64ec-generate-thunks=false < %s | FileCheck %s @a = external dllimport global i32 declare dllimport void @b() diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll new file mode 100644 index 0000000000000..00ae34bf4b00f --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll @@ -0,0 +1,20 @@ +; RUN: llc -mtriple=arm64ec-pc-windows-msvc < %s | FileCheck %s + +; Validates when local linkage functions get a thunk generated. + +; Being called does not cause a thunk to be generated. +; CHECK-NOT: $ientry_thunk$cdecl$v$f; +define internal void @does_not_have_addr_taken(float) nounwind { + ret void +} +define void @calls_does_not_have_addr_taken() nounwind { + call void @does_not_have_addr_taken(float 0.0) + ret void +} + +; Having an address taken does cause a thunk to be generated. 
+; CHECK: $ientry_thunk$cdecl$v$i8; +define internal void @has_addr_taken(i64) nounwind { + ret void +} +@points_to_has_addr_taken = global ptr @has_addr_taken diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll new file mode 100644 index 0000000000000..5c56f51e1ca55 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks.ll @@ -0,0 +1,470 @@ +; RUN: llc -mtriple=arm64ec-pc-windows-msvc < %s | FileCheck %s + +define void @no_op() nounwind { +; CHECK-LABEL .def $ientry_thunk$cdecl$v$v; +; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$v$v +; CHECK: // %bb.0: +; CHECK-NEXT: stp q6, q7, [sp, #-176]! // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: stp q8, q9, [sp, #32] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: stp q10, q11, [sp, #64] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: stp q12, q13, [sp, #96] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: stp q14, q15, [sp, #128] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: stp x29, x30, [sp, #160] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: add x29, sp, #160 +; CHECK-NEXT: .seh_add_fp 160 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: blr x9 +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret +; CHECK-NEXT: ldr x0, [x8, :lo12:__os_arm64x_dispatch_ret] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #160] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: ldp q14, q15, [sp, #128] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: ldp q12, q13, [sp, #96] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: ldp q10, q11, [sp, #64] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: ldp q8, q9, [sp, #32] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: ldp q6, q7, [sp], #176 // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x0 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + ret void +} + +define i64 @simple_integers(i8, i16, i32, i64) nounwind { +; CHECK-LABEL: .def $ientry_thunk$cdecl$i8$i8i8i8i8; +; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$i8$i8i8i8i8 +; CHECK: // %bb.0: +; CHECK-NEXT: stp q6, q7, [sp, #-176]! 
// 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: stp q8, q9, [sp, #32] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: stp q10, q11, [sp, #64] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: stp q12, q13, [sp, #96] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: stp q14, q15, [sp, #128] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: stp x29, x30, [sp, #160] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: add x29, sp, #160 +; CHECK-NEXT: .seh_add_fp 160 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: blr x9 +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret +; CHECK-NEXT: ldr x1, [x8, :lo12:__os_arm64x_dispatch_ret] +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #160] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: ldp q14, q15, [sp, #128] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: ldp q12, q13, [sp, #96] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: ldp q10, q11, [sp, #64] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: ldp q8, q9, [sp, #32] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: ldp q6, q7, [sp], #176 // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x1 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + ret i64 0 +} + +; NOTE: Only float and double are supported. +define double @simple_floats(float, double) nounwind { +; CHECK-LABEL: .def $ientry_thunk$cdecl$d$fd; +; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$d$fd +; CHECK: // %bb.0: +; CHECK-NEXT: stp q6, q7, [sp, #-176]! 
// 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: stp q8, q9, [sp, #32] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: stp q10, q11, [sp, #64] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: stp q12, q13, [sp, #96] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: stp q14, q15, [sp, #128] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: stp x29, x30, [sp, #160] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: add x29, sp, #160 +; CHECK-NEXT: .seh_add_fp 160 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: blr x9 +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret +; CHECK-NEXT: ldr x0, [x8, :lo12:__os_arm64x_dispatch_ret] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #160] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: ldp q14, q15, [sp, #128] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: ldp q12, q13, [sp, #96] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: ldp q10, q11, [sp, #64] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: ldp q8, q9, [sp, #32] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: ldp q6, q7, [sp], #176 // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x0 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + ret double 0.0 +} + +define void @has_varargs(...) nounwind { +; CHECK-LABEL: .def $ientry_thunk$cdecl$v$varargs; +; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$v$varargs +; CHECK: // %bb.0: +; CHECK-NEXT: stp q6, q7, [sp, #-176]! 
// 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: stp q8, q9, [sp, #32] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: stp q10, q11, [sp, #64] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: stp q12, q13, [sp, #96] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: stp q14, q15, [sp, #128] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: stp x29, x30, [sp, #160] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: add x29, sp, #160 +; CHECK-NEXT: .seh_add_fp 160 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: ldp x8, x5, [x4, #32] +; CHECK-NEXT: mov x4, x8 +; CHECK-NEXT: blr x9 +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret +; CHECK-NEXT: ldr x0, [x8, :lo12:__os_arm64x_dispatch_ret] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #160] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: ldp q14, q15, [sp, #128] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: ldp q12, q13, [sp, #96] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: ldp q10, q11, [sp, #64] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: ldp q8, q9, [sp, #32] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: ldp q6, q7, [sp], #176 // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x0 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + ret void +} + +define void @has_sret(ptr sret([100 x i8])) nounwind { +; CHECK-LABEL: .def $ientry_thunk$cdecl$i8$v; +; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$i8$v +; CHECK: // %bb.0: +; CHECK-NEXT: stp q6, q7, [sp, #-176]! 
// 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: stp q8, q9, [sp, #32] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: stp q10, q11, [sp, #64] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: stp q12, q13, [sp, #96] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: stp q14, q15, [sp, #128] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: stp x29, x30, [sp, #160] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: add x29, sp, #160 +; CHECK-NEXT: .seh_add_fp 160 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: blr x9 +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret +; CHECK-NEXT: ldr x1, [x8, :lo12:__os_arm64x_dispatch_ret] +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #160] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: ldp q14, q15, [sp, #128] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: ldp q12, q13, [sp, #96] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: ldp q10, q11, [sp, #64] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: ldp q8, q9, [sp, #32] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: ldp q6, q7, [sp], #176 // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x1 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + ret void +} + +define i8 @matches_has_sret() nounwind { +; Verify that $ientry_thunk$cdecl$i8$v is re-used by a function with matching signature. +; CHECK-NOT: .def $ientry_thunk$cdecl$i8$v; + ret i8 0 +} + +%TSRet = type { i64, i64 } +define void @has_aligned_sret(ptr align 32 sret(%TSRet)) nounwind { +; CHECK-LABEL: .def $ientry_thunk$cdecl$m16a32$v; +; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m16a32$v +; CHECK: // %bb.0: +; CHECK-NEXT: stp q6, q7, [sp, #-176]! 
// 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: stp q8, q9, [sp, #32] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: stp q10, q11, [sp, #64] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: stp q12, q13, [sp, #96] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: stp q14, q15, [sp, #128] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: stp x29, x30, [sp, #160] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: add x29, sp, #160 +; CHECK-NEXT: .seh_add_fp 160 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: blr x9 +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret +; CHECK-NEXT: ldr x0, [x8, :lo12:__os_arm64x_dispatch_ret] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #160] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 160 +; CHECK-NEXT: ldp q14, q15, [sp, #128] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: ldp q12, q13, [sp, #96] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: ldp q10, q11, [sp, #64] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: ldp q8, q9, [sp, #32] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: ldp q6, q7, [sp], #176 // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_px q6, 176 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x0 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + ret void +} + +define [2 x i8] @small_array([2 x i8] %0, [2 x float]) nounwind { +; CHECK-LABEL: .def $ientry_thunk$cdecl$m2$m2F8; +; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m2$m2F8 +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #192 +; CHECK-NEXT: .seh_stackalloc 192 +; CHECK-NEXT: stp q6, q7, [sp, #16] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q6, 16 +; CHECK-NEXT: stp q8, q9, [sp, #48] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q8, 48 +; CHECK-NEXT: stp q10, q11, [sp, #80] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q10, 80 +; CHECK-NEXT: stp q12, q13, [sp, #112] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q12, 112 +; CHECK-NEXT: stp q14, q15, [sp, #144] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q14, 144 +; CHECK-NEXT: stp x29, x30, [sp, #176] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 176 +; CHECK-NEXT: add x29, sp, #176 +; CHECK-NEXT: .seh_add_fp 176 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: stur x1, [sp, #4] +; CHECK-NEXT: ubfx w1, w0, #8, #8 +; CHECK-NEXT: ldp s0, s1, [sp, #4] +; CHECK-NEXT: strh w0, [sp, #14] +; CHECK-NEXT: blr x9 +; CHECK-NEXT: adrp x9, __os_arm64x_dispatch_ret +; CHECK-NEXT: strb w0, [sp, #2] +; CHECK-NEXT: strb w1, [sp, #3] +; CHECK-NEXT: ldrh w8, [sp, #2] +; CHECK-NEXT: ldr x0, [x9, :lo12:__os_arm64x_dispatch_ret] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #176] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 176 +; CHECK-NEXT: ldp q14, q15, [sp, #144] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q14, 144 +; CHECK-NEXT: ldp q12, q13, [sp, #112] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q12, 112 +; CHECK-NEXT: ldp q10, q11, [sp, #80] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q10, 80 +; CHECK-NEXT: ldp q8, q9, [sp, #48] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q8, 48 +; CHECK-NEXT: 
ldp q6, q7, [sp, #16] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q6, 16 +; CHECK-NEXT: add sp, sp, #192 +; CHECK-NEXT: .seh_stackalloc 192 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x0 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + ret [2 x i8] %0 +} + +define [3 x i64] @large_array([3 x i64] %0, [2 x double], [2 x [2 x i64]]) nounwind { +; CHECK-LABEL: .def $ientry_thunk$cdecl$m24$m24D16m32; +; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m24$m24D16m32 +; CHECK: // %bb.0: +; CHECK-NEXT: stp q6, q7, [sp, #-192]! // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_px q6, 192 +; CHECK-NEXT: stp q8, q9, [sp, #32] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: stp q10, q11, [sp, #64] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: stp q12, q13, [sp, #96] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: stp q14, q15, [sp, #128] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: str x19, [sp, #160] // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg x19, 160 +; CHECK-NEXT: stp x29, x30, [sp, #168] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 168 +; CHECK-NEXT: add x29, sp, #168 +; CHECK-NEXT: .seh_add_fp 168 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: ldp x10, x8, [x1, #8] +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: ldp d0, d1, [x2] +; CHECK-NEXT: ldr x0, [x1] +; CHECK-NEXT: ldp x5, x6, [x3, #16] +; CHECK-NEXT: ldp x3, x4, [x3] +; CHECK-NEXT: mov x1, x10 +; CHECK-NEXT: mov x2, x8 +; CHECK-NEXT: blr x9 +; CHECK-NEXT: stp x0, x1, [x19] +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_ret +; CHECK-NEXT: str x2, [x19, #16] +; CHECK-NEXT: ldr x0, [x8, :lo12:__os_arm64x_dispatch_ret] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #168] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 168 +; CHECK-NEXT: ldr x19, [sp, #160] // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg x19, 160 +; CHECK-NEXT: ldp q14, q15, [sp, #128] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q14, 128 +; CHECK-NEXT: ldp q12, q13, [sp, #96] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q12, 96 +; CHECK-NEXT: ldp q10, q11, [sp, #64] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q10, 64 +; CHECK-NEXT: ldp q8, q9, [sp, #32] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q8, 32 +; CHECK-NEXT: ldp q6, q7, [sp], #192 // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_px q6, 192 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x0 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + ret [3 x i64] %0 +} + +%T1 = type { i16 } +%T2 = type { i32, float } +%T3 = type { i64, double } +%T4 = type { i64, double, i8 } +define %T2 @simple_struct(%T1 %0, %T2 %1, %T3, %T4) nounwind { +; CHECK-LABEL: .def $ientry_thunk$cdecl$m8$i8m8m16m24; +; CHECK: .section .wowthk$aa,"xr",discard,$ientry_thunk$cdecl$m8$i8m8m16m24 +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #192 +; CHECK-NEXT: .seh_stackalloc 192 +; CHECK-NEXT: stp q6, q7, [sp, #16] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q6, 16 +; CHECK-NEXT: stp q8, q9, [sp, #48] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q8, 48 +; CHECK-NEXT: stp q10, q11, [sp, #80] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q10, 80 +; CHECK-NEXT: stp q12, q13, [sp, #112] // 32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q12, 112 +; CHECK-NEXT: stp q14, q15, [sp, #144] // 
32-byte Folded Spill +; CHECK-NEXT: .seh_save_any_reg_p q14, 144 +; CHECK-NEXT: stp x29, x30, [sp, #176] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 176 +; CHECK-NEXT: add x29, sp, #176 +; CHECK-NEXT: .seh_add_fp 176 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: str x1, [sp, #8] +; CHECK-NEXT: ldr x8, [x2] +; CHECK-NEXT: ldr x10, [x3] +; CHECK-NEXT: ldr d1, [x2, #8] +; CHECK-NEXT: // kill: def $w1 killed $w1 killed $x1 +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldr d2, [x3, #8] +; CHECK-NEXT: mov x2, x8 +; CHECK-NEXT: ldrb w4, [x3, #16] +; CHECK-NEXT: mov x3, x10 +; CHECK-NEXT: blr x9 +; CHECK-NEXT: adrp x9, __os_arm64x_dispatch_ret +; CHECK-NEXT: str w0, [sp] +; CHECK-NEXT: str s0, [sp, #4] +; CHECK-NEXT: ldr x8, [sp] +; CHECK-NEXT: ldr x0, [x9, :lo12:__os_arm64x_dispatch_ret] +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #176] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 176 +; CHECK-NEXT: ldp q14, q15, [sp, #144] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q14, 144 +; CHECK-NEXT: ldp q12, q13, [sp, #112] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q12, 112 +; CHECK-NEXT: ldp q10, q11, [sp, #80] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q10, 80 +; CHECK-NEXT: ldp q8, q9, [sp, #48] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q8, 48 +; CHECK-NEXT: ldp q6, q7, [sp, #16] // 32-byte Folded Reload +; CHECK-NEXT: .seh_save_any_reg_p q6, 16 +; CHECK-NEXT: add sp, sp, #192 +; CHECK-NEXT: .seh_stackalloc 192 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x0 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + ret %T2 %1 +} + +; Verify the hybrid bitmap +; CHECK-LABEL: .section .hybmp$x,"yi" +; CHECK-NEXT: .symidx "#no_op" +; CHECK-NEXT: .symidx $ientry_thunk$cdecl$v$v +; CHECK-NEXT: .word 1 +; CHECK-NEXT: .symidx "#simple_integers" +; CHECK-NEXT: .symidx $ientry_thunk$cdecl$i8$i8i8i8i8 +; CHECK-NEXT: .word 1 +; CHECK-NEXT: .symidx "#simple_floats" +; CHECK-NEXT: .symidx $ientry_thunk$cdecl$d$fd +; CHECK-NEXT: .word 1 +; CHECK-NEXT: .symidx "#has_varargs" +; CHECK-NEXT: .symidx $ientry_thunk$cdecl$v$varargs +; CHECK-NEXT: .word 1 +; CHECK-NEXT: .symidx "#has_sret" +; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m100$v +; CHECK-NEXT: .word 1 +; CHECK-NEXT: .symidx "#matches_has_sret" +; CHECK-NEXT: .symidx $ientry_thunk$cdecl$i8$v +; CHECK-NEXT: .word 1 +; CHECK-NEXT: .symidx "#has_aligned_sret" +; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m16a32$v +; CHECK-NEXT: .word 1 +; CHECK-NEXT: .symidx "#small_array" +; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m2$m2F8 +; CHECK-NEXT: .word 1 +; CHECK-NEXT: .symidx "#large_array" +; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m24$m24D16m32 +; CHECK-NEXT: .word 1 +; CHECK-NEXT: .symidx "#simple_struct" +; CHECK-NEXT: .symidx $ientry_thunk$cdecl$m8$i8m8m16m24 +; CHECK-NEXT: .word 1 diff --git a/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll new file mode 100644 index 0000000000000..3b911e78aff2a --- /dev/null +++ b/llvm/test/CodeGen/AArch64/arm64ec-exit-thunks.ll @@ -0,0 +1,533 @@ +; RUN: llc -mtriple=arm64ec-pc-windows-msvc < %s | FileCheck %s + +declare void @no_op() nounwind; +; CHECK-LABEL: .def $iexit_thunk$cdecl$v$v; +; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$v$v +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 32 +; CHECK-NEXT: add x29, sp, #32 +; 
CHECK-NEXT: .seh_add_fp 32 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect +; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect] +; CHECK-NEXT: blr x16 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 32 +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc +; CHECK-LABEL: .def "#no_op$exit_thunk"; +; CHECK: .section .wowthk$aa,"xr",discard,"#no_op$exit_thunk" +; CHECK: .weak_anti_dep no_op +; CHECK: .weak_anti_dep "#no_op" +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall +; CHECK-NEXT: adrp x11, no_op +; CHECK-NEXT: add x11, x11, :lo12:no_op +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$v$v) +; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$v$v) +; CHECK-NEXT: blr x8 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x11 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + +declare i64 @simple_integers(i8, i16, i32, i64) nounwind; +; CHECK-LABEL: .def $iexit_thunk$cdecl$i8$i8i8i8i8; +; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$i8$i8i8i8i8 +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .seh_save_fplr 32 +; CHECK-NEXT: add x29, sp, #32 +; CHECK-NEXT: .seh_add_fp 32 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect +; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect] +; CHECK-NEXT: blr x16 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: .seh_save_fplr 32 +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: .seh_stackalloc 48 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: ret +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc +; CHECK-LABEL: .def "#simple_integers$exit_thunk"; +; CHECK: .section .wowthk$aa,"xr",discard,"#simple_integers$exit_thunk" +; CHECK: .weak_anti_dep simple_integers +; CHECK: .weak_anti_dep "#simple_integers" +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endprologue +; CHECK-NEXT: adrp x8, __os_arm64x_check_icall +; CHECK-NEXT: adrp x11, simple_integers +; CHECK-NEXT: add x11, x11, :lo12:simple_integers +; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall] +; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$i8$i8i8i8i8) +; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$i8$i8i8i8i8) +; CHECK-NEXT: blr x8 +; CHECK-NEXT: .seh_startepilogue +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: .seh_save_reg_x x30, 16 +; CHECK-NEXT: .seh_endepilogue +; CHECK-NEXT: br x11 +; CHECK-NEXT: .seh_endfunclet +; CHECK-NEXT: .seh_endproc + +; NOTE: Only float and double are supported. 
+declare double @simple_floats(float, double) nounwind;
+; CHECK-LABEL: .def $iexit_thunk$cdecl$d$fd;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$d$fd
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: .seh_stackalloc 48
+; CHECK-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 32
+; CHECK-NEXT: add x29, sp, #32
+; CHECK-NEXT: .seh_add_fp 32
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT: blr x16
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 32
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: .seh_stackalloc 48
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+; CHECK-LABEL: .def "#simple_floats$exit_thunk";
+; CHECK: .section .wowthk$aa,"xr",discard,"#simple_floats$exit_thunk"
+; CHECK: .weak_anti_dep simple_floats
+; CHECK: .weak_anti_dep "#simple_floats"
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_check_icall
+; CHECK-NEXT: adrp x11, simple_floats
+; CHECK-NEXT: add x11, x11, :lo12:simple_floats
+; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall]
+; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$d$fd)
+; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$d$fd)
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x11
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+
+declare void @has_varargs(...) nounwind;
+; CHECK-LABEL: .def $iexit_thunk$cdecl$v$varargs;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$v$varargs
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: .seh_stackalloc 64
+; CHECK-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 48
+; CHECK-NEXT: add x29, sp, #48
+; CHECK-NEXT: .seh_add_fp 48
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT: stp x4, x5, [sp, #32]
+; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT: blr x16
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 48
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: .seh_stackalloc 64
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+; CHECK-LABEL: .def "#has_varargs$exit_thunk";
+; CHECK: .section .wowthk$aa,"xr",discard,"#has_varargs$exit_thunk"
+; CHECK: .weak_anti_dep has_varargs
+; CHECK: .weak_anti_dep "#has_varargs"
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_check_icall
+; CHECK-NEXT: adrp x11, has_varargs
+; CHECK-NEXT: add x11, x11, :lo12:has_varargs
+; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall]
+; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$v$varargs)
+; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$v$varargs)
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x11
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+
+declare void @has_sret(ptr sret([100 x i8])) nounwind;
+; CHECK-LABEL: .def $iexit_thunk$cdecl$m100$v;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m100$v
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: .seh_stackalloc 48
+; CHECK-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 32
+; CHECK-NEXT: add x29, sp, #32
+; CHECK-NEXT: .seh_add_fp 32
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT: blr x16
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 32
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: .seh_stackalloc 48
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+; CHECK-LABEL: .def "#has_sret$exit_thunk";
+; CHECK: .section .wowthk$aa,"xr",discard,"#has_sret$exit_thunk"
+; CHECK: .weak_anti_dep has_sret
+; CHECK: .weak_anti_dep "#has_sret"
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x9, __os_arm64x_check_icall
+; CHECK-NEXT: adrp x11, has_sret
+; CHECK-NEXT: add x11, x11, :lo12:has_sret
+; CHECK-NEXT: ldr x9, [x9, :lo12:__os_arm64x_check_icall]
+; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m100$v)
+; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m100$v)
+; CHECK-NEXT: blr x9
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x11
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+
+%TSRet = type { i64, i64 }
+declare void @has_aligned_sret(ptr align 32 sret(%TSRet)) nounwind;
+; CHECK-LABEL: .def $iexit_thunk$cdecl$m16a32$v;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m16a32$v
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #48
+; CHECK-NEXT: .seh_stackalloc 48
+; CHECK-NEXT: stp x29, x30, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 32
+; CHECK-NEXT: add x29, sp, #32
+; CHECK-NEXT: .seh_add_fp 32
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT: blr x16
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 32
+; CHECK-NEXT: add sp, sp, #48
+; CHECK-NEXT: .seh_stackalloc 48
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+; CHECK-LABEL: .def "#has_aligned_sret$exit_thunk";
+; CHECK: .section .wowthk$aa,"xr",discard,"#has_aligned_sret$exit_thunk"
+; CHECK: .weak_anti_dep has_aligned_sret
+; CHECK: .weak_anti_dep "#has_aligned_sret"
+; CHECK: // %bb.0:
+; CHECK: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK: .seh_save_reg_x x30, 16
+; CHECK: .seh_endprologue
+; CHECK: adrp x9, __os_arm64x_check_icall
+; CHECK: adrp x11, has_aligned_sret
+; CHECK: add x11, x11, :lo12:has_aligned_sret
+; CHECK: ldr x9, [x9, :lo12:__os_arm64x_check_icall]
+; CHECK: adrp x10, ($iexit_thunk$cdecl$m16a32$v)
+; CHECK: add x10, x10, :lo12:($iexit_thunk$cdecl$m16a32$v)
+; CHECK: blr x9
+; CHECK: .seh_startepilogue
+; CHECK: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK: .seh_save_reg_x x30, 16
+; CHECK: .seh_endepilogue
+; CHECK: br x11
+; CHECK: .seh_endfunclet
+; CHECK: .seh_endproc
+
+declare [2 x i8] @small_array([2 x i8], [2 x float]) nounwind;
+; CHECK-LABEL: .def $iexit_thunk$cdecl$m2$m2F8;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m2$m2F8
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #64
+; CHECK-NEXT: .seh_stackalloc 64
+; CHECK-NEXT: stp x29, x30, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 48
+; CHECK-NEXT: add x29, sp, #48
+; CHECK-NEXT: .seh_add_fp 48
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: sturb w1, [x29, #-1]
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT: sturb w0, [x29, #-2]
+; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT: stp s0, s1, [x29, #-12]
+; CHECK-NEXT: ldurh w0, [x29, #-2]
+; CHECK-NEXT: ldur x1, [x29, #-12]
+; CHECK-NEXT: blr x16
+; CHECK-NEXT: mov w0, w8
+; CHECK-NEXT: sturh w8, [x29, #-14]
+; CHECK-NEXT: ubfx w1, w8, #8, #8
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 48
+; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: .seh_stackalloc 64
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+; CHECK-LABEL: .def "#small_array$exit_thunk";
+; CHECK: .section .wowthk$aa,"xr",discard,"#small_array$exit_thunk"
+; CHECK: .weak_anti_dep small_array
+; CHECK: .weak_anti_dep "#small_array"
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_check_icall
+; CHECK-NEXT: adrp x11, small_array
+; CHECK-NEXT: add x11, x11, :lo12:small_array
+; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall]
+; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m2$m2F8)
+; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m2$m2F8)
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x11
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+
+declare [3 x i64] @large_array([3 x i64], [2 x double], [2 x [2 x i64]]) nounwind;
+; CHECK-LABEL: .def $iexit_thunk$cdecl$m24$m24D16m32;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m24$m24D16m32
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #144
+; CHECK-NEXT: .seh_stackalloc 144
+; CHECK-NEXT: stp x29, x30, [sp, #128] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 128
+; CHECK-NEXT: add x29, sp, #128
+; CHECK-NEXT: .seh_add_fp 128
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT: stp x0, x1, [x29, #-48]
+; CHECK-NEXT: sub x0, x29, #24
+; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT: stur x2, [x29, #-32]
+; CHECK-NEXT: sub x1, x29, #48
+; CHECK-NEXT: stp x3, x4, [sp, #32]
+; CHECK-NEXT: add x2, sp, #64
+; CHECK-NEXT: add x3, sp, #32
+; CHECK-NEXT: stp d0, d1, [sp, #64]
+; CHECK-NEXT: stp x5, x6, [sp, #48]
+; CHECK-NEXT: blr x16
+; CHECK-NEXT: ldp x0, x1, [x29, #-24]
+; CHECK-NEXT: ldur x2, [x29, #-8]
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #128] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 128
+; CHECK-NEXT: add sp, sp, #144
+; CHECK-NEXT: .seh_stackalloc 144
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+; CHECK-LABEL: .def "#large_array$exit_thunk";
+; CHECK: .section .wowthk$aa,"xr",discard,"#large_array$exit_thunk"
+; CHECK: .weak_anti_dep large_array
+; CHECK: .weak_anti_dep "#large_array"
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_check_icall
+; CHECK-NEXT: adrp x11, large_array
+; CHECK-NEXT: add x11, x11, :lo12:large_array
+; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall]
+; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m24$m24D16m32)
+; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m24$m24D16m32)
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x11
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+
+%T1 = type { i16 }
+%T2 = type { i32, float }
+%T3 = type { i64, double }
+%T4 = type { i64, double, i8 }
+declare %T2 @simple_struct(%T1, %T2, %T3, %T4) nounwind;
+; CHECK-LABEL: .def $iexit_thunk$cdecl$m8$i8m8m16m24;
+; CHECK: .section .wowthk$aa,"xr",discard,$iexit_thunk$cdecl$m8$i8m8m16m24
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #112
+; CHECK-NEXT: .seh_stackalloc 112
+; CHECK-NEXT: stp x29, x30, [sp, #96] // 16-byte Folded Spill
+; CHECK-NEXT: .seh_save_fplr 96
+; CHECK-NEXT: add x29, sp, #96
+; CHECK-NEXT: .seh_add_fp 96
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: stur w1, [x29, #-8]
+; CHECK-NEXT: adrp x8, __os_arm64x_dispatch_call_no_redirect
+; CHECK-NEXT: stur s0, [x29, #-4]
+; CHECK-NEXT: ldr x16, [x8, :lo12:__os_arm64x_dispatch_call_no_redirect]
+; CHECK-NEXT: ldur x1, [x29, #-8]
+; CHECK-NEXT: stur x2, [x29, #-24]
+; CHECK-NEXT: sub x2, x29, #24
+; CHECK-NEXT: str x3, [sp, #48]
+; CHECK-NEXT: add x3, sp, #48
+; CHECK-NEXT: stur d1, [x29, #-16]
+; CHECK-NEXT: str d2, [sp, #56]
+; CHECK-NEXT: strb w4, [sp, #64]
+; CHECK-NEXT: blr x16
+; CHECK-NEXT: str x8, [sp, #40]
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ldr s0, [sp, #44]
+; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldp x29, x30, [sp, #96] // 16-byte Folded Reload
+; CHECK-NEXT: .seh_save_fplr 96
+; CHECK-NEXT: add sp, sp, #112
+; CHECK-NEXT: .seh_stackalloc 112
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: ret
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+; CHECK-LABEL: .def "#simple_struct$exit_thunk";
+; CHECK: .section .wowthk$aa,"xr",discard,"#simple_struct$exit_thunk"
+; CHECK: .weak_anti_dep simple_struct
+; CHECK: .weak_anti_dep "#simple_struct"
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endprologue
+; CHECK-NEXT: adrp x8, __os_arm64x_check_icall
+; CHECK-NEXT: adrp x11, simple_struct
+; CHECK-NEXT: add x11, x11, :lo12:simple_struct
+; CHECK-NEXT: ldr x8, [x8, :lo12:__os_arm64x_check_icall]
+; CHECK-NEXT: adrp x10, ($iexit_thunk$cdecl$m8$i8m8m16m24)
+; CHECK-NEXT: add x10, x10, :lo12:($iexit_thunk$cdecl$m8$i8m8m16m24)
+; CHECK-NEXT: blr x8
+; CHECK-NEXT: .seh_startepilogue
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: .seh_save_reg_x x30, 16
+; CHECK-NEXT: .seh_endepilogue
+; CHECK-NEXT: br x11
+; CHECK-NEXT: .seh_endfunclet
+; CHECK-NEXT: .seh_endproc
+
+; CHECK-LABEL: .section .hybmp$x,"yi"
+; CHECK-NEXT: .symidx "#func_caller"
+; CHECK-NEXT: .symidx $ientry_thunk$cdecl$v$v
+; CHECK-NEXT: .word 1
+; CHECK-NEXT: .symidx no_op
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$v$v
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#no_op$exit_thunk"
+; CHECK-NEXT: .symidx no_op
+; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx simple_integers
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$i8$i8i8i8i8
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#simple_integers$exit_thunk"
+; CHECK-NEXT: .symidx simple_integers
+; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx simple_floats
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$d$fd
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#simple_floats$exit_thunk"
+; CHECK-NEXT: .symidx simple_floats
+; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx has_varargs
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$v$varargs
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#has_varargs$exit_thunk"
+; CHECK-NEXT: .symidx has_varargs
+; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx has_sret
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m100$v
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#has_sret$exit_thunk"
+; CHECK-NEXT: .symidx has_sret
+; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx has_aligned_sret
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m16a32$v
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#has_aligned_sret$exit_thunk"
+; CHECK-NEXT: .symidx has_aligned_sret
+; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx small_array
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m2$m2F8
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#small_array$exit_thunk"
+; CHECK-NEXT: .symidx small_array
+; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx large_array
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m24$m24D16m32
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#large_array$exit_thunk"
+; CHECK-NEXT: .symidx large_array
+; CHECK-NEXT: .word 0
+; CHECK-NEXT: .symidx simple_struct
+; CHECK-NEXT: .symidx $iexit_thunk$cdecl$m8$i8m8m16m24
+; CHECK-NEXT: .word 4
+; CHECK-NEXT: .symidx "#simple_struct$exit_thunk"
+; CHECK-NEXT: .symidx simple_struct
+; CHECK-NEXT: .word 0
+
+define void @func_caller() nounwind {
+  call void @no_op()
+  call i64 @simple_integers(i8 0, i16 0, i32 0, i64 0)
+  call double @simple_floats(float 0.0, double 0.0)
+  call void (...) @has_varargs()
+  %c = alloca i8
+  call void @has_sret(ptr sret([100 x i8]) %c)
+  %aligned = alloca %TSRet, align 32
+  store %TSRet { i64 0, i64 0 }, ptr %aligned, align 32
+  call void @has_aligned_sret(ptr align 32 sret(%TSRet) %aligned)
+  call [2 x i8] @small_array([2 x i8] [i8 0, i8 0], [2 x float] [float 0.0, float 0.0])
+  call [3 x i64] @large_array([3 x i64] [i64 0, i64 0, i64 0], [2 x double] [double 0.0, double 0.0], [2 x [2 x i64]] [[2 x i64] [i64 0, i64 0], [2 x i64] [i64 0, i64 0]])
+  call %T2 @simple_struct(%T1 { i16 0 }, %T2 { i32 0, float 0.0 }, %T3 { i64 0, double 0.0 }, %T4 { i64 0, double 0.0, i8 0 })
+  ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-reservedregs.ll b/llvm/test/CodeGen/AArch64/arm64ec-reservedregs.ll
index 9f1edd93e0bd7..91ec870dd6d0c 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-reservedregs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-reservedregs.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm64ec-pc-windows-msvc < %s | FileCheck %s
+; RUN: llc -mtriple=arm64ec-pc-windows-msvc -arm64ec-generate-thunks=false < %s | FileCheck %s
 
 ; Make sure we're reserving all the registers that are supposed to be
 ; reserved. Integer regs x13, x15, x23, x24, x28. Float regs v16-v31.
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
index 957ed995ee3bf..dc16b3a1a0f27 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-varargs.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=arm64ec-pc-windows-msvc < %s | FileCheck %s
-; RUN: llc -mtriple=arm64ec-pc-windows-msvc < %s -global-isel=1 -global-isel-abort=0 | FileCheck %s
+; RUN: llc -mtriple=arm64ec-pc-windows-msvc -arm64ec-generate-thunks=false < %s | FileCheck %s
+; RUN: llc -mtriple=arm64ec-pc-windows-msvc -arm64ec-generate-thunks=false < %s -global-isel=1 -global-isel-abort=0 | FileCheck %s
 
 define void @varargs_callee(double %x, ...) nounwind {
 ; CHECK-LABEL: varargs_callee:
@@ -44,7 +44,11 @@ define void @varargs_caller() nounwind {
 ; CHECK-NEXT: stp xzr, x30, [sp, #24] // 8-byte Folded Spill
 ; CHECK-NEXT: stp x9, x8, [sp]
 ; CHECK-NEXT: str xzr, [sp, #16]
-; CHECK-NEXT: bl varargs_callee
+; CHECK-NEXT: .weak_anti_dep varargs_callee
+; CHECK-NEXT: .set varargs_callee, "#varargs_callee"@WEAKREF
+; CHECK-NEXT: .weak_anti_dep "#varargs_callee"
+; CHECK-NEXT: .set "#varargs_callee", varargs_callee@WEAKREF
+; CHECK-NEXT: bl "#varargs_callee"
 ; CHECK-NEXT: ldr x30, [sp, #32] // 8-byte Folded Reload
 ; CHECK-NEXT: add sp, sp, #48
 ; CHECK-NEXT: ret
@@ -81,7 +85,11 @@ define void @varargs_many_argscalleer() nounwind {
 ; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
 ; CHECK-NEXT: stp x9, x8, [sp]
 ; CHECK-NEXT: stp q0, q0, [sp, #16]
-; CHECK-NEXT: bl varargs_many_argscallee
+; CHECK-NEXT: .weak_anti_dep varargs_many_argscallee
+; CHECK-NEXT: .set varargs_many_argscallee, "#varargs_many_argscallee"@WEAKREF
+; CHECK-NEXT: .weak_anti_dep "#varargs_many_argscallee"
+; CHECK-NEXT: .set "#varargs_many_argscallee", varargs_many_argscallee@WEAKREF
+; CHECK-NEXT: bl "#varargs_many_argscallee"
 ; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
 ; CHECK-NEXT: add sp, sp, #64
 ; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/stack-protector-target.ll b/llvm/test/CodeGen/AArch64/stack-protector-target.ll
index 3bbb7567e10a2..b1ddd1d0d160f 100644
--- a/llvm/test/CodeGen/AArch64/stack-protector-target.ll
+++ b/llvm/test/CodeGen/AArch64/stack-protector-target.ll
@@ -39,6 +39,6 @@ declare void @_Z7CapturePi(ptr)
 ; WINDOWS-ARM64EC: adrp x8, __security_cookie
 ; WINDOWS-ARM64EC: ldr x8, [x8, :lo12:__security_cookie]
 ; WINDOWS-ARM64EC: str x8, [sp, #8]
-; WINDOWS-ARM64EC: bl _Z7CapturePi
+; WINDOWS-ARM64EC: bl "#_Z7CapturePi"
 ; WINDOWS-ARM64EC: ldr x0, [sp, #8]
-; WINDOWS-ARM64EC: bl __security_check_cookie_arm64ec
+; WINDOWS-ARM64EC: bl "#__security_check_cookie_arm64ec"
diff --git a/llvm/test/CodeGen/AArch64/win-alloca.ll b/llvm/test/CodeGen/AArch64/win-alloca.ll
index 08f3fcdf02405..94c3d88458169 100644
--- a/llvm/test/CodeGen/AArch64/win-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/win-alloca.ll
@@ -21,4 +21,4 @@ declare void @func2(ptr)
 ; CHECK-OPT: sub [[REG3:x[0-9]+]], sp, x15, lsl #4
 ; CHECK-OPT: mov sp, [[REG3]]
 ; CHECK: bl func2
-; CHECK-ARM64EC: bl __chkstk_arm64ec
+; CHECK-ARM64EC: bl "#__chkstk_arm64ec"

From 4ee195a2a791cc338feddfa8d82679eddb330bf1 Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot
Date: Tue, 23 Jan 2024 05:30:52 +0000
Subject: [PATCH 554/843] [gn build] Port a6065f0fa55a

---
 llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
index 43c01cf5c7660..a61e6058053d8 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AArch64/BUILD.gn
@@ -109,6 +109,7 @@ static_library("LLVMAArch64CodeGen") {
     "AArch64A53Fix835769.cpp",
     "AArch64A57FPLoadBalancing.cpp",
    "AArch64AdvSIMDScalarPass.cpp",
+    "AArch64Arm64ECCallLowering.cpp",
     "AArch64AsmPrinter.cpp",
     "AArch64BranchTargets.cpp",
     "AArch64CallingConvention.cpp",

From ce3e767ac5ea1a1d1a166e88c152e2125ec7662b Mon Sep 17 00:00:00 2001
From: ManuelvOK
Date: Tue, 23 Jan 2024 06:41:49 +0100
Subject: [PATCH 555/843] [Coverage] Map regions from system headers (#76950)

In 2155195131a57f2f01e7cfabb85bb027518c2dc6, the "system-headers-coverage"
option has been added but is not used in all necessary places.
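For illustration only (not part of this change): once the cl::opt is shared
as below, a function body pulled in from a system header can receive coverage
regions. The driver spelling is an assumption; internal cl::opts are
typically reached through -mllvm.

    // example.c: a hypothetical check. Assumed invocation (the flag name
    // matches the cl::opt in CoverageMappingGen.cpp):
    //   clang -fprofile-instr-generate -fcoverage-mapping \
    //     -mllvm -system-headers-coverage example.c -o example
    #include <ctype.h>

    int main(void) {
      // isdigit() commonly expands to inline code from a system header,
      // which is exactly the kind of region this patch starts mapping.
      return isdigit('7') ? 0 : 1;
    }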
Potential reviewers: @gulfemsavrun @petrhosek

Co-authored-by: Manuel Kalettka
---
 clang/lib/CodeGen/CodeGenPGO.cpp         | 4 +++-
 clang/lib/CodeGen/CoverageMappingGen.cpp | 2 +-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/clang/lib/CodeGen/CodeGenPGO.cpp b/clang/lib/CodeGen/CodeGenPGO.cpp
index d68844d476eb4..5d7c384774576 100644
--- a/clang/lib/CodeGen/CodeGenPGO.cpp
+++ b/clang/lib/CodeGen/CodeGenPGO.cpp
@@ -28,6 +28,8 @@ static llvm::cl::opt<bool>
                          llvm::cl::desc("Enable value profiling"),
                          llvm::cl::Hidden, llvm::cl::init(false));
 
+extern llvm::cl::opt<bool> SystemHeadersCoverage;
+
 using namespace clang;
 using namespace CodeGen;
 
@@ -1022,7 +1024,7 @@ bool CodeGenPGO::skipRegionMappingForDecl(const Decl *D) {
   // Don't map the functions in system headers.
   const auto &SM = CGM.getContext().getSourceManager();
   auto Loc = D->getBody()->getBeginLoc();
-  return SM.isInSystemHeader(Loc);
+  return !SystemHeadersCoverage && SM.isInSystemHeader(Loc);
 }
 
 void CodeGenPGO::emitCounterRegionMapping(const Decl *D) {
diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index 8b5e6c4ad8272..5eca00f22bb83 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -37,7 +37,7 @@ static llvm::cl::opt<bool> EmptyLineCommentCoverage(
                     "disable it on test)"),
     llvm::cl::init(true), llvm::cl::Hidden);
 
-static llvm::cl::opt<bool> SystemHeadersCoverage(
+llvm::cl::opt<bool> SystemHeadersCoverage(
     "system-headers-coverage",
    llvm::cl::desc("Enable collecting coverage from system headers"),
     llvm::cl::init(false), llvm::cl::Hidden);

From 3f740322c5b9b7d6f45a73d2fac948df3c74a3a7 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan
Date: Tue, 23 Jan 2024 00:56:40 -0500
Subject: [PATCH 556/843] [libc] remove getauxval from arm32 entrypoint list
 (#79093)

`getauxval` depends on `open/read/close`, which are not built on arm32.
Remove `getauxval` for now.
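As a rough sketch of why those dependencies matter (illustrative, not from
this patch): a caller exercises the standard <sys/auxv.h> interface, and a
libc implementation may need to open and read /proc/self/auxv to answer the
query, which is presumably where open/read/close come in.

    #include <stdio.h>
    #include <sys/auxv.h>

    int main(void) {
      // AT_PAGESZ is a typical query; the value originates from the
      // auxiliary vector the kernel passes at process startup.
      unsigned long pagesz = getauxval(AT_PAGESZ);
      printf("page size: %lu\n", pagesz);
      return 0;
    }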
---
 libc/config/linux/arm/entrypoints.txt | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/libc/config/linux/arm/entrypoints.txt b/libc/config/linux/arm/entrypoints.txt
index 301870d337ca0..c75ac2302d4ac 100644
--- a/libc/config/linux/arm/entrypoints.txt
+++ b/libc/config/linux/arm/entrypoints.txt
@@ -95,9 +95,6 @@ set(TARGET_LIBC_ENTRYPOINTS
 
     # sys/prctl.h entrypoints
     libc.src.sys.prctl.prctl
-
-    # sys/auxv.h entrypoints
-    libc.src.sys.auxv.getauxval
 )
 
 set(TARGET_LIBM_ENTRYPOINTS

From 86f05477d0bd7347e9bec8c1f9a7d89c167ad556 Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan
Date: Tue, 23 Jan 2024 00:57:07 -0500
Subject: [PATCH 557/843] [libc] add missing header deps to getauxval (#79091)

`getauxval` depends on `libc.include.sys_auxv`.

---
 libc/src/sys/auxv/linux/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/libc/src/sys/auxv/linux/CMakeLists.txt b/libc/src/sys/auxv/linux/CMakeLists.txt
index b38d63ee0329c..383c29eafda8d 100644
--- a/libc/src/sys/auxv/linux/CMakeLists.txt
+++ b/libc/src/sys/auxv/linux/CMakeLists.txt
@@ -15,4 +15,5 @@ add_entrypoint_object(
     libc.src.fcntl.open
     libc.src.unistd.read
     libc.src.unistd.close
+    libc.include.sys_auxv
 )

From fcb8342a219ada8ec641790a4c8a9f969d7d64ee Mon Sep 17 00:00:00 2001
From: Ami-zhang
Date: Tue, 23 Jan 2024 14:24:58 +0800
Subject: [PATCH 558/843] [LoongArch] Add definitions and feature 'frecipe' for
 FP approximation intrinsics/builtins (#78962)

This PR adds definitions and the 'frecipe' feature for the FP approximation
intrinsics/builtins. In addition, it adds and extends the relevant test
cases.
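A hypothetical use of the new scalar builtins, for orientation (the
__frecipe_s alias comes from larchintrin.h in this patch; the -mfrecipe
driver flag is an assumption about how the 'frecipe' target feature would be
enabled):

    // fast_math.c: approximate reciprocal / reciprocal square root.
    // These map to estimate instructions, so results are not IEEE-exact.
    #include <larchintrin.h>

    float reciprocal_estimate(float x) {
      return __frecipe_s(x);                    // alias for the builtin
    }

    double rsqrt_estimate(double x) {
      return __builtin_loongarch_frsqrte_d(x);  // approximate 1/sqrt(x)
    }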
---
 .../clang/Basic/BuiltinsLoongArchBase.def     |  5 +++
 .../clang/Basic/BuiltinsLoongArchLASX.def     |  6 +++
 .../clang/Basic/BuiltinsLoongArchLSX.def      |  6 +++
 clang/lib/Headers/larchintrin.h               | 12 +++++
 clang/lib/Headers/lasxintrin.h                | 24 ++++++++++
 clang/lib/Headers/lsxintrin.h                 | 24 ++++++++++
 .../LoongArch/builtin-dbl-approximate.c       | 45 +++++++++++++++++++
 .../LoongArch/builtin-flt-approximate.c       | 45 +++++++++++++++++++
 .../CodeGen/LoongArch/intrinsic-la64-error.c  | 21 +++++++++
 .../lasx/builtin-approximate-alias.c          | 37 +++++++++++++++
 .../LoongArch/lasx/builtin-approximate.c      | 38 ++++++++++++++++
 .../LoongArch/lsx/builtin-approximate-alias.c | 37 +++++++++++++++
 .../LoongArch/lsx/builtin-approximate.c       | 38 ++++++++++++++++
 llvm/include/llvm/IR/IntrinsicsLoongArch.td   | 13 ++++++
 llvm/lib/Target/LoongArch/LoongArch.td        |  7 +++
 .../LoongArch/LoongArchFloat32InstrInfo.td    |  6 +++
 .../LoongArch/LoongArchFloat64InstrInfo.td    |  6 +++
 .../LoongArch/LoongArchLASXInstrInfo.td       | 10 +++++
 .../Target/LoongArch/LoongArchLSXInstrInfo.td | 10 +++++
 .../lib/Target/LoongArch/LoongArchSubtarget.h |  2 +
 .../LoongArch/intrinsic-frecipe-dbl.ll        | 26 +++++++++++
 .../LoongArch/intrinsic-frecipe-flt.ll        | 26 +++++++++++
 .../LoongArch/lasx/intrinsic-frecipe.ll       | 26 +++++++++++
 .../LoongArch/lasx/intrinsic-frsqrte.ll       | 26 +++++++++++
 .../LoongArch/lsx/intrinsic-frecipe.ll        | 26 +++++++++++
 .../LoongArch/lsx/intrinsic-frsqrte.ll        | 26 +++++++++++
 26 files changed, 548 insertions(+)
 create mode 100644 clang/test/CodeGen/LoongArch/builtin-dbl-approximate.c
 create mode 100644 clang/test/CodeGen/LoongArch/builtin-flt-approximate.c
 create mode 100644 clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c
 create mode 100644 clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c
 create mode 100644 clang/test/CodeGen/LoongArch/lsx/builtin-approximate-alias.c
 create mode 100644 clang/test/CodeGen/LoongArch/lsx/builtin-approximate.c
 create mode 100644 llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll
 create mode 100644 llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll

diff --git a/clang/include/clang/Basic/BuiltinsLoongArchBase.def b/clang/include/clang/Basic/BuiltinsLoongArchBase.def
index cbb239223aae3..a5a07c167908c 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchBase.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchBase.def
@@ -51,3 +51,8 @@ TARGET_BUILTIN(__builtin_loongarch_iocsrwr_d, "vUWiUi", "nc", "64bit")
 
 TARGET_BUILTIN(__builtin_loongarch_lddir_d, "WiWiIUWi", "nc", "64bit")
 TARGET_BUILTIN(__builtin_loongarch_ldpte_d, "vWiIUWi", "nc", "64bit")
+
+TARGET_BUILTIN(__builtin_loongarch_frecipe_s, "ff", "nc", "f,frecipe")
+TARGET_BUILTIN(__builtin_loongarch_frecipe_d, "dd", "nc", "d,frecipe")
+TARGET_BUILTIN(__builtin_loongarch_frsqrte_s, "ff", "nc", "f,frecipe")
+TARGET_BUILTIN(__builtin_loongarch_frsqrte_d, "dd", "nc", "d,frecipe")
diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
index 3de200f665b68..4cf51cc000f6f 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
@@ -657,9 +657,15 @@ TARGET_BUILTIN(__builtin_lasx_xvfsqrt_d, "V4dV4d", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfrecip_s, "V8fV8f", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfrecip_d, "V4dV4d", "nc", "lasx")
 
+TARGET_BUILTIN(__builtin_lasx_xvfrecipe_s, "V8fV8f", "nc", "lasx,frecipe")
+TARGET_BUILTIN(__builtin_lasx_xvfrecipe_d, "V4dV4d", "nc", "lasx,frecipe")
+
 TARGET_BUILTIN(__builtin_lasx_xvfrsqrt_s, "V8fV8f", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfrsqrt_d, "V4dV4d", "nc", "lasx")
 
+TARGET_BUILTIN(__builtin_lasx_xvfrsqrte_s, "V8fV8f", "nc", "lasx,frecipe")
+TARGET_BUILTIN(__builtin_lasx_xvfrsqrte_d, "V4dV4d", "nc", "lasx,frecipe")
+
 TARGET_BUILTIN(__builtin_lasx_xvfcvtl_s_h, "V8fV16s", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfcvth_s_h, "V8fV16s", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xvfcvtl_d_s, "V4dV8f", "nc", "lasx")
diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLSX.def b/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
index 8e6aec886c50c..c90f4dc5458fa 100644
--- a/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
+++ b/clang/include/clang/Basic/BuiltinsLoongArchLSX.def
@@ -641,9 +641,15 @@ TARGET_BUILTIN(__builtin_lsx_vfsqrt_d, "V2dV2d", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfrecip_s, "V4fV4f", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfrecip_d, "V2dV2d", "nc", "lsx")
 
+TARGET_BUILTIN(__builtin_lsx_vfrecipe_s, "V4fV4f", "nc", "lsx,frecipe")
+TARGET_BUILTIN(__builtin_lsx_vfrecipe_d, "V2dV2d", "nc", "lsx,frecipe")
+
 TARGET_BUILTIN(__builtin_lsx_vfrsqrt_s, "V4fV4f", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfrsqrt_d, "V2dV2d", "nc", "lsx")
 
+TARGET_BUILTIN(__builtin_lsx_vfrsqrte_s, "V4fV4f", "nc", "lsx,frecipe")
+TARGET_BUILTIN(__builtin_lsx_vfrsqrte_d, "V2dV2d", "nc", "lsx,frecipe")
+
 TARGET_BUILTIN(__builtin_lsx_vfcvtl_s_h, "V4fV8s", "nc", "lsx")
 TARGET_BUILTIN(__builtin_lsx_vfcvtl_d_s, "V2dV4f", "nc", "lsx")
diff --git a/clang/lib/Headers/larchintrin.h b/clang/lib/Headers/larchintrin.h
index c5c533ee0b8c1..a613e5ca0e5ec 100644
--- a/clang/lib/Headers/larchintrin.h
+++ b/clang/lib/Headers/larchintrin.h
@@ -228,6 +228,18 @@ extern __inline void
     ((void)__builtin_loongarch_ldpte_d((long int)(_1), (_2)))
 #endif
 
+#define __frecipe_s(/*float*/ _1) \
+  (float)__builtin_loongarch_frecipe_s((float)_1)
+
+#define __frecipe_d(/*double*/ _1) \
+  (double)__builtin_loongarch_frecipe_d((double)_1)
+
+#define __frsqrte_s(/*float*/ _1) \
+  (float)__builtin_loongarch_frsqrte_s((float)_1)
+
+#define __frsqrte_d(/*double*/ _1) \
+  (double)__builtin_loongarch_frsqrte_d((double)_1)
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/clang/lib/Headers/lasxintrin.h b/clang/lib/Headers/lasxintrin.h
index 6b4d5012a24b5..dafc2a2f3e6a7 100644
--- a/clang/lib/Headers/lasxintrin.h
+++ b/clang/lib/Headers/lasxintrin.h
@@ -1726,6 +1726,18 @@ extern __inline
   return (__m256d)__builtin_lasx_xvfrecip_d((v4f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+    __lasx_xvfrecipe_s(__m256 _1) {
+  return (__m256)__builtin_lasx_xvfrecipe_s((v8f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+    __lasx_xvfrecipe_d(__m256d _1) {
+  return (__m256d)__builtin_lasx_xvfrecipe_d((v4f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
     __lasx_xvfrint_s(__m256 _1) {
@@ -1750,6 +1762,18 @@ extern __inline
   return (__m256d)__builtin_lasx_xvfrsqrt_d((v4f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+    __lasx_xvfrsqrte_s(__m256 _1) {
+  return (__m256)__builtin_lasx_xvfrsqrte_s((v8f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+    __lasx_xvfrsqrte_d(__m256d _1) {
+  return (__m256d)__builtin_lasx_xvfrsqrte_d((v4f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
     __lasx_xvflogb_s(__m256 _1) {
diff --git a/clang/lib/Headers/lsxintrin.h b/clang/lib/Headers/lsxintrin.h
index a29bc7757ab56..f347955ce6fb5 100644
--- a/clang/lib/Headers/lsxintrin.h
+++ b/clang/lib/Headers/lsxintrin.h
@@ -1776,6 +1776,18 @@ extern __inline
   return (__m128d)__builtin_lsx_vfrecip_d((v2f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
+    __lsx_vfrecipe_s(__m128 _1) {
+  return (__m128)__builtin_lsx_vfrecipe_s((v4f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
+    __lsx_vfrecipe_d(__m128d _1) {
+  return (__m128d)__builtin_lsx_vfrecipe_d((v2f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
    __lsx_vfrint_s(__m128 _1) {
@@ -1800,6 +1812,18 @@ extern __inline
   return (__m128d)__builtin_lsx_vfrsqrt_d((v2f64)_1);
 }
 
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
+    __lsx_vfrsqrte_s(__m128 _1) {
+  return (__m128)__builtin_lsx_vfrsqrte_s((v4f32)_1);
+}
+
+extern __inline
+    __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
+    __lsx_vfrsqrte_d(__m128d _1) {
+  return (__m128d)__builtin_lsx_vfrsqrte_d((v2f64)_1);
+}
+
 extern __inline
     __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
     __lsx_vflogb_s(__m128 _1) {
diff --git a/clang/test/CodeGen/LoongArch/builtin-dbl-approximate.c b/clang/test/CodeGen/LoongArch/builtin-dbl-approximate.c
new file mode 100644
index 0000000000000..e5fe684346c00
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/builtin-dbl-approximate.c
@@ -0,0 +1,45 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple loongarch32 -target-feature +d -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +d -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <larchintrin.h>
+
+// CHECK-LABEL: @frecipe_d
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.loongarch.frecipe.d(double [[A:%.*]])
+// CHECK-NEXT: ret double [[TMP0]]
+//
+double frecipe_d (double _1)
+{
+  return __builtin_loongarch_frecipe_d (_1);
+}
+
+// CHECK-LABEL: @frsqrte_d
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.loongarch.frsqrte.d(double [[A:%.*]])
+// CHECK-NEXT: ret double [[TMP0]]
+//
+double frsqrte_d (double _1)
+{
+  return __builtin_loongarch_frsqrte_d (_1);
+}
+
+// CHECK-LABEL: @frecipe_d_alia
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.loongarch.frecipe.d(double [[A:%.*]])
+// CHECK-NEXT: ret double [[TMP0]]
+//
+double frecipe_d_alia (double _1)
+{
+  return __frecipe_d (_1);
+}
+
+// CHECK-LABEL: @frsqrte_d_alia
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call double @llvm.loongarch.frsqrte.d(double [[A:%.*]])
+// CHECK-NEXT: ret double [[TMP0]]
+//
+double frsqrte_d_alia (double _1)
+{
+  return __frsqrte_d (_1);
+}
diff --git a/clang/test/CodeGen/LoongArch/builtin-flt-approximate.c b/clang/test/CodeGen/LoongArch/builtin-flt-approximate.c
new file mode 100644
index 0000000000000..47bb47084364b
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/builtin-flt-approximate.c
@@ -0,0 +1,45 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4
+// RUN: %clang_cc1 -triple loongarch32 -target-feature +f -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +f -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <larchintrin.h>
+
+// CHECK-LABEL: @frecipe_s
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call float @llvm.loongarch.frecipe.s(float [[A:%.*]])
+// CHECK-NEXT: ret float [[TMP0]]
+//
+float frecipe_s (float _1)
+{
+  return __builtin_loongarch_frecipe_s (_1);
+}
+
+// CHECK-LABEL: @frsqrte_s
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call float @llvm.loongarch.frsqrte.s(float [[A:%.*]])
+// CHECK-NEXT: ret float [[TMP0]]
+//
+float frsqrte_s (float _1)
+{
+  return __builtin_loongarch_frsqrte_s (_1);
+}
+
+// CHECK-LABEL: @frecipe_s_alia
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call float @llvm.loongarch.frecipe.s(float [[A:%.*]])
+// CHECK-NEXT: ret float [[TMP0]]
+//
+float frecipe_s_alia (float _1)
+{
+  return __frecipe_s (_1);
+}
+
+// CHECK-LABEL: @frsqrte_s_alia
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = tail call float @llvm.loongarch.frsqrte.s(float [[A:%.*]])
+// CHECK-NEXT: ret float [[TMP0]]
+//
+float frsqrte_s_alia (float _1)
+{
+  return __frsqrte_s (_1);
+}
diff --git a/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c b/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
index efb3b94175cfa..a3242dfd41e9b 100644
--- a/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
+++ b/clang/test/CodeGen/LoongArch/intrinsic-la64-error.c
@@ -1,7 +1,28 @@
 // RUN: %clang_cc1 -triple loongarch64 -emit-llvm -S -verify %s -o /dev/null
+// RUN: not %clang_cc1 -triple loongarch64 -DFEATURE_CHECK -emit-llvm %s -o /dev/null 2>&1 \
+// RUN:   | FileCheck %s
 
 #include <larchintrin.h>
 
+#ifdef FEATURE_CHECK
+void test_feature(unsigned long *v_ul, int *v_i, float a, double b) {
+// CHECK: error: '__builtin_loongarch_cacop_w' needs target feature 32bit
+  __builtin_loongarch_cacop_w(1, v_ul[0], 1024);
+// CHECK: error: '__builtin_loongarch_movfcsr2gr' needs target feature f
+  v_i[0] = __builtin_loongarch_movfcsr2gr(1);
+// CHECK: error: '__builtin_loongarch_movgr2fcsr' needs target feature f
+  __builtin_loongarch_movgr2fcsr(1, v_i[1]);
+// CHECK: error: '__builtin_loongarch_frecipe_s' needs target feature f,frecipe
+  float f1 = __builtin_loongarch_frecipe_s(a);
+// CHECK: error: '__builtin_loongarch_frsqrte_s' needs target feature f,frecipe
+  float f2 = __builtin_loongarch_frsqrte_s(a);
+// CHECK: error: '__builtin_loongarch_frecipe_d' needs target feature d,frecipe
+  double d1 = __builtin_loongarch_frecipe_d(b);
+// CHECK: error: '__builtin_loongarch_frsqrte_d' needs target feature d,frecipe
+  double d2 = __builtin_loongarch_frsqrte_d(b);
+}
+#endif
+
 void csrrd_d(int a) {
   __builtin_loongarch_csrrd_d(16384); // expected-error {{argument value 16384 is outside the valid range [0, 16383]}}
   __builtin_loongarch_csrrd_d(-1); // expected-error {{argument value 4294967295 is outside the valid range [0, 16383]}}
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c
new file mode 100644
index 0000000000000..b79f939403993
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate-alias.c
@@ -0,0 +1,37 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <lasxintrin.h>
+
+// CHECK-LABEL: @xvfrecipe_s(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+//
+v8f32 xvfrecipe_s(v8f32 _1) { return __lasx_xvfrecipe_s(_1); }
+// CHECK-LABEL: @xvfrecipe_d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+//
+v4f64 xvfrecipe_d(v4f64 _1) { return __lasx_xvfrecipe_d(_1); }
+// CHECK-LABEL: @xvfrsqrte_s(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+//
+v8f32 xvfrsqrte_s(v8f32 _1) { return __lasx_xvfrsqrte_s(_1); }
+// CHECK-LABEL: @xvfrsqrte_d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+//
+v4f64 xvfrsqrte_d(v4f64 _1) { return __lasx_xvfrsqrte_d(_1); }
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c
new file mode 100644
index 0000000000000..63e9ba639ea2c
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin-approximate.c
@@ -0,0 +1,38 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+typedef float v8f32 __attribute__((vector_size(32), aligned(32)));
+typedef double v4f64 __attribute__((vector_size(32), aligned(32)));
+
+// CHECK-LABEL: @xvfrecipe_s
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+//
+v8f32 xvfrecipe_s(v8f32 _1) { return __builtin_lasx_xvfrecipe_s(_1); }
+// CHECK-LABEL: @xvfrecipe_d
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+//
+v4f64 xvfrecipe_d(v4f64 _1) { return __builtin_lasx_xvfrecipe_d(_1); }
+// CHECK-LABEL: @xvfrsqrte_s
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> [[_1]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+//
+v8f32 xvfrsqrte_s(v8f32 _1) { return __builtin_lasx_xvfrsqrte_s(_1); }
+// CHECK-LABEL: @xvfrsqrte_d
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2:![0-9]+]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> [[_1]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+//
+v4f64 xvfrsqrte_d(v4f64 _1) { return __builtin_lasx_xvfrsqrte_d(_1); }
diff --git a/clang/test/CodeGen/LoongArch/lsx/builtin-approximate-alias.c b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate-alias.c
new file mode 100644
index 0000000000000..f26f032c878e6
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate-alias.c
@@ -0,0 +1,37 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lsx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+#include <lsxintrin.h>
+
+// CHECK-LABEL: @vfrecipe_s(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+//
+v4f32 vfrecipe_s(v4f32 _1) { return __lsx_vfrecipe_s(_1); }
+// CHECK-LABEL: @vfrecipe_d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+//
+v2f64 vfrecipe_d(v2f64 _1) { return __lsx_vfrecipe_d(_1); }
+// CHECK-LABEL: @vfrsqrte_s(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+//
+v4f32 vfrsqrte_s(v4f32 _1) { return __lsx_vfrsqrte_s(_1); }
+// CHECK-LABEL: @vfrsqrte_d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+//
+v2f64 vfrsqrte_d(v2f64 _1) { return __lsx_vfrsqrte_d(_1); }
diff --git a/clang/test/CodeGen/LoongArch/lsx/builtin-approximate.c b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate.c
new file mode 100644
index 0000000000000..39fa1663db349
--- /dev/null
+++ b/clang/test/CodeGen/LoongArch/lsx/builtin-approximate.c
@@ -0,0 +1,38 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple loongarch64 -target-feature +lsx -target-feature +frecipe -O2 -emit-llvm %s -o - | FileCheck %s
+
+typedef float v4f32 __attribute__ ((vector_size(16), aligned(16)));
+typedef double v2f64 __attribute__ ((vector_size(16), aligned(16)));
+
+// CHECK-LABEL: @vfrecipe_s
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+//
+v4f32 vfrecipe_s (v4f32 _1) { return __builtin_lsx_vfrecipe_s (_1); }
+// CHECK-LABEL: @vfrecipe_d
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+//
+v2f64 vfrecipe_d (v2f64 _1) { return __builtin_lsx_vfrecipe_d (_1); }
+// CHECK-LABEL: @vfrsqrte_s
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+//
+v4f32 vfrsqrte_s (v4f32 _1) { return __builtin_lsx_vfrsqrte_s (_1); }
+// CHECK-LABEL: @vfrsqrte_d
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+//
+v2f64 vfrsqrte_d (v2f64 _1) { return __builtin_lsx_vfrsqrte_d (_1); }
diff --git a/llvm/include/llvm/IR/IntrinsicsLoongArch.td b/llvm/include/llvm/IR/IntrinsicsLoongArch.td
index 685deaec7709b..9002076e7aece 100644
--- a/llvm/include/llvm/IR/IntrinsicsLoongArch.td
+++ b/llvm/include/llvm/IR/IntrinsicsLoongArch.td
@@ -122,6 +122,15 @@ def int_loongarch_lddir_d : BaseInt<[llvm_i64_ty], [llvm_i64_ty, llvm_i64_ty],
                                     [ImmArg<ArgIndex<1>>]>;
 def int_loongarch_ldpte_d : BaseInt<[], [llvm_i64_ty, llvm_i64_ty],
                                     [ImmArg<ArgIndex<1>>]>;
+
+def int_loongarch_frecipe_s : BaseInt<[llvm_float_ty], [llvm_float_ty],
+                                      [IntrNoMem]>;
+def int_loongarch_frecipe_d : BaseInt<[llvm_double_ty], [llvm_double_ty],
+                                      [IntrNoMem]>;
+def int_loongarch_frsqrte_s : BaseInt<[llvm_float_ty], [llvm_float_ty],
+                                      [IntrNoMem]>;
+def int_loongarch_frsqrte_d : BaseInt<[llvm_double_ty], [llvm_double_ty],
+                                      [IntrNoMem]>;
 } // TargetPrefix = "loongarch"
 
 /// Vector intrinsic
@@ -527,10 +536,12 @@ foreach inst = ["vfmadd_d", "vfmsub_d", "vfnmadd_d", "vfnmsub_d"] in
                                        [IntrNoMem]>;
 
 foreach inst = ["vflogb_s", "vfsqrt_s", "vfrecip_s", "vfrsqrt_s", "vfrint_s",
+                "vfrecipe_s", "vfrsqrte_s",
                 "vfrintrne_s", "vfrintrz_s", "vfrintrp_s", "vfrintrm_s"] in
   def int_loongarch_lsx_#inst : VecInt<[llvm_v4f32_ty], [llvm_v4f32_ty],
                                        [IntrNoMem]>;
 foreach inst = ["vflogb_d", "vfsqrt_d", "vfrecip_d", "vfrsqrt_d", "vfrint_d",
+                "vfrecipe_d", "vfrsqrte_d",
                 "vfrintrne_d", "vfrintrz_d", "vfrintrp_d", "vfrintrm_d"] in
   def int_loongarch_lsx_#inst : VecInt<[llvm_v2f64_ty], [llvm_v2f64_ty],
                                        [IntrNoMem]>;
@@ -1044,10 +1055,12 @@ foreach inst = ["xvfmadd_d", "xvfmsub_d", "xvfnmadd_d", "xvfnmsub_d"] in
                                         [IntrNoMem]>;
 
 foreach inst = ["xvflogb_s", "xvfsqrt_s", "xvfrecip_s", "xvfrsqrt_s", "xvfrint_s",
+                "xvfrecipe_s", "xvfrsqrte_s",
                 "xvfrintrne_s", "xvfrintrz_s", "xvfrintrp_s", "xvfrintrm_s"] in
   def int_loongarch_lasx_#inst : VecInt<[llvm_v8f32_ty], [llvm_v8f32_ty],
                                         [IntrNoMem]>;
 foreach inst = ["xvflogb_d", "xvfsqrt_d", "xvfrecip_d", "xvfrsqrt_d", "xvfrint_d",
+                "xvfrecipe_d", "xvfrsqrte_d",
                 "xvfrintrne_d", "xvfrintrz_d", "xvfrintrp_d", "xvfrintrm_d"] in
   def int_loongarch_lasx_#inst : VecInt<[llvm_v4f64_ty], [llvm_v4f64_ty],
                                         [IntrNoMem]>;
diff --git a/llvm/lib/Target/LoongArch/LoongArch.td b/llvm/lib/Target/LoongArch/LoongArch.td
index 4cffaf573b918..c2a669931d78f 100644
--- a/llvm/lib/Target/LoongArch/LoongArch.td
+++ b/llvm/lib/Target/LoongArch/LoongArch.td
@@ -111,6 +111,13 @@ def FeatureAutoVec
     : SubtargetFeature<"auto-vec", "HasExpAutoVec", "true",
                        "Experimental auto vectorization">;
 
+// Floating point approximation operation
+def FeatureFrecipe
+    : SubtargetFeature<"frecipe", "HasFrecipe", "true",
+                       "Support frecipe.{s/d} and frsqrte.{s/d} instructions.">;
+def HasFrecipe : Predicate<"Subtarget->hasFrecipe()">;
+
+
 //===----------------------------------------------------------------------===//
 // Registers, instruction descriptions ...
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
index 7750d59470522..d6a83c0c8cd8f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td
@@ -282,6 +282,12 @@ def : Pat<(loongarch_ftint FPR32:$src), (FTINTRZ_W_S FPR32:$src)>;
 // FP reciprocal operation
 def : Pat<(fdiv fpimm1, FPR32:$src), (FRECIP_S $src)>;
 
+let Predicates = [HasFrecipe] in {
+// FP approximate reciprocal operation
+def : Pat<(int_loongarch_frecipe_s FPR32:$src), (FRECIPE_S FPR32:$src)>;
+def : Pat<(int_loongarch_frsqrte_s FPR32:$src), (FRSQRTE_S FPR32:$src)>;
+}
+
 // fmadd.s: fj * fk + fa
 def : Pat<(fma FPR32:$fj, FPR32:$fk, FPR32:$fa), (FMADD_S $fj, $fk, $fa)>;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
index 7eb2e5562783a..30cce8439640f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td
@@ -249,6 +249,12 @@ def : Pat<(f64 (fpextend FPR32:$src)), (FCVT_D_S FPR32:$src)>;
 // FP reciprocal operation
 def : Pat<(fdiv fpimm1, FPR64:$src), (FRECIP_D $src)>;
 
+let Predicates = [HasFrecipe] in {
+// FP approximate reciprocal operation
+def : Pat<(int_loongarch_frecipe_d FPR64:$src), (FRECIPE_D FPR64:$src)>;
+def : Pat<(int_loongarch_frsqrte_d FPR64:$src), (FRSQRTE_D FPR64:$src)>;
+}
+
 // fmadd.d: fj * fk + fa
 def : Pat<(fma FPR64:$fj, FPR64:$fk, FPR64:$fa), (FMADD_D $fj, $fk, $fa)>;
 
diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
index 5459761bf2e20..1d73133ad1120 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td
@@ -1906,6 +1906,16 @@ foreach Inst = ["XVFLOGB_D", "XVFCLASS_D", "XVFSQRT_D", "XVFRECIP_D", "XVFRSQRT_
   def : Pat<(deriveLASXIntrinsic<Inst>.ret (v4f64 LASX256:$xj)),
             (!cast<LAInst>(Inst) LASX256:$xj)>;
 
+// 256-Bit vector FP approximate reciprocal operation
+let Predicates = [HasFrecipe] in {
+foreach Inst = ["XVFRECIPE_S", "XVFRSQRTE_S"] in
+  def : Pat<(deriveLASXIntrinsic<Inst>.ret (v8f32 LASX256:$xj)),
+            (!cast<LAInst>(Inst) LASX256:$xj)>;
+foreach Inst = ["XVFRECIPE_D", "XVFRSQRTE_D"] in
+  def : Pat<(deriveLASXIntrinsic<Inst>.ret (v4f64 LASX256:$xj)),
+            (!cast<LAInst>(Inst) LASX256:$xj)>;
+}
+
 def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm),
           (XVPICKVE_W v8f32:$xj, (to_valid_timm timm:$imm))>;
 def : Pat<(int_loongarch_lasx_xvpickve_d_f v4f64:$xj, timm:$imm),
diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
index abcf17c8eef33..f9d0647161f5b 100644
--- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
+++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td
@@ -2031,6 +2031,16 @@ foreach Inst = ["VFLOGB_D", "VFCLASS_D", "VFSQRT_D", "VFRECIP_D", "VFRSQRT_D",
   def : Pat<(deriveLSXIntrinsic<Inst>.ret (v2f64 LSX128:$vj)),
             (!cast<LAInst>(Inst) LSX128:$vj)>;
 
+// 128-Bit vector FP approximate reciprocal operation
+let Predicates = [HasFrecipe] in {
+foreach Inst = ["VFRECIPE_S", "VFRSQRTE_S"] in
+  def : Pat<(deriveLSXIntrinsic<Inst>.ret (v4f32 LSX128:$vj)),
+            (!cast<LAInst>(Inst) LSX128:$vj)>;
+foreach Inst = ["VFRECIPE_D", "VFRSQRTE_D"] in
+  def : Pat<(deriveLSXIntrinsic<Inst>.ret (v2f64 LSX128:$vj)),
+            (!cast<LAInst>(Inst) LSX128:$vj)>;
+}
+
 // load
 def : Pat<(int_loongarch_lsx_vld GPR:$rj, timm:$imm),
           (VLD GPR:$rj, (to_valid_timm timm:$imm))>;
diff --git a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
index 174e4cba83263..11c0b39e176e6 100644
--- a/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
+++ b/llvm/lib/Target/LoongArch/LoongArchSubtarget.h
@@ -45,6 +45,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
   bool HasUAL = false;
   bool HasLinkerRelax = false;
   bool HasExpAutoVec = false;
+  bool HasFrecipe = false;
   unsigned GRLen = 32;
   MVT GRLenVT = MVT::i32;
   LoongArchABI::ABI TargetABI = LoongArchABI::ABI_Unknown;
@@ -104,6 +105,7 @@ class LoongArchSubtarget : public LoongArchGenSubtargetInfo {
   bool hasUAL() const { return HasUAL; }
   bool hasLinkerRelax() const { return HasLinkerRelax; }
   bool hasExpAutoVec() const { return HasExpAutoVec; }
+  bool hasFrecipe() const { return HasFrecipe; }
   MVT getGRLenVT() const { return GRLenVT; }
   unsigned getGRLen() const { return GRLen; }
   LoongArchABI::ABI getTargetABI() const { return TargetABI; }
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll
new file mode 100644
index 0000000000000..9f572500caa0e
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-dbl.ll
@@ -0,0 +1,26 @@
+; RUN: llc --mtriple=loongarch32 --mattr=+d,+frecipe < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s
+
+declare double @llvm.loongarch.frecipe.d(double)
+
+define double @frecipe_d(double %a) {
+; CHECK-LABEL: frecipe_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: frecipe.d $fa0, $fa0
+; CHECK-NEXT: ret
+entry:
+  %res = call double @llvm.loongarch.frecipe.d(double %a)
+  ret double %res
+}
+
+declare double @llvm.loongarch.frsqrte.d(double)
+
+define double @frsqrte_d(double %a) {
+; CHECK-LABEL: frsqrte_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: frsqrte.d $fa0, $fa0
+; CHECK-NEXT: ret
+entry:
+  %res = call double @llvm.loongarch.frsqrte.d(double %a)
+  ret double %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll
new file mode 100644
index 0000000000000..0b2029f2e44a0
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/intrinsic-frecipe-flt.ll
@@ -0,0 +1,26 @@
+; RUN: llc --mtriple=loongarch32 --mattr=+f,+frecipe < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+f,+frecipe < %s | FileCheck %s
+
+declare float @llvm.loongarch.frecipe.s(float)
+
+define float @frecipe_s(float %a) {
+; CHECK-LABEL: frecipe_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: frecipe.s $fa0, $fa0
+; CHECK-NEXT: ret
+entry:
+  %res = call float @llvm.loongarch.frecipe.s(float %a)
+  ret float %res
+}
+
+declare float @llvm.loongarch.frsqrte.s(float)
+
+define float @frsqrte_s(float %a) {
+; CHECK-LABEL: frsqrte_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: frsqrte.s $fa0, $fa0
+; CHECK-NEXT: ret
+entry:
+  %res = call float @llvm.loongarch.frsqrte.s(float %a)
+  ret float %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll
new file mode 100644
index 0000000000000..215436823af83
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frecipe.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s
+
+declare <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float>)
+
+define <8 x float> @lasx_xvfrecipe_s(<8 x float> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrecipe_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvfrecipe.s $xr0, $xr0
+; CHECK-NEXT: ret
+entry:
+  %res = call <8 x float> @llvm.loongarch.lasx.xvfrecipe.s(<8 x float> %va)
+  ret <8 x float> %res
+}
+
+declare <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double>)
+
+define <4 x double> @lasx_xvfrecipe_d(<4 x double> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrecipe_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvfrecipe.d $xr0, $xr0
+; CHECK-NEXT: ret
+entry:
+  %res = call <4 x double> @llvm.loongarch.lasx.xvfrecipe.d(<4 x double> %va)
+  ret <4 x double> %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll
new file mode 100644
index 0000000000000..ad36c3aa5c29d
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/intrinsic-frsqrte.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s
+
+declare <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float>)
+
+define <8 x float> @lasx_xvfrsqrte_s(<8 x float> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrsqrte_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvfrsqrte.s $xr0, $xr0
+; CHECK-NEXT: ret
+entry:
+  %res = call <8 x float> @llvm.loongarch.lasx.xvfrsqrte.s(<8 x float> %va)
+  ret <8 x float> %res
+}
+
+declare <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double>)
+
+define <4 x double> @lasx_xvfrsqrte_d(<4 x double> %va) nounwind {
+; CHECK-LABEL: lasx_xvfrsqrte_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvfrsqrte.d $xr0, $xr0
+; CHECK-NEXT: ret
+entry:
+  %res = call <4 x double> @llvm.loongarch.lasx.xvfrsqrte.d(<4 x double> %va)
+  ret <4 x double> %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll
new file mode 100644
index 0000000000000..1b7a97d9f9720
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frecipe.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s
+
+declare <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float>)
+
+define <4 x float> @lsx_vfrecipe_s(<4 x float> %va) nounwind {
+; CHECK-LABEL: lsx_vfrecipe_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfrecipe.s $vr0, $vr0
+; CHECK-NEXT: ret
+entry:
+  %res = call <4 x float> @llvm.loongarch.lsx.vfrecipe.s(<4 x float> %va)
+  ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double>)
+
+define <2 x double> @lsx_vfrecipe_d(<2 x double> %va) nounwind {
+; CHECK-LABEL: lsx_vfrecipe_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfrecipe.d $vr0, $vr0
+; CHECK-NEXT: ret
+entry:
+  %res = call <2 x double> @llvm.loongarch.lsx.vfrecipe.d(<2 x double> %va)
+  ret <2 x double> %res
+}
diff --git a/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll
new file mode 100644
index 0000000000000..3cd6c78e87d78
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/intrinsic-frsqrte.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s
+
+declare <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float>)
+
+define <4 x float> @lsx_vfrsqrte_s(<4 x float> %va) nounwind {
+; CHECK-LABEL: lsx_vfrsqrte_s:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfrsqrte.s $vr0, $vr0
+; CHECK-NEXT: ret
+entry:
+  %res = call <4 x float> @llvm.loongarch.lsx.vfrsqrte.s(<4 x float> %va)
+  ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double>)
+
+define <2 x double> @lsx_vfrsqrte_d(<2 x double> %va) nounwind {
+; CHECK-LABEL: lsx_vfrsqrte_d:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vfrsqrte.d $vr0, $vr0
+; CHECK-NEXT: ret
+entry:
+  %res = call <2 x double> @llvm.loongarch.lsx.vfrsqrte.d(<2 x double> %va)
+  ret <2 x double> %res
+}

From 39a1b280a170ff8e0e605693f8386b8dd161cf94 Mon Sep 17 00:00:00 2001
From: Fangrui Song
Date: Mon, 22 Jan 2024 22:51:44 -0800
Subject: [PATCH 559/843] [ELF] Improve ThinLTO tests

---
 lld/test/ELF/lto/cache.ll   | 118 +++++++++++++++++++-----------------
 lld/test/ELF/lto/thinlto.ll |  26 ++++----
 2 files changed, 76 insertions(+), 68 deletions(-)

diff --git a/lld/test/ELF/lto/cache.ll b/lld/test/ELF/lto/cache.ll
index 603bfce3bc12a..95c68ab2315e6 100644
--- a/lld/test/ELF/lto/cache.ll
+++ b/lld/test/ELF/lto/cache.ll
@@ -2,98 +2,106 @@
 ; NetBSD: noatime mounts currently inhibit 'touch' from updating atime
 ; UNSUPPORTED: system-netbsd
 
-; RUN: opt -module-hash -module-summary %s -o %t.o
-; RUN: opt -module-hash -module-summary %p/Inputs/cache.ll -o %t2.o
+; RUN: rm -rf %t && mkdir %t && cd %t
+; RUN: opt -module-hash -module-summary %s -o a.bc
+; RUN: opt -module-hash -module-summary %p/Inputs/cache.ll -o b.bc
 
-; RUN: rm -Rf %t.cache && mkdir %t.cache
+; RUN: mkdir cache
 ; Create two files that would be removed by cache pruning due to age.
 ; We should only remove files matching the pattern "llvmcache-*".
-; RUN: touch -t 197001011200 %t.cache/llvmcache-foo %t.cache/foo
-; RUN: ld.lld --thinlto-cache-dir=%t.cache --thinlto-cache-policy prune_after=1h:prune_interval=0s -o %t3 %t2.o %t.o
+; RUN: touch -t 197001011200 cache/llvmcache-foo cache/foo
+; RUN: ld.lld --thinlto-cache-dir=cache --thinlto-cache-policy prune_after=1h:prune_interval=0s -o out b.bc a.bc
 
 ; Two cached objects, plus a timestamp file and "foo", minus the file we removed.
-; RUN: ls %t.cache | count 4
+; RUN: ls cache | count 4
 
 ; Create a file of size 64KB.
-; RUN: %python -c "print(' ' * 65536)" > %t.cache/llvmcache-foo
+; RUN: %python -c "print(' ' * 65536)" > cache/llvmcache-foo
 
 ; This should leave the file in place.
-; RUN: ld.lld --thinlto-cache-dir=%t.cache --thinlto-cache-policy cache_size_bytes=128k:prune_interval=0s -o %t3 %t2.o %t.o
-; RUN: ls %t.cache | count 5
+; RUN: ld.lld --thinlto-cache-dir=cache --thinlto-cache-policy cache_size_bytes=128k:prune_interval=0s -o out b.bc a.bc
+; RUN: ls cache | count 5
 
 ; Increase the age of llvmcache-foo, which will give it the oldest time stamp
 ; so that it is processed and removed first.
-; RUN: %python -c 'import os,sys,time; t=time.time()-120; os.utime(sys.argv[1],(t,t))' %t.cache/llvmcache-foo
+; RUN: %python -c 'import os,sys,time; t=time.time()-120; os.utime(sys.argv[1],(t,t))' cache/llvmcache-foo
 
 ; This should remove it.
-; RUN: ld.lld --thinlto-cache-dir=%t.cache --thinlto-cache-policy cache_size_bytes=32k:prune_interval=0s -o %t3 %t2.o %t.o
-; RUN: ls %t.cache | count 4
+; RUN: ld.lld --thinlto-cache-dir=cache --thinlto-cache-policy cache_size_bytes=32k:prune_interval=0s -o out b.bc a.bc
+; RUN: ls cache | count 4
 
 ; Setting max number of files to 0 should disable the limit, not delete everything.
-; RUN: ld.lld --thinlto-cache-dir=%t.cache --thinlto-cache-policy prune_after=0s:cache_size=0%:cache_size_files=0:prune_interval=0s -o %t3 %t2.o %t.o
-; RUN: ls %t.cache | count 4
+; RUN: ld.lld --thinlto-cache-dir=cache --thinlto-cache-policy prune_after=0s:cache_size=0%:cache_size_files=0:prune_interval=0s -o out b.bc a.bc
+; RUN: ls cache | count 4
 
 ; Delete everything except for the timestamp, "foo" and one cache file.
-; RUN: ld.lld --thinlto-cache-dir=%t.cache --thinlto-cache-policy prune_after=0s:cache_size=0%:cache_size_files=1:prune_interval=0s -o %t3 %t2.o %t.o
-; RUN: ls %t.cache | count 3
+; RUN: ld.lld --thinlto-cache-dir=cache --thinlto-cache-policy prune_after=0s:cache_size=0%:cache_size_files=1:prune_interval=0s -o out b.bc a.bc
+; RUN: ls cache | count 3
 
 ; Check that we remove the least recently used file first.
-; RUN: rm -fr %t.cache && mkdir %t.cache
-; RUN: echo xyz > %t.cache/llvmcache-old
-; RUN: touch -t 198002011200 %t.cache/llvmcache-old
-; RUN: echo xyz > %t.cache/llvmcache-newer
-; RUN: touch -t 198002021200 %t.cache/llvmcache-newer
-; RUN: ld.lld --thinlto-cache-dir=%t.cache --thinlto-cache-policy prune_after=0s:cache_size=0%:cache_size_files=3:prune_interval=0s -o %t3 %t2.o %t.o
-; RUN: ls %t.cache | FileCheck %s
+; RUN: rm -fr cache && mkdir cache
+; RUN: echo xyz > cache/llvmcache-old
+; RUN: touch -t 198002011200 cache/llvmcache-old
+; RUN: echo xyz > cache/llvmcache-newer
+; RUN: touch -t 198002021200 cache/llvmcache-newer
+; RUN: ld.lld --thinlto-cache-dir=cache --thinlto-cache-policy prune_after=0s:cache_size=0%:cache_size_files=3:prune_interval=0s -o out b.bc a.bc
+; RUN: ls cache | FileCheck %s
 
 ; CHECK-NOT: llvmcache-old
 ; CHECK: llvmcache-newer
 ; CHECK-NOT: llvmcache-old
 
+; RUN: rm -fr cache && mkdir cache
+; RUN: ld.lld --thinlto-cache-dir=cache --save-temps -o out b.bc a.bc -M | FileCheck %s --check-prefix=MAP
+; RUN: ls out1.lto.o a.bc.0.preopt.bc b.bc.0.preopt.bc
+
+; MAP: llvmcache-{{.*}}:(.text)
+; MAP: llvmcache-{{.*}}:(.text)
+
 ;; Check that mllvm options participate in the cache key
-; RUN: rm -rf %t.cache && mkdir %t.cache
-; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o
-; RUN: ls %t.cache | count 3
-; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -enable-ml-inliner=default
-; RUN: ls %t.cache | count 5
+; RUN: rm -rf cache && mkdir cache
+; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc
+; RUN: ls cache | count 3
+; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -enable-ml-inliner=default
+; RUN: ls cache | count 5
 
 ;; Adding another option resuls in 2 more cache entries
-; RUN: rm -rf %t.cache && mkdir %t.cache
-; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o
-; RUN: ls %t.cache | count 3
-; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -enable-ml-inliner=default
-; RUN: ls %t.cache | count 5
-; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -enable-ml-inliner=default -mllvm -max-devirt-iterations=1
-; RUN: ls %t.cache | count 7
+; RUN: rm -rf cache && mkdir cache
+; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc
+; RUN: ls cache | count 3
+; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -enable-ml-inliner=default
+; RUN: ls cache | count 5
+; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -enable-ml-inliner=default -mllvm -max-devirt-iterations=1
+; RUN: ls cache | count 7
 
 ;; Changing order may matter - e.g. if overriding -mllvm options - so we get 2 more entries
-; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -max-devirt-iterations=1 -mllvm -enable-ml-inliner=default
-; RUN: ls %t.cache | count 9
+; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -max-devirt-iterations=1 -mllvm -enable-ml-inliner=default
+; RUN: ls cache | count 9
 
 ;; Going back to a pre-cached order doesn't create more entries.
-; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -enable-ml-inliner=default -mllvm -max-devirt-iterations=1 -; RUN: ls %t.cache | count 9 +; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -enable-ml-inliner=default -mllvm -max-devirt-iterations=1 +; RUN: ls cache | count 9 ;; Different flag values matter -; RUN: rm -rf %t.cache && mkdir %t.cache -; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -enable-ml-inliner=default -mllvm -max-devirt-iterations=2 -; RUN: ls %t.cache | count 3 -; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -enable-ml-inliner=default -mllvm -max-devirt-iterations=1 -; RUN: ls %t.cache | count 5 +; RUN: rm -rf cache && mkdir cache +; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -enable-ml-inliner=default -mllvm -max-devirt-iterations=2 +; RUN: ls cache | count 3 +; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -enable-ml-inliner=default -mllvm -max-devirt-iterations=1 +; RUN: ls cache | count 5 ;; Same flag value passed to different flags matters, and switching the order ;; of the two flags matters. -; RUN: rm -rf %t.cache && mkdir %t.cache -; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -enable-ml-inliner=default -; RUN: ls %t.cache | count 3 -; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -emit-dwarf-unwind=default -; RUN: ls %t.cache | count 5 -; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -enable-ml-inliner=default -; RUN: ls %t.cache | count 5 -; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -enable-ml-inliner=default -mllvm -emit-dwarf-unwind=default -; RUN: ls %t.cache | count 7 -; RUN: ld.lld --thinlto-cache-dir=%t.cache -o %t3 %t2.o %t.o -mllvm -emit-dwarf-unwind=default -mllvm -enable-ml-inliner=default -; RUN: ls %t.cache | count 9 +; RUN: rm -rf cache && mkdir cache +; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -enable-ml-inliner=default +; RUN: ls cache | count 3 +; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -emit-dwarf-unwind=default +; RUN: ls cache | count 5 +; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -enable-ml-inliner=default +; RUN: ls cache | count 5 +; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -enable-ml-inliner=default -mllvm -emit-dwarf-unwind=default +; RUN: ls cache | count 7 +; RUN: ld.lld --thinlto-cache-dir=cache -o out b.bc a.bc -mllvm -emit-dwarf-unwind=default -mllvm -enable-ml-inliner=default +; RUN: ls cache | count 9 target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/lld/test/ELF/lto/thinlto.ll b/lld/test/ELF/lto/thinlto.ll index 4145007d9b201..f42a0323ff044 100644 --- a/lld/test/ELF/lto/thinlto.ll +++ b/lld/test/ELF/lto/thinlto.ll @@ -3,66 +3,66 @@ ; Basic ThinLTO tests. 
; RUN: rm -rf %t && mkdir %t && cd %t ; RUN: mkdir d e -; RUN: opt -module-summary %s -o 1.o -; RUN: opt -module-summary %p/Inputs/thinlto.ll -o d/2.o +; RUN: opt -module-summary %s -o a.o +; RUN: opt -module-summary %p/Inputs/thinlto.ll -o d/b.o ; First force single-threaded mode ; RUN: rm -f e/out1.lto.o e/out2.lto.o -; RUN: ld.lld -save-temps --thinlto-jobs=1 -shared 1.o d/2.o -o e/out +; RUN: ld.lld -save-temps --thinlto-jobs=1 -shared a.o d/b.o -o e/out ; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ; Next force multi-threaded mode ; RUN: rm -f e/out1.lto.o e/out2.lto.o -; RUN: ld.lld -save-temps --thinlto-jobs=2 -shared 1.o d/2.o -o e/out +; RUN: ld.lld -save-temps --thinlto-jobs=2 -shared a.o d/b.o -o e/out ; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ;; --plugin-opt=jobs= is an alias. ; RUN: rm -f e/out1.lto.o e/out2.lto.o -; RUN: ld.lld -save-temps --plugin-opt=jobs=2 -shared 1.o d/2.o -o e/out +; RUN: ld.lld -save-temps --plugin-opt=jobs=2 -shared a.o d/b.o -o e/out ; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ;; --thinlto-jobs= defaults to --threads=. ; RUN: rm -f e/out1.lto.o e/out2.lto.o -; RUN: ld.lld -save-temps --threads=2 -shared 1.o d/2.o -o e/out +; RUN: ld.lld -save-temps --threads=2 -shared a.o d/b.o -o e/out ; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ;; --thinlto-jobs= overrides --threads=. ; RUN: rm -f e/out1.lto.o e/out2.lto.o -; RUN: ld.lld -save-temps --threads=1 --plugin-opt=jobs=2 -shared 1.o d/2.o -o e/out +; RUN: ld.lld -save-temps --threads=1 --plugin-opt=jobs=2 -shared a.o d/b.o -o e/out ; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ; Test with all threads, on all cores, on all CPU sockets ; RUN: rm -f e/out1.lto.o e/out2.lto.o -; RUN: ld.lld -save-temps --thinlto-jobs=all -shared 1.o d/2.o -o e/out +; RUN: ld.lld -save-temps --thinlto-jobs=all -shared a.o d/b.o -o e/out ; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ; Test with many more threads than the system has ; RUN: rm -f e/out1.lto.o e/out2.lto.o -; RUN: ld.lld -save-temps --thinlto-jobs=100 -shared 1.o d/2.o -o e/out +; RUN: ld.lld -save-temps --thinlto-jobs=100 -shared a.o d/b.o -o e/out ; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ; Test with a bad value ; RUN: rm -f e/out1.lto.o e/out2.lto.o -; RUN: not ld.lld -save-temps --thinlto-jobs=foo -shared 1.o d/2.o -o e/out 2>&1 | FileCheck %s --check-prefix=BAD-JOBS +; RUN: not ld.lld -save-temps --thinlto-jobs=foo -shared a.o d/b.o -o e/out 2>&1 | FileCheck %s --check-prefix=BAD-JOBS ; BAD-JOBS: error: --thinlto-jobs: invalid job count: foo ; Then check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meanning one thread per hardware core -- not SMT) -; RUN: ld.lld -shared -save-temps 1.o d/2.o -o e/out +; RUN: ld.lld -shared -save-temps a.o d/b.o -o e/out ; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 ; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 ; Check that -save-temps is usable with thin archives ; RUN: mkdir dir -; RUN: cp d/2.o dir/t.o +; 
RUN: cp d/b.o dir/t.o
; RUN: llvm-ar rcsT dir/t.a dir/t.o
-; RUN: ld.lld -save-temps 1.o dir/t.a -o %t.null
+; RUN: ld.lld -save-temps a.o dir/t.a -o %t.null
; RUN: ls dir/t.a*.0.preopt.bc

; NM1: T f

From 150a58bed08f9f83ae4157a033ef1c67c210bbc4 Mon Sep 17 00:00:00 2001
From: Piotr Zegar
Date: Tue, 23 Jan 2024 07:09:24 +0000
Subject: [PATCH 560/843] [clang-tidy][DOC] Update list.rst

Update list.rst to put checks in alphabetical order
---
 clang-tools-extra/docs/clang-tidy/checks/list.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang-tools-extra/docs/clang-tidy/checks/list.rst b/clang-tools-extra/docs/clang-tidy/checks/list.rst
index 1fac3874b3c32..f773e80b562e4 100644
--- a/clang-tools-extra/docs/clang-tidy/checks/list.rst
+++ b/clang-tools-extra/docs/clang-tidy/checks/list.rst
@@ -355,7 +355,6 @@ Clang-Tidy Checks
   :doc:`readability-identifier-length <readability/identifier-length>`,
   :doc:`readability-identifier-naming <readability/identifier-naming>`, "Yes"
   :doc:`readability-implicit-bool-conversion <readability/implicit-bool-conversion>`, "Yes"
-   :doc:`readability-redundant-inline-specifier <readability/redundant-inline-specifier>`, "Yes"
   :doc:`readability-inconsistent-declaration-parameter-name <readability/inconsistent-declaration-parameter-name>`, "Yes"
   :doc:`readability-isolate-declaration <readability/isolate-declaration>`, "Yes"
   :doc:`readability-magic-numbers <readability/magic-numbers>`,
@@ -371,6 +370,7 @@ Clang-Tidy Checks
   :doc:`readability-redundant-control-flow <readability/redundant-control-flow>`, "Yes"
   :doc:`readability-redundant-declaration <readability/redundant-declaration>`, "Yes"
   :doc:`readability-redundant-function-ptr-dereference <readability/redundant-function-ptr-dereference>`, "Yes"
+   :doc:`readability-redundant-inline-specifier <readability/redundant-inline-specifier>`, "Yes"
   :doc:`readability-redundant-member-init <readability/redundant-member-init>`, "Yes"
   :doc:`readability-redundant-preprocessor <readability/redundant-preprocessor>`,
   :doc:`readability-redundant-smartptr-get <readability/redundant-smartptr-get>`, "Yes"

From 297b77036e82b7684e1823c14b318e365aed5b26 Mon Sep 17 00:00:00 2001
From: Simeon K <5235180+simeonkr@users.noreply.github.com>
Date: Tue, 23 Jan 2024 07:10:25 +0000
Subject: [PATCH 561/843] [RISCV] Fix stack size computation when M extension
 disabled (#78602)

Ensure that getVLENFactoredAmount does not fail when the scale amount
requires the use of a non-trivial multiplication but the M extension is
not enabled. In such a case, perform the multiplication using shifts and
adds, as illustrated in the sketch below.
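As a rough illustration only (plain scalar C++ with a hypothetical helper
name, not the actual MachineInstr-building code in this patch), the emitted
SLLI/ADD sequence corresponds to the following shift-and-add decomposition
of a multiplication by a constant:

  #include <cstdint>

  // Multiply Value by the constant NumOfVReg using only shifts and adds,
  // mirroring the instruction sequence built when M/Zmmul is unavailable.
  uint64_t mulByConstant(uint64_t Value, uint32_t NumOfVReg) {
    uint64_t Acc = 0; // materialized with `li a1, 0`
    uint32_t PrevShiftAmount = 0;
    for (uint32_t ShiftAmount = 0; NumOfVReg >> ShiftAmount; ++ShiftAmount) {
      if (NumOfVReg & (1u << ShiftAmount)) {
        if (ShiftAmount) // hop up to the next set bit (SLLI)
          Value <<= ShiftAmount - PrevShiftAmount;
        if (NumOfVReg >> (ShiftAmount + 1)) // more set bits remain (ADD)
          Acc += Value;
        PrevShiftAmount = ShiftAmount;
      }
    }
    return Value + Acc; // final ADD
  }

For example, NumOfVReg = 12 (0b1100) produces Value <<= 2; Acc += Value;
Value <<= 1; return Value + Acc, i.e. 8*Value + 4*Value = 12*Value.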
--- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 33 ++++++++-- .../CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll | 62 +++++++++++++++++++ 2 files changed, 89 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 9813c7a70dfc3..52a9667b44104 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3126,18 +3126,39 @@ void RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF, .addReg(ScaledRegister, RegState::Kill) .addReg(DestReg, RegState::Kill) .setMIFlag(Flag); - } else { + } else if (STI.hasStdExtM() || STI.hasStdExtZmmul()) { Register N = MRI.createVirtualRegister(&RISCV::GPRRegClass); movImm(MBB, II, DL, N, NumOfVReg, Flag); - if (!STI.hasStdExtM() && !STI.hasStdExtZmmul()) - MF.getFunction().getContext().diagnose(DiagnosticInfoUnsupported{ - MF.getFunction(), - "M- or Zmmul-extension must be enabled to calculate the vscaled size/" - "offset."}); BuildMI(MBB, II, DL, get(RISCV::MUL), DestReg) .addReg(DestReg, RegState::Kill) .addReg(N, RegState::Kill) .setMIFlag(Flag); + } else { + Register Acc = MRI.createVirtualRegister(&RISCV::GPRRegClass); + BuildMI(MBB, II, DL, get(RISCV::ADDI), Acc) + .addReg(RISCV::X0) + .addImm(0) + .setMIFlag(Flag); + uint32_t PrevShiftAmount = 0; + for (uint32_t ShiftAmount = 0; NumOfVReg >> ShiftAmount; ShiftAmount++) { + if (NumOfVReg & (1 << ShiftAmount)) { + if (ShiftAmount) + BuildMI(MBB, II, DL, get(RISCV::SLLI), DestReg) + .addReg(DestReg, RegState::Kill) + .addImm(ShiftAmount - PrevShiftAmount) + .setMIFlag(Flag); + if (NumOfVReg >> (ShiftAmount + 1)) + BuildMI(MBB, II, DL, get(RISCV::ADD), Acc) + .addReg(Acc, RegState::Kill) + .addReg(DestReg) + .setMIFlag(Flag); + PrevShiftAmount = ShiftAmount; + } + } + BuildMI(MBB, II, DL, get(RISCV::ADD), DestReg) + .addReg(DestReg, RegState::Kill) + .addReg(Acc) + .setMIFlag(Flag); } } diff --git a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll index e6adb725110a4..78bec6c68c3f6 100644 --- a/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll +++ b/llvm/test/CodeGen/RISCV/rvv/allocate-lmul-2-4-8.ll @@ -3,6 +3,8 @@ ; RUN: | FileCheck %s --check-prefixes=CHECK,NOZBA ; RUN: llc -mtriple=riscv64 -mattr=+m,+v,+zba -verify-machineinstrs < %s \ ; RUN: | FileCheck %s --check-prefixes=CHECK,ZBA +; RUN: llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=CHECK,NOMUL define void @lmul1() nounwind { ; CHECK-LABEL: lmul1: @@ -243,6 +245,26 @@ define void @lmul4_and_2_x2_1() nounwind { ; ZBA-NEXT: ld s0, 32(sp) # 8-byte Folded Reload ; ZBA-NEXT: addi sp, sp, 48 ; ZBA-NEXT: ret +; +; NOMUL-LABEL: lmul4_and_2_x2_1: +; NOMUL: # %bb.0: +; NOMUL-NEXT: addi sp, sp, -48 +; NOMUL-NEXT: sd ra, 40(sp) # 8-byte Folded Spill +; NOMUL-NEXT: sd s0, 32(sp) # 8-byte Folded Spill +; NOMUL-NEXT: addi s0, sp, 48 +; NOMUL-NEXT: csrr a0, vlenb +; NOMUL-NEXT: li a1, 0 +; NOMUL-NEXT: slli a0, a0, 2 +; NOMUL-NEXT: add a1, a1, a0 +; NOMUL-NEXT: slli a0, a0, 1 +; NOMUL-NEXT: add a0, a0, a1 +; NOMUL-NEXT: sub sp, sp, a0 +; NOMUL-NEXT: andi sp, sp, -32 +; NOMUL-NEXT: addi sp, s0, -48 +; NOMUL-NEXT: ld ra, 40(sp) # 8-byte Folded Reload +; NOMUL-NEXT: ld s0, 32(sp) # 8-byte Folded Reload +; NOMUL-NEXT: addi sp, sp, 48 +; NOMUL-NEXT: ret %v1 = alloca %v3 = alloca %v2 = alloca @@ -425,6 +447,26 @@ define void @lmul_8_x5() nounwind { ; ZBA-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; ZBA-NEXT: addi sp, sp, 80 ; ZBA-NEXT: ret +; +; 
NOMUL-LABEL: lmul_8_x5: +; NOMUL: # %bb.0: +; NOMUL-NEXT: addi sp, sp, -80 +; NOMUL-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; NOMUL-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; NOMUL-NEXT: addi s0, sp, 80 +; NOMUL-NEXT: csrr a0, vlenb +; NOMUL-NEXT: li a1, 0 +; NOMUL-NEXT: slli a0, a0, 3 +; NOMUL-NEXT: add a1, a1, a0 +; NOMUL-NEXT: slli a0, a0, 2 +; NOMUL-NEXT: add a0, a0, a1 +; NOMUL-NEXT: sub sp, sp, a0 +; NOMUL-NEXT: andi sp, sp, -64 +; NOMUL-NEXT: addi sp, s0, -80 +; NOMUL-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; NOMUL-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; NOMUL-NEXT: addi sp, sp, 80 +; NOMUL-NEXT: ret %v1 = alloca %v2 = alloca %v3 = alloca @@ -467,6 +509,26 @@ define void @lmul_8_x9() nounwind { ; ZBA-NEXT: ld s0, 64(sp) # 8-byte Folded Reload ; ZBA-NEXT: addi sp, sp, 80 ; ZBA-NEXT: ret +; +; NOMUL-LABEL: lmul_8_x9: +; NOMUL: # %bb.0: +; NOMUL-NEXT: addi sp, sp, -80 +; NOMUL-NEXT: sd ra, 72(sp) # 8-byte Folded Spill +; NOMUL-NEXT: sd s0, 64(sp) # 8-byte Folded Spill +; NOMUL-NEXT: addi s0, sp, 80 +; NOMUL-NEXT: csrr a0, vlenb +; NOMUL-NEXT: li a1, 0 +; NOMUL-NEXT: slli a0, a0, 3 +; NOMUL-NEXT: add a1, a1, a0 +; NOMUL-NEXT: slli a0, a0, 3 +; NOMUL-NEXT: add a0, a0, a1 +; NOMUL-NEXT: sub sp, sp, a0 +; NOMUL-NEXT: andi sp, sp, -64 +; NOMUL-NEXT: addi sp, s0, -80 +; NOMUL-NEXT: ld ra, 72(sp) # 8-byte Folded Reload +; NOMUL-NEXT: ld s0, 64(sp) # 8-byte Folded Reload +; NOMUL-NEXT: addi sp, sp, 80 +; NOMUL-NEXT: ret %v1 = alloca %v2 = alloca %v3 = alloca From f799f936929c232a16abc7c520a10fecadbf05f9 Mon Sep 17 00:00:00 2001 From: yjijd Date: Tue, 23 Jan 2024 15:16:23 +0800 Subject: [PATCH 562/843] [CodeGen][LoongArch] Set SINT_TO_FP/UINT_TO_FP to legal for vector types (#78924) Support the following conversions: v4i32->v4f32, v2i64->v2f64(LSX) v8i32->v8f32, v4i64->v4f64(LASX) v4i32->v4f64, v4i64->v4f32(LASX) --- .../LoongArch/LoongArchISelLowering.cpp | 4 ++ .../LoongArch/LoongArchLASXInstrInfo.td | 22 +++++++ .../Target/LoongArch/LoongArchLSXInstrInfo.td | 8 +++ .../LoongArch/lasx/ir-instruction/sitofp.ll | 57 +++++++++++++++++++ .../LoongArch/lasx/ir-instruction/uitofp.ll | 57 +++++++++++++++++++ .../LoongArch/lsx/ir-instruction/sitofp.ll | 28 +++++++++ .../LoongArch/lsx/ir-instruction/uitofp.ll | 28 +++++++++ 7 files changed, 204 insertions(+) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sitofp.ll create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/uitofp.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sitofp.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/uitofp.ll diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 70f782b812702..0d6eb4b8149c4 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -265,6 +265,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); } + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, + {MVT::v4i32, MVT::v2i64}, Legal); for (MVT VT : {MVT::v4f32, MVT::v2f64}) { setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); @@ -307,6 +309,8 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); } + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, + {MVT::v8i32, 
MVT::v4i32, MVT::v4i64}, Legal); for (MVT VT : {MVT::v8f32, MVT::v4f64}) { setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 1d73133ad1120..2426dad507802 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1615,6 +1615,28 @@ foreach vt = [v32i8, v16i16, v8i32, v4i64, v8f32, v4f64] in def : Pat<(fneg (v8f32 LASX256:$xj)), (XVBITREVI_W LASX256:$xj, 31)>; def : Pat<(fneg (v4f64 LASX256:$xj)), (XVBITREVI_D LASX256:$xj, 63)>; +// XVFFINT_{S_W/D_L} +def : Pat<(v8f32 (sint_to_fp v8i32:$vj)), (XVFFINT_S_W v8i32:$vj)>; +def : Pat<(v4f64 (sint_to_fp v4i64:$vj)), (XVFFINT_D_L v4i64:$vj)>; +def : Pat<(v4f64 (sint_to_fp v4i32:$vj)), + (XVFFINT_D_L (VEXT2XV_D_W (SUBREG_TO_REG (i64 0), v4i32:$vj, + sub_128)))>; +def : Pat<(v4f32 (sint_to_fp v4i64:$vj)), + (EXTRACT_SUBREG (XVFCVT_S_D (XVPERMI_D (XVFFINT_D_L v4i64:$vj), 238), + (XVFFINT_D_L v4i64:$vj)), + sub_128)>; + +// XVFFINT_{S_WU/D_LU} +def : Pat<(v8f32 (uint_to_fp v8i32:$vj)), (XVFFINT_S_WU v8i32:$vj)>; +def : Pat<(v4f64 (uint_to_fp v4i64:$vj)), (XVFFINT_D_LU v4i64:$vj)>; +def : Pat<(v4f64 (uint_to_fp v4i32:$vj)), + (XVFFINT_D_LU (VEXT2XV_DU_WU (SUBREG_TO_REG (i64 0), v4i32:$vj, + sub_128)))>; +def : Pat<(v4f32 (uint_to_fp v4i64:$vj)), + (EXTRACT_SUBREG (XVFCVT_S_D (XVPERMI_D (XVFFINT_D_LU v4i64:$vj), 238), + (XVFFINT_D_LU v4i64:$vj)), + sub_128)>; + } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index f9d0647161f5b..71256353116a7 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1746,6 +1746,14 @@ foreach vt = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in def : Pat<(fneg (v4f32 LSX128:$vj)), (VBITREVI_W LSX128:$vj, 31)>; def : Pat<(fneg (v2f64 LSX128:$vj)), (VBITREVI_D LSX128:$vj, 63)>; +// VFFINT_{S_W/D_L} +def : Pat<(v4f32 (sint_to_fp v4i32:$vj)), (VFFINT_S_W v4i32:$vj)>; +def : Pat<(v2f64 (sint_to_fp v2i64:$vj)), (VFFINT_D_L v2i64:$vj)>; + +// VFFINT_{S_WU/D_LU} +def : Pat<(v4f32 (uint_to_fp v4i32:$vj)), (VFFINT_S_WU v4i32:$vj)>; +def : Pat<(v2f64 (uint_to_fp v2i64:$vj)), (VFFINT_D_LU v2i64:$vj)>; + } // Predicates = [HasExtLSX] /// Intrinsic pattern diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sitofp.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sitofp.ll new file mode 100644 index 0000000000000..208a758ea4e9a --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/sitofp.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @sitofp_v8i32_v8f32(ptr %res, ptr %in){ +; CHECK-LABEL: sitofp_v8i32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvffint.s.w $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i32>, ptr %in + %v1 = sitofp <8 x i32> %v0 to <8 x float> + store <8 x float> %v1, ptr %res + ret void +} + +define void @sitofp_v4f64_v4f64(ptr %res, ptr %in){ +; CHECK-LABEL: sitofp_v4f64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvffint.d.l $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i64>, 
ptr %in + %v1 = sitofp <4 x i64> %v0 to <4 x double> + store <4 x double> %v1, ptr %res + ret void +} + +define void @sitofp_v4i64_v4f32(ptr %res, ptr %in){ +; CHECK-LABEL: sitofp_v4i64_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvffint.d.l $xr0, $xr0 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 238 +; CHECK-NEXT: xvfcvt.s.d $xr0, $xr1, $xr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i64>, ptr %in + %v1 = sitofp <4 x i64> %v0 to <4 x float> + store <4 x float> %v1, ptr %res + ret void +} + +define void @sitofp_v4i32_v4f64(ptr %res, ptr %in){ +; CHECK-LABEL: sitofp_v4i32_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vext2xv.d.w $xr0, $xr0 +; CHECK-NEXT: xvffint.d.l $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i32>, ptr %in + %v1 = sitofp <4 x i32> %v0 to <4 x double> + store <4 x double> %v1, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/uitofp.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/uitofp.ll new file mode 100644 index 0000000000000..70cf71c4cec21 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/uitofp.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @uitofp_v8i32_v8f32(ptr %res, ptr %in){ +; CHECK-LABEL: uitofp_v8i32_v8f32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvffint.s.wu $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x i32>, ptr %in + %v1 = uitofp <8 x i32> %v0 to <8 x float> + store <8 x float> %v1, ptr %res + ret void +} + +define void @uitofp_v4f64_v4f64(ptr %res, ptr %in){ +; CHECK-LABEL: uitofp_v4f64_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvffint.d.lu $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i64>, ptr %in + %v1 = uitofp <4 x i64> %v0 to <4 x double> + store <4 x double> %v1, ptr %res + ret void +} + +define void @uitofp_v4i64_v4f32(ptr %res, ptr %in){ +; CHECK-LABEL: uitofp_v4i64_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvffint.d.lu $xr0, $xr0 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 238 +; CHECK-NEXT: xvfcvt.s.d $xr0, $xr1, $xr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i64>, ptr %in + %v1 = uitofp <4 x i64> %v0 to <4 x float> + store <4 x float> %v1, ptr %res + ret void +} + +define void @uitofp_v4i32_v4f64(ptr %res, ptr %in){ +; CHECK-LABEL: uitofp_v4i32_v4f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vext2xv.du.wu $xr0, $xr0 +; CHECK-NEXT: xvffint.d.lu $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i32>, ptr %in + %v1 = uitofp <4 x i32> %v0 to <4 x double> + store <4 x double> %v1, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sitofp.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sitofp.ll new file mode 100644 index 0000000000000..1e820a37a2409 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/sitofp.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @sitofp_v4i32_v4f32(ptr %res, ptr %in){ +; CHECK-LABEL: sitofp_v4i32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: 
vffint.s.w $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i32>, ptr %in + %v1 = sitofp <4 x i32> %v0 to <4 x float> + store <4 x float> %v1, ptr %res + ret void +} + +define void @sitofp_v2i64_v2f64(ptr %res, ptr %in){ +; CHECK-LABEL: sitofp_v2i64_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vffint.d.l $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <2 x i64>, ptr %in + %v1 = sitofp <2 x i64> %v0 to <2 x double> + store <2 x double> %v1, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/uitofp.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/uitofp.ll new file mode 100644 index 0000000000000..3d4913f12e57e --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/uitofp.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @uitofp_v4i32_v4f32(ptr %res, ptr %in){ +; CHECK-LABEL: uitofp_v4i32_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vffint.s.wu $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x i32>, ptr %in + %v1 = uitofp <4 x i32> %v0 to <4 x float> + store <4 x float> %v1, ptr %res + ret void +} + +define void @uitofp_v2i64_v2f64(ptr %res, ptr %in){ +; CHECK-LABEL: uitofp_v2i64_v2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vffint.d.lu $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <2 x i64>, ptr %in + %v1 = uitofp <2 x i64> %v0 to <2 x double> + store <2 x double> %v1, ptr %res + ret void +} From 9e2c0f000549991ea5585d6799ade92811a8faae Mon Sep 17 00:00:00 2001 From: Lu Weining Date: Tue, 23 Jan 2024 15:27:06 +0800 Subject: [PATCH 563/843] [docs] Add llvm & clang release notes for LoongArch (#79097) --- clang/docs/ReleaseNotes.rst | 14 ++++++++++++-- llvm/docs/ReleaseNotes.rst | 17 +++++++++++++++-- 2 files changed, 27 insertions(+), 4 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index cb15dfb99bec9..93eecbe0e1363 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1189,8 +1189,14 @@ Windows Support LoongArch Support ^^^^^^^^^^^^^^^^^ -- The ``model`` attribute is now supported for overriding the default code - model used to access global variables. The following values are supported: +- Added builtins support for all LSX (128-bits SIMD) and LASX (256-bits SIMD) + instructions. +- Added builtins support for approximate calculation instructions that were + introduced in LoongArch Reference Manual V1.10. +- Made ``-mcmodel=`` compatible with LoongArch gcc that accepted ``normal``, + ``medium`` and ``extreme``. +- The ``model`` attribute was now supported for overriding the default code + model used to access global variables. The following values were supported: ``normal``, ``medium`` and ``extreme``. *Example Code*: @@ -1199,6 +1205,10 @@ LoongArch Support int var __attribute((model("extreme"))); +- Default to ``-fno-direct-access-external-data`` for non-PIC. +- An ABI mismatch with gcc/g++ about empty structs/unions passing was fixed. +- ``_mcount`` was generated instead of ``mcount``. 
+
RISC-V Support
^^^^^^^^^^^^^^
- Unaligned memory accesses can be toggled by ``-m[no-]unaligned-access`` or the
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index c17c834c8081b..8d75b6415e557 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -132,8 +132,21 @@ Changes to the Hexagon Backend
Changes to the LoongArch Backend
--------------------------------
-* The code model of global variables can now be overridden by means of
-  the newly added LLVM IR attribute, ``code_model``.
+
+* Added intrinsics support for all LSX (128-bits SIMD) and LASX (256-bits SIMD)
+  instructions.
+* Added definition and intrinsics support for new instructions that were
+  introduced in LoongArch Reference Manual V1.10.
+* Emitted adjacent ``pcaddu18i+jirl`` instruction sequence with one relocation
+  ``R_LARCH_CALL36`` instead of ``pcalau12i+jirl`` with two relocations
+  ``R_LARCH_PCALA_{HI20,LO12}`` for function call in medium code model.
+* The code model of global variables can now be overridden by means of the newly
+  added LLVM IR attribute, ``code_model``.
+* Added support for the ``llvm.is.fpclass`` intrinsic.
+* ``mulodi4`` and ``muloti4`` libcalls were disabled due to absence in libgcc.
+* Added initial support for auto vectorization.
+* Added initial support for linker relaxation.
+* Assorted codegen improvements.

Changes to the MIPS Backend
---------------------------

From 8938bc0ad0651f8d481cca2b2b98345d34b3aabe Mon Sep 17 00:00:00 2001
From: Konstantin Varlamov
Date: Mon, 22 Jan 2024 23:31:58 -0800
Subject: [PATCH 564/843] [libc++][hardening] Categorize assertions related to
 strict weak ordering (#77405)

If a user passes a comparator that doesn't satisfy strict weak ordering
(see https://eel.is/c++draft/algorithms#alg.sorting.general) to a sorting
algorithm, the algorithm can produce an incorrect result or even lead to an
out-of-bounds access.

Unfortunately, comprehensively validating that a given comparator indeed
satisfies the strict weak ordering requirement is prohibitively expensive
(see [the related
RFC](https://discourse.llvm.org/t/rfc-strict-weak-ordering-checks-in-the-debug-libc/70217)).
As a result, we have three independent sets of checks:

- assertions that catch out-of-bounds accesses within the algorithms'
implementation. These are relatively cheap; however, they cannot catch the
underlying cause and cannot prevent the case where an invalid comparator
would result in an incorrectly-sorted sequence without actually triggering
an OOB access;

- debug comparators that wrap a given comparator and on each comparison
check that if `(a < b)`, then `!(b < a)`, where `<` stands for the
user-provided comparator. This performs up to 2x the number of comparisons
but doesn't affect the algorithmic complexity. While this approach can find
more issues, it is still a heuristic;

- a comprehensive check of the comparator that validates up to 100 elements
in the resulting sorted sequence (see the RFC above for details). The check
is expensive but the 100 element limit can somewhat compensate for that,
especially for large values of `N`.

The first set of checks is enabled in the fast hardening mode while the
other two are only enabled in the debug mode.

This patch also removes the `_LIBCPP_DEBUG_STRICT_WEAK_ORDERING_CHECK`
macro that previously was used to selectively enable the 100-element check.
Now this check is enabled unconditionally in the debug mode.

Also, introduce a new category `_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT`.
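For illustration only (a hypothetical user-side example, not code from this
patch), a comparator built on `<=` already breaks the irreflexivity part of
strict weak ordering, since `comp(a, a)` is true:

  #include <algorithm>
  #include <vector>

  int main() {
    std::vector<int> v = {3, 1, 2, 2};
    // Invalid: `<=` is not irreflexive, so this call violates the strict
    // weak ordering precondition of std::sort and the behavior is
    // undefined. The checks introduced here are meant to flag exactly this
    // kind of mistake in the debug mode.
    std::sort(v.begin(), v.end(), [](int a, int b) { return a <= b; });
  }

Checks that catch such mistakes are grouped under the new
`_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT` category.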
This category is intended for checking the semantic requirements from the Standard. Typically, these are hard or impossible to completely validate, so these checks are expected to be heuristic in nature and potentially quite expensive. See https://reviews.llvm.org/D150264 for additional background. Fixes #71496 --- libcxx/include/__algorithm/comp_ref_type.h | 5 +- libcxx/include/__algorithm/nth_element.h | 8 +- libcxx/include/__algorithm/sort.h | 22 +- .../__algorithm/three_way_comp_ref_type.h | 6 +- libcxx/include/__config | 8 + .../strict_weak_ordering_check.h | 14 +- .../assert.sort.invalid_comparator.pass.cpp | 255 ------------------ ...ssert.sort.invalid_comparator.oob.pass.cpp | 72 +++++ .../assert.sort.invalid_comparator.pass.cpp | 195 ++++++++++++++ .../bad_comparator_values.h | 6 +- .../invalid_comparator_utilities.h | 85 ++++++ 11 files changed, 390 insertions(+), 286 deletions(-) delete mode 100644 libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator.pass.cpp create mode 100644 libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/assert.sort.invalid_comparator.oob.pass.cpp create mode 100644 libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/assert.sort.invalid_comparator.pass.cpp rename libcxx/test/libcxx/algorithms/alg.sorting/{ => assert.sort.invalid_comparator}/bad_comparator_values.h (98%) create mode 100644 libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/invalid_comparator_utilities.h diff --git a/libcxx/include/__algorithm/comp_ref_type.h b/libcxx/include/__algorithm/comp_ref_type.h index 15f4a535a30bf..aa9350c38caa9 100644 --- a/libcxx/include/__algorithm/comp_ref_type.h +++ b/libcxx/include/__algorithm/comp_ref_type.h @@ -44,7 +44,7 @@ struct __debug_less { _LIBCPP_CONSTEXPR_SINCE_CXX14 inline _LIBCPP_HIDE_FROM_ABI decltype((void)std::declval<_Compare&>()( std::declval<_LHS&>(), std::declval<_RHS&>())) __do_compare_assert(int, _LHS& __l, _RHS& __r) { - _LIBCPP_ASSERT_UNCATEGORIZED(!__comp_(__l, __r), "Comparator does not induce a strict weak ordering"); + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(!__comp_(__l, __r), "Comparator does not induce a strict weak ordering"); (void)__l; (void)__r; } @@ -53,8 +53,7 @@ struct __debug_less { _LIBCPP_CONSTEXPR_SINCE_CXX14 inline _LIBCPP_HIDE_FROM_ABI void __do_compare_assert(long, _LHS&, _RHS&) {} }; -// Pass the comparator by lvalue reference. Or in debug mode, using a -// debugging wrapper that stores a reference. +// Pass the comparator by lvalue reference. Or in the debug mode, using a debugging wrapper that stores a reference. 
#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
template <class _Comp>
using __comp_ref_type = __debug_less<_Comp>;
diff --git a/libcxx/include/__algorithm/nth_element.h b/libcxx/include/__algorithm/nth_element.h
index a059705125951..37ddfbdacf044 100644
--- a/libcxx/include/__algorithm/nth_element.h
+++ b/libcxx/include/__algorithm/nth_element.h
@@ -114,12 +114,12 @@ __nth_element(
while (true) {
while (!__comp(*__first, *__i)) {
++__i;
-      _LIBCPP_ASSERT_UNCATEGORIZED(
+      _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__i != __last,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
}
do {
-      _LIBCPP_ASSERT_UNCATEGORIZED(
+      _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__j != __first,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
--__j;
@@ -150,13 +150,13 @@ __nth_element(
// __m still guards upward moving __i
while (__comp(*__i, *__m)) {
++__i;
-      _LIBCPP_ASSERT_UNCATEGORIZED(
+      _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__i != __last,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
}
// It is now known that a guard exists for downward moving __j
do {
-      _LIBCPP_ASSERT_UNCATEGORIZED(
+      _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__j != __first,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
--__j;
diff --git a/libcxx/include/__algorithm/sort.h b/libcxx/include/__algorithm/sort.h
index ac47489af0aac..451133a2d193d 100644
--- a/libcxx/include/__algorithm/sort.h
+++ b/libcxx/include/__algorithm/sort.h
@@ -325,7 +325,7 @@ __insertion_sort_unguarded(_RandomAccessIterator const __first, _RandomAccessIte
do {
*__j = _Ops::__iter_move(__k);
__j = __k;
-      _LIBCPP_ASSERT_UNCATEGORIZED(
+      _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__k != __leftmost,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
} while (__comp(__t, *--__k)); // No need for bounds check due to the assumption stated above.
@@ -544,7 +544,7 @@ __bitset_partition(_RandomAccessIterator __first, _RandomAccessIterator __last,
// Not guarded since we know the last element is greater than the pivot.
do {
++__first;
-    _LIBCPP_ASSERT_UNCATEGORIZED(
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__first != __end,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
} while (!__comp(__pivot, *__first));
@@ -557,7 +557,7 @@ __bitset_partition(_RandomAccessIterator __first, _RandomAccessIterator __last,
// It will be always guarded because __introsort will do the median-of-three
// before calling this.
do {
-    _LIBCPP_ASSERT_UNCATEGORIZED(
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__last != __begin,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
--__last;
@@ -635,7 +635,7 @@ __partition_with_equals_on_right(_RandomAccessIte
// this.
do {
++__first;
-    _LIBCPP_ASSERT_UNCATEGORIZED(
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__first != __end,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
} while (__comp(*__first, __pivot));
@@ -647,7 +647,7 @@ __partition_with_equals_on_right(_RandomAccessIte
} else {
// Guarded.
do {
-      _LIBCPP_ASSERT_UNCATEGORIZED(
+      _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__last != __begin,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
--__last;
@@ -665,12 +665,12 @@ __partition_with_equals_on_right(_RandomAccessIte
_Ops::iter_swap(__first, __last);
do {
++__first;
-      _LIBCPP_ASSERT_UNCATEGORIZED(
+      _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__first != __end,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
} while (__comp(*__first, __pivot));
do {
-      _LIBCPP_ASSERT_UNCATEGORIZED(
+      _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__last != __begin,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
--__last;
@@ -702,7 +702,7 @@ __partition_with_equals_on_left(_RandomAccessIter
// Guarded.
do {
++__first;
-    _LIBCPP_ASSERT_UNCATEGORIZED(
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__first != __end,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
} while (!__comp(__pivot, *__first));
@@ -715,7 +715,7 @@ __partition_with_equals_on_left(_RandomAccessIter
// It will be always guarded because __introsort will do the
// median-of-three before calling this.
do {
-    _LIBCPP_ASSERT_UNCATEGORIZED(
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__last != __begin,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
--__last;
@@ -725,12 +725,12 @@ __partition_with_equals_on_left(_RandomAccessIter
_Ops::iter_swap(__first, __last);
do {
++__first;
-    _LIBCPP_ASSERT_UNCATEGORIZED(
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__first != __end,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
} while (!__comp(__pivot, *__first));
do {
-    _LIBCPP_ASSERT_UNCATEGORIZED(
+    _LIBCPP_ASSERT_VALID_ELEMENT_ACCESS(
__last != __begin,
"Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
--__last;
diff --git a/libcxx/include/__algorithm/three_way_comp_ref_type.h b/libcxx/include/__algorithm/three_way_comp_ref_type.h
index 8fd15c5d66f2f..70c5818976f07 100644
--- a/libcxx/include/__algorithm/three_way_comp_ref_type.h
+++ b/libcxx/include/__algorithm/three_way_comp_ref_type.h
@@ -50,14 +50,14 @@ struct __debug_three_way_comp {
__expected = _Order::greater;
if (__o == _Order::greater)
__expected = _Order::less;
-    _LIBCPP_ASSERT_UNCATEGORIZED(__comp_(__l, __r) == __expected, "Comparator does not induce a strict weak ordering");
+    _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(
+        __comp_(__l, __r) == __expected, "Comparator does not induce a strict weak ordering");
(void)__l;
(void)__r;
}
};

-// Pass the comparator by lvalue reference. Or in debug mode, using a
-// debugging wrapper that stores a reference.
+// Pass the comparator by lvalue reference. Or in the debug mode, using a debugging wrapper that stores a reference.
# if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
template <class _Comp>
using __three_way_comp_ref_type = __debug_three_way_comp<_Comp>;
diff --git a/libcxx/include/__config b/libcxx/include/__config
index f0275d0256d27..eab37e7576bdc 100644
--- a/libcxx/include/__config
+++ b/libcxx/include/__config
@@ -313,6 +313,10 @@
// - `_LIBCPP_ASSERT_PEDANTIC` -- checks prerequisites which are imposed by the Standard, but violating which happens to
// be benign in our implementation.
//
+// - `_LIBCPP_ASSERT_SEMANTIC_REQUIREMENT` -- checks that the given argument satisfies the semantic requirements imposed
+// by the Standard. Typically, there is no simple way to completely prove that a semantic requirement is satisfied;
+// thus, this would often be a heuristic check and it might be quite expensive.
+//
// - `_LIBCPP_ASSERT_INTERNAL` -- checks that internal invariants of the library hold. These assertions don't depend on
// user input.
//
@@ -359,6 +363,7 @@ _LIBCPP_HARDENING_MODE_DEBUG
# define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSUME(expression)
# define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSUME(expression)
# define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSUME(expression)
+# define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message) _LIBCPP_ASSUME(expression)
# define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSUME(expression)
# define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSUME(expression)

@@ -378,6 +383,7 @@ _LIBCPP_HARDENING_MODE_DEBUG
# define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSERT(expression, message)
# define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSERT(expression, message)
// Disabled checks.
+# define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message) _LIBCPP_ASSUME(expression)
# define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSUME(expression)

// Debug hardening mode checks.
@@ -394,6 +400,7 @@ _LIBCPP_HARDENING_MODE_DEBUG
# define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSERT(expression, message)
# define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSERT(expression, message)
# define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSERT(expression, message)
+# define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message) _LIBCPP_ASSERT(expression, message)
# define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSERT(expression, message)
# define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSERT(expression, message)

@@ -411,6 +418,7 @@ _LIBCPP_HARDENING_MODE_DEBUG
# define _LIBCPP_ASSERT_COMPATIBLE_ALLOCATOR(expression, message) _LIBCPP_ASSUME(expression)
# define _LIBCPP_ASSERT_ARGUMENT_WITHIN_DOMAIN(expression, message) _LIBCPP_ASSUME(expression)
# define _LIBCPP_ASSERT_PEDANTIC(expression, message) _LIBCPP_ASSUME(expression)
+# define _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT(expression, message) _LIBCPP_ASSUME(expression)
# define _LIBCPP_ASSERT_INTERNAL(expression, message) _LIBCPP_ASSUME(expression)
# define _LIBCPP_ASSERT_UNCATEGORIZED(expression, message) _LIBCPP_ASSUME(expression)

diff --git a/libcxx/include/__debug_utils/strict_weak_ordering_check.h b/libcxx/include/__debug_utils/strict_weak_ordering_check.h
index 99200ad65eac9..3a9d887284164 100644
--- a/libcxx/include/__debug_utils/strict_weak_ordering_check.h
+++ b/libcxx/include/__debug_utils/strict_weak_ordering_check.h
@@ -26,12 +26,12 @@ _LIBCPP_BEGIN_NAMESPACE_STD
template <class _RandomAccessIterator, class _Comp>
_LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX14 void
__check_strict_weak_ordering_sorted(_RandomAccessIterator __first, _RandomAccessIterator __last, _Comp& __comp) {
-#ifdef _LIBCPP_DEBUG_STRICT_WEAK_ORDERING_CHECK
+#if _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG
using __diff_t = __iter_diff_t<_RandomAccessIterator>;
using _Comp_ref = __comp_ref_type<_Comp>;
if (!__libcpp_is_constant_evaluated()) {
// Check if the range is actually sorted.
- _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT( (std::is_sorted<_RandomAccessIterator, _Comp_ref>(__first, __last, _Comp_ref(__comp))), "The range is not sorted after the sort, your comparator is not a valid strict-weak ordering"); // Limit the number of elements we need to check. @@ -46,18 +46,18 @@ __check_strict_weak_ordering_sorted(_RandomAccessIterator __first, _RandomAccess // Check that the elements from __p to __q are equal between each other. for (__diff_t __b = __p; __b < __q; ++__b) { for (__diff_t __a = __p; __a <= __b; ++__a) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT( !__comp(*(__first + __a), *(__first + __b)), "Your comparator is not a valid strict-weak ordering"); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT( !__comp(*(__first + __b), *(__first + __a)), "Your comparator is not a valid strict-weak ordering"); } } // Check that elements between __p and __q are less than between __q and __size. for (__diff_t __a = __p; __a < __q; ++__a) { for (__diff_t __b = __q; __b < __size; ++__b) { - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT( __comp(*(__first + __a), *(__first + __b)), "Your comparator is not a valid strict-weak ordering"); - _LIBCPP_ASSERT_UNCATEGORIZED( + _LIBCPP_ASSERT_SEMANTIC_REQUIREMENT( !__comp(*(__first + __b), *(__first + __a)), "Your comparator is not a valid strict-weak ordering"); } } @@ -69,7 +69,7 @@ __check_strict_weak_ordering_sorted(_RandomAccessIterator __first, _RandomAccess (void)__first; (void)__last; (void)__comp; -#endif +#endif // _LIBCPP_HARDENING_MODE == _LIBCPP_HARDENING_MODE_DEBUG } _LIBCPP_END_NAMESPACE_STD diff --git a/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator.pass.cpp b/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator.pass.cpp deleted file mode 100644 index f90e2a3a1a71f..0000000000000 --- a/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator.pass.cpp +++ /dev/null @@ -1,255 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// REQUIRES: stdlib=libc++ - -// REQUIRES: has-unix-headers -// UNSUPPORTED: c++03, c++11, c++14, c++17 -// UNSUPPORTED: !libcpp-hardening-mode=debug -// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing -// ADDITIONAL_COMPILE_FLAGS: -D_LIBCPP_DEBUG_STRICT_WEAK_ORDERING_CHECK -// When the debug mode is enabled, this test fails because we actually catch on the fly that the comparator is not -// a strict-weak ordering before we catch that we'd dereference out-of-bounds inside std::sort, which leads to different -// errors than the ones tested below. -// XFAIL: libcpp-hardening-mode=debug - -// This test uses a specific combination of an invalid comparator and sequence of values to -// ensure that our sorting functions do not go out-of-bounds and satisfy strict weak ordering in that case. -// Instead, we should fail loud with an assertion. 
The specific issue we're looking for here is when the comparator
-// does not satisfy the strict weak ordering:
-//
-// Irreflexivity: comp(a, a) is false
-// Antisymmetry: comp(a, b) implies that !comp(b, a)
-// Transitivity: comp(a, b), comp(b, c) imply comp(a, c)
-// Transitivity of equivalence: !comp(a, b), !comp(b, a), !comp(b, c), !comp(c, b) imply !comp(a, c), !comp(c, a)
-//
-// If this is not satisfied, we have seen issues in the past where the std::sort implementation
-// would proceed to do OOB reads. (rdar://106897934).
-// Other algorithms like std::stable_sort, std::sort_heap do not go out of bounds but can produce
-// incorrect results, we also want to assert on that.
-// Sometimes std::sort does not go out of bounds as well, for example, right now if transitivity
-// of equivalence is not met, std::sort can only produce incorrect result but would not fail.
-
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-#include
-
-#include "bad_comparator_values.h"
-#include "check_assertion.h"
-
-class ComparisonResults {
-public:
-  explicit ComparisonResults(std::string_view data) {
-    for (auto line : std::views::split(data, '\n') | std::views::filter([](auto const& line) { return !line.empty(); })) {
-      auto values = std::views::split(line, ' ');
-      auto it = values.begin();
-      std::size_t left = std::stol(std::string((*it).data(), (*it).size()));
-      it = std::next(it);
-      std::size_t right = std::stol(std::string((*it).data(), (*it).size()));
-      it = std::next(it);
-      bool result = static_cast<bool>(std::stol(std::string((*it).data(), (*it).size())));
-      comparison_results[left][right] = result;
-    }
-  }
-
-  bool compare(size_t* left, size_t* right) const {
-    assert(left != nullptr && right != nullptr && "something is wrong with the test");
-    assert(comparison_results.contains(*left) && comparison_results.at(*left).contains(*right) && "malformed input data?");
-    return comparison_results.at(*left).at(*right);
-  }
-
-  size_t size() const { return comparison_results.size(); }
-private:
-  std::map<std::size_t, std::map<std::size_t, bool>> comparison_results; // terrible for performance, but really convenient
-};
-
-void check_oob_sort_read() {
-  ComparisonResults comparison_results(SORT_DATA);
-  std::vector<std::unique_ptr<std::size_t>> elements;
-  std::set<std::size_t*> valid_ptrs;
-  for (std::size_t i = 0; i != comparison_results.size(); ++i) {
-    elements.push_back(std::make_unique<std::size_t>(i));
-    valid_ptrs.insert(elements.back().get());
-  }
-
-  auto checked_predicate = [&](size_t* left, size_t* right) {
-    // If the pointers passed to the comparator are not in the set of pointers we
-    // set up above, then we're being passed garbage values from the algorithm
-    // because we're reading OOB.
-    assert(valid_ptrs.contains(left));
-    assert(valid_ptrs.contains(right));
-    return comparison_results.compare(left, right);
-  };
-
-  // Check the classic sorting algorithms
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    TEST_LIBCPP_ASSERT_FAILURE(std::sort(copy.begin(), copy.end(), checked_predicate), "Would read out of bounds");
-  }
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    TEST_LIBCPP_ASSERT_FAILURE(std::stable_sort(copy.begin(), copy.end(), checked_predicate), "not a valid strict-weak ordering");
-  }
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    std::make_heap(copy.begin(), copy.end(), checked_predicate); // doesn't go OOB even with invalid comparator
-    TEST_LIBCPP_ASSERT_FAILURE(std::sort_heap(copy.begin(), copy.end(), checked_predicate), "not a valid strict-weak ordering");
-  }
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    TEST_LIBCPP_ASSERT_FAILURE(std::partial_sort(copy.begin(), copy.end(), copy.end(), checked_predicate), "not a valid strict-weak ordering");
-  }
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    std::vector<std::size_t*> results(copy.size(), nullptr);
-    TEST_LIBCPP_ASSERT_FAILURE(std::partial_sort_copy(copy.begin(), copy.end(), results.begin(), results.end(), checked_predicate), "not a valid strict-weak ordering");
-  }
-
-  // Check the Ranges sorting algorithms
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    TEST_LIBCPP_ASSERT_FAILURE(std::ranges::sort(copy, checked_predicate), "Would read out of bounds");
-  }
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    TEST_LIBCPP_ASSERT_FAILURE(std::ranges::stable_sort(copy, checked_predicate), "not a valid strict-weak ordering");
-  }
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    std::ranges::make_heap(copy, checked_predicate); // doesn't go OOB even with invalid comparator
-    TEST_LIBCPP_ASSERT_FAILURE(std::ranges::sort_heap(copy, checked_predicate), "not a valid strict-weak ordering");
-  }
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    TEST_LIBCPP_ASSERT_FAILURE(std::ranges::partial_sort(copy, copy.end(), checked_predicate), "not a valid strict-weak ordering");
-  }
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    std::vector<std::size_t*> results(copy.size(), nullptr);
-    TEST_LIBCPP_ASSERT_FAILURE(std::ranges::partial_sort_copy(copy, results, checked_predicate), "not a valid strict-weak ordering");
-  }
-}
-
-void check_oob_nth_element_read() {
-  ComparisonResults results(NTH_ELEMENT_DATA);
-  std::vector<std::unique_ptr<std::size_t>> elements;
-  std::set<std::size_t*> valid_ptrs;
-  for (std::size_t i = 0; i != results.size(); ++i) {
-    elements.push_back(std::make_unique<std::size_t>(i));
-    valid_ptrs.insert(elements.back().get());
-  }
-
-  auto checked_predicate = [&](size_t* left, size_t* right) {
-    // If the pointers passed to the comparator are not in the set of pointers we
-    // set up above, then we're being passed garbage values from the algorithm
-    // because we're reading OOB.
-    assert(valid_ptrs.contains(left));
-    assert(valid_ptrs.contains(right));
-    return results.compare(left, right);
-  };
-
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    TEST_LIBCPP_ASSERT_FAILURE(std::nth_element(copy.begin(), copy.begin(), copy.end(), checked_predicate), "Would read out of bounds");
-  }
-
-  {
-    std::vector<std::size_t*> copy;
-    for (auto const& e : elements)
-      copy.push_back(e.get());
-    TEST_LIBCPP_ASSERT_FAILURE(std::ranges::nth_element(copy, copy.begin(), checked_predicate), "Would read out of bounds");
-  }
-}
-
-struct FloatContainer {
-  float value;
-  bool operator<(const FloatContainer& other) const {
-    return value < other.value;
-  }
-};
-
-// Nans in floats do not satisfy strict weak ordering by breaking transitivity of equivalence.
-std::vector<FloatContainer> generate_float_data() {
-  std::vector<FloatContainer> floats(50);
-  for (int i = 0; i < 50; ++i) {
-    floats[i].value = static_cast<float>(i);
-  }
-  floats.push_back(FloatContainer{std::numeric_limits<float>::quiet_NaN()});
-  std::shuffle(floats.begin(), floats.end(), std::default_random_engine());
-  return floats;
-}
-
-void check_nan_floats() {
-  auto floats = generate_float_data();
-  TEST_LIBCPP_ASSERT_FAILURE(std::sort(floats.begin(), floats.end()), "not a valid strict-weak ordering");
-  floats = generate_float_data();
-  TEST_LIBCPP_ASSERT_FAILURE(std::stable_sort(floats.begin(), floats.end()), "not a valid strict-weak ordering");
-  floats = generate_float_data();
-  std::make_heap(floats.begin(), floats.end());
-  TEST_LIBCPP_ASSERT_FAILURE(std::sort_heap(floats.begin(), floats.end()), "not a valid strict-weak ordering");
-  TEST_LIBCPP_ASSERT_FAILURE(std::ranges::sort(generate_float_data(), std::less<FloatContainer>()), "not a valid strict-weak ordering");
-  TEST_LIBCPP_ASSERT_FAILURE(std::ranges::stable_sort(generate_float_data(), std::less<FloatContainer>()), "not a valid strict-weak ordering");
-  floats = generate_float_data();
-  std::ranges::make_heap(floats, std::less<FloatContainer>());
-  TEST_LIBCPP_ASSERT_FAILURE(std::ranges::sort_heap(floats, std::less<FloatContainer>()), "not a valid strict-weak ordering");
-}
-
-void check_irreflexive() {
-  std::vector<int> v(1);
-  TEST_LIBCPP_ASSERT_FAILURE(std::sort(v.begin(), v.end(), std::greater_equal<int>()), "not a valid strict-weak ordering");
-  TEST_LIBCPP_ASSERT_FAILURE(std::stable_sort(v.begin(), v.end(), std::greater_equal<int>()), "not a valid strict-weak ordering");
-  std::make_heap(v.begin(), v.end(), std::greater_equal<int>());
-  TEST_LIBCPP_ASSERT_FAILURE(std::sort_heap(v.begin(), v.end(), std::greater_equal<int>()), "not a valid strict-weak ordering");
-  TEST_LIBCPP_ASSERT_FAILURE(std::ranges::sort(v, std::greater_equal<int>()), "not a valid strict-weak ordering");
-  TEST_LIBCPP_ASSERT_FAILURE(std::ranges::stable_sort(v, std::greater_equal<int>()), "not a valid strict-weak ordering");
-  std::ranges::make_heap(v, std::greater_equal<int>());
-  TEST_LIBCPP_ASSERT_FAILURE(std::ranges::sort_heap(v, std::greater_equal<int>()), "not a valid strict-weak ordering");
-}
-
-int main(int, char**) {
-
-  check_oob_sort_read();
-
-  check_oob_nth_element_read();
-
-  check_nan_floats();
-
-  check_irreflexive();
-
-  return 0;
-}
diff --git a/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/assert.sort.invalid_comparator.oob.pass.cpp b/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/assert.sort.invalid_comparator.oob.pass.cpp
new file mode 100644
index 0000000000000..6ddee1b2aabe0
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/assert.sort.invalid_comparator.oob.pass.cpp
@@ -0,0 +1,72 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// In the debug mode, the comparator validations will notice that the comparator doesn't satisfy strict weak
+// ordering before the algorithm actually runs and goes out of bounds, so the test will terminate before the
+// tested assertions are triggered.
+// UNSUPPORTED: libcpp-hardening-mode=none, libcpp-hardening-mode=debug
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <memory>
+#include <set>
+#include <string_view>
+#include <vector>
+
+#include "bad_comparator_values.h"
+#include "check_assertion.h"
+#include "invalid_comparator_utilities.h"
+
+void check_oob_sort_read() {
+  SortingFixture fixture(SORT_DATA);
+
+  // Check the classic sorting algorithms
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        std::sort(copy.begin(), copy.end(), fixture.checked_predicate()),
+        "Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
+  }
+
+  // Check the Ranges sorting algorithms
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        std::ranges::sort(copy, fixture.checked_predicate()),
+        "Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
+  }
+}
+
+void check_oob_nth_element_read() {
+  SortingFixture fixture(NTH_ELEMENT_DATA);
+
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        std::nth_element(copy.begin(), copy.begin(), copy.end(), fixture.checked_predicate()),
+        "Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
+  }
+
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        std::ranges::nth_element(copy, copy.begin(), fixture.checked_predicate()),
+        "Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
+  }
+}
+
+int main(int, char**) {
+  check_oob_sort_read();
+  check_oob_nth_element_read();
+
+  return 0;
+}
diff --git a/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/assert.sort.invalid_comparator.pass.cpp b/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/assert.sort.invalid_comparator.pass.cpp
new file mode 100644
index 0000000000000..92671617eb032
--- /dev/null
+++ b/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/assert.sort.invalid_comparator.pass.cpp
@@ -0,0 +1,195 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// REQUIRES: has-unix-headers
+// UNSUPPORTED: c++03, c++11, c++14, c++17
+// REQUIRES: libcpp-hardening-mode=debug
+// XFAIL: libcpp-hardening-mode=debug && availability-verbose_abort-missing
+
+// This test uses a specific combination of an invalid comparator and sequence of values to
+// ensure that our sorting functions do not go out-of-bounds when the comparator does not
+// satisfy strict weak ordering. Instead, we should fail loudly with an assertion. The specific
+// issue we're looking for here is when the comparator does not satisfy the strict weak ordering:
+//
+//   Irreflexivity: comp(a, a) is false
+//   Antisymmetry: comp(a, b) implies that !comp(b, a)
+//   Transitivity: comp(a, b), comp(b, c) imply comp(a, c)
+//   Transitivity of equivalence: !comp(a, b), !comp(b, a), !comp(b, c), !comp(c, b) imply !comp(a, c), !comp(c, a)
+//
+// If this is not satisfied, we have seen issues in the past where the std::sort implementation
+// would proceed to do OOB reads. (rdar://106897934).
+// Other algorithms like std::stable_sort, std::sort_heap do not go out of bounds but can produce
+// incorrect results, we also want to assert on that.
+// Sometimes std::sort does not go out of bounds as well, for example, right now if transitivity
+// of equivalence is not met, std::sort can only produce an incorrect result but would not fail.
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <functional>
+#include <limits>
+#include <map>
+#include <memory>
+#include <random>
+#include <set>
+#include <string_view>
+#include <vector>
+
+#include "bad_comparator_values.h"
+#include "check_assertion.h"
+#include "invalid_comparator_utilities.h"
+
+void check_oob_sort_read() {
+  SortingFixture fixture(SORT_DATA);
+
+  // Check the classic sorting algorithms
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        std::sort(copy.begin(), copy.end(), fixture.checked_predicate()),
+        "Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
+  }
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(std::stable_sort(copy.begin(), copy.end(), fixture.checked_predicate()),
+                               "Comparator does not induce a strict weak ordering");
+  }
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(std::make_heap(copy.begin(), copy.end(), fixture.checked_predicate()),
+                               "Comparator does not induce a strict weak ordering");
+    TEST_LIBCPP_ASSERT_FAILURE(std::sort_heap(copy.begin(), copy.end(), fixture.checked_predicate()),
+                               "Comparator does not induce a strict weak ordering");
+  }
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(std::partial_sort(copy.begin(), copy.end(), copy.end(), fixture.checked_predicate()),
+                               "Comparator does not induce a strict weak ordering");
+  }
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    std::vector<std::size_t*> results(copy.size(), nullptr);
+    TEST_LIBCPP_ASSERT_FAILURE(
+        std::partial_sort_copy(copy.begin(), copy.end(), results.begin(), results.end(), fixture.checked_predicate()),
+        "Comparator does not induce a strict weak ordering");
+  }
+
+  // Check the Ranges sorting algorithms
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        std::ranges::sort(copy, fixture.checked_predicate()),
+        "Would read out of bounds, does your comparator satisfy the strict-weak ordering requirement?");
+  }
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(std::ranges::stable_sort(copy, fixture.checked_predicate()),
+                               "Comparator does not induce a strict weak ordering");
+  }
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(
+        std::ranges::make_heap(copy, fixture.checked_predicate()), "Comparator does not induce a strict weak ordering");
+    TEST_LIBCPP_ASSERT_FAILURE(
+        std::ranges::sort_heap(copy, fixture.checked_predicate()), "Comparator does not induce a strict weak ordering");
+  }
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(std::ranges::partial_sort(copy, copy.end(), fixture.checked_predicate()),
+                               "Comparator does not induce a strict weak ordering");
+  }
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    std::vector<std::size_t*> results(copy.size(), nullptr);
+    TEST_LIBCPP_ASSERT_FAILURE(std::ranges::partial_sort_copy(copy, results, fixture.checked_predicate()),
+                               "Comparator does not induce a strict weak ordering");
+  }
+}
+
+void check_oob_nth_element_read() {
+  SortingFixture fixture(NTH_ELEMENT_DATA);
+
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(std::nth_element(copy.begin(), copy.begin(), copy.end(), fixture.checked_predicate()),
+                               "Comparator does not induce a strict weak ordering");
+  }
+
+  {
+    std::vector<std::size_t*> copy = fixture.create_elements();
+    TEST_LIBCPP_ASSERT_FAILURE(std::ranges::nth_element(copy, copy.begin(), fixture.checked_predicate()),
+                               "Comparator does not induce a strict weak ordering");
+  }
+}
+
+struct FloatContainer {
+  float value;
+  bool operator<(const FloatContainer& other) const { return value < other.value; }
+};
+
+// Nans in floats do not satisfy strict weak ordering by breaking transitivity of equivalence.
+std::vector<FloatContainer> generate_float_data() {
+  std::vector<FloatContainer> floats(50);
+  for (int i = 0; i < 50; ++i) {
+    floats[i].value = static_cast<float>(i);
+  }
+  floats.push_back(FloatContainer{std::numeric_limits<float>::quiet_NaN()});
+  std::shuffle(floats.begin(), floats.end(), std::default_random_engine());
+  return floats;
+}
+
+void check_nan_floats() {
+  auto floats = generate_float_data();
+  TEST_LIBCPP_ASSERT_FAILURE(
+      std::sort(floats.begin(), floats.end()), "Your comparator is not a valid strict-weak ordering");
+  floats = generate_float_data();
+  TEST_LIBCPP_ASSERT_FAILURE(
+      std::stable_sort(floats.begin(), floats.end()), "Your comparator is not a valid strict-weak ordering");
+  floats = generate_float_data();
+  std::make_heap(floats.begin(), floats.end());
+  TEST_LIBCPP_ASSERT_FAILURE(
+      std::sort_heap(floats.begin(), floats.end()), "Your comparator is not a valid strict-weak ordering");
+  TEST_LIBCPP_ASSERT_FAILURE(std::ranges::sort(generate_float_data(), std::less<FloatContainer>()),
+                             "Your comparator is not a valid strict-weak ordering");
+  TEST_LIBCPP_ASSERT_FAILURE(std::ranges::stable_sort(generate_float_data(), std::less<FloatContainer>()),
+                             "Your comparator is not a valid strict-weak ordering");
+  floats = generate_float_data();
+  std::ranges::make_heap(floats, std::less<FloatContainer>());
+  TEST_LIBCPP_ASSERT_FAILURE(std::ranges::sort_heap(floats, std::less<FloatContainer>()),
+                             "Your comparator is not a valid strict-weak ordering");
+}
+
+void check_irreflexive() {
+  std::vector<int> v(1);
+  TEST_LIBCPP_ASSERT_FAILURE(
+      std::sort(v.begin(), v.end(), std::greater_equal<int>()), "Your comparator is not a valid strict-weak ordering");
+  TEST_LIBCPP_ASSERT_FAILURE(std::stable_sort(v.begin(), v.end(), std::greater_equal<int>()),
+                             "Your comparator is not a valid strict-weak ordering");
+  std::make_heap(v.begin(), v.end(), std::greater_equal<int>());
+  TEST_LIBCPP_ASSERT_FAILURE(std::sort_heap(v.begin(), v.end(), std::greater_equal<int>()),
+                             "Comparator does not induce a strict weak ordering");
+  TEST_LIBCPP_ASSERT_FAILURE(
+      std::ranges::sort(v, std::greater_equal<int>()), "Your comparator is not a valid strict-weak ordering");
+  TEST_LIBCPP_ASSERT_FAILURE(
+      std::ranges::stable_sort(v, std::greater_equal<int>()), "Your comparator is not a valid strict-weak ordering");
+  std::ranges::make_heap(v, std::greater_equal<int>());
+  TEST_LIBCPP_ASSERT_FAILURE(
+      std::ranges::sort_heap(v, std::greater_equal<int>()), "Comparator does not induce a strict weak ordering");
+}
+
+int main(int, char**) {
+  check_oob_sort_read();
+
+  check_oob_nth_element_read();
+
+  check_nan_floats();
+
+  check_irreflexive();
+
+  return 0;
+}
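For readers unfamiliar with the strict-weak-ordering requirement, the two violations exercised by the test above can be reproduced in a few lines. The following is a standalone sketch, independent of the fixtures and data files in this patch:

    #include <functional>
    #include <limits>

    int main() {
      // Irreflexivity requires comp(a, a) == false, which operator>= fails:
      std::greater_equal<int> ge;
      bool irreflexivity_violated = ge(1, 1); // true

      // A NaN breaks transitivity of equivalence for operator< on floats:
      // the NaN is "equivalent" to both 1.0f and 2.0f (neither side compares
      // less than the other), yet 1.0f and 2.0f are not equivalent.
      float a = 1.0f, n = std::numeric_limits<float>::quiet_NaN(), c = 2.0f;
      bool equiv_a_n = !(a < n) && !(n < a); // true
      bool equiv_n_c = !(n < c) && !(c < n); // true
      bool equiv_a_c = !(a < c) && !(c < a); // false: transitivity of equivalence is broken
      return (irreflexivity_violated && equiv_a_n && equiv_n_c && !equiv_a_c) ? 0 : 1;
    }

Sorting with either comparator is undefined behavior in the standard; the hardening assertions tested here turn it into a loud failure instead.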
TEST_LIBCPP_ASSERT_FAILURE(std::sort_heap(v.begin(), v.end(), std::greater_equal()), + "Comparator does not induce a strict weak ordering"); + TEST_LIBCPP_ASSERT_FAILURE( + std::ranges::sort(v, std::greater_equal()), "Your comparator is not a valid strict-weak ordering"); + TEST_LIBCPP_ASSERT_FAILURE( + std::ranges::stable_sort(v, std::greater_equal()), "Your comparator is not a valid strict-weak ordering"); + std::ranges::make_heap(v, std::greater_equal()); + TEST_LIBCPP_ASSERT_FAILURE( + std::ranges::sort_heap(v, std::greater_equal()), "Comparator does not induce a strict weak ordering"); +} + +int main(int, char**) { + check_oob_sort_read(); + + check_oob_nth_element_read(); + + check_nan_floats(); + + check_irreflexive(); + + return 0; +} diff --git a/libcxx/test/libcxx/algorithms/alg.sorting/bad_comparator_values.h b/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/bad_comparator_values.h similarity index 98% rename from libcxx/test/libcxx/algorithms/alg.sorting/bad_comparator_values.h rename to libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/bad_comparator_values.h index c0ffd16cd4ac4..1a5910cfcd937 100644 --- a/libcxx/test/libcxx/algorithms/alg.sorting/bad_comparator_values.h +++ b/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/bad_comparator_values.h @@ -6,8 +6,8 @@ // //===----------------------------------------------------------------------===// -#ifndef TEST_LIBCXX_ALGORITHMS_ALG_SORTING_BAD_COMPARATOR_VALUES_H -#define TEST_LIBCXX_ALGORITHMS_ALG_SORTING_BAD_COMPARATOR_VALUES_H +#ifndef TEST_LIBCXX_ALGORITHMS_ALG_SORTING_ASSERT_SORT_INVALID_COMPARATOR_BAD_COMPARATOR_VALUES_H +#define TEST_LIBCXX_ALGORITHMS_ALG_SORTING_ASSERT_SORT_INVALID_COMPARATOR_BAD_COMPARATOR_VALUES_H #include @@ -3562,4 +3562,4 @@ inline constexpr std::string_view SORT_DATA = R"( 58 58 0 )"; -#endif // TEST_LIBCXX_ALGORITHMS_ALG_SORTING_BAD_COMPARATOR_VALUES_H +#endif // TEST_LIBCXX_ALGORITHMS_ALG_SORTING_ASSERT_SORT_INVALID_COMPARATOR_BAD_COMPARATOR_VALUES_H diff --git a/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/invalid_comparator_utilities.h b/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/invalid_comparator_utilities.h new file mode 100644 index 0000000000000..f5e602577cae2 --- /dev/null +++ b/libcxx/test/libcxx/algorithms/alg.sorting/assert.sort.invalid_comparator/invalid_comparator_utilities.h @@ -0,0 +1,85 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef TEST_LIBCXX_ALGORITHMS_ALG_SORTING_ASSERT_SORT_INVALID_COMPARATOR_INVALID_COMPARATOR_UTILITIES_H
+#define TEST_LIBCXX_ALGORITHMS_ALG_SORTING_ASSERT_SORT_INVALID_COMPARATOR_INVALID_COMPARATOR_UTILITIES_H
+
+#include <cassert>
+#include <cstddef>
+#include <map>
+#include <memory>
+#include <ranges>
+#include <set>
+#include <string>
+#include <string_view>
+#include <vector>
+
+class ComparisonResults {
+public:
+  explicit ComparisonResults(std::string_view data) {
+    for (auto line :
+         std::views::split(data, '\n') | std::views::filter([](auto const& line) { return !line.empty(); })) {
+      auto values = std::views::split(line, ' ');
+      auto it = values.begin();
+      std::size_t left = std::stol(std::string((*it).data(), (*it).size()));
+      it = std::next(it);
+      std::size_t right = std::stol(std::string((*it).data(), (*it).size()));
+      it = std::next(it);
+      bool result = static_cast<bool>(std::stol(std::string((*it).data(), (*it).size())));
+      comparison_results[left][right] = result;
+    }
+  }
+
+  bool compare(size_t* left, size_t* right) const {
+    assert(left != nullptr && right != nullptr && "something is wrong with the test");
+    assert(comparison_results.contains(*left) && comparison_results.at(*left).contains(*right) &&
+           "malformed input data?");
+    return comparison_results.at(*left).at(*right);
+  }
+
+  size_t size() const { return comparison_results.size(); }
+
+private:
+  std::map<std::size_t, std::map<std::size_t, bool>>
+      comparison_results; // terrible for performance, but really convenient
+};
+
+class SortingFixture {
+public:
+  explicit SortingFixture(std::string_view data) : comparison_results_(data) {
+    for (std::size_t i = 0; i != comparison_results_.size(); ++i) {
+      elements_.push_back(std::make_unique<std::size_t>(i));
+      valid_ptrs_.insert(elements_.back().get());
+    }
+  }
+
+  std::vector<std::size_t*> create_elements() {
+    std::vector<std::size_t*> copy;
+    for (auto const& e : elements_)
+      copy.push_back(e.get());
+    return copy;
+  }
+
+  auto checked_predicate() {
+    return [this](size_t* left, size_t* right) {
+      // If the pointers passed to the comparator are not in the set of pointers we
+      // set up above, then we're being passed garbage values from the algorithm
+      // because we're reading OOB.
+      assert(valid_ptrs_.contains(left));
+      assert(valid_ptrs_.contains(right));
+      return comparison_results_.compare(left, right);
+    };
+  }
+
+private:
+  ComparisonResults comparison_results_;
+  std::vector<std::unique_ptr<std::size_t>> elements_;
+  std::set<std::size_t*> valid_ptrs_;
+};
+
+#endif // TEST_LIBCXX_ALGORITHMS_ALG_SORTING_ASSERT_SORT_INVALID_COMPARATOR_INVALID_COMPARATOR_UTILITIES_H

From ab32a3c166e9eae12260b4c6eca9bcf21f500e87 Mon Sep 17 00:00:00 2001
From: Michael Klemm
Date: Tue, 23 Jan 2024 08:43:29 +0100
Subject: [PATCH 565/843] [flang] Do not leak intrinsics used by ISO_C_BINDING
 and ISO_FORTRAN_ENV (#79006)

This resolves bug #78953. Intrinsics used by the MODULE definitions are
declared PRIVATE, so that they do not leak into the namespace of the
code that USEs the modules.

---
 flang/module/iso_c_binding.f90  | 43 +++++++++++++---------
 flang/module/iso_fortran_env.f90 | 62 ++++++++++++++++++--------------
 2 files changed, 62 insertions(+), 43 deletions(-)

diff --git a/flang/module/iso_c_binding.f90 b/flang/module/iso_c_binding.f90
index a34c1d84afbf1..9a7e68f331446 100644
--- a/flang/module/iso_c_binding.f90
+++ b/flang/module/iso_c_binding.f90
@@ -22,15 +22,25 @@ module iso_c_binding
     c_sizeof => sizeof, &
     operator(==), operator(/=)
 
+  implicit none
+
+  ! Set PRIVATE by default to explicitly only export what is meant
+  ! 
to be exported by this MODULE. + private + + public :: c_associated, c_funloc, c_funptr, c_f_pointer, c_loc, & + c_null_funptr, c_null_ptr, c_ptr, c_sizeof, & + operator(==), operator(/=) + ! Table 18.2 (in clause 18.3.1) ! TODO: Specialize (via macros?) for alternative targets - integer, parameter :: & + integer, parameter, public :: & c_int8_t = 1, & c_int16_t = 2, & c_int32_t = 4, & c_int64_t = 8, & c_int128_t = 16 ! anticipating future addition - integer, parameter :: & + integer, parameter, public :: & c_int = c_int32_t, & c_short = c_int16_t, & c_long = c_int64_t, & @@ -40,7 +50,7 @@ module iso_c_binding c_intmax_t = c_int128_t, & c_intptr_t = c_size_t, & c_ptrdiff_t = c_size_t - integer, parameter :: & + integer, parameter, public :: & c_int_least8_t = c_int8_t, & c_int_fast8_t = c_int8_t, & c_int_least16_t = c_int16_t, & @@ -52,7 +62,7 @@ module iso_c_binding c_int_least128_t = c_int128_t, & c_int_fast128_t = c_int128_t - integer, parameter :: & + integer, parameter, public :: & c_float = 4, & c_double = 8, & #if __x86_64__ @@ -61,30 +71,31 @@ module iso_c_binding c_long_double = 16 #endif - integer, parameter :: & + integer, parameter, public :: & c_float_complex = c_float, & c_double_complex = c_double, & c_long_double_complex = c_long_double - integer, parameter :: c_bool = 1 - integer, parameter :: c_char = 1 + integer, parameter, public :: c_bool = 1 + integer, parameter, public :: c_char = 1 ! C characters with special semantics - character(kind=c_char, len=1), parameter :: c_null_char = achar(0) - character(kind=c_char, len=1), parameter :: c_alert = achar(7) - character(kind=c_char, len=1), parameter :: c_backspace = achar(8) - character(kind=c_char, len=1), parameter :: c_form_feed = achar(12) - character(kind=c_char, len=1), parameter :: c_new_line = achar(10) - character(kind=c_char, len=1), parameter :: c_carriage_return = achar(13) - character(kind=c_char, len=1), parameter :: c_horizontal_tab = achar(9) - character(kind=c_char, len=1), parameter :: c_vertical_tab = achar(11) + character(kind=c_char, len=1), parameter, public :: c_null_char = achar(0) + character(kind=c_char, len=1), parameter, public :: c_alert = achar(7) + character(kind=c_char, len=1), parameter, public :: c_backspace = achar(8) + character(kind=c_char, len=1), parameter, public :: c_form_feed = achar(12) + character(kind=c_char, len=1), parameter, public :: c_new_line = achar(10) + character(kind=c_char, len=1), parameter, public :: c_carriage_return = achar(13) + character(kind=c_char, len=1), parameter, public :: c_horizontal_tab = achar(9) + character(kind=c_char, len=1), parameter, public :: c_vertical_tab = achar(11) interface c_f_procpointer module procedure c_f_procpointer end interface + public :: c_f_procpointer ! gfortran extensions - integer, parameter :: & + integer, parameter, public :: & c_float128 = 16, & c_float128_complex = c_float128 diff --git a/flang/module/iso_fortran_env.f90 b/flang/module/iso_fortran_env.f90 index cd3c06f8c7566..23e22e1f64de6 100644 --- a/flang/module/iso_fortran_env.f90 +++ b/flang/module/iso_fortran_env.f90 @@ -24,21 +24,29 @@ module iso_fortran_env compiler_version => __builtin_compiler_version implicit none - private count + + ! Set PRIVATE by default to explicitly only export what is meant + ! to be exported by this MODULE. + private + + public :: event_type, notify_type, lock_type, team_type, & + atomic_int_kind, atomic_logical_kind, compiler_options, & + compiler_version + ! TODO: Use PACK([x],test) in place of the array constructor idiom ! 
[(x, integer::j=1,COUNT([test]))] below once PACK() can be folded. - integer, parameter, private :: & + integer, parameter :: & selectedASCII = selected_char_kind('ASCII'), & selectedUCS_2 = selected_char_kind('UCS-2'), & selectedUnicode = selected_char_kind('ISO_10646') - integer, parameter :: character_kinds(*) = [ & + integer, parameter, public :: character_kinds(*) = [ & [(selectedASCII, integer :: j=1, count([selectedASCII >= 0]))], & [(selectedUCS_2, integer :: j=1, count([selectedUCS_2 >= 0]))], & [(selectedUnicode, integer :: j=1, count([selectedUnicode >= 0]))]] - integer, parameter, private :: & + integer, parameter :: & selectedInt8 = selected_int_kind(2), & selectedInt16 = selected_int_kind(4), & selectedInt32 = selected_int_kind(9), & @@ -54,7 +62,7 @@ module iso_fortran_env selectedInt64 >= 0), & safeInt128 = merge(selectedInt128, selected_int_kind(0), & selectedInt128 >= 0) - integer, parameter :: & + integer, parameter, public :: & int8 = merge(selectedInt8, merge(-2, -1, selectedInt8 >= 0), & digits(int(0,kind=safeInt8)) == 7), & int16 = merge(selectedInt16, merge(-2, -1, selectedInt16 >= 0), & @@ -66,7 +74,7 @@ module iso_fortran_env int128 = merge(selectedInt128, merge(-2, -1, selectedInt128 >= 0), & digits(int(0,kind=safeInt128)) == 127) - integer, parameter :: integer_kinds(*) = [ & + integer, parameter, public :: integer_kinds(*) = [ & selected_int_kind(0), & ((selected_int_kind(k), & integer :: j=1, count([selected_int_kind(k) >= 0 .and. & @@ -74,15 +82,15 @@ module iso_fortran_env selected_int_kind(k-1)])), & integer :: k=1, 39)] - integer, parameter :: & + integer, parameter, public :: & logical8 = int8, logical16 = int16, logical32 = int32, logical64 = int64 - integer, parameter :: logical_kinds(*) = [ & + integer, parameter, public :: logical_kinds(*) = [ & [(logical8, integer :: j=1, count([logical8 >= 0]))], & [(logical16, integer :: j=1, count([logical16 >= 0]))], & [(logical32, integer :: j=1, count([logical32 >= 0]))], & [(logical64, integer :: j=1, count([logical64 >= 0]))]] - integer, parameter, private :: & + integer, parameter :: & selectedReal16 = selected_real_kind(3, 4), & ! IEEE half selectedBfloat16 = selected_real_kind(2, 37), & ! truncated IEEE single selectedReal32 = selected_real_kind(6, 37), & ! 
IEEE single @@ -104,7 +112,7 @@ module iso_fortran_env selectedReal64x2 >= 0), & safeReal128 = merge(selectedReal128, selected_real_kind(0,0), & selectedReal128 >= 0) - integer, parameter :: & + integer, parameter, public :: & real16 = merge(selectedReal16, merge(-2, -1, selectedReal16 >= 0), & digits(real(0,kind=safeReal16)) == 11), & bfloat16 = merge(selectedBfloat16, merge(-2, -1, selectedBfloat16 >= 0), & @@ -120,7 +128,7 @@ module iso_fortran_env real128 = merge(selectedReal128, merge(-2, -1, selectedReal128 >= 0), & digits(real(0,kind=safeReal128)) == 113) - integer, parameter :: real_kinds(*) = [ & + integer, parameter, public :: real_kinds(*) = [ & [(real16, integer :: j=1, count([real16 >= 0]))], & [(bfloat16, integer :: j=1, count([bfloat16 >= 0]))], & [(real32, integer :: j=1, count([real32 >= 0]))], & @@ -129,27 +137,27 @@ module iso_fortran_env [(real64x2, integer :: j=1, count([real64x2 >= 0]))], & [(real128, integer :: j=1, count([real128 >= 0]))]] - integer, parameter :: current_team = -1, initial_team = -2, parent_team = -3 + integer, parameter, public :: current_team = -1, initial_team = -2, parent_team = -3 - integer, parameter :: output_unit = FORTRAN_DEFAULT_OUTPUT_UNIT - integer, parameter :: input_unit = FORTRAN_DEFAULT_INPUT_UNIT - integer, parameter :: error_unit = FORTRAN_ERROR_UNIT - integer, parameter :: iostat_end = FORTRAN_RUNTIME_IOSTAT_END - integer, parameter :: iostat_eor = FORTRAN_RUNTIME_IOSTAT_EOR - integer, parameter :: iostat_inquire_internal_unit = & + integer, parameter, public :: output_unit = FORTRAN_DEFAULT_OUTPUT_UNIT + integer, parameter, public :: input_unit = FORTRAN_DEFAULT_INPUT_UNIT + integer, parameter, public :: error_unit = FORTRAN_ERROR_UNIT + integer, parameter, public :: iostat_end = FORTRAN_RUNTIME_IOSTAT_END + integer, parameter, public :: iostat_eor = FORTRAN_RUNTIME_IOSTAT_EOR + integer, parameter, public :: iostat_inquire_internal_unit = & FORTRAN_RUNTIME_IOSTAT_INQUIRE_INTERNAL_UNIT - integer, parameter :: character_storage_size = 8 - integer, parameter :: file_storage_size = 8 - integer, parameter :: numeric_storage_size = 32 + integer, parameter, public :: character_storage_size = 8 + integer, parameter, public :: file_storage_size = 8 + integer, parameter, public :: numeric_storage_size = 32 - integer, parameter :: stat_failed_image = FORTRAN_RUNTIME_STAT_FAILED_IMAGE - integer, parameter :: stat_locked = FORTRAN_RUNTIME_STAT_LOCKED - integer, parameter :: & + integer, parameter, public :: stat_failed_image = FORTRAN_RUNTIME_STAT_FAILED_IMAGE + integer, parameter, public :: stat_locked = FORTRAN_RUNTIME_STAT_LOCKED + integer, parameter, public :: & stat_locked_other_image = FORTRAN_RUNTIME_STAT_LOCKED_OTHER_IMAGE - integer, parameter :: stat_stopped_image = FORTRAN_RUNTIME_STAT_STOPPED_IMAGE - integer, parameter :: stat_unlocked = FORTRAN_RUNTIME_STAT_UNLOCKED - integer, parameter :: & + integer, parameter, public :: stat_stopped_image = FORTRAN_RUNTIME_STAT_STOPPED_IMAGE + integer, parameter, public :: stat_unlocked = FORTRAN_RUNTIME_STAT_UNLOCKED + integer, parameter, public :: & stat_unlocked_failed_image = FORTRAN_RUNTIME_STAT_UNLOCKED_FAILED_IMAGE end module iso_fortran_env From 44ba6ebc999d6e9b27bedfe04a993adfd204dc6a Mon Sep 17 00:00:00 2001 From: yjijd Date: Tue, 23 Jan 2024 15:57:06 +0800 Subject: [PATCH 566/843] [CodeGen][LoongArch] Set FP_TO_SINT/FP_TO_UINT to legal for vector types (#79107) Support the following conversions: v4f32->v4i32, v2f64->v2i64(LSX) v8f32->v8i32, v4f64->v4i64(LASX) v4f32->v4i64, 
v4f64->v4i32(LASX) --- .../LoongArch/LoongArchISelLowering.cpp | 12 ++-- .../LoongArch/LoongArchLASXInstrInfo.td | 22 +++++++ .../Target/LoongArch/LoongArchLSXInstrInfo.td | 8 +++ .../LoongArch/lasx/ir-instruction/fptosi.ll | 57 +++++++++++++++++++ .../LoongArch/lasx/ir-instruction/fptoui.ll | 57 +++++++++++++++++++ .../LoongArch/lsx/ir-instruction/fptosi.ll | 28 +++++++++ .../LoongArch/lsx/ir-instruction/fptoui.ll | 28 +++++++++ 7 files changed, 208 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll create mode 100644 llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptosi.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptoui.ll diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index 0d6eb4b8149c4..76c1a14fe0156 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -265,8 +265,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); } - setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, - {MVT::v4i32, MVT::v2i64}, Legal); + for (MVT VT : {MVT::v4i32, MVT::v2i64}) { + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT, Legal); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT, Legal); + } for (MVT VT : {MVT::v4f32, MVT::v2f64}) { setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); @@ -309,8 +311,10 @@ LoongArchTargetLowering::LoongArchTargetLowering(const TargetMachine &TM, {ISD::SETNE, ISD::SETGE, ISD::SETGT, ISD::SETUGE, ISD::SETUGT}, VT, Expand); } - setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, - {MVT::v8i32, MVT::v4i32, MVT::v4i64}, Legal); + for (MVT VT : {MVT::v8i32, MVT::v4i32, MVT::v4i64}) { + setOperationAction({ISD::SINT_TO_FP, ISD::UINT_TO_FP}, VT, Legal); + setOperationAction({ISD::FP_TO_SINT, ISD::FP_TO_UINT}, VT, Legal); + } for (MVT VT : {MVT::v8f32, MVT::v4f64}) { setOperationAction({ISD::FADD, ISD::FSUB}, VT, Legal); setOperationAction({ISD::FMUL, ISD::FDIV}, VT, Legal); diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index 2426dad507802..3de1fe2b722e5 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -1637,6 +1637,28 @@ def : Pat<(v4f32 (uint_to_fp v4i64:$vj)), (XVFFINT_D_LU v4i64:$vj)), sub_128)>; +// XVFTINTRZ_{W_S/L_D} +def : Pat<(v8i32 (fp_to_sint v8f32:$vj)), (XVFTINTRZ_W_S v8f32:$vj)>; +def : Pat<(v4i64 (fp_to_sint v4f64:$vj)), (XVFTINTRZ_L_D v4f64:$vj)>; +def : Pat<(v4i64 (fp_to_sint v4f32:$vj)), + (VEXT2XV_D_W (SUBREG_TO_REG (i64 0), (VFTINTRZ_W_S v4f32:$vj), + sub_128))>; +def : Pat<(v4i32 (fp_to_sint (v4f64 LASX256:$vj))), + (EXTRACT_SUBREG (XVFTINTRZ_W_S (XVFCVT_S_D (XVPERMI_D v4f64:$vj, 238), + v4f64:$vj)), + sub_128)>; + +// XVFTINTRZ_{W_SU/L_DU} +def : Pat<(v8i32 (fp_to_uint v8f32:$vj)), (XVFTINTRZ_WU_S v8f32:$vj)>; +def : Pat<(v4i64 (fp_to_uint v4f64:$vj)), (XVFTINTRZ_LU_D v4f64:$vj)>; +def : Pat<(v4i64 (fp_to_uint v4f32:$vj)), + (VEXT2XV_DU_WU (SUBREG_TO_REG (i64 0), (VFTINTRZ_WU_S v4f32:$vj), + sub_128))>; +def : Pat<(v4i32 (fp_to_uint (v4f64 LASX256:$vj))), + (EXTRACT_SUBREG (XVFTINTRZ_W_S (XVFCVT_S_D (XVPERMI_D v4f64:$vj, 238), + 
v4f64:$vj)), + sub_128)>; + } // Predicates = [HasExtLASX] /// Intrinsic pattern diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index 71256353116a7..39ee861cd0565 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -1754,6 +1754,14 @@ def : Pat<(v2f64 (sint_to_fp v2i64:$vj)), (VFFINT_D_L v2i64:$vj)>; def : Pat<(v4f32 (uint_to_fp v4i32:$vj)), (VFFINT_S_WU v4i32:$vj)>; def : Pat<(v2f64 (uint_to_fp v2i64:$vj)), (VFFINT_D_LU v2i64:$vj)>; +// VFTINTRZ_{W_S/L_D} +def : Pat<(v4i32 (fp_to_sint v4f32:$vj)), (VFTINTRZ_W_S v4f32:$vj)>; +def : Pat<(v2i64 (fp_to_sint v2f64:$vj)), (VFTINTRZ_L_D v2f64:$vj)>; + +// VFTINTRZ_{W_SU/L_DU} +def : Pat<(v4i32 (fp_to_uint v4f32:$vj)), (VFTINTRZ_WU_S v4f32:$vj)>; +def : Pat<(v2i64 (fp_to_uint v2f64:$vj)), (VFTINTRZ_LU_D v2f64:$vj)>; + } // Predicates = [HasExtLSX] /// Intrinsic pattern diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll new file mode 100644 index 0000000000000..0d9f57b57ffae --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptosi.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @fptosi_v8f32_v8i32(ptr %res, ptr %in){ +; CHECK-LABEL: fptosi_v8f32_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvftintrz.w.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x float>, ptr %in + %v1 = fptosi <8 x float> %v0 to <8 x i32> + store <8 x i32> %v1, ptr %res + ret void +} + +define void @fptosi_v4f64_v4i64(ptr %res, ptr %in){ +; CHECK-LABEL: fptosi_v4f64_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvftintrz.l.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x double>, ptr %in + %v1 = fptosi <4 x double> %v0 to <4 x i64> + store <4 x i64> %v1, ptr %res + ret void +} + +define void @fptosi_v4f64_v4i32(ptr %res, ptr %in){ +; CHECK-LABEL: fptosi_v4f64_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 238 +; CHECK-NEXT: xvfcvt.s.d $xr0, $xr1, $xr0 +; CHECK-NEXT: xvftintrz.w.s $xr0, $xr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x double>, ptr %in + %v1 = fptosi <4 x double> %v0 to <4 x i32> + store <4 x i32> %v1, ptr %res + ret void +} + +define void @fptosi_v4f32_v4i64(ptr %res, ptr %in){ +; CHECK-LABEL: fptosi_v4f32_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vftintrz.w.s $vr0, $vr0 +; CHECK-NEXT: vext2xv.d.w $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x float>, ptr %in + %v1 = fptosi <4 x float> %v0 to <4 x i64> + store <4 x i64> %v1, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll new file mode 100644 index 0000000000000..27d70f33cd34e --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/ir-instruction/fptoui.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @fptoui_v8f32_v8i32(ptr %res, ptr %in){ +; CHECK-LABEL: fptoui_v8f32_v8i32: +; CHECK: # %bb.0: +; CHECK-NEXT: 
xvld $xr0, $a1, 0 +; CHECK-NEXT: xvftintrz.wu.s $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <8 x float>, ptr %in + %v1 = fptoui <8 x float> %v0 to <8 x i32> + store <8 x i32> %v1, ptr %res + ret void +} + +define void @fptoui_v4f64_v4i64(ptr %res, ptr %in){ +; CHECK-LABEL: fptoui_v4f64_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvftintrz.lu.d $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x double>, ptr %in + %v1 = fptoui <4 x double> %v0 to <4 x i64> + store <4 x i64> %v1, ptr %res + ret void +} + +define void @fptoui_v4f64_v4i32(ptr %res, ptr %in){ +; CHECK-LABEL: fptoui_v4f64_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvpermi.d $xr1, $xr0, 238 +; CHECK-NEXT: xvfcvt.s.d $xr0, $xr1, $xr0 +; CHECK-NEXT: xvftintrz.w.s $xr0, $xr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x double>, ptr %in + %v1 = fptoui <4 x double> %v0 to <4 x i32> + store <4 x i32> %v1, ptr %res + ret void +} + +define void @fptoui_v4f32_v4i64(ptr %res, ptr %in){ +; CHECK-LABEL: fptoui_v4f32_v4i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vftintrz.wu.s $vr0, $vr0 +; CHECK-NEXT: vext2xv.du.wu $xr0, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x float>, ptr %in + %v1 = fptoui <4 x float> %v0 to <4 x i64> + store <4 x i64> %v1, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptosi.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptosi.ll new file mode 100644 index 0000000000000..c3008fe96e47d --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptosi.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @fptosi_v4f32_v4i32(ptr %res, ptr %in){ +; CHECK-LABEL: fptosi_v4f32_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vftintrz.w.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x float>, ptr %in + %v1 = fptosi <4 x float> %v0 to <4 x i32> + store <4 x i32> %v1, ptr %res + ret void +} + +define void @fptosi_v2f64_v2i64(ptr %res, ptr %in){ +; CHECK-LABEL: fptosi_v2f64_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vftintrz.l.d $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <2 x double>, ptr %in + %v1 = fptosi <2 x double> %v0 to <2 x i64> + store <2 x i64> %v1, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptoui.ll b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptoui.ll new file mode 100644 index 0000000000000..f0aeb0bd14e75 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/ir-instruction/fptoui.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @fptoui_v4f32_v4i32(ptr %res, ptr %in){ +; CHECK-LABEL: fptoui_v4f32_v4i32: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vftintrz.wu.s $vr0, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret + %v0 = load <4 x float>, ptr %in + %v1 = fptoui <4 x float> %v0 to <4 x i32> + store <4 x i32> %v1, ptr %res + ret void +} + +define void @fptoui_v2f64_v2i64(ptr %res, ptr %in){ +; CHECK-LABEL: fptoui_v2f64_v2i64: +; CHECK: # %bb.0: +; CHECK-NEXT: vld $vr0, $a1, 0 +; 
CHECK-NEXT:    vftintrz.lu.d $vr0, $vr0
+; CHECK-NEXT:    vst $vr0, $a0, 0
+; CHECK-NEXT:    ret
+  %v0 = load <2 x double>, ptr %in
+  %v1 = fptoui <2 x double> %v0 to <2 x i64>
+  store <2 x i64> %v1, ptr %res
+  ret void
+}
From 3ea92ea2f9d236569f82825cdba6d59bcc22495c Mon Sep 17 00:00:00 2001
From: Yi Kong
Date: Tue, 23 Jan 2024 17:01:23 +0900
Subject: [PATCH 567/843] Fix MFS warning format

WithColor::warning() does not append a newline automatically.

---
 llvm/lib/CodeGen/TargetPassConfig.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 76ba8da547e6b..46697480db52a 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -1246,7 +1246,7 @@ void TargetPassConfig::addMachinePasses() {
         // enabled, this may result in performance regression.
         WithColor::warning()
             << "Using AutoFDO without FSDiscriminator for MFS may regress "
-               "performance.";
+               "performance.\n";
       }
     }
     addPass(createMachineFunctionSplitterPass());
From 7da76958390a43b2fd1db506c90970aeae4367eb Mon Sep 17 00:00:00 2001
From: Aiden Grossman
Date: Tue, 23 Jan 2024 00:06:46 -0800
Subject: [PATCH 568/843] [llvm-exegesis] Add additional validation counters
 (#76788)

This patch adds support for additional types of validation counters and
also adds mappings between these new validation counter types and
physical counters on the hardware for microarchitectures that I have
the ability to test on.

---
 llvm/include/llvm/Target/TargetPfmCounters.td |  7 ++++++
 llvm/lib/Target/X86/X86PfmCounters.td         | 14 +++++++++--
 .../llvm-exegesis/lib/BenchmarkResult.cpp     | 24 +++++++++++++++++++
 .../tools/llvm-exegesis/lib/BenchmarkResult.h | 10 +++++++-
 llvm/tools/llvm-exegesis/llvm-exegesis.cpp    | 18 +++++++++++---
 5 files changed, 67 insertions(+), 6 deletions(-)

diff --git a/llvm/include/llvm/Target/TargetPfmCounters.td b/llvm/include/llvm/Target/TargetPfmCounters.td
index 33dff741fa2ab..c56f388fbd94b 100644
--- a/llvm/include/llvm/Target/TargetPfmCounters.td
+++ b/llvm/include/llvm/Target/TargetPfmCounters.td
@@ -36,6 +36,13 @@ class ValidationEvent <int event_number> {
 }
 
 def InstructionRetired : ValidationEvent<0>;
+def L1DCacheLoadMiss : ValidationEvent<1>;
+def L1DCacheStoreMiss : ValidationEvent<2>;
+def L1ICacheLoadMiss : ValidationEvent<3>;
+def DataTLBLoadMiss : ValidationEvent<4>;
+def DataTLBStoreMiss : ValidationEvent<5>;
+def InstructionTLBLoadMiss : ValidationEvent<6>;
+
 
 // PfmValidationCounter provides a mapping between the events that are
 // interesting in regards to the snippet execution environment and
diff --git a/llvm/lib/Target/X86/X86PfmCounters.td b/llvm/lib/Target/X86/X86PfmCounters.td
index 48d6895497091..da3acc5bbf56f 100644
--- a/llvm/lib/Target/X86/X86PfmCounters.td
+++ b/llvm/lib/Target/X86/X86PfmCounters.td
@@ -19,7 +19,12 @@ def : PfmCountersDefaultBinding;
 
 // Intel X86 Counters.
 defvar DefaultIntelPfmValidationCounters = [
-  PfmValidationCounter
+  PfmValidationCounter,
+  PfmValidationCounter,
+  PfmValidationCounter,
+  PfmValidationCounter,
+  PfmValidationCounter,
+  PfmValidationCounter
 ];
 
 def PentiumPfmCounters : ProcPfmCounters {
@@ -200,7 +205,12 @@ def : PfmCountersBinding<"tigerlake", IceLakePfmCounters>;
 
 // AMD X86 Counters.
 defvar DefaultAMDPfmValidationCounters = [
-  PfmValidationCounter
+  PfmValidationCounter,
+  PfmValidationCounter,
+  PfmValidationCounter,
+  PfmValidationCounter,
+  PfmValidationCounter,
+  PfmValidationCounter
 ];
 
 // Set basic counters for AMD cpus that we know libpfm4 supports.
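For context, wiring one of these validation events up for a target is a single PfmValidationCounter entry per event. The following is a schematic sketch only: it assumes the PfmValidationCounter class pairs a validation event with a perf counter name, and EXAMPLE_COUNTER_NAME is a placeholder, since real names must come from the target's libpfm4 event list (the concrete names used in the X86 lists above are not reproduced here):

    defvar ExamplePfmValidationCounters = [
      PfmValidationCounter<InstructionRetired, "EXAMPLE_COUNTER_NAME">
    ];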
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp
index 969d9c163f8ab..e985c323ff059 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.cpp
@@ -197,6 +197,18 @@ const char *validationEventToString(exegesis::ValidationEvent VE) {
   switch (VE) {
   case exegesis::ValidationEvent::InstructionRetired:
     return "instructions-retired";
+  case exegesis::ValidationEvent::L1DCacheLoadMiss:
+    return "l1d-cache-load-misses";
+  case exegesis::ValidationEvent::L1DCacheStoreMiss:
+    return "l1d-cache-store-misses";
+  case exegesis::ValidationEvent::L1ICacheLoadMiss:
+    return "l1i-cache-load-misses";
+  case exegesis::ValidationEvent::DataTLBLoadMiss:
+    return "data-tlb-load-misses";
+  case exegesis::ValidationEvent::DataTLBStoreMiss:
+    return "data-tlb-store-misses";
+  case exegesis::ValidationEvent::InstructionTLBLoadMiss:
+    return "instruction-tlb-load-misses";
   }
   llvm_unreachable("Unhandled exegesis::ValidationEvent enum");
 }
@@ -204,6 +216,18 @@ const char *validationEventToString(exegesis::ValidationEvent VE) {
 Expected<exegesis::ValidationEvent> stringToValidationEvent(StringRef Input) {
   if (Input == "instructions-retired")
     return exegesis::ValidationEvent::InstructionRetired;
+  else if (Input == "l1d-cache-load-misses")
+    return exegesis::ValidationEvent::L1DCacheLoadMiss;
+  else if (Input == "l1d-cache-store-misses")
+    return exegesis::ValidationEvent::L1DCacheStoreMiss;
+  else if (Input == "l1i-cache-load-misses")
+    return exegesis::ValidationEvent::L1ICacheLoadMiss;
+  else if (Input == "data-tlb-load-misses")
+    return exegesis::ValidationEvent::DataTLBLoadMiss;
+  else if (Input == "data-tlb-store-misses")
+    return exegesis::ValidationEvent::DataTLBStoreMiss;
+  else if (Input == "instruction-tlb-load-misses")
+    return exegesis::ValidationEvent::InstructionTLBLoadMiss;
   else
     return make_error<StringError>("Invalid validation event string",
                                    errc::invalid_argument);
diff --git a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
index c983d6d6e00d9..7769c9d5a6136 100644
--- a/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
+++ b/llvm/tools/llvm-exegesis/lib/BenchmarkResult.h
@@ -32,7 +32,15 @@ class Error;
 
 namespace exegesis {
 
-enum ValidationEvent { InstructionRetired };
+enum ValidationEvent {
+  InstructionRetired,
+  L1DCacheLoadMiss,
+  L1DCacheStoreMiss,
+  L1ICacheLoadMiss,
+  DataTLBLoadMiss,
+  DataTLBStoreMiss,
+  InstructionTLBLoadMiss
+};
 
 enum class BenchmarkPhaseSelectorE {
   PrepareSnippet,
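The two mappings in BenchmarkResult.cpp above encode the same event-to-name relation twice and must be kept in sync by hand whenever an event is added. A table-driven alternative is sketched below as an illustration only; it is not what this patch does, and the names are made up:

    #include <cstring>
    #include <optional>

    enum class ValidationEvent { InstructionRetired, L1DCacheLoadMiss /* ... */ };

    struct EventName { ValidationEvent Event; const char *Name; };
    constexpr EventName EventNames[] = {
        {ValidationEvent::InstructionRetired, "instructions-retired"},
        {ValidationEvent::L1DCacheLoadMiss, "l1d-cache-load-misses"},
        // ... one row per event, shared by both lookup directions below.
    };

    constexpr const char *toString(ValidationEvent VE) {
      for (const auto &E : EventNames)
        if (E.Event == VE)
          return E.Name;
      return "unknown";
    }

    inline std::optional<ValidationEvent> fromString(const char *S) {
      for (const auto &E : EventNames)
        if (std::strcmp(E.Name, S) == 0)
          return E.Event;
      return std::nullopt;
    }

With a single table, a forgotten entry shows up in both directions at once instead of silently diverging.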
"Count L1I load cache misses"), + clEnumValN(ValidationEvent::DataTLBLoadMiss, "data-tlb-load-misses", + "Count DTLB load misses"), + clEnumValN(ValidationEvent::DataTLBStoreMiss, "data-tlb-store-misses", + "Count DTLB store misses"), + clEnumValN(ValidationEvent::InstructionTLBLoadMiss, + "instruction-tlb-load-misses", "Count ITLB load misses"))); static ExitOnError ExitOnErr("llvm-exegesis error: "); From ba1e84fb8f45e102f40f409fcfe9b420fbf9fb70 Mon Sep 17 00:00:00 2001 From: Chuanqi Xu Date: Tue, 23 Jan 2024 15:57:24 +0800 Subject: [PATCH 569/843] [C++20] [Modules] Handle inconsistent deduced function return type from importing modules Close https://github.com/llvm/llvm-project/issues/78830 Close https://github.com/llvm/llvm-project/issues/60085 The direct reason of the issues is that in a module unit, the return type of a function is deduced to a concrete type (e.g., int) but in the other module unit, the return type of the same function is not deduced yet (e.g, auto). Then when we importing the 2 modules, the later function is selected but the code generator doesn't know how to generate the auto type. So here is the crash. The tricky part here is that, when the ASTReader reads the second unreduced function, it finds the reading function has the same signature with the already read deduced one and they have the same ODRHash. So that the ASTReader skips loading the function body of the unreduced function then the code generator can't infer the undeduced type like it usually can. Also this is generally fine for functions without deducing type since it is sufficient to emit a function call without the function body. Also in fact, we've already handled the case that the functon has deduced type and its deducing state is inconsist in different modules: https://github.com/llvm/llvm-project/blob/3ea92ea2f9d236569f82825cdba6d59bcc22495c/clang/lib/Serialization/ASTReader.cpp#L9531-L9544 and https://github.com/llvm/llvm-project/blob/3ea92ea2f9d236569f82825cdba6d59bcc22495c/clang/lib/Serialization/ASTReaderDecl.cpp#L3643-L3647. We've handled the case: (1) If we read the undeduced functions first and read the deduced functions later, the compiler will propagate the deduced type info for redecls in the end of the reading. (2) If we read the deduced functions first and read the undeduced functions later, we will propagae the deduced type info when we **complete the redecl chain**. However, in the reporting issues, we're in the second case and reproducer didn't trigger the action to complete the redecl chain. So here is the crash. Then it is obvious how should fix the problem. We should complete the redecl chain for undeduced function types in the end of the reading for the second case. --- clang/docs/ReleaseNotes.rst | 5 ++ clang/include/clang/Serialization/ASTReader.h | 4 + clang/lib/Serialization/ASTReader.cpp | 28 ++++-- clang/test/Modules/pr60085.cppm | 85 +++++++++++++++++++ clang/test/Modules/pr78830.cppm | 56 ++++++++++++ 5 files changed, 172 insertions(+), 6 deletions(-) create mode 100644 clang/test/Modules/pr60085.cppm create mode 100644 clang/test/Modules/pr78830.cppm diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 93eecbe0e1363..01c4ee97662b6 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1049,6 +1049,11 @@ Bug Fixes to C++ Support - Set the ``__cpp_auto_cast`` feature test macro in C++23 mode. +- Fix crash for inconsistent deducing state of function return types + in importing modules. 
---
 clang/docs/ReleaseNotes.rst                   |  5 ++
 clang/include/clang/Serialization/ASTReader.h |  4 +
 clang/lib/Serialization/ASTReader.cpp         | 28 ++++--
 clang/test/Modules/pr60085.cppm               | 85 +++++++++++++++++++
 clang/test/Modules/pr78830.cppm               | 56 ++++++++++++
 5 files changed, 172 insertions(+), 6 deletions(-)
 create mode 100644 clang/test/Modules/pr60085.cppm
 create mode 100644 clang/test/Modules/pr78830.cppm

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 93eecbe0e1363..01c4ee97662b6 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -1049,6 +1049,11 @@ Bug Fixes to C++ Support
 
 - Set the ``__cpp_auto_cast`` feature test macro in C++23 mode.
 
+- Fix a crash caused by an inconsistent deduction state of function return
+  types when importing modules.
+  Fixes (`#78830 <https://github.com/llvm/llvm-project/issues/78830>`_)
+  Fixes (`#60085 <https://github.com/llvm/llvm-project/issues/60085>`_)
+
 Bug Fixes to AST Handling
 ^^^^^^^^^^^^^^^^^^^^^^^^^
 - Fixed an import failure of recursive friend class template.
diff --git a/clang/include/clang/Serialization/ASTReader.h b/clang/include/clang/Serialization/ASTReader.h
index 21d791f5cd89a..dd1451bbf2d2c 100644
--- a/clang/include/clang/Serialization/ASTReader.h
+++ b/clang/include/clang/Serialization/ASTReader.h
@@ -550,6 +550,10 @@ class ASTReader
   /// declaration and the value is the deduced return type.
   llvm::SmallMapVector<FunctionDecl *, QualType, 4> PendingDeducedTypeUpdates;
 
+  /// Functions that have undeduced return types. We hope we can find the
+  /// deduced return types by iterating the redecls in other modules.
+  llvm::SmallVector<FunctionDecl *> PendingUndeducedFunctionDecls;
+
   /// Declarations that have been imported and have typedef names for
   /// linkage purposes.
   llvm::DenseMap, NamedDecl *>
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
index fe8782a3eb9e7..fecd94e875f67 100644
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -9534,12 +9534,21 @@ void ASTReader::finishPendingActions() {
       auto *FD = PendingDeducedFunctionTypes[I].first;
       FD->setType(GetType(PendingDeducedFunctionTypes[I].second));
 
-      // If we gave a function a deduced return type, remember that we need to
-      // propagate that along the redeclaration chain.
-      auto *DT = FD->getReturnType()->getContainedDeducedType();
-      if (DT && DT->isDeduced())
-        PendingDeducedTypeUpdates.insert(
-            {FD->getCanonicalDecl(), FD->getReturnType()});
+      if (auto *DT = FD->getReturnType()->getContainedDeducedType()) {
+        // If we gave a function a deduced return type, remember that we need
+        // to propagate that along the redeclaration chain.
+        if (DT->isDeduced()) {
+          PendingDeducedTypeUpdates.insert(
+              {FD->getCanonicalDecl(), FD->getReturnType()});
+          continue;
+        }
+
+        // The function has an undeduced return type. We hope we can find the
+        // deduced type by iterating the redecls in other modules later.
+        PendingUndeducedFunctionDecls.push_back(FD);
+        continue;
+      }
     }
     PendingDeducedFunctionTypes.clear();
 
@@ -10105,6 +10114,13 @@ void ASTReader::FinishedDeserializing() {
         getContext().adjustDeducedFunctionResultType(Update.first,
                                                      Update.second);
       }
+
+      auto UDTUpdates = std::move(PendingUndeducedFunctionDecls);
+      PendingUndeducedFunctionDecls.clear();
+      // We hope we can find the deduced type for the functions by iterating
+      // redeclarations in other modules.
+      for (FunctionDecl *UndeducedFD : UDTUpdates)
+        (void)UndeducedFD->getMostRecentDecl();
     }
 
     if (ReadTimer)
diff --git a/clang/test/Modules/pr60085.cppm b/clang/test/Modules/pr60085.cppm
new file mode 100644
index 0000000000000..fd6fd914a543c
--- /dev/null
+++ b/clang/test/Modules/pr60085.cppm
@@ -0,0 +1,85 @@
+// RUN: rm -rf %t
+// RUN: mkdir %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/d.cppm \
+// RUN:     -emit-module-interface -o %t/d.pcm
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/c.cppm \
+// RUN:     -emit-module-interface -o %t/c.pcm -fprebuilt-module-path=%t
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/b.cppm \
+// RUN:     -emit-module-interface -o %t/b.pcm -fprebuilt-module-path=%t
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.cppm \
+// RUN:     -emit-module-interface -o %t/a.pcm -fprebuilt-module-path=%t
+// RUN: %clang_cc1 -std=c++20 -triple %itanium_abi_triple %t/a.pcm \
+// RUN:     -S -emit-llvm -disable-llvm-passes -o - -fprebuilt-module-path=%t \
+// RUN:     | FileCheck %t/a.cppm

+//--- d.cppm
+export module d;
+
+export template <typename>
+struct integer {
+  using type = int;
+
+  static constexpr auto value() {
+    return 0;
+  }
+
+  friend constexpr void f(integer const x) {
+    x.value();
+  }
+};
+
+export constexpr void ddd(auto const value) {
+  f(value);
+}
+
+
+template <typename T>
+constexpr auto dd = T();
+
+export template <typename T>
+constexpr auto d() {
+  dd<T>;
+}
+
+//--- c.cppm
+export module c;
+
+import d;
+
+template <typename T>
+auto cc = T();
+
+auto c() {
+  cc<integer<int>>;
+  integer<int>().value();
+}
+
+//--- b.cppm
+export module b;
+
+import d;
+
+auto b() {
+  integer<int>::type;
+}
+
+//--- a.cppm
+export module a;
+
+import b;
+import c;
+import d;
+
+constexpr void aa() {
+  d<integer<int>>();
+  ddd(integer<int>());
+}
+
+export extern "C" void a() {
+  aa();
+}
+
+// Checks that we emit the IR successfully.
+// CHECK: define{{.*}}@a(
diff --git a/clang/test/Modules/pr78830.cppm b/clang/test/Modules/pr78830.cppm
new file mode 100644
index 0000000000000..a3b1a8021ebea
--- /dev/null
+++ b/clang/test/Modules/pr78830.cppm
@@ -0,0 +1,56 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/Type.cppm -emit-module-interface -o \
+// RUN:     %t/MyVec-Type.pcm -triple=x86_64-linux-gnu
+// RUN: %clang_cc1 -std=c++20 %t/Vec.cppm -emit-module-interface -o \
+// RUN:     %t/MyVec-Vec.pcm -fmodule-file=MyVec:Type=%t/MyVec-Type.pcm \
+// RUN:     -triple=x86_64-linux-gnu
+// RUN: %clang_cc1 -std=c++20 %t/Vec2.cppm -emit-module-interface -o \
+// RUN:     %t/MyVec-Vec2.pcm -fmodule-file=MyVec:Type=%t/MyVec-Type.pcm \
+// RUN:     -triple=x86_64-linux-gnu
+// RUN: %clang_cc1 -std=c++20 %t/Calculator.cppm -emit-module-interface -o \
+// RUN:     %t/MyVec-Calculator.pcm -fmodule-file=MyVec:Vec=%t/MyVec-Vec.pcm \
+// RUN:     -fmodule-file=MyVec:Vec2=%t/MyVec-Vec2.pcm \
+// RUN:     -fmodule-file=MyVec:Type=%t/MyVec-Type.pcm \
+// RUN:     -triple=x86_64-linux-gnu
+// RUN: %clang_cc1 -std=c++20 %t/MyVec-Calculator.pcm -S -emit-llvm \
+// RUN:     -fmodule-file=MyVec:Vec=%t/MyVec-Vec.pcm \
+// RUN:     -fmodule-file=MyVec:Vec2=%t/MyVec-Vec2.pcm \
+// RUN:     -fmodule-file=MyVec:Type=%t/MyVec-Type.pcm \
+// RUN:     -triple=x86_64-linux-gnu -o - \
+// RUN:     | FileCheck %t/Calculator.cppm

+//--- Type.cppm
+export module MyVec:Type;
+
+template <typename T> struct Size {
+  auto total() const { return 1; }
+};
+
+//--- Vec.cppm
+export module MyVec:Vec;
+import :Type;
+
+int size_ = Size<int>().total();
+
+//--- Vec2.cppm
+export module MyVec:Vec2;
+import :Type;
+
+struct Vec2 {
+  Size<int> size_;
+};
+
+//--- Calculator.cppm
+export module MyVec:Calculator;
+
+import :Vec;
+import :Vec2;
+
+auto Calculate() { return Size<int>().total(); };
+
+// Check the emitted module initializer to make sure we generate the module unit
+// successfully.
+// CHECK: @_ZW5MyVec9Calculatev
From 134fcc62786d31ab73439201dce2d73808d1785a Mon Sep 17 00:00:00 2001
From: Shengchen Kan
Date: Tue, 23 Jan 2024 16:31:29 +0800
Subject: [PATCH 570/843] [X86][NFC] Simplify function
 X86InstrInfo::commuteInstructionImpl

---
 llvm/lib/Target/X86/X86InstrInfo.cpp | 232 +++++++++++----------------
 1 file changed, 96 insertions(+), 136 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 6082a569cd49a..b7c2a5c4771c2 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2260,23 +2260,24 @@ static unsigned getCommutedVPERMV3Opcode(unsigned Opcode) {
 MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
                                                    unsigned OpIdx1,
                                                    unsigned OpIdx2) const {
-  auto cloneIfNew = [NewMI](MachineInstr &MI) -> MachineInstr & {
-    if (NewMI)
-      return *MI.getParent()->getParent()->CloneMachineInstr(&MI);
-    return MI;
+  auto CloneIfNew = [&](MachineInstr &MI) {
+    return std::exchange(NewMI, false)
+               ?
MI.getParent()->getParent()->CloneMachineInstr(&MI) + : &MI; }; + MachineInstr *WorkingMI = nullptr; + unsigned Opc = MI.getOpcode(); - switch (MI.getOpcode()) { - case X86::SHRD16rri8: // A = SHRD16rri8 B, C, I -> A = SHLD16rri8 C, B, (16-I) - case X86::SHLD16rri8: // A = SHLD16rri8 B, C, I -> A = SHRD16rri8 C, B, (16-I) - case X86::SHRD32rri8: // A = SHRD32rri8 B, C, I -> A = SHLD32rri8 C, B, (32-I) - case X86::SHLD32rri8: // A = SHLD32rri8 B, C, I -> A = SHRD32rri8 C, B, (32-I) - case X86::SHRD64rri8: // A = SHRD64rri8 B, C, I -> A = SHLD64rri8 C, B, (64-I) - case X86::SHLD64rri8: { // A = SHLD64rri8 B, C, I -> A = SHRD64rri8 C, B, - // (64-I) - unsigned Opc; + switch (Opc) { + // SHLD B, C, I <-> SHRD C, B, (BitWidth - I) + case X86::SHRD16rri8: + case X86::SHLD16rri8: + case X86::SHRD32rri8: + case X86::SHLD32rri8: + case X86::SHRD64rri8: + case X86::SHLD64rri8: { unsigned Size; - switch (MI.getOpcode()) { + switch (Opc) { default: llvm_unreachable("Unreachable!"); case X86::SHRD16rri8: @@ -2304,32 +2305,27 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, Opc = X86::SHRD64rri8; break; } - unsigned Amt = MI.getOperand(3).getImm(); - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); - WorkingMI.getOperand(3).setImm(Size - Amt); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); + WorkingMI = CloneIfNew(MI); + WorkingMI->setDesc(get(Opc)); + WorkingMI->getOperand(3).setImm(Size - MI.getOperand(3).getImm()); + break; } case X86::PFSUBrr: - case X86::PFSUBRrr: { + case X86::PFSUBRrr: // PFSUB x, y: x = x - y // PFSUBR x, y: x = y - x - unsigned Opc = - (X86::PFSUBRrr == MI.getOpcode() ? X86::PFSUBrr : X86::PFSUBRrr); - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); - } + WorkingMI = CloneIfNew(MI); + WorkingMI->setDesc( + get(X86::PFSUBRrr == Opc ? X86::PFSUBrr : X86::PFSUBRrr)); + break; case X86::BLENDPDrri: case X86::BLENDPSrri: case X86::VBLENDPDrri: case X86::VBLENDPSrri: // If we're optimizing for size, try to use MOVSD/MOVSS. if (MI.getParent()->getParent()->getFunction().hasOptSize()) { - unsigned Mask, Opc; - switch (MI.getOpcode()) { + unsigned Mask; + switch (Opc) { default: llvm_unreachable("Unreachable!"); case X86::BLENDPDrri: @@ -2350,12 +2346,10 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, break; } if ((MI.getOperand(3).getImm() ^ Mask) == 1) { - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); - WorkingMI.removeOperand(3); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, - /*NewMI=*/false, OpIdx1, - OpIdx2); + WorkingMI = CloneIfNew(MI); + WorkingMI->setDesc(get(Opc)); + WorkingMI->removeOperand(3); + break; } } [[fallthrough]]; @@ -2367,7 +2361,7 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VPBLENDDYrri: case X86::VPBLENDWYrri: { int8_t Mask; - switch (MI.getOpcode()) { + switch (Opc) { default: llvm_unreachable("Unreachable!"); case X86::BLENDPDrri: @@ -2408,10 +2402,9 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, // Using int8_t to ensure it will be sign extended to the int64_t that // setImm takes in order to match isel behavior. 
int8_t Imm = MI.getOperand(3).getImm() & Mask; - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.getOperand(3).setImm(Mask ^ Imm); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); + WorkingMI = CloneIfNew(MI); + WorkingMI->getOperand(3).setImm(Mask ^ Imm); + break; } case X86::INSERTPSrr: case X86::VINSERTPSrr: @@ -2428,10 +2421,9 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned AltIdx = llvm::countr_zero((ZMask | (1 << DstIdx)) ^ 15); assert(AltIdx < 4 && "Illegal insertion index"); unsigned AltImm = (AltIdx << 6) | (AltIdx << 4) | ZMask; - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(AltImm); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); + WorkingMI = CloneIfNew(MI); + WorkingMI->getOperand(MI.getNumOperands() - 1).setImm(AltImm); + break; } return nullptr; } @@ -2441,8 +2433,8 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VMOVSSrr: { // On SSE41 or later we can commute a MOVSS/MOVSD to a BLENDPS/BLENDPD. if (Subtarget.hasSSE41()) { - unsigned Mask, Opc; - switch (MI.getOpcode()) { + unsigned Mask; + switch (Opc) { default: llvm_unreachable("Unreachable!"); case X86::MOVSDrr: @@ -2463,31 +2455,24 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, break; } - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); - WorkingMI.addOperand(MachineOperand::CreateImm(Mask)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); + WorkingMI = CloneIfNew(MI); + WorkingMI->setDesc(get(Opc)); + WorkingMI->addOperand(MachineOperand::CreateImm(Mask)); + break; } - // Convert to SHUFPD. - assert(MI.getOpcode() == X86::MOVSDrr && - "Can only commute MOVSDrr without SSE4.1"); - - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(X86::SHUFPDrri)); - WorkingMI.addOperand(MachineOperand::CreateImm(0x02)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); + WorkingMI = CloneIfNew(MI); + WorkingMI->setDesc(get(X86::SHUFPDrri)); + WorkingMI->addOperand(MachineOperand::CreateImm(0x02)); + break; } case X86::SHUFPDrri: { // Commute to MOVSD. 
assert(MI.getOperand(3).getImm() == 0x02 && "Unexpected immediate!"); - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(X86::MOVSDrr)); - WorkingMI.removeOperand(3); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); + WorkingMI = CloneIfNew(MI); + WorkingMI->setDesc(get(X86::MOVSDrr)); + WorkingMI->removeOperand(3); + break; } case X86::PCLMULQDQrr: case X86::VPCLMULQDQrr: @@ -2500,10 +2485,9 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, unsigned Imm = MI.getOperand(3).getImm(); unsigned Src1Hi = Imm & 0x01; unsigned Src2Hi = Imm & 0x10; - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); + WorkingMI = CloneIfNew(MI); + WorkingMI->getOperand(3).setImm((Src1Hi << 4) | (Src2Hi >> 4)); + break; } case X86::VPCMPBZ128rri: case X86::VPCMPUBZ128rri: @@ -2552,15 +2536,13 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VPCMPWZ256rrik: case X86::VPCMPUWZ256rrik: case X86::VPCMPWZrrik: - case X86::VPCMPUWZrrik: { + case X86::VPCMPUWZrrik: + WorkingMI = CloneIfNew(MI); // Flip comparison mode immediate (if necessary). - unsigned Imm = MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7; - Imm = X86::getSwappedVPCMPImm(Imm); - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.getOperand(MI.getNumOperands() - 1).setImm(Imm); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); - } + WorkingMI->getOperand(MI.getNumOperands() - 1) + .setImm(X86::getSwappedVPCMPImm( + MI.getOperand(MI.getNumOperands() - 1).getImm() & 0x7)); + break; case X86::VPCOMBri: case X86::VPCOMUBri: case X86::VPCOMDri: @@ -2568,15 +2550,12 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VPCOMQri: case X86::VPCOMUQri: case X86::VPCOMWri: - case X86::VPCOMUWri: { + case X86::VPCOMUWri: + WorkingMI = CloneIfNew(MI); // Flip comparison mode immediate (if necessary). - unsigned Imm = MI.getOperand(3).getImm() & 0x7; - Imm = X86::getSwappedVPCOMImm(Imm); - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.getOperand(3).setImm(Imm); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); - } + WorkingMI->getOperand(3).setImm( + X86::getSwappedVPCOMImm(MI.getOperand(3).getImm() & 0x7)); + break; case X86::VCMPSDZrr: case X86::VCMPSSZrr: case X86::VCMPPDZrri: @@ -2594,35 +2573,28 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VCMPPDZ128rrik: case X86::VCMPPSZ128rrik: case X86::VCMPPDZ256rrik: - case X86::VCMPPSZ256rrik: { - unsigned Imm = - MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f; - Imm = X86::getSwappedVCMPImm(Imm); - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.getOperand(MI.getNumExplicitOperands() - 1).setImm(Imm); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); - } + case X86::VCMPPSZ256rrik: + WorkingMI = CloneIfNew(MI); + WorkingMI->getOperand(MI.getNumExplicitOperands() - 1) + .setImm(X86::getSwappedVCMPImm( + MI.getOperand(MI.getNumExplicitOperands() - 1).getImm() & 0x1f)); + break; case X86::VPERM2F128rr: - case X86::VPERM2I128rr: { + case X86::VPERM2I128rr: // Flip permute source immediate. // Imm & 0x02: lo = if set, select Op1.lo/hi else Op0.lo/hi. // Imm & 0x20: hi = if set, select Op1.lo/hi else Op0.lo/hi. 
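  // For example, Imm = 0x31 selects lo = Op0.hi and hi = Op1.hi; once the
  // sources are swapped, Imm ^ 0x22 = 0x13 selects those same halves again.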
- int8_t Imm = MI.getOperand(3).getImm() & 0xFF; - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.getOperand(3).setImm(Imm ^ 0x22); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); - } + WorkingMI = CloneIfNew(MI); + WorkingMI->getOperand(3).setImm((MI.getOperand(3).getImm() & 0xFF) ^ 0x22); + break; case X86::MOVHLPSrr: case X86::UNPCKHPDrr: case X86::VMOVHLPSrr: case X86::VUNPCKHPDrr: case X86::VMOVHLPSZrr: - case X86::VUNPCKHPDZ128rr: { + case X86::VUNPCKHPDZ128rr: assert(Subtarget.hasSSE2() && "Commuting MOVHLP/UNPCKHPD requires SSE2!"); - unsigned Opc = MI.getOpcode(); switch (Opc) { default: llvm_unreachable("Unreachable!"); @@ -2645,20 +2617,17 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, Opc = X86::VMOVHLPSZrr; break; } - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); - } + WorkingMI = CloneIfNew(MI); + WorkingMI->setDesc(get(Opc)); + break; case X86::CMOV16rr: case X86::CMOV32rr: case X86::CMOV64rr: { - auto &WorkingMI = cloneIfNew(MI); + WorkingMI = CloneIfNew(MI); unsigned OpNo = MI.getDesc().getNumOperands() - 1; X86::CondCode CC = static_cast(MI.getOperand(OpNo).getImm()); - WorkingMI.getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); + WorkingMI->getOperand(OpNo).setImm(X86::GetOppositeBranchCondition(CC)); + break; } case X86::VPTERNLOGDZrri: case X86::VPTERNLOGDZrmi: @@ -2702,34 +2671,25 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::VPTERNLOGQZ128rmbikz: case X86::VPTERNLOGQZ256rmbikz: case X86::VPTERNLOGQZrmbikz: { - auto &WorkingMI = cloneIfNew(MI); - commuteVPTERNLOG(WorkingMI, OpIdx1, OpIdx2); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); + WorkingMI = CloneIfNew(MI); + commuteVPTERNLOG(*WorkingMI, OpIdx1, OpIdx2); + break; } - default: { - if (isCommutableVPERMV3Instruction(MI.getOpcode())) { - unsigned Opc = getCommutedVPERMV3Opcode(MI.getOpcode()); - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); + default: + if (isCommutableVPERMV3Instruction(Opc)) { + WorkingMI = CloneIfNew(MI); + WorkingMI->setDesc(get(getCommutedVPERMV3Opcode(Opc))); + break; } - const X86InstrFMA3Group *FMA3Group = - getFMA3Group(MI.getOpcode(), MI.getDesc().TSFlags); - if (FMA3Group) { - unsigned Opc = - getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group); - auto &WorkingMI = cloneIfNew(MI); - WorkingMI.setDesc(get(Opc)); - return TargetInstrInfo::commuteInstructionImpl(WorkingMI, /*NewMI=*/false, - OpIdx1, OpIdx2); + if (auto *FMA3Group = getFMA3Group(Opc, MI.getDesc().TSFlags)) { + WorkingMI = CloneIfNew(MI); + WorkingMI->setDesc( + get(getFMA3OpcodeToCommuteOperands(MI, OpIdx1, OpIdx2, *FMA3Group))); + break; } - - return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); - } } + return TargetInstrInfo::commuteInstructionImpl(MI, NewMI, OpIdx1, OpIdx2); } bool X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI, From f20556678c70f89f96ed01b2c6092c5f7d6efedd Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 23 Jan 2024 08:45:47 +0000 Subject: [PATCH 571/843] Reland "[llvm][AArch64] Copy all operands when expanding BLR_BTI bundle 
(#78267)" (#78719) This reverts commit 955417ade2648c2b1a4e5f0be697f61570590a88. The problem with the previous version was that the bundle instruction had arguments like "target arg1 arg2". When it's expanded we produced a BL or BLR which can only accept one argument, the target of the branch. Now I realise why expandCALL_RVMARKER has to copy them in mutiple steps. The operands for the called function need to be changed to implicit arguments of the branch instruction. * Copy the branch target. * Copy all register operands, marking them as implicit. * Copy any other operands without modifying them. Prior to any attempt to fix #77915: BL @_setjmp, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp Which is dropping the use of the arguments for the called function. My first fix attempt produced: BL @_setjmp, $x0, $w1, , implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp It copied the arguments but as explicit arguments to the BL which only expects 1, failing verification. With this new change we produce: BL @_setjmp, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $w1, implicit-def dead $lr, implicit $sp, implicit-def $sp Note specifically the added "implicit $x0, implicit $w1". So BL only has 1 explicit argument, but the arguments to the function are still used. --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 18 +++++++++++- .../AArch64/blr-bti-preserves-operands.mir | 28 +++++++++++++++++++ .../AArch64/blr-bti-preserves-regmask.mir | 23 --------------- 3 files changed, 45 insertions(+), 24 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir delete mode 100644 llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index bb7f4d907ffd7..352c61d48e2ff 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -843,8 +843,24 @@ bool AArch64ExpandPseudo::expandCALL_BTI(MachineBasicBlock &MBB, MachineInstr *Call = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc)).getInstr(); Call->addOperand(CallTarget); + + // 1 because we already added the branch target above. + unsigned RegMaskStartIdx = 1; + // The branch is BL , so we cannot attach the arguments of the called + // function to it. Those must be added as implicitly used by the branch. 
+ while (!MI.getOperand(RegMaskStartIdx).isRegMask()) { + auto MOP = MI.getOperand(RegMaskStartIdx); + assert(MOP.isReg() && "can only add register operands"); + Call->addOperand(MachineOperand::CreateReg( + MOP.getReg(), /*Def=*/false, /*Implicit=*/true, /*isKill=*/false, + /*isDead=*/false, /*isUndef=*/MOP.isUndef())); + RegMaskStartIdx++; + } + for (const MachineOperand &MO : + llvm::drop_begin(MI.operands(), RegMaskStartIdx)) + Call->addOperand(MO); + Call->setCFIType(*MBB.getParent(), MI.getCFIType()); - Call->copyImplicitOps(*MBB.getParent(), MI); MachineInstr *BTI = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(AArch64::HINT)) diff --git a/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir b/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir new file mode 100644 index 0000000000000..760ae4794e304 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/blr-bti-preserves-operands.mir @@ -0,0 +1,28 @@ +# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=aarch64-expand-pseudo -o - %s | FileCheck %s + +# When expanding a BLR_BTI, we should copy all the operands to the branch in the +# bundle. Otherwise we could end up using a register after the BL which was +# clobbered by the function that was called, or overwriting an argument to that +# function before we take the branch. +# +# The arguments to the call must become implicit arguments, because the branch +# only expects to get 1 explicit operand which is the branch target. + +# CHECK: BUNDLE implicit-def $lr, implicit-def $w30, implicit-def $sp, implicit-def $wsp, implicit $sp, implicit $x0, implicit $w1 { +# CHECK: BL @_setjmp, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $x0, implicit $w1, implicit-def dead $lr, implicit $sp, implicit-def $sp +# CHECK: HINT 36 +# CHECK: } + +--- | + define void @a() { + ret void + } + + declare void @_setjmp(...) +... +--- +name: a +body: | + bb.0: + BLR_BTI @_setjmp, $x0, $w1, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp +... diff --git a/llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir b/llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir deleted file mode 100644 index 91652c6e20c8f..0000000000000 --- a/llvm/test/CodeGen/AArch64/blr-bti-preserves-regmask.mir +++ /dev/null @@ -1,23 +0,0 @@ -# RUN: llc -mtriple=aarch64-none-linux-gnu -run-pass=aarch64-expand-pseudo -o - %s | FileCheck %s - -# When expanding a BLR_BTI, we should keep the regmask that was attached to it. -# Otherwise we could end up using a register after the BL which was clobbered by -# the function that was called. -# CHECK: BUNDLE implicit-def $lr, implicit-def $w30, implicit-def $sp, implicit-def $wsp, implicit $sp { -# CHECK: BL @_setjmp, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit-def dead $lr, implicit $sp, implicit-def $sp -# CHECK: HINT 36 -# CHECK: } - ---- | - define void @a() { - ret void - } - - declare void @_setjmp(...) -... ---- -name: a -body: | - bb.0: - BLR_BTI @_setjmp, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit-def $sp -... From d8628a00fd92c6f9db631f817a63eada90707ad2 Mon Sep 17 00:00:00 2001 From: Jianjian Guan Date: Tue, 23 Jan 2024 16:55:40 +0800 Subject: [PATCH 572/843] [RISCV] Add IntrArgMemOnly for vector unit stride load/store intrinsics (#78415) IntrArgMemOnly means the intrinsic only accesses memory that its pointer-typed argument(s) points to. I think RVV load/store intrinsics meets it. 
Add IntrArgMemOnly would help in some passes, by example, it could add `alais.scope` to intrinsics callee when try to inline a function that has noalais parameter(s). --- llvm/include/llvm/IR/IntrinsicsRISCV.td | 29 ++++++++++++++++--------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/llvm/include/llvm/IR/IntrinsicsRISCV.td b/llvm/include/llvm/IR/IntrinsicsRISCV.td index a391bc53cdb0e..9a63d14b0ef0a 100644 --- a/llvm/include/llvm/IR/IntrinsicsRISCV.td +++ b/llvm/include/llvm/IR/IntrinsicsRISCV.td @@ -147,7 +147,8 @@ let TargetPrefix = "riscv" in { class RISCVUSMLoad : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [llvm_ptr_ty, llvm_anyint_ty], - [NoCapture>, IntrReadMem]>, RISCVVIntrinsic { + [NoCapture>, IntrReadMem, IntrArgMemOnly]>, + RISCVVIntrinsic { let VLOperand = 1; } // For unit stride load @@ -155,7 +156,8 @@ let TargetPrefix = "riscv" in { class RISCVUSLoad : DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_ptr_ty, llvm_anyint_ty], - [NoCapture>, IntrReadMem]>, RISCVVIntrinsic { + [NoCapture>, IntrReadMem, IntrArgMemOnly]>, + RISCVVIntrinsic { let VLOperand = 2; } // For unit stride fault-only-first load @@ -177,7 +179,8 @@ let TargetPrefix = "riscv" in { [LLVMMatchType<0>, llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, LLVMMatchType<1>], - [NoCapture>, ImmArg>, IntrReadMem]>, + [NoCapture>, ImmArg>, IntrReadMem, + IntrArgMemOnly]>, RISCVVIntrinsic { let VLOperand = 3; } @@ -239,7 +242,8 @@ let TargetPrefix = "riscv" in { class RISCVUSStore : DefaultAttrsIntrinsic<[], [llvm_anyvector_ty, llvm_ptr_ty, llvm_anyint_ty], - [NoCapture>, IntrWriteMem]>, RISCVVIntrinsic { + [NoCapture>, IntrWriteMem, IntrArgMemOnly]>, + RISCVVIntrinsic { let VLOperand = 2; } // For unit stride store with mask @@ -249,7 +253,8 @@ let TargetPrefix = "riscv" in { [llvm_anyvector_ty, llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty], - [NoCapture>, IntrWriteMem]>, RISCVVIntrinsic { + [NoCapture>, IntrWriteMem, IntrArgMemOnly]>, + RISCVVIntrinsic { let VLOperand = 3; } // For strided store @@ -992,7 +997,8 @@ let TargetPrefix = "riscv" in { !add(nf, -1))), !listconcat(!listsplat(LLVMMatchType<0>, nf), [llvm_ptr_ty, llvm_anyint_ty]), - [NoCapture>, IntrReadMem]>, RISCVVIntrinsic { + [NoCapture>, IntrReadMem, IntrArgMemOnly]>, + RISCVVIntrinsic { let VLOperand = !add(nf, 1); } // For unit stride segment load with mask @@ -1004,8 +1010,9 @@ let TargetPrefix = "riscv" in { [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty, LLVMMatchType<1>]), - [ImmArg>, NoCapture>, IntrReadMem]>, - RISCVVIntrinsic { + [ImmArg>, NoCapture>, + IntrReadMem, IntrArgMemOnly]>, + RISCVVIntrinsic { let VLOperand = !add(nf, 2); } @@ -1096,7 +1103,8 @@ let TargetPrefix = "riscv" in { !listconcat([llvm_anyvector_ty], !listsplat(LLVMMatchType<0>, !add(nf, -1)), [llvm_ptr_ty, llvm_anyint_ty]), - [NoCapture>, IntrWriteMem]>, RISCVVIntrinsic { + [NoCapture>, IntrWriteMem, IntrArgMemOnly]>, + RISCVVIntrinsic { let VLOperand = !add(nf, 1); } // For unit stride segment store with mask @@ -1108,7 +1116,8 @@ let TargetPrefix = "riscv" in { [llvm_ptr_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_anyint_ty]), - [NoCapture>, IntrWriteMem]>, RISCVVIntrinsic { + [NoCapture>, IntrWriteMem, IntrArgMemOnly]>, + RISCVVIntrinsic { let VLOperand = !add(nf, 2); } From c193bb7e9eee02a435944506c732db9b480f0c84 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 23 Jan 2024 17:59:44 +0900 Subject: [PATCH 573/843] [Coverage] getMaxBitmapSize: 
Scan `max(BitmapIdx)` instead of the last `Decision` (#78963)

In `CoverageMapping.cpp:getMaxBitmapSize()`, the code assumed that the last
`Decision` has the maximum `BitmapIdx`. Let it scan for `max(BitmapIdx)`
instead.

Note that `<=` is used instead of `<`, because `BitmapIdx == 0` is valid
and `MaxBitmapID` is `unsigned`. `BitmapIdx` is unique in the record.

Fixes #78922
---
 .../ProfileData/Coverage/CoverageMapping.cpp  | 11 +++--
 llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.c  | 13 +++++
 llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.o  | Bin 0 -> 4648 bytes
 .../tools/llvm-cov/Inputs/mcdc-maxbs.proftext |  9 ++++
 llvm/test/tools/llvm-cov/mcdc-maxbs.test      | 46 ++++++++++++++++++
 5 files changed, 74 insertions(+), 5 deletions(-)
 create mode 100644 llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.c
 create mode 100644 llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.o
 create mode 100644 llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.proftext
 create mode 100644 llvm/test/tools/llvm-cov/mcdc-maxbs.test

diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index 9c95cd5d76d21..da8e1d87319dd 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -229,6 +229,7 @@ Expected<BitVector> CounterMappingContext::evaluateBitmap(
   unsigned SizeInBits = llvm::alignTo(uint64_t(1) << NC, CHAR_BIT);
   unsigned SizeInBytes = SizeInBits / CHAR_BIT;
 
+  assert(ID + SizeInBytes <= BitmapBytes.size() && "BitmapBytes overrun");
   ArrayRef<uint8_t> Bytes(&BitmapBytes[ID], SizeInBytes);
 
   // Mask each bitmap byte into the BitVector. Go in reverse so that the
@@ -568,14 +569,14 @@ static unsigned getMaxBitmapSize(const CounterMappingContext &Ctx,
                                  const CoverageMappingRecord &Record) {
   unsigned MaxBitmapID = 0;
   unsigned NumConditions = 0;
-  // The last DecisionRegion has the highest bitmap byte index used in the
-  // function, which when combined with its number of conditions, yields the
-  // full bitmap size.
+  // Scan max(BitmapIdx).
+  // Note that `<=` is used instead of `<`, because `BitmapIdx == 0` is valid
+  // and `MaxBitmapID` is `unsigned`. `BitmapIdx` is unique in the record.
   for (const auto &Region : reverse(Record.MappingRegions)) {
-    if (Region.Kind == CounterMappingRegion::MCDCDecisionRegion) {
+    if (Region.Kind == CounterMappingRegion::MCDCDecisionRegion &&
+        MaxBitmapID <= Region.MCDCParams.BitmapIdx) {
       MaxBitmapID = Region.MCDCParams.BitmapIdx;
       NumConditions = Region.MCDCParams.NumConditions;
-      break;
     }
   }
   unsigned SizeInBits = llvm::alignTo(uint64_t(1) << NumConditions, CHAR_BIT);
diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.c b/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.c
new file mode 100644
index 0000000000000..83644b2954945
--- /dev/null
+++ b/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.c
@@ -0,0 +1,13 @@
+#define RANGE(a,b,c) ((a) <= (b) && (b) <= (c))
+
+int sub(int c) {
+  if (RANGE('0', c, '9')) return 1;
+  return (('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'));
+}
+
+extern void __llvm_profile_write_file(void);
+
+int main(int c, char **v)
+{
+  return (c > 1 ? 
sub(c) : 0); +} diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.o b/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.o new file mode 100644 index 0000000000000000000000000000000000000000..269902224ccae9ca01c2b858f020410d7f221ce5 GIT binary patch literal 4648 zcmb_fVQf=X6u$4hb!)e7>jnb`OgltHlt&9}Y>r@oZB}9YFfp27Xu7T)leJyiZcGGh zQhxx+46Pqj7L3H8Ch|v!{7{TBQAz4K@14`$-L;tso}@eX zeCM2d?)mP0_r1N+x2s>*G((VRVyn>Dko20mk`b_m0kIMyg9Qg#xx)G*4;RPX2L@=I ztw*nutg|@t7dEE(XM$5!uK#MVAdbx*yL90fBYUGJ@TnHK`g3r~$_zL&wHjsUxx%*8 z3{7<&r^*%fq#MrRBsA6SGvxN?{WG~jxN-Wz)Gn8ndXM2hbA^`!jVCWox$mT3ShPQn*xvIXb$=Ys|PzqR;vvD}{YB8xI=(VFP? zXXN(h&)FP=Qy&m?4!1E^IC$VqI}P_NIAy10VGMG7-w#_)?mhk2CosNvVEr*s4 zM`w#J>O}h*tqN3+F5F_VuccZutWk?tCcM7ZT0`hKY7!5Mo+j~-Fnw(edX>OYo7f;8 z_q8{Jg`;XsTOxd#SPD}@(o(VB=USra0s$AKulLn22Mb5=_wR3~Zgvj6@%>sDzmT&} zF88B>II6WeO^43vgvaZ^g?b#KSxnoVEE26c9OE`f25GgGs0OV!ESbC>`Dtcg_fyxd z44e+Gy|%Mj+*en1m}34f#rwf~vCzCO3Mf9XN&@r2Tqy2&CNe3D(=&PfQFCDM^1$WG zhN$n+?}*4~I5F&!Doav>CslWYn&%w8$5uoB;S{$ z5%babI%(W7e1K8c5LQdt4H|EO2C^|9r5S#U;K#KQQ{^TARpL%T#nHyISWXP#oCp75 zAJ~nlHffJn&r$X`F{6U|8(Vb$w)+O>;rA42!Y(F2iE6 z9By41a~v+{t6AcW_w)=zAwEmdysxt-Yz=UGkm zaRq;j>}dt(x~CL;6WO0D_!hFik#OXU*Lg$1nTJ~vj_c%j`~1Uw2pHnbKYt`J&iq&7 z27@3j^VuNbI1krdF5zmE~Z=Kp;K zXa4!FS_U<-u55z|??Rg&)F`qaIhJkoV;tT#9PfoO3i6CU7G4+Qv*c)MHXJq1T6cYe z2!$faiBvQa>fPSm(Ya}3XW!ZQ~_ZfA_W>9ev#!Hum-I=vVg*lu`j*oF1EF&J4t=M=1d}FC( zY%m^*hez0o*oK)%W}@bkyPsW`&V(cTg=tj}4-LVaX_yum-4}|bpq7Y7h9c|Y;mN_Y z87WPg9v>tl62h^W&ZIJ7)Gx&?ja^D@4TUpdruLq6J)9H1B|ye0C4tWm$8LrJK63#ENFSvMXXy~(y^2GA6Y_aG1X#JZOO_sFj>mkAwsBH)V(k&$#1l7em}5>=VQB4{i9{-b*JcfUS0bpaw@e={c z4{#u9?w{fBV1S_k21uVR!yj_S`OM!Xz>HmmB&aO>igJtc@!J3e&o7F){uTh#wPReK O*8deHpu%r%wf?{KA<;Jg literal 0 HcmV?d00001 diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.proftext b/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.proftext new file mode 100644 index 0000000000000..f24e984f657f2 --- /dev/null +++ b/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.proftext @@ -0,0 +1,9 @@ +main +# Func Hash: +99164 +# Num Counters: +2 +# Counter Values: +1 +0 + diff --git a/llvm/test/tools/llvm-cov/mcdc-maxbs.test b/llvm/test/tools/llvm-cov/mcdc-maxbs.test new file mode 100644 index 0000000000000..ad5bc828529ce --- /dev/null +++ b/llvm/test/tools/llvm-cov/mcdc-maxbs.test @@ -0,0 +1,46 @@ +# Test not to trigger the assertion failure in llvm-cov with empty bitmap. +# REQUIRES: asserts + +# mcdc-maxbs.o contains the record: +# +# sub: +# Decision,File 0, 5:11 -> 5:59 = M:1, C:4 +# Branch,File 0, 5:12 -> 5:20 = #5, ((#0 - #1) - #5) [1,3,2] +# Branch,File 0, 5:24 -> 5:32 = #6, (#5 - #6) [3,0,2] +# Branch,File 0, 5:38 -> 5:46 = #7, (#4 - #7) [2,4,0] +# Branch,File 0, 5:50 -> 5:58 = #8, (#7 - #8) [4,0,0] +# Decision,File 1, 1:23 -> 1:47 = M:0, C:2 +# Branch,File 1, 1:23 -> 1:33 = #2, (#0 - #2) [1,2,0] +# Branch,File 1, 1:37 -> 1:47 = #3, (#2 - #3) [2,0,0] +# +# With the previous implementation, `Decision M:0 C:2` was picked up as +# the last `Decision`. +# The size of the bitmap was reported as `1` in this case. +# Actually, the largest `Decision` is `M:1 C:4` and the size should be `3` +# (Idx=1, len=16 bits). +# +# When a corresponding bitmap was not emitted, the dummy zero-ed bitmap is +# allocated as `BitmapBytes` with `getMaxBitmapSize(Record) + 1` +# (in this case, `1 + 1`, less than `3`). +# It may overrun and would trigger `unexpected test vector` +# by trailing uninitialized garbage. 
+ +# RUN: llvm-profdata merge %S/Inputs/mcdc-maxbs.proftext -o %t.profdata +# RUN: llvm-cov report --show-mcdc-summary %S/Inputs/mcdc-maxbs.o -instr-profile %t.profdata + +# Instructions for regenerating the test object: + +cd %S/Inputs # or copy %S/Inputs/mcdc-maxbs.c into the working directory +clang -O3 -fcoverage-mcdc -fprofile-instr-generate \ + -fcoverage-mapping -fcoverage-compilation-dir=. \ + mcdc-maxbs.c -c -o mcdc-maxbs.o + +# Instructions for regenerating the test vector: + +clang -fprofile-instr-generate mcdc-maxbs.o + +# Doesn't crash if argc > 1 +./a.out + +llvm-profdata merge --sparse -o default.profdata default.profraw +llvm-profdata merge --text -o mcdc-maxbs.proftext default.profdata From 66237d647ed95d7df92a438f8181c11423addc7d Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Tue, 23 Jan 2024 17:02:19 +0800 Subject: [PATCH 574/843] [X86][CodeGen] Add entries for NDD SHLD/SHRD to the commuteInstructionImpl --- llvm/lib/Target/X86/X86CompressEVEX.cpp | 1 - llvm/lib/Target/X86/X86InstrInfo.cpp | 32 ++++++++++++++++- llvm/test/CodeGen/X86/apx/compress-evex.mir | 10 ++++++ llvm/test/CodeGen/X86/apx/shrd.ll | 39 ++++++++------------- 4 files changed, 56 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/X86/X86CompressEVEX.cpp b/llvm/lib/Target/X86/X86CompressEVEX.cpp index 78a7e9850abdb..a9704e30478d1 100644 --- a/llvm/lib/Target/X86/X86CompressEVEX.cpp +++ b/llvm/lib/Target/X86/X86CompressEVEX.cpp @@ -200,7 +200,6 @@ static bool isRedundantNewDataDest(MachineInstr &MI, const X86Subtarget &ST) { !MI.getOperand(2).isReg() || MI.getOperand(2).getReg() != Reg0) return false; // Opcode may change after commute, e.g. SHRD -> SHLD - // TODO: Add test for this after ND SHRD/SHLD is supported ST.getInstrInfo()->commuteInstruction(MI, false, 1, 2); return true; } diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index b7c2a5c4771c2..6dcaaf754a18d 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2275,7 +2275,13 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, case X86::SHRD32rri8: case X86::SHLD32rri8: case X86::SHRD64rri8: - case X86::SHLD64rri8: { + case X86::SHLD64rri8: + case X86::SHRD16rri8_ND: + case X86::SHLD16rri8_ND: + case X86::SHRD32rri8_ND: + case X86::SHLD32rri8_ND: + case X86::SHRD64rri8_ND: + case X86::SHLD64rri8_ND: { unsigned Size; switch (Opc) { default: @@ -2304,6 +2310,30 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, Size = 64; Opc = X86::SHRD64rri8; break; + case X86::SHRD16rri8_ND: + Size = 16; + Opc = X86::SHLD16rri8_ND; + break; + case X86::SHLD16rri8_ND: + Size = 16; + Opc = X86::SHRD16rri8_ND; + break; + case X86::SHRD32rri8_ND: + Size = 32; + Opc = X86::SHLD32rri8_ND; + break; + case X86::SHLD32rri8_ND: + Size = 32; + Opc = X86::SHRD32rri8_ND; + break; + case X86::SHRD64rri8_ND: + Size = 64; + Opc = X86::SHLD64rri8_ND; + break; + case X86::SHLD64rri8_ND: + Size = 64; + Opc = X86::SHRD64rri8_ND; + break; } WorkingMI = CloneIfNew(MI); WorkingMI->setDesc(get(Opc)); diff --git a/llvm/test/CodeGen/X86/apx/compress-evex.mir b/llvm/test/CodeGen/X86/apx/compress-evex.mir index 7e13896e98590..5a3d7ceb10c43 100644 --- a/llvm/test/CodeGen/X86/apx/compress-evex.mir +++ b/llvm/test/CodeGen/X86/apx/compress-evex.mir @@ -32,6 +32,16 @@ body: | RET64 $rax ... 
--- +name: ndd_2_non_ndd_commutable_new_opcode +body: | + bb.0.entry: + liveins: $rdi, $rsi + ; CHECK: shldq $52, %rsi, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x0f,0xa4,0xf0,0x34] + renamable $rax = ADD64rr_ND killed renamable $rdi, renamable $rsi, implicit-def dead $eflags + renamable $rax = SHRD64rri8_ND killed renamable $rsi, killed renamable $rax, 12, implicit-def dead $eflags + RET64 $rax +... +--- name: ndd_2_non_ndd_incommutable body: | bb.0.entry: diff --git a/llvm/test/CodeGen/X86/apx/shrd.ll b/llvm/test/CodeGen/X86/apx/shrd.ll index 453d7a5d17e05..3eaa06b123bd0 100644 --- a/llvm/test/CodeGen/X86/apx/shrd.ll +++ b/llvm/test/CodeGen/X86/apx/shrd.ll @@ -157,7 +157,7 @@ entry: define i16 @shrd16mri8(ptr %ptr, i16 noundef %b) { ; CHECK-LABEL: shrd16mri8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: shrdw $12, %si, (%rdi), %ax +; CHECK-NEXT: shldw $4, %si, (%rdi), %ax ; CHECK-NEXT: retq entry: %a = load i16, ptr %ptr @@ -168,7 +168,7 @@ entry: define i32 @shrd32mri8(ptr %ptr, i32 noundef %b) { ; CHECK-LABEL: shrd32mri8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: shrdl $12, %esi, (%rdi), %eax +; CHECK-NEXT: shldl $20, %esi, (%rdi), %eax ; CHECK-NEXT: retq entry: %a = load i32, ptr %ptr @@ -179,7 +179,7 @@ entry: define i64 @shrd64mri8(ptr %ptr, i64 noundef %b) { ; CHECK-LABEL: shrd64mri8: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: shrdq $12, %rsi, (%rdi), %rax +; CHECK-NEXT: shldq $52, %rsi, (%rdi), %rax ; CHECK-NEXT: retq entry: %a = load i64, ptr %ptr @@ -190,15 +190,13 @@ entry: define void @shrd16mrcl_legacy(ptr %ptr, i16 noundef %b, i8 %cl) { ; CHECK-LABEL: shrd16mrcl_legacy: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movzwl (%rdi), %eax ; CHECK-NEXT: andb $15, %dl, %cl -; CHECK-NEXT: shrdw %cl, %ax, %si, %ax -; CHECK-NEXT: movw %ax, (%rdi) +; CHECK-NEXT: shrdw %cl, %si, (%rdi) ; CHECK-NEXT: retq entry: %a = load i16, ptr %ptr %clin = sext i8 %cl to i16 - %shrd = call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 %clin) + %shrd = call i16 @llvm.fshr.i16(i16 %b, i16 %a, i16 %clin) store i16 %shrd, ptr %ptr ret void } @@ -207,15 +205,13 @@ define void @shrd32mrcl_legacy(ptr %ptr, i32 noundef %b, i8 %cl) { ; CHECK-LABEL: shrd32mrcl_legacy: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: movl (%rdi), %eax ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: shrdl %cl, %eax, %esi, %eax -; CHECK-NEXT: movl %eax, (%rdi) +; CHECK-NEXT: shrdl %cl, %esi, (%rdi) ; CHECK-NEXT: retq entry: %a = load i32, ptr %ptr %clin = sext i8 %cl to i32 - %shrd = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 %clin) + %shrd = call i32 @llvm.fshr.i32(i32 %b, i32 %a, i32 %clin) store i32 %shrd, ptr %ptr ret void } @@ -224,15 +220,13 @@ define void @shrd64mrcl_legacy(ptr %ptr, i64 noundef %b, i8 %cl) { ; CHECK-LABEL: shrd64mrcl_legacy: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: movq (%rdi), %rax ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: shrdq %cl, %rax, %rsi, %rax -; CHECK-NEXT: movq %rax, (%rdi) +; CHECK-NEXT: shrdq %cl, %rsi, (%rdi) ; CHECK-NEXT: retq entry: %a = load i64, ptr %ptr %clin = sext i8 %cl to i64 - %shrd = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 %clin) + %shrd = call i64 @llvm.fshr.i64(i64 %b, i64 %a, i64 %clin) store i64 %shrd, ptr %ptr ret void } @@ -240,12 +234,11 @@ entry: define void @shrd16mri8_legacy(ptr %ptr, i16 noundef %b) { ; CHECK-LABEL: shrd16mri8_legacy: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: shrdw $12, %si, (%rdi), %ax -; CHECK-NEXT: movw %ax, (%rdi) +; CHECK-NEXT: shrdw $12, %si, (%rdi) 
; CHECK-NEXT: retq entry: %a = load i16, ptr %ptr - %shrd = call i16 @llvm.fshr.i16(i16 %a, i16 %b, i16 12) + %shrd = call i16 @llvm.fshr.i16(i16 %b, i16 %a, i16 12) store i16 %shrd, ptr %ptr ret void } @@ -253,12 +246,11 @@ entry: define void @shrd32mri8_legacy(ptr %ptr, i32 noundef %b) { ; CHECK-LABEL: shrd32mri8_legacy: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: shrdl $12, %esi, (%rdi), %eax -; CHECK-NEXT: movl %eax, (%rdi) +; CHECK-NEXT: shrdl $12, %esi, (%rdi) ; CHECK-NEXT: retq entry: %a = load i32, ptr %ptr - %shrd = call i32 @llvm.fshr.i32(i32 %a, i32 %b, i32 12) + %shrd = call i32 @llvm.fshr.i32(i32 %b, i32 %a, i32 12) store i32 %shrd, ptr %ptr ret void } @@ -266,12 +258,11 @@ entry: define void @shrd64mri8_legacy(ptr %ptr, i64 noundef %b) { ; CHECK-LABEL: shrd64mri8_legacy: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: shrdq $12, %rsi, (%rdi), %rax -; CHECK-NEXT: movq %rax, (%rdi) +; CHECK-NEXT: shrdq $12, %rsi, (%rdi) ; CHECK-NEXT: retq entry: %a = load i64, ptr %ptr - %shrd = call i64 @llvm.fshr.i64(i64 %a, i64 %b, i64 12) + %shrd = call i64 @llvm.fshr.i64(i64 %b, i64 %a, i64 12) store i64 %shrd, ptr %ptr ret void } From ccf1e322bd5828c74ac5213dc2e05ab506da25bd Mon Sep 17 00:00:00 2001 From: martinboehme Date: Tue, 23 Jan 2024 10:19:06 +0100 Subject: [PATCH 575/843] [clang][dataflow] Process terminator condition within `transferCFGBlock()`. (#78127) In particular, it's important that we create the "fallback" atomic at this point (which we produce if the transfer function didn't produce a value for the expression) so that it is placed in the correct environment. Previously, we processed the terminator condition in the `TerminatorVisitor`, which put the fallback atomic in a copy of the environment that is produced as input for the _successor_ block, rather than the environment for the block containing the expression for which we produce the fallback atomic. As a result, we produce different fallback atomics every time we process the successor block, and hence we don't have a consistent representation of the terminator condition in the flow condition. This patch includes a test (authored by ymand@) that fails without the fix. --- .../clang/Analysis/FlowSensitive/Transfer.h | 13 +++- clang/lib/Analysis/FlowSensitive/Transfer.cpp | 2 + .../TypeErasedDataflowAnalysis.cpp | 64 ++++++++++++------- .../Analysis/FlowSensitive/LoggerTest.cpp | 1 + .../FlowSensitive/SignAnalysisTest.cpp | 27 ++++++++ .../Analysis/FlowSensitive/TransferTest.cpp | 31 +++++++++ 6 files changed, 112 insertions(+), 26 deletions(-) diff --git a/clang/include/clang/Analysis/FlowSensitive/Transfer.h b/clang/include/clang/Analysis/FlowSensitive/Transfer.h index 58bb77c4905c5..7713df747cb76 100644 --- a/clang/include/clang/Analysis/FlowSensitive/Transfer.h +++ b/clang/include/clang/Analysis/FlowSensitive/Transfer.h @@ -25,10 +25,17 @@ namespace dataflow { /// Maps statements to the environments of basic blocks that contain them. class StmtToEnvMap { public: + // `CurBlockID` is the ID of the block currently being processed, and + // `CurState` is the pending state currently associated with this block. These + // are supplied separately as the pending state for the current block may not + // yet be represented in `BlockToState`. 
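+  // For example, while transferCFGBlock() is processing block B, a lookup for
+  // a statement inside B itself must read `CurState`, since `BlockToState[B]`
+  // has not been updated yet.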
StmtToEnvMap(const ControlFlowContext &CFCtx, llvm::ArrayRef> - BlockToState) - : CFCtx(CFCtx), BlockToState(BlockToState) {} + BlockToState, + unsigned CurBlockID, + const TypeErasedDataflowAnalysisState &CurState) + : CFCtx(CFCtx), BlockToState(BlockToState), CurBlockID(CurBlockID), + CurState(CurState) {} /// Returns the environment of the basic block that contains `S`. /// The result is guaranteed never to be null. @@ -37,6 +44,8 @@ class StmtToEnvMap { private: const ControlFlowContext &CFCtx; llvm::ArrayRef> BlockToState; + unsigned CurBlockID; + const TypeErasedDataflowAnalysisState &CurState; }; /// Evaluates `S` and updates `Env` accordingly. diff --git a/clang/lib/Analysis/FlowSensitive/Transfer.cpp b/clang/lib/Analysis/FlowSensitive/Transfer.cpp index 55093c2e2cdaf..2271a75fbcaf7 100644 --- a/clang/lib/Analysis/FlowSensitive/Transfer.cpp +++ b/clang/lib/Analysis/FlowSensitive/Transfer.cpp @@ -42,6 +42,8 @@ const Environment *StmtToEnvMap::getEnvironment(const Stmt &S) const { assert(BlockIt != CFCtx.getStmtToBlock().end()); if (!CFCtx.isBlockReachable(*BlockIt->getSecond())) return nullptr; + if (BlockIt->getSecond()->getBlockID() == CurBlockID) + return &CurState.Env; const auto &State = BlockToState[BlockIt->getSecond()->getBlockID()]; if (!(State)) return nullptr; diff --git a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp index f1899590f4125..49e425bde66aa 100644 --- a/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp +++ b/clang/lib/Analysis/FlowSensitive/TypeErasedDataflowAnalysis.cpp @@ -75,9 +75,8 @@ using TerminatorVisitorRetTy = std::pair; class TerminatorVisitor : public ConstStmtVisitor { public: - TerminatorVisitor(const StmtToEnvMap &StmtToEnv, Environment &Env, - int BlockSuccIdx) - : StmtToEnv(StmtToEnv), Env(Env), BlockSuccIdx(BlockSuccIdx) {} + TerminatorVisitor(Environment &Env, int BlockSuccIdx) + : Env(Env), BlockSuccIdx(BlockSuccIdx) {} TerminatorVisitorRetTy VisitIfStmt(const IfStmt *S) { auto *Cond = S->getCond(); @@ -126,19 +125,12 @@ class TerminatorVisitor private: TerminatorVisitorRetTy extendFlowCondition(const Expr &Cond) { - // The terminator sub-expression might not be evaluated. - if (Env.getValue(Cond) == nullptr) - transfer(StmtToEnv, Cond, Env); - auto *Val = Env.get(Cond); - // Value merging depends on flow conditions from different environments - // being mutually exclusive -- that is, they cannot both be true in their - // entirety (even if they may share some clauses). So, we need *some* value - // for the condition expression, even if just an atom. - if (Val == nullptr) { - Val = &Env.makeAtomicBoolValue(); - Env.setValue(Cond, *Val); - } + // In transferCFGBlock(), we ensure that we always have a `Value` for the + // terminator condition, so assert this. + // We consciously assert ourselves instead of asserting via `cast()` so + // that we get a more meaningful line number if the assertion fails. + assert(Val != nullptr); bool ConditionValue = true; // The condition must be inverted for the successor that encompasses the @@ -152,7 +144,6 @@ class TerminatorVisitor return {&Cond, ConditionValue}; } - const StmtToEnvMap &StmtToEnv; Environment &Env; int BlockSuccIdx; }; @@ -335,10 +326,8 @@ computeBlockInputState(const CFGBlock &Block, AnalysisContext &AC) { // when the terminator is taken. Copy now. 
TypeErasedDataflowAnalysisState Copy = MaybePredState->fork(); - const StmtToEnvMap StmtToEnv(AC.CFCtx, AC.BlockStates); auto [Cond, CondValue] = - TerminatorVisitor(StmtToEnv, Copy.Env, - blockIndexInPredecessor(*Pred, Block)) + TerminatorVisitor(Copy.Env, blockIndexInPredecessor(*Pred, Block)) .Visit(PredTerminatorStmt); if (Cond != nullptr) // FIXME: Call transferBranchTypeErased even if BuiltinTransferOpts @@ -356,12 +345,13 @@ computeBlockInputState(const CFGBlock &Block, AnalysisContext &AC) { /// Built-in transfer function for `CFGStmt`. static void -builtinTransferStatement(const CFGStmt &Elt, +builtinTransferStatement(unsigned CurBlockID, const CFGStmt &Elt, TypeErasedDataflowAnalysisState &InputState, AnalysisContext &AC) { const Stmt *S = Elt.getStmt(); assert(S != nullptr); - transfer(StmtToEnvMap(AC.CFCtx, AC.BlockStates), *S, InputState.Env); + transfer(StmtToEnvMap(AC.CFCtx, AC.BlockStates, CurBlockID, InputState), *S, + InputState.Env); } /// Built-in transfer function for `CFGInitializer`. @@ -428,12 +418,12 @@ builtinTransferInitializer(const CFGInitializer &Elt, } } -static void builtinTransfer(const CFGElement &Elt, +static void builtinTransfer(unsigned CurBlockID, const CFGElement &Elt, TypeErasedDataflowAnalysisState &State, AnalysisContext &AC) { switch (Elt.getKind()) { case CFGElement::Statement: - builtinTransferStatement(Elt.castAs(), State, AC); + builtinTransferStatement(CurBlockID, Elt.castAs(), State, AC); break; case CFGElement::Initializer: builtinTransferInitializer(Elt.castAs(), State); @@ -477,7 +467,7 @@ transferCFGBlock(const CFGBlock &Block, AnalysisContext &AC, AC.Log.enterElement(Element); // Built-in analysis if (AC.Analysis.builtinOptions()) { - builtinTransfer(Element, State, AC); + builtinTransfer(Block.getBlockID(), Element, State, AC); } // User-provided analysis @@ -489,6 +479,32 @@ transferCFGBlock(const CFGBlock &Block, AnalysisContext &AC, } AC.Log.recordState(State); } + + // If we have a terminator, evaluate its condition. + // This `Expr` may not appear as a `CFGElement` anywhere else, and it's + // important that we evaluate it here (rather than while processing the + // terminator) so that we put the corresponding value in the right + // environment. + if (const Expr *TerminatorCond = + dyn_cast_or_null(Block.getTerminatorCondition())) { + if (State.Env.getValue(*TerminatorCond) == nullptr) + // FIXME: This only runs the builtin transfer, not the analysis-specific + // transfer. Fixing this isn't trivial, as the analysis-specific transfer + // takes a `CFGElement` as input, but some expressions only show up as a + // terminator condition, but not as a `CFGElement`. The condition of an if + // statement is one such example. + transfer( + StmtToEnvMap(AC.CFCtx, AC.BlockStates, Block.getBlockID(), State), + *TerminatorCond, State.Env); + + // If the transfer function didn't produce a value, create an atom so that + // we have *some* value for the condition expression. This ensures that + // when we extend the flow condition, it actually changes. 
+ if (State.Env.getValue(*TerminatorCond) == nullptr) + State.Env.setValue(*TerminatorCond, State.Env.makeAtomicBoolValue()); + AC.Log.recordState(State); + } + return State; } diff --git a/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp b/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp index a60dbe1f61f6d..c5594aa3fe9d1 100644 --- a/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/LoggerTest.cpp @@ -123,6 +123,7 @@ recordState(Elements=1, Branches=0, Joins=0) enterElement(b (ImplicitCastExpr, LValueToRValue, _Bool)) transfer() recordState(Elements=2, Branches=0, Joins=0) +recordState(Elements=2, Branches=0, Joins=0) enterBlock(3, false) transferBranch(0) diff --git a/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp b/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp index b5fc7bbc431ea..a6f4c45504fa6 100644 --- a/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/SignAnalysisTest.cpp @@ -895,6 +895,33 @@ TEST(SignAnalysisTest, BinaryEQ) { LangStandard::lang_cxx17); } +TEST(SignAnalysisTest, ComplexLoopCondition) { + std::string Code = R"( + int foo(); + void fun() { + int a, b; + while ((a = foo()) > 0 && (b = foo()) > 0) { + a; + b; + // [[p]] + } + } + )"; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + + const ValueDecl *A = findValueDecl(ASTCtx, "a"); + const ValueDecl *B = findValueDecl(ASTCtx, "b"); + + EXPECT_TRUE(isPositive(A, ASTCtx, Env)); + EXPECT_TRUE(isPositive(B, ASTCtx, Env)); + }, + LangStandard::lang_cxx17); +} + TEST(SignAnalysisTest, JoinToTop) { std::string Code = R"( int foo(); diff --git a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp index c777aca78992d..268ea4c7431f6 100644 --- a/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp +++ b/clang/unittests/Analysis/FlowSensitive/TransferTest.cpp @@ -6459,4 +6459,35 @@ TEST(TransferTest, DifferentReferenceLocInJoin) { }); } +// This test verifies correct modeling of a relational dependency that goes +// through unmodeled functions (the simple `cond()` in this case). +TEST(TransferTest, ConditionalRelation) { + std::string Code = R"( + bool cond(); + void target() { + bool a = true; + bool b = true; + if (cond()) { + a = false; + if (cond()) { + b = false; + } + } + (void)0; + // [[p]] + } + )"; + runDataflow( + Code, + [](const llvm::StringMap> &Results, + ASTContext &ASTCtx) { + const Environment &Env = getEnvironmentAtAnnotation(Results, "p"); + auto &A = Env.arena(); + auto &VarA = getValueForDecl(ASTCtx, Env, "a").formula(); + auto &VarB = getValueForDecl(ASTCtx, Env, "b").formula(); + + EXPECT_FALSE(Env.allows(A.makeAnd(VarA, A.makeNot(VarB)))); + }); +} + } // namespace From 60e1c835d326a74ef67bf4b9399d439262d0380e Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 23 Jan 2024 09:37:27 +0000 Subject: [PATCH 576/843] [RemoveDIs][DebugInfo] Update SROA to handle DPVAssigns (#78475) SROA needs to update llvm.dbg.assign intrinsics when it migrates debug info in response to alloca splitting; this patch updates the debug info migration code to handle DPVAssigns as well, making use of generic code to avoid duplication as much as possible. 
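As a rough illustration of that generic-code pattern (a standalone sketch
with hypothetical stand-in types, not code from this patch): a single
generic lambda is applied to both marker ranges, while ordinary overloads
absorb the per-type differences, much like MigrateDbgAssign and
createLinkedAssign below.

  #include <algorithm>
  #include <cstdio>
  #include <vector>

  struct IntrinsicMarker { int ID; };  // stands in for DbgAssignIntrinsic
  struct RecordMarker { int ID; };     // stands in for DPValue

  // Overloads absorb the per-type differences.
  void migrateOne(IntrinsicMarker *M) { std::printf("intrinsic %d\n", M->ID); }
  void migrateOne(RecordMarker *M) { std::printf("record %d\n", M->ID); }

  int main() {
    IntrinsicMarker I0{0}, I1{1};
    RecordMarker R0{2};
    std::vector<IntrinsicMarker *> Intrinsics{&I0, &I1};
    std::vector<RecordMarker *> Records{&R0};

    // One generic lambda handles both kinds of marker; the compiler
    // instantiates its body once per pointee type.
    auto Migrate = [](auto *Marker) { migrateOne(Marker); };
    std::for_each(Intrinsics.begin(), Intrinsics.end(), Migrate);
    std::for_each(Records.begin(), Records.end(), Migrate);
  }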
--- llvm/include/llvm/IR/DebugInfo.h | 6 ++ llvm/lib/Transforms/Scalar/SROA.cpp | 88 ++++++++++++++----- .../instcombine/remove-redundant-dbg.ll | 2 + .../sroa/after-inlining.ll | 1 + .../sroa/alloca-single-slice.ll | 2 + .../assignment-tracking/sroa/arglist.ll | 1 + .../assignment-tracking/sroa/complex.ll | 2 + .../assignment-tracking/sroa/fail-fragment.ll | 2 + .../assignment-tracking/sroa/frag-2.ll | 1 + .../Generic/assignment-tracking/sroa/frag.ll | 1 + .../Generic/assignment-tracking/sroa/id.ll | 1 + .../assignment-tracking/sroa/memcpy.ll | 2 + .../sroa/memmove-to-from-same-alloca.ll | 2 + .../sroa/remove-redundant-dbg.ll | 2 + .../assignment-tracking/sroa/rewrite.ll | 2 + .../sroa/split-pre-fragmented-store-2.ll | 2 + .../sroa/split-pre-fragmented-store.ll | 2 + .../Generic/assignment-tracking/sroa/store.ll | 2 + .../sroa/unspecified-var-size.ll | 1 + .../assignment-tracking/sroa/user-memcpy.ll | 2 + .../sroa/var-sized-fragment.ll | 1 + .../Generic/assignment-tracking/sroa/vec-1.ll | 1 + .../Generic/assignment-tracking/sroa/vec-2.ll | 1 + 23 files changed, 107 insertions(+), 20 deletions(-) diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h index 8f9f2594541a0..006fd02a17bfd 100644 --- a/llvm/include/llvm/IR/DebugInfo.h +++ b/llvm/include/llvm/IR/DebugInfo.h @@ -236,6 +236,12 @@ inline SmallVector getDPVAssignmentMarkers(const Instruction *Inst) { return {}; } +inline SmallVector getDPVAssignmentMarkers(const Instruction *Inst) { + if (auto *ID = Inst->getMetadata(LLVMContext::MD_DIAssignID)) + return cast(ID)->getAllDPValueUsers(); + return {}; +} + /// Delete the llvm.dbg.assign intrinsics linked to \p Inst. void deleteAssignmentMarkers(const Instruction *Inst); diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index adf87ebb69f54..551a37b132445 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -319,6 +319,29 @@ static DebugVariable getAggregateVariable(DbgVariableIntrinsic *DVI) { return DebugVariable(DVI->getVariable(), std::nullopt, DVI->getDebugLoc().getInlinedAt()); } +static DebugVariable getAggregateVariable(DPValue *DPV) { + return DebugVariable(DPV->getVariable(), std::nullopt, + DPV->getDebugLoc().getInlinedAt()); +} + +static DPValue *createLinkedAssign(DPValue *, DIBuilder &DIB, + Instruction *LinkedInstr, Value *NewValue, + DILocalVariable *Variable, + DIExpression *Expression, Value *Address, + DIExpression *AddressExpression, + const DILocation *DI) { + (void)DIB; + return DPValue::createLinkedDPVAssign(LinkedInstr, NewValue, Variable, + Expression, Address, AddressExpression, + DI); +} +static DbgAssignIntrinsic *createLinkedAssign( + DbgAssignIntrinsic *, DIBuilder &DIB, Instruction *LinkedInstr, + Value *NewValue, DILocalVariable *Variable, DIExpression *Expression, + Value *Address, DIExpression *AddressExpression, const DILocation *DI) { + return DIB.insertDbgAssign(LinkedInstr, NewValue, Variable, Expression, + Address, AddressExpression, DI); +} /// Find linked dbg.assign and generate a new one with the correct /// FragmentInfo. Link Inst to the new dbg.assign. If Value is nullptr the @@ -340,8 +363,9 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, Instruction *Inst, Value *Dest, Value *Value, const DataLayout &DL) { auto MarkerRange = at::getAssignmentMarkers(OldInst); + auto DPVAssignMarkerRange = at::getDPVAssignmentMarkers(OldInst); // Nothing to do if OldInst has no linked dbg.assign intrinsics. 
- if (MarkerRange.empty()) + if (MarkerRange.empty() && DPVAssignMarkerRange.empty()) return; LLVM_DEBUG(dbgs() << " migrateDebugInfo\n"); @@ -362,6 +386,9 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, for (auto *DAI : at::getAssignmentMarkers(OldAlloca)) BaseFragments[getAggregateVariable(DAI)] = DAI->getExpression()->getFragmentInfo(); + for (auto *DPV : at::getDPVAssignmentMarkers(OldAlloca)) + BaseFragments[getAggregateVariable(DPV)] = + DPV->getExpression()->getFragmentInfo(); // The new inst needs a DIAssignID unique metadata tag (if OldInst has // one). It shouldn't already have one: assert this assumption. @@ -371,7 +398,7 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, DIBuilder DIB(*OldInst->getModule(), /*AllowUnresolved*/ false); assert(OldAlloca->isStaticAlloca()); - for (DbgAssignIntrinsic *DbgAssign : MarkerRange) { + auto MigrateDbgAssign = [&](auto DbgAssign) { LLVM_DEBUG(dbgs() << " existing dbg.assign is: " << *DbgAssign << "\n"); auto *Expr = DbgAssign->getExpression(); @@ -382,7 +409,7 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, { auto R = BaseFragments.find(getAggregateVariable(DbgAssign)); if (R == BaseFragments.end()) - continue; + return; BaseFragment = R->second; } std::optional CurrentFragment = @@ -393,7 +420,7 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, BaseFragment, CurrentFragment, NewFragment); if (Result == Skip) - continue; + return; if (Result == UseFrag && !(NewFragment == CurrentFragment)) { if (CurrentFragment) { // Rewrite NewFragment to be relative to the existing one (this is @@ -425,9 +452,10 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, } ::Value *NewValue = Value ? Value : DbgAssign->getValue(); - auto *NewAssign = DIB.insertDbgAssign( - Inst, NewValue, DbgAssign->getVariable(), Expr, Dest, - DIExpression::get(Ctx, std::nullopt), DbgAssign->getDebugLoc()); + auto *NewAssign = createLinkedAssign( + DbgAssign, DIB, Inst, NewValue, DbgAssign->getVariable(), Expr, Dest, + DIExpression::get(Expr->getContext(), std::nullopt), + DbgAssign->getDebugLoc()); // If we've updated the value but the original dbg.assign has an arglist // then kill it now - we can't use the requested new value. @@ -461,9 +489,11 @@ static void migrateDebugInfo(AllocaInst *OldAlloca, bool IsSplit, NewAssign->moveBefore(DbgAssign); NewAssign->setDebugLoc(DbgAssign->getDebugLoc()); - LLVM_DEBUG(dbgs() << "Created new assign intrinsic: " << *NewAssign - << "\n"); - } + LLVM_DEBUG(dbgs() << "Created new assign: " << *NewAssign << "\n"); + }; + + for_each(MarkerRange, MigrateDbgAssign); + for_each(DPVAssignMarkerRange, MigrateDbgAssign); } namespace { @@ -3108,6 +3138,7 @@ class AllocaSliceRewriter : public InstVisitor { // emit dbg.assign intrinsics for mem intrinsics storing through non- // constant geps, or storing a variable number of bytes. assert(at::getAssignmentMarkers(&II).empty() && + at::getDPVAssignmentMarkers(&II).empty() && "AT: Unexpected link to non-const GEP"); deleteIfTriviallyDead(OldPtr); return false; @@ -3254,11 +3285,13 @@ class AllocaSliceRewriter : public InstVisitor { Value *AdjustedPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType()); if (IsDest) { // Update the address component of linked dbg.assigns. 
- for (auto *DAI : at::getAssignmentMarkers(&II)) { - if (llvm::is_contained(DAI->location_ops(), II.getDest()) || - DAI->getAddress() == II.getDest()) - DAI->replaceVariableLocationOp(II.getDest(), AdjustedPtr); - } + auto UpdateAssignAddress = [&](auto *DbgAssign) { + if (llvm::is_contained(DbgAssign->location_ops(), II.getDest()) || + DbgAssign->getAddress() == II.getDest()) + DbgAssign->replaceVariableLocationOp(II.getDest(), AdjustedPtr); + }; + for_each(at::getAssignmentMarkers(&II), UpdateAssignAddress); + for_each(at::getDPVAssignmentMarkers(&II), UpdateAssignAddress); II.setDest(AdjustedPtr); II.setDestAlignment(SliceAlign); } else { @@ -3842,6 +3875,7 @@ class AggLoadStoreRewriter : public InstVisitor { DL); } else { assert(at::getAssignmentMarkers(Store).empty() && + at::getDPVAssignmentMarkers(Store).empty() && "AT: unexpected debug.assign linked to store through " "unbounded GEP"); } @@ -4861,10 +4895,22 @@ static void insertNewDbgInst(DIBuilder &DIB, DPValue *Orig, AllocaInst *NewAddr, DIExpression *NewFragmentExpr, Instruction *BeforeInst) { (void)DIB; - DPValue *New = new DPValue(ValueAsMetadata::get(NewAddr), Orig->getVariable(), - NewFragmentExpr, Orig->getDebugLoc(), - DPValue::LocationType::Declare); - BeforeInst->getParent()->insertDPValueBefore(New, BeforeInst->getIterator()); + if (Orig->isDbgDeclare()) { + DPValue *DPV = DPValue::createDPVDeclare( + NewAddr, Orig->getVariable(), NewFragmentExpr, Orig->getDebugLoc()); + BeforeInst->getParent()->insertDPValueBefore(DPV, + BeforeInst->getIterator()); + return; + } + if (!NewAddr->hasMetadata(LLVMContext::MD_DIAssignID)) { + NewAddr->setMetadata(LLVMContext::MD_DIAssignID, + DIAssignID::getDistinct(NewAddr->getContext())); + } + auto *NewAssign = DPValue::createLinkedDPVAssign( + NewAddr, Orig->getValue(), Orig->getVariable(), NewFragmentExpr, NewAddr, + Orig->getAddressExpression(), Orig->getDebugLoc()); + LLVM_DEBUG(dbgs() << "Created new DPVAssign: " << *NewAssign << "\n"); + (void)NewAssign; } /// Walks the slices of an alloca and form partitions based on them, @@ -5042,6 +5088,7 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) { for_each(findDbgDeclares(&AI), MigrateOne); for_each(findDPVDeclares(&AI), MigrateOne); for_each(at::getAssignmentMarkers(&AI), MigrateOne); + for_each(at::getDPVAssignmentMarkers(&AI), MigrateOne); return Changed; } @@ -5262,8 +5309,9 @@ std::pair SROA::runSROA(Function &F) { "Should not have modified the CFG when told to preserve it."); if (Changed && isAssignmentTrackingEnabled(*F.getParent())) { - for (auto &BB : F) + for (auto &BB : F) { RemoveRedundantDbgInstrs(&BB); + } } return {Changed, CFGChanged}; diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll index 11895098179eb..cffac06f8e545 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that sroa removes redundant debug intrinsics after it makes a ;; change. 
This has a significant positive impact on peak memory and compiler
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/after-inlining.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/after-inlining.ll
index 16acdb4453126..e386871699281 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/after-inlining.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/after-inlining.ll
@@ -1,4 +1,5 @@
 ; RUN: opt %s -S -passes=sroa -o - | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=sroa -o - | FileCheck %s
 
 ;; Check that SROA preserves the InlinedAt status of new dbg.assign intrinsics
 ;; it inserts.
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/alloca-single-slice.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/alloca-single-slice.ll
index af2c565986f50..d99bde32aa8ce 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/alloca-single-slice.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/alloca-single-slice.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -passes=sroa,verify -S %s -o - \
 ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg"
+; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa,verify -S %s -o - \
+; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg"
 
 ; Check that single sliced allocas retain their assignment tracking debug info.
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/arglist.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/arglist.ll
index 8a76081094086..83f257e77b0be 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/arglist.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/arglist.ll
@@ -1,4 +1,5 @@
 ; RUN: opt -passes=sroa -S %s -o - | FileCheck %s
+; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - | FileCheck %s
 
 ;; Check that a dbg.assign for a promoted variable becomes a kill location if
 ;; it used an arglist.
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/complex.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/complex.ll
index cc733a0dbbc1e..ed4c6fa947ce0 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/complex.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/complex.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -passes=sroa -S -o - %s \
 ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg"
+; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S -o - %s \
+; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg"
 ;
 ;; Based on llvm/test/DebugInfo/ARM/sroa-complex.ll
 ;; generated from:
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/fail-fragment.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/fail-fragment.ll
index d9ded4839a982..234504d14442e 100644
--- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/fail-fragment.ll
+++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/fail-fragment.ll
@@ -1,5 +1,7 @@
 ; RUN: opt -passes=sroa -S %s -o - \
 ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg"
+; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \
+; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg"
 
 ;; Check that a dbg.assign for a promoted variable becomes a kill location if
 ;; it used a fragment that can't be split (the first check directive below).
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/frag-2.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/frag-2.ll index a6dba664822c7..5016bd4dd1f03 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/frag-2.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/frag-2.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=sroa -S %s -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - | FileCheck %s ;; $ cat test.cpp ;; class a { diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/frag.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/frag.ll index 38ebed1cf0b91..c8fb65b76de58 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/frag.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/frag.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -S -passes=sroa -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=sroa -o - | FileCheck %s ;; $ cat test.cpp ;; class c { diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/id.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/id.ll index 13c05081ffc52..ae185ebaf9591 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/id.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/id.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=sroa -S %s -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - | FileCheck %s ;; Check that multiple dbg.assign intrinsics linked to a store that is getting ;; split (or at least that is touched by SROA, causing a replacement store to diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/memcpy.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/memcpy.ll index 22fd7eb86273d..709f33125c87b 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/memcpy.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/memcpy.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=sroa,verify -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa,verify -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that the new slices of an alloca and memcpy intructions get dbg.assign ;; intrinsics with the correct fragment info. 
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/memmove-to-from-same-alloca.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/memmove-to-from-same-alloca.ll index 3e744eaf985cf..07b29ab795b72 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/memmove-to-from-same-alloca.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/memmove-to-from-same-alloca.ll @@ -1,5 +1,7 @@ ; RUN: opt %s -passes=sroa -o - -S \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -passes=sroa -o - -S \ +; RUN: | FileCheck %s ;; Generated from this C++ source: ;; __attribute__((nodebug)) struct Blob {int P[6];} Glob; diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll index 11895098179eb..cffac06f8e545 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that sroa removes redundant debug intrinsics after it makes a ;; change. This has a significant positive impact on peak memory and compiler diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/rewrite.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/rewrite.ll index f7ba8026df463..15fbfa04f7b0c 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/rewrite.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/rewrite.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=sroa,verify -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa,verify -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ; Check that the new slices of an alloca and memset intructions get dbg.assign ; intrinsics with the correct fragment info. 
Ensure that only the diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/split-pre-fragmented-store-2.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/split-pre-fragmented-store-2.ll index a7ec437989e16..1f126f6aa6b4f 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/split-pre-fragmented-store-2.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/split-pre-fragmented-store-2.ll @@ -1,5 +1,7 @@ ; RUN: opt -S -passes=sroa -sroa-skip-mem2reg %s \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -S -passes=sroa -sroa-skip-mem2reg %s \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; NOTE: This is the same as split-pre-fragmented-store.ll except the base ;; alloca's dbg.assign has been altered to contain a fragment of the full diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/split-pre-fragmented-store.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/split-pre-fragmented-store.ll index 4bf78aa441a36..458c3298b813d 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/split-pre-fragmented-store.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/split-pre-fragmented-store.ll @@ -1,5 +1,7 @@ ; RUN: opt -S -passes=sroa -sroa-skip-mem2reg %s \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -S -passes=sroa -sroa-skip-mem2reg %s \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; IR hand-modified, originally generated from: ;; struct Pair { int a; int b; }; diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/store.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/store.ll index 72d6ab234dcbe..7601bdd21ce85 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/store.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/store.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=sroa,verify -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa,verify -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ; Check that the new slices of an alloca and memset intructions get dbg.assign ; intrinsics with the correct fragment info. 
Ensure that only the diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/unspecified-var-size.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/unspecified-var-size.ll index c04e5a7a2146a..2a94622bbe553 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/unspecified-var-size.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/unspecified-var-size.ll @@ -1,4 +1,5 @@ ; RUN: opt -S %s -passes=sroa -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S %s -passes=sroa -o - | FileCheck %s ;; $ cat test.cpp ;; #include diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll index abc110273ab3b..23d0ff3461046 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that the fragments generated in SROA for a split alloca that has a ;; dbg.assign with non-zero-offset fragment are correct. diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll index 59406b30d735a..97bab2febcae9 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/var-sized-fragment.ll @@ -1,4 +1,5 @@ ; RUN: opt -S %s -o - -passes=sroa | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S %s -o - -passes=sroa | FileCheck %s ;; SROA splits the alloca into two. Each slice already has a 32-bit variable ;; associated with it (the structured binding variables). Check SROA doesn't diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/vec-1.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/vec-1.ll index f09e88144e835..69b884431a357 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/vec-1.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/vec-1.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -S -passes=sroa -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=sroa -o - | FileCheck %s ;; Ensure that only the value-expression gets fragment info; that the ;; address-expression remains untouched. diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/vec-2.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/vec-2.ll index 107a1e0a91bc6..3281c107317de 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/vec-2.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/vec-2.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -S -passes=sroa -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=sroa -o - | FileCheck %s ;; $ cat test.cpp ;; class a { From 47bcc91e06798e8730983b3848cc152a79c7afae Mon Sep 17 00:00:00 2001 From: harishch4 Date: Tue, 23 Jan 2024 15:16:49 +0530 Subject: [PATCH 577/843] [Flang][OpenMP] Fix to variables not inheriting data sharing attributes (#79017) When a default(none) clause exists and a threadprivate variable is used inside the construct, the variable does not inherit threadprivate behavior and throws the below error. 
> error: The DEFAULT(NONE) clause requires that 'a' must be listed in a data-sharing attribute clause Added a condition to skip the error if it is a threadprivate variable. Fixes: https://github.com/llvm/llvm-project/issues/49545 --- flang/lib/Semantics/resolve-directives.cpp | 1 + .../OpenMP/threadprivate-default-clause.f90 | 60 +++++++++++++++++++ 2 files changed, 61 insertions(+) create mode 100644 flang/test/Lower/OpenMP/threadprivate-default-clause.f90 diff --git a/flang/lib/Semantics/resolve-directives.cpp b/flang/lib/Semantics/resolve-directives.cpp index 2c570bc3abeb2..e19f68eefa286 100644 --- a/flang/lib/Semantics/resolve-directives.cpp +++ b/flang/lib/Semantics/resolve-directives.cpp @@ -1954,6 +1954,7 @@ void OmpAttributeVisitor::Post(const parser::Name &name) { if (symbol != found) { name.symbol = found; // adjust the symbol within region } else if (GetContext().defaultDSA == Symbol::Flag::OmpNone && + !symbol->test(Symbol::Flag::OmpThreadprivate) && // Exclude indices of sequential loops that are privatised in // the scope of the parallel region, and not in this scope. // TODO: check whether this should be caught in IsObjectWithDSA diff --git a/flang/test/Lower/OpenMP/threadprivate-default-clause.f90 b/flang/test/Lower/OpenMP/threadprivate-default-clause.f90 new file mode 100644 index 0000000000000..be07aeca2d5c3 --- /dev/null +++ b/flang/test/Lower/OpenMP/threadprivate-default-clause.f90 @@ -0,0 +1,60 @@ +! Simple test for lowering of OpenMP Threadprivate Directive with HLFIR. + +!RUN: %flang_fc1 -emit-hlfir -fopenmp %s -o - | FileCheck %s + +!CHECK-LABEL: func.func @_QPsub1() { +!CHECK: %[[A:.*]] = fir.address_of(@_QFsub1Ea) : !fir.ref +!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A]] {uniq_name = "_QFsub1Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[A_TP0:.*]] = omp.threadprivate %[[A_DECL]]#1 : !fir.ref -> !fir.ref +!CHECK: %[[A_CVT:.*]]:2 = hlfir.declare %[[A_TP0]] {uniq_name = "_QFsub1Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: omp.parallel { +!CHECK: %[[A_TP:.*]] = omp.threadprivate %[[A_DECL]]#1 : !fir.ref -> !fir.ref +!CHECK: %[[A_TP_DECL:.*]]:2 = hlfir.declare %[[A_TP]] {uniq_name = "_QFsub1Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[TID:.*]] = fir.call @omp_get_thread_num() fastmath : () -> i32 +!CHECK: hlfir.assign %[[TID]] to %[[A_TP_DECL]]#0 : i32, !fir.ref +!CHECK: omp.terminator +!CHECK: } + +subroutine sub1() + use omp_lib + integer, save :: a + !$omp threadprivate(a) + !$omp parallel default(none) + a = omp_get_thread_num() + !$omp end parallel +end subroutine + +!CHECK-LABEL: func.func @_QPsub2() { +!CHECK: %[[BLK:.*]] = fir.address_of(@blk_) : !fir.ref> +!CHECK: %[[BLK_CVT:.*]] = fir.convert %[[BLK]] : (!fir.ref>) -> !fir.ref> +!CHECK: %c0 = arith.constant 0 : index +!CHECK: %[[A_ADDR:.*]] = fir.coordinate_of %[[BLK_CVT]], %c0 : (!fir.ref>, index) -> !fir.ref +!CHECK: %[[A_CVT:.*]] = fir.convert %[[A_ADDR]] : (!fir.ref) -> !fir.ref +!CHECK: %[[A_DECL:.*]]:2 = hlfir.declare %[[A_CVT]] {uniq_name = "_QFsub2Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[A_TP0:.*]] = omp.threadprivate %[[BLK]] : !fir.ref> -> !fir.ref> +!CHECK: %[[A_TP0_CVT:.*]] = fir.convert %[[A_TP0]] : (!fir.ref>) -> !fir.ref> +!CHECK: %c0_0 = arith.constant 0 : index +!CHECK: %[[A_TP0_ADDR:.*]] = fir.coordinate_of %[[A_TP0_CVT]], %c0_0 : (!fir.ref>, index) -> !fir.ref +!CHECK: %[[A_TP0_ADDR_CVT:.*]] = fir.convert %[[A_TP0_ADDR]] : (!fir.ref) -> !fir.ref +!CHECK: %[[A_TP0_DECL:.*]]:2 = hlfir.declare %[[A_TP0_ADDR_CVT]] {uniq_name = "_QFsub2Ea"} : (!fir.ref) -> 
(!fir.ref, !fir.ref) +!CHECK: omp.parallel { +!CHECK: %[[BLK_TP:.*]] = omp.threadprivate %[[BLK]] : !fir.ref> -> !fir.ref> +!CHECK: %[[BLK_TP_CVT:.*]] = fir.convert %[[BLK_TP]] : (!fir.ref>) -> !fir.ref> +!CHECK: %c0_1 = arith.constant 0 : index +!CHECK: %[[A_TP_ADDR:.*]] = fir.coordinate_of %[[BLK_TP_CVT]], %c0_1 : (!fir.ref>, index) -> !fir.ref +!CHECK: %[[A_TP_ADDR_CVT:.*]] = fir.convert %[[A_TP_ADDR]] : (!fir.ref) -> !fir.ref +!CHECK: %[[A_TP_DECL:.*]]:2 = hlfir.declare %[[A_TP_ADDR_CVT]] {uniq_name = "_QFsub2Ea"} : (!fir.ref) -> (!fir.ref, !fir.ref) +!CHECK: %[[TID:.*]] = fir.call @omp_get_thread_num() fastmath : () -> i32 +!CHECK: hlfir.assign %[[TID]] to %[[A_TP_DECL]]#0 : i32, !fir.ref +!CHECK: omp.terminator +!CHECK: } + +subroutine sub2() + use omp_lib + integer :: a + common/blk/a + !$omp threadprivate(/blk/) + !$omp parallel default(none) + a = omp_get_thread_num() + !$omp end parallel +end subroutine From 5a458767dd797cbfa2eeb12b9b8b64f2c63159a5 Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Tue, 23 Jan 2024 09:52:49 +0000 Subject: [PATCH 578/843] [AMDGPU][True16] Support source DPP operands. (#79025) --- .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 7 +- llvm/lib/Target/AMDGPU/BUFInstructions.td | 17 +-- .../MCTargetDesc/AMDGPUMCCodeEmitter.cpp | 43 ++++++- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 113 +++++++++--------- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 24 ++-- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 38 +++--- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s | 64 +++++----- llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s | 24 ++-- .../AMDGPU/gfx11_dasm_vop1_dpp16.txt | 102 ++++++++++------ .../AMDGPU/gfx11_dasm_vop1_dpp8.txt | 35 ++++-- 10 files changed, 277 insertions(+), 190 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 66267c9255f41..489cf85693edb 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -346,7 +346,7 @@ class AMDGPUOperand : public MCParsedAsmOperand { } bool isVRegWithInputMods() const; - bool isT16VRegWithInputMods() const; + template bool isT16VRegWithInputMods() const; bool isSDWAOperand(MVT type) const; bool isSDWAFP16Operand() const; @@ -2054,8 +2054,9 @@ bool AMDGPUOperand::isVRegWithInputMods() const { AsmParser->getFeatureBits()[AMDGPU::FeatureDPALU_DPP]); } -bool AMDGPUOperand::isT16VRegWithInputMods() const { - return isRegClass(AMDGPU::VGPR_32_Lo128RegClassID); +template bool AMDGPUOperand::isT16VRegWithInputMods() const { + return isRegClass(IsFake16 ? 
AMDGPU::VGPR_32_Lo128RegClassID + : AMDGPU::VGPR_16_Lo128RegClassID); } bool AMDGPUOperand::isSDWAOperand(MVT type) const { diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index c3e5be8334a69..87247a134848b 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -485,8 +485,8 @@ class MUBUF_Load_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, - RegisterClass vdata_rc = getVregSrcForVT.ret, - RegisterOperand vdata_op = getLdStVDataRegisterOperand.ret> + RegisterOperand vdata_rc = getVregSrcForVT.ret, + RegisterOperand vdata_op = getLdStVDataRegisterOperand.ret> : MUBUF_Pseudo.ret, @@ -601,7 +601,7 @@ class MUBUF_Store_Pseudo : MUBUF_Pseudo.ret], isTFE, hasGFX12Enc>.ret, + getMUBUFIns.ret.RegClass], isTFE, hasGFX12Enc>.ret, getMUBUFAsmOps.ret, pattern>, MUBUF_SetupAddr { @@ -1569,27 +1569,28 @@ multiclass BufferAtomicCmpSwapPat_Common.ret.RegClass; let AddedComplexity = !if(!eq(RtnMode, "ret"), 0, 1) in { defvar OffsetResDag = (!cast(Inst # "_OFFSET" # InstSuffix) - getVregSrcForVT.ret:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, + data_vt_RC:$vdata_in, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset); def : GCNPat< (vt (Op (MUBUFOffset v4i32:$srsrc, i32:$soffset, i32:$offset), data_vt:$vdata_in)), !if(!eq(RtnMode, "ret"), - (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS OffsetResDag, getVregSrcForVT.ret)), + (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS OffsetResDag, data_vt_RC)), !if(!eq(vt, i32), sub0, sub0_sub1)), OffsetResDag) >; defvar Addr64ResDag = (!cast(Inst # "_ADDR64" # InstSuffix) - getVregSrcForVT.ret:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, + data_vt_RC:$vdata_in, VReg_64:$vaddr, SReg_128:$srsrc, SCSrc_b32:$soffset, offset:$offset); def : GCNPat< (vt (Op (MUBUFAddr64 v4i32:$srsrc, i64:$vaddr, i32:$soffset, i32:$offset), data_vt:$vdata_in)), !if(!eq(RtnMode, "ret"), - (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS Addr64ResDag, getVregSrcForVT.ret)), + (EXTRACT_SUBREG (vt (COPY_TO_REGCLASS Addr64ResDag, data_vt_RC)), !if(!eq(vt, i32), sub0, sub0_sub1)), Addr64ResDag) >; @@ -1820,7 +1821,7 @@ multiclass SIBufferAtomicCmpSwapPat_Common.ret; - defvar DataRC = getVregSrcForVT.ret; + defvar DataRC = getVregSrcForVT.ret.RegClass; defvar SubLo = !if(!eq(vt, i32), sub0, sub0_sub1); defvar SubHi = !if(!eq(vt, i32), sub1, sub2_sub3); diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp index de1abaf29c56b..c3e87244c0c8d 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.cpp @@ -562,7 +562,48 @@ void AMDGPUMCCodeEmitter::getMachineOpValue(const MCInst &MI, void AMDGPUMCCodeEmitter::getMachineOpValueT16( const MCInst &MI, unsigned OpNo, APInt &Op, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const { - llvm_unreachable("TODO: Implement getMachineOpValueT16()."); + const MCOperand &MO = MI.getOperand(OpNo); + if (MO.isReg()) { + unsigned Enc = MRI.getEncodingValue(MO.getReg()); + unsigned Idx = Enc & AMDGPU::HWEncoding::REG_IDX_MASK; + bool IsVGPR = Enc & AMDGPU::HWEncoding::IS_VGPR_OR_AGPR; + Op = Idx | (IsVGPR << 8); + return; + } + getMachineOpValueCommon(MI, MO, OpNo, Op, Fixups, STI); + // VGPRs include the suffix/op_sel bit in the register encoding, but + // immediates and SGPRs include it in src_modifiers. 
Therefore, copy the + // op_sel bit from the src operands into src_modifier operands if Op is + // src_modifiers and the corresponding src is a VGPR + int SrcMOIdx = -1; + assert(OpNo < INT_MAX); + if ((int)OpNo == AMDGPU::getNamedOperandIdx(MI.getOpcode(), + AMDGPU::OpName::src0_modifiers)) { + SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0); + int VDstMOIdx = + AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst); + if (VDstMOIdx != -1) { + auto DstReg = MI.getOperand(VDstMOIdx).getReg(); + if (AMDGPU::isHi(DstReg, MRI)) + Op |= SISrcMods::DST_OP_SEL; + } + } else if ((int)OpNo == AMDGPU::getNamedOperandIdx( + MI.getOpcode(), AMDGPU::OpName::src1_modifiers)) + SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src1); + else if ((int)OpNo == AMDGPU::getNamedOperandIdx( + MI.getOpcode(), AMDGPU::OpName::src2_modifiers)) + SrcMOIdx = AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src2); + if (SrcMOIdx == -1) + return; + + const MCOperand &SrcMO = MI.getOperand(SrcMOIdx); + if (!SrcMO.isReg()) + return; + auto SrcReg = SrcMO.getReg(); + if (AMDGPU::isSGPR(SrcReg, &MRI)) + return; + if (AMDGPU::isHi(SrcReg, MRI)) + Op |= SISrcMods::OP_SEL_0; } void AMDGPUMCCodeEmitter::getMachineOpValueT16Lo128( diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 3aeed6aec3650..a6820544f4b4d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -1223,17 +1223,20 @@ def FPVRegInputModsMatchClass : AsmOperandClass { let PredicateMethod = "isVRegWithInputMods"; } -def FPT16VRegInputModsMatchClass : AsmOperandClass { - let Name = "T16VRegWithFPInputMods"; +class FPT16VRegInputModsMatchClass : AsmOperandClass { + let Name = !if(IsFake16, "Fake16VRegWithFPInputMods", + "T16VRegWithFPInputMods"); let ParserMethod = "parseRegWithFPInputMods"; - let PredicateMethod = "isT16VRegWithInputMods"; + let PredicateMethod = "isT16VRegWithInputMods<" # + !if(IsFake16, "true", "false") # ">"; } def FPVRegInputMods : InputMods { let PrintMethod = "printOperandAndFPInputMods"; } -def FPT16VRegInputMods : InputMods { +class FPT16VRegInputMods + : InputMods > { let PrintMethod = "printOperandAndFPInputMods"; } @@ -1265,13 +1268,16 @@ def IntVRegInputModsMatchClass : AsmOperandClass { let PredicateMethod = "isVRegWithInputMods"; } -def IntT16VRegInputModsMatchClass : AsmOperandClass { - let Name = "T16VRegWithIntInputMods"; +class IntT16VRegInputModsMatchClass : AsmOperandClass { + let Name = !if(IsFake16, "Fake16VRegWithIntInputMods", + "T16VRegWithIntInputMods"); let ParserMethod = "parseRegWithIntInputMods"; - let PredicateMethod = "isT16VRegWithInputMods"; + let PredicateMethod = "isT16VRegWithInputMods<" # + !if(IsFake16, "true", "false") # ">"; } -def IntT16VRegInputMods : InputMods { +class IntT16VRegInputMods + : InputMods > { let PrintMethod = "printOperandAndIntInputMods"; } @@ -1510,25 +1516,17 @@ class getSOPSrcForVT { } // Returns the vreg register class to use for source operand given VT -class getVregSrcForVT { - RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128, - !if(!eq(VT.Size, 96), VReg_96, - !if(!eq(VT.Size, 64), VReg_64, - !if(!eq(VT.Size, 48), VReg_64, - VGPR_32)))); -} - -class getVregSrcForVT_t16 { - RegisterClass ret = !if(!eq(VT.Size, 128), VReg_128, - !if(!eq(VT.Size, 96), VReg_96, - !if(!eq(VT.Size, 64), VReg_64, - !if(!eq(VT.Size, 48), VReg_64, - !if(!eq(VT.Size, 16), - !if(IsFake16, VGPR_32_Lo128, VGPR_16_Lo128), - VGPR_32))))); - - 
RegisterOperand op = !if (!and(!eq(VT.Size, 16), !not(IsFake16)), - VGPRSrc_16_Lo128, RegisterOperand); +class getVregSrcForVT { + RegisterOperand ret = + !if (!eq(VT.Size, 128), RegisterOperand, + !if (!eq(VT.Size, 96), RegisterOperand, + !if (!eq(VT.Size, 64), RegisterOperand, + !if (!eq(VT.Size, 48), RegisterOperand, + !if (!eq(VT.Size, 16), + !if (IsTrue16, + !if (IsFake16, VGPRSrc_32_Lo128, VGPRSrc_16_Lo128), + RegisterOperand), + RegisterOperand))))); } class getSDWASrcForVT { @@ -1635,13 +1633,13 @@ class getSrcModDPP { Operand ret = !if(VT.isFP, FPVRegInputMods, IntVRegInputMods); } -class getSrcModDPP_t16 { +class getSrcModDPP_t16 { Operand ret = !if (VT.isFP, !if (!or(!eq(VT.Value, f16.Value), !eq(VT.Value, bf16.Value)), - FPT16VRegInputMods, FPVRegInputMods), - !if (!eq(VT.Value, i16.Value), IntT16VRegInputMods, - IntVRegInputMods)); + FPT16VRegInputMods, FPVRegInputMods), + !if (!eq(VT.Value, i16.Value), + IntT16VRegInputMods, IntVRegInputMods)); } // Return type of input modifiers operand for specified input operand for DPP @@ -1784,10 +1782,9 @@ class getInsVOP3OpSel .ret; } -class getInsDPPBase { - +class getInsDPPBase { dag ret = !if(!eq(NumSrcArgs, 0), // VOP1 without input operands (V_NOP) (ins ), @@ -1827,8 +1824,8 @@ class getInsDPPBase { dag ret = !con(getInsDPPBase.ret, @@ -1836,17 +1833,17 @@ class getInsDPP { +class getInsDPP16 { dag ret = !con(getInsDPP.ret, (ins FI:$fi)); } -class getInsDPP8 { +class getInsDPP8 { dag ret = !con(getInsDPPBase.ret, (ins dpp8:$dpp8, FI:$fi)); @@ -2251,13 +2248,13 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field RegisterOperand DstRCVOP3DPP = DstRC64; field RegisterOperand DstRCSDWA = getSDWADstForVT.ret; field RegisterOperand Src0RC32 = getVOPSrc0ForVT.ret; - field RegisterOperand Src1RC32 = RegisterOperand.ret>; + field RegisterOperand Src1RC32 = getVregSrcForVT.ret; field RegisterOperand Src0RC64 = getVOP3SrcForVT.ret; field RegisterOperand Src1RC64 = getVOP3SrcForVT.ret; field RegisterOperand Src2RC64 = getVOP3SrcForVT.ret; - field RegisterClass Src0DPP = getVregSrcForVT.ret; - field RegisterClass Src1DPP = getVregSrcForVT.ret; - field RegisterClass Src2DPP = getVregSrcForVT.ret; + field RegisterOperand Src0DPP = getVregSrcForVT.ret; + field RegisterOperand Src1DPP = getVregSrcForVT.ret; + field RegisterOperand Src2DPP = getVregSrcForVT.ret; field RegisterOperand Src0VOP3DPP = VGPRSrc_32; field RegisterOperand Src1VOP3DPP = getVOP3DPPSrcForVT.ret; field RegisterOperand Src2VOP3DPP = getVOP3DPPSrcForVT.ret; @@ -2443,13 +2440,13 @@ class VOPProfile_True16 : VOPProfile { let DstRC = getVALUDstForVT.ret; let DstRC64 = getVALUDstForVT.ret; let Src0RC32 = getVOPSrc0ForVT.ret; - let Src1RC32 = getVregSrcForVT_t16.op; - let Src0DPP = getVregSrcForVT_t16.ret; - let Src1DPP = getVregSrcForVT_t16.ret; - let Src2DPP = getVregSrcForVT_t16.ret; - let Src0ModDPP = getSrcModDPP_t16.ret; - let Src1ModDPP = getSrcModDPP_t16.ret; - let Src2ModDPP = getSrcModDPP_t16.ret; + let Src1RC32 = getVregSrcForVT.ret; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; + let Src0ModDPP = getSrcModDPP_t16.ret; + let Src1ModDPP = getSrcModDPP_t16.ret; + let Src2ModDPP = getSrcModDPP_t16.ret; let DstRC64 = getVALUDstForVT.ret; let Src0RC64 = getVOP3SrcForVT.ret; @@ -2465,10 +2462,10 @@ class VOPProfile_Fake16 : VOPProfile { // Most DstVT are 16-bit, but not all let DstRC = getVALUDstForVT_fake16.ret; let DstRC64 = getVALUDstForVT.ret; - let Src1RC32 = RegisterOperand.ret>; - let Src0DPP = 
getVregSrcForVT_t16.ret; - let Src1DPP = getVregSrcForVT_t16.ret; - let Src2DPP = getVregSrcForVT_t16.ret; + let Src1RC32 = getVregSrcForVT.ret; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; let Src0ModDPP = getSrcModDPP_t16.ret; let Src1ModDPP = getSrcModDPP_t16.ret; let Src2ModDPP = getSrcModDPP_t16.ret; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 9323372690065..27eec64f59a63 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -418,15 +418,11 @@ def VOP_MADMK_F16_t16 : VOP_MADMK { } def VOP_MADMK_F32 : VOP_MADMK ; -class getRegisterOperandForVT { - RegisterOperand ret = RegisterOperand.ret>; -} - // FIXME: Remove src2_modifiers. It isn't used, so is wasting memory // and processing time but it makes it easier to convert to mad. class VOP_MAC : VOPProfile <[vt0, vt1, vt1, vt0]> { let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT.ret:$src2); - let Ins64 = getIns64.ret, 3, + let Ins64 = getIns64.ret, 3, 0, HasModifiers, HasModifiers, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, @@ -489,21 +485,21 @@ def VOP_MAC_F16_t16 : VOP_MAC { let DstRC = VOPDstOperand; let DstRC64 = VOPDstOperand; let Src1RC32 = VGPRSrc_32_Lo128; - let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT_t16.ret:$src2); - let Src0DPP = getVregSrcForVT_t16.ret; - let Src1DPP = getVregSrcForVT_t16.ret; - let Src2DPP = getVregSrcForVT_t16.ret; + let Ins32 = (ins Src0RC32:$src0, Src1RC32:$src1, getVregSrcForVT.ret:$src2); + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; let Src0ModDPP = getSrcModDPP_t16.ret; let Src1ModDPP = getSrcModDPP_t16.ret; let Src2ModDPP = getSrcModDPP_t16.ret; let InsDPP = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, - getVregSrcForVT_t16.ret:$src2, // stub argument + getVregSrcForVT.ret:$src2, // stub argument dpp_ctrl:$dpp_ctrl, row_mask:$row_mask, bank_mask:$bank_mask, bound_ctrl:$bound_ctrl); let InsDPP8 = (ins Src0ModDPP:$src0_modifiers, Src0DPP:$src0, Src1ModDPP:$src1_modifiers, Src1DPP:$src1, - getVregSrcForVT_t16.ret:$src2, // stub argument + getVregSrcForVT.ret:$src2, // stub argument dpp8:$dpp8, FI:$fi); let Src2Mod = FP32InputMods; // dummy unused modifiers let Src2RC64 = VGPRSrc_32; // stub argument @@ -535,7 +531,7 @@ def VOP_DOT_ACC_I32_I32 : VOP_DOT_ACC { let Src0Mod = Int32InputMods; let Src1Mod = Int32InputMods; - let Ins64 = getIns64.ret, + let Ins64 = getIns64.ret, 3 /*NumSrcArgs*/, HasClamp, 1 /*HasModifiers*/, 1 /*HasSrc2Mods*/, HasOMod, Src0Mod, Src1Mod, Src2Mod>.ret; @@ -898,8 +894,8 @@ def LDEXP_F16_VOPProfile : VOPProfile <[f16, f16, f16, untyped]> { } def LDEXP_F16_VOPProfile_True16 : VOPProfile_Fake16 { let Src1RC32 = RegisterOperand; - let Src1DPP = VGPR_32_Lo128; - let Src1ModDPP = IntT16VRegInputMods; + let Src1DPP = RegisterOperand; + let Src1ModDPP = IntT16VRegInputMods; } let isReMaterializable = 1 in { diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index c3aa13a9b3c7d..3ca97f0291e01 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -61,13 +61,13 @@ class VOPC_Profile sched, ValueType vt0, ValueType vt1 = vt let AsmDPP16 = AsmDPP#"$fi"; // VOPC DPP Instructions do not need an old operand let 
TieRegDPP = ""; - let InsDPP = getInsDPP, Src0DPP, Src1DPP, Src2DPP, + let InsDPP = getInsDPP, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, Src2ModDPP, 0/*HasOld*/>.ret; - let InsDPP16 = getInsDPP16, Src0DPP, Src1DPP, Src2DPP, + let InsDPP16 = getInsDPP16, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, Src2ModDPP, 0/*HasOld*/>.ret; - let InsDPP8 = getInsDPP8, Src0DPP, Src1DPP, Src2DPP, + let InsDPP8 = getInsDPP8, Src0DPP, Src1DPP, Src2DPP, NumSrcArgs, HasModifiers, Src0ModDPP, Src1ModDPP, Src2ModDPP, 0/*HasOld*/>.ret; @@ -88,10 +88,10 @@ multiclass VOPC_Profile_t16 sched, ValueType vt0, ValueType def NAME : VOPC_Profile; def _t16 : VOPC_Profile { let IsTrue16 = 1; - let Src1RC32 = RegisterOperand.ret>; - let Src0DPP = getVregSrcForVT_t16.ret; - let Src1DPP = getVregSrcForVT_t16.ret; - let Src2DPP = getVregSrcForVT_t16.ret; + let Src1RC32 = getVregSrcForVT.ret; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; let Src0ModDPP = getSrcModDPP_t16.ret; let Src1ModDPP = getSrcModDPP_t16.ret; let Src2ModDPP = getSrcModDPP_t16.ret; @@ -118,10 +118,10 @@ multiclass VOPC_NoSdst_Profile_t16 sched, ValueType vt0, Va def NAME : VOPC_NoSdst_Profile; def _t16 : VOPC_NoSdst_Profile { let IsTrue16 = 1; - let Src1RC32 = RegisterOperand.ret>; - let Src0DPP = getVregSrcForVT_t16.ret; - let Src1DPP = getVregSrcForVT_t16.ret; - let Src2DPP = getVregSrcForVT_t16.ret; + let Src1RC32 = getVregSrcForVT.ret; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; let Src0ModDPP = getSrcModDPP_t16.ret; let Src1ModDPP = getSrcModDPP_t16.ret; let Src2ModDPP = getSrcModDPP_t16.ret; @@ -789,11 +789,11 @@ multiclass VOPC_Class_Profile_t16 sched> { def NAME : VOPC_Class_Profile; def _t16 : VOPC_Class_Profile { let IsTrue16 = 1; - let Src1RC32 = RegisterOperand.ret>; + let Src1RC32 = getVregSrcForVT.ret; let Src1RC64 = VSrc_b32; - let Src0DPP = getVregSrcForVT_t16.ret; - let Src1DPP = getVregSrcForVT_t16.ret; - let Src2DPP = getVregSrcForVT_t16.ret; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; let Src0ModDPP = getSrcModDPP_t16.ret; let Src1ModDPP = getSrcModDPP_t16.ret; let Src2ModDPP = getSrcModDPP_t16.ret; @@ -816,11 +816,11 @@ multiclass VOPC_Class_NoSdst_Profile_t16 sched> { def NAME : VOPC_Class_NoSdst_Profile; def _t16 : VOPC_Class_NoSdst_Profile { let IsTrue16 = 1; - let Src1RC32 = RegisterOperand.ret>; + let Src1RC32 = getVregSrcForVT.ret; let Src1RC64 = VSrc_b32; - let Src0DPP = getVregSrcForVT_t16.ret; - let Src1DPP = getVregSrcForVT_t16.ret; - let Src2DPP = getVregSrcForVT_t16.ret; + let Src0DPP = getVregSrcForVT.ret; + let Src1DPP = getVregSrcForVT.ret; + let Src2DPP = getVregSrcForVT.ret; let Src0ModDPP = getSrcModDPP_t16.ret; let Src1ModDPP = getSrcModDPP_t16.ret; let Src2ModDPP = getSrcModDPP_t16.ret; diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s index fa6df6affeb1e..d15de77f67c39 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp16.s @@ -43,47 +43,47 @@ v_bfrev_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_bfrev_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: encoding: [0xfa,0x70,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_ceil_f16 v5.l, v1 quad_perm:[3,2,1,0] +v_ceil_f16 v5.l, v1.l quad_perm:[3,2,1,0] // GFX11: 
encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_ceil_f16 v5.l, v1 quad_perm:[0,1,2,3] +v_ceil_f16 v5.l, v1.l quad_perm:[0,1,2,3] // GFX11: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_ceil_f16 v5.l, v1 row_mirror +v_ceil_f16 v5.l, v1.l row_mirror // GFX11: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_ceil_f16 v5.l, v1 row_half_mirror +v_ceil_f16 v5.l, v1.l row_half_mirror // GFX11: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_ceil_f16 v5.l, v1 row_shl:1 +v_ceil_f16 v5.l, v1.l row_shl:1 // GFX11: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_ceil_f16 v5.l, v1 row_shl:15 +v_ceil_f16 v5.l, v1.l row_shl:15 // GFX11: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_ceil_f16 v5.l, v1 row_shr:1 +v_ceil_f16 v5.l, v1.l row_shr:1 // GFX11: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_ceil_f16 v5.l, v1 row_shr:15 +v_ceil_f16 v5.l, v1.l row_shr:15 // GFX11: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_ceil_f16 v5.l, v1 row_ror:1 +v_ceil_f16 v5.l, v1.l row_ror:1 // GFX11: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_ceil_f16 v5.l, v1 row_ror:15 +v_ceil_f16 v5.l, v1.l row_ror:15 // GFX11: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_ceil_f16 v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf +v_ceil_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX11: encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_ceil_f16 v127.l, v127 row_share:15 row_mask:0x0 bank_mask:0x1 +v_ceil_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 // GFX11: encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x5f,0x01,0x01] -v_ceil_f16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: encoding: [0xfa,0xb8,0x0a,0x7f,0x01,0x60,0x09,0x13] +v_ceil_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0xb8,0x0a,0x7f,0x81,0x60,0x09,0x13] -v_ceil_f16 v127.h, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: encoding: [0xfa,0xb8,0xfe,0x7f,0x7f,0x6f,0x35,0x30] +v_ceil_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xb8,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_ceil_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x1b,0x00,0xff] @@ -1513,47 +1513,47 @@ v_ffbl_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_ffbl_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX11: encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x05,0x30] -v_floor_f16 v5.l, v1 quad_perm:[3,2,1,0] +v_floor_f16 v5.l, v1.l quad_perm:[3,2,1,0] // GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff] -v_floor_f16 v5.l, v1 quad_perm:[0,1,2,3] +v_floor_f16 v5.l, v1.l quad_perm:[0,1,2,3] // GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff] -v_floor_f16 v5.l, v1 row_mirror +v_floor_f16 v5.l, v1.l row_mirror // GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff] -v_floor_f16 v5.l, v1 row_half_mirror +v_floor_f16 v5.l, v1.l row_half_mirror // GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff] -v_floor_f16 v5.l, v1 row_shl:1 +v_floor_f16 v5.l, v1.l row_shl:1 // GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff] -v_floor_f16 v5.l, v1 row_shl:15 +v_floor_f16 v5.l, v1.l row_shl:15 // GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff] -v_floor_f16 v5.l, v1 row_shr:1 +v_floor_f16 v5.l, v1.l row_shr:1 // GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff] -v_floor_f16 v5.l, v1 row_shr:15 +v_floor_f16 v5.l, v1.l row_shr:15 // 
GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff] -v_floor_f16 v5.l, v1 row_ror:1 +v_floor_f16 v5.l, v1.l row_ror:1 // GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff] -v_floor_f16 v5.l, v1 row_ror:15 +v_floor_f16 v5.l, v1.l row_ror:15 // GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff] -v_floor_f16 v5.l, v1 row_share:0 row_mask:0xf bank_mask:0xf +v_floor_f16 v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf // GFX11: encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff] -v_floor_f16 v127.l, v127 row_share:15 row_mask:0x0 bank_mask:0x1 +v_floor_f16 v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 // GFX11: encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x5f,0x01,0x01] -v_floor_f16 v5.h, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 -// GFX11: encoding: [0xfa,0xb6,0x0a,0x7f,0x01,0x60,0x09,0x13] +v_floor_f16 v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX11: encoding: [0xfa,0xb6,0x0a,0x7f,0x81,0x60,0x09,0x13] -v_floor_f16 v127.h, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX11: encoding: [0xfa,0xb6,0xfe,0x7f,0x7f,0x6f,0x35,0x30] +v_floor_f16 v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX11: encoding: [0xfa,0xb6,0xfe,0x7f,0xff,0x6f,0x35,0x30] v_floor_f32 v5, v1 quad_perm:[3,2,1,0] // GFX11: encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s index 4fe3aa121b590..bc1b0ff8d00b4 100644 --- a/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop1_dpp8.s @@ -10,17 +10,17 @@ v_bfrev_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_bfrev_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: encoding: [0xe9,0x70,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_ceil_f16 v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] +v_ceil_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_ceil_f16 v127.l, v127 dpp8:[7,6,5,4,3,2,1,0] +v_ceil_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: encoding: [0xe9,0xb8,0xfe,0x7e,0x7f,0x77,0x39,0x05] -v_ceil_f16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: encoding: [0xea,0xb8,0x0a,0x7f,0x01,0x77,0x39,0x05] +v_ceil_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0xb8,0x0a,0x7f,0x81,0x77,0x39,0x05] -v_ceil_f16 v127.h, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: encoding: [0xe9,0xb8,0xfe,0x7f,0x7f,0x00,0x00,0x00] +v_ceil_f16 v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xb8,0xfe,0x7f,0xff,0x00,0x00,0x00] v_ceil_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: encoding: [0xe9,0x44,0x0a,0x7e,0x01,0x77,0x39,0x05] @@ -328,17 +328,17 @@ v_ffbl_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ffbl_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: encoding: [0xe9,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] -v_floor_f16 v5.l, v1 dpp8:[7,6,5,4,3,2,1,0] +v_floor_f16 v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05] -v_floor_f16 v127.l, v127 dpp8:[7,6,5,4,3,2,1,0] +v_floor_f16 v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] // GFX11: encoding: [0xe9,0xb6,0xfe,0x7e,0x7f,0x77,0x39,0x05] -v_floor_f16 v5.h, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 -// GFX11: encoding: [0xea,0xb6,0x0a,0x7f,0x01,0x77,0x39,0x05] +v_floor_f16 v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX11: encoding: [0xea,0xb6,0x0a,0x7f,0x81,0x77,0x39,0x05] -v_floor_f16 v127.h, v127 dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX11: encoding: [0xe9,0xb6,0xfe,0x7f,0x7f,0x00,0x00,0x00] +v_floor_f16 v127.h, v127.h 
dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX11: encoding: [0xe9,0xb6,0xfe,0x7f,0xff,0x00,0x00,0x00] v_floor_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX11: encoding: [0xe9,0x48,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt index 8758305258387..3e1bcf2fd0345 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp16.txt @@ -1,5 +1,7 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s # GFX11: v_bfrev_b32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x70,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -43,47 +45,61 @@ # GFX11: v_bfrev_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x70,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x70,0xfe,0x7f,0xff,0x6f,0x0d,0x30 -# GFX11: v_ceil_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_ceil_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_ceil_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_ceil_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_ceil_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_ceil_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_ceil_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_ceil_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_ceil_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_ceil_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb8,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_ceil_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01] -0xfa,0xb8,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_ceil_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_ceil_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x5f,0x01,0x01] 
+0xfa,0xb8,0xfe,0x7e,0x7f,0x5f,0x01,0x01 -# GFX11: v_ceil_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x01,0x13] -0xfa,0xb8,0x0a,0x7e,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_ceil_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb8,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xfa,0xb8,0x0a,0x7f,0x81,0x60,0x01,0x13 -# GFX11: v_ceil_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] -0xfa,0xb8,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX11-REAL16: v_ceil_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb8,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xfa,0xb8,0xfe,0x7f,0xff,0x6f,0x3d,0x30 # GFX11: v_ceil_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x44,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x44,0x0a,0x7e,0x01,0x1b,0x00,0xff @@ -1303,47 +1319,61 @@ # GFX11: v_exp_f32_dpp v255, -|v255| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x4a,0xfe,0x7f,0xff,0x6f,0x3d,0x30] 0xfa,0x4a,0xfe,0x7f,0xff,0x6f,0x3d,0x30 -# GFX11: v_floor_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-REAL16: v_floor_f16_dpp v5.l, v1.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff] +# GFX11-FAKE16: v_floor_f16_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x1b,0x00,0xff -# GFX11: v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-REAL16: v_floor_f16_dpp v5.l, v1.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff] +# GFX11-FAKE16: v_floor_f16_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0xe4,0x00,0xff -# GFX11: v_floor_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-REAL16: v_floor_f16_dpp v5.l, v1.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_dpp v5, v1 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x40,0x01,0xff -# GFX11: v_floor_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-REAL16: v_floor_f16_dpp v5.l, v1.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_dpp v5, v1 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x41,0x01,0xff -# GFX11: v_floor_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-REAL16: v_floor_f16_dpp v5.l, v1.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_dpp v5, v1 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x01,0x01,0xff -# GFX11: v_floor_f16_dpp v5, v1 
row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-REAL16: v_floor_f16_dpp v5.l, v1.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_dpp v5, v1 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x0f,0x01,0xff -# GFX11: v_floor_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-REAL16: v_floor_f16_dpp v5.l, v1.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_dpp v5, v1 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x11,0x01,0xff -# GFX11: v_floor_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-REAL16: v_floor_f16_dpp v5.l, v1.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_dpp v5, v1 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x1f,0x01,0xff -# GFX11: v_floor_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-REAL16: v_floor_f16_dpp v5.l, v1.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_dpp v5, v1 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x21,0x01,0xff -# GFX11: v_floor_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-REAL16: v_floor_f16_dpp v5.l, v1.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_dpp v5, v1 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x2f,0x01,0xff -# GFX11: v_floor_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-REAL16: v_floor_f16_dpp v5.l, v1.l row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff] +# GFX11-FAKE16: v_floor_f16_dpp v5, v1 row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff] 0xfa,0xb6,0x0a,0x7e,0x01,0x50,0x01,0xff -# GFX11: v_floor_f16_dpp v5, v1 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01] -0xfa,0xb6,0x0a,0x7e,0x01,0x5f,0x01,0x01 +# GFX11-REAL16: v_floor_f16_dpp v127.l, v127.l row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +# GFX11-FAKE16: v_floor_f16_dpp v127, v127 row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x5f,0x01,0x01] +0xfa,0xb6,0xfe,0x7e,0x7f,0x5f,0x01,0x01 -# GFX11: v_floor_f16_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x01,0x13] -0xfa,0xb6,0x0a,0x7e,0x01,0x60,0x01,0x13 +# GFX11-REAL16: v_floor_f16_dpp v5.h, v1.h row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0xfa,0xb6,0x0a,0x7f,0x81,0x60,0x01,0x13] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xfa,0xb6,0x0a,0x7f,0x81,0x60,0x01,0x13 -# GFX11: v_floor_f16_dpp v127, -|v127| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 
; encoding: [0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x3d,0x30] -0xfa,0xb6,0xfe,0x7e,0x7f,0x6f,0x3d,0x30 +# GFX11-REAL16: v_floor_f16_dpp v127.h, -|v127.h| row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0xb6,0xfe,0x7f,0xff,0x6f,0x3d,0x30] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xfa,0xb6,0xfe,0x7f,0xff,0x6f,0x3d,0x30 # GFX11: v_floor_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x48,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x48,0x0a,0x7e,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt index a3531410ac401..aef470fe87fd3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop1_dpp8.txt @@ -1,5 +1,6 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefix=GFX11 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+real-true16,-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-REAL16 %s +# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=-real-true16,-WavefrontSize32,+WavefrontSize64 -disassemble -show-encoding < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s # GFX11: v_bfrev_b32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x70,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -7,11 +8,21 @@ # GFX11: v_bfrev_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x70,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x70,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX11: v_ceil_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: v_ceil_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-FAKE16: v_ceil_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0xb8,0x0a,0x7e,0x01,0x77,0x39,0x05 -# GFX11: v_ceil_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00] -0xea,0xb8,0xfe,0x7e,0x7f,0x00,0x00,0x00 +# GFX11-REAL16: v_ceil_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0xfe,0x7e,0x7f,0x77,0x39,0x05] +# GFX11-FAKE16: v_ceil_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0xfe,0x7e,0x7f,0x77,0x39,0x05] +0xe9,0xb8,0xfe,0x7e,0x7f,0x77,0x39,0x05 + +# GFX11-REAL16: v_ceil_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb8,0x0a,0x7f,0x81,0x77,0x39,0x05] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xe9,0xb8,0x0a,0x7f,0x81,0x77,0x39,0x05 + +# GFX11-REAL16: v_ceil_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb8,0xfe,0x7f,0xff,0x00,0x00,0x00] +# COM: GFX11-FAKE16: warning: invalid instruction encoding +0xea,0xb8,0xfe,0x7f,0xff,0x00,0x00,0x00 # GFX11: v_ceil_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x44,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x44,0x0a,0x7e,0x01,0x77,0x39,0x05 @@ -187,11 +198,21 @@ # GFX11: v_exp_f32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x4a,0xfe,0x7f,0xff,0x00,0x00,0x00] 0xea,0x4a,0xfe,0x7f,0xff,0x00,0x00,0x00 -# GFX11: v_floor_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05] +# GFX11-REAL16: 
v_floor_f16_dpp v5.l, v1.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05]
+# GFX11-FAKE16: v_floor_f16_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05]
 0xe9,0xb6,0x0a,0x7e,0x01,0x77,0x39,0x05
 
-# GFX11: v_floor_f16_dpp v127, v127 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00]
-0xea,0xb6,0xfe,0x7e,0x7f,0x00,0x00,0x00
+# GFX11-REAL16: v_floor_f16_dpp v127.l, v127.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+# GFX11-FAKE16: v_floor_f16_dpp v127, v127 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0xfe,0x7e,0x7f,0x77,0x39,0x05]
+0xe9,0xb6,0xfe,0x7e,0x7f,0x77,0x39,0x05
+
+# GFX11-REAL16: v_floor_f16_dpp v5.h, v1.h dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xb6,0x0a,0x7f,0x81,0x77,0x39,0x05]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0xe9,0xb6,0x0a,0x7f,0x81,0x77,0x39,0x05
+
+# GFX11-REAL16: v_floor_f16_dpp v127.h, v127.h dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0xb6,0xfe,0x7f,0xff,0x00,0x00,0x00]
+# COM: GFX11-FAKE16: warning: invalid instruction encoding
+0xea,0xb6,0xfe,0x7f,0xff,0x00,0x00,0x00
 
 # GFX11: v_floor_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x48,0x0a,0x7e,0x01,0x77,0x39,0x05]
 0xe9,0x48,0x0a,0x7e,0x01,0x77,0x39,0x05

From 376f3be83444901fc4d50570bee492f0f666938f Mon Sep 17 00:00:00 2001
From: Jie Fu
Date: Tue, 23 Jan 2024 17:56:00 +0800
Subject: [PATCH 579/843] [DebugInfo] Remove redefinition of
 'getDPVAssignmentMarkers' (NFC)

llvm-project/llvm/include/llvm/IR/DebugInfo.h:239:31: error: redefinition of 'getDPVAssignmentMarkers'
inline SmallVector<DPValue *> getDPVAssignmentMarkers(const Instruction *Inst) {
                              ^
llvm-project/llvm/include/llvm/IR/DebugInfo.h:233:31: note: previous definition is here
inline SmallVector<DPValue *> getDPVAssignmentMarkers(const Instruction *Inst) {
                              ^

---
 llvm/include/llvm/IR/DebugInfo.h | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/llvm/include/llvm/IR/DebugInfo.h b/llvm/include/llvm/IR/DebugInfo.h
index 006fd02a17bfd..b2f079393400b 100644
--- a/llvm/include/llvm/IR/DebugInfo.h
+++ b/llvm/include/llvm/IR/DebugInfo.h
@@ -230,11 +230,6 @@ inline AssignmentMarkerRange getAssignmentMarkers(const Instruction *Inst) {
   else
     return make_range(Value::user_iterator(), Value::user_iterator());
 }
-inline SmallVector<DPValue *> getDPVAssignmentMarkers(const Instruction *Inst) {
-  if (auto *ID = Inst->getMetadata(LLVMContext::MD_DIAssignID))
-    return cast<DIAssignID>(ID)->getAllDPValueUsers();
-  return {};
-}
 
 inline SmallVector<DPValue *> getDPVAssignmentMarkers(const Instruction *Inst) {
   if (auto *ID = Inst->getMetadata(LLVMContext::MD_DIAssignID))

From 0ed8194256c6c96875ed9dd102d15ee8a9260fec Mon Sep 17 00:00:00 2001
From: Alexey Lapshin
Date: Tue, 23 Jan 2024 13:00:49 +0300
Subject: [PATCH 580/843] [dsymutil] Add --linker parallel to more tests.
 (#78581)

This patch adds checks for the parallel linker to tests located in the
root of test/tools/dsymutil.
---
 llvm/include/llvm/DWARFLinker/AddressesMap.h  | 24 +++++++++----------
 llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp  |  7 +++---
 .../Parallel/DWARFLinkerCompileUnit.cpp       |  4 ++--
 .../DWARFLinker/Parallel/DWARFLinkerImpl.cpp  |  3 ++-
 .../Parallel/DependencyTracker.cpp            |  4 ++--
 llvm/test/tools/dsymutil/absolute_symbol.test |  2 ++
 llvm/test/tools/dsymutil/arch-option.test     | 11 +++++++++
 .../tools/dsymutil/archive-timestamp.test     |  2 ++
 llvm/test/tools/dsymutil/basic-linking.test   |  5 ++++
 .../tools/dsymutil/debug-map-parsing.test     |  7 ++++++
 llvm/test/tools/dsymutil/dump-symtab.test     |  3 +++
 .../tools/dsymutil/fat-binary-output.test     |  2 ++
 llvm/test/tools/dsymutil/fat-header.test      |  6 +++++
 .../dsymutil/yaml-object-address-rewrite.test |  2 ++
 llvm/tools/dsymutil/DwarfLinkerForBinary.cpp  | 22 +++++++++--------
 llvm/tools/dsymutil/DwarfLinkerForBinary.h    | 10 ++++----
 llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp |  6 ++---
 17 files changed, 83 insertions(+), 37 deletions(-)

diff --git a/llvm/include/llvm/DWARFLinker/AddressesMap.h b/llvm/include/llvm/DWARFLinker/AddressesMap.h
index d8b3b44074719..a232aafadc5ce 100644
--- a/llvm/include/llvm/DWARFLinker/AddressesMap.h
+++ b/llvm/include/llvm/DWARFLinker/AddressesMap.h
@@ -38,22 +38,22 @@ class AddressesMap {
   /// Checks that the specified DWARF expression operand \p Op references live
   /// code section and returns the relocation adjustment value (to get the
   /// linked address this value might be added to the source expression operand
-  /// address).
+  /// address). Print debug output if \p Verbose is true.
   /// \returns relocation adjustment value or std::nullopt if there is no
   /// corresponding live address.
-  virtual std::optional<int64_t>
-  getExprOpAddressRelocAdjustment(DWARFUnit &U,
-                                  const DWARFExpression::Operation &Op,
-                                  uint64_t StartOffset, uint64_t EndOffset) = 0;
+  virtual std::optional<int64_t> getExprOpAddressRelocAdjustment(
+      DWARFUnit &U, const DWARFExpression::Operation &Op, uint64_t StartOffset,
+      uint64_t EndOffset, bool Verbose) = 0;
 
   /// Checks that the specified subprogram \p DIE references the live code
   /// section and returns the relocation adjustment value (to get the linked
   /// address this value might be added to the source subprogram address).
   /// Allowed kinds of input DIE: DW_TAG_subprogram, DW_TAG_label.
+  /// Print debug output if \p Verbose is true.
   /// \returns relocation adjustment value or std::nullopt if there is no
   /// corresponding live address.
   virtual std::optional<int64_t>
-  getSubprogramRelocAdjustment(const DWARFDie &DIE) = 0;
+  getSubprogramRelocAdjustment(const DWARFDie &DIE, bool Verbose) = 0;
 
   // Returns the library install name associated to the AddressesMap.
   virtual std::optional<StringRef> getLibraryInstallName() = 0;
@@ -90,7 +90,7 @@ class AddressesMap {
   /// second is the relocation adjustment value if the live address is
   /// referenced.
   std::pair<bool, std::optional<int64_t>>
-  getVariableRelocAdjustment(const DWARFDie &DIE) {
+  getVariableRelocAdjustment(const DWARFDie &DIE, bool Verbose) {
     assert((DIE.getTag() == dwarf::DW_TAG_variable ||
             DIE.getTag() == dwarf::DW_TAG_constant) &&
            "Wrong type of input die");
@@ -149,9 +149,9 @@
        HasLocationAddress = true;
        // Check relocation for the address.
        if (std::optional<int64_t> RelocAdjustment =
-                getExprOpAddressRelocAdjustment(*U, Op,
-                                                AttrOffset + CurExprOffset,
-                                                AttrOffset + Op.getEndOffset()))
+                getExprOpAddressRelocAdjustment(
+                    *U, Op, AttrOffset + CurExprOffset,
+                    AttrOffset + Op.getEndOffset(), Verbose))
          return std::make_pair(HasLocationAddress, *RelocAdjustment);
      } break;
      case dwarf::DW_OP_constx:
@@ -164,8 +164,8 @@ class AddressesMap {
          if (std::optional<int64_t> RelocAdjustment =
                  getExprOpAddressRelocAdjustment(
                      *U, Op, *AddressOffset,
-                     *AddressOffset +
-                         DIE.getDwarfUnit()->getAddressByteSize()))
+                     *AddressOffset + DIE.getDwarfUnit()->getAddressByteSize(),
+                     Verbose))
            return std::make_pair(HasLocationAddress, *RelocAdjustment);
        }
      } break;
diff --git a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
index bcb8b94c1fa5a..4f5a4e2ffc702 100644
--- a/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/Classic/DWARFLinker.cpp
@@ -465,7 +465,7 @@ DWARFLinker::getVariableRelocAdjustment(AddressesMap &RelocMgr,
       if (std::optional<int64_t> RelocAdjustment =
               RelocMgr.getExprOpAddressRelocAdjustment(
                   *U, Op, AttrOffset + CurExprOffset,
-                  AttrOffset + Op.getEndOffset()))
+                  AttrOffset + Op.getEndOffset(), Options.Verbose))
        return std::make_pair(HasLocationAddress, *RelocAdjustment);
    } break;
    case dwarf::DW_OP_constx:
@@ -478,7 +478,8 @@ DWARFLinker::getVariableRelocAdjustment(AddressesMap &RelocMgr,
      if (std::optional<int64_t> RelocAdjustment =
              RelocMgr.getExprOpAddressRelocAdjustment(
                  *U, Op, *AddressOffset,
-                 *AddressOffset + DIE.getDwarfUnit()->getAddressByteSize()))
+                 *AddressOffset + DIE.getDwarfUnit()->getAddressByteSize(),
+                 Options.Verbose))
        return std::make_pair(HasLocationAddress, *RelocAdjustment);
    }
  } break;
@@ -552,7 +553,7 @@ unsigned DWARFLinker::shouldKeepSubprogramDIE(
   assert(LowPc && "low_pc attribute is not an address.");
 
   std::optional<int64_t> RelocAdjustment =
-      RelocMgr.getSubprogramRelocAdjustment(DIE);
+      RelocMgr.getSubprogramRelocAdjustment(DIE, Options.Verbose);
   if (!RelocAdjustment)
     return Flags;
 
diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp
index fe2748f79c06d..8c52040fdbc92 100644
--- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp
+++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerCompileUnit.cpp
@@ -1372,7 +1372,7 @@ DIE *CompileUnit::createPlainDIEandCloneAttributes(
       // Get relocation adjustment value for the current function.
       FuncAddressAdjustment =
           getContaingFile().Addresses->getSubprogramRelocAdjustment(
-              getDIE(InputDieEntry));
+              getDIE(InputDieEntry), false);
     } else if (InputDieEntry->getTag() == dwarf::DW_TAG_label) {
       // Get relocation adjustment value for the current label.
       std::optional<uint64_t> lowPC =
@@ -1386,7 +1386,7 @@ DIE *CompileUnit::createPlainDIEandCloneAttributes(
       // Get relocation adjustment value for the current variable.
std::pair> LocExprAddrAndRelocAdjustment = getContaingFile().Addresses->getVariableRelocAdjustment( - getDIE(InputDieEntry)); + getDIE(InputDieEntry), false); HasLocationExpressionAddress = LocExprAddrAndRelocAdjustment.first; if (LocExprAddrAndRelocAdjustment.first && diff --git a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp index 56828e75ac0b4..a052969e74c0f 100644 --- a/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DWARFLinkerImpl.cpp @@ -115,7 +115,8 @@ Error DWARFLinkerImpl::link() { } if (GlobalData.getOptions().Verbose) { - outs() << "OBJECT: " << Context->InputDWARFFile.FileName << "\n"; + outs() << "DEBUG MAP OBJECT: " << Context->InputDWARFFile.FileName + << "\n"; for (const std::unique_ptr &OrigCU : Context->InputDWARFFile.Dwarf->compile_units()) { diff --git a/llvm/lib/DWARFLinker/Parallel/DependencyTracker.cpp b/llvm/lib/DWARFLinker/Parallel/DependencyTracker.cpp index 04152e7f9f2b0..5e30d9a8b6690 100644 --- a/llvm/lib/DWARFLinker/Parallel/DependencyTracker.cpp +++ b/llvm/lib/DWARFLinker/Parallel/DependencyTracker.cpp @@ -741,7 +741,7 @@ bool DependencyTracker::isLiveVariableEntry(const UnitEntryPairTy &Entry, // enclosing function, unless requested explicitly. std::pair> LocExprAddrAndRelocAdjustment = Entry.CU->getContaingFile().Addresses->getVariableRelocAdjustment( - DIE); + DIE, Entry.CU->getGlobalData().getOptions().Verbose); if (LocExprAddrAndRelocAdjustment.first) Info.setHasAnAddress(); @@ -784,7 +784,7 @@ bool DependencyTracker::isLiveSubprogramEntry(const UnitEntryPairTy &Entry) { RelocAdjustment = Entry.CU->getContaingFile().Addresses->getSubprogramRelocAdjustment( - DIE); + DIE, Entry.CU->getGlobalData().getOptions().Verbose); if (!RelocAdjustment) return false; diff --git a/llvm/test/tools/dsymutil/absolute_symbol.test b/llvm/test/tools/dsymutil/absolute_symbol.test index 1724cb7436622..ae61bf750738e 100644 --- a/llvm/test/tools/dsymutil/absolute_symbol.test +++ b/llvm/test/tools/dsymutil/absolute_symbol.test @@ -1,5 +1,7 @@ RUN: dsymutil -dump-debug-map -oso-prepend-path %p %p/Inputs/absolute_sym.macho.i386 | FileCheck %s +RUN: dsymutil --linker parallel -dump-debug-map -oso-prepend-path %p %p/Inputs/absolute_sym.macho.i386 | FileCheck %s + The tested object file has been created by the dummy Objective-C code: @interface Foo @end diff --git a/llvm/test/tools/dsymutil/arch-option.test b/llvm/test/tools/dsymutil/arch-option.test index 723a9c3ac742c..d8b8e3d901f6d 100644 --- a/llvm/test/tools/dsymutil/arch-option.test +++ b/llvm/test/tools/dsymutil/arch-option.test @@ -11,6 +11,17 @@ RUN: dsymutil -oso-prepend-path %p -dump-debug-map %p/Inputs/fat-test.arm.dylib RUN: not dsymutil -oso-prepend-path %p -dump-debug-map %p/Inputs/fat-test.arm.dylib -arch arm42 2>&1 | FileCheck %s -check-prefix=BADARCH RUN: not dsymutil -oso-prepend-path %p -dump-debug-map %p/Inputs/fat-test.arm.dylib -arch i386 2>&1 | FileCheck %s -check-prefix=EMPTY +RUN: dsymutil --linker parallel -oso-prepend-path %p -dump-debug-map %p/Inputs/fat-test.arm.dylib | FileCheck %s -check-prefixes=ARM64,ARMV7S,ARMV7,CHECK +RUN: dsymutil --linker parallel -oso-prepend-path %p -dump-debug-map %p/Inputs/fat-test.arm.dylib -arch all | FileCheck %s -check-prefixes=ARM64,ARMV7S,ARMV7,CHECK +RUN: dsymutil --linker parallel -oso-prepend-path %p -dump-debug-map %p/Inputs/fat-test.arm.dylib -arch='*' | FileCheck %s -check-prefixes=ARM64,ARMV7S,ARMV7,CHECK +RUN: dsymutil --linker parallel -oso-prepend-path %p 
-dump-debug-map %p/Inputs/fat-test.arm.dylib -arch arm64 | FileCheck %s -check-prefixes=ARM64,CHECK +RUN: dsymutil --linker parallel -oso-prepend-path %p -dump-debug-map %p/Inputs/fat-test.arm.dylib -arch arm | FileCheck %s -check-prefixes=ARMV7S,ARMV7,CHECK +RUN: dsymutil --linker parallel -oso-prepend-path %p -dump-debug-map %p/Inputs/fat-test.arm.dylib -arch armv7 | FileCheck %s -check-prefixes=ARMV7,CHECK +RUN: dsymutil --linker parallel -oso-prepend-path %p -dump-debug-map %p/Inputs/fat-test.arm.dylib -arch arm64 -arch armv7s | FileCheck %s -check-prefixes=ARM64,ARMV7S,CHECK +RUN: not dsymutil --linker parallel -oso-prepend-path %p -dump-debug-map %p/Inputs/fat-test.arm.dylib -arch arm42 2>&1 | FileCheck %s -check-prefix=BADARCH +RUN: not dsymutil --linker parallel -oso-prepend-path %p -dump-debug-map %p/Inputs/fat-test.arm.dylib -arch i386 2>&1 | FileCheck %s -check-prefix=EMPTY + + ARMV7: --- ARMV7-NOT: ... diff --git a/llvm/test/tools/dsymutil/archive-timestamp.test b/llvm/test/tools/dsymutil/archive-timestamp.test index 69fa261d59c8f..c86d19e57738c 100644 --- a/llvm/test/tools/dsymutil/archive-timestamp.test +++ b/llvm/test/tools/dsymutil/archive-timestamp.test @@ -1,5 +1,7 @@ # RUN: dsymutil -no-output -oso-prepend-path=%p -y %s 2>&1 | FileCheck -DMSG=%errc_ENOENT %s +# RUN: dsymutil --linker parallel -no-output -oso-prepend-path=%p -y %s 2>&1 | FileCheck -DMSG=%errc_ENOENT %s + # This is the archive member part of basic-archive.macho.x86_64 debug map with corrupted timestamps. # CHECK: warning: {{.*}}libbasic.a(basic2.macho.x86_64.o): [[MSG]] diff --git a/llvm/test/tools/dsymutil/basic-linking.test b/llvm/test/tools/dsymutil/basic-linking.test index 88cd3293efa24..1e8027d05b3c3 100644 --- a/llvm/test/tools/dsymutil/basic-linking.test +++ b/llvm/test/tools/dsymutil/basic-linking.test @@ -6,6 +6,11 @@ RUN: dsymutil -no-output -verbose -oso-prepend-path=%p -D %p/Inputs %p/Inputs/ba RUN: dsymutil -no-output -verbose -oso-prepend-path=%p -D %p/Inputs %p/Inputs/two-level-relink.macho.arm64.dylib | FileCheck %s --check-prefix=CHECK-RELINK-TWO RUN: dsymutil -no-output -verbose -oso-prepend-path=%p -build-variant-suffix=_debug -D WrongPath -D %p/Inputs %p/Inputs/variant-relink.macho.arm64.dylib | FileCheck %s --check-prefix=CHECK-RELINK-VARIANT +RUN: dsymutil --linker parallel -no-output -verbose -oso-prepend-path=%p %p/Inputs/basic.macho.x86_64 | FileCheck %s +RUN: dsymutil --linker parallel -no-output -verbose -oso-prepend-path=%p %p/Inputs/basic-lto.macho.x86_64 | FileCheck %s --check-prefix=CHECK-LTO +RUN: dsymutil --linker parallel -no-output -verbose -oso-prepend-path=%p %p/Inputs/basic-archive.macho.x86_64 | FileCheck %s --check-prefix=CHECK-ARCHIVE +RUN: dsymutil --linker parallel -no-output -verbose -oso-prepend-path=%p %p/Inputs/basic.macho.x86_64 %p/Inputs/basic-lto.macho.x86_64 %p/Inputs/basic-archive.macho.x86_64 | FileCheck %s --check-prefixes=CHECK,CHECK-LTO,CHECK-ARCHIVE + This test check the basic Dwarf linking process through the debug dumps. 
================================= Simple link ================================ diff --git a/llvm/test/tools/dsymutil/debug-map-parsing.test b/llvm/test/tools/dsymutil/debug-map-parsing.test index 2f2bf6a2d2c5a..29b9d65477207 100644 --- a/llvm/test/tools/dsymutil/debug-map-parsing.test +++ b/llvm/test/tools/dsymutil/debug-map-parsing.test @@ -4,6 +4,13 @@ RUN: dsymutil -verbose -dump-debug-map -oso-prepend-path=%p %p/Inputs/basic-arch RUN: dsymutil -dump-debug-map %p/Inputs/basic.macho.x86_64 2>&1 | FileCheck -DMSG=%errc_ENOENT %s --check-prefix=NOT-FOUND RUN: not dsymutil -dump-debug-map %p/Inputs/inexistant 2>&1 | FileCheck -DMSG=%errc_ENOENT %s --check-prefix=NO-EXECUTABLE +RUN: dsymutil --linker parallel -dump-debug-map -oso-prepend-path=%p %p/Inputs/basic.macho.x86_64 | FileCheck %s +RUN: dsymutil --linker parallel -dump-debug-map -oso-prepend-path=%p %p/Inputs/basic-lto.macho.x86_64 | FileCheck %s --check-prefix=CHECK-LTO +RUN: dsymutil --linker parallel -verbose -dump-debug-map -oso-prepend-path=%p %p/Inputs/basic-archive.macho.x86_64 2>&1 | FileCheck %s --check-prefix=CHECK-ARCHIVE +RUN: dsymutil --linker parallel -dump-debug-map %p/Inputs/basic.macho.x86_64 2>&1 | FileCheck -DMSG=%errc_ENOENT %s --check-prefix=NOT-FOUND +RUN: not dsymutil --linker parallel -dump-debug-map %p/Inputs/inexistant 2>&1 | FileCheck -DMSG=%errc_ENOENT %s --check-prefix=NO-EXECUTABLE + + Check that We can parse the debug map of the basic executable. diff --git a/llvm/test/tools/dsymutil/dump-symtab.test b/llvm/test/tools/dsymutil/dump-symtab.test index c0b51446c93f0..6c97140b37017 100644 --- a/llvm/test/tools/dsymutil/dump-symtab.test +++ b/llvm/test/tools/dsymutil/dump-symtab.test @@ -1,6 +1,9 @@ RUN: dsymutil -s %p/Inputs/fat-test.dylib | FileCheck -check-prefixes=ALL,I386 %s RUN: dsymutil -arch i386 -s %p/Inputs/fat-test.dylib | FileCheck -check-prefixes=I386,ONE %s +RUN: dsymutil --linker parallel -s %p/Inputs/fat-test.dylib | FileCheck -check-prefixes=ALL,I386 %s +RUN: dsymutil --linker parallel -arch i386 -s %p/Inputs/fat-test.dylib | FileCheck -check-prefixes=I386,ONE %s + ALL: ---------------------------------------------------------------------- ALL-NEXT: Symbol table for: '{{.*}}fat-test.dylib' (x86_64) diff --git a/llvm/test/tools/dsymutil/fat-binary-output.test b/llvm/test/tools/dsymutil/fat-binary-output.test index 7d379c8dc58b8..46b86b0513735 100644 --- a/llvm/test/tools/dsymutil/fat-binary-output.test +++ b/llvm/test/tools/dsymutil/fat-binary-output.test @@ -1,5 +1,7 @@ RUN: dsymutil -f -verbose -no-output %p/Inputs/fat-test.dylib -oso-prepend-path %p | FileCheck %s +RUN: dsymutil --linker parallel -f -verbose -no-output %p/Inputs/fat-test.dylib -oso-prepend-path %p | FileCheck %s + This test doesn't produce any filesytstem output, we just look at the verbose log output. 
diff --git a/llvm/test/tools/dsymutil/fat-header.test b/llvm/test/tools/dsymutil/fat-header.test index d605183737975..3130f4608476e 100644 --- a/llvm/test/tools/dsymutil/fat-header.test +++ b/llvm/test/tools/dsymutil/fat-header.test @@ -3,8 +3,14 @@ REQUIRES: system-darwin,arm-registered-target,aarch64-registered-target RUN: dsymutil -oso-prepend-path %p %p/Inputs/fat-test.arm.dylib -o %t.fat32.dSYM RUN: llvm-objdump -m --universal-headers %t.fat32.dSYM/Contents/Resources/DWARF/fat-test.arm.dylib | FileCheck %s -check-prefixes=FAT32 +RUN: dsymutil --linker parallel -oso-prepend-path %p %p/Inputs/fat-test.arm.dylib -o %t.fat32.dSYM +RUN: llvm-objdump -m --universal-headers %t.fat32.dSYM/Contents/Resources/DWARF/fat-test.arm.dylib | FileCheck %s -check-prefixes=FAT32 + RUN: dsymutil -oso-prepend-path %p %p/Inputs/fat-test.arm.dylib -o %t.fat64.dSYM -fat64 RUN: llvm-objdump -m --universal-headers %t.fat64.dSYM/Contents/Resources/DWARF/fat-test.arm.dylib | FileCheck %s -check-prefixes=FAT64 +RUN: dsymutil --linker parallel -oso-prepend-path %p %p/Inputs/fat-test.arm.dylib -o %t.fat64.dSYM -fat64 +RUN: llvm-objdump -m --universal-headers %t.fat64.dSYM/Contents/Resources/DWARF/fat-test.arm.dylib | FileCheck %s -check-prefixes=FAT64 + FAT32: fat_magic FAT_MAGIC FAT64: fat_magic FAT_MAGIC_64 diff --git a/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test b/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test index 09b5ccf1e6d5a..dfa0f285c5ce5 100644 --- a/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test +++ b/llvm/test/tools/dsymutil/yaml-object-address-rewrite.test @@ -1,5 +1,7 @@ # RUN: dsymutil -dump-debug-map -oso-prepend-path=%p -y %s | FileCheck %s # +# RUN: dsymutil --linker parallel -dump-debug-map -oso-prepend-path=%p -y %s | FileCheck %s +# # The YAML debug map bellow is the one from basic-archive.macho.x86_64 with # the object addresses set to zero. Check that the YAML import is able to # rewrite these addresses to the right values. 
diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
index d09dcf678e1e9..5ae5ecd556adb 100644
--- a/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
+++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.cpp
@@ -1041,13 +1041,13 @@ DwarfLinkerForBinary::AddressManager::getRelocValue(const ValidReloc &Reloc) {
 std::optional<int64_t>
 DwarfLinkerForBinary::AddressManager::hasValidRelocationAt(
     const std::vector<ValidReloc> &AllRelocs, uint64_t StartOffset,
-    uint64_t EndOffset) {
+    uint64_t EndOffset, bool Verbose) {
   std::vector<ValidReloc> Relocs =
       getRelocations(AllRelocs, StartOffset, EndOffset);
   if (Relocs.size() == 0)
     return std::nullopt;
 
-  if (Linker.Options.Verbose)
+  if (Verbose)
     printReloc(Relocs[0]);
 
   return getRelocValue(Relocs[0]);
@@ -1077,7 +1077,7 @@ getAttributeOffsets(const DWARFAbbreviationDeclaration *Abbrev, unsigned Idx,
 std::optional<int64_t>
 DwarfLinkerForBinary::AddressManager::getExprOpAddressRelocAdjustment(
     DWARFUnit &U, const DWARFExpression::Operation &Op, uint64_t StartOffset,
-    uint64_t EndOffset) {
+    uint64_t EndOffset, bool Verbose) {
   switch (Op.getCode()) {
   default: {
     assert(false && "Specified operation does not have address operand");
@@ -1089,11 +1089,13 @@ DwarfLinkerForBinary::AddressManager::getExprOpAddressRelocAdjustment(
   case dwarf::DW_OP_const4s:
   case dwarf::DW_OP_const8s:
   case dwarf::DW_OP_addr: {
-    return hasValidRelocationAt(ValidDebugInfoRelocs, StartOffset, EndOffset);
+    return hasValidRelocationAt(ValidDebugInfoRelocs, StartOffset, EndOffset,
+                                Verbose);
  } break;
   case dwarf::DW_OP_constx:
   case dwarf::DW_OP_addrx: {
-    return hasValidRelocationAt(ValidDebugAddrRelocs, StartOffset, EndOffset);
+    return hasValidRelocationAt(ValidDebugAddrRelocs, StartOffset, EndOffset,
+                                Verbose);
   } break;
   }
 
@@ -1102,7 +1104,7 @@ DwarfLinkerForBinary::AddressManager::getExprOpAddressRelocAdjustment(
 
 std::optional<int64_t>
 DwarfLinkerForBinary::AddressManager::getSubprogramRelocAdjustment(
-    const DWARFDie &DIE) {
+    const DWARFDie &DIE, bool Verbose) {
   const auto *Abbrev = DIE.getAbbreviationDeclarationPtr();
 
   std::optional<uint32_t> LowPcIdx =
@@ -1119,7 +1121,7 @@ DwarfLinkerForBinary::AddressManager::getSubprogramRelocAdjustment(
     std::tie(LowPcOffset, LowPcEndOffset) =
         getAttributeOffsets(Abbrev, *LowPcIdx, Offset, *DIE.getDwarfUnit());
     return hasValidRelocationAt(ValidDebugInfoRelocs, LowPcOffset,
-                                LowPcEndOffset);
+                                LowPcEndOffset, Verbose);
   }
   case dwarf::DW_FORM_addrx:
   case dwarf::DW_FORM_addrx1:
@@ -1130,9 +1132,9 @@ DwarfLinkerForBinary::AddressManager::getSubprogramRelocAdjustment(
     if (std::optional<uint64_t> AddressOffset =
            DIE.getDwarfUnit()->getIndexedAddressOffset(
                AddrValue->getRawUValue()))
-      return hasValidRelocationAt(ValidDebugAddrRelocs, *AddressOffset,
-                                  *AddressOffset +
-                                      DIE.getDwarfUnit()->getAddressByteSize());
+      return hasValidRelocationAt(
+          ValidDebugAddrRelocs, *AddressOffset,
+          *AddressOffset + DIE.getDwarfUnit()->getAddressByteSize(), Verbose);
 
     Linker.reportWarning("no base offset for address table", SrcFileName);
     return std::nullopt;
diff --git a/llvm/tools/dsymutil/DwarfLinkerForBinary.h b/llvm/tools/dsymutil/DwarfLinkerForBinary.h
index 14b9625117c2d..53f9e183ebe88 100644
--- a/llvm/tools/dsymutil/DwarfLinkerForBinary.h
+++ b/llvm/tools/dsymutil/DwarfLinkerForBinary.h
@@ -192,18 +192,20 @@ class DwarfLinkerForBinary {
 
     /// Checks that there is a relocation in the \p Relocs array against a
     /// debug map entry between \p StartOffset and \p NextOffset.
+    /// Print debug output if \p Verbose is set.
/// /// \returns relocation value if relocation exist, otherwise std::nullopt. std::optional hasValidRelocationAt(const std::vector &Relocs, - uint64_t StartOffset, uint64_t EndOffset); + uint64_t StartOffset, uint64_t EndOffset, + bool Verbose); std::optional getExprOpAddressRelocAdjustment( DWARFUnit &U, const DWARFExpression::Operation &Op, - uint64_t StartOffset, uint64_t EndOffset) override; + uint64_t StartOffset, uint64_t EndOffset, bool Verbose) override; - std::optional - getSubprogramRelocAdjustment(const DWARFDie &DIE) override; + std::optional getSubprogramRelocAdjustment(const DWARFDie &DIE, + bool Verbose) override; std::optional getLibraryInstallName() override; diff --git a/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp b/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp index 19eb1bed7578b..bd17f3c4a6595 100644 --- a/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp +++ b/llvm/tools/llvm-dwarfutil/DebugInfoLinker.cpp @@ -80,8 +80,8 @@ class ObjFileAddressMap : public AddressesMap { // should be renamed into has valid address ranges bool hasValidRelocs() override { return HasValidAddressRanges; } - std::optional - getSubprogramRelocAdjustment(const DWARFDie &DIE) override { + std::optional getSubprogramRelocAdjustment(const DWARFDie &DIE, + bool Verbose) override { assert((DIE.getTag() == dwarf::DW_TAG_subprogram || DIE.getTag() == dwarf::DW_TAG_label) && "Wrong type of input die"); @@ -101,7 +101,7 @@ class ObjFileAddressMap : public AddressesMap { std::optional getExprOpAddressRelocAdjustment(DWARFUnit &U, const DWARFExpression::Operation &Op, - uint64_t, uint64_t) override { + uint64_t, uint64_t, bool Verbose) override { switch (Op.getCode()) { default: { assert(false && "Specified operation does not have address operand"); From 1715866a94b30f719f8db58591c9adfda6ff0ef6 Mon Sep 17 00:00:00 2001 From: Yi Wu <43659785+yi-wu-arm@users.noreply.github.com> Date: Tue, 23 Jan 2024 10:05:42 +0000 Subject: [PATCH 581/843] fix optional wait wrongly treated as false (#78149) --- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 31 +++++++++++-- .../execute_command_line-optional.f90 | 25 ++++++----- .../Lower/Intrinsics/execute_command_line.f90 | 43 ++++++++++++------- 3 files changed, 68 insertions(+), 31 deletions(-) diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index ac7d4fbe23e67..d781650069442 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -217,7 +217,7 @@ static constexpr IntrinsicHandler handlers[]{ {"execute_command_line", &I::genExecuteCommandLine, {{{"command", asBox}, - {"wait", asValue, handleDynamicOptional}, + {"wait", asAddr, handleDynamicOptional}, {"exitstat", asBox, handleDynamicOptional}, {"cmdstat", asBox, handleDynamicOptional}, {"cmdmsg", asBox, handleDynamicOptional}}}, @@ -2914,7 +2914,9 @@ IntrinsicLibrary::genEoshift(mlir::Type resultType, void IntrinsicLibrary::genExecuteCommandLine( llvm::ArrayRef args) { assert(args.size() == 5); + mlir::Value command = fir::getBase(args[0]); + // Optional arguments: wait, exitstat, cmdstat, cmdmsg. const fir::ExtendedValue &wait = args[1]; const fir::ExtendedValue &exitstat = args[2]; const fir::ExtendedValue &cmdstat = args[3]; @@ -2925,9 +2927,30 @@ void IntrinsicLibrary::genExecuteCommandLine( mlir::Type boxNoneTy = fir::BoxType::get(builder.getNoneType()); - mlir::Value waitBool = isStaticallyPresent(wait) - ? 
fir::getBase(wait) - : builder.createBool(loc, true); + mlir::Value waitBool; + if (isStaticallyAbsent(wait)) { + waitBool = builder.createBool(loc, true); + } else { + mlir::Type i1Ty = builder.getI1Type(); + mlir::Value waitAddr = fir::getBase(wait); + mlir::Value waitIsPresentAtRuntime = + builder.genIsNotNullAddr(loc, waitAddr); + waitBool = builder + .genIfOp(loc, {i1Ty}, waitIsPresentAtRuntime, + /*withElseRegion=*/true) + .genThen([&]() { + auto waitLoad = builder.create(loc, waitAddr); + mlir::Value cast = + builder.createConvert(loc, i1Ty, waitLoad); + builder.create(loc, cast); + }) + .genElse([&]() { + mlir::Value trueVal = builder.createBool(loc, true); + builder.create(loc, trueVal); + }) + .getResults()[0]; + } + mlir::Value exitstatBox = isStaticallyPresent(exitstat) ? fir::getBase(exitstat) diff --git a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 index e51c0e5fca300..0b75a20216af7 100644 --- a/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 +++ b/flang/test/Lower/Intrinsics/execute_command_line-optional.f90 @@ -12,7 +12,10 @@ subroutine all_args_optional(command, isWait, exitVal, cmdVal, msg) LOGICAL, OPTIONAL :: isWait ! Note: command is not optional in execute_command_line and must be present call execute_command_line(command, isWait, exitVal, cmdVal, msg) -! CHECK: %[[cmdstatDeclare:.*]] = fir.declare %[[cmdstatArg]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFall_args_optionalEcmdval"} : (!fir.ref) -> !fir.ref +! CHECK-NEXT: %[[c14:.*]] = arith.constant 14 : i32 +! CHECK-NEXT: %true = arith.constant true +! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64 +! CHECK-NEXT: %[[cmdstatDeclare:.*]] = fir.declare %[[cmdstatArg]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFall_args_optionalEcmdval"} : (!fir.ref) -> !fir.ref ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref>, index) ! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandUnbox]]#0 typeparams %[[commandUnbox]]#1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFall_args_optionalEcommand"} : (!fir.ref>, index) -> !fir.ref> ! CHECK-NEXT: %[[commandBoxTemp:.*]] = fir.emboxchar %[[commandDeclare]], %[[commandUnbox]]#1 : (!fir.ref>, index) -> !fir.boxchar<1> @@ -21,18 +24,10 @@ subroutine all_args_optional(command, isWait, exitVal, cmdVal, msg) ! CHECK-NEXT: %[[cmdmsgUnbox:.*]]:2 = fir.unboxchar %[[cmdmsgArg]] : (!fir.boxchar<1>) -> (!fir.ref>, index) ! CHECK-NEXT: %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgUnbox]]#0 typeparams %[[cmdmsgUnbox]]#1 {fortran_attrs = #fir.var_attrs, uniq_name = "_QFall_args_optionalEmsg"} : (!fir.ref>, index) -> !fir.ref> ! CHECK-NEXT: %[[cmdmsgBoxTemp:.*]] = fir.emboxchar %[[cmdmsgDeclare]], %[[cmdmsgUnbox]]#1 : (!fir.ref>, index) -> !fir.boxchar<1> -! CHECK-NEXT: %[[waitIsPresent:.*]] = fir.is_present %[[waitDeclare]] : (!fir.ref>) -> i1 ! CHECK-NEXT: %[[exitstatIsPresent:.*]] = fir.is_present %[[exitstatDeclare]] : (!fir.ref) -> i1 ! CHECK-NEXT: %[[cmdstatIsPresent:.*]] = fir.is_present %[[cmdstatDeclare]] : (!fir.ref) -> i1 ! CHECK-NEXT: %[[cmdmsgIsPresent:.*]] = fir.is_present %[[cmdmsgBoxTemp]] : (!fir.boxchar<1>) -> i1 ! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]] typeparams %[[commandUnbox]]#1 : (!fir.ref>, index) -> !fir.box> -! CHECK-NEXT: %[[waitLoaded:.*]] = fir.if %[[waitIsPresent]] -> (!fir.logical<4>) { -! CHECK-NEXT: %[[VAL_31:.*]] = fir.load %[[waitDeclare]] : !fir.ref> -! 
CHECK-NEXT: fir.result %[[VAL_31]] : !fir.logical<4> -! CHECK-NEXT: } else { -! CHECK-NEXT: %[[VAL_31:.*]] = fir.convert %false : (i1) -> !fir.logical<4> -! CHECK-NEXT: fir.result %[[VAL_31]] : !fir.logical<4> -! CHECK-NEXT: } ! CHECK-NEXT: %[[exitstatArgBox:.*]] = fir.embox %[[exitstatDeclare]] : (!fir.ref) -> !fir.box ! CHECK-NEXT: %[[absentBoxi32:.*]] = fir.absent !fir.box ! CHECK-NEXT: %[[exitstatBox:.*]] = arith.select %[[exitstatIsPresent]], %[[exitstatArgBox]], %[[absentBoxi32]] : !fir.box @@ -41,11 +36,19 @@ subroutine all_args_optional(command, isWait, exitVal, cmdVal, msg) ! CHECK-NEXT: %[[cmdmsgArgBox:.*]] = fir.embox %[[cmdmsgDeclare]] typeparams %[[cmdmsgUnbox]]#1 : (!fir.ref>, index) -> !fir.box> ! CHECK-NEXT: %[[absentBox:.*]] = fir.absent !fir.box> ! CHECK-NEXT: %[[cmdmsgBox:.*]] = arith.select %[[cmdmsgIsPresent]], %[[cmdmsgArgBox]], %[[absentBox]] : !fir.box> +! CHECK-NEXT: %[[waitCast:.*]] = fir.convert %[[waitDeclare]] : (!fir.ref>) -> i64 +! CHECK-NEXT: %[[waitPresent:.*]] = arith.cmpi ne, %[[waitCast]], %[[c0]] : i64 +! CHECK-NEXT: %[[wait:.*]] = fir.if %[[waitPresent]] -> (i1) { +! CHECK-NEXT: %[[waitLoaded:.*]] = fir.load %[[waitDeclare]] : !fir.ref> +! CHECK-NEXT: %[[waitTrueVal:.*]] = fir.convert %[[waitLoaded]] : (!fir.logical<4>) -> i1 +! CHECK-NEXT: fir.result %[[waitTrueVal]] : i1 +! CHECK-NEXT: } else { +! CHECK-NEXT: fir.result %true : i1 +! CHECK-NEXT: } ! CHECK: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box>) -> !fir.box -! CHECK-NEXT: %[[wait:.*]] = fir.convert %[[waitLoaded]] : (!fir.logical<4>) -> i1 ! CHECK-NEXT: %[[exitstat:.*]] = fir.convert %[[exitstatBox]] : (!fir.box) -> !fir.box ! CHECK-NEXT: %[[cmdstat:.*]] = fir.convert %[[cmdstatBox]] : (!fir.box) -> !fir.box ! CHECK-NEXT: %[[cmdmsg:.*]] = fir.convert %[[cmdmsgBox]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_30:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %[[wait]], %[[exitstat]], %[[cmdstat]], %[[cmdmsg]], %[[VAL_29:.*]], %c14_i32) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: %[[VAL_30:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %[[wait]], %[[exitstat]], %[[cmdstat]], %[[cmdmsg]], %[[VAL_29:.*]], %[[c14]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none ! CHECK-NEXT: return end subroutine all_args_optional diff --git a/flang/test/Lower/Intrinsics/execute_command_line.f90 b/flang/test/Lower/Intrinsics/execute_command_line.f90 index 1b65bbd5e1550..8aacd34346b4d 100644 --- a/flang/test/Lower/Intrinsics/execute_command_line.f90 +++ b/flang/test/Lower/Intrinsics/execute_command_line.f90 @@ -11,26 +11,37 @@ subroutine all_args(command, isWait, exitVal, cmdVal, msg) INTEGER :: exitVal, cmdVal LOGICAL :: isWait call execute_command_line(command, isWait, exitVal, cmdVal, msg) -! CHECK: %[[cmdstatsDeclear:.*]] = fir.declare %[[cmdstatArg]] {uniq_name = "_QFall_argsEcmdval"} : (!fir.ref) -> !fir.ref +! CHECK-NEXT: %[[c13:.*]] = arith.constant 13 : i32 +! CHECK-NEXT: %true = arith.constant true +! CHECK-NEXT: %[[c0:.*]] = arith.constant 0 : i64 +! CHECK-NEXT: %[[c30:.*]] = arith.constant 30 : index +! CHECK-NEXT: %[[cmdstatsDeclare:.*]] = fir.declare %[[cmdstatArg]] {uniq_name = "_QFall_argsEcmdval"} : (!fir.ref) -> !fir.ref ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[commandArg]] : (!fir.boxchar<1>) -> (!fir.ref>, index) ! CHECK-NEXT: %[[commandCast:.*]] = fir.convert %[[commandUnbox]]#0 : (!fir.ref>) -> !fir.ref> -! 
CHECK-NEXT: %[[commandDeclear:.*]] = fir.declare %[[commandCast]] typeparams %c30 {uniq_name = "_QFall_argsEcommand"} : (!fir.ref>, index) -> !fir.ref> -! CHECK-NEXT: %[[exitstatDeclear:.*]] = fir.declare %[[exitstatArg]] {uniq_name = "_QFall_argsEexitval"} : (!fir.ref) -> !fir.ref -! CHECK-NEXT: %[[waitDeclear:.*]] = fir.declare %[[waitArg]] {uniq_name = "_QFall_argsEiswait"} : (!fir.ref>) -> !fir.ref> +! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] {uniq_name = "_QFall_argsEcommand"} : (!fir.ref>, index) -> !fir.ref> +! CHECK-NEXT: %[[exitstatDeclare:.*]] = fir.declare %[[exitstatArg]] {uniq_name = "_QFall_argsEexitval"} : (!fir.ref) -> !fir.ref +! CHECK-NEXT: %[[waitDeclare:.*]] = fir.declare %[[waitArg]] {uniq_name = "_QFall_argsEiswait"} : (!fir.ref>) -> !fir.ref> ! CHECK-NEXT: %[[cmdmsgUnbox:.*]]:2 = fir.unboxchar %[[cmdmsgArg]] : (!fir.boxchar<1>) -> (!fir.ref>, index) ! CHECK-NEXT: %[[cmdmsgCast:.*]] = fir.convert %[[cmdmsgUnbox]]#0 : (!fir.ref>) -> !fir.ref> -! CHECK-NEXT: %[[cmdmsgDeclear:.*]] = fir.declare %[[cmdmsgCast]] typeparams %c30 {uniq_name = "_QFall_argsEmsg"} : (!fir.ref>, index) -> !fir.ref> -! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclear]] : (!fir.ref>) -> !fir.box> -! CHECK-NEXT: %[[waitLoaded:.*]] = fir.load %[[waitDeclear]] : !fir.ref> -! CHECK-NEXT: %[[exitstatBox:.*]] = fir.embox %[[exitstatDeclear]] : (!fir.ref) -> !fir.box -! CHECK-NEXT: %[[cmdstatBox:.*]] = fir.embox %[[cmdstatsDeclear]] : (!fir.ref) -> !fir.box -! CHECK-NEXT: %[[cmdmsgBox:.*]] = fir.embox %[[cmdmsgDeclear]] : (!fir.ref>) -> !fir.box> +! CHECK-NEXT: %[[cmdmsgDeclare:.*]] = fir.declare %[[cmdmsgCast]] typeparams %[[c30]] {uniq_name = "_QFall_argsEmsg"} : (!fir.ref>, index) -> !fir.ref> +! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]] : (!fir.ref>) -> !fir.box> +! CHECK-NEXT: %[[exitstatBox:.*]] = fir.embox %[[exitstatDeclare]] : (!fir.ref) -> !fir.box +! CHECK-NEXT: %[[cmdstatBox:.*]] = fir.embox %[[cmdstatsDeclare]] : (!fir.ref) -> !fir.box +! CHECK-NEXT: %[[cmdmsgBox:.*]] = fir.embox %[[cmdmsgDeclare]] : (!fir.ref>) -> !fir.box> +! CHECK-NEXT: %[[waitCast:.*]] = fir.convert %[[waitDeclare]] : (!fir.ref>) -> i64 +! CHECK-NEXT: %[[waitPresent:.*]] = arith.cmpi ne, %[[waitCast]], %[[c0]] : i64 +! CHECK-NEXT: %[[wait:.*]] = fir.if %[[waitPresent]] -> (i1) { +! CHECK-NEXT: %[[waitLoaded:.*]] = fir.load %[[waitDeclare]] : !fir.ref> +! CHECK-NEXT: %[[waitTrueVal:.*]] = fir.convert %[[waitLoaded]] : (!fir.logical<4>) -> i1 +! CHECK-NEXT: fir.result %[[waitTrueVal]] : i1 +! CHECK-NEXT: } else { +! CHECK-NEXT: fir.result %true : i1 +! CHECK-NEXT: } ! CHECK: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box>) -> !fir.box -! CHECK-NEXT: %[[wait:.*]] = fir.convert %[[waitLoaded]] : (!fir.logical<4>) -> i1 ! CHECK-NEXT: %[[exitstat:.*]] = fir.convert %[[exitstatBox]] : (!fir.box) -> !fir.box ! CHECK-NEXT: %[[cmdstat:.*]] = fir.convert %[[cmdstatBox]] : (!fir.box) -> !fir.box ! CHECK-NEXT: %[[cmdmsg:.*]] = fir.convert %[[cmdmsgBox]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_21:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %[[wait]], %[[exitstat]], %[[cmdstat]], %[[cmdmsg]], %[[VAL_20:.*]], %c13_i32) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: %[[VAL_22:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %[[wait]], %[[exitstat]], %[[cmdstat]], %[[cmdmsg]], %[[VAL_20:.*]], %[[c13]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none ! 
CHECK-NEXT: return end subroutine all_args @@ -39,15 +50,15 @@ end subroutine all_args subroutine only_command_default_wait_true(command) CHARACTER(30) :: command call execute_command_line(command) -! CHECK-NEXT: %c41_i32 = arith.constant 41 : i32 +! CHECK-NEXT: %[[c52:.*]] = arith.constant 52 : i32 ! CHECK-NEXT: %true = arith.constant true -! CHECK-NEXT: %c30 = arith.constant 30 : index +! CHECK-NEXT: %[[c30:.*]] = arith.constant 30 : index ! CHECK-NEXT: %[[commandUnbox:.*]]:2 = fir.unboxchar %[[cmdArg]] : (!fir.boxchar<1>) -> (!fir.ref>, index) ! CHECK-NEXT: %[[commandCast:.*]] = fir.convert %[[commandUnbox]]#0 : (!fir.ref>) -> !fir.ref> -! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %c30 {uniq_name = "_QFonly_command_default_wait_trueEcommand"} : (!fir.ref>, index) -> !fir.ref> +! CHECK-NEXT: %[[commandDeclare:.*]] = fir.declare %[[commandCast]] typeparams %[[c30]] {uniq_name = "_QFonly_command_default_wait_trueEcommand"} : (!fir.ref>, index) -> !fir.ref> ! CHECK-NEXT: %[[commandBox:.*]] = fir.embox %[[commandDeclare]] : (!fir.ref>) -> !fir.box> ! CHECK-NEXT: %[[absent:.*]] = fir.absent !fir.box ! CHECK: %[[command:.*]] = fir.convert %[[commandBox]] : (!fir.box>) -> !fir.box -! CHECK: %[[VAL_21:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %true, %[[absent]], %[[absent]], %[[absent]], %[[VAL_7:.*]], %c41_i32) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none +! CHECK: %[[VAL_8:.*]] = fir.call @_FortranAExecuteCommandLine(%[[command]], %true, %[[absent]], %[[absent]], %[[absent]], %[[VAL_7:.*]], %[[c52]]) fastmath : (!fir.box, i1, !fir.box, !fir.box, !fir.box, !fir.ref, i32) -> none ! CHECK-NEXT: return end subroutine only_command_default_wait_true From fd9b33754198972b835f05ce58d8bb9f0ac8ff7e Mon Sep 17 00:00:00 2001 From: tltao Date: Tue, 23 Jan 2024 05:09:43 -0500 Subject: [PATCH 582/843] [TLI] Remove leftover assert in TargetLibraryInfoImpl initialize (#79056) Remove unnecessary assert for a sorted StandardNames after implementation of getLibFunc is changed from binary search to a DenseMap Lookup in commit 7d950f040e3da66ec83f91b58d8c2220483d6335. The original getLibFunc binary search implementation is in commit c740e3f0d1ca9690100160ed2708d97d3c6562f0. Co-authored-by: Tony Tao --- llvm/lib/Analysis/TargetLibraryInfo.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp index 58749e559040a..25951d2a7fe63 100644 --- a/llvm/lib/Analysis/TargetLibraryInfo.cpp +++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp @@ -163,12 +163,6 @@ bool TargetLibraryInfoImpl::isCallingConvCCompatible(Function *F) { /// triple gets a sane set of defaults. static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T, ArrayRef StandardNames) { - // Verify that the StandardNames array is in alphabetical order. 
- assert( - llvm::is_sorted(StandardNames, - [](StringRef LHS, StringRef RHS) { return LHS < RHS; }) && - "TargetLibraryInfoImpl function names must be sorted"); - // Set IO unlocked variants as unavailable // Set them as available per system below TLI.setUnavailable(LibFunc_getc_unlocked); From d386c40b23798d6b80de5fea61b541dc8cbae36c Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Tue, 23 Jan 2024 18:52:09 +0900 Subject: [PATCH 583/843] test/llvm-cov: Regenerate mcdc-maxbs.o w/o zlib (#78963) --- llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.o | Bin 4648 -> 4632 bytes llvm/test/tools/llvm-cov/mcdc-maxbs.test | 1 + 2 files changed, 1 insertion(+) diff --git a/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.o b/llvm/test/tools/llvm-cov/Inputs/mcdc-maxbs.o index 269902224ccae9ca01c2b858f020410d7f221ce5..8e592cf6563da92b98429e83244084649d49fd81 100644 GIT binary patch delta 246 zcmZ3XGDBs82BXAA&D)F|_jVb_JM9(zI+=&*1cGyi$v{?+0Rq^76cZl<15a*pO0sTl zVntH1Uh-r^<_DY%#idD%xrv#1o0VAN*cl&9Zse|KteDKmW6xMH*^vjtKFHI~s4}^e z$9%E_kI3XHygH0JlMnLRPd>oIF`0wUg3(~IBcDA`T_ce6n0%1Wei9#0g5QF1$7Dx- zd&UKm8-e7G$p?Yt39zDy$&Lc{oE$6=w{cDG6fkELntV{eo>6-;qo6&=Kp?qdawCwu QG5MgN{p1FIpou~X0G?$=+yDRo delta 271 zcmbQCvO;Bo2BXGC&D)F|akV1v6<1}=nasm<0>Qb%WS}a^00C@3iiuCG;#Ts5lfHhQ z*UtK`@zA>F&Bh?d%~CtrklB5*12f0u3g(G|0u{Hk^?Z}g_?|h#%)o2S^l`HgOB_4n zr^$uf^^6^pKXThoF5nfJ?81{X`2#P<j5?DmdCVCnOfKZLXEd3-k=K6m2R@F; zKX@$|EhZcC*)ujwE(DShlQ#m%6_Y;#$pe!O`GKkgfmZTc0C^kv?HMnCRdq}@6tL$M vV1YPWXmX{1Iiu3#jRN+J){{R1Nr%aXg7%C%CKm$97n3&v$pe!=3aSGDp4L`? diff --git a/llvm/test/tools/llvm-cov/mcdc-maxbs.test b/llvm/test/tools/llvm-cov/mcdc-maxbs.test index ad5bc828529ce..bbb5fb8b6059a 100644 --- a/llvm/test/tools/llvm-cov/mcdc-maxbs.test +++ b/llvm/test/tools/llvm-cov/mcdc-maxbs.test @@ -33,6 +33,7 @@ cd %S/Inputs # or copy %S/Inputs/mcdc-maxbs.c into the working directory clang -O3 -fcoverage-mcdc -fprofile-instr-generate \ -fcoverage-mapping -fcoverage-compilation-dir=. \ + -mllvm -enable-name-compression=false \ mcdc-maxbs.c -c -o mcdc-maxbs.o # Instructions for regenerating the test vector: From bb3a515f95513571886bb900d4ddefb9ea5b4bad Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Tue, 23 Jan 2024 10:07:10 +0000 Subject: [PATCH 584/843] [AMDGPU][NFC] Refine determining the vdata operand in MUBUF_Load_Pseudo<>. A follow-up from . --- llvm/lib/Target/AMDGPU/BUFInstructions.td | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td index 87247a134848b..ae0955f0cf6a4 100644 --- a/llvm/lib/Target/AMDGPU/BUFInstructions.td +++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td @@ -485,8 +485,8 @@ class MUBUF_Load_Pseudo pattern=[], // Workaround bug bz30254 int addrKindCopy = addrKind, - RegisterOperand vdata_rc = getVregSrcForVT.ret, - RegisterOperand vdata_op = getLdStVDataRegisterOperand.ret> + RegisterClass vdata_rc = getVregSrcForVT.ret.RegClass, + RegisterOperand vdata_op = getLdStVDataRegisterOperand.ret> : MUBUF_Pseudo.ret, From 9629c73aeb4609f58aa9edb0b87d18dd9e8fecc0 Mon Sep 17 00:00:00 2001 From: Dmitri Gribenko Date: Tue, 23 Jan 2024 11:43:26 +0100 Subject: [PATCH 585/843] [libc++] Remove a duplicated definition of _LIBCPP_NOINLINE (#79114) I failed to delete the old definition as a part of https://github.com/llvm/llvm-project/pull/73838. 
--- libcxx/include/__config | 6 ------ 1 file changed, 6 deletions(-) diff --git a/libcxx/include/__config b/libcxx/include/__config index eab37e7576bdc..9557e8e8cf97f 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -1235,12 +1235,6 @@ __sanitizer_verify_double_ended_contiguous_container(const void*, const void*, c # define _LIBCPP_CONSTINIT # endif -# if __has_attribute(__noinline__) -# define _LIBCPP_NOINLINE __attribute__((__noinline__)) -# else -# define _LIBCPP_NOINLINE -# endif - # if defined(__CUDACC__) || defined(__CUDA_ARCH__) || defined(__CUDA_LIBDEVICE__) // The CUDA SDK contains an unfortunate definition for the __noinline__ macro, // which breaks the regular __attribute__((__noinline__)) syntax. Therefore, From 3bc86bf3bf742506818cf4d94c9227e4afed6f19 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Tue, 23 Jan 2024 11:48:28 +0100 Subject: [PATCH 586/843] [libc] Remove unnecessary `FPBits` functions and properties (#79113) This patch reduces the surface of `FPBits`. --- libc/fuzzing/stdlib/strtofloat_fuzz.cpp | 2 +- .../FPUtil/DivisionAndRemainderOperations.h | 2 +- libc/src/__support/FPUtil/FPBits.h | 51 +++--------- libc/src/__support/FPUtil/Hypot.h | 4 +- .../__support/FPUtil/ManipulationFunctions.h | 24 +++--- libc/src/__support/FPUtil/NormalFloat.h | 2 +- libc/src/__support/FPUtil/dyadic_float.h | 2 +- .../src/__support/FPUtil/except_value_utils.h | 6 +- libc/src/__support/FPUtil/generic/FMA.h | 10 +-- libc/src/__support/FPUtil/generic/FMod.h | 2 +- libc/src/__support/FPUtil/generic/sqrt.h | 18 +++-- .../FPUtil/generic/sqrt_80_bit_long_double.h | 8 +- .../FPUtil/x86_64/NextAfterLongDouble.h | 58 +++++++------ libc/src/__support/common.h | 1 - libc/src/__support/str_to_float.h | 4 +- libc/src/math/generic/acosf.cpp | 2 +- libc/src/math/generic/acoshf.cpp | 2 +- libc/src/math/generic/asinf.cpp | 2 +- libc/src/math/generic/atanhf.cpp | 4 +- libc/src/math/generic/cosf.cpp | 2 +- libc/src/math/generic/coshf.cpp | 6 +- libc/src/math/generic/exp.cpp | 4 +- libc/src/math/generic/exp10.cpp | 4 +- libc/src/math/generic/exp10f_impl.h | 6 +- libc/src/math/generic/exp2.cpp | 4 +- libc/src/math/generic/exp2f_impl.h | 6 +- libc/src/math/generic/expf.cpp | 4 +- libc/src/math/generic/expm1.cpp | 4 +- libc/src/math/generic/expm1f.cpp | 4 +- libc/src/math/generic/log.cpp | 11 +-- libc/src/math/generic/log10.cpp | 11 +-- libc/src/math/generic/log10f.cpp | 8 +- libc/src/math/generic/log1p.cpp | 7 +- libc/src/math/generic/log1pf.cpp | 4 +- libc/src/math/generic/log2.cpp | 11 +-- libc/src/math/generic/log2f.cpp | 8 +- libc/src/math/generic/logf.cpp | 9 ++- libc/src/math/generic/powf.cpp | 13 +-- libc/src/math/generic/sincosf.cpp | 4 +- libc/src/math/generic/sinf.cpp | 2 +- libc/src/math/generic/sinhf.cpp | 6 +- libc/src/math/generic/tanf.cpp | 2 +- libc/test/UnitTest/FPMatcher.h | 40 ++++----- .../test/src/__support/FPUtil/fpbits_test.cpp | 81 ++++++++----------- libc/test/src/math/FDimTest.h | 2 +- libc/test/src/math/FmaTest.h | 37 +++++---- libc/test/src/math/HypotTest.h | 43 +++++----- libc/test/src/math/ILogbTest.h | 22 ++--- libc/test/src/math/LdExpTest.h | 2 +- libc/test/src/math/NextAfterTest.h | 10 +-- libc/test/src/math/RIntTest.h | 21 ++--- libc/test/src/math/RemQuoTest.h | 25 +++--- libc/test/src/math/RoundToIntegerTest.h | 22 ++--- .../BinaryOpSingleOutputDiff.h | 16 ++-- .../SingleInputSingleOutputDiff.h | 7 +- libc/test/src/math/smoke/FDimTest.h | 2 +- libc/test/src/math/smoke/FmaTest.h | 16 ++-- libc/test/src/math/smoke/HypotTest.h | 20 ++--- 
libc/test/src/math/smoke/ILogbTest.h | 18 ++--- libc/test/src/math/smoke/LdExpTest.h | 2 +- libc/test/src/math/smoke/NextAfterTest.h | 14 ++-- libc/test/src/math/smoke/NextTowardTest.h | 22 ++--- libc/test/src/math/smoke/RIntTest.h | 2 +- libc/test/src/math/smoke/RemQuoTest.h | 2 +- libc/test/src/math/smoke/RoundToIntegerTest.h | 22 ++--- libc/test/src/stdio/sprintf_test.cpp | 32 +++++--- libc/test/src/stdio/sscanf_test.cpp | 21 +++-- libc/utils/MPFRWrapper/MPFRUtils.cpp | 2 +- 68 files changed, 426 insertions(+), 421 deletions(-) diff --git a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp index affef6fcf549e..b773043bda67d 100644 --- a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp +++ b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp @@ -28,7 +28,7 @@ using LIBC_NAMESPACE::fputil::FPBits; // exponent. Subnormals have a lower effective precision since they don't // necessarily use all of the bits of the mantissa. template inline constexpr int effective_precision(int exponent) { - const int full_precision = FPBits::MANTISSA_PRECISION; + const int full_precision = FPBits::FRACTION_LEN + 1; // This is intended to be 0 when the exponent is the lowest normal and // increase as the exponent's magnitude increases. diff --git a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h index 1798310c3e31e..ef9593a42b005 100644 --- a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h +++ b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h @@ -31,7 +31,7 @@ LIBC_INLINE T remquo(T x, T y, int &q) { if (ybits.is_nan()) return y; if (xbits.is_inf() || ybits.is_zero()) - return FPBits::build_quiet_nan(1); + return FPBits::build_quiet_nan(fputil::Sign::POS, 1).get_val(); if (xbits.is_zero()) { q = 0; diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index 2465158bb2cdf..0a79b505ecbe1 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -390,7 +390,7 @@ struct FPRepSem : public FPStorage { return exp_bits() == encode(BiasedExp::BITS_ALL_ZEROES()); } LIBC_INLINE constexpr bool is_normal() const { - return is_finite() && !UP::is_subnormal(); + return is_finite() && !is_subnormal(); } // Returns the mantissa with the implicit bit set iff the current // value is a valid normal number. @@ -556,6 +556,14 @@ struct FPRep : public FPRepSem { using UP::FRACTION_MASK; using UP::SIGN_MASK; + // Comparison + LIBC_INLINE constexpr friend bool operator==(FPRep a, FPRep b) { + return a.uintval() == b.uintval(); + } + LIBC_INLINE constexpr friend bool operator!=(FPRep a, FPRep b) { + return a.uintval() != b.uintval(); + } + // Representation LIBC_INLINE constexpr StorageType uintval() const { return bits & FP_MASK; } LIBC_INLINE constexpr void set_uintval(StorageType value) { @@ -698,16 +706,6 @@ struct FPBits final : public internal::FPRep(), FPBits> { using UP::bits; // Constants. 
- LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION = - UP::FRACTION_LEN + 1; - LIBC_INLINE_VAR static constexpr StorageType MIN_NORMAL = - UP::min_normal(Sign::POS).uintval(); - LIBC_INLINE_VAR static constexpr StorageType MAX_NORMAL = - UP::max_normal(Sign::POS).uintval(); - LIBC_INLINE_VAR static constexpr StorageType MIN_SUBNORMAL = - UP::min_subnormal(Sign::POS).uintval(); - LIBC_INLINE_VAR static constexpr StorageType MAX_SUBNORMAL = - UP::max_subnormal(Sign::POS).uintval(); LIBC_INLINE_VAR static constexpr int MAX_BIASED_EXPONENT = (1 << UP::EXP_LEN) - 1; @@ -731,37 +729,6 @@ struct FPBits final : public internal::FPRep(), FPBits> { LIBC_INLINE constexpr explicit operator T() const { return get_val(); } - // Methods below this are used by tests. - // TODO: inline and remove. - LIBC_INLINE static constexpr T one(Sign sign = Sign::POS) { - return T(UP::one(sign)); - } - LIBC_INLINE static constexpr T zero(Sign sign = Sign::POS) { - return T(UP::zero(sign)); - } - LIBC_INLINE static constexpr T inf(Sign sign = Sign::POS) { - return T(UP::inf(sign)); - } - LIBC_INLINE static constexpr T min_normal() { - return T(UP::min_normal(Sign::POS)); - } - LIBC_INLINE static constexpr T max_normal() { - return T(UP::max_normal(Sign::POS)); - } - LIBC_INLINE static constexpr T min_denormal() { - return T(UP::min_subnormal(Sign::POS)); - } - LIBC_INLINE static constexpr T max_denormal() { - return T(UP::max_subnormal(Sign::POS)); - } - LIBC_INLINE static constexpr T build_nan(StorageType v) { - return T(UP::build_nan(Sign::POS, v)); - } - LIBC_INLINE static constexpr T build_quiet_nan(StorageType v, - Sign sign = Sign::POS) { - return T(UP::build_quiet_nan(sign, v)); - } - // TODO: Use an uint32_t for 'biased_exp'. LIBC_INLINE static constexpr FPBits create_value(Sign sign, StorageType biased_exp, StorageType mantissa) { diff --git a/libc/src/__support/FPUtil/Hypot.h b/libc/src/__support/FPUtil/Hypot.h index c38a40dfb0898..82237dec09e42 100644 --- a/libc/src/__support/FPUtil/Hypot.h +++ b/libc/src/__support/FPUtil/Hypot.h @@ -197,7 +197,7 @@ LIBC_INLINE T hypot(T x, T y) { if (int round_mode = quick_get_round(); round_mode == FE_TONEAREST || round_mode == FE_UPWARD) return T(FPBits_t::inf()); - return T(FPBits_t(FPBits_t::MAX_NORMAL)); + return T(FPBits_t::max_normal()); } } else { // For denormal result, we simply move the leading bit of the result to @@ -254,7 +254,7 @@ LIBC_INLINE T hypot(T x, T y) { if (out_exp >= FPBits_t::MAX_BIASED_EXPONENT) { if (round_mode == FE_TONEAREST || round_mode == FE_UPWARD) return T(FPBits_t::inf()); - return T(FPBits_t(FPBits_t::MAX_NORMAL)); + return T(FPBits_t::max_normal()); } } diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h index 81c8281f3c7bb..d7114625a9b31 100644 --- a/libc/src/__support/FPUtil/ManipulationFunctions.h +++ b/libc/src/__support/FPUtil/ManipulationFunctions.h @@ -108,7 +108,7 @@ LIBC_INLINE T logb(T x) { return x; } else if (bits.is_inf()) { // Return positive infinity. - return T(FPBits::inf(Sign::POS)); + return T(FPBits::inf()); } NormalFloat normal(bits); @@ -127,7 +127,7 @@ LIBC_INLINE T ldexp(T x, int exp) { // that adding |exp| to it does not lead to integer rollover. But, if |exp| // value is larger the exponent range for type T, then we can return infinity // early. 
Because the result of the ldexp operation can be a subnormal number, - // we need to accommodate the (mantissaWidht + 1) worth of shift in + // we need to accommodate the (mantissaWidth + 1) worth of shift in // calculating the limit. int exp_limit = FPBits::MAX_BIASED_EXPONENT + FPBits::FRACTION_LEN + 1; if (exp > exp_limit) @@ -164,26 +164,22 @@ LIBC_INLINE T nextafter(T from, U to) { return static_cast(to); using StorageType = typename FPBits::StorageType; - StorageType int_val = from_bits.uintval(); - if (from != FPBits::zero()) { - if ((static_cast(from) < to) == (from > FPBits::zero())) { - ++int_val; + if (from != T(0)) { + if ((static_cast(from) < to) == (from > T(0))) { + from_bits = FPBits(StorageType(from_bits.uintval() + 1)); } else { - --int_val; + from_bits = FPBits(StorageType(from_bits.uintval() - 1)); } } else { - int_val = FPBits::MIN_SUBNORMAL; - if (to_bits.is_neg()) - int_val |= FPBits::SIGN_MASK; + from_bits = FPBits::min_subnormal(to_bits.sign()); } - StorageType exponent_bits = int_val & FPBits::EXP_MASK; - if (exponent_bits == StorageType(0)) + if (from_bits.is_subnormal()) raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); - else if (exponent_bits == FPBits::EXP_MASK) + else if (from_bits.is_inf()) raise_except_if_required(FE_OVERFLOW | FE_INEXACT); - return cpp::bit_cast(int_val); + return from_bits.get_val(); } } // namespace fputil diff --git a/libc/src/__support/FPUtil/NormalFloat.h b/libc/src/__support/FPUtil/NormalFloat.h index cfa9e14175105..fa4da33b5b17f 100644 --- a/libc/src/__support/FPUtil/NormalFloat.h +++ b/libc/src/__support/FPUtil/NormalFloat.h @@ -215,7 +215,7 @@ template <> LIBC_INLINE NormalFloat::operator long double() const { // Max exponent is of the form 0xFF...E. That is why -2 and not -1. constexpr int MAX_EXPONENT_VALUE = (1 << LDBits::EXP_LEN) - 2; if (biased_exponent > MAX_EXPONENT_VALUE) { - return LDBits::inf(sign); + return LDBits::inf(sign).get_val(); } FPBits result(0.0l); diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 5449f5561d569..888d7ffec241e 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -93,7 +93,7 @@ template struct DyadicFloat { return 0.0; // Assume that it is normalized, and output is also normal. - constexpr uint32_t PRECISION = FPBits::MANTISSA_PRECISION; + constexpr uint32_t PRECISION = FPBits::FRACTION_LEN + 1; using output_bits_t = typename FPBits::StorageType; int exp_hi = exponent + static_cast((Bits - 1) + FPBits::EXP_BIAS); diff --git a/libc/src/__support/FPUtil/except_value_utils.h b/libc/src/__support/FPUtil/except_value_utils.h index 89849540315f6..1e0381194009d 100644 --- a/libc/src/__support/FPUtil/except_value_utils.h +++ b/libc/src/__support/FPUtil/except_value_utils.h @@ -102,15 +102,13 @@ template struct ExceptValues { // Helper functions to set results for exceptional cases. 
template LIBC_INLINE T round_result_slightly_down(T value_rn) { volatile T tmp = value_rn; - const T MIN_NORMAL = FPBits::min_normal(); - tmp = tmp - MIN_NORMAL; + tmp -= FPBits::min_normal().get_val(); return tmp; } template LIBC_INLINE T round_result_slightly_up(T value_rn) { volatile T tmp = value_rn; - const T MIN_NORMAL = FPBits::min_normal(); - tmp = tmp + MIN_NORMAL; + tmp += FPBits::min_normal().get_val(); return tmp; } diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h index 6285cac1983d1..5c36463ea5021 100644 --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -130,9 +130,9 @@ template <> LIBC_INLINE double fma(double x, double y, double z) { return x * y + z; // Extract mantissa and append hidden leading bits. - UInt128 x_mant = x_bits.get_mantissa() | FPBits::MIN_NORMAL; - UInt128 y_mant = y_bits.get_mantissa() | FPBits::MIN_NORMAL; - UInt128 z_mant = z_bits.get_mantissa() | FPBits::MIN_NORMAL; + UInt128 x_mant = x_bits.get_explicit_mantissa(); + UInt128 y_mant = y_bits.get_explicit_mantissa(); + UInt128 z_mant = z_bits.get_explicit_mantissa(); // If the exponent of the product x*y > the exponent of z, then no extra // precision beside the entire product x*y is needed. On the other hand, when @@ -255,9 +255,7 @@ template <> LIBC_INLINE double fma(double x, double y, double z) { if ((round_mode == FE_TOWARDZERO) || (round_mode == FE_UPWARD && prod_sign.is_neg()) || (round_mode == FE_DOWNWARD && prod_sign.is_pos())) { - result = FPBits::MAX_NORMAL; - return prod_sign.is_neg() ? -cpp::bit_cast(result) - : cpp::bit_cast(result); + return FPBits::max_normal(prod_sign).get_val(); } return static_cast(FPBits::inf(prod_sign)); } diff --git a/libc/src/__support/FPUtil/generic/FMod.h b/libc/src/__support/FPUtil/generic/FMod.h index f4000b97751ef..18355b801dbc7 100644 --- a/libc/src/__support/FPUtil/generic/FMod.h +++ b/libc/src/__support/FPUtil/generic/FMod.h @@ -124,7 +124,7 @@ template struct FModExceptionalInputHandler { LIBC_INLINE static bool pre_check(T x, T y, T &out) { using FPB = fputil::FPBits; - const T quiet_nan = FPB::build_quiet_nan(0); + const T quiet_nan = FPB::build_quiet_nan().get_val(); FPB sx(x), sy(y); if (LIBC_LIKELY(!sy.is_zero() && !sy.is_inf_or_nan() && !sx.is_inf_or_nan())) { diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h index 21ae9d081d3f1..0a0690ec1463b 100644 --- a/libc/src/__support/FPUtil/generic/sqrt.h +++ b/libc/src/__support/FPUtil/generic/sqrt.h @@ -71,15 +71,19 @@ LIBC_INLINE cpp::enable_if_t, T> sqrt(T x) { return x86::sqrt(x); } else { // IEEE floating points formats. 
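The round_result_slightly_down/up helpers above implement a rounding nudge: the smallest normal number is far below one ULP of any result these routines return, so under round-to-nearest the addition rounds back to the original value while still raising FE_INEXACT, and under a directed mode it moves the result one ULP in the wanted direction; the volatile temporary keeps the compiler from folding the arithmetic away. A sketch against the standard headers, assuming the dynamic rounding mode is honored at run time:

#include <cfenv>
#include <cfloat>
#include <cstdio>

// Adding FLT_MIN (far below 1 ULP of 1.5f) leaves the value unchanged under
// round-to-nearest but bumps it one ULP under upward rounding.
float round_slightly_up(float value) {
  volatile float tmp = value;
  tmp = tmp + FLT_MIN;
  return tmp;
}

int main() {
  std::fesetround(FE_TONEAREST);
  std::printf("%a\n", round_slightly_up(1.5f)); // 0x1.8p+0 (unchanged)
  std::fesetround(FE_UPWARD);
  std::printf("%a\n", round_slightly_up(1.5f)); // 0x1.800002p+0 (one ULP up)
  return 0;
}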
- using StorageType = typename FPBits::StorageType; - constexpr StorageType ONE = StorageType(1) << FPBits::FRACTION_LEN; + using Sign = fputil::Sign; + using FPBits_t = typename fputil::FPBits; + using StorageType = typename FPBits_t::StorageType; + constexpr StorageType ONE = StorageType(1) << FPBits_t::FRACTION_LEN; + constexpr auto FLT_NAN = + FPBits_t::build_quiet_nan(Sign::POS, ONE >> 1).get_val(); - FPBits bits(x); + FPBits_t bits(x); if (bits.is_inf_or_nan()) { if (bits.is_neg() && (bits.get_mantissa() == 0)) { // sqrt(-Inf) = NaN - return FPBits::build_quiet_nan(ONE >> 1); + return FLT_NAN; } else { // sqrt(NaN) = NaN // sqrt(+Inf) = +Inf @@ -91,7 +95,7 @@ LIBC_INLINE cpp::enable_if_t, T> sqrt(T x) { return x; } else if (bits.is_neg()) { // sqrt( negative numbers ) = NaN - return FPBits::build_quiet_nan(ONE >> 1); + return FLT_NAN; } else { int x_exp = bits.get_exponent(); StorageType x_mant = bits.get_mantissa(); @@ -145,10 +149,10 @@ LIBC_INLINE cpp::enable_if_t, T> sqrt(T x) { } // Remove hidden bit and append the exponent field. - x_exp = ((x_exp >> 1) + FPBits::EXP_BIAS); + x_exp = ((x_exp >> 1) + FPBits_t::EXP_BIAS); y = (y - ONE) | - (static_cast(x_exp) << FPBits::FRACTION_LEN); + (static_cast(x_exp) << FPBits_t::FRACTION_LEN); switch (quick_get_round()) { case FE_TONEAREST: diff --git a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h index 4f8d136938f56..b0a3776029ca7 100644 --- a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h +++ b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h @@ -38,14 +38,16 @@ LIBC_INLINE long double sqrt(long double x); LIBC_INLINE long double sqrt(long double x) { using LDBits = FPBits; using StorageType = typename LDBits::StorageType; + using Sign = fputil::Sign; constexpr StorageType ONE = StorageType(1) << int(LDBits::FRACTION_LEN); + constexpr auto LDNAN = LDBits::build_quiet_nan(Sign::POS, ONE >> 1).get_val(); - FPBits bits(x); + LDBits bits(x); if (bits.is_inf_or_nan()) { if (bits.is_neg() && (bits.get_mantissa() == 0)) { // sqrt(-Inf) = NaN - return LDBits::build_quiet_nan(ONE >> 1); + return LDNAN; } else { // sqrt(NaN) = NaN // sqrt(+Inf) = +Inf @@ -57,7 +59,7 @@ LIBC_INLINE long double sqrt(long double x) { return x; } else if (bits.is_neg()) { // sqrt( negative numbers ) = NaN - return LDBits::build_quiet_nan(ONE >> 1); + return LDNAN; } else { int x_exp = bits.get_explicit_exponent(); StorageType x_mant = bits.get_mantissa(); diff --git a/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h b/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h index 512f5de4e7931..d1c76ba954b93 100644 --- a/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h +++ b/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h @@ -43,16 +43,18 @@ LIBC_INLINE long double nextafter(long double from, long double to) { } using StorageType = FPBits::StorageType; - constexpr StorageType SIGN_VAL = (StorageType(1) << 79); + constexpr StorageType FRACTION_MASK = FPBits::FRACTION_MASK; - StorageType int_val = from_bits.uintval(); - if (from < 0.0l) { - if (from > to) { - if (int_val == (SIGN_VAL + FPBits::MAX_SUBNORMAL)) { + // StorageType int_val = from_bits.uintval(); + if (from == 0.0l) { // +0.0 / -0.0 + from_bits = FPBits::min_subnormal(from > to ? 
Sign::NEG : Sign::POS); + } else if (from < 0.0l) { + if (to < from) { // toward -inf + if (from_bits == FPBits::max_subnormal(Sign::NEG)) { // We deal with normal/subnormal boundary separately to avoid // dealing with the implicit bit. - int_val = SIGN_VAL + FPBits::MIN_NORMAL; - } else if ((int_val & FRACTION_MASK) == FRACTION_MASK) { + from_bits = FPBits::min_normal(Sign::NEG); + } else if (from_bits.get_mantissa() == FRACTION_MASK) { from_bits.set_mantissa(0); // Incrementing exponent might overflow the value to infinity, // which is what is expected. Since NaNs are handling separately, @@ -62,45 +64,40 @@ LIBC_INLINE long double nextafter(long double from, long double to) { raise_except_if_required(FE_OVERFLOW | FE_INEXACT); return from_bits.get_val(); } else { - ++int_val; + from_bits = FPBits(StorageType(from_bits.uintval() + 1)); } - } else { - if (int_val == (SIGN_VAL + FPBits::MIN_NORMAL)) { + } else { // toward +inf + if (from_bits == FPBits::min_normal(Sign::NEG)) { // We deal with normal/subnormal boundary separately to avoid // dealing with the implicit bit. - int_val = SIGN_VAL + FPBits::MAX_SUBNORMAL; - } else if ((int_val & FRACTION_MASK) == 0) { + from_bits = FPBits::max_subnormal(Sign::NEG); + } else if (from_bits.get_mantissa() == 0) { from_bits.set_mantissa(FRACTION_MASK); // from == 0 is handled separately so decrementing the exponent will not // lead to underflow. from_bits.set_biased_exponent(from_bits.get_biased_exponent() - 1); return from_bits.get_val(); } else { - --int_val; + from_bits = FPBits(StorageType(from_bits.uintval() - 1)); } } - } else if (from == 0.0l) { - if (from > to) - int_val = SIGN_VAL + 1; - else - int_val = 1; } else { - if (from > to) { - if (int_val == FPBits::MIN_NORMAL) { - int_val = FPBits::MAX_SUBNORMAL; - } else if ((int_val & FRACTION_MASK) == 0) { + if (to < from) { // toward -inf + if (from_bits == FPBits::min_normal(Sign::POS)) { + from_bits = FPBits::max_subnormal(Sign::POS); + } else if (from_bits.get_mantissa() == 0) { from_bits.set_mantissa(FRACTION_MASK); // from == 0 is handled separately so decrementing the exponent will not // lead to underflow. from_bits.set_biased_exponent(from_bits.get_biased_exponent() - 1); return from_bits.get_val(); } else { - --int_val; + from_bits = FPBits(StorageType(from_bits.uintval() - 1)); } - } else { - if (int_val == FPBits::MAX_SUBNORMAL) { - int_val = FPBits::MIN_NORMAL; - } else if ((int_val & FRACTION_MASK) == FRACTION_MASK) { + } else { // toward +inf + if (from_bits == FPBits::max_subnormal(Sign::POS)) { + from_bits = FPBits::min_normal(Sign::POS); + } else if (from_bits.get_mantissa() == FRACTION_MASK) { from_bits.set_mantissa(0); // Incrementing exponent might overflow the value to infinity, // which is what is expected. 
Since NaNs are handling separately, @@ -110,16 +107,15 @@ LIBC_INLINE long double nextafter(long double from, long double to) { raise_except_if_required(FE_OVERFLOW | FE_INEXACT); return from_bits.get_val(); } else { - ++int_val; + from_bits = FPBits(StorageType(from_bits.uintval() + 1)); } } } - StorageType implicit_bit = int_val & (StorageType(1) << FPBits::FRACTION_LEN); - if (implicit_bit == StorageType(0)) + if (!from_bits.get_implicit_bit()) raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); - return cpp::bit_cast(int_val); + return from_bits.get_val(); } } // namespace fputil diff --git a/libc/src/__support/common.h b/libc/src/__support/common.h index 53951dc131c28..a153dfc363d73 100644 --- a/libc/src/__support/common.h +++ b/libc/src/__support/common.h @@ -25,7 +25,6 @@ #define LLVM_LIBC_FUNCTION_IMPL(type, name, arglist) \ LLVM_LIBC_FUNCTION_ATTR decltype(LIBC_NAMESPACE::name) \ __##name##_impl__ __asm__(#name); \ - decltype(LIBC_NAMESPACE::name) name [[gnu::alias(#name)]]; \ type __##name##_impl__ arglist #else #define LLVM_LIBC_FUNCTION_IMPL(type, name, arglist) type name arglist diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index 8aeb3d2cea03d..9655c993bee28 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -1167,7 +1167,7 @@ LIBC_INLINE StrToNumResult strtofloatingpoint(const char *__restrict src) { index = left_paren; } } - result = FPBits(result.build_quiet_nan(nan_mantissa, result.sign())); + result = FPBits(result.build_quiet_nan(result.sign(), nan_mantissa)); } } else if (tolower(src[index]) == 'i') { // INF if (tolower(src[index + 1]) == inf_string[1] && @@ -1215,7 +1215,7 @@ template LIBC_INLINE StrToNumResult strtonan(const char *arg) { nan_mantissa = static_cast(nan_mantissa_result); } - result = FPBits(result.build_quiet_nan(nan_mantissa)); + result = FPBits::build_quiet_nan(fputil::Sign::POS, nan_mantissa); return {T(result), 0, error}; } diff --git a/libc/src/math/generic/acosf.cpp b/libc/src/math/generic/acosf.cpp index 67832596a67fb..7b2a09101182e 100644 --- a/libc/src/math/generic/acosf.cpp +++ b/libc/src/math/generic/acosf.cpp @@ -85,7 +85,7 @@ LLVM_LIBC_FUNCTION(float, acosf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_quiet_nan(0); + return x + FPBits::build_quiet_nan().get_val(); } // When 0.5 < |x| < 1, we perform range reduction as follow: diff --git a/libc/src/math/generic/acoshf.cpp b/libc/src/math/generic/acoshf.cpp index b0b87095fbb07..a546cc2268b04 100644 --- a/libc/src/math/generic/acoshf.cpp +++ b/libc/src/math/generic/acoshf.cpp @@ -29,7 +29,7 @@ LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { // x < 1. 
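The NextAfterLongDouble hunk above ends by testing get_implicit_bit() rather than masking the raw word because the x86 extended format, unlike float and double, stores its leading significand bit explicitly; a bare +1/-1 on the bit pattern does not carry correctly across the normal/subnormal boundary, which is why that boundary is special-cased. A small dump of the layout, assuming an x86 target with a little-endian 80-bit long double:

#include <cfloat>
#include <cstdint>
#include <cstdio>
#include <cstring>

int main() {
#if LDBL_MANT_DIG == 64
  long double x = 1.0L;
  unsigned char raw[sizeof(long double)] = {};
  std::memcpy(raw, &x, 10); // the 80-bit format occupies the low 10 bytes
  std::uint64_t significand;                  // 64 bits, leading bit explicit
  std::memcpy(&significand, raw, 8);
  std::uint16_t sign_and_exp;                 // 1 sign bit + 15 exponent bits
  std::memcpy(&sign_and_exp, raw + 8, 2);
  // For 1.0L: biased exponent 0x3FFF, explicit leading bit set, fraction 0.
  std::printf("sign+exp = 0x%04X, significand = 0x%016llX\n",
              (unsigned)sign_and_exp, (unsigned long long)significand);
#else
  std::puts("long double is not the 80-bit x86 format on this target");
#endif
  return 0;
}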
fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits_t::build_quiet_nan(0); + return FPBits_t::build_quiet_nan().get_val(); } if (LIBC_UNLIKELY(x_u >= 0x4f8ffb03)) { diff --git a/libc/src/math/generic/asinf.cpp b/libc/src/math/generic/asinf.cpp index bc0d27c1eebc5..ee8e853063644 100644 --- a/libc/src/math/generic/asinf.cpp +++ b/libc/src/math/generic/asinf.cpp @@ -109,7 +109,7 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_nan(FPBits::FRACTION_MASK); + return x + FPBits::build_nan(Sign::POS, FPBits::FRACTION_MASK).get_val(); } // Check for exceptional values diff --git a/libc/src/math/generic/atanhf.cpp b/libc/src/math/generic/atanhf.cpp index fd6f5c96b6b4e..cd0acbf24e928 100644 --- a/libc/src/math/generic/atanhf.cpp +++ b/libc/src/math/generic/atanhf.cpp @@ -29,11 +29,11 @@ LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { if (x_abs == 0x3F80'0000U) { fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return FPBits::inf(sign); + return FPBits::inf(sign).get_val(); } else { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits::build_quiet_nan(0); + return FPBits::build_quiet_nan().get_val(); } } diff --git a/libc/src/math/generic/cosf.cpp b/libc/src/math/generic/cosf.cpp index 89333ab19e89f..132e72c0f65da 100644 --- a/libc/src/math/generic/cosf.cpp +++ b/libc/src/math/generic/cosf.cpp @@ -118,7 +118,7 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_quiet_nan(0); + return x + FPBits::build_quiet_nan().get_val(); } // Combine the results with the sine of sum formula: diff --git a/libc/src/math/generic/coshf.cpp b/libc/src/math/generic/coshf.cpp index 3b01852e9f544..a618056a64dc8 100644 --- a/libc/src/math/generic/coshf.cpp +++ b/libc/src/math/generic/coshf.cpp @@ -32,16 +32,16 @@ LLVM_LIBC_FUNCTION(float, coshf, (float x)) { } if (xbits.is_inf_or_nan()) - return x + FPBits::inf(); + return x + FPBits::inf().get_val(); int rounding = fputil::quick_get_round(); if (LIBC_UNLIKELY(rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); - return x + FPBits::inf(); + return x + FPBits::inf().get_val(); } // TODO: We should be able to reduce the latency and reciprocal throughput diff --git a/libc/src/math/generic/exp.cpp b/libc/src/math/generic/exp.cpp index a1b4d9a64f969..49ea1699bb209 100644 --- a/libc/src/math/generic/exp.cpp +++ b/libc/src/math/generic/exp.cpp @@ -205,7 +205,7 @@ double set_exceptional(double x) { return x; if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_denormal(); + return FPBits::min_subnormal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0; @@ -216,7 +216,7 @@ double set_exceptional(double x) { if (x_u < 0x7ff0'0000'0000'0000ULL) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); diff --git a/libc/src/math/generic/exp10.cpp b/libc/src/math/generic/exp10.cpp index 
e441f2c0edc7d..f1da03cba0b30 100644 --- a/libc/src/math/generic/exp10.cpp +++ b/libc/src/math/generic/exp10.cpp @@ -248,7 +248,7 @@ double set_exceptional(double x) { return x; if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_denormal(); + return FPBits::min_subnormal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0; @@ -262,7 +262,7 @@ double set_exceptional(double x) { if (x_u < 0x7ff0'0000'0000'0000ULL) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); diff --git a/libc/src/math/generic/exp10f_impl.h b/libc/src/math/generic/exp10f_impl.h index 2861659a6e57e..ff4c1c3aec67c 100644 --- a/libc/src/math/generic/exp10f_impl.h +++ b/libc/src/math/generic/exp10f_impl.h @@ -42,7 +42,7 @@ LIBC_INLINE float exp10f(float x) { if (xbits.is_nan()) return x; if (fputil::fenv_is_round_up()) - return FPBits::min_denormal(); + return FPBits::min_subnormal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0f; @@ -53,13 +53,13 @@ LIBC_INLINE float exp10f(float x) { if (x_u < 0x7f80'0000U) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); } // x is +inf or nan - return x + FPBits::inf(); + return x + FPBits::inf().get_val(); } } diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp index 70bc4870806a9..508bff9bd9fc9 100644 --- a/libc/src/math/generic/exp2.cpp +++ b/libc/src/math/generic/exp2.cpp @@ -223,7 +223,7 @@ double set_exceptional(double x) { return x; if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_denormal(); + return FPBits::min_subnormal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0; @@ -237,7 +237,7 @@ double set_exceptional(double x) { if (x_u < 0x7ff0'0000'0000'0000ULL) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); diff --git a/libc/src/math/generic/exp2f_impl.h b/libc/src/math/generic/exp2f_impl.h index 86360840b96e6..d2342e289fcba 100644 --- a/libc/src/math/generic/exp2f_impl.h +++ b/libc/src/math/generic/exp2f_impl.h @@ -76,13 +76,13 @@ LIBC_INLINE float exp2f(float x) { if (x_u < 0x7f80'0000U) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); } // x is +inf or nan - return x + FPBits::inf(); + return x + FPBits::inf().get_val(); } // x <= -150 if (x_u >= 0xc316'0000U) { @@ -93,7 +93,7 @@ LIBC_INLINE float exp2f(float x) { if (xbits.is_nan()) return x; if (fputil::fenv_is_round_up()) - return FPBits::min_denormal(); + return FPBits::min_subnormal().get_val(); if (x != 0.0f) { fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); diff --git a/libc/src/math/generic/expf.cpp 
b/libc/src/math/generic/expf.cpp index 88d408994fe42..f3ce8400d0a41 100644 --- a/libc/src/math/generic/expf.cpp +++ b/libc/src/math/generic/expf.cpp @@ -50,7 +50,7 @@ LLVM_LIBC_FUNCTION(float, expf, (float x)) { if (xbits.is_nan()) return x; if (fputil::fenv_is_round_up()) - return FPBits::min_denormal(); + return FPBits::min_subnormal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0f; @@ -61,7 +61,7 @@ LLVM_LIBC_FUNCTION(float, expf, (float x)) { if (xbits.uintval() < 0x7f80'0000U) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); diff --git a/libc/src/math/generic/expm1.cpp b/libc/src/math/generic/expm1.cpp index d9fccf98e8caa..c1fb80309d7b4 100644 --- a/libc/src/math/generic/expm1.cpp +++ b/libc/src/math/generic/expm1.cpp @@ -267,13 +267,13 @@ double set_exceptional(double x) { if (x_u < 0x7ff0'0000'0000'0000ULL) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); } // x is +inf or nan - return x + static_cast(FPBits::inf()); + return x + FPBits::inf().get_val(); } } // namespace diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp index c6e0663ec46c3..037e60021b296 100644 --- a/libc/src/math/generic/expm1f.cpp +++ b/libc/src/math/generic/expm1f.cpp @@ -68,12 +68,12 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) { if (xbits.uintval() < 0x7f80'0000U) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); } - return x + static_cast(FPBits::inf()); + return x + FPBits::inf().get_val(); } } } diff --git a/libc/src/math/generic/log.cpp b/libc/src/math/generic/log.cpp index 2db6b7f48fd09..b0f7e8c9afa54 100644 --- a/libc/src/math/generic/log.cpp +++ b/libc/src/math/generic/log.cpp @@ -732,28 +732,29 @@ double log_accurate(int e_x, int index, double m_x) { LLVM_LIBC_FUNCTION(double, log, (double x)) { using FPBits_t = typename fputil::FPBits; + using Sign = fputil::Sign; FPBits_t xbits(x); uint64_t x_u = xbits.uintval(); int x_e = -FPBits_t::EXP_BIAS; - if (LIBC_UNLIKELY(x_u == 0x3FF0'0000'0000'0000ULL)) { + if (LIBC_UNLIKELY(xbits == FPBits_t::one())) { // log(1.0) = +0.0 return 0.0; } - if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::MIN_NORMAL || - xbits.uintval() > FPBits_t::MAX_NORMAL)) { + if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::min_normal().uintval() || + xbits.uintval() > FPBits_t::max_normal().uintval())) { if (xbits.is_zero()) { // return -Inf and raise FE_DIVBYZERO. 
fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return static_cast(FPBits_t::inf(fputil::Sign::NEG)); + return FPBits_t::inf(Sign::NEG).get_val(); } if (xbits.is_neg() && !xbits.is_nan()) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits_t::build_quiet_nan(0); + return FPBits_t::build_quiet_nan().get_val(); } if (xbits.is_inf_or_nan()) { return x; diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp index 3a4d321fdb18c..55a3fc5c061e5 100644 --- a/libc/src/math/generic/log10.cpp +++ b/libc/src/math/generic/log10.cpp @@ -733,28 +733,29 @@ double log10_accurate(int e_x, int index, double m_x) { LLVM_LIBC_FUNCTION(double, log10, (double x)) { using FPBits_t = typename fputil::FPBits; + using Sign = fputil::Sign; FPBits_t xbits(x); uint64_t x_u = xbits.uintval(); int x_e = -FPBits_t::EXP_BIAS; - if (LIBC_UNLIKELY(x_u == 0x3FF0'0000'0000'0000ULL)) { + if (LIBC_UNLIKELY(xbits == FPBits_t::one())) { // log10(1.0) = +0.0 return 0.0; } - if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::MIN_NORMAL || - xbits.uintval() > FPBits_t::MAX_NORMAL)) { + if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::min_normal().uintval() || + xbits.uintval() > FPBits_t::max_normal().uintval())) { if (xbits.is_zero()) { // return -Inf and raise FE_DIVBYZERO. fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return static_cast(FPBits_t::inf(fputil::Sign::NEG)); + return FPBits_t::inf(Sign::NEG).get_val(); } if (xbits.is_neg() && !xbits.is_nan()) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits_t::build_quiet_nan(0); + return FPBits_t::build_quiet_nan().get_val(); } if (xbits.is_inf_or_nan()) { return x; diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp index 46505f4e07e67..f87e34ec5fb38 100644 --- a/libc/src/math/generic/log10f.cpp +++ b/libc/src/math/generic/log10f.cpp @@ -106,6 +106,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) { constexpr double LOG10_2 = 0x1.34413509f79ffp-2; using FPBits = typename fputil::FPBits; + using Sign = fputil::Sign; FPBits xbits(x); uint32_t x_u = xbits.uintval(); @@ -160,18 +161,19 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) { int m = -FPBits::EXP_BIAS; - if (LIBC_UNLIKELY(x_u < FPBits::MIN_NORMAL || x_u > FPBits::MAX_NORMAL)) { + if (LIBC_UNLIKELY(x_u < FPBits::min_normal().uintval() || + x_u > FPBits::max_normal().uintval())) { if (xbits.is_zero()) { // Return -inf and raise FE_DIVBYZERO fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return static_cast(FPBits::inf(fputil::Sign::NEG)); + return FPBits::inf(Sign::NEG).get_val(); } if (xbits.is_neg() && !xbits.is_nan()) { // Return NaN and raise FE_INVALID fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits::build_quiet_nan(0); + return FPBits::build_quiet_nan().get_val(); } if (xbits.is_inf_or_nan()) { return x; diff --git a/libc/src/math/generic/log1p.cpp b/libc/src/math/generic/log1p.cpp index 731fecae6f1b5..ae432620ba328 100644 --- a/libc/src/math/generic/log1p.cpp +++ b/libc/src/math/generic/log1p.cpp @@ -873,6 +873,7 @@ LIBC_INLINE double log1p_accurate(int e_x, int index, LLVM_LIBC_FUNCTION(double, log1p, (double x)) { using FPBits_t = typename fputil::FPBits; + using Sign = fputil::Sign; constexpr int EXP_BIAS = FPBits_t::EXP_BIAS; constexpr int FRACTION_LEN = FPBits_t::FRACTION_LEN; constexpr uint64_t 
FRACTION_MASK = FPBits_t::FRACTION_MASK; @@ -887,19 +888,19 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) { // |x| >= 1 if (LIBC_UNLIKELY(x_u >= 0x4650'0000'0000'0000ULL)) { // x >= 2^102 or x is negative, inf, or NaN - if (LIBC_UNLIKELY(x_u > FPBits_t::MAX_NORMAL)) { + if (LIBC_UNLIKELY(x_u > FPBits_t::max_normal().uintval())) { // x <= -1.0 or x is Inf or NaN if (x_u == 0xbff0'0000'0000'0000ULL) { // x = -1.0 fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return static_cast(FPBits_t::inf(fputil::Sign::NEG)); + return FPBits_t::inf(Sign::NEG).get_val(); } if (xbits.is_neg() && !xbits.is_nan()) { // x < -1.0 fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits_t::build_quiet_nan(0); + return FPBits_t::build_quiet_nan().get_val(); } // x is +Inf or NaN return x; diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp index 0812569c624b8..bc472caf54f89 100644 --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -43,11 +43,11 @@ LIBC_INLINE float log(double x) { uint64_t x_u = xbits.uintval(); - if (LIBC_UNLIKELY(x_u > FPBits::MAX_NORMAL)) { + if (LIBC_UNLIKELY(x_u > FPBits::max_normal().uintval())) { if (xbits.is_neg() && !xbits.is_nan()) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return fputil::FPBits::build_quiet_nan(0); + return fputil::FPBits::build_quiet_nan().get_val(); } return static_cast(x); } diff --git a/libc/src/math/generic/log2.cpp b/libc/src/math/generic/log2.cpp index 5b7fb65d77385..480c690a65037 100644 --- a/libc/src/math/generic/log2.cpp +++ b/libc/src/math/generic/log2.cpp @@ -854,28 +854,29 @@ double log2_accurate(int e_x, int index, double m_x) { LLVM_LIBC_FUNCTION(double, log2, (double x)) { using FPBits_t = typename fputil::FPBits; + using Sign = fputil::Sign; FPBits_t xbits(x); uint64_t x_u = xbits.uintval(); int x_e = -FPBits_t::EXP_BIAS; - if (LIBC_UNLIKELY(x_u == 0x3FF0'0000'0000'0000ULL)) { + if (LIBC_UNLIKELY(xbits == FPBits_t::one())) { // log2(1.0) = +0.0 return 0.0; } - if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::MIN_NORMAL || - xbits.uintval() > FPBits_t::MAX_NORMAL)) { + if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::min_normal().uintval() || + xbits.uintval() > FPBits_t::max_normal().uintval())) { if (xbits.is_zero()) { // return -Inf and raise FE_DIVBYZERO. fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return static_cast(FPBits_t::inf(fputil::Sign::NEG)); + return FPBits_t::inf(Sign::NEG).get_val(); } if (xbits.is_neg() && !xbits.is_nan()) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits_t::build_quiet_nan(0); + return FPBits_t::build_quiet_nan().get_val(); } if (xbits.is_inf_or_nan()) { return x; diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp index 9cddbb9e8ea48..7eaa5d53ccedd 100644 --- a/libc/src/math/generic/log2f.cpp +++ b/libc/src/math/generic/log2f.cpp @@ -55,6 +55,7 @@ namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, log2f, (float x)) { using FPBits = typename fputil::FPBits; + using Sign = fputil::Sign; FPBits xbits(x); uint32_t x_u = xbits.uintval(); @@ -68,16 +69,17 @@ LLVM_LIBC_FUNCTION(float, log2f, (float x)) { return 0.0f; // Exceptional inputs. 
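The same exceptional-input policy repeats across the log, log10, log2, log1p, and logf hunks: a zero argument is a pole, a negative argument is a domain error, and +inf or NaN passes through unchanged. A condensed sketch of the policy using the standard headers rather than FPBits (the real code routes errno and exceptions through set_errno_if_required and raise_except_if_required so either can be compiled out):

#include <cerrno>
#include <cfenv>
#include <limits>

// log(+-0) is a pole: -inf, FE_DIVBYZERO, ERANGE. log(x < 0) is a domain
// error: quiet NaN, FE_INVALID, EDOM. +inf and NaN are returned unchanged
// (x < 0.0 is false for NaN, so NaN falls through to the final return).
double log_exceptional(double x) {
  if (x == 0.0) {
    errno = ERANGE;
    std::feraiseexcept(FE_DIVBYZERO);
    return -std::numeric_limits<double>::infinity();
  }
  if (x < 0.0) {
    errno = EDOM;
    std::feraiseexcept(FE_INVALID);
    return std::numeric_limits<double>::quiet_NaN();
  }
  return x;
}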
- if (LIBC_UNLIKELY(x_u < FPBits::MIN_NORMAL || x_u > FPBits::MAX_NORMAL)) { + if (LIBC_UNLIKELY(x_u < FPBits::min_normal().uintval() || + x_u > FPBits::max_normal().uintval())) { if (xbits.is_zero()) { fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return static_cast(FPBits::inf(fputil::Sign::NEG)); + return FPBits::inf(Sign::NEG).get_val(); } if (xbits.is_neg() && !xbits.is_nan()) { fputil::set_errno_if_required(EDOM); fputil::raise_except(FE_INVALID); - return FPBits::build_quiet_nan(0); + return FPBits::build_quiet_nan().get_val(); } if (xbits.is_inf_or_nan()) { return x; diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp index 8ccb55dcc9e33..88f7ea01b2f19 100644 --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -54,6 +54,7 @@ namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(float, logf, (float x)) { constexpr double LOG_2 = 0x1.62e42fefa39efp-1; using FPBits = typename fputil::FPBits; + using Sign = fputil::Sign; FPBits xbits(x); uint32_t x_u = xbits.uintval(); @@ -79,7 +80,7 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) { #endif // LIBC_TARGET_CPU_HAS_FMA } // Subnormal inputs. - if (LIBC_UNLIKELY(x_u < FPBits::MIN_NORMAL)) { + if (LIBC_UNLIKELY(x_u < FPBits::min_normal().uintval())) { if (x_u == 0) { // Return -inf and raise FE_DIVBYZERO fputil::set_errno_if_required(ERANGE); @@ -112,18 +113,18 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) { #endif // LIBC_TARGET_CPU_HAS_FMA } // Exceptional inputs. - if (LIBC_UNLIKELY(x_u > FPBits::MAX_NORMAL)) { + if (LIBC_UNLIKELY(x_u > FPBits::max_normal().uintval())) { if (x_u == 0x8000'0000U) { // Return -inf and raise FE_DIVBYZERO fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return static_cast(FPBits::inf(fputil::Sign::NEG)); + return FPBits::inf(Sign::NEG).get_val(); } if (xbits.is_neg() && !xbits.is_nan()) { // Return NaN and raise FE_INVALID fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits::build_quiet_nan(0); + return FPBits::build_quiet_nan().get_val(); } // x is +inf or nan return x; diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp index 932f1d70c33e8..0e164ab8b4225 100644 --- a/libc/src/math/generic/powf.cpp +++ b/libc/src/math/generic/powf.cpp @@ -547,14 +547,15 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { // pow(+-0, -Inf) = +inf and raise FE_DIVBYZERO fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_DIVBYZERO); - return FloatBits::inf(); + return FloatBits::inf().get_val(); } // pow (|x| < 1, -inf) = +inf // pow (|x| < 1, +inf) = 0.0f // pow (|x| > 1, -inf) = 0.0f // pow (|x| > 1, +inf) = +inf - return ((x_abs < 0x3f80'0000) == (y_u == 0xff80'0000)) ? FloatBits::inf() - : 0.0f; + return ((x_abs < 0x3f80'0000) == (y_u == 0xff80'0000)) + ? FloatBits::inf().get_val() + : 0.0f; } default: // Speed up for common exponents @@ -617,7 +618,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { // pow(0, negative number) = inf fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_DIVBYZERO); - return FloatBits::inf(out_is_neg ? Sign::NEG : Sign::POS); + return FloatBits::inf(out_is_neg ? Sign::NEG : Sign::POS).get_val(); } // pow(0, positive number) = 0 return out_is_neg ? -0.0f : 0.0f; @@ -628,7 +629,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { if (y_u >= FloatBits::SIGN_MASK) { return out_is_neg ? -0.0f : 0.0f; } - return FloatBits::inf(out_is_neg ? 
Sign::NEG : Sign::POS); + return FloatBits::inf(out_is_neg ? Sign::NEG : Sign::POS).get_val(); } } @@ -656,7 +657,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { // pow( negative, non-integer ) = NaN fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FloatBits::build_quiet_nan(0); + return FloatBits::build_quiet_nan().get_val(); } } } diff --git a/libc/src/math/generic/sincosf.cpp b/libc/src/math/generic/sincosf.cpp index 44371db710871..f12b93a0e6965 100644 --- a/libc/src/math/generic/sincosf.cpp +++ b/libc/src/math/generic/sincosf.cpp @@ -148,7 +148,9 @@ LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - *sinp = x + FPBits::build_nan(FPBits::FRACTION_MASK); + *sinp = + x + + FPBits::build_nan(fputil::Sign::POS, FPBits::FRACTION_MASK).get_val(); *cosp = *sinp; return; } diff --git a/libc/src/math/generic/sinf.cpp b/libc/src/math/generic/sinf.cpp index 9e574d4e57240..7ba479f0a4598 100644 --- a/libc/src/math/generic/sinf.cpp +++ b/libc/src/math/generic/sinf.cpp @@ -139,7 +139,7 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_quiet_nan(0); + return x + FPBits::build_quiet_nan().get_val(); } // Combine the results with the sine of sum formula: diff --git a/libc/src/math/generic/sinhf.cpp b/libc/src/math/generic/sinhf.cpp index b3850c6742706..780c9a1f8d6ac 100644 --- a/libc/src/math/generic/sinhf.cpp +++ b/libc/src/math/generic/sinhf.cpp @@ -56,16 +56,16 @@ LLVM_LIBC_FUNCTION(float, sinhf, (float x)) { int rounding = fputil::quick_get_round(); if (xbits.is_neg()) { if (LIBC_UNLIKELY(rounding == FE_UPWARD || rounding == FE_TOWARDZERO)) - return -FPBits::max_normal(); + return -FPBits::max_normal().get_val(); } else { if (LIBC_UNLIKELY(rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); } fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); - return x + FPBits::inf(xbits.sign()); + return x + FPBits::inf(xbits.sign()).get_val(); } // sinh(x) = (e^x - e^(-x)) / 2. 
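The sinhf hunk above and the cosh/exp-family hunks before it share one overflow convention: when the exact result is finite but beyond the largest float, the directed rounding modes that point back toward zero must return the max-normal value of the result's sign, and every other mode overflows to infinity with ERANGE and FE_OVERFLOW. A standalone sketch (overflow_result is a hypothetical helper name):

#include <cerrno>
#include <cfenv>
#include <cfloat>
#include <cmath>

// Saturate to +/-FLT_MAX when the current rounding mode points back toward
// zero for the result's sign; otherwise overflow to infinity and flag it.
float overflow_result(bool negative) {
  int rounding = std::fegetround();
  if (negative) {
    if (rounding == FE_UPWARD || rounding == FE_TOWARDZERO)
      return -FLT_MAX;
  } else {
    if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
      return FLT_MAX;
  }
  errno = ERANGE;
  std::feraiseexcept(FE_OVERFLOW);
  return negative ? -HUGE_VALF : HUGE_VALF;
}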
diff --git a/libc/src/math/generic/tanf.cpp b/libc/src/math/generic/tanf.cpp index 7909e9e5d5568..09dd62eae03f8 100644 --- a/libc/src/math/generic/tanf.cpp +++ b/libc/src/math/generic/tanf.cpp @@ -114,7 +114,7 @@ LLVM_LIBC_FUNCTION(float, tanf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_quiet_nan(0); + return x + FPBits::build_quiet_nan().get_val(); } // Other large exceptional values if (auto r = TANF_EXCEPTS.lookup_odd(x_abs, x_sign); diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h index 26b8e3e60bdfd..9880fa51c5481 100644 --- a/libc/test/UnitTest/FPMatcher.h +++ b/libc/test/UnitTest/FPMatcher.h @@ -66,16 +66,16 @@ template struct FPTest : public Test { using Sign = LIBC_NAMESPACE::fputil::Sign; static constexpr StorageType STORAGE_MAX = LIBC_NAMESPACE::cpp::numeric_limits::max(); - static constexpr T zero = FPBits::zero(Sign::POS); - static constexpr T neg_zero = FPBits::zero(Sign::NEG); - static constexpr T aNaN = FPBits::build_quiet_nan(1); - static constexpr T sNaN = FPBits::build_nan(1); - static constexpr T inf = FPBits::inf(Sign::POS); - static constexpr T neg_inf = FPBits::inf(Sign::NEG); - static constexpr T min_normal = FPBits::min_normal(); - static constexpr T max_normal = FPBits::max_normal(); - static constexpr T min_denormal = FPBits::min_denormal(); - static constexpr T max_denormal = FPBits::max_denormal(); + static constexpr T zero = T(FPBits::zero(Sign::POS)); + static constexpr T neg_zero = T(FPBits::zero(Sign::NEG)); + static constexpr T aNaN = T(FPBits::build_quiet_nan(Sign::POS, 1)); + static constexpr T sNaN = T(FPBits::build_nan(Sign::POS, 1)); + static constexpr T inf = T(FPBits::inf(Sign::POS)); + static constexpr T neg_inf = T(FPBits::inf(Sign::NEG)); + static constexpr T min_normal = T(FPBits::min_normal()); + static constexpr T max_normal = T(FPBits::max_normal()); + static constexpr T min_denormal = T(FPBits::min_subnormal()); + static constexpr T max_denormal = T(FPBits::max_subnormal()); static constexpr int N_ROUNDING_MODES = 4; static constexpr fputil::testing::RoundingMode ROUNDING_MODES[4] = { @@ -95,16 +95,16 @@ template struct FPTest : public Test { using Sign = LIBC_NAMESPACE::fputil::Sign; \ static constexpr StorageType STORAGE_MAX = \ LIBC_NAMESPACE::cpp::numeric_limits::max(); \ - const T zero = FPBits::zero(Sign::POS); \ - const T neg_zero = FPBits::zero(Sign::NEG); \ - const T aNaN = FPBits::build_quiet_nan(1); \ - const T sNaN = FPBits::build_nan(1); \ - const T inf = FPBits::inf(Sign::POS); \ - const T neg_inf = FPBits::inf(Sign::NEG); \ - const T min_normal = FPBits::min_normal(); \ - const T max_normal = FPBits::max_normal(); \ - const T min_denormal = FPBits::min_denormal(); \ - const T max_denormal = FPBits::max_denormal(); + const T zero = T(FPBits::zero(Sign::POS)); \ + const T neg_zero = T(FPBits::zero(Sign::NEG)); \ + const T aNaN = T(FPBits::build_quiet_nan(Sign::POS, 1)); \ + const T sNaN = T(FPBits::build_nan(Sign::POS, 1)); \ + const T inf = T(FPBits::inf(Sign::POS)); \ + const T neg_inf = T(FPBits::inf(Sign::NEG)); \ + const T min_normal = T(FPBits::min_normal()); \ + const T max_normal = T(FPBits::max_normal()); \ + const T min_denormal = T(FPBits::min_subnormal()); \ + const T max_denormal = T(FPBits::max_subnormal()); #define EXPECT_FP_EQ(expected, actual) \ EXPECT_THAT(actual, LIBC_NAMESPACE::testing::getMatcher< \ diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp 
b/libc/test/src/__support/FPUtil/fpbits_test.cpp index e6b6d7d9ec780..db9f6750bedef 100644 --- a/libc/test/src/__support/FPUtil/fpbits_test.cpp +++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp @@ -232,13 +232,11 @@ TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80_IsNan) { TEST(LlvmLibcFPBitsTest, FloatType) { using FloatBits = FPBits; - EXPECT_STREQ( - LIBC_NAMESPACE::str(FloatBits(FloatBits::inf(Sign::POS))).c_str(), - "(+Infinity)"); - EXPECT_STREQ( - LIBC_NAMESPACE::str(FloatBits(FloatBits::inf(Sign::NEG))).c_str(), - "(-Infinity)"); - EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits(FloatBits::build_nan(1))).c_str(), + EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits::inf(Sign::POS)).c_str(), + "(+Infinity)"); + EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits::inf(Sign::NEG)).c_str(), + "(-Infinity)"); + EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits::build_nan(Sign::POS, 1)).c_str(), "(NaN)"); FloatBits zero(0.0f); @@ -289,22 +287,19 @@ TEST(LlvmLibcFPBitsTest, FloatType) { EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(), "0xBF900000 = (S: 1, E: 0x007F, M: 0x00100000)"); - FloatBits quiet_nan = FloatBits(FloatBits::build_quiet_nan(1)); + FloatBits quiet_nan = FloatBits::build_quiet_nan(Sign::POS, 1); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } TEST(LlvmLibcFPBitsTest, DoubleType) { using DoubleBits = FPBits; - EXPECT_STREQ( - LIBC_NAMESPACE::str(DoubleBits(DoubleBits::inf(Sign::POS))).c_str(), - "(+Infinity)"); - EXPECT_STREQ( - LIBC_NAMESPACE::str(DoubleBits(DoubleBits::inf(Sign::NEG))).c_str(), - "(-Infinity)"); - EXPECT_STREQ( - LIBC_NAMESPACE::str(DoubleBits(DoubleBits::build_nan(1))).c_str(), - "(NaN)"); + EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::inf(Sign::POS)).c_str(), + "(+Infinity)"); + EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::inf(Sign::NEG)).c_str(), + "(-Infinity)"); + EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::build_nan(Sign::POS, 1)).c_str(), + "(NaN)"); DoubleBits zero(0.0); EXPECT_TRUE(zero.is_pos()); @@ -354,7 +349,7 @@ TEST(LlvmLibcFPBitsTest, DoubleType) { EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(), "0xBFF2000000000000 = (S: 1, E: 0x03FF, M: 0x0002000000000000)"); - DoubleBits quiet_nan = DoubleBits(DoubleBits::build_quiet_nan(1)); + DoubleBits quiet_nan = DoubleBits::build_quiet_nan(Sign::POS, 1); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } @@ -365,16 +360,12 @@ TEST(LlvmLibcFPBitsTest, X86LongDoubleType) { if constexpr (sizeof(long double) == sizeof(double)) return; // The tests for the "double" type cover for this case. 
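The string expectations in these test hunks decompose each value into its sign, biased exponent, and mantissa fields. A sketch of that decomposition for binary32, reproducing the FloatType negnum expectation above (assuming negnum is -1.125f, which is what 0xBF900000 encodes):

#include <bit>
#include <cstdint>
#include <cstdio>

int main() {
  // Split a binary32 value into a 1-bit sign, 8-bit biased exponent, and
  // 23-bit mantissa, mirroring the "(S: ..., E: ..., M: ...)" test strings.
  std::uint32_t bits = std::bit_cast<std::uint32_t>(-1.125f);
  std::uint32_t sign = bits >> 31;
  std::uint32_t biased_exp = (bits >> 23) & 0xFF;
  std::uint32_t mantissa = bits & 0x7FFFFF;
  // Prints: 0xBF900000 = (S: 1, E: 0x007F, M: 0x00100000)
  std::printf("0x%08X = (S: %u, E: 0x%04X, M: 0x%08X)\n", bits, sign,
              biased_exp, mantissa);
  return 0;
}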
+ EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::POS)).c_str(), + "(+Infinity)"); + EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::NEG)).c_str(), + "(-Infinity)"); EXPECT_STREQ( - LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::POS))) - .c_str(), - "(+Infinity)"); - EXPECT_STREQ( - LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::NEG))) - .c_str(), - "(-Infinity)"); - EXPECT_STREQ( - LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::build_nan(1))).c_str(), + LIBC_NAMESPACE::str(LongDoubleBits::build_nan(Sign::POS, 1)).c_str(), "(NaN)"); LongDoubleBits zero(0.0l); @@ -440,7 +431,7 @@ TEST(LlvmLibcFPBitsTest, X86LongDoubleType) { "0x000000000000BFFF9000000000000000 = " "(S: 1, E: 0x3FFF, I: 1, M: 0x00000000000000001000000000000000)"); - LongDoubleBits quiet_nan = LongDoubleBits(LongDoubleBits::build_quiet_nan(1)); + LongDoubleBits quiet_nan = LongDoubleBits::build_quiet_nan(Sign::POS, 1); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } #else @@ -450,16 +441,12 @@ TEST(LlvmLibcFPBitsTest, LongDoubleType) { #else using LongDoubleBits = FPBits; + EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::POS)).c_str(), + "(+Infinity)"); + EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::NEG)).c_str(), + "(-Infinity)"); EXPECT_STREQ( - LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::POS))) - .c_str(), - "(+Infinity)"); - EXPECT_STREQ( - LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::NEG))) - .c_str(), - "(-Infinity)"); - EXPECT_STREQ( - LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::build_nan(1))).c_str(), + LIBC_NAMESPACE::str(LongDoubleBits::build_nan(Sign::POS, 1)).c_str(), "(NaN)"); LongDoubleBits zero(0.0l); @@ -519,7 +506,7 @@ TEST(LlvmLibcFPBitsTest, LongDoubleType) { "0xBFFF2000000000000000000000000000 = " "(S: 1, E: 0x3FFF, M: 0x00002000000000000000000000000000)"); - LongDoubleBits quiet_nan = LongDoubleBits(LongDoubleBits::build_quiet_nan(1)); + LongDoubleBits quiet_nan = LongDoubleBits::build_quiet_nan(1); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); #endif } @@ -529,17 +516,15 @@ TEST(LlvmLibcFPBitsTest, LongDoubleType) { TEST(LlvmLibcFPBitsTest, Float128Type) { using Float128Bits = FPBits; + EXPECT_STREQ(LIBC_NAMESPACE::str(Float128Bits::inf(Sign::POS)).c_str(), + "(+Infinity)"); + EXPECT_STREQ(LIBC_NAMESPACE::str(Float128Bits::inf(Sign::NEG)).c_str(), + "(-Infinity)"); EXPECT_STREQ( - LIBC_NAMESPACE::str(Float128Bits(Float128Bits::inf(Sign::POS))).c_str(), - "(+Infinity)"); - EXPECT_STREQ( - LIBC_NAMESPACE::str(Float128Bits(Float128Bits::inf(Sign::NEG))).c_str(), - "(-Infinity)"); - EXPECT_STREQ( - LIBC_NAMESPACE::str(Float128Bits(Float128Bits::build_nan(1))).c_str(), + LIBC_NAMESPACE::str(Float128Bits::build_nan(Sign::POS, 1)).c_str(), "(NaN)"); - Float128Bits zero(Float128Bits::zero()); + Float128Bits zero = Float128Bits::zero(Sign::POS); EXPECT_TRUE(zero.is_pos()); EXPECT_EQ(zero.get_biased_exponent(), static_cast(0x0000)); EXPECT_EQ(zero.get_mantissa(), static_cast(0x0000000000000000) @@ -549,7 +534,7 @@ TEST(LlvmLibcFPBitsTest, Float128Type) { "0x00000000000000000000000000000000 = " "(S: 0, E: 0x0000, M: 0x00000000000000000000000000000000)"); - Float128Bits negzero(Float128Bits::zero(Sign::NEG)); + Float128Bits negzero = Float128Bits::zero(Sign::NEG); EXPECT_TRUE(negzero.is_neg()); EXPECT_EQ(negzero.get_biased_exponent(), static_cast(0x0000)); EXPECT_EQ(negzero.get_mantissa(), static_cast(0x0000000000000000) @@ -596,7 +581,7 @@ TEST(LlvmLibcFPBitsTest, Float128Type) { 
"0xBFFF2000000000000000000000000000 = " "(S: 1, E: 0x3FFF, M: 0x00002000000000000000000000000000)"); - Float128Bits quiet_nan = Float128Bits(Float128Bits::build_quiet_nan(1)); + Float128Bits quiet_nan = Float128Bits::build_quiet_nan(Sign::POS, 1); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } #endif // LIBC_COMPILER_HAS_FLOAT128 diff --git a/libc/test/src/math/FDimTest.h b/libc/test/src/math/FDimTest.h index 0744e6ea8fd8f..c3d9cb1801cd4 100644 --- a/libc/test/src/math/FDimTest.h +++ b/libc/test/src/math/FDimTest.h @@ -24,7 +24,7 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); void test_na_n_arg(FuncPtr func) { EXPECT_FP_EQ(nan, func(nan, inf)); diff --git a/libc/test/src/math/FmaTest.h b/libc/test/src/math/FmaTest.h index 032a79821d590..4343b38053dc4 100644 --- a/libc/test/src/math/FmaTest.h +++ b/libc/test/src/math/FmaTest.h @@ -25,11 +25,21 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; + const T min_subnormal = T(FPBits::min_subnormal(Sign::POS)); + const T min_normal = T(FPBits::min_normal(Sign::POS)); + const T max_normal = T(FPBits::max_normal(Sign::POS)); const T inf = T(FPBits::inf(Sign::POS)); const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + + static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); + static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); + static constexpr StorageType MAX_SUBNORMAL = + FPBits::max_subnormal().uintval(); + static constexpr StorageType MIN_SUBNORMAL = + FPBits::min_subnormal().uintval(); StorageType get_random_bit_pattern() { StorageType bits{0}; @@ -52,14 +62,13 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { EXPECT_FP_EQ(func(inf, neg_inf, nan), nan); // Test underflow rounding up. - EXPECT_FP_EQ(func(T(0.5), FPBits::min_denormal(), FPBits::min_denormal()), + EXPECT_FP_EQ(func(T(0.5), min_subnormal, min_subnormal), T(FPBits(StorageType(2)))); // Test underflow rounding down. - T v = T(FPBits(FPBits::MIN_NORMAL + StorageType(1))); - EXPECT_FP_EQ( - func(T(1) / T(FPBits::MIN_NORMAL << 1), v, FPBits::min_normal()), v); + T v = T(FPBits(MIN_NORMAL + StorageType(1))); + EXPECT_FP_EQ(func(T(1) / T(MIN_NORMAL << 1), v, min_normal), v); // Test overflow. - T z = FPBits::max_normal(); + T z = max_normal; EXPECT_FP_EQ(func(T(1.75), z, -z), T(0.75) * z); // Exact cancellation. 
EXPECT_FP_EQ(func(T(3.0), T(5.0), -T(15.0)), T(0.0)); @@ -68,11 +77,9 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { void test_subnormal_range(Func func) { constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = - (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT; - for (StorageType v = FPBits::MIN_SUBNORMAL, w = FPBits::MAX_SUBNORMAL; - v <= FPBits::MAX_SUBNORMAL && w >= FPBits::MIN_SUBNORMAL; - v += STEP, w -= STEP) { + constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; + for (StorageType v = MIN_SUBNORMAL, w = MAX_SUBNORMAL; + v <= MAX_SUBNORMAL && w >= MIN_SUBNORMAL; v += STEP, w -= STEP) { T x = T(FPBits(get_random_bit_pattern())), y = T(FPBits(v)), z = T(FPBits(w)); mpfr::TernaryInput input{x, y, z}; @@ -83,11 +90,9 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { void test_normal_range(Func func) { constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = - (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT; - for (StorageType v = FPBits::MIN_NORMAL, w = FPBits::MAX_NORMAL; - v <= FPBits::MAX_NORMAL && w >= FPBits::MIN_NORMAL; - v += STEP, w -= STEP) { + constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; + for (StorageType v = MIN_NORMAL, w = MAX_NORMAL; + v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) { T x = T(FPBits(v)), y = T(FPBits(w)), z = T(FPBits(get_random_bit_pattern())); mpfr::TernaryInput input{x, y, z}; diff --git a/libc/test/src/math/HypotTest.h b/libc/test/src/math/HypotTest.h index b7eb63c192c70..0b85f68fda82a 100644 --- a/libc/test/src/math/HypotTest.h +++ b/libc/test/src/math/HypotTest.h @@ -25,15 +25,22 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using Sign = LIBC_NAMESPACE::fputil::Sign; using StorageType = typename FPBits::StorageType; - const T nan = FPBits::build_quiet_nan(1); - const T inf = FPBits::inf(); - const T neg_inf = FPBits::inf(Sign::NEG); - const T zero = FPBits::zero(); - const T neg_zero = FPBits::zero(Sign::NEG); - const T max_normal = FPBits::max_normal(); - const T min_normal = FPBits::min_normal(); - const T max_subnormal = FPBits::max_denormal(); - const T min_subnormal = FPBits::min_denormal(); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T inf = T(FPBits::inf()); + const T neg_inf = T(FPBits::inf(Sign::NEG)); + const T zero = T(FPBits::zero()); + const T neg_zero = T(FPBits::zero(Sign::NEG)); + const T max_normal = T(FPBits::max_normal()); + const T min_normal = T(FPBits::min_normal()); + const T max_subnormal = T(FPBits::max_subnormal()); + const T min_subnormal = T(FPBits::min_subnormal()); + + static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); + static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); + static constexpr StorageType MAX_SUBNORMAL = + FPBits::max_subnormal().uintval(); + static constexpr StorageType MIN_SUBNORMAL = + FPBits::min_subnormal().uintval(); public: void test_special_numbers(Func func) { @@ -62,12 +69,11 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test { void test_subnormal_range(Func func) { constexpr StorageType COUNT = 10'001; for (unsigned scale = 0; scale < 4; ++scale) { - StorageType max_value = FPBits::MAX_SUBNORMAL << scale; - StorageType step = (max_value - FPBits::MIN_SUBNORMAL) / COUNT; + StorageType max_value = MAX_SUBNORMAL << scale; + StorageType step = (max_value - MIN_SUBNORMAL) / COUNT; for (int signs = 0; signs < 4; ++signs) { - for 
(StorageType v = FPBits::MIN_SUBNORMAL, w = max_value; - v <= max_value && w >= FPBits::MIN_SUBNORMAL; - v += step, w -= step) { + for (StorageType v = MIN_SUBNORMAL, w = max_value; + v <= max_value && w >= MIN_SUBNORMAL; v += step, w -= step) { T x = T(FPBits(v)), y = T(FPBits(w)); if (signs % 2 == 1) { x = -x; @@ -86,13 +92,10 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test { void test_normal_range(Func func) { constexpr StorageType COUNT = 10'001; - constexpr StorageType STEP = - (StorageType(FPBits::MAX_NORMAL) - StorageType(FPBits::MIN_NORMAL)) / - COUNT; + constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; for (int signs = 0; signs < 4; ++signs) { - for (StorageType v = FPBits::MIN_NORMAL, w = FPBits::MAX_NORMAL; - v <= FPBits::MAX_NORMAL && w >= FPBits::MIN_NORMAL; - v += STEP, w -= STEP) { + for (StorageType v = MIN_NORMAL, w = MAX_NORMAL; + v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) { T x = T(FPBits(v)), y = T(FPBits(w)); if (signs % 2 == 1) { x = -x; diff --git a/libc/test/src/math/ILogbTest.h b/libc/test/src/math/ILogbTest.h index 9fa25c9ff9861..223de789999af 100644 --- a/libc/test/src/math/ILogbTest.h +++ b/libc/test/src/math/ILogbTest.h @@ -26,10 +26,10 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { void test_special_numbers(typename ILogbFunc::Func func) { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using Sign = LIBC_NAMESPACE::fputil::Sign; - EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero()))); + EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::POS)))); EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::NEG)))); - EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(1)))); - EXPECT_EQ(INT_MAX, func(T(FPBits::inf()))); + EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(Sign::POS, 1)))); + EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::POS)))); EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::NEG)))); } @@ -76,11 +76,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { void test_subnormal_range(typename ILogbFunc::Func func) { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; + constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval(); + constexpr StorageType MAX_SUBNORMAL = FPBits::max_subnormal().uintval(); constexpr StorageType COUNT = 10'001; - constexpr StorageType STEP = - (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT; - for (StorageType v = FPBits::MIN_SUBNORMAL; v <= FPBits::MAX_SUBNORMAL; - v += STEP) { + constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; + for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) { T x = T(FPBits(v)); if (isnan(x) || isinf(x) || x == 0.0) continue; @@ -95,11 +95,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { void test_normal_range(typename ILogbFunc::Func func) { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; + constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); + constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); constexpr StorageType COUNT = 10'001; - constexpr StorageType STEP = - (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT; - for (StorageType v = FPBits::MIN_NORMAL; v <= FPBits::MAX_NORMAL; - v += STEP) { + constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; + for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) { T x = T(FPBits(v)); if (isnan(x) || isinf(x) || x == 0.0) continue; diff --git a/libc/test/src/math/LdExpTest.h 
b/libc/test/src/math/LdExpTest.h index 25120ba3646fd..3a4baabbf10e6 100644 --- a/libc/test/src/math/LdExpTest.h +++ b/libc/test/src/math/LdExpTest.h @@ -29,7 +29,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); // A normalized mantissa to be used with tests. static constexpr StorageType MANTISSA = NormalFloat::ONE + 0x1234; diff --git a/libc/test/src/math/NextAfterTest.h b/libc/test/src/math/NextAfterTest.h index aa9646fd921f8..9ff3bf73d2ede 100644 --- a/libc/test/src/math/NextAfterTest.h +++ b/libc/test/src/math/NextAfterTest.h @@ -27,12 +27,12 @@ class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); - const StorageType min_subnormal = FPBits::MIN_SUBNORMAL; - const StorageType max_subnormal = FPBits::MAX_SUBNORMAL; - const StorageType min_normal = FPBits::MIN_NORMAL; - const StorageType max_normal = FPBits::MAX_NORMAL; + const StorageType min_subnormal = FPBits::min_subnormal().uintval(); + const StorageType max_subnormal = FPBits::max_subnormal().uintval(); + const StorageType min_normal = FPBits::min_normal().uintval(); + const StorageType max_normal = FPBits::max_normal().uintval(); public: typedef T (*NextAfterFunc)(T, T); diff --git a/libc/test/src/math/RIntTest.h b/libc/test/src/math/RIntTest.h index 6816e94d389f4..b478e3f65dbc8 100644 --- a/libc/test/src/math/RIntTest.h +++ b/libc/test/src/math/RIntTest.h @@ -38,7 +38,14 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + + static constexpr StorageType MIN_SUBNORMAL = + FPBits::min_subnormal().uintval(); + static constexpr StorageType MAX_SUBNORMAL = + FPBits::max_subnormal().uintval(); + static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); + static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); static inline mpfr::RoundingMode to_mpfr_rounding_mode(int mode) { switch (mode) { @@ -95,10 +102,8 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { void testSubnormalRange(RIntFunc func) { constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = - (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT; - for (StorageType i = FPBits::MIN_SUBNORMAL; i <= FPBits::MAX_SUBNORMAL; - i += STEP) { + constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; + for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) { T x = T(FPBits(i)); for (int mode : ROUNDING_MODES) { LIBC_NAMESPACE::fputil::set_round(mode); @@ -110,10 +115,8 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { void testNormalRange(RIntFunc func) { constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = - (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT; - for (StorageType i = FPBits::MIN_NORMAL; i <= FPBits::MAX_NORMAL; - i += STEP) { + constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / 
COUNT; + for (StorageType i = MIN_NORMAL; i <= MAX_NORMAL; i += STEP) { T x = T(FPBits(i)); // In normal range on x86 platforms, the long double implicit 1 bit can be // zero making the numbers NaN. We will skip them. diff --git a/libc/test/src/math/RemQuoTest.h b/libc/test/src/math/RemQuoTest.h index 9ed9525662452..0ee41f4bf9acf 100644 --- a/libc/test/src/math/RemQuoTest.h +++ b/libc/test/src/math/RemQuoTest.h @@ -28,7 +28,14 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + + static constexpr StorageType MIN_SUBNORMAL = + FPBits::min_subnormal().uintval(); + static constexpr StorageType MAX_SUBNORMAL = + FPBits::max_subnormal().uintval(); + static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); + static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); public: typedef T (*RemQuoFunc)(T, T, int *); @@ -97,11 +104,9 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test { void testSubnormalRange(RemQuoFunc func) { constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = - (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT; - for (StorageType v = FPBits::MIN_SUBNORMAL, w = FPBits::MAX_SUBNORMAL; - v <= FPBits::MAX_SUBNORMAL && w >= FPBits::MIN_SUBNORMAL; - v += STEP, w -= STEP) { + constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; + for (StorageType v = MIN_SUBNORMAL, w = MAX_SUBNORMAL; + v <= MAX_SUBNORMAL && w >= MIN_SUBNORMAL; v += STEP, w -= STEP) { T x = T(FPBits(v)), y = T(FPBits(w)); mpfr::BinaryOutput result; mpfr::BinaryInput input{x, y}; @@ -112,11 +117,9 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test { void testNormalRange(RemQuoFunc func) { constexpr StorageType COUNT = 1'001; - constexpr StorageType STEP = - (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT; - for (StorageType v = FPBits::MIN_NORMAL, w = FPBits::MAX_NORMAL; - v <= FPBits::MAX_NORMAL && w >= FPBits::MIN_NORMAL; - v += STEP, w -= STEP) { + constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; + for (StorageType v = MIN_NORMAL, w = MAX_NORMAL; + v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) { T x = T(FPBits(v)), y = T(FPBits(w)); mpfr::BinaryOutput result; mpfr::BinaryInput input{x, y}; diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h index e8ada1b4c36c5..6866c23cb99ca 100644 --- a/libc/test/src/math/RoundToIntegerTest.h +++ b/libc/test/src/math/RoundToIntegerTest.h @@ -37,7 +37,15 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { const F neg_zero = F(FPBits::zero(Sign::NEG)); const F inf = F(FPBits::inf()); const F neg_inf = F(FPBits::inf(Sign::NEG)); - const F nan = F(FPBits::build_quiet_nan(1)); + const F nan = F(FPBits::build_quiet_nan(Sign::POS, 1)); + + static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); + static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); + static constexpr StorageType MAX_SUBNORMAL = + FPBits::max_subnormal().uintval(); + static constexpr StorageType MIN_SUBNORMAL = + FPBits::min_subnormal().uintval(); + static constexpr I INTEGER_MIN = I(1) << (sizeof(I) * 8 - 1); static constexpr I INTEGER_MAX = -(INTEGER_MIN + 1); @@ -215,10 +223,8 @@ class RoundToIntegerTestTemplate : public 
LIBC_NAMESPACE::testing::Test { void testSubnormalRange(RoundToIntegerFunc func) { constexpr StorageType COUNT = 1'000'001; - constexpr StorageType STEP = - (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT; - for (StorageType i = FPBits::MIN_SUBNORMAL; i <= FPBits::MAX_SUBNORMAL; - i += STEP) { + constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; + for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) { F x = F(FPBits(i)); if (x == F(0.0)) continue; @@ -259,10 +265,8 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { return; constexpr StorageType COUNT = 1'000'001; - constexpr StorageType STEP = - (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT; - for (StorageType i = FPBits::MIN_NORMAL; i <= FPBits::MAX_NORMAL; - i += STEP) { + constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; + for (StorageType i = MIN_NORMAL; i <= MAX_NORMAL; i += STEP) { F x = F(FPBits(i)); // In normal range on x86 platforms, the long double implicit 1 bit can be // zero making the numbers NaN. We will skip them. diff --git a/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h b/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h index ada7c9e495426..48572e78e5153 100644 --- a/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h +++ b/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h @@ -109,10 +109,13 @@ template class BinaryOpSingleOutputDiff { std::ofstream log(logFile); log << " Performance tests with inputs in denormal range:\n"; run_perf_in_range(myFunc, otherFunc, /* startingBit= */ StorageType(0), - /* endingBit= */ FPBits::MAX_SUBNORMAL, 1'000'001, log); + /* endingBit= */ FPBits::max_subnormal().uintval(), + 1'000'001, log); log << "\n Performance tests with inputs in normal range:\n"; - run_perf_in_range(myFunc, otherFunc, /* startingBit= */ FPBits::MIN_NORMAL, - /* endingBit= */ FPBits::MAX_NORMAL, 100'000'001, log); + run_perf_in_range(myFunc, otherFunc, + /* startingBit= */ FPBits::min_normal().uintval(), + /* endingBit= */ FPBits::max_normal().uintval(), + 100'000'001, log); log << "\n Performance tests with inputs in normal range with exponents " "close to each other:\n"; run_perf_in_range( @@ -126,11 +129,12 @@ template class BinaryOpSingleOutputDiff { log << " Diff tests with inputs in denormal range:\n"; diffCount += run_diff_in_range( myFunc, otherFunc, /* startingBit= */ StorageType(0), - /* endingBit= */ FPBits::MAX_SUBNORMAL, 1'000'001, log); + /* endingBit= */ FPBits::max_subnormal().uintval(), 1'000'001, log); log << "\n Diff tests with inputs in normal range:\n"; diffCount += run_diff_in_range( - myFunc, otherFunc, /* startingBit= */ FPBits::MIN_NORMAL, - /* endingBit= */ FPBits::MAX_NORMAL, 100'000'001, log); + myFunc, otherFunc, + /* startingBit= */ FPBits::min_normal().uintval(), + /* endingBit= */ FPBits::max_normal().uintval(), 100'000'001, log); log << "\n Diff tests with inputs in normal range with exponents " "close to each other:\n"; diffCount += run_diff_in_range( diff --git a/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h b/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h index e4cd06eb22b71..5e8310e889dc6 100644 --- a/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h +++ b/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h @@ -93,10 +93,11 @@ template class SingleInputSingleOutputDiff { std::ofstream log(logFile); log << " Performance tests with inputs in 
denormal range:\n"; runPerfInRange(myFunc, otherFunc, /* startingBit= */ StorageType(0), - /* endingBit= */ FPBits::MAX_SUBNORMAL, log); + /* endingBit= */ FPBits::max_subnormal().uintval(), log); log << "\n Performance tests with inputs in normal range:\n"; - runPerfInRange(myFunc, otherFunc, /* startingBit= */ FPBits::MIN_NORMAL, - /* endingBit= */ FPBits::MAX_NORMAL, log); + runPerfInRange(myFunc, otherFunc, + /* startingBit= */ FPBits::min_normal().uintval(), + /* endingBit= */ FPBits::max_normal().uintval(), log); } }; diff --git a/libc/test/src/math/smoke/FDimTest.h b/libc/test/src/math/smoke/FDimTest.h index 0744e6ea8fd8f..c3d9cb1801cd4 100644 --- a/libc/test/src/math/smoke/FDimTest.h +++ b/libc/test/src/math/smoke/FDimTest.h @@ -24,7 +24,7 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); void test_na_n_arg(FuncPtr func) { EXPECT_FP_EQ(nan, func(nan, inf)); diff --git a/libc/test/src/math/smoke/FmaTest.h b/libc/test/src/math/smoke/FmaTest.h index dc624d871b990..337ce659a23e1 100644 --- a/libc/test/src/math/smoke/FmaTest.h +++ b/libc/test/src/math/smoke/FmaTest.h @@ -25,7 +25,7 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); public: void test_special_numbers(Func func) { @@ -39,14 +39,16 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { EXPECT_FP_EQ(func(inf, neg_inf, nan), nan); // Test underflow rounding up. - EXPECT_FP_EQ(func(T(0.5), FPBits::min_denormal(), FPBits::min_denormal()), - T(FPBits(StorageType(2)))); - // Test underflow rounding down. - T v = T(FPBits(FPBits::MIN_NORMAL + StorageType(1))); EXPECT_FP_EQ( - func(T(1) / T(FPBits::MIN_NORMAL << 1), v, FPBits::min_normal()), v); + func(T(0.5), T(FPBits::min_subnormal()), T(FPBits::min_subnormal())), + T(FPBits(StorageType(2)))); + // Test underflow rounding down. + StorageType MIN_NORMAL = FPBits::min_normal().uintval(); + T v = T(FPBits(MIN_NORMAL + StorageType(1))); + EXPECT_FP_EQ(func(T(1) / T(MIN_NORMAL << 1), v, T(FPBits::min_normal())), + v); // Test overflow. - T z = FPBits::max_normal(); + T z = T(FPBits::max_normal()); EXPECT_FP_EQ(func(T(1.75), z, -z), T(0.75) * z); // Exact cancellation. 
EXPECT_FP_EQ(func(T(3.0), T(5.0), -T(15.0)), T(0.0)); diff --git a/libc/test/src/math/smoke/HypotTest.h b/libc/test/src/math/smoke/HypotTest.h index 77454c19a6538..67110536b9623 100644 --- a/libc/test/src/math/smoke/HypotTest.h +++ b/libc/test/src/math/smoke/HypotTest.h @@ -22,16 +22,16 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T nan = FPBits::build_quiet_nan(1); - const T inf = FPBits::inf(Sign::POS); - const T neg_inf = FPBits::inf(Sign::NEG); - const T zero = FPBits::zero(Sign::POS); - const T neg_zero = FPBits::zero(Sign::NEG); - - const T max_normal = FPBits::max_normal(); - const T min_normal = FPBits::min_normal(); - const T max_subnormal = FPBits::max_denormal(); - const T min_subnormal = FPBits::min_denormal(); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T inf = T(FPBits::inf(Sign::POS)); + const T neg_inf = T(FPBits::inf(Sign::NEG)); + const T zero = T(FPBits::zero(Sign::POS)); + const T neg_zero = T(FPBits::zero(Sign::NEG)); + + const T max_normal = T(FPBits::max_normal()); + const T min_normal = T(FPBits::min_normal()); + const T max_subnormal = T(FPBits::max_subnormal()); + const T min_subnormal = T(FPBits::min_subnormal()); public: void test_special_numbers(Func func) { diff --git a/libc/test/src/math/smoke/ILogbTest.h b/libc/test/src/math/smoke/ILogbTest.h index 0a50abc04f727..223de789999af 100644 --- a/libc/test/src/math/smoke/ILogbTest.h +++ b/libc/test/src/math/smoke/ILogbTest.h @@ -28,7 +28,7 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { using Sign = LIBC_NAMESPACE::fputil::Sign; EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::POS)))); EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::NEG)))); - EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(1)))); + EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(Sign::POS, 1)))); EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::POS)))); EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::NEG)))); } @@ -76,11 +76,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { void test_subnormal_range(typename ILogbFunc::Func func) { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; + constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval(); + constexpr StorageType MAX_SUBNORMAL = FPBits::max_subnormal().uintval(); constexpr StorageType COUNT = 10'001; - constexpr StorageType STEP = - (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT; - for (StorageType v = FPBits::MIN_SUBNORMAL; v <= FPBits::MAX_SUBNORMAL; - v += STEP) { + constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; + for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) { T x = T(FPBits(v)); if (isnan(x) || isinf(x) || x == 0.0) continue; @@ -95,11 +95,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { void test_normal_range(typename ILogbFunc::Func func) { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; + constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); + constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); constexpr StorageType COUNT = 10'001; - constexpr StorageType STEP = - (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT; - for (StorageType v = FPBits::MIN_NORMAL; v <= FPBits::MAX_NORMAL; - v += STEP) { + constexpr StorageType STEP = (MAX_NORMAL - 
MIN_NORMAL) / COUNT; + for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) { T x = T(FPBits(v)); if (isnan(x) || isinf(x) || x == 0.0) continue; diff --git a/libc/test/src/math/smoke/LdExpTest.h b/libc/test/src/math/smoke/LdExpTest.h index 25120ba3646fd..3a4baabbf10e6 100644 --- a/libc/test/src/math/smoke/LdExpTest.h +++ b/libc/test/src/math/smoke/LdExpTest.h @@ -29,7 +29,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); // A normalized mantissa to be used with tests. static constexpr StorageType MANTISSA = NormalFloat::ONE + 0x1234; diff --git a/libc/test/src/math/smoke/NextAfterTest.h b/libc/test/src/math/smoke/NextAfterTest.h index bdf3da627180a..1dd07b3d2f93d 100644 --- a/libc/test/src/math/smoke/NextAfterTest.h +++ b/libc/test/src/math/smoke/NextAfterTest.h @@ -38,12 +38,14 @@ class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); - - const StorageType min_subnormal = FPBits::MIN_SUBNORMAL; - const StorageType max_subnormal = FPBits::MAX_SUBNORMAL; - const StorageType min_normal = FPBits::MIN_NORMAL; - const StorageType max_normal = FPBits::MAX_NORMAL; + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + + static constexpr StorageType min_subnormal = + FPBits::min_subnormal().uintval(); + static constexpr StorageType max_subnormal = + FPBits::max_subnormal().uintval(); + static constexpr StorageType min_normal = FPBits::min_normal().uintval(); + static constexpr StorageType max_normal = FPBits::max_normal().uintval(); public: typedef T (*NextAfterFunc)(T, T); diff --git a/libc/test/src/math/smoke/NextTowardTest.h b/libc/test/src/math/smoke/NextTowardTest.h index af4e0ab14531c..d65cc5d84d35a 100644 --- a/libc/test/src/math/smoke/NextTowardTest.h +++ b/libc/test/src/math/smoke/NextTowardTest.h @@ -40,16 +40,18 @@ class NextTowardTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); - - const long double to_zero = ToFPBits::zero(); - const long double to_neg_zero = ToFPBits::zero(Sign::NEG); - const long double to_nan = ToFPBits::build_quiet_nan(1); - - const StorageType min_subnormal = FPBits::MIN_SUBNORMAL; - const StorageType max_subnormal = FPBits::MAX_SUBNORMAL; - const StorageType min_normal = FPBits::MIN_NORMAL; - const StorageType max_normal = FPBits::MAX_NORMAL; + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + + const long double to_zero = ToFPBits::zero().get_val(); + const long double to_neg_zero = ToFPBits::zero(Sign::NEG).get_val(); + const long double to_nan = ToFPBits::build_quiet_nan(Sign::POS, 1).get_val(); + + static constexpr StorageType min_subnormal = + FPBits::min_subnormal().uintval(); + static constexpr StorageType max_subnormal = + FPBits::max_subnormal().uintval(); + static constexpr StorageType min_normal = FPBits::min_normal().uintval(); + static constexpr StorageType max_normal = FPBits::max_normal().uintval(); public: typedef T (*NextTowardFunc)(T, long double); diff --git 
a/libc/test/src/math/smoke/RIntTest.h b/libc/test/src/math/smoke/RIntTest.h index b242c7e441b69..7bbbe54301570 100644 --- a/libc/test/src/math/smoke/RIntTest.h +++ b/libc/test/src/math/smoke/RIntTest.h @@ -35,7 +35,7 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); public: void testSpecialNumbers(RIntFunc func) { diff --git a/libc/test/src/math/smoke/RemQuoTest.h b/libc/test/src/math/smoke/RemQuoTest.h index 93e20747e5263..5f5cbd4964a62 100644 --- a/libc/test/src/math/smoke/RemQuoTest.h +++ b/libc/test/src/math/smoke/RemQuoTest.h @@ -25,7 +25,7 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); public: typedef T (*RemQuoFunc)(T, T, int *); diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h index 2703b78f00e0f..77c65aa492e22 100644 --- a/libc/test/src/math/smoke/RoundToIntegerTest.h +++ b/libc/test/src/math/smoke/RoundToIntegerTest.h @@ -30,11 +30,17 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const F zero = F(LIBC_NAMESPACE::fputil::FPBits::zero(Sign::POS)); - const F neg_zero = F(LIBC_NAMESPACE::fputil::FPBits::zero(Sign::NEG)); - const F inf = F(LIBC_NAMESPACE::fputil::FPBits::inf(Sign::POS)); - const F neg_inf = F(LIBC_NAMESPACE::fputil::FPBits::inf(Sign::NEG)); - const F nan = F(LIBC_NAMESPACE::fputil::FPBits::build_quiet_nan(1)); + const F zero = F(FPBits::zero(Sign::POS)); + const F neg_zero = F(FPBits::zero(Sign::NEG)); + const F inf = F(FPBits::inf(Sign::POS)); + const F neg_inf = F(FPBits::inf(Sign::NEG)); + const F nan = F(FPBits::build_quiet_nan(Sign::POS, 1)); + + static constexpr StorageType MAX_SUBNORMAL = + FPBits::max_subnormal().uintval(); + static constexpr StorageType MIN_SUBNORMAL = + FPBits::min_subnormal().uintval(); + static constexpr I INTEGER_MIN = I(1) << (sizeof(I) * 8 - 1); static constexpr I INTEGER_MAX = -(INTEGER_MIN + 1); @@ -111,10 +117,8 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { void testSubnormalRange(RoundToIntegerFunc func) { constexpr StorageType COUNT = 1'000'001; - constexpr StorageType STEP = - (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT; - for (StorageType i = FPBits::MIN_SUBNORMAL; i <= FPBits::MAX_SUBNORMAL; - i += STEP) { + constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; + for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) { F x = F(FPBits(i)); if (x == F(0.0)) continue; diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index 344853beaf9fa..b22378b22ab12 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -585,8 +585,10 @@ TEST(LlvmLibcSPrintfTest, OctConv) { TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) { ForceRoundingMode r(RoundingMode::Nearest); - double inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + double inf = 
LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); written = LIBC_NAMESPACE::sprintf(buff, "%a", 1.0); ASSERT_STREQ_LEN(written, buff, "0x1p+0"); @@ -949,11 +951,15 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) { TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) { ForceRoundingMode r(RoundingMode::Nearest); - double inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); - long double ld_inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - long double ld_nan = - LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + double inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); + long double ld_inf = + LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + long double ld_nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); char big_buff[10000]; // Used for long doubles and other extremely wide // numbers. @@ -1790,8 +1796,10 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) { TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) { ForceRoundingMode r(RoundingMode::Nearest); - double inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + double inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); written = LIBC_NAMESPACE::sprintf(buff, "%e", 1.0); ASSERT_STREQ_LEN(written, buff, "1.000000e+00"); @@ -2422,8 +2430,10 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) { TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) { ForceRoundingMode r(RoundingMode::Nearest); - double inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + double inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); written = LIBC_NAMESPACE::sprintf(buff, "%g", 1.0); ASSERT_STREQ_LEN(written, buff, "1"); diff --git a/libc/test/src/stdio/sscanf_test.cpp b/libc/test/src/stdio/sscanf_test.cpp index db3c48cdbf7a2..db67c25029133 100644 --- a/libc/test/src/stdio/sscanf_test.cpp +++ b/libc/test/src/stdio/sscanf_test.cpp @@ -230,8 +230,10 @@ TEST(LlvmLibcSScanfTest, FloatConvSimple) { int ret_val; float result = 0; - float inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - float nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + float inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + float nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); ret_val = LIBC_NAMESPACE::sscanf("123", "%f", &result); EXPECT_EQ(ret_val, 1); @@ -294,9 +296,10 @@ TEST(LlvmLibcSScanfTest, FloatConvLengthModifier) { double d_result = 0; long double ld_result = 0; - double d_inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - long double ld_nan = - LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + double d_inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + long double ld_nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); ret_val = LIBC_NAMESPACE::sscanf("123", "%lf", &d_result); EXPECT_EQ(ret_val, 1); @@ -391,8 +394,10 @@ TEST(LlvmLibcSScanfTest, FloatConvComplexParsing) { int ret_val; float result = 0; - float inf = 
LIBC_NAMESPACE::fputil::FPBits::inf(); - float nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + float inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + float nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); ret_val = LIBC_NAMESPACE::sscanf("0x1.0e3", "%f", &result); EXPECT_EQ(ret_val, 1); @@ -463,7 +468,7 @@ TEST(LlvmLibcSScanfTest, FloatConvMaxWidth) { int ret_val; float result = 0; - float inf = LIBC_NAMESPACE::fputil::FPBits::inf(); + float inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); ret_val = LIBC_NAMESPACE::sscanf("123", "%3f", &result); EXPECT_EQ(ret_val, 1); diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index b6ca525db6cf7..06a231c7d94d0 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -49,7 +49,7 @@ template <> struct ExtraPrecision { template static inline unsigned int get_precision(double ulp_tolerance) { if (ulp_tolerance <= 0.5) { - return LIBC_NAMESPACE::fputil::FPBits::MANTISSA_PRECISION; + return LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN + 1; } else { return ExtraPrecision::VALUE; } From 626d0fa30addc6e8fb9af4a80df853af645733b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 23 Jan 2024 11:34:26 +0100 Subject: [PATCH 587/843] [llvm-jitlink-executor] Fix unused function warning with LLVM_ENABLE_THREADS=0 (NFC) --- .../llvm-jitlink-executor/llvm-jitlink-executor.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp index 20205eed82aa7..c98bb26b954d9 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink-executor/llvm-jitlink-executor.cpp @@ -112,6 +112,8 @@ int openListener(std::string Host, std::string PortStr) { #endif // LLVM_ON_UNIX } +#if LLVM_ENABLE_THREADS + // JITLink debug support plugins put information about JITed code in this GDB // JIT Interface global from OrcTargetProcess. extern "C" struct jit_descriptor __jit_debug_descriptor; @@ -123,6 +125,8 @@ static void *findLastDebugDescriptorEntryPtr() { return Last; } +#endif + int main(int argc, char *argv[]) { #if LLVM_ENABLE_THREADS From 8e09f13d679df6c34dd03d8ad6116c832318cab0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 23 Jan 2024 11:35:42 +0100 Subject: [PATCH 588/843] [OrcJITTests] Fix warning: suggest explicit braces to avoid ambiguous 'else' (NFC) Likely related to platform-specific expansion of gtest macros: ``` llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp:1276:12: warning: suggest explicit braces to avoid ambiguous 'else' [-Wdangling-else] ``` --- .../JITLink/EHFrameSupportTests.cpp | 3 ++- .../ExecutionEngine/Orc/CoreAPIsTest.cpp | 18 ++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/llvm/unittests/ExecutionEngine/JITLink/EHFrameSupportTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/EHFrameSupportTests.cpp index 3be932aa77cd6..e7e09b3fe40b1 100644 --- a/llvm/unittests/ExecutionEngine/JITLink/EHFrameSupportTests.cpp +++ b/llvm/unittests/ExecutionEngine/JITLink/EHFrameSupportTests.cpp @@ -185,8 +185,9 @@ TEST(EHFrameCFIBlockInspector, BasicSuccessCase) { if (CFIBI.isCIE()) { CIEs.push_back(B); // If this CIE has an edge, check that getPersonalityEdge returns it. 
---
 .../JITLink/EHFrameSupportTests.cpp           |  3 ++-
 .../ExecutionEngine/Orc/CoreAPIsTest.cpp      | 18 ++++++++++++------
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/llvm/unittests/ExecutionEngine/JITLink/EHFrameSupportTests.cpp b/llvm/unittests/ExecutionEngine/JITLink/EHFrameSupportTests.cpp
index 3be932aa77cd6..e7e09b3fe40b1 100644
--- a/llvm/unittests/ExecutionEngine/JITLink/EHFrameSupportTests.cpp
+++ b/llvm/unittests/ExecutionEngine/JITLink/EHFrameSupportTests.cpp
@@ -185,8 +185,9 @@ TEST(EHFrameCFIBlockInspector, BasicSuccessCase) {
     if (CFIBI.isCIE()) {
       CIEs.push_back(B);
       // If this CIE has an edge, check that getPersonalityEdge returns it.
-      if (B->edges_size() != 0)
+      if (B->edges_size() != 0) {
         EXPECT_TRUE(!!CFIBI.getPersonalityEdge());
+      }
     }
   }
   ASSERT_EQ(CIEs.size(), 2U);
diff --git a/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp b/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
index a136e941fa948..14934e000d205 100644
--- a/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
+++ b/llvm/unittests/ExecutionEngine/Orc/CoreAPIsTest.cpp
@@ -1193,8 +1193,9 @@ TEST_F(CoreAPIsStandardTest, SimpleAsynchronousGeneratorTest) {
       [&](Expected<SymbolMap> Result) {
         LookupCompleted = true;
         EXPECT_THAT_EXPECTED(Result, Succeeded());
-        if (Result)
+        if (Result) {
           EXPECT_EQ(*Result, SymbolMap({{Foo, FooSym}}));
+        }
       },
       NoDependenciesToRegister);
 
@@ -1228,8 +1229,9 @@ TEST_F(CoreAPIsStandardTest, BlockedGeneratorAutoSuspensionTest) {
       [&](Expected<SymbolMap> Result) {
         Lookup1Completed = true;
         EXPECT_THAT_EXPECTED(Result, Succeeded());
-        if (Result)
+        if (Result) {
           EXPECT_EQ(*Result, SymbolMap({{Foo, FooSym}}));
+        }
       },
       NoDependenciesToRegister);
 
@@ -1256,8 +1258,9 @@
       [&](Expected<SymbolMap> Result) {
         Lookup2Completed = true;
         EXPECT_THAT_EXPECTED(Result, Succeeded());
-        if (Result)
+        if (Result) {
           EXPECT_EQ(*Result, SymbolMap({{Bar, BarSym}}));
+        }
       },
       NoDependenciesToRegister);
 
@@ -1273,8 +1276,9 @@
       [&](Expected<SymbolMap> Result) {
         Lookup3Completed = true;
         EXPECT_THAT_EXPECTED(Result, Succeeded());
-        if (Result)
+        if (Result) {
           EXPECT_EQ(*Result, SymbolMap({{Bar, BarSym}}));
+        }
       },
       NoDependenciesToRegister);
 
@@ -1288,8 +1292,9 @@
       [&](Expected<SymbolMap> Result) {
         Lookup4Completed = true;
         EXPECT_THAT_EXPECTED(Result, Succeeded());
-        if (Result)
+        if (Result) {
           EXPECT_EQ(*Result, SymbolMap({{Baz, BazSym}}));
+        }
       },
       NoDependenciesToRegister);
 
@@ -1314,8 +1319,9 @@
   EXPECT_NE(G.Lookup, std::nullopt);
 
   // Check that the most recently captured lookup is lookup 4 (for baz).
-  if (G.Lookup)
+  if (G.Lookup) {
     EXPECT_EQ(G.Lookup->Names.begin()->first, Baz);
+  }
 
   cantFail(JD.define(absoluteSymbols({{Baz, BazSym}})));
   G.takeLookup().LS.continueLookup(Error::success());

From b524eed9259e26b5812e2b55cc05e42fda217486 Mon Sep 17 00:00:00 2001
From: Guillaume Chatelet
Date: Tue, 23 Jan 2024 11:51:18 +0100
Subject: [PATCH 589/843] Revert "[libc] Remove unnecessary `FPBits` functions and properties" (#79118)

Reverts llvm/llvm-project#79113. It broke the AArch64 build bot machines.
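For reviewers scanning the large diff below, a minimal sketch of the two
API shapes this revert toggles between (an illustrative fragment only: it
assumes LLVM libc's internal FPBits.h and the `double` instantiation, and
the variable names are made up):

```c++
#include "src/__support/FPUtil/FPBits.h" // libc-internal header
#include <stdint.h>

using FPBits = LIBC_NAMESPACE::fputil::FPBits<double>;

// Shape restored by this revert: helpers that return the value type, and
// raw StorageType constants such as MIN_NORMAL directly on FPBits.
double QuietNaN = FPBits::build_quiet_nan(1);
uint64_t MinNormalBits = FPBits::MIN_NORMAL;

// Shape being backed out (from llvm/llvm-project#79113): helpers return an
// FPBits object and call sites convert explicitly, e.g.
//   double QuietNaN = FPBits::build_quiet_nan(Sign::POS, 1).get_val();
//   uint64_t MinNormalBits = FPBits::min_normal().uintval();
```

Both spellings appear side by side in the +/- lines below, which makes the
direction of each hunk easier to follow.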
---
 libc/fuzzing/stdlib/strtofloat_fuzz.cpp       |  2 +-
 .../FPUtil/DivisionAndRemainderOperations.h   |  2 +-
 libc/src/__support/FPUtil/FPBits.h            | 51 +++++++++---
 libc/src/__support/FPUtil/Hypot.h             |  4 +-
 .../__support/FPUtil/ManipulationFunctions.h  | 24 +++---
 libc/src/__support/FPUtil/NormalFloat.h       |  2 +-
 libc/src/__support/FPUtil/dyadic_float.h      |  2 +-
 .../src/__support/FPUtil/except_value_utils.h |  6 +-
 libc/src/__support/FPUtil/generic/FMA.h       | 10 ++-
 libc/src/__support/FPUtil/generic/FMod.h      |  2 +-
 libc/src/__support/FPUtil/generic/sqrt.h      | 18 ++---
 .../FPUtil/generic/sqrt_80_bit_long_double.h  |  8 +-
 .../FPUtil/x86_64/NextAfterLongDouble.h       | 58 ++++++-------
 libc/src/__support/common.h                   |  1 +
 libc/src/__support/str_to_float.h             |  4 +-
 libc/src/math/generic/acosf.cpp               |  2 +-
 libc/src/math/generic/acoshf.cpp              |  2 +-
 libc/src/math/generic/asinf.cpp               |  2 +-
 libc/src/math/generic/atanhf.cpp              |  4 +-
 libc/src/math/generic/cosf.cpp                |  2 +-
 libc/src/math/generic/coshf.cpp               |  6 +-
 libc/src/math/generic/exp.cpp                 |  4 +-
 libc/src/math/generic/exp10.cpp               |  4 +-
 libc/src/math/generic/exp10f_impl.h           |  6 +-
 libc/src/math/generic/exp2.cpp                |  4 +-
 libc/src/math/generic/exp2f_impl.h            |  6 +-
 libc/src/math/generic/expf.cpp                |  4 +-
 libc/src/math/generic/expm1.cpp               |  4 +-
 libc/src/math/generic/expm1f.cpp              |  4 +-
 libc/src/math/generic/log.cpp                 | 11 ++-
 libc/src/math/generic/log10.cpp               | 11 ++-
 libc/src/math/generic/log10f.cpp              |  8 +-
 libc/src/math/generic/log1p.cpp               |  7 +-
 libc/src/math/generic/log1pf.cpp              |  4 +-
 libc/src/math/generic/log2.cpp                | 11 ++-
 libc/src/math/generic/log2f.cpp               |  8 +-
 libc/src/math/generic/logf.cpp                |  9 +--
 libc/src/math/generic/powf.cpp                | 13 ++-
 libc/src/math/generic/sincosf.cpp             |  4 +-
 libc/src/math/generic/sinf.cpp                |  2 +-
 libc/src/math/generic/sinhf.cpp               |  6 +-
 libc/src/math/generic/tanf.cpp                |  2 +-
 libc/test/UnitTest/FPMatcher.h                | 40 ++++-----
 .../test/src/__support/FPUtil/fpbits_test.cpp | 81 +++++++++++--------
 libc/test/src/math/FDimTest.h                 |  2 +-
 libc/test/src/math/FmaTest.h                  | 37 ++++-----
 libc/test/src/math/HypotTest.h                | 43 +++++-----
 libc/test/src/math/ILogbTest.h                | 22 ++---
 libc/test/src/math/LdExpTest.h                |  2 +-
 libc/test/src/math/NextAfterTest.h            | 10 +--
 libc/test/src/math/RIntTest.h                 | 21 +++--
 libc/test/src/math/RemQuoTest.h               | 25 +++---
 libc/test/src/math/RoundToIntegerTest.h       | 22 +++--
 .../BinaryOpSingleOutputDiff.h                | 16 ++--
 .../SingleInputSingleOutputDiff.h             |  7 +-
 libc/test/src/math/smoke/FDimTest.h           |  2 +-
 libc/test/src/math/smoke/FmaTest.h            | 16 ++--
 libc/test/src/math/smoke/HypotTest.h          | 20 ++---
 libc/test/src/math/smoke/ILogbTest.h          | 18 ++--
 libc/test/src/math/smoke/LdExpTest.h          |  2 +-
 libc/test/src/math/smoke/NextAfterTest.h      | 14 ++--
 libc/test/src/math/smoke/NextTowardTest.h     | 22 +++--
 libc/test/src/math/smoke/RIntTest.h           |  2 +-
 libc/test/src/math/smoke/RemQuoTest.h         |  2 +-
 libc/test/src/math/smoke/RoundToIntegerTest.h | 22 +++--
 libc/test/src/stdio/sprintf_test.cpp          | 32 +++----
 libc/test/src/stdio/sscanf_test.cpp           | 21 ++---
 libc/utils/MPFRWrapper/MPFRUtils.cpp          |  2 +-
 68 files changed, 421 insertions(+), 426 deletions(-)

diff --git a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
index b773043bda67d..affef6fcf549e 100644
--- a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
+++ b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp
@@ -28,7 +28,7 @@ using LIBC_NAMESPACE::fputil::FPBits;
 // exponent. Subnormals have a lower effective precision since they don't
 // necessarily use all of the bits of the mantissa.
template inline constexpr int effective_precision(int exponent) { - const int full_precision = FPBits::FRACTION_LEN + 1; + const int full_precision = FPBits::MANTISSA_PRECISION; // This is intended to be 0 when the exponent is the lowest normal and // increase as the exponent's magnitude increases. diff --git a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h index ef9593a42b005..1798310c3e31e 100644 --- a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h +++ b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h @@ -31,7 +31,7 @@ LIBC_INLINE T remquo(T x, T y, int &q) { if (ybits.is_nan()) return y; if (xbits.is_inf() || ybits.is_zero()) - return FPBits::build_quiet_nan(fputil::Sign::POS, 1).get_val(); + return FPBits::build_quiet_nan(1); if (xbits.is_zero()) { q = 0; diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index 0a79b505ecbe1..2465158bb2cdf 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -390,7 +390,7 @@ struct FPRepSem : public FPStorage { return exp_bits() == encode(BiasedExp::BITS_ALL_ZEROES()); } LIBC_INLINE constexpr bool is_normal() const { - return is_finite() && !is_subnormal(); + return is_finite() && !UP::is_subnormal(); } // Returns the mantissa with the implicit bit set iff the current // value is a valid normal number. @@ -556,14 +556,6 @@ struct FPRep : public FPRepSem { using UP::FRACTION_MASK; using UP::SIGN_MASK; - // Comparison - LIBC_INLINE constexpr friend bool operator==(FPRep a, FPRep b) { - return a.uintval() == b.uintval(); - } - LIBC_INLINE constexpr friend bool operator!=(FPRep a, FPRep b) { - return a.uintval() != b.uintval(); - } - // Representation LIBC_INLINE constexpr StorageType uintval() const { return bits & FP_MASK; } LIBC_INLINE constexpr void set_uintval(StorageType value) { @@ -706,6 +698,16 @@ struct FPBits final : public internal::FPRep(), FPBits> { using UP::bits; // Constants. + LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION = + UP::FRACTION_LEN + 1; + LIBC_INLINE_VAR static constexpr StorageType MIN_NORMAL = + UP::min_normal(Sign::POS).uintval(); + LIBC_INLINE_VAR static constexpr StorageType MAX_NORMAL = + UP::max_normal(Sign::POS).uintval(); + LIBC_INLINE_VAR static constexpr StorageType MIN_SUBNORMAL = + UP::min_subnormal(Sign::POS).uintval(); + LIBC_INLINE_VAR static constexpr StorageType MAX_SUBNORMAL = + UP::max_subnormal(Sign::POS).uintval(); LIBC_INLINE_VAR static constexpr int MAX_BIASED_EXPONENT = (1 << UP::EXP_LEN) - 1; @@ -729,6 +731,37 @@ struct FPBits final : public internal::FPRep(), FPBits> { LIBC_INLINE constexpr explicit operator T() const { return get_val(); } + // Methods below this are used by tests. + // TODO: inline and remove. 
+ LIBC_INLINE static constexpr T one(Sign sign = Sign::POS) { + return T(UP::one(sign)); + } + LIBC_INLINE static constexpr T zero(Sign sign = Sign::POS) { + return T(UP::zero(sign)); + } + LIBC_INLINE static constexpr T inf(Sign sign = Sign::POS) { + return T(UP::inf(sign)); + } + LIBC_INLINE static constexpr T min_normal() { + return T(UP::min_normal(Sign::POS)); + } + LIBC_INLINE static constexpr T max_normal() { + return T(UP::max_normal(Sign::POS)); + } + LIBC_INLINE static constexpr T min_denormal() { + return T(UP::min_subnormal(Sign::POS)); + } + LIBC_INLINE static constexpr T max_denormal() { + return T(UP::max_subnormal(Sign::POS)); + } + LIBC_INLINE static constexpr T build_nan(StorageType v) { + return T(UP::build_nan(Sign::POS, v)); + } + LIBC_INLINE static constexpr T build_quiet_nan(StorageType v, + Sign sign = Sign::POS) { + return T(UP::build_quiet_nan(sign, v)); + } + // TODO: Use an uint32_t for 'biased_exp'. LIBC_INLINE static constexpr FPBits create_value(Sign sign, StorageType biased_exp, StorageType mantissa) { diff --git a/libc/src/__support/FPUtil/Hypot.h b/libc/src/__support/FPUtil/Hypot.h index 82237dec09e42..c38a40dfb0898 100644 --- a/libc/src/__support/FPUtil/Hypot.h +++ b/libc/src/__support/FPUtil/Hypot.h @@ -197,7 +197,7 @@ LIBC_INLINE T hypot(T x, T y) { if (int round_mode = quick_get_round(); round_mode == FE_TONEAREST || round_mode == FE_UPWARD) return T(FPBits_t::inf()); - return T(FPBits_t::max_normal()); + return T(FPBits_t(FPBits_t::MAX_NORMAL)); } } else { // For denormal result, we simply move the leading bit of the result to @@ -254,7 +254,7 @@ LIBC_INLINE T hypot(T x, T y) { if (out_exp >= FPBits_t::MAX_BIASED_EXPONENT) { if (round_mode == FE_TONEAREST || round_mode == FE_UPWARD) return T(FPBits_t::inf()); - return T(FPBits_t::max_normal()); + return T(FPBits_t(FPBits_t::MAX_NORMAL)); } } diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h index d7114625a9b31..81c8281f3c7bb 100644 --- a/libc/src/__support/FPUtil/ManipulationFunctions.h +++ b/libc/src/__support/FPUtil/ManipulationFunctions.h @@ -108,7 +108,7 @@ LIBC_INLINE T logb(T x) { return x; } else if (bits.is_inf()) { // Return positive infinity. - return T(FPBits::inf()); + return T(FPBits::inf(Sign::POS)); } NormalFloat normal(bits); @@ -127,7 +127,7 @@ LIBC_INLINE T ldexp(T x, int exp) { // that adding |exp| to it does not lead to integer rollover. But, if |exp| // value is larger the exponent range for type T, then we can return infinity // early. Because the result of the ldexp operation can be a subnormal number, - // we need to accommodate the (mantissaWidth + 1) worth of shift in + // we need to accommodate the (mantissaWidht + 1) worth of shift in // calculating the limit. 
int exp_limit = FPBits::MAX_BIASED_EXPONENT + FPBits::FRACTION_LEN + 1; if (exp > exp_limit) @@ -164,22 +164,26 @@ LIBC_INLINE T nextafter(T from, U to) { return static_cast(to); using StorageType = typename FPBits::StorageType; - if (from != T(0)) { - if ((static_cast(from) < to) == (from > T(0))) { - from_bits = FPBits(StorageType(from_bits.uintval() + 1)); + StorageType int_val = from_bits.uintval(); + if (from != FPBits::zero()) { + if ((static_cast(from) < to) == (from > FPBits::zero())) { + ++int_val; } else { - from_bits = FPBits(StorageType(from_bits.uintval() - 1)); + --int_val; } } else { - from_bits = FPBits::min_subnormal(to_bits.sign()); + int_val = FPBits::MIN_SUBNORMAL; + if (to_bits.is_neg()) + int_val |= FPBits::SIGN_MASK; } - if (from_bits.is_subnormal()) + StorageType exponent_bits = int_val & FPBits::EXP_MASK; + if (exponent_bits == StorageType(0)) raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); - else if (from_bits.is_inf()) + else if (exponent_bits == FPBits::EXP_MASK) raise_except_if_required(FE_OVERFLOW | FE_INEXACT); - return from_bits.get_val(); + return cpp::bit_cast(int_val); } } // namespace fputil diff --git a/libc/src/__support/FPUtil/NormalFloat.h b/libc/src/__support/FPUtil/NormalFloat.h index fa4da33b5b17f..cfa9e14175105 100644 --- a/libc/src/__support/FPUtil/NormalFloat.h +++ b/libc/src/__support/FPUtil/NormalFloat.h @@ -215,7 +215,7 @@ template <> LIBC_INLINE NormalFloat::operator long double() const { // Max exponent is of the form 0xFF...E. That is why -2 and not -1. constexpr int MAX_EXPONENT_VALUE = (1 << LDBits::EXP_LEN) - 2; if (biased_exponent > MAX_EXPONENT_VALUE) { - return LDBits::inf(sign).get_val(); + return LDBits::inf(sign); } FPBits result(0.0l); diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 888d7ffec241e..5449f5561d569 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -93,7 +93,7 @@ template struct DyadicFloat { return 0.0; // Assume that it is normalized, and output is also normal. - constexpr uint32_t PRECISION = FPBits::FRACTION_LEN + 1; + constexpr uint32_t PRECISION = FPBits::MANTISSA_PRECISION; using output_bits_t = typename FPBits::StorageType; int exp_hi = exponent + static_cast((Bits - 1) + FPBits::EXP_BIAS); diff --git a/libc/src/__support/FPUtil/except_value_utils.h b/libc/src/__support/FPUtil/except_value_utils.h index 1e0381194009d..89849540315f6 100644 --- a/libc/src/__support/FPUtil/except_value_utils.h +++ b/libc/src/__support/FPUtil/except_value_utils.h @@ -102,13 +102,15 @@ template struct ExceptValues { // Helper functions to set results for exceptional cases. template LIBC_INLINE T round_result_slightly_down(T value_rn) { volatile T tmp = value_rn; - tmp -= FPBits::min_normal().get_val(); + const T MIN_NORMAL = FPBits::min_normal(); + tmp = tmp - MIN_NORMAL; return tmp; } template LIBC_INLINE T round_result_slightly_up(T value_rn) { volatile T tmp = value_rn; - tmp += FPBits::min_normal().get_val(); + const T MIN_NORMAL = FPBits::min_normal(); + tmp = tmp + MIN_NORMAL; return tmp; } diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h index 5c36463ea5021..6285cac1983d1 100644 --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -130,9 +130,9 @@ template <> LIBC_INLINE double fma(double x, double y, double z) { return x * y + z; // Extract mantissa and append hidden leading bits. 
- UInt128 x_mant = x_bits.get_explicit_mantissa(); - UInt128 y_mant = y_bits.get_explicit_mantissa(); - UInt128 z_mant = z_bits.get_explicit_mantissa(); + UInt128 x_mant = x_bits.get_mantissa() | FPBits::MIN_NORMAL; + UInt128 y_mant = y_bits.get_mantissa() | FPBits::MIN_NORMAL; + UInt128 z_mant = z_bits.get_mantissa() | FPBits::MIN_NORMAL; // If the exponent of the product x*y > the exponent of z, then no extra // precision beside the entire product x*y is needed. On the other hand, when @@ -255,7 +255,9 @@ template <> LIBC_INLINE double fma(double x, double y, double z) { if ((round_mode == FE_TOWARDZERO) || (round_mode == FE_UPWARD && prod_sign.is_neg()) || (round_mode == FE_DOWNWARD && prod_sign.is_pos())) { - return FPBits::max_normal(prod_sign).get_val(); + result = FPBits::MAX_NORMAL; + return prod_sign.is_neg() ? -cpp::bit_cast(result) + : cpp::bit_cast(result); } return static_cast(FPBits::inf(prod_sign)); } diff --git a/libc/src/__support/FPUtil/generic/FMod.h b/libc/src/__support/FPUtil/generic/FMod.h index 18355b801dbc7..f4000b97751ef 100644 --- a/libc/src/__support/FPUtil/generic/FMod.h +++ b/libc/src/__support/FPUtil/generic/FMod.h @@ -124,7 +124,7 @@ template struct FModExceptionalInputHandler { LIBC_INLINE static bool pre_check(T x, T y, T &out) { using FPB = fputil::FPBits; - const T quiet_nan = FPB::build_quiet_nan().get_val(); + const T quiet_nan = FPB::build_quiet_nan(0); FPB sx(x), sy(y); if (LIBC_LIKELY(!sy.is_zero() && !sy.is_inf_or_nan() && !sx.is_inf_or_nan())) { diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h index 0a0690ec1463b..21ae9d081d3f1 100644 --- a/libc/src/__support/FPUtil/generic/sqrt.h +++ b/libc/src/__support/FPUtil/generic/sqrt.h @@ -71,19 +71,15 @@ LIBC_INLINE cpp::enable_if_t, T> sqrt(T x) { return x86::sqrt(x); } else { // IEEE floating points formats. - using Sign = fputil::Sign; - using FPBits_t = typename fputil::FPBits; - using StorageType = typename FPBits_t::StorageType; - constexpr StorageType ONE = StorageType(1) << FPBits_t::FRACTION_LEN; - constexpr auto FLT_NAN = - FPBits_t::build_quiet_nan(Sign::POS, ONE >> 1).get_val(); + using StorageType = typename FPBits::StorageType; + constexpr StorageType ONE = StorageType(1) << FPBits::FRACTION_LEN; - FPBits_t bits(x); + FPBits bits(x); if (bits.is_inf_or_nan()) { if (bits.is_neg() && (bits.get_mantissa() == 0)) { // sqrt(-Inf) = NaN - return FLT_NAN; + return FPBits::build_quiet_nan(ONE >> 1); } else { // sqrt(NaN) = NaN // sqrt(+Inf) = +Inf @@ -95,7 +91,7 @@ LIBC_INLINE cpp::enable_if_t, T> sqrt(T x) { return x; } else if (bits.is_neg()) { // sqrt( negative numbers ) = NaN - return FLT_NAN; + return FPBits::build_quiet_nan(ONE >> 1); } else { int x_exp = bits.get_exponent(); StorageType x_mant = bits.get_mantissa(); @@ -149,10 +145,10 @@ LIBC_INLINE cpp::enable_if_t, T> sqrt(T x) { } // Remove hidden bit and append the exponent field. 
- x_exp = ((x_exp >> 1) + FPBits_t::EXP_BIAS); + x_exp = ((x_exp >> 1) + FPBits::EXP_BIAS); y = (y - ONE) | - (static_cast(x_exp) << FPBits_t::FRACTION_LEN); + (static_cast(x_exp) << FPBits::FRACTION_LEN); switch (quick_get_round()) { case FE_TONEAREST: diff --git a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h index b0a3776029ca7..4f8d136938f56 100644 --- a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h +++ b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h @@ -38,16 +38,14 @@ LIBC_INLINE long double sqrt(long double x); LIBC_INLINE long double sqrt(long double x) { using LDBits = FPBits; using StorageType = typename LDBits::StorageType; - using Sign = fputil::Sign; constexpr StorageType ONE = StorageType(1) << int(LDBits::FRACTION_LEN); - constexpr auto LDNAN = LDBits::build_quiet_nan(Sign::POS, ONE >> 1).get_val(); - LDBits bits(x); + FPBits bits(x); if (bits.is_inf_or_nan()) { if (bits.is_neg() && (bits.get_mantissa() == 0)) { // sqrt(-Inf) = NaN - return LDNAN; + return LDBits::build_quiet_nan(ONE >> 1); } else { // sqrt(NaN) = NaN // sqrt(+Inf) = +Inf @@ -59,7 +57,7 @@ LIBC_INLINE long double sqrt(long double x) { return x; } else if (bits.is_neg()) { // sqrt( negative numbers ) = NaN - return LDNAN; + return LDBits::build_quiet_nan(ONE >> 1); } else { int x_exp = bits.get_explicit_exponent(); StorageType x_mant = bits.get_mantissa(); diff --git a/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h b/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h index d1c76ba954b93..512f5de4e7931 100644 --- a/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h +++ b/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h @@ -43,18 +43,16 @@ LIBC_INLINE long double nextafter(long double from, long double to) { } using StorageType = FPBits::StorageType; - + constexpr StorageType SIGN_VAL = (StorageType(1) << 79); constexpr StorageType FRACTION_MASK = FPBits::FRACTION_MASK; - // StorageType int_val = from_bits.uintval(); - if (from == 0.0l) { // +0.0 / -0.0 - from_bits = FPBits::min_subnormal(from > to ? Sign::NEG : Sign::POS); - } else if (from < 0.0l) { - if (to < from) { // toward -inf - if (from_bits == FPBits::max_subnormal(Sign::NEG)) { + StorageType int_val = from_bits.uintval(); + if (from < 0.0l) { + if (from > to) { + if (int_val == (SIGN_VAL + FPBits::MAX_SUBNORMAL)) { // We deal with normal/subnormal boundary separately to avoid // dealing with the implicit bit. - from_bits = FPBits::min_normal(Sign::NEG); - } else if (from_bits.get_mantissa() == FRACTION_MASK) { + int_val = SIGN_VAL + FPBits::MIN_NORMAL; + } else if ((int_val & FRACTION_MASK) == FRACTION_MASK) { from_bits.set_mantissa(0); // Incrementing exponent might overflow the value to infinity, // which is what is expected. Since NaNs are handling separately, @@ -64,40 +62,45 @@ LIBC_INLINE long double nextafter(long double from, long double to) { raise_except_if_required(FE_OVERFLOW | FE_INEXACT); return from_bits.get_val(); } else { - from_bits = FPBits(StorageType(from_bits.uintval() + 1)); + ++int_val; } - } else { // toward +inf - if (from_bits == FPBits::min_normal(Sign::NEG)) { + } else { + if (int_val == (SIGN_VAL + FPBits::MIN_NORMAL)) { // We deal with normal/subnormal boundary separately to avoid // dealing with the implicit bit. 
- from_bits = FPBits::max_subnormal(Sign::NEG); - } else if (from_bits.get_mantissa() == 0) { + int_val = SIGN_VAL + FPBits::MAX_SUBNORMAL; + } else if ((int_val & FRACTION_MASK) == 0) { from_bits.set_mantissa(FRACTION_MASK); // from == 0 is handled separately so decrementing the exponent will not // lead to underflow. from_bits.set_biased_exponent(from_bits.get_biased_exponent() - 1); return from_bits.get_val(); } else { - from_bits = FPBits(StorageType(from_bits.uintval() - 1)); + --int_val; } } + } else if (from == 0.0l) { + if (from > to) + int_val = SIGN_VAL + 1; + else + int_val = 1; } else { - if (to < from) { // toward -inf - if (from_bits == FPBits::min_normal(Sign::POS)) { - from_bits = FPBits::max_subnormal(Sign::POS); - } else if (from_bits.get_mantissa() == 0) { + if (from > to) { + if (int_val == FPBits::MIN_NORMAL) { + int_val = FPBits::MAX_SUBNORMAL; + } else if ((int_val & FRACTION_MASK) == 0) { from_bits.set_mantissa(FRACTION_MASK); // from == 0 is handled separately so decrementing the exponent will not // lead to underflow. from_bits.set_biased_exponent(from_bits.get_biased_exponent() - 1); return from_bits.get_val(); } else { - from_bits = FPBits(StorageType(from_bits.uintval() - 1)); + --int_val; } - } else { // toward +inf - if (from_bits == FPBits::max_subnormal(Sign::POS)) { - from_bits = FPBits::min_normal(Sign::POS); - } else if (from_bits.get_mantissa() == FRACTION_MASK) { + } else { + if (int_val == FPBits::MAX_SUBNORMAL) { + int_val = FPBits::MIN_NORMAL; + } else if ((int_val & FRACTION_MASK) == FRACTION_MASK) { from_bits.set_mantissa(0); // Incrementing exponent might overflow the value to infinity, // which is what is expected. Since NaNs are handling separately, @@ -107,15 +110,16 @@ LIBC_INLINE long double nextafter(long double from, long double to) { raise_except_if_required(FE_OVERFLOW | FE_INEXACT); return from_bits.get_val(); } else { - from_bits = FPBits(StorageType(from_bits.uintval() + 1)); + ++int_val; } } } - if (!from_bits.get_implicit_bit()) + StorageType implicit_bit = int_val & (StorageType(1) << FPBits::FRACTION_LEN); + if (implicit_bit == StorageType(0)) raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); - return from_bits.get_val(); + return cpp::bit_cast(int_val); } } // namespace fputil diff --git a/libc/src/__support/common.h b/libc/src/__support/common.h index a153dfc363d73..53951dc131c28 100644 --- a/libc/src/__support/common.h +++ b/libc/src/__support/common.h @@ -25,6 +25,7 @@ #define LLVM_LIBC_FUNCTION_IMPL(type, name, arglist) \ LLVM_LIBC_FUNCTION_ATTR decltype(LIBC_NAMESPACE::name) \ __##name##_impl__ __asm__(#name); \ + decltype(LIBC_NAMESPACE::name) name [[gnu::alias(#name)]]; \ type __##name##_impl__ arglist #else #define LLVM_LIBC_FUNCTION_IMPL(type, name, arglist) type name arglist diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index 9655c993bee28..8aeb3d2cea03d 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -1167,7 +1167,7 @@ LIBC_INLINE StrToNumResult strtofloatingpoint(const char *__restrict src) { index = left_paren; } } - result = FPBits(result.build_quiet_nan(result.sign(), nan_mantissa)); + result = FPBits(result.build_quiet_nan(nan_mantissa, result.sign())); } } else if (tolower(src[index]) == 'i') { // INF if (tolower(src[index + 1]) == inf_string[1] && @@ -1215,7 +1215,7 @@ template LIBC_INLINE StrToNumResult strtonan(const char *arg) { nan_mantissa = static_cast(nan_mantissa_result); } - result = 
FPBits::build_quiet_nan(fputil::Sign::POS, nan_mantissa); + result = FPBits(result.build_quiet_nan(nan_mantissa)); return {T(result), 0, error}; } diff --git a/libc/src/math/generic/acosf.cpp b/libc/src/math/generic/acosf.cpp index 7b2a09101182e..67832596a67fb 100644 --- a/libc/src/math/generic/acosf.cpp +++ b/libc/src/math/generic/acosf.cpp @@ -85,7 +85,7 @@ LLVM_LIBC_FUNCTION(float, acosf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_quiet_nan().get_val(); + return x + FPBits::build_quiet_nan(0); } // When 0.5 < |x| < 1, we perform range reduction as follow: diff --git a/libc/src/math/generic/acoshf.cpp b/libc/src/math/generic/acoshf.cpp index a546cc2268b04..b0b87095fbb07 100644 --- a/libc/src/math/generic/acoshf.cpp +++ b/libc/src/math/generic/acoshf.cpp @@ -29,7 +29,7 @@ LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { // x < 1. fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits_t::build_quiet_nan().get_val(); + return FPBits_t::build_quiet_nan(0); } if (LIBC_UNLIKELY(x_u >= 0x4f8ffb03)) { diff --git a/libc/src/math/generic/asinf.cpp b/libc/src/math/generic/asinf.cpp index ee8e853063644..bc0d27c1eebc5 100644 --- a/libc/src/math/generic/asinf.cpp +++ b/libc/src/math/generic/asinf.cpp @@ -109,7 +109,7 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_nan(Sign::POS, FPBits::FRACTION_MASK).get_val(); + return x + FPBits::build_nan(FPBits::FRACTION_MASK); } // Check for exceptional values diff --git a/libc/src/math/generic/atanhf.cpp b/libc/src/math/generic/atanhf.cpp index cd0acbf24e928..fd6f5c96b6b4e 100644 --- a/libc/src/math/generic/atanhf.cpp +++ b/libc/src/math/generic/atanhf.cpp @@ -29,11 +29,11 @@ LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { if (x_abs == 0x3F80'0000U) { fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return FPBits::inf(sign).get_val(); + return FPBits::inf(sign); } else { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits::build_quiet_nan().get_val(); + return FPBits::build_quiet_nan(0); } } diff --git a/libc/src/math/generic/cosf.cpp b/libc/src/math/generic/cosf.cpp index 132e72c0f65da..89333ab19e89f 100644 --- a/libc/src/math/generic/cosf.cpp +++ b/libc/src/math/generic/cosf.cpp @@ -118,7 +118,7 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_quiet_nan().get_val(); + return x + FPBits::build_quiet_nan(0); } // Combine the results with the sine of sum formula: diff --git a/libc/src/math/generic/coshf.cpp b/libc/src/math/generic/coshf.cpp index a618056a64dc8..3b01852e9f544 100644 --- a/libc/src/math/generic/coshf.cpp +++ b/libc/src/math/generic/coshf.cpp @@ -32,16 +32,16 @@ LLVM_LIBC_FUNCTION(float, coshf, (float x)) { } if (xbits.is_inf_or_nan()) - return x + FPBits::inf().get_val(); + return x + FPBits::inf(); int rounding = fputil::quick_get_round(); if (LIBC_UNLIKELY(rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)) - return FPBits::max_normal().get_val(); + return FPBits::max_normal(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); - return x + FPBits::inf().get_val(); + return x + FPBits::inf(); } // TODO: We should be able to reduce the latency and reciprocal throughput diff --git 
a/libc/src/math/generic/exp.cpp b/libc/src/math/generic/exp.cpp index 49ea1699bb209..a1b4d9a64f969 100644 --- a/libc/src/math/generic/exp.cpp +++ b/libc/src/math/generic/exp.cpp @@ -205,7 +205,7 @@ double set_exceptional(double x) { return x; if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_subnormal().get_val(); + return FPBits::min_denormal(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0; @@ -216,7 +216,7 @@ double set_exceptional(double x) { if (x_u < 0x7ff0'0000'0000'0000ULL) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal().get_val(); + return FPBits::max_normal(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); diff --git a/libc/src/math/generic/exp10.cpp b/libc/src/math/generic/exp10.cpp index f1da03cba0b30..e441f2c0edc7d 100644 --- a/libc/src/math/generic/exp10.cpp +++ b/libc/src/math/generic/exp10.cpp @@ -248,7 +248,7 @@ double set_exceptional(double x) { return x; if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_subnormal().get_val(); + return FPBits::min_denormal(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0; @@ -262,7 +262,7 @@ double set_exceptional(double x) { if (x_u < 0x7ff0'0000'0000'0000ULL) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal().get_val(); + return FPBits::max_normal(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); diff --git a/libc/src/math/generic/exp10f_impl.h b/libc/src/math/generic/exp10f_impl.h index ff4c1c3aec67c..2861659a6e57e 100644 --- a/libc/src/math/generic/exp10f_impl.h +++ b/libc/src/math/generic/exp10f_impl.h @@ -42,7 +42,7 @@ LIBC_INLINE float exp10f(float x) { if (xbits.is_nan()) return x; if (fputil::fenv_is_round_up()) - return FPBits::min_subnormal().get_val(); + return FPBits::min_denormal(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0f; @@ -53,13 +53,13 @@ LIBC_INLINE float exp10f(float x) { if (x_u < 0x7f80'0000U) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal().get_val(); + return FPBits::max_normal(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); } // x is +inf or nan - return x + FPBits::inf().get_val(); + return x + FPBits::inf(); } } diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp index 508bff9bd9fc9..70bc4870806a9 100644 --- a/libc/src/math/generic/exp2.cpp +++ b/libc/src/math/generic/exp2.cpp @@ -223,7 +223,7 @@ double set_exceptional(double x) { return x; if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_subnormal().get_val(); + return FPBits::min_denormal(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0; @@ -237,7 +237,7 @@ double set_exceptional(double x) { if (x_u < 0x7ff0'0000'0000'0000ULL) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal().get_val(); + return FPBits::max_normal(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); diff --git a/libc/src/math/generic/exp2f_impl.h b/libc/src/math/generic/exp2f_impl.h index d2342e289fcba..86360840b96e6 100644 --- 
--- a/libc/src/math/generic/exp2f_impl.h
+++ b/libc/src/math/generic/exp2f_impl.h
@@ -76,13 +76,13 @@ LIBC_INLINE float exp2f(float x) {
     if (x_u < 0x7f80'0000U) {
       int rounding = fputil::quick_get_round();
       if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
-        return FPBits::max_normal().get_val();
+        return FPBits::max_normal();
 
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_OVERFLOW);
     }
     // x is +inf or nan
-    return x + FPBits::inf().get_val();
+    return x + FPBits::inf();
   }
   // x <= -150
   if (x_u >= 0xc316'0000U) {
@@ -93,7 +93,7 @@
     if (xbits.is_nan())
       return x;
     if (fputil::fenv_is_round_up())
-      return FPBits::min_subnormal().get_val();
+      return FPBits::min_denormal();
     if (x != 0.0f) {
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_UNDERFLOW);
diff --git a/libc/src/math/generic/expf.cpp b/libc/src/math/generic/expf.cpp
index f3ce8400d0a41..88d408994fe42 100644
--- a/libc/src/math/generic/expf.cpp
+++ b/libc/src/math/generic/expf.cpp
@@ -50,7 +50,7 @@ LLVM_LIBC_FUNCTION(float, expf, (float x)) {
       if (xbits.is_nan())
         return x;
       if (fputil::fenv_is_round_up())
-        return FPBits::min_subnormal().get_val();
+        return FPBits::min_denormal();
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_UNDERFLOW);
       return 0.0f;
@@ -61,7 +61,7 @@ LLVM_LIBC_FUNCTION(float, expf, (float x)) {
     if (xbits.uintval() < 0x7f80'0000U) {
       int rounding = fputil::quick_get_round();
       if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
-        return FPBits::max_normal().get_val();
+        return FPBits::max_normal();
 
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_OVERFLOW);
diff --git a/libc/src/math/generic/expm1.cpp b/libc/src/math/generic/expm1.cpp
index c1fb80309d7b4..d9fccf98e8caa 100644
--- a/libc/src/math/generic/expm1.cpp
+++ b/libc/src/math/generic/expm1.cpp
@@ -267,13 +267,13 @@ double set_exceptional(double x) {
   if (x_u < 0x7ff0'0000'0000'0000ULL) {
     int rounding = fputil::quick_get_round();
     if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
-      return FPBits::max_normal().get_val();
+      return FPBits::max_normal();
 
     fputil::set_errno_if_required(ERANGE);
     fputil::raise_except_if_required(FE_OVERFLOW);
   }
 
   // x is +inf or nan
-  return x + FPBits::inf().get_val();
+  return x + static_cast<double>(FPBits::inf());
 }
 
 } // namespace
diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
index 037e60021b296..c6e0663ec46c3 100644
--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@@ -68,12 +68,12 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
       if (xbits.uintval() < 0x7f80'0000U) {
         int rounding = fputil::quick_get_round();
         if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
-          return FPBits::max_normal().get_val();
+          return FPBits::max_normal();
 
         fputil::set_errno_if_required(ERANGE);
         fputil::raise_except_if_required(FE_OVERFLOW);
       }
-      return x + FPBits::inf().get_val();
+      return x + static_cast<float>(FPBits::inf());
     }
   }
diff --git a/libc/src/math/generic/log.cpp b/libc/src/math/generic/log.cpp
index b0f7e8c9afa54..2db6b7f48fd09 100644
--- a/libc/src/math/generic/log.cpp
+++ b/libc/src/math/generic/log.cpp
@@ -732,29 +732,28 @@ double log_accurate(int e_x, int index, double m_x) {
 
 LLVM_LIBC_FUNCTION(double, log, (double x)) {
   using FPBits_t = typename fputil::FPBits<double>;
-  using Sign = fputil::Sign;
 
   FPBits_t xbits(x);
   uint64_t x_u = xbits.uintval();
 
   int x_e = -FPBits_t::EXP_BIAS;
 
-  if (LIBC_UNLIKELY(xbits == FPBits_t::one())) {
+  if (LIBC_UNLIKELY(x_u == 0x3FF0'0000'0000'0000ULL)) {
     // log(1.0) = +0.0
     return 0.0;
   }
 
-  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::min_normal().uintval() ||
-                    xbits.uintval() > FPBits_t::max_normal().uintval())) {
+  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::MIN_NORMAL ||
+                    xbits.uintval() > FPBits_t::MAX_NORMAL)) {
     if (xbits.is_zero()) {
       // return -Inf and raise FE_DIVBYZERO.
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_DIVBYZERO);
-      return FPBits_t::inf(Sign::NEG).get_val();
+      return static_cast<double>(FPBits_t::inf(fputil::Sign::NEG));
     }
     if (xbits.is_neg() && !xbits.is_nan()) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
-      return FPBits_t::build_quiet_nan().get_val();
+      return FPBits_t::build_quiet_nan(0);
     }
     if (xbits.is_inf_or_nan()) {
       return x;
diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp
index 55a3fc5c061e5..3a4d321fdb18c 100644
--- a/libc/src/math/generic/log10.cpp
+++ b/libc/src/math/generic/log10.cpp
@@ -733,29 +733,28 @@ double log10_accurate(int e_x, int index, double m_x) {
 
 LLVM_LIBC_FUNCTION(double, log10, (double x)) {
   using FPBits_t = typename fputil::FPBits<double>;
-  using Sign = fputil::Sign;
 
   FPBits_t xbits(x);
   uint64_t x_u = xbits.uintval();
 
   int x_e = -FPBits_t::EXP_BIAS;
 
-  if (LIBC_UNLIKELY(xbits == FPBits_t::one())) {
+  if (LIBC_UNLIKELY(x_u == 0x3FF0'0000'0000'0000ULL)) {
     // log10(1.0) = +0.0
     return 0.0;
   }
 
-  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::min_normal().uintval() ||
-                    xbits.uintval() > FPBits_t::max_normal().uintval())) {
+  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::MIN_NORMAL ||
+                    xbits.uintval() > FPBits_t::MAX_NORMAL)) {
     if (xbits.is_zero()) {
       // return -Inf and raise FE_DIVBYZERO.
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_DIVBYZERO);
-      return FPBits_t::inf(Sign::NEG).get_val();
+      return static_cast<double>(FPBits_t::inf(fputil::Sign::NEG));
     }
     if (xbits.is_neg() && !xbits.is_nan()) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
-      return FPBits_t::build_quiet_nan().get_val();
+      return FPBits_t::build_quiet_nan(0);
     }
     if (xbits.is_inf_or_nan()) {
       return x;
diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp
index f87e34ec5fb38..46505f4e07e67 100644
--- a/libc/src/math/generic/log10f.cpp
+++ b/libc/src/math/generic/log10f.cpp
@@ -106,7 +106,6 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) {
   constexpr double LOG10_2 = 0x1.34413509f79ffp-2;
 
   using FPBits = typename fputil::FPBits<float>;
-  using Sign = fputil::Sign;
 
   FPBits xbits(x);
   uint32_t x_u = xbits.uintval();
@@ -161,19 +160,18 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) {
 
   int m = -FPBits::EXP_BIAS;
 
-  if (LIBC_UNLIKELY(x_u < FPBits::min_normal().uintval() ||
-                    x_u > FPBits::max_normal().uintval())) {
+  if (LIBC_UNLIKELY(x_u < FPBits::MIN_NORMAL || x_u > FPBits::MAX_NORMAL)) {
     if (xbits.is_zero()) {
       // Return -inf and raise FE_DIVBYZERO
      fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_DIVBYZERO);
-      return FPBits::inf(Sign::NEG).get_val();
+      return static_cast<float>(FPBits::inf(fputil::Sign::NEG));
     }
     if (xbits.is_neg() && !xbits.is_nan()) {
       // Return NaN and raise FE_INVALID
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
-      return FPBits::build_quiet_nan().get_val();
+      return FPBits::build_quiet_nan(0);
     }
     if (xbits.is_inf_or_nan()) {
       return x;
diff --git a/libc/src/math/generic/log1p.cpp b/libc/src/math/generic/log1p.cpp
index ae432620ba328..731fecae6f1b5 100644
--- a/libc/src/math/generic/log1p.cpp
+++ b/libc/src/math/generic/log1p.cpp
@@ -873,7 +873,6 @@ LIBC_INLINE double log1p_accurate(int e_x, int index,
 
 LLVM_LIBC_FUNCTION(double, log1p, (double x)) {
   using FPBits_t = typename fputil::FPBits<double>;
-  using Sign = fputil::Sign;
   constexpr int EXP_BIAS = FPBits_t::EXP_BIAS;
   constexpr int FRACTION_LEN = FPBits_t::FRACTION_LEN;
   constexpr uint64_t FRACTION_MASK = FPBits_t::FRACTION_MASK;
@@ -888,19 +887,19 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) {
   // |x| >= 1
   if (LIBC_UNLIKELY(x_u >= 0x4650'0000'0000'0000ULL)) {
     // x >= 2^102 or x is negative, inf, or NaN
-    if (LIBC_UNLIKELY(x_u > FPBits_t::max_normal().uintval())) {
+    if (LIBC_UNLIKELY(x_u > FPBits_t::MAX_NORMAL)) {
       // x <= -1.0 or x is Inf or NaN
       if (x_u == 0xbff0'0000'0000'0000ULL) {
         // x = -1.0
         fputil::set_errno_if_required(ERANGE);
         fputil::raise_except_if_required(FE_DIVBYZERO);
-        return FPBits_t::inf(Sign::NEG).get_val();
+        return static_cast<double>(FPBits_t::inf(fputil::Sign::NEG));
       }
       if (xbits.is_neg() && !xbits.is_nan()) {
         // x < -1.0
         fputil::set_errno_if_required(EDOM);
         fputil::raise_except_if_required(FE_INVALID);
-        return FPBits_t::build_quiet_nan().get_val();
+        return FPBits_t::build_quiet_nan(0);
       }
       // x is +Inf or NaN
       return x;
diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp
index bc472caf54f89..0812569c624b8 100644
--- a/libc/src/math/generic/log1pf.cpp
+++ b/libc/src/math/generic/log1pf.cpp
@@ -43,11 +43,11 @@ LIBC_INLINE float log(double x) {
 
   uint64_t x_u = xbits.uintval();
 
-  if (LIBC_UNLIKELY(x_u > FPBits::max_normal().uintval())) {
+  if (LIBC_UNLIKELY(x_u > FPBits::MAX_NORMAL)) {
    if (xbits.is_neg() && !xbits.is_nan()) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
-      return fputil::FPBits<float>::build_quiet_nan().get_val();
+      return fputil::FPBits<float>::build_quiet_nan(0);
     }
     return static_cast<float>(x);
   }
diff --git a/libc/src/math/generic/log2.cpp b/libc/src/math/generic/log2.cpp
index 480c690a65037..5b7fb65d77385 100644
--- a/libc/src/math/generic/log2.cpp
+++ b/libc/src/math/generic/log2.cpp
@@ -854,29 +854,28 @@ double log2_accurate(int e_x, int index, double m_x) {
 
 LLVM_LIBC_FUNCTION(double, log2, (double x)) {
   using FPBits_t = typename fputil::FPBits<double>;
-  using Sign = fputil::Sign;
 
   FPBits_t xbits(x);
   uint64_t x_u = xbits.uintval();
 
   int x_e = -FPBits_t::EXP_BIAS;
 
-  if (LIBC_UNLIKELY(xbits == FPBits_t::one())) {
+  if (LIBC_UNLIKELY(x_u == 0x3FF0'0000'0000'0000ULL)) {
     // log2(1.0) = +0.0
     return 0.0;
   }
 
-  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::min_normal().uintval() ||
-                    xbits.uintval() > FPBits_t::max_normal().uintval())) {
+  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::MIN_NORMAL ||
+                    xbits.uintval() > FPBits_t::MAX_NORMAL)) {
     if (xbits.is_zero()) {
       // return -Inf and raise FE_DIVBYZERO.
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_DIVBYZERO);
-      return FPBits_t::inf(Sign::NEG).get_val();
+      return static_cast<double>(FPBits_t::inf(fputil::Sign::NEG));
     }
     if (xbits.is_neg() && !xbits.is_nan()) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
-      return FPBits_t::build_quiet_nan().get_val();
+      return FPBits_t::build_quiet_nan(0);
     }
     if (xbits.is_inf_or_nan()) {
       return x;
diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp
index 7eaa5d53ccedd..9cddbb9e8ea48 100644
--- a/libc/src/math/generic/log2f.cpp
+++ b/libc/src/math/generic/log2f.cpp
@@ -55,7 +55,6 @@ namespace LIBC_NAMESPACE {
 
 LLVM_LIBC_FUNCTION(float, log2f, (float x)) {
   using FPBits = typename fputil::FPBits<float>;
-  using Sign = fputil::Sign;
 
   FPBits xbits(x);
   uint32_t x_u = xbits.uintval();
@@ -69,17 +68,16 @@ LLVM_LIBC_FUNCTION(float, log2f, (float x)) {
     return 0.0f;
 
   // Exceptional inputs.
-  if (LIBC_UNLIKELY(x_u < FPBits::min_normal().uintval() ||
-                    x_u > FPBits::max_normal().uintval())) {
+  if (LIBC_UNLIKELY(x_u < FPBits::MIN_NORMAL || x_u > FPBits::MAX_NORMAL)) {
     if (xbits.is_zero()) {
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_DIVBYZERO);
-      return FPBits::inf(Sign::NEG).get_val();
+      return static_cast<float>(FPBits::inf(fputil::Sign::NEG));
     }
     if (xbits.is_neg() && !xbits.is_nan()) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except(FE_INVALID);
-      return FPBits::build_quiet_nan().get_val();
+      return FPBits::build_quiet_nan(0);
     }
     if (xbits.is_inf_or_nan()) {
       return x;
diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp
index 88f7ea01b2f19..8ccb55dcc9e33 100644
--- a/libc/src/math/generic/logf.cpp
+++ b/libc/src/math/generic/logf.cpp
@@ -54,7 +54,6 @@ namespace LIBC_NAMESPACE {
 LLVM_LIBC_FUNCTION(float, logf, (float x)) {
   constexpr double LOG_2 = 0x1.62e42fefa39efp-1;
   using FPBits = typename fputil::FPBits<float>;
-  using Sign = fputil::Sign;
 
   FPBits xbits(x);
   uint32_t x_u = xbits.uintval();
@@ -80,7 +79,7 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) {
 #endif // LIBC_TARGET_CPU_HAS_FMA
     }
     // Subnormal inputs.
-    if (LIBC_UNLIKELY(x_u < FPBits::min_normal().uintval())) {
+    if (LIBC_UNLIKELY(x_u < FPBits::MIN_NORMAL)) {
       if (x_u == 0) {
         // Return -inf and raise FE_DIVBYZERO
         fputil::set_errno_if_required(ERANGE);
@@ -113,18 +112,18 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) {
 #endif // LIBC_TARGET_CPU_HAS_FMA
     }
     // Exceptional inputs.
-    if (LIBC_UNLIKELY(x_u > FPBits::max_normal().uintval())) {
+    if (LIBC_UNLIKELY(x_u > FPBits::MAX_NORMAL)) {
       if (x_u == 0x8000'0000U) {
         // Return -inf and raise FE_DIVBYZERO
         fputil::set_errno_if_required(ERANGE);
         fputil::raise_except_if_required(FE_DIVBYZERO);
-        return FPBits::inf(Sign::NEG).get_val();
+        return static_cast<float>(FPBits::inf(fputil::Sign::NEG));
       }
       if (xbits.is_neg() && !xbits.is_nan()) {
         // Return NaN and raise FE_INVALID
         fputil::set_errno_if_required(EDOM);
         fputil::raise_except_if_required(FE_INVALID);
-        return FPBits::build_quiet_nan().get_val();
+        return FPBits::build_quiet_nan(0);
       }
       // x is +inf or nan
       return x;
diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp
index 0e164ab8b4225..932f1d70c33e8 100644
--- a/libc/src/math/generic/powf.cpp
+++ b/libc/src/math/generic/powf.cpp
@@ -547,15 +547,14 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
       // pow(+-0, -Inf) = +inf and raise FE_DIVBYZERO
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_DIVBYZERO);
-      return FloatBits::inf().get_val();
+      return FloatBits::inf();
     }
     // pow (|x| < 1, -inf) = +inf
     // pow (|x| < 1, +inf) = 0.0f
     // pow (|x| > 1, -inf) = 0.0f
     // pow (|x| > 1, +inf) = +inf
-    return ((x_abs < 0x3f80'0000) == (y_u == 0xff80'0000))
-               ? FloatBits::inf().get_val()
-               : 0.0f;
+    return ((x_abs < 0x3f80'0000) == (y_u == 0xff80'0000)) ? FloatBits::inf()
+                                                           : 0.0f;
   }
   default:
     // Speed up for common exponents
@@ -618,7 +617,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
         // pow(0, negative number) = inf
         fputil::set_errno_if_required(EDOM);
         fputil::raise_except_if_required(FE_DIVBYZERO);
-        return FloatBits::inf(out_is_neg ? Sign::NEG : Sign::POS).get_val();
+        return FloatBits::inf(out_is_neg ? Sign::NEG : Sign::POS);
       }
       // pow(0, positive number) = 0
       return out_is_neg ? -0.0f : 0.0f;
@@ -629,7 +628,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
       if (y_u >= FloatBits::SIGN_MASK) {
         return out_is_neg ? -0.0f : 0.0f;
       }
-      return FloatBits::inf(out_is_neg ? Sign::NEG : Sign::POS).get_val();
+      return FloatBits::inf(out_is_neg ? Sign::NEG : Sign::POS);
     }
   }
@@ -657,7 +656,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
         // pow( negative, non-integer ) = NaN
         fputil::set_errno_if_required(EDOM);
         fputil::raise_except_if_required(FE_INVALID);
-        return FloatBits::build_quiet_nan().get_val();
+        return FloatBits::build_quiet_nan(0);
       }
     }
   }
diff --git a/libc/src/math/generic/sincosf.cpp b/libc/src/math/generic/sincosf.cpp
index f12b93a0e6965..44371db710871 100644
--- a/libc/src/math/generic/sincosf.cpp
+++ b/libc/src/math/generic/sincosf.cpp
@@ -148,9 +148,7 @@ LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
     }
-    *sinp =
-        x +
-        FPBits::build_nan(fputil::Sign::POS, FPBits::FRACTION_MASK).get_val();
+    *sinp = x + FPBits::build_nan(FPBits::FRACTION_MASK);
     *cosp = *sinp;
     return;
   }
diff --git a/libc/src/math/generic/sinf.cpp b/libc/src/math/generic/sinf.cpp
index 7ba479f0a4598..9e574d4e57240 100644
--- a/libc/src/math/generic/sinf.cpp
+++ b/libc/src/math/generic/sinf.cpp
@@ -139,7 +139,7 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
     }
-    return x + FPBits::build_quiet_nan().get_val();
+    return x + FPBits::build_quiet_nan(0);
   }
 
   // Combine the results with the sine of sum formula:
diff --git a/libc/src/math/generic/sinhf.cpp b/libc/src/math/generic/sinhf.cpp
index 780c9a1f8d6ac..b3850c6742706 100644
--- a/libc/src/math/generic/sinhf.cpp
+++ b/libc/src/math/generic/sinhf.cpp
@@ -56,16 +56,16 @@ LLVM_LIBC_FUNCTION(float, sinhf, (float x)) {
     int rounding = fputil::quick_get_round();
     if (xbits.is_neg()) {
       if (LIBC_UNLIKELY(rounding == FE_UPWARD || rounding == FE_TOWARDZERO))
-        return -FPBits::max_normal().get_val();
+        return -FPBits::max_normal();
     } else {
       if (LIBC_UNLIKELY(rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO))
-        return FPBits::max_normal().get_val();
+        return FPBits::max_normal();
     }
 
     fputil::set_errno_if_required(ERANGE);
     fputil::raise_except_if_required(FE_OVERFLOW);
-    return x + FPBits::inf(xbits.sign()).get_val();
+    return x + FPBits::inf(xbits.sign());
   }
 
   // sinh(x) = (e^x - e^(-x)) / 2.
diff --git a/libc/src/math/generic/tanf.cpp b/libc/src/math/generic/tanf.cpp
index 09dd62eae03f8..7909e9e5d5568 100644
--- a/libc/src/math/generic/tanf.cpp
+++ b/libc/src/math/generic/tanf.cpp
@@ -114,7 +114,7 @@ LLVM_LIBC_FUNCTION(float, tanf, (float x)) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
     }
-    return x + FPBits::build_quiet_nan().get_val();
+    return x + FPBits::build_quiet_nan(0);
   }
   // Other large exceptional values
   if (auto r = TANF_EXCEPTS.lookup_odd(x_abs, x_sign);
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 9880fa51c5481..26b8e3e60bdfd 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -66,16 +66,16 @@ template <typename T> struct FPTest : public Test {
   using Sign = LIBC_NAMESPACE::fputil::Sign;
   static constexpr StorageType STORAGE_MAX =
       LIBC_NAMESPACE::cpp::numeric_limits<StorageType>::max();
-  static constexpr T zero = T(FPBits::zero(Sign::POS));
-  static constexpr T neg_zero = T(FPBits::zero(Sign::NEG));
-  static constexpr T aNaN = T(FPBits::build_quiet_nan(Sign::POS, 1));
-  static constexpr T sNaN = T(FPBits::build_nan(Sign::POS, 1));
-  static constexpr T inf = T(FPBits::inf(Sign::POS));
-  static constexpr T neg_inf = T(FPBits::inf(Sign::NEG));
-  static constexpr T min_normal = T(FPBits::min_normal());
-  static constexpr T max_normal = T(FPBits::max_normal());
-  static constexpr T min_denormal = T(FPBits::min_subnormal());
-  static constexpr T max_denormal = T(FPBits::max_subnormal());
+  static constexpr T zero = FPBits::zero(Sign::POS);
+  static constexpr T neg_zero = FPBits::zero(Sign::NEG);
+  static constexpr T aNaN = FPBits::build_quiet_nan(1);
+  static constexpr T sNaN = FPBits::build_nan(1);
+  static constexpr T inf = FPBits::inf(Sign::POS);
+  static constexpr T neg_inf = FPBits::inf(Sign::NEG);
+  static constexpr T min_normal = FPBits::min_normal();
+  static constexpr T max_normal = FPBits::max_normal();
+  static constexpr T min_denormal = FPBits::min_denormal();
+  static constexpr T max_denormal = FPBits::max_denormal();
 
   static constexpr int N_ROUNDING_MODES = 4;
   static constexpr fputil::testing::RoundingMode ROUNDING_MODES[4] = {
@@ -95,16 +95,16 @@ template <typename T> struct FPTest : public Test {
   using Sign = LIBC_NAMESPACE::fputil::Sign;                                  \
   static constexpr StorageType STORAGE_MAX =                                  \
       LIBC_NAMESPACE::cpp::numeric_limits<StorageType>::max();                \
-  const T zero = T(FPBits::zero(Sign::POS));                                  \
-  const T neg_zero = T(FPBits::zero(Sign::NEG));                              \
-  const T aNaN = T(FPBits::build_quiet_nan(Sign::POS, 1));                    \
-  const T sNaN = T(FPBits::build_nan(Sign::POS, 1));                          \
-  const T inf = T(FPBits::inf(Sign::POS));                                    \
-  const T neg_inf = T(FPBits::inf(Sign::NEG));                                \
-  const T min_normal = T(FPBits::min_normal());                               \
-  const T max_normal = T(FPBits::max_normal());                               \
-  const T min_denormal = T(FPBits::min_subnormal());                          \
-  const T max_denormal = T(FPBits::max_subnormal());
+  const T zero = FPBits::zero(Sign::POS);                                     \
+  const T neg_zero = FPBits::zero(Sign::NEG);                                 \
+  const T aNaN = FPBits::build_quiet_nan(1);                                  \
+  const T sNaN = FPBits::build_nan(1);                                        \
+  const T inf = FPBits::inf(Sign::POS);                                       \
+  const T neg_inf = FPBits::inf(Sign::NEG);                                   \
+  const T min_normal = FPBits::min_normal();                                  \
+  const T max_normal = FPBits::max_normal();                                  \
+  const T min_denormal = FPBits::min_denormal();                              \
+  const T max_denormal = FPBits::max_denormal();
 
 #define EXPECT_FP_EQ(expected, actual)                                        \
   EXPECT_THAT(actual, LIBC_NAMESPACE::testing::getMatcher<                    \
diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp
index db9f6750bedef..e6b6d7d9ec780 100644
--- a/libc/test/src/__support/FPUtil/fpbits_test.cpp
+++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp
@@ -232,11 +232,13 @@ TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80_IsNan) {
 TEST(LlvmLibcFPBitsTest, FloatType) {
   using FloatBits = FPBits<float>;
 
-  EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits::inf(Sign::POS)).c_str(),
-               "(+Infinity)");
-  EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits::inf(Sign::NEG)).c_str(),
-               "(-Infinity)");
-  EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits::build_nan(Sign::POS, 1)).c_str(),
+  EXPECT_STREQ(
+      LIBC_NAMESPACE::str(FloatBits(FloatBits::inf(Sign::POS))).c_str(),
+      "(+Infinity)");
+  EXPECT_STREQ(
+      LIBC_NAMESPACE::str(FloatBits(FloatBits::inf(Sign::NEG))).c_str(),
+      "(-Infinity)");
+  EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits(FloatBits::build_nan(1))).c_str(),
                "(NaN)");
 
   FloatBits zero(0.0f);
@@ -287,19 +289,22 @@ TEST(LlvmLibcFPBitsTest, FloatType) {
   EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(),
                "0xBF900000 = (S: 1, E: 0x007F, M: 0x00100000)");
 
-  FloatBits quiet_nan = FloatBits::build_quiet_nan(Sign::POS, 1);
+  FloatBits quiet_nan = FloatBits(FloatBits::build_quiet_nan(1));
   EXPECT_EQ(quiet_nan.is_quiet_nan(), true);
 }
 
 TEST(LlvmLibcFPBitsTest, DoubleType) {
   using DoubleBits = FPBits<double>;
 
-  EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::inf(Sign::POS)).c_str(),
-               "(+Infinity)");
-  EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::inf(Sign::NEG)).c_str(),
-               "(-Infinity)");
-  EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::build_nan(Sign::POS, 1)).c_str(),
-               "(NaN)");
+  EXPECT_STREQ(
+      LIBC_NAMESPACE::str(DoubleBits(DoubleBits::inf(Sign::POS))).c_str(),
+      "(+Infinity)");
+  EXPECT_STREQ(
+      LIBC_NAMESPACE::str(DoubleBits(DoubleBits::inf(Sign::NEG))).c_str(),
+      "(-Infinity)");
+  EXPECT_STREQ(
+      LIBC_NAMESPACE::str(DoubleBits(DoubleBits::build_nan(1))).c_str(),
+      "(NaN)");
 
   DoubleBits zero(0.0);
   EXPECT_TRUE(zero.is_pos());
@@ -349,7 +354,7 @@ TEST(LlvmLibcFPBitsTest, DoubleType) {
   EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(),
                "0xBFF2000000000000 = (S: 1, E: 0x03FF, M: 0x0002000000000000)");
 
-  DoubleBits quiet_nan = DoubleBits::build_quiet_nan(Sign::POS, 1);
+  DoubleBits quiet_nan = DoubleBits(DoubleBits::build_quiet_nan(1));
   EXPECT_EQ(quiet_nan.is_quiet_nan(), true);
 }
 
@@ -360,12 +365,16 @@ TEST(LlvmLibcFPBitsTest, X86LongDoubleType) {
   if constexpr (sizeof(long double) == sizeof(double))
     return; // The tests for the "double" type cover for this case.
 
-  EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::POS)).c_str(),
-               "(+Infinity)");
-  EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::NEG)).c_str(),
-               "(-Infinity)");
   EXPECT_STREQ(
-      LIBC_NAMESPACE::str(LongDoubleBits::build_nan(Sign::POS, 1)).c_str(),
+      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::POS)))
+          .c_str(),
+      "(+Infinity)");
+  EXPECT_STREQ(
+      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::NEG)))
+          .c_str(),
+      "(-Infinity)");
+  EXPECT_STREQ(
+      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::build_nan(1))).c_str(),
       "(NaN)");
 
   LongDoubleBits zero(0.0l);
@@ -431,7 +440,7 @@ TEST(LlvmLibcFPBitsTest, X86LongDoubleType) {
                "0x000000000000BFFF9000000000000000 = "
                "(S: 1, E: 0x3FFF, I: 1, M: 0x00000000000000001000000000000000)");
 
-  LongDoubleBits quiet_nan = LongDoubleBits::build_quiet_nan(Sign::POS, 1);
+  LongDoubleBits quiet_nan = LongDoubleBits(LongDoubleBits::build_quiet_nan(1));
   EXPECT_EQ(quiet_nan.is_quiet_nan(), true);
 }
 #else
@@ -441,12 +450,16 @@ TEST(LlvmLibcFPBitsTest, LongDoubleType) {
 #else
   using LongDoubleBits = FPBits<long double>;
 
-  EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::POS)).c_str(),
-               "(+Infinity)");
-  EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::NEG)).c_str(),
-               "(-Infinity)");
   EXPECT_STREQ(
-      LIBC_NAMESPACE::str(LongDoubleBits::build_nan(Sign::POS, 1)).c_str(),
+      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::POS)))
+          .c_str(),
+      "(+Infinity)");
+  EXPECT_STREQ(
+      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::NEG)))
+          .c_str(),
+      "(-Infinity)");
+  EXPECT_STREQ(
+      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::build_nan(1))).c_str(),
       "(NaN)");
 
   LongDoubleBits zero(0.0l);
@@ -506,7 +519,7 @@ TEST(LlvmLibcFPBitsTest, LongDoubleType) {
                "0xBFFF2000000000000000000000000000 = "
                "(S: 1, E: 0x3FFF, M: 0x00002000000000000000000000000000)");
 
-  LongDoubleBits quiet_nan = LongDoubleBits::build_quiet_nan(1);
+  LongDoubleBits quiet_nan = LongDoubleBits(LongDoubleBits::build_quiet_nan(1));
   EXPECT_EQ(quiet_nan.is_quiet_nan(), true);
 #endif
 }
@@ -516,15 +529,17 @@ TEST(LlvmLibcFPBitsTest, Float128Type) {
   using Float128Bits = FPBits<float128>;
 
-  EXPECT_STREQ(LIBC_NAMESPACE::str(Float128Bits::inf(Sign::POS)).c_str(),
-               "(+Infinity)");
-  EXPECT_STREQ(LIBC_NAMESPACE::str(Float128Bits::inf(Sign::NEG)).c_str(),
-               "(-Infinity)");
   EXPECT_STREQ(
-      LIBC_NAMESPACE::str(Float128Bits::build_nan(Sign::POS, 1)).c_str(),
+      LIBC_NAMESPACE::str(Float128Bits(Float128Bits::inf(Sign::POS))).c_str(),
+      "(+Infinity)");
+  EXPECT_STREQ(
+      LIBC_NAMESPACE::str(Float128Bits(Float128Bits::inf(Sign::NEG))).c_str(),
+      "(-Infinity)");
+  EXPECT_STREQ(
+      LIBC_NAMESPACE::str(Float128Bits(Float128Bits::build_nan(1))).c_str(),
       "(NaN)");
 
-  Float128Bits zero = Float128Bits::zero(Sign::POS);
+  Float128Bits zero(Float128Bits::zero());
   EXPECT_TRUE(zero.is_pos());
   EXPECT_EQ(zero.get_biased_exponent(), static_cast<uint16_t>(0x0000));
   EXPECT_EQ(zero.get_mantissa(), static_cast<UInt128>(0x0000000000000000)
@@ -534,7 +549,7 @@ TEST(LlvmLibcFPBitsTest, Float128Type) {
                "0x00000000000000000000000000000000 = "
                "(S: 0, E: 0x0000, M: 0x00000000000000000000000000000000)");
 
-  Float128Bits negzero = Float128Bits::zero(Sign::NEG);
+  Float128Bits negzero(Float128Bits::zero(Sign::NEG));
   EXPECT_TRUE(negzero.is_neg());
   EXPECT_EQ(negzero.get_biased_exponent(), static_cast<uint16_t>(0x0000));
   EXPECT_EQ(negzero.get_mantissa(), static_cast<UInt128>(0x0000000000000000)
@@ -581,7 +596,7 @@
"0xBFFF2000000000000000000000000000 = " "(S: 1, E: 0x3FFF, M: 0x00002000000000000000000000000000)"); - Float128Bits quiet_nan = Float128Bits::build_quiet_nan(Sign::POS, 1); + Float128Bits quiet_nan = Float128Bits(Float128Bits::build_quiet_nan(1)); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } #endif // LIBC_COMPILER_HAS_FLOAT128 diff --git a/libc/test/src/math/FDimTest.h b/libc/test/src/math/FDimTest.h index c3d9cb1801cd4..0744e6ea8fd8f 100644 --- a/libc/test/src/math/FDimTest.h +++ b/libc/test/src/math/FDimTest.h @@ -24,7 +24,7 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan(1)); void test_na_n_arg(FuncPtr func) { EXPECT_FP_EQ(nan, func(nan, inf)); diff --git a/libc/test/src/math/FmaTest.h b/libc/test/src/math/FmaTest.h index 4343b38053dc4..032a79821d590 100644 --- a/libc/test/src/math/FmaTest.h +++ b/libc/test/src/math/FmaTest.h @@ -25,21 +25,11 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T min_subnormal = T(FPBits::min_subnormal(Sign::POS)); - const T min_normal = T(FPBits::min_normal(Sign::POS)); - const T max_normal = T(FPBits::max_normal(Sign::POS)); const T inf = T(FPBits::inf(Sign::POS)); const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); - - static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); - static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); - static constexpr StorageType MAX_SUBNORMAL = - FPBits::max_subnormal().uintval(); - static constexpr StorageType MIN_SUBNORMAL = - FPBits::min_subnormal().uintval(); + const T nan = T(FPBits::build_quiet_nan(1)); StorageType get_random_bit_pattern() { StorageType bits{0}; @@ -62,13 +52,14 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { EXPECT_FP_EQ(func(inf, neg_inf, nan), nan); // Test underflow rounding up. - EXPECT_FP_EQ(func(T(0.5), min_subnormal, min_subnormal), + EXPECT_FP_EQ(func(T(0.5), FPBits::min_denormal(), FPBits::min_denormal()), T(FPBits(StorageType(2)))); // Test underflow rounding down. - T v = T(FPBits(MIN_NORMAL + StorageType(1))); - EXPECT_FP_EQ(func(T(1) / T(MIN_NORMAL << 1), v, min_normal), v); + T v = T(FPBits(FPBits::MIN_NORMAL + StorageType(1))); + EXPECT_FP_EQ( + func(T(1) / T(FPBits::MIN_NORMAL << 1), v, FPBits::min_normal()), v); // Test overflow. - T z = max_normal; + T z = FPBits::max_normal(); EXPECT_FP_EQ(func(T(1.75), z, -z), T(0.75) * z); // Exact cancellation. 
     EXPECT_FP_EQ(func(T(3.0), T(5.0), -T(15.0)), T(0.0));
@@ -77,9 +68,11 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void test_subnormal_range(Func func) {
     constexpr StorageType COUNT = 100'001;
-    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
-    for (StorageType v = MIN_SUBNORMAL, w = MAX_SUBNORMAL;
-         v <= MAX_SUBNORMAL && w >= MIN_SUBNORMAL; v += STEP, w -= STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
+    for (StorageType v = FPBits::MIN_SUBNORMAL, w = FPBits::MAX_SUBNORMAL;
+         v <= FPBits::MAX_SUBNORMAL && w >= FPBits::MIN_SUBNORMAL;
+         v += STEP, w -= STEP) {
       T x = T(FPBits(get_random_bit_pattern())), y = T(FPBits(v)),
         z = T(FPBits(w));
       mpfr::TernaryInput<T> input{x, y, z};
@@ -90,9 +83,11 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void test_normal_range(Func func) {
     constexpr StorageType COUNT = 100'001;
-    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
-    for (StorageType v = MIN_NORMAL, w = MAX_NORMAL;
-         v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT;
+    for (StorageType v = FPBits::MIN_NORMAL, w = FPBits::MAX_NORMAL;
+         v <= FPBits::MAX_NORMAL && w >= FPBits::MIN_NORMAL;
+         v += STEP, w -= STEP) {
       T x = T(FPBits(v)), y = T(FPBits(w)),
         z = T(FPBits(get_random_bit_pattern()));
       mpfr::TernaryInput<T> input{x, y, z};
diff --git a/libc/test/src/math/HypotTest.h b/libc/test/src/math/HypotTest.h
index 0b85f68fda82a..b7eb63c192c70 100644
--- a/libc/test/src/math/HypotTest.h
+++ b/libc/test/src/math/HypotTest.h
@@ -25,22 +25,15 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
   using Sign = LIBC_NAMESPACE::fputil::Sign;
   using StorageType = typename FPBits::StorageType;
-  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
-  const T inf = T(FPBits::inf());
-  const T neg_inf = T(FPBits::inf(Sign::NEG));
-  const T zero = T(FPBits::zero());
-  const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T max_normal = T(FPBits::max_normal());
-  const T min_normal = T(FPBits::min_normal());
-  const T max_subnormal = T(FPBits::max_subnormal());
-  const T min_subnormal = T(FPBits::min_subnormal());
-
-  static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
-  static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
-  static constexpr StorageType MAX_SUBNORMAL =
-      FPBits::max_subnormal().uintval();
-  static constexpr StorageType MIN_SUBNORMAL =
-      FPBits::min_subnormal().uintval();
+  const T nan = FPBits::build_quiet_nan(1);
+  const T inf = FPBits::inf();
+  const T neg_inf = FPBits::inf(Sign::NEG);
+  const T zero = FPBits::zero();
+  const T neg_zero = FPBits::zero(Sign::NEG);
+  const T max_normal = FPBits::max_normal();
+  const T min_normal = FPBits::min_normal();
+  const T max_subnormal = FPBits::max_denormal();
+  const T min_subnormal = FPBits::min_denormal();
 
 public:
   void test_special_numbers(Func func) {
@@ -69,11 +62,12 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void test_subnormal_range(Func func) {
     constexpr StorageType COUNT = 10'001;
     for (unsigned scale = 0; scale < 4; ++scale) {
-      StorageType max_value = MAX_SUBNORMAL << scale;
-      StorageType step = (max_value - MIN_SUBNORMAL) / COUNT;
+      StorageType max_value = FPBits::MAX_SUBNORMAL << scale;
+      StorageType step = (max_value - FPBits::MIN_SUBNORMAL) / COUNT;
       for (int signs = 0; signs < 4; ++signs) {
-        for (StorageType v = MIN_SUBNORMAL, w = max_value;
-             v <= max_value && w >= MIN_SUBNORMAL; v += step, w -= step) {
+        for (StorageType v = FPBits::MIN_SUBNORMAL, w = max_value;
+             v <= max_value && w >= FPBits::MIN_SUBNORMAL;
+             v += step, w -= step) {
           T x = T(FPBits(v)), y = T(FPBits(w));
           if (signs % 2 == 1) {
             x = -x;
@@ -92,10 +86,13 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void test_normal_range(Func func) {
     constexpr StorageType COUNT = 10'001;
-    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
+    constexpr StorageType STEP =
+        (StorageType(FPBits::MAX_NORMAL) - StorageType(FPBits::MIN_NORMAL)) /
+        COUNT;
     for (int signs = 0; signs < 4; ++signs) {
-      for (StorageType v = MIN_NORMAL, w = MAX_NORMAL;
-           v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) {
+      for (StorageType v = FPBits::MIN_NORMAL, w = FPBits::MAX_NORMAL;
+           v <= FPBits::MAX_NORMAL && w >= FPBits::MIN_NORMAL;
+           v += STEP, w -= STEP) {
         T x = T(FPBits(v)), y = T(FPBits(w));
         if (signs % 2 == 1) {
           x = -x;
diff --git a/libc/test/src/math/ILogbTest.h b/libc/test/src/math/ILogbTest.h
index 223de789999af..9fa25c9ff9861 100644
--- a/libc/test/src/math/ILogbTest.h
+++ b/libc/test/src/math/ILogbTest.h
@@ -26,10 +26,10 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
   void test_special_numbers(typename ILogbFunc<T>::Func func) {
     using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
     using Sign = LIBC_NAMESPACE::fputil::Sign;
-    EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::POS))));
+    EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero())));
     EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::NEG))));
-    EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(Sign::POS, 1))));
-    EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::POS))));
+    EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(1))));
+    EXPECT_EQ(INT_MAX, func(T(FPBits::inf())));
     EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::NEG))));
   }
@@ -76,11 +76,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
   void test_subnormal_range(typename ILogbFunc<T>::Func func) {
     using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
     using StorageType = typename FPBits::StorageType;
-    constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval();
-    constexpr StorageType MAX_SUBNORMAL = FPBits::max_subnormal().uintval();
     constexpr StorageType COUNT = 10'001;
-    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
-    for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
+    for (StorageType v = FPBits::MIN_SUBNORMAL; v <= FPBits::MAX_SUBNORMAL;
+         v += STEP) {
       T x = T(FPBits(v));
       if (isnan(x) || isinf(x) || x == 0.0)
         continue;
@@ -95,11 +95,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
   void test_normal_range(typename ILogbFunc<T>::Func func) {
     using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
     using StorageType = typename FPBits::StorageType;
-    constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
-    constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
     constexpr StorageType COUNT = 10'001;
-    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
-    for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT;
+    for (StorageType v = FPBits::MIN_NORMAL; v <= FPBits::MAX_NORMAL;
+         v += STEP) {
       T x = T(FPBits(v));
       if (isnan(x) || isinf(x) || x == 0.0)
         continue;
diff --git a/libc/test/src/math/LdExpTest.h b/libc/test/src/math/LdExpTest.h
index 3a4baabbf10e6..25120ba3646fd 100644
--- a/libc/test/src/math/LdExpTest.h
+++ b/libc/test/src/math/LdExpTest.h
@@ -29,7 +29,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
+  const T nan = T(FPBits::build_quiet_nan(1));
 
   // A normalized mantissa to be used with tests.
   static constexpr StorageType MANTISSA = NormalFloat::ONE + 0x1234;
diff --git a/libc/test/src/math/NextAfterTest.h b/libc/test/src/math/NextAfterTest.h
index 9ff3bf73d2ede..aa9646fd921f8 100644
--- a/libc/test/src/math/NextAfterTest.h
+++ b/libc/test/src/math/NextAfterTest.h
@@ -27,12 +27,12 @@ class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
+  const T nan = T(FPBits::build_quiet_nan(1));
 
-  const StorageType min_subnormal = FPBits::min_subnormal().uintval();
-  const StorageType max_subnormal = FPBits::max_subnormal().uintval();
-  const StorageType min_normal = FPBits::min_normal().uintval();
-  const StorageType max_normal = FPBits::max_normal().uintval();
+  const StorageType min_subnormal = FPBits::MIN_SUBNORMAL;
+  const StorageType max_subnormal = FPBits::MAX_SUBNORMAL;
+  const StorageType min_normal = FPBits::MIN_NORMAL;
+  const StorageType max_normal = FPBits::MAX_NORMAL;
 
 public:
   typedef T (*NextAfterFunc)(T, T);
diff --git a/libc/test/src/math/RIntTest.h b/libc/test/src/math/RIntTest.h
index b478e3f65dbc8..6816e94d389f4 100644
--- a/libc/test/src/math/RIntTest.h
+++ b/libc/test/src/math/RIntTest.h
@@ -38,14 +38,7 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
-
-  static constexpr StorageType MIN_SUBNORMAL =
-      FPBits::min_subnormal().uintval();
-  static constexpr StorageType MAX_SUBNORMAL =
-      FPBits::max_subnormal().uintval();
-  static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
-  static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
+  const T nan = T(FPBits::build_quiet_nan(1));
 
   static inline mpfr::RoundingMode to_mpfr_rounding_mode(int mode) {
     switch (mode) {
@@ -102,8 +95,10 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void testSubnormalRange(RIntFunc func) {
     constexpr StorageType COUNT = 100'001;
-    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
-    for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
+    for (StorageType i = FPBits::MIN_SUBNORMAL; i <= FPBits::MAX_SUBNORMAL;
+         i += STEP) {
       T x = T(FPBits(i));
       for (int mode : ROUNDING_MODES) {
         LIBC_NAMESPACE::fputil::set_round(mode);
@@ -115,8 +110,10 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void testNormalRange(RIntFunc func) {
     constexpr StorageType COUNT = 100'001;
-    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
-    for (StorageType i = MIN_NORMAL; i <= MAX_NORMAL; i += STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT;
+    for (StorageType i = FPBits::MIN_NORMAL; i <= FPBits::MAX_NORMAL;
+         i += STEP) {
       T x = T(FPBits(i));
       // In normal range on x86 platforms, the long double implicit 1 bit can be
       // zero making the numbers NaN. We will skip them.
diff --git a/libc/test/src/math/RemQuoTest.h b/libc/test/src/math/RemQuoTest.h
index 0ee41f4bf9acf..9ed9525662452 100644
--- a/libc/test/src/math/RemQuoTest.h
+++ b/libc/test/src/math/RemQuoTest.h
@@ -28,14 +28,7 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
-
-  static constexpr StorageType MIN_SUBNORMAL =
-      FPBits::min_subnormal().uintval();
-  static constexpr StorageType MAX_SUBNORMAL =
-      FPBits::max_subnormal().uintval();
-  static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
-  static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
+  const T nan = T(FPBits::build_quiet_nan(1));
 
 public:
   typedef T (*RemQuoFunc)(T, T, int *);
@@ -104,9 +97,11 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void testSubnormalRange(RemQuoFunc func) {
     constexpr StorageType COUNT = 100'001;
-    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
-    for (StorageType v = MIN_SUBNORMAL, w = MAX_SUBNORMAL;
-         v <= MAX_SUBNORMAL && w >= MIN_SUBNORMAL; v += STEP, w -= STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
+    for (StorageType v = FPBits::MIN_SUBNORMAL, w = FPBits::MAX_SUBNORMAL;
+         v <= FPBits::MAX_SUBNORMAL && w >= FPBits::MIN_SUBNORMAL;
+         v += STEP, w -= STEP) {
       T x = T(FPBits(v)), y = T(FPBits(w));
       mpfr::BinaryOutput<T> result;
       mpfr::BinaryInput<T> input{x, y};
@@ -117,9 +112,11 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void testNormalRange(RemQuoFunc func) {
     constexpr StorageType COUNT = 1'001;
-    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
-    for (StorageType v = MIN_NORMAL, w = MAX_NORMAL;
-         v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT;
+    for (StorageType v = FPBits::MIN_NORMAL, w = FPBits::MAX_NORMAL;
+         v <= FPBits::MAX_NORMAL && w >= FPBits::MIN_NORMAL;
+         v += STEP, w -= STEP) {
       T x = T(FPBits(v)), y = T(FPBits(w));
       mpfr::BinaryOutput<T> result;
       mpfr::BinaryInput<T> input{x, y};
diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h
index 6866c23cb99ca..e8ada1b4c36c5 100644
--- a/libc/test/src/math/RoundToIntegerTest.h
+++ b/libc/test/src/math/RoundToIntegerTest.h
@@ -37,15 +37,7 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const F neg_zero = F(FPBits::zero(Sign::NEG));
   const F inf = F(FPBits::inf());
   const F neg_inf = F(FPBits::inf(Sign::NEG));
-  const F nan = F(FPBits::build_quiet_nan(Sign::POS, 1));
-
-  static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
-  static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
-  static constexpr StorageType MAX_SUBNORMAL =
-      FPBits::max_subnormal().uintval();
-  static constexpr StorageType MIN_SUBNORMAL =
-      FPBits::min_subnormal().uintval();
-
+  const F nan = F(FPBits::build_quiet_nan(1));
   static constexpr I INTEGER_MIN = I(1) << (sizeof(I) * 8 - 1);
   static constexpr I INTEGER_MAX = -(INTEGER_MIN + 1);
 
@@ -223,8 +215,10 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test {
 
   void testSubnormalRange(RoundToIntegerFunc func) {
     constexpr StorageType COUNT = 1'000'001;
-    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
-    for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
+    for (StorageType i = FPBits::MIN_SUBNORMAL; i <= FPBits::MAX_SUBNORMAL;
+         i += STEP) {
       F x = F(FPBits(i));
       if (x == F(0.0))
         continue;
@@ -265,8 +259,10 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test {
       return;
 
     constexpr StorageType COUNT = 1'000'001;
-    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
-    for (StorageType i = MIN_NORMAL; i <= MAX_NORMAL; i += STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT;
+    for (StorageType i = FPBits::MIN_NORMAL; i <= FPBits::MAX_NORMAL;
+         i += STEP) {
      F x = F(FPBits(i));
      // In normal range on x86 platforms, the long double implicit 1 bit can be
      // zero making the numbers NaN. We will skip them.
diff --git a/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h b/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h
index 48572e78e5153..ada7c9e495426 100644
--- a/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h
+++ b/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h
@@ -109,13 +109,10 @@ template <typename T> class BinaryOpSingleOutputDiff {
     std::ofstream log(logFile);
     log << " Performance tests with inputs in denormal range:\n";
     run_perf_in_range(myFunc, otherFunc, /* startingBit= */ StorageType(0),
-                      /* endingBit= */ FPBits::max_subnormal().uintval(),
-                      1'000'001, log);
+                      /* endingBit= */ FPBits::MAX_SUBNORMAL, 1'000'001, log);
     log << "\n Performance tests with inputs in normal range:\n";
-    run_perf_in_range(myFunc, otherFunc,
-                      /* startingBit= */ FPBits::min_normal().uintval(),
-                      /* endingBit= */ FPBits::max_normal().uintval(),
-                      100'000'001, log);
+    run_perf_in_range(myFunc, otherFunc, /* startingBit= */ FPBits::MIN_NORMAL,
+                      /* endingBit= */ FPBits::MAX_NORMAL, 100'000'001, log);
     log << "\n Performance tests with inputs in normal range with exponents "
            "close to each other:\n";
     run_perf_in_range(
@@ -129,12 +126,11 @@ template <typename T> class BinaryOpSingleOutputDiff {
     log << " Diff tests with inputs in denormal range:\n";
     diffCount += run_diff_in_range(
         myFunc, otherFunc, /* startingBit= */ StorageType(0),
-        /* endingBit= */ FPBits::max_subnormal().uintval(), 1'000'001, log);
+        /* endingBit= */ FPBits::MAX_SUBNORMAL, 1'000'001, log);
     log << "\n Diff tests with inputs in normal range:\n";
     diffCount += run_diff_in_range(
-        myFunc, otherFunc,
-        /* startingBit= */ FPBits::min_normal().uintval(),
-        /* endingBit= */ FPBits::max_normal().uintval(), 100'000'001, log);
+        myFunc, otherFunc, /* startingBit= */ FPBits::MIN_NORMAL,
+        /* endingBit= */ FPBits::MAX_NORMAL, 100'000'001, log);
     log << "\n Diff tests with inputs in normal range with exponents "
            "close to each other:\n";
     diffCount += run_diff_in_range(
diff --git a/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h b/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h
index 5e8310e889dc6..e4cd06eb22b71 100644
--- a/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h
+++ b/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h
@@ -93,11 +93,10 @@ template <typename T> class SingleInputSingleOutputDiff {
     std::ofstream log(logFile);
denormal range:\n"; runPerfInRange(myFunc, otherFunc, /* startingBit= */ StorageType(0), - /* endingBit= */ FPBits::max_subnormal().uintval(), log); + /* endingBit= */ FPBits::MAX_SUBNORMAL, log); log << "\n Performance tests with inputs in normal range:\n"; - runPerfInRange(myFunc, otherFunc, - /* startingBit= */ FPBits::min_normal().uintval(), - /* endingBit= */ FPBits::max_normal().uintval(), log); + runPerfInRange(myFunc, otherFunc, /* startingBit= */ FPBits::MIN_NORMAL, + /* endingBit= */ FPBits::MAX_NORMAL, log); } }; diff --git a/libc/test/src/math/smoke/FDimTest.h b/libc/test/src/math/smoke/FDimTest.h index c3d9cb1801cd4..0744e6ea8fd8f 100644 --- a/libc/test/src/math/smoke/FDimTest.h +++ b/libc/test/src/math/smoke/FDimTest.h @@ -24,7 +24,7 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan(1)); void test_na_n_arg(FuncPtr func) { EXPECT_FP_EQ(nan, func(nan, inf)); diff --git a/libc/test/src/math/smoke/FmaTest.h b/libc/test/src/math/smoke/FmaTest.h index 337ce659a23e1..dc624d871b990 100644 --- a/libc/test/src/math/smoke/FmaTest.h +++ b/libc/test/src/math/smoke/FmaTest.h @@ -25,7 +25,7 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan(1)); public: void test_special_numbers(Func func) { @@ -39,16 +39,14 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { EXPECT_FP_EQ(func(inf, neg_inf, nan), nan); // Test underflow rounding up. - EXPECT_FP_EQ( - func(T(0.5), T(FPBits::min_subnormal()), T(FPBits::min_subnormal())), - T(FPBits(StorageType(2)))); + EXPECT_FP_EQ(func(T(0.5), FPBits::min_denormal(), FPBits::min_denormal()), + T(FPBits(StorageType(2)))); // Test underflow rounding down. - StorageType MIN_NORMAL = FPBits::min_normal().uintval(); - T v = T(FPBits(MIN_NORMAL + StorageType(1))); - EXPECT_FP_EQ(func(T(1) / T(MIN_NORMAL << 1), v, T(FPBits::min_normal())), - v); + T v = T(FPBits(FPBits::MIN_NORMAL + StorageType(1))); + EXPECT_FP_EQ( + func(T(1) / T(FPBits::MIN_NORMAL << 1), v, FPBits::min_normal()), v); // Test overflow. - T z = T(FPBits::max_normal()); + T z = FPBits::max_normal(); EXPECT_FP_EQ(func(T(1.75), z, -z), T(0.75) * z); // Exact cancellation. 
     EXPECT_FP_EQ(func(T(3.0), T(5.0), -T(15.0)), T(0.0));
diff --git a/libc/test/src/math/smoke/HypotTest.h b/libc/test/src/math/smoke/HypotTest.h
index 67110536b9623..77454c19a6538 100644
--- a/libc/test/src/math/smoke/HypotTest.h
+++ b/libc/test/src/math/smoke/HypotTest.h
@@ -22,16 +22,16 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
   using StorageType = typename FPBits::StorageType;
   using Sign = LIBC_NAMESPACE::fputil::Sign;
-  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
-  const T inf = T(FPBits::inf(Sign::POS));
-  const T neg_inf = T(FPBits::inf(Sign::NEG));
-  const T zero = T(FPBits::zero(Sign::POS));
-  const T neg_zero = T(FPBits::zero(Sign::NEG));
-
-  const T max_normal = T(FPBits::max_normal());
-  const T min_normal = T(FPBits::min_normal());
-  const T max_subnormal = T(FPBits::max_subnormal());
-  const T min_subnormal = T(FPBits::min_subnormal());
+  const T nan = FPBits::build_quiet_nan(1);
+  const T inf = FPBits::inf(Sign::POS);
+  const T neg_inf = FPBits::inf(Sign::NEG);
+  const T zero = FPBits::zero(Sign::POS);
+  const T neg_zero = FPBits::zero(Sign::NEG);
+
+  const T max_normal = FPBits::max_normal();
+  const T min_normal = FPBits::min_normal();
+  const T max_subnormal = FPBits::max_denormal();
+  const T min_subnormal = FPBits::min_denormal();
 
 public:
   void test_special_numbers(Func func) {
diff --git a/libc/test/src/math/smoke/ILogbTest.h b/libc/test/src/math/smoke/ILogbTest.h
index 223de789999af..0a50abc04f727 100644
--- a/libc/test/src/math/smoke/ILogbTest.h
+++ b/libc/test/src/math/smoke/ILogbTest.h
@@ -28,7 +28,7 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
     using Sign = LIBC_NAMESPACE::fputil::Sign;
     EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::POS))));
     EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::NEG))));
-    EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(Sign::POS, 1))));
+    EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(1))));
     EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::POS))));
     EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::NEG))));
   }
@@ -76,11 +76,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
   void test_subnormal_range(typename ILogbFunc<T>::Func func) {
     using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
     using StorageType = typename FPBits::StorageType;
-    constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval();
-    constexpr StorageType MAX_SUBNORMAL = FPBits::max_subnormal().uintval();
     constexpr StorageType COUNT = 10'001;
-    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
-    for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
+    for (StorageType v = FPBits::MIN_SUBNORMAL; v <= FPBits::MAX_SUBNORMAL;
+         v += STEP) {
       T x = T(FPBits(v));
       if (isnan(x) || isinf(x) || x == 0.0)
         continue;
@@ -95,11 +95,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
   void test_normal_range(typename ILogbFunc<T>::Func func) {
     using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
     using StorageType = typename FPBits::StorageType;
-    constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
-    constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
     constexpr StorageType COUNT = 10'001;
-    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
-    for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT;
+    for (StorageType v = FPBits::MIN_NORMAL; v <= FPBits::MAX_NORMAL;
+         v += STEP) {
       T x = T(FPBits(v));
       if (isnan(x) || isinf(x) || x == 0.0)
         continue;
diff --git a/libc/test/src/math/smoke/LdExpTest.h b/libc/test/src/math/smoke/LdExpTest.h
index 3a4baabbf10e6..25120ba3646fd 100644
--- a/libc/test/src/math/smoke/LdExpTest.h
+++ b/libc/test/src/math/smoke/LdExpTest.h
@@ -29,7 +29,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
+  const T nan = T(FPBits::build_quiet_nan(1));
 
   // A normalized mantissa to be used with tests.
   static constexpr StorageType MANTISSA = NormalFloat::ONE + 0x1234;
diff --git a/libc/test/src/math/smoke/NextAfterTest.h b/libc/test/src/math/smoke/NextAfterTest.h
index 1dd07b3d2f93d..bdf3da627180a 100644
--- a/libc/test/src/math/smoke/NextAfterTest.h
+++ b/libc/test/src/math/smoke/NextAfterTest.h
@@ -38,14 +38,12 @@ class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
-
-  static constexpr StorageType min_subnormal =
-      FPBits::min_subnormal().uintval();
-  static constexpr StorageType max_subnormal =
-      FPBits::max_subnormal().uintval();
-  static constexpr StorageType min_normal = FPBits::min_normal().uintval();
-  static constexpr StorageType max_normal = FPBits::max_normal().uintval();
+  const T nan = T(FPBits::build_quiet_nan(1));
+
+  const StorageType min_subnormal = FPBits::MIN_SUBNORMAL;
+  const StorageType max_subnormal = FPBits::MAX_SUBNORMAL;
+  const StorageType min_normal = FPBits::MIN_NORMAL;
+  const StorageType max_normal = FPBits::MAX_NORMAL;
 
 public:
   typedef T (*NextAfterFunc)(T, T);
diff --git a/libc/test/src/math/smoke/NextTowardTest.h b/libc/test/src/math/smoke/NextTowardTest.h
index d65cc5d84d35a..af4e0ab14531c 100644
--- a/libc/test/src/math/smoke/NextTowardTest.h
+++ b/libc/test/src/math/smoke/NextTowardTest.h
@@ -40,18 +40,16 @@ class NextTowardTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
-
-  const long double to_zero = ToFPBits::zero().get_val();
-  const long double to_neg_zero = ToFPBits::zero(Sign::NEG).get_val();
-  const long double to_nan = ToFPBits::build_quiet_nan(Sign::POS, 1).get_val();
-
-  static constexpr StorageType min_subnormal =
-      FPBits::min_subnormal().uintval();
-  static constexpr StorageType max_subnormal =
-      FPBits::max_subnormal().uintval();
-  static constexpr StorageType min_normal = FPBits::min_normal().uintval();
-  static constexpr StorageType max_normal = FPBits::max_normal().uintval();
+  const T nan = T(FPBits::build_quiet_nan(1));
+
+  const long double to_zero = ToFPBits::zero();
+  const long double to_neg_zero = ToFPBits::zero(Sign::NEG);
+  const long double to_nan = ToFPBits::build_quiet_nan(1);
+
+  const StorageType min_subnormal = FPBits::MIN_SUBNORMAL;
+  const StorageType max_subnormal = FPBits::MAX_SUBNORMAL;
+  const StorageType min_normal = FPBits::MIN_NORMAL;
+  const StorageType max_normal = FPBits::MAX_NORMAL;
 
 public:
   typedef T (*NextTowardFunc)(T, long double);
diff --git a/libc/test/src/math/smoke/RIntTest.h b/libc/test/src/math/smoke/RIntTest.h
index 7bbbe54301570..b242c7e441b69 100644
--- a/libc/test/src/math/smoke/RIntTest.h
+++ b/libc/test/src/math/smoke/RIntTest.h
@@ -35,7 +35,7 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
+  const T nan = T(FPBits::build_quiet_nan(1));
 
 public:
   void testSpecialNumbers(RIntFunc func) {
diff --git a/libc/test/src/math/smoke/RemQuoTest.h b/libc/test/src/math/smoke/RemQuoTest.h
index 5f5cbd4964a62..93e20747e5263 100644
--- a/libc/test/src/math/smoke/RemQuoTest.h
+++ b/libc/test/src/math/smoke/RemQuoTest.h
@@ -25,7 +25,7 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
+  const T nan = T(FPBits::build_quiet_nan(1));
 
 public:
   typedef T (*RemQuoFunc)(T, T, int *);
diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h
index 77c65aa492e22..2703b78f00e0f 100644
--- a/libc/test/src/math/smoke/RoundToIntegerTest.h
+++ b/libc/test/src/math/smoke/RoundToIntegerTest.h
@@ -30,17 +30,11 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test {
   using StorageType = typename FPBits::StorageType;
   using Sign = LIBC_NAMESPACE::fputil::Sign;
 
-  const F zero = F(FPBits::zero(Sign::POS));
-  const F neg_zero = F(FPBits::zero(Sign::NEG));
-  const F inf = F(FPBits::inf(Sign::POS));
-  const F neg_inf = F(FPBits::inf(Sign::NEG));
-  const F nan = F(FPBits::build_quiet_nan(Sign::POS, 1));
-
-  static constexpr StorageType MAX_SUBNORMAL =
-      FPBits::max_subnormal().uintval();
-  static constexpr StorageType MIN_SUBNORMAL =
-      FPBits::min_subnormal().uintval();
-
+  const F zero = F(LIBC_NAMESPACE::fputil::FPBits<F>::zero(Sign::POS));
+  const F neg_zero = F(LIBC_NAMESPACE::fputil::FPBits<F>::zero(Sign::NEG));
+  const F inf = F(LIBC_NAMESPACE::fputil::FPBits<F>::inf(Sign::POS));
+  const F neg_inf = F(LIBC_NAMESPACE::fputil::FPBits<F>::inf(Sign::NEG));
+  const F nan = F(LIBC_NAMESPACE::fputil::FPBits<F>::build_quiet_nan(1));
   static constexpr I INTEGER_MIN = I(1) << (sizeof(I) * 8 - 1);
   static constexpr I INTEGER_MAX = -(INTEGER_MIN + 1);
 
@@ -117,8 +111,10 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void testSubnormalRange(RoundToIntegerFunc func) {
     constexpr StorageType COUNT = 1'000'001;
-    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
-    for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) {
+    constexpr StorageType STEP =
+        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
+    for (StorageType i = FPBits::MIN_SUBNORMAL; i <= FPBits::MAX_SUBNORMAL;
+         i += STEP) {
       F x = F(FPBits(i));
       if (x == F(0.0))
         continue;
diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp
index b22378b22ab12..344853beaf9fa 100644
--- a/libc/test/src/stdio/sprintf_test.cpp
+++ b/libc/test/src/stdio/sprintf_test.cpp
@@ -585,10 +585,8 @@ TEST(LlvmLibcSPrintfTest, OctConv) {
 
 TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) {
   ForceRoundingMode r(RoundingMode::Nearest);
-  double inf = LIBC_NAMESPACE::fputil::FPBits<double>::inf().get_val();
-  double nan = LIBC_NAMESPACE::fputil::FPBits<double>::build_nan(
-                   LIBC_NAMESPACE::fputil::Sign::POS, 1)
-                   .get_val();
inf = LIBC_NAMESPACE::fputil::FPBits::inf(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); written = LIBC_NAMESPACE::sprintf(buff, "%a", 1.0); ASSERT_STREQ_LEN(written, buff, "0x1p+0"); @@ -951,15 +949,11 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) { TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) { ForceRoundingMode r(RoundingMode::Nearest); - double inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); - double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( - LIBC_NAMESPACE::fputil::Sign::POS, 1) - .get_val(); - long double ld_inf = - LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); - long double ld_nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( - LIBC_NAMESPACE::fputil::Sign::POS, 1) - .get_val(); + double inf = LIBC_NAMESPACE::fputil::FPBits::inf(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + long double ld_inf = LIBC_NAMESPACE::fputil::FPBits::inf(); + long double ld_nan = + LIBC_NAMESPACE::fputil::FPBits::build_nan(1); char big_buff[10000]; // Used for long doubles and other extremely wide // numbers. @@ -1796,10 +1790,8 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) { TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) { ForceRoundingMode r(RoundingMode::Nearest); - double inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); - double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( - LIBC_NAMESPACE::fputil::Sign::POS, 1) - .get_val(); + double inf = LIBC_NAMESPACE::fputil::FPBits::inf(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); written = LIBC_NAMESPACE::sprintf(buff, "%e", 1.0); ASSERT_STREQ_LEN(written, buff, "1.000000e+00"); @@ -2430,10 +2422,8 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) { TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) { ForceRoundingMode r(RoundingMode::Nearest); - double inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); - double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( - LIBC_NAMESPACE::fputil::Sign::POS, 1) - .get_val(); + double inf = LIBC_NAMESPACE::fputil::FPBits::inf(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); written = LIBC_NAMESPACE::sprintf(buff, "%g", 1.0); ASSERT_STREQ_LEN(written, buff, "1"); diff --git a/libc/test/src/stdio/sscanf_test.cpp b/libc/test/src/stdio/sscanf_test.cpp index db67c25029133..db3c48cdbf7a2 100644 --- a/libc/test/src/stdio/sscanf_test.cpp +++ b/libc/test/src/stdio/sscanf_test.cpp @@ -230,10 +230,8 @@ TEST(LlvmLibcSScanfTest, FloatConvSimple) { int ret_val; float result = 0; - float inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); - float nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( - LIBC_NAMESPACE::fputil::Sign::POS, 1) - .get_val(); + float inf = LIBC_NAMESPACE::fputil::FPBits::inf(); + float nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); ret_val = LIBC_NAMESPACE::sscanf("123", "%f", &result); EXPECT_EQ(ret_val, 1); @@ -296,10 +294,9 @@ TEST(LlvmLibcSScanfTest, FloatConvLengthModifier) { double d_result = 0; long double ld_result = 0; - double d_inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); - long double ld_nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( - LIBC_NAMESPACE::fputil::Sign::POS, 1) - .get_val(); + double d_inf = LIBC_NAMESPACE::fputil::FPBits::inf(); + long double ld_nan = + LIBC_NAMESPACE::fputil::FPBits::build_nan(1); ret_val = LIBC_NAMESPACE::sscanf("123", "%lf", &d_result); EXPECT_EQ(ret_val, 1); @@ -394,10 +391,8 @@ TEST(LlvmLibcSScanfTest, FloatConvComplexParsing) { int ret_val; float result = 0; - float inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); - float nan = 
LIBC_NAMESPACE::fputil::FPBits::build_nan( - LIBC_NAMESPACE::fputil::Sign::POS, 1) - .get_val(); + float inf = LIBC_NAMESPACE::fputil::FPBits::inf(); + float nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); ret_val = LIBC_NAMESPACE::sscanf("0x1.0e3", "%f", &result); EXPECT_EQ(ret_val, 1); @@ -468,7 +463,7 @@ TEST(LlvmLibcSScanfTest, FloatConvMaxWidth) { int ret_val; float result = 0; - float inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + float inf = LIBC_NAMESPACE::fputil::FPBits::inf(); ret_val = LIBC_NAMESPACE::sscanf("123", "%3f", &result); EXPECT_EQ(ret_val, 1); diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index 06a231c7d94d0..b6ca525db6cf7 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -49,7 +49,7 @@ template <> struct ExtraPrecision { template static inline unsigned int get_precision(double ulp_tolerance) { if (ulp_tolerance <= 0.5) { - return LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN + 1; + return LIBC_NAMESPACE::fputil::FPBits::MANTISSA_PRECISION; } else { return ExtraPrecision::VALUE; } From 74cb09ff90bff4b91f6c3527fd2715d5d4e77aea Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 23 Jan 2024 10:49:51 +0000 Subject: [PATCH 590/843] [RemoveDIs][NFC] Disable RemoveDIs tests that are not yet enabled As part of a recent patch landing, some tests that are not yet ready to support RemoveDIs were turned on; this patch disables those tests in RemoveDIs mode. Fixes buildbot failure: https://lab.llvm.org/buildbot/#/builders/275/builds/3640 --- .../assignment-tracking/instcombine/remove-redundant-dbg.ll | 2 -- .../Generic/assignment-tracking/sroa/remove-redundant-dbg.ll | 2 -- .../DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll | 2 -- 3 files changed, 6 deletions(-) diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll index cffac06f8e545..11895098179eb 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll @@ -1,7 +1,5 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" -; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ -; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that sroa removes redundant debug intrinsics after it makes a ;; change. This has a significant positive impact on peak memory and compiler diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll index cffac06f8e545..11895098179eb 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll @@ -1,7 +1,5 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" -; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ -; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that sroa removes redundant debug intrinsics after it makes a ;; change. 
This has a significant positive impact on peak memory and compiler diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll index 23d0ff3461046..abc110273ab3b 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll @@ -1,7 +1,5 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" -; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ -; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that the fragments generated in SROA for a split alloca that has a ;; dbg.assign with non-zero-offset fragment are correct. From 96adf69ba93965956d1ee507d9f75a453d99b666 Mon Sep 17 00:00:00 2001 From: AtariDreams <83477269+AtariDreams@users.noreply.github.com> Date: Tue, 23 Jan 2024 06:10:59 -0500 Subject: [PATCH 591/843] [InstCombine] Remove one-use check if other logic operand is constant (#77973) By using `match(W, m_ImmConstant())`, we no longer need to restrict the fold to one-use shifts. --- .../InstCombine/InstCombineShifts.cpp | 16 +++++----- .../Transforms/InstCombine/shift-logic.ll | 30 +++++++++++++++++++ 2 files changed, 38 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp index b7958978c450c..54490c46dfaef 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp @@ -366,14 +366,14 @@ static Instruction *foldShiftOfShiftedBinOp(BinaryOperator &I, Type *Ty = I.getType(); - // Find a matching one-use shift by constant. The fold is not valid if the sum + // Find a matching shift by constant. The fold is not valid if the sum // of the shift values equals or exceeds bitwidth. - // TODO: Remove the one-use check if the other logic operand (Y) is constant. Value *X, *Y; - auto matchFirstShift = [&](Value *V) { - APInt Threshold(Ty->getScalarSizeInBits(), Ty->getScalarSizeInBits()); - return match(V, - m_OneUse(m_BinOp(ShiftOpcode, m_Value(X), m_Constant(C0)))) && + auto matchFirstShift = [&](Value *V, Value *W) { + unsigned Size = Ty->getScalarSizeInBits(); + APInt Threshold(Size, Size); + return match(V, m_BinOp(ShiftOpcode, m_Value(X), m_Constant(C0))) && + (V->hasOneUse() || match(W, m_ImmConstant())) && match(ConstantExpr::getAdd(C0, C1), m_SpecificInt_ICMP(ICmpInst::ICMP_ULT, Threshold)); }; @@ -382,9 +382,9 @@ static Instruction *foldShiftOfShiftedBinOp(BinaryOperator &I, // is not so we cannot reorder if we match operand(1) and need to keep the // operands in their original positions.
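  // e.g. for 'sub': ((X << C0) - Y) << C1 is not the same as (Y - (X << C0)) << C1.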
bool FirstShiftIsOp1 = false; - if (matchFirstShift(BinInst->getOperand(0))) + if (matchFirstShift(BinInst->getOperand(0), BinInst->getOperand(1))) Y = BinInst->getOperand(1); - else if (matchFirstShift(BinInst->getOperand(1))) { + else if (matchFirstShift(BinInst->getOperand(1), BinInst->getOperand(0))) { Y = BinInst->getOperand(0); FirstShiftIsOp1 = BinInst->getOpcode() == Instruction::Sub; } else diff --git a/llvm/test/Transforms/InstCombine/shift-logic.ll b/llvm/test/Transforms/InstCombine/shift-logic.ll index 544694d398431..c982b45b504e9 100644 --- a/llvm/test/Transforms/InstCombine/shift-logic.ll +++ b/llvm/test/Transforms/InstCombine/shift-logic.ll @@ -346,6 +346,36 @@ define i8 @shl_add(i8 %x, i8 %y) { ret i8 %sh1 } +define i8 @shl_add_multiuse(i8 %x) { +; CHECK-LABEL: @shl_add_multiuse( +; CHECK-NEXT: [[SH0:%.*]] = shl i8 [[X:%.*]], 3 +; CHECK-NEXT: call void @use(i8 [[SH0]]) +; CHECK-NEXT: [[R:%.*]] = shl i8 [[X]], 5 +; CHECK-NEXT: [[SH1:%.*]] = add i8 [[R]], 88 +; CHECK-NEXT: ret i8 [[SH1]] +; + %sh0 = shl i8 %x, 3 + %r = add i8 %sh0, -42 + call void @use(i8 %sh0) + %sh1 = shl i8 %r, 2 + ret i8 %sh1 +} + +define i8 @shl_add_multiuse_nonconstant(i8 %x, i8 %y) { +; CHECK-LABEL: @shl_add_multiuse_nonconstant( +; CHECK-NEXT: [[SH0:%.*]] = shl i8 [[X:%.*]], 3 +; CHECK-NEXT: [[R:%.*]] = add i8 [[SH0]], [[Y:%.*]] +; CHECK-NEXT: call void @use(i8 [[SH0]]) +; CHECK-NEXT: [[SH1:%.*]] = shl i8 [[R]], 2 +; CHECK-NEXT: ret i8 [[SH1]] +; + %sh0 = shl i8 %x, 3 + %r = add i8 %sh0, %y + call void @use(i8 %sh0) + %sh1 = shl i8 %r, 2 + ret i8 %sh1 +} + define <2 x i8> @shl_add_nonuniform(<2 x i8> %x, <2 x i8> %y) { ; CHECK-LABEL: @shl_add_nonuniform( ; CHECK-NEXT: [[TMP1:%.*]] = shl <2 x i8> [[X:%.*]], From ea4d22f22568ccce7985032d150e79197694d38f Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 23 Jan 2024 12:07:30 +0100 Subject: [PATCH 592/843] [Lex] Avoid repeated calls to getIdentifierInfo() (NFC) We're calling it four times in the same function. --- clang/include/clang/Lex/Preprocessor.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index fed3c1b770383..2d9c53cdf5bde 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -2829,17 +2829,18 @@ class Preprocessor { } void emitMacroExpansionWarnings(const Token &Identifier) const { - if (Identifier.getIdentifierInfo()->isDeprecatedMacro()) + IdentifierInfo *Info = Identifier.getIdentifierInfo(); + if (Info->isDeprecatedMacro()) emitMacroDeprecationWarning(Identifier); - if (Identifier.getIdentifierInfo()->isRestrictExpansion() && + if (Info->isRestrictExpansion() && !SourceMgr.isInMainFile(Identifier.getLocation())) emitRestrictExpansionWarning(Identifier); - if (Identifier.getIdentifierInfo()->getName() == "INFINITY") + if (Info->getName() == "INFINITY") if (getLangOpts().NoHonorInfs) emitRestrictInfNaNWarning(Identifier, 0); - if (Identifier.getIdentifierInfo()->getName() == "NAN") + if (Info->getName() == "NAN") if (getLangOpts().NoHonorNaNs) emitRestrictInfNaNWarning(Identifier, 1); } From 1f9de23e94af3ff50f67a3b16be3769919c2e146 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 23 Jan 2024 11:18:35 +0000 Subject: [PATCH 593/843] [SCEVExp] Add additional tests for hoisting IVs with NSW flags. 
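
The new tests cover pairs of IVs that share a start value and step but where
only one of the increments carries an `nsw` flag, e.g. (a minimal sketch of
the pattern exercised by iv-poison.ll below):

  loop:
    %iv.0 = phi i4 [ %start, %entry ], [ %iv.0.next, %loop ]
    %iv.1 = phi i4 [ %start, %entry ], [ %iv.1.next, %loop ]
    %iv.0.next = add i4 %iv.0, 1
    %iv.1.next = add nsw i4 %iv.1, 1

If the expander hoists or reuses one increment in place of the other, the
`nsw` flag is only known to hold for the instruction it was attached to, so
keeping it on the merged add could introduce poison.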
--- .../Transforms/IndVarSimplify/iv-poison.ll | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/llvm/test/Transforms/IndVarSimplify/iv-poison.ll b/llvm/test/Transforms/IndVarSimplify/iv-poison.ll index 3d6c15ba7bc7c..91358a1477939 100644 --- a/llvm/test/Transforms/IndVarSimplify/iv-poison.ll +++ b/llvm/test/Transforms/IndVarSimplify/iv-poison.ll @@ -29,3 +29,91 @@ bb1: ; preds = %bb1, %bb common.ret: ; preds = %bb1 ret i2 %i2 } + +define i4 @iv_hoist_nsw_poison2(i4 %0, i4 %end, i4 %start) { +; CHECK-LABEL: @iv_hoist_nsw_poison2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_0:%.*]] = phi i4 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[IV_0_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_0_NEXT]] = add i4 [[IV_0]], 1 +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i4 [[IV_0]], [[END:%.*]] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i4 [ [[IV_0_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i4 [[IV_1_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv.0 = phi i4 [ %start, %entry ], [ %iv.0.next, %loop ] + %iv.1 = phi i4 [ %start, %entry ], [ %iv.1.next, %loop ] + %iv.0.next = add i4 %iv.0, 1 + %iv.1.next = add nsw i4 %iv.1, 1 + %.not.not = icmp ult i4 %iv.0, %end + br i1 %.not.not, label %exit, label %loop + +exit: + ret i4 %iv.1.next +} + +define i2 @iv_hoist_both_adds_nsw(i2 %arg) { +; CHECK-LABEL: @iv_hoist_both_adds_nsw( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_0:%.*]] = phi i2 [ 1, [[BB:%.*]] ], [ [[IV_0_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_0_NEXT]] = add nuw i2 [[IV_0]], 1 +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i2 1, [[ARG:%.*]] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i2 [ [[IV_0_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i2 [[IV_1_NEXT_LCSSA]] +; +bb: + br label %loop + +loop: + %iv.0 = phi i2 [ 1, %bb ], [ %iv.0.next, %loop ] + %iv.1 = phi i2 [ 1, %bb ], [ %iv.1.next, %loop ] + %iv.0.next = add nsw i2 %iv.0, 1 + %iv.1.next = add nsw i2 %iv.1, 1 + %.not.not = icmp ult i2 %iv.0, %arg + br i1 %.not.not, label %exit, label %loop + +exit: + ret i2 %iv.1.next +} + +define i4 @iv_hoist_nsw_poison_extra_use(i4 %0, i4 %end, i4 %start) { +; CHECK-LABEL: @iv_hoist_nsw_poison_extra_use( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_0:%.*]] = phi i4 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[IV_0_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_0_NEXT]] = add i4 [[IV_0]], 1 +; CHECK-NEXT: call void @use(i4 [[IV_0_NEXT]]) +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i4 [[IV_0]], [[END:%.*]] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i4 [ [[IV_0_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i4 [[IV_1_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv.0 = phi i4 [ %start, %entry ], [ %iv.0.next, %loop ] + %iv.1 = phi i4 [ %start, %entry ], [ %iv.1.next, %loop ] + %iv.0.next = add i4 %iv.0, 1 + call void @use(i4 %iv.0.next) + %iv.1.next = add nsw i4 %iv.1, 1 + %.not.not = icmp ult i4 %iv.0, %end + br i1 %.not.not, label %exit, label %loop + +exit: + ret i4 %iv.1.next +} + +declare void @use(i4) From 13c6f1ea2e7eb15fe492d8fca4fa1857c6f86370 Mon Sep 17 00:00:00 2001 From: OCHyams Date: Thu, 18 Jan 2024 14:28:18 +0000 Subject: [PATCH 594/843] Reapply [hwasan] Update dbg.assign intrinsics in 
HWAsan pass #78606 llvm.dbg.assign intrinsics have 2 {value, expression} pairs; fix hwasan to update the second expression. Fixes #76545 --- llvm/lib/IR/DebugInfo.cpp | 4 -- .../Instrumentation/HWAddressSanitizer.cpp | 5 ++ .../Transforms/Utils/MemoryTaggingSupport.cpp | 10 ++- .../AArch64/dbg-assign-tag-offset-mix-loc.ll | 71 +++++++++++++++++++ .../CodeGen/AArch64/dbg-assign-tag-offset.ll | 71 +++++++++++++++++++ .../declare-to-assign/hwasan.ll | 2 +- .../dbg-assign-tag-offset.ll | 59 +++++++++++++++ 7 files changed, 214 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll create mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll create mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index fcd3f77f8f6e2..2e64d0db57b25 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2200,10 +2200,6 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { if (F.hasFnAttribute(Attribute::OptimizeNone)) return /*Changed*/ false; - // FIXME: https://github.com/llvm/llvm-project/issues/76545 - if (F.hasFnAttribute(Attribute::SanitizeHWAddress)) - return /*Changed*/ false; - bool Changed = false; auto *DL = &F.getParent()->getDataLayout(); // Collect a map of {backing storage : dbg.declares} (currently "backing diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index efb621cde9065..3ca9c402b4719 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1435,6 +1435,11 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, if (DDI->getVariableLocationOp(LocNo) == AI) DDI->setExpression(DIExpression::appendOpsToArg(DDI->getExpression(), NewOps, LocNo)); + if (auto *DAI = dyn_cast(DDI)) { + if (DAI->getAddress() == AI) + DAI->setAddressExpression(DIExpression::prependOpcodes( + DAI->getAddressExpression(), NewOps)); + } } auto TagEnd = [&](Instruction *Node) { diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index f94047633022c..d2efcde5d3803 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -138,16 +138,20 @@ void StackInfoBuilder::visit(Instruction &Inst) { return; } if (auto *DVI = dyn_cast(&Inst)) { - for (Value *V : DVI->location_ops()) { + auto AddIfInteresting = [&](Value *V) { if (auto *AI = dyn_cast_or_null(V)) { if (!isInterestingAlloca(*AI)) - continue; + return; AllocaInfo &AInfo = Info.AllocasToInstrument[AI]; auto &DVIVec = AInfo.DbgVariableIntrinsics; if (DVIVec.empty() || DVIVec.back() != DVI) DVIVec.push_back(DVI); } - } + }; + for (Value *V : DVI->location_ops()) + AddIfInteresting(V); + if (auto *DAI = dyn_cast(DVI)) + AddIfInteresting(DAI->getAddress()); } Instruction *ExitUntag = getUntagLocationIfFunctionExit(Inst); if (ExitUntag) diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll new file mode 100644 index 0000000000000..ef0dd46cb45c7 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll @@ -0,0 +1,71 @@ +; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s + +;; Similar to dbg-assign-tag-offset.ll except the variable 'x' has been removed +;; and 'y' has an 
implicit location range as well as stack location range +;; (according to the hand-modified debug info -- see the dbg.value). + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-android24" + +; CHECK: DW_TAG_variable +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_LLVM_tag_offset (0x80) +; CHECK-NEXT: DW_AT_name ("y") + +define dso_local void @f() !dbg !14 { + %1 = alloca i32, align 4, !DIAssignID !31 + %2 = alloca i32, align 4, !DIAssignID !32 + call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + call void @llvm.dbg.value(metadata i32 2, metadata !20, metadata !DIExpression()), !dbg !22 + call void @use(ptr null), !dbg !28 + store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 + call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + call void @use(ptr nonnull %1), !dbg !28 + call void @use(ptr nonnull %2), !dbg !29 + ret void, !dbg !30 +} + +declare !dbg !5 void @use(ptr) + +declare void @llvm.dbg.value(metadata, metadata, metadata) +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} +!llvm.ident = !{!13} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) +!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") +!2 = !{} +!3 = !{!4, !5} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) +!6 = !DISubroutineType(types: !7) +!7 = !{null, !4} +!8 = !{i32 7, !"Dwarf Version", i32 4} +!9 = !{i32 2, !"Debug Info Version", i32 3} +!10 = !{i32 1, !"wchar_size", i32 4} +!11 = !{i32 7, !"PIC Level", i32 2} +!12 = !{i32 7, !"PIE Level", i32 2} +!13 = !{!"clang version 10.0.0"} +!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) +!15 = !DISubroutineType(types: !16) +!16 = !{null} +!17 = !{!18, !20} +!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) +!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) +!21 = !DILocation(line: 5, column: 3, scope: !14) +!22 = !DILocation(line: 0, scope: !14) +!23 = !DILocation(line: 5, column: 10, scope: !14) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !26, i64 0} +!26 = !{!"omnipotent char", !27, i64 0} +!27 = !{!"Simple C++ TBAA"} +!28 = !DILocation(line: 6, column: 3, scope: !14) +!29 = !DILocation(line: 7, column: 3, scope: !14) +!30 = !DILocation(line: 8, column: 1, scope: !14) +!31 = distinct !DIAssignID() +!32 = distinct !DIAssignID() +!33 = distinct !DIAssignID() +!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll new file mode 100644 index 0000000000000..a587f93d14d70 --- /dev/null +++ 
b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll @@ -0,0 +1,71 @@ +; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s + +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64-unknown-linux-android24" + +; CHECK: DW_TAG_variable +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_LLVM_tag_offset (0x00) +; CHECK-NEXT: DW_AT_name ("x") + +; CHECK: DW_TAG_variable +; CHECK-NOT: DW_TAG +; CHECK: DW_AT_LLVM_tag_offset (0x80) +; CHECK-NEXT: DW_AT_name ("y") + +define dso_local void @f() !dbg !14 { + %1 = alloca i32, align 4, !DIAssignID !31 + call void @llvm.dbg.assign(metadata i1 undef, metadata !18, metadata !DIExpression(), metadata !31, metadata ptr %1, metadata !DIExpression(DW_OP_LLVM_tag_offset, 0)), !dbg !22 + %2 = alloca i32, align 4, !DIAssignID !32 + call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 + call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 + call void @use(ptr nonnull %1), !dbg !28 + call void @use(ptr nonnull %2), !dbg !29 + ret void, !dbg !30 +} + +declare !dbg !5 void @use(ptr) + +declare void @llvm.dbg.value(metadata, metadata, metadata) +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} +!llvm.ident = !{!13} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) +!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") +!2 = !{} +!3 = !{!4, !5} +!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) +!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) +!6 = !DISubroutineType(types: !7) +!7 = !{null, !4} +!8 = !{i32 7, !"Dwarf Version", i32 4} +!9 = !{i32 2, !"Debug Info Version", i32 3} +!10 = !{i32 1, !"wchar_size", i32 4} +!11 = !{i32 7, !"PIC Level", i32 2} +!12 = !{i32 7, !"PIE Level", i32 2} +!13 = !{!"clang version 10.0.0"} +!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) +!15 = !DISubroutineType(types: !16) +!16 = !{null} +!17 = !{!18, !20} +!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) +!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) +!21 = !DILocation(line: 5, column: 3, scope: !14) +!22 = !DILocation(line: 0, scope: !14) +!23 = !DILocation(line: 5, column: 10, scope: !14) +!24 = !{!25, !25, i64 0} +!25 = !{!"int", !26, i64 0} +!26 = !{!"omnipotent char", !27, i64 0} +!27 = !{!"Simple C++ TBAA"} +!28 = !DILocation(line: 6, column: 3, scope: !14) +!29 = !DILocation(line: 7, column: 3, scope: !14) +!30 = !DILocation(line: 8, column: 1, scope: !14) +!31 = distinct !DIAssignID() +!32 = distinct !DIAssignID() +!33 = distinct !DIAssignID() +!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff --git 
a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll index c4b209de77017..6c9366609cba2 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll @@ -1,6 +1,6 @@ ; RUN: opt %s -S -passes=declare-to-assign -o - | FileCheck %s -; CHECK: call void @llvm.dbg.declare +; CHECK: call void @llvm.dbg.assign define dso_local void @f() sanitize_hwaddress !dbg !9 { entry: diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll new file mode 100644 index 0000000000000..f91d2aa110a87 --- /dev/null +++ b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll @@ -0,0 +1,59 @@ +; RUN: opt -passes=hwasan -S -o - %s | FileCheck %s + +source_filename = "test.ll" +target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" +target triple = "aarch64--linux-android" + +declare void @g(ptr, ptr, ptr, ptr, ptr, ptr) + +; Function Attrs: sanitize_hwaddress +define void @f() #0 !dbg !7 { +entry: + %nodebug0 = alloca ptr, align 8 + %nodebug1 = alloca ptr, align 8 + %nodebug2 = alloca ptr, align 8 + %nodebug3 = alloca ptr, align 8 + ; CHECK: %a = alloca{{.*}} !DIAssignID ![[ID1:[0-9]+]] + %a = alloca ptr, align 8, !DIAssignID !13 + ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID1]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 32) + call void @llvm.dbg.assign(metadata i1 undef, metadata !14, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !15 + ; CHECK: %b = alloca{{.*}} !DIAssignID ![[ID2:[0-9]+]] + %b = alloca ptr, align 8, !DIAssignID !16 + ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID2]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 96) + call void @llvm.dbg.assign(metadata i1 undef, metadata !17, metadata !DIExpression(DW_OP_plus_uconst, 1), metadata !16, metadata ptr %b, metadata !DIExpression()), !dbg !15 + call void @g(ptr %nodebug0, ptr %nodebug1, ptr %nodebug2, ptr %nodebug3, ptr %a, ptr %b) + ret void, !dbg !18 +} + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 + +; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) +declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) #1 + +attributes #0 = { sanitize_hwaddress } +attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) +!1 = !DIFile(filename: "x.c", directory: "/") +!2 = !{} +!3 = !{i32 2, !"Dwarf Version", i32 4} +!4 = !{i32 2, !"Debug Info Version", i32 3} +!5 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!6 = !{!"clang"} +!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) +!8 = !DISubroutineType(types: !9) +!9 = !{null, !10} +!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) +!11 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !12) +!12 = !DIBasicType(name: "char", size: 8, 
encoding: DW_ATE_signed_char) +!13 = distinct !DIAssignID() +!14 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 1, type: !10) +!15 = !DILocation(line: 0, scope: !7) +!16 = distinct !DIAssignID() +!17 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 1, type: !10) +!18 = !DILocation(line: 1, column: 37, scope: !7) From 3112578597c031e6f00c4b126182bd0d8582c729 Mon Sep 17 00:00:00 2001 From: Dinar Temirbulatov Date: Tue, 23 Jan 2024 11:19:44 +0000 Subject: [PATCH 595/843] [AArch64][compiler-rt] Add memcpy, memset, memmove, memchr builtins. (#77496) Add naive implementation of memcpy, memset, memmove, memchr for SME targets. Co-authored-by: David Sherwood --- compiler-rt/cmake/builtin-config-ix.cmake | 8 +- compiler-rt/lib/builtins/CMakeLists.txt | 5 +- .../lib/builtins/aarch64/sme-libc-routines.c | 87 +++++++++++++ compiler-rt/test/CMakeLists.txt | 2 + .../test/builtins/Unit/sme-string-test.cpp | 120 ++++++++++++++++++ compiler-rt/test/lit.common.cfg.py | 3 + compiler-rt/test/lit.common.configured.in | 1 + .../unittests/lit.common.unit.configured.in | 1 + 8 files changed, 222 insertions(+), 5 deletions(-) create mode 100644 compiler-rt/lib/builtins/aarch64/sme-libc-routines.c create mode 100644 compiler-rt/test/builtins/Unit/sme-string-test.cpp diff --git a/compiler-rt/cmake/builtin-config-ix.cmake b/compiler-rt/cmake/builtin-config-ix.cmake index b40138aa011f8..b17c43bf6a68b 100644 --- a/compiler-rt/cmake/builtin-config-ix.cmake +++ b/compiler-rt/cmake/builtin-config-ix.cmake @@ -35,10 +35,12 @@ asm(\".arch armv8-a+lse\"); asm(\"cas w0, w1, [x2]\"); ") -builtin_check_c_compiler_source(COMPILER_RT_HAS_ASM_SME +builtin_check_c_compiler_source(COMPILER_RT_HAS_AARCH64_SME " -asm(\".arch armv9-a+sme\"); -asm(\"smstart\"); +void foo(void) __arm_streaming_compatible { + asm(\".arch armv9-a+sme\"); + asm(\"smstart\"); +} ") if(ANDROID) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 378884bcaf2e5..28ded8766f253 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -560,9 +560,10 @@ set(aarch64_SOURCES aarch64/fp_mode.c ) -if(COMPILER_RT_HAS_ASM_SME AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD)) - list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c) +if(COMPILER_RT_HAS_AARCH64_SME AND COMPILER_RT_HAS_FNO_BUILTIN_FLAG AND (COMPILER_RT_HAS_AUXV OR COMPILER_RT_BAREMETAL_BUILD)) + list(APPEND aarch64_SOURCES aarch64/sme-abi.S aarch64/sme-abi-init.c aarch64/sme-libc-routines.c) message(STATUS "AArch64 SME ABI routines enabled") + set_source_files_properties(aarch64/sme-libc-routines.c PROPERTIES COMPILE_FLAGS "-fno-builtin") else() message(STATUS "AArch64 SME ABI routines disabled") endif() diff --git a/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c new file mode 100644 index 0000000000000..cd73025a19cc1 --- /dev/null +++ b/compiler-rt/lib/builtins/aarch64/sme-libc-routines.c @@ -0,0 +1,87 @@ +#include + +// WARNING: When building the scalar versions of these functions you need to +// use the compiler flag "-mllvm -disable-loop-idiom-all" to prevent clang +// from recognising a loop idiom and planting calls to memcpy! 
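+//
+// NOTE: These implementations are deliberately naive byte-at-a-time loops:
+// the routines are declared __arm_streaming_compatible, so they may be
+// entered in streaming mode, where the regular (possibly vector) libc
+// routines are not guaranteed to be safe to call.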
+ +static void *__arm_sc_memcpy_fwd(void *dest, const void *src, + size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + const unsigned char *srcp = (const unsigned char *)src; + for (size_t i = 0; i < n; ++i) + destp[i] = srcp[i]; + + return dest; +} + +// If dest and src overlap then behaviour is undefined, hence we can add the +// restrict keywords here. This also matches the definition of the libc memcpy +// according to the man page. +void *__arm_sc_memcpy(void *__restrict__ dest, const void *__restrict__ src, + size_t n) __arm_streaming_compatible { + return __arm_sc_memcpy_fwd(dest, src, n); +} + +void *__arm_sc_memset(void *dest, int c, size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + unsigned char c8 = (unsigned char)c; + for (size_t i = 0; i < n; ++i) + destp[i] = c8; + + return dest; +} + +static void *__arm_sc_memcpy_rev(void *dest, const void *src, + size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + const unsigned char *srcp = (const unsigned char *)src; + // TODO: Improve performance by copying larger chunks in reverse, or by + // using SVE. + while (n > 0) { + --n; + destp[n] = srcp[n]; + } + return dest; +} + +// Semantically a memmove is equivalent to the following: +// 1. Copy the entire contents of src to a temporary array that does not +// overlap with src or dest. +// 2. Copy the contents of the temporary array into dest. +void *__arm_sc_memmove(void *dest, const void *src, + size_t n) __arm_streaming_compatible { + unsigned char *destp = (unsigned char *)dest; + const unsigned char *srcp = (const unsigned char *)src; + + // If src and dest don't overlap then just invoke memcpy + if ((srcp > (destp + n)) || (destp > (srcp + n))) + return __arm_sc_memcpy_fwd(dest, src, n); + + // Overlap case 1: + // src: Low | -> | High + // dest: Low | -> | High + // Here src is always ahead of dest at a higher address. If we first read a + // chunk of data from src we can safely write the same chunk to dest without + // corrupting future reads of src. + if (srcp > destp) + return __arm_sc_memcpy_fwd(dest, src, n); + + // Overlap case 2: + // src: Low | -> | High + // dest: Low | -> | High + // While we're in the overlap region we're always corrupting future reads of + // src when writing to dest. An efficient way to avoid this is to copy the data + // in reverse by starting at the highest address.
+ return __arm_sc_memcpy_rev(dest, src, n); +} + +const void *__arm_sc_memchr(const void *src, int c, + size_t n) __arm_streaming_compatible { + const unsigned char *srcp = (const unsigned char *)src; + unsigned char c8 = (unsigned char)c; + for (size_t i = 0; i < n; ++i) + if (srcp[i] == c8) + return &srcp[i]; + + return NULL; +} diff --git a/compiler-rt/test/CMakeLists.txt b/compiler-rt/test/CMakeLists.txt index 7357604b1f651..ee2ef907bcae4 100644 --- a/compiler-rt/test/CMakeLists.txt +++ b/compiler-rt/test/CMakeLists.txt @@ -18,6 +18,8 @@ pythonize_bool(COMPILER_RT_BUILD_STANDALONE_LIBATOMIC) pythonize_bool(COMPILER_RT_ENABLE_INTERNAL_SYMBOLIZER) +pythonize_bool(COMPILER_RT_HAS_AARCH64_SME) + configure_compiler_rt_lit_site_cfg( ${CMAKE_CURRENT_SOURCE_DIR}/lit.common.configured.in ${CMAKE_CURRENT_BINARY_DIR}/lit.common.configured) diff --git a/compiler-rt/test/builtins/Unit/sme-string-test.cpp b/compiler-rt/test/builtins/Unit/sme-string-test.cpp new file mode 100644 index 0000000000000..3bc4559f9ae04 --- /dev/null +++ b/compiler-rt/test/builtins/Unit/sme-string-test.cpp @@ -0,0 +1,120 @@ +// REQUIRES: aarch64-target-arch, aarch64-sme-available +// RUN: %clangxx_builtins %s %librt -o %t && %run %t + +#include <cassert> +#include <cstdint> +#include <cstdlib> +#include <cstring> +#include <initializer_list> + +extern "C" { +void *__arm_sc_memcpy(void *, const void *, size_t); +void *__arm_sc_memset(void *, int, size_t); +void *__arm_sc_memmove(void *, const void *, size_t); +void *__arm_sc_memchr(const void *, int, size_t); +} + +template <unsigned N> class Memory { +public: + uint8_t ptr[N]; + unsigned size; + + Memory(unsigned stride = 0) { + size = N; + if (stride == 0) + return; + for (unsigned i = 0; i < N; i++) + ptr[i] = i * stride; + } + + void assert_equal(const Memory &other) { + assert(N == other.size); + assert(memcmp(ptr, other.ptr, N) == 0); + } + + void assert_equal(std::initializer_list<uint8_t> s) { + assert(N == s.size()); + auto it = s.begin(); + for (unsigned i = 0; i < N; ++i) + assert(ptr[i] == *it++); + } + + void assert_elemt_equal_at(unsigned I, uint8_t elem) { + assert(ptr[I] == elem); + } +}; + +int main() { + + // Testing memcpy from src to dst. + { + Memory<8> src(1); + Memory<8> dst; + if (!__arm_sc_memcpy(dst.ptr, src.ptr, 8)) + abort(); + dst.assert_equal(src); + dst.assert_equal({0, 1, 2, 3, 4, 5, 6, 7}); + } + + // Testing memcpy from src to dst with pointer offset. + { + Memory<8> src(1); + Memory<8> dst(1); + if (!__arm_sc_memcpy(dst.ptr + 1, src.ptr, 6)) + abort(); + dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); + } + + // Testing memchr. + { + Memory<8> src(4); + for (unsigned i = 0; i < 8; ++i) { + uint8_t e = src.ptr[i]; + uint8_t *elem = (uint8_t *)memchr(src.ptr, e, 8); + if (!elem) + abort(); + src.assert_elemt_equal_at(elem - src.ptr, *elem); + for (unsigned i = 0; i < 8; ++i) + assert(__arm_sc_memchr(src.ptr, src.ptr[i], 8) == + memchr(src.ptr, src.ptr[i], 8)); + } + } + + // Testing memset. + { + Memory<8> array; + if (!__arm_sc_memset(array.ptr, 2, 8)) + abort(); + array.assert_equal({2, 2, 2, 2, 2, 2, 2, 2}); + } + + // Testing memset with pointer offset. + { + Memory<8> array(1); + if (!__arm_sc_memset(array.ptr + 1, 2, 6)) + abort(); + array.assert_equal({0, 2, 2, 2, 2, 2, 2, 7}); + } + + // Testing memmove with a simple non-overlap case. + { + Memory<8> src(1); + Memory<8> dst(1); + if (!__arm_sc_memmove(dst.ptr + 1, src.ptr, 6)) + abort(); + dst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); + } + + // Testing memmove with overlap pointers dst > src, dst < src.
+ { + Memory<8> srcdst(1); + if (!__arm_sc_memmove(srcdst.ptr + 1, srcdst.ptr, 6)) + abort(); + srcdst.assert_equal({0, 0, 1, 2, 3, 4, 5, 7}); + if (!__arm_sc_memmove(srcdst.ptr, srcdst.ptr + 1, 6)) + abort(); + srcdst.assert_equal({0, 1, 2, 3, 4, 5, 5, 7}); + } + + return 0; +} diff --git a/compiler-rt/test/lit.common.cfg.py b/compiler-rt/test/lit.common.cfg.py index 1753a55508c7c..113777b0ea8a1 100644 --- a/compiler-rt/test/lit.common.cfg.py +++ b/compiler-rt/test/lit.common.cfg.py @@ -454,6 +454,9 @@ def get_ios_commands_dir(): if config.has_lld: config.available_features.add("lld-available") +if config.aarch64_sme: + config.available_features.add("aarch64-sme-available") + if config.use_lld: config.available_features.add("lld") diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in index 7c2d53520099a..b93e20e80a6ed 100644 --- a/compiler-rt/test/lit.common.configured.in +++ b/compiler-rt/test/lit.common.configured.in @@ -50,6 +50,7 @@ set_default("gwp_asan", @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@) set_default("expensive_checks", @LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL@) set_default("test_standalone_build_libs", @COMPILER_RT_TEST_STANDALONE_BUILD_LIBS_PYBOOL@) set_default("has_compiler_rt_libatomic", @COMPILER_RT_BUILD_STANDALONE_LIBATOMIC_PYBOOL@) +set_default("aarch64_sme", @COMPILER_RT_HAS_AARCH64_SME@) # True iff the test suite supports ignoring the test compiler's runtime library path # and using `config.compiler_rt_libdir` instead. This only matters when the runtime # library paths differ. diff --git a/compiler-rt/unittests/lit.common.unit.configured.in b/compiler-rt/unittests/lit.common.unit.configured.in index 3e42e83c9e70a..23ec222697712 100644 --- a/compiler-rt/unittests/lit.common.unit.configured.in +++ b/compiler-rt/unittests/lit.common.unit.configured.in @@ -7,6 +7,7 @@ config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") config.compiler_rt_src_root = "@COMPILER_RT_SOURCE_DIR@" config.compiler_rt_libdir = lit_config.substitute("@COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR@") +config.aarch64_sme = @COMPILER_RT_HAS_AARCH64_SME@ config.enable_per_target_runtime_dir = @LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_PYBOOL@ config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@") config.host_arch = "@HOST_ARCH@" From a3696196949ad03de2db266eea257d28d1f35905 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 23 Jan 2024 11:09:59 +0000 Subject: [PATCH 596/843] Fix MSVC "result of 32-bit shift implicitly converted to 64 bits" warning. NFC. 
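
MSVC raises this warning (C4334) because the literal `1` is a 32-bit int: the
shift is evaluated at 32 bits and only its result is widened, so shift amounts
of 32 or more are lost. A minimal self-contained illustration (hand-written
sketch, not part of the change):

  #include <cstdint>
  bool bitSet(uint64_t NumOfVReg, uint32_t ShiftAmount) {
    // return NumOfVReg & (1 << ShiftAmount);  // 32-bit shift: wrong (UB) for ShiftAmount >= 32
    return NumOfVReg & (1LL << ShiftAmount);   // 64-bit shift: correct
  }

Switching the literal to `1LL` performs the shift at 64 bits, matching the
width of the value being tested.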
--- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 52a9667b44104..874d10b979b80 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -3141,7 +3141,7 @@ void RISCVInstrInfo::getVLENFactoredAmount(MachineFunction &MF, .setMIFlag(Flag); uint32_t PrevShiftAmount = 0; for (uint32_t ShiftAmount = 0; NumOfVReg >> ShiftAmount; ShiftAmount++) { - if (NumOfVReg & (1 << ShiftAmount)) { + if (NumOfVReg & (1LL << ShiftAmount)) { if (ShiftAmount) BuildMI(MBB, II, DL, get(RISCV::SLLI), DestReg) .addReg(DestReg, RegState::Kill) From 5c7bbe383bf2de0c1de36c7231bbd7f75bfccb1e Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 23 Jan 2024 11:29:56 +0000 Subject: [PATCH 597/843] [X86] canonicalizeShuffleWithOp - recognise constant vectors with getTargetConstantFromNode Allows shuffle to fold constant vectors that have already been lowered to constant pool - shuffle combining can then constant fold this. Noticed while triaging #79100 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 1 + llvm/test/CodeGen/X86/i64-to-float.ll | 53 +- llvm/test/CodeGen/X86/packus.ll | 60 +- llvm/test/CodeGen/X86/sext-vsetcc.ll | 6 +- llvm/test/CodeGen/X86/test-shrink-bug.ll | 2 +- .../vector-interleaved-load-i8-stride-4.ll | 1418 ++++---- .../vector-interleaved-load-i8-stride-6.ll | 3035 ++++++++--------- .../test/CodeGen/X86/vector-shift-ashr-128.ll | 2 +- .../test/CodeGen/X86/vector-shift-ashr-256.ll | 2 +- .../test/CodeGen/X86/vector-shift-lshr-128.ll | 2 +- .../test/CodeGen/X86/vector-shift-lshr-256.ll | 2 +- llvm/test/CodeGen/X86/vector-shift-shl-128.ll | 2 +- llvm/test/CodeGen/X86/vector-shift-shl-256.ll | 2 +- 13 files changed, 2267 insertions(+), 2320 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 974b6a176f0dc..e158312caffde 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -39638,6 +39638,7 @@ static SDValue canonicalizeShuffleWithOp(SDValue N, SelectionDAG &DAG, ISD::isBuildVectorAllZeros(Op.getNode()) || ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) || ISD::isBuildVectorOfConstantFPSDNodes(Op.getNode()) || + getTargetConstantFromNode(dyn_cast(Op)) || (Op.getOpcode() == ISD::INSERT_SUBVECTOR && Op->hasOneUse()) || (isTargetShuffle(Op.getOpcode()) && Op->hasOneUse()) || (FoldLoad && isShuffleFoldableLoad(Op)) || diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll index 9662542b71023..7f2fd4573de3b 100644 --- a/llvm/test/CodeGen/X86/i64-to-float.ll +++ b/llvm/test/CodeGen/X86/i64-to-float.ll @@ -257,35 +257,34 @@ define <4 x float> @mask_uitofp_4i64_4f32(<4 x i64> %a) nounwind { define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind { ; X86-SSE-LABEL: clamp_sitofp_2i64_2f64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,0,2147483648,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm2 -; X86-SSE-NEXT: pxor %xmm1, %xmm2 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [2147483393,4294967295,2147483393,4294967295] -; X86-SSE-NEXT: movdqa %xmm2, %xmm4 -; X86-SSE-NEXT: pcmpgtd %xmm3, %xmm4 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm4[0,0,2,2] -; X86-SSE-NEXT: pcmpeqd %xmm3, %xmm2 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,1,3,3] -; X86-SSE-NEXT: pand %xmm5, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[1,1,3,3] -; X86-SSE-NEXT: por %xmm3, %xmm2 
-; X86-SSE-NEXT: pand %xmm2, %xmm0 -; X86-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2 -; X86-SSE-NEXT: por %xmm0, %xmm2 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,0,2147483648,0] +; X86-SSE-NEXT: movdqa %xmm0, %xmm1 ; X86-SSE-NEXT: pxor %xmm2, %xmm1 -; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,0,2147483903,0] -; X86-SSE-NEXT: movdqa %xmm0, %xmm3 -; X86-SSE-NEXT: pcmpgtd %xmm1, %xmm3 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,0,2,2] -; X86-SSE-NEXT: pcmpeqd %xmm0, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,3,3] -; X86-SSE-NEXT: pand %xmm4, %xmm0 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[1,1,3,3] -; X86-SSE-NEXT: por %xmm0, %xmm1 -; X86-SSE-NEXT: pand %xmm1, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3] +; X86-SSE-NEXT: pcmpeqd %xmm4, %xmm4 +; X86-SSE-NEXT: pcmpeqd %xmm3, %xmm4 +; X86-SSE-NEXT: pcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,0,2,2] +; X86-SSE-NEXT: pand %xmm4, %xmm3 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] +; X86-SSE-NEXT: por %xmm3, %xmm1 +; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 -; X86-SSE-NEXT: por %xmm2, %xmm1 -; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] +; X86-SSE-NEXT: por %xmm0, %xmm1 +; X86-SSE-NEXT: pxor %xmm1, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,3,3] +; X86-SSE-NEXT: pxor %xmm3, %xmm3 +; X86-SSE-NEXT: pcmpeqd %xmm0, %xmm3 +; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [2147483903,0,2147483903,0] +; X86-SSE-NEXT: pcmpgtd %xmm2, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,2,2] +; X86-SSE-NEXT: pand %xmm3, %xmm2 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X86-SSE-NEXT: por %xmm2, %xmm0 +; X86-SSE-NEXT: pand %xmm0, %xmm1 +; X86-SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE-NEXT: por %xmm1, %xmm0 +; X86-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; X86-SSE-NEXT: cvtdq2pd %xmm0, %xmm0 ; X86-SSE-NEXT: retl ; diff --git a/llvm/test/CodeGen/X86/packus.ll b/llvm/test/CodeGen/X86/packus.ll index bbc3443db2b03..1b0af955ae824 100644 --- a/llvm/test/CodeGen/X86/packus.ll +++ b/llvm/test/CodeGen/X86/packus.ll @@ -118,45 +118,25 @@ define <8 x i16> @trunc_lshr_v8i32(<8 x i32> %a) nounwind { } define <8 x i16> @trunc_lshr_v4i64_demandedelts(<4 x i64> %a0) { -; X86-SSE2-LABEL: trunc_lshr_v4i64_demandedelts: -; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] -; X86-SSE2-NEXT: pand %xmm2, %xmm1 -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X86-SSE2-NEXT: pand %xmm2, %xmm0 -; X86-SSE2-NEXT: packuswb %xmm1, %xmm0 -; X86-SSE2-NEXT: retl -; -; X64-SSE2-LABEL: trunc_lshr_v4i64_demandedelts: -; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,18446744073709551615] -; X64-SSE2-NEXT: pand %xmm2, %xmm0 -; X64-SSE2-NEXT: pand %xmm2, %xmm1 -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-SSE2-NEXT: packuswb %xmm1, %xmm0 -; X64-SSE2-NEXT: retq -; -; X86-SSE4-LABEL: trunc_lshr_v4i64_demandedelts: -; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X86-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] -; X86-SSE4-NEXT: pand %xmm2, %xmm1 -; X86-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X86-SSE4-NEXT: pand %xmm2, %xmm0 -; X86-SSE4-NEXT: packusdw %xmm1, %xmm0 -; X86-SSE4-NEXT: retl +; SSE2-LABEL: trunc_lshr_v4i64_demandedelts: +; SSE2: # %bb.0: +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] 
+; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pand %xmm2, %xmm0 +; SSE2-NEXT: packuswb %xmm1, %xmm0 +; SSE2-NEXT: ret{{[l|q]}} ; -; X64-SSE4-LABEL: trunc_lshr_v4i64_demandedelts: -; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1,18446744073709551615] -; X64-SSE4-NEXT: pand %xmm2, %xmm0 -; X64-SSE4-NEXT: pand %xmm2, %xmm1 -; X64-SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; X64-SSE4-NEXT: packusdw %xmm1, %xmm0 -; X64-SSE4-NEXT: retq +; SSE4-LABEL: trunc_lshr_v4i64_demandedelts: +; SSE4: # %bb.0: +; SSE4-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE4-NEXT: movdqa {{.*#+}} xmm2 = [1,1,1,1] +; SSE4-NEXT: pand %xmm2, %xmm1 +; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE4-NEXT: pand %xmm2, %xmm0 +; SSE4-NEXT: packusdw %xmm1, %xmm0 +; SSE4-NEXT: ret{{[l|q]}} ; ; X86-AVX1-LABEL: trunc_lshr_v4i64_demandedelts: ; X86-AVX1: # %bb.0: @@ -467,4 +447,8 @@ define <32 x i8> @packuswb_icmp_zero_trunc_256(<16 x i16> %a0) { } ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; X64-AVX2: {{.*}} +; X64-SSE2: {{.*}} +; X64-SSE4: {{.*}} ; X86-AVX2: {{.*}} +; X86-SSE2: {{.*}} +; X86-SSE4: {{.*}} diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll index 65e3c1f0633d7..acf123bde41f0 100644 --- a/llvm/test/CodeGen/X86/sext-vsetcc.ll +++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll @@ -283,10 +283,10 @@ define <4 x i32> @cmp_slt_load_const(ptr %x) nounwind { ; SSE-LABEL: cmp_slt_load_const: ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; SSE-NEXT: movdqa {{.*#+}} xmm0 = <42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [10794,10794,54998,54998,0,0,65535,65535] ; SSE-NEXT: pcmpgtb %xmm1, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] ; SSE-NEXT: retq ; ; AVX-LABEL: cmp_slt_load_const: diff --git a/llvm/test/CodeGen/X86/test-shrink-bug.ll b/llvm/test/CodeGen/X86/test-shrink-bug.ll index 51a00d211421b..74c5179b18789 100644 --- a/llvm/test/CodeGen/X86/test-shrink-bug.ll +++ b/llvm/test/CodeGen/X86/test-shrink-bug.ll @@ -67,8 +67,8 @@ define dso_local void @fail(i16 %a, <2 x i8> %b) { ; CHECK-X64-NEXT: testl $263, %edi # imm = 0x107 ; CHECK-X64-NEXT: je .LBB1_3 ; CHECK-X64-NEXT: # %bb.1: +; CHECK-X64-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-X64-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-X64-NEXT: pslld $8, %xmm0 ; CHECK-X64-NEXT: pextrw $1, %xmm0, %eax ; CHECK-X64-NEXT: testb $1, %al ; CHECK-X64-NEXT: jne .LBB1_3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index 3d0b2aeae5b30..e2311246e711f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -156,64 +156,62 @@ define void @load_i8_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf8: ; SSE: # %bb.0: -; SSE-NEXT: movdqa (%rdi), %xmm1 +; 
SSE-NEXT: movdqa (%rdi), %xmm3 ; SSE-NEXT: movdqa 16(%rdi), %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: packuswb %xmm4, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pxor %xmm7, %xmm7 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm7[8],xmm2[9],xmm7[9],xmm2[10],xmm7[10],xmm2[11],xmm7[11],xmm2[12],xmm7[12],xmm2[13],xmm7[13],xmm2[14],xmm7[14],xmm2[15],xmm7[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm3[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1] -; SSE-NEXT: movdqa %xmm1, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm7[8],xmm5[9],xmm7[9],xmm5[10],xmm7[10],xmm5[11],xmm7[11],xmm5[12],xmm7[12],xmm5[13],xmm7[13],xmm5[14],xmm7[14],xmm5[15],xmm7[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3],xmm6[4],xmm7[4],xmm6[5],xmm7[5],xmm6[6],xmm7[6],xmm6[7],xmm7[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1] -; SSE-NEXT: packuswb %xmm8, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,2,3] -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm8, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm8, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pxor %xmm6, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm6[8],xmm1[9],xmm6[9],xmm1[10],xmm6[10],xmm1[11],xmm6[11],xmm1[12],xmm6[12],xmm1[13],xmm6[13],xmm1[14],xmm6[14],xmm1[15],xmm6[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm6[0],xmm2[1],xmm6[1],xmm2[2],xmm6[2],xmm2[3],xmm6[3],xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] +; SSE-NEXT: movdqa %xmm3, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm6[8],xmm3[9],xmm6[9],xmm3[10],xmm6[10],xmm3[11],xmm6[11],xmm3[12],xmm6[12],xmm3[13],xmm6[13],xmm3[14],xmm6[14],xmm3[15],xmm6[15] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm3[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3],xmm4[4],xmm6[4],xmm4[5],xmm6[5],xmm4[6],xmm6[6],xmm4[7],xmm6[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1] +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm5, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm7[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] -; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[0,3,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,3,2,3] ; SSE-NEXT: movq %xmm0, (%rsi) -; SSE-NEXT: movq %xmm7, (%rdx) -; SSE-NEXT: movq %xmm1, (%rcx) -; SSE-NEXT: movq %xmm2, (%r8) +; SSE-NEXT: movq %xmm6, (%rdx) +; SSE-NEXT: movq %xmm5, (%rcx) +; SSE-NEXT: movq %xmm1, (%r8) ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf8: @@ -296,111 +294,111 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-LABEL: load_i8_stride4_vf16: ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm1 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 -; SSE-NEXT: movdqa 48(%rdi), %xmm13 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: movdqa %xmm2, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: pxor %xmm10, %xmm10 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm13, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] -; SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm10[8],xmm5[9],xmm10[9],xmm5[10],xmm10[10],xmm5[11],xmm10[11],xmm5[12],xmm10[12],xmm5[13],xmm10[13],xmm5[14],xmm10[14],xmm5[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm6[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm10[0],xmm6[1],xmm10[1],xmm6[2],xmm10[2],xmm6[3],xmm10[3],xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm11[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm9[0],xmm14[1],xmm9[1] -; SSE-NEXT: packuswb %xmm7, %xmm14 -; SSE-NEXT: movdqa %xmm2, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm10[8],xmm7[9],xmm10[9],xmm7[10],xmm10[10],xmm7[11],xmm10[11],xmm7[12],xmm10[12],xmm7[13],xmm10[13],xmm7[14],xmm10[14],xmm7[15],xmm10[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm7[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm12[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3],xmm12[4],xmm10[4],xmm12[5],xmm10[5],xmm12[6],xmm10[6],xmm12[7],xmm10[7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm1[0],xmm10[1],xmm1[1] -; SSE-NEXT: packuswb %xmm15, %xmm10 -; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm14[0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm1, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd 
{{.*#+}} xmm8 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm13, %xmm8 +; SSE-NEXT: movdqa 16(%rdi), %xmm3 +; SSE-NEXT: movdqa 32(%rdi), %xmm9 +; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm10, %xmm7 +; SSE-NEXT: pand %xmm2, %xmm7 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: movdqa %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pand %xmm2, %xmm11 +; SSE-NEXT: packuswb %xmm7, %xmm11 +; SSE-NEXT: movdqa %xmm3, %xmm7 +; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: movdqa %xmm3, %xmm12 +; SSE-NEXT: pand %xmm2, %xmm12 ; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: pand %xmm1, %xmm13 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm12, %xmm2 +; SSE-NEXT: packuswb %xmm11, %xmm2 +; SSE-NEXT: pxor %xmm11, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm11[8],xmm0[9],xmm11[9],xmm0[10],xmm11[10],xmm0[11],xmm11[11],xmm0[12],xmm11[12],xmm0[13],xmm11[13],xmm0[14],xmm11[14],xmm0[15],xmm11[15] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3],xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm11[8],xmm5[9],xmm11[9],xmm5[10],xmm11[10],xmm5[11],xmm11[11],xmm5[12],xmm11[12],xmm5[13],xmm11[13],xmm5[14],xmm11[14],xmm5[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm11[0],xmm6[1],xmm11[1],xmm6[2],xmm11[2],xmm6[3],xmm11[3],xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm12[0],xmm14[1],xmm12[1] +; SSE-NEXT: packuswb %xmm13, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm13[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm12[0],xmm15[1],xmm12[1] +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = 
xmm13[8],xmm11[8],xmm13[9],xmm11[9],xmm13[10],xmm11[10],xmm13[11],xmm11[11],xmm13[12],xmm11[12],xmm13[13],xmm11[13],xmm13[14],xmm11[14],xmm13[15],xmm11[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm11[0],xmm12[1],xmm11[1],xmm12[2],xmm11[2],xmm12[3],xmm11[3],xmm12[4],xmm11[4],xmm12[5],xmm11[5],xmm12[6],xmm11[6],xmm12[7],xmm11[7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm13[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1] +; SSE-NEXT: packuswb %xmm15, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm14[0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm9 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm8[0,3] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm5[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm6[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] ; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm9[3,1,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; SSE-NEXT: 
pshufd {{.*#+}} xmm2 = xmm11[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] ; SSE-NEXT: packuswb %xmm3, %xmm5 ; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm4[0,3] -; SSE-NEXT: movdqa %xmm0, (%rsi) -; SSE-NEXT: movaps %xmm10, (%rdx) +; SSE-NEXT: movdqa %xmm2, (%rsi) +; SSE-NEXT: movaps %xmm11, (%rdx) ; SSE-NEXT: movaps %xmm1, (%rcx) ; SSE-NEXT: movaps %xmm5, (%r8) ; SSE-NEXT: retq @@ -530,249 +528,247 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE: # %bb.0: ; SSE-NEXT: subq $136, %rsp ; SSE-NEXT: movdqa 64(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 80(%rdi), %xmm5 -; SSE-NEXT: movdqa 96(%rdi), %xmm15 -; SSE-NEXT: movdqa 112(%rdi), %xmm14 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa 16(%rdi), %xmm11 -; SSE-NEXT: movdqa 32(%rdi), %xmm7 -; SSE-NEXT: movdqa 48(%rdi), %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,0,255,0,255,0,255,0] -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm2 -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa 80(%rdi), %xmm9 +; SSE-NEXT: movdqa 96(%rdi), %xmm11 +; SSE-NEXT: movdqa 112(%rdi), %xmm12 +; SSE-NEXT: movdqa (%rdi), %xmm13 +; SSE-NEXT: movdqa 16(%rdi), %xmm14 +; SSE-NEXT: movdqa 32(%rdi), %xmm2 +; SSE-NEXT: movdqa 48(%rdi), %xmm10 +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa %xmm10, %xmm3 +; SSE-NEXT: movdqa %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm8, %xmm1 +; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm6, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm7 +; SSE-NEXT: pand %xmm8, %xmm7 +; SSE-NEXT: packuswb %xmm0, %xmm7 +; SSE-NEXT: packuswb %xmm1, %xmm7 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm2 -; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: packuswb %xmm0, %xmm6 -; SSE-NEXT: packuswb %xmm1, %xmm6 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: pand 
%xmm4, %xmm8 +; SSE-NEXT: packuswb %xmm0, %xmm8 +; SSE-NEXT: packuswb %xmm1, %xmm8 +; SSE-NEXT: pxor %xmm2, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1] -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: packuswb %xmm5, %xmm8 -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: packuswb %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm1 
-; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm5[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm13, %xmm14 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm5[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1] -; SSE-NEXT: packuswb %xmm9, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm8[0,3] -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm14[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; SSE-NEXT: packuswb %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3] +; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm15 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = 
xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm10[0],xmm0[1],xmm10[1] -; SSE-NEXT: packuswb %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload -; SSE-NEXT: movdqa %xmm5, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] -; SSE-NEXT: movdqa %xmm5, %xmm8 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm8[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm2[0],xmm15[1],xmm2[1],xmm15[2],xmm2[2],xmm15[3],xmm2[3],xmm15[4],xmm2[4],xmm15[5],xmm2[5],xmm15[6],xmm2[6],xmm15[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm15[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm13[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = 
xmm12[0],xmm2[0],xmm12[1],xmm2[1],xmm12[2],xmm2[2],xmm12[3],xmm2[3],xmm12[4],xmm2[4],xmm12[5],xmm2[5],xmm12[6],xmm2[6],xmm12[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm9[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa %xmm4, %xmm5 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1] ; SSE-NEXT: packuswb %xmm1, %xmm4 ; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,3],xmm0[0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: # xmm10 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = 
xmm10[0,3],xmm1[0,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm1[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm2[0,3] -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm15[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm2[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm7[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = 
mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: packuswb %xmm2, %xmm7 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm1, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm2[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: pshufd $231, (%rsp), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: # xmm2 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1] -; SSE-NEXT: packuswb %xmm7, %xmm11 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: # xmm2 = mem[3,1,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd $231, (%rsp), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; 
SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm14 = xmm14[0],xmm1[0],xmm14[1],xmm1[1] +; SSE-NEXT: packuswb %xmm2, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm3[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; SSE-NEXT: # xmm1 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm11[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm2[0],xmm8[1],xmm2[1] -; SSE-NEXT: packuswb %xmm7, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm11[0,3] -; SSE-NEXT: movdqa %xmm6, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: packuswb %xmm2, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm3[0,3] +; SSE-NEXT: movdqa %xmm8, 16(%rsi) +; SSE-NEXT: movdqa %xmm7, (%rsi) ; SSE-NEXT: movaps %xmm4, 16(%rdx) -; SSE-NEXT: movaps %xmm13, (%rdx) -; SSE-NEXT: movaps %xmm3, 16(%rcx) -; SSE-NEXT: movaps %xmm1, (%rcx) -; SSE-NEXT: movaps %xmm8, 16(%r8) -; SSE-NEXT: movaps %xmm0, (%r8) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movaps %xmm1, (%rdx) +; SSE-NEXT: movaps %xmm0, 16(%rcx) +; SSE-NEXT: movaps %xmm10, (%rcx) +; SSE-NEXT: movaps %xmm5, 16(%r8) +; SSE-NEXT: movaps %xmm14, (%r8) ; SSE-NEXT: addq $136, %rsp ; SSE-NEXT: retq ; @@ -1023,522 +1019,518 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3) nounwind { ; SSE-LABEL: load_i8_stride4_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $664, %rsp # imm = 0x298 +; SSE-NEXT: subq $600, %rsp # imm = 0x258 ; SSE-NEXT: movdqa 16(%rdi), %xmm8 ; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 32(%rdi), %xmm6 +; SSE-NEXT: movdqa 32(%rdi), %xmm15 ; SSE-NEXT: movdqa 48(%rdi), %xmm14 -; SSE-NEXT: movdqa 128(%rdi), %xmm15 -; SSE-NEXT: movdqa 
144(%rdi), %xmm10 -; SSE-NEXT: movdqa 160(%rdi), %xmm11 -; SSE-NEXT: movdqa 176(%rdi), %xmm3 -; SSE-NEXT: movdqa 64(%rdi), %xmm13 -; SSE-NEXT: movdqa 80(%rdi), %xmm7 +; SSE-NEXT: movdqa 128(%rdi), %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm7 +; SSE-NEXT: movdqa 160(%rdi), %xmm10 +; SSE-NEXT: movdqa 176(%rdi), %xmm12 +; SSE-NEXT: movdqa 64(%rdi), %xmm6 +; SSE-NEXT: movdqa 80(%rdi), %xmm13 ; SSE-NEXT: movdqa 96(%rdi), %xmm2 ; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,0,255,0,255,0,255,0] +; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,0,255,0,255,0,255,0] ; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa (%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: movdqa (%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm2 ; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: packuswb %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 240(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: movdqa 240(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm9, %xmm1 ; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa 208(%rdi), %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: movdqa 192(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm2, %xmm5 -; SSE-NEXT: packuswb %xmm0, %xmm5 -; SSE-NEXT: packuswb %xmm1, %xmm5 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: 
movdqa %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pand %xmm2, %xmm9 +; SSE-NEXT: packuswb %xmm0, %xmm9 +; SSE-NEXT: packuswb %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm0 +; SSE-NEXT: pxor %xmm8, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 
= xmm5[0],xmm0[0],xmm5[1],xmm0[1] +; SSE-NEXT: packuswb %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: movdqa %xmm13, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] +; SSE-NEXT: packuswb %xmm3, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm5[0,3] ; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = 
xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm2[0],xmm11[1],xmm2[1],xmm11[2],xmm2[2],xmm11[3],xmm2[3],xmm11[4],xmm2[4],xmm11[5],xmm2[5],xmm11[6],xmm2[6],xmm11[7],xmm2[7] -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm11[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1] +; SSE-NEXT: packuswb %xmm3, %xmm6 +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; 
SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] +; SSE-NEXT: movdqa %xmm4, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm8[0],xmm1[1],xmm8[1],xmm1[2],xmm8[2],xmm1[3],xmm8[3],xmm1[4],xmm8[4],xmm1[5],xmm8[5],xmm1[6],xmm8[6],xmm1[7],xmm8[7] ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm9[0,3] -; SSE-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm1[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1] +; SSE-NEXT: packuswb %xmm3, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm10 = xmm10[0,3],xmm6[0,3] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa 
%xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm8[0],xmm14[1],xmm8[1],xmm14[2],xmm8[2],xmm14[3],xmm8[3],xmm14[4],xmm8[4],xmm14[5],xmm8[5],xmm14[6],xmm8[6],xmm14[7],xmm8[7] +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm14[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] +; SSE-NEXT: movdqa %xmm15, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm9[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm13[0],xmm0[1],xmm13[1] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm2[8],xmm3[9],xmm2[9],xmm3[10],xmm2[10],xmm3[11],xmm2[11],xmm3[12],xmm2[12],xmm3[13],xmm2[13],xmm3[14],xmm2[14],xmm3[15],xmm2[15] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm4[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm3[0],xmm15[1],xmm3[1] -; SSE-NEXT: packuswb %xmm1, 
%xmm15 -; SSE-NEXT: shufps {{.*#+}} xmm15 = xmm15[0,3],xmm0[0,3] -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm0[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: movdqa %xmm15, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm8[0],xmm12[1],xmm8[1],xmm12[2],xmm8[2],xmm12[3],xmm8[3],xmm12[4],xmm8[4],xmm12[5],xmm8[5],xmm12[6],xmm8[6],xmm12[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm12[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1] +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm8[8],xmm13[9],xmm8[9],xmm13[10],xmm8[10],xmm13[11],xmm8[11],xmm13[12],xmm8[12],xmm13[13],xmm8[13],xmm13[14],xmm8[14],xmm13[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm8[0],xmm15[1],xmm8[1],xmm15[2],xmm8[2],xmm15[3],xmm8[3],xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm15[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; SSE-NEXT: movdqa %xmm11, %xmm7 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm8[0],xmm6[1],xmm8[1],xmm6[2],xmm8[2],xmm6[3],xmm8[3],xmm6[4],xmm8[4],xmm6[5],xmm8[5],xmm6[6],xmm8[6],xmm6[7],xmm8[7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: packuswb %xmm1, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,3],xmm0[0,3] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm8[0],xmm3[1],xmm8[1],xmm3[2],xmm8[2],xmm3[3],xmm8[3],xmm3[4],xmm8[4],xmm3[5],xmm8[5],xmm3[6],xmm8[6],xmm3[7],xmm8[7] +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm3[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm10, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm8[0],xmm4[1],xmm8[1],xmm4[2],xmm8[2],xmm4[3],xmm8[3],xmm4[4],xmm8[4],xmm4[5],xmm8[5],xmm4[6],xmm8[6],xmm4[7],xmm8[7] +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,3,2,3,4,5,6,7] ; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1] ; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm8[8],xmm0[9],xmm8[9],xmm0[10],xmm8[10],xmm0[11],xmm8[11],xmm0[12],xmm8[12],xmm0[13],xmm8[13],xmm0[14],xmm8[14],xmm0[15],xmm8[15] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm0[0,1,1,3,4,5,6,7] -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3],xmm4[4],xmm2[4],xmm4[5],xmm2[5],xmm4[6],xmm2[6],xmm4[7],xmm2[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = 
xmm1[0,1,1,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm8[8],xmm4[9],xmm8[9],xmm4[10],xmm8[10],xmm4[11],xmm8[11],xmm4[12],xmm8[12],xmm4[13],xmm8[13],xmm4[14],xmm8[14],xmm4[15],xmm8[15] ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[1,3,2,3,4,5,6,7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3],xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm1[1,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; SSE-NEXT: packuswb %xmm2, %xmm1 ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,3],xmm3[0,3] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: # xmm0 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm12 -; SSE-NEXT: shufps {{.*#+}} xmm12 = xmm12[0,3],xmm3[0,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: # xmm2 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm11[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,6,5,4] -; 
SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm2[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa (%rsp), %xmm2 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm2[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm3[0,3] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm8, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,3],xmm4[0,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] ; SSE-NEXT: packuswb %xmm3, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,7,6,5,4] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm11 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[0,3],xmm4[0,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = 
mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: pshuflw $231, (%rsp), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = mem[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,3],xmm4[0,3] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm8 -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,6,5,4] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm11 -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,6,5,4] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm8[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm14 +; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[0,3],xmm4[0,3] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: # xmm4 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,6,5,4] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm4[1,0,3,2,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm7[0,3] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm4 +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload +; SSE-NEXT: # xmm8 = 
mem[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm0[0,1,2,3,7,6,5,4] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm4[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1] -; SSE-NEXT: packuswb %xmm4, %xmm7 -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm12[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: packuswb %xmm4, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm13[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm15[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: # xmm0 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm8[0],xmm0[1],xmm8[1] -; SSE-NEXT: packuswb %xmm4, %xmm0 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,3],xmm7[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm7[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: packuswb %xmm4, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,3],xmm8[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = 
xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: packuswb %xmm7, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: packuswb %xmm4, %xmm8 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: # xmm7 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm7[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload ; SSE-NEXT: # xmm7 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1] -; SSE-NEXT: packuswb %xmm10, %xmm7 +; SSE-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1] +; SSE-NEXT: packuswb %xmm4, %xmm7 ; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,3],xmm8[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,3,1,4,5,6,7] -; 
SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: packuswb %xmm8, %xmm10 +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: packuswb %xmm4, %xmm12 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: # xmm8 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm8[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload ; SSE-NEXT: # xmm8 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1] -; SSE-NEXT: packuswb %xmm11, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm10[0,3] +; SSE-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1] +; SSE-NEXT: packuswb %xmm4, %xmm8 +; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[0,3],xmm12[0,3] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: # xmm11 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1] -; SSE-NEXT: packuswb %xmm10, %xmm11 +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; 
SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: # xmm12 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1] +; SSE-NEXT: packuswb %xmm4, %xmm12 +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,1,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload ; SSE-NEXT: # xmm4 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,3,1,4,5,6,7] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: # xmm10 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,3,1,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1] -; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: # xmm4 = mem[3,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: # xmm3 = mem[3,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: # xmm13 = mem[3,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] -; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1] -; SSE-NEXT: packuswb %xmm10, %xmm13 -; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm11[0,3] -; SSE-NEXT: movdqa %xmm5, 48(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 32(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rsi) +; SSE-NEXT: punpckldq {{.*#+}} xmm13 = xmm13[0],xmm3[0],xmm13[1],xmm3[1] +; SSE-NEXT: packuswb %xmm4, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm13 = xmm13[0,3],xmm12[0,3] +; SSE-NEXT: movdqa %xmm9, 48(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, (%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 32(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movaps %xmm3, 16(%rsi) ; SSE-NEXT: movaps %xmm1, 48(%rdx) -; SSE-NEXT: movaps %xmm15, (%rdx) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 32(%rdx) +; SSE-NEXT: movaps %xmm5, (%rdx) +; SSE-NEXT: movaps %xmm10, 32(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) -; SSE-NEXT: movaps %xmm6, 48(%rcx) -; SSE-NEXT: movaps %xmm3, 32(%rcx) -; SSE-NEXT: movaps %xmm2, 16(%rcx) -; SSE-NEXT: movaps %xmm12, (%rcx) +; SSE-NEXT: movaps %xmm0, 48(%rcx) +; SSE-NEXT: movaps %xmm14, 32(%rcx) +; SSE-NEXT: movaps %xmm11, 16(%rcx) +; SSE-NEXT: movaps %xmm2, (%rcx) ; SSE-NEXT: movaps %xmm13, 48(%r8) ; SSE-NEXT: movaps %xmm8, 32(%r8) ; SSE-NEXT: movaps %xmm7, 16(%r8) -; SSE-NEXT: movaps %xmm0, (%r8) -; SSE-NEXT: addq $664, %rsp # imm = 0x298 +; SSE-NEXT: movaps %xmm6, (%r8) +; SSE-NEXT: addq $600, %rsp # imm = 0x258 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride4_vf64: diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index e21e37766451a..16808dca4511d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -104,9 +104,9 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [16711935,16711935,16711935,16711935] +; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] @@ -130,10 +130,9 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: punpckhdq {{.*#+}} xmm7 = xmm7[2],xmm9[2],xmm7[3],xmm9[3] ; SSE-NEXT: packuswb %xmm7, %xmm7 ; SSE-NEXT: por %xmm7, %xmm4 -; SSE-NEXT: movaps %xmm6, %xmm7 -; SSE-NEXT: andps %xmm2, %xmm7 -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm6[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm7 ; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[1,2,3,0,4,5,6,7] ; SSE-NEXT: packuswb %xmm7, %xmm7 @@ -145,8 +144,8 @@ define void @load_i8_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] ; SSE-NEXT: packuswb %xmm6, %xmm6 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[3,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[3,1,2,0] +; SSE-NEXT: pand %xmm2, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,0,3,4,5,6,7] @@ -225,10 +224,9 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm3, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [16711935,16711935,16711935,16711935] ; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] @@ -274,10 +272,9 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: pand %xmm11, %xmm12 ; SSE-NEXT: por %xmm9, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: pand %xmm5, %xmm9 -; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm12[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm5, %xmm9 ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm9[0,1,2,3,5,5,5,5] @@ -318,8 +315,8 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm11, %xmm3 ; SSE-NEXT: pandn %xmm4, %xmm11 ; SSE-NEXT: por %xmm3, %xmm11 -; 
SSE-NEXT: pand %xmm11, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm5[3,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[3,1,2,0] +; SSE-NEXT: pand %xmm5, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm3[2,1,0,3,4,5,6,7] @@ -470,79 +467,81 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa 64(%rdi), %xmm11 -; SSE-NEXT: movdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 64(%rdi), %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rdi), %xmm5 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 -; SSE-NEXT: movdqa 32(%rdi), %xmm8 +; SSE-NEXT: movdqa 16(%rdi), %xmm1 +; SSE-NEXT: movdqa 32(%rdi), %xmm7 ; SSE-NEXT: movdqa 48(%rdi), %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm8, %xmm7 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm7, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm8, %xmm9 ; SSE-NEXT: pandn %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: pand %xmm4, %xmm11 -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm11 +; SSE-NEXT: pandn %xmm1, %xmm11 +; SSE-NEXT: pand %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm5, %xmm14 ; SSE-NEXT: pand %xmm4, %xmm14 -; SSE-NEXT: movdqa 80(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm13 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 80(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, %xmm13 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pand %xmm4, %xmm8 +; 
SSE-NEXT: movdqa %xmm7, %xmm15 +; SSE-NEXT: pand %xmm4, %xmm7 ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm12 ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm5, %xmm4 ; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm10, %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm10 -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,4,7,6,7] +; SSE-NEXT: por %xmm11, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm5[0,2,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm11 +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: por %xmm9, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm12 -; SSE-NEXT: por %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pand %xmm7, %xmm9 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[3,1,2,0] +; SSE-NEXT: pandn %xmm3, %xmm12 +; SSE-NEXT: por %xmm12, %xmm10 +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm10[3,1,2,0] +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm9, %xmm9 -; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm9, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm12 +; SSE-NEXT: pandn %xmm9, %xmm12 +; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: por %xmm0, %xmm12 ; SSE-NEXT: pxor %xmm9, %xmm9 ; SSE-NEXT: movdqa %xmm6, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] @@ -568,15 +567,14 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm8, %xmm6 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: por %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[3,1,2,3,4,5,6,7] +; 
SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,0,65535,65535,0,65535] @@ -584,14 +582,14 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: por %xmm1, %xmm5 ; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm12 -; SSE-NEXT: pandn %xmm0, %xmm12 -; SSE-NEXT: pand %xmm10, %xmm6 -; SSE-NEXT: por %xmm6, %xmm12 +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: pand %xmm11, %xmm6 +; SSE-NEXT: por %xmm6, %xmm10 ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm15 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -599,11 +597,11 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: por %xmm3, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm14[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm3, %xmm8 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] @@ -613,117 +611,118 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm1, %xmm5 ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: por %xmm2, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm13[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pand %xmm8, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: pandn %xmm0, %xmm8 +; SSE-NEXT: pand %xmm11, %xmm5 +; SSE-NEXT: por %xmm5, %xmm8 ; SSE-NEXT: movdqa %xmm15, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = 
xmm15[8],xmm9[8],xmm15[9],xmm9[9],xmm15[10],xmm9[10],xmm15[11],xmm9[11],xmm15[12],xmm9[12],xmm15[13],xmm9[13],xmm15[14],xmm9[14],xmm15[15],xmm9[15] -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm2[0,2] +; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm15[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm15[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm14, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: pand %xmm6, %xmm5 -; SSE-NEXT: pandn %xmm0, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,65535,65535,0,65535,65535,65535,65535] +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm5 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] ; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm9[0],xmm13[1],xmm9[1],xmm13[2],xmm9[2],xmm13[3],xmm9[3],xmm13[4],xmm9[4],xmm13[5],xmm9[5],xmm13[6],xmm9[6],xmm13[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm5, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 -; SSE-NEXT: pand %xmm10, %xmm3 -; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm10 -; SSE-NEXT: por %xmm3, %xmm10 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: 
pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,7] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm11 +; SSE-NEXT: por %xmm3, %xmm11 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0] +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: movdqa %xmm6, %xmm1 +; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: por %xmm11, %xmm13 -; SSE-NEXT: pand %xmm13, %xmm7 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm7[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm1, %xmm13 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,1,3] +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3],xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; 
SSE-NEXT: movdqa %xmm8, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm8 = xmm8[2,0],xmm2[2,3] -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm5 = xmm5[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm8[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm5, %xmm7 +; SSE-NEXT: pandn %xmm1, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm9[8],xmm1[9],xmm9[9],xmm1[10],xmm9[10],xmm1[11],xmm9[11],xmm1[12],xmm9[12],xmm1[13],xmm9[13],xmm1[14],xmm9[14],xmm1[15],xmm9[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm9[0],xmm7[1],xmm9[1],xmm7[2],xmm9[2],xmm7[3],xmm9[3],xmm7[4],xmm9[4],xmm7[5],xmm9[5],xmm7[6],xmm9[6],xmm7[7],xmm9[7] +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm1[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm1[2,3] +; SSE-NEXT: psrlq $48, %xmm1 +; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm2, %xmm1 ; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm9[8],xmm2[9],xmm9[9],xmm2[10],xmm9[10],xmm2[11],xmm9[11],xmm2[12],xmm9[12],xmm2[13],xmm9[13],xmm2[14],xmm9[14],xmm2[15],xmm9[15] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] @@ -738,29 +737,28 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: por %xmm4, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: pand %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm3 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm9[0],xmm3[1],xmm9[1],xmm3[2],xmm9[2],xmm3[3],xmm9[3],xmm3[4],xmm9[4],xmm3[5],xmm9[5],xmm3[6],xmm9[6],xmm3[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,1,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] ; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: pand %xmm4, %xmm3 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm5, %xmm4 -; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm13[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw 
{{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm4, %xmm3 +; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: packuswb %xmm4, %xmm3 -; SSE-NEXT: pandn %xmm3, %xmm0 +; SSE-NEXT: packuswb %xmm3, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: movaps %xmm2, (%rsi) -; SSE-NEXT: movdqa %xmm12, (%rdx) -; SSE-NEXT: movdqa %xmm1, (%rcx) -; SSE-NEXT: movdqa %xmm10, (%r8) +; SSE-NEXT: movdqa %xmm12, (%rsi) +; SSE-NEXT: movdqa %xmm10, (%rdx) +; SSE-NEXT: movdqa %xmm8, (%rcx) +; SSE-NEXT: movdqa %xmm11, (%r8) ; SSE-NEXT: movdqa %xmm6, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, (%rax) @@ -1076,27 +1074,27 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf32: ; SSE: # %bb.0: -; SSE-NEXT: subq $280, %rsp # imm = 0x118 -; SSE-NEXT: movdqa 64(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: subq $264, %rsp # imm = 0x108 +; SSE-NEXT: movdqa 64(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm9 -; SSE-NEXT: movdqa (%rdi), %xmm14 -; SSE-NEXT: movdqa 16(%rdi), %xmm2 +; SSE-NEXT: movdqa (%rdi), %xmm12 +; SSE-NEXT: movdqa 16(%rdi), %xmm14 ; SSE-NEXT: movdqa 32(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm10, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm5, %xmm15 -; SSE-NEXT: pand %xmm7, %xmm15 +; SSE-NEXT: pand %xmm10, %xmm15 ; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm15, %xmm0 @@ -1107,238 +1105,237 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: pand %xmm12, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm11, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: pand %xmm11, %xmm8 +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm8[0,2,1,3] ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = 
xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,1,2,0] ; SSE-NEXT: pand %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm8 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm1 -; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 128(%rdi), %xmm13 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pandn %xmm13, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm7, %xmm6 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] -; SSE-NEXT: packuswb %xmm3, %xmm0 -; SSE-NEXT: movdqa %xmm4, %xmm8 -; SSE-NEXT: pandn %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 112(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa 160(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm14 -; SSE-NEXT: pandn %xmm9, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 96(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: pand %xmm0, %xmm5 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm12 -; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte 
Reload -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pand %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm3, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: pand %xmm0, %xmm1 +; SSE-NEXT: movdqa 128(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 +; SSE-NEXT: movdqa 144(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: movdqa %xmm2, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,3,2,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,7] -; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4 -; SSE-NEXT: por %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: por %xmm0, %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, %xmm0 +; SSE-NEXT: pand %xmm6, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] +; SSE-NEXT: packuswb %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pandn %xmm12, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 112(%rdi), %xmm0 +; SSE-NEXT: movdqa %xmm11, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: movdqa 160(%rdi), %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm5 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm9, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm11 +; SSE-NEXT: pand %xmm10, %xmm11 +; SSE-NEXT: movdqa %xmm10, %xmm4 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm13 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: movdqa 
%xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm10 +; SSE-NEXT: movdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm1 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm14 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pxor %xmm9, %xmm9 -; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill +; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: por %xmm3, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[0,2,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,3,2,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm4, %xmm14 +; SSE-NEXT: por %xmm14, %xmm5 +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[3,1,2,0] +; SSE-NEXT: pand %xmm0, %xmm4 +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,7,6,5] +; SSE-NEXT: packuswb %xmm4, %xmm4 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm3, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm15, %xmm3 +; SSE-NEXT: 
punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3],xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm3[0],xmm14[1],xmm3[1],xmm14[2],xmm3[2],xmm14[3],xmm3[3] +; SSE-NEXT: psrld $16, %xmm3 ; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: packuswb %xmm15, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm5 -; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm9[8],xmm5[9],xmm9[9],xmm5[10],xmm9[10],xmm5[11],xmm9[11],xmm5[12],xmm9[12],xmm5[13],xmm9[13],xmm5[14],xmm9[14],xmm5[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm3[2],xmm15[3],xmm3[3] +; SSE-NEXT: packuswb %xmm15, %xmm14 +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: movdqa %xmm6, %xmm3 +; SSE-NEXT: pandn %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm14 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm4[8],xmm14[9],xmm4[9],xmm14[10],xmm4[10],xmm14[11],xmm4[11],xmm14[12],xmm4[12],xmm14[13],xmm4[13],xmm14[14],xmm4[14],xmm14[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[1,1,1,1,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7] ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [65535,65535,0,65535,0,0,65535,65535] ; SSE-NEXT: movdqa %xmm15, %xmm0 -; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3],xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm11[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm5 -; SSE-NEXT: por %xmm0, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: pand %xmm4, %xmm5 -; SSE-NEXT: por %xmm1, %xmm5 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm8[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm14 +; SSE-NEXT: por %xmm0, %xmm14 +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm6, %xmm14 +; SSE-NEXT: por %xmm3, %xmm14 +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] 
; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,65535,65535,0,65535] -; SSE-NEXT: movdqa %xmm8, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm9[8],xmm10[9],xmm9[9],xmm10[10],xmm9[10],xmm10[11],xmm9[11],xmm10[12],xmm9[12],xmm10[13],xmm9[13],xmm10[14],xmm9[14],xmm10[15],xmm9[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm10[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,4] ; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm1, %xmm0 +; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm5, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm9[0],xmm6[1],xmm9[1],xmm6[2],xmm9[2],xmm6[3],xmm9[3],xmm6[4],xmm9[4],xmm6[5],xmm9[5],xmm6[6],xmm9[6],xmm6[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm6[2,2,3,3] -; SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm5 = xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; SSE-NEXT: packuswb %xmm5, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: pandn %xmm0, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm14 +; SSE-NEXT: por %xmm14, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm2, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm9[8],xmm0[9],xmm9[9],xmm0[10],xmm9[10],xmm0[11],xmm9[11],xmm0[12],xmm9[12],xmm0[13],xmm9[13],xmm0[14],xmm9[14],xmm0[15],xmm9[15] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,3,3] +; SSE-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE-NEXT: psrld $16, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE-NEXT: packuswb %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm9[0],xmm2[1],xmm9[1],xmm2[2],xmm9[2],xmm2[3],xmm9[3],xmm2[4],xmm9[4],xmm2[5],xmm9[5],xmm2[6],xmm9[6],xmm2[7],xmm9[7] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,3,2,0,4,5,6,7] -; SSE-NEXT: pand %xmm15, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,3,2,0,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm15 -; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: por %xmm1, %xmm15 ; SSE-NEXT: packuswb %xmm15, %xmm15 -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm15 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm9[0],xmm0[1],xmm9[1],xmm0[2],xmm9[2],xmm0[3],xmm9[3],xmm0[4],xmm9[4],xmm0[5],xmm9[5],xmm0[6],xmm9[6],xmm0[7],xmm9[7] +; SSE-NEXT: pand %xmm6, %xmm15 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: por %xmm6, %xmm15 +; SSE-NEXT: movdqa %xmm5, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm9[8],xmm7[9],xmm9[9],xmm7[10],xmm9[10],xmm7[11],xmm9[11],xmm7[12],xmm9[12],xmm7[13],xmm9[13],xmm7[14],xmm9[14],xmm7[15],xmm9[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] ; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: por %xmm1, %xmm8 ; SSE-NEXT: packuswb %xmm8, %xmm0 -; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm15 -; SSE-NEXT: movdqa %xmm3, %xmm8 +; SSE-NEXT: pand %xmm13, %xmm15 +; SSE-NEXT: movdqa %xmm13, %xmm7 ; SSE-NEXT: por %xmm15, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm4, %xmm12 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm5, %xmm10 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -1346,11 +1343,10 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,4,5,6] ; 
SSE-NEXT: packuswb %xmm1, %xmm2 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,5,5,5,5] @@ -1360,27 +1356,26 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa (%rsp), %xmm15 # 16-byte Reload -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: pandn %xmm14, %xmm1 +; SSE-NEXT: por %xmm1, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm11[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm13, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm4, %xmm13 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] @@ -1388,12 +1383,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,4,5,6] ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm15, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] @@ -1402,294 +1396,289 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: pand %xmm0, %xmm1 ; 
SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por %xmm1, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: por %xmm1, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm8[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm1 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: pand %xmm7, %xmm3 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm9[8],xmm12[9],xmm9[9],xmm12[10],xmm9[10],xmm12[11],xmm9[11],xmm12[12],xmm9[12],xmm12[13],xmm9[13],xmm12[14],xmm9[14],xmm12[15],xmm9[15] -; SSE-NEXT: movdqa %xmm12, %xmm3 -; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm1[3,0] -; SSE-NEXT: movaps %xmm1, %xmm5 -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,1],xmm3[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm12[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm5[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm1[3,0] +; SSE-NEXT: movaps %xmm1, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm10[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm9[0],xmm1[1],xmm9[1],xmm1[2],xmm9[2],xmm1[3],xmm9[3],xmm1[4],xmm9[4],xmm1[5],xmm9[5],xmm1[6],xmm9[6],xmm1[7],xmm9[7] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm1[3,1,2,1,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[3,1,2,1,4,5,6,7] ; SSE-NEXT: movdqa 
{{.*#+}} xmm1 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm9[8],xmm14[9],xmm9[9],xmm14[10],xmm9[10],xmm14[11],xmm9[11],xmm14[12],xmm9[12],xmm14[13],xmm9[13],xmm14[14],xmm9[14],xmm14[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm14[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm5 -; SSE-NEXT: por %xmm6, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm5, %xmm6 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: por %xmm3, %xmm6 -; SSE-NEXT: movdqa %xmm15, %xmm3 -; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm9[8],xmm3[9],xmm9[9],xmm3[10],xmm9[10],xmm3[11],xmm9[11],xmm3[12],xmm9[12],xmm3[13],xmm9[13],xmm3[14],xmm9[14],xmm3[15],xmm9[15] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm3[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm3, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm7 -; SSE-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm15[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: por %xmm7, %xmm5 -; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa %xmm8, %xmm15 -; SSE-NEXT: pandn %xmm5, %xmm15 -; SSE-NEXT: pand %xmm8, %xmm6 -; SSE-NEXT: por %xmm6, %xmm15 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: pxor %xmm2, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] -; SSE-NEXT: movdqa %xmm13, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm5[3,0] -; SSE-NEXT: movaps %xmm5, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,0],xmm13[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm13[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm4, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm4[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm1, %xmm7 -; SSE-NEXT: pandn %xmm5, %xmm1 -; SSE-NEXT: por %xmm7, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; 
SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm5, %xmm3 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm2[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm11[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: por %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: shufps {{.*#+}} xmm5 = xmm5[2,0],xmm3[3,0] +; SSE-NEXT: movaps %xmm3, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm5[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm9[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm13, %xmm3 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm4[8],xmm13[9],xmm4[9],xmm13[10],xmm4[10],xmm13[11],xmm4[11],xmm13[12],xmm4[12],xmm13[13],xmm4[13],xmm13[14],xmm4[14],xmm13[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm13[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: por %xmm6, %xmm1 +; SSE-NEXT: pand %xmm0, %xmm5 ; SSE-NEXT: packuswb %xmm1, %xmm1 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm10, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 
= xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm10[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm3, %xmm5 -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: por %xmm5, %xmm3 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: packuswb %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm8[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm2, %xmm3 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm13 +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: por %xmm0, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] ; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm9[3,1,2,0] +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[2,1,0,3,4,5,6,7] ; SSE-NEXT: packuswb %xmm3, %xmm3 -; SSE-NEXT: pand %xmm5, %xmm3 +; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm1, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm13, %xmm12 -; SSE-NEXT: por 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm12, %xmm11 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,1,3] +; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm6 +; SSE-NEXT: packuswb %xmm0, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,65535,65535,65535,0,0,0] ; SSE-NEXT: movdqa %xmm0, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: pandn %xmm5, %xmm8 ; SSE-NEXT: pand %xmm0, %xmm3 ; SSE-NEXT: por %xmm3, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm14, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm3[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,2] -; SSE-NEXT: packuswb %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm5, %xmm6 -; SSE-NEXT: pandn %xmm3, %xmm6 +; SSE-NEXT: packuswb %xmm5, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm3 -; SSE-NEXT: pand %xmm11, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[3,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm1[3,1,2,0] +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm3[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm7, %xmm7 -; SSE-NEXT: pand %xmm5, %xmm7 -; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm3[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm2, %xmm6 +; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: por %xmm3, %xmm13 -; SSE-NEXT: movdqa %xmm11, %xmm3 -; SSE-NEXT: pand %xmm13, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] +; SSE-NEXT: pand %xmm12, %xmm3 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: por %xmm3, %xmm12 +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,1,3] +; SSE-NEXT: pand %xmm15, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm3[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm3, %xmm6 +; SSE-NEXT: packuswb %xmm3, %xmm5 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm7 -; SSE-NEXT: por %xmm7, %xmm3 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = 
xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3],xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] -; SSE-NEXT: movdqa %xmm9, %xmm7 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm7 = xmm7[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm7, %xmm6 -; SSE-NEXT: movdqa %xmm5, %xmm7 -; SSE-NEXT: pandn %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm2[8],xmm6[9],xmm2[9],xmm6[10],xmm2[10],xmm6[11],xmm2[11],xmm6[12],xmm2[12],xmm6[13],xmm2[13],xmm6[14],xmm2[14],xmm6[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm6, %xmm11 -; SSE-NEXT: pandn %xmm9, %xmm11 -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm10[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: por %xmm11, %xmm10 -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm5, %xmm10 -; SSE-NEXT: por %xmm7, %xmm10 -; SSE-NEXT: movdqa %xmm12, %xmm7 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm2[8],xmm12[9],xmm2[9],xmm12[10],xmm2[10],xmm12[11],xmm2[11],xmm12[12],xmm2[12],xmm12[13],xmm2[13],xmm12[14],xmm2[14],xmm12[15],xmm2[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm12[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm9[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: movdqa %xmm9, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm12 -; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm2[0],xmm7[1],xmm2[1],xmm7[2],xmm2[2],xmm7[3],xmm2[3],xmm7[4],xmm2[4],xmm7[5],xmm2[5],xmm7[6],xmm2[6],xmm7[7],xmm2[7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm7 -; SSE-NEXT: por %xmm7, %xmm12 -; SSE-NEXT: packuswb %xmm12, %xmm11 -; SSE-NEXT: movdqa %xmm0, %xmm7 -; SSE-NEXT: pandn %xmm11, %xmm7 -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm10, %xmm7 -; SSE-NEXT: movdqa %xmm14, %xmm10 -; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm2[8],xmm10[9],xmm2[9],xmm10[10],xmm2[10],xmm10[11],xmm2[11],xmm10[12],xmm2[12],xmm10[13],xmm2[13],xmm10[14],xmm2[14],xmm10[15],xmm2[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3],xmm14[4],xmm2[4],xmm14[5],xmm2[5],xmm14[6],xmm2[6],xmm14[7],xmm2[7] -; SSE-NEXT: movdqa %xmm14, %xmm11 -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[1,0],xmm10[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm14 = xmm14[2,0],xmm10[2,3] -; SSE-NEXT: psrlq $48, 
%xmm10 -; SSE-NEXT: psrldq {{.*#+}} xmm11 = xmm11[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm14[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] +; SSE-NEXT: pandn %xmm5, %xmm3 +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm7, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3],xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE-NEXT: movdqa %xmm7, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[1,0],xmm5[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[2,0],xmm5[2,3] +; SSE-NEXT: psrlq $48, %xmm5 +; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm7[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: pandn %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm9, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm5[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm5, %xmm10 +; SSE-NEXT: pandn %xmm7, %xmm10 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm9[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm9 +; SSE-NEXT: por %xmm10, %xmm9 +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm2, %xmm9 +; SSE-NEXT: por %xmm6, %xmm9 +; SSE-NEXT: movdqa %xmm11, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm4[8],xmm11[9],xmm4[9],xmm11[10],xmm4[10],xmm11[11],xmm4[11],xmm11[12],xmm4[12],xmm11[13],xmm4[13],xmm11[14],xmm4[14],xmm11[15],xmm4[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm11[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm7[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: movdqa %xmm7, %xmm11 +; SSE-NEXT: pandn %xmm10, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3],xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm6 +; SSE-NEXT: por %xmm6, %xmm11 ; SSE-NEXT: packuswb %xmm11, %xmm10 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = 
xmm11[8],xmm2[8],xmm11[9],xmm2[9],xmm11[10],xmm2[10],xmm11[11],xmm2[11],xmm11[12],xmm2[12],xmm11[13],xmm2[13],xmm11[14],xmm2[14],xmm11[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm1[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm12[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: pandn %xmm11, %xmm6 -; SSE-NEXT: por %xmm12, %xmm6 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: pandn %xmm10, %xmm6 +; SSE-NEXT: pand %xmm0, %xmm9 +; SSE-NEXT: por %xmm9, %xmm6 +; SSE-NEXT: movdqa %xmm14, %xmm11 +; SSE-NEXT: movdqa %xmm14, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm11 = xmm11[2,0],xmm9[2,3] +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm11[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm10, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm11 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm11 = xmm11[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm11 = xmm11[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm5, %xmm11 ; SSE-NEXT: pandn %xmm10, %xmm5 -; SSE-NEXT: por %xmm5, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm5 -; SSE-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm2[8],xmm13[9],xmm2[9],xmm13[10],xmm2[10],xmm13[11],xmm2[11],xmm13[12],xmm2[12],xmm13[13],xmm2[13],xmm13[14],xmm2[14],xmm13[15],xmm2[15] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm5[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm9, %xmm4 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm13[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] -; SSE-NEXT: pandn %xmm5, %xmm9 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: packuswb %xmm9, %xmm4 -; SSE-NEXT: pandn %xmm4, %xmm0 -; 
SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, 16(%rsi) -; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: movaps %xmm4, (%rsi) +; SSE-NEXT: por %xmm11, %xmm5 +; SSE-NEXT: packuswb %xmm5, %xmm5 +; SSE-NEXT: pand %xmm2, %xmm5 +; SSE-NEXT: pandn %xmm9, %xmm2 +; SSE-NEXT: por %xmm2, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm4[8],xmm12[9],xmm4[9],xmm12[10],xmm4[10],xmm12[11],xmm4[11],xmm12[12],xmm4[12],xmm12[13],xmm4[13],xmm12[14],xmm4[14],xmm12[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm7, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm12[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,7,4] +; SSE-NEXT: pandn %xmm4, %xmm7 +; SSE-NEXT: por %xmm2, %xmm7 +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: packuswb %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: por %xmm5, %xmm0 +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, 16(%rsi) +; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movaps %xmm2, (%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, 16(%rdx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -1698,15 +1687,15 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps %xmm1, 16(%rcx) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: movaps %xmm1, (%rcx) +; SSE-NEXT: movdqa %xmm13, 16(%r8) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movaps %xmm1, 16(%r8) -; SSE-NEXT: movdqa %xmm15, (%r8) +; SSE-NEXT: movaps %xmm1, (%r8) ; SSE-NEXT: movdqa %xmm3, 16(%r9) ; SSE-NEXT: movdqa %xmm8, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE-NEXT: movdqa %xmm0, 16(%rax) -; SSE-NEXT: movdqa %xmm7, (%rax) -; SSE-NEXT: addq $280, %rsp # imm = 0x118 +; SSE-NEXT: movdqa %xmm6, (%rax) +; SSE-NEXT: addq $264, %rsp # imm = 0x108 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride6_vf32: @@ -2301,7 +2290,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr %out.vec2, ptr %out.vec3, ptr %out.vec4, ptr %out.vec5) nounwind { ; SSE-LABEL: load_i8_stride6_vf64: ; SSE: # %bb.0: -; SSE-NEXT: subq $808, %rsp # imm = 0x328 +; SSE-NEXT: subq $792, %rsp # imm = 0x318 ; SSE-NEXT: movdqa 64(%rdi), %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 80(%rdi), %xmm5 @@ -2313,22 +2302,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa 32(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 48(%rdi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,0,65535,65535,0,65535,65535] -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm13 = [65535,65535,0,65535,65535,0,65535,65535] +; SSE-NEXT: movdqa %xmm13, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = 
[65535,0,65535,65535,0,65535,65535,0] ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm11 +; SSE-NEXT: movdqa %xmm1, %xmm10 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] @@ -2341,9 +2330,8 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pand %xmm3, %xmm2 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,2,1,3] +; SSE-NEXT: pand %xmm10, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] @@ -2353,15 +2341,13 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm5, %xmm0 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm4[3,1,2,0] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] @@ -2374,282 +2360,272 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 320(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 336(%rdi), %xmm13 +; SSE-NEXT: movdqa 336(%rdi), %xmm12 ; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm1 -; SSE-NEXT: pandn %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pandn %xmm12, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm13 -; SSE-NEXT: por %xmm0, %xmm13 -; SSE-NEXT: movdqa %xmm13, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm12 +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: movdqa %xmm12, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; 
SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 304(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: pandn %xmm0, %xmm1 +; SSE-NEXT: movdqa 304(%rdi), %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm3, %xmm7 ; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 288(%rdi), %xmm7 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: movdqa 288(%rdi), %xmm6 +; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: pand %xmm3, %xmm2 +; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,1,3] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: por %xmm1, %xmm0 ; SSE-NEXT: movdqa 368(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: movdqa 352(%rdi), %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm9, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: movdqa 352(%rdi), %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm3 +; SSE-NEXT: por %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[3,1,2,0] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm9 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 224(%rdi), %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm1, %xmm0 -; SSE-NEXT: movdqa 240(%rdi), %xmm14 -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pandn %xmm14, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm14 -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa 
240(%rdi), %xmm11 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm11, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm11 +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm5 ; SSE-NEXT: movdqa %xmm8, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa 208(%rdi), %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pandn %xmm4, %xmm0 +; SSE-NEXT: movdqa 208(%rdi), %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa 192(%rdi), %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pand %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: por %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] +; SSE-NEXT: movdqa %xmm3, %xmm1 +; SSE-NEXT: pand %xmm7, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,1,3] +; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm5, %xmm1 +; SSE-NEXT: pand %xmm8, %xmm0 +; SSE-NEXT: movdqa %xmm8, %xmm10 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa 272(%rdi), %xmm15 -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm15, %xmm2 -; SSE-NEXT: movdqa 256(%rdi), %xmm12 -; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm12 -; SSE-NEXT: por %xmm2, %xmm12 -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[3,1,2,0] +; SSE-NEXT: movdqa 272(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: pandn %xmm14, %xmm2 +; SSE-NEXT: movdqa 256(%rdi), %xmm15 +; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm13, %xmm15 +; SSE-NEXT: por %xmm2, %xmm15 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[3,1,2,0] +; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm4 ; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa 128(%rdi), %xmm2 ; SSE-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm0 +; SSE-NEXT: 
movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm2, %xmm0 -; SSE-NEXT: movdqa 144(%rdi), %xmm10 -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: movdqa 144(%rdi), %xmm9 +; SSE-NEXT: movdqa %xmm7, %xmm4 +; SSE-NEXT: pandn %xmm9, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: pandn %xmm10, %xmm4 +; SSE-NEXT: movdqa %xmm13, %xmm4 +; SSE-NEXT: pandn %xmm9, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm6, %xmm10 -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: pand %xmm13, %xmm9 +; SSE-NEXT: por %xmm0, %xmm9 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,3,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,6,5] ; SSE-NEXT: packuswb %xmm5, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm11 -; SSE-NEXT: pandn %xmm0, %xmm11 -; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: movdqa %xmm6, %xmm4 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm2 -; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm10 +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm13, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: movdqa %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pandn %xmm3, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm3, %xmm5 +; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa 112(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm8, %xmm7 -; SSE-NEXT: movdqa %xmm8, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: movdqa 160(%rdi), %xmm8 -; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm8 +; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm7, %xmm8 +; SSE-NEXT: pandn %xmm6, %xmm8 +; SSE-NEXT: movdqa 160(%rdi), %xmm7 +; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm7 ; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload -; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pandn %xmm13, %xmm3 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm2, %xmm1 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: pandn %xmm3, %xmm1 +; SSE-NEXT: pandn 
%xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm1, %xmm5 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pandn %xmm1, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm4, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload -; SSE-NEXT: pandn %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm5 +; SSE-NEXT: pandn %xmm14, %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, %xmm3 -; SSE-NEXT: pandn %xmm15, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: pandn %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: movdqa 96(%rdi), %xmm5 -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 176(%rdi), %xmm6 -; SSE-NEXT: movdqa %xmm6, %xmm3 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm3 -; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 96(%rdi), %xmm4 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa 176(%rdi), %xmm14 +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, 
%xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, %xmm1 -; SSE-NEXT: pand %xmm0, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm0, %xmm6 +; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm0, %xmm13 ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pandn %xmm5, %xmm1 +; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm5, %xmm3 -; SSE-NEXT: pand %xmm7, %xmm3 -; SSE-NEXT: por %xmm9, %xmm3 -; SSE-NEXT: movdqa %xmm3, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm7, %xmm5 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] +; SSE-NEXT: movdqa %xmm4, %xmm3 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 +; SSE-NEXT: por %xmm8, %xmm3 +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[0,2,1,3] +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm1, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,6,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm5 = xmm5[0,3,2,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,7,6,7] ; SSE-NEXT: packuswb %xmm5, %xmm5 -; SSE-NEXT: movdqa {{.*#+}} xmm9 = [65535,65535,65535,0,0,0,65535,65535] -; SSE-NEXT: pand %xmm9, %xmm5 -; SSE-NEXT: por %xmm11, %xmm5 -; SSE-NEXT: pandn %xmm6, %xmm0 -; SSE-NEXT: por %xmm0, %xmm8 -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm11 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] +; SSE-NEXT: movdqa {{.*#+}} xmm8 = [65535,65535,65535,0,0,0,65535,65535] +; SSE-NEXT: pand %xmm8, %xmm5 +; SSE-NEXT: por %xmm10, %xmm5 +; SSE-NEXT: pandn %xmm14, %xmm0 +; SSE-NEXT: por %xmm0, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[3,1,2,0] +; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[0,3,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,5] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm7, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] +; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm5 +; SSE-NEXT: pand %xmm10, %xmm5 ; SSE-NEXT: por %xmm5, %xmm1 ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pxor %xmm5, %xmm5 @@ -2660,11 +2636,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] ; SSE-NEXT: psrld $16, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm1[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm15 = xmm15[2],xmm0[2],xmm15[3],xmm0[3] -; SSE-NEXT: packuswb %xmm15, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm1[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm14 = xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE-NEXT: packuswb %xmm14, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: movdqa %xmm2, %xmm4 @@ -2673,48 +2649,48 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,65535,0,65535,0,0,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm15 -; SSE-NEXT: pandn %xmm4, %xmm15 +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm4, %xmm14 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm2[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm15, %xmm4 +; SSE-NEXT: por %xmm14, %xmm4 ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm1[0,1,2,3,5,5,5,5] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [65535,65535,65535,0,65535,65535,0,65535] ; SSE-NEXT: movdqa %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm15, %xmm1 +; SSE-NEXT: pandn %xmm14, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm15 = xmm6[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm15 = xmm15[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,5,7,6,4] -; SSE-NEXT: pand %xmm2, %xmm15 -; SSE-NEXT: por %xmm1, %xmm15 -; SSE-NEXT: packuswb %xmm15, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm15 -; SSE-NEXT: 
pandn %xmm1, %xmm15 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm4, %xmm15 -; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm13, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm6[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,5,7,6,4] +; SSE-NEXT: pand %xmm2, %xmm14 +; SSE-NEXT: por %xmm1, %xmm14 +; SSE-NEXT: packuswb %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm14 +; SSE-NEXT: pandn %xmm1, %xmm14 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm4, %xmm14 +; SSE-NEXT: movdqa %xmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm12, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm5[0],xmm13[1],xmm5[1],xmm13[2],xmm5[2],xmm13[3],xmm5[3],xmm13[4],xmm5[4],xmm13[5],xmm5[5],xmm13[6],xmm5[6],xmm13[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm13[2,2,3,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm12[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm13 = xmm13[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm13 = xmm13[2],xmm1[2],xmm13[3],xmm1[3] -; SSE-NEXT: packuswb %xmm13, %xmm4 -; SSE-NEXT: movdqa %xmm9, %xmm15 -; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm12 = xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE-NEXT: packuswb %xmm12, %xmm4 +; SSE-NEXT: movdqa %xmm8, %xmm14 +; SSE-NEXT: movdqa %xmm8, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm4 @@ -2722,46 +2698,46 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm13 -; SSE-NEXT: pandn %xmm4, %xmm13 +; SSE-NEXT: movdqa %xmm0, %xmm12 +; SSE-NEXT: pandn %xmm4, %xmm12 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm13, %xmm4 +; SSE-NEXT: por %xmm12, %xmm4 ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm9, %xmm4 +; SSE-NEXT: pand %xmm8, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm2, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 +; SSE-NEXT: movdqa %xmm2, %xmm12 +; SSE-NEXT: 
pandn %xmm1, %xmm12 ; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm13, %xmm1 +; SSE-NEXT: por %xmm12, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm13 -; SSE-NEXT: pandn %xmm1, %xmm13 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm4, %xmm13 -; SSE-NEXT: movdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm12 +; SSE-NEXT: pandn %xmm1, %xmm12 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm4, %xmm12 +; SSE-NEXT: movdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm11, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm14[2,2,3,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm11[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm14[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,7,6,7] -; SSE-NEXT: punpckhdq {{.*#+}} xmm9 = xmm9[2],xmm1[2],xmm9[3],xmm1[3] -; SSE-NEXT: packuswb %xmm9, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm11[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,7,6,7] +; SSE-NEXT: punpckhdq {{.*#+}} xmm8 = xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE-NEXT: packuswb %xmm8, %xmm4 +; SSE-NEXT: movdqa %xmm14, %xmm1 ; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: movdqa %xmm6, %xmm4 @@ -2769,41 +2745,41 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,1,0,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,1,1,1,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm4, %xmm9 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pandn %xmm4, %xmm8 ; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm6[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[1,3,2,0,4,5,6,7] ; SSE-NEXT: pand %xmm0, %xmm4 -; SSE-NEXT: por %xmm9, %xmm4 +; SSE-NEXT: por %xmm8, %xmm4 ; SSE-NEXT: packuswb %xmm4, %xmm4 -; SSE-NEXT: pand %xmm15, %xmm4 +; SSE-NEXT: pand %xmm14, %xmm4 ; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm12, %xmm1 +; SSE-NEXT: movdqa %xmm15, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufhw 
{{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm2, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm12[3,1,2,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm2, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm15[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: por %xmm9, %xmm1 +; SSE-NEXT: por %xmm8, %xmm1 ; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm9 -; SSE-NEXT: pandn %xmm1, %xmm9 -; SSE-NEXT: pand %xmm7, %xmm4 -; SSE-NEXT: por %xmm4, %xmm9 -; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm8 +; SSE-NEXT: pandn %xmm1, %xmm8 +; SSE-NEXT: pand %xmm10, %xmm4 +; SSE-NEXT: por %xmm4, %xmm8 +; SSE-NEXT: movdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm10[2,2,3,3] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm9[2,2,3,3] ; SSE-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; SSE-NEXT: psrld $16, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm10[0,1,0,3] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm9[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,7,6,7] ; SSE-NEXT: punpckhdq {{.*#+}} xmm6 = xmm6[2],xmm1[2],xmm6[3],xmm1[3] ; SSE-NEXT: packuswb %xmm6, %xmm4 @@ -2820,32 +2796,34 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: por %xmm3, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm14, %xmm1 +; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: pandn %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm8, %xmm1 +; SSE-NEXT: movdqa %xmm7, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm8[3,1,2,3,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] +; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm7[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} 
xmm3 = xmm3[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,7,6,4] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: packuswb %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm10, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: movdqa %xmm10, %xmm11 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{.*#+}} xmm12 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm0 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm11, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm14 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm4 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm4, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] @@ -2855,719 +2833,712 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: packuswb %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm6, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: movdqa %xmm11, %xmm4 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[2,1,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: movdqa %xmm10, %xmm12 ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm14 -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm12, %xmm1 -; SSE-NEXT: movdqa %xmm12, %xmm10 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: movdqa %xmm1, %xmm12 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm11, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: movdqa %xmm11, %xmm8 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: pand %xmm14, %xmm13 +; SSE-NEXT: movdqa %xmm14, %xmm7 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pand %xmm10, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm15, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm9 -; SSE-NEXT: movdqa %xmm9, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm11 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = 
xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm8 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm8, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm8, %xmm3 +; SSE-NEXT: movdqa %xmm8, %xmm9 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm10 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm13, %xmm1 -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: packuswb %xmm2, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm1 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm8 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa %xmm7, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm7, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload -; SSE-NEXT: pand %xmm10, %xmm7 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm7, %xmm0 -; SSE-NEXT: pand %xmm11, %xmm0 -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,3,4,5,6,7] +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm3 +; SSE-NEXT: movdqa %xmm9, %xmm1 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: pand %xmm7, %xmm0 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pand %xmm12, %xmm0 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm0[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,3,4,5,6,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,3] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,5,6] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm11, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm1 -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,4,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[1,2,3,0,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: packuswb %xmm1, %xmm1 -; SSE-NEXT: movdqa %xmm3, %xmm2 -; SSE-NEXT: pandn %xmm1, %xmm2 -; SSE-NEXT: pand %xmm3, %xmm0 -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm0 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,7] +; SSE-NEXT: pand %xmm12, %xmm2 +; SSE-NEXT: movdqa %xmm12, %xmm9 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,2,3,0,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] +; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: movdqa %xmm5, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm0 +; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por %xmm0, %xmm10 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: pand %xmm4, %xmm0 
-; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,3,2,3,4,5,6,7] +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: por %xmm0, %xmm12 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm12[0,3,2,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,6,7,4] ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm3 +; SSE-NEXT: por %xmm3, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa %xmm4, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm0[3,0] -; SSE-NEXT: movaps %xmm0, %xmm2 -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0,2] +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15] +; SSE-NEXT: pxor %xmm7, %xmm7 +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm3 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[0,2] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm4[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm1 +; SSE-NEXT: packuswb %xmm0, %xmm2 ; SSE-NEXT: movdqa %xmm6, %xmm0 -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,0,65535,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = 
xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm6[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,0,65535,65535,65535,65535] ; SSE-NEXT: movdqa %xmm3, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm3, %xmm1 -; SSE-NEXT: por %xmm1, %xmm4 -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,0,65535,65535,0] -; SSE-NEXT: movdqa %xmm1, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm14[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm14, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm12, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm5[8],xmm12[9],xmm5[9],xmm12[10],xmm5[10],xmm12[11],xmm5[11],xmm12[12],xmm5[12],xmm12[13],xmm5[13],xmm12[14],xmm5[14],xmm12[15],xmm5[15] -; SSE-NEXT: movdqa %xmm12, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm12[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm12[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm15[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, 
%xmm2 +; SSE-NEXT: pandn %xmm0, %xmm4 +; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm7[8],xmm6[9],xmm7[9],xmm6[10],xmm7[10],xmm6[11],xmm7[11],xmm6[12],xmm7[12],xmm6[13],xmm7[13],xmm6[14],xmm7[14],xmm6[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm4, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload +; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm7[8],xmm0[9],xmm7[9],xmm0[10],xmm7[10],xmm0[11],xmm7[11],xmm0[12],xmm7[12],xmm0[13],xmm7[13],xmm0[14],xmm7[14],xmm0[15],xmm7[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,0,65535,65535,0] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm7[0],xmm1[1],xmm7[1],xmm1[2],xmm7[2],xmm1[3],xmm7[3],xmm1[4],xmm7[4],xmm1[5],xmm7[5],xmm1[6],xmm7[6],xmm1[7],xmm7[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: packuswb %xmm0, %xmm0 +; SSE-NEXT: movdqa %xmm9, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pand %xmm9, %xmm6 ; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm9[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: movdqa %xmm8, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm8[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: pshuflw 
{{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm4 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa %xmm13, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm1[8],xmm13[9],xmm1[9],xmm13[10],xmm1[10],xmm13[11],xmm1[11],xmm13[12],xmm1[12],xmm13[13],xmm1[13],xmm13[14],xmm1[14],xmm13[15],xmm1[15] ; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm13[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm2 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm13[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm13[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm14, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm3, %xmm4 -; SSE-NEXT: por %xmm4, %xmm6 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; SSE-NEXT: movdqa %xmm8, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] -; SSE-NEXT: movdqa %xmm1, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm8[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: por %xmm4, %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm14, %xmm4 -; SSE-NEXT: pandn %xmm2, %xmm4 -; SSE-NEXT: pand %xmm14, %xmm6 -; SSE-NEXT: por %xmm6, %xmm4 -; SSE-NEXT: movdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm7, 
%xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm5[8],xmm7[9],xmm5[9],xmm7[10],xmm5[10],xmm7[11],xmm5[11],xmm7[12],xmm5[12],xmm7[13],xmm5[13],xmm7[14],xmm5[14],xmm7[15],xmm5[15] -; SSE-NEXT: movdqa %xmm7, %xmm4 -; SSE-NEXT: shufps {{.*#+}} xmm4 = xmm4[2,0],xmm2[3,0] -; SSE-NEXT: movaps %xmm2, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm4[0,2] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm7[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm7[2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm6[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,2] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm2, %xmm4 -; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm5[8],xmm11[9],xmm5[9],xmm11[10],xmm5[10],xmm11[11],xmm5[11],xmm11[12],xmm5[12],xmm11[13],xmm5[13],xmm11[14],xmm5[14],xmm11[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm11[0,3,2,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,3,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,7,7,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm1[8],xmm14[9],xmm1[9],xmm14[10],xmm1[10],xmm14[11],xmm1[11],xmm14[12],xmm1[12],xmm14[13],xmm1[13],xmm14[14],xmm1[14],xmm14[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm14[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm6, %xmm0 -; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm3 -; SSE-NEXT: por %xmm4, %xmm3 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm5[8],xmm0[9],xmm5[9],xmm0[10],xmm5[10],xmm0[11],xmm5[11],xmm0[12],xmm5[12],xmm0[13],xmm5[13],xmm0[14],xmm5[14],xmm0[15],xmm5[15] +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm11, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,2,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,7] -; SSE-NEXT: pand %xmm1, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm1 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm14, %xmm2 -; SSE-NEXT: pand %xmm14, %xmm3 -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm3, %xmm2 -; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm10, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,255,255,255,255] -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm1, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] -; SSE-NEXT: movdqa %xmm13, %xmm2 +; SSE-NEXT: movdqa %xmm4, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm11, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm1[0],xmm11[1],xmm1[1],xmm11[2],xmm1[2],xmm11[3],xmm1[3],xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm11[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa {{.*#+}} xmm6 = [65535,0,65535,65535,0,65535,65535,0] -; SSE-NEXT: pand %xmm6, %xmm9 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Folded Reload +; SSE-NEXT: packuswb %xmm0, %xmm0 ; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,65535,65535,0,0,0] -; SSE-NEXT: movdqa %xmm4, %xmm1 -; SSE-NEXT: pandn %xmm2, %xmm1 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm1 -; SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm12, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm3, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; 
SSE-NEXT: movdqa %xmm10, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm1[8],xmm10[9],xmm1[9],xmm10[10],xmm1[10],xmm10[11],xmm1[11],xmm10[12],xmm1[12],xmm10[13],xmm1[13],xmm10[14],xmm1[14],xmm10[15],xmm1[15] +; SSE-NEXT: movdqa %xmm10, %xmm2 +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[3,0] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,1],xmm2[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm10[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm0, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: movdqa %xmm7, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,1,2,1,4,5,6,7] +; SSE-NEXT: movdqa %xmm3, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm7[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload -; SSE-NEXT: pand %xmm6, %xmm1 -; SSE-NEXT: movdqa %xmm6, %xmm7 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm1, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm5, %xmm6 +; SSE-NEXT: pandn %xmm0, %xmm6 +; SSE-NEXT: pand %xmm5, %xmm2 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm8, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,5,6,5] +; SSE-NEXT: movdqa %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm8[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,7,7] ; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 -; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: por %xmm2, %xmm0 +; SSE-NEXT: packuswb 
%xmm0, %xmm2 +; SSE-NEXT: movdqa %xmm9, %xmm0 +; SSE-NEXT: pandn %xmm2, %xmm0 +; SSE-NEXT: pand %xmm9, %xmm6 +; SSE-NEXT: por %xmm6, %xmm0 +; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload +; SSE-NEXT: movdqa %xmm0, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE-NEXT: movdqa %xmm0, %xmm6 +; SSE-NEXT: shufps {{.*#+}} xmm6 = xmm6[2,0],xmm2[3,0] +; SSE-NEXT: movaps %xmm2, %xmm7 +; SSE-NEXT: shufps {{.*#+}} xmm7 = xmm7[0,1],xmm6[0,2] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,0],xmm0[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,0],xmm0[2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm7[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,3,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[3,1,2,1,4,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm1[8],xmm15[9],xmm1[9],xmm15[10],xmm1[10],xmm15[11],xmm1[11],xmm15[12],xmm1[12],xmm15[13],xmm1[13],xmm15[14],xmm1[14],xmm15[15],xmm1[15] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm15[0,3,2,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,3,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,7,7,7] +; SSE-NEXT: pand %xmm3, %xmm7 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: por %xmm7, %xmm3 +; SSE-NEXT: pand %xmm5, %xmm6 +; SSE-NEXT: packuswb %xmm3, %xmm3 +; SSE-NEXT: pandn %xmm3, %xmm5 +; SSE-NEXT: por %xmm6, %xmm5 +; SSE-NEXT: movdqa %xmm12, %xmm2 +; SSE-NEXT: pxor %xmm0, %xmm0 +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,2,1] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,7,5,6,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm12[0,2,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,7,7] +; SSE-NEXT: pand %xmm4, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm4 +; SSE-NEXT: por %xmm3, %xmm4 +; SSE-NEXT: pand %xmm9, %xmm5 +; SSE-NEXT: packuswb %xmm4, %xmm2 +; SSE-NEXT: pandn %xmm2, %xmm9 +; SSE-NEXT: por %xmm5, %xmm9 +; SSE-NEXT: movdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,255,255,255] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,1,2,3] ; 
SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,2] +; SSE-NEXT: packuswb %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255] +; SSE-NEXT: movdqa %xmm15, %xmm2 +; SSE-NEXT: pandn %xmm1, %xmm2 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm12[3,1,2,0] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm1[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm6, %xmm6 +; SSE-NEXT: pand %xmm15, %xmm6 +; SSE-NEXT: por %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload +; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,0,65535,65535,0,65535,65535,0] +; SSE-NEXT: pand %xmm11, %xmm13 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm13[0,2,1,3] +; SSE-NEXT: pand %xmm10, %xmm1 +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm1, %xmm2 +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,65535,65535,0,0,0] +; SSE-NEXT: movdqa %xmm1, %xmm3 +; SSE-NEXT: pandn %xmm2, %xmm3 +; SSE-NEXT: pand %xmm1, %xmm6 +; SSE-NEXT: por %xmm6, %xmm3 +; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm14, %xmm0 -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: por %xmm2, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm15 -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm15, %xmm2 -; SSE-NEXT: pand %xmm8, %xmm2 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] +; SSE-NEXT: movdqa %xmm14, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,2] +; SSE-NEXT: packuswb %xmm6, %xmm2 +; SSE-NEXT: movdqa %xmm15, %xmm6 +; SSE-NEXT: pandn %xmm2, %xmm6 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm4[3,1,2,0] +; SSE-NEXT: pand %xmm10, %xmm2 +; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm2[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm7, %xmm7 +; 
SSE-NEXT: pand %xmm15, %xmm7 +; SSE-NEXT: por %xmm6, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm3 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,2,1,3] +; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,2,1,4,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,1,3] ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,0,3,4,5,6,7] ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm2, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm2, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: por %xmm0, %xmm6 +; SSE-NEXT: packuswb %xmm2, %xmm6 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm6, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm7 +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,1,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,7,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,2] -; SSE-NEXT: packuswb %xmm2, %xmm0 -; SSE-NEXT: movdqa %xmm13, %xmm2 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload -; SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pand %xmm8, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,1,0,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[2,1,0,3,4,5,6,7] -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm2, %xmm6 -; SSE-NEXT: movdqa %xmm7, %xmm2 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload -; SSE-NEXT: pand %xmm7, %xmm0 -; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload -; SSE-NEXT: por %xmm0, %xmm2 +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,2] +; SSE-NEXT: packuswb %xmm7, %xmm6 +; SSE-NEXT: movdqa %xmm15, %xmm7 +; SSE-NEXT: pandn %xmm6, %xmm7 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: movdqa %xmm8, %xmm0 -; SSE-NEXT: pand %xmm2, %xmm0 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,2,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,1,0,3,4,5,6,7] -; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,5,4,7] -; SSE-NEXT: packuswb %xmm0, %xmm2 -; SSE-NEXT: movdqa %xmm4, %xmm7 -; SSE-NEXT: pandn %xmm2, %xmm7 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm7 -; SSE-NEXT: movdqa %xmm10, %xmm0 -; 
SSE-NEXT: movdqa %xmm10, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] -; SSE-NEXT: movdqa %xmm0, %xmm6 -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm2[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm2[2,3] -; SSE-NEXT: psrlq $48, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm6 = xmm6[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: pandn %xmm2, %xmm8 +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[3,1,2,0] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm8, %xmm8 +; SSE-NEXT: pand %xmm15, %xmm8 +; SSE-NEXT: por %xmm7, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm2 +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm2[0,2,1,3] +; SSE-NEXT: pand %xmm10, %xmm6 +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm6, %xmm7 +; SSE-NEXT: movdqa %xmm1, %xmm2 +; SSE-NEXT: pandn %xmm7, %xmm2 +; SSE-NEXT: pand %xmm1, %xmm8 +; SSE-NEXT: por %xmm8, %xmm2 +; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; SSE-NEXT: movdqa (%rsp), %xmm7 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Folded Reload +; SSE-NEXT: movdqa %xmm7, (%rsp) # 16-byte Spill +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,1,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,7,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,2] +; SSE-NEXT: packuswb %xmm8, %xmm7 +; SSE-NEXT: movdqa %xmm15, %xmm8 +; SSE-NEXT: pandn %xmm7, %xmm8 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload +; SSE-NEXT: por {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Folded Reload +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm6[3,1,2,0] +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[2,1,0,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm7[2,1,0,3,4,5,6,7] +; SSE-NEXT: packuswb %xmm9, %xmm9 +; SSE-NEXT: pand %xmm15, %xmm9 +; SSE-NEXT: por %xmm8, %xmm9 ; SSE-NEXT: movdqa %xmm11, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa {{.*#+}} xmm0 = 
[65535,0,65535,65535,0,65535,65535,65535] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm2, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm5[0],xmm11[1],xmm5[1],xmm11[2],xmm5[2],xmm11[3],xmm5[3],xmm11[4],xmm5[4],xmm11[5],xmm5[5],xmm11[6],xmm5[6],xmm11[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm11[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm2[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm9, %xmm2 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm9[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa {{.*#+}} xmm11 = [65535,65535,65535,65535,0,65535,0,0] -; SSE-NEXT: movdqa %xmm11, %xmm10 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload +; SSE-NEXT: pand %xmm11, %xmm7 +; SSE-NEXT: pandn {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload +; SSE-NEXT: por %xmm7, %xmm2 +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm2[0,2,1,3] +; SSE-NEXT: pand %xmm10, %xmm7 +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,2,1,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,2,1,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm7[0,1,0,3,4,5,6,7] +; SSE-NEXT: pshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,6,5,4,7] +; SSE-NEXT: packuswb %xmm7, %xmm8 +; SSE-NEXT: movdqa %xmm1, %xmm7 +; SSE-NEXT: pandn %xmm8, %xmm7 +; SSE-NEXT: pand %xmm1, %xmm9 +; SSE-NEXT: por %xmm9, %xmm7 +; SSE-NEXT: movdqa %xmm0, %xmm8 +; SSE-NEXT: pxor %xmm5, %xmm5 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3],xmm0[4],xmm5[4],xmm0[5],xmm5[5],xmm0[6],xmm5[6],xmm0[7],xmm5[7] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm8[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm8[2,3] +; SSE-NEXT: psrlq $48, %xmm8 +; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm0[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm9, %xmm8 +; SSE-NEXT: movdqa %xmm15, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3],xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm2 -; SSE-NEXT: por %xmm2, %xmm10 -; SSE-NEXT: packuswb %xmm10, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm2 -; SSE-NEXT: pandn %xmm8, %xmm2 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm2 -; SSE-NEXT: movdqa %xmm12, %xmm9 -; SSE-NEXT: movdqa %xmm12, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = 
xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm8, %xmm6 +; SSE-NEXT: movdqa %xmm12, %xmm8 +; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [65535,0,65535,65535,0,65535,65535,65535] +; SSE-NEXT: movdqa %xmm0, %xmm11 +; SSE-NEXT: pandn %xmm8, %xmm11 +; SSE-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm5[0],xmm12[1],xmm5[1],xmm12[2],xmm5[2],xmm12[3],xmm5[3],xmm12[4],xmm5[4],xmm12[5],xmm5[5],xmm12[6],xmm5[6],xmm12[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm12[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm12 = xmm8[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm12 +; SSE-NEXT: por %xmm11, %xmm12 +; SSE-NEXT: packuswb %xmm12, %xmm12 +; SSE-NEXT: pand %xmm15, %xmm12 +; SSE-NEXT: por %xmm10, %xmm12 ; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm3, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm6, %xmm10 -; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm3[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm6 -; SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: packuswb %xmm6, %xmm6 -; SSE-NEXT: pand %xmm13, %xmm6 -; SSE-NEXT: por %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm5[8],xmm1[9],xmm5[9],xmm1[10],xmm5[10],xmm1[11],xmm5[11],xmm1[12],xmm5[12],xmm1[13],xmm5[13],xmm1[14],xmm5[14],xmm1[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm1[0,1,2,3,7,5,6,7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm5[8],xmm13[9],xmm5[9],xmm13[10],xmm5[10],xmm13[11],xmm5[11],xmm13[12],xmm5[12],xmm13[13],xmm5[13],xmm13[14],xmm5[14],xmm13[15],xmm5[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa %xmm11, %xmm12 -; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: movdqa {{.*#+}} 
xmm11 = [65535,65535,65535,65535,0,65535,0,0] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm10, %xmm13 ; SSE-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm5[0],xmm8[1],xmm5[1],xmm8[2],xmm5[2],xmm8[3],xmm5[3],xmm8[4],xmm5[4],xmm8[5],xmm5[5],xmm8[6],xmm5[6],xmm8[7],xmm5[7] ; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,3,1,1] ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm8[0,1,1,3,4,5,6,7] ; SSE-NEXT: pand %xmm11, %xmm8 -; SSE-NEXT: por %xmm8, %xmm12 -; SSE-NEXT: packuswb %xmm12, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm12 -; SSE-NEXT: pandn %xmm8, %xmm12 -; SSE-NEXT: pand %xmm4, %xmm6 -; SSE-NEXT: por %xmm6, %xmm12 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload -; SSE-NEXT: movdqa %xmm9, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] -; SSE-NEXT: movdqa %xmm9, %xmm8 -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm6[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm6[2,3] -; SSE-NEXT: psrlq $48, %xmm6 -; SSE-NEXT: psrldq {{.*#+}} xmm8 = xmm8[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1],xmm8[2],xmm6[2],xmm8[3],xmm6[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm9[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm8, %xmm6 -; SSE-NEXT: movdqa %xmm13, %xmm8 -; SSE-NEXT: pandn %xmm6, %xmm8 -; SSE-NEXT: movdqa %xmm14, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm5[8],xmm6[9],xmm5[9],xmm6[10],xmm5[10],xmm6[11],xmm5[11],xmm6[12],xmm5[12],xmm6[13],xmm5[13],xmm6[14],xmm5[14],xmm6[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] -; SSE-NEXT: movdqa %xmm0, %xmm9 -; SSE-NEXT: pandn %xmm6, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm6[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: por %xmm9, %xmm10 -; SSE-NEXT: packuswb %xmm10, %xmm10 -; SSE-NEXT: pand %xmm13, %xmm10 -; SSE-NEXT: por %xmm8, %xmm10 -; SSE-NEXT: movdqa %xmm15, %xmm6 -; SSE-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm5[8],xmm15[9],xmm5[9],xmm15[10],xmm5[10],xmm15[11],xmm5[11],xmm15[12],xmm5[12],xmm15[13],xmm5[13],xmm15[14],xmm5[14],xmm15[15],xmm5[15] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm15[0,1,2,3,7,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,2,0] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,5,5,7,4] -; SSE-NEXT: movdqa %xmm11, %xmm9 -; SSE-NEXT: pandn %xmm8, %xmm9 -; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm6[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm6 -; SSE-NEXT: por %xmm6, %xmm9 -; SSE-NEXT: packuswb %xmm9, %xmm8 -; SSE-NEXT: movdqa %xmm4, %xmm6 -; SSE-NEXT: pandn %xmm8, %xmm6 -; SSE-NEXT: pand %xmm4, %xmm10 -; 
SSE-NEXT: por %xmm10, %xmm6 -; SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload +; SSE-NEXT: por %xmm8, %xmm13 +; SSE-NEXT: packuswb %xmm13, %xmm10 ; SSE-NEXT: movdqa %xmm1, %xmm8 -; SSE-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm5[8],xmm8[9],xmm5[9],xmm8[10],xmm5[10],xmm8[11],xmm5[11],xmm8[12],xmm5[12],xmm8[13],xmm5[13],xmm8[14],xmm5[14],xmm8[15],xmm5[15] -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: movdqa %xmm1, %xmm9 -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,0],xmm8[0,0] -; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm8[2,3] -; SSE-NEXT: psrlq $48, %xmm8 -; SSE-NEXT: psrldq {{.*#+}} xmm9 = xmm9[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[3,1,2,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,4,5,7] -; SSE-NEXT: packuswb %xmm9, %xmm8 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload +; SSE-NEXT: pandn %xmm10, %xmm8 +; SSE-NEXT: pand %xmm1, %xmm12 +; SSE-NEXT: por %xmm12, %xmm8 ; SSE-NEXT: movdqa %xmm14, %xmm9 -; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm5[8],xmm9[9],xmm5[9],xmm9[10],xmm5[10],xmm9[11],xmm5[11],xmm9[12],xmm5[12],xmm9[13],xmm5[13],xmm9[14],xmm5[14],xmm9[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[1,1,2,3] -; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm5[0],xmm14[1],xmm5[1],xmm14[2],xmm5[2],xmm14[3],xmm5[3],xmm14[4],xmm5[4],xmm14[5],xmm5[5],xmm14[6],xmm5[6],xmm14[7],xmm5[7] -; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm14[0,1,2,3,7,5,6,7] +; SSE-NEXT: movdqa %xmm14, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm5[0],xmm9[1],xmm5[1],xmm9[2],xmm5[2],xmm9[3],xmm5[3],xmm9[4],xmm5[4],xmm9[5],xmm5[5],xmm9[6],xmm5[6],xmm9[7],xmm5[7] +; SSE-NEXT: movdqa %xmm9, %xmm12 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm10[2,3] +; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: psrldq {{.*#+}} xmm12 = xmm12[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm12, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: movdqa %xmm4, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm10, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm4[0,1,2,3,7,5,6,7] ; SSE-NEXT: 
pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] -; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[3,1,1,2,4,5,6,7] -; SSE-NEXT: pand %xmm0, %xmm10 -; SSE-NEXT: pandn %xmm9, %xmm0 -; SSE-NEXT: por %xmm10, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm13 = xmm10[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm13 +; SSE-NEXT: por %xmm14, %xmm13 +; SSE-NEXT: packuswb %xmm13, %xmm13 +; SSE-NEXT: pand %xmm15, %xmm13 +; SSE-NEXT: por %xmm12, %xmm13 +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa %xmm11, %xmm14 +; SSE-NEXT: pandn %xmm12, %xmm14 +; SSE-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3],xmm10[4],xmm5[4],xmm10[5],xmm5[5],xmm10[6],xmm5[6],xmm10[7],xmm5[7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm10[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm10 +; SSE-NEXT: por %xmm10, %xmm14 +; SSE-NEXT: packuswb %xmm14, %xmm10 +; SSE-NEXT: movdqa %xmm1, %xmm12 +; SSE-NEXT: pandn %xmm10, %xmm12 +; SSE-NEXT: pand %xmm1, %xmm13 +; SSE-NEXT: por %xmm13, %xmm12 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload +; SSE-NEXT: movdqa %xmm9, %xmm10 +; SSE-NEXT: pxor %xmm3, %xmm3 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm3[8],xmm10[9],xmm3[9],xmm10[10],xmm3[10],xmm10[11],xmm3[11],xmm10[12],xmm3[12],xmm10[13],xmm3[13],xmm10[14],xmm3[14],xmm10[15],xmm3[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm3[0],xmm9[1],xmm3[1],xmm9[2],xmm3[2],xmm9[3],xmm3[3],xmm9[4],xmm3[4],xmm9[5],xmm3[5],xmm9[6],xmm3[6],xmm9[7],xmm3[7] +; SSE-NEXT: pxor %xmm4, %xmm4 +; SSE-NEXT: movdqa %xmm9, %xmm13 +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[1,0],xmm10[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm9 = xmm9[2,0],xmm10[2,3] +; SSE-NEXT: psrlq $48, %xmm10 +; SSE-NEXT: psrldq {{.*#+}} xmm13 = xmm13[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm10[0],xmm13[1],xmm10[1],xmm13[2],xmm10[2],xmm13[3],xmm10[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm9[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm13, %xmm10 +; SSE-NEXT: movdqa %xmm15, %xmm13 +; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: movdqa %xmm0, %xmm9 +; SSE-NEXT: pandn %xmm10, %xmm9 +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm10[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: por %xmm9, %xmm14 +; SSE-NEXT: packuswb %xmm14, %xmm14 +; SSE-NEXT: pand %xmm15, %xmm14 +; 
SSE-NEXT: por %xmm13, %xmm14 +; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm3[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,2,0] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,7,4] +; SSE-NEXT: movdqa %xmm11, %xmm13 +; SSE-NEXT: pandn %xmm10, %xmm13 +; SSE-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3],xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm9[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm9 +; SSE-NEXT: por %xmm9, %xmm13 +; SSE-NEXT: packuswb %xmm13, %xmm9 +; SSE-NEXT: movdqa %xmm1, %xmm13 +; SSE-NEXT: pandn %xmm9, %xmm13 +; SSE-NEXT: pand %xmm1, %xmm14 +; SSE-NEXT: por %xmm14, %xmm13 +; SSE-NEXT: movdqa (%rsp), %xmm3 # 16-byte Reload +; SSE-NEXT: movdqa %xmm3, %xmm9 +; SSE-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm4[8],xmm9[9],xmm4[9],xmm9[10],xmm4[10],xmm9[11],xmm4[11],xmm9[12],xmm4[12],xmm9[13],xmm4[13],xmm9[14],xmm4[14],xmm9[15],xmm4[15] +; SSE-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE-NEXT: movdqa %xmm3, %xmm10 +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[1,0],xmm9[0,0] +; SSE-NEXT: shufps {{.*#+}} xmm3 = xmm3[2,0],xmm9[2,3] +; SSE-NEXT: psrlq $48, %xmm9 +; SSE-NEXT: psrldq {{.*#+}} xmm10 = xmm10[10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3] +; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm3[3,1,2,3,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,4,5,7] +; SSE-NEXT: packuswb %xmm10, %xmm9 +; SSE-NEXT: movdqa %xmm6, %xmm10 +; SSE-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm5[8],xmm10[9],xmm5[9],xmm10[10],xmm5[10],xmm10[11],xmm5[11],xmm10[12],xmm5[12],xmm10[13],xmm5[13],xmm10[14],xmm5[14],xmm10[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[1,1,2,3] +; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,5,5,5,5] +; SSE-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE-NEXT: pshufhw {{.*#+}} xmm14 = xmm6[0,1,2,3,7,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm14 = xmm14[0,2,2,3] +; SSE-NEXT: pshuflw {{.*#+}} xmm14 = xmm14[3,1,1,2,4,5,6,7] +; SSE-NEXT: pand %xmm0, %xmm14 +; SSE-NEXT: pandn %xmm10, %xmm0 +; SSE-NEXT: por %xmm14, %xmm0 ; SSE-NEXT: packuswb %xmm0, %xmm0 -; SSE-NEXT: pand %xmm13, %xmm0 -; SSE-NEXT: pandn %xmm8, %xmm13 -; SSE-NEXT: por %xmm13, %xmm0 -; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload -; SSE-NEXT: movdqa %xmm14, %xmm1 -; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3],xmm1[4],xmm5[4],xmm1[5],xmm5[5],xmm1[6],xmm5[6],xmm1[7],xmm5[7] -; SSE-NEXT: punpckhbw {{.*#+}} xmm14 = xmm14[8],xmm5[8],xmm14[9],xmm5[9],xmm14[10],xmm5[10],xmm14[11],xmm5[11],xmm14[12],xmm5[12],xmm14[13],xmm5[13],xmm14[14],xmm5[14],xmm14[15],xmm5[15] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,3,1,1] -; SSE-NEXT: pshuflw {{.*#+}} xmm1 = 
xmm1[0,1,1,3,4,5,6,7] -; SSE-NEXT: pand %xmm11, %xmm1 -; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm14[0,1,2,3,7,5,6,7] +; SSE-NEXT: pand %xmm15, %xmm0 +; SSE-NEXT: pandn %xmm9, %xmm15 +; SSE-NEXT: por %xmm15, %xmm0 +; SSE-NEXT: movdqa %xmm2, %xmm4 +; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm5[8],xmm2[9],xmm5[9],xmm2[10],xmm5[10],xmm2[11],xmm5[11],xmm2[12],xmm5[12],xmm2[13],xmm5[13],xmm2[14],xmm5[14],xmm2[15],xmm5[15] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,3,1,1] +; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,1,1,3,4,5,6,7] +; SSE-NEXT: pand %xmm11, %xmm4 +; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm2[0,1,2,3,7,5,6,7] ; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,2,0] ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,7,4] ; SSE-NEXT: pandn %xmm5, %xmm11 -; SSE-NEXT: por %xmm1, %xmm11 -; SSE-NEXT: pand %xmm4, %xmm0 -; SSE-NEXT: packuswb %xmm11, %xmm1 -; SSE-NEXT: pandn %xmm1, %xmm4 -; SSE-NEXT: por %xmm0, %xmm4 +; SSE-NEXT: por %xmm4, %xmm11 +; SSE-NEXT: pand %xmm1, %xmm0 +; SSE-NEXT: packuswb %xmm11, %xmm4 +; SSE-NEXT: pandn %xmm4, %xmm1 +; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, 16(%rsi) ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -3608,11 +3579,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; SSE-NEXT: movaps %xmm0, (%r9) ; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE-NEXT: movdqa %xmm4, 16(%rax) -; SSE-NEXT: movdqa %xmm6, 32(%rax) +; SSE-NEXT: movdqa %xmm1, 16(%rax) +; SSE-NEXT: movdqa %xmm13, 32(%rax) ; SSE-NEXT: movdqa %xmm12, 48(%rax) -; SSE-NEXT: movdqa %xmm2, (%rax) -; SSE-NEXT: addq $808, %rsp # imm = 0x328 +; SSE-NEXT: movdqa %xmm8, (%rax) +; SSE-NEXT: addq $792, %rsp # imm = 0x318 ; SSE-NEXT: retq ; ; AVX1-ONLY-LABEL: load_i8_stride6_vf64: diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index 938fba0490b55..e6b5998d965f0 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1160,9 +1160,9 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshab %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll index 5a70e5d4a2b56..6d19a81d9fd78 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -1265,9 +1265,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 
; XOPAVX1-NEXT: vpshab %xmm1, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index e248aafab5255..7a2dcd1c8ca8c 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -941,9 +941,9 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll index 77f5f2660af7e..5fe661c7e7778 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -1037,9 +1037,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index deb1514e42c4a..12f971fb83b56 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -848,9 +848,9 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v16i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll index c355eeaa42b66..76944994c87d1 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-256.ll @@ -962,9 +962,9 @@ define <32 x i8> @splatvar_modulo_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwi ; ; XOPAVX1-LABEL: splatvar_modulo_shift_v32i8: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; XOPAVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpshlb %xmm1, %xmm0, %xmm0 From 1652d44d8da83b0a01d5f0b378f10882f5ec8d22 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 23 Jan 2024 12:35:16 +0100 Subject: [PATCH 598/843] [Clang] Amend SME attributes with support for ZT0. 
(#77941) This patch builds on top of #76971 and implements support for: * __arm_new("zt0") * __arm_in("zt0") * __arm_out("zt0") * __arm_inout("zt0") * __arm_preserves("zt0") --- clang/include/clang/AST/Type.h | 16 ++++-- clang/include/clang/Basic/Attr.td | 3 + .../clang/Basic/DiagnosticSemaKinds.td | 4 ++ clang/lib/AST/TypePrinter.cpp | 8 +++ clang/lib/CodeGen/CGCall.cpp | 10 ++++ clang/lib/CodeGen/CodeGenModule.cpp | 2 + clang/lib/Sema/SemaChecking.cpp | 22 +++++++ clang/lib/Sema/SemaDecl.cpp | 11 ++++ clang/lib/Sema/SemaDeclAttr.cpp | 8 +++ clang/lib/Sema/SemaType.cpp | 3 + .../aarch64-sme2-attrs.cpp | 57 +++++++++++++++++++ ...-sme-func-attrs-without-target-feature.cpp | 2 + clang/test/Sema/aarch64-sme-func-attrs.c | 45 +++++++++++++++ 13 files changed, 186 insertions(+), 5 deletions(-) create mode 100644 clang/test/CodeGen/aarch64-sme2-intrinsics/aarch64-sme2-attrs.cpp diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 259e920acf9ff..ea425791fc97f 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -4056,10 +4056,12 @@ class FunctionType : public Type { // Describes the value of the state using ArmStateValue. SME_ZAShift = 2, SME_ZAMask = 0b111 << SME_ZAShift, + SME_ZT0Shift = 5, + SME_ZT0Mask = 0b111 << SME_ZT0Shift, - SME_AttributeMask = 0b111'111 // We only support maximum 6 bits because of - // the bitmask in FunctionTypeArmAttributes - // and ExtProtoInfo. + SME_AttributeMask = + 0b111'111'11 // We can't support more than 8 bits because of + // the bitmask in FunctionTypeExtraBitfields. }; enum ArmStateValue : unsigned { @@ -4074,13 +4076,17 @@ class FunctionType : public Type { return (ArmStateValue)((AttrBits & SME_ZAMask) >> SME_ZAShift); } + static ArmStateValue getArmZT0State(unsigned AttrBits) { + return (ArmStateValue)((AttrBits & SME_ZT0Mask) >> SME_ZT0Shift); + } + /// A holder for Arm type attributes as described in the Arm C/C++ /// Language extensions which are not particularly common to all /// types and therefore accounted separately from FunctionTypeBitfields. struct alignas(void *) FunctionTypeArmAttributes { /// Any AArch64 SME ACLE type attributes that need to be propagated /// on declarations and function pointers. 
- unsigned AArch64SMEAttributes : 6; + unsigned AArch64SMEAttributes : 8; FunctionTypeArmAttributes() : AArch64SMEAttributes(SME_NormalFunction) {} }; @@ -4266,7 +4272,7 @@ class FunctionProtoType final FunctionType::ExtInfo ExtInfo; unsigned Variadic : 1; unsigned HasTrailingReturn : 1; - unsigned AArch64SMEAttributes : 6; + unsigned AArch64SMEAttributes : 8; Qualifiers TypeQuals; RefQualifierKind RefQualifier = RQ_None; ExceptionSpecInfo ExceptionSpec; diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index 78a9229aeaf08..58838b01b4fd7 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -2552,6 +2552,9 @@ def ArmNew : InheritableAttr, TargetSpecificAttr { bool isNewZA() const { return llvm::is_contained(newArgs(), "za"); } + bool isNewZT0() const { + return llvm::is_contained(newArgs(), "zt0"); + } }]; } diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 501968cb7d5f9..e027e754477fc 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3706,10 +3706,14 @@ def err_sme_call_in_non_sme_target : Error< "call to a streaming function requires 'sme'">; def err_sme_za_call_no_za_state : Error< "call to a shared ZA function requires the caller to have ZA state">; +def err_sme_zt0_call_no_zt0_state : Error< + "call to a shared ZT0 function requires the caller to have ZT0 state">; def err_sme_definition_using_sm_in_non_sme_target : Error< "function executed in streaming-SVE mode requires 'sme'">; def err_sme_definition_using_za_in_non_sme_target : Error< "function using ZA state requires 'sme'">; +def err_sme_definition_using_zt0_in_non_sme2_target : Error< + "function using ZT0 state requires 'sme2'">; def err_conflicting_attributes_arm_state : Error< "conflicting attributes for state '%0'">; def err_unknown_arm_state : Error< diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 1baf895ebaec2..80b42c8f84a00 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -951,6 +951,14 @@ void TypePrinter::printFunctionProtoAfter(const FunctionProtoType *T, OS << " __arm_out(\"za\")"; if (FunctionType::getArmZAState(SMEBits) == FunctionType::ARM_InOut) OS << " __arm_inout(\"za\")"; + if (FunctionType::getArmZT0State(SMEBits) == FunctionType::ARM_Preserves) + OS << " __arm_preserves(\"zt0\")"; + if (FunctionType::getArmZT0State(SMEBits) == FunctionType::ARM_In) + OS << " __arm_in(\"zt0\")"; + if (FunctionType::getArmZT0State(SMEBits) == FunctionType::ARM_Out) + OS << " __arm_out(\"zt0\")"; + if (FunctionType::getArmZT0State(SMEBits) == FunctionType::ARM_InOut) + OS << " __arm_inout(\"zt0\")"; printFunctionAfter(Info, OS); diff --git a/clang/lib/CodeGen/CGCall.cpp b/clang/lib/CodeGen/CGCall.cpp index acf6cbad1c748..28c211aa631e4 100644 --- a/clang/lib/CodeGen/CGCall.cpp +++ b/clang/lib/CodeGen/CGCall.cpp @@ -1782,6 +1782,16 @@ static void AddAttributesFromFunctionProtoType(ASTContext &Ctx, FuncAttrs.addAttribute("aarch64_pstate_za_shared"); FuncAttrs.addAttribute("aarch64_pstate_za_preserved"); } + + // ZT0 + if (FunctionType::getArmZT0State(SMEBits) == FunctionType::ARM_Preserves) + FuncAttrs.addAttribute("aarch64_preserves_zt0"); + if (FunctionType::getArmZT0State(SMEBits) == FunctionType::ARM_In) + FuncAttrs.addAttribute("aarch64_in_zt0"); + if (FunctionType::getArmZT0State(SMEBits) == FunctionType::ARM_Out) + 
FuncAttrs.addAttribute("aarch64_out_zt0"); + if (FunctionType::getArmZT0State(SMEBits) == FunctionType::ARM_InOut) + FuncAttrs.addAttribute("aarch64_inout_zt0"); } static void AddAttributesFromAssumes(llvm::AttrBuilder &FuncAttrs, diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index 78b2f2ba29fdf..1280bcd36de94 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -2415,6 +2415,8 @@ void CodeGenModule::SetLLVMFunctionAttributesForDefinition(const Decl *D, if (auto *Attr = D->getAttr()) { if (Attr->isNewZA()) B.addAttribute("aarch64_pstate_za_new"); + if (Attr->isNewZT0()) + B.addAttribute("aarch64_new_zt0"); } // Track whether we need to add the optnone LLVM attribute, diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 28f5667a1b6ba..1f83dcf07b6f9 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -7548,6 +7548,28 @@ void Sema::checkCall(NamedDecl *FDecl, const FunctionProtoType *Proto, if (!CallerHasZAState) Diag(Loc, diag::err_sme_za_call_no_za_state); } + + // If the callee uses AArch64 SME ZT0 state but the caller doesn't define + // any, then this is an error. + FunctionType::ArmStateValue ArmZT0State = + FunctionType::getArmZT0State(ExtInfo.AArch64SMEAttributes); + if (ArmZT0State != FunctionType::ARM_None) { + bool CallerHasZT0State = false; + if (const auto *CallerFD = dyn_cast(CurContext)) { + auto *Attr = CallerFD->getAttr(); + if (Attr && Attr->isNewZT0()) + CallerHasZT0State = true; + else if (const auto *FPT = + CallerFD->getType()->getAs()) + CallerHasZT0State = + FunctionType::getArmZT0State( + FPT->getExtProtoInfo().AArch64SMEAttributes) != + FunctionType::ARM_None; + } + + if (!CallerHasZT0State) + Diag(Loc, diag::err_sme_zt0_call_no_zt0_state); + } } if (FDecl && FDecl->hasAttr()) { diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index 13ca438e6a487..f9bf1d14bdc4f 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -12234,12 +12234,15 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, const auto *Attr = NewFD->getAttr(); bool UsesSM = NewFD->hasAttr(); bool UsesZA = Attr && Attr->isNewZA(); + bool UsesZT0 = Attr && Attr->isNewZT0(); if (const auto *FPT = NewFD->getType()->getAs()) { FunctionProtoType::ExtProtoInfo EPI = FPT->getExtProtoInfo(); UsesSM |= EPI.AArch64SMEAttributes & FunctionType::SME_PStateSMEnabledMask; UsesZA |= FunctionType::getArmZAState(EPI.AArch64SMEAttributes) != FunctionType::ARM_None; + UsesZT0 |= FunctionType::getArmZT0State(EPI.AArch64SMEAttributes) != + FunctionType::ARM_None; } if (UsesSM || UsesZA) { @@ -12254,6 +12257,14 @@ bool Sema::CheckFunctionDeclaration(Scope *S, FunctionDecl *NewFD, diag::err_sme_definition_using_za_in_non_sme_target); } } + if (UsesZT0) { + llvm::StringMap FeatureMap; + Context.getFunctionFeatureMap(FeatureMap, NewFD); + if (!FeatureMap.contains("sme2")) { + Diag(NewFD->getLocation(), + diag::err_sme_definition_using_zt0_in_non_sme2_target); + } + } } return Redeclaration; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 2a73567c2f051..6f462de4be78b 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -8994,6 +8994,7 @@ static void handleArmNewAttr(Sema &S, Decl *D, const ParsedAttr &AL) { } bool HasZA = false; + bool HasZT0 = false; for (unsigned I = 0, E = AL.getNumArgs(); I != E; ++I) { StringRef StateName; SourceLocation LiteralLoc; 
@@ -9002,6 +9003,8 @@ static void handleArmNewAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (StateName == "za") HasZA = true; + else if (StateName == "zt0") + HasZT0 = true; else { S.Diag(LiteralLoc, diag::err_unknown_arm_state) << StateName; AL.setInvalid(); @@ -9018,6 +9021,11 @@ static void handleArmNewAttr(Sema &S, Decl *D, const ParsedAttr &AL) { if (HasZA && ZAState != FunctionType::ARM_None && checkArmNewAttrMutualExclusion(S, AL, FPT, ZAState, "za")) return; + FunctionType::ArmStateValue ZT0State = + FunctionType::getArmZT0State(FPT->getAArch64SMEAttributes()); + if (HasZT0 && ZT0State != FunctionType::ARM_None && + checkArmNewAttrMutualExclusion(S, AL, FPT, ZT0State, "zt0")) + return; } D->dropAttr(); diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 933d8d6031363..9cb6c0a4ef248 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -7938,6 +7938,9 @@ static bool handleArmStateAttribute(Sema &S, if (StateName == "za") { Shift = FunctionType::SME_ZAShift; ExistingState = FunctionType::getArmZAState(EPI.AArch64SMEAttributes); + } else if (StateName == "zt0") { + Shift = FunctionType::SME_ZT0Shift; + ExistingState = FunctionType::getArmZT0State(EPI.AArch64SMEAttributes); } else { S.Diag(LiteralLoc, diag::err_unknown_arm_state) << StateName; Attr.setInvalid(); diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/aarch64-sme2-attrs.cpp b/clang/test/CodeGen/aarch64-sme2-intrinsics/aarch64-sme2-attrs.cpp new file mode 100644 index 0000000000000..1916f00e0eb91 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/aarch64-sme2-attrs.cpp @@ -0,0 +1,57 @@ +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 \ +// RUN: -S -disable-O0-optnone -Werror -emit-llvm -o - %s \ +// RUN: | opt -S -passes=mem2reg \ +// RUN: | opt -S -passes=inline \ +// RUN: | FileCheck %s + +// Test the attributes for ZT0 and their mappings to LLVM IR. 
+ +extern "C" { + +// CHECK-LABEL: @in_zt0() +// CHECK-SAME: #[[ZT0_IN:[0-9]+]] +void in_zt0() __arm_in("zt0") { } + +// CHECK-LABEL: @out_zt0() +// CHECK-SAME: #[[ZT0_OUT:[0-9]+]] +void out_zt0() __arm_out("zt0") { } + +// CHECK-LABEL: @inout_zt0() +// CHECK-SAME: #[[ZT0_INOUT:[0-9]+]] +void inout_zt0() __arm_inout("zt0") { } + +// CHECK-LABEL: @preserves_zt0() +// CHECK-SAME: #[[ZT0_PRESERVED:[0-9]+]] +void preserves_zt0() __arm_preserves("zt0") { } + +// CHECK-LABEL: @new_zt0() +// CHECK-SAME: #[[ZT0_NEW:[0-9]+]] +__arm_new("zt0") void new_zt0() { } + +// CHECK-LABEL: @in_za_zt0() +// CHECK-SAME: #[[ZA_ZT0_IN:[0-9]+]] +void in_za_zt0() __arm_in("za", "zt0") { } + +// CHECK-LABEL: @out_za_zt0() +// CHECK-SAME: #[[ZA_ZT0_OUT:[0-9]+]] +void out_za_zt0() __arm_out("za", "zt0") { } + +// CHECK-LABEL: @inout_za_zt0() +// CHECK-SAME: #[[ZA_ZT0_INOUT:[0-9]+]] +void inout_za_zt0() __arm_inout("za", "zt0") { } + +// CHECK-LABEL: @preserves_za_zt0() +// CHECK-SAME: #[[ZA_ZT0_PRESERVED:[0-9]+]] +void preserves_za_zt0() __arm_preserves("za", "zt0") { } + +// CHECK-LABEL: @new_za_zt0() +// CHECK-SAME: #[[ZA_ZT0_NEW:[0-9]+]] +__arm_new("za", "zt0") void new_za_zt0() { } + +} + +// CHECK: attributes #[[ZT0_IN]] = {{{.*}} "aarch64_in_zt0" {{.*}}} +// CHECK: attributes #[[ZT0_OUT]] = {{{.*}} "aarch64_out_zt0" {{.*}}} +// CHECK: attributes #[[ZT0_INOUT]] = {{{.*}} "aarch64_inout_zt0" {{.*}}} +// CHECK: attributes #[[ZT0_PRESERVED]] = {{{.*}} "aarch64_preserves_zt0" {{.*}}} +// CHECK: attributes #[[ZT0_NEW]] = {{{.*}} "aarch64_new_zt0" {{.*}}} diff --git a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp index 0a54a94f408b7..ec6bb6f503578 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp +++ b/clang/test/Sema/aarch64-sme-func-attrs-without-target-feature.cpp @@ -8,6 +8,8 @@ void shared_za_def() __arm_inout("za") { } // expected-error {{function using ZA __arm_new("za") void new_za_def() { } // expected-error {{function using ZA state requires 'sme'}} __arm_locally_streaming void locally_streaming_def() { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} void streaming_shared_za_def() __arm_streaming __arm_inout("za") { } // expected-error {{function executed in streaming-SVE mode requires 'sme'}} +void inout_za_def() __arm_inout("za") { } // expected-error {{function using ZA state requires 'sme'}} +void inout_zt0_def() __arm_inout("zt0") { } // expected-error {{function using ZT0 state requires 'sme2'}} // It should work fine when we explicitly add the target("sme") attribute. 
__attribute__((target("sme"))) void streaming_compatible_def_sme_attr() __arm_streaming_compatible {} // OK diff --git a/clang/test/Sema/aarch64-sme-func-attrs.c b/clang/test/Sema/aarch64-sme-func-attrs.c index b986b0b3de2e1..97409ae7d6040 100644 --- a/clang/test/Sema/aarch64-sme-func-attrs.c +++ b/clang/test/Sema/aarch64-sme-func-attrs.c @@ -353,6 +353,11 @@ void invalid_arm_in_unknown_state(void) __arm_in("unknownstate"); void valid_state_attrs_in_in1(void) __arm_in("za"); void valid_state_attrs_in_in2(void) __arm_in("za", "za"); +void valid_state_attrs_in_in3(void) __arm_in("zt0"); +void valid_state_attrs_in_in4(void) __arm_in("zt0", "zt0"); +void valid_state_attrs_in_in5(void) __arm_in("za", "zt0"); +__arm_new("za") void valid_state_attrs_in_in6(void) __arm_in("zt0"); +__arm_new("zt0") void valid_state_attrs_in_in7(void) __arm_in("za"); // expected-cpp-error@+2 {{missing state for '__arm_in'}} // expected-error@+1 {{missing state for '__arm_in'}} @@ -400,3 +405,43 @@ void conflicting_state_attrs_preserves_out(void) __arm_preserves("za") __arm_out // expected-cpp-error@+2 {{conflicting attributes for state 'za'}} // expected-error@+1 {{conflicting attributes for state 'za'}} void conflicting_state_attrs_preserves_inout(void) __arm_preserves("za") __arm_inout("za"); + +// expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_in_out_zt0(void) __arm_in("zt0") __arm_out("zt0"); +// expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_in_inout_zt0(void) __arm_in("zt0") __arm_inout("zt0"); +// expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_in_preserves_zt0(void) __arm_in("zt0") __arm_preserves("zt0"); + +// expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_out_in_zt0(void) __arm_out("zt0") __arm_in("zt0"); +// expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_out_inout_zt0(void) __arm_out("zt0") __arm_inout("zt0"); +// expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_out_preserves_zt0(void) __arm_out("zt0") __arm_preserves("zt0"); + +// expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_inout_in_zt0(void) __arm_inout("zt0") __arm_in("zt0"); +// expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_inout_out_zt0(void) __arm_inout("zt0") __arm_out("zt0"); +// expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_inout_preserves_zt0(void) __arm_inout("zt0") __arm_preserves("zt0"); + +// expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_preserves_in_zt0(void) __arm_preserves("zt0") __arm_in("zt0"); +// expected-cpp-error@+2 
{{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_preserves_out_zt0(void) __arm_preserves("zt0") __arm_out("zt0"); +// expected-cpp-error@+2 {{conflicting attributes for state 'zt0'}} +// expected-error@+1 {{conflicting attributes for state 'zt0'}} +void conflicting_state_attrs_preserves_inout_zt0(void) __arm_preserves("zt0") __arm_inout("zt0"); From f47c4067fd30ef4318f6ff4d4afafd1ef8a2e262 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 23 Jan 2024 11:37:03 +0000 Subject: [PATCH 599/843] [PhaseOrder] Add test where indvars dropping NSW prevents vectorization. End-to-end test for https://github.com/llvm/llvm-project/issues/71517, testing IndVars/LoopVectorize interaction --- .../AArch64/indvars-vectorization.ll | 68 +++++++++++++++++++ 1 file changed, 68 insertions(+) create mode 100644 llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll new file mode 100644 index 0000000000000..a7e8e15804117 --- /dev/null +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/indvars-vectorization.ll @@ -0,0 +1,68 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes='default' -S -o - %s | FileCheck %s + +target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" +target triple = "arm64-apple-macosx14.0.0" + +define void @s172(i32 noundef %xa, i32 noundef %xb, ptr noundef %a, ptr noundef %b) { +; CHECK-LABEL: define void @s172( +; CHECK-SAME: i32 noundef [[XA:%.*]], i32 noundef [[XB:%.*]], ptr nocapture noundef [[A:%.*]], ptr nocapture noundef readonly [[B:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp slt i32 [[XA]], 32001 +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[SUB:%.*]] = add i32 [[XA]], -1 +; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[SUB]] to i64 +; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[XB]] to i64 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[TMP0]], [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[GEP_B:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[L_B:%.*]] = load i32, ptr [[GEP_B]], align 4 +; CHECK-NEXT: [[GEP_A:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[INDVARS_IV]] +; CHECK-NEXT: [[L_A:%.*]] = load i32, ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[L_A]], [[L_B]] +; CHECK-NEXT: store i32 [[ADD]], ptr [[GEP_A]], align 4 +; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add i64 [[INDVARS_IV]], [[TMP1]] +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i64 [[INDVARS_IV_NEXT]], 32000 +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + %sub = sub nsw i32 %xa, 1 + br label %for.cond + +for.cond: + %i.0 = phi i32 [ %sub, %entry ], [ %add3, %for.inc ] + %cmp = icmp slt i32 %i.0, 32000 + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.body: + %idxprom = sext i32 %i.0 to i64 + %gep.b = getelementptr inbounds i32, ptr %b, i64 %idxprom + %l.b = load i32, ptr %gep.b, align 4 + %idxprom1 = sext i32 %i.0 to i64 + %gep.a = getelementptr inbounds i32, ptr %a, i64 %idxprom1 + %l.a = load i32, ptr %gep.a , 
align 4 + %add = add nsw i32 %l.a, %l.b + store i32 %add, ptr %gep.a, align 4 + br label %for.inc + +for.inc: + %add3 = add nsw i32 %i.0, %xb + br label %for.cond, !llvm.loop !0 + +for.cond.cleanup: + br label %for.end + +for.end: + ret void +} + +!0 = distinct !{!0, !1} +!1 = !{!"llvm.loop.mustprogress"} +;. +; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]]} +; CHECK: [[META1]] = !{!"llvm.loop.mustprogress"} +;. From 082f87c9d418eb7b5a731e16b50e6649e387cb5a Mon Sep 17 00:00:00 2001 From: Saiyedul Islam Date: Tue, 23 Jan 2024 17:08:18 +0530 Subject: [PATCH 600/843] [AMDGPU] Change default AMDHSA Code Object version to 5 (#79038) Also update LIT tests and docs. For more details, see https://llvm.org/docs/AMDGPUUsage.html#code-object-v5-metadata Corresponding llvm-objdump AMDGPU lit tests are updated in a follow-up PR. --- clang/docs/ReleaseNotes.rst | 3 +++ clang/include/clang/Driver/Options.td | 4 ++-- clang/test/CodeGen/amdgpu-address-spaces.cpp | 2 +- .../CodeGenCUDA/amdgpu-code-object-version.cu | 2 +- clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu | 4 ++-- clang/test/CodeGenHIP/default-attributes.hip | 4 ++-- clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl | 4 ++-- clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 10 +++++----- flang/test/Driver/driver-help-hidden.f90 | 2 +- flang/test/Driver/driver-help.f90 | 4 ++-- llvm/docs/AMDGPUUsage.rst | 15 +++++++-------- llvm/docs/ReleaseNotes.rst | 2 ++ llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 2 +- .../Dialect/GPU/Transforms/SerializeToHsaco.cpp | 2 +- .../Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp | 1 + mlir/test/Target/LLVMIR/rocdl.mlir | 2 +- 16 files changed, 34 insertions(+), 29 deletions(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 01c4ee97662b6..060bc7669b72a 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -1118,6 +1118,9 @@ AMDGPU Support arguments in C ABI. Callee is responsible for allocating stack memory and copying the value of the struct if modified. Note that AMDGPU backend still supports byval for struct arguments. +- The default value for ``-mcode-object-version`` is now 5. + See `AMDHSA Code Object V5 Metadata `_ + for more details. X86 Support ^^^^^^^^^^^ diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 819f6f1a15c3f..f203a0fe7ede1 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -4781,12 +4781,12 @@ defm amdgpu_ieee : BoolOption<"m", "amdgpu-ieee", NegFlag>, Group; def mcode_object_version_EQ : Joined<["-"], "mcode-object-version=">, Group, - HelpText<"Specify code object ABI version. Defaults to 4. (AMDGPU only)">, + HelpText<"Specify code object ABI version. Defaults to 5. 
(AMDGPU only)">, Visibility<[ClangOption, FlangOption, CC1Option, FC1Option]>, Values<"none,4,5">, NormalizedValuesScope<"llvm::CodeObjectVersionKind">, NormalizedValues<["COV_None", "COV_4", "COV_5"]>, - MarshallingInfoEnum, "COV_4">; + MarshallingInfoEnum, "COV_5">; defm cumode : SimpleMFlag<"cumode", "Specify CU wavefront", "Specify WGP wavefront", diff --git a/clang/test/CodeGen/amdgpu-address-spaces.cpp b/clang/test/CodeGen/amdgpu-address-spaces.cpp index 0a808aa6cc75e..ae2c61439f4ca 100644 --- a/clang/test/CodeGen/amdgpu-address-spaces.cpp +++ b/clang/test/CodeGen/amdgpu-address-spaces.cpp @@ -29,7 +29,7 @@ int [[clang::address_space(999)]] bbb = 1234; // CHECK: @u = addrspace(5) global i32 undef, align 4 // CHECK: @aaa = addrspace(6) global i32 1000, align 4 // CHECK: @bbb = addrspace(999) global i32 1234, align 4 -// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 400 +// CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 //. // CHECK-LABEL: define dso_local amdgpu_kernel void @foo( // CHECK-SAME: ) #[[ATTR0:[0-9]+]] { diff --git a/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu b/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu index ff5deaf9ab850..3cb6632fc0b63 100644 --- a/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu +++ b/clang/test/CodeGenCUDA/amdgpu-code-object-version.cu @@ -1,7 +1,7 @@ // Create module flag for code object version. // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ -// RUN: -o - %s | FileCheck %s -check-prefix=V4 +// RUN: -o - %s | FileCheck %s -check-prefix=V5 // RUN: %clang_cc1 -fcuda-is-device -triple amdgcn-amd-amdhsa -emit-llvm \ // RUN: -mcode-object-version=4 -o - %s | FileCheck -check-prefix=V4 %s diff --git a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu index 282e0a49b9aa1..0c846e0936b58 100644 --- a/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu +++ b/clang/test/CodeGenCUDA/amdgpu-workgroup-size.cu @@ -1,10 +1,10 @@ // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ -// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ +// RUN: -fcuda-is-device -mcode-object-version=4 -emit-llvm -o - -x hip %s \ // RUN: | FileCheck -check-prefix=PRECOV5 %s // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ -// RUN: -fcuda-is-device -mcode-object-version=5 -emit-llvm -o - -x hip %s \ +// RUN: -fcuda-is-device -emit-llvm -o - -x hip %s \ // RUN: | FileCheck -check-prefix=COV5 %s // RUN: %clang_cc1 -triple amdgcn-amd-amdhsa \ diff --git a/clang/test/CodeGenHIP/default-attributes.hip b/clang/test/CodeGenHIP/default-attributes.hip index 80aa1ee070062..9c9ea521271b9 100644 --- a/clang/test/CodeGenHIP/default-attributes.hip +++ b/clang/test/CodeGenHIP/default-attributes.hip @@ -46,11 +46,11 @@ __global__ void kernel() { // OPT: attributes #0 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "no-trapping-math"="true" "stack-protector-buffer-size"="8" } // OPT: attributes #1 = { mustprogress nofree norecurse nosync nounwind willreturn memory(none) "amdgpu-flat-work-group-size"="1,1024" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "uniform-work-group-size"="true" } //. -// OPTNONE: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +// OPTNONE: !0 = !{i32 1, !"amdgpu_code_object_version", i32 500} // OPTNONE: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} // OPTNONE: !2 = !{i32 1, !"wchar_size", i32 4} //. 
-// OPT: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +// OPT: !0 = !{i32 1, !"amdgpu_code_object_version", i32 500} // OPT: !1 = !{i32 1, !"amdgpu_printf_kind", !"hostcall"} // OPT: !2 = !{i32 1, !"wchar_size", i32 4} //. diff --git a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl index e574b1f64c499..2cf1286e2b54e 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-enqueue-kernel.cl @@ -703,7 +703,7 @@ kernel void test_target_features_kernel(global int *i) { // GFX900: attributes #8 = { nounwind } // GFX900: attributes #9 = { convergent nounwind } //. -// NOCPU: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +// NOCPU: !0 = !{i32 1, !"amdgpu_code_object_version", i32 500} // NOCPU: !1 = !{i32 1, !"wchar_size", i32 4} // NOCPU: !2 = !{i32 2, i32 0} // NOCPU: !3 = !{i32 1, i32 0, i32 1, i32 0} @@ -721,7 +721,7 @@ kernel void test_target_features_kernel(global int *i) { // NOCPU: !15 = !{i32 1} // NOCPU: !16 = !{!"int*"} //. -// GFX900: !0 = !{i32 1, !"amdgpu_code_object_version", i32 400} +// GFX900: !0 = !{i32 1, !"amdgpu_code_object_version", i32 500} // GFX900: !1 = !{i32 1, !"wchar_size", i32 4} // GFX900: !2 = !{i32 2, i32 0} // GFX900: !3 = !{!4, !4, i64 0} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl index 0bc9a54682d3e..8d9e4e018b12e 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl @@ -601,13 +601,13 @@ void test_get_local_id(int d, global int *out) } // CHECK-LABEL: @test_get_workgroup_size( -// CHECK: call align 4 dereferenceable(64) ptr addrspace(4) @llvm.amdgcn.dispatch.ptr() -// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 4 +// CHECK: call align 8 dereferenceable(256) ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr() +// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 12 // CHECK: load i16, ptr addrspace(4) %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef -// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 6 +// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 14 // CHECK: load i16, ptr addrspace(4) %{{.*}}, align 2, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef -// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 8 -// CHECK: load i16, ptr addrspace(4) %{{.*}}, align 4, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef +// CHECK: getelementptr inbounds i8, ptr addrspace(4) %{{.*}}, i64 16 +// CHECK: load i16, ptr addrspace(4) %{{.*}}, align 8, !range [[$WS_RANGE:![0-9]*]], !invariant.load{{.*}}, !noundef void test_get_workgroup_size(int d, global int *out) { switch (d) { diff --git a/flang/test/Driver/driver-help-hidden.f90 b/flang/test/Driver/driver-help-hidden.f90 index 426b0e5a1c367..25dfcf3c70d8e 100644 --- a/flang/test/Driver/driver-help-hidden.f90 +++ b/flang/test/Driver/driver-help-hidden.f90 @@ -117,7 +117,7 @@ ! CHECK-NEXT: -L Add directory to library search path ! CHECK-NEXT: -march= For a list of available architectures for the target use '-mcpu=help' ! CHECK-NEXT: -mcode-object-version= -! CHECK-NEXT: Specify code object ABI version. Defaults to 4. (AMDGPU only) +! CHECK-NEXT: Specify code object ABI version. Defaults to 5. (AMDGPU only) ! CHECK-NEXT: -mcpu= For a list of available CPUs for the target use '-mcpu=help' ! CHECK-NEXT: -mllvm= Alias for -mllvm ! 
CHECK-NEXT: -mllvm Additional arguments to forward to LLVM's option processing diff --git a/flang/test/Driver/driver-help.f90 b/flang/test/Driver/driver-help.f90 index 221da6439764b..9f0aae51c3f5d 100644 --- a/flang/test/Driver/driver-help.f90 +++ b/flang/test/Driver/driver-help.f90 @@ -103,7 +103,7 @@ ! HELP-NEXT: -L Add directory to library search path ! HELP-NEXT: -march= For a list of available architectures for the target use '-mcpu=help' ! HELP-NEXT: -mcode-object-version= -! HELP-NEXT: Specify code object ABI version. Defaults to 4. (AMDGPU only) +! HELP-NEXT: Specify code object ABI version. Defaults to 5. (AMDGPU only) ! HELP-NEXT: -mcpu= For a list of available CPUs for the target use '-mcpu=help' ! HELP-NEXT: -mllvm= Alias for -mllvm ! HELP-NEXT: -mllvm Additional arguments to forward to LLVM's option processing @@ -240,7 +240,7 @@ ! HELP-FC1-NEXT: -I Add directory to the end of the list of include search paths ! HELP-FC1-NEXT: -load Load the named plugin (dynamic shared object) ! HELP-FC1-NEXT: -mcode-object-version= -! HELP-FC1-NEXT: Specify code object ABI version. Defaults to 4. (AMDGPU only) +! HELP-FC1-NEXT: Specify code object ABI version. Defaults to 5. (AMDGPU only) ! HELP-FC1-NEXT: -menable-no-infs Allow optimization to assume there are no infinities. ! HELP-FC1-NEXT: -menable-no-nans Allow optimization to assume there are no NaNs. ! HELP-FC1-NEXT: -mframe-pointer= Specify which frame pointers to retain. diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 548d677afdecb..6b2417143ca06 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -1510,12 +1510,12 @@ The AMDGPU backend uses the following ELF header: * ``ELFABIVERSION_AMDGPU_HSA_V4`` is used to specify the version of AMD HSA runtime ABI for code object V4. Specify using the Clang option - ``-mcode-object-version=4``. This is the default code object - version if not specified. + ``-mcode-object-version=4``. * ``ELFABIVERSION_AMDGPU_HSA_V5`` is used to specify the version of AMD HSA runtime ABI for code object V5. Specify using the Clang option - ``-mcode-object-version=5``. + ``-mcode-object-version=5``. This is the default code object + version if not specified. * ``ELFABIVERSION_AMDGPU_PAL`` is used to specify the version of AMD PAL runtime ABI. @@ -3949,6 +3949,10 @@ same *vendor-name*. Code Object V4 Metadata +++++++++++++++++++++++ +. warning:: + Code object V4 is not the default code object version emitted by this version + of LLVM. + Code object V4 metadata is the same as :ref:`amdgpu-amdhsa-code-object-metadata-v3` with the changes and additions defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v4`. @@ -3979,11 +3983,6 @@ defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v4`. Code Object V5 Metadata +++++++++++++++++++++++ -.. warning:: - Code object V5 is not the default code object version emitted by this version - of LLVM. - - Code object V5 metadata is the same as :ref:`amdgpu-amdhsa-code-object-metadata-v4` with the changes defined in table :ref:`amdgpu-amdhsa-code-object-metadata-map-table-v5`, table diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 8d75b6415e557..4ef28e277228c 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -115,6 +115,8 @@ Changes to the AMDGPU Backend * Implemented :ref:`llvm.get.rounding ` +* The default :ref:`AMDHSA code object version ` is now 5. 
+ Changes to the ARM Backend -------------------------- diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index f1c05446bf606..0bf9452d822e9 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -33,7 +33,7 @@ static llvm::cl::opt DefaultAMDHSACodeObjectVersion( "amdhsa-code-object-version", llvm::cl::Hidden, - llvm::cl::init(llvm::AMDGPU::AMDHSA_COV4), + llvm::cl::init(llvm::AMDGPU::AMDHSA_COV5), llvm::cl::desc("Set default AMDHSA Code Object Version (module flag " "or asm directive still take priority if present)")); diff --git a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp index 5cce7befce528..eee7a680f5b3b 100644 --- a/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp +++ b/mlir/lib/Dialect/GPU/Transforms/SerializeToHsaco.cpp @@ -264,7 +264,7 @@ SerializeToHsacoPass::translateToLLVMIR(llvm::LLVMContext &llvmContext) { // This constant must always match the default code object ABI version // of the AMDGPU backend. - addControlConstant("__oclc_ABI_version", 400, 32); + addControlConstant("__oclc_ABI_version", 500, 32); } // Determine libraries we need to link - order matters due to dependencies diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp index cbce23fd580e7..a230ead7c1883 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp @@ -99,6 +99,7 @@ class ROCDLDialectLLVMIRTranslationInterface if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) { llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256"); } + llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "256"); } // Override flat-work-group-size // TODO: update clients to rocdl.flat_work_group_size instead, diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 3c9c70711ae23..f831d7bba864c 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -489,7 +489,7 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 { llvm.return %source5 : i32 } -// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" } +// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="256" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128" // CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64} From 33ec6b3de7f72bc5da6351538f713777c1f8a199 Mon Sep 17 00:00:00 2001 From: Saiyedul Islam Date: Tue, 23 Jan 2024 17:08:38 +0530 Subject: [PATCH 601/843] [AMDGPU] Update llvm-objdump lit tests for COV5 (#79039) Depends on #79038 which makes cov5 as the default code object version. 
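
The visible effect on each expected-output block below is a single new
line: with code object V5 as the default, the kernel descriptor dump also
contains a `.amdhsa_uses_dynamic_stack` directive. A rough sketch of the
version gating, illustrative only (this is not the actual llvm-objdump
code, and the helper name is made up):

  #include "llvm/Support/raw_ostream.h"

  // Hypothetical helper: emit the directive only for ABI versions that
  // define the "uses dynamic stack" kernel-descriptor bit (COV5 onwards).
  static void printDynamicStackDirective(llvm::raw_ostream &OS,
                                         unsigned CodeObjectVersion,
                                         bool UsesDynamicStack) {
    if (CodeObjectVersion >= 5)
      OS << "    .amdhsa_uses_dynamic_stack " << (UsesDynamicStack ? 1 : 0)
         << '\n';
  }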
--- llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s | 4 ++++ llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s | 4 ++++ llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s | 2 ++ llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s | 3 +++ llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s | 3 +++ llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s | 3 +++ llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s | 1 + 7 files changed, 20 insertions(+) diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s index 58b01031afe38..781729d5c4cc1 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx10.s @@ -48,6 +48,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 1 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -101,6 +102,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -154,6 +156,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -207,6 +210,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s index 2133002908d9f..019c20754f389 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx11.s @@ -49,6 +49,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 1 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -103,6 +104,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -157,6 +159,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -211,6 +214,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s index e1d312d6035cb..86af4810059ec 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s +++ 
b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx12.s @@ -46,6 +46,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 1 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -97,6 +98,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; CHECK-NEXT: .amdhsa_wavefront_size32 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s index d26189451829f..4978f6974fd33 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-gfx90a.s @@ -45,6 +45,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 0 @@ -95,6 +96,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 @@ -145,6 +147,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_length 2 ; CHECK-NEXT: .amdhsa_user_sgpr_kernarg_preload_offset 1 ; CHECK-NEXT: .end_amdhsa_kernel diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s index 1f6f134cd67ef..a40cf1d377693 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-sgpr.s @@ -44,6 +44,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 0 @@ -95,6 +96,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 0 @@ -146,6 +148,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 0 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s index 4d385a1c88578..b6b9c91b14246 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-vgpr.s @@ -43,6 +43,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel 
.amdhsa_next_free_vgpr 23 @@ -90,6 +91,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 14 @@ -137,6 +139,7 @@ ; CHECK-NEXT: .amdhsa_user_sgpr_dispatch_id 0 ; CHECK-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; CHECK-NEXT: .amdhsa_user_sgpr_private_segment_size 0 +; CHECK-NEXT: .amdhsa_uses_dynamic_stack 0 ; CHECK-NEXT: .end_amdhsa_kernel .amdhsa_kernel kernel .amdhsa_next_free_vgpr 32 diff --git a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s index 39cef4da4278d..39739c957350c 100644 --- a/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s +++ b/llvm/test/tools/llvm-objdump/ELF/AMDGPU/kd-zeroed-gfx10.s @@ -62,6 +62,7 @@ ; OBJDUMP-NEXT: .amdhsa_user_sgpr_flat_scratch_init 0 ; OBJDUMP-NEXT: .amdhsa_user_sgpr_private_segment_size 0 ; OBJDUMP-NEXT: .amdhsa_wavefront_size32 0 +; OBJDUMP-NEXT: .amdhsa_uses_dynamic_stack 0 ; OBJDUMP-NEXT: .end_amdhsa_kernel .amdhsa_kernel my_kernel From 7ec078ed4b7f6e7f44133a2abbc35c74b67787cc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 23 Jan 2024 13:39:48 +0200 Subject: [PATCH 602/843] [Support] Avoid a VirtualBox shared folders mmap bug (#78597) In acd8791c2619f2afc0347c1bff073b32fbffb5d6, a call to FlushFileBuffers was added to work around a rare kernel bug. In 3b9b4d2156673edda50584086fbfb0d66460b4d2, the scope of that workaround was limited, for performance reasons, as the flushes are quite expensive. On VirtualBox shared folders, closing a memory mapping that has been written to, also needs to be explicitly flushed, if renaming the output file before it is closed. Contrary to the kernel bug, this always happens on such mounts. In these cases, the output ends up as a file of the right size, but the contents are all zeros. The sequence to trigger the issue on the VirtualBox Shared Folders is this, summarized: file = CreateFile() mapping = CreateFileMapping(file) mem = MapViewOfFile() CloseHandle(mapping) write(mem) UnmapViewOfFile(mem) SetFileInformationByHandle(file, FileRenameInfo) CloseHandle(file) With this sequence, the output file always ends up with all zeros. See https://github.com/mstorsjo/llvm-mingw/issues/393 for a full reproduction example. To avoid this issue, call FlushFileBuffers() when the file may reside on a VitualBox shared folder. As the flushes are expensive, only do them when the output isn't on a local file system. The issue with VirtualBox shared folders could also be fixed by calling FlushViewOfFile before UnmapViewOfFile, and doing that could be slightly less expensive than FlushFileBuffers. Empirically, the difference between the two is very small though, and as it's not easy to verify whether switching FlushFileBuffers to FlushViewOfFile helps with the rare kernel bug, keep using FlushFileBuffers for both cases, for code simplicity. This fixes downstream bug https://github.com/mstorsjo/llvm-mingw/issues/393. 
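
Spelled out as a compilable program, the summarized sequence above looks
roughly like the following. This is a repro sketch under assumptions, not
code from this patch: the file names and mapping size are arbitrary, the
rename target would normally be a full path, and all error handling is
omitted.

  #include <windows.h>
  #include <cstring>
  #include <cwchar>

  int main() {
    // file = CreateFile()
    HANDLE File = ::CreateFileW(L"out.tmp", GENERIC_READ | GENERIC_WRITE, 0,
                                nullptr, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL,
                                nullptr);
    // mapping = CreateFileMapping(file); mem = MapViewOfFile()
    HANDLE Mapping =
        ::CreateFileMappingW(File, nullptr, PAGE_READWRITE, 0, 4096, nullptr);
    void *Mem = ::MapViewOfFile(Mapping, FILE_MAP_WRITE, 0, 0, 4096);
    ::CloseHandle(Mapping);
    // write(mem); UnmapViewOfFile(mem)
    std::memset(Mem, 0x55, 4096);
    ::UnmapViewOfFile(Mem);
    // SetFileInformationByHandle(file, FileRenameInfo)
    alignas(FILE_RENAME_INFO) char Buf[sizeof(FILE_RENAME_INFO) +
                                       32 * sizeof(wchar_t)] = {};
    auto *Rename = reinterpret_cast<FILE_RENAME_INFO *>(Buf);
    Rename->ReplaceIfExists = TRUE;
    std::wcscpy(Rename->FileName, L"out.bin");
    Rename->FileNameLength = DWORD(std::wcslen(L"out.bin") * sizeof(wchar_t));
    ::SetFileInformationByHandle(File, FileRenameInfo, Rename, sizeof(Buf));
    // Without an explicit FlushFileBuffers(File) before this close, the
    // renamed file on a VirtualBox shared folder comes out the right size
    // but filled with zeros.
    ::CloseHandle(File);
  }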
--- llvm/lib/Support/Windows/Path.inc | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc index 7f5e76779412c..3e4c1f74161c6 100644 --- a/llvm/lib/Support/Windows/Path.inc +++ b/llvm/lib/Support/Windows/Path.inc @@ -961,7 +961,8 @@ void mapped_file_region::unmapImpl() { ::UnmapViewOfFile(Mapping); - if (Mode == mapmode::readwrite && Exe && hasFlushBufferKernelBug()) { + if (Mode == mapmode::readwrite) { + bool DoFlush = Exe && hasFlushBufferKernelBug(); // There is a Windows kernel bug, the exact trigger conditions of which // are not well understood. When triggered, dirty pages are not properly // flushed and subsequent process's attempts to read a file can return @@ -969,7 +970,27 @@ void mapped_file_region::unmapImpl() { // sufficient to ensure that this bug is not triggered. // The bug only occurs when writing an executable and executing it right // after, under high I/O pressure. - ::FlushFileBuffers(FileHandle); + if (!DoFlush) { + // Separately, on VirtualBox Shared Folder mounts, writes via memory + // maps always end up unflushed (regardless of version of Windows), + // unless flushed with this explicit call, if they are renamed with + // SetFileInformationByHandle(FileRenameInfo) before closing the output + // handle. + // + // As the flushing is quite expensive, use a heuristic to limit the + // cases where we do the flushing. Only do the flushing if we aren't + // sure we are on a local file system. + bool IsLocal = false; + SmallVector FinalPath; + if (!realPathFromHandle(FileHandle, FinalPath)) { + // Not checking the return value here - if the check fails, assume the + // file isn't local. + is_local_internal(FinalPath, IsLocal); + } + DoFlush = !IsLocal; + } + if (DoFlush) + ::FlushFileBuffers(FileHandle); } ::CloseHandle(FileHandle); From e3d73ad58c41b945d9fc5d5fb16ea44850ccf652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Storsj=C3=B6?= Date: Tue, 23 Jan 2024 13:42:24 +0200 Subject: [PATCH 603/843] [clang-repl] Fix CMake errors when cross compiling These stem from 4821c90c24d52d4a42990fd9371caedb157bc58b. When cross compiling, CMAKE_SYSTEM_PROCESSOR is empty, if the target processor hasn't been set when setting up the cross compilation. Ideally, when setting up cross compilation with CMake, the user is supposed to set CMAKE_SYSTEM_PROCESSOR, but so far, compilation has worked just fine even without it. Quote the string where CMAKE_SYSTEM_PROCESSOR is expanded, to avoid argument errors when it expands to an empty string. --- clang/tools/clang-repl/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt index 2a0f617a2c0ff..031dcaba5e446 100644 --- a/clang/tools/clang-repl/CMakeLists.txt +++ b/clang/tools/clang-repl/CMakeLists.txt @@ -23,7 +23,7 @@ if(CLANG_PLUGIN_SUPPORT) export_executable_symbols_for_plugins(clang-repl) endif() -string(TOUPPER ${CMAKE_SYSTEM_PROCESSOR} system_processor) +string(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" system_processor) if(system_processor MATCHES "ARM") set(FLAG_LONG_PLT "-Wl,--long-plt") llvm_check_linker_flag(CXX ${FLAG_LONG_PLT} LINKER_HAS_FLAG_LONG_PLT) From 5daf674feba0f57b083113ad7ed486cad433a916 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 23 Jan 2024 11:38:10 +0000 Subject: [PATCH 604/843] ProfileSummary.h - remove unnecessary std::move. 
The constructor args are passed by reference since d6790a0a3ce769fc6f37b2577c074d30cb92dbc2 Fixes MSVC static analysis warning --- llvm/include/llvm/IR/ProfileSummary.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/include/llvm/IR/ProfileSummary.h b/llvm/include/llvm/IR/ProfileSummary.h index 4bb6bb8d4a405..b23d5a30846a2 100644 --- a/llvm/include/llvm/IR/ProfileSummary.h +++ b/llvm/include/llvm/IR/ProfileSummary.h @@ -73,10 +73,10 @@ class ProfileSummary { uint64_t MaxInternalCount, uint64_t MaxFunctionCount, uint32_t NumCounts, uint32_t NumFunctions, bool Partial = false, double PartialProfileRatio = 0) - : PSK(K), DetailedSummary(std::move(DetailedSummary)), - TotalCount(TotalCount), MaxCount(MaxCount), - MaxInternalCount(MaxInternalCount), MaxFunctionCount(MaxFunctionCount), - NumCounts(NumCounts), NumFunctions(NumFunctions), Partial(Partial), + : PSK(K), DetailedSummary(DetailedSummary), TotalCount(TotalCount), + MaxCount(MaxCount), MaxInternalCount(MaxInternalCount), + MaxFunctionCount(MaxFunctionCount), NumCounts(NumCounts), + NumFunctions(NumFunctions), Partial(Partial), PartialProfileRatio(PartialProfileRatio) {} Kind getKind() const { return PSK; } From 82f424f766be00b037a706a835d0a0663a2680f1 Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Tue, 23 Jan 2024 07:06:07 -0500 Subject: [PATCH 605/843] Revert "[clang][modules] Print library module manifest path. (#76451)" This reverts commit a301fb11014f9cfdf4ee8cada173c46a7677d9d3. The changes caused failures like: https://lab.llvm.org/buildbot/#/builders/91/builds/22189 --- clang/include/clang/Driver/Driver.h | 10 ----- clang/include/clang/Driver/Options.td | 3 -- clang/lib/Driver/Driver.cpp | 44 ------------------- ...les-print-library-module-manifest-path.cpp | 36 --------------- 4 files changed, 93 deletions(-) delete mode 100644 clang/test/Driver/modules-print-library-module-manifest-path.cpp diff --git a/clang/include/clang/Driver/Driver.h b/clang/include/clang/Driver/Driver.h index 595f104e406d3..3ee1bcf2a69c9 100644 --- a/clang/include/clang/Driver/Driver.h +++ b/clang/include/clang/Driver/Driver.h @@ -602,16 +602,6 @@ class Driver { // FIXME: This should be in CompilationInfo. std::string GetProgramPath(StringRef Name, const ToolChain &TC) const; - /// Lookup the path to the Standard library module manifest. - /// - /// \param C - The compilation. - /// \param TC - The tool chain for additional information on - /// directories to search. - // - // FIXME: This should be in CompilationInfo. - std::string GetStdModuleManifestPath(const Compilation &C, - const ToolChain &TC) const; - /// HandleAutocompletions - Handle --autocomplete by searching and printing /// possible flags, descriptions, and its arguments. 
void HandleAutocompletions(StringRef PassedFlags) const; diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f203a0fe7ede1..7f4fa33748fac 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5310,9 +5310,6 @@ def print_resource_dir : Flag<["-", "--"], "print-resource-dir">, def print_search_dirs : Flag<["-", "--"], "print-search-dirs">, HelpText<"Print the paths used for finding libraries and programs">, Visibility<[ClangOption, CLOption]>; -def print_std_module_manifest_path : Flag<["-", "--"], "print-library-module-manifest-path">, - HelpText<"Print the path for the C++ Standard library module manifest">, - Visibility<[ClangOption, CLOption]>; def print_targets : Flag<["-", "--"], "print-targets">, HelpText<"Print the registered targets">, Visibility<[ClangOption, CLOption]>; diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 5c170915d58d5..7109faa1072de 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -2194,12 +2194,6 @@ bool Driver::HandleImmediateArgs(const Compilation &C) { return false; } - if (C.getArgs().hasArg(options::OPT_print_std_module_manifest_path)) { - llvm::outs() << GetStdModuleManifestPath(C, C.getDefaultToolChain()) - << '\n'; - return false; - } - if (C.getArgs().hasArg(options::OPT_print_runtime_dir)) { if (std::optional RuntimePath = TC.getRuntimePath()) llvm::outs() << *RuntimePath << '\n'; @@ -6172,44 +6166,6 @@ std::string Driver::GetProgramPath(StringRef Name, const ToolChain &TC) const { return std::string(Name); } -std::string Driver::GetStdModuleManifestPath(const Compilation &C, - const ToolChain &TC) const { - std::string error = ""; - - switch (TC.GetCXXStdlibType(C.getArgs())) { - case ToolChain::CST_Libcxx: { - std::string lib = GetFilePath("libc++.so", TC); - - // Note when there are multiple flavours of libc++ the module json needs to - // look at the command-line arguments for the proper json. - // These flavours do not exist at the moment, but there are plans to - // provide a variant that is built with sanitizer instrumentation enabled. - - // For example - // StringRef modules = [&] { - // const SanitizerArgs &Sanitize = TC.getSanitizerArgs(C.getArgs()); - // if (Sanitize.needsAsanRt()) - // return "modules-asan.json"; - // return "modules.json"; - // }(); - - SmallString<128> path(lib.begin(), lib.end()); - llvm::sys::path::remove_filename(path); - llvm::sys::path::append(path, "modules.json"); - if (TC.getVFS().exists(path)) - return static_cast(path); - - return error; - } - - case ToolChain::CST_Libstdcxx: - // libstdc++ does not provide Standard library modules yet. - return error; - } - - return error; -} - std::string Driver::GetTemporaryPath(StringRef Prefix, StringRef Suffix) const { SmallString<128> Path; std::error_code EC = llvm::sys::fs::createTemporaryFile(Prefix, Suffix, Path); diff --git a/clang/test/Driver/modules-print-library-module-manifest-path.cpp b/clang/test/Driver/modules-print-library-module-manifest-path.cpp deleted file mode 100644 index 693a992876019..0000000000000 --- a/clang/test/Driver/modules-print-library-module-manifest-path.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// Test that -print-library-module-manifest-path finds the correct file. 
- -// RUN: rm -rf %t && split-file %s %t && cd %t -// RUN: mkdir -p %t/Inputs/usr/lib/x86_64-linux-gnu -// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/libc++.so - -// RUN: %clang -print-library-module-manifest-path \ -// RUN: -stdlib=libc++ \ -// RUN: --sysroot=%t/Inputs \ -// RUN: --target=x86_64-linux-gnu 2>&1 \ -// RUN: | FileCheck libcxx-no-module-json.cpp - -// RUN: touch %t/Inputs/usr/lib/x86_64-linux-gnu/modules.json -// RUN: %clang -print-library-module-manifest-path \ -// RUN: -stdlib=libc++ \ -// RUN: --sysroot=%t/Inputs \ -// RUN: --target=x86_64-linux-gnu 2>&1 \ -// RUN: | FileCheck libcxx.cpp - -// RUN: %clang -print-library-module-manifest-path \ -// RUN: -stdlib=libstdc++ \ -// RUN: --sysroot=%t/Inputs \ -// RUN: --target=x86_64-linux-gnu 2>&1 \ -// RUN: | FileCheck libstdcxx.cpp - -//--- libcxx-no-module-json.cpp - -// CHECK: - -//--- libcxx.cpp - -// CHECK: {{.*}}/Inputs/usr/lib/x86_64-linux-gnu{{/|\\}}modules.json - -//--- libstdcxx.cpp - -// CHECK: From 42b0884238aa3149785ac75fe983e3d6c7715a11 Mon Sep 17 00:00:00 2001 From: Pierre van Houtryve Date: Tue, 23 Jan 2024 13:10:58 +0100 Subject: [PATCH 606/843] [AMDGPU] Handle V_PERMLANE64_B32 in fixVcmpxPermlaneHazards (#79125) Fixes #78856 --- .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 1 + .../CodeGen/AMDGPU/vcmpx-permlane-hazard.mir | 23 +++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index ebad40b641820..b6e4e65ff5b03 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -163,6 +163,7 @@ static bool isSendMsgTraceDataOrGDS(const SIInstrInfo &TII, static bool isPermlane(const MachineInstr &MI) { unsigned Opcode = MI.getOpcode(); return Opcode == AMDGPU::V_PERMLANE16_B32_e64 || + Opcode == AMDGPU::V_PERMLANE64_B32 || Opcode == AMDGPU::V_PERMLANEX16_B32_e64 || Opcode == AMDGPU::V_PERMLANE16_VAR_B32_e64 || Opcode == AMDGPU::V_PERMLANEX16_VAR_B32_e64; diff --git a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir index c74d66239f22c..3f40a57ca1491 100644 --- a/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir +++ b/llvm/test/CodeGen/AMDGPU/vcmpx-permlane-hazard.mir @@ -168,3 +168,26 @@ body: | $vgpr1 = V_PERMLANE16_B32_e64 0, killed $vgpr1, 0, killed $sgpr1, 0, killed $sgpr0, $vgpr1, 0, implicit $exec S_ENDPGM 0 ... + +# GCN-LABEL: name: hazard_vcmpx_permlane64 +# GCN: V_CMPX_LE_F32_nosdst_e32 +# GCN: S_ADD_U32 +# GCN-NEXT: $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec +# GCN-NEXT: V_PERMLANE64_B32 +--- +name: hazard_vcmpx_permlane64 +body: | + bb.0: + successors: %bb.1 + $vgpr0 = V_MOV_B32_e32 0, implicit $exec + V_CMPX_LE_F32_nosdst_e32 0, $vgpr0, implicit-def $exec, implicit $mode, implicit $exec + S_BRANCH %bb.1 + + bb.1: + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + $sgpr1 = S_ADD_U32 $sgpr0, 0, implicit-def $scc + $vgpr1 = V_PERMLANE64_B32 killed $vgpr1, implicit $exec + S_ENDPGM 0 +... From 52665432842fc187eac6404407df1fa6f37444d6 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 23 Jan 2024 12:32:24 +0000 Subject: [PATCH 607/843] [RemoveDIs][DebugInfo] Handle DPVAssigns in AssignmentTrackingLowering (#78980) Following on from the previous patch 6aeb7a7, this patch adds the necessary code to process the DPV equivalents of llvm.dbg.assign intrinsics. 
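
In sketch form, the storage-plus-dispatch pattern this boils down to is a
PointerUnion over the two record types, simplified here from the diff
below (`handleIntrinsic` and `handleDPValue` are stand-ins for the real
worker functions, not names from the patch):

  #include "llvm/ADT/PointerUnion.h"
  #include "llvm/IR/DebugProgramInstruction.h"
  #include "llvm/IR/IntrinsicInst.h"

  // Either form of an assignment marker: the classic dbg.assign intrinsic
  // or its DPValue (RemoveDIs) equivalent.
  using AssignRecord =
      llvm::PointerUnion<llvm::DbgAssignIntrinsic *, llvm::DPValue *>;

  void handleIntrinsic(llvm::DbgAssignIntrinsic *DAI);
  void handleDPValue(llvm::DPValue *DPV);

  void processAssign(AssignRecord Source) {
    if (auto *DAI = Source.dyn_cast<llvm::DbgAssignIntrinsic *>())
      handleIntrinsic(DAI); // intrinsic-based debug info path
    else
      handleDPValue(Source.get<llvm::DPValue *>()); // DPValue path
  }
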
Most of the content of this patch is simply duplicating existing
functionality, using generic code for simple functions and PointerUnions
where storage is required. The most complex changes are in the places that
iterate over instructions, as iterating over DPValues between instructions
is different to iterating over instructions that may or may not be debug
intrinsics; this is most complex in `AssignmentTrackingLowering::process`,
where I've added some comments to explain the state of the program at each
key point depending on whether we are operating on intrinsics or DPValues.
---
 .../CodeGen/AssignmentTrackingAnalysis.cpp    | 428 ++++++++++++------
 1 file changed, 290 insertions(+), 138 deletions(-)

diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
index 52b464cc76af4..a558a304e8f41 100644
--- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
+++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp
@@ -1019,13 +1019,14 @@ class AssignmentTrackingLowering {
   /// i.e. for all values x and y where x != y:
   ///   join(x, x) = x
   ///   join(x, y) = NoneOrPhi
+  using AssignRecord = PointerUnion<DbgAssignIntrinsic *, DPValue *>;
   struct Assignment {
     enum S { Known, NoneOrPhi } Status;
     /// ID of the assignment. nullptr if Status is not Known.
     DIAssignID *ID;
     /// The dbg.assign that marks this dbg-def. Mem-defs don't use this field.
     /// May be nullptr.
-    DbgAssignIntrinsic *Source;
+    AssignRecord Source;

     bool isSameSourceAssignment(const Assignment &Other) const {
       // Don't include Source in the equality check. Assignments are
@@ -1040,31 +1041,51 @@ class AssignmentTrackingLowering {
       else
         OS << "null";
       OS << ", s=";
-      if (Source)
-        OS << *Source;
-      else
+      if (Source.isNull())
         OS << "null";
+      else if (isa<DbgAssignIntrinsic *>(Source))
+        OS << Source.get<DbgAssignIntrinsic *>();
+      else
+        OS << Source.get<DPValue *>();
       OS << ")";
     }

     static Assignment make(DIAssignID *ID, DbgAssignIntrinsic *Source) {
       return Assignment(Known, ID, Source);
     }
+    static Assignment make(DIAssignID *ID, DPValue *Source) {
+      assert(Source->isDbgAssign() &&
+             "Cannot make an assignment from a non-assign DPValue");
+      return Assignment(Known, ID, Source);
+    }
+    static Assignment make(DIAssignID *ID, AssignRecord Source) {
+      return Assignment(Known, ID, Source);
+    }
     static Assignment makeFromMemDef(DIAssignID *ID) {
-      return Assignment(Known, ID, nullptr);
+      return Assignment(Known, ID);
     }
-    static Assignment makeNoneOrPhi() {
-      return Assignment(NoneOrPhi, nullptr, nullptr);
+    static Assignment makeNoneOrPhi() { return Assignment(NoneOrPhi, nullptr); }
     // Again, need a Top value?
-    Assignment()
-        : Status(NoneOrPhi), ID(nullptr), Source(nullptr) {
-    } // Can we delete this?
+    Assignment() : Status(NoneOrPhi), ID(nullptr) {} // Can we delete this?
+    Assignment(S Status, DIAssignID *ID) : Status(Status), ID(ID) {
+      // If the Status is Known then we expect there to be an assignment ID.
+      assert(Status == NoneOrPhi || ID);
+    }
     Assignment(S Status, DIAssignID *ID, DbgAssignIntrinsic *Source)
         : Status(Status), ID(ID), Source(Source) {
       // If the Status is Known then we expect there to be an assignment ID.
+      assert(Status == NoneOrPhi || ID);
+    }
+    Assignment(S Status, DIAssignID *ID, AssignRecord Source)
+        : Status(Status), ID(ID), Source(Source) {
+      // If the Status is Known then we expect there to be an assignment ID.
+      assert(Status == NoneOrPhi || ID);
+    }
   };

   using AssignmentMap = SmallVector<Assignment>;
@@ -1091,8 +1112,16 @@ class AssignmentTrackingLowering {
   /// After.
   void resetInsertionPoint(Instruction &After);
   void resetInsertionPoint(DPValue &After);
-  void emitDbgValue(LocKind Kind, const DbgVariableIntrinsic *Source,
-                    Instruction *After);
+
+  // emitDbgValue can be called with:
+  //   Source=[AssignRecord|DbgValueInst*|DbgAssignIntrinsic*|DPValue*]
+  // Since AssignRecord can be cast to one of the latter two types, and all
+  // other types have a shared interface, we use a template to handle the latter
+  // three types, and an explicit overload for AssignRecord that forwards to
+  // the template version with the right type.
+  void emitDbgValue(LocKind Kind, AssignRecord Source, VarLocInsertPt After);
+  template <typename T>
+  void emitDbgValue(LocKind Kind, const T Source, VarLocInsertPt After);

   static bool mapsAreEqual(const BitVector &Mask, const AssignmentMap &A,
                            const AssignmentMap &B) {
@@ -1317,8 +1346,10 @@ class AssignmentTrackingLowering {
   /// Update \p LiveSet after encountering an instruciton without a DIAssignID
   /// attachment, \p I.
   void processUntaggedInstruction(Instruction &I, BlockInfo *LiveSet);
-  void processDbgAssign(DbgAssignIntrinsic &DAI, BlockInfo *LiveSet);
-  void processDbgValue(DbgValueInst &DVI, BlockInfo *LiveSet);
+  void processDbgAssign(AssignRecord Assign, BlockInfo *LiveSet);
+  void processDPValue(DPValue &DPV, BlockInfo *LiveSet);
+  void processDbgValue(PointerUnion<DbgValueInst *, DPValue *> DbgValueRecord,
+                       BlockInfo *LiveSet);
   /// Add an assignment to memory for the variable /p Var.
   void addMemDef(BlockInfo *LiveSet, VariableID Var, const Assignment &AV);
   /// Add an assignment to the variable /p Var.
@@ -1418,6 +1449,12 @@ static DIAssignID *getIDFromMarker(const DbgAssignIntrinsic &DAI) {
   return cast<DIAssignID>(DAI.getAssignID());
 }

+static DIAssignID *getIDFromMarker(const DPValue &DPV) {
+  assert(DPV.isDbgAssign() &&
+         "Cannot get a DIAssignID from a non-assign DPValue!");
+  return DPV.getAssignID();
+}
+
 /// Return true if \p Var has an assignment in \p M matching \p AV.
 bool AssignmentTrackingLowering::hasVarWithAssignment(
     BlockInfo *LiveSet, BlockInfo::AssignmentKind Kind, VariableID Var,
@@ -1466,9 +1503,28 @@ VarLocInsertPt getNextNode(VarLocInsertPt InsertPt) {
   return getNextNode(cast<const DPValue *>(InsertPt));
 }

+DbgAssignIntrinsic *CastToDbgAssign(DbgVariableIntrinsic *DVI) {
+  return cast<DbgAssignIntrinsic>(DVI);
+}
+
+DPValue *CastToDbgAssign(DPValue *DPV) {
+  assert(DPV->isDbgAssign() &&
+         "Attempted to cast non-assign DPValue to DPVAssign.");
+  return DPV;
+}
+
 void AssignmentTrackingLowering::emitDbgValue(
     AssignmentTrackingLowering::LocKind Kind,
-    const DbgVariableIntrinsic *Source, Instruction *After) {
+    AssignmentTrackingLowering::AssignRecord Source, VarLocInsertPt After) {
+  if (isa<DbgAssignIntrinsic *>(Source))
+    emitDbgValue(Kind, cast<DbgAssignIntrinsic *>(Source), After);
+  else
+    emitDbgValue(Kind, cast<DPValue *>(Source), After);
+}
+template <typename T>
+void AssignmentTrackingLowering::emitDbgValue(
+    AssignmentTrackingLowering::LocKind Kind, const T Source,
+    VarLocInsertPt After) {

   DILocation *DL = Source->getDebugLoc();
   auto Emit = [this, Source, After, DL](Metadata *Val, DIExpression *Expr) {
@@ -1493,15 +1549,15 @@ void AssignmentTrackingLowering::emitDbgValue(

   // NOTE: This block can mutate Kind.
   if (Kind == LocKind::Mem) {
-    const auto *DAI = cast<DbgAssignIntrinsic>(Source);
+    const auto *Assign = CastToDbgAssign(Source);
     // Check the address hasn't been dropped (e.g. the debug uses may not have
     // been replaced before deleting a Value).
-    if (DAI->isKillAddress()) {
+    if (Assign->isKillAddress()) {
       // The address isn't valid so treat this as a non-memory def.
       Kind = LocKind::Val;
     } else {
-      Value *Val = DAI->getAddress();
-      DIExpression *Expr = DAI->getAddressExpression();
+      Value *Val = Assign->getAddress();
+      DIExpression *Expr = Assign->getAddressExpression();
       assert(!Expr->getFragmentInfo() &&
              "fragment info should be stored in value-expression only");
       // Copy the fragment info over from the value-expression to the new
@@ -1610,25 +1666,26 @@ void AssignmentTrackingLowering::processUntaggedInstruction(
 void AssignmentTrackingLowering::processTaggedInstruction(
     Instruction &I, AssignmentTrackingLowering::BlockInfo *LiveSet) {
   auto Linked = at::getAssignmentMarkers(&I);
+  auto LinkedDPAssigns = at::getDPVAssignmentMarkers(&I);
   // No dbg.assign intrinsics linked.
   // FIXME: All vars that have a stack slot this store modifies that don't have
   // a dbg.assign linked to it should probably treat this like an untagged
   // store.
-  if (Linked.empty())
+  if (Linked.empty() && LinkedDPAssigns.empty())
     return;

   LLVM_DEBUG(dbgs() << "processTaggedInstruction on " << I << "\n");
-  for (DbgAssignIntrinsic *DAI : Linked) {
-    VariableID Var = getVariableID(DebugVariable(DAI));
+  auto ProcessLinkedAssign = [&](auto *Assign) {
+    VariableID Var = getVariableID(DebugVariable(Assign));
     // Something has gone wrong if VarsWithStackSlot doesn't contain a variable
     // that is linked to a store.
-    assert(VarsWithStackSlot->count(getAggregate(DAI)) &&
-           "expected DAI's variable to have stack slot");
+    assert(VarsWithStackSlot->count(getAggregate(Assign)) &&
+           "expected Assign's variable to have stack slot");

     Assignment AV = Assignment::makeFromMemDef(getIDFromInst(I));
     addMemDef(LiveSet, Var, AV);

-    LLVM_DEBUG(dbgs() << "   linked to " << *DAI << "\n");
+    LLVM_DEBUG(dbgs() << "   linked to " << *Assign << "\n");
     LLVM_DEBUG(dbgs() << "   LiveLoc " << locStr(getLocKind(LiveSet, Var))
                       << " -> ");
@@ -1643,8 +1700,8 @@ void AssignmentTrackingLowering::processTaggedInstruction(
                  LiveSet->DebugValue[static_cast<unsigned>(Var)].dump(dbgs());
                  dbgs() << "\n");
       setLocKind(LiveSet, Var, LocKind::Mem);
-      emitDbgValue(LocKind::Mem, DAI, &I);
-      continue;
+      emitDbgValue(LocKind::Mem, Assign, &I);
+      return;
     }

     // The StackHomeValue and DebugValue for this variable do not match. I.e.
@@ -1669,7 +1726,7 @@ void AssignmentTrackingLowering::processTaggedInstruction(
       // We need to terminate any previously open location now.
       LLVM_DEBUG(dbgs() << "None, No Debug value available\n";);
       setLocKind(LiveSet, Var, LocKind::None);
-      emitDbgValue(LocKind::None, DAI, &I);
+      emitDbgValue(LocKind::None, Assign, &I);
     } else {
       // The previous DebugValue Value can be used here.
       LLVM_DEBUG(dbgs() << "Val, Debug value is Known\n";);
@@ -1678,7 +1735,7 @@ void AssignmentTrackingLowering::processTaggedInstruction(
         emitDbgValue(LocKind::Val, DbgAV.Source, &I);
       } else {
         // PrevAV.Source is nullptr so we must emit undef here.
- emitDbgValue(LocKind::None, DAI, &I); + emitDbgValue(LocKind::None, Assign, &I); } } } break; @@ -1689,74 +1746,89 @@ void AssignmentTrackingLowering::processTaggedInstruction( setLocKind(LiveSet, Var, LocKind::None); } break; } - } + }; + for (DbgAssignIntrinsic *DAI : Linked) + ProcessLinkedAssign(DAI); + for (DPValue *DPV : LinkedDPAssigns) + ProcessLinkedAssign(DPV); } -void AssignmentTrackingLowering::processDbgAssign(DbgAssignIntrinsic &DAI, +void AssignmentTrackingLowering::processDbgAssign(AssignRecord Assign, BlockInfo *LiveSet) { - // Only bother tracking variables that are at some point stack homed. Other - // variables can be dealt with trivially later. - if (!VarsWithStackSlot->count(getAggregate(&DAI))) - return; + auto ProcessDbgAssignImpl = [&](auto *DbgAssign) { + // Only bother tracking variables that are at some point stack homed. Other + // variables can be dealt with trivially later. + if (!VarsWithStackSlot->count(getAggregate(DbgAssign))) + return; - VariableID Var = getVariableID(DebugVariable(&DAI)); - Assignment AV = Assignment::make(getIDFromMarker(DAI), &DAI); - addDbgDef(LiveSet, Var, AV); - - LLVM_DEBUG(dbgs() << "processDbgAssign on " << DAI << "\n";); - LLVM_DEBUG(dbgs() << " LiveLoc " << locStr(getLocKind(LiveSet, Var)) - << " -> "); - - // Check if the DebugValue and StackHomeValue both hold the same - // Assignment. - if (hasVarWithAssignment(LiveSet, BlockInfo::Stack, Var, AV)) { - // They match. We can use the stack home because the debug intrinsics state - // that an assignment happened here, and we know that specific assignment - // was the last one to take place in memory for this variable. - LocKind Kind; - if (DAI.isKillAddress()) { - LLVM_DEBUG( - dbgs() - << "Val, Stack matches Debug program but address is killed\n";); - Kind = LocKind::Val; + VariableID Var = getVariableID(DebugVariable(DbgAssign)); + Assignment AV = Assignment::make(getIDFromMarker(*DbgAssign), DbgAssign); + addDbgDef(LiveSet, Var, AV); + + LLVM_DEBUG(dbgs() << "processDbgAssign on " << *DbgAssign << "\n";); + LLVM_DEBUG(dbgs() << " LiveLoc " << locStr(getLocKind(LiveSet, Var)) + << " -> "); + + // Check if the DebugValue and StackHomeValue both hold the same + // Assignment. + if (hasVarWithAssignment(LiveSet, BlockInfo::Stack, Var, AV)) { + // They match. We can use the stack home because the debug intrinsics + // state that an assignment happened here, and we know that specific + // assignment was the last one to take place in memory for this variable. + LocKind Kind; + if (DbgAssign->isKillAddress()) { + LLVM_DEBUG( + dbgs() + << "Val, Stack matches Debug program but address is killed\n";); + Kind = LocKind::Val; + } else { + LLVM_DEBUG(dbgs() << "Mem, Stack matches Debug program\n";); + Kind = LocKind::Mem; + }; + setLocKind(LiveSet, Var, Kind); + emitDbgValue(Kind, DbgAssign, DbgAssign); } else { - LLVM_DEBUG(dbgs() << "Mem, Stack matches Debug program\n";); - Kind = LocKind::Mem; - }; - setLocKind(LiveSet, Var, Kind); - emitDbgValue(Kind, &DAI, &DAI); - } else { - // The last assignment to the memory location isn't the one that we want to - // show to the user so emit a dbg.value(Value). Value may be undef. - LLVM_DEBUG(dbgs() << "Val, Stack contents is unknown\n";); - setLocKind(LiveSet, Var, LocKind::Val); - emitDbgValue(LocKind::Val, &DAI, &DAI); - } + // The last assignment to the memory location isn't the one that we want + // to show to the user so emit a dbg.value(Value). Value may be undef. 
+      LLVM_DEBUG(dbgs() << "Val, Stack contents is unknown\n";);
+      setLocKind(LiveSet, Var, LocKind::Val);
+      emitDbgValue(LocKind::Val, DbgAssign, DbgAssign);
+    }
+  };
+  if (isa<DbgAssignIntrinsic *>(Assign))
+    return ProcessDbgAssignImpl(cast<DbgAssignIntrinsic *>(Assign));
+  return ProcessDbgAssignImpl(cast<DPValue *>(Assign));
 }

-void AssignmentTrackingLowering::processDbgValue(DbgValueInst &DVI,
-                                                 BlockInfo *LiveSet) {
-  // Only other tracking variables that are at some point stack homed.
-  // Other variables can be dealt with trivally later.
-  if (!VarsWithStackSlot->count(getAggregate(&DVI)))
-    return;
+void AssignmentTrackingLowering::processDbgValue(
+    PointerUnion<DbgValueInst *, DPValue *> DbgValueRecord,
+    BlockInfo *LiveSet) {
+  auto ProcessDbgValueImpl = [&](auto *DbgValue) {
+    // Only other tracking variables that are at some point stack homed.
+    // Other variables can be dealt with trivally later.
+    if (!VarsWithStackSlot->count(getAggregate(DbgValue)))
+      return;

-  VariableID Var = getVariableID(DebugVariable(&DVI));
-  // We have no ID to create an Assignment with so we mark this assignment as
-  // NoneOrPhi. Note that the dbg.value still exists, we just cannot determine
-  // the assignment responsible for setting this value.
-  // This is fine; dbg.values are essentially interchangable with unlinked
-  // dbg.assigns, and some passes such as mem2reg and instcombine add them to
-  // PHIs for promoted variables.
-  Assignment AV = Assignment::makeNoneOrPhi();
-  addDbgDef(LiveSet, Var, AV);
-
-  LLVM_DEBUG(dbgs() << "processDbgValue on " << DVI << "\n";);
-  LLVM_DEBUG(dbgs() << "   LiveLoc " << locStr(getLocKind(LiveSet, Var))
-                    << " -> Val, dbg.value override");
-
-  setLocKind(LiveSet, Var, LocKind::Val);
-  emitDbgValue(LocKind::Val, &DVI, &DVI);
+    VariableID Var = getVariableID(DebugVariable(DbgValue));
+    // We have no ID to create an Assignment with so we mark this assignment as
+    // NoneOrPhi. Note that the dbg.value still exists, we just cannot determine
+    // the assignment responsible for setting this value.
+    // This is fine; dbg.values are essentially interchangable with unlinked
+    // dbg.assigns, and some passes such as mem2reg and instcombine add them to
+    // PHIs for promoted variables.
+    Assignment AV = Assignment::makeNoneOrPhi();
+    addDbgDef(LiveSet, Var, AV);
+
+    LLVM_DEBUG(dbgs() << "processDbgValue on " << *DbgValue << "\n";);
+    LLVM_DEBUG(dbgs() << "   LiveLoc " << locStr(getLocKind(LiveSet, Var))
+                      << " -> Val, dbg.value override");
+
+    setLocKind(LiveSet, Var, LocKind::Val);
+    emitDbgValue(LocKind::Val, DbgValue, DbgValue);
+  };
+  if (isa<DbgValueInst *>(DbgValueRecord))
+    return ProcessDbgValueImpl(cast<DbgValueInst *>(DbgValueRecord));
+  return ProcessDbgValueImpl(cast<DPValue *>(DbgValueRecord));
 }

 template <typename T> static bool hasZeroSizedFragment(T &DbgValue) {
@@ -1776,33 +1848,73 @@ void AssignmentTrackingLowering::processDbgInstruction(
     return;

   if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I))
-    processDbgAssign(*DAI, LiveSet);
+    processDbgAssign(DAI, LiveSet);
   else if (auto *DVI = dyn_cast<DbgValueInst>(&I))
-    processDbgValue(*DVI, LiveSet);
+    processDbgValue(DVI, LiveSet);
+}
+void AssignmentTrackingLowering::processDPValue(
+    DPValue &DPV, AssignmentTrackingLowering::BlockInfo *LiveSet) {
+  // Ignore assignments to zero bits of the variable.
+  if (hasZeroSizedFragment(DPV))
+    return;
+
+  if (DPV.isDbgAssign())
+    processDbgAssign(&DPV, LiveSet);
+  else if (DPV.isDbgValue())
+    processDbgValue(&DPV, LiveSet);
 }

 void AssignmentTrackingLowering::resetInsertionPoint(Instruction &After) {
   assert(!After.isTerminator() && "Can't insert after a terminator");
-  auto R = InsertBeforeMap.find(After.getNextNode());
+  auto *R = InsertBeforeMap.find(getNextNode(&After));
+  if (R == InsertBeforeMap.end())
+    return;
+  R->second.clear();
+}
+void AssignmentTrackingLowering::resetInsertionPoint(DPValue &After) {
+  auto *R = InsertBeforeMap.find(getNextNode(&After));
   if (R == InsertBeforeMap.end())
     return;
   R->second.clear();
 }

 void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
+  // If the block starts with DPValues, we need to process those DPValues as
+  // their own frame without processing any instructions first.
+  bool ProcessedLeadingDPValues = !BB.begin()->hasDbgValues();
   for (auto II = BB.begin(), EI = BB.end(); II != EI;) {
     assert(VarsTouchedThisFrame.empty());
     // Process the instructions in "frames". A "frame" includes a single
     // non-debug instruction followed any debug instructions before the
     // next non-debug instruction.
-    if (!isa<DbgInfoIntrinsic>(&*II)) {
-      if (II->isTerminator())
-        break;
-      resetInsertionPoint(*II);
-      processNonDbgInstruction(*II, LiveSet);
-      assert(LiveSet->isValid());
-      ++II;
+
+    // Skip the current instruction if it has unprocessed DPValues attached (see
+    // comment above `ProcessedLeadingDPValues`).
+    if (ProcessedLeadingDPValues) {
+      // II is now either a debug intrinsic, a non-debug instruction with no
+      // attached DPValues, or a non-debug instruction with attached processed
+      // DPValues.
+      // II has not been processed.
+      if (!isa<DbgInfoIntrinsic>(&*II)) {
+        if (II->isTerminator())
+          break;
+        resetInsertionPoint(*II);
+        processNonDbgInstruction(*II, LiveSet);
+        assert(LiveSet->isValid());
+        ++II;
+      }
     }
+    // II is now either a debug intrinsic, a non-debug instruction with no
+    // attached DPValues, or a non-debug instruction with attached unprocessed
+    // DPValues.
+    if (II != EI && II->hasDbgValues()) {
+      for (DPValue &DPV : II->getDbgValueRange()) {
+        resetInsertionPoint(DPV);
+        processDPValue(DPV, LiveSet);
+        assert(LiveSet->isValid());
+      }
+    }
+    ProcessedLeadingDPValues = true;
     while (II != EI) {
       auto *Dbg = dyn_cast<DbgInfoIntrinsic>(&*II);
       if (!Dbg)
@@ -1812,6 +1924,10 @@ void AssignmentTrackingLowering::process(BasicBlock &BB, BlockInfo *LiveSet) {
       assert(LiveSet->isValid());
       ++II;
     }
+    // II is now a non-debug instruction either with no attached DPValues, or
+    // with attached processed DPValues. II has not been processed, and all
+    // debug instructions or DPValues in the frame preceding II have been
+    // processed.

     // We've processed everything in the "frame". Now determine which variables
     // cannot be represented by a dbg.declare.
@@ -1868,16 +1984,22 @@ AssignmentTrackingLowering::joinAssignment(const Assignment &A,
   //  Here the same assignment (!1) was performed in both preds in the source,
   //  but we can't use either one unless they are identical (e.g. .we don't
   //  want to arbitrarily pick between constant values).
-  auto JoinSource = [&]() -> DbgAssignIntrinsic * {
+  auto JoinSource = [&]() -> AssignRecord {
     if (A.Source == B.Source)
       return A.Source;
-    if (A.Source == nullptr || B.Source == nullptr)
-      return nullptr;
-    if (A.Source->isIdenticalTo(B.Source))
+    if (!A.Source || !B.Source)
+      return AssignRecord();
+    assert(isa<DPValue *>(A.Source) == isa<DPValue *>(B.Source));
+    if (isa<DPValue *>(A.Source) &&
+        cast<DPValue *>(A.Source)->isEquivalentTo(*cast<DPValue *>(B.Source)))
       return A.Source;
-    return nullptr;
+    if (isa<DbgAssignIntrinsic *>(A.Source) &&
+        cast<DbgAssignIntrinsic *>(A.Source)->isIdenticalTo(
+            cast<DbgAssignIntrinsic *>(B.Source)))
+      return A.Source;
+    return AssignRecord();
   };
-  DbgAssignIntrinsic *Source = JoinSource();
+  AssignRecord Source = JoinSource();
   assert(A.Status == B.Status && A.Status == Assignment::Known);
   assert(A.ID == B.ID);
   return Assignment::make(A.ID, Source);
@@ -1986,6 +2108,14 @@ getUntaggedStoreAssignmentInfo(const Instruction &I, const DataLayout &Layout) {
   return std::nullopt;
 }

+DbgDeclareInst *DynCastToDbgDeclare(DbgVariableIntrinsic *DVI) {
+  return dyn_cast<DbgDeclareInst>(DVI);
+}
+
+DPValue *DynCastToDbgDeclare(DPValue *DPV) {
+  return DPV->isDbgDeclare() ? DPV : nullptr;
+}
+
 /// Build a map of {Variable x: Variables y} where all variable fragments
 /// contained within the variable fragment x are in set y. This means that
 /// y does not contain all overlaps because partial overlaps are excluded.
@@ -2019,31 +2149,39 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(
   // UntaggedStoreVars.
   // We need to add fragments for untagged stores too so that we can correctly
   // clobber overlapped fragment locations later.
-  SmallVector<DbgDeclareInst *> Declares;
+  SmallVector<DbgDeclareInst *> InstDeclares;
+  SmallVector<DPValue *> DPDeclares;
+  auto ProcessDbgRecord = [&](auto *Record, auto &DeclareList) {
+    if (auto *Declare = DynCastToDbgDeclare(Record)) {
+      DeclareList.push_back(Declare);
+      return;
+    }
+    DebugVariable DV = DebugVariable(Record);
+    DebugAggregate DA = {DV.getVariable(), DV.getInlinedAt()};
+    if (!VarsWithStackSlot.contains(DA))
+      return;
+    if (Seen.insert(DV).second)
+      FragmentMap[DA].push_back(DV);
+  };
   for (auto &BB : Fn) {
     for (auto &I : BB) {
-      if (auto *DDI = dyn_cast<DbgDeclareInst>(&I)) {
-        Declares.push_back(DDI);
-      } else if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
-        DebugVariable DV = DebugVariable(DII);
-        DebugAggregate DA = {DV.getVariable(), DV.getInlinedAt()};
-        if (!VarsWithStackSlot.contains(DA))
-          continue;
-        if (Seen.insert(DV).second)
-          FragmentMap[DA].push_back(DV);
+      for (auto &DPV : I.getDbgValueRange())
+        ProcessDbgRecord(&DPV, DPDeclares);
+      if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
+        ProcessDbgRecord(DII, InstDeclares);
       } else if (auto Info = getUntaggedStoreAssignmentInfo(
                      I, Fn.getParent()->getDataLayout())) {
         // Find markers linked to this alloca.
-        for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(Info->Base)) {
+        auto HandleDbgAssignForStore = [&](auto *Assign) {
           std::optional<DIExpression::FragmentInfo> FragInfo;

           // Skip this assignment if the affected bits are outside of the
           // variable fragment.
           if (!at::calculateFragmentIntersect(
                   I.getModule()->getDataLayout(), Info->Base,
-                  Info->OffsetInBits, Info->SizeInBits, DAI, FragInfo) ||
+                  Info->OffsetInBits, Info->SizeInBits, Assign, FragInfo) ||
               (FragInfo && FragInfo->SizeInBits == 0))
-            continue;
+            return;

           // FragInfo from calculateFragmentIntersect is nullopt if the
           // resultant fragment matches DAI's fragment or entire variable - in
           // nullopt after the copy it means "no fragment info" instead, which
           // is how it is usually interpreted.
           if (!FragInfo)
-            FragInfo = DAI->getExpression()->getFragmentInfo();
+            FragInfo = Assign->getExpression()->getFragmentInfo();

-          DebugVariable DV = DebugVariable(DAI->getVariable(), FragInfo,
-                                           DAI->getDebugLoc().getInlinedAt());
+          DebugVariable DV =
+              DebugVariable(Assign->getVariable(), FragInfo,
+                            Assign->getDebugLoc().getInlinedAt());
           DebugAggregate DA = {DV.getVariable(), DV.getInlinedAt()};
           if (!VarsWithStackSlot.contains(DA))
-            continue;
+            return;

           // Cache this info for later.
           UntaggedStoreVars[&I].push_back(
@@ -2065,7 +2204,11 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(

           if (Seen.insert(DV).second)
             FragmentMap[DA].push_back(DV);
-        }
+        };
+        for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(Info->Base))
+          HandleDbgAssignForStore(DAI);
+        for (DPValue *DPV : at::getDPVAssignmentMarkers(Info->Base))
+          HandleDbgAssignForStore(DPV);
       }
     }
   }
@@ -2112,9 +2255,13 @@ static AssignmentTrackingLowering::OverlapMap buildOverlapMapAndRecordDeclares(

   // Finally, insert the declares afterwards, so the first IDs are all
   // partially stack homed vars.
-  for (auto *DDI : Declares)
+  for (auto *DDI : InstDeclares)
     FnVarLocs->addSingleLocVar(DebugVariable(DDI), DDI->getExpression(),
                                DDI->getDebugLoc(), DDI->getWrappedLocation());
+  for (auto *DPV : DPDeclares)
+    FnVarLocs->addSingleLocVar(DebugVariable(DPV), DPV->getExpression(),
+                               DPV->getDebugLoc(),
+                               RawLocationWrapper(DPV->getRawLocation()));
   return Map;
 }

@@ -2227,7 +2374,7 @@ bool AssignmentTrackingLowering::run(FunctionVarLocsBuilder *FnVarLocsBuilder) {
   // we can identify those uneeded defs later.
   DenseSet<DebugAggregate> AlwaysStackHomed;
   for (const auto &Pair : InsertBeforeMap) {
-    const auto &Vec = Pair.second;
+    auto &Vec = Pair.second;
     for (VarLocInfo VarLoc : Vec) {
       DebugVariable Var = FnVarLocs->getVariable(VarLoc.VariableID);
       DebugAggregate Aggr{Var.getVariable(), Var.getInlinedAt()};
@@ -2293,22 +2440,27 @@ bool AssignmentTrackingLowering::emitPromotedVarLocs(
   bool InsertedAnyIntrinsics = false;
   // Go through every block, translating debug intrinsics for fully promoted
   // variables into FnVarLocs location defs. No analysis required for these.
+  auto TranslateDbgRecord = [&](auto *Record) {
+    // Skip variables that haven't been promoted - we've dealt with those
+    // already.
+    if (VarsWithStackSlot->contains(getAggregate(Record)))
+      return;
+    auto InsertBefore = getNextNode(Record);
+    assert(InsertBefore && "Unexpected: debug intrinsics after a terminator");
+    FnVarLocs->addVarLoc(InsertBefore, DebugVariable(Record),
+                         Record->getExpression(), Record->getDebugLoc(),
+                         RawLocationWrapper(Record->getRawLocation()));
+    InsertedAnyIntrinsics = true;
+  };
   for (auto &BB : Fn) {
     for (auto &I : BB) {
       // Skip instructions other than dbg.values and dbg.assigns.
+      for (DPValue &DPV : I.getDbgValueRange())
+        if (DPV.isDbgValue() || DPV.isDbgAssign())
+          TranslateDbgRecord(&DPV);
       auto *DVI = dyn_cast<DbgValueInst>(&I);
-      if (!DVI)
-        continue;
-      // Skip variables that haven't been promoted - we've dealt with those
-      // already.
-      if (VarsWithStackSlot->contains(getAggregate(DVI)))
-        continue;
-      Instruction *InsertBefore = I.getNextNode();
-      assert(InsertBefore && "Unexpected: debug intrinsics after a terminator");
-      FnVarLocs->addVarLoc(InsertBefore, DebugVariable(DVI),
-                           DVI->getExpression(), DVI->getDebugLoc(),
-                           DVI->getWrappedLocation());
-      InsertedAnyIntrinsics = true;
+      if (DVI)
+        TranslateDbgRecord(DVI);
     }
   }
   return InsertedAnyIntrinsics;

From 4318b033bddc64d5654f3e368fddde859ff4d02e Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 23 Jan 2024 12:33:52 +0000
Subject: [PATCH 608/843] [MC][X86] Merge lane/element broadcast comment
 printers. (#79020)

This is /almost/ NFC - the only annoyance is that for some reason we were
using "<C1,C2,..>" for ConstantVector types unlike all other cases - these
now use the same "[C1,C2,..]" format as the other constant printers.
---
 llvm/lib/Target/X86/X86MCInstLower.cpp        |  96 +--
 ...2-x86_64-tail-call-conv-out-of-sync-bug.ll |   2 +-
 .../CodeGen/X86/2011-10-19-widen_vselect.ll   |   4 +-
 .../CodeGen/X86/addsub-constant-folding.ll    |  16 +-
 .../any_extend_vector_inreg_of_broadcast.ll   |   6 +-
 ...d_vector_inreg_of_broadcast_from_memory.ll |   2 +-
 .../test/CodeGen/X86/avx2-fma-fneg-combine.ll |   2 +-
 llvm/test/CodeGen/X86/avx2-vperm.ll           |   4 +-
 .../X86/avx512-shuffles/partial_permute.ll    |   2 +-
 llvm/test/CodeGen/X86/bitreverse.ll           |   2 +-
 llvm/test/CodeGen/X86/combine-sdiv.ll         |  50 +-
 llvm/test/CodeGen/X86/combine-subo.ll         |   2 +-
 llvm/test/CodeGen/X86/combine-udiv.ll         |   8 +-
 llvm/test/CodeGen/X86/fma-fneg-combine-2.ll   |   4 +-
 .../X86/fold-int-pow2-with-fmul-or-fdiv.ll    |   2 +-
 llvm/test/CodeGen/X86/fpclamptosat_vec.ll     |   4 +-
 ...st-and-by-const-from-lshr-in-eqcmp-zero.ll |   4 +-
 llvm/test/CodeGen/X86/icmp-abs-C-vec.ll       |  16 +-
 .../X86/insert-into-constant-vector.ll        |  90 +--
 llvm/test/CodeGen/X86/masked_store_trunc.ll   |   2 +-
 llvm/test/CodeGen/X86/matrix-multiply.ll      |   6 +-
 llvm/test/CodeGen/X86/oddshuffles.ll          |  30 +-
 ...of-two-or-zero-when-comparing-with-zero.ll |   2 +-
 llvm/test/CodeGen/X86/peephole-fold-movsd.ll  |   2 +-
 llvm/test/CodeGen/X86/pr63108.ll              |   2 +-
 llvm/test/CodeGen/X86/psubus.ll               |   2 +-
 llvm/test/CodeGen/X86/sext-vsetcc.ll          |   4 +-
 llvm/test/CodeGen/X86/shrink_vmul.ll          |   4 +-
 llvm/test/CodeGen/X86/sink-addsub-of-const.ll |   2 +-
 llvm/test/CodeGen/X86/slow-pmulld.ll          |   4 +-
 .../CodeGen/X86/srem-seteq-vec-nonsplat.ll    |   6 +-
 llvm/test/CodeGen/X86/sse2.ll                 |   4 +-
 .../CodeGen/X86/urem-seteq-illegal-types.ll   |   2 +-
 .../CodeGen/X86/urem-seteq-vec-nonsplat.ll    |  10 +-
 llvm/test/CodeGen/X86/vec_fp_to_int.ll        |   4 +-
 llvm/test/CodeGen/X86/vector-blend.ll         |   2 +-
 llvm/test/CodeGen/X86/vector-fshl-512.ll      |   4 +-
 llvm/test/CodeGen/X86/vector-fshr-128.ll      |   2 +-
 llvm/test/CodeGen/X86/vector-fshr-512.ll      |   4 +-
 .../vector-interleaved-load-i16-stride-2.ll   |  36 +-
 .../vector-interleaved-load-i16-stride-3.ll   |  50 +-
 .../vector-interleaved-load-i16-stride-4.ll   |  30 +-
 .../vector-interleaved-load-i16-stride-5.ll   | 180 ++---
 .../vector-interleaved-load-i16-stride-6.ll   | 356 ++++-----
 .../vector-interleaved-load-i16-stride-7.ll   | 318 ++++----
 .../vector-interleaved-load-i32-stride-3.ll   |  90 +--
 .../vector-interleaved-load-i32-stride-5.ll   | 100 +--
 .../vector-interleaved-load-i32-stride-6.ll   | 136 ++--
 .../vector-interleaved-load-i32-stride-7.ll   |  86 +-
 .../vector-interleaved-load-i64-stride-3.ll   |  24 +-
 .../vector-interleaved-load-i64-stride-5.ll   |  64 +-
 .../vector-interleaved-load-i64-stride-6.ll   |  80 +-
 .../vector-interleaved-load-i64-stride-7.ll   |   8 +-
 .../vector-interleaved-load-i8-stride-3.ll    |   4 +-
.../vector-interleaved-load-i8-stride-4.ll | 16 +- .../vector-interleaved-load-i8-stride-5.ll | 84 +-- .../vector-interleaved-load-i8-stride-6.ll | 182 ++--- .../vector-interleaved-load-i8-stride-7.ll | 458 ++++++------ .../vector-interleaved-store-i16-stride-3.ll | 128 ++-- .../vector-interleaved-store-i16-stride-4.ll | 40 +- .../vector-interleaved-store-i16-stride-5.ll | 236 +++--- .../vector-interleaved-store-i16-stride-6.ll | 168 ++--- .../vector-interleaved-store-i16-stride-7.ll | 700 +++++++++--------- .../vector-interleaved-store-i16-stride-8.ll | 324 ++++---- .../vector-interleaved-store-i32-stride-3.ll | 28 +- .../vector-interleaved-store-i32-stride-4.ll | 104 +-- .../vector-interleaved-store-i32-stride-5.ll | 44 +- .../vector-interleaved-store-i32-stride-6.ll | 174 ++--- .../vector-interleaved-store-i32-stride-7.ll | 436 +++++------ .../vector-interleaved-store-i32-stride-8.ll | 416 +++++------ .../vector-interleaved-store-i64-stride-3.ll | 26 +- .../vector-interleaved-store-i64-stride-4.ll | 128 ++-- .../vector-interleaved-store-i64-stride-5.ll | 134 ++-- .../vector-interleaved-store-i64-stride-6.ll | 68 +- .../vector-interleaved-store-i64-stride-7.ll | 394 +++++----- .../vector-interleaved-store-i8-stride-3.ll | 6 +- .../vector-interleaved-store-i8-stride-5.ll | 242 +++--- .../vector-interleaved-store-i8-stride-6.ll | 120 +-- .../vector-interleaved-store-i8-stride-7.ll | 500 ++++++------- .../vector-interleaved-store-i8-stride-8.ll | 112 +-- .../CodeGen/X86/vector-mulfix-legalize.ll | 4 +- llvm/test/CodeGen/X86/vector-partial-undef.ll | 2 +- .../CodeGen/X86/vector-reduce-add-mask.ll | 2 +- .../CodeGen/X86/vector-replicaton-i1-mask.ll | 46 +- .../test/CodeGen/X86/vector-shift-ashr-128.ll | 2 +- .../CodeGen/X86/vector-shift-ashr-sub128.ll | 2 +- .../test/CodeGen/X86/vector-shift-lshr-128.ll | 2 +- .../test/CodeGen/X86/vector-shift-lshr-512.ll | 2 +- .../CodeGen/X86/vector-shift-lshr-sub128.ll | 2 +- .../CodeGen/X86/vector-shuffle-128-v16.ll | 6 +- .../CodeGen/X86/vector-shuffle-256-v16.ll | 64 +- .../CodeGen/X86/vector-shuffle-256-v32.ll | 6 +- .../test/CodeGen/X86/vector-shuffle-256-v8.ll | 42 +- .../CodeGen/X86/vector-shuffle-512-v16.ll | 4 +- .../CodeGen/X86/vector-shuffle-512-v32.ll | 6 +- .../CodeGen/X86/vector-shuffle-512-v64.ll | 92 +-- .../test/CodeGen/X86/vector-shuffle-512-v8.ll | 32 +- .../X86/vector-shuffle-combining-avx.ll | 4 +- .../X86/vector-shuffle-combining-avx2.ll | 8 +- .../X86/vector-shuffle-combining-sse41.ll | 8 +- .../X86/vector-shuffle-combining-ssse3.ll | 4 +- .../CodeGen/X86/vector-shuffle-combining.ll | 36 +- llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 2 +- llvm/test/CodeGen/X86/vector-shuffle-v192.ll | 104 +-- llvm/test/CodeGen/X86/widen_arith-2.ll | 2 +- llvm/test/CodeGen/X86/widen_arith-4.ll | 6 +- llvm/test/CodeGen/X86/widen_arith-5.ll | 2 +- llvm/test/CodeGen/X86/widen_arith-6.ll | 2 +- .../CodeGen/X86/x86-interleaved-access.ll | 16 +- .../CodeGen/X86/zero_extend_vector_inreg.ll | 26 +- .../zero_extend_vector_inreg_of_broadcast.ll | 18 +- ...d_vector_inreg_of_broadcast_from_memory.ll | 44 +- 112 files changed, 3896 insertions(+), 3946 deletions(-) diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp index 6cf93dc51238c..58ebe023cd61e 100644 --- a/llvm/lib/Target/X86/X86MCInstLower.cpp +++ b/llvm/lib/Target/X86/X86MCInstLower.cpp @@ -1564,68 +1564,18 @@ static void printZeroUpperMove(const MachineInstr *MI, MCStreamer &OutStreamer, OutStreamer.AddComment(CS.str()); } -static void printLaneBroadcast(const 
MachineInstr *MI, MCStreamer &OutStreamer,
-                               int NumLanes, int BitWidth) {
-  if (auto *C = X86::getConstantFromPool(*MI, 1)) {
-    int CstEltSize = C->getType()->getScalarSizeInBits();
-
-    std::string Comment;
-    raw_string_ostream CS(Comment);
-    const MachineOperand &DstOp = MI->getOperand(0);
-    CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
-    if (auto *CDS = dyn_cast<ConstantDataSequential>(C)) {
-      int NumElements = CDS->getNumElements();
-      if ((BitWidth % CstEltSize) == 0)
-        NumElements = std::min(NumElements, BitWidth / CstEltSize);
-      CS << "[";
-      for (int l = 0; l != NumLanes; ++l) {
-        for (int i = 0; i < NumElements; ++i) {
-          if (i != 0 || l != 0)
-            CS << ",";
-          if (CDS->getElementType()->isIntegerTy())
-            printConstant(CDS->getElementAsAPInt(i), CS);
-          else if (CDS->getElementType()->isHalfTy() ||
-                   CDS->getElementType()->isFloatTy() ||
-                   CDS->getElementType()->isDoubleTy())
-            printConstant(CDS->getElementAsAPFloat(i), CS);
-          else
-            CS << "?";
-        }
-      }
-      CS << "]";
-      OutStreamer.AddComment(CS.str());
-    } else if (auto *CV = dyn_cast<ConstantVector>(C)) {
-      int NumOperands = CV->getNumOperands();
-      if ((BitWidth % CstEltSize) == 0)
-        NumOperands = std::min(NumOperands, BitWidth / CstEltSize);
-      CS << "<";
-      for (int l = 0; l != NumLanes; ++l) {
-        for (int i = 0; i < NumOperands; ++i) {
-          if (i != 0 || l != 0)
-            CS << ",";
-          printConstant(CV->getOperand(i),
-                        CV->getType()->getPrimitiveSizeInBits(), CS);
-        }
-      }
-      CS << ">";
-      OutStreamer.AddComment(CS.str());
-    }
-  }
-}
-
-static void printElementBroadcast(const MachineInstr *MI,
-                                  MCStreamer &OutStreamer, int NumElts,
-                                  int EltBits) {
+static void printBroadcast(const MachineInstr *MI, MCStreamer &OutStreamer,
+                           int Repeats, int BitWidth) {
   if (auto *C = X86::getConstantFromPool(*MI, 1)) {
     std::string Comment;
     raw_string_ostream CS(Comment);
     const MachineOperand &DstOp = MI->getOperand(0);
     CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
     CS << "[";
-    for (int i = 0; i != NumElts; ++i) {
-      if (i != 0)
+    for (int l = 0; l != Repeats; ++l) {
+      if (l != 0)
         CS << ",";
-      printConstant(C, EltBits, CS);
+      printConstant(C, BitWidth, CS);
     }
     CS << "]";
     OutStreamer.AddComment(CS.str());
@@ -1943,13 +1893,13 @@ static void addConstantComments(const MachineInstr *MI,

   // For loads from a constant pool to a vector register, print the constant
   // loaded.
CASE_128_MOV_RM() - printLaneBroadcast(MI, OutStreamer, 1, 128); + printBroadcast(MI, OutStreamer, 1, 128); break; CASE_256_MOV_RM() - printLaneBroadcast(MI, OutStreamer, 1, 256); + printBroadcast(MI, OutStreamer, 1, 256); break; CASE_512_MOV_RM() - printLaneBroadcast(MI, OutStreamer, 1, 512); + printBroadcast(MI, OutStreamer, 1, 512); break; case X86::VBROADCASTF128rm: case X86::VBROADCASTI128rm: @@ -1957,19 +1907,19 @@ static void addConstantComments(const MachineInstr *MI, case X86::VBROADCASTF64X2Z128rm: case X86::VBROADCASTI32X4Z256rm: case X86::VBROADCASTI64X2Z128rm: - printLaneBroadcast(MI, OutStreamer, 2, 128); + printBroadcast(MI, OutStreamer, 2, 128); break; case X86::VBROADCASTF32X4rm: case X86::VBROADCASTF64X2rm: case X86::VBROADCASTI32X4rm: case X86::VBROADCASTI64X2rm: - printLaneBroadcast(MI, OutStreamer, 4, 128); + printBroadcast(MI, OutStreamer, 4, 128); break; case X86::VBROADCASTF32X8rm: case X86::VBROADCASTF64X4rm: case X86::VBROADCASTI32X8rm: case X86::VBROADCASTI64X4rm: - printLaneBroadcast(MI, OutStreamer, 2, 256); + printBroadcast(MI, OutStreamer, 2, 256); break; // For broadcast loads from a constant pool to a vector register, repeatedly @@ -1979,55 +1929,55 @@ static void addConstantComments(const MachineInstr *MI, case X86::VMOVDDUPZ128rm: case X86::VPBROADCASTQrm: case X86::VPBROADCASTQZ128rm: - printElementBroadcast(MI, OutStreamer, 2, 64); + printBroadcast(MI, OutStreamer, 2, 64); break; case X86::VBROADCASTSDYrm: case X86::VBROADCASTSDZ256rm: case X86::VPBROADCASTQYrm: case X86::VPBROADCASTQZ256rm: - printElementBroadcast(MI, OutStreamer, 4, 64); + printBroadcast(MI, OutStreamer, 4, 64); break; case X86::VBROADCASTSDZrm: case X86::VPBROADCASTQZrm: - printElementBroadcast(MI, OutStreamer, 8, 64); + printBroadcast(MI, OutStreamer, 8, 64); break; case X86::VBROADCASTSSrm: case X86::VBROADCASTSSZ128rm: case X86::VPBROADCASTDrm: case X86::VPBROADCASTDZ128rm: - printElementBroadcast(MI, OutStreamer, 4, 32); + printBroadcast(MI, OutStreamer, 4, 32); break; case X86::VBROADCASTSSYrm: case X86::VBROADCASTSSZ256rm: case X86::VPBROADCASTDYrm: case X86::VPBROADCASTDZ256rm: - printElementBroadcast(MI, OutStreamer, 8, 32); + printBroadcast(MI, OutStreamer, 8, 32); break; case X86::VBROADCASTSSZrm: case X86::VPBROADCASTDZrm: - printElementBroadcast(MI, OutStreamer, 16, 32); + printBroadcast(MI, OutStreamer, 16, 32); break; case X86::VPBROADCASTWrm: case X86::VPBROADCASTWZ128rm: - printElementBroadcast(MI, OutStreamer, 8, 16); + printBroadcast(MI, OutStreamer, 8, 16); break; case X86::VPBROADCASTWYrm: case X86::VPBROADCASTWZ256rm: - printElementBroadcast(MI, OutStreamer, 16, 16); + printBroadcast(MI, OutStreamer, 16, 16); break; case X86::VPBROADCASTWZrm: - printElementBroadcast(MI, OutStreamer, 32, 16); + printBroadcast(MI, OutStreamer, 32, 16); break; case X86::VPBROADCASTBrm: case X86::VPBROADCASTBZ128rm: - printElementBroadcast(MI, OutStreamer, 16, 8); + printBroadcast(MI, OutStreamer, 16, 8); break; case X86::VPBROADCASTBYrm: case X86::VPBROADCASTBZ256rm: - printElementBroadcast(MI, OutStreamer, 32, 8); + printBroadcast(MI, OutStreamer, 32, 8); break; case X86::VPBROADCASTBZrm: - printElementBroadcast(MI, OutStreamer, 64, 8); + printBroadcast(MI, OutStreamer, 64, 8); break; } } diff --git a/llvm/test/CodeGen/X86/2009-06-12-x86_64-tail-call-conv-out-of-sync-bug.ll b/llvm/test/CodeGen/X86/2009-06-12-x86_64-tail-call-conv-out-of-sync-bug.ll index dd8efe9715d3a..b5577c09c4320 100644 --- a/llvm/test/CodeGen/X86/2009-06-12-x86_64-tail-call-conv-out-of-sync-bug.ll +++ 
b/llvm/test/CodeGen/X86/2009-06-12-x86_64-tail-call-conv-out-of-sync-bug.ll @@ -16,7 +16,7 @@ define fastcc double @tailcall() { ; CHECK-NEXT: movq %rax, {{[0-9]+}}(%rsp) ; CHECK-NEXT: fld1 ; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp) -; CHECK-NEXT: movaps {{.*#+}} xmm0 = <1.0E+0,1.0E+0,u,u> +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,u,u] ; CHECK-NEXT: addq $24, %rsp ; CHECK-NEXT: jmp _tailcallee ## TAILCALL entry: diff --git a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll index 171e16e35fc2f..5a4cbac57eeeb 100644 --- a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -73,7 +73,7 @@ define void @full_test() { ; X86-NEXT: cvtdq2ps %xmm0, %xmm1 ; X86-NEXT: xorps %xmm0, %xmm0 ; X86-NEXT: cmpltps %xmm2, %xmm0 -; X86-NEXT: movaps {{.*#+}} xmm3 = <1.0E+0,1.0E+0,u,u> +; X86-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,u,u] ; X86-NEXT: addps %xmm1, %xmm3 ; X86-NEXT: movaps %xmm1, %xmm4 ; X86-NEXT: blendvps %xmm0, %xmm3, %xmm4 @@ -93,7 +93,7 @@ define void @full_test() { ; X64-NEXT: cvtdq2ps %xmm0, %xmm1 ; X64-NEXT: xorps %xmm0, %xmm0 ; X64-NEXT: cmpltps %xmm2, %xmm0 -; X64-NEXT: movaps {{.*#+}} xmm3 = <1.0E+0,1.0E+0,u,u> +; X64-NEXT: movaps {{.*#+}} xmm3 = [1.0E+0,1.0E+0,u,u] ; X64-NEXT: addps %xmm1, %xmm3 ; X64-NEXT: movaps %xmm1, %xmm4 ; X64-NEXT: blendvps %xmm0, %xmm3, %xmm4 diff --git a/llvm/test/CodeGen/X86/addsub-constant-folding.ll b/llvm/test/CodeGen/X86/addsub-constant-folding.ll index 4dbaae5c1a74a..1cdc81223168f 100644 --- a/llvm/test/CodeGen/X86/addsub-constant-folding.ll +++ b/llvm/test/CodeGen/X86/addsub-constant-folding.ll @@ -367,14 +367,14 @@ define <4 x i32> @vec_add_const_const_sub_extrause(<4 x i32> %arg) { define <4 x i32> @vec_add_const_const_sub_nonsplat(<4 x i32> %arg) { ; X86-LABEL: vec_add_const_const_sub_nonsplat: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = <4294967277,u,u,4294967290> +; X86-NEXT: movdqa {{.*#+}} xmm1 = [4294967277,u,u,4294967290] ; X86-NEXT: psubd %xmm0, %xmm1 ; X86-NEXT: movdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_add_const_const_sub_nonsplat: ; X64: # %bb.0: -; X64-NEXT: movdqa {{.*#+}} xmm1 = <4294967277,u,u,4294967290> +; X64-NEXT: movdqa {{.*#+}} xmm1 = [4294967277,u,u,4294967290] ; X64-NEXT: psubd %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq @@ -733,14 +733,14 @@ define <4 x i32> @vec_sub_const_const_sub_extrause(<4 x i32> %arg) { define <4 x i32> @vec_sub_const_const_sub_nonsplat(<4 x i32> %arg) { ; X86-LABEL: vec_sub_const_const_sub_nonsplat: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = <23,u,u,10> +; X86-NEXT: movdqa {{.*#+}} xmm1 = [23,u,u,10] ; X86-NEXT: psubd %xmm0, %xmm1 ; X86-NEXT: movdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_sub_const_const_sub_nonsplat: ; X64: # %bb.0: -; X64-NEXT: movdqa {{.*#+}} xmm1 = <23,u,u,10> +; X64-NEXT: movdqa {{.*#+}} xmm1 = [23,u,u,10] ; X64-NEXT: psubd %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq @@ -867,14 +867,14 @@ define <4 x i32> @vec_const_sub_add_const_extrause(<4 x i32> %arg) { define <4 x i32> @vec_const_sub_add_const_nonsplat(<4 x i32> %arg) { ; X86-LABEL: vec_const_sub_add_const_nonsplat: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = <23,u,u,10> +; X86-NEXT: movdqa {{.*#+}} xmm1 = [23,u,u,10] ; X86-NEXT: psubd %xmm0, %xmm1 ; X86-NEXT: movdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_const_sub_add_const_nonsplat: ; X64: # %bb.0: -; X64-NEXT: movdqa {{.*#+}} xmm1 = <23,u,u,10> +; X64-NEXT: movdqa 
{{.*#+}} xmm1 = [23,u,u,10] ; X64-NEXT: psubd %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq @@ -1001,14 +1001,14 @@ define <4 x i32> @vec_const_sub_sub_const_extrause(<4 x i32> %arg) { define <4 x i32> @vec_const_sub_sub_const_nonsplat(<4 x i32> %arg) { ; X86-LABEL: vec_const_sub_sub_const_nonsplat: ; X86: # %bb.0: -; X86-NEXT: movdqa {{.*#+}} xmm1 = <19,u,u,6> +; X86-NEXT: movdqa {{.*#+}} xmm1 = [19,u,u,6] ; X86-NEXT: psubd %xmm0, %xmm1 ; X86-NEXT: movdqa %xmm1, %xmm0 ; X86-NEXT: retl ; ; X64-LABEL: vec_const_sub_sub_const_nonsplat: ; X64: # %bb.0: -; X64-NEXT: movdqa {{.*#+}} xmm1 = <19,u,u,6> +; X64-NEXT: movdqa {{.*#+}} xmm1 = [19,u,u,6] ; X64-NEXT: psubd %xmm0, %xmm1 ; X64-NEXT: movdqa %xmm1, %xmm0 ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index d61e33ccb22a9..fe48059f9d0e6 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -3941,7 +3941,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,9,10,11,12,13,16,15,u,u,u,u,16,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,10,11,12,13,16,15,u,u,u,u,16,u,u,u] ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 @@ -4181,7 +4181,7 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,9,10,11,12,13,14,15,u,u,u,u,16,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,10,11,12,13,14,15,u,u,u,u,16,u,u,u] ; AVX512BW-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 99d9f6b41e70d..2f576fe671590 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = 
xmm0[0,0,0,0,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
index d6001da849e31..ee504e30302b1 100644
--- a/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
+++ b/llvm/test/CodeGen/X86/avx2-fma-fneg-combine.ll
@@ -131,7 +131,7 @@ define <4 x double> @test9(<4 x double> %a) {
 define <4 x double> @test10(<4 x double> %a, <4 x double> %b) {
 ; CHECK-LABEL: test10:
 ; CHECK: # %bb.0:
-; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = <-9.5E+0,u,-5.5E+0,-2.5E+0>
+; CHECK-NEXT: vmovapd {{.*#+}} ymm2 = [-9.5E+0,u,-5.5E+0,-2.5E+0]
 ; CHECK-NEXT: vmovapd %ymm2, %ymm3
 ; CHECK-NEXT: vfmadd213pd {{.*#+}} ymm3 = (ymm0 * ymm3) + ymm1
 ; CHECK-NEXT: vfnmadd213pd {{.*#+}} ymm2 = -(ymm0 * ymm2) + ymm1
diff --git a/llvm/test/CodeGen/X86/avx2-vperm.ll b/llvm/test/CodeGen/X86/avx2-vperm.ll
index 101fd19312788..90430aa51dcdc 100644
--- a/llvm/test/CodeGen/X86/avx2-vperm.ll
+++ b/llvm/test/CodeGen/X86/avx2-vperm.ll
@@ -23,13 +23,13 @@ entry:
 define <8 x float> @perm_cl_fp_8x32(<8 x float> %A) nounwind readnone {
 ; X86-LABEL: perm_cl_fp_8x32:
 ; X86: # %bb.0: # %entry
-; X86-NEXT: vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6>
+; X86-NEXT: vmovaps {{.*#+}} ymm1 = [u,7,2,u,4,u,1,6]
 ; X86-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: perm_cl_fp_8x32:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: vmovaps {{.*#+}} ymm1 = <u,7,2,u,4,u,1,6>
+; X64-NEXT: vmovaps {{.*#+}} ymm1 = [u,7,2,u,4,u,1,6]
 ; X64-NEXT: vpermps %ymm0, %ymm1, %ymm0
 ; X64-NEXT: retq
 entry:
diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
index ec0f14ae4e58e..b648b086a8b68 100644
--- a/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
+++ b/llvm/test/CodeGen/X86/avx512-shuffles/partial_permute.ll
@@ -1769,7 +1769,7 @@ define <4 x i32> @test_16xi32_to_4xi32_perm_mask9(<16 x i32> %vec) {
 ;
 ; CHECK-FAST-PERLANE-LABEL: test_16xi32_to_4xi32_perm_mask9:
 ; CHECK-FAST-PERLANE: # %bb.0:
-; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <4,1,u,2>
+; CHECK-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [4,1,u,2]
 ; CHECK-FAST-PERLANE-NEXT: vextracti64x4 $1, %zmm0, %ymm2
 ; CHECK-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm1, %ymm1
 ; CHECK-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll
index 9daac1df1d975..2d978b5e991c9 100644
--- a/llvm/test/CodeGen/X86/bitreverse.ll
+++ b/llvm/test/CodeGen/X86/bitreverse.ll
@@ -587,7 +587,7 @@ define <2 x i16> @fold_v2i16() {
 ;
 ; X64-LABEL: fold_v2i16:
 ; X64: # %bb.0:
-; X64-NEXT: movaps {{.*#+}} xmm0 = <61440,240,u,u,u,u,u,u>
+; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240,u,u,u,u,u,u]
 ; X64-NEXT: retq
 ;
 ; X86XOP-LABEL: fold_v2i16:
diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll
index 549fe72626973..e10d94c16696a 100644
--- a/llvm/test/CodeGen/X86/combine-sdiv.ll
+++ b/llvm/test/CodeGen/X86/combine-sdiv.ll
@@ -581,7 +581,7 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
 ; SSE2: # %bb.0:
 ; SSE2-NEXT: movdqa %xmm0, %xmm3
 ; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm7 = <u,4,2,16,8,32,64,2>
+; SSE2-NEXT: movdqa {{.*#+}} xmm7 = [u,4,2,16,8,32,64,2]
 ; SSE2-NEXT: pmulhuw %xmm7, %xmm0
 ; SSE2-NEXT: paddw %xmm3, %xmm0
 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [65535,65535,65535,0,65535,0,0,65535]
@@ -639,10 +639,10 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: psraw $15, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm3 = <u,4,2,16,8,32,64,2>
+; SSE41-NEXT: movdqa {{.*#+}} xmm3 = [u,4,2,16,8,32,64,2]
 ; SSE41-NEXT: pmulhuw %xmm3, %xmm2
 ; SSE41-NEXT: paddw %xmm0, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,16384,32768,4096,8192,2048,1024,32768>
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [u,16384,32768,4096,8192,2048,1024,32768]
 ; SSE41-NEXT: movdqa %xmm2, %xmm5
 ; SSE41-NEXT: pmulhw %xmm4, %xmm5
 ; SSE41-NEXT: psraw $1, %xmm2
@@ -662,10 +662,10 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; AVX1-NEXT: vpsraw $15, %xmm1, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4,2,16,8,32,64,2>
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [u,4,2,16,8,32,64,2]
 ; AVX1-NEXT: vpmulhuw %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = <u,16384,32768,4096,8192,2048,1024,32768>
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [u,16384,32768,4096,8192,2048,1024,32768]
 ; AVX1-NEXT: vpmulhw %xmm2, %xmm1, %xmm4
 ; AVX1-NEXT: vpsraw $1, %xmm1, %xmm1
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1],xmm1[2],xmm4[3,4,5,6],xmm1[7]
@@ -718,10 +718,10 @@ define <16 x i16> @combine_vec_sdiv_by_pow2b_v16i16(<16 x i16> %x) {
 ; XOP: # %bb.0:
 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; XOP-NEXT: vpsraw $15, %xmm1, %xmm2
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65522,65521,65524,65523,65525,65526,65521>
+; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,65522,65521,65524,65523,65525,65526,65521]
 ; XOP-NEXT: vpshlw %xmm3, %xmm2, %xmm2
 ; XOP-NEXT: vpaddw %xmm2, %xmm1, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,65534,65535,65532,65533,65531,65530,65535>
+; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,65534,65535,65532,65533,65531,65530,65535]
 ; XOP-NEXT: vpshaw %xmm2, %xmm1, %xmm1
 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm4
 ; XOP-NEXT: vpshlw %xmm3, %xmm4, %xmm3
@@ -742,7 +742,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
 ; SSE2-NEXT: movdqa %xmm1, %xmm5
 ; SSE2-NEXT: movdqa %xmm0, %xmm1
 ; SSE2-NEXT: psraw $15, %xmm0
-; SSE2-NEXT: movdqa {{.*#+}} xmm9 = <u,4,2,16,8,32,64,2>
+; SSE2-NEXT: movdqa {{.*#+}} xmm9 = [u,4,2,16,8,32,64,2]
 ; SSE2-NEXT: pmulhuw %xmm9, %xmm0
 ; SSE2-NEXT: paddw %xmm1, %xmm0
 ; SSE2-NEXT: movdqa {{.*#+}} xmm6 = [65535,65535,65535,0,65535,0,0,65535]
@@ -853,10 +853,10 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
 ; SSE41: # %bb.0:
 ; SSE41-NEXT: movdqa %xmm0, %xmm6
 ; SSE41-NEXT: psraw $15, %xmm6
-; SSE41-NEXT: movdqa {{.*#+}} xmm5 = <u,4,2,16,8,32,64,2>
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [u,4,2,16,8,32,64,2]
 ; SSE41-NEXT: pmulhuw %xmm5, %xmm6
 ; SSE41-NEXT: paddw %xmm0, %xmm6
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = <u,16384,32768,4096,8192,2048,1024,32768>
+; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [u,16384,32768,4096,8192,2048,1024,32768]
 ; SSE41-NEXT: movdqa %xmm6, %xmm7
 ; SSE41-NEXT: pmulhw %xmm4, %xmm7
 ; SSE41-NEXT: psraw $1, %xmm6
@@ -894,10 +894,10 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; AVX1-NEXT: vpsraw $15, %xmm2, %xmm3
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4,2,16,8,32,64,2>
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [u,4,2,16,8,32,64,2]
 ; AVX1-NEXT: vpmulhuw %xmm4, %xmm3, %xmm3
 ; AVX1-NEXT: vpaddw %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = <u,16384,32768,4096,8192,2048,1024,32768>
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [u,16384,32768,4096,8192,2048,1024,32768]
 ; AVX1-NEXT: vpmulhw %xmm3, %xmm2, %xmm5
 ; AVX1-NEXT: vpsraw $1, %xmm2, %xmm2
 ; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2],xmm5[3,4,5,6],xmm2[7]
@@ -995,10 +995,10 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) {
 ; XOP: # %bb.0:
 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; XOP-NEXT: vpsraw $15, %xmm2, %xmm3
-; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = <u,65522,65521,65524,65523,65525,65526,65521>
+; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,65522,65521,65524,65523,65525,65526,65521]
 ; XOP-NEXT: vpshlw %xmm4, %xmm3, %xmm3
 ; XOP-NEXT: vpaddw %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,65534,65535,65532,65533,65531,65530,65535>
+; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,65534,65535,65532,65533,65531,65530,65535]
 ; XOP-NEXT: vpshaw %xmm3, %xmm2, %xmm2
 ; XOP-NEXT: vpsraw $15, %xmm0, %xmm5
 ; XOP-NEXT: vpshlw %xmm4, %xmm5, %xmm5
@@ -1234,10 +1234,10 @@ define <8 x i32> @combine_vec_sdiv_by_pow2b_v8i32(<8 x i32> %x) {
 ; XOP: # %bb.0:
 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm1
 ; XOP-NEXT: vpsrad $31, %xmm1, %xmm2
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967266,4294967267,4294967268>
+; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,4294967266,4294967267,4294967268]
 ; XOP-NEXT: vpshld %xmm3, %xmm2, %xmm2
 ; XOP-NEXT: vpaddd %xmm2, %xmm1, %xmm1
-; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = <u,4294967294,4294967293,4294967292>
+; XOP-NEXT: vmovdqa {{.*#+}} xmm2 = [u,4294967294,4294967293,4294967292]
 ; XOP-NEXT: vpshad %xmm2, %xmm1, %xmm1
 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm4
 ; XOP-NEXT: vpshld %xmm3, %xmm4, %xmm3
@@ -1510,10 +1510,10 @@ define <16 x i32> @combine_vec_sdiv_by_pow2b_v16i32(<16 x i32> %x) {
 ; XOP: # %bb.0:
 ; XOP-NEXT: vextractf128 $1, %ymm0, %xmm2
 ; XOP-NEXT: vpsrad $31, %xmm2, %xmm3
-; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = <u,4294967266,4294967267,4294967268>
+; XOP-NEXT: vmovdqa {{.*#+}} xmm4 = [u,4294967266,4294967267,4294967268]
 ; XOP-NEXT: vpshld %xmm4, %xmm3, %xmm3
 ; XOP-NEXT: vpaddd %xmm3, %xmm2, %xmm2
-; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = <u,4294967294,4294967293,4294967292>
+; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [u,4294967294,4294967293,4294967292]
 ; XOP-NEXT: vpshad %xmm3, %xmm2, %xmm2
 ; XOP-NEXT: vpsrad $31, %xmm0, %xmm5
 ; XOP-NEXT: vpshld %xmm4, %xmm5, %xmm5
@@ -1709,7 +1709,7 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <u,2305843009213693952,1152921504606846976,576460752303423488>
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [u,2305843009213693952,1152921504606846976,576460752303423488]
 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpsubq %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3,4,5,6,7]
@@ -1718,7 +1718,7 @@ define <4 x i64> @combine_vec_sdiv_by_pow2b_v4i64(<4 x i64> %x) {
 ; AVX512F-LABEL: combine_vec_sdiv_by_pow2b_v4i64:
 ; AVX512F: # %bb.0:
 ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <u,2,3,4>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [u,2,3,4]
 ; AVX512F-NEXT: vpsraq $63, %zmm0, %zmm2
 ; AVX512F-NEXT: vpsrlvq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
 ; AVX512F-NEXT: vpaddq %ymm2, %ymm0, %ymm2
@@ -1904,12 +1904,12 @@ define <8 x i64> @combine_vec_sdiv_by_pow2b_v8i64(<8 x i64> %x) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
 ; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = <u,62,61,60>
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [u,62,61,60]
 ; AVX2-NEXT: vpsrlvq %ymm4, %ymm3, %ymm3
 ; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = <u,2,3,4>
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [u,2,3,4]
 ; AVX2-NEXT: vpsrlvq %ymm5, %ymm3, %ymm3
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = <u,2305843009213693952,1152921504606846976,576460752303423488>
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [u,2305843009213693952,1152921504606846976,576460752303423488]
 ; AVX2-NEXT: vpxor %ymm6, %ymm3, %ymm3
 ; AVX2-NEXT: vpsubq %ymm6, %ymm3, %ymm3
 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2,3,4,5,6,7]
@@ -2701,7 +2701,7 @@ define <8 x i16> @combine_vec_sdiv_nonuniform5(<8 x i16> %x) {
 ; SSE41-NEXT: pmullw %xmm0, %xmm1
 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <256,16384,4096,u,u,u,512,256>
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [256,16384,4096,u,u,u,512,256]
 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
@@ -2814,7 +2814,7 @@ define <8 x i16> @combine_vec_sdiv_nonuniform6(<8 x i16> %x) {
 ; SSE41-NEXT: pmullw %xmm0, %xmm1
 ; SSE41-NEXT: pmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <4,256,256,u,u,512,256,8>
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,256,256,u,u,512,256,8]
 ; SSE41-NEXT: pmulhw %xmm0, %xmm1
 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6,7]
 ; SSE41-NEXT: psrlw $15, %xmm0
diff --git a/llvm/test/CodeGen/X86/combine-subo.ll b/llvm/test/CodeGen/X86/combine-subo.ll
index 99f26525d49e5..235df0a666ee9 100644
--- a/llvm/test/CodeGen/X86/combine-subo.ll
+++ b/llvm/test/CodeGen/X86/combine-subo.ll
@@ -217,7 +217,7 @@ define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind {
 define { <4 x i8>, <4 x i1> } @never_usub_const_vector() nounwind {
 ; SSE-LABEL: never_usub_const_vector:
 ; SSE: # %bb.0:
-; SSE-NEXT: movaps {{.*#+}} xmm0 = <127,255,0,254,u,u,u,u,u,u,u,u,u,u,u,u>
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [127,255,0,254,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE-NEXT: xorps %xmm1, %xmm1
 ; SSE-NEXT: retq
 ;
diff --git a/llvm/test/CodeGen/X86/combine-udiv.ll b/llvm/test/CodeGen/X86/combine-udiv.ll
index 12ac819c96339..f94ff72274d63 100644
--- a/llvm/test/CodeGen/X86/combine-udiv.ll
+++ b/llvm/test/CodeGen/X86/combine-udiv.ll
@@ -519,7 +519,7 @@ define <8 x i16> @combine_vec_udiv_nonuniform(<8 x i16> %x) {
 ; SSE41-NEXT: psubw %xmm1, %xmm0
 ; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
 ; SSE41-NEXT: paddw %xmm1, %xmm0
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <4096,2048,8,u,u,2,2,u>
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4096,2048,8,u,u,2,2,u]
 ; SSE41-NEXT: pmulhuw %xmm0, %xmm1
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3,4],xmm1[5,6],xmm0[7]
 ; SSE41-NEXT: retq
@@ -683,7 +683,7 @@ define <16 x i8> @combine_vec_udiv_nonuniform4(<16 x i8> %x) {
 define <8 x i16> @pr38477(<8 x i16> %a0) {
 ; SSE2-LABEL: pr38477:
 ; SSE2: # %bb.0:
-; SSE2-NEXT: movdqa {{.*#+}} xmm1 = <u,4957,57457,4103,16385,35545,2048,2115>
+; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [u,4957,57457,4103,16385,35545,2048,2115]
 ; SSE2-NEXT: pmulhuw %xmm0, %xmm1
 ; SSE2-NEXT: movdqa %xmm0, %xmm2
 ; SSE2-NEXT: psubw %xmm1, %xmm2
@@ -702,13 +702,13 @@ define <8 x i16> @pr38477(<8 x i16> %a0) {
 ;
 ; SSE41-LABEL: pr38477:
 ; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,4957,57457,4103,16385,35545,2048,2115>
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,4957,57457,4103,16385,35545,2048,2115]
 ; SSE41-NEXT: pmulhuw %xmm0, %xmm1
 ; SSE41-NEXT: movdqa %xmm0, %xmm2
 ; SSE41-NEXT: psubw %xmm1, %xmm2
 ; SSE41-NEXT: pmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
 ; SSE41-NEXT: paddw %xmm1, %xmm2
-; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,1024,1024,16,4,1024,u,4096>
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,1024,1024,16,4,1024,u,4096]
 ; SSE41-NEXT: pmulhuw %xmm2, %xmm1
 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5],xmm2[6],xmm1[7]
 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll
index bb8ee2238a004..b9a2772d3df70 100644
--- a/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll
+++ b/llvm/test/CodeGen/X86/fma-fneg-combine-2.ll
@@ -145,7 +145,7 @@ define <4 x double> @negated_constant_v4f64(<4 x double> %a) {
 define <4 x double> @negated_constant_v4f64_2fmas(<4 x double> %a, <4 x double> %b) {
 ; FMA3-LABEL:
negated_constant_v4f64_2fmas: ; FMA3: # %bb.0: -; FMA3-NEXT: vmovapd {{.*#+}} ymm2 = <-5.0E-1,u,-2.5E+0,-4.5E+0> +; FMA3-NEXT: vmovapd {{.*#+}} ymm2 = [-5.0E-1,u,-2.5E+0,-4.5E+0] ; FMA3-NEXT: vmovapd %ymm2, %ymm3 ; FMA3-NEXT: vfmadd213pd {{.*#+}} ymm3 = (ymm0 * ymm3) + ymm1 ; FMA3-NEXT: vfnmadd213pd {{.*#+}} ymm2 = -(ymm0 * ymm2) + ymm1 @@ -154,7 +154,7 @@ define <4 x double> @negated_constant_v4f64_2fmas(<4 x double> %a, <4 x double> ; ; FMA4-LABEL: negated_constant_v4f64_2fmas: ; FMA4: # %bb.0: -; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = <-5.0E-1,u,-2.5E+0,-4.5E+0> +; FMA4-NEXT: vmovapd {{.*#+}} ymm2 = [-5.0E-1,u,-2.5E+0,-4.5E+0] ; FMA4-NEXT: vfmaddpd {{.*#+}} ymm3 = (ymm0 * ymm2) + ymm1 ; FMA4-NEXT: vfnmaddpd {{.*#+}} ymm0 = -(ymm0 * ymm2) + ymm1 ; FMA4-NEXT: vaddpd %ymm0, %ymm3, %ymm0 diff --git a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll index abaefaee33ed6..7dbeba96a34cb 100644 --- a/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll +++ b/llvm/test/CodeGen/X86/fold-int-pow2-with-fmul-or-fdiv.ll @@ -1236,7 +1236,7 @@ define <2 x float> @fdiv_pow_shl_cnt_vec_with_expensive_cast(<2 x i64> %cnt) nou ; CHECK-SSE: # %bb.0: ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,2,2,3] ; CHECK-SSE-NEXT: pslld $23, %xmm1 -; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm0 = <1065353216,1065353216,u,u> +; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm0 = [1065353216,1065353216,u,u] ; CHECK-SSE-NEXT: psubd %xmm1, %xmm0 ; CHECK-SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll index 42d6e8139c6aa..933c546503048 100644 --- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll +++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll @@ -1161,7 +1161,7 @@ define <2 x i16> @ustest_f64i16(<2 x double> %x) nounwind { ; SSE-LABEL: ustest_f64i16: ; SSE: # %bb.0: # %entry ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,65535,u,u> +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,u,u] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pcmpgtd %xmm0, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm0 @@ -3803,7 +3803,7 @@ define <2 x i16> @ustest_f64i16_mm(<2 x double> %x) nounwind { ; SSE-LABEL: ustest_f64i16_mm: ; SSE: # %bb.0: # %entry ; SSE-NEXT: cvttpd2dq %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = <65535,65535,u,u> +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,u,u] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pcmpgtd %xmm0, %xmm2 ; SSE-NEXT: pand %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll index d79aff5544b69..9121cf2d654a3 100644 --- a/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll +++ b/llvm/test/CodeGen/X86/hoist-and-by-const-from-lshr-in-eqcmp-zero.ll @@ -690,7 +690,7 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; X86-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[2,3,3,3,4,5,6,7] -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <1,1,u,1> +; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,1,u,1] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: psrld %xmm3, %xmm4 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm1[0,1,1,1,4,5,6,7] @@ -722,7 +722,7 @@ define <4 x i1> @vec_4xi32_nonsplat_undef2_eq(<4 x i32> %x, <4 x i32> %y) nounwi ; X64-SSE2-LABEL: vec_4xi32_nonsplat_undef2_eq: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[2,3,3,3,4,5,6,7] -; 
X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = <1,1,u,1> +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,1,u,1] ; X64-SSE2-NEXT: movdqa %xmm3, %xmm4 ; X64-SSE2-NEXT: psrld %xmm2, %xmm4 ; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm1[0,1,1,1,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll index 1e50e5b811233..7ad2bb712ae93 100644 --- a/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll +++ b/llvm/test/CodeGen/X86/icmp-abs-C-vec.ll @@ -944,7 +944,7 @@ define <4 x i1> @eq_or_to_abs_vec4x16(<4 x i16> %x) { ; ; SSE41-LABEL: eq_or_to_abs_vec4x16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <88,88,88,88,u,u,u,u> +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u] ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 @@ -953,7 +953,7 @@ define <4 x i1> @eq_or_to_abs_vec4x16(<4 x i16> %x) { ; ; SSE2-LABEL: eq_or_to_abs_vec4x16: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = <88,88,88,88,u,u,u,u> +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u] ; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 @@ -989,7 +989,7 @@ define <4 x i8> @eq_or_to_abs_vec4x8_sext(<4 x i8> %x) { ; ; SSE41-LABEL: eq_or_to_abs_vec4x8_sext: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <88,88,88,88,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE41-NEXT: por %xmm1, %xmm0 @@ -997,7 +997,7 @@ define <4 x i8> @eq_or_to_abs_vec4x8_sext(<4 x i8> %x) { ; ; SSE2-LABEL: eq_or_to_abs_vec4x8_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = <88,88,88,88,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 @@ -1038,7 +1038,7 @@ define <4 x i1> @ne_and_to_abs_vec4x8(<4 x i8> %x) { ; ; SSE41-LABEL: ne_and_to_abs_vec4x8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <88,88,88,88,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -1049,7 +1049,7 @@ define <4 x i1> @ne_and_to_abs_vec4x8(<4 x i8> %x) { ; ; SSE2-LABEL: ne_and_to_abs_vec4x8: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = <88,88,88,88,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 @@ -1093,7 +1093,7 @@ define <4 x i16> @ne_and_to_abs_vec4x16_sext(<4 x i16> %x) { ; ; SSE41-LABEL: ne_and_to_abs_vec4x16_sext: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <88,88,88,88,u,u,u,u> +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u] ; SSE41-NEXT: pcmpeqw %xmm0, %xmm1 ; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE41-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 @@ -1103,7 +1103,7 @@ define <4 x i16> @ne_and_to_abs_vec4x16_sext(<4 x i16> %x) { ; ; SSE2-LABEL: ne_and_to_abs_vec4x16_sext: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = <88,88,88,88,u,u,u,u> +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [88,88,88,88,u,u,u,u] ; SSE2-NEXT: pcmpeqw %xmm0, %xmm1 ; 
SSE2-NEXT: pcmpeqd %xmm2, %xmm2 ; SSE2-NEXT: pcmpeqw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll index 07787f3851bd9..418d632480328 100644 --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -27,25 +27,25 @@ define <16 x i8> @elt0_v16i8(i8 %x) { ; ; X86-SSE4-LABEL: elt0_v16i8: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: movdqa {{.*#+}} xmm0 = +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm0 = [u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; X86-SSE4-NEXT: pinsrb $0, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE4-NEXT: retl ; ; X64-SSE4-LABEL: elt0_v16i8: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa {{.*#+}} xmm0 = +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm0 = [u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; X64-SSE4-NEXT: pinsrb $0, %edi, %xmm0 ; X64-SSE4-NEXT: retq ; ; X86-AVX-LABEL: elt0_v16i8: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; X86-AVX-NEXT: vpinsrb $0, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: elt0_v16i8: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] ; X64-AVX-NEXT: vpinsrb $0, %edi, %xmm0, %xmm0 ; X64-AVX-NEXT: retq %ins = insertelement <16 x i8> , i8 %x, i32 0 @@ -55,25 +55,25 @@ define <16 x i8> @elt0_v16i8(i8 %x) { define <8 x i16> @elt5_v8i16(i16 %x) { ; X86-SSE-LABEL: elt5_v8i16: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = <42,1,2,3,4,u,6,7> +; X86-SSE-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] ; X86-SSE-NEXT: pinsrw $5, {{[0-9]+}}(%esp), %xmm0 ; X86-SSE-NEXT: retl ; ; X64-SSE-LABEL: elt5_v8i16: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = <42,1,2,3,4,u,6,7> +; X64-SSE-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] ; X64-SSE-NEXT: pinsrw $5, %edi, %xmm0 ; X64-SSE-NEXT: retq ; ; X86-AVX-LABEL: elt5_v8i16: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <42,1,2,3,4,u,6,7> +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] ; X86-AVX-NEXT: vpinsrw $5, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: elt5_v8i16: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <42,1,2,3,4,u,6,7> +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,3,4,u,6,7] ; X64-AVX-NEXT: vpinsrw $5, %edi, %xmm0, %xmm0 ; X64-AVX-NEXT: retq %ins = insertelement <8 x i16> , i16 %x, i32 5 @@ -84,7 +84,7 @@ define <4 x i32> @elt3_v4i32(i32 %x) { ; X86-SSE2-LABEL: elt3_v4i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u> +; X86-SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,u] ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X86-SSE2-NEXT: retl @@ -92,32 +92,32 @@ define <4 x i32> @elt3_v4i32(i32 %x) { ; X64-SSE2-LABEL: elt3_v4i32: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movd %edi, %xmm1 -; X64-SSE2-NEXT: movaps {{.*#+}} xmm0 = <42,1,2,u> +; X64-SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,u] ; X64-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,3] ; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,0] ; X64-SSE2-NEXT: retq ; ; X86-SSE4-LABEL: elt3_v4i32: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: movdqa {{.*#+}} xmm0 = <42,1,2,u> +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,u] ; X86-SSE4-NEXT: pinsrd $3, 
{{[0-9]+}}(%esp), %xmm0 ; X86-SSE4-NEXT: retl ; ; X64-SSE4-LABEL: elt3_v4i32: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa {{.*#+}} xmm0 = <42,1,2,u> +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm0 = [42,1,2,u] ; X64-SSE4-NEXT: pinsrd $3, %edi, %xmm0 ; X64-SSE4-NEXT: retq ; ; X86-AVX-LABEL: elt3_v4i32: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <42,1,2,u> +; X86-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,u] ; X86-AVX-NEXT: vpinsrd $3, {{[0-9]+}}(%esp), %xmm0, %xmm0 ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: elt3_v4i32: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = <42,1,2,u> +; X64-AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [42,1,2,u] ; X64-AVX-NEXT: vpinsrd $3, %edi, %xmm0, %xmm0 ; X64-AVX-NEXT: retq %ins = insertelement <4 x i32> , i32 %x, i32 3 @@ -134,13 +134,13 @@ define <2 x i64> @elt0_v2i64(i64 %x) { ; X64-SSE2-LABEL: elt0_v2i64: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movq %rdi, %xmm1 -; X64-SSE2-NEXT: movapd {{.*#+}} xmm0 = +; X64-SSE2-NEXT: movapd {{.*#+}} xmm0 = [u,1] ; X64-SSE2-NEXT: movsd {{.*#+}} xmm0 = xmm1[0],xmm0[1] ; X64-SSE2-NEXT: retq ; ; X64-SSE4-LABEL: elt0_v2i64: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa {{.*#+}} xmm0 = +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm0 = [u,1] ; X64-SSE4-NEXT: pinsrq $0, %rdi, %xmm0 ; X64-SSE4-NEXT: retq ; @@ -175,7 +175,7 @@ define <2 x i64> @elt0_v2i64(i64 %x) { define <4 x float> @elt1_v4f32(float %x) { ; X86-SSE2-LABEL: elt1_v4f32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X86-SSE2-NEXT: movaps {{.*#+}} xmm1 = [4.2E+1,u,2.0E+0,3.0E+0] ; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] @@ -183,33 +183,33 @@ define <4 x float> @elt1_v4f32(float %x) { ; ; X64-SSE2-LABEL: elt1_v4f32: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X64-SSE2-NEXT: movaps {{.*#+}} xmm1 = [4.2E+1,u,2.0E+0,3.0E+0] ; X64-SSE2-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,0],xmm1[2,3] ; X64-SSE2-NEXT: retq ; ; X86-SSE4-LABEL: elt1_v4f32: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: movaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> +; X86-SSE4-NEXT: movaps {{.*#+}} xmm0 = [4.2E+1,u,2.0E+0,3.0E+0] ; X86-SSE4-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; X86-SSE4-NEXT: retl ; ; X64-SSE4-LABEL: elt1_v4f32: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X64-SSE4-NEXT: movaps {{.*#+}} xmm1 = [4.2E+1,u,2.0E+0,3.0E+0] ; X64-SSE4-NEXT: insertps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[2,3] ; X64-SSE4-NEXT: movaps %xmm1, %xmm0 ; X64-SSE4-NEXT: retq ; ; X86-AVX-LABEL: elt1_v4f32: ; X86-AVX: # %bb.0: -; X86-AVX-NEXT: vmovaps {{.*#+}} xmm0 = <4.2E+1,u,2.0E+0,3.0E+0> +; X86-AVX-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,u,2.0E+0,3.0E+0] ; X86-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3] ; X86-AVX-NEXT: retl ; ; X64-AVX-LABEL: elt1_v4f32: ; X64-AVX: # %bb.0: -; X64-AVX-NEXT: vmovaps {{.*#+}} xmm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X64-AVX-NEXT: vmovaps {{.*#+}} xmm1 = [4.2E+1,u,2.0E+0,3.0E+0] ; X64-AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] ; X64-AVX-NEXT: retq %ins = insertelement <4 x float> , float %x, i32 1 @@ -219,13 +219,13 @@ define <4 x float> @elt1_v4f32(float %x) { define <2 x double> @elt1_v2f64(double %x) { ; X86-SSE-LABEL: elt1_v2f64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = <4.2E+1,u> +; X86-SSE-NEXT: movaps 
{{.*#+}} xmm0 = [4.2E+1,u] ; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-SSE-NEXT: retl ; ; X64-SSE-LABEL: elt1_v2f64: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movaps {{.*#+}} xmm1 = <4.2E+1,u> +; X64-SSE-NEXT: movaps {{.*#+}} xmm1 = [4.2E+1,u] ; X64-SSE-NEXT: movlhps {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; X64-SSE-NEXT: movaps %xmm1, %xmm0 ; X64-SSE-NEXT: retq @@ -251,7 +251,7 @@ define <8 x i32> @elt7_v8i32(i32 %x) { ; X86-SSE2-LABEL: elt7_v8i32: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u> +; X86-SSE2-NEXT: movaps {{.*#+}} xmm1 = [4,5,6,u] ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; X86-SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] @@ -260,7 +260,7 @@ define <8 x i32> @elt7_v8i32(i32 %x) { ; X64-SSE2-LABEL: elt7_v8i32: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movd %edi, %xmm0 -; X64-SSE2-NEXT: movaps {{.*#+}} xmm1 = <4,5,6,u> +; X64-SSE2-NEXT: movaps {{.*#+}} xmm1 = [4,5,6,u] ; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] ; X64-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[2,0] ; X64-SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] @@ -268,14 +268,14 @@ define <8 x i32> @elt7_v8i32(i32 %x) { ; ; X86-SSE4-LABEL: elt7_v8i32: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: movdqa {{.*#+}} xmm1 = <4,5,6,u> +; X86-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [4,5,6,u] ; X86-SSE4-NEXT: pinsrd $3, {{[0-9]+}}(%esp), %xmm1 ; X86-SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] ; X86-SSE4-NEXT: retl ; ; X64-SSE4-LABEL: elt7_v8i32: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa {{.*#+}} xmm1 = <4,5,6,u> +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm1 = [4,5,6,u] ; X64-SSE4-NEXT: pinsrd $3, %edi, %xmm1 ; X64-SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1,2,3] ; X64-SSE4-NEXT: retq @@ -314,7 +314,7 @@ define <8 x i32> @elt7_v8i32(i32 %x) { define <8 x float> @elt6_v8f32(float %x) { ; X86-SSE2-LABEL: elt6_v8f32: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0> +; X86-SSE2-NEXT: movaps {{.*#+}} xmm1 = [4.0E+0,5.0E+0,u,7.0E+0] ; X86-SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] ; X86-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] @@ -323,7 +323,7 @@ define <8 x float> @elt6_v8f32(float %x) { ; ; X64-SSE2-LABEL: elt6_v8f32: ; X64-SSE2: # %bb.0: -; X64-SSE2-NEXT: movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0> +; X64-SSE2-NEXT: movaps {{.*#+}} xmm1 = [4.0E+0,5.0E+0,u,7.0E+0] ; X64-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[3,0] ; X64-SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0,2] ; X64-SSE2-NEXT: movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0] @@ -331,14 +331,14 @@ define <8 x float> @elt6_v8f32(float %x) { ; ; X86-SSE4-LABEL: elt6_v8f32: ; X86-SSE4: # %bb.0: -; X86-SSE4-NEXT: movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0> +; X86-SSE4-NEXT: movaps {{.*#+}} xmm1 = [4.0E+0,5.0E+0,u,7.0E+0] ; X86-SSE4-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],mem[0],xmm1[3] ; X86-SSE4-NEXT: movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0] ; X86-SSE4-NEXT: retl ; ; X64-SSE4-LABEL: elt6_v8f32: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movaps {{.*#+}} xmm1 = <4.0E+0,5.0E+0,u,7.0E+0> +; X64-SSE4-NEXT: movaps {{.*#+}} xmm1 = [4.0E+0,5.0E+0,u,7.0E+0] ; X64-SSE4-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1],xmm0[0],xmm1[3] ; X64-SSE4-NEXT: movaps {{.*#+}} xmm0 = [4.2E+1,1.0E+0,2.0E+0,3.0E+0] ; X64-SSE4-NEXT: retq @@ -385,7 +385,7 @@ define <8 x i64> @elt5_v8i64(i64 
%x) { ; X64-SSE2-LABEL: elt5_v8i64: ; X64-SSE2: # %bb.0: ; X64-SSE2-NEXT: movq %rdi, %xmm0 -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = <4,u> +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [4,u] ; X64-SSE2-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; X64-SSE2-NEXT: movaps {{.*#+}} xmm0 = [42,1] ; X64-SSE2-NEXT: movaps {{.*#+}} xmm1 = [2,3] @@ -394,7 +394,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; ; X64-SSE4-LABEL: elt5_v8i64: ; X64-SSE4: # %bb.0: -; X64-SSE4-NEXT: movdqa {{.*#+}} xmm2 = <4,u> +; X64-SSE4-NEXT: movdqa {{.*#+}} xmm2 = [4,u] ; X64-SSE4-NEXT: pinsrq $1, %rdi, %xmm2 ; X64-SSE4-NEXT: movaps {{.*#+}} xmm0 = [42,1] ; X64-SSE4-NEXT: movaps {{.*#+}} xmm1 = [2,3] @@ -412,7 +412,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; ; X64-AVX1-LABEL: elt5_v8i64: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovdqa {{.*#+}} ymm0 = <4,u,6,7> +; X64-AVX1-NEXT: vmovdqa {{.*#+}} ymm0 = [4,u,6,7] ; X64-AVX1-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 ; X64-AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,1,2,3] @@ -429,7 +429,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; ; X64-AVX2-LABEL: elt5_v8i64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = <4,u,6,7> +; X64-AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [4,u,6,7] ; X64-AVX2-NEXT: vpinsrq $1, %rdi, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,1,2,3] @@ -449,7 +449,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; X64-AVX512F: # %bb.0: ; X64-AVX512F-NEXT: vmovq %rdi, %xmm1 ; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,4,8,6,7] -; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <42,1,2,3,4,u,6,7> +; X64-AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,1,2,3,4,u,6,7] ; X64-AVX512F-NEXT: vpermt2q %zmm1, %zmm2, %zmm0 ; X64-AVX512F-NEXT: retq %ins = insertelement <8 x i64> , i64 %x, i32 5 @@ -459,7 +459,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { define <8 x double> @elt1_v8f64(double %x) { ; X86-SSE-LABEL: elt1_v8f64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = <4.2E+1,u> +; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [4.2E+1,u] ; X86-SSE-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; X86-SSE-NEXT: movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0] ; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0] @@ -468,7 +468,7 @@ define <8 x double> @elt1_v8f64(double %x) { ; ; X64-SSE-LABEL: elt1_v8f64: ; X64-SSE: # %bb.0: -; X64-SSE-NEXT: movaps {{.*#+}} xmm4 = <4.2E+1,u> +; X64-SSE-NEXT: movaps {{.*#+}} xmm4 = [4.2E+1,u] ; X64-SSE-NEXT: movlhps {{.*#+}} xmm4 = xmm4[0],xmm0[0] ; X64-SSE-NEXT: movaps {{.*#+}} xmm1 = [2.0E+0,3.0E+0] ; X64-SSE-NEXT: movaps {{.*#+}} xmm2 = [4.0E+0,5.0E+0] @@ -478,7 +478,7 @@ define <8 x double> @elt1_v8f64(double %x) { ; ; X86-AVX1-LABEL: elt1_v8f64: ; X86-AVX1: # %bb.0: -; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = <4.2E+1,u,2.0E+0,3.0E+0> +; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [4.2E+1,u,2.0E+0,3.0E+0] ; X86-AVX1-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1] ; X86-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] @@ -486,7 +486,7 @@ define <8 x double> @elt1_v8f64(double %x) { ; ; X64-AVX1-LABEL: elt1_v8f64: ; X64-AVX1: # %bb.0: -; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X64-AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.2E+1,u,2.0E+0,3.0E+0] ; X64-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64-AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X64-AVX1-NEXT: vmovaps 
{{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] @@ -494,7 +494,7 @@ define <8 x double> @elt1_v8f64(double %x) { ; ; X86-AVX2-LABEL: elt1_v8f64: ; X86-AVX2: # %bb.0: -; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = <4.2E+1,u,2.0E+0,3.0E+0> +; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [4.2E+1,u,2.0E+0,3.0E+0] ; X86-AVX2-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1] ; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] @@ -502,7 +502,7 @@ define <8 x double> @elt1_v8f64(double %x) { ; ; X64-AVX2-LABEL: elt1_v8f64: ; X64-AVX2: # %bb.0: -; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <4.2E+1,u,2.0E+0,3.0E+0> +; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.2E+1,u,2.0E+0,3.0E+0] ; X64-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,5.0E+0,6.0E+0,7.0E+0] @@ -510,14 +510,14 @@ define <8 x double> @elt1_v8f64(double %x) { ; ; X86-AVX512F-LABEL: elt1_v8f64: ; X86-AVX512F: # %bb.0: -; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X86-AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X86-AVX512F-NEXT: vmovhps {{.*#+}} xmm1 = xmm0[0,1],mem[0,1] ; X86-AVX512F-NEXT: vinsertf32x4 $0, %xmm1, %zmm0, %zmm0 ; X86-AVX512F-NEXT: retl ; ; X64-AVX512F-LABEL: elt1_v8f64: ; X64-AVX512F: # %bb.0: -; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0> +; X64-AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [4.2E+1,u,2.0E+0,3.0E+0,4.0E+0,5.0E+0,6.0E+0,7.0E+0] ; X64-AVX512F-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X64-AVX512F-NEXT: vinsertf32x4 $0, %xmm0, %zmm1, %zmm0 ; X64-AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index 11803e32ad437..f78646afccb34 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -1395,7 +1395,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-LABEL: truncstore_v4i64_v4i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: movdqa {{.*#+}} xmm4 = <0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pshufb %xmm4, %xmm1 ; SSE4-NEXT: pshufb %xmm4, %xmm0 ; SSE4-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index c8c5afbd579df..17a82ce5f6209 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -330,7 +330,7 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { ; AVX2-NEXT: vmulss %xmm3, %xmm8, %xmm3 ; AVX2-NEXT: vaddss %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm1 -; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = <0,1,2,4,5,6,u,u> +; AVX2-NEXT: vmovaps {{.*#+}} ymm3 = [0,1,2,4,5,6,u,u] ; AVX2-NEXT: vpermps %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vbroadcastsd %xmm0, %ymm0 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] @@ -394,7 +394,7 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { ; AVX512F-NEXT: vaddss %xmm1, %xmm2, %xmm1 ; AVX512F-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX512F-NEXT: vinsertf32x4 $1, %xmm7, %zmm6, %zmm2 -; 
AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = <0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u> +; AVX512F-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 ; AVX512F-NEXT: retq ; @@ -453,7 +453,7 @@ define <9 x float> @test_mul3x3_f32(<9 x float> %a0, <9 x float> %a1) nounwind { ; AVX512VL-NEXT: vaddss %xmm1, %xmm2, %xmm1 ; AVX512VL-NEXT: vinsertps {{.*#+}} xmm1 = xmm0[0,1],xmm1[0],xmm0[3] ; AVX512VL-NEXT: vinsertf32x4 $1, %xmm5, %zmm3, %zmm2 -; AVX512VL-NEXT: vmovaps {{.*#+}} zmm0 = <0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u> +; AVX512VL-NEXT: vmovaps {{.*#+}} zmm0 = [0,1,2,4,5,6,16,17,18,u,u,u,u,u,u,u] ; AVX512VL-NEXT: vpermi2ps %zmm1, %zmm2, %zmm0 ; AVX512VL-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/oddshuffles.ll b/llvm/test/CodeGen/X86/oddshuffles.ll index f0fb89496aa1e..de192aa038e5b 100644 --- a/llvm/test/CodeGen/X86/oddshuffles.ll +++ b/llvm/test/CodeGen/X86/oddshuffles.ll @@ -527,7 +527,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, ptr %p) nounwind { ; ; AVX2-SLOW-LABEL: v12i32: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,u,1,5,u,2,6] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vbroadcastsd %xmm1, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] @@ -542,7 +542,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, ptr %p) nounwind { ; ; AVX2-FAST-ALL-LABEL: v12i32: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,u,1,5,u,2,6] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm2 ; AVX2-FAST-ALL-NEXT: vbroadcastsd %xmm1, %ymm3 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] @@ -557,7 +557,7 @@ define void @v12i32(<8 x i32> %a, <8 x i32> %b, ptr %p) nounwind { ; ; AVX2-FAST-PERLANE-LABEL: v12i32: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,u,1,5,u,2,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd %xmm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] @@ -1308,7 +1308,7 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 @@ -1327,7 +1327,7 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-FAST-ALL-NEXT: vmovdqu (%rdx), %xmm1 ; AVX2-FAST-ALL-NEXT: vmovdqu (%rcx), %xmm2 ; AVX2-FAST-ALL-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX2-FAST-ALL-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX2-FAST-ALL-NEXT: 
vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6] ; AVX2-FAST-ALL-NEXT: vpermd %ymm3, %ymm5, %ymm3 @@ -1353,7 +1353,7 @@ define void @interleave_24i16_in(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 @@ -1507,18 +1507,18 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] @@ -1537,21 +1537,21 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-ALL-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7] ; AVX2-FAST-ALL-NEXT: # ymm5 = mem[0,1,0,1] ; 
AVX2-FAST-ALL-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-ALL-NEXT: vmovups %ymm3, (%rsi) @@ -1568,18 +1568,18 @@ define void @interleave_24i32_out(ptr %p, ptr %q1, ptr %q2, ptr %q3) nounwind { ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] diff --git a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll index 83dd0b5b9dbfd..5052a08177d7f 100644 --- a/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll +++ b/llvm/test/CodeGen/X86/omit-urem-of-power-of-two-or-zero-when-comparing-with-zero.ll @@ -149,7 +149,7 @@ define <4 x i1> @p5_vector_urem_by_const__nonsplat(<4 x i32> %x, <4 x i32> %y) { ; SSE4: # %bb.0: ; SSE4-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE4-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE4-NEXT: movdqa {{.*#+}} xmm1 = <1,u,2147483648,u> +; SSE4-NEXT: movdqa {{.*#+}} xmm1 = [1,u,2147483648,u] ; SSE4-NEXT: pmuludq %xmm0, %xmm1 ; SSE4-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; SSE4-NEXT: psrlq $32, %xmm1 diff --git a/llvm/test/CodeGen/X86/peephole-fold-movsd.ll b/llvm/test/CodeGen/X86/peephole-fold-movsd.ll index b44d0ff6dafd1..c0a6e00ec695e 100644 --- a/llvm/test/CodeGen/X86/peephole-fold-movsd.ll +++ b/llvm/test/CodeGen/X86/peephole-fold-movsd.ll @@ -18,7 +18,7 @@ define dso_local void @foo1(double %a.coerce0, double %a.coerce1, double %b.coer ; CHECK-NEXT: movq %rsp, %rdi ; CHECK-NEXT: callq foo3@PLT ; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movapd {{.*#+}} xmm1 = <1.0E+0,u> +; CHECK-NEXT: movapd {{.*#+}} xmm1 = [1.0E+0,u] ; CHECK-NEXT: movhpd {{.*#+}} xmm1 = xmm1[0],mem[0] ; CHECK-NEXT: addpd %xmm0, %xmm1 ; 
CHECK-NEXT: movapd %xmm1, g(%rip) diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll index 6b0543315d867..38e45595a8846 100644 --- a/llvm/test/CodeGen/X86/pr63108.ll +++ b/llvm/test/CodeGen/X86/pr63108.ll @@ -11,7 +11,7 @@ define i32 @PR63108() { ; SSE-NEXT: testb %al, %al ; SSE-NEXT: je .LBB0_2 ; SSE-NEXT: # %bb.1: -; SSE-NEXT: movdqa {{.*#+}} xmm0 = <251,223,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE-NEXT: movdqa {{.*#+}} xmm0 = [251,223,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE-NEXT: jmp .LBB0_5 ; SSE-NEXT: .LBB0_2: # %vector.body.preheader ; SSE-NEXT: pxor %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index d2da4c0f3e86a..4f7f44d1d7c14 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -208,7 +208,7 @@ define <4 x i32> @usubsat_custom(<4 x i32> %x) nounwind { ; ; SSE41-LABEL: usubsat_custom: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <2147483648,2147483648,2147483648,u> +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,2147483648,2147483648,u] ; SSE41-NEXT: pmaxud %xmm1, %xmm0 ; SSE41-NEXT: psubd %xmm1, %xmm0 ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll index acf123bde41f0..839c972b23dba 100644 --- a/llvm/test/CodeGen/X86/sext-vsetcc.ll +++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll @@ -216,7 +216,7 @@ define <4 x i32> @cmp_ult_load_const(ptr %x) nounwind { ; SSE-LABEL: cmp_ult_load_const: ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movdqa {{.*#+}} xmm1 = <42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE-NEXT: pmaxub %xmm0, %xmm1 ; SSE-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -243,7 +243,7 @@ define <3 x i32> @cmp_ult_load_const_bad_type(ptr %x) nounwind { ; SSE-LABEL: cmp_ult_load_const_bad_type: ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movdqa {{.*#+}} xmm1 = <42,214,0,u,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE-NEXT: movdqa {{.*#+}} xmm1 = [42,214,0,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE-NEXT: pmaxub %xmm0, %xmm1 ; SSE-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 3d80c1554b6c3..8dd06e0d848d3 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1745,7 +1745,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl c, %edx ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,u,u,u,u,u,u] ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 @@ -1768,7 +1768,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) { ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: movq c(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = <0,65535,u,u,u,u,u,u> +; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,u,u,u,u,u,u] ; X64-SSE-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/sink-addsub-of-const.ll 
b/llvm/test/CodeGen/X86/sink-addsub-of-const.ll index 9f08cc4cba32c..cc46f04b8dabe 100644 --- a/llvm/test/CodeGen/X86/sink-addsub-of-const.ll +++ b/llvm/test/CodeGen/X86/sink-addsub-of-const.ll @@ -448,7 +448,7 @@ define <4 x i32> @vec_sink_sub_of_const_to_sub2(<4 x i32> %a, <4 x i32> %b) { define <4 x i32> @vec_sink_sub_from_const_to_sub(<4 x i32> %a, <4 x i32> %b) { ; ALL-LABEL: vec_sink_sub_from_const_to_sub: ; ALL: # %bb.0: -; ALL-NEXT: movdqa {{.*#+}} xmm2 = <42,24,u,46> +; ALL-NEXT: movdqa {{.*#+}} xmm2 = [42,24,u,46] ; ALL-NEXT: paddd %xmm1, %xmm0 ; ALL-NEXT: psubd %xmm0, %xmm2 ; ALL-NEXT: movdqa %xmm2, %xmm0 diff --git a/llvm/test/CodeGen/X86/slow-pmulld.ll b/llvm/test/CodeGen/X86/slow-pmulld.ll index 8e330c3bfc676..913ae195c3631 100644 --- a/llvm/test/CodeGen/X86/slow-pmulld.ll +++ b/llvm/test/CodeGen/X86/slow-pmulld.ll @@ -317,7 +317,7 @@ define <16 x i32> @test_mul_v16i32_v16i8(<16 x i8> %A) { define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) { ; SLM-LABEL: test_mul_v4i32_v4i16: ; SLM: # %bb.0: -; SLM-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> +; SLM-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,u,u,u,u] ; SLM-NEXT: movdqa %xmm0, %xmm2 ; SLM-NEXT: pmulhuw %xmm1, %xmm2 ; SLM-NEXT: pmullw %xmm1, %xmm0 @@ -326,7 +326,7 @@ define <4 x i32> @test_mul_v4i32_v4i16(<4 x i16> %A) { ; ; SLOW-LABEL: test_mul_v4i32_v4i16: ; SLOW: # %bb.0: -; SLOW-NEXT: movdqa {{.*#+}} xmm1 = <18778,18778,18778,18778,u,u,u,u> +; SLOW-NEXT: movdqa {{.*#+}} xmm1 = [18778,18778,18778,18778,u,u,u,u] ; SLOW-NEXT: movdqa %xmm0, %xmm2 ; SLOW-NEXT: pmulhuw %xmm1, %xmm2 ; SLOW-NEXT: pmullw %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 0d28dfc68f5bf..56d1fef5f6d6d 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -555,7 +555,7 @@ define <4 x i32> @test_srem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1098,7 +1098,7 @@ define <4 x i32> @test_srem_even_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE2-LABEL: test_srem_even_INT_MIN: ; CHECK-SSE2: # %bb.0: ; CHECK-SSE2-NEXT: pxor %xmm2, %xmm2 -; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = <3067833783,u,1,u> +; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [3067833783,u,1,u] ; CHECK-SSE2-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] ; CHECK-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,3,3] @@ -1357,7 +1357,7 @@ define <4 x i32> @test_srem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; CHECK-SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 diff --git a/llvm/test/CodeGen/X86/sse2.ll b/llvm/test/CodeGen/X86/sse2.ll index 231f274db83cd..cf5f527b16114 100644 --- 
a/llvm/test/CodeGen/X86/sse2.ll +++ b/llvm/test/CodeGen/X86/sse2.ll @@ -597,7 +597,7 @@ define <2 x double> @test16(ptr nocapture %srcA, ptr nocapture %dst) { define fastcc void @test17() nounwind { ; X86-SSE-LABEL: test17: ; X86-SSE: # %bb.0: # %entry -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = +; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [u,u,32768,32768] ; X86-SSE-NEXT: movaps %xmm0, (%eax) ; X86-SSE-NEXT: retl ; @@ -609,7 +609,7 @@ define fastcc void @test17() nounwind { ; ; X64-SSE-LABEL: test17: ; X64-SSE: # %bb.0: # %entry -; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = +; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [u,u,32768,32768] ; X64-SSE-NEXT: movaps %xmm0, (%rax) ; X64-SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index f2819bcbfe8a0..f677e6737987e 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -137,7 +137,7 @@ define <3 x i1> @test_urem_vec(<3 x i11> %X) nounwind { ; SSE2-NEXT: movd %edx, %xmm0 ; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] ; SSE2-NEXT: psubd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = <683,u,819,u> +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [683,u,819,u] ; SSE2-NEXT: pmuludq %xmm1, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,2,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll index 7846f0e8f6e03..484ae2bf95602 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-vec-nonsplat.ll @@ -483,7 +483,7 @@ define <4 x i32> @test_urem_odd_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -915,7 +915,7 @@ define <4 x i32> @test_urem_odd_INT_MIN(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_INT_MIN: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,2,u> +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,2,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1145,7 +1145,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo(<4 x i32> %X) nounwind { ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [1,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1797,7 +1797,7 @@ define <4 x i32> @test_urem_odd_allones_and_poweroftwo_and_one(<4 x i32> %X) nou ; CHECK-SSE41-LABEL: test_urem_odd_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <1,u,268435456,u> +; CHECK-SSE41-NEXT: 
movdqa {{.*#+}} xmm1 = [1,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 @@ -1865,7 +1865,7 @@ define <4 x i32> @test_urem_even_allones_and_poweroftwo_and_one(<4 x i32> %X) no ; CHECK-SSE41-LABEL: test_urem_even_allones_and_poweroftwo_and_one: ; CHECK-SSE41: # %bb.0: ; CHECK-SSE41-NEXT: pmulld {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = <2147483648,u,268435456,u> +; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [2147483648,u,268435456,u] ; CHECK-SSE41-NEXT: pmuludq %xmm0, %xmm1 ; CHECK-SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7] ; CHECK-SSE41-NEXT: psrlq $32, %xmm1 diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index cb19e202e7947..1d81cb3773080 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -1907,7 +1907,7 @@ define <2 x i64> @fptosi_2f64_to_2i64_const() { define <4 x i32> @fptosi_2f64_to_2i32_const() { ; SSE-LABEL: fptosi_2f64_to_2i32_const: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = <4294967295,1,u,u> +; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,u,u] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f64_to_2i32_const: @@ -1966,7 +1966,7 @@ define <2 x i64> @fptoui_2f64_to_2i64_const() { define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i32_const: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = <2,4,u,u> +; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,u,u] ; SSE-NEXT: retq ; ; AVX-LABEL: fptoui_2f64_to_2i32_const: diff --git a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll index 502dc9c0b918b..73c2f4ca5b10b 100644 --- a/llvm/test/CodeGen/X86/vector-blend.ll +++ b/llvm/test/CodeGen/X86/vector-blend.ll @@ -79,7 +79,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { ; SSE41-LABEL: vsel_4xi8: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-fshl-512.ll b/llvm/test/CodeGen/X86/vector-fshl-512.ll index 0724d879f557d..558e77ede0186 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-512.ll @@ -806,7 +806,7 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,u] ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7],ymm4[8,9,10,11,12,13,14],ymm2[15] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] @@ -827,7 +827,7 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-NEXT: vpsrlw $1, %ymm2, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = <2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,u> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = 
[2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768,u] ; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7],ymm4[8,9,10,11,12,13,14],ymm2[15] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3],ymm2[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index eb2df0dcda98a..1f51f02a197ac 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -1868,7 +1868,7 @@ define <8 x i16> @constant_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y) nounwind { ; ; SSE41-LABEL: constant_funnnel_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [u,32768,16384,8192,4096,2048,1024,512] ; SSE41-NEXT: pmulhuw %xmm1, %xmm2 ; SSE41-NEXT: pblendw {{.*#+}} xmm2 = xmm1[0],xmm2[1,2,3,4,5,6,7] ; SSE41-NEXT: paddw %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-512.ll b/llvm/test/CodeGen/X86/vector-fshr-512.ll index 1c6646152ab1f..29c258b857eec 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-512.ll @@ -815,7 +815,7 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin ; AVX512F-LABEL: constant_funnnel_v32i16: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] ; AVX512F-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] @@ -834,7 +834,7 @@ define <32 x i16> @constant_funnnel_v32i16(<32 x i16> %x, <32 x i16> %y) nounwin ; AVX512VL-LABEL: constant_funnnel_v32i16: ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] ; AVX512VL-NEXT: vpmulhuw %ymm3, %ymm2, %ymm4 ; AVX512VL-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm4[1,2,3,4,5,6,7] ; AVX512VL-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm4[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll index 04fd6d9300c18..8883b3b87df7a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-2.ll @@ -211,7 +211,7 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm3 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] @@ -229,7 +229,7 @@ define void @load_i16_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm1, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm2[0,2],ymm3[0,2],ymm2[4,6],ymm3[4,6] @@ -383,9 +383,9 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm6 = ymm6[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15] ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm5 = ymm5[0,2,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] @@ -406,7 +406,7 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm6 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] @@ -415,9 +415,9 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3] @@ -438,7 +438,7 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm2, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm6[0,2],ymm5[0,2],ymm6[4,6],ymm5[4,6] @@ 
-447,9 +447,9 @@ define void @load_i16_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm4
 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,2],ymm6[0,2],ymm4[4,6],ymm6[4,6]
 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,2,1,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm2, %ymm2
 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,1,3]
@@ -775,9 +775,9 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm12 = ymm12[0,1,2,3,4,6,6,7,8,9,10,11,12,14,14,15]
 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm12[0,2],ymm11[0,2],ymm12[4,6],ymm11[4,6]
 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,2,1,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
 ; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm9, %ymm9
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm8, %ymm8
 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,1,3]
@@ -814,7 +814,7 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6
 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm7
 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm8, %ymm2
 ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm10
 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm2[0,2],ymm10[4,6],ymm2[4,6]
@@ -831,9 +831,9 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
 ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm9
 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2],ymm12[0,2],ymm9[4,6],ymm12[4,6]
 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm8, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3]
@@ -870,7 +870,7 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm6
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm7
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,4,5,u,u,u,u,8,9,12,13,u,u,u,u,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm8, %ymm2
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm10
 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,2],ymm2[0,2],ymm10[4,6],ymm2[4,6]
@@ -887,9 +887,9 @@ define void @load_i16_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) no
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm0, %ymm9
 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[0,2],ymm12[0,2],ymm9[4,6],ymm12[4,6]
 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm9 = ymm9[0,2,1,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm8, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm7, %ymm7
 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5],ymm8[6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,1,3]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
index 384e9cf36cf85..d2bef6b234e38 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll
@@ -575,7 +575,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY: # %bb.0:
 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0
 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
 ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1]
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13,14],ymm3[15]
@@ -588,7 +588,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0,1,2],ymm5[3,4,5,6,7],ymm2[8,9,10],ymm5[11,12,13,14,15]
 ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,5,4,7]
 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm5[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255]
 ; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm5
 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm6 = ymm5[2,3,0,1]
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
@@ -599,11 +599,11 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2],ymm6[3,4,5,6,7],ymm5[8,9,10],ymm6[11,12,13,14,15]
 ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,6,7,4]
 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 =
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u]
 ; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX2-ONLY-NEXT: vpshufb %ymm1, %ymm0, %ymm0
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2,3],xmm3[4],xmm4[5,6],xmm3[7]
 ; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm1
@@ -647,7 +647,7 @@ define void @load_i16_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-NEXT: vpternlogq $202, %ymm2, %ymm1, %ymm0
 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1
 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3],ymm0[4,5],ymm1[6],ymm0[7],ymm1[8],ymm0[9,10],ymm1[11],ymm0[12,13],ymm1[14],ymm0[15]
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,10,11,0,1,6,7,12,13,2,3,8,9,14,15,20,21,26,27,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-NEXT: vpshufb %ymm1, %ymm0, %ymm0
 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0],xmm4[1],xmm5[2,3],xmm4[4],xmm5[5,6],xmm4[7]
 ; AVX512F-NEXT: vpshufb %xmm1, %xmm2, %xmm1
@@ -939,7 +939,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,3,2,3]
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4,5,6,7]
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0],xmm8[1],xmm5[2,3],xmm8[4],xmm5[5,6],xmm8[7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm4, %xmm4
 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,1,2,1]
 ; AVX1-ONLY-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,5,6,5]
@@ -973,7 +973,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm11[3,4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm15 = xmm5[0,1],xmm8[2],xmm5[3,4],xmm8[5],xmm5[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm15
 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13]
 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
@@ -991,7 +991,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15]
 ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0]
 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm6[5,6,7]
 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm5
@@ -1001,7 +1001,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vpblendw $146, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm6 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm6 = xmm6[0],mem[1],xmm6[2,3],mem[4],xmm6[5,6],mem[7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 =
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm6
 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,2,3]
 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,1,2,3,4,5,6,7]
@@ -1033,7 +1033,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1
 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm2
 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm4
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm3
 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm3[2,3,0,1]
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm5[1],ymm3[2,3],ymm5[4],ymm3[5,6],ymm5[7],ymm3[8],ymm5[9],ymm3[10,11],ymm5[12],ymm3[13,14],ymm5[15]
@@ -1060,7 +1060,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm9[0,1,2],ymm10[3,4,5,6,7],ymm9[8,9,10],ymm10[11,12,13,14,15]
 ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,6,5,4,7]
 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm10[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255]
 ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm4, %ymm2, %ymm10
 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1]
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm10 = ymm10[0,1],ymm12[2],ymm10[3,4],ymm12[5],ymm10[6,7,8,9],ymm12[10],ymm10[11,12],ymm12[13],ymm10[14,15]
@@ -1083,7 +1083,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0,1,2],ymm12[3,4,5,6,7],ymm11[8,9,10],ymm12[11,12,13,14,15]
 ; AVX2-ONLY-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,6,7,4]
 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 =
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u]
 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm2[2,3,0,1]
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1,2],ymm4[3],ymm2[4,5],ymm4[6],ymm2[7],ymm4[8],ymm2[9,10],ymm4[11],ymm2[12,13],ymm4[14],ymm2[15]
@@ -1196,15 +1196,15 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61]
 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62]
 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63]
 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0
@@ -1725,7 +1725,7 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10
 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm9
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm9[0],xmm10[1],xmm9[2,3],xmm10[4],xmm9[5,6],xmm10[7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,6,7,12,13,2,3,8,9,14,15,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm3
 ; AVX1-ONLY-NEXT: vmovdqa 224(%rdi), %xmm4
 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -1820,7 +1820,7 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm14[0,1,2],xmm13[3,4,5,6,7]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm9[0,1],xmm10[2],xmm9[3,4],xmm10[5],xmm9[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,8,9,14,15,4,5,10,11,u,u,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13
 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,0,1,6,7,12,13,0,0,0,1,6,7,12,13]
 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,0]
@@ -1872,7 +1872,7 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [0,0,2,3,8,9,14,15,0,0,2,3,8,9,14,15]
 ; AVX1-ONLY-NEXT: # xmm13 = mem[0,0]
 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,10,11,0,1,6,7,12,13,u,u,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1,2,3,4],xmm3[5,6,7]
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
@@ -1890,7 +1890,7 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7]
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0],xmm5[1],xmm15[2,3],xmm5[4],xmm15[5,6],xmm5[7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 =
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,0,1,6,7,12,13,2,3,8,9,14,15]
 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[2,1,2,3]
 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm6 = xmm6[2,1,2,3,4,5,6,7]
@@ -1966,17 +1966,17 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-ONLY-NEXT: vmovdqa 320(%rdi), %ymm10
 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm12
 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm13
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
 ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm12, %ymm13, %ymm0
 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm3 = ymm0[2,3,0,1]
 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7],ymm0[8],ymm3[9],ymm0[10,11],ymm3[12],ymm0[13,14],ymm3[15]
 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm15 = ymm0[0,1,6,7,12,13,2,3,4,5,14,15,8,9,10,11,16,17,22,23,28,29,18,19,20,21,30,31,24,25,26,27]
 ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm3
 ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm8
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255]
 ; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm13, %ymm12, %ymm6
 ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 =
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u]
 ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm13, %ymm12, %ymm6
 ; AVX2-ONLY-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm10, %ymm9, %ymm13
@@ -2308,21 +2308,21 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm3
 ; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm4
 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,33,36,39,42,45,48,51,54,57,60,63,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7
 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm6, %zmm7
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,34,37,40,43,46,49,52,55,58,61]
 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm7
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,16,19,22,25,28,31,34,37,40,43,46,49,52,55,58,61,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm9
 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm8, %zmm9
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,32,35,38,41,44,47,50,53,56,59,62]
 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm10, %zmm8
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [34,37,40,43,46,49,52,55,58,61,0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm10, %zmm1
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,33,36,39,42,45,48,51,54,57,60,63]
 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm5, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
index 44f2d58cb4e96..241e3d6a1a3e4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-4.ll
@@ -832,7 +832,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm5 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm4, %ymm6
 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm7
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm3
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm6[6,7]
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
@@ -851,7 +851,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3]
 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm4, %ymm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm7, %ymm7
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm11[0,1,2,3],ymm4[4,5,6,7]
@@ -1065,7 +1065,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm2
 ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm3
 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm4
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm6
 ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm1, %ymm1
 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm7
@@ -1076,7 +1076,7 @@ define void @load_i16_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm9[0,1,2,3],ymm7[4,5,6,7]
 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm3, %ymm3
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm1, %ymm1
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7]
 ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm4, %zmm3
@@ -1884,7 +1884,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3
 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm9
 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm2, %ymm11
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm4
 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm12
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
@@ -1920,7 +1920,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm0 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm13
 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm11
 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm10
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm13[6,7]
@@ -1981,7 +1981,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm7[4,5,6,7]
 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm8, %ymm7
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2,3,4,5],ymm4[6,7]
 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,1,4,5,6,7]
@@ -2381,7 +2381,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm6
 ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm5
 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm7
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
 ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm8
 ; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm4, %ymm9
 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm9, %ymm10
@@ -2401,7 +2401,7 @@ define void @load_i16_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm4[0,1,2,3],zmm7[0,1,2,3]
 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm5, %ymm13
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm9, %ymm9
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm13[6,7]
 ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm1, %zmm13
@@ -4096,7 +4096,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3
 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm8
 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm2, %ymm10
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm4
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7]
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5,6,7]
@@ -4171,7 +4171,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm13 = xmm14[0,1],xmm13[2,3]
 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm3, %ymm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2,3,4,5],ymm3[6,7]
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm13[0,1,2,3],ymm3[4,5,6,7]
@@ -4237,7 +4237,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm12 = [16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29,16,17,20,21,24,25,28,29]
 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm11
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm11[0,1,2,3,4,5],ymm1[6,7]
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[3,1,2,3]
@@ -4331,7 +4331,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
@@ -5186,7 +5186,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vmovdqa64 224(%rdi), %ymm24
 ; AVX512F-FAST-NEXT: vpermd %ymm24, %ymm1, %ymm10
 ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm10, %ymm0
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,4,5,u,u,u,u,8,9,12,13,12,13,14,15,16,17,20,21,u,u,u,u,24,25,28,29,u,u,u,u]
 ; AVX512F-FAST-NEXT: vmovdqa64 192(%rdi), %ymm25
 ; AVX512F-FAST-NEXT: vpermd %ymm25, %ymm1, %ymm11
 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm11, %ymm3
@@ -5225,7 +5225,7 @@ define void @load_i16_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm14[0,1,2,3],zmm9[0,1,2,3]
 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} ymm9 = [18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31,18,19,22,23,26,27,30,31]
 ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm10, %ymm14
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,3,6,7,10,11,14,15,14,15,10,11,12,13,14,15,18,19,22,23,26,27,30,31,u,u,u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm11, %ymm11
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3,4,5],ymm14[6,7]
 ; AVX512F-FAST-NEXT: vpsrlq $16, %zmm4, %zmm14
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
index 30f96e0b3e149..904f8ddd2e7f4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll
@@ -1325,7 +1325,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1],xmm9[2,3],xmm6[4,5,6],xmm9[7]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,3,12,13,6,7,0,1,10,11,4,5,14,15,10,11]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,u,u,4,7,1,6>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,u,u,u,4,7,1,6]
 ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm10, %ymm9
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm9, %ymm6
@@ -1341,7 +1341,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm2[1],ymm1[2,3],ymm2[4],ymm1[5],ymm2[6],ymm1[7,8],ymm2[9],ymm1[10,11],ymm2[12],ymm1[13],ymm2[14],ymm1[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,2,u,u,5,7,2,4>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,u,u,5,7,2,4]
 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
@@ -1356,7 +1356,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm2[0],ymm1[1],ymm2[2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,u,u,5,0,2,7]
 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
@@ -1371,7 +1371,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3,4],xmm4[5,6,7]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3],ymm1[4],ymm2[5,6],ymm1[7],ymm2[8,9],ymm1[10],ymm2[11],ymm1[12],ymm2[13,14],ymm1[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,u,u,6,0,3,5]
 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3,4,5,6,7]
@@ -1570,7 +1570,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm0
 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm1
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm0[0],ymm1[1,2],ymm0[3],ymm1[4],ymm0[5],ymm1[6,7],ymm0[8],ymm1[9,10],ymm0[11],ymm1[12],ymm0[13],ymm1[14,15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,u,u,u,4,6,1,3>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,u,u,u,4,6,1,3]
 ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm5, %ymm4
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15]
@@ -1587,7 +1587,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm5[0,1,2,3,4],ymm6[5,6,7],ymm5[8,9,10,11,12],ymm6[13,14,15]
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3],ymm6[4,5,6,7]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,u,u,4,7,1,6>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [2,u,u,u,4,7,1,6]
 ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm3[0],ymm2[1],ymm3[2],ymm2[3],ymm3[4,5],ymm2[6],ymm3[7,8],ymm2[9],ymm3[10],ymm2[11],ymm3[12,13],ymm2[14],ymm3[15]
@@ -1607,7 +1607,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm10[3,4],xmm9[5,6,7]
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,2,u,u,5,7,2,4>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,2,u,u,5,7,2,4]
 ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
@@ -1622,7 +1622,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3]
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,u,u,5,0,2,7]
 ; AVX512F-FAST-NEXT: vpermd %ymm10, %ymm11, %ymm10
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2],ymm10[3,4,5,6,7]
@@ -1637,7 +1637,7 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2],xmm2[3,4],xmm3[5,6,7]
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <1,3,u,u,6,0,3,5>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,3,u,u,6,0,3,5]
 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7]
@@ -1659,24 +1659,24 @@ define void @load_i16_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW: # %bb.0:
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,2,3,4,5,6,7,8,9,10,11,12,17,22,27]
 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm4
 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm3
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28]
 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29]
 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm6
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,20,25,30]
 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm7
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,6,7,8,9,10,11,16,21,26,31]
 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm2, %ymm0
@@ -2357,7 +2357,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm0 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm0 = mem[0,1,2,3],xmm6[4,5],mem[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 =
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13]
 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0
 ; AVX1-ONLY-NEXT: vpshufd {{.*#+}} xmm3 = xmm11[3,1,2,3]
 ; AVX1-ONLY-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[2,1,2,3,4,5,6,7]
@@ -2409,7 +2409,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm1, %ymm1
 ; AVX1-ONLY-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm0[2,3],xmm5[4,5],xmm0[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 =
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15]
 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm0
 ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm2, %xmm3
 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0],xmm0[1,2,3,4,5,6,7]
@@ -2623,7 +2623,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm4[0,1],ymm2[2],ymm4[3],ymm2[4],ymm4[5,6],ymm2[7],ymm4[8,9],ymm2[10],ymm4[11],ymm2[12],ymm4[13,14],ymm2[15]
 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[3,4],xmm1[5,6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
 ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
@@ -2659,9 +2659,9 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7
 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm8
 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm8[0],xmm1[1],xmm8[2],xmm1[3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
 ; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7]
 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm13[0,1],xmm14[2],xmm13[3]
@@ -2788,7 +2788,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm10, %ymm10
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm12, %ymm10, %ymm11
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm10 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13],ymm6[14],ymm7[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,u,u,4,7,1,6>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,u,u,u,4,7,1,6]
 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm10
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17]
 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm10, %ymm10
@@ -2843,11 +2843,11 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,2,u,u,5,7,2,4>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,2,u,u,5,7,2,4]
 ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm8, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm8[3,4,5,6,7]
 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [1,4,6,0,1,4,6,0]
@@ -2878,11 +2878,11 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm5[1],ymm4[2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10],ymm5[11],ymm4[12,13],ymm5[14],ymm4[15]
 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm7
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,u,u,5,0,2,7>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,u,u,5,0,2,7]
 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm1
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm2[3,4,5,6,7]
 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [2,4,7,0,2,4,7,0]
@@ -2914,7 +2914,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15]
 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3,4],xmm2[5,6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,3,u,u,6,0,3,5>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,3,u,u,6,0,3,5]
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7]
 ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
@@ -3059,9 +3059,9 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm0[2],ymm5[3],ymm0[4],ymm5[5,6],ymm0[7],ymm5[8,9],ymm0[10],ymm5[11],ymm0[12],ymm5[13,14],ymm0[15]
 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm11[3,4],xmm2[5,6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2
 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0],xmm9[1],xmm8[2,3]
@@ -3094,9 +3094,9 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm2 = ymm0[0],ymm5[1,2],ymm0[3],ymm5[4],ymm0[5],ymm5[6,7],ymm0[8],ymm5[9,10],ymm0[11],ymm5[12],ymm0[13],ymm5[14,15]
 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm11
 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm11[0],xmm2[1],xmm11[2],xmm2[3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm2, %xmm2
 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm8[0,1],xmm9[2],xmm8[3]
@@ -3194,7 +3194,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm3[1],ymm9[2,3],ymm3[4],ymm9[5],ymm3[6],ymm9[7,8],ymm3[9],ymm9[10,11],ymm3[12],ymm9[13],ymm3[14],ymm9[15]
 ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm6
 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1,2],ymm6[3],ymm5[4,5,6,7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 =
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23]
 ; AVX512F-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm5
 ; AVX512F-SLOW-NEXT: vmovdqa 176(%rdi), %xmm1
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[3,1,2,3]
@@ -3384,10 +3384,10 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vmovdqa 192(%rdi), %ymm4
 ; AVX512F-FAST-NEXT: vmovdqa 224(%rdi), %ymm5
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13],ymm4[14],ymm5[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,4,7,1,4,6,u,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,4,7,1,4,6,u,u]
 ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm6, %ymm1
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,u,u,u,u,u,u,u,u,u,u,u,u]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <8,9,3,2,4,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [8,9,3,2,4,u,u,u]
 ; AVX512F-FAST-NEXT: vpermi2d %ymm0, %ymm1, %ymm6
 ; AVX512F-FAST-NEXT: vmovdqa 256(%rdi), %ymm0
 ; AVX512F-FAST-NEXT: vmovdqa 288(%rdi), %ymm1
@@ -3402,7 +3402,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vmovdqa 64(%rdi), %ymm8
 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm9
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm9[1,2],ymm8[3],ymm9[4],ymm8[5],ymm9[6,7],ymm8[8],ymm9[9,10],ymm8[11],ymm9[12],ymm8[13],ymm9[14,15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,u,u,u,4,6,1,3>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,u,u,u,4,6,1,3]
 ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm12, %ymm7
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm7[2,3,16,17,22,23,24,25,30,31,20,21],zero,zero,zero,zero,zero,zero
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm10[0],ymm11[1],ymm10[2,3],ymm11[4],ymm10[5],ymm11[6],ymm10[7,8],ymm11[9],ymm10[10,11],ymm11[12],ymm10[13],ymm11[14],ymm10[15]
@@ -3410,7 +3410,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0],xmm13[1,2,3],xmm12[4,5],xmm13[6,7]
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,10,11,4,5,14,15,8,9,2,3,12,13],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpor %ymm7, %ymm12, %ymm12
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,3,1,u,0,3,5,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,3,1,u,0,3,5,u]
 ; AVX512F-FAST-NEXT: vmovdqa 128(%rdi), %ymm7
 ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm13
 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [4,5,2,3,4,5,6,7,8,9,2,3,4,5,10,11,20,21,18,19,20,21,22,23,24,25,18,19,20,21,26,27]
@@ -3421,14 +3421,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm10[1],ymm11[2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15]
 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2,3],xmm12[4,5,6],xmm13[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm12
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13],ymm8[14],ymm9[15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,u,u,u,4,7,1,6>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,u,u,u,4,7,1,6]
 ; AVX512F-FAST-NEXT: vpermd %ymm13, %ymm17, %ymm13
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm13[0,1,18,19,20,21,26,27,16,17,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512F-FAST-NEXT: vpor %ymm13, %ymm12, %ymm12
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,2,u,1,3,6,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,2,u,1,3,6,u]
 ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm15
 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,6,7,4,5,6,7,8,9,0,1,6,7,8,9,16,17,22,23,20,21,22,23,24,25,16,17,22,23,24,25]
 ; AVX512F-FAST-NEXT: vpshufb %ymm13, %ymm15, %ymm15
@@ -3437,7 +3437,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm2, %xmm12
 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm4[0],ymm5[1],ymm4[2,3],ymm5[4],ymm4[5],ymm5[6],ymm4[7,8],ymm5[9],ymm4[10,11],ymm5[12],ymm4[13],ymm5[14],ymm4[15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <0,2,5,7,4,7,u,u>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [0,2,5,7,4,7,u,u]
 ; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm17, %ymm12
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[2,3,4,5,4,5,0,1,6,7,8,9,14,15,4,5,18,19,20,21,20,21,16,17,22,23,24,25,30,31,20,21]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0,1,2],xmm12[3,4,5,6,7]
@@ -3457,7 +3457,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm3[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm15[2],xmm2[2],xmm15[3],xmm2[3]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm5[0],ymm4[1],ymm5[2],ymm4[3],ymm5[4,5],ymm4[6],ymm5[7,8],ymm4[9],ymm5[10],ymm4[11],ymm5[12,13],ymm4[14],ymm5[15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,3,5,2,5,7,u,u>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,3,5,2,5,7,u,u]
 ; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm19, %ymm12
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,6,7,2,3,2,3,4,5,10,11,0,1,14,15,16,17,22,23,18,19,18,19,20,21,26,27,16,17,30,31]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm15[0,1,2],xmm12[3,4,5,6,7]
@@ -3468,7 +3468,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4],xmm12[5,6,7]
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm8[0],ymm9[1],ymm8[2,3],ymm9[4],ymm8[5],ymm9[6],ymm8[7,8],ymm9[9],ymm8[10,11],ymm9[12],ymm8[13],ymm9[14],ymm8[15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = <0,2,u,u,5,7,2,4>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm19 = [0,2,u,u,5,7,2,4]
 ; AVX512F-FAST-NEXT: vpermd %ymm15, %ymm19, %ymm15
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23]
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm12[0,1,2],ymm15[3,4,5,6,7]
@@ -3483,14 +3483,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm12[0],xmm6[1],xmm12[2],xmm6[3]
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm9[0],ymm8[1],ymm9[2],ymm8[3],ymm9[4,5],ymm8[6],ymm9[7,8],ymm8[9],ymm9[10],ymm8[11],ymm9[12,13],ymm8[14],ymm9[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,3,u,u,5,0,2,7>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,3,u,u,5,0,2,7]
 ; AVX512F-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm12
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21]
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm12[3,4,5,6,7]
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm12 = xmm2[0],xmm3[1],xmm2[2,3]
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm12 = xmm12[6,7,0,1,10,11,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm5[0,1],ymm4[2],ymm5[3],ymm4[4],ymm5[5,6],ymm4[7],ymm5[8,9],ymm4[10],ymm5[11],ymm4[12],ymm5[13,14],ymm4[15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <1,3,6,0,5,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [1,3,6,0,5,u,u,u]
 ; AVX512F-FAST-NEXT: vpermd %ymm15, %ymm18, %ymm15
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[2,3,2,3,4,5,0,1,6,7,8,9,14,15,4,5,18,19,18,19,20,21,16,17,22,23,24,25,30,31,20,21]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1,2],xmm15[3,4,5,6,7]
@@ -3515,14 +3515,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3,4],xmm11[5,6,7]
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[8,9,2,3,12,13,6,7,0,1,10,11,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,u,u,6,0,3,5>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,u,u,6,0,3,5]
 ; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm9, %ymm8
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23]
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm10[0,1,2],ymm8[3,4,5,6,7]
 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3]
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[8,9,2,3,12,13,u,u,u,u,u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm5[1,2],ymm4[3],ymm5[4],ymm4[5],ymm5[6,7],ymm4[8],ymm5[9,10],ymm4[11],ymm5[12],ymm4[13],ymm5[14,15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,4,6,3,6,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,4,6,3,6,u,u,u]
 ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3,4,5,6,7]
@@ -3561,7 +3561,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11,16,21,26,31,36,41,46,51,56,61,0,0,0,1,6,11]
 ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm6
 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000
 ; AVX512BW-NEXT: kmovd %eax, %k1
@@ -3571,7 +3571,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44,49,54,59,0,5,10,15,20,25,30,0,0,0,34,39,44]
 ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm6
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm7
 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,35,40,45,50,55,60]
@@ -3579,7 +3579,7 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45,50,55,60,1,6,11,16,21,26,31,0,0,0,35,40,45]
 ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm7
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm8
 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,36,41,46,51,56,61]
@@ -3587,14 +3587,14 @@ define void @load_i16_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14,19,24,29,34,39,44,49,54,59,0,0,0,0,4,9,14]
 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm3, %zmm8
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm9
 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000
 ; AVX512BW-NEXT: kmovd %eax, %k1
 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,32,37,42,47,52,57,62]
 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm8
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9
 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm1 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15]
 ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3]
@@ -4933,7 +4933,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vpblendw $207, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm1 = mem[0,1,2,3],xmm0[4,5],mem[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 =
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,8,9,2,3,12,13,12,13,12,13,12,13]
 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1
 ; AVX1-ONLY-NEXT: vpshufd $231, {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm2 = mem[3,1,2,3]
@@ -5071,7 +5071,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vpblendw $204, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm3 # 16-byte Folded Reload
 ; AVX1-ONLY-NEXT: # xmm3 = xmm0[0,1],mem[2,3],xmm0[4,5],mem[6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 =
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,0,1,10,11,4,5,14,15,14,15,14,15,14,15]
 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm3
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p),
%xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpsrlq $48, %xmm0, %xmm5 @@ -5514,9 +5514,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm1 = ymm15[0,1],mem[2],ymm15[3],mem[4],ymm15[5,6],mem[7],ymm15[8,9],mem[10],ymm15[11],mem[12],ymm15[13,14],mem[15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm6 = xmm9[0],xmm14[1],xmm9[2,3] @@ -5595,9 +5595,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm15[1,2],mem[3],ymm15[4],mem[5],ymm15[6,7],mem[8],ymm15[9,10],mem[11],ymm15[12],mem[13],ymm15[14,15] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2],ymm3[3,4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload @@ -5874,7 +5874,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw $173, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3],ymm1[4],mem[5],ymm1[6],mem[7,8],ymm1[9],mem[10,11],ymm1[12],mem[13],ymm1[14],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,u,u,u,4,7,1,6> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,u,u,u,4,7,1,6] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,4,5,10,11,0,1,14,15,2,3,12,13,0,1,18,19,20,21,26,27,16,17,30,31,18,19,28,29,16,17] ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm3 @@ -5975,14 +5975,14 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0,1],ymm4[2],ymm9[3],ymm4[4],ymm9[5,6],ymm4[7],ymm9[8,9],ymm4[10],ymm9[11],ymm4[12],ymm9[13,14],ymm4[15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2],xmm1[3,4],xmm0[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm6 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw $82, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = 
ymm1[0],mem[1],ymm1[2,3],mem[4],ymm1[5],mem[6],ymm1[7,8],mem[9],ymm1[10,11],mem[12],ymm1[13],mem[14],ymm1[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,u,u,5,7,2,4> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,u,u,5,7,2,4] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,6,7,16,17,22,23,24,25,30,31,20,21,22,23,16,17,22,23] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1,2],ymm7[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [1,4,6,0,1,4,6,0] @@ -6048,14 +6048,14 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0],ymm9[1,2],ymm4[3],ymm9[4],ymm4[5],ymm9[6,7],ymm4[8],ymm9[9,10],ymm4[11],ymm9[12],ymm4[13],ymm9[14,15] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendw $74, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm3 = ymm11[0],mem[1],ymm11[2],mem[3],ymm11[4,5],mem[6],ymm11[7,8],mem[9],ymm11[10],mem[11],ymm11[12,13],mem[14],ymm11[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,u,u,5,0,2,7> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,u,u,5,0,2,7] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,4,5,18,19,20,21,26,27,16,17,30,31,30,31,18,19,20,21] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm6[3,4,5,6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,7,0,2,4,7,0] @@ -6121,7 +6121,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm2 = ymm11[0,1],mem[2],ymm11[3],mem[4],ymm11[5,6],mem[7],ymm11[8,9],mem[10],ymm11[11],mem[12],ymm11[13,14],mem[15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,2,3,12,13,6,7,0,1,10,11,0,1,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <1,3,u,u,6,0,3,5> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [1,3,u,u,6,0,3,5] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7,16,17,22,23,24,25,30,31,0,0,0,0,0,1,6,7] ; AVX2-FAST-NEXT: # ymm7 = mem[0,1,0,1] @@ -6432,10 +6432,10 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm1 = ymm5[0,1],mem[2],ymm5[3],mem[4],ymm5[5,6],mem[7],ymm5[8,9],mem[10],ymm5[11],mem[12],ymm5[13,14],mem[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[3,4],xmm1[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} xmm0 = [4,5,14,15,8,9,2,3,12,13,6,7,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0],xmm4[1],xmm14[2,3] @@ -6473,7 +6473,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm11[2],ymm10[3],ymm11[4],ymm10[5,6],ymm11[7],ymm10[8,9],ymm11[10],ymm10[11],ymm11[12],ymm10[13,14],ymm11[15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2],xmm12[3,4],xmm9[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,10,11,20,21,30,31,24,25,18,19,28,29,26,27,16,17,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3,4,5,6,7] @@ -6513,9 +6513,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm5[1,2],ymm8[3],ymm5[4],ymm8[5],ymm5[6,7],ymm8[8],ymm5[9,10],ymm8[11],ymm5[12],ymm8[13],ymm5[14,15] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2],xmm1[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,2,3,12,13,22,23,16,17,26,27,20,21,30,31,30,31,18,19,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,0,1,10,11,4,5,14,15,8,9,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6722,7 +6722,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0],xmm3[1,2,3],xmm1[4,5],xmm3[6,7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm3, %ymm3 ; AVX512F-SLOW-NEXT: vporq %ymm2, %ymm3, %ymm19 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm15 @@ -6752,7 +6752,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm8[0],ymm7[1],ymm8[2],ymm7[3],ymm8[4,5],ymm7[6],ymm8[7,8],ymm7[9],ymm8[10],ymm7[11],ymm8[12,13],ymm7[14],ymm8[15] ; AVX512F-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm1[0,1],xmm3[2,3],xmm1[4,5,6],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: 
vpshufb %ymm1, %ymm3, %ymm3 ; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -6854,7 +6854,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, %xmm7 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[0,3,2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm14[0],xmm0[0],xmm14[1],xmm0[1] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,2,3,12,13,6,7,16,17,26,27,26,27,30,31,24,25,18,19,28,29,22,23] ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3,4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [12,13,14,15,4,5,14,15,8,9,2,3,12,13,6,7] @@ -7203,7 +7203,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm30 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0],xmm4[1,2,3],xmm1[4,5],xmm4[6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,10,11,4,5,14,15,8,9,2,3,12,13,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vmovdqa 384(%rdi), %ymm6 ; AVX512F-FAST-NEXT: vmovdqa 416(%rdi), %ymm9 @@ -7220,9 +7220,9 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm5, %xmm7 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1,2],xmm7[3,4],xmm5[5,6,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0],ymm10[1],ymm11[2,3],ymm10[4],ymm11[5],ymm10[6],ymm11[7,8],ymm10[9],ymm11[10,11],ymm10[12],ymm11[13],ymm10[14],ymm11[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,4,7,1,4,6,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,4,7,1,4,6,u,u] ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm10, %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,8,9,14,15,0,1,6,7,16,17,22,23,20,21,22,23,24,25,30,31,16,17,22,23] ; AVX512F-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm7 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [8,9,3,2,4,5,7,6] ; AVX512F-FAST-NEXT: vpermt2d %ymm2, %ymm17, %ymm7 @@ -7290,20 +7290,20 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5,6],xmm1[7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,u,u,u,4,7,1,6> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,u,u,u,4,7,1,6] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0],ymm11[1],ymm7[2,3],ymm11[4],ymm7[5],ymm11[6],ymm7[7,8],ymm11[9],ymm7[10,11],ymm11[12],ymm7[13],ymm11[14],ymm7[15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm19 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm22 ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm17, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,0,1,18,19,20,21,26,27,16,17,30,31,128,128,128,128,128,128] ; AVX512F-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
<2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,12,13,6,7,0,1,10,11,4,5,14,15,128,128,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpor %ymm1, %ymm0, %ymm10 ; AVX512F-FAST-NEXT: vpsrlq $48, %xmm31, %xmm0 ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm1 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = <0,2,5,7,4,7,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm20 = [0,2,5,7,4,7,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm13[1],ymm8[2,3],ymm13[4],ymm8[5],ymm13[6],ymm8[7,8],ymm13[9],ymm8[10,11],ymm13[12],ymm8[13],ymm13[14],ymm8[15] ; AVX512F-FAST-NEXT: vmovdqa %ymm8, %ymm5 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm20, %ymm4 @@ -7374,7 +7374,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm15[u,u,u,u,u,u,u,u,4,5,14,15,u,u,u,u] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm15, %xmm20 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm31[2],xmm0[3],xmm31[3] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = <0,3,5,2,5,7,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm24 = [0,3,5,2,5,7,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0],ymm5[1],ymm13[2],ymm5[3],ymm13[4,5],ymm5[6],ymm13[7,8],ymm5[9],ymm13[10],ymm5[11],ymm13[12,13],ymm5[14],ymm13[15] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm30 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm13, %ymm26 @@ -7392,7 +7392,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3],ymm0[4],ymm1[5,6],ymm0[7],ymm1[8,9],ymm0[10],ymm1[11],ymm0[12],ymm1[13,14],ymm0[15] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm1[0,1,2],xmm3[3,4],xmm1[5,6,7] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = <0,2,u,u,5,7,2,4> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm25 = [0,2,u,u,5,7,2,4] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm19, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm22, %ymm1 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5],ymm1[6],ymm0[7,8],ymm1[9],ymm0[10,11],ymm1[12],ymm0[13],ymm1[14],ymm0[15] @@ -7451,7 +7451,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = <1,3,6,0,5,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [1,3,6,0,5,u,u,u] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm9 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm26, %ymm10 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm10[0,1],ymm9[2],ymm10[3],ymm9[4],ymm10[5,6],ymm9[7],ymm10[8,9],ymm9[10],ymm10[11],ymm9[12],ymm10[13,14],ymm9[15] @@ -7484,7 +7484,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = 
<1,4,6,3,6,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,4,6,3,6,u,u,u] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,2,3,4,5,2,3,4,5,10,11,0,1,14,15,16,17,18,19,20,21,18,19,20,21,26,27,16,17,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1 @@ -7502,7 +7502,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm4[0],xmm3[1],xmm4[2],xmm3[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,3,u,u,5,0,2,7> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,3,u,u,5,0,2,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm21, %ymm15 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm29, %ymm10 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm10[0],ymm15[1],ymm10[2],ymm15[3],ymm10[4,5],ymm15[6],ymm10[7,8],ymm15[9],ymm10[10],ymm15[11],ymm10[12,13],ymm15[14],ymm10[15] @@ -7566,7 +7566,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm24, %zmm2 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm10[0,1],ymm15[2],ymm10[3],ymm15[4],ymm10[5,6],ymm15[7],ymm10[8,9],ymm15[10],ymm10[11],ymm15[12],ymm10[13,14],ymm15[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,3,u,u,6,0,3,5> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,3,u,u,6,0,3,5] ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,6,7,8,9,14,15,0,1,6,7,0,1,6,7,16,17,22,23,24,25,30,31,16,17,22,23,16,17,22,23] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 @@ -7647,7 +7647,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,35,40,45,50,55,60,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm6, %zmm8 ; AVX512BW-NEXT: movl $67100672, %eax # imm = 0x3FFE000 @@ -7663,7 +7663,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [1,6,11,16,21,26,31,36,41,46,51,56,61,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu16 %zmm15, %zmm13 {%k1} @@ -7677,7 +7677,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [34,39,44,49,54,59,0,5,10,15,20,25,30,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqu16 %zmm17, %zmm15 {%k1} @@ -7691,7 +7691,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = 
mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,40,45,50,55,60,1,6,11,16,21,26,31,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm18, %zmm19 ; AVX512BW-NEXT: movl $33546240, %eax # imm = 0x1FFE000 @@ -7703,7 +7703,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm4, %zmm18 ; AVX512BW-NEXT: vmovdqu16 %zmm16, %zmm18 {%k1} ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm17, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [4,9,14,19,24,29,34,39,44,49,54,59,u,u,u,u] ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm16, %zmm9 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15,20,25,30,35,40,45,50,55,60,0,0,0,0,5,10,15] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index 77207838c2d04..fffe46d3cbed1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -2126,41 +2126,41 @@ define void @load_i16_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm4 ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm5 ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm1 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2],ymm1[3,4,5,6,7],ymm6[8,9,10],ymm1[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1,2,3],ymm1[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,0,0,20,26,0,6,12,0,0,0,20,26,0,6,12] ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm7 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1,2],ymm6[3,4,5,6,7],ymm7[8,9,10],ymm6[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,0,0,21,27,1,7,13,0,0,0,21,27,1,7,13] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %ymm5, %ymm4, %ymm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm8 ; 
AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0,1,2],ymm7[3,4,5,6,7],ymm8[8,9,10],ymm7[11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,0,0,6,12,18,24,30,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4],ymm8[5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %ymm4, %ymm5, %ymm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4],ymm9[5,6,7] ; AVX512BW-NEXT: vmovdqa %ymm0, (%rsi) @@ -3258,7 +3258,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm6 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm3[2,2,2,2,4,5,6,7] @@ -3280,7 +3280,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm11, %ymm8 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm9, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] @@ -3295,7 +3295,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm7[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm6[1],xmm3[2,3],xmm6[4],xmm3[5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm5 @@ -3303,7 +3303,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm2, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3],xmm0[4,5],xmm3[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; 
AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm14[0],ymm13[1],ymm14[2,3,4,5],ymm13[6],ymm14[7] ; AVX2-SLOW-NEXT: vpshufb %ymm12, %ymm7, %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm0[0,1,2],ymm3[3,4,5,6,7] @@ -3314,7 +3314,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm0[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm3[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm11 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1,2],xmm9[3],xmm11[4,5],xmm9[6],xmm11[7] @@ -3351,13 +3351,13 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3],xmm5[4,5],xmm2[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm7[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm4[0,1,2],xmm0[3],xmm4[4,5],xmm0[6],xmm4[7] @@ -3400,7 +3400,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm10[2,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX2-SLOW-NEXT: vpshufb %ymm11, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] @@ -3444,7 +3444,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: 
vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm8[0,1,2],ymm6[3,4,5,6,7],ymm8[8,9,10],ymm6[11,12,13,14,15] @@ -3548,7 +3548,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm2, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] @@ -3572,7 +3572,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] @@ -3584,7 +3584,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm5, %xmm0 @@ -3593,7 +3593,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm11, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7] ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm7, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -3604,7 +3604,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm10 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm10, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = 
xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] @@ -3637,15 +3637,15 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm11, %xmm11 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm10, %xmm10 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7] @@ -3674,7 +3674,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm1 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] @@ -3689,7 +3689,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm6, %ymm14 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4] @@ -3722,7 +3722,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = 
xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] @@ -3730,7 +3730,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm11, %xmm10 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] @@ -3754,7 +3754,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm1, %xmm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -3773,7 +3773,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -3830,7 +3830,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm0[1],ymm2[2,3,4,5],ymm0[6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm2, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm10 = xmm3[2,2,2,2,4,5,6,7] @@ -3854,7 +3854,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm7 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3],xmm3[4],xmm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = 
[10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] @@ -3866,7 +3866,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm11[0],ymm9[1],ymm11[2,3],ymm9[4],ymm11[5,6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm6[0],xmm3[1],xmm6[2,3],xmm3[4],xmm6[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm3, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm5, %xmm0 @@ -3875,7 +3875,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm11, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm7 = ymm4[0],ymm13[1],ymm4[2,3,4,5],ymm13[6],ymm4[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] @@ -3886,7 +3886,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm1, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm8[0,1,2],xmm6[3],xmm8[4,5],xmm6[6],xmm8[7] @@ -3919,15 +3919,15 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm12[0,1,2],ymm4[3,4,5,6,7],ymm12[8,9,10],ymm4[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3],ymm4[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm14 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm11, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm11[2],xmm5[3],xmm11[4,5],xmm5[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm7, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm7[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} xmm7 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm10, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm10[0,1,2],xmm1[3],xmm10[4,5],xmm1[6],xmm10[7] @@ -3956,7 +3956,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm5 = xmm0[2,1,0,3] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[0,1,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm4[0,1,2,3,6,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5,6],xmm2[7] @@ -3971,7 +3971,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm3[2,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm13[0],xmm7[1,2],xmm13[3],xmm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm6, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm14[0,1,2],ymm1[3,4,5,6,7],ymm14[8,9,10],ymm1[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,4,6,5,4] @@ -4004,7 +4004,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,6,5,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4],xmm10[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,5,6,5] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4],xmm5[5,6],xmm4[7] @@ -4012,7 +4012,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm11, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0],xmm10[1,2],xmm3[3],xmm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm6[0,1,2],ymm4[3,4,5,6,7],ymm6[8,9,10],ymm4[11,12,13,14,15] @@ -4036,7 +4036,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, 
ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm8[0,1,2,3],xmm5[4],xmm8[5],xmm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 @@ -4055,7 +4055,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,2,3,4],ymm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm6 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4],xmm1[5],xmm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -4092,7 +4092,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: subq $136, %rsp -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm12[1],ymm1[2,3],ymm12[4],ymm1[5,6],ymm12[7] @@ -4111,7 +4111,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm7, %ymm23 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm25 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm8, %xmm5, %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm9 = xmm1[2,2,2,2,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,1,2,2] @@ -4158,10 +4158,10 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm9, %ymm29 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm9, %xmm0, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3],xmm0[4,5],xmm2[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm5[0,1,2],xmm1[3],xmm5[4,5],xmm1[6],xmm5[7] @@ -4276,7 +4276,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %zmm2, %zmm0, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %zmm5, %zmm17, %zmm20 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm30, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -4325,7 +4325,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} ymm15 = ymm10[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2,3,4],ymm14[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm0, %zmm14 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] @@ -4377,7 +4377,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-LABEL: load_i16_stride6_vf32: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $136, %rsp -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 224(%rdi), %ymm15 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm15[1],ymm1[2,3],ymm15[4],ymm1[5,6],ymm15[7] @@ -4396,7 +4396,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm7 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm9 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1,2],xmm8[3],xmm9[4,5],xmm8[6],xmm9[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 @@ -4440,12 +4440,12 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm10[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm29 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = 
xmm3[0,1],xmm11[2],xmm3[3],xmm11[4,5],xmm3[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm4, %xmm4 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[3],xmm4[4,5],xmm1[6],xmm4[7] @@ -4482,7 +4482,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm13, %ymm21 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm2[0,1,2,1] ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm7[0,1,2,3,6,5,6,4] @@ -4530,7 +4530,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1,2],xmm10[3],xmm9[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm10, %xmm8, %xmm8 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,7,5,6,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1,2,3],xmm7[4],xmm8[5,6],xmm7[7] @@ -4555,7 +4555,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm4 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm20 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm4, %zmm17, %zmm20 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm30, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -4570,7 +4570,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,3,2,1] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm3 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm2, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm7[0,1,2,3],xmm3[4],xmm7[5],xmm3[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 @@ -4605,7 +4605,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm0, %zmm13 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -4615,7 +4615,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm5[1],xmm1[2,3],xmm5[4],xmm1[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm5, %xmm6, %xmm6 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4],xmm2[5],xmm6[6,7] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -4653,7 +4653,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-LABEL: load_i16_stride6_vf32: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: pushq %rax -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%rdi), %ymm13 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm13[0],ymm0[1],ymm13[2,3],ymm0[4],ymm13[5,6],ymm0[7] @@ -4672,7 +4672,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm6, %ymm22 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm15 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm7, %xmm15, %xmm4 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm6 = xmm5[2,2,2,2,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,1,2,2] @@ -4722,10 +4722,10 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm14, %xmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm9[2],xmm8[3],xmm9[4,5],xmm8[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm9, %xmm15, %xmm10 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm5 = xmm10[0,1,2],xmm5[3],xmm10[4,5],xmm5[6],xmm10[7] @@ -4837,7 +4837,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %zmm7, %zmm0, %zmm3 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %zmm3, %zmm17, %zmm16 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm20, %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -4886,7 +4886,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} ymm14 = ymm11[8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1,2,3,4],ymm9[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm9 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,5,5,5,5] @@ -4930,7 +4930,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; ; AVX512DQ-FAST-LABEL: load_i16_stride6_vf32: ; AVX512DQ-FAST: # %bb.0: -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa 224(%rdi), %ymm12 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%rdi), %ymm2 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm2[0],ymm12[1],ymm2[2,3],ymm12[4],ymm2[5,6],ymm12[7] @@ -4950,7 +4950,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm4 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm15, %xmm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2],xmm4[3],xmm5[4,5],xmm4[6],xmm5[7] ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdi), %ymm9 @@ -4998,12 +4998,12 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm11 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm7, %xmm7 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm14, %xmm14 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm14[2],xmm7[3],xmm14[4,5],xmm7[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm15, %xmm15 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,5,5,5,5] ; 
AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm15[0,1,2],xmm6[3],xmm15[4,5],xmm6[6],xmm15[7] @@ -5039,7 +5039,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm14, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,1,2,1] ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm6[0,1,2,3,6,5,6,4] @@ -5087,7 +5087,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm9 = xmm15[3,1,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0],xmm8[1,2],xmm9[3],xmm8[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm14, %xmm10 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,7,5,6,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm10[0,1,2,3],xmm6[4],xmm10[5,6],xmm6[7] @@ -5110,7 +5110,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm6, %zmm0, %zmm2 ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm16 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm2, %zmm17, %zmm16 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm29, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -5125,7 +5125,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,3,2,1] ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm8 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm5, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm7 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm7[0,1,2,3],xmm4[4],xmm7[5],xmm4[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 @@ -5161,7 +5161,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm8, %zmm0, %zmm8 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm15 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm10, %xmm10 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm10[1],xmm7[2,3],xmm10[4],xmm7[5,6,7] ; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} ymm4 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[6,7,2,3,14,15,26,27,22,23],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -5171,7 +5171,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm3[1],xmm0[2,3],xmm3[4],xmm0[5,6,7] ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm5, %xmm5 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm5[4],xmm1[5],xmm5[6,7] ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -5212,7 +5212,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58,0,6,12,18,24,30,0,0,0,0,0,34,40,46,52,58] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm6 ; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -5226,7 +5226,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59,1,7,13,19,25,31,0,0,0,0,0,35,41,47,53,59] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k2} @@ -5236,7 +5236,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28,34,40,46,52,58,0,0,0,0,0,0,4,10,16,22,28] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm10 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 ; AVX512BW-NEXT: kmovd %edi, %k2 @@ -5250,11 +5250,11 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29,35,41,47,53,59,0,0,0,0,0,0,5,11,17,23,29] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = 
[4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -5266,7 +5266,7 @@ define void @load_i16_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] @@ -7501,7 +7501,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm5[2,2,2,2,4,5,6,7] @@ -7553,7 +7553,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm11, %ymm2 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm9, %xmm9 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm13 = xmm13[1,1,2,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,5,5,5,5] @@ -7589,10 +7589,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm14[0,2,0,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,6,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2],xmm1[3],xmm0[4,5],xmm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] @@ -7607,7 +7607,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm11[2,2,2,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm0[0,1,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm11, %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm3, %xmm3 @@ -7705,14 +7705,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm14, %xmm15 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm13, %xmm13 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm15 = xmm13[0,1],xmm15[2],xmm13[3],xmm15[4,5],xmm13[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm13, %ymm0, %ymm14 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,5,5,5,5] @@ -7790,7 +7790,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] @@ -7902,7 +7902,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm5 = xmm5[0,1,3,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0],xmm5[1,2],xmm7[3],xmm5[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -8112,7 +8112,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FAST-NEXT: vpblendd 
{{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] @@ -8166,7 +8166,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm11, %xmm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] @@ -8196,13 +8196,13 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm0 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm2, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] @@ -8217,7 +8217,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm13, %xmm3, %xmm3 @@ -8311,16 +8311,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm7 ; 
AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm14, %xmm15 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] @@ -8383,7 +8383,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] ; AVX2-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -8398,7 +8398,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] @@ -8494,7 +8494,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -8504,7 +8504,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = 
[6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -8562,7 +8562,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm8, %xmm4 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -8603,7 +8603,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 @@ -8702,7 +8702,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [8,9,4,5,4,5,6,7,0,1,4,5,0,1,12,13,24,25,20,21,20,21,22,23,16,17,20,21,16,17,28,29] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1],ymm9[2],ymm8[3,4],ymm9[5],ymm8[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm4 = xmm5[2,2,2,2,4,5,6,7] @@ -8756,7 +8756,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm11, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm7, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm7[0],xmm14[1],xmm7[2,3],xmm14[4],xmm7[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,6,7,4,5,6,7,6,7,6,7,2,3,14,15,26,27,22,23,20,21,22,23,22,23,22,23,18,19,30,31] @@ -8786,13 +8786,13 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm10 = mem[0],ymm0[1],mem[2,3],ymm0[4],mem[5,6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,0,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,12,13,0,1,0,0,8,9,12,13,0,1,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm14, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2],xmm0[3],xmm1[4,5],xmm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,u,u,u,u,8,9,4,5,16,17,28,29,24,25,28,29,24,25,28,29,24,25,20,21] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $189, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0],ymm1[1],mem[2,3,4,5],ymm1[6],mem[7] @@ -8807,7 +8807,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm9 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm0 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm9, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm3, %xmm3 @@ -8901,16 +8901,16 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm11[0,1,2],ymm7[3,4,5,6,7],ymm11[8,9,10],ymm7[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm7[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm10, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm10 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm14, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm15 = xmm7[0,1],xmm15[2],xmm7[3],xmm15[4,5],xmm7[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,u,u,u,u,10,11,6,7,18,19,30,31,26,27,26,27,30,31,26,27,26,27,22,23] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm14 = ymm15[0,1,2],ymm14[3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm9 = xmm9[0,1,2,3,5,5,5,5] @@ -8973,7 +8973,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,2,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm1, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,6,5,6,4] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4],xmm0[5,6],xmm1[7] @@ -8988,7 +8988,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm2 = xmm7[2,1,2,0,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3],xmm1[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3,4,5,6,7],ymm2[8,9,10],ymm0[11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,6,5,4] @@ -9084,7 +9084,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4],xmm3[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufhw $103, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -9094,7 +9094,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm9, %xmm9 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[3,1,2,1,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm7[0],xmm9[1,2],xmm7[3],xmm9[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -9152,7 +9152,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[0,3,2,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm7, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm8, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm4[0,1,2,3],xmm1[4],xmm4[5],xmm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -9193,7 +9193,7 @@ 
define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm12, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm12[0,1,2,3],xmm6[4],xmm12[5],xmm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 @@ -9263,7 +9263,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-LABEL: load_i16_stride6_vf64: ; AVX512F-ONLY-SLOW: # %bb.0: ; AVX512F-ONLY-SLOW-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 @@ -9282,7 +9282,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm14, %xmm2 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %xmm2, %xmm21 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] @@ -9331,7 +9331,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm1[0],ymm2[1],ymm1[2,3,4,5],ymm2[6],ymm1[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm3, %ymm2, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm3, %ymm17 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 @@ -9406,10 +9406,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm12, %xmm15, %xmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm15, %xmm13, %xmm13 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; 
AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm6, %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] @@ -9426,7 +9426,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm13, %xmm5, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm9[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[3],xmm0[4,5],xmm2[6],xmm0[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3,4,5,6,7],ymm1[8,9,10],ymm0[11,12,13,14,15] @@ -9524,7 +9524,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm31, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm4, %ymm13 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm13 = ymm4[0,1],mem[2],ymm4[3],mem[4],ymm4[5,6],mem[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm5, %ymm24 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] @@ -9627,7 +9627,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm13 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] @@ -9676,7 +9676,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX512F-ONLY-SLOW-NEXT: # ymm2 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm0 ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512F-ONLY-SLOW-NEXT: vpshuflw {{.*#+}} 
xmm3 = xmm4[2,2,2,2,4,5,6,7] @@ -9785,7 +9785,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3,4],ymm1[5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm0, %ymm16 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm24[1,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm3[0,1,2,3,5,5,5,5] @@ -9894,7 +9894,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-LABEL: load_i16_stride6_vf64: ; AVX512F-ONLY-FAST: # %bb.0: ; AVX512F-ONLY-FAST-NEXT: subq $1480, %rsp # imm = 0x5C8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 @@ -9916,7 +9916,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm9 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm15, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm15, %xmm4 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] @@ -9962,7 +9962,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm1, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3,4,5],ymm3[6],ymm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm4, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 @@ -10033,12 +10033,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,6] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm9[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm11, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm7, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm7 = 
xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm6, %xmm6 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] @@ -10055,7 +10055,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] @@ -10098,7 +10098,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: # ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[2,1,2,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,3,2,1] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm2, %xmm24 @@ -10110,7 +10110,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512F-ONLY-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] @@ -10153,7 +10153,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm31, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vpblendd $148, {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm15 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm15 = ymm6[0,1],mem[2],ymm6[3],mem[4],ymm6[5,6],mem[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm7, %ymm25 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1,2],ymm5[3,4,5,6,7],ymm6[8,9,10],ymm5[11,12,13,14,15] @@ -10223,14 +10223,14 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %zmm11, %zmm29, 
%zmm1 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm1, %zmm22, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm1, %xmm25 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm23, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[3,1,2,1,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm21, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm20, %xmm13 @@ -10254,7 +10254,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm13 ; AVX512F-ONLY-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm14[0,1,2,3,7,5,6,5] ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm13 = xmm13[0,1,2,3],xmm14[4],xmm13[5,6],xmm14[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm15, %ymm15 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm13, %ymm0, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm15[0,1,2],ymm13[3,4,5,6,7],ymm15[8,9,10],ymm13[11,12,13,14,15] @@ -10301,7 +10301,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm12 # 32-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: # ymm12 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm12, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm12, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10315,7 +10315,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm7 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm3, %xmm24 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm15, %xmm4, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] @@ -10410,7 +10410,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm0, %ymm16 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm3 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; 
AVX512F-ONLY-FAST-NEXT: vpshufb %xmm3, %xmm9, %xmm9 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0],xmm9[1],xmm11[2,3],xmm9[4],xmm11[5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] @@ -10425,7 +10425,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm24, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm12, %xmm0, %xmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm25, %xmm2 ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm15 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2,3],xmm0[4],xmm15[5],xmm0[6,7] @@ -10516,7 +10516,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-LABEL: load_i16_stride6_vf64: ; AVX512DQ-SLOW: # %bb.0: ; AVX512DQ-SLOW-NEXT: subq $840, %rsp # imm = 0x348 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vmovdqa 576(%rdi), %ymm1 @@ -10534,7 +10534,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm3, %xmm2 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, %xmm2, %xmm1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm2, %xmm23 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[2,2,2,2,4,5,6,7] @@ -10582,7 +10582,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti128 $1, 672(%rdi), %ymm2, %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0],ymm4[1],ymm2[2,3,4,5],ymm4[6],ymm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm4, %ymm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm4, %ymm18 @@ -10662,10 +10662,10 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512DQ-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [2,3,14,15,10,11,0,0,2,3,14,15,10,11,0,0] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm12, 
%xmm14, %xmm0 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm13 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1],xmm0[2],xmm13[3],xmm0[4,5],xmm13[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm13 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm8, %xmm6 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1,2],xmm4[3],xmm6[4,5],xmm4[6],xmm6[7] @@ -10680,7 +10680,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshufb %xmm13, %xmm7, %xmm2 ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm3 = xmm10[0,1,2,3,5,5,5,5] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[3],xmm2[4,5],xmm3[6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0,1,2],ymm2[3,4,5,6,7],ymm1[8,9,10],ymm2[11,12,13,14,15] @@ -10781,7 +10781,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm26, %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm29, %ymm5 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1],ymm5[2],ymm4[3],ymm5[4],ymm4[5,6],ymm5[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm4 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %ymm5, %ymm27 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3,4,5,6,7],ymm4[8,9,10],ymm3[11,12,13,14,15] @@ -10883,7 +10883,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm15[1,1,1,1,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm10 = xmm10[0,1,2,3,4,5,7,7] ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm10[0,1,2,3],xmm2[4],xmm10[5,6],xmm2[7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm13, %ymm13 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm13[0,1,2],ymm2[3,4,5,6,7],ymm13[8,9,10],ymm2[11,12,13,14,15] @@ -10924,7 +10924,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-SLOW-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm4 # 32-byte Folded Reload ; AVX512DQ-SLOW-NEXT: # ymm4 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = 
<8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm0, %xmm4, %xmm1 ; AVX512DQ-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm14 ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] @@ -11031,7 +11031,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3,4],ymm0[5,6,7] ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm21 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX512DQ-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,1,2,3] ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,5,5,5,5] @@ -11121,7 +11121,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-LABEL: load_i16_stride6_vf64: ; AVX512DQ-FAST: # %bb.0: ; AVX512DQ-FAST-NEXT: subq $904, %rsp # imm = 0x388 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,12,13,u,u,4,5,u,u,u,u,12,13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa 608(%rdi), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vmovdqa 576(%rdi), %ymm1 @@ -11144,7 +11144,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm2, %xmm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm4 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm2, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,4,5,4,5,u,u,0,1,12,13,u,u,4,5] ; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm4, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm22 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] @@ -11188,7 +11188,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vinserti128 $1, 672(%rdi), %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm3[0],ymm5[1],ymm3[2,3,4,5],ymm5[6],ymm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,12,13,8,9,12,13,8,9,12,13,4,5,u,u,16,17,28,29,24,25,28,29,24,25,28,29,20,21,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %ymm7, %ymm5, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm26 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm5, %ymm18 @@ -11264,12 +11264,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm17, %zmm16, %zmm6 ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm0, %zmm6 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [2,3,14,15,u,u,6,7,u,u,u,u,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm11, %xmm0 ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [10,11,14,15,2,3,0,0,10,11,14,15,2,3,0,0] ; AVX512DQ-FAST-NEXT: 
vpshufb %xmm11, %xmm7, %xmm7 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm0[0,1],xmm7[2],xmm0[3],xmm7[4,5],xmm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [6,7,2,3,4,5,u,u,2,3,14,15,u,u,6,7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm6 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,5,5,5,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1,2],xmm5[3],xmm6[4,5],xmm5[6],xmm6[7] @@ -11284,7 +11284,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm3, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,5,5,5,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2],xmm1[3],xmm3[4,5],xmm1[6],xmm3[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,14,15,10,11,10,11,14,15,10,11,u,u,6,7,18,19,30,31,26,27,26,27,30,31,26,27,u,u,22,23] ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3,4,5,6,7],ymm2[8,9,10],ymm1[11,12,13,14,15] @@ -11330,7 +11330,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm14 = xmm0[2,1,2,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,0,1,4,5,u,u,12,13,12,13,12,13,12,13] ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm1[0,3,2,1] ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm15, %xmm1 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,1,2,0,4,5,6,7] @@ -11340,7 +11340,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: # ymm2 = ymm2[0,1],mem[2],ymm2[3,4],mem[5],ymm2[6,7] ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[2,1,0,3] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [0,1,0,1,0,1,0,1,u,u,8,9,12,13,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512DQ-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm3[0,1,2,1] @@ -11384,7 +11384,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm30, %ymm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm31, %ymm7 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm6[0,1],ymm7[2],ymm6[3],ymm7[4],ymm6[5,6],ymm7[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,0,1,12,13,14,15,8,9,12,13,0,1,u,u,20,21,16,17,28,29,30,31,24,25,28,29,16,17,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm6 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm8, %ymm27 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm7, %ymm17 @@ -11455,12 +11455,12 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm29 ; AVX512DQ-FAST-NEXT: vpternlogq $226, %zmm9, %zmm20, %zmm1 ; AVX512DQ-FAST-NEXT: vpternlogq $184, %zmm1, %zmm26, %zmm29 -; AVX512DQ-FAST-NEXT: 
vmovdqa {{.*#+}} xmm1 = +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,2,3,6,7,u,u,14,15,14,15,14,15,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm15, %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm1, %xmm28 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm1 = xmm14[3,1,2,1,4,5,6,7] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,3,2,3,2,3,2,3,u,u,10,11,14,15,u,u] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm19, %xmm1 ; AVX512DQ-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm9 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm21, %xmm1 @@ -11484,7 +11484,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512DQ-FAST-NEXT: vpshufhw {{.*#+}} xmm14 = xmm0[0,1,2,3,7,5,6,5] ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm14 = xmm15[0,1,2,3],xmm14[4],xmm15[5,6],xmm14[7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [6,7,2,3,14,15,14,15,14,15,10,11,u,u,2,3,22,23,18,19,30,31,30,31,30,31,26,27,u,u,18,19] ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm17, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 @@ -11524,7 +11524,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512DQ-FAST-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX512DQ-FAST-NEXT: # ymm13 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,u,u,0,1,12,13,u,u,12,13,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm1 ; AVX512DQ-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] @@ -11537,7 +11537,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [8,9,0,0,0,1,12,13,8,9,0,0,0,1,12,13] ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm26 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,0,1,4,5,u,u,12,13,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm5, %xmm28 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4],xmm3[5],xmm2[6,7] @@ -11631,7 +11631,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm25 ; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] ; AVX512DQ-FAST-NEXT: vpshufb %xmm1, %xmm10, %xmm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,u,u,2,3,14,15,u,u,10,11,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm10, %xmm12, %xmm12 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0],xmm11[1],xmm12[2,3],xmm11[4],xmm12[5,6,7] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = 
[128,128,128,128,128,128,128,128,128,128,6,7,2,3,14,15,26,27,22,23,128,128,128,128,128,128,128,128,128,128,128,128] @@ -11645,7 +11645,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512DQ-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [10,11,0,0,2,3,14,15,10,11,0,0,2,3,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm26, %xmm4 ; AVX512DQ-FAST-NEXT: vpshufb %xmm12, %xmm4, %xmm4 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,2,3,2,3,6,7,u,u,14,15,u,u,u,u] ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm14 ; AVX512DQ-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm14[0,1,2,3],xmm4[4],xmm14[5],xmm4[6,7] @@ -11737,7 +11737,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,36,42,48,54,60,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm8 ; AVX512BW-NEXT: movl $4192256, %edi # imm = 0x3FF800 @@ -11759,7 +11759,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,37,43,49,55,61,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqu16 %zmm19, %zmm15 {%k1} @@ -11777,7 +11777,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm21 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [34,40,46,52,58,0,6,12,18,24,30,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm16, %zmm17 ; AVX512BW-NEXT: movl $2095104, %edi # imm = 0x1FF800 @@ -11799,7 +11799,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm22, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [35,41,47,53,59,1,7,13,19,25,31,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm10, %zmm18, %zmm19 ; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm19 {%k2} @@ -11809,7 +11809,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm18 ; AVX512BW-NEXT: vmovdqu16 %zmm22, %zmm18 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm20, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [4,10,16,22,28,34,40,46,52,58,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 
{{.*#+}} zmm22 = [36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30,36,42,48,54,60,0,0,0,0,0,0,6,12,18,24,30] @@ -11829,7 +11829,7 @@ define void @load_i16_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm22 {%k2} ; AVX512BW-NEXT: vmovdqu16 %zmm21, %zmm22 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [5,11,17,23,29,35,41,47,53,59,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm10 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31,37,43,49,55,61,0,0,0,0,0,1,7,13,19,25,31] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 08667aed4bb35..4b32647f6fb3e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -2145,7 +2145,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2317,7 +2317,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm10[0,1,2,3,4],ymm5[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2326,7 +2326,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0],xmm10[1],xmm11[2,3,4,5],xmm10[6],xmm11[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,3,0,1,14,15,12,13,10,11,10,11,10,11,10,11] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,5,1,u,4,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,5,1,u,4,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm13, %ymm11 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm11[2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vmovdqa %xmm12, %xmm11 @@ -2346,7 +2346,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0],xmm12[1],xmm10[2,3,4,5],xmm12[6],xmm10[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[4,5,2,3,0,1,14,15,12,13,12,13,12,13,12,13] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,6,1,u,5,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,6,1,u,5,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm10, %ymm12, %ymm10 @@ -2386,7 +2386,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,7,2,6,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm5 @@ -2395,7 +2395,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,6,7,4,5,2,3,0,1,14,15,u,u,u,u] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5,6],ymm13[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm14 = xmm14[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm9[0],ymm5[1,2,3,4,5,6,7],ymm9[8],ymm5[9,10,11,12,13,14,15] @@ -2411,7 +2411,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm14[5,6,7],ymm5[8,9,10,11,12],ymm14[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,4,7,3,6,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm1[0,1],ymm0[2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm15, %xmm8 @@ -2433,7 +2433,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,26,27,28,29,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm6[5,6,7],ymm4[8,9,10,11,12],ymm6[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,4,0,3,7,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,4,0,3,7,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm13, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7] @@ -2484,7 +2484,7 @@ define void @load_i16_stride7_vf16(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm11 = ymm3[0],ymm2[1],ymm3[2,3,4],ymm2[5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[0,1,14,15,12,13,14,15,2,3,6,7,12,13,2,3,16,17,30,31,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm4 = ymm8[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2573,7 +2573,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 208(%rdi), %xmm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm14 = xmm13[0,1,2,3,4,5],xmm12[6],xmm13[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm14, %ymm0, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm14[5,6,7],ymm4[8,9,10,11,12],ymm14[13,14,15] @@ -2595,7 +2595,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,10,11,8,9,6,7,4,5,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm12[0],xmm13[1],xmm12[2,3,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1,2,3,4],ymm5[5,6,7],ymm4[8,9,10,11,12],ymm5[13,14,15] @@ -2803,9 +2803,9 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm16 = [2,5,9,12,2,5,9,12] ; AVX512F-FAST-NEXT: # ymm16 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [10,3,6,15,12,13,6,15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,6,9,u,13,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,6,9,u,13,u,u,u] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,9,u,12,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,9,u,12,u,u,u] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm6 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,12,5,12,5,14,15] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm2, %zmm3 @@ -2877,7 +2877,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[6,7,4,5,2,3,0,1,14,15],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpor %ymm7, %ymm9, %ymm7 ; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm16, %zmm13 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,10,14,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = 
[0,3,7,10,14,u,u,u] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm15 = ymm7[0],ymm15[1,2,3,4,5,6,7],ymm7[8],ymm15[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3],ymm15[4,5,6,7] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm9, %zmm9 @@ -2894,12 +2894,12 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm10[0],xmm14[1],xmm10[1],xmm14[2],xmm10[2],xmm14[3],xmm10[3] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm14[0,1],ymm9[2,3,4,5,6,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm9[0],ymm11[1,2,3,4,5,6,7],ymm9[8],ymm11[9,10,11,12,13,14,15] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3],ymm11[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,3,3,u,0,3,7,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,3,3,u,0,3,7,u] ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm11, %ymm11 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,0,1,6,7,8,9,u,u,u,u,u,u,u,u,u,u,16,17,22,23,24,25] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[u,u,0,1,6,7,8,9,14,15,u,u,u,u,u,u,u,u,16,17,22,23,24,25,30,31,u,u,u,u,u,u] @@ -2907,10 +2907,10 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm13, %xmm14 ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <1,4,8,11,15,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [1,4,8,11,15,u,u,u] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [2,6,9,13,2,6,9,13] ; AVX512F-FAST-NEXT: # ymm15 = mem[0,1,0,1] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = <0,4,7,11,14,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm16 = [0,4,7,11,14,u,u,u] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm13[10,11,6,7,4,5,6,7,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm16, %zmm13 @@ -2958,48 +2958,48 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,13,20,27,34,41,0,0,6,13,20,27,34,41] ; AVX512BW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,35,42,49,56,63,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,0,7,14,21,28,35,42,0,0,7,14,21,28,35,42] ; AVX512BW-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,36,43,50,57,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1,2,3,4,5,6,7],ymm6[8],ymm5[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd 
{{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,8,15,22,29,36,43,0,1,8,15,22,29,36,43] ; AVX512BW-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,9,16,23,30,37,44,51,58,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1,2,3,4,5,6,7],ymm7[8],ymm6[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3],ymm6[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,2,9,16,23,30,37,44,0,2,9,16,23,30,37,44] ; AVX512BW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,10,17,24,31,38,45,52,59,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1,2,3,4,5,6,7],ymm8[8],ymm7[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3],ymm7[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,3,10,17,24,31,38,45,0,3,10,17,24,31,38,45] ; AVX512BW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [36,43,50,57,0,7,14,21,28,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1,2,3,4,5,6,7],ymm9[8],ymm8[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3],ymm8[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [0,4,11,18,25,32,39,46,0,4,11,18,25,32,39,46] ; AVX512BW-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm10 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1,2,3,4,5,6,7],ymm10[8],ymm9[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm9 = ymm10[0,1,2,3],ymm9[4,5,6,7] ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,5,12,19,26,33,40,47,0,5,12,19,26,33,40,47] ; AVX512BW-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} ymm0 = ymm2[0],ymm10[1,2,3,4,5,6,7],ymm2[8],ymm10[9,10,11,12,13,14,15] ; AVX512BW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] @@ -4704,7 +4704,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14, %ymm5 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm5 = mem[0,1],ymm14[2],mem[3,4,5],ymm14[6],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1,2,3,4,5,6,7],ymm8[8],ymm2[9,10,11,12,13,14,15] ; 
AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,0,1] @@ -4761,7 +4761,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,1,2,3] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[1,3,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm1[0],ymm2[1,2,3,4,5,6,7],ymm1[8],ymm2[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm7[0,1],xmm1[2,3] @@ -4821,7 +4821,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,1,2,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm8[0],ymm0[1,2,3,4,5,6,7],ymm8[8],ymm0[9,10,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm8[2,3] @@ -4920,7 +4920,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm14[2],ymm13[3,4],ymm14[5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <2,5,1,u,4,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [2,5,1,u,4,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,3,2,3,2,3,2,3,8,9,0,1,6,7,8,9,18,19,18,19,18,19,18,19,24,25,16,17,22,23,24,25] ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 @@ -4949,7 +4949,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0],xmm3[1],xmm2[2,3,4,5],xmm3[6],xmm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,6,1,u,5,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,6,1,u,5,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,3,4,5,2,3,8,9,2,3,4,5,10,11,16,17,18,19,20,21,18,19,24,25,18,19,20,21,26,27] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1 @@ -5119,9 +5119,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1],ymm3[2],ymm0[3,4,5],ymm3[6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,3,7,2,6,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,3,7,2,6,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm9, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm6 ; AVX2-FAST-NEXT: 
vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm6[2,3] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm6[0],ymm2[1,2,3,4,5,6,7],ymm6[8],ymm2[9,10,11,12,13,14,15] @@ -5168,9 +5168,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm15 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,4,7,3,6,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,4,7,3,6,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm2, %ymm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,2,3,4,5,10,11,12,13,18,19,18,19,18,19,18,19,18,19,20,21,26,27,28,29] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm15, %ymm15 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm15[0],ymm1[1,2,3,4,5,6,7],ymm15[8],ymm1[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm6 = xmm6[0,1],xmm15[2,3] @@ -5223,9 +5223,9 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm9, %xmm9 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,6,4,6,7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm8[4],xmm9[4],xmm8[5],xmm9[5],xmm8[6],xmm9[6],xmm8[7],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,4,0,3,7,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,4,0,3,7,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0],ymm0[1,2,3,4,5,6,7],ymm3[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm8[0,1],xmm3[2,3] @@ -5505,7 +5505,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $187, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1],ymm1[2],mem[3,4,5],ymm1[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm8[0,1,2,3,4,5],ymm3[6],ymm8[7,8,9,10,11,12,13],ymm3[14],ymm8[15] @@ -5570,7 +5570,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm2[0],ymm8[1,2,3,4,5,6,7],ymm2[8],ymm8[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm2 = xmm10[0,1],xmm2[2,3] @@ -5627,7 +5627,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
pt ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm9, %xmm10, %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,6,4,6,7] ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,4,5,2,3,0,1,14,15,28,29,28,29,28,29,28,29,20,21,18,19,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm5[0],ymm0[1,2,3,4,5,6,7],ymm5[8],ymm0[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm5[2,3] @@ -5762,7 +5762,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm9[0,1,2],ymm10[3],ymm9[4,5],ymm10[6],ymm9[7] ; AVX512F-ONLY-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-ONLY-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3,4,5],xmm1[6],xmm2[7] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %ymm2, %ymm28 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 @@ -6059,15 +6059,15 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vbroadcasti32x4 {{.*#+}} ymm19 = [3,6,10,13,3,6,10,13] ; AVX512F-ONLY-FAST-NEXT: # ymm19 = mem[0,1,2,3,0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%rdi), %zmm14 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,6,9,u,13,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,6,9,u,13,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm10 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm1, %zmm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <1,u,u,u,4,8,11,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [1,u,u,u,4,8,11,15] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm15 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <2,5,9,u,12,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [2,5,9,u,12,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm14, %zmm16, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,u,u,4,7,11,14> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,u,u,u,4,7,11,14] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm0, %zmm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,12,5,12,5,14,15] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm0, %zmm0 @@ -6110,7 +6110,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm5[3],ymm4[4,5],ymm5[6],ymm4[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm7 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm7[0],xmm0[1],xmm7[2,3,4,5],xmm0[6],xmm7[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [2,3,0,1,14,15,12,13,10,11,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm3 = 
zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm3[0,1,6,7,8,9,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-ONLY-FAST-NEXT: vporq %ymm3, %ymm0, %ymm20 @@ -6146,7 +6146,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm9[0,1,2],ymm3[3],ymm9[4,5],ymm3[6],ymm9[7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3,4,5],xmm0[6],xmm1[7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,0,1,14,15,12,13,10,11,8,9,128,128,128,128,128,128,128,128,128,128,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq {{.*#+}} ymm10 = [2,5,2,5,2,5,2,5] @@ -6160,7 +6160,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm0, %xmm11 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm11[4],xmm0[5],xmm11[6],xmm0[7] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm31, %zmm18, %zmm11 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <1,u,u,u,5,8,12,15> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [1,u,u,u,5,8,12,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm30, %zmm12, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm1[2,3,16,17,22,23,24,25,30,31,u,u,u,u,u,u,u,u] @@ -6202,7 +6202,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,2,3,2,3,0,1,14,15,12,13,10,11],zero,zero ; AVX512F-ONLY-FAST-NEXT: vpor %ymm1, %ymm10, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,3,7,10,14,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,3,7,10,14,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0],ymm9[1],ymm3[2,3,4],ymm9[5],ymm3[6,7] ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1],xmm3[2],xmm1[3],xmm3[4,5,6,7] @@ -6222,13 +6222,13 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-ONLY-FAST-NEXT: vextracti128 $1, %ymm9, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[3,1,2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3] -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,u,u,0,1,6,7,8,9,14,15,16,17,22,23,20,21,22,23,16,17,22,23,24,25,30,31] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm9, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm25, %zmm16, %zmm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [65535,65535,65535,65535,65535,65535,65535,65535,65535,0,0,0,0,0,0,0,0,0,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535,65535] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %zmm0, %zmm16, %zmm23 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = 
[Elided: auto-generated FileCheck regeneration hunks for the stride-7 i16 interleaved-load test functions @load_i16_stride7_vf32 and @load_i16_stride7_vf64, covering the AVX2-SLOW, AVX2-FAST, AVX2-FAST-PERLANE, AVX512F-ONLY-SLOW, AVX512F-ONLY-FAST, AVX512DQ-SLOW, AVX512DQ-FAST, and AVX512BW check prefixes. Every hunk applies the same mechanical update to the constant-pool asm comments: shuffle masks previously printed in angle brackets are now printed in square brackets, for example
-; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = <2,u,u,u,6,9,13,u>
+; AVX512F-ONLY-FAST-NEXT:    vmovdqa {{.*#+}} ymm3 = [2,u,u,u,6,9,13,u]
and masks the old printer omitted entirely are now printed in full, for example
-; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 =
+; AVX2-SLOW-NEXT:    vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11,24,25,24,25,24,25,24,25,16,17,30,31,u,u,26,27]
No instructions or operands change in the hunks shown; the diff continues past this span in the same pattern.]
%zmm1, %zmm23 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm22, %zmm23 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm23 {%k1} @@ -15948,7 +15948,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm27 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm28 ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm27, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = <37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [37,44,51,58,1,8,15,22,29,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm30 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm29, %zmm30 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm30 {%k1} @@ -15968,7 +15968,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm13 = [22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15,22,29,0,0,0,0,0,0,0,37,44,51,58,1,8,15] ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm14, %zmm13, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [38,45,52,59,2,9,16,23,30,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu16 %zmm12, %zmm1 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm1 {%k2} diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index e398c68fa81ca..4ed9a99c58c3f 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -260,18 +260,18 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] @@ -290,21 +290,21 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = 
ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm5 = [0,1,4,7,0,1,4,7] ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] ; AVX2-FAST-NEXT: vmovaps %ymm3, (%rsi) @@ -321,18 +321,18 @@ define void @load_i32_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm3 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm0[0],ymm1[1],ymm0[2,3],ymm1[4],ymm0[5,6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm5 = [0,3,6,1,4,7,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3,4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm4 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm6 = [1,4,7,2,5,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm1 = ymm2[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] @@ -534,7 +534,7 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2,5,2,5,2,5,2,5] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm6, %ymm7 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm9 = [0,3,6,1,4,7,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm6, %ymm6 @@ -545,7 +545,7 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm8, %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm11 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: 
vmovaps {{.*#+}} ymm11 = [1,4,7,2,5,u,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm8 @@ -553,7 +553,7 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] @@ -583,7 +583,7 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [0,3,6,1,4,7,u,u] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm6, %ymm6 @@ -594,7 +594,7 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm8, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm11 = [1,4,7,2,5,u,u,u] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm8, %ymm8 @@ -604,7 +604,7 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = [0,1,0,3,0,1,4,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm10, %ymm4 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm4[5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm10, %ymm1 @@ -631,7 +631,7 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm6 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm6, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm9 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm9 = [0,3,6,1,4,7,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm6, %ymm6 @@ -642,7 +642,7 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm8, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} 
ymm11 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm11 = [1,4,7,2,5,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm10[0,1,2,3,4],ymm9[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm8 @@ -650,7 +650,7 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm10[0,1,2,3,4],ymm8[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = [2,5,0,3,6,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm4 = ymm4[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm4 = ymm4[0,1,0,3] @@ -674,15 +674,15 @@ define void @load_i32_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 @@ -1089,7 +1089,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm1, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = [0,3,6,1,4,7,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm9, %ymm14, %ymm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1112,7 +1112,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,u,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] @@ -1136,7 +1136,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm0, 
%ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm10 = [2,5,0,3,6,u,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm10, %ymm2 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] @@ -1195,7 +1195,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [0,3,6,1,4,7,u,u] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm14, %ymm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1218,7 +1218,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,4,7,2,5,u,u,u] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm1, %ymm8 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] @@ -1243,7 +1243,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm10 = [0,1,0,3,0,1,4,7] ; AVX2-FAST-NEXT: vpermps %ymm12, %ymm10, %ymm12 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm13 = [2,5,0,3,6,u,u,u] ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm13, %ymm1 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm12[5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1],ymm6[2],ymm3[3,4],ymm6[5],ymm3[6,7] @@ -1298,7 +1298,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm1 = [2,5,2,5,2,5,2,5] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm9 = ymm13[0],ymm10[1],ymm13[2,3],ymm10[4],ymm13[5,6],ymm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = [0,3,6,1,4,7,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm9, %ymm14, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -1321,7 +1321,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm13[0,1],ymm10[2],ymm13[3,4],ymm10[5],ymm13[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = [1,4,7,2,5,u,u,u] ; 
AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] @@ -1345,7 +1345,7 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm10[0,1],ymm13[2],ymm10[3,4],ymm13[5],ymm10[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm10 = [2,5,0,3,6,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm10, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm11 = ymm11[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm11 = ymm11[0,1,0,3] @@ -1395,21 +1395,21 @@ define void @load_i32_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7 ; AVX512-NEXT: vpermt2d %zmm1, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] ; AVX512-NEXT: vpermt2d %zmm0, %zmm8, %zmm7 ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm6 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = <17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512-NEXT: vpermt2d %zmm5, %zmm8, %zmm9 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] ; AVX512-NEXT: vpermt2d %zmm0, %zmm10, %zmm9 ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 ; AVX512-NEXT: vpermt2d %zmm4, %zmm10, %zmm8 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm1, %zmm10, %zmm5 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-NEXT: vpermt2d %zmm0, %zmm1, %zmm5 @@ -2243,7 +2243,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] ; AVX2-SLOW-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = <0,3,6,1,4,7,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = [0,3,6,1,4,7,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2310,7 +2310,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm1 = [1,4,7,2,5,u,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-SLOW-NEXT: vbroadcastf128 
{{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] @@ -2370,7 +2370,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = <2,5,0,3,6,u,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm14 = [2,5,0,3,6,u,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] @@ -2489,7 +2489,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm11[1],ymm14[2,3],ymm11[4],ymm14[5,6],ymm11[7] ; AVX2-FAST-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovups %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = [0,3,6,1,4,7,u,u] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2553,7 +2553,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm11[2],ymm14[3,4],ymm11[5],ymm14[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [1,4,7,2,5,u,u,u] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-NEXT: # ymm0 = mem[0,1,0,1] @@ -2614,7 +2614,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: # ymm2 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] ; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm14 = [0,1,0,3,0,1,4,7] ; AVX2-FAST-NEXT: vpermps %ymm8, %ymm14, %ymm0 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm10 = ymm2[0,1,2,3,4],ymm0[5,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] @@ -2723,7 +2723,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm14[0],ymm13[1],ymm14[2,3],ymm13[4],ymm14[5,6],ymm13[7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm8 = <0,3,6,1,4,7,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm8 = [0,3,6,1,4,7,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1,2,3,4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -2790,7 +2790,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups 
%ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm14[0,1],ymm13[2],ymm14[3,4],ymm13[5],ymm14[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = <1,4,7,2,5,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm1 = [1,4,7,2,5,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm1, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm0 = [0,0,3,6,0,0,3,6] ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] @@ -2850,7 +2850,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = <2,5,0,3,6,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm14 = [2,5,0,3,6,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm2 = ymm10[0,1,0,3,4,5,4,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm2 = ymm2[0,1,0,3] @@ -2957,7 +2957,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm10 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm11 ; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,18,21,24,27,30,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2d %zmm9, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,6,7,8,9,10,17,20,23,26,29] @@ -2970,7 +2970,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm14, %zmm16 ; AVX512-NEXT: vpermi2d %zmm10, %zmm2, %zmm12 ; AVX512-NEXT: vpermt2d %zmm11, %zmm14, %zmm12 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = <17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [17,20,23,26,29,0,3,6,9,12,15,u,u,u,u,u] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17 ; AVX512-NEXT: vpermt2d %zmm1, %zmm14, %zmm17 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,5,6,7,8,9,10,18,21,24,27,30] @@ -2983,7 +2983,7 @@ define void @load_i32_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512-NEXT: vpermt2d %zmm4, %zmm18, %zmm20 ; AVX512-NEXT: vpermi2d %zmm2, %zmm10, %zmm14 ; AVX512-NEXT: vpermt2d %zmm11, %zmm18, %zmm14 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,5,8,11,14,17,20,23,26,29,u,u,u,u,u,u] ; AVX512-NEXT: vpermt2d %zmm9, %zmm18, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,5,6,7,8,9,16,19,22,25,28,31] ; AVX512-NEXT: vpermt2d %zmm8, %zmm9, %zmm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll index 0998868da1f52..c4de574173b39 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-5.ll @@ -219,13 +219,13 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [0,5,2,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <1,6,3,u> +; 
AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [1,6,3,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm0[0,1],ymm1[2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpbroadcastd %xmm4, %xmm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <2,7,4,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [2,7,4,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm6, %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vpbroadcastd 68(%rdi), %xmm6 @@ -234,7 +234,7 @@ define void @load_i32_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpalignr {{.*#+}} ymm7 = ymm0[12,13,14,15],ymm1[0,1,2,3,4,5,6,7,8,9,10,11],ymm0[28,29,30,31],ymm1[16,17,18,19,20,21,22,23,24,25,26,27] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,3,2,3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm6 = xmm7[0,1,2],xmm6[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,1,6,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [4,1,6,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm7, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[3] @@ -450,7 +450,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2],ymm6[3],ymm5[4,5,6],ymm6[7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,6,3,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,3,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm1[0,1],ymm2[2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm7, %ymm6, %ymm6 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm3[0,1],ymm4[2,3],ymm3[4,5],ymm4[6,7] @@ -460,7 +460,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2],ymm7[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastd 144(%rdi), %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5,6],ymm7[7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,7,4,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm8, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm8 @@ -480,7 +480,7 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm9[0,1,2,3,4,5],ymm8[6,7] ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm3 = ymm3[0,1],ymm4[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <4,1,6,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [4,1,6,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2],ymm3[3,4,5,6,7] @@ -499,24 +499,24 @@ define void @load_i32_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = <0,5,10,15,20,25,30,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm0 = [0,5,10,15,20,25,30,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm0 ; AVX512-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3 ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0,1,2],ymm3[3],ymm0[4,5,6],ymm3[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <17,22,27,0,5,10,15,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [17,22,27,0,5,10,15,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm2, %zmm3 ; AVX512-NEXT: vpbroadcastd 144(%rdi), %ymm4 ; AVX512-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <2,7,12,17,22,27,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [2,7,12,17,22,27,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,2,3,4,5,8,13] ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm6 ; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <3,8,13,18,23,28,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [3,8,13,18,23,28,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,9,14] ; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = <4,9,14,19,24,29,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm4 = [4,9,14,19,24,29,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,2,3,4,5,10,15] ; AVX512-NEXT: vpermi2d %ymm6, %ymm4, %ymm1 @@ -947,7 +947,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm10[0,1,2],ymm7[3],ymm10[4,5,6],ymm7[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <1,6,3,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [1,6,3,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm4[0,1],ymm6[2,3],ymm4[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm11, %ymm7, %ymm11 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm12 = ymm8[0,1],ymm9[2,3],ymm8[4,5],ymm9[6,7] @@ -966,7 +966,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd 304(%rdi), %ymm12 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5,6],ymm12[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <2,7,4,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [2,7,4,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm13 = ymm4[0,1,2,3],ymm6[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm13, %ymm7, %ymm13 ; AVX2-ONLY-NEXT: vinserti128 $1, 96(%rdi), %ymm0, %ymm14 @@ -1003,7 +1003,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm8 = ymm8[0,1],ymm9[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm9[5],ymm8[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm4[4,5],ymm6[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <4,1,6,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [4,1,6,u] ; AVX2-ONLY-NEXT: vpermd %ymm4, %ymm6, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [2,7,2,7,2,7,2,7] @@ -1044,7 +1044,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <0,5,10,15,20,25,30,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512F-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512F-NEXT: kmovw %eax, %k1 @@ -1054,12 
+1054,12 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <17,22,27,0,5,10,15,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [17,22,27,0,5,10,15,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <2,7,12,17,22,27,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [2,7,12,17,22,27,u,u] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1069,7 +1069,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <3,8,13,18,23,28,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -1080,7 +1080,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <4,9,14,19,24,29,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 @@ -1105,7 +1105,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [8,13,18,23,28,0,0,3,8,13,18,23,28,0,0,3] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,5,10,15,20,25,30,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm6 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -1115,12 +1115,12 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [9,14,19,24,29,0,0,4,9,14,19,24,29,0,0,4] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <17,22,27,0,5,10,15,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [17,22,27,0,5,10,15,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,5,6,7,8,9,10,11,12,18,23,28] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <2,7,12,17,22,27,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [2,7,12,17,22,27,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm7 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1130,7 +1130,7 @@ 
define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,19,24,29] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <3,8,13,18,23,28,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,8,13,18,23,28,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -1141,7 +1141,7 @@ define void @load_i32_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <4,9,14,19,24,29,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [4,9,14,19,24,29,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -2069,7 +2069,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [1,6,3,u] ; AVX2-ONLY-NEXT: vmovdqu (%rsp), %ymm8 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1],ymm8[2,3],ymm4[4,5,6,7] @@ -2113,7 +2113,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd 464(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <2,7,4,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [2,7,4,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3],ymm8[4,5],ymm4[6,7] ; AVX2-ONLY-NEXT: vmovdqa %ymm8, %ymm14 ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm1 @@ -2217,7 +2217,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: # ymm0 = mem[0,1,2,3],ymm11[4,5],mem[6,7] ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm8[0,1],ymm1[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4],ymm1[5],ymm5[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <4,1,6,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,1,6,u] ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2],ymm5[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm5 = [2,7,2,7,2,7,2,7] @@ -2307,7 +2307,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <0,5,10,15,20,25,30,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512F-NEXT: movw $8064, %ax # imm = 0x1F80 @@ -2323,7 +2323,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: 
vmovdqa64 %zmm9, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} @@ -2333,7 +2333,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512F-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,7,12,17,22,27,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] @@ -2349,7 +2349,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512F-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,8,13,18,23,28,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -2366,7 +2366,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <4,9,14,19,24,29,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] ; AVX512F-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512F-NEXT: movb $56, %al ; AVX512F-NEXT: kmovw %eax, %k1 @@ -2406,7 +2406,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm12 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm12, %zmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,5,10,15,20,25,30,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,5,10,15,20,25,30,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm6, %zmm8 ; AVX512BW-NEXT: movw $8064, %ax # imm = 0x1F80 @@ -2422,7 +2422,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm13 {%k1} @@ -2432,7 +2432,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm12 ; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm12 {%k1} ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm15, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <2,7,12,17,22,27,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [2,7,12,17,22,27,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [10,15,20,25,30,0,0,5,10,15,20,25,30,0,0,5] 
@@ -2448,7 +2448,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm15 {%k1} ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm17, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <3,8,13,18,23,28,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [3,8,13,18,23,28,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm16, %zmm17 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm18 = [27,0,5,10,15,0,17,22,27,0,5,10,15,0,17,22] @@ -2465,7 +2465,7 @@ define void @load_i32_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm16 = [12,17,22,27,0,0,2,7,12,17,22,27,0,0,2,7] ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <4,9,14,19,24,29,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [4,9,14,19,24,29,u,u] ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm11, %zmm1 ; AVX512BW-NEXT: movb $56, %al ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -4381,7 +4381,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,6,3,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [1,6,3,u] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6,7] @@ -4476,7 +4476,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vpbroadcastd 1104(%rdi), %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <2,7,4,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [2,7,4,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3],ymm5[4,5],ymm6[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-ONLY-NEXT: vinserti128 $1, 256(%rdi), %ymm0, %ymm1 @@ -4672,7 +4672,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm7[0,1],ymm6[0,1] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm6[5],ymm0[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm1[0,1,2,3],ymm5[4,5],ymm1[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <4,1,6,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [4,1,6,u] ; AVX2-ONLY-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2],ymm0[3,4,5,6,7] ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm7 = [2,7,2,7,2,7,2,7] @@ -4843,7 +4843,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <0,5,10,15,20,25,30,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -4858,7 +4858,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, pt ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -4868,7 +4868,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,17,22,27,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -4900,12 +4900,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512F-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <3,8,13,18,23,28,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] ; AVX512F-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] ; AVX512F-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512F-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm31 @@ -5047,7 +5047,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm19, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,5,10,15,20,25,30,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,5,10,15,20,25,30,u] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm15, %zmm4 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm5 @@ -5062,7 +5062,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <17,22,27,0,5,10,15,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [17,22,27,0,5,10,15,u] ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm12, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 @@ -5072,7 +5072,7 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm16, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2d %zmm20, %zmm1, %zmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = <2,7,12,17,22,27,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm25 = [2,7,12,17,22,27,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm5 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm25, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5104,12 +5104,12 @@ define void @load_i32_stride5_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm20, %zmm28 ; AVX512BW-NEXT: vpermt2d %zmm20, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <3,8,13,18,23,28,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [3,8,13,18,23,28,u,u] ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm5, %zmm29 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <4,9,14,19,24,29,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [4,9,14,19,24,29,u,u] ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm9, %zmm13 ; AVX512BW-NEXT: vmovdqa64 320(%rdi), %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm31 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index 0c229457fb602..d3f9b1c4e15a7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -283,13 +283,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm1 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,6,4,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,6,4,u] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm4 ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[2,2,2,2] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm5[3] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <1,7,5,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [1,7,5,u] ; AVX2-ONLY-NEXT: vpermd %ymm3, %ymm5, %ymm3 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3] ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm5 @@ -572,7 +572,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = <0,6,4,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,4,u] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[0,1],ymm5[0,1] @@ -583,7 +583,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm8 = [4,2,4,2,4,2,4,2] ; AVX2-SLOW-NEXT: vpermps %ymm10, %ymm8, %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = <1,7,5,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm11 = [1,7,5,u] ; AVX2-SLOW-NEXT: vpermps %ymm7, %ymm11, %ymm7 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] @@ -655,7 +655,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm3 = <0,6,4,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,4,u] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vperm2f128 
{{.*#+}} ymm8 = ymm6[0,1],ymm5[0,1] @@ -666,7 +666,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vbroadcastsd {{.*#+}} ymm8 = [4,2,4,2,4,2,4,2] ; AVX2-FAST-NEXT: vpermps %ymm10, %ymm8, %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm11 = <1,7,5,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm11 = [1,7,5,u] ; AVX2-FAST-NEXT: vpermps %ymm7, %ymm11, %ymm7 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] @@ -738,7 +738,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = [0,6,4,u] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm2[0,1,2,3],ymm4[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[0,1],ymm5[0,1] @@ -749,7 +749,7 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm8 = [4,2,4,2,4,2,4,2] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm10, %ymm8, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm11[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm11 = [1,7,5,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm7, %ymm11, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1,2],ymm9[3,4,5,6,7] @@ -820,32 +820,32 @@ define void @load_i32_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa 128(%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <0,6,12,18,24,30,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,6,12,18,24,30,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,4,5,12,10] ; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <1,7,13,19,25,31,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [1,7,13,19,25,31,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,13,11] ; AVX512-NEXT: vpermi2d %ymm4, %ymm5, %ymm7 ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,0,6,12,0,0,6,12] ; AVX512-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <2,8,14,20,26,u,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [2,8,14,20,26,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,7,13,0,1,7,13] ; AVX512-NEXT: # ymm5 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %ymm1, %ymm0, %ymm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <3,9,15,21,27,u,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4],ymm5[5,6,7] ; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3],ymm1[4,5,6,7] -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = <20,26,0,6,12,u,u,u> +; AVX512-NEXT: vmovdqa 
{{.*#+}} ymm1 = [20,26,0,6,12,u,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,10,8,14] ; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm8 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = <21,27,1,7,13,u,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [21,27,1,7,13,u,u,u] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm1 ; AVX512-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,11,9,15] ; AVX512-NEXT: vpermi2d %ymm0, %ymm1, %ymm2 @@ -1370,7 +1370,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm14 = [0,6,4,u] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm14, %ymm3 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[0,1],ymm15[0,1] @@ -1397,7 +1397,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm5, %ymm11 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm2 = <1,7,5,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm2 = [1,7,5,u] ; AVX2-SLOW-NEXT: vpermps %ymm4, %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7] @@ -1556,7 +1556,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <0,6,4,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,4,u] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm8 = ymm6[0,1],ymm15[0,1] @@ -1583,7 +1583,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm5, %ymm11 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vmovups %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm11 = <1,7,5,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm11 = [1,7,5,u] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm11, %ymm4 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm8 = ymm8[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm8[3,4,5,6,7] @@ -1740,7 +1740,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm14 = [0,6,4,u] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm3[0,1,2,3],ymm13[4,5],ymm3[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm14, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm7 = ymm6[0,1],ymm15[0,1] @@ -1767,7 +1767,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm5, %ymm11 ; AVX2-FAST-PERLANE-NEXT: 
vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm11[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm2 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm2 = [1,7,5,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm4, %ymm2, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm9 = ymm9[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm4 = ymm4[0,1,2],ymm9[3,4,5,6,7] @@ -1921,7 +1921,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,18,24,30,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512F-NEXT: movb $56, %dil ; AVX512F-NEXT: kmovw %edi, %k2 @@ -1935,14 +1935,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <1,7,13,19,25,31,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <2,8,14,20,26,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -1954,7 +1954,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512F-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <3,9,15,21,27,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -1967,7 +1967,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <20,26,0,6,12,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512F-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512F-NEXT: kmovw %edi, %k1 @@ -1981,7 +1981,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <21,27,1,7,13,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512F-NEXT: 
vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -2009,7 +2009,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [0,6,12,0,0,0,20,26,0,6,12,0,0,0,20,26] ; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm7 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,18,24,30,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,18,24,30,u,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm2 ; AVX512BW-NEXT: movb $56, %dil ; AVX512BW-NEXT: kmovd %edi, %k2 @@ -2023,14 +2023,14 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [1,7,13,0,0,0,21,27,1,7,13,0,0,0,21,27] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <1,7,13,19,25,31,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [1,7,13,19,25,31,u,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [0,0,0,3,9,15,21,27,0,0,0,3,9,15,21,27] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <2,8,14,20,26,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [2,8,14,20,26,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] @@ -2042,7 +2042,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <3,9,15,21,27,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,9,15,21,27,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm3, %zmm8 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] @@ -2055,7 +2055,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [20,26,0,0,0,2,8,14,20,26,0,0,0,2,8,14] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <20,26,0,6,12,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [20,26,0,6,12,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm11 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -2069,7 +2069,7 @@ define void @load_i32_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <21,27,1,7,13,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [21,27,1,7,13,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm4, %zmm5 ; AVX512BW-NEXT: vmovdqa32 %zmm8, %zmm5 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -3175,7 +3175,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; 
AVX2-SLOW-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-SLOW-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = [0,6,4,u] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm8, %ymm0 ; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] @@ -3251,7 +3251,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm12, %ymm15 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm15[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <1,7,5,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = [1,7,5,u] ; AVX2-SLOW-NEXT: vpermps %ymm13, %ymm8, %ymm13 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3,4,5,6,7] @@ -3591,7 +3591,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FAST-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = [0,6,4,u] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm8, %ymm0 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] @@ -3665,7 +3665,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm12, %ymm15 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = <1,7,5,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm8 = [1,7,5,u] ; AVX2-FAST-NEXT: vpermps %ymm13, %ymm8, %ymm13 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm6 = ymm6[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm6 = ymm13[0,1,2],ymm6[3,4,5,6,7] @@ -4001,7 +4001,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = [0,6,4,u] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm13 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm5[0,1],ymm4[0,1] @@ -4077,7 +4077,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm12, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm8 = ymm8[0,1,2,3,4,5],ymm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = [1,7,5,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm13, %ymm8, %ymm13 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm5 = ymm5[1,3,2,3,5,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm13[0,1,2],ymm5[3,4,5,6,7] @@ -4416,7 +4416,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,18,24,30,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm7, %zmm8 ; AVX512F-NEXT: movb $56, %dil @@ -4438,7 +4438,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,19,25,31,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} @@ -4452,7 +4452,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} ; AVX512F-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,8,14,20,26,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] @@ -4472,7 +4472,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512F-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512F-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm20 = <3,9,15,21,27,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] @@ -4494,7 +4494,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = <20,26,0,6,12,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512F-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512F-NEXT: movw $992, %di # imm = 0x3E0 @@ -4515,7 +4515,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <21,27,1,7,13,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] ; AVX512F-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -4561,7 +4561,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm14, %zmm15 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,18,24,30,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,18,24,30,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm8 ; AVX512BW-NEXT: vpermt2d 
%zmm1, %zmm7, %zmm8 ; AVX512BW-NEXT: movb $56, %dil @@ -4583,7 +4583,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,19,25,31,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,19,25,31,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm14, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k2} @@ -4597,7 +4597,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm17, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <2,8,14,20,26,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [2,8,14,20,26,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm18, %zmm19 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm17 = [18,24,30,0,0,0,6,12,18,24,30,0,0,0,6,12] @@ -4617,7 +4617,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm18 ; AVX512BW-NEXT: vmovdqa32 %zmm18, %zmm17 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm19, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <3,9,15,21,27,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [3,9,15,21,27,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm20, %zmm21 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [19,25,31,0,0,1,7,13,19,25,31,0,0,1,7,13] @@ -4639,7 +4639,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm21 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <20,26,0,6,12,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [20,26,0,6,12,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm22, %zmm23 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 @@ -4660,7 +4660,7 @@ define void @load_i32_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm20 = [21,27,0,0,0,3,9,15,21,27,0,0,0,3,9,15] ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm20, %zmm12 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <21,27,1,7,13,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [21,27,1,7,13,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm13, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,0,1,7,13,19,25,31,0,0,1,7,13,19,25,31] @@ -6963,7 +6963,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm8 = [0,6,4,u] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermps %ymm0, %ymm8, %ymm0 @@ -7140,7 +7140,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpermps %ymm15, %ymm14, %ymm9 ; 
AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm0 = [1,7,5,u] ; AVX2-SLOW-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] @@ -7834,7 +7834,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovaps 192(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <0,6,4,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [0,6,4,u] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm9[0,1],ymm8[0,1] @@ -8006,7 +8006,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpermps %ymm1, %ymm13, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm0 = [1,7,5,u] ; AVX2-FAST-NEXT: vpermps %ymm14, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm14 = ymm15[1,3,2,3,5,7,6,7] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm14 = ymm7[0,1,2],ymm14[3,4,5,6,7] @@ -8695,7 +8695,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovaps 192(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = <0,6,4,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm8 = [0,6,4,u] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5],ymm1[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm0, %ymm8, %ymm0 @@ -8872,7 +8872,7 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm15, %ymm14, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm9[6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = <1,7,5,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm0 = [1,7,5,u] ; AVX2-FAST-PERLANE-NEXT: vpermps {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm9 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: vpermilps $237, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[1,3,2,3,5,7,6,7] @@ -9711,24 +9711,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> +; AVX512F-NEXT: vmovdqa 
{{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <3,9,15,21,27,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = <21,27,1,7,13,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm24 = [21,27,1,7,13,u,u,u] ; AVX512F-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm6 @@ -10061,24 +10061,24 @@ define void @load_i32_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm22 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm2 ; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,6,12,18,24,30,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,6,12,18,24,30,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm3, %zmm20 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,7,13,19,25,31,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,7,13,19,25,31,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm21 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm4, %zmm21 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,8,14,20,26,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm12 = [2,8,14,20,26,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm12, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <3,9,15,21,27,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [3,9,15,21,27,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm14, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <20,26,0,6,12,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [20,26,0,6,12,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm5, %zmm23 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = <21,27,1,7,13,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm24 = [21,27,1,7,13,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm24, %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm6 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index 705fda41f71e1..e4f94b368b7fd 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -329,7 +329,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-SLOW-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm2 = <0,7,6,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm2 = [0,7,6,u] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm3 = 
ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vpermps %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vbroadcastss 84(%rdi), %xmm3 @@ -387,7 +387,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FAST-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = <0,7,6,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm2 = [0,7,6,u] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vpermps %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vbroadcastss 84(%rdi), %xmm3 @@ -446,7 +446,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm2 = <0,7,6,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm2 = [0,7,6,u] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm3 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm3, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 84(%rdi), %xmm3 @@ -803,7 +803,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,7,6,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,6,u] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] @@ -906,7 +906,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <0,7,6,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,6,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] @@ -1011,7 +1011,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm10[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <0,7,6,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [0,7,6,u] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm0[0,1,2,3,4,5],ymm1[6],ymm0[7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm3, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3,4,5,6,7] @@ -1113,25 +1113,25 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [0,3,10,17,0,3,10,17] ; AVX512-NEXT: # ymm4 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = <0,7,14,21,28,u,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm5 = [0,7,14,21,28,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4],ymm4[5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,4,11,18,0,4,11,18] ; AVX512-NEXT: # ymm5 = mem[0,1,0,1] ; 
AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = <1,8,15,22,29,u,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm6 = [1,8,15,22,29,u,u,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0,1,2,3,4],ymm5[5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,5,12,19,0,5,12,19] ; AVX512-NEXT: # ymm6 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm6 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm7 ; AVX512-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4],ymm6[5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [0,6,13,20,0,6,13,20] ; AVX512-NEXT: # ymm7 = mem[0,1,0,1] ; AVX512-NEXT: vpermi2d %zmm3, %zmm2, %zmm7 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = <19,26,1,8,15,u,u,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm8 = [19,26,1,8,15,u,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm8 ; AVX512-NEXT: vpblendd {{.*#+}} ymm7 = ymm8[0,1,2,3,4],ymm7[5,6,7] ; AVX512-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,7,14,21,0,7,14,21] @@ -1771,7 +1771,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,6,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,6,u] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7] ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm2, %ymm3 @@ -1997,7 +1997,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <0,7,6,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,7,6,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm8 = ymm5[0,1,2,3,4,5],ymm6[6],ymm5[7] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm8 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm8[0,1,2],ymm2[3,4,5,6,7] @@ -2223,7 +2223,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm12[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,6,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,6,u] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm9[0,1,2,3,4,5],ymm4[6],ymm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm2, %ymm3 @@ -2449,7 +2449,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512F-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512F-NEXT: kmovw %edi, %k1 @@ -2479,7 +2479,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512F-NEXT: vpermt2d 
%zmm7, %zmm6, %zmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,8,15,22,29,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512F-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512F-NEXT: kmovw %edi, %k2 @@ -2496,7 +2496,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -2506,7 +2506,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <19,26,1,8,15,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] @@ -2566,7 +2566,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [8,15,0,0,0,19,26,1,8,15,0,0,0,19,26,1] ; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm6 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <0,7,14,21,28,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,7,14,21,28,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm3 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 ; AVX512BW-NEXT: kmovd %edi, %k1 @@ -2596,7 +2596,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm13 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,8,15,22,29,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,8,15,22,29,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm0, %zmm14 ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 ; AVX512BW-NEXT: kmovd %edi, %k2 @@ -2613,7 +2613,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [26,5,12,19,26,5,12,19,26,5,12,19,26,5,12,19] ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2d %zmm7, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <18,25,0,7,14,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [18,25,0,7,14,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm7 {%k2} ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,1,8,15,22,29,0,0,0,1,8,15,22,29,0,0] @@ -2623,7 +2623,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <19,26,1,8,15,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [19,26,1,8,15,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2} ; AVX512BW-NEXT: 
vbroadcasti64x4 {{.*#+}} zmm10 = [0,18,25,0,7,14,0,0,0,18,25,0,7,14,0,0] @@ -3982,7 +3982,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4501,7 +4501,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm2[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm9[6],ymm3[7] ; AVX2-FAST-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm6 @@ -5003,7 +5003,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 80(%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5],ymm6[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5526,7 +5526,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,7,14,21,28,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512F-NEXT: movw $992, %di # imm = 0x3E0 @@ -5552,7 +5552,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,8,15,22,29,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512F-NEXT: movw $480, %di # imm = 0x1E0 @@ -5579,7 +5579,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = <18,25,0,7,14,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -5602,7 +5602,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: # zmm24 = 
mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = <19,26,1,8,15,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512F-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -5718,7 +5718,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm16 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm14, %zmm16, %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,7,14,21,28,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,7,14,21,28,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm7, %zmm8 ; AVX512BW-NEXT: movw $992, %di # imm = 0x3E0 @@ -5744,7 +5744,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm19 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm18, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = <1,8,15,22,29,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm16 = [1,8,15,22,29,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm11, %zmm16, %zmm17 ; AVX512BW-NEXT: movw $480, %di # imm = 0x1E0 @@ -5771,7 +5771,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm21 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <18,25,0,7,14,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [18,25,0,7,14,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm19, %zmm20 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm20 {%k2} @@ -5794,7 +5794,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm24, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <19,26,1,8,15,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [19,26,1,8,15,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm22, %zmm23 ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm23 {%k2} @@ -8578,7 +8578,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9629,7 +9629,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqa 320(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FAST-NEXT: vmovdqa 224(%rdi), %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm10[0,1,2,3,4,5],ymm2[6],ymm10[7] ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: 
vmovdqa %ymm2, %ymm11 @@ -10669,7 +10669,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 256(%rdi), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 224(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <0,7,6,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [0,7,6,u] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5],ymm2[6],ymm3[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -11947,16 +11947,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <1,8,15,22,29,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <18,25,0,7,14,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <19,26,1,8,15,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm29 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] @@ -12399,16 +12399,16 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 512(%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm17 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,7,14,21,28,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,7,14,21,28,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm22 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,8,15,22,29,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,8,15,22,29,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm23 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm23 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <18,25,0,7,14,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [18,25,0,7,14,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm24 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm4, %zmm24 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <19,26,1,8,15,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [19,26,1,8,15,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm29 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm29 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,11,18,25] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll index 5ec2453c3564f..f08bee7d93f23 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-3.ll @@ -301,15 +301,15 @@ define void @load_i64_stride3_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm2 -; 
AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,3,6,9,12,15,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,3,6,9,12,15,u,u]
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,10,13]
; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <1,4,7,10,13,u,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,4,7,10,13,u,u,u]
; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,8,11,14]
; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,13,0,3,6,u,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,13,0,3,6,u,u,u]
; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,9,12,15]
; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm0
@@ -571,21 +571,21 @@ define void @load_i64_stride3_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm4
; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm5
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,3,6,9,12,15,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,3,6,9,12,15,u,u]
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm7
; AVX512-NEXT: vpermt2q %zmm1, %zmm6, %zmm7
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,10,13]
; AVX512-NEXT: vpermt2q %zmm0, %zmm8, %zmm7
; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm6
; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm6
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = <1,4,7,10,13,u,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,4,7,10,13,u,u,u]
; AVX512-NEXT: vmovdqa64 %zmm5, %zmm9
; AVX512-NEXT: vpermt2q %zmm1, %zmm8, %zmm9
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,8,11,14]
; AVX512-NEXT: vpermt2q %zmm0, %zmm10, %zmm9
; AVX512-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
; AVX512-NEXT: vpermt2q %zmm4, %zmm10, %zmm8
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = <10,13,0,3,6,u,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm10 = [10,13,0,3,6,u,u,u]
; AVX512-NEXT: vpermt2q %zmm5, %zmm10, %zmm1
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,9,12,15]
; AVX512-NEXT: vpermt2q %zmm0, %zmm5, %zmm1
@@ -1154,7 +1154,7 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm3
; AVX512-NEXT: vmovdqa64 128(%rdi), %zmm10
; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm11
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,3,6,9,12,15,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,3,6,9,12,15,u,u]
; AVX512-NEXT: vmovdqa64 %zmm11, %zmm13
; AVX512-NEXT: vpermt2q %zmm1, %zmm12, %zmm13
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,4,5,10,13]
@@ -1167,7 +1167,7 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpermt2q %zmm4, %zmm14, %zmm16
; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm12
; AVX512-NEXT: vpermt2q %zmm10, %zmm14, %zmm12
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = <1,4,7,10,13,u,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [1,4,7,10,13,u,u,u]
; AVX512-NEXT: vmovdqa64 %zmm7, %zmm17
; AVX512-NEXT: vpermt2q %zmm2, %zmm14, %zmm17
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,4,8,11,14]
@@ -1180,7 +1180,7 @@ define void @load_i64_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpermt2q %zmm4, %zmm18, %zmm20
; AVX512-NEXT: vpermi2q %zmm3, %zmm9, %zmm14
; AVX512-NEXT: vpermt2q %zmm10, %zmm18, %zmm14
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <10,13,0,3,6,u,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [10,13,0,3,6,u,u,u]
; AVX512-NEXT: vpermt2q %zmm11, %zmm18, %zmm1
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,4,9,12,15]
; AVX512-NEXT: vpermt2q %zmm8, %zmm11, %zmm1
@@ -2429,7 +2429,7 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vmovdqa64 256(%rdi), %zmm7
; AVX512-NEXT: vmovdqa64 (%rdi), %zmm25
; AVX512-NEXT: vmovdqa64 192(%rdi), %zmm30
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,3,6,9,12,15,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,3,6,9,12,15,u,u]
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm6
; AVX512-NEXT: vpermt2q %zmm7, %zmm11, %zmm6
; AVX512-NEXT: vmovdqa64 %zmm28, %zmm15
@@ -2444,10 +2444,10 @@ define void @load_i64_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512-NEXT: vpermt2q %zmm13, %zmm11, %zmm20
; AVX512-NEXT: vmovdqa64 %zmm22, %zmm23
; AVX512-NEXT: vpermt2q %zmm14, %zmm11, %zmm23
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = <1,4,7,10,13,u,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,4,7,10,13,u,u,u]
; AVX512-NEXT: vmovdqa64 %zmm30, %zmm24
; AVX512-NEXT: vpermt2q %zmm7, %zmm21, %zmm24
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = <10,13,0,3,6,u,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm31 = [10,13,0,3,6,u,u,u]
; AVX512-NEXT: vpermt2q %zmm30, %zmm31, %zmm7
; AVX512-NEXT: vmovdqa64 %zmm29, %zmm30
; AVX512-NEXT: vpermt2q %zmm9, %zmm21, %zmm30
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
index ccc15a4119ba4..9b772bc109ef9 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-5.ll
@@ -226,20 +226,20 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15]
; AVX512-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,6,11,u>
+; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u]
; AVX512-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-SLOW-NEXT: vmovdqa 128(%rdi), %xmm4
; AVX512-SLOW-NEXT: vpbroadcastq %xmm4, %ymm5
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm5[6,7]
; AVX512-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
-; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <2,7,12,u>
+; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [2,7,12,u]
; AVX512-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <11,0,5,u>
+; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [11,0,5,u]
; AVX512-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm5
; AVX512-SLOW-NEXT: vpbroadcastq 144(%rdi), %ymm6
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm6[6,7]
-; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <12,1,6,u>
+; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [12,1,6,u]
; AVX512-SLOW-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm6[0,1,2,3,4,5],mem[6,7]
; AVX512-SLOW-NEXT: vmovdqa %ymm2, (%rsi)
@@ -256,20 +256,20 @@ define void @load_i64_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,10,15]
; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,6,11,u>
+; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,6,11,u]
; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,1,2,4]
; AVX512-FAST-NEXT: vmovdqa 128(%rdi), %ymm5
; AVX512-FAST-NEXT: vpermi2q %ymm5, %ymm3, %ymm4
; AVX512-FAST-NEXT: vinserti128 $1, 128(%rdi), %ymm0, %ymm3
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u>
+; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u]
; AVX512-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0,1,2,3,4,5],ymm3[6,7]
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <11,0,5,u>
+; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u]
; AVX512-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm6
; AVX512-FAST-NEXT: vpbroadcastq 144(%rdi), %ymm7
; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],ymm7[6,7]
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <12,1,6,u>
+; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [12,1,6,u]
; AVX512-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm7
; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3,4,5],ymm5[6,7]
; AVX512-FAST-NEXT: vmovdqa %ymm2, (%rsi)
@@ -544,21 +544,21 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
; AVX512F-NEXT: movb $7, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
; AVX512F-NEXT: movb $56, %al
; AVX512F-NEXT: kmovw %eax, %k1
; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm6, %zmm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <11,0,5,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u]
; AVX512F-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1}
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14]
@@ -602,21 +602,21 @@ define void @load_i64_stride5_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm4 = zmm4[0,1,2,3],zmm5[4,5,6,7]
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,4,5,6,11]
; AVX512BW-NEXT: vpermi2q %zmm0, %zmm4, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <1,6,11,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [1,6,11,u]
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm4
; AVX512BW-NEXT: movb $7, %al
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,5,6,12]
; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u]
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6
; AVX512BW-NEXT: movb $56, %al
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm6 {%k1}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,4,5,8,13]
; AVX512BW-NEXT: vpermi2q %zmm0, %zmm6, %zmm7
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <11,0,5,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [11,0,5,u]
; AVX512BW-NEXT: vpermi2q %zmm2, %zmm1, %zmm6
; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1}
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,4,5,9,14]
@@ -1200,7 +1200,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm12
; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm12
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <1,6,11,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u]
; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm16
; AVX512F-NEXT: vpermt2q %zmm1, %zmm15, %zmm16
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0]
@@ -1220,7 +1220,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16
; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm16
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,7,12,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u]
; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm18
; AVX512F-NEXT: vpermt2q %zmm1, %zmm17, %zmm18
; AVX512F-NEXT: movb $56, %al
@@ -1236,7 +1236,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm16
; AVX512F-NEXT: vpermt2q %zmm10, %zmm15, %zmm16
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = <11,0,5,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u]
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm20
; AVX512F-NEXT: vpermt2q %zmm9, %zmm19, %zmm20
; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1}
@@ -1297,7 +1297,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm12
; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm12 = zmm12[0,1,2,3],zmm14[4,5,6,7]
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm12
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = <1,6,11,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm15 = [1,6,11,u]
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm16
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm15, %zmm16
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm14 = [5,10,15,0,5,10,15,0]
@@ -1317,7 +1317,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16
; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = <2,7,12,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm17 = [2,7,12,u]
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm18
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm17, %zmm18
; AVX512BW-NEXT: movb $56, %al
@@ -1333,7 +1333,7 @@ define void @load_i64_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm16
; AVX512BW-NEXT: vpermt2q %zmm10, %zmm15, %zmm16
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = <11,0,5,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm19 = [11,0,5,u]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm20
; AVX512BW-NEXT: vpermt2q %zmm9, %zmm19, %zmm20
; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 {%k1}
@@ -2587,7 +2587,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: vpermt2q %zmm21, %zmm11, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u]
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0]
; AVX512F-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm30
@@ -2605,7 +2605,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512F-NEXT: vpermt2q %zmm3, %zmm23, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <2,7,12,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u]
; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512F-NEXT: vpermt2q %zmm2, %zmm23, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -2635,7 +2635,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vpermt2q %zmm17, %zmm13, %zmm21
; AVX512F-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,0,5,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u]
; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm25
; AVX512F-NEXT: vpermt2q %zmm19, %zmm31, %zmm25
; AVX512F-NEXT: vpermt2q %zmm19, %zmm11, %zmm17
@@ -2788,7 +2788,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm21, %zmm11, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = <1,6,11,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm28 = [1,6,11,u]
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm15 = [5,10,15,0,5,10,15,0]
; AVX512BW-NEXT: # zmm15 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm30
@@ -2806,7 +2806,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm23, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <2,7,12,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [2,7,12,u]
; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm23, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -2836,7 +2836,7 @@ define void @load_i64_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vpermt2q %zmm17, %zmm13, %zmm21
; AVX512BW-NEXT: vpermt2q %zmm18, %zmm0, %zmm2
; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,0,5,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,0,5,u]
; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm25
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm31, %zmm25
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm11, %zmm17
@@ -5630,15 +5630,15 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1
; AVX512F-NEXT: vpermt2q %zmm31, %zmm3, %zmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm29 = <1,6,11,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm29 = [1,6,11,u]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: vpermt2q %zmm31, %zmm29, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: vpermt2q %zmm31, %zmm6, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <11,0,5,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [11,0,5,u]
; AVX512F-NEXT: vmovdqa64 %zmm31, %zmm0
; AVX512F-NEXT: vpermt2q %zmm2, %zmm7, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -6098,15 +6098,15 @@ define void @load_i64_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,5,10,15]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm31, %zmm3, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = <1,6,11,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm29 = [1,6,11,u]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm31, %zmm29, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <2,7,12,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [2,7,12,u]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm31, %zmm6, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <11,0,5,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [11,0,5,u]
; AVX512BW-NEXT: vmovdqa64 %zmm31, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm7, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
index 288b7219260c9..fb0f09c059d33 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-6.ll
@@ -220,21 +220,21 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-SLOW-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm3
-; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,6,12,u>
+; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u]
; AVX512-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm0
; AVX512-SLOW-NEXT: vpbroadcastq 144(%rdi), %ymm1
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <1,7,13,u>
+; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u]
; AVX512-SLOW-NEXT: vpermi2q %zmm3, %zmm2, %zmm1
; AVX512-SLOW-NEXT: vmovdqa 128(%rdi), %ymm4
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <10,0,6,u>
+; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u]
; AVX512-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm5
; AVX512-SLOW-NEXT: vmovdqa 160(%rdi), %xmm6
; AVX512-SLOW-NEXT: vpbroadcastq %xmm6, %ymm7
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0,1,2,3,4,5],ymm7[6,7]
; AVX512-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
-; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <11,1,7,u>
+; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [11,1,7,u]
; AVX512-SLOW-NEXT: vpermi2q %zmm2, %zmm3, %zmm7
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0,1,2,3,4,5],ymm6[6,7]
; AVX512-SLOW-NEXT: vmovdqa 160(%rdi), %ymm7
@@ -262,21 +262,21 @@ define void @load_i64_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX512-FAST-NEXT: vmovdqa64 (%rdi), %zmm2
; AVX512-FAST-NEXT: vmovdqa64 64(%rdi), %zmm3
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,6,12,u>
+; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,6,12,u]
; AVX512-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm0
; AVX512-FAST-NEXT: vpbroadcastq 144(%rdi), %ymm1
; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <1,7,13,u>
+; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [1,7,13,u]
; AVX512-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm1
; AVX512-FAST-NEXT: vmovdqa 128(%rdi), %ymm4
; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm4[6,7]
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <10,0,6,u>
+; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [10,0,6,u]
; AVX512-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm5
; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,4]
; AVX512-FAST-NEXT: vmovdqa 160(%rdi), %ymm7
; AVX512-FAST-NEXT: vpermi2q %ymm7, %ymm5, %ymm6
; AVX512-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm5
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <11,1,7,u>
+; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [11,1,7,u]
; AVX512-FAST-NEXT: vpermi2q %zmm2, %zmm3, %zmm8
; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm8[0,1,2,3,4,5],ymm5[6,7]
; AVX512-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [0,6,0,6]
@@ -607,7 +607,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10]
; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm5
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512F-NEXT: movb $56, %dil
; AVX512F-NEXT: kmovw %edi, %k1
@@ -634,7 +634,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11]
; AVX512F-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermi2q %zmm3, %zmm4, %zmm7
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <1,7,13,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u]
; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm13
; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1}
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11]
@@ -644,7 +644,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4]
; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <10,0,6,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm8
; AVX512F-NEXT: movb $24, %dil
; AVX512F-NEXT: kmovw %edi, %k2
@@ -655,7 +655,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5]
; AVX512F-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vpermi2q %zmm4, %zmm3, %zmm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <11,1,7,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u]
; AVX512F-NEXT: vpermi2q %zmm0, %zmm1, %zmm9
; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2}
; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1}
@@ -692,7 +692,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,6,0,10,0,6,0,10]
; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm5
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,6,12,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,6,12,u]
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
; AVX512BW-NEXT: movb $56, %dil
; AVX512BW-NEXT: kmovd %edi, %k1
@@ -719,7 +719,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm7 = [1,7,0,11,1,7,0,11]
; AVX512BW-NEXT: # zmm7 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2q %zmm3, %zmm4, %zmm7
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = <1,7,13,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm13 = [1,7,13,u]
; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm13
; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13 {%k1}
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm7 = [5,11,5,11,5,11,5,11]
@@ -729,7 +729,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [10,4,10,4,10,4,10,4]
; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <10,0,6,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,0,6,u]
; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm8
; AVX512BW-NEXT: movb $24, %dil
; AVX512BW-NEXT: kmovd %edi, %k2
@@ -740,7 +740,7 @@ define void @load_i64_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm6 = [11,5,11,5,11,5,11,5]
; AVX512BW-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vpermi2q %zmm4, %zmm3, %zmm6
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <11,1,7,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [11,1,7,u]
; AVX512BW-NEXT: vpermi2q %zmm0, %zmm1, %zmm9
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm9 {%k2}
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k1}
@@ -1459,7 +1459,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15
; AVX512F-NEXT: vpermt2q %zmm10, %zmm14, %zmm15
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = <0,6,12,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u]
; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm7
; AVX512F-NEXT: vpermt2q %zmm11, %zmm6, %zmm7
; AVX512F-NEXT: movb $56, %dil
@@ -1482,7 +1482,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm17
; AVX512F-NEXT: vpermt2q %zmm10, %zmm18, %zmm17
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u]
; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm15
; AVX512F-NEXT: vpermt2q %zmm11, %zmm14, %zmm15
; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1}
@@ -1501,7 +1501,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21
; AVX512F-NEXT: vpermt2q %zmm13, %zmm20, %zmm21
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = <10,0,6,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u]
; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512F-NEXT: vpermt2q %zmm9, %zmm18, %zmm19
; AVX512F-NEXT: movb $24, %dil
@@ -1523,7 +1523,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm23
; AVX512F-NEXT: vpermt2q %zmm13, %zmm22, %zmm23
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm20 = <11,1,7,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u]
; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512F-NEXT: vpermt2q %zmm9, %zmm20, %zmm21
; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2}
@@ -1602,7 +1602,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: # zmm14 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15
; AVX512BW-NEXT: vpermt2q %zmm10, %zmm14, %zmm15
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,6,12,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,6,12,u]
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm7
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm6, %zmm7
; AVX512BW-NEXT: movb $56, %dil
@@ -1625,7 +1625,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm17
; AVX512BW-NEXT: vpermt2q %zmm10, %zmm18, %zmm17
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = <1,7,13,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm14 = [1,7,13,u]
; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm15
; AVX512BW-NEXT: vpermt2q %zmm11, %zmm14, %zmm15
; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm15 {%k1}
@@ -1644,7 +1644,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21
; AVX512BW-NEXT: vpermt2q %zmm13, %zmm20, %zmm21
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = <10,0,6,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [10,0,6,u]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm19
; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19
; AVX512BW-NEXT: movb $24, %dil
@@ -1666,7 +1666,7 @@ define void @load_i64_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm23
; AVX512BW-NEXT: vpermt2q %zmm13, %zmm22, %zmm23
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = <11,1,7,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm20 = [11,1,7,u]
; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm21
; AVX512BW-NEXT: vpermt2q %zmm9, %zmm20, %zmm21
; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm21 {%k2}
@@ -3245,7 +3245,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512F-NEXT: vpermt2q %zmm5, %zmm6, %zmm7
; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u]
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8
; AVX512F-NEXT: vpermt2q %zmm4, %zmm6, %zmm8
; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -3264,7 +3264,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm8
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u]
; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512F-NEXT: vpermt2q %zmm4, %zmm8, %zmm1
; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -3325,11 +3325,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vpermt2q %zmm19, %zmm1, %zmm6
; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <10,0,6,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u]
; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512F-NEXT: vpermt2q %zmm3, %zmm11, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,1,7,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u]
; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512F-NEXT: vpermt2q %zmm3, %zmm31, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -3597,7 +3597,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm7
; AVX512BW-NEXT: vpermt2q %zmm5, %zmm6, %zmm7
; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,6,12,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,6,12,u]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8
; AVX512BW-NEXT: vpermt2q %zmm4, %zmm6, %zmm8
; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -3616,7 +3616,7 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm8
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <1,7,13,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [1,7,13,u]
; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1
; AVX512BW-NEXT: vpermt2q %zmm4, %zmm8, %zmm1
; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -3677,11 +3677,11 @@ define void @load_i64_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vpermt2q %zmm19, %zmm1, %zmm6
; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <10,0,6,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [10,0,6,u]
; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm11, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = <11,1,7,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} ymm31 = [11,1,7,u]
; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm3, %zmm31, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -7279,19 +7279,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512F-NEXT: vmovdqa64 448(%rdi), %zmm1
; AVX512F-NEXT: vmovdqa64 384(%rdi), %zmm2
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: vpermt2q %zmm1, %zmm8, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = <1,7,13,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u]
; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512F-NEXT: vpermt2q %zmm1, %zmm9, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <10,0,6,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u]
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <11,1,7,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u]
; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512F-NEXT: vpermt2q %zmm2, %zmm11, %zmm0
; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -8035,19 +8035,19 @@ define void @load_i64_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt
; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
; AVX512BW-NEXT: vmovdqa64 448(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 384(%rdi), %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,6,12,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,6,12,u]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm8, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = <1,7,13,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm9 = [1,7,13,u]
; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm1, %zmm9, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = <10,0,6,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm10 = [10,0,6,u]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = <11,1,7,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm11 = [11,1,7,u]
; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
; AVX512BW-NEXT: vpermt2q %zmm2, %zmm11, %zmm0
; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
index db8bca5bc16bf..37f4d0d57076c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i64-stride-7.ll
@@ -302,10 +302,10 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512-SLOW-NEXT: vmovdqa64 (%rdi), %zmm5
; AVX512-SLOW-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
-; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,7,14,u>
+; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u]
; AVX512-SLOW-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <9,0,7,u>
+; AVX512-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u]
; AVX512-SLOW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
; AVX512-SLOW-NEXT: vpbroadcastq 176(%rdi), %ymm2
; AVX512-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
@@ -356,10 +356,10 @@ define void @load_i64_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512-FAST-NEXT: vmovdqa64 64(%rdi), %zmm4
; AVX512-FAST-NEXT: vmovdqa64 (%rdi), %zmm5
; AVX512-FAST-NEXT: vinserti128 $1, 160(%rdi), %ymm0, %ymm0
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,7,14,u>
+; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,7,14,u]
; AVX512-FAST-NEXT: vpermi2q %zmm4, %zmm5, %zmm1
; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3,4,5],ymm0[6,7]
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <9,0,7,u>
+; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [9,0,7,u]
; AVX512-FAST-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
; AVX512-FAST-NEXT: vpbroadcastq 176(%rdi), %ymm2
; AVX512-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
index bac0fbe9df6f6..d84733b12ff7d 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-3.ll
@@ -1265,9 +1265,9 @@ define void @load_i8_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm10, %xmm8
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm9
; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm12
; AVX1-ONLY-NEXT: vpor %xmm6, %xmm12, %xmm6
; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
index e2311246e711f..40894a14a681c 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll
@@ -2007,9 +2007,9 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
; AVX512BW-NEXT: vmovdqa64 128(%rdi), %zmm2
; AVX512BW-NEXT: vmovdqa64 192(%rdi), %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <128,128,128,128,0,4,8,12,u,u,u,u,u,u,u,u,128,128,128,128,16,20,24,28,u,u,u,u,u,u,u,u,128,128,128,128,32,36,40,44,u,u,u,u,u,u,u,u,128,128,128,128,48,52,56,60,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [128,128,128,128,0,4,8,12,u,u,u,u,u,u,u,u,128,128,128,128,16,20,24,28,u,u,u,u,u,u,u,u,128,128,128,128,32,36,40,44,u,u,u,u,u,u,u,u,128,128,128,128,48,52,56,60,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb %zmm4, %zmm3, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,4,8,12,128,128,128,128,u,u,u,u,u,u,u,u,16,20,24,28,128,128,128,128,u,u,u,u,u,u,u,u,32,36,40,44,128,128,128,128,u,u,u,u,u,u,u,u,48,52,56,60,128,128,128,128,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,128,128,128,128,u,u,u,u,u,u,u,u,16,20,24,28,128,128,128,128,u,u,u,u,u,u,u,u,32,36,40,44,128,128,128,128,u,u,u,u,u,u,u,u,48,52,56,60,128,128,128,128,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb %zmm6, %zmm2, %zmm7
; AVX512BW-NEXT: vporq %zmm5, %zmm7, %zmm5
; AVX512BW-NEXT: vpshufb %zmm4, %zmm1, %zmm4
@@ -2017,27 +2017,27 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-NEXT: vporq %zmm4, %zmm6, %zmm4
; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,1,5,9,13,16,20,24,28,17,21,25,29]
; AVX512BW-NEXT: vpermt2d %zmm5, %zmm6, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <128,128,128,128,1,5,9,13,u,u,u,u,u,u,u,u,128,128,128,128,17,21,25,29,u,u,u,u,u,u,u,u,128,128,128,128,33,37,41,45,u,u,u,u,u,u,u,u,128,128,128,128,49,53,57,61,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [128,128,128,128,1,5,9,13,u,u,u,u,u,u,u,u,128,128,128,128,17,21,25,29,u,u,u,u,u,u,u,u,128,128,128,128,33,37,41,45,u,u,u,u,u,u,u,u,128,128,128,128,49,53,57,61,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb %zmm5, %zmm3, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <1,5,9,13,128,128,128,128,u,u,u,u,u,u,u,u,17,21,25,29,128,128,128,128,u,u,u,u,u,u,u,u,33,37,41,45,128,128,128,128,u,u,u,u,u,u,u,u,49,53,57,61,128,128,128,128,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [1,5,9,13,128,128,128,128,u,u,u,u,u,u,u,u,17,21,25,29,128,128,128,128,u,u,u,u,u,u,u,u,33,37,41,45,128,128,128,128,u,u,u,u,u,u,u,u,49,53,57,61,128,128,128,128,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb %zmm8, %zmm2, %zmm9
; AVX512BW-NEXT: vporq %zmm7, %zmm9, %zmm7
; AVX512BW-NEXT: vpshufb %zmm5, %zmm1, %zmm5
; AVX512BW-NEXT: vpshufb %zmm8, %zmm0, %zmm8
; AVX512BW-NEXT: vporq %zmm5, %zmm8, %zmm5
; AVX512BW-NEXT: vpermt2d %zmm7, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <128,128,128,128,2,6,10,14,u,u,u,u,u,u,u,u,128,128,128,128,18,22,26,30,u,u,u,u,u,u,u,u,128,128,128,128,34,38,42,46,u,u,u,u,u,u,u,u,128,128,128,128,50,54,58,62,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [128,128,128,128,2,6,10,14,u,u,u,u,u,u,u,u,128,128,128,128,18,22,26,30,u,u,u,u,u,u,u,u,128,128,128,128,34,38,42,46,u,u,u,u,u,u,u,u,128,128,128,128,50,54,58,62,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb %zmm7, %zmm3, %zmm8
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,6,10,14,128,128,128,128,u,u,u,u,u,u,u,u,18,22,26,30,128,128,128,128,u,u,u,u,u,u,u,u,34,38,42,46,128,128,128,128,u,u,u,u,u,u,u,u,50,54,58,62,128,128,128,128,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,6,10,14,128,128,128,128,u,u,u,u,u,u,u,u,18,22,26,30,128,128,128,128,u,u,u,u,u,u,u,u,34,38,42,46,128,128,128,128,u,u,u,u,u,u,u,u,50,54,58,62,128,128,128,128,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb %zmm9, %zmm2, %zmm10
; AVX512BW-NEXT: vporq %zmm8, %zmm10, %zmm8
; AVX512BW-NEXT: vpshufb %zmm7, %zmm1, %zmm7
; AVX512BW-NEXT: vpshufb %zmm9, %zmm0, %zmm9
; AVX512BW-NEXT: vporq %zmm7, %zmm9, %zmm7
; AVX512BW-NEXT: vpermt2d %zmm8, %zmm6, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <128,128,128,128,3,7,11,15,u,u,u,u,u,u,u,u,128,128,128,128,19,23,27,31,u,u,u,u,u,u,u,u,128,128,128,128,35,39,43,47,u,u,u,u,u,u,u,u,128,128,128,128,51,55,59,63,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [128,128,128,128,3,7,11,15,u,u,u,u,u,u,u,u,128,128,128,128,19,23,27,31,u,u,u,u,u,u,u,u,128,128,128,128,35,39,43,47,u,u,u,u,u,u,u,u,128,128,128,128,51,55,59,63,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb %zmm8, %zmm3, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <3,7,11,15,128,128,128,128,u,u,u,u,u,u,u,u,19,23,27,31,128,128,128,128,u,u,u,u,u,u,u,u,35,39,43,47,128,128,128,128,u,u,u,u,u,u,u,u,51,55,59,63,128,128,128,128,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [3,7,11,15,128,128,128,128,u,u,u,u,u,u,u,u,19,23,27,31,128,128,128,128,u,u,u,u,u,u,u,u,35,39,43,47,128,128,128,128,u,u,u,u,u,u,u,u,51,55,59,63,128,128,128,128,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpshufb %zmm9, %zmm2, %zmm2
; AVX512BW-NEXT: vporq %zmm3, %zmm2, %zmm2
; AVX512BW-NEXT: vpshufb %zmm8, %zmm1, %zmm1
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
index 10ccd40e48655..130207fd9c2eb 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll
@@ -707,7 +707,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm4, %xmm5, %xmm4
; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm5
@@ -764,7 +764,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY: # %bb.0:
; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm3[4,9,14],zero,zero,zero,xmm3[2,7,12,u,u,u]
@@ -775,7 +775,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %xmm2
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[1,6,11]
; AVX2-ONLY-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm5
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[1,6,11],zero,zero,zero,zero,xmm5[4,9,14],zero,zero,zero,xmm5[u,u,u]
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm5
@@ -784,7 +784,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm5
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[2,7,12]
; AVX2-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm6, %xmm7
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[4,9,14,u,u,u]
@@ -793,7 +793,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm6
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[3,8,13]
; AVX2-ONLY-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm7
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm7[3,8,13],zero,zero,zero,xmm7[1,6,11],zero,zero,zero,zero,xmm7[u,u,u]
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm7, %xmm7
@@ -802,7 +802,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm4
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[4,9,14]
; AVX2-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u]
; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[3,8,13],zero,zero,zero,xmm1[1,6,11,u,u,u,u]
@@ -1551,7 +1551,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5
; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255]
; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm11
@@ -1574,7 +1574,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,zero,xmm2[4,9,14,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,0,5,10,15],zero,zero,zero,xmm3[u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13
@@ -1617,7 +1617,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u],zero,zero,zero,xmm2[1,6,11,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm3[u,u,u,2,7,12],zero,zero,zero,xmm3[u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm13
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm4[3,8,13],zero,zero,zero,zero,zero,zero,xmm4[u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13
@@ -1671,20 +1671,20 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm4
; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0
; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm5
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm6
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,zero,xmm6[4,9,14],zero,zero,zero,xmm6[2,7,12,u,u,u]
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13],zero,zero,zero,xmm5[u,u,u]
; AVX2-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm6
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm6[2,3,0,1]
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
; AVX2-ONLY-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,1,6,11,16,21,26,31,20,25,30,19,24,29,u,u,u,u,u,u]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u]
; AVX2-ONLY-NEXT: vmovdqa %xmm8, %xmm7
; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm6
; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm5
@@ -1692,7 +1692,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm5
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[0,5,10,15],zero,zero,zero,xmm5[3,8,13,u,u,u]
; AVX2-ONLY-NEXT: vpor %xmm5, %xmm9, %xmm5
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm10
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm12 = [0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0]
@@ -1705,7 +1705,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm10[1,6,11],zero,zero,zero,zero,xmm10[4,9,14,u,u,u]
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,7,12],zero,zero,zero,xmm9[0,5,10,15],zero,zero,zero,xmm9[u,u,u]
; AVX2-ONLY-NEXT: vpor %xmm10, %xmm9, %xmm9
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm11
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,255]
@@ -1713,7 +1713,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,3,8,13,18,23,28,17,22,27,16,21,26,31,u,u,u,u,u,u]
; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm9, %ymm11, %ymm7
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u]
; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm11
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm13 = [0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,0]
@@ -1849,7 +1849,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-NEXT: vpternlogq $202, %ymm0, %ymm1, %ymm11
; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm11[2,3,0,1]
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm11, %ymm12
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u]
; AVX512F-NEXT: vpshufb %ymm11, %ymm12, %ymm12
; AVX512F-NEXT: vmovdqa %ymm4, %ymm13
; AVX512F-NEXT: vpternlogq $202, %ymm3, %ymm5, %ymm13
@@ -3169,7 +3169,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12
; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u]
; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1
; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1
@@ -3238,7 +3238,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm12, %xmm3
; AVX1-ONLY-NEXT: vpxor %xmm4, %xmm4, %xmm4
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm4[0,1],xmm3[2,3,4,5,6,7]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,5,10,15,128,128,128,128,128,128,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm14
; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm15
; AVX1-ONLY-NEXT: vpor %xmm15, %xmm12, %xmm12
@@ -3319,7 +3319,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm8
; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,4,5,6,7,8,9,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm10[1,6,11],zero,zero,zero,zero,zero,zero,zero,xmm10[u,u,u,u,u,u]
@@ -3340,7 +3340,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm7, %xmm5
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm3
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [2,7,12,128,128,128,128,128,128,128,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm14
; AVX1-ONLY-NEXT: vpor %xmm3, %xmm14, %xmm3
; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3,4],xmm0[5,6,7]
@@ -3413,7 +3413,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,7,12],zero,zero,zero,xmm1[u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm0[3,4,5,6,7,8,u,u,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [3,8,13,128,128,128,128,128,128,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1
; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0
@@ -3446,7 +3446,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm11[u,u,u,2,7,12],zero,zero,zero,xmm11[u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpor %xmm5, %xmm7, %xmm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,3,4,5,6,7,8,u,u,u,u,u,u,u]
; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm5
; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm2
@@ -3563,7 +3563,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm4
; AVX2-ONLY-NEXT: vmovdqa 224(%rdi), %ymm10
; AVX2-ONLY-NEXT: vmovdqa 256(%rdi), %ymm9
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm10, %ymm9, %ymm0
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255,255,255,0,255,0,255,255,0,255,0,255,255,0,255,0,255]
@@ -3572,7 +3572,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm2, %ymm4, %ymm0
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm7
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
@@ -3583,7 +3583,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm4, %ymm2, %ymm0
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm11
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
@@ -3597,7 +3597,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
; AVX2-ONLY-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u]
; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm5
; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm0[2,3,0,1]
@@ -3611,13 +3611,13 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm13
; AVX2-ONLY-NEXT: vmovdqa 192(%rdi), %ymm14
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm13, %ymm14, %ymm0
; AVX2-ONLY-NEXT: vmovdqa %ymm1, %ymm6
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,4,9,14,128,128,128,2,7,12,u,u,u]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [0,5,10,15,128,128,128,3,8,13,128,128,128,u,u,u]
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm1
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11,0,5,10,15,4,9,14,3,8,13,0,0,0,1,6,11]
@@ -3636,10 +3636,10 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm7, %ymm15
; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm13, %ymm14, %ymm0
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [1,6,11,128,128,128,128,4,9,14,128,128,128,u,u,u]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm5
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,0,5,10,15,128,128,128,3,8,13,u,u,u]
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12,1,6,11,0,5,10,15,4,9,14,0,0,0,2,7,12]
@@ -3654,12 +3654,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm5, %xmm4
; AVX2-ONLY-NEXT: vpor %xmm6, %xmm4, %xmm4
; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm11
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm4
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,1,6,11,128,128,128,128,4,9,14,u,u,u]
; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [2,7,12,128,128,128,0,5,10,15,128,128,128,u,u,u]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpor %xmm4, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13,2,7,12,1,6,11,0,5,10,15,0,0,0,3,8,13]
@@ -3677,12 +3677,12 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm14, %ymm13, %ymm0
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [3,8,13,128,128,128,1,6,11,128,128,128,128,u,u,u]
; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,2,7,12,128,128,128,0,5,10,15,u,u,u]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vpor %xmm5, %xmm0, %xmm0
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14,3,8,13,2,7,12,1,6,11,0,0,0,0,4,9,14]
@@ -3699,7 +3699,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5
; AVX2-ONLY-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm4, %ymm0, %ymm6
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255,0,0,255,255,u,u,0,0,255,255]
; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm10, %ymm9, %ymm0
; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-ONLY-NEXT: vpblendvb %ymm4, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
@@ -3711,7 +3711,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0
; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm4 = [0,0,128,128,128,1,6,11,0,0,128,128,128,1,6,11]
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u,255,255,0,0,255,255,0,0,u,u]
; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm14, %ymm13, %ymm13
; AVX2-ONLY-NEXT: vmovdqa 304(%rdi), %xmm2
; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm9
@@ -3763,9 +3763,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-ONLY-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12
; AVX2-ONLY-NEXT: vpblendvb %ymm14, %ymm6, %ymm12, %ymm12
; AVX2-ONLY-NEXT: vextracti128 $1, %ymm13, %xmm15
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,3,8,13,128,128,128,1,6,11,u,u,u,u]
; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15
-; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u>
+; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [4,9,14,128,128,128,2,7,12,128,128,128,u,u,u,u]
; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm13, %xmm13
; AVX2-ONLY-NEXT: vpor %xmm15, %xmm13, %xmm13
; AVX2-ONLY-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15,4,9,14,3,8,13,2,7,12,0,0,0,0,5,10,15]
@@ -3972,7 +3972,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec,
ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpternlogq $202, %ymm21, %ymm22, %ymm0 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = <128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,4,9,14,19,24,29,18,23,28,17,22,27,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX512F-NEXT: vmovdqa %ymm4, %ymm3 ; AVX512F-NEXT: vpternlogq $202, %ymm23, %ymm24, %ymm3 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 16808dca4511d..899f38951342e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -802,7 +802,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm8, %xmm10, %xmm8 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,u,u,u,u,u,u,4,10],zero,zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm4[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm4[0,6,12] @@ -825,7 +825,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm10, %xmm11, %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero @@ -855,7 +855,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm5 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm5[0,6,12],zero,zero,zero,xmm5[4,10],zero,zero,zero,xmm5[u,u,u,u,u] ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm6 @@ -875,7 +875,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm5, %xmm6, %xmm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = 
<255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm4, %ymm3, %ymm6 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,zero,xmm7[4,10],zero,zero,zero,xmm7[2,8,14,u,u,u,u,u] @@ -892,7 +892,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm0[u,u,u,u,u,u,u,u,u,u,u],zero,zero,xmm0[1,7,13] ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm7 ; AVX2-ONLY-NEXT: vpblendvb %xmm8, %xmm6, %xmm7, %xmm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm4, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm3, %xmm4 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm4[0,6,12],zero,zero,zero,xmm4[4,10,u,u,u,u,u,u] @@ -1711,7 +1711,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,xmm7[4,10,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[2,8,14],zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[3,9,15,u,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1728,7 +1728,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] @@ -1954,13 +1954,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm2, %ymm3, %ymm9 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm9[0,6,12],zero,zero,zero,xmm9[4,10],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm10[2,8,14],zero,zero,xmm10[0,6,12,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm11 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = 
[u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm5 = ymm0[0,1],ymm1[0,1] ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm6 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm1 @@ -1972,13 +1972,13 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,9,15,5,11,17,23,29,19,25,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm9, %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm3, %ymm2, %ymm9 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm10 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,zero,xmm10[4,10],zero,zero,zero,xmm10[2,8,14,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm9[2,8,14],zero,zero,xmm9[0,6,12],zero,zero,zero,xmm9[u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm12 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm6, %ymm5, %ymm13 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm14 = ymm13[u,u,u,u,u,u,u,u,u,u,u,4,10,0,6,12,18,24,30,20,26,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm7, %ymm12, %ymm14, %ymm12 @@ -1996,7 +1996,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpblendvb %ymm10, %ymm12, %ymm7, %ymm7 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[u,u,u,u,u,1,7,13],zero,zero,zero,xmm8[5,11],zero,zero,zero ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u],zero,zero,zero,xmm9[3,9,15],zero,zero,xmm9[1,7,13] ; AVX2-ONLY-NEXT: vpor %xmm8, %xmm9, %xmm8 @@ -2010,7 +2010,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpor %xmm13, %xmm14, %xmm13 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm3, %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = <0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm6, %ymm5, %ymm5 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm3[0,6,12],zero,zero,zero,xmm3[4,10,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,10],zero,zero,zero,xmm2[2,8,14],zero,zero,xmm2[u,u,u,u,u,u] @@ -3614,7 +3614,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; 
AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [0,0,0,0,0,255,255,255,255,255,255,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm9, %xmm1 @@ -3668,7 +3668,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm1 @@ -4221,21 +4221,21 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 96(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0] ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[0,1],ymm1[0,1] ; AVX2-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vperm2i128 {{.*#+}} ymm0 = ymm0[2,3],ymm1[2,3] ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm2, %ymm0, %ymm4 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm4[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm3, %ymm5, %ymm2 ; AVX2-ONLY-NEXT: vmovdqa %ymm3, %ymm5 ; AVX2-ONLY-NEXT: vmovdqu %ymm3, (%rsp) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm9 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12 ; AVX2-ONLY-NEXT: vpor %xmm9, %xmm12, %xmm9 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] @@ -4256,9 +4256,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} ymm0 = ymm13[u,u,u,u,u,u,u,u,u,u,u,2,8,14,4,10,16,22,28,18,24,30,u,u,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm1 -; AVX2-ONLY-NEXT: 
vmovdqa {{.*#+}} xmm2 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11] @@ -4271,17 +4271,17 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm13, %ymm1 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm10, %ymm5, %ymm1 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm5 ; AVX2-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm5 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm14 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendvb %ymm0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3, %ymm3 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm3, %ymm15 @@ -4297,9 +4297,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm4, %ymm6, %ymm4 ; AVX2-ONLY-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm2 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] @@ -4313,10 +4313,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm14 ; AVX2-ONLY-NEXT: vmovdqa 160(%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-ONLY-NEXT: vpblendvb 
%ymm2, %ymm0, %ymm3, %ymm1 ; AVX2-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm5 ; AVX2-ONLY-NEXT: vpblendvb %ymm13, %ymm0, %ymm3, %ymm15 ; AVX2-ONLY-NEXT: vmovdqa 352(%rdi), %ymm4 @@ -4329,15 +4329,15 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm10, %ymm0, %ymm8 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm7, %ymm0, %ymm10 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u] ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendvb %ymm2, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm13 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm11, %ymm9, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm6 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm5, %xmm11 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm9 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm9, %xmm6 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 @@ -4351,9 +4351,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm11 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -4366,15 +4366,15 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm14, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm15, %xmm14 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm14, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] ; AVX2-ONLY-NEXT: vpshufb %xmm2, %xmm15, %xmm1 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = 
[128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm11 ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm11, %xmm6 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm11 = [4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] @@ -4398,14 +4398,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm4 = ymm4[0,1,2,3],ymm5[4,5,6,7] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm4, %ymm0, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] ; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm15, %xmm11 ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm11, %xmm4 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm8 ; AVX2-ONLY-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} ymm8 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] @@ -4427,9 +4427,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm3 ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm9, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] ; AVX2-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm9, %xmm8 ; AVX2-ONLY-NEXT: vmovdqa %ymm9, %ymm10 ; AVX2-ONLY-NEXT: vpor %xmm2, %xmm8, %xmm2 @@ -4447,9 +4447,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1,2],ymm1[3,4,5,6,7],ymm7[8,9,10],ymm1[11,12,13,14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm7[0,1,2,3],ymm1[4,5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] ; AVX2-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm10 ; AVX2-ONLY-NEXT: vpor %xmm0, %xmm10, %xmm0 ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -4488,14 +4488,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-LABEL: load_i8_stride6_vf64: ; AVX512F: # %bb.0: ; AVX512F-NEXT: subq $40, %rsp -; AVX512F-NEXT: vmovdqa 
{{.*#+}} xmm1 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm12 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512F-NEXT: vmovdqa64 224(%rdi), %ymm25 ; AVX512F-NEXT: vmovdqa64 192(%rdi), %ymm26 ; AVX512F-NEXT: vmovdqa %ymm12, %ymm0 ; AVX512F-NEXT: vpternlogq $202, %ymm25, %ymm26, %ymm0 ; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm4 ; AVX512F-NEXT: vpshufb %xmm5, %xmm4, %xmm6 ; AVX512F-NEXT: vpor %xmm3, %xmm6, %xmm9 @@ -4506,9 +4506,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vmovdqa %ymm12, %ymm6 ; AVX512F-NEXT: vpternlogq $202, %ymm24, %ymm18, %ymm6 ; AVX512F-NEXT: vextracti128 $1, %ymm6, %xmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] ; AVX512F-NEXT: vpshufb %xmm3, %xmm7, %xmm10 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] ; AVX512F-NEXT: vpshufb %xmm8, %xmm6, %xmm13 ; AVX512F-NEXT: vpor %xmm10, %xmm13, %xmm10 ; AVX512F-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 @@ -4529,15 +4529,15 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX512F-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm10, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm0, %xmm4, %xmm0 ; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] ; AVX512F-NEXT: vpshufb %xmm0, %xmm7, %xmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] ; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX512F-NEXT: vporq %xmm4, %xmm6, %xmm28 ; AVX512F-NEXT: vpshufb %xmm8, %xmm9, %xmm4 @@ -4546,22 +4546,22 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpshufb %xmm0, %xmm5, %xmm0 ; AVX512F-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX512F-NEXT: vporq %xmm0, %xmm1, %xmm27 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512F-NEXT: vmovdqa %ymm9, %ymm4 ; AVX512F-NEXT: vpternlogq $202, %ymm26, %ymm25, %ymm4 ; AVX512F-NEXT: vextracti128 $1, %ymm4, %xmm15 ; AVX512F-NEXT: vpshufb %xmm0, %xmm15, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} 
xmm6 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm5 ; AVX512F-NEXT: vpor %xmm1, %xmm5, %xmm1 ; AVX512F-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-NEXT: vmovdqa %ymm12, %ymm5 ; AVX512F-NEXT: vpternlogq $202, %ymm18, %ymm24, %ymm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512F-NEXT: vpshufb %xmm8, %xmm5, %xmm7 ; AVX512F-NEXT: vextracti128 $1, %ymm5, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] ; AVX512F-NEXT: vpshufb %xmm10, %xmm1, %xmm13 ; AVX512F-NEXT: vpor %xmm7, %xmm13, %xmm2 ; AVX512F-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4578,9 +4578,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpshufb %xmm10, %xmm7, %xmm10 ; AVX512F-NEXT: vpor %xmm8, %xmm10, %xmm0 ; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm10, %xmm15, %xmm15 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm8 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm4, %xmm15, %xmm0 ; AVX512F-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4631,16 +4631,16 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpternlogq $226, %ymm26, %ymm12, %ymm11 ; AVX512F-NEXT: vextracti128 $1, %ymm11, %xmm0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,xmm0[0,6,12],zero,zero,zero,xmm0[4,10,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm3, %xmm11, %xmm2 ; AVX512F-NEXT: vmovdqa64 %xmm3, %xmm25 ; AVX512F-NEXT: vporq %xmm1, %xmm2, %xmm26 ; AVX512F-NEXT: vmovdqa64 %ymm18, %ymm14 ; AVX512F-NEXT: vpternlogq $226, %ymm24, %ymm9, %ymm14 ; AVX512F-NEXT: vextracti128 $1, %ymm14, %xmm10 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] ; AVX512F-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] ; AVX512F-NEXT: vpshufb %xmm3, %xmm14, %xmm4 ; AVX512F-NEXT: vporq %xmm2, %xmm4, %xmm27 ; AVX512F-NEXT: vpternlogq $202, %ymm30, %ymm31, %ymm12 @@ -4651,15 +4651,15 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpshufb %xmm1, %xmm8, %xmm1 ; AVX512F-NEXT: vpshufb %xmm3, %xmm9, %xmm2 ; AVX512F-NEXT: vpor %xmm1, %xmm2, %xmm7 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa64 %xmm1, %xmm22 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm13 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm13 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm13, %xmm11, %xmm1 ; 
AVX512F-NEXT: vpor %xmm0, %xmm1, %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] ; AVX512F-NEXT: vpshufb %xmm2, %xmm10, %xmm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] ; AVX512F-NEXT: vpshufb %xmm0, %xmm14, %xmm10 ; AVX512F-NEXT: vpor %xmm1, %xmm10, %xmm10 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,4,10,0,6,12,18,24,30,20,26,128,128,128,128,128,128,128,128,128,128,128] @@ -4741,14 +4741,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i8_stride6_vf64: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,6,12,128,128,128,4,10,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa 224(%rdi), %ymm0 ; AVX512BW-NEXT: vmovdqa64 192(%rdi), %ymm23 ; AVX512BW-NEXT: movw $18724, %r10w # imm = 0x4924 ; AVX512BW-NEXT: kmovd %r10d, %k1 ; AVX512BW-NEXT: vpblendmw %ymm0, %ymm23, %ymm9 {%k1} ; AVX512BW-NEXT: vpshufb %xmm2, %xmm9, %xmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,2,8,14,128,128,0,6,12,u,u,u,u,u] ; AVX512BW-NEXT: vextracti128 $1, %ymm9, %xmm12 ; AVX512BW-NEXT: vpshufb %xmm4, %xmm12, %xmm3 ; AVX512BW-NEXT: vpor %xmm1, %xmm3, %xmm5 @@ -4759,9 +4759,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX512BW-NEXT: vpblendmw %ymm26, %ymm1, %ymm15 {%k1} ; AVX512BW-NEXT: vextracti32x4 $1, %ymm15, %xmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,128,128,0,6,12,128,128,128,4,10] ; AVX512BW-NEXT: vpshufb %xmm17, %xmm16, %xmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm18 = [u,u,u,u,u,u,4,10,128,128,128,2,8,14,128,128] ; AVX512BW-NEXT: vpshufb %xmm18, %xmm15, %xmm13 ; AVX512BW-NEXT: vpor %xmm11, %xmm13, %xmm11 ; AVX512BW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 @@ -4800,14 +4800,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movabsq $-8796093022208, %rdi # imm = 0xFFFFF80000000000 ; AVX512BW-NEXT: kmovq %rdi, %k3 ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm2 {%k3} -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = <1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [1,7,13,128,128,128,5,11,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm7, %xmm9, %xmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = <128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm9 = [128,128,128,3,9,15,128,128,1,7,13,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm9, %xmm12, %xmm12 ; AVX512BW-NEXT: vpor %xmm8, %xmm12, %xmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,u,128,128,1,7,13,128,128,128,5,11] ; AVX512BW-NEXT: vpshufb %xmm12, %xmm16, %xmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,u,5,11,128,128,128,3,9,15,128,128] ; AVX512BW-NEXT: vpshufb %xmm17, %xmm15, %xmm15 ; AVX512BW-NEXT: vporq %xmm16, %xmm15, %xmm15 ; AVX512BW-NEXT: vinserti128 $1, %xmm15, %ymm0, %ymm15 @@ -4828,13 +4828,13 @@ define void @load_i8_stride6_vf64(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm9 {%k3} ; AVX512BW-NEXT: vpblendmw %ymm13, %ymm5, %ymm15 {%k4} -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,4,10,128,128,128,2,8,14,u,u,u,u,u] ; AVX512BW-NEXT: movw $9362, %di # imm = 0x2492 ; AVX512BW-NEXT: kmovd %edi, %k2 ; AVX512BW-NEXT: vpblendmw %ymm10, %ymm3, %ymm8 {%k2} ; AVX512BW-NEXT: vextracti32x4 $1, %ymm8, %xmm16 ; AVX512BW-NEXT: vpshufb %xmm7, %xmm16, %xmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [2,8,14,128,128,0,6,12,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm17, %xmm8, %xmm18 ; AVX512BW-NEXT: vporq %xmm12, %xmm18, %xmm18 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm19 = [2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12] @@ -4847,10 +4847,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm17, %xmm20, %xmm12 ; AVX512BW-NEXT: vpor %xmm7, %xmm12, %xmm7 ; AVX512BW-NEXT: vpblendmw %ymm1, %ymm26, %ymm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [u,u,u,u,u,0,6,12,128,128,128,4,10,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm22, %xmm17, %xmm12 ; AVX512BW-NEXT: vextracti32x4 $1, %ymm17, %xmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm25 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm25 = [u,u,u,u,u,128,128,128,2,8,14,128,128,0,6,12] ; AVX512BW-NEXT: vpshufb %xmm25, %xmm24, %xmm27 ; AVX512BW-NEXT: vporq %xmm12, %xmm27, %xmm12 ; AVX512BW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 @@ -4872,9 +4872,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: kmovd %edi, %k4 ; AVX512BW-NEXT: vmovdqu16 %ymm14, %ymm4 {%k4} ; AVX512BW-NEXT: vmovdqu16 %ymm13, %ymm5 {%k4} -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = <128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm13 = [128,128,128,5,11,128,128,128,3,9,15,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm13, %xmm16, %xmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = <3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [3,9,15,128,128,1,7,13,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm16, %xmm8, %xmm8 ; AVX512BW-NEXT: vpor %xmm14, %xmm8, %xmm8 ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13] @@ -4882,9 +4882,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm13, %xmm21, %xmm13 ; AVX512BW-NEXT: vpshufb %xmm16, %xmm20, %xmm15 ; AVX512BW-NEXT: vpor %xmm13, %xmm15, %xmm13 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,u,u,1,7,13,128,128,128,5,11,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm15, %xmm17, %xmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,3,9,15,128,128,1,7,13] ; AVX512BW-NEXT: vpshufb %xmm17, %xmm24, %xmm19 ; AVX512BW-NEXT: vporq %xmm16, %xmm19, %xmm16 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm16, %ymm0, %ymm16 @@ -4899,11 +4899,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqu8 %zmm7, %zmm13 {%k3} ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = 
[4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14,4,10,0,6,12,2,8,14] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,0,6,12,128,128,128,4,10,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqu16 %ymm10, %ymm3 {%k1} ; AVX512BW-NEXT: vextracti128 $1, %ymm3, %xmm15 ; AVX512BW-NEXT: vpshufb %xmm14, %xmm15, %xmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = <4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [4,10,128,128,128,2,8,14,128,128,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm16, %xmm3, %xmm17 ; AVX512BW-NEXT: vporq %xmm10, %xmm17, %xmm10 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm10 = xmm10[0,1,2,3,4],xmm8[5,6,7] @@ -4915,9 +4915,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vporq %xmm14, %xmm16, %xmm14 ; AVX512BW-NEXT: vmovdqu16 %ymm26, %ymm1 {%k2} ; AVX512BW-NEXT: vextracti32x4 $1, %ymm1, %xmm16 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [u,u,u,u,u,128,128,128,4,10,128,128,128,2,8,14] ; AVX512BW-NEXT: vpshufb %xmm17, %xmm16, %xmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,u,u,u,2,8,14,128,128,0,6,12,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm19, %xmm1, %xmm20 ; AVX512BW-NEXT: vporq %xmm18, %xmm20, %xmm18 ; AVX512BW-NEXT: vinserti32x4 $1, %xmm18, %ymm0, %ymm18 @@ -4939,9 +4939,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm10 {%k2} ; AVX512BW-NEXT: vpbroadcastq {{.*#+}} ymm7 = [5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15,5,11,1,7,13,3,9,15] ; AVX512BW-NEXT: vpshufb %ymm7, %ymm5, %ymm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,1,7,13,128,128,128,5,11,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm14, %xmm15, %xmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [5,11,128,128,128,3,9,15,128,128,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm17, %xmm3, %xmm3 ; AVX512BW-NEXT: vpor %xmm3, %xmm15, %xmm3 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm5[5,6,7] @@ -4949,9 +4949,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: vpshufb %xmm14, %xmm8, %xmm5 ; AVX512BW-NEXT: vpshufb %xmm17, %xmm0, %xmm0 ; AVX512BW-NEXT: vpor %xmm5, %xmm0, %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,5,11,128,128,128,3,9,15] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm16, %xmm8 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,3,9,15,128,128,1,7,13,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm14, %xmm1, %xmm1 ; AVX512BW-NEXT: vpor %xmm1, %xmm8, %xmm1 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 2b2cb554d6ac6..718d4973f97e2 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -688,7 +688,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; 
AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,u,u,u,u,u,u,u,u] @@ -699,7 +699,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,xmm4[6,13],zero,zero,xmm4[2,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[1,8,15],zero,zero,xmm3[4,11],zero,xmm3[u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm4, %xmm3, %xmm3 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,u,u,u,u,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,xmm5[0,7,14],zero,zero,xmm5[3,u,u,u,u,u,u,u,u] @@ -710,7 +710,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm6[1,8,15],zero,zero,xmm6[4,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[3,10],zero,zero,zero,xmm5[6,13],zero,xmm5[u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm6, %xmm5, %xmm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm6 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm6, %xmm7 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[2,9],zero,zero,zero,xmm7[5,u,u,u,u,u,u,u,u] @@ -721,7 +721,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = zero,zero,xmm8[3,10],zero,zero,zero,xmm8[6,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[5,12],zero,zero,xmm7[1,8,15],zero,xmm7[u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm7 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,11,0,7,u,u,u,u,u,u,u,u,u,u,u,u] @@ -1403,7 +1403,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 @@ -1415,7 +1415,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm8, %xmm9, %xmm8 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] @@ -1525,7 +1525,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX2-ONLY-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-ONLY-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-ONLY-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm2 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[5,12],zero,zero,xmm3[1,8,15,u,u,u,u,u,u] @@ -1539,7 +1539,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm2[u,u,u,u,u,u,u,u,u,u],zero,zero,xmm2[4,11],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4],xmm4[5,6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm4 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] @@ -1553,7 +1553,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpor %xmm7, %xmm8, %xmm8 ; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpblendvb %xmm7, %xmm6, %xmm8, %xmm6 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm8 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[2,9],zero,zero,zero,xmm8[5,12],zero,zero,xmm8[u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm8, %xmm8 @@ -1564,7 +1564,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm2[6,13],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm12, %xmm11, %xmm11 ; AVX2-ONLY-NEXT: vpblendvb %xmm7, %xmm8, %xmm11, %xmm8 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm11 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm11[3,10],zero,zero,zero,xmm11[6,13],zero,zero,xmm11[u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm11, %xmm11 @@ -1575,7 +1575,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[0,7,14],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm12, %xmm9, %xmm9 ; AVX2-ONLY-NEXT: vpblendvb %xmm7, %xmm11, %xmm9, %xmm9 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u] @@ -1585,7 +1585,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[1,8,15],zero,zero ; AVX2-ONLY-NEXT: vpor %xmm12, %xmm10, %xmm10 ; AVX2-ONLY-NEXT: vpblendvb %xmm7, %xmm11, %xmm10, %xmm10 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-ONLY-NEXT: vpblendvb %ymm11, %ymm1, %ymm0, %ymm11 ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm11, %xmm12 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[3,10],zero,zero,zero,xmm12[6,13,u,u,u,u,u,u,u] @@ -1596,7 +1596,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm2[u,u,u,u,u,u,u,u,u],zero,zero,xmm2[2,9],zero,zero,zero ; AVX2-ONLY-NEXT: vpor %xmm13, %xmm12, %xmm12 ; AVX2-ONLY-NEXT: vpblendvb %xmm7, %xmm11, %xmm12, %xmm11 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-ONLY-NEXT: vpblendvb %ymm12, %ymm1, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[6,13],zero,zero,xmm0[2,9],zero,zero,zero,xmm0[u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vextracti128 $1, %ymm0, %xmm0 @@ -2921,7 +2921,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa 128(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm9[u,u,u,5,12],zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm3 @@ -2935,7 +2935,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u] @@ -2952,7 +2952,7 @@ define void @load_i8_stride7_vf32(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u] @@ -2961,7 +2961,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,255,255,255,255,255,0,0,0,0,0,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] @@ -2988,7 +2988,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,255,255,255,255,255,0,0,0,0,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u] @@ -3009,7 +3009,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[5,12] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm12, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm2, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm1[u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm1[6,13] @@ -3228,27 +3228,27 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = 
xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX2-SLOW-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] @@ -3258,7 +3258,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm8 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa 192(%rdi), %xmm4 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] @@ -3298,7 +3298,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] ; AVX2-SLOW-NEXT: 
vextracti128 $1, %ymm1, %xmm1 @@ -3311,7 +3311,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -3325,7 +3325,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] @@ -3338,14 +3338,14 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 @@ -3441,21 +3441,21 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm6 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm3 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-NEXT: 
vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7,8,9],ymm2[10],ymm1[11,12],ymm2[13],ymm1[14,15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm8, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u],zero,zero,xmm2[3,10],zero,zero,zero,xmm2[6,13,u,u,u,u] @@ -3472,7 +3472,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm12 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -3483,7 +3483,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm7 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm7[2],ymm1[3,4,5],ymm7[6],ymm1[7,8,9],ymm7[10],ymm1[11,12,13],ymm7[14],ymm1[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[u,u,u,6,13],zero,zero,xmm1[2,9],zero,zero,zero,xmm1[u,u,u,u] @@ -3514,7 +3514,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm10, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = 
[u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm7[u,u,2,9],zero,zero,zero,xmm7[5,12],zero,zero,xmm7[u,u,u,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 @@ -3527,7 +3527,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm10, %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[u,u,3,10],zero,zero,zero,xmm7[6,13],zero,zero,xmm7[u,u,u,u,u] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm7 @@ -3539,9 +3539,9 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm7, %ymm12, %ymm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm9, %ymm8, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm8, %xmm9 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u],zero,zero,xmm9[2,9],zero,zero,zero,xmm9[5,12,u,u,u,u,u] @@ -3557,9 +3557,9 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm6, %ymm2 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm6, %ymm8 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm6, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm4, %ymm13 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm10 @@ -3652,27 +3652,27 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = 
[255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm6, %ymm7, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,xmm1[5,12],zero,zero,xmm1[1,8,15,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,7,14],zero,zero,xmm0[3,10],zero,zero,zero,xmm0[u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm13, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm4[2],ymm1[3,4],ymm4[5],ymm1[6,7,8,9],ymm4[10],ymm1[11,12],ymm4[13],ymm1[14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,6,13,4,11,2,9,16,23,30,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm10, %ymm11, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[u,u,u],zero,zero,xmm4[3,10],zero,zero,zero,xmm4[6,13,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,5,12],zero,zero,xmm1[1,8,15],zero,zero,xmm1[u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm6, %ymm7, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u,u,u,u,u] @@ -3682,7 +3682,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm8[2],ymm5[3,4,5],ymm8[6],ymm5[7,8,9],ymm8[10],ymm5[11,12,13],ymm8[14],ymm5[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[u,u,u,u,u,u,u,u,u,0,7,14,5,12,3,10,17,24,31,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm5, %ymm4, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 192(%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u,u,4,11,u,u,u,u,u,u,u,u] @@ -3722,7 +3722,7 @@ 
define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm8, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm11, %ymm10, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[u,u,2,9],zero,zero,zero,xmm1[5,12],zero,zero,xmm1[u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -3735,7 +3735,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm8, %ymm0, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm8, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm11, %ymm10, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm1[u,u,3,10],zero,zero,zero,xmm1[6,13],zero,zero,xmm1[u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -3749,7 +3749,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm12, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm10, %ymm14 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u],zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u] @@ -3762,14 +3762,14 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm11, %ymm0, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm11, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm7, %ymm6, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm7, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm6, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm7, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} 
ymm0 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm13, %ymm3, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm13, %ymm3, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm13, %ymm3, %ymm8 @@ -4269,20 +4269,20 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-SLOW: # %bb.0: ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31] ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31] ; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm4 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u] ; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm5 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u] ; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm11 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15] ; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm12 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31] ; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31] ; AVX512BW-SLOW-NEXT: vpermw %zmm1, %zmm2, %zmm6 ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 @@ -6781,7 +6781,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm7, %xmm8, %xmm7 ; AVX1-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm7 @@ -6815,7 +6815,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm9 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, %xmm14 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm9, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,0,0,0,0,255,255,255,255,255,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm2, %xmm9, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 400(%rdi), %xmm2 @@ -6877,7 +6877,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, 
%xmm10, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,255,255,255,255,255,0,0,0,0,0,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 @@ -6904,7 +6904,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm3, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6984,7 +6984,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,255,255,255,255,255,0,0,0,0,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm1, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -7059,7 +7059,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm13 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm13, {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm0 # 16-byte Folded Reload ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 432(%rdi), %xmm1 @@ -7615,7 +7615,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm10 @@ -7623,12 +7623,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqa %ymm1, %ymm12 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; 
AVX2-SLOW-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa %ymm9, %ymm14 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm9 @@ -7664,19 +7664,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpor %xmm7, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4 @@ -7692,14 +7692,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 160(%rdi), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm15 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9] @@ -7735,12 +7735,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, 
ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm10, %xmm0, %xmm6 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpor %xmm6, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10] @@ -7768,13 +7768,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] @@ -7805,12 +7805,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] @@ -7837,13 +7837,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, 
%ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm7 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm7, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] @@ -7871,15 +7871,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm6, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] @@ -7909,15 +7909,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] 
; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -7932,7 +7932,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9 @@ -7947,12 +7947,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm1, %xmm15 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm15, %xmm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm15 @@ -7972,10 +7972,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm12, %xmm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm4, %xmm2 @@ -7993,9 +7993,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14 ; AVX2-SLOW-NEXT: vextracti128 $1, 
%ymm13, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm0, %xmm3, %xmm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm3 @@ -8015,9 +8015,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -8037,11 +8037,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm3 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload @@ -8062,9 +8062,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm9, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm5, %xmm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u] ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm9, %xmm9 ; AVX2-SLOW-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX2-SLOW-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11] @@ -8154,19 +8154,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm12 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %ymm4 ; AVX2-FAST-NEXT: vmovdqa 96(%rdi), %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm12, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm7 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm2, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor %xmm3, %xmm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm9 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8199,23 +8199,23 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm13, %ymm12, %ymm4 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm9 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vpor %xmm5, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10] ; AVX2-FAST-NEXT: # ymm5 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill 
; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm3 @@ -8230,7 +8230,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm3, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vmovdqa 160(%rdi), %ymm1 ; AVX2-FAST-NEXT: vmovdqa 128(%rdi), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm12 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -8238,9 +8238,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm9 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm3, %xmm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -8273,12 +8273,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpblendvb %ymm6, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm4 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor %xmm4, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -8300,13 +8300,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm14, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm14, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = 
[u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12] @@ -8338,14 +8338,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm0, %ymm10 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm10, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm14 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm6, %xmm11 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm6 ; AVX2-FAST-NEXT: vpor %xmm6, %xmm11, %xmm6 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm11 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13] @@ -8373,14 +8373,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm3 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm14, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm1, %xmm10 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm10, %xmm1 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128] @@ -8407,15 +8407,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm1 ; 
AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpor %xmm2, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128] @@ -8446,15 +8446,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm9 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8470,7 +8470,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm2, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm14 @@ -8488,10 +8488,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm5, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm11, %xmm0 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm11, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = 
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm11, %xmm11
; AVX2-FAST-NEXT: vpor %xmm0, %xmm11, %xmm0
; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm11
@@ -8511,10 +8511,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0
; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm9, %xmm1
; AVX2-FAST-NEXT: vextracti128 $1, %ymm9, %xmm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-FAST-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm2
@@ -8532,9 +8532,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm1, %ymm1
; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm12
; AVX2-FAST-NEXT: vextracti128 $1, %ymm10, %xmm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm10, %xmm3
; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm3
@@ -8554,9 +8554,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm15
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-NEXT: vextracti128 $1, %ymm3, %xmm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
@@ -8576,11 +8576,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm2, %ymm2
; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm2
; AVX2-FAST-NEXT: vextracti128 $1, %ymm4, %xmm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-FAST-NEXT: vpor %xmm2, %xmm4, %xmm2
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
@@ -8602,9 +8602,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm3, %ymm4, %ymm3
; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
; AVX2-FAST-NEXT: vextracti128 $1, %ymm7, %xmm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u]
; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7
; AVX2-FAST-NEXT: vpor %xmm4, %xmm7, %xmm4
; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4
@@ -8686,7 +8686,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rdi), %ymm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm13
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm10
@@ -8694,12 +8694,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, %ymm12
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,5,12,128,128,1,8,15,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,7,14,128,128,3,10,128,128,128,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm5, %ymm4, %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm9, %ymm14
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm9
@@ -8735,19 +8735,19 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm9, %ymm2
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm2[0,1],ymm3[2],ymm2[3,4,5],ymm3[6],ymm2[7,8,9],ymm3[10],ymm2[11,12,13],ymm3[14],ymm2[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm12, %ymm10, %ymm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, %ymm1
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,128,6,13,128,128,2,9,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [1,8,15,128,128,4,11,128,128,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10,1,8,15,0,0,0,0,0,0,0,7,14,5,12,3,10]
; AVX2-FAST-PERLANE-NEXT: # ymm7 = mem[0,1,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm4, %ymm5, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm15, %ymm6, %ymm4
@@ -8763,14 +8763,14 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqa 160(%rdi), %ymm9
; AVX2-FAST-PERLANE-NEXT: vmovdqa 128(%rdi), %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm9, %ymm3, %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm15
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,3,10,128,128,128,6,13,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm3, %xmm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,5,12,128,128,1,8,15,128,128,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm12 = [2,9,2,9,2,9,2,9,2,9,2,9,2,9,2,9]
@@ -8806,12 +8806,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm6[7]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm9, %ymm15, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,6,13,128,128,2,9,128,128,128,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm0, %xmm6
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,128,4,11,128,128,0,7,14,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm6 = [3,10,3,10,3,10,3,10,3,10,3,10,3,10,3,10]
@@ -8839,13 +8839,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm1[7]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm4, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,128,128,128,6,13,128,128,2,9,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,1,8,15,128,128,4,11,128,128,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,0,0,128,128,128,5,12,0,0,0,128,128,128,5,12]
@@ -8876,12 +8876,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm11, %ymm4, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,2,9,128,128,128,5,12,128,128,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm7
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,0,7,14,128,128,3,10,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,128,128,128,6,13,0,0,0,128,128,128,6,13]
@@ -8908,13 +8908,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,3,10,128,128,128,6,13,128,128,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm7
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,128,128,1,8,15,128,128,4,11,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm7 = [0,0,0,2,9,128,128,128,0,0,0,2,9,128,128,128]
@@ -8942,15 +8942,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm6
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm6, %ymm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm4, %ymm2
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm3, %ymm1
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,128,128,2,9,128,128,128,5,12,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,4,11,128,128,0,7,14,128,128,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,0,0,3,10,128,128,128,0,0,0,3,10,128,128,128]
@@ -8980,15 +8980,15 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm12
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm0, %ymm13
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm0, %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
@@ -9003,7 +9003,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u,0,0,u,u,255,255,0,0,u,u,255,255,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm2, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm2, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm6, %ymm2, %ymm9
@@ -9018,12 +9018,12 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm7, %ymm8, %ymm10
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm7, %ymm5
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u,0,0,u,u,255,255,u,u,0,0,255,255,u,u]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm8, %ymm7, %ymm11
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [2,9,128,128,128,5,12,128,128,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm15
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,0,7,14,128,128,3,10,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm15, %xmm1
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm15
@@ -9043,10 +9043,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [3,10,128,128,128,6,13,128,128,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm12, %xmm1
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm12, %xmm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,1,8,15,128,128,4,11,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm4, %xmm2
@@ -9064,9 +9064,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm1, %ymm1
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm14
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,2,9,128,128,128,5,12,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [4,11,128,128,0,7,14,128,128,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm13, %xmm3
; AVX2-FAST-PERLANE-NEXT: vpor %xmm0, %xmm3, %xmm0
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm3
@@ -9086,9 +9086,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,3,10,128,128,128,6,13,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [5,12,128,128,1,8,15,128,128,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2
; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
@@ -9108,11 +9108,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm3, %ymm3
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm3, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [6,13,128,128,2,9,128,128,128,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm5, %xmm3
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,4,11,128,128,0,7,14,u,u,u,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpor %xmm3, %xmm5, %xmm3
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -9133,9 +9133,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm9, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,128,128,128,5,12,128,128,1,8,15,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm5, %xmm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,0,7,14,128,128,3,10,128,128,128,u,u,u,u]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm9, %xmm9
; AVX2-FAST-PERLANE-NEXT: vpor %xmm5, %xmm9, %xmm5
; AVX2-FAST-PERLANE-NEXT: vpbroadcastw {{.*#+}} xmm13 = [4,11,4,11,4,11,4,11,4,11,4,11,4,11,4,11]
@@ -9374,7 +9374,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
; AVX512F-ONLY-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm6
; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm5[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm5, %xmm10
@@ -9497,7 +9497,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
; AVX512F-ONLY-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm5, %ymm0, %ymm0
; AVX512F-ONLY-SLOW-NEXT: vpternlogq $220, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; AVX512F-ONLY-SLOW-NEXT: vmovdqa %ymm9, %ymm0
@@ -9551,7 +9551,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
; AVX512F-ONLY-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-SLOW-NEXT: vpshufb %xmm1, %xmm12, %xmm2
; AVX512F-ONLY-SLOW-NEXT: vmovdqa %xmm10, %xmm13
; AVX512F-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm10[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
@@ -9787,7 +9787,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpor %xmm7, %xmm2, %xmm2
; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vmovdqa 208(%rdi), %xmm0
; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm8
@@ -9914,7 +9914,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX512F-ONLY-FAST-NEXT: vpternlogq $220, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm6, %ymm2
@@ -9967,7 +9967,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u],zero,zero,xmm3[0,7,14],zero,zero,xmm3[3,10,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpor %xmm2, %xmm3, %xmm2
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3
; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX512F-ONLY-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
@@ -10224,7 +10224,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,u,1,8,15],zero,zero,xmm0[4,11],zero,zero,xmm0[u,u,u,u,u]
; AVX512DQ-SLOW-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u>
+; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm5
; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm15 = xmm6[u,u,u,u,u,u,u,u,u,u,u,0,7,14],zero,zero
; AVX512DQ-SLOW-NEXT: vpor %xmm5, %xmm15, %xmm5
@@ -10346,7 +10346,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u],zero,zero,zero,xmm3[6,13],zero,zero,xmm3[2,9,u,u,u]
; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u]
; AVX512DQ-SLOW-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
; AVX512DQ-SLOW-NEXT: vpshufb %ymm8, %ymm0, %ymm0
; AVX512DQ-SLOW-NEXT: vpternlogq $220, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; AVX512DQ-SLOW-NEXT: vmovdqa %ymm9, %ymm0
@@ -10402,7 +10402,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u],zero,zero,xmm2[0,7,14],zero,zero,xmm2[3,10,u,u,u,u,u]
; AVX512DQ-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0
; AVX512DQ-SLOW-NEXT: vmovdqa %xmm10, %xmm1
-; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u>
+; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
; AVX512DQ-SLOW-NEXT: vpshufb %xmm2, %xmm10, %xmm2
; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
; AVX512DQ-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
@@ -10638,7 +10638,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,1,8,15],zero,zero,xmm2[4,11],zero,zero,xmm2[u,u,u,u,u]
; AVX512DQ-FAST-NEXT: vpor %xmm7, %xmm2, %xmm2
; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,5,12,19,26,17,24,31,22,29,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FAST-NEXT: vmovdqa 208(%rdi), %xmm0
; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX512DQ-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm8
@@ -10765,7 +10765,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[u,u,u,u],zero,zero,zero,xmm5[6,13],zero,zero,xmm5[2,9,u,u,u]
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,1,8,15],zero,zero,xmm3[4,11],zero,zero,xmm3[u,u,u]
; AVX512DQ-FAST-NEXT: vpor %xmm5, %xmm3, %xmm3
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,128,128,128,128,128,128,128,128,128,0,7,14,21,28,19,26,17,24,31,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2
; AVX512DQ-FAST-NEXT: vpternlogq $220, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
; AVX512DQ-FAST-NEXT: vmovdqa %ymm6, %ymm2
@@ -10819,7 +10819,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,u],zero,zero,xmm6[0,7,14],zero,zero,xmm6[3,10,u,u,u,u,u]
; AVX512DQ-FAST-NEXT: vpor %xmm2, %xmm6, %xmm2
; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
-; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u>
+; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,6,13,20,27,18,25,16,23,30,u,u,u,u,u,u,u,u,u]
; AVX512DQ-FAST-NEXT: vpshufb %xmm6, %xmm4, %xmm6
; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload
; AVX512DQ-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm12[u,u,u,u,u,u,u,u,u,u,u,1,8,15],zero,zero
@@ -10910,13 +10910,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-ONLY-SLOW-LABEL: load_i8_stride7_vf64:
; AVX512BW-ONLY-SLOW: # %bb.0:
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm25
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u]
; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm18
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15]
; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm24
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31]
; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm9
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31]
; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm0
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdi), %ymm10
; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
@@ -11213,11 +11213,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512BW-ONLY-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7
; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX512BW-ONLY-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31]
; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31]
; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u]
; AVX512BW-ONLY-SLOW-NEXT: vpermw %zmm25, %zmm10, %zmm10
; AVX512BW-ONLY-SLOW-NEXT: vextracti128 $1, %ymm12, %xmm11
; AVX512BW-ONLY-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,xmm11[2,9],zero,zero,zero,xmm11[5,12,u,u,u,u,u,u,u]
@@ -11646,13 +11646,13 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-SLOW-LABEL: load_i8_stride7_vf64:
; AVX512DQBW-SLOW: # %bb.0:
; AVX512DQBW-SLOW-NEXT: vmovdqa64 64(%rdi), %zmm25
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u>
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [8,1,18,11,4,5,22,15,u,25,10,u,12,29,14,u]
; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm18
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15>
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,1,18,11,4,21,14,7,8,25,10,u,28,13,u,15]
; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm24
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31>
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [0,17,10,3,4,21,14,7,24,9,u,11,28,13,u,31]
; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm10
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31>
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [16,17,10,3,20,13,6,23,24,25,u,27,28,u,30,31]
; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm0, %zmm0
; AVX512DQBW-SLOW-NEXT: vmovdqa (%rdi), %ymm9
; AVX512DQBW-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1
@@ -11949,11 +11949,11 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr
; AVX512DQBW-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7
; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm0, %ymm7
; AVX512DQBW-SLOW-NEXT: vmovdqu8 %ymm7, %ymm6 {%k1}
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31>
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [16,9,2,19,20,13,6,23,24,u,26,27,28,u,30,31]
; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm7, %zmm7
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31>
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [16,9,2,19,12,5,22,23,24,u,26,27,u,29,30,31]
; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm8, %zmm8
-; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u>
+; AVX512DQBW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [8,1,2,19,12,5,22,15,u,9,26,11,u,29,14,u]
; AVX512DQBW-SLOW-NEXT: vpermw %zmm25, %zmm11, %zmm11
; AVX512DQBW-SLOW-NEXT: vextracti128 $1, %ymm2, %xmm12
; AVX512DQBW-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = zero,zero,xmm12[2,9],zero,zero,zero,xmm12[5,12,u,u,u,u,u,u,u]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
index 16b5e8db76ef1..afa114048077e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-3.ll
@@ -156,7 +156,7 @@ define void @store_i16_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u]
; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX512BW-NEXT: vmovq %xmm1, 16(%rcx)
@@ -249,7 +249,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2]
; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
@@ -268,7 +268,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm1
; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm2
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2]
; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm4
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,1,5,1,5,2,6]
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm3
@@ -294,7 +294,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
@@ -317,7 +317,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,0,1]
; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,0,1,u,u,u,u,2,3,u,u,u,u,4,5,u,u,22,23,u,u,u,u,24,25,u,u,u,u,26,27]
; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6],ymm3[7],ymm4[8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14],ymm3[15]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2]
; AVX512F-SLOW-NEXT: vpermd %ymm2, %ymm4, %ymm4
; AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm4
; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
@@ -341,7 +341,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm0, %ymm2
; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[2,2,3,3]
; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3],xmm3[4],xmm1[5,6],xmm3[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [u,0,0,u,1,1,u,2]
; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm3, %ymm2
; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,4,1,5,1,5,2,6]
; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0
@@ -358,7 +358,7 @@ define void @store_i16_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
; AVX512BW-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0
; AVX512BW-NEXT: vinserti32x4 $2, (%rdx), %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,8,16,1,9,17,2,10,18,3,11,19,4,12,20,5,13,21,6,14,22,7,15,23,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rcx)
; AVX512BW-NEXT: vmovdqa %ymm0, (%rcx)
@@ -513,7 +513,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <5,5,u,6,6,u,7,7>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [5,5,u,6,6,u,7,7]
; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
@@ -524,16 +524,16 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3]
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm2, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,u,1,1,u,2]
; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm4, %ymm4
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5]
; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm4, %ymm4
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u]
; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm4, %ymm0
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
@@ -558,7 +558,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [u,0,0,u,1,1,u,2]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm5
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2
@@ -568,16 +568,16 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,5,u,6,6,u,7,7>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [5,5,u,6,6,u,7,7]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm4
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5]
; AVX2-FAST-NEXT: vpermd (%rdi), %ymm4, %ymm4
; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
@@ -602,7 +602,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm6[0],xmm2[0],xmm6[1],xmm2[1],xmm6[2],xmm2[2],xmm6[3],xmm2[3]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm2, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [u,0,0,u,1,1,u,2]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm5, %ymm5
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2
@@ -612,16 +612,16 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,10,11,10,11,8,9,8,9,14,15,12,13,14,15]
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm4, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <5,5,u,6,6,u,7,7>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [5,5,u,6,6,u,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5]
; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm4, %ymm0
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
@@ -650,7 +650,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[10,11,u,u],zero,zero,ymm2[12,13,u,u],zero,zero,ymm2[14,15,u,u],zero,zero,ymm2[16,17,u,u],zero,zero,ymm2[18,19,u,u],zero,zero,ymm2[20,21]
; AVX512F-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2]
; AVX512F-NEXT: vpermd %ymm0, %ymm2, %ymm2
; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
; AVX512F-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,ymm0[10,11],zero,zero,zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18,19],zero,zero,zero,zero
@@ -662,7 +662,7 @@ define void @store_i16_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX512F-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,u,u,10,11,8,9,u,u,14,15,12,13,u,u]
; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = <5,5,u,6,6,u,7,7>
+; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [5,5,u,6,6,u,7,7]
; AVX512F-NEXT: vpermd %ymm0, %ymm3, %ymm0
; AVX512F-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
; AVX512F-NEXT: vmovdqa %ymm0, 64(%rcx)
@@ -973,7 +973,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4]
; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1],xmm5[2],xmm3[3,4],xmm5[5],xmm3[6,7]
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <5,5,u,6,6,u,7,7>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [5,5,u,6,6,u,7,7]
; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm5, %ymm6
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0]
; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm3, %ymm6, %ymm3
@@ -1000,7 +1000,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm6, %xmm6
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm7, %ymm6, %ymm6
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,0,0,u,1,1,u,2]
; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm7, %ymm9
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm9, %ymm6
@@ -1015,11 +1015,11 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm7, %ymm7
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
; AVX2-SLOW-NEXT: vpshufb %ymm8, %ymm4, %ymm4
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,3,3,u,4,4,u,5]
; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm9, %ymm10
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <2,u,3,3,u,4,4,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [2,u,3,3,u,4,4,u]
; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm10, %ymm2
; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2
@@ -1052,7 +1052,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [u,0,0,u,1,1,u,2]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm3
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1
@@ -1074,7 +1074,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-FAST-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX2-FAST-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7]
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm5
; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4
; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm5
@@ -1091,15 +1091,15 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21]
; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm6
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,3,u,4,4,u,5]
; AVX2-FAST-NEXT: vpermd (%rdi), %ymm8, %ymm9
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6
; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm9
; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm9, %ymm7
; AVX2-FAST-NEXT: vpermd 32(%rdi), %ymm8, %ymm8
; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [2,u,3,3,u,4,4,u]
; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm0
; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255]
; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0
@@ -1129,7 +1129,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm1, %xmm1
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [u,0,0,u,1,1,u,2]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm7, %ymm3
; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255]
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm1
@@ -1151,7 +1151,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm4, %xmm4
; AVX2-FAST-PERLANE-NEXT: vmovdqa 16(%rdi), %xmm12
; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm10, %ymm4, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7]
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm7, %ymm5
; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4
; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm5
%ymm3, %ymm10, %ymm5 @@ -1168,15 +1168,15 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm8, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm6, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,3,u,4,4,u,5] ; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm8, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm6, %ymm9, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm9, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermd 32(%rdi), %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [2,u,3,3,u,4,4,u] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm8, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm0, %ymm0 @@ -1215,7 +1215,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0,1],xmm3[2],xmm5[3,4],xmm3[5],xmm5[6,7] ; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm3, %ymm3 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[0,1,2,3],zmm4[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] ; AVX512F-NEXT: vpermd (%rdx), %zmm4, %zmm5 ; AVX512F-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqa 32(%rdi), %ymm3 @@ -1257,7 +1257,7 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vinserti128 $1, %xmm6, %ymm0, %ymm0 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[4,5,6,7] ; AVX512F-NEXT: vpshufb %ymm10, %ymm7, %ymm1 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm2 = [u,0,0,u,1,1,u,2] ; AVX512F-NEXT: vpermd %ymm7, %ymm2, %ymm2 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512F-NEXT: vpandn %ymm2, %ymm3, %ymm2 @@ -1274,15 +1274,15 @@ define void @store_i16_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4 -; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0 @@ -1904,7 +1904,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm5[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm5[2],xmm2[3,4],xmm5[5],xmm2[6,7] ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm6, %ymm2, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <5,5,u,6,6,u,7,7> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [5,5,u,6,6,u,7,7] ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm2, %ymm5, %ymm2 @@ -1954,7 +1954,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm9, %xmm9 ; AVX2-SLOW-NEXT: vinserti128 $1, %xmm8, %ymm9, %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [u,0,0,u,1,1,u,2] ; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm8, %ymm9, %ymm8 @@ -1994,9 +1994,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm2, %ymm5, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm14, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [u,3,3,u,4,4,u,5] ; AVX2-SLOW-NEXT: vpermd (%rdi), %ymm12, %ymm13 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm13, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa 64(%rsi), %ymm13 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm13, %ymm13 @@ -2010,7 +2010,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm15, %ymm2 ; AVX2-SLOW-NEXT: vpermd 96(%rdi), %ymm12, %ymm12 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <2,u,3,3,u,4,4,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [2,u,3,3,u,4,4,u] ; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm12, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm14 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm4, %ymm4 @@ -2053,7 +2053,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,0,0,u,1,1,u,2] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm13, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 @@ -2067,7 +2067,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX2-FAST-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 @@ -2138,9 +2138,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5] ; AVX2-FAST-NEXT: vpermd (%rdi), %ymm4, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 ; AVX2-FAST-NEXT: vmovdqa 64(%rsi), %ymm9 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm9 @@ -2154,7 +2154,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm14, %ymm0 ; AVX2-FAST-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 @@ -2197,7 +2197,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [u,0,0,u,1,1,u,2] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm13, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm14 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm3, %ymm4, %ymm3 @@ -2211,7 +2211,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1],xmm5[2],xmm4[3,4],xmm5[5],xmm4[6,7] ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm7, %ymm4, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <5,5,u,6,6,u,7,7> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [5,5,u,6,6,u,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm5, %ymm3 @@ -2282,9 +2282,9 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,0,1,12,13,12,13,2,3,2,3,14,15,4,5,26,27,16,17,28,29,28,29,18,19,18,19,30,31,20,21] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm3, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,3,3,u,4,4,u,5] ; AVX2-FAST-PERLANE-NEXT: vpermd (%rdi), %ymm4, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rsi), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm9, %ymm9 @@ -2298,7 +2298,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm14, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermd 96(%rdi), %ymm4, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm0, %ymm4, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <2,u,3,3,u,4,4,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [2,u,3,3,u,4,4,u] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm3, %ymm2, %ymm2 @@ -2357,7 +2357,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa 64(%rdx), %ymm14 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm9 = [128,128,10,11,128,128,128,128,12,13,128,128,128,128,14,15,128,128,128,128,16,17,128,128,128,128,18,19,128,128,128,128] ; AVX512F-NEXT: vpshufb %ymm9, %ymm3, %ymm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} ymm16 = [u,0,0,u,1,1,u,2] ; AVX512F-NEXT: vpermd %ymm3, %ymm16, %ymm3 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535] ; AVX512F-NEXT: vpandn %ymm3, %ymm15, %ymm3 @@ -2382,7 +2382,7 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpblendw {{.*#+}} xmm12 = xmm12[0,1],xmm13[2],xmm12[3,4],xmm13[5],xmm12[6,7] ; 
AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm12, %ymm0 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,5,u,6,6,u,7,7,u,8,8,u,9,9,u,10] ; AVX512F-NEXT: vpermd 64(%rdx), %zmm18, %zmm10 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535,65535,0,65535] ; AVX512F-NEXT: vpternlogd $184, %zmm0, %zmm22, %zmm10 @@ -2486,17 +2486,17 @@ define void @store_i16_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,1,33,u,2,34,u,3,35,u,4,36,u,5,37,u,6,38,u,7,39,u,8,40,u,9,41,u,10,42] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,32,3,4,33,6,7,34,9,10,35,12,13,36,15,16,37,18,19,38,21,22,39,24,25,40,27,28,41,30,31] ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm8, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [21,u,54,22,u,55,23,u,56,24,u,57,25,u,58,26,u,59,27,u,60,28,u,61,29,u,62,30,u,63,31,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,53,2,3,54,5,6,55,8,9,56,11,12,57,14,15,58,17,18,59,20,21,60,23,24,61,26,27,62,29,30,63] ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm11, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,11,43,u,12,44,u,13,45,u,14,46,u,15,47,u,16,48,u,17,49,u,18,50,u,19,51,u,20,52,u,21] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm3, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [42,1,2,43,4,5,44,7,8,45,10,11,46,13,14,47,16,17,48,19,20,49,22,23,50,25,26,51,28,29,52,31] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll index e8da0c4dcf47a..5f0b08a024f8b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-4.ll @@ -272,10 +272,10 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm2 = ymm1[0,2,0,2] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX2-ONLY-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,2,0,2] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,3,1,3] @@ -295,10 
+295,10 @@ define void @store_i16_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vinserti128 $1, (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti128 $1, (%rcx), %ymm1, %ymm1 ; AVX512F-NEXT: vpermq {{.*#+}} ymm2 = ymm1[1,3,1,3] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15] ; AVX512F-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm0[1,3,1,3] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm4 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4],ymm2[5],ymm4[6],ymm2[7] ; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,0,2] @@ -1409,26 +1409,26 @@ define void @store_i16_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -3014,32 +3014,32 @@ define void 
@store_i16_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,9,41,u,u,10,42,u,u,11,43,u,u,12,44,u,u,13,45,u,u,14,46,u,u,15,47,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movw $-21846, %ax # imm = 0xAAAA ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,4,36,u,u,5,37,u,u,6,38,u,u,7,39,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,25,57,u,u,26,58,u,u,27,59,u,u,28,60,u,u,29,61,u,u,30,62,u,u,31,63,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55] ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [16,48,u,u,17,49,u,u,18,50,u,u,19,51,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] ; AVX512BW-NEXT: vpermt2w %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2w %zmm7, %zmm5, %zmm8 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll index 413cd7d63d8c6..d174db5f66a81 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-5.ll @@ -71,7 +71,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,0,3,4,7,4,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
<255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,0,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX2-SLOW-NEXT: vmovd %xmm1, 16(%r9) @@ -141,7 +141,7 @@ define void @store_i16_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1] ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512BW-NEXT: vmovd %xmm1, 16(%r9) @@ -386,7 +386,7 @@ define void @store_i16_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX512BW-NEXT: vinserti32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,1,5,9,13,17,2,6,10,14,18,3,7,11,15,19,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm1 ; AVX512BW-NEXT: vmovq %xmm1, 32(%r9) @@ -572,7 +572,7 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm6[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,5,5,4,7,8,9,10,11,13,13,12,15] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2,3],ymm9[4,5,6],ymm8[7],ymm9[8,9],ymm8[10,11],ymm9[12,13,14],ymm8[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] @@ -608,10 +608,10 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,5,2,6,2,6,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,5,2,6,2,6,u,u] ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[2,3,6,7,2,3],zero,zero,zero,zero,ymm7[8,9,12,13,16,17],zero,zero,zero,zero,ymm7[18,19,22,23,28,29],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <5,2,6,u,2,6,3,7> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [5,2,6,u,2,6,3,7] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[2,3],zero,zero,zero,zero,zero,zero,ymm8[4,5,8,9],zero,zero,zero,zero,zero,zero,ymm8[18,19,22,23],zero,zero,zero,zero,zero,zero,ymm8[24,25,28,29] ; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 @@ -656,7 +656,7 @@ define 
void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm6[2,3,0,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[u,u,u,u,6,7,10,11,u,u,u,u,u,u,8,9,u,u,u,u,22,23,26,27,u,u,u,u,u,u,24,25] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1],ymm9[2,3],ymm8[4,5,6],ymm9[7],ymm8[8,9],ymm9[10,11],ymm8[12,13,14],ymm9[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] @@ -739,13 +739,13 @@ define void @store_i16_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,2,0] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm7[0,1,8,9,u,u],zero,zero,zero,zero,ymm7[2,3,18,19,u,u],zero,zero,zero,zero,ymm7[28,29,20,21,u,u],zero,zero -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <1,5,2,6,2,6,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [1,5,2,6,2,6,u,u] ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm8, %ymm6 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[2,3,6,7,u,u],zero,zero,zero,zero,ymm6[8,9,12,13,u,u],zero,zero,zero,zero,ymm6[18,19,22,23,u,u],zero,zero,zero,zero ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm7, %zmm6 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm5[0,2,0,2] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,1,8,9],zero,zero,zero,zero,ymm7[u,u,2,3,10,11],zero,zero,zero,zero,ymm7[u,u,20,21,28,29],zero,zero,zero,zero,ymm7[u,u,22,23] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <5,2,6,u,2,6,3,7> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [5,2,6,u,2,6,3,7] ; AVX512F-FAST-NEXT: vpermd %ymm5, %ymm8, %ymm5 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,3],zero,zero,zero,zero,ymm5[u,u,4,5,8,9],zero,zero,zero,zero,ymm5[u,u,18,19,22,23],zero,zero,zero,zero,ymm5[u,u,24,25,28,29] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm7, %zmm5 @@ -1136,7 +1136,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[0,2,1,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm6 = xmm6[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm6, %ymm5, %ymm5 ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] @@ -1150,7 +1150,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm11 = ymm11[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm10 = 
ymm11[0,1],ymm10[2],ymm11[3],ymm10[4],ymm11[5,6],ymm10[7],ymm11[8,9],ymm10[10],ymm11[11],ymm10[12],ymm11[13,14],ymm10[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 ; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm10 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] @@ -1163,7 +1163,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm9, %ymm7, %ymm7 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm1[0,1,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] @@ -1178,7 +1178,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm10 = ymm10[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vpbroadcastq 24(%r8), %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] @@ -1189,7 +1189,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] @@ -1219,7 +1219,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm9 = 
xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 ; AVX2-FAST-NEXT: vpbroadcastq (%r8), %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] @@ -1232,7 +1232,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] @@ -1245,7 +1245,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] @@ -1258,7 +1258,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-NEXT: vpbroadcastq 24(%r8), %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] @@ -1269,7 +1269,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; 
AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] @@ -1299,7 +1299,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq (%r8), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] @@ -1312,7 +1312,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0],xmm7[1],xmm8[2],xmm7[3],xmm8[4,5],xmm7[6],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm1[0,1,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] @@ -1325,7 +1325,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,18,19,u,u,20,21,u,u,24,25,24,25,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3],ymm8[4],ymm9[5,6],ymm8[7],ymm9[8,9],ymm8[10],ymm9[11],ymm8[12],ymm9[13,14],ymm8[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: 
vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255] @@ -1338,7 +1338,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,u,u,30,31,u,u,u,u,28,29,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0],ymm9[1],ymm10[2],ymm9[3,4],ymm10[5,6,7,8],ymm9[9],ymm10[10],ymm9[11,12],ymm10[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 24(%r8), %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0] @@ -1349,7 +1349,7 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5],ymm2[6],ymm0[7,8],ymm2[9],ymm0[10,11],ymm2[12],ymm0[13],ymm2[14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255] @@ -1524,13 +1524,13 @@ define void @store_i16_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa (%r8), %ymm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,13,29,45,61,u,14,30,46,62,u,15,31,47,63,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,32,48,u,1,17,33,49,u,2,18,34,50,u,3,19,35,51,u,4,20,36,52,u,5,21,37,53,u,6,22] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [6,22,u,39,55,7,23,u,40,56,8,24,u,41,57,9,25,u,42,58,10,26,u,43,59,11,27,u,44,60,12,28] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
[0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm4, %zmm0 @@ -2274,7 +2274,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[0,2,1,3] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm11[0,1,2,3,4,4,5,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm11, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpbroadcastq (%r8), %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255] @@ -2291,17 +2291,17 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpbroadcastq 32(%r8), %ymm5 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX2-SLOW-NEXT: vpshufb %xmm11, %xmm12, %xmm0 ; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm12 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm12[1],xmm0[2,3],xmm12[4],xmm0[5],xmm12[6],xmm0[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm7, %xmm7 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,2,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0],xmm6[1],xmm7[2],xmm6[3],xmm7[4,5],xmm6[6],xmm7[7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm0, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm1[0,1,1,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255] @@ -2331,7 +2331,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[2,3,2,3,6,7,6,7] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm13 = ymm14[0],ymm13[1],ymm14[2],ymm13[3,4],ymm14[5,6,7,8],ymm13[9],ymm14[10],ymm13[11,12],ymm14[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm0, %ymm13, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm6[0,1,1,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm13, %ymm0 @@ -2367,7 +2367,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = 
ymm5[0,1],ymm6[2],ymm5[3],ymm6[4],ymm5[5,6],ymm6[7],ymm5[8,9],ymm6[10],ymm5[11],ymm6[12],ymm5[13,14],ymm6[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm5, %ymm1 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm13, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[1,1,1,2,5,5,5,6] @@ -2385,15 +2385,15 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpbroadcastq 16(%r8), %ymm1 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u] ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm11, %ymm5 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[3,0,3,0,7,4,7,4] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2],ymm5[3],ymm6[4,5],ymm5[6],ymm6[7,8],ymm5[9],ymm6[10],ymm5[11],ymm6[12,13],ymm5[14],ymm6[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u] ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm3, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm7[1,1,2,2] ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2,3],ymm3[4],ymm2[5],ymm3[6],ymm2[7,8],ymm3[9],ymm2[10,11],ymm3[12],ymm2[13],ymm3[14],ymm2[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm5, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm4 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm15[3,0,3,0,7,4,7,4] @@ -2439,20 +2439,20 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm6 ; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13 ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm13, %xmm9 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm14 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = 
<255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1]
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
@@ -2477,7 +2477,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
 ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm13, %xmm9
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1]
 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1
@@ -2509,7 +2509,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1
 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm11
 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm0
@@ -2540,7 +2540,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm1
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6]
@@ -2556,15 +2556,15 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0
 ; AVX2-FAST-NEXT: vpbroadcastq 16(%r8), %ymm2
 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm6
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3
 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm14, %ymm2
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4]
@@ -2609,20 +2609,20 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm7, %xmm0, %xmm6
 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm9
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm6 = xmm6[0],xmm9[1],xmm6[2,3],xmm9[4],xmm6[5],xmm9[6],xmm6[7]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm13
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm11
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm13, %xmm9
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm14
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm12 = xmm14[1,2,2,2]
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0],xmm12[1],xmm9[2],xmm12[3],xmm9[4,5],xmm12[6],xmm9[7]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm9[0,1,0,0]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm6, %ymm12, %ymm12
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm1[0,1,1,1]
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
@@ -2647,7 +2647,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm15 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm15, %xmm13, %xmm9
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm0, %ymm9, %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm2[0,1,1,1]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm7, %ymm9, %ymm1
@@ -2679,7 +2679,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm13 = ymm9[3,2,3,3,7,6,7,7]
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm11[0],ymm13[1],ymm11[2],ymm13[3,4],ymm11[5,6,7,8],ymm13[9],ymm11[10],ymm13[11,12],ymm11[13,14,15]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,2,3,2]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm1, %ymm11, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %ymm11
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm0
@@ -2710,7 +2710,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1],ymm7[2],ymm0[3],ymm7[4],ymm0[5,6],ymm7[7],ymm0[8,9],ymm7[10],ymm0[11],ymm7[12],ymm0[13,14],ymm7[15]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[1,1,1,2,5,5,5,6]
@@ -2726,15 +2726,15 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 16(%r8), %ymm2
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm10, %ymm6
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,0,3,0,7,4,7,4]
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4,5],ymm6[6],ymm7[7,8],ymm6[9],ymm7[10],ymm6[11],ymm7[12,13],ymm6[14],ymm7[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm4, %ymm4
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[1,1,2,2]
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm4[1],ymm3[2,3],ymm4[4],ymm3[5],ymm4[6],ymm3[7,8],ymm4[9],ymm3[10,11],ymm4[12],ymm3[13],ymm4[14],ymm3[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm6, %ymm3, %ymm3
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm2
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm15[3,0,3,0,7,4,7,4]
@@ -2806,25 +2806,25 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm0, %zmm19
 ; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm3, %zmm19
 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %ymm3
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
 ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm3, %ymm0
 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm22
 ; AVX512F-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm18
 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm18[1,1,2,2]
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm7[0],ymm0[1],ymm7[2,3],ymm0[4],ymm7[5],ymm0[6],ymm7[7,8],ymm0[9],ymm7[10,11],ymm0[12],ymm7[13],ymm0[14],ymm7[15]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
 ; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm8, %xmm8
 ; AVX512F-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm11
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0],xmm11[1],xmm8[2,3],xmm11[4],xmm8[5],xmm11[6],xmm8[7]
 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,0,1]
 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm8, %zmm11
 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %ymm0
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
 ; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm0, %ymm2
 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm4, %ymm23
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[3,0,3,0,7,4,7,4]
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm8[0],ymm2[1],ymm8[2],ymm2[3],ymm8[4,5],ymm2[6],ymm8[7,8],ymm2[9],ymm8[10],ymm2[11],ymm8[12,13],ymm2[14],ymm8[15]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
 ; AVX512F-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm10
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,2,2,2]
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0],xmm9[1],xmm10[2],xmm9[3],xmm10[4,5],xmm9[6],xmm10[7]
@@ -2926,7 +2926,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-LABEL: store_i16_stride5_vf32:
 ; AVX512F-FAST: # %bb.0:
 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %ymm5
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 =
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm0
 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm17
 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm7
@@ -2934,7 +2934,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3],ymm0[4],ymm1[5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10,11],ymm0[12],ymm1[13],ymm0[14],ymm1[15]
 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm10
 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm2
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
 ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm2, %xmm1
 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm2, %xmm20
 ; AVX512F-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm2
@@ -2943,7 +2943,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
 ; AVX512F-FAST-NEXT: vmovdqa (%rcx), %ymm1
 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %ymm8
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm2
 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm24
 ; AVX512F-FAST-NEXT: vmovdqa64 (%rdx), %ymm18
@@ -2951,7 +2951,7 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm9[3,0,3,0,7,4,7,4]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2],ymm2[3],ymm4[4,5],ymm2[6],ymm4[7,8],ymm2[9],ymm4[10],ymm2[11],ymm4[12,13],ymm2[14],ymm4[15]
 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm11
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
 ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm11, %xmm13
 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm6
 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} xmm15 = xmm6[1,2,2,2]
@@ -3090,43 +3090,43 @@ define void @store_i16_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2
 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3
 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
 ; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C
 ; AVX512BW-NEXT: kmovd %eax, %k1
 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm6 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31]
 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm6, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44]
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
 ; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318
 ; AVX512BW-NEXT: kmovd %eax, %k2
 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k2}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,38,3,4,5,6,39,8,9,10,11,40,13,14,15,16,41,18,19,20,21,42,23,24,25,26,43,28,29,30,31]
 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm7, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm8 {%k2}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31]
 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm8, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25]
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm8
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u]
 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm9
 ; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6
 ; AVX512BW-NEXT: kmovd %eax, %k2
 ; AVX512BW-NEXT: vmovdqu16 %zmm8, %zmm9 {%k2}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31]
 ; AVX512BW-NEXT: vpermi2w %zmm4, %zmm9, %zmm8
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u]
 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0
 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm0 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63]
@@ -4602,7 +4602,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,2,1,3]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm7[0,1,2,3,4,4,5,6]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm7, %ymm1, %ymm8
 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3]
 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm1
@@ -4645,17 +4645,17 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpbroadcastq 96(%r8), %ymm5
 ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm4, %ymm5, %ymm4
 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm8 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm11, %xmm4
 ; AVX2-SLOW-NEXT: vpbroadcastq 8(%rdi), %xmm5
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
 ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm9, %xmm5
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,2,2,2]
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm5[0],xmm3[1],xmm5[2],xmm3[3],xmm5[4,5],xmm3[6],xmm5[7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,0]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm4, %ymm3, %ymm3
 ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm10, %xmm4
 ; AVX2-SLOW-NEXT: vpbroadcastq 40(%rdi), %xmm5
@@ -4726,7 +4726,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3],ymm3[4],ymm2[5,6],ymm3[7],ymm2[8,9],ymm3[10],ymm2[11],ymm3[12],ymm2[13,14],ymm3[15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm2, %ymm10
 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %ymm0
 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4807,7 +4807,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm5[1],ymm1[2],ymm5[3,4],ymm1[5,6,7,8],ymm5[9],ymm1[10],ymm5[11,12],ymm1[13,14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,3,2,2]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm13[0,1,2,3,7,6,5,7,8,9,10,11,15,14,13,15]
@@ -4862,17 +4862,17 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpbroadcastq 120(%r8), %ymm0
 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm5, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm10, %ymm1
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[3,0,3,0,7,4,7,4]
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm5[0],ymm1[1],ymm5[2],ymm1[3],ymm5[4,5],ymm1[6],ymm5[7,8],ymm1[9],ymm5[10],ymm1[11],ymm5[12,13],ymm1[14],ymm5[15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm7, %ymm9
 ; AVX2-SLOW-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
 ; AVX2-SLOW-NEXT: # ymm14 = mem[1,1,2,2]
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm14[0],ymm9[1],ymm14[2,3],ymm9[4],ymm14[5],ymm9[6],ymm14[7,8],ymm9[9],ymm14[10,11],ymm9[12],ymm14[13],ymm9[14],ymm14[15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1
 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vpshufb %ymm0, %ymm7, %ymm9
@@ -4978,7 +4978,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm12, %xmm12
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3]
@@ -5019,17 +5019,17 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpbroadcastq 96(%r8), %ymm1
 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm0
 ; AVX2-FAST-NEXT: vpbroadcastq 8(%rdi), %xmm1
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm5
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5
 ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm1
 ; AVX2-FAST-NEXT: vpbroadcastq 40(%rdi), %xmm2
@@ -5101,7 +5101,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8
 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %ymm4
 ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -5179,7 +5179,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm6, %ymm13
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -5228,19 +5228,19 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vpbroadcastq 120(%r8), %ymm9
 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm7, %ymm11
 ; AVX2-FAST-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm2, %ymm15
 ; AVX2-FAST-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT: # ymm13 = mem[1,1,2,2]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11
 ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm4, %ymm13
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4]
@@ -5346,7 +5346,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,2,3,2,3,0,1,10,11,8,9,4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm12, %xmm12
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm12[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm0, %ymm15, %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm2[0],xmm14[1],xmm2[1],xmm14[2],xmm2[2],xmm14[3],xmm2[3]
@@ -5387,17 +5387,17 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 96(%r8), %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm0
 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 8(%rdi), %xmm1
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5],xmm1[6],xmm0[7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm8, %xmm5
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,2,2,2]
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3],xmm5[4,5],xmm7[6],xmm5[7]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,0]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm5
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm2, %xmm1
 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 40(%rdi), %xmm2
@@ -5469,7 +5469,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3],ymm4[4],ymm3[5,6],ymm4[7],ymm3[8,9],ymm4[10],ymm3[11],ymm4[12],ymm3[13,14],ymm4[15]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm2, %ymm3, %ymm8
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %ymm4
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -5547,7 +5547,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm14[1],ymm13[2],ymm14[3,4],ymm13[5,6,7,8],ymm14[9],ymm13[10],ymm14[11,12],ymm13[13,14,15]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,2]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,2,3,2]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u,255,255,255,255,0,0,0,0,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm10, %ymm13, %ymm10
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm6, %ymm13
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
@@ -5596,19 +5596,19 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq 120(%r8), %ymm9
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm0, %ymm9, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm7, %ymm11
 ; AVX2-FAST-PERLANE-NEXT: vpshufd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload
 ; AVX2-FAST-PERLANE-NEXT: # ymm14 = mem[3,0,3,0,7,4,7,4]
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2],ymm11[3],ymm14[4,5],ymm11[6],ymm14[7,8],ymm11[9],ymm14[10],ymm11[11],ymm14[12,13],ymm11[14],ymm14[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm2, %ymm15
 ; AVX2-FAST-PERLANE-NEXT: vpermq $165, {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Folded Reload
 ; AVX2-FAST-PERLANE-NEXT: # ymm13 = mem[1,1,2,2]
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm13 = ymm13[0],ymm15[1],ymm13[2,3],ymm15[4],ymm13[5],ymm15[6],ymm13[7,8],ymm15[9],ymm13[10,11],ymm15[12],ymm13[13],ymm15[14],ymm13[15]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255,255,255,u,u,0,0,0,0,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm11, %ymm13, %ymm11
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm13
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,0,3,0,7,4,7,4]
@@ -5693,14 +5693,14 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW: # %bb.0:
 ; AVX512F-SLOW-NEXT: subq $488, %rsp # imm = 0x1E8
 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %ymm11
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 =
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
 ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm0
 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm14
 ; AVX512F-SLOW-NEXT: vmovdqa64 96(%rdx), %ymm17
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm17[3,0,3,0,7,4,7,4]
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rcx), %xmm0
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
 ; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm0, %xmm3
 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdx), %xmm2
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[1,2,2,2]
@@ -5709,14 +5709,14 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1
 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %ymm9
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
 ; AVX512F-SLOW-NEXT: vpshufb %ymm3, %ymm9, %ymm1
 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20
 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %ymm5
 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm5[1,1,2,2]
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0],ymm1[1],ymm3[2,3],ymm1[4],ymm3[5],ymm1[6],ymm3[7,8],ymm1[9],ymm3[10,11],ymm1[12],ymm3[13],ymm1[14],ymm3[15]
 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm1
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
 ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm1, %xmm4
 ; AVX512F-SLOW-NEXT: vpbroadcastq 104(%rdi), %xmm6
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5],xmm6[6],xmm4[7]
@@ -5889,7 +5889,7 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0,1],ymm3[2],ymm1[3],ymm3[4],ymm1[5,6],ymm3[7],ymm1[8,9],ymm3[10],ymm1[11],ymm3[12],ymm1[13,14],ymm3[15]
 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm30
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
 ; AVX512F-SLOW-NEXT: vpshufb %ymm4, %ymm2, %ymm0
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[3,0,3,0,7,4,7,4]
 ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
@@ -6019,14 +6019,14 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST: # %bb.0:
 ; AVX512F-FAST-NEXT: subq $360, %rsp # imm = 0x168
 ; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %ymm8
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 =
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,12,13,u,u,0,1,u,u,u,u,14,15,u,u,u,u,28,29,u,u,16,17,u,u,u,u,30,31,u,u]
 ; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm8, %ymm0
 ; AVX512F-FAST-NEXT: vmovdqa64 96(%rdx), %ymm19
 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm19[3,0,3,0,7,4,7,4]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6],ymm1[7,8],ymm0[9],ymm1[10],ymm0[11],ymm1[12,13],ymm0[14],ymm1[15]
 ; AVX512F-FAST-NEXT: vmovdqa 96(%rcx), %xmm2
 ; AVX512F-FAST-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [10,11,u,u,6,7,u,u,8,9,8,9,u,u,8,9]
 ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm2, %xmm2
 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %xmm3
 ; AVX512F-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -6036,14 +6036,14 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %ymm3
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 =
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,0,1,u,u,u,u,14,15,u,u,2,3,u,u,u,u,16,17,u,u,u,u,30,31,u,u,18,19,u,u]
 ; AVX512F-FAST-NEXT: vpshufb %ymm12, %ymm3, %ymm1
 ; AVX512F-FAST-NEXT: vmovdqa64 96(%rdi), %ymm24
 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm24[1,1,2,2]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5],ymm1[6],ymm2[7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13],ymm1[14],ymm2[15]
 ; AVX512F-FAST-NEXT: vmovdqa 96(%rsi), %xmm4
 ; AVX512F-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [6,7,u,u,10,11,6,7,u,u,8,9,u,u,12,13]
 ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm4
 ; AVX512F-FAST-NEXT: vpbroadcastq 104(%rdi), %xmm5
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm5[1],xmm4[2,3],xmm5[4],xmm4[5],xmm5[6],xmm4[7]
@@ -6331,10 +6331,10 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm15
 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm5
 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm7
 ; AVX512BW-NEXT: vpermt2w %zmm6, %zmm16, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,32,u,u,u,1,33,u,u,u,2,34,u,u,u,3,35,u,u,u,4,36,u,u,u,5,37,u,u,u,6,38]
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
 ; AVX512BW-NEXT: vpermt2w %zmm4, %zmm17, %zmm3
 ; AVX512BW-NEXT: movl $415641996, %eax # imm = 0x18C6318C
@@ -6342,19 +6342,19 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqu16 %zmm7, %zmm3 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,2,3,32,5,6,7,8,33,10,11,12,13,34,15,16,17,18,35,20,21,22,23,36,25,26,27,28,37,30,31]
 ; AVX512BW-NEXT: vpermt2w %zmm5, %zmm18, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,u,26,58,u,u,u,27,59,u,u,u,28,60,u,u,u,29,61,u,u,u,30,62,u,u,u,31,63,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm9
 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm19, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [25,u,u,u,58,26,u,u,u,59,27,u,u,u,60,28,u,u,u,61,29,u,u,u,62,30,u,u,u,63,31,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm7
 ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm8, %zmm7
 ; AVX512BW-NEXT: vmovdqu16 %zmm9, %zmm7 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,57,2,3,4,5,58,7,8,9,10,59,12,13,14,15,60,17,18,19,20,61,22,23,24,25,62,27,28,29,30,63]
 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,19,51,u,u,u,20,52,u,u,u,21,53,u,u,u,22,54,u,u,u,23,55,u,u,u,24,56,u,u,u,25]
 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14
 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm21, %zmm14
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [19,u,u,u,52,20,u,u,u,53,21,u,u,u,54,22,u,u,u,55,23,u,u,u,56,24,u,u,u,57,25,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9
 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm9
 ; AVX512BW-NEXT: movl $-1939662650, %eax # imm = 0x8C6318C6
@@ -6362,10 +6362,10 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqu16 %zmm14, %zmm9 {%k3}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,51,4,5,6,7,52,9,10,11,12,53,14,15,16,17,54,19,20,21,22,55,24,25,26,27,56,29,30,31]
 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,u,u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm24
 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm23, %zmm24
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,13,45,u,u,u,14,46,u,u,u,15,47,u,u,u,16,48,u,u,u,17,49,u,u,u,18,50,u,u,u,19]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14
 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm25, %zmm14
 ; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318
@@ -6373,10 +6373,10 @@ define void @store_i16_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqu16 %zmm24, %zmm14 {%k2}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [44,1,2,3,4,45,6,7,8,9,46,11,12,13,14,47,16,17,18,19,48,21,22,23,24,49,26,27,28,29,50,31]
 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm24, %zmm14
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27
 ; AVX512BW-NEXT: vpermt2w %zmm11, %zmm26, %zmm27
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [6,38,u,u,u,7,39,u,u,u,8,40,u,u,u,9,41,u,u,u,10,42,u,u,u,11,43,u,u,u,12,44]
 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm29
 ; AVX512BW-NEXT: vpermt2w %zmm15, %zmm28, %zmm29
 ; AVX512BW-NEXT: vmovdqu16 %zmm27, %zmm29 {%k2}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
index b2c0e00825e63..962c7d942e0dc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-6.ll
@@ -185,7 +185,7 @@ define void @store_i16_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1]
 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u]
 ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm0
 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm1
 ; AVX512BW-NEXT: vmovq %xmm1, 16(%rax)
@@ -458,7 +458,7 @@ define void @store_i16_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
 ; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512BW-NEXT: vinserti32x4 $2, %xmm2, %zmm0, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,1,5,9,13,17,21,2,6,10,14,18,22,3,7,11,15,19,23,u,u,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512BW-NEXT: vmovdqa %ymm0, (%rax)
@@ -1334,7 +1334,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm5, %ymm12, %ymm5
 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm14
 ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm12
 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm14 = ymm12[0],ymm14[0],ymm12[1],ymm14[1],ymm12[2],ymm14[2],ymm12[3],ymm14[3],ymm12[8],ymm14[8],ymm12[9],ymm14[9],ymm12[10],ymm14[10],ymm12[11],ymm14[11]
@@ -1515,7 +1515,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,2,1]
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm6, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm5
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm9, %ymm3
 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm3[0],ymm5[0],ymm3[1],ymm5[1],ymm3[2],ymm5[2],ymm3[3],ymm5[3],ymm3[8],ymm5[8],ymm3[9],ymm5[9],ymm3[10],ymm5[10],ymm3[11],ymm5[11]
@@ -1562,7 +1562,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[2,1,2,3,6,5,6,7]
 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,3,2,1,4,5,6,7,8,11,10,9,12,13,14,15]
 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm8[0],ymm7[0],ymm8[1],ymm7[1],ymm8[2],ymm7[2],ymm8[3],ymm7[3],ymm8[8],ymm7[8],ymm8[9],ymm7[9],ymm8[10],ymm7[10],ymm8[11],ymm7[11]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,5,12,u,4,13,u,7]
 ; AVX512F-SLOW-NEXT: vpermi2d %ymm6, %ymm7, %ymm8
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm3[1,2,2,3,5,6,6,7]
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
@@ -1571,7 +1571,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0],ymm8[1,2],ymm6[3],ymm8[4,5],ymm6[6],ymm8[7]
 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm7 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15]
 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} ymm8 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <5,u,14,6,u,15,7,u>
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [5,u,14,6,u,15,7,u]
 ; AVX512F-SLOW-NEXT: vpermi2d %ymm7, %ymm8, %ymm9
 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm7
 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [8,21,10,11,22,13,14,23]
@@ -1584,7 +1584,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm8
 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm10
 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <0,8,u,1,9,u,2,10>
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,1,9,u,2,10]
 ; AVX512F-SLOW-NEXT: vpermi2d %ymm9, %ymm11, %ymm13
 ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm9
 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm11
@@ -1599,7 +1599,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm10[0,1,2,1]
 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm12[0,1,2,3,4,7,6,5]
 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 =
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3]
 ; AVX512F-SLOW-NEXT: vpermi2d %ymm14, %ymm12, %ymm15
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm9[1,2,2,3]
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm14 = xmm11[1,2,2,3]
@@ -1609,14 +1609,14 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm12, %zmm13, %zmm12
 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <4,12,u,5,13,u,6,14>
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [4,12,u,5,13,u,6,14]
 ; AVX512F-SLOW-NEXT: vpermi2d %ymm4, %ymm0, %ymm2
 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11]
 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2]
 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1],ymm0[2],ymm2[3,4],ymm0[5],ymm2[6,7]
 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <1,u,10,2,u,11,3,u>
+; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [1,u,10,2,u,11,3,u]
 ; AVX512F-SLOW-NEXT: vpermi2d %ymm1, %ymm2, %ymm3
 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,9,2,3,10,5,6,11]
@@ -1643,7 +1643,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm8
 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm10
 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm10[0],xmm8[0],xmm10[1],xmm8[1],xmm10[2],xmm8[2],xmm10[3],xmm8[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,8,u,1,9,u,2,10>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,8,u,1,9,u,2,10]
 ; AVX512F-FAST-NEXT: vpermi2d %ymm9, %ymm11, %ymm12
 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm9
 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm11
@@ -1657,7 +1657,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm13 = xmm6[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} xmm15 = xmm7[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero
 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 =
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [u,1,8,u,0,9,u,3]
 ; AVX512F-FAST-NEXT: vpermi2d %ymm13, %ymm12, %ymm15
 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm15, %zmm0, %zmm12
 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [16,9,10,17,12,13,18,15]
@@ -1668,14 +1668,14 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm13, %zmm14, %zmm0
 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm10[4],xmm8[4],xmm10[5],xmm8[5],xmm10[6],xmm8[6],xmm10[7],xmm8[7]
 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <1,u,10,2,u,11,3,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,u,10,2,u,11,3,u]
 ; AVX512F-FAST-NEXT: vpermi2d %ymm8, %ymm6, %ymm7
 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7]
 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,9,2,3,10,5,6,11]
 ; AVX512F-FAST-NEXT: vpermi2d %ymm6, %ymm7, %ymm8
 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm6 = ymm4[0],ymm5[0],ymm4[1],ymm5[1],ymm4[2],ymm5[2],ymm4[3],ymm5[3],ymm4[8],ymm5[8],ymm4[9],ymm5[9],ymm4[10],ymm5[10],ymm4[11],ymm5[11]
 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm2[0],ymm3[0],ymm2[1],ymm3[1],ymm2[2],ymm3[2],ymm2[3],ymm3[3],ymm2[8],ymm3[8],ymm2[9],ymm3[9],ymm2[10],ymm3[10],ymm2[11],ymm3[11]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <4,12,u,5,13,u,6,14>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [4,12,u,5,13,u,6,14]
 ; AVX512F-FAST-NEXT: vpermi2d %ymm6, %ymm7, %ymm9
 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6
 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,20,11,12,21,14,15]
@@ -1683,14 +1683,14 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm9 = ymm10[0],ymm1[0],ymm10[1],ymm1[1],ymm10[2],ymm1[2],ymm10[3],ymm1[3],ymm10[8],ymm1[8],ymm10[9],ymm1[9],ymm10[10],ymm1[10],ymm10[11],ymm1[11]
 ; AVX512F-FAST-NEXT: vpermi2d %zmm9, %zmm6, %zmm7
 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm6
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 =
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm3, %ymm8
 ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm7
 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm7 = ymm7[0],ymm8[0],ymm7[1],ymm8[1],ymm7[2],ymm8[2],ymm7[3],ymm8[3],ymm7[8],ymm8[8],ymm7[9],ymm8[9],ymm7[10],ymm8[10],ymm7[11],ymm8[11]
 ; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} ymm8 = ymm5[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm5[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512F-FAST-NEXT: vpsrldq {{.*#+}} ymm9 = ymm4[6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,ymm4[22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero,zero
 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm9[0],ymm8[0],ymm9[1],ymm8[1],ymm9[2],ymm8[2],ymm9[3],ymm8[3],ymm9[8],ymm8[8],ymm9[9],ymm8[9],ymm9[10],ymm8[10],ymm9[11],ymm8[11]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [u,5,12,u,4,13,u,7]
 ; AVX512F-FAST-NEXT: vpermi2d %ymm8, %ymm7, %ymm9
 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm1[1,2,2,3,5,6,6,7]
 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm8 = ymm16[1,2,2,3,5,6,6,7]
@@ -1699,7 +1699,7 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vpermi2d %ymm7, %ymm9, %ymm8
 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm2 = ymm2[4],ymm3[4],ymm2[5],ymm3[5],ymm2[6],ymm3[6],ymm2[7],ymm3[7],ymm2[12],ymm3[12],ymm2[13],ymm3[13],ymm2[14],ymm3[14],ymm2[15],ymm3[15]
 ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm4[4],ymm5[4],ymm4[5],ymm5[5],ymm4[6],ymm5[6],ymm4[7],ymm5[7],ymm4[12],ymm5[12],ymm4[13],ymm5[13],ymm4[14],ymm5[14],ymm4[15],ymm5[15]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <5,u,14,6,u,15,7,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [5,u,14,6,u,15,7,u]
 ; AVX512F-FAST-NEXT: vpermi2d %ymm2, %ymm3, %ymm4
 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm0, %zmm2
 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,21,10,11,22,13,14,23]
@@ -1721,15 +1721,15 @@ define void @store_i16_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,16,32,48,u,u,1,17,33,49,u,u,2,18,34,50,u,u,3,19,35,51,u,u,4,20,36,52,u,u,5,21>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,32,48,u,u,1,17,33,49,u,u,2,18,34,50,u,u,3,19,35,51,u,u,4,20,36,52,u,u,5,21]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,32,48,6,7,8,9,33,49,12,13,14,15,34,50,18,19,20,21,35,51,24,25,26,27,36,52,30,31]
 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <5,21,u,u,38,54,6,22,u,u,39,55,7,23,u,u,40,56,8,24,u,u,41,57,9,25,u,u,42,58,10,26>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,21,u,u,38,54,6,22,u,u,39,55,7,23,u,u,40,56,8,24,u,u,41,57,9,25,u,u,42,58,10,26]
 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm3
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,37,53,4,5,6,7,38,54,10,11,12,13,39,55,16,17,18,19,40,56,22,23,24,25,41,57,28,29,30,31]
 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,11,27,43,59,u,u,12,28,44,60,u,u,13,29,45,61,u,u,14,30,46,62,u,u,15,31,47,63,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [42,58,2,3,4,5,43,59,8,9,10,11,44,60,14,15,16,17,45,61,20,21,22,23,46,62,26,27,28,29,47,63]
 ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm3, %zmm0
@@ -2606,7 +2606,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm6[1,2,3,3,5,6,7,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3]
 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm6[0,1],ymm1[2],ymm6[3,4],ymm1[5],ymm6[6,7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm2, %ymm14
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3]
@@ -2792,7 +2792,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %ymm1
 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2
 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11]
@@ -2847,7 +2847,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm0, %ymm2, %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <1,2,1,2,u,u,3,3>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [1,2,1,2,u,u,3,3]
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0
 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm6 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[1,1,1,1]
@@ -2889,12 +2889,12 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: # ymm3 = ymm3[4],mem[4],ymm3[5],mem[5],ymm3[6],mem[6],ymm3[7],mem[7],ymm3[12],mem[12],ymm3[13],mem[13],ymm3[14],mem[14],ymm3[15],mem[15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[3,3,3,3]
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm3[2],ymm0[3,4],ymm3[5],ymm0[6,7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm4, %ymm6
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,2,3]
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3],ymm6[4],ymm0[5,6],ymm6[7]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm4, %ymm13
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,1,2,3]
@@ -3068,7 +3068,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm11
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %ymm13
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm11, %ymm2
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3165,7 +3165,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm11, %ymm7
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,1,2,3]
@@ -3307,7 +3307,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm2, %ymm0
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 32(%r8), %ymm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
 ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm4, %ymm1, %ymm3
 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3]
 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3],ymm3[4],ymm0[5,6],ymm3[7]
@@ -3367,7 +3367,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm14
 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm7 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11]
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6]
 ; AVX512F-ONLY-SLOW-NEXT: vpermt2d %zmm7, %zmm24, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rsi), %xmm11
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdi), %xmm15
@@ -3512,7 +3512,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST: # %bb.0:
 ; AVX512F-ONLY-FAST-NEXT: pushq %rax
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rsi), %ymm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 =
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u]
 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm1
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%rdi), %ymm4
 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm3
@@ -3537,7 +3537,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm3, %zmm1 {%k1}
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,21,10,11,20,13,14,23]
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r8), %ymm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 =
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31]
 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm3
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm1, %zmm4
 ; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm3, %zmm11, %zmm4
@@ -3547,13 +3547,13 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm17, %ymm1
 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm16 = zmm1[0,1,2,3],zmm4[0,1,2,3]
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %ymm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 =
+; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31]
 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm14, %ymm3, %ymm1
 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25]
 ; AVX512F-ONLY-FAST-NEXT: # ymm15 = mem[0,1,0,1]
 ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm3, %ymm2
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm26
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <2,2,u,3,10,u,10,11>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [2,2,u,3,10,u,10,11]
 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm2
 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rsi), %ymm9
@@ -3632,7 +3632,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3]
 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm21, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 =
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,0,0,u,8,8,u,9]
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 32(%r9), %xmm12
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9]
 ; AVX512F-ONLY-FAST-NEXT: vpshufb
%xmm7, %xmm12, %xmm4 @@ -3661,7 +3661,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm8[4],xmm0[5],xmm8[5],xmm0[6],xmm8[6],xmm0[7],xmm8[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [1,1,1,1,10,10,10,11] ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm7, %zmm15, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14] ; AVX512F-ONLY-FAST-NEXT: vpermd %zmm18, %zmm7, %zmm18 ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm18 {%k1} ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [8,9,20,11,12,21,14,15] @@ -3675,7 +3675,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm4 = ymm11[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm8 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,u,0,1,u,10,10,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,u,0,1,u,10,10,u] ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm4, %zmm5, %zmm8 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm24, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm25, %ymm9 @@ -3824,7 +3824,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 32(%r8), %ymm6 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm10, %ymm6, %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 @@ -3875,7 +3875,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpshuflw {{.*#+}} ymm10 = ymm10[0,2,2,1,4,5,6,7,8,10,10,9,12,13,14,15] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm10[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[8],ymm3[8],ymm1[9],ymm3[9],ymm1[10],ymm3[10],ymm1[11],ymm3[11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm10, %zmm3 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm12[4],xmm15[5],xmm12[5],xmm15[6],xmm12[6],xmm15[7],xmm12[7] ; AVX512DQ-SLOW-NEXT: vmovdqa (%r9), %ymm1 @@ -3983,7 +3983,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7] ; AVX512DQ-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14] ; AVX512DQ-FAST-NEXT: vpermd %zmm2, %zmm21, %zmm19 ; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 @@ -4008,7 +4008,7 @@ define void 
@store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,9,2,3,8,5,6,11] ; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm6, %ymm19 ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm7 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,u,0,1,u,10,10,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,u,0,1,u,10,10,u] ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm7[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %xmm0 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4036,7 +4036,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa 32(%r9), %ymm5 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm5[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm18, %zmm13 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm15 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm15[0],ymm0[0],ymm15[1],ymm0[1],ymm15[2],ymm0[2],ymm15[3],ymm0[3],ymm15[8],ymm0[8],ymm15[9],ymm0[9],ymm15[10],ymm0[10],ymm15[11],ymm0[11] @@ -4053,7 +4053,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm1, %zmm4, %zmm0 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,21,10,11,20,13,14,23] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm0, %zmm9 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm3, %ymm15 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm1, %ymm16 ; AVX512DQ-FAST-NEXT: vpermt2d %zmm15, %zmm4, %zmm9 @@ -4067,12 +4067,12 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm3[0],ymm0[0],ymm3[1],ymm0[1],ymm3[2],ymm0[2],ymm3[3],ymm0[3],ymm3[8],ymm0[8],ymm3[9],ymm0[9],ymm3[10],ymm0[10],ymm3[11],ymm0[11] ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm9[4],ymm14[4],ymm9[5],ymm14[5],ymm9[6],ymm14[6],ymm9[7],ymm14[7],ymm9[12],ymm14[12],ymm9[13],ymm14[13],ymm9[14],ymm14[14],ymm9[15],ymm14[15] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm5, %ymm6 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512DQ-FAST-NEXT: # ymm9 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpshufb %ymm9, %ymm5, %ymm2 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,2,u,3,10,u,10,11> +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,2,u,3,10,u,10,11] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm6, %zmm5, %zmm2 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm27, %ymm14 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm26, %ymm1 @@ -4141,7 +4141,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm31, %xmm7 ; AVX512DQ-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm4 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm7[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 
= +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,0,0,u,8,8,u,9] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm4, %zmm9, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm28, %xmm11 ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm4 = xmm11[2,1,3,3,4,5,6,7] @@ -4192,7 +4192,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movw $9362, %cx # imm = 0x2492 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,32,6,7,8,9,10,33,12,13,14,15,16,34,18,19,20,21,22,35,24,25,26,27,28,36,30,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm8, %zmm6 @@ -4205,7 +4205,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movw $18724, %cx # imm = 0x4924 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm8, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,37,4,5,6,7,8,38,10,11,12,13,14,39,16,17,18,19,20,40,22,23,24,25,26,41,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm8 @@ -4216,7 +4216,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm9, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,42,2,3,4,5,6,43,8,9,10,11,12,44,14,15,16,17,18,45,20,21,22,23,24,46,26,27,28,29,30,47] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm9 @@ -4227,7 +4227,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm10 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm10, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,48,6,7,8,9,10,49,12,13,14,15,16,50,18,19,20,21,22,51,24,25,26,27,28,52,30,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm10 @@ -4238,7 +4238,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm5, %zmm4, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm11, %zmm7 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,53,4,5,6,7,8,54,10,11,12,13,14,55,16,17,18,19,20,56,22,23,24,25,26,57,28,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm7, %zmm11 @@ -4249,7 +4249,7 @@ define void @store_i16_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm4 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,58,2,3,4,5,6,59,8,9,10,11,12,60,14,15,16,17,18,61,20,21,22,23,24,62,26,27,28,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 @@ -6185,7 +6185,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,2,3,3,5,6,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm0, %ymm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] @@ -6413,7 +6413,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm3[0],ymm2[0],ymm3[1],ymm2[1],ymm3[2],ymm2[2],ymm3[3],ymm2[3],ymm3[8],ymm2[8],ymm3[9],ymm2[9],ymm3[10],ymm2[10],ymm3[11],ymm2[11] @@ -6667,7 +6667,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,2,1,2,u,u,3,3> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,2,1,2,u,u,3,3] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[1,1,1,1] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4],ymm1[5],ymm0[6,7] @@ -6745,12 +6745,12 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[3,3,3,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1],ymm2[2],ymm1[3,4],ymm2[5],ymm1[6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = 
[u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm1[0],ymm4[1],ymm1[2,3],ymm4[4],ymm1[5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,1,2,3] @@ -7139,7 +7139,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm14, %ymm6 @@ -7314,12 +7314,12 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,2,3,3,5,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm0, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm3 = ymm1[0],ymm3[1],ymm1[2,3],ymm3[4],ymm1[5,6],ymm3[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm0, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,2,3] @@ -7447,7 +7447,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqa32 %zmm0, %zmm1 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vextracti64x4 $1, %zmm1, %ymm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 96(%r8), %ymm5 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512F-ONLY-SLOW-NEXT: vpshufb %ymm7, %ymm5, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,2,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2,3],ymm2[4],ymm0[5,6],ymm2[7] @@ -7593,7 +7593,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 = zmm14[0,1,2,3],zmm7[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill 
; AVX512F-ONLY-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rcx), %xmm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm14 ; AVX512F-ONLY-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm14[4],xmm7[4],xmm14[5],xmm7[5],xmm14[6],xmm7[6],xmm14[7],xmm7[7] @@ -7945,7 +7945,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rsi), %ymm12 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm6, %ymm12, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%rdi), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -7969,7 +7969,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r8), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm13, %ymm1, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm13, %ymm8 ; AVX512F-ONLY-FAST-NEXT: vpermt2d %zmm4, %zmm25, %zmm3 @@ -7978,10 +7978,10 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm4, %ymm29, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm0[0,1,2,3],zmm3[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = <2,2,u,3,10,u,10,11> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,2,u,3,10,u,10,11] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %ymm1 ; AVX512F-ONLY-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm15, %ymm1, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm14 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512F-ONLY-FAST-NEXT: # ymm14 = mem[0,1,0,1] @@ -8054,7 +8054,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpermt2d %ymm3, %ymm16, %ymm4 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm4[0,1,2,3],zmm2[0,1,2,3] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,0,0,u,8,8,u,9] ; AVX512F-ONLY-FAST-NEXT: vmovdqa 96(%r9), %xmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; 
AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,4,5,6,7,8,9,8,9,8,9,8,9] @@ -8216,7 +8216,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [1,1,1,1,10,10,10,11] ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm2, %zmm9, %zmm0 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14] ; AVX512F-ONLY-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm15, %zmm2 # 64-byte Folded Reload ; AVX512F-ONLY-FAST-NEXT: vmovdqa32 %zmm0, %zmm2 {%k1} ; AVX512F-ONLY-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm3[0,1,1,3,4,5,6,7,8,9,9,11,12,13,14,15] @@ -8231,7 +8231,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [8,9,12,13,12,13,10,11,8,9,10,11,12,13,14,15] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %xmm0, %xmm30 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,u,0,1,u,10,10,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,u,0,1,u,10,10,u] ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm5, %zmm27, %zmm3 ; AVX512F-ONLY-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX512F-ONLY-FAST-NEXT: vpunpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm5 # 32-byte Folded Reload @@ -8529,7 +8529,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-SLOW-NEXT: vinserti32x8 $1, %ymm3, %zmm2, %zmm0 {%k1} ; AVX512DQ-SLOW-NEXT: vmovdqa 96(%r8), %ymm9 -; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512DQ-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512DQ-SLOW-NEXT: vpshufb %ymm13, %ymm9, %ymm1 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,2,3] ; AVX512DQ-SLOW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 @@ -8650,7 +8650,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: vpshufhw {{.*#+}} xmm13 = xmm13[0,1,2,3,4,4,4,4] ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm31 = ymm13[0,0,2,1] ; AVX512DQ-SLOW-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm4[0],ymm2[1],ymm4[1],ymm2[2],ymm4[2],ymm2[3],ymm4[3],ymm2[8],ymm4[8],ymm2[9],ymm4[9],ymm2[10],ymm4[10],ymm2[11],ymm4[11] -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6> +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [17,18,17,18,u,u,19,19,5,4,2,2,5,4,6,6] ; AVX512DQ-SLOW-NEXT: vpermt2d %zmm0, %zmm4, %zmm2 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %xmm18, %xmm0 ; AVX512DQ-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] @@ -8904,7 +8904,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512DQ-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm18, %zmm3 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14> +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [1,2,1,2,u,u,3,3,13,12,10,10,13,12,14,14] ; AVX512DQ-FAST-NEXT: vpermd 
%zmm0, %zmm20, %zmm22 ; AVX512DQ-FAST-NEXT: movw $18724, %ax # imm = 0x4924 ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 @@ -8922,7 +8922,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [12,13,10,11,10,11,14,15,14,15,14,15,14,15,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %xmm15, %xmm0, %xmm0 ; AVX512DQ-FAST-NEXT: vpermt2d %ymm0, %ymm19, %ymm22 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,u,0,1,u,10,10,u> +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,u,0,1,u,10,10,u] ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] @@ -9014,7 +9014,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,2,1,4,5,6,7,8,8,10,9,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm23, %zmm1 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,4,5,10,11,u,u,u,u,u,u,u,u,24,25,22,23,20,21,26,27,u,u,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm9, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm5, %ymm1 ; AVX512DQ-FAST-NEXT: vpunpcklwd {{.*#+}} ymm8 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] @@ -9033,7 +9033,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vinserti32x8 $1, %ymm0, %zmm1, %zmm8 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} ymm23 = [8,21,10,11,20,13,14,23] ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm0 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,14,15,14,15,14,15,14,15,28,29,26,27,26,27,30,31,30,31,30,31,30,31,30,31] ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm10, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm9 ; AVX512DQ-FAST-NEXT: vpermt2d %zmm1, %zmm23, %zmm0 @@ -9042,14 +9042,14 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpermt2d %ymm1, %ymm30, %ymm8 ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm0[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX512DQ-FAST-NEXT: vpshufb %ymm1, %ymm11, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm1, %ymm8 ; AVX512DQ-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25,16,17,20,21,20,21,22,23,24,25,24,25,24,25,24,25] ; AVX512DQ-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512DQ-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm1 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm2, %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <2,2,u,3,10,u,10,11> +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [2,2,u,3,10,u,10,11] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm31, %zmm1 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm13, %ymm0 ; AVX512DQ-FAST-NEXT: vpshufb %ymm15, %ymm12, %ymm2 @@ -9112,7 +9112,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm8 ; 
AVX512DQ-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm5[0,0,2,1,4,5,6,7] -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,0,0,u,8,8,u,9] ; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm18, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX512DQ-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm3 @@ -9174,7 +9174,7 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: vpermt2q %zmm8, %zmm29, %zmm5 ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm29 = zmm3[0,1,2,3],zmm6[0,1,2,3] ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,8,9,10,11,12,13,14,15,24,25,28,29,28,29,26,27,24,25,26,27,28,29,30,31] ; AVX512DQ-FAST-NEXT: vpshufb %ymm13, %ymm3, %ymm8 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm11, %ymm15 ; AVX512DQ-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm3 @@ -9374,22 +9374,22 @@ define void @store_i16_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm4 {%k1} ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,2,3,32,u,6,7,8,9,33,u,12,13,14,15,34,u,18,19,20,21,35,u,24,25,26,27,36,u,30,31] ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm15, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [58,u,2,3,4,5,59,u,8,9,10,11,60,u,14,15,16,17,61,u,20,21,22,23,62,u,26,27,28,29,63,u] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm4 ; AVX512BW-NEXT: vmovdqa32 %zmm26, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,53,u,4,5,6,7,54,u,10,11,12,13,55,u,16,17,18,19,56,u,22,23,24,25,57,u,28,29,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm20, %zmm6 ; AVX512BW-NEXT: vmovdqa32 %zmm25, %zmm16 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,48,u,6,7,8,9,49,u,12,13,14,15,50,u,18,19,20,21,51,u,24,25,26,27,52,u,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm22, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm0 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [42,u,2,3,4,5,43,u,8,9,10,11,44,u,14,15,16,17,45,u,20,21,22,23,46,u,26,27,28,29,47,u] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm7, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,37,u,4,5,6,7,38,u,10,11,12,13,39,u,16,17,18,19,40,u,22,23,24,25,41,u,28,29,30,31] ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm5 {%k1} ; AVX512BW-NEXT: vpermt2w %zmm12, %zmm19, %zmm5 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll index f3ec4420e4dda..d3fb3bdc28da1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-7.ll @@ -139,7 +139,7 @@ define void @store_i16_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0 ; AVX512BW-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[4],ymm1[4],ymm0[5],ymm1[5] ; AVX512BW-NEXT: vinserti128 $1, (%r10), %ymm2, %ymm1 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2,16,18,8,10,24,1,3,17,19,9,11,25,u,u] ; AVX512BW-NEXT: vpermi2w %ymm1, %ymm0, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm0 ; AVX512BW-NEXT: vpextrd $2, %xmm0, 24(%rax) @@ -329,9 +329,9 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[3,1,2,1] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[2,0,2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -357,7 +357,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <5,7,1,3,7,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,7,1,3,7,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] @@ -414,9 +414,9 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[14,15,14,15,14,15,14,15,4,5,6,7,14,15,14,15,30,31,30,31,30,31,30,31,20,21,22,23,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm2 = xmm3[4,5,12,13,4,5,6,7,8,9,10,11,4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,0,0,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, (%rax) ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -481,7 +481,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] ; AVX512F-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] @@ -496,7 +496,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm2[2,3,0,1] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[0,1,8,9,u,u,u,u,u,u],zero,zero,ymm3[26,27],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,20,21,28,29] ; AVX512F-FAST-NEXT: vpternlogq $168, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <5,7,1,3,7,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,7,1,3,7,u,u,u] ; AVX512F-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,4,5],zero,zero,zero,zero,zero,zero,ymm1[10,11,14,15,2,3,18,19],zero,zero,zero,zero,zero,zero,ymm1[u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 @@ -524,7 +524,7 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 ; AVX512BW-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,u,u,u,u> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,32,40,36,1,5,9,13,33,41,37,2,6,10,14,34,42,38,3,7,11,15,35,43,39,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpermi2w %zmm0, %zmm2, %zmm1 ; AVX512BW-SLOW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax) ; AVX512BW-SLOW-NEXT: vextracti32x4 $3, %zmm1, %xmm0 @@ -548,10 +548,10 @@ define void @store_i16_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] ; AVX512BW-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6,10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u] ; AVX512BW-FAST-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax) ; AVX512BW-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm1 @@ -840,12 +840,12 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11,12,13],ymm10[14],ymm6[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] 
; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 @@ -857,7 +857,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 @@ -868,7 +868,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 @@ -904,14 +904,14 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm9 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm7[0,2,1,3] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[6,7,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[16,17,24,25],zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <1,5,u,u,5,2,6,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [1,5,u,u,5,2,6,u] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[0,1,4,5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[2,3,18,19],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm10[20,21,24,25] ; AVX2-FAST-NEXT: vpor %ymm6, %ymm10, %ymm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = 
ymm9[0,2,0,2] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 @@ -923,7 +923,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] ; AVX2-FAST-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 @@ -934,7 +934,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 @@ -974,12 +974,12 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm6[0],ymm10[1],ymm6[2,3,4,5],ymm10[6],ymm6[7,8],ymm10[9],ymm6[10,11,12,13],ymm10[14],ymm6[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm7[0,2,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,16,17,24,25,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm6, %ymm10, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm9[0,2,0,2] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,22,23,30,31,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%r10), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = 
[255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6 @@ -991,7 +991,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[1,3,3,1] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm9[0,1,8,9,u,u,u,u,u,u,u,u,u,u,2,3,18,19,u,u,u,u,u,u,u,u,u,u,28,29,20,21] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%r10), %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm12, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 @@ -1002,7 +1002,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,18,19,26,27,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%r10), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm7 @@ -1128,7 +1128,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm6[0,2,2,0] ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,1,8,9],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,2,3,18,19],zero,zero,zero,zero,ymm3[u,u,u,u,u,u,28,29,20,21] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <1,5,u,u,5,2,6,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [1,5,u,u,5,2,6,u] ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm4, %ymm4 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,1,4,5,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[2,3,18,19,u,u,u,u,u,u],zero,zero,zero,zero,ymm4[20,21,24,25] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm4, %zmm3, %zmm3 @@ -1162,7 +1162,7 @@ define void @store_i16_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,8,16,24,32,40,48,1,9,17,25,33,41,49,2,10,18,26,34,42,50,3,11,19,27,35,43,51,4,12,20,28] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [36,44,52,5,13,21,29,37,45,53,6,14,22,30,38,46,54,7,15,23,31,39,47,55,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm3 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512BW-NEXT: vmovdqa64 %zmm2, (%rax) @@ -1781,32 +1781,32 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-SLOW-NEXT: 
vmovdqa (%rax), %ymm1
 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4]
 ; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm6[0,3,2,3,4,7,6,7]
 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm7
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm8[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,u,u,u,4,u,u]
 ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm8, %ymm8
 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm6
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm13[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,3,u,u,u,4,u]
 ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm8, %ymm8
 ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm4
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm2[0,1,0,3,4,5,4,7]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [3,u,u,3,u,u,u,4]
 ; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm9, %ymm9
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
@@ -1822,7 +1822,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[2,1,2,3,4,5,6,7]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,5,4]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm8, %ymm0, %ymm1
 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm9
 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm14
@@ -1830,7 +1830,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
 ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm0
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
@@ -1844,14 +1844,14 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1
 ; AVX2-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm9 = xmm0[0,1,2,3,4,5,7,6]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[0,2,3,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
 ; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm14
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1
@@ -1869,7 +1869,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1
 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm2[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
@@ -1878,7 +1878,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm15 = ymm4[3,3,3,3,7,7,7,7]
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1
 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm4[2,3,3,3,6,7,7,7]
@@ -1892,13 +1892,13 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[0,1,3,2,4,5,6,7]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm10 = xmm10[0,0,1,1]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
 ; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm10
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10
@@ -1913,7 +1913,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm11[2,2,2,2,6,6,6,6]
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm1[0],ymm11[1],ymm1[2,3],ymm11[4],ymm1[5,6,7,8],ymm11[9],ymm1[10,11],ymm11[12],ymm1[13,14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm12, %ymm1, %ymm3
 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm15
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[1,2,2,3,5,6,6,7]
@@ -1925,7 +1925,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm13
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm4[0,1,2,2,4,5,6,6]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm7
@@ -1939,7 +1939,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[0,0,0,0,4,4,4,4]
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
@@ -1948,7 +1948,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
@@ -1986,7 +1986,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm5[2,2,2,2,6,6,6,6]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,5,2,2,6,6,6,6]
 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8
@@ -1994,7 +1994,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm10 = ymm15[1,2,2,3,5,6,6,7]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1],ymm9[2],ymm10[3,4],ymm9[5],ymm10[6,7,8,9],ymm9[10],ymm10[11,12],ymm9[13],ymm10[14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
@@ -2007,7 +2007,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0,1],ymm8[2],ymm9[3,4],ymm8[5],ymm9[6,7,8,9],ymm8[10],ymm9[11,12],ymm8[13],ymm9[14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,4,5,4,5,5,7]
 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm8, %ymm8
@@ -2016,31 +2016,31 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm9 = ymm10[0,1,2],ymm9[3],ymm10[4,5],ymm9[6],ymm10[7,8,9,10],ymm9[11],ymm10[12,13],ymm9[14],ymm10[15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4]
 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,u,u,u,4,u,u]
 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,3,u,u,u,4,u]
 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm8, %ymm8
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [3,u,u,3,u,u,u,4]
 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm9, %ymm9
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
@@ -2055,7 +2055,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7]
 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm8, %ymm11
 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm8
 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm2
@@ -2063,7 +2063,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
 ; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm0, %ymm11, %ymm11
@@ -2075,13 +2075,13 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm9[0,1],xmm1[2],xmm9[3,4],xmm1[5],xmm9[6,7]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,3]
 ; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm2, %ymm8, %ymm2
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm2, %ymm8
@@ -2093,13 +2093,13 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm15[3,3,3,3,7,7,7,7]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [6,7,3,3,7,7,6,7]
 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload
@@ -2111,12 +2111,12 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
 ; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm3, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
@@ -2148,28 +2148,28 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4]
 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,u,u,u,4,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm8, %ymm8
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm4, %ymm12
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm5, %ymm4
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,3,u,u,u,4,u]
 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm8, %ymm8
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,u,u,3,u,u,u,4>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [3,u,u,3,u,u,u,4]
 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm9, %ymm9
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
@@ -2184,7 +2184,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm5[4],xmm3[4],xmm5[5],xmm3[5],xmm5[6],xmm3[6],xmm5[7],xmm3[7]
 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm8 = xmm8[0,2,3,3,4,5,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm9
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm14
@@ -2192,7 +2192,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm8, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
@@ -2205,13 +2205,13 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm15[0,1],xmm1[2],xmm15[3,4],xmm1[5],xmm15[6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm0, %ymm1, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm14[0],xmm9[0],xmm14[1],xmm9[1],xmm14[2],xmm9[2],xmm14[3],xmm9[3]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm0[0,1,2,3,8,9,10,11,14,15,12,13,14,15,12,13]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,1,3]
 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm14
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm9, %ymm14, %ymm9
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm1, %ymm9, %ymm1
@@ -2227,7 +2227,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm6[3,3,3,3,7,7,7,7]
@@ -2235,7 +2235,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31]
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0,1],ymm14[2],ymm15[3,4],ymm14[5],ymm15[6,7,8,9],ymm14[10],ymm15[11,12],ymm14[13],ymm15[14,15]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u,255,255,255,255,255,255,255,255,0,0,0,0,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm14, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm14 = ymm13[2,3,3,3,6,7,7,7]
@@ -2248,12 +2248,12 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,1,1,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm1, %ymm10, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3,6,7,4,5,6,7,4,5,12,13,14,15]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm10
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm10, %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm0, %ymm10
@@ -2265,7 +2265,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm8[2,2,2,2,6,6,6,6]
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm11[0],ymm1[1],ymm11[2,3],ymm1[4],ymm11[5,6,7,8],ymm1[9],ymm11[10,11],ymm1[12],ymm11[13,14,15]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm2, %ymm1, %ymm3
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm12, %ymm2
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm12[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u]
@@ -2275,7 +2275,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm11 = ymm13[0,1,2,2,4,5,6,6]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm11, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm3, %ymm1, %ymm6
@@ -2287,7 +2287,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[0,1,1,3,4,5,5,7]
 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm3[0,1],ymm4[2],ymm3[3,4],ymm4[5],ymm3[6,7,8,9],ymm4[10],ymm3[11,12],ymm4[13],ymm3[14,15]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm15[0,0,2,1,4,4,6,5]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19,20,21,18,19,u,u,18,19,20,21,u,u,20,21]
@@ -2295,7 +2295,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,1,1,3,4,5,5,7]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
@@ -2387,7 +2387,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm11[0],xmm15[1],xmm11[1],xmm15[2],xmm11[2],xmm15[3],xmm11[3]
 ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm11 = xmm0[0,1,2,3,4,5,7,6]
 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7]
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u>
+; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u]
 ; AVX512F-SLOW-NEXT: vpermi2d %zmm11, %zmm0, %zmm15
 ; AVX512F-SLOW-NEXT: vprold $16, %ymm3, %ymm0
 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} ymm11 = ymm2[1,2,2,3,5,6,6,7]
@@ -2479,7 +2479,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} ymm3 = ymm6[0,0,1,1,2,2,3,3,8,8,9,9,10,10,11,11]
 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm9 = ymm4[0,1,1,3,4,5,5,7]
 ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm9[0,1],ymm3[2],ymm9[3,4],ymm3[5],ymm9[6,7,8,9],ymm3[10],ymm9[11,12],ymm3[13],ymm9[14,15]
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,u,3,2,u,10,10,11>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,u,3,2,u,10,10,11]
 ; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm3, %zmm9
 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm0 = ymm4[2,2,2,2,6,6,6,6]
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,u,u,24,25,26,27,u,u,26,27,26,27]
@@ -2538,7 +2538,7 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm6 = [0,13,4,0,0,14,5,0,0,13,4,0,0,14,5,0]
 ; AVX512F-FAST-NEXT: # zmm6 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-FAST-NEXT: vpermd %zmm8, %zmm6, %zmm6
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <6,u,u,u,7,u,u,7>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [6,u,u,u,7,u,u,7]
 ; AVX512F-FAST-NEXT: vpermd %ymm8, %ymm7, %ymm7
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9]
 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,3]
@@ -2590,27 +2590,27 @@ define void @store_i16_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,14,30,46,62,u,u,u,15,31,47,63,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4
-; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47>
+; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [29,45,u,u,u,u,14,30,46,u,u,u,u,15,31,47]
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,32,u,u,u,u,1,17,33,u,u,u,u,2,18,34,u,u,u,u,3,19,35,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,16,32,48,u,u,u,1,17,33,49,u,u,u,2,18,34,50,u,u,u,3,19,35,51,u,u,u,4,20,36,52]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7
 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870
 ; AVX512BW-NEXT: kmovd %ecx, %k1
 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm7 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,u,u,u,u,5,21,37,u,u,u,u,6,22,38,u,u,u,u,7,23,39,u,u,u,u,8,24,40,u]
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,5,21,37,53,u,u,u,6,22,38,54,u,u,u,7,23,39,55,u,u,u,8,24,40,56,u,u,u,9]
 ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm8
 ; AVX512BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387
 ; AVX512BW-NEXT: kmovd %ecx, %k1
 ; AVX512BW-NEXT: vmovdqu16 %zmm6, %zmm8 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 =
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,9,25,41,u,u,u,u,10,26,42,u,u,u,u,11,27,43,u,u,u,u,12,28,44,u,u,u,u,13]
 ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [57,9,25,u,u,u,42,58,10,26,u,u,u,43,59,11,27,u,u,u,44,60,12,28,u,u,u,45,61,13,29,u]
 ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm1, %zmm2
 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38
 ; AVX512BW-NEXT: kmovd %ecx, %k1
@@ -3889,13 +3889,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %ymm9
 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %ymm6
 ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %ymm7
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4]
 ; AVX2-SLOW-NEXT: vpermd %ymm8, %ymm0, %ymm1
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[0,3,2,3,4,7,6,7]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,3,u,u,u,4,u,u]
 ; AVX2-SLOW-NEXT: vpermd %ymm5, %ymm2, %ymm4
 ; AVX2-SLOW-NEXT: vmovdqa %ymm5, %ymm11
 ; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm0, %ymm0
@@ -3904,22 +3904,22 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm5, %ymm0
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm9[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
 ; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm3 = ymm14[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,3,u,u,u,4,u]
 ; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm5
 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm12
 ; AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm2 = ymm7[0,1,0,3,4,5,4,7]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %ymm5
 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3931,10 +3931,10 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm5, %ymm3
 ; AVX2-SLOW-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %ymm6
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <3,u,u,3,u,u,u,4>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [3,u,u,3,u,u,u,4]
 ; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm4, %ymm5
 ; AVX2-SLOW-NEXT: vmovdqa %ymm6, %ymm13
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2
 ; AVX2-SLOW-NEXT: vmovdqa (%rax), %ymm5
 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -3958,17 +3958,17 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm12[1,2,2,3,5,6,6,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[2,1,2,3,6,5,6,7]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6]
@@ -3986,14 +3986,14 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vpbroadcastd 60(%r8), %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm7[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
@@ -4013,7 +4013,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm12
 ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm0
 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4037,7 +4037,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,1,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1]
 ; AVX2-SLOW-NEXT: vpbroadcastd 32(%rax), %ymm9
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm0, %ymm9, %ymm0
 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm5
 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4064,7 +4064,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm12[0,1],xmm3[2],xmm12[3,4],xmm3[5],xmm12[6,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm2, %ymm3, %ymm9
 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm8, %xmm0
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm7[1,1,2,2]
@@ -4081,7 +4081,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,2,3,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,1,3]
 ; AVX2-SLOW-NEXT: vpbroadcastd 36(%rax), %ymm4
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm3, %ymm4, %ymm3
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,7,6]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,3,3]
@@ -4099,7 +4099,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm7[4],xmm8[4],xmm7[5],xmm8[5],xmm7[6],xmm8[6],xmm7[7],xmm8[7]
 ; AVX2-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
@@ -4116,7 +4116,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
 ; AVX2-SLOW-NEXT: vpbroadcastd 40(%rax), %ymm3
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload
 ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload
@@ -4142,7 +4142,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm12[0,1,2,3,5,6,7,7,8,9,10,11,13,14,15,15]
@@ -4154,7 +4154,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm4 = ymm13[2,3,3,3,6,7,7,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
@@ -4167,7 +4167,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm4 = ymm12[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15]
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12]
@@ -4176,7 +4176,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm13[0,1,2,2,4,5,6,6]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3
@@ -4194,7 +4194,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm5 = ymm5[0,1],ymm6[2],ymm5[3,4],ymm6[5],ymm5[6,7,8,9],ymm6[10],ymm5[11,12],ymm6[13],ymm5[14,15]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm7[1,1,1,1,5,5,5,5]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm10[1,2,3,3,4,5,6,7,9,10,11,11,12,13,14,15]
@@ -4217,7 +4217,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-SLOW-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Folded Reload
 ; AVX2-SLOW-NEXT: # ymm7 = mem[0,1,1,3,4,5,5,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm11[0,0,2,1,4,4,6,5]
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm12[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15]
@@ -4278,7 +4278,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm8
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7]
 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1
@@ -4289,7 +4289,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm10[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,28,29,u,u,28,29,26,27,u,u,30,31,30,31]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0,1],ymm2[2],ymm3[3,4],ymm2[5],ymm3[6,7,8,9],ymm2[10],ymm3[11,12],ymm2[13],ymm3[14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
@@ -4301,7 +4301,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm4[1,2,2,3,5,6,6,7]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa %ymm7, %ymm4
 ; AVX2-FAST-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4318,7 +4318,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3],ymm2[4],ymm3[5,6,7,8],ymm2[9],ymm3[10,11],ymm2[12],ymm3[13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
@@ -4337,7 +4337,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm7[0,1],ymm2[2],ymm7[3,4],ymm2[5],ymm7[6,7,8,9],ymm2[10],ymm7[11,12],ymm2[13],ymm7[14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm7
 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm6, %ymm1
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[1,1,1,1,5,5,5,5]
@@ -4358,7 +4358,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %ymm5
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7]
 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm13, %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm10, %ymm3
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
@@ -4373,42 +4373,42 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm8, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4]
 ; AVX2-FAST-NEXT: vpermd %ymm15, %ymm0, %ymm1
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm12, %ymm3
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm1, %ymm3, %ymm7
 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [u,3,u,u,u,4,u,u]
 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm1, %ymm2
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm2, %ymm8, %ymm2
 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm8, %ymm3
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm3, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm7, %ymm2, %ymm2
 ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,3,u,u,u,4,u]
 ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm3
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31]
 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm6, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3
 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload
 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm7
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <3,u,u,3,u,u,u,4>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [3,u,u,3,u,u,u,4]
 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm7, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm3, %ymm8, %ymm3
 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm7, %ymm7
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm1, %ymm7, %ymm1
@@ -4425,15 +4425,15 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6]
 ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6]
 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1
@@ -4448,14 +4448,14 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm14[3,3,3,3,7,7,7,7]
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vpbroadcastd 60(%r8), %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm6[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7]
 ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm1, %ymm1
@@ -4479,7 +4479,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5]
 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm12
 ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm13
 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9
@@ -4501,7 +4501,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm1
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
 ; AVX2-FAST-NEXT: vpbroadcastd 32(%rax), %ymm2
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0
 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm1
 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -4530,7 +4530,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2],xmm12[3,4],xmm11[5],xmm12[6,7]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,2,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm3, %ymm11, %ymm3
 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm13, %xmm0
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm11 = xmm9[1,1,2,2]
@@ -4545,7 +4545,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm5, %xmm5
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,3]
 ; AVX2-FAST-NEXT: vpbroadcastd 36(%rax), %ymm11
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5
 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm6, %xmm4
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3]
@@ -4561,7 +4561,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3]
 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,2,3,3,4,5,6,7]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,2,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm3, %ymm0
 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm15[4],xmm8[4],xmm15[5],xmm8[5],xmm15[6],xmm8[6],xmm15[7],xmm8[7]
 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm3, %xmm3
@@ -4577,7 +4577,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm4, %xmm4
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
 ; AVX2-FAST-NEXT: vpbroadcastd 40(%rax), %ymm7
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4
 ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm14, %xmm1 # 16-byte Folded Reload
 ; AVX2-FAST-NEXT: # xmm1 = xmm14[4],mem[4],xmm14[5],mem[5],xmm14[6],mem[6],xmm14[7],mem[7]
@@ -4633,13 +4633,13 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm15, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %ymm10
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm7
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4]
 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm8, %ymm0, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm9, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [u,3,u,u,u,4,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm3, %ymm5
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm12
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31]
@@ -4647,21 +4647,21 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm14, %ymm2
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm10, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm5, %ymm2, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %ymm11
 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm3, %ymm2
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm15, %ymm3
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,3,u,u,u,4,u]
 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm3, %ymm4
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm5
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 =
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm5
 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4672,10 +4672,10 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm2
 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-FAST-PERLANE-NEXT:
vmovdqa 32(%rax), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [3,u,u,3,u,u,u,4] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm3, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm6, %ymm13 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -4698,16 +4698,16 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm11, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm13, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[0,1,2,2,4,5,6,6] @@ -4723,14 +4723,14 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 60(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = 
[u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = ymm11[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm13[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] @@ -4752,7 +4752,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rcx), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm13 @@ -4774,7 +4774,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 32(%rax), %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm10, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -4801,7 +4801,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm15[0,1],xmm4[2],xmm15[3,4],xmm4[5],xmm15[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm14, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm13[1,1,2,2] @@ -4816,7 +4816,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm5, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 36(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm4, %ymm5, %ymm4 
; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,3] @@ -4833,7 +4833,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm1, %xmm1 @@ -4849,7 +4849,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 40(%rax), %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm10, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm10[4],mem[4],xmm10[5],mem[5],xmm10[6],mem[6],xmm10[7],mem[7] @@ -4872,7 +4872,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0],ymm3[1],ymm4[2,3],ymm3[4],ymm4[5,6,7,8],ymm3[9],ymm4[10,11],ymm3[12],ymm4[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm3 = ymm12[3,3,3,3,7,7,7,7] @@ -4883,7 +4883,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm4 = ymm14[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 @@ -4895,7 +4895,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3],ymm4[4],ymm5[5,6,7,8],ymm4[9],ymm5[10,11],ymm4[12],ymm5[13,14,15] ; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm13[u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm12[1,2,2,3,5,6,6,7] @@ -4903,7 +4903,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm5 = ymm14[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 @@ -4921,7 +4921,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0,1],ymm6[2],ymm7[3,4],ymm6[5],ymm7[6,7,8,9],ymm6[10],ymm7[11,12],ymm6[13],ymm7[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm6 = ymm8[1,1,1,1,5,5,5,5] @@ -4942,7 +4942,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd $212, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm12[0,0,2,1,4,4,6,5] @@ -4988,7 +4988,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm16 ; AVX512F-SLOW-NEXT: vmovdqa (%rdx), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm29 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -4998,7 +4998,7 @@ define void 
@store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa %ymm1, %ymm15 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] ; AVX512F-SLOW-NEXT: vpshufb %ymm14, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm17 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 @@ -5007,7 +5007,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm10, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm11, %ymm3 ; AVX512F-SLOW-NEXT: vpor %ymm2, %ymm3, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5026,7 +5026,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm6[4],xmm3[4],xmm6[5],xmm3[5],xmm6[6],xmm3[6],xmm6[7],xmm3[7] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm3 @@ -5055,7 +5055,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm0[0],xmm8[0],xmm0[1],xmm8[1],xmm0[2],xmm8[2],xmm0[3],xmm8[3] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,16,u,u,17,17,u,u,0,u,u,1,2,u,u,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm2, %zmm1, %zmm4 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%r9), %xmm1 @@ -5063,7 +5063,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm5 = xmm4[0,1,2,3,4,5,7,6] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm4 = xmm4[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u] ; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm9 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vprold $16, %ymm10, %ymm4 @@ -5323,7 +5323,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: 
vmovdqa %ymm1, %ymm10 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%rdx), %ymm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm4, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm4, %ymm18 ; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5336,7 +5336,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa %ymm4, %ymm9 ; AVX512F-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm4 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm6, %ymm16 ; AVX512F-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -5346,7 +5346,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm6 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %ymm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm14, %ymm7 ; AVX512F-FAST-NEXT: vporq %ymm6, %ymm7, %ymm25 ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm6 @@ -5366,7 +5366,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm6, %ymm1 ; AVX512F-FAST-NEXT: vporq %ymm0, %ymm1, %ymm21 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm1 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[3,3,3,3,7,7,7,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5],ymm1[6],ymm2[7,8,9,10],ymm1[11],ymm2[12,13],ymm1[14],ymm2[15] @@ -5374,7 +5374,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa64 %ymm11, %ymm30 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[2,2,2,2,6,6,6,6] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm4[0],ymm2[1],ymm4[2,3],ymm2[4],ymm4[5,6,7,8],ymm2[9],ymm4[10,11],ymm2[12],ymm4[13,14,15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,2,2,3,10,u,11,u] ; AVX512F-FAST-NEXT: vpermi2q %zmm1, %zmm2, %zmm24 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm10[14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm2 = ymm18[3,3,3,3,7,7,7,7] @@ -5383,7 +5383,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm5 ; AVX512F-FAST-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm5[0],xmm10[0],xmm5[1],xmm10[1],xmm5[2],xmm10[2],xmm5[3],xmm10[3] ; AVX512F-FAST-NEXT: vpshufb 
{{.*#+}} xmm2 = xmm2[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm23 = [u,2,2,3,8,u,9,u] ; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm23 ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm0 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm1 = ymm16[3,3,3,3,7,7,7,7] @@ -5414,7 +5414,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2],xmm1[3,4],xmm2[5],xmm1[6,7] ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,2,3,4,5,6,7,8,9,10,11,10,11,8,9] ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,0,u,1,8,8,9,u] ; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm26 ; AVX512F-FAST-NEXT: vmovdqa (%r9), %xmm2 ; AVX512F-FAST-NEXT: vmovdqa (%r8), %xmm9 @@ -5440,7 +5440,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm9[4],xmm2[4],xmm9[5],xmm2[5],xmm9[6],xmm2[6],xmm9[7],xmm2[7] ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm0, %zmm4 ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm2, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,0,1,1,12,13,u,15> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,0,1,1,12,13,u,15] ; AVX512F-FAST-NEXT: vpermi2q %zmm4, %zmm2, %zmm25 ; AVX512F-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512F-FAST-NEXT: vpbroadcastd 8(%rax), %ymm2 @@ -5462,7 +5462,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm15, %ymm2 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm6[0,0,2,1,4,4,6,5] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5],ymm3[6],ymm2[7,8,9,10],ymm3[11],ymm2[12,13],ymm3[14],ymm2[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,1,u,3,10,10,11,11> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,1,u,3,10,10,11,11] ; AVX512F-FAST-NEXT: vpermi2q %zmm2, %zmm21, %zmm29 ; AVX512F-FAST-NEXT: vmovdqa 32(%rax), %ymm8 ; AVX512F-FAST-NEXT: vpshufb %ymm1, %ymm8, %ymm1 @@ -5565,7 +5565,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm14[0,1],ymm15[2],ymm14[3,4],ymm15[5],ymm14[6,7,8,9],ymm15[10],ymm14[11,12],ymm15[13],ymm14[14,15] ; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm30, %zmm14 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm4, %zmm14 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <6,u,u,u,7,u,u,7> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [6,u,u,u,7,u,u,7] ; AVX512F-FAST-NEXT: vpermd %ymm9, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm9, %zmm3, %zmm3 ; AVX512F-FAST-NEXT: vpermd %zmm3, %zmm21, %zmm3 @@ -5640,7 +5640,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm9 = [0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0,0,0,2,34,0,32,0,0,0,3,35,1,33,0,0,0] ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm9, %zmm10 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: kmovd %ecx, %k2 @@ -5657,7 +5657,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38,4,36,0,0,0,7,39,5,37,0,0,0,8,40,6,38] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm10, %zmm11 ; AVX512BW-NEXT: movl $1893843847, %ecx # imm = 0x70E1C387 ; AVX512BW-NEXT: kmovd %ecx, %k3 @@ -5674,7 +5674,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13,0,11,43,9,41,0,0,0,12,44,10,42,0,0,0,13] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm12 ; AVX512BW-NEXT: movl $-2029118408, %ecx # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %ecx, %k3 @@ -5689,7 +5689,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0,13,0,0,0,48,16,46,14,0,0,0,49,17,47,15,0] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm13 ; AVX512BW-NEXT: movl $946921923, %ecx # imm = 0x3870E1C3 ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -5704,7 +5704,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512BW-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm11, %zmm14 ; AVX512BW-NEXT: movl $-1014559204, %ecx # imm = 0xC3870E1C ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -5721,7 +5721,7 @@ define void @store_i16_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm2, %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm0, %zmm3, %zmm1 ; AVX512BW-NEXT: movl $473460961, %ecx # imm = 0x1C3870E1 ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -8342,14 +8342,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 64(%rax), %ymm6 ; 
AVX2-SLOW-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,3,u,u,u,4,u] ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm11 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm1[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <3,u,u,3,u,u,u,4> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [3,u,u,3,u,u,u,4] ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm11, %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm3 = ymm8[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] @@ -8359,7 +8359,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm4 = ymm4[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vpermd %ymm9, %ymm10, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 ; AVX2-SLOW-NEXT: vpermd %ymm7, %ymm10, %ymm0 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm1 @@ -8371,10 +8371,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm3 ; AVX2-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <3,u,u,u,4,u,u,4> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [3,u,u,u,4,u,u,4] ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm7 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa (%rcx), %ymm3 @@ -8383,12 +8383,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm4 ; AVX2-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,3,u,u,u,4,u,u] ; AVX2-SLOW-NEXT: vpermd %ymm4, %ymm8, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm4, %ymm3, %ymm3 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa %ymm4, %ymm10 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = 
[0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255] @@ -8450,20 +8450,20 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%r8), %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,0,2,1,4,4,6,5] ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm7 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%r9), %ymm2 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm2[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-SLOW-NEXT: vmovdqa %ymm2, %ymm8 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 96(%rax), %ymm2 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm2[0,1,1,3,4,5,5,7] @@ -8484,17 +8484,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm7[1,2,2,3,5,6,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm8[2,1,2,3,6,5,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = 
<255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm9, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[0,1,2,2,4,5,6,6] @@ -8512,14 +8512,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpbroadcastd 124(%r8), %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm1 = ymm9[2,3,3,3,6,7,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] @@ -8543,7 +8543,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,1,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm13 ; AVX2-SLOW-NEXT: vmovdqa 32(%rsi), %xmm2 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8605,7 +8605,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,1,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-SLOW-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -8661,7 +8661,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm6[0,1],xmm1[2],xmm6[3,4],xmm1[5],xmm6[6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm0[0,0,1,1] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] 
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm6, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -8708,7 +8708,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,3] ; AVX2-SLOW-NEXT: vpbroadcastd 4(%rax), %ymm8 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm4, %ymm8, %ymm4 ; AVX2-SLOW-NEXT: vpshufhw $180, {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm5 = mem[0,1,2,3,4,5,7,6] @@ -8745,7 +8745,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[2,1,2,3,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,5,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -8784,7 +8784,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-SLOW-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload @@ -8817,32 +8817,32 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,u,4,u,u,4> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,u,4,u,u,4] ; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm1 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vpshufd $236, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,3,2,3,4,7,6,7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm2[0,0,3,3,4,5,6,7,8,8,11,11,12,13,14,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = 
[u,3,u,u,u,4,u,u] ; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vpshuflw $0, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm3 = mem[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,3,u,u,u,4,u] ; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpshufd $196, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,0,3,4,5,4,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm2 = ymm2[0,1,2,3,4,4,7,7,8,9,10,11,12,12,15,15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,3,u,u,u,4> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,3,u,u,u,4] ; AVX2-SLOW-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm0 @@ -8861,7 +8861,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1],ymm3[2],ymm4[3,4],ymm3[5],ymm4[6,7,8,9],ymm3[10],ymm4[11,12],ymm3[13],ymm4[14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vpshuflw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Folded Reload @@ -8906,7 +8906,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm10[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload ; 
AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm6 = ymm11[1,2,2,3,4,5,6,7,9,10,10,11,12,13,14,15] @@ -8952,7 +8952,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2,3],ymm6[4],ymm7[5,6,7,8],ymm6[9],ymm7[10,11],ymm6[12],ymm7[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Reload @@ -8981,7 +8981,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpshuflw $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm8 = mem[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] @@ -8992,7 +8992,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,2] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm10[0,1,2,2,4,5,6,6] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,1,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm9 = ymm11[3,3,3,3,4,5,6,7,11,11,11,11,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm9 = ymm9[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] @@ -9032,7 +9032,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpblendw {{.*#+}} ymm9 = ymm9[0],ymm10[1],ymm9[2,3],ymm10[4],ymm9[5,6,7,8],ymm10[9],ymm9[10,11],ymm10[12],ymm9[13,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,1,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm8, %ymm9, %ymm8 ; AVX2-SLOW-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm9 = mem[3,3,3,3,7,7,7,7] @@ -9071,7 +9071,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-SLOW-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm12 = mem[2,3,3,3,6,7,7,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; 
AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm12 = ymm1[3,3,3,3,7,7,7,7] ; AVX2-SLOW-NEXT: vpshufhw $249, {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Folded Reload @@ -9165,31 +9165,31 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa 96(%r8), %ymm3 ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <3,u,u,u,4,u,u,4> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [3,u,u,u,4,u,u,4] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [u,3,u,u,u,4,u,u] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm4[0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,3,u,u,u,4,u] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa 96(%r9), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-NEXT: vmovdqa 96(%rax), %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,u,u,3,u,u,u,4] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 @@ -9208,7 +9208,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm2, %ymm3 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm12 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm5 -; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %ymm6 ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9254,7 +9254,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm2 = ymm5[0,1],ymm2[2],ymm5[3,4],ymm2[5],ymm5[6,7,8,9],ymm2[10],ymm5[11,12],ymm2[13],ymm5[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255] @@ -9298,7 +9298,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqa %ymm10, %ymm5 ; AVX2-FAST-NEXT: vmovdqu %ymm10, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm1 @@ -9311,7 +9311,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm8 ; AVX2-FAST-NEXT: vmovdqu %ymm12, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm9, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm12 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm2 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload @@ -9338,7 +9338,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm13 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm15[2,2,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm4[0,1],ymm14[2],ymm4[3,4],ymm14[5],ymm4[6,7,8,9],ymm14[10],ymm4[11,12],ymm14[13],ymm4[14,15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vmovdqa %ymm1, %ymm0 @@ -9347,7 +9347,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = 
<255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm4, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm12, %ymm1 @@ -9364,7 +9364,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm15[0],ymm4[1],ymm15[2,3],ymm4[4],ymm15[5,6,7,8],ymm4[9],ymm15[10,11],ymm4[12],ymm15[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -9392,7 +9392,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [6,7,3,3,7,7,6,7] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm5, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm0, %ymm0 @@ -9408,14 +9408,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm5, %ymm14 ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm3 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm3, %ymm4 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufd {{.*#+}} ymm14 = ymm8[3,3,3,3,7,7,7,7] ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm14[0,1,2],ymm4[3],ymm14[4,5],ymm4[6],ymm14[7,8,9,10],ymm4[11],ymm14[12,13],ymm4[14],ymm14[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm14 ; AVX2-FAST-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload @@ -9423,7 +9423,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm14 = ymm15[0],ymm14[1],ymm15[2,3],ymm14[4],ymm15[5,6,7,8],ymm14[9],ymm15[10,11],ymm14[12],ymm15[13,14,15] 
; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,1,3,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm4, %ymm14, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0,255,255,255,255,255,255,255,255,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm4, %ymm1, %ymm1 @@ -9457,11 +9457,11 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,u,u,u,4,u,u,4> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,u,u,u,4,u,u,4] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm2, %ymm4 @@ -9470,12 +9470,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm2, %ymm3, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [u,3,u,u,u,4,u,u] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm5, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm6 @@ -9484,16 +9484,16 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,3,u,u,u,4,u] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm3 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm4, %ymm4 -; AVX2-FAST-NEXT: vmovdqa 
{{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm6, %ymm4 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm7 @@ -9504,9 +9504,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm6, %ymm7, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [3,u,u,3,u,u,u,4] ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm3, %ymm6, %ymm3 ; AVX2-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm6 # 32-byte Folded Reload ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm4, %ymm6, %ymm4 @@ -9531,17 +9531,17 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0,1],ymm1[2],ymm2[3,4],ymm1[5],ymm2[6,7,8,9],ymm1[10],ymm2[11,12],ymm1[13],ymm2[14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,2,1,6,5,6,5] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Reload ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm1 = ymm8[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,4,5,5,7] @@ -9557,15 +9557,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} 
ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,6,2,3,6,7,5,6] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [4,5,2,2,6,6,6,6] ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 @@ -9580,14 +9580,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpbroadcastd 124(%r8), %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} ymm1 = ymm8[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,7,3,3,7,7,6,7] ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm1, %ymm1 @@ -9613,7 +9613,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm10 ; AVX2-FAST-NEXT: vmovdqa 32(%rsi), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill 
@@ -9673,7 +9673,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-NEXT: vpbroadcastd (%rax), %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -9723,7 +9723,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1],xmm2[2],xmm4[3,4],xmm2[5],xmm4[6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm13, %xmm0 @@ -9768,7 +9768,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm7, %xmm7 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-NEXT: vpbroadcastd 4(%rax), %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm11, %ymm7 ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm12, %xmm11 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,1,3] @@ -9798,7 +9798,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,3] ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,2,3,3,4,5,6,7] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,2,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm13[4],xmm10[4],xmm13[5],xmm10[5],xmm13[6],xmm10[6],xmm13[7],xmm10[7] ; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload @@ -9835,7 +9835,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpshufb %xmm9, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1] ; AVX2-FAST-NEXT: vpbroadcastd 8(%rax), %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm6 # 16-byte Folded Reload @@ -9930,32 +9930,32 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rcx), %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r8), %ymm7 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <3,u,u,u,4,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [3,u,u,u,4,u,u,4] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm14, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,0,1,14,15,14,15,8,9,10,11,12,13,14,15,16,17,16,17,30,31,30,31,24,25,26,27,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm15, %ymm2, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [u,3,u,u,u,4,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm10, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,0,1,0,1,14,15,14,15,14,15,14,15,16,17,16,17,16,17,16,17,30,31,30,31,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm5, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm4, %ymm6, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,3,u,u,u,4,u] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm7, %ymm0, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%r9), %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,1,2,3,4,5,6,7,0,1,0,1,14,15,14,15,16,17,18,19,20,21,22,23,16,17,16,17,30,31,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm4, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-FAST-PERLANE-NEXT: vmovdqa 96(%rax), %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [3,u,u,3,u,u,u,4] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm6, %ymm0, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255,255,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1 @@ -10009,10 +10009,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,3,u,u,u,4,u] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm12, 
%ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u,u,u,u,u,u,u,255,255,0,0,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm10, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10031,9 +10031,9 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm12, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <3,u,u,3,u,u,u,4> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [3,u,u,3,u,u,u,4] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rax), %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -10061,16 +10061,16 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm7[0,1],ymm1[2],ymm7[3,4],ymm1[5],ymm7[6,7,8,9],ymm1[10],ymm7[11,12],ymm1[13],ymm7[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[0,0,2,1,4,4,6,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} ymm1 = ymm4[1,1,2,2,4,5,6,7,9,9,10,10,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] @@ -10085,15 +10085,15 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = 
ymm7[0],ymm1[1],ymm7[2,3],ymm1[4],ymm7[5,6,7,8],ymm1[9],ymm7[10,11],ymm1[12],ymm7[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm10[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,3] @@ -10108,14 +10108,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1],ymm2[2,3],ymm1[4],ymm2[5,6,7,8],ymm1[9],ymm2[10,11],ymm1[12],ymm2[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 124(%r8), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u,255,255,255,255,255,255,255,255,0,0,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufhw {{.*#+}} ymm1 = ymm4[0,1,2,3,5,5,7,6,8,9,10,11,13,13,15,14] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[3,3,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u,255,255,255,255,255,255,255,255,255,255,0,0,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm1 = ymm6[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,1,3,2] @@ -10139,7 +10139,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = 
[0,1,2,3,0,1,2,3,6,7,4,5,6,7,4,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm1[0,1,1,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rsi), %xmm3 @@ -10198,7 +10198,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm9, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd (%rax), %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm2, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -10249,7 +10249,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm4[2],xmm6[3,4],xmm4[5],xmm6[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm4 = xmm11[1,1,2,2] @@ -10291,7 +10291,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm9, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 4(%rax), %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm7, %ymm9, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm8, %xmm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,3] @@ -10325,7 +10325,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,3] ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,2,3,3,4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,2,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] @@ -10362,7 +10362,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr 
%in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 8(%rax), %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm5 # 16-byte Folded Reload @@ -10409,7 +10409,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm5 = ymm6[0,1],ymm5[2],ymm6[3,4],ymm5[5],ymm6[6,7,8,9],ymm5[10],ymm6[11,12],ymm5[13],ymm6[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm4, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -10450,7 +10450,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm13[0,1,1,3,4,5,5,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm1, %ymm9 @@ -10486,7 +10486,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm8 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm8 = mem[2,2,2,2,6,6,6,6] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm7[0,1],ymm8[2],ymm7[3,4],ymm8[5],ymm7[6,7,8,9],ymm8[10],ymm7[11,12],ymm8[13],ymm7[14,15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd $170, {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Folded Reload @@ -10494,7 +10494,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3],ymm8[4],ymm9[5,6,7,8],ymm8[9],ymm9[10,11],ymm8[12],ymm9[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} 
ymm8 = ymm8[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = <255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u,u,u,u,u,0,0,0,0,255,255,255,255,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm7 # 32-byte Reload @@ -10522,14 +10522,14 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm1, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,u,u,u,u,u,u,8,9,u,u,u,u,u,u,u,u,22,23,u,u,u,u,24,25,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm6, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm14[1,2,2,3,5,6,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0,1],ymm11[2],ymm12[3,4],ymm11[5],ymm12[6,7,8,9],ymm11[10],ymm12[11,12],ymm11[13],ymm12[14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,1,3,2] ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm12 = ymm13[0,1,2,2,4,5,6,6] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,1,3,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm4 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm4, %ymm12 @@ -10553,21 +10553,21 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm8, %ymm12, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm11 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0,1,2],ymm10[3],ymm11[4,5],ymm10[6],ymm11[7,8,9,10],ymm10[11],ymm11[12,13],ymm10[14],ymm11[15] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm1, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm12 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm12 = mem[3,3,3,3,7,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm11 = ymm12[0],ymm11[1],ymm12[2,3],ymm11[4],ymm12[5,6,7,8],ymm11[9],ymm12[10,11],ymm11[12],ymm12[13,14,15] ; 
AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,1,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,2,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u,255,255,255,255,0,0,0,0,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm10, %ymm11, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm1, %ymm11 @@ -10604,7 +10604,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-PERLANE-NEXT: vpshufd $254, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm15 = mem[2,3,3,3,6,7,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,1,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0,u,u,u,u,u,u,u,u,255,255,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm13, %ymm4, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Folded Reload @@ -10692,12 +10692,12 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128] ; AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm18 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb %ymm11, %ymm9, %ymm2 ; AVX512F-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm16 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-SLOW-NEXT: vpshufb %ymm12, %ymm8, %ymm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] ; AVX512F-SLOW-NEXT: vpshufb %ymm13, %ymm7, %ymm2 ; AVX512F-SLOW-NEXT: vporq %ymm1, %ymm2, %ymm17 ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] @@ -10706,7 +10706,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa %ymm3, %ymm10 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm2, %ymm21 ; AVX512F-SLOW-NEXT: vmovdqa 64(%r8), %ymm3 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] ; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm2 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm3, %ymm20 ; AVX512F-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1 @@ -10996,7 +10996,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,7,6] ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [0,1,2,3,4,5,4,5,6,7,10,11,8,9,10,11] ; 
AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [16,18,19,19,19,19,u,u,0,1,0,1,2,3,2,3] ; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm29, %zmm1 ; AVX512F-SLOW-NEXT: vpbroadcastd 100(%rax), %ymm2 ; AVX512F-SLOW-NEXT: vpbroadcastd 104(%rax), %ymm3 @@ -11013,7 +11013,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,2,2] ; AVX512F-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1],xmm2[2,3],xmm1[4],xmm2[5,6],xmm1[7] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm3[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [16,16,17,17,17,17,u,u,0,1,0,1,2,3,2,3] ; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm3, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rdi), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa 64(%rsi), %xmm4 @@ -11037,7 +11037,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm1[0,1,2,3,4,5,7,6] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,3,2,4,5,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,0,1,0,1,1,3,16,18,19,19,19,19,u,u] ; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm6, %zmm1 ; AVX512F-SLOW-NEXT: vpbroadcastd 64(%rax), %ymm2 ; AVX512F-SLOW-NEXT: vpbroadcastd 68(%rax), %ymm5 @@ -11395,7 +11395,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm0, %ymm2 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm20 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdx), %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm9, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa %ymm0, %ymm10 ; AVX512F-FAST-NEXT: vporq %ymm2, %ymm3, %ymm16 @@ -11403,7 +11403,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128,128,128,128,128,128,128,128,128] ; AVX512F-FAST-NEXT: vpshufb %ymm14, %ymm6, %ymm2 ; AVX512F-FAST-NEXT: vmovdqa 96(%rdi), %ymm7 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u,u,u,u,u,16,17,18,19] ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm7, %ymm3 ; AVX512F-FAST-NEXT: vporq %ymm2, %ymm3, %ymm17 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [128,128,128,128,128,128,128,128,128,128,128,128,14,15,128,128,128,128,128,128,128,128,128,128,128,128,16,17,128,128,128,128] @@ -11412,7 +11412,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm2 ; AVX512F-FAST-NEXT: vmovdqa 64(%r8), %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = 
[u,u,u,u,u,u,u,u,12,13,14,15,128,128,u,u,u,u,u,u,u,u,u,u,16,17,128,128,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm4 ; AVX512F-FAST-NEXT: vporq %ymm2, %ymm4, %ymm18 ; AVX512F-FAST-NEXT: vmovdqa 64(%rcx), %ymm0 @@ -11495,7 +11495,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535] ; AVX512F-FAST-NEXT: vpternlogq $248, %ymm12, %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vextracti64x4 $1, %zmm3, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [u,4,u,u,u,5,u,u] ; AVX512F-FAST-NEXT: vpermd %ymm11, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,0] ; AVX512F-FAST-NEXT: vpternlogq $184, %ymm0, %ymm17, %ymm3 @@ -11517,19 +11517,19 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm20 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm3, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,u,14,15,12,13,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29] ; AVX512F-FAST-NEXT: vpshufb %ymm3, %ymm6, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm3, %ymm26 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm7[3,3,3,3,7,7,7,7] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5],ymm1[6],ymm3[7,8,9,10],ymm1[11],ymm3[12,13],ymm1[14],ymm3[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [10,11,8,9,u,u,u,u,u,u,u,u,u,u,u,u,26,27,24,25,u,u,u,u,26,27,24,25,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm6, %ymm3 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm5, %ymm29 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm6 = ymm7[2,2,2,2,6,6,6,6] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3],ymm3[4],ymm6[5,6,7,8],ymm3[9],ymm6[10,11],ymm3[12],ymm6[13,14,15] ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,2,3,10,9,11,11] ; AVX512F-FAST-NEXT: vpermt2q %zmm1, %zmm6, %zmm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [14,15,12,13,u,u,u,u,u,u,u,u,u,u,u,u,30,31,28,29,u,u,u,u,30,31,28,29,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm2, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm7, %ymm27 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm7 = ymm9[3,3,3,3,7,7,7,7] @@ -11546,7 +11546,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpternlogq $226, %zmm3, %zmm1, %zmm7 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm24 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <5,u,u,u,6,u,u,6> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [5,u,u,u,6,u,u,6] ; AVX512F-FAST-NEXT: vpermd %ymm11, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 64(%rax), %zmm12 @@ -11594,7 +11594,7 @@ define 
void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpshufb %ymm7, %ymm14, %ymm0 ; AVX512F-FAST-NEXT: vpshufd {{.*#+}} ymm3 = ymm13[0,0,2,1,4,4,6,5] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2],ymm3[3],ymm0[4,5],ymm3[6],ymm0[7,8,9,10],ymm3[11],ymm0[12,13],ymm3[14],ymm0[15] -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,u,3,10,10,11,11> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,u,3,10,10,11,11] ; AVX512F-FAST-NEXT: vpermi2q %zmm0, %zmm21, %zmm3 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqu %ymm8, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -12057,7 +12057,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movl $101455920, %ecx # imm = 0x60C1830 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,59,u,u,4,5,6,7,60,u,u,11,12,13,14,61,u,u,18,19,20,21,62,u,u,25,26,27,28,63,u,u] ; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm24 = [0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,27] @@ -12071,7 +12071,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movl $-2096755688, %ecx # imm = 0x83060C18 ; AVX512BW-NEXT: kmovd %ecx, %k2 ; AVX512BW-NEXT: vmovdqu16 %zmm0, %zmm3 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,3,4,55,u,u,8,9,10,11,56,u,u,15,16,17,18,57,u,u,22,23,24,25,58,u,u,29,30,31] ; AVX512BW-NEXT: vpermi2w %zmm30, %zmm3, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm19 = [2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36,2,34,0,32,0,0,0,3,35,1,33,0,0,0,4,36] @@ -12103,7 +12103,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm13, %zmm19, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm29, %zmm19 ; AVX512BW-NEXT: vpermt2w %zmm9, %zmm28, %zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,32,u,u,u,u,11,12,33,u,u,u,u,18,19,34,u,u,u,u,25,26,35,u,u,u,u] ; AVX512BW-NEXT: vpermt2w %zmm20, %zmm2, %zmm0 ; AVX512BW-NEXT: movl $236730480, %ecx # imm = 0xE1C3870 ; AVX512BW-NEXT: vmovdqu16 %zmm25, %zmm29 {%k3} @@ -12127,7 +12127,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm2 = [61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29,61,27,59,0,0,0,30,62,28,60,0,0,0,31,63,29] ; AVX512BW-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,59,u,u,u,u,8,9,60,u,u,u,u,15,16,61,u,u,u,u,22,23,62,u,u,u,u,29,30,63] ; AVX512BW-NEXT: vpermi2w %zmm20, %zmm2, %zmm21 ; AVX512BW-NEXT: movl $-507279602, %eax # imm = 0xE1C3870E ; AVX512BW-NEXT: kmovd %eax, %k3 @@ -12147,7 +12147,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0,0,0,0,25,57,23,55,0,0,0,26,58,24,56,0,0] ; AVX512BW-NEXT: # zmm3 = 
mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm4, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [54,u,u,u,u,5,6,55,u,u,u,u,12,13,56,u,u,u,u,19,20,57,u,u,u,u,26,27,58,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm20, %zmm3, %zmm24 ; AVX512BW-NEXT: movl $473460961, %eax # imm = 0x1C3870E1 ; AVX512BW-NEXT: kmovd %eax, %k2 @@ -12155,7 +12155,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = [20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54,20,52,18,50,0,0,0,21,53,19,51,0,0,0,22,54] ; AVX512BW-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm26, %zmm3, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,2,3,50,u,u,u,u,9,10,51,u,u,u,u,16,17,52,u,u,u,u,23,24,53,u,u,u,u,30,31] ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm24, %zmm22 ; AVX512BW-NEXT: movl $-1014559204, %eax # imm = 0xC3870E1C ; AVX512BW-NEXT: kmovd %eax, %k2 @@ -12176,7 +12176,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm2 ; AVX512BW-NEXT: vpermi2w %zmm15, %zmm14, %zmm3 ; AVX512BW-NEXT: vmovdqu16 %zmm2, %zmm3 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,45,u,u,u,u,6,7,46,u,u,u,u,13,14,47,u,u,u,u,20,21,48,u,u,u,u,27,28,49,u,u] ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm2, %zmm10 ; AVX512BW-NEXT: movl $946921923, %eax # imm = 0x3870E1C3 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -12196,7 +12196,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2w %zmm14, %zmm15, %zmm23 ; AVX512BW-NEXT: vpermi2w %zmm12, %zmm5, %zmm0 ; AVX512BW-NEXT: vmovdqu16 %zmm23, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,3,4,41,u,u,u,u,10,11,42,u,u,u,u,17,18,43,u,u,u,u,24,25,44,u,u,u,u,31] ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm10, %zmm1 ; AVX512BW-NEXT: movl $-2029118408, %eax # imm = 0x870E1C38 ; AVX512BW-NEXT: kmovd %eax, %k1 @@ -12218,7 +12218,7 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: # zmm1 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermt2w %zmm8, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm26, %zmm1, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,36,u,u,u,u,7,8,37,u,u,u,u,14,15,38,u,u,u,u,21,22,39,u,u,u,u,28,29,40,u] ; AVX512BW-NEXT: vpermt2w %zmm20, %zmm1, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm25, %zmm1, %zmm28 ; AVX512BW-NEXT: vmovdqu16 %zmm5, %zmm14 {%k3} @@ -12226,10 +12226,10 @@ define void @store_i16_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqu16 %zmm28, %zmm19 {%k1} ; AVX512BW-NEXT: vmovdqu16 %zmm4, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,59,u,4,5,6,7,8,60,u,11,12,13,14,15,61,u,18,19,20,21,22,62,u,25,26,27,28,29,63,u] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermi2w %zmm26, %zmm2, %zmm1 -; AVX512BW-NEXT: vmovdqa64 
{{.*#+}} zmm2 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,55,u,8,9,10,11,12,56,u,15,16,17,18,19,57,u,22,23,24,25,26,58,u,29,30,31] ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm4 # 64-byte Reload ; AVX512BW-NEXT: vpermi2w %zmm26, %zmm4, %zmm2 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,59,4,5,6,7,8,9,60,11,12,13,14,15,16,61,18,19,20,21,22,23,62,25,26,27,28,29,30,63] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll index bc77e91a2bd40..cd18176532c3b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i16-stride-8.ll @@ -336,10 +336,10 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] @@ -464,10 +464,10 @@ define void @store_i16_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,u,u,u,u,0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15] ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,1,4,5,8,9,12,13,u,u,u,u,u,u,u,u,2,3,6,7,10,11,14,15,u,u,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1],ymm3[2,3],ymm2[4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] @@ -689,28 +689,28 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm4 ; AVX2-ONLY-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = 
[u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0,1],ymm2[2,3],ymm11[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = <4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0,1],ymm5[2,3],ymm9[4,5],ymm5[6,7] @@ -753,28 +753,28 @@ define void @store_i16_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512F-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15] ; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm6 ; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7] ; AVX512F-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,u,u,4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = [4,5,12,13,u,u,u,u,u,u,u,u,u,u,u,u,6,7,14,15,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0,1],ymm6[2,3],ymm11[4,5],ymm6[6,7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11] ; AVX512F-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = +; 
AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,u,0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u,2,3,10,11,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512F-NEXT: vpblendd {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0,1],ymm4[2,3],ymm9[4,5],ymm4[6,7] @@ -1358,32 +1358,32 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm5 ; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm4 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,u,u,1,1] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm6[0],xmm9[1],xmm6[1],xmm9[2],xmm6[2],xmm9[3],xmm6[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,0,u,u,u,1,u] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm2, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1,2],ymm0[3],ymm7[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm10 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm12[0],xmm10[0],xmm12[1],xmm10[1],xmm12[2],xmm10[2],xmm12[3],xmm10[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,1,1,1,1,u,u] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm0, %ymm14 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm15 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,u,1,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0],ymm14[1],ymm7[2,3,4],ymm14[5],ymm7[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm7[0,1],ymm3[2,3],ymm7[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,2,2,2,u,u,3,3] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm7, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2],ymm1[3],ymm3[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [2,2,3,3,3,3,u,u] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm11, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] @@ -1406,13 +1406,13 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1],ymm5[2,3],ymm2[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: 
vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%r10), %ymm13 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,0,u,u,1,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rax), %ymm15 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,u,0,u,u,u,1,u] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,1,1,1,1,u,u] ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm1, %ymm4 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm8, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0],ymm4[1],ymm5[2,3,4],ymm4[5],ymm5[6,7] @@ -1685,9 +1685,9 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm11 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,u,0,16,u,u,1,17,10,10,10,26,u,u,11,27] ; AVX512F-NEXT: vpermt2d %zmm16, %zmm12, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,1,u,1,17,u,u,10,26,11,11,11,27,u,u] ; AVX512F-NEXT: vpermt2d %zmm17, %zmm13, %zmm0 ; AVX512F-NEXT: movb $-86, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 @@ -1723,26 +1723,26 @@ define void @store_i16_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,32,48,u,u,u,u,1,17,33,49,u,u,u,u,2,18,34,50,u,u,u,u,3,19,35,51,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-86, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,36,52,u,u,u,u,5,21,37,53,u,u,u,u,6,22,38,54,u,u,u,u,7,23,39,55,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[8,24,40,56,u,u,u,u,9,25,41,57,u,u,u,u,10,26,42,58,u,u,u,u,11,27,43,59,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,44,60,u,u,u,u,13,29,45,61,u,u,u,u,14,30,46,62,u,u,u,u,15,31,47,63,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -2763,7 +2763,7 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,2,2,2,u,u,3,3] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2777,7 +2777,7 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [2,2,3,3,3,3,u,u] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm7, %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -2788,17 +2788,17 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0],ymm4[1],ymm6[2,3,4],ymm4[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm4[0,1],ymm0[2,3],ymm4[4,5],ymm0[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,u,u,1,1] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,u,0,u,u,u,1,u] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm7 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm2[0,1,2],ymm4[3],ymm2[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,1,1,1,1,u,u] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm6 ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,u,1,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm5, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm5[0],ymm6[1],ymm5[2,3,4],ymm6[5],ymm5[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] @@ -2823,11 +2823,11 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = 
ymm14[0],ymm3[1],ymm14[2,3,4],ymm3[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm3[0,1],ymm12[2,3],ymm3[4,5],ymm12[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,2,2,2,u,u,3,3] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm3[3],ymm2[4,5,6],ymm3[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [2,2,3,3,3,3,u,u] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm15, %ymm1 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm15, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3,4],ymm1[5],ymm0[6,7] @@ -2835,16 +2835,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm10[0],xmm13[0],xmm10[1],xmm13[1],xmm10[2],xmm13[2],xmm10[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,0,u,u,1,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,u,0,u,u,u,1,u] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,1,1,1,1,u,u] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm11, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,u,1,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] @@ -3351,9 +3351,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm5 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] ; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm26, %zmm30 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k1 ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm27, %zmm30 {%k1} @@ -3363,9 +3363,9 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa (%rsi), %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %xmm11 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] -; 
AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] ; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm28, %zmm3 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] ; AVX512F-SLOW-NEXT: movw $8738, %r11w # imm = 0x2222 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 ; AVX512F-SLOW-NEXT: vpermd %zmm0, %zmm29, %zmm3 {%k2} @@ -3636,21 +3636,21 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm13, %zmm3 {%k2} ; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm13, %zmm0 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm5 {%k1} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11] ; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm6, %zmm6 ; AVX512F-FAST-NEXT: vpermd %zmm28, %zmm4, %zmm6 {%k1} ; AVX512F-FAST-NEXT: vpermd %zmm25, %zmm4, %zmm12 ; AVX512F-FAST-NEXT: vpermd %zmm16, %zmm4, %zmm12 {%k1} ; AVX512F-FAST-NEXT: vpermd %zmm1, %zmm4, %zmm1 ; AVX512F-FAST-NEXT: vpermd %zmm2, %zmm4, %zmm1 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,0,1,1,1,1,u,u,10,u,11,u,11,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,0,1,1,1,1,u,u,10,u,11,u,11,u,u,u] ; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm2, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm4, %zmm2 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u] ; AVX512F-FAST-NEXT: vpermd %zmm30, %zmm13, %zmm13 ; AVX512F-FAST-NEXT: vpermd %zmm29, %zmm4, %zmm13 {%k2} ; AVX512F-FAST-NEXT: vpermd %zmm15, %zmm4, %zmm15 @@ -3693,16 +3693,16 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm9 ; AVX512BW-NEXT: movw $-30584, %cx # imm = 0x8888 ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u> +; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: movw $8738, %cx # imm = 0x2222 ; AVX512BW-NEXT: kmovd %ecx, %k2 @@ -3710,80 +3710,80 @@ define void @store_i16_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-86, %cl ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm12 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm11 {%k2} ; 
AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm14 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm15 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] ; AVX512BW-NEXT: vpermi2w %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
[u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] ; AVX512BW-NEXT: vpermi2w %zmm6, %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -5745,51 +5745,51 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm4 ; AVX2-FAST-NEXT: vmovdqa (%r10), %xmm5 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,0,0,u,u,1,1] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm0, %ymm1 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm6 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,u,0,u,u,u,1,u] ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm0, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] ; AVX2-FAST-NEXT: vmovdqa (%rcx), %xmm8 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm9 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [0,0,1,1,1,1,u,u] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm0, %ymm12 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm13 ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm1[0],xmm13[0],xmm1[1],xmm13[1],xmm1[2],xmm13[2],xmm1[3],xmm13[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,u,1,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm14, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm12 = ymm15[0],ymm12[1],ymm15[2,3,4],ymm12[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm12[0,1],ymm2[2,3],ymm12[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,2,2,2,u,u,3,3] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm3 ; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,2,3,3,3,3,u,u] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm3[1],ymm0[2,3,4],ymm3[5],ymm0[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm2[2,3],ymm0[4,5],ymm2[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; 
AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,0,u,u,1,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,u,0,u,u,u,1,u] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm12, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vmovdqa 32(%rax), %xmm10 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm8 = xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,1,1,1,1,u,u] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4],xmm13[4],xmm1[5],xmm13[5],xmm1[6],xmm13[6],xmm1[7],xmm13[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,u,1,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vmovdqa 32(%r10), %xmm6 @@ -5800,7 +5800,7 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm15, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2],ymm0[3],ymm2[4,5,6],ymm0[7] ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,2,3,3,3,3,u,u] ; AVX2-FAST-NEXT: vpermd %ymm8, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1],ymm1[2,3,4],ymm2[5],ymm1[6,7] @@ -5818,17 +5818,17 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm14, %ymm15 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,u,1,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm12, %ymm14 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm14[0],ymm15[1],ymm14[2,3,4],ymm15[5],ymm14[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm14[0,1],ymm1[2,3],ymm14[4,5],ymm1[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [2,2,2,2,u,u,3,3] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm14, %ymm1 ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm14, %ymm2 ; AVX2-FAST-NEXT: vmovdqa %ymm14, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2],ymm1[3],ymm2[4,5,6],ymm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [2,2,3,3,3,3,u,u] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm4, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqa %ymm4, %ymm13 @@ -5837,14 +5837,14 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm6[4],xmm10[4],xmm6[5],xmm10[5],xmm6[6],xmm10[6],xmm6[7],xmm10[7] ; AVX2-FAST-NEXT: vpunpckhwd 
{{.*#+}} xmm1 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [0,0,0,0,u,u,1,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,u,0,u,u,u,1,u] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm10, %ymm4 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2],ymm2[3],ymm4[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm3[4],xmm8[5],xmm3[5],xmm8[6],xmm3[6],xmm8[7],xmm3[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm11[4],xmm9[4],xmm11[5],xmm9[5],xmm11[6],xmm9[6],xmm11[7],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,0,1,1,1,1,u,u] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm14, %ymm5 ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] @@ -5878,12 +5878,12 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm15[0],ymm13[1],ymm15[2,3,4],ymm13[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm13[0,1],ymm5[2,3],ymm13[4,5],ymm5[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [2,2,2,2,u,u,3,3] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm13, %ymm4 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm13, %ymm5 ; AVX2-FAST-NEXT: vmovdqa %ymm13, %ymm15 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,2,3,3,3,3,u,u] ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm12, %ymm5 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vmovdqa %ymm12, %ymm14 @@ -5892,16 +5892,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,0,0,0,u,u,1,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm11, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [0,u,0,u,u,u,1,u] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm13, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,1,1,1,1,u,u] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm10, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [0,u,1,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] @@ -5934,11 
+5934,11 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0],ymm10[1],ymm15[2,3,4],ymm10[5],ymm15[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm10[0,1],ymm3[2,3],ymm10[4,5],ymm3[6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <2,2,2,2,u,u,3,3> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [2,2,2,2,u,u,3,3] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm10, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm6, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <2,2,3,3,3,3,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [2,2,3,3,3,3,u,u] ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm14, %ymm12, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm3 = ymm6[0],ymm3[1],ymm6[2,3,4],ymm3[5],ymm6[6,7] @@ -5946,16 +5946,16 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,0,0,0,u,u,1,1> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,0,0,u,u,1,1] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <0,u,0,u,u,u,1,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,u,0,u,u,u,1,u] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0,1,2],ymm2[3],ymm3[4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7] ; AVX2-FAST-NEXT: vpunpckhwd {{.*#+}} xmm4 = xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,0,1,1,1,1,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,1,1,1,1,u,u] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,u,1,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,u,1,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm4, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm5 = ymm6[0],ymm5[1],ymm6[2,3,4],ymm5[5],ymm6[6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm5[0,1],ymm2[2,3],ymm5[4,5],ymm2[6,7] @@ -6857,9 +6857,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm7 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm2 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,u,0,u,u,u,1,u,2,2,2,2,u,u,3,3] ; AVX512F-SLOW-NEXT: vpermd %zmm2, %zmm30, %zmm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,0,0,0,u,u,1,1,2,2,2,2,u,u,3,3] ; AVX512F-SLOW-NEXT: movw $-30584, %r11w # imm = 0x8888 ; AVX512F-SLOW-NEXT: kmovw %r11d, %k2 ; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm29, %zmm0 {%k2} @@ -7022,9 +7022,9 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm30, %zmm8 ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} 
xmm6 = xmm12[4],xmm9[4],xmm12[5],xmm9[5],xmm12[6],xmm9[6],xmm12[7],xmm9[7] ; AVX512F-SLOW-NEXT: vpermd %zmm4, %zmm29, %zmm8 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [0,u,1,u,1,u,u,u,2,2,3,3,3,3,u,u] ; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm29, %zmm7 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [0,0,1,1,1,1,u,u,2,2,3,3,3,3,u,u] ; AVX512F-SLOW-NEXT: vpermd %zmm3, %zmm30, %zmm7 {%k1} ; AVX512F-SLOW-NEXT: vmovdqa 96(%rsi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 96(%rdi), %xmm2 @@ -7381,10 +7381,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpermd %zmm20, %zmm1, %zmm4 {%k1} ; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm0, %zmm5 ; AVX512F-FAST-NEXT: vpermd %zmm10, %zmm1, %zmm5 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,0,0,0,u,u,1,1,10,10,10,10,u,u,11,11] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm1 {%k2} # 64-byte Folded Reload -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,u,0,u,u,u,1,u,10,10,10,10,u,u,11,11] ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm10, %zmm10 # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm10 {%k2} # 64-byte Folded Reload ; AVX512F-FAST-NEXT: vpermd {{[-0-9]+}}(%r{{[sb]}}p), %zmm0, %zmm16 # 64-byte Folded Reload @@ -7400,15 +7400,15 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: vpermd %zmm12, %zmm0, %zmm23 ; AVX512F-FAST-NEXT: vpermd %zmm14, %zmm0, %zmm23 {%k2} ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm11, %zmm11, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,0,1,1,1,1,u,u,10,u,11,u,11,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,1,1,1,1,u,u,10,u,11,u,11,u,u,u] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm11, %zmm12 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,0,1,1,1,1,u,u,10,10,11,11,11,11,u,u] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm11, %zmm12 {%k1} ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm0, %zmm0, %zmm0 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,u,1,u,1,u,u,u,10,10,11,11,11,11,u,u] ; AVX512F-FAST-NEXT: vpermd %zmm0, %zmm14, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm8, %zmm8, %zmm14 @@ -7502,57 +7502,57 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5 ; 
AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm3, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm2, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm6, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm7, %zmm8 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm9, %zmm10 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm10, %zmm11 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm11, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm12, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = 
[u,u,u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u] +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59] ; AVX512BW-NEXT: vpermt2w %zmm30, %zmm31, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -7574,10 +7574,10 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2w %zmm28, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm28, %zmm31 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u] ; AVX512BW-NEXT: vpermt2w %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -7598,33 +7598,33 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm1, %zmm0, %zmm27 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm20 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm21 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm22 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm23 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm24 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm29, %zmm25 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm30, %zmm26 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm16 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -7643,28 +7643,28 @@ define void @store_i16_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,36,u,u,u,u,u,u,5,37,u,u,u,u,u,u,6,38,u,u,u,u,u,u,7,39,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm12, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,32,u,u,u,u,u,u,1,33,u,u,u,u,u,u,2,34,u,u,u,u,u,u,3,35,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm10, %zmm17 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,44,u,u,u,u,u,u,13,45,u,u,u,u,u,u,14,46,u,u,u,u,u,u,15,47,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm8, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,40,u,u,u,u,u,u,9,41,u,u,u,u,u,u,10,42,u,u,u,u,u,u,11,43,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm7, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [20,52,u,u,u,u,u,u,21,53,u,u,u,u,u,u,22,54,u,u,u,u,u,u,23,55,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm6, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [16,48,u,u,u,u,u,u,17,49,u,u,u,u,u,u,18,50,u,u,u,u,u,u,19,51,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm4, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [28,60,u,u,u,u,u,u,29,61,u,u,u,u,u,u,30,62,u,u,u,u,u,u,31,63,u,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9 ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm2, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [24,56,u,u,u,u,u,u,25,57,u,u,u,u,u,u,26,58,u,u,u,u,u,u,27,59,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2w %zmm0, %zmm19, %zmm3 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll index 1fc25ee329333..3abc8b1fb8250 100644 --- 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-3.ll @@ -55,7 +55,7 @@ define void @store_i32_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero ; AVX2-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <0,2,4,1,3,5,u,u> +; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,2,4,1,3,5,u,u] ; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX2-NEXT: vmovlps %xmm1, 16(%rcx) @@ -120,7 +120,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,u,1,5,u,2,6] ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vbroadcastsd (%rdx), %ymm3 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] @@ -140,7 +140,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm2 = mem[0,0] ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0],xmm1[1,2],xmm2[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = <0,4,u,1,5,u,2,6> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm2 = [0,4,u,1,5,u,2,6] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vbroadcastsd (%rdx), %ymm2 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm2[2],ymm0[3,4],ymm2[5],ymm0[6,7] @@ -154,7 +154,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps (%rsi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,1,5,u,2,6> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,u,1,5,u,2,6] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vbroadcastsd (%rdx), %ymm3 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1],ymm3[2],ymm2[3,4],ymm3[5],ymm2[6,7] @@ -171,7 +171,7 @@ define void @store_i32_stride3_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovaps (%rdi), %xmm0 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX512-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = <0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u> +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,4,8,1,5,9,2,6,10,3,7,11,u,u,u,u] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx) ; AVX512-NEXT: vmovaps %ymm0, (%rcx) @@ -669,15 +669,15 @@ define void @store_i32_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = 
[5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 @@ -1328,17 +1328,17 @@ define void @store_i32_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512-NEXT: vpermt2d %zmm2, %zmm6, %zmm7 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermt2d %zmm4, %zmm8, %zmm7 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm10 ; AVX512-NEXT: vpermt2d %zmm3, %zmm9, %zmm10 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-NEXT: vpermt2d %zmm5, %zmm11, %zmm10 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2d %zmm1, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] @@ -2734,17 +2734,17 @@ define void @store_i32_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm9 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,16,u,1,17,u,2,18,u,3,19,u,4,20,u,5] ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13 ; AVX512-NEXT: vpermt2d %zmm4, %zmm12, %zmm13 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,16,3,4,17,6,7,18,9,10,19,12,13,20,15] ; AVX512-NEXT: vpermt2d %zmm8, %zmm14, %zmm13 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,11,27,u,12,28,u,13,29,u,14,30,u,15,31,u] ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm16 ; AVX512-NEXT: vpermt2d %zmm7, %zmm15, %zmm16 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [26,1,2,27,4,5,28,7,8,29,10,11,30,13,14,31] ; AVX512-NEXT: vpermt2d %zmm11, %zmm17, %zmm16 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [5,u,22,6,u,23,7,u,24,8,u,25,9,u,26,10] ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm19 ; AVX512-NEXT: vpermt2d %zmm0, %zmm18, %zmm19 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,21,2,3,22,5,6,23,8,9,24,11,12,25,14,15] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll index a4480d5d7e957..9d91423ac13d7 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-4.ll @@ -192,14 +192,14 @@ define void @store_i32_stride4_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vmovaps (%rdx), %xmm1 ; 
AVX2-ONLY-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vinsertf128 $1, (%rcx), %ymm1, %ymm1 -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = <u,u,0,4,u,u,1,5> +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm2 = [u,u,0,4,u,u,1,5] ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm2, %ymm2 -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = <0,4,u,u,1,5,u,u> +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [0,4,u,u,1,5,u,u] ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = <u,u,2,6,u,u,3,7> +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [u,u,2,6,u,u,3,7] ; AVX2-ONLY-NEXT: vpermps %ymm1, %ymm3, %ymm1 -; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = <2,6,u,u,3,7,u,u> +; AVX2-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [2,6,u,u,3,7,u,u] ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm0 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-ONLY-NEXT: vmovaps %ymm0, 32(%r8) @@ -661,26 +661,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512F-NEXT: movb $-86, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -696,26 +696,26 @@ define void @store_i32_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; 
AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8) @@ -1337,32 +1337,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512F-NEXT: movb $-86, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512F-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = 
<8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512F-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512F-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -1398,32 +1398,32 @@ define void @store_i32_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm8, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm10, %zmm11 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm9, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm13, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm12, %zmm15 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm16, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm15, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm6, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1} ; AVX512BW-NEXT: vpermi2d %zmm7, %zmm5, %zmm8 @@ -2725,32 +2725,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512F-NEXT: movb $-86, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512F-NEXT: 
vmovdqa64 %zmm22, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512F-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512F-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512F-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 @@ -2838,32 +2838,32 @@ define void @store_i32_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,4,20,u,u,5,21,u,u,6,22,u,u,7,23] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm14, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm7, %zmm4 ; AVX512BW-NEXT: movb $-86, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,16,u,u,1,17,u,u,2,18,u,u,3,19] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm16, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,16,u,u,1,17,u,u,2,18,u,u,3,19,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm11, %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,12,28,u,u,13,29,u,u,14,30,u,u,15,31] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20 ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm18, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [12,28,u,u,13,29,u,u,14,30,u,u,15,31,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm15, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,8,24,u,u,9,25,u,u,10,26,u,u,11,27] ; AVX512BW-NEXT: vpermt2d %zmm21, %zmm20, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
[8,24,u,u,9,25,u,u,10,26,u,u,11,27,u,u] ; AVX512BW-NEXT: vpermt2d %zmm17, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 1073c24b228ba..2da05c85bf8f5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -73,7 +73,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm1[0] ; AVX2-ONLY-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 ; AVX2-ONLY-NEXT: vmovq %rax, %xmm2 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = <0,2,4,6,u,1,3,5> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm3 = [0,2,4,6,u,1,3,5] ; AVX2-ONLY-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-ONLY-NEXT: vmovd %eax, %xmm3 ; AVX2-ONLY-NEXT: vpbroadcastd %xmm3, %ymm3 @@ -96,7 +96,7 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero ; AVX512-NEXT: vinsertf32x4 $2, %xmm1, %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = <0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u> +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,1,3,5,7,9,u,u,u,u,u,u] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, %xmm1 ; AVX512-NEXT: vmovlps %xmm1, 32(%r9) @@ -199,16 +199,16 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovaps (%r8), %xmm4 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm6 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm7 = <u,u,0,4,u,u,u,1> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm7 = [u,u,0,4,u,u,u,1] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm7, %ymm7 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = <u,4,u,u,u,1,5,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm8 = [u,4,u,u,u,1,5,u] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6],ymm7[7] ; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1,2,3],ymm0[4],ymm7[5,6,7] -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm7 = <u,u,2,6,u,u,u,3> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm7 = [u,u,2,6,u,u,u,3] ; AVX2-SLOW-NEXT: vpermps %ymm5, %ymm7, %ymm5 -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm7 = <5,u,u,u,2,6,u,u> +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm7 = [5,u,u,u,2,6,u,u] ; AVX2-SLOW-NEXT: vpermps %ymm6, %ymm7, %ymm6 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,2,1] @@ -230,16 +230,16 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovaps (%r8), %xmm3 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm4 ; AVX2-FAST-NEXT: vinsertf128 $1, (%rcx), %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <u,u,0,4,u,u,u,1> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [u,u,0,4,u,u,u,1] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm5 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = <u,4,u,u,u,1,5,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm6 = [u,4,u,u,u,1,5,u] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5,6],ymm5[7] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm5[1,2,3],ymm0[4],ymm5[5,6,7] -; AVX2-FAST-NEXT: vmovaps 
{{.*#+}} ymm5 = <u,u,2,6,u,u,u,3> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [u,u,2,6,u,u,u,3] ; AVX2-FAST-NEXT: vpermps %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = <5,u,u,u,2,6,u,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm5 = [5,u,u,u,2,6,u,u] ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm4 = ymm5[0,1],ymm4[2,3],ymm5[4,5],ymm4[6,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm5 = ymm3[0,1,2,1] @@ -264,16 +264,16 @@ define void @store_i32_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovaps (%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm7 = <u,u,0,4,u,u,u,1> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm7 = [u,u,0,4,u,u,u,1] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm7, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm8 = <u,4,u,u,u,1,5,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm8 = [u,4,u,u,u,1,5,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm7 = ymm8[0,1],ymm7[2,3],ymm8[4,5,6],ymm7[7] ; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm7[1,2,3],ymm0[4],ymm7[5,6,7] -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm7 = <u,u,2,6,u,u,u,3> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm7 = [u,u,2,6,u,u,u,3] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm5, %ymm7, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm7 = <5,u,u,u,2,6,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm7 = [5,u,u,u,2,6,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm6, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm5 = ymm6[0,1],ymm5[2,3],ymm6[4,5],ymm5[6,7] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm6 = ymm4[0,1,2,1] @@ -528,7 +528,7 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovaps (%rsi), %xmm6 ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm7 ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm8 = [0,1,0,1,u,u,2,2] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm8, %ymm5 ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm9 ; AVX2-FAST-NEXT: vmovaps (%rcx), %xmm10 @@ -541,7 +541,7 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1] ; AVX2-FAST-NEXT: vpermps %ymm9, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = <0,1,u,u,3,2,3,u> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm7 = [0,1,u,u,3,2,3,u] ; AVX2-FAST-NEXT: vpermps %ymm6, %ymm7, %ymm7 ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm7 = ymm7[0,1],ymm8[2,3],ymm7[4,5,6],ymm8[7] ; AVX2-FAST-NEXT: vinsertf128 $1, (%r8), %ymm6, %ymm6 @@ -648,13 +648,13 @@ define void @store_i32_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovdqa (%r8), %ymm2 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <6,14,u,23,31,7,15,u> +; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [6,14,u,23,31,7,15,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,19,27,u,4,12,20,28,u,5,13,21,29,u,6,14] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, 
%zmm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,19,4,5,6,7,20,9,10,11,12,21,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,8,16,24,u,1,9,17,25,u,2,10,18,26,u,3] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,16,5,6,7,8,17,10,11,12,13,18,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm4, %zmm0 @@ -1125,7 +1125,7 @@ define void @store_i32_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps (%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps 32(%rdi), %xmm10 ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm5 = xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = [0,1,0,1,u,u,2,2] ; AVX2-FAST-NEXT: vpermps %ymm5, %ymm12, %ymm5 ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm14 @@ -2486,7 +2486,7 @@ define void @store_i32_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm13 ; AVX2-FAST-NEXT: vmovaps %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm7[2],xmm6[2],xmm7[3],xmm6[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm12 = [0,1,0,1,u,u,2,2] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm12, %ymm0 ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm8 ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm2 @@ -5324,7 +5324,7 @@ define void @store_i32_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm6 ; AVX2-FAST-NEXT: vmovaps %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vunpckhps {{.*#+}} xmm0 = xmm14[2],xmm11[2],xmm14[3],xmm11[3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = <0,1,0,1,u,u,2,2> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm9 = [0,1,0,1,u,u,2,2] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm9, %ymm2 ; AVX2-FAST-NEXT: vmovaps (%rdx), %xmm15 ; AVX2-FAST-NEXT: vmovaps 32(%rdx), %xmm10 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll index a781ae5eb8967..e8d7eb34988b3 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-6.ll @@ -104,7 +104,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm3 = xmm2[0],xmm1[0] ; AVX2-FAST-NEXT: vshufps {{.*#+}} ymm3 = ymm0[1,3],ymm3[1,3],ymm0[5,7],ymm3[5,7] ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[2,1,2,3] -; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <0,2,4,6,u,u,1,3> +; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [0,2,4,6,u,u,1,3] ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vunpcklps {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1 @@ -152,7 +152,7 @@ define void @store_i32_stride6_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vmovlhps {{.*#+}} xmm2 = xmm3[0],xmm2[0] ; AVX512-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX512-NEXT: vinsertf32x4 $2, %xmm2, %zmm0, %zmm0 -; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = <0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u> +; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,6,8,10,1,3,5,7,9,11,u,u,u,u] ; AVX512-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rax) ; AVX512-NEXT: vmovaps %ymm0, (%rax) @@ 
-734,15 +734,15 @@ define void @store_i32_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,16,24,u,u,1,9,17,25,u,u,2,10,18,26] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,16,24,6,7,8,9,17,25,12,13,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,3,11,19,27,u,u,4,12,20,28,u,u,5,13] ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [18,26,2,3,4,5,19,27,8,9,10,11,20,28,14,15] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm5 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,13,u,u,22,30,6,14,u,u,23,31,7,15,u,u] ; AVX512-NEXT: vpermi2d %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,21,29,4,5,6,7,22,30,10,11,12,13,23,31] ; AVX512-NEXT: vpermi2d %zmm2, %zmm3, %zmm0 @@ -1657,17 +1657,17 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: movb $36, %cl ; AVX512F-SLOW-NEXT: kmovw %ecx, %k1 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512F-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512F-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 @@ -1680,7 +1680,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: movb $-110, %cl ; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 @@ -1691,7 +1691,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = 
<0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 @@ -1701,7 +1701,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 @@ -1710,7 +1710,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512F-SLOW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 @@ -1741,11 +1741,11 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: movb $-110, %cl ; AVX512F-FAST-NEXT: kmovw %ecx, %k2 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u] ; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512F-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1753,7 +1753,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: movb $36, %cl ; AVX512F-FAST-NEXT: kmovw %ecx, %k1 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 @@ -1764,7 +1764,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512F-FAST-NEXT: 
vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 @@ -1775,17 +1775,17 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512F-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 @@ -1796,7 +1796,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512F-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512F-FAST-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512F-FAST-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 @@ -1827,17 +1827,17 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-SLOW-NEXT: movb $36, %cl ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm6 {%k1} = zmm8[0,1,0,1,2,3,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm6, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7 ; AVX512BW-SLOW-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-SLOW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm8 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-SLOW-NEXT: 
vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 @@ -1850,7 +1850,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-SLOW-NEXT: movb $-110, %cl ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm9 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 @@ -1861,7 +1861,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-SLOW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 @@ -1871,7 +1871,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %ymm11 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm11 = ymm11[2],mem[2],ymm11[3],mem[3],ymm11[6],mem[6],ymm11[7],mem[7] ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k1} = zmm11[2,3,2,3,2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm7, %zmm11 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm11, %zmm7 @@ -1880,7 +1880,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-SLOW-NEXT: vpermi2d %zmm5, %zmm4, %zmm11 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm2 = zmm2[2],zmm3[2],zmm2[3],zmm3[3],zmm2[6],zmm3[6],zmm2[7],zmm3[7],zmm2[10],zmm3[10],zmm2[11],zmm3[11],zmm2[14],zmm3[14],zmm2[15],zmm3[15] ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm11 {%k1} = zmm2[6,7,6,7,6,7,6,7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm1, %zmm11, %zmm2 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 @@ -1911,11 +1911,11 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FAST-NEXT: movb $-110, %cl ; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm7, %zmm8 ; 
AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm8, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,3,19,u,u,7,23,4,20,u,u,u,u] ; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm8 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21] ; AVX512BW-FAST-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3] @@ -1923,7 +1923,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FAST-NEXT: movb $36, %cl ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm8, %zmm7 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm8 @@ -1934,7 +1934,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FAST-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm9 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm9, %zmm7 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm9 @@ -1945,17 +1945,17 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm10 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm10 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm10, %zmm7 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm10 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,0,16,11,27,u,u,15,31,12,28,u,u,12,28] ; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm7 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm11 = [12,28,11,27,0,0,13,29,12,28,11,27,0,0,13,29] ; AVX512BW-FAST-NEXT: # zmm11 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm3, %zmm11 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm11 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm11, %zmm7 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm7, %zmm11 @@ -1966,7 +1966,7 @@ define void @store_i32_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-FAST-NEXT: # zmm3 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-FAST-NEXT: vpermi2d %zmm4, %zmm2, %zmm3 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm3 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
[0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u] ; AVX512BW-FAST-NEXT: vpermi2d %zmm1, %zmm3, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31] ; AVX512BW-FAST-NEXT: vpermi2d %zmm0, %zmm2, %zmm1 @@ -3910,7 +3910,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: movb $36, %dl ; AVX512F-SLOW-NEXT: kmovw %edx, %k1 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15] ; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15] ; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17 @@ -3923,7 +3923,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20 ; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15] ; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15] ; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11 @@ -3934,7 +3934,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: movb $-110, %cl ; AVX512F-SLOW-NEXT: kmovw %ecx, %k2 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13 @@ -3943,7 +3943,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26 ; AVX512F-SLOW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2} -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15 @@ -3967,7 +3967,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7] ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u] ; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23] ; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20 @@ -3981,7 +3981,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-SLOW-NEXT: 
vpermt2d %zmm10, %zmm21, %zmm0
 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15]
 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7]
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
+; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u]
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0
 ; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0
@@ -4051,7 +4051,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm16, %zmm15
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm25
 ; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm7
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm25
 ; AVX512F-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512F-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
@@ -4081,22 +4081,22 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1}
 ; AVX512F-FAST-NEXT: vmovdqa64 64(%r8), %zmm13
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1}
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm2
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm3
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1}
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm20, %zmm6
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1}
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm9
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2}
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm15
 ; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm0
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm18
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1}
 ; AVX512F-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm5
@@ -4185,7 +4185,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-SLOW-NEXT: movb $36, %dl
 ; AVX512BW-SLOW-NEXT: kmovd %edx, %k1
 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm18[0,1,0,1,2,3,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm18, %zmm17
 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,18,2,3,4,5,6,19,8,9,10,11,12,20,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm22, %zmm17
@@ -4198,7 +4198,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm20
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm18, %zmm20
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm20, %zmm11 {%k1}
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm20, %zmm11
 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,26,2,3,4,5,6,27,8,9,10,11,12,28,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm21, %zmm11
@@ -4209,7 +4209,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-SLOW-NEXT: movb $-110, %cl
 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm23, %zmm13 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm23, %zmm13
 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,3,4,24,6,7,8,9,10,25,12,13,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm24, %zmm13
@@ -4218,7 +4218,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm0, %zmm26
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm25, %zmm26
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm26, %zmm15 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm26, %zmm15
 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,3,4,16,6,7,8,9,10,17,12,13,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm27, %zmm15
@@ -4242,7 +4242,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%rdi), %ymm22
 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm22 = ymm22[2],mem[2],ymm22[3],mem[3],ymm22[6],mem[6],ymm22[7],mem[7]
 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm20 {%k1} = zmm22[2,3,2,3,2,3,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm22, %zmm20
 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,2,21,4,5,6,7,8,22,10,11,12,13,14,23]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm23, %zmm20
@@ -4256,7 +4256,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm10, %zmm21, %zmm0
 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} zmm5 = zmm5[2],zmm7[2],zmm5[3],zmm7[3],zmm5[6],zmm7[6],zmm5[7],zmm7[7],zmm5[10],zmm7[10],zmm5[11],zmm7[11],zmm5[14],zmm7[14],zmm5[15],zmm7[15]
 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm0 {%k1} = zmm5[6,7,6,7,6,7,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm8, %zmm5, %zmm0
 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,29,4,5,6,7,8,30,10,11,12,13,14,31]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm9, %zmm7, %zmm0
@@ -4326,7 +4326,7 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm16, %zmm15
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm25
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm7
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm26 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm24, %zmm26, %zmm25
 ; AVX512BW-FAST-NEXT: vbroadcasti32x4 {{.*#+}} zmm24 = [6,22,7,23,6,22,7,23,6,22,7,23,6,22,7,23]
 ; AVX512BW-FAST-NEXT: # zmm24 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
@@ -4356,22 +4356,22 @@ define void @store_i32_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm20, %zmm3 {%k1}
 ; AVX512BW-FAST-NEXT: vmovdqa64 64(%r8), %zmm13
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm6 {%k1}
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm14, %zmm2
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm23, %zmm9 {%k2}
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm19, %zmm3
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm26, %zmm15 {%k1}
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm20, %zmm6
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm18 {%k1}
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm22, %zmm9
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm0 {%k2}
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm7, %zmm15
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm14 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm13, %zmm14, %zmm18
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm12, %zmm5 {%k1}
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm11, %zmm19, %zmm5
@@ -8476,7 +8476,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29
 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7]
 ; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm4
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15>
+; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15]
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3
 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3
@@ -8532,7 +8532,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31
 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2}
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15>
+; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15]
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
@@ -8540,7 +8540,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5
 ; AVX512F-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2}
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15>
+; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15]
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9
@@ -8548,7 +8548,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11
 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1}
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
+; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15]
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27
@@ -8565,14 +8565,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1
 ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3]
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u>
+; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u]
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21
 ; AVX512F-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7]
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
+; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u]
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24
 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7]
 ; AVX512F-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17
@@ -8668,7 +8668,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm1
 ; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm1
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm1, %zmm16
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u]
 ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512F-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm8, %zmm1
@@ -8803,22 +8803,22 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1}
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1}
 ; AVX512F-FAST-NEXT: vmovdqa64 (%r8), %zmm0
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16
 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm17
 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm23
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm27
 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm19
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm16, %zmm17
 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u]
 ; AVX512F-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm5
 ; AVX512F-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2}
@@ -9045,7 +9045,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm29
 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm29 {%k1} = zmm0[0,1,0,1,2,3,6,7]
 ; AVX512BW-SLOW-NEXT: vmovdqa64 (%r8), %zmm4
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm0, %zmm3
 ; AVX512BW-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-SLOW-NEXT: vmovdqa64 64(%r8), %zmm3
@@ -9101,7 +9101,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm31
 ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm31 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm16
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm10
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm8
@@ -9109,7 +9109,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm5
 ; AVX512BW-SLOW-NEXT: vmovdqu64 (%rsp), %zmm1 # 64-byte Reload
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm5 {%k2}
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm20
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm25
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm9
@@ -9117,7 +9117,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-SLOW-NEXT: vpermi2d %zmm7, %zmm12, %zmm11
 ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11 {%k1}
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm23
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm26
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm27
@@ -9134,14 +9134,14 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-SLOW-NEXT: vmovdqa 192(%rdi), %ymm1
 ; AVX512BW-SLOW-NEXT: vpunpckhdq {{.*#+}} ymm1 = ymm1[2],mem[2],ymm1[3],mem[3],ymm1[6],mem[6],ymm1[7],mem[7]
 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm21 {%k1} = zmm1[2,3,2,3,2,3,2,3]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm19
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm22
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm2, %zmm1, %zmm18
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm6, %zmm1, %zmm21
 ; AVX512BW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload
 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 {%k1} = zmm1[6,7,6,7,6,7,6,7]
-; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
+; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm4, %zmm1, %zmm24
 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm17 {%k1} = zmm30[6,7,6,7,6,7,6,7]
 ; AVX512BW-SLOW-NEXT: vpermt2d %zmm3, %zmm1, %zmm17
@@ -9237,7 +9237,7 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm1
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm1, %zmm16
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [3,19,0,16,3,19,0,16,7,23,4,20,u,u,u,u]
 ; AVX512BW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm10 = [4,20,3,19,0,0,5,21,4,20,3,19,0,0,5,21]
 ; AVX512BW-FAST-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3]
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm8, %zmm1
@@ -9372,22 +9372,22 @@ define void @store_i32_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm7, %zmm19 {%k1}
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm25, %zmm5 {%k1}
 ; AVX512BW-FAST-NEXT: vmovdqa64 (%r8), %zmm0
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,16,u,6,7,8,9,17,u,12,13,14,15]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm2, %zmm16
 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm16, (%rsp) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [18,u,2,3,4,5,19,u,8,9,10,11,20,u,14,15]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm7, %zmm17
 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm17, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = <0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm18 = [0,1,21,u,4,5,6,7,22,u,10,11,12,13,23,u]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm18, %zmm23
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,1,2,3,24,u,6,7,8,9,25,u,12,13,14,15]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm25, %zmm27
 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm27, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = <26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm16 = [26,u,2,3,4,5,27,u,8,9,10,11,28,u,14,15]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm16, %zmm19
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm16, %zmm17
 ; AVX512BW-FAST-NEXT: vmovdqu64 %zmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u>
+; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,29,u,4,5,6,7,30,u,10,11,12,13,31,u]
 ; AVX512BW-FAST-NEXT: vpermt2d %zmm0, %zmm31, %zmm5
 ; AVX512BW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm27 # 64-byte Reload
 ; AVX512BW-FAST-NEXT: vmovdqa64 %zmm22, %zmm27 {%k2}
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
index e5992acfaa2a7..9ea961e6802dd 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-7.ll
@@ -101,7 +101,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2]
 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7]
-; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u>
+; AVX2-SLOW-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
 ; AVX2-SLOW-NEXT: vpermps %ymm2, %ymm3, %ymm2
 ; AVX2-SLOW-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1]
 ; AVX2-SLOW-NEXT: # ymm3 = mem[0,1,0,1]
@@ -131,13 +131,13 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vunpcklpd {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
 ; AVX2-FAST-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,2,1,3]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = <3,5,7,u>
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} xmm1 = [3,5,7,u]
 ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm1, %ymm1
 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1]
 ; AVX2-FAST-NEXT: # ymm3 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT: vpermps %ymm0, %ymm3, %ymm3
 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[3]
-; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = <0,2,4,6,u,u,u,1>
+; AVX2-FAST-NEXT: vmovaps {{.*#+}} ymm4 = [0,2,4,6,u,u,u,1]
 ; AVX2-FAST-NEXT: vpermps %ymm2, %ymm4, %ymm2
 ; AVX2-FAST-NEXT: vbroadcastf128 {{.*#+}} ymm4 = [0,2,4,0,0,2,4,0]
 ; AVX2-FAST-NEXT: # ymm4 = mem[0,1,0,1]
@@ -172,7 +172,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm3 = ymm0[0,2,2,3,4,6,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm3 = ymm3[0,1,0,2]
 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6],ymm1[7]
-; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = <3,5,7,u>
+; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} xmm3 = [3,5,7,u]
 ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm2, %ymm3, %ymm2
 ; AVX2-FAST-PERLANE-NEXT: vbroadcastf128 {{.*#+}} ymm3 = [3,5,0,1,3,5,0,1]
 ; AVX2-FAST-PERLANE-NEXT: # ymm3 = mem[0,1,0,1]
@@ -201,7 +201,7 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
 ; AVX512-SLOW-NEXT: vpunpcklqdq {{.*#+}} ymm0 = ymm0[0],ymm3[0],ymm0[2],ymm3[2]
-; AVX512-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u>
+; AVX512-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,16,20,18,1,3,5,7,17,21,19,u,u]
 ; AVX512-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm1
 ; AVX512-SLOW-NEXT: vextracti32x4 $2, %zmm1, 32(%rax)
 ; AVX512-SLOW-NEXT: vextracti32x4 $3, %zmm1, %xmm0
@@ -225,10 +225,10 @@ define void @store_i32_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
 ; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero
 ; AVX512-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u>
+; AVX512-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u]
 ; AVX512-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1
 ; AVX512-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm0
-; AVX512-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u>
+; AVX512-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,2,4,6,8,10,12,1,3,5,7,9,11,13,u,u]
 ; AVX512-FAST-NEXT: vpermd %zmm0, %zmm1, %zmm0
 ; AVX512-FAST-NEXT: vextracti32x4 $2, %zmm0, 32(%rax)
 ; AVX512-FAST-NEXT: vextracti32x4 $3, %zmm0, %xmm1
@@ -529,7 +529,7 @@ define void @store_i32_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,4,8,12,16,20,24,1,5,9,13,17,21,25,2,6]
 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm2
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [10,14,18,22,26,3,7,11,15,19,23,27,u,u,u,u]
 ; AVX512-NEXT: vpermi2d %zmm1, %zmm0, %zmm3
 ; AVX512-NEXT: vextracti32x4 $2, %zmm3, 96(%rax)
 ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax)
@@ -1118,23 +1118,23 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7]
 ; AVX512F-NEXT: # ymm1 = mem[0,1,0,1]
 ; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10]
 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512F-NEXT: kmovw %ecx, %k1
 ; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12]
 ; AVX512F-NEXT: vpermi2d %zmm4, %zmm5, %zmm8
 ; AVX512F-NEXT: movw $3612, %cx # imm = 0xE1C
 ; AVX512F-NEXT: kmovw %ecx, %k1
 ; AVX512F-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14]
 ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
 ; AVX512F-NEXT: movw $15480, %cx # imm = 0x3C78
 ; AVX512F-NEXT: kmovw %ecx, %k1
@@ -1164,23 +1164,23 @@ define void @store_i32_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vbroadcasti128 {{.*#+}} ymm1 = [6,23,31,7,6,23,31,7]
 ; AVX512BW-NEXT: # ymm1 = mem[0,1,0,1]
 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,8,16,u,u,u,u,1,9,17,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,16,24,u,u,u,1,9,17,25,u,u,u,2,10]
 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm7
 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512BW-NEXT: kmovd %ecx, %k1
 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm7 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,2,10,18,u,u,u,u,3,11,19,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [2,10,u,u,u,19,27,3,11,u,u,u,20,28,4,12]
 ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm5, %zmm8
 ; AVX512BW-NEXT: movw $3612, %cx # imm = 0xE1C
 ; AVX512BW-NEXT: kmovd %ecx, %k1
 ; AVX512BW-NEXT: vmovdqa32 %zmm6, %zmm8 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,5,13,21,29,u,u,u,6,14,22,30,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm4, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [4,12,20,u,u,u,u,5,13,21,u,u,u,u,6,14]
 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4
 ; AVX512BW-NEXT: movw $15480, %cx # imm = 0x3C78
 ; AVX512BW-NEXT: kmovd %ecx, %k1
@@ -2518,96 +2518,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm1
 ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm2
 ; AVX512F-NEXT: vmovdqa64 (%r10), %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm8
 ; AVX512F-NEXT: movw $6192, %cx # imm = 0x1830
 ; AVX512F-NEXT: kmovw %ecx, %k1
 ; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u]
 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm2, %zmm7
 ; AVX512F-NEXT: movw $24769, %cx # imm = 0x60C1
 ; AVX512F-NEXT: kmovw %ecx, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2}
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31]
 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm8, %zmm7
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm9
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18]
 ; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm8
 ; AVX512F-NEXT: movw $1548, %cx # imm = 0x60C
 ; AVX512F-NEXT: kmovw %ecx, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm9
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm9, %zmm10
 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512F-NEXT: kmovw %ecx, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm10
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20]
 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm9
 ; AVX512F-NEXT: movw $12384, %cx # imm = 0x3060
 ; AVX512F-NEXT: kmovw %ecx, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm10
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm10, %zmm11
 ; AVX512F-NEXT: movw $3612, %cx # imm = 0xE1C
 ; AVX512F-NEXT: kmovw %ecx, %k3
 ; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm10
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm11
 ; AVX512F-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22]
 ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm12
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15]
 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm12, %zmm10
 ; AVX512F-NEXT: movw $15480, %cx # imm = 0x3C78
 ; AVX512F-NEXT: kmovw %ecx, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm11
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9]
 ; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm12
 ; AVX512F-NEXT: movw $3096, %cx # imm = 0xC18
 ; AVX512F-NEXT: kmovw %ecx, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm11
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u]
 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm11, %zmm13
 ; AVX512F-NEXT: movw $28897, %cx # imm = 0x70E1
 ; AVX512F-NEXT: kmovw %ecx, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11]
 ; AVX512F-NEXT: vpermi2d %zmm5, %zmm3, %zmm11
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u]
 ; AVX512F-NEXT: vpermi2d %zmm4, %zmm6, %zmm13
 ; AVX512F-NEXT: movw $-31994, %cx # imm = 0x8306
 ; AVX512F-NEXT: kmovw %ecx, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm11
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm11, %zmm14
 ; AVX512F-NEXT: movw $7224, %cx # imm = 0x1C38
 ; AVX512F-NEXT: kmovw %ecx, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm6, %zmm4, %zmm11
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u]
 ; AVX512F-NEXT: vpermi2d %zmm3, %zmm5, %zmm4
 ; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13]
 ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15]
 ; AVX512F-NEXT: vpermi2d %zmm0, %zmm3, %zmm1
 ; AVX512F-NEXT: movw $-30962, %cx # imm = 0x870E
 ; AVX512F-NEXT: kmovw %ecx, %k1
@@ -2633,96 +2633,96 @@ define void @store_i32_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm1
 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm2
 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8
 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830
 ; AVX512BW-NEXT: kmovd %ecx, %k1
 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u]
 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm2, %zmm7
 ; AVX512BW-NEXT: movw $24769, %cx # imm = 0x60C1
 ; AVX512BW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm7, %zmm8 {%k2}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31]
 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm8, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18]
 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm8
 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C
 ; AVX512BW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm9, %zmm8 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm9, %zmm10
 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512BW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm8 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm10
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20]
 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm9
 ; AVX512BW-NEXT: movw $12384, %cx # imm = 0x3060
 ; AVX512BW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm9 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm10
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm10, %zmm11
 ; AVX512BW-NEXT: movw $3612, %cx # imm = 0xE1C
 ; AVX512BW-NEXT: kmovd %ecx, %k3
 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm10
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11
 ; AVX512BW-NEXT: vmovdqa32 %zmm10, %zmm11 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22]
 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm12
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15]
 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm12, %zmm10
 ; AVX512BW-NEXT: movw $15480, %cx # imm = 0x3C78
 ; AVX512BW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm10 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9]
 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm12
 ; AVX512BW-NEXT: movw $3096, %cx # imm = 0xC18
 ; AVX512BW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm12 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u]
 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm13
 ; AVX512BW-NEXT: movw $28897, %cx # imm = 0x70E1
 ; AVX512BW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm12 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11]
 ; AVX512BW-NEXT: vpermi2d %zmm5, %zmm3, %zmm11
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u]
 ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm6, %zmm13
 ; AVX512BW-NEXT: movw $-31994, %cx # imm = 0x8306
 ; AVX512BW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm13 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm11, %zmm14
 ; AVX512BW-NEXT: movw $7224, %cx # imm = 0x1C38
 ; AVX512BW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm13 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm4, %zmm11
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u]
 ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm5, %zmm4
 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm4 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13]
 ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15]
 ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm3, %zmm1
 ; AVX512BW-NEXT: movw $-30962, %cx # imm = 0x870E
 ; AVX512BW-NEXT: kmovd %ecx, %k1
@@ -5498,42 +5498,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm10
 ; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm26
 ; AVX512F-NEXT: vmovdqa64 (%rax), %zmm15
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm0
 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm21, %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm3
 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm6, %zmm3
 ; AVX512F-NEXT: movw $6192, %cx # imm = 0x1830
 ; AVX512F-NEXT: kmovw %ecx, %k1
 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u]
 ; AVX512F-NEXT: vpermi2d %zmm22, %zmm3, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0
 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm20, %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm3
 ; AVX512F-NEXT: vpermt2d %zmm23, %zmm11, %zmm3
 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29]
 ; AVX512F-NEXT: vpermi2d %zmm22, %zmm3, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = <u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512F-NEXT: vpermt2d %zmm9, %zmm28, %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18]
 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm7
 ; AVX512F-NEXT: vpermt2d %zmm8, %zmm25, %zmm7
 ; AVX512F-NEXT: movw $1548, %cx # imm = 0x60C
 ; AVX512F-NEXT: kmovw %ecx, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5
 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm5
 ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm30
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16
 ; AVX512F-NEXT: vpermt2d %zmm18, %zmm17, %zmm16
 ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm27
@@ -5544,7 +5544,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm28, %zmm23
 ; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm29
 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm25, %zmm18
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm5
 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2}
@@ -5554,7 +5554,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm23
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm28
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = <u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm26, %zmm25, %zmm22
 ; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm25
 ; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm22
@@ -5562,14 +5562,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm21
 ; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm6
 ; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u]
 ; AVX512F-NEXT: vpermi2d %zmm4, %zmm10, %zmm2
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31]
 ; AVX512F-NEXT: vpermi2d %zmm15, %zmm2, %zmm21
 ; AVX512F-NEXT: movw $-7741, %ax # imm = 0xE1C3
 ; AVX512F-NEXT: kmovw %eax, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11]
 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm2, %zmm30
 ; AVX512F-NEXT: movw $-31994, %ax # imm = 0x8306
 ; AVX512F-NEXT: kmovw %eax, %k2
@@ -5577,56 +5577,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm20
 ; AVX512F-NEXT: vpermi2d %zmm1, %zmm9, %zmm11
 ; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13]
 ; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15]
 ; AVX512F-NEXT: vpermi2d %zmm15, %zmm20, %zmm21
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm5
 ; AVX512F-NEXT: movw $-30962, %ax # imm = 0x870E
 ; AVX512F-NEXT: kmovw %eax, %k1
 ; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm25, %zmm21, %zmm5
 ; AVX512F-NEXT: movw $7224, %ax # imm = 0x1C38
 ; AVX512F-NEXT: kmovw %eax, %k1
 ; AVX512F-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm22, %zmm27
 ; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm2
 ; AVX512F-NEXT: vpermi2d %zmm14, %zmm8, %zmm17
 ; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9]
 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm5, %zmm3
 ; AVX512F-NEXT: movw $3096, %ax # imm = 0xC18
 ; AVX512F-NEXT: kmovw %eax, %k2
 ; AVX512F-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2}
 ; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20
 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm21, %zmm20
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm26, %zmm2, %zmm0
 ; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u]
 ; AVX512F-NEXT: vpermt2d %zmm25, %zmm20, %zmm0
 ; AVX512F-NEXT: movw $28897, %ax # imm = 0x70E1
 ; AVX512F-NEXT: kmovw %eax, %k3
 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm31
 ; AVX512F-NEXT: vpermi2d %zmm9, %zmm1, %zmm22
 ; AVX512F-NEXT: vpermi2d %zmm8, %zmm14, %zmm5
 ; AVX512F-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm21, %zmm24
 ; AVX512F-NEXT: movw $12384, %ax # imm = 0x3060
 ; AVX512F-NEXT: kmovw %eax, %k1
 ; AVX512F-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1}
 ; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm2
 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm20, %zmm2
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22]
 ; AVX512F-NEXT: vpermt2d %zmm26, %zmm20, %zmm23
 ; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15]
 ; AVX512F-NEXT: vpermt2d %zmm25, %zmm2, %zmm23
 ; AVX512F-NEXT: movw $15480, %ax # imm = 0x3C78
 ; AVX512F-NEXT: kmovw %eax, %k2
@@ -5637,17 +5637,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermi2d %zmm10, %zmm4, %zmm20
 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm2, %zmm20
 ; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm12, %zmm0, %zmm29
 ; AVX512F-NEXT: vpermt2d %zmm8, %zmm0, %zmm14
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20]
 ; AVX512F-NEXT: vpermt2d %zmm13, %zmm0, %zmm19
 ; AVX512F-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
 ; AVX512F-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm4
 ; AVX512F-NEXT: vpermt2d %zmm26, %zmm0, %zmm28
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm15, %zmm0, %zmm4
 ; AVX512F-NEXT: vpermt2d %zmm25, %zmm0, %zmm28
 ; AVX512F-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1}
@@ -5655,10 +5655,10 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: kmovw %eax, %k1
 ; AVX512F-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1}
 ; AVX512F-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u]
 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-NEXT: vpermi2d %zmm26, %zmm2, %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15]
 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512F-NEXT: vpermi2d %zmm26, %zmm4, %zmm2
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31]
@@ -5701,42 +5701,42 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm10
 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm26
 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm15
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm0
 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm21, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm3
 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm6, %zmm3
 ; AVX512BW-NEXT: movw $6192, %cx # imm = 0x1830
 ; AVX512BW-NEXT: kmovd %ecx, %k1
 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u]
 ; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0
 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0
 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm20, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm3
 ; AVX512BW-NEXT: vpermt2d %zmm23, %zmm11, %zmm3
 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29]
 ; AVX512BW-NEXT: vpermi2d %zmm22, %zmm3, %zmm0
 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm0
 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm28, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18]
 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm7
 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm25, %zmm7
 ; AVX512BW-NEXT: movw $1548, %cx # imm = 0x60C
 ; AVX512BW-NEXT: kmovd %ecx, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm7 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5
 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm5
 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm17 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16
 ; AVX512BW-NEXT: vpermt2d %zmm18, %zmm17, %zmm16
 ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm27
@@ -5747,7 +5747,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm28, %zmm23
 ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm29
 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm25, %zmm18
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm5
 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870
 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm18 {%k2}
@@ -5757,7 +5757,7 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm23
 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm28
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm25, %zmm22
 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm25
 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm22
@@ -5765,14 +5765,14 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm21
 ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm6
 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u]
 ; AVX512BW-NEXT: vpermi2d %zmm4, %zmm10, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31]
 ; AVX512BW-NEXT: vpermi2d %zmm15, %zmm2, %zmm21
 ; AVX512BW-NEXT: movw $-7741, %ax # imm = 0xE1C3
 ; AVX512BW-NEXT: kmovd %eax, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm6 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11]
 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm2, %zmm30
 ; AVX512BW-NEXT: movw $-31994, %ax # imm = 0x8306
 ; AVX512BW-NEXT: kmovd %eax, %k2
@@ -5780,56 +5780,56 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm20
 ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm9, %zmm11
 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm11 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13]
 ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15]
 ; AVX512BW-NEXT: vpermi2d %zmm15, %zmm20, %zmm21
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm5
 ; AVX512BW-NEXT: movw $-30962, %ax # imm = 0x870E
 ; AVX512BW-NEXT: kmovd %eax, %k1
 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm11 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm21, %zmm5
 ; AVX512BW-NEXT: movw $7224, %ax # imm = 0x1C38
 ; AVX512BW-NEXT: kmovd %eax, %k1
 ; AVX512BW-NEXT: vmovdqa32 %zmm5, %zmm16 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm22, %zmm27
 ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm2
 ; AVX512BW-NEXT: vpermi2d %zmm14, %zmm8, %zmm17
 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm17 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9]
 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm5, %zmm3
 ; AVX512BW-NEXT: movw $3096, %ax # imm = 0xC18
 ; AVX512BW-NEXT: kmovd %eax, %k2
 ; AVX512BW-NEXT: vmovdqa32 %zmm27, %zmm3 {%k2}
 ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20
 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm21, %zmm20
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm2, %zmm0
 ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm17 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u]
 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm20, %zmm0
 ; AVX512BW-NEXT: movw $28897, %ax # imm = 0x70E1
 ; AVX512BW-NEXT: kmovd %eax, %k3
 ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm3 {%k3}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm31
 ; AVX512BW-NEXT: vpermi2d %zmm9, %zmm1, %zmm22
 ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm14, %zmm5
 ; AVX512BW-NEXT: vmovdqa32 %zmm22, %zmm5 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm21, %zmm24
 ; AVX512BW-NEXT: movw $12384, %ax # imm = 0x3060
 ; AVX512BW-NEXT: kmovd %eax, %k1
 ; AVX512BW-NEXT: vmovdqa32 %zmm31, %zmm24 {%k1}
 ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm2
 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm20, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22]
 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm20, %zmm23
 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm5 {%k3}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15]
 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm2, %zmm23
 ; AVX512BW-NEXT: movw $15480, %ax # imm = 0x3C78
 ; AVX512BW-NEXT: kmovd %eax, %k2
@@ -5840,17 +5840,17 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm4, %zmm20
 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm2, %zmm20
 ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm20 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm12, %zmm0, %zmm29
 ; AVX512BW-NEXT: vpermt2d %zmm8, %zmm0, %zmm14
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20]
 ; AVX512BW-NEXT: vpermt2d %zmm13, %zmm0, %zmm19
 ; AVX512BW-NEXT: vpermt2d %zmm9, %zmm0, %zmm1
 ; AVX512BW-NEXT: vmovdqa32 %zmm29, %zmm19 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm4
 ; AVX512BW-NEXT: vpermt2d %zmm26, %zmm0, %zmm28
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm15, %zmm0, %zmm4
 ; AVX512BW-NEXT: vpermt2d %zmm25, %zmm0, %zmm28
 ; AVX512BW-NEXT: vmovdqa32 %zmm14, %zmm1 {%k1}
@@ -5858,10 +5858,10 @@ define void @store_i32_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: kmovd %eax, %k1
 ; AVX512BW-NEXT: vmovdqa32 %zmm28, %zmm19 {%k1}
 ; AVX512BW-NEXT: vmovdqa32 %zmm4, %zmm1 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u]
 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512BW-NEXT: vpermi2d %zmm26, %zmm2, %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15]
 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512BW-NEXT: vpermi2d %zmm26, %zmm4, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,29,2,3,4,5,6,7,30,9,10,11,12,13,14,31]
@@ -11476,47 +11476,47 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm6
 ; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm5
 ; AVX512F-NEXT: vmovdqa64 128(%r9), %zmm4
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm7
 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm7
 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7
 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm7, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7
 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm7
 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm7
 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm12
 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm12
 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm13
 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm13
 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14
 ; AVX512F-NEXT: vpermt2d %zmm6, %zmm1, %zmm14
 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm14
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u]
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6
 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm0
@@ -11556,23 +11556,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vpermt2d %zmm22, %zmm1, %zmm4
 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm11, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = <u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm30, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm3, %zmm4, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm1, %zmm3
 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm24
@@ -11632,29 +11632,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermt2d %zmm2, %zmm0, %zmm29
 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm22
 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18]
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm20, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = <u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm27, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9]
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm3
 ; AVX512F-NEXT: vpermt2d %zmm22, %zmm6, %zmm3
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm18
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm18
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm22
 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm5
 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0
@@ -11714,7 +11714,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1}
 ; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rax
 ; AVX512F-NEXT: vmovdqa64 (%rax), %zmm28
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u]
 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm8
 ; AVX512F-NEXT: movw $28897, %cx # imm = 0x70E1
@@ -11741,7 +11741,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload
 ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2}
 ; AVX512F-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u]
 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload
 ; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2
 ;
AVX512F-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -11756,7 +11756,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512F-NEXT: movw $-30962, %cx # imm = 0x870E @@ -11783,7 +11783,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512F-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512F-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512F-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -11795,7 +11795,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512F-NEXT: movw $14448, %cx # imm = 0x3870 @@ -11821,12 +11821,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512F-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512F-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512F-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512F-NEXT: movw $3612, %ax # imm = 0xE1C @@ -11847,23 +11847,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512F-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 
= <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512F-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512F-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -11882,17 +11882,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512F-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512F-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512F-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512F-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512F-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512F-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 @@ -11952,47 +11952,47 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm5 ; AVX512BW-NEXT: vmovdqa64 128(%r9), %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,1,17,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,18,u,u,u,u,u,3,19,u,u,u,u,u,4,20] ; AVX512BW-NEXT: vmovdqa64 %zmm3, 
%zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm7 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm7 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,u,7,23,u,u,u,u,u,8,24,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm12 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,9,25,u,u,u,u,u,10,26,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm13 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,11,27,u,u,u,u,u,12,28,u,u,u,u,u,13] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14 ; AVX512BW-NEXT: vpermt2d %zmm6, %zmm1, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm14 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [13,u,u,u,u,u,30,14,u,u,u,u,u,31,15,u] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm0 @@ -12032,23 +12032,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm1, %zmm4 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,5,21,u,u,u,u,u,6,22,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,7,23,u,u,u,u,u,8,24,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm11, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,9,25,u,u,u,u,u,10,26,u,u,u,u,u,11] ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm30, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [11,u,u,u,u,u,28,12,u,u,u,u,u,29,13,u] ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm1 ; 
AVX512BW-NEXT: vpermt2d %zmm3, %zmm4, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,15,31,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm24 @@ -12108,29 +12108,29 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm0, %zmm29 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm22 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,16,u,u,u,u,u,1,17,u,u,u,u,u,2,18] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,u,u,u,3,19,u,u,u,u,u,4,20,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm20, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [u,u,u,5,21,u,u,u,u,u,6,22,u,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm27, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,7,23,u,u,u,u,u,8,24,u,u,u,u,u,9] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [9,u,u,u,u,u,26,10,u,u,u,u,u,27,11,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm3 ; AVX512BW-NEXT: vpermt2d %zmm22, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,13,29,u,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm18 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm18 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,14,30,u,u,u,u,u,15,31,u,u,u,u,u] ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm22 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm5 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0 @@ -12190,7 +12190,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm7 {%k1} ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [22,u,u,u,u,5,6,23,u,u,u,u,12,13,24,u] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm8 ; AVX512BW-NEXT: movw $28897, %cx # imm = 0x70E1 @@ -12217,7 +12217,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm25 {%k2} ; AVX512BW-NEXT: vmovdqa32 %zmm30, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,3,4,25,u,u,u,u,10,11,26,u,u,u] ; AVX512BW-NEXT: vmovdqu64 
{{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $7224, %cx # imm = 0x1C38 @@ -12232,7 +12232,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa32 %zmm2, %zmm6 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,1,2,27,u,u,u,u,8,9,28,u,u,u,u,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm2 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm2 ; AVX512BW-NEXT: movw $-30962, %cx # imm = 0x870E @@ -12259,7 +12259,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm0, %zmm22 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm24, %zmm5 {%k1} ; AVX512BW-NEXT: vmovdqa32 %zmm21, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,29,u,u,u,u,6,7,30,u,u,u,u,13,14,31] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-NEXT: movw $-7741, %cx # imm = 0xE1C3 @@ -12271,7 +12271,7 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm17 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,4,5,16,u,u,u,u,11,12,17,u,u] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm1 ; AVX512BW-NEXT: movw $14448, %cx # imm = 0x3870 @@ -12297,12 +12297,12 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm1, %zmm19 {%k3} ; AVX512BW-NEXT: vmovdqa64 192(%r8), %zmm1 ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,0,16,u,u,u,u,u,1,17,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm11 ; AVX512BW-NEXT: vmovdqa64 192(%rax), %zmm10 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,2,3,18,u,u,u,u,9,10,19,u,u,u,u] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm11 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm0, %zmm11 ; AVX512BW-NEXT: movw $3612, %ax # imm = 0xE1C @@ -12323,23 +12323,23 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa32 %zmm11, %zmm24 {%k2} ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm18 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm20, %zmm18 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,18,u,u,u,u,u,3,19,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm1, %zmm13 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm0, %zmm13 ; AVX512BW-NEXT: vmovdqa32 %zmm13, %zmm18 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,23,u,u,8,9,10,11,24,u,u,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm16, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15> +; AVX512BW-NEXT: 
vmovdqa64 {{.*#+}} zmm11 = [0,1,2,25,u,u,6,7,8,9,26,u,u,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,27,u,u,4,5,6,7,28,u,u,11,12,13,14,29] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm14 # 64-byte Reload ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm14, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,3,4,5,30,u,u,9,10,11,12,31,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm15, %zmm14 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm15 # 64-byte Reload ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa32 %zmm15, %zmm0 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,20,u,u,u,u,7,8,21,u,u,u,u,14,15] ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm16 # 64-byte Reload ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm15, %zmm16 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm21 # 64-byte Reload @@ -12358,17 +12358,17 @@ define void @store_i32_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm15, %zmm8 ; AVX512BW-NEXT: vmovdqa32 %zmm23, %zmm8 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm20 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [4,20,u,u,u,u,u,5,21,u,u,u,u,u,6,22] ; AVX512BW-NEXT: vpermt2d %zmm2, %zmm8, %zmm1 ; AVX512BW-NEXT: vpermt2d %zmm10, %zmm15, %zmm1 ; AVX512BW-NEXT: vmovdqa32 %zmm12, %zmm1 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,23,u,8,9,10,11,12,24,u,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm3, %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,25,u,6,7,8,9,10,26,u,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm11, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,27,u,4,5,6,7,8,28,u,11,12,13,14,15] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm13, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [29,u,2,3,4,5,6,30,u,9,10,11,12,13,31,u] ; AVX512BW-NEXT: vpermi2d %zmm2, %zmm14, %zmm12 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [22,1,2,3,4,5,6,23,8,9,10,11,12,13,24,15] ; AVX512BW-NEXT: vpermi2d %zmm10, %zmm8, %zmm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll index 5c844cf04e1f5..9b0e526af057d 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-8.ll @@ -716,26 +716,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512F-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = 
[0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512F-NEXT: movb $-52, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -758,26 +758,26 @@ define void @store_i32_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%r10), %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,0,8,16,24,u,u,u,u,1,9,17,25] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,16,24,u,u,u,u,1,9,17,25,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm5 ; AVX512BW-NEXT: movb $-52, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,2,10,18,26,u,u,u,u,3,11,19,27] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,18,26,u,u,u,u,3,11,19,27,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm6 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,4,12,20,28,u,u,u,u,5,13,21,29] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,20,28,u,u,u,u,5,13,21,29,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,6,14,22,30,u,u,u,u,7,15,23,31] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,22,30,u,u,u,u,7,15,23,31,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: 
vmovdqa64 %zmm4, %zmm2 {%k1} ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%rax) @@ -1557,16 +1557,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512F-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512F-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512F-NEXT: movb $-120, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512F-NEXT: movb $34, %cl ; AVX512F-NEXT: kmovw %ecx, %k2 @@ -1574,80 +1574,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: movb $-52, %cl ; AVX512F-NEXT: kmovw %ecx, %k3 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 
{{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512F-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512F-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; 
AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -1675,16 +1675,16 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm6 ; AVX512BW-NEXT: vmovdqa64 (%r11), %zmm7 ; AVX512BW-NEXT: vmovdqa64 (%r10), %zmm8 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm4 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm9 ; AVX512BW-NEXT: movb $-120, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm4 ; AVX512BW-NEXT: movb $34, %cl ; AVX512BW-NEXT: kmovd %ecx, %k2 @@ -1692,80 +1692,80 @@ define void @store_i32_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $-52, %cl ; AVX512BW-NEXT: kmovd %ecx, %k3 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm4 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm9 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm10 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm11 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm9 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm9 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm10 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm11 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm10 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm10 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm10 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm11 -; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm12 ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm12 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm13 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm11 ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm11 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm11 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm14 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm14 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm15 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm15 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm15 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm13 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm13 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm16 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm16 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm13, %zmm16 {%k3} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512BW-NEXT: vpermi2d %zmm8, %zmm7, %zmm12 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; 
AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512BW-NEXT: vpermi2d %zmm6, %zmm5, %zmm7 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm7 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm3, %zmm2, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm2 {%k2} ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm2 {%k3} @@ -3252,57 +3252,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 64(%r10), %zmm0 ; AVX512F-NEXT: vmovdqa64 (%rax), %zmm30 ; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19] ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm4 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm3, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm2, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17] ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm5, %zmm6 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm6, %zmm7 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23] ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm8 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm7, %zmm8 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21] ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm10 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm9, %zmm10 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm11 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm10, %zmm11 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27] ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm11, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = 
[u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm13 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm12, %zmm13 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25] ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm14 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm13, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u] +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31] ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm16 ; AVX512F-NEXT: vpermt2d %zmm30, %zmm15, %zmm16 ; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29] ; AVX512F-NEXT: vpermt2d %zmm30, %zmm31, %zmm4 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 @@ -3324,10 +3324,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2d %zmm28, %zmm31, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm31 ; AVX512F-NEXT: vpermt2d %zmm27, %zmm28, %zmm31 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u] ; AVX512F-NEXT: vpermt2d %zmm27, %zmm0, %zmm1 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm27 @@ -3348,33 +3348,33 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpermt2d %zmm1, %zmm0, %zmm27 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm16 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm20 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm20 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm21 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm21 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm22 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm22 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm23 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm23 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm24 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm24 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm29 = 
[u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm25
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm29, %zmm25
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = <u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm26
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm30, %zmm26
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm16
 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5
 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0
@@ -3393,28 +3393,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm3
 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm18
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm12, %zmm18
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm17
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm17
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm15
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm8, %zmm15
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm14
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm14
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm13
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm6, %zmm13
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm11
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm11
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm9
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm2, %zmm9
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm19, %zmm3
 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm1
 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0
@@ -3546,57 +3546,57 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 64(%r10), %zmm0
 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm30
 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm28
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19]
 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm5
 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm4
 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm3, %zmm5
 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5
 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm2, %zmm5
 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17]
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6
 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm5, %zmm6
 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7
 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm6, %zmm7
 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23]
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm8
 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm7, %zmm8
 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9
 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm8, %zmm9
 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21]
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm10
 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm9, %zmm10
 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm11
 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm10, %zmm11
 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27]
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12
 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm11, %zmm12
 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm13
 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm12, %zmm13
 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25]
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm14
 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm13, %zmm14
 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u>
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u]
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31]
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm16
 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm15, %zmm16
 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29]
 ; AVX512BW-NEXT: vpermt2d %zmm30, %zmm31, %zmm4
 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
@@ -3618,10 +3618,10 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vpermt2d %zmm28, %zmm31, %zmm0
 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = <u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm31
 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm28, %zmm31
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm27, %zmm0, %zmm1
 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm27
@@ -3642,33 +3642,33 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm0, %zmm27
 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm16
 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm20
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm20
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm21
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm21
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm22
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm22
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm23
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm23
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm24
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm24
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = <u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm29 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm25
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm29, %zmm25
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = <u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm30 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm26
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm30, %zmm26
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm16
 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5
 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0
@@ -3687,28 +3687,28 @@ define void @store_i32_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm3
 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm18
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm12, %zmm18
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm17
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm17
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm15
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm8, %zmm15
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm14
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm14
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm13
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm6, %zmm13
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm11
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm11
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm3, %zmm9
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm2, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm19, %zmm3
 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm1
 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0
@@ -6769,41 +6769,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 (%rax), %zmm1
 ; AVX512F-NEXT: vmovdqa64 64(%rax), %zmm0
 ; AVX512F-NEXT: vmovdqa64 128(%rax), %zmm30
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17]
 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm7
 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm6
 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm3, %zmm7
 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19]
 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7
 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm5, %zmm6
 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21]
 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8
 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm6, %zmm8
 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23]
 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm9
 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm9
 ; AVX512F-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25]
 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm10
 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm10
 ; AVX512F-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm10
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27]
 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm11
 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm11
 ; AVX512F-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm11
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29]
 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm13
 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm12
 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm13
 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31]
 ; AVX512F-NEXT: vpermt2d %zmm1, %zmm8, %zmm12
 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm1
@@ -6872,42 +6872,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm26
 ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm2
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm3
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3
 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm4
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4
 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm5
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm6
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6
 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm7
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm7
 ; AVX512F-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm8
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm8
 ; AVX512F-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm26
 ; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm25
 ; AVX512F-NEXT: vmovdqa64 64(%r9), %zmm0
@@ -6976,42 +6976,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm27
 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm8
 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm2
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm3
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm3
 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm4
 ; AVX512F-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm5
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm5
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6
 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6
 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm9
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm6
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm6
 ; AVX512F-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm8
 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm6
 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0
@@ -7080,35 +7080,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm17
 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm5
 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm14, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm11, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm10, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm9, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm7, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm4, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm3, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u]
 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm31, %zmm5
 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm12
 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm0
@@ -7423,41 +7423,41 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 (%rax), %zmm1
 ; AVX512BW-NEXT: vmovdqa64 64(%rax), %zmm0
 ; AVX512BW-NEXT: vmovdqa64 128(%rax), %zmm30
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,u,u,u,u,0,16,u,u,u,u,u,u,1,17]
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm7
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm6
 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm3, %zmm7
 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,2,18,u,u,u,u,u,u,3,19]
 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7
 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm5, %zmm6
 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,u,u,u,u,4,20,u,u,u,u,u,u,5,21]
 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8
 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm6, %zmm8
 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,6,22,u,u,u,u,u,u,7,23]
 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm9
 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm9
 ; AVX512BW-NEXT: vmovdqu64 %zmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,8,24,u,u,u,u,u,u,9,25]
 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm10
 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm10
 ; AVX512BW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm10
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,10,26,u,u,u,u,u,u,11,27]
 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm11
 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm11
 ; AVX512BW-NEXT: vmovdqu64 %zmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm11
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,12,28,u,u,u,u,u,u,13,29]
 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm13
 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm12
 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm13
 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,u,u,u,u,14,30,u,u,u,u,u,u,15,31]
 ; AVX512BW-NEXT: vpermt2d %zmm1, %zmm8, %zmm12
 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm1
@@ -7526,42 +7526,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm26
 ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,0,16,u,u,u,u,u,u,1,17,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm2
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,2,18,u,u,u,u,u,u,3,19,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm3
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3
 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,4,20,u,u,u,u,u,u,5,21,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm4
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4
 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,6,22,u,u,u,u,u,u,7,23,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm5
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,8,24,u,u,u,u,u,u,9,25,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm6
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6
 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,10,26,u,u,u,u,u,u,11,27,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm7
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm7
 ; AVX512BW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,12,28,u,u,u,u,u,u,13,29,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm8
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8
 ; AVX512BW-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,u,u,14,30,u,u,u,u,u,u,15,31,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm26
 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm25
 ; AVX512BW-NEXT: vmovdqa64 64(%r9), %zmm0
@@ -7630,42 +7630,42 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm27
 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm8
 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,0,16,u,u,u,u,u,u,1,17,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm2
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm2
 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm2
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,2,18,u,u,u,u,u,u,3,19,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm3
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm3
 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,4,20,u,u,u,u,u,u,5,21,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm4
 ; AVX512BW-NEXT: vmovdqu64 %zmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,6,22,u,u,u,u,u,u,7,23,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm5
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm5
 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm5
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,8,24,u,u,u,u,u,u,9,25,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6
 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,10,26,u,u,u,u,u,u,11,27,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6
 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,12,28,u,u,u,u,u,u,13,29,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm6
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm6
 ; AVX512BW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm10
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,u,14,30,u,u,u,u,u,u,15,31,u,u,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm8
 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm6
 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0
@@ -7734,35 +7734,35 @@ define void @store_i32_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm1, %zmm17
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm5
 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,16,u,u,u,u,u,u,1,17,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm14, %zmm1
 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [2,18,u,u,u,u,u,u,3,19,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm11, %zmm1
 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [4,20,u,u,u,u,u,u,5,21,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm10, %zmm1
 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [6,22,u,u,u,u,u,u,7,23,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm9, %zmm1
 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [8,24,u,u,u,u,u,u,9,25,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm7, %zmm1
 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [10,26,u,u,u,u,u,u,11,27,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm4, %zmm1
 ; AVX512BW-NEXT: vmovdqu64 %zmm1, (%rsp) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [12,28,u,u,u,u,u,u,13,29,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm1
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm3, %zmm1
 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [14,30,u,u,u,u,u,u,15,31,u,u,u,u,u,u]
 ; AVX512BW-NEXT: vpermt2d %zmm0, %zmm31, %zmm5
 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm12
 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
index 3ffcf6d6d6f5a..5176db91e914e 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-3.ll
@@ -64,7 +64,7 @@ define void @store_i64_stride3_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vmovaps (%rdi), %xmm0
 ; AVX512-NEXT: vinsertf128 $1, (%rsi), %ymm0, %ymm0
 ; AVX512-NEXT: vinsertf32x4 $2, (%rdx), %zmm0, %zmm0
-; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = <0,2,4,1,3,5,u,u>
+; AVX512-NEXT: vmovaps {{.*#+}} zmm1 = [0,2,4,1,3,5,u,u]
 ; AVX512-NEXT: vpermpd %zmm0, %zmm1, %zmm0
 ; AVX512-NEXT: vextractf32x4 $2, %zmm0, 32(%rcx)
 ; AVX512-NEXT: vmovaps %ymm0, (%rcx)
@@ -308,15 +308,15 @@ define void @store_i64_stride3_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vmovdqa64 (%rdi), %zmm0
 ; AVX512-NEXT: vmovdqa64 (%rsi), %zmm1
 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm2
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,8,u,1,9,u,2,10>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,8,u,1,9,u,2,10]
 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,8,3,4,9,6,7]
 ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,3,11,u,4,12,u,5>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,3,11,u,4,12,u,5]
 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [10,1,2,11,4,5,12,7]
 ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <5,u,14,6,u,15,7,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [5,u,14,6,u,15,7,u]
 ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,13,2,3,14,5,6,15]
 ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm0
@@ -605,17 +605,17 @@ define void @store_i64_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vmovdqa64 64(%rsi), %zmm3
 ; AVX512-NEXT: vmovdqa64 (%rdx), %zmm4
 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm5
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,8,u,1,9,u,2,10>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,8,u,1,9,u,2,10]
 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm7
 ; AVX512-NEXT: vpermt2q %zmm2, %zmm6, %zmm7
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,8,3,4,9,6,7]
 ; AVX512-NEXT: vpermt2q %zmm4, %zmm8, %zmm7
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = <5,u,14,6,u,15,7,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm9 = [5,u,14,6,u,15,7,u]
 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm10
 ; AVX512-NEXT: vpermt2q %zmm1, %zmm9, %zmm10
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,13,2,3,14,5,6,15]
 ; AVX512-NEXT: vpermt2q %zmm5, %zmm11, %zmm10
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,3,11,u,4,12,u,5>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,3,11,u,4,12,u,5]
 ; AVX512-NEXT: vmovdqa64 %zmm1, %zmm13
 ; AVX512-NEXT: vpermt2q %zmm3, %zmm12, %zmm13
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [10,1,2,11,4,5,12,7]
@@ -1246,17 +1246,17 @@ define void @store_i64_stride3_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm9
 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm10
 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm11
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = <0,8,u,1,9,u,2,10>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm12 = [0,8,u,1,9,u,2,10]
 ; AVX512-NEXT: vmovdqa64 %zmm3, %zmm13
 ; AVX512-NEXT: vpermt2q %zmm4, %zmm12, %zmm13
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,1,8,3,4,9,6,7]
 ; AVX512-NEXT: vpermt2q %zmm8, %zmm14, %zmm13
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = <5,u,14,6,u,15,7,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm15 = [5,u,14,6,u,15,7,u]
 ; AVX512-NEXT: vmovdqa64 %zmm7, %zmm16
 ; AVX512-NEXT: vpermt2q %zmm0, %zmm15, %zmm16
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,13,2,3,14,5,6,15]
 ; AVX512-NEXT: vpermt2q %zmm11, %zmm17, %zmm16
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = <u,3,11,u,4,12,u,5>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,3,11,u,4,12,u,5]
 ; AVX512-NEXT: vmovdqa64 %zmm0, %zmm19
 ; AVX512-NEXT: vpermt2q %zmm7, %zmm18, %zmm19
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,1,2,11,4,5,12,7]
@@ -2589,17 +2589,17 @@ define void @store_i64_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512-NEXT: vmovdqa64 64(%rdx), %zmm25
 ; AVX512-NEXT: vmovdqa64 128(%rdx), %zmm29
 ; AVX512-NEXT: vmovdqa64 192(%rdx), %zmm31
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,1,9,u,2,10>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,1,9,u,2,10]
 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm3
 ; AVX512-NEXT: vpermt2q %zmm0, %zmm14, %zmm3
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,1,8,3,4,9,6,7]
 ; AVX512-NEXT: vpermt2q %zmm15, %zmm19, %zmm3
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = <u,3,11,u,4,12,u,5>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm17 = [u,3,11,u,4,12,u,5]
 ; AVX512-NEXT: vmovdqa64 %zmm13, %zmm10
 ; AVX512-NEXT: vpermt2q %zmm0, %zmm17, %zmm10
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm21 = [10,1,2,11,4,5,12,7]
 ; AVX512-NEXT: vpermt2q %zmm15, %zmm21, %zmm10
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = <5,u,14,6,u,15,7,u>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm22 = [5,u,14,6,u,15,7,u]
 ; AVX512-NEXT: vpermt2q %zmm13, %zmm22, %zmm0
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,13,2,3,14,5,6,15]
 ; AVX512-NEXT: vpermt2q %zmm15, %zmm23, %zmm0
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
index a2e7633f69554..18392469d52e3 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-4.ll
@@ -376,26 +376,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm1
 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm2
 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm3
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,0,8,u,u,1,9>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9]
 ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,8,u,u,1,9,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u]
 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
 ; AVX512F-NEXT: movb $-52, %al
 ; AVX512F-NEXT: kmovw %eax, %k1
 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,2,10,u,u,3,11>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11]
 ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <2,10,u,u,3,11,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u]
 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,4,12,u,u,5,13>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13]
 ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,12,u,u,5,13,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u]
 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,6,14,u,u,7,15>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15]
 ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <6,14,u,u,7,15,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u]
 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
 ; AVX512F-NEXT: vmovdqa64 %zmm2, 192(%r8)
@@ -411,26 +411,26 @@ define void @store_i64_stride4_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm1
 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm2
 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,0,8,u,u,1,9>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,0,8,u,u,1,9]
 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,8,u,u,1,9,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,8,u,u,1,9,u,u]
 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm5
 ; AVX512BW-NEXT: movb $-52, %al
 ; AVX512BW-NEXT: kmovd %eax, %k1
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm5 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,2,10,u,u,3,11>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,2,10,u,u,3,11]
 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <2,10,u,u,3,11,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [2,10,u,u,3,11,u,u]
 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm6
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm6 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,4,12,u,u,5,13>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,4,12,u,u,5,13]
 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <4,12,u,u,5,13,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [4,12,u,u,5,13,u,u]
 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm7
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <u,u,6,14,u,u,7,15>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,u,6,14,u,u,7,15]
 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <6,14,u,u,7,15,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,7,15,u,u]
 ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm2 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 %zmm2, 192(%r8)
@@ -830,32 +830,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm5
 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm6
 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm7
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,2,10,u,u,3,11>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11]
 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm9
 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm9
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,10,u,u,3,11,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm11
 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm10, %zmm11
 ; AVX512F-NEXT: movb $-52, %al
 ; AVX512F-NEXT: kmovw %eax, %k1
 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,0,8,u,u,1,9>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9]
 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm12
 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm9, %zmm12
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,8,u,u,1,9,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm14
 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm13, %zmm14
 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,u,6,14,u,u,7,15>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15]
 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm15
 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm12, %zmm15
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,7,15,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm17
 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm17
 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,4,12,u,u,5,13>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13]
 ; AVX512F-NEXT: vpermt2q %zmm6, %zmm15, %zmm4
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,12,u,u,5,13,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u]
 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
 ; AVX512F-NEXT: vpermi2q %zmm7, %zmm5, %zmm8
@@ -891,32 +891,32 @@ define void @store_i64_stride4_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm5
 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm6
 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,2,10,u,u,3,11>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,3,11]
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm9
 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <2,10,u,u,3,11,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [2,10,u,u,3,11,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm11
 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm10, %zmm11
 ; AVX512BW-NEXT: movb $-52, %al
 ; AVX512BW-NEXT: kmovd %eax, %k1
 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm11 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,0,8,u,u,1,9>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9]
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm12
 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm9, %zmm12
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <0,8,u,u,1,9,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [0,8,u,u,1,9,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm14
 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm13, %zmm14
 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm14 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <u,u,6,14,u,u,7,15>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [u,u,6,14,u,u,7,15]
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm15
 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm12, %zmm15
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,7,15,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,7,15,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm17
 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm17
 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm17 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,4,12,u,u,5,13>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13]
 ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm15, %zmm4
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <4,12,u,u,5,13,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [4,12,u,u,5,13,u,u]
 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm6, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm0 {%k1}
 ; AVX512BW-NEXT: vpermi2q %zmm7, %zmm5, %zmm8
@@ -1772,32 +1772,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm26
 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm19
 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm9
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <u,u,2,10,u,u,3,11>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11]
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm8
 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm14, %zmm8
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,3,11,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm4
 ; AVX512F-NEXT: vpermt2q %zmm17, %zmm7, %zmm4
 ; AVX512F-NEXT: movb $-52, %al
 ; AVX512F-NEXT: kmovw %eax, %k1
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <u,u,0,8,u,u,1,9>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9]
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm10
 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm16, %zmm10
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,8,u,u,1,9,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm8
 ; AVX512F-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = <u,u,6,14,u,u,7,15>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15]
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm20
 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm18, %zmm20
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,7,15,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm10
 ; AVX512F-NEXT: vpermt2q %zmm17, %zmm15, %zmm10
 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,u,4,12,u,u,5,13>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13]
 ; AVX512F-NEXT: vpermt2q %zmm21, %zmm20, %zmm22
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <4,12,u,u,5,13,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u]
 ; AVX512F-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
 ; AVX512F-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
 ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22
@@ -1885,32 +1885,32 @@ define void @store_i64_stride4_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm26
 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm19
 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <u,u,2,10,u,u,3,11>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,u,2,10,u,u,3,11]
 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm8
 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm14, %zmm8
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <2,10,u,u,3,11,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [2,10,u,u,3,11,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm4
 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm7, %zmm4
 ; AVX512BW-NEXT: movb $-52, %al
 ; AVX512BW-NEXT: kmovd %eax, %k1
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm4 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <u,u,0,8,u,u,1,9>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,u,0,8,u,u,1,9]
 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm10
 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm16, %zmm10
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,8,u,u,1,9,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,8,u,u,1,9,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm8
 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm11, %zmm8
 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm8 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = <u,u,6,14,u,u,7,15>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,u,6,14,u,u,7,15]
 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm20
 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm18, %zmm20
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <6,14,u,u,7,15,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [6,14,u,u,7,15,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm10
 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm15, %zmm10
 ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm10 {%k1}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <u,u,4,12,u,u,5,13>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [u,u,4,12,u,u,5,13]
 ; AVX512BW-NEXT: vpermt2q %zmm21, %zmm20, %zmm22
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <4,12,u,u,5,13,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [4,12,u,u,5,13,u,u]
 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm21, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm22, %zmm0 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22
@@ -3673,19 +3673,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm5
 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm2
 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,0,8,u,u,1,9>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9]
 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm13
 ; AVX512F-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
 ; AVX512F-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = <u,u,2,10,u,u,3,11>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11]
 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm15
 ; AVX512F-NEXT: vpermt2q %zmm4, %zmm13, %zmm15
 ; AVX512F-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,4,12,u,u,5,13>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13]
 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm16
 ; AVX512F-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
 ; AVX512F-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = <u,u,6,14,u,u,7,15>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15]
 ; AVX512F-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm4
@@ -3765,19 +3765,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm26, %zmm28
 ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm26
 ; AVX512F-NEXT: vmovdqa64 (%rsi), %zmm3
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0
 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm30
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,10,u,u,3,11,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0
 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <4,12,u,u,5,13,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm0
 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <6,14,u,u,7,15,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u]
 ; AVX512F-NEXT: vpermt2q %zmm3, %zmm22, %zmm26
 ; AVX512F-NEXT: vmovdqa64 64(%rdi), %zmm20
 ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm3
@@ -3968,19 +3968,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm5
 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm2
 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm1
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,0,8,u,u,1,9>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,0,8,u,u,1,9]
 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm13
 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm9, %zmm13
 ; AVX512BW-NEXT: vmovdqu64 %zmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = <u,u,2,10,u,u,3,11>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,3,11]
 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm15
 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm13, %zmm15
 ; AVX512BW-NEXT: vmovdqu64 %zmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,4,12,u,u,5,13>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,4,12,u,u,5,13]
 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm16
 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm15, %zmm16
 ; AVX512BW-NEXT: vmovdqu64 %zmm16, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <u,u,6,14,u,u,7,15>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,6,14,u,u,7,15]
 ; AVX512BW-NEXT: vpermt2q %zmm4, %zmm26, %zmm14
 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm4
@@ -4060,19 +4060,19 @@ define void @store_i64_stride4_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm26, %zmm28
 ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm26
 ; AVX512BW-NEXT: vmovdqa64 (%rsi), %zmm3
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,1,9,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,1,9,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm7, %zmm0
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm30
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <2,10,u,u,3,11,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [2,10,u,u,3,11,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm5, %zmm0
 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <4,12,u,u,5,13,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [4,12,u,u,5,13,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm0
 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm2, %zmm0
 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <6,14,u,u,7,15,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [6,14,u,u,7,15,u,u]
 ; AVX512BW-NEXT: vpermt2q %zmm3, %zmm22, %zmm26
 ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %zmm20
 ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm3
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
index e52fd4013bf46..4951d3fd8d2e4 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-5.ll
@@ -220,13 +220,13 @@ define void @store_i64_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512-NEXT: vmovdqa (%r8), %ymm2
 ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0
 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1
-; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = <15,3,7,u>
+; AVX512-NEXT: vmovdqa {{.*#+}} ymm3 = [15,3,7,u]
 ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,4,8,12,u,1,5,9>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,4,8,12,u,1,5,9]
 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,2,3,8,5,6,7]
 ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm5
-; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,2,6,10,14,u,3>
+; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,6,10,14,u,3]
 ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm4
 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,9,2,3,4,5,10,7]
 ; AVX512-NEXT: vpermi2q %zmm2, %zmm4, %zmm0
@@ -470,9 +470,9 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm4
 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm5
 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm0
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,u,u,u,12,4,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u]
 ; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm6
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,3,11,u,u,u,4,12>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12]
 ; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
 ; AVX512F-NEXT: movb $49, %al
 ; AVX512F-NEXT: kmovw %eax, %k1
@@ -480,18 +480,18 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT: movb $8, %al
 ; AVX512F-NEXT: kmovw %eax, %k2
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,0,8,u,u,u,1>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1]
 ; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm6
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,u,1,9,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u]
 ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm7
 ; AVX512F-NEXT: movb $-116, %al
 ; AVX512F-NEXT: kmovw %eax, %k2
 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2}
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7]
 ; AVX512F-NEXT: vpermi2q %zmm0, %zmm7, %zmm6
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <1,u,u,u,10,2,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u]
 ; AVX512F-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,2,10,u,u,u,3>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3]
 ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7]
@@ -499,7 +499,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5]
 ; AVX512F-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm8
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,5,13,u,u,u,6,14>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14]
 ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm9
 ; AVX512F-NEXT: movb $24, %al
 ; AVX512F-NEXT: kmovw %eax, %k1
@@ -509,7 +509,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7]
 ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm9
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <6,14,u,u,u,7,15,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,u,7,15,u]
 ; AVX512F-NEXT: vpermi2q %zmm5, %zmm4, %zmm2
 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1}
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15]
@@ -529,9 +529,9 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm4
 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm5
 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm0
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,u,u,u,12,4,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,u,u,u,12,4,u,u]
 ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <u,3,11,u,u,u,4,12>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,3,11,u,u,u,4,12]
 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm1
 ; AVX512BW-NEXT: movb $49, %al
 ; AVX512BW-NEXT: kmovd %eax, %k1
@@ -539,18 +539,18 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: movb $8, %al
 ; AVX512BW-NEXT: kmovd %eax, %k2
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm1 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <u,u,0,8,u,u,u,1>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,u,0,8,u,u,u,1]
 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,8,u,u,u,1,9,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,8,u,u,u,1,9,u]
 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm7
 ; AVX512BW-NEXT: movb $-116, %al
 ; AVX512BW-NEXT: kmovd %eax, %k2
 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm7 {%k2}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,1,2,3,8,5,6,7]
 ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm7, %zmm6
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <1,u,u,u,10,2,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [1,u,u,u,10,2,u,u]
 ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm5, %zmm7
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <u,u,2,10,u,u,u,3>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,u,2,10,u,u,u,3]
 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,9,2,3,4,5,10,7]
@@ -558,7 +558,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm8 = [13,5,13,5,13,5,13,5]
 ; AVX512BW-NEXT: # zmm8 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm8
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,5,13,u,u,u,6,14>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,13,u,u,u,6,14]
 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9
 ; AVX512BW-NEXT: movb $24, %al
 ; AVX512BW-NEXT: kmovd %eax, %k1
@@ -568,7 +568,7 @@ define void @store_i64_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm9 = [15,7,15,7,15,7,15,7]
 ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm9
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <6,14,u,u,u,7,15,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [6,14,u,u,u,7,15,u]
 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm4, %zmm2
 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm2 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,14,3,4,5,6,15]
@@ -1107,10 +1107,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm12
 ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm4
 ; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm11
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <3,u,u,u,12,4,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm9, %zmm13
 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm8, %zmm13
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,3,11,u,u,u,4,12>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12]
 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm3
 ; AVX512F-NEXT: vpermt2q %zmm12, %zmm7, %zmm3
 ; AVX512F-NEXT: movb $49, %al
@@ -1123,10 +1123,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermi2q %zmm5, %zmm6, %zmm7
 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
 ; AVX512F-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2}
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,0,8,u,u,u,1>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1]
 ; AVX512F-NEXT: vmovdqa64 %zmm6, %zmm13
 ; AVX512F-NEXT: vpermt2q %zmm5, %zmm15, %zmm13
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,8,u,u,u,1,9,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm8
 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm16, %zmm8
 ; AVX512F-NEXT: movb $-116, %al
@@ -1138,7 +1138,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm19
 ; AVX512F-NEXT: vpermt2q %zmm9, %zmm18, %zmm19
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <6,14,u,u,u,7,15,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm13
 ; AVX512F-NEXT: vpermt2q %zmm12, %zmm14, %zmm13
 ; AVX512F-NEXT: movb $24, %al
@@ -1150,16 +1150,16 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT: vmovdqa64 %zmm10, %zmm21
 ; AVX512F-NEXT: vpermt2q %zmm12, %zmm20, %zmm21
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = <u,5,13,u,u,u,6,14>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm23
 ; AVX512F-NEXT: vpermt2q %zmm9, %zmm22, %zmm23
 ; AVX512F-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2}
 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7]
 ; AVX512F-NEXT: vpermt2q %zmm11, %zmm21, %zmm23
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = <1,u,u,u,10,2,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm12, %zmm25
 ; AVX512F-NEXT: vpermt2q %zmm10, %zmm24, %zmm25
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = <u,u,2,10,u,u,u,3>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3]
 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27
 ; AVX512F-NEXT: vpermt2q %zmm9, %zmm26, %zmm27
 ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1}
@@ -1206,10 +1206,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm12
 ; AVX512BW-NEXT: vmovdqa64 (%r8), %zmm4
 ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm11
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <3,u,u,u,12,4,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [3,u,u,u,12,4,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm9, %zmm13
 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm8, %zmm13
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,3,11,u,u,u,4,12>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,3,11,u,u,u,4,12]
 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm3
 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm7, %zmm3
 ; AVX512BW-NEXT: movb $49, %al
@@ -1222,10 +1222,10 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermi2q %zmm5, %zmm6, %zmm7
 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm7 {%k1}
 ; AVX512BW-NEXT: vmovdqa64 %zmm4, %zmm7 {%k2}
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = <u,u,0,8,u,u,u,1>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,0,8,u,u,u,1]
 ; AVX512BW-NEXT: vmovdqa64 %zmm6, %zmm13
 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm15, %zmm13
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <0,8,u,u,u,1,9,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [0,8,u,u,u,1,9,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm8
 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm16, %zmm8
 ; AVX512BW-NEXT: movb $-116, %al
@@ -1237,7 +1237,7 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: # zmm18 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm19
 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm18, %zmm19
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <6,14,u,u,u,7,15,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [6,14,u,u,u,7,15,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm13
 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm14, %zmm13
 ; AVX512BW-NEXT: movb $24, %al
@@ -1249,16 +1249,16 @@ define void @store_i64_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: # zmm20 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512BW-NEXT: vmovdqa64 %zmm10, %zmm21
 ; AVX512BW-NEXT: vpermt2q %zmm12, %zmm20, %zmm21
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <u,5,13,u,u,u,6,14>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,5,13,u,u,u,6,14]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm23
 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm22, %zmm23
 ; AVX512BW-NEXT: vmovdqa64 %zmm21, %zmm23 {%k2}
 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [12,1,2,3,4,13,6,7]
 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm21, %zmm23
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <1,u,u,u,10,2,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [1,u,u,u,10,2,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm12, %zmm25
 ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm24, %zmm25
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <u,u,2,10,u,u,u,3>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [u,u,2,10,u,u,u,3]
 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27
 ; AVX512BW-NEXT: vpermt2q %zmm9, %zmm26, %zmm27
 ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm27 {%k1}
@@ -2445,11 +2445,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm23
 ; AVX512F-NEXT: vmovdqa64 128(%rcx), %zmm25
 ; AVX512F-NEXT: vmovdqa64 192(%rcx), %zmm18
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm16, %zmm0
 ; AVX512F-NEXT: vpermt2q %zmm20, %zmm11, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <u,3,11,u,u,u,4,12>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12]
 ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0
 ; AVX512F-NEXT: vpermt2q %zmm18, %zmm19, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -2460,17 +2460,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermt2q %zmm23, %zmm19, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm19
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = <u,u,0,8,u,u,u,1>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1]
 ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm0
 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm28, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u]
 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7]
 ; AVX512F-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3]
 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm0
 ; AVX512F-NEXT: vpermt2q %zmm16, %zmm29, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm26
 ; AVX512F-NEXT: vpermt2q %zmm18, %zmm12, %zmm26
 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5]
@@ -2478,7 +2478,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 %zmm27, %zmm0
 ; AVX512F-NEXT: vpermt2q %zmm18, %zmm21, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = <1,u,u,u,10,2,u,u>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u]
 ; AVX512F-NEXT: vmovdqa64 %zmm18, %zmm0
 ; AVX512F-NEXT: vpermt2q %zmm27, %zmm31, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -2497,7 +2497,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vmovdqa64 %zmm23, %zmm30
 ; AVX512F-NEXT: vpermt2q %zmm24, %zmm31, %zmm30
 ; AVX512F-NEXT: vpermt2q %zmm23, %zmm28, %zmm24
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = <u,5,13,u,u,u,6,14>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14]
 ; AVX512F-NEXT: vpermt2q %zmm23, %zmm12, %zmm25
 ; AVX512F-NEXT: vpermt2q %zmm23, %zmm21, %zmm0
 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -2507,7 +2507,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-NEXT: vpermi2q %zmm1, %zmm17, %zmm21
 ; AVX512F-NEXT: vpermt2q %zmm17, %zmm31, %zmm1
 ; AVX512F-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,2,10,u,u,u,3>
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3]
 ; AVX512F-NEXT: vmovdqa64 %zmm20, %zmm17
 ; AVX512F-NEXT: vpermt2q %zmm16, %zmm10, %zmm17
 ; AVX512F-NEXT: vpermt2q %zmm16, %zmm14, %zmm20
@@ -2647,11 +2647,11 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm23
 ; AVX512BW-NEXT: vmovdqa64 128(%rcx), %zmm25
 ; AVX512BW-NEXT: vmovdqa64 192(%rcx), %zmm18
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = <3,u,u,u,12,4,u,u>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm11 = [3,u,u,u,12,4,u,u]
 ; AVX512BW-NEXT: vmovdqa64 %zmm16, %zmm0
 ; AVX512BW-NEXT: vpermt2q %zmm20, %zmm11, %zmm0
 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
-; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <u,3,11,u,u,u,4,12>
+; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [u,3,11,u,u,u,4,12]
 ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0
 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm19, %zmm0
 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
@@ -2662,17 +2662,17 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm19, %zmm0
 ; AVX512BW-NEXT: vmovdqu64 %zmm0,
{{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm19 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,u,0,8,u,u,u,1] ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm28, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [0,8,u,u,u,1,9,u] ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm29 = [15,7,15,7,15,7,15,7] ; AVX512BW-NEXT: # zmm29 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm29, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [6,14,u,u,u,7,15,u] ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm26 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm12, %zmm26 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm21 = [13,5,13,5,13,5,13,5] @@ -2680,7 +2680,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm27, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm18, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm31 = [1,u,u,u,10,2,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm18, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm27, %zmm31, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -2699,7 +2699,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm23, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm24, %zmm31, %zmm30 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm28, %zmm24 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm28 = [u,5,13,u,u,u,6,14] ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm12, %zmm25 ; AVX512BW-NEXT: vpermt2q %zmm23, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -2709,7 +2709,7 @@ define void @store_i64_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm17, %zmm21 ; AVX512BW-NEXT: vpermt2q %zmm17, %zmm31, %zmm1 ; AVX512BW-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [u,u,2,10,u,u,u,3] ; AVX512BW-NEXT: vmovdqa64 %zmm20, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm10, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm16, %zmm14, %zmm20 @@ -5252,7 +5252,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512F-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512F-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <3,u,u,u,12,4,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512F-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -5268,15 +5268,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512F-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512F-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,8,u,u,u,1,9,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} 
zmm21 = [0,8,u,u,u,1,9,u] ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] ; AVX512F-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512F-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512F-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5371,20 +5371,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512F-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = <1,u,u,u,10,2,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512F-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512F-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512F-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,u,7,15,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] ; AVX512F-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512F-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512F-NEXT: vmovdqa64 64(%rcx), %zmm0 @@ -5715,7 +5715,7 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rsi), %zmm4 ; AVX512BW-NEXT: vmovdqa64 128(%rsi), %zmm3 ; AVX512BW-NEXT: vmovdqa64 192(%rsi), %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <3,u,u,u,12,4,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [3,u,u,u,12,4,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, (%rsp) # 64-byte Spill @@ -5731,15 +5731,15 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm20, %zmm12 ; AVX512BW-NEXT: vmovdqu64 %zmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,8,u,u,u,1,9,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,8,u,u,u,1,9,u] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm21, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm13 = [u,u,2,10,u,u,u,3] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; AVX512BW-NEXT: vpermt2q %zmm5, %zmm13, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm14 = [u,5,13,u,u,u,6,14] ; AVX512BW-NEXT: vmovdqa64 %zmm11, %zmm0 ; 
AVX512BW-NEXT: vpermt2q %zmm5, %zmm14, %zmm0 ; AVX512BW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -5834,20 +5834,20 @@ define void @store_i64_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 (%rdx), %zmm25 ; AVX512BW-NEXT: vmovdqa64 (%rcx), %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,3,11,u,u,u,4,12] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm3 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm6, %zmm3 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,u,0,8,u,u,u,1] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm2, %zmm22 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = <1,u,u,u,10,2,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm12 = [1,u,u,u,10,2,u,u] ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 ; AVX512BW-NEXT: vpermt2q %zmm25, %zmm12, %zmm27 ; AVX512BW-NEXT: vbroadcasti32x4 {{.*#+}} zmm10 = [13,5,13,5,13,5,13,5] ; AVX512BW-NEXT: # zmm10 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vmovdqa64 %zmm25, %zmm28 ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm10, %zmm28 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = <6,14,u,u,u,7,15,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [6,14,u,u,u,7,15,u] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm16, %zmm25 ; AVX512BW-NEXT: vmovdqa64 64(%rdx), %zmm26 ; AVX512BW-NEXT: vmovdqa64 64(%rcx), %zmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll index ab51f0bf9135c..37f197b1895b8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-6.ll @@ -271,11 +271,11 @@ define void @store_i64_stride6_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti64x4 $1, (%rsi), %zmm0, %zmm0 ; AVX512-NEXT: vinserti64x4 $1, (%rcx), %zmm1, %zmm1 ; AVX512-NEXT: vinserti64x4 $1, (%r9), %zmm2, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,4,8,12,u,u,1,5> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,4,8,12,u,u,1,5] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,8,12,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm4 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <1,5,u,u,10,14,2,6> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [1,5,u,u,10,14,2,6] ; AVX512-NEXT: vpermi2q %zmm0, %zmm1, %zmm3 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,13,4,5,6,7] ; AVX512-NEXT: vpermi2q %zmm2, %zmm3, %zmm5 @@ -626,7 +626,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: movb $48, %r9b ; AVX512F-NEXT: kmovw %r9d, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,1,9,u,4,5,6,7> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512F-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 @@ -637,7 +637,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,13,u,4,5,6,7> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] 
; AVX512F-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 @@ -649,7 +649,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 ; AVX512F-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512F-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 @@ -666,7 +666,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512F-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 @@ -711,7 +711,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm7, %zmm8 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,1,9,u,4,5,6,7> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,9,u,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm8, %zmm9 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,1,2,9,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm9, %zmm7 @@ -722,7 +722,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: # zmm9 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm6, %zmm5, %zmm9 ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm9 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,13,u,4,5,6,7> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,13,u,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm9, %zmm8 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,1,2,13,4,5,6,7] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm8, %zmm9 @@ -734,7 +734,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm8, %zmm5 ; AVX512BW-NEXT: vpermi2q %zmm3, %zmm2, %zmm8 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,u,2,3,4,5,15,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm5, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,14,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm5 @@ -751,7 +751,7 @@ define void @store_i64_stride6_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm2 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm2 = ymm2[1],mem[1],ymm2[3],mem[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm2, %zmm10, %zmm2 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [10,u,2,3,4,5,11,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm2, %zmm6 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,10,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm6, %zmm1 @@ -1478,7 +1478,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512F-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = 
<14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] ; AVX512F-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] ; AVX512F-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 @@ -1493,7 +1493,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: movb $48, %r9b ; AVX512F-NEXT: kmovw %r9d, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,13,u,4,5,6,7> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] ; AVX512F-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] ; AVX512F-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 @@ -1506,7 +1506,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512F-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512F-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,9,u,4,5,6,7> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] ; AVX512F-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] ; AVX512F-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 @@ -1552,7 +1552,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 64(%rdi), %ymm20 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512F-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] ; AVX512F-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 ; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] ; AVX512F-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 @@ -1617,7 +1617,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm8, %zmm13 ; AVX512BW-NEXT: vpermt2q %zmm11, %zmm19, %zmm13 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm9 = zmm13[0,1,2,3],zmm9[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [14,u,2,3,4,5,15,u] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm9 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,14,2,3,4,5,6,15] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm21, %zmm9 @@ -1632,7 +1632,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $48, %r9b ; AVX512BW-NEXT: kmovd %r9d, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm17, %zmm13 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = <0,1,13,u,4,5,6,7> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm23 = [0,1,13,u,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm23, %zmm13 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [0,1,2,13,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm24, %zmm13 @@ -1645,7 +1645,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 %zmm5, %zmm17 ; AVX512BW-NEXT: vpermt2q %zmm7, %zmm18, %zmm17 ; AVX512BW-NEXT: vmovdqa64 %zmm26, %zmm17 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = <0,1,9,u,4,5,6,7> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm26 = [0,1,9,u,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm26, %zmm17 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm27 = [0,1,2,9,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm27, %zmm17 @@ -1691,7 +1691,7 @@ define void @store_i64_stride6_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 64(%rdi), %ymm20 ; 
AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm20 = ymm20[1],mem[1],ymm20[3],mem[3] ; AVX512BW-NEXT: vinserti64x4 $0, %ymm20, %zmm5, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm20 = [10,u,2,3,4,5,11,u] ; AVX512BW-NEXT: vpermt2q %zmm6, %zmm20, %zmm5 ; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,10,2,3,4,5,6,11] ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm6, %zmm5 @@ -3294,7 +3294,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] ; AVX512F-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512F-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,9,u,4,5,6,7> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] ; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 @@ -3308,7 +3308,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512F-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,13,u,4,5,6,7> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] ; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 @@ -3319,7 +3319,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512F-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 ; AVX512F-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] ; AVX512F-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512F-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 ; AVX512F-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 @@ -3363,7 +3363,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: movb $16, %al ; AVX512F-NEXT: kmovw %eax, %k1 ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] ; AVX512F-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512F-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512F-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -3572,7 +3572,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm28[0,1,2,3],zmm17[4,5,6,7] ; AVX512BW-NEXT: vmovdqa64 (%r9), %zmm8 ; AVX512BW-NEXT: vmovdqa64 %zmm14, %zmm25 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,9,u,4,5,6,7> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,9,u,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm20 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm18 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm13 @@ -3586,7 +3586,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqa64 192(%r9), %zmm14 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm25 ; AVX512BW-NEXT: vmovdqa64 %zmm15, %zmm29 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,13,u,4,5,6,7> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,13,u,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm23 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm31 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm4 @@ -3597,7 +3597,7 @@ define void 
@store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm30, %zmm5, %zmm4 ; AVX512BW-NEXT: vpermt2q %zmm14, %zmm5, %zmm29 ; AVX512BW-NEXT: vshufi64x2 {{.*#+}} zmm15 = zmm16[0,1,2,3],zmm10[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [14,u,2,3,4,5,15,u] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm5, %zmm12 ; AVX512BW-NEXT: vpermt2q %zmm1, %zmm5, %zmm22 ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm5, %zmm3 @@ -3641,7 +3641,7 @@ define void @store_i64_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: movb $16, %al ; AVX512BW-NEXT: kmovd %eax, %k1 ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k1} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [10,u,2,3,4,5,11,u] ; AVX512BW-NEXT: vpermt2q %zmm0, %zmm19, %zmm6 ; AVX512BW-NEXT: vmovdqa 64(%rdi), %ymm0 ; AVX512BW-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],mem[1],ymm0[3],mem[3] @@ -7013,7 +7013,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512F-NEXT: vmovdqa64 (%r8), %zmm29 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,9,u,4,5,6,7> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] ; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512F-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa64 64(%r8), %zmm2 @@ -7038,7 +7038,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,13,u,4,5,6,7> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] ; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512F-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 @@ -7056,7 +7056,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512F-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512F-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,u,2,3,4,5,15,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] ; AVX512F-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; AVX512F-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload @@ -7084,7 +7084,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-NEXT: kmovw %eax, %k2 ; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} ; AVX512F-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] ; AVX512F-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512F-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-NEXT: vmovdqa 128(%rdi), %ymm2 @@ -7709,7 +7709,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm0, %zmm27 {%k2} ; AVX512BW-NEXT: vmovdqa64 (%r8), 
%zmm29 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,9,u,4,5,6,7> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,9,u,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm2 ; AVX512BW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa64 64(%r8), %zmm2 @@ -7734,7 +7734,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vpermt2q %zmm10, %zmm1, %zmm27 ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vmovdqa64 %zmm1, %zmm28 {%k2} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,13,u,4,5,6,7> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,13,u,4,5,6,7] ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm3 ; AVX512BW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 @@ -7752,7 +7752,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm1 # 64-byte Reload ; AVX512BW-NEXT: vshufi64x2 $228, {{[-0-9]+}}(%r{{[sb]}}p), %zmm1, %zmm3 # 64-byte Folded Reload ; AVX512BW-NEXT: # zmm3 = zmm1[0,1,2,3],mem[4,5,6,7] -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <14,u,2,3,4,5,15,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [14,u,2,3,4,5,15,u] ; AVX512BW-NEXT: vpermt2q %zmm29, %zmm1, %zmm14 ; AVX512BW-NEXT: vmovdqu64 %zmm14, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqu64 (%rsp), %zmm14 # 64-byte Reload @@ -7780,7 +7780,7 @@ define void @store_i64_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-NEXT: kmovd %eax, %k2 ; AVX512BW-NEXT: vmovdqa64 %zmm2, %zmm31 {%k2} ; AVX512BW-NEXT: vmovdqu64 %zmm31, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <10,u,2,3,4,5,11,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [10,u,2,3,4,5,11,u] ; AVX512BW-NEXT: vpermt2q %zmm2, %zmm1, %zmm5 ; AVX512BW-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512BW-NEXT: vmovdqa 128(%rdi), %ymm2 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll index 6fb5e43a2df68..18c0a59329612 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i64-stride-7.ll @@ -114,7 +114,7 @@ define void @store_i64_stride7_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512-NEXT: vinserti32x4 $2, (%r10), %zmm1, %zmm1 ; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,2,4,6,8,10,12,1] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 -; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = <3,5,7,9,11,13,u,u> +; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm3 = [3,5,7,9,11,13,u,u] ; AVX512-NEXT: vpermi2q %zmm1, %zmm0, %zmm3 ; AVX512-NEXT: vextracti32x4 $2, %zmm3, 96(%rax) ; AVX512-NEXT: vmovdqa64 %zmm2, (%rax) @@ -307,12 +307,12 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 ; AVX512F-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 ; AVX512F-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = <15,3,7,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] ; AVX512F-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,4,8,12,u,u,u,1> +; AVX512F-NEXT: vmovdqa64 
{{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] ; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512F-NEXT: movb $112, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 @@ -320,7 +320,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = <5,9,13,u,u,u,2,6> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] ; AVX512F-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512F-NEXT: movb $56, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 @@ -328,7 +328,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512F-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512F-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,6,u,u,u,11,15,3> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,u,u,u,11,15,3] ; AVX512F-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512F-NEXT: movb $28, %cl ; AVX512F-NEXT: kmovw %ecx, %k1 @@ -352,12 +352,12 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vinserti64x4 $1, (%rsi), %zmm1, %zmm4 ; AVX512BW-NEXT: vinserti64x4 $1, (%rcx), %zmm2, %zmm2 ; AVX512BW-NEXT: vinserti64x4 $1, (%r9), %zmm3, %zmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <15,3,7,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [15,3,7,u] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm3, %zmm1 ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [0,4,8,0,0,4,8,0] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,4,8,12,u,u,u,1> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,4,8,12,u,u,u,1] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm6 ; AVX512BW-NEXT: movb $112, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -365,7 +365,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [5,9,0,1,5,9,0,1] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <5,9,13,u,u,u,2,6> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [5,9,13,u,u,u,2,6] ; AVX512BW-NEXT: vpermi2q %zmm2, %zmm4, %zmm7 ; AVX512BW-NEXT: movb $56, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -373,7 +373,7 @@ define void @store_i64_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-NEXT: vbroadcasti64x4 {{.*#+}} zmm5 = [10,0,2,6,10,0,2,6] ; AVX512BW-NEXT: # zmm5 = mem[0,1,2,3,0,1,2,3] ; AVX512BW-NEXT: vpermi2q %zmm0, %zmm3, %zmm5 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <2,6,u,u,u,11,15,3> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [2,6,u,u,u,11,15,3] ; AVX512BW-NEXT: vpermi2q %zmm4, %zmm2, %zmm3 ; AVX512BW-NEXT: movb $28, %cl ; AVX512BW-NEXT: kmovd %ecx, %k1 @@ -934,7 +934,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm11 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,7,u] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-FAST-NEXT: movb 
$14, %sil @@ -1147,7 +1147,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] ; AVX512DQ-FAST-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQ-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,7,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,7,u] ; AVX512DQ-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQ-FAST-NEXT: movb $14, %sil @@ -1423,7 +1423,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm10 {%k2} = zmm0[0],zmm5[0],zmm0[2],zmm5[2],zmm0[4],zmm5[4],zmm0[6],zmm5[6] ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r9), %ymm11 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%r8), %ymm12 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <1,3,7,u> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [1,3,7,u] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %ymm11, %ymm12, %ymm13 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm13[0,1,2,3,4,5],mem[6,7] ; AVX512BW-ONLY-FAST-NEXT: movb $14, %sil @@ -1636,7 +1636,7 @@ define void @store_i64_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} zmm8 {%k1} = zmm0[0],zmm6[0],zmm0[2],zmm6[2],zmm0[4],zmm6[4],zmm0[6],zmm6[6] ; AVX512DQBW-FAST-NEXT: vmovdqa (%r9), %ymm10 ; AVX512DQBW-FAST-NEXT: vmovdqa (%r8), %ymm11 -; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <1,3,7,u> +; AVX512DQBW-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [1,3,7,u] ; AVX512DQBW-FAST-NEXT: vpermi2q %ymm10, %ymm11, %ymm9 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7] ; AVX512DQBW-FAST-NEXT: movb $14, %sil @@ -2556,13 +2556,13 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k1 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm22, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm18, %zmm21 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm21, %zmm18 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm19, %zmm21 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm21, %zmm19 @@ -2737,14 +2737,14 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k1 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,1,2,3,4,15,u,u] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm7, %zmm9 ; AVX512F-ONLY-FAST-NEXT: 
vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm8, %zmm9 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm9, %zmm8 @@ -2968,13 +2968,13 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-SLOW-NEXT: movb $24, %sil ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k1 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm16, %zmm22, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm18, %zmm21 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm21, %zmm18 -; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14> +; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm30, %zmm19, %zmm21 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm27, %zmm21, %zmm19 @@ -3150,7 +3150,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: movb $24, %sil ; AVX512DQ-FAST-NEXT: kmovw %esi, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,1,2,3,4,15,u,u] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm21, %zmm16, %zmm31 ; AVX512DQ-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1] ; AVX512DQ-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -3293,9 +3293,9 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQ-FAST-NEXT: kmovw %eax, %k1 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,1,2,3,4,5,15,u] ; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm31, %zmm4 -; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14> +; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14] ; AVX512DQ-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQ-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] @@ -3380,13 +3380,13 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm22, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm18, %zmm21 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, 
%zmm21, %zmm18 -; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm31, %zmm19, %zmm21 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm28, %zmm21, %zmm19 @@ -3561,14 +3561,14 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k1 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm7, %zmm9 {%k1} -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,1,2,3,4,15,u,u] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm7, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [14,1,2,3,4,5,6,15] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm9, %zmm0 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14> +; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm23, %zmm8, %zmm9 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,13,2,3,4,5,6,7] ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm12, %zmm9, %zmm8 @@ -3792,13 +3792,13 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-SLOW-NEXT: movb $24, %sil ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k1 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm21, %zmm22 {%k1} -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [u,1,2,3,4,15,u,u] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm16, %zmm22, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,5,15,u] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm18, %zmm21 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm18 = [14,1,2,3,4,5,6,15] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm21, %zmm18 -; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [13,u,2,3,4,5,6,14] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm30, %zmm19, %zmm21 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm19 = [0,13,2,3,4,5,6,7] ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm27, %zmm21, %zmm19 @@ -3974,7 +3974,7 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: movb $24, %sil ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm16 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [u,1,2,3,4,15,u,u] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm21, %zmm16, %zmm31 ; AVX512DQBW-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm22 = [0,8,0,1,0,8,0,1] ; AVX512DQBW-FAST-NEXT: # zmm22 = mem[0,1,2,3,0,1,2,3] @@ -4117,9 +4117,9 @@ define void @store_i64_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512DQBW-FAST-NEXT: kmovd %eax, %k1 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm27, %zmm19 {%k1} ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm2 {%k1} -; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [u,1,2,3,4,5,15,u] ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm31, %zmm4 -; 
AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14> +; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14] ; AVX512DQBW-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm6, %zmm9, %zmm7 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [14,1,2,3,4,5,6,15] @@ -5952,15 +5952,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10 ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm0 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm24, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <12,u,u,3,4,5,6,13> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [12,u,u,3,4,5,6,13] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm25 ; AVX512F-ONLY-SLOW-NEXT: movb $24, %sil ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k3 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,1,2,3,4,15,u,u] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm5 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6 ; AVX512F-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0] @@ -6072,7 +6072,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,11,u,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,11,u,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm9, %zmm0 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1} ; AVX512F-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm7 @@ -6102,11 +6102,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3] ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7] ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1} -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm25, %zmm7 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14> +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm31, %zmm8 -; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = +; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm9 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7] ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1 @@ -6392,19 +6392,19 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7] ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = 
<0,11,u,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,11,u,u,4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm24, %zmm0
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,11,u,4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm15
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [12,u,u,3,4,5,6,13]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm0
 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT: movb $24, %sil
 ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k3
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k3}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 =
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,1,2,3,4,15,u,u]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm9, %zmm24
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0
 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -6540,12 +6540,12 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm2
 ; AVX512F-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX512F-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3]
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,12,u,3,4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm2
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,3,4,5,6,14]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm6, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 =
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,5,15,u]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm24, %zmm6
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
@@ -6837,15 +6837,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm12
 ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm10
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <12,u,u,3,4,5,6,13>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [12,u,u,3,4,5,6,13]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm3, %zmm22
 ; AVX512DQ-SLOW-NEXT: movb $24, %sil
 ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k3
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k3}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 =
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,15,u,u]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm19, %zmm4, %zmm21
 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1}
@@ -6952,7 +6952,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,11,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,11,u,4,5,6,7]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm5
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1}
 ; AVX512DQ-SLOW-NEXT: vmovdqa 192(%r8), %ymm6
@@ -6983,11 +6983,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3]
 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,12,u,3,4,5,6,7]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm22, %zmm6
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm24, %zmm8
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 =
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm21, %zmm9
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2
@@ -7273,18 +7273,18 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm10
 ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7]
 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r8), %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,11,u,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,11,u,u,4,5,6,7]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm17
 ; AVX512DQ-FAST-NEXT: vmovdqa64 192(%r9), %zmm4
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,11,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,11,u,4,5,6,7]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm17, %zmm20
 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,u,u,3,4,5,6,13]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm8
 ; AVX512DQ-FAST-NEXT: movb $24, %sil
 ; AVX512DQ-FAST-NEXT: kmovw %esi, %k3
 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3}
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 =
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,15,u,u]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm21, %zmm6
 ; AVX512DQ-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
@@ -7416,11 +7416,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT: vmovdqa 192(%r8), %ymm1
 ; AVX512DQ-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
 ; AVX512DQ-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,12,u,3,4,5,6,7]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [13,u,2,3,4,5,6,14]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm23, %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 =
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,1,2,3,4,5,15,u]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm4, %zmm6, %zmm5
 ; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
@@ -7710,15 +7710,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10
 ; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm31 = zmm25[0,1,2,3],zmm9[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 192(%r8), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm24, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm5 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = <12,u,u,3,4,5,6,13>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [12,u,u,3,4,5,6,13]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm25
 ; AVX512BW-ONLY-SLOW-NEXT: movb $24, %sil
 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k3
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm22, %zmm6 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 =
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,1,2,3,4,15,u,u]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa (%rdx), %xmm6
 ; AVX512BW-ONLY-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm6 = xmm6[0],mem[0]
@@ -7830,7 +7830,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm8 # 64-byte Reload
 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm6, %zmm8, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm0, %zmm11 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,1,11,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,11,u,4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm9, %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm7, %zmm10 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa 192(%r8), %ymm7
@@ -7860,11 +7860,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,2,3,3]
 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm7 = ymm7[0,1,2,3,4,5],mem[6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm7, %zmm0, %zmm28 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm25, %zmm7
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm31, %zmm8
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 =
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm5, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,12,3,4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm6, %zmm7, %zmm1
@@ -8150,19 +8150,19 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm8[0,1,2,3],zmm15[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r8), %zmm2
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <0,11,u,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,11,u,u,4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm24, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 192(%r9), %zmm8
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <0,1,11,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [0,1,11,u,4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm0, %zmm15
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm5, %zmm10 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <12,u,u,3,4,5,6,13>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [12,u,u,3,4,5,6,13]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm10, %zmm0
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT: movb $24, %sil
 ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k3
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm18, %zmm9 {%k3}
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 =
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,1,2,3,4,15,u,u]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm2, %zmm9, %zmm24
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa (%rdx), %xmm0
 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],mem[0]
@@ -8298,12 +8298,12 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa 192(%r8), %ymm2
 ; AVX512BW-ONLY-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm2[0],mem[0],ymm2[2],mem[2]
 ; AVX512BW-ONLY-FAST-NEXT: vshufi64x2 {{.*#+}} zmm7 {%k2} = zmm2[2,3,2,3],zmm3[2,3,2,3]
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,12,u,3,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,12,u,3,4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm4 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm4, %zmm2
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <13,u,2,3,4,5,6,14>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [13,u,2,3,4,5,6,14]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm6, %zmm4
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 =
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,5,15,u]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm8, %zmm24, %zmm6
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,12,3,4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm8
@@ -8595,15 +8595,15 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm21, %zmm15, %zmm12
 ; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm24 = zmm24[0,1,2,3],zmm10[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 192(%r8), %zmm19
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm22, %zmm10
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm9, %zmm3 {%k1}
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = <12,u,u,3,4,5,6,13>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm22 = [12,u,u,3,4,5,6,13]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm3, %zmm22
 ; AVX512DQBW-SLOW-NEXT: movb $24, %sil
 ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k3
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm4 {%k3}
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 =
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,1,2,3,4,15,u,u]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm19, %zmm4, %zmm21
 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm3 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm25 {%k1}
@@ -8710,7 +8710,7 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm5 # 64-byte Reload
 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm5, %zmm19
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm19, %zmm14 {%k3}
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,1,11,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,1,11,u,4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm10, %zmm5
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm8, %zmm12 {%k1}
 ; AVX512DQBW-SLOW-NEXT: vmovdqa 192(%r8), %ymm6
@@ -8741,11 +8741,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,3,3]
 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm6[0,1,2,3,4,5],mem[6,7]
 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm31 {%k1}
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <0,12,u,3,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,12,u,3,4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm22, %zmm6
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <13,u,2,3,4,5,6,14>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [13,u,2,3,4,5,6,14]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm24, %zmm8
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 =
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,1,2,3,4,5,15,u]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm21, %zmm9
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,12,3,4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm2
@@ -9031,18 +9031,18 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm19, %zmm10
 ; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm17[0,1,2,3],zmm23[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r8), %zmm1
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = <0,11,u,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm17 = [0,11,u,u,4,5,6,7]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm4, %zmm17
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 192(%r9), %zmm4
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,1,11,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,1,11,u,4,5,6,7]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm17, %zmm20
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm8, %zmm6 {%k1}
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,u,u,3,4,5,6,13]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm6, %zmm8
 ; AVX512DQBW-FAST-NEXT: movb $24, %sil
 ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k3
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm9, %zmm21 {%k3}
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 =
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,1,2,3,4,15,u,u]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm21, %zmm6
 ; AVX512DQBW-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm9 # 32-byte Reload
 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm9 = ymm9[0,1,2,3,4,5],mem[6,7]
@@ -9174,11 +9174,11 @@ define void @store_i64_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT: vmovdqa 192(%r8), %ymm1
 ; AVX512DQBW-FAST-NEXT: vpunpcklqdq {{.*#+}} ymm1 = ymm1[0],mem[0],ymm1[2],mem[2]
 ; AVX512DQBW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm10 {%k2} = zmm1[2,3,2,3],zmm2[2,3,2,3]
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,12,u,3,4,5,6,7>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,12,u,3,4,5,6,7]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm8, %zmm1
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <13,u,2,3,4,5,6,14>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [13,u,2,3,4,5,6,14]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm23, %zmm3
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 =
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,1,2,3,4,5,15,u]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm4, %zmm6, %zmm5
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,12,3,4,5,6,7]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm2, %zmm1, %zmm4
@@ -12843,10 +12843,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u]
 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 =
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,5,15,u]
 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
@@ -13080,20 +13080,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,11,u,u,4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm5
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,u,u,3,4,5,6,13]
 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm9
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 {%k3}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u]
 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm17
 ; AVX512F-ONLY-SLOW-NEXT: movb $6, %sil
 ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k4
 ; AVX512F-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm1
 ; AVX512F-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4}
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,9,u,u,6,7>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,u,u,6,7]
 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
 ; AVX512F-ONLY-SLOW-NEXT: movb $64, %sil
 ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5
@@ -13102,16 +13102,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k5}
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm0
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7]
 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm5
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <13,u,2,3,4,5,6,14>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [13,u,2,3,4,5,6,14]
 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u]
 ; AVX512F-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17
 ; AVX512F-ONLY-SLOW-NEXT: movb $12, %sil
 ; AVX512F-ONLY-SLOW-NEXT: kmovw %esi, %k5
@@ -13120,9 +13120,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512F-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5}
 ; AVX512F-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7]
 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4
-; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7>
+; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7]
 ; AVX512F-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm8, %zmm2
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm0
 ; AVX512F-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
@@ -13827,10 +13827,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm20, %zmm1
 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u]
 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 =
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u]
 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm24
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm20
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
@@ -14065,21 +14065,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT: # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1}
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10
 ; AVX512F-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <12,u,u,3,4,5,6,13>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,u,u,3,4,5,6,13]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm11
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm15 {%k3}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 =
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u]
 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm15
 ; AVX512F-ONLY-FAST-NEXT: movb $6, %sil
 ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k5
 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm2
 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5}
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm2
 ; AVX512F-ONLY-FAST-NEXT: movb $64, %sil
 ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4
@@ -14088,15 +14088,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4}
 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm0
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,11,u,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,11,u,4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm31
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm3
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,12,u,3,4,5,6,7>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,12,u,3,4,5,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm8
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <13,u,2,3,4,5,6,14>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,u,2,3,4,5,6,14]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm10
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u]
 ; AVX512F-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm15
 ; AVX512F-ONLY-FAST-NEXT: movb $12, %sil
 ; AVX512F-ONLY-FAST-NEXT: kmovw %esi, %k4
@@ -14105,9 +14105,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512F-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
 ; AVX512F-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4
-; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7>
+; AVX512F-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,9,u,6,7]
 ; AVX512F-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm11
 ; AVX512F-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0
 ; AVX512F-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
@@ -14761,10 +14761,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm2
 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r8), %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 =
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u]
 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm7
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%r9), %zmm2
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 =
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,15,u]
 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm7
 ; AVX512DQ-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
@@ -14998,20 +14998,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1}
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,11,u,u,4,5,6,7]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm5
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,u,u,3,4,5,6,13]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm8
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k3}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u]
 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13
 ; AVX512DQ-SLOW-NEXT: movb $6, %sil
 ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k4
 ; AVX512DQ-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm1
 ; AVX512DQ-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4}
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,9,u,u,6,7>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,u,u,6,7]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10
 ; AVX512DQ-SLOW-NEXT: movb $64, %sil
 ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5
@@ -15020,16 +15020,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k5}
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 448(%r9), %zmm0
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm3
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm8, %zmm5
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm7
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u]
 ; AVX512DQ-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm13
 ; AVX512DQ-SLOW-NEXT: movb $12, %sil
 ; AVX512DQ-SLOW-NEXT: kmovw %esi, %k5
@@ -15038,9 +15038,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQ-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5}
 ; AVX512DQ-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4
-; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7>
+; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7]
 ; AVX512DQ-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm2
 ; AVX512DQ-SLOW-NEXT: vmovdqa64 384(%rax), %zmm0
 ; AVX512DQ-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
@@ -15734,10 +15734,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm8
 ; AVX512DQ-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r8), %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u]
 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30
 ; AVX512DQ-FAST-NEXT: vmovdqa64 384(%r9), %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 =
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u]
 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm30
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm18
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm22
@@ -15976,20 +15976,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT: # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
 ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r8), %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm9
 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1}
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [12,u,u,3,4,5,6,13]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm12
 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 {%k3}
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 =
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u]
 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm16
 ; AVX512DQ-FAST-NEXT: movb $6, %sil
 ; AVX512DQ-FAST-NEXT: kmovw %esi, %k5
 ; AVX512DQ-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm2
 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5}
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm2
 ; AVX512DQ-FAST-NEXT: movb $64, %sil
 ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4
@@ -15998,15 +15998,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4
 ; AVX512DQ-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4}
 ; AVX512DQ-FAST-NEXT: vmovdqa64 448(%r9), %zmm0
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,11,u,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,11,u,4,5,6,7]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm21
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm3
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm7
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u]
 ; AVX512DQ-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm16
 ; AVX512DQ-FAST-NEXT: movb $12, %sil
 ; AVX512DQ-FAST-NEXT: kmovw %esi, %k4
@@ -16015,9 +16015,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQ-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
 ; AVX512DQ-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4
-; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,9,u,6,7>
+; AVX512DQ-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,9,u,6,7]
 ; AVX512DQ-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm10
 ; AVX512DQ-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0
 ; AVX512DQ-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
@@ -16672,10 +16672,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r8), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm10
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%r9), %zmm1
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 =
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,1,2,3,4,5,15,u]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm1, %zmm3, %zmm10
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 %zmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2
@@ -16909,20 +16909,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k1}
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,11,u,u,4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm9, %zmm5
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm13, %zmm1 {%k1}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <12,u,u,3,4,5,6,13>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [12,u,u,3,4,5,6,13]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm9
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm8, %zmm17 {%k3}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm17
 ; AVX512BW-ONLY-SLOW-NEXT: movb $6, %sil
 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k4
 ; AVX512BW-ONLY-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm1
 ; AVX512BW-ONLY-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4}
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,1,2,9,u,u,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,1,2,9,u,u,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm8
 ; AVX512BW-ONLY-SLOW-NEXT: movb $64, %sil
 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5
@@ -16931,16 +16931,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm3, %zmm6 {%k5}
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 448(%r9), %zmm0
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm6, %zmm3
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm9, %zmm5
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = <13,u,2,3,4,5,6,14>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [13,u,2,3,4,5,6,14]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm6
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm17
 ; AVX512BW-ONLY-SLOW-NEXT: movb $12, %sil
 ; AVX512BW-ONLY-SLOW-NEXT: kmovd %esi, %k5
@@ -16949,9 +16949,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512BW-ONLY-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5}
 ; AVX512BW-ONLY-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4
-; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7>
+; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7]
 ; AVX512BW-ONLY-SLOW-NEXT: vpermi2q %zmm0, %zmm8, %zmm2
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqa64 384(%rax), %zmm0
 ; AVX512BW-ONLY-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm9 # 64-byte Reload
@@ -17656,10 +17656,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm14, %zmm20, %zmm1
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r8), %zmm0
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u]
 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm24
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 384(%r9), %zmm1
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 =
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u]
 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm24
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm20
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm3
@@ -17894,21 +17894,21 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT: # zmm1 = zmm10[0,1,2,3],mem[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k1}
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r8), %zmm3
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,11,u,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,11,u,u,4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm11, %zmm10
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm6 # 64-byte Reload
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm6, %zmm2 {%k1}
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <12,u,u,3,4,5,6,13>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [12,u,u,3,4,5,6,13]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm11
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm30, %zmm15 {%k3}
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 =
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u]
 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm15
 ; AVX512BW-ONLY-FAST-NEXT: movb $6, %sil
 ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k5
 ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm2
 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5}
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm2
 ; AVX512BW-ONLY-FAST-NEXT: movb $64, %sil
 ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4
@@ -17917,15 +17917,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 %zmm3, %zmm8 {%k4}
 ; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 448(%r9), %zmm0
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = <0,1,11,u,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm31 = [0,1,11,u,4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm31
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm8, %zmm3
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,12,u,3,4,5,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,12,u,3,4,5,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm11, %zmm8
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <13,u,2,3,4,5,6,14>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [13,u,2,3,4,5,6,14]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm10
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u]
 ; AVX512BW-ONLY-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm15
 ; AVX512BW-ONLY-FAST-NEXT: movb $12, %sil
 ; AVX512BW-ONLY-FAST-NEXT: kmovd %esi, %k4
@@ -17934,9 +17934,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512BW-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512BW-ONLY-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
 ; AVX512BW-ONLY-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4
-; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = <0,1,2,3,9,u,6,7>
+; AVX512BW-ONLY-FAST-NEXT: vmovdqa64 {{.*#+}} zmm11 = [0,1,2,3,9,u,6,7]
 ; AVX512BW-ONLY-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm11
 ; AVX512BW-ONLY-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0
 ; AVX512BW-ONLY-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
@@ -18590,10 +18590,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm11, %zmm1, %zmm2
 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r8), %zmm0
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 =
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u]
 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm2, %zmm7
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%r9), %zmm2
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 =
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,1,2,3,4,5,15,u]
 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm2, %zmm8, %zmm7
 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 %zmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm2, %zmm0, %zmm1
@@ -18827,20 +18827,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm2[0,1,2,3],zmm12[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k1}
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r8), %zmm3
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,11,u,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,11,u,u,4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm8, %zmm5
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm11, %zmm1 {%k1}
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <12,u,u,3,4,5,6,13>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [12,u,u,3,4,5,6,13]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm1, %zmm8
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm10, %zmm13 {%k3}
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u]
 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm3, %zmm1, %zmm13
 ; AVX512DQBW-SLOW-NEXT: movb $6, %sil
 ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k4
 ; AVX512DQBW-SLOW-NEXT: vpbroadcastq 456(%rcx), %ymm1
 ; AVX512DQBW-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = mem[0,1,2,3],ymm1[4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm0 {%k4}
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,9,u,u,6,7>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,9,u,u,6,7]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm3, %zmm0, %zmm10
 ; AVX512DQBW-SLOW-NEXT: movb $64, %sil
 ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5
@@ -18849,16 +18849,16 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm3, %zmm7 {%k5}
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 448(%r9), %zmm0
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,1,11,u,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,1,11,u,4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm5, %zmm1
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 %zmm1, %zmm11
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm7, %zmm3
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <0,12,u,3,4,5,6,7>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [0,12,u,3,4,5,6,7]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm8, %zmm5
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <13,u,2,3,4,5,6,14>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [13,u,2,3,4,5,6,14]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm7
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u]
 ; AVX512DQBW-SLOW-NEXT: vpermt2q %zmm0, %zmm1, %zmm13
 ; AVX512DQBW-SLOW-NEXT: movb $12, %sil
 ; AVX512DQBW-SLOW-NEXT: kmovd %esi, %k5
@@ -18867,9 +18867,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2
 ; AVX512DQBW-SLOW-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm4 {%k5}
 ; AVX512DQBW-SLOW-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm2
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm2, %zmm4
-; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,3,9,u,6,7>
+; AVX512DQBW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,3,9,u,6,7]
 ; AVX512DQBW-SLOW-NEXT: vpermi2q %zmm0, %zmm10, %zmm2
 ; AVX512DQBW-SLOW-NEXT: vmovdqa64 384(%rax), %zmm0
 ; AVX512DQBW-SLOW-NEXT: vmovdqu64 {{[-0-9]+}}(%r{{[sb]}}p), %zmm10 # 64-byte Reload
@@ -19563,10 +19563,10 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm7, %zmm18, %zmm8
 ; AVX512DQBW-FAST-NEXT: vmovdqu64 %zmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r8), %zmm0
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,15,u,u]
 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm30
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 384(%r9), %zmm1
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 =
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,5,15,u]
 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm1, %zmm2, %zmm30
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm18
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm1, %zmm0, %zmm22
@@ -19805,20 +19805,20 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT: # zmm1 = zmm19[0,1,2,3],mem[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k1}
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r8), %zmm3
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <0,11,u,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [0,11,u,u,4,5,6,7]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm13, %zmm9
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm17, %zmm2 {%k1}
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = <12,u,u,3,4,5,6,13>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm12 = [12,u,u,3,4,5,6,13]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm2, %zmm12
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm7, %zmm16 {%k3}
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 =
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [u,1,2,3,4,15,u,u]
 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm3, %zmm2, %zmm16
 ; AVX512DQBW-FAST-NEXT: movb $6, %sil
 ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k5
 ; AVX512DQBW-FAST-NEXT: vpbroadcastq 456(%rcx), %ymm2
 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7]
 ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm2, %zmm0, %zmm0 {%k5}
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,1,2,9,u,u,6,7>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,1,2,9,u,u,6,7]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm3, %zmm0, %zmm2
 ; AVX512DQBW-FAST-NEXT: movb $64, %sil
 ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4
@@ -19827,15 +19827,15 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 %zmm3, %zmm10 {%k4}
 ; AVX512DQBW-FAST-NEXT: vmovdqa64 448(%r9), %zmm0
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = <0,1,11,u,4,5,6,7>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [0,1,11,u,4,5,6,7]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm9, %zmm21
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,1,2,10,u,5,6,7>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,1,2,10,u,5,6,7]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm10, %zmm3
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <0,12,u,3,4,5,6,7>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [0,12,u,3,4,5,6,7]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm12, %zmm7
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <13,u,2,3,4,5,6,14>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [13,u,2,3,4,5,6,14]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm9
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 =
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [u,1,2,3,4,5,15,u]
 ; AVX512DQBW-FAST-NEXT: vpermt2q %zmm0, %zmm1, %zmm16
 ; AVX512DQBW-FAST-NEXT: movb $12, %sil
 ; AVX512DQBW-FAST-NEXT: kmovd %esi, %k4
@@ -19844,9 +19844,9 @@ define void @store_i64_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.ve
 ; AVX512DQBW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1
 ; AVX512DQBW-FAST-NEXT: vinserti64x4 $0, %ymm1, %zmm0, %zmm4 {%k4}
 ; AVX512DQBW-FAST-NEXT: vinserti32x4 $2, 448(%r8), %zmm4, %zmm1
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = <0,1,2,3,4,8,u,7>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,1,2,3,4,8,u,7]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm1, %zmm4
-; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,1,2,3,9,u,6,7>
+; AVX512DQBW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,1,2,3,9,u,6,7]
 ; AVX512DQBW-FAST-NEXT: vpermi2q %zmm0, %zmm2, %zmm10
 ; AVX512DQBW-FAST-NEXT: vpbroadcastq 8(%rcx), %ymm0
 ; AVX512DQBW-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
index f727622682cf2..c6c74acc618fc 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll
@@ -855,9 +855,9 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm6, %xmm0
 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 =
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,128,128,128,128,128,128,6,7,8,9,10]
 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 =
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128]
 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm12
 ; AVX1-ONLY-NEXT: vmovdqa 32(%rsi), %xmm13
 ; AVX1-ONLY-NEXT: vmovdqa 48(%rsi), %xmm14
@@ -878,7 +878,7 @@ define void @store_i8_stride3_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm10, %xmm0
 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm3[8],xmm15[8],xmm3[9],xmm15[9],xmm3[10],xmm15[10],xmm3[11],xmm15[11],xmm3[12],xmm15[12],xmm3[13],xmm15[13],xmm3[14],xmm15[14],xmm3[15],xmm15[15]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 =
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15]
 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm6
 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm0
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm1[8],xmm12[8],xmm1[9],xmm12[9],xmm1[10],xmm12[10],xmm1[11],xmm12[11],xmm1[12],xmm12[12],xmm1[13],xmm12[13],xmm1[14],xmm12[14],xmm1[15],xmm12[15]
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 9c09bb7dca60c..c4bf0c3630e83 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -459,7 +459,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30]
 ; AVX512F-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 =
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,0,0,0,0,u,1,1]
 ; AVX512F-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3
 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
@@ -730,7 +730,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u],zero,xmm3[7,u,u,u],zero,xmm3[8,u,u,u],zero,xmm3[9,u]
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u]
 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255]
 ; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15]
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero
@@ -785,7 +785,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,ymm9[9],zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero,zero,zero
 ; AVX2-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1]
@@ -872,7 +872,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,ymm9[9],zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1]
@@ -925,7 +925,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[0,8],zero,zero,ymm6[u,1,9],zero,zero,ymm6[u,2,10],zero,zero,ymm6[u,3,19],zero,zero,ymm6[u,28,20],zero,zero,ymm6[u,29,21],zero,zero,ymm6[u,30,22]
 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm5, %zmm6, %zmm5
 ; AVX512F-SLOW-NEXT: vporq %zmm7, %zmm5, %zmm5
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 =
+; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm6 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u]
 ; AVX512F-SLOW-NEXT: vpermd %zmm1, %zmm6, %zmm6
 ; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6
 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
@@ -964,7 +964,7 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
 ; AVX512F-FAST-NEXT: vporq %zmm7, %zmm5, %zmm5
 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6
-; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 =
+; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u]
 ; AVX512F-FAST-NEXT: vpermd %zmm6, %zmm7, %zmm6
 ; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6
 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
@@ -1594,7 +1594,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: # ymm8 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm6 = ymm0[2,2,3,3,6,6,7,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
@@ -1610,7 +1610,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm10[0],xmm9[0],xmm10[1],xmm9[1],xmm10[2],xmm9[2],xmm10[3],xmm9[3],xmm10[4],xmm9[4],xmm10[5],xmm9[5],xmm10[6],xmm9[6],xmm10[7],xmm9[7]
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm6, %ymm11, %ymm6
 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm11
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm11[0,0,1,1]
@@ -1625,7 +1625,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero
 ; AVX2-SLOW-NEXT: vpor %xmm7, %xmm8, %xmm7
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm9, %ymm7, %ymm7
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm8 = xmm11[1,1,2,2]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1]
@@ -1641,23 +1641,23 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3]
 ; AVX2-SLOW-NEXT: vpor %ymm9, %ymm10, %ymm9
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2]
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [3,3,3,u,4,4,4,4]
 ; AVX2-SLOW-NEXT: vpermd %ymm3, %ymm9, %ymm3
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
 ; AVX2-SLOW-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 =
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,3,u,4,4,4>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,u,4,4,4]
 ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
@@ -1686,7 +1686,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [0,0,0,0,0,0,1,1]
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm10, %ymm10
@@ -1700,7 +1700,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
 ; AVX2-FAST-NEXT: vpor %xmm7, %xmm6, %xmm6
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [1,1,2,2,2,2,2,2]
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm7, %ymm7
@@ -1716,7 +1716,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
 ; AVX2-FAST-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6]
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm8, %ymm8
@@ -1732,23 +1732,23 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3]
 ; AVX2-FAST-NEXT: vpor %ymm9, %ymm10, %ymm9
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [6,6,6,6,7,7,7,7]
 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm9, %ymm9
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [3,3,3,u,4,4,4,4]
 ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm9, %ymm3
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 =
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
 ; AVX2-FAST-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 =
[u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,u,4,4,4] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -1777,7 +1777,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1] @@ -1792,7 +1792,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero ; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1] @@ -1808,7 +1808,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] @@ -1824,23 +1824,23 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: 
vpor %ymm9, %ymm10, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,3,3,6,6,7,7] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [3,3,3,u,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm3, %ymm9, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,u,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 @@ -1884,7 +1884,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpternlogq $226, %ymm8, %ymm7, %ymm5 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa (%r8), %xmm6 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] ; AVX512F-SLOW-NEXT: vpermd %zmm6, %zmm8, %zmm6 ; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm6 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u] @@ -1966,7 +1966,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpternlogq $226, %ymm7, %ymm6, %ymm4 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm4[0,1,2,3],zmm5[4,5,6,7] ; AVX512F-FAST-NEXT: vbroadcasti64x4 {{.*#+}} zmm4 = mem[0,1,2,3,0,1,2,3] -; AVX512F-FAST-NEXT: 
vmovdqa64 {{.*#+}} zmm7 = +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u] ; AVX512F-FAST-NEXT: vpermd %zmm4, %zmm7, %zmm7 ; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm5, %zmm7 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u] @@ -1990,7 +1990,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm9, %ymm5 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm5, %zmm0, %zmm5 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <4,u,5,5,5,5,u,6> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [4,u,5,5,5,5,u,6] ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255] ; AVX512F-FAST-NEXT: vpandn %ymm8, %ymm9, %ymm8 @@ -2008,7 +2008,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0 ; AVX512F-FAST-NEXT: vpternlogq $184, %ymm2, %ymm6, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <6,6,6,u,7,7,7,7> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [6,6,6,u,7,7,7,7] ; AVX512F-FAST-NEXT: vpermd %ymm4, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vmovdqa %ymm1, 128(%r9) @@ -2065,7 +2065,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512BW-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <3,3,3,u,4,4,4,4> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [3,3,3,u,4,4,4,4] ; AVX512BW-SLOW-NEXT: vpermd %ymm4, %ymm8, %ymm8 ; AVX512BW-SLOW-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-SLOW-NEXT: kmovd %eax, %k1 @@ -2074,7 +2074,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <3,3,3,3,u,4,4,4> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [3,3,3,3,u,4,4,4] ; AVX512BW-SLOW-NEXT: vpermd %ymm0, %ymm6, %ymm6 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] @@ -2157,7 +2157,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] ; AVX512BW-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <3,3,3,u,4,4,4,4> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [3,3,3,u,4,4,4,4] ; AVX512BW-FAST-NEXT: vpermd %ymm1, 
%ymm8, %ymm8 ; AVX512BW-FAST-NEXT: movl $138547332, %eax # imm = 0x8421084 ; AVX512BW-FAST-NEXT: kmovd %eax, %k1 @@ -2166,7 +2166,7 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14] ; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm6, %zmm6 ; AVX512BW-FAST-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 @@ -3235,26 +3235,26 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11 ; AVX2-SLOW-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm12 ; AVX2-SLOW-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-SLOW-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm14 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] ; AVX2-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm8 ; AVX2-SLOW-NEXT: vpor %xmm4, %xmm8, %xmm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-SLOW-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill @@ -3304,7 +3304,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm13, %ymm12 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] ; AVX2-SLOW-NEXT: vpor %ymm9, %ymm12, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm1, %ymm9, %ymm9 
; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm6 ; AVX2-SLOW-NEXT: vpshufb %ymm15, %ymm6, %ymm1 @@ -3333,11 +3333,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,u,4,4,4,4> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,3,u,4,4,4,4] ; AVX2-SLOW-NEXT: vpermd %ymm11, %ymm3, %ymm4 ; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpermd %ymm6, %ymm3, %ymm2 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm15, %ymm4 @@ -3348,13 +3348,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm13, %ymm11 ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm11, %ymm4 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm4, %ymm0 ; AVX2-SLOW-NEXT: vpshufb %ymm2, %ymm10, %ymm2 ; AVX2-SLOW-NEXT: vpshufb %ymm5, %ymm8, %ymm4 ; AVX2-SLOW-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <3,3,3,3,u,4,4,4> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [3,3,3,3,u,4,4,4] ; AVX2-SLOW-NEXT: vpermd %ymm12, %ymm2, %ymm4 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 @@ -3367,13 +3367,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX2-SLOW-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; 
AVX2-SLOW-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -3422,7 +3422,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3] @@ -3464,27 +3464,27 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 ; AVX2-FAST-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm8 ; AVX2-FAST-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm7 ; AVX2-FAST-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm7, %xmm7 ; AVX2-FAST-NEXT: vpor %xmm4, %xmm7, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3528,7 +3528,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm2, %ymm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm4, %ymm5, %ymm4 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = 
[255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rdx), %ymm10 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm10, %ymm0 @@ -3569,7 +3569,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm14 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,3] ; AVX2-FAST-NEXT: vpor %ymm15, %ymm14, %ymm14 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm14, %ymm7 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm1, %ymm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] @@ -3588,11 +3588,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm7, %ymm8, %ymm9 ; AVX2-FAST-NEXT: vpermd %ymm11, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm4, %ymm6, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,3,u,4,4,4,4] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm4, %ymm3 ; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm3, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm0, %ymm0 @@ -3603,13 +3603,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm13, %ymm6 ; AVX2-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm5, %ymm1 ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm10, %ymm3 ; AVX2-FAST-NEXT: vpor %ymm1, %ymm3, %ymm1 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,3,3,u,4,4,4] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0 @@ -3621,13 +3621,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = 
[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX2-FAST-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload @@ -3672,27 +3672,27 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm8 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -3741,7 +3741,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec 
; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm2, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm7, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdx), %ymm13 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm13, %ymm0 @@ -3785,7 +3785,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm14, %ymm12, %ymm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm9, %ymm12, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm1, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3] @@ -3805,11 +3805,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufd {{.*#+}} ymm8 = ymm3[0,2,1,1,4,6,5,5] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm6, %ymm8, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <3,3,3,u,4,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [3,3,3,u,4,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm4, %ymm6, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm8 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm2, %ymm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm6, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm0, %ymm0 @@ -3820,13 +3820,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm11, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm8, %ymm4 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm7, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = <3,3,3,3,u,4,4,4> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [3,3,3,3,u,4,4,4] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm5, %ymm4, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = 
[0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 @@ -3838,13 +3838,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm3 = xmm3[0],mem[0],xmm3[1],mem[1],xmm3[2],mem[2],xmm3[3],mem[3],xmm3[4],mem[4],xmm3[5],mem[5],xmm3[6],mem[6],xmm3[7],mem[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm5 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm3, %xmm3 # 16-byte Folded Reload @@ -3890,16 +3890,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] ; AVX512F-SLOW-NEXT: vpshufb %ymm15, %ymm3, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19] ; AVX512F-SLOW-NEXT: vpshufb %ymm9, %ymm2, %ymm1 ; AVX512F-SLOW-NEXT: vpor %ymm0, %ymm1, %ymm0 ; AVX512F-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm1 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm5 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm1, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm1, %xmm16 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm4, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 ; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm1, %xmm0 @@ -3908,16 +3908,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] ; 
AVX512F-SLOW-NEXT: vpshufb %ymm0, %ymm8, %ymm4 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %ymm11 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] ; AVX512F-SLOW-NEXT: vpshufb %ymm1, %ymm11, %ymm10 ; AVX512F-SLOW-NEXT: vpor %ymm4, %ymm10, %ymm4 ; AVX512F-SLOW-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm13 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] ; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm13, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm25 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] ; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm10, %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm6, %xmm26 ; AVX512F-SLOW-NEXT: vporq %xmm4, %xmm12, %xmm20 @@ -3986,14 +3986,14 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm16, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <4,u,5,5,5,5,u,6,6,6,6,u,7,7,7,7> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [4,u,5,5,5,5,u,6,6,6,6,u,7,7,7,7] ; AVX512F-SLOW-NEXT: vpermd %zmm15, %zmm1, %zmm31 ; AVX512F-SLOW-NEXT: vmovdqa64 (%r8), %zmm16 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <6,6,6,u,7,7,7,7,u,16,16,16,16,u,17,17> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,u,7,7,7,7,u,16,16,16,16,u,17,17] ; AVX512F-SLOW-NEXT: vpermi2d %zmm15, %zmm16, %zmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm17, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm15[0],xmm5[0],xmm15[1],xmm5[1],xmm15[2],xmm5[2],xmm15[3],xmm5[3],xmm15[4],xmm5[4],xmm15[5],xmm5[5],xmm15[6],xmm5[6],xmm15[7],xmm5[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] ; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm4, %xmm4 ; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm5, %xmm5 ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm27, %zmm5, %zmm5 @@ -4007,7 +4007,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm15 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm15[0],xmm9[0],xmm15[1],xmm9[1],xmm15[2],xmm9[2],xmm15[3],xmm9[3],xmm15[4],xmm9[4],xmm15[5],xmm9[5],xmm15[6],xmm9[6],xmm15[7],xmm9[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm10, %xmm10 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3] @@ -4049,7 +4049,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm0 = zmm5[0,0,1,1,4,4,5,5] ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} zmm2 = zmm9[0,0,1,1,4,4,5,5] ; 
AVX512F-SLOW-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,0,0,0,0,u,1,1,1,1,u,2,2,2,2,u] ; AVX512F-SLOW-NEXT: vpermd %zmm16, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %zmm25, 64(%r9) @@ -4067,16 +4067,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128,128] ; AVX512F-FAST-NEXT: vpshufb %ymm8, %ymm3, %ymm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [12,13,128,15,12,13,14,128,12,13,14,15,128,u,u,u,16,128,18,19,16,17,128,19,16,17,18,128,16,17,18,19] ; AVX512F-FAST-NEXT: vpshufb %ymm11, %ymm1, %ymm2 ; AVX512F-FAST-NEXT: vpor %ymm0, %ymm2, %ymm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] ; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512F-FAST-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -4085,16 +4085,16 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] ; AVX512F-FAST-NEXT: vpshufb %ymm0, %ymm7, %ymm2 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,12,13,128,u,u,u,14,128,u,u,14,15,128,u,u,u,16,128,u,u,16,17,128,u,u,u,18,128,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm15, %ymm9, %ymm4 ; AVX512F-FAST-NEXT: vpor %ymm2, %ymm4, %ymm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm12 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm12, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm25 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] ; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm10, %xmm4 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm26 ; AVX512F-FAST-NEXT: vpor %xmm2, %xmm4, %xmm2 @@ -4167,7 +4167,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm1 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm3, %xmm1 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm28, %zmm1, %zmm28 @@ -4182,7 +4182,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqa64 %xmm18, %xmm3 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm5 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm10 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,3] @@ -4225,13 +4225,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm6, %zmm0 ; AVX512F-FAST-NEXT: vpternlogq $226, %zmm1, %zmm7, %zmm0 ; AVX512F-FAST-NEXT: vpternlogq $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm27 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9> +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [6,6,6,u,7,7,7,7,u,8,8,8,8,u,9,9] ; AVX512F-FAST-NEXT: vpermd %zmm5, %zmm1, %zmm1 ; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm0 = zmm28[0,0,1,1,4,4,5,5] ; AVX512F-FAST-NEXT: vpermq {{.*#+}} zmm2 = zmm3[0,0,1,1,4,4,5,5] ; AVX512F-FAST-NEXT: vpternlogq $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm2 -; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = +; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,0,0,0,0,u,1,1,9,9,u,10,10,10,10,u] ; AVX512F-FAST-NEXT: vpermd %zmm11, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %zmm27, 64(%r9) @@ -4258,7 +4258,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdx), %xmm6 ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rcx), %xmm12 ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm12[0],xmm6[0],xmm12[1],xmm6[1],xmm12[2],xmm6[2],xmm12[3],xmm6[3],xmm12[4],xmm6[4],xmm12[5],xmm6[5],xmm12[6],xmm6[6],xmm12[7],xmm6[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX512BW-SLOW-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm4, %zmm2, %zmm10 @@ -4275,7 +4275,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rsi), %xmm13 ; AVX512BW-SLOW-NEXT: vmovdqa 32(%rdi), %xmm14 ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = 
xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] ; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm11 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm2, %zmm2 @@ -4295,21 +4295,21 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128] ; AVX512BW-SLOW-NEXT: vpshufb %ymm11, %ymm24, %ymm18 ; AVX512BW-SLOW-NEXT: vporq %ymm17, %ymm18, %ymm17 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm20 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] ; AVX512BW-SLOW-NEXT: vpshufb %xmm20, %xmm12, %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm22 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] ; AVX512BW-SLOW-NEXT: vpshufb %xmm22, %xmm6, %xmm6 ; AVX512BW-SLOW-NEXT: vpor %xmm6, %xmm12, %xmm6 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm17, %zmm6, %zmm6 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] ; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm14, %xmm12 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm21 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] ; AVX512BW-SLOW-NEXT: vpshufb %xmm21, %xmm13, %xmm13 ; AVX512BW-SLOW-NEXT: vpor %xmm12, %xmm13, %xmm12 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm12[0,0,1,1] ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdi), %ymm25 -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <3,3,3,u,4,4,4,4> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [3,3,3,u,4,4,4,4] ; AVX512BW-SLOW-NEXT: vpermd %ymm25, %ymm12, %ymm17 ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rsi), %ymm26 ; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm14 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] @@ -4320,7 +4320,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318 ; AVX512BW-SLOW-NEXT: kmovq %rax, %k2 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm13, %zmm6 {%k2} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <3,3,3,3,u,4,4,4> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [3,3,3,3,u,4,4,4] ; AVX512BW-SLOW-NEXT: vpermd %ymm16, %ymm13, %ymm17 ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm18 = mem[1,1,2,2] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,1,1,1] @@ -4434,27 +4434,27 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 ; AVX512BW-FAST-NEXT: vmovdqa (%rcx), %xmm6 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rcx), %xmm18 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = 
[128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12] ; AVX512BW-FAST-NEXT: vpshufb %xmm8, %xmm18, %xmm3 ; AVX512BW-FAST-NEXT: vmovdqa (%rdx), %xmm7 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdx), %xmm20 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128] ; AVX512BW-FAST-NEXT: vpshufb %xmm10, %xmm20, %xmm4 ; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %xmm17 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = <8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm12 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11] ; AVX512BW-FAST-NEXT: vpshufb %xmm12, %xmm17, %xmm3 ; AVX512BW-FAST-NEXT: vmovdqa (%rsi), %xmm11 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %xmm19 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128] ; AVX512BW-FAST-NEXT: vpshufb %xmm14, %xmm19, %xmm4 ; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm3[0,0,1,1] ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rdi), %ymm16 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <3,3,3,u,4,4,4,4> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [3,3,3,u,4,4,4,4] ; AVX512BW-FAST-NEXT: vpermd %ymm16, %ymm3, %ymm22 ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %ymm23 ; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} ymm4 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14] @@ -4466,13 +4466,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm15, %zmm0 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa64 32(%r8), %ymm24 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = <1,1,2,2,2,2,2,2,27,27,27,27,u,28,28,28> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [1,1,2,2,2,2,2,2,27,27,27,27,u,28,28,28] ; AVX512BW-FAST-NEXT: vpermi2d %zmm5, %zmm24, %zmm15 ; AVX512BW-FAST-NEXT: movabsq $4760450083537948804, %rax # imm = 0x4210842108421084 ; AVX512BW-FAST-NEXT: kmovq %rax, %k3 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm15, %zmm0 {%k3} ; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm23[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm15 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,128,128,26,128,28,128,128,128,128,29,128,31,128,128,30] ; AVX512BW-FAST-NEXT: vpshufb %zmm15, %zmm22, %zmm22 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm22 = zmm22[2,2,3,3,6,6,7,7] ; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm23 = zmm16[0,1,2,3],mem[4,5,6,7] @@ -4481,11 +4481,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,3,3,6,6,7,7] ; AVX512BW-FAST-NEXT: vporq %zmm22, %zmm23, %zmm23 ; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm22 = zmm21[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = 
<128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm21 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128] ; AVX512BW-FAST-NEXT: vpshufb %zmm21, %zmm22, %zmm22 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm25 = zmm22[2,2,3,3,6,6,7,7] ; AVX512BW-FAST-NEXT: vshufi64x2 {{.*#+}} zmm13 = zmm13[0,1,2,3],mem[4,5,6,7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm22 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128] ; AVX512BW-FAST-NEXT: vpshufb %zmm22, %zmm13, %zmm13 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm13 = zmm13[2,2,3,3,6,6,7,7] ; AVX512BW-FAST-NEXT: vporq %zmm25, %zmm13, %zmm13 @@ -4505,7 +4505,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,2,3,3] ; AVX512BW-FAST-NEXT: vporq %ymm25, %ymm26, %ymm25 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm18 = xmm18[0],xmm20[0],xmm18[1],xmm20[1],xmm18[2],xmm20[2],xmm18[3],xmm20[3],xmm18[4],xmm20[4],xmm18[5],xmm20[5],xmm18[6],xmm20[6],xmm18[7],xmm20[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm26 = <2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm26 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8] ; AVX512BW-FAST-NEXT: vpshufb %xmm26, %xmm18, %xmm18 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm18 = ymm18[0,0,1,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm18, %zmm25, %zmm25 @@ -4517,7 +4517,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm28 = ymm28[2,2,3,3] ; AVX512BW-FAST-NEXT: vporq %ymm27, %ymm28, %ymm27 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm17 = xmm17[0],xmm19[0],xmm17[1],xmm19[1],xmm17[2],xmm19[2],xmm17[3],xmm19[3],xmm17[4],xmm19[4],xmm17[5],xmm19[5],xmm17[6],xmm19[6],xmm17[7],xmm19[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = <0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13] ; AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm17, %xmm17 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm17 = ymm17[0,0,1,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm17, %zmm27, %zmm17 @@ -4568,7 +4568,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb %ymm4, %ymm18, %ymm3 {%k1} ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm2 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm2 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = <3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm1 = [3,3,3,3,u,4,4,4,12,14,13,13,13,13,12,14] ; AVX512BW-FAST-NEXT: vpermd %zmm5, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421 ; AVX512BW-FAST-NEXT: kmovq %rax, %k1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll index 5554ff3b6dca0..de0c3ccbf51d5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll +++ 
b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-6.ll @@ -1180,7 +1180,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm0 @@ -1238,7 +1238,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,zero,xmm13[0],zero,zero,zero,zero,zero,xmm13[1],zero,zero,zero,zero ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm11, %xmm11 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm15, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0],xmm10[1,2],xmm12[3],xmm10[4,5],xmm12[6],xmm10[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] @@ -1339,7 +1339,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm13, %xmm9 ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm9[8],xmm10[8],xmm9[9],xmm10[9],xmm9[10],xmm10[10],xmm9[11],xmm10[11],xmm9[12],xmm10[12],xmm9[13],xmm10[13],xmm9[14],xmm10[14],xmm9[15],xmm10[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm5, %ymm9, %ymm9 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm5 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm5[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] @@ -1379,7 +1379,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm15 = xmm15[1,0,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm15 = xmm15[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm15 = ymm1[0],ymm3[0],ymm1[1],ymm3[1],ymm1[2],ymm3[2],ymm1[3],ymm3[3],ymm1[4],ymm3[4],ymm1[5],ymm3[5],ymm1[6],ymm3[6],ymm1[7],ymm3[7],ymm1[16],ymm3[16],ymm1[17],ymm3[17],ymm1[18],ymm3[18],ymm1[19],ymm3[19],ymm1[20],ymm3[20],ymm1[21],ymm3[21],ymm1[22],ymm3[22],ymm1[23],ymm3[23] ; AVX2-SLOW-NEXT: vmovdqa %ymm3, %ymm12 @@ -1413,7 +1413,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm6 = 
xmm8[8],xmm6[8],xmm8[9],xmm6[9],xmm8[10],xmm6[10],xmm8[11],xmm6[11],xmm8[12],xmm6[12],xmm8[13],xmm6[13],xmm8[14],xmm6[14],xmm8[15],xmm6[15] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm0, %ymm6, %ymm0 ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm2[8],ymm12[8],ymm2[9],ymm12[9],ymm2[10],ymm12[10],ymm2[11],ymm12[11],ymm2[12],ymm12[12],ymm2[13],ymm12[13],ymm2[14],ymm12[14],ymm2[15],ymm12[15],ymm2[24],ymm12[24],ymm2[25],ymm12[25],ymm2[26],ymm12[26],ymm2[27],ymm12[27],ymm2[28],ymm12[28],ymm2[29],ymm12[29],ymm2[30],ymm12[30],ymm2[31],ymm12[31] ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} ymm1 = ymm3[8],ymm1[8],ymm3[9],ymm1[9],ymm3[10],ymm1[10],ymm3[11],ymm1[11],ymm3[12],ymm1[12],ymm3[13],ymm1[13],ymm3[14],ymm1[14],ymm3[15],ymm1[15],ymm3[24],ymm1[24],ymm3[25],ymm1[25],ymm3[26],ymm1[26],ymm3[27],ymm1[27],ymm3[28],ymm1[28],ymm3[29],ymm1[29],ymm3[30],ymm1[30],ymm3[31],ymm1[31] @@ -1475,7 +1475,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm11 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] @@ -1517,7 +1517,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] @@ -1549,7 +1549,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpermq 
{{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] @@ -1611,7 +1611,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm7, %ymm11, %ymm11 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm13 = xmm7[6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] @@ -1653,7 +1653,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm15 = xmm15[10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm14, %ymm15, %ymm14 ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} ymm15 = ymm8[8],ymm6[8],ymm8[9],ymm6[9],ymm8[10],ymm6[10],ymm8[11],ymm6[11],ymm8[12],ymm6[12],ymm8[13],ymm6[13],ymm8[14],ymm6[14],ymm8[15],ymm6[15],ymm8[24],ymm6[24],ymm8[25],ymm6[25],ymm8[26],ymm6[26],ymm8[27],ymm6[27],ymm8[28],ymm6[28],ymm8[29],ymm6[29],ymm8[30],ymm6[30],ymm8[31],ymm6[31] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm15 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,22,23,28,29,26,27,30,31,30,31,30,31,30,31] @@ -1685,7 +1685,7 @@ define void @store_i8_stride6_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; 
AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm2 = ymm8[0],ymm6[0],ymm8[1],ymm6[1],ymm8[2],ymm6[2],ymm8[3],ymm6[3],ymm8[4],ymm6[4],ymm8[5],ymm6[5],ymm8[6],ymm6[6],ymm8[7],ymm6[7],ymm8[16],ymm6[16],ymm8[17],ymm6[17],ymm8[18],ymm6[18],ymm8[19],ymm6[19],ymm8[20],ymm6[20],ymm8[21],ymm6[21],ymm8[22],ymm6[22],ymm8[23],ymm6[23] ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm3[0],ymm4[0],ymm3[1],ymm4[1],ymm3[2],ymm4[2],ymm3[3],ymm4[3],ymm3[4],ymm4[4],ymm3[5],ymm4[5],ymm3[6],ymm4[6],ymm3[7],ymm4[7],ymm3[16],ymm4[16],ymm3[17],ymm4[17],ymm3[18],ymm4[18],ymm3[19],ymm4[19],ymm3[20],ymm4[20],ymm3[21],ymm4[21],ymm3[22],ymm4[22],ymm3[23],ymm4[23] @@ -2765,7 +2765,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vandnps %ymm4, %ymm10, %ymm4 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm1, %ymm1 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0],xmm6[1],xmm4[2,3],xmm6[4],xmm4[5,6],xmm6[7] @@ -2833,7 +2833,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm9 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm9, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2],xmm0[3,4],xmm2[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] @@ -2884,7 +2884,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vmovaps %ymm13, %ymm15 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm2, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,8,u,128,128,128,128,9,u,128,128,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm3[2],xmm2[3,4],xmm3[5],xmm2[6,7] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 @@ -2893,7 +2893,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,5,u,128,128,128,128,6,u,128,128,128,128,7,u] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3],xmm1[4],xmm0[5,6],xmm1[7] ; AVX1-ONLY-NEXT: 
vmovdqa {{.*#+}} xmm13 = [0,1,2,128,4,5,6,7,8,128,10,11,12,13,14,128] @@ -2948,7 +2948,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [128,128,13,u,128,128,128,128,14,u,128,128,128,128,15,u] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm4 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm4[1],xmm1[2,3],xmm4[4],xmm1[5,6],xmm4[7] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 @@ -2956,7 +2956,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = <10,u,128,128,128,128,11,u,128,128,128,128,12,u,128,128> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [10,u,128,128,128,128,11,u,128,128,128,128,12,u,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2],xmm1[3],xmm0[4,5],xmm1[6],xmm0[7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [0,128,2,3,4,5,6,128,8,9,10,11,12,128,14,15] @@ -3012,7 +3012,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vandnps %ymm10, %ymm13, %ymm10 ; AVX1-ONLY-NEXT: vorps %ymm10, %ymm11, %ymm11 ; AVX1-ONLY-NEXT: vextractf128 $1, %ymm11, %xmm10 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [2,u,128,128,128,128,3,u,128,128,128,128,4,u,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm15[0],xmm10[1,2],xmm15[3],xmm10[4,5],xmm15[6],xmm10[7] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm10 @@ -3020,7 +3020,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm15 ; AVX1-ONLY-NEXT: vpor %xmm15, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128> +; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,0,u,128,128,128,128,1,u,128,128,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm11[0,1],xmm3[2],xmm11[3,4],xmm3[5],xmm11[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [0,1,2,3,4,128,6,7,8,9,10,128,12,13,14,15] @@ -3158,7 +3158,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX2-SLOW-NEXT: vpshufb 
%xmm0, %xmm9, %xmm5 @@ -3204,7 +3204,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm6 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm6, %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm6, %xmm7 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] @@ -3229,7 +3229,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm5 ; AVX2-SLOW-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] @@ -3264,7 +3264,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,0,3,2,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm11 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3],xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,3,2,1,4,5,6,7] @@ -3303,7 +3303,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,4,4,4,4,8,9,10,11,12,12,12,12] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-SLOW-NEXT: vmovdqa %xmm7, %xmm14 ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] @@ -3323,7 +3323,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm8, %ymm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm11, %xmm6 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1] @@ -3358,7 +3358,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm1[0,0,0,1] -; AVX2-SLOW-NEXT: vmovdqa 
{{.*#+}} ymm1 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload @@ -3394,7 +3394,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0] @@ -3411,7 +3411,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb %ymm4, %ymm8, %ymm4 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,2,3] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm5, %xmm5 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] @@ -3483,7 +3483,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm5, %xmm14 @@ -3533,7 +3533,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%r8), %xmm5 ; AVX2-FAST-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] @@ -3557,7 +3557,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm0, 
%xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] @@ -3591,7 +3591,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -3627,7 +3627,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FAST-NEXT: vmovdqa %xmm7, %xmm15 ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm4 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] @@ -3647,7 +3647,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %ymm2, %ymm11, %ymm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] @@ -3683,7 +3683,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload @@ -3720,7 +3720,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = 
[255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] @@ -3736,7 +3736,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm11, %ymm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm8, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] @@ -3808,7 +3808,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm13, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm4[8],xmm2[8],xmm4[9],xmm2[9],xmm4[10],xmm2[10],xmm4[11],xmm2[11],xmm4[12],xmm2[12],xmm4[13],xmm2[13],xmm4[14],xmm2[14],xmm4[15],xmm2[15] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm2[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255,u,u,0,0,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm14 @@ -3858,7 +3858,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm5, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255] @@ -3882,7 +3882,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255] @@ -3916,7 +3916,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [10,11,8,9,6,7,12,13,14,15,14,15,14,15,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = 
[u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 @@ -3952,7 +3952,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm4, %ymm1, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm7, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,0,1] @@ -3972,7 +3972,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm2, %ymm11, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1] @@ -4008,7 +4008,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,3,0,1,6,7,4,5,8,9,8,9,8,9,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255,0,0,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm4, %ymm5, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm2 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm5 # 16-byte Folded Reload @@ -4045,7 +4045,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm7, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255,255,255,0,0,255,255] @@ -4061,7 +4061,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm11, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,2,3] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm3, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm8, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255,255,255,0,255,255] @@ -4128,10 +4128,10 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6,5] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX512F-SLOW-NEXT: vmovdqa %xmm7, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm7, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm7, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm26 ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm2, %zmm7, %zmm2 @@ -4228,7 +4228,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm21 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm5 = ymm12[0],ymm6[0],ymm12[1],ymm6[1],ymm12[2],ymm6[2],ymm12[3],ymm6[3],ymm12[4],ymm6[4],ymm12[5],ymm6[5],ymm12[6],ymm6[6],ymm12[7],ymm6[7],ymm12[16],ymm6[16],ymm12[17],ymm6[17],ymm12[18],ymm6[18],ymm12[19],ymm6[19],ymm12[20],ymm6[20],ymm12[21],ymm6[21],ymm12[22],ymm6[22],ymm12[23],ymm6[23] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm5, %ymm20 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm5, %xmm13 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm5 @@ -4252,14 +4252,14 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm10[0],xmm2[0],xmm10[1],xmm2[1],xmm10[2],xmm2[2],xmm10[3],xmm2[3],xmm10[4],xmm2[4],xmm10[5],xmm2[5],xmm10[6],xmm2[6],xmm10[7],xmm2[7] ; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm3 ; AVX512F-SLOW-NEXT: vprold $16, %xmm2, %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm26 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm15 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm10, %ymm17 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm1, %xmm14 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm0 @@ -4471,9 +4471,9 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm2, %zmm1 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%r9), 
%xmm4 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [u,6,u,5,u,8,u,7,u,9,u,9,u,9,u,9] ; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm4, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [u,2,u,1,u,0,u,3,u,4,u,4,u,4,u,4] ; AVX512F-FAST-NEXT: vpshufb %xmm1, %xmm4, %xmm3 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm4, %xmm19 ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm2, %zmm3, %zmm26 @@ -4536,7 +4536,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm10 = [0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0,0,2,0,1,0,0,0,3,0,0,0,0,0,4,0,0] ; AVX512F-FAST-NEXT: # ymm10 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [u,10,u,13,u,12,u,11,u,14,u,13,u,14,u,15] ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm3 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm5, %xmm17 ; AVX512F-FAST-NEXT: vpermt2q %zmm0, %zmm7, %zmm3 @@ -4561,7 +4561,7 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm6[8],xmm8[8],xmm6[9],xmm8[9],xmm6[10],xmm8[10],xmm6[11],xmm8[11],xmm6[12],xmm8[12],xmm6[13],xmm8[13],xmm6[14],xmm8[14],xmm6[15],xmm8[15] ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm15 ; AVX512F-FAST-NEXT: vpshufb %xmm12, %xmm6, %xmm8 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [6,u,5,u,8,u,7,u,9,u,9,u,9,u,9,u] ; AVX512F-FAST-NEXT: vpermt2q %zmm4, %zmm7, %zmm8 ; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm15, %xmm4 ; AVX512F-FAST-NEXT: vpternlogq $226, %zmm2, %zmm16, %zmm8 @@ -4569,11 +4569,11 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm5, %ymm12 ; AVX512F-FAST-NEXT: vpshufb %ymm2, %ymm13, %ymm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [2,u,1,u,0,u,3,u,4,u,4,u,4,u,4,u] ; AVX512F-FAST-NEXT: vpshufb %xmm6, %xmm14, %xmm6 ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm5 ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm14, %xmm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [10,u,13,u,12,u,11,u,14,u,13,u,14,u,15,u] ; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm14, %xmm14 ; AVX512F-FAST-NEXT: vpshufb %xmm13, %xmm15, %xmm13 ; AVX512F-FAST-NEXT: vmovdqa64 %ymm31, %ymm15 @@ -4972,14 +4972,14 @@ define void @store_i8_stride6_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm10, %zmm16, %zmm10 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm12, %zmm10 {%k2} ; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm12 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm27 = <8,u,9,u,u,u,u,u,u,u,5,u,6,u,7,u> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm27 = [8,u,9,u,u,u,u,u,u,u,5,u,6,u,7,u] ; AVX512BW-FAST-NEXT: vpshufb %xmm27, %xmm12, %xmm16 ; AVX512BW-FAST-NEXT: vpmovzxbw {{.*#+}} xmm28 = xmm12[0],zero,xmm12[1],zero,xmm12[2],zero,xmm12[3],zero,xmm12[4],zero,xmm12[5],zero,xmm12[6],zero,xmm12[7],zero ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm16, %zmm28, %zmm16 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm28 = [2,1,0,3,2,1,0,3,2,1,0,3,4,4,4,4,22,21,16,23,22,21,16,23,22,21,16,23,17,17,17,17] ; AVX512BW-FAST-NEXT: vpermw %zmm16, %zmm28, %zmm10 {%k1} ; 
AVX512BW-FAST-NEXT: vmovdqa64 32(%r9), %xmm16 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm29 = [u,8,u,9,u,10,u,11,u,4,u,5,u,6,u,7] ; AVX512BW-FAST-NEXT: vpshufb %xmm29, %xmm16, %xmm30 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm31 = xmm16[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm30, %zmm31, %zmm30 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index b7baf7b40d714..482da013d741b 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -525,7 +525,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero @@ -563,7 +563,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm2, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[1,5,9,13],zero,zero,zero,ymm1[2,6,10,14],zero,zero,zero,ymm1[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,5,u,5,1,3,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,u,5,1,3,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-NEXT: vpor %ymm3, %ymm1, %ymm1 @@ -609,7 +609,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,ymm0[4],zero,zero,zero,zero,zero,zero,ymm0[5],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm0[23,31],zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[4,12],zero,zero,zero,zero,zero,ymm1[5,13],zero,zero,zero,zero,zero,ymm1[6,14,22],zero,zero,zero,zero,zero,zero,ymm1[23],zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = 
[0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm5, %ymm3 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,8],zero,zero,zero,zero,zero,ymm2[1,9],zero,zero,zero,zero,zero,ymm2[2,10,18,26],zero,zero,zero,zero,zero,ymm2[19,27],zero,zero,zero,zero,zero,ymm2[20,28] ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,zero,zero,ymm4[1,9],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[19,27],zero,zero,zero,zero,zero,ymm4[20,28],zero,zero @@ -682,13 +682,13 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] ; AVX512F-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [0,2,4,6,0,2,4,6] ; AVX512F-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[0,4,8],zero,zero,zero,zero,ymm0[1,5,9],zero,zero,zero,zero,ymm0[18,22,26],zero,zero,zero,zero,ymm0[19,23,27],zero,zero,zero,zero -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,5,u,5,1,3,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,u,5,1,3,u] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,4,8],zero,zero,zero,zero,ymm1[1,5,9],zero,zero,zero,zero,ymm1[2,6,18],zero,zero,zero,zero,ymm1[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 @@ -761,13 +761,13 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2 ; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u] ; AVX512BW-FAST-NEXT: vpermi2q %ymm3, %ymm0, %ymm1 ; AVX512BW-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm0 = [1,3,5,7,1,3,5,7] ; AVX512BW-FAST-NEXT: # ymm0 = mem[0,1,0,1] ; AVX512BW-FAST-NEXT: vpermd %ymm2, %ymm0, %ymm0 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[1,5,9,13],zero,zero,zero,ymm0[2,6,10,14],zero,zero,zero,ymm0[19,23,27,31],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <1,3,5,u,5,1,3,u> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [1,3,5,u,5,1,3,u] ; AVX512BW-FAST-NEXT: vpermd %ymm1, %ymm3, %ymm3 ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[0,4,8],zero,zero,zero,zero,ymm3[1,5,9],zero,zero,zero,zero,ymm3[2,6,18],zero,zero,zero,zero,ymm3[23,27,19],zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-FAST-NEXT: vpor %ymm0, %ymm3, %ymm0 @@ -1272,7 +1272,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm8[2,3,0,1] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,zero,zero,zero,zero,ymm12[5],zero,zero,zero,zero,zero,zero,ymm12[6],zero,zero,zero,zero,zero,ymm12[23],zero,zero,zero,zero,zero,zero,ymm12[24],zero,zero,zero,zero,zero ; AVX2-SLOW-NEXT: vpor %ymm12, %ymm11, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = 
[u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm11 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm10[2,3,0,1] @@ -1280,7 +1280,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpor %ymm12, %ymm11, %ymm11 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm12 = xmm2[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9 @@ -1289,7 +1289,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm12 = xmm2[1,1,0,0,4,5,6,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[0,1,2,0] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm8[0,2,0,2] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28] @@ -1303,7 +1303,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm12 = xmm2[0,1,2,3,4,5,5,6] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[3,1,1,3] ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[1],zero,zero,zero,zero,zero,ymm7[10,2],zero,zero,zero,zero,zero,ymm7[11,3],zero,zero,zero,zero,zero,ymm7[20,28],zero,zero,zero,zero,zero,ymm7[21,29],zero,zero,zero @@ -1321,7 +1321,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,u,u,u,u,u,13,12,u,u,u,u,u,15,14,u] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,6,7,7,7] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,1,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 @@ -1351,7 +1351,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpermd %ymm7, %ymm11, %ymm7
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm10[0,2,0,2]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = ymm11[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm7, %ymm7
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm9[0,2,0,2]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,ymm11[0,8],zero,zero,zero,zero,zero,ymm11[1,9],zero,zero,zero,zero,zero,ymm11[18,26],zero,zero,zero,zero,zero,ymm11[19,27],zero,zero,zero,zero,zero,ymm11[20,28]
@@ -1366,7 +1366,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: # ymm12 = mem[0,1,0,1]
 ; AVX2-FAST-NEXT: vpermd %ymm10, %ymm12, %ymm13
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm13[0,4,u,u,u,u,u,1,5,u,u,u,u,u,2,6,u,u,u,u,u,19,23,u,u,u,u,u,24,28,u,u]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm13, %ymm11, %ymm11
 ; AVX2-FAST-NEXT: vpermd %ymm9, %ymm12, %ymm13
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = zero,zero,zero,zero,zero,ymm13[1,5],zero,zero,zero,zero,zero,ymm13[2,6],zero,zero,zero,zero,zero,ymm13[19,23],zero,zero,zero,zero,zero,ymm13[24,28],zero,zero,zero,zero
@@ -1381,7 +1381,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm13, %ymm12
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[3,1,1,3]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[1],zero,zero,zero,zero,zero,ymm8[10,2],zero,zero,zero,zero,zero,ymm8[11,3],zero,zero,zero,zero,zero,ymm8[20,28],zero,zero,zero,zero,zero,ymm8[21,29],zero,zero,zero
@@ -1429,7 +1429,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm7[2,3,0,1]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = zero,zero,zero,zero,zero,zero,ymm11[5],zero,zero,zero,zero,zero,zero,ymm11[6],zero,zero,zero,zero,zero,ymm11[23],zero,zero,zero,zero,zero,zero,ymm11[24],zero,zero,zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm9, %ymm9
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm8, %ymm9, %ymm9
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm10[4],zero,zero,zero,zero,zero,zero,ymm10[5],zero,zero,zero,zero,zero,zero,ymm10[6],zero,zero,zero,zero,zero,zero,zero,ymm10[23],zero,zero,zero,zero,zero,zero,ymm10[24],zero,zero
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm10[2,3,0,1]
@@ -1438,7 +1438,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r10), %xmm8
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,0,1,0]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm11, %ymm9
@@ -1446,7 +1446,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,1,0]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm10[0,2,0,2]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm12[u,u,u,u,0,8,u,u,u,u,u,1,9,u,u,u,u,u,18,26,u,u,u,u,u,19,27,u,u,u,u,u]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm7[0,2,0,2]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = zero,zero,ymm12[0,8],zero,zero,zero,zero,zero,ymm12[1,9],zero,zero,zero,zero,zero,ymm12[18,26],zero,zero,zero,zero,zero,ymm12[19,27],zero,zero,zero,zero,zero,ymm12[20,28]
@@ -1459,7 +1459,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[1,3,1,3]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm10 = ymm10[u,u,u,1,9,u,u,u,u,u,2,10,u,u,u,u,u,19,27,u,u,u,u,u,20,28,u,u,u,u,u,21]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm10, %ymm12, %ymm10
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[3,1,1,3]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[1],zero,zero,zero,zero,zero,ymm6[10,2],zero,zero,zero,zero,zero,ymm6[11,3],zero,zero,zero,zero,zero,ymm6[20,28],zero,zero,zero,zero,zero,ymm6[21,29],zero,zero,zero
@@ -1617,7 +1617,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
 ; AVX512F-FAST-NEXT: vporq %zmm4, %zmm6, %zmm4
 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm6 = xmm0[1,1,0,0,4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,1,u,1,u,0,0,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [u,1,u,1,u,0,0,u]
 ; AVX512F-FAST-NEXT: vpermd %ymm6, %ymm7, %ymm6
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,0]
@@ -2537,24 +2537,24 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,u,2,3],zero,xmm1[u,u,u,u,4,5],zero,xmm1[u,u,u]
 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3
 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm8 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm3
 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm7
 ; AVX1-ONLY-NEXT: vmovdqa 16(%rcx), %xmm1
 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm0
 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdx), %xmm2
 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm3
 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
@@ -2562,7 +2562,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm15
 ; AVX1-ONLY-NEXT: vmovdqa 16(%rsi), %xmm10
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128]
 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm0
 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm6
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,7],zero,xmm6[u,u,u,u,u,8],zero,xmm6[u,u,u,u,u,9]
@@ -2592,7 +2592,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm2
 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128]
 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm2
 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8
 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm7[0],xmm3[1],xmm7[1],xmm3[2],xmm7[2],xmm3[3],xmm7[3],xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
@@ -2617,7 +2617,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm11, %xmm2
 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm9, %xmm11
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm11, %ymm11
 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255]
@@ -2661,13 +2661,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm6[8],xmm10[8],xmm6[9],xmm10[9],xmm6[10],xmm10[10],xmm6[11],xmm10[11],xmm6[12],xmm10[12],xmm6[13],xmm10[13],xmm6[14],xmm10[14],xmm6[15],xmm10[15]
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm0
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm4, %xmm4
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm6
 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
@@ -2723,17 +2723,17 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm5, %ymm2
 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm1, %ymm6
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <u,u,0,1,u,u,u,u,u,2,3,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,0,1,u,u,u,u,u,2,3,u,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm9, %xmm3
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm12[u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5]
 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm7
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm5, %ymm5
 ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm4, %ymm1
 ; AVX1-ONLY-NEXT: vandps %ymm4, %ymm5, %ymm4
 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u]
 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm5 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm5, %xmm5
 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm5[4,5,6,7,8,9,10,11,12,13,14,15],xmm14[0,1,2,3]
@@ -2741,7 +2741,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[10],zero,xmm7[u,u,u,u,13,12],zero,xmm7[u,u,u,u,15,14],zero
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[13,u,u,u,u],zero,zero,xmm8[14,u,u,u,u],zero,zero,xmm8[15]
 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm5
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5
 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
@@ -2816,14 +2816,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: # ymm10 = mem[0,1,0,1]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm10, %ymm9, %ymm8, %ymm8
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm7[27],zero,ymm7[27,28,29,30],zero,ymm7[28],zero,ymm7[26,27,30,31],zero,ymm7[29]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm2[27],zero,zero,zero,zero,ymm2[30],zero,ymm2[28],zero,zero,zero,zero,ymm2[31],zero
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3]
 ; AVX2-SLOW-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm0, %ymm8, %ymm0
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
@@ -2840,7 +2840,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm9 = xmm0[8],xmm14[8],xmm0[9],xmm14[9],xmm0[10],xmm14[10],xmm0[11],xmm14[11],xmm0[12],xmm14[12],xmm0[13],xmm14[13],xmm0[14],xmm14[14],xmm0[15],xmm14[15]
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm9, %ymm8
 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm12
 ; AVX2-SLOW-NEXT: vmovdqa (%r8), %xmm13
@@ -2851,7 +2851,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm7 = xmm15[0,1,2,3,4,5,5,6]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[2,2,3,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm1, %ymm9, %ymm7, %ymm1
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm8, %ymm1, %ymm1
@@ -2864,7 +2864,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpor %xmm7, %xmm9, %xmm7
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm1, %ymm7, %ymm1
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm12[4,u,u,u],zero,xmm12[7],zero,xmm12[5,u,u,u],zero,xmm12[8],zero,xmm12[6]
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm13[4],zero,xmm13[u,u,u,7],zero,xmm13[5],zero,xmm13[u,u,u,8],zero,xmm13[6],zero
@@ -2872,7 +2872,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm9 = xmm15[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,0]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm7, %ymm9, %ymm7
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm9
@@ -2882,7 +2882,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3],xmm10[4],xmm11[4],xmm10[5],xmm11[5],xmm10[6],xmm11[6],xmm10[7],xmm11[7]
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
@@ -2890,7 +2890,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm7 = xmm15[1,1,0,0,4,5,6,7]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[0,1,2,0]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,0]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm10
@@ -2906,7 +2906,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18],zero,zero,zero,zero,ymm3[21],zero,ymm3[19],zero,zero,zero,zero,ymm3[22],zero,ymm3[20],zero
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
 ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm7, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm1, %ymm0, %ymm0
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[20],zero,ymm2[18],zero,zero,zero,zero,ymm2[21],zero,ymm2[19],zero,zero,zero,zero,ymm2[22]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
@@ -2918,7 +2918,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm7 = ymm11[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[0,1,1,3,4,5,5,7]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
@@ -2932,7 +2932,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero,zero,zero,ymm3[27]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
 ; AVX2-SLOW-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = <u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm2[25],zero,ymm2[23],zero,zero,zero,zero,ymm2[26],zero,ymm2[24],zero,zero
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3]
@@ -2943,7 +2943,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX2-SLOW-NEXT: vmovdqa %ymm11, %ymm12
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm7, %ymm8, %ymm7
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm8, %ymm1, %ymm7, %ymm1
@@ -2953,13 +2953,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,zero
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,zero,zero
 ; AVX2-SLOW-NEXT: vpor %ymm5, %ymm3, %ymm3
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm4, %ymm3, %ymm3
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = zero,ymm13[1,2,3,0,1,14],zero,ymm13[0,1,0,1,14,15],zero,ymm13[15,16,17,18,19,16],zero,ymm13[30,31,16,17,16,17],zero,ymm13[31,30,31]
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[13],zero,zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero
 ; AVX2-SLOW-NEXT: vpor %ymm4, %ymm2, %ymm2
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm4 = ymm12[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm3, %ymm2, %ymm2
@@ -2994,7 +2994,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
 ; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm11
 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm5 = xmm11[0,1,2,3,4,5,5,6]
@@ -3006,7 +3006,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
@@ -3018,7 +3018,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u]
 ; AVX2-FAST-NEXT: vpor %xmm6, %xmm7, %xmm6
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0]
@@ -3026,7 +3026,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero
 ; AVX2-FAST-NEXT: vpor %xmm7, %xmm8, %xmm7
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7
 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm6
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
@@ -3039,7 +3039,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9
 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm11[1,1,0,0,4,5,6,7]
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [0,1,0,1,2,0,0,1]
@@ -3047,7 +3047,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9
@@ -3061,7 +3061,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
 ; AVX2-FAST-NEXT: vpor %ymm11, %ymm12, %ymm11
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm11 = ymm7[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [4,5,4,5,5,7,4,5]
@@ -3071,7 +3071,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22],zero
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
 ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm12, %ymm11, %ymm11
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
@@ -3085,14 +3085,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
 ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
 ; AVX2-FAST-NEXT: vpor %ymm12, %ymm13, %ymm12
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm12 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
@@ -3108,7 +3108,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
 ; AVX2-FAST-NEXT: vpor %ymm13, %ymm14, %ymm13
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
@@ -3117,7 +3117,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpor %ymm13, %ymm14, %ymm13
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
@@ -3127,13 +3127,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero
 ; AVX2-FAST-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
 ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u>
+; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
 ; AVX2-FAST-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
 ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
@@ -3165,7 +3165,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm15[8],xmm13[8],xmm15[9],xmm13[9],xmm15[10],xmm13[10],xmm15[11],xmm13[11],xmm15[12],xmm13[12],xmm15[13],xmm13[13],xmm15[14],xmm13[14],xmm15[15],xmm13[15]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rax), %xmm11
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm5 = xmm11[8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13]
@@ -3175,7 +3175,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm14[8],xmm12[8],xmm14[9],xmm12[9],xmm14[10],xmm12[10],xmm14[11],xmm12[11],xmm14[12],xmm12[12],xmm14[13],xmm12[13],xmm14[14],xmm12[14],xmm14[15],xmm12[15]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm6, %ymm5, %ymm5
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
@@ -3187,7 +3187,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm7 = xmm9[u,u,u,7],zero,xmm9[5],zero,xmm9[u,u,u,8],zero,xmm9[6],zero,xmm9[u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm6, %xmm7, %xmm6
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm6 = xmm11[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,0]
@@ -3195,7 +3195,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm8 = xmm14[4],zero,xmm14[u,u,u,7],zero,xmm14[5],zero,xmm14[u,u,u,8],zero,xmm14[6],zero
 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm7, %xmm8, %xmm7
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm7, %ymm6, %ymm7
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm6
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255]
@@ -3208,14 +3208,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm10, %ymm13, %ymm9, %ymm9
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm11[2,3,2,3,0,1,0,1,8,9,10,11,2,3,2,3]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,0]
 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm14[0],xmm12[0],xmm14[1],xmm12[1],xmm14[2],xmm12[2],xmm14[3],xmm12[3],xmm14[4],xmm12[4],xmm14[5],xmm12[5],xmm14[6],xmm12[6],xmm14[7],xmm12[7]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm11 = xmm11[u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm11, %ymm10, %ymm10
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm9, %ymm10, %ymm9
@@ -3229,14 +3229,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[30],zero,ymm1[28],zero,zero,zero,zero,ymm1[31],zero,ymm1[29],zero,zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[27],zero,ymm6[27,28,29,30],zero,ymm6[28],zero,ymm6[26,27,30,31],zero,ymm6[29]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29],zero,ymm8[27],zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm12, %ymm11
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u,255,255,255,255,0,0,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm10, %ymm11, %ymm10
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm11 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3]
@@ -3252,7 +3252,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[25],zero,ymm0[23],zero,zero,zero,zero,ymm0[26],zero,ymm0[24],zero,zero,zero,zero,ymm0[27]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = <u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm12 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm8[25],zero,ymm8[23],zero,zero,zero,zero,ymm8[26],zero,ymm8[24],zero,zero
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[2,3,2,3]
@@ -3261,7 +3261,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm12, %ymm13, %ymm12
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm13 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm13, %ymm11, %ymm12, %ymm11
@@ -3275,7 +3275,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,ymm1[21],zero,ymm1[19],zero,zero,zero,zero,ymm1[22],zero,ymm1[20],zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,3,2,3]
 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm13, %ymm14, %ymm13
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm13 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm8[20],zero,ymm8[18],zero,zero,zero,zero,ymm8[21],zero,ymm8[19],zero,zero,zero,zero,ymm8[22]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm13 = ymm13[2,3,2,3]
@@ -3284,7 +3284,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm13, %ymm14, %ymm13
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm14 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,20,21,18,19,18,19,20,21,18,19,20,21,28,29,30,31]
 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm14 = ymm14[2,2,3,2]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm13, %ymm14, %ymm13
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm14, %ymm12, %ymm13, %ymm12
@@ -3294,13 +3294,13 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm2, %ymm0, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,ymm6[1,2,3,0,1,14],zero,ymm6[0,1,0,1,14,15],zero,ymm6[15,16,17,18,19,16],zero,ymm6[30,31,16,17,16,17],zero,ymm6[31,30,31]
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm8[13],zero,zero,zero,zero,zero,zero,ymm8[14],zero,zero,zero,zero,zero,zero,ymm8[15],zero,zero,zero,zero,zero,zero,ymm8[16],zero,zero,zero,zero,zero,zero,ymm8[17],zero,zero,zero
 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm7[12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31]
-; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u>
+; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
 ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
@@ -3328,7 +3328,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-SLOW-NEXT: vmovdqa (%r10), %ymm4
 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,u,u,u,u,26,27,24,25]
 ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm8 = ymm4[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15]
-; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23>
+; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23]
 ; AVX512F-SLOW-NEXT: vpermi2d %zmm7, %zmm8, %zmm9
 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm6[20],zero,ymm6[18],zero,zero,zero,zero,ymm6[21],zero,ymm6[19],zero,zero,zero,zero,ymm6[22]
 ; AVX512F-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero
@@ -3501,7 +3501,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX512F-FAST-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm7, %zmm15
 ; AVX512F-FAST-NEXT: vmovdqa (%r10), %xmm10
 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm7 = xmm10[1,1,0,0,4,5,6,7]
-; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = <u,1,u,1,u,0,0,u>
+; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm13 = [u,1,u,1,u,0,0,u]
 ; AVX512F-FAST-NEXT: vpermd %ymm7, %ymm13, %ymm7
 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm13 = xmm10[4,5,4,5,4,5,8,9,6,7,6,7,6,7,6,7]
 ; AVX512F-FAST-NEXT: vpermq {{.*#+}} ymm13 = ymm13[0,0,1,0]
@@ -5397,38 +5397,38 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vmovdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u],zero,zero,xmm6[9,u,u,u,u],zero,zero,xmm6[10,u,u,u]
 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,7,u,u,u,u,u,128,8,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm1
 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,7,128,u,u,u,u,u,8,128,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm10, %xmm3
 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm10
 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,u,u,u,u,5,6,128,u,u,u,u,12,13,128,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [6,u,u,u,u,128,128,7,u,u,u,u,128,128,8,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm6, %xmm3
 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm6
 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm1, %xmm1
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,128,7,u,u,u,u,u,128,8,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm12, %xmm1
 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm12
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,u,u,u,u,u,8,128,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm13, %xmm3
 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [u,128,7,u,u,u,u,u,128,8,u,u,u,u,u,128]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,7,128,u,u,u,u,u,8,128,u,u,u,u,u,9>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [u,7,128,u,u,u,u,u,8,128,u,u,u,u,u,9]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm3
 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm13
 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm7[8],xmm11[8],xmm7[9],xmm11[9],xmm7[10],xmm11[10],xmm7[11],xmm11[11],xmm7[12],xmm11[12],xmm7[13],xmm11[13],xmm7[14],xmm11[14],xmm7[15],xmm11[15]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm3
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0]
@@ -5476,7 +5476,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6
 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3],xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = <u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [u,u,u,10,11,u,u,u,u,u,12,13,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm7
 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm9
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
@@ -5507,7 +5507,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm3, %xmm0
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
@@ -5525,11 +5525,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vmovdqa 48(%r9), %xmm1
 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,4,u,u,u,u,128,128,5,u,u,u,u,128,128]
 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm0
 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [8,9,128,u,u,u,u,10,11,128,u,u,u,u,12,13]
 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm1
 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm12[u,u],zero,zero,xmm12[2,u,u,u,u],zero,zero,xmm12[3,u,u,u,u]
@@ -5549,7 +5549,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdx), %xmm13
 ; AVX1-ONLY-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm13[0],xmm2[0],xmm13[1],xmm2[1],xmm13[2],xmm2[2],xmm13[3],xmm2[3],xmm13[4],xmm2[4],xmm13[5],xmm2[5],xmm13[6],xmm2[6],xmm13[7],xmm2[7]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm4, %xmm3
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm11, %ymm3
@@ -5617,17 +5617,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm14[8],xmm2[8],xmm14[9],xmm2[9],xmm14[10],xmm2[10],xmm14[11],xmm2[11],xmm14[12],xmm2[12],xmm14[13],xmm2[13],xmm14[14],xmm2[14],xmm14[15],xmm2[15]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [2,u,u,u,u,u,5,4,u,u,u,u,u,7,6,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = <u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,8,9,u,u,u,u,u,10,11,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm4, %xmm4
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm5[8],xmm3[8],xmm5[9],xmm3[9],xmm5[10],xmm3[10],xmm5[11],xmm3[11],xmm5[12],xmm3[12],xmm5[13],xmm3[13],xmm5[14],xmm3[14],xmm5[15],xmm3[15]
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm5[8],xmm3[9],xmm5[9],xmm3[10],xmm5[10],xmm3[11],xmm5[11],xmm3[12],xmm5[12],xmm3[13],xmm5[13],xmm3[14],xmm5[14],xmm3[15],xmm5[15]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [6,u,u,u,u,u,9,8,u,u,u,u,u,11,10,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm2
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = <u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [u,2,3,u,u,u,u,u,4,5,u,u,u,u,u,6]
 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm3
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm3 = [0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255]
@@ -5663,7 +5663,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[u,u,u,u,u,10,11,u,u,u,u,u,12,13,u,u]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = <4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [4,5,u,u,u,u,u,6,7,u,u,u,u,u,8,9]
 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm11
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm11, %ymm4
 ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm9, %ymm3
@@ -5718,12 +5718,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm10[8],xmm8[8],xmm10[9],xmm8[9],xmm10[10],xmm8[10],xmm10[11],xmm8[11],xmm10[12],xmm8[12],xmm10[13],xmm8[13],xmm10[14],xmm8[14],xmm10[15],xmm8[15]
 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = <u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [u,u,12,13,u,u,u,u,u,14,15,u,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm3
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm3
 ; AVX1-ONLY-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm9[8],xmm13[8],xmm9[9],xmm13[9],xmm9[10],xmm13[10],xmm9[11],xmm13[11],xmm9[12],xmm13[12],xmm9[13],xmm13[13],xmm9[14],xmm13[14],xmm9[15],xmm13[15]
 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = <u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,12,13,u,u,u,u,u,14,15,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm0
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255]
@@ -5750,14 +5750,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm12[13],zero,zero,zero,zero,zero,zero,xmm12[14],zero,zero,zero,zero,zero,zero,xmm12[15]
 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0
 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,0,1,u,u,u,u,u,2,3,u,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,0,1,u,u,u,u,u,2,3,u,u,u,u,u]
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm0, %xmm0
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm3
 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm9
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm3, %ymm0
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = <0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,u,u,u,u,u,2,3,u,u,u,u,u,4,5]
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm4
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
@@ -5773,11 +5773,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm14[13,u,u,u,u],zero,zero,xmm14[14,u,u,u,u],zero,zero,xmm14[15]
 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = <u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,0,1,u,u,u,u,u,2,3,u,u,u]
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm10
 ; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm10[4,5,6,7,8,9,10,11,12,13,14,15],xmm12[0,1,2,3]
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,u,0,1,12,u,u,u,u,7,8,13,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm5
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm7, %ymm5
 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm7 = [0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255]
@@ -5790,7 +5790,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm8, %xmm10
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm5, %ymm10, %ymm5
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,6,7,u,u,u,u,u,8,9,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm8, %xmm10
 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm12
 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm12, %ymm10
@@ -5798,10 +5798,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm12, %ymm5
 ; AVX1-ONLY-NEXT: vandps %ymm12, %ymm10, %ymm10
 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm10, %ymm5
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,128,128,2,u,u,u,u,128,128,3,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,128,128,2,u,u,u,u,128,128,3,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm14, %xmm10
 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload
-; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = <u,u,4,5,128,u,u,u,u,6,7,128,u,u,u,u>
+; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,4,5,128,u,u,u,u,6,7,128,u,u,u,u]
 ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm8, %xmm13
 ; AVX1-ONLY-NEXT: vpor %xmm10, %xmm13, %xmm10
 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm8, %xmm13
@@ -5926,15 +5926,15 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm6[25],zero,ymm6[23],zero,zero,zero,zero,ymm6[26],zero,ymm6[24],zero,zero,zero,zero,ymm6[27]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
 ; AVX2-SLOW-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
@@ -5954,7 +5954,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-SLOW-NEXT: vmovdqa (%rdx), %ymm1
 ; AVX2-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -5980,7 +5980,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
@@ -5990,16 +5990,16 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7]
 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm14
 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5]
 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm0, %xmm0
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; AVX2-SLOW-NEXT: vmovdqa 32(%rcx), %xmm11
 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdx), %xmm9
 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3],xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9]
 ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm10
 ; AVX2-SLOW-NEXT: vmovdqa (%rsi), %xmm0
 ; AVX2-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -6021,7 +6021,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vmovdqa 32(%r8), %xmm2
 ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7]
 ; AVX2-SLOW-NEXT: vmovdqa %xmm2, %xmm8
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u]
 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm1
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; AVX2-SLOW-NEXT: vmovdqa 32(%rax), %xmm2
@@ -6029,7 +6029,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm3 = xmm2[1,1,0,0,4,5,6,7]
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,2,0]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,0]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm1, %ymm3, %ymm1
 ; AVX2-SLOW-NEXT: vmovdqa (%r9), %xmm3
 ; AVX2-SLOW-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
@@ -6049,19 +6049,19 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm4, %ymm2, %ymm0
 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128]
 ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm13, %xmm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9]
 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm14, %xmm12
 ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm12, %xmm2
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = <u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u]
 ; AVX2-SLOW-NEXT: vpshufb %xmm15, %xmm11, %xmm12
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u]
 ; AVX2-SLOW-NEXT: vpshufb %xmm0, %xmm9, %xmm14
 ; AVX2-SLOW-NEXT: vpor %xmm12, %xmm14, %xmm12
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1]
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255>
+; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm2, %ymm12, %ymm2
 ; AVX2-SLOW-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
@@ -6075,10 +6075,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1]
 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1]
 ; AVX2-SLOW-NEXT: vpblendvb %ymm14, %ymm1, %ymm0, %ymm12
-; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6>
[128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm10, %xmm2 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] ; AVX2-SLOW-NEXT: vmovdqa %xmm8, %xmm3 ; AVX2-SLOW-NEXT: vpshufb %xmm4, %xmm8, %xmm14 ; AVX2-SLOW-NEXT: vpor %xmm2, %xmm14, %xmm2 @@ -6087,7 +6087,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm14, %xmm5, %xmm15 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 ; AVX2-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 @@ -6107,13 +6107,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm11[8],xmm9[8],xmm11[9],xmm9[9],xmm11[10],xmm9[10],xmm11[11],xmm9[11],xmm11[12],xmm9[12],xmm11[13],xmm9[13],xmm11[14],xmm9[14],xmm11[15],xmm9[15] ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm13, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm13[8],mem[8],xmm13[9],mem[9],xmm13[10],mem[10],xmm13[11],mem[11],xmm13[12],mem[12],xmm13[13],mem[13],xmm13[14],mem[14],xmm13[15],mem[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX2-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX2-SLOW-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm0 ; AVX2-SLOW-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm6, %xmm1 # 16-byte Folded Reload ; AVX2-SLOW-NEXT: # xmm1 = xmm6[8],mem[8],xmm6[9],mem[9],xmm6[10],mem[10],xmm6[11],mem[11],xmm6[12],mem[12],xmm6[13],mem[13],xmm6[14],mem[14],xmm6[15],mem[15] @@ -6126,13 +6126,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm3[8],xmm10[8],xmm3[9],xmm10[9],xmm3[10],xmm10[10],xmm3[11],xmm10[11],xmm3[12],xmm10[12],xmm3[13],xmm10[13],xmm3[14],xmm10[14],xmm3[15],xmm10[15] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} xmm3 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} xmm4 = xmm5[0,1,2,3,4,5,5,6] ; AVX2-SLOW-NEXT: vpshufd 
{{.*#+}} xmm4 = xmm4[2,2,3,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm2 ; AVX2-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm8[8],xmm15[9],xmm8[9],xmm15[10],xmm8[10],xmm15[11],xmm8[11],xmm15[12],xmm8[12],xmm15[13],xmm8[13],xmm15[14],xmm8[14],xmm15[15],xmm8[15] ; AVX2-SLOW-NEXT: vpshufb %xmm3, %xmm4, %xmm3 @@ -6160,7 +6160,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: # ymm5 = mem[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[0,1,1,3,4,5,5,7] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm0 ; AVX2-SLOW-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm14 # 32-byte Reload @@ -6202,7 +6202,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm7, %ymm9, %ymm7 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm4, %ymm4 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm15, %ymm6 @@ -6231,7 +6231,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm12[25],zero,ymm12[23],zero,zero,zero,zero,ymm12[26],zero,ymm12[24],zero,zero,zero,zero,ymm12[27] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] ; AVX2-SLOW-NEXT: vpor %ymm5, %ymm6, %ymm5 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] ; AVX2-SLOW-NEXT: vpblendvb %ymm3, %ymm4, %ymm5, %ymm4 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm5 = ymm14[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm14[25],zero,ymm14[23],zero,zero,zero,zero,ymm14[26],zero,ymm14[24],zero,zero ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,3,2,3] @@ -6241,7 +6241,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm6 = ymm8[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-SLOW-NEXT: vmovdqa %ymm8, %ymm14 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm7 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5 ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm6 = 
[0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm1 @@ -6258,7 +6258,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm2, %ymm11 ; AVX2-SLOW-NEXT: vpor %ymm9, %ymm11, %ymm9 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] ; AVX2-SLOW-NEXT: vpblendvb %ymm11, %ymm5, %ymm9, %ymm5 ; AVX2-SLOW-NEXT: vpshufb %ymm6, %ymm0, %ymm6 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -6278,7 +6278,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm10 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-SLOW-NEXT: vpshufb %ymm10, %ymm1, %ymm11 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm12 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm12, %ymm8, %ymm11, %ymm8 ; AVX2-SLOW-NEXT: vpshufb %ymm7, %ymm13, %ymm7 ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm11 # 32-byte Reload @@ -6302,15 +6302,15 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm9, %ymm8, %ymm8 ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm2 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm2, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm15[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,2,2] -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u] ; AVX2-SLOW-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-SLOW-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] @@ -6374,17 +6374,17 @@ define void @store_i8_stride7_vf64(ptr 
%in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] ; AVX2-FAST-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6402,15 +6402,15 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u] ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] @@ -6423,16 +6423,16 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX2-FAST-NEXT: vmovdqa %xmm1, %xmm14 ; AVX2-FAST-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vmovdqa 32(%rcx), %xmm10 ; AVX2-FAST-NEXT: vmovdqa 32(%rdx), %xmm7 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm7[0],xmm10[0],xmm7[1],xmm10[1],xmm7[2],xmm10[2],xmm7[3],xmm10[3],xmm7[4],xmm10[4],xmm7[5],xmm10[5],xmm7[6],xmm10[6],xmm7[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vmovdqa (%rsi), %xmm11 @@ -6458,10 +6458,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa 32(%r9), %xmm8 ; AVX2-FAST-NEXT: vmovdqa 32(%r8), %xmm13 ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm8[0],xmm13[1],xmm8[1],xmm13[2],xmm8[2],xmm13[3],xmm8[3],xmm13[4],xmm8[4],xmm13[5],xmm8[5],xmm13[6],xmm8[6],xmm13[7],xmm8[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm1, %ymm5, %ymm4, %ymm0 ; AVX2-FAST-NEXT: vmovdqa (%rax), %xmm4 ; AVX2-FAST-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6479,20 +6479,20 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpblendvb %ymm2, %ymm6, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX2-FAST-NEXT: vpor %xmm0, %xmm3, %xmm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = 
[u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm15 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm15, %xmm7, %xmm14 ; AVX2-FAST-NEXT: vpor %xmm6, %xmm14, %xmm6 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm0, %ymm6, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm11, %xmm1 @@ -6506,10 +6506,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-NEXT: vpblendvb %ymm14, %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm8, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm8, %xmm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] ; AVX2-FAST-NEXT: vpshufb %xmm6, %xmm13, %xmm14 ; AVX2-FAST-NEXT: vpor %xmm3, %xmm14, %xmm3 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,1,0,1] @@ -6517,7 +6517,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm8, %xmm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm3, %ymm15, %ymm3 ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm9, %xmm2 ; AVX2-FAST-NEXT: vmovdqa %xmm9, %xmm12 @@ -6538,13 +6538,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-NEXT: # xmm1 = xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] ; AVX2-FAST-NEXT: vpblendvb %ymm6, %ymm0, 
%ymm1, %ymm0 ; AVX2-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm1, %xmm1 @@ -6560,10 +6560,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,2,3,3,2,2,3,3] ; AVX2-FAST-NEXT: # ymm6 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-FAST-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] ; AVX2-FAST-NEXT: vpblendvb %ymm5, %ymm2, %ymm3, %ymm2 ; AVX2-FAST-NEXT: vpshufhw {{.*#+}} xmm3 = xmm15[0,1,2,3,4,5,5,6] ; AVX2-FAST-NEXT: vpermd %ymm3, %ymm6, %ymm3 @@ -6593,7 +6593,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm12 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 ; AVX2-FAST-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] @@ -6611,7 +6611,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa %ymm0, %ymm11 ; AVX2-FAST-NEXT: vmovdqu %ymm0, (%rsp) # 32-byte Spill ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 @@ -6632,7 +6632,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] @@ -6641,7 +6641,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vpshufb 
{{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm9 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 @@ -6666,7 +6666,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm1, %ymm15 ; AVX2-FAST-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] ; AVX2-FAST-NEXT: vpor %ymm11, %ymm15, %ymm11 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm0 ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-NEXT: vpshufb %ymm8, %ymm14, %ymm8 @@ -6694,7 +6694,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} ymm12 = ymm13[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [4,5,4,5,5,7,4,5] ; AVX2-FAST-NEXT: vpermd %ymm12, %ymm15, %ymm12 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm0, %ymm10, %ymm12, %ymm10 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-NEXT: vpshufb %ymm9, %ymm2, %ymm9 @@ -6722,7 +6722,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] ; AVX2-FAST-NEXT: vpshufb %ymm15, %ymm6, %ymm6 ; AVX2-FAST-NEXT: vpor %ymm6, %ymm12, %ymm6 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] ; AVX2-FAST-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 ; AVX2-FAST-NEXT: vpshufb %ymm0, %ymm5, %ymm0 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload @@ -6741,7 +6741,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpor %ymm4, %ymm6, %ymm4 ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm13, %ymm10 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] ; AVX2-FAST-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 ; AVX2-FAST-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload @@ -6810,17 +6810,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, 
ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,24,25,24,25,22,23,22,23,24,25,26,27,24,25,30,31] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm5, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,25,24,23,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill @@ -6838,15 +6838,15 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm8[30],zero,ymm8[28],zero,zero,zero,zero,ymm8[31],zero,ymm8[29],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,26,27,26,27,30,31,30,31,28,29,28,29,28,29,28,29] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u,255,255,255,255,0,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,29,28,27,u,u,u,31,30,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = <255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = 
[255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u,255,255,255,255,255,0,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,28,29,26,27,28,29,30,31,30,31,28,29,28,29,30,31] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,2,3] @@ -6860,17 +6860,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rcx), %xmm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdx), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm5[0],xmm9[0],xmm5[1],xmm9[1],xmm5[2],xmm9[2],xmm5[3],xmm9[3],xmm5[4],xmm9[4],xmm5[5],xmm9[5],xmm5[6],xmm9[6],xmm5[7],xmm9[7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, %xmm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsi), %xmm0 @@ -6898,10 +6898,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%r8), %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm5, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm2, %ymm5, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r9), %xmm5 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -6919,20 +6919,20 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm3, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = 
[u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm15, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm6, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm9, %xmm12 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm11, %xmm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm0, %xmm11, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm12, %xmm15, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm12 = ymm12[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm2, %ymm12, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload @@ -6946,10 +6946,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm1, %ymm0, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rsp), %xmm8 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm8, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm14, %xmm6 ; AVX2-FAST-PERLANE-NEXT: vpor %xmm2, %xmm6, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] @@ -6957,7 +6957,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm4, %xmm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,0,1,0] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm15, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm10, %xmm1 @@ -6977,13 +6977,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = 
xmm1[8],mem[8],xmm1[9],mem[9],xmm1[10],mem[10],xmm1[11],mem[11],xmm1[12],mem[12],xmm1[13],mem[13],xmm1[14],mem[14],xmm1[15],mem[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm2 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = <0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm5, %xmm1 # 16-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # xmm1 = xmm5[8],mem[8],xmm5[9],mem[9],xmm5[10],mem[10],xmm5[11],mem[11],xmm5[12],mem[12],xmm5[13],mem[13],xmm5[14],mem[14],xmm5[15],mem[15] @@ -6998,10 +6998,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [8,9,10,11,8,9,10,11,10,11,12,13,10,11,12,13] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm6 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,0,1] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm7, %ymm2, %ymm4, %ymm2 ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm13, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm15[8],xmm10[8],xmm15[9],xmm10[9],xmm15[10],xmm10[10],xmm15[11],xmm10[11],xmm15[12],xmm10[12],xmm15[13],xmm10[13],xmm15[14],xmm10[14],xmm15[15],xmm10[15] @@ -7031,7 +7031,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[30],zero,ymm5[28],zero,zero,zero,zero,ymm5[31],zero,ymm5[29],zero,zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm3, %ymm6, %ymm3 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = <255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm2, %ymm3, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%r8), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[27,28,29,30],zero,ymm0[28],zero,ymm0[26,27,30,31],zero,ymm0[29] @@ -7049,7 +7049,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, %ymm11 ; 
AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 @@ -7070,7 +7070,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[25],zero,ymm4[23],zero,zero,zero,zero,ymm4[26],zero,ymm4[24],zero,zero,zero,zero,ymm4[27] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm6, %ymm7, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,ymm3[25],zero,ymm3[23],zero,zero,zero,zero,ymm3[26],zero,ymm3[24],zero,zero ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,3,2,3] @@ -7079,7 +7079,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm8 = ymm11[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,22,23,26,27,24,25,22,23,24,25,26,27,26,27,24,25] ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = <0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm9 = [0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm8 = [0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255,0,0,0,255,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm0 @@ -7104,7 +7104,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm1, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,3,2,3] ; AVX2-FAST-PERLANE-NEXT: vpor %ymm11, %ymm15, %ymm11 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = <255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm15 = [255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u,u,0,0,255,255,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm15, %ymm7, %ymm11, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm8, %ymm13, %ymm8 @@ -7133,7 +7133,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm13 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm13, %ymm15 ; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm15 = ymm15[2,2,3,2] -; AVX2-FAST-PERLANE-NEXT: vmovdqa 
{{.*#+}} ymm0 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm0 = [u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm0, %ymm10, %ymm15, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm15 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm9, %ymm15, %ymm9 @@ -7161,7 +7161,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,0,128,14,128,128,128,128,1,128,15,128,128,128,128,128,128,16,128,30,128,128,128,128,17,128,31,128,128,128,128] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm6, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vpor %ymm6, %ymm12, %ymm6 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255,0,0,u,u,u,255,255] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm12, %ymm9, %ymm6, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm0, %ymm1, %ymm0 @@ -7180,7 +7180,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpor %ymm4, %ymm6, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [12,13,2,3,12,13,0,1,14,15,2,3,0,1,14,15,28,29,18,19,28,29,16,17,30,31,18,19,16,17,30,31] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm13, %ymm10 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u,u,u,255,255,0,u,u] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %ymm11, %ymm4, %ymm10, %ymm4 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm5, %ymm15, %ymm3 @@ -7307,16 +7307,16 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa (%rax), %ymm1 ; AVX512F-SLOW-NEXT: vpshufb %ymm2, %ymm1, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm16 = [u,5,4,u,5,u,4,u,20,21,u,23,u,21,u,23] ; AVX512F-SLOW-NEXT: vpshuflw {{.*#+}} ymm2 = ymm1[2,1,1,2,4,5,6,7,10,9,9,10,12,13,14,15] ; AVX512F-SLOW-NEXT: vpermi2d %zmm0, %zmm2, %zmm16 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdx), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rcx), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm2, %xmm29 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm3, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm4, %xmm31 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm30 @@ -7324,14 +7324,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%rdi), %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa 32(%rsi), %xmm4 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm12 = 
[u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm4, %xmm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm15 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] ; AVX512F-SLOW-NEXT: vpshufb %xmm15, %xmm3, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm3, %xmm21 ; AVX512F-SLOW-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = <0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u> +; AVX512F-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,u,0,u,2,3,u,1,u,18,u,19,18,u,19,u] ; AVX512F-SLOW-NEXT: vmovdqa 32(%rax), %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} xmm0 = xmm2[0,1,2,3,4,5,5,6] @@ -7340,10 +7340,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vmovdqa 32(%r9), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 32(%r8), %xmm2 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] ; AVX512F-SLOW-NEXT: vpshufb %xmm6, %xmm0, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm28 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm2, %xmm10 ; AVX512F-SLOW-NEXT: vmovdqa %xmm2, %xmm3 ; AVX512F-SLOW-NEXT: vporq %xmm8, %xmm10, %xmm26 @@ -7390,7 +7390,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm6 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm11 = xmm5[8],xmm2[8],xmm5[9],xmm2[9],xmm5[10],xmm2[10],xmm5[11],xmm2[11],xmm5[12],xmm2[12],xmm5[13],xmm2[13],xmm5[14],xmm2[14],xmm5[15],xmm2[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm7 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512F-SLOW-NEXT: vpshufb %xmm7, %xmm11, %xmm11 ; AVX512F-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm2 = zmm11[0,1,0,1],zmm6[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7442,7 +7442,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm15 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm15, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqu %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm0, %xmm1, %xmm0 @@ -7453,7 +7453,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec 
; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm3 ; AVX512F-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm4 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm0 @@ -7506,13 +7506,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm14, %ymm14 ; AVX512F-SLOW-NEXT: vinserti64x4 $1, %ymm14, %zmm2, %zmm2 ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm12 = xmm12[0],xmm3[0],xmm12[1],xmm3[1],xmm12[2],xmm3[2],xmm12[3],xmm3[3],xmm12[4],xmm3[4],xmm12[5],xmm3[5],xmm12[6],xmm3[6],xmm12[7],xmm3[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm15, %xmm15 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm12 ; AVX512F-SLOW-NEXT: vinserti32x4 $2, %xmm22, %zmm12, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm13 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm11, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm11 ; AVX512F-SLOW-NEXT: vpermq {{.*#+}} ymm13 = ymm6[2,3,2,3] @@ -7721,10 +7721,10 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vporq %ymm3, %ymm5, %ymm24 ; AVX512F-FAST-NEXT: vmovdqa 32(%rdx), %xmm3 ; AVX512F-FAST-NEXT: vmovdqa 32(%rcx), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm6, %xmm5 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm6, %xmm28 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] ; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm3, %xmm6 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm9, %xmm19 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm3, %xmm29 @@ -7732,18 +7732,18 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqu64 %zmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%rdi), %xmm10 ; AVX512F-FAST-NEXT: vmovdqa 32(%rsi), %xmm6 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm6, %xmm5 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm10, %xmm9 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm10, %xmm27 ; 
AVX512F-FAST-NEXT: vpor %xmm5, %xmm9, %xmm5 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa 32(%r9), %xmm15 ; AVX512F-FAST-NEXT: vmovdqa 32(%r8), %xmm10 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm15, %xmm9 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm10, %xmm12 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm21 ; AVX512F-FAST-NEXT: vporq %xmm9, %xmm12, %xmm22 @@ -7811,7 +7811,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm24, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm3[8],xmm4[9],xmm3[9],xmm4[10],xmm3[10],xmm4[11],xmm3[11],xmm4[12],xmm3[12],xmm4[13],xmm3[13],xmm4[14],xmm3[14],xmm4[15],xmm3[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [u,6,7,2,3,u,u,u,8,9,4,5,u,u,u,10] ; AVX512F-FAST-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-FAST-NEXT: vshufi64x2 {{.*#+}} zmm0 = zmm1[0,1,0,1],zmm0[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill @@ -7849,7 +7849,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm7 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm14[8],xmm8[8],xmm14[9],xmm8[9],xmm14[10],xmm8[10],xmm14[11],xmm8[11],xmm14[12],xmm8[12],xmm14[13],xmm8[13],xmm14[14],xmm8[14],xmm14[15],xmm8[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512F-FAST-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpshufb %xmm4, %xmm0, %xmm0 @@ -7860,7 +7860,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm13[8],xmm9[8],xmm13[9],xmm9[9],xmm13[10],xmm9[10],xmm13[11],xmm9[11],xmm13[12],xmm9[12],xmm13[13],xmm9[13],xmm13[14],xmm9[14],xmm13[15],xmm9[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = <2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm5 = [2,u,u,u,9,8,5,4,u,u,u,11,10,7,6,u] ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm4, %xmm0 ; AVX512F-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX512F-FAST-NEXT: vpshufb %xmm5, %xmm2, %xmm2 @@ -7902,7 +7902,7 @@ 
define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm16 ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm9[0],xmm13[0],xmm9[1],xmm13[1],xmm9[2],xmm13[2],xmm9[3],xmm13[3],xmm9[4],xmm13[4],xmm9[5],xmm13[5],xmm9[6],xmm13[6],xmm9[7],xmm13[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm6, %xmm6 ; AVX512F-FAST-NEXT: vpshufb %xmm9, %xmm0, %xmm0 ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm9 = [28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29,28,29,30,128,28,128,30,31,30,31,128,29,128,31,28,29] @@ -7917,7 +7917,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vinserti32x4 $2, %xmm31, %zmm0, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm8[0],xmm14[0],xmm8[1],xmm14[1],xmm8[2],xmm14[2],xmm8[3],xmm14[3],xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm7, %xmm7 ; AVX512F-FAST-NEXT: vpshufb %xmm8, %xmm0, %xmm13 ; AVX512F-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload @@ -8152,7 +8152,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rdx), %ymm29 ; AVX512BW-SLOW-NEXT: vpshufb %ymm20, %ymm29, %ymm8 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm21 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%rcx), %ymm30 ; AVX512BW-SLOW-NEXT: vpshufb %ymm21, %ymm30, %ymm25 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] @@ -8181,7 +8181,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vmovdqa64 32(%r8), %ymm31 ; AVX512BW-SLOW-NEXT: vpshufb %ymm23, %ymm31, %ymm0 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm26 = ymm0[2,3,2,3] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm24 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] ; AVX512BW-SLOW-NEXT: vmovdqa 32(%r9), %ymm0 ; AVX512BW-SLOW-NEXT: vpshufb %ymm24, %ymm0, %ymm11 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[2,3,2,3] @@ -8295,12 +8295,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm26 = ymm18[2,2,3,3,6,6,7,7] ; AVX512BW-SLOW-NEXT: vpshufb %ymm27, %ymm19, %ymm26 {%k4} ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm19 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm18 = <0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm18 = 
[0,1,u,u,u,6,7,2,3,u,u,u,8,9,4,5] ; AVX512BW-SLOW-NEXT: vpshufb %xmm18, %xmm19, %xmm19 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm19 = zmm26[2,3,2,3],zmm19[0,1,0,1] ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm26 = xmm24[0],xmm25[0],xmm24[1],xmm25[1],xmm24[2],xmm25[2],xmm24[3],xmm25[3],xmm24[4],xmm25[4],xmm24[5],xmm25[5],xmm24[6],xmm25[6],xmm24[7],xmm25[7] ; AVX512BW-SLOW-NEXT: vpshufb %ymm11, %ymm16, %ymm11 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm16 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm16 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512BW-SLOW-NEXT: vpshufb %xmm16, %xmm26, %xmm26 ; AVX512BW-SLOW-NEXT: vpshufhw {{.*#+}} ymm14 = ymm14[0,1,2,3,7,6,6,7,8,9,10,11,15,14,14,15] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} ymm14 = ymm14[0,2,3,3,4,6,7,7] @@ -8313,7 +8313,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm10 = ymm10[2,3,2,3] ; AVX512BW-SLOW-NEXT: vpor %ymm9, %ymm10, %ymm10 ; AVX512BW-SLOW-NEXT: vpunpcklbw {{.*#+}} xmm11 = xmm23[0],xmm21[0],xmm23[1],xmm21[1],xmm23[2],xmm21[2],xmm23[3],xmm21[3],xmm23[4],xmm21[4],xmm23[5],xmm21[5],xmm23[6],xmm21[6],xmm23[7],xmm21[7] -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = <u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512BW-SLOW-NEXT: vpshufb %xmm9, %xmm11, %xmm11 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,1,0,1] ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm11, %zmm10, %zmm10 @@ -8325,17 +8325,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E ; AVX512BW-SLOW-NEXT: kmovq %rax, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm10, %zmm14 {%k1} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] ; AVX512BW-SLOW-NEXT: vpshufb %xmm11, %xmm25, %xmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = <u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] ; AVX512BW-SLOW-NEXT: vpshufb %xmm19, %xmm24, %xmm26 ; AVX512BW-SLOW-NEXT: vporq %xmm10, %xmm26, %xmm10 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm24 = xmm25[8],xmm24[8],xmm25[9],xmm24[9],xmm25[10],xmm24[10],xmm25[11],xmm24[11],xmm25[12],xmm24[12],xmm25[13],xmm24[13],xmm25[14],xmm24[14],xmm25[15],xmm24[15] ; AVX512BW-SLOW-NEXT: vpshufb {{.*#+}} xmm24 = xmm24[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm24, %zmm10, %zmm10 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm24 = <u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm24 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] ; AVX512BW-SLOW-NEXT: vpshufb %xmm24, %xmm1, %xmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = <u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm26 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] ; AVX512BW-SLOW-NEXT: vpshufb %xmm26, %xmm0, %xmm27 ; AVX512BW-SLOW-NEXT: vporq %xmm25, %xmm27, %xmm25 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15] @@ -8344,11 +8344,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vinserti32x4 $2, %xmm0, %zmm25, %zmm0 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm10 = zmm0[0,1,0,1,4,5,4,5] ; AVX512BW-SLOW-NEXT: vmovdqu8 %zmm1, %zmm10 {%k3} -; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = 
<128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; AVX512BW-SLOW-NEXT: vmovdqa {{.*#+}} xmm0 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] ; AVX512BW-SLOW-NEXT: vpshufb %xmm0, %xmm21, %xmm1 ; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm25 = [2,2,2,4,2,2,2,4,3,3,3,3,2,2,2,4,52,53,52,53,53,54,53,54,52,53,52,53,53,54,53,54] ; AVX512BW-SLOW-NEXT: vpermi2w %zmm22, %zmm17, %zmm25 -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} xmm17 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] ; AVX512BW-SLOW-NEXT: vpshufb %xmm17, %xmm23, %xmm22 ; AVX512BW-SLOW-NEXT: vporq %xmm1, %xmm22, %xmm1 ; AVX512BW-SLOW-NEXT: vpunpckhbw {{.*#+}} xmm21 = xmm23[8],xmm21[8],xmm23[9],xmm21[9],xmm23[10],xmm21[10],xmm23[11],xmm21[11],xmm23[12],xmm21[12],xmm23[13],xmm21[13],xmm23[14],xmm21[14],xmm23[15],xmm21[15] @@ -8477,7 +8477,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %ymm2 ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm2, %ymm3 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm3[2,3,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,18,128,128,128,128,21,128,19,128,128,128,128,22,128,20,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,25,128,23,128,128,128,128,26,128,24,128,128,128,128] ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %ymm3 ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm3, %ymm26 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,3,2,3] @@ -8486,11 +8486,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpshufb %ymm11, %ymm3, %ymm11 ; AVX512BW-FAST-NEXT: vpor %ymm4, %ymm11, %ymm4 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm25, %zmm4, %zmm4 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,20,21,128,19,128,21,20,21,22,128,20,128,22,23,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128] ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdi), %ymm11 ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm11, %ymm25 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm25[2,3,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm0 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,21,128,19,128,128,128,128,22,128,20,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,23,128,128,128,128,26,128,24,128,128,128,128,27,128,25] ; AVX512BW-FAST-NEXT: vmovdqa64 32(%rsi), %ymm27 ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm27, %ymm26 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,3,2,3] @@ -8506,7 +8506,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %ymm4 ; AVX512BW-FAST-NEXT: vpshufb %ymm0, %ymm4, %ymm21 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm25 = ymm21[2,3,2,3] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = <u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,20,128,18,128,128,128,128,21,128,19,128,128,128,128,22,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,128,128,25,128,23,128,128,128,128,26,128,24,128,128] ; AVX512BW-FAST-NEXT: 
vmovdqa 32(%r9), %ymm0 ; AVX512BW-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm26 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm26 = ymm26[2,3,2,3] @@ -8584,7 +8584,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa 32(%rdx), %xmm2 ; AVX512BW-FAST-NEXT: vmovdqa 32(%rcx), %xmm1 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = <4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm18 = [4,5,0,1,u,u,u,6,7,2,3,u,u,u,8,9] ; AVX512BW-FAST-NEXT: vpshufb %xmm18, %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm4, %zmm9 @@ -8598,7 +8598,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa 32(%r9), %xmm5 ; AVX512BW-FAST-NEXT: vmovdqa 32(%r8), %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = <u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} xmm19 = [u,u,4,5,0,1,u,u,u,6,7,2,3,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb %xmm19, %xmm0, %xmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm3, %zmm0 @@ -8610,17 +8610,17 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: movabsq $1016749673354069774, %rax # imm = 0xE1C3870E1C3870E ; AVX512BW-FAST-NEXT: kmovq %rax, %k2 ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm0, %zmm9 {%k2} -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [u,u,u,128,7,128,5,u,u,u,128,8,128,6,u,u] ; AVX512BW-FAST-NEXT: vpshufb %xmm11, %xmm1, %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = <u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,7,128,5,128,u,u,u,8,128,6,128,u,u] ; AVX512BW-FAST-NEXT: vpshufb %xmm6, %xmm2, %xmm23 ; AVX512BW-FAST-NEXT: vporq %xmm3, %xmm23, %xmm3 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15] ; AVX512BW-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,3,2,u,u,u,9,8,5,4,u,u,u,11,10,7] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm1, %zmm3, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = <u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm2 = [u,128,7,128,5,u,u,u,128,8,128,6,u,u,u,128] ; AVX512BW-FAST-NEXT: vpshufb %xmm2, %xmm7, %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = <u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm0 = [u,7,128,5,128,u,u,u,8,128,6,128,u,u,u,9] ; AVX512BW-FAST-NEXT: vpshufb %xmm0, %xmm8, %xmm23 ; AVX512BW-FAST-NEXT: vporq %xmm3, %xmm23, %xmm3 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm7[8],xmm8[8],xmm7[9],xmm8[9],xmm7[10],xmm8[10],xmm7[11],xmm8[11],xmm7[12],xmm8[12],xmm7[13],xmm8[13],xmm7[14],xmm8[14],xmm7[15],xmm8[15] @@ -8629,9 +8629,9 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm1 = zmm1[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm23 = zmm3[0,1,0,1,4,5,4,5] ; AVX512BW-FAST-NEXT: vmovdqu8 %zmm1, %zmm23 {%k1} -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = <128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6> +; 
AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm1 = [128,4,u,u,u,128,7,128,5,u,u,u,128,8,128,6] ; AVX512BW-FAST-NEXT: vpshufb %xmm1, %xmm5, %xmm3 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,128,u,u,u,7,128,5,128,u,u,u,8,128,6,128] ; AVX512BW-FAST-NEXT: vpshufb %xmm7, %xmm4, %xmm8 ; AVX512BW-FAST-NEXT: vpor %xmm3, %xmm8, %xmm3 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index 7bb387481ff7b..e60034f7d6b75 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -474,10 +474,10 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [0,2,4,6,0,2,4,6] ; AVX2-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] ; AVX2-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,5,7,1,3,5,7] @@ -594,10 +594,10 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm2 = [1,3,5,7,1,3,5,7] ; AVX512F-FAST-NEXT: # ymm2 = mem[0,1,0,1] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm3 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [u,u,u,u,0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15] ; AVX512F-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [0,4,8,12,u,u,u,u,1,5,9,13,u,u,u,u,2,6,10,14,u,u,u,u,3,7,11,15,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm2, %ymm2 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm3[1],ymm2[2],ymm3[3],ymm2[4],ymm3[5],ymm2[6],ymm3[7] ; AVX512F-FAST-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,2,4,6,0,2,4,6] @@ -1030,28 +1030,28 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-ONLY-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm4 ; AVX2-ONLY-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm5 = ymm3[0,2,0,2] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm6 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX2-ONLY-NEXT: vpshufb %ymm6, %ymm5, %ymm2 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm7 = ymm4[0,2,0,2] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] ; 
AVX2-ONLY-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm2 = ymm9[0,1,2],ymm2[3],ymm9[4,5,6],ymm2[7],ymm9[8,9,10],ymm2[11],ymm9[12,13,14],ymm2[15] ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX2-ONLY-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm13 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm2 = ymm11[0],ymm2[1],ymm11[2],ymm2[3],ymm11[4],ymm2[5],ymm11[6],ymm2[7] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX2-ONLY-NEXT: vpshufb %ymm11, %ymm5, %ymm5 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm5 = ymm7[0,1,2],ymm5[3],ymm7[4,5,6],ymm5[7],ymm7[8,9,10],ymm5[11],ymm7[12,13,14],ymm5[15] -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = <4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u> +; AVX2-ONLY-NEXT: vmovdqa {{.*#+}} ymm15 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] ; AVX2-ONLY-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX2-ONLY-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm5 = ymm9[0],ymm5[1],ymm9[2],ymm5[3],ymm9[4],ymm5[5],ymm9[6],ymm5[7] @@ -1094,28 +1094,28 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-NEXT: vinserti128 $1, (%r9), %ymm2, %ymm2 ; AVX512F-NEXT: vinserti128 $1, (%r10), %ymm3, %ymm3 ; AVX512F-NEXT: vpermq {{.*#+}} ymm4 = ymm3[0,2,0,2] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX512F-NEXT: vpshufb %ymm5, %ymm4, %ymm6 ; AVX512F-NEXT: vpermq {{.*#+}} ymm7 = ymm2[0,2,0,2] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = <u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm8 = [u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] ; AVX512F-NEXT: vpshufb %ymm8, %ymm7, %ymm9 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm6 = ymm9[0,1,2],ymm6[3],ymm9[4,5,6],ymm6[7],ymm9[8,9,10],ymm6[11],ymm9[12,13,14],ymm6[15] ; AVX512F-NEXT: vpermq {{.*#+}} ymm9 = ymm1[0,2,0,2] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = <u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm10 = [u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm10, %ymm9, %ymm11 ; AVX512F-NEXT: vpermq {{.*#+}} ymm12 = ymm0[0,2,0,2] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm13 = <4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u> +; AVX512F-NEXT: 
vmovdqa {{.*#+}} ymm13 = [4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm13, %ymm12, %ymm14 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm11 = ymm14[0],ymm11[1],ymm14[2,3,4],ymm11[5],ymm14[6,7,8],ymm11[9],ymm14[10,11,12],ymm11[13],ymm14[14,15] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm6 = ymm11[0],ymm6[1],ymm11[2],ymm6[3],ymm11[4],ymm6[5],ymm11[6],ymm6[7] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm11 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11] ; AVX512F-NEXT: vpshufb %ymm11, %ymm4, %ymm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = <u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm14 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u] ; AVX512F-NEXT: vpshufb %ymm14, %ymm7, %ymm7 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm4 = ymm7[0,1,2],ymm4[3],ymm7[4,5,6],ymm4[7],ymm7[8,9,10],ymm4[11],ymm7[12,13,14],ymm4[15] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = <u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm7 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm7, %ymm9, %ymm9 -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm15 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %ymm15, %ymm12, %ymm12 ; AVX512F-NEXT: vpblendw {{.*#+}} ymm9 = ymm12[0],ymm9[1],ymm12[2,3,4],ymm9[5],ymm12[6,7,8],ymm9[9],ymm12[10,11,12],ymm9[13],ymm12[14,15] ; AVX512F-NEXT: vpblendd {{.*#+}} ymm4 = ymm9[0],ymm4[1],ymm9[2],ymm4[3],ymm9[4],ymm4[5],ymm9[6],ymm4[7] @@ -1161,22 +1161,22 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm3 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm4 = zmm2[0,2,0,2,4,6,4,6] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = <u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm5 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] ; AVX512BW-SLOW-NEXT: vpshufb %zmm5, %zmm4, %zmm4 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm3 = zmm3[4,5,6,7,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm3[0,2,0,2,4,6,4,6] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX512BW-SLOW-NEXT: vpshufb %zmm7, %zmm6, %zmm6 ; AVX512BW-SLOW-NEXT: movl $8913032, %ecx # imm = 0x880088 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k1 ; AVX512BW-SLOW-NEXT: vmovdqu16 %zmm6, %zmm4 {%k1} ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm8 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpshufb %zmm8, %zmm6, %zmm6 ; AVX512BW-SLOW-NEXT: vshufi64x2 {{.*#+}} zmm1 = zmm1[4,5,6,7,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpermq {{.*#+}} zmm9 = zmm1[0,2,0,2,4,6,4,6] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = <u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} zmm10 = 
[u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] ; AVX512BW-SLOW-NEXT: vpshufb %zmm10, %zmm9, %zmm9 ; AVX512BW-SLOW-NEXT: movl $2228258, %ecx # imm = 0x220022 ; AVX512BW-SLOW-NEXT: kmovd %ecx, %k2 @@ -1217,21 +1217,21 @@ define void @store_i8_stride8_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm5 ; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm6 = [0,2,0,2,12,14,12,14] ; AVX512BW-FAST-NEXT: vpermt2q %zmm5, %zmm6, %zmm3 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = <u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm7 = [u,u,u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15] ; AVX512BW-FAST-NEXT: vpshufb %zmm7, %zmm3, %zmm3 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm2, %zmm2, %zmm2 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm8 = zmm2[0,2,0,2,4,6,4,6] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = <u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm9 = [u,u,u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u] ; AVX512BW-FAST-NEXT: vpshufb %zmm9, %zmm8, %zmm8 ; AVX512BW-FAST-NEXT: movl $8913032, %ecx # imm = 0x880088 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 ; AVX512BW-FAST-NEXT: vmovdqu16 %zmm3, %zmm8 {%k1} ; AVX512BW-FAST-NEXT: vpermt2q %zmm4, %zmm6, %zmm1 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = <u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm3 = [u,u,0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb %zmm3, %zmm1, %zmm1 ; AVX512BW-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm0 ; AVX512BW-FAST-NEXT: vpermq {{.*#+}} zmm6 = zmm0[0,2,0,2,4,6,4,6] -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = <0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm10 = [0,8,u,u,u,u,u,u,1,9,u,u,u,u,u,u,2,10,u,u,u,u,u,u,3,11,u,u,u,u,u,u,4,12,u,u,u,u,u,u,5,13,u,u,u,u,u,u,6,14,u,u,u,u,u,u,7,15,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb %zmm10, %zmm6, %zmm6 ; AVX512BW-FAST-NEXT: movl $2228258, %ecx # imm = 0x220022 ; AVX512BW-FAST-NEXT: kmovd %ecx, %k2 @@ -2235,7 +2235,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vmovdqa %ymm11, %ymm12 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] @@ -2303,7 +2303,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] ; AVX2-FAST-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = 
ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm8, %xmm1, %xmm5 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 @@ -2388,7 +2388,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm4, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm12, %ymm12, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm11, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm9[0],ymm0[1],ymm9[2,3,4],ymm0[5],ymm9[6,7,8],ymm0[9],ymm9[10,11,12],ymm0[13],ymm9[14,15] @@ -2456,7 +2456,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm6 = [0,1,2,3,0,1,2,3,8,9,10,11,2,3,6,7,4,5,2,3,4,5,2,3,8,9,10,11,6,7,6,7] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm6, %ymm5, %ymm5 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm8 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm8, %xmm1, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm1, %ymm1 @@ -2759,7 +2759,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vmovdqa (%rdi), %xmm13 ; AVX512F-ONLY-FAST-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm13[8],xmm12[8],xmm13[9],xmm12[9],xmm13[10],xmm12[10],xmm13[11],xmm12[11],xmm13[12],xmm12[12],xmm13[13],xmm12[13],xmm13[14],xmm12[14],xmm13[15],xmm12[15] ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, %xmm1, %ymm1, %ymm2 -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm2, %ymm2 ; AVX512F-ONLY-FAST-NEXT: vmovdqa %ymm3, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] @@ -2770,7 +2770,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-ONLY-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512F-ONLY-FAST-NEXT: vmovdqa64 %ymm3, %ymm29 ; AVX512F-ONLY-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} xmm9 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512F-ONLY-FAST-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX512F-ONLY-FAST-NEXT: vinserti128 $1, 
%xmm1, %ymm3, %ymm1 ; AVX512F-ONLY-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7,8],ymm0[9],ymm1[10,11,12],ymm0[13],ymm1[14,15] @@ -3117,14 +3117,14 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm4, %xmm21 ; AVX512DQ-FAST-NEXT: vmovdqa64 %xmm3, %xmm22 ; AVX512DQ-FAST-NEXT: vinserti128 $1, %xmm2, %ymm2, %ymm3 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm4 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %ymm4, %ymm3, %ymm3 ; AVX512DQ-FAST-NEXT: vmovdqa %ymm4, %ymm5 ; AVX512DQ-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm3[0],ymm1[1],ymm3[2,3,4],ymm1[5],ymm3[6,7,8],ymm1[9],ymm3[10,11,12],ymm1[13],ymm3[14,15] ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [0,1,0,1,4,5,2,3,4,5,2,3,12,13,14,15,0,1,4,5,4,5,6,7,4,5,6,7,12,13,14,15] ; AVX512DQ-FAST-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX512DQ-FAST-NEXT: vmovdqa64 %ymm3, %ymm23 -; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} xmm4 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512DQ-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm3 ; AVX512DQ-FAST-NEXT: vmovdqa %xmm4, %xmm10 ; AVX512DQ-FAST-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm2[0],zero,zero,zero,xmm2[1],zero,zero,zero @@ -3299,7 +3299,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-SLOW-NEXT: vpmovzxdq {{.*#+}} xmm22 = xmm22[0],zero,xmm22[1],zero ; AVX512BW-SLOW-NEXT: vinserti32x4 $1, %xmm22, %ymm21, %ymm4 ; AVX512BW-SLOW-NEXT: vpblendw {{.*#+}} ymm4 = ymm10[0],ymm4[1],ymm10[2,3,4],ymm4[5],ymm10[6,7,8],ymm4[9],ymm10[10,11,12],ymm4[13],ymm10[14,15] -; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = <4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u> +; AVX512BW-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm22 = [4,20,u,u,5,21,u,u,6,22,u,u,7,23,u,u] ; AVX512BW-SLOW-NEXT: vpermt2w %ymm20, %ymm22, %ymm7 ; AVX512BW-SLOW-NEXT: vinserti64x4 $1, %ymm7, %zmm4, %zmm7 ; AVX512BW-SLOW-NEXT: movw $-21846, %cx # imm = 0xAAAA @@ -3403,7 +3403,7 @@ define void @store_i8_stride8_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa64 16(%rdi), %xmm19 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm18[8],xmm16[8],xmm18[9],xmm16[9],xmm18[10],xmm16[10],xmm18[11],xmm16[11],xmm18[12],xmm16[12],xmm18[13],xmm16[13],xmm18[14],xmm16[14],xmm18[15],xmm16[15] ; AVX512BW-FAST-NEXT: vinserti32x4 $2, %xmm6, %zmm6, %zmm6 -; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = <0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u> +; AVX512BW-FAST-NEXT: vmovdqa64 {{.*#+}} zmm20 = [0,32,u,u,1,33,u,u,2,34,u,u,3,35,u,u,20,52,u,u,21,53,u,u,22,54,u,u,23,55,u,u] ; AVX512BW-FAST-NEXT: vpermt2w %zmm15, %zmm20, %zmm6 ; AVX512BW-FAST-NEXT: movw $-21846, %cx # imm = 0xAAAA ; AVX512BW-FAST-NEXT: kmovd %ecx, %k1 @@ -5271,7 +5271,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FAST-NEXT: vmovdqa %xmm3, 
%xmm13 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -5314,7 +5314,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] @@ -5330,7 +5330,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm10, %xmm4, %xmm5 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 @@ -5381,7 +5381,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm12, %ymm14, %ymm7 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm14, %ymm8, %ymm8 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] @@ -5397,7 +5397,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm11, %xmm0, %xmm2 ; 
AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 @@ -5450,7 +5450,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-NEXT: vpshufb %ymm11, %ymm15, %ymm8 ; AVX2-FAST-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm10 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] @@ -5466,7 +5466,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX2-FAST-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpshufb %xmm14, %xmm3, %xmm5 ; AVX2-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FAST-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 @@ -5595,7 +5595,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm3, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm3, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm1[0,1,2],ymm0[3],ymm1[4,5,6],ymm0[7],ymm1[8,9,10],ymm0[11],ymm1[12,13,14],ymm0[15] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm3 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm3, %xmm4, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm3, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm3 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero @@ -5638,7 +5638,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm7 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm7, %ymm15, %ymm6 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm6[1],ymm0[2,3,4],ymm6[5],ymm0[6,7,8],ymm6[9],ymm0[10,11,12],ymm6[13],ymm0[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] @@ -5654,7 +5654,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm3 = ymm4[0,1,2],ymm3[3],ymm4[4,5,6],ymm3[7],ymm4[8,9,10],ymm3[11],ymm4[12,13,14],ymm3[15] ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm2[0],xmm13[0],xmm2[1],xmm13[1],xmm2[2],xmm13[2],xmm2[3],xmm13[3],xmm2[4],xmm13[4],xmm2[5],xmm13[5],xmm2[6],xmm13[6],xmm2[7],xmm13[7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3],xmm9[4],xmm10[4],xmm9[5],xmm10[5],xmm9[6],xmm10[6],xmm9[7],xmm10[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm10, %xmm4, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm4[0],zero,zero,zero,xmm4[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm6, %ymm5 @@ -5705,7 +5705,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm12, %ymm14, %ymm7 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm8 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm14 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm14, %ymm8, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm7 = ymm8[0],ymm7[1],ymm8[2,3,4],ymm7[5],ymm8[6,7,8],ymm7[9],ymm8[10,11,12],ymm7[13],ymm8[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm7[0],ymm6[1],ymm7[2],ymm6[3],ymm7[4],ymm6[5],ymm7[6],ymm6[7] @@ -5721,7 +5721,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm4 = ymm5[0,1,2],ymm4[3],ymm5[4,5,6],ymm4[7],ymm5[8,9,10],ymm4[11],ymm5[12,13,14],ymm4[15] ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3],xmm13[4],xmm11[4],xmm13[5],xmm11[5],xmm13[6],xmm11[6],xmm13[7],xmm11[7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm11 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm0, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm6 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm6, %ymm2 @@ -5774,7 +5774,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm11 = [8,9,8,9,4,5,6,7,12,13,10,11,12,13,10,11,8,9,12,13,4,5,6,7,12,13,14,15,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm11, %ymm15, %ymm8 ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm9, %ymm9, %ymm9 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm10 = 
[8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %ymm10, %ymm9, %ymm9 ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm9[0],ymm8[1],ymm9[2,3,4],ymm8[5],ymm9[6,7,8],ymm8[9],ymm9[10,11,12],ymm8[13],ymm9[14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm6 = ymm8[0],ymm6[1],ymm8[2],ymm6[3],ymm8[4],ymm6[5],ymm8[6],ymm6[7] @@ -5790,7 +5790,7 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm0 = ymm4[0,1,2],ymm0[3],ymm4[4,5,6],ymm0[7],ymm4[8,9,10],ymm0[11],ymm4[12,13,14],ymm0[15] ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm14[0],xmm13[0],xmm14[1],xmm13[1],xmm14[2],xmm13[2],xmm14[3],xmm13[3],xmm14[4],xmm13[4],xmm14[5],xmm13[5],xmm14[6],xmm13[6],xmm14[7],xmm13[7] ; AVX2-FAST-PERLANE-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} xmm14 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm14, %xmm3, %xmm5 ; AVX2-FAST-PERLANE-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm5, %ymm8, %ymm5 @@ -6511,12 +6511,12 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512F-FAST-NEXT: vmovdqa (%rsi), %xmm12 ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %xmm9 ; AVX512F-FAST-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} xmm7 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero ; AVX512F-FAST-NEXT: vpshufb %xmm7, %xmm0, %xmm2 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm5 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX512F-FAST-NEXT: vpshufb %ymm5, %ymm0, %ymm0 ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm4 ; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} zmm3 = [65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535,65535,0,65535,65535] @@ -6918,9 +6918,9 @@ define void @store_i8_stride8_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX512BW-FAST-NEXT: vmovdqa64 48(%rdi), %xmm28 ; AVX512BW-FAST-NEXT: vpunpckhbw {{.*#+}} xmm8 = xmm11[8],xmm10[8],xmm11[9],xmm10[9],xmm11[10],xmm10[10],xmm11[11],xmm10[11],xmm11[12],xmm10[12],xmm11[13],xmm10[13],xmm11[14],xmm10[14],xmm11[15],xmm10[15] ; AVX512BW-FAST-NEXT: vinserti128 $1, %xmm8, %ymm8, %ymm13 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = <8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} ymm12 = [8,9,u,u,u,u,u,u,10,11,u,u,u,u,u,u,12,13,u,u,u,u,u,u,14,15,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb %ymm12, %ymm13, %ymm26 -; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; 
AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = <4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u> +; AVX512BW-FAST-NEXT: vmovdqa {{.*#+}} xmm13 = [4,5,u,u,u,u,u,u,6,7,u,u,u,u,u,u] ; AVX512BW-FAST-NEXT: vpshufb %xmm13, %xmm8, %xmm27 ; AVX512BW-FAST-NEXT: vpmovzxwq {{.*#+}} xmm8 = xmm8[0],zero,zero,zero,xmm8[1],zero,zero,zero ; AVX512BW-FAST-NEXT: vinserti32x4 $1, %xmm27, %ymm8, %ymm8 diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll index 008188b52c200..82afa15079182 100644 --- a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -13,7 +13,7 @@ declare <4 x i16> @llvm.umul.fix.sat.v4i16(<4 x i16>, <4 x i16>, i32 immarg) define <4 x i16> @smulfix(<4 x i16> %a) { ; CHECK-LABEL: smulfix: ; CHECK: # %bb.0: -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <1,2,3,4,u,u,u,u> +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 @@ -28,7 +28,7 @@ define <4 x i16> @smulfix(<4 x i16> %a) { define <4 x i16> @umulfix(<4 x i16> %a) { ; CHECK-LABEL: umulfix: ; CHECK: # %bb.0: -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <1,2,3,4,u,u,u,u> +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-partial-undef.ll b/llvm/test/CodeGen/X86/vector-partial-undef.ll index ffd95b38dc31b..fd41fd53e3be1 100644 --- a/llvm/test/CodeGen/X86/vector-partial-undef.ll +++ b/llvm/test/CodeGen/X86/vector-partial-undef.ll @@ -137,7 +137,7 @@ define <8 x i32> @xor_undef_elts_alt(<4 x i32> %x) { ; SSE-LABEL: xor_undef_elts_alt: ; SSE: # %bb.0: ; SSE-NEXT: movaps %xmm0, %xmm1 -; SSE-NEXT: movaps {{.*#+}} xmm2 = <u,u,44,12> +; SSE-NEXT: movaps {{.*#+}} xmm2 = [u,u,44,12] ; SSE-NEXT: xorps %xmm0, %xmm2 ; SSE-NEXT: xorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: movaps %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index 03e39e71aaaf1..dbdc3e09fcef0 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -864,7 +864,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) { ; ; SSE41-LABEL: test_v4i16_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = <u,32768,16384,8192,u,u,u,u> +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u] ; SSE41-NEXT: pmulhuw %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] diff --git a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll index 7f828fc293caa..a3d3ed6a4b243 100644 --- a/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll +++ b/llvm/test/CodeGen/X86/vector-replicaton-i1-mask.ll @@ -481,7 +481,7 @@ define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u> +; AVX512F-ONLY-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,1,1,u,u] ; AVX512F-ONLY-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512F-ONLY-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512F-ONLY-NEXT: movb $63, %al @@ -498,7 +498,7 @@ define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovb (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = 
<0,0,0,1,1,1,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,1,1,u,u] ; AVX512DQ-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movb $63, %al @@ -516,7 +516,7 @@ define void @mask_replication_factor3_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa32 %ymm0, %ymm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,0,1,1,1,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,0,1,1,1,u,u] ; AVX512BW-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX512BW-NEXT: vpslld $31, %ymm0, %ymm0 ; AVX512BW-NEXT: movb $63, %al @@ -542,7 +542,7 @@ define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u> +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF @@ -558,7 +558,7 @@ define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF @@ -574,7 +574,7 @@ define void @mask_replication_factor3_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,u,u,u,u] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF @@ -646,7 +646,7 @@ define void @mask_replication_factor3_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,1,1,1,2,2,2,3,3,3,4,4,4,5,5,5,6,6,6,7,7,7,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF @@ -3121,7 +3121,7 @@ define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u> +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $1023, %ax # imm = 0x3FF @@ -3138,7 +3138,7 @@ define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
<0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $1023, %ax # imm = 0x3FF @@ -3155,7 +3155,7 @@ define void @mask_replication_factor5_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512BW-NEXT: movw $1023, %ax # imm = 0x3FF @@ -3219,7 +3219,7 @@ define void @mask_replication_factor5_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl $1048575, %eax # imm = 0xFFFFF @@ -3324,7 +3324,7 @@ define void @mask_replication_factor5_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512VBMI-ONLY: # %bb.0: ; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,1,1,1,1,1,2,2,2,2,2,3,3,3,3,3,4,4,4,4,4,5,5,5,5,5,6,6,6,6,6,7,7,7,7,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VBMI-ONLY-NEXT: movabsq $1099511627775, %rax # imm = 0xFFFFFFFFFF @@ -5857,7 +5857,7 @@ define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u> +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF @@ -5873,7 +5873,7 @@ define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF @@ -5889,7 +5889,7 @@ define void @mask_replication_factor6_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,u,u,u,u] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, 
%zmm0, %zmm0 ; AVX512BW-NEXT: movw $4095, %ax # imm = 0xFFF @@ -5994,7 +5994,7 @@ define void @mask_replication_factor6_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,1,1,1,1,1,1,2,2,2,2,2,2,3,3,3,3,3,3,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl $16777215, %eax # imm = 0xFFFFFF @@ -9071,7 +9071,7 @@ define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u> +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512F-ONLY-NEXT: movw $16383, %ax # imm = 0x3FFF @@ -9089,7 +9089,7 @@ define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512DQ-NEXT: movw $16383, %ax # imm = 0x3FFF @@ -9107,7 +9107,7 @@ define void @mask_replication_factor7_vf2(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovw (%rdi), %k1 ; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,u,u] ; AVX512BW-NEXT: vpermd %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpslld $31, %zmm0, %zmm0 ; AVX512BW-NEXT: movw $16383, %ax # imm = 0x3FFF @@ -9134,7 +9134,7 @@ define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512F-ONLY: # %bb.0: ; AVX512F-ONLY-NEXT: kmovw (%rdi), %k1 ; AVX512F-ONLY-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u> +; AVX512F-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u] ; AVX512F-ONLY-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: vpslld $31, %zmm1, %zmm1 ; AVX512F-ONLY-NEXT: movw $4095, %ax # imm = 0xFFF @@ -9155,7 +9155,7 @@ define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: kmovw (%rdi), %k0 ; AVX512DQ-NEXT: vpmovm2d %k0, %zmm0 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = <2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u] ; AVX512DQ-NEXT: vpermd %zmm0, %zmm1, %zmm1 ; AVX512DQ-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512DQ-NEXT: movw $4095, %ax # imm = 0xFFF @@ -9176,7 +9176,7 @@ define void @mask_replication_factor7_vf4(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: kmovd (%rdi), %k0 ; AVX512BW-NEXT: vpmovm2w %k0, %zmm0 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm1 = 
[0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,u,u,u,u] ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: movl $268435455, %eax # imm = 0xFFFFFFF @@ -9362,7 +9362,7 @@ define void @mask_replication_factor7_vf8(ptr %in.maskvec, ptr %in.vec, ptr %out ; AVX512VBMI-ONLY: # %bb.0: ; AVX512VBMI-ONLY-NEXT: kmovw (%rdi), %k0 ; AVX512VBMI-ONLY-NEXT: vpmovm2b %k0, %zmm0 -; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = <0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,u,u,u,u,u,u,u,u> +; AVX512VBMI-ONLY-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,0,0,0,0,0,0,1,1,1,1,1,1,1,2,2,2,2,2,2,2,3,3,3,3,3,3,3,4,4,4,4,4,4,4,5,5,5,5,5,5,5,6,6,6,6,6,6,6,7,7,7,7,7,7,7,u,u,u,u,u,u,u,u] ; AVX512VBMI-ONLY-NEXT: vpermb %zmm0, %zmm1, %zmm0 ; AVX512VBMI-ONLY-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VBMI-ONLY-NEXT: movabsq $72057594037927935, %rax # imm = 0xFFFFFFFFFFFFFF diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index e6b5998d965f0..cdc2bbe56aee4 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -1420,7 +1420,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,16384,8192,4096,2048,1024,512] ; SSE41-NEXT: pmulhw %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: psraw $1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index dfba0d985c1f0..e15be224bec8c 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1786,7 +1786,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,16384,8192,u,u,u,u] ; SSE41-NEXT: pmulhw %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: psraw $1, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 7a2dcd1c8ca8c..43da0f15abaeb 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ -1173,7 +1173,7 @@ define <8 x i16> @constant_shift_v8i16(<8 x i16> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,4096,2048,1024,512] ; SSE41-NEXT: pmulhuw %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll index f647208a8000e..efd73b4ca132b 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-512.ll @@ -288,7 +288,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) nounwind { ; AVX512DQ-LABEL: constant_shift_v32i16: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm1 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm2 = [u,32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2] ; AVX512DQ-NEXT: vpmulhuw %ymm2, %ymm1, %ymm3 ; 
AVX512DQ-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm3[1,2,3,4,5,6,7] ; AVX512DQ-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index 510ae15ba0960..f340615464cfa 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1486,7 +1486,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = +; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u] ; SSE41-NEXT: pmulhuw %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index ec75631a9b5ed..b40be9452ddd7 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1003,7 +1003,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; SSSE3-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; SSSE3-NEXT: pshufb %xmm2, %xmm1 ; SSSE3-NEXT: pshufb %xmm2, %xmm0 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1011,7 +1011,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; SSE41-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pshufb %xmm2, %xmm1 ; SSE41-NEXT: pshufb %xmm2, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1819,7 +1819,7 @@ define <16 x i8> @shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00( ; ; AVX512VLVBMI-LABEL: shuffle_v16i8_uu_10_02_07_22_14_07_02_18_03_01_14_18_09_11_00: ; AVX512VLVBMI: # %bb.0: # %entry -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm2 = [u,10,2,7,22,14,7,2,18,3,1,14,18,9,11,0] ; AVX512VLVBMI-NEXT: vpermt2b %xmm1, %xmm2, %xmm0 ; AVX512VLVBMI-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index f73081cfc404f..5b4f15b51ec00 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -3510,7 +3510,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_00_00_04_04_04_12_08_08_08_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -3569,7 +3569,7 @@ define <16 x i16> @shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_00_uu_01_uu_02_uu_11_uu_08_uu_09_uu_10_uu_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = 
[u,0,u,1,u,2,u,11,u,8,u,9,u,10,u,11] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3609,7 +3609,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_04_uu_05_uu_06_uu_15_uu_12_uu_13_uu_14_uu_15: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [u,4,u,5,u,6,u,15,u,12,u,13,u,14,u,15] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -3703,7 +3703,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_04_04_00_00_00_08_12_12_12_12_08_08_08_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,4,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,4,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,0,1,0,1,4,5,4,5,4,5,8,9,16,17,16,17,16,17,16,17,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -3940,7 +3940,7 @@ define <16 x i16> @shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_01_00_05_04_05_04_01_08_09_08_13_12_13_12_09_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -3999,7 +3999,7 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_05_04_01_00_05_04_01_08_13_12_09_08_13_12_09_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,4,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,4,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,2,3,0,1,6,7,8,9,18,19,16,17,22,23,20,21,18,19,16,17,22,23,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4058,7 +4058,7 @@ define <16 x i16> @shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_05_04_01_00_01_00_05_12_13_12_09_08_09_08_13_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5,6,7,4,5,2,3,8,9,18,19,16,17,22,23,20,21,22,23,20,21,18,19,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4117,7 +4117,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_00_04_04_08_08_12_12_08_08_12_12_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4176,7 +4176,7 @@ define <16 x i16> @shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_00_00_04_04_00_00_12_12_08_08_12_12_08_08_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = 
<2,0,6,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,4,5,4,5,8,9,16,17,20,21,20,21,16,17,16,17,20,21,20,21,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4417,7 +4417,7 @@ define <16 x i16> @shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_00_04_04_04_04_04_12_08_08_12_12_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,4,5,4,5,4,5,8,9,16,17,16,17,20,21,20,21,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4476,7 +4476,7 @@ define <16 x i16> @shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_00_00_04_04_04_12_12_12_08_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5,0,1,0,1,0,1,8,9,16,17,16,17,20,21,20,21,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4535,7 +4535,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_04_04_04_12_08_12_12_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,4,5,4,5,8,9,16,17,20,21,20,21,16,17,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4594,7 +4594,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_0 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_00_00_00_08_08_12_12_08_08_08_08_08: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,4,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,4,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,0,1,0,1,0,1,8,9,16,17,20,21,20,21,16,17,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4654,7 +4654,7 @@ define <16 x i16> @shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_04_04_00_04_05_06_15_08_12_12_08_12_13_14_15: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,3,7,4,6,7,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,3,7,4,6,7,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,4,5,4,5,0,1,4,5,6,7,8,9,14,15,16,17,20,21,20,21,16,17,20,21,22,23,24,25,26,27] ; AVX2-FAST-ALL-NEXT: retq @@ -4713,7 +4713,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,2,6,u,4,6,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,2,6,u,4,6,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[0,1,u,u,4,5,4,5,4,5,4,5,4,5,8,9,16,17,u,u,20,21,20,21,20,21,20,21,20,21,20,21] ; AVX2-FAST-ALL-NEXT: retq @@ -4728,7 +4728,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_00_uu_04_04_04_04_04_12_08_uu_12_12_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,4,4,4,4,4,12,8,u,12,12,12,12,12,12> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [0,u,4,4,4,4,4,12,8,u,12,12,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4772,7 +4772,7 @@ define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,0,1,u,u,4,5,0,1,0,1,0,1,8,9,16,17,16,17,u,u,20,21,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4787,7 +4787,7 @@ define <16 x i16> @shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_uu_00_04_04_04_12_12_12_uu_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = <4,4,u,0,4,4,4,12,12,12,u,8,12,12,12,12> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [4,4,u,0,4,4,4,12,12,12,u,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -4831,7 +4831,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_1 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = <2,0,6,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm1 = [2,0,6,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,0,1,0,1,4,5,0,1,0,1,0,1,8,9,u,u,16,17,16,17,20,21,16,17,16,17,16,17,16,17] ; AVX2-FAST-ALL-NEXT: retq @@ -4846,7 +4846,7 @@ define <16 x i16> @shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_04_04_00_04_04_04_12_uu_12_12_08_12_12_12_12: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [u,4,4,0,4,4,4,12,u,12,12,8,12,12,12,12] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5360,7 +5360,7 @@ define <16 x i16> @shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_3 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_16_01_17_06_22_07_31_08_24_09_25_14_30_15_31: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,7,u,4,7,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,7,u,4,7,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] @@ -5431,7 +5431,7 @@ define <16 x i16> @shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_2 ; ; AVX2-FAST-ALL-LABEL: shuffle_v16i16_00_20_01_21_06_16_07_25_08_28_09_29_14_24_15_25: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <2,0,4,u,6,4,u,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [2,0,4,u,6,4,u,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = 
ymm1[0,1,2,3,4,5,10,11,u,u,u,u,u,u,u,u,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX2-FAST-ALL-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,3,2,3,4,7,6,7] @@ -5721,7 +5721,7 @@ define <16 x i16> @shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_04_04_03_18_uu_uu_uu_uu_12_12_11_26_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,4,3,18,u,u,u,u,12,12,11,26,u,u,u,u] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5766,7 +5766,7 @@ define <16 x i16> @shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_00_03_02_21_uu_uu_uu_uu_08_11_10_29_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,3,2,21,u,u,u,u,8,11,10,29,u,u,u,u] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5818,7 +5818,7 @@ define <16 x i16> @shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_00_01_02_21_uu_uu_uu_uu_08_09_10_29_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,21,u,u,u,u,8,9,10,29,u,u,u,u] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -5861,7 +5861,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_1 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_uu_20_21_22_11_uu_uu_uu_uu_28_29_30_11: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,4,5,6,27,u,u,u,u,12,13,14,27] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -5905,7 +5905,7 @@ define <16 x i16> @shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_u ; ; AVX512VL-LABEL: shuffle_v16i16_20_21_22_03_uu_uu_uu_uu_28_29_30_11_uu_uu_uu_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [4,5,6,19,u,u,u,u,12,13,14,27,u,u,u,u] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -6063,7 +6063,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_2 ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_uu_01_uu_05_07_25_uu_uu_uu_09_uu_13_15_25: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,1,u,5,7,25,u,u,u,9,u,13,15,25] ; AVX512VL-NEXT: vpermt2w %ymm1, %ymm2, %ymm0 ; AVX512VL-NEXT: retq ; @@ -6115,7 +6115,7 @@ define <16 x i16> @shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_u ; ; AVX512VL-LABEL: shuffle_v16i16_uu_uu_04_uu_16_18_20_uu_uu_uu_12_uu_24_26_28_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,20,u,0,2,4,u,u,u,28,u,8,10,12,u] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -6722,7 +6722,7 @@ define <16 x i16> @shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_u ; ; AVX512VL-LABEL: shuffle_v16i16_23_uu_03_uu_20_20_05_uu_31_uu_11_uu_28_28_13_uu: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = <7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u> +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm2 = 
[7,u,19,u,4,4,21,u,15,u,27,u,12,12,29,u] ; AVX512VL-NEXT: vpermi2w %ymm0, %ymm1, %ymm2 ; AVX512VL-NEXT: vmovdqa %ymm2, %ymm0 ; AVX512VL-NEXT: retq @@ -7458,10 +7458,10 @@ define <16 x i16> @PR24935(<16 x i16> %a, <16 x i16> %b) { ; ; AVX2-FAST-ALL-LABEL: PR24935: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <0,5,u,u,0,4,6,2> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [0,5,u,u,0,4,6,2] ; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,zero,ymm0[2,3],zero,zero,zero,zero,zero,zero,ymm0[6,7],zero,zero,ymm0[18,19,22,23],zero,zero,zero,zero,ymm0[26,27,28,29,16,17],zero,zero -; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = <5,6,3,0,0,6,4,u> +; AVX2-FAST-ALL-NEXT: vmovdqa {{.*#+}} ymm2 = [5,6,3,0,0,6,4,u] ; AVX2-FAST-ALL-NEXT: vpermd %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[2,3,0,1],zero,zero,ymm1[6,7,0,1,10,11],zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[16,17,20,21],zero,zero,zero,zero,zero,zero,ymm1[24,25] ; AVX2-FAST-ALL-NEXT: vpor %ymm0, %ymm1, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index f5c5ba6631750..baa942794ccd8 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -3411,7 +3411,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX2: # %bb.0: ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,12,12,13,1,6,13,7,u,u,u,u,u,u,u,u,u,u,u,u,17,22,29,23,20,19,u,19,u,u,u,u] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,5,0,6,u,1,u> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,5,0,6,u,1,u] ; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] @@ -3424,7 +3424,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; AVX512VLBW-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; AVX512VLBW: # %bb.0: ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,12,12,13,1,6,13,7,u,u,u,u,u,u,u,u,u,u,u,u,17,22,29,23,20,19,u,19,u,u,u,u] -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,5,0,6,u,1,u> +; AVX512VLBW-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,5,0,6,u,1,u] ; AVX512VLBW-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX512VLBW-NEXT: vpshufb {{.*#+}} ymm0 = ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] @@ -3460,7 +3460,7 @@ define <32 x i8> @shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_ ; XOPAVX2-LABEL: shuffle_v32i8_42_45_12_13_35_35_60_40_17_22_29_44_33_12_48_51_20_19_52_19_49_54_37_32_48_42_59_07_36_34_36_39: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,12,12,13,1,6,13,7,u,u,u,u,u,u,u,u,u,u,u,u,17,22,29,23,20,19,u,19,u,u,u,u] -; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,5,0,6,u,1,u> +; XOPAVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,5,0,6,u,1,u] ; XOPAVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; XOPAVX2-NEXT: vpshufb {{.*#+}} ymm2 = 
ymm1[10,13],zero,zero,ymm1[3,3],zero,ymm1[8],zero,zero,zero,ymm1[12,1],zero,zero,zero,zero,zero,ymm1[20],zero,ymm1[17,22],zero,zero,ymm1[16],zero,ymm1[27],zero,zero,zero,zero,zero ; XOPAVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,3,0,1] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index 30b743cb7bdfd..a69a1a18f26e5 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -445,7 +445,7 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) { ; ; AVX2-SLOW-LABEL: shuffle_v8f32_08991abb: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-SLOW-NEXT: vmovaps {{.*#+}} ymm2 = [u,0,1,1,u,2,3,3] ; AVX2-SLOW-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-SLOW-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] @@ -454,16 +454,16 @@ define <8 x float> @shuffle_v8f32_08991abb(<8 x float> %a, <8 x float> %b) { ; ; AVX2-FAST-ALL-LABEL: shuffle_v8f32_08991abb: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,u,u,u,1,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [u,0,1,1,u,2,3,3] ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-ALL-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: shuffle_v8f32_08991abb: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-PERLANE-NEXT: vmovaps {{.*#+}} ymm2 = [u,0,1,1,u,2,3,3] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1,1,3] ; AVX2-FAST-PERLANE-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[0,1,1,3] @@ -533,7 +533,7 @@ define <8 x float> @shuffle_v8f32_09ab1def(<8 x float> %a, <8 x float> %b) { ; ; AVX2-FAST-ALL-LABEL: shuffle_v8f32_09ab1def: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,u,u,u,1,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-ALL-NEXT: retq @@ -956,7 +956,7 @@ define <8 x float> @shuffle_v8f32_f511235a(<8 x float> %a, <8 x float> %b) { ; AVX2-FAST-ALL: # %bb.0: ; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [7,2,7,2,7,2,7,2] ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [u,5,1,1,2,3,5,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6],ymm1[7] ; AVX2-FAST-ALL-NEXT: retq @@ -1397,7 +1397,7 @@ define <8 x float> @shuffle_v8f32_089abcde(<8 x float> %a, <8 x float> %b) { ; ; AVX2-LABEL: shuffle_v8f32_089abcde: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [u,0,1,2,3,4,5,6] ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2-NEXT: retq @@ -1452,7 +1452,7 @@ define <8 x float> @shuffle_v8f32_01289abc(<8 x float> %a, <8 x float> %b) { ; ; AVX2-LABEL: shuffle_v8f32_01289abc: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [u,u,u,0,1,2,3,4] ; AVX2-NEXT: vpermps 
%ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: retq @@ -2095,9 +2095,9 @@ define <8 x i32> @shuffle_v8i32_08991abb(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2-FAST-ALL-LABEL: shuffle_v8i32_08991abb: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,u,u,u,1,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [u,0,1,1,u,2,3,3] ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-ALL-NEXT: retq @@ -2174,7 +2174,7 @@ define <8 x i32> @shuffle_v8i32_09ab1def(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2-FAST-ALL-LABEL: shuffle_v8i32_09ab1def: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [0,u,u,u,1,u,u,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-ALL-NEXT: retq @@ -2526,7 +2526,7 @@ define <8 x i32> @shuffle_v8i32_002u6u44(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_002u6u44: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,0,2,u,6,u,4,4> +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,2,u,6,u,4,4] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2541,7 +2541,7 @@ define <8 x i32> @shuffle_v8i32_00uu66uu(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_00uu66uu: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,0,u,u,6,6,u,u> +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,u,u,6,6,u,u] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2556,7 +2556,7 @@ define <8 x i32> @shuffle_v8i32_103245uu(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_103245uu: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <1,0,3,2,4,5,u,u> +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,0,3,2,4,5,u,u] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2571,7 +2571,7 @@ define <8 x i32> @shuffle_v8i32_1133uu67(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_1133uu67: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <1,1,3,3,u,u,6,7> +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [1,1,3,3,u,u,6,7] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2586,7 +2586,7 @@ define <8 x i32> @shuffle_v8i32_0uu354uu(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_0uu354uu: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = <0,u,u,3,5,4,u,u> +; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,u,u,3,5,4,u,u] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2601,7 +2601,7 @@ define <8 x i32> @shuffle_v8i32_uuu3uu66(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_uuu3uu66: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} ymm1 = +; AVX2OR512VL-NEXT: 
vmovaps {{.*#+}} ymm1 = [u,u,u,3,u,u,6,6] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -2628,7 +2628,7 @@ define <8 x i32> @shuffle_v8i32_6caa87e5(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2-FAST-ALL-LABEL: shuffle_v8i32_6caa87e5: ; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-FAST-ALL-NEXT: vmovaps {{.*#+}} ymm2 = [u,4,2,2,0,u,6,u] ; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-FAST-ALL-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[3,1,3,2] ; AVX2-FAST-ALL-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4],ymm0[5],ymm1[6],ymm0[7] @@ -2958,7 +2958,7 @@ define <8 x i32> @shuffle_v8i32_089abcde(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2-LABEL: shuffle_v8i32_089abcde: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [u,0,1,2,3,4,5,6] ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3,4,5,6,7] ; AVX2-NEXT: retq @@ -3031,7 +3031,7 @@ define <8 x i32> @shuffle_v8i32_01289abc(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2-LABEL: shuffle_v8i32_01289abc: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [u,u,u,0,1,2,3,4] ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2],ymm1[3,4,5,6,7] ; AVX2-NEXT: retq @@ -3244,7 +3244,7 @@ define <8 x i32> @shuffle_v8i32_0dcd3f14(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2-LABEL: shuffle_v8i32_0dcd3f14: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = <0,u,u,u,3,u,1,4> +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [0,u,u,u,3,u,1,4] ; AVX2-NEXT: vpermps %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,2,3,3] ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5],ymm0[6,7] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll index 370a69696d850..9c0ffd4558fc8 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v16.ll @@ -233,7 +233,7 @@ define <16 x i32> @shuffle_v16i32_01_02_03_16_05_06_07_20_09_10_11_24_13_14_15_2 define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x float> %a) { ; ALL-LABEL: shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1> +; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1] ; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %c = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> @@ -243,7 +243,7 @@ define <16 x float> @shuffle_v16f32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(< define <16 x i32> @shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01(<16 x i32> %a) { ; ALL-LABEL: shuffle_v16i32_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1> +; ALL-NEXT: vmovaps {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1] ; ALL-NEXT: vpermps %zmm0, %zmm1, %zmm0 ; ALL-NEXT: retq %c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index 4668d7b6870ef..fa0ec33bf3408 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -42,20 +42,20 @@ define <32 x i16> 
@shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_ ; KNL-NEXT: vpshufb {{.*#+}} ymm1 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19] ; KNL-NEXT: vpermq {{.*#+}} ymm2 = ymm0[2,3,0,1] ; KNL-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31] -; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u> +; KNL-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u] ; KNL-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm3 ; KNL-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm3[0,1,2,3,4,5,6],ymm0[7],ymm3[8,9,10,11,12,13,14],ymm0[15] ; KNL-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19] -; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255> +; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255] ; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; KNL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: ; SKX: ## %bb.0: -; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31> +; SKX-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31] ; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index 843b285ae1c36..4df5307316a42 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -556,14 +556,14 @@ define <64 x i8> @shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_ ; AVX512F-LABEL: shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_49_51_55_57_61_63_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512F-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm2 = 
xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] @@ -583,14 +583,14 @@ define <64 x i8> @shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_ ; AVX512BW-LABEL: shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_49_51_55_57_61_63_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] @@ -610,14 +610,14 @@ define <64 x i8> @shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_ ; AVX512DQ-LABEL: shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_49_51_55_57_61_63_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = <128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512DQ-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,1,5,7,11,13,17,19,23,25,29,31,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendvb %ymm5, %ymm2, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm2[1,5,7,11,13] @@ -636,7 +636,7 @@ define <64 x i8> @shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_ ; ; AVX512VBMI-LABEL: shuffle_v64i8_01_03_07_09_13_15_19_21_25_27_31_33_37_39_43_45_49_51_55_57_61_63_67_69_73_75_79_81_85_87_91_93_97_99_103_105_109_111_115_117_121_123_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = 
<1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,67,69,73,75,79,81,85,87,91,93,97,99,103,105,109,111,115,117,121,123,127,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,67,69,73,75,79,81,85,87,91,93,97,99,103,105,109,111,115,117,121,123,127,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %r = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> @@ -647,10 +647,10 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512F-LABEL: shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm3 @@ -659,7 +659,7 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] @@ -674,10 +674,10 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512BW-LABEL: shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 @@ -686,7 +686,7 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: 
vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] @@ -701,10 +701,10 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512DQ-LABEL: shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = <1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm0, %xmm3 @@ -713,7 +713,7 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[3,5,9,11,15] @@ -727,7 +727,7 @@ define <64 x i8> @shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_ ; ; AVX512VBMI-LABEL: shuffle_v64i8_01_05_07_11_13_17_19_23_25_29_31_35_37_41_43_47_49_53_55_59_61_65_67_71_73_77_79_83_85_89_91_95_97_101_103_107_109_113_115_119_121_125_127_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = <65,69,71,75,77,81,83,87,89,93,95,99,101,105,107,111,113,117,119,123,125,1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [65,69,71,75,77,81,83,87,89,93,95,99,101,105,107,111,113,117,119,123,125,1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm1, %zmm2 ; AVX512VBMI-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq @@ -740,9 +740,9 @@ define <64 x i8> @shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_ ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512F-NEXT: vmovdqa 
{{.*#+}} xmm5 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vextracti128 $1, %ymm0, %xmm3 @@ -767,9 +767,9 @@ define <64 x i8> @shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_ ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vextracti128 $1, %ymm0, %xmm3 @@ -794,9 +794,9 @@ define <64 x i8> @shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_ ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm4 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vextracti128 $1, %ymm0, %xmm3 @@ -819,7 +819,7 @@ define <64 x i8> @shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_ ; ; AVX512VBMI-LABEL: shuffle_v64i8_02_04_08_10_14_16_20_22_26_28_32_34_38_40_44_46_50_52_56_58_62_64_68_70_74_76_80_82_86_88_92_94_98_100_104_106_110_112_116_118_122_124_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = <2,4,8,10,14,16,20,22,26,28,32,34,38,40,44,46,50,52,56,58,62,64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [2,4,8,10,14,16,20,22,26,28,32,34,38,40,44,46,50,52,56,58,62,64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermt2b %zmm1, %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq %r = shufflevector <64 x i8> %a0, <64 x i8> %a1, <64 x i32> @@ -830,10 +830,10 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512F-LABEL: shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm5, 
%xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, %xmm0, %xmm3 @@ -842,7 +842,7 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512F-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] @@ -857,10 +857,10 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512BW-LABEL: shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512BW-NEXT: vpshufb %xmm3, %xmm0, %xmm3 @@ -869,7 +869,7 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512BW-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512BW-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] @@ -884,10 +884,10 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512DQ-LABEL: shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm1, %ymm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = <0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm3 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX512DQ-NEXT: vextracti128 $1, %ymm2, %xmm2 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] ; AVX512DQ-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512DQ-NEXT: vpshufb 
%xmm3, %xmm0, %xmm3 @@ -896,7 +896,7 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm0 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = <255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,u,u,u,u,u,u,u,u,u,u,u] ; AVX512DQ-NEXT: vpblendvb %ymm4, %ymm3, %ymm0, %ymm0 ; AVX512DQ-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u,u,u,u,u],zero,zero,zero,zero,zero,zero,xmm3[2,4,8,10,14] @@ -910,7 +910,7 @@ define <64 x i8> @shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_ ; ; AVX512VBMI-LABEL: shuffle_v64i8_00_04_06_10_12_16_18_22_24_28_30_34_36_40_42_46_48_52_54_58_60_64_66_70_72_76_78_82_84_88_90_94_96_100_102_106_108_112_114_118_120_124_126_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u_u: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = <64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,0,2,6,8,12,14,18,20,24,26,30,32,36,38,42,44,48,50,54,56,60,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm2 = [64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,0,2,6,8,12,14,18,20,24,26,30,32,36,38,42,44,48,50,54,56,60,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermi2b %zmm0, %zmm1, %zmm2 ; AVX512VBMI-NEXT: vmovdqa64 %zmm2, %zmm0 ; AVX512VBMI-NEXT: retq @@ -989,7 +989,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -1006,7 +1006,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -1023,7 +1023,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,3,5,9,11,15,17,21,23,27,29,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-NEXT: vmovdqa 
{{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -1050,7 +1050,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512F-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] @@ -1067,7 +1067,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512BW-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512BW-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; AVX512BW-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] @@ -1084,7 +1084,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,0,2,6,8,12,14,18,20,24,26,30,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1 ; AVX512DQ-NEXT: vextracti32x4 $2, %zmm0, %xmm2 ; AVX512DQ-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3,4],xmm1[5,6,7] @@ -1111,7 +1111,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512F-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512F-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -1128,7 +1128,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512BW-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; 
AVX512BW-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512BW-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -1145,7 +1145,7 @@ define <64 x i8> @shuffle_v64i8_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_15_ ; AVX512DQ-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX512DQ-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,2,4,8,10,14,16,20,22,26,28,u,u,u,u,u,u,u,u,u,u,u] -; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = +; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [u,u,u,u,u,u,u,u,u,u,u,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255] ; AVX512DQ-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm0, %ymm2 ; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm3 = [0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll index 2bb7dd90cc05f..9bfca824fb71a 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -687,13 +687,13 @@ define <8 x double> @shuffle_v8f64_01235466(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_002u6u44: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,u,6,u,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_002u6u44: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -703,13 +703,13 @@ define <8 x double> @shuffle_v8f64_002u6u44(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_00uu66uu(<8 x double> %a, <8 x double> %b) { ; AVX512F-LABEL: shuffle_v8f64_00uu66uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,u,u,6,6,u,u] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8f64_00uu66uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -1553,13 +1553,13 @@ define <8 x i64> @shuffle_v8i64_01235466(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_002u6u44: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,2,u,6,u,4,4> +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,2,u,6,u,4,4] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_002u6u44: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0> +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,2,0,u,u,6,0,u,u,4,0,4,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = 
shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1569,13 +1569,13 @@ define <8 x i64> @shuffle_v8i64_002u6u44(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_00uu66uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,6,6,u,u> +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,u,u,6,6,u,u] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_00uu66uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u> +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,0,u,u,u,u,6,0,6,0,u,u,u,u] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1585,13 +1585,13 @@ define <8 x i64> @shuffle_v8i64_00uu66uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_103245uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <1,0,3,2,4,5,u,u> +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,3,2,4,5,u,u] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_103245uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u> +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,0,0,3,0,2,0,4,0,5,0,u,u,u,u] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1601,13 +1601,13 @@ define <8 x i64> @shuffle_v8i64_103245uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_1133uu67: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <1,1,3,3,u,u,6,7> +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [1,1,3,3,u,u,6,7] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_1133uu67: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0> +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [1,0,1,0,3,0,3,0,u,u,u,u,6,0,7,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1617,13 +1617,13 @@ define <8 x i64> @shuffle_v8i64_1133uu67(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_0uu354uu: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = <0,u,u,3,5,4,u,u> +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,u,u,3,5,4,u,u] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: shuffle_v8i64_0uu354uu: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = <0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u> +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,u,u,u,u,3,0,5,0,4,0,u,u,u,u] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> @@ -1633,13 +1633,13 @@ define <8 x i64> @shuffle_v8i64_0uu354uu(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_uuu3uu66(<8 x i64> %a, <8 x i64> %b) { ; AVX512F-LABEL: shuffle_v8i64_uuu3uu66: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = +; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [u,u,u,3,u,u,6,6] ; AVX512F-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512F-32-LABEL: 
shuffle_v8i64_uuu3uu66: ; AVX512F-32: # %bb.0: -; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = +; AVX512F-32-NEXT: vmovaps {{.*#+}} zmm1 = [u,u,u,u,u,u,3,0,u,u,u,u,6,0,6,0] ; AVX512F-32-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; AVX512F-32-NEXT: retl %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll index 3aafb1af366ba..917f26dd3cadf 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx.ll @@ -362,7 +362,7 @@ define void @PR39483() { ; X86-AVX2: # %bb.0: # %entry ; X86-AVX2-NEXT: vmovups 32, %ymm0 ; X86-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] ; X86-AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X86-AVX2-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7] ; X86-AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] @@ -403,7 +403,7 @@ define void @PR39483() { ; X64-AVX2: # %bb.0: # %entry ; X64-AVX2-NEXT: vmovups 32, %ymm0 ; X64-AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],mem[2],ymm0[3,4],mem[5],ymm0[6,7] -; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = <2,5,0,3,6,u,u,u> +; X64-AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [2,5,0,3,6,u,u,u] ; X64-AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vpermilps {{.*#+}} ymm1 = mem[0,1,0,3,4,5,4,7] ; X64-AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[0,1,0,3] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll index 9b26f95f51478..f2e42d51a7eb6 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -128,7 +128,7 @@ define <8 x i32> @combine_as_vpermd(<8 x i32> %a0) { define <8 x float> @combine_as_vpermps(<8 x float> %a0) { ; CHECK-LABEL: combine_as_vpermps: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = <6,4,7,5,1,u,4,7> +; CHECK-NEXT: vmovaps {{.*#+}} ymm1 = [6,4,7,5,1,u,4,7] ; CHECK-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %1 = shufflevector <8 x float> %a0, <8 x float> undef, <8 x i32> @@ -784,7 +784,7 @@ define <8 x float> @constant_fold_permps() { define <32 x i8> @constant_fold_pshufb_256() { ; CHECK-LABEL: constant_fold_pshufb_256: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250> +; CHECK-NEXT: vmovaps {{.*#+}} ymm0 = [14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9,255,0,0,0,u,u,0,0,241,0,0,0,0,0,249,250] ; CHECK-NEXT: ret{{[l|q]}} %1 = tail call <32 x i8> @llvm.x86.avx2.pshuf.b(<32 x i8> , <32 x i8> ) ret <32 x i8> %1 @@ -832,7 +832,7 @@ define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1 ; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX2-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = +; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [u,u,7,2,u,u,3,2] ; AVX2-NEXT: vpermps %ymm1, %ymm2, %ymm1 ; AVX2-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] ; AVX2-NEXT: ret{{[l|q]}} @@ -843,7 +843,7 @@ define internal fastcc <8 x float> @PR34577(<8 x float> %inp0, <8 x float> %inp1 ; AVX512-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[1,1,1,1] ; AVX512-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm0[4,5],ymm2[6,7] -; 
AVX512-NEXT: vmovaps {{.*#+}} ymm0 = <23,18,7,2,20,u,3,2> +; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,18,7,2,20,u,3,2] ; AVX512-NEXT: vpermi2ps %zmm2, %zmm1, %zmm0 ; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0 ; AVX512-NEXT: ret{{[l|q]}} diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll index 6f128a0034397..5c16fbb7be4fd 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll @@ -31,9 +31,9 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-NEXT: movdqa (%rsi), %xmm4 ; SSE-NEXT: movdqa 16(%rsi), %xmm5 ; SSE-NEXT: movdqa 32(%rsi), %xmm3 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = <128,128,128,128,128,128,2,5,8,11,14,u,u,u,u,u> +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [128,128,128,128,128,128,2,5,8,11,14,u,u,u,u,u] ; SSE-NEXT: pshufb %xmm6, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = <0,3,6,9,12,15,128,128,128,128,128,u,u,u,u,u> +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [0,3,6,9,12,15,128,128,128,128,128,u,u,u,u,u] ; SSE-NEXT: pshufb %xmm7, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshufb %xmm6, %xmm5 @@ -44,9 +44,9 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) { ; SSE-NEXT: pmullw %xmm5, %xmm0 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm5, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm6 = <8,u,9,u,10,u,128,u,128,u,128,u,128,u,128,u> +; SSE-NEXT: movdqa {{.*#+}} xmm6 = [8,u,9,u,10,u,128,u,128,u,128,u,128,u,128,u] ; SSE-NEXT: pshufb %xmm6, %xmm4 -; SSE-NEXT: movdqa {{.*#+}} xmm7 = <128,u,128,u,128,u,1,u,4,u,7,u,10,u,13,u> +; SSE-NEXT: movdqa {{.*#+}} xmm7 = [128,u,128,u,128,u,1,u,4,u,7,u,10,u,13,u] ; SSE-NEXT: pshufb %xmm7, %xmm3 ; SSE-NEXT: por %xmm4, %xmm3 ; SSE-NEXT: pshufb %xmm6, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll index 83f40046d4b5f..5c035346415b0 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll @@ -766,12 +766,12 @@ define <16 x i8> @combine_and_pshufb_or_pshufb(<16 x i8> %a0, <16 x i8> %a1) { define <16 x i8> @constant_fold_pshufb() { ; SSE-LABEL: constant_fold_pshufb: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> +; SSE-NEXT: movaps {{.*#+}} xmm0 = [14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9] ; SSE-NEXT: retq ; ; AVX-LABEL: constant_fold_pshufb: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = <14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9> +; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [14,0,0,0,u,u,0,0,0,0,0,0,0,0,8,9] ; AVX-NEXT: retq %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> , <16 x i8> ) ret <16 x i8> %1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index d0334dfb66e8a..1e2ee7b99a608 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1744,7 +1744,7 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps {{.*#+}} xmm0 = <0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -1838,7 +1838,7 @@ define <4 x i8> @combine_test4c(ptr %a, ptr 
%b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps {{.*#+}} xmm0 = <255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u> +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -2671,14 +2671,14 @@ define void @combine_scalar_load_with_blend_with_zero(ptr %a0, ptr %a1) { define <4 x float> @combine_constant_insertion_v4f32(float %f) { ; SSE2-LABEL: combine_constant_insertion_v4f32: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm1 = +; SSE2-NEXT: movaps {{.*#+}} xmm1 = [u,4.0E+0,5.0E+0,3.0E+0] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: movaps %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_constant_insertion_v4f32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movaps {{.*#+}} xmm1 = +; SSSE3-NEXT: movaps {{.*#+}} xmm1 = [u,4.0E+0,5.0E+0,3.0E+0] ; SSSE3-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSSE3-NEXT: movaps %xmm1, %xmm0 ; SSSE3-NEXT: retq @@ -2701,26 +2701,26 @@ define <4 x i32> @combine_constant_insertion_v4i32(i32 %f) { ; SSE2-LABEL: combine_constant_insertion_v4i32: ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm1 -; SSE2-NEXT: movaps {{.*#+}} xmm0 = +; SSE2-NEXT: movaps {{.*#+}} xmm0 = [u,4,5,30] ; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: combine_constant_insertion_v4i32: ; SSSE3: # %bb.0: ; SSSE3-NEXT: movd %edi, %xmm1 -; SSSE3-NEXT: movaps {{.*#+}} xmm0 = +; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [u,4,5,30] ; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSSE3-NEXT: retq ; ; SSE41-LABEL: combine_constant_insertion_v4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [u,4,5,30] ; SSE41-NEXT: pinsrd $0, %edi, %xmm0 ; SSE41-NEXT: retq ; ; AVX-LABEL: combine_constant_insertion_v4i32: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = +; AVX-NEXT: vmovdqa {{.*#+}} xmm0 = [u,4,5,30] ; AVX-NEXT: vpinsrd $0, %edi, %xmm0, %xmm0 ; AVX-NEXT: retq %a0 = insertelement <4 x i32> undef, i32 %f, i32 0 @@ -2851,7 +2851,7 @@ define <4 x float> @PR30264(<4 x float> %x) { ; ; SSE41-LABEL: PR30264: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps {{.*#+}} xmm1 = +; SSE41-NEXT: movaps {{.*#+}} xmm1 = [u,u,4.0E+0,1.0E+0] ; SSE41-NEXT: insertps {{.*#+}} xmm1 = xmm0[0],zero,xmm1[2,3] ; SSE41-NEXT: movaps %xmm1, %xmm0 ; SSE41-NEXT: retq @@ -3298,7 +3298,7 @@ define void @PR45604(ptr %dst, ptr %src) { ; SSE41-NEXT: movdqa (%rsi), %xmm0 ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = +; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [u,0,11,0,u,0,11,0] ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1,2,3],xmm1[4],xmm2[5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,2,3] ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm3 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero @@ -3341,7 +3341,7 @@ define void @PR45604(ptr %dst, ptr %src) { ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rsi), %xmm0 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,2,0,2] -; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = <0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u> +; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,8,9,u,u,u,u,2,3,10,11,u,u,u,u,4,5,12,13,u,u,u,u,6,7,14,15,u,u,u,u] ; AVX2-NEXT: vpshufb %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = 
[11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0,11,0,0,0] ; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm3[1],ymm1[2],ymm3[3],ymm1[4],ymm3[5],ymm1[6],ymm3[7] @@ -3481,9 +3481,9 @@ define void @SpinningCube() { ; SSE2-LABEL: SpinningCube: ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSE2-NEXT: movaps {{.*#+}} xmm0 = +; SSE2-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] ; SSE2-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE2-NEXT: movapd {{.*#+}} xmm2 = +; SSE2-NEXT: movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u] ; SSE2-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSE2-NEXT: xorps %xmm3, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] @@ -3500,9 +3500,9 @@ define void @SpinningCube() { ; SSSE3-LABEL: SpinningCube: ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSSE3-NEXT: movaps {{.*#+}} xmm0 = +; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] ; SSSE3-NEXT: movss {{.*#+}} xmm1 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSSE3-NEXT: movapd {{.*#+}} xmm2 = +; SSSE3-NEXT: movapd {{.*#+}} xmm2 = [u,u,-2.0E+0,u] ; SSSE3-NEXT: movsd {{.*#+}} xmm2 = xmm1[0],xmm2[1] ; SSSE3-NEXT: xorps %xmm3, %xmm3 ; SSSE3-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,1],xmm2[2,0] @@ -3519,8 +3519,8 @@ define void @SpinningCube() { ; SSE41-LABEL: SpinningCube: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = -; SSE41-NEXT: movaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> +; SSE41-NEXT: movaps {{.*#+}} xmm0 = [u,u,u,1.0E+0] +; SSE41-NEXT: movaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u] ; SSE41-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; SSE41-NEXT: movaps %xmm1, %xmm3 ; SSE41-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm2[0] @@ -3539,7 +3539,7 @@ define void @SpinningCube() { ; AVX: # %bb.0: # %entry ; AVX-NEXT: movl $1065353216, (%rax) # imm = 0x3F800000 ; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0] -; AVX-NEXT: vmovaps {{.*#+}} xmm1 = <0.0E+0,0.0E+0,-2.0E+0,u> +; AVX-NEXT: vmovaps {{.*#+}} xmm1 = [0.0E+0,0.0E+0,-2.0E+0,u] ; AVX-NEXT: vmovss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] ; AVX-NEXT: vinsertps {{.*#+}} xmm3 = xmm1[0,1,2],xmm2[0] ; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[2,3] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 6ef203999af6e..95a57d1bdf331 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -542,7 +542,7 @@ define i8 @shuf8i1_10_2_9_u_3_u_2_u(i8 %a) { ; AVX512F-NEXT: kmovw %edi, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <8,2,10,u,3,u,2,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [8,2,10,u,3,u,2,u] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vpsllq $63, %zmm2, %zmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll index 75c26f38eb9ed..6941585c6a5f0 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v192.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v192.ll @@ -11,9 +11,9 @@ define <64 x i8> @f1(ptr %p0) { ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u> +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 
= [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u> +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2 ; AVX2-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13,1,3,7,9,13,15,0,0,0,0,0,1,5,7,11,13] @@ -22,10 +22,10 @@ define <64 x i8> @f1(ptr %p0) { ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13] ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128] ; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -51,17 +51,17 @@ define <64 x i8> @f1(ptr %p0) { ; AVX512F-LABEL: f1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13] ; AVX512F-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128] ; AVX512F-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX512F-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -92,17 +92,17 @@ define <64 x i8> @f1(ptr %p0) { ; AVX512BW-LABEL: f1: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa 112(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = <128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,3,5,9,11,15,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa 96(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = <1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [1,3,7,9,13,15,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,u,u,u,u,128,128,128,128,128,1,5,7,11,13] ; AVX512BW-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm5 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [u,u,u,u,u,u,3,5,9,11,15,128,128,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX512BW-NEXT: vpor %xmm2, %xmm5, %xmm2 ; AVX512BW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -138,7 +138,7 @@ define <64 x i8> @f1(ptr 
%p0) { ; AVX512VBMI-LABEL: f1: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = <1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,67,69,73,75,79,81,85,87,91,93,97,99,103,105,109,111,115,117,121,123,127,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,67,69,73,75,79,81,85,87,91,93,97,99,103,105,109,111,115,117,121,123,127,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermi2b 64(%rdi), %zmm0, %zmm1 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,65,69,71,75,77,81,83,87,89,93,95,99,101,105,107,111,113,117,119,123,125] ; AVX512VBMI-NEXT: vpermi2b 128(%rdi), %zmm1, %zmm0 @@ -155,9 +155,9 @@ define <64 x i8> @f2(ptr %p0) { ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u> +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15,1,5,7,11,13,0,0,0,0,0,0,3,5,9,11,15] @@ -166,10 +166,10 @@ define <64 x i8> @f2(ptr %p0) { ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15] ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128] ; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -194,18 +194,18 @@ define <64 x i8> @f2(ptr %p0) { ; AVX512F-LABEL: f2: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15] ; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = <1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 @@ -234,18 
+234,18 @@ define <64 x i8> @f2(ptr %p0) { ; AVX512BW-LABEL: f2: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,3,5,9,11,15] ; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,1,3,7,9,13,15,128,128,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [1,5,7,11,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,1,3,7,9,13,15,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 @@ -275,7 +275,7 @@ define <64 x i8> @f2(ptr %p0) { ; AVX512VBMI-LABEL: f2: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = <65,69,71,75,77,81,83,87,89,93,95,99,101,105,107,111,113,117,119,123,125,1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [65,69,71,75,77,81,83,87,89,93,95,99,101,105,107,111,113,117,119,123,125,1,3,7,9,13,15,19,21,25,27,31,33,37,39,43,45,49,51,55,57,61,63,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermi2b (%rdi), %zmm0, %zmm1 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,67,69,73,75,79,81,85,87,91,93,97,99,103,105,109,111,115,117,121,123,127] ; AVX512VBMI-NEXT: vpermi2b 128(%rdi), %zmm1, %zmm0 @@ -291,18 +291,18 @@ define <64 x i8> @f3(ptr %p0) { ; AVX2-NEXT: vmovdqa 128(%rdi), %ymm1 ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128] ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14] ; AVX2-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX2-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX2-NEXT: vmovdqa (%rdi), %xmm4 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u> +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm6 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> +; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX2-NEXT: vpor %xmm6, %xmm4, %xmm4 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14,2,4,8,10,14,0,0,0,0,0,0,2,6,8,12,14] @@ -332,10 +332,10 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512F-LABEL: f3: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 
160(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128] ; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 @@ -346,10 +346,10 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512F-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpternlogq $216, %ymm5, %ymm2, %ymm0 ; AVX512F-NEXT: vmovdqa 112(%rdi), %xmm6 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm7 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm7, %xmm6, %xmm6 ; AVX512F-NEXT: vmovdqa 96(%rdi), %xmm8 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm9 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX512F-NEXT: vpor %xmm6, %xmm8, %xmm6 ; AVX512F-NEXT: vpblendw {{.*#+}} xmm2 = xmm6[0,1,2,3,4],xmm2[5,6,7] @@ -376,18 +376,18 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512BW-LABEL: f3: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa 64(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,0,4,6,10,12,128,128,128,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,128,128,128,128,128,0,2,6,8,12,14] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,4,6,10,12,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm5, %xmm4, %xmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = <2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [2,4,8,10,14,128,128,128,128,128,u,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm4, %xmm2, %xmm2 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm4 @@ -421,7 +421,7 @@ define <64 x i8> @f3(ptr %p0) { ; AVX512VBMI-LABEL: f3: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 (%rdi), %zmm0 -; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = <2,4,8,10,14,16,20,22,26,28,32,34,38,40,44,46,50,52,56,58,62,64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [2,4,8,10,14,16,20,22,26,28,32,34,38,40,44,46,50,52,56,58,62,64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermi2b 64(%rdi), %zmm0, %zmm1 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,64,66,70,72,76,78,82,84,88,90,94,96,100,102,106,108,112,114,118,120,124,126] ; AVX512VBMI-NEXT: vpermi2b 128(%rdi), %zmm1, %zmm0 @@ -438,9 
+438,9 @@ define <64 x i8> @f4(ptr %p0) { ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 ; AVX2-NEXT: vmovdqa (%rdi), %xmm2 ; AVX2-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = <0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u> +; AVX2-NEXT: vmovdqa {{.*#+}} xmm4 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u> +; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm3 ; AVX2-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14,0,4,6,10,12,0,0,0,0,0,0,2,4,8,10,14] @@ -449,10 +449,10 @@ define <64 x i8> @f4(ptr %p0) { ; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX2-NEXT: vpblendvb %ymm6, %ymm2, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa 80(%rdi), %xmm2 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = +; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14] ; AVX2-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-NEXT: vmovdqa 64(%rdi), %xmm8 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = +; AVX2-NEXT: vmovdqa {{.*#+}} xmm9 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128] ; AVX2-NEXT: vpshufb %xmm9, %xmm8, %xmm8 ; AVX2-NEXT: vpor %xmm2, %xmm8, %xmm2 ; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 @@ -477,18 +477,18 @@ define <64 x i8> @f4(ptr %p0) { ; AVX512F-LABEL: f4: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14] ; AVX512F-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512F-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u> +; AVX512F-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] ; AVX512F-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512F-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 @@ -517,18 +517,18 @@ define <64 x i8> @f4(ptr %p0) { ; AVX512BW-LABEL: f4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa 176(%rdi), %xmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [u,u,u,u,u,128,128,128,128,128,128,2,4,8,10,14] ; AVX512BW-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX512BW-NEXT: vmovdqa 160(%rdi), %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,u,u,u,0,2,6,8,12,14,128,128,128,128,128] ; AVX512BW-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512BW-NEXT: vpor %xmm0, %xmm2, %xmm0 ; AVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512BW-NEXT: vmovdqa (%rdi), %xmm2 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm4 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = <0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm5 = [0,4,6,10,12,128,128,128,128,128,128,u,u,u,u,u] ; AVX512BW-NEXT: 
vpshufb %xmm5, %xmm2, %xmm2 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = <128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm6 = [128,128,128,128,128,0,2,6,8,12,14,u,u,u,u,u] ; AVX512BW-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX512BW-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm2, %zmm0 @@ -558,7 +558,7 @@ define <64 x i8> @f4(ptr %p0) { ; AVX512VBMI-LABEL: f4: ; AVX512VBMI: # %bb.0: ; AVX512VBMI-NEXT: vmovdqa64 64(%rdi), %zmm0 -; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = <64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,0,2,6,8,12,14,18,20,24,26,30,32,36,38,42,44,48,50,54,56,60,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u> +; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm1 = [64,68,70,74,76,80,82,86,88,92,94,98,100,104,106,110,112,116,118,122,124,0,2,6,8,12,14,18,20,24,26,30,32,36,38,42,44,48,50,54,56,60,62,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512VBMI-NEXT: vpermi2b (%rdi), %zmm0, %zmm1 ; AVX512VBMI-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,66,68,72,74,78,80,84,86,90,92,96,98,102,104,108,110,114,116,120,122,126] ; AVX512VBMI-NEXT: vpermi2b 128(%rdi), %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/widen_arith-2.ll b/llvm/test/CodeGen/X86/widen_arith-2.ll index 1a39b7635766b..a0f78db8746f4 100644 --- a/llvm/test/CodeGen/X86/widen_arith-2.ll +++ b/llvm/test/CodeGen/X86/widen_arith-2.ll @@ -9,7 +9,7 @@ define void @update(ptr %dst_i, ptr %src_i, i32 %n) nounwind { ; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: movl $0, (%esp) ; CHECK-NEXT: pcmpeqd %xmm0, %xmm0 -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = <4,4,4,4,4,4,4,4,u,u,u,u,u,u,u,u> +; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [4,4,4,4,4,4,4,4,u,u,u,u,u,u,u,u] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/widen_arith-4.ll b/llvm/test/CodeGen/X86/widen_arith-4.ll index 9f6232a69d1ae..8bb716374b482 100644 --- a/llvm/test/CodeGen/X86/widen_arith-4.ll +++ b/llvm/test/CodeGen/X86/widen_arith-4.ll @@ -14,8 +14,8 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind { ; SSE2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movw $0, -{{[0-9]+}}(%rsp) ; SSE2-NEXT: movl $0, -{{[0-9]+}}(%rsp) -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u> -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = <2,4,2,2,2,u,u,u> +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [271,271,271,271,271,u,u,u] +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [2,4,2,2,2,u,u,u] ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB0_1: # %forcond ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 @@ -48,7 +48,7 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind { ; SSE41-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; SSE41-NEXT: movw $0, -{{[0-9]+}}(%rsp) ; SSE41-NEXT: movl $0, -{{[0-9]+}}(%rsp) -; SSE41-NEXT: movdqa {{.*#+}} xmm0 = <271,271,271,271,271,u,u,u> +; SSE41-NEXT: movdqa {{.*#+}} xmm0 = [271,271,271,271,271,u,u,u] ; SSE41-NEXT: .p2align 4, 0x90 ; SSE41-NEXT: .LBB0_1: # %forcond ; SSE41-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/widen_arith-5.ll b/llvm/test/CodeGen/X86/widen_arith-5.ll index 52fcbf4e0e885..8f1cbd1767f88 100644 --- a/llvm/test/CodeGen/X86/widen_arith-5.ll +++ b/llvm/test/CodeGen/X86/widen_arith-5.ll @@ -13,7 +13,7 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind { ; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $1, -{{[0-9]+}}(%rsp) ; 
CHECK-NEXT: movl $0, -{{[0-9]+}}(%rsp) -; CHECK-NEXT: movdqa {{.*#+}} xmm0 = <3,3,3,u> +; CHECK-NEXT: movdqa {{.*#+}} xmm0 = [3,3,3,u] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/widen_arith-6.ll b/llvm/test/CodeGen/X86/widen_arith-6.ll index 5fea31688e993..239fe25b0c4a9 100644 --- a/llvm/test/CodeGen/X86/widen_arith-6.ll +++ b/llvm/test/CodeGen/X86/widen_arith-6.ll @@ -14,7 +14,7 @@ define void @update(ptr %dst, ptr %src, i32 %n) nounwind { ; CHECK-NEXT: movl $1073741824, {{[0-9]+}}(%esp) # imm = 0x40000000 ; CHECK-NEXT: movl $1065353216, {{[0-9]+}}(%esp) # imm = 0x3F800000 ; CHECK-NEXT: movl $0, {{[0-9]+}}(%esp) -; CHECK-NEXT: movaps {{.*#+}} xmm0 = <1.97604004E+3,1.97604004E+3,1.97604004E+3,u> +; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.97604004E+3,1.97604004E+3,1.97604004E+3,u] ; CHECK-NEXT: .p2align 4, 0x90 ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 4f20a9d5db6ac..742a3a0c66a2a 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -379,7 +379,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind { ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1OR2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = <1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u> +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm2 = [1,u,5,u,9,u,13,u,13,u,5,u,12,u,13,u] ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX1OR2-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -388,7 +388,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind { ; AVX1OR2-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0],xmm3[1],xmm0[2],xmm3[3],xmm0[4],xmm3[5],xmm0[6],xmm3[7] ; AVX1OR2-NEXT: vpackusdw %xmm4, %xmm3, %xmm3 ; AVX1OR2-NEXT: vpaddb %xmm2, %xmm3, %xmm2 -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = <3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u> +; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u] ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm4 ; AVX1OR2-NEXT: vpshufb %xmm3, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] @@ -404,7 +404,7 @@ define <8 x i8> @interleaved_load_vf8_i8_stride4(ptr %ptr) nounwind { ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqu (%rdi), %ymm0 ; AVX512-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = <3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u> +; AVX512-NEXT: vmovdqa {{.*#+}} xmm2 = [3,u,7,u,11,u,15,u,7,u,15,u,6,u,7,u] ; AVX512-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX512-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] @@ -1080,9 +1080,9 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x ; AVX1-NEXT: vmovdqa %ymm0, %ymm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm9 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = +; AVX1-NEXT: vmovdqa {{.*#+}} xmm13 = [u,u,u,u,u,128,128,128,128,128,128,6,7,8,9,10] ; AVX1-NEXT: vpshufb %xmm13, %xmm12, %xmm6 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = +; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [u,u,u,u,u,5,6,7,8,9,10,128,128,128,128,128] ; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm11 ; AVX1-NEXT: vpshufb %xmm14, %xmm11, %xmm7 ; AVX1-NEXT: vpor %xmm6, %xmm7, %xmm0 @@ -1107,7 +1107,7 @@ define void @interleaved_store_vf64_i8_stride3(<64 x 
i8> %a, <64 x i8> %b, <64 x ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm6 # 32-byte Reload ; AVX1-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm6[8],xmm4[8],xmm6[9],xmm4[9],xmm6[10],xmm4[10],xmm6[11],xmm4[11],xmm6[12],xmm4[12],xmm6[13],xmm4[13],xmm6[14],xmm4[14],xmm6[15],xmm4[15] -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = +; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,u,u,u,4,6,8,10,12,14,7,9,11,13,15] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm9 @@ -1290,9 +1290,9 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){ ; AVX1-NEXT: vpshufb %xmm9, %xmm10, %xmm7 ; AVX1-NEXT: vpshufb %xmm9, %xmm11, %xmm8 ; AVX1-NEXT: vpshufb %xmm9, %xmm13, %xmm9 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = <1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u> +; AVX1-NEXT: vmovdqa {{.*#+}} xmm14 = [1,4,7,10,13,128,128,128,128,128,128,u,u,u,u,u] ; AVX1-NEXT: vpshufb %xmm14, %xmm5, %xmm5 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = <128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u> +; AVX1-NEXT: vmovdqa {{.*#+}} xmm15 = [128,128,128,128,128,0,3,6,9,12,15,u,u,u,u,u] ; AVX1-NEXT: vpshufb %xmm15, %xmm2, %xmm12 ; AVX1-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-NEXT: vpor %xmm5, %xmm12, %xmm0 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index 8d6c1483d817f..023ac96181d3c 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -2329,7 +2329,7 @@ define void @vec256_v8i32_to_v2i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,u,u,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] @@ -2855,7 +2855,7 @@ define void @vec384_v48i8_to_v16i24_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2,u,3,3,u,4,4,u,5> +; AVX512BW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2,u,3,3,u,4,4,u,5] ; AVX512BW-NEXT: vpermw %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[11],zero,zero,xmm0[12],zero,zero,xmm0[13],zero,zero,xmm0[14],zero,zero,xmm0[15],zero,zero @@ -3824,7 +3824,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> +; AVX2-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] ; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] @@ -3841,7 +3841,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = 
<0,0,u,1,1,u,2,2> +; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero @@ -3856,7 +3856,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero @@ -3871,7 +3871,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F-SLOW: # %bb.0: ; AVX512F-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> +; AVX512F-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] ; AVX512F-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] @@ -3888,7 +3888,7 @@ define void @vec384_v24i16_to_v8i48_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F-FAST: # %bb.0: ; AVX512F-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = <0,0,u,1,1,u,2,2> +; AVX512F-FAST-NEXT: vmovdqa {{.*#+}} ymm1 = [0,0,u,1,1,u,2,2] ; AVX512F-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[12,13],zero,zero,zero,zero,xmm0[14,15],zero,zero,zero,zero @@ -4855,7 +4855,7 @@ define void @vec384_v12i32_to_v4i96_factor3(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,1,2,17,4,5,18,7,8,19,10,11,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,17,4,5,18,7,8,19,10,11,u,u,u,u] ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -5009,7 +5009,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,u,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[8,9,10,11],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -5024,7 +5024,7 @@ define void @vec384_v12i32_to_v3i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = <16,1,2,3,17,5,6,7,18,9,10,11,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [16,1,2,3,17,5,6,7,18,9,10,11,u,u,u,u] ; AVX512F-NEXT: vpxor %xmm2, 
%xmm2, %xmm2 ; AVX512F-NEXT: vpermt2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 @@ -5369,7 +5369,7 @@ define void @vec384_v6i64_to_v3i128_factor2(ptr %in.vec.base.ptr, ptr %in.vec.bi ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,9,1,11,2,13,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,9,1,11,2,13,u,u] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -7178,10 +7178,10 @@ define void @vec512_v16i32_to_v4i128_factor4(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = <0,u,u,u,1,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm2 = [0,u,u,u,1,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm2 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm2 = ymm2[0],ymm1[1,2,3],ymm2[4],ymm1[5,6,7] -; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = <2,u,u,u,3,u,u,u> +; AVX2-FAST-NEXT: vmovdqa {{.*#+}} ymm3 = [2,u,u,u,3,u,u,u] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm3, %ymm0 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3],ymm0[4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index a7b1fa24ce17d..b4c79aff5cef1 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -4736,7 +4736,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4754,7 +4754,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4941,7 +4941,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -4959,7 +4959,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5099,7 +5099,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5117,7 +5117,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5374,7 +5374,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -5392,7 +5392,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX512DQ-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index b88e2921484d9..23f02e9245eed 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -875,7 +875,7 @@ define void 
@vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = <1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u> +; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -2968,7 +2968,7 @@ define void @vec384_i16_widen_to_i32_factor2_broadcast_to_v12i32_factor12(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,25,0,27,0,29,0,31,0,41,0,43,0,45,0,47,0,49,0,51,0,53,0,55,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3123,7 +3123,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,25,26,0,28,29,0,31,40,0,42,43,0,45,46,0,48,49,0,51,52,0,54,55,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3307,7 +3307,7 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,25,26,27,0,29,30,31,0,41,42,43,0,45,46,47,0,49,50,51,0,53,54,55,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpermi2w %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3465,7 +3465,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX512BW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <32,57,58,59,60,61,32,63,8,9,10,11,32,13,14,15,16,17,32,19,20,21,22,23,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,58,59,60,61,32,63,8,9,10,11,32,13,14,15,16,17,32,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3580,7 +3580,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; ; AVX512BW-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <32,57,58,59,60,61,62,63,32,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,58,59,60,61,62,63,32,9,10,11,12,13,14,15,32,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3693,7 +3693,7 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; ; AVX512BW-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <32,57,58,59,60,61,62,63,8,9,10,11,32,13,14,15,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [32,57,58,59,60,61,62,63,8,9,10,11,32,13,14,15,16,17,18,19,20,21,22,23,u,u,u,u,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2w (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -3822,7 +3822,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -3836,7 +3836,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -3850,7 +3850,7 @@ define void @vec384_i32_widen_to_i64_factor2_broadcast_to_v6i64_factor6(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,0,15,0,21,0,23,0,25,0,27,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -3978,7 +3978,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -3992,7 +3992,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -4006,7 +4006,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,0,20,21,0,23,24,0,26,27,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -4092,7 +4092,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] ; AVX512F-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -4106,7 +4106,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] ; AVX512DQ-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -4120,7 +4120,7 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,13,14,15,0,21,22,23,0,25,26,27,u,u,u,u] ; AVX512BW-NEXT: vpermi2d %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -4230,7 +4230,7 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; ; AVX512BW-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = <16,29,30,31,4,5,16,7,8,9,10,11,u,u,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm0 = [16,29,30,31,4,5,16,7,8,9,10,11,u,u,u,u] ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpermt2d (%rdi), %zmm0, %zmm1 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm1, %zmm0 @@ -4315,7 +4315,7 @@ define void 
@vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> +; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] ; AVX512F-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -4329,7 +4329,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512DQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> +; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] ; AVX512DQ-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512DQ-NEXT: vextracti64x4 $1, %zmm2, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 @@ -4343,7 +4343,7 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,0,11,0,13,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,0,11,0,13,u,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -4453,7 +4453,7 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,7,10,0,12,13,u,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,7,10,0,12,13,u,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm2, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rdx) @@ -4919,7 +4919,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = <0,u,10,u,0,u,14,u> +; AVX512BW-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,u,10,u,0,u,14,u] ; AVX512BW-NEXT: vpermi2q %zmm1, %zmm0, %zmm2 ; AVX512BW-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm2, %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 From e7b4ff8119403509da3e7941dcb86b1c6a6d61c5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 23 Jan 2024 12:37:58 +0000 Subject: [PATCH 609/843] [AArch64] Add vec3 tests with add between load and store. 
Extra tests for https://github.com/llvm/llvm-project/pull/78637 https://github.com/llvm/llvm-project/pull/78632 --- .../AArch64/vec3-loads-ext-trunc-stores.ll | 106 ++++++++++++++++++ 1 file changed, 106 insertions(+) diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll index 9040e9007c084..2cb103bd0b3e9 100644 --- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll +++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll @@ -278,6 +278,58 @@ entry: ret void } +define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) { +; CHECK-LABEL: store_trunc_add_from_64bits: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr s0, [x0] +; CHECK-NEXT: add x9, x0, #4 +; CHECK-NEXT: Lloh0: +; CHECK-NEXT: adrp x8, lCPI7_0@PAGE +; CHECK-NEXT: Lloh1: +; CHECK-NEXT: ldr d1, [x8, lCPI7_0@PAGEOFF] +; CHECK-NEXT: ld1.h { v0 }[2], [x9] +; CHECK-NEXT: add.4h v0, v0, v1 +; CHECK-NEXT: xtn.8b v1, v0 +; CHECK-NEXT: umov.h w8, v0[2] +; CHECK-NEXT: str s1, [sp, #12] +; CHECK-NEXT: ldrh w9, [sp, #12] +; CHECK-NEXT: strb w8, [x1, #2] +; CHECK-NEXT: strh w9, [x1] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh0, Lloh1 +; +; BE-LABEL: store_trunc_add_from_64bits: +; BE: // %bb.0: // %entry +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ldr s0, [x0] +; BE-NEXT: add x8, x0, #4 +; BE-NEXT: rev32 v0.4h, v0.4h +; BE-NEXT: ld1 { v0.h }[2], [x8] +; BE-NEXT: adrp x8, .LCPI7_0 +; BE-NEXT: add x8, x8, :lo12:.LCPI7_0 +; BE-NEXT: ld1 { v1.4h }, [x8] +; BE-NEXT: add v0.4h, v0.4h, v1.4h +; BE-NEXT: xtn v1.8b, v0.8h +; BE-NEXT: umov w8, v0.h[2] +; BE-NEXT: rev32 v1.16b, v1.16b +; BE-NEXT: str s1, [sp, #12] +; BE-NEXT: ldrh w9, [sp, #12] +; BE-NEXT: strb w8, [x1, #2] +; BE-NEXT: strh w9, [x1] +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret +entry: + %l = load <3 x i16>, ptr %src, align 1 + %a = add <3 x i16> %l, + %t = trunc <3 x i16> %a to <3 x i8> + store <3 x i8> %t, ptr %dst, align 1 + ret void +} + define void @load_ext_to_64bits(ptr %src, ptr %dst) { ; CHECK-LABEL: load_ext_to_64bits: ; CHECK: ; %bb.0: ; %entry @@ -321,6 +373,60 @@ entry: ret void } +define void @load_ext_add_to_64bits(ptr %src, ptr %dst) { +; CHECK-LABEL: load_ext_add_to_64bits: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldrh w9, [x0] +; CHECK-NEXT: Lloh2: +; CHECK-NEXT: adrp x8, lCPI9_0@PAGE +; CHECK-NEXT: Lloh3: +; CHECK-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF] +; CHECK-NEXT: add x8, x1, #4 +; CHECK-NEXT: strh w9, [sp, #12] +; CHECK-NEXT: add x9, x0, #2 +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ushll.8h v0, v0, #0 +; CHECK-NEXT: ld1.b { v0 }[4], [x9] +; CHECK-NEXT: bic.4h v0, #255, lsl #8 +; CHECK-NEXT: add.4h v0, v0, v1 +; CHECK-NEXT: st1.h { v0 }[2], [x8] +; CHECK-NEXT: str s0, [x1] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +; CHECK-NEXT: .loh AdrpLdr Lloh2, Lloh3 +; +; BE-LABEL: load_ext_add_to_64bits: +; BE: // %bb.0: // %entry +; BE-NEXT: sub sp, sp, #16 +; BE-NEXT: .cfi_def_cfa_offset 16 +; BE-NEXT: ldrh w8, [x0] +; BE-NEXT: strh w8, [sp, #12] +; BE-NEXT: add x8, x0, #2 +; BE-NEXT: ldr s0, [sp, #12] +; BE-NEXT: rev32 v0.8b, v0.8b +; BE-NEXT: ushll v0.8h, v0.8b, #0 +; BE-NEXT: ld1 { v0.b }[4], [x8] +; BE-NEXT: adrp x8, .LCPI9_0 +; BE-NEXT: add x8, x8, :lo12:.LCPI9_0 +; BE-NEXT: ld1 { v1.4h }, [x8] +; BE-NEXT: add x8, x1, #4 +; BE-NEXT: bic v0.4h, #255, lsl #8 +; 
BE-NEXT: add v0.4h, v0.4h, v1.4h +; BE-NEXT: rev32 v1.8h, v0.8h +; BE-NEXT: st1 { v0.h }[2], [x8] +; BE-NEXT: str s1, [x1] +; BE-NEXT: add sp, sp, #16 +; BE-NEXT: ret +entry: + %l = load <3 x i8>, ptr %src, align 1 + %e = zext <3 x i8> %l to <3 x i16> + %a = add <3 x i16> %e, + store <3 x i16> %a, ptr %dst, align 1 + ret void +} + define void @shift_trunc_store(ptr %src, ptr %dst) { ; CHECK-LABEL: shift_trunc_store: ; CHECK: ; %bb.0: From e60d780f4a9ac6dc79aebe197d3575e9ea636b7f Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Mon, 22 Jan 2024 20:28:43 -0500 Subject: [PATCH 610/843] [gn] port 3ab8d2aac7bc 3ab8d2aac7bc relanded in 3112578597c031, so reland this too. Might want to set this to True (and add a few source files to builtins) at some point, but for now heal the bots. --- llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn index bfe98652d0baf..e82ec494993d0 100644 --- a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn +++ b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn @@ -64,6 +64,7 @@ write_cmake_config("lit_common_configured") { "SANITIZER_CAN_USE_CXXABI_PYBOOL=True", "SANITIZER_USE_STATIC_CXX_ABI_PYBOOL=False", "SANITIZER_USE_STATIC_LLVM_UNWINDER_PYBOOL=False", + "COMPILER_RT_HAS_AARCH64_SME=False", "COMPILER_RT_HAS_LLD_PYBOOL=True", "COMPILER_RT_HAS_GWP_ASAN_PYBOOL=False", "HAVE_RPC_XDR_H=0", From 6b02d2f86389d68a0cf2162377c5dda05bd4b68a Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Tue, 23 Jan 2024 13:48:03 +0100 Subject: [PATCH 611/843] [reland][libc] Remove unnecessary `FPBits` functions and properties (#79128) - reland #79113 - Fix aarch64 RISC-V build --- libc/fuzzing/stdlib/strtofloat_fuzz.cpp | 2 +- .../FPUtil/DivisionAndRemainderOperations.h | 2 +- libc/src/__support/FPUtil/FPBits.h | 51 +++--------- libc/src/__support/FPUtil/Hypot.h | 4 +- .../__support/FPUtil/ManipulationFunctions.h | 24 +++--- libc/src/__support/FPUtil/NormalFloat.h | 2 +- libc/src/__support/FPUtil/aarch64/FEnvImpl.h | 4 +- libc/src/__support/FPUtil/dyadic_float.h | 2 +- .../src/__support/FPUtil/except_value_utils.h | 6 +- libc/src/__support/FPUtil/generic/FMA.h | 10 +-- libc/src/__support/FPUtil/generic/FMod.h | 2 +- libc/src/__support/FPUtil/generic/sqrt.h | 18 +++-- .../FPUtil/generic/sqrt_80_bit_long_double.h | 8 +- .../FPUtil/x86_64/NextAfterLongDouble.h | 58 +++++++------ libc/src/__support/common.h | 1 - libc/src/__support/str_to_float.h | 4 +- libc/src/math/generic/acosf.cpp | 2 +- libc/src/math/generic/acoshf.cpp | 2 +- libc/src/math/generic/asinf.cpp | 2 +- libc/src/math/generic/atanhf.cpp | 4 +- libc/src/math/generic/cosf.cpp | 2 +- libc/src/math/generic/coshf.cpp | 6 +- libc/src/math/generic/exp.cpp | 4 +- libc/src/math/generic/exp10.cpp | 4 +- libc/src/math/generic/exp10f_impl.h | 6 +- libc/src/math/generic/exp2.cpp | 4 +- libc/src/math/generic/exp2f_impl.h | 6 +- libc/src/math/generic/expf.cpp | 4 +- libc/src/math/generic/expm1.cpp | 4 +- libc/src/math/generic/expm1f.cpp | 4 +- libc/src/math/generic/log.cpp | 11 +-- libc/src/math/generic/log10.cpp | 11 +-- libc/src/math/generic/log10f.cpp | 8 +- libc/src/math/generic/log1p.cpp | 7 +- libc/src/math/generic/log1pf.cpp | 4 +- libc/src/math/generic/log2.cpp | 11 +-- libc/src/math/generic/log2f.cpp | 8 +- libc/src/math/generic/logf.cpp | 9 ++- libc/src/math/generic/powf.cpp | 13 +-- libc/src/math/generic/sincosf.cpp | 4 +- libc/src/math/generic/sinf.cpp | 2 +- 
libc/src/math/generic/sinhf.cpp | 6 +- libc/src/math/generic/tanf.cpp | 2 +- libc/test/UnitTest/FPMatcher.h | 40 ++++----- .../test/src/__support/FPUtil/fpbits_test.cpp | 81 ++++++++----------- libc/test/src/math/FDimTest.h | 2 +- libc/test/src/math/FmaTest.h | 37 +++++---- libc/test/src/math/HypotTest.h | 43 +++++----- libc/test/src/math/ILogbTest.h | 22 ++--- libc/test/src/math/LdExpTest.h | 2 +- libc/test/src/math/NextAfterTest.h | 10 +-- libc/test/src/math/RIntTest.h | 21 ++--- libc/test/src/math/RemQuoTest.h | 25 +++--- libc/test/src/math/RoundToIntegerTest.h | 22 ++--- .../BinaryOpSingleOutputDiff.h | 16 ++-- .../SingleInputSingleOutputDiff.h | 7 +- libc/test/src/math/smoke/FDimTest.h | 2 +- libc/test/src/math/smoke/FmaTest.h | 16 ++-- libc/test/src/math/smoke/HypotTest.h | 20 ++--- libc/test/src/math/smoke/ILogbTest.h | 18 ++--- libc/test/src/math/smoke/LdExpTest.h | 2 +- libc/test/src/math/smoke/NextAfterTest.h | 14 ++-- libc/test/src/math/smoke/NextTowardTest.h | 22 ++--- libc/test/src/math/smoke/RIntTest.h | 2 +- libc/test/src/math/smoke/RemQuoTest.h | 2 +- libc/test/src/math/smoke/RoundToIntegerTest.h | 22 ++--- libc/test/src/stdio/sprintf_test.cpp | 32 +++++--- libc/test/src/stdio/sscanf_test.cpp | 21 +++-- libc/utils/MPFRWrapper/MPFRUtils.cpp | 2 +- 69 files changed, 428 insertions(+), 423 deletions(-) diff --git a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp index affef6fcf549e..b773043bda67d 100644 --- a/libc/fuzzing/stdlib/strtofloat_fuzz.cpp +++ b/libc/fuzzing/stdlib/strtofloat_fuzz.cpp @@ -28,7 +28,7 @@ using LIBC_NAMESPACE::fputil::FPBits; // exponent. Subnormals have a lower effective precision since they don't // necessarily use all of the bits of the mantissa. template <typename T> inline constexpr int effective_precision(int exponent) { - const int full_precision = FPBits<T>::MANTISSA_PRECISION; + const int full_precision = FPBits<T>::FRACTION_LEN + 1; // This is intended to be 0 when the exponent is the lowest normal and // increase as the exponent's magnitude increases. diff --git a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h index 1798310c3e31e..ef9593a42b005 100644 --- a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h +++ b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h @@ -31,7 +31,7 @@ LIBC_INLINE T remquo(T x, T y, int &q) { if (ybits.is_nan()) return y; if (xbits.is_inf() || ybits.is_zero()) - return FPBits<T>::build_quiet_nan(1); + return FPBits<T>::build_quiet_nan(fputil::Sign::POS, 1).get_val(); if (xbits.is_zero()) { q = 0; diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index 2465158bb2cdf..0a79b505ecbe1 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -390,7 +390,7 @@ struct FPRepSem : public FPStorage<fp_type> { return exp_bits() == encode(BiasedExp::BITS_ALL_ZEROES()); } LIBC_INLINE constexpr bool is_normal() const { - return is_finite() && !UP::is_subnormal(); + return is_finite() && !is_subnormal(); } // Returns the mantissa with the implicit bit set iff the current // value is a valid normal number.
@@ -556,6 +556,14 @@ struct FPRep : public FPRepSem<fp_type, RetT> { using UP::FRACTION_MASK; using UP::SIGN_MASK; + // Comparison + LIBC_INLINE constexpr friend bool operator==(FPRep a, FPRep b) { + return a.uintval() == b.uintval(); + } + LIBC_INLINE constexpr friend bool operator!=(FPRep a, FPRep b) { + return a.uintval() != b.uintval(); + } + // Representation LIBC_INLINE constexpr StorageType uintval() const { return bits & FP_MASK; } LIBC_INLINE constexpr void set_uintval(StorageType value) { @@ -698,16 +706,6 @@ struct FPBits final : public internal::FPRep<get_fp_type<T>(), FPBits<T>> { using UP::bits; // Constants. - LIBC_INLINE_VAR static constexpr uint32_t MANTISSA_PRECISION = - UP::FRACTION_LEN + 1; - LIBC_INLINE_VAR static constexpr StorageType MIN_NORMAL = - UP::min_normal(Sign::POS).uintval(); - LIBC_INLINE_VAR static constexpr StorageType MAX_NORMAL = - UP::max_normal(Sign::POS).uintval(); - LIBC_INLINE_VAR static constexpr StorageType MIN_SUBNORMAL = - UP::min_subnormal(Sign::POS).uintval(); - LIBC_INLINE_VAR static constexpr StorageType MAX_SUBNORMAL = - UP::max_subnormal(Sign::POS).uintval(); LIBC_INLINE_VAR static constexpr int MAX_BIASED_EXPONENT = (1 << UP::EXP_LEN) - 1; @@ -731,37 +729,6 @@ struct FPBits final : public internal::FPRep<get_fp_type<T>(), FPBits<T>> { LIBC_INLINE constexpr explicit operator T() const { return get_val(); } - // Methods below this are used by tests. - // TODO: inline and remove. - LIBC_INLINE static constexpr T one(Sign sign = Sign::POS) { - return T(UP::one(sign)); - } - LIBC_INLINE static constexpr T zero(Sign sign = Sign::POS) { - return T(UP::zero(sign)); - } - LIBC_INLINE static constexpr T inf(Sign sign = Sign::POS) { - return T(UP::inf(sign)); - } - LIBC_INLINE static constexpr T min_normal() { - return T(UP::min_normal(Sign::POS)); - } - LIBC_INLINE static constexpr T max_normal() { - return T(UP::max_normal(Sign::POS)); - } - LIBC_INLINE static constexpr T min_denormal() { - return T(UP::min_subnormal(Sign::POS)); - } - LIBC_INLINE static constexpr T max_denormal() { - return T(UP::max_subnormal(Sign::POS)); - } - LIBC_INLINE static constexpr T build_nan(StorageType v) { - return T(UP::build_nan(Sign::POS, v)); - } - LIBC_INLINE static constexpr T build_quiet_nan(StorageType v, - Sign sign = Sign::POS) { - return T(UP::build_quiet_nan(sign, v)); - } - // TODO: Use an uint32_t for 'biased_exp'.
LIBC_INLINE static constexpr FPBits create_value(Sign sign, StorageType biased_exp, StorageType mantissa) { diff --git a/libc/src/__support/FPUtil/Hypot.h b/libc/src/__support/FPUtil/Hypot.h index c38a40dfb0898..82237dec09e42 100644 --- a/libc/src/__support/FPUtil/Hypot.h +++ b/libc/src/__support/FPUtil/Hypot.h @@ -197,7 +197,7 @@ LIBC_INLINE T hypot(T x, T y) { if (int round_mode = quick_get_round(); round_mode == FE_TONEAREST || round_mode == FE_UPWARD) return T(FPBits_t::inf()); - return T(FPBits_t(FPBits_t::MAX_NORMAL)); + return T(FPBits_t::max_normal()); } } else { // For denormal result, we simply move the leading bit of the result to @@ -254,7 +254,7 @@ LIBC_INLINE T hypot(T x, T y) { if (out_exp >= FPBits_t::MAX_BIASED_EXPONENT) { if (round_mode == FE_TONEAREST || round_mode == FE_UPWARD) return T(FPBits_t::inf()); - return T(FPBits_t(FPBits_t::MAX_NORMAL)); + return T(FPBits_t::max_normal()); } } diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h index 81c8281f3c7bb..d7114625a9b31 100644 --- a/libc/src/__support/FPUtil/ManipulationFunctions.h +++ b/libc/src/__support/FPUtil/ManipulationFunctions.h @@ -108,7 +108,7 @@ LIBC_INLINE T logb(T x) { return x; } else if (bits.is_inf()) { // Return positive infinity. - return T(FPBits<T>::inf(Sign::POS)); + return T(FPBits<T>::inf()); } NormalFloat<T> normal(bits); @@ -127,7 +127,7 @@ LIBC_INLINE T ldexp(T x, int exp) { // that adding |exp| to it does not lead to integer rollover. But, if |exp| // value is larger the exponent range for type T, then we can return infinity // early. Because the result of the ldexp operation can be a subnormal number, - // we need to accommodate the (mantissaWidht + 1) worth of shift in + // we need to accommodate the (mantissaWidth + 1) worth of shift in // calculating the limit. int exp_limit = FPBits<T>::MAX_BIASED_EXPONENT + FPBits<T>::FRACTION_LEN + 1; if (exp > exp_limit) @@ -164,26 +164,22 @@ LIBC_INLINE T nextafter(T from, U to) { return static_cast<T>(to); using StorageType = typename FPBits<T>::StorageType; - StorageType int_val = from_bits.uintval(); - if (from != FPBits<T>::zero()) { - if ((static_cast<U>(from) < to) == (from > FPBits<T>::zero())) { - ++int_val; + if (from != T(0)) { + if ((static_cast<U>(from) < to) == (from > T(0))) { + from_bits = FPBits<T>(StorageType(from_bits.uintval() + 1)); } else { - --int_val; + from_bits = FPBits<T>(StorageType(from_bits.uintval() - 1)); } } else { - int_val = FPBits<T>::MIN_SUBNORMAL; - if (to_bits.is_neg()) - int_val |= FPBits<T>::SIGN_MASK; + from_bits = FPBits<T>::min_subnormal(to_bits.sign()); } - StorageType exponent_bits = int_val & FPBits<T>::EXP_MASK; - if (exponent_bits == StorageType(0)) + if (from_bits.is_subnormal()) raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); - else if (exponent_bits == FPBits<T>::EXP_MASK) + else if (from_bits.is_inf()) raise_except_if_required(FE_OVERFLOW | FE_INEXACT); - return cpp::bit_cast<T>(int_val); + return from_bits.get_val(); } } // namespace fputil diff --git a/libc/src/__support/FPUtil/NormalFloat.h b/libc/src/__support/FPUtil/NormalFloat.h index cfa9e14175105..fa4da33b5b17f 100644 --- a/libc/src/__support/FPUtil/NormalFloat.h +++ b/libc/src/__support/FPUtil/NormalFloat.h @@ -215,7 +215,7 @@ template <> LIBC_INLINE NormalFloat<long double>::operator long double() const { // Max exponent is of the form 0xFF...E. That is why -2 and not -1.
constexpr int MAX_EXPONENT_VALUE = (1 << LDBits::EXP_LEN) - 2; if (biased_exponent > MAX_EXPONENT_VALUE) { - return LDBits::inf(sign); + return LDBits::inf(sign).get_val(); } FPBits<long double> result(0.0l); diff --git a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h index 23cde88c9c7c5..e0eec17e038c6 100644 --- a/libc/src/__support/FPUtil/aarch64/FEnvImpl.h +++ b/libc/src/__support/FPUtil/aarch64/FEnvImpl.h @@ -155,8 +155,8 @@ LIBC_INLINE int set_except(int excepts) { LIBC_INLINE int raise_except(int excepts) { float zero = 0.0f; float one = 1.0f; - float largeValue = FPBits<float>::max_normal(); - float smallValue = FPBits<float>::min_normal(); + float largeValue = FPBits<float>::max_normal().get_val(); + float smallValue = FPBits<float>::min_normal().get_val(); auto divfunc = [](float a, float b) { __asm__ __volatile__("ldr s0, %0\n\t" "ldr s1, %1\n\t" diff --git a/libc/src/__support/FPUtil/dyadic_float.h b/libc/src/__support/FPUtil/dyadic_float.h index 5449f5561d569..888d7ffec241e 100644 --- a/libc/src/__support/FPUtil/dyadic_float.h +++ b/libc/src/__support/FPUtil/dyadic_float.h @@ -93,7 +93,7 @@ template <size_t Bits> struct DyadicFloat { return 0.0; // Assume that it is normalized, and output is also normal. - constexpr uint32_t PRECISION = FPBits<T>::MANTISSA_PRECISION; + constexpr uint32_t PRECISION = FPBits<T>::FRACTION_LEN + 1; using output_bits_t = typename FPBits<T>::StorageType; int exp_hi = exponent + static_cast<int>((Bits - 1) + FPBits<T>::EXP_BIAS); diff --git a/libc/src/__support/FPUtil/except_value_utils.h b/libc/src/__support/FPUtil/except_value_utils.h index 89849540315f6..1e0381194009d 100644 --- a/libc/src/__support/FPUtil/except_value_utils.h +++ b/libc/src/__support/FPUtil/except_value_utils.h @@ -102,15 +102,13 @@ template <typename T, size_t N> struct ExceptValues { // Helper functions to set results for exceptional cases. template <typename T> LIBC_INLINE T round_result_slightly_down(T value_rn) { volatile T tmp = value_rn; - const T MIN_NORMAL = FPBits<T>::min_normal(); - tmp = tmp - MIN_NORMAL; + tmp -= FPBits<T>::min_normal().get_val(); return tmp; } template <typename T> LIBC_INLINE T round_result_slightly_up(T value_rn) { volatile T tmp = value_rn; - const T MIN_NORMAL = FPBits<T>::min_normal(); - tmp = tmp + MIN_NORMAL; + tmp += FPBits<T>::min_normal().get_val(); return tmp; } diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h index 6285cac1983d1..5c36463ea5021 100644 --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -130,9 +130,9 @@ template <> LIBC_INLINE double fma<double>(double x, double y, double z) { return x * y + z; // Extract mantissa and append hidden leading bits. - UInt128 x_mant = x_bits.get_mantissa() | FPBits::MIN_NORMAL; - UInt128 y_mant = y_bits.get_mantissa() | FPBits::MIN_NORMAL; - UInt128 z_mant = z_bits.get_mantissa() | FPBits::MIN_NORMAL; + UInt128 x_mant = x_bits.get_explicit_mantissa(); + UInt128 y_mant = y_bits.get_explicit_mantissa(); + UInt128 z_mant = z_bits.get_explicit_mantissa(); // If the exponent of the product x*y > the exponent of z, then no extra // precision beside the entire product x*y is needed. On the other hand, when @@ -255,9 +255,7 @@ template <> LIBC_INLINE double fma<double>(double x, double y, double z) { if ((round_mode == FE_TOWARDZERO) || (round_mode == FE_UPWARD && prod_sign.is_neg()) || (round_mode == FE_DOWNWARD && prod_sign.is_pos())) { - result = FPBits::MAX_NORMAL; - return prod_sign.is_neg() ?
-cpp::bit_cast<double>(result) : cpp::bit_cast<double>(result); + return FPBits::max_normal(prod_sign).get_val(); } return static_cast<double>(FPBits::inf(prod_sign)); } diff --git a/libc/src/__support/FPUtil/generic/FMod.h b/libc/src/__support/FPUtil/generic/FMod.h index f4000b97751ef..18355b801dbc7 100644 --- a/libc/src/__support/FPUtil/generic/FMod.h +++ b/libc/src/__support/FPUtil/generic/FMod.h @@ -124,7 +124,7 @@ template <typename T> struct FModExceptionalInputHandler { LIBC_INLINE static bool pre_check(T x, T y, T &out) { using FPB = fputil::FPBits<T>; - const T quiet_nan = FPB::build_quiet_nan(0); + const T quiet_nan = FPB::build_quiet_nan().get_val(); FPB sx(x), sy(y); if (LIBC_LIKELY(!sy.is_zero() && !sy.is_inf_or_nan() && !sx.is_inf_or_nan())) { diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h index 21ae9d081d3f1..0a0690ec1463b 100644 --- a/libc/src/__support/FPUtil/generic/sqrt.h +++ b/libc/src/__support/FPUtil/generic/sqrt.h @@ -71,15 +71,19 @@ LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, T> sqrt(T x) { return x86::sqrt(x); } else { // IEEE floating points formats. - using StorageType = typename FPBits<T>::StorageType; - constexpr StorageType ONE = StorageType(1) << FPBits<T>::FRACTION_LEN; + using Sign = fputil::Sign; + using FPBits_t = typename fputil::FPBits<T>; + using StorageType = typename FPBits_t::StorageType; + constexpr StorageType ONE = StorageType(1) << FPBits_t::FRACTION_LEN; + constexpr auto FLT_NAN = + FPBits_t::build_quiet_nan(Sign::POS, ONE >> 1).get_val(); - FPBits<T> bits(x); + FPBits_t bits(x); if (bits.is_inf_or_nan()) { if (bits.is_neg() && (bits.get_mantissa() == 0)) { // sqrt(-Inf) = NaN - return FPBits<T>::build_quiet_nan(ONE >> 1); + return FLT_NAN; } else { // sqrt(NaN) = NaN // sqrt(+Inf) = +Inf @@ -91,7 +95,7 @@ LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, T> sqrt(T x) { return x; } else if (bits.is_neg()) { // sqrt( negative numbers ) = NaN - return FPBits<T>::build_quiet_nan(ONE >> 1); + return FLT_NAN; } else { int x_exp = bits.get_exponent(); StorageType x_mant = bits.get_mantissa(); @@ -145,10 +149,10 @@ LIBC_INLINE cpp::enable_if_t<cpp::is_floating_point_v<T>, T> sqrt(T x) { } // Remove hidden bit and append the exponent field.
- x_exp = ((x_exp >> 1) + FPBits::EXP_BIAS); + x_exp = ((x_exp >> 1) + FPBits_t::EXP_BIAS); y = (y - ONE) | - (static_cast(x_exp) << FPBits::FRACTION_LEN); + (static_cast(x_exp) << FPBits_t::FRACTION_LEN); switch (quick_get_round()) { case FE_TONEAREST: diff --git a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h index 4f8d136938f56..b0a3776029ca7 100644 --- a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h +++ b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h @@ -38,14 +38,16 @@ LIBC_INLINE long double sqrt(long double x); LIBC_INLINE long double sqrt(long double x) { using LDBits = FPBits; using StorageType = typename LDBits::StorageType; + using Sign = fputil::Sign; constexpr StorageType ONE = StorageType(1) << int(LDBits::FRACTION_LEN); + constexpr auto LDNAN = LDBits::build_quiet_nan(Sign::POS, ONE >> 1).get_val(); - FPBits bits(x); + LDBits bits(x); if (bits.is_inf_or_nan()) { if (bits.is_neg() && (bits.get_mantissa() == 0)) { // sqrt(-Inf) = NaN - return LDBits::build_quiet_nan(ONE >> 1); + return LDNAN; } else { // sqrt(NaN) = NaN // sqrt(+Inf) = +Inf @@ -57,7 +59,7 @@ LIBC_INLINE long double sqrt(long double x) { return x; } else if (bits.is_neg()) { // sqrt( negative numbers ) = NaN - return LDBits::build_quiet_nan(ONE >> 1); + return LDNAN; } else { int x_exp = bits.get_explicit_exponent(); StorageType x_mant = bits.get_mantissa(); diff --git a/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h b/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h index 512f5de4e7931..d1c76ba954b93 100644 --- a/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h +++ b/libc/src/__support/FPUtil/x86_64/NextAfterLongDouble.h @@ -43,16 +43,18 @@ LIBC_INLINE long double nextafter(long double from, long double to) { } using StorageType = FPBits::StorageType; - constexpr StorageType SIGN_VAL = (StorageType(1) << 79); + constexpr StorageType FRACTION_MASK = FPBits::FRACTION_MASK; - StorageType int_val = from_bits.uintval(); - if (from < 0.0l) { - if (from > to) { - if (int_val == (SIGN_VAL + FPBits::MAX_SUBNORMAL)) { + // StorageType int_val = from_bits.uintval(); + if (from == 0.0l) { // +0.0 / -0.0 + from_bits = FPBits::min_subnormal(from > to ? Sign::NEG : Sign::POS); + } else if (from < 0.0l) { + if (to < from) { // toward -inf + if (from_bits == FPBits::max_subnormal(Sign::NEG)) { // We deal with normal/subnormal boundary separately to avoid // dealing with the implicit bit. - int_val = SIGN_VAL + FPBits::MIN_NORMAL; - } else if ((int_val & FRACTION_MASK) == FRACTION_MASK) { + from_bits = FPBits::min_normal(Sign::NEG); + } else if (from_bits.get_mantissa() == FRACTION_MASK) { from_bits.set_mantissa(0); // Incrementing exponent might overflow the value to infinity, // which is what is expected. Since NaNs are handling separately, @@ -62,45 +64,40 @@ LIBC_INLINE long double nextafter(long double from, long double to) { raise_except_if_required(FE_OVERFLOW | FE_INEXACT); return from_bits.get_val(); } else { - ++int_val; + from_bits = FPBits(StorageType(from_bits.uintval() + 1)); } - } else { - if (int_val == (SIGN_VAL + FPBits::MIN_NORMAL)) { + } else { // toward +inf + if (from_bits == FPBits::min_normal(Sign::NEG)) { // We deal with normal/subnormal boundary separately to avoid // dealing with the implicit bit. 
- int_val = SIGN_VAL + FPBits::MAX_SUBNORMAL; - } else if ((int_val & FRACTION_MASK) == 0) { + from_bits = FPBits::max_subnormal(Sign::NEG); + } else if (from_bits.get_mantissa() == 0) { from_bits.set_mantissa(FRACTION_MASK); // from == 0 is handled separately so decrementing the exponent will not // lead to underflow. from_bits.set_biased_exponent(from_bits.get_biased_exponent() - 1); return from_bits.get_val(); } else { - --int_val; + from_bits = FPBits(StorageType(from_bits.uintval() - 1)); } } - } else if (from == 0.0l) { - if (from > to) - int_val = SIGN_VAL + 1; - else - int_val = 1; } else { - if (from > to) { - if (int_val == FPBits::MIN_NORMAL) { - int_val = FPBits::MAX_SUBNORMAL; - } else if ((int_val & FRACTION_MASK) == 0) { + if (to < from) { // toward -inf + if (from_bits == FPBits::min_normal(Sign::POS)) { + from_bits = FPBits::max_subnormal(Sign::POS); + } else if (from_bits.get_mantissa() == 0) { from_bits.set_mantissa(FRACTION_MASK); // from == 0 is handled separately so decrementing the exponent will not // lead to underflow. from_bits.set_biased_exponent(from_bits.get_biased_exponent() - 1); return from_bits.get_val(); } else { - --int_val; + from_bits = FPBits(StorageType(from_bits.uintval() - 1)); } - } else { - if (int_val == FPBits::MAX_SUBNORMAL) { - int_val = FPBits::MIN_NORMAL; - } else if ((int_val & FRACTION_MASK) == FRACTION_MASK) { + } else { // toward +inf + if (from_bits == FPBits::max_subnormal(Sign::POS)) { + from_bits = FPBits::min_normal(Sign::POS); + } else if (from_bits.get_mantissa() == FRACTION_MASK) { from_bits.set_mantissa(0); // Incrementing exponent might overflow the value to infinity, // which is what is expected. Since NaNs are handling separately, @@ -110,16 +107,15 @@ LIBC_INLINE long double nextafter(long double from, long double to) { raise_except_if_required(FE_OVERFLOW | FE_INEXACT); return from_bits.get_val(); } else { - ++int_val; + from_bits = FPBits(StorageType(from_bits.uintval() + 1)); } } } - StorageType implicit_bit = int_val & (StorageType(1) << FPBits::FRACTION_LEN); - if (implicit_bit == StorageType(0)) + if (!from_bits.get_implicit_bit()) raise_except_if_required(FE_UNDERFLOW | FE_INEXACT); - return cpp::bit_cast(int_val); + return from_bits.get_val(); } } // namespace fputil diff --git a/libc/src/__support/common.h b/libc/src/__support/common.h index 53951dc131c28..a153dfc363d73 100644 --- a/libc/src/__support/common.h +++ b/libc/src/__support/common.h @@ -25,7 +25,6 @@ #define LLVM_LIBC_FUNCTION_IMPL(type, name, arglist) \ LLVM_LIBC_FUNCTION_ATTR decltype(LIBC_NAMESPACE::name) \ __##name##_impl__ __asm__(#name); \ - decltype(LIBC_NAMESPACE::name) name [[gnu::alias(#name)]]; \ type __##name##_impl__ arglist #else #define LLVM_LIBC_FUNCTION_IMPL(type, name, arglist) type name arglist diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index 8aeb3d2cea03d..9655c993bee28 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -1167,7 +1167,7 @@ LIBC_INLINE StrToNumResult strtofloatingpoint(const char *__restrict src) { index = left_paren; } } - result = FPBits(result.build_quiet_nan(nan_mantissa, result.sign())); + result = FPBits(result.build_quiet_nan(result.sign(), nan_mantissa)); } } else if (tolower(src[index]) == 'i') { // INF if (tolower(src[index + 1]) == inf_string[1] && @@ -1215,7 +1215,7 @@ template LIBC_INLINE StrToNumResult strtonan(const char *arg) { nan_mantissa = static_cast(nan_mantissa_result); } - result = 
FPBits(result.build_quiet_nan(nan_mantissa)); + result = FPBits::build_quiet_nan(fputil::Sign::POS, nan_mantissa); return {T(result), 0, error}; } diff --git a/libc/src/math/generic/acosf.cpp b/libc/src/math/generic/acosf.cpp index 67832596a67fb..7b2a09101182e 100644 --- a/libc/src/math/generic/acosf.cpp +++ b/libc/src/math/generic/acosf.cpp @@ -85,7 +85,7 @@ LLVM_LIBC_FUNCTION(float, acosf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_quiet_nan(0); + return x + FPBits::build_quiet_nan().get_val(); } // When 0.5 < |x| < 1, we perform range reduction as follow: diff --git a/libc/src/math/generic/acoshf.cpp b/libc/src/math/generic/acoshf.cpp index b0b87095fbb07..a546cc2268b04 100644 --- a/libc/src/math/generic/acoshf.cpp +++ b/libc/src/math/generic/acoshf.cpp @@ -29,7 +29,7 @@ LLVM_LIBC_FUNCTION(float, acoshf, (float x)) { // x < 1. fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits_t::build_quiet_nan(0); + return FPBits_t::build_quiet_nan().get_val(); } if (LIBC_UNLIKELY(x_u >= 0x4f8ffb03)) { diff --git a/libc/src/math/generic/asinf.cpp b/libc/src/math/generic/asinf.cpp index bc0d27c1eebc5..ee8e853063644 100644 --- a/libc/src/math/generic/asinf.cpp +++ b/libc/src/math/generic/asinf.cpp @@ -109,7 +109,7 @@ LLVM_LIBC_FUNCTION(float, asinf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_nan(FPBits::FRACTION_MASK); + return x + FPBits::build_nan(Sign::POS, FPBits::FRACTION_MASK).get_val(); } // Check for exceptional values diff --git a/libc/src/math/generic/atanhf.cpp b/libc/src/math/generic/atanhf.cpp index fd6f5c96b6b4e..cd0acbf24e928 100644 --- a/libc/src/math/generic/atanhf.cpp +++ b/libc/src/math/generic/atanhf.cpp @@ -29,11 +29,11 @@ LLVM_LIBC_FUNCTION(float, atanhf, (float x)) { if (x_abs == 0x3F80'0000U) { fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return FPBits::inf(sign); + return FPBits::inf(sign).get_val(); } else { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FPBits::build_quiet_nan(0); + return FPBits::build_quiet_nan().get_val(); } } diff --git a/libc/src/math/generic/cosf.cpp b/libc/src/math/generic/cosf.cpp index 89333ab19e89f..132e72c0f65da 100644 --- a/libc/src/math/generic/cosf.cpp +++ b/libc/src/math/generic/cosf.cpp @@ -118,7 +118,7 @@ LLVM_LIBC_FUNCTION(float, cosf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_quiet_nan(0); + return x + FPBits::build_quiet_nan().get_val(); } // Combine the results with the sine of sum formula: diff --git a/libc/src/math/generic/coshf.cpp b/libc/src/math/generic/coshf.cpp index 3b01852e9f544..a618056a64dc8 100644 --- a/libc/src/math/generic/coshf.cpp +++ b/libc/src/math/generic/coshf.cpp @@ -32,16 +32,16 @@ LLVM_LIBC_FUNCTION(float, coshf, (float x)) { } if (xbits.is_inf_or_nan()) - return x + FPBits::inf(); + return x + FPBits::inf().get_val(); int rounding = fputil::quick_get_round(); if (LIBC_UNLIKELY(rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); - return x + FPBits::inf(); + return x + FPBits::inf().get_val(); } // TODO: We should be able to reduce the latency and reciprocal throughput diff --git 
a/libc/src/math/generic/exp.cpp b/libc/src/math/generic/exp.cpp index a1b4d9a64f969..49ea1699bb209 100644 --- a/libc/src/math/generic/exp.cpp +++ b/libc/src/math/generic/exp.cpp @@ -205,7 +205,7 @@ double set_exceptional(double x) { return x; if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_denormal(); + return FPBits::min_subnormal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0; @@ -216,7 +216,7 @@ double set_exceptional(double x) { if (x_u < 0x7ff0'0000'0000'0000ULL) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); diff --git a/libc/src/math/generic/exp10.cpp b/libc/src/math/generic/exp10.cpp index e441f2c0edc7d..f1da03cba0b30 100644 --- a/libc/src/math/generic/exp10.cpp +++ b/libc/src/math/generic/exp10.cpp @@ -248,7 +248,7 @@ double set_exceptional(double x) { return x; if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_denormal(); + return FPBits::min_subnormal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0; @@ -262,7 +262,7 @@ double set_exceptional(double x) { if (x_u < 0x7ff0'0000'0000'0000ULL) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); diff --git a/libc/src/math/generic/exp10f_impl.h b/libc/src/math/generic/exp10f_impl.h index 2861659a6e57e..ff4c1c3aec67c 100644 --- a/libc/src/math/generic/exp10f_impl.h +++ b/libc/src/math/generic/exp10f_impl.h @@ -42,7 +42,7 @@ LIBC_INLINE float exp10f(float x) { if (xbits.is_nan()) return x; if (fputil::fenv_is_round_up()) - return FPBits::min_denormal(); + return FPBits::min_subnormal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0f; @@ -53,13 +53,13 @@ LIBC_INLINE float exp10f(float x) { if (x_u < 0x7f80'0000U) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); } // x is +inf or nan - return x + FPBits::inf(); + return x + FPBits::inf().get_val(); } } diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp index 70bc4870806a9..508bff9bd9fc9 100644 --- a/libc/src/math/generic/exp2.cpp +++ b/libc/src/math/generic/exp2.cpp @@ -223,7 +223,7 @@ double set_exceptional(double x) { return x; if (fputil::quick_get_round() == FE_UPWARD) - return FPBits::min_denormal(); + return FPBits::min_subnormal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_UNDERFLOW); return 0.0; @@ -237,7 +237,7 @@ double set_exceptional(double x) { if (x_u < 0x7ff0'0000'0000'0000ULL) { int rounding = fputil::quick_get_round(); if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); diff --git a/libc/src/math/generic/exp2f_impl.h b/libc/src/math/generic/exp2f_impl.h index 86360840b96e6..d2342e289fcba 100644 --- 
a/libc/src/math/generic/exp2f_impl.h
+++ b/libc/src/math/generic/exp2f_impl.h
@@ -76,13 +76,13 @@ LIBC_INLINE float exp2f(float x) {
     if (x_u < 0x7f80'0000U) {
       int rounding = fputil::quick_get_round();
       if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
-        return FPBits::max_normal();
+        return FPBits::max_normal().get_val();
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_OVERFLOW);
     }
     // x is +inf or nan
-    return x + FPBits::inf();
+    return x + FPBits::inf().get_val();
   }
   // x <= -150
   if (x_u >= 0xc316'0000U) {
@@ -93,7 +93,7 @@ LIBC_INLINE float exp2f(float x) {
     if (xbits.is_nan())
       return x;
     if (fputil::fenv_is_round_up())
-      return FPBits::min_denormal();
+      return FPBits::min_subnormal().get_val();
     if (x != 0.0f) {
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_UNDERFLOW);
diff --git a/libc/src/math/generic/expf.cpp b/libc/src/math/generic/expf.cpp
index 88d408994fe42..f3ce8400d0a41 100644
--- a/libc/src/math/generic/expf.cpp
+++ b/libc/src/math/generic/expf.cpp
@@ -50,7 +50,7 @@ LLVM_LIBC_FUNCTION(float, expf, (float x)) {
     if (xbits.is_nan())
       return x;
     if (fputil::fenv_is_round_up())
-      return FPBits::min_denormal();
+      return FPBits::min_subnormal().get_val();
     fputil::set_errno_if_required(ERANGE);
     fputil::raise_except_if_required(FE_UNDERFLOW);
     return 0.0f;
@@ -61,7 +61,7 @@ LLVM_LIBC_FUNCTION(float, expf, (float x)) {
     if (xbits.uintval() < 0x7f80'0000U) {
       int rounding = fputil::quick_get_round();
       if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
-        return FPBits::max_normal();
+        return FPBits::max_normal().get_val();
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_OVERFLOW);
diff --git a/libc/src/math/generic/expm1.cpp b/libc/src/math/generic/expm1.cpp
index d9fccf98e8caa..c1fb80309d7b4 100644
--- a/libc/src/math/generic/expm1.cpp
+++ b/libc/src/math/generic/expm1.cpp
@@ -267,13 +267,13 @@ double set_exceptional(double x) {
   if (x_u < 0x7ff0'0000'0000'0000ULL) {
     int rounding = fputil::quick_get_round();
     if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
-      return FPBits::max_normal();
+      return FPBits::max_normal().get_val();
     fputil::set_errno_if_required(ERANGE);
     fputil::raise_except_if_required(FE_OVERFLOW);
   }
   // x is +inf or nan
-  return x + static_cast<double>(FPBits::inf());
+  return x + FPBits::inf().get_val();
 }
 } // namespace
diff --git a/libc/src/math/generic/expm1f.cpp b/libc/src/math/generic/expm1f.cpp
index c6e0663ec46c3..037e60021b296 100644
--- a/libc/src/math/generic/expm1f.cpp
+++ b/libc/src/math/generic/expm1f.cpp
@@ -68,12 +68,12 @@ LLVM_LIBC_FUNCTION(float, expm1f, (float x)) {
     if (xbits.uintval() < 0x7f80'0000U) {
       int rounding = fputil::quick_get_round();
       if (rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)
-        return FPBits::max_normal();
+        return FPBits::max_normal().get_val();
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_OVERFLOW);
     }
-    return x + static_cast<float>(FPBits::inf());
+    return x + FPBits::inf().get_val();
   }
 }
 }
diff --git a/libc/src/math/generic/log.cpp b/libc/src/math/generic/log.cpp
index 2db6b7f48fd09..b0f7e8c9afa54 100644
--- a/libc/src/math/generic/log.cpp
+++ b/libc/src/math/generic/log.cpp
@@ -732,28 +732,29 @@ double log_accurate(int e_x, int index, double m_x) {
 LLVM_LIBC_FUNCTION(double, log, (double x)) {
   using FPBits_t = typename fputil::FPBits<double>;
+  using Sign = fputil::Sign;
   FPBits_t xbits(x);
   uint64_t x_u = xbits.uintval();
   int x_e = -FPBits_t::EXP_BIAS;
-  if (LIBC_UNLIKELY(x_u == 0x3FF0'0000'0000'0000ULL)) {
+  if (LIBC_UNLIKELY(xbits == FPBits_t::one())) {
     // log(1.0) = +0.0
     return 0.0;
   }
-  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::MIN_NORMAL ||
-                    xbits.uintval() > FPBits_t::MAX_NORMAL)) {
+  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::min_normal().uintval() ||
+                    xbits.uintval() > FPBits_t::max_normal().uintval())) {
     if (xbits.is_zero()) {
       // return -Inf and raise FE_DIVBYZERO.
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_DIVBYZERO);
-      return static_cast<double>(FPBits_t::inf(fputil::Sign::NEG));
+      return FPBits_t::inf(Sign::NEG).get_val();
     }
     if (xbits.is_neg() && !xbits.is_nan()) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
-      return FPBits_t::build_quiet_nan(0);
+      return FPBits_t::build_quiet_nan().get_val();
     }
     if (xbits.is_inf_or_nan()) {
       return x;
diff --git a/libc/src/math/generic/log10.cpp b/libc/src/math/generic/log10.cpp
index 3a4d321fdb18c..55a3fc5c061e5 100644
--- a/libc/src/math/generic/log10.cpp
+++ b/libc/src/math/generic/log10.cpp
@@ -733,28 +733,29 @@ double log10_accurate(int e_x, int index, double m_x) {
 LLVM_LIBC_FUNCTION(double, log10, (double x)) {
   using FPBits_t = typename fputil::FPBits<double>;
+  using Sign = fputil::Sign;
   FPBits_t xbits(x);
   uint64_t x_u = xbits.uintval();
   int x_e = -FPBits_t::EXP_BIAS;
-  if (LIBC_UNLIKELY(x_u == 0x3FF0'0000'0000'0000ULL)) {
+  if (LIBC_UNLIKELY(xbits == FPBits_t::one())) {
     // log10(1.0) = +0.0
     return 0.0;
   }
-  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::MIN_NORMAL ||
-                    xbits.uintval() > FPBits_t::MAX_NORMAL)) {
+  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::min_normal().uintval() ||
+                    xbits.uintval() > FPBits_t::max_normal().uintval())) {
     if (xbits.is_zero()) {
      // return -Inf and raise FE_DIVBYZERO.
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_DIVBYZERO);
-      return static_cast<double>(FPBits_t::inf(fputil::Sign::NEG));
+      return FPBits_t::inf(Sign::NEG).get_val();
     }
     if (xbits.is_neg() && !xbits.is_nan()) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
-      return FPBits_t::build_quiet_nan(0);
+      return FPBits_t::build_quiet_nan().get_val();
     }
     if (xbits.is_inf_or_nan()) {
       return x;
diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp
index 46505f4e07e67..f87e34ec5fb38 100644
--- a/libc/src/math/generic/log10f.cpp
+++ b/libc/src/math/generic/log10f.cpp
@@ -106,6 +106,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) {
   constexpr double LOG10_2 = 0x1.34413509f79ffp-2;
   using FPBits = typename fputil::FPBits<float>;
+  using Sign = fputil::Sign;
   FPBits xbits(x);
   uint32_t x_u = xbits.uintval();
@@ -160,18 +161,19 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) {
   int m = -FPBits::EXP_BIAS;
-  if (LIBC_UNLIKELY(x_u < FPBits::MIN_NORMAL || x_u > FPBits::MAX_NORMAL)) {
+  if (LIBC_UNLIKELY(x_u < FPBits::min_normal().uintval() ||
+                    x_u > FPBits::max_normal().uintval())) {
     if (xbits.is_zero()) {
       // Return -inf and raise FE_DIVBYZERO
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_DIVBYZERO);
-      return static_cast<float>(FPBits::inf(fputil::Sign::NEG));
+      return FPBits::inf(Sign::NEG).get_val();
     }
     if (xbits.is_neg() && !xbits.is_nan()) {
       // Return NaN and raise FE_INVALID
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
-      return FPBits::build_quiet_nan(0);
+      return FPBits::build_quiet_nan().get_val();
     }
     if (xbits.is_inf_or_nan()) {
       return x;
diff --git a/libc/src/math/generic/log1p.cpp b/libc/src/math/generic/log1p.cpp
index 731fecae6f1b5..ae432620ba328 100644
--- a/libc/src/math/generic/log1p.cpp
+++ b/libc/src/math/generic/log1p.cpp
@@ -873,6 +873,7 @@ LIBC_INLINE double log1p_accurate(int e_x, int index,
 LLVM_LIBC_FUNCTION(double, log1p, (double x)) {
   using FPBits_t = typename fputil::FPBits<double>;
+  using Sign = fputil::Sign;
   constexpr int EXP_BIAS = FPBits_t::EXP_BIAS;
   constexpr int FRACTION_LEN = FPBits_t::FRACTION_LEN;
   constexpr uint64_t FRACTION_MASK = FPBits_t::FRACTION_MASK;
@@ -887,19 +888,19 @@ LLVM_LIBC_FUNCTION(double, log1p, (double x)) {
   // |x| >= 1
   if (LIBC_UNLIKELY(x_u >= 0x4650'0000'0000'0000ULL)) {
     // x >= 2^102 or x is negative, inf, or NaN
-    if (LIBC_UNLIKELY(x_u > FPBits_t::MAX_NORMAL)) {
+    if (LIBC_UNLIKELY(x_u > FPBits_t::max_normal().uintval())) {
       // x <= -1.0 or x is Inf or NaN
       if (x_u == 0xbff0'0000'0000'0000ULL) {
         // x = -1.0
         fputil::set_errno_if_required(ERANGE);
         fputil::raise_except_if_required(FE_DIVBYZERO);
-        return static_cast<double>(FPBits_t::inf(fputil::Sign::NEG));
+        return FPBits_t::inf(Sign::NEG).get_val();
       }
       if (xbits.is_neg() && !xbits.is_nan()) {
         // x < -1.0
         fputil::set_errno_if_required(EDOM);
         fputil::raise_except_if_required(FE_INVALID);
-        return FPBits_t::build_quiet_nan(0);
+        return FPBits_t::build_quiet_nan().get_val();
       }
       // x is +Inf or NaN
       return x;
diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp
index 0812569c624b8..bc472caf54f89 100644
--- a/libc/src/math/generic/log1pf.cpp
+++ b/libc/src/math/generic/log1pf.cpp
@@ -43,11 +43,11 @@ LIBC_INLINE float log(double x) {
   uint64_t x_u = xbits.uintval();
-  if (LIBC_UNLIKELY(x_u > FPBits::MAX_NORMAL)) {
+  if (LIBC_UNLIKELY(x_u > FPBits::max_normal().uintval())) {
     if (xbits.is_neg() && !xbits.is_nan()) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
-      return fputil::FPBits<float>::build_quiet_nan(0);
+      return fputil::FPBits<float>::build_quiet_nan().get_val();
     }
     return static_cast<float>(x);
   }
diff --git a/libc/src/math/generic/log2.cpp b/libc/src/math/generic/log2.cpp
index 5b7fb65d77385..480c690a65037 100644
--- a/libc/src/math/generic/log2.cpp
+++ b/libc/src/math/generic/log2.cpp
@@ -854,28 +854,29 @@ double log2_accurate(int e_x, int index, double m_x) {
 LLVM_LIBC_FUNCTION(double, log2, (double x)) {
   using FPBits_t = typename fputil::FPBits<double>;
+  using Sign = fputil::Sign;
   FPBits_t xbits(x);
   uint64_t x_u = xbits.uintval();
   int x_e = -FPBits_t::EXP_BIAS;
-  if (LIBC_UNLIKELY(x_u == 0x3FF0'0000'0000'0000ULL)) {
+  if (LIBC_UNLIKELY(xbits == FPBits_t::one())) {
     // log2(1.0) = +0.0
     return 0.0;
   }
-  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::MIN_NORMAL ||
-                    xbits.uintval() > FPBits_t::MAX_NORMAL)) {
+  if (LIBC_UNLIKELY(xbits.uintval() < FPBits_t::min_normal().uintval() ||
+                    xbits.uintval() > FPBits_t::max_normal().uintval())) {
     if (xbits.is_zero()) {
       // return -Inf and raise FE_DIVBYZERO.
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_DIVBYZERO);
-      return static_cast<double>(FPBits_t::inf(fputil::Sign::NEG));
+      return FPBits_t::inf(Sign::NEG).get_val();
     }
     if (xbits.is_neg() && !xbits.is_nan()) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
-      return FPBits_t::build_quiet_nan(0);
+      return FPBits_t::build_quiet_nan().get_val();
     }
     if (xbits.is_inf_or_nan()) {
       return x;
diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp
index 9cddbb9e8ea48..7eaa5d53ccedd 100644
--- a/libc/src/math/generic/log2f.cpp
+++ b/libc/src/math/generic/log2f.cpp
@@ -55,6 +55,7 @@ namespace LIBC_NAMESPACE {
 LLVM_LIBC_FUNCTION(float, log2f, (float x)) {
   using FPBits = typename fputil::FPBits<float>;
+  using Sign = fputil::Sign;
   FPBits xbits(x);
   uint32_t x_u = xbits.uintval();
@@ -68,16 +69,17 @@ LLVM_LIBC_FUNCTION(float, log2f, (float x)) {
     return 0.0f;
   // Exceptional inputs.
-  if (LIBC_UNLIKELY(x_u < FPBits::MIN_NORMAL || x_u > FPBits::MAX_NORMAL)) {
+  if (LIBC_UNLIKELY(x_u < FPBits::min_normal().uintval() ||
+                    x_u > FPBits::max_normal().uintval())) {
    if (xbits.is_zero()) {
      fputil::set_errno_if_required(ERANGE);
      fputil::raise_except_if_required(FE_DIVBYZERO);
-      return static_cast<float>(FPBits::inf(fputil::Sign::NEG));
+      return FPBits::inf(Sign::NEG).get_val();
    }
    if (xbits.is_neg() && !xbits.is_nan()) {
      fputil::set_errno_if_required(EDOM);
      fputil::raise_except(FE_INVALID);
-      return FPBits::build_quiet_nan(0);
+      return FPBits::build_quiet_nan().get_val();
    }
    if (xbits.is_inf_or_nan()) {
      return x;
diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp
index 8ccb55dcc9e33..88f7ea01b2f19 100644
--- a/libc/src/math/generic/logf.cpp
+++ b/libc/src/math/generic/logf.cpp
@@ -54,6 +54,7 @@ namespace LIBC_NAMESPACE {
 LLVM_LIBC_FUNCTION(float, logf, (float x)) {
   constexpr double LOG_2 = 0x1.62e42fefa39efp-1;
   using FPBits = typename fputil::FPBits<float>;
+  using Sign = fputil::Sign;
   FPBits xbits(x);
   uint32_t x_u = xbits.uintval();
@@ -79,7 +80,7 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) {
 #endif // LIBC_TARGET_CPU_HAS_FMA
   }
   // Subnormal inputs.
-  if (LIBC_UNLIKELY(x_u < FPBits::MIN_NORMAL)) {
+  if (LIBC_UNLIKELY(x_u < FPBits::min_normal().uintval())) {
     if (x_u == 0) {
       // Return -inf and raise FE_DIVBYZERO
       fputil::set_errno_if_required(ERANGE);
@@ -112,18 +113,18 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) {
 #endif // LIBC_TARGET_CPU_HAS_FMA
   }
   // Exceptional inputs.
-  if (LIBC_UNLIKELY(x_u > FPBits::MAX_NORMAL)) {
+  if (LIBC_UNLIKELY(x_u > FPBits::max_normal().uintval())) {
     if (x_u == 0x8000'0000U) {
       // Return -inf and raise FE_DIVBYZERO
       fputil::set_errno_if_required(ERANGE);
       fputil::raise_except_if_required(FE_DIVBYZERO);
-      return static_cast<float>(FPBits::inf(fputil::Sign::NEG));
+      return FPBits::inf(Sign::NEG).get_val();
    }
    if (xbits.is_neg() && !xbits.is_nan()) {
      // Return NaN and raise FE_INVALID
      fputil::set_errno_if_required(EDOM);
      fputil::raise_except_if_required(FE_INVALID);
-      return FPBits::build_quiet_nan(0);
+      return FPBits::build_quiet_nan().get_val();
    }
    // x is +inf or nan
    return x;
diff --git a/libc/src/math/generic/powf.cpp b/libc/src/math/generic/powf.cpp
index 932f1d70c33e8..0e164ab8b4225 100644
--- a/libc/src/math/generic/powf.cpp
+++ b/libc/src/math/generic/powf.cpp
@@ -547,14 +547,15 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
         // pow(+-0, -Inf) = +inf and raise FE_DIVBYZERO
         fputil::set_errno_if_required(EDOM);
         fputil::raise_except_if_required(FE_DIVBYZERO);
-        return FloatBits::inf();
+        return FloatBits::inf().get_val();
       }
       // pow (|x| < 1, -inf) = +inf
       // pow (|x| < 1, +inf) = 0.0f
       // pow (|x| > 1, -inf) = 0.0f
       // pow (|x| > 1, +inf) = +inf
-      return ((x_abs < 0x3f80'0000) == (y_u == 0xff80'0000)) ? FloatBits::inf()
-                                                             : 0.0f;
+      return ((x_abs < 0x3f80'0000) == (y_u == 0xff80'0000))
+                 ? FloatBits::inf().get_val()
+                 : 0.0f;
     }
     default:
       // Speed up for common exponents
@@ -617,7 +618,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
         // pow(0, negative number) = inf
         fputil::set_errno_if_required(EDOM);
         fputil::raise_except_if_required(FE_DIVBYZERO);
-        return FloatBits::inf(out_is_neg ? Sign::NEG : Sign::POS);
+        return FloatBits::inf(out_is_neg ? Sign::NEG : Sign::POS).get_val();
       }
       // pow(0, positive number) = 0
       return out_is_neg ? -0.0f : 0.0f;
@@ -628,7 +629,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) {
       if (y_u >= FloatBits::SIGN_MASK) {
         return out_is_neg ? -0.0f : 0.0f;
       }
-      return FloatBits::inf(out_is_neg ? Sign::NEG : Sign::POS);
+      return FloatBits::inf(out_is_neg ?
Sign::NEG : Sign::POS).get_val(); } } @@ -656,7 +657,7 @@ LLVM_LIBC_FUNCTION(float, powf, (float x, float y)) { // pow( negative, non-integer ) = NaN fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); - return FloatBits::build_quiet_nan(0); + return FloatBits::build_quiet_nan().get_val(); } } } diff --git a/libc/src/math/generic/sincosf.cpp b/libc/src/math/generic/sincosf.cpp index 44371db710871..f12b93a0e6965 100644 --- a/libc/src/math/generic/sincosf.cpp +++ b/libc/src/math/generic/sincosf.cpp @@ -148,7 +148,9 @@ LLVM_LIBC_FUNCTION(void, sincosf, (float x, float *sinp, float *cosp)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - *sinp = x + FPBits::build_nan(FPBits::FRACTION_MASK); + *sinp = + x + + FPBits::build_nan(fputil::Sign::POS, FPBits::FRACTION_MASK).get_val(); *cosp = *sinp; return; } diff --git a/libc/src/math/generic/sinf.cpp b/libc/src/math/generic/sinf.cpp index 9e574d4e57240..7ba479f0a4598 100644 --- a/libc/src/math/generic/sinf.cpp +++ b/libc/src/math/generic/sinf.cpp @@ -139,7 +139,7 @@ LLVM_LIBC_FUNCTION(float, sinf, (float x)) { fputil::set_errno_if_required(EDOM); fputil::raise_except_if_required(FE_INVALID); } - return x + FPBits::build_quiet_nan(0); + return x + FPBits::build_quiet_nan().get_val(); } // Combine the results with the sine of sum formula: diff --git a/libc/src/math/generic/sinhf.cpp b/libc/src/math/generic/sinhf.cpp index b3850c6742706..780c9a1f8d6ac 100644 --- a/libc/src/math/generic/sinhf.cpp +++ b/libc/src/math/generic/sinhf.cpp @@ -56,16 +56,16 @@ LLVM_LIBC_FUNCTION(float, sinhf, (float x)) { int rounding = fputil::quick_get_round(); if (xbits.is_neg()) { if (LIBC_UNLIKELY(rounding == FE_UPWARD || rounding == FE_TOWARDZERO)) - return -FPBits::max_normal(); + return -FPBits::max_normal().get_val(); } else { if (LIBC_UNLIKELY(rounding == FE_DOWNWARD || rounding == FE_TOWARDZERO)) - return FPBits::max_normal(); + return FPBits::max_normal().get_val(); } fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_OVERFLOW); - return x + FPBits::inf(xbits.sign()); + return x + FPBits::inf(xbits.sign()).get_val(); } // sinh(x) = (e^x - e^(-x)) / 2. 
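The hunks above and below all apply one mechanical migration: the FPBits factory helpers (zero(), inf(), min_normal(), max_subnormal(), build_nan(), build_quiet_nan(), ...) no longer convert implicitly to the underlying floating-point type; they now hand back an FPBits object, so every call site must extract the value explicitly with .get_val(), and the NaN builders now take a Sign as their first argument. The following is a minimal standalone C++ sketch of the before/after shape, using a hypothetical FloatBits stand-in rather than the real fputil::FPBits<T>:

  #include <cstdint>
  #include <cstring>

  struct FloatBits {                     // hypothetical stand-in for fputil::FPBits<float>
    uint32_t bits;                       // raw IEEE-754 single-precision encoding
    static FloatBits inf() { return {0x7f80'0000u}; } // bit pattern of +infinity
    float get_val() const {              // explicit bits -> value conversion
      float f;
      std::memcpy(&f, &bits, sizeof(f)); // the equivalent of a bit_cast
      return f;
    }
  };

  float exceptional_result() {
    // Old style relied on an implicit FPBits -> float conversion:
    //   return FloatBits::inf();
    // New style spells the conversion out at the call site:
    return FloatBits::inf().get_val();
  }

Keeping the conversion explicit stops a raw bit container from silently decaying into a float value, which is why the same .get_val() edit recurs in every file this patch touches.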
diff --git a/libc/src/math/generic/tanf.cpp b/libc/src/math/generic/tanf.cpp
index 7909e9e5d5568..09dd62eae03f8 100644
--- a/libc/src/math/generic/tanf.cpp
+++ b/libc/src/math/generic/tanf.cpp
@@ -114,7 +114,7 @@ LLVM_LIBC_FUNCTION(float, tanf, (float x)) {
       fputil::set_errno_if_required(EDOM);
       fputil::raise_except_if_required(FE_INVALID);
     }
-    return x + FPBits::build_quiet_nan(0);
+    return x + FPBits::build_quiet_nan().get_val();
   }
   // Other large exceptional values
   if (auto r = TANF_EXCEPTS.lookup_odd(x_abs, x_sign);
diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h
index 26b8e3e60bdfd..9880fa51c5481 100644
--- a/libc/test/UnitTest/FPMatcher.h
+++ b/libc/test/UnitTest/FPMatcher.h
@@ -66,16 +66,16 @@ template <typename T> struct FPTest : public Test {
   using Sign = LIBC_NAMESPACE::fputil::Sign;
   static constexpr StorageType STORAGE_MAX =
       LIBC_NAMESPACE::cpp::numeric_limits<StorageType>::max();
-  static constexpr T zero = FPBits::zero(Sign::POS);
-  static constexpr T neg_zero = FPBits::zero(Sign::NEG);
-  static constexpr T aNaN = FPBits::build_quiet_nan(1);
-  static constexpr T sNaN = FPBits::build_nan(1);
-  static constexpr T inf = FPBits::inf(Sign::POS);
-  static constexpr T neg_inf = FPBits::inf(Sign::NEG);
-  static constexpr T min_normal = FPBits::min_normal();
-  static constexpr T max_normal = FPBits::max_normal();
-  static constexpr T min_denormal = FPBits::min_denormal();
-  static constexpr T max_denormal = FPBits::max_denormal();
+  static constexpr T zero = T(FPBits::zero(Sign::POS));
+  static constexpr T neg_zero = T(FPBits::zero(Sign::NEG));
+  static constexpr T aNaN = T(FPBits::build_quiet_nan(Sign::POS, 1));
+  static constexpr T sNaN = T(FPBits::build_nan(Sign::POS, 1));
+  static constexpr T inf = T(FPBits::inf(Sign::POS));
+  static constexpr T neg_inf = T(FPBits::inf(Sign::NEG));
+  static constexpr T min_normal = T(FPBits::min_normal());
+  static constexpr T max_normal = T(FPBits::max_normal());
+  static constexpr T min_denormal = T(FPBits::min_subnormal());
+  static constexpr T max_denormal = T(FPBits::max_subnormal());
   static constexpr int N_ROUNDING_MODES = 4;
   static constexpr fputil::testing::RoundingMode ROUNDING_MODES[4] = {
@@ -95,16 +95,16 @@ template <typename T> struct FPTest : public Test {
   using Sign = LIBC_NAMESPACE::fputil::Sign;                                 \
   static constexpr StorageType STORAGE_MAX =                                 \
       LIBC_NAMESPACE::cpp::numeric_limits<StorageType>::max();               \
-  const T zero = FPBits::zero(Sign::POS);                                    \
-  const T neg_zero = FPBits::zero(Sign::NEG);                                \
-  const T aNaN = FPBits::build_quiet_nan(1);                                 \
-  const T sNaN = FPBits::build_nan(1);                                       \
-  const T inf = FPBits::inf(Sign::POS);                                      \
-  const T neg_inf = FPBits::inf(Sign::NEG);                                  \
-  const T min_normal = FPBits::min_normal();                                 \
-  const T max_normal = FPBits::max_normal();                                 \
-  const T min_denormal = FPBits::min_denormal();                             \
-  const T max_denormal = FPBits::max_denormal();
+  const T zero = T(FPBits::zero(Sign::POS));                                 \
+  const T neg_zero = T(FPBits::zero(Sign::NEG));                             \
+  const T aNaN = T(FPBits::build_quiet_nan(Sign::POS, 1));                   \
+  const T sNaN = T(FPBits::build_nan(Sign::POS, 1));                         \
+  const T inf = T(FPBits::inf(Sign::POS));                                   \
+  const T neg_inf = T(FPBits::inf(Sign::NEG));                               \
+  const T min_normal = T(FPBits::min_normal());                              \
+  const T max_normal = T(FPBits::max_normal());                              \
+  const T min_denormal = T(FPBits::min_subnormal());                         \
+  const T max_denormal = T(FPBits::max_subnormal());
 #define EXPECT_FP_EQ(expected, actual)                                       \
   EXPECT_THAT(actual, LIBC_NAMESPACE::testing::getMatcher<                   \
diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp
index e6b6d7d9ec780..6092945811ce1 100644
--- a/libc/test/src/__support/FPUtil/fpbits_test.cpp
+++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp
@@ -232,13 +232,11 @@ TEST(LlvmLibcFPBitsTest, FPType_X86_Binary80_IsNan) {
 TEST(LlvmLibcFPBitsTest, FloatType) {
   using FloatBits = FPBits<float>;
-  EXPECT_STREQ(
-      LIBC_NAMESPACE::str(FloatBits(FloatBits::inf(Sign::POS))).c_str(),
-      "(+Infinity)");
-  EXPECT_STREQ(
-      LIBC_NAMESPACE::str(FloatBits(FloatBits::inf(Sign::NEG))).c_str(),
-      "(-Infinity)");
-  EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits(FloatBits::build_nan(1))).c_str(),
+  EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits::inf(Sign::POS)).c_str(),
+               "(+Infinity)");
+  EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits::inf(Sign::NEG)).c_str(),
+               "(-Infinity)");
+  EXPECT_STREQ(LIBC_NAMESPACE::str(FloatBits::build_nan(Sign::POS, 1)).c_str(),
                "(NaN)");
   FloatBits zero(0.0f);
@@ -289,22 +287,19 @@ TEST(LlvmLibcFPBitsTest, FloatType) {
   EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(),
                "0xBF900000 = (S: 1, E: 0x007F, M: 0x00100000)");
-  FloatBits quiet_nan = FloatBits(FloatBits::build_quiet_nan(1));
+  FloatBits quiet_nan = FloatBits::build_quiet_nan(Sign::POS, 1);
   EXPECT_EQ(quiet_nan.is_quiet_nan(), true);
 }
 TEST(LlvmLibcFPBitsTest, DoubleType) {
   using DoubleBits = FPBits<double>;
-  EXPECT_STREQ(
-      LIBC_NAMESPACE::str(DoubleBits(DoubleBits::inf(Sign::POS))).c_str(),
-      "(+Infinity)");
-  EXPECT_STREQ(
-      LIBC_NAMESPACE::str(DoubleBits(DoubleBits::inf(Sign::NEG))).c_str(),
-      "(-Infinity)");
-  EXPECT_STREQ(
-      LIBC_NAMESPACE::str(DoubleBits(DoubleBits::build_nan(1))).c_str(),
-      "(NaN)");
+  EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::inf(Sign::POS)).c_str(),
+               "(+Infinity)");
+  EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::inf(Sign::NEG)).c_str(),
+               "(-Infinity)");
+  EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::build_nan(Sign::POS, 1)).c_str(),
+               "(NaN)");
   DoubleBits zero(0.0);
   EXPECT_TRUE(zero.is_pos());
@@ -354,7 +349,7 @@ TEST(LlvmLibcFPBitsTest, DoubleType) {
   EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(),
                "0xBFF2000000000000 = (S: 1, E: 0x03FF, M: 0x0002000000000000)");
-  DoubleBits quiet_nan = DoubleBits(DoubleBits::build_quiet_nan(1));
+  DoubleBits quiet_nan = DoubleBits::build_quiet_nan(Sign::POS, 1);
   EXPECT_EQ(quiet_nan.is_quiet_nan(), true);
 }
@@ -365,16 +360,12 @@ TEST(LlvmLibcFPBitsTest, X86LongDoubleType) {
   if constexpr (sizeof(long double) == sizeof(double))
     return; // The tests for the "double" type cover for this case.
+  EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::POS)).c_str(),
+               "(+Infinity)");
+  EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::NEG)).c_str(),
+               "(-Infinity)");
   EXPECT_STREQ(
-      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::POS)))
-          .c_str(),
-      "(+Infinity)");
-  EXPECT_STREQ(
-      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::NEG)))
-          .c_str(),
-      "(-Infinity)");
-  EXPECT_STREQ(
-      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::build_nan(1))).c_str(),
+      LIBC_NAMESPACE::str(LongDoubleBits::build_nan(Sign::POS, 1)).c_str(),
       "(NaN)");
   LongDoubleBits zero(0.0l);
@@ -440,7 +431,7 @@ TEST(LlvmLibcFPBitsTest, X86LongDoubleType) {
                "0x000000000000BFFF9000000000000000 = "
                "(S: 1, E: 0x3FFF, I: 1, M: 0x00000000000000001000000000000000)");
-  LongDoubleBits quiet_nan = LongDoubleBits(LongDoubleBits::build_quiet_nan(1));
+  LongDoubleBits quiet_nan = LongDoubleBits::build_quiet_nan(Sign::POS, 1);
   EXPECT_EQ(quiet_nan.is_quiet_nan(), true);
 }
 #else
@@ -450,16 +441,12 @@ TEST(LlvmLibcFPBitsTest, LongDoubleType) {
 #else
   using LongDoubleBits = FPBits<long double>;
+  EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::POS)).c_str(),
+               "(+Infinity)");
+  EXPECT_STREQ(LIBC_NAMESPACE::str(LongDoubleBits::inf(Sign::NEG)).c_str(),
+               "(-Infinity)");
   EXPECT_STREQ(
-      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::POS)))
-          .c_str(),
-      "(+Infinity)");
-  EXPECT_STREQ(
-      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::inf(Sign::NEG)))
-          .c_str(),
-      "(-Infinity)");
-  EXPECT_STREQ(
-      LIBC_NAMESPACE::str(LongDoubleBits(LongDoubleBits::build_nan(1))).c_str(),
+      LIBC_NAMESPACE::str(LongDoubleBits::build_nan(Sign::POS, 1)).c_str(),
       "(NaN)");
   LongDoubleBits zero(0.0l);
@@ -519,7 +506,7 @@ TEST(LlvmLibcFPBitsTest, LongDoubleType) {
                "0xBFFF2000000000000000000000000000 = "
                "(S: 1, E: 0x3FFF, M: 0x00002000000000000000000000000000)");
-  LongDoubleBits quiet_nan = LongDoubleBits(LongDoubleBits::build_quiet_nan(1));
+  LongDoubleBits quiet_nan = LongDoubleBits::build_quiet_nan(Sign::POS, 1);
   EXPECT_EQ(quiet_nan.is_quiet_nan(), true);
 #endif
 }
@@ -529,17 +516,15 @@ TEST(LlvmLibcFPBitsTest, LongDoubleType) {
 TEST(LlvmLibcFPBitsTest, Float128Type) {
   using Float128Bits = FPBits<float128>;
+  EXPECT_STREQ(LIBC_NAMESPACE::str(Float128Bits::inf(Sign::POS)).c_str(),
+               "(+Infinity)");
+  EXPECT_STREQ(LIBC_NAMESPACE::str(Float128Bits::inf(Sign::NEG)).c_str(),
+               "(-Infinity)");
   EXPECT_STREQ(
-      LIBC_NAMESPACE::str(Float128Bits(Float128Bits::inf(Sign::POS))).c_str(),
-      "(+Infinity)");
-  EXPECT_STREQ(
-      LIBC_NAMESPACE::str(Float128Bits(Float128Bits::inf(Sign::NEG))).c_str(),
-      "(-Infinity)");
-  EXPECT_STREQ(
-      LIBC_NAMESPACE::str(Float128Bits(Float128Bits::build_nan(1))).c_str(),
+      LIBC_NAMESPACE::str(Float128Bits::build_nan(Sign::POS, 1)).c_str(),
       "(NaN)");
-  Float128Bits zero(Float128Bits::zero());
+  Float128Bits zero = Float128Bits::zero(Sign::POS);
   EXPECT_TRUE(zero.is_pos());
   EXPECT_EQ(zero.get_biased_exponent(), static_cast<uint16_t>(0x0000));
   EXPECT_EQ(zero.get_mantissa(), static_cast<UInt128>(0x0000000000000000)
@@ -549,7 +534,7 @@ TEST(LlvmLibcFPBitsTest, Float128Type) {
                "0x00000000000000000000000000000000 = "
                "(S: 0, E: 0x0000, M: 0x00000000000000000000000000000000)");
-  Float128Bits negzero(Float128Bits::zero(Sign::NEG));
+  Float128Bits negzero = Float128Bits::zero(Sign::NEG);
   EXPECT_TRUE(negzero.is_neg());
   EXPECT_EQ(negzero.get_biased_exponent(), static_cast<uint16_t>(0x0000));
   EXPECT_EQ(negzero.get_mantissa(), static_cast<UInt128>(0x0000000000000000)
@@ -596,7 +581,7 @@ TEST(LlvmLibcFPBitsTest, Float128Type) {
"0xBFFF2000000000000000000000000000 = " "(S: 1, E: 0x3FFF, M: 0x00002000000000000000000000000000)"); - Float128Bits quiet_nan = Float128Bits(Float128Bits::build_quiet_nan(1)); + Float128Bits quiet_nan = Float128Bits::build_quiet_nan(Sign::POS, 1); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } #endif // LIBC_COMPILER_HAS_FLOAT128 diff --git a/libc/test/src/math/FDimTest.h b/libc/test/src/math/FDimTest.h index 0744e6ea8fd8f..c3d9cb1801cd4 100644 --- a/libc/test/src/math/FDimTest.h +++ b/libc/test/src/math/FDimTest.h @@ -24,7 +24,7 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); void test_na_n_arg(FuncPtr func) { EXPECT_FP_EQ(nan, func(nan, inf)); diff --git a/libc/test/src/math/FmaTest.h b/libc/test/src/math/FmaTest.h index 032a79821d590..4343b38053dc4 100644 --- a/libc/test/src/math/FmaTest.h +++ b/libc/test/src/math/FmaTest.h @@ -25,11 +25,21 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; + const T min_subnormal = T(FPBits::min_subnormal(Sign::POS)); + const T min_normal = T(FPBits::min_normal(Sign::POS)); + const T max_normal = T(FPBits::max_normal(Sign::POS)); const T inf = T(FPBits::inf(Sign::POS)); const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + + static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); + static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); + static constexpr StorageType MAX_SUBNORMAL = + FPBits::max_subnormal().uintval(); + static constexpr StorageType MIN_SUBNORMAL = + FPBits::min_subnormal().uintval(); StorageType get_random_bit_pattern() { StorageType bits{0}; @@ -52,14 +62,13 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { EXPECT_FP_EQ(func(inf, neg_inf, nan), nan); // Test underflow rounding up. - EXPECT_FP_EQ(func(T(0.5), FPBits::min_denormal(), FPBits::min_denormal()), + EXPECT_FP_EQ(func(T(0.5), min_subnormal, min_subnormal), T(FPBits(StorageType(2)))); // Test underflow rounding down. - T v = T(FPBits(FPBits::MIN_NORMAL + StorageType(1))); - EXPECT_FP_EQ( - func(T(1) / T(FPBits::MIN_NORMAL << 1), v, FPBits::min_normal()), v); + T v = T(FPBits(MIN_NORMAL + StorageType(1))); + EXPECT_FP_EQ(func(T(1) / T(MIN_NORMAL << 1), v, min_normal), v); // Test overflow. - T z = FPBits::max_normal(); + T z = max_normal; EXPECT_FP_EQ(func(T(1.75), z, -z), T(0.75) * z); // Exact cancellation. 
    EXPECT_FP_EQ(func(T(3.0), T(5.0), -T(15.0)), T(0.0));
@@ -68,11 +77,9 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void test_subnormal_range(Func func) {
     constexpr StorageType COUNT = 100'001;
-    constexpr StorageType STEP =
-        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
-    for (StorageType v = FPBits::MIN_SUBNORMAL, w = FPBits::MAX_SUBNORMAL;
-         v <= FPBits::MAX_SUBNORMAL && w >= FPBits::MIN_SUBNORMAL;
-         v += STEP, w -= STEP) {
+    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
+    for (StorageType v = MIN_SUBNORMAL, w = MAX_SUBNORMAL;
+         v <= MAX_SUBNORMAL && w >= MIN_SUBNORMAL; v += STEP, w -= STEP) {
       T x = T(FPBits(get_random_bit_pattern())), y = T(FPBits(v)),
         z = T(FPBits(w));
       mpfr::TernaryInput<T> input{x, y, z};
@@ -83,11 +90,9 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void test_normal_range(Func func) {
     constexpr StorageType COUNT = 100'001;
-    constexpr StorageType STEP =
-        (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT;
-    for (StorageType v = FPBits::MIN_NORMAL, w = FPBits::MAX_NORMAL;
-         v <= FPBits::MAX_NORMAL && w >= FPBits::MIN_NORMAL;
-         v += STEP, w -= STEP) {
+    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
+    for (StorageType v = MIN_NORMAL, w = MAX_NORMAL;
+         v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) {
       T x = T(FPBits(v)), y = T(FPBits(w)),
         z = T(FPBits(get_random_bit_pattern()));
       mpfr::TernaryInput<T> input{x, y, z};
diff --git a/libc/test/src/math/HypotTest.h b/libc/test/src/math/HypotTest.h
index b7eb63c192c70..0b85f68fda82a 100644
--- a/libc/test/src/math/HypotTest.h
+++ b/libc/test/src/math/HypotTest.h
@@ -25,15 +25,22 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
   using Sign = LIBC_NAMESPACE::fputil::Sign;
   using StorageType = typename FPBits::StorageType;
-  const T nan = FPBits::build_quiet_nan(1);
-  const T inf = FPBits::inf();
-  const T neg_inf = FPBits::inf(Sign::NEG);
-  const T zero = FPBits::zero();
-  const T neg_zero = FPBits::zero(Sign::NEG);
-  const T max_normal = FPBits::max_normal();
-  const T min_normal = FPBits::min_normal();
-  const T max_subnormal = FPBits::max_denormal();
-  const T min_subnormal = FPBits::min_denormal();
+  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
+  const T inf = T(FPBits::inf());
+  const T neg_inf = T(FPBits::inf(Sign::NEG));
+  const T zero = T(FPBits::zero());
+  const T neg_zero = T(FPBits::zero(Sign::NEG));
+  const T max_normal = T(FPBits::max_normal());
+  const T min_normal = T(FPBits::min_normal());
+  const T max_subnormal = T(FPBits::max_subnormal());
+  const T min_subnormal = T(FPBits::min_subnormal());
+
+  static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
+  static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
+  static constexpr StorageType MAX_SUBNORMAL =
+      FPBits::max_subnormal().uintval();
+  static constexpr StorageType MIN_SUBNORMAL =
+      FPBits::min_subnormal().uintval();
 public:
   void test_special_numbers(Func func) {
@@ -62,12 +69,11 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void test_subnormal_range(Func func) {
     constexpr StorageType COUNT = 10'001;
     for (unsigned scale = 0; scale < 4; ++scale) {
-      StorageType max_value = FPBits::MAX_SUBNORMAL << scale;
-      StorageType step = (max_value - FPBits::MIN_SUBNORMAL) / COUNT;
+      StorageType max_value = MAX_SUBNORMAL << scale;
+      StorageType step = (max_value - MIN_SUBNORMAL) / COUNT;
       for (int signs = 0; signs < 4; ++signs) {
-        for (StorageType v = FPBits::MIN_SUBNORMAL, w = max_value;
-             v <= max_value && w >= FPBits::MIN_SUBNORMAL;
-             v += step, w -= step) {
+        for (StorageType v = MIN_SUBNORMAL, w = max_value;
+             v <= max_value && w >= MIN_SUBNORMAL; v += step, w -= step) {
           T x = T(FPBits(v)), y = T(FPBits(w));
           if (signs % 2 == 1) {
             x = -x;
@@ -86,13 +92,10 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void test_normal_range(Func func) {
     constexpr StorageType COUNT = 10'001;
-    constexpr StorageType STEP =
-        (StorageType(FPBits::MAX_NORMAL) - StorageType(FPBits::MIN_NORMAL)) /
-        COUNT;
+    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
     for (int signs = 0; signs < 4; ++signs) {
-      for (StorageType v = FPBits::MIN_NORMAL, w = FPBits::MAX_NORMAL;
-           v <= FPBits::MAX_NORMAL && w >= FPBits::MIN_NORMAL;
-           v += STEP, w -= STEP) {
+      for (StorageType v = MIN_NORMAL, w = MAX_NORMAL;
+           v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) {
         T x = T(FPBits(v)), y = T(FPBits(w));
         if (signs % 2 == 1) {
           x = -x;
diff --git a/libc/test/src/math/ILogbTest.h b/libc/test/src/math/ILogbTest.h
index 9fa25c9ff9861..223de789999af 100644
--- a/libc/test/src/math/ILogbTest.h
+++ b/libc/test/src/math/ILogbTest.h
@@ -26,10 +26,10 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
   void test_special_numbers(typename ILogbFunc::Func func) {
     using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
     using Sign = LIBC_NAMESPACE::fputil::Sign;
-    EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero())));
+    EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::POS))));
     EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::NEG))));
-    EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(1))));
-    EXPECT_EQ(INT_MAX, func(T(FPBits::inf())));
+    EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(Sign::POS, 1))));
+    EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::POS))));
     EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::NEG))));
   }
@@ -76,11 +76,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
   void test_subnormal_range(typename ILogbFunc::Func func) {
     using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
     using StorageType = typename FPBits::StorageType;
+    constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval();
+    constexpr StorageType MAX_SUBNORMAL = FPBits::max_subnormal().uintval();
     constexpr StorageType COUNT = 10'001;
-    constexpr StorageType STEP =
-        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
-    for (StorageType v = FPBits::MIN_SUBNORMAL; v <= FPBits::MAX_SUBNORMAL;
-         v += STEP) {
+    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
+    for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) {
       T x = T(FPBits(v));
       if (isnan(x) || isinf(x) || x == 0.0)
         continue;
@@ -95,11 +95,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
   void test_normal_range(typename ILogbFunc::Func func) {
     using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
     using StorageType = typename FPBits::StorageType;
+    constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
+    constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
     constexpr StorageType COUNT = 10'001;
-    constexpr StorageType STEP =
-        (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT;
-    for (StorageType v = FPBits::MIN_NORMAL; v <= FPBits::MAX_NORMAL;
-         v += STEP) {
+    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
+    for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) {
       T x = T(FPBits(v));
      if (isnan(x) || isinf(x) || x == 0.0)
        continue;
diff --git a/libc/test/src/math/LdExpTest.h
b/libc/test/src/math/LdExpTest.h index 25120ba3646fd..3a4baabbf10e6 100644 --- a/libc/test/src/math/LdExpTest.h +++ b/libc/test/src/math/LdExpTest.h @@ -29,7 +29,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); // A normalized mantissa to be used with tests. static constexpr StorageType MANTISSA = NormalFloat::ONE + 0x1234; diff --git a/libc/test/src/math/NextAfterTest.h b/libc/test/src/math/NextAfterTest.h index aa9646fd921f8..9ff3bf73d2ede 100644 --- a/libc/test/src/math/NextAfterTest.h +++ b/libc/test/src/math/NextAfterTest.h @@ -27,12 +27,12 @@ class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); - const StorageType min_subnormal = FPBits::MIN_SUBNORMAL; - const StorageType max_subnormal = FPBits::MAX_SUBNORMAL; - const StorageType min_normal = FPBits::MIN_NORMAL; - const StorageType max_normal = FPBits::MAX_NORMAL; + const StorageType min_subnormal = FPBits::min_subnormal().uintval(); + const StorageType max_subnormal = FPBits::max_subnormal().uintval(); + const StorageType min_normal = FPBits::min_normal().uintval(); + const StorageType max_normal = FPBits::max_normal().uintval(); public: typedef T (*NextAfterFunc)(T, T); diff --git a/libc/test/src/math/RIntTest.h b/libc/test/src/math/RIntTest.h index 6816e94d389f4..b478e3f65dbc8 100644 --- a/libc/test/src/math/RIntTest.h +++ b/libc/test/src/math/RIntTest.h @@ -38,7 +38,14 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + + static constexpr StorageType MIN_SUBNORMAL = + FPBits::min_subnormal().uintval(); + static constexpr StorageType MAX_SUBNORMAL = + FPBits::max_subnormal().uintval(); + static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); + static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); static inline mpfr::RoundingMode to_mpfr_rounding_mode(int mode) { switch (mode) { @@ -95,10 +102,8 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { void testSubnormalRange(RIntFunc func) { constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = - (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT; - for (StorageType i = FPBits::MIN_SUBNORMAL; i <= FPBits::MAX_SUBNORMAL; - i += STEP) { + constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; + for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) { T x = T(FPBits(i)); for (int mode : ROUNDING_MODES) { LIBC_NAMESPACE::fputil::set_round(mode); @@ -110,10 +115,8 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { void testNormalRange(RIntFunc func) { constexpr StorageType COUNT = 100'001; - constexpr StorageType STEP = - (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT; - for (StorageType i = FPBits::MIN_NORMAL; i <= FPBits::MAX_NORMAL; - i += STEP) { + constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / 
COUNT;
+    for (StorageType i = MIN_NORMAL; i <= MAX_NORMAL; i += STEP) {
       T x = T(FPBits(i));
       // In normal range on x86 platforms, the long double implicit 1 bit can be
       // zero making the numbers NaN. We will skip them.
diff --git a/libc/test/src/math/RemQuoTest.h b/libc/test/src/math/RemQuoTest.h
index 9ed9525662452..0ee41f4bf9acf 100644
--- a/libc/test/src/math/RemQuoTest.h
+++ b/libc/test/src/math/RemQuoTest.h
@@ -28,7 +28,14 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(1));
+  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
+
+  static constexpr StorageType MIN_SUBNORMAL =
+      FPBits::min_subnormal().uintval();
+  static constexpr StorageType MAX_SUBNORMAL =
+      FPBits::max_subnormal().uintval();
+  static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
+  static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
 public:
   typedef T (*RemQuoFunc)(T, T, int *);
@@ -97,11 +104,9 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void testSubnormalRange(RemQuoFunc func) {
     constexpr StorageType COUNT = 100'001;
-    constexpr StorageType STEP =
-        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
-    for (StorageType v = FPBits::MIN_SUBNORMAL, w = FPBits::MAX_SUBNORMAL;
-         v <= FPBits::MAX_SUBNORMAL && w >= FPBits::MIN_SUBNORMAL;
-         v += STEP, w -= STEP) {
+    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
+    for (StorageType v = MIN_SUBNORMAL, w = MAX_SUBNORMAL;
+         v <= MAX_SUBNORMAL && w >= MIN_SUBNORMAL; v += STEP, w -= STEP) {
       T x = T(FPBits(v)), y = T(FPBits(w));
       mpfr::BinaryOutput<T> result;
       mpfr::BinaryInput<T> input{x, y};
@@ -112,11 +117,9 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void testNormalRange(RemQuoFunc func) {
     constexpr StorageType COUNT = 1'001;
-    constexpr StorageType STEP =
-        (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT;
-    for (StorageType v = FPBits::MIN_NORMAL, w = FPBits::MAX_NORMAL;
-         v <= FPBits::MAX_NORMAL && w >= FPBits::MIN_NORMAL;
-         v += STEP, w -= STEP) {
+    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
+    for (StorageType v = MIN_NORMAL, w = MAX_NORMAL;
+         v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) {
       T x = T(FPBits(v)), y = T(FPBits(w));
       mpfr::BinaryOutput<T> result;
       mpfr::BinaryInput<T> input{x, y};
diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h
index e8ada1b4c36c5..6866c23cb99ca 100644
--- a/libc/test/src/math/RoundToIntegerTest.h
+++ b/libc/test/src/math/RoundToIntegerTest.h
@@ -37,7 +37,15 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const F neg_zero = F(FPBits::zero(Sign::NEG));
   const F inf = F(FPBits::inf());
   const F neg_inf = F(FPBits::inf(Sign::NEG));
-  const F nan = F(FPBits::build_quiet_nan(1));
+  const F nan = F(FPBits::build_quiet_nan(Sign::POS, 1));
+
+  static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
+  static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
+  static constexpr StorageType MAX_SUBNORMAL =
+      FPBits::max_subnormal().uintval();
+  static constexpr StorageType MIN_SUBNORMAL =
+      FPBits::min_subnormal().uintval();
+
   static constexpr I INTEGER_MIN = I(1) << (sizeof(I) * 8 - 1);
   static constexpr I INTEGER_MAX = -(INTEGER_MIN + 1);
@@ -215,10 +223,8 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test {
   void testSubnormalRange(RoundToIntegerFunc func) {
     constexpr StorageType COUNT = 1'000'001;
-    constexpr StorageType STEP =
-        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
-    for (StorageType i = FPBits::MIN_SUBNORMAL; i <= FPBits::MAX_SUBNORMAL;
-         i += STEP) {
+    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
+    for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) {
       F x = F(FPBits(i));
       if (x == F(0.0))
         continue;
@@ -259,10 +265,8 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test {
       return;
     constexpr StorageType COUNT = 1'000'001;
-    constexpr StorageType STEP =
-        (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT;
-    for (StorageType i = FPBits::MIN_NORMAL; i <= FPBits::MAX_NORMAL;
-         i += STEP) {
+    constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT;
+    for (StorageType i = MIN_NORMAL; i <= MAX_NORMAL; i += STEP) {
       F x = F(FPBits(i));
       // In normal range on x86 platforms, the long double implicit 1 bit can be
       // zero making the numbers NaN. We will skip them.
diff --git a/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h b/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h
index ada7c9e495426..48572e78e5153 100644
--- a/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h
+++ b/libc/test/src/math/differential_testing/BinaryOpSingleOutputDiff.h
@@ -109,10 +109,13 @@ template <typename T> class BinaryOpSingleOutputDiff {
     std::ofstream log(logFile);
     log << " Performance tests with inputs in denormal range:\n";
     run_perf_in_range(myFunc, otherFunc, /* startingBit= */ StorageType(0),
-                      /* endingBit= */ FPBits::MAX_SUBNORMAL, 1'000'001, log);
+                      /* endingBit= */ FPBits::max_subnormal().uintval(),
+                      1'000'001, log);
     log << "\n Performance tests with inputs in normal range:\n";
-    run_perf_in_range(myFunc, otherFunc, /* startingBit= */ FPBits::MIN_NORMAL,
-                      /* endingBit= */ FPBits::MAX_NORMAL, 100'000'001, log);
+    run_perf_in_range(myFunc, otherFunc,
+                      /* startingBit= */ FPBits::min_normal().uintval(),
+                      /* endingBit= */ FPBits::max_normal().uintval(),
+                      100'000'001, log);
     log << "\n Performance tests with inputs in normal range with exponents "
            "close to each other:\n";
     run_perf_in_range(
@@ -126,11 +129,12 @@ template <typename T> class BinaryOpSingleOutputDiff {
     log << " Diff tests with inputs in denormal range:\n";
     diffCount += run_diff_in_range(
         myFunc, otherFunc, /* startingBit= */ StorageType(0),
-        /* endingBit= */ FPBits::MAX_SUBNORMAL, 1'000'001, log);
+        /* endingBit= */ FPBits::max_subnormal().uintval(), 1'000'001, log);
     log << "\n Diff tests with inputs in normal range:\n";
     diffCount += run_diff_in_range(
-        myFunc, otherFunc, /* startingBit= */ FPBits::MIN_NORMAL,
-        /* endingBit= */ FPBits::MAX_NORMAL, 100'000'001, log);
+        myFunc, otherFunc,
+        /* startingBit= */ FPBits::min_normal().uintval(),
+        /* endingBit= */ FPBits::max_normal().uintval(), 100'000'001, log);
     log << "\n Diff tests with inputs in normal range with exponents "
            "close to each other:\n";
     diffCount += run_diff_in_range(
diff --git a/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h b/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h
index e4cd06eb22b71..5e8310e889dc6 100644
--- a/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h
+++ b/libc/test/src/math/differential_testing/SingleInputSingleOutputDiff.h
@@ -93,10 +93,11 @@ template <typename T> class SingleInputSingleOutputDiff {
     std::ofstream log(logFile);
     log << " Performance tests with inputs in denormal range:\n";
     runPerfInRange(myFunc, otherFunc, /* startingBit= */ StorageType(0),
-                   /* endingBit= */ FPBits::MAX_SUBNORMAL, log);
+                   /* endingBit= */ FPBits::max_subnormal().uintval(), log);
     log << "\n Performance tests with inputs in normal range:\n";
-    runPerfInRange(myFunc, otherFunc, /* startingBit= */ FPBits::MIN_NORMAL,
-                   /* endingBit= */ FPBits::MAX_NORMAL, log);
+    runPerfInRange(myFunc, otherFunc,
+                   /* startingBit= */ FPBits::min_normal().uintval(),
+                   /* endingBit= */ FPBits::max_normal().uintval(), log);
   }
 };
diff --git a/libc/test/src/math/smoke/FDimTest.h b/libc/test/src/math/smoke/FDimTest.h
index 0744e6ea8fd8f..c3d9cb1801cd4 100644
--- a/libc/test/src/math/smoke/FDimTest.h
+++ b/libc/test/src/math/smoke/FDimTest.h
@@ -24,7 +24,7 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(1));
+  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
   void test_na_n_arg(FuncPtr func) {
     EXPECT_FP_EQ(nan, func(nan, inf));
diff --git a/libc/test/src/math/smoke/FmaTest.h b/libc/test/src/math/smoke/FmaTest.h
index dc624d871b990..337ce659a23e1 100644
--- a/libc/test/src/math/smoke/FmaTest.h
+++ b/libc/test/src/math/smoke/FmaTest.h
@@ -25,7 +25,7 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test {
   const T neg_inf = T(FPBits::inf(Sign::NEG));
   const T zero = T(FPBits::zero(Sign::POS));
   const T neg_zero = T(FPBits::zero(Sign::NEG));
-  const T nan = T(FPBits::build_quiet_nan(1));
+  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
 public:
   void test_special_numbers(Func func) {
@@ -39,14 +39,16 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test {
     EXPECT_FP_EQ(func(inf, neg_inf, nan), nan);
     // Test underflow rounding up.
-    EXPECT_FP_EQ(func(T(0.5), FPBits::min_denormal(), FPBits::min_denormal()),
-                 T(FPBits(StorageType(2))));
-    // Test underflow rounding down.
-    T v = T(FPBits(FPBits::MIN_NORMAL + StorageType(1)));
     EXPECT_FP_EQ(
-        func(T(1) / T(FPBits::MIN_NORMAL << 1), v, FPBits::min_normal()), v);
+        func(T(0.5), T(FPBits::min_subnormal()), T(FPBits::min_subnormal())),
+        T(FPBits(StorageType(2))));
+    // Test underflow rounding down.
+    StorageType MIN_NORMAL = FPBits::min_normal().uintval();
+    T v = T(FPBits(MIN_NORMAL + StorageType(1)));
+    EXPECT_FP_EQ(func(T(1) / T(MIN_NORMAL << 1), v, T(FPBits::min_normal())),
+                 v);
     // Test overflow.
-    T z = FPBits::max_normal();
+    T z = T(FPBits::max_normal());
     EXPECT_FP_EQ(func(T(1.75), z, -z), T(0.75) * z);
     // Exact cancellation.
    EXPECT_FP_EQ(func(T(3.0), T(5.0), -T(15.0)), T(0.0));
diff --git a/libc/test/src/math/smoke/HypotTest.h b/libc/test/src/math/smoke/HypotTest.h
index 77454c19a6538..67110536b9623 100644
--- a/libc/test/src/math/smoke/HypotTest.h
+++ b/libc/test/src/math/smoke/HypotTest.h
@@ -22,16 +22,16 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test {
   using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
   using StorageType = typename FPBits::StorageType;
   using Sign = LIBC_NAMESPACE::fputil::Sign;
-  const T nan = FPBits::build_quiet_nan(1);
-  const T inf = FPBits::inf(Sign::POS);
-  const T neg_inf = FPBits::inf(Sign::NEG);
-  const T zero = FPBits::zero(Sign::POS);
-  const T neg_zero = FPBits::zero(Sign::NEG);
-
-  const T max_normal = FPBits::max_normal();
-  const T min_normal = FPBits::min_normal();
-  const T max_subnormal = FPBits::max_denormal();
-  const T min_subnormal = FPBits::min_denormal();
+  const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1));
+  const T inf = T(FPBits::inf(Sign::POS));
+  const T neg_inf = T(FPBits::inf(Sign::NEG));
+  const T zero = T(FPBits::zero(Sign::POS));
+  const T neg_zero = T(FPBits::zero(Sign::NEG));
+
+  const T max_normal = T(FPBits::max_normal());
+  const T min_normal = T(FPBits::min_normal());
+  const T max_subnormal = T(FPBits::max_subnormal());
+  const T min_subnormal = T(FPBits::min_subnormal());
 public:
   void test_special_numbers(Func func) {
diff --git a/libc/test/src/math/smoke/ILogbTest.h b/libc/test/src/math/smoke/ILogbTest.h
index 0a50abc04f727..223de789999af 100644
--- a/libc/test/src/math/smoke/ILogbTest.h
+++ b/libc/test/src/math/smoke/ILogbTest.h
@@ -28,7 +28,7 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
     using Sign = LIBC_NAMESPACE::fputil::Sign;
     EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::POS))));
     EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::NEG))));
-    EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(1))));
+    EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(Sign::POS, 1))));
     EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::POS))));
     EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::NEG))));
   }
@@ -76,11 +76,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
   void test_subnormal_range(typename ILogbFunc::Func func) {
     using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
     using StorageType = typename FPBits::StorageType;
+    constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval();
+    constexpr StorageType MAX_SUBNORMAL = FPBits::max_subnormal().uintval();
     constexpr StorageType COUNT = 10'001;
-    constexpr StorageType STEP =
-        (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT;
-    for (StorageType v = FPBits::MIN_SUBNORMAL; v <= FPBits::MAX_SUBNORMAL;
-         v += STEP) {
+    constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT;
+    for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) {
       T x = T(FPBits(v));
       if (isnan(x) || isinf(x) || x == 0.0)
         continue;
@@ -95,11 +95,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test {
   void test_normal_range(typename ILogbFunc::Func func) {
     using FPBits = LIBC_NAMESPACE::fputil::FPBits<T>;
     using StorageType = typename FPBits::StorageType;
+    constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval();
+    constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval();
     constexpr StorageType COUNT = 10'001;
-    constexpr StorageType STEP =
-        (FPBits::MAX_NORMAL - FPBits::MIN_NORMAL) / COUNT;
-    for (StorageType v = FPBits::MIN_NORMAL; v <= FPBits::MAX_NORMAL;
-         v += STEP) {
+    constexpr StorageType STEP = (MAX_NORMAL -
MIN_NORMAL) / COUNT; + for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) { T x = T(FPBits(v)); if (isnan(x) || isinf(x) || x == 0.0) continue; diff --git a/libc/test/src/math/smoke/LdExpTest.h b/libc/test/src/math/smoke/LdExpTest.h index 25120ba3646fd..3a4baabbf10e6 100644 --- a/libc/test/src/math/smoke/LdExpTest.h +++ b/libc/test/src/math/smoke/LdExpTest.h @@ -29,7 +29,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); // A normalized mantissa to be used with tests. static constexpr StorageType MANTISSA = NormalFloat::ONE + 0x1234; diff --git a/libc/test/src/math/smoke/NextAfterTest.h b/libc/test/src/math/smoke/NextAfterTest.h index bdf3da627180a..1dd07b3d2f93d 100644 --- a/libc/test/src/math/smoke/NextAfterTest.h +++ b/libc/test/src/math/smoke/NextAfterTest.h @@ -38,12 +38,14 @@ class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); - - const StorageType min_subnormal = FPBits::MIN_SUBNORMAL; - const StorageType max_subnormal = FPBits::MAX_SUBNORMAL; - const StorageType min_normal = FPBits::MIN_NORMAL; - const StorageType max_normal = FPBits::MAX_NORMAL; + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + + static constexpr StorageType min_subnormal = + FPBits::min_subnormal().uintval(); + static constexpr StorageType max_subnormal = + FPBits::max_subnormal().uintval(); + static constexpr StorageType min_normal = FPBits::min_normal().uintval(); + static constexpr StorageType max_normal = FPBits::max_normal().uintval(); public: typedef T (*NextAfterFunc)(T, T); diff --git a/libc/test/src/math/smoke/NextTowardTest.h b/libc/test/src/math/smoke/NextTowardTest.h index af4e0ab14531c..d65cc5d84d35a 100644 --- a/libc/test/src/math/smoke/NextTowardTest.h +++ b/libc/test/src/math/smoke/NextTowardTest.h @@ -40,16 +40,18 @@ class NextTowardTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); - - const long double to_zero = ToFPBits::zero(); - const long double to_neg_zero = ToFPBits::zero(Sign::NEG); - const long double to_nan = ToFPBits::build_quiet_nan(1); - - const StorageType min_subnormal = FPBits::MIN_SUBNORMAL; - const StorageType max_subnormal = FPBits::MAX_SUBNORMAL; - const StorageType min_normal = FPBits::MIN_NORMAL; - const StorageType max_normal = FPBits::MAX_NORMAL; + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + + const long double to_zero = ToFPBits::zero().get_val(); + const long double to_neg_zero = ToFPBits::zero(Sign::NEG).get_val(); + const long double to_nan = ToFPBits::build_quiet_nan(Sign::POS, 1).get_val(); + + static constexpr StorageType min_subnormal = + FPBits::min_subnormal().uintval(); + static constexpr StorageType max_subnormal = + FPBits::max_subnormal().uintval(); + static constexpr StorageType min_normal = FPBits::min_normal().uintval(); + static constexpr StorageType max_normal = FPBits::max_normal().uintval(); public: typedef T (*NextTowardFunc)(T, long double); diff --git 
a/libc/test/src/math/smoke/RIntTest.h b/libc/test/src/math/smoke/RIntTest.h index b242c7e441b69..7bbbe54301570 100644 --- a/libc/test/src/math/smoke/RIntTest.h +++ b/libc/test/src/math/smoke/RIntTest.h @@ -35,7 +35,7 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); public: void testSpecialNumbers(RIntFunc func) { diff --git a/libc/test/src/math/smoke/RemQuoTest.h b/libc/test/src/math/smoke/RemQuoTest.h index 93e20747e5263..5f5cbd4964a62 100644 --- a/libc/test/src/math/smoke/RemQuoTest.h +++ b/libc/test/src/math/smoke/RemQuoTest.h @@ -25,7 +25,7 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(1)); + const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); public: typedef T (*RemQuoFunc)(T, T, int *); diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h index 2703b78f00e0f..77c65aa492e22 100644 --- a/libc/test/src/math/smoke/RoundToIntegerTest.h +++ b/libc/test/src/math/smoke/RoundToIntegerTest.h @@ -30,11 +30,17 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const F zero = F(LIBC_NAMESPACE::fputil::FPBits::zero(Sign::POS)); - const F neg_zero = F(LIBC_NAMESPACE::fputil::FPBits::zero(Sign::NEG)); - const F inf = F(LIBC_NAMESPACE::fputil::FPBits::inf(Sign::POS)); - const F neg_inf = F(LIBC_NAMESPACE::fputil::FPBits::inf(Sign::NEG)); - const F nan = F(LIBC_NAMESPACE::fputil::FPBits::build_quiet_nan(1)); + const F zero = F(FPBits::zero(Sign::POS)); + const F neg_zero = F(FPBits::zero(Sign::NEG)); + const F inf = F(FPBits::inf(Sign::POS)); + const F neg_inf = F(FPBits::inf(Sign::NEG)); + const F nan = F(FPBits::build_quiet_nan(Sign::POS, 1)); + + static constexpr StorageType MAX_SUBNORMAL = + FPBits::max_subnormal().uintval(); + static constexpr StorageType MIN_SUBNORMAL = + FPBits::min_subnormal().uintval(); + static constexpr I INTEGER_MIN = I(1) << (sizeof(I) * 8 - 1); static constexpr I INTEGER_MAX = -(INTEGER_MIN + 1); @@ -111,10 +117,8 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { void testSubnormalRange(RoundToIntegerFunc func) { constexpr StorageType COUNT = 1'000'001; - constexpr StorageType STEP = - (FPBits::MAX_SUBNORMAL - FPBits::MIN_SUBNORMAL) / COUNT; - for (StorageType i = FPBits::MIN_SUBNORMAL; i <= FPBits::MAX_SUBNORMAL; - i += STEP) { + constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; + for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) { F x = F(FPBits(i)); if (x == F(0.0)) continue; diff --git a/libc/test/src/stdio/sprintf_test.cpp b/libc/test/src/stdio/sprintf_test.cpp index 344853beaf9fa..b22378b22ab12 100644 --- a/libc/test/src/stdio/sprintf_test.cpp +++ b/libc/test/src/stdio/sprintf_test.cpp @@ -585,8 +585,10 @@ TEST(LlvmLibcSPrintfTest, OctConv) { TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) { ForceRoundingMode r(RoundingMode::Nearest); - double inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + double inf = 
LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); written = LIBC_NAMESPACE::sprintf(buff, "%a", 1.0); ASSERT_STREQ_LEN(written, buff, "0x1p+0"); @@ -949,11 +951,15 @@ TEST_F(LlvmLibcSPrintfTest, FloatHexExpConv) { TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) { ForceRoundingMode r(RoundingMode::Nearest); - double inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); - long double ld_inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - long double ld_nan = - LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + double inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); + long double ld_inf = + LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + long double ld_nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); char big_buff[10000]; // Used for long doubles and other extremely wide // numbers. @@ -1790,8 +1796,10 @@ TEST_F(LlvmLibcSPrintfTest, FloatDecimalConv) { TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) { ForceRoundingMode r(RoundingMode::Nearest); - double inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + double inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); written = LIBC_NAMESPACE::sprintf(buff, "%e", 1.0); ASSERT_STREQ_LEN(written, buff, "1.000000e+00"); @@ -2422,8 +2430,10 @@ TEST_F(LlvmLibcSPrintfTest, FloatExponentConv) { TEST_F(LlvmLibcSPrintfTest, FloatAutoConv) { ForceRoundingMode r(RoundingMode::Nearest); - double inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + double inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + double nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); written = LIBC_NAMESPACE::sprintf(buff, "%g", 1.0); ASSERT_STREQ_LEN(written, buff, "1"); diff --git a/libc/test/src/stdio/sscanf_test.cpp b/libc/test/src/stdio/sscanf_test.cpp index db3c48cdbf7a2..db67c25029133 100644 --- a/libc/test/src/stdio/sscanf_test.cpp +++ b/libc/test/src/stdio/sscanf_test.cpp @@ -230,8 +230,10 @@ TEST(LlvmLibcSScanfTest, FloatConvSimple) { int ret_val; float result = 0; - float inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - float nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + float inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + float nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); ret_val = LIBC_NAMESPACE::sscanf("123", "%f", &result); EXPECT_EQ(ret_val, 1); @@ -294,9 +296,10 @@ TEST(LlvmLibcSScanfTest, FloatConvLengthModifier) { double d_result = 0; long double ld_result = 0; - double d_inf = LIBC_NAMESPACE::fputil::FPBits::inf(); - long double ld_nan = - LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + double d_inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + long double ld_nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); ret_val = LIBC_NAMESPACE::sscanf("123", "%lf", &d_result); EXPECT_EQ(ret_val, 1); @@ -391,8 +394,10 @@ TEST(LlvmLibcSScanfTest, FloatConvComplexParsing) { int ret_val; float result = 0; - float inf = 
LIBC_NAMESPACE::fputil::FPBits::inf(); - float nan = LIBC_NAMESPACE::fputil::FPBits::build_nan(1); + float inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); + float nan = LIBC_NAMESPACE::fputil::FPBits::build_nan( + LIBC_NAMESPACE::fputil::Sign::POS, 1) + .get_val(); ret_val = LIBC_NAMESPACE::sscanf("0x1.0e3", "%f", &result); EXPECT_EQ(ret_val, 1); @@ -463,7 +468,7 @@ TEST(LlvmLibcSScanfTest, FloatConvMaxWidth) { int ret_val; float result = 0; - float inf = LIBC_NAMESPACE::fputil::FPBits::inf(); + float inf = LIBC_NAMESPACE::fputil::FPBits::inf().get_val(); ret_val = LIBC_NAMESPACE::sscanf("123", "%3f", &result); EXPECT_EQ(ret_val, 1); diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index b6ca525db6cf7..06a231c7d94d0 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -49,7 +49,7 @@ template <> struct ExtraPrecision { template static inline unsigned int get_precision(double ulp_tolerance) { if (ulp_tolerance <= 0.5) { - return LIBC_NAMESPACE::fputil::FPBits::MANTISSA_PRECISION; + return LIBC_NAMESPACE::fputil::FPBits::FRACTION_LEN + 1; } else { return ExtraPrecision::VALUE; } From d2398cca6f716bb48f11500faf90f087f5b5f5d1 Mon Sep 17 00:00:00 2001 From: Saiyedul Islam Date: Tue, 23 Jan 2024 18:48:39 +0530 Subject: [PATCH 612/843] Restore: [mlir][ROCDL] Stop setting amdgpu-implicitarg-num-bytes (#79129) This patch restores PR#78498 --- .../Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp | 2 +- mlir/test/Target/LLVMIR/rocdl.mlir | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp index a230ead7c1883..0cbb3da79d151 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/ROCDL/ROCDLToLLVMIRTranslation.cpp @@ -99,7 +99,7 @@ class ROCDLDialectLLVMIRTranslationInterface if (!llvmFunc->hasFnAttribute("amdgpu-flat-work-group-size")) { llvmFunc->addFnAttr("amdgpu-flat-work-group-size", "1,256"); } - llvmFunc->addFnAttr("amdgpu-implicitarg-num-bytes", "256"); + } // Override flat-work-group-size // TODO: update clients to rocdl.flat_work_group_size instead, diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index f831d7bba864c..3c9c70711ae23 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -489,7 +489,7 @@ llvm.func @rocdl_8bit_floats(%source: i32, %stoch: i32) -> i32 { llvm.return %source5 : i32 } -// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-implicitarg-num-bytes"="256" } +// CHECK-DAG: attributes #[[$KERNEL_ATTRS]] = { "amdgpu-flat-work-group-size"="1,256" } // CHECK-DAG: attributes #[[$KERNEL_WORKGROUP_ATTRS]] = { "amdgpu-flat-work-group-size"="1,1024" // CHECK-DAG: attributes #[[$KNOWN_BLOCK_SIZE_ATTRS]] = { "amdgpu-flat-work-group-size"="128,128" // CHECK-DAG: ![[$RANGE]] = !{i32 0, i32 64} From 55f12299d890078708eb6d2e069dc117ad244d4d Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 23 Jan 2024 20:20:40 +0700 Subject: [PATCH 613/843] ValueTracking: Recognize fcmp ole/ugt with inf as a class test (#79095) These were missed; handling them hopefully avoids assertions when dc3faf0ed0e3f1ea9e435a006167d9649f865da1 is recommitted.
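
As a rough sketch of what the new mapping means for callers (this snippet is not part of the patch; the setup and names are assumed for illustration), the existing fcmpToClassTest helper from llvm/Analysis/ValueTracking.h now returns a concrete class test for these predicates instead of the old {nullptr, fcAllFlags} fallback:

  // Minimal sketch, assuming `Cmp` is an FCmpInst such as
  //   %cmp = fcmp ole float %x, 0x7FF0000000000000  ; compare with +inf
  // inside some function; all names here are illustrative only.
  #include "llvm/Analysis/ValueTracking.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  void queryInfCompare(FCmpInst *Cmp) {
    auto [Val, Class] = fcmpToClassTest(Cmp->getPredicate(), *Cmp->getFunction(),
                                        Cmp->getOperand(0), Cmp->getOperand(1));
    // After this patch, `fcmp ole x, +inf` yields Val == %x with
    // Class == ~fcNan (an "is not NaN" test), and `fcmp ugt x, +inf`
    // yields Class == fcNan, matching the unit test updates below.
    (void)Val;
    (void)Class;
  }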
--- llvm/lib/Analysis/ValueTracking.cpp | 18 ++++++++++-- .../Attributor/nofpclass-implied-by-fcmp.ll | 28 +++++++++---------- llvm/test/Transforms/InstCombine/and-fcmp.ll | 5 +--- llvm/unittests/Analysis/ValueTrackingTest.cpp | 8 +++--- 4 files changed, 35 insertions(+), 24 deletions(-) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index 34d5010320988..5d6c3465a0c36 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -4065,7 +4065,7 @@ llvm::fcmpToClassTest(FCmpInst::Predicate Pred, const Function &F, Value *LHS, case FCmpInst::FCMP_ULE: // isnan(x) || x <= 0 return {LHS, fcNegative | fcPosZero | fcNan}; default: - break; + llvm_unreachable("all compare types are handled"); } return {nullptr, fcAllFlags}; @@ -4184,8 +4184,22 @@ llvm::fcmpToClassTest(FCmpInst::Predicate Pred, const Function &F, Value *LHS, Mask = fcNone; break; } + case FCmpInst::FCMP_OLE: + case FCmpInst::FCMP_UGT: { + if (ConstRHS->isNegative()) { + Mask = IsFabs ? fcNone : fcNegInf; + break; + } + + // fcmp ole x, +inf -> fcmp ord x, x + // fcmp ole fabs(x), +inf -> fcmp ord x, x + // fcmp ole x, -inf -> fcmp oeq x, -inf + // fcmp ole fabs(x), -inf -> false + Mask = ~fcNan; + break; + } default: - return {nullptr, fcAllFlags}; + llvm_unreachable("all compare types are handled"); } } else if (ConstRHS->isSmallestNormalized() && !ConstRHS->isNegative()) { // Match pattern that's used in __builtin_isnormal. diff --git a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll index ccd01de458c3c..d34e8ad106b42 100644 --- a/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll +++ b/llvm/test/Transforms/Attributor/nofpclass-implied-by-fcmp.ll @@ -2400,8 +2400,8 @@ define float @assume_oeq_smallest_normal_known_pos(float nofpclass(ninf nsub nno ;--------------------------------------------------------------------- define float @assume_ole_pinf(float %arg) { -; CHECK-LABEL: define float @assume_ole_pinf( -; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(nan) float @assume_ole_pinf( +; CHECK-SAME: float returned nofpclass(nan) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: [[FCMP:%.*]] = fcmp ole float [[ARG]], 0x7FF0000000000000 ; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] ; CHECK-NEXT: ret float [[ARG]] @@ -2412,8 +2412,8 @@ define float @assume_ole_pinf(float %arg) { } define float @assume_ole_ninf(float %arg) { -; CHECK-LABEL: define float @assume_ole_ninf( -; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(nan pinf zero sub norm) float @assume_ole_ninf( +; CHECK-SAME: float returned nofpclass(nan pinf zero sub norm) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: [[FCMP:%.*]] = fcmp ole float [[ARG]], 0xFFF0000000000000 ; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] ; CHECK-NEXT: ret float [[ARG]] @@ -2424,8 +2424,8 @@ define float @assume_ole_ninf(float %arg) { } define float @assume_ugt_pinf(float %arg) { -; CHECK-LABEL: define float @assume_ugt_pinf( -; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(inf zero sub norm) float @assume_ugt_pinf( +; CHECK-SAME: float returned nofpclass(inf zero sub norm) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: [[FCMP:%.*]] = fcmp ugt float [[ARG]], 0x7FF0000000000000 ; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] ; CHECK-NEXT: ret float [[ARG]] @@ -2436,8 +2436,8 @@ 
define float @assume_ugt_pinf(float %arg) { } define float @assume_ugt_ninf(float %arg) { -; CHECK-LABEL: define float @assume_ugt_ninf( -; CHECK-SAME: float returned [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(ninf) float @assume_ugt_ninf( +; CHECK-SAME: float returned nofpclass(ninf) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: [[FCMP:%.*]] = fcmp ugt float [[ARG]], 0xFFF0000000000000 ; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] ; CHECK-NEXT: ret float [[ARG]] @@ -2448,8 +2448,8 @@ define float @assume_ugt_ninf(float %arg) { } define float @assume_fabs_ole_pinf(float %arg) { -; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_ole_pinf( -; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(nan ninf nzero nsub nnorm) float @assume_fabs_ole_pinf( +; CHECK-SAME: float returned nofpclass(nan ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]] ; CHECK-NEXT: [[FCMP:%.*]] = fcmp ole float [[FABS]], 0x7FF0000000000000 ; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] @@ -2462,8 +2462,8 @@ define float @assume_fabs_ole_pinf(float %arg) { } define float @assume_fabs_ole_ninf(float %arg) { -; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_ole_ninf( -; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(all) float @assume_fabs_ole_ninf( +; CHECK-SAME: float returned nofpclass(all) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: call void @llvm.assume(i1 noundef false) #[[ATTR5]] ; CHECK-NEXT: ret float [[ARG]] ; @@ -2474,8 +2474,8 @@ define float @assume_fabs_ole_ninf(float %arg) { } define float @assume_fabs_ugt_pinf(float %arg) { -; CHECK-LABEL: define nofpclass(ninf nzero nsub nnorm) float @assume_fabs_ugt_pinf( -; CHECK-SAME: float returned nofpclass(ninf nzero nsub nnorm) [[ARG:%.*]]) #[[ATTR3]] { +; CHECK-LABEL: define nofpclass(inf zero sub norm) float @assume_fabs_ugt_pinf( +; CHECK-SAME: float returned nofpclass(inf zero sub norm) [[ARG:%.*]]) #[[ATTR3]] { ; CHECK-NEXT: [[FABS:%.*]] = call float @llvm.fabs.f32(float [[ARG]]) #[[ATTR4]] ; CHECK-NEXT: [[FCMP:%.*]] = fcmp ugt float [[FABS]], 0x7FF0000000000000 ; CHECK-NEXT: call void @llvm.assume(i1 noundef [[FCMP]]) #[[ATTR5]] diff --git a/llvm/test/Transforms/InstCombine/and-fcmp.ll b/llvm/test/Transforms/InstCombine/and-fcmp.ll index 90d34171385c4..42e3f34d126d6 100644 --- a/llvm/test/Transforms/InstCombine/and-fcmp.ll +++ b/llvm/test/Transforms/InstCombine/and-fcmp.ll @@ -4820,10 +4820,7 @@ define i1 @clang_builtin_isnormal_inf_check_olt(half %x) { define i1 @clang_builtin_isnormal_inf_check_ole(half %x) { ; CHECK-LABEL: @clang_builtin_isnormal_inf_check_ole( -; CHECK-NEXT: [[FABS_X:%.*]] = call half @llvm.fabs.f16(half [[X:%.*]]) -; CHECK-NEXT: [[ORD:%.*]] = fcmp ord half [[X]], 0xH0000 -; CHECK-NEXT: [[CMP:%.*]] = fcmp ole half [[FABS_X]], 0xH7C00 -; CHECK-NEXT: [[AND:%.*]] = and i1 [[ORD]], [[CMP]] +; CHECK-NEXT: [[AND:%.*]] = fcmp ord half [[X:%.*]], 0xH0000 ; CHECK-NEXT: ret i1 [[AND]] ; %fabs.x = call half @llvm.fabs.f16(half %x) diff --git a/llvm/unittests/Analysis/ValueTrackingTest.cpp b/llvm/unittests/Analysis/ValueTrackingTest.cpp index 27f631884072b..8104a32909eac 100644 --- a/llvm/unittests/Analysis/ValueTrackingTest.cpp +++ b/llvm/unittests/Analysis/ValueTrackingTest.cpp @@ -1945,14 +1945,14 @@ TEST_F(ComputeKnownFPClassTest, 
FCmpToClassTest_PInf) { auto [OleVal, OleClass] = fcmpToClassTest(CmpInst::FCMP_OLE, *A3->getFunction(), A3->getOperand(0), A3->getOperand(1)); - EXPECT_EQ(nullptr, OleVal); - EXPECT_EQ(fcAllFlags, OleClass); + EXPECT_EQ(A->getOperand(0), OleVal); + EXPECT_EQ(~fcNan, OleClass); auto [UgtVal, UgtClass] = fcmpToClassTest(CmpInst::FCMP_UGT, *A4->getFunction(), A4->getOperand(0), A4->getOperand(1)); - EXPECT_EQ(nullptr, UgtVal); - EXPECT_EQ(fcAllFlags, UgtClass); + EXPECT_EQ(A4->getOperand(0), UgtVal); + EXPECT_EQ(fcNan, UgtClass); } TEST_F(ComputeKnownFPClassTest, SqrtNszSignBit) { From 10f3296dd7d74c975f208a8569221dc8f96d1db1 Mon Sep 17 00:00:00 2001 From: Alexandre Ganea <37383324+aganea@users.noreply.github.com> Date: Tue, 23 Jan 2024 08:38:18 -0500 Subject: [PATCH 614/843] [openmp] Fix warnings when building on Windows with latest MSVC or Clang ToT (#77853) There were quite a few compilation warnings when building openmp on Windows with the latest Visual Studio 2022 version 17.8.4. Some other warnings were visible with the latest Clang at tip. This commit fixes all of them. --- openmp/cmake/HandleOpenMPOptions.cmake | 9 ++++++++ openmp/runtime/src/kmp_affinity.cpp | 27 ++++++++++++++++++------ openmp/runtime/src/kmp_barrier.cpp | 8 +++---- openmp/runtime/src/kmp_io.cpp | 19 ----------------- openmp/runtime/src/kmp_os.h | 2 ++ openmp/runtime/src/kmp_settings.cpp | 21 +++++++++++++++++- openmp/runtime/src/kmp_wait_release.h | 5 +++-- openmp/runtime/src/z_Windows_NT_util.cpp | 3 ++- 8 files changed, 60 insertions(+), 34 deletions(-) diff --git a/openmp/cmake/HandleOpenMPOptions.cmake b/openmp/cmake/HandleOpenMPOptions.cmake index b0fd0b7de4bf7..201aeabbd3df9 100644 --- a/openmp/cmake/HandleOpenMPOptions.cmake +++ b/openmp/cmake/HandleOpenMPOptions.cmake @@ -41,3 +41,12 @@ append_if(OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_FUNCTION_SECTIONS "-ffunction-section" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_DATA_SECTIONS "-fdata-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + +if (MSVC) + # Disable "warning C4201: nonstandard extension used: nameless struct/union" + append("-wd4201" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + + # Disable "warning C4190: '__kmpc_atomic_cmplx8_rd' has C-linkage specified, but returns + # UDT '__kmp_cmplx64_t' which is incompatible with C" + append("-wd4190" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +endif() diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index 7009730a49ba7..d9e53bf946aff 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -28,6 +28,8 @@ #endif #include +#include "llvm/Support/Compiler.h" + // The machine topology kmp_topology_t *__kmp_topology = nullptr; // KMP_HW_SUBSET environment variable @@ -127,8 +129,12 @@ const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) { return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread)); case KMP_HW_PROC_GROUP: return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup)); + case KMP_HW_UNKNOWN: + case KMP_HW_LAST: + return KMP_I18N_STR(Unknown); } - return KMP_I18N_STR(Unknown); + KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration"); + LLVM_BUILTIN_UNREACHABLE; } const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { @@ -157,13 +163,18 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { return ((plural) ?
"threads" : "thread"); case KMP_HW_PROC_GROUP: return ((plural) ? "proc_groups" : "proc_group"); + case KMP_HW_UNKNOWN: + case KMP_HW_LAST: + return ((plural) ? "unknowns" : "unknown"); } - return ((plural) ? "unknowns" : "unknown"); + KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration"); + LLVM_BUILTIN_UNREACHABLE; } const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { switch (type) { case KMP_HW_CORE_TYPE_UNKNOWN: + case KMP_HW_MAX_NUM_CORE_TYPES: return "unknown"; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 case KMP_HW_CORE_TYPE_ATOM: @@ -172,7 +183,8 @@ const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { return "Intel(R) Core(TM) processor"; #endif } - return "unknown"; + KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration"); + LLVM_BUILTIN_UNREACHABLE; } #if KMP_AFFINITY_SUPPORTED @@ -1238,17 +1250,18 @@ bool kmp_topology_t::filter_hw_subset() { struct core_type_indexer { int operator()(const kmp_hw_thread_t &t) const { switch (t.attrs.get_core_type()) { + case KMP_HW_CORE_TYPE_UNKNOWN: + case KMP_HW_MAX_NUM_CORE_TYPES: + return 0; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 case KMP_HW_CORE_TYPE_ATOM: return 1; case KMP_HW_CORE_TYPE_CORE: return 2; #endif - case KMP_HW_CORE_TYPE_UNKNOWN: - return 0; } - KMP_ASSERT(0); - return 0; + KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration"); + LLVM_BUILTIN_UNREACHABLE; } }; struct core_eff_indexer { diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index 281b8e9c2883d..e9ab15f1723b6 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -2403,11 +2403,11 @@ void __kmp_fork_barrier(int gtid, int tid) { #if USE_ITT_BUILD void *itt_sync_obj = NULL; #endif /* USE_ITT_BUILD */ +#ifdef KMP_DEBUG if (team) - - KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, - (team != NULL) ? team->t.t_id : -1, tid)); - + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, + (team != NULL) ? team->t.t_id : -1, tid)); +#endif // th_team pointer only valid for primary thread here if (KMP_MASTER_TID(tid)) { #if USE_ITT_BUILD && USE_ITT_NOTIFY diff --git a/openmp/runtime/src/kmp_io.cpp b/openmp/runtime/src/kmp_io.cpp index 578e6e671cdf2..0c52662bc2357 100644 --- a/openmp/runtime/src/kmp_io.cpp +++ b/openmp/runtime/src/kmp_io.cpp @@ -50,24 +50,6 @@ static HANDLE __kmp_stderr = NULL; static int __kmp_console_exists = FALSE; static kmp_str_buf_t __kmp_console_buf; -static int is_console(void) { - char buffer[128]; - DWORD rc = 0; - DWORD err = 0; - // Try to get console title. - SetLastError(0); - // GetConsoleTitle does not reset last error in case of success or short - // buffer, so we need to clear it explicitly. - rc = GetConsoleTitle(buffer, sizeof(buffer)); - if (rc == 0) { - // rc == 0 means getting console title failed. Let us find out why. - err = GetLastError(); - // err == 0 means buffer too short (we suppose console exists). - // In Window applications we usually have err == 6 (invalid handle). 
- } - return rc > 0 || err == 0; -} - void __kmp_close_console(void) { /* wait until user presses return before closing window */ /* TODO only close if a window was opened */ @@ -84,7 +66,6 @@ void __kmp_close_console(void) { static void __kmp_redirect_output(void) { __kmp_acquire_bootstrap_lock(&__kmp_console_lock); - (void)is_console; if (!__kmp_console_exists) { HANDLE ho; HANDLE he; diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index 6862fd89b6302..025304caf8724 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -306,6 +306,8 @@ template <> struct traits_t { !KMP_MIC) #if KMP_OS_WINDOWS +// Don't include everything related to NT status code, we'll do that explicitly +#define WIN32_NO_STATUS #include static inline int KMP_GET_PAGE_SIZE(void) { diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index 30a4c05fe76b3..a9bbfdaf841bf 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -29,6 +29,8 @@ #include "ompd-specific.h" #endif +#include "llvm/Support/Compiler.h" + static int __kmp_env_toPrint(char const *name, int flag); bool __kmp_env_format = 0; // 0 - old format; 1 - new format @@ -873,6 +875,10 @@ static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name, case library_throughput: { value = "PASSIVE"; } break; + case library_none: + case library_serial: { + value = NULL; + } break; } } else { switch (__kmp_library) { @@ -885,6 +891,9 @@ static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name, case library_throughput: { value = "throughput"; } break; + case library_none: { + value = NULL; + } break; } } if (value != NULL) { @@ -2004,6 +2013,7 @@ static inline const char * __kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { switch (type) { case KMP_HW_CORE_TYPE_UNKNOWN: + case KMP_HW_MAX_NUM_CORE_TYPES: return "unknown"; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 case KMP_HW_CORE_TYPE_ATOM: @@ -2012,7 +2022,8 @@ __kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { return "intel_core"; #endif } - return "unknown"; + KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration"); + LLVM_BUILTIN_UNREACHABLE; } #if KMP_AFFINITY_SUPPORTED @@ -4428,6 +4439,10 @@ static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer, case kmp_sch_auto: __kmp_str_buf_print(buffer, "%s,%d'\n", "auto", __kmp_chunk); break; + default: + KMP_ASSERT2(false, "Unhandled sched_type enumeration"); + LLVM_BUILTIN_UNREACHABLE; + break; } } else { switch (sched) { @@ -4453,6 +4468,10 @@ static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer, case kmp_sch_auto: __kmp_str_buf_print(buffer, "%s'\n", "auto"); break; + default: + KMP_ASSERT2(false, "Unhandled sched_type enumeration"); + LLVM_BUILTIN_UNREACHABLE; + break; } } } // __kmp_stg_print_omp_schedule diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h index 3fcae5687d124..c1a4c778b4b41 100644 --- a/openmp/runtime/src/kmp_wait_release.h +++ b/openmp/runtime/src/kmp_wait_release.h @@ -20,6 +20,8 @@ #include "ompt-specific.h" #endif +#include "llvm/Support/Compiler.h" + /*! 
@defgroup WAIT_RELEASE Wait/Release operations @@ -1038,7 +1040,6 @@ static inline void __kmp_null_resume_wrapper(kmp_info_t *thr) { case flag_oncore: __kmp_resume_oncore(gtid, RCAST(kmp_flag_oncore *, flag)); break; -#ifdef KMP_DEBUG case flag_unset: KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d is unset\n", type)); break; @@ -1046,7 +1047,7 @@ static inline void __kmp_null_resume_wrapper(kmp_info_t *thr) { KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d does not match any " "known flag type\n", type)); -#endif + LLVM_BUILTIN_UNREACHABLE; } } diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp index 9e264ab45b87f..d75b48b2c1bcf 100644 --- a/openmp/runtime/src/z_Windows_NT_util.cpp +++ b/openmp/runtime/src/z_Windows_NT_util.cpp @@ -22,6 +22,7 @@ number of running threads in the system. */ #include // UNICODE_STRING +#undef WIN32_NO_STATUS #include #include #ifdef _MSC_VER @@ -1635,7 +1636,7 @@ int __kmp_get_load_balance(int max) { // threads on all cores. So, we don't consider the running threads of this // process. if (pid != 0) { - for (int i = 0; i < num; ++i) { + for (ULONG i = 0; i < num; ++i) { THREAD_STATE state = spi->Threads[i].State; // Count threads that have Ready or Running state. // !!! TODO: Why comment does not match the code??? From 16df714e77e8d31619445e5f6d87d89da962eec4 Mon Sep 17 00:00:00 2001 From: Danial Klimkin Date: Tue, 23 Jan 2024 14:48:33 +0100 Subject: [PATCH 615/843] [test] Update stack_guard_remat.ll (#79139) Replace cp with cat. This allows creating a writable file when the original one is read-only. --- llvm/test/CodeGen/Thumb/stack_guard_remat.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll index b9c993332a698..cc142392981d6 100644 --- a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll +++ b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll @@ -1,4 +1,4 @@ -; RUN: cp %s %t.pic.ll +; RUN: cat %s > %t.pic.ll ; RUN: echo -e '!llvm.module.flags = !{!0}\n!0 = !{i32 7, !"PIC Level", i32 2}' >> %t.pic.ll ; RUN: llc < %t.pic.ll -mtriple=thumb-apple-darwin -relocation-model=pic -no-integrated-as | FileCheck %s -check-prefix=PIC ; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=static -no-integrated-as | FileCheck %s -check-prefix=NO-PIC -check-prefix=STATIC From 94f960925b7f609636fc2ffd83053814d5e45ed1 Mon Sep 17 00:00:00 2001 From: Alexandre Ganea Date: Tue, 23 Jan 2024 08:51:04 -0500 Subject: [PATCH 616/843] Revert 10f3296dd7d74c975f208a8569221dc8f96d1db1 - [openmp] Fix warnings when building on Windows with latest MSVC or Clang ToT (#77853) It broke the AMDGPU buildbot: https://lab.llvm.org/buildbot/#/builders/193/builds/45378 --- openmp/cmake/HandleOpenMPOptions.cmake | 9 -------- openmp/runtime/src/kmp_affinity.cpp | 27 ++++++------------------ openmp/runtime/src/kmp_barrier.cpp | 8 +++---- openmp/runtime/src/kmp_io.cpp | 19 +++++++++++++++++ openmp/runtime/src/kmp_os.h | 2 -- openmp/runtime/src/kmp_settings.cpp | 21 +----------------- openmp/runtime/src/kmp_wait_release.h | 5 ++--- openmp/runtime/src/z_Windows_NT_util.cpp | 3 +-- 8 files changed, 34 insertions(+), 60 deletions(-) diff --git a/openmp/cmake/HandleOpenMPOptions.cmake b/openmp/cmake/HandleOpenMPOptions.cmake index 201aeabbd3df9..b0fd0b7de4bf7 100644 --- a/openmp/cmake/HandleOpenMPOptions.cmake +++ b/openmp/cmake/HandleOpenMPOptions.cmake @@ -41,12 +41,3 @@ append_if(OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG
"-Wno-maybe-uninitialized" CMAKE append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_FUNCTION_SECTIONS "-ffunction-section" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_DATA_SECTIONS "-fdata-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) - -if (MSVC) - # Disable "warning C4201: nonstandard extension used: nameless struct/union" - append("-wd4201" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) - - # Disable "warning C4190: '__kmpc_atomic_cmplx8_rd' has C-linkage specified, but returns - # UDT '__kmp_cmplx64_t' which is incompatible with C" - append("-wd4190" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) -endif() diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index d9e53bf946aff..7009730a49ba7 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -28,8 +28,6 @@ #endif #include -#include "llvm/Support/Compiler.h" - // The machine topology kmp_topology_t *__kmp_topology = nullptr; // KMP_HW_SUBSET environment variable @@ -129,12 +127,8 @@ const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) { return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread)); case KMP_HW_PROC_GROUP: return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup)); - case KMP_HW_UNKNOWN: - case KMP_HW_LAST: - return KMP_I18N_STR(Unknown); } - KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration"); - LLVM_BUILTIN_UNREACHABLE; + return KMP_I18N_STR(Unknown); } const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { @@ -163,18 +157,13 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { return ((plural) ? "threads" : "thread"); case KMP_HW_PROC_GROUP: return ((plural) ? "proc_groups" : "proc_group"); - case KMP_HW_UNKNOWN: - case KMP_HW_LAST: - return ((plural) ? "unknowns" : "unknown"); } - KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration"); - LLVM_BUILTIN_UNREACHABLE; + return ((plural) ? "unknowns" : "unknown"); } const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { switch (type) { case KMP_HW_CORE_TYPE_UNKNOWN: - case KMP_HW_MAX_NUM_CORE_TYPES: return "unknown"; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 case KMP_HW_CORE_TYPE_ATOM: @@ -183,8 +172,7 @@ const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { return "Intel(R) Core(TM) processor"; #endif } - KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration"); - LLVM_BUILTIN_UNREACHABLE; + return "unknown"; } #if KMP_AFFINITY_SUPPORTED @@ -1250,18 +1238,17 @@ bool kmp_topology_t::filter_hw_subset() { struct core_type_indexer { int operator()(const kmp_hw_thread_t &t) const { switch (t.attrs.get_core_type()) { - case KMP_HW_CORE_TYPE_UNKNOWN: - case KMP_HW_MAX_NUM_CORE_TYPES: - return 0; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 case KMP_HW_CORE_TYPE_ATOM: return 1; case KMP_HW_CORE_TYPE_CORE: return 2; #endif + case KMP_HW_CORE_TYPE_UNKNOWN: + return 0; } - KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration"); - LLVM_BUILTIN_UNREACHABLE; + KMP_ASSERT(0); + return 0; } }; struct core_eff_indexer { diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index e9ab15f1723b6..281b8e9c2883d 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -2403,11 +2403,11 @@ void __kmp_fork_barrier(int gtid, int tid) { #if USE_ITT_BUILD void *itt_sync_obj = NULL; #endif /* USE_ITT_BUILD */ -#ifdef KMP_DEBUG if (team) - KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, - (team != NULL) ? 
team->t.t_id : -1, tid)); -#endif + + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, + (team != NULL) ? team->t.t_id : -1, tid)); + // th_team pointer only valid for primary thread here if (KMP_MASTER_TID(tid)) { #if USE_ITT_BUILD && USE_ITT_NOTIFY diff --git a/openmp/runtime/src/kmp_io.cpp b/openmp/runtime/src/kmp_io.cpp index 0c52662bc2357..578e6e671cdf2 100644 --- a/openmp/runtime/src/kmp_io.cpp +++ b/openmp/runtime/src/kmp_io.cpp @@ -50,6 +50,24 @@ static HANDLE __kmp_stderr = NULL; static int __kmp_console_exists = FALSE; static kmp_str_buf_t __kmp_console_buf; +static int is_console(void) { + char buffer[128]; + DWORD rc = 0; + DWORD err = 0; + // Try to get console title. + SetLastError(0); + // GetConsoleTitle does not reset last error in case of success or short + // buffer, so we need to clear it explicitly. + rc = GetConsoleTitle(buffer, sizeof(buffer)); + if (rc == 0) { + // rc == 0 means getting console title failed. Let us find out why. + err = GetLastError(); + // err == 0 means buffer too short (we suppose console exists). + // In Window applications we usually have err == 6 (invalid handle). + } + return rc > 0 || err == 0; +} + void __kmp_close_console(void) { /* wait until user presses return before closing window */ /* TODO only close if a window was opened */ @@ -66,6 +84,7 @@ void __kmp_close_console(void) { static void __kmp_redirect_output(void) { __kmp_acquire_bootstrap_lock(&__kmp_console_lock); + (void)is_console; if (!__kmp_console_exists) { HANDLE ho; HANDLE he; diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index 025304caf8724..6862fd89b6302 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -306,8 +306,6 @@ template <> struct traits_t { !KMP_MIC) #if KMP_OS_WINDOWS -// Don't include everything related to NT status code, we'll do that explicitly -#define WIN32_NO_STATUS #include static inline int KMP_GET_PAGE_SIZE(void) { diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index a9bbfdaf841bf..30a4c05fe76b3 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -29,8 +29,6 @@ #include "ompd-specific.h" #endif -#include "llvm/Support/Compiler.h" - static int __kmp_env_toPrint(char const *name, int flag); bool __kmp_env_format = 0; // 0 - old format; 1 - new format @@ -875,10 +873,6 @@ static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name, case library_throughput: { value = "PASSIVE"; } break; - case library_none: - case library_serial: { - value = NULL; - } break; } } else { switch (__kmp_library) { @@ -891,9 +885,6 @@ static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name, case library_throughput: { value = "throughput"; } break; - case library_none: { - value = NULL; - } break; } } if (value != NULL) { @@ -2013,7 +2004,6 @@ static inline const char * __kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { switch (type) { case KMP_HW_CORE_TYPE_UNKNOWN: - case KMP_HW_MAX_NUM_CORE_TYPES: return "unknown"; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 case KMP_HW_CORE_TYPE_ATOM: @@ -2022,8 +2012,7 @@ __kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { return "intel_core"; #endif } - KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration"); - LLVM_BUILTIN_UNREACHABLE; + return "unknown"; } #if KMP_AFFINITY_SUPPORTED @@ -4439,10 +4428,6 @@ static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer, case kmp_sch_auto: __kmp_str_buf_print(buffer, "%s,%d'\n", 
"auto", __kmp_chunk); break; - default: - KMP_ASSERT2(false, "Unhandled sched_type enumeration"); - LLVM_BUILTIN_UNREACHABLE; - break; } } else { switch (sched) { @@ -4468,10 +4453,6 @@ static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer, case kmp_sch_auto: __kmp_str_buf_print(buffer, "%s'\n", "auto"); break; - default: - KMP_ASSERT2(false, "Unhandled sched_type enumeration"); - LLVM_BUILTIN_UNREACHABLE; - break; } } } // __kmp_stg_print_omp_schedule diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h index c1a4c778b4b41..3fcae5687d124 100644 --- a/openmp/runtime/src/kmp_wait_release.h +++ b/openmp/runtime/src/kmp_wait_release.h @@ -20,8 +20,6 @@ #include "ompt-specific.h" #endif -#include "llvm/Support/Compiler.h" - /*! @defgroup WAIT_RELEASE Wait/Release operations @@ -1040,6 +1038,7 @@ static inline void __kmp_null_resume_wrapper(kmp_info_t *thr) { case flag_oncore: __kmp_resume_oncore(gtid, RCAST(kmp_flag_oncore *, flag)); break; +#ifdef KMP_DEBUG case flag_unset: KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d is unset\n", type)); break; @@ -1047,7 +1046,7 @@ static inline void __kmp_null_resume_wrapper(kmp_info_t *thr) { KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d does not match any " "known flag type\n", type)); - LLVM_BUILTIN_UNREACHABLE; +#endif } } diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp index d75b48b2c1bcf..9e264ab45b87f 100644 --- a/openmp/runtime/src/z_Windows_NT_util.cpp +++ b/openmp/runtime/src/z_Windows_NT_util.cpp @@ -22,7 +22,6 @@ number of running threads in the system. */ #include // UNICODE_STRING -#undef WIN32_NO_STATUS #include #include #ifdef _MSC_VER @@ -1636,7 +1635,7 @@ int __kmp_get_load_balance(int max) { // threads on all cores. So, we don't consider the running threads of this // process. if (pid != 0) { - for (ULONG i = 0; i < num; ++i) { + for (int i = 0; i < num; ++i) { THREAD_STATE state = spi->Threads[i].State; // Count threads that have Ready or Running state. // !!! TODO: Why comment does not match the code??? From e96242b4e4306ba690f9e2e8f12758da8f63805e Mon Sep 17 00:00:00 2001 From: Paul T Robinson Date: Tue, 23 Jan 2024 05:52:30 -0800 Subject: [PATCH 617/843] [Headers][X86] Add macro descriptions to bmiintrin.h (#79048) These are largely copy-pasted from the corresponding function descriptions. Added \see cross-references. Also changed tags to \c. --- clang/lib/Headers/bmiintrin.h | 273 ++++++++++++++++++++++++++++------ 1 file changed, 229 insertions(+), 44 deletions(-) diff --git a/clang/lib/Headers/bmiintrin.h b/clang/lib/Headers/bmiintrin.h index bc7c8a03c5e2a..d8e57c0cb4940 100644 --- a/clang/lib/Headers/bmiintrin.h +++ b/clang/lib/Headers/bmiintrin.h @@ -19,18 +19,17 @@ to use it as a potentially faster version of BSF. */ #define __RELAXED_FN_ATTRS __attribute__((__always_inline__, __nodebug__)) -#define _tzcnt_u16 __tzcnt_u16 - /// Counts the number of trailing zero bits in the operand. /// /// \headerfile /// -/// This intrinsic corresponds to the TZCNT instruction. +/// This intrinsic corresponds to the \c TZCNT instruction. /// /// \param __X /// An unsigned 16-bit integer whose trailing zeros are to be counted. /// \returns An unsigned 16-bit integer containing the number of trailing zero /// bits in the operand. 
+/// \see _tzcnt_u16 static __inline__ unsigned short __RELAXED_FN_ATTRS __tzcnt_u16(unsigned short __X) { @@ -41,13 +40,30 @@ __tzcnt_u16(unsigned short __X) /// /// \headerfile /// -/// This intrinsic corresponds to the TZCNT instruction. +/// \code +/// unsigned short _tzcnt_u16(unsigned short __X); +/// \endcode +/// +/// This intrinsic corresponds to the \c TZCNT instruction. +/// +/// \param __X +/// An unsigned 16-bit integer whose trailing zeros are to be counted. +/// \returns An unsigned 16-bit integer containing the number of trailing zero +/// bits in the operand. +/// \see __tzcnt_u16 +#define _tzcnt_u16 __tzcnt_u16 + +/// Counts the number of trailing zero bits in the operand. +/// +/// \headerfile +/// +/// This intrinsic corresponds to the \c TZCNT instruction. /// /// \param __X /// An unsigned 32-bit integer whose trailing zeros are to be counted. /// \returns An unsigned 32-bit integer containing the number of trailing zero /// bits in the operand. -/// \see _mm_tzcnt_32 +/// \see { _mm_tzcnt_32 _tzcnt_u32 } static __inline__ unsigned int __RELAXED_FN_ATTRS __tzcnt_u32(unsigned int __X) { @@ -58,19 +74,34 @@ __tzcnt_u32(unsigned int __X) /// /// \headerfile /// -/// This intrinsic corresponds to the TZCNT instruction. +/// This intrinsic corresponds to the \c TZCNT instruction. /// /// \param __X /// An unsigned 32-bit integer whose trailing zeros are to be counted. -/// \returns An 32-bit integer containing the number of trailing zero bits in +/// \returns A 32-bit integer containing the number of trailing zero bits in /// the operand. -/// \see __tzcnt_u32 +/// \see { __tzcnt_u32 _tzcnt_u32 } static __inline__ int __RELAXED_FN_ATTRS _mm_tzcnt_32(unsigned int __X) { return (int)__builtin_ia32_tzcnt_u32(__X); } +/// Counts the number of trailing zero bits in the operand. +/// +/// \headerfile +/// +/// \code +/// unsigned int _tzcnt_u32(unsigned int __X); +/// \endcode +/// +/// This intrinsic corresponds to the \c TZCNT instruction. +/// +/// \param __X +/// An unsigned 32-bit integer whose trailing zeros are to be counted. +/// \returns An unsigned 32-bit integer containing the number of trailing zero +/// bits in the operand. +/// \see { _mm_tzcnt_32 __tzcnt_u32 } #define _tzcnt_u32 __tzcnt_u32 #ifdef __x86_64__ @@ -79,13 +110,13 @@ _mm_tzcnt_32(unsigned int __X) /// /// \headerfile /// -/// This intrinsic corresponds to the TZCNT instruction. +/// This intrinsic corresponds to the \c TZCNT instruction. /// /// \param __X /// An unsigned 64-bit integer whose trailing zeros are to be counted. /// \returns An unsigned 64-bit integer containing the number of trailing zero /// bits in the operand. -/// \see _mm_tzcnt_64 +/// \see { _mm_tzcnt_64 _tzcnt_u64 } static __inline__ unsigned long long __RELAXED_FN_ATTRS __tzcnt_u64(unsigned long long __X) { @@ -96,19 +127,34 @@ __tzcnt_u64(unsigned long long __X) /// /// \headerfile /// -/// This intrinsic corresponds to the TZCNT instruction. +/// This intrinsic corresponds to the \c TZCNT instruction. /// /// \param __X /// An unsigned 64-bit integer whose trailing zeros are to be counted. /// \returns An 64-bit integer containing the number of trailing zero bits in /// the operand. -/// \see __tzcnt_u64 +/// \see { __tzcnt_u64 _tzcnt_u64 } static __inline__ long long __RELAXED_FN_ATTRS _mm_tzcnt_64(unsigned long long __X) { return (long long)__builtin_ia32_tzcnt_u64(__X); } +/// Counts the number of trailing zero bits in the operand. 
+/// +/// \headerfile +/// +/// \code +/// unsigned long long _tzcnt_u64(unsigned long long __X); +/// \endcode +/// +/// This intrinsic corresponds to the \c TZCNT instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose trailing zeros are to be counted. +/// \returns An unsigned 64-bit integer containing the number of trailing zero +/// bits in the operand. +/// \see { _mm_tzcnt_64 __tzcnt_u64 } #define _tzcnt_u64 __tzcnt_u64 #endif /* __x86_64__ */ @@ -121,21 +167,12 @@ _mm_tzcnt_64(unsigned long long __X) /* Define the default attributes for the functions in this file. */ #define __DEFAULT_FN_ATTRS __attribute__((__always_inline__, __nodebug__, __target__("bmi"))) -#define _andn_u32 __andn_u32 - -/* _bextr_u32 != __bextr_u32 */ -#define _blsi_u32 __blsi_u32 - -#define _blsmsk_u32 __blsmsk_u32 - -#define _blsr_u32 __blsr_u32 - /// Performs a bitwise AND of the second operand with the one's /// complement of the first operand. /// /// \headerfile /// -/// This intrinsic corresponds to the ANDN instruction. +/// This intrinsic corresponds to the \c ANDN instruction. /// /// \param __X /// An unsigned integer containing one of the operands. @@ -143,19 +180,40 @@ _mm_tzcnt_64(unsigned long long __X) /// An unsigned integer containing one of the operands. /// \returns An unsigned integer containing the bitwise AND of the second /// operand with the one's complement of the first operand. +/// \see _andn_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS __andn_u32(unsigned int __X, unsigned int __Y) { return ~__X & __Y; } +/// Performs a bitwise AND of the second operand with the one's +/// complement of the first operand. +/// +/// \headerfile +/// +/// \code +/// unsigned int _andn_u32(unsigned int __X, unsigned int __Y); +/// \endcode +/// +/// This intrinsic corresponds to the \c ANDN instruction. +/// +/// \param __X +/// An unsigned integer containing one of the operands. +/// \param __Y +/// An unsigned integer containing one of the operands. +/// \returns An unsigned integer containing the bitwise AND of the second +/// operand with the one's complement of the first operand. +/// \see __andn_u32 +#define _andn_u32 __andn_u32 + /* AMD-specified, double-leading-underscore version of BEXTR */ /// Extracts the specified bits from the first operand and returns them /// in the least significant bits of the result. /// /// \headerfile /// -/// This intrinsic corresponds to the BEXTR instruction. +/// This intrinsic corresponds to the \c BEXTR instruction. /// /// \param __X /// An unsigned integer whose bits are to be extracted. @@ -178,7 +236,7 @@ __bextr_u32(unsigned int __X, unsigned int __Y) /// /// \headerfile /// -/// This intrinsic corresponds to the BEXTR instruction. +/// This intrinsic corresponds to the \c BEXTR instruction. /// /// \param __X /// An unsigned integer whose bits are to be extracted. @@ -203,7 +261,7 @@ _bextr_u32(unsigned int __X, unsigned int __Y, unsigned int __Z) /// /// \headerfile /// -/// This intrinsic corresponds to the BEXTR instruction. +/// This intrinsic corresponds to the \c BEXTR instruction. /// /// \param __X /// An unsigned integer whose bits are to be extracted. @@ -224,69 +282,117 @@ _bextr2_u32(unsigned int __X, unsigned int __Y) { /// /// \headerfile /// -/// This intrinsic corresponds to the BLSI instruction. +/// This intrinsic corresponds to the \c BLSI instruction. /// /// \param __X /// An unsigned integer whose bits are to be cleared.
/// \returns An unsigned integer containing the result of clearing the bits from /// the source operand. +/// \see _blsi_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS __blsi_u32(unsigned int __X) { return __X & -__X; } +/// Clears all bits in the source except for the least significant bit +/// containing a value of 1 and returns the result. +/// +/// \headerfile +/// +/// \code +/// unsigned int _blsi_u32(unsigned int __X); +/// \endcode +/// +/// This intrinsic corresponds to the \c BLSI instruction. +/// +/// \param __X +/// An unsigned integer whose bits are to be cleared. +/// \returns An unsigned integer containing the result of clearing the bits from +/// the source operand. +/// \see __blsi_u32 +#define _blsi_u32 __blsi_u32 + /// Creates a mask whose bits are set to 1, using bit 0 up to and /// including the least significant bit that is set to 1 in the source /// operand and returns the result. /// /// \headerfile /// -/// This intrinsic corresponds to the BLSMSK instruction. +/// This intrinsic corresponds to the \c BLSMSK instruction. /// /// \param __X /// An unsigned integer used to create the mask. /// \returns An unsigned integer containing the newly created mask. +/// \see _blsmsk_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS __blsmsk_u32(unsigned int __X) { return __X ^ (__X - 1); } +/// Creates a mask whose bits are set to 1, using bit 0 up to and +/// including the least significant bit that is set to 1 in the source +/// operand and returns the result. +/// +/// \headerfile +/// +/// \code +/// unsigned int _blsmsk_u32(unsigned int __X); +/// \endcode +/// +/// This intrinsic corresponds to the \c BLSMSK instruction. +/// +/// \param __X +/// An unsigned integer used to create the mask. +/// \returns An unsigned integer containing the newly created mask. +/// \see __blsmsk_u32 +#define _blsmsk_u32 __blsmsk_u32 + /// Clears the least significant bit that is set to 1 in the source /// operand and returns the result. /// /// \headerfile /// -/// This intrinsic corresponds to the BLSR instruction. +/// This intrinsic corresponds to the \c BLSR instruction. /// /// \param __X /// An unsigned integer containing the operand to be cleared. /// \returns An unsigned integer containing the result of clearing the source /// operand. +/// \see _blsr_u32 static __inline__ unsigned int __DEFAULT_FN_ATTRS __blsr_u32(unsigned int __X) { return __X & (__X - 1); } -#ifdef __x86_64__ - -#define _andn_u64 __andn_u64 - -/* _bextr_u64 != __bextr_u64 */ -#define _blsi_u64 __blsi_u64 - -#define _blsmsk_u64 __blsmsk_u64 +/// Clears the least significant bit that is set to 1 in the source +/// operand and returns the result. +/// +/// \headerfile +/// +/// \code +/// unsigned int _blsr_u32(unsigned int __X); +/// \endcode +/// +/// This intrinsic corresponds to the \c BLSR instruction. +/// +/// \param __X +/// An unsigned integer containing the operand to be cleared. +/// \returns An unsigned integer containing the result of clearing the source +/// operand. +/// \see __blsr_u32 +#define _blsr_u32 __blsr_u32 -#define _blsr_u64 __blsr_u64 +#ifdef __x86_64__ /// Performs a bitwise AND of the second operand with the one's /// complement of the first operand. /// /// \headerfile /// -/// This intrinsic corresponds to the ANDN instruction. +/// This intrinsic corresponds to the \c ANDN instruction. /// /// \param __X /// An unsigned 64-bit integer containing one of the operands.
@@ -294,19 +400,41 @@ __blsr_u32(unsigned int __X) /// An unsigned 64-bit integer containing one of the operands. /// \returns An unsigned 64-bit integer containing the bitwise AND of the second /// operand with the one's complement of the first operand. +/// \see _andn_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS __andn_u64 (unsigned long long __X, unsigned long long __Y) { return ~__X & __Y; } +/// Performs a bitwise AND of the second operand with the one's +/// complement of the first operand. +/// +/// \headerfile +/// +/// \code +/// unsigned long long _andn_u64(unsigned long long __X, +/// unsigned long long __Y); +/// \endcode +/// +/// This intrinsic corresponds to the \c ANDN instruction. +/// +/// \param __X +/// An unsigned 64-bit integer containing one of the operands. +/// \param __Y +/// An unsigned 64-bit integer containing one of the operands. +/// \returns An unsigned 64-bit integer containing the bitwise AND of the second +/// operand with the one's complement of the first operand. +/// \see __andn_u64 +#define _andn_u64 __andn_u64 + /* AMD-specified, double-leading-underscore version of BEXTR */ /// Extracts the specified bits from the first operand and returns them /// in the least significant bits of the result. /// /// \headerfile /// -/// This intrinsic corresponds to the BEXTR instruction. +/// This intrinsic corresponds to the \c BEXTR instruction. /// /// \param __X /// An unsigned 64-bit integer whose bits are to be extracted. @@ -329,7 +457,7 @@ __bextr_u64(unsigned long long __X, unsigned long long __Y) /// /// \headerfile /// -/// This intrinsic corresponds to the BEXTR instruction. +/// This intrinsic corresponds to the \c BEXTR instruction. /// /// \param __X /// An unsigned 64-bit integer whose bits are to be extracted. @@ -354,7 +482,7 @@ _bextr_u64(unsigned long long __X, unsigned int __Y, unsigned int __Z) /// /// \headerfile /// -/// This intrinsic corresponds to the BEXTR instruction. +/// This intrinsic corresponds to the \c BEXTR instruction. /// /// \param __X /// An unsigned 64-bit integer whose bits are to be extracted. @@ -375,52 +503,109 @@ _bextr2_u64(unsigned long long __X, unsigned long long __Y) { /// /// \headerfile /// -/// This intrinsic corresponds to the BLSI instruction. +/// This intrinsic corresponds to the \c BLSI instruction. /// /// \param __X /// An unsigned 64-bit integer whose bits are to be cleared. /// \returns An unsigned 64-bit integer containing the result of clearing the /// bits from the source operand. +/// \see _blsi_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS __blsi_u64(unsigned long long __X) { return __X & -__X; } +/// Clears all bits in the source except for the least significant bit +/// containing a value of 1 and returns the result. +/// +/// \headerfile +/// +/// \code +/// unsigned long long _blsi_u64(unsigned long long __X); +/// \endcode +/// +/// This intrinsic corresponds to the \c BLSI instruction. +/// +/// \param __X +/// An unsigned 64-bit integer whose bits are to be cleared. +/// \returns An unsigned 64-bit integer containing the result of clearing the +/// bits from the source operand. +/// \see __blsi_u64 +#define _blsi_u64 __blsi_u64 + /// Creates a mask whose bits are set to 1, using bit 0 up to and /// including the least significant bit that is set to 1 in the source /// operand and returns the result. /// /// \headerfile /// -/// This intrinsic corresponds to the BLSMSK instruction. +/// This intrinsic corresponds to the \c BLSMSK instruction. 
/// /// \param __X /// An unsigned 64-bit integer used to create the mask. /// \returns An unsigned 64-bit integer containing the newly created mask. +/// \see _blsmsk_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS __blsmsk_u64(unsigned long long __X) { return __X ^ (__X - 1); } +/// Creates a mask whose bits are set to 1, using bit 0 up to and +/// including the least significant bit that is set to 1 in the source +/// operand and returns the result. +/// +/// \headerfile +/// +/// \code +/// unsigned long long _blsmsk_u64(unsigned long long __X); +/// \endcode +/// +/// This intrinsic corresponds to the \c BLSMSK instruction. +/// +/// \param __X +/// An unsigned 64-bit integer used to create the mask. +/// \returns An unsigned 64-bit integer containing the newly created mask. +/// \see __blsmsk_u64 +#define _blsmsk_u64 __blsmsk_u64 + /// Clears the least significant bit that is set to 1 in the source /// operand and returns the result. /// /// \headerfile /// -/// This intrinsic corresponds to the BLSR instruction. +/// This intrinsic corresponds to the \c BLSR instruction. /// /// \param __X /// An unsigned 64-bit integer containing the operand to be cleared. /// \returns An unsigned 64-bit integer containing the result of clearing the /// source operand. +/// \see _blsr_u64 static __inline__ unsigned long long __DEFAULT_FN_ATTRS __blsr_u64(unsigned long long __X) { return __X & (__X - 1); } +/// Clears the least significant bit that is set to 1 in the source +/// operand and returns the result. +/// +/// \headerfile +/// +/// \code +/// unsigned long long _blsr_u64(unsigned long long __X); +/// \endcode +/// +/// This intrinsic corresponds to the \c BLSR instruction. +/// +/// \param __X +/// An unsigned 64-bit integer containing the operand to be cleared. +/// \returns An unsigned 64-bit integer containing the result of clearing the +/// source operand. +/// \see __blsr_u64 +#define _blsr_u64 __blsr_u64 + #endif /* __x86_64__ */ #undef __DEFAULT_FN_ATTRS From d38c61a13d89ec57b80e5174029b63071c6d7290 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Tue, 23 Jan 2024 14:04:10 +0100 Subject: [PATCH 618/843] [clang][Interp][NFC] Move ToVoid casts to the bottom So we have all the complex casts together. 
---
 clang/lib/AST/Interp/ByteCodeExprGen.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
index 0651828113268..cfcef067b92bd 100644
--- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp
+++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp
@@ -291,9 +291,6 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
   case CK_FloatingComplexToReal:
     return this->emitComplexReal(SubExpr);
 
-  case CK_ToVoid:
-    return discard(SubExpr);
-
   case CK_IntegralRealToComplex:
   case CK_FloatingRealToComplex: {
     // We're creating a complex value here, so we need to
@@ -317,6 +314,9 @@ bool ByteCodeExprGen<Emitter>::VisitCastExpr(const CastExpr *CE) {
     return this->emitInitElem(T, 1, SubExpr);
   }
 
+  case CK_ToVoid:
+    return discard(SubExpr);
+
   default:
     assert(false && "Cast not implemented");
   }

From 654131fab22e71f156d8fa9d37389622f1652438 Mon Sep 17 00:00:00 2001
From: OldWorldOrdr
Date: Tue, 23 Jan 2024 09:08:43 -0500
Subject: [PATCH 619/843] fix test (#79018)

A mistake in #78628 that was caught after it was merged.
---
 lld/test/MachO/link-csu-object.s | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lld/test/MachO/link-csu-object.s b/lld/test/MachO/link-csu-object.s
index e6f5ff7e52e32..a36e50a8901bd 100644
--- a/lld/test/MachO/link-csu-object.s
+++ b/lld/test/MachO/link-csu-object.s
@@ -2,7 +2,7 @@
 # RUN: mkdir -p %t
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %p/Inputs/libhello.s -o %t/hello.o
 # RUN: llvm-mc -filetype=obj -triple=x86_64-apple-darwin %s -o %t/main.o
-# RUN: %lld -L %t %t/main.o %t/hello.o -o %t/a.out
+# RUN: %lld -L %t %t/main.o -lhello.o -o %t/a.out
 # RUN: llvm-nm %t/a.out | FileCheck %s
 
 # CHECK: _main

From 10bd69a4f72a094f4e157ed3e226da426432ef74 Mon Sep 17 00:00:00 2001
From: Anatoly Trosinenko
Date: Tue, 23 Jan 2024 17:21:40 +0300
Subject: [PATCH 620/843] [MachineOutliner] Refactor iterating over Candidate's instructions (#78972)

Make Candidate's front() and back() functions return references to
MachineInstr and introduce begin() and end() returning iterators, the
same way it is usually done in other container-like classes.

This makes it possible to iterate over the instructions contained in
Candidate the same way one can iterate over MachineBasicBlock (note
that begin() and end() return bundled iterators, just like
MachineBasicBlock does, but no instr_begin() and instr_end() are
defined yet).
---
 llvm/include/llvm/CodeGen/MachineOutliner.h  | 11 ++--
 llvm/lib/CodeGen/MachineOutliner.cpp         | 27 +++++----
 llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 61 +++++++++-----------
 llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp     | 19 +++---
 llvm/lib/Target/RISCV/RISCVInstrInfo.cpp     |  6 +-
 llvm/lib/Target/X86/X86InstrInfo.cpp         | 25 ++++----
 6 files changed, 69 insertions(+), 80 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineOutliner.h b/llvm/include/llvm/CodeGen/MachineOutliner.h
index d0ff02fea4ff9..eaba6c9b18f2b 100644
--- a/llvm/include/llvm/CodeGen/MachineOutliner.h
+++ b/llvm/include/llvm/CodeGen/MachineOutliner.h
@@ -87,7 +87,7 @@ struct Candidate {
     // Compute liveness from the end of the block up to the beginning of the
     // outlining candidate.
for (auto &MI : make_range(MBB->rbegin(), - (MachineBasicBlock::reverse_iterator)front())) + (MachineBasicBlock::reverse_iterator)begin())) FromEndOfBlockToStartOfSeq.stepBackward(MI); } @@ -100,7 +100,7 @@ struct Candidate { return; InSeqWasSet = true; InSeq.init(TRI); - for (auto &MI : make_range(front(), std::next(back()))) + for (auto &MI : *this) InSeq.accumulate(MI); } @@ -135,8 +135,11 @@ struct Candidate { /// Returns the call overhead of this candidate if it is in the list. unsigned getCallOverhead() const { return CallOverhead; } - MachineBasicBlock::iterator &front() { return FirstInst; } - MachineBasicBlock::iterator &back() { return LastInst; } + MachineBasicBlock::iterator begin() { return FirstInst; } + MachineBasicBlock::iterator end() { return std::next(LastInst); } + + MachineInstr &front() { return *FirstInst; } + MachineInstr &back() { return *LastInst; } MachineFunction *getMF() const { return MBB->getParent(); } MachineBasicBlock *getMBB() const { return MBB; } diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp index a0769105c9292..b8d3b2e30e6e6 100644 --- a/llvm/lib/CodeGen/MachineOutliner.cpp +++ b/llvm/lib/CodeGen/MachineOutliner.cpp @@ -525,7 +525,7 @@ void MachineOutliner::emitNotOutliningCheaperRemark( MachineOptimizationRemarkEmitter MORE(*(C.getMF()), nullptr); MORE.emit([&]() { MachineOptimizationRemarkMissed R(DEBUG_TYPE, "NotOutliningCheaper", - C.front()->getDebugLoc(), C.getMBB()); + C.front().getDebugLoc(), C.getMBB()); R << "Did not outline " << NV("Length", StringLen) << " instructions" << " from " << NV("NumOccurrences", CandidatesForRepeatedSeq.size()) << " locations." @@ -538,7 +538,7 @@ void MachineOutliner::emitNotOutliningCheaperRemark( // Tell the user the other places the candidate was found. for (unsigned i = 1, e = CandidatesForRepeatedSeq.size(); i < e; i++) { R << NV((Twine("OtherStartLoc") + Twine(i)).str(), - CandidatesForRepeatedSeq[i].front()->getDebugLoc()); + CandidatesForRepeatedSeq[i].front().getDebugLoc()); if (i != e - 1) R << ", "; } @@ -563,7 +563,7 @@ void MachineOutliner::emitOutlinedFunctionRemark(OutlinedFunction &OF) { for (size_t i = 0, e = OF.Candidates.size(); i < e; i++) { R << NV((Twine("StartLoc") + Twine(i)).str(), - OF.Candidates[i].front()->getDebugLoc()); + OF.Candidates[i].front().getDebugLoc()); if (i != e - 1) R << ", "; } @@ -732,23 +732,22 @@ MachineFunction *MachineOutliner::createOutlinedFunction( // Insert the new function into the module. MF.insert(MF.begin(), &MBB); - MachineFunction *OriginalMF = FirstCand.front()->getMF(); + MachineFunction *OriginalMF = FirstCand.front().getMF(); const std::vector &Instrs = OriginalMF->getFrameInstructions(); - for (auto I = FirstCand.front(), E = std::next(FirstCand.back()); I != E; - ++I) { - if (I->isDebugInstr()) + for (auto &MI : FirstCand) { + if (MI.isDebugInstr()) continue; // Don't keep debug information for outlined instructions. 
auto DL = DebugLoc(); - if (I->isCFIInstruction()) { - unsigned CFIIndex = I->getOperand(0).getCFIIndex(); + if (MI.isCFIInstruction()) { + unsigned CFIIndex = MI.getOperand(0).getCFIIndex(); MCCFIInstruction CFI = Instrs[CFIIndex]; BuildMI(MBB, MBB.end(), DL, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(MF.addFrameInst(CFI)); } else { - MachineInstr *NewMI = MF.CloneMachineInstr(&*I); + MachineInstr *NewMI = MF.CloneMachineInstr(&MI); NewMI->dropMemRefs(MF); NewMI->setDebugLoc(DL); MBB.insert(MBB.end(), NewMI); @@ -768,11 +767,11 @@ MachineFunction *MachineOutliner::createOutlinedFunction( LivePhysRegs LiveIns(TRI); for (auto &Cand : OF.Candidates) { // Figure out live-ins at the first instruction. - MachineBasicBlock &OutlineBB = *Cand.front()->getParent(); + MachineBasicBlock &OutlineBB = *Cand.front().getParent(); LivePhysRegs CandLiveIns(TRI); CandLiveIns.addLiveOuts(OutlineBB); for (const MachineInstr &MI : - reverse(make_range(Cand.front(), OutlineBB.end()))) + reverse(make_range(Cand.begin(), OutlineBB.end()))) CandLiveIns.stepBackward(MI); // The live-in set for the outlined function is the union of the live-ins @@ -884,8 +883,8 @@ bool MachineOutliner::outline(Module &M, LLVM_DEBUG(dbgs() << "CREATE OUTLINED CALLS\n"); for (Candidate &C : OF.Candidates) { MachineBasicBlock &MBB = *C.getMBB(); - MachineBasicBlock::iterator StartIt = C.front(); - MachineBasicBlock::iterator EndIt = C.back(); + MachineBasicBlock::iterator StartIt = C.begin(); + MachineBasicBlock::iterator EndIt = std::prev(C.end()); // Insert the call. auto CallInst = TII.insertOutlinedCall(M, MBB, StartIt, *MF, C); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index 656259727c124..2e8d8c63d6bec 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -8230,11 +8230,11 @@ std::optional AArch64InstrInfo::getOutliningCandidateInfo( std::vector &RepeatedSequenceLocs) const { outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; - unsigned SequenceSize = - std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, - [this](unsigned Sum, const MachineInstr &MI) { - return Sum + getInstSizeInBytes(MI); - }); + + unsigned SequenceSize = 0; + for (auto &MI : FirstCand) + SequenceSize += getInstSizeInBytes(MI); + unsigned NumBytesToCreateFrame = 0; // We only allow outlining for functions having exactly matching return @@ -8287,7 +8287,7 @@ AArch64InstrInfo::getOutliningCandidateInfo( AArch64PAuth::getCheckerSizeInBytes(LRCheckMethod); // Checking the authenticated LR value may significantly impact // SequenceSize, so account for it for more precise results. - if (isTailCallReturnInst(*RepeatedSequenceLocs[0].back())) + if (isTailCallReturnInst(RepeatedSequenceLocs[0].back())) SequenceSize += NumBytesToCheckLRInTCEpilogue; // We have to check if sp modifying instructions would get outlined. 
@@ -8296,37 +8296,36 @@ AArch64InstrInfo::getOutliningCandidateInfo( // are not auto hasIllegalSPModification = [&TRI](outliner::Candidate &C) { int SPValue = 0; - MachineBasicBlock::iterator MBBI = C.front(); - for (;;) { - if (MBBI->modifiesRegister(AArch64::SP, &TRI)) { - switch (MBBI->getOpcode()) { + for (auto &MI : C) { + if (MI.modifiesRegister(AArch64::SP, &TRI)) { + switch (MI.getOpcode()) { case AArch64::ADDXri: case AArch64::ADDWri: - assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); - assert(MBBI->getOperand(2).isImm() && + assert(MI.getNumOperands() == 4 && "Wrong number of operands"); + assert(MI.getOperand(2).isImm() && "Expected operand to be immediate"); - assert(MBBI->getOperand(1).isReg() && + assert(MI.getOperand(1).isReg() && "Expected operand to be a register"); // Check if the add just increments sp. If so, we search for // matching sub instructions that decrement sp. If not, the // modification is illegal - if (MBBI->getOperand(1).getReg() == AArch64::SP) - SPValue += MBBI->getOperand(2).getImm(); + if (MI.getOperand(1).getReg() == AArch64::SP) + SPValue += MI.getOperand(2).getImm(); else return true; break; case AArch64::SUBXri: case AArch64::SUBWri: - assert(MBBI->getNumOperands() == 4 && "Wrong number of operands"); - assert(MBBI->getOperand(2).isImm() && + assert(MI.getNumOperands() == 4 && "Wrong number of operands"); + assert(MI.getOperand(2).isImm() && "Expected operand to be immediate"); - assert(MBBI->getOperand(1).isReg() && + assert(MI.getOperand(1).isReg() && "Expected operand to be a register"); // Check if the sub just decrements sp. If so, we search for // matching add instructions that increment sp. If not, the // modification is illegal - if (MBBI->getOperand(1).getReg() == AArch64::SP) - SPValue -= MBBI->getOperand(2).getImm(); + if (MI.getOperand(1).getReg() == AArch64::SP) + SPValue -= MI.getOperand(2).getImm(); else return true; break; @@ -8334,9 +8333,6 @@ AArch64InstrInfo::getOutliningCandidateInfo( return true; } } - if (MBBI == C.back()) - break; - ++MBBI; } if (SPValue) return true; @@ -8357,7 +8353,7 @@ AArch64InstrInfo::getOutliningCandidateInfo( for (outliner::Candidate &C : RepeatedSequenceLocs) FlagsSetInAll &= C.Flags; - unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode(); + unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode(); // Helper lambda which sets call information for every candidate. auto SetCandidateCallInfo = @@ -8376,8 +8372,7 @@ AArch64InstrInfo::getOutliningCandidateInfo( // We check to see if CFI Instructions are present, and if they are // we find the number of CFI Instructions in the candidates. unsigned CFICount = 0; - for (auto &I : make_range(RepeatedSequenceLocs[0].front(), - std::next(RepeatedSequenceLocs[0].back()))) { + for (auto &I : RepeatedSequenceLocs[0]) { if (I.isCFIInstruction()) CFICount++; } @@ -8452,12 +8447,11 @@ AArch64InstrInfo::getOutliningCandidateInfo( // True if it's possible to fix up each stack instruction in this sequence. // Important for frames/call variants that modify the stack. - bool AllStackInstrsSafe = std::all_of( - FirstCand.front(), std::next(FirstCand.back()), IsSafeToFixup); + bool AllStackInstrsSafe = llvm::all_of(FirstCand, IsSafeToFixup); // If the last instruction in any candidate is a terminator, then we should // tail call all of the candidates. 
- if (RepeatedSequenceLocs[0].back()->isTerminator()) { + if (RepeatedSequenceLocs[0].back().isTerminator()) { FrameID = MachineOutlinerTailCall; NumBytesToCreateFrame = 0; unsigned NumBytesForCall = 4 + NumBytesToCheckLRInTCEpilogue; @@ -8583,9 +8577,8 @@ AArch64InstrInfo::getOutliningCandidateInfo( // if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { erase_if(RepeatedSequenceLocs, [this, &TRI](outliner::Candidate &C) { - return (std::any_of( - C.front(), std::next(C.back()), - [](const MachineInstr &MI) { return MI.isCall(); })) && + auto IsCall = [](const MachineInstr &MI) { return MI.isCall(); }; + return (llvm::any_of(C, IsCall)) && (!C.isAvailableAcrossAndOutOfSeq(AArch64::LR, TRI) || !findRegisterToSaveLRTo(C)); }); @@ -8605,7 +8598,7 @@ AArch64InstrInfo::getOutliningCandidateInfo( // Check if the range contains a call. These require a save + restore of the // link register. bool ModStackToSaveLR = false; - if (std::any_of(FirstCand.front(), FirstCand.back(), + if (std::any_of(FirstCand.begin(), std::prev(FirstCand.end()), [](const MachineInstr &MI) { return MI.isCall(); })) ModStackToSaveLR = true; @@ -8615,7 +8608,7 @@ AArch64InstrInfo::getOutliningCandidateInfo( // it being valid to tail call this sequence. We should consider this as // well. else if (FrameID != MachineOutlinerThunk && - FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) + FrameID != MachineOutlinerTailCall && FirstCand.back().isCall()) ModStackToSaveLR = true; if (ModStackToSaveLR) { diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp index ef02dc9970114..4bf65be6f1026 100644 --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -5874,11 +5874,10 @@ std::optional ARMBaseInstrInfo::getOutliningCandidateInfo( std::vector &RepeatedSequenceLocs) const { outliner::Candidate &FirstCand = RepeatedSequenceLocs[0]; - unsigned SequenceSize = - std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0, - [this](unsigned Sum, const MachineInstr &MI) { - return Sum + getInstSizeInBytes(MI); - }); + + unsigned SequenceSize = 0; + for (auto &MI : FirstCand) + SequenceSize += getInstSizeInBytes(MI); // Properties about candidate MBBs that hold for all of them. unsigned FlagsSetInAll = 0xF; @@ -5965,7 +5964,7 @@ ARMBaseInstrInfo::getOutliningCandidateInfo( // At this point, we have only "safe" candidates to outline. Figure out // frame + call instruction information. - unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode(); + unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back().getOpcode(); // Helper lambda which sets call information for every candidate. auto SetCandidateCallInfo = @@ -5998,7 +5997,7 @@ ARMBaseInstrInfo::getOutliningCandidateInfo( // If the last instruction in any candidate is a terminator, then we should // tail call all of the candidates. - if (RepeatedSequenceLocs[0].back()->isTerminator()) { + if (RepeatedSequenceLocs[0].back().isTerminator()) { FrameID = MachineOutlinerTailCall; NumBytesToCreateFrame = Costs.FrameTailCall; SetCandidateCallInfo(MachineOutlinerTailCall, Costs.CallTailCall); @@ -6024,7 +6023,7 @@ ARMBaseInstrInfo::getOutliningCandidateInfo( const bool LRIsAvailable = C.getMBB()->isReturnBlock() && !Last->isCall() ? 
isLRAvailable(TRI, Last, - (MachineBasicBlock::reverse_iterator)C.front()) + (MachineBasicBlock::reverse_iterator)C.begin()) : C.isAvailableAcrossAndOutOfSeq(ARM::LR, TRI); if (LRIsAvailable) { FrameID = MachineOutlinerNoLRSave; @@ -6072,7 +6071,7 @@ ARMBaseInstrInfo::getOutliningCandidateInfo( if (FlagsSetInAll & MachineOutlinerMBBFlags::HasCalls) { // check if the range contains a call. These require a save + restore of // the link register. - if (std::any_of(FirstCand.front(), FirstCand.back(), + if (std::any_of(FirstCand.begin(), std::prev(FirstCand.end()), [](const MachineInstr &MI) { return MI.isCall(); })) NumBytesToCreateFrame += Costs.SaveRestoreLROnStack; @@ -6082,7 +6081,7 @@ ARMBaseInstrInfo::getOutliningCandidateInfo( // call without it being valid to tail call this sequence. We should // consider this as well. else if (FrameID != MachineOutlinerThunk && - FrameID != MachineOutlinerTailCall && FirstCand.back()->isCall()) + FrameID != MachineOutlinerTailCall && FirstCand.back().isCall()) NumBytesToCreateFrame += Costs.SaveRestoreLROnStack; } diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 874d10b979b80..7c21f63584942 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2434,10 +2434,8 @@ RISCVInstrInfo::getOutliningCandidateInfo( unsigned SequenceSize = 0; - auto I = RepeatedSequenceLocs[0].front(); - auto E = std::next(RepeatedSequenceLocs[0].back()); - for (; I != E; ++I) - SequenceSize += getInstSizeInBytes(*I); + for (auto &MI : RepeatedSequenceLocs[0]) + SequenceSize += getInstSizeInBytes(MI); // call t0, function = 8 bytes. unsigned CallOverhead = 8; diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index 6dcaaf754a18d..d6f9aa6d6acec 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -10387,23 +10387,20 @@ enum MachineOutlinerClass { MachineOutlinerDefault, MachineOutlinerTailCall }; std::optional X86InstrInfo::getOutliningCandidateInfo( std::vector &RepeatedSequenceLocs) const { - unsigned SequenceSize = - std::accumulate(RepeatedSequenceLocs[0].front(), - std::next(RepeatedSequenceLocs[0].back()), 0, - [](unsigned Sum, const MachineInstr &MI) { - // FIXME: x86 doesn't implement getInstSizeInBytes, so - // we can't tell the cost. Just assume each instruction - // is one byte. - if (MI.isDebugInstr() || MI.isKill()) - return Sum; - return Sum + 1; - }); + unsigned SequenceSize = 0; + for (auto &MI : RepeatedSequenceLocs[0]) { + // FIXME: x86 doesn't implement getInstSizeInBytes, so + // we can't tell the cost. Just assume each instruction + // is one byte. + if (MI.isDebugInstr() || MI.isKill()) + continue; + SequenceSize += 1; + } // We check to see if CFI Instructions are present, and if they are // we find the number of CFI Instructions in the candidates. unsigned CFICount = 0; - for (auto &I : make_range(RepeatedSequenceLocs[0].front(), - std::next(RepeatedSequenceLocs[0].back()))) { + for (auto &I : RepeatedSequenceLocs[0]) { if (I.isCFIInstruction()) CFICount++; } @@ -10422,7 +10419,7 @@ X86InstrInfo::getOutliningCandidateInfo( } // FIXME: Use real size in bytes for call and ret instructions. 
- if (RepeatedSequenceLocs[0].back()->isTerminator()) { + if (RepeatedSequenceLocs[0].back().isTerminator()) { for (outliner::Candidate &C : RepeatedSequenceLocs) C.setCallInfo(MachineOutlinerTailCall, 1); From 818f13fc00bb29df98d730dca6561014db1d4e26 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 23 Jan 2024 14:17:58 +0000 Subject: [PATCH 621/843] [AMDGPU] Remove getWorkGroupIDSGPR, unused since aa6fb4c45e01 --- llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index dc63ae44c528d..ecc31fbd9dd3d 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -1041,22 +1041,6 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, return WavesPerEU.second; } - /// \returns SGPR used for \p Dim's work group ID. - Register getWorkGroupIDSGPR(unsigned Dim) const { - switch (Dim) { - case 0: - assert(hasWorkGroupIDX()); - return ArgInfo.WorkGroupIDX.getRegister(); - case 1: - assert(hasWorkGroupIDY()); - return ArgInfo.WorkGroupIDY.getRegister(); - case 2: - assert(hasWorkGroupIDZ()); - return ArgInfo.WorkGroupIDZ.getRegister(); - } - llvm_unreachable("unexpected dimension"); - } - const AMDGPUGWSResourcePseudoSourceValue * getGWSPSV(const AMDGPUTargetMachine &TM) { return &GWSResourcePSV; From 30845e8ab46c416a2e333eb84239e9ec71e92617 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 23 Jan 2024 14:27:01 +0000 Subject: [PATCH 622/843] [RemoveDIs][DebugInfo] Handle DPVAssigns in Assignment Tracking excluding lowering (#78982) This patch adds support for DPVAssigns across all of AssignmentTrackingAnalysis except for AssignmentTrackingLowering, which is implemented in a separate patch. This patch includes handling DPValues in MemLocFragFill, the removal of redundant DPValues as part of AssignmentTrackingAnalysis (which is different to the version in `BasicBlockUtils.cpp`), and preventing the DPVAssigns from being directly emitted in SelectionDAG (just as we don't emit llvm.dbg.assigns directly, but receive a set of locations from AssignmentTrackingAnalysis' output). 
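To make the shape of the change concrete: the recurring edit in
AssignmentTrackingAnalysis.cpp hoists each per-wedge loop body into a
generic lambda so the same code runs for both kinds of attachment point,
the instruction itself and any DPValues preceding it. A minimal,
self-contained illustration of that pattern, using stand-in types rather
than LLVM's:

  #include <cstdio>

  struct Instruction {}; // stand-in for llvm::Instruction
  struct DPValue {};     // stand-in for llvm::DPValue

  static const char *kind(Instruction *) { return "Instruction"; }
  static const char *kind(DPValue *) { return "DPValue"; }

  int main() {
    Instruction I;
    DPValue DPV;
    // One generic lambda replaces two near-identical copies of the
    // wedge-processing code.
    auto HandleLocsForWedge = [](auto *WedgePosition) {
      std::printf("process wedge at %s\n", kind(WedgePosition));
    };
    HandleLocsForWedge(&DPV); // records attached before the instruction
    HandleLocsForWedge(&I);   // the instruction's own wedge
  }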
--- .../CodeGen/AssignmentTrackingAnalysis.cpp | 280 ++++++++++-------- .../SelectionDAG/SelectionDAGBuilder.cpp | 5 + .../AArch64/scalable-vectors.ll | 3 + .../DebugInfo/assignment-tracking/X86/DSE.ll | 4 + .../X86/assignment-tracking-not-enabled.ll | 4 + .../assignment-tracking/X86/coalesce-cfg.ll | 4 + .../X86/dbg-phi-produces-undef.ll | 4 + .../assignment-tracking/X86/diamond-1.ll | 4 + .../assignment-tracking/X86/diamond-2.ll | 4 + .../assignment-tracking/X86/diamond-3.ll | 4 + .../assignment-tracking/X86/frag-size-zero.ll | 3 + .../assignment-tracking/X86/global-storage.ll | 3 + .../assignment-tracking/X86/large-type.ll | 4 + .../assignment-tracking/X86/loop-hoist.ll | 4 + .../assignment-tracking/X86/loop-sink.ll | 4 + .../assignment-tracking/X86/loop-unroll.ll | 3 + .../X86/lower-offset-expression.ll | 4 + .../assignment-tracking/X86/lower-to-value.ll | 11 + .../X86/mem-loc-frag-fill-cfg.ll | 10 + .../X86/mem-loc-frag-fill.ll | 10 + .../X86/negative-offset.ll | 3 + .../X86/nested-loop-frags.ll | 4 + .../X86/nested-loop-sroa.ll | 4 + .../assignment-tracking/X86/nested-loop.ll | 4 + .../X86/no-redundant-def-after-alloca.ll | 4 + .../assignment-tracking/X86/order-of-defs.ll | 4 + .../X86/remove-redundant-defs-bwd-scan.ll | 4 + ...ve-redundant-defs-to-prevent-reordering.ll | 9 + .../X86/sdag-dangling-dbgassign.ll | 9 + .../X86/sdag-ir-salvage-assign.ll | 11 + .../X86/sdag-transfer-dbgassign.ll | 9 + .../X86/single-memory-location-2.ll | 4 + .../X86/single-memory-location.ll | 4 + .../assignment-tracking/X86/split-alloca.ll | 4 + .../untagged-store-assignment-extra-checks.ll | 4 + ...agged-store-assignment-outside-variable.ll | 4 + .../X86/use-known-value-at-early-mem-def-2.ll | 4 + .../X86/use-known-value-at-early-mem-def.ll | 4 + 38 files changed, 338 insertions(+), 127 deletions(-) diff --git a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp index a558a304e8f41..f8ce8f98864ea 100644 --- a/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp +++ b/llvm/lib/CodeGen/AssignmentTrackingAnalysis.cpp @@ -829,6 +829,13 @@ class MemLocFragmentFill { void process(BasicBlock &BB, VarFragMap &LiveSet) { BBInsertBeforeMap[&BB].clear(); for (auto &I : BB) { + for (auto &DPV : I.getDbgValueRange()) { + if (const auto *Locs = FnVarLocs->getWedge(&DPV)) { + for (const VarLocInfo &Loc : *Locs) { + addDef(Loc, &DPV, *I.getParent(), LiveSet); + } + } + } if (const auto *Locs = FnVarLocs->getWedge(&I)) { for (const VarLocInfo &Loc : *Locs) { addDef(Loc, &I, *I.getParent(), LiveSet); @@ -2487,73 +2494,78 @@ removeRedundantDbgLocsUsingBackwardScan(const BasicBlock *BB, VariableDefinedBytes.clear(); } - // Get the location defs that start just before this instruction. - const auto *Locs = FnVarLocs.getWedge(&I); - if (!Locs) - continue; + auto HandleLocsForWedge = [&](auto *WedgePosition) { + // Get the location defs that start just before this instruction. + const auto *Locs = FnVarLocs.getWedge(WedgePosition); + if (!Locs) + return; + + NumWedgesScanned++; + bool ChangedThisWedge = false; + // The new pruned set of defs, reversed because we're scanning backwards. + SmallVector NewDefsReversed; + + // Iterate over the existing defs in reverse. 
+ for (auto RIt = Locs->rbegin(), REnd = Locs->rend(); RIt != REnd; ++RIt) { + NumDefsScanned++; + DebugAggregate Aggr = + getAggregate(FnVarLocs.getVariable(RIt->VariableID)); + uint64_t SizeInBits = Aggr.first->getSizeInBits().value_or(0); + uint64_t SizeInBytes = divideCeil(SizeInBits, 8); + + // Cutoff for large variables to prevent expensive bitvector operations. + const uint64_t MaxSizeBytes = 2048; + + if (SizeInBytes == 0 || SizeInBytes > MaxSizeBytes) { + // If the size is unknown (0) then keep this location def to be safe. + // Do the same for defs of large variables, which would be expensive + // to represent with a BitVector. + NewDefsReversed.push_back(*RIt); + continue; + } - NumWedgesScanned++; - bool ChangedThisWedge = false; - // The new pruned set of defs, reversed because we're scanning backwards. - SmallVector NewDefsReversed; - - // Iterate over the existing defs in reverse. - for (auto RIt = Locs->rbegin(), REnd = Locs->rend(); RIt != REnd; ++RIt) { - NumDefsScanned++; - DebugAggregate Aggr = - getAggregate(FnVarLocs.getVariable(RIt->VariableID)); - uint64_t SizeInBits = Aggr.first->getSizeInBits().value_or(0); - uint64_t SizeInBytes = divideCeil(SizeInBits, 8); - - // Cutoff for large variables to prevent expensive bitvector operations. - const uint64_t MaxSizeBytes = 2048; - - if (SizeInBytes == 0 || SizeInBytes > MaxSizeBytes) { - // If the size is unknown (0) then keep this location def to be safe. - // Do the same for defs of large variables, which would be expensive - // to represent with a BitVector. - NewDefsReversed.push_back(*RIt); - continue; - } + // Only keep this location definition if it is not fully eclipsed by + // other definitions in this wedge that come after it + + // Inert the bytes the location definition defines. + auto InsertResult = + VariableDefinedBytes.try_emplace(Aggr, BitVector(SizeInBytes)); + bool FirstDefinition = InsertResult.second; + BitVector &DefinedBytes = InsertResult.first->second; + + DIExpression::FragmentInfo Fragment = + RIt->Expr->getFragmentInfo().value_or( + DIExpression::FragmentInfo(SizeInBits, 0)); + bool InvalidFragment = Fragment.endInBits() > SizeInBits; + uint64_t StartInBytes = Fragment.startInBits() / 8; + uint64_t EndInBytes = divideCeil(Fragment.endInBits(), 8); + + // If this defines any previously undefined bytes, keep it. + if (FirstDefinition || InvalidFragment || + DefinedBytes.find_first_unset_in(StartInBytes, EndInBytes) != -1) { + if (!InvalidFragment) + DefinedBytes.set(StartInBytes, EndInBytes); + NewDefsReversed.push_back(*RIt); + continue; + } - // Only keep this location definition if it is not fully eclipsed by - // other definitions in this wedge that come after it - - // Inert the bytes the location definition defines. - auto InsertResult = - VariableDefinedBytes.try_emplace(Aggr, BitVector(SizeInBytes)); - bool FirstDefinition = InsertResult.second; - BitVector &DefinedBytes = InsertResult.first->second; - - DIExpression::FragmentInfo Fragment = - RIt->Expr->getFragmentInfo().value_or( - DIExpression::FragmentInfo(SizeInBits, 0)); - bool InvalidFragment = Fragment.endInBits() > SizeInBits; - uint64_t StartInBytes = Fragment.startInBits() / 8; - uint64_t EndInBytes = divideCeil(Fragment.endInBits(), 8); - - // If this defines any previously undefined bytes, keep it. 
- if (FirstDefinition || InvalidFragment || - DefinedBytes.find_first_unset_in(StartInBytes, EndInBytes) != -1) { - if (!InvalidFragment) - DefinedBytes.set(StartInBytes, EndInBytes); - NewDefsReversed.push_back(*RIt); - continue; + // Redundant def found: throw it away. Since the wedge of defs is being + // rebuilt, doing nothing is the same as deleting an entry. + ChangedThisWedge = true; + NumDefsRemoved++; } - // Redundant def found: throw it away. Since the wedge of defs is being - // rebuilt, doing nothing is the same as deleting an entry. - ChangedThisWedge = true; - NumDefsRemoved++; - } - - // Un-reverse the defs and replace the wedge with the pruned version. - if (ChangedThisWedge) { - std::reverse(NewDefsReversed.begin(), NewDefsReversed.end()); - FnVarLocs.setWedge(&I, std::move(NewDefsReversed)); - NumWedgesChanged++; - Changed = true; - } + // Un-reverse the defs and replace the wedge with the pruned version. + if (ChangedThisWedge) { + std::reverse(NewDefsReversed.begin(), NewDefsReversed.end()); + FnVarLocs.setWedge(WedgePosition, std::move(NewDefsReversed)); + NumWedgesChanged++; + Changed = true; + } + }; + HandleLocsForWedge(&I); + for (DPValue &DPV : reverse(I.getDbgValueRange())) + HandleLocsForWedge(&DPV); } return Changed; @@ -2578,42 +2590,48 @@ removeRedundantDbgLocsUsingForwardScan(const BasicBlock *BB, // instructions. for (const Instruction &I : *BB) { // Get the defs that come just before this instruction. - const auto *Locs = FnVarLocs.getWedge(&I); - if (!Locs) - continue; - - NumWedgesScanned++; - bool ChangedThisWedge = false; - // The new pruned set of defs. - SmallVector NewDefs; + auto HandleLocsForWedge = [&](auto *WedgePosition) { + const auto *Locs = FnVarLocs.getWedge(WedgePosition); + if (!Locs) + return; + + NumWedgesScanned++; + bool ChangedThisWedge = false; + // The new pruned set of defs. + SmallVector NewDefs; + + // Iterate over the existing defs. + for (const VarLocInfo &Loc : *Locs) { + NumDefsScanned++; + DebugVariable Key(FnVarLocs.getVariable(Loc.VariableID).getVariable(), + std::nullopt, Loc.DL.getInlinedAt()); + auto VMI = VariableMap.find(Key); + + // Update the map if we found a new value/expression describing the + // variable, or if the variable wasn't mapped already. + if (VMI == VariableMap.end() || VMI->second.first != Loc.Values || + VMI->second.second != Loc.Expr) { + VariableMap[Key] = {Loc.Values, Loc.Expr}; + NewDefs.push_back(Loc); + continue; + } - // Iterate over the existing defs. - for (const VarLocInfo &Loc : *Locs) { - NumDefsScanned++; - DebugVariable Key(FnVarLocs.getVariable(Loc.VariableID).getVariable(), - std::nullopt, Loc.DL.getInlinedAt()); - auto VMI = VariableMap.find(Key); - - // Update the map if we found a new value/expression describing the - // variable, or if the variable wasn't mapped already. - if (VMI == VariableMap.end() || VMI->second.first != Loc.Values || - VMI->second.second != Loc.Expr) { - VariableMap[Key] = {Loc.Values, Loc.Expr}; - NewDefs.push_back(Loc); - continue; + // Did not insert this Loc, which is the same as removing it. + ChangedThisWedge = true; + NumDefsRemoved++; } - // Did not insert this Loc, which is the same as removing it. - ChangedThisWedge = true; - NumDefsRemoved++; - } + // Replace the existing wedge with the pruned version. + if (ChangedThisWedge) { + FnVarLocs.setWedge(WedgePosition, std::move(NewDefs)); + NumWedgesChanged++; + Changed = true; + } + }; - // Replace the existing wedge with the pruned version. 
- if (ChangedThisWedge) { - FnVarLocs.setWedge(&I, std::move(NewDefs)); - NumWedgesChanged++; - Changed = true; - } + for (DPValue &DPV : I.getDbgValueRange()) + HandleLocsForWedge(&DPV); + HandleLocsForWedge(&I); } return Changed; @@ -2660,41 +2678,46 @@ removeUndefDbgLocsFromEntryBlock(const BasicBlock *BB, // instructions. for (const Instruction &I : *BB) { // Get the defs that come just before this instruction. - const auto *Locs = FnVarLocs.getWedge(&I); - if (!Locs) - continue; - - NumWedgesScanned++; - bool ChangedThisWedge = false; - // The new pruned set of defs. - SmallVector NewDefs; - - // Iterate over the existing defs. - for (const VarLocInfo &Loc : *Locs) { - NumDefsScanned++; - DebugAggregate Aggr{FnVarLocs.getVariable(Loc.VariableID).getVariable(), - Loc.DL.getInlinedAt()}; - DebugVariable Var = FnVarLocs.getVariable(Loc.VariableID); + auto HandleLocsForWedge = [&](auto *WedgePosition) { + const auto *Locs = FnVarLocs.getWedge(WedgePosition); + if (!Locs) + return; + + NumWedgesScanned++; + bool ChangedThisWedge = false; + // The new pruned set of defs. + SmallVector NewDefs; + + // Iterate over the existing defs. + for (const VarLocInfo &Loc : *Locs) { + NumDefsScanned++; + DebugAggregate Aggr{FnVarLocs.getVariable(Loc.VariableID).getVariable(), + Loc.DL.getInlinedAt()}; + DebugVariable Var = FnVarLocs.getVariable(Loc.VariableID); + + // Remove undef entries that are encountered before any non-undef + // intrinsics from the entry block. + if (Loc.Values.isKillLocation(Loc.Expr) && !HasDefinedBits(Aggr, Var)) { + // Did not insert this Loc, which is the same as removing it. + NumDefsRemoved++; + ChangedThisWedge = true; + continue; + } - // Remove undef entries that are encountered before any non-undef - // intrinsics from the entry block. - if (Loc.Values.isKillLocation(Loc.Expr) && !HasDefinedBits(Aggr, Var)) { - // Did not insert this Loc, which is the same as removing it. - NumDefsRemoved++; - ChangedThisWedge = true; - continue; + DefineBits(Aggr, Var); + NewDefs.push_back(Loc); } - DefineBits(Aggr, Var); - NewDefs.push_back(Loc); - } - - // Replace the existing wedge with the pruned version. - if (ChangedThisWedge) { - FnVarLocs.setWedge(&I, std::move(NewDefs)); - NumWedgesChanged++; - Changed = true; - } + // Replace the existing wedge with the pruned version. 
+ if (ChangedThisWedge) { + FnVarLocs.setWedge(WedgePosition, std::move(NewDefs)); + NumWedgesChanged++; + Changed = true; + } + }; + for (DPValue &DPV : I.getDbgValueRange()) + HandleLocsForWedge(&DPV); + HandleLocsForWedge(&I); } return Changed; @@ -2726,6 +2749,9 @@ static DenseSet findVarsWithStackSlot(Function &Fn) { for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(&I)) { Result.insert({DAI->getVariable(), DAI->getDebugLoc().getInlinedAt()}); } + for (DPValue *DPV : at::getDPVAssignmentMarkers(&I)) { + Result.insert({DPV->getVariable(), DPV->getDebugLoc().getInlinedAt()}); + } } } return Result; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index a7dd23f78993f..5ce1013f30fd1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -1241,6 +1241,11 @@ void SelectionDAGBuilder::visitDbgInfo(const Instruction &I) { It->Expr, Vals.size() > 1, It->DL, SDNodeOrder); } } + // We must early-exit here to prevent any DPValues from being emitted below, + // as we have just emitted the debug values resulting from assignment + // tracking analysis, making any existing DPValues redundant (and probably + // less correct). + return; } // Is there is any debug-info attached to this instruction, in the form of diff --git a/llvm/test/DebugInfo/assignment-tracking/AArch64/scalable-vectors.ll b/llvm/test/DebugInfo/assignment-tracking/AArch64/scalable-vectors.ll index 4f3db535f6eb8..81a1542221650 100644 --- a/llvm/test/DebugInfo/assignment-tracking/AArch64/scalable-vectors.ll +++ b/llvm/test/DebugInfo/assignment-tracking/AArch64/scalable-vectors.ll @@ -1,5 +1,8 @@ ; RUN: llc %s -stop-after=finalize-isel -o - | FileCheck %s + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - | FileCheck %s + ;; Hand written. Check AssignmentTrackingAnalysis doesn't try to get the size ;; of scalable vectors (which causes an assertion failure). diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/DSE.ll b/llvm/test/DebugInfo/assignment-tracking/X86/DSE.ll index 057c82b0872a6..57bb53d8269c3 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/DSE.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/DSE.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-before=finalize-isel -o - \ ; RUN: | FileCheck %s + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-before=finalize-isel -o - \ +; RUN: | FileCheck %s + ; Check basic lowering behaviour of dbg.assign intrinsics. The first ; assignment to `local`, which has been DSE'd, should be represented with a ; constant value DBG_VALUE. The second assignment should have a DBG_VALUE diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/assignment-tracking-not-enabled.ll b/llvm/test/DebugInfo/assignment-tracking/X86/assignment-tracking-not-enabled.ll index 73b4fd0b75dab..2f0b4fd284b9a 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/assignment-tracking-not-enabled.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/assignment-tracking-not-enabled.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; Check that SelectionDAG downgrades dbg.assigns to dbg.values if assignment ;; tracking isn't enabled (e.g. if the module flag ;; "debug-info-assignment-tracking" is missing / false). 
diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-cfg.ll b/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-cfg.ll index 49fbcb95818ae..5c0d24e1155c5 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-cfg.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-cfg.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -o - -stop-after=finalize-isel \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -o - -stop-after=finalize-isel \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; Test coalescing of contiguous fragments in adjacent location definitions. ;; Further details and check directives inline. diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/dbg-phi-produces-undef.ll b/llvm/test/DebugInfo/assignment-tracking/X86/dbg-phi-produces-undef.ll index 37a2dbd452161..2ab4e57f470af 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/dbg-phi-produces-undef.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/dbg-phi-produces-undef.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s + ;; Hand written test because the scenario is unlikely. Check that the "value" ;; of a debug def PHIs is "undef" (because we don't actually track PHIs). ;; diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/diamond-1.ll b/llvm/test/DebugInfo/assignment-tracking/X86/diamond-1.ll index b4e84a9978301..07d0540107f47 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/diamond-1.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/diamond-1.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s + ;; cat test.cpp ;; void d(); ;; void e(); diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/diamond-2.ll b/llvm/test/DebugInfo/assignment-tracking/X86/diamond-2.ll index 1526b3471b2e0..2dd82ff9e595a 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/diamond-2.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/diamond-2.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s + ;; Same as diamond-1.ll except that the DIAssignID attached to the store has ;; been deleted. In this case, we expect the same output as for diamond-1.ll ;; because we choose to interpret stores to stack slots that don't link to diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/diamond-3.ll b/llvm/test/DebugInfo/assignment-tracking/X86/diamond-3.ll index b20b166cb9cd4..69ae720276a70 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/diamond-3.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/diamond-3.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; Hand written to test scenario we can definitely run into in the wild. 
This ;; file name includes "diamond" because the idea is that we lose (while ;; optimizing) one of the diamond branches which was empty except for a debug diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/frag-size-zero.ll b/llvm/test/DebugInfo/assignment-tracking/X86/frag-size-zero.ll index 3a3865d0511b9..c6124df1b26e9 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/frag-size-zero.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/frag-size-zero.ll @@ -1,5 +1,8 @@ ; RUN: llc %s -stop-after=finalize-isel -o - | FileCheck %s + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - | FileCheck %s + ;; Check that a zero-sized fragment (the final dbg.assign) is ignored by ;; AssignmentTrackingAnalysis. diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/global-storage.ll b/llvm/test/DebugInfo/assignment-tracking/X86/global-storage.ll index 7d265acdb5bb4..d7b8d76fdecaa 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/global-storage.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/global-storage.ll @@ -1,5 +1,8 @@ ; RUN: llc %s -stop-after=finalize-isel -o - | FileCheck %s + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - | FileCheck %s + ;; Local variable has global storage. Check AssignmentTrackingAnalysis doesn't ;; crash/assert. diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/large-type.ll b/llvm/test/DebugInfo/assignment-tracking/X86/large-type.ll index cebbc162fcb3a..9ab0f36597887 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/large-type.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/large-type.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; Based on optimized IR from C source: ;; int main () { ;; char a1[__INT_MAX__]; diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/loop-hoist.ll b/llvm/test/DebugInfo/assignment-tracking/X86/loop-hoist.ll index 559cdc59dffd9..c83d530c28526 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/loop-hoist.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/loop-hoist.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; $ cat test.cpp ;; int d(); ;; void e(); diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/loop-sink.ll b/llvm/test/DebugInfo/assignment-tracking/X86/loop-sink.ll index fcd51960b67cc..77d730ff1d359 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/loop-sink.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/loop-sink.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG + ;; Tiny loop with a store sunk out of it: ;; void e(); ;; void es(int*); diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/loop-unroll.ll b/llvm/test/DebugInfo/assignment-tracking/X86/loop-unroll.ll index 5ef5cb4dde9e1..51661d4b030c8 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/loop-unroll.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/loop-unroll.ll @@ -1,5 +1,8 @@ ; RUN: llc %s 
-stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s ;; ;; Backend counterpart to ../Generic/dbg-assign-loop-unroll. This IR was ;; generated by running `opt -loop-unroll -S` on the IR in that test. diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/lower-offset-expression.ll b/llvm/test/DebugInfo/assignment-tracking/X86/lower-offset-expression.ll index b4d85959af15b..2d0bf287079f0 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/lower-offset-expression.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/lower-offset-expression.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s + ;; Handwritten test. ;; Here we have dbg.assign intrinsics with fragments (in the value-expression) diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/lower-to-value.ll b/llvm/test/DebugInfo/assignment-tracking/X86/lower-to-value.ll index 859056aff6337..28195bc22a62d 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/lower-to-value.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/lower-to-value.ll @@ -2,11 +2,22 @@ ; RUN: -experimental-debug-variable-locations=false \ ; RUN: -debug-ata-coalesce-frags=true \ ; RUN: | FileCheck %s --check-prefixes=CHECK,DBGVALUE --implicit-check-not=DBG_VALUE + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=false \ +; RUN: -debug-ata-coalesce-frags=true \ +; RUN: | FileCheck %s --check-prefixes=CHECK,DBGVALUE --implicit-check-not=DBG_VALUE ; RUN: llc %s -stop-before finalize-isel -o - \ ; RUN: -experimental-debug-variable-locations=true \ ; RUN: | FileCheck %s --check-prefixes=CHECK,INSTRREF --implicit-check-not=DBG_VALUE \ ; RUN: --implicit-check-not=DBG_INSTR_REF + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=true \ +; RUN: | FileCheck %s --check-prefixes=CHECK,INSTRREF --implicit-check-not=DBG_VALUE \ +; RUN: --implicit-check-not=DBG_INSTR_REF + ;; Check that dbg.assigns for an aggregate variable which lives on the stack ;; for some of its lifetime are lowered into an appropriate set of DBG_VALUEs. 
;; diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/mem-loc-frag-fill-cfg.ll b/llvm/test/DebugInfo/assignment-tracking/X86/mem-loc-frag-fill-cfg.ll index 0ea8373cef512..0211fb7e74bd0 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/mem-loc-frag-fill-cfg.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/mem-loc-frag-fill-cfg.ll @@ -2,10 +2,20 @@ ; RUN: -experimental-debug-variable-locations=false \ ; RUN: -debug-ata-coalesce-frags=true \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=false \ +; RUN: -debug-ata-coalesce-frags=true \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ ; RUN: llc %s -stop-before finalize-isel -o - \ ; RUN: -experimental-debug-variable-locations=true \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=true \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; Check that the mem-loc-frag-fill pseudo-pass works on a simple CFG. When ;; LLVM sees a dbg.value with an overlapping fragment it essentially considers ;; the previous location as valid for all bits in that fragment. The pass diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/mem-loc-frag-fill.ll b/llvm/test/DebugInfo/assignment-tracking/X86/mem-loc-frag-fill.ll index 630678153de00..597531aaf1d3f 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/mem-loc-frag-fill.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/mem-loc-frag-fill.ll @@ -2,10 +2,20 @@ ; RUN: -experimental-debug-variable-locations=false \ ; RUN: -debug-ata-coalesce-frags=true \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=false \ +; RUN: -debug-ata-coalesce-frags=true \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ ; RUN: llc %s -stop-before finalize-isel -o - \ ; RUN: -experimental-debug-variable-locations=true \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=true \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; Check that the mem-loc-frag-fill analysis works on a simple case; ensure ;; that location definitions are added to preserve memory locations of ;; fragments of variables at subsequent location definitions for other diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/negative-offset.ll b/llvm/test/DebugInfo/assignment-tracking/X86/negative-offset.ll index 74486c4822397..20ea85bd27375 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/negative-offset.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/negative-offset.ll @@ -1,5 +1,8 @@ ; RUN: llc %s -stop-after=finalize-isel -o - | FileCheck %s --implicit-check-not=DBG_VALUE + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - | FileCheck %s --implicit-check-not=DBG_VALUE + ;; Check stores to an address computed as a negative offset from an alloca are ;; ignored by the assignment tracking analysis. For this example that should ;; result in no DBG_VALUEs in the while.body.lr.ph branch. 
diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop-frags.ll b/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop-frags.ll index 6e066fa90f4cb..39132dd5471b1 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop-frags.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop-frags.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG + ;; Test a variety of block inputs and lattice configurations for the assignment ;; tracking analysis (debug-ata). This is similar to nested-loop.ll and ;; nested-loop-sroa.ll except that the allocas are 64 bits and the stores are a diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop-sroa.ll b/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop-sroa.ll index 1f4351e6d3516..39c91724946f4 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop-sroa.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop-sroa.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG + ;; Test a variety of block inputs and lattice configurations for the assignment ;; tracking analysis (debug-ata). This is the same as nested-loop.ll except each ;; alloca holds a fragment of a variable instead of a whole variable. diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop.ll b/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop.ll index 3f4350fd15af7..b88a58566e4f6 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/nested-loop.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG + ;; Test a variety of block inputs and lattice configurations for the assignment ;; tracking analysis (debug-ata). diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/no-redundant-def-after-alloca.ll b/llvm/test/DebugInfo/assignment-tracking/X86/no-redundant-def-after-alloca.ll index 626757e964dcd..9ef3ea59247e7 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/no-redundant-def-after-alloca.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/no-redundant-def-after-alloca.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -o - -stop-after=finalize-isel \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -o - -stop-after=finalize-isel \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; Hand written. Check that no unnecessary undef is inserted after an alloca ;; that has a linked dbg.assign that doesn't immediately follow it. 
diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/order-of-defs.ll b/llvm/test/DebugInfo/assignment-tracking/X86/order-of-defs.ll index 191d61360feb4..9b8f3b9b67a3f 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/order-of-defs.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/order-of-defs.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; Ensure that the order of several debug intrinsics between non-debug ;; instructions is maintained. diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/remove-redundant-defs-bwd-scan.ll b/llvm/test/DebugInfo/assignment-tracking/X86/remove-redundant-defs-bwd-scan.ll index dc2638c69969d..211098b4a8d55 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/remove-redundant-defs-bwd-scan.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/remove-redundant-defs-bwd-scan.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; Hand-written to test assignment tracking analysis' removal of redundant ;; debug loc definitions. Checks written inline. diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/remove-redundant-defs-to-prevent-reordering.ll b/llvm/test/DebugInfo/assignment-tracking/X86/remove-redundant-defs-to-prevent-reordering.ll index b0628aecdc849..5d52cc342cf28 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/remove-redundant-defs-to-prevent-reordering.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/remove-redundant-defs-to-prevent-reordering.ll @@ -1,10 +1,19 @@ ; RUN: llc %s -stop-before finalize-isel -o - \ ; RUN: -experimental-debug-variable-locations=false \ ; RUN: | FileCheck %s --check-prefixes=CHECK,DBGVALUE --implicit-check-not="DBG_VALUE \$noreg" + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=false \ +; RUN: | FileCheck %s --check-prefixes=CHECK,DBGVALUE --implicit-check-not="DBG_VALUE \$noreg" ; RUN: llc %s -stop-before finalize-isel -o - \ ; RUN: -experimental-debug-variable-locations=true \ ; RUN: | FileCheck %s --check-prefixes=CHECK,INSTRREF --implicit-check-not="DBG_VALUE \$noreg" + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=true \ +; RUN: | FileCheck %s --check-prefixes=CHECK,INSTRREF --implicit-check-not="DBG_VALUE \$noreg" + ;; Found in the wild, but this test involves modifications from: ;; int b; ;; void ext(); diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/sdag-dangling-dbgassign.ll b/llvm/test/DebugInfo/assignment-tracking/X86/sdag-dangling-dbgassign.ll index fc05d398e6fdb..ba7d3f6a67e27 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/sdag-dangling-dbgassign.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/sdag-dangling-dbgassign.ll @@ -1,10 +1,19 @@ ; RUN: llc %s -stop-before finalize-isel -o - \ ; RUN: -experimental-debug-variable-locations=false \ ; RUN: | FileCheck %s --check-prefixes=CHECK,DBGVALUE + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=false \ +; RUN: | FileCheck %s 
--check-prefixes=CHECK,DBGVALUE ; RUN: llc %s -stop-before finalize-isel -o - \ ; RUN: -experimental-debug-variable-locations=true \ ; RUN: | FileCheck %s --check-prefixes=CHECK,INSTRREF + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=true \ +; RUN: | FileCheck %s --check-prefixes=CHECK,INSTRREF + ;-------------------------------------------------------------------- ; Adapted from sdag-dangling-dbgvalue.ll to test dbg.assign intrinsics. This ; ensures that dbg.assigns with no linked store are treated as dbg.values. For diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/sdag-ir-salvage-assign.ll b/llvm/test/DebugInfo/assignment-tracking/X86/sdag-ir-salvage-assign.ll index 57a88bcdd3bcb..38a0a8675bef2 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/sdag-ir-salvage-assign.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/sdag-ir-salvage-assign.ll @@ -2,11 +2,22 @@ ; RUN: -stop-before finalize-isel %s -o - \ ; RUN: -experimental-debug-variable-locations=false \ ; RUN: | FileCheck %s --check-prefixes=CHECK,DBGVALUE + +; RUN: llc --try-experimental-debuginfo-iterators -mtriple=x86_64-unknown-unknown -start-after=codegenprepare \ +; RUN: -stop-before finalize-isel %s -o - \ +; RUN: -experimental-debug-variable-locations=false \ +; RUN: | FileCheck %s --check-prefixes=CHECK,DBGVALUE ; RUN: llc -mtriple=x86_64-unknown-unknown -start-after=codegenprepare \ ; RUN: -stop-before finalize-isel %s -o - \ ; RUN: -experimental-debug-variable-locations=true \ ; RUN: | FileCheck %s --check-prefixes=CHECK,INSTRREF + +; RUN: llc --try-experimental-debuginfo-iterators -mtriple=x86_64-unknown-unknown -start-after=codegenprepare \ +; RUN: -stop-before finalize-isel %s -o - \ +; RUN: -experimental-debug-variable-locations=true \ +; RUN: | FileCheck %s --check-prefixes=CHECK,INSTRREF + ; Adapted from sdag-ir-salvage.ll to test dbg.assign intrinsics. This ensures ; that dbg.assigns with no linked store are treated as dbg.values. diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/sdag-transfer-dbgassign.ll b/llvm/test/DebugInfo/assignment-tracking/X86/sdag-transfer-dbgassign.ll index 1aea122f3cc2c..fba127b5f5266 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/sdag-transfer-dbgassign.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/sdag-transfer-dbgassign.ll @@ -1,10 +1,19 @@ ; RUN: llc %s -start-after=codegenprepare -stop-before finalize-isel -o - \ ; RUN: -experimental-debug-variable-locations=false \ ; RUN: | FileCheck %s --check-prefixes=CHECK,DBGVALUE + +; RUN: llc --try-experimental-debuginfo-iterators %s -start-after=codegenprepare -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=false \ +; RUN: | FileCheck %s --check-prefixes=CHECK,DBGVALUE ; RUN: llc %s -start-after=codegenprepare -stop-before finalize-isel -o - \ ; RUN: -experimental-debug-variable-locations=true \ ; RUN: | FileCheck %s --check-prefixes=CHECK,INSTRREF + +; RUN: llc --try-experimental-debuginfo-iterators %s -start-after=codegenprepare -stop-before finalize-isel -o - \ +; RUN: -experimental-debug-variable-locations=true \ +; RUN: | FileCheck %s --check-prefixes=CHECK,INSTRREF + ; Adapted from sdag-transfer-dbgvalue.ll to test dbg.assign intrinsics. This ; ensures that dbg.assigns with no linked store are treated as dbg.values. 
diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/single-memory-location-2.ll b/llvm/test/DebugInfo/assignment-tracking/X86/single-memory-location-2.ll index 485ee833ae2db..1b2d047815755 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/single-memory-location-2.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/single-memory-location-2.ll @@ -1,6 +1,10 @@ ; RUN: llc -stop-after=finalize-isel %s -o - \ ; RUN: | FileCheck %s + +; RUN: llc --try-experimental-debuginfo-iterators -stop-after=finalize-isel %s -o - \ +; RUN: | FileCheck %s + ;; Check that a dbg.assign for a fully stack-homed variable causes the variable ;; location to appear in the Machine Function side table. Similar to ;; single-memory-location.ll except this has slightly more complicated input diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/single-memory-location.ll b/llvm/test/DebugInfo/assignment-tracking/X86/single-memory-location.ll index 1f806e5255fc5..a9cc55dbe8474 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/single-memory-location.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/single-memory-location.ll @@ -1,6 +1,10 @@ ; RUN: llc -stop-after=finalize-isel %s -o - \ ; RUN: | FileCheck %s + +; RUN: llc --try-experimental-debuginfo-iterators -stop-after=finalize-isel %s -o - \ +; RUN: | FileCheck %s + ;; Check that a dbg.assign for a fully stack-homed variable causes the variable ;; location to appear in the Machine Function side table (variable 'local'). ;; diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/split-alloca.ll b/llvm/test/DebugInfo/assignment-tracking/X86/split-alloca.ll index 15f5374568bc3..045c9fd31af4c 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/split-alloca.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/split-alloca.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -o - -stop-after=finalize-isel \ ; RUN: | FileCheck %s --implicit-check-not=DBG + +; RUN: llc --try-experimental-debuginfo-iterators %s -o - -stop-after=finalize-isel \ +; RUN: | FileCheck %s --implicit-check-not=DBG + ;; Hand written. Check that we fall back to emitting a list of defs for ;; variables with split allocas (i.e. we want to see DBG_VALUEs and no ;; debug-info-variable entry in the stack slot table). diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-assignment-extra-checks.ll b/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-assignment-extra-checks.ll index c407a7da7fa2c..3dd9098df8382 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-assignment-extra-checks.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-assignment-extra-checks.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; Similarly to untagged-store-assignment-outside-variable.ll this test checks ;; that out of bounds stores that have no DIAssignID are interpreted correctly ;; (see inline comments and checks). Hand-written IR. 
diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-assignment-outside-variable.ll b/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-assignment-outside-variable.ll index 3a7c528d320a7..fe1af199d5e7e 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-assignment-outside-variable.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-assignment-outside-variable.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ + ;; Generated from following C source that contains UB (read and write to ;; out of bounds static array element. ;; int a; diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/use-known-value-at-early-mem-def-2.ll b/llvm/test/DebugInfo/assignment-tracking/X86/use-known-value-at-early-mem-def-2.ll index ebab33f73c86b..cbe9af2f7d818 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/use-known-value-at-early-mem-def-2.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/use-known-value-at-early-mem-def-2.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG_VALUE + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG_VALUE + ;; Check that sandwiching instructions between a linked store and dbg.assign ;; results in a dbg.value(prev_value) being inserted at the store, and a ;; dbg.value(deref) at the dbg.assign. diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/use-known-value-at-early-mem-def.ll b/llvm/test/DebugInfo/assignment-tracking/X86/use-known-value-at-early-mem-def.ll index d85784f1b56c9..135123ef86066 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/use-known-value-at-early-mem-def.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/use-known-value-at-early-mem-def.ll @@ -1,6 +1,10 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG_VALUE + +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG_VALUE + ;; Check that sandwiching instructions between a linked store and dbg.assign ;; results in a dbg.value(prev_value) being inserted at the store, and a ;; dbg.value(deref) at the dbg.assign. From 40bdfd39e394baa08fa67c5943c1b53c66c94bed Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Tue, 23 Jan 2024 14:30:56 +0000 Subject: [PATCH 623/843] [llvm-reduce][DebugInfo] Support reducing non-instruction debug-info (#78995) LLVM will shortly be able to represent variable locations without encoding information into intrinsics -- they'll be stored as DPValue objects instead. We'll still need to be able to llvm-reduce these variable location assignments just like we can with intrinsics today; thus, here is an llvm-reduce pass that enumerates and reduces the DPValue objects. The test for this is paradoxically written with dbg.value intrinsics: this is because we're changing all the core parts of LLVM to support this first, with the textual IR format coming last. Until that arrives, testing the llvm-reduce'ing of DPValues needs the added test using intrinsics. Using this method, we should be able to drop the variable assignment to %alsoloaded.
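For reference, a minimal sketch of driving just the new pass from the command line; the interestingness script and file names below are placeholders, not part of this patch:

    # Run only the new DPValue delta pass, opting into the new debug-info
    # format when it is built in.
    llvm-reduce --delta-passes=dpvalues --try-experimental-debuginfo-iterators \
      --test ./interesting.sh input.ll -o reduced.ll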
As with the other llvm-reduce tests, I've got one set of check lines for making the reduction happen as desired, and the other set to check the final output. --- .../tools/llvm-reduce/remove-dp-values.ll | 51 +++++++++++++++++++ llvm/tools/llvm-reduce/CMakeLists.txt | 1 + llvm/tools/llvm-reduce/DeltaManager.cpp | 8 ++- .../llvm-reduce/deltas/ReduceDPValues.cpp | 39 ++++++++++++++ .../tools/llvm-reduce/deltas/ReduceDPValues.h | 25 +++++++++ llvm/tools/llvm-reduce/llvm-reduce.cpp | 18 +++++++ 6 files changed, 140 insertions(+), 2 deletions(-) create mode 100644 llvm/test/tools/llvm-reduce/remove-dp-values.ll create mode 100644 llvm/tools/llvm-reduce/deltas/ReduceDPValues.cpp create mode 100644 llvm/tools/llvm-reduce/deltas/ReduceDPValues.h diff --git a/llvm/test/tools/llvm-reduce/remove-dp-values.ll b/llvm/test/tools/llvm-reduce/remove-dp-values.ll new file mode 100644 index 0000000000000..526dbce8456c3 --- /dev/null +++ b/llvm/test/tools/llvm-reduce/remove-dp-values.ll @@ -0,0 +1,51 @@ +; RUN: llvm-reduce --test FileCheck --test-arg --check-prefixes=CHECK-INTERESTINGNESS --test-arg %s --test-arg --input-file %s -o %t --try-experimental-debuginfo-iterators +; RUN: FileCheck --check-prefixes=CHECK-FINAL --input-file=%t %s --implicit-check-not=dbg.value + +; Test that we can, in RemoveDIs mode / DPValues mode (where variable location +; information isn't an instruction), remove one variable location assignment +; but not another. + +; CHECK-INTERESTINGNESS: call void @llvm.dbg.value(metadata i32 %added, + +; CHECK-FINAL: declare void @llvm.dbg.value(metadata, +; CHECK-FINAL: %added = add +; CHECK-FINAL-NEXT: call void @llvm.dbg.value(metadata i32 %added, + +declare void @llvm.dbg.value(metadata, metadata, metadata) + +define i32 @main() !dbg !7 { +entry: + %uninteresting1 = alloca i32, align 4 + %interesting = alloca i32, align 4 + %uninteresting2 = alloca i32, align 4 + store i32 0, ptr %uninteresting1, align 4 + store i32 0, ptr %interesting, align 4 + %0 = load i32, ptr %interesting, align 4 + %added = add nsw i32 %0, 1 + tail call void @llvm.dbg.value(metadata i32 %added, metadata !13, metadata !DIExpression()), !dbg !14 + store i32 %added, ptr %interesting, align 4 + %alsoloaded = load i32, ptr %interesting, align 4 + tail call void @llvm.dbg.value(metadata i32 %alsoloaded, metadata !13, metadata !DIExpression()), !dbg !14 + store i32 %alsoloaded, ptr %uninteresting2, align 4 + ret i32 0 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang version 14.0.0", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "/tmp/a.c", directory: "/") +!2 = !{i32 7, !"Dwarf Version", i32 4} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"uwtable", i32 1} +!6 = !{i32 7, !"frame-pointer", i32 2} +!7 = distinct !DISubprogram(name: "main", scope: !8, file: !8, line: 1, type: !9, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !12) +!8 = !DIFile(filename: "/tmp/a.c", directory: "") +!9 = !DISubroutineType(types: !10) +!10 = !{!11} +!11 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!12 = !{} +!13 = !DILocalVariable(name: "a", scope: !7, file: !8, line: 2, type: !11) +!14 = !DILocation(line: 2, column: 7, scope: !7) + diff --git a/llvm/tools/llvm-reduce/CMakeLists.txt b/llvm/tools/llvm-reduce/CMakeLists.txt index 43753768d89cb..2f1164b047853 
100644 --- a/llvm/tools/llvm-reduce/CMakeLists.txt +++ b/llvm/tools/llvm-reduce/CMakeLists.txt @@ -31,6 +31,7 @@ add_llvm_tool(llvm-reduce deltas/ReduceAttributes.cpp deltas/ReduceBasicBlocks.cpp deltas/ReduceDIMetadata.cpp + deltas/ReduceDPValues.cpp deltas/ReduceFunctionBodies.cpp deltas/ReduceFunctions.cpp deltas/ReduceGlobalObjects.cpp diff --git a/llvm/tools/llvm-reduce/DeltaManager.cpp b/llvm/tools/llvm-reduce/DeltaManager.cpp index bfe299c8b7575..56e39b8f9316f 100644 --- a/llvm/tools/llvm-reduce/DeltaManager.cpp +++ b/llvm/tools/llvm-reduce/DeltaManager.cpp @@ -20,6 +20,7 @@ #include "deltas/ReduceAttributes.h" #include "deltas/ReduceBasicBlocks.h" #include "deltas/ReduceDIMetadata.h" +#include "deltas/ReduceDPValues.h" #include "deltas/ReduceFunctionBodies.h" #include "deltas/ReduceFunctions.h" #include "deltas/ReduceGlobalObjects.h" @@ -78,9 +79,11 @@ static cl::list DELTA_PASS("aliases", reduceAliasesDeltaPass) \ DELTA_PASS("ifuncs", reduceIFuncsDeltaPass) \ DELTA_PASS("simplify-conditionals-true", reduceConditionalsTrueDeltaPass) \ - DELTA_PASS("simplify-conditionals-false", reduceConditionalsFalseDeltaPass)\ + DELTA_PASS("simplify-conditionals-false", \ + reduceConditionalsFalseDeltaPass) \ DELTA_PASS("invokes", reduceInvokesDeltaPass) \ - DELTA_PASS("unreachable-basic-blocks", reduceUnreachableBasicBlocksDeltaPass) \ + DELTA_PASS("unreachable-basic-blocks", \ + reduceUnreachableBasicBlocksDeltaPass) \ DELTA_PASS("basic-blocks", reduceBasicBlocksDeltaPass) \ DELTA_PASS("simplify-cfg", reduceUsingSimplifyCFGDeltaPass) \ DELTA_PASS("function-data", reduceFunctionDataDeltaPass) \ @@ -89,6 +92,7 @@ static cl::list DELTA_PASS("global-initializers", reduceGlobalsInitializersDeltaPass) \ DELTA_PASS("global-variables", reduceGlobalsDeltaPass) \ DELTA_PASS("di-metadata", reduceDIMetadataDeltaPass) \ + DELTA_PASS("dpvalues", reduceDPValuesDeltaPass) \ DELTA_PASS("metadata", reduceMetadataDeltaPass) \ DELTA_PASS("named-metadata", reduceNamedMetadataDeltaPass) \ DELTA_PASS("arguments", reduceArgumentsDeltaPass) \ diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDPValues.cpp b/llvm/tools/llvm-reduce/deltas/ReduceDPValues.cpp new file mode 100644 index 0000000000000..8f5bafd950e34 --- /dev/null +++ b/llvm/tools/llvm-reduce/deltas/ReduceDPValues.cpp @@ -0,0 +1,39 @@ +//===- ReduceDPValues.cpp - Specialized Delta Pass ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce uninteresting DPValues from defined functions. +// +// DPValues store variable-location debug-info and are attached to instructions. +// This information used to be represented by intrinsics such as dbg.value, and +// would naturally get reduced by llvm-reduce like any other instruction. As +// DPValues get stored elsewhere, they need to be enumerated and eliminated like +// any other data structure in LLVM. 
+// +//===----------------------------------------------------------------------===// + +#include "ReduceDPValues.h" +#include "Utils.h" +#include "llvm/ADT/STLExtras.h" + +using namespace llvm; + +static void extractDPValuesFromModule(Oracle &O, ReducerWorkItem &WorkItem) { + Module &M = WorkItem.getModule(); + + for (auto &F : M) + for (auto &BB : F) + for (auto &I : BB) + for (DPValue &DPV : llvm::make_early_inc_range(I.getDbgValueRange())) + if (!O.shouldKeep()) + DPV.eraseFromParent(); +} + +void llvm::reduceDPValuesDeltaPass(TestRunner &Test) { + runDeltaPass(Test, extractDPValuesFromModule, "Reducing DPValues"); +} diff --git a/llvm/tools/llvm-reduce/deltas/ReduceDPValues.h b/llvm/tools/llvm-reduce/deltas/ReduceDPValues.h new file mode 100644 index 0000000000000..34ebd271b4f24 --- /dev/null +++ b/llvm/tools/llvm-reduce/deltas/ReduceDPValues.h @@ -0,0 +1,25 @@ +//===- ReduceDPValues.h -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements a function which calls the Generic Delta pass in order +// to reduce uninteresting DPValues from defined functions. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEDPVALUES_H +#define LLVM_TOOLS_LLVM_REDUCE_DELTAS_REDUCEDPVALUES_H + +#include "Delta.h" +#include "llvm/IR/BasicBlock.h" +#include "llvm/IR/DebugProgramInstruction.h" + +namespace llvm { +void reduceDPValuesDeltaPass(TestRunner &Test); +} // namespace llvm + +#endif diff --git a/llvm/tools/llvm-reduce/llvm-reduce.cpp b/llvm/tools/llvm-reduce/llvm-reduce.cpp index bd2db635b4d8d..71ce0ca5ab6ab 100644 --- a/llvm/tools/llvm-reduce/llvm-reduce.cpp +++ b/llvm/tools/llvm-reduce/llvm-reduce.cpp @@ -100,6 +100,13 @@ static cl::opt "of delta passes (default=5)"), cl::init(5), cl::cat(LLVMReduceOptions)); +static cl::opt TryUseNewDbgInfoFormat( + "try-experimental-debuginfo-iterators", + cl::desc("Enable debuginfo iterator positions, if they're built in"), + cl::init(false)); + +extern cl::opt UseNewDbgInfoFormat; + static codegen::RegisterCodeGenFlags CGF; /// Turn off crash debugging features @@ -143,6 +150,17 @@ int main(int Argc, char **Argv) { cl::HideUnrelatedOptions({&LLVMReduceOptions, &getColorCategory()}); cl::ParseCommandLineOptions(Argc, Argv, "LLVM automatic testcase reducer.\n"); + // RemoveDIs debug-info transition: tests may request that we /try/ to use the + // new debug-info format, if it's built in. +#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + if (TryUseNewDbgInfoFormat) { + // If LLVM was built with support for this, turn the new debug-info format + // on. + UseNewDbgInfoFormat = true; + } +#endif + (void)TryUseNewDbgInfoFormat; + if (Argc == 1) { cl::PrintHelpMessage(); return 0; From 6bbaad1ed402490a648ca184a1f945988789b7a1 Mon Sep 17 00:00:00 2001 From: Lucas Duarte Prates Date: Tue, 23 Jan 2024 14:39:15 +0000 Subject: [PATCH 624/843] [ARM] Introduce the v9.5-A architecture version to Arm targets (#78994) This introduces the Armv9.5-A architecture version to the Arm backend, following on from the existing implementation for AArch64 targets. 
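As a sketch of the user-facing effect (the source file name below is illustrative only): the driver now accepts the new architecture string, and the preprocessor defines the matching macro, as the added tests check:

    # Accepts -march=armv9.5-a and defines __ARM_ARCH_9_5A__.
    clang --target=arm-none-none-eabi -march=armv9.5-a -E -dM example.c | grep __ARM_ARCH_9_5A__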
More details about the Armv9.5-A architecture version can be found at: * https://community.arm.com/arm-community-blogs/b/architectures-and-processors-blog/posts/arm-a-profile-architecture-developments-2023 * https://developer.arm.com/documentation/ddi0602/2023-09/ --- clang/lib/Basic/Targets/ARM.cpp | 4 ++++ clang/test/CodeGen/arm-acle-coproc.c | 2 ++ clang/test/Driver/arm-cortex-cpus-1.c | 17 +++++++++++++++++ clang/test/Preprocessor/arm-target-features.c | 5 +++++ .../llvm/TargetParser/ARMTargetParser.def | 5 +++++ llvm/lib/Target/ARM/ARM.td | 18 ++++++++++++++++++ llvm/lib/Target/ARM/ARMSubtarget.h | 1 + .../Target/ARM/MCTargetDesc/ARMELFStreamer.cpp | 1 + llvm/lib/TargetParser/ARMTargetParser.cpp | 2 ++ llvm/lib/TargetParser/Triple.cpp | 2 ++ .../TargetParser/TargetParserTest.cpp | 6 +++++- 11 files changed, 62 insertions(+), 1 deletion(-) diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index a72bd42bad415..55b71557452fa 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -227,6 +227,8 @@ StringRef ARMTargetInfo::getCPUAttr() const { return "9_3A"; case llvm::ARM::ArchKind::ARMV9_4A: return "9_4A"; + case llvm::ARM::ArchKind::ARMV9_5A: + return "9_5A"; case llvm::ARM::ArchKind::ARMV8MBaseline: return "8M_BASE"; case llvm::ARM::ArchKind::ARMV8MMainline: @@ -889,6 +891,7 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts, case llvm::ARM::ArchKind::ARMV9_2A: case llvm::ARM::ArchKind::ARMV9_3A: case llvm::ARM::ArchKind::ARMV9_4A: + case llvm::ARM::ArchKind::ARMV9_5A: // Filter __arm_cdp, __arm_ldcl, __arm_stcl in arm_acle.h FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B3; break; @@ -1057,6 +1060,7 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts, case llvm::ARM::ArchKind::ARMV9_2A: case llvm::ARM::ArchKind::ARMV9_3A: case llvm::ARM::ArchKind::ARMV9_4A: + case llvm::ARM::ArchKind::ARMV9_5A: getTargetDefinesARMV83A(Opts, Builder); break; } diff --git a/clang/test/CodeGen/arm-acle-coproc.c b/clang/test/CodeGen/arm-acle-coproc.c index cf87130932edf..0354d1297ece1 100644 --- a/clang/test/CodeGen/arm-acle-coproc.c +++ b/clang/test/CodeGen/arm-acle-coproc.c @@ -24,6 +24,7 @@ // RUN: %clang_cc1 -triple armv9.2a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple armv9.3a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple armv9.4a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s +// RUN: %clang_cc1 -triple armv9.5a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple thumbv4 %s -E -dD -o - | FileCheck --check-prefix=CHECK-V4-THUMB %s // RUN: %clang_cc1 -triple thumbv4t %s -E -dD -o - | FileCheck --check-prefix=CHECK-V4-THUMB %s // RUN: %clang_cc1 -triple thumbv5 %s -E -dD -o - | FileCheck --check-prefix=CHECK-V5-THUMB %s @@ -52,6 +53,7 @@ // RUN: %clang_cc1 -triple thumbv9.2a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple thumbv9.3a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple thumbv9.4a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s +// RUN: %clang_cc1 -triple thumbv9.5a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple thumbv8m.base %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8-BASE %s // RUN: %clang_cc1 -triple thumbv8m.main %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8-MAIN %s // RUN: %clang_cc1 -triple thumbv8.1m.main %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8-MAIN %s diff --git 
a/clang/test/Driver/arm-cortex-cpus-1.c b/clang/test/Driver/arm-cortex-cpus-1.c index 2f300efee75ed..25abbe1e3a8ad 100644 --- a/clang/test/Driver/arm-cortex-cpus-1.c +++ b/clang/test/Driver/arm-cortex-cpus-1.c @@ -478,3 +478,20 @@ // RUN: %clang -target arm -march=armebv9.4a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V94A %s // RUN: %clang -target arm -march=armebv9.4-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V94A %s // CHECK-BE-V94A: "-cc1"{{.*}} "-triple" "armebv9.4{{.*}}" "-target-cpu" "generic" + +// RUN: %clang -target armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V95A %s +// RUN: %clang -target arm -march=armv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V95A %s +// RUN: %clang -target arm -march=armv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V95A %s +// RUN: %clang -target arm -march=armv9.5a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V95A %s +// RUN: %clang -target armv9.5a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V95A %s +// RUN: %clang -target arm -march=armv9.5a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V95A %s +// RUN: %clang -target arm -mlittle-endian -march=armv9.5-a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V95A %s +// CHECK-V95A: "-cc1"{{.*}} "-triple" "armv9.5{{.*}}" "-target-cpu" "generic" + +// RUN: %clang -target armebv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V95A %s +// RUN: %clang -target armv9.5a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V95A %s +// RUN: %clang -target armeb -march=armebv9.5a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V95A %s +// RUN: %clang -target armeb -march=armebv9.5-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V95A %s +// RUN: %clang -target arm -march=armebv9.5a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V95A %s +// RUN: %clang -target arm -march=armebv9.5-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V95A %s +// CHECK-BE-V95A: "-cc1"{{.*}} "-triple" "armebv9.5{{.*}}" "-target-cpu" "generic" diff --git a/clang/test/Preprocessor/arm-target-features.c b/clang/test/Preprocessor/arm-target-features.c index 1539d03a044fa..236c9f2479b70 100644 --- a/clang/test/Preprocessor/arm-target-features.c +++ b/clang/test/Preprocessor/arm-target-features.c @@ -897,6 +897,11 @@ // CHECK-V94A: #define __ARM_ARCH_9_4A__ 1 // CHECK-V94A: #define __ARM_ARCH_PROFILE 'A' +// RUN: %clang -target armv9.5a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V95A %s +// CHECK-V95A: #define __ARM_ARCH 9 +// CHECK-V95A: #define __ARM_ARCH_9_5A__ 1 +// CHECK-V95A: #define __ARM_ARCH_PROFILE 'A' + // RUN: %clang -target arm-none-none-eabi -march=armv7-m -mfpu=softvfp -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SOFTVFP %s // CHECK-SOFTVFP-NOT: #define __ARM_FP 0x diff --git a/llvm/include/llvm/TargetParser/ARMTargetParser.def b/llvm/include/llvm/TargetParser/ARMTargetParser.def index c520ab898cb90..1797a1b238d34 100644 --- a/llvm/include/llvm/TargetParser/ARMTargetParser.def +++ b/llvm/include/llvm/TargetParser/ARMTargetParser.def @@ -151,6 +151,11 @@ ARM_ARCH("armv9.4-a", ARMV9_4A, "9.4-A", "+v9.4a", (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS | ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM)) +ARM_ARCH("armv9.5-a", ARMV9_5A, "9.5-A", "+v9.5a", + ARMBuildAttrs::CPUArch::v9_A, FK_NEON_FP_ARMV8, 
+ (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | + ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS | + ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_I8MM)) ARM_ARCH("armv8-r", ARMV8R, "8-R", "+v8r", ARMBuildAttrs::CPUArch::v8_R, FK_NEON_FP_ARMV8, (ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB | diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td index 97d1444a553eb..877781568307d 100644 --- a/llvm/lib/Target/ARM/ARM.td +++ b/llvm/lib/Target/ARM/ARM.td @@ -704,6 +704,12 @@ def HasV9_4aOps : SubtargetFeature<"v9.4a", "HasV9_4aOps", "true", "Support ARM v9.4a instructions", [HasV8_9aOps, HasV9_3aOps]>; +// Armv9.5-A is a v9-only architecture. From v9.5-A onwards there's no mapping +// to an equivalent v8.x version. +def HasV9_5aOps : SubtargetFeature<"v9.5a", "HasV9_5aOps", "true", + "Support ARM v9.5a instructions", + [HasV9_4aOps]>; + def HasV8_1MMainlineOps : SubtargetFeature< "v8.1m.main", "HasV8_1MMainlineOps", "true", "Support ARM v8-1M Mainline instructions", @@ -1139,6 +1145,18 @@ def ARMv94a : Architecture<"armv9.4-a", "ARMv94a", [HasV9_4aOps, FeatureCRC, FeatureRAS, FeatureDotProd]>; +def ARMv95a : Architecture<"armv9.5-a", "ARMv95a", [HasV9_5aOps, + FeatureAClass, + FeatureDB, + FeatureFPARMv8, + FeatureNEON, + FeatureDSP, + FeatureTrustZone, + FeatureMP, + FeatureVirtualization, + FeatureCRC, + FeatureRAS, + FeatureDotProd]>; def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops, FeatureRClass, diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 43b4123a1b557..91f3978b041a3 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -132,6 +132,7 @@ class ARMSubtarget : public ARMGenSubtargetInfo { ARMv92a, ARMv93a, ARMv94a, + ARMv95a, }; public: diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp index 9c9af6068079d..1d80af590d16e 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -898,6 +898,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() { case ARM::ArchKind::ARMV9_2A: case ARM::ArchKind::ARMV9_3A: case ARM::ArchKind::ARMV9_4A: + case ARM::ArchKind::ARMV9_5A: S.setAttributeItem(CPU_arch_profile, ApplicationProfile, false); S.setAttributeItem(ARM_ISA_use, Allowed, false); S.setAttributeItem(THUMB_ISA_use, AllowThumb32, false); diff --git a/llvm/lib/TargetParser/ARMTargetParser.cpp b/llvm/lib/TargetParser/ARMTargetParser.cpp index c470a1321fcbd..67f937ebc33f9 100644 --- a/llvm/lib/TargetParser/ARMTargetParser.cpp +++ b/llvm/lib/TargetParser/ARMTargetParser.cpp @@ -86,6 +86,7 @@ unsigned ARM::parseArchVersion(StringRef Arch) { case ArchKind::ARMV9_2A: case ArchKind::ARMV9_3A: case ArchKind::ARMV9_4A: + case ArchKind::ARMV9_5A: return 9; case ArchKind::INVALID: return 0; @@ -123,6 +124,7 @@ static ARM::ProfileKind getProfileKind(ARM::ArchKind AK) { case ARM::ArchKind::ARMV9_2A: case ARM::ArchKind::ARMV9_3A: case ARM::ArchKind::ARMV9_4A: + case ARM::ArchKind::ARMV9_5A: return ARM::ProfileKind::A; case ARM::ArchKind::ARMV4: case ARM::ArchKind::ARMV4T: diff --git a/llvm/lib/TargetParser/Triple.cpp b/llvm/lib/TargetParser/Triple.cpp index 3212ba77de9bd..0bbe8a3cedfd7 100644 --- a/llvm/lib/TargetParser/Triple.cpp +++ b/llvm/lib/TargetParser/Triple.cpp @@ -811,6 +811,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) { return Triple::ARMSubArch_v9_3a; case 
ARM::ArchKind::ARMV9_4A: return Triple::ARMSubArch_v9_4a; + case ARM::ArchKind::ARMV9_5A: + return Triple::ARMSubArch_v9_5a; case ARM::ArchKind::ARMV8R: return Triple::ARMSubArch_v8r; case ARM::ArchKind::ARMV8MBaseline: diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index f41b856da7c18..d31fcd1bb1b00 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -45,7 +45,7 @@ const char *ARMArch[] = { "armv8m.main", "iwmmxt", "iwmmxt2", "xscale", "armv8.1-m.main", "armv9-a", "armv9", "armv9a", "armv9.1-a", "armv9.1a", "armv9.2-a", "armv9.2a", "armv9.3-a", "armv9.3a", "armv9.4-a", - "armv9.4a", + "armv9.4a", "armv9.5-a", "armv9.5a", }; std::string FormatExtensionFlags(int64_t Flags) { @@ -605,6 +605,9 @@ TEST(TargetParserTest, testARMArch) { EXPECT_TRUE( testARMArch("armv9.4-a", "generic", "v9.4a", ARMBuildAttrs::CPUArch::v9_A)); + EXPECT_TRUE( + testARMArch("armv9.5-a", "generic", "v9.5a", + ARMBuildAttrs::CPUArch::v9_A)); EXPECT_TRUE( testARMArch("armv8-r", "cortex-r52", "v8r", ARMBuildAttrs::CPUArch::v8_R)); @@ -940,6 +943,7 @@ TEST(TargetParserTest, ARMparseArchProfile) { case ARM::ArchKind::ARMV9_2A: case ARM::ArchKind::ARMV9_3A: case ARM::ArchKind::ARMV9_4A: + case ARM::ArchKind::ARMV9_5A: EXPECT_EQ(ARM::ProfileKind::A, ARM::parseArchProfile(ARMArch[i])); break; default: From f2a2f8082bbb16fadf3583f7ac1b85e3e93a9f4c Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 23 Jan 2024 06:46:12 -0800 Subject: [PATCH 625/843] [CMake][PGO] Add libunwind to list of stage1 runtimes (#78869) This fixes the build since 8f90e6937a1fac80873bb2dab5f382c82ba1ba4e which made libcxxabi use llvm's libunwind by default. Fixes #78487 --- clang/cmake/caches/PGO.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/cmake/caches/PGO.cmake b/clang/cmake/caches/PGO.cmake index e1d0585e453f8..15bc755d110d1 100644 --- a/clang/cmake/caches/PGO.cmake +++ b/clang/cmake/caches/PGO.cmake @@ -2,7 +2,7 @@ set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "") set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "") set(LLVM_ENABLE_PROJECTS "clang;lld" CACHE STRING "") -set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi" CACHE STRING "") +set(LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "") set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "") set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED ON CACHE BOOL "") From c173a5bdc422ed457a04b68ec7c136add9eee11c Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Tue, 23 Jan 2024 14:52:33 +0000 Subject: [PATCH 626/843] [gn build] Port 40bdfd39e394 --- llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn index 02619de102dc7..02a1db908af3f 100644 --- a/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/tools/llvm-reduce/BUILD.gn @@ -22,6 +22,7 @@ executable("llvm-reduce") { "deltas/ReduceAttributes.cpp", "deltas/ReduceBasicBlocks.cpp", "deltas/ReduceDIMetadata.cpp", + "deltas/ReduceDPValues.cpp", "deltas/ReduceFunctionBodies.cpp", "deltas/ReduceFunctions.cpp", "deltas/ReduceGlobalObjects.cpp", From 9edd1c4daeba59b3db3390d449543b71c73f161b Mon Sep 17 00:00:00 2001 From: Saiyedul Islam Date: Tue, 23 Jan 2024 20:30:44 +0530 Subject: [PATCH 627/843] [MLIR][AMDGPU] Switch to code object version 5 (#79144) As AMDGPU backend has moved to 
cov5 as default, mlir should also switch to it. --- mlir/lib/Target/LLVM/ROCDL/Target.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mlir/lib/Target/LLVM/ROCDL/Target.cpp b/mlir/lib/Target/LLVM/ROCDL/Target.cpp index a589e5cbbb619..cdcef1d6b459c 100644 --- a/mlir/lib/Target/LLVM/ROCDL/Target.cpp +++ b/mlir/lib/Target/LLVM/ROCDL/Target.cpp @@ -238,7 +238,7 @@ void SerializeGPUModuleBase::addControlVariables( addControlVariable("__oclc_wavefrontsize64", wave64); llvm::Type *i32Ty = llvm::Type::getInt32Ty(module.getContext()); - int abi = 400; + int abi = 500; abiVer.getAsInteger(0, abi); llvm::GlobalVariable *abiVersion = new llvm::GlobalVariable( module, i32Ty, true, llvm::GlobalValue::LinkageTypes::LinkOnceODRLinkage, From 087172258a50d5bcabe43aff072a20701f0808ef Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Tue, 12 Dec 2023 17:28:32 +0000 Subject: [PATCH 628/843] [DebugInfo][RemoveDIs] Handle non-instr debug-info in GlobalISel (#75228) The RemoveDIs project is aiming to eliminate debug intrinsics like dbg.value and dbg.declare from LLVM, and replace them with DPValue objects attached to instructions. ISel is one of the "terminals" where that information needs to be converted into MIR format: this patch implements support for that in GlobalISel. We aim for the output of LLVM to be identical with/without RemoveDIs debug-info. This patch should be NFC, as we're handling the same data about variables stored in a different format -- it now appears in a DPValue object rather than as an intrinsic. To that end, I've refactored the handling of dbg.values into a dedicated function, and call it whenever a dbg.value or a DPValue is encountered. dbg.declare is handled in a similar way. Testing: adding the --try-experimental-debuginfo-iterators switch to llc causes it to try and convert to the "new" debug-info format if it's built in (LLVM_EXPERIMENTAL_DEBUGINFO_ITERATORS=On), and it'll be covered by our buildbot. One test has a few extra wildcard-regexes added: this is because there's some extra data printed about attached debug-info, which is safe to ignore. --- .../llvm/CodeGen/GlobalISel/IRTranslator.h | 37 ++- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 212 ++++++++++-------- ...ine-shift-of-shifted-dbg-value-fallback.ll | 1 + .../CodeGen/AArch64/GlobalISel/debug-cpp.ll | 2 + .../CodeGen/AArch64/GlobalISel/debug-insts.ll | 4 + .../GlobalISel/irtranslator-dilocation.ll | 18 +- .../irtranslator-extract-used-by-dbg.ll | 1 + .../GlobalISel/x86-calllowering-dbg-trunc.ll | 1 + 8 files changed, 170 insertions(+), 106 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h index 1b094d9d9fe77..5454df02914af 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/IRTranslator.h @@ -204,6 +204,27 @@ class IRTranslator : public MachineFunctionPass { /// \return true if the materialization succeeded. bool translate(const Constant &C, Register Reg); + /// Examine any debug-info attached to the instruction (in the form of + /// DPValues) and translate it. + void translateDbgInfo(const Instruction &Inst, + MachineIRBuilder &MIRBuilder); + + /// Translate a debug-info record of a dbg.value into a DBG_* instruction. + /// Pass in all the contents of the record, rather than relying on how it's + /// stored. 
+ void translateDbgValueRecord(Value *V, bool HasArgList, + const DILocalVariable *Variable, + const DIExpression *Expression, const DebugLoc &DL, + MachineIRBuilder &MIRBuilder); + + /// Translate a debug-info record of a dbg.declare into an indirect DBG_* + /// instruction. Pass in all the contents of the record, rather than relying + /// on how it's stored. + void translateDbgDeclareRecord(Value *Address, bool HasArgList, + const DILocalVariable *Variable, + const DIExpression *Expression, const DebugLoc &DL, + MachineIRBuilder &MIRBuilder); + // Translate U as a copy of V. bool translateCopy(const User &U, const Value &V, MachineIRBuilder &MIRBuilder); @@ -250,14 +271,14 @@ class IRTranslator : public MachineFunctionPass { /// possible. std::optional getArgPhysReg(Argument &Arg); - /// If DebugInst targets an Argument and its expression is an EntryValue, - /// lower it as an entry in the MF debug table. - bool translateIfEntryValueArgument(const DbgDeclareInst &DebugInst); - - /// If DebugInst targets an Argument and its expression is an EntryValue, - /// lower as a DBG_VALUE targeting the corresponding livein register for that - /// Argument. - bool translateIfEntryValueArgument(const DbgValueInst &DebugInst, + /// If debug-info targets an Argument and its expression is an EntryValue, + /// lower it as either an entry in the MF debug table (dbg.declare), or a + /// DBG_VALUE targeting the corresponding livein register for that Argument + /// (dbg.value). + bool translateIfEntryValueArgument(bool isDeclare, Value *Arg, + const DILocalVariable *Var, + const DIExpression *Expr, + const DebugLoc &DL, MachineIRBuilder &MIRBuilder); bool translateInlineAsm(const CallBase &CB, MachineIRBuilder &MIRBuilder); diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index 662de0f3fe0e5..1a71c1232c706 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -162,7 +162,8 @@ class DILocationVerifier : public GISelChangeObserver { // they could have originated from constants, and we don't want a jumpy // debug experience. assert((CurrInst->getDebugLoc() == MI.getDebugLoc() || - (MI.getParent()->isEntryBlock() && !MI.getDebugLoc())) && + (MI.getParent()->isEntryBlock() && !MI.getDebugLoc()) || + (MI.isDebugInstr())) && "Line info was not transferred to all instructions"); } }; @@ -2006,47 +2007,35 @@ std::optional IRTranslator::getArgPhysReg(Argument &Arg) { return VRegDef->getOperand(1).getReg().asMCReg(); } -bool IRTranslator::translateIfEntryValueArgument(const DbgValueInst &DebugInst, +bool IRTranslator::translateIfEntryValueArgument(bool isDeclare, Value *Val, + const DILocalVariable *Var, + const DIExpression *Expr, + const DebugLoc &DL, MachineIRBuilder &MIRBuilder) { - auto *Arg = dyn_cast(DebugInst.getValue()); + auto *Arg = dyn_cast(Val); if (!Arg) return false; - const DIExpression *Expr = DebugInst.getExpression(); if (!Expr->isEntryValue()) return false; std::optional PhysReg = getArgPhysReg(*Arg); if (!PhysReg) { - LLVM_DEBUG(dbgs() << "Dropping dbg.value: expression is entry_value but " - "couldn't find a physical register\n" - << DebugInst << "\n"); + LLVM_DEBUG(dbgs() << "Dropping dbg." << (isDeclare ? 
"declare" : "value") + << ": expression is entry_value but " + << "couldn't find a physical register\n"); + LLVM_DEBUG(dbgs() << *Var << "\n"); return true; } - MIRBuilder.buildDirectDbgValue(*PhysReg, DebugInst.getVariable(), - DebugInst.getExpression()); - return true; -} - -bool IRTranslator::translateIfEntryValueArgument( - const DbgDeclareInst &DebugInst) { - auto *Arg = dyn_cast(DebugInst.getAddress()); - if (!Arg) - return false; - - const DIExpression *Expr = DebugInst.getExpression(); - if (!Expr->isEntryValue()) - return false; - - std::optional PhysReg = getArgPhysReg(*Arg); - if (!PhysReg) - return false; + if (isDeclare) { + // Append an op deref to account for the fact that this is a dbg_declare. + Expr = DIExpression::append(Expr, dwarf::DW_OP_deref); + MF->setVariableDbgInfo(Var, Expr, *PhysReg, DL); + } else { + MIRBuilder.buildDirectDbgValue(*PhysReg, Var, Expr); + } - // Append an op deref to account for the fact that this is a dbg_declare. - Expr = DIExpression::append(Expr, dwarf::DW_OP_deref); - MF->setVariableDbgInfo(DebugInst.getVariable(), Expr, *PhysReg, - DebugInst.getDebugLoc()); return true; } @@ -2100,32 +2089,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::dbg_declare: { const DbgDeclareInst &DI = cast(CI); assert(DI.getVariable() && "Missing variable"); - - const Value *Address = DI.getAddress(); - if (!Address || isa(Address)) { - LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); - return true; - } - - assert(DI.getVariable()->isValidLocationForIntrinsic( - MIRBuilder.getDebugLoc()) && - "Expected inlined-at fields to agree"); - auto AI = dyn_cast(Address); - if (AI && AI->isStaticAlloca()) { - // Static allocas are tracked at the MF level, no need for DBG_VALUE - // instructions (in fact, they get ignored if they *do* exist). - MF->setVariableDbgInfo(DI.getVariable(), DI.getExpression(), - getOrCreateFrameIndex(*AI), DI.getDebugLoc()); - return true; - } - - if (translateIfEntryValueArgument(DI)) - return true; - - // A dbg.declare describes the address of a source variable, so lower it - // into an indirect DBG_VALUE. - MIRBuilder.buildIndirectDbgValue(getOrCreateVReg(*Address), - DI.getVariable(), DI.getExpression()); + translateDbgDeclareRecord(DI.getAddress(), DI.hasArgList(), DI.getVariable(), + DI.getExpression(), DI.getDebugLoc(), MIRBuilder); return true; } case Intrinsic::dbg_label: { @@ -2158,41 +2123,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::dbg_value: { // This form of DBG_VALUE is target-independent. const DbgValueInst &DI = cast(CI); - const Value *V = DI.getValue(); - assert(DI.getVariable()->isValidLocationForIntrinsic( - MIRBuilder.getDebugLoc()) && - "Expected inlined-at fields to agree"); - if (!V || DI.hasArgList()) { - // DI cannot produce a valid DBG_VALUE, so produce an undef DBG_VALUE to - // terminate any prior location. - MIRBuilder.buildIndirectDbgValue(0, DI.getVariable(), DI.getExpression()); - return true; - } - if (const auto *CI = dyn_cast(V)) { - MIRBuilder.buildConstDbgValue(*CI, DI.getVariable(), DI.getExpression()); - return true; - } - if (auto *AI = dyn_cast(V); - AI && AI->isStaticAlloca() && DI.getExpression()->startsWithDeref()) { - // If the value is an alloca and the expression starts with a - // dereference, track a stack slot instead of a register, as registers - // may be clobbered. 
- auto ExprOperands = DI.getExpression()->getElements(); - auto *ExprDerefRemoved = - DIExpression::get(AI->getContext(), ExprOperands.drop_front()); - MIRBuilder.buildFIDbgValue(getOrCreateFrameIndex(*AI), DI.getVariable(), - ExprDerefRemoved); - return true; - } - if (translateIfEntryValueArgument(DI, MIRBuilder)) - return true; - for (Register Reg : getOrCreateVRegs(*V)) { - // FIXME: This does not handle register-indirect values at offset 0. The - // direct/indirect thing shouldn't really be handled by something as - // implicit as reg+noreg vs reg+imm in the first place, but it seems - // pretty baked in right now. - MIRBuilder.buildDirectDbgValue(Reg, DI.getVariable(), DI.getExpression()); - } + translateDbgValueRecord(DI.getValue(), DI.hasArgList(), DI.getVariable(), + DI.getExpression(), DI.getDebugLoc(), MIRBuilder); return true; } case Intrinsic::uadd_with_overflow: @@ -3250,6 +3182,102 @@ void IRTranslator::finishPendingPhis() { } } +void IRTranslator::translateDbgValueRecord(Value *V, bool HasArgList, + const DILocalVariable *Variable, + const DIExpression *Expression, + const DebugLoc &DL, + MachineIRBuilder &MIRBuilder) { + assert(Variable->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + // Act as if we're handling a debug intrinsic. + MIRBuilder.setDebugLoc(DL); + + if (!V || HasArgList) { + // DI cannot produce a valid DBG_VALUE, so produce an undef DBG_VALUE to + // terminate any prior location. + MIRBuilder.buildIndirectDbgValue(0, Variable, Expression); + return; + } + + if (const auto *CI = dyn_cast(V)) { + MIRBuilder.buildConstDbgValue(*CI, Variable, Expression); + return; + } + + if (auto *AI = dyn_cast(V); + AI && AI->isStaticAlloca() && Expression->startsWithDeref()) { + // If the value is an alloca and the expression starts with a + // dereference, track a stack slot instead of a register, as registers + // may be clobbered. + auto ExprOperands = Expression->getElements(); + auto *ExprDerefRemoved = + DIExpression::get(AI->getContext(), ExprOperands.drop_front()); + MIRBuilder.buildFIDbgValue(getOrCreateFrameIndex(*AI), Variable, + ExprDerefRemoved); + return; + } + if (translateIfEntryValueArgument(false, V, Variable, Expression, DL, + MIRBuilder)) + return; + for (Register Reg : getOrCreateVRegs(*V)) { + // FIXME: This does not handle register-indirect values at offset 0. The + // direct/indirect thing shouldn't really be handled by something as + // implicit as reg+noreg vs reg+imm in the first place, but it seems + // pretty baked in right now. + MIRBuilder.buildDirectDbgValue(Reg, Variable, Expression); + } + return; +} + +void IRTranslator::translateDbgDeclareRecord(Value *Address, bool HasArgList, + const DILocalVariable *Variable, + const DIExpression *Expression, + const DebugLoc &DL, + MachineIRBuilder &MIRBuilder) { + if (!Address || isa(Address)) { + LLVM_DEBUG(dbgs() << "Dropping debug info for " << *Variable << "\n"); + return; + } + + assert(Variable->isValidLocationForIntrinsic(DL) && + "Expected inlined-at fields to agree"); + auto AI = dyn_cast(Address); + if (AI && AI->isStaticAlloca()) { + // Static allocas are tracked at the MF level, no need for DBG_VALUE + // instructions (in fact, they get ignored if they *do* exist). 
+ MF->setVariableDbgInfo(Variable, Expression, + getOrCreateFrameIndex(*AI), DL); + return; + } + + if (translateIfEntryValueArgument(true, Address, Variable, + Expression, DL, + MIRBuilder)) + return; + + // A dbg.declare describes the address of a source variable, so lower it + // into an indirect DBG_VALUE. + MIRBuilder.setDebugLoc(DL); + MIRBuilder.buildIndirectDbgValue(getOrCreateVReg(*Address), + Variable, Expression); + return; +} + +void IRTranslator::translateDbgInfo(const Instruction &Inst, + MachineIRBuilder &MIRBuilder) { + for (DPValue &DPV : Inst.getDbgValueRange()) { + const DILocalVariable *Variable = DPV.getVariable(); + const DIExpression *Expression = DPV.getExpression(); + Value *V = DPV.getVariableLocationOp(0); + if (DPV.isDbgDeclare()) + translateDbgDeclareRecord(V, DPV.hasArgList(), Variable, + Expression, DPV.getDebugLoc(), MIRBuilder); + else + translateDbgValueRecord(V, DPV.hasArgList(), Variable, + Expression, DPV.getDebugLoc(), MIRBuilder); + } +} + bool IRTranslator::translate(const Instruction &Inst) { CurBuilder->setDebugLoc(Inst.getDebugLoc()); CurBuilder->setPCSections(Inst.getMetadata(LLVMContext::MD_pcsections)); @@ -3760,6 +3788,10 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) { #ifndef NDEBUG Verifier.setCurrentInst(&Inst); #endif // ifndef NDEBUG + + // Translate any debug-info attached to the instruction. + translateDbgInfo(Inst, *CurBuilder.get()); + if (translate(Inst)) continue; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-of-shifted-dbg-value-fallback.ll b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-of-shifted-dbg-value-fallback.ll index adaf54cbc9620..930d39dfa298b 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-of-shifted-dbg-value-fallback.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/combine-shift-of-shifted-dbg-value-fallback.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -global-isel -mtriple=arm64-linux-gnu -global-isel-abort=1 | FileCheck %s +; RUN: llc < %s -global-isel -mtriple=arm64-linux-gnu -global-isel-abort=1 --try-experimental-debuginfo-iterators | FileCheck %s target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios9.0.0" diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll b/llvm/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll index cb4a01cbcf092..485ab18ec7db9 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/debug-cpp.ll @@ -1,5 +1,7 @@ ; RUN: llc -global-isel -mtriple=aarch64 %s -stop-after=irtranslator -o - | FileCheck %s ; RUN: llc -mtriple=aarch64 -global-isel --global-isel-abort=0 -o /dev/null +; RUN: llc -global-isel -mtriple=aarch64 %s -stop-after=irtranslator -o - --try-experimental-debuginfo-iterators | FileCheck %s +; RUN: llc -mtriple=aarch64 -global-isel --global-isel-abort=0 -o /dev/null --try-experimental-debuginfo-iterators ; struct NTCopy { ; NTCopy(); diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/debug-insts.ll b/llvm/test/CodeGen/AArch64/GlobalISel/debug-insts.ll index 47ddca8fd0577..960ea4e5b9f42 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/debug-insts.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/debug-insts.ll @@ -2,6 +2,10 @@ ; RUN: llc -global-isel -mtriple=aarch64 %s -stop-after=irtranslator -o - | FileCheck %s ; RUN: llc -mtriple=aarch64 -global-isel --global-isel-abort=0 %s -o /dev/null ; RUN: llc -mtriple=aarch64 -global-isel --global-isel-abort=0 %s -o /dev/null -debug 
+; +; RUN: llc -global-isel -mtriple=aarch64 %s -stop-after=irtranslator -o - --try-experimental-debuginfo-iterators | FileCheck %s +; RUN: llc -mtriple=aarch64 -global-isel --global-isel-abort=0 %s -o /dev/null --try-experimental-debuginfo-iterators +; RUN: llc -mtriple=aarch64 -global-isel --global-isel-abort=0 %s -o /dev/null -debug --try-experimental-debuginfo-iterators ; CHECK-LABEL: name: debug_declare ; CHECK: stack: diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll index a8fc761a3a533..fc17a62a4d704 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-dilocation.ll @@ -1,16 +1,18 @@ ; RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -debug-only=irtranslator \ ; RUN: -stop-after=irtranslator %s -o - 2>&1 | FileCheck %s +; RUN: llc -O0 -mtriple=aarch64-apple-ios -global-isel -debug-only=irtranslator \ +; RUN: -stop-after=irtranslator %s -o - 2>&1 --try-experimental-debuginfo-iterators | FileCheck %s ; REQUIRES: asserts -; CHECK: Checking DILocation from %retval = alloca i32, align 4 was copied to G_FRAME_INDEX -; CHECK: Checking DILocation from %rv = alloca i32, align 4 was copied to G_FRAME_INDEX -; CHECK: Checking DILocation from store i32 0, ptr %retval, align 4 was copied to G_CONSTANT -; CHECK: Checking DILocation from store i32 0, ptr %retval, align 4 was copied to G_STORE -; CHECK: Checking DILocation from store i32 0, ptr %rv, align 4, !dbg !12 was copied to G_STORE debug-location !12; t.cpp:2:5 -; CHECK: Checking DILocation from %0 = load i32, ptr %rv, align 4, !dbg !13 was copied to G_LOAD debug-location !13; t.cpp:3:8 -; CHECK: Checking DILocation from ret i32 %0, !dbg !14 was copied to COPY debug-location !14; t.cpp:3:1 -; CHECK: Checking DILocation from ret i32 %0, !dbg !14 was copied to RET_ReallyLR implicit $w0, debug-location !14; t.cpp:3:1 +; CHECK: Checking DILocation from %retval = alloca i32, align 4{{.*}} was copied to G_FRAME_INDEX +; CHECK: Checking DILocation from %rv = alloca i32, align 4{{.*}} was copied to G_FRAME_INDEX +; CHECK: Checking DILocation from store i32 0, ptr %retval, align 4{{.*}} was copied to G_CONSTANT +; CHECK: Checking DILocation from store i32 0, ptr %retval, align 4{{.*}} was copied to G_STORE +; CHECK: Checking DILocation from store i32 0, ptr %rv, align 4, !dbg !12{{.*}} was copied to G_STORE debug-location !12; t.cpp:2:5 +; CHECK: Checking DILocation from %0 = load i32, ptr %rv, align 4, !dbg !13{{.*}} was copied to G_LOAD debug-location !13; t.cpp:3:8 +; CHECK: Checking DILocation from ret i32 %0, !dbg !14{{.*}} was copied to COPY debug-location !14; t.cpp:3:1 +; CHECK: Checking DILocation from ret i32 %0, !dbg !14{{.*}} was copied to RET_ReallyLR implicit $w0, debug-location !14; t.cpp:3:1 source_filename = "t.cpp" target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-extract-used-by-dbg.ll b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-extract-used-by-dbg.ll index 9f398b4a9d3b1..1304747789f2a 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-extract-used-by-dbg.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/irtranslator-extract-used-by-dbg.ll @@ -1,4 +1,5 @@ ; RUN: llc -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s +; RUN: llc -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 
--try-experimental-debuginfo-iterators | FileCheck %s target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-fuchsia" diff --git a/llvm/test/CodeGen/X86/GlobalISel/x86-calllowering-dbg-trunc.ll b/llvm/test/CodeGen/X86/GlobalISel/x86-calllowering-dbg-trunc.ll index 90c9290a83bbd..f9c34d3b0bb27 100644 --- a/llvm/test/CodeGen/X86/GlobalISel/x86-calllowering-dbg-trunc.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/x86-calllowering-dbg-trunc.ll @@ -1,4 +1,5 @@ ; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - | FileCheck %s --check-prefix=ALL +; RUN: llc -mtriple=i386-linux-gnu -global-isel -verify-machineinstrs < %s -o - --try-experimental-debuginfo-iterators | FileCheck %s --check-prefix=ALL ; This file is the output of clang -g -O2 ; int test_dbg_trunc(unsigned long long a) { return a; } From 10a3e9138ec9fd9f3f563bb1b6e014f4e3d4ebc9 Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Tue, 23 Jan 2024 16:14:04 +0100 Subject: [PATCH 629/843] [libc] Fix forward arm32 buildbot (#79151) Fix forward #79128 --- libc/src/__support/FPUtil/arm/FEnvImpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/src/__support/FPUtil/arm/FEnvImpl.h b/libc/src/__support/FPUtil/arm/FEnvImpl.h index 1a89de50b6b60..ac4673cf20f63 100644 --- a/libc/src/__support/FPUtil/arm/FEnvImpl.h +++ b/libc/src/__support/FPUtil/arm/FEnvImpl.h @@ -135,8 +135,8 @@ LIBC_INLINE int set_except(int excepts) { LIBC_INLINE int raise_except(int excepts) { float zero = 0.0f; float one = 1.0f; - float large_value = FPBits::max_normal(); - float small_value = FPBits::min_normal(); + float large_value = FPBits::max_normal().get_val(); + float small_value = FPBits::min_normal().get_val(); auto divfunc = [](float a, float b) { __asm__ __volatile__("flds s0, %0\n\t" "flds s1, %1\n\t" From 5f47687c82e6a9246f9fd852185fbba43385abe3 Mon Sep 17 00:00:00 2001 From: Dinar Temirbulatov Date: Tue, 23 Jan 2024 15:16:55 +0000 Subject: [PATCH 630/843] Remove config.aarch64_sme from compiler-rt/unittests/lit.common.unit.configured.in --- compiler-rt/unittests/lit.common.unit.configured.in | 1 - 1 file changed, 1 deletion(-) diff --git a/compiler-rt/unittests/lit.common.unit.configured.in b/compiler-rt/unittests/lit.common.unit.configured.in index 23ec222697712..3e42e83c9e70a 100644 --- a/compiler-rt/unittests/lit.common.unit.configured.in +++ b/compiler-rt/unittests/lit.common.unit.configured.in @@ -7,7 +7,6 @@ config.llvm_obj_root = "@LLVM_BINARY_DIR@" config.llvm_tools_dir = lit_config.substitute("@LLVM_TOOLS_DIR@") config.compiler_rt_src_root = "@COMPILER_RT_SOURCE_DIR@" config.compiler_rt_libdir = lit_config.substitute("@COMPILER_RT_RESOLVED_LIBRARY_OUTPUT_DIR@") -config.aarch64_sme = @COMPILER_RT_HAS_AARCH64_SME@ config.enable_per_target_runtime_dir = @LLVM_ENABLE_PER_TARGET_RUNTIME_DIR_PYBOOL@ config.llvm_build_mode = lit_config.substitute("@LLVM_BUILD_MODE@") config.host_arch = "@HOST_ARCH@" From 291ac25298f797f99d880833c20756d82d779c02 Mon Sep 17 00:00:00 2001 From: Qiongsi Wu <274595+qiongsiwu@users.noreply.github.com> Date: Tue, 23 Jan 2024 10:26:59 -0500 Subject: [PATCH 631/843] [PGO] Remove calls to `__llvm_orderfile_dump()` in `instrprof-api.c` test (#79150) https://github.com/llvm/llvm-project/pull/78285 added a test which calls `__llvm_orderfile_dump()`, a functionality that is not supported on many platforms. 
This PR removes the call to `__llvm_orderfile_dump()` to avoid failures on unsupported platforms, and turns the test on for Windows, so the rest of the supported API is still tested. --- compiler-rt/test/profile/instrprof-api.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/compiler-rt/test/profile/instrprof-api.c b/compiler-rt/test/profile/instrprof-api.c index fedec2d1afc48..07c5b2f453907 100644 --- a/compiler-rt/test/profile/instrprof-api.c +++ b/compiler-rt/test/profile/instrprof-api.c @@ -1,6 +1,3 @@ -// UNSUPPORTED: target={{.*windows.*}} -// __llvm_orderfile_dump() is not supported on Windows. - // Testing profile generate. // RUN: %clang_profgen %s -S -emit-llvm -o - | FileCheck %s --check-prefix=PROFGEN // RUN: %clang_pgogen %s -S -emit-llvm -o - | FileCheck %s --check-prefix=PROFGEN @@ -38,12 +35,8 @@ int main() { return 2; // PROFGEN: %{{.*}} = call {{(signext )*}}i32 @__llvm_profile_dump() // PROFUSE-NOT: %{{.*}} = call {{(signext )*}}i32 @__llvm_profile_dump() - __llvm_orderfile_dump(); - // PROFGEN: %{{.*}} = call {{(signext )*}}i32 @__llvm_orderfile_dump() - // PROFUSE-NOT: %{{.*}} = call {{(signext )*}}i32 @__llvm_orderfile_dump() return z + bar() - 11; } // PROFUSE-NOT: declare void @__llvm_profile_set_filename(ptr noundef) // PROFUSE-NOT: declare signext i32 @__llvm_profile_dump() -// PROFUSE-NOT: declare signext i32 @__llvm_orderfile_dump() From 77e204c7b04f1f516db9a1dd5602e4a853bb0d1c Mon Sep 17 00:00:00 2001 From: Kyungwoo Lee Date: Tue, 23 Jan 2024 07:31:34 -0800 Subject: [PATCH 632/843] [lld-macho][arm64] implement -objc_stubs_small (#78665) This patch implements `-objc_stubs_small` targeting arm64, aiming to align with ld64's behavior. 1. `-objc_stubs_fast`: As previously implemented, this always uses the Global Offset Table (GOT) to invoke `objc_msgSend`. The alignment of the objc stub is 32 bytes. 2. `-objc_stubs_small`: Behavior depends on whether `objc_msgSend` is a defined symbol. If it is, the stub jumps directly to `objc_msgSend`. If not, it jumps to `objc_msgSend` indirectly through another stub, minimizing the objc stub's size. The alignment of the objc stub in this case is 4 bytes.
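For illustration, a minimal sketch of opting into the new mode; the object and output file names are placeholders, and the usual linker inputs (SDK paths, libraries) are omitted:

    # Request the 4-byte-aligned small stubs rather than -objc_stubs_fast.
    ld64.lld -arch arm64 -objc_stubs_small main.o -o a.out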
--- lld/MachO/Arch/ARM64.cpp | 51 ++++++++++++++---- lld/MachO/Arch/ARM64Common.h | 32 +++++++++-- lld/MachO/Arch/ARM64_32.cpp | 11 ++-- lld/MachO/Arch/X86_64.cpp | 20 ++++--- lld/MachO/Driver.cpp | 10 ++-- lld/MachO/SyntheticSections.cpp | 42 +++++++++++---- lld/MachO/SyntheticSections.h | 2 +- lld/MachO/Target.h | 9 ++-- lld/test/MachO/arm64-objc-stubs-dyn.s | 76 +++++++++++++++++++++++++++ lld/test/MachO/arm64-objc-stubs.s | 32 +++++++++-- lld/test/MachO/x86-64-objc-stubs.s | 3 ++ 11 files changed, 237 insertions(+), 51 deletions(-) create mode 100644 lld/test/MachO/arm64-objc-stubs-dyn.s diff --git a/lld/MachO/Arch/ARM64.cpp b/lld/MachO/Arch/ARM64.cpp index e3781763c6102..2741df9c3070e 100644 --- a/lld/MachO/Arch/ARM64.cpp +++ b/lld/MachO/Arch/ARM64.cpp @@ -37,9 +37,9 @@ struct ARM64 : ARM64Common { uint64_t entryAddr) const override; void writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, uint64_t stubsAddr, - uint64_t stubOffset, uint64_t selrefsVA, - uint64_t selectorIndex, uint64_t gotAddr, - uint64_t msgSendIndex) const override; + uint64_t &stubOffset, uint64_t selrefsVA, + uint64_t selectorIndex, + Symbol *objcMsgSend) const override; void populateThunk(InputSection *thunk, Symbol *funcSym) override; void applyOptimizationHints(uint8_t *, const ObjFile &) const override; }; @@ -117,13 +117,42 @@ static constexpr uint32_t objcStubsFastCode[] = { 0xd4200020, // brk #0x1 }; +static constexpr uint32_t objcStubsSmallCode[] = { + 0x90000001, // adrp x1, __objc_selrefs@page + 0xf9400021, // ldr x1, [x1, @selector("foo")@pageoff] + 0x14000000, // b _objc_msgSend +}; + void ARM64::writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, uint64_t stubsAddr, - uint64_t stubOffset, uint64_t selrefsVA, - uint64_t selectorIndex, uint64_t gotAddr, - uint64_t msgSendIndex) const { - ::writeObjCMsgSendStub(buf, objcStubsFastCode, sym, stubsAddr, - stubOffset, selrefsVA, selectorIndex, gotAddr, - msgSendIndex); + uint64_t &stubOffset, uint64_t selrefsVA, + uint64_t selectorIndex, + Symbol *objcMsgSend) const { + uint64_t objcMsgSendAddr; + uint64_t objcStubSize; + uint64_t objcMsgSendIndex; + + if (config->objcStubsMode == ObjCStubsMode::fast) { + objcStubSize = target->objcStubsFastSize; + objcMsgSendAddr = in.got->addr; + objcMsgSendIndex = objcMsgSend->gotIndex; + ::writeObjCMsgSendFastStub(buf, objcStubsFastCode, sym, stubsAddr, + stubOffset, selrefsVA, selectorIndex, + objcMsgSendAddr, objcMsgSendIndex); + } else { + assert(config->objcStubsMode == ObjCStubsMode::small); + objcStubSize = target->objcStubsSmallSize; + if (auto *d = dyn_cast(objcMsgSend)) { + objcMsgSendAddr = d->getVA(); + objcMsgSendIndex = 0; + } else { + objcMsgSendAddr = in.stubs->addr; + objcMsgSendIndex = objcMsgSend->stubsIndex; + } + ::writeObjCMsgSendSmallStub(buf, objcStubsSmallCode, sym, stubsAddr, + stubOffset, selrefsVA, selectorIndex, + objcMsgSendAddr, objcMsgSendIndex); + } + stubOffset += objcStubSize; } // A thunk is the relaxed variation of stubCode. 
We don't need the @@ -157,7 +186,9 @@ ARM64::ARM64() : ARM64Common(LP64()) { thunkSize = sizeof(thunkCode); objcStubsFastSize = sizeof(objcStubsFastCode); - objcStubsAlignment = 32; + objcStubsFastAlignment = 32; + objcStubsSmallSize = sizeof(objcStubsSmallCode); + objcStubsSmallAlignment = 4; // Branch immediate is two's complement 26 bits, which is implicitly // multiplied by 4 (since all functions are 4-aligned: The branch range diff --git a/lld/MachO/Arch/ARM64Common.h b/lld/MachO/Arch/ARM64Common.h index 9cfccb6cec761..b038b6200f4d5 100644 --- a/lld/MachO/Arch/ARM64Common.h +++ b/lld/MachO/Arch/ARM64Common.h @@ -154,10 +154,10 @@ inline void writeStubHelperEntry(uint8_t *buf8, template inline void -writeObjCMsgSendStub(uint8_t *buf, const uint32_t objcStubsFastCode[8], - Symbol *sym, uint64_t stubsAddr, uint64_t stubOffset, - uint64_t selrefsVA, uint64_t selectorIndex, - uint64_t gotAddr, uint64_t msgSendIndex) { +writeObjCMsgSendFastStub(uint8_t *buf, const uint32_t objcStubsFastCode[8], + Symbol *sym, uint64_t stubsAddr, uint64_t stubOffset, + uint64_t selrefsVA, uint64_t selectorIndex, + uint64_t gotAddr, uint64_t msgSendIndex) { SymbolDiagnostic d = {sym, sym->getName()}; auto *buf32 = reinterpret_cast(buf); @@ -180,6 +180,30 @@ writeObjCMsgSendStub(uint8_t *buf, const uint32_t objcStubsFastCode[8], buf32[7] = objcStubsFastCode[7]; } +template +inline void +writeObjCMsgSendSmallStub(uint8_t *buf, const uint32_t objcStubsSmallCode[3], + Symbol *sym, uint64_t stubsAddr, uint64_t stubOffset, + uint64_t selrefsVA, uint64_t selectorIndex, + uint64_t msgSendAddr, uint64_t msgSendIndex) { + SymbolDiagnostic d = {sym, sym->getName()}; + auto *buf32 = reinterpret_cast(buf); + + auto pcPageBits = [stubsAddr, stubOffset](int i) { + return pageBits(stubsAddr + stubOffset + i * sizeof(uint32_t)); + }; + + uint64_t selectorOffset = selectorIndex * LP::wordSize; + encodePage21(&buf32[0], d, objcStubsSmallCode[0], + pageBits(selrefsVA + selectorOffset) - pcPageBits(0)); + encodePageOff12(&buf32[1], d, objcStubsSmallCode[1], + selrefsVA + selectorOffset); + uint64_t msgSendStubVA = msgSendAddr + msgSendIndex * target->stubSize; + uint64_t pcVA = stubsAddr + stubOffset + 2 * sizeof(uint32_t); + encodeBranch26(&buf32[2], {nullptr, "objc_msgSend stub"}, + objcStubsSmallCode[2], msgSendStubVA - pcVA); +} + } // namespace lld::macho #endif diff --git a/lld/MachO/Arch/ARM64_32.cpp b/lld/MachO/Arch/ARM64_32.cpp index c6bb6ee6c1c7d..16c7cbee9ba7e 100644 --- a/lld/MachO/Arch/ARM64_32.cpp +++ b/lld/MachO/Arch/ARM64_32.cpp @@ -34,9 +34,9 @@ struct ARM64_32 : ARM64Common { void writeStubHelperEntry(uint8_t *buf, const Symbol &, uint64_t entryAddr) const override; void writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, uint64_t stubsAddr, - uint64_t stubOffset, uint64_t selrefsVA, - uint64_t selectorIndex, uint64_t gotAddr, - uint64_t msgSendIndex) const override; + uint64_t &stubOffset, uint64_t selrefsVA, + uint64_t selectorIndex, + Symbol *objcMsgSend) const override; }; } // namespace @@ -100,10 +100,9 @@ void ARM64_32::writeStubHelperEntry(uint8_t *buf8, const Symbol &sym, } void ARM64_32::writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, - uint64_t stubsAddr, uint64_t stubOffset, + uint64_t stubsAddr, uint64_t &stubOffset, uint64_t selrefsVA, uint64_t selectorIndex, - uint64_t gotAddr, - uint64_t msgSendIndex) const { + Symbol *objcMsgSend) const { fatal("TODO: implement this"); } diff --git a/lld/MachO/Arch/X86_64.cpp b/lld/MachO/Arch/X86_64.cpp index a0d4e1a28a14a..9e8e1d01e493a 100644 --- 
a/lld/MachO/Arch/X86_64.cpp +++ b/lld/MachO/Arch/X86_64.cpp @@ -38,9 +38,9 @@ struct X86_64 : TargetInfo { uint64_t entryAddr) const override; void writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, uint64_t stubsAddr, - uint64_t stubOffset, uint64_t selrefsVA, - uint64_t selectorIndex, uint64_t gotAddr, - uint64_t msgSendIndex) const override; + uint64_t &stubOffset, uint64_t selrefsVA, + uint64_t selectorIndex, + Symbol *objcMsgSend) const override; void relaxGotLoad(uint8_t *loc, uint8_t type) const override; uint64_t getPageSize() const override { return 4 * 1024; } @@ -182,16 +182,20 @@ static constexpr uint8_t objcStubsFastCode[] = { }; void X86_64::writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, uint64_t stubsAddr, - uint64_t stubOffset, uint64_t selrefsVA, - uint64_t selectorIndex, uint64_t gotAddr, - uint64_t msgSendIndex) const { + uint64_t &stubOffset, uint64_t selrefsVA, + uint64_t selectorIndex, + Symbol *objcMsgSend) const { + uint64_t objcMsgSendAddr = in.got->addr; + uint64_t objcMsgSendIndex = objcMsgSend->gotIndex; + memcpy(buf, objcStubsFastCode, sizeof(objcStubsFastCode)); SymbolDiagnostic d = {sym, sym->getName()}; uint64_t stubAddr = stubsAddr + stubOffset; writeRipRelative(d, buf, stubAddr, 7, selrefsVA + selectorIndex * LP64::wordSize); writeRipRelative(d, buf, stubAddr, 0xd, - gotAddr + msgSendIndex * LP64::wordSize); + objcMsgSendAddr + objcMsgSendIndex * LP64::wordSize); + stubOffset += target->objcStubsFastSize; } void X86_64::relaxGotLoad(uint8_t *loc, uint8_t type) const { @@ -214,7 +218,7 @@ X86_64::X86_64() : TargetInfo(LP64()) { stubHelperEntrySize = sizeof(stubHelperEntry); objcStubsFastSize = sizeof(objcStubsFastCode); - objcStubsAlignment = 1; + objcStubsFastAlignment = 1; relocAttrs = {relocAttrsArray.data(), relocAttrsArray.size()}; } diff --git a/lld/MachO/Driver.cpp b/lld/MachO/Driver.cpp index 7ac3f51cec103..411fbcfcf233e 100644 --- a/lld/MachO/Driver.cpp +++ b/lld/MachO/Driver.cpp @@ -827,9 +827,13 @@ static ObjCStubsMode getObjCStubsMode(const ArgList &args) { if (!arg) return ObjCStubsMode::fast; - if (arg->getOption().getID() == OPT_objc_stubs_small) - warn("-objc_stubs_small is not yet implemented, defaulting to " - "-objc_stubs_fast"); + if (arg->getOption().getID() == OPT_objc_stubs_small) { + if (is_contained({AK_arm64e, AK_arm64}, config->arch())) + return ObjCStubsMode::small; + else + warn("-objc_stubs_small is not yet implemented, defaulting to " + "-objc_stubs_fast"); + } return ObjCStubsMode::fast; } diff --git a/lld/MachO/SyntheticSections.cpp b/lld/MachO/SyntheticSections.cpp index e123dcb6803c1..53220ad04b842 100644 --- a/lld/MachO/SyntheticSections.cpp +++ b/lld/MachO/SyntheticSections.cpp @@ -809,7 +809,9 @@ void StubHelperSection::setUp() { ObjCStubsSection::ObjCStubsSection() : SyntheticSection(segment_names::text, section_names::objcStubs) { flags = S_ATTR_SOME_INSTRUCTIONS | S_ATTR_PURE_INSTRUCTIONS; - align = target->objcStubsAlignment; + align = config->objcStubsMode == ObjCStubsMode::fast + ? target->objcStubsFastAlignment + : target->objcStubsSmallAlignment; } void ObjCStubsSection::addEntry(Symbol *sym) { @@ -817,10 +819,14 @@ void ObjCStubsSection::addEntry(Symbol *sym) { StringRef methname = sym->getName().drop_front(symbolPrefix.size()); offsets.push_back( in.objcMethnameSection->getStringOffset(methname).outSecOff); + + auto stubSize = config->objcStubsMode == ObjCStubsMode::fast + ? 
target->objcStubsFastSize + : target->objcStubsSmallSize; Defined *newSym = replaceSymbol( sym, sym->getName(), nullptr, isec, - /*value=*/symbols.size() * target->objcStubsFastSize, - /*size=*/target->objcStubsFastSize, + /*value=*/symbols.size() * stubSize, + /*size=*/stubSize, /*isWeakDef=*/false, /*isExternal=*/true, /*isPrivateExtern=*/true, /*includeInSymtab=*/true, /*isReferencedDynamically=*/false, /*noDeadStrip=*/false); @@ -828,12 +834,24 @@ void ObjCStubsSection::addEntry(Symbol *sym) { } void ObjCStubsSection::setUp() { - Symbol *objcMsgSend = symtab->addUndefined("_objc_msgSend", /*file=*/nullptr, - /*isWeakRef=*/false); + objcMsgSend = symtab->addUndefined("_objc_msgSend", /*file=*/nullptr, + /*isWeakRef=*/false); + if (auto *undefined = dyn_cast(objcMsgSend)) + treatUndefinedSymbol(*undefined, + "lazy binding (normally in libobjc.dylib)"); objcMsgSend->used = true; - in.got->addEntry(objcMsgSend); - assert(objcMsgSend->isInGot()); - objcMsgSendGotIndex = objcMsgSend->gotIndex; + if (config->objcStubsMode == ObjCStubsMode::fast) { + in.got->addEntry(objcMsgSend); + assert(objcMsgSend->isInGot()); + } else { + assert(config->objcStubsMode == ObjCStubsMode::small); + // In line with ld64's behavior, when objc_msgSend is a direct symbol, + // we directly reference it. + // In other cases, typically when binding in libobjc.dylib, + // we generate a stub to invoke objc_msgSend. + if (!isa(objcMsgSend)) + in.stubs->addEntry(objcMsgSend); + } size_t size = offsets.size() * target->wordSize; uint8_t *selrefsData = bAlloc().Allocate(size); @@ -863,7 +881,10 @@ void ObjCStubsSection::setUp() { } uint64_t ObjCStubsSection::getSize() const { - return target->objcStubsFastSize * symbols.size(); + auto stubSize = config->objcStubsMode == ObjCStubsMode::fast + ? target->objcStubsFastSize + : target->objcStubsSmallSize; + return stubSize * symbols.size(); } void ObjCStubsSection::writeTo(uint8_t *buf) const { @@ -875,8 +896,7 @@ void ObjCStubsSection::writeTo(uint8_t *buf) const { Defined *sym = symbols[i]; target->writeObjCMsgSendStub(buf + stubOffset, sym, in.objcStubs->addr, stubOffset, in.objcSelrefs->getVA(), i, - in.got->addr, objcMsgSendGotIndex); - stubOffset += target->objcStubsFastSize; + objcMsgSend); } } diff --git a/lld/MachO/SyntheticSections.h b/lld/MachO/SyntheticSections.h index e9d564f3c8361..5fb7b6e09e8e6 100644 --- a/lld/MachO/SyntheticSections.h +++ b/lld/MachO/SyntheticSections.h @@ -336,7 +336,7 @@ class ObjCStubsSection final : public SyntheticSection { private: std::vector symbols; std::vector offsets; - int objcMsgSendGotIndex = 0; + Symbol *objcMsgSend = nullptr; }; // Note that this section may also be targeted by non-lazy bindings. In diff --git a/lld/MachO/Target.h b/lld/MachO/Target.h index bc7e09d394d24..b07967d0abb7c 100644 --- a/lld/MachO/Target.h +++ b/lld/MachO/Target.h @@ -70,10 +70,9 @@ class TargetInfo { uint64_t entryAddr) const = 0; virtual void writeObjCMsgSendStub(uint8_t *buf, Symbol *sym, - uint64_t stubsAddr, uint64_t stubOffset, + uint64_t stubsAddr, uint64_t &stubOffset, uint64_t selrefsVA, uint64_t selectorIndex, - uint64_t gotAddr, - uint64_t msgSendIndex) const = 0; + Symbol *objcMsgSend) const = 0; // Symbols may be referenced via either the GOT or the stubs section, // depending on the relocation type. 
prepareSymbolRelocation() will set up the @@ -121,7 +120,9 @@ class TargetInfo { size_t stubHelperHeaderSize; size_t stubHelperEntrySize; size_t objcStubsFastSize; - size_t objcStubsAlignment; + size_t objcStubsSmallSize; + size_t objcStubsFastAlignment; + size_t objcStubsSmallAlignment; uint8_t p2WordSize; size_t wordSize; diff --git a/lld/test/MachO/arm64-objc-stubs-dyn.s b/lld/test/MachO/arm64-objc-stubs-dyn.s new file mode 100644 index 0000000000000..9358fc5b31c2b --- /dev/null +++ b/lld/test/MachO/arm64-objc-stubs-dyn.s @@ -0,0 +1,76 @@ +# REQUIRES: aarch64 + +# RUN: llvm-mc -filetype=obj -triple=arm64-apple-darwin %s -o %t.o +# RUN: %lld -arch arm64 -lSystem -U _objc_msgSend -o %t.out %t.o +# RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s +# RUN: %lld -arch arm64 -lSystem -U _objc_msgSend -o %t.out %t.o -dead_strip +# RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s +# RUN: %lld -arch arm64 -lSystem -U _objc_msgSend -o %t.out %t.o -objc_stubs_fast +# RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s +# RUN: %lld -arch arm64 -lSystem -U _objc_msgSend -o %t.out %t.o -objc_stubs_small +# RUN: llvm-otool -vs __TEXT __stubs %t.out | FileCheck %s --check-prefix=STUB +# RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s --check-prefix=SMALL + +# Unlike arm64-objc-stubs.s, in this test, _objc_msgSend is not defined, +# which usually binds with libobjc.dylib. +# 1. -objc_stubs_fast: No change as it uses GOT. +# 2. -objc_stubs_small: Create a (shared) stub to invoke _objc_msgSend, to minimize the size. + +# CHECK: Contents of (__TEXT,__objc_stubs) section + +# CHECK-NEXT: _objc_msgSend$foo: +# CHECK-NEXT: adrp x1, 8 ; 0x100008000 +# CHECK-NEXT: ldr x1, [x1, #0x10] +# CHECK-NEXT: adrp x16, 4 ; 0x100004000 +# CHECK-NEXT: ldr x16, [x16] +# CHECK-NEXT: br x16 +# CHECK-NEXT: brk #0x1 +# CHECK-NEXT: brk #0x1 +# CHECK-NEXT: brk #0x1 + +# CHECK-NEXT: _objc_msgSend$length: +# CHECK-NEXT: adrp x1, 8 ; 0x100008000 +# CHECK-NEXT: ldr x1, [x1, #0x18] +# CHECK-NEXT: adrp x16, 4 ; 0x100004000 +# CHECK-NEXT: ldr x16, [x16] +# CHECK-NEXT: br x16 +# CHECK-NEXT: brk #0x1 +# CHECK-NEXT: brk #0x1 +# CHECK-NEXT: brk #0x1 + +# CHECK-EMPTY: + +# STUB: Contents of (__TEXT,__stubs) section +# STUB-NEXT: adrp x16, 8 ; 0x100008000 +# STUB-NEXT: ldr x16, [x16] +# STUB-NEXT: br x16 + +# SMALL: Contents of (__TEXT,__objc_stubs) section +# SMALL-NEXT: _objc_msgSend$foo: +# SMALL-NEXT: adrp x1, 8 ; 0x100008000 +# SMALL-NEXT: ldr x1, [x1, #0x18] +# SMALL-NEXT: b +# SMALL-NEXT: _objc_msgSend$length: +# SMALL-NEXT: adrp x1, 8 ; 0x100008000 +# SMALL-NEXT: ldr x1, [x1, #0x20] +# SMALL-NEXT: b + +.section __TEXT,__objc_methname,cstring_literals +lselref1: + .asciz "foo" +lselref2: + .asciz "bar" + +.section __DATA,__objc_selrefs,literal_pointers,no_dead_strip +.p2align 3 +.quad lselref1 +.quad lselref2 + +.text + +.globl _main +_main: + bl _objc_msgSend$length + bl _objc_msgSend$foo + bl _objc_msgSend$foo + ret diff --git a/lld/test/MachO/arm64-objc-stubs.s b/lld/test/MachO/arm64-objc-stubs.s index feba40ac36d84..1b8ebff924300 100644 --- a/lld/test/MachO/arm64-objc-stubs.s +++ b/lld/test/MachO/arm64-objc-stubs.s @@ -7,10 +7,10 @@ # RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s # RUN: %lld -arch arm64 -lSystem -o %t.out %t.o -objc_stubs_fast # RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s -# RUN: %no-fatal-warnings-lld -arch arm64 -lSystem -o %t.out %t.o -objc_stubs_small 2>&1 | FileCheck %s --check-prefix=WARNING -# RUN: llvm-otool -vs __TEXT 
__objc_stubs %t.out | FileCheck %s - -# WARNING: warning: -objc_stubs_small is not yet implemented, defaulting to -objc_stubs_fast +# RUN: llvm-otool -l %t.out | FileCheck %s --check-prefix=FASTALIGN +# RUN: %lld -arch arm64 -lSystem -o %t.out %t.o -objc_stubs_small +# RUN: llvm-otool -vs __TEXT __objc_stubs %t.out | FileCheck %s --check-prefix=SMALL +# RUN: llvm-otool -l %t.out | FileCheck %s --check-prefix=SMALLALIGN # CHECK: Contents of (__TEXT,__objc_stubs) section @@ -36,6 +36,30 @@ # CHECK-EMPTY: +# FASTALIGN: sectname __objc_stubs +# FASTALIGN-NEXT: segname __TEXT +# FASTALIGN-NEXT: addr +# FASTALIGN-NEXT: size +# FASTALIGN-NEXT: offset +# FASTALIGN-NEXT: align 2^5 (32) + +# SMALL: _objc_msgSend$foo: +# SMALL-NEXT: adrp x1, 4 ; 0x100004000 +# SMALL-NEXT: ldr x1, [x1, #0x10] +# SMALL-NEXT: b + +# SMALL-NEXT: _objc_msgSend$length: +# SMALL-NEXT: adrp x1, 4 ; 0x100004000 +# SMALL-NEXT: ldr x1, [x1, #0x18] +# SMALL-NEXT: b + +# SMALLALIGN: sectname __objc_stubs +# SMALLALIGN-NEXT: segname __TEXT +# SMALLALIGN-NEXT: addr +# SMALLALIGN-NEXT: size +# SMALLALIGN-NEXT: offset +# SMALLALIGN-NEXT: align 2^2 (4) + .section __TEXT,__objc_methname,cstring_literals lselref1: .asciz "foo" diff --git a/lld/test/MachO/x86-64-objc-stubs.s b/lld/test/MachO/x86-64-objc-stubs.s index 5e8b9fb165f25..2dd8d55937715 100644 --- a/lld/test/MachO/x86-64-objc-stubs.s +++ b/lld/test/MachO/x86-64-objc-stubs.s @@ -6,6 +6,9 @@ # RUN: llvm-otool -vs __DATA __objc_selrefs %t.out >> %t.txt # RUN: llvm-otool -vs __TEXT __objc_stubs %t.out >> %t.txt # RUN: FileCheck %s < %t.txt +# RUN: %no-fatal-warnings-lld -arch x86_64 -lSystem -o %t.out %t.o -objc_stubs_small 2>&1 | FileCheck %s --check-prefix=WARNING + +# WARNING: warning: -objc_stubs_small is not yet implemented, defaulting to -objc_stubs_fast # CHECK: Sections: # CHECK: __got {{[0-9a-f]*}} [[#%x, GOTSTART:]] DATA From dffa8039b10823f67347453a0ef445ee785ed4aa Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Tue, 23 Jan 2024 16:37:36 +0100 Subject: [PATCH 633/843] [libc] Remove specific nan payload in math functions (#79133) --- .../FPUtil/DivisionAndRemainderOperations.h | 2 +- libc/src/__support/FPUtil/generic/sqrt.h | 4 +--- .../FPUtil/generic/sqrt_80_bit_long_double.h | 3 +-- libc/test/UnitTest/FPMatcher.h | 4 ++-- libc/test/src/__support/FPUtil/fpbits_test.cpp | 13 ++++++------- libc/test/src/math/FDimTest.h | 2 +- libc/test/src/math/FmaTest.h | 2 +- libc/test/src/math/HypotTest.h | 2 +- libc/test/src/math/ILogbTest.h | 2 +- libc/test/src/math/LdExpTest.h | 2 +- libc/test/src/math/NextAfterTest.h | 2 +- libc/test/src/math/RIntTest.h | 2 +- libc/test/src/math/RemQuoTest.h | 2 +- libc/test/src/math/RoundToIntegerTest.h | 2 +- libc/test/src/math/smoke/FDimTest.h | 2 +- libc/test/src/math/smoke/FmaTest.h | 2 +- libc/test/src/math/smoke/HypotTest.h | 2 +- libc/test/src/math/smoke/ILogbTest.h | 2 +- libc/test/src/math/smoke/LdExpTest.h | 2 +- libc/test/src/math/smoke/NextAfterTest.h | 2 +- libc/test/src/math/smoke/NextTowardTest.h | 4 ++-- libc/test/src/math/smoke/RIntTest.h | 2 +- libc/test/src/math/smoke/RemQuoTest.h | 2 +- libc/test/src/math/smoke/RoundToIntegerTest.h | 2 +- 24 files changed, 31 insertions(+), 35 deletions(-) diff --git a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h index ef9593a42b005..2859a248b95e3 100644 --- a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h +++ b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h @@ -31,7 +31,7 @@ LIBC_INLINE T 
remquo(T x, T y, int &q) { if (ybits.is_nan()) return y; if (xbits.is_inf() || ybits.is_zero()) - return FPBits::build_quiet_nan(fputil::Sign::POS, 1).get_val(); + return FPBits::build_quiet_nan().get_val(); if (xbits.is_zero()) { q = 0; diff --git a/libc/src/__support/FPUtil/generic/sqrt.h b/libc/src/__support/FPUtil/generic/sqrt.h index 0a0690ec1463b..6650227a014d2 100644 --- a/libc/src/__support/FPUtil/generic/sqrt.h +++ b/libc/src/__support/FPUtil/generic/sqrt.h @@ -71,12 +71,10 @@ LIBC_INLINE cpp::enable_if_t, T> sqrt(T x) { return x86::sqrt(x); } else { // IEEE floating points formats. - using Sign = fputil::Sign; using FPBits_t = typename fputil::FPBits; using StorageType = typename FPBits_t::StorageType; constexpr StorageType ONE = StorageType(1) << FPBits_t::FRACTION_LEN; - constexpr auto FLT_NAN = - FPBits_t::build_quiet_nan(Sign::POS, ONE >> 1).get_val(); + constexpr auto FLT_NAN = FPBits_t::build_quiet_nan().get_val(); FPBits_t bits(x); diff --git a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h index b0a3776029ca7..72977f616b5dd 100644 --- a/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h +++ b/libc/src/__support/FPUtil/generic/sqrt_80_bit_long_double.h @@ -38,9 +38,8 @@ LIBC_INLINE long double sqrt(long double x); LIBC_INLINE long double sqrt(long double x) { using LDBits = FPBits; using StorageType = typename LDBits::StorageType; - using Sign = fputil::Sign; constexpr StorageType ONE = StorageType(1) << int(LDBits::FRACTION_LEN); - constexpr auto LDNAN = LDBits::build_quiet_nan(Sign::POS, ONE >> 1).get_val(); + constexpr auto LDNAN = LDBits::build_quiet_nan().get_val(); LDBits bits(x); diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h index 9880fa51c5481..ad2fc0439fd79 100644 --- a/libc/test/UnitTest/FPMatcher.h +++ b/libc/test/UnitTest/FPMatcher.h @@ -68,7 +68,7 @@ template struct FPTest : public Test { LIBC_NAMESPACE::cpp::numeric_limits::max(); static constexpr T zero = T(FPBits::zero(Sign::POS)); static constexpr T neg_zero = T(FPBits::zero(Sign::NEG)); - static constexpr T aNaN = T(FPBits::build_quiet_nan(Sign::POS, 1)); + static constexpr T aNaN = T(FPBits::build_quiet_nan()); static constexpr T sNaN = T(FPBits::build_nan(Sign::POS, 1)); static constexpr T inf = T(FPBits::inf(Sign::POS)); static constexpr T neg_inf = T(FPBits::inf(Sign::NEG)); @@ -97,7 +97,7 @@ template struct FPTest : public Test { LIBC_NAMESPACE::cpp::numeric_limits::max(); \ const T zero = T(FPBits::zero(Sign::POS)); \ const T neg_zero = T(FPBits::zero(Sign::NEG)); \ - const T aNaN = T(FPBits::build_quiet_nan(Sign::POS, 1)); \ + const T aNaN = T(FPBits::build_quiet_nan()); \ const T sNaN = T(FPBits::build_nan(Sign::POS, 1)); \ const T inf = T(FPBits::inf(Sign::POS)); \ const T neg_inf = T(FPBits::inf(Sign::NEG)); \ diff --git a/libc/test/src/__support/FPUtil/fpbits_test.cpp b/libc/test/src/__support/FPUtil/fpbits_test.cpp index 6092945811ce1..aadcd3fe92b9e 100644 --- a/libc/test/src/__support/FPUtil/fpbits_test.cpp +++ b/libc/test/src/__support/FPUtil/fpbits_test.cpp @@ -287,7 +287,7 @@ TEST(LlvmLibcFPBitsTest, FloatType) { EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(), "0xBF900000 = (S: 1, E: 0x007F, M: 0x00100000)"); - FloatBits quiet_nan = FloatBits::build_quiet_nan(Sign::POS, 1); + FloatBits quiet_nan = FloatBits::build_quiet_nan(); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } @@ -298,8 +298,7 @@ TEST(LlvmLibcFPBitsTest, DoubleType) { "(+Infinity)"); 
EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::inf(Sign::NEG)).c_str(), "(-Infinity)"); - EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::build_nan(Sign::POS, 1)).c_str(), - "(NaN)"); + EXPECT_STREQ(LIBC_NAMESPACE::str(DoubleBits::build_nan()).c_str(), "(NaN)"); DoubleBits zero(0.0); EXPECT_TRUE(zero.is_pos()); @@ -349,7 +348,7 @@ TEST(LlvmLibcFPBitsTest, DoubleType) { EXPECT_STREQ(LIBC_NAMESPACE::str(negnum).c_str(), "0xBFF2000000000000 = (S: 1, E: 0x03FF, M: 0x0002000000000000)"); - DoubleBits quiet_nan = DoubleBits::build_quiet_nan(Sign::POS, 1); + DoubleBits quiet_nan = DoubleBits::build_quiet_nan(); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } @@ -431,7 +430,7 @@ TEST(LlvmLibcFPBitsTest, X86LongDoubleType) { "0x000000000000BFFF9000000000000000 = " "(S: 1, E: 0x3FFF, I: 1, M: 0x00000000000000001000000000000000)"); - LongDoubleBits quiet_nan = LongDoubleBits::build_quiet_nan(Sign::POS, 1); + LongDoubleBits quiet_nan = LongDoubleBits::build_quiet_nan(); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } #else @@ -506,7 +505,7 @@ TEST(LlvmLibcFPBitsTest, LongDoubleType) { "0xBFFF2000000000000000000000000000 = " "(S: 1, E: 0x3FFF, M: 0x00002000000000000000000000000000)"); - LongDoubleBits quiet_nan = LongDoubleBits::build_quiet_nan(Sign::POS, 1); + LongDoubleBits quiet_nan = LongDoubleBits::build_quiet_nan(); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); #endif } @@ -581,7 +580,7 @@ TEST(LlvmLibcFPBitsTest, Float128Type) { "0xBFFF2000000000000000000000000000 = " "(S: 1, E: 0x3FFF, M: 0x00002000000000000000000000000000)"); - Float128Bits quiet_nan = Float128Bits::build_quiet_nan(Sign::POS, 1); + Float128Bits quiet_nan = Float128Bits::build_quiet_nan(); EXPECT_EQ(quiet_nan.is_quiet_nan(), true); } #endif // LIBC_COMPILER_HAS_FLOAT128 diff --git a/libc/test/src/math/FDimTest.h b/libc/test/src/math/FDimTest.h index c3d9cb1801cd4..46df2b40e64bd 100644 --- a/libc/test/src/math/FDimTest.h +++ b/libc/test/src/math/FDimTest.h @@ -24,7 +24,7 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); void test_na_n_arg(FuncPtr func) { EXPECT_FP_EQ(nan, func(nan, inf)); diff --git a/libc/test/src/math/FmaTest.h b/libc/test/src/math/FmaTest.h index 4343b38053dc4..8cc340e9260ff 100644 --- a/libc/test/src/math/FmaTest.h +++ b/libc/test/src/math/FmaTest.h @@ -32,7 +32,7 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); diff --git a/libc/test/src/math/HypotTest.h b/libc/test/src/math/HypotTest.h index 0b85f68fda82a..2990072fd5c66 100644 --- a/libc/test/src/math/HypotTest.h +++ b/libc/test/src/math/HypotTest.h @@ -25,7 +25,7 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using Sign = LIBC_NAMESPACE::fputil::Sign; using StorageType = typename FPBits::StorageType; - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); const T inf = T(FPBits::inf()); const T neg_inf = 
T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero()); diff --git a/libc/test/src/math/ILogbTest.h b/libc/test/src/math/ILogbTest.h index 223de789999af..fe28fab9d777c 100644 --- a/libc/test/src/math/ILogbTest.h +++ b/libc/test/src/math/ILogbTest.h @@ -28,7 +28,7 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { using Sign = LIBC_NAMESPACE::fputil::Sign; EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::POS)))); EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::NEG)))); - EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(Sign::POS, 1)))); + EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan()))); EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::POS)))); EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::NEG)))); } diff --git a/libc/test/src/math/LdExpTest.h b/libc/test/src/math/LdExpTest.h index 3a4baabbf10e6..08a14a90c1226 100644 --- a/libc/test/src/math/LdExpTest.h +++ b/libc/test/src/math/LdExpTest.h @@ -29,7 +29,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); // A normalized mantissa to be used with tests. static constexpr StorageType MANTISSA = NormalFloat::ONE + 0x1234; diff --git a/libc/test/src/math/NextAfterTest.h b/libc/test/src/math/NextAfterTest.h index 9ff3bf73d2ede..f9ae4a4ec5c02 100644 --- a/libc/test/src/math/NextAfterTest.h +++ b/libc/test/src/math/NextAfterTest.h @@ -27,7 +27,7 @@ class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); const StorageType min_subnormal = FPBits::min_subnormal().uintval(); const StorageType max_subnormal = FPBits::max_subnormal().uintval(); diff --git a/libc/test/src/math/RIntTest.h b/libc/test/src/math/RIntTest.h index b478e3f65dbc8..3af7b6fe6f6d4 100644 --- a/libc/test/src/math/RIntTest.h +++ b/libc/test/src/math/RIntTest.h @@ -38,7 +38,7 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); static constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval(); diff --git a/libc/test/src/math/RemQuoTest.h b/libc/test/src/math/RemQuoTest.h index 0ee41f4bf9acf..bbc266764c1c2 100644 --- a/libc/test/src/math/RemQuoTest.h +++ b/libc/test/src/math/RemQuoTest.h @@ -28,7 +28,7 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); static constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval(); diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h index 6866c23cb99ca..f75aaa348bd00 100644 --- a/libc/test/src/math/RoundToIntegerTest.h +++ b/libc/test/src/math/RoundToIntegerTest.h @@ -37,7 +37,7 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { 
const F neg_zero = F(FPBits::zero(Sign::NEG)); const F inf = F(FPBits::inf()); const F neg_inf = F(FPBits::inf(Sign::NEG)); - const F nan = F(FPBits::build_quiet_nan(Sign::POS, 1)); + const F nan = F(FPBits::build_quiet_nan()); static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); diff --git a/libc/test/src/math/smoke/FDimTest.h b/libc/test/src/math/smoke/FDimTest.h index c3d9cb1801cd4..46df2b40e64bd 100644 --- a/libc/test/src/math/smoke/FDimTest.h +++ b/libc/test/src/math/smoke/FDimTest.h @@ -24,7 +24,7 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); void test_na_n_arg(FuncPtr func) { EXPECT_FP_EQ(nan, func(nan, inf)); diff --git a/libc/test/src/math/smoke/FmaTest.h b/libc/test/src/math/smoke/FmaTest.h index 337ce659a23e1..106ba1405a968 100644 --- a/libc/test/src/math/smoke/FmaTest.h +++ b/libc/test/src/math/smoke/FmaTest.h @@ -25,7 +25,7 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); public: void test_special_numbers(Func func) { diff --git a/libc/test/src/math/smoke/HypotTest.h b/libc/test/src/math/smoke/HypotTest.h index 67110536b9623..5400fc0730d8a 100644 --- a/libc/test/src/math/smoke/HypotTest.h +++ b/libc/test/src/math/smoke/HypotTest.h @@ -22,7 +22,7 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); const T inf = T(FPBits::inf(Sign::POS)); const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); diff --git a/libc/test/src/math/smoke/ILogbTest.h b/libc/test/src/math/smoke/ILogbTest.h index 223de789999af..fe28fab9d777c 100644 --- a/libc/test/src/math/smoke/ILogbTest.h +++ b/libc/test/src/math/smoke/ILogbTest.h @@ -28,7 +28,7 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { using Sign = LIBC_NAMESPACE::fputil::Sign; EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::POS)))); EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::NEG)))); - EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan(Sign::POS, 1)))); + EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan()))); EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::POS)))); EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::NEG)))); } diff --git a/libc/test/src/math/smoke/LdExpTest.h b/libc/test/src/math/smoke/LdExpTest.h index 3a4baabbf10e6..08a14a90c1226 100644 --- a/libc/test/src/math/smoke/LdExpTest.h +++ b/libc/test/src/math/smoke/LdExpTest.h @@ -29,7 +29,7 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); // A normalized mantissa to be used with tests. 
static constexpr StorageType MANTISSA = NormalFloat::ONE + 0x1234; diff --git a/libc/test/src/math/smoke/NextAfterTest.h b/libc/test/src/math/smoke/NextAfterTest.h index 1dd07b3d2f93d..859e2c74a3693 100644 --- a/libc/test/src/math/smoke/NextAfterTest.h +++ b/libc/test/src/math/smoke/NextAfterTest.h @@ -38,7 +38,7 @@ class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); static constexpr StorageType min_subnormal = FPBits::min_subnormal().uintval(); diff --git a/libc/test/src/math/smoke/NextTowardTest.h b/libc/test/src/math/smoke/NextTowardTest.h index d65cc5d84d35a..c9f07b993f04b 100644 --- a/libc/test/src/math/smoke/NextTowardTest.h +++ b/libc/test/src/math/smoke/NextTowardTest.h @@ -40,11 +40,11 @@ class NextTowardTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); const long double to_zero = ToFPBits::zero().get_val(); const long double to_neg_zero = ToFPBits::zero(Sign::NEG).get_val(); - const long double to_nan = ToFPBits::build_quiet_nan(Sign::POS, 1).get_val(); + const long double to_nan = ToFPBits::build_quiet_nan().get_val(); static constexpr StorageType min_subnormal = FPBits::min_subnormal().uintval(); diff --git a/libc/test/src/math/smoke/RIntTest.h b/libc/test/src/math/smoke/RIntTest.h index 7bbbe54301570..bd99e634badbd 100644 --- a/libc/test/src/math/smoke/RIntTest.h +++ b/libc/test/src/math/smoke/RIntTest.h @@ -35,7 +35,7 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); public: void testSpecialNumbers(RIntFunc func) { diff --git a/libc/test/src/math/smoke/RemQuoTest.h b/libc/test/src/math/smoke/RemQuoTest.h index 5f5cbd4964a62..739661f2d9619 100644 --- a/libc/test/src/math/smoke/RemQuoTest.h +++ b/libc/test/src/math/smoke/RemQuoTest.h @@ -25,7 +25,7 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test { const T neg_inf = T(FPBits::inf(Sign::NEG)); const T zero = T(FPBits::zero(Sign::POS)); const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan(Sign::POS, 1)); + const T nan = T(FPBits::build_quiet_nan()); public: typedef T (*RemQuoFunc)(T, T, int *); diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h index 77c65aa492e22..404155604645c 100644 --- a/libc/test/src/math/smoke/RoundToIntegerTest.h +++ b/libc/test/src/math/smoke/RoundToIntegerTest.h @@ -34,7 +34,7 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { const F neg_zero = F(FPBits::zero(Sign::NEG)); const F inf = F(FPBits::inf(Sign::POS)); const F neg_inf = F(FPBits::inf(Sign::NEG)); - const F nan = F(FPBits::build_quiet_nan(Sign::POS, 1)); + const F nan = F(FPBits::build_quiet_nan()); static constexpr StorageType MAX_SUBNORMAL = FPBits::max_subnormal().uintval(); From 2cff46f8c0aa88222aba776de094c650aa09a33d Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU 
Yifan
Date: Tue, 23 Jan 2024 10:43:24 -0500
Subject: [PATCH 634/843] [libc][NFC] use builder pattern for
 ErrnoSetterMatcher (#79153)

ErrnoSetterMatcher::returns can be misleading as it does not initialize
errno. This is made worse as later on there is a switch statement on the
errno comparator using __builtin_unreachable(). This patch makes
ErrnoSetterMatcher::returns give back a builder that is nominally
different from ErrnoSetterMatcher.
---
 libc/test/UnitTest/ErrnoSetterMatcher.h | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

diff --git a/libc/test/UnitTest/ErrnoSetterMatcher.h b/libc/test/UnitTest/ErrnoSetterMatcher.h
index b748c29751c32..6b15bd4e9b79a 100644
--- a/libc/test/UnitTest/ErrnoSetterMatcher.h
+++ b/libc/test/UnitTest/ErrnoSetterMatcher.h
@@ -161,10 +161,22 @@ static internal::ErrnoSetterMatcher<int> Fails(int ExpectedErrno,
                                                EQ(ExpectedErrno));
 }
 
+template <typename RetT> class ErrnoSetterMatcherBuilder {
+public:
+  template <typename T> using Cmp = internal::Comparator<T>;
+  ErrnoSetterMatcherBuilder(Cmp<RetT> cmp) : return_cmp(cmp) {}
+
+  internal::ErrnoSetterMatcher<RetT> with_errno(Cmp<int> cmp) {
+    return internal::ErrnoSetterMatcher<RetT>(return_cmp, cmp);
+  }
+
+private:
+  Cmp<RetT> return_cmp;
+};
+
 template <typename RetT>
-static internal::ErrnoSetterMatcher<RetT>
-returns(internal::Comparator<RetT> cmp) {
-  return internal::ErrnoSetterMatcher<RetT>(cmp);
+static ErrnoSetterMatcherBuilder<RetT> returns(internal::Comparator<RetT> cmp) {
+  return ErrnoSetterMatcherBuilder<RetT>(cmp);
 }
 
 } // namespace ErrnoSetterMatcher

From 8c41e3fcb18c2bf7c369f50473367e9cdd072ecc Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 23 Jan 2024 15:33:17 +0000
Subject: [PATCH 635/843] [X86] Add test case for Issue #78897

---
 llvm/test/CodeGen/X86/pr78897.ll | 313 +++++++++++++++++++++++++++
 1 file changed, 313 insertions(+)
 create mode 100644 llvm/test/CodeGen/X86/pr78897.ll

diff --git a/llvm/test/CodeGen/X86/pr78897.ll b/llvm/test/CodeGen/X86/pr78897.ll
new file mode 100644
index 0000000000000..d2562e28584fb
--- /dev/null
+++ b/llvm/test/CodeGen/X86/pr78897.ll
@@ -0,0 +1,313 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=i686-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X86-SSE2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=X64-SSE2
+; RUN: llc < %s -mtriple=i686-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X86-SSE42
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64-SSE42
+; RUN: llc < %s -mtriple=i686-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64-AVX2
+; RUN: llc < %s -mtriple=i686-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X86-AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64-AVX512
+
+; FIXME: PR78897 - Don't vectorize a mul if we still need the extract
+define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind {
+; X86-SSE2-LABEL: produceShuffleVectorForByte:
+; X86-SSE2: # %bb.0: # %entry
+; X86-SSE2-NEXT: pushl %ebx
+; X86-SSE2-NEXT: pushl %edi
+; X86-SSE2-NEXT: pushl %esi
+; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7]
+; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pxor %xmm0, %xmm0
+; X86-SSE2-NEXT: pcmpeqb %xmm1, %xmm0
+; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1
= [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,1,1] +; X86-SSE2-NEXT: movd %xmm2, %esi +; X86-SSE2-NEXT: movd %xmm1, %ecx +; X86-SSE2-NEXT: movl $286331152, %edi # imm = 0x11111110 +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: mull %edi +; X86-SSE2-NEXT: imull $286331153, %ecx, %ebx # imm = 0x11111111 +; X86-SSE2-NEXT: addl %edx, %ebx +; X86-SSE2-NEXT: imull $286331152, %esi, %edx # imm = 0x11111110 +; X86-SSE2-NEXT: addl %ebx, %edx +; X86-SSE2-NEXT: movd %edx, %xmm2 +; X86-SSE2-NEXT: movd %eax, %xmm1 +; X86-SSE2-NEXT: xorl $286331153, %ecx # imm = 0x11111111 +; X86-SSE2-NEXT: movl %ecx, %eax +; X86-SSE2-NEXT: mull %edi +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X86-SSE2-NEXT: xorl $17895697, %esi # imm = 0x1111111 +; X86-SSE2-NEXT: imull $286331153, %ecx, %ecx # imm = 0x11111111 +; X86-SSE2-NEXT: addl %edx, %ecx +; X86-SSE2-NEXT: imull $286331152, %esi, %edx # imm = 0x11111110 +; X86-SSE2-NEXT: addl %ecx, %edx +; X86-SSE2-NEXT: movd %edx, %xmm2 +; X86-SSE2-NEXT: movd %eax, %xmm3 +; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; X86-SSE2-NEXT: pand %xmm0, %xmm1 +; X86-SSE2-NEXT: pandn %xmm3, %xmm0 +; X86-SSE2-NEXT: por %xmm1, %xmm0 +; X86-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X86-SSE2-NEXT: psrlw $4, %xmm1 +; X86-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 +; X86-SSE2-NEXT: popl %esi +; X86-SSE2-NEXT: popl %edi +; X86-SSE2-NEXT: popl %ebx +; X86-SSE2-NEXT: retl +; +; X64-SSE2-LABEL: produceShuffleVectorForByte: +; X64-SSE2: # %bb.0: # %entry +; X64-SSE2-NEXT: movd %edi, %xmm0 +; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE2-NEXT: pxor %xmm0, %xmm0 +; X64-SSE2-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: movq %xmm1, %rax +; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 +; X64-SSE2-NEXT: psrlq $32, %xmm2 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440] +; X64-SSE2-NEXT: pmuludq %xmm3, %xmm2 +; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [286331153,286331153] +; X64-SSE2-NEXT: pmuludq %xmm1, %xmm4 +; X64-SSE2-NEXT: paddq %xmm2, %xmm4 +; X64-SSE2-NEXT: psllq $32, %xmm4 +; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1 +; X64-SSE2-NEXT: paddq %xmm4, %xmm1 +; X64-SSE2-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111 +; X64-SSE2-NEXT: xorq %rax, %rcx +; X64-SSE2-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110 +; X64-SSE2-NEXT: imulq %rcx, %rax +; X64-SSE2-NEXT: movq %rax, %xmm2 +; X64-SSE2-NEXT: pand %xmm0, %xmm1 +; X64-SSE2-NEXT: pandn %xmm2, %xmm0 +; X64-SSE2-NEXT: por %xmm1, %xmm0 +; X64-SSE2-NEXT: movdqa %xmm0, %xmm1 +; X64-SSE2-NEXT: psrlw $4, %xmm1 +; X64-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; X64-SSE2-NEXT: retq +; +; X86-SSE42-LABEL: produceShuffleVectorForByte: +; X86-SSE42: # %bb.0: # %entry +; X86-SSE42-NEXT: pushl %ebx +; X86-SSE42-NEXT: pushl %edi +; 
X86-SSE42-NEXT: pushl %esi +; X86-SSE42-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X86-SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; X86-SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE42-NEXT: pxor %xmm0, %xmm0 +; X86-SSE42-NEXT: pcmpeqb %xmm1, %xmm0 +; X86-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] +; X86-SSE42-NEXT: pand %xmm0, %xmm1 +; X86-SSE42-NEXT: movd %xmm1, %ecx +; X86-SSE42-NEXT: movl $286331152, %edi # imm = 0x11111110 +; X86-SSE42-NEXT: movl %ecx, %eax +; X86-SSE42-NEXT: mull %edi +; X86-SSE42-NEXT: pextrd $1, %xmm1, %esi +; X86-SSE42-NEXT: imull $286331153, %ecx, %ebx # imm = 0x11111111 +; X86-SSE42-NEXT: addl %edx, %ebx +; X86-SSE42-NEXT: imull $286331152, %esi, %edx # imm = 0x11111110 +; X86-SSE42-NEXT: addl %ebx, %edx +; X86-SSE42-NEXT: movd %eax, %xmm2 +; X86-SSE42-NEXT: pinsrd $1, %edx, %xmm2 +; X86-SSE42-NEXT: xorl $286331153, %ecx # imm = 0x11111111 +; X86-SSE42-NEXT: movl %ecx, %eax +; X86-SSE42-NEXT: mull %edi +; X86-SSE42-NEXT: xorl $17895697, %esi # imm = 0x1111111 +; X86-SSE42-NEXT: imull $286331153, %ecx, %ecx # imm = 0x11111111 +; X86-SSE42-NEXT: addl %edx, %ecx +; X86-SSE42-NEXT: imull $286331152, %esi, %edx # imm = 0x11111110 +; X86-SSE42-NEXT: addl %ecx, %edx +; X86-SSE42-NEXT: movd %eax, %xmm1 +; X86-SSE42-NEXT: pinsrd $1, %edx, %xmm1 +; X86-SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: psrlw $4, %xmm0 +; X86-SSE42-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X86-SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1 +; X86-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X86-SSE42-NEXT: popl %esi +; X86-SSE42-NEXT: popl %edi +; X86-SSE42-NEXT: popl %ebx +; X86-SSE42-NEXT: retl +; +; X64-SSE42-LABEL: produceShuffleVectorForByte: +; X64-SSE42: # %bb.0: # %entry +; X64-SSE42-NEXT: movd %edi, %xmm0 +; X64-SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X64-SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] +; X64-SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE42-NEXT: pxor %xmm0, %xmm0 +; X64-SSE42-NEXT: pcmpeqb %xmm1, %xmm0 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] +; X64-SSE42-NEXT: pand %xmm0, %xmm2 +; X64-SSE42-NEXT: movq %xmm2, %rax +; X64-SSE42-NEXT: movdqa %xmm2, %xmm1 +; X64-SSE42-NEXT: psrlq $32, %xmm1 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440] +; X64-SSE42-NEXT: pmuludq %xmm3, %xmm1 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [286331153,286331153] +; X64-SSE42-NEXT: pmuludq %xmm2, %xmm4 +; X64-SSE42-NEXT: paddq %xmm1, %xmm4 +; X64-SSE42-NEXT: psllq $32, %xmm4 +; X64-SSE42-NEXT: pmuludq %xmm3, %xmm2 +; X64-SSE42-NEXT: paddq %xmm4, %xmm2 +; X64-SSE42-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111 +; X64-SSE42-NEXT: xorq %rax, %rcx +; X64-SSE42-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110 +; X64-SSE42-NEXT: imulq %rcx, %rax +; X64-SSE42-NEXT: movq %rax, %xmm1 +; X64-SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: psrlw $4, %xmm0 +; X64-SSE42-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; X64-SSE42-NEXT: pand 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 +; X64-SSE42-NEXT: retq +; +; X86-AVX2-LABEL: produceShuffleVectorForByte: +; X86-AVX2: # %bb.0: # %entry +; X86-AVX2-NEXT: pushl %ebx +; X86-AVX2-NEXT: pushl %edi +; X86-AVX2-NEXT: pushl %esi +; X86-AVX2-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X86-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1 +; X86-AVX2-NEXT: vmovd %xmm1, %edx +; X86-AVX2-NEXT: movl $286331152, %ecx # imm = 0x11111110 +; X86-AVX2-NEXT: mulxl %ecx, %edi, %esi +; X86-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X86-AVX2-NEXT: imull $286331153, %edx, %ebx # imm = 0x11111111 +; X86-AVX2-NEXT: addl %esi, %ebx +; X86-AVX2-NEXT: imull $286331152, %eax, %esi # imm = 0x11111110 +; X86-AVX2-NEXT: addl %ebx, %esi +; X86-AVX2-NEXT: vmovd %edi, %xmm1 +; X86-AVX2-NEXT: xorl $286331153, %edx # imm = 0x11111111 +; X86-AVX2-NEXT: mulxl %ecx, %edi, %ecx +; X86-AVX2-NEXT: vpinsrd $1, %esi, %xmm1, %xmm1 +; X86-AVX2-NEXT: xorl $17895697, %eax # imm = 0x1111111 +; X86-AVX2-NEXT: imull $286331153, %edx, %edx # imm = 0x11111111 +; X86-AVX2-NEXT: addl %ecx, %edx +; X86-AVX2-NEXT: imull $286331152, %eax, %eax # imm = 0x11111110 +; X86-AVX2-NEXT: addl %edx, %eax +; X86-AVX2-NEXT: vmovd %edi, %xmm2 +; X86-AVX2-NEXT: vpinsrd $1, %eax, %xmm2, %xmm2 +; X86-AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; X86-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm1 +; X86-AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X86-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0 +; X86-AVX2-NEXT: popl %esi +; X86-AVX2-NEXT: popl %edi +; X86-AVX2-NEXT: popl %ebx +; X86-AVX2-NEXT: retl +; +; X64-AVX2-LABEL: produceShuffleVectorForByte: +; X64-AVX2: # %bb.0: # %entry +; X64-AVX2-NEXT: vmovd %edi, %xmm0 +; X64-AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 +; X64-AVX2-NEXT: vmovq %xmm1, %rax +; X64-AVX2-NEXT: vpsrlq $32, %xmm1, %xmm2 +; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440] +; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 +; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpaddq %xmm2, %xmm4, %xmm2 +; X64-AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 +; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 +; X64-AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1 +; X64-AVX2-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111 +; X64-AVX2-NEXT: xorq %rax, %rcx +; X64-AVX2-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110 +; X64-AVX2-NEXT: imulq %rcx, %rax +; X64-AVX2-NEXT: vmovq %rax, %xmm2 +; X64-AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 +; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm1 +; X64-AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; X64-AVX2-NEXT: retq +; +; X86-AVX512-LABEL: produceShuffleVectorForByte: +; X86-AVX512: # %bb.0: # %entry +; X86-AVX512-NEXT: pushl %ebx +; X86-AVX512-NEXT: pushl %edi +; X86-AVX512-NEXT: pushl %esi +; 
X86-AVX512-NEXT: pushl %esi
+; X86-AVX512-NEXT: vpbroadcastb {{[0-9]+}}(%esp), %xmm0
+; X86-AVX512-NEXT: vptestnmb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %k1
+; X86-AVX512-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0 {%k1} {z}
+; X86-AVX512-NEXT: vpextrd $1, %xmm0, %eax
+; X86-AVX512-NEXT: vmovd %xmm0, %edx
+; X86-AVX512-NEXT: movl $286331152, %ecx # imm = 0x11111110
+; X86-AVX512-NEXT: mulxl %ecx, %edi, %esi
+; X86-AVX512-NEXT: imull $286331153, %edx, %ebx # imm = 0x11111111
+; X86-AVX512-NEXT: addl %esi, %ebx
+; X86-AVX512-NEXT: imull $286331152, %eax, %esi # imm = 0x11111110
+; X86-AVX512-NEXT: addl %ebx, %esi
+; X86-AVX512-NEXT: vmovd %edi, %xmm0
+; X86-AVX512-NEXT: vpinsrd $1, %esi, %xmm0, %xmm0
+; X86-AVX512-NEXT: xorl $17895697, %eax # imm = 0x1111111
+; X86-AVX512-NEXT: xorl $286331153, %edx # imm = 0x11111111
+; X86-AVX512-NEXT: mulxl %ecx, %esi, %ecx
+; X86-AVX512-NEXT: imull $286331153, %edx, %edx # imm = 0x11111111
+; X86-AVX512-NEXT: addl %ecx, %edx
+; X86-AVX512-NEXT: imull $286331152, %eax, %eax # imm = 0x11111110
+; X86-AVX512-NEXT: addl %edx, %eax
+; X86-AVX512-NEXT: vmovd %esi, %xmm1
+; X86-AVX512-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1
+; X86-AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1}
+; X86-AVX512-NEXT: vpsrlw $4, %xmm1, %xmm0
+; X86-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X86-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to4}, %xmm0, %xmm0
+; X86-AVX512-NEXT: popl %esi
+; X86-AVX512-NEXT: popl %edi
+; X86-AVX512-NEXT: popl %ebx
+; X86-AVX512-NEXT: retl
+;
+; X64-AVX512-LABEL: produceShuffleVectorForByte:
+; X64-AVX512: # %bb.0: # %entry
+; X64-AVX512-NEXT: vpbroadcastb %edi, %xmm0
+; X64-AVX512-NEXT: vptestnmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1
+; X64-AVX512-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z}
+; X64-AVX512-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1
+; X64-AVX512-NEXT: vmovq %xmm0, %rax
+; X64-AVX512-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111
+; X64-AVX512-NEXT: xorq %rax, %rcx
+; X64-AVX512-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110
+; X64-AVX512-NEXT: imulq %rcx, %rax
+; X64-AVX512-NEXT: vmovq %rax, %xmm0
+; X64-AVX512-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1}
+; X64-AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1
+; X64-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
+; X64-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0
+; X64-AVX512-NEXT: retq
+entry:
+ %const = bitcast i64 1229782938247303440 to i64
+ %1 = insertelement <1 x i8> poison, i8 %0, i64 0
+ %2 = shufflevector <1 x i8> %1, <1 x i8> poison, <8 x i32> zeroinitializer
+ %3 = and <8 x i8> %2, <i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64, i8 -128>
+ %.not.not = icmp eq <8 x i8> %3, zeroinitializer
+ %4 = select <8 x i1> %.not.not, <8 x i8> <i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17, i8 17>, <8 x i8> zeroinitializer
+ %5 = bitcast <8 x i8> %4 to i64
+ %6 = mul i64 %5, %const
+ %7 = bitcast i64 %6 to <8 x i8>
+ %8 = xor i64 %5, 76861433640456465
+ %9 = mul i64 %8, %const
+ %10 = bitcast i64 %9 to <8 x i8>
+ %11 = select <8 x i1> %.not.not, <8 x i8> %7, <8 x i8> %10
+ %12 = and <8 x i8> %11, <i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15, i8 15>
+ %13 = lshr <8 x i8> %11, <i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4, i8 4>
+ %14 = shufflevector <8 x i8> %12, <8 x i8> %13, <16 x i32> <i32 0, i32 8, i32 1, i32 9, i32 2, i32 10, i32 3, i32 11, i32 4, i32 12, i32 5, i32 13, i32 6, i32 14, i32 7, i32 15>
+ ret <16 x i8> %14
+}

From 179ba129f50aefe6e670800aec7091d958aa6f90 Mon Sep 17 00:00:00 2001
From: Alexandros Lamprineas
Date: Tue, 23 Jan 2024 15:48:12 +0000
Subject: [PATCH 636/843] [AArch64][FMV] Support feature MOPS in
Function Multi Versioning. (#78788) The patch adds support for FEAT_MOPS (Memory Copy and Memory Set instructions) in Function Multi Versioning. The bits [19:16] of the system register ID_AA64ISAR2_EL1 indicate whether FEAT_MOPS is implemented in AArch64 state. This information is accessible via ELF hwcaps. --- clang/test/CodeGen/attr-target-version.c | 30 +++++++++---------- clang/test/Sema/attr-target-clones-aarch64.c | 2 +- clang/test/SemaCXX/attr-target-version.cpp | 1 + compiler-rt/lib/builtins/cpu_model/aarch64.c | 1 + .../builtins/cpu_model/aarch64/fmv/mrs.inc | 2 ++ .../lib/builtins/cpu_model/aarch64/hwcap.inc | 3 ++ .../llvm/TargetParser/AArch64TargetParser.h | 3 +- 7 files changed, 25 insertions(+), 17 deletions(-) diff --git a/clang/test/CodeGen/attr-target-version.c b/clang/test/CodeGen/attr-target-version.c index 13b895ad8e9b8..89279852a8c91 100644 --- a/clang/test/CodeGen/attr-target-version.c +++ b/clang/test/CodeGen/attr-target-version.c @@ -35,7 +35,7 @@ inline int __attribute__((target_version("sve+sve-bf16"))) fmv_inline(void) { re inline int __attribute__((target_version("sve2-aes+sve2-sha3"))) fmv_inline(void) { return 5; } inline int __attribute__((target_version("sve2+sve2-pmull128+sve2-bitperm"))) fmv_inline(void) { return 9; } inline int __attribute__((target_version("sve2-sm4+memtag2"))) fmv_inline(void) { return 10; } -inline int __attribute__((target_version("memtag3+rcpc3"))) fmv_inline(void) { return 11; } +inline int __attribute__((target_version("memtag3+rcpc3+mops"))) fmv_inline(void) { return 11; } inline int __attribute__((target_version("default"))) fmv_inline(void) { return 3; } __attribute__((target_version("ls64"))) int fmv_e(void); @@ -272,36 +272,36 @@ int hoo(void) { // CHECK-NEXT: ret ptr @fmv_inline._Mfp16Mfp16MfcmaMsme // CHECK: resolver_else: // CHECK-NEXT: [[TMP4:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 893353197568 -// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 893353197568 +// CHECK-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 864726312827224064 +// CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[TMP5]], 864726312827224064 // CHECK-NEXT: [[TMP7:%.*]] = and i1 true, [[TMP6]] // CHECK-NEXT: br i1 [[TMP7]], label [[RESOLVER_RETURN1:%.*]], label [[RESOLVER_ELSE2:%.*]] // CHECK: resolver_return1: -// CHECK-NEXT: ret ptr @fmv_inline._Msve2Msve2-pmull128Msve2-bitperm +// CHECK-NEXT: ret ptr @fmv_inline._Mrcpc3Mmemtag3Mmops // CHECK: resolver_else2: // CHECK-NEXT: [[TMP8:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 34359773184 -// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 34359773184 +// CHECK-NEXT: [[TMP9:%.*]] = and i64 [[TMP8]], 893353197568 +// CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[TMP9]], 893353197568 // CHECK-NEXT: [[TMP11:%.*]] = and i1 true, [[TMP10]] // CHECK-NEXT: br i1 [[TMP11]], label [[RESOLVER_RETURN3:%.*]], label [[RESOLVER_ELSE4:%.*]] // CHECK: resolver_return3: -// CHECK-NEXT: ret ptr @fmv_inline._Msha1MpmullMf64mm +// CHECK-NEXT: ret ptr @fmv_inline._Msve2Msve2-pmull128Msve2-bitperm // CHECK: resolver_else4: // CHECK-NEXT: [[TMP12:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 17246986240 -// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 17246986240 +// CHECK-NEXT: [[TMP13:%.*]] = and i64 [[TMP12]], 34359773184 +// CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[TMP13]], 34359773184 // CHECK-NEXT: [[TMP15:%.*]] = and i1 true, [[TMP14]] // CHECK-NEXT: 
br i1 [[TMP15]], label [[RESOLVER_RETURN5:%.*]], label [[RESOLVER_ELSE6:%.*]] // CHECK: resolver_return5: -// CHECK-NEXT: ret ptr @fmv_inline._Msha3Mi8mmMf32mm +// CHECK-NEXT: ret ptr @fmv_inline._Msha1MpmullMf64mm // CHECK: resolver_else6: // CHECK-NEXT: [[TMP16:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 -// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 288265560523800576 -// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 288265560523800576 +// CHECK-NEXT: [[TMP17:%.*]] = and i64 [[TMP16]], 17246986240 +// CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[TMP17]], 17246986240 // CHECK-NEXT: [[TMP19:%.*]] = and i1 true, [[TMP18]] // CHECK-NEXT: br i1 [[TMP19]], label [[RESOLVER_RETURN7:%.*]], label [[RESOLVER_ELSE8:%.*]] // CHECK: resolver_return7: -// CHECK-NEXT: ret ptr @fmv_inline._Mrcpc3Mmemtag3 +// CHECK-NEXT: ret ptr @fmv_inline._Msha3Mi8mmMf32mm // CHECK: resolver_else8: // CHECK-NEXT: [[TMP20:%.*]] = load i64, ptr @__aarch64_cpu_features, align 8 // CHECK-NEXT: [[TMP21:%.*]] = and i64 [[TMP20]], 19791209299968 @@ -609,7 +609,7 @@ int hoo(void) { // // // CHECK: Function Attrs: noinline nounwind optnone -// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mrcpc3Mmemtag3 +// CHECK-LABEL: define {{[^@]+}}@fmv_inline._Mrcpc3Mmemtag3Mmops // CHECK-SAME: () #[[ATTR23:[0-9]+]] { // CHECK-NEXT: entry: // CHECK-NEXT: ret i32 11 @@ -768,7 +768,7 @@ int hoo(void) { // CHECK: attributes #[[ATTR20]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+neon,+sve,+sve2,+sve2-aes,+sve2-sha3" } // CHECK: attributes #[[ATTR21]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+neon,+sve,+sve2,+sve2-aes,+sve2-bitperm" } // CHECK: attributes #[[ATTR22]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fp-armv8,+fullfp16,+ls64,+mte,+neon,+sve,+sve2,+sve2-sm4" } -// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64,+mte,+rcpc,+rcpc3" } +// CHECK: attributes #[[ATTR23]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64,+mops,+mte,+rcpc,+rcpc3" } // CHECK: attributes #[[ATTR24]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+fullfp16,+ls64,+sb" } //. 
// CHECK-NOFMV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="-fmv" } diff --git a/clang/test/Sema/attr-target-clones-aarch64.c b/clang/test/Sema/attr-target-clones-aarch64.c index 9adabf8773213..4054b7c837ec9 100644 --- a/clang/test/Sema/attr-target-clones-aarch64.c +++ b/clang/test/Sema/attr-target-clones-aarch64.c @@ -1,6 +1,6 @@ // RUN: %clang_cc1 -triple aarch64-linux-gnu -fsyntax-only -verify %s -void __attribute__((target_clones("fp16+sve2-aes", "sb+sve2-sha3+rcpc3"))) no_def(void); +void __attribute__((target_clones("fp16+sve2-aes", "sb+sve2-sha3+rcpc3+mops"))) no_def(void); // expected-warning@+1 {{unsupported 'default' in the 'target_clones' attribute string; 'target_clones' attribute ignored}} void __attribute__((target_clones("default+sha3"))) warn1(void); diff --git a/clang/test/SemaCXX/attr-target-version.cpp b/clang/test/SemaCXX/attr-target-version.cpp index 2e262cda36774..5c542ad2e2dca 100644 --- a/clang/test/SemaCXX/attr-target-version.cpp +++ b/clang/test/SemaCXX/attr-target-version.cpp @@ -6,6 +6,7 @@ void __attribute__((target_version("vmull"))) wrong_tv(void); void __attribute__((target_version("dotprod"))) no_def(void); void __attribute__((target_version("rdm+fp"))) no_def(void); void __attribute__((target_version("rcpc3"))) no_def(void); +void __attribute__((target_version("mops"))) no_def(void); // expected-error@+1 {{no matching function for call to 'no_def'}} void foo(void) { no_def(); } diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64.c b/compiler-rt/lib/builtins/cpu_model/aarch64.c index 44e1cf49d1e92..17bddfca46f09 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64.c +++ b/compiler-rt/lib/builtins/cpu_model/aarch64.c @@ -115,6 +115,7 @@ enum CPUFeatures { FEAT_SME_I64, FEAT_SME2, FEAT_RCPC3, + FEAT_MOPS, FEAT_MAX, FEAT_EXT = 62, // Reserved to indicate presence of additional features field // in __aarch64_cpu_features diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc index 2f712f41f4979..32a21a2fba9a3 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/fmv/mrs.inc @@ -109,6 +109,8 @@ static void __init_cpu_features_constructor(unsigned long hwcap, setCPUFeature(FEAT_SME_I64); if (hwcap2 & HWCAP2_SME_F64F64) setCPUFeature(FEAT_SME_F64); + if (hwcap2 & HWCAP2_MOPS) + setCPUFeature(FEAT_MOPS); if (hwcap & HWCAP_CPUID) { unsigned long ftr; getCPUFeature(ID_AA64PFR1_EL1, ftr); diff --git a/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc b/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc index 328d9c4140b9c..7ddc125b26da7 100644 --- a/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc +++ b/compiler-rt/lib/builtins/cpu_model/aarch64/hwcap.inc @@ -178,3 +178,6 @@ #ifndef HWCAP2_SVE_EBF16 #define HWCAP2_SVE_EBF16 (1ULL << 33) #endif +#ifndef HWCAP2_MOPS +#define HWCAP2_MOPS (1ULL << 43) +#endif diff --git a/llvm/include/llvm/TargetParser/AArch64TargetParser.h b/llvm/include/llvm/TargetParser/AArch64TargetParser.h index ddf5ab4d2df36..623fdc21ba65a 100644 --- a/llvm/include/llvm/TargetParser/AArch64TargetParser.h +++ b/llvm/include/llvm/TargetParser/AArch64TargetParser.h @@ -94,6 +94,7 @@ enum CPUFeatures { FEAT_SME_I64, FEAT_SME2, FEAT_RCPC3, + FEAT_MOPS, FEAT_MAX, FEAT_EXT = 62, FEAT_INIT @@ -246,7 +247,7 @@ inline constexpr ExtensionInfo Extensions[] = { {"memtag", AArch64::AEK_MTE, "+mte", "-mte", FEAT_MEMTAG, "", 440}, 
{"memtag2", AArch64::AEK_NONE, {}, {}, FEAT_MEMTAG2, "+mte", 450}, {"memtag3", AArch64::AEK_NONE, {}, {}, FEAT_MEMTAG3, "+mte", 460}, - {"mops", AArch64::AEK_MOPS, "+mops", "-mops", FEAT_INIT, "", 0}, + {"mops", AArch64::AEK_MOPS, "+mops", "-mops", FEAT_MOPS, "+mops", 650}, {"pauth", AArch64::AEK_PAUTH, "+pauth", "-pauth", FEAT_INIT, "", 0}, {"pmull", AArch64::AEK_NONE, {}, {}, FEAT_PMULL, "+aes,+fp-armv8,+neon", 160}, {"pmuv3", AArch64::AEK_PERFMON, "+perfmon", "-perfmon", FEAT_INIT, "", 0}, From 0c02b2e0e0397b5d73dc4c8cc10561f05789ee41 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 23 Jan 2024 16:53:45 +0100 Subject: [PATCH 637/843] [LAA] Add test for #79137 (NFC) --- .../LoopAccessAnalysis/noalias-scope-decl.ll | 94 +++++++++++++++++++ 1 file changed, 94 insertions(+) create mode 100644 llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll diff --git a/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll b/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll new file mode 100644 index 0000000000000..98bb5f99a40a1 --- /dev/null +++ b/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll @@ -0,0 +1,94 @@ +; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes='print' -disable-output 2>&1 < %s | FileCheck %s + +; PR79137: If the noalias.scope.decl is located inside the loop, we cannot +; assume that the accesses don't alias across iterations. + +define void @test_scope_in_loop(ptr %arg, i64 %num) { +; CHECK-LABEL: 'test_scope_in_loop' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. +; CHECK-NEXT: SCEV assumptions: +; CHECK-EMPTY: +; CHECK-NEXT: Expressions re-written: +; +entry: + %icmp = icmp ult i64 %num, 2 + br i1 %icmp, label %exit, label %preheader + +preheader: + %arg.1 = getelementptr inbounds i8, ptr %arg, i64 1 + %end = add i64 %num, -2 + br label %loop + +loop: + %prev.ptr = phi ptr [ %cur.ptr, %loop ], [ %arg, %preheader ] + %iv = phi i64 [ %iv.next, %loop ], [ 0, %preheader ] + %cur.ptr = getelementptr inbounds i8, ptr %arg.1, i64 %iv + call void @llvm.experimental.noalias.scope.decl(metadata !0) + call void @llvm.experimental.noalias.scope.decl(metadata !3) + %load.prev = load i8, ptr %prev.ptr, align 1, !alias.scope !0, !noalias !3 + %load.cur = load i8, ptr %cur.ptr, align 1, !alias.scope !3 + %add = add i8 %load.cur, %load.prev + store i8 %add, ptr %cur.ptr, align 1, !alias.scope !3 + %iv.next = add nuw i64 %iv, 1 + %cmp = icmp eq i64 %iv, %end + br i1 %cmp, label %exit, label %loop + +exit: + ret void +} + +define void @test_scope_out_of_loop(ptr %arg, i64 %num) { +; CHECK-LABEL: 'test_scope_out_of_loop' +; CHECK-NEXT: loop: +; CHECK-NEXT: Memory dependences are safe +; CHECK-NEXT: Dependences: +; CHECK-NEXT: Run-time memory checks: +; CHECK-NEXT: Grouped accesses: +; CHECK-EMPTY: +; CHECK-NEXT: Non vectorizable stores to invariant address were not found in loop. 
+; CHECK-NEXT: SCEV assumptions:
+; CHECK-EMPTY:
+; CHECK-NEXT: Expressions re-written:
+;
+entry:
+  %icmp = icmp ult i64 %num, 2
+  br i1 %icmp, label %exit, label %preheader
+
+preheader:
+  call void @llvm.experimental.noalias.scope.decl(metadata !0)
+  call void @llvm.experimental.noalias.scope.decl(metadata !3)
+  %arg.1 = getelementptr inbounds i8, ptr %arg, i64 1
+  %end = add i64 %num, -2
+  br label %loop
+
+loop:
+  %prev.ptr = phi ptr [ %cur.ptr, %loop ], [ %arg, %preheader ]
+  %iv = phi i64 [ %iv.next, %loop ], [ 0, %preheader ]
+  %cur.ptr = getelementptr inbounds i8, ptr %arg.1, i64 %iv
+  %load.prev = load i8, ptr %prev.ptr, align 1, !alias.scope !0, !noalias !3
+  %load.cur = load i8, ptr %cur.ptr, align 1, !alias.scope !3
+  %add = add i8 %load.cur, %load.prev
+  store i8 %add, ptr %cur.ptr, align 1, !alias.scope !3
+  %iv.next = add nuw i64 %iv, 1
+  %cmp = icmp eq i64 %iv, %end
+  br i1 %cmp, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+declare void @llvm.experimental.noalias.scope.decl(metadata)
+
+!0 = !{!1}
+!1 = distinct !{!1, !2}
+!2 = distinct !{!2}
+!3 = !{!4}
+!4 = distinct !{!4, !5}
+!5 = distinct !{!5}

From 6ad4ed5f1dd0a8160cd3c7b75e63a0e2f6061b5b Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis
Date: Tue, 23 Jan 2024 15:57:19 +0000
Subject: [PATCH 638/843] [VecLib] Fix: Restore DebugFlag state in ReplaceWithVecLibTest (#78989)

It appears that Google Test runs multiple test modules from the same
invocation. As a result, setting `llvm::DebugFlag` in this test kept it on
in subsequent tests, which is not the expected behavior.

---
 llvm/unittests/Analysis/ReplaceWithVecLibTest.cpp | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/unittests/Analysis/ReplaceWithVecLibTest.cpp b/llvm/unittests/Analysis/ReplaceWithVecLibTest.cpp
index a1f0a4a894c8d..95f17f93bf4c0 100644
--- a/llvm/unittests/Analysis/ReplaceWithVecLibTest.cpp
+++ b/llvm/unittests/Analysis/ReplaceWithVecLibTest.cpp
@@ -72,9 +72,11 @@ class ReplaceWithVecLibTest : public ::testing::Test {
     PB.registerFunctionAnalyses(FAM);

     // Enable debugging and capture std error
+    bool DebugFlagPrev = llvm::DebugFlag;
     llvm::DebugFlag = true;
     testing::internal::CaptureStderr();
     FPM.run(*M->getFunction("foo"), FAM);
+    llvm::DebugFlag = DebugFlagPrev;
     return getLastLine(testing::internal::GetCapturedStderr());
   }
 };

From 632f44e5edee6395ef26953d41a0e681a121aad3 Mon Sep 17 00:00:00 2001
From: Stephen Tozer
Date: Tue, 23 Jan 2024 16:16:59 +0000
Subject: [PATCH 639/843] [RemoveDIs][DebugInfo] Handle DPVAssign in most transforms (#78986)

This patch trivially updates various opt passes to handle DPVAssigns. In
all cases, this means some combination of generifying existing code to
handle DPValues and DbgAssignIntrinsics, iterating over DPValues where
previously we did not, or duplicating code for DbgAssignIntrinsics to the
equivalent DPValue function (in inlining and salvageDebugInfo).
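The recurring shape of these changes is easiest to see in isolation. The
sketch below is a minimal, self-contained illustration of the generification
idiom, using hypothetical stand-in types (IntrinsicMarker, RecordMarker)
rather than LLVM's real DbgAssignIntrinsic and DPValue classes: the loop body
is written once as a generic lambda and applied to both marker lists, where
the patch itself uses for_each over at::getAssignmentMarkers(...) and
at::getDPVAssignmentMarkers(...).

  // Minimal sketch of the generification idiom; IntrinsicMarker and
  // RecordMarker are hypothetical stand-ins for DbgAssignIntrinsic and
  // DPValue, which expose matching member functions.
  #include <algorithm>
  #include <iostream>
  #include <vector>

  struct IntrinsicMarker {
    void setKillAddress() { std::cout << "intrinsic marker killed\n"; }
  };
  struct RecordMarker {
    void setKillAddress() { std::cout << "record marker killed\n"; }
  };

  int main() {
    std::vector<IntrinsicMarker> Intrinsics(2);
    std::vector<RecordMarker> Records(1);
    // One generic lambda replaces two near-identical loop bodies; it is
    // instantiated separately for each marker type.
    auto KillAddress = [](auto &Marker) { Marker.setKillAddress(); };
    std::for_each(Intrinsics.begin(), Intrinsics.end(), KillAddress);
    std::for_each(Records.begin(), Records.end(), KillAddress);
    return 0;
  }

Because both marker types expose the same member functions, the lambda is
instantiated once per type and the two code paths cannot drift apart.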
--- llvm/lib/Transforms/Coroutines/CoroFrame.cpp | 6 +- .../InstCombine/InstCombineCalls.cpp | 10 +- llvm/lib/Transforms/Scalar/ADCE.cpp | 5 + .../Scalar/DeadStoreElimination.cpp | 35 +++--- llvm/lib/Transforms/Utils/CodeExtractor.cpp | 5 +- llvm/lib/Transforms/Utils/InlineFunction.cpp | 18 ++- llvm/lib/Transforms/Utils/Local.cpp | 33 +++-- .../Utils/PromoteMemoryToRegister.cpp | 116 ++++++++++++------ llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 19 ++- llvm/lib/Transforms/Utils/ValueMapper.cpp | 12 +- .../Transforms/Vectorize/SLPVectorizer.cpp | 4 +- .../assignment-tracking/adce/no-delete.ll | 2 + .../dse/dse-after-memcpyopt-merge.ll | 1 + .../assignment-tracking/dse/shorten-offset.ll | 1 + .../assignment-tracking/dse/shorten.ll | 1 + .../Generic/assignment-tracking/inline/id.ll | 4 +- .../inline/inline-stores.ll | 2 + .../inline/shared-alloca.ll | 2 + .../inline/use-before-def.ll | 2 + .../do-not-remove-redundant-dbg.ll | 2 + .../assignment-tracking/instcombine/memset.ll | 2 + .../instcombine/sink-store.ll | 1 + .../assignment-tracking/instcombine/sink.ll | 2 + .../instcombine/store-new-type.ll | 1 + .../instcombine/storemerge.ll | 1 + .../Generic/assignment-tracking/licm/merge.ll | 1 + .../assignment-tracking/licm/multi-exit.ll | 1 + .../loop-deletion/dead-loop.ll | 2 + .../assignment-tracking/mem2reg/phi.ll | 2 + .../mem2reg/single-block-alloca.ll | 2 + .../mem2reg/single-store-alloca.ll | 2 + .../mem2reg/store-to-part-of-alloca.ll | 1 + .../memcpyopt/merge-stores.ll | 1 + .../mldst-motion/diamond.ll | 2 + .../parse-and-verify/instruction-type.ll | 2 + .../parse-and-verify/verify.ll | 2 + .../remove-redundant-fwd-scan-linked.ll | 2 + .../assignment-tracking/salvage-value.ll | 2 + .../simplifycfg/empty-block.ll | 2 + .../simplifycfg/speculated-store.ll | 2 + .../slp-vectorizer/merge-scalars.ll | 2 + .../HotColdSplit/invalid-dbg-assign.ll | 1 + 42 files changed, 227 insertions(+), 89 deletions(-) diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp index db09337215c10..e69c718f0ae3a 100644 --- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp @@ -963,7 +963,7 @@ static void cacheDIVar(FrameDataInfo &FrameData, if (DIVarCache.contains(V)) continue; - auto CacheIt = [&DIVarCache, V](auto Container) { + auto CacheIt = [&DIVarCache, V](const auto &Container) { auto *I = llvm::find_if(Container, [](auto *DDI) { return DDI->getExpression()->getNumElements() == 0; }); @@ -1868,7 +1868,7 @@ static void insertSpills(const FrameDataInfo &FrameData, coro::Shape &Shape) { // alias. if (F->getSubprogram()) { auto *CurDef = Def; - while (DIs.empty() && isa(CurDef)) { + while (DIs.empty() && DPVs.empty() && isa(CurDef)) { auto *LdInst = cast(CurDef); // Only consider ptr to ptr same type load. if (LdInst->getPointerOperandType() != LdInst->getType()) @@ -2966,7 +2966,7 @@ void coro::salvageDebugInfo( Function *F = DPV.getFunction(); // Follow the pointer arithmetic all the way to the incoming // function argument and convert into a DIExpression. 
- bool SkipOutermostLoad = DPV.getType() == DPValue::LocationType::Declare; + bool SkipOutermostLoad = DPV.isDbgDeclare(); Value *OriginalStorage = DPV.getVariableLocationOp(0); auto SalvagedInfo = ::salvageDebugInfoImpl( diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 64fbd5543a9e2..a647be2d26c76 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -282,10 +282,12 @@ Instruction *InstCombinerImpl::SimplifyAnyMemSet(AnyMemSetInst *MI) { Constant *FillVal = ConstantInt::get(ITy, Fill); StoreInst *S = Builder.CreateStore(FillVal, Dest, MI->isVolatile()); S->copyMetadata(*MI, LLVMContext::MD_DIAssignID); - for (auto *DAI : at::getAssignmentMarkers(S)) { - if (llvm::is_contained(DAI->location_ops(), FillC)) - DAI->replaceVariableLocationOp(FillC, FillVal); - } + auto replaceOpForAssignmentMarkers = [FillC, FillVal](auto *DbgAssign) { + if (llvm::is_contained(DbgAssign->location_ops(), FillC)) + DbgAssign->replaceVariableLocationOp(FillC, FillVal); + }; + for_each(at::getAssignmentMarkers(S), replaceOpForAssignmentMarkers); + for_each(at::getDPVAssignmentMarkers(S), replaceOpForAssignmentMarkers); S->setAlignment(Alignment); if (isa(MI)) diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp index 9af275a9f4e20..90b544c892262 100644 --- a/llvm/lib/Transforms/Scalar/ADCE.cpp +++ b/llvm/lib/Transforms/Scalar/ADCE.cpp @@ -549,6 +549,11 @@ ADCEChanged AggressiveDeadCodeElimination::removeDeadInstructions() { // like the rest of this loop does. Extending support to assignment tracking // is future work. for (DPValue &DPV : make_early_inc_range(I.getDbgValueRange())) { + // Avoid removing a DPV that is linked to instructions because it holds + // information about an existing store. + if (DPV.isDbgAssign()) + if (!at::getAssignmentInsts(&DPV).empty()) + continue; if (AliveScopes.count(DPV.getDebugLoc()->getScope())) continue; I.dropOneDbgValue(&DPV); diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp index 008dcc53fd44f..11a91bfbe5baf 100644 --- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -488,27 +488,27 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest, uint64_t DeadSliceSizeInBits = OldSizeInBits - NewSizeInBits; uint64_t DeadSliceOffsetInBits = OldOffsetInBits + (IsOverwriteEnd ? NewSizeInBits : 0); - auto SetDeadFragExpr = [](DbgAssignIntrinsic *DAI, + auto SetDeadFragExpr = [](auto *Assign, DIExpression::FragmentInfo DeadFragment) { // createFragmentExpression expects an offset relative to the existing // fragment offset if there is one. uint64_t RelativeOffset = DeadFragment.OffsetInBits - - DAI->getExpression() + Assign->getExpression() ->getFragmentInfo() .value_or(DIExpression::FragmentInfo(0, 0)) .OffsetInBits; if (auto NewExpr = DIExpression::createFragmentExpression( - DAI->getExpression(), RelativeOffset, DeadFragment.SizeInBits)) { - DAI->setExpression(*NewExpr); + Assign->getExpression(), RelativeOffset, DeadFragment.SizeInBits)) { + Assign->setExpression(*NewExpr); return; } // Failed to create a fragment expression for this so discard the value, // making this a kill location. 
auto *Expr = *DIExpression::createFragmentExpression( - DIExpression::get(DAI->getContext(), std::nullopt), + DIExpression::get(Assign->getContext(), std::nullopt), DeadFragment.OffsetInBits, DeadFragment.SizeInBits); - DAI->setExpression(Expr); - DAI->setKillLocation(); + Assign->setExpression(Expr); + Assign->setKillLocation(); }; // A DIAssignID to use so that the inserted dbg.assign intrinsics do not @@ -526,32 +526,35 @@ static void shortenAssignment(Instruction *Inst, Value *OriginalDest, // returned by getAssignmentMarkers so save a copy of the markers to iterate // over. auto LinkedRange = at::getAssignmentMarkers(Inst); + SmallVector LinkedDPVAssigns = at::getDPVAssignmentMarkers(Inst); SmallVector Linked(LinkedRange.begin(), LinkedRange.end()); - for (auto *DAI : Linked) { + auto InsertAssignForOverlap = [&](auto *Assign) { std::optional NewFragment; if (!at::calculateFragmentIntersect(DL, OriginalDest, DeadSliceOffsetInBits, - DeadSliceSizeInBits, DAI, + DeadSliceSizeInBits, Assign, NewFragment) || !NewFragment) { // We couldn't calculate the intersecting fragment for some reason. Be // cautious and unlink the whole assignment from the store. - DAI->setKillAddress(); - DAI->setAssignId(GetDeadLink()); - continue; + Assign->setKillAddress(); + Assign->setAssignId(GetDeadLink()); + return; } // No intersect. if (NewFragment->SizeInBits == 0) - continue; + return; // Fragments overlap: insert a new dbg.assign for this dead part. - auto *NewAssign = cast(DAI->clone()); - NewAssign->insertAfter(DAI); + auto *NewAssign = static_cast(Assign->clone()); + NewAssign->insertAfter(Assign); NewAssign->setAssignId(GetDeadLink()); if (NewFragment) SetDeadFragExpr(NewAssign, *NewFragment); NewAssign->setKillAddress(); - } + }; + for_each(Linked, InsertAssignForOverlap); + for_each(LinkedDPVAssigns, InsertAssignForOverlap); } static bool tryToShorten(Instruction *DeadI, int64_t &DeadStart, diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp index c6a2a26458297..278111883459b 100644 --- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp +++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp @@ -1589,11 +1589,14 @@ static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc, for (auto &DPV : I.getDbgValueRange()) { // Apply the two updates that dbg.values get: invalid operands, and // variable metadata fixup. - // FIXME: support dbg.assign form of DPValues. if (any_of(DPV.location_ops(), IsInvalidLocation)) { DPVsToDelete.push_back(&DPV); continue; } + if (DPV.isDbgAssign() && IsInvalidLocation(DPV.getAddress())) { + DPVsToDelete.push_back(&DPV); + continue; + } if (!DPV.getDebugLoc().getInlinedAt()) DPV.setVariable(GetUpdatedDIVariable(DPV.getVariable())); DPV.setDebugLoc(DebugLoc::replaceInlinedAtSubprogram(DPV.getDebugLoc(), diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 39d5f6e53c1de..d4d4bf5ebdf36 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -1789,13 +1789,15 @@ static at::StorageToVarsMap collectEscapedLocals(const DataLayout &DL, continue; // Find all local variables associated with the backing storage. - for (auto *DAI : at::getAssignmentMarkers(Base)) { + auto CollectAssignsForStorage = [&](auto *DbgAssign) { // Skip variables from inlined functions - they are not local variables. 
- if (DAI->getDebugLoc().getInlinedAt()) - continue; - LLVM_DEBUG(errs() << " > DEF : " << *DAI << "\n"); - EscapedLocals[Base].insert(at::VarRecord(DAI)); - } + if (DbgAssign->getDebugLoc().getInlinedAt()) + return; + LLVM_DEBUG(errs() << " > DEF : " << *DbgAssign << "\n"); + EscapedLocals[Base].insert(at::VarRecord(DbgAssign)); + }; + for_each(at::getAssignmentMarkers(Base), CollectAssignsForStorage); + for_each(at::getDPVAssignmentMarkers(Base), CollectAssignsForStorage); } return EscapedLocals; } @@ -1827,6 +1829,10 @@ static void fixupAssignments(Function::iterator Start, Function::iterator End) { // attachment or use, replace it with a new version. for (auto BBI = Start; BBI != End; ++BBI) { for (Instruction &I : *BBI) { + for (DPValue &DPV : I.getDbgValueRange()) { + if (DPV.isDbgAssign()) + DPV.setAssignId(GetNewID(DPV.getAssignID())); + } if (auto *ID = I.getMetadata(LLVMContext::MD_DIAssignID)) I.setMetadata(LLVMContext::MD_DIAssignID, GetNewID(ID)); else if (auto *DAI = dyn_cast(&I)) diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp index 2a1ac85ee55bf..459e3d9805928 100644 --- a/llvm/lib/Transforms/Utils/Local.cpp +++ b/llvm/lib/Transforms/Utils/Local.cpp @@ -1753,7 +1753,7 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII, void llvm::ConvertDebugDeclareToDebugValue(DPValue *DPV, StoreInst *SI, DIBuilder &Builder) { - assert(DPV->isAddressOfVariable()); + assert(DPV->isAddressOfVariable() || DPV->isDbgAssign()); auto *DIVar = DPV->getVariable(); assert(DIVar && "Missing variable"); auto *DIExpr = DPV->getExpression(); @@ -2189,14 +2189,13 @@ void llvm::salvageDebugInfo(Instruction &I) { salvageDebugInfoForDbgValues(I, DbgUsers, DPUsers); } -/// Salvage the address component of \p DAI. -static void salvageDbgAssignAddress(DbgAssignIntrinsic *DAI) { - Instruction *I = dyn_cast(DAI->getAddress()); +template static void salvageDbgAssignAddress(T *Assign) { + Instruction *I = dyn_cast(Assign->getAddress()); // Only instructions can be salvaged at the moment. if (!I) return; - assert(!DAI->getAddressExpression()->getFragmentInfo().has_value() && + assert(!Assign->getAddressExpression()->getFragmentInfo().has_value() && "address-expression shouldn't have fragment info"); // The address component of a dbg.assign cannot be variadic. @@ -2210,16 +2209,16 @@ static void salvageDbgAssignAddress(DbgAssignIntrinsic *DAI) { return; DIExpression *SalvagedExpr = DIExpression::appendOpsToArg( - DAI->getAddressExpression(), Ops, 0, /*StackValue=*/false); + Assign->getAddressExpression(), Ops, 0, /*StackValue=*/false); assert(!SalvagedExpr->getFragmentInfo().has_value() && "address-expression shouldn't have fragment info"); // Salvage succeeds if no additional values are required. if (AdditionalValues.empty()) { - DAI->setAddress(NewV); - DAI->setAddressExpression(SalvagedExpr); + Assign->setAddress(NewV); + Assign->setAddressExpression(SalvagedExpr); } else { - DAI->setKillAddress(); + Assign->setKillAddress(); } } @@ -2293,10 +2292,19 @@ void llvm::salvageDebugInfoForDbgValues( } // Duplicate of above block for DPValues. for (auto *DPV : DPUsers) { + if (DPV->isDbgAssign()) { + if (DPV->getAddress() == &I) { + salvageDbgAssignAddress(DPV); + Salvaged = true; + } + if (DPV->getValue() != &I) + continue; + } + // Do not add DW_OP_stack_value for DbgDeclare and DbgAddr, because they // are implicitly pointing out the value as a DWARF memory location // description. 
- bool StackValue = DPV->getType() == DPValue::LocationType::Value; + bool StackValue = DPV->getType() != DPValue::LocationType::Declare; auto DPVLocation = DPV->location_ops(); assert( is_contained(DPVLocation, &I) && @@ -2330,7 +2338,7 @@ void llvm::salvageDebugInfoForDbgValues( SalvagedExpr->getNumElements() <= MaxExpressionSize; if (AdditionalValues.empty() && IsValidSalvageExpr) { DPV->setExpression(SalvagedExpr); - } else if (DPV->getType() == DPValue::LocationType::Value && + } else if (DPV->getType() != DPValue::LocationType::Declare && IsValidSalvageExpr && DPV->getNumVariableLocationOps() + AdditionalValues.size() <= MaxDebugArgs) { @@ -2340,8 +2348,7 @@ void llvm::salvageDebugInfoForDbgValues( // currently only valid for stack value expressions. // Also do not salvage if the resulting DIArgList would contain an // unreasonably large number of values. - Value *Undef = UndefValue::get(I.getOperand(0)->getType()); - DPV->replaceVariableLocationOp(I.getOperand(0), Undef); + DPV->setKillLocation(); } LLVM_DEBUG(dbgs() << "SALVAGE: " << DPV << '\n'); Salvaged = true; diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp index 717b6d301c8c9..88b05aab8db4d 100644 --- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp +++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp @@ -101,12 +101,30 @@ bool llvm::isAllocaPromotable(const AllocaInst *AI) { namespace { +static DPValue *createDebugValue(DIBuilder &DIB, Value *NewValue, + DILocalVariable *Variable, + DIExpression *Expression, const DILocation *DI, + DPValue *InsertBefore) { + (void)DIB; + return DPValue::createDPValue(NewValue, Variable, Expression, DI, + *InsertBefore); +} +static DbgValueInst *createDebugValue(DIBuilder &DIB, Value *NewValue, + DILocalVariable *Variable, + DIExpression *Expression, + const DILocation *DI, + Instruction *InsertBefore) { + return static_cast(DIB.insertDbgValueIntrinsic( + NewValue, Variable, Expression, DI, InsertBefore)); +} + /// Helper for updating assignment tracking debug info when promoting allocas. class AssignmentTrackingInfo { /// DbgAssignIntrinsics linked to the alloca with at most one per variable /// fragment. (i.e. not be a comprehensive set if there are multiple /// dbg.assigns for one variable fragment). SmallVector DbgAssigns; + SmallVector DPVAssigns; public: void init(AllocaInst *AI) { @@ -115,16 +133,21 @@ class AssignmentTrackingInfo { if (Vars.insert(DebugVariable(DAI)).second) DbgAssigns.push_back(DAI); } + for (DPValue *DPV : at::getDPVAssignmentMarkers(AI)) { + if (Vars.insert(DebugVariable(DPV)).second) + DPVAssigns.push_back(DPV); + } } /// Update assignment tracking debug info given for the to-be-deleted store /// \p ToDelete that stores to this alloca. - void updateForDeletedStore( - StoreInst *ToDelete, DIBuilder &DIB, - SmallSet *DbgAssignsToDelete) const { + void + updateForDeletedStore(StoreInst *ToDelete, DIBuilder &DIB, + SmallSet *DbgAssignsToDelete, + SmallSet *DPVAssignsToDelete) const { // There's nothing to do if the alloca doesn't have any variables using // assignment tracking. - if (DbgAssigns.empty()) + if (DbgAssigns.empty() && DPVAssigns.empty()) return; // Insert a dbg.value where the linked dbg.assign is and remember to delete @@ -134,13 +157,17 @@ class AssignmentTrackingInfo { // dbg.assign for each variable fragment for the untracked store handling // (after this loop). 
SmallSet VarHasDbgAssignForStore; - for (DbgAssignIntrinsic *DAI : at::getAssignmentMarkers(ToDelete)) { - VarHasDbgAssignForStore.insert(DebugVariableAggregate(DAI)); - DbgAssignsToDelete->insert(DAI); - DIB.insertDbgValueIntrinsic(DAI->getValue(), DAI->getVariable(), - DAI->getExpression(), DAI->getDebugLoc(), - DAI); - } + auto InsertValueForAssign = [&](auto *DbgAssign, auto *&AssignList) { + VarHasDbgAssignForStore.insert(DebugVariableAggregate(DbgAssign)); + AssignList->insert(DbgAssign); + createDebugValue(DIB, DbgAssign->getValue(), DbgAssign->getVariable(), + DbgAssign->getExpression(), DbgAssign->getDebugLoc(), + DbgAssign); + }; + for (auto *Assign : at::getAssignmentMarkers(ToDelete)) + InsertValueForAssign(Assign, DbgAssignsToDelete); + for (auto *Assign : at::getDPVAssignmentMarkers(ToDelete)) + InsertValueForAssign(Assign, DPVAssignsToDelete); // It's possible for variables using assignment tracking to have no // dbg.assign linked to this store. These are variables in DbgAssigns that @@ -150,11 +177,13 @@ class AssignmentTrackingInfo { // cannot be represented using assignment tracking (non-const offset or // size) or one that is trackable but has had its DIAssignID attachment // dropped accidentally. - for (auto *DAI : DbgAssigns) { - if (VarHasDbgAssignForStore.contains(DebugVariableAggregate(DAI))) - continue; - ConvertDebugDeclareToDebugValue(DAI, ToDelete, DIB); - } + auto ConvertUnlinkedAssignToValue = [&](auto *Assign) { + if (VarHasDbgAssignForStore.contains(DebugVariableAggregate(Assign))) + return; + ConvertDebugDeclareToDebugValue(Assign, ToDelete, DIB); + }; + for_each(DbgAssigns, ConvertUnlinkedAssignToValue); + for_each(DPVAssigns, ConvertUnlinkedAssignToValue); } /// Update assignment tracking debug info given for the newly inserted PHI \p @@ -165,10 +194,15 @@ class AssignmentTrackingInfo { // debug-phi. for (auto *DAI : DbgAssigns) ConvertDebugDeclareToDebugValue(DAI, NewPhi, DIB); + for (auto *DPV : DPVAssigns) + ConvertDebugDeclareToDebugValue(DPV, NewPhi, DIB); } - void clear() { DbgAssigns.clear(); } - bool empty() { return DbgAssigns.empty(); } + void clear() { + DbgAssigns.clear(); + DPVAssigns.clear(); + } + bool empty() { return DbgAssigns.empty() && DPVAssigns.empty(); } }; struct AllocaInfo { @@ -229,11 +263,15 @@ struct AllocaInfo { } } DbgUserVec AllDbgUsers; - findDbgUsers(AllDbgUsers, AI, &DPUsers); + SmallVector AllDPUsers; + findDbgUsers(AllDbgUsers, AI, &AllDPUsers); std::copy_if(AllDbgUsers.begin(), AllDbgUsers.end(), std::back_inserter(DbgUsers), [](DbgVariableIntrinsic *DII) { return !isa(DII); }); + std::copy_if(AllDPUsers.begin(), AllDPUsers.end(), + std::back_inserter(DPUsers), + [](DPValue *DPV) { return !DPV->isDbgAssign(); }); AssignmentTracking.init(AI); } }; @@ -341,6 +379,7 @@ struct PromoteMem2Reg { /// A set of dbg.assigns to delete because they've been demoted to /// dbg.values. Call cleanUpDbgAssigns to delete them. SmallSet DbgAssignsToDelete; + SmallSet DPVAssignsToDelete; /// The set of basic blocks the renamer has already visited. SmallPtrSet Visited; @@ -390,6 +429,9 @@ struct PromoteMem2Reg { for (auto *DAI : DbgAssignsToDelete) DAI->eraseFromParent(); DbgAssignsToDelete.clear(); + for (auto *DPV : DPVAssignsToDelete) + DPV->eraseFromParent(); + DPVAssignsToDelete.clear(); } }; @@ -462,10 +504,12 @@ static void removeIntrinsicUsers(AllocaInst *AI) { /// false there were some loads which were not dominated by the single store /// and thus must be phi-ed with undef. 
We fall back to the standard alloca /// promotion algorithm in that case. -static bool rewriteSingleStoreAlloca( - AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, const DataLayout &DL, - DominatorTree &DT, AssumptionCache *AC, - SmallSet *DbgAssignsToDelete) { +static bool +rewriteSingleStoreAlloca(AllocaInst *AI, AllocaInfo &Info, LargeBlockInfo &LBI, + const DataLayout &DL, DominatorTree &DT, + AssumptionCache *AC, + SmallSet *DbgAssignsToDelete, + SmallSet *DPVAssignsToDelete) { StoreInst *OnlyStore = Info.OnlyStore; bool StoringGlobalVal = !isa(OnlyStore->getOperand(0)); BasicBlock *StoreBB = OnlyStore->getParent(); @@ -525,8 +569,8 @@ static bool rewriteSingleStoreAlloca( DIBuilder DIB(*AI->getModule(), /*AllowUnresolved*/ false); // Update assignment tracking info for the store we're going to delete. - Info.AssignmentTracking.updateForDeletedStore(Info.OnlyStore, DIB, - DbgAssignsToDelete); + Info.AssignmentTracking.updateForDeletedStore( + Info.OnlyStore, DIB, DbgAssignsToDelete, DPVAssignsToDelete); // Record debuginfo for the store and remove the declaration's // debuginfo. @@ -570,10 +614,12 @@ static bool rewriteSingleStoreAlloca( /// use(t); /// *A = 42; /// } -static bool promoteSingleBlockAlloca( - AllocaInst *AI, const AllocaInfo &Info, LargeBlockInfo &LBI, - const DataLayout &DL, DominatorTree &DT, AssumptionCache *AC, - SmallSet *DbgAssignsToDelete) { +static bool +promoteSingleBlockAlloca(AllocaInst *AI, const AllocaInfo &Info, + LargeBlockInfo &LBI, const DataLayout &DL, + DominatorTree &DT, AssumptionCache *AC, + SmallSet *DbgAssignsToDelete, + SmallSet *DPVAssignsToDelete) { // The trickiest case to handle is when we have large blocks. Because of this, // this code is optimized assuming that large blocks happen. This does not // significantly pessimize the small block case. This uses LargeBlockInfo to @@ -637,8 +683,8 @@ static bool promoteSingleBlockAlloca( while (!AI->use_empty()) { StoreInst *SI = cast(AI->user_back()); // Update assignment tracking info for the store we're going to delete. - Info.AssignmentTracking.updateForDeletedStore(SI, DIB, DbgAssignsToDelete); - + Info.AssignmentTracking.updateForDeletedStore(SI, DIB, DbgAssignsToDelete, + DPVAssignsToDelete); // Record debuginfo for the store before removing it. auto DbgUpdateForStore = [&](auto &Container) { for (auto *DbgItem : Container) { @@ -710,7 +756,7 @@ void PromoteMem2Reg::run() { // it that are directly dominated by the definition with the value stored. if (Info.DefiningBlocks.size() == 1) { if (rewriteSingleStoreAlloca(AI, Info, LBI, SQ.DL, DT, AC, - &DbgAssignsToDelete)) { + &DbgAssignsToDelete, &DPVAssignsToDelete)) { // The alloca has been processed, move on. RemoveFromAllocasList(AllocaNum); ++NumSingleStore; @@ -722,7 +768,7 @@ void PromoteMem2Reg::run() { // linear sweep over the block to eliminate it. if (Info.OnlyUsedInOneBlock && promoteSingleBlockAlloca(AI, Info, LBI, SQ.DL, DT, AC, - &DbgAssignsToDelete)) { + &DbgAssignsToDelete, &DPVAssignsToDelete)) { // The alloca has been processed, move on. RemoveFromAllocasList(AllocaNum); continue; @@ -1128,8 +1174,8 @@ void PromoteMem2Reg::RenamePass(BasicBlock *BB, BasicBlock *Pred, // Record debuginfo for the store before removing it. 
IncomingLocs[AllocaNo] = SI->getDebugLoc(); - AllocaATInfo[AllocaNo].updateForDeletedStore(SI, DIB, - &DbgAssignsToDelete); + AllocaATInfo[AllocaNo].updateForDeletedStore(SI, DIB, &DbgAssignsToDelete, + &DPVAssignsToDelete); auto ConvertDbgDeclares = [&](auto &Container) { for (auto *DbgItem : Container) if (DbgItem->isAddressOfVariable()) diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index f3994b6cc39fe..13eae549b2ce4 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -1770,6 +1770,11 @@ bool SimplifyCFGOpt::hoistSuccIdenticalTerminatorToSwitchOrIf( Locs.push_back(I1->getDebugLoc()); for (auto *OtherSuccTI : OtherSuccTIs) Locs.push_back(OtherSuccTI->getDebugLoc()); + // Also clone DPValues from the existing terminator, and all others (to + // duplicate existing hoisting behaviour). + NT->cloneDebugInfoFrom(I1); + for (Instruction *OtherSuccTI : OtherSuccTIs) + NT->cloneDebugInfoFrom(OtherSuccTI); NT->setDebugLoc(DILocation::getMergedLocations(Locs)); // PHIs created below will adopt NT's merged DebugLoc. @@ -3101,10 +3106,12 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, // %merge = select %cond, %two, %one // store %merge, %x.dest, !DIAssignID !2 // dbg.assign %merge, "x", ..., !2 - for (auto *DAI : at::getAssignmentMarkers(SpeculatedStore)) { - if (llvm::is_contained(DAI->location_ops(), OrigV)) - DAI->replaceVariableLocationOp(OrigV, S); - } + auto replaceVariable = [OrigV, S](auto *DbgAssign) { + if (llvm::is_contained(DbgAssign->location_ops(), OrigV)) + DbgAssign->replaceVariableLocationOp(OrigV, S); + }; + for_each(at::getAssignmentMarkers(SpeculatedStore), replaceVariable); + for_each(at::getDPVAssignmentMarkers(SpeculatedStore), replaceVariable); } // Metadata can be dependent on the condition we are hoisting above. @@ -3133,7 +3140,9 @@ bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, // instructions, in the same way that dbg.value intrinsics are dropped at the // end of this block. for (auto &It : make_range(ThenBB->begin(), ThenBB->end())) - It.dropDbgValues(); + for (DPValue &DPV : make_early_inc_range(It.getDbgValueRange())) + if (!DPV.isDbgAssign()) + It.dropOneDbgValue(&DPV); BB->splice(BI->getIterator(), ThenBB, ThenBB->begin(), std::prev(ThenBB->end())); diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp index 71d0f09e47713..380541ffdd49d 100644 --- a/llvm/lib/Transforms/Utils/ValueMapper.cpp +++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp @@ -544,6 +544,16 @@ void Mapper::remapDPValue(DPValue &V) { V.setVariable(cast(MappedVar)); V.setDebugLoc(DebugLoc(cast(MappedDILoc))); + bool IgnoreMissingLocals = Flags & RF_IgnoreMissingLocals; + + if (V.isDbgAssign()) { + auto *NewAddr = mapValue(V.getAddress()); + if (!IgnoreMissingLocals && !NewAddr) + V.setKillAddress(); + else if (NewAddr) + V.setAddress(NewAddr); + } + // Find Value operands and remap those. SmallVector Vals, NewVals; for (Value *Val : V.location_ops()) @@ -555,8 +565,6 @@ void Mapper::remapDPValue(DPValue &V) { if (Vals == NewVals) return; - bool IgnoreMissingLocals = Flags & RF_IgnoreMissingLocals; - // Otherwise, do some replacement. 
if (!IgnoreMissingLocals && llvm::any_of(NewVals, [&](Value *V) { return V == nullptr; })) { diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 482970bbf3061..809d740aae3e0 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -12988,8 +12988,8 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) { for (ScheduleData *BundleMember = Picked; BundleMember; BundleMember = BundleMember->NextInBundle) { Instruction *PickedInst = BundleMember->Inst; - if (PickedInst->getNextNode() != LastScheduledInst) - PickedInst->moveBefore(LastScheduledInst); + if (PickedInst->getNextNonDebugInstruction() != LastScheduledInst) + PickedInst->moveAfter(LastScheduledInst->getPrevNode()); LastScheduledInst = PickedInst; } diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/adce/no-delete.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/adce/no-delete.ll index 8fce6ab629666..93931c6f68e34 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/adce/no-delete.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/adce/no-delete.ll @@ -1,5 +1,7 @@ ; RUN: opt %s -passes=adce -S -o - \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -passes=adce -S -o - \ +; RUN: | FileCheck %s ;; $ cat test.c ;; void fun(int local) {} diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll index f1704f4bcc98b..5c92b1aa2b36f 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/dse/dse-after-memcpyopt-merge.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -S -passes=dse -o - | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=dse -o - | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Observed in the wild, but test is created by running memcpyopt on ;; assignment-tracking/memcpyopt/merge-stores.ll then manually inserting diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/dse/shorten-offset.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/dse/shorten-offset.ll index b6dc56acf7636..c7420dfa31775 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/dse/shorten-offset.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/dse/shorten-offset.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -S -passes=dse -o - | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=dse -o - | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Based on the test shorten.ll with some adjustments. 
;; diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/dse/shorten.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/dse/shorten.ll index 37d22f4df576a..a2cf5923d680f 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/dse/shorten.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/dse/shorten.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -S -passes=dse -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=dse -o - | FileCheck %s ;; $ cat test.cpp ;; void esc(int*); diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/inline/id.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/inline/id.ll index 51d9511ec17b9..7260ec54360d1 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/inline/id.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/inline/id.ll @@ -1,5 +1,5 @@ -; RUN: opt %s -S -passes=inline -o - \ -; RUN: | FileCheck %s +; RUN: opt %s -S -passes=inline -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=inline -o - | FileCheck %s ;; Check that all DIAssignID metadata that are inlined are replaced with new ;; versions. Otherwise two inlined instances of an assignment will be considered diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/inline/inline-stores.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/inline/inline-stores.ll index 4d4a62947177c..1b004011af839 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/inline/inline-stores.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/inline/inline-stores.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=inline %s -S -o - \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=inline %s -S -o - \ +; RUN: | FileCheck %s ;; $ cat test.cpp ;; __attribute__((always_inline)) diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/inline/shared-alloca.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/inline/shared-alloca.ll index e77a3a58bdad2..33bc2f196872f 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/inline/shared-alloca.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/inline/shared-alloca.ll @@ -1,5 +1,7 @@ ; RUN: opt -S %s -passes=inline -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -S %s -passes=inline -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; The dbg.assign linked to the large alloca describes a variable sitting at ;; offset 0, size 64. 
Check: diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/inline/use-before-def.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/inline/use-before-def.ll index 84f00d5dd7a69..ff7e5b18a9439 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/inline/use-before-def.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/inline/use-before-def.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=inline %s -S -o - \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=inline %s -S -o - \ +; RUN: | FileCheck %s ;; Hand modified from: ;; $ cat test.c diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/do-not-remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/do-not-remove-redundant-dbg.ll index f14a6e63b63f0..9763064dac2b0 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/do-not-remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/do-not-remove-redundant-dbg.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=instcombine -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=instcombine -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Based on the test remove-redundant-dbg.ll. ;; diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/memset.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/memset.ll index b3a529597e20a..3e343ceb3d1bd 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/memset.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/memset.ll @@ -1,5 +1,7 @@ ; RUN: opt %s -S -passes=instcombine -o - \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=instcombine -o - \ +; RUN: | FileCheck %s ;; $ cat test.cpp ;; void esc(int*); diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/sink-store.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/sink-store.ll index 697df132b086d..668e9567e4fe4 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/sink-store.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/sink-store.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -S -passes=instcombine | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=instcombine | FileCheck %s ;; Check that instcombine merges the DIAssignID metadata when merging two ;; stores into a successor. Filecheck directives inline. diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/sink.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/sink.ll index cdbf876fdfaf9..f233e9ff704bb 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/sink.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/sink.ll @@ -1,5 +1,7 @@ ; RUN: opt %s -S -passes=instcombine -o - \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=instcombine -o - \ +; RUN: | FileCheck %s ;; Check that when instcombine sinks an instruction used by a dbg.assign, the ;; usual debug intrinsic updating doesn't take place (i.e. 
do not diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/store-new-type.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/store-new-type.ll index cbd35af94c512..cf427323dc2e5 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/store-new-type.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/store-new-type.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -passes=instcombine -S | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -passes=instcombine -S | FileCheck %s ;; Based on test/Transforms/InstCombine/shufflevec-bitcast.ll in which the ;; store of <4 x i4> is replaced with a store of type <2 x i8>. Debug info diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/storemerge.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/storemerge.ll index ce0c88775e5bd..c0ef6750d785f 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/storemerge.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/storemerge.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -S -passes=instcombine -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=instcombine -o - | FileCheck %s ;; $ cat test.cpp ;; class a { diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/licm/merge.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/licm/merge.ll index e4db307b14ebe..187194dd93261 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/licm/merge.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/licm/merge.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=licm %s -S | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=licm %s -S | FileCheck %s ;; Ensure that we correctly merge the DIAssignID's from the sunk stores, add it ;; to the new new store instruction, and update the dbg.assign intrinsics using diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/licm/multi-exit.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/licm/multi-exit.ll index 9f8723bd1dded..5684f4d42a8a5 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/licm/multi-exit.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/licm/multi-exit.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -S -passes=licm -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=licm -o - | FileCheck %s ;; $ cat test.c ;; int b, c, d; diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/loop-deletion/dead-loop.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/loop-deletion/dead-loop.ll index 0e3a700246279..afd5b18370535 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/loop-deletion/dead-loop.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/loop-deletion/dead-loop.ll @@ -1,5 +1,7 @@ ; RUN: opt %s -passes=loop-deletion -S -o - \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -passes=loop-deletion -S -o - \ +; RUN: | FileCheck %s ;; $ cat test.cpp: ;; void esc(int*); diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/phi.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/phi.ll index d7c1299788703..8a72377c6375f 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/phi.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/phi.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=mem2reg -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=mem2reg -S %s -o - \ +; RUN: | FileCheck 
%s --implicit-check-not="call void @llvm.dbg" ;; Test assignment tracking debug info when mem2reg promotes an alloca with ;; stores requiring insertion of a phi. Check the output when the stores are diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/single-block-alloca.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/single-block-alloca.ll index 26e61cd50e451..e3cbe89dceecb 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/single-block-alloca.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/single-block-alloca.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=mem2reg -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=mem2reg -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Test assignment tracking debug info when mem2reg promotes a single-block ;; alloca. Check the output when the stores are tagged and also untagged (test diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/single-store-alloca.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/single-store-alloca.ll index ea8f0b8a10e8c..1753ca5644e3e 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/single-store-alloca.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/single-store-alloca.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=mem2reg -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=mem2reg -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Test assignment tracking debug info when mem2reg promotes a single-store ;; alloca. Additionally, check that all the dbg.assigns linked to the alloca diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/store-to-part-of-alloca.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/store-to-part-of-alloca.ll index 5cf673cde2629..7242c4eafafce 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/store-to-part-of-alloca.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/mem2reg/store-to-part-of-alloca.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=mem2reg -S %s -o - | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=mem2reg -S %s -o - | FileCheck %s --implicit-check-not="call void @llvm.dbg" ; CHECK: llvm.dbg.value(metadata i64 0, metadata ![[#]], metadata !DIExpression(DW_OP_LLVM_fragment, 0, 32)) diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/memcpyopt/merge-stores.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/memcpyopt/merge-stores.ll index ed4f1dd1b628d..b62f1966d961e 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/memcpyopt/merge-stores.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/memcpyopt/merge-stores.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -S -passes=memcpyopt -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=memcpyopt -o - | FileCheck %s ;; $ cat test.cpp ;; struct v { diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/mldst-motion/diamond.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/mldst-motion/diamond.ll index ae86e747fc389..395c5b581c84c 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/mldst-motion/diamond.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/mldst-motion/diamond.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=mldst-motion -S %s -o - \ ; 
RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=mldst-motion -S %s -o - \ +; RUN: | FileCheck %s ;; $ cat test.cpp ;; int cond; diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/instruction-type.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/instruction-type.ll index 2ca9f0d431afd..95cba59a987e9 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/instruction-type.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/instruction-type.ll @@ -1,5 +1,7 @@ ; RUN: opt -S %s -passes=verify 2>&1 \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S %s -passes=verify 2>&1 \ +; RUN: | FileCheck %s ;; NOTE: Expect opt to return zero because the badly formed debug info ;; is going to be stripped. diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll index 0a4b7c255dc71..55ad1814463fb 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/verify.ll @@ -1,5 +1,7 @@ ; RUN: opt %s -S -passes=verify 2>&1 \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=verify 2>&1 \ +; RUN: | FileCheck %s ;; Check that badly formed assignment tracking metadata is caught either ;; while parsing or by the verifier. diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant-fwd-scan-linked.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant-fwd-scan-linked.ll index 0d9c6021a5302..d2cc9d9180fc4 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant-fwd-scan-linked.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant-fwd-scan-linked.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=redundant-dbg-inst-elim -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=redundant-dbg-inst-elim -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; $ cat -n reduce.c ;; 1 void ext(); diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/salvage-value.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/salvage-value.ll index ba591b1aa5425..3c1ef0791945f 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/salvage-value.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/salvage-value.ll @@ -1,5 +1,7 @@ ; RUN: opt %s -S -o - -passes=instcombine \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators %s -S -o - -passes=instcombine \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Hand-written (the debug info doesn't necessarily make sense and isn't fully ;; formed). Test salvaging a dbg.assign value and address. 
Checks and comments diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/simplifycfg/empty-block.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/simplifycfg/empty-block.ll index e4a22dc1d7026..338625eb27f76 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/simplifycfg/empty-block.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/simplifycfg/empty-block.ll @@ -1,5 +1,7 @@ ; RUN: opt -S %s -passes=simplifycfg -o - \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S %s -passes=simplifycfg -o - \ +; RUN: | FileCheck %s ;; $ cat test.cpp ;; class a {}; diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/simplifycfg/speculated-store.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/simplifycfg/speculated-store.ll index fd0110d16912c..7b1a80540c48a 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/simplifycfg/speculated-store.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/simplifycfg/speculated-store.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=simplifycfg %s -S \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=simplifycfg %s -S \ +; RUN: | FileCheck %s ;; Ensure that we correctly update the value component of dbg.assign intrinsics ;; after merging a conditional block with a store its the predecessor. The diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll index d675300395acf..ab1e1e966139b 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/slp-vectorizer/merge-scalars.ll @@ -1,6 +1,8 @@ ; REQUIRES: x86-registered-target ; RUN: opt -passes=slp-vectorizer -S -o - %s \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=slp-vectorizer -S -o - %s \ +; RUN: | FileCheck %s ;; $ cat test.cpp ;; float get(); diff --git a/llvm/test/Transforms/HotColdSplit/invalid-dbg-assign.ll b/llvm/test/Transforms/HotColdSplit/invalid-dbg-assign.ll index 3163fc7071e97..74a096019c664 100644 --- a/llvm/test/Transforms/HotColdSplit/invalid-dbg-assign.ll +++ b/llvm/test/Transforms/HotColdSplit/invalid-dbg-assign.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=hotcoldsplit -hotcoldsplit-threshold=-1 -S %s | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=hotcoldsplit -hotcoldsplit-threshold=-1 -S %s | FileCheck %s declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) ; CHECK: define void @foo From 6bb7d515c3577afbae9293caa050e02643a19355 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= Date: Tue, 23 Jan 2024 17:21:16 +0100 Subject: [PATCH 640/843] [AMDGPU] Properly check op_sel in GCNDPPCombine (#79122) --- llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp | 28 +++++-- .../test/CodeGen/AMDGPU/dpp_combine_gfx11.mir | 84 ++++++++++++++++++- 2 files changed, 100 insertions(+), 12 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp index a75082268c773..94d28dc0a2c74 100644 --- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp +++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp @@ -274,8 +274,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, break; } - if (auto *Mod0 = TII->getNamedOperand(OrigMI, - AMDGPU::OpName::src0_modifiers)) { + auto *Mod0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0_modifiers); + if (Mod0) { assert(NumOperands == 
AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src0_modifiers)); assert(HasVOP3DPP || @@ -298,8 +298,8 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst->getOperand(NumOperands).setIsKill(false); ++NumOperands; - if (auto *Mod1 = TII->getNamedOperand(OrigMI, - AMDGPU::OpName::src1_modifiers)) { + auto *Mod1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1_modifiers); + if (Mod1) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src1_modifiers)); assert(HasVOP3DPP || @@ -330,8 +330,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.add(*Src1); ++NumOperands; } - if (auto *Mod2 = - TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers)) { + + auto *Mod2 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src2_modifiers); + if (Mod2) { assert(NumOperands == AMDGPU::getNamedOperandIdx(DPPOp, AMDGPU::OpName::src2_modifiers)); assert(HasVOP3DPP || @@ -350,6 +351,7 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, DPPInst.add(*Src2); ++NumOperands; } + if (HasVOP3DPP) { auto *ClampOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::clamp); if (ClampOpr && AMDGPU::hasNamedOperand(DPPOp, AMDGPU::OpName::clamp)) { @@ -368,7 +370,13 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, // all 1. if (auto *OpSelOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel)) { - auto OpSel = OpSelOpr->getImm(); + int64_t OpSel = 0; + OpSel |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_0) << 0) : 0); + OpSel |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_0) << 1) : 0); + OpSel |= (Mod2 ? (!!(Mod2->getImm() & SISrcMods::OP_SEL_0) << 2) : 0); + if (Mod0 && TII->isVOP3(OrigMI) && !TII->isVOP3P(OrigMI)) + OpSel |= !!(Mod0->getImm() & SISrcMods::DST_OP_SEL) << 3; + if (OpSel != 0) { LLVM_DEBUG(dbgs() << " failed: op_sel must be zero\n"); Fail = true; @@ -379,7 +387,11 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI, } if (auto *OpSelHiOpr = TII->getNamedOperand(OrigMI, AMDGPU::OpName::op_sel_hi)) { - auto OpSelHi = OpSelHiOpr->getImm(); + int64_t OpSelHi = 0; + OpSelHi |= (Mod0 ? (!!(Mod0->getImm() & SISrcMods::OP_SEL_1) << 0) : 0); + OpSelHi |= (Mod1 ? (!!(Mod1->getImm() & SISrcMods::OP_SEL_1) << 1) : 0); + OpSelHi |= (Mod2 ? 
(!!(Mod2->getImm() & SISrcMods::OP_SEL_1) << 2) : 0); + // Only vop3p has op_sel_hi, and all vop3p have 3 operands, so check // the bitmask for 3 op_sel_hi bits set assert(Src2 && "Expected vop3p with 3 operands"); diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir index 89ea3ed12aea3..c48231f3851a7 100644 --- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir +++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir @@ -83,7 +83,8 @@ body: | # Regression test for src_modifiers on base u16 opcode # GCN-label: name: vop3_u16 # GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec -# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 4, %5, 8, %5, 0, 0, 1, 15, 15, 1, implicit $exec +# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec +# GCN: %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec name: vop3_u16 tracksRegLiveness: true body: | @@ -97,7 +98,9 @@ body: | %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec %5:vgpr_32 = V_ADD_NC_U16_e64 0, %4, 0, %3, 0, 0, implicit $exec %6:vgpr_32 = V_MOV_B32_dpp %3, %5, 1, 15, 15, 1, implicit $exec - %7:vgpr_32 = V_ADD_NC_U16_e64 4, %6, 8, %5, 0, 0, implicit $exec + %7:vgpr_32 = V_ADD_NC_U16_e64 1, %6, 2, %5, 0, 0, implicit $exec + %8:vgpr_32 = V_MOV_B32_dpp %3, %7, 1, 15, 15, 1, implicit $exec + %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec ... name: vop3p @@ -116,7 +119,7 @@ body: | ; GCN: [[V_DOT2_F32_F16_:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp]], 0, [[COPY]], 0, [[COPY2]], 0, 5, 0, 0, 0, implicit $mode, implicit $exec ; GCN: [[V_MOV_B32_dpp1:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[DEF]], [[COPY1]], 1, 15, 15, 1, implicit $exec ; GCN: [[V_DOT2_F32_F16_1:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16 0, [[V_MOV_B32_dpp1]], 0, [[COPY]], 0, [[COPY2]], 0, 0, 4, 0, 0, implicit $mode, implicit $exec - ; GCN: [[V_DOT2_F32_F16_dpp:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16_dpp [[DEF]], 10, [[COPY1]], 8, [[COPY]], 13, [[COPY2]], 1, 0, 7, 4, 5, 1, 15, 15, 1, implicit $mode, implicit $exec + ; GCN: [[V_DOT2_F32_F16_dpp:%[0-9]+]]:vgpr_32 = V_DOT2_F32_F16_dpp [[DEF]], 10, [[COPY1]], 8, [[COPY]], 9, [[COPY2]], 1, 0, 7, 4, 5, 1, 15, 15, 1, implicit $mode, implicit $exec ; GCN: [[V_FMA_MIX_F32_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIX_F32_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec ; GCN: [[V_FMA_MIXLO_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXLO_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 0, [[COPY2]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec ; GCN: [[V_FMA_MIXHI_F16_dpp:%[0-9]+]]:vgpr_32 = V_FMA_MIXHI_F16_dpp [[DEF]], 8, [[COPY1]], 8, [[COPY]], 8, [[COPY2]], 1, [[COPY]], 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec @@ -134,7 +137,7 @@ body: | %7:vgpr_32 = V_DOT2_F32_F16 0, %6, 0, %0, 0, %2, 0, 0, 4, 0, 0, implicit $mode, implicit $exec %8:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec - %9:vgpr_32 = V_DOT2_F32_F16 10, %8, 8, %0, 13, %2, 1, 0, 7, 4, 5, implicit $mode, implicit $exec + %9:vgpr_32 = V_DOT2_F32_F16 10, %8, 8, %0, 9, %2, 1, 0, 7, 4, 5, implicit $mode, implicit $exec %10:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec %11:vgpr_32 = V_FMA_MIX_F32 8, %10, 8, %0, 8, %2, 1, 0, 7, implicit $mode, implicit $exec @@ -871,3 +874,76 @@ body: | %5:vgpr_32 = V_ADD_U32_e32 %4.sub0, %4.sub0, implicit $exec %6:vgpr_32 = V_ADDC_U32_e32 %4.sub1, %4.sub1, implicit-def $vcc, implicit $vcc, implicit $exec 
... + +# Check op_sel is all 0s when combining +# GCN-LABEL: name: opsel_vop3 +# GCN: %4:vgpr_32 = V_ADD_I16_e64_dpp %2, 0, %0, 0, %1, 0, 0, 1, 15, 15, 1, implicit $exec +# GCN: %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec +# GCN: %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec +# GCN: %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec +# GCN: %12:vgpr_32 = V_ADD_I16_e64 8, %11, 0, %1, 0, 0, implicit $exec +name: opsel_vop3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = IMPLICIT_DEF + + ; Combine for op_sel:[0,0,0] + %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %4:vgpr_32 = V_ADD_I16_e64 0, %3, 0, %1, 0, 0, implicit $exec + + ; Do not combine for op_sel:[1,0,0] + %5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec + + ; Do not combine for op_sel:[0,1,0] + %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec + + ; Do not combine for op_sel:[1,1,0] + %9:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec + + ; Do not combine for op_sel:[0,0,1] (dst_op_sel only) + %11:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec + %12:vgpr_32 = V_ADD_I16_e64 8, %11, 0, %1, 0, 0, implicit $exec +... + +# Check op_sel is all 0s and op_sel_hi is all 1s when combining +# GCN-LABEL: name: opsel_vop3p +# GCN: %5:vgpr_32 = V_FMA_MIX_F32 0, %4, 0, %1, 0, %2, 0, 0, 0, implicit $mode, implicit $exec +# GCN: %7:vgpr_32 = V_FMA_MIX_F32 4, %6, 4, %1, 4, %2, 0, 0, 0, implicit $mode, implicit $exec +# GCN: %9:vgpr_32 = V_FMA_MIX_F32_dpp %3, 8, %0, 8, %1, 8, %2, 0, 0, 7, 1, 15, 15, 1, implicit $mode, implicit $exec +# GCN: %11:vgpr_32 = V_FMA_MIX_F32 12, %10, 12, %1, 12, %2, 0, 0, 0, implicit $mode, implicit $exec + +name: opsel_vop3p +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2 + + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = COPY $vgpr1 + %2:vgpr_32 = COPY $vgpr2 + %3:vgpr_32 = IMPLICIT_DEF + + ; Do not combine for op_sel:[0,0,0] op_sel_hi:[0,0,0] + %4:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec + %5:vgpr_32 = V_FMA_MIX_F32 0, %4, 0, %1, 0, %2, 0, 0, 0, implicit $mode, implicit $exec + + ; Do not combine for op_sel:[1,1,1] op_sel_hi:[0,0,0] + %6:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec + %7:vgpr_32 = V_FMA_MIX_F32 4, %6, 4, %1, 4, %2, 0, 0, 0, implicit $mode, implicit $exec + + ; Combine for op_sel:[0,0,0] op_sel_hi:[1,1,1] + %8:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec + %9:vgpr_32 = V_FMA_MIX_F32 8, %8, 8, %1, 8, %2, 0, 0, 0, implicit $mode, implicit $exec + + ; Do not combine for op_sel:[1,1,1] op_sel_hi:[1,1,1] + %10:vgpr_32 = V_MOV_B32_dpp %3, %0, 1, 15, 15, 1, implicit $exec + %11:vgpr_32 = V_FMA_MIX_F32 12, %10, 12, %1, 12, %2, 0, 0, 0, implicit $mode, implicit $exec +... From 4782ac8dd3cfa96e14ad4eff1389bbcfda27240f Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Tue, 23 Jan 2024 16:23:48 +0000 Subject: [PATCH 641/843] [DebugInfo][RemoveDIs] Use splice in Outliner rather than moveBefore (#79124) This patch replaces a utility in the outliner that moves the contents of one basic block into another basic block, with a call to splice instead. I think it's NFC, however I'd like a second pair of eyes to look at it just in case. 
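For reference, the swap is from a per-instruction move loop to a single
list-level transfer (both shapes are taken verbatim from the diff below;
SourceBB and TargetBB as in moveBBContents):

  // Old: move each instruction individually to the end of the target
  // block, preserving attached debug info as we go.
  for (Instruction &I : llvm::make_early_inc_range(SourceBB))
    I.moveBeforePreserving(TargetBB, TargetBB.end());

  // New: transfer the source block's whole instruction list in one splice.
  TargetBB.splice(TargetBB.end(), &SourceBB);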
The reason for doing this is an edge case in the handling of DPValue
objects, the replacement for dbg.values. If there's a variable assignment
"dangling" at the end of a block (which happens when we delete the
terminator), inserting instructions at end() doesn't shift the DPValue up
into the block. We could probably fix this, but it's much easier to use
splice at the only call site that does this.

Patch adds --try-experimental-debuginfo-iterators to a test to exercise
this code path.
---
 llvm/lib/Transforms/IPO/IROutliner.cpp         | 3 +--
 llvm/test/Transforms/IROutliner/legal-debug.ll | 1 +
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/IPO/IROutliner.cpp b/llvm/lib/Transforms/IPO/IROutliner.cpp
index a6e19df7c5f1f..8e6d0e814372d 100644
--- a/llvm/lib/Transforms/IPO/IROutliner.cpp
+++ b/llvm/lib/Transforms/IPO/IROutliner.cpp
@@ -154,8 +154,7 @@ struct OutlinableGroup {
 /// \param SourceBB - the BasicBlock to pull Instructions from.
 /// \param TargetBB - the BasicBlock to put Instruction into.
 static void moveBBContents(BasicBlock &SourceBB, BasicBlock &TargetBB) {
-  for (Instruction &I : llvm::make_early_inc_range(SourceBB))
-    I.moveBeforePreserving(TargetBB, TargetBB.end());
+  TargetBB.splice(TargetBB.end(), &SourceBB);
 }

 /// A function to sort the keys of \p Map, which must be a mapping of constant
diff --git a/llvm/test/Transforms/IROutliner/legal-debug.ll b/llvm/test/Transforms/IROutliner/legal-debug.ll
index b7b472fa20b3e..be1182b38fa2d 100644
--- a/llvm/test/Transforms/IROutliner/legal-debug.ll
+++ b/llvm/test/Transforms/IROutliner/legal-debug.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
 ; RUN: opt -S -passes=verify,iroutliner -ir-outlining-no-cost < %s | FileCheck %s
+; RUN: opt -S -passes=verify,iroutliner -ir-outlining-no-cost < %s --try-experimental-debuginfo-iterators | FileCheck %s

 ; This test checks that debug info is recognized as able to be extracted along
 ; with the other instructions, but is not included in the consolidated function.

From e1aa5b1fd12d548863cf73c1ec09f98fe89d117b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Tue, 23 Jan 2024 16:22:51 +0000
Subject: [PATCH 642/843] [DAG] visitSCALAR_TO_VECTOR - don't fold
 scalar_to_vector(bin(extract(x),extract(y))) -> bin(x,y) if extracts have
 other uses

Fixes #78897 - although the test case still has a number of poor codegen
issues (in particular for i686 triples) that will need addressing
(combining the nodes in topological order should help).
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  3 +
 llvm/test/CodeGen/X86/pr78897.ll              | 82 +++++++------------
 2 files changed, 34 insertions(+), 51 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9365e65544577..98d8a6d9409f2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -26021,6 +26021,7 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
   // Try to convert a scalar binop with an extracted vector element to a vector
   // binop. This is intended to reduce potentially expensive register moves.
   // TODO: Check if both operands are extracted.
+  // TODO: How to prefer scalar/vector ops with multiple uses of the extract?
   // TODO: Generalize this, so it can be called from visitINSERT_VECTOR_ELT().
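  // Illustrative DAG fragment for the new one-use checks (an editorial
  // sketch, not part of the patch):
  //   t1: i32 = extract_vector_elt t0, 0
  //   t2: i32 = extract_vector_elt t0, 1
  //   t3: i32 = add t1, t2
  //   t4: v4i32 = scalar_to_vector t3
  // Rewriting t4 as a vector add only saves register moves when t1 and t2
  // die at t3; if either extract has another user, the scalar value must be
  // materialised anyway, so the combine below now skips that case.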
SDValue Scalar = N->getOperand(0); unsigned Opcode = Scalar.getOpcode(); @@ -26029,6 +26030,8 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) { TLI.isBinOp(Opcode) && Scalar.getValueType() == VecEltVT && Scalar.getOperand(0).getValueType() == VecEltVT && Scalar.getOperand(1).getValueType() == VecEltVT && + Scalar->isOnlyUserOf(Scalar.getOperand(0).getNode()) && + Scalar->isOnlyUserOf(Scalar.getOperand(1).getNode()) && DAG.isSafeToSpeculativelyExecute(Opcode) && hasOperation(Opcode, VT)) { // Match an extract element and get a shuffle mask equivalent. SmallVector ShufMask(VT.getVectorNumElements(), -1); diff --git a/llvm/test/CodeGen/X86/pr78897.ll b/llvm/test/CodeGen/X86/pr78897.ll index d2562e28584fb..0c1c3cafc4ea6 100644 --- a/llvm/test/CodeGen/X86/pr78897.ll +++ b/llvm/test/CodeGen/X86/pr78897.ll @@ -8,7 +8,8 @@ ; RUN: llc < %s -mtriple=i686-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X86-AVX512 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64-AVX512 -; FIXME: PR78897 - Don't vectorize a mul if we still need the extract +; PR78897 - Don't vectorize a mul of extracted values if we'd still need the extract. +; TODO: We should vectorize on 32-bit targets. define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X86-SSE2-LABEL: produceShuffleVectorForByte: ; X86-SSE2: # %bb.0: # %entry @@ -70,21 +71,13 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X64-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] ; X64-SSE2-NEXT: pand %xmm0, %xmm1 ; X64-SSE2-NEXT: movq %xmm1, %rax -; X64-SSE2-NEXT: movdqa %xmm1, %xmm2 -; X64-SSE2-NEXT: psrlq $32, %xmm2 -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440] -; X64-SSE2-NEXT: pmuludq %xmm3, %xmm2 -; X64-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [286331153,286331153] -; X64-SSE2-NEXT: pmuludq %xmm1, %xmm4 -; X64-SSE2-NEXT: paddq %xmm2, %xmm4 -; X64-SSE2-NEXT: psllq $32, %xmm4 -; X64-SSE2-NEXT: pmuludq %xmm3, %xmm1 -; X64-SSE2-NEXT: paddq %xmm4, %xmm1 -; X64-SSE2-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111 -; X64-SSE2-NEXT: xorq %rax, %rcx -; X64-SSE2-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110 +; X64-SSE2-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110 +; X64-SSE2-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111 +; X64-SSE2-NEXT: xorq %rax, %rdx ; X64-SSE2-NEXT: imulq %rcx, %rax -; X64-SSE2-NEXT: movq %rax, %xmm2 +; X64-SSE2-NEXT: movq %rax, %xmm1 +; X64-SSE2-NEXT: imulq %rcx, %rdx +; X64-SSE2-NEXT: movq %rdx, %xmm2 ; X64-SSE2-NEXT: pand %xmm0, %xmm1 ; X64-SSE2-NEXT: pandn %xmm2, %xmm0 ; X64-SSE2-NEXT: por %xmm1, %xmm0 @@ -147,24 +140,16 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X64-SSE42-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; X64-SSE42-NEXT: pxor %xmm0, %xmm0 ; X64-SSE42-NEXT: pcmpeqb %xmm1, %xmm0 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm2 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] -; X64-SSE42-NEXT: pand %xmm0, %xmm2 -; X64-SSE42-NEXT: movq %xmm2, %rax -; X64-SSE42-NEXT: movdqa %xmm2, %xmm1 -; X64-SSE42-NEXT: psrlq $32, %xmm1 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440] -; X64-SSE42-NEXT: pmuludq %xmm3, %xmm1 -; X64-SSE42-NEXT: movdqa {{.*#+}} xmm4 = [286331153,286331153] -; X64-SSE42-NEXT: pmuludq %xmm2, %xmm4 -; X64-SSE42-NEXT: paddq %xmm1, %xmm4 -; X64-SSE42-NEXT: psllq $32, %xmm4 -; X64-SSE42-NEXT: pmuludq %xmm3, %xmm2 -; X64-SSE42-NEXT: paddq 
%xmm4, %xmm2 -; X64-SSE42-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111 -; X64-SSE42-NEXT: xorq %rax, %rcx -; X64-SSE42-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110 +; X64-SSE42-NEXT: movdqa {{.*#+}} xmm1 = [17,17,17,17,17,17,17,17,u,u,u,u,u,u,u,u] +; X64-SSE42-NEXT: pand %xmm0, %xmm1 +; X64-SSE42-NEXT: movq %xmm1, %rax +; X64-SSE42-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110 +; X64-SSE42-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111 +; X64-SSE42-NEXT: xorq %rax, %rdx ; X64-SSE42-NEXT: imulq %rcx, %rax -; X64-SSE42-NEXT: movq %rax, %xmm1 +; X64-SSE42-NEXT: movq %rax, %xmm2 +; X64-SSE42-NEXT: imulq %rcx, %rdx +; X64-SSE42-NEXT: movq %rdx, %xmm1 ; X64-SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; X64-SSE42-NEXT: movdqa %xmm1, %xmm0 ; X64-SSE42-NEXT: psrlw $4, %xmm0 @@ -220,19 +205,13 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X64-AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; X64-AVX2-NEXT: vmovq %xmm1, %rax -; X64-AVX2-NEXT: vpsrlq $32, %xmm1, %xmm2 -; X64-AVX2-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1229782938247303440,1229782938247303440] -; X64-AVX2-NEXT: vpmuludq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm4 -; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpaddq %xmm2, %xmm4, %xmm2 -; X64-AVX2-NEXT: vpsllq $32, %xmm2, %xmm2 -; X64-AVX2-NEXT: vpmuludq %xmm3, %xmm1, %xmm1 -; X64-AVX2-NEXT: vpaddq %xmm2, %xmm1, %xmm1 -; X64-AVX2-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111 -; X64-AVX2-NEXT: xorq %rax, %rcx -; X64-AVX2-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110 +; X64-AVX2-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110 +; X64-AVX2-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111 +; X64-AVX2-NEXT: xorq %rax, %rdx ; X64-AVX2-NEXT: imulq %rcx, %rax -; X64-AVX2-NEXT: vmovq %rax, %xmm2 +; X64-AVX2-NEXT: vmovq %rax, %xmm1 +; X64-AVX2-NEXT: imulq %rcx, %rdx +; X64-AVX2-NEXT: vmovq %rdx, %xmm2 ; X64-AVX2-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; X64-AVX2-NEXT: vpsrlw $4, %xmm0, %xmm1 ; X64-AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] @@ -280,16 +259,17 @@ define <16 x i8> @produceShuffleVectorForByte(i8 zeroext %0) nounwind { ; X64-AVX512-NEXT: vpbroadcastb %edi, %xmm0 ; X64-AVX512-NEXT: vptestnmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k1 ; X64-AVX512-NEXT: vmovdqu8 {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 {%k1} {z} -; X64-AVX512-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm1 ; X64-AVX512-NEXT: vmovq %xmm0, %rax -; X64-AVX512-NEXT: movabsq $76861433640456465, %rcx # imm = 0x111111111111111 -; X64-AVX512-NEXT: xorq %rax, %rcx -; X64-AVX512-NEXT: movabsq $1229782938247303440, %rax # imm = 0x1111111111111110 +; X64-AVX512-NEXT: movabsq $1229782938247303440, %rcx # imm = 0x1111111111111110 +; X64-AVX512-NEXT: movabsq $76861433640456465, %rdx # imm = 0x111111111111111 +; X64-AVX512-NEXT: xorq %rax, %rdx ; X64-AVX512-NEXT: imulq %rcx, %rax ; X64-AVX512-NEXT: vmovq %rax, %xmm0 -; X64-AVX512-NEXT: vmovdqu8 %xmm1, %xmm0 {%k1} -; X64-AVX512-NEXT: vpsrlw $4, %xmm0, %xmm1 -; X64-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X64-AVX512-NEXT: imulq %rcx, %rdx +; X64-AVX512-NEXT: vmovq 
%rdx, %xmm1 +; X64-AVX512-NEXT: vmovdqu8 %xmm0, %xmm1 {%k1} +; X64-AVX512-NEXT: vpsrlw $4, %xmm1, %xmm0 +; X64-AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] ; X64-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 ; X64-AVX512-NEXT: retq entry: From 2856db0d3b691907e055265c10f3ccc9b04f594e Mon Sep 17 00:00:00 2001 From: Guillaume Chatelet Date: Tue, 23 Jan 2024 17:30:19 +0100 Subject: [PATCH 643/843] [libc][NFC] Remove `FPBits` cast operator (#79142) The semantics for casting can range from "bitcast" (same representation) to "different representation", to "type promotion". Here we remove the cast operator and force usage of `get_val` as the only function to get the floating point value, making the intent clearer and more consistent. --- libc/src/__support/FPUtil/BasicOperations.h | 2 +- .../FPUtil/DivisionAndRemainderOperations.h | 2 +- libc/src/__support/FPUtil/FPBits.h | 2 - libc/src/__support/FPUtil/Hypot.h | 10 ++--- .../__support/FPUtil/ManipulationFunctions.h | 14 +++---- .../FPUtil/NearestIntegerOperations.h | 14 +++---- libc/src/__support/FPUtil/NormalFloat.h | 14 +++---- libc/src/__support/FPUtil/generic/FMA.h | 8 ++-- libc/src/__support/str_to_float.h | 8 ++-- libc/src/math/generic/exp.cpp | 2 +- libc/src/math/generic/exp10.cpp | 2 +- libc/src/math/generic/exp2.cpp | 2 +- libc/src/math/generic/expf.cpp | 2 +- libc/src/math/generic/hypotf.cpp | 6 +-- libc/src/math/generic/log10f.cpp | 2 +- libc/src/math/generic/log1pf.cpp | 4 +- libc/src/math/generic/log2f.cpp | 2 +- libc/src/math/generic/logf.cpp | 4 +- libc/src/math/generic/range_reduction_fma.h | 4 +- .../stdio/printf_core/float_dec_converter.h | 6 +-- libc/test/UnitTest/FPMatcher.h | 40 +++++++++---------- libc/test/src/math/CeilTest.h | 2 +- libc/test/src/math/CopySignTest.h | 2 +- libc/test/src/math/FAbsTest.h | 2 +- libc/test/src/math/FDimTest.h | 12 +++--- libc/test/src/math/FMaxTest.h | 2 +- libc/test/src/math/FMinTest.h | 2 +- libc/test/src/math/FloorTest.h | 2 +- libc/test/src/math/FmaTest.h | 28 ++++++------- libc/test/src/math/FrexpTest.h | 2 +- libc/test/src/math/HypotTest.h | 22 +++++----- libc/test/src/math/ILogbTest.h | 14 +++---- libc/test/src/math/LdExpTest.h | 10 ++--- libc/test/src/math/LogbTest.h | 2 +- libc/test/src/math/ModfTest.h | 2 +- libc/test/src/math/NextAfterTest.h | 10 ++--- libc/test/src/math/RIntTest.h | 14 +++---- libc/test/src/math/RemQuoTest.h | 14 +++---- libc/test/src/math/RoundTest.h | 2 +- libc/test/src/math/RoundToIntegerTest.h | 18 ++++----- libc/test/src/math/SqrtTest.h | 2 +- libc/test/src/math/TruncTest.h | 2 +- libc/test/src/math/acosf_test.cpp | 4 +- libc/test/src/math/acoshf_test.cpp | 4 +- libc/test/src/math/asinf_test.cpp | 4 +- libc/test/src/math/asinhf_test.cpp | 4 +- libc/test/src/math/atanf_test.cpp | 4 +- libc/test/src/math/atanhf_test.cpp | 6 +-- libc/test/src/math/cos_test.cpp | 2 +- libc/test/src/math/cosf_test.cpp | 6 +-- libc/test/src/math/coshf_test.cpp | 12 +++--- libc/test/src/math/erff_test.cpp | 2 +- .../src/math/exhaustive/exhaustive_test.h | 7 ++-- libc/test/src/math/exhaustive/hypotf_test.cpp | 4 +- .../test/src/math/exhaustive/sincosf_test.cpp | 2 +- libc/test/src/math/exp10_test.cpp | 2 +- libc/test/src/math/exp10f_test.cpp | 17 ++++---- libc/test/src/math/exp2_test.cpp | 2 +- libc/test/src/math/exp2f_test.cpp | 18 ++++----- libc/test/src/math/exp_test.cpp | 2 +- libc/test/src/math/expf_test.cpp | 24 +++++------ 
libc/test/src/math/expm1_test.cpp | 2 +- libc/test/src/math/expm1f_test.cpp | 30 +++++++------- libc/test/src/math/log10_test.cpp | 2 +- libc/test/src/math/log10f_test.cpp | 4 +- libc/test/src/math/log1p_test.cpp | 2 +- libc/test/src/math/log1pf_test.cpp | 4 +- libc/test/src/math/log2_test.cpp | 2 +- libc/test/src/math/log2f_test.cpp | 4 +- libc/test/src/math/log_test.cpp | 2 +- libc/test/src/math/logf_test.cpp | 4 +- libc/test/src/math/sin_test.cpp | 2 +- libc/test/src/math/sincosf_test.cpp | 6 +-- libc/test/src/math/sinf_test.cpp | 10 ++--- libc/test/src/math/sinhf_test.cpp | 16 ++++---- libc/test/src/math/smoke/FDimTest.h | 12 +++--- libc/test/src/math/smoke/FMaxTest.h | 2 +- libc/test/src/math/smoke/FMinTest.h | 2 +- libc/test/src/math/smoke/FmaTest.h | 24 +++++------ libc/test/src/math/smoke/HypotTest.h | 20 +++++----- libc/test/src/math/smoke/ILogbTest.h | 14 +++---- libc/test/src/math/smoke/LdExpTest.h | 10 ++--- libc/test/src/math/smoke/LogbTest.h | 2 +- libc/test/src/math/smoke/ModfTest.h | 2 +- libc/test/src/math/smoke/NextAfterTest.h | 10 ++--- libc/test/src/math/smoke/NextTowardTest.h | 10 ++--- libc/test/src/math/smoke/RIntTest.h | 10 ++--- libc/test/src/math/smoke/RemQuoTest.h | 10 ++--- libc/test/src/math/smoke/RoundToIntegerTest.h | 12 +++--- libc/test/src/math/smoke/coshf_test.cpp | 6 +-- libc/test/src/math/smoke/exp10f_test.cpp | 6 +-- libc/test/src/math/smoke/exp2f_test.cpp | 6 +-- libc/test/src/math/smoke/expf_test.cpp | 6 +-- libc/test/src/math/smoke/expm1f_test.cpp | 6 +-- libc/test/src/math/smoke/sinhf_test.cpp | 10 ++--- libc/test/src/math/tan_test.cpp | 2 +- libc/test/src/math/tanf_test.cpp | 6 +-- libc/test/src/math/tanhf_test.cpp | 4 +- libc/test/src/stdlib/atof_test.cpp | 2 +- libc/test/src/stdlib/strtod_test.cpp | 5 +-- libc/test/src/stdlib/strtof_test.cpp | 2 +- libc/utils/MPFRWrapper/MPFRUtils.cpp | 2 +- 102 files changed, 366 insertions(+), 369 deletions(-) diff --git a/libc/src/__support/FPUtil/BasicOperations.h b/libc/src/__support/FPUtil/BasicOperations.h index ea78809dfc7f7..ccc61a89c5f83 100644 --- a/libc/src/__support/FPUtil/BasicOperations.h +++ b/libc/src/__support/FPUtil/BasicOperations.h @@ -21,7 +21,7 @@ template , int> = 0> LIBC_INLINE T abs(T x) { FPBits bits(x); bits.set_sign(Sign::POS); - return T(bits); + return bits.get_val(); } template , int> = 0> diff --git a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h index 2859a248b95e3..5f542f6503559 100644 --- a/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h +++ b/libc/src/__support/FPUtil/DivisionAndRemainderOperations.h @@ -86,7 +86,7 @@ LIBC_INLINE T remquo(T x, T y, int &q) { // then the conversion to native remainder value should be updated // appropriately and some directed tests added. T native_remainder(remainder); - T absy = T(ybits); + T absy = ybits.get_val(); int cmp = remainder.mul2(1).cmp(normaly); if (cmp > 0) { q = q + 1; diff --git a/libc/src/__support/FPUtil/FPBits.h b/libc/src/__support/FPUtil/FPBits.h index 0a79b505ecbe1..e43c6168f1f47 100644 --- a/libc/src/__support/FPUtil/FPBits.h +++ b/libc/src/__support/FPUtil/FPBits.h @@ -727,8 +727,6 @@ struct FPBits final : public internal::FPRep(), FPBits> { // Floating-point conversions. LIBC_INLINE constexpr T get_val() const { return cpp::bit_cast(bits); } - LIBC_INLINE constexpr explicit operator T() const { return get_val(); } - // TODO: Use an uint32_t for 'biased_exp'. 
LIBC_INLINE static constexpr FPBits create_value(Sign sign, StorageType biased_exp, StorageType mantissa) { diff --git a/libc/src/__support/FPUtil/Hypot.h b/libc/src/__support/FPUtil/Hypot.h index 82237dec09e42..2e69965734644 100644 --- a/libc/src/__support/FPUtil/Hypot.h +++ b/libc/src/__support/FPUtil/Hypot.h @@ -111,7 +111,7 @@ LIBC_INLINE T hypot(T x, T y) { FPBits_t x_bits(x), y_bits(y); if (x_bits.is_inf() || y_bits.is_inf()) { - return T(FPBits_t::inf()); + return FPBits_t::inf().get_val(); } if (x_bits.is_nan()) { return x; @@ -196,8 +196,8 @@ LIBC_INLINE T hypot(T x, T y) { if (out_exp >= FPBits_t::MAX_BIASED_EXPONENT) { if (int round_mode = quick_get_round(); round_mode == FE_TONEAREST || round_mode == FE_UPWARD) - return T(FPBits_t::inf()); - return T(FPBits_t::max_normal()); + return FPBits_t::inf().get_val(); + return FPBits_t::max_normal().get_val(); } } else { // For denormal result, we simply move the leading bit of the result to @@ -253,8 +253,8 @@ LIBC_INLINE T hypot(T x, T y) { ++out_exp; if (out_exp >= FPBits_t::MAX_BIASED_EXPONENT) { if (round_mode == FE_TONEAREST || round_mode == FE_UPWARD) - return T(FPBits_t::inf()); - return T(FPBits_t::max_normal()); + return FPBits_t::inf().get_val(); + return FPBits_t::max_normal().get_val(); } } diff --git a/libc/src/__support/FPUtil/ManipulationFunctions.h b/libc/src/__support/FPUtil/ManipulationFunctions.h index d7114625a9b31..56e53e3d4f497 100644 --- a/libc/src/__support/FPUtil/ManipulationFunctions.h +++ b/libc/src/__support/FPUtil/ManipulationFunctions.h @@ -49,13 +49,13 @@ LIBC_INLINE T modf(T x, T &iptr) { return x; } else if (bits.is_inf()) { iptr = x; - return T(FPBits::zero(bits.sign())); + return FPBits::zero(bits.sign()).get_val(); } else { iptr = trunc(x); if (x == iptr) { // If x is already an integer value, then return zero with the right // sign. - return T(FPBits::zero(bits.sign())); + return FPBits::zero(bits.sign()).get_val(); } else { return x - iptr; } @@ -66,7 +66,7 @@ template , int> = 0> LIBC_INLINE T copysign(T x, T y) { FPBits xbits(x); xbits.set_sign(FPBits(y).sign()); - return T(xbits); + return xbits.get_val(); } template , int> = 0> @@ -103,12 +103,12 @@ LIBC_INLINE T logb(T x) { if (bits.is_zero()) { // TODO(Floating point exception): Raise div-by-zero exception. // TODO(errno): POSIX requires setting errno to ERANGE. - return T(FPBits::inf(Sign::NEG)); + return FPBits::inf(Sign::NEG).get_val(); } else if (bits.is_nan()) { return x; } else if (bits.is_inf()) { // Return positive infinity. - return T(FPBits::inf()); + return FPBits::inf().get_val(); } NormalFloat normal(bits); @@ -131,11 +131,11 @@ LIBC_INLINE T ldexp(T x, int exp) { // calculating the limit. int exp_limit = FPBits::MAX_BIASED_EXPONENT + FPBits::FRACTION_LEN + 1; if (exp > exp_limit) - return T(FPBits::inf(bits.sign())); + return FPBits::inf(bits.sign()).get_val(); // Similarly on the negative side we return zero early if |exp| is too small. if (exp < -exp_limit) - return T(FPBits::zero(bits.sign())); + return FPBits::zero(bits.sign()).get_val(); // For all other values, NormalFloat to T conversion handles it the right way. 
NormalFloat normal(bits); diff --git a/libc/src/__support/FPUtil/NearestIntegerOperations.h b/libc/src/__support/FPUtil/NearestIntegerOperations.h index 62568977dc0c8..19ae75ea78891 100644 --- a/libc/src/__support/FPUtil/NearestIntegerOperations.h +++ b/libc/src/__support/FPUtil/NearestIntegerOperations.h @@ -41,11 +41,11 @@ LIBC_INLINE T trunc(T x) { // If the exponent is such that abs(x) is less than 1, then return 0. if (exponent <= -1) - return T(FPBits::zero(bits.sign())); + return FPBits::zero(bits.sign()).get_val(); int trim_size = FPBits::FRACTION_LEN - exponent; bits.set_mantissa((bits.get_mantissa() >> trim_size) << trim_size); - return T(bits); + return bits.get_val(); } template , int> = 0> @@ -73,7 +73,7 @@ LIBC_INLINE T ceil(T x) { uint32_t trim_size = FPBits::FRACTION_LEN - exponent; bits.set_mantissa((bits.get_mantissa() >> trim_size) << trim_size); - T trunc_value = T(bits); + T trunc_value = bits.get_val(); // If x is already an integer, return it. if (trunc_value == x) @@ -114,19 +114,19 @@ LIBC_INLINE T round(T x) { if (exponent == -1) { // Absolute value of x is greater than equal to 0.5 but less than 1. - return T(FPBits::one(bits.sign())); + return FPBits::one(bits.sign()).get_val(); } if (exponent <= -2) { // Absolute value of x is less than 0.5. - return T(FPBits::zero(bits.sign())); + return FPBits::zero(bits.sign()).get_val(); } uint32_t trim_size = FPBits::FRACTION_LEN - exponent; bool half_bit_set = bool(bits.get_mantissa() & (StorageType(1) << (trim_size - 1))); bits.set_mantissa((bits.get_mantissa() >> trim_size) << trim_size); - T trunc_value = T(bits); + T trunc_value = bits.get_val(); // If x is already an integer, return it. if (trunc_value == x) @@ -180,7 +180,7 @@ LIBC_INLINE T round_using_current_rounding_mode(T x) { uint32_t trim_size = FPBits::FRACTION_LEN - exponent; FPBits new_bits = bits; new_bits.set_mantissa((bits.get_mantissa() >> trim_size) << trim_size); - T trunc_value = T(new_bits); + T trunc_value = new_bits.get_val(); // If x is already an integer, return it. if (trunc_value == x) diff --git a/libc/src/__support/FPUtil/NormalFloat.h b/libc/src/__support/FPUtil/NormalFloat.h index fa4da33b5b17f..57a401d911fc6 100644 --- a/libc/src/__support/FPUtil/NormalFloat.h +++ b/libc/src/__support/FPUtil/NormalFloat.h @@ -96,7 +96,7 @@ template struct NormalFloat { // Max exponent is of the form 0xFF...E. That is why -2 and not -1. constexpr int MAX_EXPONENT_VALUE = (1 << FPBits::EXP_LEN) - 2; if (biased_exponent > MAX_EXPONENT_VALUE) { - return T(FPBits::inf(sign)); + return FPBits::inf(sign).get_val(); } FPBits result(T(0.0)); @@ -129,15 +129,15 @@ template struct NormalFloat { // the overflow into the exponent. 
if (new_mantissa == ONE) result.set_biased_exponent(1); - return T(result); + return result.get_val(); } else { - return T(result); + return result.get_val(); } } result.set_biased_exponent(exponent + FPBits::EXP_BIAS); result.set_mantissa(mantissa); - return T(result); + return result.get_val(); } private: @@ -250,16 +250,16 @@ template <> LIBC_INLINE NormalFloat::operator long double() const { } else { result.set_implicit_bit(0); } - return static_cast(result); + return result.get_val(); } else { - return static_cast(result); + return result.get_val(); } } result.set_biased_exponent(biased_exponent); result.set_mantissa(mantissa); result.set_implicit_bit(1); - return static_cast(result); + return result.get_val(); } #endif // LIBC_LONG_DOUBLE_IS_X86_FLOAT80 diff --git a/libc/src/__support/FPUtil/generic/FMA.h b/libc/src/__support/FPUtil/generic/FMA.h index 5c36463ea5021..f03af9246337f 100644 --- a/libc/src/__support/FPUtil/generic/FMA.h +++ b/libc/src/__support/FPUtil/generic/FMA.h @@ -58,8 +58,8 @@ template <> LIBC_INLINE float fma(float x, float y, float z) { // correct (when it matters). fputil::FPBits t( (bit_prod.get_biased_exponent() >= bitz.get_biased_exponent()) - ? ((double(bit_sum) - double(bit_prod)) - double(bitz)) - : ((double(bit_sum) - double(bitz)) - double(bit_prod))); + ? ((bit_sum.get_val() - bit_prod.get_val()) - bitz.get_val()) + : ((bit_sum.get_val() - bitz.get_val()) - bit_prod.get_val())); // Update sticky bits if t != 0.0 and the least (52 - 23 - 1 = 28) bits are // zero. @@ -72,7 +72,7 @@ template <> LIBC_INLINE float fma(float x, float y, float z) { } } - return static_cast(static_cast(bit_sum)); + return static_cast(bit_sum.get_val()); } namespace internal { @@ -257,7 +257,7 @@ template <> LIBC_INLINE double fma(double x, double y, double z) { (round_mode == FE_DOWNWARD && prod_sign.is_pos())) { return FPBits::max_normal(prod_sign).get_val(); } - return static_cast(FPBits::inf(prod_sign)); + return FPBits::inf(prod_sign).get_val(); } // Remove hidden bit and append the exponent field and sign bit. diff --git a/libc/src/__support/str_to_float.h b/libc/src/__support/str_to_float.h index 9655c993bee28..7ecb6c3e02fde 100644 --- a/libc/src/__support/str_to_float.h +++ b/libc/src/__support/str_to_float.h @@ -568,11 +568,11 @@ clinger_fast_path(ExpandedFloat init_num, ClingerConsts::POWERS_OF_TEN_ARRAY[exp10]); // If the results are equal, then we don't need to use the rounding mode. - if (T(result) != -T(negative_result)) { + if (result.get_val() != -negative_result.get_val()) { FPBits lower_result; FPBits higher_result; - if (T(result) < -T(negative_result)) { + if (result.get_val() < -negative_result.get_val()) { lower_result = result; higher_result = negative_result; } else { @@ -1194,7 +1194,7 @@ LIBC_INLINE StrToNumResult strtofloatingpoint(const char *__restrict src) { // special 80 bit long doubles. Otherwise it should be inlined out. 
set_implicit_bit(result); - return {T(result), index, error}; + return {result.get_val(), index, error}; } template LIBC_INLINE StrToNumResult strtonan(const char *arg) { @@ -1216,7 +1216,7 @@ template LIBC_INLINE StrToNumResult strtonan(const char *arg) { } result = FPBits::build_quiet_nan(fputil::Sign::POS, nan_mantissa); - return {T(result), 0, error}; + return {result.get_val(), 0, error}; } } // namespace internal diff --git a/libc/src/math/generic/exp.cpp b/libc/src/math/generic/exp.cpp index 49ea1699bb209..f23170f8ed425 100644 --- a/libc/src/math/generic/exp.cpp +++ b/libc/src/math/generic/exp.cpp @@ -222,7 +222,7 @@ double set_exceptional(double x) { fputil::raise_except_if_required(FE_OVERFLOW); } // x is +inf or nan - return x + static_cast(FPBits::inf()); + return x + FPBits::inf().get_val(); } } // namespace diff --git a/libc/src/math/generic/exp10.cpp b/libc/src/math/generic/exp10.cpp index f1da03cba0b30..6b40f5561845d 100644 --- a/libc/src/math/generic/exp10.cpp +++ b/libc/src/math/generic/exp10.cpp @@ -268,7 +268,7 @@ double set_exceptional(double x) { fputil::raise_except_if_required(FE_OVERFLOW); } // x is +inf or nan - return x + static_cast(FPBits::inf()); + return x + FPBits::inf().get_val(); } } // namespace diff --git a/libc/src/math/generic/exp2.cpp b/libc/src/math/generic/exp2.cpp index 508bff9bd9fc9..01e66d1ae00f7 100644 --- a/libc/src/math/generic/exp2.cpp +++ b/libc/src/math/generic/exp2.cpp @@ -243,7 +243,7 @@ double set_exceptional(double x) { fputil::raise_except_if_required(FE_OVERFLOW); } // x is +inf or nan - return x + static_cast(FPBits::inf()); + return x + FPBits::inf().get_val(); } } // namespace diff --git a/libc/src/math/generic/expf.cpp b/libc/src/math/generic/expf.cpp index f3ce8400d0a41..c7ab974850a82 100644 --- a/libc/src/math/generic/expf.cpp +++ b/libc/src/math/generic/expf.cpp @@ -67,7 +67,7 @@ LLVM_LIBC_FUNCTION(float, expf, (float x)) { fputil::raise_except_if_required(FE_OVERFLOW); } // x is +inf or nan - return x + static_cast(FPBits::inf()); + return x + FPBits::inf().get_val(); } } // For -104 < x < 89, to compute exp(x), we perform the following range diff --git a/libc/src/math/generic/hypotf.cpp b/libc/src/math/generic/hypotf.cpp index 93dd4feb36bee..4c94b3eb7b988 100644 --- a/libc/src/math/generic/hypotf.cpp +++ b/libc/src/math/generic/hypotf.cpp @@ -46,7 +46,7 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) { if (!DoubleBits(sum_sq).is_inf_or_nan()) { // Correct rounding. 
- double r_sq = static_cast(result) * static_cast(result); + double r_sq = result.get_val() * result.get_val(); double diff = sum_sq - r_sq; constexpr uint64_t mask = 0x0000'0000'3FFF'FFFFULL; uint64_t lrs = result.uintval() & mask; @@ -60,14 +60,14 @@ LLVM_LIBC_FUNCTION(float, hypotf, (float x, float y)) { FPBits bits_x(x), bits_y(y); if (bits_x.is_inf_or_nan() || bits_y.is_inf_or_nan()) { if (bits_x.is_inf() || bits_y.is_inf()) - return static_cast(FPBits::inf()); + return FPBits::inf().get_val(); if (bits_x.is_nan()) return x; return y; } } - return static_cast(static_cast(result)); + return static_cast(result.get_val()); } } // namespace LIBC_NAMESPACE diff --git a/libc/src/math/generic/log10f.cpp b/libc/src/math/generic/log10f.cpp index f87e34ec5fb38..ac2e3b9cf925a 100644 --- a/libc/src/math/generic/log10f.cpp +++ b/libc/src/math/generic/log10f.cpp @@ -191,7 +191,7 @@ LLVM_LIBC_FUNCTION(float, log10f, (float x)) { // Set bits to 1.m xbits.set_biased_exponent(0x7F); - float u = static_cast(xbits); + float u = xbits.get_val(); double v; #ifdef LIBC_TARGET_CPU_HAS_FMA v = static_cast(fputil::multiply_add(u, R[index], -1.0f)); // Exact. diff --git a/libc/src/math/generic/log1pf.cpp b/libc/src/math/generic/log1pf.cpp index bc472caf54f89..8de4a2067f9d3 100644 --- a/libc/src/math/generic/log1pf.cpp +++ b/libc/src/math/generic/log1pf.cpp @@ -66,7 +66,7 @@ LIBC_INLINE float log(double x) { // Clear the lowest 45 bits. f.bits &= ~0x0000'1FFF'FFFF'FFFFULL; - double d = static_cast(xbits) - static_cast(f); + double d = xbits.get_val() - f.get_val(); d *= ONE_OVER_F[f_index]; double extra_factor = fputil::multiply_add(m, LOG_2, LOG_F[f_index]); @@ -106,7 +106,7 @@ LLVM_LIBC_FUNCTION(float, log1pf, (float x)) { case 0xbf800000U: // x = -1.0 fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return static_cast(fputil::FPBits::inf(fputil::Sign::NEG)); + return FPBits::inf(fputil::Sign::NEG).get_val(); #ifndef LIBC_TARGET_CPU_HAS_FMA case 0x4cc1c80bU: // x = 0x1.839016p+26f return fputil::round_result_slightly_down(0x1.26fc04p+4f); diff --git a/libc/src/math/generic/log2f.cpp b/libc/src/math/generic/log2f.cpp index 7eaa5d53ccedd..e3417552ad35a 100644 --- a/libc/src/math/generic/log2f.cpp +++ b/libc/src/math/generic/log2f.cpp @@ -94,7 +94,7 @@ LLVM_LIBC_FUNCTION(float, log2f, (float x)) { // Set bits to 1.m xbits.set_biased_exponent(0x7F); - float u = static_cast(xbits); + float u = xbits.get_val(); double v; #ifdef LIBC_TARGET_CPU_HAS_FMA v = static_cast(fputil::multiply_add(u, R[index], -1.0f)); // Exact. diff --git a/libc/src/math/generic/logf.cpp b/libc/src/math/generic/logf.cpp index 88f7ea01b2f19..4ad5a4e9b6ad8 100644 --- a/libc/src/math/generic/logf.cpp +++ b/libc/src/math/generic/logf.cpp @@ -85,7 +85,7 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) { // Return -inf and raise FE_DIVBYZERO fputil::set_errno_if_required(ERANGE); fputil::raise_except_if_required(FE_DIVBYZERO); - return static_cast(FPBits::inf(fputil::Sign::NEG)); + return FPBits::inf(Sign::NEG).get_val(); } // Normalize denormal inputs. xbits = FPBits(xbits.get_val() * 0x1.0p23f); @@ -149,7 +149,7 @@ LLVM_LIBC_FUNCTION(float, logf, (float x)) { // Set bits to 1.m xbits.set_biased_exponent(0x7F); - float u = static_cast(xbits); + float u = xbits.get_val(); double v; #ifdef LIBC_TARGET_CPU_HAS_FMA v = static_cast(fputil::multiply_add(u, R[index], -1.0f)); // Exact. 
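// The call sites in the remaining files follow the same mechanical pattern;
// a minimal sketch of the rewrite (editorial illustration, FPBits API as in
// this patch):
//   FPBits<T> bits(v);              // construct from a bit pattern
//   T a = T(bits);                  // old: functional-style cast, removed
//   T b = static_cast<T>(bits);     // old: explicit cast, removed
//   T c = bits.get_val();           // new: the single named accessor
// keeping bit-level manipulation and value extraction visually distinct.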
diff --git a/libc/src/math/generic/range_reduction_fma.h b/libc/src/math/generic/range_reduction_fma.h index 13a7360b42332..d2ca5d348aadd 100644 --- a/libc/src/math/generic/range_reduction_fma.h +++ b/libc/src/math/generic/range_reduction_fma.h @@ -52,7 +52,7 @@ LIBC_INLINE int64_t large_range_reduction(double x, int x_exp, double &y) { // least 2^6. fputil::FPBits prod_hi(x * THIRTYTWO_OVER_PI[0]); prod_hi.bits &= (x_exp < 55) ? (~0xfffULL) : (~0ULL); // |x| < 2^55 - double k_hi = fputil::nearest_integer(static_cast(prod_hi)); + double k_hi = fputil::nearest_integer(prod_hi.get_val()); double truncated_prod = fputil::fma(x, THIRTYTWO_OVER_PI[0], -k_hi); double prod_lo = fputil::fma(x, THIRTYTWO_OVER_PI[1], truncated_prod); double k_lo = fputil::nearest_integer(prod_lo); @@ -71,7 +71,7 @@ LIBC_INLINE int64_t large_range_reduction(double x, int x_exp, double &y) { // least 64. fputil::FPBits prod_hi(x * THIRTYTWO_OVER_PI[1]); prod_hi.bits &= (x_exp < 110) ? (~0xfffULL) : (~0ULL); // |x| < 2^110 - double k_hi = fputil::nearest_integer(static_cast(prod_hi)); + double k_hi = fputil::nearest_integer(prod_hi.get_val()); double truncated_prod = fputil::fma(x, THIRTYTWO_OVER_PI[1], -k_hi); double prod_lo = fputil::fma(x, THIRTYTWO_OVER_PI[2], truncated_prod); double k_lo = fputil::nearest_integer(prod_lo); diff --git a/libc/src/stdio/printf_core/float_dec_converter.h b/libc/src/stdio/printf_core/float_dec_converter.h index 6171d1d5946e6..b54526d371086 100644 --- a/libc/src/stdio/printf_core/float_dec_converter.h +++ b/libc/src/stdio/printf_core/float_dec_converter.h @@ -498,7 +498,7 @@ LIBC_INLINE int convert_float_decimal_typed(Writer *writer, PaddingWriter padding_writer(to_conv, sign_char); FloatWriter float_writer(writer, has_decimal_point, padding_writer); - FloatToString float_converter(static_cast(float_bits)); + FloatToString float_converter(float_bits.get_val()); const size_t positive_blocks = float_converter.get_positive_blocks(); @@ -608,7 +608,7 @@ LIBC_INLINE int convert_float_dec_exp_typed(Writer *writer, PaddingWriter padding_writer(to_conv, sign_char); FloatWriter float_writer(writer, has_decimal_point, padding_writer); - FloatToString float_converter(static_cast(float_bits)); + FloatToString float_converter(float_bits.get_val()); size_t digits_written = 0; int final_exponent = 0; @@ -767,7 +767,7 @@ LIBC_INLINE int convert_float_dec_auto_typed(Writer *writer, // it has style E, so here we calculate the precision we'll use in that case. 
const unsigned int exp_precision = init_precision - 1; - FloatToString float_converter(static_cast(float_bits)); + FloatToString float_converter(float_bits.get_val()); // Here we would subtract 1 to account for the fact that block 0 counts as a // positive block, but the loop below accounts for this by starting with diff --git a/libc/test/UnitTest/FPMatcher.h b/libc/test/UnitTest/FPMatcher.h index ad2fc0439fd79..69da5387d382b 100644 --- a/libc/test/UnitTest/FPMatcher.h +++ b/libc/test/UnitTest/FPMatcher.h @@ -66,16 +66,16 @@ template struct FPTest : public Test { using Sign = LIBC_NAMESPACE::fputil::Sign; static constexpr StorageType STORAGE_MAX = LIBC_NAMESPACE::cpp::numeric_limits::max(); - static constexpr T zero = T(FPBits::zero(Sign::POS)); - static constexpr T neg_zero = T(FPBits::zero(Sign::NEG)); - static constexpr T aNaN = T(FPBits::build_quiet_nan()); - static constexpr T sNaN = T(FPBits::build_nan(Sign::POS, 1)); - static constexpr T inf = T(FPBits::inf(Sign::POS)); - static constexpr T neg_inf = T(FPBits::inf(Sign::NEG)); - static constexpr T min_normal = T(FPBits::min_normal()); - static constexpr T max_normal = T(FPBits::max_normal()); - static constexpr T min_denormal = T(FPBits::min_subnormal()); - static constexpr T max_denormal = T(FPBits::max_subnormal()); + static constexpr T zero = FPBits::zero(Sign::POS).get_val(); + static constexpr T neg_zero = FPBits::zero(Sign::NEG).get_val(); + static constexpr T aNaN = FPBits::build_quiet_nan().get_val(); + static constexpr T sNaN = FPBits::build_nan(Sign::POS, 1).get_val(); + static constexpr T inf = FPBits::inf(Sign::POS).get_val(); + static constexpr T neg_inf = FPBits::inf(Sign::NEG).get_val(); + static constexpr T min_normal = FPBits::min_normal().get_val(); + static constexpr T max_normal = FPBits::max_normal().get_val(); + static constexpr T min_denormal = FPBits::min_subnormal().get_val(); + static constexpr T max_denormal = FPBits::max_subnormal().get_val(); static constexpr int N_ROUNDING_MODES = 4; static constexpr fputil::testing::RoundingMode ROUNDING_MODES[4] = { @@ -95,16 +95,16 @@ template struct FPTest : public Test { using Sign = LIBC_NAMESPACE::fputil::Sign; \ static constexpr StorageType STORAGE_MAX = \ LIBC_NAMESPACE::cpp::numeric_limits::max(); \ - const T zero = T(FPBits::zero(Sign::POS)); \ - const T neg_zero = T(FPBits::zero(Sign::NEG)); \ - const T aNaN = T(FPBits::build_quiet_nan()); \ - const T sNaN = T(FPBits::build_nan(Sign::POS, 1)); \ - const T inf = T(FPBits::inf(Sign::POS)); \ - const T neg_inf = T(FPBits::inf(Sign::NEG)); \ - const T min_normal = T(FPBits::min_normal()); \ - const T max_normal = T(FPBits::max_normal()); \ - const T min_denormal = T(FPBits::min_subnormal()); \ - const T max_denormal = T(FPBits::max_subnormal()); + const T zero = FPBits::zero(Sign::POS).get_val(); \ + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); \ + const T aNaN = FPBits::build_quiet_nan().get_val(); \ + const T sNaN = FPBits::build_nan(Sign::POS, 1).get_val(); \ + const T inf = FPBits::inf(Sign::POS).get_val(); \ + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); \ + const T min_normal = FPBits::min_normal().get_val(); \ + const T max_normal = FPBits::max_normal().get_val(); \ + const T min_denormal = FPBits::min_subnormal().get_val(); \ + const T max_denormal = FPBits::max_subnormal().get_val(); #define EXPECT_FP_EQ(expected, actual) \ EXPECT_THAT(actual, LIBC_NAMESPACE::testing::getMatcher< \ diff --git a/libc/test/src/math/CeilTest.h b/libc/test/src/math/CeilTest.h index 
67b33cb14110b..5ea4f349d008b 100644 --- a/libc/test/src/math/CeilTest.h +++ b/libc/test/src/math/CeilTest.h @@ -67,7 +67,7 @@ template class CeilTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; diff --git a/libc/test/src/math/CopySignTest.h b/libc/test/src/math/CopySignTest.h index c916416a2e5b9..8b81e8d7de252 100644 --- a/libc/test/src/math/CopySignTest.h +++ b/libc/test/src/math/CopySignTest.h @@ -37,7 +37,7 @@ class CopySignTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; diff --git a/libc/test/src/math/FAbsTest.h b/libc/test/src/math/FAbsTest.h index b2b146167e625..bf3052afc816f 100644 --- a/libc/test/src/math/FAbsTest.h +++ b/libc/test/src/math/FAbsTest.h @@ -35,7 +35,7 @@ template class FAbsTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH(mpfr::Operation::Abs, x, func(x), 0.0); diff --git a/libc/test/src/math/FDimTest.h b/libc/test/src/math/FDimTest.h index 46df2b40e64bd..31df2f5607c26 100644 --- a/libc/test/src/math/FDimTest.h +++ b/libc/test/src/math/FDimTest.h @@ -20,11 +20,11 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); void test_na_n_arg(FuncPtr func) { EXPECT_FP_EQ(nan, func(nan, inf)); @@ -66,7 +66,7 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test { constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; ++i, v += STEP, w -= STEP) { - T x = T(FPBits(v)), y = T(FPBits(w)); + T x = FPBits(v).get_val(), y = FPBits(w).get_val(); if (isnan(x) || isinf(x)) continue; if (isnan(y) || isinf(y)) diff --git a/libc/test/src/math/FMaxTest.h b/libc/test/src/math/FMaxTest.h index 1a1d2a77268d1..edc46ae5bb0fe 100644 --- a/libc/test/src/math/FMaxTest.h +++ b/libc/test/src/math/FMaxTest.h @@ -59,7 +59,7 @@ template class FMaxTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; ++i, v += STEP, w -= STEP) { - T x = T(FPBits(v)), y = T(FPBits(w)); + T x = FPBits(v).get_val(), y = FPBits(w).get_val(); if (isnan(x) || isinf(x)) continue; if (isnan(y) || isinf(y)) diff --git a/libc/test/src/math/FMinTest.h b/libc/test/src/math/FMinTest.h index 742bb5cdd1021..5ff583604ebc5 100644 --- a/libc/test/src/math/FMinTest.h +++ 
b/libc/test/src/math/FMinTest.h @@ -59,7 +59,7 @@ template class FMinTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; ++i, v += STEP, w -= STEP) { - T x = T(FPBits(v)), y = T(FPBits(w)); + T x = FPBits(v).get_val(), y = FPBits(w).get_val(); if (isnan(x) || isinf(x)) continue; if (isnan(y) || isinf(y)) diff --git a/libc/test/src/math/FloorTest.h b/libc/test/src/math/FloorTest.h index c1b05c5d13083..5e459ebd49289 100644 --- a/libc/test/src/math/FloorTest.h +++ b/libc/test/src/math/FloorTest.h @@ -67,7 +67,7 @@ template class FloorTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; diff --git a/libc/test/src/math/FmaTest.h b/libc/test/src/math/FmaTest.h index 8cc340e9260ff..dd6832e129144 100644 --- a/libc/test/src/math/FmaTest.h +++ b/libc/test/src/math/FmaTest.h @@ -25,14 +25,14 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T min_subnormal = T(FPBits::min_subnormal(Sign::POS)); - const T min_normal = T(FPBits::min_normal(Sign::POS)); - const T max_normal = T(FPBits::max_normal(Sign::POS)); - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T min_subnormal = FPBits::min_subnormal(Sign::POS).get_val(); + const T min_normal = FPBits::min_normal(Sign::POS).get_val(); + const T max_normal = FPBits::max_normal(Sign::POS).get_val(); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); @@ -63,9 +63,9 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { // Test underflow rounding up. EXPECT_FP_EQ(func(T(0.5), min_subnormal, min_subnormal), - T(FPBits(StorageType(2)))); + FPBits(StorageType(2)).get_val()); // Test underflow rounding down. - T v = T(FPBits(MIN_NORMAL + StorageType(1))); + T v = FPBits(MIN_NORMAL + StorageType(1)).get_val(); EXPECT_FP_EQ(func(T(1) / T(MIN_NORMAL << 1), v, min_normal), v); // Test overflow. 
T z = max_normal; @@ -80,8 +80,8 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; for (StorageType v = MIN_SUBNORMAL, w = MAX_SUBNORMAL; v <= MAX_SUBNORMAL && w >= MIN_SUBNORMAL; v += STEP, w -= STEP) { - T x = T(FPBits(get_random_bit_pattern())), y = T(FPBits(v)), - z = T(FPBits(w)); + T x = FPBits(get_random_bit_pattern()).get_val(), y = FPBits(v).get_val(), + z = FPBits(w).get_val(); mpfr::TernaryInput input{x, y, z}; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fma, input, func(x, y, z), 0.5); @@ -93,8 +93,8 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; for (StorageType v = MIN_NORMAL, w = MAX_NORMAL; v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) { - T x = T(FPBits(v)), y = T(FPBits(w)), - z = T(FPBits(get_random_bit_pattern())); + T x = FPBits(v).get_val(), y = FPBits(w).get_val(), + z = FPBits(get_random_bit_pattern()).get_val(); mpfr::TernaryInput input{x, y, z}; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Fma, input, func(x, y, z), 0.5); diff --git a/libc/test/src/math/FrexpTest.h b/libc/test/src/math/FrexpTest.h index 20ddce807da45..f3a64ce4aac31 100644 --- a/libc/test/src/math/FrexpTest.h +++ b/libc/test/src/math/FrexpTest.h @@ -96,7 +96,7 @@ template class FrexpTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = static_cast(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x) || x == 0.0l) continue; diff --git a/libc/test/src/math/HypotTest.h b/libc/test/src/math/HypotTest.h index 2990072fd5c66..9f2d148be6b91 100644 --- a/libc/test/src/math/HypotTest.h +++ b/libc/test/src/math/HypotTest.h @@ -25,15 +25,15 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using Sign = LIBC_NAMESPACE::fputil::Sign; using StorageType = typename FPBits::StorageType; - const T nan = T(FPBits::build_quiet_nan()); - const T inf = T(FPBits::inf()); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero()); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T max_normal = T(FPBits::max_normal()); - const T min_normal = T(FPBits::min_normal()); - const T max_subnormal = T(FPBits::max_subnormal()); - const T min_subnormal = T(FPBits::min_subnormal()); + const T nan = FPBits::build_quiet_nan().get_val(); + const T inf = FPBits::inf().get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero().get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T max_normal = FPBits::max_normal().get_val(); + const T min_normal = FPBits::min_normal().get_val(); + const T max_subnormal = FPBits::max_subnormal().get_val(); + const T min_subnormal = FPBits::min_subnormal().get_val(); static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); @@ -74,7 +74,7 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test { for (int signs = 0; signs < 4; ++signs) { for (StorageType v = MIN_SUBNORMAL, w = max_value; v <= max_value && w >= MIN_SUBNORMAL; v += step, w -= step) { - T x = T(FPBits(v)), y = T(FPBits(w)); + T x = FPBits(v).get_val(), y = FPBits(w).get_val(); if (signs % 2 == 1) { x = -x; } @@ -96,7 +96,7 @@ class HypotTestTemplate 
: public LIBC_NAMESPACE::testing::Test { for (int signs = 0; signs < 4; ++signs) { for (StorageType v = MIN_NORMAL, w = MAX_NORMAL; v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) { - T x = T(FPBits(v)), y = T(FPBits(w)); + T x = FPBits(v).get_val(), y = FPBits(w).get_val(); if (signs % 2 == 1) { x = -x; } diff --git a/libc/test/src/math/ILogbTest.h b/libc/test/src/math/ILogbTest.h index fe28fab9d777c..fb8d77093eba3 100644 --- a/libc/test/src/math/ILogbTest.h +++ b/libc/test/src/math/ILogbTest.h @@ -26,11 +26,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { void test_special_numbers(typename ILogbFunc::Func func) { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using Sign = LIBC_NAMESPACE::fputil::Sign; - EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::POS)))); - EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::NEG)))); - EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan()))); - EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::POS)))); - EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::NEG)))); + EXPECT_EQ(FP_ILOGB0, func(FPBits::zero(Sign::POS).get_val())); + EXPECT_EQ(FP_ILOGB0, func(FPBits::zero(Sign::NEG).get_val())); + EXPECT_EQ(FP_ILOGBNAN, func(FPBits::build_quiet_nan().get_val())); + EXPECT_EQ(INT_MAX, func(FPBits::inf(Sign::POS).get_val())); + EXPECT_EQ(INT_MAX, func(FPBits::inf(Sign::NEG).get_val())); } template @@ -81,7 +81,7 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 10'001; constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x) || x == 0.0) continue; @@ -100,7 +100,7 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 10'001; constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x) || x == 0.0) continue; diff --git a/libc/test/src/math/LdExpTest.h b/libc/test/src/math/LdExpTest.h index 08a14a90c1226..1d581f2c0a02f 100644 --- a/libc/test/src/math/LdExpTest.h +++ b/libc/test/src/math/LdExpTest.h @@ -25,11 +25,11 @@ class LdExpTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); // A normalized mantissa to be used with tests. 
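// A hedged reading of the constant below, assuming llvm-libc's
// NormalFloat representation stores the implicit leading bit explicitly
// (so ONE is StorageType(1) << FRACTION_LEN): MANTISSA sets a few low
// bits on top of that leading one, giving a value whose exact bit
// pattern the ldexp tests can predict, because scaling by a power of
// two only moves the exponent:
//
//   ldexp(x, e) == x * 2^e   // exact whenever the result is in range;
//                            // mantissa bits pass through unchanged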
static constexpr StorageType MANTISSA = NormalFloat::ONE + 0x1234; diff --git a/libc/test/src/math/LogbTest.h b/libc/test/src/math/LogbTest.h index 196da5e96b076..d64c5c44e4281 100644 --- a/libc/test/src/math/LogbTest.h +++ b/libc/test/src/math/LogbTest.h @@ -75,7 +75,7 @@ template class LogbTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = static_cast(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x) || x == 0.0l) continue; diff --git a/libc/test/src/math/ModfTest.h b/libc/test/src/math/ModfTest.h index f5196a74fa742..a7e5e8810611b 100644 --- a/libc/test/src/math/ModfTest.h +++ b/libc/test/src/math/ModfTest.h @@ -87,7 +87,7 @@ template class ModfTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x) || x == T(0.0)) continue; diff --git a/libc/test/src/math/NextAfterTest.h b/libc/test/src/math/NextAfterTest.h index f9ae4a4ec5c02..2bbc3891205a5 100644 --- a/libc/test/src/math/NextAfterTest.h +++ b/libc/test/src/math/NextAfterTest.h @@ -23,11 +23,11 @@ class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); const StorageType min_subnormal = FPBits::min_subnormal().uintval(); const StorageType max_subnormal = FPBits::max_subnormal().uintval(); diff --git a/libc/test/src/math/RIntTest.h b/libc/test/src/math/RIntTest.h index 3af7b6fe6f6d4..c6693981df5eb 100644 --- a/libc/test/src/math/RIntTest.h +++ b/libc/test/src/math/RIntTest.h @@ -34,11 +34,11 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); static constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval(); @@ -104,7 +104,7 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'001; constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) { - T x = T(FPBits(i)); + T x = FPBits(i).get_val(); for (int mode : ROUNDING_MODES) { LIBC_NAMESPACE::fputil::set_round(mode); mpfr::RoundingMode mpfr_mode = to_mpfr_rounding_mode(mode); @@ -117,7 +117,7 
@@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'001; constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; for (StorageType i = MIN_NORMAL; i <= MAX_NORMAL; i += STEP) { - T x = T(FPBits(i)); + T x = FPBits(i).get_val(); // In normal range on x86 platforms, the long double implicit 1 bit can be // zero making the numbers NaN. We will skip them. if (isnan(x)) { diff --git a/libc/test/src/math/RemQuoTest.h b/libc/test/src/math/RemQuoTest.h index bbc266764c1c2..16a439a5fea6d 100644 --- a/libc/test/src/math/RemQuoTest.h +++ b/libc/test/src/math/RemQuoTest.h @@ -24,11 +24,11 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); static constexpr StorageType MIN_SUBNORMAL = FPBits::min_subnormal().uintval(); @@ -107,7 +107,7 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test { constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; for (StorageType v = MIN_SUBNORMAL, w = MAX_SUBNORMAL; v <= MAX_SUBNORMAL && w >= MIN_SUBNORMAL; v += STEP, w -= STEP) { - T x = T(FPBits(v)), y = T(FPBits(w)); + T x = FPBits(v).get_val(), y = FPBits(w).get_val(); mpfr::BinaryOutput result; mpfr::BinaryInput input{x, y}; result.f = func(x, y, &result.i); @@ -120,7 +120,7 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test { constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; for (StorageType v = MIN_NORMAL, w = MAX_NORMAL; v <= MAX_NORMAL && w >= MIN_NORMAL; v += STEP, w -= STEP) { - T x = T(FPBits(v)), y = T(FPBits(w)); + T x = FPBits(v).get_val(), y = FPBits(w).get_val(); mpfr::BinaryOutput result; mpfr::BinaryInput input{x, y}; result.f = func(x, y, &result.i); diff --git a/libc/test/src/math/RoundTest.h b/libc/test/src/math/RoundTest.h index 3ec46159c65d9..4860464be9089 100644 --- a/libc/test/src/math/RoundTest.h +++ b/libc/test/src/math/RoundTest.h @@ -67,7 +67,7 @@ template class RoundTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; diff --git a/libc/test/src/math/RoundToIntegerTest.h b/libc/test/src/math/RoundToIntegerTest.h index f75aaa348bd00..a2a88c75a733f 100644 --- a/libc/test/src/math/RoundToIntegerTest.h +++ b/libc/test/src/math/RoundToIntegerTest.h @@ -33,11 +33,11 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const F zero = F(FPBits::zero()); - const F neg_zero = F(FPBits::zero(Sign::NEG)); - const F inf = F(FPBits::inf()); - const F neg_inf = F(FPBits::inf(Sign::NEG)); - const F nan = F(FPBits::build_quiet_nan()); + const F zero = FPBits::zero().get_val(); + const F neg_zero = FPBits::zero(Sign::NEG).get_val(); + const F inf = 
FPBits::inf().get_val(); + const F neg_inf = FPBits::inf(Sign::NEG).get_val(); + const F nan = FPBits::build_quiet_nan().get_val(); static constexpr StorageType MAX_NORMAL = FPBits::max_normal().uintval(); static constexpr StorageType MIN_NORMAL = FPBits::min_normal().uintval(); @@ -139,7 +139,7 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { bits.set_sign(Sign::NEG); bits.set_mantissa(0); - F x = F(bits); + F x = bits.get_val(); long mpfr_result; bool erangeflag = mpfr::round_to_long(x, mpfr_result); ASSERT_FALSE(erangeflag); @@ -203,7 +203,7 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { bits.set_sign(Sign::NEG); bits.set_mantissa(FPBits::FRACTION_MASK); - F x = F(bits); + F x = bits.get_val(); if (TestModes) { for (int m : ROUNDING_MODES) { LIBC_NAMESPACE::fputil::set_round(m); @@ -225,7 +225,7 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 1'000'001; constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) { - F x = F(FPBits(i)); + F x = FPBits(i).get_val(); if (x == F(0.0)) continue; // All subnormal numbers should round to zero. @@ -267,7 +267,7 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 1'000'001; constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; for (StorageType i = MIN_NORMAL; i <= MAX_NORMAL; i += STEP) { - F x = F(FPBits(i)); + F x = FPBits(i).get_val(); // In normal range on x86 platforms, the long double implicit 1 bit can be // zero making the numbers NaN. We will skip them. if (isnan(x)) { diff --git a/libc/test/src/math/SqrtTest.h b/libc/test/src/math/SqrtTest.h index d58c3ccfdd5a2..75eb411810f5e 100644 --- a/libc/test/src/math/SqrtTest.h +++ b/libc/test/src/math/SqrtTest.h @@ -41,7 +41,7 @@ template class SqrtTest : public LIBC_NAMESPACE::testing::Test { for (StorageType mant = 1; mant < HIDDEN_BIT; mant <<= 1) { FPBits denormal(T(0.0)); denormal.set_mantissa(mant); - T x = T(denormal); + T x = denormal.get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sqrt, x, func(x), 0.5); } diff --git a/libc/test/src/math/TruncTest.h b/libc/test/src/math/TruncTest.h index 2be40790258ef..0d99363526e8a 100644 --- a/libc/test/src/math/TruncTest.h +++ b/libc/test/src/math/TruncTest.h @@ -67,7 +67,7 @@ template class TruncTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; diff --git a/libc/test/src/math/acosf_test.cpp b/libc/test/src/math/acosf_test.cpp index 409cf2bc89133..81f697c315a28 100644 --- a/libc/test/src/math/acosf_test.cpp +++ b/libc/test/src/math/acosf_test.cpp @@ -47,7 +47,7 @@ TEST_F(LlvmLibcAcosfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acos, x, @@ -74,7 +74,7 @@ TEST_F(LlvmLibcAcosfTest, SpecificBitPatterns) { }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acos, x, LIBC_NAMESPACE::acosf(x), 
0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acos, -x, diff --git a/libc/test/src/math/acoshf_test.cpp b/libc/test/src/math/acoshf_test.cpp index fe8d76918d486..6d43105c83c28 100644 --- a/libc/test/src/math/acoshf_test.cpp +++ b/libc/test/src/math/acoshf_test.cpp @@ -44,7 +44,7 @@ TEST_F(LlvmLibcAcoshfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acosh, x, @@ -70,7 +70,7 @@ TEST_F(LlvmLibcAcoshfTest, SpecificBitPatterns) { }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Acosh, x, LIBC_NAMESPACE::acoshf(x), 0.5); } diff --git a/libc/test/src/math/asinf_test.cpp b/libc/test/src/math/asinf_test.cpp index db9dd0d78404a..77ac2bc216f77 100644 --- a/libc/test/src/math/asinf_test.cpp +++ b/libc/test/src/math/asinf_test.cpp @@ -45,7 +45,7 @@ TEST_F(LlvmLibcAsinfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asin, x, @@ -70,7 +70,7 @@ TEST_F(LlvmLibcAsinfTest, SpecificBitPatterns) { }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asin, x, LIBC_NAMESPACE::asinf(x), 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asin, -x, diff --git a/libc/test/src/math/asinhf_test.cpp b/libc/test/src/math/asinhf_test.cpp index 2afb5b3a9ff8d..9b925bf254a93 100644 --- a/libc/test/src/math/asinhf_test.cpp +++ b/libc/test/src/math/asinhf_test.cpp @@ -44,7 +44,7 @@ TEST_F(LlvmLibcAsinhfTest, InFloatRange) { constexpr uint32_t COUNT = 1'001; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asinh, x, @@ -71,7 +71,7 @@ TEST_F(LlvmLibcAsinhfTest, SpecificBitPatterns) { }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asinh, x, LIBC_NAMESPACE::asinhf(x), 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Asinh, -x, diff --git a/libc/test/src/math/atanf_test.cpp b/libc/test/src/math/atanf_test.cpp index 61d202e22bdc6..b5d30fbd5679e 100644 --- a/libc/test/src/math/atanf_test.cpp +++ b/libc/test/src/math/atanf_test.cpp @@ -43,7 +43,7 @@ TEST_F(LlvmLibcAtanfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; const uint32_t STEP = FPBits(inf).uintval() / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan, x, LIBC_NAMESPACE::atanf(x), 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan, -x, @@ -56,7 +56,7 @@ TEST_F(LlvmLibcAtanfTest, SpecialValues) { uint32_t val_arr[] = {0x3d8d6b23U, 0x3feefcfbU, 0xbd8d6b23U, 0xbfeefcfbU, 0x7F800000U, 0xFF800000U}; for (uint32_t v : val_arr) { - float x = float(FPBits(v)); + float x = 
FPBits(v).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Atan, x, LIBC_NAMESPACE::atanf(x), 0.5); } diff --git a/libc/test/src/math/atanhf_test.cpp b/libc/test/src/math/atanhf_test.cpp index fcc9f44dbea93..0080a328fbea6 100644 --- a/libc/test/src/math/atanhf_test.cpp +++ b/libc/test/src/math/atanhf_test.cpp @@ -88,7 +88,7 @@ TEST_F(LlvmLibcAtanhfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; const uint32_t STEP = FPBits(1.0f).uintval() / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); ASSERT_MPFR_MATCH(mpfr::Operation::Atanh, x, LIBC_NAMESPACE::atanhf(x), 0.5); ASSERT_MPFR_MATCH(mpfr::Operation::Atanh, -x, LIBC_NAMESPACE::atanhf(-x), @@ -98,12 +98,12 @@ TEST_F(LlvmLibcAtanhfTest, InFloatRange) { // For small values, atanh(x) is x. TEST_F(LlvmLibcAtanhfTest, SmallValues) { - float x = float(FPBits(uint32_t(0x17800000))); + float x = FPBits(uint32_t(0x17800000)).get_val(); float result = LIBC_NAMESPACE::atanhf(x); EXPECT_MPFR_MATCH(mpfr::Operation::Atanh, x, result, 0.5); EXPECT_FP_EQ(x, result); - x = float(FPBits(uint32_t(0x00400000))); + x = FPBits(uint32_t(0x00400000)).get_val(); result = LIBC_NAMESPACE::atanhf(x); EXPECT_MPFR_MATCH(mpfr::Operation::Atanh, x, result, 0.5); EXPECT_FP_EQ(x, result); diff --git a/libc/test/src/math/cos_test.cpp b/libc/test/src/math/cos_test.cpp index a4c332bc7fb52..1f55e9e9a1495 100644 --- a/libc/test/src/math/cos_test.cpp +++ b/libc/test/src/math/cos_test.cpp @@ -22,7 +22,7 @@ TEST_F(LlvmLibcCosTest, Range) { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - double x = double(FPBits(v)); + double x = FPBits(v).get_val(); // TODO: Expand the range of testing after range reduction is implemented. if (isnan(x) || isinf(x) || x > _2pi || x < -_2pi) continue; diff --git a/libc/test/src/math/cosf_test.cpp b/libc/test/src/math/cosf_test.cpp index 5a16520439af0..9a988d76c598f 100644 --- a/libc/test/src/math/cosf_test.cpp +++ b/libc/test/src/math/cosf_test.cpp @@ -46,7 +46,7 @@ TEST_F(LlvmLibcCosfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x, @@ -102,7 +102,7 @@ TEST_F(LlvmLibcCosfTest, SpecificBitPatterns) { }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, x, LIBC_NAMESPACE::cosf(x), 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cos, -x, @@ -114,7 +114,7 @@ TEST_F(LlvmLibcCosfTest, SpecificBitPatterns) { // returns values furthest beyond its nominal upper bound of pi/4. 
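// A brief aside on the tolerance used throughout these checks, stated
// as an assumption about the test harness: the MPFR matchers evaluate
// the operation in extended precision and accept a libc result within
// the given ulp distance of it, so a bound of 0.5 ulp is the
// correctly-rounded requirement under round-to-nearest:
//
//   |cosf(x) - cos_mpfr(x)| <= 0.5 * ulp(cos_mpfr(x))
//
// Each hard input below is a raw uint32_t bit pattern, materialized
// with the FPBits(v).get_val() accessor this patch standardizes on.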
TEST_F(LlvmLibcCosfTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); ASSERT_MPFR_MATCH(mpfr::Operation::Cos, x, LIBC_NAMESPACE::cosf(x), 0.5); } } diff --git a/libc/test/src/math/coshf_test.cpp b/libc/test/src/math/coshf_test.cpp index 797cfec566ac8..843ecb8925ad3 100644 --- a/libc/test/src/math/coshf_test.cpp +++ b/libc/test/src/math/coshf_test.cpp @@ -44,15 +44,15 @@ TEST_F(LlvmLibcCoshfTest, SpecialNumbers) { TEST_F(LlvmLibcCoshfTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::coshf(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::coshf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::coshf(float(FPBits(0x42cffff8U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::coshf(FPBits(0x42cffff8U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::coshf(float(FPBits(0x42d00008U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::coshf(FPBits(0x42d00008U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } @@ -60,7 +60,7 @@ TEST_F(LlvmLibcCoshfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH(mpfr::Operation::Cosh, x, LIBC_NAMESPACE::coshf(x), 0.5); @@ -68,12 +68,12 @@ TEST_F(LlvmLibcCoshfTest, InFloatRange) { } TEST_F(LlvmLibcCoshfTest, SmallValues) { - float x = float(FPBits(0x17800000U)); + float x = FPBits(0x17800000U).get_val(); float result = LIBC_NAMESPACE::coshf(x); EXPECT_MPFR_MATCH(mpfr::Operation::Cosh, x, result, 0.5); EXPECT_FP_EQ(1.0f, result); - x = float(FPBits(0x0040000U)); + x = FPBits(0x0040000U).get_val(); result = LIBC_NAMESPACE::coshf(x); EXPECT_MPFR_MATCH(mpfr::Operation::Cosh, x, result, 0.5); EXPECT_FP_EQ(1.0f, result); diff --git a/libc/test/src/math/erff_test.cpp b/libc/test/src/math/erff_test.cpp index 933f77c0b5a67..8ebde4ec24fb5 100644 --- a/libc/test/src/math/erff_test.cpp +++ b/libc/test/src/math/erff_test.cpp @@ -36,7 +36,7 @@ TEST_F(LlvmLibcErffTest, TrickyInputs) { 0x4004'1e6aU, // |x| = 0x1.083cd4p+1f }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Erf, x, LIBC_NAMESPACE::erff(x), 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Erf, -x, diff --git a/libc/test/src/math/exhaustive/exhaustive_test.h b/libc/test/src/math/exhaustive/exhaustive_test.h index dbb59e5269dc3..c4ae382688a03 100644 --- a/libc/test/src/math/exhaustive/exhaustive_test.h +++ b/libc/test/src/math/exhaustive/exhaustive_test.h @@ -55,7 +55,7 @@ struct UnaryOpChecker : public virtual LIBC_NAMESPACE::testing::Test { uint64_t failed = 0; do { FPBits xbits(bits); - FloatType x = FloatType(xbits); + FloatType x = xbits.get_val(); bool correct = TEST_MPFR_MATCH_ROUNDING_SILENTLY(Op, x, FUNC(x), 0.5, rounding); failed += (!correct); @@ -127,9 +127,8 @@ struct LlvmLibcExhaustiveMathTest msg << "Test failed for " << std::dec << failed_in_range << " inputs in range: " << range_begin << " to " << range_end << " [0x" << std::hex << range_begin << ", 0x" << range_end - << "), [" << std::hexfloat - << static_cast(FPBits(range_begin)) << ", " - << static_cast(FPBits(range_end)) << ")\n"; + << "), [" << std::hexfloat << 
FPBits(range_begin).get_val() + << ", " << FPBits(range_end).get_val() << ")\n"; std::cerr << msg.str() << std::flush; failed.fetch_add(failed_in_range); diff --git a/libc/test/src/math/exhaustive/hypotf_test.cpp b/libc/test/src/math/exhaustive/hypotf_test.cpp index df597563ab0cd..04da55d4d3a9f 100644 --- a/libc/test/src/math/exhaustive/hypotf_test.cpp +++ b/libc/test/src/math/exhaustive/hypotf_test.cpp @@ -31,10 +31,10 @@ struct HypotfChecker : public virtual LIBC_NAMESPACE::testing::Test { uint32_t xbits = start; uint64_t failed = 0; do { - float x = float(FPBits(xbits)); + float x = FPBits(xbits).get_val(); uint32_t ybits = Y_START; do { - float y = float(FPBits(ybits)); + float y = FPBits(ybits).get_val(); bool correct = TEST_FP_EQ(LIBC_NAMESPACE::fputil::hypot(x, y), LIBC_NAMESPACE::hypotf(x, y)); // Using MPFR will be much slower. diff --git a/libc/test/src/math/exhaustive/sincosf_test.cpp b/libc/test/src/math/exhaustive/sincosf_test.cpp index 4104034d17c6f..4d53d24f05328 100644 --- a/libc/test/src/math/exhaustive/sincosf_test.cpp +++ b/libc/test/src/math/exhaustive/sincosf_test.cpp @@ -26,7 +26,7 @@ struct SincosfChecker : public virtual LIBC_NAMESPACE::testing::Test { uint64_t failed = 0; do { FPBits xbits(bits); - FloatType x = FloatType(xbits); + FloatType x = xbits.get_val(); FloatType sinx, cosx; LIBC_NAMESPACE::sincosf(x, &sinx, &cosx); diff --git a/libc/test/src/math/exp10_test.cpp b/libc/test/src/math/exp10_test.cpp index ec3925846dba4..e9990b3ed8e6d 100644 --- a/libc/test/src/math/exp10_test.cpp +++ b/libc/test/src/math/exp10_test.cpp @@ -79,7 +79,7 @@ TEST_F(LlvmLibcExp10Test, TrickyInputs) { 0x4037000000000000, // x = 23 }; for (int i = 0; i < N; ++i) { - double x = double(FPBits(INPUTS[i])); + double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x, LIBC_NAMESPACE::exp10(x), 0.5); } diff --git a/libc/test/src/math/exp10f_test.cpp b/libc/test/src/math/exp10f_test.cpp index e3151dafa9429..0866488935c20 100644 --- a/libc/test/src/math/exp10f_test.cpp +++ b/libc/test/src/math/exp10f_test.cpp @@ -42,30 +42,31 @@ TEST_F(LlvmLibcExp10fTest, SpecialNumbers) { TEST_F(LlvmLibcExp10fTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp10f(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp10f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp10f(float(FPBits(0x43000000U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp10f(FPBits(0x43000000U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp10f(float(FPBits(0x43000001U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp10f(FPBits(0x43000001U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } TEST_F(LlvmLibcExp10fTest, Underflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - 0.0f, LIBC_NAMESPACE::exp10f(float(FPBits(0xff7fffffU))), FE_UNDERFLOW); + 0.0f, LIBC_NAMESPACE::exp10f(FPBits(0xff7fffffU).get_val()), + FE_UNDERFLOW); EXPECT_MATH_ERRNO(ERANGE); - float x = float(FPBits(0xc2cffff8U)); + float x = FPBits(0xc2cffff8U).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x, LIBC_NAMESPACE::exp10f(x), 0.5); EXPECT_MATH_ERRNO(ERANGE); - x = float(FPBits(0xc2d00008U)); + x = FPBits(0xc2d00008U).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x, LIBC_NAMESPACE::exp10f(x), 0.5); EXPECT_MATH_ERRNO(ERANGE); @@ -97,7 +98,7 @@ TEST_F(LlvmLibcExp10fTest, TrickyInputs) { }; for (int i = 
0; i < N; ++i) { libc_errno = 0; - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, x, LIBC_NAMESPACE::exp10f(x), 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10, -x, @@ -109,7 +110,7 @@ TEST_F(LlvmLibcExp10fTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; libc_errno = 0; diff --git a/libc/test/src/math/exp2_test.cpp b/libc/test/src/math/exp2_test.cpp index 539ec7b9368f4..d66c9b757625d 100644 --- a/libc/test/src/math/exp2_test.cpp +++ b/libc/test/src/math/exp2_test.cpp @@ -54,7 +54,7 @@ TEST_F(LlvmLibcExp2Test, TrickyInputs) { 0xbc971547652b82fe, // x=-0x1.71547652b82fep-54 }; for (int i = 0; i < N; ++i) { - double x = double(FPBits(INPUTS[i])); + double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, LIBC_NAMESPACE::exp2(x), 0.5); } diff --git a/libc/test/src/math/exp2f_test.cpp b/libc/test/src/math/exp2f_test.cpp index 4c69c1d48d621..18607b1d04910 100644 --- a/libc/test/src/math/exp2f_test.cpp +++ b/libc/test/src/math/exp2f_test.cpp @@ -43,15 +43,15 @@ TEST_F(LlvmLibcExp2fTest, SpecialNumbers) { TEST_F(LlvmLibcExp2fTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp2f(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp2f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp2f(float(FPBits(0x43000000U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp2f(FPBits(0x43000000U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp2f(float(FPBits(0x43000001U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp2f(FPBits(0x43000001U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } @@ -73,7 +73,7 @@ TEST_F(LlvmLibcExp2fTest, TrickyInputs) { }; for (int i = 0; i < N; ++i) { libc_errno = 0; - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, LIBC_NAMESPACE::exp2f(x), 0.5); EXPECT_MATH_ERRNO(0); @@ -83,20 +83,20 @@ TEST_F(LlvmLibcExp2fTest, TrickyInputs) { TEST_F(LlvmLibcExp2fTest, Underflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - 0.0f, LIBC_NAMESPACE::exp2f(float(FPBits(0xff7fffffU))), FE_UNDERFLOW); + 0.0f, LIBC_NAMESPACE::exp2f(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW); EXPECT_MATH_ERRNO(ERANGE); - float x = float(FPBits(0xc3158000U)); + float x = FPBits(0xc3158000U).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, LIBC_NAMESPACE::exp2f(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0xc3160000U)); + x = FPBits(0xc3160000U).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, LIBC_NAMESPACE::exp2f(x), 0.5); EXPECT_MATH_ERRNO(ERANGE); - x = float(FPBits(0xc3165432U)); + x = FPBits(0xc3165432U).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp2, x, LIBC_NAMESPACE::exp2f(x), 0.5); EXPECT_MATH_ERRNO(ERANGE); @@ -106,7 +106,7 @@ TEST_F(LlvmLibcExp2fTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; libc_errno = 0; diff --git 
a/libc/test/src/math/exp_test.cpp b/libc/test/src/math/exp_test.cpp index 1de2d7507acc3..454107f307d86 100644 --- a/libc/test/src/math/exp_test.cpp +++ b/libc/test/src/math/exp_test.cpp @@ -52,7 +52,7 @@ TEST_F(LlvmLibcExpTest, TrickyInputs) { 0xc0867a172ceb0990, // x=-0x1.67a172ceb099p+9 }; for (int i = 0; i < N; ++i) { - double x = double(FPBits(INPUTS[i])); + double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::exp(x), 0.5); } diff --git a/libc/test/src/math/expf_test.cpp b/libc/test/src/math/expf_test.cpp index 521eba705b69f..0ac64ceec1c71 100644 --- a/libc/test/src/math/expf_test.cpp +++ b/libc/test/src/math/expf_test.cpp @@ -42,30 +42,30 @@ TEST_F(LlvmLibcExpfTest, SpecialNumbers) { TEST_F(LlvmLibcExpfTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expf(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expf(float(FPBits(0x42cffff8U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expf(FPBits(0x42cffff8U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expf(float(FPBits(0x42d00008U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expf(FPBits(0x42d00008U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } TEST_F(LlvmLibcExpfTest, Underflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - 0.0f, LIBC_NAMESPACE::expf(float(FPBits(0xff7fffffU))), FE_UNDERFLOW); + 0.0f, LIBC_NAMESPACE::expf(FPBits(0xff7fffffU).get_val()), FE_UNDERFLOW); EXPECT_MATH_ERRNO(ERANGE); - float x = float(FPBits(0xc2cffff8U)); + float x = FPBits(0xc2cffff8U).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::expf(x), 0.5); EXPECT_MATH_ERRNO(ERANGE); - x = float(FPBits(0xc2d00008U)); + x = FPBits(0xc2d00008U).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::expf(x), 0.5); EXPECT_MATH_ERRNO(ERANGE); @@ -77,27 +77,27 @@ TEST_F(LlvmLibcExpfTest, Borderline) { float x; libc_errno = 0; - x = float(FPBits(0x42affff8U)); + x = FPBits(0x42affff8U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::expf(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0x42b00008U)); + x = FPBits(0x42b00008U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::expf(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0xc2affff8U)); + x = FPBits(0xc2affff8U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::expf(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0xc2b00008U)); + x = FPBits(0xc2b00008U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::expf(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0xc236bd8cU)); + x = FPBits(0xc236bd8cU).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp, x, LIBC_NAMESPACE::expf(x), 0.5); EXPECT_MATH_ERRNO(0); @@ -107,7 +107,7 @@ TEST_F(LlvmLibcExpfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; libc_errno = 0; diff --git a/libc/test/src/math/expm1_test.cpp b/libc/test/src/math/expm1_test.cpp index ad53ffb6e8af1..99ef8275eab73 100644 --- a/libc/test/src/math/expm1_test.cpp +++ 
b/libc/test/src/math/expm1_test.cpp @@ -48,7 +48,7 @@ TEST_F(LlvmLibcExpm1Test, TrickyInputs) { 0xc042b708872320dd, // x=-0x1.2b708872320ddp+5 }; for (int i = 0; i < N; ++i) { - double x = double(FPBits(INPUTS[i])); + double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1(x), 0.5); } diff --git a/libc/test/src/math/expm1f_test.cpp b/libc/test/src/math/expm1f_test.cpp index a9eaa4dbe5381..cc820803cc30b 100644 --- a/libc/test/src/math/expm1f_test.cpp +++ b/libc/test/src/math/expm1f_test.cpp @@ -42,26 +42,26 @@ TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) { TEST_F(LlvmLibcExpm1fTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expm1f(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expm1f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expm1f(float(FPBits(0x42cffff8U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expm1f(FPBits(0x42cffff8U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expm1f(float(FPBits(0x42d00008U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expm1f(FPBits(0x42d00008U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } TEST_F(LlvmLibcExpm1fTest, Underflow) { libc_errno = 0; - EXPECT_FP_EQ(-1.0f, LIBC_NAMESPACE::expm1f(float(FPBits(0xff7fffffU)))); + EXPECT_FP_EQ(-1.0f, LIBC_NAMESPACE::expm1f(FPBits(0xff7fffffU).get_val())); - float x = float(FPBits(0xc2cffff8U)); + float x = FPBits(0xc2cffff8U).get_val(); EXPECT_FP_EQ(-1.0f, LIBC_NAMESPACE::expm1f(x)); - x = float(FPBits(0xc2d00008U)); + x = FPBits(0xc2d00008U).get_val(); EXPECT_FP_EQ(-1.0f, LIBC_NAMESPACE::expm1f(x)); } @@ -71,42 +71,42 @@ TEST_F(LlvmLibcExpm1fTest, Borderline) { float x; libc_errno = 0; - x = float(FPBits(0x42affff8U)); + x = FPBits(0x42affff8U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0x42b00008U)); + x = FPBits(0x42b00008U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0xc2affff8U)); + x = FPBits(0xc2affff8U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0xc2b00008U)); + x = FPBits(0xc2b00008U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0x3dc252ddU)); + x = FPBits(0x3dc252ddU).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0x3e35bec5U)); + x = FPBits(0x3e35bec5U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0x942ed494U)); + x = FPBits(0x942ed494U).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); EXPECT_MATH_ERRNO(0); - x = float(FPBits(0xbdc1c6cbU)); + x = FPBits(0xbdc1c6cbU).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Expm1, x, LIBC_NAMESPACE::expm1f(x), 0.5); EXPECT_MATH_ERRNO(0); @@ -116,7 +116,7 @@ TEST_F(LlvmLibcExpm1fTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + 
float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; libc_errno = 0; diff --git a/libc/test/src/math/log10_test.cpp b/libc/test/src/math/log10_test.cpp index 72224c2471800..de9206f2daec2 100644 --- a/libc/test/src/math/log10_test.cpp +++ b/libc/test/src/math/log10_test.cpp @@ -66,7 +66,7 @@ TEST_F(LlvmLibcLog10Test, TrickyInputs) { 0x30160580e7268a99, 0x5ca04103b7eaa345, 0x19ad77dc4a40093f, 0x0000449fb5c8a96e}; for (int i = 0; i < N; ++i) { - double x = double(FPBits(INPUTS[i])); + double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log10, x, LIBC_NAMESPACE::log10(x), 0.5); } diff --git a/libc/test/src/math/log10f_test.cpp b/libc/test/src/math/log10f_test.cpp index 3448ea7570eec..c38a5159682cf 100644 --- a/libc/test/src/math/log10f_test.cpp +++ b/libc/test/src/math/log10f_test.cpp @@ -59,7 +59,7 @@ TEST_F(LlvmLibcLog10fTest, TrickyInputs) { }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log10, x, LIBC_NAMESPACE::log10f(x), 0.5); } @@ -69,7 +69,7 @@ TEST_F(LlvmLibcLog10fTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log10, x, LIBC_NAMESPACE::log10f(x), 0.5); } diff --git a/libc/test/src/math/log1p_test.cpp b/libc/test/src/math/log1p_test.cpp index 5bec911937dca..f70c0f87560cd 100644 --- a/libc/test/src/math/log1p_test.cpp +++ b/libc/test/src/math/log1p_test.cpp @@ -67,7 +67,7 @@ TEST_F(LlvmLibcLog1pTest, TrickyInputs) { 0x5671e2f1628093e4, 0x73dac56e2bf1a951, 0x8001bc6879ea14c5, }; for (int i = 0; i < N; ++i) { - double x = double(FPBits(INPUTS[i])); + double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log1p, x, LIBC_NAMESPACE::log1p(x), 0.5); } diff --git a/libc/test/src/math/log1pf_test.cpp b/libc/test/src/math/log1pf_test.cpp index 16cdc4704cb8a..37144838552cc 100644 --- a/libc/test/src/math/log1pf_test.cpp +++ b/libc/test/src/math/log1pf_test.cpp @@ -63,7 +63,7 @@ TEST_F(LlvmLibcLog1pfTest, TrickyInputs) { 0xbf800000U, /*-1.0f*/ }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log1p, x, LIBC_NAMESPACE::log1pf(x), 0.5); } @@ -73,7 +73,7 @@ TEST_F(LlvmLibcLog1pfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; libc_errno = 0; diff --git a/libc/test/src/math/log2_test.cpp b/libc/test/src/math/log2_test.cpp index b471b8eb540fe..65eae58b3508a 100644 --- a/libc/test/src/math/log2_test.cpp +++ b/libc/test/src/math/log2_test.cpp @@ -64,7 +64,7 @@ TEST_F(LlvmLibcLog2Test, TrickyInputs) { 0x3fefbfdaa448ed98, }; for (int i = 0; i < N; ++i) { - double x = double(FPBits(INPUTS[i])); + double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x, LIBC_NAMESPACE::log2(x), 0.5); } diff --git a/libc/test/src/math/log2f_test.cpp b/libc/test/src/math/log2f_test.cpp index aaa6320fb26b1..0793bf8a04091 100644 --- a/libc/test/src/math/log2f_test.cpp +++ b/libc/test/src/math/log2f_test.cpp @@ -39,7 +39,7 @@ TEST_F(LlvmLibcLog2fTest, 
TrickyInputs) { 0x3f80079bU, 0x3f81d0b5U, 0x3f82e602U, 0x3f83c98dU, 0x3f8cba39U}; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log2, x, LIBC_NAMESPACE::log2f(x), 0.5); } @@ -49,7 +49,7 @@ TEST_F(LlvmLibcLog2fTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; libc_errno = 0; diff --git a/libc/test/src/math/log_test.cpp b/libc/test/src/math/log_test.cpp index e3f41e1e264a3..457117c1757a7 100644 --- a/libc/test/src/math/log_test.cpp +++ b/libc/test/src/math/log_test.cpp @@ -63,7 +63,7 @@ TEST_F(LlvmLibcLogTest, TrickyInputs) { 0x3fefbfdaa448ed98, }; for (int i = 0; i < N; ++i) { - double x = double(FPBits(INPUTS[i])); + double x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log, x, LIBC_NAMESPACE::log(x), 0.5); } diff --git a/libc/test/src/math/logf_test.cpp b/libc/test/src/math/logf_test.cpp index c02e95cb6f800..3ab67ba807823 100644 --- a/libc/test/src/math/logf_test.cpp +++ b/libc/test/src/math/logf_test.cpp @@ -71,7 +71,7 @@ TEST_F(LlvmLibcLogfTest, TrickyInputs) { 0x7a17f30aU, /*0x1.2fe614p+117f*/ }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log, x, LIBC_NAMESPACE::logf(x), 0.5); } @@ -81,7 +81,7 @@ TEST_F(LlvmLibcLogfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Log, x, diff --git a/libc/test/src/math/sin_test.cpp b/libc/test/src/math/sin_test.cpp index cd9eeaff598fe..69981ed22ee69 100644 --- a/libc/test/src/math/sin_test.cpp +++ b/libc/test/src/math/sin_test.cpp @@ -23,7 +23,7 @@ TEST_F(LlvmLibcSinTest, Range) { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - double x = double(FPBits(v)); + double x = FPBits(v).get_val(); // TODO: Expand the range of testing after range reduction is implemented. if (isnan(x) || isinf(x) || x > _2pi || x < -_2pi) continue; diff --git a/libc/test/src/math/sincosf_test.cpp b/libc/test/src/math/sincosf_test.cpp index fde707f892a9e..76fda6354f63a 100644 --- a/libc/test/src/math/sincosf_test.cpp +++ b/libc/test/src/math/sincosf_test.cpp @@ -100,7 +100,7 @@ TEST_F(LlvmLibcSinCosfTest, InFloatRange) { constexpr uint32_t COUNT = 1'001; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits((v))); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; @@ -159,7 +159,7 @@ TEST_F(LlvmLibcSinCosfTest, SpecialValues) { }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_SINCOS_MATCH_ALL_ROUNDING(x); EXPECT_SINCOS_MATCH_ALL_ROUNDING(-x); } @@ -169,7 +169,7 @@ TEST_F(LlvmLibcSinCosfTest, SpecialValues) { // returns values furthest beyond its nominal upper bound of pi/4. 
TEST_F(LlvmLibcSinCosfTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { - float x = float(FPBits((v))); + float x = FPBits(v).get_val(); EXPECT_SINCOS_MATCH_ALL_ROUNDING(x); EXPECT_SINCOS_MATCH_ALL_ROUNDING(-x); } diff --git a/libc/test/src/math/sinf_test.cpp b/libc/test/src/math/sinf_test.cpp index 107b015512990..32afc4f1c60d1 100644 --- a/libc/test/src/math/sinf_test.cpp +++ b/libc/test/src/math/sinf_test.cpp @@ -47,7 +47,7 @@ TEST_F(LlvmLibcSinfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, @@ -97,7 +97,7 @@ TEST_F(LlvmLibcSinfTest, SpecificBitPatterns) { }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, LIBC_NAMESPACE::sinf(x), 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, -x, @@ -107,11 +107,11 @@ TEST_F(LlvmLibcSinfTest, SpecificBitPatterns) { // For small values, sin(x) is x. TEST_F(LlvmLibcSinfTest, SmallValues) { - float x = float(FPBits(0x1780'0000U)); + float x = FPBits(0x1780'0000U).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, LIBC_NAMESPACE::sinf(x), 0.5); - x = float(FPBits(0x0040'0000U)); + x = FPBits(0x0040'0000U).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, LIBC_NAMESPACE::sinf(x), 0.5); } @@ -120,7 +120,7 @@ TEST_F(LlvmLibcSinfTest, SmallValues) { // returns values furthest beyond its nominal upper bound of pi/4. TEST_F(LlvmLibcSinfTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { - float x = float(FPBits((v))); + float x = FPBits((v)).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sin, x, LIBC_NAMESPACE::sinf(x), 0.5); } diff --git a/libc/test/src/math/sinhf_test.cpp b/libc/test/src/math/sinhf_test.cpp index dc80a1b7aedf9..765fdc6c2bcc4 100644 --- a/libc/test/src/math/sinhf_test.cpp +++ b/libc/test/src/math/sinhf_test.cpp @@ -45,7 +45,7 @@ TEST_F(LlvmLibcSinhfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH(mpfr::Operation::Sinh, x, LIBC_NAMESPACE::sinhf(x), 0.5); @@ -54,12 +54,12 @@ TEST_F(LlvmLibcSinhfTest, InFloatRange) { // For small values, sinh(x) is x. 
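// Why this holds, as a short worked bound: sinh(x) = x + x^3/6 + ...,
// and for the first input below, x = 2^-80 (bit pattern 0x17800000),
// the cubic term is about 2^-242 while ulp(x) = 2^-103, so the
// correctly rounded result is x itself:
//
//   x^3 / 6  ~  2^-242  <<  0.5 * ulp(x)  =  2^-104
//
// which is exactly what the EXPECT_FP_EQ(x, result) checks verify.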
TEST_F(LlvmLibcSinhfTest, SmallValues) { - float x = float(FPBits(uint32_t(0x17800000))); + float x = FPBits(uint32_t(0x17800000)).get_val(); float result = LIBC_NAMESPACE::sinhf(x); EXPECT_MPFR_MATCH(mpfr::Operation::Sinh, x, result, 0.5); EXPECT_FP_EQ(x, result); - x = float(FPBits(uint32_t(0x00400000))); + x = FPBits(uint32_t(0x00400000)).get_val(); result = LIBC_NAMESPACE::sinhf(x); EXPECT_MPFR_MATCH(mpfr::Operation::Sinh, x, result, 0.5); EXPECT_FP_EQ(x, result); @@ -68,24 +68,24 @@ TEST_F(LlvmLibcSinhfTest, SmallValues) { TEST_F(LlvmLibcSinhfTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::sinhf(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::sinhf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::sinhf(float(FPBits(0x42cffff8U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::sinhf(FPBits(0x42cffff8U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::sinhf(float(FPBits(0x42d00008U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::sinhf(FPBits(0x42d00008U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } TEST_F(LlvmLibcSinhfTest, ExceptionalValues) { - float x = float(FPBits(uint32_t(0x3a12'85ffU))); + float x = FPBits(uint32_t(0x3a12'85ffU)).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinh, x, LIBC_NAMESPACE::sinhf(x), 0.5); - x = -float(FPBits(uint32_t(0x3a12'85ffU))); + x = -FPBits(uint32_t(0x3a12'85ffU)).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinh, x, LIBC_NAMESPACE::sinhf(x), 0.5); } diff --git a/libc/test/src/math/smoke/FDimTest.h b/libc/test/src/math/smoke/FDimTest.h index 46df2b40e64bd..31df2f5607c26 100644 --- a/libc/test/src/math/smoke/FDimTest.h +++ b/libc/test/src/math/smoke/FDimTest.h @@ -20,11 +20,11 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); void test_na_n_arg(FuncPtr func) { EXPECT_FP_EQ(nan, func(nan, inf)); @@ -66,7 +66,7 @@ class FDimTestTemplate : public LIBC_NAMESPACE::testing::Test { constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; ++i, v += STEP, w -= STEP) { - T x = T(FPBits(v)), y = T(FPBits(w)); + T x = FPBits(v).get_val(), y = FPBits(w).get_val(); if (isnan(x) || isinf(x)) continue; if (isnan(y) || isinf(y)) diff --git a/libc/test/src/math/smoke/FMaxTest.h b/libc/test/src/math/smoke/FMaxTest.h index 372d9c5571d31..98edc8e971e81 100644 --- a/libc/test/src/math/smoke/FMaxTest.h +++ b/libc/test/src/math/smoke/FMaxTest.h @@ -56,7 +56,7 @@ template class FMaxTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; ++i, v += STEP, w -= STEP) { - T x = T(FPBits(v)), y = T(FPBits(w)); + T x = FPBits(v).get_val(), y = FPBits(w).get_val(); if (isnan(x) || isinf(x)) continue; if (isnan(y) || isinf(y)) diff 
--git a/libc/test/src/math/smoke/FMinTest.h b/libc/test/src/math/smoke/FMinTest.h index a51f30803ba82..834d757c85d04 100644 --- a/libc/test/src/math/smoke/FMinTest.h +++ b/libc/test/src/math/smoke/FMinTest.h @@ -56,7 +56,7 @@ template class FMinTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0, w = STORAGE_MAX; i <= COUNT; ++i, v += STEP, w -= STEP) { - T x = T(FPBits(v)), y = T(FPBits(w)); + T x = FPBits(v).get_val(), y = FPBits(w).get_val(); if (isnan(x) || isinf(x)) continue; if (isnan(y) || isinf(y)) diff --git a/libc/test/src/math/smoke/FmaTest.h b/libc/test/src/math/smoke/FmaTest.h index 106ba1405a968..9d76d3bde3121 100644 --- a/libc/test/src/math/smoke/FmaTest.h +++ b/libc/test/src/math/smoke/FmaTest.h @@ -21,11 +21,11 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); public: void test_special_numbers(Func func) { @@ -39,16 +39,16 @@ class FmaTestTemplate : public LIBC_NAMESPACE::testing::Test { EXPECT_FP_EQ(func(inf, neg_inf, nan), nan); // Test underflow rounding up. - EXPECT_FP_EQ( - func(T(0.5), T(FPBits::min_subnormal()), T(FPBits::min_subnormal())), - T(FPBits(StorageType(2)))); + EXPECT_FP_EQ(func(T(0.5), FPBits::min_subnormal().get_val(), + FPBits::min_subnormal().get_val()), + FPBits(StorageType(2)).get_val()); // Test underflow rounding down. StorageType MIN_NORMAL = FPBits::min_normal().uintval(); - T v = T(FPBits(MIN_NORMAL + StorageType(1))); - EXPECT_FP_EQ(func(T(1) / T(MIN_NORMAL << 1), v, T(FPBits::min_normal())), - v); + T v = FPBits(MIN_NORMAL + StorageType(1)).get_val(); + EXPECT_FP_EQ( + func(T(1) / T(MIN_NORMAL << 1), v, FPBits::min_normal().get_val()), v); // Test overflow. - T z = T(FPBits::max_normal()); + T z = FPBits::max_normal().get_val(); EXPECT_FP_EQ(func(T(1.75), z, -z), T(0.75) * z); // Exact cancellation. 
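// Here 3.0 * 5.0 == 15.0 exactly in binary floating point, so the
// fused operation cancels to zero with no rounding at all. Assuming
// llvm-libc's EXPECT_FP_EQ compares bit patterns, the check below also
// pins the sign of the zero: under the default round-to-nearest mode,
// an exact zero sum of opposite-signed operands is +0.0 (only
// roundTowardNegative would yield -0.0):
//
//   fma(3.0, 5.0, -15.0) == +0.0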
EXPECT_FP_EQ(func(T(3.0), T(5.0), -T(15.0)), T(0.0)); diff --git a/libc/test/src/math/smoke/HypotTest.h b/libc/test/src/math/smoke/HypotTest.h index 5400fc0730d8a..0f3b52d53f49c 100644 --- a/libc/test/src/math/smoke/HypotTest.h +++ b/libc/test/src/math/smoke/HypotTest.h @@ -22,16 +22,16 @@ class HypotTestTemplate : public LIBC_NAMESPACE::testing::Test { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T nan = T(FPBits::build_quiet_nan()); - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - - const T max_normal = T(FPBits::max_normal()); - const T min_normal = T(FPBits::min_normal()); - const T max_subnormal = T(FPBits::max_subnormal()); - const T min_subnormal = T(FPBits::min_subnormal()); + const T nan = FPBits::build_quiet_nan().get_val(); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + + const T max_normal = FPBits::max_normal().get_val(); + const T min_normal = FPBits::min_normal().get_val(); + const T max_subnormal = FPBits::max_subnormal().get_val(); + const T min_subnormal = FPBits::min_subnormal().get_val(); public: void test_special_numbers(Func func) { diff --git a/libc/test/src/math/smoke/ILogbTest.h b/libc/test/src/math/smoke/ILogbTest.h index fe28fab9d777c..fb8d77093eba3 100644 --- a/libc/test/src/math/smoke/ILogbTest.h +++ b/libc/test/src/math/smoke/ILogbTest.h @@ -26,11 +26,11 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { void test_special_numbers(typename ILogbFunc::Func func) { using FPBits = LIBC_NAMESPACE::fputil::FPBits; using Sign = LIBC_NAMESPACE::fputil::Sign; - EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::POS)))); - EXPECT_EQ(FP_ILOGB0, func(T(FPBits::zero(Sign::NEG)))); - EXPECT_EQ(FP_ILOGBNAN, func(T(FPBits::build_quiet_nan()))); - EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::POS)))); - EXPECT_EQ(INT_MAX, func(T(FPBits::inf(Sign::NEG)))); + EXPECT_EQ(FP_ILOGB0, func(FPBits::zero(Sign::POS).get_val())); + EXPECT_EQ(FP_ILOGB0, func(FPBits::zero(Sign::NEG).get_val())); + EXPECT_EQ(FP_ILOGBNAN, func(FPBits::build_quiet_nan().get_val())); + EXPECT_EQ(INT_MAX, func(FPBits::inf(Sign::POS).get_val())); + EXPECT_EQ(INT_MAX, func(FPBits::inf(Sign::NEG).get_val())); } template @@ -81,7 +81,7 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 10'001; constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; for (StorageType v = MIN_SUBNORMAL; v <= MAX_SUBNORMAL; v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x) || x == 0.0) continue; @@ -100,7 +100,7 @@ class LlvmLibcILogbTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 10'001; constexpr StorageType STEP = (MAX_NORMAL - MIN_NORMAL) / COUNT; for (StorageType v = MIN_NORMAL; v <= MAX_NORMAL; v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x) || x == 0.0) continue; diff --git a/libc/test/src/math/smoke/LdExpTest.h b/libc/test/src/math/smoke/LdExpTest.h index 08a14a90c1226..1d581f2c0a02f 100644 --- a/libc/test/src/math/smoke/LdExpTest.h +++ b/libc/test/src/math/smoke/LdExpTest.h @@ -25,11 +25,11 @@ class LdExpTestTemplate : public 
LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); // A normalized mantissa to be used with tests. static constexpr StorageType MANTISSA = NormalFloat::ONE + 0x1234; diff --git a/libc/test/src/math/smoke/LogbTest.h b/libc/test/src/math/smoke/LogbTest.h index a2628273cecc9..e2698e2b7b81b 100644 --- a/libc/test/src/math/smoke/LogbTest.h +++ b/libc/test/src/math/smoke/LogbTest.h @@ -72,7 +72,7 @@ template class LogbTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = static_cast(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x) || x == 0.0l) continue; diff --git a/libc/test/src/math/smoke/ModfTest.h b/libc/test/src/math/smoke/ModfTest.h index ea7e36b67d11f..a73e5ae4298fa 100644 --- a/libc/test/src/math/smoke/ModfTest.h +++ b/libc/test/src/math/smoke/ModfTest.h @@ -84,7 +84,7 @@ template class ModfTest : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - T x = T(FPBits(v)); + T x = FPBits(v).get_val(); if (isnan(x) || isinf(x) || x == T(0.0)) continue; diff --git a/libc/test/src/math/smoke/NextAfterTest.h b/libc/test/src/math/smoke/NextAfterTest.h index 859e2c74a3693..d2870562b238d 100644 --- a/libc/test/src/math/smoke/NextAfterTest.h +++ b/libc/test/src/math/smoke/NextAfterTest.h @@ -34,11 +34,11 @@ class NextAfterTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); static constexpr StorageType min_subnormal = FPBits::min_subnormal().uintval(); diff --git a/libc/test/src/math/smoke/NextTowardTest.h b/libc/test/src/math/smoke/NextTowardTest.h index c9f07b993f04b..5bbc89b9e5f76 100644 --- a/libc/test/src/math/smoke/NextTowardTest.h +++ b/libc/test/src/math/smoke/NextTowardTest.h @@ -36,11 +36,11 @@ class NextTowardTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = 
FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); const long double to_zero = ToFPBits::zero().get_val(); const long double to_neg_zero = ToFPBits::zero(Sign::NEG).get_val(); diff --git a/libc/test/src/math/smoke/RIntTest.h b/libc/test/src/math/smoke/RIntTest.h index bd99e634badbd..88c4c560fffaf 100644 --- a/libc/test/src/math/smoke/RIntTest.h +++ b/libc/test/src/math/smoke/RIntTest.h @@ -31,11 +31,11 @@ class RIntTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); public: void testSpecialNumbers(RIntFunc func) { diff --git a/libc/test/src/math/smoke/RemQuoTest.h b/libc/test/src/math/smoke/RemQuoTest.h index 739661f2d9619..e0fbe4ee935fb 100644 --- a/libc/test/src/math/smoke/RemQuoTest.h +++ b/libc/test/src/math/smoke/RemQuoTest.h @@ -21,11 +21,11 @@ class RemQuoTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const T inf = T(FPBits::inf(Sign::POS)); - const T neg_inf = T(FPBits::inf(Sign::NEG)); - const T zero = T(FPBits::zero(Sign::POS)); - const T neg_zero = T(FPBits::zero(Sign::NEG)); - const T nan = T(FPBits::build_quiet_nan()); + const T inf = FPBits::inf(Sign::POS).get_val(); + const T neg_inf = FPBits::inf(Sign::NEG).get_val(); + const T zero = FPBits::zero(Sign::POS).get_val(); + const T neg_zero = FPBits::zero(Sign::NEG).get_val(); + const T nan = FPBits::build_quiet_nan().get_val(); public: typedef T (*RemQuoFunc)(T, T, int *); diff --git a/libc/test/src/math/smoke/RoundToIntegerTest.h b/libc/test/src/math/smoke/RoundToIntegerTest.h index 404155604645c..836f827c14504 100644 --- a/libc/test/src/math/smoke/RoundToIntegerTest.h +++ b/libc/test/src/math/smoke/RoundToIntegerTest.h @@ -30,11 +30,11 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { using StorageType = typename FPBits::StorageType; using Sign = LIBC_NAMESPACE::fputil::Sign; - const F zero = F(FPBits::zero(Sign::POS)); - const F neg_zero = F(FPBits::zero(Sign::NEG)); - const F inf = F(FPBits::inf(Sign::POS)); - const F neg_inf = F(FPBits::inf(Sign::NEG)); - const F nan = F(FPBits::build_quiet_nan()); + const F zero = FPBits::zero(Sign::POS).get_val(); + const F neg_zero = FPBits::zero(Sign::NEG).get_val(); + const F inf = FPBits::inf(Sign::POS).get_val(); + const F neg_inf = FPBits::inf(Sign::NEG).get_val(); + const F nan = FPBits::build_quiet_nan().get_val(); static constexpr StorageType MAX_SUBNORMAL = FPBits::max_subnormal().uintval(); @@ -119,7 +119,7 @@ class RoundToIntegerTestTemplate : public LIBC_NAMESPACE::testing::Test { constexpr StorageType COUNT = 1'000'001; constexpr StorageType STEP = (MAX_SUBNORMAL - MIN_SUBNORMAL) / COUNT; for (StorageType i = MIN_SUBNORMAL; i <= MAX_SUBNORMAL; i += STEP) { - F x = F(FPBits(i)); + F x = FPBits(i).get_val(); if (x == F(0.0)) 
continue; // All subnormal numbers should round to zero. diff --git a/libc/test/src/math/smoke/coshf_test.cpp b/libc/test/src/math/smoke/coshf_test.cpp index 93fdd126896cf..aa224dcc156f0 100644 --- a/libc/test/src/math/smoke/coshf_test.cpp +++ b/libc/test/src/math/smoke/coshf_test.cpp @@ -41,14 +41,14 @@ TEST_F(LlvmLibcCoshfTest, SpecialNumbers) { TEST_F(LlvmLibcCoshfTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::coshf(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::coshf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::coshf(float(FPBits(0x42cffff8U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::coshf(FPBits(0x42cffff8U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::coshf(float(FPBits(0x42d00008U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::coshf(FPBits(0x42d00008U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } diff --git a/libc/test/src/math/smoke/exp10f_test.cpp b/libc/test/src/math/smoke/exp10f_test.cpp index 8b758dd3f62f9..d242edf04fb37 100644 --- a/libc/test/src/math/smoke/exp10f_test.cpp +++ b/libc/test/src/math/smoke/exp10f_test.cpp @@ -43,14 +43,14 @@ TEST_F(LlvmLibcExp10fTest, SpecialNumbers) { TEST_F(LlvmLibcExp10fTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp10f(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp10f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp10f(float(FPBits(0x43000000U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp10f(FPBits(0x43000000U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp10f(float(FPBits(0x43000001U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp10f(FPBits(0x43000001U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } diff --git a/libc/test/src/math/smoke/exp2f_test.cpp b/libc/test/src/math/smoke/exp2f_test.cpp index 6623d449aeebd..e4e56ed23bcff 100644 --- a/libc/test/src/math/smoke/exp2f_test.cpp +++ b/libc/test/src/math/smoke/exp2f_test.cpp @@ -45,14 +45,14 @@ TEST_F(LlvmLibcExp2fTest, SpecialNumbers) { TEST_F(LlvmLibcExp2fTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp2f(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp2f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp2f(float(FPBits(0x43000000U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp2f(FPBits(0x43000000U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::exp2f(float(FPBits(0x43000001U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::exp2f(FPBits(0x43000001U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } diff --git a/libc/test/src/math/smoke/expf_test.cpp b/libc/test/src/math/smoke/expf_test.cpp index ff33e95f90427..24fc35b552ca2 100644 --- a/libc/test/src/math/smoke/expf_test.cpp +++ b/libc/test/src/math/smoke/expf_test.cpp @@ -39,14 +39,14 @@ TEST_F(LlvmLibcExpfTest, SpecialNumbers) { TEST_F(LlvmLibcExpfTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expf(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, 
LIBC_NAMESPACE::expf(float(FPBits(0x42cffff8U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expf(FPBits(0x42cffff8U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expf(float(FPBits(0x42d00008U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expf(FPBits(0x42d00008U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } diff --git a/libc/test/src/math/smoke/expm1f_test.cpp b/libc/test/src/math/smoke/expm1f_test.cpp index 6999a987f9d6a..3d6dae77ec004 100644 --- a/libc/test/src/math/smoke/expm1f_test.cpp +++ b/libc/test/src/math/smoke/expm1f_test.cpp @@ -39,14 +39,14 @@ TEST_F(LlvmLibcExpm1fTest, SpecialNumbers) { TEST_F(LlvmLibcExpm1fTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expm1f(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expm1f(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expm1f(float(FPBits(0x42cffff8U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expm1f(FPBits(0x42cffff8U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::expm1f(float(FPBits(0x42d00008U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::expm1f(FPBits(0x42d00008U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } diff --git a/libc/test/src/math/smoke/sinhf_test.cpp b/libc/test/src/math/smoke/sinhf_test.cpp index aa11ed9cbe105..0563ccbf77aaa 100644 --- a/libc/test/src/math/smoke/sinhf_test.cpp +++ b/libc/test/src/math/smoke/sinhf_test.cpp @@ -40,11 +40,11 @@ TEST_F(LlvmLibcSinhfTest, SpecialNumbers) { // For small values, sinh(x) is x. TEST_F(LlvmLibcSinhfTest, SmallValues) { - float x = float(FPBits(uint32_t(0x17800000))); + float x = FPBits(uint32_t(0x17800000)).get_val(); float result = LIBC_NAMESPACE::sinhf(x); EXPECT_FP_EQ(x, result); - x = float(FPBits(uint32_t(0x00400000))); + x = FPBits(uint32_t(0x00400000)).get_val(); result = LIBC_NAMESPACE::sinhf(x); EXPECT_FP_EQ(x, result); } @@ -52,14 +52,14 @@ TEST_F(LlvmLibcSinhfTest, SmallValues) { TEST_F(LlvmLibcSinhfTest, Overflow) { libc_errno = 0; EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::sinhf(float(FPBits(0x7f7fffffU))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::sinhf(FPBits(0x7f7fffffU).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::sinhf(float(FPBits(0x42cffff8U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::sinhf(FPBits(0x42cffff8U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); EXPECT_FP_EQ_WITH_EXCEPTION( - inf, LIBC_NAMESPACE::sinhf(float(FPBits(0x42d00008U))), FE_OVERFLOW); + inf, LIBC_NAMESPACE::sinhf(FPBits(0x42d00008U).get_val()), FE_OVERFLOW); EXPECT_MATH_ERRNO(ERANGE); } diff --git a/libc/test/src/math/tan_test.cpp b/libc/test/src/math/tan_test.cpp index 8fae425f96f8f..9cdc7c4abd32c 100644 --- a/libc/test/src/math/tan_test.cpp +++ b/libc/test/src/math/tan_test.cpp @@ -22,7 +22,7 @@ TEST_F(LlvmLibcTanTest, Range) { constexpr StorageType COUNT = 100'000; constexpr StorageType STEP = STORAGE_MAX / COUNT; for (StorageType i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - double x = double(FPBits(v)); + double x = FPBits(v).get_val(); // TODO: Expand the range of testing after range reduction is implemented. 
if (isnan(x) || isinf(x) || x > _2pi || x < -_2pi) continue; diff --git a/libc/test/src/math/tanf_test.cpp b/libc/test/src/math/tanf_test.cpp index 6005202e754b1..5621e819522fc 100644 --- a/libc/test/src/math/tanf_test.cpp +++ b/libc/test/src/math/tanf_test.cpp @@ -47,7 +47,7 @@ TEST_F(LlvmLibcTanfTest, InFloatRange) { constexpr uint32_t COUNT = 100'000; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x, @@ -115,7 +115,7 @@ TEST_F(LlvmLibcTanfTest, SpecificBitPatterns) { }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, x, LIBC_NAMESPACE::tanf(x), 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tan, -x, @@ -127,7 +127,7 @@ TEST_F(LlvmLibcTanfTest, SpecificBitPatterns) { // returns values furthest beyond its nominal upper bound of pi/4. TEST_F(LlvmLibcTanfTest, SDCOMP_26094) { for (uint32_t v : SDCOMP26094_VALUES) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); ASSERT_MPFR_MATCH(mpfr::Operation::Tan, x, LIBC_NAMESPACE::tanf(x), 0.5); } } diff --git a/libc/test/src/math/tanhf_test.cpp b/libc/test/src/math/tanhf_test.cpp index 1dc736d369303..862ba6cc7eeb7 100644 --- a/libc/test/src/math/tanhf_test.cpp +++ b/libc/test/src/math/tanhf_test.cpp @@ -44,7 +44,7 @@ TEST_F(LlvmLibcTanhfTest, InFloatRange) { constexpr uint32_t COUNT = 100'001; constexpr uint32_t STEP = UINT32_MAX / COUNT; for (uint32_t i = 0, v = 0; i <= COUNT; ++i, v += STEP) { - float x = float(FPBits(v)); + float x = FPBits(v).get_val(); if (isnan(x) || isinf(x)) continue; ASSERT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x, @@ -62,7 +62,7 @@ TEST_F(LlvmLibcTanhfTest, ExceptionalValues) { }; for (int i = 0; i < N; ++i) { - float x = float(FPBits(INPUTS[i])); + float x = FPBits(INPUTS[i]).get_val(); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, x, LIBC_NAMESPACE::tanhf(x), 0.5); EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Tanh, -x, diff --git a/libc/test/src/stdlib/atof_test.cpp b/libc/test/src/stdlib/atof_test.cpp index ed3d4c26308cb..613b56d05091e 100644 --- a/libc/test/src/stdlib/atof_test.cpp +++ b/libc/test/src/stdlib/atof_test.cpp @@ -26,7 +26,7 @@ TEST(LlvmLibcAToFTest, SimpleTest) { libc_errno = 0; EXPECT_THAT(LIBC_NAMESPACE::atof("123"), - Succeeds(static_cast(expected_fp))); + Succeeds(expected_fp.get_val())); } TEST(LlvmLibcAToFTest, FailedParsingTest) { diff --git a/libc/test/src/stdlib/strtod_test.cpp b/libc/test/src/stdlib/strtod_test.cpp index b1bdd89e41fd1..452a180d70cea 100644 --- a/libc/test/src/stdlib/strtod_test.cpp +++ b/libc/test/src/stdlib/strtod_test.cpp @@ -50,10 +50,9 @@ class LlvmLibcStrToDTest : public LIBC_NAMESPACE::testing::Test, libc_errno = 0; double result = LIBC_NAMESPACE::strtod(inputString, &str_end); if (expectedErrno == 0) - EXPECT_THAT(result, Succeeds(static_cast(expected_fp))); + EXPECT_THAT(result, Succeeds(expected_fp.get_val())); else - EXPECT_THAT(result, Fails(expectedErrno, - static_cast(expected_fp))); + EXPECT_THAT(result, Fails(expectedErrno, expected_fp.get_val())); EXPECT_EQ(str_end - inputString, expectedStrLen); } }; diff --git a/libc/test/src/stdlib/strtof_test.cpp b/libc/test/src/stdlib/strtof_test.cpp index 15a8a34ef4fb1..31b3f69f63a0c 100644 --- a/libc/test/src/stdlib/strtof_test.cpp +++ 
b/libc/test/src/stdlib/strtof_test.cpp @@ -48,7 +48,7 @@ class LlvmLibcStrToFTest : public LIBC_NAMESPACE::testing::Test, float result = LIBC_NAMESPACE::strtof(inputString, &str_end); EXPECT_EQ(str_end - inputString, expectedStrLen); - EXPECT_FP_EQ(result, static_cast(expected_fp)); + EXPECT_FP_EQ(result, expected_fp.get_val()); EXPECT_EQ(libc_errno, expectedErrno); } }; diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index 06a231c7d94d0..7cc18cfb62438 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -451,7 +451,7 @@ class MPFRNumber { if (is_nan()) { if (FPBits(input).is_nan()) return MPFRNumber(0.0); - return MPFRNumber(static_cast(FPBits::inf())); + return MPFRNumber(FPBits::inf().get_val()); } int thisExponent = FPBits(thisAsT).get_exponent(); From 6cf37dd504d0466bd9f34e29712d60759ef8180f Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 23 Jan 2024 16:36:30 +0000 Subject: [PATCH 644/843] [AMDGPU] Enable architected SGPRs for GFX12 (#79160) --- llvm/lib/Target/AMDGPU/AMDGPU.td | 1 + .../AMDGPU/indirect-call-known-callees.ll | 30 +-- .../AMDGPU/lower-work-group-id-intrinsics.ll | 51 ++++- .../AMDGPU/workgroup-id-in-arch-sgprs.ll | 179 +++++++++++------- 4 files changed, 176 insertions(+), 85 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 92985f971f17a..cb29d5d947598 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -1496,6 +1496,7 @@ def FeatureISAVersion12 : FeatureSet< FeatureWavefrontSize32, FeatureShaderCyclesHiLoRegisters, FeatureArchitectedFlatScratch, + FeatureArchitectedSGPRs, FeatureAtomicFaddRtnInsts, FeatureAtomicFaddNoRtnInsts, FeatureAtomicDsPkAdd16Insts, diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 9965d214cc9b3..380a13ed16128 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -41,30 +41,30 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; ; GFX12-LABEL: indirect_call_known_no_special_inputs: ; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_getpc_b64 s[2:3] +; GFX12-NEXT: s_sext_i32_i16 s3, s3 +; GFX12-NEXT: s_add_co_u32 s2, s2, snork@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s3, s3, snork@gotpcrel32@hi+16 +; GFX12-NEXT: s_mov_b64 s[0:1], 0 ; GFX12-NEXT: s_getpc_b64 s[4:5] ; GFX12-NEXT: s_sext_i32_i16 s5, s5 -; GFX12-NEXT: s_add_co_u32 s4, s4, snork@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s5, s5, snork@gotpcrel32@hi+16 -; GFX12-NEXT: s_mov_b64 s[2:3], 0 -; GFX12-NEXT: s_getpc_b64 s[6:7] -; GFX12-NEXT: s_sext_i32_i16 s7, s7 -; GFX12-NEXT: s_add_co_u32 s6, s6, wobble@gotpcrel32@lo+8 -; GFX12-NEXT: s_add_co_ci_u32 s7, s7, wobble@gotpcrel32@hi+16 -; GFX12-NEXT: s_load_u8 s1, s[2:3], 0x0 +; GFX12-NEXT: s_add_co_u32 s4, s4, wobble@gotpcrel32@lo+8 +; GFX12-NEXT: s_add_co_ci_u32 s5, s5, wobble@gotpcrel32@hi+16 +; GFX12-NEXT: s_load_u8 s6, s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x0 ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 -; GFX12-NEXT: s_load_b64 s[4:5], s[6:7], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: v_mov_b32_e32 v31, v0 +; GFX12-NEXT: s_mov_b32 s12, ttmp9 ; GFX12-NEXT: s_mov_b64 s[8:9], 0 -; GFX12-NEXT: s_mov_b32 s12, s0 ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: s_and_b32 s1, 1, s1 +; GFX12-NEXT: s_and_b32 s4, 1, s6 
; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_cmp_eq_u32 s1, 1 -; GFX12-NEXT: s_cselect_b32 s3, s5, s3 -; GFX12-NEXT: s_cselect_b32 s2, s4, s2 -; GFX12-NEXT: s_swappc_b64 s[30:31], s[2:3] +; GFX12-NEXT: s_cmp_eq_u32 s4, 1 +; GFX12-NEXT: s_cselect_b32 s1, s3, s1 +; GFX12-NEXT: s_cselect_b32 s0, s2, s0 +; GFX12-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX12-NEXT: s_endpgm bb: diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll index c732ff7094255..495b54758de04 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs --verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel --verify-machineinstrs < %s | FileCheck -check-prefix=GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_cs void @_amdgpu_cs_main() { ; GFX9-SDAG-LABEL: _amdgpu_cs_main: @@ -23,6 +25,30 @@ define amdgpu_cs void @_amdgpu_cs_main() { ; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 ; GFX9-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: _amdgpu_cs_main: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: _amdgpu_cs_main: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm .entry: %idx = call i32 @llvm.amdgcn.workgroup.id.x() %idy = call i32 @llvm.amdgcn.workgroup.id.y() @@ -68,6 +94,24 @@ define amdgpu_cs void @caller() { ; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] ; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b32 
s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() call amdgpu_gfx void @callee(i32 %idx) ret void @@ -79,3 +123,6 @@ declare i32 @llvm.amdgcn.workgroup.id.x() declare i32 @llvm.amdgcn.workgroup.id.y() declare i32 @llvm.amdgcn.workgroup.id.z() declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12: {{.*}} +; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index c492b54759d82..769e6b0964abd 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -1,25 +1,47 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs --verify-machineinstrs < %s | FileCheck -check-prefix=GCN-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel --verify-machineinstrs < %s | FileCheck -check-prefix=GCN-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { -; GCN-SDAG-LABEL: workgroup_id_x: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GCN-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GCN-SDAG-NEXT: s_endpgm +; GFX9-SDAG-LABEL: workgroup_id_x: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_endpgm ; -; GCN-GISEL-LABEL: workgroup_id_x: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GCN-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GCN-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GCN-GISEL-NEXT: s_endpgm +; GFX9-GISEL-LABEL: workgroup_id_x: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: 
workgroup_id_x: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: workgroup_id_x: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx @@ -27,27 +49,29 @@ define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { } define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry) { -; GCN-SDAG-LABEL: workgroup_id_xy: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GCN-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, ttmp7 -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[2:3] -; GCN-SDAG-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_xy: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v1, ttmp9 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 +; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_endpgm ; -; GCN-GISEL-LABEL: workgroup_id_xy: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, 0 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, ttmp9 -; GCN-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GCN-GISEL-NEXT: global_store_dword v0, v1, s[0:1] -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, ttmp7 -; GCN-GISEL-NEXT: global_store_dword v0, v1, s[2:3] -; GCN-GISEL-NEXT: s_endpgm +; GFX12-LABEL: workgroup_id_xy: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-NEXT: v_mov_b32_e32 v2, ttmp7 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx %idy = call i32 @llvm.amdgcn.workgroup.id.y() @@ -57,37 +81,56 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace } define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { -; GCN-SDAG-LABEL: workgroup_id_xyz: -; GCN-SDAG: ; %bb.0: -; GCN-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GCN-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GCN-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GCN-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GCN-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GCN-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[2:3] -; GCN-SDAG-NEXT: 
v_mov_b32_e32 v1, s0 -; GCN-SDAG-NEXT: global_store_dword v0, v1, s[6:7] -; GCN-SDAG-NEXT: s_endpgm +; GFX9-SDAG-LABEL: workgroup_id_xyz: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[6:7] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: workgroup_id_xyz: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[6:7] +; GFX9-GISEL-NEXT: s_endpgm ; -; GCN-GISEL-LABEL: workgroup_id_xyz: -; GCN-GISEL: ; %bb.0: -; GCN-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GCN-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GCN-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GCN-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GCN-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GCN-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GCN-GISEL-NEXT: global_store_dword v1, v0, s[2:3] -; GCN-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GCN-GISEL-NEXT: global_store_dword v1, v0, s[6:7] -; GCN-GISEL-NEXT: s_endpgm +; GFX12-LABEL: workgroup_id_xyz: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 +; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff +; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_clause 0x2 +; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] +; GFX12-NEXT: global_store_b32 v0, v2, s[6:7] +; GFX12-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx %idy = call i32 @llvm.amdgcn.workgroup.id.y() From d3a6a90ae5e80b074293ebfbcb2f15da9c57acc5 Mon Sep 17 00:00:00 2001 From: Stephen Tozer Date: Tue, 23 Jan 2024 16:38:49 +0000 Subject: [PATCH 645/843] [RemoveDIs][DebugInfo] Enable creation of DPVAssigns, update outstanding AT tests (#79148) This is the final patch for DPVAssign support, implementing the actual creation of DPVAssigns and allowing them to be converted along with dbg.values and dbg.declares. 
Numerous tests that landed in previous patches will no longer be rotten once this patch lands (previously they would trivially pass because DPVAssigns were not actually being used), and a further batch of tests added here requires the changes in this patch in order to pass. --- llvm/lib/IR/BasicBlock.cpp | 3 - llvm/lib/IR/DebugInfo.cpp | 114 ++++++++++-------- .../declare-to-assign/long-double-x87.ll | 1 + .../declare-to-assign/nullptr-declare.ll | 1 + .../declare-to-assign/scalable-vector.ll | 1 + .../set-flag-only-if-modified.ll | 2 + .../declare-to-assign/structured-bindings.ll | 1 + .../declare-to-assign/var-not-alloca-sized.ll | 1 + .../declare-to-assign/vla.ll | 1 + .../instcombine/remove-redundant-dbg.ll | 2 + .../loop-vectorize/remove-redundant-dbg.ll | 2 + .../Generic/assignment-tracking/optnone.ll | 2 + .../parse-and-verify/roundtrip.ll | 3 + .../assignment-tracking/remove-redundant.ll | 2 + .../sroa/remove-redundant-dbg.ll | 2 + .../assignment-tracking/sroa/user-memcpy.ll | 2 + .../assignment-tracking/track-assignments.ll | 2 + .../X86/coalesce-options.ll | 10 ++ .../X86/coalesce-simple.ll | 2 + .../X86/remove-undef-fragment.ll | 2 + .../X86/untagged-store-frag.ll | 2 + 21 files changed, 104 insertions(+), 54 deletions(-) diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 03b74b0480f07..15b7e50fe6eca 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -74,9 +74,6 @@ void BasicBlock::convertToNewDbgValues() { for (Instruction &I : make_early_inc_range(InstList)) { assert(!I.DbgMarker && "DbgMarker already set on old-format instrs?"); if (DbgVariableIntrinsic *DVI = dyn_cast<DbgVariableIntrinsic>(&I)) { - if (isa<DbgAssignIntrinsic>(DVI)) - continue; - // Convert this dbg.value to a DPValue. DPValue *Value = new DPValue(DVI); DPVals.push_back(Value); diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 2e64d0db57b25..e33274895ef43 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -1816,8 +1816,12 @@ void at::RAUW(DIAssignID *Old, DIAssignID *New) { void at::deleteAll(Function *F) { SmallVector<DbgAssignIntrinsic *> ToDelete; + SmallVector<DPValue *> DPToDelete; for (BasicBlock &BB : *F) { for (Instruction &I : BB) { + for (auto &DPV : I.getDbgValueRange()) + if (DPV.isDbgAssign()) + DPToDelete.push_back(&DPV); if (auto *DAI = dyn_cast<DbgAssignIntrinsic>(&I)) ToDelete.push_back(DAI); else @@ -1826,6 +1830,8 @@ void at::deleteAll(Function *F) { } for (auto *DAI : ToDelete) DAI->eraseFromParent(); + for (auto *DPV : DPToDelete) + DPV->eraseFromParent(); } /// Get the FragmentInfo for the variable if it exists, otherwise return a @@ -2056,9 +2062,9 @@ std::optional<AssignmentInfo> at::getAssignmentInfo(const DataLayout &DL, } /// Returns nullptr if the assignment shouldn't be attributed to this variable. -static CallInst *emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest, - Instruction &StoreLikeInst, - const VarRecord &VarRec, DIBuilder &DIB) { +static void emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest, + Instruction &StoreLikeInst, const VarRecord &VarRec, + DIBuilder &DIB) { auto *ID = StoreLikeInst.getMetadata(LLVMContext::MD_DIAssignID); assert(ID && "Store instruction must have DIAssignID metadata"); (void)ID; @@ -2082,7 +2088,7 @@ static CallInst *emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest, // Discard stores to bits outside this variable.
if (FragStartBit >= FragEndBit) - return nullptr; + return; StoreToWholeVariable = FragStartBit <= VarStartBit && FragEndBit >= *Size; } @@ -2097,8 +2103,17 @@ static CallInst *emitDbgAssign(AssignmentInfo Info, Value *Val, Value *Dest, } DIExpression *AddrExpr = DIExpression::get(StoreLikeInst.getContext(), std::nullopt); - return DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, Dest, - AddrExpr, VarRec.DL); + if (StoreLikeInst.getParent()->IsNewDbgInfoFormat) { + auto *Assign = DPValue::createLinkedDPVAssign( + &StoreLikeInst, Val, VarRec.Var, Expr, Dest, AddrExpr, VarRec.DL); + (void)Assign; + LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n"); + return; + } + auto *Assign = DIB.insertDbgAssign(&StoreLikeInst, Val, VarRec.Var, Expr, + Dest, AddrExpr, VarRec.DL); + (void)Assign; + LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n"); } #undef DEBUG_TYPE // Silence redefinition warning (from ConstantsContext.h). @@ -2185,12 +2200,8 @@ void at::trackAssignments(Function::iterator Start, Function::iterator End, I.setMetadata(LLVMContext::MD_DIAssignID, ID); } - for (const VarRecord &R : LocalIt->second) { - auto *Assign = - emitDbgAssign(*Info, ValueComponent, DestComponent, I, R, DIB); - (void)Assign; - LLVM_DEBUG(if (Assign) errs() << " > INSERT: " << *Assign << "\n"); - } + for (const VarRecord &R : LocalIt->second) + emitDbgAssign(*Info, ValueComponent, DestComponent, I, R, DIB); } } } @@ -2206,32 +2217,38 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { // storage" is limited to Allocas). We'll use this to find dbg.declares to // delete after running `trackAssignments`. DenseMap> DbgDeclares; + DenseMap> DPVDeclares; // Create another similar map of {storage : variables} that we'll pass to // trackAssignments. StorageToVarsMap Vars; + auto ProcessDeclare = [&](auto *Declare, auto &DeclareList) { + // FIXME: trackAssignments doesn't let you specify any modifiers to the + // variable (e.g. fragment) or location (e.g. offset), so we have to + // leave dbg.declares with non-empty expressions in place. + if (Declare->getExpression()->getNumElements() != 0) + return; + if (!Declare->getAddress()) + return; + if (AllocaInst *Alloca = + dyn_cast(Declare->getAddress()->stripPointerCasts())) { + // FIXME: Skip VLAs for now (let these variables use dbg.declares). + if (!Alloca->isStaticAlloca()) + return; + // Similarly, skip scalable vectors (use dbg.declares instead). + if (auto Sz = Alloca->getAllocationSize(*DL); Sz && Sz->isScalable()) + return; + DeclareList[Alloca].insert(Declare); + Vars[Alloca].insert(VarRecord(Declare)); + } + }; for (auto &BB : F) { for (auto &I : BB) { - DbgDeclareInst *DDI = dyn_cast(&I); - if (!DDI) - continue; - // FIXME: trackAssignments doesn't let you specify any modifiers to the - // variable (e.g. fragment) or location (e.g. offset), so we have to - // leave dbg.declares with non-empty expressions in place. - if (DDI->getExpression()->getNumElements() != 0) - continue; - if (!DDI->getAddress()) - continue; - if (AllocaInst *Alloca = - dyn_cast(DDI->getAddress()->stripPointerCasts())) { - // FIXME: Skip VLAs for now (let these variables use dbg.declares). - if (!Alloca->isStaticAlloca()) - continue; - // Similarly, skip scalable vectors (use dbg.declares instead). 
- if (auto Sz = Alloca->getAllocationSize(*DL); Sz && Sz->isScalable()) - continue; - DbgDeclares[Alloca].insert(DDI); - Vars[Alloca].insert(VarRecord(DDI)); + for (auto &DPV : I.getDbgValueRange()) { + if (DPV.isDbgDeclare()) + ProcessDeclare(&DPV, DPVDeclares); } + if (DbgDeclareInst *DDI = dyn_cast(&I)) + ProcessDeclare(DDI, DbgDeclares); } } @@ -2247,35 +2264,30 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { trackAssignments(F.begin(), F.end(), Vars, *DL); // Delete dbg.declares for variables now tracked with assignment tracking. - for (auto &P : DbgDeclares) { - const AllocaInst *Alloca = P.first; - auto Markers = at::getAssignmentMarkers(Alloca); - SmallVector DPMarkers = at::getDPVAssignmentMarkers(Alloca); + auto DeleteSubsumedDeclare = [&](const auto &Markers, auto &Declares) { (void)Markers; - (void)DPMarkers; - for (DbgDeclareInst *DDI : P.second) { - // Assert that the alloca that DDI uses is now linked to a dbg.assign + for (auto *Declare : Declares) { + // Assert that the alloca that Declare uses is now linked to a dbg.assign // describing the same variable (i.e. check that this dbg.declare has // been replaced by a dbg.assign). Use DebugVariableAggregate to Discard // the fragment part because trackAssignments may alter the // fragment. e.g. if the alloca is smaller than the variable, then // trackAssignments will create an alloca-sized fragment for the // dbg.assign. - assert(llvm::any_of(Markers, - [DDI](DbgAssignIntrinsic *DAI) { - return DebugVariableAggregate(DAI) == - DebugVariableAggregate(DDI); - }) || - llvm::any_of(DPMarkers, [DDI](DPValue *DPV) { - return DebugVariableAggregate(DPV) == - DebugVariableAggregate(DDI); - })); - // Delete DDI because the variable location is now tracked using + assert(llvm::any_of(Markers, [Declare](auto *Assign) { + return DebugVariableAggregate(Assign) == + DebugVariableAggregate(Declare); + })); + // Delete Declare because the variable location is now tracked using // assignment tracking. 
- DDI->eraseFromParent(); + Declare->eraseFromParent(); Changed = true; } - } + }; + for (auto &P : DbgDeclares) + DeleteSubsumedDeclare(at::getAssignmentMarkers(P.first), P.second); + for (auto &P : DPVDeclares) + DeleteSubsumedDeclare(at::getDPVAssignmentMarkers(P.first), P.second); return Changed; } diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/long-double-x87.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/long-double-x87.ll index 5e308097fd1ae..3149dcb6ebc31 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/long-double-x87.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/long-double-x87.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -S -passes=declare-to-assign -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -S -passes=declare-to-assign -o - | FileCheck %s ;; Generated from this C++: ;; long double get(); diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/nullptr-declare.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/nullptr-declare.ll index 82c710767c185..a795cc4c2dae3 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/nullptr-declare.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/nullptr-declare.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -passes=declare-to-assign -S | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -passes=declare-to-assign -S | FileCheck %s ;; Check AssignmentTrackingPass ignores a dbg.declare with an empty metadata ;; location operand. diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/scalable-vector.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/scalable-vector.ll index 4abe5f475aafe..2b9c9bf16c9a4 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/scalable-vector.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/scalable-vector.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=declare-to-assign %s -S | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=declare-to-assign %s -S | FileCheck %s ;; Check declare-to-assign skips scalable vectors for now. i.e. do not replace ;; the dbg.declare with a dbg.assign intrinsic. 
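For context on the rewrite these declare-to-assign tests exercise, the following is a minimal hand-written IR sketch of the assignment-tracking form the pass produces for an ordinary static alloca. It is not taken from any test in this patch; the function name, variable name, source file, and metadata numbering are all invented for illustration. The dbg.declare is dropped, the alloca and each store get a distinct !DIAssignID, and a matching llvm.dbg.assign names the same ID to tie the assignment to the variable.

;; Sketch only, assuming the documented llvm.dbg.assign operand order:
;; (Value, Variable, Expression, DIAssignID, Address, AddressExpression).
define void @f(i32 %v) !dbg !7 {
entry:
  %a = alloca i32, align 4, !DIAssignID !14
  ;; "Unknown value" assignment covering the not-yet-initialized alloca.
  call void @llvm.dbg.assign(metadata i1 undef, metadata !11, metadata !DIExpression(), metadata !14, metadata ptr %a, metadata !DIExpression(), metadata !13), !dbg !13
  ;; The store and its dbg.assign share !15, linking this write to variable !11.
  store i32 %v, ptr %a, align 4, !DIAssignID !15
  call void @llvm.dbg.assign(metadata i32 %v, metadata !11, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression(), metadata !13), !dbg !13
  ret void, !dbg !13
}

declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata)

!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!2, !3}
!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, emissionKind: FullDebug)
!1 = !DIFile(filename: "sketch.c", directory: "/")
!2 = !{i32 7, !"Dwarf Version", i32 5}
!3 = !{i32 2, !"Debug Info Version", i32 3}
!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0)
!8 = !DISubroutineType(types: !9)
!9 = !{null}
!11 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 1, type: !12)
!12 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed)
!13 = !DILocation(line: 1, column: 1, scope: !7)
!14 = distinct !DIAssignID()
!15 = distinct !DIAssignID()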
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/set-flag-only-if-modified.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/set-flag-only-if-modified.ll index 3481bfe019914..849c763da9fc0 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/set-flag-only-if-modified.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/set-flag-only-if-modified.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=declare-to-assign -S %s -o - \ ; RUN: | FileCheck %s --check-prefix=WITHOUT-INTRINSIC +; RUN: opt --try-experimental-debuginfo-iterators -passes=declare-to-assign -S %s -o - \ +; RUN: | FileCheck %s --check-prefix=WITHOUT-INTRINSIC ; RUN: sed 's/;Uncomment-with-sed//g' < %s \ ; RUN: | opt -passes=declare-to-assign -S - -o - \ diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/structured-bindings.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/structured-bindings.ll index 776026fcbc013..892e8501ebf35 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/structured-bindings.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/structured-bindings.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=declare-to-assign -S %s -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=declare-to-assign -S %s -o - | FileCheck %s ;; Check assignment tracking debug info for structured bindings. FIXME only ;; variables at offset 0 in the backing alloca are currently tracked with the diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/var-not-alloca-sized.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/var-not-alloca-sized.ll index 56b631a59200d..c009fdcc238cb 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/var-not-alloca-sized.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/var-not-alloca-sized.ll @@ -1,4 +1,5 @@ ; RUN: opt -passes=declare-to-assign -S %s -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -passes=declare-to-assign -S %s -o - | FileCheck %s ;; The variable doesn't fill the whole alloca which has a range of different ;; sized stores to it, overlapping (or not) the variable in various ways. Check diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/vla.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/vla.ll index 72d54cba7b4b6..b4e619e0e62ee 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/vla.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/vla.ll @@ -1,4 +1,5 @@ ; RUN: opt -S %s -passes=declare-to-assign -o - | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators -S %s -passes=declare-to-assign -o - | FileCheck %s ;; Check declare-to-assign ignores VLA-backed variables (for now). 
;; From C++ source: diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll index 11895098179eb..cffac06f8e545 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/instcombine/remove-redundant-dbg.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that sroa removes redundant debug intrinsics after it makes a ;; change. This has a significant positive impact on peak memory and compiler diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/loop-vectorize/remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/loop-vectorize/remove-redundant-dbg.ll index b02203dd99ba8..5c897187086d2 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/loop-vectorize/remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/loop-vectorize/remove-redundant-dbg.ll @@ -1,5 +1,7 @@ ; RUN: opt %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators %s -passes=loop-vectorize -force-vector-width=2 -force-vector-interleave=2 -S -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that loop-vectorize removes redundant debug intrinsics after it makes ;; a change. This has a significant positive impact on peak memory and compiler diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/optnone.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/optnone.ll index 6177448e2e6aa..502d46640713a 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/optnone.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/optnone.ll @@ -1,5 +1,7 @@ ; RUN: opt -S %s -o - --passes=declare-to-assign \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -S %s -o - --passes=declare-to-assign \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Assignment tracking doesn't add any value when optimisations are disabled. ;; Check it doesn't get applied to functions marked optnone. diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/roundtrip.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/roundtrip.ll index 0751e9ec0d493..c8fc014fcadf1 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/roundtrip.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/parse-and-verify/roundtrip.ll @@ -1,6 +1,9 @@ ; RUN: opt %s -passes=verify \ ; RUN: | opt -passes=verify -S \ ; RUN: | FileCheck %s +; RUN: opt --try-experimental-debuginfo-iterators %s -passes=verify \ +; RUN: | opt -passes=verify -S \ +; RUN: | FileCheck %s ;; Roundtrip test (text -> bitcode -> text) for DIAssignID metadata and ;; llvm.dbg.assign intrinsics. 
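The remove-redundant tests that follow exercise RemoveRedundantDbgInstrs over dbg.assign intrinsics. As a hedged sketch of the kind of redundancy involved (reusing the invented metadata numbering from the sketch above, so this is a fragment rather than a standalone module), two back-to-back records that agree on value, variable, expression, DIAssignID, and address carry no extra information, and the pass may drop one of them:

;; Sketch: the second record exactly repeats the first with no intervening
;; store or definition, so removing either one loses no location information.
call void @llvm.dbg.assign(metadata i32 %v, metadata !11, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression(), metadata !13), !dbg !13
call void @llvm.dbg.assign(metadata i32 %v, metadata !11, metadata !DIExpression(), metadata !15, metadata ptr %a, metadata !DIExpression(), metadata !13), !dbg !13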
diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll index efb20b6edee2d..24ec3e94ed275 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/remove-redundant.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=redundant-dbg-inst-elim -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=redundant-dbg-inst-elim -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Hand-written. Test how RemoveRedundantDbgInstrs interacts with dbg.assign ;; intrinsics. FileCheck directives are inline. diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll index 11895098179eb..cffac06f8e545 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/remove-redundant-dbg.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that sroa removes redundant debug intrinsics after it makes a ;; change. This has a significant positive impact on peak memory and compiler diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll index abc110273ab3b..23d0ff3461046 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/sroa/user-memcpy.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes=sroa -S %s -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes=sroa -S %s -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; Check that the fragments generated in SROA for a split alloca that has a ;; dbg.assign with non-zero-offset fragment are correct. diff --git a/llvm/test/DebugInfo/Generic/assignment-tracking/track-assignments.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/track-assignments.ll index 98daf702f63b8..127e84a348ba6 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/track-assignments.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/track-assignments.ll @@ -1,5 +1,7 @@ ; RUN: opt -passes='declare-to-assign,verify' %s -S -o - \ ; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" +; RUN: opt --try-experimental-debuginfo-iterators -passes='declare-to-assign,verify' %s -S -o - \ +; RUN: | FileCheck %s --implicit-check-not="call void @llvm.dbg" ;; This test checks that `trackAssignments` is working correctly by using the ;; pass-wrapper `declare-to-assign`. Each function checks some specific diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-options.ll b/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-options.ll index 39bfcdaa41716..e872e492eb26c 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-options.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-options.ll @@ -14,20 +14,30 @@ ;; Coalescing default + instruction-referencing enabled = enable.
; RUN: llc %s -o - -stop-after=finalize-isel -experimental-debug-variable-locations=true \ ; RUN: | FileCheck %s --check-prefixes=CHECK,ENABLE +; RUN: llc --try-experimental-debuginfo-iterators %s -o - -stop-after=finalize-isel -experimental-debug-variable-locations=true \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ENABLE ;; Coalescing default + instruction-referencing disabled = disable. ; RUN: llc %s -o - -stop-after=finalize-isel -experimental-debug-variable-locations=false \ ; RUN: | FileCheck %s --check-prefixes=CHECK,DISABLE +; RUN: llc --try-experimental-debuginfo-iterators %s -o - -stop-after=finalize-isel -experimental-debug-variable-locations=false \ +; RUN: | FileCheck %s --check-prefixes=CHECK,DISABLE ;; Coalescing enabled + instruction-referencing disabled = enable. ; RUN: llc %s -o - -stop-after=finalize-isel -experimental-debug-variable-locations=false \ ; RUN: -debug-ata-coalesce-frags=true \ ; RUN: | FileCheck %s --check-prefixes=CHECK,ENABLE +; RUN: llc --try-experimental-debuginfo-iterators %s -o - -stop-after=finalize-isel -experimental-debug-variable-locations=false \ +; RUN: -debug-ata-coalesce-frags=true \ +; RUN: | FileCheck %s --check-prefixes=CHECK,ENABLE ;; Coalescing disabled + instruction-referencing enabled = disable. ; RUN: llc %s -o - -stop-after=finalize-isel -experimental-debug-variable-locations=true \ ; RUN: -debug-ata-coalesce-frags=false \ ; RUN: | FileCheck %s --check-prefixes=CHECK,DISABLE +; RUN: llc --try-experimental-debuginfo-iterators %s -o - -stop-after=finalize-isel -experimental-debug-variable-locations=true \ +; RUN: -debug-ata-coalesce-frags=false \ +; RUN: | FileCheck %s --check-prefixes=CHECK,DISABLE ; CHECK: MOV32mi %stack.0.a, 1, $noreg, 0, $noreg, 5 ; ENABLE-NEXT: DBG_VALUE %stack.0.a, $noreg, ![[#]], !DIExpression(DW_OP_deref) diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-simple.ll b/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-simple.ll index ccbaca0e61dab..274dc2f9bf2a4 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-simple.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/coalesce-simple.ll @@ -1,5 +1,7 @@ ; RUN: llc %s -o - -stop-after=finalize-isel \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ +; RUN: llc --try-experimental-debuginfo-iterators %s -o - -stop-after=finalize-isel \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ ;; Test coalescing of contiguous fragments in adjacent location definitions. ;; Further details and check directives inline. 
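The four RUN pairs above spell out a small decision table: fragment coalescing defaults to whatever the instruction-referencing setting is, and an explicit -debug-ata-coalesce-frags value overrides it in either direction. A minimal C++ sketch of that policy as the test comments describe it (hypothetical helper for illustration, not the actual LLVM implementation):

#include <optional>

// Decide whether assignment-tracking analysis should coalesce contiguous
// variable fragments, given the two relevant command-line settings.
bool shouldCoalesceFrags(
    std::optional<bool> CoalesceFragsFlag, // -debug-ata-coalesce-frags, if passed
    bool InstrRefEnabled) {                // -experimental-debug-variable-locations
  // An explicit flag wins; otherwise follow the instruction-referencing mode.
  return CoalesceFragsFlag.value_or(InstrRefEnabled);
}

This reproduces the table checked above: the default tracks instruction referencing, and the explicit true/false settings force ENABLE/DISABLE regardless of it.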
diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/remove-undef-fragment.ll b/llvm/test/DebugInfo/assignment-tracking/X86/remove-undef-fragment.ll index b006fb0514626..c65a6e4aadc7f 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/remove-undef-fragment.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/remove-undef-fragment.ll @@ -1,5 +1,7 @@ ; RUN: llc %s -o - -stop-after=finalize-isel \ ; RUN: | FileCheck %s --implicit-check-not=DBG +; RUN: llc --try-experimental-debuginfo-iterators %s -o - -stop-after=finalize-isel \ +; RUN: | FileCheck %s --implicit-check-not=DBG ;; In the IR below, for variable n, we get dbg intrinsics that describe this: ;; diff --git a/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-frag.ll b/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-frag.ll index b7e713c7224f4..e3ec9b40c1683 100644 --- a/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-frag.ll +++ b/llvm/test/DebugInfo/assignment-tracking/X86/untagged-store-frag.ll @@ -1,5 +1,7 @@ ; RUN: llc %s -stop-after=finalize-isel -o - \ ; RUN: | FileCheck %s --implicit-check-not=DBG_ +; RUN: llc --try-experimental-debuginfo-iterators %s -stop-after=finalize-isel -o - \ +; RUN: | FileCheck %s --implicit-check-not=DBG_ ;; Hand-written to test untagged store handling on a simple case. Here's what ;; we're looking at in the IR: From 1f6f19935c1b4512190f1bc94ebf94f3d2b69911 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 23 Jan 2024 17:41:12 +0100 Subject: [PATCH 646/843] [Clang][AArch64] Add diagnostics for builtins that use ZT0. (#79140) Similar to what we did for ZA, this patch adds diagnostics to flag when using a ZT0 builtin in a function that does not have ZT0 state. --- .../clang/Basic/DiagnosticSemaKinds.td | 3 +++ clang/include/clang/Basic/arm_sme.td | 18 ++++++++-------- clang/include/clang/Basic/arm_sve_sme_incl.td | 3 +++ clang/lib/Sema/SemaChecking.cpp | 17 +++++++++++++++ .../acle_sme2_ldr_str_zt.c | 4 ++-- .../acle_sme2_luti2_lane_zt.c | 18 ++++++++-------- .../acle_sme2_luti2_lane_zt_x2.c | 18 ++++++++-------- .../acle_sme2_luti2_lane_zt_x4.c | 18 ++++++++-------- .../acle_sme2_luti4_lane_zt.c | 18 ++++++++-------- .../acle_sme2_luti4_lane_zt_x2.c | 18 ++++++++-------- .../acle_sme2_luti4_lane_zt_x4.c | 14 ++++++------- .../acle_sme2_zero_zt.c | 2 +- .../Sema/aarch64-incompat-sm-builtin-calls.c | 10 ++++++++- .../aarch64-sme2-intrinsics/acle_sme2_imm.cpp | 16 +++++++------- clang/utils/TableGen/SveEmitter.cpp | 21 ++++++++++++++++--- 15 files changed, 122 insertions(+), 76 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index e027e754477fc..a1c32abb4dcd8 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -3170,6 +3170,9 @@ def warn_attribute_arm_sm_incompat_builtin : Warning< def warn_attribute_arm_za_builtin_no_za_state : Warning< "builtin call is not valid when calling from a function without active ZA state">, InGroup>; +def warn_attribute_arm_zt0_builtin_no_zt0_state : Warning< + "builtin call is not valid when calling from a function without active ZT0 state">, + InGroup>; def err_sve_vector_in_non_sve_target : Error< "SVE vector type %0 cannot be used in a target without sve">; def err_attribute_riscv_rvv_bits_unsupported : Error< diff --git a/clang/include/clang/Basic/arm_sme.td b/clang/include/clang/Basic/arm_sme.td index 4fb50b8e4e4e5..695e1bddf9ffc 100644 --- 
a/clang/include/clang/Basic/arm_sme.td +++ b/clang/include/clang/Basic/arm_sme.td @@ -636,37 +636,37 @@ let TargetGuard = "sme2,sme-i16i64" in { // Spill and fill of ZT0 // let TargetGuard = "sme2" in { - def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<0, ImmCheck0_0>]>; - def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<0, ImmCheck0_0>]>; + def SVLDR_ZT : Inst<"svldr_zt", "viQ", "", MergeNone, "aarch64_sme_ldr_zt", [IsOverloadNone, IsStreamingCompatible, IsInOutZT0], [ImmCheck<0, ImmCheck0_0>]>; + def SVSTR_ZT : Inst<"svstr_zt", "vi%", "", MergeNone, "aarch64_sme_str_zt", [IsOverloadNone, IsStreamingCompatible, IsInZT0], [ImmCheck<0, ImmCheck0_0>]>; } // // Zero ZT0 // let TargetGuard = "sme2" in { - def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible], [ImmCheck<0, ImmCheck0_0>]>; + def SVZERO_ZT : Inst<"svzero_zt", "vi", "", MergeNone, "aarch64_sme_zero_zt", [IsOverloadNone, IsStreamingCompatible, IsOutZT0], [ImmCheck<0, ImmCheck0_0>]>; } // // lookup table expand four contiguous registers // let TargetGuard = "sme2" in { - def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt_{d}_x4", "4.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; - def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt_{d}_x4", "4.di[i", "sUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>; + def SVLUTI2_LANE_ZT_X4 : Inst<"svluti2_lane_zt_{d}_x4", "4.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI4_LANE_ZT_X4 : Inst<"svluti4_lane_zt_{d}_x4", "4.di[i", "sUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x4", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_1>]>; } // // lookup table expand one register // let TargetGuard = "sme2" in { - def SVLUTI2_LANE_ZT : Inst<"svluti2_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>; - def SVLUTI4_LANE_ZT : Inst<"svluti4_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>; + def SVLUTI2_LANE_ZT : Inst<"svluti2_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_15>]>; + def SVLUTI4_LANE_ZT : Inst<"svluti4_lane_zt_{d}", "di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>; } // // lookup table expand two contiguous registers // let TargetGuard = "sme2" in { - def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>; - def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; + def SVLUTI2_LANE_ZT_X2 : Inst<"svluti2_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti2_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, 
ImmCheck0_0>, ImmCheck<2, ImmCheck0_7>]>; + def SVLUTI4_LANE_ZT_X2 : Inst<"svluti4_lane_zt_{d}_x2", "2.di[i", "cUcsUsiUibhf", MergeNone, "aarch64_sme_luti4_lane_zt_x2", [IsStreaming, IsInZT0], [ImmCheck<0, ImmCheck0_0>, ImmCheck<2, ImmCheck0_3>]>; } diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index e6ad84676053b..9a6ea9898ef70 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -229,6 +229,9 @@ def IsStreamingOrSVE2p1 : FlagType<0x40000000000>; // Use for intrin def IsInZA : FlagType<0x80000000000>; def IsOutZA : FlagType<0x100000000000>; def IsInOutZA : FlagType<0x200000000000>; +def IsInZT0 : FlagType<0x400000000000>; +def IsOutZT0 : FlagType<0x800000000000>; +def IsInOutZT0 : FlagType<0x1000000000000>; // These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h class ImmCheckType { diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 1f83dcf07b6f9..7833d5a2ea20e 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -3013,6 +3013,11 @@ enum ArmSMEState : unsigned { ArmOutZA = 0b10, ArmInOutZA = 0b11, ArmZAMask = 0b11, + + ArmInZT0 = 0b01 << 2, + ArmOutZT0 = 0b10 << 2, + ArmInOutZT0 = 0b11 << 2, + ArmZT0Mask = 0b11 << 2 }; bool Sema::ParseSVEImmChecks( @@ -3206,6 +3211,13 @@ static bool hasArmZAState(const FunctionDecl *FD) { (FD->hasAttr() && FD->getAttr()->isNewZA()); } +static bool hasArmZT0State(const FunctionDecl *FD) { + const auto *T = FD->getType()->getAs(); + return (T && FunctionType::getArmZT0State(T->getAArch64SMEAttributes()) != + FunctionType::ARM_None) || + (FD->hasAttr() && FD->getAttr()->isNewZT0()); +} + static ArmSMEState getSMEState(unsigned BuiltinID) { switch (BuiltinID) { default: @@ -3233,6 +3245,11 @@ bool Sema::CheckSMEBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { Diag(TheCall->getBeginLoc(), diag::warn_attribute_arm_za_builtin_no_za_state) << TheCall->getSourceRange(); + + if ((getSMEState(BuiltinID) & ArmZT0Mask) && !hasArmZT0State(FD)) + Diag(TheCall->getBeginLoc(), + diag::warn_attribute_arm_zt0_builtin_no_zt0_state) + << TheCall->getSourceRange(); } // Range check SME intrinsics that take immediate values. 
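To make the new check concrete: the flags above cause getSMEState() to report ZT0 usage for these builtins, and CheckSMEBuiltinFunctionCall() then warns when the calling function declares no ZT0 state. A hedged sketch of both sides of the diagnostic, modeled on the tests later in this patch (the header name and the -target-feature +sme2 compile mode are assumptions, not shown in this diff):

#include <arm_sme.h> // assumed SME intrinsics header; needs -target-feature +sme2

// No ZT0 state on the caller: the new
// warn_attribute_arm_zt0_builtin_no_zt0_state diagnostic fires here.
void missing_zt0(void) __arm_streaming {
  svzero_zt(0); // warning: builtin call is not valid when calling from a
                // function without active ZT0 state
}

// Declaring ZT0 output state satisfies hasArmZT0State(), so no warning.
void has_zt0(void) __arm_streaming_compatible __arm_out("zt0") {
  svzero_zt(0);
}

The other builtins follow the same pattern with their respective states from the table above: svldr_zt is in/out, while svstr_zt and the luti expansions require ZT0 input state.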
diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c index 1a495d3b117ec..3e4454d943358 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_ldr_str_zt.c @@ -20,7 +20,7 @@ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svldr_zt(const void *base) __arm_streaming_compatible __arm_out("za") { +void test_svldr_zt(const void *base) __arm_streaming_compatible __arm_out("zt0") { svldr_zt(0, base); } @@ -36,6 +36,6 @@ void test_svldr_zt(const void *base) __arm_streaming_compatible __arm_out("za") // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.str.zt(i32 0, ptr [[BASE:%.*]]) // CPP-CHECK-NEXT: ret void // -void test_svstr_zt(void *base) __arm_streaming_compatible __arm_in("za") { +void test_svstr_zt(void *base) __arm_streaming_compatible __arm_in("zt0") { svstr_zt(0, base); } diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c index d656178fdf7a1..7c210fbe6923e 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt.c @@ -19,7 +19,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 15) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint8_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u8(0, zn, 15); } @@ -34,7 +34,7 @@ svuint8_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 15) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") { +svint8_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s8(0, zn, 15); } @@ -48,7 +48,7 @@ svint8_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 15) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint16_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u16(0, zn, 15); } @@ -63,7 +63,7 @@ svuint16_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 15) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") { +svint16_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s16(0, zn, 15); } @@ -77,7 +77,7 @@ svint16_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv8f16(i32 0, [[ZN:%.*]], i32 15) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat16_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_f16(0, 
zn, 15); } @@ -91,7 +91,7 @@ svfloat16_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za" // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv8bf16(i32 0, [[ZN:%.*]], i32 15) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") { +svbfloat16_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_bf16(0, zn, 15); } @@ -105,7 +105,7 @@ svbfloat16_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("z // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 15) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint32_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u32(0, zn, 15); } @@ -119,7 +119,7 @@ svuint32_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 15) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") { +svint32_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s32(0, zn, 15); } @@ -133,6 +133,6 @@ svint32_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti2.lane.zt.nxv4f32(i32 0, [[ZN:%.*]], i32 15) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat32_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_f32(0, zn, 15); } diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c index 60cd24fdf4630..d7ef75ce01dd7 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x2.c @@ -26,7 +26,7 @@ // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u8_x2(0, zn, 7); } @@ -49,7 +49,7 @@ svuint8x2_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svint8x2_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") { +svint8x2_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s8_x2(0, zn, 7); } @@ -71,7 +71,7 @@ svint8x2_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret [[TMP4]] // -svuint16x2_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint16x2_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u16_x2(0, zn, 7); } @@ -94,7 +94,7 @@ svuint16x2_t test_svluti2_lane_zt_u16(svuint8_t zn) 
__arm_streaming __arm_in("za // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret [[TMP4]] // -svint16x2_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") { +svint16x2_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s16_x2(0, zn, 7); } @@ -116,7 +116,7 @@ svint16x2_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za" // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret [[TMP4]] // -svfloat16x2_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat16x2_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_f16_x2(0, zn, 7); } @@ -138,7 +138,7 @@ svfloat16x2_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("z // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret [[TMP4]] // -svbfloat16x2_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") { +svbfloat16x2_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_bf16_x2(0, zn, 7); } @@ -160,7 +160,7 @@ svbfloat16x2_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in( // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret [[TMP4]] // -svuint32x2_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint32x2_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u32_x2(0, zn, 7); } @@ -182,7 +182,7 @@ svuint32x2_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret [[TMP4]] // -svint32x2_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") { +svint32x2_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s32_x2(0, zn, 7); } @@ -204,6 +204,6 @@ svint32x2_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za" // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret [[TMP4]] // -svfloat32x2_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat32x2_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_f32_x2(0, zn, 7); } diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c index e05748c8a6424..f65c0ef61f818 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti2_lane_zt_x4.c @@ -34,7 +34,7 @@ // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) // CPP-CHECK-NEXT: ret [[TMP8]] // -svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u8_x4(0, zn, 3); } @@ -65,7 +65,7 @@ svuint8x4_t test_svluti2_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: 
[[TMP8:%.*]] = tail call @llvm.vector.insert.nxv64i8.nxv16i8( [[TMP6]], [[TMP7]], i64 48) // CPP-CHECK-NEXT: ret [[TMP8]] // -svint8x4_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") { +svint8x4_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s8_x4(0, zn, 3); } @@ -95,7 +95,7 @@ svint8x4_t test_svluti2_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret [[TMP8]] // -svuint16x4_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint16x4_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u16_x4(0, zn, 3); } @@ -125,7 +125,7 @@ svuint16x4_t test_svluti2_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret [[TMP8]] // -svint16x4_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") { +svint16x4_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s16_x4(0, zn, 3); } @@ -155,7 +155,7 @@ svint16x4_t test_svluti2_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za" // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret [[TMP8]] // -svfloat16x4_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat16x4_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_f16_x4(0, zn, 3); } @@ -185,7 +185,7 @@ svfloat16x4_t test_svluti2_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("z // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret [[TMP8]] // -svbfloat16x4_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") { +svbfloat16x4_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_bf16_x4(0, zn, 3); } @@ -215,7 +215,7 @@ svbfloat16x4_t test_svluti2_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in( // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret [[TMP8]] // -svuint32x4_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint32x4_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_u32_x4(0, zn, 3); } @@ -245,7 +245,7 @@ svuint32x4_t test_svluti2_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret [[TMP8]] // -svint32x4_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") { +svint32x4_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti2_lane_zt_s32_x4(0, zn, 3); } @@ -275,6 +275,6 @@ svint32x4_t test_svluti2_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za" // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret [[TMP8]] // -svfloat32x4_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat32x4_t test_svluti2_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return 
svluti2_lane_zt_f32_x4(0, zn, 3); } diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c index 1e303a9a661d7..cbab1cc8e81bd 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt.c @@ -19,7 +19,7 @@ // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 7) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint8_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint8_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u8(0, zn, 7); } @@ -34,7 +34,7 @@ svuint8_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv16i8(i32 0, [[ZN:%.*]], i32 7) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint8_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") { +svint8_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s8(0, zn, 7); } @@ -48,7 +48,7 @@ svint8_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") { // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 7) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint16_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint16_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u16(0, zn, 7); } @@ -62,7 +62,7 @@ svuint16_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv8i16(i32 0, [[ZN:%.*]], i32 7) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint16_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") { +svint16_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s16(0, zn, 7); } @@ -76,7 +76,7 @@ svint16_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv8f16(i32 0, [[ZN:%.*]], i32 7) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat16_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat16_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_f16(0, zn, 7); } @@ -90,7 +90,7 @@ svfloat16_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za" // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv8bf16(i32 0, [[ZN:%.*]], i32 7) // CPP-CHECK-NEXT: ret [[TMP0]] // -svbfloat16_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") { +svbfloat16_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_bf16(0, zn, 7); } @@ -104,7 +104,7 @@ svbfloat16_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("z // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 7) // CPP-CHECK-NEXT: ret [[TMP0]] // -svuint32_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint32_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u32(0, zn, 7); } @@ -118,7 +118,7 @@ svuint32_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming 
__arm_in("za") // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv4i32(i32 0, [[ZN:%.*]], i32 7) // CPP-CHECK-NEXT: ret [[TMP0]] // -svint32_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") { +svint32_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s32(0, zn, 7); } @@ -132,6 +132,6 @@ svint32_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP0:%.*]] = tail call @llvm.aarch64.sme.luti4.lane.zt.nxv4f32(i32 0, [[ZN:%.*]], i32 7) // CPP-CHECK-NEXT: ret [[TMP0]] // -svfloat32_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat32_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_f32(0, zn, 7); } diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c index 3544f62527aca..f7f16281ff406 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x2.c @@ -26,7 +26,7 @@ // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svuint8x2_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint8x2_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u8_x2(0, zn, 3); } @@ -49,7 +49,7 @@ svuint8x2_t test_svluti4_lane_zt_u8(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i8.nxv16i8( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svint8x2_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") { +svint8x2_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s8_x2(0, zn, 3); } @@ -71,7 +71,7 @@ svint8x2_t test_svluti4_lane_zt_s8(svuint8_t zn) __arm_streaming __arm_in("za") // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret [[TMP4]] // -svuint16x2_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint16x2_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u16_x2(0, zn, 3); } @@ -94,7 +94,7 @@ svuint16x2_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16i16.nxv8i16( [[TMP2]], [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret [[TMP4]] // -svint16x2_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") { +svint16x2_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s16_x2(0, zn, 3); } @@ -116,7 +116,7 @@ svint16x2_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za" // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv16f16.nxv8f16( [[TMP2]], [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret [[TMP4]] // -svfloat16x2_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat16x2_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_f16_x2(0, zn, 3); } @@ -138,7 +138,7 @@ svfloat16x2_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("z // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call 
@llvm.vector.insert.nxv16bf16.nxv8bf16( [[TMP2]], [[TMP3]], i64 8) // CPP-CHECK-NEXT: ret [[TMP4]] // -svbfloat16x2_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") { +svbfloat16x2_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_bf16_x2(0, zn, 3); } @@ -160,7 +160,7 @@ svbfloat16x2_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in( // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret [[TMP4]] // -svuint32x2_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint32x2_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u32_x2(0, zn, 3); } @@ -182,7 +182,7 @@ svuint32x2_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8i32.nxv4i32( [[TMP2]], [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret [[TMP4]] // -svint32x2_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") { +svint32x2_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s32_x2(0, zn, 3); } @@ -204,6 +204,6 @@ svint32x2_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za" // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv8f32.nxv4f32( [[TMP2]], [[TMP3]], i64 4) // CPP-CHECK-NEXT: ret [[TMP4]] // -svfloat32x2_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat32x2_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_f32_x2(0, zn, 3); } diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c index e7151e8256039..3fedfdc330893 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_luti4_lane_zt_x4.c @@ -36,7 +36,7 @@ // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret [[TMP8]] // -svuint16x4_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint16x4_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u16_x4(0, zn, 1); } @@ -68,7 +68,7 @@ svuint16x4_t test_svluti4_lane_zt_u16(svuint8_t zn) __arm_streaming __arm_in("za // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32f16.nxv8f16( [[TMP6]], [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret [[TMP8]] // -svfloat16x4_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat16x4_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_f16_x4(0, zn, 1); } @@ -100,7 +100,7 @@ svfloat16x4_t test_svluti4_lane_zt_f16(svuint8_t zn) __arm_streaming __arm_in("z // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv32bf16.nxv8bf16( [[TMP6]], [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret [[TMP8]] // -svbfloat16x4_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("za") { +svbfloat16x4_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_bf16_x4(0, zn, 1); } @@ -132,7 +132,7 @@ svbfloat16x4_t test_svluti4_lane_zt_bf16(svuint8_t zn) __arm_streaming __arm_in( // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call 
@llvm.vector.insert.nxv32i16.nxv8i16( [[TMP6]], [[TMP7]], i64 24) // CPP-CHECK-NEXT: ret [[TMP8]] // -svint16x4_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za") { +svint16x4_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s16_x4(0, zn, 1); } @@ -164,7 +164,7 @@ svint16x4_t test_svluti4_lane_zt_s16(svuint8_t zn) __arm_streaming __arm_in("za" // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret [[TMP8]] // -svuint32x4_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za") { +svuint32x4_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_u32_x4(0, zn, 1); } @@ -196,7 +196,7 @@ svuint32x4_t test_svluti4_lane_zt_u32(svuint8_t zn) __arm_streaming __arm_in("za // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP6]], [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret [[TMP8]] // -svint32x4_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za") { +svint32x4_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_s32_x4(0, zn, 1); } @@ -228,6 +228,6 @@ svint32x4_t test_svluti4_lane_zt_s32(svuint8_t zn) __arm_streaming __arm_in("za" // CPP-CHECK-NEXT: [[TMP8:%.*]] = tail call @llvm.vector.insert.nxv16f32.nxv4f32( [[TMP6]], [[TMP7]], i64 12) // CPP-CHECK-NEXT: ret [[TMP8]] // -svfloat32x4_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("za") { +svfloat32x4_t test_svluti4_lane_zt_f32(svuint8_t zn) __arm_streaming __arm_in("zt0") { return svluti4_lane_zt_f32_x4(0, zn, 1); } diff --git a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c index 4a038ce61e3bf..4105cc3e78ec2 100644 --- a/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c +++ b/clang/test/CodeGen/aarch64-sme2-intrinsics/acle_sme2_zero_zt.c @@ -18,6 +18,6 @@ // CPP-CHECK-NEXT: tail call void @llvm.aarch64.sme.zero.zt(i32 0) // CPP-CHECK-NEXT: ret void // -void test_svzero_zt(void) __arm_streaming_compatible __arm_out("za") { +void test_svzero_zt(void) __arm_streaming_compatible __arm_out("zt0") { svzero_zt(0); } diff --git a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c index 079cff5a5bbae..55c97c73e8b69 100644 --- a/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c +++ b/clang/test/Sema/aarch64-incompat-sm-builtin-calls.c @@ -1,6 +1,6 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve \ -// RUN: -target-feature +sme -target-feature +sve2 -target-feature +neon -fsyntax-only -verify %s +// RUN: -target-feature +sme2 -target-feature +sve2 -target-feature +neon -fsyntax-only -verify %s // REQUIRES: aarch64-registered-target @@ -108,3 +108,11 @@ svint8_t new_za(svint8_t zd, svbool_t pg, uint32_t slice_base) __arm_streaming { // expected-no-warning return svread_hor_za8_s8_m(zd, pg, 0, slice_base); } + +void missing_zt0(void) __arm_streaming { + // expected-warning@+1 {{builtin call is not valid when calling from a function without active ZT0 state}} + svzero_zt(0); +} + +__arm_new("zt0") +void new_zt0(void) __arm_streaming { svzero_zt(0); } // no warning diff --git a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp 
index 0d9f090637eec..a627ef9c01ae2 100644 --- a/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp +++ b/clang/test/Sema/aarch64-sme2-intrinsics/acle_sme2_imm.cpp @@ -71,15 +71,15 @@ void test_outer_product(svbool_t pred, svint16_t s16, svuint16_t u16, svint32_t svbmops_za32_s32_m(4, pred, pred, s32, s32); // expected-error {{argument value 4 is outside the valid range [0, 3]}} } -void test_ldr_zt(const void *const_base) __arm_streaming_compatible __arm_inout("za") { +void test_ldr_zt(const void *const_base) __arm_streaming_compatible __arm_inout("zt0") { svldr_zt(1, const_base); // expected-error {{argument value 1 is outside the valid range [0, 0]}} } -void test_str_zt(void *base) __arm_streaming_compatible __arm_in("za") { +void test_str_zt(void *base) __arm_streaming_compatible __arm_in("zt0") { svstr_zt(1, base); // expected-error {{argument value 1 is outside the valid range [0, 0]}} } -void test_svluti2_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_in("za") { +void test_svluti2_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_in("zt0") { // Test Reg Offset svluti2_lane_zt_u8_x4(1, zn, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}} // Test index value range @@ -106,7 +106,7 @@ void test_svluti2_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_in("za") { svluti2_lane_zt_f32_x4(0, zn, 4); // expected-error {{argument value 4 is outside the valid range [0, 3]}} } -void test_svluti4_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_in("za") { +void test_svluti4_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_in("zt0") { // Test Reg Offset svluti4_lane_zt_u16_x4(1, zn, 0); // expected-error {{argument value 1 is outside the valid range [0, 0]}} // Test index value range @@ -129,7 +129,7 @@ void test_svluti4_lane_zt_x4(svuint8_t zn) __arm_streaming __arm_in("za") { svluti4_lane_zt_f32_x4(0, zn, 2); // expected-error {{argument value 2 is outside the valid range [0, 1]}} } -void test_svluti2_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_in("za") { +void test_svluti2_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_in("zt0") { // Test Reg Offset svluti2_lane_zt_u8(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} // Test index value range @@ -156,7 +156,7 @@ void test_svluti2_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_in("za") { svluti2_lane_zt_f32(0, zn_u8, 16); // expected-error {{argument value 16 is outside the valid range [0, 15]}} } -void test_svluti4_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_in("za") { +void test_svluti4_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_in("zt0") { // Test Reg Offset svluti4_lane_zt_u8(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} // Test index value range @@ -183,7 +183,7 @@ void test_svluti4_lane_zt(svuint8_t zn_u8) __arm_streaming __arm_in("za") { svluti4_lane_zt_f32(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} } -void test_svluti2_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_in("za") { +void test_svluti2_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_in("zt0") { // Test Reg Offset svluti2_lane_zt_u8_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} // Test index value range @@ -210,7 +210,7 @@ void test_svluti2_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_in("za") { svluti2_lane_zt_f32_x2(0, zn_u8, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}} } -void test_svluti4_lane_zt_x2(svuint8_t zn_u8) __arm_streaming 
__arm_in("za") { +void test_svluti4_lane_zt_x2(svuint8_t zn_u8) __arm_streaming __arm_in("zt0") { // Test Reg Offset svluti4_lane_zt_u8_x2(1, zn_u8, 2); // expected-error {{argument value 1 is outside the valid range [0, 0]}} // Test index value range diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index fbedd27bf998d..174304f09007b 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -1722,12 +1722,27 @@ void SVEEmitter::createBuiltinZAState(raw_ostream &OS) { std::map> IntrinsicsPerState; for (auto &Def : Defs) { + std::string Key; + auto AddToKey = [&Key](const std::string &S) -> void { + Key = Key.empty() ? S : (Key + " | " + S); + }; + if (Def->isFlagSet(getEnumValueForFlag("IsInZA"))) - IntrinsicsPerState["ArmInZA"].insert(Def->getMangledName()); + AddToKey("ArmInZA"); else if (Def->isFlagSet(getEnumValueForFlag("IsOutZA"))) - IntrinsicsPerState["ArmOutZA"].insert(Def->getMangledName()); + AddToKey("ArmOutZA"); else if (Def->isFlagSet(getEnumValueForFlag("IsInOutZA"))) - IntrinsicsPerState["ArmInOutZA"].insert(Def->getMangledName()); + AddToKey("ArmInOutZA"); + + if (Def->isFlagSet(getEnumValueForFlag("IsInZT0"))) + AddToKey("ArmInZT0"); + else if (Def->isFlagSet(getEnumValueForFlag("IsOutZT0"))) + AddToKey("ArmOutZT0"); + else if (Def->isFlagSet(getEnumValueForFlag("IsInOutZT0"))) + AddToKey("ArmInOutZT0"); + + if (!Key.empty()) + IntrinsicsPerState[Key].insert(Def->getMangledName()); } OS << "#ifdef GET_SME_BUILTIN_GET_STATE\n"; From 5176df55d3afcbecd6e6c42176fa7175897f0016 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Tue, 23 Jan 2024 16:51:47 +0000 Subject: [PATCH 647/843] [CompilerRT] Attempt to fix a lit-config issue This is a follow-up to 3112578597c03 -- it looks like passing the added cmake flag to pythonize_bool also appends "_PYBOOL" to the flag name, which this lit config file is missing. This trips up builds such as: https://lab.llvm.org/buildbot/#/builders/275/builds/3661 https://lab.llvm.org/buildbot/#/builders/184/builds/9811 Where COMPILER_RT_HAS_AARCH64_SME ends up expanding to nothing. --- compiler-rt/test/lit.common.configured.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/test/lit.common.configured.in b/compiler-rt/test/lit.common.configured.in index b93e20e80a6ed..db5d7c598b731 100644 --- a/compiler-rt/test/lit.common.configured.in +++ b/compiler-rt/test/lit.common.configured.in @@ -50,7 +50,7 @@ set_default("gwp_asan", @COMPILER_RT_HAS_GWP_ASAN_PYBOOL@) set_default("expensive_checks", @LLVM_ENABLE_EXPENSIVE_CHECKS_PYBOOL@) set_default("test_standalone_build_libs", @COMPILER_RT_TEST_STANDALONE_BUILD_LIBS_PYBOOL@) set_default("has_compiler_rt_libatomic", @COMPILER_RT_BUILD_STANDALONE_LIBATOMIC_PYBOOL@) -set_default("aarch64_sme", @COMPILER_RT_HAS_AARCH64_SME@) +set_default("aarch64_sme", @COMPILER_RT_HAS_AARCH64_SME_PYBOOL@) # True iff the test suite supports ignoring the test compiler's runtime library path # and using `config.compiler_rt_libdir` instead. This only matters when the runtime # library paths differ. From 55929cd679375d6ee5111edcb881103f57588d0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 23 Jan 2024 18:02:17 +0100 Subject: [PATCH 648/843] [JITLink][AArch32] Implement Armv5 ldr-pc stubs and use them for all pre-v7 targets (#79082) This stub type loads an absolute address directly into the PC register. 
It's the simplest and most compatible way to implement a branch indirection across the entire address space (and probably the slowest as well). It's the ideal fallback for all targets for which we did not (yet) implement a more performant solution. --- .../llvm/ExecutionEngine/JITLink/aarch32.h | 56 +++++-- .../ExecutionEngine/JITLink/ELF_aarch32.cpp | 23 +-- llvm/lib/ExecutionEngine/JITLink/aarch32.cpp | 67 ++++++++ .../JITLink/AArch32/ELF_relocations_arm.s | 54 +++---- .../AArch32/ELF_relocations_armv7plus.s | 49 ++++++ .../JITLink/AArch32/ELF_relocations_data.s | 7 +- .../JITLink/AArch32/ELF_relocations_thumb.s | 145 ------------------ .../AArch32/ELF_relocations_thumbv6m.s | 60 ++++++++ .../AArch32/ELF_relocations_thumbv7a.s | 45 ++++++ .../AArch32/ELF_relocations_thumbv7m.s | 107 +++++++++++++ .../JITLink/AArch32/ELF_stubs_arm.s | 30 ++-- .../JITLink/AArch32/ELF_stubs_thumb.s | 23 ++- 12 files changed, 439 insertions(+), 227 deletions(-) create mode 100644 llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_armv7plus.s delete mode 100644 llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumb.s create mode 100644 llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv6m.s create mode 100644 llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv7a.s create mode 100644 llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv7m.s diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h index ef28ba88431e4..eda6feb441e67 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/aarch32.h @@ -134,14 +134,15 @@ const char *getEdgeKindName(Edge::Kind K); /// Stubs are often called "veneers" in the official docs and online. /// enum class StubsFlavor { - Unsupported = 0, + Undefined = 0, + pre_v7, v7, }; /// JITLink sub-arch configuration for Arm CPU models struct ArmConfig { bool J1J2BranchEncoding = false; - StubsFlavor Stubs = StubsFlavor::Unsupported; + StubsFlavor Stubs = StubsFlavor::Undefined; // In the long term, we might want a linker switch like --target1-rel bool Target1Rel = false; }; @@ -149,18 +150,12 @@ struct ArmConfig { /// Obtain the sub-arch configuration for a given Arm CPU model. inline ArmConfig getArmConfigForCPUArch(ARMBuildAttrs::CPUArch CPUArch) { ArmConfig ArmCfg; - switch (CPUArch) { - case ARMBuildAttrs::v7: - case ARMBuildAttrs::v8_A: + if (CPUArch == ARMBuildAttrs::v7 || CPUArch >= ARMBuildAttrs::v7E_M) { ArmCfg.J1J2BranchEncoding = true; ArmCfg.Stubs = StubsFlavor::v7; - break; - default: - DEBUG_WITH_TYPE("jitlink", { - dbgs() << " Warning: ARM config not defined for CPU architecture " - << getCPUArchName(CPUArch) << " (" << CPUArch << ")\n"; - }); - break; + } else { + ArmCfg.J1J2BranchEncoding = false; + ArmCfg.Stubs = StubsFlavor::pre_v7; } return ArmCfg; } @@ -344,6 +339,43 @@ class GOTBuilder : public TableManager { Section *GOTSection = nullptr; }; +/// Stubs builder emits non-position-independent Arm stubs for pre-v7 CPUs. +/// These architectures have no MovT/MovW instructions and don't support Thumb2. +/// BL is the only Thumb instruction that can generate stubs and they can always +/// be transformed into BLX. +class StubsManager_prev7 { +public: + StubsManager_prev7() = default; + + /// Name of the object file section that will contain all our stubs. 
+ static StringRef getSectionName() { + return "__llvm_jitlink_aarch32_STUBS_prev7"; + } + + /// Implements link-graph traversal via visitExistingEdges() + bool visitEdge(LinkGraph &G, Block *B, Edge &E); + +private: + // Each stub uses a single block that can have 2 entrypoints, one for Arm and + // one for Thumb + struct StubMapEntry { + Block *B = nullptr; + Symbol *ArmEntry = nullptr; + Symbol *ThumbEntry = nullptr; + }; + + std::pair<StubMapEntry *, bool> getStubMapSlot(StringRef Name) { + auto &&[Stubs, NewStub] = StubMap.try_emplace(Name); + return std::make_pair(&Stubs->second, NewStub); + } + + Symbol *getOrCreateSlotEntrypoint(LinkGraph &G, StubMapEntry &Slot, + bool Thumb); + + DenseMap<StringRef, StubMapEntry> StubMap; + Section *StubsSection = nullptr; +}; + /// Stubs builder for v7 emits non-position-independent Arm and Thumb stubs. class StubsManager_v7 { public: diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp index dfe6ab14adc27..908b88fef1b31 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_aarch32.cpp @@ -259,21 +259,8 @@ createLinkGraphFromELFObject_aarch32(MemoryBufferRef ObjectBuffer) { // Resolve our internal configuration for the target. If at some point the // CPUArch alone becomes too imprecise, we can find more details in the // Tag_CPU_arch_profile. - aarch32::ArmConfig ArmCfg; - using namespace ARMBuildAttrs; - auto Arch = static_cast<CPUArch>(ARM::getArchAttr(AK)); - switch (Arch) { - case v7: - case v8_A: - ArmCfg = aarch32::getArmConfigForCPUArch(Arch); - assert(ArmCfg.Stubs != aarch32::StubsFlavor::Unsupported && - "Provide a config for each supported CPU"); - break; - default: - return make_error<JITLinkError>( - "Failed to build ELF link graph: Unsupported CPU arch " + - StringRef(aarch32::getCPUArchName(Arch))); - } + auto Arch = static_cast<ARMBuildAttrs::CPUArch>(ARM::getArchAttr(AK)); + aarch32::ArmConfig ArmCfg = aarch32::getArmConfigForCPUArch(Arch); // Populate the link-graph. 
switch (TT.getArch()) { @@ -318,11 +305,15 @@ void link_ELF_aarch32(std::unique_ptr G, PassCfg.PrePrunePasses.push_back(markAllSymbolsLive); switch (ArmCfg.Stubs) { + case aarch32::StubsFlavor::pre_v7: + PassCfg.PostPrunePasses.push_back( + buildTables_ELF_aarch32); + break; case aarch32::StubsFlavor::v7: PassCfg.PostPrunePasses.push_back( buildTables_ELF_aarch32); break; - case aarch32::StubsFlavor::Unsupported: + case aarch32::StubsFlavor::Undefined: llvm_unreachable("Check before building graph"); } } diff --git a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp index 96dff7656f17d..00be2f57d0664 100644 --- a/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/aarch32.cpp @@ -739,6 +739,13 @@ bool GOTBuilder::visitEdge(LinkGraph &G, Block *B, Edge &E) { return true; } +const uint8_t ArmThumbv5LdrPc[] = { + 0x78, 0x47, // bx pc + 0xfd, 0xe7, // b #-6 ; Arm recommended sequence to follow bx pc + 0x04, 0xf0, 0x1f, 0xe5, // ldr pc, [pc,#-4] ; L1 + 0x00, 0x00, 0x00, 0x00, // L1: .word S +}; + const uint8_t Armv7ABS[] = { 0x00, 0xc0, 0x00, 0xe3, // movw r12, #0x0000 ; lower 16-bit 0x00, 0xc0, 0x40, 0xe3, // movt r12, #0x0000 ; upper 16-bit @@ -759,6 +766,12 @@ static Block &allocStub(LinkGraph &G, Section &S, const uint8_t (&Code)[Size]) { return G.createContentBlock(S, Template, orc::ExecutorAddr(), Alignment, 0); } +static Block &createStubPrev7(LinkGraph &G, Section &S, Symbol &Target) { + Block &B = allocStub(G, S, ArmThumbv5LdrPc); + B.addEdge(Data_Pointer32, 8, Target, 0); + return B; +} + static Block &createStubThumbv7(LinkGraph &G, Section &S, Symbol &Target) { Block &B = allocStub(G, S, Thumbv7ABS); B.addEdge(Thumb_MovwAbsNC, 0, Target, 0); @@ -816,6 +829,60 @@ static bool needsStub(const Edge &E) { return false; } +// The ArmThumbv5LdrPc stub has 2 entrypoints: Thumb at offset 0 is taken only +// for Thumb B instructions. Thumb BL is rewritten to BLX and takes the Arm +// entrypoint at offset 4. Arm branches always use that one. +Symbol *StubsManager_prev7::getOrCreateSlotEntrypoint(LinkGraph &G, + StubMapEntry &Slot, + bool Thumb) { + constexpr orc::ExecutorAddrDiff ThumbEntrypointOffset = 0; + constexpr orc::ExecutorAddrDiff ArmEntrypointOffset = 4; + if (Thumb && !Slot.ThumbEntry) { + Slot.ThumbEntry = + &G.addAnonymousSymbol(*Slot.B, ThumbEntrypointOffset, 4, true, false); + Slot.ThumbEntry->setTargetFlags(ThumbSymbol); + } + if (!Thumb && !Slot.ArmEntry) + Slot.ArmEntry = + &G.addAnonymousSymbol(*Slot.B, ArmEntrypointOffset, 8, true, false); + return Thumb ? Slot.ThumbEntry : Slot.ArmEntry; +} + +bool StubsManager_prev7::visitEdge(LinkGraph &G, Block *B, Edge &E) { + if (!needsStub(E)) + return false; + + Symbol &Target = E.getTarget(); + assert(Target.hasName() && "Edge cannot point to anonymous target"); + auto [Slot, NewStub] = getStubMapSlot(Target.getName()); + + if (NewStub) { + if (!StubsSection) + StubsSection = &G.createSection(getSectionName(), + orc::MemProt::Read | orc::MemProt::Exec); + LLVM_DEBUG({ + dbgs() << " Created stub entry for " << Target.getName() << " in " + << StubsSection->getName() << "\n"; + }); + Slot->B = &createStubPrev7(G, *StubsSection, Target); + } + + // The ArmThumbv5LdrPc stub has 2 entrypoints: Thumb at offset 0 is taken only + // for Thumb B instructions. Thumb BL is rewritten to BLX and takes the Arm + // entrypoint at offset 4. Arm branches always use that one. 
+ bool UseThumb = E.getKind() == Thumb_Jump24; + Symbol *StubEntrypoint = getOrCreateSlotEntrypoint(G, *Slot, UseThumb); + + LLVM_DEBUG({ + dbgs() << " Using " << (UseThumb ? "Thumb" : "Arm") << " entrypoint " + << *StubEntrypoint << " in " + << StubEntrypoint->getBlock().getSection().getName() << "\n"; + }); + + E.setTarget(*StubEntrypoint); + return true; +} + bool StubsManager_v7::visitEdge(LinkGraph &G, Block *B, Edge &E) { if (!needsStub(E)) return false; diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_arm.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_arm.s index 6fd383e2cce5c..3dec8c96f5cd5 100644 --- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_arm.s +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_arm.s @@ -1,8 +1,22 @@ -# RUN: llvm-mc -triple=armv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t.o %s -# RUN: llvm-objdump -r %t.o | FileCheck --check-prefix=CHECK-TYPE %s -# RUN: llvm-objdump --disassemble %t.o | FileCheck --check-prefix=CHECK-INSTR %s +# Test pre-v7 Arm features +# +# RUN: llvm-mc -triple=armv4t-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_armv4t.o %s +# RUN: llvm-objdump -r %t_armv4t.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-objdump --disassemble %t_armv4t.o | FileCheck --check-prefix=CHECK-INSTR %s # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ -# RUN: -slab-page-size 4096 -show-entry-es -check %s %t.o +# RUN: -slab-page-size 4096 -check %s %t_armv4t.o +# +# RUN: llvm-mc -triple=armv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_armv7.o %s +# RUN: llvm-objdump -r %t_armv7.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-objdump --disassemble %t_armv7.o | FileCheck --check-prefix=CHECK-INSTR %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ +# RUN: -slab-page-size 4096 -check %s %t_armv7.o +# +# RUN: llvm-mc -triple=armv9-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_armv9.o %s +# RUN: llvm-objdump -r %t_armv9.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-objdump --disassemble %t_armv9.o | FileCheck --check-prefix=CHECK-INSTR %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ +# RUN: -slab-page-size 4096 -check %s %t_armv9.o .text @@ -63,38 +77,6 @@ jump24_target: bx lr .size jump24_target, .-jump24_target - -# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_MOVW_ABS_NC data_symbol -# CHECK-INSTR: 0000001c : -# CHECK-INSTR: 1c: e3000000 movw r0, #0x0 -# jitlink-check: decode_operand(movw, 1) = (data_symbol&0x0000ffff) - .globl movw - .type movw,%function - .p2align 2 -movw: - movw r0, :lower16:data_symbol - .size movw, .-movw - -# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_MOVT_ABS data_symbol -# CHECK-INSTR: 00000020 : -# CHECK-INSTR: 20: e3400000 movt r0, #0x0 -# We decode the operand with index 2, because movt generates one leading implicit -# predicate operand that we have to skip in order to decode the data_symbol operand -# jitlink-check: decode_operand(movt, 2) = (data_symbol&0xffff0000>>16) - .globl movt - .type movt,%function - .p2align 2 -movt: - movt r0, :upper16:data_symbol - .size movt, .-movt - - .data - .global data_symbol -data_symbol: - .long 1073741822 - - .text - # Empty main function for jitlink to be happy .globl main .type main,%function diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_armv7plus.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_armv7plus.s new file mode 100644 index 
0000000000000..890b2136959ef --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_armv7plus.s @@ -0,0 +1,49 @@ +# Test v7 Arm features +# +# RUN: llvm-mc -triple=armv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_armv7.o %s +# RUN: llvm-objdump -r %t_armv7.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-objdump --disassemble %t_armv7.o | FileCheck --check-prefix=CHECK-INSTR %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ +# RUN: -slab-page-size 4096 -abs data_symbol=0x00001234 -check %s %t_armv7.o +# +# RUN: llvm-mc -triple=armv9-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_armv9.o %s +# RUN: llvm-objdump -r %t_armv9.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-objdump --disassemble %t_armv9.o | FileCheck --check-prefix=CHECK-INSTR %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ +# RUN: -slab-page-size 4096 -abs data_symbol=0x00001234 -check %s %t_armv9.o + + + .text + .syntax unified + +# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_MOVW_ABS_NC data_symbol +# CHECK-INSTR: : +# CHECK-INSTR: e3000000 movw r0, #0x0 +# jitlink-check: decode_operand(movw, 1) = data_symbol[15:0] + .globl movw + .type movw,%function + .p2align 2 +movw: + movw r0, :lower16:data_symbol + .size movw, .-movw + +# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_MOVT_ABS data_symbol +# CHECK-INSTR: : +# CHECK-INSTR: e3400000 movt r0, #0x0 +# We decode the operand with index 2, because movt generates one leading implicit +# predicate operand that we have to skip in order to decode the data_symbol operand +# jitlink-check: decode_operand(movt, 2) = data_symbol[31:16] + .globl movt + .type movt,%function + .p2align 2 +movt: + movt r0, :upper16:data_symbol + .size movt, .-movt + +# Empty main function for jitlink to be happy + .globl main + .type main,%function + .p2align 2 +main: + bx lr + .size main, .-main diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s index 544e8c259d286..4a2b281816d2b 100644 --- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_data.s @@ -1,4 +1,9 @@ -# RUN: rm -rf %t && mkdir -p %t/armv7 && mkdir -p %t/thumbv7 +# RUN: rm -rf %t && mkdir -p %t/armv6 && mkdir -p %t/armv7 && mkdir -p %t/thumbv7 +# RUN: llvm-mc -triple=armv6-none-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t/armv6/out.o %s +# RUN: llvm-objdump -r %t/armv6/out.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb -slab-page-size 4096 \ +# RUN: -abs target=0x76bbe88f -check %s %t/armv6/out.o + # RUN: llvm-mc -triple=armv7-none-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t/armv7/out.o %s # RUN: llvm-objdump -r %t/armv7/out.o | FileCheck --check-prefix=CHECK-TYPE %s # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb -slab-page-size 4096 \ diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumb.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumb.s deleted file mode 100644 index 86f011834baae..0000000000000 --- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumb.s +++ /dev/null @@ -1,145 +0,0 @@ -# RUN: llvm-mc -triple=thumbv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t.o %s -# RUN: llvm-objdump -r %t.o | FileCheck --check-prefix=CHECK-TYPE %s -# RUN: llvm-objdump --disassemble %t.o | 
FileCheck --check-prefix=CHECK-INSTR %s -# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ -# RUN: -slab-page-size 4096 -abs external_func=0x76bbe880 \ -# RUN: -check %s %t.o - - - .text - .syntax unified - -# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_CALL call_target_thumb -# CHECK-INSTR: 00000000 : -# CHECK-INSTR: 0: f7ff fffe bl -# CHECK-INSTR: 4: f7ff fffe bl -# CHECK-INSTR: 00000008 -# CHECK-INSTR: 0000000c -# We decode the operand with index 2, because bl generates two leading implicit -# predicate operands that we have to skip in order to decode the call_target operand -# jitlink-check: decode_operand(call_site + 0, 2) = call_target_thumb - (call_site + 4) -# jitlink-check: decode_operand(call_site + 4, 2) = call_target_arm - (call_site + 8) - .globl call_site - .type call_site,%function - .p2align 1 - .code 16 - .thumb_func -call_site: - bl call_target_thumb - bl call_target_arm - .size call_site, .-call_site - - .globl call_target_thumb - .type call_target_thumb,%function - .p2align 1 - .code 16 - .thumb_func -call_target_thumb: - bx lr - .size call_target_thumb, .-call_target_thumb - - .globl call_target_arm - .type call_target_arm,%function - .p2align 2 - .code 32 -call_target_arm: - bx lr - .size call_target_arm, .-call_target_arm - -# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_JUMP24 jump24_target -# CHECK-INSTR: 00000010 : -# CHECK-INSTR: 10: f7ff bffe b.w -# CHECK-INSTR: 00000014 -# b.w generates two implicit predicate operands as well, but they are trailing -# operands, so there is no need to adjust the operand index. -# jitlink-check: decode_operand(jump24_site, 0) = jump24_target - next_pc(jump24_site) - .globl jump24_site - .type jump24_site,%function - .p2align 1 - .code 16 - .thumb_func -jump24_site: - b.w jump24_target - .size jump24_site, .-jump24_site - - .globl jump24_target - .type jump24_target,%function - .p2align 1 - .code 16 - .thumb_func -jump24_target: - bx lr - .size jump24_target, .-jump24_target - -# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_MOVW_ABS_NC data_symbol -# CHECK-INSTR: 00000016 : -# CHECK-INSTR: 16: f240 0000 movw r0, #0x0 -# jitlink-check: decode_operand(movw, 1) = (data_symbol&0x0000ffff) - .globl movw - .type movw,%function - .p2align 1 - .code 16 - .thumb_func -movw: - movw r0, :lower16:data_symbol - .size movw, .-movw - -# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_MOVT_ABS data_symbol -# CHECK-INSTR: 0000001a : -# CHECK-INSTR: 1a: f2c0 0000 movt r0, #0x0 -# We decode the operand with index 2, because movt generates one leading implicit -# predicate operand that we have to skip in order to decode the data_symbol operand -# jitlink-check: decode_operand(movt, 2) = (data_symbol&0xffff0000>>16) - .globl movt - .type movt,%function - .p2align 1 - .code 16 - .thumb_func -movt: - movt r0, :upper16:data_symbol - .size movt, .-movt - - .data - .global data_symbol -data_symbol: - .long 1073741822 - - .text - -# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_MOVW_PREL_NC external_func -# CHECK-INSTR: 0000001e : -# CHECK-INSTR: 1e: f240 0000 movw r0, #0x0 -# jitlink-check: decode_operand(movw_prel, 1) = \ -# jitlink-check: ((external_func - movw_prel)&0x0000ffff) -.globl movw_prel -.type movw_prel,%function -.p2align 1 -.code 16 -.thumb_func -movw_prel: - movw r0, :lower16:external_func - . 
- .size movw_prel, .-movw_prel - -# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_MOVT_PREL external_func -# CHECK-INSTR: 00000022 : -# CHECK-INSTR: 22: f2c0 0000 movt r0, #0x0 -# jitlink-check: decode_operand(movt_prel, 2) = \ -# jitlink-check: ((external_func - movt_prel)&0xffff0000>>16) -.globl movt_prel -.type movt_prel,%function -.p2align 1 -.code 16 -.thumb_func -movt_prel: - movt r0, :upper16:external_func - . - .size movt_prel, .-movt_prel - -# Empty main function for jitlink to be happy - .globl main - .type main,%function - .p2align 1 - .code 16 - .thumb_func -main: - bx lr - .size main, .-main diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv6m.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv6m.s new file mode 100644 index 0000000000000..e0a224d9c7106 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv6m.s @@ -0,0 +1,60 @@ +# Test pre-v7 Thumb features for Thumb-only targets +# +# RUN: llvm-mc -triple=thumbv6m-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_thumbv6m.o %s +# RUN: llvm-objdump -r %t_thumbv6m.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-objdump --disassemble %t_thumbv6m.o | FileCheck --check-prefix=CHECK-INSTR %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ +# RUN: -slab-page-size 4096 -abs external_func=0x76bbe880 \ +# RUN: -check %s %t_thumbv6m.o +# +# RUN: llvm-mc -triple=thumbv7m-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_thumbv7m.o %s +# RUN: llvm-objdump -r %t_thumbv7m.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-objdump --disassemble %t_thumbv7m.o | FileCheck --check-prefix=CHECK-INSTR %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ +# RUN: -slab-page-size 4096 -abs external_func=0x76bbe880 \ +# RUN: -check %s %t_thumbv7m.o +# +# RUN: llvm-mc -triple=thumbv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_thumbv7.o %s +# RUN: llvm-objdump -r %t_thumbv7.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-objdump --disassemble %t_thumbv7.o | FileCheck --check-prefix=CHECK-INSTR %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ +# RUN: -slab-page-size 4096 -abs external_func=0x76bbe880 \ +# RUN: -check %s %t_thumbv7.o + + + .text + .syntax unified + +# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_CALL call_target_thumb +# CHECK-INSTR: : +# CHECK-INSTR: f7ff fffe bl +# We decode the operand with index 2, because bl generates two leading implicit +# predicate operands that we have to skip in order to decode the call_target operand +# jitlink-check: decode_operand(call_site, 2) = call_target_thumb - (call_site + 4) + .globl call_site + .type call_site,%function + .p2align 1 + .code 16 + .thumb_func +call_site: + bl call_target_thumb + .size call_site, .-call_site + + .globl call_target_thumb + .type call_target_thumb,%function + .p2align 1 + .code 16 + .thumb_func +call_target_thumb: + bx lr + .size call_target_thumb, .-call_target_thumb + +# Empty main function for jitlink to be happy + .globl main + .type main,%function + .p2align 1 + .code 16 + .thumb_func +main: + bx lr + .size main, .-main diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv7a.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv7a.s new file mode 100644 index 0000000000000..0e4a2cfb2c349 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv7a.s @@ -0,0 +1,45 @@ +# Test v7 Thumb features for mixed Arm/Thumb 
targets +# +# RUN: llvm-mc -triple=thumbv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_thumbv7.o %s +# RUN: llvm-objdump -r %t_thumbv7.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-objdump --disassemble %t_thumbv7.o | FileCheck --check-prefix=CHECK-INSTR %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ +# RUN: -slab-page-size 4096 -abs external_func=0x76bbe880 \ +# RUN: -check %s %t_thumbv7.o + + + .text + .syntax unified + +# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_CALL call_target_arm +# CHECK-INSTR: : +# CHECK-INSTR: f7ff fffe bl +# We decode the operand with index 2, because bl generates two leading implicit +# predicate operands that we have to skip in order to decode the call_target operand +# jitlink-check: decode_operand(call_site, 2) = call_target_arm - next_pc(call_site) + .globl call_site + .type call_site,%function + .p2align 1 + .code 16 + .thumb_func +call_site: + bl call_target_arm + .size call_site, .-call_site + + .globl call_target_arm + .type call_target_arm,%function + .p2align 2 + .code 32 +call_target_arm: + bx lr + .size call_target_arm, .-call_target_arm + +# Empty main function for jitlink to be happy + .globl main + .type main,%function + .p2align 1 + .code 16 + .thumb_func +main: + bx lr + .size main, .-main diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv7m.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv7m.s new file mode 100644 index 0000000000000..4997fb3cf8ab1 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_relocations_thumbv7m.s @@ -0,0 +1,107 @@ +# Test v7 Thumb features for Thumb-only targets +# +# RUN: llvm-mc -triple=thumbv7m-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_thumbv7m.o %s +# RUN: llvm-objdump -r %t_thumbv7m.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-objdump --disassemble %t_thumbv7m.o | FileCheck --check-prefix=CHECK-INSTR %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ +# RUN: -slab-page-size 4096 -abs ext_func=0x76bbe880 -abs ext_data=0x00001234 \ +# RUN: -check %s %t_thumbv7m.o +# +# RUN: llvm-mc -triple=thumbv7-linux-gnueabi -arm-add-build-attributes -filetype=obj -o %t_thumbv7.o %s +# RUN: llvm-objdump -r %t_thumbv7.o | FileCheck --check-prefix=CHECK-TYPE %s +# RUN: llvm-objdump --disassemble %t_thumbv7.o | FileCheck --check-prefix=CHECK-INSTR %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 -slab-allocate 10Kb \ +# RUN: -slab-page-size 4096 -abs ext_func=0x76bbe880 -abs ext_data=0x00001234 \ +# RUN: -check %s %t_thumbv7.o + + .text + .syntax unified + + +# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_JUMP24 jump24_target +# CHECK-INSTR: : +# CHECK-INSTR: f7ff bffe b.w +# b.w generates two implicit predicate operands as well, but they are trailing +# operands, so there is no need to adjust the operand index. 
+# jitlink-check: decode_operand(jump24_site, 0) = jump24_target - next_pc(jump24_site) + .globl jump24_site + .type jump24_site,%function + .p2align 1 + .code 16 + .thumb_func +jump24_site: + b.w jump24_target + .size jump24_site, .-jump24_site + + .globl jump24_target + .type jump24_target,%function + .p2align 1 + .code 16 + .thumb_func +jump24_target: + bx lr + .size jump24_target, .-jump24_target + +# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_MOVW_ABS_NC ext_data +# CHECK-INSTR: : +# CHECK-INSTR: f240 0000 movw r0, #0x0 +# jitlink-check: decode_operand(movw, 1) = ext_data[15:0] + .globl movw + .type movw,%function + .p2align 1 + .code 16 + .thumb_func +movw: + movw r0, :lower16:ext_data + .size movw, .-movw + +# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_MOVT_ABS ext_data +# CHECK-INSTR: : +# CHECK-INSTR: f2c0 0000 movt r0, #0x0 +# We decode the operand with index 2, because movt generates one leading implicit +# predicate operand that we have to skip in order to decode the ext_data operand +# jitlink-check: decode_operand(movt, 2) = ext_data[31:16] + .globl movt + .type movt,%function + .p2align 1 + .code 16 + .thumb_func +movt: + movt r0, :upper16:ext_data + .size movt, .-movt + +# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_MOVW_PREL_NC ext_func +# CHECK-INSTR: : +# CHECK-INSTR: f240 0000 movw r0, #0x0 +# jitlink-check: decode_operand(movw_prel, 1) = (ext_func - movw_prel)[15:0] + .globl movw_prel + .type movw_prel,%function + .p2align 1 + .code 16 + .thumb_func +movw_prel: + movw r0, :lower16:ext_func - . + .size movw_prel, .-movw_prel + +# CHECK-TYPE: {{[0-9a-f]+}} R_ARM_THM_MOVT_PREL ext_func +# CHECK-INSTR: : +# CHECK-INSTR: f2c0 0000 movt r0, #0x0 +# jitlink-check: decode_operand(movt_prel, 2) = (ext_func - movt_prel)[31:16] + .globl movt_prel + .type movt_prel,%function + .p2align 1 + .code 16 + .thumb_func +movt_prel: + movt r0, :upper16:ext_func - . 
+ .size movt_prel, .-movt_prel + +# Empty main function for jitlink to be happy + .globl main + .type main,%function + .p2align 1 + .code 16 + .thumb_func +main: + bx lr + .size main, .-main diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_arm.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_arm.s index fb2e0eb2c0bf2..d3a596c811ec4 100644 --- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_arm.s +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_arm.s @@ -1,10 +1,22 @@ -# RUN: rm -rf %t && mkdir -p %t +# RUN: rm -rf %t && mkdir -p %t/armv4t && mkdir -p %t/armv6 && mkdir -p %t/armv7 +# +# RUN: llvm-mc -triple=armv4t-linux-gnueabi -arm-add-build-attributes \ +# RUN: -filetype=obj -o %t/armv4t/out.o %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 \ +# RUN: -slab-allocate 10Kb -slab-page-size 4096 \ +# RUN: -abs ext=0x76bbe880 -check %s %t/armv4t/out.o +# +# RUN: llvm-mc -triple=armv6-linux-gnueabi -arm-add-build-attributes \ +# RUN: -filetype=obj -o %t/armv6/out.o %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 \ +# RUN: -slab-allocate 10Kb -slab-page-size 4096 \ +# RUN: -abs ext=0x76bbe880 -check %s %t/armv6/out.o +# # RUN: llvm-mc -triple=armv7-linux-gnueabi -arm-add-build-attributes \ -# RUN: -filetype=obj -o %t/out.o %s +# RUN: -filetype=obj -o %t/armv7/out.o %s # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 \ # RUN: -slab-allocate 10Kb -slab-page-size 4096 \ -# RUN: -abs ext=0x76bbe880 \ -# RUN: -check %s %t/out.o +# RUN: -abs ext=0x76bbe880 -check %s %t/armv7/out.o .text .syntax unified @@ -36,10 +48,10 @@ test_arm_call: pop {pc} .size test_arm_call, .-test_arm_call -# This test is executable with both, Arm and Thumb `ext` functions. It only has -# to return with `bx lr`. For example: -# > echo "void ext() {}" | clang -target armv7-linux-gnueabihf -o ext-arm.o -c -xc - -# > llvm-jitlink ext-arm.o out.o +# This test is executable with any Arm (and for v7+ also Thumb) `ext` functions. +# It only has to return with `bx lr`. For example: +# > echo "void ext() {}" | clang -target armv7-linux-gnueabihf -o ext.o -c -xc - +# > llvm-jitlink ext.o out.o # .globl main .type main,%function @@ -48,6 +60,6 @@ main: push {lr} bl test_arm_call bl test_arm_jump - movw r0, #0 + mov r0, #0 pop {pc} .size main, .-main diff --git a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_thumb.s b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_thumb.s index f6156628ce2a9..aa8c917a08809 100644 --- a/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_thumb.s +++ b/llvm/test/ExecutionEngine/JITLink/AArch32/ELF_stubs_thumb.s @@ -1,10 +1,17 @@ -# RUN: rm -rf %t && mkdir -p %t +# RUN: rm -rf %t && mkdir -p %t/thumbv7m && mkdir -p %t/thumbv7 +# +# RUN: llvm-mc -triple=thumbv7m-linux-gnueabi -arm-add-build-attributes \ +# RUN: -filetype=obj -o %t/thumbv7m/out.o %s +# RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 \ +# RUN: -slab-allocate 10Kb -slab-page-size 4096 \ +# RUN: -abs ext=0x76bbe880 -check %s %t/thumbv7m/out.o +# # RUN: llvm-mc -triple=thumbv7-linux-gnueabi -arm-add-build-attributes \ -# RUN: -filetype=obj -o %t/elf_stubs.o %s +# RUN: -filetype=obj -o %t/thumbv7/out.o %s # RUN: llvm-jitlink -noexec -slab-address 0x76ff0000 \ # RUN: -slab-allocate 10Kb -slab-page-size 4096 \ -# RUN: -abs external_func=0x76bbe880 \ -# RUN: -check %s %t/elf_stubs.o +# RUN: -abs ext=0x76bbe880 -check %s %t/thumbv7/out.o + .text .syntax unified @@ -14,15 +21,15 @@ # where the branch-target address is loaded from a GOT entry. 
Instead, they
# hard-code it in the immediate field.
#
-# jitlink-check: decode_operand(test_external_call, 2) = stub_addr(elf_stubs.o, external_func) - next_pc(test_external_call)
-# jitlink-check: decode_operand(test_external_jump, 0) = stub_addr(elf_stubs.o, external_func) - next_pc(test_external_jump)
+# jitlink-check: decode_operand(test_external_call, 2) = stub_addr(out.o, ext) - next_pc(test_external_call)
+# jitlink-check: decode_operand(test_external_jump, 0) = stub_addr(out.o, ext) - next_pc(test_external_jump)
 .globl test_external_call
 .type test_external_call,%function
 .p2align 1
 .code 16
 .thumb_func
 test_external_call:
-	bl	external_func
+	bl	ext
 .size test_external_call, .-test_external_call

 .globl test_external_jump
@@ -31,7 +38,7 @@ test_external_call:
 .code 16
 .thumb_func
 test_external_jump:
-	b	external_func
+	b	ext
 .size test_external_jump, .-test_external_jump

 # Empty main function for jitlink to be happy

From 39420279125c8d2fe0a2dae8b8c2e80d0118789c Mon Sep 17 00:00:00 2001
From: Jeremy Morse 
Date: Tue, 23 Jan 2024 17:14:48 +0000
Subject: [PATCH 649/843] [DebugInfo][RemoveDIs] Disable a run-line while
 investigating a problem

This just reduces coverage for RemoveDIs temporarily, and it's almost
certainly a patch-ordering problem.

---
 llvm/test/Transforms/IROutliner/legal-debug.ll | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/test/Transforms/IROutliner/legal-debug.ll b/llvm/test/Transforms/IROutliner/legal-debug.ll
index be1182b38fa2d..829629c617583 100644
--- a/llvm/test/Transforms/IROutliner/legal-debug.ll
+++ b/llvm/test/Transforms/IROutliner/legal-debug.ll
@@ -1,6 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --include-generated-funcs
 ; RUN: opt -S -passes=verify,iroutliner -ir-outlining-no-cost < %s | FileCheck %s
-; RUN: opt -S -passes=verify,iroutliner -ir-outlining-no-cost < %s --try-experimental-debuginfo-iterators | FileCheck %s
+; FIXME: Disabled run-line after running into some turbulence with debuginfo-iterators
+; run: opt -S -passes=verify,iroutliner -ir-outlining-no-cost < %s --try-experimental-debuginfo-iterators | FileCheck %s

 ; This test checks that debug info is recognized as able to be extracted along
 ; with the other instructions, but is not included in the consolidated function.

From 8a45cec934697747ac3d3a18e75833e0058fe9a1 Mon Sep 17 00:00:00 2001
From: James Y Knight 
Date: Tue, 23 Jan 2024 12:18:51 -0500
Subject: [PATCH 650/843] [LangRef] adjust IR atomics specification following
 C++20 model tweaks. (#77263)

C++20 accepted two papers, [P0668](https://wg21.link/P0668) and
[P0982](https://wg21.link/P0982), which changed the atomics memory
model slightly in order to reflect the realities of the existing
implementations. The rationale for these changes applies as well to
the LLVM IR atomics model.

No code changes are expected to be required from this change: it is
primarily a matter of more correctly documenting the existing state of
the world.

There are three changes: two of them weaken guarantees, and one
strengthens them:

1. The memory ordering guaranteed by some backends/CPUs when seq_cst
operations are mixed with acquire/release operations on the same
location was weaker than the spec guaranteed. Therefore, the
specification is changed to remove the requirement that seq_cst
ordering is consistent with happens-before, and to replace it with a
slightly weaker requirement of consistency with a new relation named
strongly-happens-before.

2. The rules for a "release sequence" were weakened. Previously, an
acquire synchronized with a release even if it observed a later
monotonic store from the same thread as the release store. That has
now been removed: now, only read-modify-write operations can extend a
release sequence.

3. The model for a seq_cst fence is strengthened, such that placing a
seq_cst fence between monotonic accesses now _is_ sufficient to
guarantee sequential consistency in the model (as it always has been
on existing implementations).

Note that I've directly referenced the C++ standard's atomics.order
section for the precise semantics of seq_cst, instead of fully
describing them. They are quite complex, and a lot of work has gone
into refining the words in the standard. I'm afraid if I attempt to
reiterate them, I would only introduce errors.

---
 llvm/docs/Atomics.rst                      | 40 +++++++++-------
 llvm/docs/LangRef.rst                      | 54 +++++++++++++---------
 llvm/include/llvm/CodeGen/TargetLowering.h | 28 +++--------
 3 files changed, 63 insertions(+), 59 deletions(-)

diff --git a/llvm/docs/Atomics.rst b/llvm/docs/Atomics.rst
index 6ad6e1812cb0a..4dee3e6bd9f4f 100644
--- a/llvm/docs/Atomics.rst
+++ b/llvm/docs/Atomics.rst
@@ -14,9 +14,16 @@ asynchronous signals.
 The atomic instructions are designed specifically to provide readable IR and
 optimized code generation for the following:

-* The C++11 ``<atomic>`` header. (`C++11 draft available here
-  `_.) (`C11 draft available here
-  `_.)
+* The C++ ``<atomic>`` header and C ``<stdatomic.h>`` headers. These
+  were originally added in C++11 and C11. The memory model has been
+  subsequently adjusted to correct errors in the initial
+  specification, so LLVM currently intends to implement the version
+  specified by C++20. (See the `C++20 draft standard
+  `_ or the unofficial
+  `latest C++ draft `_. A `C2x draft
+  `_ is
+  also available, though the text has not yet been updated with the
+  errata corrected by C++20.)

 * Proper semantics for Java-style memory, for both ``volatile`` and regular
   shared variables. (`Java Specification
@@ -110,13 +117,14 @@ where threads and signals are involved.
 atomic store (where the store is conditional for ``cmpxchg``), but no other
 memory operation can happen on any thread between the load and store.

-A ``fence`` provides Acquire and/or Release ordering which is not part of
-another operation; it is normally used along with Monotonic memory operations.
-A Monotonic load followed by an Acquire fence is roughly equivalent to an
-Acquire load, and a Monotonic store following a Release fence is roughly
-equivalent to a Release store. SequentiallyConsistent fences behave as both
-an Acquire and a Release fence, and offer some additional complicated
-guarantees, see the C++11 standard for details.
+A ``fence`` provides Acquire and/or Release ordering which is not part
+of another operation; it is normally used along with Monotonic memory
+operations. A Monotonic load followed by an Acquire fence is roughly
+equivalent to an Acquire load, and a Monotonic store following a
+Release fence is roughly equivalent to a Release
+store. SequentiallyConsistent fences behave as both an Acquire and a
+Release fence, and additionally provide a total ordering with some
+complicated guarantees; see the C++ standard for details.
Frontends generating atomic instructions generally need to be aware of the target to some degree; atomic instructions are guaranteed to be lock-free, and @@ -222,7 +230,7 @@ essentially guarantees that if you take all the operations affecting a specific address, a consistent ordering exists. Relevant standard - This corresponds to the C++11/C11 ``memory_order_relaxed``; see those + This corresponds to the C++/C ``memory_order_relaxed``; see those standards for the exact definition. Notes for frontends @@ -252,8 +260,8 @@ Acquire provides a barrier of the sort necessary to acquire a lock to access other memory with normal loads and stores. Relevant standard - This corresponds to the C++11/C11 ``memory_order_acquire``. It should also be - used for C++11/C11 ``memory_order_consume``. + This corresponds to the C++/C ``memory_order_acquire``. It should also be + used for C++/C ``memory_order_consume``. Notes for frontends If you are writing a frontend which uses this directly, use with caution. @@ -282,7 +290,7 @@ Release is similar to Acquire, but with a barrier of the sort necessary to release a lock. Relevant standard - This corresponds to the C++11/C11 ``memory_order_release``. + This corresponds to the C++/C ``memory_order_release``. Notes for frontends If you are writing a frontend which uses this directly, use with caution. @@ -308,7 +316,7 @@ AcquireRelease (``acq_rel`` in IR) provides both an Acquire and a Release barrier (for fences and operations which both read and write memory). Relevant standard - This corresponds to the C++11/C11 ``memory_order_acq_rel``. + This corresponds to the C++/C ``memory_order_acq_rel``. Notes for frontends If you are writing a frontend which uses this directly, use with caution. @@ -331,7 +339,7 @@ and Release semantics for stores. Additionally, it guarantees that a total ordering exists between all SequentiallyConsistent operations. Relevant standard - This corresponds to the C++11/C11 ``memory_order_seq_cst``, Java volatile, and + This corresponds to the C++/C ``memory_order_seq_cst``, Java volatile, and the gcc-compatible ``__sync_*`` builtins which do not specify otherwise. Notes for frontends diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 178029aca98a9..7a7ddc59ba985 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -3312,7 +3312,7 @@ Memory Model for Concurrent Operations The LLVM IR does not define any way to start parallel threads of execution or to register signal handlers. Nonetheless, there are platform-specific ways to create them, and we define LLVM IR's behavior -in their presence. This model is inspired by the C++0x memory model. +in their presence. This model is inspired by the C++ memory model. For a more informal introduction to this model, see the :doc:`Atomics`. @@ -3320,7 +3320,7 @@ We define a *happens-before* partial order as the least partial order that - Is a superset of single-thread program order, and -- When a *synchronizes-with* ``b``, includes an edge from ``a`` to +- When ``a`` *synchronizes-with* ``b``, includes an edge from ``a`` to ``b``. *Synchronizes-with* pairs are introduced by platform-specific techniques, like pthread locks, thread creation, thread joining, etc., and by atomic instructions. 
(See also :ref:`Atomic Memory Ordering @@ -3384,13 +3384,12 @@ Atomic instructions (:ref:`cmpxchg `, :ref:`atomicrmw `, :ref:`fence `, :ref:`atomic load `, and :ref:`atomic store `) take ordering parameters that determine which other atomic instructions on -the same address they *synchronize with*. These semantics are borrowed -from Java and C++0x, but are somewhat more colloquial. If these -descriptions aren't precise enough, check those specs (see spec -references in the :doc:`atomics guide `). -:ref:`fence ` instructions treat these orderings somewhat -differently since they don't take an address. See that instruction's -documentation for details. +the same address they *synchronize with*. These semantics implement +the Java or C++ memory models; if these descriptions aren't precise +enough, check those specs (see spec references in the +:doc:`atomics guide `). :ref:`fence ` instructions +treat these orderings somewhat differently since they don't take an +address. See that instruction's documentation for details. For a simpler introduction to the ordering constraints, see the :doc:`Atomics`. @@ -3418,32 +3417,37 @@ For a simpler introduction to the ordering constraints, see the stronger) operations on the same address. If an address is written ``monotonic``-ally by one thread, and other threads ``monotonic``-ally read that address repeatedly, the other threads must eventually see - the write. This corresponds to the C++0x/C1x - ``memory_order_relaxed``. + the write. This corresponds to the C/C++ ``memory_order_relaxed``. ``acquire`` In addition to the guarantees of ``monotonic``, a *synchronizes-with* edge may be formed with a ``release`` operation. - This is intended to model C++'s ``memory_order_acquire``. + This is intended to model C/C++'s ``memory_order_acquire``. ``release`` In addition to the guarantees of ``monotonic``, if this operation writes a value which is subsequently read by an ``acquire`` - operation, it *synchronizes-with* that operation. (This isn't a - complete description; see the C++0x definition of a release - sequence.) This corresponds to the C++0x/C1x + operation, it *synchronizes-with* that operation. Furthermore, + this occurs even if the value written by a ``release`` operation + has been modified by a read-modify-write operation before being + read. (Such a set of operations comprises a *release + sequence*). This corresponds to the C/C++ ``memory_order_release``. ``acq_rel`` (acquire+release) Acts as both an ``acquire`` and ``release`` operation on its - address. This corresponds to the C++0x/C1x ``memory_order_acq_rel``. + address. This corresponds to the C/C++ ``memory_order_acq_rel``. ``seq_cst`` (sequentially consistent) In addition to the guarantees of ``acq_rel`` (``acquire`` for an operation that only reads, ``release`` for an operation that only writes), there is a global total order on all - sequentially-consistent operations on all addresses, which is - consistent with the *happens-before* partial order and with the - modification orders of all the affected addresses. Each + sequentially-consistent operations on all addresses. Each sequentially-consistent read sees the last preceding write to the - same address in this global order. This corresponds to the C++0x/C1x - ``memory_order_seq_cst`` and Java volatile. + same address in this global order. This corresponds to the C/C++ + ``memory_order_seq_cst`` and Java ``volatile``. 
+
+  Note: this global total order is *not* guaranteed to be fully
+  consistent with the *happens-before* partial order if
+  non-``seq_cst`` accesses are involved. See the C++ standard
+  `[atomics.order] <https://wg21.link/atomics.order>`_ section
+  for more details on the exact guarantees.

 .. _syncscope:

@@ -10762,7 +10766,13 @@ still *synchronize-with* the explicit ``fence`` and establish the

 A ``fence`` which has ``seq_cst`` ordering, in addition to having both
 ``acquire`` and ``release`` semantics specified above, participates in
-the global program order of other ``seq_cst`` operations and/or fences.
+the global program order of other ``seq_cst`` operations and/or
+fences. Furthermore, the global ordering created by a ``seq_cst``
+fence must be compatible with the individual total orders of
+``monotonic`` (or stronger) memory accesses occurring before and after
+such a fence. The exact semantics of this interaction are somewhat
+complicated; see the C++ standard's `[atomics.order]
+<https://wg21.link/atomics.order>`_ section for more details.

 A ``fence`` instruction can also take an optional
 ":ref:`syncscope `" argument.

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index ed2b513be9608..c9492b4cf778b 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2166,27 +2166,13 @@ class TargetLoweringBase {
   /// This function should either return a nullptr, or a pointer to an IR-level
   /// Instruction*. Even complex fence sequences can be represented by a
   /// single Instruction* through an intrinsic to be lowered later.
-  /// Backends should override this method to produce target-specific intrinsic
-  /// for their fences.
-  /// FIXME: Please note that the default implementation here in terms of
-  /// IR-level fences exists for historical/compatibility reasons and is
-  /// *unsound* ! Fences cannot, in general, be used to restore sequential
-  /// consistency. For example, consider the following example:
-  /// atomic<int> x = y = 0;
-  /// int r1, r2, r3, r4;
-  /// Thread 0:
-  ///   x.store(1);
-  /// Thread 1:
-  ///   y.store(1);
-  /// Thread 2:
-  ///   r1 = x.load();
-  ///   r2 = y.load();
-  /// Thread 3:
-  ///   r3 = y.load();
-  ///   r4 = x.load();
-  /// r1 = r3 = 1 and r2 = r4 = 0 is impossible as long as the accesses are all
-  /// seq_cst. But if they are lowered to monotonic accesses, no amount of
-  /// IR-level fences can prevent it.
+  ///
+  /// The default implementation emits an IR fence before any release (or
+  /// stronger) operation that stores, and after any acquire (or stronger)
+  /// operation. This is generally a correct implementation, but backends may
+  /// override if they wish to use alternative schemes (e.g. the PowerPC
+  /// standard ABI uses a fence before a seq_cst load instead of after a
+  /// seq_cst store).
   /// @{
   virtual Instruction *emitLeadingFence(IRBuilderBase &Builder,
                                         Instruction *Inst,

From 51f9e982ed5c9a1f39c50d0501ab1bcb6ebf5de4 Mon Sep 17 00:00:00 2001
From: Philip Reames 
Date: Tue, 23 Jan 2024 08:55:48 -0800
Subject: [PATCH 651/843] [RISCV] Use early return for select shuffle lowering
 [nfc]

Minor rework of the fallback case for two-argument shuffles in
lowerVECTOR_SHUFFLE. We had some common code which wasn't actually
common, and which simplifies significantly once specialized on whether
we have a select or not.
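
As an illustrative aside, not part of this commit: a two-source shuffle
is a "select" when every element keeps its own lane and the mask only
chooses which source provides it, so no element movement (vrgather) is
needed. A minimal standalone C++ sketch of that property follows; the
in-tree code computes the equivalent `IsSelect` flag over SDValues
earlier in lowerVECTOR_SHUFFLE, and the function name here is invented.

    #include <cstddef>
    #include <vector>

    // Mask convention: values in [0, NumElts) pick the same-numbered lane
    // of LHS, values in [NumElts, 2*NumElts) pick a lane of RHS, and -1 is
    // undef. On success, SelectLHS[i] is true when element i comes from LHS.
    static bool isSelectShuffle(const std::vector<int> &Mask,
                                std::size_t NumElts,
                                std::vector<bool> &SelectLHS) {
      SelectLHS.assign(NumElts, true);
      for (std::size_t I = 0; I < NumElts; ++I) {
        int M = Mask[I];
        if (M < 0)
          continue; // Undef element: either operand is acceptable.
        bool FromLHS = static_cast<std::size_t>(M) < NumElts;
        std::size_t Lane = FromLHS ? static_cast<std::size_t>(M)
                                   : static_cast<std::size_t>(M) - NumElts;
        if (Lane != I)
          return false; // Element changes lanes: needs a gather, not a select.
        SelectLHS[I] = FromLHS;
      }
      return true;
    }

When the property holds, the rework below builds the i1 mask and returns
an ISD::VSELECT immediately instead of threading the select case through
the shared vrgather path.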
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 64 ++++++++++++---------
 1 file changed, 38 insertions(+), 26 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 76e8d21b818b2..ff730cdd272b2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4852,41 +4852,56 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,

   assert(!V1.isUndef() && "Unexpected shuffle canonicalization");

-  SmallVector<SDValue> MaskVals;
-  // As a backup, shuffles can be lowered via a vrgather instruction, possibly
-  // merged with a second vrgather.
-  SmallVector<SDValue> GatherIndicesLHS, GatherIndicesRHS;
-
   // By default we preserve the original operand order, and use a mask to
   // select LHS as true and RHS as false. However, since RVV vector selects may
   // feature splats but only on the LHS, we may choose to invert our mask and
   // instead select between RHS and LHS.
   bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1);
-  bool InvertMask = IsSelect == SwapOps;
+
+  if (IsSelect) {
+    // Now construct the mask that will be used by the vselect operation.
+    SmallVector<SDValue> MaskVals;
+    for (int MaskIndex : Mask) {
+      bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ SwapOps;
+      MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
+    }
+
+    if (SwapOps)
+      std::swap(V1, V2);
+
+    assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle");
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+    SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
+    return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2);
+  }
+
+
+  // As a backup, shuffles can be lowered via a vrgather instruction, possibly
+  // merged with a second vrgather.
+  SmallVector<SDValue> GatherIndicesLHS, GatherIndicesRHS;

   // Keep a track of which non-undef indices are used by each LHS/RHS shuffle
   // half.
   DenseMap<int, unsigned> LHSIndexCounts, RHSIndexCounts;

-  // Now construct the mask that will be used by the vselect or blended
-  // vrgather operation. For vrgathers, construct the appropriate indices into
-  // each vector.
+  SmallVector<SDValue> MaskVals;
+
+  // Now construct the mask that will be used by the blended vrgather operation.
+  // Construct the appropriate indices into each vector.
   for (int MaskIndex : Mask) {
-    bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ InvertMask;
+    bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ !SwapOps;
     MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT));
-    if (!IsSelect) {
-      bool IsLHSOrUndefIndex = MaskIndex < (int)NumElts;
-      GatherIndicesLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0
-                                     ? DAG.getConstant(MaskIndex, DL, XLenVT)
-                                     : DAG.getUNDEF(XLenVT));
-      GatherIndicesRHS.push_back(
-          IsLHSOrUndefIndex ? DAG.getUNDEF(XLenVT)
-                            : DAG.getConstant(MaskIndex - NumElts, DL, XLenVT));
-      if (IsLHSOrUndefIndex && MaskIndex >= 0)
-        ++LHSIndexCounts[MaskIndex];
-      if (!IsLHSOrUndefIndex)
-        ++RHSIndexCounts[MaskIndex - NumElts];
-    }
+    bool IsLHSOrUndefIndex = MaskIndex < (int)NumElts;
+    GatherIndicesLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0
+                                   ? DAG.getConstant(MaskIndex, DL, XLenVT)
+                                   : DAG.getUNDEF(XLenVT));
+    GatherIndicesRHS.push_back(
+        IsLHSOrUndefIndex ?
DAG.getUNDEF(XLenVT) + : DAG.getConstant(MaskIndex - NumElts, DL, XLenVT)); + if (IsLHSOrUndefIndex && MaskIndex >= 0) + ++LHSIndexCounts[MaskIndex]; + if (!IsLHSOrUndefIndex) + ++RHSIndexCounts[MaskIndex - NumElts]; } if (SwapOps) { @@ -4898,9 +4913,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals); - if (IsSelect) - return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2); - // We might be able to express the shuffle as a bitrotate. But even if we // don't have Zvkb and have to expand, the expanded sequence of approx. 2 // shifts and a vor will have a higher throughput than a vrgather. From 42b28c694ebc8a4ab09ef0b419308d47fc1e21ba Mon Sep 17 00:00:00 2001 From: Sam Tebbs Date: Tue, 23 Jan 2024 17:21:52 +0000 Subject: [PATCH 652/843] [clang] Add missing streaming attributes to SVE builtins (#79134) This patch adds `IsStreamingCompatible` or `IsStreamingOrSVE2p1` to the SVE builtins that missed them. --- clang/include/clang/Basic/arm_sve.td | 86 +++++++++---------- .../acle_sve2p1_while_x2.c | 74 ++++++++-------- 2 files changed, 83 insertions(+), 77 deletions(-) diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td index 4c5c1b5603f18..6da30e08e7521 100644 --- a/clang/include/clang/Basic/arm_sve.td +++ b/clang/include/clang/Basic/arm_sve.td @@ -543,13 +543,13 @@ def SVADRD : SInst<"svadrd[_{0}base]_[{2}]index", "uud", "ilUiUl", MergeNone, " //////////////////////////////////////////////////////////////////////////////// // Scalar to vector -def SVDUPQ_8 : SInst<"svdupq[_n]_{d}", "dssssssssssssssss", "cUc", MergeNone>; -def SVDUPQ_16 : SInst<"svdupq[_n]_{d}", "dssssssss", "sUsh", MergeNone>; +def SVDUPQ_8 : SInst<"svdupq[_n]_{d}", "dssssssssssssssss", "cUc", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_16 : SInst<"svdupq[_n]_{d}", "dssssssss", "sUsh", MergeNone, "", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVDUPQ_BF16 : SInst<"svdupq[_n]_{d}", "dssssssss", "b", MergeNone>; + def SVDUPQ_BF16 : SInst<"svdupq[_n]_{d}", "dssssssss", "b", MergeNone, "", [IsStreamingCompatible]>; } -def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss", "iUif", MergeNone>; -def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss", "lUld", MergeNone>; +def SVDUPQ_32 : SInst<"svdupq[_n]_{d}", "dssss", "iUif", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_64 : SInst<"svdupq[_n]_{d}", "dss", "lUld", MergeNone, "", [IsStreamingCompatible]>; multiclass svdup_base { def NAME : SInst; @@ -638,8 +638,8 @@ def SVQADD_N_U : SInst<"svqadd[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64 def SVQSUB_N_S : SInst<"svqsub[_n_{d}]", "dda", "csil", MergeNone, "aarch64_sve_sqsub_x", [IsStreamingCompatible]>; def SVQSUB_N_U : SInst<"svqsub[_n_{d}]", "dda", "UcUsUiUl", MergeNone, "aarch64_sve_uqsub_x", [IsStreamingCompatible]>; -def SVDOT_LANE_S : SInst<"svdot_lane[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_sdot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; -def SVDOT_LANE_U : SInst<"svdot_lane[_{d}]", "ddqqi", "UiUl", MergeNone, "aarch64_sve_udot_lane", [], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVDOT_LANE_S : SInst<"svdot_lane[_{d}]", "ddqqi", "il", MergeNone, "aarch64_sve_sdot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; +def SVDOT_LANE_U : SInst<"svdot_lane[_{d}]", "ddqqi", "UiUl", MergeNone, "aarch64_sve_udot_lane", [IsStreamingCompatible], [ImmCheck<3, ImmCheckLaneIndexDot, 2>]>; 
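
As an illustrative aside, not part of this commit: the user-visible
effect of these streaming attributes, sketched against one of the
builtins exercised by the updated test further below. This assumes a
clang with the +sme2 target feature and ACLE streaming-attribute
support; the function name is invented.

    #include <arm_sve.h>

    // With IsStreamingOrSVE2p1, the paired while-predicate builtins are
    // accepted here because the function is streaming; when compiling with
    // only +sme2 (no +sve2p1), dropping __arm_streaming would make clang
    // reject the call.
    svboolx2_t first_lanes(int64_t i, int64_t n) __arm_streaming {
      return svwhilelt_b8_s64_x2(i, n);
    }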
//////////////////////////////////////////////////////////////////////////////// // Logical operations @@ -835,14 +835,14 @@ def SVSCALE_N_M : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeOp1, "aarch64_sv def SVSCALE_N_X : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeAny, "aarch64_sve_fscale", [IsStreamingCompatible]>; def SVSCALE_N_Z : SInst<"svscale[_n_{d}]", "dPdK", "hfd", MergeZero, "aarch64_sve_fscale", [IsStreamingCompatible]>; -defm SVMAD_F : SInstZPZZZ<"svmad", "hfd", "aarch64_sve_fmad", "aarch64_sve_fmla_u", [ReverseMergeAnyAccOp]>; -defm SVMLA_F : SInstZPZZZ<"svmla", "hfd", "aarch64_sve_fmla", "aarch64_sve_fmla_u">; -defm SVMLS_F : SInstZPZZZ<"svmls", "hfd", "aarch64_sve_fmls", "aarch64_sve_fmls_u">; -defm SVMSB_F : SInstZPZZZ<"svmsb", "hfd", "aarch64_sve_fmsb", "aarch64_sve_fmls_u", [ReverseMergeAnyAccOp]>; -defm SVNMAD_F : SInstZPZZZ<"svnmad", "hfd", "aarch64_sve_fnmad", "aarch64_sve_fnmla_u", [ReverseMergeAnyAccOp]>; -defm SVNMLA_F : SInstZPZZZ<"svnmla", "hfd", "aarch64_sve_fnmla", "aarch64_sve_fnmla_u">; -defm SVNMLS_F : SInstZPZZZ<"svnmls", "hfd", "aarch64_sve_fnmls", "aarch64_sve_fnmls_u">; -defm SVNMSB_F : SInstZPZZZ<"svnmsb", "hfd", "aarch64_sve_fnmsb", "aarch64_sve_fnmls_u", [ReverseMergeAnyAccOp]>; +defm SVMAD_F : SInstZPZZZ<"svmad", "hfd", "aarch64_sve_fmad", "aarch64_sve_fmla_u", [IsStreamingCompatible, ReverseMergeAnyAccOp]>; +defm SVMLA_F : SInstZPZZZ<"svmla", "hfd", "aarch64_sve_fmla", "aarch64_sve_fmla_u", [IsStreamingCompatible]>; +defm SVMLS_F : SInstZPZZZ<"svmls", "hfd", "aarch64_sve_fmls", "aarch64_sve_fmls_u", [IsStreamingCompatible]>; +defm SVMSB_F : SInstZPZZZ<"svmsb", "hfd", "aarch64_sve_fmsb", "aarch64_sve_fmls_u", [IsStreamingCompatible, ReverseMergeAnyAccOp]>; +defm SVNMAD_F : SInstZPZZZ<"svnmad", "hfd", "aarch64_sve_fnmad", "aarch64_sve_fnmla_u", [IsStreamingCompatible, ReverseMergeAnyAccOp]>; +defm SVNMLA_F : SInstZPZZZ<"svnmla", "hfd", "aarch64_sve_fnmla", "aarch64_sve_fnmla_u", [IsStreamingCompatible]>; +defm SVNMLS_F : SInstZPZZZ<"svnmls", "hfd", "aarch64_sve_fnmls", "aarch64_sve_fnmls_u", [IsStreamingCompatible]>; +defm SVNMSB_F : SInstZPZZZ<"svnmsb", "hfd", "aarch64_sve_fnmsb", "aarch64_sve_fnmls_u", [IsStreamingCompatible, ReverseMergeAnyAccOp]>; def SVCADD_M : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeOp1, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; def SVCADD_X : SInst<"svcadd[_{d}]", "dPddi", "hfd", MergeAny, "aarch64_sve_fcadd", [IsStreamingCompatible], [ImmCheck<3, ImmCheckComplexRot90_270>]>; @@ -881,11 +881,11 @@ def SVACLE : SInst<"svacle[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facg def SVACLT : SInst<"svaclt[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare, IsStreamingCompatible]>; def SVCMPUO : SInst<"svcmpuo[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpuo", [IsStreamingCompatible]>; -def SVACGE_N : SInst<"svacge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge">; -def SVACGT_N : SInst<"svacgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt">; -def SVACLE_N : SInst<"svacle[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare]>; -def SVACLT_N : SInst<"svaclt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare]>; -def SVCMPUO_N : SInst<"svcmpuo[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpuo">; +def SVACGE_N : SInst<"svacge[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge", [IsStreamingCompatible]>; +def SVACGT_N : SInst<"svacgt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", 
[IsStreamingCompatible]>; +def SVACLE_N : SInst<"svacle[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facge", [ReverseCompare, IsStreamingCompatible]>; +def SVACLT_N : SInst<"svaclt[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_facgt", [ReverseCompare, IsStreamingCompatible]>; +def SVCMPUO_N : SInst<"svcmpuo[_n_{d}]", "PPda", "hfd", MergeNone, "aarch64_sve_fcmpuo", [IsStreamingCompatible]>; def SVCMPEQ_F : SInst<"svcmpeq[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpeq", [IsStreamingCompatible]>; def SVCMPNE_F : SInst<"svcmpne[_{d}]", "PPdd", "hfd", MergeNone, "aarch64_sve_fcmpne", [IsStreamingCompatible]>; @@ -1023,15 +1023,15 @@ def SVCOMPACT : SInst<"svcompact[_{d}]", "dPd", "ilUiUlfd", MergeNo // splat of any possible lane. It is upto LLVM to pick a more efficient // instruction such as DUP (indexed) if the lane index fits the range of the // instruction's immediate. -def SVDUP_LANE : SInst<"svdup_lane[_{d}]", "ddL", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl">; +def SVDUP_LANE : SInst<"svdup_lane[_{d}]", "ddL", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { def SVDUP_LANE_BF16 : - SInst<"svdup_lane[_{d}]", "ddL", "b", MergeNone, "aarch64_sve_tbl">; + SInst<"svdup_lane[_{d}]", "ddL", "b", MergeNone, "aarch64_sve_tbl", [IsStreamingCompatible]>; } -def SVDUPQ_LANE : SInst<"svdupq_lane[_{d}]", "ddn", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_dupq_lane">; +def SVDUPQ_LANE : SInst<"svdupq_lane[_{d}]", "ddn", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_dupq_lane", [IsStreamingCompatible]>; let TargetGuard = "sve,bf16" in { - def SVDUPQ_LANE_BF16 : SInst<"svdupq_lane[_{d}]", "ddn", "b", MergeNone, "aarch64_sve_dupq_lane">; + def SVDUPQ_LANE_BF16 : SInst<"svdupq_lane[_{d}]", "ddn", "b", MergeNone, "aarch64_sve_dupq_lane", [IsStreamingCompatible]>; } def SVEXT : SInst<"svext[_{d}]", "dddi", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ext", [IsStreamingCompatible], [ImmCheck<2, ImmCheckExtract, 1>]>; defm SVLASTA : SVEPerm<"svlasta[_{d}]", "sPd", "aarch64_sve_lasta">; @@ -1109,11 +1109,11 @@ def SVPFALSE : SInst<"svpfalse[_b]", "Pv", "", MergeNone, "", [IsOverloadNone, I def SVPTRUE_PAT : SInst<"svptrue_pat_{d}", "PI", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsStreamingCompatible]>; def SVPTRUE : SInst<"svptrue_{d}", "Pv", "PcPsPiPl", MergeNone, "aarch64_sve_ptrue", [IsAppendSVALL, IsStreamingCompatible]>; -def SVDUPQ_B8 : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone>; -def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone>; -def SVDUPQ_B32 : SInst<"svdupq[_n]_{d}", "Pssss", "Pi", MergeNone>; -def SVDUPQ_B64 : SInst<"svdupq[_n]_{d}", "Pss", "Pl", MergeNone>; -def SVDUP_N_B : SInst<"svdup[_n]_{d}", "Ps", "PcPsPiPl", MergeNone>; +def SVDUPQ_B8 : SInst<"svdupq[_n]_{d}", "Pssssssssssssssss", "Pc", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_B16 : SInst<"svdupq[_n]_{d}", "Pssssssss", "Ps", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_B32 : SInst<"svdupq[_n]_{d}", "Pssss", "Pi", MergeNone, "", [IsStreamingCompatible]>; +def SVDUPQ_B64 : SInst<"svdupq[_n]_{d}", "Pss", "Pl", MergeNone, "", [IsStreamingCompatible]>; +def SVDUP_N_B : SInst<"svdup[_n]_{d}", "Ps", "PcPsPiPl", MergeNone, "", [IsStreamingCompatible]>; //////////////////////////////////////////////////////////////////////////////// @@ -1268,10 +1268,10 @@ def SVZIP2Q : SInst<"svzip2q[_{d}]", "ddd", "csilUcUsUiUlhfd", MergeNo let TargetGuard = "sve,bf16,f64mm" in { def SVTRN1Q_BF16 : SInst<"svtrn1q[_{d}]", "ddd", 
"b", MergeNone, "aarch64_sve_trn1q">; def SVTRN2Q_BF16 : SInst<"svtrn2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_trn2q">; -def SVUZP1Q_BF16 : SInst<"svuzp1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1q">; -def SVUZP2Q_BF16 : SInst<"svuzp2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2q">; -def SVZIP1Q_BF16 : SInst<"svzip1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1q">; -def SVZIP2Q_BF16 : SInst<"svzip2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2q">; +def SVUZP1Q_BF16 : SInst<"svuzp1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp1q", [IsStreamingCompatible]>; +def SVUZP2Q_BF16 : SInst<"svuzp2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_uzp2q", [IsStreamingCompatible]>; +def SVZIP1Q_BF16 : SInst<"svzip1q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip1q", [IsStreamingCompatible]>; +def SVZIP2Q_BF16 : SInst<"svzip2q[_{d}]", "ddd", "b", MergeNone, "aarch64_sve_zip2q", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// @@ -1347,14 +1347,14 @@ def SVWHILEHS_U64 : SInst<"svwhilege_{d}[_{1}]", "Pnn", "PUcPUsPUiPUl", MergeNon } let TargetGuard = "sve2p1|sme2" in { - def SVWHILEGE_S64_X2 : SInst<"svwhilege_{d}[_{1}]_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege_x2">; - def SVWHILEGT_S64_X2 : SInst<"svwhilegt_{d}[_{1}]_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt_x2">; - def SVWHILEHI_U64_X2 : SInst<"svwhilegt_{d}[_{1}]_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilehi_x2">; - def SVWHILEHS_U64_X2 : SInst<"svwhilege_{d}[_{1}]_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilehs_x2">; - def SVWHILELE_S64_X2 : SInst<"svwhilele_{d}[_{1}]_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele_x2">; - def SVWHILELT_S64_X2 : SInst<"svwhilelt_{d}[_{1}]_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt_x2">; - def SVWHILELO_U64_X2 : SInst<"svwhilelt_{d}[_{1}]_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilelo_x2">; - def SVWHILELS_U64_X2 : SInst<"svwhilele_{d}[_{1}]_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilels_x2">; + def SVWHILEGE_S64_X2 : SInst<"svwhilege_{d}[_{1}]_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilege_x2", [IsStreamingOrSVE2p1]>; + def SVWHILEGT_S64_X2 : SInst<"svwhilegt_{d}[_{1}]_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilegt_x2", [IsStreamingOrSVE2p1]>; + def SVWHILEHI_U64_X2 : SInst<"svwhilegt_{d}[_{1}]_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilehi_x2", [IsStreamingOrSVE2p1]>; + def SVWHILEHS_U64_X2 : SInst<"svwhilege_{d}[_{1}]_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilehs_x2", [IsStreamingOrSVE2p1]>; + def SVWHILELE_S64_X2 : SInst<"svwhilele_{d}[_{1}]_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilele_x2", [IsStreamingOrSVE2p1]>; + def SVWHILELT_S64_X2 : SInst<"svwhilelt_{d}[_{1}]_x2", "2ll", "PcPsPiPl", MergeNone, "aarch64_sve_whilelt_x2", [IsStreamingOrSVE2p1]>; + def SVWHILELO_U64_X2 : SInst<"svwhilelt_{d}[_{1}]_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilelo_x2", [IsStreamingOrSVE2p1]>; + def SVWHILELS_U64_X2 : SInst<"svwhilele_{d}[_{1}]_x2", "2nn", "PcPsPiPl", MergeNone, "aarch64_sve_whilels_x2", [IsStreamingOrSVE2p1]>; } @@ -1831,8 +1831,8 @@ def SVPMULLB_PAIR : SInst<"svpmullb_pair[_{d}]", "ddd", "UcUi", Mer def SVPMULLB_PAIR_N : SInst<"svpmullb_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullb_pair", [IsStreamingCompatible]>; def SVPMULLT : SInst<"svpmullt[_{d}]", "dhh", "UsUl", MergeNone, "", [IsStreamingCompatible]>; def SVPMULLT_N : SInst<"svpmullt[_n_{d}]", "dhR", "UsUl", MergeNone, 
"", [IsStreamingCompatible]>; -def SVPMULLT_PAIR : SInst<"svpmullt_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">; -def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullt_pair">; +def SVPMULLT_PAIR : SInst<"svpmullt_pair[_{d}]", "ddd", "UcUi", MergeNone, "aarch64_sve_pmullt_pair", [IsStreamingCompatible]>; +def SVPMULLT_PAIR_N : SInst<"svpmullt_pair[_n_{d}]", "dda", "UcUi", MergeNone, "aarch64_sve_pmullt_pair", [IsStreamingCompatible]>; } //////////////////////////////////////////////////////////////////////////////// diff --git a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c index acead9be3f01d..475fa14e1165a 100644 --- a/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c +++ b/clang/test/CodeGen/aarch64-sve2p1-intrinsics/acle_sve2p1_while_x2.c @@ -1,11 +1,11 @@ // NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 // REQUIRES: aarch64-registered-target -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s // RUN: %clang_cc1 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -Wall -emit-llvm -o - -x c++ %s | opt -S -passes=mem2reg,tailcallelim | FileCheck %s -check-prefix=CPP-CHECK // RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve2p1 -S -disable-O0-optnone -Werror -o /dev/null %s -// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -S -disable-O0-optnone -Werror -o /dev/null %s +// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sme2 -DTEST_SME -S -disable-O0-optnone -Werror -o /dev/null %s #include #ifdef SVE_OVERLOADED_FORMS @@ -14,6 +14,12 @@ #define SVE_ACLE_FUNC(A1,A2,A3) A1##A2##A3 #endif +#ifndef TEST_SME +#define ATTR +#else +#define ATTR __arm_streaming +#endif + // CHECK-LABEL: define dso_local @test_svwhilege_b8_s64( // CHECK-SAME: i64 noundef [[OP1:%.*]], i64 noundef [[OP2:%.*]]) #[[ATTR0:[0-9]+]] { // CHECK-NEXT: entry: @@ -34,7 +40,7 @@ // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svboolx2_t test_svwhilege_b8_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilege_b8_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_b8,_s64,_x2)(op1, op2); } @@ -58,7 +64,7 @@ svboolx2_t test_svwhilege_b8_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svboolx2_t test_svwhilege_b8_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilege_b8_u64(uint64_t op1, uint64_t op2) ATTR { return 
SVE_ACLE_FUNC(svwhilege_b8,_u64,_x2)(op1, op2); } @@ -86,7 +92,7 @@ svboolx2_t test_svwhilege_b8_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilege_b16_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilege_b16_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_b16,_s64,_x2)(op1, op2); } @@ -114,7 +120,7 @@ svboolx2_t test_svwhilege_b16_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilege_b16_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilege_b16_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_b16,_u64,_x2)(op1, op2); } @@ -142,7 +148,7 @@ svboolx2_t test_svwhilege_b16_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilege_b32_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilege_b32_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_b32,_s64,_x2)(op1, op2); } @@ -170,7 +176,7 @@ svboolx2_t test_svwhilege_b32_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilege_b32_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilege_b32_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_b32,_u64,_x2)(op1, op2); } @@ -198,7 +204,7 @@ svboolx2_t test_svwhilege_b32_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilege_b64_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilege_b64_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_b64,_s64,_x2)(op1, op2); } @@ -226,7 +232,7 @@ svboolx2_t test_svwhilege_b64_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilege_b64_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilege_b64_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilege_b64,_u64,_x2)(op1, op2); } @@ -250,7 +256,7 @@ svboolx2_t test_svwhilege_b64_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svboolx2_t test_svwhilegt_b8_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilegt_b8_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_b8,_s64,_x2)(op1, op2); } @@ -274,7 +280,7 @@ svboolx2_t test_svwhilegt_b8_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svboolx2_t test_svwhilegt_b8_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilegt_b8_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_b8,_u64,_x2)(op1, op2); } @@ -302,7 +308,7 @@ svboolx2_t test_svwhilegt_b8_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( 
[[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilegt_b16_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilegt_b16_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_b16,_s64,_x2)(op1, op2); } @@ -330,7 +336,7 @@ svboolx2_t test_svwhilegt_b16_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilegt_b16_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilegt_b16_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_b16,_u64,_x2)(op1, op2); } @@ -358,7 +364,7 @@ svboolx2_t test_svwhilegt_b16_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilegt_b32_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilegt_b32_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_b32,_s64,_x2)(op1, op2); } @@ -386,7 +392,7 @@ svboolx2_t test_svwhilegt_b32_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilegt_b32_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilegt_b32_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_b32,_u64,_x2)(op1, op2); } @@ -414,7 +420,7 @@ svboolx2_t test_svwhilegt_b32_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilegt_b64_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilegt_b64_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_b64,_s64,_x2)(op1, op2); } @@ -442,7 +448,7 @@ svboolx2_t test_svwhilegt_b64_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilegt_b64_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilegt_b64_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilegt_b64,_u64,_x2)(op1, op2); } @@ -466,7 +472,7 @@ svboolx2_t test_svwhilegt_b64_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svboolx2_t test_svwhilele_b8_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilele_b8_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_b8,_s64,_x2)(op1, op2); } @@ -490,7 +496,7 @@ svboolx2_t test_svwhilele_b8_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svboolx2_t test_svwhilele_b8_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilele_b8_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_b8,_u64,_x2)(op1, op2); } @@ -518,7 +524,7 @@ svboolx2_t test_svwhilele_b8_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilele_b16_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilele_b16_s64(int64_t op1, int64_t op2) ATTR { return 
SVE_ACLE_FUNC(svwhilele_b16,_s64,_x2)(op1, op2); } @@ -546,7 +552,7 @@ svboolx2_t test_svwhilele_b16_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilele_b16_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilele_b16_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_b16,_u64,_x2)(op1, op2); } @@ -574,7 +580,7 @@ svboolx2_t test_svwhilele_b16_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilele_b32_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilele_b32_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_b32,_s64,_x2)(op1, op2); } @@ -602,7 +608,7 @@ svboolx2_t test_svwhilele_b32_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilele_b32_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilele_b32_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_b32,_u64,_x2)(op1, op2); } @@ -630,7 +636,7 @@ svboolx2_t test_svwhilele_b32_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilele_b64_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilele_b64_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_b64,_s64,_x2)(op1, op2); } @@ -658,7 +664,7 @@ svboolx2_t test_svwhilele_b64_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilele_b64_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilele_b64_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilele_b64,_u64,_x2)(op1, op2); } @@ -682,7 +688,7 @@ svboolx2_t test_svwhilele_b64_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svboolx2_t test_svwhilelt_b8_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilelt_b8_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_b8,_s64,_x2)(op1, op2); } @@ -706,7 +712,7 @@ svboolx2_t test_svwhilelt_b8_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP4:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP2]], [[TMP3]], i64 16) // CPP-CHECK-NEXT: ret [[TMP4]] // -svboolx2_t test_svwhilelt_b8_u64(uint64_t op1, uint64_t op2) { +svboolx2_t test_svwhilelt_b8_u64(uint64_t op1, uint64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_b8,_u64,_x2)(op1, op2); } @@ -734,7 +740,7 @@ svboolx2_t test_svwhilelt_b8_u64(uint64_t op1, uint64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16) // CPP-CHECK-NEXT: ret [[TMP6]] // -svboolx2_t test_svwhilelt_b16_s64(int64_t op1, int64_t op2) { +svboolx2_t test_svwhilelt_b16_s64(int64_t op1, int64_t op2) ATTR { return SVE_ACLE_FUNC(svwhilelt_b16,_s64,_x2)(op1, op2); } @@ -762,7 +768,7 @@ svboolx2_t test_svwhilelt_b16_s64(int64_t op1, int64_t op2) { // CPP-CHECK-NEXT: [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( 
[[TMP3]], [[TMP5]], i64 16)
 // CPP-CHECK-NEXT:    ret [[TMP6]]
 //
-svboolx2_t test_svwhilelt_b16_u64(uint64_t op1, uint64_t op2) {
+svboolx2_t test_svwhilelt_b16_u64(uint64_t op1, uint64_t op2) ATTR {
   return SVE_ACLE_FUNC(svwhilelt_b16,_u64,_x2)(op1, op2);
 }
 
@@ -790,7 +796,7 @@ svboolx2_t test_svwhilelt_b16_u64(uint64_t op1, uint64_t op2) {
 // CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
 // CPP-CHECK-NEXT:    ret [[TMP6]]
 //
-svboolx2_t test_svwhilelt_b32_s64(int64_t op1, int64_t op2) {
+svboolx2_t test_svwhilelt_b32_s64(int64_t op1, int64_t op2) ATTR {
   return SVE_ACLE_FUNC(svwhilelt_b32,_s64,_x2)(op1, op2);
 }
 
@@ -818,7 +824,7 @@ svboolx2_t test_svwhilelt_b32_s64(int64_t op1, int64_t op2) {
 // CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
 // CPP-CHECK-NEXT:    ret [[TMP6]]
 //
-svboolx2_t test_svwhilelt_b32_u64(uint64_t op1, uint64_t op2) {
+svboolx2_t test_svwhilelt_b32_u64(uint64_t op1, uint64_t op2) ATTR {
   return SVE_ACLE_FUNC(svwhilelt_b32,_u64,_x2)(op1, op2);
 }
 
@@ -846,7 +852,7 @@ svboolx2_t test_svwhilelt_b32_u64(uint64_t op1, uint64_t op2) {
 // CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
 // CPP-CHECK-NEXT:    ret [[TMP6]]
 //
-svboolx2_t test_svwhilelt_b64_s64(int64_t op1, int64_t op2) {
+svboolx2_t test_svwhilelt_b64_s64(int64_t op1, int64_t op2) ATTR {
   return SVE_ACLE_FUNC(svwhilelt_b64,_s64,_x2)(op1, op2);
 }
 
@@ -874,6 +880,6 @@ svboolx2_t test_svwhilelt_b64_s64(int64_t op1, int64_t op2) {
 // CPP-CHECK-NEXT:    [[TMP6:%.*]] = tail call @llvm.vector.insert.nxv32i1.nxv16i1( [[TMP3]], [[TMP5]], i64 16)
 // CPP-CHECK-NEXT:    ret [[TMP6]]
 //
-svboolx2_t test_svwhilelt_b64_u64(uint64_t op1, uint64_t op2) {
+svboolx2_t test_svwhilelt_b64_u64(uint64_t op1, uint64_t op2) ATTR {
   return SVE_ACLE_FUNC(svwhilelt_b64,_u64,_x2)(op1, op2);
 }

From 2531a15dfd4eafda21622e3a828a93fc9a27e3b6 Mon Sep 17 00:00:00 2001
From: erichkeane
Date: Mon, 22 Jan 2024 14:22:12 -0800
Subject: [PATCH 653/843] [OpenACC] Implement 'device_type' clause parsing

'device_type' takes either an asterisk or a list of implementation-specific
identifiers. This patch implements the parsing for it.

Additionally, 'dtype' is an alias for 'device_type', though we're
implementing it as its own clause kind to improve future diagnostics, as this
will allow us to differentiate the spellings.
---
 clang/include/clang/Basic/OpenACCKinds.h |  12 +++
 clang/include/clang/Parse/Parser.h       |   3 +
 clang/lib/Parse/ParseOpenACC.cpp         |  46 ++++++++++
 clang/test/ParserOpenACC/parse-clauses.c | 112 +++++++++++++++++++++++
 4 files changed, 173 insertions(+)

diff --git a/clang/include/clang/Basic/OpenACCKinds.h b/clang/include/clang/Basic/OpenACCKinds.h
index 872faec0deaeb..6487a95910edd 100644
--- a/clang/include/clang/Basic/OpenACCKinds.h
+++ b/clang/include/clang/Basic/OpenACCKinds.h
@@ -232,6 +232,12 @@ enum class OpenACCClauseKind {
   DeviceNum,
   /// 'default_async' clause, allowed on 'set' construct.
   DefaultAsync,
+  /// 'device_type' clause, allowed on Constructs, 'data', 'init', 'shutdown',
+  /// 'set', 'update', 'loop', 'routine', and Combined constructs.
+  DeviceType,
+  /// 'dtype' clause, an alias for 'device_type', stored separately for
+  /// diagnostic purposes.
+  DType,
   /// Represents an invalid clause, for the purposes of parsing.
Invalid, @@ -348,6 +354,12 @@ inline const StreamingDiagnostic &operator<<(const StreamingDiagnostic &Out, case OpenACCClauseKind::DefaultAsync: return Out << "default_async"; + case OpenACCClauseKind::DeviceType: + return Out << "device_type"; + + case OpenACCClauseKind::DType: + return Out << "dtype"; + case OpenACCClauseKind::Invalid: return Out << ""; } diff --git a/clang/include/clang/Parse/Parser.h b/clang/include/clang/Parse/Parser.h index 6bfcd3a0d8412..ffbde370e8f9c 100644 --- a/clang/include/clang/Parse/Parser.h +++ b/clang/include/clang/Parse/Parser.h @@ -3587,6 +3587,9 @@ class Parser : public CodeCompletionHandler { /// Parses the clause kind of 'int-expr', which can be any integral /// expression. ExprResult ParseOpenACCIntExpr(); + /// Parses the 'device-type-list', which is a list of identifiers. + bool ParseOpenACCDeviceTypeList(); + private: //===--------------------------------------------------------------------===// // C++ 14: Templates [temp] diff --git a/clang/lib/Parse/ParseOpenACC.cpp b/clang/lib/Parse/ParseOpenACC.cpp index 8020c455feecb..9f7e63ecdc951 100644 --- a/clang/lib/Parse/ParseOpenACC.cpp +++ b/clang/lib/Parse/ParseOpenACC.cpp @@ -104,7 +104,9 @@ OpenACCClauseKind getOpenACCClauseKind(Token Tok) { .Case("device", OpenACCClauseKind::Device) .Case("device_num", OpenACCClauseKind::DeviceNum) .Case("device_resident", OpenACCClauseKind::DeviceResident) + .Case("device_type", OpenACCClauseKind::DeviceType) .Case("deviceptr", OpenACCClauseKind::DevicePtr) + .Case("dtype", OpenACCClauseKind::DType) .Case("finalize", OpenACCClauseKind::Finalize) .Case("firstprivate", OpenACCClauseKind::FirstPrivate) .Case("host", OpenACCClauseKind::Host) @@ -488,6 +490,8 @@ ClauseParensKind getClauseParensKind(OpenACCDirectiveKind DirKind, case OpenACCClauseKind::NumWorkers: case OpenACCClauseKind::DeviceNum: case OpenACCClauseKind::DefaultAsync: + case OpenACCClauseKind::DeviceType: + case OpenACCClauseKind::DType: return ClauseParensKind::Required; case OpenACCClauseKind::Auto: @@ -580,6 +584,38 @@ bool Parser::ParseOpenACCClauseVarList(OpenACCClauseKind Kind) { } return false; } + +/// OpenACC 3.3 Section 2.4: +/// The argument to the device_type clause is a comma-separated list of one or +/// more device architecture name identifiers, or an asterisk. +/// +/// The syntax of the device_type clause is +/// device_type( * ) +/// device_type( device-type-list ) +/// +/// The device_type clause may be abbreviated to dtype. +bool Parser::ParseOpenACCDeviceTypeList() { + + if (expectIdentifierOrKeyword(*this)) { + SkipUntil(tok::r_paren, tok::annot_pragma_openacc_end, + Parser::StopBeforeMatch); + return false; + } + ConsumeToken(); + + while (!getCurToken().isOneOf(tok::r_paren, tok::annot_pragma_openacc_end)) { + ExpectAndConsume(tok::comma); + + if (expectIdentifierOrKeyword(*this)) { + SkipUntil(tok::r_paren, tok::annot_pragma_openacc_end, + Parser::StopBeforeMatch); + return false; + } + ConsumeToken(); + } + return false; +} + // The OpenACC Clause List is a comma or space-delimited list of clauses (see // the comment on ParseOpenACCClauseList). The concept of a 'clause' doesn't // really have its owner grammar and each individual one has its own definition. @@ -709,6 +745,16 @@ bool Parser::ParseOpenACCClauseParams(OpenACCDirectiveKind DirKind, return true; break; } + case OpenACCClauseKind::DType: + case OpenACCClauseKind::DeviceType: + if (getCurToken().is(tok::star)) { + // FIXME: We want to mark that this is an 'everything else' type of + // device_type in Sema. 
+ ConsumeToken(); + } else if (ParseOpenACCDeviceTypeList()) { + return true; + } + break; default: llvm_unreachable("Not a required parens type?"); } diff --git a/clang/test/ParserOpenACC/parse-clauses.c b/clang/test/ParserOpenACC/parse-clauses.c index a80173c6eca3d..faf3caa2b1cfd 100644 --- a/clang/test/ParserOpenACC/parse-clauses.c +++ b/clang/test/ParserOpenACC/parse-clauses.c @@ -907,6 +907,118 @@ void IntExprParsing() { // expected-error@+2{{invalid tag 'length' on 'worker' clause}} // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} #pragma acc loop worker(length:returns_int()) +} + +void device_type() { + // expected-error@+2{{expected '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type + // expected-error@+2{{expected '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype + + // expected-error@+4{{expected identifier}} + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type( + // expected-error@+4{{expected identifier}} + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype( + + // expected-error@+2{{expected identifier}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type() + // expected-error@+2{{expected identifier}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype() + + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type(* + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype(* + + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type(ident + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype(ident + + // expected-error@+4{{expected ','}} + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type(ident ident2 + // expected-error@+4{{expected ','}} + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype(ident ident2 + + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type(ident, ident2 + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype(ident, ident2 + + // expected-error@+2{{expected identifier}} + // 
expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type(ident, ident2,) + // expected-error@+2{{expected identifier}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype(ident, ident2,) + + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type(*,) + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype(*,) + + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type(*,ident) + // expected-error@+3{{expected ')'}} + // expected-note@+2{{to match this '('}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype(*,ident) + + // expected-error@+2{{expected identifier}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type(ident, *) + // expected-error@+2{{expected identifier}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype(ident, *) + + // expected-error@+2{{expected identifier}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type("foo", 54) + // expected-error@+2{{expected identifier}} + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype(31, "bar") + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type(ident, auto, int, float) + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel dtype(ident, auto, int, float) + + // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} +#pragma acc parallel device_type(ident, auto, int, float) dtype(ident, auto, int, float) } // expected-warning@+1{{OpenACC directives not yet implemented, pragma ignored}} From 8ed1291d96eaf230ce289c9e62d28df3851d4372 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 23 Jan 2024 09:27:18 -0800 Subject: [PATCH 654/843] [MachineCopyPropagation] Make a SmallVector larger (NFC) (#79106) This patch makes a SmallVector slightly larger. We encounter quite a few instructions with 3 or 4 defs but very few beyond that on X86. This saves 0.39% of heap allocations during the compilation of a large preprocessed file, namely X86ISelLowering.cpp, for the X86 target. --- llvm/lib/CodeGen/MachineCopyPropagation.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp index 51e944d0279f2..9a0ab300b21b7 100644 --- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp +++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp @@ -853,7 +853,7 @@ void MachineCopyPropagation::ForwardCopyPropagateBlock(MachineBasicBlock &MBB) { forwardUses(MI); // Not a copy. 
-  SmallVector<const MachineOperand *, 2> Defs;
+  SmallVector<const MachineOperand *, 4> Defs;
   const MachineOperand *RegMask = nullptr;
   for (const MachineOperand &MO : MI.operands()) {
     if (MO.isRegMask())

From c5a33befcc328339a84c35f6899ff3f3309399fc Mon Sep 17 00:00:00 2001
From: Juergen Ributzka
Date: Tue, 23 Jan 2024 09:30:58 -0800
Subject: [PATCH 655/843] [clang][modules] Fix CodeGen options that can affect
 the AST. (#78816)

`OptimizationLevel` and `OptimizeSize` can affect the generated AST. They
indirectly affect the `Optimize` and `OptimizeSize` frontend options, which in
turn set predefined macro definitions.

This fixes rdar://121228252.
---
 clang/include/clang/Basic/CodeGenOptions.def  | 15 +++++++++++++--
 clang/lib/Basic/CodeGenOptions.cpp            |  2 ++
 clang/test/ClangScanDeps/strip-codegen-args.m | 11 +++++++----
 3 files changed, 22 insertions(+), 6 deletions(-)

diff --git a/clang/include/clang/Basic/CodeGenOptions.def b/clang/include/clang/Basic/CodeGenOptions.def
index 2c4fb6745bc17..2f2e45d5cf63d 100644
--- a/clang/include/clang/Basic/CodeGenOptions.def
+++ b/clang/include/clang/Basic/CodeGenOptions.def
@@ -12,6 +12,9 @@
 // that have enumeration type and VALUE_CODEGENOPT is a code
 // generation option that describes a value rather than a flag.
 //
+// AFFECTING_VALUE_CODEGENOPT is used for code generation options that can
+// affect the AST.
+//
 //===----------------------------------------------------------------------===//
 #ifndef CODEGENOPT
 # error Define the CODEGENOPT macro to handle language options
@@ -27,6 +30,11 @@ CODEGENOPT(Name, Bits, Default)
 CODEGENOPT(Name, Bits, Default)
 #endif
 
+#ifndef AFFECTING_VALUE_CODEGENOPT
+# define AFFECTING_VALUE_CODEGENOPT(Name, Bits, Default) \
+VALUE_CODEGENOPT(Name, Bits, Default)
+#endif
+
 CODEGENOPT(DisableIntegratedAS, 1, 0) ///< -no-integrated-as
 CODEGENOPT(RelaxELFRelocations, 1, 1) ///< -Wa,-mrelax-relocations={yes,no}
 CODEGENOPT(AsmVerbose , 1, 0) ///< -dA, -fverbose-asm.
@@ -193,8 +201,10 @@ ENUM_CODEGENOPT(ObjCDispatchMethod, ObjCDispatchMethodKind, 2, Legacy)
 CODEGENOPT(ObjCConvertMessagesToRuntimeCalls , 1, 1)
 CODEGENOPT(ObjCAvoidHeapifyLocalBlocks, 1, 0)
 
-VALUE_CODEGENOPT(OptimizationLevel, 2, 0) ///< The -O[0-3] option specified.
-VALUE_CODEGENOPT(OptimizeSize, 2, 0) ///< If -Os (==1) or -Oz (==2) is specified.
+
+// The optimization options affect frontend options, which in turn affect the AST.
+AFFECTING_VALUE_CODEGENOPT(OptimizationLevel, 2, 0) ///< The -O[0-3] option specified.
+AFFECTING_VALUE_CODEGENOPT(OptimizeSize, 2, 0) ///< If -Os (==1) or -Oz (==2) is specified.
 
 CODEGENOPT(AtomicProfileUpdate , 1, 0) ///< Set -fprofile-update=atomic
 /// Choose profile instrumenation kind or no instrumentation.
@@ -437,3 +447,4 @@ CODEGENOPT(CtorDtorReturnThis, 1, 0)
 #undef CODEGENOPT
 #undef ENUM_CODEGENOPT
 #undef VALUE_CODEGENOPT
+#undef AFFECTING_VALUE_CODEGENOPT
diff --git a/clang/lib/Basic/CodeGenOptions.cpp b/clang/lib/Basic/CodeGenOptions.cpp
index 182d0a2fa4d88..79d715305ef20 100644
--- a/clang/lib/Basic/CodeGenOptions.cpp
+++ b/clang/lib/Basic/CodeGenOptions.cpp
@@ -27,6 +27,8 @@ void CodeGenOptions::resetNonModularOptions(StringRef ModuleFormat) {
 #define ENUM_DEBUGOPT(Name, Type, Bits, Default)
 #define CODEGENOPT(Name, Bits, Default) Name = Default;
 #define ENUM_CODEGENOPT(Name, Type, Bits, Default) set##Name(Default);
+// Do not reset AST affecting code generation options.
+#define AFFECTING_VALUE_CODEGENOPT(Name, Bits, Default) #include "clang/Basic/CodeGenOptions.def" // Next reset all debug options that can always be reset, because they never diff --git a/clang/test/ClangScanDeps/strip-codegen-args.m b/clang/test/ClangScanDeps/strip-codegen-args.m index bb7e76e86aa2f..71171f4983386 100644 --- a/clang/test/ClangScanDeps/strip-codegen-args.m +++ b/clang/test/ClangScanDeps/strip-codegen-args.m @@ -5,11 +5,14 @@ // RUN: clang-scan-deps -compilation-database %t/cdb1.json -format experimental-full > %t/result1.txt // RUN: FileCheck %s -input-file %t/result1.txt -// This tests that codegen option that do not affect the AST or generation of a module are removed. +// This tests that codegen option that do not affect the AST or generation of a +// module are removed. It also tests that the optimization options that affect +// the AST are not reset to -O0. // CHECK: "modules": [ // CHECK-NEXT: { // CHECK: "command-line": [ +// CHECK-NOT: "-O0" // CHECK-NOT: "-flto" // CHECK-NOT: "-fno-autolink" // CHECK-NOT: "-mrelax-relocations=no" @@ -23,17 +26,17 @@ [ { "directory": "DIR", - "command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -flto -fno-autolink -Xclang -mrelax-relocations=no -fsyntax-only DIR/t1.m", + "command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -O2 -flto -fno-autolink -Xclang -mrelax-relocations=no -fsyntax-only DIR/t1.m", "file": "DIR/t1.m" }, { "directory": "DIR", - "command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -flto=thin -fautolink -fsyntax-only DIR/t2.m", + "command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -O2 -flto=thin -fautolink -fsyntax-only DIR/t2.m", "file": "DIR/t2.m" }, { "directory": "DIR", - "command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -flto=full -fsyntax-only DIR/t3.m", + "command": "clang -Imodules/A -fmodules -fmodules-cache-path=DIR/module-cache -fimplicit-modules -O2 -flto=full -fsyntax-only DIR/t3.m", "file": "DIR/t2.m" } ] From a0f69be26293dfb3b6c65ca65bd68f735f60c5a3 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 23 Jan 2024 09:22:29 -0800 Subject: [PATCH 656/843] [RISCV] Continue with early return for shuffle lowering [nfc] Move two cases where we're not actually going to use any of our computed index vectors or mask values above the computation of the same. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 27 ++++++++++----------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ff730cdd272b2..2fd4479ca5fe0 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4875,6 +4875,19 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2); } + // We might be able to express the shuffle as a bitrotate. But even if we + // don't have Zvkb and have to expand, the expanded sequence of approx. 2 + // shifts and a vor will have a higher throughput than a vrgather. + if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget)) + return V; + + if (VT.getScalarSizeInBits() == 8 && VT.getVectorNumElements() > 256) { + // On such a large vector we're unable to use i8 as the index type. 
+    // FIXME: We could promote the index to i16 and use vrgatherei16, but that
+    // may involve vector splitting if we're already at LMUL=8, or our
+    // user-supplied maximum fixed-length LMUL.
+    return SDValue();
+  }
 
   // As a backup, shuffles can be lowered via a vrgather instruction, possibly
   // merged with a second vrgather.
@@ -4913,20 +4926,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
   MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
   SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals);
 
-  // We might be able to express the shuffle as a bitrotate. But even if we
-  // don't have Zvkb and have to expand, the expanded sequence of approx. 2
-  // shifts and a vor will have a higher throughput than a vrgather.
-  if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget))
-    return V;
-
-  if (VT.getScalarSizeInBits() == 8 && VT.getVectorNumElements() > 256) {
-    // On such a large vector we're unable to use i8 as the index type.
-    // FIXME: We could promote the index to i16 and use vrgatherei16, but that
-    // may involve vector splitting if we're already at LMUL=8, or our
-    // user-supplied maximum fixed-length LMUL.
-    return SDValue();
-  }
-
   unsigned GatherVXOpc = RISCVISD::VRGATHER_VX_VL;
   unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL;
   MVT IndexVT = VT.changeTypeToInteger();

From d360963aaa90710752d684035404db80c3dc1645 Mon Sep 17 00:00:00 2001
From: Craig Topper
Date: Tue, 23 Jan 2024 09:33:06 -0800
Subject: [PATCH 657/843] [RISCV] Add regalloc hints for Zcb instructions.
 (#78949)

This hints the register allocator to use the same register for source and
destination to enable more compression.
---
 llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp   | 40 +++++++--
 llvm/test/CodeGen/RISCV/zcb-regalloc-hints.ll | 86 +++++++++++++++++++
 2 files changed, 120 insertions(+), 6 deletions(-)
 create mode 100644 llvm/test/CodeGen/RISCV/zcb-regalloc-hints.ll

diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 730838ea004aa..d7bb46a221dd2 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -753,6 +753,7 @@ bool RISCVRegisterInfo::getRegAllocationHints(
     SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
     const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
   const MachineRegisterInfo *MRI = &MF.getRegInfo();
+  auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
   bool BaseImplRetVal = TargetRegisterInfo::getRegAllocationHints(
       VirtReg, Order, Hints, MF, VRM, Matrix);
 
@@ -776,7 +777,7 @@ bool RISCVRegisterInfo::getRegAllocationHints(
   // This is all of the compressible binary instructions. If an instruction
   // needs GPRC register class operands \p NeedGPRC will be set to true.
- auto isCompressible = [](const MachineInstr &MI, bool &NeedGPRC) { + auto isCompressible = [&Subtarget](const MachineInstr &MI, bool &NeedGPRC) { NeedGPRC = false; switch (MI.getOpcode()) { default: @@ -789,9 +790,16 @@ bool RISCVRegisterInfo::getRegAllocationHints( case RISCV::SUBW: NeedGPRC = true; return true; - case RISCV::ANDI: + case RISCV::ANDI: { NeedGPRC = true; - return MI.getOperand(2).isImm() && isInt<6>(MI.getOperand(2).getImm()); + if (!MI.getOperand(2).isImm()) + return false; + int64_t Imm = MI.getOperand(2).getImm(); + if (isInt<6>(Imm)) + return true; + // c.zext.b + return Subtarget.hasStdExtZcb() && Imm == 255; + } case RISCV::SRAI: case RISCV::SRLI: NeedGPRC = true; @@ -802,6 +810,24 @@ bool RISCVRegisterInfo::getRegAllocationHints( case RISCV::ADDI: case RISCV::ADDIW: return MI.getOperand(2).isImm() && isInt<6>(MI.getOperand(2).getImm()); + case RISCV::MUL: + case RISCV::SEXT_B: + case RISCV::SEXT_H: + case RISCV::ZEXT_H_RV32: + case RISCV::ZEXT_H_RV64: + // c.mul, c.sext.b, c.sext.h, c.zext.h + NeedGPRC = true; + return Subtarget.hasStdExtZcb(); + case RISCV::ADD_UW: + // c.zext.w + NeedGPRC = true; + return Subtarget.hasStdExtZcb() && MI.getOperand(2).isReg() && + MI.getOperand(2).getReg() == RISCV::X0; + case RISCV::XORI: + // c.not + NeedGPRC = true; + return Subtarget.hasStdExtZcb() && MI.getOperand(2).isImm() && + MI.getOperand(2).getImm() == -1; } }; @@ -823,13 +849,15 @@ bool RISCVRegisterInfo::getRegAllocationHints( bool NeedGPRC; if (isCompressible(MI, NeedGPRC)) { if (OpIdx == 0 && MI.getOperand(1).isReg()) { - if (!NeedGPRC || isCompressibleOpnd(MI.getOperand(2))) + if (!NeedGPRC || MI.getNumExplicitOperands() < 3 || + MI.getOpcode() == RISCV::ADD_UW || + isCompressibleOpnd(MI.getOperand(2))) tryAddHint(MO, MI.getOperand(1), NeedGPRC); if (MI.isCommutable() && MI.getOperand(2).isReg() && (!NeedGPRC || isCompressibleOpnd(MI.getOperand(1)))) tryAddHint(MO, MI.getOperand(2), NeedGPRC); - } else if (OpIdx == 1 && - (!NeedGPRC || isCompressibleOpnd(MI.getOperand(2)))) { + } else if (OpIdx == 1 && (!NeedGPRC || MI.getNumExplicitOperands() < 3 || + isCompressibleOpnd(MI.getOperand(2)))) { tryAddHint(MO, MI.getOperand(0), NeedGPRC); } else if (MI.isCommutable() && OpIdx == 2 && (!NeedGPRC || isCompressibleOpnd(MI.getOperand(1)))) { diff --git a/llvm/test/CodeGen/RISCV/zcb-regalloc-hints.ll b/llvm/test/CodeGen/RISCV/zcb-regalloc-hints.ll new file mode 100644 index 0000000000000..545d6c6aca041 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/zcb-regalloc-hints.ll @@ -0,0 +1,86 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc < %s -mtriple=riscv64 -mattr=+m,+zba,+zbb,+zcb | FileCheck %s + +define i64 @c_not(i64 %x, i64 %y, i64 %z) { +; CHECK-LABEL: c_not: +; CHECK: # %bb.0: +; CHECK-NEXT: not a1, a1 +; CHECK-NEXT: li a0, 1234 +; CHECK-NEXT: mul a0, a0, a1 +; CHECK-NEXT: ret + %a = xor i64 %y, -1 + %b = mul i64 %a, 1234 + ret i64 %b +} + +define i64 @c_mul(i64 %x, i64 %y, i64 %z, i64 %w) { +; CHECK-LABEL: c_mul: +; CHECK: # %bb.0: +; CHECK-NEXT: mul a1, a1, a2 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: or a0, a0, a1 +; CHECK-NEXT: ret + %a = mul i64 %y, %z + %b = or i64 %a, 4096 + ret i64 %b +} + +define i64 @c_sext_b(i64 %x, i8 %y, i64 %z) { +; CHECK-LABEL: c_sext_b: +; CHECK: # %bb.0: +; CHECK-NEXT: sext.b a1, a1 +; CHECK-NEXT: lui a0, 1 +; CHECK-NEXT: or a0, a0, a1 +; CHECK-NEXT: ret + %a = sext i8 %y to i64 + %b = or i64 %a, 4096 + ret i64 %b +} + +define i64 @c_sext_h(i64 %x, i16 %y, i64 %z) { +; 
CHECK-LABEL: c_sext_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sext.h a1, a1
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    ret
+  %a = sext i16 %y to i64
+  %b = or i64 %a, 4096
+  ret i64 %b
+}
+
+define i64 @c_zext_b(i64 %x, i8 %y, i64 %z) {
+; CHECK-LABEL: c_zext_b:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    andi a1, a1, 255
+; CHECK-NEXT:    lui a0, 1
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    ret
+  %a = zext i8 %y to i64
+  %b = or i64 %a, 4096
+  ret i64 %b
+}
+
+define i64 @c_zext_h(i64 %x, i16 %y) {
+; CHECK-LABEL: c_zext_h:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    zext.h a1, a1
+; CHECK-NEXT:    lui a0, 4096
+; CHECK-NEXT:    or a0, a0, a1
+; CHECK-NEXT:    ret
+  %a = zext i16 %y to i64
+  %b = or i64 %a, 16777216
+  ret i64 %b
+}
+
+define i64 @c_zext_w(i64 %x, i32 %y) {
+; CHECK-LABEL: c_zext_w:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    zext.w a1, a1
+; CHECK-NEXT:    li a0, 1234
+; CHECK-NEXT:    mul a0, a0, a1
+; CHECK-NEXT:    ret
+  %a = zext i32 %y to i64
+  %b = mul i64 %a, 1234
+  ret i64 %b
+}

From 03d362db7463f3451692fa6a80204b35dfe95037 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Tue, 23 Jan 2024 11:40:58 -0600
Subject: [PATCH 658/843] [libc][Docs] Update the GPU RPC documentation
 (#79069)

Summary:
This adds some more concrete information on the RPC interface. Hopefully this
is intelligible and provides some useful examples.
---
 libc/docs/gpu/rpc-diagram.svg |   1 +
 libc/docs/gpu/rpc.rst         | 296 +++++++++++++++++++++++++++++++++-
 libc/docs/gpu/testing.rst     |   6 +-
 3 files changed, 296 insertions(+), 7 deletions(-)
 create mode 100644 libc/docs/gpu/rpc-diagram.svg

diff --git a/libc/docs/gpu/rpc-diagram.svg b/libc/docs/gpu/rpc-diagram.svg
new file mode 100644
index 0000000000000..eb9b807e52004
--- /dev/null
+++ b/libc/docs/gpu/rpc-diagram.svg
@@ -0,0 +1 @@
+
\ No newline at end of file
diff --git a/libc/docs/gpu/rpc.rst b/libc/docs/gpu/rpc.rst
index b02053e2dc72a..78ae778671881 100644
--- a/libc/docs/gpu/rpc.rst
+++ b/libc/docs/gpu/rpc.rst
@@ -11,10 +11,298 @@ Remote Procedure Calls
 Remote Procedure Call Implementation
 ====================================
 
-Certain features from the standard C library, such as allocation or printing,
-require support from the operating system. We instead implement a remote
-procedure call (RPC) interface to allow submitting work from the GPU to a host
-server that forwards it to the host system.
+Traditionally, the C library abstracts over several functions that interface
+with the platform's operating system through system calls. The GPU however does
+not provide an operating system that can handle target-dependent operations.
+Instead, we implemented remote procedure calls to interface with the host's
+operating system while executing on a GPU.
+
+We implemented remote procedure calls using unified virtual memory to create a
+shared communication channel between the two processes. This memory is often
+pinned memory that can be accessed asynchronously and atomically by multiple
+processes simultaneously. This means that we can simply provide mutual
+exclusion on a shared buffer to swap work back and forth between the host
+system and the GPU. We can then use this to create a simple client-server
+protocol using this shared memory.
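+
+To make this concrete, the following is a minimal sketch (with invented names,
+not the actual implementation) of the kind of structure such a channel keeps
+in pinned memory. The two flag words are the "mailboxes" described below:
+
+.. code-block:: c++
+
+  #include <atomic>
+  #include <cstdint>
+
+  // Schematic only: both processes map this same pinned allocation, so
+  // atomic updates made by one side are observed by the other side.
+  struct SharedChannel {
+    std::atomic<uint32_t> inbox;  // written by the other side
+    std::atomic<uint32_t> outbox; // written by this side
+    uint64_t data[8];             // fixed-size packet holding the payload
+  };
+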
+This work treats the GPU as a client and the host as a server. The client
+initiates a communication while the server listens for them. In order to
+communicate between the host and the device, we simply maintain a buffer of
+memory and two mailboxes. One mailbox is write-only while the other is
+read-only. This exposes three primitive operations: using the buffer, giving
+away ownership, and waiting for ownership. This is implemented as a half-duplex
+transmission channel between the two sides. We decided to assign ownership of
+the buffer to the client when the inbox and outbox bits are equal and to the
+server when they are not.
+
+In order to make this transmission channel thread-safe, we abstract ownership
+of the given mailbox pair and buffer around a port, effectively acting as a
+lock and an index into the allocated buffer slice. The server and device have
+independent locks around the given port. In this scheme, the buffer can be used
+to communicate intent and data generically with the server. We then simply
+provide multiple copies of this protocol and expose them as multiple ports.
+
+If this were simply a standard CPU system, this would be sufficient. However,
+GPUs have many unique architectural challenges. First, GPU threads execute in
+lock-step with each other in groups typically called warps or wavefronts. We
+need to target the smallest unit of independent parallelism, so the RPC
+interface needs to handle an entire group of threads at once. This is done by
+increasing the size of the buffer and adding a thread mask argument so the
+server knows which threads are active when it handles the communication.
+Second, GPUs generally have no forward progress guarantees. In order to
+guarantee we do not encounter deadlocks while executing, it is required that
+the number of ports matches the maximum amount of hardware parallelism on the
+device. It is also very important that the thread mask remains consistent while
+interfacing with the port.
+
+.. image:: ./rpc-diagram.svg
+   :width: 75%
+   :align: center
+
+The above diagram outlines the architecture of the RPC interface. For clarity
+the following list will explain the operations done by the client and server
+respectively when initiating a communication.
+
+First, a communication from the perspective of the client:
+
+* The client searches for an available port and claims the lock.
+* The client checks that the port is still available to the current device and
+  continues if so.
+* The client writes its data to the fixed-size packet and toggles its outbox.
+* The client waits until its inbox matches its outbox.
+* The client reads the data from the fixed-size packet.
+* The client closes the port and continues executing.
+
+Now, the same communication from the perspective of the server:
+
+* The server searches for an available port with pending work and claims the
+  lock.
+* The server checks that the port is still available to the current device.
+* The server reads the opcode to perform the expected operation, in this
+  case a receive and then send.
+* The server reads the data from the fixed-size packet.
+* The server writes its data to the fixed-size packet and toggles its outbox.
+* The server closes the port and continues searching for ports that need to be
+  serviced.
+
+This architecture currently requires that the host periodically checks the RPC
+server's buffer for ports with pending work. Note that a port can be closed
+without waiting for its submitted work to be completed. This allows us to model
+asynchronous operations that do not need to wait until the server has completed
+them. If an operation requires more data than the fixed size buffer, we simply
+send multiple packets back and forth in a streaming fashion.
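+
+Putting the client steps together, the protocol can be sketched as the
+following pseudocode. Every name in it is invented for illustration; none of
+this is the real interface:
+
+.. code-block:: c++
+
+  // Sketch of one client transaction. Ownership follows the mailbox bits:
+  // the client owns the buffer while inbox == outbox, and toggling the
+  // outbox hands the buffer over to the server.
+  void client_transaction(uint64_t opcode, Packet &send, Packet &recv) {
+    Port port = claim_free_port(opcode); // take the lock on a free port
+    write_packet(port.buffer(), send);   // use the buffer
+    port.toggle_outbox();                // give away ownership
+    while (port.inbox() != port.outbox())
+      ;                                  // wait until ownership returns
+    read_packet(port.buffer(), recv);    // read the server's response
+    port.close();                        // release the lock
+  }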
+
+Server Library
+--------------
+
+The RPC server's basic functionality is provided by the LLVM C library. A
+static library called ``libllvmlibc_rpc_server.a`` includes handling for the
+basic operations, such as printing or exiting. This has a small API that
+handles setting up the unified buffer and an interface to check the opcodes.
+
+Some operations are too divergent to provide generic implementations for, such
+as allocating device-accessible memory. For these cases, we provide a callback
+registration scheme to add a custom handler for any given opcode through the
+port API. More information can be found in the installed header
+``<install>/include/gpu-none-llvm/rpc_server.h``.
+
+Client Example
+--------------
+
+The Client API is not currently exported by the LLVM C library. This is
+primarily due to being written in C++ and relying on internal data structures.
+It uses a simple send and receive interface with a fixed-size packet. The
+following example uses the RPC interface to call a function pointer on the
+server.
+
+This code first opens a port with the given opcode to facilitate the
+communication. It then copies over the argument struct to the server using the
+``send_n`` interface to stream arbitrary bytes. The next send operation
+provides the server with the function pointer that will be executed. The final
+receive operation is a no-op and simply forces the client to wait until the
+server is done. It can be omitted if asynchronous execution is desired.
+
+.. code-block:: c++
+
+  void rpc_host_call(void *fn, void *data, size_t size) {
+    rpc::Client::Port port = rpc::client.open<RPC_HOST_CALL>();
+    port.send_n(data, size);
+    port.send([=](rpc::Buffer *buffer) {
+      buffer->data[0] = reinterpret_cast<uintptr_t>(fn);
+    });
+    port.recv([](rpc::Buffer *) {});
+    port.close();
+  }
+
+Server Example
+--------------
+
+This example shows the server-side handling of the previous client example.
+When the server is checked, if there are any ports with pending work it will
+check the opcode and perform the appropriate action. In this case, the action
+is to call a function pointer provided by the client.
+
+In this example, the server simply runs forever in a separate thread for
+brevity's sake. Because the client is a GPU potentially handling several
+threads at once, the server needs to loop over all the active threads on the
+GPU. We abstract this into the ``LANE_SIZE`` variable, which is simply the
+device's warp or wavefront size. The identifier is simply the thread's index
+into the current warp or wavefront. We allocate memory to copy the struct data
+into, and then call the given function pointer with that copied data. The final
+send simply signals completion and uses the implicit thread mask to delete the
+temporary data.
+
+.. code-block:: c++
+
+  for(;;) {
+    auto port = server.try_open(index);
+    if (!port)
+      continue;
+
+    switch(port->get_opcode()) {
+    case RPC_HOST_CALL: {
+      uint64_t sizes[LANE_SIZE];
+      void *args[LANE_SIZE];
+      port->recv_n(args, sizes, [&](uint64_t size) { return new char[size]; });
+      port->recv([&](rpc::Buffer *buffer, uint32_t id) {
+        reinterpret_cast<void (*)(void *)>(buffer->data[0])(args[id]);
+      });
+      port->send([&](rpc::Buffer *, uint32_t id) {
+        delete[] reinterpret_cast<char *>(args[id]);
+      });
+      break;
+    }
+    default:
+      port->recv([](rpc::Buffer *) {});
+      break;
+    }
+  }
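+
+The callback registration scheme mentioned in the Server Library section above
+can replace this manual switch for custom opcodes. The exact interface lives in
+the installed ``rpc_server.h`` header; the sketch below only approximates it,
+and the registration function's signature shown here is an assumption:
+
+.. code-block:: c++
+
+  // Hypothetical registration of a custom handler. The server would invoke
+  // the callback whenever a port with the matching opcode has pending work.
+  void handle_host_call(rpc_port_t port, void *data);
+
+  if (rpc_status_t err = rpc_register_callback(
+          device, RPC_HOST_CALL, handle_host_call, /*data=*/nullptr))
+    handle_error(err);
+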
+CUDA Server Example
+-------------------
+
+The following code shows an example of using the exported RPC interface along
+with the C library to manually configure a working server using the CUDA
+language. Other runtimes can use the presence of the ``__llvm_libc_rpc_client``
+in the GPU executable as an indicator for whether or not the server can be
+checked. These details should ideally be handled by the GPU language runtime,
+but the following example shows how it can be used by a standard user.
+
+.. code-block:: cuda
+
+  #include <cstdio>
+  #include <cstdlib>
+  #include <cuda_runtime.h>
+
+  #include <gpu-none-llvm/rpc_server.h>
+
+  [[noreturn]] void handle_error(cudaError_t err) {
+    fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(err));
+    exit(EXIT_FAILURE);
+  }
+
+  [[noreturn]] void handle_error(rpc_status_t err) {
+    fprintf(stderr, "RPC error: %d\n", err);
+    exit(EXIT_FAILURE);
+  }
+
+  // The handle to the RPC client provided by the C library.
+  extern "C" __device__ void *__llvm_libc_rpc_client;
+
+  __global__ void get_client_ptr(void **ptr) { *ptr = __llvm_libc_rpc_client; }
+
+  // Obtain the RPC client's handle from the device. The CUDA language cannot
+  // look up the symbol directly like the driver API, so we launch a kernel to
+  // read it.
+  void *get_rpc_client() {
+    void *rpc_client = nullptr;
+    void **rpc_client_d = nullptr;
+
+    if (cudaError_t err = cudaMalloc(&rpc_client_d, sizeof(void *)))
+      handle_error(err);
+    get_client_ptr<<<1, 1>>>(rpc_client_d);
+    if (cudaError_t err = cudaDeviceSynchronize())
+      handle_error(err);
+    if (cudaError_t err = cudaMemcpy(&rpc_client, rpc_client_d, sizeof(void *),
+                                     cudaMemcpyDeviceToHost))
+      handle_error(err);
+    return rpc_client;
+  }
+
+  // Routines to allocate mapped memory that both the host and the device can
+  // access asynchronously to communicate with each other.
+  void *alloc_host(size_t size, void *) {
+    void *sharable_ptr;
+    if (cudaError_t err = cudaMallocHost(&sharable_ptr, size))
+      handle_error(err);
+    return sharable_ptr;
+  }
+
+  void free_host(void *ptr, void *) {
+    if (cudaError_t err = cudaFreeHost(ptr))
+      handle_error(err);
+  }
+
+  // The device-side overload of the standard C function to call.
+  extern "C" __device__ int puts(const char *);
+
+  // Calls the C library function from the GPU C library.
+  __global__ void hello() { puts("Hello world!"); }
+
+  int main() {
+    int device = 0;
+    // Initialize the RPC server to run on a single device.
+    if (rpc_status_t err = rpc_init(/*num_device=*/1))
+      handle_error(err);
+
+    // Initialize the RPC server to run on the given device.
+    if (rpc_status_t err =
+            rpc_server_init(device, RPC_MAXIMUM_PORT_COUNT,
+                            /*warp_size=*/32, alloc_host, /*data=*/nullptr))
+      handle_error(err);
+
+    // Initialize the RPC client by copying the buffer to the device's handle.
+    void *rpc_client = get_rpc_client();
+    if (cudaError_t err =
+            cudaMemcpy(rpc_client, rpc_get_client_buffer(device),
+                       rpc_get_client_size(), cudaMemcpyHostToDevice))
+      handle_error(err);
+
+    cudaStream_t stream;
+    if (cudaError_t err = cudaStreamCreate(&stream))
+      handle_error(err);
+
+    // Execute the kernel.
+    hello<<<1, 1, 0, stream>>>();
+
+    // While the kernel is executing, check the RPC server for work to do.
+    while (cudaStreamQuery(stream) == cudaErrorNotReady)
+      if (rpc_status_t err = rpc_handle_server(device))
+        handle_error(err);
+
+    // Shut down the server running on the given device.
+    if (rpc_status_t err =
+            rpc_server_shutdown(device, free_host, /*data=*/nullptr))
+      handle_error(err);
+
+    // Shut down the entire RPC server interface.
+    if (rpc_status_t err = rpc_shutdown())
+      handle_error(err);
+
+    return EXIT_SUCCESS;
+  }
+
+The above code must be compiled in CUDA's relocatable device code mode and with
+the advanced offloading driver to link in the library. Currently this can be
+done with the following invocation. Using LTO avoids the overhead normally
+associated with relocatable device code linking.
+
+.. code-block:: sh
+
+  $> clang++ -x cuda rpc.cpp --offload-arch=native -fgpu-rdc -lcudart -lcgpu \
+       -I<install>/include -L<install>/lib -lllvmlibc_rpc_server \
+       -O3 -foffload-lto -o hello
+  $> ./hello
+  Hello world!
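+
+Rather than polling between stream queries on the main thread, the server check
+can also run on a dedicated helper thread until the kernel finishes. This is a
+sketch that assumes only the interfaces already used above (plus ``<thread>``
+and ``<atomic>``):
+
+.. code-block:: c++
+
+  // Poll the RPC server from a helper thread while the stream executes.
+  std::atomic<bool> running(true);
+  std::thread server_thread([&]() {
+    while (running)
+      if (rpc_status_t err = rpc_handle_server(device))
+        handle_error(err);
+  });
+
+  if (cudaError_t err = cudaStreamSynchronize(stream))
+    handle_error(err);
+  running = false;
+  server_thread.join();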
 
 Extensions
 ----------
diff --git a/libc/docs/gpu/testing.rst b/libc/docs/gpu/testing.rst
index d0f162694562d..f793e938e9806 100644
--- a/libc/docs/gpu/testing.rst
+++ b/libc/docs/gpu/testing.rst
@@ -18,9 +18,9 @@ Testing Infrastructure
 ======================
 
 The testing support in LLVM's libc implementation for GPUs is designed to mimic
-the standard unit tests as much as possible. We use the `remote procedure call
-`_ support to provide the necessary utilities like printing from
-the GPU. Execution is performed by emitting a ``_start`` kernel from the GPU
+the standard unit tests as much as possible. We use the :ref:`libc_gpu_rpc`
+support to provide the necessary utilities like printing from the GPU. Execution
+is performed by emitting a ``_start`` kernel from the GPU
 that is then called by an external loader utility. This is an example of how
 this can be done manually:
 
From 3428c28b3d611ac608bb8e22633c65f605a6511e Mon Sep 17 00:00:00 2001
From: Nico Weber
Date: Tue, 23 Jan 2024 12:41:46 -0500
Subject: [PATCH 659/843] [gn] port 5176df55d3a

---
 llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
index e82ec494993d0..efb324713cfea 100644
--- a/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
+++ b/llvm/utils/gn/secondary/compiler-rt/test/BUILD.gn
@@ -64,7 +64,7 @@ write_cmake_config("lit_common_configured") {
     "SANITIZER_CAN_USE_CXXABI_PYBOOL=True",
     "SANITIZER_USE_STATIC_CXX_ABI_PYBOOL=False",
     "SANITIZER_USE_STATIC_LLVM_UNWINDER_PYBOOL=False",
-    "COMPILER_RT_HAS_AARCH64_SME=False",
+    "COMPILER_RT_HAS_AARCH64_SME_PYBOOL=False",
     "COMPILER_RT_HAS_LLD_PYBOOL=True",
     "COMPILER_RT_HAS_GWP_ASAN_PYBOOL=False",
     "HAVE_RPC_XDR_H=0",

From edeaf41e22cc7b1ab9ef30954762f2a08a3c3561 Mon Sep 17 00:00:00 2001
From: Alina Sbirlea
Date: Tue, 23 Jan 2024 09:42:47 -0800
Subject: [PATCH 660/843] [ConstantHoisting] Cache OptForSize. (#79170)

Cache OptForSize to remove quadratic behavior. For each constant analyzed,
ConstantHoisting calls `shouldOptimizeForSize(F)`, which calls
`PSI.getTotalCallCount(F)`. `PSI.getTotalCallCount(F)` goes through all the
instructions in all basic blocks and checks whether each one is a call in
order to count them.

This reduces `llc` time for a very large IR from ~10min to under 3min.
Reproducer testcase is much too large to share.
---
 llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h | 1 +
 llvm/lib/Transforms/Scalar/ConstantHoisting.cpp        | 7 ++++---
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h b/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
index fa13ed73d506a..a4d1653fdf4bc 100644
--- a/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
+++ b/llvm/include/llvm/Transforms/Scalar/ConstantHoisting.h
@@ -153,6 +153,7 @@ class ConstantHoistingPass : public PassInfoMixin<ConstantHoistingPass> {
   const DataLayout *DL;
   BasicBlock *Entry;
   ProfileSummaryInfo *PSI;
+  bool OptForSize;
 
   /// Keeps track of constant candidates found in the function.
 using ConstCandVecType = std::vector<ConstantCandidate>;
 
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 9e40d94dd73c7..49f8761a13923 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -576,9 +576,6 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
                                                ConstCandVecType::iterator &MaxCostItr) {
   unsigned NumUses = 0;
 
-  bool OptForSize = Entry->getParent()->hasOptSize() ||
-                    llvm::shouldOptimizeForSize(Entry->getParent(), PSI, BFI,
-                                                PGSOQueryType::IRPass);
   if (!OptForSize || std::distance(S,E) > 100) {
     for (auto ConstCand = S; ConstCand != E; ++ConstCand) {
       NumUses += ConstCand->Uses.size();
@@ -948,6 +945,10 @@ bool ConstantHoistingPass::runImpl(Function &Fn, TargetTransformInfo &TTI,
   this->Ctx = &Fn.getContext();
   this->Entry = &Entry;
   this->PSI = PSI;
+  this->OptForSize = Entry.getParent()->hasOptSize() ||
+                     llvm::shouldOptimizeForSize(Entry.getParent(), PSI, BFI,
+                                                 PGSOQueryType::IRPass);
+
   // Collect all constant candidates.
   collectConstantCandidates(Fn);
 
From 6a7abea47418721616c9dd7a16f5cc3e5a67d49c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?=
Date: Tue, 23 Jan 2024 18:44:07 +0100
Subject: [PATCH 661/843] Revert "[clang-repl] Enable native CPU detection by
 default (#77491)" (#79178)

Reverting because `clang-repl -Xcc -mcpu=arm1176jzf-s` isn't overriding this
as I had expected. We need to check whether a specific CPU flag was given by
the user first.

Reverts llvm/llvm-project#77491
---
 clang/lib/Interpreter/Interpreter.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp
index d1764d07dfd24..9f97a3c6b0be9 100644
--- a/clang/lib/Interpreter/Interpreter.cpp
+++ b/clang/lib/Interpreter/Interpreter.cpp
@@ -148,7 +148,6 @@ IncrementalCompilerBuilder::create(std::vector<const char *> &ClangArgv) {
   // We do C++ by default; append right after argv[0] if no "-x" given
   ClangArgv.insert(ClangArgv.end(), "-Xclang");
   ClangArgv.insert(ClangArgv.end(), "-fincremental-extensions");
-  ClangArgv.insert(ClangArgv.end(), "-mcpu=native");
   ClangArgv.insert(ClangArgv.end(), "-c");
 
   // Put a dummy C++ file on to ensure there's at least one compile job for the

From 15fdc7646ca24506ccb6a59861da8f3b52ab031c Mon Sep 17 00:00:00 2001
From: Alexandre Ganea
Date: Tue, 23 Jan 2024 12:03:12 -0500
Subject: [PATCH 662/843] Re-land [openmp] Fix warnings when building on
 Windows with latest MSVC or Clang ToT (#77853)

This reverts commit 94f960925b7f609636fc2ffd83053814d5e45ed1 and fixes it.
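
For reference, the recurring pattern applied to fully-covered switches looks
roughly like the following sketch (illustrative, not a literal excerpt from
the diff):

  static const char *hw_keyword(kmp_hw_t type) {
    switch (type) {
    case KMP_HW_SOCKET:
      return "socket";
    // ... every other enumerator handled explicitly ...
    }
    KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration");
    KMP_BUILTIN_UNREACHABLE; // __builtin_unreachable() or __assume(false)
  }

This asserts in debug builds and tells the compiler the fall-through path is
unreachable, silencing "not all control paths return a value" style warnings
without adding a default case that would suppress switch-coverage checks.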
--- openmp/cmake/HandleOpenMPOptions.cmake | 9 +++++++++ openmp/runtime/src/kmp_affinity.cpp | 25 +++++++++++++++++------- openmp/runtime/src/kmp_barrier.cpp | 8 ++++---- openmp/runtime/src/kmp_io.cpp | 19 ------------------ openmp/runtime/src/kmp_os.h | 16 +++++++++++++++ openmp/runtime/src/kmp_settings.cpp | 19 +++++++++++++++++- openmp/runtime/src/kmp_wait_release.h | 6 ------ openmp/runtime/src/z_Windows_NT_util.cpp | 3 ++- 8 files changed, 67 insertions(+), 38 deletions(-) diff --git a/openmp/cmake/HandleOpenMPOptions.cmake b/openmp/cmake/HandleOpenMPOptions.cmake index b0fd0b7de4bf7..201aeabbd3df9 100644 --- a/openmp/cmake/HandleOpenMPOptions.cmake +++ b/openmp/cmake/HandleOpenMPOptions.cmake @@ -41,3 +41,12 @@ append_if(OPENMP_HAVE_WMAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE append_if(OPENMP_HAVE_NO_SEMANTIC_INTERPOSITION "-fno-semantic-interposition" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_FUNCTION_SECTIONS "-ffunction-section" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) append_if(OPENMP_HAVE_DATA_SECTIONS "-fdata-sections" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + +if (MSVC) + # Disable "warning C4201: nonstandard extension used: nameless struct/union" + append("-wd4201" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) + + # Disable "warning C4190: '__kmpc_atomic_cmplx8_rd' has C-linkage specified, but returns + # UDT '__kmp_cmplx64_t' which is incompatible with C" + append("-wd4190" CMAKE_C_FLAGS CMAKE_CXX_FLAGS) +endif() diff --git a/openmp/runtime/src/kmp_affinity.cpp b/openmp/runtime/src/kmp_affinity.cpp index 7009730a49ba7..6a41d34b02372 100644 --- a/openmp/runtime/src/kmp_affinity.cpp +++ b/openmp/runtime/src/kmp_affinity.cpp @@ -127,8 +127,12 @@ const char *__kmp_hw_get_catalog_string(kmp_hw_t type, bool plural) { return ((plural) ? KMP_I18N_STR(Threads) : KMP_I18N_STR(Thread)); case KMP_HW_PROC_GROUP: return ((plural) ? KMP_I18N_STR(ProcGroups) : KMP_I18N_STR(ProcGroup)); + case KMP_HW_UNKNOWN: + case KMP_HW_LAST: + return KMP_I18N_STR(Unknown); } - return KMP_I18N_STR(Unknown); + KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration"); + KMP_BUILTIN_UNREACHABLE; } const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { @@ -157,13 +161,18 @@ const char *__kmp_hw_get_keyword(kmp_hw_t type, bool plural) { return ((plural) ? "threads" : "thread"); case KMP_HW_PROC_GROUP: return ((plural) ? "proc_groups" : "proc_group"); + case KMP_HW_UNKNOWN: + case KMP_HW_LAST: + return ((plural) ? "unknowns" : "unknown"); } - return ((plural) ? 
"unknowns" : "unknown"); + KMP_ASSERT2(false, "Unhandled kmp_hw_t enumeration"); + KMP_BUILTIN_UNREACHABLE; } const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { switch (type) { case KMP_HW_CORE_TYPE_UNKNOWN: + case KMP_HW_MAX_NUM_CORE_TYPES: return "unknown"; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 case KMP_HW_CORE_TYPE_ATOM: @@ -172,7 +181,8 @@ const char *__kmp_hw_get_core_type_string(kmp_hw_core_type_t type) { return "Intel(R) Core(TM) processor"; #endif } - return "unknown"; + KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration"); + KMP_BUILTIN_UNREACHABLE; } #if KMP_AFFINITY_SUPPORTED @@ -1238,17 +1248,18 @@ bool kmp_topology_t::filter_hw_subset() { struct core_type_indexer { int operator()(const kmp_hw_thread_t &t) const { switch (t.attrs.get_core_type()) { + case KMP_HW_CORE_TYPE_UNKNOWN: + case KMP_HW_MAX_NUM_CORE_TYPES: + return 0; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 case KMP_HW_CORE_TYPE_ATOM: return 1; case KMP_HW_CORE_TYPE_CORE: return 2; #endif - case KMP_HW_CORE_TYPE_UNKNOWN: - return 0; } - KMP_ASSERT(0); - return 0; + KMP_ASSERT2(false, "Unhandled kmp_hw_thread_t enumeration"); + KMP_BUILTIN_UNREACHABLE; } }; struct core_eff_indexer { diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp index 281b8e9c2883d..e9ab15f1723b6 100644 --- a/openmp/runtime/src/kmp_barrier.cpp +++ b/openmp/runtime/src/kmp_barrier.cpp @@ -2403,11 +2403,11 @@ void __kmp_fork_barrier(int gtid, int tid) { #if USE_ITT_BUILD void *itt_sync_obj = NULL; #endif /* USE_ITT_BUILD */ +#ifdef KMP_DEBUG if (team) - - KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, - (team != NULL) ? team->t.t_id : -1, tid)); - + KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d:%d) has arrived\n", gtid, + (team != NULL) ? team->t.t_id : -1, tid)); +#endif // th_team pointer only valid for primary thread here if (KMP_MASTER_TID(tid)) { #if USE_ITT_BUILD && USE_ITT_NOTIFY diff --git a/openmp/runtime/src/kmp_io.cpp b/openmp/runtime/src/kmp_io.cpp index 578e6e671cdf2..0c52662bc2357 100644 --- a/openmp/runtime/src/kmp_io.cpp +++ b/openmp/runtime/src/kmp_io.cpp @@ -50,24 +50,6 @@ static HANDLE __kmp_stderr = NULL; static int __kmp_console_exists = FALSE; static kmp_str_buf_t __kmp_console_buf; -static int is_console(void) { - char buffer[128]; - DWORD rc = 0; - DWORD err = 0; - // Try to get console title. - SetLastError(0); - // GetConsoleTitle does not reset last error in case of success or short - // buffer, so we need to clear it explicitly. - rc = GetConsoleTitle(buffer, sizeof(buffer)); - if (rc == 0) { - // rc == 0 means getting console title failed. Let us find out why. - err = GetLastError(); - // err == 0 means buffer too short (we suppose console exists). - // In Window applications we usually have err == 6 (invalid handle). 
- } - return rc > 0 || err == 0; -} - void __kmp_close_console(void) { /* wait until user presses return before closing window */ /* TODO only close if a window was opened */ @@ -84,7 +66,6 @@ void __kmp_close_console(void) { static void __kmp_redirect_output(void) { __kmp_acquire_bootstrap_lock(&__kmp_console_lock); - (void)is_console; if (!__kmp_console_exists) { HANDLE ho; HANDLE he; diff --git a/openmp/runtime/src/kmp_os.h b/openmp/runtime/src/kmp_os.h index 6862fd89b6302..a0552dd930a62 100644 --- a/openmp/runtime/src/kmp_os.h +++ b/openmp/runtime/src/kmp_os.h @@ -306,6 +306,8 @@ template <> struct traits_t { !KMP_MIC) #if KMP_OS_WINDOWS +// Don't include everything related to NT status code, we'll do that explicitly +#define WIN32_NO_STATUS #include static inline int KMP_GET_PAGE_SIZE(void) { @@ -1297,4 +1299,18 @@ extern void *__kmp_lookup_symbol(const char *name, bool next = false); #define KMP_DLSYM_NEXT(name) dlsym(RTLD_NEXT, name) #endif +// MSVC doesn't have this, but clang/clang-cl does. +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +// Same as LLVM_BUILTIN_UNREACHABLE. States that it is UB to reach this point. +#if __has_builtin(__builtin_unreachable) || defined(__GNUC__) +#define KMP_BUILTIN_UNREACHABLE __builtin_unreachable() +#elif defined(_MSC_VER) +#define KMP_BUILTIN_UNREACHABLE __assume(false) +#else +#define KMP_BUILTIN_UNREACHABLE +#endif + #endif /* KMP_OS_H */ diff --git a/openmp/runtime/src/kmp_settings.cpp b/openmp/runtime/src/kmp_settings.cpp index 30a4c05fe76b3..d2157b10b7819 100644 --- a/openmp/runtime/src/kmp_settings.cpp +++ b/openmp/runtime/src/kmp_settings.cpp @@ -873,6 +873,10 @@ static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name, case library_throughput: { value = "PASSIVE"; } break; + case library_none: + case library_serial: { + value = NULL; + } break; } } else { switch (__kmp_library) { @@ -885,6 +889,9 @@ static void __kmp_stg_print_wait_policy(kmp_str_buf_t *buffer, char const *name, case library_throughput: { value = "throughput"; } break; + case library_none: { + value = NULL; + } break; } } if (value != NULL) { @@ -2004,6 +2011,7 @@ static inline const char * __kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { switch (type) { case KMP_HW_CORE_TYPE_UNKNOWN: + case KMP_HW_MAX_NUM_CORE_TYPES: return "unknown"; #if KMP_ARCH_X86 || KMP_ARCH_X86_64 case KMP_HW_CORE_TYPE_ATOM: @@ -2012,7 +2020,8 @@ __kmp_hw_get_core_type_keyword(kmp_hw_core_type_t type) { return "intel_core"; #endif } - return "unknown"; + KMP_ASSERT2(false, "Unhandled kmp_hw_core_type_t enumeration"); + KMP_BUILTIN_UNREACHABLE; } #if KMP_AFFINITY_SUPPORTED @@ -4428,6 +4437,10 @@ static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer, case kmp_sch_auto: __kmp_str_buf_print(buffer, "%s,%d'\n", "auto", __kmp_chunk); break; + default: + KMP_ASSERT2(false, "Unhandled sched_type enumeration"); + KMP_BUILTIN_UNREACHABLE; + break; } } else { switch (sched) { @@ -4453,6 +4466,10 @@ static void __kmp_stg_print_omp_schedule(kmp_str_buf_t *buffer, case kmp_sch_auto: __kmp_str_buf_print(buffer, "%s'\n", "auto"); break; + default: + KMP_ASSERT2(false, "Unhandled sched_type enumeration"); + KMP_BUILTIN_UNREACHABLE; + break; } } } // __kmp_stg_print_omp_schedule diff --git a/openmp/runtime/src/kmp_wait_release.h b/openmp/runtime/src/kmp_wait_release.h index 3fcae5687d124..12d5d0677a90a 100644 --- a/openmp/runtime/src/kmp_wait_release.h +++ b/openmp/runtime/src/kmp_wait_release.h @@ -1038,15 +1038,9 @@ static inline void 
__kmp_null_resume_wrapper(kmp_info_t *thr) { case flag_oncore: __kmp_resume_oncore(gtid, RCAST(kmp_flag_oncore *, flag)); break; -#ifdef KMP_DEBUG case flag_unset: KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d is unset\n", type)); break; - default: - KF_TRACE(100, ("__kmp_null_resume_wrapper: flag type %d does not match any " - "known flag type\n", - type)); -#endif } } diff --git a/openmp/runtime/src/z_Windows_NT_util.cpp b/openmp/runtime/src/z_Windows_NT_util.cpp index 9e264ab45b87f..d75b48b2c1bcf 100644 --- a/openmp/runtime/src/z_Windows_NT_util.cpp +++ b/openmp/runtime/src/z_Windows_NT_util.cpp @@ -22,6 +22,7 @@ number of running threads in the system. */ #include // UNICODE_STRING +#undef WIN32_NO_STATUS #include #include #ifdef _MSC_VER @@ -1635,7 +1636,7 @@ int __kmp_get_load_balance(int max) { // threads on all cores. So, we don't consider the running threads of this // process. if (pid != 0) { - for (int i = 0; i < num; ++i) { + for (ULONG i = 0; i < num; ++i) { THREAD_STATE state = spi->Threads[i].State; // Count threads that have Ready or Running state. // !!! TODO: Why comment does not match the code??? From e4f0829d80aabe7d0b80d8ba60828dd0c9774300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?= Date: Tue, 23 Jan 2024 18:50:41 +0100 Subject: [PATCH 663/843] [Orc] Let LLJIT default to JITLink for ELF-based ARM targets (#77313) The JITLink AArch32 backend reached feature-parity with RuntimeDyld on ELF-based systems. This patch changes the default JIT-linker in Orc's LLJIT for these platforms. This allows us to run clang-repl with JITLink on ARM and use all the features we had with RuntimeDyld before. All existing tests for clang-repl are passing. --- llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index e259c393d07e0..833dcb9d5bf2e 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -734,6 +734,12 @@ Error LLJITBuilderState::prepareForConstruction() { case Triple::aarch64: UseJITLink = !TT.isOSBinFormatCOFF(); break; + case Triple::arm: + case Triple::armeb: + case Triple::thumb: + case Triple::thumbeb: + UseJITLink = TT.isOSBinFormatELF(); + break; case Triple::x86_64: UseJITLink = !TT.isOSBinFormatCOFF(); break; From 55a7bb0ff68a97a8ec640f846c40aae19a2d58c0 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 23 Jan 2024 09:56:13 -0800 Subject: [PATCH 664/843] [RISCV] Re-format RISCVFeatures.td so it doesn't look like AssemblerPredicate is an operand to Predicate. (#79076) AssemblerPredicate was almost always indented to the same column as the first operand to Predicate. But AssemblerPredicate is a separate base class, so it should have the same indentation as Predicate. For the string passed to AssemblerPredicate, I aligned it to the other arguments on the previous line if it fit in 80 columns. Otherwise I indented 4 spaces past the start of AssemblerPredicate. For some vendor extensions I put the 2 classes on new lines instead of the same line as the def. This gave more room for the strings and was more consistent with other formatting in that portion of the file. 
--- llvm/lib/Target/RISCV/RISCVFeatures.td | 423 +++++++++++++------------ 1 file changed, 215 insertions(+), 208 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index c84436ca5d109..30ede62944253 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -16,8 +16,8 @@ def FeatureStdExtI : SubtargetFeature<"i", "HasStdExtI", "true", "'I' (Base Integer Instruction Set)">; def HasStdExtI : Predicate<"Subtarget->hasStdExtI()">, - AssemblerPredicate<(all_of FeatureStdExtI), - "'I' (Base Integer Instruction Set)">; + AssemblerPredicate<(all_of FeatureStdExtI), + "'I' (Base Integer Instruction Set)">; def FeatureStdExtZic64b : SubtargetFeature<"zic64b", "HasStdExtZic64b", "true", "'Zic64b' (Cache Block Size Is 64 Bytes)">; @@ -26,22 +26,22 @@ def FeatureStdExtZicbom : SubtargetFeature<"zicbom", "HasStdExtZicbom", "true", "'Zicbom' (Cache-Block Management Instructions)">; def HasStdExtZicbom : Predicate<"Subtarget->hasStdExtZicbom()">, - AssemblerPredicate<(all_of FeatureStdExtZicbom), - "'Zicbom' (Cache-Block Management Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZicbom), + "'Zicbom' (Cache-Block Management Instructions)">; def FeatureStdExtZicbop : SubtargetFeature<"zicbop", "HasStdExtZicbop", "true", "'Zicbop' (Cache-Block Prefetch Instructions)">; def HasStdExtZicbop : Predicate<"Subtarget->hasStdExtZicbop()">, - AssemblerPredicate<(all_of FeatureStdExtZicbop), - "'Zicbop' (Cache-Block Prefetch Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZicbop), + "'Zicbop' (Cache-Block Prefetch Instructions)">; def FeatureStdExtZicboz : SubtargetFeature<"zicboz", "HasStdExtZicboz", "true", "'Zicboz' (Cache-Block Zero Instructions)">; def HasStdExtZicboz : Predicate<"Subtarget->hasStdExtZicboz()">, - AssemblerPredicate<(all_of FeatureStdExtZicboz), - "'Zicboz' (Cache-Block Zero Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZicboz), + "'Zicboz' (Cache-Block Zero Instructions)">; def FeatureStdExtZiccamoa : SubtargetFeature<"ziccamoa", "HasStdExtZiccamoa", "true", @@ -63,8 +63,8 @@ def FeatureStdExtZicsr : SubtargetFeature<"zicsr", "HasStdExtZicsr", "true", "'zicsr' (CSRs)">; def HasStdExtZicsr : Predicate<"Subtarget->hasStdExtZicsr()">, - AssemblerPredicate<(all_of FeatureStdExtZicsr), - "'Zicsr' (CSRs)">; + AssemblerPredicate<(all_of FeatureStdExtZicsr), + "'Zicsr' (CSRs)">; def FeatureStdExtZicntr : SubtargetFeature<"zicntr", "HasStdExtZicntr", "true", @@ -75,29 +75,29 @@ def FeatureStdExtZicond : SubtargetFeature<"experimental-zicond", "HasStdExtZicond", "true", "'Zicond' (Integer Conditional Operations)">; def HasStdExtZicond : Predicate<"Subtarget->hasStdExtZicond()">, - AssemblerPredicate<(all_of FeatureStdExtZicond), - "'Zicond' (Integer Conditional Operations)">; + AssemblerPredicate<(all_of FeatureStdExtZicond), + "'Zicond' (Integer Conditional Operations)">; def FeatureStdExtZifencei : SubtargetFeature<"zifencei", "HasStdExtZifencei", "true", "'Zifencei' (fence.i)">; def HasStdExtZifencei : Predicate<"Subtarget->hasStdExtZifencei()">, - AssemblerPredicate<(all_of FeatureStdExtZifencei), - "'Zifencei' (fence.i)">; + AssemblerPredicate<(all_of FeatureStdExtZifencei), + "'Zifencei' (fence.i)">; def FeatureStdExtZihintpause : SubtargetFeature<"zihintpause", "HasStdExtZihintpause", "true", "'Zihintpause' (Pause Hint)">; def HasStdExtZihintpause : Predicate<"Subtarget->hasStdExtZihintpause()">, - AssemblerPredicate<(all_of FeatureStdExtZihintpause), - 
"'Zihintpause' (Pause Hint)">; + AssemblerPredicate<(all_of FeatureStdExtZihintpause), + "'Zihintpause' (Pause Hint)">; def FeatureStdExtZihintntl : SubtargetFeature<"zihintntl", "HasStdExtZihintntl", "true", "'Zihintntl' (Non-Temporal Locality Hints)">; def HasStdExtZihintntl : Predicate<"Subtarget->hasStdExtZihintntl()">, - AssemblerPredicate<(all_of FeatureStdExtZihintntl), - "'Zihintntl' (Non-Temporal Locality Hints)">; + AssemblerPredicate<(all_of FeatureStdExtZihintntl), + "'Zihintntl' (Non-Temporal Locality Hints)">; def FeatureStdExtZihpm : SubtargetFeature<"zihpm", "HasStdExtZihpm", "true", @@ -107,23 +107,23 @@ def FeatureStdExtZihpm def FeatureStdExtZimop : SubtargetFeature<"experimental-zimop", "HasStdExtZimop", "true", "'Zimop' (May-Be-Operations)">; def HasStdExtZimop : Predicate<"Subtarget->hasStdExtZimop()">, - AssemblerPredicate<(all_of FeatureStdExtZimop), - "'Zimop' (May-Be-Operations)">; + AssemblerPredicate<(all_of FeatureStdExtZimop), + "'Zimop' (May-Be-Operations)">; def FeatureStdExtZicfilp : SubtargetFeature<"experimental-zicfilp", "HasStdExtZicfilp", "true", "'Zicfilp' (Landing pad)">; def HasStdExtZicfilp : Predicate<"Subtarget->hasStdExtZicfilp()">, - AssemblerPredicate<(all_of FeatureStdExtZicfilp), - "'Zicfilp' (Landing pad)">; + AssemblerPredicate<(all_of FeatureStdExtZicfilp), + "'Zicfilp' (Landing pad)">; def FeatureStdExtZicfiss : SubtargetFeature<"experimental-zicfiss", "HasStdExtZicfiss", "true", "'Zicfiss' (Shadow stack)", [FeatureStdExtZicsr, FeatureStdExtZimop]>; def HasStdExtZicfiss : Predicate<"Subtarget->hasStdExtZicfiss()">, - AssemblerPredicate<(all_of FeatureStdExtZicfiss), - "'Zicfiss' (Shadow stack)">; + AssemblerPredicate<(all_of FeatureStdExtZicfiss), + "'Zicfiss' (Shadow stack)">; def NoHasStdExtZicfiss : Predicate<"!Subtarget->hasStdExtZicfiss()">; // Multiply Extensions @@ -132,8 +132,8 @@ def FeatureStdExtM : SubtargetFeature<"m", "HasStdExtM", "true", "'M' (Integer Multiplication and Division)">; def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">, - AssemblerPredicate<(all_of FeatureStdExtM), - "'M' (Integer Multiplication and Division)">; + AssemblerPredicate<(all_of FeatureStdExtM), + "'M' (Integer Multiplication and Division)">; def FeatureStdExtZmmul : SubtargetFeature<"zmmul", "HasStdExtZmmul", "true", @@ -141,9 +141,9 @@ def FeatureStdExtZmmul def HasStdExtMOrZmmul : Predicate<"Subtarget->hasStdExtM() || Subtarget->hasStdExtZmmul()">, - AssemblerPredicate<(any_of FeatureStdExtM, FeatureStdExtZmmul), - "'M' (Integer Multiplication and Division) or " - "'Zmmul' (Integer Multiplication)">; + AssemblerPredicate<(any_of FeatureStdExtM, FeatureStdExtZmmul), + "'M' (Integer Multiplication and Division) or " + "'Zmmul' (Integer Multiplication)">; // Atomic Extensions @@ -151,15 +151,15 @@ def FeatureStdExtA : SubtargetFeature<"a", "HasStdExtA", "true", "'A' (Atomic Instructions)">; def HasStdExtA : Predicate<"Subtarget->hasStdExtA()">, - AssemblerPredicate<(all_of FeatureStdExtA), - "'A' (Atomic Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtA), + "'A' (Atomic Instructions)">; def FeatureStdExtZtso : SubtargetFeature<"experimental-ztso", "HasStdExtZtso", "true", "'Ztso' (Memory Model - Total Store Order)">; def HasStdExtZtso : Predicate<"Subtarget->hasStdExtZtso()">, - AssemblerPredicate<(all_of FeatureStdExtZtso), - "'Ztso' (Memory Model - Total Store Order)">; + AssemblerPredicate<(all_of FeatureStdExtZtso), + "'Ztso' (Memory Model - Total Store Order)">; def NotHasStdExtZtso : Predicate<"!Subtarget->hasStdExtZtso()">; 
def FeatureStdExtZa64rs : SubtargetFeature<"za64rs", "HasStdExtZa64rs", "true", @@ -172,15 +172,15 @@ def FeatureStdExtZacas : SubtargetFeature<"experimental-zacas", "HasStdExtZacas", "true", "'Zacas' (Atomic Compare-And-Swap Instructions)">; def HasStdExtZacas : Predicate<"Subtarget->hasStdExtZacas()">, - AssemblerPredicate<(all_of FeatureStdExtZacas), - "'Zacas' (Atomic Compare-And-Swap Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZacas), + "'Zacas' (Atomic Compare-And-Swap Instructions)">; def NoStdExtZacas : Predicate<"!Subtarget->hasStdExtZacas()">; def FeatureStdExtZawrs : SubtargetFeature<"zawrs", "HasStdExtZawrs", "true", "'Zawrs' (Wait on Reservation Set)">; def HasStdExtZawrs : Predicate<"Subtarget->hasStdExtZawrs()">, - AssemblerPredicate<(all_of FeatureStdExtZawrs), - "'Zawrs' (Wait on Reservation Set)">; + AssemblerPredicate<(all_of FeatureStdExtZawrs), + "'Zawrs' (Wait on Reservation Set)">; // Floating Point Extensions @@ -189,41 +189,41 @@ def FeatureStdExtF "'F' (Single-Precision Floating-Point)", [FeatureStdExtZicsr]>; def HasStdExtF : Predicate<"Subtarget->hasStdExtF()">, - AssemblerPredicate<(all_of FeatureStdExtF), - "'F' (Single-Precision Floating-Point)">; + AssemblerPredicate<(all_of FeatureStdExtF), + "'F' (Single-Precision Floating-Point)">; def FeatureStdExtD : SubtargetFeature<"d", "HasStdExtD", "true", "'D' (Double-Precision Floating-Point)", [FeatureStdExtF]>; def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">, - AssemblerPredicate<(all_of FeatureStdExtD), - "'D' (Double-Precision Floating-Point)">; + AssemblerPredicate<(all_of FeatureStdExtD), + "'D' (Double-Precision Floating-Point)">; def FeatureStdExtH : SubtargetFeature<"h", "HasStdExtH", "true", "'H' (Hypervisor)">; def HasStdExtH : Predicate<"Subtarget->hasStdExtH()">, - AssemblerPredicate<(all_of FeatureStdExtH), - "'H' (Hypervisor)">; + AssemblerPredicate<(all_of FeatureStdExtH), + "'H' (Hypervisor)">; def FeatureStdExtZfhmin : SubtargetFeature<"zfhmin", "HasStdExtZfhmin", "true", "'Zfhmin' (Half-Precision Floating-Point Minimal)", [FeatureStdExtF]>; def HasStdExtZfhmin : Predicate<"Subtarget->hasStdExtZfhmin()">, - AssemblerPredicate<(all_of FeatureStdExtZfhmin), - "'Zfh' (Half-Precision Floating-Point) or " - "'Zfhmin' (Half-Precision Floating-Point Minimal)">; + AssemblerPredicate<(all_of FeatureStdExtZfhmin), + "'Zfh' (Half-Precision Floating-Point) or " + "'Zfhmin' (Half-Precision Floating-Point Minimal)">; def FeatureStdExtZfh : SubtargetFeature<"zfh", "HasStdExtZfh", "true", "'Zfh' (Half-Precision Floating-Point)", [FeatureStdExtZfhmin]>; def HasStdExtZfh : Predicate<"Subtarget->hasStdExtZfh()">, - AssemblerPredicate<(all_of FeatureStdExtZfh), - "'Zfh' (Half-Precision Floating-Point)">; + AssemblerPredicate<(all_of FeatureStdExtZfh), + "'Zfh' (Half-Precision Floating-Point)">; def NoStdExtZfh : Predicate<"!Subtarget->hasStdExtZfh()">; def FeatureStdExtZfbfmin @@ -231,14 +231,14 @@ def FeatureStdExtZfbfmin "'Zfbfmin' (Scalar BF16 Converts)", [FeatureStdExtF]>; def HasStdExtZfbfmin : Predicate<"Subtarget->hasStdExtZfbfmin()">, - AssemblerPredicate<(all_of FeatureStdExtZfbfmin), - "'Zfbfmin' (Scalar BF16 Converts)">; + AssemblerPredicate<(all_of FeatureStdExtZfbfmin), + "'Zfbfmin' (Scalar BF16 Converts)">; def HasHalfFPLoadStoreMove : Predicate<"Subtarget->hasHalfFPLoadStoreMove()">, - AssemblerPredicate<(any_of FeatureStdExtZfh, FeatureStdExtZfhmin, - FeatureStdExtZfbfmin), - "'Zfh' (Half-Precision Floating-Point) or " + AssemblerPredicate<(any_of FeatureStdExtZfh, 
FeatureStdExtZfhmin, + FeatureStdExtZfbfmin), + "'Zfh' (Half-Precision Floating-Point) or " "'Zfhmin' (Half-Precision Floating-Point Minimal) or " "'Zfbfmin' (Scalar BF16 Converts)">; @@ -247,41 +247,41 @@ def FeatureStdExtZfa "'Zfa' (Additional Floating-Point)", [FeatureStdExtF]>; def HasStdExtZfa : Predicate<"Subtarget->hasStdExtZfa()">, - AssemblerPredicate<(all_of FeatureStdExtZfa), - "'Zfa' (Additional Floating-Point)">; + AssemblerPredicate<(all_of FeatureStdExtZfa), + "'Zfa' (Additional Floating-Point)">; def FeatureStdExtZfinx : SubtargetFeature<"zfinx", "HasStdExtZfinx", "true", "'Zfinx' (Float in Integer)", [FeatureStdExtZicsr]>; def HasStdExtZfinx : Predicate<"Subtarget->hasStdExtZfinx()">, - AssemblerPredicate<(all_of FeatureStdExtZfinx), - "'Zfinx' (Float in Integer)">; + AssemblerPredicate<(all_of FeatureStdExtZfinx), + "'Zfinx' (Float in Integer)">; def FeatureStdExtZdinx : SubtargetFeature<"zdinx", "HasStdExtZdinx", "true", "'Zdinx' (Double in Integer)", [FeatureStdExtZfinx]>; def HasStdExtZdinx : Predicate<"Subtarget->hasStdExtZdinx()">, - AssemblerPredicate<(all_of FeatureStdExtZdinx), - "'Zdinx' (Double in Integer)">; + AssemblerPredicate<(all_of FeatureStdExtZdinx), + "'Zdinx' (Double in Integer)">; def FeatureStdExtZhinxmin : SubtargetFeature<"zhinxmin", "HasStdExtZhinxmin", "true", "'Zhinxmin' (Half Float in Integer Minimal)", [FeatureStdExtZfinx]>; def HasStdExtZhinxmin : Predicate<"Subtarget->hasStdExtZhinxmin()">, - AssemblerPredicate<(all_of FeatureStdExtZhinxmin), - "'Zhinx' (Half Float in Integer) or " - "'Zhinxmin' (Half Float in Integer Minimal)">; + AssemblerPredicate<(all_of FeatureStdExtZhinxmin), + "'Zhinx' (Half Float in Integer) or " + "'Zhinxmin' (Half Float in Integer Minimal)">; def FeatureStdExtZhinx : SubtargetFeature<"zhinx", "HasStdExtZhinx", "true", "'Zhinx' (Half Float in Integer)", [FeatureStdExtZhinxmin]>; def HasStdExtZhinx : Predicate<"Subtarget->hasStdExtZhinx()">, - AssemblerPredicate<(all_of FeatureStdExtZhinx), - "'Zhinx' (Half Float in Integer)">; + AssemblerPredicate<(all_of FeatureStdExtZhinx), + "'Zhinx' (Half Float in Integer)">; def NoStdExtZhinx : Predicate<"!Subtarget->hasStdExtZhinx()">; // Compressed Extensions @@ -290,15 +290,15 @@ def FeatureStdExtC : SubtargetFeature<"c", "HasStdExtC", "true", "'C' (Compressed Instructions)">; def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">, - AssemblerPredicate<(all_of FeatureStdExtC), - "'C' (Compressed Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtC), + "'C' (Compressed Instructions)">; def FeatureNoRVCHints : SubtargetFeature<"no-rvc-hints", "EnableRVCHintInstrs", "false", "Disable RVC Hint Instructions.">; def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">, AssemblerPredicate<(all_of(not FeatureNoRVCHints)), - "RVC Hint Instructions">; + "RVC Hint Instructions">; def FeatureStdExtZca : SubtargetFeature<"zca", "HasStdExtZca", "true", @@ -307,18 +307,18 @@ def FeatureStdExtZca def HasStdExtCOrZca : Predicate<"Subtarget->hasStdExtCOrZca()">, - AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZca), - "'C' (Compressed Instructions) or " - "'Zca' (part of the C extension, excluding " - "compressed floating point loads/stores)">; + AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZca), + "'C' (Compressed Instructions) or " + "'Zca' (part of the C extension, excluding " + "compressed floating point loads/stores)">; def FeatureStdExtZcb : SubtargetFeature<"zcb", "HasStdExtZcb", "true", "'Zcb' (Compressed basic bit manipulation instructions)", 
[FeatureStdExtZca]>; def HasStdExtZcb : Predicate<"Subtarget->hasStdExtZcb()">, - AssemblerPredicate<(all_of FeatureStdExtZcb), - "'Zcb' (Compressed basic bit manipulation instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZcb), + "'Zcb' (Compressed basic bit manipulation instructions)">; def FeatureStdExtZcd : SubtargetFeature<"zcd", "HasStdExtZcd", "true", @@ -327,9 +327,9 @@ def FeatureStdExtZcd def HasStdExtCOrZcd : Predicate<"Subtarget->hasStdExtC() || Subtarget->hasStdExtZcd()">, - AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZcd), - "'C' (Compressed Instructions) or " - "'Zcd' (Compressed Double-Precision Floating-Point Instructions)">; + AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZcd), + "'C' (Compressed Instructions) or " + "'Zcd' (Compressed Double-Precision Floating-Point Instructions)">; def FeatureStdExtZcf : SubtargetFeature<"zcf", "HasStdExtZcf", "true", @@ -341,8 +341,8 @@ def FeatureStdExtZcmp "'Zcmp' (sequenced instuctions for code-size reduction)", [FeatureStdExtZca]>; def HasStdExtZcmp : Predicate<"Subtarget->hasStdExtZcmp() && !Subtarget->hasStdExtC()">, - AssemblerPredicate<(all_of FeatureStdExtZcmp), - "'Zcmp' (sequenced instuctions for code-size reduction)">; + AssemblerPredicate<(all_of FeatureStdExtZcmp), + "'Zcmp' (sequenced instuctions for code-size reduction)">; def FeatureStdExtZcmt : SubtargetFeature<"zcmt", "HasStdExtZcmt", "true", @@ -361,17 +361,17 @@ def FeatureStdExtZce def HasStdExtCOrZcfOrZce : Predicate<"Subtarget->hasStdExtC() || Subtarget->hasStdExtZcf() " "Subtarget->hasStdExtZce()">, - AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZcf, - FeatureStdExtZce), - "'C' (Compressed Instructions) or " - "'Zcf' (Compressed Single-Precision Floating-Point Instructions)">; + AssemblerPredicate<(any_of FeatureStdExtC, FeatureStdExtZcf, + FeatureStdExtZce), + "'C' (Compressed Instructions) or " + "'Zcf' (Compressed Single-Precision Floating-Point Instructions)">; def FeatureStdExtZcmop : SubtargetFeature<"experimental-zcmop", "HasStdExtZcmop", "true", "'Zcmop' (Compressed May-Be-Operations)", [FeatureStdExtZca]>; def HasStdExtZcmop : Predicate<"Subtarget->hasStdExtZcmop()">, - AssemblerPredicate<(all_of FeatureStdExtZcmop), - "'Zcmop' (Compressed May-Be-Operations)">; + AssemblerPredicate<(all_of FeatureStdExtZcmop), + "'Zcmop' (Compressed May-Be-Operations)">; // Bitmanip Extensions @@ -379,30 +379,30 @@ def FeatureStdExtZba : SubtargetFeature<"zba", "HasStdExtZba", "true", "'Zba' (Address Generation Instructions)">; def HasStdExtZba : Predicate<"Subtarget->hasStdExtZba()">, - AssemblerPredicate<(all_of FeatureStdExtZba), - "'Zba' (Address Generation Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZba), + "'Zba' (Address Generation Instructions)">; def NotHasStdExtZba : Predicate<"!Subtarget->hasStdExtZba()">; def FeatureStdExtZbb : SubtargetFeature<"zbb", "HasStdExtZbb", "true", "'Zbb' (Basic Bit-Manipulation)">; def HasStdExtZbb : Predicate<"Subtarget->hasStdExtZbb()">, - AssemblerPredicate<(all_of FeatureStdExtZbb), - "'Zbb' (Basic Bit-Manipulation)">; + AssemblerPredicate<(all_of FeatureStdExtZbb), + "'Zbb' (Basic Bit-Manipulation)">; def FeatureStdExtZbc : SubtargetFeature<"zbc", "HasStdExtZbc", "true", "'Zbc' (Carry-Less Multiplication)">; def HasStdExtZbc : Predicate<"Subtarget->hasStdExtZbc()">, - AssemblerPredicate<(all_of FeatureStdExtZbc), - "'Zbc' (Carry-Less Multiplication)">; + AssemblerPredicate<(all_of FeatureStdExtZbc), + "'Zbc' (Carry-Less Multiplication)">; def FeatureStdExtZbs : 
SubtargetFeature<"zbs", "HasStdExtZbs", "true", "'Zbs' (Single-Bit Instructions)">; def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">, - AssemblerPredicate<(all_of FeatureStdExtZbs), - "'Zbs' (Single-Bit Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZbs), + "'Zbs' (Single-Bit Instructions)">; // Bitmanip Extensions for Cryptography Extensions @@ -410,21 +410,21 @@ def FeatureStdExtZbkb : SubtargetFeature<"zbkb", "HasStdExtZbkb", "true", "'Zbkb' (Bitmanip instructions for Cryptography)">; def HasStdExtZbkb : Predicate<"Subtarget->hasStdExtZbkb()">, - AssemblerPredicate<(all_of FeatureStdExtZbkb), - "'Zbkb' (Bitmanip instructions for Cryptography)">; + AssemblerPredicate<(all_of FeatureStdExtZbkb), + "'Zbkb' (Bitmanip instructions for Cryptography)">; def FeatureStdExtZbkx : SubtargetFeature<"zbkx", "HasStdExtZbkx", "true", "'Zbkx' (Crossbar permutation instructions)">; def HasStdExtZbkx : Predicate<"Subtarget->hasStdExtZbkx()">, - AssemblerPredicate<(all_of FeatureStdExtZbkx), - "'Zbkx' (Crossbar permutation instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZbkx), + "'Zbkx' (Crossbar permutation instructions)">; def HasStdExtZbbOrZbkb : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbkb()">, - AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbkb), - "'Zbb' (Basic Bit-Manipulation) or " - "'Zbkb' (Bitmanip instructions for Cryptography)">; + AssemblerPredicate<(any_of FeatureStdExtZbb, FeatureStdExtZbkb), + "'Zbb' (Basic Bit-Manipulation) or " + "'Zbkb' (Bitmanip instructions for Cryptography)">; // The Carry-less multiply subextension for cryptography is a subset of basic // carry-less multiply subextension. The former should be enabled if the latter @@ -435,15 +435,15 @@ def FeatureStdExtZbkc "Cryptography)">; def HasStdExtZbkc : Predicate<"Subtarget->hasStdExtZbkc()">, - AssemblerPredicate<(all_of FeatureStdExtZbkc), - "'Zbkc' (Carry-less multiply instructions for Cryptography)">; + AssemblerPredicate<(all_of FeatureStdExtZbkc), + "'Zbkc' (Carry-less multiply instructions for Cryptography)">; def HasStdExtZbcOrZbkc : Predicate<"Subtarget->hasStdExtZbc() || Subtarget->hasStdExtZbkc()">, - AssemblerPredicate<(any_of FeatureStdExtZbc, FeatureStdExtZbkc), - "'Zbc' (Carry-Less Multiplication) or " - "'Zbkc' (Carry-less multiply instructions " - "for Cryptography)">; + AssemblerPredicate<(any_of FeatureStdExtZbc, FeatureStdExtZbkc), + "'Zbc' (Carry-Less Multiplication) or " + "'Zbkc' (Carry-less multiply instructions " + "for Cryptography)">; // Cryptography Extensions @@ -451,52 +451,51 @@ def FeatureStdExtZknd : SubtargetFeature<"zknd", "HasStdExtZknd", "true", "'Zknd' (NIST Suite: AES Decryption)">; def HasStdExtZknd : Predicate<"Subtarget->hasStdExtZknd()">, - AssemblerPredicate<(all_of FeatureStdExtZknd), - "'Zknd' (NIST Suite: AES Decryption)">; + AssemblerPredicate<(all_of FeatureStdExtZknd), + "'Zknd' (NIST Suite: AES Decryption)">; def FeatureStdExtZkne : SubtargetFeature<"zkne", "HasStdExtZkne", "true", "'Zkne' (NIST Suite: AES Encryption)">; def HasStdExtZkne : Predicate<"Subtarget->hasStdExtZkne()">, - AssemblerPredicate<(all_of FeatureStdExtZkne), - "'Zkne' (NIST Suite: AES Encryption)">; + AssemblerPredicate<(all_of FeatureStdExtZkne), + "'Zkne' (NIST Suite: AES Encryption)">; // Some instructions belong to both Zknd and Zkne subextensions. // They should be enabled if either has been specified. 
def HasStdExtZkndOrZkne : Predicate<"Subtarget->hasStdExtZknd() || Subtarget->hasStdExtZkne()">, - AssemblerPredicate<(any_of FeatureStdExtZknd, FeatureStdExtZkne), - "'Zknd' (NIST Suite: AES Decryption) or " - "'Zkne' (NIST Suite: AES Encryption)">; + AssemblerPredicate<(any_of FeatureStdExtZknd, FeatureStdExtZkne), + "'Zknd' (NIST Suite: AES Decryption) or " + "'Zkne' (NIST Suite: AES Encryption)">; def FeatureStdExtZknh : SubtargetFeature<"zknh", "HasStdExtZknh", "true", "'Zknh' (NIST Suite: Hash Function Instructions)">; def HasStdExtZknh : Predicate<"Subtarget->hasStdExtZknh()">, - AssemblerPredicate<(all_of FeatureStdExtZknh), - "'Zknh' (NIST Suite: Hash Function Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZknh), + "'Zknh' (NIST Suite: Hash Function Instructions)">; def FeatureStdExtZksed : SubtargetFeature<"zksed", "HasStdExtZksed", "true", "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">; def HasStdExtZksed : Predicate<"Subtarget->hasStdExtZksed()">, - AssemblerPredicate<(all_of FeatureStdExtZksed), - "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZksed), + "'Zksed' (ShangMi Suite: SM4 Block Cipher Instructions)">; def FeatureStdExtZksh : SubtargetFeature<"zksh", "HasStdExtZksh", "true", "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">; def HasStdExtZksh : Predicate<"Subtarget->hasStdExtZksh()">, - AssemblerPredicate<(all_of FeatureStdExtZksh), - "'Zksh' (ShangMi Suite: SM3 Hash Function " - "Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZksh), + "'Zksh' (ShangMi Suite: SM3 Hash Function Instructions)">; def FeatureStdExtZkr : SubtargetFeature<"zkr", "HasStdExtZkr", "true", "'Zkr' (Entropy Source Extension)">; def HasStdExtZkr : Predicate<"Subtarget->hasStdExtZkr()">, - AssemblerPredicate<(all_of FeatureStdExtZkr), - "'Zkr' (Entropy Source Extension)">; + AssemblerPredicate<(all_of FeatureStdExtZkr), + "'Zkr' (Entropy Source Extension)">; def FeatureStdExtZkn : SubtargetFeature<"zkn", "HasStdExtZkn", "true", @@ -581,16 +580,16 @@ def FeatureStdExtZvfbfmin "'Zvbfmin' (Vector BF16 Converts)", [FeatureStdExtZve32f]>; def HasStdExtZvfbfmin : Predicate<"Subtarget->hasStdExtZvfbfmin()">, - AssemblerPredicate<(all_of FeatureStdExtZvfbfmin), - "'Zvfbfmin' (Vector BF16 Converts)">; + AssemblerPredicate<(all_of FeatureStdExtZvfbfmin), + "'Zvfbfmin' (Vector BF16 Converts)">; def FeatureStdExtZvfbfwma : SubtargetFeature<"experimental-zvfbfwma", "HasStdExtZvfbfwma", "true", "'Zvfbfwma' (Vector BF16 widening mul-add)", [FeatureStdExtZvfbfmin, FeatureStdExtZfbfmin]>; def HasStdExtZvfbfwma : Predicate<"Subtarget->hasStdExtZvfbfwma()">, - AssemblerPredicate<(all_of FeatureStdExtZvfbfwma), - "'Zvfbfwma' (Vector BF16 widening mul-add)">; + AssemblerPredicate<(all_of FeatureStdExtZvfbfwma), + "'Zvfbfwma' (Vector BF16 widening mul-add)">; def FeatureStdExtZvfhmin : SubtargetFeature<"zvfhmin", "HasStdExtZvfhmin", "true", @@ -604,9 +603,9 @@ def FeatureStdExtZvfh def HasStdExtZfhOrZvfh : Predicate<"Subtarget->hasStdExtZfh() || Subtarget->hasStdExtZvfh()">, - AssemblerPredicate<(any_of FeatureStdExtZfh, FeatureStdExtZvfh), - "'Zfh' (Half-Precision Floating-Point) or " - "'Zvfh' (Vector Half-Precision Floating-Point)">; + AssemblerPredicate<(any_of FeatureStdExtZfh, FeatureStdExtZvfh), + "'Zfh' (Half-Precision Floating-Point) or " + "'Zvfh' (Vector Half-Precision Floating-Point)">; // Vector Cryptography and Bitmanip Extensions @@ -614,70 +613,70 @@ def FeatureStdExtZvkb : SubtargetFeature<"zvkb", 
"HasStdExtZvkb", "true", "'Zvkb' (Vector Bit-manipulation used in Cryptography)">; def HasStdExtZvkb : Predicate<"Subtarget->hasStdExtZvkb()">, - AssemblerPredicate<(all_of FeatureStdExtZvkb), - "'Zvkb' (Vector Bit-manipulation used in Cryptography)">; + AssemblerPredicate<(all_of FeatureStdExtZvkb), + "'Zvkb' (Vector Bit-manipulation used in Cryptography)">; def FeatureStdExtZvbb : SubtargetFeature<"zvbb", "HasStdExtZvbb", "true", "'Zvbb' (Vector basic bit-manipulation instructions)", [FeatureStdExtZvkb]>; def HasStdExtZvbb : Predicate<"Subtarget->hasStdExtZvbb()">, - AssemblerPredicate<(all_of FeatureStdExtZvbb), - "'Zvbb' (Vector basic bit-manipulation instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZvbb), + "'Zvbb' (Vector basic bit-manipulation instructions)">; def FeatureStdExtZvbc : SubtargetFeature<"zvbc", "HasStdExtZvbc", "true", "'Zvbc' (Vector Carryless Multiplication)">; def HasStdExtZvbc : Predicate<"Subtarget->hasStdExtZvbc()">, - AssemblerPredicate<(all_of FeatureStdExtZvbc), - "'Zvbc' (Vector Carryless Multiplication)">; + AssemblerPredicate<(all_of FeatureStdExtZvbc), + "'Zvbc' (Vector Carryless Multiplication)">; def FeatureStdExtZvkg : SubtargetFeature<"zvkg", "HasStdExtZvkg", "true", "'Zvkg' (Vector GCM instructions for Cryptography)">; def HasStdExtZvkg : Predicate<"Subtarget->hasStdExtZvkg()">, - AssemblerPredicate<(all_of FeatureStdExtZvkg), - "'Zvkg' (Vector GCM instructions for Cryptography)">; + AssemblerPredicate<(all_of FeatureStdExtZvkg), + "'Zvkg' (Vector GCM instructions for Cryptography)">; def FeatureStdExtZvkned : SubtargetFeature<"zvkned", "HasStdExtZvkned", "true", "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">; def HasStdExtZvkned : Predicate<"Subtarget->hasStdExtZvkned()">, - AssemblerPredicate<(all_of FeatureStdExtZvkned), - "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">; + AssemblerPredicate<(all_of FeatureStdExtZvkned), + "'Zvkned' (Vector AES Encryption & Decryption (Single Round))">; def FeatureStdExtZvknha : SubtargetFeature<"zvknha", "HasStdExtZvknha", "true", "'Zvknha' (Vector SHA-2 (SHA-256 only))">; def HasStdExtZvknha : Predicate<"Subtarget->hasStdExtZvknha()">, - AssemblerPredicate<(all_of FeatureStdExtZvknha), - "'Zvknha' (Vector SHA-2 (SHA-256 only))">; + AssemblerPredicate<(all_of FeatureStdExtZvknha), + "'Zvknha' (Vector SHA-2 (SHA-256 only))">; def FeatureStdExtZvknhb : SubtargetFeature<"zvknhb", "HasStdExtZvknhb", "true", "'Zvknhb' (Vector SHA-2 (SHA-256 and SHA-512))", [FeatureStdExtZve64x]>; def HasStdExtZvknhb : Predicate<"Subtarget->hasStdExtZvknhb()">, - AssemblerPredicate<(all_of FeatureStdExtZvknhb), - "'Zvknhb' (Vector SHA-2 (SHA-256 and SHA-512))">; + AssemblerPredicate<(all_of FeatureStdExtZvknhb), + "'Zvknhb' (Vector SHA-2 (SHA-256 and SHA-512))">; def HasStdExtZvknhaOrZvknhb : Predicate<"Subtarget->hasStdExtZvknha() || Subtarget->hasStdExtZvknhb()">, - AssemblerPredicate<(any_of FeatureStdExtZvknha, FeatureStdExtZvknhb), - "'Zvknha' or 'Zvknhb' (Vector SHA-2)">; + AssemblerPredicate<(any_of FeatureStdExtZvknha, FeatureStdExtZvknhb), + "'Zvknha' or 'Zvknhb' (Vector SHA-2)">; def FeatureStdExtZvksed : SubtargetFeature<"zvksed", "HasStdExtZvksed", "true", "'Zvksed' (SM4 Block Cipher Instructions)">; def HasStdExtZvksed : Predicate<"Subtarget->hasStdExtZvksed()">, - AssemblerPredicate<(all_of FeatureStdExtZvksed), - "'Zvksed' (SM4 Block Cipher Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZvksed), + "'Zvksed' (SM4 Block Cipher Instructions)">; def FeatureStdExtZvksh : 
SubtargetFeature<"zvksh", "HasStdExtZvksh", "true", "'Zvksh' (SM3 Hash Function Instructions)">; def HasStdExtZvksh : Predicate<"Subtarget->hasStdExtZvksh()">, - AssemblerPredicate<(all_of FeatureStdExtZvksh), - "'Zvksh' (SM3 Hash Function Instructions)">; + AssemblerPredicate<(all_of FeatureStdExtZvksh), + "'Zvksh' (SM3 Hash Function Instructions)">; def FeatureStdExtZvkt : SubtargetFeature<"zvkt", "HasStdExtZvkt", "true", @@ -767,8 +766,8 @@ def FeatureStdExtSvinval : SubtargetFeature<"svinval", "HasStdExtSvinval", "true", "'Svinval' (Fine-Grained Address-Translation Cache Invalidation)">; def HasStdExtSvinval : Predicate<"Subtarget->hasStdExtSvinval()">, - AssemblerPredicate<(all_of FeatureStdExtSvinval), - "'Svinval' (Fine-Grained Address-Translation Cache Invalidation)">; + AssemblerPredicate<(all_of FeatureStdExtSvinval), + "'Svinval' (Fine-Grained Address-Translation Cache Invalidation)">; def FeatureStdExtSvnapot : SubtargetFeature<"svnapot", "HasStdExtSvnapot", "true", @@ -788,8 +787,8 @@ def FeatureVendorXVentanaCondOps : SubtargetFeature<"xventanacondops", "HasVendorXVentanaCondOps", "true", "'XVentanaCondOps' (Ventana Conditional Ops)">; def HasVendorXVentanaCondOps : Predicate<"Subtarget->hasVendorXVentanaCondOps()">, - AssemblerPredicate<(all_of FeatureVendorXVentanaCondOps), - "'XVentanaCondOps' (Ventana Conditional Ops)">; + AssemblerPredicate<(all_of FeatureVendorXVentanaCondOps), + "'XVentanaCondOps' (Ventana Conditional Ops)">; // T-Head Extensions @@ -797,80 +796,80 @@ def FeatureVendorXTHeadBa : SubtargetFeature<"xtheadba", "HasVendorXTHeadBa", "true", "'xtheadba' (T-Head address calculation instructions)">; def HasVendorXTHeadBa : Predicate<"Subtarget->hasVendorXTHeadBa()">, - AssemblerPredicate<(all_of FeatureVendorXTHeadBa), - "'xtheadba' (T-Head address calculation instructions)">; + AssemblerPredicate<(all_of FeatureVendorXTHeadBa), + "'xtheadba' (T-Head address calculation instructions)">; def FeatureVendorXTHeadBb : SubtargetFeature<"xtheadbb", "HasVendorXTHeadBb", "true", "'xtheadbb' (T-Head basic bit-manipulation instructions)">; def HasVendorXTHeadBb : Predicate<"Subtarget->hasVendorXTHeadBb()">, - AssemblerPredicate<(all_of FeatureVendorXTHeadBb), - "'xtheadbb' (T-Head basic bit-manipulation instructions)">; + AssemblerPredicate<(all_of FeatureVendorXTHeadBb), + "'xtheadbb' (T-Head basic bit-manipulation instructions)">; def FeatureVendorXTHeadBs : SubtargetFeature<"xtheadbs", "HasVendorXTHeadBs", "true", "'xtheadbs' (T-Head single-bit instructions)">; def HasVendorXTHeadBs : Predicate<"Subtarget->hasVendorXTHeadBs()">, - AssemblerPredicate<(all_of FeatureVendorXTHeadBs), - "'xtheadbs' (T-Head single-bit instructions)">; + AssemblerPredicate<(all_of FeatureVendorXTHeadBs), + "'xtheadbs' (T-Head single-bit instructions)">; def FeatureVendorXTHeadCondMov : SubtargetFeature<"xtheadcondmov", "HasVendorXTHeadCondMov", "true", "'xtheadcondmov' (T-Head conditional move instructions)">; def HasVendorXTHeadCondMov : Predicate<"Subtarget->hasVendorXTHeadCondMov()">, - AssemblerPredicate<(all_of FeatureVendorXTHeadCondMov), - "'xtheadcondmov' (T-Head conditional move instructions)">; + AssemblerPredicate<(all_of FeatureVendorXTHeadCondMov), + "'xtheadcondmov' (T-Head conditional move instructions)">; def FeatureVendorXTHeadCmo : SubtargetFeature<"xtheadcmo", "HasVendorXTHeadCmo", "true", "'xtheadcmo' (T-Head cache management instructions)">; def HasVendorXTHeadCmo : Predicate<"Subtarget->hasVendorXTHeadCmo()">, - AssemblerPredicate<(all_of FeatureVendorXTHeadCmo), - 
"'xtheadcmo' (T-Head cache management instructions)">; + AssemblerPredicate<(all_of FeatureVendorXTHeadCmo), + "'xtheadcmo' (T-Head cache management instructions)">; def FeatureVendorXTHeadFMemIdx : SubtargetFeature<"xtheadfmemidx", "HasVendorXTHeadFMemIdx", "true", "'xtheadfmemidx' (T-Head FP Indexed Memory Operations)", [FeatureStdExtF]>; def HasVendorXTHeadFMemIdx : Predicate<"Subtarget->hasVendorXTHeadFMemIdx()">, - AssemblerPredicate<(all_of FeatureVendorXTHeadFMemIdx), - "'xtheadfmemidx' (T-Head FP Indexed Memory Operations)">; + AssemblerPredicate<(all_of FeatureVendorXTHeadFMemIdx), + "'xtheadfmemidx' (T-Head FP Indexed Memory Operations)">; def FeatureVendorXTHeadMac : SubtargetFeature<"xtheadmac", "HasVendorXTHeadMac", "true", "'xtheadmac' (T-Head Multiply-Accumulate Instructions)">; def HasVendorXTHeadMac : Predicate<"Subtarget->hasVendorXTHeadMac()">, - AssemblerPredicate<(all_of FeatureVendorXTHeadMac), - "'xtheadmac' (T-Head Multiply-Accumulate Instructions)">; + AssemblerPredicate<(all_of FeatureVendorXTHeadMac), + "'xtheadmac' (T-Head Multiply-Accumulate Instructions)">; def FeatureVendorXTHeadMemIdx : SubtargetFeature<"xtheadmemidx", "HasVendorXTHeadMemIdx", "true", "'xtheadmemidx' (T-Head Indexed Memory Operations)">; def HasVendorXTHeadMemIdx : Predicate<"Subtarget->hasVendorXTHeadMemIdx()">, - AssemblerPredicate<(all_of FeatureVendorXTHeadMemIdx), - "'xtheadmemidx' (T-Head Indexed Memory Operations)">; + AssemblerPredicate<(all_of FeatureVendorXTHeadMemIdx), + "'xtheadmemidx' (T-Head Indexed Memory Operations)">; def FeatureVendorXTHeadMemPair : SubtargetFeature<"xtheadmempair", "HasVendorXTHeadMemPair", "true", "'xtheadmempair' (T-Head two-GPR Memory Operations)">; def HasVendorXTHeadMemPair : Predicate<"Subtarget->hasVendorXTHeadMemPair()">, - AssemblerPredicate<(all_of FeatureVendorXTHeadMemPair), - "'xtheadmempair' (T-Head two-GPR Memory Operations)">; + AssemblerPredicate<(all_of FeatureVendorXTHeadMemPair), + "'xtheadmempair' (T-Head two-GPR Memory Operations)">; def FeatureVendorXTHeadSync : SubtargetFeature<"xtheadsync", "HasVendorXTHeadSync", "true", "'xtheadsync' (T-Head multicore synchronization instructions)">; def HasVendorXTHeadSync : Predicate<"Subtarget->hasVendorXTHeadSync()">, - AssemblerPredicate<(all_of FeatureVendorXTHeadSync), - "'xtheadsync' (T-Head multicore synchronization instructions)">; + AssemblerPredicate<(all_of FeatureVendorXTHeadSync), + "'xtheadsync' (T-Head multicore synchronization instructions)">; def FeatureVendorXTHeadVdot : SubtargetFeature<"xtheadvdot", "HasVendorXTHeadVdot", "true", "'xtheadvdot' (T-Head Vector Extensions for Dot)", [FeatureStdExtV]>; def HasVendorXTHeadVdot : Predicate<"Subtarget->hasVendorXTHeadVdot()">, - AssemblerPredicate<(all_of FeatureVendorXTHeadVdot), - "'xtheadvdot' (T-Head Vector Extensions for Dot)">; + AssemblerPredicate<(all_of FeatureVendorXTHeadVdot), + "'xtheadvdot' (T-Head Vector Extensions for Dot)">; // SiFive Extensions @@ -879,40 +878,44 @@ def FeatureVendorXSfvcp "'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions)", [FeatureStdExtZve32x]>; def HasVendorXSfvcp : Predicate<"Subtarget->hasVendorXSfvcp()">, - AssemblerPredicate<(all_of FeatureVendorXSfvcp), - "'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions)">; + AssemblerPredicate<(all_of FeatureVendorXSfvcp), + "'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions)">; def FeatureVendorXSfvqmaccdod : SubtargetFeature<"xsfvqmaccdod", "HasVendorXSfvqmaccdod", "true", "'XSfvqmaccdod' (SiFive Int8 
Matrix Multiplication Instructions (2-by-8 and 8-by-2))", [FeatureStdExtZve32x]>; -def HasVendorXSfvqmaccdod : Predicate<"Subtarget->hasVendorXSfvqmaccdod()">, - AssemblerPredicate<(all_of FeatureVendorXSfvqmaccdod), +def HasVendorXSfvqmaccdod + : Predicate<"Subtarget->hasVendorXSfvqmaccdod()">, + AssemblerPredicate<(all_of FeatureVendorXSfvqmaccdod), "'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2))">; def FeatureVendorXSfvqmaccqoq : SubtargetFeature<"xsfvqmaccqoq", "HasVendorXSfvqmaccqoq", "true", "'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4))", [FeatureStdExtZve32x]>; -def HasVendorXSfvqmaccqoq : Predicate<"Subtarget->hasVendorXSfvqmaccqoq()">, - AssemblerPredicate<(all_of FeatureVendorXSfvqmaccqoq), +def HasVendorXSfvqmaccqoq + : Predicate<"Subtarget->hasVendorXSfvqmaccqoq()">, + AssemblerPredicate<(all_of FeatureVendorXSfvqmaccqoq), "'XSfvqmaccqoq' (SiFive Int8 Matrix Multiplication Instructions (4-by-8 and 8-by-4))">; def FeatureVendorXSfvfwmaccqqq : SubtargetFeature<"xsfvfwmaccqqq", "HasVendorXSfvfwmaccqqq", "true", "'XSfvfwmaccqqq' (SiFive Matrix Multiply Accumulate Instruction and 4-by-4))", [FeatureStdExtZve32f, FeatureStdExtZvfbfmin]>; -def HasVendorXSfvfwmaccqqq : Predicate<"Subtarget->hasVendorXSfvfwmaccqqq()">, - AssemblerPredicate<(all_of FeatureVendorXSfvfwmaccqqq), +def HasVendorXSfvfwmaccqqq + : Predicate<"Subtarget->hasVendorXSfvfwmaccqqq()">, + AssemblerPredicate<(all_of FeatureVendorXSfvfwmaccqqq), "'XSfvfwmaccqqq' (SiFive Matrix Multiply Accumulate Instruction and 4-by-4))">; def FeatureVendorXSfvfnrclipxfqf : SubtargetFeature<"xsfvfnrclipxfqf", "HasVendorXSfvfnrclipxfqf", "true", "'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)", [FeatureStdExtZve32f]>; -def HasVendorXSfvfnrclipxfqf : Predicate<"Subtarget->hasVendorXSfvfnrclipxfqf()">, - AssemblerPredicate<(all_of FeatureVendorXSfvfnrclipxfqf), - "'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)">; +def HasVendorXSfvfnrclipxfqf + : Predicate<"Subtarget->hasVendorXSfvfnrclipxfqf()">, + AssemblerPredicate<(all_of FeatureVendorXSfvfnrclipxfqf), + "'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions)">; // Core-V Extensions @@ -921,52 +924,56 @@ def FeatureVendorXCVelw "'XCVelw' (CORE-V Event Load Word)">; def HasVendorXCVelw : Predicate<"Subtarget->hasVendorXCVelw()">, - AssemblerPredicate<(any_of FeatureVendorXCVelw), - "'XCVelw' (CORE-V Event Load Word)">; + AssemblerPredicate<(any_of FeatureVendorXCVelw), + "'XCVelw' (CORE-V Event Load Word)">; def FeatureVendorXCVbitmanip : SubtargetFeature<"xcvbitmanip", "HasVendorXCVbitmanip", "true", "'XCVbitmanip' (CORE-V Bit Manipulation)">; -def HasVendorXCVbitmanip : Predicate<"Subtarget->hasVendorXCVbitmanip()">, - AssemblerPredicate<(all_of FeatureVendorXCVbitmanip), - "'XCVbitmanip' (CORE-V Bit Manipulation)">; +def HasVendorXCVbitmanip + : Predicate<"Subtarget->hasVendorXCVbitmanip()">, + AssemblerPredicate<(all_of FeatureVendorXCVbitmanip), + "'XCVbitmanip' (CORE-V Bit Manipulation)">; def FeatureVendorXCVmac : SubtargetFeature<"xcvmac", "HasVendorXCVmac", "true", "'XCVmac' (CORE-V Multiply-Accumulate)">; -def HasVendorXCVmac : Predicate<"Subtarget->hasVendorXCVmac()">, - AssemblerPredicate<(all_of FeatureVendorXCVmac), - "'XCVmac' (CORE-V Multiply-Accumulate)">; +def HasVendorXCVmac + : Predicate<"Subtarget->hasVendorXCVmac()">, + AssemblerPredicate<(all_of FeatureVendorXCVmac), + "'XCVmac' (CORE-V Multiply-Accumulate)">; def FeatureVendorXCVmem : 
SubtargetFeature<"xcvmem", "HasVendorXCVmem", "true", "'XCVmem' (CORE-V Post-incrementing Load & Store)">; def HasVendorXCVmem : Predicate<"Subtarget->hasVendorXCVmem()">, - AssemblerPredicate<(any_of FeatureVendorXCVmem), - "'XCVmem' (CORE-V Post-incrementing Load & Store)">; + AssemblerPredicate<(any_of FeatureVendorXCVmem), + "'XCVmem' (CORE-V Post-incrementing Load & Store)">; def FeatureVendorXCValu : SubtargetFeature<"xcvalu", "HasVendorXCValu", "true", "'XCValu' (CORE-V ALU Operations)">; -def HasVendorXCValu : Predicate<"Subtarget->hasVendorXCValu()">, - AssemblerPredicate<(all_of FeatureVendorXCValu), - "'XCValu' (CORE-V ALU Operations)">; +def HasVendorXCValu + : Predicate<"Subtarget->hasVendorXCValu()">, + AssemblerPredicate<(all_of FeatureVendorXCValu), + "'XCValu' (CORE-V ALU Operations)">; def FeatureVendorXCVsimd : SubtargetFeature<"xcvsimd", "HasVendorXCvsimd", "true", "'XCVsimd' (CORE-V SIMD ALU)">; def HasVendorXCVsimd : Predicate<"Subtarget->hasVendorXCVsimd()">, - AssemblerPredicate<(any_of FeatureVendorXCVsimd), - "'XCVsimd' (CORE-V SIMD ALU)">; + AssemblerPredicate<(any_of FeatureVendorXCVsimd), + "'XCVsimd' (CORE-V SIMD ALU)">; def FeatureVendorXCVbi : SubtargetFeature<"xcvbi", "HasVendorXCVbi", "true", "'XCVbi' (CORE-V Immediate Branching)">; -def HasVendorXCVbi : Predicate<"Subtarget->hasVendorXCVbi()">, - AssemblerPredicate<(all_of FeatureVendorXCVbi), - "'XCVbi' (CORE-V Immediate Branching)">; +def HasVendorXCVbi + : Predicate<"Subtarget->hasVendorXCVbi()">, + AssemblerPredicate<(all_of FeatureVendorXCVbi), + "'XCVbi' (CORE-V Immediate Branching)">; //===----------------------------------------------------------------------===// // LLVM specific features and extensions @@ -979,11 +986,11 @@ def Feature32Bit def Feature64Bit : SubtargetFeature<"64bit", "IsRV64", "true", "Implements RV64">; def IsRV64 : Predicate<"Subtarget->is64Bit()">, - AssemblerPredicate<(all_of Feature64Bit), - "RV64I Base Instruction Set">; + AssemblerPredicate<(all_of Feature64Bit), + "RV64I Base Instruction Set">; def IsRV32 : Predicate<"!Subtarget->is64Bit()">, - AssemblerPredicate<(all_of (not Feature64Bit)), - "RV32I Base Instruction Set">; + AssemblerPredicate<(all_of (not Feature64Bit)), + "RV32I Base Instruction Set">; defvar RV32 = DefaultMode; def RV64 : HwMode<"+64bit", [IsRV64]>; @@ -992,7 +999,7 @@ def FeatureRVE : SubtargetFeature<"e", "IsRVE", "true", "Implements RV{32,64}E (provides 16 rather than 32 GPRs)">; def IsRVE : Predicate<"Subtarget->isRVE()">, - AssemblerPredicate<(all_of FeatureRVE)>; + AssemblerPredicate<(all_of FeatureRVE)>; def FeatureRelax : SubtargetFeature<"relax", "EnableLinkerRelax", "true", From 32073b835674a9e7bc3e1ee9708efb7c58e7394f Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Tue, 23 Jan 2024 10:05:32 -0800 Subject: [PATCH 665/843] AMDGPU: Do not generate non-temporal hint when Load_Tr intrinsic did not specify it (#79104) int_amdgcn_global_load_tr did not specify a non-temporal load transpose, thus we should not generate the non-temporal hint for the load. We need to implement getTgtMemIntrinsic to create the corresponding MemSDNode. And we don't set the non-temporal flag because the intrinsic did not specify it. NOTE: We need to implement getTgtMemIntrinsic for any memory intrinsics. 
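A minimal sketch of what such a getTgtMemIntrinsic override looks like for a load-like intrinsic is below. The hook signature and the IntrinsicInfo fields are the real TargetLowering API (they mirror the SIISelLowering.cpp hunk that follows), but `MyTargetLowering` and `Intrinsic::my_target_load` are hypothetical names used only to keep the sketch generic:

#include "llvm/CodeGen/TargetLowering.h" // TargetLoweringBase::IntrinsicInfo
#include "llvm/IR/Instructions.h"       // CallInst
using namespace llvm;

// Sketch only: assumes MyTargetLowering derives from TargetLowering elsewhere;
// Intrinsic::my_target_load is a placeholder, not a real LLVM intrinsic.
bool MyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                          const CallInst &CI,
                                          MachineFunction &MF,
                                          unsigned IntrinsicID) const {
  switch (IntrinsicID) {
  case Intrinsic::my_target_load: {
    Info.opc = ISD::INTRINSIC_W_CHAIN;      // load-like: produces result + chain
    Info.memVT = MVT::getVT(CI.getType());  // memory type matches the result
    Info.ptrVal = CI.getOperand(0);         // pointer operand, used for aliasing
    Info.align.reset();                     // no extra alignment assumption
    // Only MOLoad; MONonTemporal is deliberately *not* set because the
    // intrinsic never asked for a non-temporal access.
    Info.flags |= MachineMemOperand::MOLoad;
    return true;
  }
  default:
    return false;
  }
}

With the memory operand carrying only MOLoad, the MemSDNode built for the intrinsic no longer advertises a non-temporal access, which is why the updated tests below drop the th:TH_LOAD_NT hint from global_load_tr.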
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 9 +++++++ .../AMDGPU/llvm.amdgcn.global.load.tr-w32.ll | 24 +++++++------------ .../AMDGPU/llvm.amdgcn.global.load.tr-w64.ll | 24 +++++++------------ 3 files changed, 25 insertions(+), 32 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 073c8cc721173..cf947dccafac5 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -1348,6 +1348,14 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info, MachineMemOperand::MOVolatile; return true; } + case Intrinsic::amdgcn_global_load_tr: { + Info.opc = ISD::INTRINSIC_W_CHAIN; + Info.memVT = MVT::getVT(CI.getType()); + Info.ptrVal = CI.getOperand(0); + Info.align.reset(); + Info.flags |= MachineMemOperand::MOLoad; + return true; + } case Intrinsic::amdgcn_ds_gws_init: case Intrinsic::amdgcn_ds_gws_barrier: case Intrinsic::amdgcn_ds_gws_sema_v: @@ -1407,6 +1415,7 @@ bool SITargetLowering::getAddrModeArguments(IntrinsicInst *II, SmallVectorImpl &Ops, Type *&AccessTy) const { switch (II->getIntrinsicID()) { + case Intrinsic::amdgcn_global_load_tr: case Intrinsic::amdgcn_ds_ordered_add: case Intrinsic::amdgcn_ds_ordered_swap: case Intrinsic::amdgcn_ds_append: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll index 5382b56b92fb1..8f1e6f3ac1a0c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll @@ -13,9 +13,8 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp ; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 ; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-W32-NEXT: s_nop 0 ; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -26,9 +25,8 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp ; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32 ; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-W32-NEXT: s_nop 0 ; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -46,9 +44,8 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a ; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX12-SDAG-W32-NEXT: s_nop 0 ; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -59,9 +56,8 @@ define 
amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a ; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX12-GISEL-W32-NEXT: s_nop 0 ; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -79,9 +75,8 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr ; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX12-SDAG-W32-NEXT: s_nop 0 ; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -92,9 +87,8 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr ; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX12-GISEL-W32-NEXT: s_nop 0 ; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -112,9 +106,8 @@ define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, pt ; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX12-SDAG-W32-NEXT: s_nop 0 ; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -125,9 +118,8 @@ define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, pt ; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0 ; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32 ; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W32-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3] ; GFX12-GISEL-W32-NEXT: s_nop 0 ; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll index 0936d17563644..d5a45fb838fc7 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll @@ -13,9 +13,8 @@ define 
amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp ; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 ; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W64-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-SDAG-W64-NEXT: s_nop 0 ; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -26,9 +25,8 @@ define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrsp ; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v0, 0 ; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32 ; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W64-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX12-GISEL-W64-NEXT: s_nop 0 ; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -46,9 +44,8 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a ; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 ; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-W64-NEXT: s_nop 0 ; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -59,9 +56,8 @@ define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr a ; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 ; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-W64-NEXT: s_nop 0 ; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -79,9 +75,8 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr ; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 ; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-W64-NEXT: s_nop 0 ; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -92,9 +87,8 @@ define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr ; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], 
v2, s[0:1] offset:32 ; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-W64-NEXT: s_nop 0 ; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -112,9 +106,8 @@ define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, pt ; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 ; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-SDAG-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-SDAG-W64-NEXT: s_nop 0 ; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -125,9 +118,8 @@ define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, pt ; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24 ; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 th:TH_LOAD_NT +; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32 ; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0 -; GFX12-GISEL-W64-NEXT: global_inv scope:SCOPE_SYS ; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3] ; GFX12-GISEL-W64-NEXT: s_nop 0 ; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) From 7fe951ad8af24d9b7072cbad58d22f2e6cb5da7e Mon Sep 17 00:00:00 2001 From: gulfemsavrun Date: Tue, 23 Jan 2024 10:12:10 -0800 Subject: [PATCH 666/843] =?UTF-8?q?Revert=20"Reapply=20[hwasan]=20Update?= =?UTF-8?q?=20dbg.assign=20intrinsics=20in=20HWAsan=20pass=20=E2=80=A6=20(?= =?UTF-8?q?#79186)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …#78606" This reverts commit 13c6f1ea2e7eb15fe492d8fca4fa1857c6f86370 because it causes an assertion in DebugInfoMetadata.cpp:1968 in Clang Linux builders for Fuchsia. 
https://logs.chromium.org/logs/fuchsia/buildbucket/cr-buildbucket/8758111613576762817/+/u/clang/build/stdout --- llvm/lib/IR/DebugInfo.cpp | 4 ++ .../Instrumentation/HWAddressSanitizer.cpp | 5 -- .../Transforms/Utils/MemoryTaggingSupport.cpp | 10 +-- .../AArch64/dbg-assign-tag-offset-mix-loc.ll | 71 ------------------- .../CodeGen/AArch64/dbg-assign-tag-offset.ll | 71 ------------------- .../declare-to-assign/hwasan.ll | 2 +- .../dbg-assign-tag-offset.ll | 59 --------------- 7 files changed, 8 insertions(+), 214 deletions(-) delete mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll delete mode 100644 llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll delete mode 100644 llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index e33274895ef43..53cbf01918969 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -2211,6 +2211,10 @@ bool AssignmentTrackingPass::runOnFunction(Function &F) { if (F.hasFnAttribute(Attribute::OptimizeNone)) return /*Changed*/ false; + // FIXME: https://github.com/llvm/llvm-project/issues/76545 + if (F.hasFnAttribute(Attribute::SanitizeHWAddress)) + return /*Changed*/ false; + bool Changed = false; auto *DL = &F.getParent()->getDataLayout(); // Collect a map of {backing storage : dbg.declares} (currently "backing diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 3ca9c402b4719..efb621cde9065 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1435,11 +1435,6 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, if (DDI->getVariableLocationOp(LocNo) == AI) DDI->setExpression(DIExpression::appendOpsToArg(DDI->getExpression(), NewOps, LocNo)); - if (auto *DAI = dyn_cast(DDI)) { - if (DAI->getAddress() == AI) - DAI->setAddressExpression(DIExpression::prependOpcodes( - DAI->getAddressExpression(), NewOps)); - } } auto TagEnd = [&](Instruction *Node) { diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index d2efcde5d3803..f94047633022c 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -138,20 +138,16 @@ void StackInfoBuilder::visit(Instruction &Inst) { return; } if (auto *DVI = dyn_cast(&Inst)) { - auto AddIfInteresting = [&](Value *V) { + for (Value *V : DVI->location_ops()) { if (auto *AI = dyn_cast_or_null(V)) { if (!isInterestingAlloca(*AI)) - return; + continue; AllocaInfo &AInfo = Info.AllocasToInstrument[AI]; auto &DVIVec = AInfo.DbgVariableIntrinsics; if (DVIVec.empty() || DVIVec.back() != DVI) DVIVec.push_back(DVI); } - }; - for (Value *V : DVI->location_ops()) - AddIfInteresting(V); - if (auto *DAI = dyn_cast(DVI)) - AddIfInteresting(DAI->getAddress()); + } } Instruction *ExitUntag = getUntagLocationIfFunctionExit(Inst); if (ExitUntag) diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll deleted file mode 100644 index ef0dd46cb45c7..0000000000000 --- a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset-mix-loc.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s - -;; Similar to dbg-assign-tag-offset.ll except the variable 'x' has been removed -;; and 'y' has an implicit location range as 
well as stack location range -;; (according to the hand-modified debug info -- see the dbg.value). - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-android24" - -; CHECK: DW_TAG_variable -; CHECK-NOT: DW_TAG -; CHECK: DW_AT_LLVM_tag_offset (0x80) -; CHECK-NEXT: DW_AT_name ("y") - -define dso_local void @f() !dbg !14 { - %1 = alloca i32, align 4, !DIAssignID !31 - %2 = alloca i32, align 4, !DIAssignID !32 - call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - call void @llvm.dbg.value(metadata i32 2, metadata !20, metadata !DIExpression()), !dbg !22 - call void @use(ptr null), !dbg !28 - store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 - call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - call void @use(ptr nonnull %1), !dbg !28 - call void @use(ptr nonnull %2), !dbg !29 - ret void, !dbg !30 -} - -declare !dbg !5 void @use(ptr) - -declare void @llvm.dbg.value(metadata, metadata, metadata) -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} -!llvm.ident = !{!13} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) -!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") -!2 = !{} -!3 = !{!4, !5} -!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) -!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) -!6 = !DISubroutineType(types: !7) -!7 = !{null, !4} -!8 = !{i32 7, !"Dwarf Version", i32 4} -!9 = !{i32 2, !"Debug Info Version", i32 3} -!10 = !{i32 1, !"wchar_size", i32 4} -!11 = !{i32 7, !"PIC Level", i32 2} -!12 = !{i32 7, !"PIE Level", i32 2} -!13 = !{!"clang version 10.0.0"} -!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) -!15 = !DISubroutineType(types: !16) -!16 = !{null} -!17 = !{!18, !20} -!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) -!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) -!21 = !DILocation(line: 5, column: 3, scope: !14) -!22 = !DILocation(line: 0, scope: !14) -!23 = !DILocation(line: 5, column: 10, scope: !14) -!24 = !{!25, !25, i64 0} -!25 = !{!"int", !26, i64 0} -!26 = !{!"omnipotent char", !27, i64 0} -!27 = !{!"Simple C++ TBAA"} -!28 = !DILocation(line: 6, column: 3, scope: !14) -!29 = !DILocation(line: 7, column: 3, scope: !14) -!30 = !DILocation(line: 8, column: 1, scope: !14) -!31 = distinct !DIAssignID() -!32 = distinct !DIAssignID() -!33 = distinct !DIAssignID() -!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff --git a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll b/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll deleted file mode 100644 index a587f93d14d70..0000000000000 --- 
a/llvm/test/CodeGen/AArch64/dbg-assign-tag-offset.ll +++ /dev/null @@ -1,71 +0,0 @@ -; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-android24" - -; CHECK: DW_TAG_variable -; CHECK-NOT: DW_TAG -; CHECK: DW_AT_LLVM_tag_offset (0x00) -; CHECK-NEXT: DW_AT_name ("x") - -; CHECK: DW_TAG_variable -; CHECK-NOT: DW_TAG -; CHECK: DW_AT_LLVM_tag_offset (0x80) -; CHECK-NEXT: DW_AT_name ("y") - -define dso_local void @f() !dbg !14 { - %1 = alloca i32, align 4, !DIAssignID !31 - call void @llvm.dbg.assign(metadata i1 undef, metadata !18, metadata !DIExpression(), metadata !31, metadata ptr %1, metadata !DIExpression(DW_OP_LLVM_tag_offset, 0)), !dbg !22 - %2 = alloca i32, align 4, !DIAssignID !32 - call void @llvm.dbg.assign(metadata i1 undef, metadata !20, metadata !DIExpression(), metadata !32, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - store i32 1, ptr %2, align 4, !dbg !23, !tbaa !24, !DIAssignID !33 - call void @llvm.dbg.assign(metadata i32 1, metadata !20, metadata !DIExpression(), metadata !33, metadata ptr %2, metadata !DIExpression(DW_OP_LLVM_tag_offset, 128)), !dbg !22 - call void @use(ptr nonnull %1), !dbg !28 - call void @use(ptr nonnull %2), !dbg !29 - ret void, !dbg !30 -} - -declare !dbg !5 void @use(ptr) - -declare void @llvm.dbg.value(metadata, metadata, metadata) -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!8, !9, !10, !11, !12, !34} -!llvm.ident = !{!13} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus_14, file: !1, producer: "clang version 10.0.0", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !2, retainedTypes: !3, nameTableKind: None) -!1 = !DIFile(filename: "dbg.cc", directory: "/tmp") -!2 = !{} -!3 = !{!4, !5} -!4 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: null, size: 64) -!5 = !DISubprogram(name: "use", scope: !1, file: !1, line: 2, type: !6, flags: DIFlagPrototyped, spFlags: DISPFlagOptimized, retainedNodes: !2) -!6 = !DISubroutineType(types: !7) -!7 = !{null, !4} -!8 = !{i32 7, !"Dwarf Version", i32 4} -!9 = !{i32 2, !"Debug Info Version", i32 3} -!10 = !{i32 1, !"wchar_size", i32 4} -!11 = !{i32 7, !"PIC Level", i32 2} -!12 = !{i32 7, !"PIE Level", i32 2} -!13 = !{!"clang version 10.0.0"} -!14 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 4, type: !15, scopeLine: 4, flags: DIFlagPrototyped | DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !17) -!15 = !DISubroutineType(types: !16) -!16 = !{null} -!17 = !{!18, !20} -!18 = !DILocalVariable(name: "x", scope: !14, file: !1, line: 5, type: !19) -!19 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) -!20 = !DILocalVariable(name: "y", scope: !14, file: !1, line: 5, type: !19) -!21 = !DILocation(line: 5, column: 3, scope: !14) -!22 = !DILocation(line: 0, scope: !14) -!23 = !DILocation(line: 5, column: 10, scope: !14) -!24 = !{!25, !25, i64 0} -!25 = !{!"int", !26, i64 0} -!26 = !{!"omnipotent char", !27, i64 0} -!27 = !{!"Simple C++ TBAA"} -!28 = !DILocation(line: 6, column: 3, scope: !14) -!29 = !DILocation(line: 7, column: 3, scope: !14) -!30 = !DILocation(line: 8, column: 1, scope: !14) -!31 = distinct !DIAssignID() -!32 = distinct !DIAssignID() -!33 = distinct !DIAssignID() -!34 = !{i32 7, !"debug-info-assignment-tracking", i1 true} diff 
--git a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll index 6c9366609cba2..c4b209de77017 100644 --- a/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll +++ b/llvm/test/DebugInfo/Generic/assignment-tracking/declare-to-assign/hwasan.ll @@ -1,6 +1,6 @@ ; RUN: opt %s -S -passes=declare-to-assign -o - | FileCheck %s -; CHECK: call void @llvm.dbg.assign +; CHECK: call void @llvm.dbg.declare define dso_local void @f() sanitize_hwaddress !dbg !9 { entry: diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll b/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll deleted file mode 100644 index f91d2aa110a87..0000000000000 --- a/llvm/test/Instrumentation/HWAddressSanitizer/dbg-assign-tag-offset.ll +++ /dev/null @@ -1,59 +0,0 @@ -; RUN: opt -passes=hwasan -S -o - %s | FileCheck %s - -source_filename = "test.ll" -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64--linux-android" - -declare void @g(ptr, ptr, ptr, ptr, ptr, ptr) - -; Function Attrs: sanitize_hwaddress -define void @f() #0 !dbg !7 { -entry: - %nodebug0 = alloca ptr, align 8 - %nodebug1 = alloca ptr, align 8 - %nodebug2 = alloca ptr, align 8 - %nodebug3 = alloca ptr, align 8 - ; CHECK: %a = alloca{{.*}} !DIAssignID ![[ID1:[0-9]+]] - %a = alloca ptr, align 8, !DIAssignID !13 - ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID1]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 32) - call void @llvm.dbg.assign(metadata i1 undef, metadata !14, metadata !DIExpression(), metadata !13, metadata ptr %a, metadata !DIExpression()), !dbg !15 - ; CHECK: %b = alloca{{.*}} !DIAssignID ![[ID2:[0-9]+]] - %b = alloca ptr, align 8, !DIAssignID !16 - ; CHECK: @llvm.dbg.assign{{.*}} metadata ![[ID2]]{{.*}} !DIExpression(DW_OP_LLVM_tag_offset, 96) - call void @llvm.dbg.assign(metadata i1 undef, metadata !17, metadata !DIExpression(DW_OP_plus_uconst, 1), metadata !16, metadata ptr %b, metadata !DIExpression()), !dbg !15 - call void @g(ptr %nodebug0, ptr %nodebug1, ptr %nodebug2, ptr %nodebug3, ptr %a, ptr %b) - ret void, !dbg !18 -} - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare void @llvm.dbg.declare(metadata, metadata, metadata) #1 - -; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) -declare void @llvm.dbg.assign(metadata, metadata, metadata, metadata, metadata, metadata) #1 - -attributes #0 = { sanitize_hwaddress } -attributes #1 = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!3, !4, !5} -!llvm.ident = !{!6} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !1, producer: "clang", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, enums: !2) -!1 = !DIFile(filename: "x.c", directory: "/") -!2 = !{} -!3 = !{i32 2, !"Dwarf Version", i32 4} -!4 = !{i32 2, !"Debug Info Version", i32 3} -!5 = !{i32 7, !"debug-info-assignment-tracking", i1 true} -!6 = !{!"clang"} -!7 = distinct !DISubprogram(name: "f", scope: !1, file: !1, line: 1, type: !8, scopeLine: 1, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !0, retainedNodes: !2) -!8 = !DISubroutineType(types: !9) -!9 = !{null, !10} -!10 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !11, size: 64) -!11 = !DIDerivedType(tag: DW_TAG_const_type, baseType: !12) -!12 = !DIBasicType(name: "char", 
size: 8, encoding: DW_ATE_signed_char)
-!13 = distinct !DIAssignID()
-!14 = !DILocalVariable(name: "a", scope: !7, file: !1, line: 1, type: !10)
-!15 = !DILocation(line: 0, scope: !7)
-!16 = distinct !DIAssignID()
-!17 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 1, type: !10)
-!18 = !DILocation(line: 1, column: 37, scope: !7)

From 51a11f1c44b7bd7655687471d8c8912afb637efc Mon Sep 17 00:00:00 2001
From: Tacet
Date: Tue, 23 Jan 2024 19:12:57 +0100
Subject: [PATCH 667/843] [ASan][ADT] Don't scribble with ASan (#79066)

With this commit, scribbling under AddressSanitizer (ASan) is disabled
to prevent overwriting poisoned objects (e.g., annotated short strings).

Needed by https://github.com/llvm/llvm-project/pull/79049

---
 llvm/include/llvm/ADT/FunctionExtras.h | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/ADT/FunctionExtras.h b/llvm/include/llvm/ADT/FunctionExtras.h
index 4cf1de488c7bd..c0bc30c7450fe 100644
--- a/llvm/include/llvm/ADT/FunctionExtras.h
+++ b/llvm/include/llvm/ADT/FunctionExtras.h
@@ -35,6 +35,7 @@
 #include "llvm/ADT/PointerIntPair.h"
 #include "llvm/ADT/PointerUnion.h"
 #include "llvm/ADT/STLForwardCompat.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/MemAlloc.h"
 #include "llvm/Support/type_traits.h"
 #include <cstring>
@@ -317,8 +318,10 @@ template <typename ReturnT, typename... ParamTs> class UniqueFunctionBase {
     // Clear the old callback and inline flag to get back to as-if-null.
     RHS.CallbackAndInlineFlag = {};

-#ifndef NDEBUG
-  // In debug builds, we also scribble across the rest of the storage.
+#if !defined(NDEBUG) && !LLVM_ADDRESS_SANITIZER_BUILD
+  // In debug builds without ASan, we also scribble across the rest of the
+  // storage. Scribbling under AddressSanitizer (ASan) is disabled to prevent
+  // overwriting poisoned objects (e.g., annotated short strings).
   memset(RHS.getInlineStorage(), 0xAD, InlineStorageSize);
 #endif
   }

From 5a7d68c8c4b13aea9922c860e56f6c6cdc223d87 Mon Sep 17 00:00:00 2001
From: Danial Klimkin
Date: Tue, 23 Jan 2024 19:14:57 +0100
Subject: [PATCH 668/843] [test] Avoid libc dep in Update
 warn-unsafe-buffer-usage-warning-data… (#79183)

Avoid libc dep in warn-unsafe-buffer-usage-warning-data-invocation, to
keep the test hermetic. This is in line with other existing declarations
in the file that avoid includes.
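For illustration only (not part of the patch itself), a minimal sketch of the hermetic-test pattern being adopted here; the `roundtrip` function and its use are invented for the example:

```cpp
// Sketch: derive the needed typedef from a compiler-predefined macro
// instead of including a libc header. Clang and GCC both predefine
// __INTPTR_TYPE__, so the test no longer depends on installed headers.
typedef __INTPTR_TYPE__ intptr_t;

// Hypothetical use: the typedef round-trips a pointer exactly as the
// libc-provided intptr_t would.
void roundtrip(int *p) {
  intptr_t addr = reinterpret_cast<intptr_t>(p);
  (void)addr; // silence unused-variable warnings
}
```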
---
 .../warn-unsafe-buffer-usage-warning-data-invocation.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp
index 574afcd0eb6dc..5c9df12513e52 100644
--- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp
+++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp
@@ -7,7 +7,6 @@
 // RUN: %clang_cc1 -std=c++20 -fblocks -include %s %s 2>&1 | FileCheck --allow-empty %s
 // CHECK-NOT: [-Wunsafe-buffer-usage]

-#include <stdint.h>
 #ifndef INCLUDED
 #define INCLUDED
 #pragma clang system_header
@@ -15,6 +14,8 @@
 // no spanification warnings for system headers
 #else

+typedef __INTPTR_TYPE__ intptr_t;
+
 namespace std {
   class type_info;
   class bad_cast;

From 80fcc9247a166ca056f049fc4662417b2ae157ce Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Tue, 23 Jan 2024 12:16:18 -0600
Subject: [PATCH 669/843] [mlir][AMDGPU] Actually update the default ABI
 version, add comments (#79185)

Much confusion occurred earlier today when updating the fallback `int
abi;` in addControlVariables() didn't do anything. This was because that
value is only the fallback for when the ABI version fails to parse,
which should never happen, because it has a default value that comes
from multiple different places.

This commit updates all the places said default value can come from,
namely:
1. The ROCDL target attribute definition
2. The ROCDL target attribute's builders
3. The rocdl-attach-target pass's default option values.

With this, the printf test is passing.

---
 mlir/include/mlir/Dialect/GPU/Transforms/Passes.td | 4 ++--
 mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td       | 6 ++++--
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
index 3e0f6a3022f93..a8235bed6f276 100644
--- a/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/GPU/Transforms/Passes.td
@@ -178,8 +178,8 @@ def GpuROCDLAttachTarget: Pass<"rocdl-attach-target", ""> {
            /*default=*/"\"\"",
            "Target features.">,
     Option<"abiVersion", "abi", "std::string",
-           /*default=*/"\"400\"",
-           "Optimization level.">,
+           /*default=*/"\"500\"",
+           "ABI version.">,
     Option<"optLevel", "O", "unsigned",
            /*default=*/"2",
            "Optimization level.">,
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 48b830ae34f29..516a984399ff8 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -635,7 +635,9 @@ def ROCDL_TargettAttr :
     StringRefParameter<"Target triple.", "\"amdgcn-amd-amdhsa\"">:$triple,
     StringRefParameter<"Target chip.", "\"gfx900\"">:$chip,
     StringRefParameter<"Target chip features.", "\"\"">:$features,
-    StringRefParameter<"ABI version.", "\"400\"">:$abi,
+    // Also update the default builder below and rocdl-attach-target in
+    // Dialect/GPU/Transforms/Passes.td .
+    StringRefParameter<"ABI version.", "\"500\"">:$abi,
     OptionalParameter<"DictionaryAttr", "Target specific flags.">:$flags,
     OptionalParameter<"ArrayAttr", "Files to link to the LLVM module.">:$link
   );
@@ -647,7 +649,7 @@ def ROCDL_TargettAttr :
     CArg<"StringRef", "\"amdgcn-amd-amdhsa\"">:$triple,
     CArg<"StringRef", "\"gfx900\"">:$chip,
     CArg<"StringRef", "\"\"">:$features,
-    CArg<"StringRef", "\"400\"">:$abiVersion,
+    CArg<"StringRef", "\"500\"">:$abiVersion,
     CArg<"DictionaryAttr", "nullptr">:$targetFlags,
     CArg<"ArrayAttr", "nullptr">:$linkFiles), [{
       return Base::get($_ctxt, optLevel, triple, chip, features, abiVersion,

From c416b2efe89c11db593fe8041c366e0cb63d4eeb Mon Sep 17 00:00:00 2001
From: Tacet
Date: Tue, 23 Jan 2024 19:16:53 +0100
Subject: [PATCH 670/843] [ASan][JSON] Unpoison memory before its reuse (#79065)

This commit unpoisons memory before its reuse (with reinterpret_cast).

Required by https://github.com/llvm/llvm-project/pull/79049

Notice that it's a temporary solution to prevent buildbots from failing.
Read FIXME for details.

---
 llvm/include/llvm/Support/JSON.h | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/llvm/include/llvm/Support/JSON.h b/llvm/include/llvm/Support/JSON.h
index a81881c52d6c9..8b437bbabd962 100644
--- a/llvm/include/llvm/Support/JSON.h
+++ b/llvm/include/llvm/Support/JSON.h
@@ -47,9 +47,10 @@
 #define LLVM_SUPPORT_JSON_H

 #include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/STLFunctionalExtras.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/Support/Error.h"
 #include "llvm/Support/FormatVariadic.h"
 #include "llvm/Support/raw_ostream.h"
@@ -482,6 +483,18 @@ class Value {
   friend class Object;

   template <typename T, typename... U> void create(U &&... V) {
+#if LLVM_ADDRESS_SANITIZER_BUILD
+    // Unpoisoning to prevent overwriting poisoned object (e.g., annotated short
+    // string). Objects that have had their memory poisoned may cause an ASan
+    // error if their memory is reused without calling their destructor.
+    // Unpoisoning the memory prevents this error from occurring.
+    // FIXME: This is a temporary solution to prevent buildbots from failing.
+    // The more appropriate approach would be to call the object's destructor
+    // to unpoison memory. This would prevent any potential memory leaks (long
+    // strings). Read for details:
+    // https://github.com/llvm/llvm-project/pull/79065#discussion_r1462621761
+    __asan_unpoison_memory_region(&Union, sizeof(T));
+#endif
     new (reinterpret_cast<T *>(&Union)) T(std::forward<U>(V)...);
   }
   template <typename T> T &as() const {

From cb528ec5e6331ce207c7b835d7ab963bd5e13af7 Mon Sep 17 00:00:00 2001
From: Tacet
Date: Tue, 23 Jan 2024 19:18:53 +0100
Subject: [PATCH 671/843] [ASan][libc++] Turn on ASan annotations for short
 strings (#79049)

Originally merged here: https://github.com/llvm/llvm-project/pull/75882
Reverted here: https://github.com/llvm/llvm-project/pull/78627

Reverted due to failing buildbots. The problem was not caused by the
annotations code, but by code in the `UniqueFunctionBase` class and in
the `JSON.h` file. That code caused the program to write to memory that
was already being used by string objects, which resulted in an ASan
error.

Fixes are implemented in:
- https://github.com/llvm/llvm-project/pull/79065
- https://github.com/llvm/llvm-project/pull/79066

Problematic code from `UniqueFunctionBase` for example:
```cpp
#ifndef NDEBUG
  // In debug builds, we also scribble across the rest of the storage.
  memset(RHS.getInlineStorage(), 0xAD, InlineStorageSize);
#endif
```

---

Original description:

This commit turns on ASan annotations in `std::basic_string` for short
strings (SSO case).

Originally suggested here: https://reviews.llvm.org/D147680
String annotations added here: https://github.com/llvm/llvm-project/pull/72677

Required to pass CI without failures:
- https://github.com/llvm/llvm-project/pull/75845
- https://github.com/llvm/llvm-project/pull/75858

Annotating `std::basic_string` with the default allocator is implemented
in https://github.com/llvm/llvm-project/pull/72677 but annotations for
short strings (SSO - Short String Optimization) are turned off there.
This commit turns them on. This also removes
`_LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED`, because we do not plan to
support turning short string annotations on and off.

Support in the ASan API exists since
https://github.com/llvm/llvm-project/commit/dd1b7b797a116eed588fd752fbe61d34deeb24e4.
You can turn off annotations for a specific allocator based on changes from
https://github.com/llvm/llvm-project/commit/2fa1bec7a20bb23f2e6620085adb257dafaa3be0.

This PR is part of a series of patches extending AddressSanitizer C++
container overflow detection capabilities by adding annotations, similar
to those existing in `std::vector` and `std::deque` collections. These
enhancements empower ASan to effectively detect instances where the
instrumented program attempts to access memory within a collection's
internal allocation that remains unused. This includes cases where
access occurs before or after the stored elements in `std::deque`, or
between the `std::basic_string`'s size (including the null terminator)
and capacity bounds.

The introduction of these annotations was spurred by a real-world
software bug discovered by Trail of Bits, involving an out-of-bounds
memory access during the comparison of two strings using the
`std::equal` function. This function was taking iterators
(`iter1_begin`, `iter1_end`, `iter2_begin`) to perform the comparison,
using a custom comparison function. When the `iter1` object exceeded the
length of `iter2`, an out-of-bounds read could occur on the `iter2`
object. Container sanitization, upon enabling these annotations, would
effectively identify and flag this potential vulnerability.

If you have any questions, please email:
advenam.tacet@trailofbits.com
disconnect3d@trailofbits.com

---
 libcxx/include/string                        |  14 +-
 .../asan_deque_integration.pass.cpp          | 182 ++++++++++++++++++
 .../strings/basic.string/asan_short.pass.cpp |  56 ++++++
 .../asan_vector_integration.pass.cpp         | 182 ++++++++++++++++++
 libcxx/test/support/asan_testing.h           |  29 +--
 5 files changed, 429 insertions(+), 34 deletions(-)
 create mode 100644 libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp
 create mode 100644 libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp

diff --git a/libcxx/include/string b/libcxx/include/string
index e97139206d4fa..4116f350a8047 100644
--- a/libcxx/include/string
+++ b/libcxx/include/string
@@ -659,7 +659,6 @@ _LIBCPP_PUSH_MACROS
 #else
 #  define _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS
 #endif
-#define _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED false

 _LIBCPP_BEGIN_NAMESPACE_STD

@@ -1896,22 +1895,17 @@ private:
 #endif
   }

-  // ASan: short string is poisoned if and only if this function returns true.
- _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __asan_short_string_is_annotated() const _NOEXCEPT { - return _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED && !__libcpp_is_constant_evaluated(); - } - _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_new(size_type __current_size) const _NOEXCEPT { (void) __current_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) + if (!__libcpp_is_constant_evaluated()) __annotate_contiguous_container(data() + capacity() + 1, data() + __current_size + 1); #endif } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_delete() const _NOEXCEPT { #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) + if (!__libcpp_is_constant_evaluated()) __annotate_contiguous_container(data() + size() + 1, data() + capacity() + 1); #endif } @@ -1919,7 +1913,7 @@ private: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_increase(size_type __n) const _NOEXCEPT { (void) __n; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) + if (!__libcpp_is_constant_evaluated()) __annotate_contiguous_container(data() + size() + 1, data() + size() + 1 + __n); #endif } @@ -1927,7 +1921,7 @@ private: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_shrink(size_type __old_size) const _NOEXCEPT { (void) __old_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) + if (!__libcpp_is_constant_evaluated()) __annotate_contiguous_container(data() + __old_size + 1, data() + size() + 1); #endif } diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp new file mode 100644 index 0000000000000..1205190b3a6e1 --- /dev/null +++ b/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp @@ -0,0 +1,182 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: asan +// UNSUPPORTED: c++03 + +#include +#include +#include +#include +#include "test_macros.h" +#include "asan_testing.h" +#include "min_allocator.h" + +// This tests exists to check if strings work well with deque, as those +// may be partialy annotated, we cannot simply call +// is_double_ended_contiguous_container_asan_correct, as it assumes that +// object memory inside is not annotated, so we check everything in a more careful way. + +template +void verify_inside(D const& d) { + for (size_t i = 0; i < d.size(); ++i) { + assert(is_string_asan_correct(d[i])); + } +} + +template +S get_s(char c) { + S s; + for (size_t i = 0; i < N; ++i) + s.push_back(c); + + return s; +} + +template +void test_string() { + size_t const N = sizeof(S) < 256 ? 
(4096 / sizeof(S)) : 16; + + { + C d1a(1), d1b(N), d1c(N + 1), d1d(5 * N); + verify_inside(d1a); + verify_inside(d1b); + verify_inside(d1c); + verify_inside(d1d); + } + { + C d2; + for (size_t i = 0; i < 3 * N + 2; ++i) { + d2.push_back(get_s(i % 10 + 'a')); + verify_inside(d2); + d2.push_back(get_s(i % 10 + 'b')); + verify_inside(d2); + + d2.pop_front(); + verify_inside(d2); + } + } + { + C d3; + for (size_t i = 0; i < 3 * N + 2; ++i) { + d3.push_front(get_s(i % 10 + 'a')); + verify_inside(d3); + d3.push_front(get_s(i % 10 + 'b')); + verify_inside(d3); + + d3.pop_back(); + verify_inside(d3); + } + } + { + C d4; + for (size_t i = 0; i < 3 * N + 2; ++i) { + // When there is no SSO, all elements inside should not be poisoned, + // so we can verify deque poisoning. + d4.push_front(get_s(i % 10 + 'a')); + verify_inside(d4); + assert(is_double_ended_contiguous_container_asan_correct(d4)); + d4.push_back(get_s(i % 10 + 'b')); + verify_inside(d4); + assert(is_double_ended_contiguous_container_asan_correct(d4)); + } + } + { + C d5; + for (size_t i = 0; i < 3 * N + 2; ++i) { + // In d4 we never had poisoned memory inside deque. + // Here we start with SSO, so part of the inside of the container, + // will be poisoned. + d5.push_front(S()); + verify_inside(d5); + } + for (size_t i = 0; i < d5.size(); ++i) { + // We change the size to have long string. + // Memory owne by deque should not be poisoned by string. + d5[i].resize(100); + verify_inside(d5); + } + + assert(is_double_ended_contiguous_container_asan_correct(d5)); + + d5.erase(d5.begin() + 2); + verify_inside(d5); + + d5.erase(d5.end() - 2); + verify_inside(d5); + + assert(is_double_ended_contiguous_container_asan_correct(d5)); + } + { + C d6a; + assert(is_double_ended_contiguous_container_asan_correct(d6a)); + + C d6b(N + 2, get_s('a')); + d6b.push_front(get_s('b')); + while (!d6b.empty()) { + d6b.pop_back(); + assert(is_double_ended_contiguous_container_asan_correct(d6b)); + } + + C d6c(N + 2, get_s('c')); + while (!d6c.empty()) { + d6c.pop_back(); + assert(is_double_ended_contiguous_container_asan_correct(d6c)); + } + } + { + C d7(9 * N + 2); + + d7.insert(d7.begin() + 1, S()); + verify_inside(d7); + + d7.insert(d7.end() - 3, S()); + verify_inside(d7); + + d7.insert(d7.begin() + 2 * N, get_s('a')); + verify_inside(d7); + + d7.insert(d7.end() - 2 * N, get_s('b')); + verify_inside(d7); + + d7.insert(d7.begin() + 2 * N, 3 * N, get_s('c')); + verify_inside(d7); + + // It may not be short for big element types, but it will be checked correctly: + d7.insert(d7.end() - 2 * N, 3 * N, get_s('d')); + verify_inside(d7); + + d7.erase(d7.begin() + 2); + verify_inside(d7); + + d7.erase(d7.end() - 2); + verify_inside(d7); + } +} + +template +void test_container() { + test_string>, S>(); + test_string>, S>(); + test_string>, S>(); +} + +int main(int, char**) { + // Those tests support only types based on std::basic_string. 
+ test_container(); + test_container(); +#if TEST_STD_VER >= 11 + test_container(); + test_container(); +#endif +#if TEST_STD_VER >= 20 + test_container(); +#endif + + return 0; +} diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp new file mode 100644 index 0000000000000..53c70bed189b5 --- /dev/null +++ b/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp @@ -0,0 +1,56 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: asan +// UNSUPPORTED: c++03 + +// + +// Basic test if ASan annotations work for short strings. + +#include +#include +#include + +#include "asan_testing.h" +#include "min_allocator.h" +#include "test_iterators.h" +#include "test_macros.h" + +extern "C" void __sanitizer_set_death_callback(void (*callback)(void)); + +void do_exit() { exit(0); } + +int main(int, char**) { + { + typedef cpp17_input_iterator MyInputIter; + // Should not trigger ASan. + std::basic_string, safe_allocator> v; + char i[] = {'a', 'b', 'c', 'd'}; + + v.insert(v.begin(), MyInputIter(i), MyInputIter(i + 4)); + assert(v[0] == 'a'); + assert(is_string_asan_correct(v)); + } + + __sanitizer_set_death_callback(do_exit); + { + using T = char; + using C = std::basic_string, safe_allocator>; + const T t[] = {'a', 'b', 'c', 'd', 'e', 'f', 'g'}; + C c(std::begin(t), std::end(t)); + assert(is_string_asan_correct(c)); + assert(__sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) != + 0); + volatile T foo = c[c.size() + 1]; // should trigger ASAN. Use volatile to prevent being optimized away. + assert(false); // if we got here, ASAN didn't trigger + ((void)foo); + } + + return 0; +} diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp new file mode 100644 index 0000000000000..b7d95b7069083 --- /dev/null +++ b/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp @@ -0,0 +1,182 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// REQUIRES: asan +// UNSUPPORTED: c++03 + +#include +#include +#include +#include +#include "test_macros.h" +#include "asan_testing.h" +#include "min_allocator.h" + +// This tests exists to check if strings work well with vector, as those +// may be partialy annotated, we cannot simply call +// is_contiguous_container_asan_correct, as it assumes that +// object memory inside is not annotated, so we check everything in a more careful way. 
+ +template +void verify_inside(D const& d) { + for (size_t i = 0; i < d.size(); ++i) { + assert(is_string_asan_correct(d[i])); + } +} + +template +S get_s(char c) { + S s; + for (size_t i = 0; i < N; ++i) + s.push_back(c); + + return s; +} + +template +void test_string() { + size_t const N = sizeof(S) < 256 ? (4096 / sizeof(S)) : 16; + + { + C d1a(1), d1b(N), d1c(N + 1), d1d(5 * N); + verify_inside(d1a); + verify_inside(d1b); + verify_inside(d1c); + verify_inside(d1d); + } + { + C d2; + for (size_t i = 0; i < 3 * N + 2; ++i) { + d2.push_back(get_s(i % 10 + 'a')); + verify_inside(d2); + d2.push_back(get_s(i % 10 + 'b')); + verify_inside(d2); + + d2.erase(d2.cbegin()); + verify_inside(d2); + } + } + { + C d3; + for (size_t i = 0; i < 3 * N + 2; ++i) { + d3.push_back(get_s(i % 10 + 'a')); + verify_inside(d3); + d3.push_back(get_s(i % 10 + 'b')); + verify_inside(d3); + + d3.pop_back(); + verify_inside(d3); + } + } + { + C d4; + for (size_t i = 0; i < 3 * N + 2; ++i) { + // When there is no SSO, all elements inside should not be poisoned, + // so we can verify vector poisoning. + d4.push_back(get_s(i % 10 + 'a')); + verify_inside(d4); + assert(is_contiguous_container_asan_correct(d4)); + d4.push_back(get_s(i % 10 + 'b')); + verify_inside(d4); + assert(is_contiguous_container_asan_correct(d4)); + } + } + { + C d5; + for (size_t i = 0; i < 3 * N + 2; ++i) { + // In d4 we never had poisoned memory inside vector. + // Here we start with SSO, so part of the inside of the container, + // will be poisoned. + d5.push_back(S()); + verify_inside(d5); + } + for (size_t i = 0; i < d5.size(); ++i) { + // We change the size to have long string. + // Memory owne by vector should not be poisoned by string. + d5[i].resize(100); + verify_inside(d5); + } + + assert(is_contiguous_container_asan_correct(d5)); + + d5.erase(d5.begin() + 2); + verify_inside(d5); + + d5.erase(d5.end() - 2); + verify_inside(d5); + + assert(is_contiguous_container_asan_correct(d5)); + } + { + C d6a; + assert(is_contiguous_container_asan_correct(d6a)); + + C d6b(N + 2, get_s('a')); + d6b.push_back(get_s('b')); + while (!d6b.empty()) { + d6b.pop_back(); + assert(is_contiguous_container_asan_correct(d6b)); + } + + C d6c(N + 2, get_s('c')); + while (!d6c.empty()) { + d6c.pop_back(); + assert(is_contiguous_container_asan_correct(d6c)); + } + } + { + C d7(9 * N + 2); + + d7.insert(d7.begin() + 1, S()); + verify_inside(d7); + + d7.insert(d7.end() - 3, S()); + verify_inside(d7); + + d7.insert(d7.begin() + 2 * N, get_s('a')); + verify_inside(d7); + + d7.insert(d7.end() - 2 * N, get_s('b')); + verify_inside(d7); + + d7.insert(d7.begin() + 2 * N, 3 * N, get_s('c')); + verify_inside(d7); + + // It may not be short for big element types, but it will be checked correctly: + d7.insert(d7.end() - 2 * N, 3 * N, get_s('d')); + verify_inside(d7); + + d7.erase(d7.begin() + 2); + verify_inside(d7); + + d7.erase(d7.end() - 2); + verify_inside(d7); + } +} + +template +void test_container() { + test_string>, S>(); + test_string>, S>(); + test_string>, S>(); +} + +int main(int, char**) { + // Those tests support only types based on std::basic_string. 
+ test_container(); + test_container(); +#if TEST_STD_VER >= 11 + test_container(); + test_container(); +#endif +#if TEST_STD_VER >= 20 + test_container(); +#endif + + return 0; +} diff --git a/libcxx/test/support/asan_testing.h b/libcxx/test/support/asan_testing.h index 6bfc8280a4ead..3785c1f9c20de 100644 --- a/libcxx/test/support/asan_testing.h +++ b/libcxx/test/support/asan_testing.h @@ -56,35 +56,16 @@ TEST_CONSTEXPR bool is_double_ended_contiguous_container_asan_correct(const std: #endif #if TEST_HAS_FEATURE(address_sanitizer) -template -bool is_string_short(S const& s) { - // We do not have access to __is_long(), but we can check if strings - // buffer is inside strings memory. If strings memory contains its content, - // SSO is in use. To check it, we can just confirm that the beginning is in - // the string object memory block. - // &s - beginning of objects memory - // &s[0] - beginning of the buffer - // (&s+1) - end of objects memory - return (void*)std::addressof(s) <= (void*)std::addressof(s[0]) && - (void*)std::addressof(s[0]) < (void*)(std::addressof(s) + 1); -} - template TEST_CONSTEXPR bool is_string_asan_correct(const std::basic_string& c) { if (TEST_IS_CONSTANT_EVALUATED) return true; - if (!is_string_short(c) || _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED) { - if (std::__asan_annotate_container_with_allocator::value) - return __sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) != - 0; - else - return __sanitizer_verify_contiguous_container( - c.data(), c.data() + c.capacity() + 1, c.data() + c.capacity() + 1) != 0; - } else { - return __sanitizer_verify_contiguous_container(std::addressof(c), std::addressof(c) + 1, std::addressof(c) + 1) != - 0; - } + if (std::__asan_annotate_container_with_allocator::value) + return __sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) != 0; + else + return __sanitizer_verify_contiguous_container( + c.data(), c.data() + c.capacity() + 1, c.data() + c.capacity() + 1) != 0; } #else # include From 0cf20c2971375ea200035f2e99bd81bd00f5f27d Mon Sep 17 00:00:00 2001 From: Schrodinger ZHU Yifan Date: Tue, 23 Jan 2024 13:31:48 -0500 Subject: [PATCH 672/843] [libc] fix sysconf (#79159) Apply previously discussed fix for `sysconf` --- libc/src/unistd/linux/CMakeLists.txt | 2 ++ libc/src/unistd/linux/sysconf.cpp | 10 +++++----- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/libc/src/unistd/linux/CMakeLists.txt b/libc/src/unistd/linux/CMakeLists.txt index ab9eca19e6508..42190079141b0 100644 --- a/libc/src/unistd/linux/CMakeLists.txt +++ b/libc/src/unistd/linux/CMakeLists.txt @@ -402,7 +402,9 @@ add_entrypoint_object( ../sysconf.h DEPENDS libc.include.unistd + libc.include.sys_auxv libc.src.errno.errno + libc.src.sys.auxv.getauxval ) add_entrypoint_object( diff --git a/libc/src/unistd/linux/sysconf.cpp b/libc/src/unistd/linux/sysconf.cpp index b16e15551fc78..d4577c8d3d769 100644 --- a/libc/src/unistd/linux/sysconf.cpp +++ b/libc/src/unistd/linux/sysconf.cpp @@ -11,17 +11,17 @@ #include "src/__support/common.h" #include "src/errno/libc_errno.h" -#include // For EXEC_PAGESIZE. +#include "src/sys/auxv/getauxval.h" +#include #include namespace LIBC_NAMESPACE { LLVM_LIBC_FUNCTION(long, sysconf, (int name)) { long ret = 0; - if (name == _SC_PAGESIZE) { - // TODO: get this information from the auxvector. 
-    return EXEC_PAGESIZE;
-  }
+  if (name == _SC_PAGESIZE)
+    return static_cast<long>(getauxval(AT_PAGESZ));
+
   // TODO: Complete the rest of the sysconf options.
   if (ret < 0) {
     libc_errno = EINVAL;

From bb8a8770e203ba027d141cd1200e93809ea66c8f Mon Sep 17 00:00:00 2001
From: Philip Reames
Date: Tue, 23 Jan 2024 10:36:22 -0800
Subject: [PATCH 673/843] [RISCV] Exploit register boundaries when lowering
 shuffle with exact vlen (#79072)

If we have a shuffle which is larger than m1, we may be able to split it
into a series of individual m1 shuffles. This patch starts with the
subcase where the mask allows a 1-to-1 mapping from source register to
destination register - each with a possible permutation of their own. We
can potentially extend this later, though in practice this seems to
already catch a number of the most interesting cases.

---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp  |  84 +++++++++++++++
 .../rvv/fixed-vectors-shuffle-exact-vlen.ll  | 100 ++++++------------
 2 files changed, 118 insertions(+), 66 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 2fd4479ca5fe0..ae1ebcb51c065 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -4652,6 +4652,85 @@ static SDValue lowerVECTOR_SHUFFLEAsRotate(ShuffleVectorSDNode *SVN,
   return DAG.getBitcast(VT, Rotate);
 }

+// If compiling with an exactly known VLEN, see if we can split a
+// shuffle on m2 or larger into a small number of m1 sized shuffles
+// which write each destination registers exactly once.
+static SDValue lowerShuffleViaVRegSplitting(ShuffleVectorSDNode *SVN,
+                                            SelectionDAG &DAG,
+                                            const RISCVSubtarget &Subtarget) {
+  SDLoc DL(SVN);
+  MVT VT = SVN->getSimpleValueType(0);
+  SDValue V1 = SVN->getOperand(0);
+  SDValue V2 = SVN->getOperand(1);
+  ArrayRef<int> Mask = SVN->getMask();
+  unsigned NumElts = VT.getVectorNumElements();
+
+  // If we don't know exact data layout, not much we can do.  If this
+  // is already m1 or smaller, no point in splitting further.
+  const unsigned MinVLen = Subtarget.getRealMinVLen();
+  const unsigned MaxVLen = Subtarget.getRealMaxVLen();
+  if (MinVLen != MaxVLen || VT.getSizeInBits().getFixedValue() <= MinVLen)
+    return SDValue();
+
+  MVT ElemVT = VT.getVectorElementType();
+  unsigned ElemsPerVReg = MinVLen / ElemVT.getFixedSizeInBits();
+  unsigned VRegsPerSrc = NumElts / ElemsPerVReg;
+
+  SmallVector<std::pair<int, SmallVector<int>>>
+    OutMasks(VRegsPerSrc, {-1, {}});
+
+  // Check if our mask can be done as a 1-to-1 mapping from source
+  // to destination registers in the group without needing to
+  // write each destination more than once.
+  for (unsigned DstIdx = 0; DstIdx < Mask.size(); DstIdx++) {
+    int DstVecIdx = DstIdx / ElemsPerVReg;
+    int DstSubIdx = DstIdx % ElemsPerVReg;
+    int SrcIdx = Mask[DstIdx];
+    if (SrcIdx < 0 || (unsigned)SrcIdx >= 2 * NumElts)
+      continue;
+    int SrcVecIdx = SrcIdx / ElemsPerVReg;
+    int SrcSubIdx = SrcIdx % ElemsPerVReg;
+    if (OutMasks[DstVecIdx].first == -1)
+      OutMasks[DstVecIdx].first = SrcVecIdx;
+    if (OutMasks[DstVecIdx].first != SrcVecIdx)
+      // Note: This case could easily be handled by keeping track of a chain
+      // of source values and generating two element shuffles below.  This is
+      // less an implementation question, and more a profitability one.
+      return SDValue();
+
+    OutMasks[DstVecIdx].second.resize(ElemsPerVReg, -1);
+    OutMasks[DstVecIdx].second[DstSubIdx] = SrcSubIdx;
+  }
+
+  EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT, Subtarget);
+  MVT OneRegVT = MVT::getVectorVT(ElemVT, ElemsPerVReg);
+  MVT M1VT = getContainerForFixedLengthVector(DAG, OneRegVT, Subtarget);
+  assert(M1VT == getLMUL1VT(M1VT));
+  unsigned NumOpElts = M1VT.getVectorMinNumElements();
+  SDValue Vec = DAG.getUNDEF(ContainerVT);
+  // The following semantically builds up a fixed length concat_vector
+  // of the component shuffle_vectors.  We eagerly lower to scalable here
+  // to avoid DAG combining it back to a large shuffle_vector again.
+  V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget);
+  V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget);
+  for (unsigned DstVecIdx = 0 ; DstVecIdx < OutMasks.size(); DstVecIdx++) {
+    auto &[SrcVecIdx, SrcSubMask] = OutMasks[DstVecIdx];
+    if (SrcVecIdx == -1)
+      continue;
+    unsigned ExtractIdx = (SrcVecIdx % VRegsPerSrc) * NumOpElts;
+    SDValue SrcVec = (unsigned)SrcVecIdx > VRegsPerSrc ? V2 : V1;
+    SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, M1VT, SrcVec,
+                                 DAG.getVectorIdxConstant(ExtractIdx, DL));
+    SubVec = convertFromScalableVector(OneRegVT, SubVec, DAG, Subtarget);
+    SubVec = DAG.getVectorShuffle(OneRegVT, DL, SubVec, SubVec, SrcSubMask);
+    SubVec = convertToScalableVector(M1VT, SubVec, DAG, Subtarget);
+    unsigned InsertIdx = DstVecIdx * NumOpElts;
+    Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, ContainerVT, Vec, SubVec,
+                      DAG.getVectorIdxConstant(InsertIdx, DL));
+  }
+  return convertFromScalableVector(VT, Vec, DAG, Subtarget);
+}
+
 static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
                                    const RISCVSubtarget &Subtarget) {
   SDValue V1 = Op.getOperand(0);
@@ -4759,6 +4838,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG,
     }
   }

+  // For exact VLEN m2 or greater, try to split to m1 operations if we
+  // can split cleanly.
+ if (SDValue V = lowerShuffleViaVRegSplitting(SVN, DAG, Subtarget)) + return V; + ArrayRef Mask = SVN->getMask(); if (SDValue V = diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index b922ecdb8a2c2..f53b51e05c572 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -16,14 +16,10 @@ define <4 x i64> @m2_splat_0(<4 x i64> %v1) vscale_range(2,2) { define <4 x i64> @m2_splat_in_chunks(<4 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_in_chunks: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8224 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vrgather.vi v11, v9, 0 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> ret <4 x i64> %res @@ -32,12 +28,12 @@ define <4 x i64> @m2_splat_in_chunks(<4 x i64> %v1) vscale_range(2,2) { define <8 x i64> @m4_splat_in_chunks(<8 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m4_splat_in_chunks: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, %hi(.LCPI2_0) -; CHECK-NEXT: addi a0, a0, %lo(.LCPI2_0) -; CHECK-NEXT: vl1re16.v v16, (a0) -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrgather.vi v12, v8, 0 +; CHECK-NEXT: vrgather.vi v13, v9, 0 +; CHECK-NEXT: vrgather.vi v14, v10, 0 +; CHECK-NEXT: vrgather.vi v15, v11, 1 +; CHECK-NEXT: vmv4r.v v8, v12 ; CHECK-NEXT: ret %res = shufflevector <8 x i64> %v1, <8 x i64> poison, <8 x i32> ret <8 x i64> %res @@ -47,14 +43,10 @@ define <8 x i64> @m4_splat_in_chunks(<8 x i64> %v1) vscale_range(2,2) { define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_with_tail: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 12320 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vmv1r.v v11, v9 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> ret <4 x i64> %res @@ -63,15 +55,12 @@ define <4 x i64> @m2_splat_with_tail(<4 x i64> %v1) vscale_range(2,2) { define <4 x i64> @m2_pair_swap_vl4(<4 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m2_pair_swap_vl4: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 8240 -; CHECK-NEXT: addi a0, a0, 1 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v11, v9, 1 +; CHECK-NEXT: vslideup.vi v11, v9, 1 +; CHECK-NEXT: vslidedown.vi v10, v8, 1 +; 
CHECK-NEXT: vslideup.vi v10, v8, 1 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> ret <4 x i64> %res @@ -107,14 +96,10 @@ define <8 x i32> @m2_pair_swap_vl8(<8 x i32> %v1) vscale_range(2,2) { define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_into_identity: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 12320 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.s.x v10, a0 -; CHECK-NEXT: vsetvli zero, zero, e16, mf2, ta, ma -; CHECK-NEXT: vsext.vf2 v12, v10 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vrgather.vi v10, v8, 0 +; CHECK-NEXT: vmv1r.v v11, v9 +; CHECK-NEXT: vmv2r.v v8, v10 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> ret <4 x i64> %res @@ -123,12 +108,7 @@ define <4 x i64> @m2_splat_into_identity(<4 x i64> %v1) vscale_range(2,2) { define <4 x i64> @m2_broadcast_i128(<4 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m2_broadcast_i128: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.x v12, a0 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma -; CHECK-NEXT: vrgatherei16.vv v10, v8, v12 -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmv1r.v v9, v8 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> poison, <4 x i32> ret <4 x i64> %res @@ -137,12 +117,9 @@ define <4 x i64> @m2_broadcast_i128(<4 x i64> %v1) vscale_range(2,2) { define <8 x i64> @m4_broadcast_i128(<8 x i64> %v1) vscale_range(2,2) { ; CHECK-LABEL: m4_broadcast_i128: ; CHECK: # %bb.0: -; CHECK-NEXT: lui a0, 16 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vmv.v.x v16, a0 -; CHECK-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; CHECK-NEXT: vrgatherei16.vv v12, v8, v16 -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vmv1r.v v9, v8 +; CHECK-NEXT: vmv1r.v v10, v8 +; CHECK-NEXT: vmv1r.v v11, v8 ; CHECK-NEXT: ret %res = shufflevector <8 x i64> %v1, <8 x i64> poison, <8 x i32> ret <8 x i64> %res @@ -152,13 +129,10 @@ define <8 x i64> @m4_broadcast_i128(<8 x i64> %v1) vscale_range(2,2) { define <4 x i64> @m2_splat_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_two_source: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vrgather.vi v12, v8, 0 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vrgather.vi v12, v10, 3, v0.t -; CHECK-NEXT: vmv.v.v v8, v12 +; CHECK-NEXT: vrgather.vi v13, v11, 1 +; CHECK-NEXT: vmv2r.v v8, v12 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> ret <4 x i64> %res @@ -167,15 +141,9 @@ define <4 x i64> @m2_splat_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range define <4 x i64> @m2_splat_into_identity_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_into_identity_two_source: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma -; CHECK-NEXT: vrgather.vi v12, v8, 0 -; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma -; CHECK-NEXT: vmv.v.i v0, 12 -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v10, v8, v0.t -; CHECK-NEXT: 
vmv.v.v v8, v12
+; CHECK-NEXT:    vsetivli zero, 2, e64, m1, ta, ma
+; CHECK-NEXT:    vrgather.vi v10, v8, 0
+; CHECK-NEXT:    vmv2r.v v8, v10
 ; CHECK-NEXT:    ret
   %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> 
   ret <4 x i64> %res

From 50d33c62ad8786400a712b01150f6decaf070782 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Martin=20Storsj=C3=B6?= 
Date: Tue, 23 Jan 2024 20:37:03 +0200
Subject: [PATCH 674/843] [LLD] [COFF] Fix crashes for cfguard with undefined weak symbols (#79063)

When marking symbols as having their address taken, we can have the
situation where the address of a weak symbol is taken. If there's no
strong definition of the symbol, the symbol ends up as an absolute
symbol with the value null. In those cases, we don't have any Chunk.
Skip such symbols from the cfguard tables.

This fixes https://github.com/llvm/llvm-project/issues/78619.

---
 lld/COFF/Writer.cpp                |  2 ++
 lld/test/COFF/cfguard-weak-undef.s | 27 +++++++++++++++++++++++++++
 2 files changed, 29 insertions(+)
 create mode 100644 lld/test/COFF/cfguard-weak-undef.s

diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp
index 2e34a6c5cfa2c..9c20bbb83d86d 100644
--- a/lld/COFF/Writer.cpp
+++ b/lld/COFF/Writer.cpp
@@ -1802,6 +1802,8 @@ void Writer::createSEHTable() {
 // symbol's offset into that Chunk.
 static void addSymbolToRVASet(SymbolRVASet &rvaSet, Defined *s) {
   Chunk *c = s->getChunk();
+  if (!c)
+    return;
   if (auto *sc = dyn_cast(c))
     c = sc->repl; // Look through ICF replacement.
   uint32_t off = s->getRVA() - (c ? c->getRVA() : 0);
diff --git a/lld/test/COFF/cfguard-weak-undef.s b/lld/test/COFF/cfguard-weak-undef.s
new file mode 100644
index 0000000000000..fd4121ac27dfc
--- /dev/null
+++ b/lld/test/COFF/cfguard-weak-undef.s
@@ -0,0 +1,27 @@
+# REQUIRES: x86
+# RUN: llvm-mc -triple=x86_64-windows-gnu -filetype=obj -o %t.obj %s
+# RUN: lld-link %t.obj /out:%t.exe /entry:entry /subsystem:console /guard:cf
+
+	.def	@feat.00;
+	.scl	3;
+	.type	0;
+	.endef
+	.globl	@feat.00
+.set @feat.00, 2048
+
+	.globl	entry
+entry:
+	retq
+
+	.data
+	.globl	funcs
+funcs:
+	.quad	weakfunc
+
+	.section	.gfids$y,"dr"
+	.symidx	weakfunc
+	.section	.giats$y,"dr"
+	.section	.gljmp$y,"dr"
+	.weak	weakfunc
+	.addrsig
+	.addrsig_sym weakfunc

From bdc41106ee48dce59c500c9a3957af947f30c8c3 Mon Sep 17 00:00:00 2001
From: Philip Reames 
Date: Tue, 23 Jan 2024 10:49:55 -0800
Subject: [PATCH 675/843] [RISCV] Recurse on first operand of two operand shuffles (#79180)

This is the first step towards an alternate shuffle lowering design for
the general two vector argument case. The goal is to leverage the
existing lowering for single vector permutes to avoid as many of the
vrgathers as possible - even if we do still need one for the second
operand.

This patch handles only the first argument, and is arguably a slightly
weird half-step. However, the test changes from the full two argument
recurse patch are a lot harder to reason about. Taking this half step
gives much more easily reviewable changes, and is thus worthwhile. I
intend to post the patch for the second argument once this has landed.
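To make the intended decomposition concrete, here is a hand-written
sketch (illustrative names only, not code from this patch): the
two-source mask is split into a single-source permute mask for the
first operand plus a record of which lanes still have to be blended in
from the second operand by the masked vrgather.

  // Sketch: split a two-source shuffle mask into a single-source
  // permute over V1 and a per-lane "take it from V2" select mask.
  #include <vector>

  struct SplitMasks {
    std::vector<int> LHSMask;  // gather indices into V1; -1 = undef lane
    std::vector<bool> FromRHS; // lanes the masked vrgather fills from V2
  };

  SplitMasks splitShuffleMask(const std::vector<int> &Mask, int NumElts) {
    SplitMasks Out;
    for (int Idx : Mask) {
      if (Idx >= 0 && Idx < NumElts) {
        Out.LHSMask.push_back(Idx);  // pure V1 permute lane
        Out.FromRHS.push_back(false);
      } else {
        Out.LHSMask.push_back(-1);   // leave undef in the V1 permute
        Out.FromRHS.push_back(Idx >= NumElts); // undef lanes stay undef
      }
    }
    return Out;
  }

The single-source half can then be handed back to the existing
one-operand lowering, and only the FromRHS lanes require the second
vrgather.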
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 92 +++-- .../RISCV/rvv/fixed-vectors-fp-interleave.ll | 41 +- .../RISCV/rvv/fixed-vectors-int-interleave.ll | 63 ++- .../RISCV/rvv/fixed-vectors-int-shuffles.ll | 43 +- .../rvv/fixed-vectors-interleaved-access.ll | 387 +++++++++--------- .../rvv/fixed-vectors-shuffle-transpose.ll | 128 +++--- 6 files changed, 347 insertions(+), 407 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index ae1ebcb51c065..e39888637062c 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -5033,56 +5033,60 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, MVT IndexContainerVT = ContainerVT.changeVectorElementType(IndexVT.getScalarType()); - SDValue Gather; - // TODO: This doesn't trigger for i64 vectors on RV32, since there we - // encounter a bitcasted BUILD_VECTOR with low/high i32 values. - if (SDValue SplatValue = DAG.getSplatValue(V1, /*LegalTypes*/ true)) { - Gather = lowerScalarSplat(SDValue(), SplatValue, VL, ContainerVT, DL, DAG, - Subtarget); - } else { + // Base case for the recursion just below - handle the worst case + // single source permutation. Note that all the splat variants + // are handled above. + if (V2.isUndef()) { V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); - // If only one index is used, we can use a "splat" vrgather. - // TODO: We can splat the most-common index and fix-up any stragglers, if - // that's beneficial. - if (LHSIndexCounts.size() == 1) { - int SplatIndex = LHSIndexCounts.begin()->getFirst(); - Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V1, - DAG.getConstant(SplatIndex, DL, XLenVT), - DAG.getUNDEF(ContainerVT), TrueMask, VL); - } else { - SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS); - LHSIndices = - convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget); - - Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices, - DAG.getUNDEF(ContainerVT), TrueMask, VL); + SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS); + LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG, + Subtarget); + SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices, + DAG.getUNDEF(ContainerVT), TrueMask, VL); + return convertFromScalableVector(VT, Gather, DAG, Subtarget); + } + + // Translate the gather index we computed above (and possibly swapped) + // back to a shuffle mask. This step should disappear once we complete + // the migration to recursive design. + SmallVector ShuffleMaskLHS; + ShuffleMaskLHS.reserve(GatherIndicesLHS.size()); + for (SDValue GatherIndex : GatherIndicesLHS) { + if (GatherIndex.isUndef()) { + ShuffleMaskLHS.push_back(-1); + continue; } + auto *IdxC = cast(GatherIndex); + ShuffleMaskLHS.push_back(IdxC->getZExtValue()); } - // If a second vector operand is used by this shuffle, blend it in with an - // additional vrgather. - if (!V2.isUndef()) { - V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget); + // Recursively invoke lowering for the LHS as if there were no RHS. + // This allows us to leverage all of our single source permute tricks. 
+ SDValue Gather = + DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS); + Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget); - MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1); - SelectMask = - convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget); + // Blend in second vector source with an additional vrgather. + V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget); - // If only one index is used, we can use a "splat" vrgather. - // TODO: We can splat the most-common index and fix-up any stragglers, if - // that's beneficial. - if (RHSIndexCounts.size() == 1) { - int SplatIndex = RHSIndexCounts.begin()->getFirst(); - Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2, - DAG.getConstant(SplatIndex, DL, XLenVT), Gather, - SelectMask, VL); - } else { - SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS); - RHSIndices = - convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget); - Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather, - SelectMask, VL); - } + MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1); + SelectMask = + convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget); + + // If only one index is used, we can use a "splat" vrgather. + // TODO: We can splat the most-common index and fix-up any stragglers, if + // that's beneficial. + if (RHSIndexCounts.size() == 1) { + int SplatIndex = RHSIndexCounts.begin()->getFirst(); + Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2, + DAG.getConstant(SplatIndex, DL, XLenVT), Gather, + SelectMask, VL); + } else { + SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS); + RHSIndices = + convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget); + Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather, + SelectMask, VL); } return convertFromScalableVector(VT, Gather, DAG, Subtarget); diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll index 799aebcaa6302..dab530751ef96 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -238,39 +238,26 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) { define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { ; V128-LABEL: interleave_v32f32: ; V128: # %bb.0: -; V128-NEXT: addi sp, sp, -16 -; V128-NEXT: .cfi_def_cfa_offset 16 -; V128-NEXT: csrr a0, vlenb -; V128-NEXT: slli a0, a0, 2 -; V128-NEXT: sub sp, sp, a0 -; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; V128-NEXT: lui a0, %hi(.LCPI10_0) -; V128-NEXT: addi a0, a0, %lo(.LCPI10_0) -; V128-NEXT: li a1, 32 -; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; V128-NEXT: vle16.v v4, (a0) -; V128-NEXT: lui a0, %hi(.LCPI10_1) -; V128-NEXT: addi a0, a0, %lo(.LCPI10_1) -; V128-NEXT: vle16.v v24, (a0) -; V128-NEXT: addi a0, sp, 16 -; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill -; V128-NEXT: lui a0, 699051 -; V128-NEXT: addi a0, a0, -1366 -; V128-NEXT: vmv.s.x v0, a0 -; V128-NEXT: vrgatherei16.vv v24, v8, v4 -; V128-NEXT: addi a0, sp, 16 -; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload +; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; V128-NEXT: vslidedown.vi v0, v8, 16 +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; V128-NEXT: 
vwaddu.vv v24, v0, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v24, a0, v8 +; V128-NEXT: lui a1, %hi(.LCPI10_0) +; V128-NEXT: addi a1, a1, %lo(.LCPI10_0) +; V128-NEXT: li a2, 32 +; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; V128-NEXT: vle16.v v12, (a1) +; V128-NEXT: lui a1, 699051 +; V128-NEXT: addi a1, a1, -1366 +; V128-NEXT: vmv.s.x v0, a1 ; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v0, v8, v16 -; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v0, a0, v16 ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 -; V128-NEXT: csrr a0, vlenb -; V128-NEXT: slli a0, a0, 2 -; V128-NEXT: add sp, sp, a0 -; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32f32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll index e1bd16649eede..9e21cc9e3d624 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -188,24 +188,30 @@ define <4 x i32> @interleave_v4i32_offset_2(<4 x i32> %x, <4 x i32> %y) { define <4 x i32> @interleave_v4i32_offset_1(<4 x i32> %x, <4 x i32> %y) { ; V128-LABEL: interleave_v4i32_offset_1: ; V128: # %bb.0: +; V128-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V128-NEXT: vwaddu.vv v10, v8, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v10, a0, v8 ; V128-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; V128-NEXT: vid.v v10 -; V128-NEXT: vsrl.vi v11, v10, 1 -; V128-NEXT: vrgather.vv v10, v8, v11 +; V128-NEXT: vid.v v8 +; V128-NEXT: vsrl.vi v8, v8, 1 ; V128-NEXT: vmv.v.i v0, 10 -; V128-NEXT: vadd.vi v8, v11, 1 +; V128-NEXT: vadd.vi v8, v8, 1 ; V128-NEXT: vrgather.vv v10, v9, v8, v0.t ; V128-NEXT: vmv.v.v v8, v10 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v4i32_offset_1: ; V512: # %bb.0: +; V512-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; V512-NEXT: vwaddu.vv v10, v8, v8 +; V512-NEXT: li a0, -1 +; V512-NEXT: vwmaccu.vx v10, a0, v8 ; V512-NEXT: vsetivli zero, 4, e32, mf2, ta, mu -; V512-NEXT: vid.v v10 -; V512-NEXT: vsrl.vi v11, v10, 1 -; V512-NEXT: vrgather.vv v10, v8, v11 +; V512-NEXT: vid.v v8 +; V512-NEXT: vsrl.vi v8, v8, 1 ; V512-NEXT: vmv.v.i v0, 10 -; V512-NEXT: vadd.vi v8, v11, 1 +; V512-NEXT: vadd.vi v8, v8, 1 ; V512-NEXT: vrgather.vv v10, v9, v8, v0.t ; V512-NEXT: vmv1r.v v8, v10 ; V512-NEXT: ret @@ -397,39 +403,26 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) { define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; V128-LABEL: interleave_v32i32: ; V128: # %bb.0: -; V128-NEXT: addi sp, sp, -16 -; V128-NEXT: .cfi_def_cfa_offset 16 -; V128-NEXT: csrr a0, vlenb -; V128-NEXT: slli a0, a0, 2 -; V128-NEXT: sub sp, sp, a0 -; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x04, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 4 * vlenb -; V128-NEXT: lui a0, %hi(.LCPI17_0) -; V128-NEXT: addi a0, a0, %lo(.LCPI17_0) -; V128-NEXT: li a1, 32 -; V128-NEXT: vsetvli zero, a1, e32, m8, ta, mu -; V128-NEXT: vle16.v v4, (a0) -; V128-NEXT: lui a0, %hi(.LCPI17_1) -; V128-NEXT: addi a0, a0, %lo(.LCPI17_1) -; V128-NEXT: vle16.v v24, (a0) -; V128-NEXT: addi a0, sp, 16 -; V128-NEXT: vs4r.v v24, (a0) # Unknown-size Folded Spill -; V128-NEXT: lui a0, 699051 -; V128-NEXT: addi a0, a0, -1366 -; V128-NEXT: vmv.s.x v0, a0 -; V128-NEXT: vrgatherei16.vv v24, v8, v4 -; V128-NEXT: addi a0, sp, 16 -; V128-NEXT: vl4r.v v12, (a0) # Unknown-size Folded Reload +; V128-NEXT: 
vsetivli zero, 16, e32, m8, ta, ma +; V128-NEXT: vslidedown.vi v0, v8, 16 +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; V128-NEXT: vwaddu.vv v24, v0, v8 +; V128-NEXT: li a0, -1 +; V128-NEXT: vwmaccu.vx v24, a0, v8 +; V128-NEXT: lui a1, %hi(.LCPI17_0) +; V128-NEXT: addi a1, a1, %lo(.LCPI17_0) +; V128-NEXT: li a2, 32 +; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu +; V128-NEXT: vle16.v v12, (a1) +; V128-NEXT: lui a1, 699051 +; V128-NEXT: addi a1, a1, -1366 +; V128-NEXT: vmv.s.x v0, a1 ; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v0, v8, v16 -; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v0, a0, v16 ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 -; V128-NEXT: csrr a0, vlenb -; V128-NEXT: slli a0, a0, 2 -; V128-NEXT: add sp, sp, a0 -; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32i32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index a56a81f5f793b..a26a87a1f3c13 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -612,13 +612,11 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: concat_4xi8_start_undef_at_start: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 224 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -4 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -4 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -628,13 +626,11 @@ define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_start_into_end_non_contiguous: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 144 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -4 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -4 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -675,13 +671,11 @@ define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_slidedown: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: li a0, 195 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -692,14 +686,12 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 2 -; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 ; CHECK-NEXT: li a0, 234 ; CHECK-NEXT: vmv.s.x 
v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -710,16 +702,13 @@ define <8 x i8> @unmergable(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: unmergable: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vi v11, v10, 2 ; CHECK-NEXT: lui a0, %hi(.LCPI46_0) ; CHECK-NEXT: addi a0, a0, %lo(.LCPI46_0) -; CHECK-NEXT: vle8.v v12, (a0) +; CHECK-NEXT: vle8.v v10, (a0) ; CHECK-NEXT: li a0, 234 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v11 -; CHECK-NEXT: vrgather.vv v10, v9, v12, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vslidedown.vi v8, v8, 2 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index eeb8e517d01d2..f889041647b23 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -8,23 +8,51 @@ ; FIXME: This should be widened to a vlseg2 of <4 x i32> with VL set to 3 define {<3 x i32>, <3 x i32>} @load_factor2_v3(ptr %ptr) { -; CHECK-LABEL: load_factor2_v3: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 6, e32, m2, ta, ma -; CHECK-NEXT: vle32.v v10, (a0) -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; CHECK-NEXT: vid.v v8 -; CHECK-NEXT: vadd.vv v9, v8, v8 -; CHECK-NEXT: vrgather.vv v8, v10, v9 -; CHECK-NEXT: vmv.v.i v0, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vslidedown.vi v12, v10, 4 -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vrgather.vi v8, v12, 0, v0.t -; CHECK-NEXT: vadd.vi v11, v9, 1 -; CHECK-NEXT: vrgather.vv v9, v10, v11 -; CHECK-NEXT: vrgather.vi v9, v12, 1, v0.t -; CHECK-NEXT: ret +; RV32-LABEL: load_factor2_v3: +; RV32: # %bb.0: +; RV32-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV32-NEXT: vle32.v v10, (a0) +; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV32-NEXT: vslidedown.vi v9, v10, 2 +; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV32-NEXT: vwaddu.vv v8, v10, v9 +; RV32-NEXT: li a0, -1 +; RV32-NEXT: vwmaccu.vx v8, a0, v9 +; RV32-NEXT: vmv.v.i v0, 4 +; RV32-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV32-NEXT: vslidedown.vi v12, v10, 4 +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV32-NEXT: vrgather.vi v8, v12, 0, v0.t +; RV32-NEXT: vid.v v9 +; RV32-NEXT: vadd.vv v9, v9, v9 +; RV32-NEXT: vadd.vi v11, v9, 1 +; RV32-NEXT: vrgather.vv v9, v10, v11 +; RV32-NEXT: vrgather.vi v9, v12, 1, v0.t +; RV32-NEXT: ret +; +; RV64-LABEL: load_factor2_v3: +; RV64: # %bb.0: +; RV64-NEXT: vsetivli zero, 6, e32, m2, ta, ma +; RV64-NEXT: vle32.v v10, (a0) +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV64-NEXT: vid.v v8 +; RV64-NEXT: vadd.vv v8, v8, v8 +; RV64-NEXT: vadd.vi v8, v8, 1 +; RV64-NEXT: vrgather.vv v9, v10, v8 +; RV64-NEXT: vmv.v.i v0, 4 +; RV64-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; RV64-NEXT: vslidedown.vi v12, v10, 4 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vrgather.vi v9, v12, 1, v0.t +; RV64-NEXT: vsetivli zero, 2, e32, m1, ta, ma +; RV64-NEXT: vslidedown.vi v11, v10, 2 +; RV64-NEXT: vsetivli zero, 2, e32, mf2, ta, ma +; RV64-NEXT: vwaddu.vv 
v8, v10, v11 +; RV64-NEXT: li a0, -1 +; RV64-NEXT: vwmaccu.vx v8, a0, v11 +; RV64-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; RV64-NEXT: vrgather.vi v8, v12, 0, v0.t +; RV64-NEXT: ret %interleaved.vec = load <6 x i32>, ptr %ptr %v0 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> %v1 = shufflevector <6 x i32> %interleaved.vec, <6 x i32> poison, <3 x i32> @@ -131,163 +159,142 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 62 +; RV32-NEXT: li a3, 58 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3e, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 62 * vlenb -; RV32-NEXT: addi a3, a1, 128 -; RV32-NEXT: addi a4, a1, 256 +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 58 * vlenb +; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v16, (a4) +; RV32-NEXT: vle32.v v8, (a3) +; RV32-NEXT: csrr a3, vlenb +; RV32-NEXT: li a4, 25 +; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: add a3, sp, a3 +; RV32-NEXT: addi a3, a3, 16 +; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: addi a3, a1, 128 +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vslideup.vi v16, v8, 4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 29 +; RV32-NEXT: li a5, 12 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: vid.v v10 +; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vid.v v20 +; RV32-NEXT: vadd.vi v4, v20, -10 +; RV32-NEXT: vmv.v.v v2, v20 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 3 +; RV32-NEXT: slli a5, a4, 4 ; RV32-NEXT: add a4, a5, a4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs2r.v v10, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vadd.vi v8, v10, -4 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 13 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma -; RV32-NEXT: vrgatherei16.vv v12, v16, v8 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 21 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v12, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v10, -10 +; RV32-NEXT: vs2r.v v20, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 12 -; RV32-NEXT: vmv.s.x v0, a4 +; RV32-NEXT: vmv.s.x v1, a4 +; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; RV32-NEXT: vslidedown.vi v8, v8, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 3 +; RV32-NEXT: slli a5, a4, 5 +; RV32-NEXT: add a4, a5, a4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs1r.v v0, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v16, v16, 16 +; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a4, vlenb -; 
RV32-NEXT: li a5, 45 -; RV32-NEXT: mul a4, a4, a5 +; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t +; RV32-NEXT: vs1r.v v1, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vrgatherei16.vv v16, v8, v4, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 25 +; RV32-NEXT: li a5, 21 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v12, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, %hi(.LCPI6_0) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; RV32-NEXT: lui a5, %hi(.LCPI6_1) -; RV32-NEXT: addi a5, a5, %lo(.LCPI6_1) -; RV32-NEXT: lui a6, 1 ; RV32-NEXT: vle16.v v8, (a4) -; RV32-NEXT: addi a4, sp, 16 -; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vle16.v v8, (a5) ; RV32-NEXT: csrr a4, vlenb ; RV32-NEXT: slli a4, a4, 2 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: lui a4, %hi(.LCPI6_1) +; RV32-NEXT: addi a4, a4, %lo(.LCPI6_1) +; RV32-NEXT: lui a5, 1 +; RV32-NEXT: vle16.v v8, (a4) +; RV32-NEXT: addi a4, sp, 16 +; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 37 +; RV32-NEXT: li a4, 49 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, a6, -64 +; RV32-NEXT: addi a1, a5, -64 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v4 +; RV32-NEXT: addi a1, sp, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 21 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e16, m2, ta, ma -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vi v8, v10, -2 -; 
RV32-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v12, v8, 2 ; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v10, -8 -; RV32-NEXT: vmv2r.v v30, v10 +; RV32-NEXT: vadd.vi v8, v2, -8 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v28, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vmv1r.v v0, v28 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v24, v12 +; RV32-NEXT: vmv.v.v v20, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu @@ -301,166 +308,165 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v4, v0.t +; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v8 +; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v4, v16, v8 +; RV32-NEXT: vrgatherei16.vv v24, v16, v8 ; RV32-NEXT: vsetvli zero, zero, 
e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v30, -6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs2r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vi v8, v8, -6 ; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; RV32-NEXT: vmv1r.v v0, v28 -; RV32-NEXT: vmv1r.v v2, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v4, v16, v8, v0.t +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) -; RV32-NEXT: vle16.v v20, (a1) -; RV32-NEXT: vle16.v v8, (a3) -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: vle16.v v28, (a3) ; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v1, a1 +; RV32-NEXT: vmv.s.x v20, a1 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vs1r.v v20, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v8, v20 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v0, v16 +; RV32-NEXT: vmv1r.v v0, v20 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v4, v24 +; RV32-NEXT: vmv.v.v v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 3 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_7) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8 -; RV32-NEXT: vmv1r.v v0, v2 +; 
RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vadd.vi v8, v8, -4 +; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: add a1, sp, a1 +; RV32-NEXT: addi a1, a1, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v4, v12 +; RV32-NEXT: vmv.v.v v24, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_8) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_8) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_9) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_9) ; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v20, (a3) +; RV32-NEXT: vle16.v v28, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16 -; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v0, v16 +; RV32-NEXT: addi a1, sp, 16 +; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v20, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v4, v8 +; RV32-NEXT: vmv.v.v v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 4 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) @@ -468,25 +474,20 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: lui a1, 15 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v20, v16, 6 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v10, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v20, v16, v10 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: 
slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -501,13 +502,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: li a1, 1008 ; RV32-NEXT: vmv.s.x v28, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 29 +; RV32-NEXT: li a3, 25 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v28, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 37 +; RV32-NEXT: li a3, 49 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -515,7 +516,7 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: vmv1r.v v0, v28 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 53 +; RV32-NEXT: li a3, 41 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -528,19 +529,19 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 12 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 45 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a3, a1, 5 +; RV32-NEXT: add a1, a3, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload @@ -553,33 +554,33 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vle16.v v24, (a1) ; RV32-NEXT: vle16.v v8, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 45 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a2, a1, 5 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 37 +; RV32-NEXT: li a2, 49 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 29 +; RV32-NEXT: li a2, 25 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 53 +; RV32-NEXT: li a2, 41 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 45 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a2, a1, 5 +; RV32-NEXT: add a1, a2, a1 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload @@ -593,37 +594,35 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vse32.v v20, (a1) ; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 13 -; RV32-NEXT: mul a2, a2, a3 +; RV32-NEXT: slli a3, a2, 4 +; RV32-NEXT: add a2, 
a3, a2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 3 -; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: slli a2, a2, 2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 4 -; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 25 +; RV32-NEXT: li a2, 21 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 62 +; RV32-NEXT: li a1, 58 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll index d0777962a7565..a34fa9502d93b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll @@ -8,13 +8,11 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn1.v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -24,13 +22,11 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn2.v8i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -40,16 +36,14 @@ define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: trn1.v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 -; CHECK-NEXT: vadd.vi v8, v11, -1 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -59,16 +53,14 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: trn2.v16i8: ; CHECK: # %bb.0: ; CHECK-NEXT: 
vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 -; CHECK-NEXT: vrgather.vv v10, v8, v12 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -78,12 +70,10 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn1.v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -93,12 +83,10 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn2.v4i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -108,13 +96,11 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn1.v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -124,13 +110,11 @@ define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn2.v8i16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -163,12 +147,10 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn1.v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 
x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -178,12 +160,10 @@ define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn2.v4i32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -239,12 +219,10 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn1.v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -254,12 +232,10 @@ define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn2.v4f32: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -292,12 +268,10 @@ define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn1.v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -307,12 +281,10 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn2.v4f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -322,13 +294,11 @@ define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: trn1.v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vrgather.vv v10, v8, v11 +; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v8, v11, -1 -; CHECK-NEXT: vrgather.vv v10, v9, v8, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vadd.vi v10, v10, -1 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; 
CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 @@ -338,13 +308,11 @@ define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: trn2.v8f16: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v11 -; CHECK-NEXT: vadd.vi v12, v11, 1 +; CHECK-NEXT: vslidedown.vi v8, v8, 1 ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vrgather.vv v10, v8, v12 -; CHECK-NEXT: vrgather.vv v10, v9, v11, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 From f05dd29ceea29080d18ec31414a7ae2edf86c39d Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 23 Jan 2024 10:57:54 -0800 Subject: [PATCH 676/843] [RISCV] Regenerate autogen test to remove spurious diff --- llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll | 3 --- 1 file changed, 3 deletions(-) diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll index 4852850f234ba..924094c00fe7b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-buildvec.ll @@ -1394,6 +1394,3 @@ define <2 x double> @vid_step2_v2f64() { ; CHECK-NEXT: ret ret <2 x double> } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; RV32: {{.*}} -; RV64: {{.*}} From 3c20e25b0c51c67e35d028ba0d1d5f1dd5e206bb Mon Sep 17 00:00:00 2001 From: AtariDreams <83477269+AtariDreams@users.noreply.github.com> Date: Tue, 23 Jan 2024 14:13:55 -0500 Subject: [PATCH 677/843] [NFC] Size and element numbers are often swapped when calling calloc (#79081) gcc-14 will now throw a warning if size and elements are swapped. 
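To illustrate the pattern being fixed (an example written for this
note, not taken from the diff; the exact GCC 14 flag name,
-Wcalloc-transposed-args, is an assumption here):

  #include <cstdlib>

  struct Data { int t[9]; };

  int main() {
    // Likely flagged by GCC 14: sizeof(...) passed as the first
    // (element-count) argument.
    Data *bad = static_cast<Data *>(std::calloc(sizeof(Data), 2));
    // Preferred order: element count first, element size second.
    Data *good = static_cast<Data *>(std::calloc(2, sizeof(Data)));
    std::free(bad);
    std::free(good);
    return 0;
  }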
--- clang/test/Analysis/malloc.mm | 6 +++--- clang/test/Analysis/uninit-vals.m | 6 +++--- clang/test/CodeGen/alloc-size.c | 4 ++-- compiler-rt/lib/profile/InstrProfilingFile.c | 4 ++-- compiler-rt/test/tsan/java_finalizer2.cpp | 2 +- 5 files changed, 11 insertions(+), 11 deletions(-) diff --git a/clang/test/Analysis/malloc.mm b/clang/test/Analysis/malloc.mm index 9c0f013c4df88..94a46d731090b 100644 --- a/clang/test/Analysis/malloc.mm +++ b/clang/test/Analysis/malloc.mm @@ -116,17 +116,17 @@ void testUseAfterFree() { } void testNoCopy() { - char *p = (char *)calloc(sizeof(int), 1); + char *p = (char *)calloc(1, sizeof(int)); CustomData *w = [CustomData somethingNoCopy:p]; // no-warning } void testFreeWhenDone() { - char *p = (char *)calloc(sizeof(int), 1); + char *p = (char *)calloc(1, sizeof(int)); CustomData *w = [CustomData something:p freeWhenDone:1]; // no-warning } void testFreeWhenDonePositive() { - char *p = (char *)calloc(sizeof(int), 1); + char *p = (char *)calloc(1, sizeof(int)); CustomData *w = [CustomData something:p freeWhenDone:0]; // expected-warning{{leak}} } diff --git a/clang/test/Analysis/uninit-vals.m b/clang/test/Analysis/uninit-vals.m index 9d18f0ef69b92..a6ec4fb74e128 100644 --- a/clang/test/Analysis/uninit-vals.m +++ b/clang/test/Analysis/uninit-vals.m @@ -158,7 +158,7 @@ Point makePoint(float x, float y) { } void PR14765_test(void) { - Circle *testObj = calloc(sizeof(Circle), 1); + Circle *testObj = calloc(1, sizeof(Circle)); clang_analyzer_eval(testObj->size == 0); // expected-warning{{TRUE}} // expected-note@-1{{TRUE}} @@ -207,7 +207,7 @@ IntPoint makeIntPoint(int x, int y) { } void PR14765_test_int(void) { - IntCircle *testObj = calloc(sizeof(IntCircle), 1); + IntCircle *testObj = calloc(1, sizeof(IntCircle)); clang_analyzer_eval(testObj->size == 0); // expected-warning{{TRUE}} // expected-note@-1{{TRUE}} @@ -311,7 +311,7 @@ void testLargeStructsNotCopiedPerField(void) { } void testSmallStructInLargerStruct(void) { - IntCircle2D *testObj = calloc(sizeof(IntCircle2D), 1); + IntCircle2D *testObj = calloc(1, sizeof(IntCircle2D)); clang_analyzer_eval(testObj->size == 0); // expected-warning{{TRUE}} // expected-note@-1{{TRUE}} diff --git a/clang/test/CodeGen/alloc-size.c b/clang/test/CodeGen/alloc-size.c index 370f61058c493..bbac7965521b6 100644 --- a/clang/test/CodeGen/alloc-size.c +++ b/clang/test/CodeGen/alloc-size.c @@ -137,7 +137,7 @@ void test5(void) { // CHECK: store i32 36 gi = OBJECT_SIZE_BUILTIN(&data->t[1], 3); - struct Data *const arr = my_calloc(sizeof(*data), 2); + struct Data *const arr = my_calloc(2, sizeof(*data)); // CHECK: store i32 96 gi = OBJECT_SIZE_BUILTIN(arr, 0); // CHECK: store i32 96 @@ -171,7 +171,7 @@ void test6(void) { // CHECK: store i32 11 gi = OBJECT_SIZE_BUILTIN(data->end, 3); - struct Data *const arr = my_calloc(sizeof(*arr) + 5, 3); + struct Data *const arr = my_calloc(3, sizeof(*arr) + 5); // AFAICT, GCC treats malloc and calloc identically. So, we should do the // same. 
// diff --git a/compiler-rt/lib/profile/InstrProfilingFile.c b/compiler-rt/lib/profile/InstrProfilingFile.c index e72a2ba86f546..867ae73f0d3b2 100644 --- a/compiler-rt/lib/profile/InstrProfilingFile.c +++ b/compiler-rt/lib/profile/InstrProfilingFile.c @@ -335,10 +335,10 @@ static void initFileWriter(ProfDataWriter *This, FILE *File) { COMPILER_RT_VISIBILITY ProfBufferIO * lprofCreateBufferIOInternal(void *File, uint32_t BufferSz) { FreeHook = &free; - DynamicBufferIOBuffer = (uint8_t *)calloc(BufferSz, 1); + DynamicBufferIOBuffer = (uint8_t *)calloc(1, BufferSz); VPBufferSize = BufferSz; ProfDataWriter *fileWriter = - (ProfDataWriter *)calloc(sizeof(ProfDataWriter), 1); + (ProfDataWriter *)calloc(1, sizeof(ProfDataWriter)); initFileWriter(fileWriter, File); ProfBufferIO *IO = lprofCreateBufferIO(fileWriter); IO->OwnFileWriter = 1; diff --git a/compiler-rt/test/tsan/java_finalizer2.cpp b/compiler-rt/test/tsan/java_finalizer2.cpp index 87528900541a8..0d677be2ba1a0 100644 --- a/compiler-rt/test/tsan/java_finalizer2.cpp +++ b/compiler-rt/test/tsan/java_finalizer2.cpp @@ -51,7 +51,7 @@ void *Ballast(void *p) { } int main() { - Heap* heap = (Heap*)calloc(sizeof(Heap), 2) + 1; + Heap *heap = (Heap *)calloc(2, sizeof(Heap)) + 1; __tsan_java_init((jptr)heap, sizeof(*heap)); __tsan_java_alloc((jptr)heap, sizeof(*heap)); // Ballast threads merely make the bug a bit easier to trigger. From 17bc4497847eb0388bedc5c8483bf8a1cbdd3a42 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 23 Jan 2024 11:26:21 -0800 Subject: [PATCH 678/843] [ELF] Improve thin-archivecollision.ll --- lld/test/ELF/lto/thin-archivecollision.ll | 32 ++++++++++++----------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/lld/test/ELF/lto/thin-archivecollision.ll b/lld/test/ELF/lto/thin-archivecollision.ll index 47a0beef7a270..da3d08801e96f 100644 --- a/lld/test/ELF/lto/thin-archivecollision.ll +++ b/lld/test/ELF/lto/thin-archivecollision.ll @@ -1,28 +1,30 @@ ; REQUIRES: x86 -; RUN: opt -module-summary %s -o %t.o +; RUN: rm -rf %t && mkdir %t && cd %t +; RUN: mkdir d e +; RUN: opt -module-summary %s -o b.bc ; RUN: mkdir -p %t1 %t2 -; RUN: opt -module-summary %p/Inputs/thin1.ll -o %t1/t.coll.o -; RUN: opt -module-summary %p/Inputs/thin2.ll -o %t2/t.coll.o +; RUN: opt -module-summary %p/Inputs/thin1.ll -o d/coll.o +; RUN: opt -module-summary %p/Inputs/thin2.ll -o e/coll.o -; RUN: rm -f %t.a -; RUN: llvm-ar rcs %t.a %t1/t.coll.o %t2/t.coll.o -; RUN: ld.lld %t.o %t.a -o %t -; RUN: llvm-nm %t | FileCheck %s - -; Check without a archive symbol table -; RUN: rm -f %t.a -; RUN: llvm-ar rcS %t.a %t1/t.coll.o %t2/t.coll.o -; RUN: ld.lld %t.o %t.a -o %t -; RUN: llvm-nm %t | FileCheck %s +; RUN: llvm-ar rcS d/a.a d/coll.o e/coll.o +; RUN: ld.lld b.bc d/a.a -o out --save-temps +; RUN: llvm-nm out | FileCheck %s +; RUN: llvm-nm out2.lto.o | FileCheck %s --check-prefix=MOD2 +; RUN: llvm-nm out3.lto.o | FileCheck %s --check-prefix=MOD3 ; Check we handle this case correctly even in presence of --whole-archive. 
-; RUN: ld.lld %t.o --whole-archive %t.a -o %t -; RUN: llvm-nm %t | FileCheck %s +; RUN: rm out1.lto.o out2.lto.o out3.lto.o +; RUN: ld.lld b.bc --whole-archive d/a.a -o out --save-temps +; RUN: llvm-nm out | FileCheck %s +; RUN: ls out1.lto.o out2.lto.o out3.lto.o ; CHECK: T _start ; CHECK: T blah ; CHECK: T foo +; MOD2: T foo +; MOD3: T blah + target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-scei-ps4" From f6ced3579a8c7fe1ef93e25ecf521629afa928dd Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 23 Jan 2024 11:32:37 -0800 Subject: [PATCH 679/843] [CMake][Release] Add option for enabling PGO to release cache file. (#78823) The option is LLVM_RELEASE_ENABLE_PGO and it's turned on by default. --------- Co-authored-by: Petr Hosek --- clang/cmake/caches/Release.cmake | 73 +++++++++++++++++++++++------- llvm/utils/release/test-release.sh | 4 +- 2 files changed, 59 insertions(+), 18 deletions(-) diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake index a7b9a8d0e29f8..1ca9138b98073 100644 --- a/clang/cmake/caches/Release.cmake +++ b/clang/cmake/caches/Release.cmake @@ -4,27 +4,53 @@ # General Options set(LLVM_RELEASE_ENABLE_LTO THIN CACHE STRING "") +set(LLVM_RELEASE_ENABLE_PGO ON CACHE BOOL "") set(CMAKE_BUILD_TYPE RELEASE CACHE STRING "") # Stage 1 Bootstrap Setup set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "") -set(CLANG_BOOTSTRAP_TARGETS - clang - check-all - check-llvm - check-clang - test-suite - stage3 - stage3-clang - stage3-check-all - stage3-check-llvm - stage3-check-clang - stage3-install - stage3-test-suite CACHE STRING "") +if (LLVM_RELEASE_ENABLE_PGO) + set(CLANG_BOOTSTRAP_TARGETS + generate-profdata + stage2 + stage2-clang + stage2-distribution + stage2-install + stage2-install-distribution + stage2-install-distribution-toolchain + stage2-check-all + stage2-check-llvm + stage2-check-clang + stage2-test-suite CACHE STRING "") +else() + set(CLANG_BOOTSTRAP_TARGETS + clang + check-all + check-llvm + check-clang + test-suite + stage3 + stage3-clang + stage3-check-all + stage3-check-llvm + stage3-check-clang + stage3-install + stage3-test-suite CACHE STRING "") +endif() # Stage 1 Options -set(LLVM_ENABLE_PROJECTS "clang" CACHE STRING "") +set(STAGE1_PROJECTS "clang") +set(STAGE1_RUNTIMES "") + +if (LLVM_RELEASE_ENABLE_PGO) + list(APPEND STAGE1_PROJECTS "lld") + list(APPEND STAGE1_RUNTIMES "compiler-rt") +endif() + +set(LLVM_ENABLE_RUNTIMES ${STAGE1_RUNTIMES} CACHE STRING "") +set(LLVM_ENABLE_PROJECTS ${STAGE1_PROJECTS} CACHE STRING "") + set(LLVM_TARGETS_TO_BUILD Native CACHE STRING "") # Stage 2 Bootstrap Setup @@ -37,11 +63,26 @@ set(BOOTSTRAP_CLANG_BOOTSTRAP_TARGETS # Stage 2 Options set(STAGE2_PROJECTS "clang") -if (LLVM_RELEASE_ENABLE_LTO) +set(STAGE2_RUNTIMES "") + +if (LLVM_RELEASE_ENABLE_LTO OR LLVM_RELEASE_ENABLE_PGO) list(APPEND STAGE2_PROJECTS "lld") endif() + +if (LLVM_RELEASE_ENABLE_PGO) + set(BOOTSTRAP_LLVM_BUILD_INSTRUMENTED IR CACHE STRING "") + list(APPEND STAGE2_RUNTIMES "compiler-rt") + set(BOOTSTRAP_LLVM_ENABLE_LTO ${LLVM_RELEASE_ENABLE_LTO}) + if (LLVM_RELEASE_ENABLE_LTO) + set(BOOTSTRAP_LLVM_ENABLE_LLD ON CACHE BOOL "") + endif() +endif() + set(BOOTSTRAP_LLVM_ENABLE_PROJECTS ${STAGE2_PROJECTS} CACHE STRING "") -set(BOOTSTRAP_LLVM_TARGETS_TO_BUILD Native CACHE STRING "") +set(BOOTSTRAP_LLVM_ENABLE_RUNTIMES ${STAGE2_RUNTIMES} CACHE STRING "") +if (NOT LLVM_RELEASE_ENABLE_PGO) + set(BOOTSTRAP_LLVM_TARGETS_TO_BUILD Native CACHE STRING "") +endif() # Stage 3 Options 
set(BOOTSTRAP_BOOTSTRAP_LLVM_ENABLE_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind" CACHE STRING "") diff --git a/llvm/utils/release/test-release.sh b/llvm/utils/release/test-release.sh index 544d4bfdd799c..5b1945df47d24 100755 --- a/llvm/utils/release/test-release.sh +++ b/llvm/utils/release/test-release.sh @@ -359,11 +359,11 @@ function build_with_cmake_cache() { $ExtraConfigureFlags 2>&1 | tee $LogDir/llvm.configure-$Flavor.log - ${MAKE} $J_ARG $Verbose -C $CMakeBuildDir stage3-check-all \ + ${MAKE} $J_ARG $Verbose -C $CMakeBuildDir stage2-check-all \ 2>&1 | tee $LogDir/llvm.make-$Flavor.log > $redir DESTDIR="${InstallDir}" \ - ${MAKE} -C $CMakeBuildDir stage3-install \ + ${MAKE} -C $CMakeBuildDir stage2-install \ 2>&1 | tee $LogDir/llvm.install-$Flavor.log > $redir mkdir -p $BuildDir/Release From f7669ba3d9443bc95dd63fa25beea13e6265fdc5 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 23 Jan 2024 11:38:15 -0800 Subject: [PATCH 680/843] [ELF] --save-temps --lto-emit-asm: derive ELF/asm file names from bitcode file names Port COFF's https://reviews.llvm.org/D78221 and https://reviews.llvm.org/D137217 to ELF. For the in-process ThinLTO link, `ld.lld --save-temps a.o d/b.o -o out` will create ELF relocatable files `out.lto.a.o`/`d/out.lto.b.o` instead of `out1.lto.o`/`out2.lto.o`. Deriving the LTO-generated relocatable file name from bitcode file names helps debugging. The relocatable file name from the first regular LTO partition does not change: `out.lto.o`. The second, if present due to `--lto-partition=`, changes from `out1.lto.o` to `lto.1.o`. For an archive member, e.g. `d/a.a(coll.o at 8)`, the relocatable file is `d/out.lto.a.a(coll.o at 8).o`. `--lto-emit-asm` file names are changed similarly. `--lto-emit-asm -o out` now creates `out.lto.s` instead of `out`, therefore the `--lto-emit-asm -o -` idiom no longer works. However, I think this new behavior (which matches COFF) is better since keeping or removing `--lto-emit-asm` will dump different files, instead of overwriting the `-o` output file from an executable/shared object to an assembly file. 
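
Illustrative summary of the new names, derived from the updated tests
below (assuming inputs a.o and d/b.o with -o out):

  in-process ThinLTO backend outputs:  out.lto.a.o and d/out.lto.b.o
  regular LTO, first partition:        out.lto.o (unchanged)
  regular LTO, second partition:       out.lto.1.o (previously out1.lto.o)
  archive member d/a.a(coll.o at 8):    d/out.lto.a.a(coll.o at 8).o
  --lto-emit-asm:                      out.lto.s (previously out)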
Reviewers: rnk, igorkudrin, xur-llvm, teresajohnson, ZequanWu Reviewed By: teresajohnson Pull Request: https://github.com/llvm/llvm-project/pull/78835 --- lld/ELF/LTO.cpp | 78 +++++++++++++++------- lld/ELF/LTO.h | 4 +- lld/test/ELF/common-archive-lookup.s | 8 ++- lld/test/ELF/lto/cache.ll | 6 +- lld/test/ELF/lto/comdat-mixed-archive.test | 4 +- lld/test/ELF/lto/emit-asm.ll | 13 ++-- lld/test/ELF/lto/exclude-libs-libcall.ll | 2 +- lld/test/ELF/lto/obj-path.ll | 4 +- lld/test/ELF/lto/parallel-internalize.ll | 2 +- lld/test/ELF/lto/parallel.ll | 2 +- lld/test/ELF/lto/pseudo-probe-lto.ll | 3 +- lld/test/ELF/lto/save-temps-eq.ll | 10 +-- lld/test/ELF/lto/thin-archivecollision.ll | 12 ++-- lld/test/ELF/lto/thinlto.ll | 48 ++++++------- 14 files changed, 116 insertions(+), 80 deletions(-) diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp index 728e15a6976ab..c39c6e6ea74ba 100644 --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -12,6 +12,7 @@ #include "SymbolTable.h" #include "Symbols.h" #include "lld/Common/Args.h" +#include "lld/Common/CommonLinkerContext.h" #include "lld/Common/ErrorHandler.h" #include "lld/Common/Filesystem.h" #include "lld/Common/Strings.h" @@ -28,6 +29,7 @@ #include "llvm/Support/Error.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Path.h" #include #include #include @@ -305,6 +307,7 @@ std::vector BitcodeCompiler::compile() { unsigned maxTasks = ltoObj->getMaxTasks(); buf.resize(maxTasks); files.resize(maxTasks); + filenames.resize(maxTasks); // The --thinlto-cache-dir option specifies the path to a directory in which // to cache native object files for ThinLTO incremental builds. If a path was @@ -315,13 +318,15 @@ std::vector BitcodeCompiler::compile() { [&](size_t task, const Twine &moduleName, std::unique_ptr mb) { files[task] = std::move(mb); + filenames[task] = moduleName.str(); })); if (!ctx.bitcodeFiles.empty()) checkError(ltoObj->run( [&](size_t task, const Twine &moduleName) { + buf[task].first = moduleName.str(); return std::make_unique( - std::make_unique(buf[task])); + std::make_unique(buf[task].second)); }, cache)); @@ -340,7 +345,7 @@ std::vector BitcodeCompiler::compile() { if (config->thinLTOIndexOnly) { if (!config->ltoObjPath.empty()) - saveBuffer(buf[0], config->ltoObjPath); + saveBuffer(buf[0].second, config->ltoObjPath); // ThinLTO with index only option is required to generate only the index // files. 
After that, we exit from linker and ThinLTO backend runs in a @@ -354,32 +359,57 @@ std::vector BitcodeCompiler::compile() { pruneCache(config->thinLTOCacheDir, config->thinLTOCachePolicy, files); if (!config->ltoObjPath.empty()) { - saveBuffer(buf[0], config->ltoObjPath); + saveBuffer(buf[0].second, config->ltoObjPath); for (unsigned i = 1; i != maxTasks; ++i) - saveBuffer(buf[i], config->ltoObjPath + Twine(i)); - } - - if (config->saveTempsArgs.contains("prelink")) { - if (!buf[0].empty()) - saveBuffer(buf[0], config->outputFile + ".lto.o"); - for (unsigned i = 1; i != maxTasks; ++i) - saveBuffer(buf[i], config->outputFile + Twine(i) + ".lto.o"); - } - - if (config->ltoEmitAsm) { - saveBuffer(buf[0], config->outputFile); - for (unsigned i = 1; i != maxTasks; ++i) - saveBuffer(buf[i], config->outputFile + Twine(i)); - return {}; + saveBuffer(buf[i].second, config->ltoObjPath + Twine(i)); } + bool savePrelink = config->saveTempsArgs.contains("prelink"); std::vector ret; - for (unsigned i = 0; i != maxTasks; ++i) - if (!buf[i].empty()) - ret.push_back(createObjFile(MemoryBufferRef(buf[i], "lto.tmp"))); + const char *ext = config->ltoEmitAsm ? ".s" : ".o"; + for (unsigned i = 0; i != maxTasks; ++i) { + StringRef bitcodeFilePath; + StringRef objBuf; + if (files[i]) { + // When files[i] is not null, we get the native relocatable file from the + // cache. filenames[i] contains the original BitcodeFile's identifier. + objBuf = files[i]->getBuffer(); + bitcodeFilePath = filenames[i]; + } else { + // Get the native relocatable file after in-process LTO compilation. + objBuf = buf[i].second; + bitcodeFilePath = buf[i].first; + } + if (objBuf.empty()) + continue; - for (std::unique_ptr &file : files) - if (file) - ret.push_back(createObjFile(*file)); + // If the input bitcode file is path/to/x.o and -o specifies a.out, the + // corresponding native relocatable file path will look like: + // path/to/a.out.lto.x.o. + StringRef ltoObjName; + if (bitcodeFilePath == "ld-temp.o") { + ltoObjName = + saver().save(Twine(config->outputFile) + ".lto" + + (i == 0 ? Twine("") : Twine('.') + Twine(i)) + ext); + } else { + StringRef directory = sys::path::parent_path(bitcodeFilePath); + // For an archive member, which has an identifier like "d/a.a(coll.o at + // 8)" (see BitcodeFile::BitcodeFile), use the filename; otherwise, use + // the stem (d/a.o => a). + StringRef baseName = bitcodeFilePath.ends_with(")") + ? sys::path::filename(bitcodeFilePath) + : sys::path::stem(bitcodeFilePath); + StringRef outputFileBaseName = sys::path::filename(config->outputFile); + SmallString<256> path; + sys::path::append(path, directory, + outputFileBaseName + ".lto." + baseName + ext); + sys::path::remove_dots(path, true); + ltoObjName = saver().save(path.str()); + } + if (savePrelink || config->ltoEmitAsm) + saveBuffer(buf[i].second, ltoObjName); + if (!config->ltoEmitAsm) + ret.push_back(createObjFile(MemoryBufferRef(objBuf, ltoObjName))); + } return ret; } diff --git a/lld/ELF/LTO.h b/lld/ELF/LTO.h index 7ab654101f248..405e7f4a649e8 100644 --- a/lld/ELF/LTO.h +++ b/lld/ELF/LTO.h @@ -46,8 +46,10 @@ class BitcodeCompiler { private: std::unique_ptr ltoObj; - std::vector> buf; + // An array of (module name, native relocatable file content) pairs. 
+ SmallVector>, 0> buf; std::vector> files; + SmallVector filenames; llvm::DenseSet usedStartStop; std::unique_ptr indexFile; llvm::DenseSet thinIndices; diff --git a/lld/test/ELF/common-archive-lookup.s b/lld/test/ELF/common-archive-lookup.s index 297d3d9b6f36e..a30d0f18d01ad 100644 --- a/lld/test/ELF/common-archive-lookup.s +++ b/lld/test/ELF/common-archive-lookup.s @@ -61,10 +61,12 @@ # RUN: ld.lld --no-fortran-common -o 11 main.o --start-lib 1.o strong_data_only.o --end-lib # RUN: llvm-readobj --syms 11 | FileCheck --check-prefix=NFC %s -# RUN: ld.lld -o - main.o 4.a --fortran-common --lto-emit-asm | FileCheck --check-prefix=ASM %s +# RUN: ld.lld -o out main.o 4.a --fortran-common --lto-emit-asm +# RUN: FileCheck --check-prefix=ASM %s < out.lto.s -# RUN: ld.lld -o - main.o --start-lib 1.bc 2.bc --end-lib --fortran-common --lto-emit-asm | \ -# RUN: FileCheck --check-prefix=ASM %s +# RUN: rm out.lto.s +# RUN: ld.lld -o out main.o --start-lib 1.bc 2.bc --end-lib --fortran-common --lto-emit-asm +# RUN: FileCheck --check-prefix=ASM %s < out.lto.s ## COMMON overrides weak. Don't extract 3.bc which provides a weak definition. # RUN: ld.lld -o /dev/null main.o --start-lib 1.bc 3.bc --end-lib -y block | FileCheck --check-prefix=LTO_WEAK %s diff --git a/lld/test/ELF/lto/cache.ll b/lld/test/ELF/lto/cache.ll index 95c68ab2315e6..aa1729853d091 100644 --- a/lld/test/ELF/lto/cache.ll +++ b/lld/test/ELF/lto/cache.ll @@ -53,10 +53,10 @@ ; RUN: rm -fr cache && mkdir cache ; RUN: ld.lld --thinlto-cache-dir=cache --save-temps -o out b.bc a.bc -M | FileCheck %s --check-prefix=MAP -; RUN: ls out1.lto.o a.bc.0.preopt.bc b.bc.0.preopt.bc +; RUN: ls out.lto.a.o a.bc.0.preopt.bc b.bc.0.preopt.bc -; MAP: llvmcache-{{.*}}:(.text) -; MAP: llvmcache-{{.*}}:(.text) +; MAP: out.lto.b.o:(.text) +; MAP: out.lto.a.o:(.text) ;; Check that mllvm options participate in the cache key ; RUN: rm -rf cache && mkdir cache diff --git a/lld/test/ELF/lto/comdat-mixed-archive.test b/lld/test/ELF/lto/comdat-mixed-archive.test index d7ea4eb18c72e..88e294ed98722 100644 --- a/lld/test/ELF/lto/comdat-mixed-archive.test +++ b/lld/test/ELF/lto/comdat-mixed-archive.test @@ -35,8 +35,8 @@ TRACE-NEXT: lib.a(obj.o): definition of bar TRACE-NEXT: lib.a(obj.o): reference to foo TRACE-NEXT: : reference to foo ;; The definition of "foo" is visible outside the LTO result. 
-TRACE-NEXT: lto.tmp: definition of foo -TRACE-NEXT: lto.tmp: reference to bar +TRACE-NEXT: {{.*}}.lto.o: definition of foo +TRACE-NEXT: {{.*}}.lto.o: reference to bar ;--- start.s .global _start, baz diff --git a/lld/test/ELF/lto/emit-asm.ll b/lld/test/ELF/lto/emit-asm.ll index 03cf32d904520..463b4fc3c5f62 100644 --- a/lld/test/ELF/lto/emit-asm.ll +++ b/lld/test/ELF/lto/emit-asm.ll @@ -1,13 +1,14 @@ ; REQUIRES: x86 ; RUN: rm -rf %t && mkdir %t && cd %t ; RUN: llvm-as %s -o a.bc -; RUN: ld.lld --lto-emit-asm -shared a.bc -o - | FileCheck %s -; RUN: ld.lld --plugin-opt=emit-asm --plugin-opt=lto-partitions=2 -shared a.bc -o out.s -; RUN: cat out.s out.s1 | FileCheck %s +; RUN: ld.lld --lto-emit-asm -shared a.bc -o out 2>&1 | count 0 +; RUN: FileCheck %s < out.lto.s +; RUN: ld.lld --plugin-opt=emit-asm --plugin-opt=lto-partitions=2 -shared a.bc -o out +; RUN: cat out.lto.s out.lto.1.s | FileCheck %s -; RUN: ld.lld --lto-emit-asm --save-temps -shared a.bc -o out.s -; RUN: FileCheck --input-file out.s %s -; RUN: llvm-dis out.s.0.4.opt.bc -o - | FileCheck --check-prefix=OPT %s +; RUN: ld.lld --lto-emit-asm --save-temps -shared a.bc -o out +; RUN: FileCheck --input-file out.lto.s %s +; RUN: llvm-dis out.0.4.opt.bc -o - | FileCheck --check-prefix=OPT %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/lld/test/ELF/lto/exclude-libs-libcall.ll b/lld/test/ELF/lto/exclude-libs-libcall.ll index 3fdffb4276d9e..8e1d54853cc37 100644 --- a/lld/test/ELF/lto/exclude-libs-libcall.ll +++ b/lld/test/ELF/lto/exclude-libs-libcall.ll @@ -7,7 +7,7 @@ ; RUN: llvm-readelf --dyn-syms %t.so | FileCheck %s ; TRACE: {{.*}}/b.a(b.o): lazy definition of __divti3 -; TRACE-NEXT: lto.tmp: reference to __divti3 +; TRACE-NEXT: {{.*}}.lto.o: reference to __divti3 ; TRACE-NEXT: {{.*}}/b.a(b.o): definition of __divti3 ; CHECK: Symbol table '.dynsym' contains 2 entries: diff --git a/lld/test/ELF/lto/obj-path.ll b/lld/test/ELF/lto/obj-path.ll index 23b593bd36fa8..c0bb4addf2466 100644 --- a/lld/test/ELF/lto/obj-path.ll +++ b/lld/test/ELF/lto/obj-path.ll @@ -67,12 +67,12 @@ ;; Ensure lld emits empty combined module if specific obj-path. ; RUN: mkdir obj ; RUN: ld.lld --plugin-opt=obj-path=objpath.o -shared 1.bc d/2.bc -o obj/out --save-temps -; RUN: ls obj/out.lto.o obj/out1.lto.o obj/out2.lto.o +; RUN: ls obj/out.lto.o out.lto.1.o d/out.lto.2.o ;; Ensure lld does not emit empty combined module by default. 
; RUN: rm -fr obj && mkdir obj ; RUN: ld.lld -shared 1.bc d/2.bc -o obj/out --save-temps -; RUN: ls obj/out*.lto.* | count 2 +; RUN: not test -e obj/out.lto.o ; EMPTY: file format elf64-x86-64 ; EMPTY-NOT: {{.}} diff --git a/lld/test/ELF/lto/parallel-internalize.ll b/lld/test/ELF/lto/parallel-internalize.ll index 9cd080360d396..a1511d6f6867d 100644 --- a/lld/test/ELF/lto/parallel-internalize.ll +++ b/lld/test/ELF/lto/parallel-internalize.ll @@ -4,7 +4,7 @@ ; RUN: ld.lld --lto-partitions=2 -save-temps -o out a.bc -e foo --lto-O0 ; RUN: llvm-readobj --symbols --dyn-syms out | FileCheck %s ; RUN: llvm-nm out.lto.o | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-nm out1.lto.o | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-nm out.lto.1.o | FileCheck --check-prefix=CHECK1 %s ; CHECK: Symbols [ ; CHECK-NEXT: Symbol { diff --git a/lld/test/ELF/lto/parallel.ll b/lld/test/ELF/lto/parallel.ll index c6c9a185677b8..6b2c352b0a965 100644 --- a/lld/test/ELF/lto/parallel.ll +++ b/lld/test/ELF/lto/parallel.ll @@ -3,7 +3,7 @@ ; RUN: llvm-as -o a.bc %s ; RUN: ld.lld --lto-partitions=2 -save-temps -o out a.bc -shared ; RUN: llvm-nm out.lto.o | FileCheck --check-prefix=CHECK0 %s -; RUN: llvm-nm out1.lto.o | FileCheck --check-prefix=CHECK1 %s +; RUN: llvm-nm out.lto.1.o | FileCheck --check-prefix=CHECK1 %s ; RUN: not ld.lld --lto-partitions=0 a.bc -o /dev/null 2>&1 | FileCheck --check-prefix=INVALID %s ; INVALID: --lto-partitions: number of threads must be > 0 diff --git a/lld/test/ELF/lto/pseudo-probe-lto.ll b/lld/test/ELF/lto/pseudo-probe-lto.ll index 3840c7c2281c0..cabd69da33fe2 100644 --- a/lld/test/ELF/lto/pseudo-probe-lto.ll +++ b/lld/test/ELF/lto/pseudo-probe-lto.ll @@ -1,6 +1,7 @@ ; REQUIRES: x86 ; RUN: opt < %s -passes=pseudo-probe -function-sections -o %t.o -; RUN: ld.lld %t.o -shared --lto-emit-asm -o - | FileCheck %s +; RUN: ld.lld %t.o -shared --lto-emit-asm -o %t +; RUN: FileCheck %s < %t.lto.s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-scei-ps4" diff --git a/lld/test/ELF/lto/save-temps-eq.ll b/lld/test/ELF/lto/save-temps-eq.ll index ed9c3970347f3..9befc99af9253 100644 --- a/lld/test/ELF/lto/save-temps-eq.ll +++ b/lld/test/ELF/lto/save-temps-eq.ll @@ -14,14 +14,14 @@ ;; Create the .all dir with save-temps saving everything, this will be used to compare ;; with the output from individualized save-temps later ; RUN: ld.lld main.o thin1.o --save-temps -o %t/all/a.out -; RUN: mv *.o.* %t/all +; RUN: mv a.out.lto.* *.o.*.bc %t/all ;; Sanity check that everything got moved ; RUN: ls | count 2 ;; Check precedence if both --save-temps and --save-temps= are present ; RUN: ld.lld main.o thin1.o --save-temps=preopt --save-temps --save-temps=\opt -o %t/all2/a.out ; RUN: cmp %t/all2/a.out %t/all/a.out -; RUN: mv *.o.* %t/all2 +; RUN: mv a.out.lto.* *.o.* %t/all2 ; RUN: ls | count 2 ; RUN: diff -r %t/all %t/all2 @@ -83,8 +83,8 @@ ;; Check prelink ; RUN: ld.lld main.o thin1.o --save-temps=prelink ; RUN: cmp %t/all/a.out a.out && rm -f a.out -; RUN: cp *.lto.o %t/subset2 -; RUN: mv *.lto.o %t/all3 +; RUN: cp a.out.lto.*.o %t/subset2 +; RUN: mv a.out.lto.*.o %t/all3 ; RUN: ls | count 2 ;; Check resolution @@ -104,7 +104,7 @@ ; RUN: cmp %t/all/a.out a.out && rm -f a.out ; RUN: mv *.0.preopt.* %t/subset ; RUN: mv *.4.opt* %t/subset -; RUN: mv *.lto.o %t/subset +; RUN: mv a.out.lto.*.o %t/subset ; RUN: ls | count 2 ; RUN: diff -r %t/subset2 %t/subset diff --git a/lld/test/ELF/lto/thin-archivecollision.ll 
b/lld/test/ELF/lto/thin-archivecollision.ll index da3d08801e96f..4fbb20d9c1280 100644 --- a/lld/test/ELF/lto/thin-archivecollision.ll +++ b/lld/test/ELF/lto/thin-archivecollision.ll @@ -9,21 +9,21 @@ ; RUN: llvm-ar rcS d/a.a d/coll.o e/coll.o ; RUN: ld.lld b.bc d/a.a -o out --save-temps ; RUN: llvm-nm out | FileCheck %s -; RUN: llvm-nm out2.lto.o | FileCheck %s --check-prefix=MOD2 -; RUN: llvm-nm out3.lto.o | FileCheck %s --check-prefix=MOD3 +;; d/out.lto.a.a(coll.o at 8).o out.lto.a.a(coll.o at 1916).o +; RUN: llvm-nm d/out.lto.a.a*at*.o | FileCheck %s --check-prefix=MOD2 ; Check we handle this case correctly even in presence of --whole-archive. -; RUN: rm out1.lto.o out2.lto.o out3.lto.o +; RUN: rm d/out.lto.a.a*at*.o ; RUN: ld.lld b.bc --whole-archive d/a.a -o out --save-temps ; RUN: llvm-nm out | FileCheck %s -; RUN: ls out1.lto.o out2.lto.o out3.lto.o +; RUN: llvm-nm d/out.lto.a.a*at*.o | FileCheck %s --check-prefix=MOD2 ; CHECK: T _start ; CHECK: T blah ; CHECK: T foo -; MOD2: T foo -; MOD3: T blah +; MOD2-DAG: T foo +; MOD2-DAG: T blah target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-scei-ps4" diff --git a/lld/test/ELF/lto/thinlto.ll b/lld/test/ELF/lto/thinlto.ll index f42a0323ff044..e0e2ca8e2e8ea 100644 --- a/lld/test/ELF/lto/thinlto.ll +++ b/lld/test/ELF/lto/thinlto.ll @@ -7,56 +7,56 @@ ; RUN: opt -module-summary %p/Inputs/thinlto.ll -o d/b.o ; First force single-threaded mode -; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: rm -f out.lto.a.o d/out.lto.b.o ; RUN: ld.lld -save-temps --thinlto-jobs=1 -shared a.o d/b.o -o e/out -; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: llvm-nm out.lto.a.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm d/out.lto.b.o | FileCheck %s --check-prefix=NM2 ; Next force multi-threaded mode -; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: rm -f out.lto.a.o d/out.lto.b.o ; RUN: ld.lld -save-temps --thinlto-jobs=2 -shared a.o d/b.o -o e/out -; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: llvm-nm out.lto.a.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm d/out.lto.b.o | FileCheck %s --check-prefix=NM2 ;; --plugin-opt=jobs= is an alias. -; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: rm -f out.lto.a.o d/out.lto.b.o ; RUN: ld.lld -save-temps --plugin-opt=jobs=2 -shared a.o d/b.o -o e/out -; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: llvm-nm out.lto.a.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm d/out.lto.b.o | FileCheck %s --check-prefix=NM2 ;; --thinlto-jobs= defaults to --threads=. -; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: rm -f out.lto.a.o d/out.lto.b.o ; RUN: ld.lld -save-temps --threads=2 -shared a.o d/b.o -o e/out -; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: llvm-nm out.lto.a.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm d/out.lto.b.o | FileCheck %s --check-prefix=NM2 ;; --thinlto-jobs= overrides --threads=. 
-; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: rm -f out.lto.a.o d/out.lto.b.o ; RUN: ld.lld -save-temps --threads=1 --plugin-opt=jobs=2 -shared a.o d/b.o -o e/out -; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: llvm-nm out.lto.a.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm d/out.lto.b.o | FileCheck %s --check-prefix=NM2 ; Test with all threads, on all cores, on all CPU sockets -; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: rm -f out.lto.a.o d/out.lto.b.o ; RUN: ld.lld -save-temps --thinlto-jobs=all -shared a.o d/b.o -o e/out -; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: llvm-nm out.lto.a.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm d/out.lto.b.o | FileCheck %s --check-prefix=NM2 ; Test with many more threads than the system has -; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: rm -f out.lto.a.o d/out.lto.b.o ; RUN: ld.lld -save-temps --thinlto-jobs=100 -shared a.o d/b.o -o e/out -; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: llvm-nm out.lto.a.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm d/out.lto.b.o | FileCheck %s --check-prefix=NM2 ; Test with a bad value -; RUN: rm -f e/out1.lto.o e/out2.lto.o +; RUN: rm -f out.lto.a.o d/out.lto.b.o ; RUN: not ld.lld -save-temps --thinlto-jobs=foo -shared a.o d/b.o -o e/out 2>&1 | FileCheck %s --check-prefix=BAD-JOBS ; BAD-JOBS: error: --thinlto-jobs: invalid job count: foo ; Then check without --thinlto-jobs (which currently defaults to heavyweight_hardware_concurrency, meanning one thread per hardware core -- not SMT) ; RUN: ld.lld -shared -save-temps a.o d/b.o -o e/out -; RUN: llvm-nm e/out1.lto.o | FileCheck %s --check-prefix=NM1 -; RUN: llvm-nm e/out2.lto.o | FileCheck %s --check-prefix=NM2 +; RUN: llvm-nm out.lto.a.o | FileCheck %s --check-prefix=NM1 +; RUN: llvm-nm d/out.lto.b.o | FileCheck %s --check-prefix=NM2 ; Check that -save-temps is usable with thin archives ; RUN: mkdir dir From 7ca8feb1b89dfe62c9c3a06aa0585adbcd019eff Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Tue, 23 Jan 2024 12:14:06 -0800 Subject: [PATCH 681/843] [lldb] Include SBFormat.h in LLDB.h (#79194) This was likely overlooked when SBFormat was added. --- lldb/include/lldb/API/LLDB.h | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/include/lldb/API/LLDB.h b/lldb/include/lldb/API/LLDB.h index f652d1bdb835b..f5f1b87a046c2 100644 --- a/lldb/include/lldb/API/LLDB.h +++ b/lldb/include/lldb/API/LLDB.h @@ -33,6 +33,7 @@ #include "lldb/API/SBFile.h" #include "lldb/API/SBFileSpec.h" #include "lldb/API/SBFileSpecList.h" +#include "lldb/API/SBFormat.h" #include "lldb/API/SBFrame.h" #include "lldb/API/SBFunction.h" #include "lldb/API/SBHostOS.h" From 9122d19de4a237ededfd25c120bcb6e5efb22b1c Mon Sep 17 00:00:00 2001 From: Roland McGrath Date: Tue, 23 Jan 2024 12:15:22 -0800 Subject: [PATCH 682/843] Revert "[libc] Fix forward arm32 buildbot" (#79201) Reverts llvm/llvm-project#79151, necessary to revert #79128, which broke all production builds and landed without review. 
--- libc/src/__support/FPUtil/arm/FEnvImpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/src/__support/FPUtil/arm/FEnvImpl.h b/libc/src/__support/FPUtil/arm/FEnvImpl.h index ac4673cf20f63..1a89de50b6b60 100644 --- a/libc/src/__support/FPUtil/arm/FEnvImpl.h +++ b/libc/src/__support/FPUtil/arm/FEnvImpl.h @@ -135,8 +135,8 @@ LIBC_INLINE int set_except(int excepts) { LIBC_INLINE int raise_except(int excepts) { float zero = 0.0f; float one = 1.0f; - float large_value = FPBits::max_normal().get_val(); - float small_value = FPBits::min_normal().get_val(); + float large_value = FPBits::max_normal(); + float small_value = FPBits::min_normal(); auto divfunc = [](float a, float b) { __asm__ __volatile__("flds s0, %0\n\t" "flds s1, %1\n\t" From f1abe78e6b6adadb9fd9d6a239f40317c5d56f94 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 23 Jan 2024 12:09:13 -0800 Subject: [PATCH 683/843] [RISCV] Move FeatureStdExtH in RISCVFeatures.td. NFC It was accidentally in the middle of the floating point extensions after the recent reordering. --- llvm/lib/Target/RISCV/RISCVFeatures.td | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVFeatures.td b/llvm/lib/Target/RISCV/RISCVFeatures.td index 30ede62944253..3878be680c049 100644 --- a/llvm/lib/Target/RISCV/RISCVFeatures.td +++ b/llvm/lib/Target/RISCV/RISCVFeatures.td @@ -200,14 +200,6 @@ def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">, AssemblerPredicate<(all_of FeatureStdExtD), "'D' (Double-Precision Floating-Point)">; -def FeatureStdExtH - : SubtargetFeature<"h", "HasStdExtH", "true", - "'H' (Hypervisor)">; - -def HasStdExtH : Predicate<"Subtarget->hasStdExtH()">, - AssemblerPredicate<(all_of FeatureStdExtH), - "'H' (Hypervisor)">; - def FeatureStdExtZfhmin : SubtargetFeature<"zfhmin", "HasStdExtZfhmin", "true", "'Zfhmin' (Half-Precision Floating-Point Minimal)", @@ -747,6 +739,16 @@ def HasVInstructionsF64 : Predicate<"Subtarget->hasVInstructionsF64()">; def HasVInstructionsFullMultiply : Predicate<"Subtarget->hasVInstructionsFullMultiply()">; +// Hypervisor Extensions + +def FeatureStdExtH + : SubtargetFeature<"h", "HasStdExtH", "true", + "'H' (Hypervisor)">; + +def HasStdExtH : Predicate<"Subtarget->hasStdExtH()">, + AssemblerPredicate<(all_of FeatureStdExtH), + "'H' (Hypervisor)">; + // Supervisor extensions def FeatureStdExtSmaia From d86a6eacf661f2bf05ea486b7a04b0cc1f97919c Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Tue, 23 Jan 2024 15:19:34 -0500 Subject: [PATCH 684/843] [libc] Fix aliasing function name got accidentally deleted in #79128. 
(#79203) --- libc/src/__support/common.h | 1 + 1 file changed, 1 insertion(+) diff --git a/libc/src/__support/common.h b/libc/src/__support/common.h index a153dfc363d73..53951dc131c28 100644 --- a/libc/src/__support/common.h +++ b/libc/src/__support/common.h @@ -25,6 +25,7 @@ #define LLVM_LIBC_FUNCTION_IMPL(type, name, arglist) \ LLVM_LIBC_FUNCTION_ATTR decltype(LIBC_NAMESPACE::name) \ __##name##_impl__ __asm__(#name); \ + decltype(LIBC_NAMESPACE::name) name [[gnu::alias(#name)]]; \ type __##name##_impl__ arglist #else #define LLVM_LIBC_FUNCTION_IMPL(type, name, arglist) type name arglist From eabddf22e20bc5fd62edda677056e1d635f8e7f4 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 23 Jan 2024 15:27:54 -0500 Subject: [PATCH 685/843] [libc++] Run the nightly libc++ build at 03:00 Eastern for real (#79184) The nightly libc++ build was incorrectly set up to build at 22:00 Eastern when it intended to run at 03:00 Eastern. This patch fixes that. --- .github/workflows/libcxx-build-and-test.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/libcxx-build-and-test.yaml b/.github/workflows/libcxx-build-and-test.yaml index 0cab9b841e4ee..5727b956dc6dd 100644 --- a/.github/workflows/libcxx-build-and-test.yaml +++ b/.github/workflows/libcxx-build-and-test.yaml @@ -23,8 +23,8 @@ on: - 'cmake/**' - '.github/workflows/libcxx-build-and-test.yaml' schedule: - # Run nightly at 8 AM UTC (or roughly 3 AM eastern) - - cron: '0 3 * * *' + # Run nightly at 08:00 UTC (aka 00:00 Pacific, aka 03:00 Eastern) + - cron: '0 8 * * *' permissions: contents: read # Default everything to read-only From d813af73f70f6b2fd41621d640cc35e595b9da4c Mon Sep 17 00:00:00 2001 From: "Ben Hamilton (Ben Gertzfield)" Date: Tue, 23 Jan 2024 13:32:41 -0700 Subject: [PATCH 686/843] [Format] Fix detection of languages when reading from stdin (#79051) The code cleanup in #74794 accidentally broke detection of languages by reading file content from stdin, e.g. via `clang-format -dump-config - < /path/to/filename`. This PR adds unit and integration tests to reproduce the issue and adds a fix. Fixes: #79023 --- clang/test/Format/dump-config-objc-stdin.m | 5 +++++ clang/tools/clang-format/ClangFormat.cpp | 24 ++++++++++++---------- clang/unittests/Format/FormatTestObjC.cpp | 8 ++++++++ 3 files changed, 26 insertions(+), 11 deletions(-) create mode 100644 clang/test/Format/dump-config-objc-stdin.m diff --git a/clang/test/Format/dump-config-objc-stdin.m b/clang/test/Format/dump-config-objc-stdin.m new file mode 100644 index 0000000000000..b22ff7b3328ca --- /dev/null +++ b/clang/test/Format/dump-config-objc-stdin.m @@ -0,0 +1,5 @@ +// RUN: clang-format -dump-config - < %s | FileCheck %s + +// CHECK: Language: ObjC +@interface Foo +@end diff --git a/clang/tools/clang-format/ClangFormat.cpp b/clang/tools/clang-format/ClangFormat.cpp index 49ab7677a3ee9..5ee6092bb9bb7 100644 --- a/clang/tools/clang-format/ClangFormat.cpp +++ b/clang/tools/clang-format/ClangFormat.cpp @@ -547,18 +547,20 @@ static void PrintVersion(raw_ostream &OS) { // Dump the configuration. static int dumpConfig(bool IsSTDIN) { std::unique_ptr Code; - // We can't read the code to detect the language if there's no file name. - if (!IsSTDIN) { - // Read in the code in case the filename alone isn't enough to detect the - // language. 
- ErrorOr> CodeOrErr = - MemoryBuffer::getFileOrSTDIN(FileNames[0]); - if (std::error_code EC = CodeOrErr.getError()) { - llvm::errs() << EC.message() << "\n"; - return 1; - } - Code = std::move(CodeOrErr.get()); + + // `FileNames` must have at least "-" in it even if no file was specified. + assert(!FileNames.empty()); + + // Read in the code in case the filename alone isn't enough to detect the + // language. + ErrorOr> CodeOrErr = + MemoryBuffer::getFileOrSTDIN(FileNames[0]); + if (std::error_code EC = CodeOrErr.getError()) { + llvm::errs() << EC.message() << "\n"; + return 1; } + Code = std::move(CodeOrErr.get()); + llvm::Expected FormatStyle = clang::format::getStyle(Style, IsSTDIN ? AssumeFileName : FileNames[0], FallbackStyle, Code ? Code->getBuffer() : ""); diff --git a/clang/unittests/Format/FormatTestObjC.cpp b/clang/unittests/Format/FormatTestObjC.cpp index cd4f9d934127b..d2c3459e0f846 100644 --- a/clang/unittests/Format/FormatTestObjC.cpp +++ b/clang/unittests/Format/FormatTestObjC.cpp @@ -31,6 +31,14 @@ class FormatTestObjC : public FormatTestBase { _verifyIncompleteFormat(__FILE__, __LINE__, __VA_ARGS__) #define verifyFormat(...) _verifyFormat(__FILE__, __LINE__, __VA_ARGS__) +TEST(FormatTestObjCStyle, DetectsObjCInStdin) { + auto Style = getStyle("LLVM", "", "none", + "@interface\n" + "- (id)init;"); + ASSERT_TRUE((bool)Style); + EXPECT_EQ(FormatStyle::LK_ObjC, Style->Language); +} + TEST(FormatTestObjCStyle, DetectsObjCInHeaders) { auto Style = getStyle("LLVM", "a.h", "none", "@interface\n" From b504e97d921d72cc4407ab909ec371d1af957ab7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 23 Jan 2024 20:42:50 +0000 Subject: [PATCH 687/843] [IndVars] Add NUW variants to iv-poison.ll and variants with extra uses. --- .../Transforms/IndVarSimplify/iv-poison.ll | 242 ++++++++++++++++++ 1 file changed, 242 insertions(+) diff --git a/llvm/test/Transforms/IndVarSimplify/iv-poison.ll b/llvm/test/Transforms/IndVarSimplify/iv-poison.ll index 91358a1477939..38299e0a6b353 100644 --- a/llvm/test/Transforms/IndVarSimplify/iv-poison.ll +++ b/llvm/test/Transforms/IndVarSimplify/iv-poison.ll @@ -86,6 +86,70 @@ exit: ret i2 %iv.1.next } +define i4 @iv_hoist_both_adds_nsw_extra_use(i4 %arg) { +; CHECK-LABEL: @iv_hoist_both_adds_nsw_extra_use( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_0:%.*]] = phi i4 [ 1, [[BB:%.*]] ], [ [[IV_0_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_0_NEXT]] = add nuw i4 [[IV_0]], 1 +; CHECK-NEXT: call void @use(i4 [[IV_0_NEXT]]) +; CHECK-NEXT: call void @use(i4 [[IV_0_NEXT]]) +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i4 1, [[ARG:%.*]] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i4 [ [[IV_0_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i4 [[IV_1_NEXT_LCSSA]] +; +bb: + br label %loop + +loop: + %iv.0 = phi i4 [ 1, %bb ], [ %iv.0.next, %loop ] + %iv.1 = phi i4 [ 1, %bb ], [ %iv.1.next, %loop ] + %iv.0.next = add nsw i4 %iv.0, 1 + call void @use(i4 %iv.0.next) + %iv.1.next = add nsw i4 %iv.1, 1 + call void @use(i4 %iv.1.next) + %.not.not = icmp ult i4 %iv.0, %arg + br i1 %.not.not, label %exit, label %loop + +exit: + ret i4 %iv.1.next +} + +define i4 @iv_hoist_both_adds_nsw_extra_use_incs_reordered(i4 %arg) { +; CHECK-LABEL: @iv_hoist_both_adds_nsw_extra_use_incs_reordered( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_0:%.*]] = phi i4 [ 1, [[BB:%.*]] ], [ [[IV_0_NEXT:%.*]], [[LOOP]] ] +; 
CHECK-NEXT: [[IV_0_NEXT]] = add nuw i4 [[IV_0]], 1 +; CHECK-NEXT: call void @use(i4 [[IV_0_NEXT]]) +; CHECK-NEXT: call void @use(i4 [[IV_0_NEXT]]) +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i4 1, [[ARG:%.*]] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i4 [ [[IV_0_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i4 [[IV_1_NEXT_LCSSA]] +; +bb: + br label %loop + +loop: + %iv.0 = phi i4 [ 1, %bb ], [ %iv.0.next, %loop ] + %iv.1 = phi i4 [ 1, %bb ], [ %iv.1.next, %loop ] + %iv.1.next = add nsw i4 %iv.1, 1 + call void @use(i4 %iv.1.next) + %iv.0.next = add nsw i4 %iv.0, 1 + call void @use(i4 %iv.0.next) + %.not.not = icmp ult i4 %iv.0, %arg + br i1 %.not.not, label %exit, label %loop + +exit: + ret i4 %iv.1.next +} + define i4 @iv_hoist_nsw_poison_extra_use(i4 %0, i4 %end, i4 %start) { ; CHECK-LABEL: @iv_hoist_nsw_poison_extra_use( ; CHECK-NEXT: entry: @@ -117,3 +181,181 @@ exit: } declare void @use(i4) + +define i2 @iv_hoist_nuw_poison(i2 %arg, i2 %start) { +; CHECK-LABEL: @iv_hoist_nuw_poison( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[DOT07:%.*]] = phi i2 [ [[START:%.*]], [[BB:%.*]] ], [ [[I:%.*]], [[BB1]] ] +; CHECK-NEXT: [[I]] = add i2 [[DOT07]], 1 +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i2 [[START]], [[ARG:%.*]] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[COMMON_RET:%.*]], label [[BB1]] +; CHECK: common.ret: +; CHECK-NEXT: [[I2_LCSSA:%.*]] = phi i2 [ [[I]], [[BB1]] ] +; CHECK-NEXT: ret i2 [[I2_LCSSA]] +; +bb: + br label %bb1 + +bb1: ; preds = %bb1, %bb + %.07 = phi i2 [ %start, %bb ], [ %i, %bb1 ] + %.0 = phi i2 [ %start, %bb ], [ %i2, %bb1 ] + %i = add nuw i2 %.07, 1 + %i2 = add i2 %.0, 1 + %.not.not = icmp ult i2 %.07, %arg + br i1 %.not.not, label %common.ret, label %bb1 + +common.ret: ; preds = %bb1 + ret i2 %i2 +} + +define i4 @iv_hoist_nuw_poison2(i4 %0, i4 %end, i4 %start) { +; CHECK-LABEL: @iv_hoist_nuw_poison2( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_0:%.*]] = phi i4 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[IV_0_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_0_NEXT]] = add i4 [[IV_0]], 1 +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i4 [[START]], [[END:%.*]] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i4 [ [[IV_0_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i4 [[IV_1_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv.0 = phi i4 [ %start, %entry ], [ %iv.0.next, %loop ] + %iv.1 = phi i4 [ %start, %entry ], [ %iv.1.next, %loop ] + %iv.0.next = add i4 %iv.0, 1 + %iv.1.next = add nuw i4 %iv.1, 1 + %.not.not = icmp ult i4 %iv.0, %end + br i1 %.not.not, label %exit, label %loop + +exit: + ret i4 %iv.1.next +} + +define i2 @iv_hoist_both_adds_nuw(i2 %arg, i2 %start) { +; CHECK-LABEL: @iv_hoist_both_adds_nuw( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_0:%.*]] = phi i2 [ [[START:%.*]], [[BB:%.*]] ], [ [[IV_0_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_0_NEXT]] = add i2 [[IV_0]], 1 +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i2 [[START]], [[ARG:%.*]] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i2 [ [[IV_0_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i2 [[IV_1_NEXT_LCSSA]] +; +bb: + br label %loop + +loop: + %iv.0 = phi i2 [ %start, %bb ], [ %iv.0.next, %loop ] + %iv.1 = phi i2 [ %start, %bb ], [ 
%iv.1.next, %loop ] + %iv.0.next = add nuw i2 %iv.0, 1 + %iv.1.next = add nuw i2 %iv.1, 1 + %.not.not = icmp ult i2 %iv.0, %arg + br i1 %.not.not, label %exit, label %loop + +exit: + ret i2 %iv.1.next +} + +define i4 @iv_hoist_both_adds_nuw_extra_use(i4 %arg, i4 %start) { +; CHECK-LABEL: @iv_hoist_both_adds_nuw_extra_use( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_0:%.*]] = phi i4 [ [[START:%.*]], [[BB:%.*]] ], [ [[IV_0_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_0_NEXT]] = add i4 [[IV_0]], 1 +; CHECK-NEXT: call void @use(i4 [[IV_0_NEXT]]) +; CHECK-NEXT: call void @use(i4 [[IV_0_NEXT]]) +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i4 [[START]], [[ARG:%.*]] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i4 [ [[IV_0_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i4 [[IV_1_NEXT_LCSSA]] +; +bb: + br label %loop + +loop: + %iv.0 = phi i4 [ %start, %bb ], [ %iv.0.next, %loop ] + %iv.1 = phi i4 [ %start, %bb ], [ %iv.1.next, %loop ] + %iv.0.next = add nuw i4 %iv.0, 1 + call void @use(i4 %iv.0.next) + %iv.1.next = add nuw i4 %iv.1, 1 + call void @use(i4 %iv.1.next) + %.not.not = icmp ult i4 %iv.0, %arg + br i1 %.not.not, label %exit, label %loop + +exit: + ret i4 %iv.1.next +} + +define i4 @iv_hoist_both_adds_nuw_extra_use_incs_reordered(i4 %arg, i4 %start) { +; CHECK-LABEL: @iv_hoist_both_adds_nuw_extra_use_incs_reordered( +; CHECK-NEXT: bb: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_0:%.*]] = phi i4 [ [[START:%.*]], [[BB:%.*]] ], [ [[IV_0_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_0_NEXT]] = add i4 [[IV_0]], 1 +; CHECK-NEXT: call void @use(i4 [[IV_0_NEXT]]) +; CHECK-NEXT: call void @use(i4 [[IV_0_NEXT]]) +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i4 [[START]], [[ARG:%.*]] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i4 [ [[IV_0_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i4 [[IV_1_NEXT_LCSSA]] +; +bb: + br label %loop + +loop: + %iv.0 = phi i4 [ %start, %bb ], [ %iv.0.next, %loop ] + %iv.1 = phi i4 [ %start, %bb ], [ %iv.1.next, %loop ] + %iv.1.next = add nuw i4 %iv.1, 1 + call void @use(i4 %iv.1.next) + %iv.0.next = add nuw i4 %iv.0, 1 + call void @use(i4 %iv.0.next) + %.not.not = icmp ult i4 %iv.0, %arg + br i1 %.not.not, label %exit, label %loop + +exit: + ret i4 %iv.1.next +} + +define i4 @iv_hoist_nuw_poison_extra_use(i4 %0, i4 %end, i4 %start) { +; CHECK-LABEL: @iv_hoist_nuw_poison_extra_use( +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[IV_0:%.*]] = phi i4 [ [[START:%.*]], [[ENTRY:%.*]] ], [ [[IV_0_NEXT:%.*]], [[LOOP]] ] +; CHECK-NEXT: [[IV_0_NEXT]] = add i4 [[IV_0]], 1 +; CHECK-NEXT: call void @use(i4 [[IV_0_NEXT]]) +; CHECK-NEXT: [[DOTNOT_NOT:%.*]] = icmp ult i4 [[START]], [[END:%.*]] +; CHECK-NEXT: br i1 [[DOTNOT_NOT]], label [[EXIT:%.*]], label [[LOOP]] +; CHECK: exit: +; CHECK-NEXT: [[IV_1_NEXT_LCSSA:%.*]] = phi i4 [ [[IV_0_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i4 [[IV_1_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %iv.0 = phi i4 [ %start, %entry ], [ %iv.0.next, %loop ] + %iv.1 = phi i4 [ %start, %entry ], [ %iv.1.next, %loop ] + %iv.0.next = add i4 %iv.0, 1 + call void @use(i4 %iv.0.next) + %iv.1.next = add nuw i4 %iv.1, 1 + %.not.not = icmp ult i4 %iv.0, %end + br i1 %.not.not, label %exit, label %loop + +exit: + ret i4 %iv.1.next +} From bb3e0d7fc34899f22cbaea311982098bae3de061 Mon Sep 17 
00:00:00 2001 From: Alexey Bataev Date: Tue, 23 Jan 2024 11:34:11 -0800 Subject: [PATCH 688/843] [SLP]Fix PR79193: skip analysis of gather nodes for minbitwidth. No need in trying to analyze small graphs with gather node only to avoid crash. --- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 2 +- .../SystemZ/minbitwidth-non-vector-root.ll | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 1 deletion(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-non-vector-root.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 809d740aae3e0..601d2454c1e16 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -13183,7 +13183,7 @@ void BoUpSLP::computeMinimumValueSizes() { // We only attempt to truncate integer expressions. auto &TreeRoot = VectorizableTree[0]->Scalars; auto *TreeRootIT = dyn_cast(TreeRoot[0]->getType()); - if (!TreeRootIT) + if (!TreeRootIT || VectorizableTree.front()->State == TreeEntry::NeedToGather) return; // Ensure the roots of the vectorizable tree don't form a cycle. diff --git a/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-non-vector-root.ll b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-non-vector-root.ll new file mode 100644 index 0000000000000..6524b378f3d8b --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/SystemZ/minbitwidth-non-vector-root.ll @@ -0,0 +1,16 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -passes=slp-vectorizer -S -slp-optimize-identity-hor-reduction-ops=false < %s -mtriple=s390x-ibm-linux -mcpu=arch13 | FileCheck %s + +define void @foo() { +; CHECK-LABEL: define void @foo( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> zeroinitializer) +; CHECK-NEXT: store i32 [[TMP1]], ptr null, align 4 +; CHECK-NEXT: ret void +; + %1 = add i32 0, 0 + %2 = add i32 %1, 0 + %3 = add i32 %2, 0 + store i32 %3, ptr null, align 4 + ret void +} From 16343f0db286a5d2aaf9494dd402cda0d088d0a1 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 23 Jan 2024 12:49:57 -0800 Subject: [PATCH 689/843] [ELF,test] Fix defsym.ll --- lld/test/ELF/lto/defsym.ll | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/lld/test/ELF/lto/defsym.ll b/lld/test/ELF/lto/defsym.ll index 750c393629066..142201889f5c7 100644 --- a/lld/test/ELF/lto/defsym.ll +++ b/lld/test/ELF/lto/defsym.ll @@ -1,17 +1,19 @@ ; REQUIRES: x86 +; RUN: rm -rf %t && mkdir %t && cd %t + ; LTO -; RUN: llvm-as %s -o %t.o -; RUN: llvm-as %S/Inputs/defsym-bar.ll -o %t1.o -; RUN: ld.lld %t.o %t1.o -shared -o %t.so -defsym=bar2=bar3 -save-temps -; RUN: llvm-readelf --symbols %t.so.lto.o | FileCheck --check-prefix=OBJ %s -; RUN: llvm-objdump -d %t.so | FileCheck %s +; RUN: llvm-as %s -o a.o +; RUN: llvm-as %S/Inputs/defsym-bar.ll -o b.o +; RUN: ld.lld a.o b.o -shared -o a.so -defsym=bar2=bar3 -save-temps +; RUN: llvm-readelf --symbols a.so.lto.o | FileCheck --check-prefix=OBJ %s +; RUN: llvm-objdump -d a.so | FileCheck %s ; ThinLTO -; RUN: opt -module-summary %s -o %t.o -; RUN: opt -module-summary %S/Inputs/defsym-bar.ll -o %t1.o -; RUN: ld.lld %t.o %t1.o -shared -o %t2.so -defsym=bar2=bar3 -save-temps -; RUN: llvm-readelf --symbols %t2.so1.lto.o | FileCheck --check-prefix=OBJ %s -; RUN: llvm-objdump -d %t2.so | FileCheck %s +; RUN: opt -module-summary %s -o a.o +; RUN: opt 
-module-summary %S/Inputs/defsym-bar.ll -o b.o +; RUN: ld.lld a.o b.o -shared -o a2.so -defsym=bar2=bar3 -save-temps +; RUN: llvm-readelf --symbols a2.so1.lto.o | FileCheck --check-prefix=OBJ %s +; RUN: llvm-objdump -d a2.so | FileCheck %s ; OBJ: UND bar2 From 4fcd7cf22deff4a63d2bac12c909be7266f8b353 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 23 Jan 2024 12:52:36 -0800 Subject: [PATCH 690/843] [ELF,test] Actually fix defsym.ll --- lld/test/ELF/lto/defsym.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lld/test/ELF/lto/defsym.ll b/lld/test/ELF/lto/defsym.ll index 142201889f5c7..ef50737aa4f1f 100644 --- a/lld/test/ELF/lto/defsym.ll +++ b/lld/test/ELF/lto/defsym.ll @@ -12,7 +12,7 @@ ; RUN: opt -module-summary %s -o a.o ; RUN: opt -module-summary %S/Inputs/defsym-bar.ll -o b.o ; RUN: ld.lld a.o b.o -shared -o a2.so -defsym=bar2=bar3 -save-temps -; RUN: llvm-readelf --symbols a2.so1.lto.o | FileCheck --check-prefix=OBJ %s +; RUN: llvm-readelf --symbols a2.so.lto.a.o | FileCheck --check-prefix=OBJ %s ; RUN: llvm-objdump -d a2.so | FileCheck %s ; OBJ: UND bar2 From 4beea6b195585e4d9e196a9a2b0bb79a9c040788 Mon Sep 17 00:00:00 2001 From: RolandF77 <55763885+RolandF77@users.noreply.github.com> Date: Tue, 23 Jan 2024 16:07:18 -0500 Subject: [PATCH 691/843] [PowerPC] lower partial vector store cost (#78358) There are matching store opcodes (stfd, stxsiwx) for the load opcodes that make 32-bit and 64-bit vector operations cheap with VSX, so stores should also be cheap. --- .../Target/PowerPC/PPCTargetTransformInfo.cpp | 5 +++-- .../Analysis/CostModel/PowerPC/load_store.ll | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp index 062b53e24a0d7..958353f2b4f6f 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp @@ -788,9 +788,10 @@ InstructionCost PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src, // VSX has 32b/64b load instructions. Legalization can handle loading of // 32b/64b to VSR correctly and cheaply. But BaseT::getMemoryOpCost and // PPCTargetLowering can't compute the cost appropriately. So here we - // explicitly check this case. + // explicitly check this case. There are also corresponding store + // instructions. 
unsigned MemBytes = Src->getPrimitiveSizeInBits(); - if (Opcode == Instruction::Load && ST->hasVSX() && IsAltivecType && + if (ST->hasVSX() && IsAltivecType && (MemBytes == 64 || (ST->hasP8Vector() && MemBytes == 32))) return 1; diff --git a/llvm/test/Analysis/CostModel/PowerPC/load_store.ll b/llvm/test/Analysis/CostModel/PowerPC/load_store.ll index 574a3d40d2734..167cdf9c33303 100644 --- a/llvm/test/Analysis/CostModel/PowerPC/load_store.ll +++ b/llvm/test/Analysis/CostModel/PowerPC/load_store.ll @@ -43,3 +43,22 @@ define i32 @loads(i32 %arg) { ret i32 undef } +define i32 @partialvector32(i32 %arg) #0 { + + ; CHECK: cost of 1 {{.*}} store + store <4 x i8> undef, ptr undef, align 16 + + ret i32 undef +} + +define i32 @partialvector64(i32 %arg) #1 { + + ; CHECK: cost of 1 {{.*}} store + store <4 x i16> undef, ptr undef, align 16 + + ret i32 undef +} + +attributes #0 = { "target-features"="+power8-vector,+vsx" } + +attributes #1 = { "target-features"="+vsx" } From 4ad4c8e90e120687e768475fecb8cc03d422b3a1 Mon Sep 17 00:00:00 2001 From: Juergen Ributzka Date: Tue, 23 Jan 2024 13:14:17 -0800 Subject: [PATCH 692/843] [clang] Use LazyDetector for all toolchains. (#79073) Use the LazyDetector also for the remaining toolchains to avoid unnecessarily checking for the Cuda and Rocm installations. This fixes rdar://121397534. --- clang/lib/Driver/ToolChains/Darwin.cpp | 8 ++++---- clang/lib/Driver/ToolChains/Darwin.h | 5 +++-- clang/lib/Driver/ToolChains/MSVC.cpp | 10 +++++----- clang/lib/Driver/ToolChains/MSVC.h | 5 +++-- clang/lib/Driver/ToolChains/MinGW.cpp | 8 ++++---- clang/lib/Driver/ToolChains/MinGW.h | 5 +++-- 6 files changed, 22 insertions(+), 19 deletions(-) diff --git a/clang/lib/Driver/ToolChains/Darwin.cpp b/clang/lib/Driver/ToolChains/Darwin.cpp index 9050c545cf544..fae8ad1a958ad 100644 --- a/clang/lib/Driver/ToolChains/Darwin.cpp +++ b/clang/lib/Driver/ToolChains/Darwin.cpp @@ -985,12 +985,12 @@ bool Darwin::hasBlocksRuntime() const { void Darwin::AddCudaIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args); + CudaInstallation->AddCudaIncludeArgs(DriverArgs, CC1Args); } void Darwin::AddHIPIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args); + RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args); } // This is just a MachO name translation routine and there's no @@ -3436,6 +3436,6 @@ SanitizerMask Darwin::getSupportedSanitizers() const { } void Darwin::printVerboseInfo(raw_ostream &OS) const { - CudaInstallation.print(OS); - RocmInstallation.print(OS); + CudaInstallation->print(OS); + RocmInstallation->print(OS); } diff --git a/clang/lib/Driver/ToolChains/Darwin.h b/clang/lib/Driver/ToolChains/Darwin.h index fda3b1a3639a8..5e60b0841d6d5 100644 --- a/clang/lib/Driver/ToolChains/Darwin.h +++ b/clang/lib/Driver/ToolChains/Darwin.h @@ -10,6 +10,7 @@ #define LLVM_CLANG_LIB_DRIVER_TOOLCHAINS_DARWIN_H #include "Cuda.h" +#include "LazyDetector.h" #include "ROCm.h" #include "clang/Basic/DarwinSDKInfo.h" #include "clang/Basic/LangOptions.h" @@ -321,8 +322,8 @@ class LLVM_LIBRARY_VISIBILITY Darwin : public MachO { /// The target variant triple that was specified (if any). 
mutable std::optional TargetVariantTriple; - CudaInstallationDetector CudaInstallation; - RocmInstallationDetector RocmInstallation; + LazyDetector CudaInstallation; + LazyDetector RocmInstallation; private: void AddDeploymentTarget(llvm::opt::DerivedArgList &Args) const; diff --git a/clang/lib/Driver/ToolChains/MSVC.cpp b/clang/lib/Driver/ToolChains/MSVC.cpp index 4914604f0a286..396522225158d 100644 --- a/clang/lib/Driver/ToolChains/MSVC.cpp +++ b/clang/lib/Driver/ToolChains/MSVC.cpp @@ -495,24 +495,24 @@ bool MSVCToolChain::isPICDefaultForced() const { void MSVCToolChain::AddCudaIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args); + CudaInstallation->AddCudaIncludeArgs(DriverArgs, CC1Args); } void MSVCToolChain::AddHIPIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args); + RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args); } void MSVCToolChain::AddHIPRuntimeLibArgs(const ArgList &Args, ArgStringList &CmdArgs) const { CmdArgs.append({Args.MakeArgString(StringRef("-libpath:") + - RocmInstallation.getLibPath()), + RocmInstallation->getLibPath()), "amdhip64.lib"}); } void MSVCToolChain::printVerboseInfo(raw_ostream &OS) const { - CudaInstallation.print(OS); - RocmInstallation.print(OS); + CudaInstallation->print(OS); + RocmInstallation->print(OS); } std::string diff --git a/clang/lib/Driver/ToolChains/MSVC.h b/clang/lib/Driver/ToolChains/MSVC.h index fb27c621f2d1f..48369e030aade 100644 --- a/clang/lib/Driver/ToolChains/MSVC.h +++ b/clang/lib/Driver/ToolChains/MSVC.h @@ -11,6 +11,7 @@ #include "AMDGPU.h" #include "Cuda.h" +#include "LazyDetector.h" #include "clang/Driver/Compilation.h" #include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" @@ -136,8 +137,8 @@ class LLVM_LIBRARY_VISIBILITY MSVCToolChain : public ToolChain { std::optional WinSdkDir, WinSdkVersion, WinSysRoot; std::string VCToolChainPath; llvm::ToolsetLayout VSLayout = llvm::ToolsetLayout::OlderVS; - CudaInstallationDetector CudaInstallation; - RocmInstallationDetector RocmInstallation; + LazyDetector CudaInstallation; + LazyDetector RocmInstallation; }; } // end namespace toolchains diff --git a/clang/lib/Driver/ToolChains/MinGW.cpp b/clang/lib/Driver/ToolChains/MinGW.cpp index 18fc9d4b6807e..067758c05e97c 100644 --- a/clang/lib/Driver/ToolChains/MinGW.cpp +++ b/clang/lib/Driver/ToolChains/MinGW.cpp @@ -614,17 +614,17 @@ SanitizerMask toolchains::MinGW::getSupportedSanitizers() const { void toolchains::MinGW::AddCudaIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - CudaInstallation.AddCudaIncludeArgs(DriverArgs, CC1Args); + CudaInstallation->AddCudaIncludeArgs(DriverArgs, CC1Args); } void toolchains::MinGW::AddHIPIncludeArgs(const ArgList &DriverArgs, ArgStringList &CC1Args) const { - RocmInstallation.AddHIPIncludeArgs(DriverArgs, CC1Args); + RocmInstallation->AddHIPIncludeArgs(DriverArgs, CC1Args); } void toolchains::MinGW::printVerboseInfo(raw_ostream &OS) const { - CudaInstallation.print(OS); - RocmInstallation.print(OS); + CudaInstallation->print(OS); + RocmInstallation->print(OS); } // Include directories for various hosts: diff --git a/clang/lib/Driver/ToolChains/MinGW.h b/clang/lib/Driver/ToolChains/MinGW.h index 75d629d5d1cb3..a9963d8d06c29 100644 --- a/clang/lib/Driver/ToolChains/MinGW.h +++ b/clang/lib/Driver/ToolChains/MinGW.h @@ -11,6 +11,7 @@ #include "Cuda.h" #include "Gnu.h" +#include "LazyDetector.h" #include "ROCm.h" 
#include "clang/Driver/Tool.h" #include "clang/Driver/ToolChain.h" @@ -102,8 +103,8 @@ class LLVM_LIBRARY_VISIBILITY MinGW : public ToolChain { Tool *buildAssembler() const override; private: - CudaInstallationDetector CudaInstallation; - RocmInstallationDetector RocmInstallation; + LazyDetector CudaInstallation; + LazyDetector RocmInstallation; std::string Base; std::string GccLibDir; From 3a9ff32354a95305c523ab3b13bf3684854d1327 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Tue, 23 Jan 2024 16:14:50 -0500 Subject: [PATCH 693/843] [libc] Fix implicit conversion in FEnvImpl for arm32 targets. (#79210) --- libc/src/__support/FPUtil/arm/FEnvImpl.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libc/src/__support/FPUtil/arm/FEnvImpl.h b/libc/src/__support/FPUtil/arm/FEnvImpl.h index 1a89de50b6b60..ac4673cf20f63 100644 --- a/libc/src/__support/FPUtil/arm/FEnvImpl.h +++ b/libc/src/__support/FPUtil/arm/FEnvImpl.h @@ -135,8 +135,8 @@ LIBC_INLINE int set_except(int excepts) { LIBC_INLINE int raise_except(int excepts) { float zero = 0.0f; float one = 1.0f; - float large_value = FPBits::max_normal(); - float small_value = FPBits::min_normal(); + float large_value = FPBits::max_normal().get_val(); + float small_value = FPBits::min_normal().get_val(); auto divfunc = [](float a, float b) { __asm__ __volatile__("flds s0, %0\n\t" "flds s1, %1\n\t" From 2b8649fbecdc300cde1032d739484690f75a27ba Mon Sep 17 00:00:00 2001 From: William Junda Huang Date: Tue, 23 Jan 2024 16:19:45 -0500 Subject: [PATCH 694/843] Added feature in llvm-profdata merge to filter functions from the profile (#78378) `--function=` Include functions matching regex in the output `--no-function=` Exclude functions matching regex from the output If both are specified, `--no-function` has a higher precedence if a function name matches both filters --- llvm/docs/CommandGuide/llvm-profdata.rst | 10 +++ llvm/include/llvm/ProfileData/SampleProf.h | 2 + .../tools/llvm-profdata/merge-filter.test | 69 ++++++++++++++++++ llvm/tools/llvm-profdata/llvm-profdata.cpp | 73 ++++++++++++++++++- 4 files changed, 151 insertions(+), 3 deletions(-) create mode 100644 llvm/test/tools/llvm-profdata/merge-filter.test diff --git a/llvm/docs/CommandGuide/llvm-profdata.rst b/llvm/docs/CommandGuide/llvm-profdata.rst index f5e3c13ffbc8e..acf016a6dbcd7 100644 --- a/llvm/docs/CommandGuide/llvm-profdata.rst +++ b/llvm/docs/CommandGuide/llvm-profdata.rst @@ -217,6 +217,16 @@ OPTIONS The maximum number of functions in a single temporal profile trace. Longer traces will be truncated. The default value is 1000. +.. option:: --function= + + Only keep functions matching the regex in the output, all others are erased + from the profile. + +.. option:: --no-function= + + Remove functions matching the regex from the profile. If both --function and + --no-function are specified and a function matches both, it is removed. 
+ EXAMPLES ^^^^^^^^ Basic Usage diff --git a/llvm/include/llvm/ProfileData/SampleProf.h b/llvm/include/llvm/ProfileData/SampleProf.h index 66aaf602d0e1d..8ac84d4b933f2 100644 --- a/llvm/include/llvm/ProfileData/SampleProf.h +++ b/llvm/include/llvm/ProfileData/SampleProf.h @@ -1330,6 +1330,8 @@ class SampleProfileMap } size_t erase(const key_type &Key) { return base_type::erase(Key); } + + iterator erase(iterator It) { return base_type::erase(It); } }; using NameFunctionSamples = std::pair; diff --git a/llvm/test/tools/llvm-profdata/merge-filter.test b/llvm/test/tools/llvm-profdata/merge-filter.test new file mode 100644 index 0000000000000..5c47c6a75a7c4 --- /dev/null +++ b/llvm/test/tools/llvm-profdata/merge-filter.test @@ -0,0 +1,69 @@ +Test llvm-profdata merge with function filters. + +RUN: llvm-profdata merge --sample %p/Inputs/sample-profile.proftext --text --function="_Z3.*" | FileCheck %s --check-prefix=CHECK-FILTER1 +RUN: llvm-profdata merge --sample %p/Inputs/sample-profile.proftext --text --no-function="main" | FileCheck %s --check-prefix=CHECK-FILTER1 +CHECK-FILTER1: _Z3bari:20301:1437 +CHECK-NEXT: 1: 1437 +CHECK-NEXT: _Z3fooi:7711:610 +CHECK-NEXT: 1: 610 +CHECK-NOT: main + +RUN: llvm-profdata merge --sample %p/Inputs/sample-profile.proftext --text --function="_Z3.*" --no-function="fooi$" | FileCheck %s --check-prefix=CHECK-FILTER2 +CHECK-FILTER2: _Z3bari:20301:1437 +CHECK-NEXT: 1: 1437 +CHECK-NOT: main +CHECK-NOT: _Z3fooi + +RUN: llvm-profdata merge --sample --extbinary --use-md5 -output=%t.0.profdata %p/Inputs/sample-profile.proftext +RUN: llvm-profdata merge --sample %t.0.profdata --text --function="_Z3fooi" | FileCheck %s --check-prefix=CHECK-FILTER-MD5 +CHECK-FILTER-MD5: 1228452328526475178:7711:610 +CHECK-NEXT: 1: 610 +CHECK-NOT: 15822663052811949562 +CHECK-NOT: 3727899762981752933 + +RUN: llvm-profdata merge --instr %p/Inputs/basic.proftext --text --function="foo" | FileCheck %s --check-prefix=CHECK-FILTER3 +RUN: llvm-profdata merge --instr %p/Inputs/basic.proftext --text --no-function="main" | FileCheck %s --check-prefix=CHECK-FILTER3 +CHECK-FILTER3: foo +CHECK-NEXT: # Func Hash: +CHECK-NEXT: 10 +CHECK-NEXT: # Num Counters: +CHECK-NEXT: 2 +CHECK-NEXT: # Counter Values: +CHECK-NEXT: 499500 +CHECK-NEXT: 179900 +CHECK-NEXT: +CHECK-NEXT: foo2 +CHECK-NEXT: # Func Hash: +CHECK-NEXT: 10 +CHECK-NEXT: # Num Counters: +CHECK-NEXT: 2 +CHECK-NEXT: # Counter Values: +CHECK-NEXT: 500500 +CHECK-NEXT: 180100 + +RUN: llvm-profdata merge --instr %p/Inputs/basic.proftext --text --function="foo" --no-function="^foo$" | FileCheck %s --check-prefix=CHECK-FILTER4 +CHECK-FILTER4: foo2 +CHECK-NEXT: # Func Hash: +CHECK-NEXT: 10 +CHECK-NEXT: # Num Counters: +CHECK-NEXT: 2 +CHECK-NEXT: # Counter Values: +CHECK-NEXT: 500500 +CHECK-NEXT: 180100 + +RUN: llvm-profdata merge --sample %p/Inputs/cs-sample.proftext --text --function="main.*@.*_Z5funcBi" | FileCheck %s --check-prefix=CHECK-FILTER5 +CHECK-FILTER5: [main:3.1 @ _Z5funcBi:1 @ _Z8funcLeafi]:500853:20 +CHECK-NEXT: 0: 15 +CHECK-NEXT: 1: 15 +CHECK-NEXT: 3: 74946 +CHECK-NEXT: 4: 74941 _Z3fibi:82359 +CHECK-NEXT: 10: 23324 +CHECK-NEXT: 11: 23327 _Z3fibi:25228 +CHECK-NEXT: 15: 11 +CHECK-NEXT: !Attributes: 1 +CHECK-NEXT: [main:3.1 @ _Z5funcBi]:120:19 +CHECK-NEXT: 0: 19 +CHECK-NEXT: 1: 19 _Z8funcLeafi:20 +CHECK-NEXT: 3: 12 +CHECK-NEXT: !Attributes: 1 + diff --git a/llvm/tools/llvm-profdata/llvm-profdata.cpp b/llvm/tools/llvm-profdata/llvm-profdata.cpp index e6dc81ba1f5b0..239aa1c93a2c7 100644 --- a/llvm/tools/llvm-profdata/llvm-profdata.cpp +++ 
b/llvm/tools/llvm-profdata/llvm-profdata.cpp @@ -34,6 +34,7 @@ #include "llvm/Support/MD5.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" +#include "llvm/Support/Regex.h" #include "llvm/Support/ThreadPool.h" #include "llvm/Support/Threading.h" #include "llvm/Support/VirtualFileSystem.h" @@ -131,9 +132,11 @@ cl::opt cl::sub(MergeSubcommand)); cl::opt FuncNameFilter( "function", - cl::desc("Details for matching functions. For overlapping CSSPGO, this " - "takes a function name with calling context."), - cl::sub(ShowSubcommand), cl::sub(OverlapSubcommand)); + cl::desc("Only functions matching the filter are shown in the output. For " + "overlapping CSSPGO, this takes a function name with calling " + "context."), + cl::sub(ShowSubcommand), cl::sub(OverlapSubcommand), + cl::sub(MergeSubcommand)); // TODO: Consider creating a template class (e.g., MergeOption, ShowOption) to // factor out the common cl::sub in cl::opt constructor for subcommand-specific @@ -243,6 +246,10 @@ cl::opt TemporalProfMaxTraceLength( cl::sub(MergeSubcommand), cl::desc("The maximum length of a single temporal profile trace " "(default: 10000)")); +cl::opt FuncNameNegativeFilter( + "no-function", cl::init(""), + cl::sub(MergeSubcommand), + cl::desc("Exclude functions matching the filter from the output.")); cl::opt FailMode("failure-mode", cl::init(failIfAnyAreInvalid), @@ -759,6 +766,62 @@ static void mergeWriterContexts(WriterContext *Dst, WriterContext *Src) { }); } +static StringRef +getFuncName(const StringMap::value_type &Val) { + return Val.first(); +} + +static std::string +getFuncName(const SampleProfileMap::value_type &Val) { + return Val.second.getContext().toString(); +} + +template +static void filterFunctions(T &ProfileMap) { + bool hasFilter = !FuncNameFilter.empty(); + bool hasNegativeFilter = !FuncNameNegativeFilter.empty(); + if (!hasFilter && !hasNegativeFilter) + return; + + // If filter starts with '?' it is MSVC mangled name, not a regex. + llvm::Regex ProbablyMSVCMangledName("[?@$_0-9A-Za-z]+"); + if (hasFilter && FuncNameFilter[0] == '?' && + ProbablyMSVCMangledName.match(FuncNameFilter)) + FuncNameFilter = llvm::Regex::escape(FuncNameFilter); + if (hasNegativeFilter && FuncNameNegativeFilter[0] == '?' && + ProbablyMSVCMangledName.match(FuncNameNegativeFilter)) + FuncNameNegativeFilter = llvm::Regex::escape(FuncNameNegativeFilter); + + size_t Count = ProfileMap.size(); + llvm::Regex Pattern(FuncNameFilter); + llvm::Regex NegativePattern(FuncNameNegativeFilter); + std::string Error; + if (hasFilter && !Pattern.isValid(Error)) + exitWithError(Error); + if (hasNegativeFilter && !NegativePattern.isValid(Error)) + exitWithError(Error); + + // Handle MD5 profile, so it is still able to match using the original name. + std::string MD5Name = std::to_string(llvm::MD5Hash(FuncNameFilter)); + std::string NegativeMD5Name = + std::to_string(llvm::MD5Hash(FuncNameNegativeFilter)); + + for (auto I = ProfileMap.begin(); I != ProfileMap.end();) { + auto Tmp = I++; + const auto &FuncName = getFuncName(*Tmp); + // Negative filter has higher precedence than positive filter. 
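+    // For example, with --function="_Z3.*" and --no-function="fooi$",
+    // "_Z3fooi" matches both patterns and is still erased, while
+    // "_Z3bari" matches only the positive filter and is kept.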
+ if ((hasNegativeFilter && + (NegativePattern.match(FuncName) || + (FunctionSamples::UseMD5 && NegativeMD5Name == FuncName))) || + (hasFilter && !(Pattern.match(FuncName) || + (FunctionSamples::UseMD5 && MD5Name == FuncName)))) + ProfileMap.erase(Tmp); + } + + llvm::dbgs() << Count - ProfileMap.size() << " of " << Count << " functions " + << "in the original profile are filtered.\n"; +} + static void writeInstrProfile(StringRef OutputFilename, ProfileFormat OutputFormat, InstrProfWriter &Writer) { @@ -878,6 +941,8 @@ static void mergeInstrProfile(const WeightedFileVector &Inputs, (NumErrors > 0 && FailMode == failIfAnyAreInvalid)) exitWithError("no profile can be merged"); + filterFunctions(Contexts[0]->Writer.getProfileData()); + writeInstrProfile(OutputFilename, OutputFormat, Contexts[0]->Writer); } @@ -1459,6 +1524,8 @@ static void mergeSampleProfile(const WeightedFileVector &Inputs, ProfileIsCS = FunctionSamples::ProfileIsCS = false; } + filterFunctions(ProfileMap); + auto WriterOrErr = SampleProfileWriter::create(OutputFilename, FormatMap[OutputFormat]); if (std::error_code EC = WriterOrErr.getError()) From 9261ab708e37c2d6499ac063045f816d25a5919c Mon Sep 17 00:00:00 2001 From: Kunwar Grover Date: Wed, 24 Jan 2024 03:00:34 +0530 Subject: [PATCH 695/843] [mlir][Target] Teach dense_resource conversion to LLVMIR Target (#78958) This patch adds support for translating dense_resource attributes to LLVMIR Target. The support added is similar to how DenseElementsAttr is handled, except we don't need to handle splats. Another possible way of doing this is adding iteration on dense_resource, but that is non-trivial as DenseResourceAttr is not meant to be something you should directly access. It has subclasses which you are supposed to use to iterate on it. --- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 99 ++++++++++++++++++++ mlir/test/Target/LLVMIR/llvmir-invalid.mlir | 52 ++++++++++ mlir/test/Target/LLVMIR/llvmir.mlir | 23 +++++ 3 files changed, 174 insertions(+) diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 1499a71c7c9a7..69a1cbe5969e8 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -27,6 +27,7 @@ #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinOps.h" #include "mlir/IR/BuiltinTypes.h" +#include "mlir/IR/DialectResourceBlobManager.h" #include "mlir/IR/RegionGraphTraits.h" #include "mlir/Support/LLVM.h" #include "mlir/Support/LogicalResult.h" @@ -446,6 +447,99 @@ convertDenseElementsAttr(Location loc, DenseElementsAttr denseElementsAttr, return buildSequentialConstant(constantsRef, outerShape, llvmType, loc); } +/// Convert a dense resource elements attribute to an LLVM IR constant using its +/// raw data storage if possible. This supports elements attributes of tensor or +/// vector type and avoids constructing separate objects for individual values +/// of the innermost dimension. Constants for other dimensions are still +/// constructed recursively. Returns nullptr on failure and emits errors at +/// `loc`. 
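+/// For example, a dense_resource constant of type tensor<1x2x2xf32> is
+/// lowered to a [1 x [2 x [2 x float]]] aggregate whose innermost arrays are
+/// built directly from the blob's raw bytes (see the llvmir.mlir test in
+/// this patch).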
+static llvm::Constant *convertDenseResourceElementsAttr( + Location loc, DenseResourceElementsAttr denseResourceAttr, + llvm::Type *llvmType, const ModuleTranslation &moduleTranslation) { + assert(denseResourceAttr && "expected non-null attribute"); + + llvm::Type *innermostLLVMType = getInnermostElementType(llvmType); + if (!llvm::ConstantDataSequential::isElementTypeCompatible( + innermostLLVMType)) { + emitError(loc, "no known conversion for innermost element type"); + return nullptr; + } + + ShapedType type = denseResourceAttr.getType(); + assert(type.getNumElements() > 0 && "Expected non-empty elements attribute"); + + AsmResourceBlob *blob = denseResourceAttr.getRawHandle().getBlob(); + if (!blob) { + emitError(loc, "resource does not exist"); + return nullptr; + } + + ArrayRef rawData = blob->getData(); + + // Check that the raw data size matches what is expected for the scalar size. + // TODO: in theory, we could repack the data here to keep constructing from + // raw data. + // TODO: we may also need to consider endianness when cross-compiling to an + // architecture where it is different. + int64_t numElements = denseResourceAttr.getType().getNumElements(); + int64_t elementByteSize = rawData.size() / numElements; + if (8 * elementByteSize != innermostLLVMType->getScalarSizeInBits()) { + emitError(loc, "raw data size does not match element type size"); + return nullptr; + } + + // Compute the shape of all dimensions but the innermost. Note that the + // innermost dimension may be that of the vector element type. + bool hasVectorElementType = isa(type.getElementType()); + int64_t numAggregates = + numElements / (hasVectorElementType + ? 1 + : denseResourceAttr.getType().getShape().back()); + ArrayRef outerShape = type.getShape(); + if (!hasVectorElementType) + outerShape = outerShape.drop_back(); + + // Create a constructor for the innermost constant from a piece of raw data. + std::function buildCstData; + if (isa(type)) { + auto vectorElementType = dyn_cast(type.getElementType()); + if (vectorElementType && vectorElementType.getRank() == 1) { + buildCstData = [&](StringRef data) { + return llvm::ConstantDataVector::getRaw( + data, vectorElementType.getShape().back(), innermostLLVMType); + }; + } else if (!vectorElementType) { + buildCstData = [&](StringRef data) { + return llvm::ConstantDataArray::getRaw(data, type.getShape().back(), + innermostLLVMType); + }; + } + } else if (isa(type)) { + buildCstData = [&](StringRef data) { + return llvm::ConstantDataVector::getRaw(data, type.getShape().back(), + innermostLLVMType); + }; + } + if (!buildCstData) { + emitError(loc, "unsupported dense_resource type"); + return nullptr; + } + + // Create innermost constants and defer to the default constant creation + // mechanism for other dimensions. + SmallVector constants; + int64_t aggregateSize = denseResourceAttr.getType().getShape().back() * + (innermostLLVMType->getScalarSizeInBits() / 8); + constants.reserve(numAggregates); + for (unsigned i = 0; i < numAggregates; ++i) { + StringRef data(rawData.data() + i * aggregateSize, aggregateSize); + constants.push_back(buildCstData(data)); + } + + ArrayRef constantsRef = constants; + return buildSequentialConstant(constantsRef, outerShape, llvmType, loc); +} + /// Create an LLVM IR constant of `llvmType` from the MLIR attribute `attr`. /// This currently supports integer, floating point, splat and dense element /// attributes and combinations thereof. 
Also, an array attribute with two @@ -546,6 +640,11 @@ llvm::Constant *mlir::LLVM::detail::getLLVMConstant( return result; } + if (auto denseResourceAttr = dyn_cast(attr)) { + return convertDenseResourceElementsAttr(loc, denseResourceAttr, llvmType, + moduleTranslation); + } + // Fall back to element-by-element construction otherwise. if (auto elementsAttr = dyn_cast(attr)) { assert(elementsAttr.getShapedType().hasStaticShape()); diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir index 38601c863b982..1b685d3783002 100644 --- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir @@ -313,3 +313,55 @@ llvm.func @foo() { // expected-error @below{{must appear at the module level}} llvm.linker_options ["test"] } + +// ----- + +module @does_not_exist { + // expected-error @below{{resource does not exist}} + llvm.mlir.global internal constant @constant(dense_resource : tensor<4xf32>) : !llvm.array<4 x f32> +} + +// ----- + +module @raw_data_does_not_match_element_type_size { + // expected-error @below{{raw data size does not match element type size}} + llvm.mlir.global internal constant @constant(dense_resource : tensor<5xf32>) : !llvm.array<4 x f32> +} + +{-# + dialect_resources: { + builtin: { + test1: "0x0800000054A3B53ED6C0B33E55D1A2BDE5D2BB3E" + } + } +#-} + +// ----- + +module @does_not_exist { + // expected-error @below{{unsupported dense_resource type}} + llvm.mlir.global internal constant @constant(dense_resource : memref<4xf32>) : !llvm.array<4 x f32> +} + +{-# + dialect_resources: { + builtin: { + test1: "0x0800000054A3B53ED6C0B33E55D1A2BDE5D2BB3E" + } + } +#-} + +// ----- + +module @no_known_conversion_innermost_eltype { + // expected-error @below{{no known conversion for innermost element type}} + llvm.mlir.global internal constant @constant(dense_resource : tensor<4xi4>) : !llvm.array<4 x i4> +} + +{-# + dialect_resources: { + builtin: { + test1: "0x0800000054A3B53ED6C0B33E55D1A2BDE5D2BB3E" + } + } +#-} diff --git a/mlir/test/Target/LLVMIR/llvmir.mlir b/mlir/test/Target/LLVMIR/llvmir.mlir index 961c948444684..448aa3a5d85d7 100644 --- a/mlir/test/Target/LLVMIR/llvmir.mlir +++ b/mlir/test/Target/LLVMIR/llvmir.mlir @@ -101,6 +101,19 @@ llvm.mlir.global internal @dense_float_vector_3d(dense<[[[1.0, 2.0], [3.0, 4.0]] // CHECK{LITERAL}: @splat_float_vector_3d = internal global [2 x [2 x <2 x float>]] [[2 x <2 x float>] [<2 x float> , <2 x float> ], [2 x <2 x float>] [<2 x float> , <2 x float> ]] llvm.mlir.global internal @splat_float_vector_3d(dense<42.0> : vector<2x2x2xf32>) : !llvm.array<2 x !llvm.array<2 x vector<2xf32>>> +// CHECK{LITERAL}: @dense_resource_tensor_constant = internal constant [5 x float] [float 0x3FCA034080000000, float 0xBFD0466300000000, float 0xBFD75DDF80000000, float 0xBFDE074F40000000, float 0x3FDDD3A1C0000000] +llvm.mlir.global internal constant @dense_resource_tensor_constant(dense_resource : tensor<5xf32>) : !llvm.array<5 x f32> + +// CHECK{LITERAL}: @dense_resource_vector_constant = internal constant <5 x float> +llvm.mlir.global internal constant @dense_resource_vector_constant(dense_resource : vector<5xf32>) : vector<5xf32> + + +// CHECK{LITERAL}: @dense_resource_multidim_tensor_constant = internal constant [1 x [2 x [2 x float]]] [[2 x [2 x float]] [[2 x float] [float 0x3FD6B46A80000000, float 0x3FD6781AC0000000], [2 x float] [float 0xBFB45A2AA0000000, float 0x3FD77A5CA0000000]]] +llvm.mlir.global internal constant 
@dense_resource_multidim_tensor_constant(dense_resource : tensor<1x2x2xf32>) : !llvm.array<1 x !llvm.array<2 x !llvm.array<2 x f32>>> + +// CHECK{LITERAL}: @dense_resource_multidim_vector_constant = internal constant [1 x [2 x <2 x float>]] [[2 x <2 x float>] [<2 x float> , <2 x float> ]] +llvm.mlir.global internal constant @dense_resource_multidim_vector_constant(dense_resource : vector<1x2x2xf32>) : !llvm.array<1 x !llvm.array<2 x vector<2 x f32>>> + // // Linkage attribute. // @@ -1577,6 +1590,16 @@ llvm.func @invokeLandingpad() -> i32 attributes { personality = @__gxx_personali llvm.invoke %9(%6, %0) to ^bb2 unwind ^bb1 vararg(!llvm.func) : !llvm.ptr, (!llvm.ptr, i32) -> () } +// Resources are kept at end of file. New tests should be added above this. +{-# + dialect_resources: { + builtin: { + dense_resource_test_5xf32: "0x08000000041A503E183382BEFCEEBABE7A3AF0BE0E9DEE3E", + dense_resource_test_2x2xf32: "0x0800000054A3B53ED6C0B33E55D1A2BDE5D2BB3E" + } + } +#-} + // ----- llvm.func @foo() -> i8 From 6c98c5bd99b4c71a7895337cd4e437e023c2ec7a Mon Sep 17 00:00:00 2001 From: Alex Langford Date: Tue, 23 Jan 2024 13:41:36 -0800 Subject: [PATCH 696/843] [lldb][NFCI] Remove unused method BreakpointIDList::AddBreakpointID(const char *) (#79189) This overload is completely unused. --- lldb/include/lldb/Breakpoint/BreakpointIDList.h | 2 -- lldb/source/Breakpoint/BreakpointIDList.cpp | 9 --------- 2 files changed, 11 deletions(-) diff --git a/lldb/include/lldb/Breakpoint/BreakpointIDList.h b/lldb/include/lldb/Breakpoint/BreakpointIDList.h index 3cda1860dc167..6c57d9bc50795 100644 --- a/lldb/include/lldb/Breakpoint/BreakpointIDList.h +++ b/lldb/include/lldb/Breakpoint/BreakpointIDList.h @@ -42,8 +42,6 @@ class BreakpointIDList { bool AddBreakpointID(BreakpointID bp_id); - bool AddBreakpointID(const char *bp_id); - // TODO: This should take a const BreakpointID. bool FindBreakpointID(BreakpointID &bp_id, size_t *position) const; diff --git a/lldb/source/Breakpoint/BreakpointIDList.cpp b/lldb/source/Breakpoint/BreakpointIDList.cpp index 51185c30dabad..5ab2c9a8dc386 100644 --- a/lldb/source/Breakpoint/BreakpointIDList.cpp +++ b/lldb/source/Breakpoint/BreakpointIDList.cpp @@ -48,15 +48,6 @@ bool BreakpointIDList::AddBreakpointID(BreakpointID bp_id) { // return true. 
} -bool BreakpointIDList::AddBreakpointID(const char *bp_id_str) { - auto bp_id = BreakpointID::ParseCanonicalReference(bp_id_str); - if (!bp_id) - return false; - - m_breakpoint_ids.push_back(*bp_id); - return true; -} - bool BreakpointIDList::FindBreakpointID(BreakpointID &bp_id, size_t *position) const { for (size_t i = 0; i < m_breakpoint_ids.size(); ++i) { From 2227e50c618ecc6f83e8c69286a418ee04750361 Mon Sep 17 00:00:00 2001 From: Billy Laws Date: Tue, 23 Jan 2024 21:57:28 +0000 Subject: [PATCH 697/843] [Preprocessor][test] Test ARM64EC definitions (#78916) --- clang/test/Preprocessor/init-aarch64.c | 370 +++++++++++++++++++++++++ 1 file changed, 370 insertions(+) diff --git a/clang/test/Preprocessor/init-aarch64.c b/clang/test/Preprocessor/init-aarch64.c index 94c091a69e844..f1f1bbbf66945 100644 --- a/clang/test/Preprocessor/init-aarch64.c +++ b/clang/test/Preprocessor/init-aarch64.c @@ -728,6 +728,376 @@ // AARCH64-MSVC: #define __WINT_WIDTH__ 16 // AARCH64-MSVC: #define __aarch64__ 1 +// RUN: %clang_cc1 -E -dM -ffreestanding -triple=arm64ec-windows-msvc < /dev/null | FileCheck -match-full-lines -check-prefix ARM64EC-MSVC %s + +// ARM64EC-MSVC: #define _INTEGRAL_MAX_BITS 64 +// ARM64EC-MSVC: #define _M_AMD64 100 +// ARM64EC-MSVC: #define _M_ARM64EC 1 +// ARM64EC-MSVC: #define _M_X64 100 +// ARM64EC-MSVC: #define _WIN32 1 +// ARM64EC-MSVC: #define _WIN64 1 +// ARM64EC-MSVC: #define __AARCH64EL__ 1 +// ARM64EC-MSVC: #define __AARCH64_CMODEL_SMALL__ 1 +// ARM64EC-MSVC: #define __ARM_64BIT_STATE 1 +// ARM64EC-MSVC: #define __ARM_ACLE 200 +// ARM64EC-MSVC: #define __ARM_ALIGN_MAX_STACK_PWR 4 +// ARM64EC-MSVC: #define __ARM_ARCH 8 +// ARM64EC-MSVC: #define __ARM_ARCH_ISA_A64 1 +// ARM64EC-MSVC: #define __ARM_ARCH_PROFILE 'A' +// ARM64EC-MSVC: #define __ARM_FEATURE_CLZ 1 +// ARM64EC-MSVC: #define __ARM_FEATURE_DIRECTED_ROUNDING 1 +// ARM64EC-MSVC: #define __ARM_FEATURE_DIV 1 +// ARM64EC-MSVC: #define __ARM_FEATURE_FMA 1 +// ARM64EC-MSVC: #define __ARM_FEATURE_IDIV 1 +// ARM64EC-MSVC: #define __ARM_FEATURE_LDREX 0xF +// ARM64EC-MSVC: #define __ARM_FEATURE_NUMERIC_MAXMIN 1 +// ARM64EC-MSVC: #define __ARM_FEATURE_UNALIGNED 1 +// ARM64EC-MSVC: #define __ARM_FP 0xE +// ARM64EC-MSVC: #define __ARM_FP16_ARGS 1 +// ARM64EC-MSVC: #define __ARM_FP16_FORMAT_IEEE 1 +// ARM64EC-MSVC: #define __ARM_PCS_AAPCS64 1 +// ARM64EC-MSVC: #define __ARM_SIZEOF_MINIMAL_ENUM 4 +// ARM64EC-MSVC: #define __ARM_SIZEOF_WCHAR_T 4 +// ARM64EC-MSVC: #define __ATOMIC_ACQUIRE 2 +// ARM64EC-MSVC: #define __ATOMIC_ACQ_REL 4 +// ARM64EC-MSVC: #define __ATOMIC_CONSUME 1 +// ARM64EC-MSVC: #define __ATOMIC_RELAXED 0 +// ARM64EC-MSVC: #define __ATOMIC_RELEASE 3 +// ARM64EC-MSVC: #define __ATOMIC_SEQ_CST 5 +// ARM64EC-MSVC: #define __BIGGEST_ALIGNMENT__ 16 +// ARM64EC-MSVC: #define __BITINT_MAXWIDTH__ 128 +// ARM64EC-MSVC: #define __BOOL_WIDTH__ 8 +// ARM64EC-MSVC: #define __BYTE_ORDER__ __ORDER_LITTLE_ENDIAN__ +// ARM64EC-MSVC: #define __CHAR16_TYPE__ unsigned short +// ARM64EC-MSVC: #define __CHAR32_TYPE__ unsigned int +// ARM64EC-MSVC: #define __CHAR_BIT__ 8 +// ARM64EC-MSVC: #define __CLANG_ATOMIC_BOOL_LOCK_FREE 2 +// ARM64EC-MSVC: #define __CLANG_ATOMIC_CHAR16_T_LOCK_FREE 2 +// ARM64EC-MSVC: #define __CLANG_ATOMIC_CHAR32_T_LOCK_FREE 2 +// ARM64EC-MSVC: #define __CLANG_ATOMIC_CHAR_LOCK_FREE 2 +// ARM64EC-MSVC: #define __CLANG_ATOMIC_INT_LOCK_FREE 2 +// ARM64EC-MSVC: #define __CLANG_ATOMIC_LLONG_LOCK_FREE 2 +// ARM64EC-MSVC: #define __CLANG_ATOMIC_LONG_LOCK_FREE 2 +// ARM64EC-MSVC: #define __CLANG_ATOMIC_POINTER_LOCK_FREE 2 +// 
ARM64EC-MSVC: #define __CLANG_ATOMIC_SHORT_LOCK_FREE 2 +// ARM64EC-MSVC: #define __CLANG_ATOMIC_WCHAR_T_LOCK_FREE 2 +// ARM64EC-MSVC: #define __CONSTANT_CFSTRINGS__ 1 +// ARM64EC-MSVC: #define __DBL_DECIMAL_DIG__ 17 +// ARM64EC-MSVC: #define __DBL_DENORM_MIN__ 4.9406564584124654e-324 +// ARM64EC-MSVC: #define __DBL_DIG__ 15 +// ARM64EC-MSVC: #define __DBL_EPSILON__ 2.2204460492503131e-16 +// ARM64EC-MSVC: #define __DBL_HAS_DENORM__ 1 +// ARM64EC-MSVC: #define __DBL_HAS_INFINITY__ 1 +// ARM64EC-MSVC: #define __DBL_HAS_QUIET_NAN__ 1 +// ARM64EC-MSVC: #define __DBL_MANT_DIG__ 53 +// ARM64EC-MSVC: #define __DBL_MAX_10_EXP__ 308 +// ARM64EC-MSVC: #define __DBL_MAX_EXP__ 1024 +// ARM64EC-MSVC: #define __DBL_MAX__ 1.7976931348623157e+308 +// ARM64EC-MSVC: #define __DBL_MIN_10_EXP__ (-307) +// ARM64EC-MSVC: #define __DBL_MIN_EXP__ (-1021) +// ARM64EC-MSVC: #define __DBL_MIN__ 2.2250738585072014e-308 +// ARM64EC-MSVC: #define __DECIMAL_DIG__ __LDBL_DECIMAL_DIG__ +// ARM64EC-MSVC: #define __FINITE_MATH_ONLY__ 0 +// ARM64EC-MSVC: #define __FLT16_DECIMAL_DIG__ 5 +// ARM64EC-MSVC: #define __FLT16_DENORM_MIN__ 5.9604644775390625e-8F16 +// ARM64EC-MSVC: #define __FLT16_DIG__ 3 +// ARM64EC-MSVC: #define __FLT16_EPSILON__ 9.765625e-4F16 +// ARM64EC-MSVC: #define __FLT16_HAS_DENORM__ 1 +// ARM64EC-MSVC: #define __FLT16_HAS_INFINITY__ 1 +// ARM64EC-MSVC: #define __FLT16_HAS_QUIET_NAN__ 1 +// ARM64EC-MSVC: #define __FLT16_MANT_DIG__ 11 +// ARM64EC-MSVC: #define __FLT16_MAX_10_EXP__ 4 +// ARM64EC-MSVC: #define __FLT16_MAX_EXP__ 16 +// ARM64EC-MSVC: #define __FLT16_MAX__ 6.5504e+4F16 +// ARM64EC-MSVC: #define __FLT16_MIN_10_EXP__ (-4) +// ARM64EC-MSVC: #define __FLT16_MIN_EXP__ (-13) +// ARM64EC-MSVC: #define __FLT16_MIN__ 6.103515625e-5F16 +// ARM64EC-MSVC: #define __FLT_DECIMAL_DIG__ 9 +// ARM64EC-MSVC: #define __FLT_DENORM_MIN__ 1.40129846e-45F +// ARM64EC-MSVC: #define __FLT_DIG__ 6 +// ARM64EC-MSVC: #define __FLT_EPSILON__ 1.19209290e-7F +// ARM64EC-MSVC: #define __FLT_HAS_DENORM__ 1 +// ARM64EC-MSVC: #define __FLT_HAS_INFINITY__ 1 +// ARM64EC-MSVC: #define __FLT_HAS_QUIET_NAN__ 1 +// ARM64EC-MSVC: #define __FLT_MANT_DIG__ 24 +// ARM64EC-MSVC: #define __FLT_MAX_10_EXP__ 38 +// ARM64EC-MSVC: #define __FLT_MAX_EXP__ 128 +// ARM64EC-MSVC: #define __FLT_MAX__ 3.40282347e+38F +// ARM64EC-MSVC: #define __FLT_MIN_10_EXP__ (-37) +// ARM64EC-MSVC: #define __FLT_MIN_EXP__ (-125) +// ARM64EC-MSVC: #define __FLT_MIN__ 1.17549435e-38F +// ARM64EC-MSVC: #define __FLT_RADIX__ 2 +// ARM64EC-MSVC: #define __FPCLASS_NEGINF 0x0004 +// ARM64EC-MSVC: #define __FPCLASS_NEGNORMAL 0x0008 +// ARM64EC-MSVC: #define __FPCLASS_NEGSUBNORMAL 0x0010 +// ARM64EC-MSVC: #define __FPCLASS_NEGZERO 0x0020 +// ARM64EC-MSVC: #define __FPCLASS_POSINF 0x0200 +// ARM64EC-MSVC: #define __FPCLASS_POSNORMAL 0x0100 +// ARM64EC-MSVC: #define __FPCLASS_POSSUBNORMAL 0x0080 +// ARM64EC-MSVC: #define __FPCLASS_POSZERO 0x0040 +// ARM64EC-MSVC: #define __FPCLASS_QNAN 0x0002 +// ARM64EC-MSVC: #define __FPCLASS_SNAN 0x0001 +// ARM64EC-MSVC: #define __FP_FAST_FMA 1 +// ARM64EC-MSVC: #define __FP_FAST_FMAF 1 +// ARM64EC-MSVC: #define __GCC_ASM_FLAG_OUTPUTS__ 1 +// ARM64EC-MSVC: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_1 1 +// ARM64EC-MSVC: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_16 1 +// ARM64EC-MSVC: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_2 1 +// ARM64EC-MSVC: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_4 1 +// ARM64EC-MSVC: #define __GCC_HAVE_SYNC_COMPARE_AND_SWAP_8 1 +// ARM64EC-MSVC: #define __HAVE_FUNCTION_MULTI_VERSIONING 1 +// ARM64EC-MSVC: #define 
__INT16_C_SUFFIX__ +// ARM64EC-MSVC: #define __INT16_FMTd__ "hd" +// ARM64EC-MSVC: #define __INT16_FMTi__ "hi" +// ARM64EC-MSVC: #define __INT16_MAX__ 32767 +// ARM64EC-MSVC: #define __INT16_TYPE__ short +// ARM64EC-MSVC: #define __INT32_C_SUFFIX__ +// ARM64EC-MSVC: #define __INT32_FMTd__ "d" +// ARM64EC-MSVC: #define __INT32_FMTi__ "i" +// ARM64EC-MSVC: #define __INT32_MAX__ 2147483647 +// ARM64EC-MSVC: #define __INT32_TYPE__ int +// ARM64EC-MSVC: #define __INT64_C_SUFFIX__ LL +// ARM64EC-MSVC: #define __INT64_FMTd__ "lld" +// ARM64EC-MSVC: #define __INT64_FMTi__ "lli" +// ARM64EC-MSVC: #define __INT64_MAX__ 9223372036854775807LL +// ARM64EC-MSVC: #define __INT64_TYPE__ long long int +// ARM64EC-MSVC: #define __INT8_C_SUFFIX__ +// ARM64EC-MSVC: #define __INT8_FMTd__ "hhd" +// ARM64EC-MSVC: #define __INT8_FMTi__ "hhi" +// ARM64EC-MSVC: #define __INT8_MAX__ 127 +// ARM64EC-MSVC: #define __INT8_TYPE__ signed char +// ARM64EC-MSVC: #define __INTMAX_C_SUFFIX__ LL +// ARM64EC-MSVC: #define __INTMAX_FMTd__ "lld" +// ARM64EC-MSVC: #define __INTMAX_FMTi__ "lli" +// ARM64EC-MSVC: #define __INTMAX_MAX__ 9223372036854775807LL +// ARM64EC-MSVC: #define __INTMAX_TYPE__ long long int +// ARM64EC-MSVC: #define __INTMAX_WIDTH__ 64 +// ARM64EC-MSVC: #define __INTPTR_FMTd__ "lld" +// ARM64EC-MSVC: #define __INTPTR_FMTi__ "lli" +// ARM64EC-MSVC: #define __INTPTR_MAX__ 9223372036854775807LL +// ARM64EC-MSVC: #define __INTPTR_TYPE__ long long int +// ARM64EC-MSVC: #define __INTPTR_WIDTH__ 64 +// ARM64EC-MSVC: #define __INT_FAST16_FMTd__ "hd" +// ARM64EC-MSVC: #define __INT_FAST16_FMTi__ "hi" +// ARM64EC-MSVC: #define __INT_FAST16_MAX__ 32767 +// ARM64EC-MSVC: #define __INT_FAST16_TYPE__ short +// ARM64EC-MSVC: #define __INT_FAST16_WIDTH__ 16 +// ARM64EC-MSVC: #define __INT_FAST32_FMTd__ "d" +// ARM64EC-MSVC: #define __INT_FAST32_FMTi__ "i" +// ARM64EC-MSVC: #define __INT_FAST32_MAX__ 2147483647 +// ARM64EC-MSVC: #define __INT_FAST32_TYPE__ int +// ARM64EC-MSVC: #define __INT_FAST32_WIDTH__ 32 +// ARM64EC-MSVC: #define __INT_FAST64_FMTd__ "lld" +// ARM64EC-MSVC: #define __INT_FAST64_FMTi__ "lli" +// ARM64EC-MSVC: #define __INT_FAST64_MAX__ 9223372036854775807LL +// ARM64EC-MSVC: #define __INT_FAST64_TYPE__ long long int +// ARM64EC-MSVC: #define __INT_FAST64_WIDTH__ 64 +// ARM64EC-MSVC: #define __INT_FAST8_FMTd__ "hhd" +// ARM64EC-MSVC: #define __INT_FAST8_FMTi__ "hhi" +// ARM64EC-MSVC: #define __INT_FAST8_MAX__ 127 +// ARM64EC-MSVC: #define __INT_FAST8_TYPE__ signed char +// ARM64EC-MSVC: #define __INT_FAST8_WIDTH__ 8 +// ARM64EC-MSVC: #define __INT_LEAST16_FMTd__ "hd" +// ARM64EC-MSVC: #define __INT_LEAST16_FMTi__ "hi" +// ARM64EC-MSVC: #define __INT_LEAST16_MAX__ 32767 +// ARM64EC-MSVC: #define __INT_LEAST16_TYPE__ short +// ARM64EC-MSVC: #define __INT_LEAST16_WIDTH__ 16 +// ARM64EC-MSVC: #define __INT_LEAST32_FMTd__ "d" +// ARM64EC-MSVC: #define __INT_LEAST32_FMTi__ "i" +// ARM64EC-MSVC: #define __INT_LEAST32_MAX__ 2147483647 +// ARM64EC-MSVC: #define __INT_LEAST32_TYPE__ int +// ARM64EC-MSVC: #define __INT_LEAST32_WIDTH__ 32 +// ARM64EC-MSVC: #define __INT_LEAST64_FMTd__ "lld" +// ARM64EC-MSVC: #define __INT_LEAST64_FMTi__ "lli" +// ARM64EC-MSVC: #define __INT_LEAST64_MAX__ 9223372036854775807LL +// ARM64EC-MSVC: #define __INT_LEAST64_TYPE__ long long int +// ARM64EC-MSVC: #define __INT_LEAST64_WIDTH__ 64 +// ARM64EC-MSVC: #define __INT_LEAST8_FMTd__ "hhd" +// ARM64EC-MSVC: #define __INT_LEAST8_FMTi__ "hhi" +// ARM64EC-MSVC: #define __INT_LEAST8_MAX__ 127 +// ARM64EC-MSVC: #define __INT_LEAST8_TYPE__ 
signed char +// ARM64EC-MSVC: #define __INT_LEAST8_WIDTH__ 8 +// ARM64EC-MSVC: #define __INT_MAX__ 2147483647 +// ARM64EC-MSVC: #define __INT_WIDTH__ 32 +// ARM64EC-MSVC: #define __LDBL_DECIMAL_DIG__ 17 +// ARM64EC-MSVC: #define __LDBL_DENORM_MIN__ 4.9406564584124654e-324L +// ARM64EC-MSVC: #define __LDBL_DIG__ 15 +// ARM64EC-MSVC: #define __LDBL_EPSILON__ 2.2204460492503131e-16L +// ARM64EC-MSVC: #define __LDBL_HAS_DENORM__ 1 +// ARM64EC-MSVC: #define __LDBL_HAS_INFINITY__ 1 +// ARM64EC-MSVC: #define __LDBL_HAS_QUIET_NAN__ 1 +// ARM64EC-MSVC: #define __LDBL_MANT_DIG__ 53 +// ARM64EC-MSVC: #define __LDBL_MAX_10_EXP__ 308 +// ARM64EC-MSVC: #define __LDBL_MAX_EXP__ 1024 +// ARM64EC-MSVC: #define __LDBL_MAX__ 1.7976931348623157e+308L +// ARM64EC-MSVC: #define __LDBL_MIN_10_EXP__ (-307) +// ARM64EC-MSVC: #define __LDBL_MIN_EXP__ (-1021) +// ARM64EC-MSVC: #define __LDBL_MIN__ 2.2250738585072014e-308L +// ARM64EC-MSVC: #define __LITTLE_ENDIAN__ 1 +// ARM64EC-MSVC: #define __LLONG_WIDTH__ 64 +// ARM64EC-MSVC: #define __LONG_LONG_MAX__ 9223372036854775807LL +// ARM64EC-MSVC: #define __LONG_MAX__ 2147483647L +// ARM64EC-MSVC: #define __LONG_WIDTH__ 32 +// ARM64EC-MSVC: #define __MEMORY_SCOPE_DEVICE 1 +// ARM64EC-MSVC: #define __MEMORY_SCOPE_SINGLE 4 +// ARM64EC-MSVC: #define __MEMORY_SCOPE_SYSTEM 0 +// ARM64EC-MSVC: #define __MEMORY_SCOPE_WRKGRP 2 +// ARM64EC-MSVC: #define __MEMORY_SCOPE_WVFRNT 3 +// ARM64EC-MSVC: #define __NO_INLINE__ 1 +// ARM64EC-MSVC: #define __NO_MATH_ERRNO__ 1 +// ARM64EC-MSVC: #define __OBJC_BOOL_IS_BOOL 0 +// ARM64EC-MSVC: #define __OPENCL_MEMORY_SCOPE_ALL_SVM_DEVICES 3 +// ARM64EC-MSVC: #define __OPENCL_MEMORY_SCOPE_DEVICE 2 +// ARM64EC-MSVC: #define __OPENCL_MEMORY_SCOPE_SUB_GROUP 4 +// ARM64EC-MSVC: #define __OPENCL_MEMORY_SCOPE_WORK_GROUP 1 +// ARM64EC-MSVC: #define __OPENCL_MEMORY_SCOPE_WORK_ITEM 0 +// ARM64EC-MSVC: #define __ORDER_BIG_ENDIAN__ 4321 +// ARM64EC-MSVC: #define __ORDER_LITTLE_ENDIAN__ 1234 +// ARM64EC-MSVC: #define __ORDER_PDP_ENDIAN__ 3412 +// ARM64EC-MSVC: #define __POINTER_WIDTH__ 64 +// ARM64EC-MSVC: #define __PRAGMA_REDEFINE_EXTNAME 1 +// ARM64EC-MSVC: #define __PTRDIFF_FMTd__ "lld" +// ARM64EC-MSVC: #define __PTRDIFF_FMTi__ "lli" +// ARM64EC-MSVC: #define __PTRDIFF_MAX__ 9223372036854775807LL +// ARM64EC-MSVC: #define __PTRDIFF_TYPE__ long long int +// ARM64EC-MSVC: #define __PTRDIFF_WIDTH__ 64 +// ARM64EC-MSVC: #define __SCHAR_MAX__ 127 +// ARM64EC-MSVC: #define __SHRT_MAX__ 32767 +// ARM64EC-MSVC: #define __SHRT_WIDTH__ 16 +// ARM64EC-MSVC: #define __SIG_ATOMIC_MAX__ 2147483647 +// ARM64EC-MSVC: #define __SIG_ATOMIC_WIDTH__ 32 +// ARM64EC-MSVC: #define __SIZEOF_DOUBLE__ 8 +// ARM64EC-MSVC: #define __SIZEOF_FLOAT__ 4 +// ARM64EC-MSVC: #define __SIZEOF_INT128__ 16 +// ARM64EC-MSVC: #define __SIZEOF_INT__ 4 +// ARM64EC-MSVC: #define __SIZEOF_LONG_DOUBLE__ 8 +// ARM64EC-MSVC: #define __SIZEOF_LONG_LONG__ 8 +// ARM64EC-MSVC: #define __SIZEOF_LONG__ 4 +// ARM64EC-MSVC: #define __SIZEOF_POINTER__ 8 +// ARM64EC-MSVC: #define __SIZEOF_PTRDIFF_T__ 8 +// ARM64EC-MSVC: #define __SIZEOF_SHORT__ 2 +// ARM64EC-MSVC: #define __SIZEOF_SIZE_T__ 8 +// ARM64EC-MSVC: #define __SIZEOF_WCHAR_T__ 2 +// ARM64EC-MSVC: #define __SIZEOF_WINT_T__ 2 +// ARM64EC-MSVC: #define __SIZE_FMTX__ "llX" +// ARM64EC-MSVC: #define __SIZE_FMTo__ "llo" +// ARM64EC-MSVC: #define __SIZE_FMTu__ "llu" +// ARM64EC-MSVC: #define __SIZE_FMTx__ "llx" +// ARM64EC-MSVC: #define __SIZE_MAX__ 18446744073709551615ULL +// ARM64EC-MSVC: #define __SIZE_TYPE__ long long unsigned int +// ARM64EC-MSVC: 
#define __SIZE_WIDTH__ 64 +// ARM64EC-MSVC: #define __STDC_HOSTED__ 0 +// ARM64EC-MSVC: #define __STDC_NO_THREADS__ 1 +// ARM64EC-MSVC: #define __STDC_UTF_16__ 1 +// ARM64EC-MSVC: #define __STDC_UTF_32__ 1 +// ARM64EC-MSVC: #define __STDC_VERSION__ 201710L +// ARM64EC-MSVC: #define __STDC__ 1 +// ARM64EC-MSVC: #define __UINT16_C_SUFFIX__ +// ARM64EC-MSVC: #define __UINT16_FMTX__ "hX" +// ARM64EC-MSVC: #define __UINT16_FMTo__ "ho" +// ARM64EC-MSVC: #define __UINT16_FMTu__ "hu" +// ARM64EC-MSVC: #define __UINT16_FMTx__ "hx" +// ARM64EC-MSVC: #define __UINT16_MAX__ 65535 +// ARM64EC-MSVC: #define __UINT16_TYPE__ unsigned short +// ARM64EC-MSVC: #define __UINT32_C_SUFFIX__ U +// ARM64EC-MSVC: #define __UINT32_FMTX__ "X" +// ARM64EC-MSVC: #define __UINT32_FMTo__ "o" +// ARM64EC-MSVC: #define __UINT32_FMTu__ "u" +// ARM64EC-MSVC: #define __UINT32_FMTx__ "x" +// ARM64EC-MSVC: #define __UINT32_MAX__ 4294967295U +// ARM64EC-MSVC: #define __UINT32_TYPE__ unsigned int +// ARM64EC-MSVC: #define __UINT64_C_SUFFIX__ ULL +// ARM64EC-MSVC: #define __UINT64_FMTX__ "llX" +// ARM64EC-MSVC: #define __UINT64_FMTo__ "llo" +// ARM64EC-MSVC: #define __UINT64_FMTu__ "llu" +// ARM64EC-MSVC: #define __UINT64_FMTx__ "llx" +// ARM64EC-MSVC: #define __UINT64_MAX__ 18446744073709551615ULL +// ARM64EC-MSVC: #define __UINT64_TYPE__ long long unsigned int +// ARM64EC-MSVC: #define __UINT8_C_SUFFIX__ +// ARM64EC-MSVC: #define __UINT8_FMTX__ "hhX" +// ARM64EC-MSVC: #define __UINT8_FMTo__ "hho" +// ARM64EC-MSVC: #define __UINT8_FMTu__ "hhu" +// ARM64EC-MSVC: #define __UINT8_FMTx__ "hhx" +// ARM64EC-MSVC: #define __UINT8_MAX__ 255 +// ARM64EC-MSVC: #define __UINT8_TYPE__ unsigned char +// ARM64EC-MSVC: #define __UINTMAX_C_SUFFIX__ ULL +// ARM64EC-MSVC: #define __UINTMAX_FMTX__ "llX" +// ARM64EC-MSVC: #define __UINTMAX_FMTo__ "llo" +// ARM64EC-MSVC: #define __UINTMAX_FMTu__ "llu" +// ARM64EC-MSVC: #define __UINTMAX_FMTx__ "llx" +// ARM64EC-MSVC: #define __UINTMAX_MAX__ 18446744073709551615ULL +// ARM64EC-MSVC: #define __UINTMAX_TYPE__ long long unsigned int +// ARM64EC-MSVC: #define __UINTMAX_WIDTH__ 64 +// ARM64EC-MSVC: #define __UINTPTR_FMTX__ "llX" +// ARM64EC-MSVC: #define __UINTPTR_FMTo__ "llo" +// ARM64EC-MSVC: #define __UINTPTR_FMTu__ "llu" +// ARM64EC-MSVC: #define __UINTPTR_FMTx__ "llx" +// ARM64EC-MSVC: #define __UINTPTR_MAX__ 18446744073709551615ULL +// ARM64EC-MSVC: #define __UINTPTR_TYPE__ long long unsigned int +// ARM64EC-MSVC: #define __UINTPTR_WIDTH__ 64 +// ARM64EC-MSVC: #define __UINT_FAST16_FMTX__ "hX" +// ARM64EC-MSVC: #define __UINT_FAST16_FMTo__ "ho" +// ARM64EC-MSVC: #define __UINT_FAST16_FMTu__ "hu" +// ARM64EC-MSVC: #define __UINT_FAST16_FMTx__ "hx" +// ARM64EC-MSVC: #define __UINT_FAST16_MAX__ 65535 +// ARM64EC-MSVC: #define __UINT_FAST16_TYPE__ unsigned short +// ARM64EC-MSVC: #define __UINT_FAST32_FMTX__ "X" +// ARM64EC-MSVC: #define __UINT_FAST32_FMTo__ "o" +// ARM64EC-MSVC: #define __UINT_FAST32_FMTu__ "u" +// ARM64EC-MSVC: #define __UINT_FAST32_FMTx__ "x" +// ARM64EC-MSVC: #define __UINT_FAST32_MAX__ 4294967295U +// ARM64EC-MSVC: #define __UINT_FAST32_TYPE__ unsigned int +// ARM64EC-MSVC: #define __UINT_FAST64_FMTX__ "llX" +// ARM64EC-MSVC: #define __UINT_FAST64_FMTo__ "llo" +// ARM64EC-MSVC: #define __UINT_FAST64_FMTu__ "llu" +// ARM64EC-MSVC: #define __UINT_FAST64_FMTx__ "llx" +// ARM64EC-MSVC: #define __UINT_FAST64_MAX__ 18446744073709551615ULL +// ARM64EC-MSVC: #define __UINT_FAST64_TYPE__ long long unsigned int +// ARM64EC-MSVC: #define __UINT_FAST8_FMTX__ "hhX" +// ARM64EC-MSVC: #define 
__UINT_FAST8_FMTo__ "hho" +// ARM64EC-MSVC: #define __UINT_FAST8_FMTu__ "hhu" +// ARM64EC-MSVC: #define __UINT_FAST8_FMTx__ "hhx" +// ARM64EC-MSVC: #define __UINT_FAST8_MAX__ 255 +// ARM64EC-MSVC: #define __UINT_FAST8_TYPE__ unsigned char +// ARM64EC-MSVC: #define __UINT_LEAST16_FMTX__ "hX" +// ARM64EC-MSVC: #define __UINT_LEAST16_FMTo__ "ho" +// ARM64EC-MSVC: #define __UINT_LEAST16_FMTu__ "hu" +// ARM64EC-MSVC: #define __UINT_LEAST16_FMTx__ "hx" +// ARM64EC-MSVC: #define __UINT_LEAST16_MAX__ 65535 +// ARM64EC-MSVC: #define __UINT_LEAST16_TYPE__ unsigned short +// ARM64EC-MSVC: #define __UINT_LEAST32_FMTX__ "X" +// ARM64EC-MSVC: #define __UINT_LEAST32_FMTo__ "o" +// ARM64EC-MSVC: #define __UINT_LEAST32_FMTu__ "u" +// ARM64EC-MSVC: #define __UINT_LEAST32_FMTx__ "x" +// ARM64EC-MSVC: #define __UINT_LEAST32_MAX__ 4294967295U +// ARM64EC-MSVC: #define __UINT_LEAST32_TYPE__ unsigned int +// ARM64EC-MSVC: #define __UINT_LEAST64_FMTX__ "llX" +// ARM64EC-MSVC: #define __UINT_LEAST64_FMTo__ "llo" +// ARM64EC-MSVC: #define __UINT_LEAST64_FMTu__ "llu" +// ARM64EC-MSVC: #define __UINT_LEAST64_FMTx__ "llx" +// ARM64EC-MSVC: #define __UINT_LEAST64_MAX__ 18446744073709551615ULL +// ARM64EC-MSVC: #define __UINT_LEAST64_TYPE__ long long unsigned int +// ARM64EC-MSVC: #define __UINT_LEAST8_FMTX__ "hhX" +// ARM64EC-MSVC: #define __UINT_LEAST8_FMTo__ "hho" +// ARM64EC-MSVC: #define __UINT_LEAST8_FMTu__ "hhu" +// ARM64EC-MSVC: #define __UINT_LEAST8_FMTx__ "hhx" +// ARM64EC-MSVC: #define __UINT_LEAST8_MAX__ 255 +// ARM64EC-MSVC: #define __UINT_LEAST8_TYPE__ unsigned char +// ARM64EC-MSVC: #define __USER_LABEL_PREFIX__ +// ARM64EC-MSVC: #define __WCHAR_MAX__ 65535 +// ARM64EC-MSVC: #define __WCHAR_TYPE__ unsigned short +// ARM64EC-MSVC: #define __WCHAR_UNSIGNED__ 1 +// ARM64EC-MSVC: #define __WCHAR_WIDTH__ 16 +// ARM64EC-MSVC: #define __WINT_MAX__ 65535 +// ARM64EC-MSVC: #define __WINT_TYPE__ unsigned short +// ARM64EC-MSVC: #define __WINT_UNSIGNED__ 1 +// ARM64EC-MSVC: #define __WINT_WIDTH__ 16 +// ARM64EC-MSVC: #define __amd64 1 +// ARM64EC-MSVC: #define __amd64__ 1 +// ARM64EC-MSVC: #define __arm64ec__ 1 +// ARM64EC-MSVC: #define __x86_64 1 +// ARM64EC-MSVC: #define __x86_64__ 1 + // RUN: %clang_cc1 -triple=aarch64 -E -dM -mcmodel=small -xc /dev/null | FileCheck --check-prefix=CMODEL_SMALL %s // RUN: %clang_cc1 -triple=aarch64 -E -dM -mcmodel=tiny -xc /dev/null | FileCheck --check-prefix=CMODEL_TINY %s // RUN: %clang_cc1 -triple=aarch64 -E -dM -mcmodel=large -xc /dev/null | FileCheck --check-prefix=CMODEL_LARGE %s From a829f109ffd518230df5191c64aa427603cf10d9 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Tue, 23 Jan 2024 17:00:06 -0500 Subject: [PATCH 698/843] [libc++] Fix outdated release procedure for release notes --- libcxx/docs/ReleaseProcedure.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libcxx/docs/ReleaseProcedure.rst b/libcxx/docs/ReleaseProcedure.rst index 7981bb25c583a..ea44349ac30ae 100644 --- a/libcxx/docs/ReleaseProcedure.rst +++ b/libcxx/docs/ReleaseProcedure.rst @@ -49,7 +49,7 @@ After branching for an LLVM release: 3. Update ``_LIBCPPABI_VERSION`` in ``libcxxabi/include/cxxabi.h`` 4. Update ``_LIBUNWIND_VERSION`` in ``libunwind/include/__libunwind_config.h`` 5. Create a release notes file for the next release from the template -6. Point to the new release notes file from ``libcxx/docs/index.rst`` +6. 
Point to the new release notes file from ``libcxx/docs/ReleaseNotes.rst`` Post branching ============== From 9d476e1e1a18af390e3455a6622ee67a69c64103 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Tue, 23 Jan 2024 14:04:52 -0800 Subject: [PATCH 699/843] [clang][FatLTO] Avoid UnifiedLTO until it can support WPD/CFI (#79061) Currently, the UnifiedLTO pipeline seems to have trouble with several LTO features, like SplitLTO units, which means we cannot use important optimizations like Whole Program Devirtualization or security hardening instrumentation like CFI. This patch reverts FatLTO to using distinct pipelines for Full LTO and ThinLTO. It still avoids module cloning, since that was error prone. --- clang/lib/CodeGen/BackendUtil.cpp | 8 ++-- clang/lib/Driver/ToolChains/Clang.cpp | 4 +- clang/lib/Frontend/CompilerInvocation.cpp | 14 ------- clang/test/CodeGen/fat-lto-objects.c | 38 ++++++++++--------- clang/test/Driver/fat-lto-objects.c | 14 ------- llvm/docs/FatLTO.rst | 4 +- llvm/include/llvm/Passes/PassBuilder.h | 9 ++--- .../llvm/Transforms/IPO/EmbedBitcodePass.h | 16 +++++++- llvm/lib/Passes/PassBuilder.cpp | 20 ++++++++++ llvm/lib/Passes/PassBuilderPipelines.cpp | 15 +++++--- llvm/lib/Passes/PassRegistry.def | 5 ++- llvm/lib/Transforms/IPO/EmbedBitcodePass.cpp | 12 +++++- llvm/test/CodeGen/X86/fat-lto-section.ll | 2 +- llvm/test/Transforms/EmbedBitcode/embed.ll | 3 ++ 14 files changed, 92 insertions(+), 72 deletions(-) diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index a6142d99f3b68..ec203f6f28bc1 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -1001,8 +1001,9 @@ void EmitAssemblyHelper::RunOptimizationPipeline( } if (CodeGenOpts.FatLTO) { - assert(CodeGenOpts.UnifiedLTO && "FatLTO requires UnifiedLTO"); - MPM.addPass(PB.buildFatLTODefaultPipeline(Level)); + MPM.addPass(PB.buildFatLTODefaultPipeline( + Level, PrepareForThinLTO, + PrepareForThinLTO || shouldEmitRegularLTOSummary())); } else if (PrepareForThinLTO) { MPM.addPass(PB.buildThinLTOPreLinkDefaultPipeline(Level)); } else if (PrepareForLTO) { @@ -1073,8 +1074,7 @@ void EmitAssemblyHelper::RunOptimizationPipeline( if (!TheModule->getModuleFlag("EnableSplitLTOUnit")) TheModule->addModuleFlag(llvm::Module::Error, "EnableSplitLTOUnit", uint32_t(CodeGenOpts.EnableSplitLTOUnit)); - // FatLTO always means UnifiedLTO - if (!TheModule->getModuleFlag("UnifiedLTO")) + if (CodeGenOpts.UnifiedLTO && !TheModule->getModuleFlag("UnifiedLTO")) TheModule->addModuleFlag(llvm::Module::Error, "UnifiedLTO", uint32_t(1)); } diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index bcba7cbbdb58c..5dc614e11aab5 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -4854,9 +4854,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, bool UnifiedLTO = false; if (IsUsingLTO) { UnifiedLTO = Args.hasFlag(options::OPT_funified_lto, - options::OPT_fno_unified_lto, Triple.isPS()) || - Args.hasFlag(options::OPT_ffat_lto_objects, - options::OPT_fno_fat_lto_objects, false); + options::OPT_fno_unified_lto, Triple.isPS()); if (UnifiedLTO) CmdArgs.push_back("-funified-lto"); } diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 7edea7798af1e..feb4de2084b83 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1861,20 +1861,6 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, 
ArgList &Args, if (Args.hasArg(OPT_funified_lto)) Opts.PrepareForThinLTO = true; } - if (Arg *A = Args.getLastArg(options::OPT_ffat_lto_objects, - options::OPT_fno_fat_lto_objects)) { - if (A->getOption().matches(options::OPT_ffat_lto_objects)) { - if (Arg *Uni = Args.getLastArg(options::OPT_funified_lto, - options::OPT_fno_unified_lto)) { - if (Uni->getOption().matches(options::OPT_fno_unified_lto)) - Diags.Report(diag::err_drv_incompatible_options) - << A->getAsString(Args) << "-fno-unified-lto"; - } else - Diags.Report(diag::err_drv_argument_only_allowed_with) - << A->getAsString(Args) << "-funified-lto"; - } - } - if (Arg *A = Args.getLastArg(OPT_fthinlto_index_EQ)) { if (IK.getLanguage() != Language::LLVM_IR) Diags.Report(diag::err_drv_argument_only_allowed_with) diff --git a/clang/test/CodeGen/fat-lto-objects.c b/clang/test/CodeGen/fat-lto-objects.c index 5c8ad1fd93c4b..afce798c5c819 100644 --- a/clang/test/CodeGen/fat-lto-objects.c +++ b/clang/test/CodeGen/fat-lto-objects.c @@ -1,34 +1,37 @@ // REQUIRES: x86-registered-target -// RUN: not %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -fsplit-lto-unit -emit-llvm < %s 2>&1 | FileCheck %s --check-prefixes=NO-UNIFIED +// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -fsplit-lto-unit -emit-llvm < %s | FileCheck %s --check-prefixes=FULL,SPLIT +// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -emit-llvm < %s | FileCheck %s --check-prefixes=FULL,SPLIT -// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -funified-lto -ffat-lto-objects -fsplit-lto-unit -emit-llvm < %s | FileCheck %s --check-prefixes=FULL,SPLIT,UNIFIED -// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -funified-lto -ffat-lto-objects -emit-llvm < %s | FileCheck %s --check-prefixes=FULL,NOSPLIT,UNIFIED +// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -fsplit-lto-unit -ffat-lto-objects -emit-llvm < %s | FileCheck %s --check-prefixes=THIN,SPLIT +// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -ffat-lto-objects -emit-llvm < %s | FileCheck %s --check-prefixes=THIN,NOSPLIT -// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -funified-lto -fsplit-lto-unit -ffat-lto-objects -emit-llvm < %s | FileCheck %s --check-prefixes=THIN,SPLIT,UNIFIED -// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -funified-lto -ffat-lto-objects -emit-llvm < %s | FileCheck %s --check-prefixes=THIN,NOSPLIT,UNIFIED - -// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -funified-lto -ffat-lto-objects -fsplit-lto-unit -emit-obj < %s -o %t.full.split.o +// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -fsplit-lto-unit -emit-obj < %s -o %t.full.split.o // RUN: llvm-readelf -S %t.full.split.o | FileCheck %s --check-prefixes=ELF // RUN: llvm-objcopy --dump-section=.llvm.lto=%t.full.split.bc %t.full.split.o -// RUN: llvm-dis %t.full.split.bc -o - | FileCheck %s --check-prefixes=THIN,SPLIT,UNIFIED +// RUN: llvm-dis %t.full.split.bc -o - | FileCheck %s --check-prefixes=FULL,SPLIT,NOUNIFIED -// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -funified-lto -ffat-lto-objects -emit-obj < %s -o %t.full.nosplit.o +// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -emit-obj < %s -o %t.full.nosplit.o // RUN: llvm-readelf -S %t.full.nosplit.o | FileCheck %s --check-prefixes=ELF // RUN: llvm-objcopy --dump-section=.llvm.lto=%t.full.nosplit.bc %t.full.nosplit.o -// 
RUN: llvm-dis %t.full.nosplit.bc -o - | FileCheck %s --check-prefixes=THIN,NOSPLIT,UNIFIED +// RUN: llvm-dis %t.full.nosplit.bc -o - | FileCheck %s --check-prefixes=FULL,NOSPLIT,NOUNIFIED -// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -funified-lto -fsplit-lto-unit -ffat-lto-objects -emit-obj < %s -o %t.thin.split.o +// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -fsplit-lto-unit -ffat-lto-objects -emit-obj < %s -o %t.thin.split.o // RUN: llvm-readelf -S %t.thin.split.o | FileCheck %s --check-prefixes=ELF // RUN: llvm-objcopy --dump-section=.llvm.lto=%t.thin.split.bc %t.thin.split.o -// RUN: llvm-dis %t.thin.split.bc -o - | FileCheck %s --check-prefixes=THIN,SPLIT,UNIFIED +// RUN: llvm-dis %t.thin.split.bc -o - | FileCheck %s --check-prefixes=THIN,SPLIT,NOUNIFIED -// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -funified-lto -ffat-lto-objects -emit-obj < %s -o %t.thin.nosplit.o +// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -ffat-lto-objects -emit-obj < %s -o %t.thin.nosplit.o // RUN: llvm-readelf -S %t.thin.nosplit.o | FileCheck %s --check-prefixes=ELF // RUN: llvm-objcopy --dump-section=.llvm.lto=%t.thin.nosplit.bc %t.thin.nosplit.o -// RUN: llvm-dis %t.thin.nosplit.bc -o - | FileCheck %s --check-prefixes=THIN,NOSPLIT,UNIFIED +// RUN: llvm-dis %t.thin.nosplit.bc -o - | FileCheck %s --check-prefixes=THIN,NOSPLIT,NOUNIFIED + +// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=thin -funified-lto -ffat-lto-objects -emit-obj < %s -o %t.unified.o +// RUN: llvm-readelf -S %t.unified.o | FileCheck %s --check-prefixes=ELF +// RUN: llvm-objcopy --dump-section=.llvm.lto=%t.unified.bc %t.unified.o +// RUN: llvm-dis %t.unified.bc -o - | FileCheck %s --check-prefixes=THIN,NOSPLIT,UNIFIED -// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -funified-lto -ffat-lto-objects -fsplit-lto-unit -S < %s -o - \ +// RUN: %clang -cc1 -triple x86_64-unknown-linux-gnu -flto=full -ffat-lto-objects -fsplit-lto-unit -S < %s -o - \ // RUN: | FileCheck %s --check-prefixes=ASM /// Be sure we enable split LTO units correctly under -ffat-lto-objects. @@ -38,9 +41,8 @@ // FULL-NOT: ![[#]] = !{i32 1, !"ThinLTO", i32 0} // THIN-NOT: ![[#]] = !{i32 1, !"ThinLTO", i32 0} -/// FatLTO always uses UnifiedLTO. It's an error if they aren't set together -// UNIFIED: ![[#]] = !{i32 1, !"UnifiedLTO", i32 1} -// NO-UNIFIED: error: invalid argument '-ffat-lto-objects' only allowed with '-funified-lto' +// UNIFIED: ![[#]] = !{i32 1, !"UnifiedLTO", i32 1} +// NOUNIFIED-NOT: ![[#]] = !{i32 1, !"UnifiedLTO", i32 1} // ELF: .llvm.lto diff --git a/clang/test/Driver/fat-lto-objects.c b/clang/test/Driver/fat-lto-objects.c index 203175d61b73d..97002db6edc51 100644 --- a/clang/test/Driver/fat-lto-objects.c +++ b/clang/test/Driver/fat-lto-objects.c @@ -1,6 +1,5 @@ // RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -c 2>&1 | FileCheck %s -check-prefix=CHECK-CC // CHECK-CC: -cc1 -// CHECK-CC-SAME: -funified-lto // CHECK-CC-SAME: -emit-obj // CHECK-CC-SAME: -ffat-lto-objects @@ -15,21 +14,18 @@ /// When fat LTO is enabled with -S, we expect asm output and -ffat-lto-objects to be passed to cc1. 
// RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -S 2>&1 | FileCheck %s -check-prefix=CHECK-CC-S-LTO // CHECK-CC-S-LTO: -cc1 -// CHECK-CC-S-LTO-SAME: -funified-lto // CHECK-CC-S-NOT: -emit-llvm // CHECK-CC-S-LTO-SAME: -ffat-lto-objects /// When fat LTO is enabled with -S and -emit-llvm, we expect IR output and -ffat-lto-objects to be passed to cc1. // RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -S -emit-llvm 2>&1 | FileCheck %s -check-prefix=CHECK-CC-S-EL-LTO // CHECK-CC-S-EL-LTO: -cc1 -// CHECK-CC-S-EL-LTO-SAME: -funified-lto // CHECK-CC-S-EL-LTO-SAME: -emit-llvm // CHECK-CC-S-EL-LTO-SAME: -ffat-lto-objects /// When fat LTO is enabled wihtout -S we expect native object output and -ffat-lto-object to be passed to cc1. // RUN: %clang --target=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -### %s -c 2>&1 | FileCheck %s -check-prefix=CHECK-CC-C-LTO // CHECK-CC-C-LTO: -cc1 -// CHECK-CC-C-LTO: -funified-lto // CHECK-CC-C-LTO: -emit-obj // CHECK-CC-C-LTO: -ffat-lto-objects @@ -47,13 +43,3 @@ // RUN: -fuse-ld=lld -fno-lto -ffat-lto-objects -### 2>&1 | FileCheck --check-prefix=NOLTO %s // LTO: "--fat-lto-objects" // NOLTO-NOT: "--fat-lto-objects" - -/// Make sure that incompatible options emit the correct diagnostics, since -ffat-lto-objects requires -funified-lto -// RUN: %clang -cc1 -triple=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -funified-lto -emit-llvm-only %s 2>&1 | FileCheck %s -check-prefix=UNIFIED --allow-empty -// UNIFIED-NOT: error: - -// RUN: not %clang -cc1 -triple=x86_64-unknown-linux-gnu -flto -ffat-lto-objects -emit-llvm-only %s 2>&1 | FileCheck %s -check-prefix=MISSING_UNIFIED -// MISSING_UNIFIED: error: invalid argument '-ffat-lto-objects' only allowed with '-funified-lto' - -// RUN: not %clang -cc1 -triple=x86_64-unknown-linux-gnu -flto -fno-unified-lto -ffat-lto-objects -emit-llvm-only %s 2>&1 | FileCheck %s -check-prefix=NO-UNIFIED -// NO-UNIFIED: error: the combination of '-ffat-lto-objects' and '-fno-unified-lto' is incompatible diff --git a/llvm/docs/FatLTO.rst b/llvm/docs/FatLTO.rst index 76b849b16fc96..5864944332fc0 100644 --- a/llvm/docs/FatLTO.rst +++ b/llvm/docs/FatLTO.rst @@ -29,9 +29,9 @@ Overview Within LLVM, FatLTO is supported by choosing the ``FatLTODefaultPipeline``. This pipeline will: -#) Run the pre-link UnifiedLTO pipeline on the current module. +#) Run the pre-link (Thin)LTO pipeline on the current module. #) Embed the pre-link bitcode in a special ``.llvm.lto`` section. -#) Finish optimizing the module using the post-link ThinLTO pipeline. +#) Finish optimizing the module using the ModuleOptimization pipeline. #) Emit the object file, including the new ``.llvm.lto`` section. .. NOTE diff --git a/llvm/include/llvm/Passes/PassBuilder.h b/llvm/include/llvm/Passes/PassBuilder.h index 33cf8af87381f..7339b8a988232 100644 --- a/llvm/include/llvm/Passes/PassBuilder.h +++ b/llvm/include/llvm/Passes/PassBuilder.h @@ -250,12 +250,9 @@ class PassBuilder { /// /// This builds a pipeline that runs the LTO/ThinLTO pre-link pipeline, and /// emits a section containing the pre-link bitcode along side the object code - /// generated by running the PerModuleDefaultPipeline, used when compiling - /// without LTO. It clones the module and runs the LTO/non-LTO pipelines - /// separately to avoid any inconsistencies with an ad-hoc pipeline that tries - /// to approximate the PerModuleDefaultPipeline from the pre-link LTO - /// pipelines. 
-  ModulePassManager buildFatLTODefaultPipeline(OptimizationLevel Level);
+  /// generated in non-LTO compilation.
+  ModulePassManager buildFatLTODefaultPipeline(OptimizationLevel Level,
+                                               bool ThinLTO, bool EmitSummary);
 
   /// Build a pre-link, ThinLTO-targeting default optimization pipeline to
   /// a pass manager.
diff --git a/llvm/include/llvm/Transforms/IPO/EmbedBitcodePass.h b/llvm/include/llvm/Transforms/IPO/EmbedBitcodePass.h
index 2bb7d5f1fcf14..12bf0dd6581ce 100644
--- a/llvm/include/llvm/Transforms/IPO/EmbedBitcodePass.h
+++ b/llvm/include/llvm/Transforms/IPO/EmbedBitcodePass.h
@@ -24,13 +24,25 @@ namespace llvm {
 class Module;
 class Pass;
 
+struct EmbedBitcodeOptions {
+  EmbedBitcodeOptions() : EmbedBitcodeOptions(false, false) {}
+  EmbedBitcodeOptions(bool IsThinLTO, bool EmitLTOSummary)
+      : IsThinLTO(IsThinLTO), EmitLTOSummary(EmitLTOSummary) {}
+  bool IsThinLTO;
+  bool EmitLTOSummary;
+};
+
 /// Pass embeds a copy of the module optimized with the provided pass pipeline
 /// into a global variable.
 class EmbedBitcodePass : public PassInfoMixin<EmbedBitcodePass> {
-  ModulePassManager MPM;
+  bool IsThinLTO;
+  bool EmitLTOSummary;
 
 public:
-  EmbedBitcodePass() {}
+  EmbedBitcodePass(EmbedBitcodeOptions Opts)
+      : EmbedBitcodePass(Opts.IsThinLTO, Opts.EmitLTOSummary) {}
+  EmbedBitcodePass(bool IsThinLTO, bool EmitLTOSummary)
+      : IsThinLTO(IsThinLTO), EmitLTOSummary(EmitLTOSummary) {}
 
   PreservedAnalyses run(Module &M, ModuleAnalysisManager &);
 
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 19fb136f37563..000594f0e7f4b 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -752,6 +752,26 @@ Expected<HWAddressSanitizerOptions> parseHWASanPassOptions(StringRef Params) {
   return Result;
 }
 
+Expected<EmbedBitcodeOptions> parseEmbedBitcodePassOptions(StringRef Params) {
+  EmbedBitcodeOptions Result;
+  while (!Params.empty()) {
+    StringRef ParamName;
+    std::tie(ParamName, Params) = Params.split(';');
+
+    if (ParamName == "thinlto") {
+      Result.IsThinLTO = true;
+    } else if (ParamName == "emit-summary") {
+      Result.EmitLTOSummary = true;
+    } else {
+      return make_error<StringError>(
+          formatv("invalid EmbedBitcode pass parameter '{0}' ", ParamName)
+              .str(),
+          inconvertibleErrorCode());
+    }
+  }
+  return Result;
+}
+
 Expected<MemorySanitizerOptions> parseMSanPassOptions(StringRef Params) {
   MemorySanitizerOptions Result;
   while (!Params.empty()) {
diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp
index 525b83dee79b0..6ede863829120 100644
--- a/llvm/lib/Passes/PassBuilderPipelines.cpp
+++ b/llvm/lib/Passes/PassBuilderPipelines.cpp
@@ -1531,14 +1531,17 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
 }
 
 ModulePassManager
-PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level) {
+PassBuilder::buildFatLTODefaultPipeline(OptimizationLevel Level, bool ThinLTO,
+                                        bool EmitSummary) {
   ModulePassManager MPM;
-  // FatLTO always uses UnifiedLTO, so use the ThinLTOPreLink pipeline
-  MPM.addPass(buildThinLTOPreLinkDefaultPipeline(Level));
-  MPM.addPass(EmbedBitcodePass());
+  if (ThinLTO)
+    MPM.addPass(buildThinLTOPreLinkDefaultPipeline(Level));
+  else
+    MPM.addPass(buildLTOPreLinkDefaultPipeline(Level));
+  MPM.addPass(EmbedBitcodePass(ThinLTO, EmitSummary));
 
-  // Use the ThinLTO post-link pipeline with sample profiling, other
-  if (PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)
+  // Use the ThinLTO post-link pipeline with sample profiling
+  if (ThinLTO && PGOOpt && PGOOpt->Action == PGOOptions::SampleUse)
     MPM.addPass(buildThinLTODefaultPipeline(Level,
                                             /*ImportSummary=*/nullptr));
   else {
     // otherwise, just use module optimization
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 5f25302091124..e59795c7b0840 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -58,7 +58,6 @@ MODULE_PASS("dfsan", DataFlowSanitizerPass())
 MODULE_PASS("dot-callgraph", CallGraphDOTPrinterPass())
 MODULE_PASS("dxil-upgrade", DXILUpgradePass())
 MODULE_PASS("elim-avail-extern", EliminateAvailableExternallyPass())
-MODULE_PASS("embed-bitcode", EmbedBitcodePass())
 MODULE_PASS("extract-blocks", BlockExtractorPass({}, false))
 MODULE_PASS("forceattrs", ForceFunctionAttrsPass())
 MODULE_PASS("function-import", FunctionImportPass())
@@ -161,6 +160,10 @@ MODULE_PASS_WITH_PARAMS(
     "group-by-use;ignore-single-use;max-offset=N;merge-const;merge-external;"
     "no-group-by-use;no-ignore-single-use;no-merge-const;no-merge-external;"
     "size-only")
+MODULE_PASS_WITH_PARAMS(
+    "embed-bitcode", "EmbedBitcodePass",
+    [](EmbedBitcodeOptions Opts) { return EmbedBitcodePass(Opts); },
+    parseEmbedBitcodePassOptions, "thinlto;emit-summary")
 MODULE_PASS_WITH_PARAMS(
     "globaldce", "GlobalDCEPass",
     [](bool InLTOPostLink) { return GlobalDCEPass(InLTOPostLink); },
diff --git a/llvm/lib/Transforms/IPO/EmbedBitcodePass.cpp b/llvm/lib/Transforms/IPO/EmbedBitcodePass.cpp
index 48ef0772e800e..6af3a45701bc5 100644
--- a/llvm/lib/Transforms/IPO/EmbedBitcodePass.cpp
+++ b/llvm/lib/Transforms/IPO/EmbedBitcodePass.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/IPO/EmbedBitcodePass.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/Bitcode/BitcodeWriterPass.h"
 #include "llvm/IR/PassManager.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -16,6 +18,7 @@
 #include "llvm/Transforms/IPO/ThinLTOBitcodeWriter.h"
 #include "llvm/Transforms/Utils/ModuleUtils.h"
 
+#include <memory>
 #include <string>
 
 using namespace llvm;
@@ -30,9 +33,16 @@ PreservedAnalyses EmbedBitcodePass::run(Module &M, ModuleAnalysisManager &AM) {
     report_fatal_error(
         "EmbedBitcode pass currently only supports ELF object format",
         /*gen_crash_diag=*/false);
+
   std::string Data;
   raw_string_ostream OS(Data);
-  ThinLTOBitcodeWriterPass(OS, /*ThinLinkOS=*/nullptr).run(M, AM);
+  if (IsThinLTO)
+    ThinLTOBitcodeWriterPass(OS, /*ThinLinkOS=*/nullptr).run(M, AM);
+  else
+    BitcodeWriterPass(OS, /*ShouldPreserveUseListOrder=*/false, EmitLTOSummary)
+        .run(M, AM);
+
   embedBufferInModule(M, MemoryBufferRef(Data, "ModuleData"), ".llvm.lto");
+
   return PreservedAnalyses::all();
 }
diff --git a/llvm/test/CodeGen/X86/fat-lto-section.ll b/llvm/test/CodeGen/X86/fat-lto-section.ll
index 9a4359bab6b5d..30c56229a0e2a 100644
--- a/llvm/test/CodeGen/X86/fat-lto-section.ll
+++ b/llvm/test/CodeGen/X86/fat-lto-section.ll
@@ -1,5 +1,5 @@
 ;; Ensure that the .llvm.lto section has SHT_EXCLUDE set.
-; RUN: opt --mtriple x86_64-unknown-linux-gnu < %s -passes="embed-bitcode" -S \
+; RUN: opt --mtriple x86_64-unknown-linux-gnu < %s -passes="embed-bitcode<thinlto>" -S \
 ; RUN:   | llc --mtriple x86_64-unknown-linux-gnu -filetype=obj \
 ; RUN:   | llvm-readelf - --sections \
 ; RUN:   | FileCheck %s --check-prefix=EXCLUDE
diff --git a/llvm/test/Transforms/EmbedBitcode/embed.ll b/llvm/test/Transforms/EmbedBitcode/embed.ll
index 734bf5274a5f2..dffb5cf755477 100644
--- a/llvm/test/Transforms/EmbedBitcode/embed.ll
+++ b/llvm/test/Transforms/EmbedBitcode/embed.ll
@@ -1,4 +1,7 @@
 ; RUN: opt --mtriple x86_64-unknown-linux-gnu < %s -passes="embed-bitcode" -S | FileCheck %s
+; RUN: opt --mtriple x86_64-unknown-linux-gnu < %s -passes="embed-bitcode<thinlto>" -S | FileCheck %s
+; RUN: opt --mtriple x86_64-unknown-linux-gnu < %s -passes="embed-bitcode<emit-summary>" -S | FileCheck %s
+; RUN: opt --mtriple x86_64-unknown-linux-gnu < %s -passes="embed-bitcode<thinlto;emit-summary>" -S | FileCheck %s
 
 @a = global i32 1
 
From 729657d6e15d0455557f35485deb87313ccdde10 Mon Sep 17 00:00:00 2001
From: Utkarsh Saxena
Date: Tue, 23 Jan 2024 23:07:00 +0100
Subject: [PATCH 700/843] [misc-coroutine-hostile-raii] Use getOperand instead
 of getCommonExpr. (#79206)

We were previously allowlisting awaitable types returned by
`await_transform` instead of the type of the operand of the `co_await`
expression. This used to give false positives and did not respect the
`AllowedAwaitablesList` flag when `await_transform` is used. See the
added test cases for such examples.

---
 .../misc/CoroutineHostileRAIICheck.cpp        |  2 +-
 .../checkers/misc/coroutine-hostile-raii.cpp  | 34 ++++++++++++++++++-
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp
index a0e8700b0522b..360335b86c641 100644
--- a/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp
+++ b/clang-tools-extra/clang-tidy/misc/CoroutineHostileRAIICheck.cpp
@@ -56,7 +56,7 @@ AST_MATCHER_P(Stmt, forEachPrevStmt, ast_matchers::internal::Matcher<Stmt>,
 // Matches the expression awaited by the `co_await`.
 AST_MATCHER_P(CoawaitExpr, awaitable, ast_matchers::internal::Matcher<Expr>,
               InnerMatcher) {
-  if (Expr *E = Node.getCommonExpr())
+  if (Expr *E = Node.getOperand())
     return InnerMatcher.matches(*E, Finder, Builder);
   return false;
 }
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp
index 55a7e4b8f2954..c23c355dac1b2 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/coroutine-hostile-raii.cpp
@@ -1,7 +1,7 @@
 // RUN: %check_clang_tidy -std=c++20 %s misc-coroutine-hostile-raii %t \
 // RUN:   -config="{CheckOptions: {\
 // RUN:   misc-coroutine-hostile-raii.RAIITypesList: 'my::Mutex; ::my::other::Mutex', \
-// RUN:   misc-coroutine-hostile-raii.AllowedAwaitablesList: 'safe::awaitable; ::my::other::awaitable' \
+// RUN:   misc-coroutine-hostile-raii.AllowedAwaitablesList: 'safe::awaitable; ::transformable::awaitable' \
 // RUN:   }}"
 
 namespace std {
@@ -136,6 +136,9 @@ ReturnObject scopedLockableTest() {
   absl::Mutex no_warning_5;
 }
 
+// ================================================================================
+// Safe awaitable
+// ================================================================================
 namespace safe {
   struct awaitable {
     bool await_ready() noexcept { return false; }
@@ -150,6 +153,32 @@ ReturnObject RAIISafeSuspendTest() {
   co_await other{};
 }
 
+// ================================================================================
+// Safe transformable awaitable
+// ================================================================================
+struct transformable { struct awaitable{}; };
+using alias_transformable_awaitable = transformable::awaitable;
+struct UseTransformAwaitable {
+  struct promise_type {
+    UseTransformAwaitable get_return_object() { return {}; }
+    std::suspend_always initial_suspend() { return {}; }
+    std::suspend_always final_suspend() noexcept { return {}; }
+    void unhandled_exception() {}
+    std::suspend_always await_transform(transformable::awaitable) { return {}; }
+  };
+};
+
+auto retAwaitable() { return transformable::awaitable{}; }
+UseTransformAwaitable RAIISafeSuspendTest2() {
+  absl::Mutex a;
+  co_await retAwaitable();
+  co_await transformable::awaitable{};
+  co_await alias_transformable_awaitable{};
+}
+
+// ================================================================================
+// Lambdas
+// ================================================================================
 void lambda() {
   absl::Mutex no_warning;
   auto lambda = []() -> ReturnObject {
@@ -164,6 +193,9 @@ void lambda() {
   absl::Mutex no_warning_2;
 }
 
+// ================================================================================
+// Denylisted RAII
+// ================================================================================
 template <class T>
 ReturnObject raii_in_template(){
   T a;

From dc410f94f602390a65c832cf348b9ee6556b1809 Mon Sep 17 00:00:00 2001
From: Shilei Tian
Date: Tue, 23 Jan 2024 17:08:04 -0500
Subject: [PATCH 701/843] [Clang][Driver] Fix `--save-temps` for OpenCL AoT
 compilation (#78333)

We can directly call `clang -c -x cl -target amdgcn -mcpu=gfx90a test.cl -o test.o`
to compile an OpenCL kernel file. However, when `--save-temps` is enabled,
this no longer works: the preprocessed file (the `.i` file) is treated as a
C source file when it is fed back to the front end, causing compilation
errors because the OpenCL keywords are not recognized.
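As a rough sketch of the failure mode (a simplified standalone model, not
clang's actual driver code; only the `TY_*` names and the new `.cli` mapping
come from this patch):

```cpp
#include <cassert>
#include <string>

// Simplified model of clang's extension-to-type dispatch.
enum TypeID { TY_C, TY_PP_C, TY_CL, TY_PP_CL, TY_INVALID };

TypeID lookupTypeForExtension(const std::string &Ext) {
  if (Ext == "c")
    return TY_C;
  if (Ext == "i")
    return TY_PP_C; // preprocessed C
  if (Ext == "cl")
    return TY_CL;
  if (Ext == "cli")
    return TY_PP_CL; // preprocessed OpenCL, introduced by this patch
  return TY_INVALID;
}

int main() {
  // Before: preprocessing foo.cl produced foo.i, so the compile step parsed
  // OpenCL code as C and rejected its keywords.
  assert(lookupTypeForExtension("i") == TY_PP_C);
  // After: the dedicated .cli extension preserves the source language.
  assert(lookupTypeForExtension("cli") == TY_PP_CL);
}
```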
This patch fixes the issue. --- clang/include/clang/Driver/Types.def | 6 ++++-- clang/lib/Driver/Types.cpp | 7 ++++++- clang/test/Driver/opencl_aot_save_temps.cl | 16 ++++++++++++++++ 3 files changed, 26 insertions(+), 3 deletions(-) create mode 100644 clang/test/Driver/opencl_aot_save_temps.cl diff --git a/clang/include/clang/Driver/Types.def b/clang/include/clang/Driver/Types.def index b889883125c4c..f72c27e1ee701 100644 --- a/clang/include/clang/Driver/Types.def +++ b/clang/include/clang/Driver/Types.def @@ -37,8 +37,10 @@ // C family source language (with and without preprocessing). TYPE("cpp-output", PP_C, INVALID, "i", phases::Compile, phases::Backend, phases::Assemble, phases::Link) TYPE("c", C, PP_C, "c", phases::Preprocess, phases::Compile, phases::Backend, phases::Assemble, phases::Link) -TYPE("cl", CL, PP_C, "cl", phases::Preprocess, phases::Compile, phases::Backend, phases::Assemble, phases::Link) -TYPE("clcpp", CLCXX, PP_CXX, "clcpp", phases::Preprocess, phases::Compile, phases::Backend, phases::Assemble, phases::Link) +TYPE("cl", CL, PP_CL, "cl", phases::Preprocess, phases::Compile, phases::Backend, phases::Assemble, phases::Link) +TYPE("cl-cpp-output", PP_CL, INVALID, "cli", phases::Compile, phases::Backend, phases::Assemble, phases::Link) +TYPE("clcpp", CLCXX, PP_CLCXX, "clcpp", phases::Preprocess, phases::Compile, phases::Backend, phases::Assemble, phases::Link) +TYPE("clcpp-cpp-output", PP_CLCXX, INVALID, "clii", phases::Compile, phases::Backend, phases::Assemble, phases::Link) TYPE("cuda-cpp-output", PP_CUDA, INVALID, "cui", phases::Compile, phases::Backend, phases::Assemble, phases::Link) TYPE("cuda", CUDA, PP_CUDA, "cu", phases::Preprocess, phases::Compile, phases::Backend, phases::Assemble, phases::Link) TYPE("cuda", CUDA_DEVICE, PP_CUDA, "cu", phases::Preprocess, phases::Compile, phases::Backend, phases::Assemble, phases::Link) diff --git a/clang/lib/Driver/Types.cpp b/clang/lib/Driver/Types.cpp index 08df34ade7b65..a7b6b9000e1d2 100644 --- a/clang/lib/Driver/Types.cpp +++ b/clang/lib/Driver/Types.cpp @@ -133,7 +133,7 @@ bool types::isAcceptedByClang(ID Id) { case TY_Asm: case TY_C: case TY_PP_C: - case TY_CL: case TY_CLCXX: + case TY_CL: case TY_PP_CL: case TY_CLCXX: case TY_PP_CLCXX: case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE: case TY_HIP: @@ -181,7 +181,9 @@ bool types::isDerivedFromC(ID Id) { case TY_PP_C: case TY_C: case TY_CL: + case TY_PP_CL: case TY_CLCXX: + case TY_PP_CLCXX: case TY_PP_CUDA: case TY_CUDA: case TY_CUDA_DEVICE: @@ -241,6 +243,7 @@ bool types::isCXX(ID Id) { case TY_PP_CXXHeaderUnit: case TY_ObjCXXHeader: case TY_PP_ObjCXXHeader: case TY_CXXModule: case TY_PP_CXXModule: + case TY_PP_CLCXX: case TY_CUDA: case TY_PP_CUDA: case TY_CUDA_DEVICE: case TY_HIP: case TY_PP_HIP: @@ -310,7 +313,9 @@ types::ID types::lookupTypeForExtension(llvm::StringRef Ext) { .Case("cc", TY_CXX) .Case("CC", TY_CXX) .Case("cl", TY_CL) + .Case("cli", TY_PP_CL) .Case("clcpp", TY_CLCXX) + .Case("clii", TY_PP_CLCXX) .Case("cp", TY_CXX) .Case("cu", TY_CUDA) .Case("hh", TY_CXXHeader) diff --git a/clang/test/Driver/opencl_aot_save_temps.cl b/clang/test/Driver/opencl_aot_save_temps.cl new file mode 100644 index 0000000000000..7b3149871f8fc --- /dev/null +++ b/clang/test/Driver/opencl_aot_save_temps.cl @@ -0,0 +1,16 @@ +// RUN: %clang -x cl --save-temps -c -### %s 2>&1 | FileCheck %s +// RUN: %clang -x cl -ccc-print-phases -c %s 2>&1 | FileCheck %s -check-prefix=CHECK-PHASES + +// CHECK: "-o" "[[CLI_NAME:.+]].cli" "-x" "cl" +// CHECK-NEXT: "-o" "[[CLI_NAME]].bc" "-x" 
"cl-cpp-output"{{.*}}"[[CLI_NAME:.+]].cli" + +// CHECK-PHASES: 0: input, {{.*}}, cl +// CHECK-PHASES: 1: preprocessor, {0}, cl-cpp-output +// CHECK-PHASES: 2: compiler, {1}, ir + +uint3 add(uint3 a, uint3 b) { + ulong x = a.x + (ulong)b.x; + ulong y = a.y + (ulong)b.y + (x >> 32); + uint z = a.z + b.z + (y >> 32); + return (uint3)(x, y, z); +} From 1a300d6da3f3d10e02d9580f8f3f2080bba8adf9 Mon Sep 17 00:00:00 2001 From: Changpeng Fang Date: Tue, 23 Jan 2024 14:30:11 -0800 Subject: [PATCH 702/843] AMDGPU: Add SourceOfDivergence for int_amdgcn_global_load_tr (#79218) --- .../Target/AMDGPU/AMDGPUSearchableTables.td | 1 + .../UniformityAnalysis/AMDGPU/intrinsics.ll | 81 +++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index b0ea4aba01894..67263f23b9831 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -414,6 +414,7 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; // The dummy boolean output is divergent from the IR's perspective, // but the mask results are uniform. These produce a divergent and diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index 8826263eabb69..a08ca86c8a619 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -109,6 +109,78 @@ bb: ret void } +; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep) + store <2 x i32> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v8i16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1) %gep) + store <8 x i16> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v8f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1) %gep) + store <8 x half> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v8bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1) %gep) + store <8 x bfloat> %tmp0, ptr addrspace(1) %out, align 16 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b64_i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr 
addrspace(1) %addr, i32 4 + %tmp0 = call i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1) %gep) + store i32 %tmp0, ptr addrspace(1) %out, align 4 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v4i16_(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1) %gep) + store <4 x i16> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v4f16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1) %gep) + store <4 x half> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep) +define amdgpu_kernel void @global_load_tr_b128_v4bf16(ptr addrspace(1) %addr, ptr addrspace(1) %out) { +bb: + %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4 + %tmp0 = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1) %gep) + store <4 x bfloat> %tmp0, ptr addrspace(1) %out, align 8 + ret void +} + declare i32 @llvm.amdgcn.ds.swizzle(i32, i32) #1 declare i32 @llvm.amdgcn.permlane16(i32, i32, i32, i32, i1, i1) #1 declare i32 @llvm.amdgcn.permlanex16(i32, i32, i32, i32, i1, i1) #1 @@ -125,5 +197,14 @@ declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) +declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) +declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16(ptr addrspace(1)) +declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16(ptr addrspace(1)) +declare i32 @llvm.amdgcn.global.load.tr.i32(ptr addrspace(1)) +declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16(ptr addrspace(1)) +declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16(ptr addrspace(1)) +declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16(ptr addrspace(1)) + attributes #0 = { nounwind convergent } attributes #1 = { nounwind readnone convergent } From c28ab6274322da693ea41a49e301c15b551c974a Mon Sep 17 00:00:00 2001 From: Chris Apple <14171107+cjappl@users.noreply.github.com> Date: Tue, 23 Jan 2024 14:42:59 -0800 Subject: [PATCH 703/843] [NFCI] Move SANITIZER_WEAK_IMPORT to sanitizer_common (#79208) SANITIZER_WEAK_IMPORT is useful for any call that needs to be conditionally linked in. This is currently used for the tsan_dispatch_interceptors, but can be used for other calls introduced in newer versions of MacOS. (such as `aligned_alloc` in this PR https://github.com/llvm/llvm-project/pull/79198). This PR moves the definition to a higher level so it can be used in other sanitizers. 
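For illustration, a hedged sketch of how a weak-imported symbol is typically
consumed (the declaration and call site below are hypothetical, not part of
this patch, and assume sanitizer_internal_defs.h is included):

```cpp
#include <cstddef>

// Hypothetical declaration: aligned_alloc only exists on newer OS versions,
// so it is declared via weak import rather than linked unconditionally.
SANITIZER_WEAK_IMPORT void *aligned_alloc(size_t alignment, size_t size);

void *AlignedAllocIfAvailable(size_t alignment, size_t size) {
  // With weak import, the symbol resolves to null at load time when the
  // running OS does not provide it, so the address check gates the call.
  if (&aligned_alloc)
    return aligned_alloc(alignment, size);
  return nullptr;
}
```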
---
 .../lib/sanitizer_common/sanitizer_internal_defs.h | 9 ++++++++-
 compiler-rt/lib/tsan/rtl/tsan_dispatch_defs.h      | 7 -------
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h
index 3809669dd48bb..992721757e88d 100644
--- a/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h
+++ b/compiler-rt/lib/sanitizer_common/sanitizer_internal_defs.h
@@ -35,13 +35,20 @@
 #  define SANITIZER_INTERFACE_ATTRIBUTE __declspec(dllexport)
 #endif
 # define SANITIZER_WEAK_ATTRIBUTE
+# define SANITIZER_WEAK_IMPORT
 #elif SANITIZER_GO
 # define SANITIZER_INTERFACE_ATTRIBUTE
 # define SANITIZER_WEAK_ATTRIBUTE
+# define SANITIZER_WEAK_IMPORT
 #else
 # define SANITIZER_INTERFACE_ATTRIBUTE __attribute__((visibility("default")))
 # define SANITIZER_WEAK_ATTRIBUTE  __attribute__((weak))
-#endif
+# if SANITIZER_APPLE
+#  define SANITIZER_WEAK_IMPORT extern "C" __attribute((weak_import))
+# else
+#  define SANITIZER_WEAK_IMPORT extern "C" SANITIZER_WEAK_ATTRIBUTE
+# endif  // SANITIZER_APPLE
+#endif  // SANITIZER_WINDOWS

//--------------------------- WEAK FUNCTIONS ---------------------------------//
// When working with weak functions, to simplify the code and make it more
diff --git a/compiler-rt/lib/tsan/rtl/tsan_dispatch_defs.h b/compiler-rt/lib/tsan/rtl/tsan_dispatch_defs.h
index 54c0b0ba4b409..8d38beb0b0a20 100644
--- a/compiler-rt/lib/tsan/rtl/tsan_dispatch_defs.h
+++ b/compiler-rt/lib/tsan/rtl/tsan_dispatch_defs.h
@@ -56,13 +56,6 @@ extern const dispatch_block_t _dispatch_data_destructor_munmap;
 # define DISPATCH_NOESCAPE
 #endif
 
-#if SANITIZER_APPLE
-# define SANITIZER_WEAK_IMPORT extern "C" __attribute((weak_import))
-#else
-# define SANITIZER_WEAK_IMPORT extern "C" __attribute((weak))
-#endif
-
-
 // Data types used in dispatch APIs
 typedef unsigned long size_t;
 typedef unsigned long uintptr_t;

From 575568de4166bf69e0a5bc68978580afbe936878 Mon Sep 17 00:00:00 2001
From: Aart Bik <39774503+aartbik@users.noreply.github.com>
Date: Tue, 23 Jan 2024 14:51:46 -0800
Subject: [PATCH 704/843] [mlir][sparse] adjust compression scheme for example
 (#79212)

---
 .../Dialect/SparseTensor/CPU/sparse_block3d.mlir | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
index df12a6e042dde..024e86b4f165b 100755
--- a/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
+++ b/mlir/test/Integration/Dialect/SparseTensor/CPU/sparse_block3d.mlir
@@ -32,9 +32,9 @@
 
 #Sparse1 = #sparse_tensor.encoding<{
   map = (i, j, k) -> (
-    i : compressed,
     j : compressed,
-    k : compressed
+    k : compressed,
+    i : dense
   )
 }>
 
@@ -91,7 +91,7 @@ module {
     // dense “fibers” in the i-dim, we end up with 8 stored entries.
     //
    // CHECK: 8
-    // CHECK-NEXT: ( 1, 5, 2, 6, 3, 7, 4, 8 )
+    // CHECK-NEXT: ( 1, 2, 3, 4, 5, 6, 7, 8 )
    //
    %na = sparse_tensor.number_of_entries %a : tensor<4x4x4xi32, #Sparse1>
    vector.print %na : index

From 750e90e4403df23d6b271afb90e6b4d463739965 Mon Sep 17 00:00:00 2001
From: Krzysztof Drewniak
Date: Tue, 23 Jan 2024 16:52:21 -0600
Subject: [PATCH 705/843] [mlir][ArithToAMDGPU] Add option for saturating
 truncation to fp8 (#74153)

Many machine-learning applications (and most software written at AMD)
expect the operation that truncates floats to 8-bit floats to be
saturating. That is, they expect `truncf 256.0 : f32 to f8E4M3FNUZ` to
yield `240.0`, not `NaN`, and similarly for negative numbers. However,
the underlying hardware instruction that can be used for this truncation
implements overflow-to-NaN semantics.

To enable handling this use case, we add the saturate-fp8-truncf option
to ArithToAMDGPU (off by default), which causes the requisite clamping
code to be emitted. Said clamping code ensures that Inf and NaN are
passed through exactly (and thus truncate to NaN).

Per review feedback, this commit refactors createScalarOrSplatConstant()
into the Arith dialect utilities and uses it in this code. It also fixes
the naming of existing patterns and switches from
vector.extractelement/insertelement to vector.extract/insert.
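As a scalar model of the requested semantics (an illustrative sketch, not the
MLIR rewrite itself; 240.0 is the largest finite f8E4M3FNUZ value, per the
example above):

```cpp
#include <algorithm>
#include <cmath>

float saturatingTruncInput(float x) {
  const float kMax = 240.0f; // largest finite f8E4M3FNUZ value
  if (std::isnan(x) || std::isinf(x))
    return x; // Inf and NaN pass through, and so still truncate to NaN
  return std::min(std::max(x, -kMax), kMax); // finite values clamp in range
}
```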
---
 .../Conversion/ArithToAMDGPU/ArithToAMDGPU.h  |   8 +-
 mlir/include/mlir/Conversion/Passes.td        |   6 +
 mlir/include/mlir/Dialect/Arith/Utils/Utils.h |  11 ++
 .../ArithToAMDGPU/ArithToAMDGPU.cpp           | 108 +++++++++++++-----
 .../Conversion/ArithToAMDGPU/CMakeLists.txt   |   1 +
 .../Arith/Transforms/EmulateWideInt.cpp       |  30 +----
 mlir/lib/Dialect/Arith/Utils/Utils.cpp        |  34 ++++++
 .../ArithToAMDGPU/8-bit-float-saturation.mlir |  54 +++++++++
 .../ArithToAMDGPU/8-bit-floats.mlir           |  33 +++---
 9 files changed, 208 insertions(+), 77 deletions(-)
 create mode 100644 mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir

diff --git a/mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h b/mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h
index 7f445fee5ba6b..78c79c915e060 100644
--- a/mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h
+++ b/mlir/include/mlir/Conversion/ArithToAMDGPU/ArithToAMDGPU.h
@@ -20,7 +20,13 @@ class Pass;
 #include "mlir/Conversion/Passes.h.inc"
 
 namespace arith {
-void populateArithToAMDGPUConversionPatterns(RewritePatternSet &patterns);
+/// Add patterns for rewriting `arith.extf` and `arith.truncf` on FP8 types
+/// to wrappers around AMDGPU-specific intrinsics. If `saturateFP8TruncF`
+/// is set, values outside the range of the destination type are clamped
+/// to the largest value of that type instead of being rewritten to Inf (aka
+/// NaN).
+void populateArithToAMDGPUConversionPatterns(RewritePatternSet &patterns,
+                                             bool saturateFP8TruncF);
 } // namespace arith
 } // namespace mlir
diff --git a/mlir/include/mlir/Conversion/Passes.td b/mlir/include/mlir/Conversion/Passes.td
index 3467e042c493e..ec0a6284fe97d 100644
--- a/mlir/include/mlir/Conversion/Passes.td
+++ b/mlir/include/mlir/Conversion/Passes.td
@@ -125,6 +125,12 @@ def ArithToAMDGPUConversionPass : Pass<"convert-arith-to-amdgpu"> {
  }];
 
  let dependentDialects = ["amdgpu::AMDGPUDialect", "vector::VectorDialect"];
+
+  let options = [
+    Option<"saturateFP8Truncf", "saturate-fp8-truncf", "bool",
+           /*default=*/"false",
+           "Use saturating truncation for 8-bit float types">,
+  ];
 }
 
 //===----------------------------------------------------------------------===//
diff --git a/mlir/include/mlir/Dialect/Arith/Utils/Utils.h b/mlir/include/mlir/Dialect/Arith/Utils/Utils.h
index 62a84ee7903d7..402bd196f0736 100644
--- a/mlir/include/mlir/Dialect/Arith/Utils/Utils.h
+++ b/mlir/include/mlir/Dialect/Arith/Utils/Utils.h
@@ -53,6 +53,17 @@ Value getValueOrCreateCastToIndexLike(OpBuilder &b, Location loc,
 Value convertScalarToDtype(OpBuilder &b, Location loc, Value operand,
                            Type toType, bool isUnsignedCast);
 
+/// Create a constant of type `type` at location `loc` whose value is `value`
+/// (an APInt or APFloat whose type must match the element type of `type`).
+/// If `type` is a shaped type, create a splat constant of the given value.
+/// Constants are folded if possible.
+Value createScalarOrSplatConstant(OpBuilder &builder, Location loc, Type type,
+                                  const APInt &value);
+Value createScalarOrSplatConstant(OpBuilder &builder, Location loc, Type type,
+                                  int64_t value);
+Value createScalarOrSplatConstant(OpBuilder &builder, Location loc, Type type,
+                                  const APFloat &value);
+
 /// Helper struct to build simple arithmetic quantities with minimal type
 /// inference support.
 struct ArithBuilder {
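A hedged usage sketch of the new helper (the wrapper function name is
hypothetical; the calls and the expected constants mirror the clampInput()
logic and saturation test in the conversion below):

```cpp
// One helper now covers scalar constants and dense splats alike.
static Value makeFp8ClampBound(PatternRewriter &rewriter, Location loc,
                               Type sourceType, const llvm::APFloat &bound) {
  // For sourceType == f16:           arith.constant -5.734400e+04 : f16
  // For sourceType == vector<2xf32>: arith.constant dense<-2.400000e+02> : vector<2xf32>
  return createScalarOrSplatConstant(rewriter, loc, sourceType, bound);
}
```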
diff --git a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
index 7785405eae67b..c625a302a3970 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
+++ b/mlir/lib/Conversion/ArithToAMDGPU/ArithToAMDGPU.cpp
@@ -10,6 +10,7 @@
 
 #include "mlir/Dialect/AMDGPU/IR/AMDGPUDialect.h"
 #include "mlir/Dialect/Arith/IR/Arith.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
 #include "mlir/IR/BuiltinTypes.h"
 #include "mlir/IR/PatternMatch.h"
@@ -34,17 +35,17 @@ struct ArithToAMDGPUConversionPass final
   void runOnOperation() override;
 };
 
-struct ExtfOnFloat8RewritePattern final
-    : public OpRewritePattern<arith::ExtFOp> {
-  using OpRewritePattern::OpRewritePattern;
+struct ExtFOnFloat8RewritePattern final : OpRewritePattern<arith::ExtFOp> {
+  using OpRewritePattern::OpRewritePattern;
 
   LogicalResult match(arith::ExtFOp op) const override;
   void rewrite(arith::ExtFOp op, PatternRewriter &rewriter) const override;
 };
 
-struct TruncfToFloat8RewritePattern final
-    : public OpRewritePattern<arith::TruncFOp> {
-  using OpRewritePattern::OpRewritePattern;
+struct TruncFToFloat8RewritePattern final : OpRewritePattern<arith::TruncFOp> {
+  bool saturateFP8 = false;
+  TruncFToFloat8RewritePattern(MLIRContext *ctx, bool saturateFP8)
+      : OpRewritePattern::OpRewritePattern(ctx), saturateFP8(saturateFP8) {}
 
   LogicalResult match(arith::TruncFOp op) const override;
   void rewrite(arith::TruncFOp op, PatternRewriter &rewriter) const override;
@@ -62,7 +63,7 @@ static Value castF32To(Type elementType, Value f32, Location loc,
   llvm_unreachable("The only 32-bit float type is f32");
 }
 
-LogicalResult ExtfOnFloat8RewritePattern::match(arith::ExtFOp op) const {
+LogicalResult ExtFOnFloat8RewritePattern::match(arith::ExtFOp op) const {
   Type inType = op.getIn().getType();
   if (auto inVecType = inType.dyn_cast<VectorType>()) {
     if (inVecType.isScalable())
@@ -75,7 +76,7 @@ LogicalResult ExtfOnFloat8RewritePattern::match(arith::ExtFOp op) const {
   return success(inType.isFloat8E5M2FNUZ() || inType.isFloat8E4M3FNUZ());
 }
 
-void ExtfOnFloat8RewritePattern::rewrite(arith::ExtFOp op,
+void ExtFOnFloat8RewritePattern::rewrite(arith::ExtFOp op,
                                          PatternRewriter &rewriter) const {
   Location loc = op.getLoc();
   Value in = op.getIn();
@@ -93,11 +94,13 @@ void ExtfOnFloat8RewritePattern::rewrite(arith::ExtFOp op,
   Value result =
       rewriter.createOrFold<vector::SplatOp>(loc, op.getOut().getType(), zero);
   if (inType.getShape().empty()) {
-    Value scalarIn = rewriter.create<vector::ExtractElementOp>(loc, in);
+    Value scalarIn =
+        rewriter.create<vector::ExtractOp>(loc, in, ArrayRef<int64_t>{});
     // Recurse to send the 0-D vector case to the 1-D vector case
     Value scalarExt =
         rewriter.create<arith::ExtFOp>(loc, outElemType, scalarIn);
-    result = rewriter.create<vector::InsertElementOp>(loc, scalarExt, zero);
+    result = rewriter.create<vector::InsertOp>(loc, scalarExt, zero,
+                                               ArrayRef<int64_t>{});
     return rewriter.replaceOp(op, result);
   }
   for (int64_t i = 0; i < numElements; i += 4) {
@@ -108,9 +111,7 @@ void ExtfOnFloat8RewritePattern::rewrite(arith::ExtFOp op,
       Value asFloat = rewriter.create<amdgpu::ExtPackedFp8Op>(
           loc, rewriter.getF32Type(), inSlice, j);
       Value asType = castF32To(outElemType, asFloat, loc, rewriter);
-      result = rewriter.create<vector::InsertElementOp>(
-          loc, asType, result,
-          rewriter.createOrFold<arith::ConstantIndexOp>(loc, i + j));
+      result = rewriter.create<vector::InsertOp>(loc, asType, result, i + j);
     }
   }
   rewriter.replaceOp(op, result);
@@ -127,7 +128,53 @@ static Value castToF32(Value value, Location loc, PatternRewriter &rewriter) {
   llvm_unreachable("The only 32-bit float type is f32");
 }
 
-LogicalResult TruncfToFloat8RewritePattern::match(arith::TruncFOp op) const {
+// If `in` is a finite value, clamp it between the maximum and minimum values
+// of `outElemType` so that subsequent conversion instructions don't
+// overflow those out-of-range values to NaN. These semantics are commonly
+// used in machine-learning contexts where failure to clamp would lead to
+// excessive NaN production.
+static Value clampInput(PatternRewriter &rewriter, Location loc,
+                        Type outElemType, Value source) {
+  Type sourceType = source.getType();
+  const llvm::fltSemantics &sourceSem =
+      cast<FloatType>(getElementTypeOrSelf(sourceType)).getFloatSemantics();
+  const llvm::fltSemantics &targetSem =
+      cast<FloatType>(outElemType).getFloatSemantics();
+
+  APFloat min = APFloat::getLargest(targetSem, /*Negative=*/true);
+  APFloat max = APFloat::getLargest(targetSem, /*Negative=*/false);
+  bool ignoredLosesInfo = false;
+  // We can ignore conversion failures here because this conversion promotes
+  // from a smaller type to a larger one - ex. there can be no loss of precision
+  // when casting fp8 to f16.
+  (void)min.convert(sourceSem, APFloat::rmNearestTiesToEven, &ignoredLosesInfo);
+  (void)max.convert(sourceSem, APFloat::rmNearestTiesToEven, &ignoredLosesInfo);
+
+  Value minCst = createScalarOrSplatConstant(rewriter, loc, sourceType, min);
+  Value maxCst = createScalarOrSplatConstant(rewriter, loc, sourceType, max);
+
+  Value inf = createScalarOrSplatConstant(
+      rewriter, loc, sourceType,
+      APFloat::getInf(sourceSem, /*Negative=*/false));
+  Value negInf = createScalarOrSplatConstant(
+      rewriter, loc, sourceType, APFloat::getInf(sourceSem, /*Negative=*/true));
+  Value isInf = rewriter.createOrFold<arith::CmpFOp>(
+      loc, arith::CmpFPredicate::OEQ, source, inf);
+  Value isNegInf = rewriter.createOrFold<arith::CmpFOp>(
+      loc, arith::CmpFPredicate::OEQ, source, negInf);
+  Value isNan = rewriter.createOrFold<arith::CmpFOp>(
+      loc, arith::CmpFPredicate::UNO, source, source);
+  Value isNonFinite = rewriter.create<arith::OrIOp>(
+      loc, rewriter.create<arith::OrIOp>(loc, isInf, isNegInf), isNan);
+
+  Value clampedBelow = rewriter.create<arith::MaximumFOp>(loc, source, minCst);
+  Value clamped = rewriter.create<arith::MinimumFOp>(loc, clampedBelow, maxCst);
+  Value res =
+      rewriter.create<arith::SelectOp>(loc, isNonFinite, source, clamped);
+  return res;
+}
+
+LogicalResult TruncFToFloat8RewritePattern::match(arith::TruncFOp op) const {
   Type outType = op.getOut().getType();
   if (auto outVecType = outType.dyn_cast<VectorType>()) {
     if (outVecType.isScalable())
@@ -137,22 +184,27 @@ LogicalResult TruncfToFloat8RewritePattern::match(arith::TruncFOp op) const {
       return failure();
     outType = outVecType.getElementType();
   }
+  auto inType = dyn_cast<FloatType>(getElementTypeOrSelf(op.getIn().getType()));
+  if (inType && inType.getWidth() <= 8 && saturateFP8)
+    // Conversion between 8-bit floats is not supported with truncation enabled.
+    return failure();
   return success(outType.isFloat8E5M2FNUZ() || outType.isFloat8E4M3FNUZ());
 }
 
-void TruncfToFloat8RewritePattern::rewrite(arith::TruncFOp op,
+void TruncFToFloat8RewritePattern::rewrite(arith::TruncFOp op,
                                            PatternRewriter &rewriter) const {
   Location loc = op.getLoc();
   Value in = op.getIn();
   Type outElemType = getElementTypeOrSelf(op.getOut().getType());
+  if (saturateFP8)
+    in = clampInput(rewriter, loc, outElemType, in);
   VectorType truncResType = VectorType::get(4, outElemType);
   if (!in.getType().isa<VectorType>()) {
     Value asFloat = castToF32(in, loc, rewriter);
     Value asF8s = rewriter.create<amdgpu::PackedTrunc2xFp8Op>(
         loc, truncResType, asFloat, /*sourceB=*/nullptr, 0,
         /*existing=*/nullptr);
-    Value result = rewriter.create<vector::ExtractElementOp>(
-        loc, asF8s, rewriter.createOrFold<arith::ConstantIndexOp>(loc, 0));
+    Value result = rewriter.create<vector::ExtractOp>(loc, asF8s, 0);
     return rewriter.replaceOp(op, result);
   }
   VectorType outType = op.getOut().getType().cast<VectorType>();
@@ -161,11 +213,13 @@ void TruncfToFloat8RewritePattern::rewrite(arith::TruncFOp op,
       loc, outElemType, rewriter.getFloatAttr(outElemType, 0.0));
   Value result = rewriter.createOrFold<vector::SplatOp>(loc, outType, zero);
   if (outType.getShape().empty()) {
-    Value scalarIn = rewriter.create<vector::ExtractElementOp>(loc, in);
+    Value scalarIn =
+        rewriter.create<vector::ExtractOp>(loc, in, ArrayRef<int64_t>{});
     // Recurse to send the 0-D vector case to the 1-D vector case
     Value scalarTrunc =
         rewriter.create<arith::TruncFOp>(loc, outElemType, scalarIn);
-    result = rewriter.create<vector::InsertElementOp>(loc, scalarTrunc, zero);
+    result = rewriter.create<vector::InsertOp>(loc, scalarTrunc, zero,
+                                               ArrayRef<int64_t>{});
     return rewriter.replaceOp(op, result);
  }
 
@@ -173,14 +227,11 @@
     int64_t elemsThisOp = std::min(numElements, i + 4) - i;
     Value thisResult = nullptr;
     for (int64_t j = 0; j < elemsThisOp; j += 2) {
-      Value elemA = rewriter.create<vector::ExtractElementOp>(
-          loc, in, rewriter.create<arith::ConstantIndexOp>(loc, i + j));
+      Value elemA = rewriter.create<vector::ExtractOp>(loc, in, i + j);
       Value asFloatA = castToF32(elemA, loc, rewriter);
       Value asFloatB = nullptr;
       if (j + 1 < elemsThisOp) {
-        Value elemB = rewriter.create<vector::ExtractElementOp>(
-            loc, in,
-            rewriter.createOrFold<arith::ConstantIndexOp>(loc, i + j + 1));
+        Value elemB = rewriter.create<vector::ExtractOp>(loc, in, i + j + 1);
         asFloatB = castToF32(elemB, loc, rewriter);
       }
       thisResult = rewriter.create<amdgpu::PackedTrunc2xFp8Op>(
@@ -196,15 +247,16 @@
 }
 
 void mlir::arith::populateArithToAMDGPUConversionPatterns(
-    RewritePatternSet &patterns) {
-  patterns.add<ExtfOnFloat8RewritePattern, TruncfToFloat8RewritePattern>(
-      patterns.getContext());
+    RewritePatternSet &patterns, bool saturateFP8TruncF) {
+  patterns.add<ExtFOnFloat8RewritePattern>(patterns.getContext());
+  patterns.add<TruncFToFloat8RewritePattern>(patterns.getContext(),
+                                             saturateFP8TruncF);
 }
 
 void ArithToAMDGPUConversionPass::runOnOperation() {
   Operation *op = getOperation();
   RewritePatternSet patterns(op->getContext());
-  arith::populateArithToAMDGPUConversionPatterns(patterns);
+  arith::populateArithToAMDGPUConversionPatterns(patterns, saturateFP8Truncf);
   if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns))))
     return signalPassFailure();
 }
diff --git a/mlir/lib/Conversion/ArithToAMDGPU/CMakeLists.txt b/mlir/lib/Conversion/ArithToAMDGPU/CMakeLists.txt
index 359015b6f86ad..e2c951b0b34d8 100644
--- a/mlir/lib/Conversion/ArithToAMDGPU/CMakeLists.txt
+++ b/mlir/lib/Conversion/ArithToAMDGPU/CMakeLists.txt
@@ -13,6 +13,7 @@ add_mlir_conversion_library(MLIRArithToAMDGPU
  LINK_LIBS PUBLIC
    MLIRAMDGPUDialect
    MLIRArithDialect
+    MLIRArithUtils
    MLIRVectorDialect
    MLIRPass
    MLIRTransforms
diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp
index 9e783c51c63d1..8a4080ea01970 100644
--- a/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/EmulateWideInt.cpp
@@ -10,6 +10,7 @@
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Arith/Transforms/WideIntEmulationConverter.h"
+#include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Func/Transforms/FuncConversions.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -58,35 +59,6 @@ static Type reduceInnermostDim(VectorType type) {
   return VectorType::get(newShape, type.getElementType());
 }
 
-/// Returns a constant of integer of vector type filled with (repeated) `value`.
-static Value createScalarOrSplatConstant(ConversionPatternRewriter &rewriter,
-                                         Location loc, Type type,
-                                         const APInt &value) {
-  TypedAttr attr;
-  if (dyn_cast<IntegerType>(type)) {
-    attr = rewriter.getIntegerAttr(type, value);
-  } else {
-    auto vecTy = cast<VectorType>(type);
-    attr = SplatElementsAttr::get(vecTy, value);
-  }
-
-  return rewriter.create<arith::ConstantOp>(loc, attr);
-}
-
-/// Returns a constant of integer of vector type filled with (repeated) `value`.
-static Value createScalarOrSplatConstant(ConversionPatternRewriter &rewriter,
-                                         Location loc, Type type,
-                                         int64_t value) {
-  unsigned elementBitWidth = 0;
-  if (auto intTy = dyn_cast<IntegerType>(type))
-    elementBitWidth = intTy.getWidth();
-  else
-    elementBitWidth = cast<VectorType>(type).getElementTypeBitWidth();
-
-  return createScalarOrSplatConstant(rewriter, loc, type,
-                                     APInt(elementBitWidth, value));
-}
-
 /// Extracts the `input` vector slice with elements at the last dimension offset
 /// by `lastOffset`. Returns a value of vector type with the last dimension
 /// reduced to x1 or fully scalarized, e.g.:
diff --git a/mlir/lib/Dialect/Arith/Utils/Utils.cpp b/mlir/lib/Dialect/Arith/Utils/Utils.cpp
index 0f39c24fb917d..bf274d4ae27ed 100644
--- a/mlir/lib/Dialect/Arith/Utils/Utils.cpp
+++ b/mlir/lib/Dialect/Arith/Utils/Utils.cpp
@@ -197,6 +197,40 @@ mlir::getValueOrCreateConstantIndexOp(OpBuilder &b, Location loc,
       }));
 }
 
+Value mlir::createScalarOrSplatConstant(OpBuilder &builder, Location loc,
+                                        Type type, const APInt &value) {
+  TypedAttr attr;
+  if (isa<IntegerType>(type)) {
+    attr = builder.getIntegerAttr(type, value);
+  } else {
+    auto vecTy = cast<ShapedType>(type);
+    attr = SplatElementsAttr::get(vecTy, value);
+  }
+
+  return builder.create<arith::ConstantOp>(loc, attr);
+}
+
+Value mlir::createScalarOrSplatConstant(OpBuilder &builder, Location loc,
+                                        Type type, int64_t value) {
+  unsigned elementBitWidth = 0;
+  if (auto intTy = dyn_cast<IntegerType>(type))
+    elementBitWidth = intTy.getWidth();
+  else
+    elementBitWidth = cast<ShapedType>(type).getElementTypeBitWidth();
+
+  return createScalarOrSplatConstant(builder, loc, type,
+                                     APInt(elementBitWidth, value));
+}
+
+Value mlir::createScalarOrSplatConstant(OpBuilder &builder, Location loc,
+                                        Type type, const APFloat &value) {
+  if (isa<FloatType>(type))
+    return builder.createOrFold<arith::ConstantOp>(
+        loc, type, builder.getFloatAttr(type, value));
+  TypedAttr splat = SplatElementsAttr::get(cast<ShapedType>(type), value);
+  return builder.createOrFold<arith::ConstantOp>(loc, type, splat);
+}
+
 Value ArithBuilder::_and(Value lhs, Value rhs) {
   return b.create<arith::AndIOp>(loc, lhs, rhs);
 }
diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
new file mode 100644
index 0000000000000..c7f39440a349b
--- /dev/null
+++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-float-saturation.mlir
@@ -0,0 +1,54 @@
+// RUN: mlir-opt --split-input-file %s \
+// RUN:   --pass-pipeline='builtin.module(func.func(convert-arith-to-amdgpu{saturate-fp8-truncf=true}))' \
+// RUN: |
FileCheck %s + +// CHECK-LABEL: func.func @scalar_trunc +// CHECK-SAME: ([[V:%.+]]: f16) +// CHECK-DAG: [[CMin:%.+]] = arith.constant -5.734400e+04 : f16 +// CHECK-DAG: [[CMax:%.+]] = arith.constant 5.734400e+04 : f16 +// CHECK-DAG: [[CInf:%.+]] = arith.constant 0x7C00 : f16 +// CHECK-DAG: [[CNegInf:%.+]] = arith.constant 0xFC00 : f16 +// CHECK: [[ISINF:%.+]] = arith.cmpf oeq, [[V]], [[CInf]] +// CHECK: [[ISNEGINF:%.+]] = arith.cmpf oeq, [[V]], [[CNegInf]] +// CHECK: [[ISNAN:%.+]] = arith.cmpf uno, [[V]], [[V]] +// CHECK: [[ISNONFINITE_1:%.+]] = arith.ori [[ISINF]], [[ISNEGINF]] +// CHECK: [[ISNONFINITE:%.+]] = arith.ori [[ISNONFINITE_1]], [[ISNAN]] +// CHECK: [[CLAMPEDBELOW:%.+]] = arith.maximumf [[V]], [[CMin]] +// CHECK: [[CLAMPED:%.+]] = arith.minimumf [[CLAMPEDBELOW]], [[CMax]] +// CHECK: [[SATURATED:%.+]] = arith.select [[ISNONFINITE]], [[V]], [[CLAMPED]] +// CHECK: [[FLOAT:%.+]] = arith.extf [[SATURATED]] : f16 to f32 +// CHECK: [[TRUNCV:%.+]] = amdgpu.packed_trunc_2xfp8 [[FLOAT]], undef into undef[word 0] : f32 to vector<4xf8E5M2FNUZ> +// CHECK: [[W:%.+]] = vector.extract [[TRUNCV]][0] : f8E5M2FNUZ from vector<4xf8E5M2FNUZ> +// CHECK: return [[W]] : f8E5M2FNUZ +func.func @scalar_trunc(%v: f16) -> f8E5M2FNUZ { + %w = arith.truncf %v : f16 to f8E5M2FNUZ + return %w : f8E5M2FNUZ +} + +// No 0-D test because arith.truncf hasn't been extended to support it. + +// ----- + +// CHECK-LABEL: func.func @vector_trunc +// CHECK-SAME: ([[V:%.+]]: vector<2xf32>) -> vector<2xf8E4M3FNUZ> { +// CHECK-DAG: [[CMin:%.+]] = arith.constant dense<-2.400000e+02> : vector<2xf32> +// CHECK-DAG: [[CMax:%.+]] = arith.constant dense<2.400000e+02> : vector<2xf32> +// CHECK-DAG: [[CInf:%.+]] = arith.constant dense<0x7F800000> : vector<2xf32> +// CHECK-DAG: [[CNegInf:%.+]] = arith.constant dense<0xFF800000> : vector<2xf32> +// CHECK: [[ISINF:%.+]] = arith.cmpf oeq, [[V]], [[CInf]] +// CHECK: [[ISNEGINF:%.+]] = arith.cmpf oeq, [[V]], [[CNegInf]] +// CHECK: [[ISNAN:%.+]] = arith.cmpf uno, [[V]], [[V]] +// CHECK: [[ISNONFINITE_1:%.+]] = arith.ori [[ISINF]], [[ISNEGINF]] +// CHECK: [[ISNONFINITE:%.+]] = arith.ori [[ISNONFINITE_1]], [[ISNAN]] +// CHECK: [[CLAMPEDBELOW:%.+]] = arith.maximumf [[V]], [[CMin]] +// CHECK: [[CLAMPED:%.+]] = arith.minimumf [[CLAMPEDBELOW]], [[CMax]] +// CHECK: [[SATURATED:%.+]] = arith.select [[ISNONFINITE]], [[V]], [[CLAMPED]] +// CHECK: [[F0:%.+]] = vector.extract [[SATURATED]][0] +// CHECK: [[F1:%.+]] = vector.extract [[SATURATED]][1] +// CHECK: [[W0:%.+]] = amdgpu.packed_trunc_2xfp8 [[F0]], [[F1]] into undef[word 0] : f32 to vector<4xf8E4M3FNUZ> +// CHECK: [[W:%.+]] = vector.extract_strided_slice [[W0]] {offsets = [0], sizes = [2], strides = [1]} : vector<4xf8E4M3FNUZ> to vector<2xf8E4M3FNUZ> +// CHECK: return [[W]] : vector<2xf8E4M3FNUZ> +func.func @vector_trunc_short(%v: vector<2xf32>) -> vector<2xf8E4M3FNUZ> { + %w = arith.truncf %v : vector<2xf32> to vector<2xf8E4M3FNUZ> + return %w : vector<2xf8E4M3FNUZ> +} diff --git a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir index a6c11d022e2c1..159a2f02f0560 100644 --- a/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir +++ b/mlir/test/Conversion/ArithToAMDGPU/8-bit-floats.mlir @@ -17,14 +17,12 @@ func.func @scalar_ext(%v: f8E5M2FNUZ) -> f16 { // CHECK-LABEL: func.func @vector_ext_short // CHECK-SAME: ([[V:%.+]]: vector<2xf8E5M2FNUZ>) // CHECK-DAG: [[ZEROES:%.+]] = arith.constant dense<0.000000e+00> : vector<2xf64> -// CHECK-DAG: [[C0:%.+]] = arith.constant 0 : index -// 
CHECK-DAG: [[C1:%.+]] = arith.constant 1 : index // CHECK: [[FLOAT0:%.+]] = amdgpu.ext_packed_fp8 [[V]][0] : vector<2xf8E5M2FNUZ> to f32 // CHECK: [[EXT0:%.+]] = arith.extf [[FLOAT0]] : f32 to f64 -// CHECK: [[W0:%.+]] = vector.insertelement [[EXT0]], [[ZEROES]]{{\[}}[[C0]] +// CHECK: [[W0:%.+]] = vector.insert [[EXT0]], [[ZEROES]] [0] // CHECK: [[FLOAT1:%.+]] = amdgpu.ext_packed_fp8 [[V]][1] : vector<2xf8E5M2FNUZ> to f32 // CHECK: [[EXT1:%.+]] = arith.extf [[FLOAT1]] -// CHECK: [[W1:%.+]] = vector.insertelement [[EXT1]], [[W0]]{{\[}}[[C1]] +// CHECK: [[W1:%.+]] = vector.insert [[EXT1]], [[W0]] [1] // CHECK: return [[W1]] : vector<2xf64> func.func @vector_ext_short(%v: vector<2xf8E5M2FNUZ>) -> vector<2xf64> { @@ -38,27 +36,27 @@ func.func @vector_ext_short(%v: vector<2xf8E5M2FNUZ>) -> vector<2xf64> { // CHECK-SAME: ([[V:%.+]]: vector<9xf8E4M3FNUZ>) // CHECK: [[V0:%.+]] = vector.extract_strided_slice [[V]] {offsets = [0], sizes = [4], strides = [1]} // CHECK: [[F0:%.+]] = amdgpu.ext_packed_fp8 [[V0]][0] -// CHECK: [[W0:%.+]] = vector.insertelement [[F0]] +// CHECK: [[W0:%.+]] = vector.insert [[F0]] // CHECK: [[F1:%.+]] = amdgpu.ext_packed_fp8 [[V0]][1] -// CHECK: [[W1:%.+]] = vector.insertelement [[F1]], [[W0]] +// CHECK: [[W1:%.+]] = vector.insert [[F1]], [[W0]] // CHECK: [[F2:%.+]] = amdgpu.ext_packed_fp8 [[V0]][2] -// CHECK: [[W2:%.+]] = vector.insertelement [[F2]], [[W1]] +// CHECK: [[W2:%.+]] = vector.insert [[F2]], [[W1]] // CHECK: [[F3:%.+]] = amdgpu.ext_packed_fp8 [[V0]][3] -// CHECK: [[W3:%.+]] = vector.insertelement [[F3]], [[W2]] +// CHECK: [[W3:%.+]] = vector.insert [[F3]], [[W2]] // CHECK: [[V1:%.+]] = vector.extract_strided_slice [[V]] {offsets = [4], sizes = [4], strides = [1]} : vector<9xf8E4M3FNUZ> to vector<4xf8E4M3FNUZ> // CHECK: [[F4:%.+]] = amdgpu.ext_packed_fp8 [[V1]][0] -// CHECK: [[W4:%.+]] = vector.insertelement [[F4]], [[W3]] +// CHECK: [[W4:%.+]] = vector.insert [[F4]], [[W3]] // CHECK: [[F5:%.+]] = amdgpu.ext_packed_fp8 [[V1]][1] -// CHECK: [[W5:%.+]] = vector.insertelement [[F5]], [[W4]] +// CHECK: [[W5:%.+]] = vector.insert [[F5]], [[W4]] // CHECK: [[F6:%.+]] = amdgpu.ext_packed_fp8 [[V1]][2] -// CHECK: [[W6:%.+]] = vector.insertelement [[F6]], [[W5]] +// CHECK: [[W6:%.+]] = vector.insert [[F6]], [[W5]] // CHECK: [[F7:%.+]] = amdgpu.ext_packed_fp8 [[V1]][3] -// CHECK: [[W7:%.+]] = vector.insertelement [[F7]], [[W6]] +// CHECK: [[W7:%.+]] = vector.insert [[F7]], [[W6]] // CHECK: [[V2:%.+]] = vector.extract_strided_slice [[V]] {offsets = [8], sizes = [1], strides = [1]} : vector<9xf8E4M3FNUZ> to vector<1xf8E4M3FNUZ> // CHECK: [[F8:%.+]] = amdgpu.ext_packed_fp8 [[V2]][0] -// CHECK: [[W8:%.+]] = vector.insertelement [[F8]], [[W7]] +// CHECK: [[W8:%.+]] = vector.insert [[F8]], [[W7]] // CHECK: return [[W8]] func.func @vector_ext_long(%v: vector<9xf8E4M3FNUZ>) -> vector<9xf32> { %w = arith.extf %v : vector<9xf8E4M3FNUZ> to vector<9xf32> @@ -69,10 +67,9 @@ func.func @vector_ext_long(%v: vector<9xf8E4M3FNUZ>) -> vector<9xf32> { // CHECK-LABEL: func.func @scalar_trunc // CHECK-SAME: ([[V:%.+]]: f16) -// CHECK: [[C0:%.+]] = arith.constant 0 : index // CHECK: [[FLOAT:%.+]] = arith.extf [[V]] : f16 to f32 // CHECK: [[TRUNCV:%.+]] = amdgpu.packed_trunc_2xfp8 [[FLOAT]], undef into undef[word 0] : f32 to vector<4xf8E5M2FNUZ> -// CHECK: [[W:%.+]] = vector.extractelement [[TRUNCV]]{{\[}}[[C0]] : index] : vector<4xf8E5M2FNUZ> +// CHECK: [[W:%.+]] = vector.extract [[TRUNCV]][0] : f8E5M2FNUZ from vector<4xf8E5M2FNUZ> // CHECK: return [[W]] : f8E5M2FNUZ func.func @scalar_trunc(%v: 
f16) -> f8E5M2FNUZ { %w = arith.truncf %v : f16 to f8E5M2FNUZ @@ -85,11 +82,9 @@ func.func @scalar_trunc(%v: f16) -> f8E5M2FNUZ { // CHECK-LABEL: func.func @vector_trunc_short // CHECK-SAME: ([[V:%.+]]: vector<2xf64>) -> vector<2xf8E5M2FNUZ> { -// CHECK-DAG: [[C0:%.+]] = arith.constant 0 : index -// CHECK-DAG: [[C1:%.+]] = arith.constant 1 : index -// CHECK: [[V0:%.+]] = vector.extractelement [[V]]{{\[}}[[C0]] : index] +// CHECK: [[V0:%.+]] = vector.extract [[V]][0] // CHECK: [[F0:%.+]] = arith.truncf [[V0]] : f64 to f32 -// CHECK: [[V1:%.+]] = vector.extractelement [[V]]{{\[}}[[C1]] : index] +// CHECK: [[V1:%.+]] = vector.extract [[V]][1] // CHECK: [[F1:%.+]] = arith.truncf [[V1]] : f64 to f32 // CHECK: [[W0:%.+]] = amdgpu.packed_trunc_2xfp8 [[F0]], [[F1]] into undef[word 0] : f32 to vector<4xf8E5M2FNUZ> // CHECK: [[W:%.+]] = vector.extract_strided_slice [[W0]] {offsets = [0], sizes = [2], strides = [1]} : vector<4xf8E5M2FNUZ> to vector<2xf8E5M2FNUZ> From 7fc25928233c133a4af1dadf0e060fb5d42ebd4e Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Tue, 23 Jan 2024 10:43:37 +0000 Subject: [PATCH 706/843] [DebugInfo][RemoveDIs] "Final" cleanup for non-instr debug-info (#79121) Here's a raft of minor fixes for the RemoveDIs project that's replacing dbg.value intrinsics with DPValue objects, all IMO trivial: * When inserting functions or blocks and calling setIsNewDbgInfoFormat, do that after setting the Parent pointer, just in case conversion from (or to) dbg.value mode is triggered. * When transferring DPValues from an empty range in a splice call, don't transfer if there are no DPValues attached to the source block at all. * stripNonLineTableDebugInfo should drop DPValues. * In insertBefore, don't try to transfer DPValues if there aren't any. --- llvm/include/llvm/IR/Function.h | 3 ++- llvm/lib/IR/BasicBlock.cpp | 16 +++++++++++----- llvm/lib/IR/DebugInfo.cpp | 3 +++ llvm/lib/IR/Instruction.cpp | 6 +++--- llvm/test/DebugInfo/salvage-limit-expr-size.ll | 1 + .../strip-nonlinetable-debuginfo-localvars.ll | 1 + 6 files changed, 21 insertions(+), 9 deletions(-) diff --git a/llvm/include/llvm/IR/Function.h b/llvm/include/llvm/IR/Function.h index 6ac768b51395c..cb87a44980321 100644 --- a/llvm/include/llvm/IR/Function.h +++ b/llvm/include/llvm/IR/Function.h @@ -722,8 +722,9 @@ class LLVM_EXTERNAL_VISIBILITY Function : public GlobalObject, /// Insert \p BB in the basic block list at \p Position. \Returns an iterator /// to the newly inserted BB. Function::iterator insert(Function::iterator Position, BasicBlock *BB) { + Function::iterator FIt = BasicBlocks.insert(Position, BB); BB->setIsNewDbgInfoFormat(IsNewDbgInfoFormat); - return BasicBlocks.insert(Position, BB); + return FIt; } /// Transfer all blocks from \p FromF to this function at \p ToIt. 
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp index 15b7e50fe6eca..dca5283283847 100644 --- a/llvm/lib/IR/BasicBlock.cpp +++ b/llvm/lib/IR/BasicBlock.cpp @@ -237,12 +237,12 @@ void BasicBlock::insertInto(Function *NewParent, BasicBlock *InsertBefore) { assert(NewParent && "Expected a parent"); assert(!Parent && "Already has a parent"); - setIsNewDbgInfoFormat(NewParent->IsNewDbgInfoFormat); - if (InsertBefore) NewParent->insert(InsertBefore->getIterator(), this); else NewParent->insert(NewParent->end(), this); + + setIsNewDbgInfoFormat(NewParent->IsNewDbgInfoFormat); } BasicBlock::~BasicBlock() { @@ -821,9 +821,15 @@ void BasicBlock::spliceDebugInfoEmptyBlock(BasicBlock::iterator Dest, // There are instructions in this block; if the First iterator was // with begin() / getFirstInsertionPt() then the caller intended debug-info - // at the start of the block to be transferred. - if (!Src->empty() && First == Src->begin() && ReadFromHead) - Dest->DbgMarker->absorbDebugValues(*First->DbgMarker, InsertAtHead); + // at the start of the block to be transferred. Return otherwise. + if (Src->empty() || First != Src->begin() || !ReadFromHead) + return; + + // Is there actually anything to transfer? + if (!First->hasDbgValues()) + return; + + createMarker(Dest)->absorbDebugValues(*First->DbgMarker, InsertAtHead); return; } diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp index 53cbf01918969..d8c1b0d534f61 100644 --- a/llvm/lib/IR/DebugInfo.cpp +++ b/llvm/lib/IR/DebugInfo.cpp @@ -892,6 +892,9 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) { // Strip heapallocsite attachments, they point into the DIType system. if (I.hasMetadataOtherThanDebugLoc()) I.setMetadata("heapallocsite", nullptr); + + // Strip any DPValues attached. + I.dropDbgValues(); } } } diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp index 717e33f1857b8..d7bf1447921fe 100644 --- a/llvm/lib/IR/Instruction.cpp +++ b/llvm/lib/IR/Instruction.cpp @@ -146,9 +146,9 @@ void Instruction::insertBefore(BasicBlock &BB, bool InsertAtHead = InsertPos.getHeadBit(); if (!InsertAtHead) { DPMarker *SrcMarker = BB.getMarker(InsertPos); - if (!SrcMarker) - SrcMarker = BB.createMarker(InsertPos); - DbgMarker->absorbDebugValues(*SrcMarker, false); + // If there's no source marker, InsertPos is very likely end(). + if (SrcMarker) + DbgMarker->absorbDebugValues(*SrcMarker, false); } // If we're inserting a terminator, check if we need to flush out diff --git a/llvm/test/DebugInfo/salvage-limit-expr-size.ll b/llvm/test/DebugInfo/salvage-limit-expr-size.ll index 39bf14b062e1e..94e451327b214 100644 --- a/llvm/test/DebugInfo/salvage-limit-expr-size.ll +++ b/llvm/test/DebugInfo/salvage-limit-expr-size.ll @@ -1,4 +1,5 @@ ; RUN: opt %s -passes=dce -S | FileCheck %s +; RUN: opt %s -passes=dce -S --try-experimental-debuginfo-iterators | FileCheck %s ;; Tests that a DIExpression will only be salvaged up to a certain length, and ;; will produce an undef value if an expression would need to exceed that length. 
diff --git a/llvm/test/Transforms/Util/strip-nonlinetable-debuginfo-localvars.ll b/llvm/test/Transforms/Util/strip-nonlinetable-debuginfo-localvars.ll index 83887397459db..19558a227b9f7 100644 --- a/llvm/test/Transforms/Util/strip-nonlinetable-debuginfo-localvars.ll +++ b/llvm/test/Transforms/Util/strip-nonlinetable-debuginfo-localvars.ll @@ -1,4 +1,5 @@ ; RUN: opt -S -passes=strip-nonlinetable-debuginfo %s -o - | FileCheck %s +; RUN: opt -S -passes=strip-nonlinetable-debuginfo %s -o - --try-experimental-debuginfo-iterators | FileCheck %s ; CHECK: define void @f() !dbg ![[F:[0-9]+]] define void @f() !dbg !4 { entry: From a16f81f5e3313e88f96de35e5edfe8bee463d308 Mon Sep 17 00:00:00 2001 From: Thurston Dang Date: Tue, 23 Jan 2024 22:56:22 +0000 Subject: [PATCH 707/843] Revert "[ASan][libc++] Turn on ASan annotations for short strings (#79049)" This reverts commit cb528ec5e6331ce207c7b835d7ab963bd5e13af7. Reason: buildbot breakage (https://lab.llvm.org/buildbot/#/builders/5/builds/40364): SUMMARY: AddressSanitizer: container-overflow /b/sanitizer-x86_64-linux-fast/build/libcxx_build_asan_ubsan/include/c++/v1/string:1870:29 in __get_long_pointer --- libcxx/include/string | 14 +- .../asan_deque_integration.pass.cpp | 182 ------------------ .../strings/basic.string/asan_short.pass.cpp | 56 ------ .../asan_vector_integration.pass.cpp | 182 ------------------ libcxx/test/support/asan_testing.h | 29 ++- 5 files changed, 34 insertions(+), 429 deletions(-) delete mode 100644 libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp delete mode 100644 libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp delete mode 100644 libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp diff --git a/libcxx/include/string b/libcxx/include/string index 4116f350a8047..e97139206d4fa 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -659,6 +659,7 @@ _LIBCPP_PUSH_MACROS #else # define _LIBCPP_STRING_INTERNAL_MEMORY_ACCESS #endif +#define _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED false _LIBCPP_BEGIN_NAMESPACE_STD @@ -1895,17 +1896,22 @@ private: #endif } + // ASan: short string is poisoned if and only if this function returns true. 
+ _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 bool __asan_short_string_is_annotated() const _NOEXCEPT { + return _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED && !__libcpp_is_constant_evaluated(); + } + _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_new(size_type __current_size) const _NOEXCEPT { (void) __current_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated()) + if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + capacity() + 1, data() + __current_size + 1); #endif } _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_delete() const _NOEXCEPT { #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated()) + if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + size() + 1, data() + capacity() + 1); #endif } @@ -1913,7 +1919,7 @@ private: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_increase(size_type __n) const _NOEXCEPT { (void) __n; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated()) + if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + size() + 1, data() + size() + 1 + __n); #endif } @@ -1921,7 +1927,7 @@ private: _LIBCPP_HIDE_FROM_ABI _LIBCPP_CONSTEXPR_SINCE_CXX20 void __annotate_shrink(size_type __old_size) const _NOEXCEPT { (void) __old_size; #if !defined(_LIBCPP_HAS_NO_ASAN) && defined(_LIBCPP_INSTRUMENTED_WITH_ASAN) - if (!__libcpp_is_constant_evaluated()) + if (!__libcpp_is_constant_evaluated() && (__asan_short_string_is_annotated() || __is_long())) __annotate_contiguous_container(data() + __old_size + 1, data() + size() + 1); #endif } diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp deleted file mode 100644 index 1205190b3a6e1..0000000000000 --- a/libcxx/test/libcxx/containers/strings/basic.string/asan_deque_integration.pass.cpp +++ /dev/null @@ -1,182 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// REQUIRES: asan -// UNSUPPORTED: c++03 - -#include -#include -#include -#include -#include "test_macros.h" -#include "asan_testing.h" -#include "min_allocator.h" - -// This tests exists to check if strings work well with deque, as those -// may be partialy annotated, we cannot simply call -// is_double_ended_contiguous_container_asan_correct, as it assumes that -// object memory inside is not annotated, so we check everything in a more careful way. - -template -void verify_inside(D const& d) { - for (size_t i = 0; i < d.size(); ++i) { - assert(is_string_asan_correct(d[i])); - } -} - -template -S get_s(char c) { - S s; - for (size_t i = 0; i < N; ++i) - s.push_back(c); - - return s; -} - -template -void test_string() { - size_t const N = sizeof(S) < 256 ? 
(4096 / sizeof(S)) : 16; - - { - C d1a(1), d1b(N), d1c(N + 1), d1d(5 * N); - verify_inside(d1a); - verify_inside(d1b); - verify_inside(d1c); - verify_inside(d1d); - } - { - C d2; - for (size_t i = 0; i < 3 * N + 2; ++i) { - d2.push_back(get_s(i % 10 + 'a')); - verify_inside(d2); - d2.push_back(get_s(i % 10 + 'b')); - verify_inside(d2); - - d2.pop_front(); - verify_inside(d2); - } - } - { - C d3; - for (size_t i = 0; i < 3 * N + 2; ++i) { - d3.push_front(get_s(i % 10 + 'a')); - verify_inside(d3); - d3.push_front(get_s(i % 10 + 'b')); - verify_inside(d3); - - d3.pop_back(); - verify_inside(d3); - } - } - { - C d4; - for (size_t i = 0; i < 3 * N + 2; ++i) { - // When there is no SSO, all elements inside should not be poisoned, - // so we can verify deque poisoning. - d4.push_front(get_s(i % 10 + 'a')); - verify_inside(d4); - assert(is_double_ended_contiguous_container_asan_correct(d4)); - d4.push_back(get_s(i % 10 + 'b')); - verify_inside(d4); - assert(is_double_ended_contiguous_container_asan_correct(d4)); - } - } - { - C d5; - for (size_t i = 0; i < 3 * N + 2; ++i) { - // In d4 we never had poisoned memory inside deque. - // Here we start with SSO, so part of the inside of the container, - // will be poisoned. - d5.push_front(S()); - verify_inside(d5); - } - for (size_t i = 0; i < d5.size(); ++i) { - // We change the size to have long string. - // Memory owne by deque should not be poisoned by string. - d5[i].resize(100); - verify_inside(d5); - } - - assert(is_double_ended_contiguous_container_asan_correct(d5)); - - d5.erase(d5.begin() + 2); - verify_inside(d5); - - d5.erase(d5.end() - 2); - verify_inside(d5); - - assert(is_double_ended_contiguous_container_asan_correct(d5)); - } - { - C d6a; - assert(is_double_ended_contiguous_container_asan_correct(d6a)); - - C d6b(N + 2, get_s('a')); - d6b.push_front(get_s('b')); - while (!d6b.empty()) { - d6b.pop_back(); - assert(is_double_ended_contiguous_container_asan_correct(d6b)); - } - - C d6c(N + 2, get_s('c')); - while (!d6c.empty()) { - d6c.pop_back(); - assert(is_double_ended_contiguous_container_asan_correct(d6c)); - } - } - { - C d7(9 * N + 2); - - d7.insert(d7.begin() + 1, S()); - verify_inside(d7); - - d7.insert(d7.end() - 3, S()); - verify_inside(d7); - - d7.insert(d7.begin() + 2 * N, get_s('a')); - verify_inside(d7); - - d7.insert(d7.end() - 2 * N, get_s('b')); - verify_inside(d7); - - d7.insert(d7.begin() + 2 * N, 3 * N, get_s('c')); - verify_inside(d7); - - // It may not be short for big element types, but it will be checked correctly: - d7.insert(d7.end() - 2 * N, 3 * N, get_s('d')); - verify_inside(d7); - - d7.erase(d7.begin() + 2); - verify_inside(d7); - - d7.erase(d7.end() - 2); - verify_inside(d7); - } -} - -template -void test_container() { - test_string>, S>(); - test_string>, S>(); - test_string>, S>(); -} - -int main(int, char**) { - // Those tests support only types based on std::basic_string. 
- test_container(); - test_container(); -#if TEST_STD_VER >= 11 - test_container(); - test_container(); -#endif -#if TEST_STD_VER >= 20 - test_container(); -#endif - - return 0; -} diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp deleted file mode 100644 index 53c70bed189b5..0000000000000 --- a/libcxx/test/libcxx/containers/strings/basic.string/asan_short.pass.cpp +++ /dev/null @@ -1,56 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// REQUIRES: asan -// UNSUPPORTED: c++03 - -// - -// Basic test if ASan annotations work for short strings. - -#include -#include -#include - -#include "asan_testing.h" -#include "min_allocator.h" -#include "test_iterators.h" -#include "test_macros.h" - -extern "C" void __sanitizer_set_death_callback(void (*callback)(void)); - -void do_exit() { exit(0); } - -int main(int, char**) { - { - typedef cpp17_input_iterator MyInputIter; - // Should not trigger ASan. - std::basic_string, safe_allocator> v; - char i[] = {'a', 'b', 'c', 'd'}; - - v.insert(v.begin(), MyInputIter(i), MyInputIter(i + 4)); - assert(v[0] == 'a'); - assert(is_string_asan_correct(v)); - } - - __sanitizer_set_death_callback(do_exit); - { - using T = char; - using C = std::basic_string, safe_allocator>; - const T t[] = {'a', 'b', 'c', 'd', 'e', 'f', 'g'}; - C c(std::begin(t), std::end(t)); - assert(is_string_asan_correct(c)); - assert(__sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) != - 0); - volatile T foo = c[c.size() + 1]; // should trigger ASAN. Use volatile to prevent being optimized away. - assert(false); // if we got here, ASAN didn't trigger - ((void)foo); - } - - return 0; -} diff --git a/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp b/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp deleted file mode 100644 index b7d95b7069083..0000000000000 --- a/libcxx/test/libcxx/containers/strings/basic.string/asan_vector_integration.pass.cpp +++ /dev/null @@ -1,182 +0,0 @@ -//===----------------------------------------------------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -// REQUIRES: asan -// UNSUPPORTED: c++03 - -#include -#include -#include -#include -#include "test_macros.h" -#include "asan_testing.h" -#include "min_allocator.h" - -// This tests exists to check if strings work well with vector, as those -// may be partialy annotated, we cannot simply call -// is_contiguous_container_asan_correct, as it assumes that -// object memory inside is not annotated, so we check everything in a more careful way. 
- -template -void verify_inside(D const& d) { - for (size_t i = 0; i < d.size(); ++i) { - assert(is_string_asan_correct(d[i])); - } -} - -template -S get_s(char c) { - S s; - for (size_t i = 0; i < N; ++i) - s.push_back(c); - - return s; -} - -template -void test_string() { - size_t const N = sizeof(S) < 256 ? (4096 / sizeof(S)) : 16; - - { - C d1a(1), d1b(N), d1c(N + 1), d1d(5 * N); - verify_inside(d1a); - verify_inside(d1b); - verify_inside(d1c); - verify_inside(d1d); - } - { - C d2; - for (size_t i = 0; i < 3 * N + 2; ++i) { - d2.push_back(get_s(i % 10 + 'a')); - verify_inside(d2); - d2.push_back(get_s(i % 10 + 'b')); - verify_inside(d2); - - d2.erase(d2.cbegin()); - verify_inside(d2); - } - } - { - C d3; - for (size_t i = 0; i < 3 * N + 2; ++i) { - d3.push_back(get_s(i % 10 + 'a')); - verify_inside(d3); - d3.push_back(get_s(i % 10 + 'b')); - verify_inside(d3); - - d3.pop_back(); - verify_inside(d3); - } - } - { - C d4; - for (size_t i = 0; i < 3 * N + 2; ++i) { - // When there is no SSO, all elements inside should not be poisoned, - // so we can verify vector poisoning. - d4.push_back(get_s(i % 10 + 'a')); - verify_inside(d4); - assert(is_contiguous_container_asan_correct(d4)); - d4.push_back(get_s(i % 10 + 'b')); - verify_inside(d4); - assert(is_contiguous_container_asan_correct(d4)); - } - } - { - C d5; - for (size_t i = 0; i < 3 * N + 2; ++i) { - // In d4 we never had poisoned memory inside vector. - // Here we start with SSO, so part of the inside of the container, - // will be poisoned. - d5.push_back(S()); - verify_inside(d5); - } - for (size_t i = 0; i < d5.size(); ++i) { - // We change the size to have long string. - // Memory owne by vector should not be poisoned by string. - d5[i].resize(100); - verify_inside(d5); - } - - assert(is_contiguous_container_asan_correct(d5)); - - d5.erase(d5.begin() + 2); - verify_inside(d5); - - d5.erase(d5.end() - 2); - verify_inside(d5); - - assert(is_contiguous_container_asan_correct(d5)); - } - { - C d6a; - assert(is_contiguous_container_asan_correct(d6a)); - - C d6b(N + 2, get_s('a')); - d6b.push_back(get_s('b')); - while (!d6b.empty()) { - d6b.pop_back(); - assert(is_contiguous_container_asan_correct(d6b)); - } - - C d6c(N + 2, get_s('c')); - while (!d6c.empty()) { - d6c.pop_back(); - assert(is_contiguous_container_asan_correct(d6c)); - } - } - { - C d7(9 * N + 2); - - d7.insert(d7.begin() + 1, S()); - verify_inside(d7); - - d7.insert(d7.end() - 3, S()); - verify_inside(d7); - - d7.insert(d7.begin() + 2 * N, get_s('a')); - verify_inside(d7); - - d7.insert(d7.end() - 2 * N, get_s('b')); - verify_inside(d7); - - d7.insert(d7.begin() + 2 * N, 3 * N, get_s('c')); - verify_inside(d7); - - // It may not be short for big element types, but it will be checked correctly: - d7.insert(d7.end() - 2 * N, 3 * N, get_s('d')); - verify_inside(d7); - - d7.erase(d7.begin() + 2); - verify_inside(d7); - - d7.erase(d7.end() - 2); - verify_inside(d7); - } -} - -template -void test_container() { - test_string>, S>(); - test_string>, S>(); - test_string>, S>(); -} - -int main(int, char**) { - // Those tests support only types based on std::basic_string. 
-  test_container<char>();
-  test_container<wchar_t>();
-#if TEST_STD_VER >= 11
-  test_container<char16_t>();
-  test_container<char32_t>();
-#endif
-#if TEST_STD_VER >= 20
-  test_container<char8_t>();
-#endif
-
-  return 0;
-}
diff --git a/libcxx/test/support/asan_testing.h b/libcxx/test/support/asan_testing.h
index 3785c1f9c20de..6bfc8280a4ead 100644
--- a/libcxx/test/support/asan_testing.h
+++ b/libcxx/test/support/asan_testing.h
@@ -56,16 +56,35 @@ TEST_CONSTEXPR bool is_double_ended_contiguous_container_asan_correct(const std:
 #endif

 #if TEST_HAS_FEATURE(address_sanitizer)
+template <typename S>
+bool is_string_short(S const& s) {
+  // We do not have access to __is_long(), but we can check if the string's
+  // buffer is inside the string's memory. If the string's memory contains its
+  // content, SSO is in use. To check it, we can just confirm that the
+  // beginning of the buffer is in the string object's memory block.
+  // &s     - beginning of the object's memory
+  // &s[0]  - beginning of the buffer
+  // (&s+1) - end of the object's memory
+  return (void*)std::addressof(s) <= (void*)std::addressof(s[0]) &&
+         (void*)std::addressof(s[0]) < (void*)(std::addressof(s) + 1);
+}
+
 template <class CharT, class Traits, class Alloc>
 TEST_CONSTEXPR bool is_string_asan_correct(const std::basic_string<CharT, Traits, Alloc>& c) {
   if (TEST_IS_CONSTANT_EVALUATED)
     return true;

-  if (std::__asan_annotate_container_with_allocator<Alloc>::value)
-    return __sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) != 0;
-  else
-    return __sanitizer_verify_contiguous_container(
-               c.data(), c.data() + c.capacity() + 1, c.data() + c.capacity() + 1) != 0;
+  if (!is_string_short(c) || _LIBCPP_SHORT_STRING_ANNOTATIONS_ALLOWED) {
+    if (std::__asan_annotate_container_with_allocator<Alloc>::value)
+      return __sanitizer_verify_contiguous_container(c.data(), c.data() + c.size() + 1, c.data() + c.capacity() + 1) !=
+             0;
+    else
+      return __sanitizer_verify_contiguous_container(
+                 c.data(), c.data() + c.capacity() + 1, c.data() + c.capacity() + 1) != 0;
+  } else {
+    return __sanitizer_verify_contiguous_container(std::addressof(c), std::addressof(c) + 1, std::addressof(c) + 1) !=
+           0;
+  }
 }
 #else
 #  include

From 22da8096020fcf54bae2f885882bbd088a4dcd9b Mon Sep 17 00:00:00 2001
From: Jeremy Morse
Date: Tue, 23 Jan 2024 23:26:52 +0000
Subject: [PATCH 708/843] [Docs][DebugInfo][RemoveDIs] Document some debug-info
 transition info (#79167)

This is a high-level description and FAQ for what we're doing in
RemoveDIs, and how old code should behave with new debug-info
(exactly the same 99% of the time).
---
 llvm/docs/RemoveDIsDebugInfo.md | 106 ++++++++++++++++++++++++++++++++
 llvm/docs/UserGuides.rst        |   5 ++
 2 files changed, 111 insertions(+)
 create mode 100644 llvm/docs/RemoveDIsDebugInfo.md

diff --git a/llvm/docs/RemoveDIsDebugInfo.md b/llvm/docs/RemoveDIsDebugInfo.md
new file mode 100644
index 0000000000000..e871fed43a7cc
--- /dev/null
+++ b/llvm/docs/RemoveDIsDebugInfo.md
@@ -0,0 +1,106 @@
+# What's all this then?
+
+We're planning on removing debug info intrinsics from LLVM, as they're slow, unwieldy and can confuse optimisation passes if they're not expecting them. Instead of having a sequence of instructions that looks like this:
+
+```text
+  %add = add i32 %foo, %bar
+  call void @llvm.dbg.value(metadata %add, ...
+  %sub = sub i32 %add, %tosub
+  call void @llvm.dbg.value(metadata %sub, ...
+  call void @a_normal_function()
+```
+
+with `dbg.value` intrinsics representing debug info records, it would instead be printed as:
+
+```text
+  %add = add i32 %foo, %bar
+    #dbg_value(%add, ...
+  %sub = sub i32 %add, %tosub
+    #dbg_value(%sub, ...
+  call void @a_normal_function()
+```
+
+The debug records are not instructions, do not appear in the instruction list, and won't appear in your optimisation passes unless you go digging for them deliberately.
+
+# Great, what do I need to do!
+
+Approximately nothing -- we've already instrumented all of LLVM to handle these new records ("`DPValues`") and behave identically to past LLVM behaviour. We plan on turning this on by default some time soon, with IR converted to the intrinsic form of debug info at terminals (textual IR, bitcode) for a short while, before then changing the textual IR and bitcode formats.
+
+There are two significant changes to be aware of. Firstly, we're adding a single bit of debug relevant data to the `BasicBlock::iterator` class (it's so that we can determine whether ranges intend on including debug info at the beginning of a block or not). That means when writing passes that insert LLVM IR instructions, you need to identify positions with `BasicBlock::iterator` rather than just a bare `Instruction *`. Most of the time this means that after identifying where you intend on inserting something, you must also call `getIterator` on the instruction position -- however when inserting at the start of a block you _must_ use `getFirstInsertionPt`, `getFirstNonPHIIt` or `begin` and use that iterator to insert, rather than just fetching a pointer to the first instruction.
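To make that iterator discipline concrete, here is a minimal editorial sketch (not part of this patch); it assumes you already have an `Instruction *NewI` to place, and it uses the `insertBefore(BasicBlock &, iterator)` overload touched elsewhere in this series.

```cpp
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"

// Insert NewI at the start of BB. getFirstInsertionPt() yields a
// BasicBlock::iterator carrying the debug-info bit, unlike taking a raw
// pointer to the first instruction, so records at the head of the block
// stay where they belong.
void insertAtBlockStart(llvm::BasicBlock &BB, llvm::Instruction *NewI) {
  llvm::BasicBlock::iterator InsertPt = BB.getFirstInsertionPt();
  NewI->insertBefore(BB, InsertPt);
}
```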
+The second matter is that if you transfer sequences of instructions from one place to another manually, i.e. repeatedly using `moveBefore` where you might have used `splice`, then you should instead use the method `moveBeforePreserving`. `moveBeforePreserving` will transfer debug info records with the instruction they're attached to. This is something that happens automatically today -- if you use `moveBefore` on every element of an instruction sequence, then debug intrinsics will be moved in the normal course of your code, but we lose this behaviour with non-instruction debug info.
+
+# Anything else?
+
+Not really, but here's an "old vs new" comparison of how to do certain things and a quickstart for how this "new" debug info is structured.
+
+## Skipping debug records, ignoring debug-uses of Values, stably counting instructions...
+
+This will all happen transparently without needing to think about it!
+
+## What exactly have you replaced debug intrinsics with?
+
+We're using a dedicated C++ class called `DPValue` to store debug info, with a one-to-one relationship between each instance of a debug intrinsic and each `DPValue` object in any LLVM IR program. This class stores exactly the same information as is stored in debugging intrinsics. It also has almost entirely the same set of methods, which behave in the same way:
+
+  https://llvm.org/docs/doxygen/classllvm_1_1DPValue.html
+
+This allows you to treat a `DPValue` as if it's a `dbg.value` intrinsic most of the time, for example in generic (auto-param) lambdas.
+
+## How do these DPValues fit into the instruction stream?
+
+Like so:
+
+```text
+                        +---------------+        +---------------+
+      +---------------->|  Instruction  +------->|  Instruction  |
+                        +-------+-------+        +---------------+
+                                |
+                                |
+                                |
+                                |
+                                v
+                         +------------+
+                  <------+  DPMarker  |<------
+                 /       +------------+       \
+                /                              \
+               /                                \
+              v                                  ^
+       +-----------+      +-----------+      +-----------+
+       |  DPValue  +----->|  DPValue  +----->|  DPValue  |
+       +-----------+      +-----------+      +-----------+
+```
+
+Each instruction has a pointer to a `DPMarker` (which will become optional) that contains a list of `DPValue` objects. No debugging records appear in the instruction list at all. `DPValue`s have a parent pointer to their owning `DPMarker`, and each `DPMarker` has a pointer back to its owning instruction.
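A minimal editorial sketch of walking that structure, assuming the accessors named later in this guide (`getDbgValueRange`, plus a `getVariable` accessor mirroring the intrinsic API) behave as described:

```cpp
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugProgramInstruction.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/raw_ostream.h"

// List the source variables described by the DPValue records attached
// (via the DPMarker) ahead of instruction I. The printing is purely
// illustrative.
void listAttachedRecords(llvm::Instruction &I) {
  for (llvm::DPValue &DPV : I.getDbgValueRange())
    if (llvm::DILocalVariable *Var = DPV.getVariable())
      llvm::errs() << "debug record for: " << Var->getName() << "\n";
}
```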
+Not shown are the links from DPValues to other parts of the `Value`/`Metadata` hierarchy: `DPValue`s have raw pointers to `DILocalVariable`, `DIExpression` and `DILocation` objects, and references to `Value`s are stored in a `DebugValueUser` base class. This refers to a `ValueAsMetadata` object referring to `Value`s, via the `TrackingMetadata` facility.
+
+The various kinds of debug intrinsic (value, declare, assign) are all stored in the `DPValue` object, with a "Type" field disambiguating which is which.
+
+## Finding debug info records
+
+Utilities such as `findDbgUsers` and the like now have an optional argument that will return the set of `DPValue` records that refer to a `Value`. You should be able to treat them the same as intrinsics.
+
+## Examining debug info records at positions
+
+Call `Instruction::getDbgValueRange()` to get the range of `DPValue` objects that are attached to an instruction.
+
+## Moving around, deleting
+
+You can use `DPValue::removeFromParent` to unlink a `DPValue` from its marker, and then `BasicBlock::insertDPValueBefore` or `BasicBlock::insertDPValueAfter` to re-insert the `DPValue` somewhere else. You cannot insert a `DPValue` at an arbitrary point in a list of `DPValue`s (if you're doing this with `dbg.value`s then it's unlikely to be correct).
+
+Erase `DPValue`s by calling `eraseFromParent` or `deleteInstr` if it's already been removed.
+
+## What about dangling `DPValue`s?
+
+If you have a block like so:
+
+```text
+    foo:
+      %bar = add i32 %baz...
+      dbg.value(metadata i32 %bar,...
+      br label %xyzzy
+```
+
+your optimisation pass may wish to erase the terminator and then do something to the block. This is easy to do when debug info is kept in instructions, but with `DPValue`s there is no trailing instruction to attach the variable information to in the block above, once the terminator is erased. For such degenerate blocks, `DPValue`s are stored temporarily in a map in `LLVMContext`, and are re-inserted when a terminator is reinserted to the block or other instruction inserted at `end()`.
+
+This can technically lead to trouble in the vanishingly rare scenario where an optimisation pass erases a terminator and then decides to erase the whole block. (We recommend not doing that).
diff --git a/llvm/docs/UserGuides.rst b/llvm/docs/UserGuides.rst
index 2f450ef46025a..9ae0d36011154 100644
--- a/llvm/docs/UserGuides.rst
+++ b/llvm/docs/UserGuides.rst
@@ -62,6 +62,7 @@ intermediate LLVM representation.
    ReportingGuide
    ResponseGuide
    Remarks
+   RemoveDIsDebugInfo
    RISCVUsage
    SourceLevelDebugging
    SPIRVUsage
@@ -178,6 +179,10 @@ Optimizations
    referencing, to determine variable locations for debug info in the final
    stages of compilation.

+:doc:`RemoveDIsDebugInfo`
+   This is a migration guide describing how to move from debug info using
+   intrinsics such as dbg.value to using the non-instruction DPValue object.
+
 :doc:`InstrProfileFormat`
    This document explains two binary formats of instrumentation-based
    profiles.

From 6a3ace20c80191159009668fff82fc3feeeca0a6 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers
Date: Tue, 23 Jan 2024 15:38:12 -0800
Subject: [PATCH 709/843] [libc] remove redundant call_once (#79226)

Missed cleanup from https://reviews.llvm.org/D134716.
Fixes: #79220
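For context on what the deleted file below implements: a minimal editorial sketch of the C11 `call_once` contract (the standard `<threads.h>` API, not code from this patch), usable wherever the C11 threads API is available.

```cpp
#include <threads.h>

static once_flag flag = ONCE_FLAG_INIT;

// Runs exactly once, even if many threads reach call_once() concurrently;
// every caller returns only after the initializer has completed.
static void do_init(void) { /* one-time setup */ }

void ensure_initialized(void) { call_once(&flag, do_init); }
```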
---
 libc/src/threads/linux/call_once.cpp          | 67 -------
 .../src/threads/linux/thread_start_args.h.def | 11 ---
 2 files changed, 78 deletions(-)
 delete mode 100644 libc/src/threads/linux/call_once.cpp
 delete mode 100644 libc/src/threads/linux/thread_start_args.h.def

diff --git a/libc/src/threads/linux/call_once.cpp b/libc/src/threads/linux/call_once.cpp
deleted file mode 100644
index 5cdd8ebfd190e..0000000000000
--- a/libc/src/threads/linux/call_once.cpp
+++ /dev/null
@@ -1,67 +0,0 @@
-//===-- Linux implementation of the call_once function --------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "Futex.h"
-
-#include "src/__support/CPP/atomic.h"
-#include "src/__support/OSUtil/syscall.h" // For syscall functions.
-#include "src/__support/common.h"
-#include "src/threads/call_once.h"
-#include "src/threads/linux/Futex.h"
-
-#include <limits.h>
-#include <linux/futex.h>
-#include <sys/syscall.h> // For syscall numbers.
-#include <threads.h> // For call_once related type definition.
-
-namespace LIBC_NAMESPACE {
-
-static constexpr FutexWordType START = 0x11;
-static constexpr FutexWordType WAITING = 0x22;
-static constexpr FutexWordType FINISH = 0x33;
-static constexpr once_flag ONCE_FLAG_INIT_VAL = ONCE_FLAG_INIT;
-
-LLVM_LIBC_FUNCTION(void, call_once,
-                   (once_flag * flag, __call_once_func_t func)) {
-  auto *futex_word = reinterpret_cast<cpp::Atomic<FutexWordType> *>(flag);
-  static_assert(sizeof(*futex_word) == sizeof(once_flag));
-
-  FutexWordType not_called = ONCE_FLAG_INIT_VAL.__word;
-
-  // The C standard wording says:
-  //
-  //     The completion of the function func synchronizes with all
-  //     previous or subsequent calls to call_once with the same
-  //     flag variable.
-  //
-  // What this means is that, the call_once call can return only after
-  // the called function |func| returns. So, we use futexes to synchronize
-  // calls with the same flag value.
-  if (futex_word->compare_exchange_strong(not_called, START)) {
-    func();
-    auto status = futex_word->exchange(FINISH);
-    if (status == WAITING) {
-      LIBC_NAMESPACE::syscall_impl(FUTEX_SYSCALL_ID, &futex_word->val,
-                                   FUTEX_WAKE_PRIVATE,
-                                   INT_MAX, // Wake all waiters.
-                                   0, 0, 0);
-    }
-    return;
-  }
-
-  FutexWordType status = START;
-  if (futex_word->compare_exchange_strong(status, WAITING) ||
-      status == WAITING) {
-    LIBC_NAMESPACE::syscall_impl(
-        FUTEX_SYSCALL_ID, &futex_word->val, FUTEX_WAIT_PRIVATE,
-        WAITING, // Block only if status is still |WAITING|.
-        0, 0, 0);
-  }
-}
-
-} // namespace LIBC_NAMESPACE
diff --git a/libc/src/threads/linux/thread_start_args.h.def b/libc/src/threads/linux/thread_start_args.h.def
deleted file mode 100644
index cc990a37548b6..0000000000000
--- a/libc/src/threads/linux/thread_start_args.h.def
+++ /dev/null
@@ -1,11 +0,0 @@
-//===-- Implementation of the get_start_args_addr function ------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include - -%%include_file(${thread_start_args}) From 2a61be4e4ca481016516403f634b475197221991 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Fri, 12 Jan 2024 09:00:08 -0800 Subject: [PATCH 710/843] [SROA] NFC: Extract code to checkVectorTypesForPromotion Change-Id: Ib6f237cc791a097f8f2411bc1d6502f11d4a748e --- llvm/lib/Transforms/Scalar/SROA.cpp | 181 +++++++++++++++------------- 1 file changed, 99 insertions(+), 82 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 551a37b132445..10c25e2a03229 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2138,8 +2138,9 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S, /// Test whether a vector type is viable for promotion. /// -/// This implements the necessary checking for \c isVectorPromotionViable over -/// all slices of the alloca for the given VectorType. +/// This implements the necessary checking for \c checkVectorTypesForPromotion +/// (and thus isVectorPromotionViable) over all slices of the alloca for the +/// given VectorType. static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, const DataLayout &DL) { uint64_t ElementSize = @@ -2164,6 +2165,98 @@ static bool checkVectorTypeForPromotion(Partition &P, VectorType *VTy, return true; } +/// Test whether any vector type in \p CandidateTys is viable for promotion. +/// +/// This implements the necessary checking for \c isVectorPromotionViable over +/// all slices of the alloca for the given VectorType. +static VectorType * +checkVectorTypesForPromotion(Partition &P, const DataLayout &DL, + SmallVectorImpl &CandidateTys, + bool HaveCommonEltTy, Type *CommonEltTy, + bool HaveVecPtrTy, bool HaveCommonVecPtrTy, + VectorType *CommonVecPtrTy) { + // If we didn't find a vector type, nothing to do here. + if (CandidateTys.empty()) + return nullptr; + + // Pointer-ness is sticky, if we had a vector-of-pointers candidate type, + // then we should choose it, not some other alternative. + // But, we can't perform a no-op pointer address space change via bitcast, + // so if we didn't have a common pointer element type, bail. + if (HaveVecPtrTy && !HaveCommonVecPtrTy) + return nullptr; + + // Try to pick the "best" element type out of the choices. + if (!HaveCommonEltTy && HaveVecPtrTy) { + // If there was a pointer element type, there's really only one choice. + CandidateTys.clear(); + CandidateTys.push_back(CommonVecPtrTy); + } else if (!HaveCommonEltTy && !HaveVecPtrTy) { + // Integer-ify vector types. + for (VectorType *&VTy : CandidateTys) { + if (!VTy->getElementType()->isIntegerTy()) + VTy = cast(VTy->getWithNewType(IntegerType::getIntNTy( + VTy->getContext(), VTy->getScalarSizeInBits()))); + } + + // Rank the remaining candidate vector types. This is easy because we know + // they're all integer vectors. We sort by ascending number of elements. 
+ auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) { + (void)DL; + assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() == + DL.getTypeSizeInBits(LHSTy).getFixedValue() && + "Cannot have vector types of different sizes!"); + assert(RHSTy->getElementType()->isIntegerTy() && + "All non-integer types eliminated!"); + assert(LHSTy->getElementType()->isIntegerTy() && + "All non-integer types eliminated!"); + return cast(RHSTy)->getNumElements() < + cast(LHSTy)->getNumElements(); + }; + auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) { + (void)DL; + assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() == + DL.getTypeSizeInBits(LHSTy).getFixedValue() && + "Cannot have vector types of different sizes!"); + assert(RHSTy->getElementType()->isIntegerTy() && + "All non-integer types eliminated!"); + assert(LHSTy->getElementType()->isIntegerTy() && + "All non-integer types eliminated!"); + return cast(RHSTy)->getNumElements() == + cast(LHSTy)->getNumElements(); + }; + llvm::sort(CandidateTys, RankVectorTypesComp); + CandidateTys.erase(std::unique(CandidateTys.begin(), CandidateTys.end(), + RankVectorTypesEq), + CandidateTys.end()); + } else { +// The only way to have the same element type in every vector type is to +// have the same vector type. Check that and remove all but one. +#ifndef NDEBUG + for (VectorType *VTy : CandidateTys) { + assert(VTy->getElementType() == CommonEltTy && + "Unaccounted for element type!"); + assert(VTy == CandidateTys[0] && + "Different vector types with the same element type!"); + } +#endif + CandidateTys.resize(1); + } + + // FIXME: hack. Do we have a named constant for this? + // SDAG SDNode can't have more than 65535 operands. + llvm::erase_if(CandidateTys, [](VectorType *VTy) { + return cast(VTy)->getNumElements() > + std::numeric_limits::max(); + }); + + for (VectorType *VTy : CandidateTys) + if (checkVectorTypeForPromotion(P, VTy, DL)) + return VTy; + + return nullptr; +} + /// Test whether the given alloca partitioning and range of slices can be /// promoted to a vector. /// @@ -2211,6 +2304,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { } } }; + // Put load and store types into a set for de-duplication. for (const Slice &S : P) { Type *Ty; @@ -2246,86 +2340,9 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { } } - // If we didn't find a vector type, nothing to do here. - if (CandidateTys.empty()) - return nullptr; - - // Pointer-ness is sticky, if we had a vector-of-pointers candidate type, - // then we should choose it, not some other alternative. - // But, we can't perform a no-op pointer address space change via bitcast, - // so if we didn't have a common pointer element type, bail. - if (HaveVecPtrTy && !HaveCommonVecPtrTy) - return nullptr; - - // Try to pick the "best" element type out of the choices. - if (!HaveCommonEltTy && HaveVecPtrTy) { - // If there was a pointer element type, there's really only one choice. - CandidateTys.clear(); - CandidateTys.push_back(CommonVecPtrTy); - } else if (!HaveCommonEltTy && !HaveVecPtrTy) { - // Integer-ify vector types. - for (VectorType *&VTy : CandidateTys) { - if (!VTy->getElementType()->isIntegerTy()) - VTy = cast(VTy->getWithNewType(IntegerType::getIntNTy( - VTy->getContext(), VTy->getScalarSizeInBits()))); - } - - // Rank the remaining candidate vector types. This is easy because we know - // they're all integer vectors. We sort by ascending number of elements. 
- auto RankVectorTypesComp = [&DL](VectorType *RHSTy, VectorType *LHSTy) { - (void)DL; - assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() == - DL.getTypeSizeInBits(LHSTy).getFixedValue() && - "Cannot have vector types of different sizes!"); - assert(RHSTy->getElementType()->isIntegerTy() && - "All non-integer types eliminated!"); - assert(LHSTy->getElementType()->isIntegerTy() && - "All non-integer types eliminated!"); - return cast(RHSTy)->getNumElements() < - cast(LHSTy)->getNumElements(); - }; - auto RankVectorTypesEq = [&DL](VectorType *RHSTy, VectorType *LHSTy) { - (void)DL; - assert(DL.getTypeSizeInBits(RHSTy).getFixedValue() == - DL.getTypeSizeInBits(LHSTy).getFixedValue() && - "Cannot have vector types of different sizes!"); - assert(RHSTy->getElementType()->isIntegerTy() && - "All non-integer types eliminated!"); - assert(LHSTy->getElementType()->isIntegerTy() && - "All non-integer types eliminated!"); - return cast(RHSTy)->getNumElements() == - cast(LHSTy)->getNumElements(); - }; - llvm::sort(CandidateTys, RankVectorTypesComp); - CandidateTys.erase(std::unique(CandidateTys.begin(), CandidateTys.end(), - RankVectorTypesEq), - CandidateTys.end()); - } else { -// The only way to have the same element type in every vector type is to -// have the same vector type. Check that and remove all but one. -#ifndef NDEBUG - for (VectorType *VTy : CandidateTys) { - assert(VTy->getElementType() == CommonEltTy && - "Unaccounted for element type!"); - assert(VTy == CandidateTys[0] && - "Different vector types with the same element type!"); - } -#endif - CandidateTys.resize(1); - } - - // FIXME: hack. Do we have a named constant for this? - // SDAG SDNode can't have more than 65535 operands. - llvm::erase_if(CandidateTys, [](VectorType *VTy) { - return cast(VTy)->getNumElements() > - std::numeric_limits::max(); - }); - - for (VectorType *VTy : CandidateTys) - if (checkVectorTypeForPromotion(P, VTy, DL)) - return VTy; - - return nullptr; + return checkVectorTypesForPromotion(P, DL, CandidateTys, HaveCommonEltTy, + CommonEltTy, HaveVecPtrTy, + HaveCommonVecPtrTy, CommonVecPtrTy); } /// Test whether a slice of an alloca is valid for integer widening. From 25e1916d88ebeef786956b678a4eb9a757e219d9 Mon Sep 17 00:00:00 2001 From: Alan Zhao Date: Tue, 23 Jan 2024 15:42:49 -0800 Subject: [PATCH 711/843] [nfc][clang] Fix test in new-array-init.cpp (#79225) This test was originally introduced in https://github.com/llvm/llvm-project/pull/76976, but it incorrectly tests braced-list initialization instead of parenthesized initialization. --- clang/test/CodeGenCXX/new-array-init.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/CodeGenCXX/new-array-init.cpp b/clang/test/CodeGenCXX/new-array-init.cpp index fe1bdf425ab14..781c4728df450 100644 --- a/clang/test/CodeGenCXX/new-array-init.cpp +++ b/clang/test/CodeGenCXX/new-array-init.cpp @@ -164,7 +164,7 @@ void string_sufficient_paren() { // FIXME: For very large arrays, it would be preferable to emit a small copy and a memset. 
// CHECKCXX20: call void @llvm.memcpy{{.*}}(ptr align {{[0-9]+}} %[[PTR]], ptr align {{[0-9]+}} @[[ABC15]], i32 15, // CHECKCXX20-NOT: memset - new char[15] { "abc" }; + new char[15]("abc"); } #endif From d657519838e4b2310e13ec5ff52599e041860825 Mon Sep 17 00:00:00 2001 From: Pete Lawrence Date: Tue, 23 Jan 2024 14:07:52 -1000 Subject: [PATCH 712/843] [lldb] Improve maintainability and readability for ValueObject methods (#75865) As I worked through changes to another PR (https://github.com/llvm/llvm-project/pull/74912), I couldn't help but rewrite a few methods for readability, maintainability, and possibly some behavior correctness too. 1. Exiting early instead of nested `if`-statements, which: - Reduces indentation levels for all subsequent lines - Treats missing pre-conditions similar to an error - Clearly indicates that the full length of the method is the "happy path". 2. Explicitly return empty Value Object shared pointers for those error (like) situations, which - Reduces the time it takes a maintainer to figure out what the method actually returns based on those conditions. 3. Converting a mix of `if` and `if`-`else`-statements around an enum into one `switch` statement, which: - Consolidates the former branching logic - Lets the compiler warn you of a (future) missing enum case - This one may actually change behavior slightly, because what was an early test for one enum case, now happens later on in the `switch`. 4. Consolidating near-identical, "copy-pasta" logic into one place, which: - Separates the common code to the diverging paths. - Highlights the differences between the code paths. rdar://119833526 --- lldb/source/Core/ValueObject.cpp | 330 +++++++++++++++---------------- 1 file changed, 164 insertions(+), 166 deletions(-) diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp index 9208047be3666..d58bf2ca763d9 100644 --- a/lldb/source/Core/ValueObject.cpp +++ b/lldb/source/Core/ValueObject.cpp @@ -1582,62 +1582,64 @@ bool ValueObject::IsUninitializedReference() { ValueObjectSP ValueObject::GetSyntheticArrayMember(size_t index, bool can_create) { - ValueObjectSP synthetic_child_sp; - if (IsPointerType() || IsArrayType()) { - std::string index_str = llvm::formatv("[{0}]", index); - ConstString index_const_str(index_str); - // Check if we have already created a synthetic array member in this valid - // object. If we have we will re-use it. - synthetic_child_sp = GetSyntheticChild(index_const_str); - if (!synthetic_child_sp) { - ValueObject *synthetic_child; - // We haven't made a synthetic array member for INDEX yet, so lets make - // one and cache it for any future reference. - synthetic_child = CreateChildAtIndex(0, true, index); - - // Cache the value if we got one back... - if (synthetic_child) { - AddSyntheticChild(index_const_str, synthetic_child); - synthetic_child_sp = synthetic_child->GetSP(); - synthetic_child_sp->SetName(ConstString(index_str)); - synthetic_child_sp->m_flags.m_is_array_item_for_pointer = true; - } - } - } + if (!IsPointerType() && !IsArrayType()) + return ValueObjectSP(); + + std::string index_str = llvm::formatv("[{0}]", index); + ConstString index_const_str(index_str); + // Check if we have already created a synthetic array member in this valid + // object. If we have we will re-use it. + if (auto existing_synthetic_child = GetSyntheticChild(index_const_str)) + return existing_synthetic_child; + + // We haven't made a synthetic array member for INDEX yet, so lets make + // one and cache it for any future reference. 
+ ValueObject *synthetic_child = CreateChildAtIndex(0, true, index); + + if (!synthetic_child) + return ValueObjectSP(); + + // Cache the synthetic child's value because it's valid. + AddSyntheticChild(index_const_str, synthetic_child); + auto synthetic_child_sp = synthetic_child->GetSP(); + synthetic_child_sp->SetName(ConstString(index_str)); + synthetic_child_sp->m_flags.m_is_array_item_for_pointer = true; return synthetic_child_sp; } ValueObjectSP ValueObject::GetSyntheticBitFieldChild(uint32_t from, uint32_t to, bool can_create) { - ValueObjectSP synthetic_child_sp; - if (IsScalarType()) { - std::string index_str = llvm::formatv("[{0}-{1}]", from, to); - ConstString index_const_str(index_str); - // Check if we have already created a synthetic array member in this valid - // object. If we have we will re-use it. - synthetic_child_sp = GetSyntheticChild(index_const_str); - if (!synthetic_child_sp) { - uint32_t bit_field_size = to - from + 1; - uint32_t bit_field_offset = from; - if (GetDataExtractor().GetByteOrder() == eByteOrderBig) - bit_field_offset = - GetByteSize().value_or(0) * 8 - bit_field_size - bit_field_offset; - // We haven't made a synthetic array member for INDEX yet, so lets make - // one and cache it for any future reference. - ValueObjectChild *synthetic_child = new ValueObjectChild( - *this, GetCompilerType(), index_const_str, GetByteSize().value_or(0), - 0, bit_field_size, bit_field_offset, false, false, - eAddressTypeInvalid, 0); - - // Cache the value if we got one back... - if (synthetic_child) { - AddSyntheticChild(index_const_str, synthetic_child); - synthetic_child_sp = synthetic_child->GetSP(); - synthetic_child_sp->SetName(ConstString(index_str)); - synthetic_child_sp->m_flags.m_is_bitfield_for_scalar = true; - } - } - } + if (!IsScalarType()) + return ValueObjectSP(); + + std::string index_str = llvm::formatv("[{0}-{1}]", from, to); + ConstString index_const_str(index_str); + + // Check if we have already created a synthetic array member in this valid + // object. If we have we will re-use it. + if (auto existing_synthetic_child = GetSyntheticChild(index_const_str)) + return existing_synthetic_child; + + uint32_t bit_field_size = to - from + 1; + uint32_t bit_field_offset = from; + if (GetDataExtractor().GetByteOrder() == eByteOrderBig) + bit_field_offset = + GetByteSize().value_or(0) * 8 - bit_field_size - bit_field_offset; + + // We haven't made a synthetic array member for INDEX yet, so lets make + // one and cache it for any future reference. + ValueObjectChild *synthetic_child = new ValueObjectChild( + *this, GetCompilerType(), index_const_str, GetByteSize().value_or(0), 0, + bit_field_size, bit_field_offset, false, false, eAddressTypeInvalid, 0); + + if (!synthetic_child) + return ValueObjectSP(); + + // Cache the synthetic child's value because it's valid. + AddSyntheticChild(index_const_str, synthetic_child); + auto synthetic_child_sp = synthetic_child->GetSP(); + synthetic_child_sp->SetName(ConstString(index_str)); + synthetic_child_sp->m_flags.m_is_bitfield_for_scalar = true; return synthetic_child_sp; } @@ -1647,9 +1649,8 @@ ValueObjectSP ValueObject::GetSyntheticChildAtOffset( ValueObjectSP synthetic_child_sp; - if (name_const_str.IsEmpty()) { + if (name_const_str.IsEmpty()) name_const_str.SetString("@" + std::to_string(offset)); - } // Check if we have already created a synthetic array member in this valid // object. If we have we will re-use it. 
@@ -1659,13 +1660,13 @@ ValueObjectSP ValueObject::GetSyntheticChildAtOffset( return synthetic_child_sp; if (!can_create) - return {}; + return ValueObjectSP(); ExecutionContext exe_ctx(GetExecutionContextRef()); std::optional size = type.GetByteSize(exe_ctx.GetBestExecutionContextScope()); if (!size) - return {}; + return ValueObjectSP(); ValueObjectChild *synthetic_child = new ValueObjectChild(*this, type, name_const_str, *size, offset, 0, 0, false, false, eAddressTypeInvalid, 0); @@ -1699,7 +1700,7 @@ ValueObjectSP ValueObject::GetSyntheticBase(uint32_t offset, return synthetic_child_sp; if (!can_create) - return {}; + return ValueObjectSP(); const bool is_base_class = true; @@ -1707,7 +1708,7 @@ ValueObjectSP ValueObject::GetSyntheticBase(uint32_t offset, std::optional size = type.GetByteSize(exe_ctx.GetBestExecutionContextScope()); if (!size) - return {}; + return ValueObjectSP(); ValueObjectChild *synthetic_child = new ValueObjectChild(*this, type, name_const_str, *size, offset, 0, 0, is_base_class, false, eAddressTypeInvalid, 0); @@ -1736,30 +1737,30 @@ static const char *SkipLeadingExpressionPathSeparators(const char *expression) { ValueObjectSP ValueObject::GetSyntheticExpressionPathChild(const char *expression, bool can_create) { - ValueObjectSP synthetic_child_sp; ConstString name_const_string(expression); // Check if we have already created a synthetic array member in this valid // object. If we have we will re-use it. - synthetic_child_sp = GetSyntheticChild(name_const_string); - if (!synthetic_child_sp) { - // We haven't made a synthetic array member for expression yet, so lets - // make one and cache it for any future reference. - synthetic_child_sp = GetValueForExpressionPath( - expression, nullptr, nullptr, - GetValueForExpressionPathOptions().SetSyntheticChildrenTraversal( - GetValueForExpressionPathOptions::SyntheticChildrenTraversal:: - None)); - - // Cache the value if we got one back... - if (synthetic_child_sp.get()) { - // FIXME: this causes a "real" child to end up with its name changed to - // the contents of expression - AddSyntheticChild(name_const_string, synthetic_child_sp.get()); - synthetic_child_sp->SetName( - ConstString(SkipLeadingExpressionPathSeparators(expression))); - } - } - return synthetic_child_sp; + if (auto existing_synthetic_child = GetSyntheticChild(name_const_string)) + return existing_synthetic_child; + + // We haven't made a synthetic array member for expression yet, so lets + // make one and cache it for any future reference. + auto path_options = GetValueForExpressionPathOptions(); + path_options.SetSyntheticChildrenTraversal( + GetValueForExpressionPathOptions::SyntheticChildrenTraversal::None); + auto synthetic_child = + GetValueForExpressionPath(expression, nullptr, nullptr, path_options); + + if (!synthetic_child) + return ValueObjectSP(); + + // Cache the synthetic child's value because it's valid. 
+ // FIXME: this causes a "real" child to end up with its name changed to + // the contents of expression + AddSyntheticChild(name_const_string, synthetic_child.get()); + synthetic_child->SetName( + ConstString(SkipLeadingExpressionPathSeparators(expression))); + return synthetic_child; } void ValueObject::CalculateSyntheticValue() { @@ -1956,66 +1957,55 @@ ValueObjectSP ValueObject::GetValueForExpressionPath( const GetValueForExpressionPathOptions &options, ExpressionPathAftermath *final_task_on_target) { - ExpressionPathScanEndReason dummy_reason_to_stop = - ValueObject::eExpressionPathScanEndReasonUnknown; - ExpressionPathEndResultType dummy_final_value_type = - ValueObject::eExpressionPathEndResultTypeInvalid; - ExpressionPathAftermath dummy_final_task_on_target = - ValueObject::eExpressionPathAftermathNothing; - - ValueObjectSP ret_val = GetValueForExpressionPath_Impl( - expression, reason_to_stop ? reason_to_stop : &dummy_reason_to_stop, - final_value_type ? final_value_type : &dummy_final_value_type, options, - final_task_on_target ? final_task_on_target - : &dummy_final_task_on_target); - - if (!final_task_on_target || - *final_task_on_target == ValueObject::eExpressionPathAftermathNothing) - return ret_val; - - if (ret_val.get() && - ((final_value_type ? *final_value_type : dummy_final_value_type) == - eExpressionPathEndResultTypePlain)) // I can only deref and takeaddress - // of plain objects - { - if ((final_task_on_target ? *final_task_on_target - : dummy_final_task_on_target) == - ValueObject::eExpressionPathAftermathDereference) { - Status error; - ValueObjectSP final_value = ret_val->Dereference(error); - if (error.Fail() || !final_value.get()) { - if (reason_to_stop) - *reason_to_stop = - ValueObject::eExpressionPathScanEndReasonDereferencingFailed; - if (final_value_type) - *final_value_type = ValueObject::eExpressionPathEndResultTypeInvalid; - return ValueObjectSP(); - } else { - if (final_task_on_target) - *final_task_on_target = ValueObject::eExpressionPathAftermathNothing; - return final_value; - } - } - if (*final_task_on_target == - ValueObject::eExpressionPathAftermathTakeAddress) { - Status error; - ValueObjectSP final_value = ret_val->AddressOf(error); - if (error.Fail() || !final_value.get()) { - if (reason_to_stop) - *reason_to_stop = - ValueObject::eExpressionPathScanEndReasonTakingAddressFailed; - if (final_value_type) - *final_value_type = ValueObject::eExpressionPathEndResultTypeInvalid; - return ValueObjectSP(); - } else { - if (final_task_on_target) - *final_task_on_target = ValueObject::eExpressionPathAftermathNothing; - return final_value; - } - } + auto dummy_stop_reason = eExpressionPathScanEndReasonUnknown; + auto dummy_value_type = eExpressionPathEndResultTypeInvalid; + auto dummy_final_task = eExpressionPathAftermathNothing; + + auto proxy_stop_reason = reason_to_stop ? reason_to_stop : &dummy_stop_reason; + auto proxy_value_type = + final_value_type ? final_value_type : &dummy_value_type; + auto proxy_final_task = + final_task_on_target ? final_task_on_target : &dummy_final_task; + + auto ret_value = GetValueForExpressionPath_Impl(expression, proxy_stop_reason, + proxy_value_type, options, + proxy_final_task); + + // The caller knows nothing happened if `final_task_on_target` doesn't change. 
+ if (!ret_value || (*proxy_value_type) != eExpressionPathEndResultTypePlain || + !final_task_on_target) + return ValueObjectSP(); + + ExpressionPathAftermath &final_task_on_target_ref = (*final_task_on_target); + ExpressionPathScanEndReason stop_reason_for_error; + Status error; + // The method can only dereference and take the address of plain objects. + switch (final_task_on_target_ref) { + case eExpressionPathAftermathNothing: + return ret_value; + + case eExpressionPathAftermathDereference: + ret_value = ret_value->Dereference(error); + stop_reason_for_error = eExpressionPathScanEndReasonDereferencingFailed; + break; + + case eExpressionPathAftermathTakeAddress: + ret_value = ret_value->AddressOf(error); + stop_reason_for_error = eExpressionPathScanEndReasonTakingAddressFailed; + break; + } + + if (ret_value && error.Success()) { + final_task_on_target_ref = eExpressionPathAftermathNothing; + return ret_value; } - return ret_val; // final_task_on_target will still have its original value, so - // you know I did not do it + + if (reason_to_stop) + *reason_to_stop = stop_reason_for_error; + + if (final_value_type) + *final_value_type = eExpressionPathEndResultTypeInvalid; + return ValueObjectSP(); } ValueObjectSP ValueObject::GetValueForExpressionPath_Impl( @@ -2686,39 +2676,47 @@ ValueObjectSP ValueObject::AddressOf(Status &error) { const bool scalar_is_load_address = false; addr_t addr = GetAddressOf(scalar_is_load_address, &address_type); error.Clear(); - if (addr != LLDB_INVALID_ADDRESS && address_type != eAddressTypeHost) { - switch (address_type) { - case eAddressTypeInvalid: { - StreamString expr_path_strm; - GetExpressionPath(expr_path_strm); - error.SetErrorStringWithFormat("'%s' is not in memory", - expr_path_strm.GetData()); - } break; - case eAddressTypeFile: - case eAddressTypeLoad: { - CompilerType compiler_type = GetCompilerType(); - if (compiler_type) { - std::string name(1, '&'); - name.append(m_name.AsCString("")); - ExecutionContext exe_ctx(GetExecutionContextRef()); - m_addr_of_valobj_sp = ValueObjectConstResult::Create( - exe_ctx.GetBestExecutionContextScope(), - compiler_type.GetPointerType(), ConstString(name.c_str()), addr, - eAddressTypeInvalid, m_data.GetAddressByteSize()); - } - } break; - default: - break; - } - } else { - StreamString expr_path_strm; - GetExpressionPath(expr_path_strm); + StreamString expr_path_strm; + GetExpressionPath(expr_path_strm); + const char *expr_path_str = expr_path_strm.GetData(); + + ExecutionContext exe_ctx(GetExecutionContextRef()); + auto scope = exe_ctx.GetBestExecutionContextScope(); + + if (addr == LLDB_INVALID_ADDRESS) { error.SetErrorStringWithFormat("'%s' doesn't have a valid address", - expr_path_strm.GetData()); + expr_path_str); + return ValueObjectSP(); } - return m_addr_of_valobj_sp; + switch (address_type) { + case eAddressTypeInvalid: + error.SetErrorStringWithFormat("'%s' is not in memory", expr_path_str); + return ValueObjectSP(); + + case eAddressTypeHost: + error.SetErrorStringWithFormat("'%s' is in host process (LLDB) memory", + expr_path_str); + return ValueObjectSP(); + + case eAddressTypeFile: + case eAddressTypeLoad: { + CompilerType compiler_type = GetCompilerType(); + if (!compiler_type) { + error.SetErrorStringWithFormat("'%s' doesn't have a compiler type", + expr_path_str); + return ValueObjectSP(); + } + + std::string name(1, '&'); + name.append(m_name.AsCString("")); + m_addr_of_valobj_sp = ValueObjectConstResult::Create( + scope, compiler_type.GetPointerType(), ConstString(name.c_str()), addr, + 
eAddressTypeInvalid, m_data.GetAddressByteSize()); + return m_addr_of_valobj_sp; + } + } } ValueObjectSP ValueObject::DoCast(const CompilerType &compiler_type) { From 03a61d34ebf4f8eeaa6861bec3ab39c75bb41778 Mon Sep 17 00:00:00 2001 From: Paul Kirth Date: Tue, 23 Jan 2024 16:16:07 -0800 Subject: [PATCH 713/843] [RISCV] Support TLSDESC in the RISC-V backend (#66915) This patch adds basic TLSDESC support in the RISC-V backend. Specifically, we add new relocation types for TLSDESC, as prescribed in https://github.com/riscv-non-isa/riscv-elf-psabi-doc/pull/373, and add a new pseudo instruction to simplify code generation. This patch does not try to optimize the local dynamic case, which can be improved in separate patches. Linker side changes will also be handled separately. The current implementation is only enabled when passing the new `-enable-tlsdesc` codegen flag. --- .../llvm/BinaryFormat/ELFRelocs/RISCV.def | 5 + llvm/include/llvm/CodeGen/CommandFlags.h | 3 + llvm/include/llvm/Target/TargetMachine.h | 3 + llvm/include/llvm/Target/TargetOptions.h | 17 +-- llvm/include/llvm/TargetParser/Triple.h | 7 ++ llvm/lib/CodeGen/CommandFlags.cpp | 8 ++ .../Target/RISCV/AsmParser/RISCVAsmParser.cpp | 63 +++++++++-- .../RISCV/MCTargetDesc/RISCVAsmBackend.cpp | 10 ++ .../Target/RISCV/MCTargetDesc/RISCVBaseInfo.h | 6 +- .../MCTargetDesc/RISCVELFObjectWriter.cpp | 15 +++ .../RISCV/MCTargetDesc/RISCVFixupKinds.h | 6 ++ .../RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp | 40 +++++++ .../Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp | 18 ++++ .../Target/RISCV/MCTargetDesc/RISCVMCExpr.h | 4 + llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp | 12 +++ .../Target/RISCV/RISCVExpandPseudoInsts.cpp | 51 +++++++++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 20 +++- llvm/lib/Target/RISCV/RISCVISelLowering.h | 1 + llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 6 +- llvm/lib/Target/RISCV/RISCVInstrInfo.td | 29 +++++ llvm/lib/Target/TargetMachine.cpp | 1 + llvm/test/CodeGen/RISCV/tls-models.ll | 102 ++++++++++++++++++ llvm/test/MC/RISCV/relocations.s | 21 ++++ llvm/test/MC/RISCV/tlsdesc.s | 50 +++++++++ 24 files changed, 479 insertions(+), 19 deletions(-) create mode 100644 llvm/test/MC/RISCV/tlsdesc.s diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def index b478799c91fb2..d4be34e3b37e5 100644 --- a/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def +++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/RISCV.def @@ -15,6 +15,7 @@ ELF_RELOC(R_RISCV_TLS_DTPREL32, 8) ELF_RELOC(R_RISCV_TLS_DTPREL64, 9) ELF_RELOC(R_RISCV_TLS_TPREL32, 10) ELF_RELOC(R_RISCV_TLS_TPREL64, 11) +ELF_RELOC(R_RISCV_TLSDESC, 12) ELF_RELOC(R_RISCV_BRANCH, 16) ELF_RELOC(R_RISCV_JAL, 17) ELF_RELOC(R_RISCV_CALL, 18) @@ -56,3 +57,7 @@ ELF_RELOC(R_RISCV_IRELATIVE, 58) ELF_RELOC(R_RISCV_PLT32, 59) ELF_RELOC(R_RISCV_SET_ULEB128, 60) ELF_RELOC(R_RISCV_SUB_ULEB128, 61) +ELF_RELOC(R_RISCV_TLSDESC_HI20, 62) +ELF_RELOC(R_RISCV_TLSDESC_LOAD_LO12, 63) +ELF_RELOC(R_RISCV_TLSDESC_ADD_LO12, 64) +ELF_RELOC(R_RISCV_TLSDESC_CALL, 65) diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h index 6407dde5bcd6c..bf166a63edb07 100644 --- a/llvm/include/llvm/CodeGen/CommandFlags.h +++ b/llvm/include/llvm/CodeGen/CommandFlags.h @@ -117,6 +117,9 @@ unsigned getTLSSize(); bool getEmulatedTLS(); std::optional getExplicitEmulatedTLS(); +bool getEnableTLSDESC(); +std::optional getExplicitEnableTLSDESC(); + bool getUniqueSectionNames(); bool getUniqueBasicBlockSectionNames(); diff --git 
a/llvm/include/llvm/Target/TargetMachine.h b/llvm/include/llvm/Target/TargetMachine.h index 1fe47dec70b16..a522a12299bb0 100644 --- a/llvm/include/llvm/Target/TargetMachine.h +++ b/llvm/include/llvm/Target/TargetMachine.h @@ -248,6 +248,9 @@ class TargetMachine { /// Returns true if this target uses emulated TLS. bool useEmulatedTLS() const; + /// Returns true if this target uses TLS Descriptors. + bool useTLSDESC() const; + /// Returns the TLS model which should be used for the given global variable. TLSModel::Model getTLSModel(const GlobalValue *GV) const; diff --git a/llvm/include/llvm/Target/TargetOptions.h b/llvm/include/llvm/Target/TargetOptions.h index 4df897c047a38..d02d1699813c9 100644 --- a/llvm/include/llvm/Target/TargetOptions.h +++ b/llvm/include/llvm/Target/TargetOptions.h @@ -145,13 +145,13 @@ namespace llvm { IgnoreXCOFFVisibility(false), XCOFFTracebackTable(true), UniqueSectionNames(true), UniqueBasicBlockSectionNames(false), TrapUnreachable(false), NoTrapAfterNoreturn(false), TLSSize(0), - EmulatedTLS(false), EnableIPRA(false), EmitStackSizeSection(false), - EnableMachineOutliner(false), EnableMachineFunctionSplitter(false), - SupportsDefaultOutlining(false), EmitAddrsig(false), - EmitCallSiteInfo(false), SupportsDebugEntryValues(false), - EnableDebugEntryValues(false), ValueTrackingVariableLocations(false), - ForceDwarfFrameSection(false), XRayFunctionIndex(true), - DebugStrictDwarf(false), Hotpatch(false), + EmulatedTLS(false), EnableTLSDESC(false), EnableIPRA(false), + EmitStackSizeSection(false), EnableMachineOutliner(false), + EnableMachineFunctionSplitter(false), SupportsDefaultOutlining(false), + EmitAddrsig(false), EmitCallSiteInfo(false), + SupportsDebugEntryValues(false), EnableDebugEntryValues(false), + ValueTrackingVariableLocations(false), ForceDwarfFrameSection(false), + XRayFunctionIndex(true), DebugStrictDwarf(false), Hotpatch(false), PPCGenScalarMASSEntries(false), JMCInstrument(false), EnableCFIFixup(false), MisExpect(false), XCOFFReadOnlyPointers(false), FPDenormalMode(DenormalMode::IEEE, DenormalMode::IEEE) {} @@ -295,6 +295,9 @@ namespace llvm { /// function in the runtime library.. unsigned EmulatedTLS : 1; + /// EnableTLSDESC - This flag enables TLS Descriptors. + unsigned EnableTLSDESC : 1; + /// This flag enables InterProcedural Register Allocation (IPRA). unsigned EnableIPRA : 1; diff --git a/llvm/include/llvm/TargetParser/Triple.h b/llvm/include/llvm/TargetParser/Triple.h index 1f8c7a060e249..870dc75b1c1f8 100644 --- a/llvm/include/llvm/TargetParser/Triple.h +++ b/llvm/include/llvm/TargetParser/Triple.h @@ -1033,6 +1033,13 @@ class Triple { isWindowsCygwinEnvironment() || isOHOSFamily(); } + /// Tests whether the target uses TLS Descriptor by default. + bool hasDefaultTLSDESC() const { + // TODO: Improve check for other platforms, like Android, and RISC-V + // Note: This is currently only used on RISC-V. + return isOSBinFormatELF() && isAArch64(); + } + /// Tests whether the target uses -data-sections as default. 
bool hasDefaultDataSections() const { return isOSBinFormatXCOFF() || isWasm(); diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp index c6d7827f36dfd..51406fb287e66 100644 --- a/llvm/lib/CodeGen/CommandFlags.cpp +++ b/llvm/lib/CodeGen/CommandFlags.cpp @@ -93,6 +93,7 @@ CGOPT(bool, XCOFFTracebackTable) CGOPT(std::string, BBSections) CGOPT(unsigned, TLSSize) CGOPT_EXP(bool, EmulatedTLS) +CGOPT_EXP(bool, EnableTLSDESC) CGOPT(bool, UniqueSectionNames) CGOPT(bool, UniqueBasicBlockSectionNames) CGOPT(EABI, EABIVersion) @@ -404,6 +405,11 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() { "emulated-tls", cl::desc("Use emulated TLS model"), cl::init(false)); CGBINDOPT(EmulatedTLS); + static cl::opt EnableTLSDESC( + "enable-tlsdesc", cl::desc("Enable the use of TLS Descriptors"), + cl::init(false)); + CGBINDOPT(EnableTLSDESC); + static cl::opt UniqueSectionNames( "unique-section-names", cl::desc("Give unique names to every section"), cl::init(true)); @@ -568,6 +574,8 @@ codegen::InitTargetOptionsFromCodeGenFlags(const Triple &TheTriple) { Options.TLSSize = getTLSSize(); Options.EmulatedTLS = getExplicitEmulatedTLS().value_or(TheTriple.hasDefaultEmulatedTLS()); + Options.EnableTLSDESC = + getExplicitEnableTLSDESC().value_or(TheTriple.hasDefaultTLSDESC()); Options.ExceptionModel = getExceptionModel(); Options.EmitStackSizeSection = getEnableStackSizeSection(); Options.EnableMachineFunctionSplitter = getEnableMachineFunctionSplitter(); diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index 7d42481db57fa..f6e8386aff451 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -169,6 +169,12 @@ class RISCVAsmParser : public MCTargetAsmParser { // 'add' is an overloaded mnemonic. bool checkPseudoAddTPRel(MCInst &Inst, OperandVector &Operands); + // Checks that a PseudoTLSDESCCall is using x5/t0 in its output operand. + // Enforcing this using a restricted register class for the output + // operand of PseudoTLSDESCCall results in a poor diagnostic due to the fact + // 'jalr' is an overloaded mnemonic. + bool checkPseudoTLSDESCCall(MCInst &Inst, OperandVector &Operands); + // Check instruction constraints. bool validateInstruction(MCInst &Inst, OperandVector &Operands); @@ -549,6 +555,16 @@ struct RISCVOperand final : public MCParsedAsmOperand { VK == RISCVMCExpr::VK_RISCV_TPREL_ADD; } + bool isTLSDESCCallSymbol() const { + int64_t Imm; + RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None; + // Must be of 'immediate' type but not a constant. + if (!isImm() || evaluateConstantImm(getImm(), Imm, VK)) + return false; + return RISCVAsmParser::classifySymbolRef(getImm(), VK) && + VK == RISCVMCExpr::VK_RISCV_TLSDESC_CALL; + } + bool isCSRSystemRegister() const { return isSystemRegister(); } bool isVTypeImm(unsigned N) const { @@ -601,7 +617,10 @@ struct RISCVOperand final : public MCParsedAsmOperand { if (!isImm()) return false; bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK); - if (VK == RISCVMCExpr::VK_RISCV_LO || VK == RISCVMCExpr::VK_RISCV_PCREL_LO) + if (VK == RISCVMCExpr::VK_RISCV_LO || + VK == RISCVMCExpr::VK_RISCV_PCREL_LO || + VK == RISCVMCExpr::VK_RISCV_TLSDESC_LOAD_LO || + VK == RISCVMCExpr::VK_RISCV_TLSDESC_ADD_LO) return true; // Given only Imm, ensuring that the actually specified constant is either // a signed or unsigned 64-bit number is unfortunately impossible. 
@@ -854,7 +873,9 @@ struct RISCVOperand final : public MCParsedAsmOperand { return IsValid && ((IsConstantImm && VK == RISCVMCExpr::VK_RISCV_None) || VK == RISCVMCExpr::VK_RISCV_LO || VK == RISCVMCExpr::VK_RISCV_PCREL_LO || - VK == RISCVMCExpr::VK_RISCV_TPREL_LO); + VK == RISCVMCExpr::VK_RISCV_TPREL_LO || + VK == RISCVMCExpr::VK_RISCV_TLSDESC_LOAD_LO || + VK == RISCVMCExpr::VK_RISCV_TLSDESC_ADD_LO); } bool isSImm12Lsb0() const { return isBareSimmNLsb0<12>(); } @@ -911,14 +932,16 @@ struct RISCVOperand final : public MCParsedAsmOperand { return IsValid && (VK == RISCVMCExpr::VK_RISCV_PCREL_HI || VK == RISCVMCExpr::VK_RISCV_GOT_HI || VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI || - VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI); - } else { - return isUInt<20>(Imm) && (VK == RISCVMCExpr::VK_RISCV_None || - VK == RISCVMCExpr::VK_RISCV_PCREL_HI || - VK == RISCVMCExpr::VK_RISCV_GOT_HI || - VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI || - VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI); + VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI || + VK == RISCVMCExpr::VK_RISCV_TLSDESC_HI); } + + return isUInt<20>(Imm) && (VK == RISCVMCExpr::VK_RISCV_None || + VK == RISCVMCExpr::VK_RISCV_PCREL_HI || + VK == RISCVMCExpr::VK_RISCV_GOT_HI || + VK == RISCVMCExpr::VK_RISCV_TLS_GOT_HI || + VK == RISCVMCExpr::VK_RISCV_TLS_GD_HI || + VK == RISCVMCExpr::VK_RISCV_TLSDESC_HI); } bool isSImm21Lsb0JAL() const { return isBareSimmNLsb0<21>(); } @@ -1556,6 +1579,11 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); return Error(ErrorLoc, "operand must be a symbol with %tprel_add modifier"); } + case Match_InvalidTLSDESCCallSymbol: { + SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); + return Error(ErrorLoc, + "operand must be a symbol with %tlsdesc_call modifier"); + } case Match_InvalidRTZArg: { SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc(); return Error(ErrorLoc, "operand must be 'rtz' floating-point rounding mode"); @@ -3324,6 +3352,19 @@ bool RISCVAsmParser::checkPseudoAddTPRel(MCInst &Inst, return false; } +bool RISCVAsmParser::checkPseudoTLSDESCCall(MCInst &Inst, + OperandVector &Operands) { + assert(Inst.getOpcode() == RISCV::PseudoTLSDESCCall && "Invalid instruction"); + assert(Inst.getOperand(0).isReg() && "Unexpected operand kind"); + if (Inst.getOperand(0).getReg() != RISCV::X5) { + SMLoc ErrorLoc = ((RISCVOperand &)*Operands[3]).getStartLoc(); + return Error(ErrorLoc, "the output operand must be t0/x5 when using " + "%tlsdesc_call modifier"); + } + + return false; +} + std::unique_ptr RISCVAsmParser::defaultMaskRegOp() const { return RISCVOperand::createReg(RISCV::NoRegister, llvm::SMLoc(), llvm::SMLoc()); @@ -3559,6 +3600,10 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, if (checkPseudoAddTPRel(Inst, Operands)) return true; break; + case RISCV::PseudoTLSDESCCall: + if (checkPseudoTLSDESCCall(Inst, Operands)) + return true; + break; case RISCV::PseudoSEXT_B: emitPseudoExtend(Inst, /*SignExtend=*/true, /*Width=*/8, IDLoc, Out); return false; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp index 7ce08eabdeb61..bd49875c9591d 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp @@ -86,6 +86,12 @@ RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { {"fixup_riscv_call_plt", 0, 64, MCFixupKindInfo::FKF_IsPCRel}, 
{"fixup_riscv_relax", 0, 0, 0}, {"fixup_riscv_align", 0, 0, 0}, + + {"fixup_riscv_tlsdesc_hi20", 12, 20, + MCFixupKindInfo::FKF_IsPCRel | MCFixupKindInfo::FKF_IsTarget}, + {"fixup_riscv_tlsdesc_load_lo12", 20, 12, 0}, + {"fixup_riscv_tlsdesc_add_lo12", 20, 12, 0}, + {"fixup_riscv_tlsdesc_call", 0, 0, 0}, }; static_assert((std::size(Infos)) == RISCV::NumTargetFixupKinds, "Not all fixup kinds added to Infos array"); @@ -126,6 +132,7 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm, case RISCV::fixup_riscv_got_hi20: case RISCV::fixup_riscv_tls_got_hi20: case RISCV::fixup_riscv_tls_gd_hi20: + case RISCV::fixup_riscv_tlsdesc_hi20: return true; } @@ -411,6 +418,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case RISCV::fixup_riscv_got_hi20: case RISCV::fixup_riscv_tls_got_hi20: case RISCV::fixup_riscv_tls_gd_hi20: + case RISCV::fixup_riscv_tlsdesc_hi20: llvm_unreachable("Relocation should be unconditionally forced\n"); case FK_Data_1: case FK_Data_2: @@ -421,6 +429,7 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value, case RISCV::fixup_riscv_lo12_i: case RISCV::fixup_riscv_pcrel_lo12_i: case RISCV::fixup_riscv_tprel_lo12_i: + case RISCV::fixup_riscv_tlsdesc_load_lo12: return Value & 0xfff; case RISCV::fixup_riscv_12_i: if (!isInt<12>(Value)) { @@ -524,6 +533,7 @@ bool RISCVAsmBackend::evaluateTargetFixup( switch (Fixup.getTargetKind()) { default: llvm_unreachable("Unexpected fixup kind!"); + case RISCV::fixup_riscv_tlsdesc_hi20: case RISCV::fixup_riscv_pcrel_hi20: AUIPCFixup = &Fixup; AUIPCDF = DF; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h index 433e2e6f80bd6..d7f7859ce4399 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVBaseInfo.h @@ -264,11 +264,15 @@ enum { MO_TPREL_ADD = 10, MO_TLS_GOT_HI = 11, MO_TLS_GD_HI = 12, + MO_TLSDESC_HI = 13, + MO_TLSDESC_LOAD_LO = 14, + MO_TLSDESC_ADD_LO = 15, + MO_TLSDESC_CALL = 16, // Used to differentiate between target-specific "direct" flags and "bitmask" // flags. A machine operand can only have one "direct" flag, but can have // multiple "bitmask" flags. 
- MO_DIRECT_FLAG_MASK = 15 + MO_DIRECT_FLAG_MASK = 31 }; } // namespace RISCVII diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp index 76e5b3ed40254..2343c5fb2535a 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp @@ -77,6 +77,14 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, return ELF::R_RISCV_TLS_GOT_HI20; case RISCV::fixup_riscv_tls_gd_hi20: return ELF::R_RISCV_TLS_GD_HI20; + case RISCV::fixup_riscv_tlsdesc_hi20: + return ELF::R_RISCV_TLSDESC_HI20; + case RISCV::fixup_riscv_tlsdesc_load_lo12: + return ELF::R_RISCV_TLSDESC_LOAD_LO12; + case RISCV::fixup_riscv_tlsdesc_add_lo12: + return ELF::R_RISCV_TLSDESC_ADD_LO12; + case RISCV::fixup_riscv_tlsdesc_call: + return ELF::R_RISCV_TLSDESC_CALL; case RISCV::fixup_riscv_jal: return ELF::R_RISCV_JAL; case RISCV::fixup_riscv_branch: @@ -96,6 +104,13 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx, default: Ctx.reportError(Fixup.getLoc(), "unsupported relocation type"); return ELF::R_RISCV_NONE; + case RISCV::fixup_riscv_tlsdesc_load_lo12: + return ELF::R_RISCV_TLSDESC_LOAD_LO12; + case RISCV::fixup_riscv_tlsdesc_add_lo12: + return ELF::R_RISCV_TLSDESC_ADD_LO12; + case RISCV::fixup_riscv_tlsdesc_call: + return ELF::R_RISCV_TLSDESC_CALL; + case FK_Data_1: Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported"); return ELF::R_RISCV_NONE; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h index 74bd9398a9ef6..821372d3d39a6 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVFixupKinds.h @@ -71,6 +71,12 @@ enum Fixups { // Used to generate an R_RISCV_ALIGN relocation, which indicates the linker // should fixup the alignment after linker relaxation. fixup_riscv_align, + // Fixups indicating a TLS descriptor code sequence, corresponding to auipc, + // lw/ld, addi, and jalr, respectively. 
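+  // As exercised by the tests in this patch, the emitted sequence looks like:
+  //   auipc a0, %tlsdesc_hi(sym)                    // fixup_riscv_tlsdesc_hi20
+  //   lw    a1, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a0) // fixup_riscv_tlsdesc_load_lo12
+  //   addi  a0, a0, %tlsdesc_add_lo(.Ltlsdesc_hi0)  // fixup_riscv_tlsdesc_add_lo12
+  //   jalr  t0, 0(a1), %tlsdesc_call(.Ltlsdesc_hi0) // fixup_riscv_tlsdesc_call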
+  fixup_riscv_tlsdesc_hi20,
+  fixup_riscv_tlsdesc_load_lo12,
+  fixup_riscv_tlsdesc_add_lo12,
+  fixup_riscv_tlsdesc_call,

   // Used as a sentinel, must be the last
   fixup_riscv_invalid,

diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index 82fed50bce753..5ea386c3c32a3 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -57,6 +57,10 @@ class RISCVMCCodeEmitter : public MCCodeEmitter {
                           SmallVectorImpl<MCFixup> &Fixups,
                           const MCSubtargetInfo &STI) const;

+  void expandTLSDESCCall(const MCInst &MI, SmallVectorImpl<char> &CB,
+                         SmallVectorImpl<MCFixup> &Fixups,
+                         const MCSubtargetInfo &STI) const;
+
   void expandAddTPRel(const MCInst &MI, SmallVectorImpl<char> &CB,
                       SmallVectorImpl<MCFixup> &Fixups,
                       const MCSubtargetInfo &STI) const;
@@ -154,6 +158,26 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI,
   support::endian::write(CB, Binary, llvm::endianness::little);
 }

+void RISCVMCCodeEmitter::expandTLSDESCCall(const MCInst &MI,
+                                           SmallVectorImpl<char> &CB,
+                                           SmallVectorImpl<MCFixup> &Fixups,
+                                           const MCSubtargetInfo &STI) const {
+  MCOperand SrcSymbol = MI.getOperand(3);
+  assert(SrcSymbol.isExpr() &&
+         "Expected expression as first input to TLSDESCCALL");
+  const RISCVMCExpr *Expr = dyn_cast<RISCVMCExpr>(SrcSymbol.getExpr());
+  MCRegister Link = MI.getOperand(0).getReg();
+  MCRegister Dest = MI.getOperand(1).getReg();
+  int64_t Imm = MI.getOperand(2).getImm();
+  Fixups.push_back(MCFixup::create(
+      0, Expr, MCFixupKind(RISCV::fixup_riscv_tlsdesc_call), MI.getLoc()));
+  MCInst Call =
+      MCInstBuilder(RISCV::JALR).addReg(Link).addReg(Dest).addImm(Imm);
+
+  uint32_t Binary = getBinaryCodeForInstr(Call, Fixups, STI);
+  support::endian::write(CB, Binary, llvm::endianness::little);
+}
+
 // Expand PseudoAddTPRel to a simple ADD with the correct relocation.
void RISCVMCCodeEmitter::expandAddTPRel(const MCInst &MI, SmallVectorImpl &CB, @@ -303,6 +327,10 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, expandLongCondBr(MI, CB, Fixups, STI); MCNumEmitted += 2; return; + case RISCV::PseudoTLSDESCCall: + expandTLSDESCCall(MI, CB, Fixups, STI); + MCNumEmitted += 1; + return; } switch (Size) { @@ -445,6 +473,18 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo, FixupKind = RISCV::fixup_riscv_call_plt; RelaxCandidate = true; break; + case RISCVMCExpr::VK_RISCV_TLSDESC_HI: + FixupKind = RISCV::fixup_riscv_tlsdesc_hi20; + break; + case RISCVMCExpr::VK_RISCV_TLSDESC_LOAD_LO: + FixupKind = RISCV::fixup_riscv_tlsdesc_load_lo12; + break; + case RISCVMCExpr::VK_RISCV_TLSDESC_ADD_LO: + FixupKind = RISCV::fixup_riscv_tlsdesc_add_lo12; + break; + case RISCVMCExpr::VK_RISCV_TLSDESC_CALL: + FixupKind = RISCV::fixup_riscv_tlsdesc_call; + break; } } else if ((Kind == MCExpr::SymbolRef && cast(Expr)->getKind() == diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp index 64ddae61b1bc1..254a9a4bc0ef0 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp @@ -79,6 +79,7 @@ const MCFixup *RISCVMCExpr::getPCRelHiFixup(const MCFragment **DFOut) const { case RISCV::fixup_riscv_tls_got_hi20: case RISCV::fixup_riscv_tls_gd_hi20: case RISCV::fixup_riscv_pcrel_hi20: + case RISCV::fixup_riscv_tlsdesc_hi20: if (DFOut) *DFOut = DF; return &F; @@ -119,6 +120,10 @@ RISCVMCExpr::VariantKind RISCVMCExpr::getVariantKindForName(StringRef name) { .Case("tprel_add", VK_RISCV_TPREL_ADD) .Case("tls_ie_pcrel_hi", VK_RISCV_TLS_GOT_HI) .Case("tls_gd_pcrel_hi", VK_RISCV_TLS_GD_HI) + .Case("tlsdesc_hi", VK_RISCV_TLSDESC_HI) + .Case("tlsdesc_load_lo", VK_RISCV_TLSDESC_LOAD_LO) + .Case("tlsdesc_add_lo", VK_RISCV_TLSDESC_ADD_LO) + .Case("tlsdesc_call", VK_RISCV_TLSDESC_CALL) .Default(VK_RISCV_Invalid); } @@ -145,6 +150,14 @@ StringRef RISCVMCExpr::getVariantKindName(VariantKind Kind) { return "tprel_add"; case VK_RISCV_TLS_GOT_HI: return "tls_ie_pcrel_hi"; + case VK_RISCV_TLSDESC_HI: + return "tlsdesc_hi"; + case VK_RISCV_TLSDESC_LOAD_LO: + return "tlsdesc_load_lo"; + case VK_RISCV_TLSDESC_ADD_LO: + return "tlsdesc_add_lo"; + case VK_RISCV_TLSDESC_CALL: + return "tlsdesc_call"; case VK_RISCV_TLS_GD_HI: return "tls_gd_pcrel_hi"; case VK_RISCV_CALL: @@ -193,6 +206,9 @@ void RISCVMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const { case VK_RISCV_TPREL_HI: case VK_RISCV_TLS_GOT_HI: case VK_RISCV_TLS_GD_HI: + case VK_RISCV_TLSDESC_HI: + case VK_RISCV_TLSDESC_ADD_LO: + case VK_RISCV_TLSDESC_LOAD_LO: break; } @@ -206,6 +222,8 @@ bool RISCVMCExpr::evaluateAsConstant(int64_t &Res) const { Kind == VK_RISCV_GOT_HI || Kind == VK_RISCV_TPREL_HI || Kind == VK_RISCV_TPREL_LO || Kind == VK_RISCV_TPREL_ADD || Kind == VK_RISCV_TLS_GOT_HI || Kind == VK_RISCV_TLS_GD_HI || + Kind == VK_RISCV_TLSDESC_HI || Kind == VK_RISCV_TLSDESC_LOAD_LO || + Kind == VK_RISCV_TLSDESC_ADD_LO || Kind == VK_RISCV_TLSDESC_CALL || Kind == VK_RISCV_CALL || Kind == VK_RISCV_CALL_PLT) return false; diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h index ee83bf0208ef4..fcc4c5c439645 100644 --- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h +++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h @@ -37,6 +37,10 @@ class RISCVMCExpr : public MCTargetExpr { VK_RISCV_CALL, VK_RISCV_CALL_PLT, 
VK_RISCV_32_PCREL, + VK_RISCV_TLSDESC_HI, + VK_RISCV_TLSDESC_LOAD_LO, + VK_RISCV_TLSDESC_ADD_LO, + VK_RISCV_TLSDESC_CALL, VK_RISCV_Invalid // Must be the last item }; diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp index aee6ec05f1f9c..b2e9cd87373b0 100644 --- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp +++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp @@ -780,6 +780,18 @@ static MCOperand lowerSymbolOperand(const MachineOperand &MO, MCSymbol *Sym, case RISCVII::MO_TLS_GD_HI: Kind = RISCVMCExpr::VK_RISCV_TLS_GD_HI; break; + case RISCVII::MO_TLSDESC_HI: + Kind = RISCVMCExpr::VK_RISCV_TLSDESC_HI; + break; + case RISCVII::MO_TLSDESC_LOAD_LO: + Kind = RISCVMCExpr::VK_RISCV_TLSDESC_LOAD_LO; + break; + case RISCVII::MO_TLSDESC_ADD_LO: + Kind = RISCVMCExpr::VK_RISCV_TLSDESC_ADD_LO; + break; + case RISCVII::MO_TLSDESC_CALL: + Kind = RISCVMCExpr::VK_RISCV_TLSDESC_CALL; + break; } const MCExpr *ME = diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp index ed2b1ceb7d6f0..0a314fdd41cbe 100644 --- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp +++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp @@ -423,6 +423,10 @@ class RISCVPreRAExpandPseudo : public MachineFunctionPass { bool expandLoadTLSGDAddress(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI); + bool expandLoadTLSDescAddress(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI); + #ifndef NDEBUG unsigned getInstSizeInBytes(const MachineFunction &MF) const { unsigned Size = 0; @@ -481,6 +485,8 @@ bool RISCVPreRAExpandPseudo::expandMI(MachineBasicBlock &MBB, return expandLoadTLSIEAddress(MBB, MBBI, NextMBBI); case RISCV::PseudoLA_TLS_GD: return expandLoadTLSGDAddress(MBB, MBBI, NextMBBI); + case RISCV::PseudoLA_TLSDESC: + return expandLoadTLSDescAddress(MBB, MBBI, NextMBBI); } return false; } @@ -547,6 +553,51 @@ bool RISCVPreRAExpandPseudo::expandLoadTLSGDAddress( RISCV::ADDI); } +bool RISCVPreRAExpandPseudo::expandLoadTLSDescAddress( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, + MachineBasicBlock::iterator &NextMBBI) { + MachineFunction *MF = MBB.getParent(); + MachineInstr &MI = *MBBI; + DebugLoc DL = MI.getDebugLoc(); + + const auto &STI = MF->getSubtarget(); + unsigned SecondOpcode = STI.is64Bit() ? 
RISCV::LD : RISCV::LW;
+
+  Register FinalReg = MI.getOperand(0).getReg();
+  Register DestReg =
+      MF->getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
+  Register ScratchReg =
+      MF->getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
+
+  MachineOperand &Symbol = MI.getOperand(1);
+  Symbol.setTargetFlags(RISCVII::MO_TLSDESC_HI);
+  MCSymbol *AUIPCSymbol = MF->getContext().createNamedTempSymbol("tlsdesc_hi");
+
+  MachineInstr *MIAUIPC =
+      BuildMI(MBB, MBBI, DL, TII->get(RISCV::AUIPC), ScratchReg).add(Symbol);
+  MIAUIPC->setPreInstrSymbol(*MF, AUIPCSymbol);
+
+  BuildMI(MBB, MBBI, DL, TII->get(SecondOpcode), DestReg)
+      .addReg(ScratchReg)
+      .addSym(AUIPCSymbol, RISCVII::MO_TLSDESC_LOAD_LO);
+
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADDI), RISCV::X10)
+      .addReg(ScratchReg)
+      .addSym(AUIPCSymbol, RISCVII::MO_TLSDESC_ADD_LO);
+
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::PseudoTLSDESCCall), RISCV::X5)
+      .addReg(DestReg)
+      .addImm(0)
+      .addSym(AUIPCSymbol, RISCVII::MO_TLSDESC_CALL);
+
+  BuildMI(MBB, MBBI, DL, TII->get(RISCV::ADD), FinalReg)
+      .addReg(RISCV::X10)
+      .addReg(RISCV::X4);
+
+  MI.eraseFromParent();
+  return true;
+}
+
 } // end of anonymous namespace

 INITIALIZE_PASS(RISCVExpandPseudo, "riscv-expand-pseudo",

diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index e39888637062c..47c6cd6e5487b 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -6982,6 +6982,23 @@ SDValue RISCVTargetLowering::getDynamicTLSAddr(GlobalAddressSDNode *N,
   return LowerCallTo(CLI).first;
 }

+SDValue RISCVTargetLowering::getTLSDescAddr(GlobalAddressSDNode *N,
+                                            SelectionDAG &DAG) const {
+  SDLoc DL(N);
+  EVT Ty = getPointerTy(DAG.getDataLayout());
+  const GlobalValue *GV = N->getGlobal();
+
+  // Use a PC-relative addressing mode to access the global dynamic GOT
+  // address. This generates the pattern (PseudoLA_TLSDESC sym), which
+  // expands to
+  //
+  //  auipc tX, %tlsdesc_hi(symbol)         // R_RISCV_TLSDESC_HI20(symbol)
+  //  lw    tY, %tlsdesc_load_lo(label)(tX) // R_RISCV_TLSDESC_LOAD_LO12(label)
+  //  addi  a0, tX, %tlsdesc_add_lo(label)  // R_RISCV_TLSDESC_ADD_LO12(label)
+  //  jalr  t0, 0(tY)                       // R_RISCV_TLSDESC_CALL(label)
+  SDValue Addr = DAG.getTargetGlobalAddress(GV, DL, Ty, 0, 0);
+  return SDValue(DAG.getMachineNode(RISCV::PseudoLA_TLSDESC, DL, Ty, Addr), 0);
+}
+
 SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op,
                                                    SelectionDAG &DAG) const {
   GlobalAddressSDNode *N = cast<GlobalAddressSDNode>(Op);
@@ -7006,7 +7023,8 @@ SDValue RISCVTargetLowering::lowerGlobalTLSAddress(SDValue Op,
     break;
   case TLSModel::LocalDynamic:
   case TLSModel::GeneralDynamic:
-    Addr = getDynamicTLSAddr(N, DAG);
+    Addr = DAG.getTarget().useTLSDESC() ?
getTLSDescAddr(N, DAG) + : getDynamicTLSAddr(N, DAG); break; } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h index a55a604616671..30b9ad7e6f6f3 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.h +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h @@ -855,6 +855,7 @@ class RISCVTargetLowering : public TargetLowering { SDValue getStaticTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG, bool UseGOT) const; SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const; + SDValue getTLSDescAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const; SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const; SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index 7c21f63584942..592962cebe897 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -2379,7 +2379,11 @@ RISCVInstrInfo::getSerializableDirectMachineOperandTargetFlags() const { {MO_TPREL_HI, "riscv-tprel-hi"}, {MO_TPREL_ADD, "riscv-tprel-add"}, {MO_TLS_GOT_HI, "riscv-tls-got-hi"}, - {MO_TLS_GD_HI, "riscv-tls-gd-hi"}}; + {MO_TLS_GD_HI, "riscv-tls-gd-hi"}, + {MO_TLSDESC_HI, "riscv-tlsdesc-hi"}, + {MO_TLSDESC_LOAD_LO, "riscv-tlsdesc-load-lo"}, + {MO_TLSDESC_ADD_LO, "riscv-tlsdesc-add-lo"}, + {MO_TLSDESC_CALL, "riscv-tlsdesc-call"}}; return ArrayRef(TargetFlags); } bool RISCVInstrInfo::isFunctionSafeToOutlineFrom( diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td index 792e0bbdf5816..114329c2c7c5f 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td @@ -1742,6 +1742,35 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Size = 8, isCodeGenOnly = 0, def PseudoLA_TLS_GD : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], "la.tls.gd", "$dst, $src">; +let hasSideEffects = 0, mayLoad = 1, mayStore = 0, Size = 32, isCodeGenOnly = 0 in +def PseudoLA_TLSDESC : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [], + "la.tlsdesc", "$dst, $src">; + +def TLSDESCCallSymbol : AsmOperandClass { + let Name = "TLSDESCCallSymbol"; + let RenderMethod = "addImmOperands"; + let DiagnosticType = "InvalidTLSDESCCallSymbol"; + let ParserMethod = "parseOperandWithModifier"; +} + +// A bare symbol with the %tlsdesc_call variant. +def tlsdesc_call_symbol : Operand { + let ParserMatchClass = TLSDESCCallSymbol; +} +// This is a special case of the JALR instruction used to facilitate the use of a +// fourth operand to emit a relocation on a symbol relating to this instruction. +// The relocation does not affect any bits of the instruction itself but is used +// as a hint to the linker. 
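+// For example, assembling `jalr t0, 0(a1), %tlsdesc_call(.Ltlsdesc_hi0)`
+// encodes an ordinary `jalr t0, 0(a1)` while additionally attaching an
+// R_RISCV_TLSDESC_CALL relocation against .Ltlsdesc_hi0, as checked by the
+// llvm/test/MC/RISCV/relocations.s test in this patch.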
+let isCall = 1, isBarrier = 1, isCodeGenOnly = 0, Size = 8, hasSideEffects = 0, + mayStore = 0, mayLoad = 0 in +def PseudoTLSDESCCall : Pseudo<(outs GPR:$rd), + (ins GPR:$rs1, simm12:$imm12, tlsdesc_call_symbol:$src), [], + "jalr", "$rd, ${imm12}(${rs1}), $src">, + Sched<[WriteJalr, ReadJalr]> { + let Defs = [X10]; + let Uses = [X10]; +} + /// Sign/Zero Extends diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp index abd0fdf2390c0..0839fb22d35a8 100644 --- a/llvm/lib/Target/TargetMachine.cpp +++ b/llvm/lib/Target/TargetMachine.cpp @@ -213,6 +213,7 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M, } bool TargetMachine::useEmulatedTLS() const { return Options.EmulatedTLS; } +bool TargetMachine::useTLSDESC() const { return Options.EnableTLSDESC; } TLSModel::Model TargetMachine::getTLSModel(const GlobalValue *GV) const { bool IsPIE = GV->getParent()->getPIELevel() != PIELevel::Default; diff --git a/llvm/test/CodeGen/RISCV/tls-models.ll b/llvm/test/CodeGen/RISCV/tls-models.ll index c2ed44073baac..b99896e350191 100644 --- a/llvm/test/CodeGen/RISCV/tls-models.ll +++ b/llvm/test/CodeGen/RISCV/tls-models.ll @@ -1,10 +1,16 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -relocation-model=pic < %s \ ; RUN: | FileCheck -check-prefix=RV32-PIC %s +; RUN: llc -mtriple=riscv32 -relocation-model=pic < %s -enable-tlsdesc \ +; RUN: | FileCheck -check-prefix=RV32-PIC-TLSDESC %s ; RUN: llc -mtriple=riscv64 -relocation-model=pic < %s \ ; RUN: | FileCheck -check-prefix=RV64-PIC %s +; RUN: llc -mtriple=riscv64 -relocation-model=pic -enable-tlsdesc < %s \ +; RUN: | FileCheck -check-prefix=RV64-PIC-TLSDESC %s ; RUN: llc -mtriple=riscv32 < %s | FileCheck -check-prefix=RV32-NOPIC %s +; RUN: llc -mtriple=riscv32 < %s -enable-tlsdesc | FileCheck -check-prefix=RV32-NOPIC-TLSDESC %s ; RUN: llc -mtriple=riscv64 < %s | FileCheck -check-prefix=RV64-NOPIC %s +; RUN: llc -mtriple=riscv64 < %s -enable-tlsdesc | FileCheck -check-prefix=RV64-NOPIC-TLSDESC %s ; Check that TLS symbols are lowered correctly based on the specified ; model. 
Make sure they're external to avoid them all being optimised to Local @@ -58,6 +64,42 @@ define ptr @f1() nounwind { ; RV64-NOPIC-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi0)(a0) ; RV64-NOPIC-NEXT: add a0, a0, tp ; RV64-NOPIC-NEXT: ret +; +; RV32-PIC-TLSDESC-LABEL: f1: +; RV32-PIC-TLSDESC: # %bb.0: # %entry +; RV32-PIC-TLSDESC-NEXT: .Ltlsdesc_hi0: +; RV32-PIC-TLSDESC-NEXT: auipc a0, %tlsdesc_hi(unspecified) +; RV32-PIC-TLSDESC-NEXT: lw a1, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a0) +; RV32-PIC-TLSDESC-NEXT: addi a0, a0, %tlsdesc_add_lo(.Ltlsdesc_hi0) +; RV32-PIC-TLSDESC-NEXT: jalr t0, 0(a1), %tlsdesc_call(.Ltlsdesc_hi0) +; RV32-PIC-TLSDESC-NEXT: add a0, a0, tp +; RV32-PIC-TLSDESC-NEXT: ret +; +; RV64-PIC-TLSDESC-LABEL: f1: +; RV64-PIC-TLSDESC: # %bb.0: # %entry +; RV64-PIC-TLSDESC-NEXT: .Ltlsdesc_hi0: +; RV64-PIC-TLSDESC-NEXT: auipc a0, %tlsdesc_hi(unspecified) +; RV64-PIC-TLSDESC-NEXT: ld a1, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a0) +; RV64-PIC-TLSDESC-NEXT: addi a0, a0, %tlsdesc_add_lo(.Ltlsdesc_hi0) +; RV64-PIC-TLSDESC-NEXT: jalr t0, 0(a1), %tlsdesc_call(.Ltlsdesc_hi0) +; RV64-PIC-TLSDESC-NEXT: add a0, a0, tp +; RV64-PIC-TLSDESC-NEXT: ret +; +; RV32-NOPIC-TLSDESC-LABEL: f1: +; RV32-NOPIC-TLSDESC: # %bb.0: # %entry +; RV32-NOPIC-TLSDESC-NEXT: .Lpcrel_hi0: +; RV32-NOPIC-TLSDESC-NEXT: auipc a0, %tls_ie_pcrel_hi(unspecified) +; RV32-NOPIC-TLSDESC-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi0)(a0) +; RV32-NOPIC-TLSDESC-NEXT: add a0, a0, tp +; RV32-NOPIC-TLSDESC-NEXT: ret +; +; RV64-NOPIC-TLSDESC-LABEL: f1: +; RV64-NOPIC-TLSDESC: # %bb.0: # %entry +; RV64-NOPIC-TLSDESC-NEXT: .Lpcrel_hi0: +; RV64-NOPIC-TLSDESC-NEXT: auipc a0, %tls_ie_pcrel_hi(unspecified) +; RV64-NOPIC-TLSDESC-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi0)(a0) +; RV64-NOPIC-TLSDESC-NEXT: add a0, a0, tp +; RV64-NOPIC-TLSDESC-NEXT: ret entry: ret ptr @unspecified } @@ -144,6 +186,38 @@ define ptr @f3() nounwind { ; RV64-NOPIC-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi2)(a0) ; RV64-NOPIC-NEXT: add a0, a0, tp ; RV64-NOPIC-NEXT: ret +; +; RV32-PIC-TLSDESC-LABEL: f3: +; RV32-PIC-TLSDESC: # %bb.0: # %entry +; RV32-PIC-TLSDESC-NEXT: .Lpcrel_hi0: +; RV32-PIC-TLSDESC-NEXT: auipc a0, %tls_ie_pcrel_hi(ie) +; RV32-PIC-TLSDESC-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi0)(a0) +; RV32-PIC-TLSDESC-NEXT: add a0, a0, tp +; RV32-PIC-TLSDESC-NEXT: ret +; +; RV64-PIC-TLSDESC-LABEL: f3: +; RV64-PIC-TLSDESC: # %bb.0: # %entry +; RV64-PIC-TLSDESC-NEXT: .Lpcrel_hi0: +; RV64-PIC-TLSDESC-NEXT: auipc a0, %tls_ie_pcrel_hi(ie) +; RV64-PIC-TLSDESC-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi0)(a0) +; RV64-PIC-TLSDESC-NEXT: add a0, a0, tp +; RV64-PIC-TLSDESC-NEXT: ret +; +; RV32-NOPIC-TLSDESC-LABEL: f3: +; RV32-NOPIC-TLSDESC: # %bb.0: # %entry +; RV32-NOPIC-TLSDESC-NEXT: .Lpcrel_hi2: +; RV32-NOPIC-TLSDESC-NEXT: auipc a0, %tls_ie_pcrel_hi(ie) +; RV32-NOPIC-TLSDESC-NEXT: lw a0, %pcrel_lo(.Lpcrel_hi2)(a0) +; RV32-NOPIC-TLSDESC-NEXT: add a0, a0, tp +; RV32-NOPIC-TLSDESC-NEXT: ret +; +; RV64-NOPIC-TLSDESC-LABEL: f3: +; RV64-NOPIC-TLSDESC: # %bb.0: # %entry +; RV64-NOPIC-TLSDESC-NEXT: .Lpcrel_hi2: +; RV64-NOPIC-TLSDESC-NEXT: auipc a0, %tls_ie_pcrel_hi(ie) +; RV64-NOPIC-TLSDESC-NEXT: ld a0, %pcrel_lo(.Lpcrel_hi2)(a0) +; RV64-NOPIC-TLSDESC-NEXT: add a0, a0, tp +; RV64-NOPIC-TLSDESC-NEXT: ret entry: ret ptr @ie } @@ -179,6 +253,34 @@ define ptr @f4() nounwind { ; RV64-NOPIC-NEXT: add a0, a0, tp, %tprel_add(le) ; RV64-NOPIC-NEXT: addi a0, a0, %tprel_lo(le) ; RV64-NOPIC-NEXT: ret +; +; RV32-PIC-TLSDESC-LABEL: f4: +; RV32-PIC-TLSDESC: # %bb.0: # %entry +; RV32-PIC-TLSDESC-NEXT: lui a0, %tprel_hi(le) +; RV32-PIC-TLSDESC-NEXT: add a0, a0, tp, 
%tprel_add(le) +; RV32-PIC-TLSDESC-NEXT: addi a0, a0, %tprel_lo(le) +; RV32-PIC-TLSDESC-NEXT: ret +; +; RV64-PIC-TLSDESC-LABEL: f4: +; RV64-PIC-TLSDESC: # %bb.0: # %entry +; RV64-PIC-TLSDESC-NEXT: lui a0, %tprel_hi(le) +; RV64-PIC-TLSDESC-NEXT: add a0, a0, tp, %tprel_add(le) +; RV64-PIC-TLSDESC-NEXT: addi a0, a0, %tprel_lo(le) +; RV64-PIC-TLSDESC-NEXT: ret +; +; RV32-NOPIC-TLSDESC-LABEL: f4: +; RV32-NOPIC-TLSDESC: # %bb.0: # %entry +; RV32-NOPIC-TLSDESC-NEXT: lui a0, %tprel_hi(le) +; RV32-NOPIC-TLSDESC-NEXT: add a0, a0, tp, %tprel_add(le) +; RV32-NOPIC-TLSDESC-NEXT: addi a0, a0, %tprel_lo(le) +; RV32-NOPIC-TLSDESC-NEXT: ret +; +; RV64-NOPIC-TLSDESC-LABEL: f4: +; RV64-NOPIC-TLSDESC: # %bb.0: # %entry +; RV64-NOPIC-TLSDESC-NEXT: lui a0, %tprel_hi(le) +; RV64-NOPIC-TLSDESC-NEXT: add a0, a0, tp, %tprel_add(le) +; RV64-NOPIC-TLSDESC-NEXT: addi a0, a0, %tprel_lo(le) +; RV64-NOPIC-TLSDESC-NEXT: ret entry: ret ptr @le } diff --git a/llvm/test/MC/RISCV/relocations.s b/llvm/test/MC/RISCV/relocations.s index 262b3e44c6f00..d9d941697704c 100644 --- a/llvm/test/MC/RISCV/relocations.s +++ b/llvm/test/MC/RISCV/relocations.s @@ -176,3 +176,24 @@ bgeu a0, a1, foo # RELOC: R_RISCV_JAL # INSTR: bgeu a0, a1, foo # FIXUP: fixup A - offset: 0, value: foo, kind: fixup_riscv_branch + +.L5: +auipc a0, %tlsdesc_hi(a_symbol) +# RELOC: R_RISCV_TLSDESC_HI20 +# INST: auipc a0, 0x0 +# FIXUP: fixup A - offset: 0, value: %tlsdesc_hi(a_symbol), kind: fixup_riscv_tlsdesc_hi20 + +lw a1, %tlsdesc_load_lo(.L5)(a0) +# RELOC: R_RISCV_TLSDESC_LOAD_LO12 +# INST: lw a1, 0x0(a0) +# FIXUP: fixup A - offset: 0, value: %tlsdesc_load_lo(.L5), kind: fixup_riscv_tlsdesc_load_lo12 + +addi a0, a0, %tlsdesc_add_lo(.L5) +# RELOC: R_RISCV_TLSDESC_ADD_LO12 +# INST: addi a0, a0, 0x0 +# FIXUP: fixup A - offset: 0, value: %tlsdesc_add_lo(.L5), kind: fixup_riscv_tlsdesc_add_lo12 + +jalr t0, 0(a1), %tlsdesc_call(.L5) +# RELOC: R_RISCV_TLSDESC_CALL +# INST: jalr t0, 0x0(a1) +# FIXUP: fixup A - offset: 0, value: %tlsdesc_call(.L5), kind: fixup_riscv_tlsdesc_call diff --git a/llvm/test/MC/RISCV/tlsdesc.s b/llvm/test/MC/RISCV/tlsdesc.s new file mode 100644 index 0000000000000..f016bf1d5a60d --- /dev/null +++ b/llvm/test/MC/RISCV/tlsdesc.s @@ -0,0 +1,50 @@ +# RUN: llvm-mc -filetype=obj -triple riscv32 < %s --defsym RV32=1 | llvm-objdump -dr -M no-aliases - | FileCheck %s --check-prefixes=INST,RV32 +# RUN: llvm-mc -filetype=obj -triple riscv64 < %s | llvm-objdump -dr -M no-aliases - | FileCheck %s --check-prefixes=INST,RV64 + +# RUN: not llvm-mc -triple riscv32 < %s --defsym RV32=1 --defsym ERR=1 2>&1 | FileCheck %s --check-prefixes=ERR +# RUN: not llvm-mc -triple riscv64 < %s --defsym ERR=1 2>&1 | FileCheck %s --check-prefixes=ERR + +start: # @start +# %bb.0: # %entry +.Ltlsdesc_hi0: + auipc a0, %tlsdesc_hi(a-4) + # INST: auipc a0, 0x0 + # INST-NEXT: R_RISCV_TLSDESC_HI20 a-0x4 + auipc a0, %tlsdesc_hi(unspecified) + # INST-NEXT: auipc a0, 0x0 + # INST-NEXT: R_RISCV_TLSDESC_HI20 unspecified +.ifdef RV32 + lw a1, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a0) + # RV32: lw a1, 0x0(a0) + # RV32-NEXT: R_RISCV_TLSDESC_LOAD_LO12 .Ltlsdesc_hi0 +.else + ld a1, %tlsdesc_load_lo(.Ltlsdesc_hi0)(a0) + # RV64: ld a1, 0x0(a0) + # RV64-NEXT: R_RISCV_TLSDESC_LOAD_LO12 .Ltlsdesc_hi0 +.endif + addi a0, a0, %tlsdesc_add_lo(.Ltlsdesc_hi0) + # INST: addi a0, a0, 0x0 + # INST-NEXT: R_RISCV_TLSDESC_ADD_LO12 .Ltlsdesc_hi0 + jalr t0, 0(a1), %tlsdesc_call(.Ltlsdesc_hi0) + # INST-NEXT: jalr t0, 0x0(a1) + # INST-NEXT: R_RISCV_TLSDESC_CALL .Ltlsdesc_hi0 + add a0, a0, tp + # INST-NEXT: add a0, 
a0, tp + ret + +## Check invalid usage +.ifdef ERR + auipc x1, %tlsdesc_call(foo) # ERR: :[[#@LINE]]:12: error: operand must be a symbol with a %pcrel_hi/%got_pcrel_hi/%tls_ie_pcrel_hi/%tls_gd_pcrel_hi modifier or an integer in the range + auipc x1, %tlsdesc_call(1234) # ERR: :[[#@LINE]]:12: error: operand must be a symbol with a %pcrel_hi/%got_pcrel_hi/%tls_ie_pcrel_hi/%tls_gd_pcrel_hi modifier or an integer in the range + auipc a0, %tlsdesc_hi(a+b) # ERR: :[[#@LINE]]:12: error: operand must be a symbol with a %pcrel_hi/%got_pcrel_hi/%tls_ie_pcrel_hi/%tls_gd_pcrel_hi modifier or an integer in the range + + lw a0, t0, %tlsdesc_load_lo(a_symbol) # ERR: :[[#@LINE]]:15: error: invalid operand for instruction + lw a0, t0, %tlsdesc_load_lo(a_symbol)(a4) # ERR: :[[#@LINE]]:15: error: invalid operand for instruction + + addi a0, t0, %tlsdesc_add_lo(a_symbol)(a4) # ERR: :[[#@LINE]]:41: error: invalid operand for instruction + addi a0, %tlsdesc_add_lo(a_symbol) # ERR: :[[#@LINE]]:11: error: invalid operand for instruction + addi x1, %tlsdesc_load_lo(a_symbol)(a0) # ERR: :[[#@LINE]]:11: error: invalid operand for instruction + + jalr x5, 0(a1), %tlsdesc_hi(a_symbol) # ERR: :[[#@LINE]]:18: error: operand must be a symbol with %tlsdesc_call modifier + jalr x1, 0(a1), %tlsdesc_call(a_symbol) # ERR: :[[#@LINE]]:13: error: the output operand must be t0/x5 when using %tlsdesc_call modifier +.endif From df1e01b316b174b5aa10727bd2ee0ec1082a9f65 Mon Sep 17 00:00:00 2001 From: Jeff Niu Date: Tue, 23 Jan 2024 16:29:57 -0800 Subject: [PATCH 714/843] [mlir] Add example of `printAlias` to test dialect (NFC) (#79232) Follow-up from previous pull request. Motivate the API change with an attribute that decides between sugaring a sub-attribute or using an alias --- mlir/test/IR/print-attr-type-aliases.mlir | 14 +++++++++++ mlir/test/lib/Dialect/Test/TestAttrDefs.td | 9 +++++-- mlir/test/lib/Dialect/Test/TestAttributes.cpp | 24 +++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/mlir/test/IR/print-attr-type-aliases.mlir b/mlir/test/IR/print-attr-type-aliases.mlir index a3db1f06009d8..162eacd002283 100644 --- a/mlir/test/IR/print-attr-type-aliases.mlir +++ b/mlir/test/IR/print-attr-type-aliases.mlir @@ -54,3 +54,17 @@ // CHECK: #loc1 = loc(fused // CHECK-NOT: #map "test.op"() {alias_test = loc(fused (d0)>>>["test.mlir":10:8])} : () -> () + +// ----- + +#unalias_me = "goodbye" +#keep_aliased = "alias_test:dot_in_name" + +// CHECK: #test.conditional_alias +"test.op"() {attr = #test.conditional_alias<"hello">} : () -> () +// CHECK-NEXT: #test.conditional_alias<#test_encoding> +"test.op"() {attr = #test.conditional_alias<"alias_test:tensor_encoding">} : () -> () +// CHECK: #test.conditional_alias +"test.op"() {attr = #test.conditional_alias<#unalias_me>} : () -> () +// CHECK-NEXT: #test.conditional_alias<#test2Ealias> +"test.op"() {attr = #test.conditional_alias<#keep_aliased>} : () -> () diff --git a/mlir/test/lib/Dialect/Test/TestAttrDefs.td b/mlir/test/lib/Dialect/Test/TestAttrDefs.td index 945c54c04d47c..40f035a3e3a4e 100644 --- a/mlir/test/lib/Dialect/Test/TestAttrDefs.td +++ b/mlir/test/lib/Dialect/Test/TestAttrDefs.td @@ -332,7 +332,12 @@ def TestCopyCount : Test_Attr<"TestCopyCount"> { let assemblyFormat = "`<` $copy_count `>`"; } - - +def TestConditionalAliasAttr : Test_Attr<"TestConditionalAlias"> { + let mnemonic = "conditional_alias"; + let parameters = (ins "mlir::StringAttr":$value); + let assemblyFormat = [{ + `<` custom($value) `>` + }]; +} #endif // TEST_ATTRDEFS diff --git 
a/mlir/test/lib/Dialect/Test/TestAttributes.cpp b/mlir/test/lib/Dialect/Test/TestAttributes.cpp index c240354e5d990..8819f6cbe94e2 100644 --- a/mlir/test/lib/Dialect/Test/TestAttributes.cpp +++ b/mlir/test/lib/Dialect/Test/TestAttributes.cpp @@ -215,6 +215,30 @@ llvm::hash_code hash_value(const test::CopyCount ©Count) { return llvm::hash_value(copyCount.value); } } // namespace test + +//===----------------------------------------------------------------------===// +// TestConditionalAliasAttr +//===----------------------------------------------------------------------===// + +/// Attempt to parse the conditionally-aliased string attribute as a keyword or +/// string, else try to parse an alias. +static ParseResult parseConditionalAlias(AsmParser &p, StringAttr &value) { + std::string str; + if (succeeded(p.parseOptionalKeywordOrString(&str))) { + value = StringAttr::get(p.getContext(), str); + return success(); + } + return p.parseAttribute(value); +} + +/// Print the string attribute as an alias if it has one, otherwise print it as +/// a keyword if possible. +static void printConditionalAlias(AsmPrinter &p, StringAttr value) { + if (succeeded(p.printAlias(value))) + return; + p.printKeywordOrString(value); +} + //===----------------------------------------------------------------------===// // Tablegen Generated Definitions //===----------------------------------------------------------------------===// From 766e645d8d1e9e999386b90e3e19554d72324a66 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 23 Jan 2024 16:37:35 -0800 Subject: [PATCH 715/843] [SROA] NFC: Precommit test for pull/77678 Change-Id: I6b2346301f9bd840a0adceba4a0d03e9932af245 --- llvm/test/Transforms/SROA/vector-promotion.ll | 168 ++++++++++++++++++ 1 file changed, 168 insertions(+) diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll index 9643a51064f04..06914aa8fe167 100644 --- a/llvm/test/Transforms/SROA/vector-promotion.ll +++ b/llvm/test/Transforms/SROA/vector-promotion.ll @@ -1227,6 +1227,174 @@ define void @swap-15bytes(ptr %x, ptr %y) { ret void } +define <4 x i32> @ptrLoadStoreTys(ptr %init, i32 %val2) { +; CHECK-LABEL: @ptrLoadStoreTys( +; CHECK-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8 +; CHECK-NEXT: [[OBJ:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[OBJ]], align 16 +; CHECK-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16 +; CHECK-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8 +; CHECK-NEXT: store i32 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8 +; CHECK-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12 +; CHECK-NEXT: store i32 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4 +; CHECK-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x i32>, ptr [[OBJ]], align 16 +; CHECK-NEXT: ret <4 x i32> [[OBJ_0_SROAVAL]] +; +; DEBUG-LABEL: @ptrLoadStoreTys( +; DEBUG-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8, !dbg [[DBG492:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META487:![0-9]+]], metadata !DIExpression()), !dbg [[DBG492]] +; DEBUG-NEXT: [[OBJ:%.*]] = alloca <4 x i32>, align 16, !dbg [[DBG493:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META488:![0-9]+]], metadata !DIExpression()), !dbg [[DBG493]] +; DEBUG-NEXT: store <4 x i32> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG494:![0-9]+]] +; DEBUG-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16, 
!dbg [[DBG495:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META489:![0-9]+]], metadata !DIExpression()), !dbg [[DBG496:![0-9]+]] +; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG497:![0-9]+]] +; DEBUG-NEXT: store i32 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG497]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META490:![0-9]+]], metadata !DIExpression()), !dbg [[DBG498:![0-9]+]] +; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG499:![0-9]+]] +; DEBUG-NEXT: store i32 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG499]] +; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x i32>, ptr [[OBJ]], align 16, !dbg [[DBG500:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x i32> [[OBJ_0_SROAVAL]], metadata [[META491:![0-9]+]], metadata !DIExpression()), !dbg [[DBG500]] +; DEBUG-NEXT: ret <4 x i32> [[OBJ_0_SROAVAL]], !dbg [[DBG501:![0-9]+]] +; + %val0 = load ptr, ptr %init, align 8 + %obj = alloca <4 x i32>, align 16 + store <4 x i32> zeroinitializer, ptr %obj, align 16 + store ptr %val0, ptr %obj, align 8 + %ptr2 = getelementptr inbounds i8, ptr %obj, i64 8 + store i32 %val2, ptr %ptr2, align 4 + %ptr3 = getelementptr inbounds i8, ptr %obj, i64 12 + store i32 131072, ptr %ptr3, align 4 + %sroaval = load <4 x i32>, ptr %obj, align 16 + ret <4 x i32> %sroaval +} + +define <4 x float> @ptrLoadStoreTysFloat(ptr %init, float %val2) { +; CHECK-LABEL: @ptrLoadStoreTysFloat( +; CHECK-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8 +; CHECK-NEXT: [[OBJ:%.*]] = alloca <4 x float>, align 16 +; CHECK-NEXT: store <4 x float> zeroinitializer, ptr [[OBJ]], align 16 +; CHECK-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16 +; CHECK-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8 +; CHECK-NEXT: store float [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8 +; CHECK-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12 +; CHECK-NEXT: store float 1.310720e+05, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4 +; CHECK-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x float>, ptr [[OBJ]], align 16 +; CHECK-NEXT: ret <4 x float> [[OBJ_0_SROAVAL]] +; +; DEBUG-LABEL: @ptrLoadStoreTysFloat( +; DEBUG-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8, !dbg [[DBG509:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META504:![0-9]+]], metadata !DIExpression()), !dbg [[DBG509]] +; DEBUG-NEXT: [[OBJ:%.*]] = alloca <4 x float>, align 16, !dbg [[DBG510:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META505:![0-9]+]], metadata !DIExpression()), !dbg [[DBG510]] +; DEBUG-NEXT: store <4 x float> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG511:![0-9]+]] +; DEBUG-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG512:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META506:![0-9]+]], metadata !DIExpression()), !dbg [[DBG513:![0-9]+]] +; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG514:![0-9]+]] +; DEBUG-NEXT: store float [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG514]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META507:![0-9]+]], metadata !DIExpression()), !dbg [[DBG515:![0-9]+]] +; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr 
[[OBJ]], i64 12, !dbg [[DBG516:![0-9]+]] +; DEBUG-NEXT: store float 1.310720e+05, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG516]] +; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x float>, ptr [[OBJ]], align 16, !dbg [[DBG517:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x float> [[OBJ_0_SROAVAL]], metadata [[META508:![0-9]+]], metadata !DIExpression()), !dbg [[DBG517]] +; DEBUG-NEXT: ret <4 x float> [[OBJ_0_SROAVAL]], !dbg [[DBG518:![0-9]+]] +; + %val0 = load ptr, ptr %init, align 8 + %obj = alloca <4 x float>, align 16 + store <4 x float> zeroinitializer, ptr %obj, align 16 + store ptr %val0, ptr %obj, align 8 + %ptr2 = getelementptr inbounds i8, ptr %obj, i64 8 + store float %val2, ptr %ptr2, align 4 + %ptr3 = getelementptr inbounds i8, ptr %obj, i64 12 + store float 131072.0, ptr %ptr3, align 4 + %sroaval = load <4 x float>, ptr %obj, align 16 + ret <4 x float> %sroaval +} + +define <4 x i32> @ptrLoadStoreTysAS3(ptr %init, i32 %val2) { +; CHECK-LABEL: @ptrLoadStoreTysAS3( +; CHECK-NEXT: [[VAL0:%.*]] = load ptr addrspace(3), ptr [[INIT:%.*]], align 8 +; CHECK-NEXT: [[OBJ:%.*]] = alloca <4 x i32>, align 16 +; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[OBJ]], align 16 +; CHECK-NEXT: store ptr addrspace(3) [[VAL0]], ptr [[OBJ]], align 16 +; CHECK-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8 +; CHECK-NEXT: store i32 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8 +; CHECK-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12 +; CHECK-NEXT: store i32 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4 +; CHECK-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x i32>, ptr [[OBJ]], align 16 +; CHECK-NEXT: ret <4 x i32> [[OBJ_0_SROAVAL]] +; +; DEBUG-LABEL: @ptrLoadStoreTysAS3( +; DEBUG-NEXT: [[VAL0:%.*]] = load ptr addrspace(3), ptr [[INIT:%.*]], align 8, !dbg [[DBG526:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr addrspace(3) [[VAL0]], metadata [[META521:![0-9]+]], metadata !DIExpression()), !dbg [[DBG526]] +; DEBUG-NEXT: [[OBJ:%.*]] = alloca <4 x i32>, align 16, !dbg [[DBG527:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META522:![0-9]+]], metadata !DIExpression()), !dbg [[DBG527]] +; DEBUG-NEXT: store <4 x i32> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG528:![0-9]+]] +; DEBUG-NEXT: store ptr addrspace(3) [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG529:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META523:![0-9]+]], metadata !DIExpression()), !dbg [[DBG530:![0-9]+]] +; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG531:![0-9]+]] +; DEBUG-NEXT: store i32 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG531]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META524:![0-9]+]], metadata !DIExpression()), !dbg [[DBG532:![0-9]+]] +; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG533:![0-9]+]] +; DEBUG-NEXT: store i32 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG533]] +; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x i32>, ptr [[OBJ]], align 16, !dbg [[DBG534:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x i32> [[OBJ_0_SROAVAL]], metadata [[META525:![0-9]+]], metadata !DIExpression()), !dbg [[DBG534]] +; DEBUG-NEXT: ret <4 x i32> [[OBJ_0_SROAVAL]], !dbg [[DBG535:![0-9]+]] +; + %val0 = load ptr addrspace(3), ptr %init, align 8 + %obj = alloca <4 x i32>, 
align 16 + store <4 x i32> zeroinitializer, ptr %obj, align 16 + store ptr addrspace(3) %val0, ptr %obj, align 8 + %ptr2 = getelementptr inbounds i8, ptr %obj, i64 8 + store i32 %val2, ptr %ptr2, align 4 + %ptr3 = getelementptr inbounds i8, ptr %obj, i64 12 + store i32 131072, ptr %ptr3, align 4 + %sroaval = load <4 x i32>, ptr %obj, align 16 + ret <4 x i32> %sroaval +} + +define <4 x ptr> @ptrLoadStoreTysPtr(ptr %init, i64 %val2) { +; CHECK-LABEL: @ptrLoadStoreTysPtr( +; CHECK-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8 +; CHECK-NEXT: [[OBJ:%.*]] = alloca <4 x ptr>, align 16 +; CHECK-NEXT: store <4 x ptr> zeroinitializer, ptr [[OBJ]], align 16 +; CHECK-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16 +; CHECK-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8 +; CHECK-NEXT: store i64 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8 +; CHECK-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12 +; CHECK-NEXT: store i64 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4 +; CHECK-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x ptr>, ptr [[OBJ]], align 16 +; CHECK-NEXT: ret <4 x ptr> [[OBJ_0_SROAVAL]] +; +; DEBUG-LABEL: @ptrLoadStoreTysPtr( +; DEBUG-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8, !dbg [[DBG543:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META538:![0-9]+]], metadata !DIExpression()), !dbg [[DBG543]] +; DEBUG-NEXT: [[OBJ:%.*]] = alloca <4 x ptr>, align 16, !dbg [[DBG544:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META539:![0-9]+]], metadata !DIExpression()), !dbg [[DBG544]] +; DEBUG-NEXT: store <4 x ptr> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG545:![0-9]+]] +; DEBUG-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG546:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META540:![0-9]+]], metadata !DIExpression()), !dbg [[DBG547:![0-9]+]] +; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG548:![0-9]+]] +; DEBUG-NEXT: store i64 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG548]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META541:![0-9]+]], metadata !DIExpression()), !dbg [[DBG549:![0-9]+]] +; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG550:![0-9]+]] +; DEBUG-NEXT: store i64 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG550]] +; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x ptr>, ptr [[OBJ]], align 16, !dbg [[DBG551:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x ptr> [[OBJ_0_SROAVAL]], metadata [[META542:![0-9]+]], metadata !DIExpression()), !dbg [[DBG551]] +; DEBUG-NEXT: ret <4 x ptr> [[OBJ_0_SROAVAL]], !dbg [[DBG552:![0-9]+]] +; + %val0 = load ptr, ptr %init, align 8 + %obj = alloca <4 x ptr>, align 16 + store <4 x ptr> zeroinitializer, ptr %obj, align 16 + store ptr %val0, ptr %obj, align 8 + %ptr2 = getelementptr inbounds i32, ptr %obj, i64 2 + store i64 %val2, ptr %ptr2, align 4 + %ptr3 = getelementptr inbounds i32, ptr %obj, i64 3 + store i64 131072, ptr %ptr3, align 4 + %sroaval = load <4 x ptr>, ptr %obj, align 16 + ret <4 x ptr> %sroaval +} + declare void @llvm.memcpy.p0.p0.i64(ptr, ptr, i64, i1) declare void @llvm.lifetime.end.p0(i64, ptr) ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: From b1778c7d7b9ec07c075118546b85a7cc9741e1f1 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 23 Jan 2024 16:48:10 -0800 Subject: [PATCH 716/843] [AsmPrinter] Remove mbb-profile-dump flag (#76595) Now that the work embedding PGO information in SHT_LLVM_BB_ADDR_MAP ELF sections has landed, there is no longer a need to keep around the mbb-profile-dump flag. --- llvm/include/llvm/CodeGen/AsmPrinter.h | 4 -- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 51 ----------------- llvm/test/CodeGen/Generic/bb-profile-dump.ll | 59 -------------------- 3 files changed, 114 deletions(-) delete mode 100644 llvm/test/CodeGen/Generic/bb-profile-dump.ll diff --git a/llvm/include/llvm/CodeGen/AsmPrinter.h b/llvm/include/llvm/CodeGen/AsmPrinter.h index 5ec246ee7015c..fbd198a75a243 100644 --- a/llvm/include/llvm/CodeGen/AsmPrinter.h +++ b/llvm/include/llvm/CodeGen/AsmPrinter.h @@ -236,10 +236,6 @@ class AsmPrinter : public MachineFunctionPass { /// split stack prologue. bool HasNoSplitStack = false; - /// Raw FDOstream for outputting machine basic block frequncies if the - /// --mbb-profile-dump flag is set for downstream cost modelling applications - std::unique_ptr MBBProfileDumpFileOutput; - protected: explicit AsmPrinter(TargetMachine &TM, std::unique_ptr Streamer); diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 29da2b1c29f83..0d573562de969 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -134,13 +134,6 @@ using namespace llvm; #define DEBUG_TYPE "asm-printer" -static cl::opt BasicBlockProfileDump( - "mbb-profile-dump", cl::Hidden, - cl::desc("Basic block profile dump for external cost modelling. If " - "matching up BBs with afterwards, the compilation must be " - "performed with -basic-block-sections=labels. Enabling this " - "flag during in-process ThinLTO is not supported.")); - // This is a replication of fields of object::PGOAnalysisMap::Features. It // should match the order of the fields so that // `object::PGOAnalysisMap::Features::decode(PgoAnalysisMapFeatures.getBits())` @@ -646,16 +639,6 @@ bool AsmPrinter::doInitialization(Module &M) { HI.Handler->beginModule(&M); } - if (!BasicBlockProfileDump.empty()) { - std::error_code PossibleFileError; - MBBProfileDumpFileOutput = std::make_unique( - BasicBlockProfileDump, PossibleFileError); - if (PossibleFileError) { - M.getContext().emitError("Failed to open file for MBB Profile Dump: " + - PossibleFileError.message() + "\n"); - } - } - return false; } @@ -2026,40 +2009,6 @@ void AsmPrinter::emitFunctionBody() { OutStreamer->getCommentOS() << "-- End function\n"; OutStreamer->addBlankLine(); - - // Output MBB ids, function names, and frequencies if the flag to dump - // MBB profile information has been set - if (MBBProfileDumpFileOutput && !MF->empty() && - MF->getFunction().getEntryCount()) { - if (!MF->hasBBLabels()) { - MF->getContext().reportError( - SMLoc(), - "Unable to find BB labels for MBB profile dump. -mbb-profile-dump " - "must be called with -basic-block-sections=labels"); - } else { - MachineBlockFrequencyInfo &MBFI = - getAnalysis().getBFI(); - // The entry count and the entry basic block frequency aren't the same. We - // want to capture "absolute" frequencies, i.e. the frequency with which a - // MBB is executed when the program is executed. From there, we can derive - // Function-relative frequencies (divide by the value for the first MBB). 
- // We also have the information about frequency with which functions - // were called. This helps, for example, in a type of integration tests - // where we want to cross-validate the compiler's profile with a real - // profile. - // Using double precision because uint64 values used to encode mbb - // "frequencies" may be quite large. - const double EntryCount = - static_cast(MF->getFunction().getEntryCount()->getCount()); - for (const auto &MBB : *MF) { - const double MBBRelFreq = MBFI.getBlockFreqRelativeToEntryBlock(&MBB); - const double AbsMBBFreq = MBBRelFreq * EntryCount; - *MBBProfileDumpFileOutput.get() - << MF->getName() << "," << MBB.getBBID()->BaseID << "," - << AbsMBBFreq << "\n"; - } - } - } } /// Compute the number of Global Variables that uses a Constant. diff --git a/llvm/test/CodeGen/Generic/bb-profile-dump.ll b/llvm/test/CodeGen/Generic/bb-profile-dump.ll deleted file mode 100644 index 7391a6ee6f912..0000000000000 --- a/llvm/test/CodeGen/Generic/bb-profile-dump.ll +++ /dev/null @@ -1,59 +0,0 @@ -; REQUIRES: x86-registered-target -; -; Check that the basic block profile dump outputs data and in the correct -; format. -; -; RUN: llc -mtriple=x86_64-linux-unknown -o /dev/null -basic-block-sections=labels -mbb-profile-dump=- %s | FileCheck %s - -; Check that given a simple case, we can return the default MBFI - -define i64 @f2(i64 %a, i64 %b) !prof !1{ - %sum = add i64 %a, %b - ret i64 %sum -} - -; CHECK: f2,0,1.000000e+03 - -define i64 @f1() !prof !2{ - %sum = call i64 @f2(i64 2, i64 2) - %isEqual = icmp eq i64 %sum, 4 - br i1 %isEqual, label %ifEqual, label %ifNotEqual, !prof !3 -ifEqual: - ret i64 0 -ifNotEqual: - ret i64 %sum -} - -; CHECK-NEXT: f1,0,1.000000e+01 -; CHECK-NEXT: f1,2,6.000000e+00 -; CHECK-NEXT: f1,1,4.000000e+00 - -define void @f3(i32 %iter) !prof !4 { -entry: - br label %loop -loop: - %i = phi i32 [0, %entry], [%i_next, %loop] - %i_next = add i32 %i, 1 - %exit_cond = icmp slt i32 %i_next, %iter - br i1 %exit_cond, label %loop, label %exit, !prof !5 -exit: - ret void -} - -; CHECK-NEXT: f3,0,2.000000e+00 -; CHECK-NEXT: f3,1,2.002000e+03 -; CHECK-NEXT: f3,2,2.000000e+00 - -!1 = !{!"function_entry_count", i64 1000} -!2 = !{!"function_entry_count", i64 10} -!3 = !{!"branch_weights", i32 2, i32 3} -!4 = !{!"function_entry_count", i64 2} -!5 = !{!"branch_weights", i32 1000, i32 1} - -; Check that if we pass -mbb-profile-dump but don't set -basic-block-sections, -; we get an appropriate error message - -; RUN: not llc -mtriple=x86_64-linux-unknown -o /dev/null -mbb-profile-dump=- %s 2>&1 | FileCheck --check-prefix=NO-SECTIONS %s - -; NO-SECTIONS: :0: error: Unable to find BB labels for MBB profile dump. -mbb-profile-dump must be called with -basic-block-sections=labels - From c41472dbafd0dcacd943a95a9a099c1942d50394 Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 23 Jan 2024 16:50:10 -0800 Subject: [PATCH 717/843] [Github] Only run libclang-python-tests on monorepo main The libclang python binding test CI job currently doesn't have any restrictions on what branches it will run on when something is pushed and also isn't restricted to the monorepo. This patch adds a branch restriction for the push event, only running the CI job when something is pushed to the main branch (and the path filter is met), and also adds a filter to ensure that the job comes from a PR against the monorepo or a push to a branch in the monorepo. 
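For reference, a minimal sketch of the resulting trigger configuration (the
exact hunks are in the diff below; only part of the paths filter is shown):

  on:
    push:
      branches:
        - 'main'
      paths:
        - 'clang/bindings/python/**'
  jobs:
    check-clang-python:
      if: github.repository == 'llvm/llvm-project'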
--- .github/workflows/libclang-python-tests.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/libclang-python-tests.yml b/.github/workflows/libclang-python-tests.yml index e12acbc0f6ce8..497f6ca5c5547 100644 --- a/.github/workflows/libclang-python-tests.yml +++ b/.github/workflows/libclang-python-tests.yml @@ -6,6 +6,8 @@ permissions: on: workflow_dispatch: push: + branches: + - 'main' paths: - 'clang/bindings/python/**' - 'clang/tools/libclang/**' @@ -30,6 +32,7 @@ jobs: check-clang-python: # Build libclang and then run the libclang Python binding's unit tests. name: Build and run Python unit tests + if: github.repository == 'llvm/llvm-project' strategy: fail-fast: false matrix: From c51ab483e6c2d991a01179584705b83fbea1940d Mon Sep 17 00:00:00 2001 From: Jinyang He Date: Wed, 24 Jan 2024 09:17:49 +0800 Subject: [PATCH 718/843] [LoongArch] Insert nops and emit align reloc when handle alignment directive (#72962) Refer to RISCV, we will fix up the alignment if linker relaxation changes code size and breaks alignment. Insert enough Nops and emit R_LARCH_ALIGN relocation type so that linker could satisfy the alignment by removing Nops. It does so only in sections with the SHF_EXECINSTR flag. In LoongArch psABI v2.30, R_LARCH_ALIGN requires symbol index. The lowest 8 bits of addend represent alignment and the other bits of addend represent the maximum number of bytes to emit. --- .../MCTargetDesc/LoongArchAsmBackend.cpp | 67 ++++++++++++++++ .../MCTargetDesc/LoongArchAsmBackend.h | 15 ++++ .../MCTargetDesc/LoongArchFixupKinds.h | 2 + .../Relocations/align-non-executable.s | 27 +++++++ .../MC/LoongArch/Relocations/relax-addsub.s | 15 +++- .../MC/LoongArch/Relocations/relax-align.s | 79 +++++++++++++++++++ 6 files changed, 203 insertions(+), 2 deletions(-) create mode 100644 llvm/test/MC/LoongArch/Relocations/align-non-executable.s create mode 100644 llvm/test/MC/LoongArch/Relocations/relax-align.s diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp index a6630d1adb08a..de492f2b1f0a4 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.cpp @@ -17,9 +17,12 @@ #include "llvm/MC/MCAssembler.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCELFObjectWriter.h" +#include "llvm/MC/MCExpr.h" +#include "llvm/MC/MCSection.h" #include "llvm/MC/MCValue.h" #include "llvm/Support/EndianStream.h" #include "llvm/Support/LEB128.h" +#include "llvm/Support/MathExtras.h" #define DEBUG_TYPE "loongarch-asmbackend" @@ -176,6 +179,70 @@ void LoongArchAsmBackend::applyFixup(const MCAssembler &Asm, } } +// Linker relaxation may change code size. We have to insert Nops +// for .align directive when linker relaxation enabled. So then Linker +// could satisfy alignment by removing Nops. +// The function returns the total Nops Size we need to insert. +bool LoongArchAsmBackend::shouldInsertExtraNopBytesForCodeAlign( + const MCAlignFragment &AF, unsigned &Size) { + // Calculate Nops Size only when linker relaxation enabled. + if (!AF.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax)) + return false; + + // Ignore alignment if MaxBytesToEmit is less than the minimum Nop size. 
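+  // Note that every LoongArch instruction, nop included, is 4 bytes wide,
+  // so an alignment of 4 or less never needs removable padding, and a 2^N
+  // alignment needs at most (2^N - 4) bytes of nops.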
+ const unsigned MinNopLen = 4; + if (AF.getMaxBytesToEmit() < MinNopLen) + return false; + Size = AF.getAlignment().value() - MinNopLen; + return AF.getAlignment() > MinNopLen; +} + +// We need to insert R_LARCH_ALIGN relocation type to indicate the +// position of Nops and the total bytes of the Nops have been inserted +// when linker relaxation enabled. +// The function inserts fixup_loongarch_align fixup which eventually will +// transfer to R_LARCH_ALIGN relocation type. +// The improved R_LARCH_ALIGN requires symbol index. The lowest 8 bits of +// addend represent alignment and the other bits of addend represent the +// maximum number of bytes to emit. The maximum number of bytes is zero +// means ignore the emit limit. +bool LoongArchAsmBackend::shouldInsertFixupForCodeAlign( + MCAssembler &Asm, const MCAsmLayout &Layout, MCAlignFragment &AF) { + // Insert the fixup only when linker relaxation enabled. + if (!AF.getSubtargetInfo()->hasFeature(LoongArch::FeatureRelax)) + return false; + + // Calculate total Nops we need to insert. If there are none to insert + // then simply return. + unsigned Count; + if (!shouldInsertExtraNopBytesForCodeAlign(AF, Count)) + return false; + + MCSection *Sec = AF.getParent(); + MCContext &Ctx = Asm.getContext(); + const MCExpr *Dummy = MCConstantExpr::create(0, Ctx); + // Create fixup_loongarch_align fixup. + MCFixup Fixup = + MCFixup::create(0, Dummy, MCFixupKind(LoongArch::fixup_loongarch_align)); + const MCSymbolRefExpr *MCSym = getSecToAlignSym()[Sec]; + if (MCSym == nullptr) { + // Create a symbol and make the value of symbol is zero. + MCSymbol *Sym = Ctx.createNamedTempSymbol("la-relax-align"); + Sym->setFragment(&*Sec->getBeginSymbol()->getFragment()); + Asm.registerSymbol(*Sym); + MCSym = MCSymbolRefExpr::create(Sym, Ctx); + getSecToAlignSym()[Sec] = MCSym; + } + + uint64_t FixedValue = 0; + unsigned Lo = Log2_64(Count) + 1; + unsigned Hi = AF.getMaxBytesToEmit() >= Count ? 0 : AF.getMaxBytesToEmit(); + MCValue Value = MCValue::get(MCSym, nullptr, Hi << 8 | Lo); + Asm.getWriter().recordRelocation(Asm, Layout, &AF, Fixup, Value, FixedValue); + + return true; +} + bool LoongArchAsmBackend::shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h index 518766d38f7ef..9d81781fca190 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchAsmBackend.h @@ -17,7 +17,9 @@ #include "MCTargetDesc/LoongArchFixupKinds.h" #include "MCTargetDesc/LoongArchMCTargetDesc.h" #include "llvm/MC/MCAsmBackend.h" +#include "llvm/MC/MCExpr.h" #include "llvm/MC/MCFixupKindInfo.h" +#include "llvm/MC/MCSection.h" #include "llvm/MC/MCSubtargetInfo.h" namespace llvm { @@ -27,6 +29,7 @@ class LoongArchAsmBackend : public MCAsmBackend { uint8_t OSABI; bool Is64Bit; const MCTargetOptions &TargetOptions; + DenseMap SecToAlignSym; public: LoongArchAsmBackend(const MCSubtargetInfo &STI, uint8_t OSABI, bool Is64Bit, @@ -45,6 +48,15 @@ class LoongArchAsmBackend : public MCAsmBackend { uint64_t Value, bool IsResolved, const MCSubtargetInfo *STI) const override; + // Return Size with extra Nop Bytes for alignment directive in code section. + bool shouldInsertExtraNopBytesForCodeAlign(const MCAlignFragment &AF, + unsigned &Size) override; + + // Insert target specific fixup type for alignment directive in code section. 
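+  // The fixup is later converted to an R_LARCH_ALIGN relocation whose addend
+  // encodes the alignment in the lowest 8 bits and the maximum number of
+  // bytes to emit in the remaining bits (zero meaning no emit limit).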
+ bool shouldInsertFixupForCodeAlign(MCAssembler &Asm, + const MCAsmLayout &Layout, + MCAlignFragment &AF) override; + bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup, const MCValue &Target, const MCSubtargetInfo *STI) override; @@ -80,6 +92,9 @@ class LoongArchAsmBackend : public MCAsmBackend { std::unique_ptr createObjectTargetWriter() const override; const MCTargetOptions &getTargetOptions() const { return TargetOptions; } + DenseMap &getSecToAlignSym() { + return SecToAlignSym; + } }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h index e827bae1f3e37..0d19d2b0fb1fe 100644 --- a/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h +++ b/llvm/lib/Target/LoongArch/MCTargetDesc/LoongArchFixupKinds.h @@ -109,6 +109,8 @@ enum Fixups { fixup_loongarch_tls_gd_hi20, // Generate an R_LARCH_RELAX which indicates the linker may relax here. fixup_loongarch_relax = FirstLiteralRelocationKind + ELF::R_LARCH_RELAX, + // Generate an R_LARCH_ALIGN which indicates the linker may fixup align here. + fixup_loongarch_align = FirstLiteralRelocationKind + ELF::R_LARCH_ALIGN, // 36-bit fixup corresponding to %call36(foo) for a pair instructions: // pcaddu18i+jirl. fixup_loongarch_call36 = FirstLiteralRelocationKind + ELF::R_LARCH_CALL36, diff --git a/llvm/test/MC/LoongArch/Relocations/align-non-executable.s b/llvm/test/MC/LoongArch/Relocations/align-non-executable.s new file mode 100644 index 0000000000000..47834acd9521f --- /dev/null +++ b/llvm/test/MC/LoongArch/Relocations/align-non-executable.s @@ -0,0 +1,27 @@ +## A label difference separated by an alignment directive, when the +## referenced symbols are in a non-executable section with instructions, +## should generate ADD/SUB relocations. 
+## https://github.com/llvm/llvm-project/pull/76552 + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s \ +# RUN: | llvm-readobj -r - | FileCheck --check-prefixes=CHECK,RELAX %s +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s \ +# RUN: | llvm-readobj -r - | FileCheck %s + +.section ".dummy", "a" +.L1: + la.pcrel $t0, sym +.p2align 3 +.L2: +.dword .L2 - .L1 + +# CHECK: Relocations [ +# CHECK-NEXT: Section ({{.*}}) .rela.dummy { +# CHECK-NEXT: 0x0 R_LARCH_PCALA_HI20 sym 0x0 +# RELAX-NEXT: 0x0 R_LARCH_RELAX - 0x0 +# CHECK-NEXT: 0x4 R_LARCH_PCALA_LO12 sym 0x0 +# RELAX-NEXT: 0x4 R_LARCH_RELAX - 0x0 +# RELAX-NEXT: 0x8 R_LARCH_ADD64 .L2 0x0 +# RELAX-NEXT: 0x8 R_LARCH_SUB64 .L1 0x0 +# CHECK-NEXT: } +# CHECK-NEXT: ] diff --git a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s index cd01332afd0be..18e0ede5e2937 100644 --- a/llvm/test/MC/LoongArch/Relocations/relax-addsub.s +++ b/llvm/test/MC/LoongArch/Relocations/relax-addsub.s @@ -28,12 +28,23 @@ # RELAX: Relocations [ # RELAX-NEXT: Section ({{.*}}) .rela.text { +# RELAX-NEXT: 0x4 R_LARCH_ALIGN {{.*}} 0x4 # RELAX-NEXT: 0x10 R_LARCH_PCALA_HI20 .L1 0x0 # RELAX-NEXT: 0x10 R_LARCH_RELAX - 0x0 # RELAX-NEXT: 0x14 R_LARCH_PCALA_LO12 .L1 0x0 # RELAX-NEXT: 0x14 R_LARCH_RELAX - 0x0 # RELAX-NEXT: } # RELAX-NEXT: Section ({{.*}}) .rela.data { +# RELAX-NEXT: 0x10 R_LARCH_ADD8 .L3 0x0 +# RELAX-NEXT: 0x10 R_LARCH_SUB8 .L2 0x0 +# RELAX-NEXT: 0x11 R_LARCH_ADD16 .L3 0x0 +# RELAX-NEXT: 0x11 R_LARCH_SUB16 .L2 0x0 +# RELAX-NEXT: 0x13 R_LARCH_ADD32 .L3 0x0 +# RELAX-NEXT: 0x13 R_LARCH_SUB32 .L2 0x0 +# RELAX-NEXT: 0x17 R_LARCH_ADD64 .L3 0x0 +# RELAX-NEXT: 0x17 R_LARCH_SUB64 .L2 0x0 +# RELAX-NEXT: 0x1F R_LARCH_ADD_ULEB128 .L3 0x0 +# RELAX-NEXT: 0x1F R_LARCH_SUB_ULEB128 .L2 0x0 # RELAX-NEXT: 0x20 R_LARCH_ADD8 .L4 0x0 # RELAX-NEXT: 0x20 R_LARCH_SUB8 .L3 0x0 # RELAX-NEXT: 0x21 R_LARCH_ADD16 .L4 0x0 @@ -57,7 +68,7 @@ # RELAX: Hex dump of section '.data': # RELAX-NEXT: 0x00000000 04040004 00000004 00000000 00000004 -# RELAX-NEXT: 0x00000010 0c0c000c 0000000c 00000000 0000000c +# RELAX-NEXT: 0x00000010 00000000 00000000 00000000 00000000 # RELAX-NEXT: 0x00000020 00000000 00000000 00000000 00000000 # RELAX-NEXT: 0x00000030 00000000 00000000 00000000 000000 @@ -78,7 +89,7 @@ .word .L2 - .L1 .dword .L2 - .L1 .uleb128 .L2 - .L1 -## TODO Handle alignment directive. +## With relaxation, emit relocs because the .align makes the diff variable. .byte .L3 - .L2 .short .L3 - .L2 .word .L3 - .L2 diff --git a/llvm/test/MC/LoongArch/Relocations/relax-align.s b/llvm/test/MC/LoongArch/Relocations/relax-align.s new file mode 100644 index 0000000000000..294fd9fb916c7 --- /dev/null +++ b/llvm/test/MC/LoongArch/Relocations/relax-align.s @@ -0,0 +1,79 @@ +## The file testing Nop insertion with R_LARCH_ALIGN for relaxation. + +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=-relax %s -o %t +# RUN: llvm-objdump -d %t | FileCheck %s --check-prefix=INSTR +# RUN: llvm-readobj -r %t | FileCheck %s --check-prefix=RELOC +# RUN: llvm-mc --filetype=obj --triple=loongarch64 --mattr=+relax %s -o %t.r +# RUN: llvm-objdump -d %t.r | FileCheck %s --check-prefixes=INSTR,RELAX-INSTR +# RUN: llvm-readobj -r %t.r | FileCheck %s --check-prefixes=RELOC,RELAX-RELOC + +.text +break 0 +# INSTR: break 0 + +## Not emit R_LARCH_ALIGN if alignment directive is less than or equal to +## minimum code alignment(a.k.a 4). +.p2align 2 +.p2align 1 +.p2align 0 + +## Not emit instructions if max emit bytes less than min nop size. 
+.p2align 4, , 2 + +## Not emit R_LARCH_ALIGN if alignment directive with specific padding value. +## The behavior is the same as GNU assembler. +break 1 +.p2align 4, 1 +# INSTR-NEXT: break 1 +# INSTR-COUNT-2: 01 01 01 01 + +break 2 +.p2align 4, 1, 12 +# INSTR-NEXT: break 2 +# INSTR-COUNT-3: 01 01 01 01 + +break 3 +.p2align 4 +# INSTR-NEXT: break 3 +# INSTR-COUNT-3: nop + +break 4 +.p2align 5 +.p2align 4 +# INSTR-NEXT: break 4 +# INSTR-COUNT-3: nop +# RELAX-INSTR-COUNT-7: nop + +break 5 +.p2align 4, , 11 +# INSTR-NEXT: break 5 +# RELAX-INSTR-COUNT-3: nop + +break 6 +## Not emit the third parameter. +.p2align 4, , 12 +# INSTR-NEXT: break 6 +# INSTR-NEXT: nop +# INSTR-NEXT: nop +# RELAX-INSTR-NEXT: nop + +ret +# INSNR-NEXT: ret + +## Test the symbol index is different from .text. +.section .text2, "ax" +.p2align 4 +break 7 + +# RELOC: Relocations [ +# RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text { +# RELAX-RELOC-NEXT: 0x24 R_LARCH_ALIGN .Lla-relax-align0 0x4 +# RELAX-RELOC-NEXT: 0x34 R_LARCH_ALIGN .Lla-relax-align0 0x5 +# RELAX-RELOC-NEXT: 0x50 R_LARCH_ALIGN .Lla-relax-align0 0x4 +# RELAX-RELOC-NEXT: 0x60 R_LARCH_ALIGN .Lla-relax-align0 0xB04 +# RELAX-RELOC-NEXT: 0x70 R_LARCH_ALIGN .Lla-relax-align0 0x4 +# RELAX-RELOC-NEXT: } +# RELAX-RELOC-NEXT: Section ({{.*}}) .rela.text2 { +# RELAX-RELOC-NEXT: 0x0 R_LARCH_ALIGN .Lla-relax-align1 0x4 +# RELAX-RELOC-NEXT: } +# RELOC-NEXT: ] From f709fbb1bb5e6240aad4edeb2f0e417df74cfa27 Mon Sep 17 00:00:00 2001 From: Jeffrey Byrnes Date: Tue, 23 Jan 2024 17:22:49 -0800 Subject: [PATCH 719/843] [SROA] Only try additional vector type candidates when needed (#77678) https://github.com/llvm/llvm-project/commit/f9c2a341b94ca71508dcefa109ece843459f7f13 causes regressions when we have a slice with integer vector type that is the same size as the partition, and a ptr load/store slice that is not the size of the element type. Ref `vector-promotion.ll:ptrLoadStoreTys`. Before the patch, we would only consider `<4 x i32>` as a candidate type for vector promotion, and would find that it is a viable type for all the slices. After the patch, we now add `<2 x ptr>` as a candidate type due to slice with user `store ptr %val0, ptr %obj, align 8` -- and flag that we `HaveVecPtrTy`. The pre-existing behavior of this flag results in removing the viable `<4 x i32>` and keeping only the unviable `<2 x ptr>`, which results in a failure to promote. The end result is failing to promote an alloca that was previously promoted -- this does not appear to be the intent of that patch, which has the goal of increasing promotions by providing more promotion opportunities. 
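In schematic form, the reordering at the heart of the fix looks like
this (a condensed sketch of `isVectorPromotionViable`, not the verbatim
SROA.cpp code):

  // First try only the slice types whose size matches the partition,
  // i.e. the candidate set considered before f9c2a341.
  if (VectorType *VTy = checkVectorTypesForPromotion(P, DL, CandidateTys,
                                                     /*...flags...*/))
    return VTy;
  // Only if nothing was viable, widen CandidateTys with the types
  // derived from LoadStoreTys and check again.
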
This PR preserves this behavior via a simple reorganization of the implemention: try first the slice types with same size as the partition, then, if there is no promotable type, try the `LoadStoreTys.` --- llvm/lib/Transforms/Scalar/SROA.cpp | 7 + llvm/test/Transforms/SROA/vector-promotion.ll | 143 +++++++++--------- 2 files changed, 75 insertions(+), 75 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp index 10c25e2a03229..bdbaf4f55c96d 100644 --- a/llvm/lib/Transforms/Scalar/SROA.cpp +++ b/llvm/lib/Transforms/Scalar/SROA.cpp @@ -2319,6 +2319,12 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { if (S.beginOffset() == P.beginOffset() && S.endOffset() == P.endOffset()) CheckCandidateType(Ty); } + + if (auto *VTy = checkVectorTypesForPromotion( + P, DL, CandidateTys, HaveCommonEltTy, CommonEltTy, HaveVecPtrTy, + HaveCommonVecPtrTy, CommonVecPtrTy)) + return VTy; + // Consider additional vector types where the element type size is a // multiple of load/store element size. for (Type *Ty : LoadStoreTys) { @@ -2328,6 +2334,7 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) { // Make a copy of CandidateTys and iterate through it, because we might // append to CandidateTys in the loop. SmallVector CandidateTysCopy = CandidateTys; + CandidateTys.clear(); for (VectorType *&VTy : CandidateTysCopy) { unsigned VectorSize = DL.getTypeSizeInBits(VTy).getFixedValue(); unsigned ElementSize = diff --git a/llvm/test/Transforms/SROA/vector-promotion.ll b/llvm/test/Transforms/SROA/vector-promotion.ll index 06914aa8fe167..ee35a0fd5fea6 100644 --- a/llvm/test/Transforms/SROA/vector-promotion.ll +++ b/llvm/test/Transforms/SROA/vector-promotion.ll @@ -1227,35 +1227,32 @@ define void @swap-15bytes(ptr %x, ptr %y) { ret void } + define <4 x i32> @ptrLoadStoreTys(ptr %init, i32 %val2) { ; CHECK-LABEL: @ptrLoadStoreTys( ; CHECK-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8 -; CHECK-NEXT: [[OBJ:%.*]] = alloca <4 x i32>, align 16 -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[OBJ]], align 16 -; CHECK-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16 -; CHECK-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8 -; CHECK-NEXT: store i32 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8 -; CHECK-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12 -; CHECK-NEXT: store i32 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4 -; CHECK-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x i32>, ptr [[OBJ]], align 16 -; CHECK-NEXT: ret <4 x i32> [[OBJ_0_SROAVAL]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VAL0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[OBJ_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[OBJ_0_VECBLEND:%.*]] = select <4 x i1> , <4 x i32> [[OBJ_0_VEC_EXPAND]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[OBJ_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_0_VECBLEND]], i32 [[VAL2:%.*]], i32 2 +; CHECK-NEXT: [[OBJ_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_8_VEC_INSERT]], i32 131072, i32 3 +; CHECK-NEXT: ret <4 x i32> [[OBJ_12_VEC_INSERT]] ; ; DEBUG-LABEL: @ptrLoadStoreTys( ; DEBUG-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8, !dbg [[DBG492:![0-9]+]] ; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META487:![0-9]+]], metadata !DIExpression()), !dbg [[DBG492]] -; DEBUG-NEXT: [[OBJ:%.*]] = alloca <4 
x i32>, align 16, !dbg [[DBG493:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META488:![0-9]+]], metadata !DIExpression()), !dbg [[DBG493]] -; DEBUG-NEXT: store <4 x i32> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG494:![0-9]+]] -; DEBUG-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG495:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META489:![0-9]+]], metadata !DIExpression()), !dbg [[DBG496:![0-9]+]] -; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG497:![0-9]+]] -; DEBUG-NEXT: store i32 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG497]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META490:![0-9]+]], metadata !DIExpression()), !dbg [[DBG498:![0-9]+]] -; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG499:![0-9]+]] -; DEBUG-NEXT: store i32 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG499]] -; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x i32>, ptr [[OBJ]], align 16, !dbg [[DBG500:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x i32> [[OBJ_0_SROAVAL]], metadata [[META491:![0-9]+]], metadata !DIExpression()), !dbg [[DBG500]] -; DEBUG-NEXT: ret <4 x i32> [[OBJ_0_SROAVAL]], !dbg [[DBG501:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META488:![0-9]+]], metadata !DIExpression()), !dbg [[DBG493:![0-9]+]] +; DEBUG-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[VAL0]] to i64, !dbg [[DBG494:![0-9]+]] +; DEBUG-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>, !dbg [[DBG494]] +; DEBUG-NEXT: [[OBJ_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> , !dbg [[DBG494]] +; DEBUG-NEXT: [[OBJ_0_VECBLEND:%.*]] = select <4 x i1> , <4 x i32> [[OBJ_0_VEC_EXPAND]], <4 x i32> zeroinitializer, !dbg [[DBG494]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META489:![0-9]+]], metadata !DIExpression()), !dbg [[DBG495:![0-9]+]] +; DEBUG-NEXT: [[OBJ_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_0_VECBLEND]], i32 [[VAL2:%.*]], i32 2, !dbg [[DBG496:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META490:![0-9]+]], metadata !DIExpression()), !dbg [[DBG497:![0-9]+]] +; DEBUG-NEXT: [[OBJ_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_8_VEC_INSERT]], i32 131072, i32 3, !dbg [[DBG498:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x i32> [[OBJ_12_VEC_INSERT]], metadata [[META491:![0-9]+]], metadata !DIExpression()), !dbg [[DBG499:![0-9]+]] +; DEBUG-NEXT: ret <4 x i32> [[OBJ_12_VEC_INSERT]], !dbg [[DBG500:![0-9]+]] ; %val0 = load ptr, ptr %init, align 8 %obj = alloca <4 x i32>, align 16 @@ -1283,21 +1280,21 @@ define <4 x float> @ptrLoadStoreTysFloat(ptr %init, float %val2) { ; CHECK-NEXT: ret <4 x float> [[OBJ_0_SROAVAL]] ; ; DEBUG-LABEL: @ptrLoadStoreTysFloat( -; DEBUG-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8, !dbg [[DBG509:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META504:![0-9]+]], metadata !DIExpression()), !dbg [[DBG509]] -; DEBUG-NEXT: [[OBJ:%.*]] = alloca <4 x float>, align 16, !dbg [[DBG510:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META505:![0-9]+]], metadata !DIExpression()), !dbg [[DBG510]] -; DEBUG-NEXT: store <4 x float> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG511:![0-9]+]] -; DEBUG-NEXT: store ptr 
[[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG512:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META506:![0-9]+]], metadata !DIExpression()), !dbg [[DBG513:![0-9]+]] -; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG514:![0-9]+]] -; DEBUG-NEXT: store float [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG514]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META507:![0-9]+]], metadata !DIExpression()), !dbg [[DBG515:![0-9]+]] -; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG516:![0-9]+]] -; DEBUG-NEXT: store float 1.310720e+05, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG516]] -; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x float>, ptr [[OBJ]], align 16, !dbg [[DBG517:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x float> [[OBJ_0_SROAVAL]], metadata [[META508:![0-9]+]], metadata !DIExpression()), !dbg [[DBG517]] -; DEBUG-NEXT: ret <4 x float> [[OBJ_0_SROAVAL]], !dbg [[DBG518:![0-9]+]] +; DEBUG-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8, !dbg [[DBG508:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META503:![0-9]+]], metadata !DIExpression()), !dbg [[DBG508]] +; DEBUG-NEXT: [[OBJ:%.*]] = alloca <4 x float>, align 16, !dbg [[DBG509:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META504:![0-9]+]], metadata !DIExpression()), !dbg [[DBG509]] +; DEBUG-NEXT: store <4 x float> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG510:![0-9]+]] +; DEBUG-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG511:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META505:![0-9]+]], metadata !DIExpression()), !dbg [[DBG512:![0-9]+]] +; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG513:![0-9]+]] +; DEBUG-NEXT: store float [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG513]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META506:![0-9]+]], metadata !DIExpression()), !dbg [[DBG514:![0-9]+]] +; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG515:![0-9]+]] +; DEBUG-NEXT: store float 1.310720e+05, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG515]] +; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x float>, ptr [[OBJ]], align 16, !dbg [[DBG516:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x float> [[OBJ_0_SROAVAL]], metadata [[META507:![0-9]+]], metadata !DIExpression()), !dbg [[DBG516]] +; DEBUG-NEXT: ret <4 x float> [[OBJ_0_SROAVAL]], !dbg [[DBG517:![0-9]+]] ; %val0 = load ptr, ptr %init, align 8 %obj = alloca <4 x float>, align 16 @@ -1314,32 +1311,28 @@ define <4 x float> @ptrLoadStoreTysFloat(ptr %init, float %val2) { define <4 x i32> @ptrLoadStoreTysAS3(ptr %init, i32 %val2) { ; CHECK-LABEL: @ptrLoadStoreTysAS3( ; CHECK-NEXT: [[VAL0:%.*]] = load ptr addrspace(3), ptr [[INIT:%.*]], align 8 -; CHECK-NEXT: [[OBJ:%.*]] = alloca <4 x i32>, align 16 -; CHECK-NEXT: store <4 x i32> zeroinitializer, ptr [[OBJ]], align 16 -; CHECK-NEXT: store ptr addrspace(3) [[VAL0]], ptr [[OBJ]], align 16 -; CHECK-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8 -; CHECK-NEXT: store i32 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8 -; CHECK-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], 
i64 12 -; CHECK-NEXT: store i32 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4 -; CHECK-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x i32>, ptr [[OBJ]], align 16 -; CHECK-NEXT: ret <4 x i32> [[OBJ_0_SROAVAL]] +; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[VAL0]] to i64 +; CHECK-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32> +; CHECK-NEXT: [[OBJ_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> +; CHECK-NEXT: [[OBJ_0_VECBLEND:%.*]] = select <4 x i1> , <4 x i32> [[OBJ_0_VEC_EXPAND]], <4 x i32> zeroinitializer +; CHECK-NEXT: [[OBJ_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_0_VECBLEND]], i32 [[VAL2:%.*]], i32 2 +; CHECK-NEXT: [[OBJ_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_8_VEC_INSERT]], i32 131072, i32 3 +; CHECK-NEXT: ret <4 x i32> [[OBJ_12_VEC_INSERT]] ; ; DEBUG-LABEL: @ptrLoadStoreTysAS3( -; DEBUG-NEXT: [[VAL0:%.*]] = load ptr addrspace(3), ptr [[INIT:%.*]], align 8, !dbg [[DBG526:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr addrspace(3) [[VAL0]], metadata [[META521:![0-9]+]], metadata !DIExpression()), !dbg [[DBG526]] -; DEBUG-NEXT: [[OBJ:%.*]] = alloca <4 x i32>, align 16, !dbg [[DBG527:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META522:![0-9]+]], metadata !DIExpression()), !dbg [[DBG527]] -; DEBUG-NEXT: store <4 x i32> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG528:![0-9]+]] -; DEBUG-NEXT: store ptr addrspace(3) [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG529:![0-9]+]] +; DEBUG-NEXT: [[VAL0:%.*]] = load ptr addrspace(3), ptr [[INIT:%.*]], align 8, !dbg [[DBG525:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr addrspace(3) [[VAL0]], metadata [[META520:![0-9]+]], metadata !DIExpression()), !dbg [[DBG525]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META521:![0-9]+]], metadata !DIExpression()), !dbg [[DBG526:![0-9]+]] +; DEBUG-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(3) [[VAL0]] to i64, !dbg [[DBG527:![0-9]+]] +; DEBUG-NEXT: [[TMP2:%.*]] = bitcast i64 [[TMP1]] to <2 x i32>, !dbg [[DBG527]] +; DEBUG-NEXT: [[OBJ_0_VEC_EXPAND:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> poison, <4 x i32> , !dbg [[DBG527]] +; DEBUG-NEXT: [[OBJ_0_VECBLEND:%.*]] = select <4 x i1> , <4 x i32> [[OBJ_0_VEC_EXPAND]], <4 x i32> zeroinitializer, !dbg [[DBG527]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META522:![0-9]+]], metadata !DIExpression()), !dbg [[DBG528:![0-9]+]] +; DEBUG-NEXT: [[OBJ_8_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_0_VECBLEND]], i32 [[VAL2:%.*]], i32 2, !dbg [[DBG529:![0-9]+]] ; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META523:![0-9]+]], metadata !DIExpression()), !dbg [[DBG530:![0-9]+]] -; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG531:![0-9]+]] -; DEBUG-NEXT: store i32 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG531]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META524:![0-9]+]], metadata !DIExpression()), !dbg [[DBG532:![0-9]+]] -; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG533:![0-9]+]] -; DEBUG-NEXT: store i32 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG533]] -; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x i32>, ptr [[OBJ]], align 16, !dbg [[DBG534:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x i32> [[OBJ_0_SROAVAL]], metadata [[META525:![0-9]+]], metadata 
!DIExpression()), !dbg [[DBG534]] -; DEBUG-NEXT: ret <4 x i32> [[OBJ_0_SROAVAL]], !dbg [[DBG535:![0-9]+]] +; DEBUG-NEXT: [[OBJ_12_VEC_INSERT:%.*]] = insertelement <4 x i32> [[OBJ_8_VEC_INSERT]], i32 131072, i32 3, !dbg [[DBG531:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x i32> [[OBJ_12_VEC_INSERT]], metadata [[META524:![0-9]+]], metadata !DIExpression()), !dbg [[DBG532:![0-9]+]] +; DEBUG-NEXT: ret <4 x i32> [[OBJ_12_VEC_INSERT]], !dbg [[DBG533:![0-9]+]] ; %val0 = load ptr addrspace(3), ptr %init, align 8 %obj = alloca <4 x i32>, align 16 @@ -1367,21 +1360,21 @@ define <4 x ptr> @ptrLoadStoreTysPtr(ptr %init, i64 %val2) { ; CHECK-NEXT: ret <4 x ptr> [[OBJ_0_SROAVAL]] ; ; DEBUG-LABEL: @ptrLoadStoreTysPtr( -; DEBUG-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8, !dbg [[DBG543:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META538:![0-9]+]], metadata !DIExpression()), !dbg [[DBG543]] -; DEBUG-NEXT: [[OBJ:%.*]] = alloca <4 x ptr>, align 16, !dbg [[DBG544:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META539:![0-9]+]], metadata !DIExpression()), !dbg [[DBG544]] -; DEBUG-NEXT: store <4 x ptr> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG545:![0-9]+]] -; DEBUG-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG546:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META540:![0-9]+]], metadata !DIExpression()), !dbg [[DBG547:![0-9]+]] -; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG548:![0-9]+]] -; DEBUG-NEXT: store i64 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG548]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META541:![0-9]+]], metadata !DIExpression()), !dbg [[DBG549:![0-9]+]] -; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG550:![0-9]+]] -; DEBUG-NEXT: store i64 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG550]] -; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x ptr>, ptr [[OBJ]], align 16, !dbg [[DBG551:![0-9]+]] -; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x ptr> [[OBJ_0_SROAVAL]], metadata [[META542:![0-9]+]], metadata !DIExpression()), !dbg [[DBG551]] -; DEBUG-NEXT: ret <4 x ptr> [[OBJ_0_SROAVAL]], !dbg [[DBG552:![0-9]+]] +; DEBUG-NEXT: [[VAL0:%.*]] = load ptr, ptr [[INIT:%.*]], align 8, !dbg [[DBG541:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[VAL0]], metadata [[META536:![0-9]+]], metadata !DIExpression()), !dbg [[DBG541]] +; DEBUG-NEXT: [[OBJ:%.*]] = alloca <4 x ptr>, align 16, !dbg [[DBG542:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr [[OBJ]], metadata [[META537:![0-9]+]], metadata !DIExpression()), !dbg [[DBG542]] +; DEBUG-NEXT: store <4 x ptr> zeroinitializer, ptr [[OBJ]], align 16, !dbg [[DBG543:![0-9]+]] +; DEBUG-NEXT: store ptr [[VAL0]], ptr [[OBJ]], align 16, !dbg [[DBG544:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META538:![0-9]+]], metadata !DIExpression()), !dbg [[DBG545:![0-9]+]] +; DEBUG-NEXT: [[OBJ_8_PTR2_SROA_IDX:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 8, !dbg [[DBG546:![0-9]+]] +; DEBUG-NEXT: store i64 [[VAL2:%.*]], ptr [[OBJ_8_PTR2_SROA_IDX]], align 8, !dbg [[DBG546]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata ptr undef, metadata [[META539:![0-9]+]], metadata !DIExpression()), !dbg [[DBG547:![0-9]+]] +; DEBUG-NEXT: [[OBJ_12_PTR3_SROA_IDX:%.*]] = getelementptr 
inbounds i8, ptr [[OBJ]], i64 12, !dbg [[DBG548:![0-9]+]] +; DEBUG-NEXT: store i64 131072, ptr [[OBJ_12_PTR3_SROA_IDX]], align 4, !dbg [[DBG548]] +; DEBUG-NEXT: [[OBJ_0_SROAVAL:%.*]] = load <4 x ptr>, ptr [[OBJ]], align 16, !dbg [[DBG549:![0-9]+]] +; DEBUG-NEXT: call void @llvm.dbg.value(metadata <4 x ptr> [[OBJ_0_SROAVAL]], metadata [[META540:![0-9]+]], metadata !DIExpression()), !dbg [[DBG549]] +; DEBUG-NEXT: ret <4 x ptr> [[OBJ_0_SROAVAL]], !dbg [[DBG550:![0-9]+]] ; %val0 = load ptr, ptr %init, align 8 %obj = alloca <4 x ptr>, align 16 From c663c8b8839e8a3e780426c1ab4d0005df90a116 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 23 Jan 2024 17:23:52 -0800 Subject: [PATCH 720/843] [ELF,test] Improve dead-reloc-in-nonalloc.s Test an absolute relocation referencing a DSO symbol, relocating a non-SHF_ALLOC section. Also test --gc-sections. --- lld/test/ELF/dead-reloc-in-nonalloc.s | 35 ++++++++++++++++++--------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/lld/test/ELF/dead-reloc-in-nonalloc.s b/lld/test/ELF/dead-reloc-in-nonalloc.s index 3857ead6e55d0..145604eb883a9 100644 --- a/lld/test/ELF/dead-reloc-in-nonalloc.s +++ b/lld/test/ELF/dead-reloc-in-nonalloc.s @@ -1,14 +1,15 @@ # REQUIRES: x86 -## Test that -z dead-reloc-in-nonalloc= can customize the tombstone value we -## use for an absolute relocation referencing a discarded symbol. +## Test an absolute relocation referencing an undefined or DSO symbol, relocating +## a non-SHF_ALLOC section. Also test -z dead-reloc-in-nonalloc=. # RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o -# RUN: ld.lld --icf=all -z dead-reloc-in-nonalloc=.debug_info=0xaaaaaaaa \ -# RUN: -z dead-reloc-in-nonalloc=.not_debug=0xbbbbbbbb %t.o -o %t +# RUN: echo '.globl bar; bar = 42' | llvm-mc -filetype=obj -triple=x86_64 - -o %tabs.o +# RUN: ld.lld --gc-sections -z dead-reloc-in-nonalloc=.debug_info=0xaaaaaaaa \ +# RUN: -z dead-reloc-in-nonalloc=.not_debug=0xbbbbbbbb %t.o %tabs.o -o %t # RUN: llvm-objdump -s %t | FileCheck %s --check-prefixes=COMMON,AA ## 0xaaaaaaaa == 2863311530 -# RUN: ld.lld --icf=all -z dead-reloc-in-nonalloc=.debug_info=2863311530 \ -# RUN: -z dead-reloc-in-nonalloc=.not_debug=0xbbbbbbbb %t.o -o - | cmp %t - +# RUN: ld.lld --gc-sections -z dead-reloc-in-nonalloc=.debug_info=2863311530 \ +# RUN: -z dead-reloc-in-nonalloc=.not_debug=0xbbbbbbbb %t.o %tabs.o -o - | cmp %t - # COMMON: Contents of section .debug_addr: # COMMON-NEXT: 0000 [[ADDR:[0-9a-f]+]] 00000000 00000000 00000000 @@ -16,22 +17,30 @@ # AA: Contents of section .debug_info: # AA-NEXT: 0000 [[ADDR]] 00000000 aaaaaaaa 00000000 # AA: Contents of section .not_debug: -# AA-NEXT: 0000 bbbbbbbb +# AA-NEXT: 0000 bbbbbbbb bbbbbbbb 00000000 . ## Specifying zero can get a behavior similar to GNU ld. -# RUN: ld.lld --icf=all -z dead-reloc-in-nonalloc=.debug_info=0 %t.o -o %tzero +# RUN: ld.lld --icf=all -z dead-reloc-in-nonalloc=.debug_info=0 %t.o %tabs.o -o %tzero # RUN: llvm-objdump -s %tzero | FileCheck %s --check-prefixes=COMMON,ZERO # ZERO: Contents of section .debug_info: # ZERO-NEXT: 0000 {{[0-9a-f]+}}000 00000000 00000000 00000000 ## Glob works. -# RUN: ld.lld --icf=all -z dead-reloc-in-nonalloc='.debug_i*=0xaaaaaaaa' \ -# RUN: -z dead-reloc-in-nonalloc='[.]not_debug=0xbbbbbbbb' %t.o -o - | cmp %t - +# RUN: ld.lld --gc-sections -z dead-reloc-in-nonalloc='.debug_i*=0xaaaaaaaa' \ +# RUN: -z dead-reloc-in-nonalloc='[.]not_debug=0xbbbbbbbb' %t.o %tabs.o -o - | cmp %t - ## If a section matches multiple option. The last option wins. 
# RUN: ld.lld --icf=all -z dead-reloc-in-nonalloc='.debug_info=1' \ -# RUN: -z dead-reloc-in-nonalloc='.debug_i*=0' %t.o -o - | cmp %tzero - +# RUN: -z dead-reloc-in-nonalloc='.debug_i*=0' %t.o %tabs.o -o - | cmp %tzero - + +# RUN: llvm-mc -filetype=obj -triple=x86_64 %S/Inputs/shared.s -o %t1.o +# RUN: ld.lld -shared -soname=t1.so %t1.o -o %t1.so +# RUN: ld.lld --gc-sections %t.o %t1.so -o %tso +# RUN: llvm-objdump -s %tso | FileCheck %s --check-prefix=SHARED + +# SHARED: Contents of section .not_debug: +# SHARED-NEXT: 0000 08000000 00000000 00000000 . ## Test all possible invalid cases. # RUN: not ld.lld -z dead-reloc-in-nonalloc= 2>&1 | FileCheck %s --check-prefix=USAGE @@ -52,7 +61,7 @@ _start: ret -## .text.1 will be folded by ICF. +## .text.1 will be folded by ICF or discarded by --gc-sections. .section .text.1,"ax" ret @@ -67,3 +76,5 @@ _start: ## Test a non-.debug_ section. .section .not_debug .long .text.1+8 + + .quad bar From 7e50f006f7f652b9a5ac5ddd64deba5f1c9388a8 Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 24 Jan 2024 09:27:25 +0800 Subject: [PATCH 721/843] [NewPM][CodeGen][llc] Add NPM support (#70922) Add new pass manager support to `llc`. Users can use `--passes=pass1,pass2...` to run mir passes, and use `--enable-new-pm` to run default codegen pipeline. This patch is taken from [D83612](https://reviews.llvm.org/D83612), the original author is @yuanfang-chen. --------- Co-authored-by: Yuanfang Chen <455423+yuanfang-chen@users.noreply.github.com> --- .../include/llvm/CodeGen/CodeGenPassBuilder.h | 21 +- llvm/include/llvm/CodeGen/TargetPassConfig.h | 9 +- .../include/llvm/Target/CGPassBuilderOption.h | 2 +- llvm/lib/CodeGen/TargetPassConfig.cpp | 5 +- llvm/lib/Target/X86/CMakeLists.txt | 3 + llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp | 56 +++++ llvm/lib/Target/X86/X86TargetMachine.h | 6 + llvm/test/tools/llc/new-pm/lit.local.cfg | 2 + llvm/test/tools/llc/new-pm/option-conflict.ll | 3 + llvm/test/tools/llc/new-pm/pipeline.ll | 5 + llvm/test/tools/llc/new-pm/start-stop.ll | 4 + llvm/tools/llc/CMakeLists.txt | 3 + llvm/tools/llc/NewPMDriver.cpp | 236 ++++++++++++++++++ llvm/tools/llc/NewPMDriver.h | 49 ++++ llvm/tools/llc/llc.cpp | 68 ++--- llvm/unittests/CodeGen/CMakeLists.txt | 1 - .../CodeGen/CodeGenPassBuilderTest.cpp | 141 ----------- 17 files changed, 411 insertions(+), 203 deletions(-) create mode 100644 llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp create mode 100644 llvm/test/tools/llc/new-pm/lit.local.cfg create mode 100644 llvm/test/tools/llc/new-pm/option-conflict.ll create mode 100644 llvm/test/tools/llc/new-pm/pipeline.ll create mode 100644 llvm/test/tools/llc/new-pm/start-stop.ll create mode 100644 llvm/tools/llc/NewPMDriver.cpp create mode 100644 llvm/tools/llc/NewPMDriver.h delete mode 100644 llvm/unittests/CodeGen/CodeGenPassBuilderTest.cpp diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h index 12088f6fc35e0..96d6f891af4e2 100644 --- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ b/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h @@ -1113,30 +1113,13 @@ void CodeGenPassBuilder::addTargetRegisterAllocator( template void CodeGenPassBuilder::addRegAllocPass(AddMachinePass &addPass, bool Optimized) const { - if (Opt.RegAlloc == RegAllocType::Default) - // With no -regalloc= override, ask the target for a regalloc pass. 
- derived().addTargetRegisterAllocator(addPass, Optimized); - else if (Opt.RegAlloc == RegAllocType::Basic) - addPass(RABasicPass()); - else if (Opt.RegAlloc == RegAllocType::Fast) - addPass(RAFastPass()); - else if (Opt.RegAlloc == RegAllocType::Greedy) - addPass(RAGreedyPass()); - else if (Opt.RegAlloc == RegAllocType::PBQP) - addPass(RAPBQPPass()); - else - llvm_unreachable("unknonwn register allocator type"); + // TODO: Parse Opt.RegAlloc to add register allocator. } template Error CodeGenPassBuilder::addRegAssignmentFast( AddMachinePass &addPass) const { - if (Opt.RegAlloc != RegAllocType::Default && - Opt.RegAlloc != RegAllocType::Fast) - return make_error( - "Must use fast (default) register allocator for unoptimized regalloc.", - inconvertibleErrorCode()); - + // TODO: Ensure allocator is default or fast. addRegAllocPass(addPass, false); return Error::success(); } diff --git a/llvm/include/llvm/CodeGen/TargetPassConfig.h b/llvm/include/llvm/CodeGen/TargetPassConfig.h index de6a760c4e4fd..d00e0bed91a45 100644 --- a/llvm/include/llvm/CodeGen/TargetPassConfig.h +++ b/llvm/include/llvm/CodeGen/TargetPassConfig.h @@ -171,11 +171,10 @@ class TargetPassConfig : public ImmutablePass { /// set. static bool willCompleteCodeGenPipeline(); - /// If hasLimitedCodeGenPipeline is true, this method - /// returns a string with the name of the options, separated - /// by \p Separator that caused this pipeline to be limited. - static std::string - getLimitedCodeGenPipelineReason(const char *Separator = "/"); + /// If hasLimitedCodeGenPipeline is true, this method returns + /// a string with the name of the options that caused this + /// pipeline to be limited. + static std::string getLimitedCodeGenPipelineReason(); struct StartStopInfo { bool StartAfter; diff --git a/llvm/include/llvm/Target/CGPassBuilderOption.h b/llvm/include/llvm/Target/CGPassBuilderOption.h index 6b5c023a9b6ad..8ab6d63a00056 100644 --- a/llvm/include/llvm/Target/CGPassBuilderOption.h +++ b/llvm/include/llvm/Target/CGPassBuilderOption.h @@ -52,7 +52,7 @@ struct CGPassBuilderOption { bool RequiresCodeGenSCCOrder = false; RunOutliner EnableMachineOutliner = RunOutliner::TargetDefault; - RegAllocType RegAlloc = RegAllocType::Default; + StringRef RegAlloc = "default"; std::optional EnableGlobalISelAbort; std::string FSProfileFile; std::string FSRemappingFile; diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index 46697480db52a..599ca48189048 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -635,8 +635,7 @@ bool TargetPassConfig::hasLimitedCodeGenPipeline() { !willCompleteCodeGenPipeline(); } -std::string -TargetPassConfig::getLimitedCodeGenPipelineReason(const char *Separator) { +std::string TargetPassConfig::getLimitedCodeGenPipelineReason() { if (!hasLimitedCodeGenPipeline()) return std::string(); std::string Res; @@ -648,7 +647,7 @@ TargetPassConfig::getLimitedCodeGenPipelineReason(const char *Separator) { for (int Idx = 0; Idx < 4; ++Idx) if (!PassNames[Idx]->empty()) { if (!IsFirst) - Res += Separator; + Res += " and "; IsFirst = false; Res += OptNames[Idx]; } diff --git a/llvm/lib/Target/X86/CMakeLists.txt b/llvm/lib/Target/X86/CMakeLists.txt index 4d6300cad2a7a..610999f0cc3cf 100644 --- a/llvm/lib/Target/X86/CMakeLists.txt +++ b/llvm/lib/Target/X86/CMakeLists.txt @@ -29,6 +29,7 @@ set(sources X86CallFrameOptimization.cpp X86CallingConv.cpp X86CmovConversion.cpp + X86CodeGenPassBuilder.cpp X86DomainReassignment.cpp 
X86DiscriminateMemOps.cpp X86LowerTileCopy.cpp @@ -98,9 +99,11 @@ add_llvm_target(X86CodeGen ${sources} CodeGenTypes Core GlobalISel + IRPrinter Instrumentation MC ProfileData + Scalar SelectionDAG Support Target diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp new file mode 100644 index 0000000000000..616f777833e56 --- /dev/null +++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp @@ -0,0 +1,56 @@ +//===-- X86CodeGenPassBuilder.cpp ---------------------------------*- C++ -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// This file contains X86 CodeGen pipeline builder. +/// TODO: Port CodeGen passes to new pass manager. +//===----------------------------------------------------------------------===// + +#include "X86TargetMachine.h" + +#include "llvm/CodeGen/CodeGenPassBuilder.h" +#include "llvm/MC/MCStreamer.h" + +using namespace llvm; + +namespace { + +class X86CodeGenPassBuilder : public CodeGenPassBuilder { +public: + explicit X86CodeGenPassBuilder(LLVMTargetMachine &TM, + CGPassBuilderOption Opts, + PassInstrumentationCallbacks *PIC) + : CodeGenPassBuilder(TM, Opts, PIC) {} + void addPreISel(AddIRPass &addPass) const; + void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const; + Error addInstSelector(AddMachinePass &) const; +}; + +void X86CodeGenPassBuilder::addPreISel(AddIRPass &addPass) const { + // TODO: Add passes pre instruction selection. +} + +void X86CodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass, + CreateMCStreamer) const { + // TODO: Add AsmPrinter. +} + +Error X86CodeGenPassBuilder::addInstSelector(AddMachinePass &) const { + // TODO: Add instruction selector. 
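+  // Nothing is scheduled yet: with all three hooks stubbed out, the new-PM
+  // X86 pipeline is a skeleton that cannot select or emit code on its own.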
+ return Error::success(); +} + +} // namespace + +Error X86TargetMachine::buildCodeGenPipeline( + ModulePassManager &MPM, MachineFunctionPassManager &MFPM, + MachineFunctionAnalysisManager &, raw_pwrite_stream &Out, + raw_pwrite_stream *DwoOut, CodeGenFileType FileType, + CGPassBuilderOption Opt, PassInstrumentationCallbacks *PIC) { + auto CGPB = X86CodeGenPassBuilder(*this, Opt, PIC); + return CGPB.buildPipeline(MPM, MFPM, Out, DwoOut, FileType); +} diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h index 4836be4db0e8e..f31c971df9584 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.h +++ b/llvm/lib/Target/X86/X86TargetMachine.h @@ -58,6 +58,12 @@ class X86TargetMachine final : public LLVMTargetMachine { createMachineFunctionInfo(BumpPtrAllocator &Allocator, const Function &F, const TargetSubtargetInfo *STI) const override; + Error buildCodeGenPipeline(ModulePassManager &, MachineFunctionPassManager &, + MachineFunctionAnalysisManager &, + raw_pwrite_stream &, raw_pwrite_stream *, + CodeGenFileType, CGPassBuilderOption, + PassInstrumentationCallbacks *) override; + bool isJIT() const { return IsJIT; } bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override; diff --git a/llvm/test/tools/llc/new-pm/lit.local.cfg b/llvm/test/tools/llc/new-pm/lit.local.cfg new file mode 100644 index 0000000000000..42bf50dcc13c3 --- /dev/null +++ b/llvm/test/tools/llc/new-pm/lit.local.cfg @@ -0,0 +1,2 @@ +if not "X86" in config.root.targets: + config.unsupported = True diff --git a/llvm/test/tools/llc/new-pm/option-conflict.ll b/llvm/test/tools/llc/new-pm/option-conflict.ll new file mode 100644 index 0000000000000..5847a32058756 --- /dev/null +++ b/llvm/test/tools/llc/new-pm/option-conflict.ll @@ -0,0 +1,3 @@ +; RUN: not llc -mtriple=x86_64-pc-linux-gnu -passes=foo -start-before=mergeicmps -stop-after=gc-lowering -filetype=null %s 2>&1 | FileCheck %s + +; CHECK: warning: --passes cannot be used with start-before and stop-after. 
diff --git a/llvm/test/tools/llc/new-pm/pipeline.ll b/llvm/test/tools/llc/new-pm/pipeline.ll new file mode 100644 index 0000000000000..1ace5963e4ef8 --- /dev/null +++ b/llvm/test/tools/llc/new-pm/pipeline.ll @@ -0,0 +1,5 @@ +; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -filetype=null %s | FileCheck %s + +; CHECK: require,require +; CHECK: MachineSanitizerBinaryMetadata,FreeMachineFunctionPass + diff --git a/llvm/test/tools/llc/new-pm/start-stop.ll b/llvm/test/tools/llc/new-pm/start-stop.ll new file mode 100644 index 0000000000000..c25e45d1f7ab9 --- /dev/null +++ b/llvm/test/tools/llc/new-pm/start-stop.ll @@ -0,0 +1,4 @@ +; RUN: llc -mtriple=x86_64-pc-linux-gnu -enable-new-pm -print-pipeline-passes -start-before=mergeicmps -stop-after=gc-lowering -filetype=null %s | FileCheck --match-full-lines %s + +; CHECK: IR pipeline: function(mergeicmps,expand-memcmp,gc-lowering) + diff --git a/llvm/tools/llc/CMakeLists.txt b/llvm/tools/llc/CMakeLists.txt index 257d5b519f040..01825c6e4c64c 100644 --- a/llvm/tools/llc/CMakeLists.txt +++ b/llvm/tools/llc/CMakeLists.txt @@ -8,9 +8,11 @@ set(LLVM_LINK_COMPONENTS CodeGen CodeGenTypes Core + IRPrinter IRReader MC MIRParser + Passes Remarks ScalarOpts SelectionDAG @@ -23,6 +25,7 @@ set(LLVM_LINK_COMPONENTS add_llvm_tool(llc llc.cpp + NewPMDriver.cpp DEPENDS intrinsics_gen diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp new file mode 100644 index 0000000000000..13020f3dd07fe --- /dev/null +++ b/llvm/tools/llc/NewPMDriver.cpp @@ -0,0 +1,236 @@ +//===- NewPMDriver.cpp - Driver for llc using new PM ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// This file is just a split of the code that logically belongs in llc.cpp but +/// that includes the new pass manager headers. 
+/// +//===----------------------------------------------------------------------===// + +#include "NewPMDriver.h" +#include "llvm/Analysis/CGSCCPassManager.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/CodeGen/CodeGenPassBuilder.h" +#include "llvm/CodeGen/CommandFlags.h" +#include "llvm/CodeGen/MIRParser/MIRParser.h" +#include "llvm/CodeGen/MIRPrinter.h" +#include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachinePassManager.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/IR/DiagnosticInfo.h" +#include "llvm/IR/DiagnosticPrinter.h" +#include "llvm/IR/IRPrintingPasses.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/PassManager.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/Passes/PassBuilder.h" +#include "llvm/Passes/StandardInstrumentations.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/Error.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FormattedStream.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/WithColor.h" +#include "llvm/Target/CGPassBuilderOption.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Target/TargetOptions.h" +#include "llvm/Transforms/Scalar/LoopPassManager.h" +#include "llvm/Transforms/Utils/Cloning.h" + +namespace llvm { +extern cl::opt PrintPipelinePasses; +} // namespace llvm + +using namespace llvm; + +static cl::opt + RegAlloc("regalloc-npm", + cl::desc("Register allocator to use for new pass manager"), + cl::Hidden, cl::init("default")); + +static cl::opt + DebugPM("debug-pass-manager", cl::Hidden, + cl::desc("Print pass management debugging information")); + +bool LLCDiagnosticHandler::handleDiagnostics(const DiagnosticInfo &DI) { + DiagnosticHandler::handleDiagnostics(DI); + if (DI.getKind() == llvm::DK_SrcMgr) { + const auto &DISM = cast(DI); + const SMDiagnostic &SMD = DISM.getSMDiag(); + + SMD.print(nullptr, errs()); + + // For testing purposes, we print the LocCookie here. + if (DISM.isInlineAsmDiag() && DISM.getLocCookie()) + WithColor::note() << "!srcloc = " << DISM.getLocCookie() << "\n"; + + return true; + } + + if (auto *Remark = dyn_cast(&DI)) + if (!Remark->isEnabled()) + return true; + + DiagnosticPrinterRawOStream DP(errs()); + errs() << LLVMContext::getDiagnosticMessagePrefix(DI.getSeverity()) << ": "; + DI.print(DP); + errs() << "\n"; + return true; +} + +static llvm::ExitOnError ExitOnErr; + +static void RunPasses(bool BOS, ToolOutputFile *Out, Module *M, + LLVMContext &Context, SmallString<0> &Buffer, + ModulePassManager *MPM, ModuleAnalysisManager *MAM, + MachineFunctionPassManager &MFPM, + MachineFunctionAnalysisManager &MFAM) { + assert(M && "invalid input module!"); + + // Before executing passes, print the final values of the LLVM options. 
+ cl::PrintOptionValues(); + + if (MPM) { + assert(MAM && "expect a ModuleAnalysisManager!"); + MPM->run(*M, *MAM); + } + + ExitOnErr(MFPM.run(*M, MFAM)); + + if (Context.getDiagHandlerPtr()->HasErrors) + exit(1); + + if (BOS) + Out->os() << Buffer; +} + +int llvm::compileModuleWithNewPM( + StringRef Arg0, std::unique_ptr M, std::unique_ptr MIR, + std::unique_ptr Target, std::unique_ptr Out, + std::unique_ptr DwoOut, LLVMContext &Context, + const TargetLibraryInfoImpl &TLII, bool NoVerify, StringRef PassPipeline, + CodeGenFileType FileType) { + + if (!PassPipeline.empty() && TargetPassConfig::hasLimitedCodeGenPipeline()) { + WithColor::warning(errs(), Arg0) + << "--passes cannot be used with " + << TargetPassConfig::getLimitedCodeGenPipelineReason() << ".\n"; + return 1; + } + + LLVMTargetMachine &LLVMTM = static_cast(*Target); + + raw_pwrite_stream *OS = &Out->os(); + + // Manually do the buffering rather than using buffer_ostream, + // so we can memcmp the contents in CompileTwice mode in future. + SmallString<0> Buffer; + std::unique_ptr BOS; + if ((codegen::getFileType() != CodeGenFileType::AssemblyFile && + !Out->os().supportsSeeking())) { + BOS = std::make_unique(Buffer); + OS = BOS.get(); + } + + // Fetch options from TargetPassConfig + CGPassBuilderOption Opt = getCGPassBuilderOption(); + Opt.DisableVerify = NoVerify; + Opt.DebugPM = DebugPM; + Opt.RegAlloc = RegAlloc; + + PassInstrumentationCallbacks PIC; + StandardInstrumentations SI(Context, Opt.DebugPM); + SI.registerCallbacks(PIC); + registerCodeGenCallback(PIC, LLVMTM); + + LoopAnalysisManager LAM; + FunctionAnalysisManager FAM; + CGSCCAnalysisManager CGAM; + ModuleAnalysisManager MAM; + PassBuilder PB(Target.get(), PipelineTuningOptions(), std::nullopt, &PIC); + PB.registerModuleAnalyses(MAM); + PB.registerCGSCCAnalyses(CGAM); + PB.registerFunctionAnalyses(FAM); + PB.registerLoopAnalyses(LAM); + PB.crossRegisterProxies(LAM, FAM, CGAM, MAM); + + FAM.registerPass([&] { return TargetLibraryAnalysis(TLII); }); + MAM.registerPass([&] { return MachineModuleAnalysis(&LLVMTM); }); + + MachineFunctionAnalysisManager MFAM(FAM, MAM); + + if (!PassPipeline.empty()) { + // Construct a custom pass pipeline that starts after instruction + // selection. + + if (!MIR) { + WithColor::warning(errs(), Arg0) << "-passes is for .mir file only.\n"; + return 1; + } + + MachineFunctionPassManager MFPM; + ExitOnErr(PB.parsePassPipeline(MFPM, PassPipeline)); + MFPM.addPass(PrintMIRPass(*OS)); + MFPM.addPass(FreeMachineFunctionPass()); + + auto &MMI = MFAM.getResult(*M); + if (MIR->parseMachineFunctions(*M, MMI)) + return 1; + + RunPasses(BOS.get(), Out.get(), M.get(), Context, Buffer, nullptr, nullptr, + MFPM, MFAM); + } else { + ModulePassManager MPM; + MachineFunctionPassManager MFPM; + + ExitOnErr(LLVMTM.buildCodeGenPipeline(MPM, MFPM, MFAM, *OS, + DwoOut ? &DwoOut->os() : nullptr, + FileType, Opt, &PIC)); + + auto StartStopInfo = TargetPassConfig::getStartStopInfo(PIC); + assert(StartStopInfo && "Expect StartStopInfo!"); + // Add IR or MIR printing pass according the pass type. + + if (auto StopPassName = StartStopInfo->StopPass; !StopPassName.empty()) { + MFPM.addPass(PrintMIRPass(*OS)); + MFPM.addPass(FreeMachineFunctionPass()); + } + + if (PrintPipelinePasses) { + std::string IRPipeline; + raw_string_ostream IRSOS(IRPipeline); + MPM.printPipeline(IRSOS, [&PIC](StringRef ClassName) { + auto PassName = PIC.getPassNameForClassName(ClassName); + return PassName.empty() ? 
ClassName : PassName; + }); + outs() << "IR pipeline: " << IRPipeline << '\n'; + + std::string MIRPipeline; + raw_string_ostream MIRSOS(MIRPipeline); + MFPM.printPipeline(MIRSOS, [&PIC](StringRef ClassName) { + auto PassName = PIC.getPassNameForClassName(ClassName); + return PassName.empty() ? ClassName : PassName; + }); + outs() << "MIR pipeline: " << MIRPipeline << '\n'; + return 0; + } + + RunPasses(BOS.get(), Out.get(), M.get(), Context, Buffer, &MPM, &MAM, MFPM, + MFAM); + } + + // Declare success. + Out->keep(); + if (DwoOut) + DwoOut->keep(); + + return 0; +} diff --git a/llvm/tools/llc/NewPMDriver.h b/llvm/tools/llc/NewPMDriver.h new file mode 100644 index 0000000000000..b0beeaf596c8f --- /dev/null +++ b/llvm/tools/llc/NewPMDriver.h @@ -0,0 +1,49 @@ +//===- NewPMDriver.h - Function to drive llc with the new PM ----*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// \file +/// +/// A single function which is called to drive the llc behavior for the new +/// PassManager. +/// +/// This is only in a separate TU with a header to avoid including all of the +/// old pass manager headers and the new pass manager headers into the same +/// file. Eventually all of the routines here will get folded back into +/// llc.cpp. +/// +//===----------------------------------------------------------------------===// +#ifndef LLVM_TOOLS_LLC_NEWPMDRIVER_H +#define LLVM_TOOLS_LLC_NEWPMDRIVER_H + +#include "llvm/IR/DiagnosticHandler.h" +#include "llvm/Support/CodeGen.h" +#include +#include + +namespace llvm { +class Module; +class TargetLibraryInfoImpl; +class TargetMachine; +class ToolOutputFile; +class LLVMContext; +class MIRParser; + +struct LLCDiagnosticHandler : public DiagnosticHandler { + bool handleDiagnostics(const DiagnosticInfo &DI) override; +}; + +int compileModuleWithNewPM(StringRef Arg0, std::unique_ptr M, + std::unique_ptr MIR, + std::unique_ptr Target, + std::unique_ptr Out, + std::unique_ptr DwoOut, + LLVMContext &Context, + const TargetLibraryInfoImpl &TLII, bool NoVerify, + StringRef PassPipeline, CodeGenFileType FileType); +} // namespace llvm + +#endif diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp index 02187f7048a10..d76d89eae3b18 100644 --- a/llvm/tools/llc/llc.cpp +++ b/llvm/tools/llc/llc.cpp @@ -12,6 +12,7 @@ // //===----------------------------------------------------------------------===// +#include "NewPMDriver.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/ScopeExit.h" #include "llvm/Analysis/TargetLibraryInfo.h" @@ -186,6 +187,21 @@ static cl::opt RemarksFormat( cl::desc("The format used for serializing remarks (default: YAML)"), cl::value_desc("format"), cl::init("yaml")); +static cl::opt EnableNewPassManager( + "enable-new-pm", cl::desc("Enable the new pass manager"), cl::init(false)); + +// This flag specifies a textual description of the optimization pass pipeline +// to run over the module. This flag switches opt to use the new pass manager +// infrastructure, completely disabling all of the flags specific to the old +// pass management. +static cl::opt PassPipeline( + "passes", + cl::desc( + "A textual description of the pass pipeline. 
+        "A textual description of the pass pipeline. To have analysis passes "
+        "available before a certain pass, add 'require'."));
+static cl::alias PassPipeline2("p", cl::aliasopt(PassPipeline),
+                               cl::desc("Alias for -passes"));
+
 static cl::opt<bool> TryUseNewDbgInfoFormat(
     "try-experimental-debuginfo-iterators",
     cl::desc("Enable debuginfo iterator positions, if they're built in"),
@@ -306,34 +322,6 @@ static std::unique_ptr<ToolOutputFile> GetOutputStream(const char *TargetName,
   return FDOut;
 }
 
-struct LLCDiagnosticHandler : public DiagnosticHandler {
-  bool handleDiagnostics(const DiagnosticInfo &DI) override {
-    DiagnosticHandler::handleDiagnostics(DI);
-    if (DI.getKind() == llvm::DK_SrcMgr) {
-      const auto &DISM = cast<DiagnosticInfoSrcMgr>(DI);
-      const SMDiagnostic &SMD = DISM.getSMDiag();
-
-      SMD.print(nullptr, errs());
-
-      // For testing purposes, we print the LocCookie here.
-      if (DISM.isInlineAsmDiag() && DISM.getLocCookie())
-        WithColor::note() << "!srcloc = " << DISM.getLocCookie() << "\n";
-
-      return true;
-    }
-
-    if (auto *Remark = dyn_cast<DiagnosticInfoOptimizationBase>(&DI))
-      if (!Remark->isEnabled())
-        return true;
-
-    DiagnosticPrinterRawOStream DP(errs());
-    errs() << LLVMContext::getDiagnosticMessagePrefix(DI.getSeverity()) << ": ";
-    DI.print(DP);
-    errs() << "\n";
-    return true;
-  }
-};
-
 // main - Entry point for the llc compiler.
 //
 int main(int argc, char **argv) {
@@ -377,6 +365,13 @@ int main(int argc, char **argv) {
 
   cl::ParseCommandLineOptions(argc, argv, "llvm system compiler\n");
 
+  if (!PassPipeline.empty() && !getRunPassNames().empty()) {
+    errs() << "The `llc -run-pass=...` syntax for the new pass manager is "
+              "not supported, please use `llc -passes=<pipeline>` (or the `-p` "
+              "alias for a more concise version).\n";
+    return 1;
+  }
+
   // RemoveDIs debug-info transition: tests may request that we /try/ to use the
   // new debug-info format, if it's built in.
 #ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS
@@ -642,16 +637,12 @@ static int compileModule(char **argv, LLVMContext &Context) {
       reportError(EC.message(), SplitDwarfOutputFile);
   }
 
-  // Build up all of the passes that we want to do to the module.
-  legacy::PassManager PM;
-
   // Add an appropriate TargetLibraryInfo pass for the module's triple.
   TargetLibraryInfoImpl TLII(Triple(M->getTargetTriple()));
 
   // The -disable-simplify-libcalls flag actually disables all builtin optzns.
   if (DisableSimplifyLibCalls)
     TLII.disableAllFunctions();
-  PM.add(new TargetLibraryInfoWrapperPass(TLII));
 
   // Verify module immediately to catch problems before doInitialization() is
   // called on any passes.
@@ -667,6 +658,17 @@ static int compileModule(char **argv, LLVMContext &Context) {
     WithColor::warning(errs(), argv[0])
         << ": warning: ignoring -mc-relax-all because filetype != obj";
 
+  if (EnableNewPassManager || !PassPipeline.empty()) {
+    return compileModuleWithNewPM(argv[0], std::move(M), std::move(MIR),
+                                  std::move(Target), std::move(Out),
+                                  std::move(DwoOut), Context, TLII, NoVerify,
+                                  PassPipeline, codegen::getFileType());
+  }
+
+  // Build up all of the passes that we want to do to the module.
+  legacy::PassManager PM;
+  PM.add(new TargetLibraryInfoWrapperPass(TLII));
+
   {
     raw_pwrite_stream *OS = &Out->os();
 
@@ -700,7 +702,7 @@ static int compileModule(char **argv, LLVMContext &Context) {
       if (TPC.hasLimitedCodeGenPipeline()) {
         WithColor::warning(errs(), argv[0])
             << "run-pass cannot be used with "
-            << TPC.getLimitedCodeGenPipelineReason(" and ") << ".\n";
+            << TPC.getLimitedCodeGenPipelineReason() << ".\n";
         delete PTPC;
         delete MMIWP;
         return 1;
diff --git a/llvm/unittests/CodeGen/CMakeLists.txt b/llvm/unittests/CodeGen/CMakeLists.txt
index c78cbfcc28193..6140e0d6fb370 100644
--- a/llvm/unittests/CodeGen/CMakeLists.txt
+++ b/llvm/unittests/CodeGen/CMakeLists.txt
@@ -25,7 +25,6 @@ add_llvm_unittest(CodeGenTests
   AMDGPUMetadataTest.cpp
   AsmPrinterDwarfTest.cpp
   CCStateTest.cpp
-  CodeGenPassBuilderTest.cpp
   DIEHashTest.cpp
   DIETest.cpp
   DwarfStringPoolEntryRefTest.cpp
diff --git a/llvm/unittests/CodeGen/CodeGenPassBuilderTest.cpp b/llvm/unittests/CodeGen/CodeGenPassBuilderTest.cpp
deleted file mode 100644
index d6ec393155cf0..0000000000000
--- a/llvm/unittests/CodeGen/CodeGenPassBuilderTest.cpp
+++ /dev/null
@@ -1,141 +0,0 @@
-//===- llvm/unittest/CodeGen/CodeGenPassBuilderTest.cpp -------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/CodeGen/CodeGenPassBuilder.h"
-#include "llvm/CodeGen/MachinePassManager.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Passes/PassBuilder.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/TargetParser/Host.h"
-#include "gtest/gtest.h"
-#include <string>
-
-using namespace llvm;
-
-namespace {
-
-class DummyCodeGenPassBuilder
-    : public CodeGenPassBuilder<DummyCodeGenPassBuilder, LLVMTargetMachine> {
-public:
-  DummyCodeGenPassBuilder(LLVMTargetMachine &TM, CGPassBuilderOption Opts,
-                          PassInstrumentationCallbacks *PIC)
-      : CodeGenPassBuilder(TM, Opts, PIC){};
-
-  void addPreISel(AddIRPass &addPass) const {
-    addPass(NoOpModulePass());
-    addPass(NoOpFunctionPass());
-    addPass(NoOpFunctionPass());
-    addPass(NoOpFunctionPass());
-    addPass(NoOpModulePass());
-    addPass(NoOpFunctionPass());
-  }
-
-  void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const {}
-
-  Error addInstSelector(AddMachinePass &) const { return Error::success(); }
-};
-
-class CodeGenPassBuilderTest : public testing::Test {
-public:
-  std::unique_ptr<LLVMTargetMachine> TM;
-
-  static void SetUpTestCase() {
-    InitializeAllTargets();
-    InitializeAllTargetMCs();
-
-    // TODO: Move this test to normal lit test when llc supports new pm.
-    static const char *argv[] = {
-        "test",
-        "-print-pipeline-passes",
-    };
-    int argc = std::size(argv);
-    cl::ParseCommandLineOptions(argc, argv);
-  }
-
-  void SetUp() override {
-    std::string TripleName = Triple::normalize("x86_64-pc-linux-gnu");
-    std::string Error;
-    const Target *TheTarget = TargetRegistry::lookupTarget(TripleName, Error);
-    if (!TheTarget)
-      GTEST_SKIP();
-
-    TargetOptions Options;
-    TM = std::unique_ptr<LLVMTargetMachine>(static_cast<LLVMTargetMachine *>(
-        TheTarget->createTargetMachine("", "", "", Options, std::nullopt)));
-    if (!TM)
-      GTEST_SKIP();
-  }
-};
-
-TEST_F(CodeGenPassBuilderTest, basic) {
-  LoopAnalysisManager LAM;
-  FunctionAnalysisManager FAM;
-  CGSCCAnalysisManager CGAM;
-  ModuleAnalysisManager MAM;
-
-  PassInstrumentationCallbacks PIC;
-  DummyCodeGenPassBuilder CGPB(*TM, getCGPassBuilderOption(), &PIC);
-  PipelineTuningOptions PTO;
-  PassBuilder PB(TM.get(), PTO, std::nullopt, &PIC);
-
-  PB.registerModuleAnalyses(MAM);
-  PB.registerCGSCCAnalyses(CGAM);
-  PB.registerFunctionAnalyses(FAM);
-  PB.registerLoopAnalyses(LAM);
-  PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);
-
-  ModulePassManager MPM;
-  MachineFunctionPassManager MFPM;
-  Error Err =
-      CGPB.buildPipeline(MPM, MFPM, outs(), nullptr, CodeGenFileType::Null);
-  EXPECT_FALSE(Err);
-
-  std::string IRPipeline;
-  raw_string_ostream IROS(IRPipeline);
-  MPM.printPipeline(IROS, [&PIC](StringRef Name) {
-    auto PassName = PIC.getPassNameForClassName(Name);
-    return PassName.empty() ? Name : PassName;
-  });
-  const char ExpectedIRPipeline[] =
-      "no-op-module,function(no-op-function,"
-      "no-op-function,no-op-function),no-op-module";
-  // TODO: Move this test to normal lit test when llc supports new pm.
-  EXPECT_TRUE(StringRef(IRPipeline).contains(ExpectedIRPipeline));
-
-  std::string MIRPipeline;
-  raw_string_ostream MIROS(MIRPipeline);
-  MFPM.printPipeline(MIROS, [&PIC](StringRef Name) {
-    auto PassName = PIC.getPassNameForClassName(Name);
-    return PassName.empty() ? Name : PassName;
-  });
-  const char ExpectedMIRPipeline[] =
-      "FinalizeISelPass,EarlyTailDuplicatePass,OptimizePHIsPass,"
-      "StackColoringPass,LocalStackSlotPass,DeadMachineInstructionElimPass,"
-      "EarlyMachineLICMPass,MachineCSEPass,MachineSinkingPass,"
-      "PeepholeOptimizerPass,DeadMachineInstructionElimPass,"
-      "DetectDeadLanesPass,ProcessImplicitDefsPass,PHIEliminationPass,"
-      "TwoAddressInstructionPass,RegisterCoalescerPass,"
-      "RenameIndependentSubregsPass,MachineSchedulerPass,RAGreedyPass,"
-      "VirtRegRewriterPass,StackSlotColoringPass,"
-      "RemoveRedundantDebugValuesPass,PostRAMachineSinkingPass,ShrinkWrapPass,"
-      "PrologEpilogInserterPass,BranchFolderPass,TailDuplicatePass,"
-      "MachineLateInstrsCleanupPass,MachineCopyPropagationPass,"
-      "ExpandPostRAPseudosPass,PostRASchedulerPass,MachineBlockPlacementPass,"
-      "FEntryInserterPass,XRayInstrumentationPass,PatchableFunctionPass,"
-      "FuncletLayoutPass,StackMapLivenessPass,LiveDebugValuesPass,"
-      "MachineSanitizerBinaryMetadata,FreeMachineFunctionPass";
-  // TODO: Check pipeline string when all pass names are populated.
-  // TODO: Move this test to normal lit test when llc supports new pm.
-  EXPECT_EQ(MIRPipeline, ExpectedMIRPipeline);
-}
-
-} // namespace

From 230c13d59d0843c3b738920b85c341cc78a61fa9 Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan
Date: Wed, 24 Jan 2024 07:08:43 +0530
Subject: [PATCH 722/843] [AMDGPU] Pick available high VGPR for CSR SGPR
 spilling (#78669)

CSR SGPR spilling currently uses the first available physical VGPRs.
This imposes high register pressure when the register allocator later
tries to form large VGPR tuples within the default register budget.

This patch changes the spilling strategy to pick VGPRs in reverse
order: the highest available VGPR is taken first and, after register
allocation, the spill VGPRs are shifted back down to the lowest
available range. The low VGPRs therefore stay free during allocation,
which makes it far more likely that a long run of contiguous registers
can be found.
---
 llvm/lib/Target/AMDGPU/SIFrameLowering.cpp    |    5 +-
 llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp  |    3 +-
 .../Target/AMDGPU/SIMachineFunctionInfo.cpp   |   54 +-
 .../lib/Target/AMDGPU/SIMachineFunctionInfo.h |    9 +-
 .../amdgpu-simplify-libcall-pow-codegen.ll    |  440 +++----
 llvm/test/CodeGen/AMDGPU/bf16.ll              |  898 ++++++------
 .../test/CodeGen/AMDGPU/callee-frame-setup.ll |   12 +-
 .../AMDGPU/dwarf-multi-register-use-crash.ll  |   80 +-
 .../AMDGPU/gfx-callable-argument-types.ll     | 1026 +++++++--------
 .../gfx-callable-preserved-registers.ll       |  156 +--
 .../AMDGPU/gfx-callable-return-types.ll       |  474 ++++---
 .../AMDGPU/global_atomics_i32_system.ll       |  806 ++++++------
 .../AMDGPU/global_atomics_i64_system.ll       | 1052 ++++++++-------
 .../identical-subrange-spill-infloop.ll       |  556 ++++----
 llvm/test/CodeGen/AMDGPU/indirect-call.ll     |  284 ++--
 .../CodeGen/AMDGPU/insert-waitcnts-crash.ll   |    8 +-
 llvm/test/CodeGen/AMDGPU/ipra.ll              |    8 +-
 .../CodeGen/AMDGPU/mul24-pass-ordering.ll     |   54 +-
 llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll |   52 +-
 .../AMDGPU/sgpr-spill-overlap-wwm-reserve.mir |  314 ++---
 .../AMDGPU/sgpr-spills-split-regalloc.ll      |   44 +-
 llvm/test/CodeGen/AMDGPU/sibling-call.ll      |    8 +-
 .../AMDGPU/spill-sgpr-csr-live-ins.mir        |    4 +-
 .../AMDGPU/spill-sgpr-to-virtual-vgpr.mir     |   66 +-
 .../spill_more_than_wavesize_csr_sgprs.ll     |    8 +-
 .../CodeGen/AMDGPU/stacksave_stackrestore.ll  |   28 +-
 .../AMDGPU/strictfp_f16_abi_promote.ll        |  136 +-
 .../AMDGPU/unstructured-cfg-def-use-issue.ll  |  250 ++--
 .../AMDGPU/vgpr-large-tuple-alloc-error.ll    | 1169 ++++++++++-------
 .../CodeGen/AMDGPU/vgpr-tuple-allocation.ll   |  264 ++--
 .../test/CodeGen/AMDGPU/wwm-reserved-spill.ll |  476 +++----
 31 files changed, 4513 insertions(+), 4231 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index a02c2a4659082..9d062eb156d5c 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -95,7 +95,8 @@ static void getVGPRSpillLaneOrTempRegister(
       TargetStackID::SGPRSpill);
 
   if (TRI->spillSGPRToVGPR() &&
-      MFI->allocateSGPRSpillToVGPRLane(MF, FI, /* IsPrologEpilog */ true)) {
+      MFI->allocateSGPRSpillToVGPRLane(MF, FI, /*SpillToPhysVGPRLane=*/true,
+                                       /*IsPrologEpilog=*/true)) {
     // 2: There's no free lane to spill, and no free register to save the
     // SGPR, so we're forced to take another VGPR to use for the spill.
     MFI->addToPrologEpilogSGPRSpills(
@@ -1560,6 +1561,8 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
   if (MFI->isChainFunction() && !MF.getFrameInfo().hasTailCall())
     return;
 
+  MFI->shiftSpillPhysVGPRsToLowestRange(MF);
+
   TargetFrameLowering::determineCalleeSaves(MF, SavedVGPRs, RS);
   if (MFI->isEntryFunction())
     return;
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 70ffb8ea0a622..4b13825040ebe 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -368,7 +368,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
       // regalloc aware CFI generation to insert new CFIs along with the
      // intermediate spills is implemented. No such support currently
      // exists in the LLVM compiler.
-      if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) {
+      if (FuncInfo->allocateSGPRSpillToVGPRLane(
+              MF, FI, /*SpillToPhysVGPRLane=*/true)) {
         bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex(
             MI, FI, nullptr, Indexes, LIS, true);
         if (!Spilled)
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index e8142244b7db6..b94d143a75e5e 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -312,6 +312,33 @@ bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
   return false;
 }
 
+void SIMachineFunctionInfo::shiftSpillPhysVGPRsToLowestRange(
+    MachineFunction &MF) {
+  const SIRegisterInfo *TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+  for (unsigned I = 0, E = SpillPhysVGPRs.size(); I < E; ++I) {
+    Register Reg = SpillPhysVGPRs[I];
+    Register NewReg =
+        TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+    if (!NewReg || NewReg >= Reg)
+      break;
+
+    MRI.replaceRegWith(Reg, NewReg);
+
+    // Update various tables with the new VGPR.
+    SpillPhysVGPRs[I] = NewReg;
+    WWMReservedRegs.remove(Reg);
+    WWMReservedRegs.insert(NewReg);
+    WWMSpills.insert(std::make_pair(NewReg, WWMSpills[Reg]));
+    WWMSpills.erase(Reg);
+
+    for (MachineBasicBlock &MBB : MF) {
+      MBB.removeLiveIn(Reg);
+      MBB.sortUniqueLiveIns();
+    }
+  }
+}
+
 bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
     MachineFunction &MF, int FI, unsigned LaneIndex) {
   MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -329,13 +356,17 @@ bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills(
 }
 
 bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
-    MachineFunction &MF, int FI, unsigned LaneIndex) {
+    MachineFunction &MF, int FI, unsigned LaneIndex, bool IsPrologEpilog) {
   const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
   const SIRegisterInfo *TRI = ST.getRegisterInfo();
   MachineRegisterInfo &MRI = MF.getRegInfo();
   Register LaneVGPR;
   if (!LaneIndex) {
-    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
+    // Find the highest available register if called before RA to ensure the
+    // lowest registers are available for allocation. The LaneVGPR, in that
+    // case, will be shifted back to the lowest range after VGPR allocation.
+    LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF,
+                                       !IsPrologEpilog);
     if (LaneVGPR == AMDGPU::NoRegister) {
       // We have no VGPRs left for spilling SGPRs. Reset because we will not
      // partially spill the SGPR to VGPRs.
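To make the reverse-order scheme concrete, here is a small stand-alone sketch
of the two phases implemented above; it is illustrative only, and the names
NUM_VGPRS, pickSpillReg, and shiftSpillRegsDown are invented for the example,
not part of the AMDGPU backend API. Before register allocation a spill grabs
the highest free VGPR, and afterwards every spill VGPR is renumbered down to
the lowest free one, mirroring what allocatePhysicalVGPRForSGPRSpills and
shiftSpillPhysVGPRsToLowestRange do for real.

// Toy model of the CSR SGPR spill-VGPR strategy (hypothetical names; this is
// not the SIMachineFunctionInfo API).
#include <cstdio>
#include <set>
#include <vector>

constexpr int NUM_VGPRS = 16; // pretend register file: v0..v15

// Phase 1 (before regalloc): take the *highest* free VGPR so that the low,
// contiguous registers v0..vN remain available to the allocator.
int pickSpillReg(const std::set<int> &Used) {
  for (int R = NUM_VGPRS - 1; R >= 0; --R)
    if (!Used.count(R))
      return R;
  return -1; // out of registers
}

// Phase 2 (after regalloc): move each spill VGPR down to the lowest register
// number that is still free, compacting the used range.
void shiftSpillRegsDown(std::vector<int> &SpillRegs, std::set<int> &Used) {
  for (int &R : SpillRegs) {
    for (int Low = 0; Low < R; ++Low) {
      if (!Used.count(Low)) {
        Used.erase(R);
        Used.insert(Low);
        R = Low;
        break;
      }
    }
  }
}

int main() {
  std::set<int> Used;         // registers taken so far
  std::vector<int> SpillRegs; // VGPRs reserved for CSR SGPR spills

  // Reserve two spill VGPRs before allocation: they get v15 and v14.
  for (int I = 0; I < 2; ++I) {
    int R = pickSpillReg(Used);
    Used.insert(R);
    SpillRegs.push_back(R);
  }

  // Register allocation can now form a contiguous tuple in v0..v7.
  for (int R = 0; R < 8; ++R)
    Used.insert(R);

  shiftSpillRegsDown(SpillRegs, Used); // spills end up at v8 and v9
  for (int R : SpillRegs)
    std::printf("spill VGPR -> v%d\n", R);
  return 0;
}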
@@ -359,12 +390,12 @@ bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills(
   return true;
 }
 
-bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
-                                                        int FI,
-                                                        bool IsPrologEpilog) {
+bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(
+    MachineFunction &MF, int FI, bool SpillToPhysVGPRLane,
+    bool IsPrologEpilog) {
   std::vector<SIRegisterInfo::SpilledReg> &SpillLanes =
-      IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI]
-                     : SGPRSpillsToVirtualVGPRLanes[FI];
+      SpillToPhysVGPRLane ? SGPRSpillsToPhysicalVGPRLanes[FI]
+                          : SGPRSpillsToVirtualVGPRLanes[FI];
 
   // This has already been allocated.
   if (!SpillLanes.empty())
@@ -384,14 +415,15 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPRLane(MachineFunction &MF,
   assert(ST.getRegisterInfo()->spillSGPRToVGPR() &&
         "not spilling SGPRs to VGPRs");
 
-  unsigned &NumSpillLanes =
-      IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes;
+  unsigned &NumSpillLanes = SpillToPhysVGPRLane ? NumPhysicalVGPRSpillLanes
+                                                : NumVirtualVGPRSpillLanes;
 
   for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) {
     unsigned LaneIndex = (NumSpillLanes % WaveSize);
 
-    bool Allocated = IsPrologEpilog
-                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex)
+    bool Allocated = SpillToPhysVGPRLane
+                         ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex,
+                                                             IsPrologEpilog)
                          : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex);
     if (!Allocated) {
       NumSpillLanes -= I;
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ecc31fbd9dd3d..9ff66a094f991 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -548,7 +548,8 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI,
                                         unsigned LaneIndex);
   bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI,
-                                         unsigned LaneIndex);
+                                         unsigned LaneIndex,
+                                         bool IsPrologEpilog);
 
 public:
   Register getVGPRForAGPRCopy() const {
@@ -588,6 +589,7 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
   }
 
   ArrayRef<Register> getSGPRSpillVGPRs() const { return SpillVGPRs; }
+
   const WWMSpillsMap &getWWMSpills() const { return WWMSpills; }
   const ReservedRegSet &getWWMReservedRegs() const { return WWMReservedRegs; }
 
@@ -702,7 +704,12 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction,
     I->second.IsDead = true;
   }
 
+  // Shift the physical VGPRs used for CSR SGPR spilling, which were picked
+  // from the highest available range, down to the lowest available range.
+ void shiftSpillPhysVGPRsToLowestRange(MachineFunction &MF); + bool allocateSGPRSpillToVGPRLane(MachineFunction &MF, int FI, + bool SpillToPhysVGPRLane = false, bool IsPrologEpilog = false); bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR); diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll index e65eca7810610..bdd7ff11fde63 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pow-codegen.ll @@ -116,38 +116,38 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s16, 14 +; CHECK-NEXT: v_writelane_b32 v43, s30, 0 +; CHECK-NEXT: v_writelane_b32 v43, s31, 1 +; CHECK-NEXT: v_writelane_b32 v43, s34, 2 +; CHECK-NEXT: v_writelane_b32 v43, s35, 3 +; CHECK-NEXT: v_writelane_b32 v43, s36, 4 +; CHECK-NEXT: v_writelane_b32 v43, s37, 5 +; CHECK-NEXT: v_writelane_b32 v43, s38, 6 +; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 +; CHECK-NEXT: v_writelane_b32 v43, s40, 8 +; CHECK-NEXT: v_writelane_b32 v43, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v43, v1 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43 +; CHECK-NEXT: v_writelane_b32 v43, s42, 10 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v43, s43, 11 +; CHECK-NEXT: v_mov_b32_e32 v42, v1 +; CHECK-NEXT: v_writelane_b32 v43, s44, 12 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v2 +; CHECK-NEXT: v_mov_b32_e32 v41, v2 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 
s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 @@ -156,7 +156,7 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 +; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 @@ -170,32 +170,32 @@ define double @test_pow_fast_f64__integral_y(double %x, i32 %y.i) { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42 -; CHECK-NEXT: v_and_b32_e32 v2, v2, v43 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v41 +; CHECK-NEXT: v_and_b32_e32 v2, v2, v42 +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 +; CHECK-NEXT: v_readlane_b32 s45, v43, 13 +; CHECK-NEXT: v_readlane_b32 s44, v43, 12 +; CHECK-NEXT: v_readlane_b32 s43, v43, 11 +; CHECK-NEXT: v_readlane_b32 s42, v43, 10 +; CHECK-NEXT: v_readlane_b32 s41, v43, 9 +; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s39, v43, 7 +; CHECK-NEXT: v_readlane_b32 s38, v43, 6 +; CHECK-NEXT: v_readlane_b32 s37, v43, 5 +; CHECK-NEXT: v_readlane_b32 s36, v43, 4 +; CHECK-NEXT: v_readlane_b32 s35, v43, 3 +; CHECK-NEXT: v_readlane_b32 s34, v43, 2 +; CHECK-NEXT: v_readlane_b32 s31, v43, 1 +; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 @@ -257,37 +257,37 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; 
CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s16, 14 +; CHECK-NEXT: v_writelane_b32 v43, s30, 0 +; CHECK-NEXT: v_writelane_b32 v43, s31, 1 +; CHECK-NEXT: v_writelane_b32 v43, s34, 2 +; CHECK-NEXT: v_writelane_b32 v43, s35, 3 +; CHECK-NEXT: v_writelane_b32 v43, s36, 4 +; CHECK-NEXT: v_writelane_b32 v43, s37, 5 +; CHECK-NEXT: v_writelane_b32 v43, s38, 6 +; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 +; CHECK-NEXT: v_writelane_b32 v43, s40, 8 +; CHECK-NEXT: v_writelane_b32 v43, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 +; CHECK-NEXT: v_writelane_b32 v43, s42, 10 +; CHECK-NEXT: v_writelane_b32 v43, s43, 11 +; CHECK-NEXT: v_writelane_b32 v43, s44, 12 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v43, v31 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: v_mov_b32_e32 v42, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v3 -; CHECK-NEXT: v_mov_b32_e32 v41, v2 +; CHECK-NEXT: v_mov_b32_e32 v41, v3 +; CHECK-NEXT: v_mov_b32_e32 v40, v2 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 @@ -296,7 +296,7 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[41:42] +; CHECK-NEXT: v_mul_f64 v[0:1], v[0:1], v[40:41] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 @@ -309,29 +309,29 @@ define double @test_powr_fast_f64(double %x, double %y) { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v43 +; CHECK-NEXT: v_mov_b32_e32 v31, v42 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, 
v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s45, v43, 13 +; CHECK-NEXT: v_readlane_b32 s44, v43, 12 +; CHECK-NEXT: v_readlane_b32 s43, v43, 11 +; CHECK-NEXT: v_readlane_b32 s42, v43, 10 +; CHECK-NEXT: v_readlane_b32 s41, v43, 9 +; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s39, v43, 7 +; CHECK-NEXT: v_readlane_b32 s38, v43, 6 +; CHECK-NEXT: v_readlane_b32 s37, v43, 5 +; CHECK-NEXT: v_readlane_b32 s36, v43, 4 +; CHECK-NEXT: v_readlane_b32 s35, v43, 3 +; CHECK-NEXT: v_readlane_b32 s34, v43, 2 +; CHECK-NEXT: v_readlane_b32 s31, v43, 1 +; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 @@ -400,38 +400,38 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s16, 14 +; CHECK-NEXT: v_writelane_b32 v43, s30, 0 +; CHECK-NEXT: v_writelane_b32 v43, s31, 1 +; CHECK-NEXT: v_writelane_b32 v43, s34, 2 +; CHECK-NEXT: v_writelane_b32 v43, s35, 3 +; CHECK-NEXT: v_writelane_b32 v43, s36, 4 +; CHECK-NEXT: v_writelane_b32 v43, s37, 5 +; CHECK-NEXT: v_writelane_b32 v43, s38, 6 +; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 +; CHECK-NEXT: v_writelane_b32 v43, s40, 8 +; CHECK-NEXT: v_writelane_b32 v43, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 
4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v43, v1 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v43 +; CHECK-NEXT: v_writelane_b32 v43, s42, 10 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v43, s43, 11 +; CHECK-NEXT: v_mov_b32_e32 v42, v1 +; CHECK-NEXT: v_writelane_b32 v43, s44, 12 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] -; CHECK-NEXT: v_mov_b32_e32 v42, v2 +; CHECK-NEXT: v_mov_b32_e32 v41, v2 ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 ; CHECK-NEXT: s_mov_b32 s44, s13 @@ -440,7 +440,7 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 +; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 @@ -454,32 +454,32 @@ define double @test_pown_fast_f64(double %x, i32 %y) { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v42 -; CHECK-NEXT: v_and_b32_e32 v2, v2, v43 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_lshlrev_b32_e32 v2, 31, v41 +; CHECK-NEXT: v_and_b32_e32 v2, v2, v42 +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 +; CHECK-NEXT: v_readlane_b32 s45, v43, 13 +; CHECK-NEXT: v_readlane_b32 s44, v43, 12 +; CHECK-NEXT: v_readlane_b32 s43, v43, 11 +; CHECK-NEXT: v_readlane_b32 s42, v43, 10 +; CHECK-NEXT: 
v_readlane_b32 s41, v43, 9 +; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s39, v43, 7 +; CHECK-NEXT: v_readlane_b32 s38, v43, 6 +; CHECK-NEXT: v_readlane_b32 s37, v43, 5 +; CHECK-NEXT: v_readlane_b32 s36, v43, 4 +; CHECK-NEXT: v_readlane_b32 s35, v43, 3 +; CHECK-NEXT: v_readlane_b32 s34, v43, 2 +; CHECK-NEXT: v_readlane_b32 s31, v43, 1 +; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 @@ -543,34 +543,34 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v42, s16, 14 +; CHECK-NEXT: v_writelane_b32 v42, s30, 0 +; CHECK-NEXT: v_writelane_b32 v42, s31, 1 +; CHECK-NEXT: v_writelane_b32 v42, s34, 2 +; CHECK-NEXT: v_writelane_b32 v42, s35, 3 +; CHECK-NEXT: v_writelane_b32 v42, s36, 4 +; CHECK-NEXT: v_writelane_b32 v42, s37, 5 +; CHECK-NEXT: v_writelane_b32 v42, s38, 6 +; CHECK-NEXT: v_writelane_b32 v42, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 +; CHECK-NEXT: v_writelane_b32 v42, s40, 8 +; CHECK-NEXT: v_writelane_b32 v42, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 +; CHECK-NEXT: v_writelane_b32 v42, s42, 10 +; CHECK-NEXT: v_writelane_b32 v42, s43, 11 +; CHECK-NEXT: v_writelane_b32 v42, s44, 12 ; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v42, s45, 13 +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 @@ -578,10 +578,10 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 s45, s12 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: 
s_mov_b64 s[38:39], s[8:9] -; CHECK-NEXT: v_lshlrev_b32_e32 v42, 1, v2 +; CHECK-NEXT: v_lshlrev_b32_e32 v41, 1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 +; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v41 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 @@ -595,28 +595,28 @@ define double @test_pown_fast_f64_known_even(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s45, v42, 13 +; CHECK-NEXT: v_readlane_b32 s44, v42, 12 +; CHECK-NEXT: v_readlane_b32 s43, v42, 11 +; CHECK-NEXT: v_readlane_b32 s42, v42, 10 +; CHECK-NEXT: v_readlane_b32 s41, v42, 9 +; CHECK-NEXT: v_readlane_b32 s40, v42, 8 +; CHECK-NEXT: v_readlane_b32 s39, v42, 7 +; CHECK-NEXT: v_readlane_b32 s38, v42, 6 +; CHECK-NEXT: v_readlane_b32 s37, v42, 5 +; CHECK-NEXT: v_readlane_b32 s36, v42, 4 +; CHECK-NEXT: v_readlane_b32 s35, v42, 3 +; CHECK-NEXT: v_readlane_b32 s34, v42, 2 +; CHECK-NEXT: v_readlane_b32 s31, v42, 1 +; CHECK-NEXT: v_readlane_b32 s30, v42, 0 +; CHECK-NEXT: v_readlane_b32 s4, v42, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 @@ -685,36 +685,36 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 14 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 +; CHECK-NEXT: v_writelane_b32 v43, s16, 14 
+; CHECK-NEXT: v_writelane_b32 v43, s30, 0 +; CHECK-NEXT: v_writelane_b32 v43, s31, 1 +; CHECK-NEXT: v_writelane_b32 v43, s34, 2 +; CHECK-NEXT: v_writelane_b32 v43, s35, 3 +; CHECK-NEXT: v_writelane_b32 v43, s36, 4 +; CHECK-NEXT: v_writelane_b32 v43, s37, 5 +; CHECK-NEXT: v_writelane_b32 v43, s38, 6 +; CHECK-NEXT: v_writelane_b32 v43, s39, 7 ; CHECK-NEXT: s_addk_i32 s32, 0x800 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 +; CHECK-NEXT: v_writelane_b32 v43, s40, 8 +; CHECK-NEXT: v_writelane_b32 v43, s41, 9 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4log2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4log2d@gotpcrel32@hi+12 ; CHECK-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_mov_b32_e32 v42, v1 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 -; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v42 +; CHECK-NEXT: v_writelane_b32 v43, s42, 10 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_writelane_b32 v43, s43, 11 +; CHECK-NEXT: v_mov_b32_e32 v41, v1 +; CHECK-NEXT: v_writelane_b32 v43, s44, 12 +; CHECK-NEXT: v_and_b32_e32 v1, 0x7fffffff, v41 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: v_writelane_b32 v43, s45, 13 +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 @@ -722,10 +722,10 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 s45, s12 ; CHECK-NEXT: s_mov_b64 s[36:37], s[10:11] ; CHECK-NEXT: s_mov_b64 s[38:39], s[8:9] -; CHECK-NEXT: v_or_b32_e32 v43, 1, v2 +; CHECK-NEXT: v_or_b32_e32 v42, 1, v2 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v43 +; CHECK-NEXT: v_cvt_f64_i32_e32 v[2:3], v42 ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, _Z4exp2d@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, _Z4exp2d@gotpcrel32@hi+12 @@ -739,31 +739,31 @@ define double @test_pown_fast_f64_known_odd(double %x, i32 %y.arg) { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: v_and_b32_e32 v2, 0x80000000, v42 -; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: v_and_b32_e32 v2, 0x80000000, v41 +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, 
off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: v_or_b32_e32 v1, v2, v1 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 14 +; CHECK-NEXT: v_readlane_b32 s45, v43, 13 +; CHECK-NEXT: v_readlane_b32 s44, v43, 12 +; CHECK-NEXT: v_readlane_b32 s43, v43, 11 +; CHECK-NEXT: v_readlane_b32 s42, v43, 10 +; CHECK-NEXT: v_readlane_b32 s41, v43, 9 +; CHECK-NEXT: v_readlane_b32 s40, v43, 8 +; CHECK-NEXT: v_readlane_b32 s39, v43, 7 +; CHECK-NEXT: v_readlane_b32 s38, v43, 6 +; CHECK-NEXT: v_readlane_b32 s37, v43, 5 +; CHECK-NEXT: v_readlane_b32 s36, v43, 4 +; CHECK-NEXT: v_readlane_b32 s35, v43, 3 +; CHECK-NEXT: v_readlane_b32 s34, v43, 2 +; CHECK-NEXT: v_readlane_b32 s31, v43, 1 +; CHECK-NEXT: v_readlane_b32 s30, v43, 0 +; CHECK-NEXT: v_readlane_b32 s4, v43, 14 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xf800 ; CHECK-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index 4a696879ad7b2..2f7190e761102 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -3544,12 +3544,12 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v3, s30, 0 -; GCN-NEXT: v_writelane_b32 v3, s31, 1 +; GCN-NEXT: v_writelane_b32 v4, s30, 0 +; GCN-NEXT: v_writelane_b32 v4, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -3558,15 +3558,15 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_lshrrev_b32_e32 v0, 16, v0 ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GCN-NEXT: v_add_i32_e32 v4, vcc, 2, v2 -; GCN-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v3, vcc, 2, v2 +; GCN-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v3, 1 -; GCN-NEXT: v_readlane_b32 s30, v3, 0 +; GCN-NEXT: v_readlane_b32 s31, v4, 1 +; GCN-NEXT: v_readlane_b32 s30, v4, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: 
s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s8 @@ -3579,28 +3579,28 @@ define void @test_call_v2bf16(<2 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v3, s30, 0 -; GFX7-NEXT: v_writelane_b32 v3, s31, 1 +; GFX7-NEXT: v_writelane_b32 v4, s30, 0 +; GFX7-NEXT: v_writelane_b32 v4, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 -; GFX7-NEXT: v_add_i32_e32 v4, vcc, 2, v2 +; GFX7-NEXT: v_add_i32_e32 v3, vcc, 2, v2 ; GFX7-NEXT: v_lshrrev_b32_e32 v0, 16, v0 -; GFX7-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v2, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v3, 1 -; GFX7-NEXT: v_readlane_b32 s30, v3, 0 +; GFX7-NEXT: v_readlane_b32 s31, v4, 1 +; GFX7-NEXT: v_readlane_b32 s30, v4, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s8 @@ -3737,12 +3737,12 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v4, s30, 0 -; GCN-NEXT: v_writelane_b32 v4, s31, 1 +; GCN-NEXT: v_writelane_b32 v5, s30, 0 +; GCN-NEXT: v_writelane_b32 v5, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -3751,16 +3751,16 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GCN-NEXT: v_add_i32_e32 v5, vcc, 4, v3 +; GCN-NEXT: v_add_i32_e32 v4, vcc, 4, v3 ; GCN-NEXT: v_alignbit_b32 v0, v1, v0, 16 -; GCN-NEXT: buffer_store_short v2, v5, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v2, v4, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v4, 1 -; GCN-NEXT: v_readlane_b32 s30, v4, 0 +; GCN-NEXT: v_readlane_b32 s31, v5, 1 +; GCN-NEXT: v_readlane_b32 s30, v5, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: 
s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s8 @@ -3808,26 +3808,26 @@ define void @test_call_v3bf16(<3 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v3, s30, 0 -; GFX8-NEXT: v_writelane_b32 v3, s31, 1 +; GFX8-NEXT: v_writelane_b32 v4, s30, 0 +; GFX8-NEXT: v_writelane_b32 v4, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v2 -; GFX8-NEXT: buffer_store_short v1, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_store_short v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v3, 1 -; GFX8-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-NEXT: v_readlane_b32 s31, v4, 1 +; GFX8-NEXT: v_readlane_b32 s30, v4, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s6 @@ -3941,12 +3941,12 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v5, s30, 0 -; GCN-NEXT: v_writelane_b32 v5, s31, 1 +; GCN-NEXT: v_writelane_b32 v8, s30, 0 +; GCN-NEXT: v_writelane_b32 v8, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -3957,21 +3957,21 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GCN-NEXT: v_lshrrev_b32_e32 v2, 16, v2 ; GCN-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GCN-NEXT: v_add_i32_e32 v6, vcc, 6, v4 -; GCN-NEXT: v_add_i32_e32 v7, vcc, 4, v4 -; GCN-NEXT: v_add_i32_e32 v8, vcc, 2, v4 -; GCN-NEXT: buffer_store_short v3, v6, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v5, vcc, 6, v4 +; GCN-NEXT: v_add_i32_e32 v6, vcc, 4, v4 +; GCN-NEXT: v_add_i32_e32 v7, vcc, 2, v4 +; GCN-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v7, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v2, v6, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v8, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v7, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, 
v5, 1 -; GCN-NEXT: v_readlane_b32 s30, v5, 0 +; GCN-NEXT: v_readlane_b32 s31, v8, 1 +; GCN-NEXT: v_readlane_b32 s30, v8, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s8 @@ -3984,21 +3984,21 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v5, s30, 0 -; GFX7-NEXT: v_writelane_b32 v5, s31, 1 +; GFX7-NEXT: v_writelane_b32 v6, s30, 0 +; GFX7-NEXT: v_writelane_b32 v6, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v3, 16, v3 -; GFX7-NEXT: v_add_i32_e32 v6, vcc, 6, v4 +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 6, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v2, 16, v2 -; GFX7-NEXT: buffer_store_short v3, v6, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v3, v5, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v3, vcc, 4, v4 ; GFX7-NEXT: v_lshrrev_b32_e32 v1, 16, v1 @@ -4010,10 +4010,10 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v4, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v5, 1 -; GFX7-NEXT: v_readlane_b32 s30, v5, 0 +; GFX7-NEXT: v_readlane_b32 s31, v6, 1 +; GFX7-NEXT: v_readlane_b32 s30, v6, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s8 @@ -4026,26 +4026,26 @@ define void @test_call_v4bf16(<4 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v3, s30, 0 -; GFX8-NEXT: v_writelane_b32 v3, s31, 1 +; GFX8-NEXT: v_writelane_b32 v4, s30, 0 +; GFX8-NEXT: v_writelane_b32 v4, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v2 -; GFX8-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v3, vcc, 4, v2 +; GFX8-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; 
GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v3, 1 -; GFX8-NEXT: v_readlane_b32 s30, v3, 0 +; GFX8-NEXT: v_readlane_b32 s31, v4, 1 +; GFX8-NEXT: v_readlane_b32 s30, v4, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s6 @@ -4157,12 +4157,12 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v16, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v9, s30, 0 -; GCN-NEXT: v_writelane_b32 v9, s31, 1 +; GCN-NEXT: v_writelane_b32 v16, s30, 0 +; GCN-NEXT: v_writelane_b32 v16, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -4177,33 +4177,33 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_lshrrev_b32_e32 v5, 16, v5 ; GCN-NEXT: v_lshrrev_b32_e32 v6, 16, v6 ; GCN-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GCN-NEXT: v_add_i32_e32 v10, vcc, 14, v8 -; GCN-NEXT: v_add_i32_e32 v11, vcc, 12, v8 -; GCN-NEXT: v_add_i32_e32 v12, vcc, 10, v8 -; GCN-NEXT: v_add_i32_e32 v13, vcc, 8, v8 -; GCN-NEXT: v_add_i32_e32 v14, vcc, 6, v8 -; GCN-NEXT: v_add_i32_e32 v15, vcc, 4, v8 -; GCN-NEXT: v_add_i32_e32 v16, vcc, 2, v8 -; GCN-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v9, vcc, 14, v8 +; GCN-NEXT: v_add_i32_e32 v10, vcc, 12, v8 +; GCN-NEXT: v_add_i32_e32 v11, vcc, 10, v8 +; GCN-NEXT: v_add_i32_e32 v12, vcc, 8, v8 +; GCN-NEXT: v_add_i32_e32 v13, vcc, 6, v8 +; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v8 +; GCN-NEXT: v_add_i32_e32 v15, vcc, 2, v8 +; GCN-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v6, v11, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v6, v10, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v12, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v5, v11, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v4, v13, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v4, v12, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v14, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v3, v13, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v2, v15, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v16, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v15, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v9, 1 -; GCN-NEXT: v_readlane_b32 s30, v9, 0 +; GCN-NEXT: v_readlane_b32 s31, v16, 1 +; GCN-NEXT: v_readlane_b32 s30, v16, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v16, off, s[0:3], s33 ; 4-byte Folded Reload ; 
GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s8 @@ -4216,21 +4216,21 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v9, s30, 0 -; GFX7-NEXT: v_writelane_b32 v9, s31, 1 +; GFX7-NEXT: v_writelane_b32 v10, s30, 0 +; GFX7-NEXT: v_writelane_b32 v10, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v7, 16, v7 -; GFX7-NEXT: v_add_i32_e32 v10, vcc, 14, v8 +; GFX7-NEXT: v_add_i32_e32 v9, vcc, 14, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v6, 16, v6 -; GFX7-NEXT: buffer_store_short v7, v10, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v7, v9, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v7, vcc, 12, v8 ; GFX7-NEXT: v_lshrrev_b32_e32 v5, 16, v5 @@ -4258,10 +4258,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v8, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v9, 1 -; GFX7-NEXT: v_readlane_b32 s30, v9, 0 +; GFX7-NEXT: v_readlane_b32 s31, v10, 1 +; GFX7-NEXT: v_readlane_b32 s30, v10, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s8 @@ -4274,19 +4274,19 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v5, s30, 0 -; GFX8-NEXT: v_writelane_b32 v5, s31, 1 +; GFX8-NEXT: v_writelane_b32 v6, s30, 0 +; GFX8-NEXT: v_writelane_b32 v6, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v6, vcc, 12, v4 -; GFX8-NEXT: buffer_store_dword v3, v6, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 12, v4 +; GFX8-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v3, vcc, 8, v4 ; GFX8-NEXT: buffer_store_dword v2, v3, s[0:3], 0 offen @@ -4296,10 +4296,10 @@ define void @test_call_v8bf16(<8 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v5, 1 -; 
GFX8-NEXT: v_readlane_b32 s30, v5, 0 +; GFX8-NEXT: v_readlane_b32 s31, v6, 1 +; GFX8-NEXT: v_readlane_b32 s30, v6, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s6 @@ -4419,12 +4419,12 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: s_mov_b32 s8, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v28, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: s_waitcnt expcnt(0) -; GCN-NEXT: v_writelane_b32 v17, s30, 0 -; GCN-NEXT: v_writelane_b32 v17, s31, 1 +; GCN-NEXT: v_writelane_b32 v28, s30, 0 +; GCN-NEXT: v_writelane_b32 v28, s31, 1 ; GCN-NEXT: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 @@ -4447,57 +4447,57 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GCN-NEXT: v_lshrrev_b32_e32 v13, 16, v13 ; GCN-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GCN-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 30, v16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 28, v16 -; GCN-NEXT: v_add_i32_e32 v20, vcc, 26, v16 -; GCN-NEXT: v_add_i32_e32 v21, vcc, 24, v16 -; GCN-NEXT: v_add_i32_e32 v22, vcc, 22, v16 -; GCN-NEXT: v_add_i32_e32 v23, vcc, 20, v16 -; GCN-NEXT: v_add_i32_e32 v24, vcc, 18, v16 -; GCN-NEXT: v_add_i32_e32 v25, vcc, 16, v16 -; GCN-NEXT: v_add_i32_e32 v26, vcc, 14, v16 -; GCN-NEXT: v_add_i32_e32 v27, vcc, 12, v16 -; GCN-NEXT: v_add_i32_e32 v28, vcc, 10, v16 -; GCN-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v17, vcc, 30, v16 +; GCN-NEXT: v_add_i32_e32 v18, vcc, 28, v16 +; GCN-NEXT: v_add_i32_e32 v19, vcc, 26, v16 +; GCN-NEXT: v_add_i32_e32 v20, vcc, 24, v16 +; GCN-NEXT: v_add_i32_e32 v21, vcc, 22, v16 +; GCN-NEXT: v_add_i32_e32 v22, vcc, 20, v16 +; GCN-NEXT: v_add_i32_e32 v23, vcc, 18, v16 +; GCN-NEXT: v_add_i32_e32 v24, vcc, 16, v16 +; GCN-NEXT: v_add_i32_e32 v25, vcc, 14, v16 +; GCN-NEXT: v_add_i32_e32 v26, vcc, 12, v16 +; GCN-NEXT: v_add_i32_e32 v27, vcc, 10, v16 +; GCN-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: v_add_i32_e32 v15, vcc, 8, v16 -; GCN-NEXT: v_add_i32_e32 v18, vcc, 6, v16 -; GCN-NEXT: buffer_store_short v14, v19, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v17, vcc, 6, v16 +; GCN-NEXT: buffer_store_short v14, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: v_add_i32_e32 v14, vcc, 4, v16 -; GCN-NEXT: v_add_i32_e32 v19, vcc, 2, v16 -; GCN-NEXT: buffer_store_short v13, v20, s[0:3], 0 offen +; GCN-NEXT: v_add_i32_e32 v18, vcc, 2, v16 +; GCN-NEXT: buffer_store_short v13, v19, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v12, v21, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v12, v20, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v11, v22, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v11, v21, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v10, v23, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v10, v22, s[0:3], 0 offen 
; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v9, v24, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v9, v23, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v8, v25, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v8, v24, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v7, v26, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v7, v25, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v6, v27, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v6, v26, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v5, v28, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v5, v27, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v4, v15, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v3, v18, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v3, v17, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v2, v14, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_short v1, v19, s[0:3], 0 offen +; GCN-NEXT: buffer_store_short v1, v18, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v17, 1 -; GCN-NEXT: v_readlane_b32 s30, v17, 0 +; GCN-NEXT: v_readlane_b32 s31, v28, 1 +; GCN-NEXT: v_readlane_b32 s30, v28, 0 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v28, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s8 @@ -4510,21 +4510,21 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_mov_b32 s8, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v18, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0x400 ; GFX7-NEXT: s_getpc_b64 s[4:5] ; GFX7-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX7-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX7-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX7-NEXT: v_writelane_b32 v17, s30, 0 -; GFX7-NEXT: v_writelane_b32 v17, s31, 1 +; GFX7-NEXT: v_writelane_b32 v18, s30, 0 +; GFX7-NEXT: v_writelane_b32 v18, s31, 1 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX7-NEXT: v_lshrrev_b32_e32 v15, 16, v15 -; GFX7-NEXT: v_add_i32_e32 v18, vcc, 30, v16 +; GFX7-NEXT: v_add_i32_e32 v17, vcc, 30, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v14, 16, v14 -; GFX7-NEXT: buffer_store_short v15, v18, s[0:3], 0 offen +; GFX7-NEXT: buffer_store_short v15, v17, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_add_i32_e32 v15, vcc, 28, v16 ; GFX7-NEXT: v_lshrrev_b32_e32 v13, 16, v13 @@ -4584,10 +4584,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: buffer_store_short v0, v16, s[0:3], 0 offen ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_readlane_b32 s31, v17, 1 -; GFX7-NEXT: v_readlane_b32 s30, v17, 0 +; GFX7-NEXT: v_readlane_b32 s31, v18, 1 +; GFX7-NEXT: v_readlane_b32 s30, v18, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], 
s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v18, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s8 @@ -4600,19 +4600,19 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_mov_b32 s6, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0x400 ; GFX8-NEXT: s_getpc_b64 s[4:5] ; GFX8-NEXT: s_add_u32 s4, s4, test_arg_store_v2bf16@gotpcrel32@lo+4 ; GFX8-NEXT: s_addc_u32 s5, s5, test_arg_store_v2bf16@gotpcrel32@hi+12 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX8-NEXT: v_writelane_b32 v9, s30, 0 -; GFX8-NEXT: v_writelane_b32 v9, s31, 1 +; GFX8-NEXT: v_writelane_b32 v10, s30, 0 +; GFX8-NEXT: v_writelane_b32 v10, s31, 1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX8-NEXT: v_add_u32_e32 v10, vcc, 28, v8 -; GFX8-NEXT: buffer_store_dword v7, v10, s[0:3], 0 offen +; GFX8-NEXT: v_add_u32_e32 v9, vcc, 28, v8 +; GFX8-NEXT: buffer_store_dword v7, v9, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_add_u32_e32 v7, vcc, 24, v8 ; GFX8-NEXT: buffer_store_dword v6, v7, s[0:3], 0 offen @@ -4634,10 +4634,10 @@ define void @test_call_v16bf16(<16 x bfloat> %in, ptr addrspace(5) %out) { ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: buffer_store_dword v0, v8, s[0:3], 0 offen ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_readlane_b32 s31, v9, 1 -; GFX8-NEXT: v_readlane_b32 s30, v9, 0 +; GFX8-NEXT: v_readlane_b32 s31, v10, 1 +; GFX8-NEXT: v_readlane_b32 s30, v10, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 ; GFX8-NEXT: s_mov_b32 s33, s6 @@ -27303,7 +27303,7 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 @@ -27330,16 +27330,16 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-NEXT: v_and_b32_e32 v0, 1, v11 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX7-NEXT: v_writelane_b32 v31, s30, 0 +; GFX7-NEXT: v_writelane_b32 v32, s30, 0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX7-NEXT: v_writelane_b32 v31, s31, 1 +; GFX7-NEXT: v_writelane_b32 v32, s31, 1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX7-NEXT: v_writelane_b32 v31, s34, 2 +; GFX7-NEXT: v_writelane_b32 v32, s34, 2 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX7-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX7-NEXT: v_writelane_b32 v31, s35, 3 +; GFX7-NEXT: v_writelane_b32 v32, s35, 3 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX7-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 
GFX7-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 @@ -27357,7 +27357,7 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:56 ; GFX7-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:60 ; GFX7-NEXT: buffer_load_dword v15, off, s[0:3], s32 -; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 +; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:64 ; GFX7-NEXT: s_waitcnt vmcnt(14) ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v16, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[4:5] @@ -27388,7 +27388,7 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-NEXT: v_cndmask_b32_e64 v14, v14, v30, s[30:31] ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_cndmask_b32_e64 v15, v32, v15, s[34:35] +; GFX7-NEXT: v_cndmask_b32_e64 v15, v31, v15, s[34:35] ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1 ; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2 ; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3 @@ -27404,12 +27404,12 @@ define <16 x bfloat> @v_vselect_v16bf16(<16 x i1> %cond, <16 x bfloat> %a, <16 x ; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13 ; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14 ; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15 -; GFX7-NEXT: v_readlane_b32 s35, v31, 3 -; GFX7-NEXT: v_readlane_b32 s34, v31, 2 -; GFX7-NEXT: v_readlane_b32 s31, v31, 1 -; GFX7-NEXT: v_readlane_b32 s30, v31, 0 +; GFX7-NEXT: v_readlane_b32 s35, v32, 3 +; GFX7-NEXT: v_readlane_b32 s34, v32, 2 +; GFX7-NEXT: v_readlane_b32 s31, v32, 1 +; GFX7-NEXT: v_readlane_b32 s30, v32, 0 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX7-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] @@ -28211,107 +28211,107 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX8-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX8-NEXT: s_mov_b64 exec, s[4:5] -; GFX8-NEXT: v_writelane_b32 v31, s30, 0 -; GFX8-NEXT: v_writelane_b32 v31, s31, 1 -; GFX8-NEXT: v_writelane_b32 v31, s34, 2 +; GFX8-NEXT: v_writelane_b32 v34, s30, 0 +; GFX8-NEXT: v_writelane_b32 v34, s31, 1 +; GFX8-NEXT: v_writelane_b32 v34, s34, 2 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-NEXT: v_writelane_b32 v31, s35, 3 +; GFX8-NEXT: v_writelane_b32 v34, s35, 3 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v1 -; GFX8-NEXT: v_writelane_b32 v31, s36, 4 +; GFX8-NEXT: v_writelane_b32 v34, s36, 4 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX8-NEXT: v_writelane_b32 v31, s37, 5 +; GFX8-NEXT: v_writelane_b32 v34, s37, 5 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX8-NEXT: v_writelane_b32 v31, s38, 6 +; GFX8-NEXT: v_writelane_b32 v34, s38, 6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX8-NEXT: v_writelane_b32 v31, s39, 7 +; GFX8-NEXT: v_writelane_b32 v34, s39, 7 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX8-NEXT: 
v_writelane_b32 v31, s40, 8 +; GFX8-NEXT: v_writelane_b32 v34, s40, 8 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX8-NEXT: v_writelane_b32 v31, s41, 9 +; GFX8-NEXT: v_writelane_b32 v34, s41, 9 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX8-NEXT: v_writelane_b32 v31, s42, 10 +; GFX8-NEXT: v_writelane_b32 v34, s42, 10 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX8-NEXT: v_writelane_b32 v31, s43, 11 +; GFX8-NEXT: v_writelane_b32 v34, s43, 11 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX8-NEXT: v_writelane_b32 v31, s44, 12 +; GFX8-NEXT: v_writelane_b32 v34, s44, 12 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX8-NEXT: v_writelane_b32 v31, s45, 13 +; GFX8-NEXT: v_writelane_b32 v34, s45, 13 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX8-NEXT: v_writelane_b32 v31, s46, 14 +; GFX8-NEXT: v_writelane_b32 v34, s46, 14 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX8-NEXT: v_writelane_b32 v31, s47, 15 +; GFX8-NEXT: v_writelane_b32 v34, s47, 15 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX8-NEXT: v_writelane_b32 v31, s48, 16 +; GFX8-NEXT: v_writelane_b32 v34, s48, 16 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX8-NEXT: v_writelane_b32 v31, s49, 17 +; GFX8-NEXT: v_writelane_b32 v34, s49, 17 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX8-NEXT: v_writelane_b32 v31, s50, 18 +; GFX8-NEXT: v_writelane_b32 v34, s50, 18 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX8-NEXT: v_writelane_b32 v31, s51, 19 +; GFX8-NEXT: v_writelane_b32 v34, s51, 19 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX8-NEXT: v_writelane_b32 v31, s52, 20 +; GFX8-NEXT: v_writelane_b32 v34, s52, 20 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX8-NEXT: v_writelane_b32 v31, s53, 21 +; GFX8-NEXT: v_writelane_b32 v34, s53, 21 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX8-NEXT: v_writelane_b32 v31, s54, 22 +; GFX8-NEXT: v_writelane_b32 v34, s54, 22 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX8-NEXT: v_writelane_b32 v31, s55, 23 +; GFX8-NEXT: v_writelane_b32 v34, s55, 23 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX8-NEXT: v_writelane_b32 v31, s56, 24 +; GFX8-NEXT: v_writelane_b32 v34, s56, 24 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX8-NEXT: v_writelane_b32 v31, s57, 25 +; GFX8-NEXT: v_writelane_b32 v34, s57, 25 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX8-NEXT: v_writelane_b32 v31, s58, 26 +; GFX8-NEXT: v_writelane_b32 v34, s58, 26 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX8-NEXT: v_writelane_b32 v31, s59, 27 +; GFX8-NEXT: v_writelane_b32 v34, s59, 27 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX8-NEXT: v_writelane_b32 v31, s60, 28 +; GFX8-NEXT: v_writelane_b32 v34, s60, 28 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v26 -; 
GFX8-NEXT: v_writelane_b32 v31, s61, 29 +; GFX8-NEXT: v_writelane_b32 v34, s61, 29 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX8-NEXT: v_writelane_b32 v31, s62, 30 +; GFX8-NEXT: v_writelane_b32 v34, s62, 30 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX8-NEXT: v_writelane_b32 v31, s63, 31 +; GFX8-NEXT: v_writelane_b32 v34, s63, 31 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX8-NEXT: v_writelane_b32 v31, s64, 32 +; GFX8-NEXT: v_writelane_b32 v34, s64, 32 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 ; GFX8-NEXT: v_and_b32_e32 v0, 1, v30 -; GFX8-NEXT: v_writelane_b32 v31, s65, 33 +; GFX8-NEXT: v_writelane_b32 v34, s65, 33 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[64:65], 1, v0 ; GFX8-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX8-NEXT: v_writelane_b32 v31, s66, 34 -; GFX8-NEXT: v_writelane_b32 v31, s67, 35 +; GFX8-NEXT: v_writelane_b32 v34, s66, 34 +; GFX8-NEXT: v_writelane_b32 v34, s67, 35 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[66:67], 1, v0 @@ -28344,74 +28344,74 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:120 ; GFX8-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 ; GFX8-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:124 -; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 +; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 ; GFX8-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:128 -; GFX8-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GFX8-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v29 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v29 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v33 -; GFX8-NEXT: v_cndmask_b32_e64 v28, v34, v28, s[66:67] -; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v33, s[64:65] -; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v32 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v30 -; GFX8-NEXT: v_cndmask_b32_e64 v33, v34, v33, s[62:63] -; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v32, s[60:61] -; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v27 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v26 -; GFX8-NEXT: v_cndmask_b32_e64 v32, v34, v32, s[58:59] +; GFX8-NEXT: v_lshrrev_b32_e32 v28, 16, v32 +; GFX8-NEXT: v_cndmask_b32_e64 v28, v33, v28, s[66:67] +; GFX8-NEXT: v_cndmask_b32_e64 v29, v29, v32, s[64:65] +; GFX8-NEXT: v_lshrrev_b32_e32 v32, 16, v31 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v30 +; GFX8-NEXT: v_cndmask_b32_e64 v32, v33, v32, s[62:63] +; GFX8-NEXT: v_cndmask_b32_e64 v30, v30, v31, s[60:61] +; GFX8-NEXT: v_lshrrev_b32_e32 v31, 16, v27 +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v26 +; GFX8-NEXT: v_cndmask_b32_e64 v31, v33, v31, s[58:59] ; GFX8-NEXT: v_cndmask_b32_e64 v26, v26, v27, s[56:57] ; GFX8-NEXT: v_lshrrev_b32_e32 v27, 16, v25 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v24 -; GFX8-NEXT: v_cndmask_b32_e64 v27, v34, v27, s[54:55] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v27, v33, v27, s[54:55] ; GFX8-NEXT: v_cndmask_b32_e64 v24, v24, v25, s[52:53] ; GFX8-NEXT: v_lshrrev_b32_e32 v25, 16, v23 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v22 -; GFX8-NEXT: v_cndmask_b32_e64 v25, v34, v25, s[50:51] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v25, v33, v25, s[50:51] ; GFX8-NEXT: 
v_cndmask_b32_e64 v22, v22, v23, s[48:49] ; GFX8-NEXT: v_lshrrev_b32_e32 v23, 16, v21 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v20 -; GFX8-NEXT: v_cndmask_b32_e64 v23, v34, v23, s[46:47] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v23, v33, v23, s[46:47] ; GFX8-NEXT: v_cndmask_b32_e64 v20, v20, v21, s[44:45] ; GFX8-NEXT: v_lshrrev_b32_e32 v21, 16, v19 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v18 -; GFX8-NEXT: v_cndmask_b32_e64 v21, v34, v21, s[42:43] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v21, v33, v21, s[42:43] ; GFX8-NEXT: v_cndmask_b32_e64 v18, v18, v19, s[40:41] ; GFX8-NEXT: v_lshrrev_b32_e32 v19, 16, v17 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v16 -; GFX8-NEXT: v_cndmask_b32_e64 v19, v34, v19, s[38:39] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v16 +; GFX8-NEXT: v_cndmask_b32_e64 v19, v33, v19, s[38:39] ; GFX8-NEXT: v_cndmask_b32_e64 v16, v16, v17, s[36:37] ; GFX8-NEXT: v_lshrrev_b32_e32 v17, 16, v15 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v14 -; GFX8-NEXT: v_cndmask_b32_e64 v17, v34, v17, s[34:35] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v17, v33, v17, s[34:35] ; GFX8-NEXT: v_cndmask_b32_e64 v14, v14, v15, s[30:31] ; GFX8-NEXT: v_lshrrev_b32_e32 v15, 16, v13 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v12 -; GFX8-NEXT: v_cndmask_b32_e64 v15, v34, v15, s[28:29] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v15, v33, v15, s[28:29] ; GFX8-NEXT: v_cndmask_b32_e64 v12, v12, v13, s[26:27] ; GFX8-NEXT: v_lshrrev_b32_e32 v13, 16, v11 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v10 -; GFX8-NEXT: v_cndmask_b32_e64 v13, v34, v13, s[24:25] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v13, v33, v13, s[24:25] ; GFX8-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[22:23] ; GFX8-NEXT: v_lshrrev_b32_e32 v11, 16, v9 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v8 -; GFX8-NEXT: v_cndmask_b32_e64 v11, v34, v11, s[20:21] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v11, v33, v11, s[20:21] ; GFX8-NEXT: v_cndmask_b32_e64 v8, v8, v9, s[18:19] ; GFX8-NEXT: v_lshrrev_b32_e32 v9, 16, v7 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v6 -; GFX8-NEXT: v_cndmask_b32_e64 v9, v34, v9, s[16:17] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v9, v33, v9, s[16:17] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[14:15] ; GFX8-NEXT: v_lshrrev_b32_e32 v7, 16, v5 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v4 -; GFX8-NEXT: v_cndmask_b32_e64 v7, v34, v7, s[12:13] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v7, v33, v7, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e64 v4, v4, v5, s[10:11] ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v34, v5, s[8:9] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v33, v5, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v1 -; GFX8-NEXT: v_lshrrev_b32_e32 v34, 16, v0 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v34, v3, s[4:5] +; GFX8-NEXT: v_lshrrev_b32_e32 v33, 16, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v3, v33, v3, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v1, 16, v3 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28434,8 +28434,8 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: 
v_lshlrev_b32_e32 v10, 16, v23 ; GFX8-NEXT: v_lshlrev_b32_e32 v11, 16, v25 ; GFX8-NEXT: v_lshlrev_b32_e32 v12, 16, v27 -; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v32 -; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v33 +; GFX8-NEXT: v_lshlrev_b32_e32 v13, 16, v31 +; GFX8-NEXT: v_lshlrev_b32_e32 v14, 16, v32 ; GFX8-NEXT: v_lshlrev_b32_e32 v15, 16, v28 ; GFX8-NEXT: v_or_b32_sdwa v8, v16, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v9, v18, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -28445,44 +28445,44 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX8-NEXT: v_or_b32_sdwa v13, v26, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v14, v30, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v15, v29, v15 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_readlane_b32 s67, v31, 35 -; GFX8-NEXT: v_readlane_b32 s66, v31, 34 -; GFX8-NEXT: v_readlane_b32 s65, v31, 33 -; GFX8-NEXT: v_readlane_b32 s64, v31, 32 -; GFX8-NEXT: v_readlane_b32 s63, v31, 31 -; GFX8-NEXT: v_readlane_b32 s62, v31, 30 -; GFX8-NEXT: v_readlane_b32 s61, v31, 29 -; GFX8-NEXT: v_readlane_b32 s60, v31, 28 -; GFX8-NEXT: v_readlane_b32 s59, v31, 27 -; GFX8-NEXT: v_readlane_b32 s58, v31, 26 -; GFX8-NEXT: v_readlane_b32 s57, v31, 25 -; GFX8-NEXT: v_readlane_b32 s56, v31, 24 -; GFX8-NEXT: v_readlane_b32 s55, v31, 23 -; GFX8-NEXT: v_readlane_b32 s54, v31, 22 -; GFX8-NEXT: v_readlane_b32 s53, v31, 21 -; GFX8-NEXT: v_readlane_b32 s52, v31, 20 -; GFX8-NEXT: v_readlane_b32 s51, v31, 19 -; GFX8-NEXT: v_readlane_b32 s50, v31, 18 -; GFX8-NEXT: v_readlane_b32 s49, v31, 17 -; GFX8-NEXT: v_readlane_b32 s48, v31, 16 -; GFX8-NEXT: v_readlane_b32 s47, v31, 15 -; GFX8-NEXT: v_readlane_b32 s46, v31, 14 -; GFX8-NEXT: v_readlane_b32 s45, v31, 13 -; GFX8-NEXT: v_readlane_b32 s44, v31, 12 -; GFX8-NEXT: v_readlane_b32 s43, v31, 11 -; GFX8-NEXT: v_readlane_b32 s42, v31, 10 -; GFX8-NEXT: v_readlane_b32 s41, v31, 9 -; GFX8-NEXT: v_readlane_b32 s40, v31, 8 -; GFX8-NEXT: v_readlane_b32 s39, v31, 7 -; GFX8-NEXT: v_readlane_b32 s38, v31, 6 -; GFX8-NEXT: v_readlane_b32 s37, v31, 5 -; GFX8-NEXT: v_readlane_b32 s36, v31, 4 -; GFX8-NEXT: v_readlane_b32 s35, v31, 3 -; GFX8-NEXT: v_readlane_b32 s34, v31, 2 -; GFX8-NEXT: v_readlane_b32 s31, v31, 1 -; GFX8-NEXT: v_readlane_b32 s30, v31, 0 +; GFX8-NEXT: v_readlane_b32 s67, v34, 35 +; GFX8-NEXT: v_readlane_b32 s66, v34, 34 +; GFX8-NEXT: v_readlane_b32 s65, v34, 33 +; GFX8-NEXT: v_readlane_b32 s64, v34, 32 +; GFX8-NEXT: v_readlane_b32 s63, v34, 31 +; GFX8-NEXT: v_readlane_b32 s62, v34, 30 +; GFX8-NEXT: v_readlane_b32 s61, v34, 29 +; GFX8-NEXT: v_readlane_b32 s60, v34, 28 +; GFX8-NEXT: v_readlane_b32 s59, v34, 27 +; GFX8-NEXT: v_readlane_b32 s58, v34, 26 +; GFX8-NEXT: v_readlane_b32 s57, v34, 25 +; GFX8-NEXT: v_readlane_b32 s56, v34, 24 +; GFX8-NEXT: v_readlane_b32 s55, v34, 23 +; GFX8-NEXT: v_readlane_b32 s54, v34, 22 +; GFX8-NEXT: v_readlane_b32 s53, v34, 21 +; GFX8-NEXT: v_readlane_b32 s52, v34, 20 +; GFX8-NEXT: v_readlane_b32 s51, v34, 19 +; GFX8-NEXT: v_readlane_b32 s50, v34, 18 +; GFX8-NEXT: v_readlane_b32 s49, v34, 17 +; GFX8-NEXT: v_readlane_b32 s48, v34, 16 +; GFX8-NEXT: v_readlane_b32 s47, v34, 15 +; GFX8-NEXT: v_readlane_b32 s46, v34, 14 +; GFX8-NEXT: v_readlane_b32 s45, v34, 13 +; GFX8-NEXT: v_readlane_b32 s44, v34, 12 +; GFX8-NEXT: v_readlane_b32 s43, v34, 11 +; GFX8-NEXT: 
v_readlane_b32 s42, v34, 10 +; GFX8-NEXT: v_readlane_b32 s41, v34, 9 +; GFX8-NEXT: v_readlane_b32 s40, v34, 8 +; GFX8-NEXT: v_readlane_b32 s39, v34, 7 +; GFX8-NEXT: v_readlane_b32 s38, v34, 6 +; GFX8-NEXT: v_readlane_b32 s37, v34, 5 +; GFX8-NEXT: v_readlane_b32 s36, v34, 4 +; GFX8-NEXT: v_readlane_b32 s35, v34, 3 +; GFX8-NEXT: v_readlane_b32 s34, v34, 2 +; GFX8-NEXT: v_readlane_b32 s31, v34, 1 +; GFX8-NEXT: v_readlane_b32 s30, v34, 0 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX8-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX8-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -28491,104 +28491,104 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v31, s30, 0 -; GFX9-NEXT: v_writelane_b32 v31, s31, 1 -; GFX9-NEXT: v_writelane_b32 v31, s34, 2 +; GFX9-NEXT: v_writelane_b32 v33, s30, 0 +; GFX9-NEXT: v_writelane_b32 v33, s31, 1 +; GFX9-NEXT: v_writelane_b32 v33, s34, 2 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-NEXT: v_writelane_b32 v31, s35, 3 +; GFX9-NEXT: v_writelane_b32 v33, s35, 3 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 -; GFX9-NEXT: v_writelane_b32 v31, s36, 4 +; GFX9-NEXT: v_writelane_b32 v33, s36, 4 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v2 -; GFX9-NEXT: v_writelane_b32 v31, s37, 5 +; GFX9-NEXT: v_writelane_b32 v33, s37, 5 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v5 -; GFX9-NEXT: v_writelane_b32 v31, s38, 6 +; GFX9-NEXT: v_writelane_b32 v33, s38, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v4 -; GFX9-NEXT: v_writelane_b32 v31, s39, 7 +; GFX9-NEXT: v_writelane_b32 v33, s39, 7 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v7 -; GFX9-NEXT: v_writelane_b32 v31, s40, 8 +; GFX9-NEXT: v_writelane_b32 v33, s40, 8 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v6 -; GFX9-NEXT: v_writelane_b32 v31, s41, 9 +; GFX9-NEXT: v_writelane_b32 v33, s41, 9 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[16:17], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v9 -; GFX9-NEXT: v_writelane_b32 v31, s42, 10 +; GFX9-NEXT: v_writelane_b32 v33, s42, 10 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[18:19], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v8 -; GFX9-NEXT: v_writelane_b32 v31, s43, 11 +; GFX9-NEXT: v_writelane_b32 v33, s43, 11 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[20:21], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v11 -; GFX9-NEXT: v_writelane_b32 v31, s44, 12 +; GFX9-NEXT: v_writelane_b32 v33, s44, 12 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[22:23], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v10 -; GFX9-NEXT: v_writelane_b32 v31, s45, 13 +; GFX9-NEXT: v_writelane_b32 v33, s45, 13 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[24:25], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v13 -; GFX9-NEXT: v_writelane_b32 v31, s46, 14 +; GFX9-NEXT: v_writelane_b32 v33, s46, 14 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[26:27], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v12 -; GFX9-NEXT: v_writelane_b32 v31, s47, 15 +; GFX9-NEXT: 
v_writelane_b32 v33, s47, 15 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[28:29], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v15 -; GFX9-NEXT: v_writelane_b32 v31, s48, 16 +; GFX9-NEXT: v_writelane_b32 v33, s48, 16 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[30:31], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v14 -; GFX9-NEXT: v_writelane_b32 v31, s49, 17 +; GFX9-NEXT: v_writelane_b32 v33, s49, 17 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[34:35], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v17 -; GFX9-NEXT: v_writelane_b32 v31, s50, 18 +; GFX9-NEXT: v_writelane_b32 v33, s50, 18 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[36:37], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v16 -; GFX9-NEXT: v_writelane_b32 v31, s51, 19 +; GFX9-NEXT: v_writelane_b32 v33, s51, 19 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[38:39], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v19 -; GFX9-NEXT: v_writelane_b32 v31, s52, 20 +; GFX9-NEXT: v_writelane_b32 v33, s52, 20 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v18 -; GFX9-NEXT: v_writelane_b32 v31, s53, 21 +; GFX9-NEXT: v_writelane_b32 v33, s53, 21 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[42:43], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v21 -; GFX9-NEXT: v_writelane_b32 v31, s54, 22 +; GFX9-NEXT: v_writelane_b32 v33, s54, 22 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[44:45], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v20 -; GFX9-NEXT: v_writelane_b32 v31, s55, 23 +; GFX9-NEXT: v_writelane_b32 v33, s55, 23 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[46:47], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v23 -; GFX9-NEXT: v_writelane_b32 v31, s56, 24 +; GFX9-NEXT: v_writelane_b32 v33, s56, 24 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[48:49], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v22 -; GFX9-NEXT: v_writelane_b32 v31, s57, 25 +; GFX9-NEXT: v_writelane_b32 v33, s57, 25 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[50:51], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v25 -; GFX9-NEXT: v_writelane_b32 v31, s58, 26 +; GFX9-NEXT: v_writelane_b32 v33, s58, 26 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[52:53], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v24 -; GFX9-NEXT: v_writelane_b32 v31, s59, 27 +; GFX9-NEXT: v_writelane_b32 v33, s59, 27 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[54:55], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v27 -; GFX9-NEXT: v_writelane_b32 v31, s60, 28 +; GFX9-NEXT: v_writelane_b32 v33, s60, 28 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[56:57], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v26 -; GFX9-NEXT: v_writelane_b32 v31, s61, 29 +; GFX9-NEXT: v_writelane_b32 v33, s61, 29 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[58:59], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v29 -; GFX9-NEXT: v_writelane_b32 v31, s62, 30 +; GFX9-NEXT: v_writelane_b32 v33, s62, 30 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[60:61], 1, v0 ; GFX9-NEXT: v_and_b32_e32 v0, 1, v28 -; GFX9-NEXT: v_writelane_b32 v31, s63, 31 +; GFX9-NEXT: v_writelane_b32 v33, s63, 31 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[62:63], 1, v0 ; GFX9-NEXT: buffer_load_ushort v0, off, s[0:3], s32 -; GFX9-NEXT: v_writelane_b32 v31, s64, 32 -; GFX9-NEXT: v_writelane_b32 v31, s65, 33 -; GFX9-NEXT: v_writelane_b32 v31, s66, 34 +; GFX9-NEXT: v_writelane_b32 v33, s64, 32 +; GFX9-NEXT: v_writelane_b32 v33, s65, 33 +; GFX9-NEXT: v_writelane_b32 v33, s66, 34 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX9-NEXT: v_writelane_b32 v31, s67, 35 +; GFX9-NEXT: v_writelane_b32 v33, s67, 35 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 @@ -28625,14 +28625,14 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:56 ; GFX9-NEXT: buffer_load_dword v28, off, 
s[0:3], s32 offset:124 ; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v29, v32, v33, s[66:67] -; GFX9-NEXT: v_lshrrev_b32_e32 v33, 16, v33 +; GFX9-NEXT: v_cndmask_b32_e64 v29, v31, v32, s[66:67] ; GFX9-NEXT: v_lshrrev_b32_e32 v32, 16, v32 -; GFX9-NEXT: v_cndmask_b32_e64 v32, v32, v33, s[64:65] -; GFX9-NEXT: v_cndmask_b32_e64 v33, v28, v30, s[62:63] +; GFX9-NEXT: v_lshrrev_b32_e32 v31, 16, v31 +; GFX9-NEXT: v_cndmask_b32_e64 v31, v31, v32, s[64:65] +; GFX9-NEXT: v_cndmask_b32_e64 v32, v28, v30, s[62:63] ; GFX9-NEXT: v_lshrrev_b32_e32 v30, 16, v30 ; GFX9-NEXT: v_lshrrev_b32_e32 v28, 16, v28 ; GFX9-NEXT: v_cndmask_b32_e64 v28, v28, v30, s[60:61] @@ -28707,46 +28707,46 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX9-NEXT: v_perm_b32 v11, v22, v25, s4 ; GFX9-NEXT: v_perm_b32 v12, v24, v27, s4 ; GFX9-NEXT: v_perm_b32 v13, v26, v30, s4 -; GFX9-NEXT: v_perm_b32 v14, v28, v33, s4 -; GFX9-NEXT: v_perm_b32 v15, v32, v29, s4 -; GFX9-NEXT: v_readlane_b32 s67, v31, 35 -; GFX9-NEXT: v_readlane_b32 s66, v31, 34 -; GFX9-NEXT: v_readlane_b32 s65, v31, 33 -; GFX9-NEXT: v_readlane_b32 s64, v31, 32 -; GFX9-NEXT: v_readlane_b32 s63, v31, 31 -; GFX9-NEXT: v_readlane_b32 s62, v31, 30 -; GFX9-NEXT: v_readlane_b32 s61, v31, 29 -; GFX9-NEXT: v_readlane_b32 s60, v31, 28 -; GFX9-NEXT: v_readlane_b32 s59, v31, 27 -; GFX9-NEXT: v_readlane_b32 s58, v31, 26 -; GFX9-NEXT: v_readlane_b32 s57, v31, 25 -; GFX9-NEXT: v_readlane_b32 s56, v31, 24 -; GFX9-NEXT: v_readlane_b32 s55, v31, 23 -; GFX9-NEXT: v_readlane_b32 s54, v31, 22 -; GFX9-NEXT: v_readlane_b32 s53, v31, 21 -; GFX9-NEXT: v_readlane_b32 s52, v31, 20 -; GFX9-NEXT: v_readlane_b32 s51, v31, 19 -; GFX9-NEXT: v_readlane_b32 s50, v31, 18 -; GFX9-NEXT: v_readlane_b32 s49, v31, 17 -; GFX9-NEXT: v_readlane_b32 s48, v31, 16 -; GFX9-NEXT: v_readlane_b32 s47, v31, 15 -; GFX9-NEXT: v_readlane_b32 s46, v31, 14 -; GFX9-NEXT: v_readlane_b32 s45, v31, 13 -; GFX9-NEXT: v_readlane_b32 s44, v31, 12 -; GFX9-NEXT: v_readlane_b32 s43, v31, 11 -; GFX9-NEXT: v_readlane_b32 s42, v31, 10 -; GFX9-NEXT: v_readlane_b32 s41, v31, 9 -; GFX9-NEXT: v_readlane_b32 s40, v31, 8 -; GFX9-NEXT: v_readlane_b32 s39, v31, 7 -; GFX9-NEXT: v_readlane_b32 s38, v31, 6 -; GFX9-NEXT: v_readlane_b32 s37, v31, 5 -; GFX9-NEXT: v_readlane_b32 s36, v31, 4 -; GFX9-NEXT: v_readlane_b32 s35, v31, 3 -; GFX9-NEXT: v_readlane_b32 s34, v31, 2 -; GFX9-NEXT: v_readlane_b32 s31, v31, 1 -; GFX9-NEXT: v_readlane_b32 s30, v31, 0 +; GFX9-NEXT: v_perm_b32 v14, v28, v32, s4 +; GFX9-NEXT: v_perm_b32 v15, v31, v29, s4 +; GFX9-NEXT: v_readlane_b32 s67, v33, 35 +; GFX9-NEXT: v_readlane_b32 s66, v33, 34 +; GFX9-NEXT: v_readlane_b32 s65, v33, 33 +; GFX9-NEXT: v_readlane_b32 s64, v33, 32 +; GFX9-NEXT: v_readlane_b32 s63, v33, 31 +; GFX9-NEXT: v_readlane_b32 s62, v33, 30 +; GFX9-NEXT: v_readlane_b32 s61, v33, 29 +; GFX9-NEXT: v_readlane_b32 s60, v33, 28 +; GFX9-NEXT: v_readlane_b32 s59, v33, 27 +; GFX9-NEXT: v_readlane_b32 s58, v33, 26 +; GFX9-NEXT: v_readlane_b32 s57, v33, 25 +; GFX9-NEXT: v_readlane_b32 s56, v33, 24 +; GFX9-NEXT: v_readlane_b32 s55, v33, 23 +; GFX9-NEXT: v_readlane_b32 s54, v33, 22 +; GFX9-NEXT: v_readlane_b32 s53, v33, 21 +; GFX9-NEXT: v_readlane_b32 
s52, v33, 20 +; GFX9-NEXT: v_readlane_b32 s51, v33, 19 +; GFX9-NEXT: v_readlane_b32 s50, v33, 18 +; GFX9-NEXT: v_readlane_b32 s49, v33, 17 +; GFX9-NEXT: v_readlane_b32 s48, v33, 16 +; GFX9-NEXT: v_readlane_b32 s47, v33, 15 +; GFX9-NEXT: v_readlane_b32 s46, v33, 14 +; GFX9-NEXT: v_readlane_b32 s45, v33, 13 +; GFX9-NEXT: v_readlane_b32 s44, v33, 12 +; GFX9-NEXT: v_readlane_b32 s43, v33, 11 +; GFX9-NEXT: v_readlane_b32 s42, v33, 10 +; GFX9-NEXT: v_readlane_b32 s41, v33, 9 +; GFX9-NEXT: v_readlane_b32 s40, v33, 8 +; GFX9-NEXT: v_readlane_b32 s39, v33, 7 +; GFX9-NEXT: v_readlane_b32 s38, v33, 6 +; GFX9-NEXT: v_readlane_b32 s37, v33, 5 +; GFX9-NEXT: v_readlane_b32 s36, v33, 4 +; GFX9-NEXT: v_readlane_b32 s35, v33, 3 +; GFX9-NEXT: v_readlane_b32 s34, v33, 2 +; GFX9-NEXT: v_readlane_b32 s31, v33, 1 +; GFX9-NEXT: v_readlane_b32 s30, v33, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -28754,8 +28754,8 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-LABEL: v_vselect_v32bf16: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: v_and_b32_e32 v29, 1, v29 @@ -28770,31 +28770,31 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_and_b32_e32 v14, 1, v14 ; GFX10-NEXT: v_and_b32_e32 v12, 1, v12 ; GFX10-NEXT: s_clause 0x14 -; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60 -; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_load_ushort v34, off, s[0:3], s32 -; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:64 -; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:48 -; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:52 -; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:56 -; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:32 -; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:36 -; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:40 -; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:108 -; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:44 -; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:60 +; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_load_ushort v33, off, s[0:3], s32 +; GFX10-NEXT: buffer_load_dword v34, 
off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:64 +; GFX10-NEXT: buffer_load_dword v36, off, s[0:3], s32 offset:48 +; GFX10-NEXT: buffer_load_dword v37, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_load_dword v38, off, s[0:3], s32 offset:52 +; GFX10-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_load_dword v48, off, s[0:3], s32 offset:56 +; GFX10-NEXT: buffer_load_dword v49, off, s[0:3], s32 offset:32 +; GFX10-NEXT: buffer_load_dword v50, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_load_dword v51, off, s[0:3], s32 offset:36 +; GFX10-NEXT: buffer_load_dword v52, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s32 offset:40 +; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s32 offset:108 +; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s32 offset:44 +; GFX10-NEXT: buffer_load_dword v64, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_load_dword v65, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_load_dword v66, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_load_dword v67, off, s[0:3], s32 offset:80 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v29 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_load_dword v69, off, s[0:3], s32 offset:28 +; GFX10-NEXT: buffer_load_dword v68, off, s[0:3], s32 offset:28 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v30 ; GFX10-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:96 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 1, v28 @@ -28816,17 +28816,17 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:24 ; GFX10-NEXT: v_cmp_eq_u32_e64 s13, 1, v12 ; GFX10-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:4 -; GFX10-NEXT: v_writelane_b32 v31, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: v_and_b32_e32 v4, 1, v4 ; GFX10-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX10-NEXT: v_writelane_b32 v31, s31, 1 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX10-NEXT: v_and_b32_e32 v10, 1, v10 ; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_writelane_b32 v31, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_and_b32_e32 v5, 1, v5 ; GFX10-NEXT: v_and_b32_e32 v7, 1, v7 ; GFX10-NEXT: v_and_b32_e32 v9, 1, v9 @@ -28845,7 +28845,7 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cmp_eq_u32_e64 s17, 1, v4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s18, 1, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s19, 1, v0 -; GFX10-NEXT: v_writelane_b32 v31, s35, 3 +; GFX10-NEXT: v_writelane_b32 v40, s35, 3 ; GFX10-NEXT: v_cmp_eq_u32_e64 s20, 1, v27 ; GFX10-NEXT: v_cmp_eq_u32_e64 s21, 1, v25 ; GFX10-NEXT: v_cmp_eq_u32_e64 s22, 1, v23 @@ -28861,111 +28861,111 @@ define <32 x bfloat> @v_vselect_v32bf16(<32 x i1> %cond, <32 x bfloat> %a, <32 x ; GFX10-NEXT: v_cmp_eq_u32_e64 s34, 1, v5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s35, 1, v9 ; GFX10-NEXT: s_waitcnt vmcnt(32) -; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v32 +; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v31 ; GFX10-NEXT: s_waitcnt vmcnt(31) -; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v33 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v32 ; GFX10-NEXT: s_waitcnt vmcnt(30) -; GFX10-NEXT: v_and_b32_e32 v2, 1, v34 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v33 ; GFX10-NEXT: s_waitcnt vmcnt(29) -; 
GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v35 +; GFX10-NEXT: v_lshrrev_b32_e32 v4, 16, v34 ; GFX10-NEXT: s_waitcnt vmcnt(28) -; GFX10-NEXT: v_cndmask_b32_e64 v15, v35, v36, s4 -; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v36 -; GFX10-NEXT: v_cndmask_b32_e64 v17, v33, v32, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v15, v34, v35, s4 +; GFX10-NEXT: v_lshrrev_b32_e32 v3, 16, v35 +; GFX10-NEXT: v_cndmask_b32_e64 v17, v32, v31, s5 ; GFX10-NEXT: s_waitcnt vmcnt(25) -; GFX10-NEXT: v_cndmask_b32_e64 v19, v38, v39, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v19, v37, v38, s7 ; GFX10-NEXT: s_waitcnt vmcnt(24) -; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v48 +; GFX10-NEXT: v_lshrrev_b32_e32 v6, 16, v39 ; GFX10-NEXT: s_waitcnt vmcnt(23) -; GFX10-NEXT: v_cndmask_b32_e64 v13, v48, v49, s6 -; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v49 -; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v39 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v38 -; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v37 +; GFX10-NEXT: v_cndmask_b32_e64 v13, v39, v48, s6 +; GFX10-NEXT: v_lshrrev_b32_e32 v5, 16, v48 +; GFX10-NEXT: v_lshrrev_b32_e32 v7, 16, v38 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 16, v37 +; GFX10-NEXT: v_lshrrev_b32_e32 v9, 16, v36 ; GFX10-NEXT: s_waitcnt vmcnt(18) -; GFX10-NEXT: v_cndmask_b32_e64 v27, v53, v54, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v27, v52, v53, s10 ; GFX10-NEXT: s_waitcnt vmcnt(17) -; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v55 +; GFX10-NEXT: v_lshrrev_b32_e32 v25, 16, v54 ; GFX10-NEXT: s_waitcnt vmcnt(16) -; GFX10-NEXT: v_cndmask_b32_e64 v21, v55, v64, s9 +; GFX10-NEXT: v_cndmask_b32_e64 v21, v54, v55, s9 ; GFX10-NEXT: s_waitcnt vmcnt(15) -; GFX10-NEXT: v_cndmask_b32_e64 v11, v65, v37, s8 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v65 -; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v64 -; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v54 -; GFX10-NEXT: v_lshrrev_b32_e32 v33, 16, v53 -; GFX10-NEXT: v_cndmask_b32_e64 v34, v51, v52, s11 -; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v52 -; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v51 +; GFX10-NEXT: v_cndmask_b32_e64 v11, v64, v36, s8 +; GFX10-NEXT: v_lshrrev_b32_e32 v10, 16, v64 +; GFX10-NEXT: v_lshrrev_b32_e32 v23, 16, v55 +; GFX10-NEXT: v_lshrrev_b32_e32 v31, 16, v53 +; GFX10-NEXT: v_lshrrev_b32_e32 v32, 16, v52 +; GFX10-NEXT: v_cndmask_b32_e64 v33, v50, v51, s11 +; GFX10-NEXT: v_lshrrev_b32_e32 v34, 16, v51 +; GFX10-NEXT: v_lshrrev_b32_e32 v35, 16, v50 ; GFX10-NEXT: s_waitcnt vmcnt(9) -; GFX10-NEXT: v_cndmask_b32_e64 v37, v30, v50, s12 -; GFX10-NEXT: v_lshrrev_b32_e32 v38, 16, v50 +; GFX10-NEXT: v_cndmask_b32_e64 v36, v30, v49, s12 +; GFX10-NEXT: v_lshrrev_b32_e32 v37, 16, v49 ; GFX10-NEXT: v_lshrrev_b32_e32 v30, 16, v30 -; GFX10-NEXT: v_cndmask_b32_e64 v39, v29, v69, s13 -; GFX10-NEXT: v_lshrrev_b32_e32 v48, 16, v69 +; GFX10-NEXT: v_cndmask_b32_e64 v38, v29, v68, s13 +; GFX10-NEXT: v_lshrrev_b32_e32 v39, 16, v68 ; GFX10-NEXT: v_lshrrev_b32_e32 v29, 16, v29 ; GFX10-NEXT: s_waitcnt vmcnt(6) -; GFX10-NEXT: v_cndmask_b32_e64 v50, v24, v22, s15 +; GFX10-NEXT: v_cndmask_b32_e64 v49, v24, v22, s15 ; GFX10-NEXT: v_lshrrev_b32_e32 v22, 16, v22 ; GFX10-NEXT: v_lshrrev_b32_e32 v24, 16, v24 ; GFX10-NEXT: s_waitcnt vmcnt(5) -; GFX10-NEXT: v_cndmask_b32_e64 v51, v68, v20, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v50, v67, v20, s16 ; GFX10-NEXT: v_lshrrev_b32_e32 v20, 16, v20 -; GFX10-NEXT: v_lshrrev_b32_e32 v52, 16, v68 +; GFX10-NEXT: v_lshrrev_b32_e32 v51, 16, v67 ; GFX10-NEXT: s_waitcnt vmcnt(4) -; GFX10-NEXT: v_cndmask_b32_e64 v53, v67, v18, s17 +; GFX10-NEXT: v_cndmask_b32_e64 v52, v66, v18, s17 ; GFX10-NEXT: v_lshrrev_b32_e32 v18, 
16, v18 ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cndmask_b32_e64 v49, v28, v26, s14 +; GFX10-NEXT: v_cndmask_b32_e64 v48, v28, v26, s14 ; GFX10-NEXT: v_lshrrev_b32_e32 v26, 16, v26 ; GFX10-NEXT: v_lshrrev_b32_e32 v28, 16, v28 -; GFX10-NEXT: v_lshrrev_b32_e32 v54, 16, v67 -; GFX10-NEXT: v_cndmask_b32_e64 v55, v66, v16, s18 +; GFX10-NEXT: v_lshrrev_b32_e32 v53, 16, v66 +; GFX10-NEXT: v_cndmask_b32_e64 v54, v65, v16, s18 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 16, v16 -; GFX10-NEXT: v_lshrrev_b32_e32 v64, 16, v66 +; GFX10-NEXT: v_lshrrev_b32_e32 v55, 16, v65 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cndmask_b32_e64 v65, v14, v12, s19 +; GFX10-NEXT: v_cndmask_b32_e64 v64, v14, v12, s19 ; GFX10-NEXT: v_lshrrev_b32_e32 v12, 16, v12 ; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v14 ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v66, v1, v0, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e64 v67, v6, v5, s20 -; GFX10-NEXT: v_cndmask_b32_e64 v68, v8, v7, s21 -; GFX10-NEXT: v_cndmask_b32_e64 v69, v10, v9, s22 +; GFX10-NEXT: v_cndmask_b32_e32 v65, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v66, v6, v5, s20 +; GFX10-NEXT: v_cndmask_b32_e64 v67, v8, v7, s21 +; GFX10-NEXT: v_cndmask_b32_e64 v68, v10, v9, s22 ; GFX10-NEXT: v_cndmask_b32_e64 v10, v25, v23, s23 -; GFX10-NEXT: v_cndmask_b32_e64 v9, v33, v32, s24 -; GFX10-NEXT: v_cndmask_b32_e64 v8, v36, v35, s25 -; GFX10-NEXT: v_cndmask_b32_e64 v7, v30, v38, s26 -; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v48, s27 +; GFX10-NEXT: v_cndmask_b32_e64 v9, v32, v31, s24 +; GFX10-NEXT: v_cndmask_b32_e64 v8, v35, v34, s25 +; GFX10-NEXT: v_cndmask_b32_e64 v7, v30, v37, s26 +; GFX10-NEXT: v_cndmask_b32_e64 v6, v29, v39, s27 ; GFX10-NEXT: v_cndmask_b32_e64 v5, v28, v26, s28 -; GFX10-NEXT: v_cndmask_b32_e64 v20, v52, v20, s29 +; GFX10-NEXT: v_cndmask_b32_e64 v20, v51, v20, s29 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v14, v12, s31 -; GFX10-NEXT: v_cndmask_b32_e64 v1, v64, v16, s30 -; GFX10-NEXT: v_cndmask_b32_e64 v2, v54, v18, s34 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v55, v16, s30 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v53, v18, s34 ; GFX10-NEXT: v_cndmask_b32_e64 v12, v24, v22, s35 ; GFX10-NEXT: v_cndmask_b32_e64 v16, v4, v3, s4 -; GFX10-NEXT: v_perm_b32 v0, v0, v65, 0x5040100 -; GFX10-NEXT: v_perm_b32 v1, v1, v55, 0x5040100 -; GFX10-NEXT: v_perm_b32 v2, v2, v53, 0x5040100 -; GFX10-NEXT: v_perm_b32 v3, v20, v51, 0x5040100 -; GFX10-NEXT: v_perm_b32 v4, v12, v50, 0x5040100 -; GFX10-NEXT: v_perm_b32 v5, v5, v49, 0x5040100 -; GFX10-NEXT: v_perm_b32 v6, v6, v39, 0x5040100 -; GFX10-NEXT: v_perm_b32 v7, v7, v37, 0x5040100 -; GFX10-NEXT: v_perm_b32 v8, v8, v34, 0x5040100 +; GFX10-NEXT: v_perm_b32 v0, v0, v64, 0x5040100 +; GFX10-NEXT: v_perm_b32 v1, v1, v54, 0x5040100 +; GFX10-NEXT: v_perm_b32 v2, v2, v52, 0x5040100 +; GFX10-NEXT: v_perm_b32 v3, v20, v50, 0x5040100 +; GFX10-NEXT: v_perm_b32 v4, v12, v49, 0x5040100 +; GFX10-NEXT: v_perm_b32 v5, v5, v48, 0x5040100 +; GFX10-NEXT: v_perm_b32 v6, v6, v38, 0x5040100 +; GFX10-NEXT: v_perm_b32 v7, v7, v36, 0x5040100 +; GFX10-NEXT: v_perm_b32 v8, v8, v33, 0x5040100 ; GFX10-NEXT: v_perm_b32 v9, v9, v27, 0x5040100 ; GFX10-NEXT: v_perm_b32 v10, v10, v21, 0x5040100 -; GFX10-NEXT: v_perm_b32 v11, v69, v11, 0x5040100 -; GFX10-NEXT: v_perm_b32 v12, v68, v19, 0x5040100 -; GFX10-NEXT: v_perm_b32 v13, v67, v13, 0x5040100 -; GFX10-NEXT: v_perm_b32 v14, v66, v17, 0x5040100 +; GFX10-NEXT: v_perm_b32 v11, v68, v11, 0x5040100 +; GFX10-NEXT: v_perm_b32 v12, v67, v19, 0x5040100 +; GFX10-NEXT: v_perm_b32 v13, v66, v13, 0x5040100 +; 
GFX10-NEXT: v_perm_b32 v14, v65, v17, 0x5040100 ; GFX10-NEXT: v_perm_b32 v15, v16, v15, 0x5040100 -; GFX10-NEXT: v_readlane_b32 s35, v31, 3 -; GFX10-NEXT: v_readlane_b32 s34, v31, 2 -; GFX10-NEXT: v_readlane_b32 s31, v31, 1 -; GFX10-NEXT: v_readlane_b32 s30, v31, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 -; GFX10-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s35, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: s_or_saveexec_b32 s4, -1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll index 634106d20489e..e926a3c728cbd 100644 --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -278,11 +278,11 @@ define void @callee_with_stack_no_fp_elim_csr_vgpr() #1 { ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; GCN: v_writelane_b32 v0 +; GCN: v_writelane_b32 v1 ; MUBUF: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s33 offset:4 ; FLATSCR: scratch_store_dword off, v{{[0-9]+}}, s33 offset:4 ; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v0 +; GCN: v_writelane_b32 v1 ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 @@ -320,19 +320,19 @@ define void @last_lane_vgpr_for_fp_csr() #1 { ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-COUNT-61: v_writelane_b32 v0, +; GCN-COUNT-61: v_writelane_b32 v1, ; MUBUF: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; FLATSCR: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill -; GCN: v_writelane_b32 v0, +; GCN: v_writelane_b32 v1, ; MUBUF: buffer_store_dword ; FLATSCR: scratch_store_dword ; GCN: ;;#ASMSTART -; GCN: v_writelane_b32 v0, +; GCN: v_writelane_b32 v1, ; MUBUF: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; FLATSCR: scratch_load_dword v41, off, s33 ; 4-byte Folded Reload ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 -; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v0 +; GCN-COUNT-64: v_readlane_b32 s{{[0-9]+}}, v1 ; GCN-NEXT: s_xor_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:8 ; 4-byte Folded Reload diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll index 36a7ed51227a6..764c40ebc714d 100644 --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -19,25 +19,25 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: s_mov_b32 s16, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 -; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 
4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 16 -; CHECK-NEXT: v_writelane_b32 v40, s30, 0 -; CHECK-NEXT: v_writelane_b32 v40, s31, 1 -; CHECK-NEXT: v_writelane_b32 v40, s34, 2 -; CHECK-NEXT: v_writelane_b32 v40, s35, 3 -; CHECK-NEXT: v_writelane_b32 v40, s36, 4 -; CHECK-NEXT: v_writelane_b32 v40, s37, 5 -; CHECK-NEXT: v_writelane_b32 v40, s38, 6 -; CHECK-NEXT: v_writelane_b32 v40, s39, 7 -; CHECK-NEXT: v_writelane_b32 v40, s40, 8 -; CHECK-NEXT: v_writelane_b32 v40, s41, 9 -; CHECK-NEXT: v_writelane_b32 v40, s42, 10 -; CHECK-NEXT: v_writelane_b32 v40, s43, 11 -; CHECK-NEXT: v_writelane_b32 v40, s44, 12 +; CHECK-NEXT: v_writelane_b32 v41, s16, 16 +; CHECK-NEXT: v_writelane_b32 v41, s30, 0 +; CHECK-NEXT: v_writelane_b32 v41, s31, 1 +; CHECK-NEXT: v_writelane_b32 v41, s34, 2 +; CHECK-NEXT: v_writelane_b32 v41, s35, 3 +; CHECK-NEXT: v_writelane_b32 v41, s36, 4 +; CHECK-NEXT: v_writelane_b32 v41, s37, 5 +; CHECK-NEXT: v_writelane_b32 v41, s38, 6 +; CHECK-NEXT: v_writelane_b32 v41, s39, 7 +; CHECK-NEXT: v_writelane_b32 v41, s40, 8 +; CHECK-NEXT: v_writelane_b32 v41, s41, 9 +; CHECK-NEXT: v_writelane_b32 v41, s42, 10 +; CHECK-NEXT: v_writelane_b32 v41, s43, 11 +; CHECK-NEXT: v_writelane_b32 v41, s44, 12 ; CHECK-NEXT: s_addk_i32 s32, 0x400 -; CHECK-NEXT: v_writelane_b32 v40, s45, 13 -; CHECK-NEXT: v_writelane_b32 v40, s46, 14 +; CHECK-NEXT: v_writelane_b32 v41, s45, 13 +; CHECK-NEXT: v_writelane_b32 v41, s46, 14 ; CHECK-NEXT: s_mov_b64 s[40:41], s[4:5] ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- undef ; CHECK-NEXT: .Ltmp0: @@ -45,11 +45,11 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, __kmpc_alloc_shared@gotpcrel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, __kmpc_alloc_shared@gotpcrel32@hi+12 -; CHECK-NEXT: v_writelane_b32 v40, s47, 15 +; CHECK-NEXT: v_writelane_b32 v41, s47, 15 ; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] -; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; CHECK-NEXT: v_mov_b32_e32 v41, v31 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: v_mov_b32_e32 v40, v31 ; CHECK-NEXT: s_mov_b64 s[34:35], s[6:7] ; CHECK-NEXT: s_mov_b32 s42, s15 ; CHECK-NEXT: s_mov_b32 s43, s14 @@ -67,33 +67,33 @@ define weak_odr void @test(i32 %0) !dbg !34 { ; CHECK-NEXT: s_mov_b32 s13, s44 ; CHECK-NEXT: s_mov_b32 s14, s43 ; CHECK-NEXT: s_mov_b32 s15, s42 -; CHECK-NEXT: v_mov_b32_e32 v31, v41 +; CHECK-NEXT: v_mov_b32_e32 v31, v40 ; CHECK-NEXT: s_swappc_b64 s[30:31], s[46:47] ; CHECK-NEXT: .Ltmp1: ; CHECK-NEXT: ;DEBUG_VALUE: dummy:dummy <- [$vgpr0_vgpr1+0] ; CHECK-NEXT: .loc 1 0 9 is_stmt 0 ; dummy:0:9 -; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: v_mov_b32_e32 v2, 0 ; CHECK-NEXT: flat_store_dword v[0:1], v2 -; CHECK-NEXT: v_readlane_b32 s47, v40, 15 -; CHECK-NEXT: v_readlane_b32 s46, v40, 14 -; CHECK-NEXT: v_readlane_b32 s45, v40, 13 -; CHECK-NEXT: v_readlane_b32 s44, v40, 12 -; CHECK-NEXT: v_readlane_b32 s43, v40, 11 -; CHECK-NEXT: v_readlane_b32 s42, v40, 10 -; CHECK-NEXT: v_readlane_b32 s41, v40, 9 -; CHECK-NEXT: v_readlane_b32 s40, v40, 8 -; CHECK-NEXT: v_readlane_b32 s39, v40, 7 -; CHECK-NEXT: v_readlane_b32 s38, v40, 6 -; CHECK-NEXT: v_readlane_b32 s37, v40, 5 -; CHECK-NEXT: v_readlane_b32 s36, v40, 4 -; CHECK-NEXT: v_readlane_b32 s35, v40, 3 -; 
CHECK-NEXT: v_readlane_b32 s34, v40, 2 -; CHECK-NEXT: v_readlane_b32 s31, v40, 1 -; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 16 +; CHECK-NEXT: v_readlane_b32 s47, v41, 15 +; CHECK-NEXT: v_readlane_b32 s46, v41, 14 +; CHECK-NEXT: v_readlane_b32 s45, v41, 13 +; CHECK-NEXT: v_readlane_b32 s44, v41, 12 +; CHECK-NEXT: v_readlane_b32 s43, v41, 11 +; CHECK-NEXT: v_readlane_b32 s42, v41, 10 +; CHECK-NEXT: v_readlane_b32 s41, v41, 9 +; CHECK-NEXT: v_readlane_b32 s40, v41, 8 +; CHECK-NEXT: v_readlane_b32 s39, v41, 7 +; CHECK-NEXT: v_readlane_b32 s38, v41, 6 +; CHECK-NEXT: v_readlane_b32 s37, v41, 5 +; CHECK-NEXT: v_readlane_b32 s36, v41, 4 +; CHECK-NEXT: v_readlane_b32 s35, v41, 3 +; CHECK-NEXT: v_readlane_b32 s34, v41, 2 +; CHECK-NEXT: v_readlane_b32 s31, v41, 1 +; CHECK-NEXT: v_readlane_b32 s30, v41, 0 +; CHECK-NEXT: v_readlane_b32 s4, v41, 16 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 -; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll index 145ab4ae6378b..a118fa388f86d 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -3992,29 +3992,29 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_ubyte v0, v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_ubyte v0, v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: global_store_byte v[41:42], v0, off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_byte v[40:41], v0, off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; 
GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4027,30 +4027,30 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_i8_ret@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_ubyte v0, v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_ubyte v0, v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: global_store_byte v[41:42], v0, off +; GFX10-NEXT: global_store_byte v[40:41], v0, off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4064,30 +4064,30 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: 
v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_u8 v0, v[41:42], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_u8 v0, v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: global_store_b8 v[41:42], v0, off +; GFX11-NEXT: global_store_b8 v[40:41], v0, off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4100,30 +4100,30 @@ define amdgpu_gfx void @test_call_external_void_func_i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: global_store_byte v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_byte v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 +; 
GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4144,33 +4144,33 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_ushort v0, v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_ushort v0, v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v2i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v2i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b16_e32 v1, 8, v0 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: global_store_short v[41:42], v0, off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_short v[40:41], v0, off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4183,34 +4183,34 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 
0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v2i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v2i8_ret@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_ushort v0, v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_ushort v0, v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b16 v1, 8, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: global_store_short v[41:42], v0, off +; GFX10-NEXT: global_store_short v[40:41], v0, off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4224,36 +4224,36 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_u16 v0, v[41:42], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_u16 v0, v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b16 v1, 8, v0 ; GFX11-NEXT: 
s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: global_store_b16 v[41:42], v0, off +; GFX11-NEXT: global_store_b16 v[40:41], v0, off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4266,34 +4266,34 @@ define amdgpu_gfx void @test_call_external_void_func_v2i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v2i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v2i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_lshrrev_b16 v1, 8, v0 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: global_store_short v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_short v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 
+; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4314,19 +4314,19 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_dword v0, v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_dword v0, v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4336,15 +4336,15 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: global_store_byte v[3:4], v2, off -; GFX9-NEXT: global_store_short v[41:42], v0, off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_short v[40:41], v0, off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4357,20 +4357,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: 
buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_dword v0, v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dword v0, v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4378,17 +4378,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_mov_b32_e32 v3, 2 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: global_store_byte v[3:4], v2, off -; GFX10-NEXT: global_store_short v[41:42], v0, off +; GFX10-NEXT: global_store_short v[40:41], v0, off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4402,20 +4402,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_b32 v0, v[41:42], 
off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_b32 v0, v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4425,18 +4425,18 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 2 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v3, v4, v3 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b8 v[0:1], v2, off -; GFX11-NEXT: global_store_b16 v[41:42], v3, off +; GFX11-NEXT: global_store_b16 v[40:41], v3, off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4449,20 +4449,20 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_dword v0, v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_dword v0, v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4470,17 +4470,17 @@ define amdgpu_gfx void @test_call_external_void_func_v3i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 
s30, v40, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: global_store_byte v[3:4], v2, off -; GFX10-SCRATCH-NEXT: global_store_short v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_short v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4501,19 +4501,19 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_dword v0, v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_dword v0, v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v4i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v4i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v0 @@ -4524,15 +4524,15 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX9-NEXT: v_lshlrev_b16_e32 v1, 8, v3 ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v[41:42], v0, off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_dword v[40:41], v0, off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 
s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4545,20 +4545,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v4i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v4i8_ret@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_dword v0, v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dword v0, v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4566,18 +4566,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dword v[41:42], v0, off +; GFX10-NEXT: global_store_dword v[40:41], v0, off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4591,20 +4591,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; 
GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v4i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v4i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_b32 v0, v[41:42], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_b32 v0, v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4615,22 +4615,22 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: global_store_b32 v[41:42], v0, off +; GFX11-NEXT: global_store_b32 v[40:41], v0, off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4643,20 +4643,20 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: 
v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v4i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v4i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_dword v0, v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_dword v0, v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4664,18 +4664,18 @@ define amdgpu_gfx void @test_call_external_void_func_v4i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4696,19 +4696,19 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_dwordx2 v[5:6], v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: 
v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v5i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v5i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6] ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v5 @@ -4724,15 +4724,15 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: global_store_byte v[0:1], v4, off -; GFX9-NEXT: global_store_dword v[41:42], v2, off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_dword v[40:41], v2, off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4745,20 +4745,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v5i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v5i8_ret@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_dwordx2 v[5:6], v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx2 v[5:6], v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6] ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v5 @@ -4768,21 +4768,21 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: 
v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_mov_b32_e32 v0, 4 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_byte v[0:1], v4, off -; GFX10-NEXT: global_store_dword v[41:42], v2, off +; GFX10-NEXT: global_store_dword v[40:41], v2, off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4796,20 +4796,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v5i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v5i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_b64 v[5:6], v[41:42], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_b64 v[5:6], v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v5 ; GFX11-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6] @@ -4822,10 +4822,10 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: v_and_b32_e32 v0, 0xff, v0 ; GFX11-NEXT: v_lshlrev_b16 v3, 8, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xff, v2 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v0 @@ -4835,12 +4835,12 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX11-NEXT: v_or_b32_e32 v2, v2, v3 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_store_b8 v[0:1], v4, off -; GFX11-NEXT: 
global_store_b32 v[41:42], v2, off +; GFX11-NEXT: global_store_b32 v[40:41], v2, off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4853,20 +4853,20 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v5i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v5i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[5:6], v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[5:6], v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_lshrrev_b64 v[3:4], 24, v[5:6] ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v1, 8, v5 @@ -4876,21 +4876,21 @@ define amdgpu_gfx void @test_call_external_void_func_v5i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v5, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v5, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v4, off -; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v2, off +; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v2, off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword 
v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4911,19 +4911,19 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: global_load_dwordx2 v[0:1], v[41:42], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: global_load_dwordx2 v[0:1], v[40:41], off +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v8i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v8i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4944,15 +4944,15 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX9-NEXT: v_or_b32_sdwa v1, v2, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dwordx2 v[41:42], v[3:4], off -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_dwordx2 v[40:41], v[3:4], off +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4965,20 +4965,20 @@ define amdgpu_gfx void 
@test_call_external_void_func_v8i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v8i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v8i8_ret@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: global_load_dwordx2 v[0:1], v[41:42], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: global_load_dwordx2 v[0:1], v[40:41], off +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -4993,21 +4993,21 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-NEXT: v_lshlrev_b16 v7, 8, v7 ; GFX10-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx2 v[41:42], v[0:1], off +; GFX10-NEXT: global_store_dwordx2 v[40:41], v[0:1], off ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5021,31 +5021,31 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 
4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 ; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_mov_b32_e32 v42, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: global_load_b64 v[0:1], v[41:42], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: global_load_b64 v[0:1], v[40:41], off +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v4, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v5, 8, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v6, 16, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v7, 24, v1 -; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v1, v8 +; GFX11-NEXT: v_mov_b32_e32 v1, v8 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_lshlrev_b16 v5, 8, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v4, 0xff, v4 ; GFX11-NEXT: v_lshlrev_b16 v7, 8, v7 ; GFX11-NEXT: v_and_b32_e32 v6, 0xff, v6 @@ -5056,23 +5056,23 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 ; GFX11-NEXT: v_or_b32_e32 v5, v6, v7 ; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-NEXT: v_and_b32_e32 v2, 0xffff, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v5 ; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 16, v1 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: v_or_b32_e32 v1, v2, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-NEXT: global_store_b64 v[41:42], v[0:1], off +; GFX11-NEXT: global_store_b64 v[40:41], v[0:1], off ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5085,20 +5085,20 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; 
GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v8i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v8i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[41:42], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[40:41], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v8, 8, v0 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v2, 16, v0 @@ -5113,21 +5113,21 @@ define amdgpu_gfx void @test_call_external_void_func_v8i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v7, 8, v7 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v1, 8, v1 ; GFX10-SCRATCH-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v4, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v5, v6, v7 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v1, v4, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: global_store_dwordx2 v[41:42], v[0:1], off +; GFX10-SCRATCH-NEXT: global_store_dwordx2 v[40:41], v[0:1], off ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5148,24 +5148,24 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; 
GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, 0 +; GFX9-NEXT: v_mov_b32_e32 v42, 16 ; GFX9-NEXT: v_mov_b32_e32 v41, 0 -; GFX9-NEXT: v_mov_b32_e32 v43, 16 -; GFX9-NEXT: v_mov_b32_e32 v42, 0 -; GFX9-NEXT: v_mov_b32_e32 v44, 0 -; GFX9-NEXT: global_load_dwordx4 v[0:3], v[41:42], off -; GFX9-NEXT: global_load_dwordx4 v[16:19], v[43:44], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v43, 0 +; GFX9-NEXT: global_load_dwordx4 v[0:3], v[40:41], off +; GFX9-NEXT: global_load_dwordx4 v[16:19], v[42:43], off +; GFX9-NEXT: v_writelane_b32 v44, s34, 2 +; GFX9-NEXT: v_writelane_b32 v44, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v44, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_lshrrev_b32_e32 v35, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e32 v36, 16, v0 @@ -5245,18 +5245,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX9-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v9, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v8, v8, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dwordx4 v[43:44], v[0:3], off -; GFX9-NEXT: global_store_dwordx4 v[41:42], v[6:9], off -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: global_store_dwordx4 v[42:43], v[0:3], off +; GFX9-NEXT: global_store_dwordx4 v[40:41], v[6:9], off +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v44, 1 +; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: v_readlane_b32 s34, v44, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, 
s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5269,26 +5269,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, 0 +; GFX10-NEXT: v_mov_b32_e32 v42, 16 ; GFX10-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-NEXT: v_mov_b32_e32 v43, 16 -; GFX10-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-NEXT: v_mov_b32_e32 v44, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_mov_b32_e32 v43, 0 +; GFX10-NEXT: v_writelane_b32 v44, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_v3i8_ret@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_v3i8_ret@abs32@lo ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: global_load_dwordx4 v[0:3], v[41:42], off -; GFX10-NEXT: global_load_dwordx4 v[16:19], v[43:44], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: global_load_dwordx4 v[0:3], v[40:41], off +; GFX10-NEXT: global_load_dwordx4 v[16:19], v[42:43], off +; GFX10-NEXT: v_writelane_b32 v44, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v44, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_lshrrev_b32_e32 v35, 8, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v36, 16, v0 @@ -5368,18 +5368,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-NEXT: v_or_b32_sdwa v8, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v7, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: global_store_dwordx4 v[43:44], v[7:10], off -; GFX10-NEXT: global_store_dwordx4 v[41:42], v[3:6], off +; GFX10-NEXT: global_store_dwordx4 v[42:43], v[7:10], off +; GFX10-NEXT: global_store_dwordx4 v[40:41], v[3:6], off ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v41, off, 
s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 +; GFX10-NEXT: v_readlane_b32 s31, v44, 1 +; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: v_readlane_b32 s34, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -5393,24 +5393,24 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 -; GFX11-NEXT: v_mov_b32_e32 v41, 0 -; GFX11-NEXT: v_dual_mov_b32 v42, 0 :: v_dual_mov_b32 v43, 16 -; GFX11-NEXT: v_mov_b32_e32 v44, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 +; GFX11-NEXT: v_mov_b32_e32 v40, 0 +; GFX11-NEXT: v_dual_mov_b32 v41, 0 :: v_dual_mov_b32 v42, 16 +; GFX11-NEXT: v_mov_b32_e32 v43, 0 +; GFX11-NEXT: v_writelane_b32 v44, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi -; GFX11-NEXT: global_load_b128 v[0:3], v[41:42], off +; GFX11-NEXT: global_load_b128 v[0:3], v[40:41], off ; GFX11-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo -; GFX11-NEXT: global_load_b128 v[16:19], v[43:44], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: global_load_b128 v[16:19], v[42:43], off +; GFX11-NEXT: v_writelane_b32 v44, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v44, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_lshrrev_b32_e32 v35, 8, v0 ; GFX11-NEXT: v_lshrrev_b32_e32 v36, 16, v0 @@ -5519,18 +5519,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX11-NEXT: v_or_b32_e32 v7, v11, v12 ; GFX11-NEXT: v_or_b32_e32 v3, v0, v2 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b128 v[43:44], v[7:10], off -; GFX11-NEXT: global_store_b128 v[41:42], v[3:6], off +; GFX11-NEXT: global_store_b128 v[42:43], v[7:10], off +; GFX11-NEXT: global_store_b128 v[40:41], v[3:6], off ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 +; GFX11-NEXT: v_readlane_b32 s31, v44, 1 +; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: v_readlane_b32 s0, v44, 2 ; 
GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5543,26 +5543,26 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v44, s33 offset:16 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v44, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:12 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, 0 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 16 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v43, 16 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, 0 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v44, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v43, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s0, 2 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_void_func_v3i8_ret@abs32@hi ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_void_func_v3i8_ret@abs32@lo ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[41:42], off -; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[16:19], v[43:44], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[40:41], off +; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[16:19], v[42:43], off +; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v44, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(1) ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v35, 8, v0 ; GFX10-SCRATCH-NEXT: v_lshrrev_b32_e32 v36, 16, v0 @@ -5642,18 +5642,18 @@ define amdgpu_gfx void @test_call_external_void_func_v32i8_ret() #0 { ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v8, v14, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v7, v12, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-SCRATCH-NEXT: v_or_b32_sdwa v3, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-SCRATCH-NEXT: global_store_dwordx4 v[43:44], v[7:10], off -; GFX10-SCRATCH-NEXT: global_store_dwordx4 v[41:42], v[3:6], off +; GFX10-SCRATCH-NEXT: global_store_dwordx4 v[42:43], v[7:10], off +; GFX10-SCRATCH-NEXT: global_store_dwordx4 v[40:41], v[3:6], off ; GFX10-SCRATCH-NEXT: s_clause 0x3 -; GFX10-SCRATCH-NEXT: scratch_load_dword v44, off, s33 -; GFX10-SCRATCH-NEXT: 
scratch_load_dword v43, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:12 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:12 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v44, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v44, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 @@ -8255,29 +8255,29 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_writelane_b32 v42, s34, 2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v42, s30, 0 +; GFX9-NEXT: v_mov_b32_e32 v40, v0 ; GFX9-NEXT: s_mov_b32 s35, external_i32_func_i32@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_i32_func_i32@abs32@lo ; GFX9-NEXT: v_mov_b32_e32 v0, 42 ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 -; GFX9-NEXT: v_mov_b32_e32 v42, v1 +; GFX9-NEXT: v_writelane_b32 v42, s31, 1 +; GFX9-NEXT: v_mov_b32_e32 v41, v1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: global_store_dword v[41:42], v0, off +; GFX9-NEXT: global_store_dword v[40:41], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v42, 1 +; GFX9-NEXT: v_readlane_b32 s30, v42, 0 +; GFX9-NEXT: v_readlane_b32 s34, v42, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8290,31 +8290,31 @@ define amdgpu_gfx void 
@test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_mov_b32_e32 v41, v0 +; GFX10-NEXT: v_writelane_b32 v42, s34, 2 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_mov_b32_e32 v40, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_mov_b32 s35, external_i32_func_i32@abs32@hi -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v42, s30, 0 ; GFX10-NEXT: s_mov_b32 s34, external_i32_func_i32@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_mov_b32_e32 v42, v1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v1 +; GFX10-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: global_store_dword v[41:42], v0, off +; GFX10-NEXT: global_store_dword v[40:41], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-NEXT: v_readlane_b32 s34, v42, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8328,30 +8328,30 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v42, s0, 2 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 -; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 +; GFX11-NEXT: v_dual_mov_b32 v41, v1 :: v_dual_mov_b32 v40, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v42, s30, 0 ; GFX11-NEXT: s_mov_b32 s1, external_i32_func_i32@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_i32_func_i32@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: 
v_writelane_b32 v42, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: global_store_b32 v[41:42], v0, off dlc +; GFX11-NEXT: global_store_b32 v[40:41], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: v_readlane_b32 s31, v42, 1 +; GFX11-NEXT: v_readlane_b32 s30, v42, 0 +; GFX11-NEXT: v_readlane_b32 s0, v42, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8364,31 +8364,31 @@ define amdgpu_gfx void @test_call_external_i32_func_i32_imm(ptr addrspace(1) %ou ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 offset:8 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s0, 2 +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v40, v0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: s_mov_b32 s1, external_i32_func_i32@abs32@hi -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s30, 0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s0, external_i32_func_i32@abs32@lo ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v1 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v42, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off +; GFX10-SCRATCH-NEXT: global_store_dword v[40:41], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_clause 0x1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 -; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v42, 1 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v42, 0 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v42, 2 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: scratch_load_dword v42, off, s33 offset:8 ; 4-byte Folded 
Reload ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll index ad55d49a1a96d..a14e3d5673f82 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -322,30 +322,30 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 2 +; GFX9-NEXT: v_writelane_b32 v41, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_void@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_void@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s31, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v41, v31 +; GFX9-NEXT: v_mov_b32_e32 v40, v31 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: v_mov_b32_e32 v31, v41 +; GFX9-NEXT: v_mov_b32_e32 v31, v40 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use v31 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v41, 1 +; GFX9-NEXT: v_readlane_b32 s30, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -358,31 +358,31 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 2 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_void@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_void@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v41, v31 
-; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v40, v31 +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX10-NEXT: v_mov_b32_e32 v31, v41 +; GFX10-NEXT: v_mov_b32_e32 v31, v40 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v31 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s31, v41, 1 +; GFX10-NEXT: v_readlane_b32 s30, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v41, 2 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -396,31 +396,31 @@ define amdgpu_gfx void @test_call_void_func_void_mayclobber_v31(ptr addrspace(1) ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 2 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_void@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_void@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: v_writelane_b32 v41, s30, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v31 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_mov_b32_e32 v41, v31 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_mov_b32_e32 v40, v31 +; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v31, v41 +; GFX11-NEXT: v_mov_b32_e32 v31, v40 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v31 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v41, 1 +; GFX11-NEXT: v_readlane_b32 s30, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1184,16 +1184,16 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX9-NEXT: s_mov_b32 s34, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 -; GFX9-NEXT: 
v_writelane_b32 v40, s4, 0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 3 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s30, 1 ; GFX9-NEXT: s_mov_b32 s35, external_void_func_void@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, external_void_func_void@abs32@lo ; GFX9-NEXT: s_addk_i32 s32, 0x400 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_writelane_b32 v40, s31, 2 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_writelane_b32 v41, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 ; GFX9-NEXT: ;;#ASMEND @@ -1201,21 +1201,21 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def v32 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: v_mov_b32_e32 v41, v32 +; GFX9-NEXT: v_mov_b32_e32 v40, v32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; use s4 ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART -; GFX9-NEXT: ; use v41 +; GFX9-NEXT: ; use v40 ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 2 -; GFX9-NEXT: v_readlane_b32 s30, v40, 1 -; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v41, 2 +; GFX9-NEXT: v_readlane_b32 s30, v41, 1 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 +; GFX9-NEXT: v_readlane_b32 s34, v41, 3 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1228,15 +1228,15 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX10-NEXT: s_mov_b32 s34, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v41, s34, 3 ; GFX10-NEXT: s_mov_b32 s35, external_void_func_void@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, external_void_func_void@abs32@lo ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v41, s4, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND @@ -1244,23 +1244,23 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_mov_b32_e32 v41, v32 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 -; GFX10-NEXT: v_writelane_b32 v40, s31, 2 +; GFX10-NEXT: v_mov_b32_e32 v40, v32 +; GFX10-NEXT: v_writelane_b32 v41, s30, 1 +; GFX10-NEXT: v_writelane_b32 v41, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use s4 ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART -; GFX10-NEXT: ; use v41 +; GFX10-NEXT: ; use v40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX10-NEXT: v_readlane_b32 s31, v40, 2 -; GFX10-NEXT: v_readlane_b32 s30, v40, 1 -; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: v_readlane_b32 s31, v41, 2 +; GFX10-NEXT: v_readlane_b32 s30, v41, 1 +; GFX10-NEXT: v_readlane_b32 s4, v41, 0 +; GFX10-NEXT: v_readlane_b32 s34, v41, 3 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1274,14 +1274,14 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v41, s0, 3 ; GFX11-NEXT: s_mov_b32 s1, external_void_func_void@abs32@hi ; GFX11-NEXT: s_mov_b32 s0, external_void_func_void@abs32@lo ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: v_writelane_b32 v41, s4, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND @@ -1289,23 +1289,23 @@ define amdgpu_gfx void @callee_saved_sgpr_vgpr_kernel() #1 { ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v32 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_mov_b32_e32 v41, v32 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 -; GFX11-NEXT: v_writelane_b32 v40, s31, 2 +; GFX11-NEXT: v_mov_b32_e32 v40, v32 +; GFX11-NEXT: v_writelane_b32 v41, s30, 1 +; GFX11-NEXT: v_writelane_b32 v41, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use s4 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART -; GFX11-NEXT: ; use v41 +; GFX11-NEXT: ; use v40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload -; GFX11-NEXT: v_readlane_b32 s31, v40, 2 -; GFX11-NEXT: v_readlane_b32 s30, v40, 1 -; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: v_readlane_b32 s31, v41, 2 +; GFX11-NEXT: v_readlane_b32 s30, v41, 1 +; GFX11-NEXT: v_readlane_b32 s4, v41, 0 +; GFX11-NEXT: v_readlane_b32 s0, v41, 3 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll index 23502d1b36d18..c1d682689903a 100644 --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -2764,27 +2764,26 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: s_mov_b32 s36, s33 ; GFX9-NEXT: s_add_i32 s33, s32, 0x7fc0 ; GFX9-NEXT: 
s_and_b32 s33, s33, 0xffff8000 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 @@ -2827,7 +2826,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:156 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:160 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 -; GFX9-NEXT: v_writelane_b32 v33, s30, 0 +; GFX9-NEXT: 
v_writelane_b32 v63, s30, 0 ; GFX9-NEXT: s_mov_b32 s35, return_72xi32@abs32@hi ; GFX9-NEXT: s_mov_b32 s34, return_72xi32@abs32@lo ; GFX9-NEXT: v_add_u32_e32 v0, 0x200, v0 @@ -2862,41 +2861,41 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_mov_b32_e32 v29, 0 ; GFX9-NEXT: v_mov_b32_e32 v30, 0 ; GFX9-NEXT: v_mov_b32_e32 v31, 0 -; GFX9-NEXT: v_writelane_b32 v33, s31, 1 +; GFX9-NEXT: v_writelane_b32 v63, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:636 ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:640 ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:644 -; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:648 -; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:652 -; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:656 -; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:660 -; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:664 -; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:668 -; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s33 offset:672 -; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s33 offset:676 -; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s33 offset:680 -; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s33 offset:684 -; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s33 offset:688 -; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s33 offset:692 -; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s33 offset:696 -; GFX9-NEXT: buffer_load_dword v55, off, s[0:3], s33 offset:700 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:704 -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:708 -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:712 -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:716 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:720 -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:724 -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:728 -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:732 -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:736 -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:740 -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:748 -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:752 -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:756 -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:760 -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:764 -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:768 +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:648 +; GFX9-NEXT: buffer_load_dword v34, off, s[0:3], s33 offset:652 +; GFX9-NEXT: buffer_load_dword v35, off, s[0:3], s33 offset:656 +; GFX9-NEXT: buffer_load_dword v36, off, s[0:3], s33 offset:660 +; GFX9-NEXT: buffer_load_dword v37, off, s[0:3], s33 offset:664 +; GFX9-NEXT: buffer_load_dword v38, off, s[0:3], s33 offset:668 +; GFX9-NEXT: buffer_load_dword v39, off, s[0:3], s33 offset:672 +; GFX9-NEXT: buffer_load_dword v48, off, s[0:3], s33 offset:676 +; GFX9-NEXT: buffer_load_dword v49, off, s[0:3], s33 offset:680 +; GFX9-NEXT: buffer_load_dword v50, off, s[0:3], s33 offset:684 +; GFX9-NEXT: buffer_load_dword v51, off, s[0:3], s33 offset:688 +; GFX9-NEXT: buffer_load_dword v52, off, s[0:3], s33 offset:692 +; GFX9-NEXT: buffer_load_dword v53, off, s[0:3], s33 offset:696 +; GFX9-NEXT: buffer_load_dword v54, off, s[0:3], s33 offset:700 +; GFX9-NEXT: buffer_load_dword 
v55, off, s[0:3], s33 offset:704 +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:708 +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:712 +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:716 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:720 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:724 +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:728 +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:732 +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:736 +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:740 +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:748 +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:752 +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:756 +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:760 +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:764 +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:768 ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:772 ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:776 ; GFX9-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:780 @@ -2953,38 +2952,38 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 -; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:12 -; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:16 -; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:20 -; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:24 -; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:28 -; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:32 -; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:36 -; GFX9-NEXT: buffer_store_dword v49, off, s[0:3], s32 offset:40 -; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:44 -; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:48 -; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:52 -; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:56 -; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:60 -; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:64 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:68 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 -; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 -; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 -; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 -; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:12 +; GFX9-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:16 +; GFX9-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:20 +; GFX9-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:24 +; GFX9-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:28 +; GFX9-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:32 +; GFX9-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:36 +; GFX9-NEXT: buffer_store_dword v48, off, s[0:3], s32 offset:40 +; GFX9-NEXT: 
buffer_store_dword v49, off, s[0:3], s32 offset:44 +; GFX9-NEXT: buffer_store_dword v50, off, s[0:3], s32 offset:48 +; GFX9-NEXT: buffer_store_dword v51, off, s[0:3], s32 offset:52 +; GFX9-NEXT: buffer_store_dword v52, off, s[0:3], s32 offset:56 +; GFX9-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:60 +; GFX9-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:64 +; GFX9-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:68 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:92 +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 +; GFX9-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 +; GFX9-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 ; GFX9-NEXT: v_mov_b32_e32 v0, 24 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 -; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 -; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:116 -; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 -; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:124 -; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 -; GFX9-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:132 +; GFX9-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 +; GFX9-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 +; GFX9-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 +; GFX9-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:124 +; GFX9-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 +; GFX9-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; GFX9-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 @@ -3003,26 +3002,25 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], 
s33 offset:56 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v33, 1 -; GFX9-NEXT: v_readlane_b32 s30, v33, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v62, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v63, 1 +; GFX9-NEXT: v_readlane_b32 s30, v63, 0 +; GFX9-NEXT: s_or_saveexec_b64 s[34:35], -1 +; GFX9-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0xfffd8000 ; GFX9-NEXT: s_mov_b32 s33, s36 @@ -3036,27 +3034,27 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: s_add_i32 s33, s32, 0x3fe0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffffc000 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; 
GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v63, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v63, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 @@ -3133,7 +3131,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_mov_b32_e32 v31, 0 ; GFX10-NEXT: s_mov_b32 s35, return_72xi32@abs32@hi ; GFX10-NEXT: s_mov_b32 s34, return_72xi32@abs32@lo -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v63, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_clause 0x28 ; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:636 @@ -3154,21 +3152,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_load_dword v53, off, s[0:3], s33 offset:696 ; GFX10-NEXT: buffer_load_dword v54, off, s[0:3], s33 offset:700 ; GFX10-NEXT: buffer_load_dword v55, off, s[0:3], s33 offset:704 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:708 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:712 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:716 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:720 -; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:724 -; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:728 -; GFX10-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:732 -; GFX10-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:736 -; GFX10-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:740 -; GFX10-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:748 -; GFX10-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:752 -; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:756 -; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:760 -; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:764 -; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:768 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:708 +; GFX10-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 offset:712 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:716 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:720 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:724 +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:728 +; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:732 +; GFX10-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:736 +; GFX10-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:740 +; GFX10-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:748 +; GFX10-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:752 +; GFX10-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:756 +; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:760 +; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:764 +; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:768 ; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:772 ; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:776 ; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:780 @@ -3243,21 +3241,21 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: buffer_store_dword v53, off, s[0:3], s32 offset:60 ; GFX10-NEXT: buffer_store_dword v54, off, s[0:3], s32 offset:64 ; GFX10-NEXT: buffer_store_dword v55, off, s[0:3], s32 offset:68 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:72 -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:76 -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:80 -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:84 -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:88 -; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:92 -; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:96 -; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:100 -; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:104 -; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:112 -; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:116 -; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:120 -; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:124 -; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:128 -; GFX10-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:132 +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:72 +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:76 +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:80 +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:84 +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:88 +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:92 +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:96 +; GFX10-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:100 +; GFX10-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:104 +; GFX10-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:112 +; GFX10-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:116 +; GFX10-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:120 +; GFX10-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:124 +; GFX10-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:128 +; GFX10-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:132 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:136 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:140 ; GFX10-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:144 
@@ -3279,25 +3277,25 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_clause 0xe -; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:12 -; GFX10-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:16 -; GFX10-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:20 -; GFX10-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:24 -; GFX10-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:28 -; GFX10-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:32 -; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:36 -; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:40 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:44 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:48 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:52 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:56 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 +; GFX10-NEXT: buffer_load_dword v62, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v61, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v60, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v59, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v58, off, s[0:3], s33 offset:16 +; GFX10-NEXT: buffer_load_dword v57, off, s[0:3], s33 offset:20 +; GFX10-NEXT: buffer_load_dword v56, off, s[0:3], s33 offset:24 +; GFX10-NEXT: buffer_load_dword v47, off, s[0:3], s33 offset:28 +; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:32 +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:36 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:40 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:44 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:48 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:52 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:56 +; GFX10-NEXT: v_readlane_b32 s31, v63, 1 +; GFX10-NEXT: v_readlane_b32 s30, v63, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v63, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_add_i32 s32, s32, 0xfffec000 @@ -3313,7 +3311,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:1536 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3323,22 +3321,19 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_addk_i32 s32, 0xa00 -; GFX11-NEXT: s_clause 0xe -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:56 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:52 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:48 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 
offset:44 -; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:40 -; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:36 -; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:32 -; GFX11-NEXT: scratch_store_b32 off, v56, s33 offset:28 -; GFX11-NEXT: scratch_store_b32 off, v57, s33 offset:24 -; GFX11-NEXT: scratch_store_b32 off, v58, s33 offset:20 -; GFX11-NEXT: scratch_store_b32 off, v59, s33 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v60, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v61, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v62, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v63, s33 +; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:44 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:40 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:36 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:32 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:28 +; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:24 +; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v47, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v56, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v57, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v58, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v59, s33 ; GFX11-NEXT: s_add_i32 s0, s32, 0xa0 ; GFX11-NEXT: s_add_i32 s1, s32, 0x90 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 @@ -3361,7 +3356,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s1 ; GFX11-NEXT: s_add_i32 s0, s33, 0x200 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v60, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: v_dual_mov_b32 v5, 0 :: v_dual_mov_b32 v4, 0 @@ -3380,7 +3375,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: v_dual_mov_b32 v31, 0 :: v_dual_mov_b32 v30, 0 ; GFX11-NEXT: s_mov_b32 s45, return_72xi32@abs32@hi ; GFX11-NEXT: s_mov_b32 s44, return_72xi32@abs32@lo -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v60, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45] ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_load_b128 v[45:48], off, s33 offset:624 @@ -3391,22 +3386,16 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_clause 0x9 ; GFX11-NEXT: scratch_load_b128 v[48:51], off, s33 offset:656 ; GFX11-NEXT: scratch_load_b128 v[52:55], off, s33 offset:672 -; GFX11-NEXT: scratch_load_b128 v[41:44], off, s33 offset:688 -; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:704 -; GFX11-NEXT: scratch_load_b128 v[60:63], off, s33 offset:720 +; GFX11-NEXT: scratch_load_b128 v[37:40], off, s33 offset:688 +; GFX11-NEXT: scratch_load_b128 v[41:44], off, s33 offset:704 +; GFX11-NEXT: scratch_load_b128 v[56:59], off, s33 offset:720 ; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:736 ; GFX11-NEXT: scratch_load_b128 v[0:3], off, s33 offset:752 ; GFX11-NEXT: scratch_load_b128 v[4:7], off, s33 offset:768 ; GFX11-NEXT: scratch_load_b128 v[8:11], off, s33 offset:784 ; GFX11-NEXT: scratch_load_b128 v[16:19], off, s33 offset:512 -; GFX11-NEXT: s_waitcnt vmcnt(8) -; GFX11-NEXT: v_dual_mov_b32 v38, v53 :: v_dual_mov_b32 v37, v52 -; GFX11-NEXT: s_waitcnt vmcnt(7) -; GFX11-NEXT: v_dual_mov_b32 v39, v54 :: v_dual_mov_b32 v52, v44 -; GFX11-NEXT: s_waitcnt vmcnt(6) -; GFX11-NEXT: v_dual_mov_b32 v53, v56 :: 
v_dual_mov_b32 v54, v57 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v12 +; GFX11-NEXT: s_waitcnt vmcnt(2) +; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b128 off, v[16:19], s33 offset:1588 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 @@ -3414,13 +3403,7 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 ; GFX11-NEXT: scratch_load_b128 v[24:27], off, s33 offset:560 ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:576 -; GFX11-NEXT: v_mov_b32_e32 v56, v63 -; GFX11-NEXT: v_mov_b32_e32 v12, v15 -; GFX11-NEXT: v_dual_mov_b32 v14, v1 :: v_dual_mov_b32 v15, v2 -; GFX11-NEXT: v_dual_mov_b32 v1, v4 :: v_dual_mov_b32 v2, v5 -; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v5, v8 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v8, v19 +; GFX11-NEXT: v_dual_mov_b32 v4, v7 :: v_dual_mov_b32 v7, v10 ; GFX11-NEXT: s_waitcnt vmcnt(2) ; GFX11-NEXT: v_mov_b32_e32 v10, v21 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3432,47 +3415,53 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 -; GFX11-NEXT: v_mov_b32_e32 v32, v36 +; GFX11-NEXT: v_dual_mov_b32 v31, v47 :: v_dual_mov_b32 v32, v36 ; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 -; GFX11-NEXT: v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v36, v51 -; GFX11-NEXT: v_dual_mov_b32 v48, v55 :: v_dual_mov_b32 v49, v41 -; GFX11-NEXT: v_mov_b32_e32 v50, v42 -; GFX11-NEXT: v_dual_mov_b32 v55, v58 :: v_dual_mov_b32 v58, v13 -; GFX11-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v0, v3 -; GFX11-NEXT: v_dual_mov_b32 v3, v6 :: v_dual_mov_b32 v6, v9 +; GFX11-NEXT: v_dual_mov_b32 v35, v50 :: v_dual_mov_b32 v48, v51 +; GFX11-NEXT: v_dual_mov_b32 v49, v52 :: v_dual_mov_b32 v50, v53 +; GFX11-NEXT: v_dual_mov_b32 v51, v54 :: v_dual_mov_b32 v36, v55 +; GFX11-NEXT: v_dual_mov_b32 v53, v41 :: v_dual_mov_b32 v52, v40 +; GFX11-NEXT: v_dual_mov_b32 v54, v42 :: v_dual_mov_b32 v41, v56 +; GFX11-NEXT: v_dual_mov_b32 v55, v43 :: v_dual_mov_b32 v40, v44 +; GFX11-NEXT: v_dual_mov_b32 v42, v57 :: v_dual_mov_b32 v57, v12 +; GFX11-NEXT: v_dual_mov_b32 v43, v58 :: v_dual_mov_b32 v56, v59 +; GFX11-NEXT: v_mov_b32_e32 v58, v13 +; GFX11-NEXT: v_dual_mov_b32 v12, v15 :: v_dual_mov_b32 v13, v0 +; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v0, v3 +; GFX11-NEXT: v_dual_mov_b32 v2, v5 :: v_dual_mov_b32 v3, v6 +; GFX11-NEXT: v_dual_mov_b32 v5, v8 :: v_dual_mov_b32 v6, v9 +; GFX11-NEXT: v_mov_b32_e32 v9, v20 ; GFX11-NEXT: scratch_store_b32 off, v11, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x90 -; GFX11-NEXT: v_mov_b32_e32 v51, v43 -; GFX11-NEXT: v_mov_b32_e32 v41, v59 +; GFX11-NEXT: v_mov_b32_e32 v11, v22 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 -; GFX11-NEXT: v_mov_b32_e32 v7, v18 ; GFX11-NEXT: s_add_i32 s0, s32, 0x80 -; GFX11-NEXT: v_dual_mov_b32 v42, v60 :: v_dual_mov_b32 v43, v61 +; GFX11-NEXT: v_mov_b32_e32 v5, v16 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s0 -; GFX11-NEXT: v_dual_mov_b32 v0, 24 :: v_dual_mov_b32 v9, v20 +; GFX11-NEXT: v_mov_b32_e32 v0, 24 ; GFX11-NEXT: s_add_i32 s0, s32, 0x70 -; GFX11-NEXT: v_mov_b32_e32 v5, v16 +; GFX11-NEXT: v_mov_b32_e32 v6, v17 ; GFX11-NEXT: scratch_store_b128 off, v[12:15], s0 +; GFX11-NEXT: v_mov_b32_e32 v13, v24 ; 
GFX11-NEXT: s_add_i32 s0, s32, 0x6c -; GFX11-NEXT: v_dual_mov_b32 v6, v17 :: v_dual_mov_b32 v11, v22 +; GFX11-NEXT: v_mov_b32_e32 v7, v18 ; GFX11-NEXT: scratch_store_b32 off, v0, s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x60 -; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 +; GFX11-NEXT: v_dual_mov_b32 v8, v19 :: v_dual_mov_b32 v15, v26 ; GFX11-NEXT: scratch_store_b96 off, v[56:58], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 0x50 -; GFX11-NEXT: v_mov_b32_e32 v13, v24 -; GFX11-NEXT: scratch_store_b128 off, v[41:44], s0 +; GFX11-NEXT: v_dual_mov_b32 v12, v23 :: v_dual_mov_b32 v29, v45 +; GFX11-NEXT: scratch_store_b128 off, v[40:43], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 64 -; GFX11-NEXT: v_dual_mov_b32 v14, v25 :: v_dual_mov_b32 v31, v47 +; GFX11-NEXT: v_mov_b32_e32 v14, v25 ; GFX11-NEXT: scratch_store_b128 off, v[52:55], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 48 -; GFX11-NEXT: v_mov_b32_e32 v15, v26 -; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: v_mov_b32_e32 v16, v27 ; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0 -; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: v_mov_b32_e32 v30, v46 +; GFX11-NEXT: scratch_store_b128 off, v[48:51], s0 +; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0 ; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3485,26 +3474,23 @@ define amdgpu_gfx void @call_72xi32() #1 { ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[44:45] -; GFX11-NEXT: s_clause 0xe -; GFX11-NEXT: scratch_load_b32 v63, off, s33 -; GFX11-NEXT: scratch_load_b32 v62, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v61, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:12 -; GFX11-NEXT: scratch_load_b32 v59, off, s33 offset:16 -; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:20 -; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:24 -; GFX11-NEXT: scratch_load_b32 v56, off, s33 offset:28 -; GFX11-NEXT: scratch_load_b32 v47, off, s33 offset:32 -; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:36 -; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:40 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:44 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:48 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:52 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:56 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 +; GFX11-NEXT: s_clause 0xb +; GFX11-NEXT: scratch_load_b32 v59, off, s33 +; GFX11-NEXT: scratch_load_b32 v58, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v57, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v56, off, s33 offset:12 +; GFX11-NEXT: scratch_load_b32 v47, off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:20 +; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:24 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:28 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:32 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:36 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:40 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:44 +; GFX11-NEXT: v_readlane_b32 s31, v60, 1 +; GFX11-NEXT: v_readlane_b32 s30, v60, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v60, off, s33 offset:1536 ; 4-byte 
Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xf600 ; GFX11-NEXT: s_mov_b32 s33, s46 diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll index 65404637ca51b..76ec1cc84f55b 100644 --- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll +++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll @@ -152,23 +152,23 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -202,23 +202,23 @@ define amdgpu_gfx void @global_atomic_xchg_i32_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -830,23 +830,23 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, 
s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -958,23 +958,23 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_swap v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_swap v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1416,23 +1416,23 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_add v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded 
Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1466,23 +1466,23 @@ define amdgpu_gfx void @global_atomic_add_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_add v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_add v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1766,23 +1766,23 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -1816,23 +1816,23 @@ define amdgpu_gfx void @global_atomic_sub_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, 
s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_sub v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_sub v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2116,23 +2116,23 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v1, off, s[4:7], 0 +; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2166,23 +2166,23 @@ define amdgpu_gfx void @global_atomic_and_i32_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v1, s6, 0 +; SI-NEXT: v_writelane_b32 v1, s7, 1 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v0, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_and v1, off, s[4:7], 0 offset:16 +; SI-NEXT: buffer_atomic_and v0, off, s[4:7], 0 offset:16 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v1, 1 +; SI-NEXT: v_readlane_b32 s6, v1, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -2649,38 
+2649,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_scalar(ptr addrspace(1) inr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB44_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, s34, v2
-; SI-NEXT: v_not_b32_e32 v1, v1
+; SI-NEXT: v_and_b32_e32 v0, s34, v1
+; SI-NEXT: v_not_b32_e32 v0, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
 ; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v3
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB44_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2740,38 +2740,38 @@ define amdgpu_gfx void @global_atomic_nand_i32_noret_offset_scalar(ptr addrspace
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB45_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, s34, v2
-; SI-NEXT: v_not_b32_e32 v1, v1
+; SI-NEXT: v_and_b32_e32 v0, s34, v1
+; SI-NEXT: v_not_b32_e32 v0, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
 ; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v3
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB45_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2834,39 +2834,38 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_scalar(ptr addrspace(1) inreg
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB46_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
-; SI-NEXT: v_and_b32_e32 v0, s34, v4
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_not_b32_e32 v3, v0
-; SI-NEXT: v_mov_b32_e32 v2, v3
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: v_and_b32_e32 v0, s34, v2
+; SI-NEXT: v_not_b32_e32 v1, v0
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB46_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2928,39 +2927,38 @@ define amdgpu_gfx i32 @global_atomic_nand_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB47_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
-; SI-NEXT: v_and_b32_e32 v0, s34, v4
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_not_b32_e32 v3, v0
-; SI-NEXT: v_mov_b32_e32 v2, v3
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_and_b32_e32 v0, s34, v2
+; SI-NEXT: v_not_b32_e32 v1, v0
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB47_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3167,23 +3165,23 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_scalar(ptr addrspace(1) inreg
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v1, s6, 0
+; SI-NEXT: v_writelane_b32 v1, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: v_mov_b32_e32 v0, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or v1, off, s[4:7], 0
+; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v1, 1
+; SI-NEXT: v_readlane_b32 s6, v1, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3217,23 +3215,23 @@ define amdgpu_gfx void @global_atomic_or_i32_noret_offset_scalar(ptr addrspace(1
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v1, s6, 0
+; SI-NEXT: v_writelane_b32 v1, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: v_mov_b32_e32 v0, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_or v1, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_atomic_or v0, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v1, 1
+; SI-NEXT: v_readlane_b32 s6, v1, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3517,23 +3515,23 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_scalar(ptr addrspace(1) inre
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v1, s6, 0
+; SI-NEXT: v_writelane_b32 v1, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: v_mov_b32_e32 v0, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor v1, off, s[4:7], 0
+; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v1, 1
+; SI-NEXT: v_readlane_b32 s6, v1, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3567,23 +3565,23 @@ define amdgpu_gfx void @global_atomic_xor_i32_noret_offset_scalar(ptr addrspace(
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v1, s6, 0
+; SI-NEXT: v_writelane_b32 v1, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: v_mov_b32_e32 v0, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_xor v1, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_atomic_xor v0, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v1, 1
+; SI-NEXT: v_readlane_b32 s6, v1, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -4038,37 +4036,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_scalar(ptr addrspace(1) inre
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB68_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_i32_e32 v1, s34, v2
+; SI-NEXT: v_max_i32_e32 v0, s34, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
 ; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v3
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB68_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -4126,37 +4124,37 @@ define amdgpu_gfx void @global_atomic_max_i32_noret_offset_scalar(ptr addrspace(
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB69_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_i32_e32 v1, s34, v2
+; SI-NEXT: v_max_i32_e32 v0, s34, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
 ; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v3
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB69_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -4217,38 +4215,37 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_scalar(ptr addrspace(1) inreg %
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB70_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_max_i32_e32 v3, s34, v4
-; SI-NEXT: v_mov_b32_e32 v2, v3
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: v_max_i32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB70_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -4308,38 +4305,37 @@ define amdgpu_gfx i32 @global_atomic_max_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB71_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_max_i32_e32 v3, s34, v4
-; SI-NEXT: v_mov_b32_e32 v2, v3
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_max_i32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB71_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -5135,37 +5131,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_scalar(ptr addrspace(1) inr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB80_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_u32_e32 v1, s34, v2
+; SI-NEXT: v_max_u32_e32 v0, s34, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
 ; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v3
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB80_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -5223,37 +5219,37 @@ define amdgpu_gfx void @global_atomic_umax_i32_noret_offset_scalar(ptr addrspace
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB81_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_max_u32_e32 v1, s34, v2
+; SI-NEXT: v_max_u32_e32 v0, s34, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
 ; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v3
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB81_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -5314,38 +5310,37 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_scalar(ptr addrspace(1) inreg
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB82_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_max_u32_e32 v3, s34, v4
-; SI-NEXT: v_mov_b32_e32 v2, v3
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: v_max_u32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB82_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -5405,38 +5400,37 @@ define amdgpu_gfx i32 @global_atomic_umax_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB83_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_max_u32_e32 v3, s34, v4
-; SI-NEXT: v_mov_b32_e32 v2, v3
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_max_u32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB83_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6137,37 +6131,37 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_scalar(ptr addrspace(1) inr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_u32_e32 v1, s34, v2
+; SI-NEXT: v_min_u32_e32 v0, s34, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
 ; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v3
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB91_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6225,37 +6219,37 @@ define amdgpu_gfx void @global_atomic_umin_i32_noret_offset_scalar(ptr addrspace
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB92_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_u32_e32 v1, s34, v2
+; SI-NEXT: v_min_u32_e32 v0, s34, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
 ; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v3
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB92_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6316,38 +6310,37 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_scalar(ptr addrspace(1) inreg
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB93_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_min_u32_e32 v3, s34, v4
-; SI-NEXT: v_mov_b32_e32 v2, v3
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: v_min_u32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB93_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6407,38 +6400,37 @@ define amdgpu_gfx i32 @global_atomic_umin_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB94_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_min_u32_e32 v3, s34, v4
-; SI-NEXT: v_mov_b32_e32 v2, v3
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_min_u32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB94_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6814,37 +6806,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_scalar(ptr addrspace(1) inre
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_i32_e32 v1, s34, v2
+; SI-NEXT: v_min_i32_e32 v0, s34, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
 ; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v3
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB99_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6902,37 +6894,37 @@ define amdgpu_gfx void @global_atomic_min_i32_noret_offset_scalar(ptr addrspace(
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v4, s6, 0
+; SI-NEXT: v_writelane_b32 v4, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v1, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB100_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_min_i32_e32 v1, s34, v2
+; SI-NEXT: v_min_i32_e32 v0, s34, v1
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
 ; SI-NEXT: v_mov_b32_e32 v3, v1
-; SI-NEXT: buffer_atomic_cmpswap v[3:4], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_mov_b32_e32 v2, v0
+; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v2
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v1
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v2, v3
+; SI-NEXT: v_mov_b32_e32 v1, v2
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB100_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v4, 1
+; SI-NEXT: v_readlane_b32 s6, v4, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -6993,38 +6985,37 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_scalar(ptr addrspace(1) inreg %
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_min_i32_e32 v3, s34, v4
-; SI-NEXT: v_mov_b32_e32 v2, v3
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 glc
+; SI-NEXT: v_min_i32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB101_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7084,38 +7075,37 @@ define amdgpu_gfx i32 @global_atomic_min_i32_ret_offset_scalar(ptr addrspace(1)
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v3, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v1, s6, 0
-; SI-NEXT: v_writelane_b32 v1, s7, 1
+; SI-NEXT: v_writelane_b32 v3, s6, 0
+; SI-NEXT: v_writelane_b32 v3, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v4, v2
+; SI-NEXT: v_mov_b32_e32 v2, v0
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_min_i32_e32 v3, s34, v4
-; SI-NEXT: v_mov_b32_e32 v2, v3
-; SI-NEXT: v_mov_b32_e32 v3, v4
-; SI-NEXT: buffer_atomic_cmpswap v[2:3], off, s[4:7], 0 offset:16 glc
+; SI-NEXT: v_min_i32_e32 v1, s34, v2
+; SI-NEXT: v_mov_b32_e32 v0, v1
+; SI-NEXT: v_mov_b32_e32 v1, v2
+; SI-NEXT: buffer_atomic_cmpswap v[0:1], off, s[4:7], 0 offset:16 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4
+; SI-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB102_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_mov_b32_e32 v0, v2
-; SI-NEXT: v_readlane_b32 s7, v1, 1
-; SI-NEXT: v_readlane_b32 s6, v1, 0
+; SI-NEXT: v_readlane_b32 s7, v3, 1
+; SI-NEXT: v_readlane_b32 s6, v3, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7727,23 +7717,23 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_scalar(ptr addrspace(1
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v1, s6, 0
+; SI-NEXT: v_writelane_b32 v1, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: v_mov_b32_e32 v0, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc v1, off, s[4:7], 0
+; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v1, 1
+; SI-NEXT: v_readlane_b32 s6, v1, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -7777,23 +7767,23 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i32_noret_offset_scalar(ptr addr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v1, s6, 0
+; SI-NEXT: v_writelane_b32 v1, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: v_mov_b32_e32 v0, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_inc v1, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_atomic_inc v0, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v1, 1
+; SI-NEXT: v_readlane_b32 s6, v1, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8077,23 +8067,23 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_scalar(ptr addrspace(1
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v1, s6, 0
+; SI-NEXT: v_writelane_b32 v1, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: v_mov_b32_e32 v0, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec v1, off, s[4:7], 0
+; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v1, 1
+; SI-NEXT: v_readlane_b32 s6, v1, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -8127,23 +8117,23 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i32_noret_offset_scalar(ptr addr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v1, s6, 0
+; SI-NEXT: v_writelane_b32 v1, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s34
+; SI-NEXT: v_mov_b32_e32 v0, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_dec v1, off, s[4:7], 0 offset:16
+; SI-NEXT: buffer_atomic_dec v0, off, s[4:7], 0 offset:16
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v1, 1
+; SI-NEXT: v_readlane_b32 s6, v1, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index 34457781a9999..d137f471910dc 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -154,25 +154,25 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_scalar(ptr addrspace(1) inr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v2, s6, 0
+; SI-NEXT: v_writelane_b32 v2, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s7
 ; SI-NEXT: s_mov_b32 s35, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s35
-; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: v_mov_b32_e32 v0, s35
+; SI-NEXT: v_mov_b32_e32 v1, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v2, 1
+; SI-NEXT: v_readlane_b32 s6, v2, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -208,23 +208,23 @@ define amdgpu_gfx void @global_atomic_xchg_i64_noret_offset_scalar(ptr addrspace
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
-; SI-NEXT: v_mov_b32_e32 v1, s6
-; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: v_writelane_b32 v2, s6, 0
+; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v2, 1
+; SI-NEXT: v_readlane_b32 s6, v2, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -846,25 +846,25 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v2, s6, 0
+; SI-NEXT: v_writelane_b32 v2, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s7
 ; SI-NEXT: s_mov_b32 s35, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s35
-; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: v_mov_b32_e32 v0, s35
+; SI-NEXT: v_mov_b32_e32 v1, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v2, 1
+; SI-NEXT: v_readlane_b32 s6, v2, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -978,23 +978,23 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
-; SI-NEXT: v_mov_b32_e32 v1, s6
-; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: v_writelane_b32 v2, s6, 0
+; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_swap_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_atomic_swap_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v2, 1
+; SI-NEXT: v_readlane_b32 s6, v2, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1446,25 +1446,25 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_scalar(ptr addrspace(1) inre
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v2, s6, 0
+; SI-NEXT: v_writelane_b32 v2, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s7
 ; SI-NEXT: s_mov_b32 s35, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s35
-; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: v_mov_b32_e32 v0, s35
+; SI-NEXT: v_mov_b32_e32 v1, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_add_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v2, 1
+; SI-NEXT: v_readlane_b32 s6, v2, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1500,23 +1500,23 @@ define amdgpu_gfx void @global_atomic_add_i64_noret_offset_scalar(ptr addrspace(
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
-; SI-NEXT: v_mov_b32_e32 v1, s6
-; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: v_writelane_b32 v2, s6, 0
+; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_add_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_atomic_add_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v2, 1
+; SI-NEXT: v_readlane_b32 s6, v2, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1810,25 +1810,25 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_scalar(ptr addrspace(1) inre
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v2, s6, 0
+; SI-NEXT: v_writelane_b32 v2, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s7
 ; SI-NEXT: s_mov_b32 s35, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s35
-; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: v_mov_b32_e32 v0, s35
+; SI-NEXT: v_mov_b32_e32 v1, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v2, 1
+; SI-NEXT: v_readlane_b32 s6, v2, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1864,23 +1864,23 @@ define amdgpu_gfx void @global_atomic_sub_i64_noret_offset_scalar(ptr addrspace(
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
-; SI-NEXT: v_mov_b32_e32 v1, s6
-; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: v_writelane_b32 v2, s6, 0
+; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_sub_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_atomic_sub_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v2, 1
+; SI-NEXT: v_readlane_b32 s6, v2, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2174,25 +2174,25 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_scalar(ptr addrspace(1) inre
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v2, s6, 0
+; SI-NEXT: v_writelane_b32 v2, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s7
 ; SI-NEXT: s_mov_b32 s35, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: v_mov_b32_e32 v1, s35
-; SI-NEXT: v_mov_b32_e32 v2, s34
+; SI-NEXT: v_mov_b32_e32 v0, s35
+; SI-NEXT: v_mov_b32_e32 v1, s34
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and_x2 v[1:2], off, s[4:7], 0
+; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v2, 1
+; SI-NEXT: v_readlane_b32 s6, v2, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2228,23 +2228,23 @@ define amdgpu_gfx void @global_atomic_and_i64_noret_offset_scalar(ptr addrspace(
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
-; SI-NEXT: v_mov_b32_e32 v1, s6
-; SI-NEXT: v_mov_b32_e32 v2, s7
+; SI-NEXT: v_writelane_b32 v2, s6, 0
+; SI-NEXT: v_writelane_b32 v2, s7, 1
+; SI-NEXT: v_mov_b32_e32 v0, s6
+; SI-NEXT: v_mov_b32_e32 v1, s7
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: buffer_atomic_and_x2 v[1:2], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_atomic_and_x2 v[0:1], off, s[4:7], 0 offset:32
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v2, 1
+; SI-NEXT: v_readlane_b32 s6, v2, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2772,44 +2772,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_scalar(ptr addrspace(1) inr
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s7
 ; SI-NEXT: s_mov_b32 s35, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB44_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, s34, v4
+; SI-NEXT: v_and_b32_e32 v0, s34, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, s35, v3
-; SI-NEXT: v_not_b32_e32 v2, v1
-; SI-NEXT: v_not_b32_e32 v1, v5
-; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: v_and_b32_e32 v4, s35, v2
+; SI-NEXT: v_not_b32_e32 v1, v0
+; SI-NEXT: v_not_b32_e32 v0, v4
 ; SI-NEXT: v_mov_b32_e32 v7, v3
 ; SI-NEXT: v_mov_b32_e32 v6, v2
 ; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 glc
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
 ; SI-NEXT: v_mov_b32_e32 v3, v5
-; SI-NEXT: v_mov_b32_e32 v4, v6
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB44_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2877,44 +2877,44 @@ define amdgpu_gfx void @global_atomic_nand_i64_noret_offset_scalar(ptr addrspace
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_writelane_b32 v0, s6, 0
-; SI-NEXT: v_writelane_b32 v0, s7, 1
+; SI-NEXT: v_writelane_b32 v8, s6, 0
+; SI-NEXT: v_writelane_b32 v8, s7, 1
 ; SI-NEXT: s_mov_b32 s34, s7
 ; SI-NEXT: s_mov_b32 s35, s6
 ; SI-NEXT: s_mov_b32 s7, 0xf000
 ; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32
+; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32
 ; SI-NEXT: s_mov_b64 s[36:37], 0
 ; SI-NEXT: .LBB45_1: ; %atomicrmw.start
 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1
 ; SI-NEXT: s_waitcnt vmcnt(0)
-; SI-NEXT: v_and_b32_e32 v1, s34, v4
+; SI-NEXT: v_and_b32_e32 v0, s34, v3
 ; SI-NEXT: s_waitcnt expcnt(0)
-; SI-NEXT: v_and_b32_e32 v5, s35, v3
-; SI-NEXT: v_not_b32_e32 v2, v1
-; SI-NEXT: v_not_b32_e32 v1, v5
-; SI-NEXT: v_mov_b32_e32 v8, v4
+; SI-NEXT: v_and_b32_e32 v4, s35, v2
+; SI-NEXT: v_not_b32_e32 v1, v0
+; SI-NEXT: v_not_b32_e32 v0, v4
 ; SI-NEXT: v_mov_b32_e32 v7, v3
 ; SI-NEXT: v_mov_b32_e32 v6, v2
 ; SI-NEXT: v_mov_b32_e32 v5, v1
-; SI-NEXT: buffer_atomic_cmpswap_x2 v[5:8], off, s[4:7], 0 offset:32 glc
+; SI-NEXT: v_mov_b32_e32 v4, v0
+; SI-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[4:7], 0 offset:32 glc
 ; SI-NEXT: s_waitcnt vmcnt(0)
 ; SI-NEXT: buffer_wbinvl1
-; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[5:6], v[3:4]
+; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
 ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
+; SI-NEXT: v_mov_b32_e32 v2, v4
 ; SI-NEXT: v_mov_b32_e32 v3, v5
-; SI-NEXT: v_mov_b32_e32 v4, v6
 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37]
 ; SI-NEXT: s_cbranch_execnz .LBB45_1
 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end
 ; SI-NEXT: s_or_b64 exec, exec, s[36:37]
-; SI-NEXT: v_readlane_b32 s7, v0, 1
-; SI-NEXT: v_readlane_b32 s6, v0, 0
+; SI-NEXT: v_readlane_b32 s7, v8, 1
+; SI-NEXT: v_readlane_b32 s6, v8, 0
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; SI-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
 ; SI-NEXT: s_mov_b64 exec, s[34:35]
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0)
 ; SI-NEXT: s_setpc_b64 s[30:31]
@@ -2983,46 +2983,44 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_scalar(ptr addrspace(1) inreg
 ; SI: ; %bb.0:
 ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1
-; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill
+; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded
Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB46_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_and_b32_e32 v0, s34, v8 -; SI-NEXT: v_and_b32_e32 v1, s35, v7 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_not_b32_e32 v6, v0 -; SI-NEXT: v_not_b32_e32 v5, v1 +; SI-NEXT: v_and_b32_e32 v0, s34, v5 +; SI-NEXT: v_and_b32_e32 v1, s35, v4 +; SI-NEXT: v_not_b32_e32 v3, v0 +; SI-NEXT: v_not_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB46_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3090,46 +3088,44 @@ define amdgpu_gfx i64 @global_atomic_nand_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v6, s6, 0 +; SI-NEXT: v_writelane_b32 v6, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 ; SI-NEXT: .LBB47_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_and_b32_e32 v0, s34, v8 -; SI-NEXT: v_and_b32_e32 v1, s35, v7 +; SI-NEXT: v_mov_b32_e32 v5, v1 +; SI-NEXT: v_mov_b32_e32 v4, v0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_not_b32_e32 v6, v0 -; SI-NEXT: v_not_b32_e32 
v5, v1 +; SI-NEXT: v_and_b32_e32 v0, s34, v5 +; SI-NEXT: v_and_b32_e32 v1, s35, v4 +; SI-NEXT: v_not_b32_e32 v3, v0 +; SI-NEXT: v_not_b32_e32 v2, v1 +; SI-NEXT: v_mov_b32_e32 v0, v2 +; SI-NEXT: v_mov_b32_e32 v1, v3 +; SI-NEXT: v_mov_b32_e32 v2, v4 ; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[4:5] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB47_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v6, 1 +; SI-NEXT: v_readlane_b32 s6, v6, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3344,25 +3340,25 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3398,23 +3394,23 @@ define amdgpu_gfx void @global_atomic_or_i64_noret_offset_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 
+; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_or_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_or_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3708,25 +3704,25 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -3762,23 +3758,23 @@ define amdgpu_gfx void @global_atomic_xor_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_xor_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_xor_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte 
Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4294,45 +4290,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB68_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB68_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4402,45 +4398,45 @@ define amdgpu_gfx void @global_atomic_max_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 
0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB69_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB69_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4511,47 +4507,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB70_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 
v[3:6], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB70_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -4621,47 +4615,45 @@ define amdgpu_gfx i64 @global_atomic_max_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB71_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB71_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: 
v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5576,45 +5568,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB80_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB80_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5684,45 +5676,45 @@ define amdgpu_gfx void @global_atomic_umax_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, 
s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB81_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB81_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5793,47 +5785,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB82_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: 
s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB82_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -5903,47 +5893,45 @@ define amdgpu_gfx i64 @global_atomic_umax_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB83_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_lt_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] 
; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB83_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6746,45 +6734,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_scalar(ptr addrspace(1) inr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB91_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB91_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6854,45 +6842,45 @@ define amdgpu_gfx void @global_atomic_umin_i64_noret_offset_scalar(ptr addrspace ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 
4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB92_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB92_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -6963,47 +6951,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_scalar(ptr addrspace(1) inreg ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB93_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; 
SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB93_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7073,47 +7059,45 @@ define amdgpu_gfx i64 @global_atomic_umin_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB94_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_u64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; 
SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB94_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7552,45 +7536,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_scalar(ptr addrspace(1) inre ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB99_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB99_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 
s[30:31] @@ -7660,45 +7644,45 @@ define amdgpu_gfx void @global_atomic_min_i64_noret_offset_scalar(ptr addrspace( ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v5, s35 -; SI-NEXT: v_mov_b32_e32 v6, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB100_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[3:4] -; SI-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc -; SI-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[2:3] +; SI-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc +; SI-NEXT: v_cndmask_b32_e32 v0, v5, v2, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_mov_b32_e32 v10, v4 ; SI-NEXT: v_mov_b32_e32 v9, v3 ; SI-NEXT: v_mov_b32_e32 v8, v2 ; SI-NEXT: v_mov_b32_e32 v7, v1 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[7:10], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v6, v0 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[7:8], v[3:4] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] +; SI-NEXT: v_mov_b32_e32 v2, v6 ; SI-NEXT: v_mov_b32_e32 v3, v7 -; SI-NEXT: v_mov_b32_e32 v4, v8 ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB100_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7769,47 +7753,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_scalar(ptr addrspace(1) inreg % ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: 
v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB101_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB101_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -7879,47 +7861,45 @@ define amdgpu_gfx i64 @global_atomic_min_i64_ret_offset_scalar(ptr addrspace(1) ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v10, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v2, s6, 0 -; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_writelane_b32 v10, s6, 0 +; SI-NEXT: v_writelane_b32 v10, s7, 1 ; SI-NEXT: s_mov_b32 s35, s7 ; SI-NEXT: s_mov_b32 s34, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: buffer_load_dwordx2 v[3:4], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_mov_b64 s[36:37], 0 -; SI-NEXT: v_mov_b32_e32 v0, s35 -; SI-NEXT: v_mov_b32_e32 v1, s34 +; SI-NEXT: v_mov_b32_e32 v4, s35 +; SI-NEXT: v_mov_b32_e32 v5, s34 ; SI-NEXT: .LBB102_1: ; %atomicrmw.start ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_mov_b32_e32 v8, v4 -; SI-NEXT: v_mov_b32_e32 v7, v3 -; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[7:8] +; SI-NEXT: v_mov_b32_e32 v9, v1 +; SI-NEXT: v_mov_b32_e32 v8, v0 +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[34:35], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; SI-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e32 v6, v0, v8, vcc -; SI-NEXT: v_cndmask_b32_e32 v5, v1, v7, vcc -; SI-NEXT: v_mov_b32_e32 v3, v5 -; SI-NEXT: v_mov_b32_e32 v4, 
v6 -; SI-NEXT: v_mov_b32_e32 v5, v7 -; SI-NEXT: v_mov_b32_e32 v6, v8 -; SI-NEXT: buffer_atomic_cmpswap_x2 v[3:6], off, s[4:7], 0 offset:32 glc +; SI-NEXT: v_mov_b32_e32 v0, v6 +; SI-NEXT: v_mov_b32_e32 v1, v7 +; SI-NEXT: v_mov_b32_e32 v2, v8 +; SI-NEXT: v_mov_b32_e32 v3, v9 +; SI-NEXT: buffer_atomic_cmpswap_x2 v[0:3], off, s[4:7], 0 offset:32 glc ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[3:4], v[7:8] +; SI-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] ; SI-NEXT: s_or_b64 s[36:37], vcc, s[36:37] ; SI-NEXT: s_andn2_b64 exec, exec, s[36:37] ; SI-NEXT: s_cbranch_execnz .LBB102_1 ; SI-NEXT: ; %bb.2: ; %atomicrmw.end ; SI-NEXT: s_or_b64 exec, exec, s[36:37] -; SI-NEXT: v_mov_b32_e32 v0, v3 -; SI-NEXT: v_mov_b32_e32 v1, v4 -; SI-NEXT: v_readlane_b32 s7, v2, 1 -; SI-NEXT: v_readlane_b32 s6, v2, 0 +; SI-NEXT: v_readlane_b32 s7, v10, 1 +; SI-NEXT: v_readlane_b32 s6, v10, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v10, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8601,25 +8581,25 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8655,23 +8635,23 @@ define amdgpu_gfx void @global_atomic_uinc_wrap_i64_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_inc_x2 v[1:2], off, 
s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_inc_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -8965,25 +8945,25 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_scalar(ptr addrspace(1 ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 ; SI-NEXT: s_mov_b32 s34, s7 ; SI-NEXT: s_mov_b32 s35, s6 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 -; SI-NEXT: v_mov_b32_e32 v1, s35 -; SI-NEXT: v_mov_b32_e32 v2, s34 +; SI-NEXT: v_mov_b32_e32 v0, s35 +; SI-NEXT: v_mov_b32_e32 v1, s34 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0 +; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] @@ -9019,23 +8999,23 @@ define amdgpu_gfx void @global_atomic_udec_wrap_i64_noret_offset_scalar(ptr addr ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; SI-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_writelane_b32 v0, s6, 0 -; SI-NEXT: v_writelane_b32 v0, s7, 1 -; SI-NEXT: v_mov_b32_e32 v1, s6 -; SI-NEXT: v_mov_b32_e32 v2, s7 +; SI-NEXT: v_writelane_b32 v2, s6, 0 +; SI-NEXT: v_writelane_b32 v2, s7, 1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_atomic_dec_x2 v[1:2], off, s[4:7], 0 offset:32 +; SI-NEXT: buffer_atomic_dec_x2 v[0:1], off, s[4:7], 0 offset:32 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_wbinvl1 -; SI-NEXT: v_readlane_b32 s7, v0, 1 -; SI-NEXT: v_readlane_b32 s6, v0, 0 +; SI-NEXT: v_readlane_b32 s7, v2, 1 +; SI-NEXT: v_readlane_b32 s6, v2, 0 ; SI-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; SI-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; SI-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; SI-NEXT: s_mov_b64 exec, s[34:35] ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; SI-NEXT: s_setpc_b64 s[30:31] diff --git 
a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index e456b7d2e8b9b..297b5180dfe9b 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -6,209 +6,209 @@ define void @main(i1 %arg) #0 { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: v_writelane_b32 v1, s30, 0 -; CHECK-NEXT: v_writelane_b32 v1, s31, 1 -; CHECK-NEXT: v_writelane_b32 v1, s36, 2 -; CHECK-NEXT: v_writelane_b32 v1, s37, 3 -; CHECK-NEXT: v_writelane_b32 v1, s38, 4 -; CHECK-NEXT: v_writelane_b32 v1, s39, 5 -; CHECK-NEXT: v_writelane_b32 v1, s40, 6 -; CHECK-NEXT: v_writelane_b32 v1, s41, 7 -; CHECK-NEXT: v_writelane_b32 v1, s42, 8 -; CHECK-NEXT: v_writelane_b32 v1, s43, 9 -; CHECK-NEXT: v_writelane_b32 v1, s44, 10 -; CHECK-NEXT: v_writelane_b32 v1, s45, 11 -; CHECK-NEXT: v_writelane_b32 v1, s46, 12 -; CHECK-NEXT: v_writelane_b32 v1, s47, 13 -; CHECK-NEXT: v_writelane_b32 v1, s48, 14 -; CHECK-NEXT: v_writelane_b32 v1, s49, 15 +; CHECK-NEXT: v_writelane_b32 v8, s30, 0 +; CHECK-NEXT: v_writelane_b32 v8, s31, 1 +; CHECK-NEXT: v_writelane_b32 v8, s36, 2 +; CHECK-NEXT: v_writelane_b32 v8, s37, 3 +; CHECK-NEXT: v_writelane_b32 v8, s38, 4 +; CHECK-NEXT: v_writelane_b32 v8, s39, 5 +; CHECK-NEXT: v_writelane_b32 v8, s40, 6 +; CHECK-NEXT: v_writelane_b32 v8, s41, 7 +; CHECK-NEXT: v_writelane_b32 v8, s42, 8 +; CHECK-NEXT: v_writelane_b32 v8, s43, 9 +; CHECK-NEXT: v_writelane_b32 v8, s44, 10 +; CHECK-NEXT: v_writelane_b32 v8, s45, 11 +; CHECK-NEXT: v_writelane_b32 v8, s46, 12 +; CHECK-NEXT: v_writelane_b32 v8, s47, 13 +; CHECK-NEXT: v_writelane_b32 v8, s48, 14 +; CHECK-NEXT: v_writelane_b32 v8, s49, 15 ; CHECK-NEXT: s_getpc_b64 s[24:25] -; CHECK-NEXT: v_writelane_b32 v1, s50, 16 +; CHECK-NEXT: v_writelane_b32 v8, s50, 16 ; CHECK-NEXT: s_movk_i32 s4, 0xf0 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v1, s51, 17 +; CHECK-NEXT: v_writelane_b32 v8, s51, 17 ; CHECK-NEXT: s_load_dwordx16 s[36:51], s[4:5], 0x0 -; CHECK-NEXT: ; implicit-def: $vgpr5 : SGPR spill to VGPR lane +; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane ; CHECK-NEXT: s_mov_b64 s[4:5], 0 ; CHECK-NEXT: s_load_dwordx4 s[28:31], s[4:5], 0x0 ; CHECK-NEXT: s_movk_i32 s4, 0x130 ; CHECK-NEXT: s_mov_b32 s5, s24 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v5, s36, 0 -; CHECK-NEXT: v_writelane_b32 v5, s37, 1 -; CHECK-NEXT: v_writelane_b32 v5, s38, 2 -; CHECK-NEXT: v_writelane_b32 v5, s39, 3 -; CHECK-NEXT: v_writelane_b32 v5, s40, 4 -; CHECK-NEXT: v_writelane_b32 v5, s41, 5 -; CHECK-NEXT: v_writelane_b32 v5, s42, 6 -; CHECK-NEXT: v_writelane_b32 v5, s43, 7 -; CHECK-NEXT: v_writelane_b32 v5, s44, 8 -; CHECK-NEXT: v_writelane_b32 v5, s45, 9 -; CHECK-NEXT: v_writelane_b32 v5, s46, 10 +; CHECK-NEXT: v_writelane_b32 v4, s36, 0 +; CHECK-NEXT: v_writelane_b32 v4, s37, 1 
+; CHECK-NEXT: v_writelane_b32 v4, s38, 2 +; CHECK-NEXT: v_writelane_b32 v4, s39, 3 +; CHECK-NEXT: v_writelane_b32 v4, s40, 4 +; CHECK-NEXT: v_writelane_b32 v4, s41, 5 +; CHECK-NEXT: v_writelane_b32 v4, s42, 6 +; CHECK-NEXT: v_writelane_b32 v4, s43, 7 +; CHECK-NEXT: v_writelane_b32 v4, s44, 8 +; CHECK-NEXT: v_writelane_b32 v4, s45, 9 +; CHECK-NEXT: v_writelane_b32 v4, s46, 10 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v5, s47, 11 -; CHECK-NEXT: v_writelane_b32 v5, s48, 12 -; CHECK-NEXT: v_writelane_b32 v5, s49, 13 +; CHECK-NEXT: v_writelane_b32 v4, s47, 11 +; CHECK-NEXT: v_writelane_b32 v4, s48, 12 +; CHECK-NEXT: v_writelane_b32 v4, s49, 13 ; CHECK-NEXT: s_mov_b32 s20, 0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 -; CHECK-NEXT: v_writelane_b32 v5, s50, 14 -; CHECK-NEXT: v_mov_b32_e32 v6, s28 -; CHECK-NEXT: v_mov_b32_e32 v7, v2 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_writelane_b32 v4, s50, 14 +; CHECK-NEXT: v_mov_b32_e32 v5, s28 +; CHECK-NEXT: v_mov_b32_e32 v6, v1 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_writelane_b32 v5, s51, 15 -; CHECK-NEXT: v_mov_b32_e32 v3, v2 -; CHECK-NEXT: image_sample_lz v6, v[6:7], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v4, s51, 15 +; CHECK-NEXT: v_mov_b32_e32 v2, v1 +; CHECK-NEXT: image_sample_lz v5, v[5:6], s[44:51], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v5, s4, 16 -; CHECK-NEXT: v_writelane_b32 v5, s5, 17 -; CHECK-NEXT: v_writelane_b32 v5, s6, 18 -; CHECK-NEXT: v_writelane_b32 v5, s7, 19 -; CHECK-NEXT: v_writelane_b32 v5, s8, 20 -; CHECK-NEXT: v_writelane_b32 v5, s9, 21 -; CHECK-NEXT: image_sample_lz v7, v[2:3], s[4:11], s[20:23] dmask:0x1 -; CHECK-NEXT: v_writelane_b32 v5, s10, 22 -; CHECK-NEXT: v_writelane_b32 v5, s11, 23 -; CHECK-NEXT: v_writelane_b32 v5, s12, 24 -; CHECK-NEXT: v_writelane_b32 v5, s13, 25 -; CHECK-NEXT: v_writelane_b32 v5, s14, 26 -; CHECK-NEXT: v_writelane_b32 v5, s15, 27 -; CHECK-NEXT: v_writelane_b32 v5, s16, 28 -; CHECK-NEXT: v_writelane_b32 v1, s52, 18 -; CHECK-NEXT: v_writelane_b32 v5, s17, 29 -; CHECK-NEXT: v_writelane_b32 v1, s53, 19 -; CHECK-NEXT: v_writelane_b32 v5, s18, 30 -; CHECK-NEXT: v_writelane_b32 v1, s54, 20 -; CHECK-NEXT: v_writelane_b32 v5, s19, 31 +; CHECK-NEXT: v_writelane_b32 v4, s4, 16 +; CHECK-NEXT: v_writelane_b32 v4, s5, 17 +; CHECK-NEXT: v_writelane_b32 v4, s6, 18 +; CHECK-NEXT: v_writelane_b32 v4, s7, 19 +; CHECK-NEXT: v_writelane_b32 v4, s8, 20 +; CHECK-NEXT: v_writelane_b32 v4, s9, 21 +; CHECK-NEXT: image_sample_lz v6, v[1:2], s[4:11], s[20:23] dmask:0x1 +; CHECK-NEXT: v_writelane_b32 v4, s10, 22 +; CHECK-NEXT: v_writelane_b32 v4, s11, 23 +; CHECK-NEXT: v_writelane_b32 v4, s12, 24 +; CHECK-NEXT: v_writelane_b32 v4, s13, 25 +; CHECK-NEXT: v_writelane_b32 v4, s14, 26 +; CHECK-NEXT: v_writelane_b32 v4, s15, 27 +; CHECK-NEXT: v_writelane_b32 v4, s16, 28 +; CHECK-NEXT: v_writelane_b32 v8, s52, 18 +; CHECK-NEXT: v_writelane_b32 v4, s17, 29 +; CHECK-NEXT: v_writelane_b32 v8, s53, 19 +; CHECK-NEXT: v_writelane_b32 v4, s18, 30 +; CHECK-NEXT: v_writelane_b32 v8, s54, 20 +; CHECK-NEXT: v_writelane_b32 v4, s19, 31 ; CHECK-NEXT: s_mov_b32 s4, 48 ; CHECK-NEXT: s_mov_b32 s5, s24 -; CHECK-NEXT: v_writelane_b32 v1, s55, 21 +; CHECK-NEXT: v_writelane_b32 v8, s55, 21 ; CHECK-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 -; CHECK-NEXT: v_writelane_b32 v1, s56, 22 -; CHECK-NEXT: v_writelane_b32 v1, s57, 23 -; CHECK-NEXT: v_writelane_b32 v1, s58, 24 -; 
CHECK-NEXT: v_writelane_b32 v1, s59, 25 -; CHECK-NEXT: v_writelane_b32 v1, s60, 26 +; CHECK-NEXT: v_writelane_b32 v8, s56, 22 +; CHECK-NEXT: v_writelane_b32 v8, s57, 23 +; CHECK-NEXT: v_writelane_b32 v8, s58, 24 +; CHECK-NEXT: v_writelane_b32 v8, s59, 25 +; CHECK-NEXT: v_writelane_b32 v8, s60, 26 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v5, s4, 32 -; CHECK-NEXT: v_writelane_b32 v1, s61, 27 -; CHECK-NEXT: v_writelane_b32 v5, s5, 33 -; CHECK-NEXT: v_writelane_b32 v1, s62, 28 -; CHECK-NEXT: v_writelane_b32 v5, s6, 34 -; CHECK-NEXT: v_writelane_b32 v1, s63, 29 -; CHECK-NEXT: v_writelane_b32 v5, s7, 35 -; CHECK-NEXT: v_writelane_b32 v1, s64, 30 -; CHECK-NEXT: v_writelane_b32 v5, s8, 36 -; CHECK-NEXT: v_writelane_b32 v1, s65, 31 -; CHECK-NEXT: v_writelane_b32 v5, s9, 37 -; CHECK-NEXT: v_writelane_b32 v1, s66, 32 +; CHECK-NEXT: v_writelane_b32 v4, s4, 32 +; CHECK-NEXT: v_writelane_b32 v8, s61, 27 +; CHECK-NEXT: v_writelane_b32 v4, s5, 33 +; CHECK-NEXT: v_writelane_b32 v8, s62, 28 +; CHECK-NEXT: v_writelane_b32 v4, s6, 34 +; CHECK-NEXT: v_writelane_b32 v8, s63, 29 +; CHECK-NEXT: v_writelane_b32 v4, s7, 35 +; CHECK-NEXT: v_writelane_b32 v8, s64, 30 +; CHECK-NEXT: v_writelane_b32 v4, s8, 36 +; CHECK-NEXT: v_writelane_b32 v8, s65, 31 +; CHECK-NEXT: v_writelane_b32 v4, s9, 37 +; CHECK-NEXT: v_writelane_b32 v8, s66, 32 ; CHECK-NEXT: s_movk_i32 s26, 0x1f0 ; CHECK-NEXT: s_movk_i32 s28, 0x2f0 ; CHECK-NEXT: s_mov_b32 s27, s24 ; CHECK-NEXT: s_mov_b32 s29, s24 -; CHECK-NEXT: v_writelane_b32 v5, s10, 38 -; CHECK-NEXT: v_writelane_b32 v1, s67, 33 -; CHECK-NEXT: v_writelane_b32 v5, s11, 39 +; CHECK-NEXT: v_writelane_b32 v4, s10, 38 +; CHECK-NEXT: v_writelane_b32 v8, s67, 33 +; CHECK-NEXT: v_writelane_b32 v4, s11, 39 ; CHECK-NEXT: s_load_dwordx16 s[52:67], s[26:27], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[4:19], s[28:29], 0x0 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_xor_b64 s[24:25], vcc, -1 -; CHECK-NEXT: ; implicit-def: $vgpr4 : SGPR spill to VGPR lane +; CHECK-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_mul_f32_e32 v0, v7, v6 +; CHECK-NEXT: v_mul_f32_e32 v0, v6, v5 ; CHECK-NEXT: s_and_saveexec_b64 s[26:27], s[24:25] ; CHECK-NEXT: s_xor_b64 s[26:27], exec, s[26:27] ; CHECK-NEXT: s_cbranch_execz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb48 -; CHECK-NEXT: v_readlane_b32 s36, v5, 0 -; CHECK-NEXT: v_readlane_b32 s44, v5, 8 -; CHECK-NEXT: v_readlane_b32 s45, v5, 9 -; CHECK-NEXT: v_readlane_b32 s46, v5, 10 -; CHECK-NEXT: v_readlane_b32 s47, v5, 11 -; CHECK-NEXT: v_readlane_b32 s48, v5, 12 -; CHECK-NEXT: v_readlane_b32 s49, v5, 13 -; CHECK-NEXT: v_readlane_b32 s50, v5, 14 -; CHECK-NEXT: v_readlane_b32 s51, v5, 15 +; CHECK-NEXT: v_readlane_b32 s36, v4, 0 +; CHECK-NEXT: v_readlane_b32 s44, v4, 8 +; CHECK-NEXT: v_readlane_b32 s45, v4, 9 +; CHECK-NEXT: v_readlane_b32 s46, v4, 10 +; CHECK-NEXT: v_readlane_b32 s47, v4, 11 +; CHECK-NEXT: v_readlane_b32 s48, v4, 12 +; CHECK-NEXT: v_readlane_b32 s49, v4, 13 +; CHECK-NEXT: v_readlane_b32 s50, v4, 14 +; CHECK-NEXT: v_readlane_b32 s51, v4, 15 ; CHECK-NEXT: s_and_b64 vcc, exec, -1 -; CHECK-NEXT: v_readlane_b32 s37, v5, 1 -; CHECK-NEXT: v_readlane_b32 s38, v5, 2 -; CHECK-NEXT: v_readlane_b32 s39, v5, 3 -; CHECK-NEXT: v_readlane_b32 s40, v5, 4 -; CHECK-NEXT: image_sample_lz v6, v[2:3], s[44:51], s[20:23] dmask:0x1 -; CHECK-NEXT: v_mov_b32_e32 v3, 0 -; CHECK-NEXT: v_readlane_b32 s41, v5, 5 -; CHECK-NEXT: v_readlane_b32 s42, v5, 6 -; CHECK-NEXT: 
v_readlane_b32 s43, v5, 7 +; CHECK-NEXT: v_readlane_b32 s37, v4, 1 +; CHECK-NEXT: v_readlane_b32 s38, v4, 2 +; CHECK-NEXT: v_readlane_b32 s39, v4, 3 +; CHECK-NEXT: v_readlane_b32 s40, v4, 4 +; CHECK-NEXT: image_sample_lz v5, v[1:2], s[44:51], s[20:23] dmask:0x1 +; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_readlane_b32 s41, v4, 5 +; CHECK-NEXT: v_readlane_b32 s42, v4, 6 +; CHECK-NEXT: v_readlane_b32 s43, v4, 7 ; CHECK-NEXT: .LBB0_2: ; %bb50 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_readlane_b32 s36, v5, 32 -; CHECK-NEXT: v_readlane_b32 s40, v5, 36 -; CHECK-NEXT: v_readlane_b32 s41, v5, 37 -; CHECK-NEXT: v_readlane_b32 s42, v5, 38 -; CHECK-NEXT: v_readlane_b32 s43, v5, 39 +; CHECK-NEXT: v_readlane_b32 s36, v4, 32 +; CHECK-NEXT: v_readlane_b32 s40, v4, 36 +; CHECK-NEXT: v_readlane_b32 s41, v4, 37 +; CHECK-NEXT: v_readlane_b32 s42, v4, 38 +; CHECK-NEXT: v_readlane_b32 s43, v4, 39 ; CHECK-NEXT: s_mov_b32 s21, s20 ; CHECK-NEXT: s_mov_b32 s22, s20 ; CHECK-NEXT: s_mov_b32 s23, s20 -; CHECK-NEXT: v_readlane_b32 s37, v5, 33 -; CHECK-NEXT: v_readlane_b32 s38, v5, 34 +; CHECK-NEXT: v_readlane_b32 s37, v4, 33 +; CHECK-NEXT: v_readlane_b32 s38, v4, 34 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: image_sample_lz v7, v[2:3], s[60:67], s[40:43] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s39, v5, 35 -; CHECK-NEXT: image_sample_lz v2, v[2:3], s[12:19], s[20:23] dmask:0x1 +; CHECK-NEXT: image_sample_lz v6, v[1:2], s[60:67], s[40:43] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s39, v4, 35 +; CHECK-NEXT: image_sample_lz v1, v[1:2], s[12:19], s[20:23] dmask:0x1 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v2, v2, v7 -; CHECK-NEXT: v_mul_f32_e32 v2, v2, v0 -; CHECK-NEXT: v_mul_f32_e32 v2, v2, v6 +; CHECK-NEXT: v_sub_f32_e32 v1, v1, v6 +; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0 +; CHECK-NEXT: v_mul_f32_e32 v1, v1, v5 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccnz .LBB0_2 ; CHECK-NEXT: .LBB0_3: ; %Flow14 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_readlane_b32 s12, v5, 32 -; CHECK-NEXT: v_readlane_b32 s13, v5, 33 -; CHECK-NEXT: v_readlane_b32 s14, v5, 34 -; CHECK-NEXT: v_readlane_b32 s15, v5, 35 -; CHECK-NEXT: v_readlane_b32 s16, v5, 36 -; CHECK-NEXT: v_readlane_b32 s17, v5, 37 -; CHECK-NEXT: v_readlane_b32 s18, v5, 38 -; CHECK-NEXT: v_readlane_b32 s19, v5, 39 -; CHECK-NEXT: v_writelane_b32 v5, s4, 40 -; CHECK-NEXT: v_writelane_b32 v5, s5, 41 -; CHECK-NEXT: v_writelane_b32 v5, s6, 42 -; CHECK-NEXT: v_writelane_b32 v5, s7, 43 -; CHECK-NEXT: v_writelane_b32 v5, s8, 44 -; CHECK-NEXT: v_writelane_b32 v5, s9, 45 -; CHECK-NEXT: v_writelane_b32 v5, s10, 46 -; CHECK-NEXT: v_writelane_b32 v5, s11, 47 -; CHECK-NEXT: v_writelane_b32 v5, s12, 48 -; CHECK-NEXT: v_writelane_b32 v5, s13, 49 -; CHECK-NEXT: v_writelane_b32 v5, s14, 50 -; CHECK-NEXT: v_writelane_b32 v5, s15, 51 -; CHECK-NEXT: v_writelane_b32 v5, s16, 52 -; CHECK-NEXT: v_writelane_b32 v5, s17, 53 -; CHECK-NEXT: v_writelane_b32 v5, s18, 54 -; CHECK-NEXT: v_writelane_b32 v5, s19, 55 -; CHECK-NEXT: v_writelane_b32 v5, s52, 56 -; CHECK-NEXT: v_writelane_b32 v4, s60, 0 -; CHECK-NEXT: v_writelane_b32 v5, s53, 57 -; CHECK-NEXT: v_writelane_b32 v4, s61, 1 -; CHECK-NEXT: v_writelane_b32 v5, s54, 58 -; CHECK-NEXT: v_writelane_b32 v4, s62, 2 -; CHECK-NEXT: v_writelane_b32 v5, s55, 59 -; CHECK-NEXT: v_writelane_b32 v4, s63, 3 -; CHECK-NEXT: v_writelane_b32 v5, s56, 60 -; CHECK-NEXT: v_writelane_b32 v4, s64, 4 -; CHECK-NEXT: v_writelane_b32 v5, s57, 61 -; CHECK-NEXT: v_writelane_b32 v4, s65, 5 -; CHECK-NEXT: 
v_writelane_b32 v5, s58, 62 -; CHECK-NEXT: v_writelane_b32 v4, s66, 6 -; CHECK-NEXT: v_writelane_b32 v5, s59, 63 -; CHECK-NEXT: v_writelane_b32 v4, s67, 7 +; CHECK-NEXT: v_readlane_b32 s12, v4, 32 +; CHECK-NEXT: v_readlane_b32 s13, v4, 33 +; CHECK-NEXT: v_readlane_b32 s14, v4, 34 +; CHECK-NEXT: v_readlane_b32 s15, v4, 35 +; CHECK-NEXT: v_readlane_b32 s16, v4, 36 +; CHECK-NEXT: v_readlane_b32 s17, v4, 37 +; CHECK-NEXT: v_readlane_b32 s18, v4, 38 +; CHECK-NEXT: v_readlane_b32 s19, v4, 39 +; CHECK-NEXT: v_writelane_b32 v4, s4, 40 +; CHECK-NEXT: v_writelane_b32 v4, s5, 41 +; CHECK-NEXT: v_writelane_b32 v4, s6, 42 +; CHECK-NEXT: v_writelane_b32 v4, s7, 43 +; CHECK-NEXT: v_writelane_b32 v4, s8, 44 +; CHECK-NEXT: v_writelane_b32 v4, s9, 45 +; CHECK-NEXT: v_writelane_b32 v4, s10, 46 +; CHECK-NEXT: v_writelane_b32 v4, s11, 47 +; CHECK-NEXT: v_writelane_b32 v4, s12, 48 +; CHECK-NEXT: v_writelane_b32 v4, s13, 49 +; CHECK-NEXT: v_writelane_b32 v4, s14, 50 +; CHECK-NEXT: v_writelane_b32 v4, s15, 51 +; CHECK-NEXT: v_writelane_b32 v4, s16, 52 +; CHECK-NEXT: v_writelane_b32 v4, s17, 53 +; CHECK-NEXT: v_writelane_b32 v4, s18, 54 +; CHECK-NEXT: v_writelane_b32 v4, s19, 55 +; CHECK-NEXT: v_writelane_b32 v4, s52, 56 +; CHECK-NEXT: v_writelane_b32 v3, s60, 0 +; CHECK-NEXT: v_writelane_b32 v4, s53, 57 +; CHECK-NEXT: v_writelane_b32 v3, s61, 1 +; CHECK-NEXT: v_writelane_b32 v4, s54, 58 +; CHECK-NEXT: v_writelane_b32 v3, s62, 2 +; CHECK-NEXT: v_writelane_b32 v4, s55, 59 +; CHECK-NEXT: v_writelane_b32 v3, s63, 3 +; CHECK-NEXT: v_writelane_b32 v4, s56, 60 +; CHECK-NEXT: v_writelane_b32 v3, s64, 4 +; CHECK-NEXT: v_writelane_b32 v4, s57, 61 +; CHECK-NEXT: v_writelane_b32 v3, s65, 5 +; CHECK-NEXT: v_writelane_b32 v4, s58, 62 +; CHECK-NEXT: v_writelane_b32 v3, s66, 6 +; CHECK-NEXT: v_writelane_b32 v4, s59, 63 +; CHECK-NEXT: v_writelane_b32 v3, s67, 7 ; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[26:27] ; CHECK-NEXT: s_cbranch_execz .LBB0_10 ; CHECK-NEXT: ; %bb.4: ; %bb32 @@ -218,102 +218,102 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: ; %bb.5: ; %bb43 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s9, s8 -; CHECK-NEXT: v_mov_b32_e32 v2, s8 -; CHECK-NEXT: v_readlane_b32 s36, v5, 0 -; CHECK-NEXT: v_mov_b32_e32 v3, s9 +; CHECK-NEXT: v_mov_b32_e32 v0, s8 +; CHECK-NEXT: v_readlane_b32 s36, v4, 0 +; CHECK-NEXT: v_mov_b32_e32 v1, s9 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_readlane_b32 s37, v5, 1 -; CHECK-NEXT: v_readlane_b32 s38, v5, 2 -; CHECK-NEXT: v_readlane_b32 s39, v5, 3 -; CHECK-NEXT: v_readlane_b32 s40, v5, 4 -; CHECK-NEXT: v_readlane_b32 s41, v5, 5 -; CHECK-NEXT: v_readlane_b32 s42, v5, 6 -; CHECK-NEXT: v_readlane_b32 s43, v5, 7 -; CHECK-NEXT: v_readlane_b32 s44, v5, 8 -; CHECK-NEXT: v_readlane_b32 s45, v5, 9 -; CHECK-NEXT: v_readlane_b32 s46, v5, 10 -; CHECK-NEXT: v_readlane_b32 s47, v5, 11 -; CHECK-NEXT: v_readlane_b32 s48, v5, 12 -; CHECK-NEXT: v_readlane_b32 s49, v5, 13 -; CHECK-NEXT: v_readlane_b32 s50, v5, 14 -; CHECK-NEXT: v_readlane_b32 s51, v5, 15 -; CHECK-NEXT: image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s36, v5, 16 -; CHECK-NEXT: v_readlane_b32 s44, v5, 24 -; CHECK-NEXT: v_readlane_b32 s45, v5, 25 -; CHECK-NEXT: v_readlane_b32 s46, v5, 26 -; CHECK-NEXT: v_readlane_b32 s47, v5, 27 -; CHECK-NEXT: v_readlane_b32 s48, v5, 28 -; CHECK-NEXT: v_readlane_b32 s49, v5, 29 -; CHECK-NEXT: v_readlane_b32 s50, v5, 30 -; CHECK-NEXT: v_readlane_b32 s51, v5, 31 -; CHECK-NEXT: v_mov_b32_e32 v7, 0 -; CHECK-NEXT: v_mov_b32_e32 v8, 
v7 -; CHECK-NEXT: v_readlane_b32 s37, v5, 17 -; CHECK-NEXT: v_readlane_b32 s38, v5, 18 -; CHECK-NEXT: v_readlane_b32 s39, v5, 19 -; CHECK-NEXT: image_sample_lz v2, v[2:3], s[44:51], s[12:15] dmask:0x1 -; CHECK-NEXT: v_readlane_b32 s40, v5, 20 -; CHECK-NEXT: v_readlane_b32 s41, v5, 21 -; CHECK-NEXT: v_readlane_b32 s42, v5, 22 -; CHECK-NEXT: v_readlane_b32 s43, v5, 23 -; CHECK-NEXT: ; implicit-def: $vgpr0 +; CHECK-NEXT: v_readlane_b32 s37, v4, 1 +; CHECK-NEXT: v_readlane_b32 s38, v4, 2 +; CHECK-NEXT: v_readlane_b32 s39, v4, 3 +; CHECK-NEXT: v_readlane_b32 s40, v4, 4 +; CHECK-NEXT: v_readlane_b32 s41, v4, 5 +; CHECK-NEXT: v_readlane_b32 s42, v4, 6 +; CHECK-NEXT: v_readlane_b32 s43, v4, 7 +; CHECK-NEXT: v_readlane_b32 s44, v4, 8 +; CHECK-NEXT: v_readlane_b32 s45, v4, 9 +; CHECK-NEXT: v_readlane_b32 s46, v4, 10 +; CHECK-NEXT: v_readlane_b32 s47, v4, 11 +; CHECK-NEXT: v_readlane_b32 s48, v4, 12 +; CHECK-NEXT: v_readlane_b32 s49, v4, 13 +; CHECK-NEXT: v_readlane_b32 s50, v4, 14 +; CHECK-NEXT: v_readlane_b32 s51, v4, 15 +; CHECK-NEXT: image_sample_lz v5, v[0:1], s[36:43], s[8:11] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s36, v4, 16 +; CHECK-NEXT: v_readlane_b32 s44, v4, 24 +; CHECK-NEXT: v_readlane_b32 s45, v4, 25 +; CHECK-NEXT: v_readlane_b32 s46, v4, 26 +; CHECK-NEXT: v_readlane_b32 s47, v4, 27 +; CHECK-NEXT: v_readlane_b32 s48, v4, 28 +; CHECK-NEXT: v_readlane_b32 s49, v4, 29 +; CHECK-NEXT: v_readlane_b32 s50, v4, 30 +; CHECK-NEXT: v_readlane_b32 s51, v4, 31 +; CHECK-NEXT: v_mov_b32_e32 v6, 0 +; CHECK-NEXT: v_mov_b32_e32 v7, v6 +; CHECK-NEXT: v_readlane_b32 s37, v4, 17 +; CHECK-NEXT: v_readlane_b32 s38, v4, 18 +; CHECK-NEXT: v_readlane_b32 s39, v4, 19 +; CHECK-NEXT: image_sample_lz v0, v[0:1], s[44:51], s[12:15] dmask:0x1 +; CHECK-NEXT: v_readlane_b32 s40, v4, 20 +; CHECK-NEXT: v_readlane_b32 s41, v4, 21 +; CHECK-NEXT: v_readlane_b32 s42, v4, 22 +; CHECK-NEXT: v_readlane_b32 s43, v4, 23 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx3 v[6:8], off, s[8:11], 0 +; CHECK-NEXT: buffer_store_dwordx3 v[5:7], off, s[8:11], 0 ; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: buffer_store_dwordx4 v[2:5], off, s[8:11], 0 +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; CHECK-NEXT: ; implicit-def: $vgpr0 ; CHECK-NEXT: .LBB0_6: ; %Flow12 ; CHECK-NEXT: s_or_saveexec_b64 s[4:5], s[22:23] -; CHECK-NEXT: v_readlane_b32 s52, v5, 40 -; CHECK-NEXT: v_readlane_b32 s53, v5, 41 -; CHECK-NEXT: v_readlane_b32 s54, v5, 42 -; CHECK-NEXT: v_readlane_b32 s55, v5, 43 -; CHECK-NEXT: v_readlane_b32 s56, v5, 44 -; CHECK-NEXT: v_readlane_b32 s57, v5, 45 -; CHECK-NEXT: v_readlane_b32 s58, v5, 46 -; CHECK-NEXT: v_readlane_b32 s59, v5, 47 -; CHECK-NEXT: v_readlane_b32 s60, v5, 48 -; CHECK-NEXT: v_readlane_b32 s61, v5, 49 -; CHECK-NEXT: v_readlane_b32 s62, v5, 50 -; CHECK-NEXT: v_readlane_b32 s63, v5, 51 -; CHECK-NEXT: v_readlane_b32 s64, v5, 52 -; CHECK-NEXT: v_readlane_b32 s65, v5, 53 -; CHECK-NEXT: v_readlane_b32 s66, v5, 54 -; CHECK-NEXT: v_readlane_b32 s67, v5, 55 +; CHECK-NEXT: v_readlane_b32 s52, v4, 40 +; CHECK-NEXT: v_readlane_b32 s53, v4, 41 +; CHECK-NEXT: v_readlane_b32 s54, v4, 42 +; CHECK-NEXT: v_readlane_b32 s55, v4, 43 +; CHECK-NEXT: v_readlane_b32 s56, v4, 44 +; CHECK-NEXT: v_readlane_b32 s57, v4, 45 +; CHECK-NEXT: v_readlane_b32 s58, v4, 46 +; CHECK-NEXT: v_readlane_b32 s59, v4, 47 +; CHECK-NEXT: v_readlane_b32 s60, v4, 48 +; CHECK-NEXT: v_readlane_b32 s61, v4, 49 +; CHECK-NEXT: v_readlane_b32 s62, v4, 50 +; CHECK-NEXT: v_readlane_b32 s63, v4, 51 +; CHECK-NEXT: v_readlane_b32 s64, v4, 
52 +; CHECK-NEXT: v_readlane_b32 s65, v4, 53 +; CHECK-NEXT: v_readlane_b32 s66, v4, 54 +; CHECK-NEXT: v_readlane_b32 s67, v4, 55 ; CHECK-NEXT: s_xor_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_9 ; CHECK-NEXT: ; %bb.7: ; %bb33.preheader ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_readlane_b32 s36, v5, 56 +; CHECK-NEXT: v_mov_b32_e32 v1, s6 +; CHECK-NEXT: v_readlane_b32 s36, v4, 56 ; CHECK-NEXT: s_mov_b32 s9, s8 ; CHECK-NEXT: s_mov_b32 s10, s8 ; CHECK-NEXT: s_mov_b32 s11, s8 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 -; CHECK-NEXT: v_readlane_b32 s37, v5, 57 -; CHECK-NEXT: v_readlane_b32 s38, v5, 58 -; CHECK-NEXT: v_readlane_b32 s39, v5, 59 -; CHECK-NEXT: v_readlane_b32 s40, v5, 60 -; CHECK-NEXT: v_readlane_b32 s41, v5, 61 -; CHECK-NEXT: v_readlane_b32 s42, v5, 62 -; CHECK-NEXT: v_readlane_b32 s43, v5, 63 +; CHECK-NEXT: v_mov_b32_e32 v2, s7 +; CHECK-NEXT: v_readlane_b32 s37, v4, 57 +; CHECK-NEXT: v_readlane_b32 s38, v4, 58 +; CHECK-NEXT: v_readlane_b32 s39, v4, 59 +; CHECK-NEXT: v_readlane_b32 s40, v4, 60 +; CHECK-NEXT: v_readlane_b32 s41, v4, 61 +; CHECK-NEXT: v_readlane_b32 s42, v4, 62 +; CHECK-NEXT: v_readlane_b32 s43, v4, 63 ; CHECK-NEXT: s_nop 4 -; CHECK-NEXT: image_sample_lz v6, v[2:3], s[36:43], s[8:11] dmask:0x1 -; CHECK-NEXT: image_sample_lz v7, v[2:3], s[52:59], s[8:11] dmask:0x1 -; CHECK-NEXT: ; kill: killed $vgpr2_vgpr3 +; CHECK-NEXT: image_sample_lz v5, v[1:2], s[36:43], s[8:11] dmask:0x1 +; CHECK-NEXT: image_sample_lz v6, v[1:2], s[52:59], s[8:11] dmask:0x1 +; CHECK-NEXT: ; kill: killed $vgpr1_vgpr2 ; CHECK-NEXT: s_mov_b64 s[12:13], s[36:37] ; CHECK-NEXT: s_and_b64 vcc, exec, 0 -; CHECK-NEXT: v_readlane_b32 s44, v4, 0 -; CHECK-NEXT: v_readlane_b32 s45, v4, 1 -; CHECK-NEXT: v_readlane_b32 s46, v4, 2 -; CHECK-NEXT: v_readlane_b32 s47, v4, 3 -; CHECK-NEXT: v_readlane_b32 s48, v4, 4 -; CHECK-NEXT: v_readlane_b32 s49, v4, 5 -; CHECK-NEXT: v_readlane_b32 s50, v4, 6 -; CHECK-NEXT: v_readlane_b32 s51, v4, 7 +; CHECK-NEXT: v_readlane_b32 s44, v3, 0 +; CHECK-NEXT: v_readlane_b32 s45, v3, 1 +; CHECK-NEXT: v_readlane_b32 s46, v3, 2 +; CHECK-NEXT: v_readlane_b32 s47, v3, 3 +; CHECK-NEXT: v_readlane_b32 s48, v3, 4 +; CHECK-NEXT: v_readlane_b32 s49, v3, 5 +; CHECK-NEXT: v_readlane_b32 s50, v3, 6 +; CHECK-NEXT: v_readlane_b32 s51, v3, 7 ; CHECK-NEXT: s_mov_b64 s[14:15], s[38:39] ; CHECK-NEXT: s_mov_b64 s[16:17], s[40:41] ; CHECK-NEXT: s_mov_b64 s[18:19], s[42:43] @@ -321,59 +321,59 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: ; kill: killed $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 ; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9_sgpr10 killed $sgpr11 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_sub_f32_e32 v2, v7, v6 -; CHECK-NEXT: v_mul_f32_e32 v0, v2, v0 -; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_sub_f32_e32 v1, v6, v5 +; CHECK-NEXT: v_mul_f32_e32 v0, v1, v0 +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: .LBB0_8: ; %bb33 ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_f32_e32 v3, v2, v0 -; CHECK-NEXT: v_sub_f32_e32 v2, v2, v3 +; CHECK-NEXT: v_add_f32_e32 v2, v1, v0 +; CHECK-NEXT: v_sub_f32_e32 v1, v1, v2 ; CHECK-NEXT: s_mov_b64 vcc, vcc ; CHECK-NEXT: s_cbranch_vccz .LBB0_8 ; CHECK-NEXT: .LBB0_9: ; %Flow13 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock ; CHECK-NEXT: s_or_b64 exec, exec, s[20:21] -; CHECK-NEXT: v_readlane_b32 s67, v1, 33 -; CHECK-NEXT: v_readlane_b32 s66, v1, 32 -; CHECK-NEXT: 
v_readlane_b32 s65, v1, 31 -; CHECK-NEXT: v_readlane_b32 s64, v1, 30 -; CHECK-NEXT: v_readlane_b32 s63, v1, 29 -; CHECK-NEXT: v_readlane_b32 s62, v1, 28 -; CHECK-NEXT: v_readlane_b32 s61, v1, 27 -; CHECK-NEXT: v_readlane_b32 s60, v1, 26 -; CHECK-NEXT: v_readlane_b32 s59, v1, 25 -; CHECK-NEXT: v_readlane_b32 s58, v1, 24 -; CHECK-NEXT: v_readlane_b32 s57, v1, 23 -; CHECK-NEXT: v_readlane_b32 s56, v1, 22 -; CHECK-NEXT: v_readlane_b32 s55, v1, 21 -; CHECK-NEXT: v_readlane_b32 s54, v1, 20 -; CHECK-NEXT: v_readlane_b32 s53, v1, 19 -; CHECK-NEXT: v_readlane_b32 s52, v1, 18 -; CHECK-NEXT: v_readlane_b32 s51, v1, 17 -; CHECK-NEXT: v_readlane_b32 s50, v1, 16 -; CHECK-NEXT: v_readlane_b32 s49, v1, 15 -; CHECK-NEXT: v_readlane_b32 s48, v1, 14 -; CHECK-NEXT: v_readlane_b32 s47, v1, 13 -; CHECK-NEXT: v_readlane_b32 s46, v1, 12 -; CHECK-NEXT: v_readlane_b32 s45, v1, 11 -; CHECK-NEXT: v_readlane_b32 s44, v1, 10 -; CHECK-NEXT: v_readlane_b32 s43, v1, 9 -; CHECK-NEXT: v_readlane_b32 s42, v1, 8 -; CHECK-NEXT: v_readlane_b32 s41, v1, 7 -; CHECK-NEXT: v_readlane_b32 s40, v1, 6 -; CHECK-NEXT: v_readlane_b32 s39, v1, 5 -; CHECK-NEXT: v_readlane_b32 s38, v1, 4 -; CHECK-NEXT: v_readlane_b32 s37, v1, 3 -; CHECK-NEXT: v_readlane_b32 s36, v1, 2 -; CHECK-NEXT: v_readlane_b32 s31, v1, 1 -; CHECK-NEXT: v_readlane_b32 s30, v1, 0 -; CHECK-NEXT: ; kill: killed $vgpr5 +; CHECK-NEXT: v_readlane_b32 s67, v8, 33 +; CHECK-NEXT: v_readlane_b32 s66, v8, 32 +; CHECK-NEXT: v_readlane_b32 s65, v8, 31 +; CHECK-NEXT: v_readlane_b32 s64, v8, 30 +; CHECK-NEXT: v_readlane_b32 s63, v8, 29 +; CHECK-NEXT: v_readlane_b32 s62, v8, 28 +; CHECK-NEXT: v_readlane_b32 s61, v8, 27 +; CHECK-NEXT: v_readlane_b32 s60, v8, 26 +; CHECK-NEXT: v_readlane_b32 s59, v8, 25 +; CHECK-NEXT: v_readlane_b32 s58, v8, 24 +; CHECK-NEXT: v_readlane_b32 s57, v8, 23 +; CHECK-NEXT: v_readlane_b32 s56, v8, 22 +; CHECK-NEXT: v_readlane_b32 s55, v8, 21 +; CHECK-NEXT: v_readlane_b32 s54, v8, 20 +; CHECK-NEXT: v_readlane_b32 s53, v8, 19 +; CHECK-NEXT: v_readlane_b32 s52, v8, 18 +; CHECK-NEXT: v_readlane_b32 s51, v8, 17 +; CHECK-NEXT: v_readlane_b32 s50, v8, 16 +; CHECK-NEXT: v_readlane_b32 s49, v8, 15 +; CHECK-NEXT: v_readlane_b32 s48, v8, 14 +; CHECK-NEXT: v_readlane_b32 s47, v8, 13 +; CHECK-NEXT: v_readlane_b32 s46, v8, 12 +; CHECK-NEXT: v_readlane_b32 s45, v8, 11 +; CHECK-NEXT: v_readlane_b32 s44, v8, 10 +; CHECK-NEXT: v_readlane_b32 s43, v8, 9 +; CHECK-NEXT: v_readlane_b32 s42, v8, 8 +; CHECK-NEXT: v_readlane_b32 s41, v8, 7 +; CHECK-NEXT: v_readlane_b32 s40, v8, 6 +; CHECK-NEXT: v_readlane_b32 s39, v8, 5 +; CHECK-NEXT: v_readlane_b32 s38, v8, 4 +; CHECK-NEXT: v_readlane_b32 s37, v8, 3 +; CHECK-NEXT: v_readlane_b32 s36, v8, 2 +; CHECK-NEXT: v_readlane_b32 s31, v8, 1 +; CHECK-NEXT: v_readlane_b32 s30, v8, 0 ; CHECK-NEXT: ; kill: killed $vgpr4 +; CHECK-NEXT: ; kill: killed $vgpr3 ; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll 
b/llvm/test/CodeGen/AMDGPU/indirect-call.ll index 3aaf04c94cda5..408199bbc9223 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -1042,92 +1042,92 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_writelane_b32 v40, s50, 18 -; GCN-NEXT: v_writelane_b32 v40, s51, 19 -; GCN-NEXT: v_writelane_b32 v40, s52, 20 -; GCN-NEXT: v_writelane_b32 v40, s53, 21 -; GCN-NEXT: v_writelane_b32 v40, s54, 22 -; GCN-NEXT: v_writelane_b32 v40, s55, 23 -; GCN-NEXT: v_writelane_b32 v40, s56, 24 -; GCN-NEXT: v_writelane_b32 v40, s57, 25 -; GCN-NEXT: v_writelane_b32 v40, s58, 26 -; GCN-NEXT: v_writelane_b32 v40, s59, 27 -; GCN-NEXT: v_writelane_b32 v40, s60, 28 -; GCN-NEXT: v_writelane_b32 v40, s61, 29 -; GCN-NEXT: v_writelane_b32 v40, s62, 30 -; GCN-NEXT: v_writelane_b32 v40, s63, 31 -; GCN-NEXT: v_mov_b32_e32 v41, v0 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v41, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s31, 1 +; GCN-NEXT: v_writelane_b32 v41, s34, 2 +; GCN-NEXT: v_writelane_b32 v41, s35, 3 +; GCN-NEXT: v_writelane_b32 v41, s36, 4 +; GCN-NEXT: v_writelane_b32 v41, s37, 5 +; GCN-NEXT: v_writelane_b32 v41, s38, 6 +; GCN-NEXT: v_writelane_b32 v41, s39, 7 +; GCN-NEXT: v_writelane_b32 v41, s40, 8 +; GCN-NEXT: v_writelane_b32 v41, s41, 9 +; GCN-NEXT: v_writelane_b32 v41, s42, 10 +; GCN-NEXT: v_writelane_b32 v41, s43, 11 +; GCN-NEXT: v_writelane_b32 v41, s44, 12 +; GCN-NEXT: v_writelane_b32 v41, s45, 13 +; GCN-NEXT: v_writelane_b32 v41, s46, 14 +; GCN-NEXT: v_writelane_b32 v41, s47, 15 +; GCN-NEXT: v_writelane_b32 v41, s48, 16 +; GCN-NEXT: v_writelane_b32 v41, s49, 17 +; GCN-NEXT: v_writelane_b32 v41, s50, 18 +; GCN-NEXT: v_writelane_b32 v41, s51, 19 +; GCN-NEXT: v_writelane_b32 v41, s52, 20 +; GCN-NEXT: v_writelane_b32 v41, s53, 21 +; GCN-NEXT: v_writelane_b32 v41, s54, 22 +; GCN-NEXT: v_writelane_b32 v41, s55, 23 +; GCN-NEXT: v_writelane_b32 v41, s56, 24 +; GCN-NEXT: v_writelane_b32 v41, s57, 25 +; GCN-NEXT: v_writelane_b32 v41, s58, 26 +; GCN-NEXT: v_writelane_b32 v41, s59, 27 +; GCN-NEXT: v_writelane_b32 v41, s60, 28 +; GCN-NEXT: v_writelane_b32 v41, s61, 29 +; GCN-NEXT: v_writelane_b32 v41, s62, 30 +; GCN-NEXT: v_writelane_b32 v41, s63, 31 +; GCN-NEXT: v_mov_b32_e32 v40, v0 ; GCN-NEXT: s_mov_b64 s[4:5], exec ; 
GCN-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GCN-NEXT: v_readfirstlane_b32 s6, v1 ; GCN-NEXT: v_readfirstlane_b32 s7, v2 ; GCN-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] ; GCN-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GCN-NEXT: v_mov_b32_e32 v0, v41 +; GCN-NEXT: v_mov_b32_e32 v0, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GCN-NEXT: ; implicit-def: $vgpr1_vgpr2 ; GCN-NEXT: s_xor_b64 exec, exec, s[8:9] ; GCN-NEXT: s_cbranch_execnz .LBB7_1 ; GCN-NEXT: ; %bb.2: ; GCN-NEXT: s_mov_b64 exec, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v0, v41 -; GCN-NEXT: v_readlane_b32 s63, v40, 31 -; GCN-NEXT: v_readlane_b32 s62, v40, 30 -; GCN-NEXT: v_readlane_b32 s61, v40, 29 -; GCN-NEXT: v_readlane_b32 s60, v40, 28 -; GCN-NEXT: v_readlane_b32 s59, v40, 27 -; GCN-NEXT: v_readlane_b32 s58, v40, 26 -; GCN-NEXT: v_readlane_b32 s57, v40, 25 -; GCN-NEXT: v_readlane_b32 s56, v40, 24 -; GCN-NEXT: v_readlane_b32 s55, v40, 23 -; GCN-NEXT: v_readlane_b32 s54, v40, 22 -; GCN-NEXT: v_readlane_b32 s53, v40, 21 -; GCN-NEXT: v_readlane_b32 s52, v40, 20 -; GCN-NEXT: v_readlane_b32 s51, v40, 19 -; GCN-NEXT: v_readlane_b32 s50, v40, 18 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: v_mov_b32_e32 v0, v40 +; GCN-NEXT: v_readlane_b32 s63, v41, 31 +; GCN-NEXT: v_readlane_b32 s62, v41, 30 +; GCN-NEXT: v_readlane_b32 s61, v41, 29 +; GCN-NEXT: v_readlane_b32 s60, v41, 28 +; GCN-NEXT: v_readlane_b32 s59, v41, 27 +; GCN-NEXT: v_readlane_b32 s58, v41, 26 +; GCN-NEXT: v_readlane_b32 s57, v41, 25 +; GCN-NEXT: v_readlane_b32 s56, v41, 24 +; GCN-NEXT: v_readlane_b32 s55, v41, 23 +; GCN-NEXT: v_readlane_b32 s54, v41, 22 +; GCN-NEXT: v_readlane_b32 s53, v41, 21 +; GCN-NEXT: v_readlane_b32 s52, v41, 20 +; GCN-NEXT: v_readlane_b32 s51, v41, 19 +; GCN-NEXT: v_readlane_b32 s50, v41, 18 +; GCN-NEXT: v_readlane_b32 s49, v41, 17 +; GCN-NEXT: v_readlane_b32 s48, v41, 16 +; GCN-NEXT: v_readlane_b32 s47, v41, 15 +; GCN-NEXT: v_readlane_b32 s46, v41, 14 +; GCN-NEXT: v_readlane_b32 s45, v41, 13 +; GCN-NEXT: v_readlane_b32 s44, v41, 12 +; GCN-NEXT: v_readlane_b32 s43, v41, 11 +; GCN-NEXT: v_readlane_b32 s42, v41, 10 +; GCN-NEXT: v_readlane_b32 s41, v41, 9 +; GCN-NEXT: v_readlane_b32 s40, v41, 8 +; GCN-NEXT: v_readlane_b32 s39, v41, 7 +; GCN-NEXT: v_readlane_b32 s38, v41, 6 +; GCN-NEXT: v_readlane_b32 s37, v41, 5 +; GCN-NEXT: v_readlane_b32 s36, v41, 4 +; GCN-NEXT: v_readlane_b32 s35, v41, 3 +; GCN-NEXT: v_readlane_b32 s34, v41, 2 +; GCN-NEXT: v_readlane_b32 s31, v41, 1 +; GCN-NEXT: v_readlane_b32 s30, v41, 0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; 
GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s10 @@ -1140,92 +1140,92 @@ define i32 @test_indirect_call_vgpr_ptr_arg_and_reuse(i32 %i, ptr %fptr) { ; GISEL-NEXT: s_mov_b32 s10, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0x400 -; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GISEL-NEXT: v_writelane_b32 v40, s30, 0 -; GISEL-NEXT: v_writelane_b32 v40, s31, 1 -; GISEL-NEXT: v_writelane_b32 v40, s34, 2 -; GISEL-NEXT: v_writelane_b32 v40, s35, 3 -; GISEL-NEXT: v_writelane_b32 v40, s36, 4 -; GISEL-NEXT: v_writelane_b32 v40, s37, 5 -; GISEL-NEXT: v_writelane_b32 v40, s38, 6 -; GISEL-NEXT: v_writelane_b32 v40, s39, 7 -; GISEL-NEXT: v_writelane_b32 v40, s40, 8 -; GISEL-NEXT: v_writelane_b32 v40, s41, 9 -; GISEL-NEXT: v_writelane_b32 v40, s42, 10 -; GISEL-NEXT: v_writelane_b32 v40, s43, 11 -; GISEL-NEXT: v_writelane_b32 v40, s44, 12 -; GISEL-NEXT: v_writelane_b32 v40, s45, 13 -; GISEL-NEXT: v_writelane_b32 v40, s46, 14 -; GISEL-NEXT: v_writelane_b32 v40, s47, 15 -; GISEL-NEXT: v_writelane_b32 v40, s48, 16 -; GISEL-NEXT: v_writelane_b32 v40, s49, 17 -; GISEL-NEXT: v_writelane_b32 v40, s50, 18 -; GISEL-NEXT: v_writelane_b32 v40, s51, 19 -; GISEL-NEXT: v_writelane_b32 v40, s52, 20 -; GISEL-NEXT: v_writelane_b32 v40, s53, 21 -; GISEL-NEXT: v_writelane_b32 v40, s54, 22 -; GISEL-NEXT: v_writelane_b32 v40, s55, 23 -; GISEL-NEXT: v_writelane_b32 v40, s56, 24 -; GISEL-NEXT: v_writelane_b32 v40, s57, 25 -; GISEL-NEXT: v_writelane_b32 v40, s58, 26 -; GISEL-NEXT: v_writelane_b32 v40, s59, 27 -; GISEL-NEXT: v_writelane_b32 v40, s60, 28 -; GISEL-NEXT: v_writelane_b32 v40, s61, 29 -; GISEL-NEXT: v_writelane_b32 v40, s62, 30 -; GISEL-NEXT: v_writelane_b32 v40, s63, 31 -; GISEL-NEXT: v_mov_b32_e32 v41, v0 +; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: v_writelane_b32 v41, s30, 0 +; GISEL-NEXT: v_writelane_b32 v41, s31, 1 +; GISEL-NEXT: v_writelane_b32 v41, s34, 2 +; GISEL-NEXT: v_writelane_b32 v41, s35, 3 +; GISEL-NEXT: v_writelane_b32 v41, s36, 4 +; GISEL-NEXT: v_writelane_b32 v41, s37, 5 +; GISEL-NEXT: v_writelane_b32 v41, s38, 6 +; GISEL-NEXT: v_writelane_b32 v41, s39, 7 +; GISEL-NEXT: v_writelane_b32 v41, s40, 8 +; GISEL-NEXT: v_writelane_b32 v41, s41, 9 +; GISEL-NEXT: v_writelane_b32 v41, s42, 10 +; GISEL-NEXT: v_writelane_b32 v41, s43, 11 +; GISEL-NEXT: v_writelane_b32 v41, s44, 12 +; GISEL-NEXT: v_writelane_b32 v41, s45, 13 +; GISEL-NEXT: v_writelane_b32 v41, s46, 14 +; GISEL-NEXT: v_writelane_b32 v41, s47, 15 +; GISEL-NEXT: v_writelane_b32 v41, s48, 16 +; GISEL-NEXT: v_writelane_b32 v41, s49, 17 +; GISEL-NEXT: v_writelane_b32 v41, s50, 18 +; GISEL-NEXT: v_writelane_b32 v41, s51, 19 +; GISEL-NEXT: v_writelane_b32 v41, s52, 20 +; GISEL-NEXT: v_writelane_b32 v41, s53, 21 +; GISEL-NEXT: v_writelane_b32 v41, s54, 22 +; GISEL-NEXT: v_writelane_b32 v41, s55, 23 +; GISEL-NEXT: v_writelane_b32 v41, s56, 24 +; GISEL-NEXT: v_writelane_b32 v41, s57, 25 +; GISEL-NEXT: v_writelane_b32 v41, s58, 26 +; GISEL-NEXT: v_writelane_b32 v41, s59, 27 +; GISEL-NEXT: v_writelane_b32 v41, s60, 28 +; GISEL-NEXT: v_writelane_b32 v41, s61, 29 +; GISEL-NEXT: v_writelane_b32 v41, s62, 30 +; GISEL-NEXT: v_writelane_b32 v41, s63, 31 +; 
GISEL-NEXT: v_mov_b32_e32 v40, v0 ; GISEL-NEXT: s_mov_b64 s[4:5], exec ; GISEL-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GISEL-NEXT: v_readfirstlane_b32 s6, v1 ; GISEL-NEXT: v_readfirstlane_b32 s7, v2 ; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, s[6:7], v[1:2] ; GISEL-NEXT: s_and_saveexec_b64 s[8:9], vcc -; GISEL-NEXT: v_mov_b32_e32 v0, v41 +; GISEL-NEXT: v_mov_b32_e32 v0, v40 ; GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] ; GISEL-NEXT: ; implicit-def: $vgpr1 ; GISEL-NEXT: s_xor_b64 exec, exec, s[8:9] ; GISEL-NEXT: s_cbranch_execnz .LBB7_1 ; GISEL-NEXT: ; %bb.2: ; GISEL-NEXT: s_mov_b64 exec, s[4:5] -; GISEL-NEXT: v_mov_b32_e32 v0, v41 -; GISEL-NEXT: v_readlane_b32 s63, v40, 31 -; GISEL-NEXT: v_readlane_b32 s62, v40, 30 -; GISEL-NEXT: v_readlane_b32 s61, v40, 29 -; GISEL-NEXT: v_readlane_b32 s60, v40, 28 -; GISEL-NEXT: v_readlane_b32 s59, v40, 27 -; GISEL-NEXT: v_readlane_b32 s58, v40, 26 -; GISEL-NEXT: v_readlane_b32 s57, v40, 25 -; GISEL-NEXT: v_readlane_b32 s56, v40, 24 -; GISEL-NEXT: v_readlane_b32 s55, v40, 23 -; GISEL-NEXT: v_readlane_b32 s54, v40, 22 -; GISEL-NEXT: v_readlane_b32 s53, v40, 21 -; GISEL-NEXT: v_readlane_b32 s52, v40, 20 -; GISEL-NEXT: v_readlane_b32 s51, v40, 19 -; GISEL-NEXT: v_readlane_b32 s50, v40, 18 -; GISEL-NEXT: v_readlane_b32 s49, v40, 17 -; GISEL-NEXT: v_readlane_b32 s48, v40, 16 -; GISEL-NEXT: v_readlane_b32 s47, v40, 15 -; GISEL-NEXT: v_readlane_b32 s46, v40, 14 -; GISEL-NEXT: v_readlane_b32 s45, v40, 13 -; GISEL-NEXT: v_readlane_b32 s44, v40, 12 -; GISEL-NEXT: v_readlane_b32 s43, v40, 11 -; GISEL-NEXT: v_readlane_b32 s42, v40, 10 -; GISEL-NEXT: v_readlane_b32 s41, v40, 9 -; GISEL-NEXT: v_readlane_b32 s40, v40, 8 -; GISEL-NEXT: v_readlane_b32 s39, v40, 7 -; GISEL-NEXT: v_readlane_b32 s38, v40, 6 -; GISEL-NEXT: v_readlane_b32 s37, v40, 5 -; GISEL-NEXT: v_readlane_b32 s36, v40, 4 -; GISEL-NEXT: v_readlane_b32 s35, v40, 3 -; GISEL-NEXT: v_readlane_b32 s34, v40, 2 -; GISEL-NEXT: v_readlane_b32 s31, v40, 1 -; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: v_mov_b32_e32 v0, v40 +; GISEL-NEXT: v_readlane_b32 s63, v41, 31 +; GISEL-NEXT: v_readlane_b32 s62, v41, 30 +; GISEL-NEXT: v_readlane_b32 s61, v41, 29 +; GISEL-NEXT: v_readlane_b32 s60, v41, 28 +; GISEL-NEXT: v_readlane_b32 s59, v41, 27 +; GISEL-NEXT: v_readlane_b32 s58, v41, 26 +; GISEL-NEXT: v_readlane_b32 s57, v41, 25 +; GISEL-NEXT: v_readlane_b32 s56, v41, 24 +; GISEL-NEXT: v_readlane_b32 s55, v41, 23 +; GISEL-NEXT: v_readlane_b32 s54, v41, 22 +; GISEL-NEXT: v_readlane_b32 s53, v41, 21 +; GISEL-NEXT: v_readlane_b32 s52, v41, 20 +; GISEL-NEXT: v_readlane_b32 s51, v41, 19 +; GISEL-NEXT: v_readlane_b32 s50, v41, 18 +; GISEL-NEXT: v_readlane_b32 s49, v41, 17 +; GISEL-NEXT: v_readlane_b32 s48, v41, 16 +; GISEL-NEXT: v_readlane_b32 s47, v41, 15 +; GISEL-NEXT: v_readlane_b32 s46, v41, 14 +; GISEL-NEXT: v_readlane_b32 s45, v41, 13 +; GISEL-NEXT: v_readlane_b32 s44, v41, 12 +; GISEL-NEXT: v_readlane_b32 s43, v41, 11 +; GISEL-NEXT: v_readlane_b32 s42, v41, 10 +; GISEL-NEXT: v_readlane_b32 s41, v41, 9 +; GISEL-NEXT: v_readlane_b32 s40, v41, 8 +; GISEL-NEXT: v_readlane_b32 s39, v41, 7 +; GISEL-NEXT: v_readlane_b32 s38, v41, 6 +; GISEL-NEXT: v_readlane_b32 s37, v41, 5 +; GISEL-NEXT: v_readlane_b32 s36, v41, 4 +; GISEL-NEXT: v_readlane_b32 s35, v41, 3 +; GISEL-NEXT: v_readlane_b32 s34, v41, 2 +; GISEL-NEXT: v_readlane_b32 s31, v41, 1 +; GISEL-NEXT: v_readlane_b32 s30, v41, 0 +; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte 
Folded Reload ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s10 diff --git a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll index cfec77e68eae9..833aba9b26afd 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-waitcnts-crash.ll @@ -13,7 +13,7 @@ define fastcc i32 @foo() { ; CHECK-NEXT: $sgpr16 = S_MOV_B32 $sgpr33 ; CHECK-NEXT: $sgpr33 = S_MOV_B32 $sgpr32 ; CHECK-NEXT: $sgpr17 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) + ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr40, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr17 ; CHECK-NEXT: $sgpr32 = frame-setup S_ADDK_I32 $sgpr32, 512, implicit-def dead $scc ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr16, 2, undef $vgpr40 @@ -26,8 +26,8 @@ define fastcc i32 @foo() { ; CHECK-NEXT: BUFFER_GL0_INV implicit $exec ; CHECK-NEXT: BUFFER_GL1_INV implicit $exec ; CHECK-NEXT: renamable $sgpr16_sgpr17 = S_LOAD_DWORDX2_IMM killed renamable $sgpr16_sgpr17, 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4) - ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, $vgpr40 - ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, $vgpr40 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr30, 0, killed $vgpr40 + ; CHECK-NEXT: $vgpr40 = V_WRITELANE_B32 killed $sgpr31, 1, killed $vgpr40 ; CHECK-NEXT: S_WAITCNT 49279 ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL killed renamable $sgpr16_sgpr17, @bar, csr_amdgpu, implicit killed $sgpr4_sgpr5, implicit killed $sgpr6_sgpr7, implicit killed $sgpr8_sgpr9, implicit killed $sgpr10_sgpr11, implicit killed $sgpr12, implicit killed $sgpr13, implicit killed $sgpr14, implicit killed $sgpr15, implicit killed $vgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; CHECK-NEXT: $vcc_lo = S_MOV_B32 $exec_lo @@ -43,7 +43,7 @@ define fastcc i32 @foo() { ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: $sgpr31 = V_READLANE_B32 $vgpr40, 1 ; CHECK-NEXT: $sgpr30 = V_READLANE_B32 $vgpr40, 0 - ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr40, 2 + ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 killed $vgpr40, 2 ; CHECK-NEXT: $sgpr5 = S_OR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; CHECK-NEXT: $vgpr40 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr33, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/ipra.ll b/llvm/test/CodeGen/AMDGPU/ipra.ll index 0769337127870..6c8646968b676 100644 --- a/llvm/test/CodeGen/AMDGPU/ipra.ll +++ b/llvm/test/CodeGen/AMDGPU/ipra.ll @@ -42,11 +42,11 @@ define amdgpu_kernel void @kernel_call() #0 { ; GCN-LABEL: {{^}}func_regular_call: ; GCN-NOT: buffer_load ; GCN-NOT: readlane -; GCN: flat_load_dword v9 +; GCN: flat_load_dword v8 ; GCN: s_swappc_b64 ; GCN-NOT: buffer_load ; GCN-NOT: readlane -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v9 +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 ; GCN: ; 
NumSgprs: 34 ; GCN: ; NumVgprs: 10 @@ -72,9 +72,9 @@ define void @func_tail_call() #1 { } ; GCN-LABEL: {{^}}func_call_tail_call: -; GCN: flat_load_dword v9 +; GCN: flat_load_dword v8 ; GCN: s_swappc_b64 -; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v9 +; GCN: flat_store_dword v{{\[[0-9]+:[0-9]+\]}}, v8 ; GCN: s_setpc_b64 ; GCN: ; NumSgprs: 34 diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 797b13044e722..1e9994dd8e6ef 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -191,47 +191,47 @@ define void @slsr1_1(i32 %b.arg, i32 %s.arg) #0 { ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: v_writelane_b32 v40, s4, 5 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v43, s4, 5 +; GFX9-NEXT: v_writelane_b32 v43, s30, 0 +; GFX9-NEXT: v_writelane_b32 v43, s31, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: v_writelane_b32 v40, s36, 3 +; GFX9-NEXT: v_writelane_b32 v43, s34, 2 +; GFX9-NEXT: v_writelane_b32 v43, s36, 3 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, foo@gotpcrel32@hi+12 -; GFX9-NEXT: v_writelane_b32 v40, s37, 4 +; GFX9-NEXT: v_writelane_b32 v43, s37, 4 ; GFX9-NEXT: s_load_dwordx2 s[36:37], s[4:5], 0x0 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v41, v1 -; GFX9-NEXT: v_mov_b32_e32 v42, v0 -; GFX9-NEXT: v_mul_u32_u24_e32 v0, v42, v41 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v40, v1 +; GFX9-NEXT: v_mov_b32_e32 v41, v0 +; GFX9-NEXT: v_mul_u32_u24_e32 v0, v41, v40 ; GFX9-NEXT: s_mov_b32 s34, s15 -; GFX9-NEXT: v_and_b32_e32 v43, 0xffffff, v41 +; GFX9-NEXT: v_and_b32_e32 v42, 0xffffff, v40 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_mad_u32_u24 v41, v42, v41, v43 +; GFX9-NEXT: v_mad_u32_u24 v40, v41, v40, v42 ; GFX9-NEXT: s_mov_b32 s15, s34 -; GFX9-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: v_add_u32_e32 v0, v41, v43 +; GFX9-NEXT: v_add_u32_e32 v0, v40, v42 ; GFX9-NEXT: s_mov_b32 s15, s34 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[36:37] -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s37, v40, 4 -; GFX9-NEXT: v_readlane_b32 s36, v40, 3 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 5 +; 
GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s37, v43, 4 +; GFX9-NEXT: v_readlane_b32 s36, v43, 3 +; GFX9-NEXT: v_readlane_b32 s34, v43, 2 +; GFX9-NEXT: v_readlane_b32 s31, v43, 1 +; GFX9-NEXT: v_readlane_b32 s30, v43, 0 +; GFX9-NEXT: v_readlane_b32 s4, v43, 5 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll index e0efd09f1f14b..4ea77d1d1ac15 100644 --- a/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll +++ b/llvm/test/CodeGen/AMDGPU/s-getpc-b64-remat.ll @@ -8,23 +8,23 @@ define void @test_remat_s_getpc_b64() { ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-NEXT: v_writelane_b32 v0, s30, 0 +; GFX9-NEXT: v_writelane_b32 v2, s30, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: v_writelane_b32 v0, s31, 1 +; GFX9-NEXT: v_writelane_b32 v2, s31, 1 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: s_getpc_b64 s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v1, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s5 -; GFX9-NEXT: global_store_dwordx2 v[1:2], v[1:2], off -; GFX9-NEXT: v_readlane_b32 s31, v0, 1 -; GFX9-NEXT: v_readlane_b32 s30, v0, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: global_store_dwordx2 v[0:1], v[0:1], off +; GFX9-NEXT: v_readlane_b32 s31, v2, 1 +; GFX9-NEXT: v_readlane_b32 s30, v2, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -33,23 +33,23 @@ define void @test_remat_s_getpc_b64() { ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 -; GFX11-NEXT: v_writelane_b32 v0, s30, 0 +; GFX11-NEXT: v_writelane_b32 v2, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v0, s31, 1 +; GFX11-NEXT: v_writelane_b32 v2, s31, 1 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX11-NEXT: v_readlane_b32 s31, v0, 1 -; GFX11-NEXT: v_readlane_b32 s30, v0, 0 -; GFX11-NEXT: global_store_b64 v[1:2], v[1:2], off +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_readlane_b32 s31, v2, 1 +; GFX11-NEXT: 
v_readlane_b32 s30, v2, 0 +; GFX11-NEXT: global_store_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -62,26 +62,26 @@ define void @test_remat_s_getpc_b64() { ; GFX12-NEXT: s_wait_bvhcnt 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill +; GFX12-NEXT: scratch_store_b32 off, v2, s32 ; 4-byte Folded Spill ; GFX12-NEXT: s_mov_b32 exec_lo, s0 -; GFX12-NEXT: v_writelane_b32 v0, s30, 0 +; GFX12-NEXT: v_writelane_b32 v2, s30, 0 ; GFX12-NEXT: s_getpc_b64 s[0:1] ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) ; GFX12-NEXT: s_sext_i32_i16 s1, s1 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ;;#ASMEND -; GFX12-NEXT: v_writelane_b32 v0, s31, 1 +; GFX12-NEXT: v_writelane_b32 v2, s31, 1 ; GFX12-NEXT: ;;#ASMSTART ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_getpc_b64 s[0:1] ; GFX12-NEXT: s_sext_i32_i16 s1, s1 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-NEXT: v_readlane_b32 s31, v0, 1 -; GFX12-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0 -; GFX12-NEXT: v_readlane_b32 s30, v0, 0 -; GFX12-NEXT: global_store_b64 v[1:2], v[1:2], off +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_readlane_b32 s31, v2, 1 +; GFX12-NEXT: v_readlane_b32 s30, v2, 0 +; GFX12-NEXT: global_store_b64 v[0:1], v[0:1], off ; GFX12-NEXT: s_xor_saveexec_b32 s0, -1 -; GFX12-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload +; GFX12-NEXT: scratch_load_b32 v2, off, s32 ; 4-byte Folded Reload ; GFX12-NEXT: s_mov_b32 exec_lo, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 ; GFX12-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir index 4544b177351ee..6a2532147f886 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-overlap-wwm-reserve.mir @@ -28,183 +28,183 @@ body: | ; GCN-LABEL: name: test_main ; GCN: bb.0: ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN-NEXT: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr5 + ; GCN-NEXT: liveins: $sgpr4, $sgpr5, $sgpr6, $sgpr7, $sgpr8, $sgpr9, $sgpr10, $sgpr11, $sgpr12, $sgpr13, $sgpr14, $sgpr15, $sgpr16, $sgpr17, $sgpr18, $sgpr19, $sgpr20, $sgpr21, $sgpr22, $sgpr23, $sgpr24, $sgpr25, $sgpr26, $sgpr27, $sgpr28, $sgpr29, $sgpr30, $sgpr31, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, 
$sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $sgpr96, $sgpr97, $sgpr98, $sgpr99, $sgpr100, $sgpr101, $sgpr102, $sgpr103, $vgpr0, $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: $sgpr0 = COPY $sgpr33 ; GCN-NEXT: $sgpr33 = frame-setup COPY $sgpr32 ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr1, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5) - ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr4, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.74, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr3, $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.69, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr4, $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.70, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr5, $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.71, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR $vgpr2, $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.72, addrspace 5) + ; GCN-NEXT: SCRATCH_STORE_DWORD_SADDR killed $vgpr1, $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (store (s32) into %stack.74, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr3 + ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr0, 4, undef $vgpr5 ; GCN-NEXT: $sgpr32 = frame-setup S_ADD_I32 $sgpr32, 24, implicit-def dead $scc - ; GCN-NEXT: renamable $vgpr5 = IMPLICIT_DEF - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr13, 9, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr14, 10, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr15, 11, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr16, 12, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr17, 13, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr18, 14, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr19, 15, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr20, 16, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr21, 17, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr22, 18, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr23, 19, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr24, 20, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr25, 21, $vgpr1 - ; GCN-NEXT: $vgpr1 = 
SI_SPILL_S32_TO_VGPR $sgpr26, 22, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr1 - ; GCN-NEXT: $vgpr1 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr1 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr2 - ; GCN-NEXT: $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr2 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr3 - ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr3 + ; GCN-NEXT: renamable $vgpr2 = IMPLICIT_DEF + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr4, 0, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr5, 1, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr6, 2, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr7, 3, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr8, 4, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr9, 5, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr10, 6, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr11, 7, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr12, 8, $vgpr3 + ; GCN-NEXT: $vgpr3 = 
SI_SPILL_S32_TO_VGPR $sgpr13, 9, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr14, 10, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr15, 11, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr16, 12, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr17, 13, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr18, 14, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr19, 15, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr20, 16, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr21, 17, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr22, 18, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr23, 19, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr24, 20, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr25, 21, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr26, 22, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr27, 23, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr28, 24, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr29, 25, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr30, 26, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr31, 27, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr64, 28, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr65, 29, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr66, 30, $vgpr3 + ; GCN-NEXT: $vgpr3 = SI_SPILL_S32_TO_VGPR $sgpr67, 31, $vgpr3 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr68, 0, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr69, 1, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr70, 2, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr71, 3, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr72, 4, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr73, 5, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr74, 6, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr75, 7, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr76, 8, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr77, 9, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr78, 10, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr79, 11, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr80, 12, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr81, 13, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr82, 14, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr83, 15, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr84, 16, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr85, 17, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr86, 18, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr87, 19, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr88, 20, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr89, 21, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr90, 22, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr91, 23, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr92, 24, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr93, 25, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr94, 26, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr95, 27, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr96, 28, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr97, 29, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr98, 30, $vgpr4 + ; GCN-NEXT: $vgpr4 = SI_SPILL_S32_TO_VGPR $sgpr99, 31, $vgpr4 + ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr100, 0, $vgpr5 + ; 
GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr101, 1, $vgpr5 + ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr102, 2, $vgpr5 + ; GCN-NEXT: $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr103, 3, $vgpr5 ; GCN-NEXT: $sgpr22 = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr5 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr5 - ; GCN-NEXT: dead $vgpr4 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc + ; GCN-NEXT: renamable $vgpr2 = SI_SPILL_S32_TO_VGPR $sgpr22, 0, killed $vgpr2 + ; GCN-NEXT: dead $vgpr1 = V_SET_INACTIVE_B32 $vgpr0, 0, implicit $exec, implicit-def $scc ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.1: ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $vgpr1, $vgpr2, $vgpr3, $vgpr5 + ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} ; GCN-NEXT: KILL implicit-def $vcc, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit-def $sgpr96_sgpr97_sgpr98_sgpr99_sgpr100_sgpr101_sgpr102_sgpr103 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.2: ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr1, $vgpr2, $vgpr3, $vgpr5 + ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 0 + ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 ; GCN-NEXT: {{ $}} ; GCN-NEXT: bb.3: - ; GCN-NEXT: liveins: $vgpr1, $vgpr2, $vgpr3, $vgpr5 + ; GCN-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3 - ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2 - ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 - ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 - ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 31 - ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 30 - ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 29 - ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 28 - ; GCN-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 27 - ; GCN-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 26 - ; GCN-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 25 - ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 24 - ; GCN-NEXT: $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 23 - ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 22 - ; GCN-NEXT: $sgpr89 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 21 - ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 20 - ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 19 - ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 18 - ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 17 - ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 16 - ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 15 - ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 14 - ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 13 - ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 12 - ; GCN-NEXT: $sgpr79 = 
SI_RESTORE_S32_FROM_VGPR $vgpr2, 11 - ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 10 - ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 9 - ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 8 - ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 7 - ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 6 - ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 5 - ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 4 - ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 3 - ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 2 - ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 1 - ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr2, 0 - ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 31 - ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 30 - ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 29 - ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 28 - ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 27 - ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 26 - ; GCN-NEXT: $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 25 - ; GCN-NEXT: $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 24 - ; GCN-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 23 - ; GCN-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 22 - ; GCN-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 21 - ; GCN-NEXT: $sgpr24 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 20 - ; GCN-NEXT: $sgpr23 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 19 - ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 18 - ; GCN-NEXT: $sgpr21 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 17 - ; GCN-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 16 - ; GCN-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 15 - ; GCN-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 14 - ; GCN-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 13 - ; GCN-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 12 - ; GCN-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 11 - ; GCN-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 10 - ; GCN-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 9 - ; GCN-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 8 - ; GCN-NEXT: $sgpr11 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 7 - ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 6 - ; GCN-NEXT: $sgpr9 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 5 - ; GCN-NEXT: $sgpr8 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 4 - ; GCN-NEXT: $sgpr7 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 3 - ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 2 - ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 1 - ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr1, 0 - ; GCN-NEXT: KILL killed renamable $vgpr5 - ; GCN-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4 + ; GCN-NEXT: $sgpr103 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 3 + ; GCN-NEXT: $sgpr102 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 2 + ; GCN-NEXT: $sgpr101 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 1 + ; GCN-NEXT: $sgpr100 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 0 + ; GCN-NEXT: $sgpr99 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 31 + ; GCN-NEXT: $sgpr98 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 30 + ; GCN-NEXT: $sgpr97 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 29 + ; GCN-NEXT: $sgpr96 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 28 + ; GCN-NEXT: $sgpr95 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 27 + ; GCN-NEXT: $sgpr94 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 26 + ; GCN-NEXT: $sgpr93 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 25 + ; GCN-NEXT: $sgpr92 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 24 + ; GCN-NEXT: $sgpr91 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 23 + ; GCN-NEXT: $sgpr90 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 22 + ; GCN-NEXT: $sgpr89 = 
SI_RESTORE_S32_FROM_VGPR $vgpr4, 21 + ; GCN-NEXT: $sgpr88 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 20 + ; GCN-NEXT: $sgpr87 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 19 + ; GCN-NEXT: $sgpr86 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 18 + ; GCN-NEXT: $sgpr85 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 17 + ; GCN-NEXT: $sgpr84 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 16 + ; GCN-NEXT: $sgpr83 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 15 + ; GCN-NEXT: $sgpr82 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 14 + ; GCN-NEXT: $sgpr81 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 13 + ; GCN-NEXT: $sgpr80 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 12 + ; GCN-NEXT: $sgpr79 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 11 + ; GCN-NEXT: $sgpr78 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 10 + ; GCN-NEXT: $sgpr77 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 9 + ; GCN-NEXT: $sgpr76 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 8 + ; GCN-NEXT: $sgpr75 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 7 + ; GCN-NEXT: $sgpr74 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 6 + ; GCN-NEXT: $sgpr73 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 5 + ; GCN-NEXT: $sgpr72 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 4 + ; GCN-NEXT: $sgpr71 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 3 + ; GCN-NEXT: $sgpr70 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 2 + ; GCN-NEXT: $sgpr69 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 1 + ; GCN-NEXT: $sgpr68 = SI_RESTORE_S32_FROM_VGPR $vgpr4, 0 + ; GCN-NEXT: $sgpr67 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 31 + ; GCN-NEXT: $sgpr66 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 30 + ; GCN-NEXT: $sgpr65 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 29 + ; GCN-NEXT: $sgpr64 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 28 + ; GCN-NEXT: $sgpr31 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 27 + ; GCN-NEXT: $sgpr30 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 26 + ; GCN-NEXT: $sgpr29 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 25 + ; GCN-NEXT: $sgpr28 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 24 + ; GCN-NEXT: $sgpr27 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 23 + ; GCN-NEXT: $sgpr26 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 22 + ; GCN-NEXT: $sgpr25 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 21 + ; GCN-NEXT: $sgpr24 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 20 + ; GCN-NEXT: $sgpr23 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 19 + ; GCN-NEXT: $sgpr22 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 18 + ; GCN-NEXT: $sgpr21 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 17 + ; GCN-NEXT: $sgpr20 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 16 + ; GCN-NEXT: $sgpr19 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 15 + ; GCN-NEXT: $sgpr18 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 14 + ; GCN-NEXT: $sgpr17 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 13 + ; GCN-NEXT: $sgpr16 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 12 + ; GCN-NEXT: $sgpr15 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 11 + ; GCN-NEXT: $sgpr14 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 10 + ; GCN-NEXT: $sgpr13 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 9 + ; GCN-NEXT: $sgpr12 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 8 + ; GCN-NEXT: $sgpr11 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 7 + ; GCN-NEXT: $sgpr10 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 6 + ; GCN-NEXT: $sgpr9 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 5 + ; GCN-NEXT: $sgpr8 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 4 + ; GCN-NEXT: $sgpr7 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 3 + ; GCN-NEXT: $sgpr6 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 2 + ; GCN-NEXT: $sgpr5 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 1 + ; GCN-NEXT: $sgpr4 = SI_RESTORE_S32_FROM_VGPR $vgpr3, 0 + ; GCN-NEXT: KILL killed renamable $vgpr2 + ; GCN-NEXT: $sgpr0 = SI_RESTORE_S32_FROM_VGPR $vgpr5, 4 ; GCN-NEXT: $sgpr1 = S_XOR_SAVEEXEC_B32 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) - ; GCN-NEXT: 
$vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) - ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5) - ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) - ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.74, addrspace 5) + ; GCN-NEXT: $vgpr3 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 0, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.69, addrspace 5) + ; GCN-NEXT: $vgpr4 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 4, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.70, addrspace 5) + ; GCN-NEXT: $vgpr5 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 8, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.71, addrspace 5) + ; GCN-NEXT: $vgpr2 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 12, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.72, addrspace 5) + ; GCN-NEXT: $vgpr1 = SCRATCH_LOAD_DWORD_SADDR $sgpr33, 16, 0, implicit $exec, implicit $flat_scr :: (load (s32) from %stack.74, addrspace 5) ; GCN-NEXT: $exec_lo = S_MOV_B32 killed $sgpr1 ; GCN-NEXT: $sgpr32 = frame-destroy S_ADD_I32 $sgpr32, -24, implicit-def dead $scc ; GCN-NEXT: $sgpr33 = COPY $sgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll index 16550fc9588ae..f523b4a2495f1 100644 --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -1183,7 +1183,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill @@ -1297,11 +1297,11 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v4, s34, 0 -; GCN-NEXT: v_writelane_b32 v4, s35, 1 -; GCN-NEXT: v_writelane_b32 v4, s36, 2 -; GCN-NEXT: v_writelane_b32 v4, s37, 3 -; GCN-NEXT: v_mov_b32_e32 v5, v3 +; GCN-NEXT: v_writelane_b32 v5, s34, 0 +; GCN-NEXT: v_writelane_b32 v5, s35, 1 +; GCN-NEXT: v_writelane_b32 v5, s36, 2 +; GCN-NEXT: v_writelane_b32 v5, s37, 3 +; GCN-NEXT: v_mov_b32_e32 v4, v3 ; GCN-NEXT: v_mov_b32_e32 v3, v1 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 @@ -1310,30 +1310,30 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; implicit-def: $sgpr4 ; GCN-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GCN-NEXT: v_mov_b32_e32 v3, v5 +; GCN-NEXT: v_mov_b32_e32 v3, v4 ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 
-; GCN-NEXT: flat_load_dwordx4 v[5:8], v[2:3] +; GCN-NEXT: flat_load_dwordx4 v[6:9], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: flat_store_dwordx4 v[0:1], v[5:8] -; GCN-NEXT: v_readlane_b32 s37, v4, 3 -; GCN-NEXT: v_readlane_b32 s36, v4, 2 -; GCN-NEXT: v_readlane_b32 s35, v4, 1 -; GCN-NEXT: v_readlane_b32 s34, v4, 0 +; GCN-NEXT: flat_store_dwordx4 v[0:1], v[6:9] +; GCN-NEXT: v_readlane_b32 s37, v5, 3 +; GCN-NEXT: v_readlane_b32 s36, v5, 2 +; GCN-NEXT: v_readlane_b32 s35, v5, 1 +; GCN-NEXT: v_readlane_b32 s34, v5, 0 ; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -1447,7 +1447,7 @@ define void @spill_sgpr_no_free_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %in ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll index 7a01679f9972c..2c0f64f85d823 100644 --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -211,15 +211,15 @@ entry: ; GCN-DAG: s_addc_u32 s5, s5, i32_fastcc_i32_i32@gotpcrel32@hi+12 ; GCN-DAG: v_writelane_b32 [[CSRV]], s30, 0 -; GCN-DAG: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-DAG: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-DAG: buffer_store_dword v41, off, 
s[0:3], s33 ; 4-byte Folded Spill ; GCN-DAG: v_writelane_b32 [[CSRV]], s31, 1 ; GCN: s_swappc_b64 -; GCN-DAG: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-DAG: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN: s_getpc_b64 s[4:5] ; GCN-NEXT: s_add_u32 s4, s4, sibling_call_i32_fastcc_i32_i32@rel32@lo+4 diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir index d718b49321835..85a615c3d8ae8 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-csr-live-ins.mir @@ -10,9 +10,9 @@ body: | bb.0: liveins: $sgpr50 ; CHECK-LABEL: name: spill_csr_sgpr_argument - ; CHECK: liveins: $sgpr50, $vgpr0 + ; CHECK: liveins: $sgpr50, $vgpr63 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR $sgpr50, 0, $vgpr0 + ; CHECK-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR $sgpr50, 0, $vgpr63 ; CHECK-NEXT: S_NOP 0, implicit $sgpr50 ; CHECK-NEXT: $sgpr50 = S_MOV_B32 0 S_NOP 0, implicit $sgpr50 diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir index efb022ccb0d55..11babc82e919b 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir @@ -53,41 +53,41 @@ body: | bb.0: liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-LABEL: name: sgpr_spill_lane_crossover - ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr0, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 + ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr63, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 ; GCN-NEXT: {{ $}} ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr64, 0, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr65, 1, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr66, 2, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr67, 3, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr68, 4, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr69, 5, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr70, 6, 
$vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr71, 7, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr72, 8, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr73, 9, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr74, 10, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr75, 11, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr76, 12, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr77, 13, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr78, 14, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr79, 15, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 16, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 17, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 18, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 19, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 20, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 21, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 22, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr87, 23, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr88, 24, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr89, 25, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr90, 26, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr91, 27, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr92, 28, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr93, 29, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr94, 30, $vgpr0 - ; GCN-NEXT: $vgpr0 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 31, $vgpr0 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr64, 0, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr65, 1, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr66, 2, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr67, 3, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr68, 4, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr69, 5, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr70, 6, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr71, 7, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr72, 8, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr73, 9, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr74, 10, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr75, 11, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr76, 12, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr77, 13, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr78, 14, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr79, 15, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr80, 16, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr81, 17, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr82, 18, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr83, 19, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr84, 20, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr85, 21, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr86, 22, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr87, 23, $vgpr63 + ; GCN-NEXT: $vgpr63 = 
SI_SPILL_S32_TO_VGPR killed $sgpr88, 24, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr89, 25, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr90, 26, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr91, 27, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr92, 28, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr93, 29, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr94, 30, $vgpr63 + ; GCN-NEXT: $vgpr63 = SI_SPILL_S32_TO_VGPR killed $sgpr95, 31, $vgpr63 ; GCN-NEXT: S_NOP 0 ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR killed $sgpr10, 0, [[DEF]] ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = SI_SPILL_S32_TO_VGPR $sgpr64, 1, [[DEF]], implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 diff --git a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll index 00564a7db77bc..d2b960fe43f84 100644 --- a/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill_more_than_wavesize_csr_sgprs.ll @@ -22,11 +22,11 @@ define void @spill_more_than_wavesize_csr_sgprs() { } ; CHECK-LABEL: {{^}}spill_more_than_wavesize_csr_sgprs_with_stack_object: -; CHECK-DAG: v_writelane_b32 v0, s98, 63 -; CHECK-DAG: v_writelane_b32 v1, s99, 0 +; CHECK-DAG: v_writelane_b32 v1, s98, 63 +; CHECK-DAG: v_writelane_b32 v2, s99, 0 ; CHECK-NOT: dummy -; CHECK-DAG: v_readlane_b32 s99, v1, 0 -; CHECK-DAG: v_readlane_b32 s98, v0, 63 +; CHECK-DAG: v_readlane_b32 s99, v2, 0 +; CHECK-DAG: v_readlane_b32 s98, v1, 63 define void @spill_more_than_wavesize_csr_sgprs_with_stack_object() { %alloca = alloca i32, align 4, addrspace(5) store volatile i32 0, ptr addrspace(5) %alloca diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll index c352229f6a494..d8db2d5319868 100644 --- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll +++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll @@ -1585,17 +1585,17 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s33 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s32 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s16, -1 -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill +; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s16 ; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0x1200 -; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr33 : SGPR spill to VGPR lane -; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s30, 0 -; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s31, 1 +; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane +; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, 
s30, 0 +; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, s31, 1 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, s32 -; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, s16, 0 +; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s16, 0 ; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s16, s16, 5 -; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v33, s16, 1 +; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s16, 1 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v0, 42 ; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1673,18 +1673,18 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() { ; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18 ; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18 ; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17] -; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s5, v33, 1 -; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v33, 0 +; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s5, v32, 1 +; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v32, 0 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMSTART ; WAVE32-WWM-PREALLOC-NEXT: ; use s5 ; WAVE32-WWM-PREALLOC-NEXT: ;;#ASMEND ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, s4 -; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v32, 1 -; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v32, 0 -; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr33 +; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s31, v33, 1 +; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s30, v33, 0 +; WAVE32-WWM-PREALLOC-NEXT: ; kill: killed $vgpr32 ; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s4, -1 -; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload +; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload +; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4 ; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0xffffee00 ; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s24 diff --git a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll index 57e4cec4eccb1..468a8463a06d6 100644 --- a/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll +++ b/llvm/test/CodeGen/AMDGPU/strictfp_f16_abi_promote.ll @@ -252,31 +252,31 @@ define void @outgoing_f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_mov_b32 s16, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[18:19] -; GFX7-NEXT: v_writelane_b32 v40, s16, 2 -; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: v_writelane_b32 v42, s16, 2 +; GFX7-NEXT: v_writelane_b32 v42, s30, 0 ; GFX7-NEXT: s_mov_b32 s17, f16_result@abs32@hi ; GFX7-NEXT: s_mov_b32 s16, f16_result@abs32@lo ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: v_writelane_b32 v40, s31, 1 -; GFX7-NEXT: v_mov_b32_e32 v42, v1 -; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded 
Spill +; GFX7-NEXT: v_writelane_b32 v42, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v41, v1 +; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_readlane_b32 s31, v40, 1 -; GFX7-NEXT: v_readlane_b32 s30, v40, 0 -; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: v_readlane_b32 s31, v42, 1 +; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: flat_store_short v[41:42], v0 -; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: flat_store_short v[40:41], v0 +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 @@ -294,37 +294,37 @@ define void @outgoing_v2f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_mov_b32 s16, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[18:19] -; GFX7-NEXT: v_writelane_b32 v40, s16, 2 -; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: v_writelane_b32 v42, s16, 2 +; GFX7-NEXT: v_writelane_b32 v42, s30, 0 ; GFX7-NEXT: s_mov_b32 s17, v2f16_result@abs32@hi ; GFX7-NEXT: s_mov_b32 s16, v2f16_result@abs32@lo ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: v_writelane_b32 v40, s31, 1 -; GFX7-NEXT: v_mov_b32_e32 v42, v1 -; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v42, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v41, v1 +; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7-NEXT: v_readlane_b32 s31, v40, 1 -; GFX7-NEXT: v_readlane_b32 s30, v40, 0 +; GFX7-NEXT: v_readlane_b32 s31, v42, 1 +; GFX7-NEXT: v_readlane_b32 s30, v42, 0 ; GFX7-NEXT: v_and_b32_e32 v1, 0xffff, v1 ; GFX7-NEXT: v_cvt_f32_f16_e32 v1, v1 ; GFX7-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX7-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX7-NEXT: flat_store_dword v[41:42], v0 -; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: flat_store_dword v[40:41], v0 +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; 
GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 @@ -342,18 +342,18 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_mov_b32 s16, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[18:19] -; GFX7-NEXT: v_writelane_b32 v40, s16, 2 -; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: v_writelane_b32 v42, s16, 2 +; GFX7-NEXT: v_writelane_b32 v42, s30, 0 ; GFX7-NEXT: s_mov_b32 s17, v4f16_result@abs32@hi ; GFX7-NEXT: s_mov_b32 s16, v4f16_result@abs32@lo ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: v_writelane_b32 v40, s31, 1 -; GFX7-NEXT: v_mov_b32_e32 v42, v1 -; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v42, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v41, v1 +; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -375,17 +375,17 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_or_b32_e32 v4, v0, v1 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v2, v2, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v41 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v40 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc ; GFX7-NEXT: flat_store_dword v[0:1], v2 -; GFX7-NEXT: flat_store_dword v[41:42], v4 -; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX7-NEXT: v_readlane_b32 s31, v40, 1 -; GFX7-NEXT: v_readlane_b32 s30, v40, 0 -; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: flat_store_dword v[40:41], v4 +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: v_readlane_b32 s31, v42, 1 +; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 @@ -403,18 +403,18 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 { ; GFX7-NEXT: s_mov_b32 s16, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX7-NEXT: s_mov_b64 exec, s[18:19] -; 
GFX7-NEXT: v_writelane_b32 v40, s16, 2 -; GFX7-NEXT: v_writelane_b32 v40, s30, 0 +; GFX7-NEXT: v_writelane_b32 v42, s16, 2 +; GFX7-NEXT: v_writelane_b32 v42, s30, 0 ; GFX7-NEXT: s_mov_b32 s17, v8f16_result@abs32@hi ; GFX7-NEXT: s_mov_b32 s16, v8f16_result@abs32@lo ; GFX7-NEXT: s_addk_i32 s32, 0x400 -; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX7-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX7-NEXT: v_writelane_b32 v40, s31, 1 -; GFX7-NEXT: v_mov_b32_e32 v42, v1 -; GFX7-NEXT: v_mov_b32_e32 v41, v0 +; GFX7-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX7-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX7-NEXT: v_writelane_b32 v42, s31, 1 +; GFX7-NEXT: v_mov_b32_e32 v41, v1 +; GFX7-NEXT: v_mov_b32_e32 v40, v0 ; GFX7-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1 ; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0 @@ -456,23 +456,23 @@ define void @outgoing_v8f16_return(ptr %ptr) #0 { ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v3 ; GFX7-NEXT: v_or_b32_e32 v3, v4, v0 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v41 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 12, v40 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc ; GFX7-NEXT: flat_store_dword v[0:1], v3 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v41 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 8, v40 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc ; GFX7-NEXT: flat_store_dword v[0:1], v5 -; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v41 -; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v42, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 4, v40 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v41, vcc ; GFX7-NEXT: flat_store_dword v[0:1], v2 -; GFX7-NEXT: flat_store_dword v[41:42], v8 -; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX7-NEXT: v_readlane_b32 s31, v40, 1 -; GFX7-NEXT: v_readlane_b32 s30, v40, 0 -; GFX7-NEXT: v_readlane_b32 s4, v40, 2 +; GFX7-NEXT: flat_store_dword v[40:41], v8 +; GFX7-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX7-NEXT: v_readlane_b32 s31, v42, 1 +; GFX7-NEXT: v_readlane_b32 s30, v42, 0 +; GFX7-NEXT: v_readlane_b32 s4, v42, 2 ; GFX7-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX7-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX7-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[6:7] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 ; GFX7-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll index dda41ec131a31..ebbce68221a94 100644 --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -9,27 +9,27 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 16 +; GCN-NEXT: 
v_writelane_b32 v41, s16, 16 ; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v41, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s31, 1 +; GCN-NEXT: v_writelane_b32 v41, s34, 2 +; GCN-NEXT: v_writelane_b32 v41, s35, 3 +; GCN-NEXT: v_writelane_b32 v41, s36, 4 +; GCN-NEXT: v_writelane_b32 v41, s37, 5 +; GCN-NEXT: v_writelane_b32 v41, s38, 6 +; GCN-NEXT: v_writelane_b32 v41, s39, 7 +; GCN-NEXT: v_writelane_b32 v41, s40, 8 +; GCN-NEXT: v_writelane_b32 v41, s41, 9 +; GCN-NEXT: v_writelane_b32 v41, s42, 10 +; GCN-NEXT: v_writelane_b32 v41, s43, 11 +; GCN-NEXT: v_writelane_b32 v41, s44, 12 +; GCN-NEXT: v_writelane_b32 v41, s45, 13 +; GCN-NEXT: v_writelane_b32 v41, s46, 14 +; GCN-NEXT: v_writelane_b32 v41, s47, 15 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: flat_load_dword v0, v[0:1] @@ -58,9 +58,9 @@ define hidden void @widget() { ; GCN-NEXT: s_mov_b32 s43, s13 ; GCN-NEXT: s_mov_b32 s44, s14 ; GCN-NEXT: s_mov_b32 s45, s15 -; GCN-NEXT: v_mov_b32_e32 v41, v31 +; GCN-NEXT: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: v_mov_b32_e32 v31, v41 +; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_mov_b32 s12, s42 ; GCN-NEXT: s_mov_b32 s13, s43 ; GCN-NEXT: s_mov_b32 s14, s44 @@ -93,26 +93,26 @@ define hidden void @widget() { ; GCN-NEXT: s_addc_u32 s17, s17, wibble@rel32@hi+12 ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: .LBB0_8: ; %UnifiedReturnBlock -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: v_readlane_b32 s4, v40, 16 +; GCN-NEXT: v_readlane_b32 s47, v41, 15 +; GCN-NEXT: v_readlane_b32 s46, v41, 14 +; GCN-NEXT: v_readlane_b32 s45, v41, 13 +; GCN-NEXT: v_readlane_b32 s44, v41, 12 +; GCN-NEXT: v_readlane_b32 s43, v41, 11 +; GCN-NEXT: v_readlane_b32 s42, v41, 10 +; GCN-NEXT: v_readlane_b32 s41, v41, 9 +; GCN-NEXT: v_readlane_b32 s40, v41, 8 +; GCN-NEXT: v_readlane_b32 s39, v41, 7 +; GCN-NEXT: v_readlane_b32 s38, v41, 6 +; GCN-NEXT: v_readlane_b32 s37, v41, 5 +; GCN-NEXT: v_readlane_b32 s36, v41, 4 +; GCN-NEXT: v_readlane_b32 s35, v41, 3 +; 
GCN-NEXT: v_readlane_b32 s34, v41, 2 +; GCN-NEXT: v_readlane_b32 s31, v41, 1 +; GCN-NEXT: v_readlane_b32 s30, v41, 0 +; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: v_readlane_b32 s4, v41, 16 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -257,45 +257,45 @@ define hidden void @blam() { ; GCN-NEXT: s_mov_b32 s16, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 28 +; GCN-NEXT: v_writelane_b32 v45, s16, 28 ; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: v_writelane_b32 v40, s34, 2 -; GCN-NEXT: v_writelane_b32 v40, s35, 3 -; GCN-NEXT: v_writelane_b32 v40, s36, 4 -; GCN-NEXT: v_writelane_b32 v40, s37, 5 -; GCN-NEXT: v_writelane_b32 v40, s38, 6 -; GCN-NEXT: v_writelane_b32 v40, s39, 7 -; GCN-NEXT: v_writelane_b32 v40, s40, 8 -; GCN-NEXT: v_writelane_b32 v40, s41, 9 -; GCN-NEXT: v_writelane_b32 v40, s42, 10 -; GCN-NEXT: v_writelane_b32 v40, s43, 11 -; GCN-NEXT: v_writelane_b32 v40, s44, 12 -; GCN-NEXT: v_writelane_b32 v40, s45, 13 -; GCN-NEXT: v_writelane_b32 v40, s46, 14 -; GCN-NEXT: v_writelane_b32 v40, s47, 15 -; GCN-NEXT: v_writelane_b32 v40, s48, 16 -; GCN-NEXT: v_writelane_b32 v40, s49, 17 -; GCN-NEXT: v_writelane_b32 v40, s50, 18 -; GCN-NEXT: v_writelane_b32 v40, s51, 19 -; GCN-NEXT: v_writelane_b32 v40, s52, 20 -; GCN-NEXT: v_writelane_b32 v40, s53, 21 -; GCN-NEXT: v_writelane_b32 v40, s54, 22 -; GCN-NEXT: v_writelane_b32 v40, s55, 23 -; GCN-NEXT: v_writelane_b32 v40, s56, 24 -; GCN-NEXT: v_writelane_b32 v40, s57, 25 -; GCN-NEXT: v_writelane_b32 v40, s58, 26 -; GCN-NEXT: v_writelane_b32 v40, s59, 27 +; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: v_writelane_b32 v45, s30, 0 +; GCN-NEXT: v_writelane_b32 v45, s31, 1 +; GCN-NEXT: v_writelane_b32 v45, s34, 2 +; GCN-NEXT: v_writelane_b32 v45, s35, 3 +; GCN-NEXT: v_writelane_b32 v45, s36, 4 +; GCN-NEXT: v_writelane_b32 v45, s37, 5 +; GCN-NEXT: v_writelane_b32 v45, s38, 6 +; GCN-NEXT: v_writelane_b32 v45, s39, 7 +; GCN-NEXT: v_writelane_b32 v45, s40, 8 +; GCN-NEXT: v_writelane_b32 v45, s41, 9 +; GCN-NEXT: v_writelane_b32 v45, s42, 10 +; GCN-NEXT: v_writelane_b32 v45, s43, 11 +; GCN-NEXT: v_writelane_b32 
v45, s44, 12 +; GCN-NEXT: v_writelane_b32 v45, s45, 13 +; GCN-NEXT: v_writelane_b32 v45, s46, 14 +; GCN-NEXT: v_writelane_b32 v45, s47, 15 +; GCN-NEXT: v_writelane_b32 v45, s48, 16 +; GCN-NEXT: v_writelane_b32 v45, s49, 17 +; GCN-NEXT: v_writelane_b32 v45, s50, 18 +; GCN-NEXT: v_writelane_b32 v45, s51, 19 +; GCN-NEXT: v_writelane_b32 v45, s52, 20 +; GCN-NEXT: v_writelane_b32 v45, s53, 21 +; GCN-NEXT: v_writelane_b32 v45, s54, 22 +; GCN-NEXT: v_writelane_b32 v45, s55, 23 +; GCN-NEXT: v_writelane_b32 v45, s56, 24 +; GCN-NEXT: v_writelane_b32 v45, s57, 25 +; GCN-NEXT: v_writelane_b32 v45, s58, 26 +; GCN-NEXT: v_writelane_b32 v45, s59, 27 ; GCN-NEXT: s_mov_b64 s[34:35], s[6:7] -; GCN-NEXT: v_mov_b32_e32 v41, v31 +; GCN-NEXT: v_mov_b32_e32 v40, v31 ; GCN-NEXT: s_mov_b32 s46, s15 ; GCN-NEXT: s_mov_b32 s47, s14 ; GCN-NEXT: s_mov_b32 s48, s13 @@ -305,18 +305,18 @@ define hidden void @blam() { ; GCN-NEXT: s_mov_b64 s[40:41], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: v_mov_b32_e32 v1, 0 -; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v41 -; GCN-NEXT: v_mov_b32_e32 v43, 0 -; GCN-NEXT: flat_load_dword v44, v[0:1] +; GCN-NEXT: v_and_b32_e32 v2, 0x3ff, v40 +; GCN-NEXT: v_mov_b32_e32 v42, 0 +; GCN-NEXT: flat_load_dword v43, v[0:1] ; GCN-NEXT: s_mov_b64 s[50:51], 0 ; GCN-NEXT: s_getpc_b64 s[52:53] ; GCN-NEXT: s_add_u32 s52, s52, spam@rel32@lo+4 ; GCN-NEXT: s_addc_u32 s53, s53, spam@rel32@hi+12 -; GCN-NEXT: v_lshlrev_b32_e32 v42, 2, v2 +; GCN-NEXT: v_lshlrev_b32_e32 v41, 2, v2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_cmp_eq_f32_e64 s[54:55], 0, v44 -; GCN-NEXT: v_cmp_neq_f32_e64 s[42:43], 0, v44 -; GCN-NEXT: v_mov_b32_e32 v45, 0x7fc00000 +; GCN-NEXT: v_cmp_eq_f32_e64 s[54:55], 0, v43 +; GCN-NEXT: v_cmp_neq_f32_e64 s[42:43], 0, v43 +; GCN-NEXT: v_mov_b32_e32 v44, 0x7fc00000 ; GCN-NEXT: s_branch .LBB1_2 ; GCN-NEXT: .LBB1_1: ; %Flow7 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -327,8 +327,8 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_18 ; GCN-NEXT: .LBB1_2: ; %bb2 ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-NEXT: flat_load_dword v0, v[42:43] -; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 +; GCN-NEXT: flat_load_dword v0, v[41:42] +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], 0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_waitcnt vmcnt(1) ; GCN-NEXT: v_cmp_lt_i32_e32 vcc, 2, v0 @@ -351,7 +351,7 @@ define hidden void @blam() { ; GCN-NEXT: s_mov_b32 s13, s48 ; GCN-NEXT: s_mov_b32 s14, s47 ; GCN-NEXT: s_mov_b32 s15, s46 -; GCN-NEXT: v_mov_b32_e32 v31, v41 +; GCN-NEXT: v_mov_b32_e32 v31, v40 ; GCN-NEXT: s_swappc_b64 s[30:31], s[52:53] ; GCN-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 ; GCN-NEXT: s_mov_b64 s[6:7], 0 @@ -364,7 +364,7 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_7 ; GCN-NEXT: ; %bb.6: ; %bb16 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_or_b64 s[8:9], s[54:55], exec ; GCN-NEXT: .LBB1_7: ; %Flow3 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -376,7 +376,7 @@ define hidden void @blam() { ; GCN-NEXT: ; %bb.8: ; %bb17 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_mov_b64 s[6:7], exec -; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], 0 ; GCN-NEXT: .LBB1_9: ; %Flow4 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 ; GCN-NEXT: s_or_b64 exec, exec, s[8:9] @@ -406,7 +406,7 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_15 ; GCN-NEXT: ; %bb.14: ; 
%bb10 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_or_b64 s[10:11], s[6:7], exec ; GCN-NEXT: .LBB1_15: ; %Flow6 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 @@ -424,47 +424,47 @@ define hidden void @blam() { ; GCN-NEXT: s_cbranch_execz .LBB1_1 ; GCN-NEXT: ; %bb.17: ; %bb18 ; GCN-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GCN-NEXT: buffer_store_dword v45, off, s[0:3], 0 +; GCN-NEXT: buffer_store_dword v44, off, s[0:3], 0 ; GCN-NEXT: s_andn2_b64 s[4:5], s[4:5], exec ; GCN-NEXT: s_branch .LBB1_1 ; GCN-NEXT: .LBB1_18: ; %DummyReturnBlock ; GCN-NEXT: s_or_b64 exec, exec, s[50:51] -; GCN-NEXT: v_readlane_b32 s59, v40, 27 -; GCN-NEXT: v_readlane_b32 s58, v40, 26 -; GCN-NEXT: v_readlane_b32 s57, v40, 25 -; GCN-NEXT: v_readlane_b32 s56, v40, 24 -; GCN-NEXT: v_readlane_b32 s55, v40, 23 -; GCN-NEXT: v_readlane_b32 s54, v40, 22 -; GCN-NEXT: v_readlane_b32 s53, v40, 21 -; GCN-NEXT: v_readlane_b32 s52, v40, 20 -; GCN-NEXT: v_readlane_b32 s51, v40, 19 -; GCN-NEXT: v_readlane_b32 s50, v40, 18 -; GCN-NEXT: v_readlane_b32 s49, v40, 17 -; GCN-NEXT: v_readlane_b32 s48, v40, 16 -; GCN-NEXT: v_readlane_b32 s47, v40, 15 -; GCN-NEXT: v_readlane_b32 s46, v40, 14 -; GCN-NEXT: v_readlane_b32 s45, v40, 13 -; GCN-NEXT: v_readlane_b32 s44, v40, 12 -; GCN-NEXT: v_readlane_b32 s43, v40, 11 -; GCN-NEXT: v_readlane_b32 s42, v40, 10 -; GCN-NEXT: v_readlane_b32 s41, v40, 9 -; GCN-NEXT: v_readlane_b32 s40, v40, 8 -; GCN-NEXT: v_readlane_b32 s39, v40, 7 -; GCN-NEXT: v_readlane_b32 s38, v40, 6 -; GCN-NEXT: v_readlane_b32 s37, v40, 5 -; GCN-NEXT: v_readlane_b32 s36, v40, 4 -; GCN-NEXT: v_readlane_b32 s35, v40, 3 -; GCN-NEXT: v_readlane_b32 s34, v40, 2 -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: v_readlane_b32 s4, v40, 28 +; GCN-NEXT: v_readlane_b32 s59, v45, 27 +; GCN-NEXT: v_readlane_b32 s58, v45, 26 +; GCN-NEXT: v_readlane_b32 s57, v45, 25 +; GCN-NEXT: v_readlane_b32 s56, v45, 24 +; GCN-NEXT: v_readlane_b32 s55, v45, 23 +; GCN-NEXT: v_readlane_b32 s54, v45, 22 +; GCN-NEXT: v_readlane_b32 s53, v45, 21 +; GCN-NEXT: v_readlane_b32 s52, v45, 20 +; GCN-NEXT: v_readlane_b32 s51, v45, 19 +; GCN-NEXT: v_readlane_b32 s50, v45, 18 +; GCN-NEXT: v_readlane_b32 s49, v45, 17 +; GCN-NEXT: v_readlane_b32 s48, v45, 16 +; GCN-NEXT: v_readlane_b32 s47, v45, 15 +; GCN-NEXT: v_readlane_b32 s46, v45, 14 +; GCN-NEXT: v_readlane_b32 s45, v45, 13 +; GCN-NEXT: v_readlane_b32 s44, v45, 12 +; GCN-NEXT: v_readlane_b32 s43, v45, 11 +; GCN-NEXT: v_readlane_b32 s42, v45, 10 +; GCN-NEXT: v_readlane_b32 s41, v45, 9 +; GCN-NEXT: v_readlane_b32 s40, v45, 8 +; GCN-NEXT: v_readlane_b32 s39, v45, 7 +; GCN-NEXT: v_readlane_b32 s38, v45, 6 +; GCN-NEXT: v_readlane_b32 s37, v45, 5 +; GCN-NEXT: v_readlane_b32 s36, v45, 4 +; GCN-NEXT: v_readlane_b32 s35, v45, 3 +; GCN-NEXT: v_readlane_b32 s34, v45, 2 +; GCN-NEXT: v_readlane_b32 s31, v45, 1 +; GCN-NEXT: v_readlane_b32 s30, v45, 0 +; GCN-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], 
s33 offset:4 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload
+; GCN-NEXT: v_readlane_b32 s4, v45, 28
 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1
-; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
+; GCN-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload
 ; GCN-NEXT: s_mov_b64 exec, s[6:7]
 ; GCN-NEXT: s_addk_i32 s32, 0xf800
 ; GCN-NEXT: s_mov_b32 s33, s4
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
index f3276719ac13c..5ced02f28c977 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-large-tuple-alloc-error.ll
@@ -1,188 +1,214 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx900 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX900 %s
 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx906 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s
-; RUN: not --crash llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs < %s 2>&1 | FileCheck -check-prefix=GFX908-ERR %s
+; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s
 ; RUN: llc -mtriple amdgcn-amd-amdhsa -mcpu=gfx90a -O0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90a %s
-; This test crashes for gfx908 while allocating the tuple. Compared to the other subtargets,
+; This test used to crash for gfx908 while allocating the tuple. Compared to the other subtargets,
 ; gfx908 marks an extra VGPR reserved for AGPR to VGPR copy that puts more register pressure.
-
-; GFX908-ERR: error: ran out of registers during register allocation
+; To minimize the register pressure, the VGPRs used for CSR SGPR spilling have been picked from the
+; higher available range, thereby allowing more VGPRs to be available in the lowest range for allocation.
define i32 @test_tuple(<16 x i64> %0) { ; GFX900-LABEL: test_tuple: ; GFX900: ; %bb.0: ; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX900-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX900-NEXT: s_mov_b64 exec, s[4:5] -; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX900-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX900-NEXT: v_writelane_b32 v31, s36, 0 -; GFX900-NEXT: v_writelane_b32 v31, s37, 1 -; GFX900-NEXT: v_writelane_b32 v31, s38, 2 -; GFX900-NEXT: v_writelane_b32 v31, s39, 3 -; GFX900-NEXT: v_writelane_b32 v31, s40, 4 -; GFX900-NEXT: v_writelane_b32 v31, s41, 5 -; GFX900-NEXT: v_writelane_b32 v31, s42, 6 -; GFX900-NEXT: v_writelane_b32 v31, s43, 7 -; GFX900-NEXT: v_writelane_b32 v31, s44, 8 -; GFX900-NEXT: v_writelane_b32 v31, s45, 9 -; GFX900-NEXT: v_writelane_b32 v31, s46, 10 -; GFX900-NEXT: v_writelane_b32 v31, s47, 11 -; GFX900-NEXT: v_writelane_b32 v31, s48, 12 -; GFX900-NEXT: v_writelane_b32 v31, s49, 13 -; GFX900-NEXT: v_writelane_b32 v31, s50, 14 -; GFX900-NEXT: v_writelane_b32 v31, s51, 15 -; GFX900-NEXT: v_writelane_b32 v31, s52, 16 -; GFX900-NEXT: v_writelane_b32 v31, s53, 17 -; GFX900-NEXT: v_writelane_b32 v31, s54, 18 -; GFX900-NEXT: v_writelane_b32 v31, s55, 19 -; GFX900-NEXT: v_writelane_b32 v31, s56, 20 -; GFX900-NEXT: v_writelane_b32 v31, s57, 21 -; GFX900-NEXT: v_writelane_b32 v31, s58, 22 -; GFX900-NEXT: v_writelane_b32 v31, s59, 23 -; GFX900-NEXT: v_writelane_b32 v31, s60, 24 -; GFX900-NEXT: v_writelane_b32 v31, s61, 25 -; GFX900-NEXT: v_writelane_b32 v31, s62, 26 -; GFX900-NEXT: v_writelane_b32 v31, s63, 27 -; GFX900-NEXT: v_writelane_b32 v31, s64, 28 -; GFX900-NEXT: v_writelane_b32 v31, s65, 29 -; GFX900-NEXT: v_writelane_b32 v31, s66, 30 -; GFX900-NEXT: v_writelane_b32 v31, s67, 31 -; GFX900-NEXT: v_mov_b32_e32 v32, v0 -; GFX900-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; 
GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; implicit-def: $sgpr4 -; GFX900-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec -; GFX900-NEXT: v_mov_b32_e32 v33, v1 -; GFX900-NEXT: v_mov_b32_e32 v34, v2 -; GFX900-NEXT: v_mov_b32_e32 v35, v3 -; GFX900-NEXT: v_mov_b32_e32 v36, v4 -; GFX900-NEXT: v_mov_b32_e32 v37, v5 -; GFX900-NEXT: v_mov_b32_e32 v38, v6 -; GFX900-NEXT: v_mov_b32_e32 v39, v7 -; GFX900-NEXT: v_mov_b32_e32 v40, v8 -; GFX900-NEXT: v_mov_b32_e32 v41, v9 -; GFX900-NEXT: v_mov_b32_e32 v42, v10 -; GFX900-NEXT: v_mov_b32_e32 v43, v11 -; GFX900-NEXT: v_mov_b32_e32 v44, v12 -; GFX900-NEXT: v_mov_b32_e32 v45, v13 -; GFX900-NEXT: v_mov_b32_e32 v46, v14 -; GFX900-NEXT: v_mov_b32_e32 v47, v15 -; GFX900-NEXT: v_mov_b32_e32 v48, v16 -; GFX900-NEXT: v_mov_b32_e32 v49, v17 -; GFX900-NEXT: v_mov_b32_e32 v50, v18 -; GFX900-NEXT: v_mov_b32_e32 v51, v19 -; GFX900-NEXT: v_mov_b32_e32 v52, v20 -; GFX900-NEXT: v_mov_b32_e32 v53, v21 -; GFX900-NEXT: v_mov_b32_e32 v54, v22 -; GFX900-NEXT: v_mov_b32_e32 v55, v23 -; GFX900-NEXT: v_mov_b32_e32 v56, v24 -; GFX900-NEXT: v_mov_b32_e32 v57, v25 -; GFX900-NEXT: v_mov_b32_e32 v58, v26 -; GFX900-NEXT: v_mov_b32_e32 v59, v27 -; GFX900-NEXT: v_mov_b32_e32 v60, v28 -; GFX900-NEXT: v_mov_b32_e32 v61, v29 -; GFX900-NEXT: v_mov_b32_e32 v62, v30 -; GFX900-NEXT: ; kill: def $vgpr63 killed $vgpr0 killed $exec +; GFX900-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v58, off, s[0:3], s32 
offset:20 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX900-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX900-NEXT: v_writelane_b32 v63, s36, 0 +; GFX900-NEXT: v_writelane_b32 v63, s37, 1 +; GFX900-NEXT: v_writelane_b32 v63, s38, 2 +; GFX900-NEXT: v_writelane_b32 v63, s39, 3 +; GFX900-NEXT: v_writelane_b32 v63, s40, 4 +; GFX900-NEXT: v_writelane_b32 v63, s41, 5 +; GFX900-NEXT: v_writelane_b32 v63, s42, 6 +; GFX900-NEXT: v_writelane_b32 v63, s43, 7 +; GFX900-NEXT: v_writelane_b32 v63, s44, 8 +; GFX900-NEXT: v_writelane_b32 v63, s45, 9 +; GFX900-NEXT: v_writelane_b32 v63, s46, 10 +; GFX900-NEXT: v_writelane_b32 v63, s47, 11 +; GFX900-NEXT: v_writelane_b32 v63, s48, 12 +; GFX900-NEXT: v_writelane_b32 v63, s49, 13 +; GFX900-NEXT: v_writelane_b32 v63, s50, 14 +; GFX900-NEXT: v_writelane_b32 v63, s51, 15 +; GFX900-NEXT: v_writelane_b32 v63, s52, 16 +; GFX900-NEXT: v_writelane_b32 v63, s53, 17 +; GFX900-NEXT: v_writelane_b32 v63, s54, 18 +; GFX900-NEXT: v_writelane_b32 v63, s55, 19 +; GFX900-NEXT: v_writelane_b32 v63, s56, 20 +; GFX900-NEXT: v_writelane_b32 v63, s57, 21 +; GFX900-NEXT: v_writelane_b32 v63, s58, 22 +; GFX900-NEXT: v_writelane_b32 v63, s59, 23 +; GFX900-NEXT: v_writelane_b32 v63, s60, 24 +; GFX900-NEXT: v_writelane_b32 v63, s61, 25 +; GFX900-NEXT: v_writelane_b32 v63, s62, 26 +; GFX900-NEXT: v_writelane_b32 v63, s63, 27 +; GFX900-NEXT: v_writelane_b32 v63, s64, 28 +; GFX900-NEXT: v_writelane_b32 v63, s65, 29 +; GFX900-NEXT: v_writelane_b32 v63, s66, 30 +; GFX900-NEXT: v_writelane_b32 v63, s67, 31 +; GFX900-NEXT: v_mov_b32_e32 v33, v30 +; GFX900-NEXT: v_mov_b32_e32 v34, v29 +; GFX900-NEXT: v_mov_b32_e32 v35, v28 +; GFX900-NEXT: v_mov_b32_e32 v36, v27 +; GFX900-NEXT: v_mov_b32_e32 v37, v26 +; GFX900-NEXT: v_mov_b32_e32 v38, v25 +; GFX900-NEXT: v_mov_b32_e32 v39, v24 +; GFX900-NEXT: v_mov_b32_e32 v48, v23 +; GFX900-NEXT: v_mov_b32_e32 v49, v22 +; GFX900-NEXT: v_mov_b32_e32 v50, v21 +; GFX900-NEXT: v_mov_b32_e32 v51, v20 +; GFX900-NEXT: v_mov_b32_e32 v52, v19 +; GFX900-NEXT: v_mov_b32_e32 v53, v18 +; GFX900-NEXT: v_mov_b32_e32 v54, v17 +; GFX900-NEXT: v_mov_b32_e32 v55, v16 +; GFX900-NEXT: v_mov_b32_e32 v40, v15 +; GFX900-NEXT: v_mov_b32_e32 v41, v14 +; GFX900-NEXT: v_mov_b32_e32 v42, v13 +; GFX900-NEXT: v_mov_b32_e32 v43, v12 +; GFX900-NEXT: v_mov_b32_e32 v44, v11 +; GFX900-NEXT: v_mov_b32_e32 v45, v10 +; GFX900-NEXT: v_mov_b32_e32 v46, v9 +; GFX900-NEXT: v_mov_b32_e32 v47, v8 +; GFX900-NEXT: v_mov_b32_e32 v56, v7 +; GFX900-NEXT: v_mov_b32_e32 v57, v6 +; GFX900-NEXT: v_mov_b32_e32 v58, v5 +; GFX900-NEXT: v_mov_b32_e32 v59, v4 +; GFX900-NEXT: v_mov_b32_e32 v60, v3 +; GFX900-NEXT: v_mov_b32_e32 v61, v2 +; GFX900-NEXT: v_mov_b32_e32 v62, v1 +; GFX900-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; 
GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; implicit-def: $sgpr4 +; GFX900-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GFX900-NEXT: v_mov_b32_e32 v1, v62 +; GFX900-NEXT: v_mov_b32_e32 v2, v61 +; GFX900-NEXT: v_mov_b32_e32 v3, v60 +; GFX900-NEXT: v_mov_b32_e32 v4, v59 +; GFX900-NEXT: v_mov_b32_e32 v5, v58 +; GFX900-NEXT: v_mov_b32_e32 v6, v57 +; GFX900-NEXT: v_mov_b32_e32 v7, v56 +; GFX900-NEXT: v_mov_b32_e32 v8, v47 +; GFX900-NEXT: v_mov_b32_e32 v9, v46 +; GFX900-NEXT: v_mov_b32_e32 v10, v45 +; GFX900-NEXT: v_mov_b32_e32 v11, v44 +; GFX900-NEXT: v_mov_b32_e32 v12, v43 +; GFX900-NEXT: v_mov_b32_e32 v13, v42 +; GFX900-NEXT: v_mov_b32_e32 v14, v41 +; GFX900-NEXT: v_mov_b32_e32 v15, v40 +; GFX900-NEXT: v_mov_b32_e32 v16, v55 +; GFX900-NEXT: v_mov_b32_e32 v17, v54 +; GFX900-NEXT: v_mov_b32_e32 v18, v53 +; GFX900-NEXT: v_mov_b32_e32 v19, v52 +; GFX900-NEXT: v_mov_b32_e32 v20, v51 +; GFX900-NEXT: v_mov_b32_e32 v21, v50 +; GFX900-NEXT: v_mov_b32_e32 v22, v49 +; GFX900-NEXT: v_mov_b32_e32 v23, v48 +; GFX900-NEXT: v_mov_b32_e32 v24, v39 +; GFX900-NEXT: v_mov_b32_e32 v25, v38 +; GFX900-NEXT: v_mov_b32_e32 v26, v37 +; GFX900-NEXT: v_mov_b32_e32 v27, v36 +; GFX900-NEXT: v_mov_b32_e32 v28, v35 +; GFX900-NEXT: v_mov_b32_e32 v29, v34 +; GFX900-NEXT: v_mov_b32_e32 v30, v33 +; GFX900-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX900-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: v_mov_b32_e32 v0, 0 -; GFX900-NEXT: v_readlane_b32 s67, v31, 31 -; GFX900-NEXT: v_readlane_b32 s66, v31, 30 -; GFX900-NEXT: v_readlane_b32 s65, v31, 29 -; GFX900-NEXT: v_readlane_b32 s64, v31, 28 -; GFX900-NEXT: v_readlane_b32 s63, v31, 27 -; GFX900-NEXT: v_readlane_b32 s62, v31, 26 -; GFX900-NEXT: v_readlane_b32 s61, v31, 25 -; GFX900-NEXT: v_readlane_b32 s60, v31, 24 -; GFX900-NEXT: v_readlane_b32 s59, v31, 23 -; GFX900-NEXT: v_readlane_b32 s58, v31, 22 -; GFX900-NEXT: v_readlane_b32 s57, v31, 21 -; GFX900-NEXT: v_readlane_b32 s56, v31, 20 -; GFX900-NEXT: v_readlane_b32 s55, v31, 19 -; GFX900-NEXT: v_readlane_b32 s54, v31, 18 -; GFX900-NEXT: v_readlane_b32 s53, v31, 17 -; GFX900-NEXT: v_readlane_b32 s52, v31, 16 -; GFX900-NEXT: v_readlane_b32 s51, v31, 15 -; GFX900-NEXT: v_readlane_b32 s50, v31, 14 -; GFX900-NEXT: v_readlane_b32 s49, v31, 13 -; GFX900-NEXT: v_readlane_b32 s48, v31, 12 -; GFX900-NEXT: v_readlane_b32 s47, v31, 11 -; GFX900-NEXT: v_readlane_b32 s46, v31, 10 -; GFX900-NEXT: 
v_readlane_b32 s45, v31, 9 -; GFX900-NEXT: v_readlane_b32 s44, v31, 8 -; GFX900-NEXT: v_readlane_b32 s43, v31, 7 -; GFX900-NEXT: v_readlane_b32 s42, v31, 6 -; GFX900-NEXT: v_readlane_b32 s41, v31, 5 -; GFX900-NEXT: v_readlane_b32 s40, v31, 4 -; GFX900-NEXT: v_readlane_b32 s39, v31, 3 -; GFX900-NEXT: v_readlane_b32 s38, v31, 2 -; GFX900-NEXT: v_readlane_b32 s37, v31, 1 -; GFX900-NEXT: v_readlane_b32 s36, v31, 0 -; GFX900-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX900-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX900-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX900-NEXT: v_readlane_b32 s67, v63, 31 +; GFX900-NEXT: v_readlane_b32 s66, v63, 30 +; GFX900-NEXT: v_readlane_b32 s65, v63, 29 +; GFX900-NEXT: v_readlane_b32 s64, v63, 28 +; GFX900-NEXT: v_readlane_b32 s63, v63, 27 +; GFX900-NEXT: v_readlane_b32 s62, v63, 26 +; GFX900-NEXT: v_readlane_b32 s61, v63, 25 +; GFX900-NEXT: v_readlane_b32 s60, v63, 24 +; GFX900-NEXT: v_readlane_b32 s59, v63, 23 +; GFX900-NEXT: v_readlane_b32 s58, v63, 22 +; GFX900-NEXT: v_readlane_b32 s57, v63, 21 +; GFX900-NEXT: v_readlane_b32 s56, v63, 20 +; GFX900-NEXT: v_readlane_b32 s55, v63, 19 +; GFX900-NEXT: v_readlane_b32 s54, v63, 18 +; GFX900-NEXT: v_readlane_b32 s53, v63, 17 +; GFX900-NEXT: v_readlane_b32 s52, v63, 16 +; GFX900-NEXT: v_readlane_b32 s51, v63, 15 +; GFX900-NEXT: v_readlane_b32 s50, v63, 14 +; GFX900-NEXT: v_readlane_b32 s49, v63, 13 +; GFX900-NEXT: v_readlane_b32 s48, v63, 12 +; GFX900-NEXT: v_readlane_b32 s47, v63, 11 +; GFX900-NEXT: v_readlane_b32 s46, v63, 10 +; GFX900-NEXT: v_readlane_b32 s45, v63, 9 +; GFX900-NEXT: v_readlane_b32 s44, v63, 8 +; GFX900-NEXT: v_readlane_b32 s43, v63, 7 +; GFX900-NEXT: v_readlane_b32 s42, v63, 6 +; GFX900-NEXT: v_readlane_b32 s41, v63, 5 +; GFX900-NEXT: v_readlane_b32 s40, v63, 4 +; GFX900-NEXT: v_readlane_b32 s39, v63, 3 +; GFX900-NEXT: v_readlane_b32 s38, v63, 2 +; GFX900-NEXT: v_readlane_b32 s37, v63, 1 +; GFX900-NEXT: v_readlane_b32 s36, v63, 0 +; GFX900-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; 
GFX900-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX900-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX900-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX900-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX900-NEXT: s_mov_b64 exec, s[4:5] ; GFX900-NEXT: s_waitcnt vmcnt(0) ; GFX900-NEXT: s_setpc_b64 s[30:31] @@ -190,185 +216,416 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX906-LABEL: test_tuple: ; GFX906: ; %bb.0: ; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX906-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX906-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX906-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GFX906-NEXT: s_mov_b64 exec, s[4:5] -; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: v_writelane_b32 v31, s36, 0 -; GFX906-NEXT: v_writelane_b32 v31, s37, 1 -; GFX906-NEXT: v_writelane_b32 v31, s38, 2 -; GFX906-NEXT: v_writelane_b32 v31, s39, 3 -; GFX906-NEXT: v_writelane_b32 v31, s40, 4 -; GFX906-NEXT: v_writelane_b32 v31, s41, 5 
-; GFX906-NEXT: v_writelane_b32 v31, s42, 6 -; GFX906-NEXT: v_writelane_b32 v31, s43, 7 -; GFX906-NEXT: v_writelane_b32 v31, s44, 8 -; GFX906-NEXT: v_writelane_b32 v31, s45, 9 -; GFX906-NEXT: v_writelane_b32 v31, s46, 10 -; GFX906-NEXT: v_writelane_b32 v31, s47, 11 -; GFX906-NEXT: v_writelane_b32 v31, s48, 12 -; GFX906-NEXT: v_writelane_b32 v31, s49, 13 -; GFX906-NEXT: v_writelane_b32 v31, s50, 14 -; GFX906-NEXT: v_writelane_b32 v31, s51, 15 -; GFX906-NEXT: v_writelane_b32 v31, s52, 16 -; GFX906-NEXT: v_writelane_b32 v31, s53, 17 -; GFX906-NEXT: v_writelane_b32 v31, s54, 18 -; GFX906-NEXT: v_writelane_b32 v31, s55, 19 -; GFX906-NEXT: v_writelane_b32 v31, s56, 20 -; GFX906-NEXT: v_writelane_b32 v31, s57, 21 -; GFX906-NEXT: v_writelane_b32 v31, s58, 22 -; GFX906-NEXT: v_writelane_b32 v31, s59, 23 -; GFX906-NEXT: v_writelane_b32 v31, s60, 24 -; GFX906-NEXT: v_writelane_b32 v31, s61, 25 -; GFX906-NEXT: v_writelane_b32 v31, s62, 26 -; GFX906-NEXT: v_writelane_b32 v31, s63, 27 -; GFX906-NEXT: v_writelane_b32 v31, s64, 28 -; GFX906-NEXT: v_writelane_b32 v31, s65, 29 -; GFX906-NEXT: v_writelane_b32 v31, s66, 30 -; GFX906-NEXT: v_writelane_b32 v31, s67, 31 -; GFX906-NEXT: v_mov_b32_e32 v32, v0 -; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s32 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; implicit-def: $sgpr4 -; GFX906-NEXT: ; kill: def $vgpr32 killed $vgpr32 def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec -; GFX906-NEXT: v_mov_b32_e32 v33, v1 -; GFX906-NEXT: v_mov_b32_e32 v34, v2 -; GFX906-NEXT: v_mov_b32_e32 v35, v3 -; GFX906-NEXT: v_mov_b32_e32 v36, v4 -; GFX906-NEXT: v_mov_b32_e32 v37, v5 -; GFX906-NEXT: v_mov_b32_e32 v38, v6 -; GFX906-NEXT: v_mov_b32_e32 v39, v7 -; GFX906-NEXT: v_mov_b32_e32 v40, v8 -; GFX906-NEXT: v_mov_b32_e32 v41, v9 -; GFX906-NEXT: v_mov_b32_e32 v42, v10 -; GFX906-NEXT: v_mov_b32_e32 v43, v11 -; GFX906-NEXT: v_mov_b32_e32 v44, v12 -; GFX906-NEXT: v_mov_b32_e32 v45, v13 -; GFX906-NEXT: v_mov_b32_e32 v46, v14 -; GFX906-NEXT: v_mov_b32_e32 v47, v15 -; GFX906-NEXT: v_mov_b32_e32 v48, v16 -; GFX906-NEXT: v_mov_b32_e32 v49, v17 -; GFX906-NEXT: v_mov_b32_e32 v50, v18 -; GFX906-NEXT: v_mov_b32_e32 v51, v19 -; GFX906-NEXT: 
v_mov_b32_e32 v52, v20 -; GFX906-NEXT: v_mov_b32_e32 v53, v21 -; GFX906-NEXT: v_mov_b32_e32 v54, v22 -; GFX906-NEXT: v_mov_b32_e32 v55, v23 -; GFX906-NEXT: v_mov_b32_e32 v56, v24 -; GFX906-NEXT: v_mov_b32_e32 v57, v25 -; GFX906-NEXT: v_mov_b32_e32 v58, v26 -; GFX906-NEXT: v_mov_b32_e32 v59, v27 -; GFX906-NEXT: v_mov_b32_e32 v60, v28 -; GFX906-NEXT: v_mov_b32_e32 v61, v29 -; GFX906-NEXT: v_mov_b32_e32 v62, v30 -; GFX906-NEXT: ; kill: def $vgpr63 killed $vgpr0 killed $exec +; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GFX906-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX906-NEXT: v_writelane_b32 v63, s36, 0 +; GFX906-NEXT: v_writelane_b32 v63, s37, 1 +; GFX906-NEXT: v_writelane_b32 v63, s38, 2 +; GFX906-NEXT: v_writelane_b32 v63, s39, 3 +; GFX906-NEXT: v_writelane_b32 v63, s40, 4 +; GFX906-NEXT: v_writelane_b32 v63, s41, 5 +; GFX906-NEXT: v_writelane_b32 v63, s42, 6 +; GFX906-NEXT: v_writelane_b32 v63, s43, 7 +; GFX906-NEXT: v_writelane_b32 v63, s44, 8 +; GFX906-NEXT: v_writelane_b32 v63, s45, 9 +; GFX906-NEXT: v_writelane_b32 v63, s46, 10 +; GFX906-NEXT: v_writelane_b32 v63, s47, 11 +; GFX906-NEXT: v_writelane_b32 v63, s48, 12 +; GFX906-NEXT: v_writelane_b32 v63, s49, 13 +; GFX906-NEXT: v_writelane_b32 v63, s50, 14 +; GFX906-NEXT: v_writelane_b32 v63, s51, 15 +; GFX906-NEXT: v_writelane_b32 v63, s52, 16 +; GFX906-NEXT: v_writelane_b32 v63, s53, 17 +; GFX906-NEXT: v_writelane_b32 v63, s54, 18 +; GFX906-NEXT: v_writelane_b32 v63, s55, 19 +; GFX906-NEXT: v_writelane_b32 v63, s56, 20 +; GFX906-NEXT: v_writelane_b32 v63, s57, 21 +; GFX906-NEXT: v_writelane_b32 v63, s58, 22 +; GFX906-NEXT: v_writelane_b32 v63, s59, 23 +; GFX906-NEXT: v_writelane_b32 v63, s60, 24 +; GFX906-NEXT: v_writelane_b32 v63, s61, 25 +; GFX906-NEXT: v_writelane_b32 v63, s62, 26 +; GFX906-NEXT: v_writelane_b32 v63, s63, 27 +; GFX906-NEXT: v_writelane_b32 v63, s64, 28 +; GFX906-NEXT: v_writelane_b32 v63, s65, 29 +; GFX906-NEXT: v_writelane_b32 v63, s66, 30 +; GFX906-NEXT: v_writelane_b32 v63, s67, 31 +; GFX906-NEXT: v_mov_b32_e32 v33, v30 +; GFX906-NEXT: v_mov_b32_e32 v34, v29 +; GFX906-NEXT: v_mov_b32_e32 v35, v28 +; GFX906-NEXT: v_mov_b32_e32 v36, v27 +; GFX906-NEXT: v_mov_b32_e32 v37, v26 +; GFX906-NEXT: v_mov_b32_e32 v38, v25 +; GFX906-NEXT: v_mov_b32_e32 v39, v24 +; GFX906-NEXT: v_mov_b32_e32 v48, v23 +; 
GFX906-NEXT: v_mov_b32_e32 v49, v22 +; GFX906-NEXT: v_mov_b32_e32 v50, v21 +; GFX906-NEXT: v_mov_b32_e32 v51, v20 +; GFX906-NEXT: v_mov_b32_e32 v52, v19 +; GFX906-NEXT: v_mov_b32_e32 v53, v18 +; GFX906-NEXT: v_mov_b32_e32 v54, v17 +; GFX906-NEXT: v_mov_b32_e32 v55, v16 +; GFX906-NEXT: v_mov_b32_e32 v40, v15 +; GFX906-NEXT: v_mov_b32_e32 v41, v14 +; GFX906-NEXT: v_mov_b32_e32 v42, v13 +; GFX906-NEXT: v_mov_b32_e32 v43, v12 +; GFX906-NEXT: v_mov_b32_e32 v44, v11 +; GFX906-NEXT: v_mov_b32_e32 v45, v10 +; GFX906-NEXT: v_mov_b32_e32 v46, v9 +; GFX906-NEXT: v_mov_b32_e32 v47, v8 +; GFX906-NEXT: v_mov_b32_e32 v56, v7 +; GFX906-NEXT: v_mov_b32_e32 v57, v6 +; GFX906-NEXT: v_mov_b32_e32 v58, v5 +; GFX906-NEXT: v_mov_b32_e32 v59, v4 +; GFX906-NEXT: v_mov_b32_e32 v60, v3 +; GFX906-NEXT: v_mov_b32_e32 v61, v2 +; GFX906-NEXT: v_mov_b32_e32 v62, v1 +; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s32 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; implicit-def: $sgpr4 +; GFX906-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GFX906-NEXT: v_mov_b32_e32 v1, v62 +; GFX906-NEXT: v_mov_b32_e32 v2, v61 +; GFX906-NEXT: v_mov_b32_e32 v3, v60 +; GFX906-NEXT: v_mov_b32_e32 v4, v59 +; GFX906-NEXT: v_mov_b32_e32 v5, v58 +; GFX906-NEXT: v_mov_b32_e32 v6, v57 +; GFX906-NEXT: v_mov_b32_e32 v7, v56 +; GFX906-NEXT: v_mov_b32_e32 v8, v47 +; GFX906-NEXT: v_mov_b32_e32 v9, v46 +; GFX906-NEXT: v_mov_b32_e32 v10, v45 +; GFX906-NEXT: v_mov_b32_e32 v11, v44 +; GFX906-NEXT: v_mov_b32_e32 v12, v43 +; GFX906-NEXT: v_mov_b32_e32 v13, v42 +; GFX906-NEXT: v_mov_b32_e32 v14, v41 +; GFX906-NEXT: v_mov_b32_e32 v15, v40 +; GFX906-NEXT: v_mov_b32_e32 v16, v55 +; GFX906-NEXT: v_mov_b32_e32 v17, v54 +; GFX906-NEXT: v_mov_b32_e32 v18, v53 +; GFX906-NEXT: v_mov_b32_e32 v19, v52 +; GFX906-NEXT: v_mov_b32_e32 v20, v51 +; GFX906-NEXT: v_mov_b32_e32 v21, v50 +; GFX906-NEXT: v_mov_b32_e32 v22, v49 +; GFX906-NEXT: v_mov_b32_e32 v23, v48 +; GFX906-NEXT: v_mov_b32_e32 v24, v39 +; GFX906-NEXT: v_mov_b32_e32 v25, v38 +; GFX906-NEXT: v_mov_b32_e32 v26, v37 +; GFX906-NEXT: v_mov_b32_e32 v27, v36 +; GFX906-NEXT: v_mov_b32_e32 v28, v35 +; GFX906-NEXT: v_mov_b32_e32 v29, v34 
+; GFX906-NEXT: v_mov_b32_e32 v30, v33 +; GFX906-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX906-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: v_mov_b32_e32 v0, 0 -; GFX906-NEXT: v_readlane_b32 s67, v31, 31 -; GFX906-NEXT: v_readlane_b32 s66, v31, 30 -; GFX906-NEXT: v_readlane_b32 s65, v31, 29 -; GFX906-NEXT: v_readlane_b32 s64, v31, 28 -; GFX906-NEXT: v_readlane_b32 s63, v31, 27 -; GFX906-NEXT: v_readlane_b32 s62, v31, 26 -; GFX906-NEXT: v_readlane_b32 s61, v31, 25 -; GFX906-NEXT: v_readlane_b32 s60, v31, 24 -; GFX906-NEXT: v_readlane_b32 s59, v31, 23 -; GFX906-NEXT: v_readlane_b32 s58, v31, 22 -; GFX906-NEXT: v_readlane_b32 s57, v31, 21 -; GFX906-NEXT: v_readlane_b32 s56, v31, 20 -; GFX906-NEXT: v_readlane_b32 s55, v31, 19 -; GFX906-NEXT: v_readlane_b32 s54, v31, 18 -; GFX906-NEXT: v_readlane_b32 s53, v31, 17 -; GFX906-NEXT: v_readlane_b32 s52, v31, 16 -; GFX906-NEXT: v_readlane_b32 s51, v31, 15 -; GFX906-NEXT: v_readlane_b32 s50, v31, 14 -; GFX906-NEXT: v_readlane_b32 s49, v31, 13 -; GFX906-NEXT: v_readlane_b32 s48, v31, 12 -; GFX906-NEXT: v_readlane_b32 s47, v31, 11 -; GFX906-NEXT: v_readlane_b32 s46, v31, 10 -; GFX906-NEXT: v_readlane_b32 s45, v31, 9 -; GFX906-NEXT: v_readlane_b32 s44, v31, 8 -; GFX906-NEXT: v_readlane_b32 s43, v31, 7 -; GFX906-NEXT: v_readlane_b32 s42, v31, 6 -; GFX906-NEXT: v_readlane_b32 s41, v31, 5 -; GFX906-NEXT: v_readlane_b32 s40, v31, 4 -; GFX906-NEXT: v_readlane_b32 s39, v31, 3 -; GFX906-NEXT: v_readlane_b32 s38, v31, 2 -; GFX906-NEXT: v_readlane_b32 s37, v31, 1 -; GFX906-NEXT: v_readlane_b32 s36, v31, 0 -; GFX906-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX906-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX906-NEXT: v_readlane_b32 s67, v63, 31 +; GFX906-NEXT: v_readlane_b32 s66, v63, 30 +; GFX906-NEXT: v_readlane_b32 s65, v63, 29 +; GFX906-NEXT: v_readlane_b32 s64, 
v63, 28 +; GFX906-NEXT: v_readlane_b32 s63, v63, 27 +; GFX906-NEXT: v_readlane_b32 s62, v63, 26 +; GFX906-NEXT: v_readlane_b32 s61, v63, 25 +; GFX906-NEXT: v_readlane_b32 s60, v63, 24 +; GFX906-NEXT: v_readlane_b32 s59, v63, 23 +; GFX906-NEXT: v_readlane_b32 s58, v63, 22 +; GFX906-NEXT: v_readlane_b32 s57, v63, 21 +; GFX906-NEXT: v_readlane_b32 s56, v63, 20 +; GFX906-NEXT: v_readlane_b32 s55, v63, 19 +; GFX906-NEXT: v_readlane_b32 s54, v63, 18 +; GFX906-NEXT: v_readlane_b32 s53, v63, 17 +; GFX906-NEXT: v_readlane_b32 s52, v63, 16 +; GFX906-NEXT: v_readlane_b32 s51, v63, 15 +; GFX906-NEXT: v_readlane_b32 s50, v63, 14 +; GFX906-NEXT: v_readlane_b32 s49, v63, 13 +; GFX906-NEXT: v_readlane_b32 s48, v63, 12 +; GFX906-NEXT: v_readlane_b32 s47, v63, 11 +; GFX906-NEXT: v_readlane_b32 s46, v63, 10 +; GFX906-NEXT: v_readlane_b32 s45, v63, 9 +; GFX906-NEXT: v_readlane_b32 s44, v63, 8 +; GFX906-NEXT: v_readlane_b32 s43, v63, 7 +; GFX906-NEXT: v_readlane_b32 s42, v63, 6 +; GFX906-NEXT: v_readlane_b32 s41, v63, 5 +; GFX906-NEXT: v_readlane_b32 s40, v63, 4 +; GFX906-NEXT: v_readlane_b32 s39, v63, 3 +; GFX906-NEXT: v_readlane_b32 s38, v63, 2 +; GFX906-NEXT: v_readlane_b32 s37, v63, 1 +; GFX906-NEXT: v_readlane_b32 s36, v63, 0 +; GFX906-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v61, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v60, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v59, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v58, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v57, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v56, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX906-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX906-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX906-NEXT: s_mov_b64 exec, s[4:5] ; GFX906-NEXT: s_waitcnt vmcnt(0) ; GFX906-NEXT: s_setpc_b64 s[30:31] ; +; GFX908-LABEL: test_tuple: +; GFX908: ; %bb.0: +; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX908-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX908-NEXT: buffer_store_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX908-NEXT: s_mov_b64 exec, s[4:5] +; GFX908-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a2, v42 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a3, v43 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a4, v44 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a5, v45 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a6, v46 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a7, v47 ; Reload 
Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a8, v56 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a9, v57 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a10, v58 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a11, v59 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse +; GFX908-NEXT: v_writelane_b32 v62, s36, 0 +; GFX908-NEXT: v_writelane_b32 v62, s37, 1 +; GFX908-NEXT: v_writelane_b32 v62, s38, 2 +; GFX908-NEXT: v_writelane_b32 v62, s39, 3 +; GFX908-NEXT: v_writelane_b32 v62, s40, 4 +; GFX908-NEXT: v_writelane_b32 v62, s41, 5 +; GFX908-NEXT: v_writelane_b32 v62, s42, 6 +; GFX908-NEXT: v_writelane_b32 v62, s43, 7 +; GFX908-NEXT: v_writelane_b32 v62, s44, 8 +; GFX908-NEXT: v_writelane_b32 v62, s45, 9 +; GFX908-NEXT: v_writelane_b32 v62, s46, 10 +; GFX908-NEXT: v_writelane_b32 v62, s47, 11 +; GFX908-NEXT: v_writelane_b32 v62, s48, 12 +; GFX908-NEXT: v_writelane_b32 v62, s49, 13 +; GFX908-NEXT: v_writelane_b32 v62, s50, 14 +; GFX908-NEXT: v_writelane_b32 v62, s51, 15 +; GFX908-NEXT: v_writelane_b32 v62, s52, 16 +; GFX908-NEXT: v_writelane_b32 v62, s53, 17 +; GFX908-NEXT: v_writelane_b32 v62, s54, 18 +; GFX908-NEXT: v_writelane_b32 v62, s55, 19 +; GFX908-NEXT: v_writelane_b32 v62, s56, 20 +; GFX908-NEXT: v_writelane_b32 v62, s57, 21 +; GFX908-NEXT: v_writelane_b32 v62, s58, 22 +; GFX908-NEXT: v_writelane_b32 v62, s59, 23 +; GFX908-NEXT: v_writelane_b32 v62, s60, 24 +; GFX908-NEXT: v_writelane_b32 v62, s61, 25 +; GFX908-NEXT: v_writelane_b32 v62, s62, 26 +; GFX908-NEXT: v_writelane_b32 v62, s63, 27 +; GFX908-NEXT: v_writelane_b32 v62, s64, 28 +; GFX908-NEXT: v_writelane_b32 v62, s65, 29 +; GFX908-NEXT: v_writelane_b32 v62, s66, 30 +; GFX908-NEXT: v_writelane_b32 v62, s67, 31 +; GFX908-NEXT: v_mov_b32_e32 v33, v30 +; GFX908-NEXT: v_mov_b32_e32 v34, v29 +; GFX908-NEXT: v_mov_b32_e32 v35, v28 +; GFX908-NEXT: v_mov_b32_e32 v36, v27 +; GFX908-NEXT: v_mov_b32_e32 v37, v26 +; GFX908-NEXT: v_mov_b32_e32 v38, v25 +; GFX908-NEXT: v_mov_b32_e32 v39, v24 +; GFX908-NEXT: v_mov_b32_e32 v48, v23 +; GFX908-NEXT: v_mov_b32_e32 v49, v22 +; GFX908-NEXT: v_mov_b32_e32 v50, v21 +; GFX908-NEXT: v_mov_b32_e32 v51, v20 +; GFX908-NEXT: v_mov_b32_e32 v52, v19 +; GFX908-NEXT: v_mov_b32_e32 v53, v18 +; GFX908-NEXT: v_mov_b32_e32 v54, v17 +; GFX908-NEXT: v_mov_b32_e32 v55, v16 +; GFX908-NEXT: v_mov_b32_e32 v40, v15 +; GFX908-NEXT: v_mov_b32_e32 v41, v14 +; GFX908-NEXT: v_mov_b32_e32 v42, v13 +; GFX908-NEXT: v_mov_b32_e32 v43, v12 +; GFX908-NEXT: v_mov_b32_e32 v44, v11 +; GFX908-NEXT: v_mov_b32_e32 v45, v10 +; GFX908-NEXT: v_mov_b32_e32 v46, v9 +; GFX908-NEXT: v_mov_b32_e32 v47, v8 +; GFX908-NEXT: v_mov_b32_e32 v56, v7 +; GFX908-NEXT: v_mov_b32_e32 v57, v6 +; GFX908-NEXT: v_mov_b32_e32 v58, v5 +; GFX908-NEXT: v_mov_b32_e32 v59, v4 +; GFX908-NEXT: v_mov_b32_e32 v60, v3 +; GFX908-NEXT: v_mov_b32_e32 v61, v2 +; GFX908-NEXT: v_mov_b32_e32 v32, v1 +; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s32 +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: v_accvgpr_write_b32 a14, v1 ; Reload Reuse +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; 
implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; implicit-def: $sgpr4 +; GFX908-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GFX908-NEXT: v_mov_b32_e32 v1, v32 +; GFX908-NEXT: s_nop 1 +; GFX908-NEXT: v_accvgpr_read_b32 v32, a14 ; Reload Reuse +; GFX908-NEXT: v_mov_b32_e32 v2, v61 +; GFX908-NEXT: v_mov_b32_e32 v3, v60 +; GFX908-NEXT: v_mov_b32_e32 v4, v59 +; GFX908-NEXT: v_mov_b32_e32 v5, v58 +; GFX908-NEXT: v_mov_b32_e32 v6, v57 +; GFX908-NEXT: v_mov_b32_e32 v7, v56 +; GFX908-NEXT: v_mov_b32_e32 v8, v47 +; GFX908-NEXT: v_mov_b32_e32 v9, v46 +; GFX908-NEXT: v_mov_b32_e32 v10, v45 +; GFX908-NEXT: v_mov_b32_e32 v11, v44 +; GFX908-NEXT: v_mov_b32_e32 v12, v43 +; GFX908-NEXT: v_mov_b32_e32 v13, v42 +; GFX908-NEXT: v_mov_b32_e32 v14, v41 +; GFX908-NEXT: v_mov_b32_e32 v15, v40 +; GFX908-NEXT: v_mov_b32_e32 v16, v55 +; GFX908-NEXT: v_mov_b32_e32 v17, v54 +; GFX908-NEXT: v_mov_b32_e32 v18, v53 +; GFX908-NEXT: v_mov_b32_e32 v19, v52 +; GFX908-NEXT: v_mov_b32_e32 v20, v51 +; GFX908-NEXT: v_mov_b32_e32 v21, v50 +; GFX908-NEXT: v_mov_b32_e32 v22, v49 +; GFX908-NEXT: v_mov_b32_e32 v23, v48 +; GFX908-NEXT: v_mov_b32_e32 v24, v39 +; GFX908-NEXT: v_mov_b32_e32 v25, v38 +; GFX908-NEXT: v_mov_b32_e32 v26, v37 +; GFX908-NEXT: v_mov_b32_e32 v27, v36 +; GFX908-NEXT: v_mov_b32_e32 v28, v35 +; GFX908-NEXT: v_mov_b32_e32 v29, v34 +; GFX908-NEXT: v_mov_b32_e32 v30, v33 +; GFX908-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec +; GFX908-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 +; GFX908-NEXT: v_mov_b32_e32 v0, 0 +; GFX908-NEXT: v_readlane_b32 s67, v62, 31 +; GFX908-NEXT: v_readlane_b32 s66, v62, 30 +; GFX908-NEXT: v_readlane_b32 s65, v62, 29 +; GFX908-NEXT: v_readlane_b32 s64, v62, 28 +; GFX908-NEXT: v_readlane_b32 s63, v62, 27 +; GFX908-NEXT: v_readlane_b32 s62, v62, 26 +; GFX908-NEXT: v_readlane_b32 s61, v62, 25 +; GFX908-NEXT: v_readlane_b32 s60, v62, 24 +; GFX908-NEXT: v_readlane_b32 s59, v62, 23 +; GFX908-NEXT: v_readlane_b32 s58, v62, 22 +; GFX908-NEXT: v_readlane_b32 s57, v62, 21 +; GFX908-NEXT: v_readlane_b32 s56, v62, 20 +; GFX908-NEXT: v_readlane_b32 s55, v62, 19 +; GFX908-NEXT: v_readlane_b32 s54, v62, 18 +; GFX908-NEXT: v_readlane_b32 s53, v62, 17 +; GFX908-NEXT: v_readlane_b32 s52, v62, 16 +; GFX908-NEXT: v_readlane_b32 s51, v62, 15 +; GFX908-NEXT: v_readlane_b32 s50, v62, 14 +; GFX908-NEXT: v_readlane_b32 s49, v62, 13 +; GFX908-NEXT: v_readlane_b32 s48, 
v62, 12 +; GFX908-NEXT: v_readlane_b32 s47, v62, 11 +; GFX908-NEXT: v_readlane_b32 s46, v62, 10 +; GFX908-NEXT: v_readlane_b32 s45, v62, 9 +; GFX908-NEXT: v_readlane_b32 s44, v62, 8 +; GFX908-NEXT: v_readlane_b32 s43, v62, 7 +; GFX908-NEXT: v_readlane_b32 s42, v62, 6 +; GFX908-NEXT: v_readlane_b32 s41, v62, 5 +; GFX908-NEXT: v_readlane_b32 s40, v62, 4 +; GFX908-NEXT: v_readlane_b32 s39, v62, 3 +; GFX908-NEXT: v_readlane_b32 s38, v62, 2 +; GFX908-NEXT: v_readlane_b32 s37, v62, 1 +; GFX908-NEXT: v_readlane_b32 s36, v62, 0 +; GFX908-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v59, a11 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v58, a10 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v57, a9 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v56, a8 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v47, a7 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v46, a6 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v45, a5 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v44, a4 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v43, a3 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse +; GFX908-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse +; GFX908-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX908-NEXT: buffer_load_dword v62, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX908-NEXT: s_mov_b64 exec, s[4:5] +; GFX908-NEXT: s_waitcnt vmcnt(0) +; GFX908-NEXT: s_setpc_b64 s[30:31] +; ; GFX90a-LABEL: test_tuple: ; GFX90a: ; %bb.0: ; GFX90a-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90a-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX90a-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX90a-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX90a-NEXT: buffer_store_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GFX90a-NEXT: s_mov_b64 exec, s[4:5] ; GFX90a-NEXT: v_accvgpr_write_b32 a0, v40 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_write_b32 a1, v41 ; Reload Reuse @@ -385,41 +642,69 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: v_accvgpr_write_b32 a12, v60 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_write_b32 a13, v61 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_write_b32 a14, v62 ; Reload Reuse -; GFX90a-NEXT: v_accvgpr_write_b32 a15, v63 ; Reload Reuse -; GFX90a-NEXT: v_writelane_b32 v31, s36, 0 -; GFX90a-NEXT: v_writelane_b32 v31, s37, 1 -; GFX90a-NEXT: v_writelane_b32 v31, s38, 2 -; GFX90a-NEXT: v_writelane_b32 v31, s39, 3 -; GFX90a-NEXT: v_writelane_b32 v31, s40, 4 -; GFX90a-NEXT: v_writelane_b32 v31, s41, 5 -; GFX90a-NEXT: v_writelane_b32 v31, s42, 6 -; GFX90a-NEXT: v_writelane_b32 v31, s43, 7 -; GFX90a-NEXT: v_writelane_b32 v31, s44, 8 -; GFX90a-NEXT: v_writelane_b32 v31, s45, 9 -; GFX90a-NEXT: v_writelane_b32 v31, s46, 10 -; GFX90a-NEXT: v_writelane_b32 v31, s47, 11 -; GFX90a-NEXT: v_writelane_b32 v31, s48, 12 -; GFX90a-NEXT: v_writelane_b32 v31, s49, 13 -; GFX90a-NEXT: v_writelane_b32 v31, s50, 14 -; GFX90a-NEXT: v_writelane_b32 v31, s51, 15 -; GFX90a-NEXT: v_writelane_b32 v31, s52, 16 -; GFX90a-NEXT: v_writelane_b32 v31, s53, 17 -; GFX90a-NEXT: v_writelane_b32 v31, s54, 18 -; GFX90a-NEXT: v_writelane_b32 v31, s55, 19 -; GFX90a-NEXT: v_writelane_b32 v31, s56, 20 -; GFX90a-NEXT: v_writelane_b32 v31, s57, 21 -; GFX90a-NEXT: v_writelane_b32 v31, s58, 22 -; GFX90a-NEXT: v_writelane_b32 v31, s59, 23 -; GFX90a-NEXT: v_writelane_b32 v31, s60, 24 -; GFX90a-NEXT: 
v_writelane_b32 v31, s61, 25 -; GFX90a-NEXT: v_writelane_b32 v31, s62, 26 -; GFX90a-NEXT: v_writelane_b32 v31, s63, 27 -; GFX90a-NEXT: v_writelane_b32 v31, s64, 28 -; GFX90a-NEXT: v_writelane_b32 v31, s65, 29 -; GFX90a-NEXT: v_writelane_b32 v31, s66, 30 -; GFX90a-NEXT: v_writelane_b32 v31, s67, 31 -; GFX90a-NEXT: v_mov_b32_e32 v32, v0 -; GFX90a-NEXT: buffer_load_dword v0, off, s[0:3], s32 +; GFX90a-NEXT: v_writelane_b32 v63, s36, 0 +; GFX90a-NEXT: v_writelane_b32 v63, s37, 1 +; GFX90a-NEXT: v_writelane_b32 v63, s38, 2 +; GFX90a-NEXT: v_writelane_b32 v63, s39, 3 +; GFX90a-NEXT: v_writelane_b32 v63, s40, 4 +; GFX90a-NEXT: v_writelane_b32 v63, s41, 5 +; GFX90a-NEXT: v_writelane_b32 v63, s42, 6 +; GFX90a-NEXT: v_writelane_b32 v63, s43, 7 +; GFX90a-NEXT: v_writelane_b32 v63, s44, 8 +; GFX90a-NEXT: v_writelane_b32 v63, s45, 9 +; GFX90a-NEXT: v_writelane_b32 v63, s46, 10 +; GFX90a-NEXT: v_writelane_b32 v63, s47, 11 +; GFX90a-NEXT: v_writelane_b32 v63, s48, 12 +; GFX90a-NEXT: v_writelane_b32 v63, s49, 13 +; GFX90a-NEXT: v_writelane_b32 v63, s50, 14 +; GFX90a-NEXT: v_writelane_b32 v63, s51, 15 +; GFX90a-NEXT: v_writelane_b32 v63, s52, 16 +; GFX90a-NEXT: v_writelane_b32 v63, s53, 17 +; GFX90a-NEXT: v_writelane_b32 v63, s54, 18 +; GFX90a-NEXT: v_writelane_b32 v63, s55, 19 +; GFX90a-NEXT: v_writelane_b32 v63, s56, 20 +; GFX90a-NEXT: v_writelane_b32 v63, s57, 21 +; GFX90a-NEXT: v_writelane_b32 v63, s58, 22 +; GFX90a-NEXT: v_writelane_b32 v63, s59, 23 +; GFX90a-NEXT: v_writelane_b32 v63, s60, 24 +; GFX90a-NEXT: v_writelane_b32 v63, s61, 25 +; GFX90a-NEXT: v_writelane_b32 v63, s62, 26 +; GFX90a-NEXT: v_writelane_b32 v63, s63, 27 +; GFX90a-NEXT: v_writelane_b32 v63, s64, 28 +; GFX90a-NEXT: v_writelane_b32 v63, s65, 29 +; GFX90a-NEXT: v_writelane_b32 v63, s66, 30 +; GFX90a-NEXT: v_writelane_b32 v63, s67, 31 +; GFX90a-NEXT: v_mov_b32_e32 v33, v30 +; GFX90a-NEXT: v_mov_b32_e32 v34, v29 +; GFX90a-NEXT: v_mov_b32_e32 v35, v28 +; GFX90a-NEXT: v_mov_b32_e32 v36, v27 +; GFX90a-NEXT: v_mov_b32_e32 v37, v26 +; GFX90a-NEXT: v_mov_b32_e32 v38, v25 +; GFX90a-NEXT: v_mov_b32_e32 v39, v24 +; GFX90a-NEXT: v_mov_b32_e32 v48, v23 +; GFX90a-NEXT: v_mov_b32_e32 v49, v22 +; GFX90a-NEXT: v_mov_b32_e32 v50, v21 +; GFX90a-NEXT: v_mov_b32_e32 v51, v20 +; GFX90a-NEXT: v_mov_b32_e32 v52, v19 +; GFX90a-NEXT: v_mov_b32_e32 v53, v18 +; GFX90a-NEXT: v_mov_b32_e32 v54, v17 +; GFX90a-NEXT: v_mov_b32_e32 v55, v16 +; GFX90a-NEXT: v_mov_b32_e32 v40, v15 +; GFX90a-NEXT: v_mov_b32_e32 v41, v14 +; GFX90a-NEXT: v_mov_b32_e32 v42, v13 +; GFX90a-NEXT: v_mov_b32_e32 v43, v12 +; GFX90a-NEXT: v_mov_b32_e32 v44, v11 +; GFX90a-NEXT: v_mov_b32_e32 v45, v10 +; GFX90a-NEXT: v_mov_b32_e32 v46, v9 +; GFX90a-NEXT: v_mov_b32_e32 v47, v8 +; GFX90a-NEXT: v_mov_b32_e32 v56, v7 +; GFX90a-NEXT: v_mov_b32_e32 v57, v6 +; GFX90a-NEXT: v_mov_b32_e32 v58, v5 +; GFX90a-NEXT: v_mov_b32_e32 v59, v4 +; GFX90a-NEXT: v_mov_b32_e32 v60, v3 +; GFX90a-NEXT: v_mov_b32_e32 v61, v2 +; GFX90a-NEXT: v_mov_b32_e32 v62, v1 +; GFX90a-NEXT: buffer_load_dword v32, off, s[0:3], s32 ; GFX90a-NEXT: ; implicit-def: $sgpr4 ; GFX90a-NEXT: ; implicit-def: $sgpr4 ; GFX90a-NEXT: ; implicit-def: $sgpr4 @@ -452,74 +737,72 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: ; implicit-def: $sgpr4 ; GFX90a-NEXT: ; implicit-def: $sgpr4 ; GFX90a-NEXT: ; implicit-def: $sgpr4 -; GFX90a-NEXT: ; kill: def $vgpr32 killed $vgpr32 def 
$vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 killed $exec -; GFX90a-NEXT: v_mov_b32_e32 v33, v1 -; GFX90a-NEXT: v_mov_b32_e32 v34, v2 -; GFX90a-NEXT: v_mov_b32_e32 v35, v3 -; GFX90a-NEXT: v_mov_b32_e32 v36, v4 -; GFX90a-NEXT: v_mov_b32_e32 v37, v5 -; GFX90a-NEXT: v_mov_b32_e32 v38, v6 -; GFX90a-NEXT: v_mov_b32_e32 v39, v7 -; GFX90a-NEXT: v_mov_b32_e32 v40, v8 -; GFX90a-NEXT: v_mov_b32_e32 v41, v9 -; GFX90a-NEXT: v_mov_b32_e32 v42, v10 -; GFX90a-NEXT: v_mov_b32_e32 v43, v11 -; GFX90a-NEXT: v_mov_b32_e32 v44, v12 -; GFX90a-NEXT: v_mov_b32_e32 v45, v13 -; GFX90a-NEXT: v_mov_b32_e32 v46, v14 -; GFX90a-NEXT: v_mov_b32_e32 v47, v15 -; GFX90a-NEXT: v_mov_b32_e32 v48, v16 -; GFX90a-NEXT: v_mov_b32_e32 v49, v17 -; GFX90a-NEXT: v_mov_b32_e32 v50, v18 -; GFX90a-NEXT: v_mov_b32_e32 v51, v19 -; GFX90a-NEXT: v_mov_b32_e32 v52, v20 -; GFX90a-NEXT: v_mov_b32_e32 v53, v21 -; GFX90a-NEXT: v_mov_b32_e32 v54, v22 -; GFX90a-NEXT: v_mov_b32_e32 v55, v23 -; GFX90a-NEXT: v_mov_b32_e32 v56, v24 -; GFX90a-NEXT: v_mov_b32_e32 v57, v25 -; GFX90a-NEXT: v_mov_b32_e32 v58, v26 -; GFX90a-NEXT: v_mov_b32_e32 v59, v27 -; GFX90a-NEXT: v_mov_b32_e32 v60, v28 -; GFX90a-NEXT: v_mov_b32_e32 v61, v29 -; GFX90a-NEXT: v_mov_b32_e32 v62, v30 -; GFX90a-NEXT: ; kill: def $vgpr63 killed $vgpr0 killed $exec +; GFX90a-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec +; GFX90a-NEXT: v_mov_b32_e32 v1, v62 +; GFX90a-NEXT: v_mov_b32_e32 v2, v61 +; GFX90a-NEXT: v_mov_b32_e32 v3, v60 +; GFX90a-NEXT: v_mov_b32_e32 v4, v59 +; GFX90a-NEXT: v_mov_b32_e32 v5, v58 +; GFX90a-NEXT: v_mov_b32_e32 v6, v57 +; GFX90a-NEXT: v_mov_b32_e32 v7, v56 +; GFX90a-NEXT: v_mov_b32_e32 v8, v47 +; GFX90a-NEXT: v_mov_b32_e32 v9, v46 +; GFX90a-NEXT: v_mov_b32_e32 v10, v45 +; GFX90a-NEXT: v_mov_b32_e32 v11, v44 +; GFX90a-NEXT: v_mov_b32_e32 v12, v43 +; GFX90a-NEXT: v_mov_b32_e32 v13, v42 +; GFX90a-NEXT: v_mov_b32_e32 v14, v41 +; GFX90a-NEXT: v_mov_b32_e32 v15, v40 +; GFX90a-NEXT: v_mov_b32_e32 v16, v55 +; GFX90a-NEXT: v_mov_b32_e32 v17, v54 +; GFX90a-NEXT: v_mov_b32_e32 v18, v53 +; GFX90a-NEXT: v_mov_b32_e32 v19, v52 +; GFX90a-NEXT: v_mov_b32_e32 v20, v51 +; GFX90a-NEXT: v_mov_b32_e32 v21, v50 +; GFX90a-NEXT: v_mov_b32_e32 v22, v49 +; GFX90a-NEXT: v_mov_b32_e32 v23, v48 +; GFX90a-NEXT: v_mov_b32_e32 v24, v39 +; GFX90a-NEXT: v_mov_b32_e32 v25, v38 +; GFX90a-NEXT: v_mov_b32_e32 v26, v37 +; GFX90a-NEXT: v_mov_b32_e32 v27, v36 +; GFX90a-NEXT: v_mov_b32_e32 v28, v35 +; GFX90a-NEXT: v_mov_b32_e32 v29, v34 +; GFX90a-NEXT: v_mov_b32_e32 v30, v33 +; GFX90a-NEXT: ; kill: def $vgpr31 killed $vgpr32 killed $exec ; GFX90a-NEXT: ; implicit-def: $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 -; GFX90a-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NEXT: v_mov_b32_e32 v0, 0 -; GFX90a-NEXT: v_readlane_b32 s67, v31, 31 -; GFX90a-NEXT: v_readlane_b32 s66, v31, 30 -; GFX90a-NEXT: v_readlane_b32 s65, v31, 29 -; GFX90a-NEXT: v_readlane_b32 s64, v31, 28 -; GFX90a-NEXT: v_readlane_b32 s63, v31, 27 -; GFX90a-NEXT: v_readlane_b32 s62, v31, 
26 -; GFX90a-NEXT: v_readlane_b32 s61, v31, 25 -; GFX90a-NEXT: v_readlane_b32 s60, v31, 24 -; GFX90a-NEXT: v_readlane_b32 s59, v31, 23 -; GFX90a-NEXT: v_readlane_b32 s58, v31, 22 -; GFX90a-NEXT: v_readlane_b32 s57, v31, 21 -; GFX90a-NEXT: v_readlane_b32 s56, v31, 20 -; GFX90a-NEXT: v_readlane_b32 s55, v31, 19 -; GFX90a-NEXT: v_readlane_b32 s54, v31, 18 -; GFX90a-NEXT: v_readlane_b32 s53, v31, 17 -; GFX90a-NEXT: v_readlane_b32 s52, v31, 16 -; GFX90a-NEXT: v_readlane_b32 s51, v31, 15 -; GFX90a-NEXT: v_readlane_b32 s50, v31, 14 -; GFX90a-NEXT: v_readlane_b32 s49, v31, 13 -; GFX90a-NEXT: v_readlane_b32 s48, v31, 12 -; GFX90a-NEXT: v_readlane_b32 s47, v31, 11 -; GFX90a-NEXT: v_readlane_b32 s46, v31, 10 -; GFX90a-NEXT: v_readlane_b32 s45, v31, 9 -; GFX90a-NEXT: v_readlane_b32 s44, v31, 8 -; GFX90a-NEXT: v_readlane_b32 s43, v31, 7 -; GFX90a-NEXT: v_readlane_b32 s42, v31, 6 -; GFX90a-NEXT: v_readlane_b32 s41, v31, 5 -; GFX90a-NEXT: v_readlane_b32 s40, v31, 4 -; GFX90a-NEXT: v_readlane_b32 s39, v31, 3 -; GFX90a-NEXT: v_readlane_b32 s38, v31, 2 -; GFX90a-NEXT: v_readlane_b32 s37, v31, 1 -; GFX90a-NEXT: v_readlane_b32 s36, v31, 0 -; GFX90a-NEXT: v_accvgpr_read_b32 v63, a15 ; Reload Reuse +; GFX90a-NEXT: v_readlane_b32 s67, v63, 31 +; GFX90a-NEXT: v_readlane_b32 s66, v63, 30 +; GFX90a-NEXT: v_readlane_b32 s65, v63, 29 +; GFX90a-NEXT: v_readlane_b32 s64, v63, 28 +; GFX90a-NEXT: v_readlane_b32 s63, v63, 27 +; GFX90a-NEXT: v_readlane_b32 s62, v63, 26 +; GFX90a-NEXT: v_readlane_b32 s61, v63, 25 +; GFX90a-NEXT: v_readlane_b32 s60, v63, 24 +; GFX90a-NEXT: v_readlane_b32 s59, v63, 23 +; GFX90a-NEXT: v_readlane_b32 s58, v63, 22 +; GFX90a-NEXT: v_readlane_b32 s57, v63, 21 +; GFX90a-NEXT: v_readlane_b32 s56, v63, 20 +; GFX90a-NEXT: v_readlane_b32 s55, v63, 19 +; GFX90a-NEXT: v_readlane_b32 s54, v63, 18 +; GFX90a-NEXT: v_readlane_b32 s53, v63, 17 +; GFX90a-NEXT: v_readlane_b32 s52, v63, 16 +; GFX90a-NEXT: v_readlane_b32 s51, v63, 15 +; GFX90a-NEXT: v_readlane_b32 s50, v63, 14 +; GFX90a-NEXT: v_readlane_b32 s49, v63, 13 +; GFX90a-NEXT: v_readlane_b32 s48, v63, 12 +; GFX90a-NEXT: v_readlane_b32 s47, v63, 11 +; GFX90a-NEXT: v_readlane_b32 s46, v63, 10 +; GFX90a-NEXT: v_readlane_b32 s45, v63, 9 +; GFX90a-NEXT: v_readlane_b32 s44, v63, 8 +; GFX90a-NEXT: v_readlane_b32 s43, v63, 7 +; GFX90a-NEXT: v_readlane_b32 s42, v63, 6 +; GFX90a-NEXT: v_readlane_b32 s41, v63, 5 +; GFX90a-NEXT: v_readlane_b32 s40, v63, 4 +; GFX90a-NEXT: v_readlane_b32 s39, v63, 3 +; GFX90a-NEXT: v_readlane_b32 s38, v63, 2 +; GFX90a-NEXT: v_readlane_b32 s37, v63, 1 +; GFX90a-NEXT: v_readlane_b32 s36, v63, 0 ; GFX90a-NEXT: v_accvgpr_read_b32 v62, a14 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_read_b32 v61, a13 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_read_b32 v60, a12 ; Reload Reuse @@ -535,8 +818,8 @@ define i32 @test_tuple(<16 x i64> %0) { ; GFX90a-NEXT: v_accvgpr_read_b32 v42, a2 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_read_b32 v41, a1 ; Reload Reuse ; GFX90a-NEXT: v_accvgpr_read_b32 v40, a0 ; Reload Reuse -; GFX90a-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX90a-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX90a-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GFX90a-NEXT: buffer_load_dword v63, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX90a-NEXT: s_mov_b64 exec, s[4:5] ; GFX90a-NEXT: s_waitcnt vmcnt(0) ; GFX90a-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll index 92efbe5a71826..2b96e10fd3cc3 100644 --- 
a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -14,17 +14,17 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v36, v16 ; GFX9-NEXT: v_mov_b32_e32 v35, v15 ; GFX9-NEXT: v_mov_b32_e32 v34, v14 ; GFX9-NEXT: v_mov_b32_e32 v33, v13 ; GFX9-NEXT: v_mov_b32_e32 v32, v12 -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART @@ -33,30 +33,30 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ;;#ASMEND -; GFX9-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 +; GFX9-NEXT: v_writelane_b32 v44, s4, 2 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v44, s30, 0 +; GFX9-NEXT: v_writelane_b32 v44, s31, 1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v0, v41 -; GFX9-NEXT: v_mov_b32_e32 v1, v42 -; GFX9-NEXT: v_mov_b32_e32 v2, v43 -; GFX9-NEXT: v_mov_b32_e32 v3, v44 -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_mov_b32_e32 v0, v40 +; GFX9-NEXT: v_mov_b32_e32 v1, v41 +; GFX9-NEXT: v_mov_b32_e32 v2, v42 +; GFX9-NEXT: v_mov_b32_e32 v3, v43 +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v44, 1 +; GFX9-NEXT: v_readlane_b32 s30, v44, 0 +; GFX9-NEXT: v_readlane_b32 s4, v44, 2 ; 
GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -69,7 +69,7 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v36, v16 @@ -77,10 +77,10 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: v_mov_b32_e32 v34, v14 ; GFX10-NEXT: v_mov_b32_e32 v33, v13 ; GFX10-NEXT: v_mov_b32_e32 v32, v12 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART @@ -89,31 +89,31 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v40, s4, 2 +; GFX10-NEXT: v_writelane_b32 v44, s4, 2 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_writelane_b32 v44, s30, 0 +; GFX10-NEXT: v_writelane_b32 v44, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: v_mov_b32_e32 v0, v41 -; GFX10-NEXT: v_mov_b32_e32 v1, v42 -; GFX10-NEXT: v_mov_b32_e32 v2, v43 -; GFX10-NEXT: v_mov_b32_e32 v3, v44 +; GFX10-NEXT: v_mov_b32_e32 v0, v40 +; GFX10-NEXT: v_mov_b32_e32 v1, v41 +; GFX10-NEXT: v_mov_b32_e32 v2, v42 +; GFX10-NEXT: v_mov_b32_e32 v3, v43 ; GFX10-NEXT: s_clause 0x3 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s4, v40, 2 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:4 +; GFX10-NEXT: 
buffer_load_dword v41, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 +; GFX10-NEXT: v_readlane_b32 s31, v44, 1 +; GFX10-NEXT: v_readlane_b32 s30, v44, 0 +; GFX10-NEXT: v_readlane_b32 s4, v44, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -127,16 +127,16 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:16 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 ; GFX11-NEXT: v_mov_b32_e32 v32, v12 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART @@ -145,29 +145,29 @@ define <4 x float> @non_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_gather4_c_b_cl v[40:43], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v44, s0, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v44, s30, 0 +; GFX11-NEXT: v_writelane_b32 v44, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: v_dual_mov_b32 v0, v41 :: v_dual_mov_b32 v1, v42 -; GFX11-NEXT: v_dual_mov_b32 v2, v43 :: v_dual_mov_b32 v3, v44 +; GFX11-NEXT: v_dual_mov_b32 v0, v40 :: v_dual_mov_b32 v1, v41 +; GFX11-NEXT: v_dual_mov_b32 v2, v42 :: v_dual_mov_b32 v3, v43 ; GFX11-NEXT: s_clause 0x3 -; GFX11-NEXT: scratch_load_b32 v44, off, s33 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:12 +; GFX11-NEXT: v_readlane_b32 s31, v44, 1 +; GFX11-NEXT: v_readlane_b32 s30, v44, 0 +; GFX11-NEXT: 
v_readlane_b32 s0, v44, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v44, off, s33 offset:16 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -204,43 +204,43 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX9-NEXT: s_mov_b32 s4, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-NEXT: v_mov_b32_e32 v45, v16 -; GFX9-NEXT: v_mov_b32_e32 v44, v15 -; GFX9-NEXT: v_mov_b32_e32 v43, v14 -; GFX9-NEXT: v_mov_b32_e32 v42, v13 -; GFX9-NEXT: v_mov_b32_e32 v41, v12 -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: v_mov_b32_e32 v44, v16 +; GFX9-NEXT: v_mov_b32_e32 v43, v15 +; GFX9-NEXT: v_mov_b32_e32 v42, v14 +; GFX9-NEXT: v_mov_b32_e32 v41, v13 +; GFX9-NEXT: v_mov_b32_e32 v40, v12 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 +; GFX9-NEXT: v_writelane_b32 v45, s4, 2 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX9-NEXT: v_writelane_b32 v40, s30, 0 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: v_writelane_b32 v45, s30, 0 +; GFX9-NEXT: v_writelane_b32 v45, s31, 1 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 +; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[40:44], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-NEXT: v_readlane_b32 s31, v40, 1 -; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 ; 4-byte 
Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: v_readlane_b32 s31, v45, 1 +; GFX9-NEXT: v_readlane_b32 s30, v45, 0 +; GFX9-NEXT: v_readlane_b32 s4, v45, 2 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -253,44 +253,44 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 -; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v44, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v40, s4, 2 +; GFX10-NEXT: v_writelane_b32 v45, s4, 2 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v41, v16 +; GFX10-NEXT: v_mov_b32_e32 v40, v16 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 -; GFX10-NEXT: v_mov_b32_e32 v42, v15 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_mov_b32_e32 v43, v14 -; GFX10-NEXT: v_mov_b32_e32 v44, v13 -; GFX10-NEXT: v_mov_b32_e32 v45, v12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: v_mov_b32_e32 v41, v15 +; GFX10-NEXT: v_writelane_b32 v45, s30, 0 +; GFX10-NEXT: v_mov_b32_e32 v42, v14 +; GFX10-NEXT: v_mov_b32_e32 v43, v13 +; GFX10-NEXT: v_mov_b32_e32 v44, v12 +; GFX10-NEXT: v_writelane_b32 v45, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX10-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_clause 0x4 -; GFX10-NEXT: buffer_load_dword v45, 
off, s[0:3], s33 -; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:4 -; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 -; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 -; GFX10-NEXT: v_readlane_b32 s31, v40, 1 -; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s4, v40, 2 +; GFX10-NEXT: buffer_load_dword v44, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 +; GFX10-NEXT: v_readlane_b32 s31, v45, 1 +; GFX10-NEXT: v_readlane_b32 s30, v45, 0 +; GFX10-NEXT: v_readlane_b32 s4, v45, 2 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -304,42 +304,42 @@ define <4 x float> @call_preserved_vgpr_tuple8(<8 x i32> %rsrc, <4 x i32> %samp, ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 -; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:12 -; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:8 -; GFX11-NEXT: scratch_store_b32 off, v44, s33 offset:4 -; GFX11-NEXT: scratch_store_b32 off, v45, s33 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v44, s33 ; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v45, s0, 2 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 +; GFX11-NEXT: v_dual_mov_b32 v40, v16 :: v_dual_mov_b32 v41, v15 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13 -; GFX11-NEXT: v_mov_b32_e32 v45, v12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_writelane_b32 v45, s30, 0 +; GFX11-NEXT: v_dual_mov_b32 v42, v14 :: v_dual_mov_b32 v43, v13 +; GFX11-NEXT: v_mov_b32_e32 v44, v12 +; GFX11-NEXT: v_writelane_b32 v45, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v45, v44, v43, v42, v41], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX11-NEXT: image_gather4_c_b_cl v[0:3], [v44, v43, v42, v41, v40], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_clause 0x4 -; GFX11-NEXT: scratch_load_b32 v45, off, s33 -; GFX11-NEXT: 
scratch_load_b32 v44, off, s33 offset:4 -; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:8 -; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:12 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 -; GFX11-NEXT: v_readlane_b32 s31, v40, 1 -; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: scratch_load_b32 v44, off, s33 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 +; GFX11-NEXT: v_readlane_b32 s31, v45, 1 +; GFX11-NEXT: v_readlane_b32 s30, v45, 0 +; GFX11-NEXT: v_readlane_b32 s0, v45, 2 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll index ab7d3ca0ab425..e79cb66dcd776 100644 --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -371,13 +371,13 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b32 s48, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x400 -; GFX9-O0-NEXT: v_writelane_b32 v2, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v2, s31, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s40, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 @@ -393,9 +393,9 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b32 s38, s43 ; GFX9-O0-NEXT: s_mov_b32 s39, s42 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, s34 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[40:41], -1 ; GFX9-O0-NEXT: s_getpc_b64 s[42:43] @@ -405,18 +405,18 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O0-NEXT: s_mov_b64 s[44:45], s[0:1] ; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[44:45] ; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[46:47] -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[42:43] ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v3 +; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[40:41] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v2, 0 +; 
GFX9-O0-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-O0-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 @@ -430,33 +430,33 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg ; GFX9-O3-NEXT: s_mov_b32 s38, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v2, s30, 0 +; GFX9-O3-NEXT: v_writelane_b32 v3, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x400 -; GFX9-O3-NEXT: v_writelane_b32 v2, s31, 1 -; GFX9-O3-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O3-NEXT: v_writelane_b32 v3, s31, 1 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-O3-NEXT: s_not_b64 exec, exec -; GFX9-O3-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-O3-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v3 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called@rel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called@rel32@hi+12 ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v3 +; GFX9-O3-NEXT: v_add_u32_e32 v1, v1, v2 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v1 ; GFX9-O3-NEXT: buffer_store_dword v0, off, s[4:7], 0 offset:4 -; GFX9-O3-NEXT: v_readlane_b32 s31, v2, 1 -; GFX9-O3-NEXT: v_readlane_b32 s30, v2, 0 +; GFX9-O3-NEXT: v_readlane_b32 s31, v3, 1 +; GFX9-O3-NEXT: v_readlane_b32 s30, v3, 0 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O3-NEXT: s_addk_i32 s32, 0xfc00 @@ -562,12 +562,12 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b32 s48, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) @@ -581,8 +581,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 ; GFX9-O0-NEXT: ; implicit-def: $vgpr0 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v8, s30, 0 -; GFX9-O0-NEXT: v_writelane_b32 v8, s31, 1 +; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 +; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 ; GFX9-O0-NEXT: s_mov_b32 s38, s6 ; GFX9-O0-NEXT: s_mov_b32 s36, s4 @@ -606,11 +606,11 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b64 s[36:37], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, s36 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s37 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s36 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 @@ -618,10 +618,10 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 -; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s34, v[9:10] +; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s34, v[8:9] ; GFX9-O0-NEXT: s_getpc_b64 s[34:35] ; GFX9-O0-NEXT: s_add_u32 s34, s34, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O0-NEXT: s_addc_u32 s35, s35, strict_wwm_called_i64@gotpcrel32@hi+12 @@ -651,8 +651,8 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -660,15 +660,15 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: v_readlane_b32 s31, v8, 1 -; GFX9-O0-NEXT: v_readlane_b32 s30, v8, 0 +; GFX9-O0-NEXT: 
v_readlane_b32 s31, v10, 1 +; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 ; GFX9-O0-NEXT: ; kill: killed $vgpr0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload @@ -689,50 +689,50 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i ; GFX9-O3-NEXT: s_mov_b32 s40, s33 ; GFX9-O3-NEXT: s_mov_b32 s33, s32 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) -; GFX9-O3-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O3-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-O3-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_waitcnt vmcnt(0) ; GFX9-O3-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_writelane_b32 v6, s30, 0 +; GFX9-O3-NEXT: v_writelane_b32 v8, s30, 0 ; GFX9-O3-NEXT: s_addk_i32 s32, 0x800 -; GFX9-O3-NEXT: v_writelane_b32 v6, s31, 1 +; GFX9-O3-NEXT: v_writelane_b32 v8, s31, 1 ; GFX9-O3-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O3-NEXT: s_getpc_b64 s[36:37] ; GFX9-O3-NEXT: s_add_u32 s36, s36, strict_wwm_called_i64@gotpcrel32@lo+4 ; GFX9-O3-NEXT: s_addc_u32 s37, s37, strict_wwm_called_i64@gotpcrel32@hi+12 ; GFX9-O3-NEXT: s_load_dwordx2 s[36:37], s[36:37], 0x0 ; GFX9-O3-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O3-NEXT: v_mov_b32_e32 v7, s8 -; GFX9-O3-NEXT: v_mov_b32_e32 v8, s9 +; GFX9-O3-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-O3-NEXT: v_mov_b32_e32 v7, s9 ; GFX9-O3-NEXT: s_not_b64 exec, exec +; GFX9-O3-NEXT: v_mov_b32_e32 v6, 0 ; GFX9-O3-NEXT: v_mov_b32_e32 v7, 0 -; GFX9-O3-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: s_or_saveexec_b64 s[38:39], -1 -; GFX9-O3-NEXT: v_mov_b32_e32 v0, v7 -; GFX9-O3-NEXT: v_mov_b32_e32 v1, v8 +; GFX9-O3-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-O3-NEXT: v_mov_b32_e32 v1, v7 ; GFX9-O3-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O3-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX9-O3-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O3-NEXT: v_mov_b32_e32 v3, v1 -; 
GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v7 -; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v8, vcc +; GFX9-O3-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 +; GFX9-O3-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v7, vcc ; GFX9-O3-NEXT: s_mov_b64 exec, s[38:39] ; GFX9-O3-NEXT: v_mov_b32_e32 v0, v2 ; GFX9-O3-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O3-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 offset:4 -; GFX9-O3-NEXT: v_readlane_b32 s31, v6, 1 -; GFX9-O3-NEXT: v_readlane_b32 s30, v6, 0 +; GFX9-O3-NEXT: v_readlane_b32 s31, v8, 1 +; GFX9-O3-NEXT: v_readlane_b32 s30, v8, 0 ; GFX9-O3-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v8, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O3-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-O3-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload @@ -922,64 +922,64 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v36, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v37, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v38, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) +; GFX9-O0-NEXT: buffer_store_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], 
s32 offset:52 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_writelane_b32 v32, s64, 0 -; GFX9-O0-NEXT: v_writelane_b32 v32, s65, 1 -; GFX9-O0-NEXT: v_writelane_b32 v32, s66, 2 -; GFX9-O0-NEXT: v_writelane_b32 v32, s67, 3 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_writelane_b32 v47, s64, 0 +; GFX9-O0-NEXT: v_writelane_b32 v47, s65, 1 +; GFX9-O0-NEXT: v_writelane_b32 v47, s66, 2 +; GFX9-O0-NEXT: v_writelane_b32 v47, s67, 3 +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: 
buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 @@ -987,147 +987,147 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-O0-NEXT: v_mov_b32_e32 v43, s5 +; GFX9-O0-NEXT: v_mov_b32_e32 v42, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 
offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v47, s25 -; GFX9-O0-NEXT: v_mov_b32_e32 v46, s26 -; GFX9-O0-NEXT: v_mov_b32_e32 v45, s27 -; GFX9-O0-NEXT: v_mov_b32_e32 v44, s28 -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v46, s25 +; GFX9-O0-NEXT: v_mov_b32_e32 v45, s26 +; GFX9-O0-NEXT: v_mov_b32_e32 v44, s27 +; GFX9-O0-NEXT: v_mov_b32_e32 v43, s28 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43 -; GFX9-O0-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43 -; GFX9-O0-NEXT: 
buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v21, v47 -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v22, v46 -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v23, v45 -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v24, v44 -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v21, v46 +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v22, v45 +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v23, v44 +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v24, v43 +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) -; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v25, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v26, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: v_mov_b32_e32 v27, v47 -; GFX9-O0-NEXT: v_mov_b32_e32 v28, v46 -; GFX9-O0-NEXT: v_mov_b32_e32 v29, v45 -; GFX9-O0-NEXT: v_mov_b32_e32 v30, v44 -; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr43 killed $exec -; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 
offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v26, v42 +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_mov_b32_e32 v27, v46 +; GFX9-O0-NEXT: v_mov_b32_e32 v28, v45 +; GFX9-O0-NEXT: v_mov_b32_e32 v29, v44 +; GFX9-O0-NEXT: v_mov_b32_e32 v30, v43 +; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr42 killed $exec +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -1157,55 +1157,55 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: ; implicit-def: $sgpr34_sgpr35 ; GFX9-O0-NEXT: ; implicit-def: 
$sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 ; GFX9-O0-NEXT: s_mov_b64 s[34:35], 0 -; GFX9-O0-NEXT: v_mov_b32_e32 v33, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v34, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, v11 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v33, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v34, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v32, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v33, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v35, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v36, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v35, v9 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v35, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v36, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v34, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v35, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v37, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v38, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v36, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v37, v7 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v37, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v38, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v36, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v37, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v39, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v40, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v38, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v39, v5 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v39, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v40, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v38, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v39, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v41, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v42, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v40, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v41, v3 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v41, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v42, s35 +; GFX9-O0-NEXT: v_mov_b32_e32 v40, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v41, s35 ; GFX9-O0-NEXT: s_not_b64 exec, exec -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v34 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v33 +; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v32 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v36 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v35 +; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:12 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v34 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:8 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v38 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v37 +; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:20 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v36 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:16 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40 -; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v39 +; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:28 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v38 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:24 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v42 -; 
GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v41 +; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:36 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v40 ; GFX9-O0-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen offset:32 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s5 @@ -1233,20 +1233,19 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: v_mov_b32_e32 v23, s27 ; GFX9-O0-NEXT: v_mov_b32_e32 v24, s28 ; GFX9-O0-NEXT: v_mov_b32_e32 v25, s29 -; GFX9-O0-NEXT: v_readlane_b32 s67, v32, 3 -; GFX9-O0-NEXT: v_readlane_b32 s66, v32, 2 -; GFX9-O0-NEXT: v_readlane_b32 s65, v32, 1 -; GFX9-O0-NEXT: v_readlane_b32 s64, v32, 0 -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s67, v47, 3 +; GFX9-O0-NEXT: v_readlane_b32 s66, v47, 2 +; GFX9-O0-NEXT: v_readlane_b32 s65, v47, 1 +; GFX9-O0-NEXT: v_readlane_b32 s64, v47, 0 +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v35, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload @@ -1256,7 +1255,8 @@ define amdgpu_gfx <32 x i32> @strict_wwm_callee_saves(<32 x i32> inreg %keep, pt ; GFX9-O0-NEXT: buffer_load_dword v39, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, -1 +; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] From 3dea0aa8f4607888d0c32cd7a691d8090b1b73c7 Mon Sep 17 00:00:00 2001 
From: Craig Topper
Date: Tue, 23 Jan 2024 17:57:34 -0800
Subject: [PATCH 723/843] [LSR] Fix incorrect comment. NFC (#79207)

---
 llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 9346da4384f9e..7ebc5da8b25a5 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -7017,7 +7017,7 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
   assert(Expander.isSafeToExpand(TermValueS) &&
          "Terminating value was checked safe in canFoldTerminatingCondition");
 
-  // Create new terminating value at loop header
+  // Create new terminating value at loop preheader
   Value *TermValue = Expander.expandCodeFor(TermValueS, ToHelpFold->getType(),
                                             LoopPreheader->getTerminator());

From ecde13b1a861696dec5c4ccae792abe25df07db9 Mon Sep 17 00:00:00 2001
From: Nico Weber
Date: Tue, 23 Jan 2024 21:06:02 -0500
Subject: [PATCH 724/843] [gn build] port 7e50f006f7f6

---
 llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn    | 3 +++
 llvm/utils/gn/secondary/llvm/tools/llc/BUILD.gn         | 7 ++++++-
 llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn | 1 -
 3 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
index 12d45f1af1db8..78a9d20812ef9 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/X86/BUILD.gn
@@ -63,12 +63,14 @@ static_library("LLVMX86CodeGen") {
     "//llvm/lib/CodeGen/GlobalISel",
     "//llvm/lib/CodeGen/SelectionDAG",
     "//llvm/lib/IR",
+    "//llvm/lib/IRPrinter",
     "//llvm/lib/MC",
     "//llvm/lib/Support",
     "//llvm/lib/Target",
     "//llvm/lib/TargetParser",
     "//llvm/lib/Transforms/CFGuard",
     "//llvm/lib/Transforms/Instrumentation",
+    "//llvm/lib/Transforms/Scalar",
   ]
   include_dirs = [ "." ]
   sources = [
@@ -83,6 +85,7 @@ static_library("LLVMX86CodeGen") {
     "X86CallFrameOptimization.cpp",
     "X86CallingConv.cpp",
     "X86CmovConversion.cpp",
+    "X86CodeGenPassBuilder.cpp",
     "X86CompressEVEX.cpp",
     "X86DiscriminateMemOps.cpp",
     "X86DomainReassignment.cpp",
diff --git a/llvm/utils/gn/secondary/llvm/tools/llc/BUILD.gn b/llvm/utils/gn/secondary/llvm/tools/llc/BUILD.gn
index 5cb3e69d9f106..a968760e1c2a3 100644
--- a/llvm/utils/gn/secondary/llvm/tools/llc/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/tools/llc/BUILD.gn
@@ -7,7 +7,9 @@ executable("llc") {
     "//llvm/lib/CodeGen/SelectionDAG",
     "//llvm/lib/IR",
     "//llvm/lib/IRReader",
+    "//llvm/lib/IRPrinter",
     "//llvm/lib/MC",
+    "//llvm/lib/Passes",
     "//llvm/lib/Support",
     "//llvm/lib/Target",
     "//llvm/lib/Target:TargetsToBuild",
@@ -16,7 +18,10 @@ executable("llc") {
     "//llvm/lib/Transforms/Utils",
     "//llvm/lib/Transforms/Vectorize",
   ]
-  sources = [ "llc.cpp" ]
+  sources = [
+    "NewPMDriver.cpp",
+    "llc.cpp",
+  ]
 
   # Support plugins.
   # FIXME: Disable dead stripping once other binaries are dead-stripped.
diff --git a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn
index 0d406a4178c23..8aff3e4d230e4 100644
--- a/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/unittests/CodeGen/BUILD.gn
@@ -24,7 +24,6 @@ unittest("CodeGenTests") {
     "AllocationOrderTest.cpp",
    "AsmPrinterDwarfTest.cpp",
    "CCStateTest.cpp",
-    "CodeGenPassBuilderTest.cpp",
    "DIEHashTest.cpp",
    "DIETest.cpp",
    "DwarfStringPoolEntryRefTest.cpp",

From f0c387038854d61a632520a4073d1b6ebf4997ed Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Wed, 24 Jan 2024 10:22:35 +0800
Subject: [PATCH 725/843] [Modules] [HeaderSearch] Don't reenter headers if it
 is pragma once (#76119)

Close https://github.com/llvm/llvm-project/issues/73023

The direct issue of https://github.com/llvm/llvm-project/issues/73023 is
that we entered a header which is marked as pragma once, because the
compiler thought re-entry was OK as long as a controlling macro is
present. That doesn't make sense: it should be sufficient to skip the
header once we have seen the '#pragma once'. From the context, the
workaround looks primarily aimed at Objective-C, so we might need
reviewers familiar with Objective-C.

---
 clang/lib/Lex/HeaderSearch.cpp | 78 +++++++++++++++++-----------------
 clang/test/Modules/pr73023.cpp | 19 +++++++++
 2 files changed, 58 insertions(+), 39 deletions(-)
 create mode 100644 clang/test/Modules/pr73023.cpp

diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp
index 0f1090187734f..dfa974e9a67ed 100644
--- a/clang/lib/Lex/HeaderSearch.cpp
+++ b/clang/lib/Lex/HeaderSearch.cpp
@@ -1408,57 +1408,57 @@ bool HeaderSearch::ShouldEnterIncludeFile(Preprocessor &PP,
   // Get information about this file.
   HeaderFileInfo &FileInfo = getFileInfo(File);
 
-  // FIXME: this is a workaround for the lack of proper modules-aware support
-  // for #import / #pragma once
-  auto TryEnterImported = [&]() -> bool {
-    if (!ModulesEnabled)
-      return false;
-    // Ensure FileInfo bits are up to date.
-    ModMap.resolveHeaderDirectives(File);
-    // Modules with builtins are special; multiple modules use builtins as
-    // modular headers, example:
-    //
-    //    module stddef { header "stddef.h" export * }
-    //
-    // After module map parsing, this expands to:
-    //
-    //    module stddef {
-    //      header "/path_to_builtin_dirs/stddef.h"
-    //      textual "stddef.h"
-    //    }
-    //
-    // It's common that libc++ and system modules will both define such
-    // submodules. Make sure cached results for a builtin header won't
-    // prevent other builtin modules from potentially entering the builtin
-    // header. Note that builtins are header guarded and the decision to
-    // actually enter them is postponed to the controlling macros logic below.
-    bool TryEnterHdr = false;
-    if (FileInfo.isCompilingModuleHeader && FileInfo.isModuleHeader)
-      TryEnterHdr = ModMap.isBuiltinHeader(File);
-
-    // Textual headers can be #imported from different modules. Since ObjC
-    // headers find in the wild might rely only on #import and do not contain
-    // controlling macros, be conservative and only try to enter textual headers
-    // if such macro is present.
-    if (!FileInfo.isModuleHeader &&
-        FileInfo.getControllingMacro(ExternalLookup))
-      TryEnterHdr = true;
-    return TryEnterHdr;
-  };
-
   // If this is a #import directive, check that we have not already imported
   // this header.
   if (isImport) {
     // If this has already been imported, don't import it again.
     FileInfo.isImport = true;
+
+    // FIXME: this is a workaround for the lack of proper modules-aware support
+    // for #import / #pragma once
+    auto TryEnterImported = [&]() -> bool {
+      if (!ModulesEnabled)
+        return false;
+      // Ensure FileInfo bits are up to date.
+      ModMap.resolveHeaderDirectives(File);
+      // Modules with builtins are special; multiple modules use builtins as
+      // modular headers, example:
+      //
+      //    module stddef { header "stddef.h" export * }
+      //
+      // After module map parsing, this expands to:
+      //
+      //    module stddef {
+      //      header "/path_to_builtin_dirs/stddef.h"
+      //      textual "stddef.h"
+      //    }
+      //
+      // It's common that libc++ and system modules will both define such
+      // submodules. Make sure cached results for a builtin header won't
+      // prevent other builtin modules from potentially entering the builtin
+      // header. Note that builtins are header guarded and the decision to
+      // actually enter them is postponed to the controlling macros logic below.
+      bool TryEnterHdr = false;
+      if (FileInfo.isCompilingModuleHeader && FileInfo.isModuleHeader)
+        TryEnterHdr = ModMap.isBuiltinHeader(File);
+
+      // Textual headers can be #imported from different modules. Since ObjC
+      // headers find in the wild might rely only on #import and do not contain
+      // controlling macros, be conservative and only try to enter textual
+      // headers if such macro is present.
+      if (!FileInfo.isModuleHeader &&
+          FileInfo.getControllingMacro(ExternalLookup))
+        TryEnterHdr = true;
+      return TryEnterHdr;
+    };
+
     // Has this already been #import'ed or #include'd?
     if (PP.alreadyIncluded(File) && !TryEnterImported())
       return false;
   } else {
     // Otherwise, if this is a #include of a file that was previously #import'd
     // or if this is the second #include of a #pragma once file, ignore it.
-    if ((FileInfo.isPragmaOnce || FileInfo.isImport) && !TryEnterImported())
+    if (FileInfo.isPragmaOnce || FileInfo.isImport)
       return false;
   }
 
diff --git a/clang/test/Modules/pr73023.cpp b/clang/test/Modules/pr73023.cpp
new file mode 100644
index 0000000000000..19d7df6ce8a4c
--- /dev/null
+++ b/clang/test/Modules/pr73023.cpp
@@ -0,0 +1,19 @@
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang_cc1 -std=c++20 %t/test.cpp -fsyntax-only -verify
+
+//--- i.h
+#ifndef I_H
+#pragma once
+struct S{};
+#endif
+
+//--- test.cpp
+// expected-no-diagnostics
+#include "i.h"
+
+int foo() {
+  return sizeof(S);
+}
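A minimal sketch of the failure mode addressed above (illustrative only: guarded.h and use.cpp are hypothetical files, not part of the patch). The guard macro GUARDED_H is tested but never defined, so only the '#pragma once' protects the header; before this fix, with modules enabled, the guard could be treated as a controlling macro and the header entered a second time, redefining S:

// guarded.h -- the include guard is opened but never defined,
// so '#pragma once' is the only protection against re-entry
#ifndef GUARDED_H
#pragma once
struct S {};
#endif

// use.cpp -- before the fix this could be diagnosed as a redefinition
// of 'S'; with the fix the second include is skipped, as promised
#include "guarded.h"
#include "guarded.h"

int use() { return sizeof(S); }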
From 7bda0ce15a2874ad74fb1a451a174084094ccc34 Mon Sep 17 00:00:00 2001
From: paperchalice
Date: Wed, 24 Jan 2024 10:40:11 +0800
Subject: [PATCH 726/843] [llc] Remove C backend support (#79237)

The C backend was removed in LLVM 3.1.

---
 llvm/tools/llc/llc.cpp | 14 +++-----------
 1 file changed, 3 insertions(+), 11 deletions(-)

diff --git a/llvm/tools/llc/llc.cpp b/llvm/tools/llc/llc.cpp
index d76d89eae3b18..3e2567c441df5 100644
--- a/llvm/tools/llc/llc.cpp
+++ b/llvm/tools/llc/llc.cpp
@@ -7,8 +7,8 @@
 //===----------------------------------------------------------------------===//
 //
 // This is the llc code generator driver. It provides a convenient
-// command-line interface for generating native assembly-language code
-// or C code, given LLVM bitcode.
+// command-line interface for generating an assembly file or a relocatable file,
+// given LLVM bitcode.
 //
 //===----------------------------------------------------------------------===//
 
@@ -274,15 +274,7 @@ static std::unique_ptr GetOutputStream(const char *TargetName,
 
   switch (codegen::getFileType()) {
   case CodeGenFileType::AssemblyFile:
-    if (TargetName[0] == 'c') {
-      if (TargetName[1] == 0)
-        OutputFilename += ".cbe.c";
-      else if (TargetName[1] == 'p' && TargetName[2] == 'p')
-        OutputFilename += ".cpp";
-      else
-        OutputFilename += ".s";
-    } else
-      OutputFilename += ".s";
+    OutputFilename += ".s";
     break;
   case CodeGenFileType::ObjectFile:
     if (OS == Triple::Win32)
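With the C-backend special case gone, the assembly path always derives the default output name by appending ".s" to the input's base name. A quick sketch of the resulting behavior (hypothetical file names, inferred from the GetOutputStream change above):

$ llc foo.bc                 # CodeGenFileType::AssemblyFile, writes foo.s
$ llc -filetype=obj foo.bc   # object-file naming is unchanged, writes foo.o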
From 63f742c15f01a25c60f0090a3aceb15bb8985e5e Mon Sep 17 00:00:00 2001
From: Michael Maitland
Date: Tue, 23 Jan 2024 19:45:24 -0700
Subject: [PATCH 727/843] [RISCV] Add sifive-p670 processor (#79015)

This is an out-of-order (OOO) core that has a vector unit. For more
information see https://www.sifive.com/cores/performance-p650-670.

A scheduler model and other tuning will come in separate patches.

---
 clang/test/Driver/riscv-cpus.c            | 49 ++++++++++++++++++++++-
 clang/test/Misc/target-invalid-cpu-note.c |  4 +-
 llvm/docs/ReleaseNotes.rst                |  1 +
 llvm/lib/Target/RISCV/RISCVProcessors.td  | 37 +++++++++++++++++
 4 files changed, 88 insertions(+), 3 deletions(-)

diff --git a/clang/test/Driver/riscv-cpus.c b/clang/test/Driver/riscv-cpus.c
index 015df83e77800..ff2bd6f7c8ba3 100644
--- a/clang/test/Driver/riscv-cpus.c
+++ b/clang/test/Driver/riscv-cpus.c
@@ -247,7 +247,54 @@
 // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zbb"
 // MCPU-SIFIVE-P450-SAME: "-target-feature" "+zbs"
 // MCPU-SIFIVE-P450-SAME: "-target-abi" "lp64d"
-//
+
+// RUN: %clang -target riscv64 -### -c %s 2>&1 -mcpu=sifive-p670 | FileCheck -check-prefix=MCPU-SIFIVE-P670 %s
+// MCPU-SIFIVE-P670: "-target-cpu" "sifive-p670"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+m"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+a"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+f"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+d"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+c"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+v"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zic64b"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zicbom"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zicbop"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zicboz"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+ziccamoa"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+ziccif"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zicclsm"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+ziccrse"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zicsr"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zifencei"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zihintntl"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zihintpause"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zihpm"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+za64rs"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zfhmin"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zba"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zbb"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zbs"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvbb"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvbc"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zve32f"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zve32x"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zve64d"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zve64f"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zve64x"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvkg"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvkn"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvknc"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvkned"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvkng"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvknhb"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvks"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvksc"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvksed"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvksg"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvksh"
+// MCPU-SIFIVE-P670-SAME: "-target-feature" "+zvkt"
+// MCPU-SIFIVE-P670-SAME: "-target-abi" "lp64d"
+
 // Check failed cases
 // RUN: not %clang --target=riscv32 -### -c %s 2>&1 -mcpu=generic-rv321 | FileCheck -check-prefix=FAIL-MCPU-NAME %s
diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c
index 48e9f05d9b03d..84aed5c9c36fe 100644
--- a/clang/test/Misc/target-invalid-cpu-note.c
+++ b/clang/test/Misc/target-invalid-cpu-note.c
@@ -85,7 +85,7 @@
 // RUN: not %clang_cc1 -triple riscv64 -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix RISCV64
 // RISCV64: error: unknown target CPU 'not-a-cpu'
-// RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-p450, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280, veyron-v1, xiangshan-nanhu{{$}}
+// RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-p450, sifive-p670, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280, veyron-v1, xiangshan-nanhu{{$}}
 
 // RUN: not %clang_cc1 -triple riscv32 -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE-RISCV32
 // TUNE-RISCV32: error: unknown target CPU 'not-a-cpu'
@@ -93,4 +93,4 @@
 // RUN: not %clang_cc1 -triple riscv64 -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE-RISCV64
 // TUNE-RISCV64: error: unknown target CPU 'not-a-cpu'
-// TUNE-RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-p450, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280, veyron-v1, xiangshan-nanhu, generic, rocket, sifive-7-series{{$}}
+// TUNE-RISCV64-NEXT: note: valid target CPU values are: generic-rv64, rocket-rv64, sifive-p450, sifive-p670, sifive-s21, sifive-s51, sifive-s54, sifive-s76, sifive-u54, sifive-u74, sifive-x280, veyron-v1, xiangshan-nanhu, generic, rocket, sifive-7-series{{$}}
diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst
index 4ef28e277228c..7b6a3f10d6377 100644
--- a/llvm/docs/ReleaseNotes.rst
+++ b/llvm/docs/ReleaseNotes.rst
@@ -180,6 +180,7 @@ Changes to the RISC-V Backend
   and Zic64b extensions which were introduced as a part of the RISC-V Profiles
   specification.
 * The Smepmp 1.0 extension is now supported.
+* ``-mcpu=sifive-p670`` was added.
 
 Changes to the WebAssembly Backend
 ----------------------------------
diff --git a/llvm/lib/Target/RISCV/RISCVProcessors.td b/llvm/lib/Target/RISCV/RISCVProcessors.td
index af621de9f802a..03ca505d100df 100644
--- a/llvm/lib/Target/RISCV/RISCVProcessors.td
+++ b/llvm/lib/Target/RISCV/RISCVProcessors.td
@@ -245,6 +245,43 @@ def SIFIVE_P450 : RISCVProcessorModel<"sifive-p450", SiFiveP400Model,
                                        TuneLUIADDIFusion,
                                        TuneAUIPCADDIFusion]>;
 
+def SIFIVE_P670 : RISCVProcessorModel<"sifive-p670", NoSchedModel,
+                                      [Feature64Bit,
+                                       FeatureStdExtZifencei,
+                                       FeatureStdExtM,
+                                       FeatureStdExtA,
+                                       FeatureStdExtF,
+                                       FeatureStdExtD,
+                                       FeatureStdExtC,
+                                       FeatureStdExtZa64rs,
+                                       FeatureStdExtZic64b,
+                                       FeatureStdExtZicbop,
+                                       FeatureStdExtZicbom,
+                                       FeatureStdExtZicboz,
+                                       FeatureStdExtZiccamoa,
+                                       FeatureStdExtZiccif,
+                                       FeatureStdExtZicclsm,
+                                       FeatureStdExtZiccrse,
+                                       FeatureStdExtZihintntl,
+                                       FeatureStdExtZihintpause,
+                                       FeatureStdExtZihpm,
+                                       FeatureStdExtZba,
+                                       FeatureStdExtZbb,
+                                       FeatureStdExtZbs,
+                                       FeatureStdExtZfhmin,
+                                       FeatureStdExtV,
+                                       FeatureStdExtZvl128b,
+                                       FeatureStdExtZvbb,
+                                       FeatureStdExtZvknc,
+                                       FeatureStdExtZvkng,
+                                       FeatureStdExtZvksc,
+                                       FeatureStdExtZvksg,
+                                       FeatureFastUnalignedAccess],
+                                      [TuneNoDefaultUnroll,
+                                       TuneConditionalCompressedMoveFusion,
+                                       TuneLUIADDIFusion,
+                                       TuneAUIPCADDIFusion]>;
+
 def SYNTACORE_SCR1_BASE : RISCVProcessorModel<"syntacore-scr1-base",
                                               SyntacoreSCR1Model,
                                               [Feature32Bit,
From 93248729cfae82a5ca2323d4a8e15aa3b9b9c707 Mon Sep 17 00:00:00 2001
From: Wang Pengcheng
Date: Wed, 24 Jan 2024 10:49:14 +0800
Subject: [PATCH 728/843] [RISCV][MC] Split tests for A into Zaamo and Zalrsc
 parts

So that we don't duplicate tests in a later patch.

Reviewers: topperc, dtcxzyw, asb

Reviewed By: asb

Pull Request: https://github.com/llvm/llvm-project/pull/79111
---
 .../{rv32a-invalid.s => rv32zaamo-invalid.s}  |  7 ----
 .../{rv32a-valid.s => rv32zaamo-valid.s}      | 26 ------------
 llvm/test/MC/RISCV/rv32zalrsc-invalid.s       |  7 ++++
 llvm/test/MC/RISCV/rv32zalrsc-valid.s         | 36 ++++++++++++++++
 .../{rv64a-invalid.s => rv64zaamo-invalid.s}  |  4 --
 .../{rv64a-valid.s => rv64zaamo-valid.s}      | 34 ---------------
 llvm/test/MC/RISCV/rv64zalrsc-invalid.s       |  7 ++++
 llvm/test/MC/RISCV/rv64zalrsc-valid.s         | 42 +++++++++++++++++++
 8 files changed, 92 insertions(+), 71 deletions(-)
 rename llvm/test/MC/RISCV/{rv32a-invalid.s => rv32zaamo-invalid.s} (71%)
 rename llvm/test/MC/RISCV/{rv32a-valid.s => rv32zaamo-valid.s} (85%)
 create mode 100644 llvm/test/MC/RISCV/rv32zalrsc-invalid.s
 create mode 100644 llvm/test/MC/RISCV/rv32zalrsc-valid.s
 rename llvm/test/MC/RISCV/{rv64a-invalid.s => rv64zaamo-invalid.s} (78%)
 rename llvm/test/MC/RISCV/{rv64a-valid.s => rv64zaamo-valid.s} (83%)
 create mode 100644 llvm/test/MC/RISCV/rv64zalrsc-invalid.s
 create mode 100644 llvm/test/MC/RISCV/rv64zalrsc-valid.s

diff --git a/llvm/test/MC/RISCV/rv32a-invalid.s b/llvm/test/MC/RISCV/rv32zaamo-invalid.s
similarity index 71%
rename from llvm/test/MC/RISCV/rv32a-invalid.s
rename to llvm/test/MC/RISCV/rv32zaamo-invalid.s
index 34d51fc30ca21..f6183fbc8a1f1 100644
--- a/llvm/test/MC/RISCV/rv32a-invalid.s
+++ b/llvm/test/MC/RISCV/rv32zaamo-invalid.s
@@ -4,15 +4,8 @@
 amoswap.w a1, a2, a3 # CHECK: :[[@LINE]]:19: error: expected '(' or optional integer offset
 amomin.w a1, a2, 1 # CHECK: :[[@LINE]]:20: error: expected '(' after optional integer offset
 amomin.w a1, a2, 1(a3) # CHECK: :[[@LINE]]:18: error: optional integer offset must be 0
-lr.w a4, a5 # CHECK: :[[@LINE]]:10: error: expected '(' or optional integer offset
 
 # Only .aq, .rl, and .aqrl suffixes are valid
amoxor.w.rlqa a2, a3, (a4) # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic amoor.w.aq.rl a4, a5, (a6) # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic amoor.w. a4, a5, (a6) # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic - -# lr only takes two operands -lr.w s0, (s1), s2 # CHECK: :[[@LINE]]:16: error: invalid operand for instruction - -# Note: errors for use of RV64A instructions for RV32 are checked in -# rv64a-valid.s diff --git a/llvm/test/MC/RISCV/rv32a-valid.s b/llvm/test/MC/RISCV/rv32zaamo-valid.s similarity index 85% rename from llvm/test/MC/RISCV/rv32a-valid.s rename to llvm/test/MC/RISCV/rv32zaamo-valid.s index 1f66680c27114..ea1ae79558443 100644 --- a/llvm/test/MC/RISCV/rv32a-valid.s +++ b/llvm/test/MC/RISCV/rv32zaamo-valid.s @@ -9,32 +9,6 @@ # RUN: | llvm-objdump --mattr=+a -M no-aliases -d -r - \ # RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s -# CHECK-ASM-AND-OBJ: lr.w t0, (t1) -# CHECK-ASM: encoding: [0xaf,0x22,0x03,0x10] -lr.w t0, (t1) -# CHECK-ASM-AND-OBJ: lr.w.aq t1, (t2) -# CHECK-ASM: encoding: [0x2f,0xa3,0x03,0x14] -lr.w.aq t1, (t2) -# CHECK-ASM-AND-OBJ: lr.w.rl t2, (t3) -# CHECK-ASM: encoding: [0xaf,0x23,0x0e,0x12] -lr.w.rl t2, (t3) -# CHECK-ASM-AND-OBJ: lr.w.aqrl t3, (t4) -# CHECK-ASM: encoding: [0x2f,0xae,0x0e,0x16] -lr.w.aqrl t3, (t4) - -# CHECK-ASM-AND-OBJ: sc.w t6, t5, (t4) -# CHECK-ASM: encoding: [0xaf,0xaf,0xee,0x19] -sc.w t6, t5, (t4) -# CHECK-ASM-AND-OBJ: sc.w.aq t5, t4, (t3) -# CHECK-ASM: encoding: [0x2f,0x2f,0xde,0x1d] -sc.w.aq t5, t4, (t3) -# CHECK-ASM-AND-OBJ: sc.w.rl t4, t3, (t2) -# CHECK-ASM: encoding: [0xaf,0xae,0xc3,0x1b] -sc.w.rl t4, t3, (t2) -# CHECK-ASM-AND-OBJ: sc.w.aqrl t3, t2, (t1) -# CHECK-ASM: encoding: [0x2f,0x2e,0x73,0x1e] -sc.w.aqrl t3, t2, (t1) - # CHECK-ASM-AND-OBJ: amoswap.w a4, ra, (s0) # CHECK-ASM: encoding: [0x2f,0x27,0x14,0x08] amoswap.w a4, ra, (s0) diff --git a/llvm/test/MC/RISCV/rv32zalrsc-invalid.s b/llvm/test/MC/RISCV/rv32zalrsc-invalid.s new file mode 100644 index 0000000000000..61cfc614b7c42 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32zalrsc-invalid.s @@ -0,0 +1,7 @@ +# RUN: not llvm-mc -triple riscv32 -mattr=+a < %s 2>&1 | FileCheck %s + +# Final operand must have parentheses +lr.w a4, a5 # CHECK: :[[@LINE]]:10: error: expected '(' or optional integer offset + +# lr only takes two operands +lr.w s0, (s1), s2 # CHECK: :[[@LINE]]:16: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv32zalrsc-valid.s b/llvm/test/MC/RISCV/rv32zalrsc-valid.s new file mode 100644 index 0000000000000..0d4881a4b45a7 --- /dev/null +++ b/llvm/test/MC/RISCV/rv32zalrsc-valid.s @@ -0,0 +1,36 @@ +# RUN: llvm-mc %s -triple=riscv32 -mattr=+a -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv32 -mattr=+a < %s \ +# RUN: | llvm-objdump --mattr=+a -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \ +# RUN: | llvm-objdump --mattr=+a -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s + +# CHECK-ASM-AND-OBJ: lr.w t0, (t1) +# CHECK-ASM: encoding: [0xaf,0x22,0x03,0x10] +lr.w t0, (t1) +# CHECK-ASM-AND-OBJ: lr.w.aq t1, (t2) +# CHECK-ASM: encoding: [0x2f,0xa3,0x03,0x14] +lr.w.aq t1, (t2) +# CHECK-ASM-AND-OBJ: lr.w.rl t2, (t3) +# CHECK-ASM: 
encoding: [0xaf,0x23,0x0e,0x12] +lr.w.rl t2, (t3) +# CHECK-ASM-AND-OBJ: lr.w.aqrl t3, (t4) +# CHECK-ASM: encoding: [0x2f,0xae,0x0e,0x16] +lr.w.aqrl t3, (t4) + +# CHECK-ASM-AND-OBJ: sc.w t6, t5, (t4) +# CHECK-ASM: encoding: [0xaf,0xaf,0xee,0x19] +sc.w t6, t5, (t4) +# CHECK-ASM-AND-OBJ: sc.w.aq t5, t4, (t3) +# CHECK-ASM: encoding: [0x2f,0x2f,0xde,0x1d] +sc.w.aq t5, t4, (t3) +# CHECK-ASM-AND-OBJ: sc.w.rl t4, t3, (t2) +# CHECK-ASM: encoding: [0xaf,0xae,0xc3,0x1b] +sc.w.rl t4, t3, (t2) +# CHECK-ASM-AND-OBJ: sc.w.aqrl t3, t2, (t1) +# CHECK-ASM: encoding: [0x2f,0x2e,0x73,0x1e] +sc.w.aqrl t3, t2, (t1) diff --git a/llvm/test/MC/RISCV/rv64a-invalid.s b/llvm/test/MC/RISCV/rv64zaamo-invalid.s similarity index 78% rename from llvm/test/MC/RISCV/rv64a-invalid.s rename to llvm/test/MC/RISCV/rv64zaamo-invalid.s index 2816f434e4706..70a4e557755ba 100644 --- a/llvm/test/MC/RISCV/rv64a-invalid.s +++ b/llvm/test/MC/RISCV/rv64zaamo-invalid.s @@ -4,12 +4,8 @@ amoswap.d a1, a2, a3 # CHECK: :[[@LINE]]:19: error: expected '(' or optional integer offset amomin.d a1, a2, 1 # CHECK: :[[@LINE]]:20: error: expected '(' after optional integer offset amomin.d a1, a2, 1(a3) # CHECK: :[[@LINE]]:18: error: optional integer offset must be 0 -lr.d a4, a5 # CHECK: :[[@LINE]]:10: error: expected '(' or optional integer offset # Only .aq, .rl, and .aqrl suffixes are valid amoxor.d.rlqa a2, a3, (a4) # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic amoor.d.aq.rl a4, a5, (a6) # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic amoor.d. a4, a5, (a6) # CHECK: :[[@LINE]]:1: error: unrecognized instruction mnemonic - -# lr only takes two operands -lr.d s0, (s1), s2 # CHECK: :[[@LINE]]:16: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv64a-valid.s b/llvm/test/MC/RISCV/rv64zaamo-valid.s similarity index 83% rename from llvm/test/MC/RISCV/rv64a-valid.s rename to llvm/test/MC/RISCV/rv64zaamo-valid.s index 3276b397f7194..73cdc55584341 100644 --- a/llvm/test/MC/RISCV/rv64a-valid.s +++ b/llvm/test/MC/RISCV/rv64zaamo-valid.s @@ -7,40 +7,6 @@ # RUN: not llvm-mc -triple riscv32 -mattr=+a < %s 2>&1 \ # RUN: | FileCheck -check-prefix=CHECK-RV32 %s -# CHECK-ASM-AND-OBJ: lr.d t0, (t1) -# CHECK-ASM: encoding: [0xaf,0x32,0x03,0x10] -# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} -lr.d t0, (t1) -# CHECK-ASM-AND-OBJ: lr.d.aq t1, (t2) -# CHECK-ASM: encoding: [0x2f,0xb3,0x03,0x14] -# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} -lr.d.aq t1, (t2) -# CHECK-ASM-AND-OBJ: lr.d.rl t2, (t3) -# CHECK-ASM: encoding: [0xaf,0x33,0x0e,0x12] -# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} -lr.d.rl t2, (t3) -# CHECK-ASM-AND-OBJ: lr.d.aqrl t3, (t4) -# CHECK-ASM: encoding: [0x2f,0xbe,0x0e,0x16] -# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} -lr.d.aqrl t3, (t4) - -# CHECK-ASM-AND-OBJ: sc.d t6, t5, (t4) -# CHECK-ASM: encoding: [0xaf,0xbf,0xee,0x19] -# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} -sc.d t6, t5, (t4) -# CHECK-ASM-AND-OBJ: sc.d.aq t5, t4, (t3) -# CHECK-ASM: encoding: [0x2f,0x3f,0xde,0x1d] -# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} -sc.d.aq t5, t4, (t3) -# CHECK-ASM-AND-OBJ: sc.d.rl t4, t3, (t2) -# CHECK-ASM: encoding: [0xaf,0xbe,0xc3,0x1b] -# CHECK-RV32: 
:[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} -sc.d.rl t4, t3, (t2) -# CHECK-ASM-AND-OBJ: sc.d.aqrl t3, t2, (t1) -# CHECK-ASM: encoding: [0x2f,0x3e,0x73,0x1e] -# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} -sc.d.aqrl t3, t2, (t1) - # CHECK-ASM-AND-OBJ: amoswap.d a4, ra, (s0) # CHECK-ASM: encoding: [0x2f,0x37,0x14,0x08] # CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} diff --git a/llvm/test/MC/RISCV/rv64zalrsc-invalid.s b/llvm/test/MC/RISCV/rv64zalrsc-invalid.s new file mode 100644 index 0000000000000..0be009725ed82 --- /dev/null +++ b/llvm/test/MC/RISCV/rv64zalrsc-invalid.s @@ -0,0 +1,7 @@ +# RUN: not llvm-mc -triple riscv64 -mattr=+a < %s 2>&1 | FileCheck %s + +# Final operand must have parentheses +lr.d a4, a5 # CHECK: :[[@LINE]]:10: error: expected '(' or optional integer offset + +# lr only takes two operands +lr.d s0, (s1), s2 # CHECK: :[[@LINE]]:16: error: invalid operand for instruction diff --git a/llvm/test/MC/RISCV/rv64zalrsc-valid.s b/llvm/test/MC/RISCV/rv64zalrsc-valid.s new file mode 100644 index 0000000000000..d4c87523b71c1 --- /dev/null +++ b/llvm/test/MC/RISCV/rv64zalrsc-valid.s @@ -0,0 +1,42 @@ +# RUN: llvm-mc %s -triple=riscv64 -mattr=+a -riscv-no-aliases -show-encoding \ +# RUN: | FileCheck -check-prefixes=CHECK-ASM,CHECK-ASM-AND-OBJ %s +# RUN: llvm-mc -filetype=obj -triple=riscv64 -mattr=+a < %s \ +# RUN: | llvm-objdump --mattr=+a -M no-aliases -d -r - \ +# RUN: | FileCheck --check-prefix=CHECK-ASM-AND-OBJ %s +# +# RUN: not llvm-mc -triple riscv32 -mattr=+a < %s 2>&1 \ +# RUN: | FileCheck -check-prefix=CHECK-RV32 %s + +# CHECK-ASM-AND-OBJ: lr.d t0, (t1) +# CHECK-ASM: encoding: [0xaf,0x32,0x03,0x10] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +lr.d t0, (t1) +# CHECK-ASM-AND-OBJ: lr.d.aq t1, (t2) +# CHECK-ASM: encoding: [0x2f,0xb3,0x03,0x14] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +lr.d.aq t1, (t2) +# CHECK-ASM-AND-OBJ: lr.d.rl t2, (t3) +# CHECK-ASM: encoding: [0xaf,0x33,0x0e,0x12] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +lr.d.rl t2, (t3) +# CHECK-ASM-AND-OBJ: lr.d.aqrl t3, (t4) +# CHECK-ASM: encoding: [0x2f,0xbe,0x0e,0x16] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +lr.d.aqrl t3, (t4) + +# CHECK-ASM-AND-OBJ: sc.d t6, t5, (t4) +# CHECK-ASM: encoding: [0xaf,0xbf,0xee,0x19] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +sc.d t6, t5, (t4) +# CHECK-ASM-AND-OBJ: sc.d.aq t5, t4, (t3) +# CHECK-ASM: encoding: [0x2f,0x3f,0xde,0x1d] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +sc.d.aq t5, t4, (t3) +# CHECK-ASM-AND-OBJ: sc.d.rl t4, t3, (t2) +# CHECK-ASM: encoding: [0xaf,0xbe,0xc3,0x1b] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +sc.d.rl t4, t3, (t2) +# CHECK-ASM-AND-OBJ: sc.d.aqrl t3, t2, (t1) +# CHECK-ASM: encoding: [0x2f,0x3e,0x73,0x1e] +# CHECK-RV32: :[[@LINE+1]]:1: error: instruction requires the following: RV64I Base Instruction Set{{$}} +sc.d.aqrl t3, t2, (t1) From 987087df90026605fc8d03ebda5a1cd31b71e609 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 23 Jan 2024 18:21:59 -0800 
Subject: [PATCH 729/843] Bump trunk version to 19.0.0git --- clang-tools-extra/docs/ReleaseNotes.rst | 455 -------- clang/docs/ReleaseNotes.rst | 1283 ---------------------- libcxx/include/__config | 2 +- lld/docs/ReleaseNotes.rst | 13 - llvm/CMakeLists.txt | 2 +- llvm/docs/ReleaseNotes.rst | 252 +---- llvm/utils/gn/secondary/llvm/version.gni | 2 +- llvm/utils/lit/lit/__init__.py | 2 +- openmp/docs/ReleaseNotes.rst | 6 +- pstl/docs/ReleaseNotes.rst | 6 +- 10 files changed, 11 insertions(+), 2012 deletions(-) diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst index 5758b5acbc0b5..fd2cba4e4f463 100644 --- a/clang-tools-extra/docs/ReleaseNotes.rst +++ b/clang-tools-extra/docs/ReleaseNotes.rst @@ -69,9 +69,6 @@ Code completion Code actions ^^^^^^^^^^^^ -- The extract variable tweak gained support for extracting lambda expressions to a variable. -- A new tweak was added for turning unscoped into scoped enums. - Signature help ^^^^^^^^^^^^^^ @@ -100,467 +97,15 @@ The improvements are... Improvements to clang-tidy -------------------------- -- Preprocessor-level module header parsing is now disabled by default due to - the problems it caused in C++20 and above, leading to performance and code - parsing issues regardless of whether modules were used or not. This change - will impact only the following checks: - :doc:`modernize-replace-disallow-copy-and-assign-macro - `, - :doc:`bugprone-reserved-identifier - `, and - :doc:`readability-identifier-naming - `. Those checks will no - longer see macros defined in modules. Users can still enable this - functionality using the newly added command line option - `--enable-module-headers-parsing`. - -- Remove configuration option `AnalyzeTemporaryDestructors`, which was deprecated since - :program:`clang-tidy` 16. - -- Improved `--dump-config` to print check options in alphabetical order. - -- Improved :program:`clang-tidy-diff.py` script. - * Return exit code `1` if any :program:`clang-tidy` subprocess exits with - a non-zero code or if exporting fixes fails. - - * Accept a directory as a value for `-export-fixes` to export individual - yaml files for each compilation unit. - - * Introduce a `-config-file` option that forwards a configuration file to - :program:`clang-tidy`. Corresponds to the `--config-file` option in - :program:`clang-tidy`. - -- Improved :program:`run-clang-tidy.py` script. It now accepts a directory - as a value for `-export-fixes` to export individual yaml files for each - compilation unit. - - New checks ^^^^^^^^^^ -- New :doc:`bugprone-casting-through-void - ` check. - - Detects unsafe or redundant two-step casting operations involving ``void*``. - -- New :doc:`bugprone-chained-comparison - ` check. - - Check detects chained comparison operators that can lead to unintended - behavior or logical errors. - -- New :doc:`bugprone-compare-pointer-to-member-virtual-function - ` check. - - Detects equality comparison between pointer to member virtual function and - anything other than null-pointer-constant. - -- New :doc:`bugprone-inc-dec-in-conditions - ` check. - - Detects when a variable is both incremented/decremented and referenced inside - a complex condition and suggests moving them outside to avoid ambiguity in - the variable's value. - -- New :doc:`bugprone-incorrect-enable-if - ` check. - - Detects incorrect usages of ``std::enable_if`` that don't name the nested - ``type`` type. - -- New :doc:`bugprone-multi-level-implicit-pointer-conversion - ` check. 
- - Detects implicit conversions between pointers of different levels of - indirection. - -- New :doc:`bugprone-optional-value-conversion - ` check. - - Detects potentially unintentional and redundant conversions where a value is - extracted from an optional-like type and then used to create a new instance - of the same optional-like type. - -- New :doc:`bugprone-unused-local-non-trivial-variable - ` check. - - Warns when a local non trivial variable is unused within a function. - -- New :doc:`cppcoreguidelines-no-suspend-with-lock - ` check. - - Flags coroutines that suspend while a lock guard is in scope at the - suspension point. - -- New :doc:`hicpp-ignored-remove-result - ` check. - - Ensure that the result of ``std::remove``, ``std::remove_if`` and - ``std::unique`` are not ignored according to rule 17.5.1. - -- New :doc:`misc-coroutine-hostile-raii - ` check. - - Detects when objects of certain hostile RAII types persists across suspension - points in a coroutine. Such hostile types include scoped-lockable types and - types belonging to a configurable denylist. - -- New :doc:`modernize-use-constraints - ` check. - - Replace ``enable_if`` with C++20 requires clauses. - -- New :doc:`modernize-use-starts-ends-with - ` check. - - Checks whether a ``find`` or ``rfind`` result is compared with 0 and suggests - replacing with ``starts_with`` when the method exists in the class. Notably, - this will work with ``std::string`` and ``std::string_view``. - -- New :doc:`modernize-use-std-numbers - ` check. - - Finds constants and function calls to math functions that can be replaced - with C++20's mathematical constants from the ``numbers`` header and - offers fix-it hints. - -- New :doc:`performance-enum-size - ` check. - - Recommends the smallest possible underlying type for an ``enum`` or ``enum`` - class based on the range of its enumerators. - -- New :doc:`readability-avoid-nested-conditional-operator - ` check. - - Identifies instances of nested conditional operators in the code. - -- New :doc:`readability-avoid-return-with-void-value - ` check. - - Finds return statements with ``void`` values used within functions with - ``void`` result types. - -- New :doc:`readability-redundant-casting - ` check. - - Detects explicit type casting operations that involve the same source and - destination types, and subsequently recommend their removal. - -- New :doc:`readability-redundant-inline-specifier - ` check. - - Detects redundant ``inline`` specifiers on function and variable declarations. - -- New :doc:`readability-reference-to-constructed-temporary - ` check. - - Detects C++ code where a reference variable is used to extend the lifetime - of a temporary object that has just been constructed. - New check aliases ^^^^^^^^^^^^^^^^^ -- New alias :doc:`cppcoreguidelines-macro-to-enum - ` to :doc:`modernize-macro-to-enum - ` was added. - Changes in existing checks ^^^^^^^^^^^^^^^^^^^^^^^^^^ -- Improved :doc:`abseil-string-find-startswith - ` check to also consider - ``std::basic_string_view`` in addition to ``std::basic_string`` by default. - -- Improved :doc:`bugprone-assert-side-effect - ` check to report usage of - non-const ``<<`` and ``>>`` operators in assertions and fixed some false-positives - with const operators. - -- Improved :doc:`bugprone-dangling-handle - ` check to support functional - casting during type conversions at variable initialization, now with improved - compatibility for C++17 and later versions. 
- -- Improved :doc:`bugprone-exception-escape - ` check by extending the default - check function names to include ``iter_swap`` and ``iter_move``. - -- Improved :doc:`bugprone-implicit-widening-of-multiplication-result - ` check - to correctly emit fixes. - -- Improved :doc:`bugprone-lambda-function-name - ` check by adding option - `IgnoreMacros` to ignore warnings in macros. - -- Improved :doc:`bugprone-non-zero-enum-to-bool-conversion - ` check by - eliminating false positives resulting from direct usage of bitwise operators. - -- Improved :doc:`bugprone-reserved-identifier - ` check, so that it does not - warn on macros starting with an underscore followed by a lowercase letter. - -- Improved :doc:`bugprone-sizeof-expression - ` check diagnostics to precisely - highlight specific locations, providing more accurate guidance. - -- Improved :doc:`bugprone-unchecked-optional-access - ` check, so that it does - not crash during handling of optional values. - -- Improved :doc:`bugprone-undefined-memory-manipulation - ` check to support - fixed-size arrays of non-trivial types. - -- Improved :doc:`bugprone-unused-return-value - ` check diagnostic message, - added support for detection of unused results when cast to non-``void`` type. - Casting to ``void`` no longer suppresses issues by default; control this - behavior with the new `AllowCastToVoid` option. - -- Improved :doc:`cppcoreguidelines-avoid-non-const-global-variables - ` check - to ignore ``static`` variables declared within the scope of - ``class``/``struct``. - -- Improved :doc:`cppcoreguidelines-avoid-reference-coroutine-parameters - ` - check to ignore false positives related to matching parameters of - non-coroutine functions and to increase issue detection for cases involving type - aliases with references. - -- Improved :doc:`cppcoreguidelines-missing-std-forward - ` check to - address false positives in the capture list and body of lambdas. - -- Improved :doc:`cppcoreguidelines-narrowing-conversions - ` check by - extending the `IgnoreConversionFromTypes` option to include types without a - declaration, such as built-in types. - -- Improved :doc:`cppcoreguidelines-prefer-member-initializer - ` check to - ignore delegating constructors and re-assignment of references, as well as cases where - initialization depends on a field that is initialized earlier. Additionally, it - now provides valid fixes for member variables initialized with macros. - -- Improved :doc:`cppcoreguidelines-pro-bounds-array-to-pointer-decay - ` check - to ignore predefined expressions (e.g., ``__func__``, ...). - -- Improved :doc:`cppcoreguidelines-pro-bounds-constant-array-index - ` check - to perform checks on derived classes of ``std::array``. - -- Improved :doc:`cppcoreguidelines-pro-type-const-cast - ` check to ignore - casts to ``const`` or ``volatile`` type (controlled by `StrictMode` option) - and casts in implicitly invoked code. - -- Improved :doc:`cppcoreguidelines-pro-type-member-init - ` check to ignore - dependent delegating constructors. - -- Improved :doc:`cppcoreguidelines-pro-type-static-cast-downcast - ` check to - disregard casts on non-polymorphic types when the `StrictMode` option is set - to `false`. - -- Improved :doc:`cppcoreguidelines-pro-type-vararg - ` check to ignore - false-positives in unevaluated context (e.g., ``decltype``, ``sizeof``, ...). - -- Improved :doc:`cppcoreguidelines-rvalue-reference-param-not-moved - ` check - to ignore unused parameters when they are marked as unused and parameters of - deleted functions and constructors.
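A minimal sketch of the pattern the improved ``cppcoreguidelines-missing-std-forward`` check targets; ``sink`` and ``wrapper`` are hypothetical names, not from the patch:

.. code-block:: c++

  #include <utility>

  void sink(int &);
  void sink(int &&);

  template <typename T>
  void wrapper(T &&value) {
    sink(value);  // flagged: forwarding reference parameter is never forwarded
    // intended: sink(std::forward<T>(value));
  }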
- -- Improved :doc:`google-readability-casting - ` check to ignore constructor - calls disguised as functional casts. - -- Improved :doc:`google-runtime-int ` - check to ignore false positives on user-defined literals. - -- Improved :doc:`llvm-namespace-comment - ` check to provide fixes for - ``inline`` namespaces in the same format as :program:`clang-format`. - -- Improved :doc:`llvmlibc-callee-namespace - ` to support a - customizable namespace. This matches the corresponding change made to the implementation - namespace. - -- Improved :doc:`llvmlibc-implementation-in-namespace - ` to support a - customizable namespace. This further allows for testing the libc when the - system-libc is also LLVM's libc. - -- Improved :doc:`llvmlibc-inline-function-decl - ` to properly ignore implicit - functions, such as struct constructors, and explicitly deleted functions. - -- Improved :doc:`misc-const-correctness - ` check to avoid false positives when - using a pointer to member function. Additionally, the check no longer emits - a diagnostic when a variable that is not type-dependent is an operand of a - type-dependent binary operator. Improved performance of the check through - optimizations. The check no longer emits a diagnostic for non-parameter-pack - variables in C++17 fold expressions. - -- Improved :doc:`misc-include-cleaner - ` check by adding option - `DeduplicateFindings` to output one finding per symbol occurrence, avoid - inserting the same header multiple times, and fix a bug where the `IgnoreHeaders` - option did not work with verbatim/std headers. - -- Improved :doc:`misc-redundant-expression - ` check to ignore - false-positives in unevaluated context (e.g., ``decltype``). - -- Improved :doc:`misc-static-assert - ` check to ignore false-positives when - referring to non-``constexpr`` variables in non-unevaluated context. - -- Improved :doc:`misc-unused-using-decls - ` check to avoid false positives when - the declaration is used in an elaborated type, and to only check C++ files. - -- Improved :doc:`modernize-avoid-bind - ` check to - not emit a ``return`` for fixes when the function returns ``void`` and to - provide valid fixes for cases involving bound C++ operators. - -- Improved :doc:`modernize-loop-convert - ` to support for-loops with - iterators initialized by free functions like ``begin``, ``end``, or ``size`` - and avoid a crash for arrays of dependent arrays and non-dereferenceable builtin - types used as iterators. - -- Improved :doc:`modernize-make-shared - ` check to support - ``std::shared_ptr`` implementations that inherit the ``reset`` method from a - base class. - -- Improved :doc:`modernize-return-braced-init-list - ` check to ignore - false-positives when constructing the container with ``count`` copies of - elements with value ``value``. - -- Improved :doc:`modernize-use-auto - ` to avoid creating incorrect fix hints - for pointer to array type and pointer to function type. - -- Improved :doc:`modernize-use-emplace - ` to not replace aggregates that - ``emplace`` cannot construct with aggregate initialization. - -- Improved :doc:`modernize-use-equals-delete - ` check to ignore - false-positives when the special member function is actually used or is implicit. - -- Improved :doc:`modernize-use-nullptr - ` check by adding option - `IgnoredTypes` that can be used to exclude some pointer types. - -- Improved :doc:`modernize-use-std-print - ` check to accurately generate - fixes for reordering arguments. - -- Improved :doc:`modernize-use-using - ` check to fix function pointer and - forward-declared ``typedef`` correctly.
Added option `IgnoreExternC` to ignore - ``typedef`` declarations in ``extern "C"`` scope. - -- Improved :doc:`performance-faster-string-find - ` check to properly escape - single quotes. - -- Improved :doc:`performance-for-range-copy - ` check to handle cases where - the loop variable is a structured binding. - -- Improved :doc:`performance-noexcept-move-constructor - ` to better handle - conditional ``noexcept`` expressions, eliminating false-positives. - -- Improved :doc:`performance-noexcept-swap - ` check to enforce a stricter - match with the swap function signature and better handling of conditional - ``noexcept`` expressions, eliminating false-positives. The ``iter_swap`` function - name is checked by default. - -- Improved :doc:`readability-braces-around-statements - ` check to - ignore false positives for ``if constexpr`` in lambda expressions. - -- Improved :doc:`readability-avoid-const-params-in-decls - ` diagnostics to - highlight the ``const`` location. - -- Improved :doc:`readability-container-contains - ` to correctly handle - integer literals with suffixes in fix-its. - -- Improved :doc:`readability-container-size-empty - ` check to - detect comparisons between strings and empty string literals and support the - ``length()`` method as an alternative to ``size()``. Resolved false positives - tied to negative values from size-like methods, and one triggered by size - checks below zero. - -- Improved :doc:`readability-function-size - ` check configuration to use - `none` rather than `-1` to disable some parameters. - -- Improved :doc:`readability-identifier-naming - ` check to issue accurate - warnings when a type's forward declaration precedes its definition. - Additionally, it now provides appropriate warnings for ``struct`` and - ``union`` in C, while also incorporating support for the - ``Leading_upper_snake_case`` naming convention. The handling of ``typedef`` - has been enhanced, particularly within complex types like function pointers - and cases where style checks were omitted when functions started with macros. - Added support for C++20 ``concept`` declarations. ``Camel_Snake_Case`` and - ``camel_Snake_Case`` now detect more invalid identifier names. Fields in - anonymous records (i.e. anonymous structs and unions) can now be checked with - the naming rules associated with their enclosing scopes rather than the naming - rules of public ``struct``/``union`` members. - -- Improved :doc:`readability-implicit-bool-conversion - ` check to take - do-while loops into account for the `AllowIntegerConditions` and - `AllowPointerConditions` options. It also now provides more consistent - suggestions when parentheses are added to the return value or expressions. - It also ignores false-positives for comparisons containing bool bitfields. - -- Improved :doc:`readability-misleading-indentation - ` check to ignore - false-positives for lines starting with an empty macro. - -- Improved :doc:`readability-non-const-parameter - ` check to ignore - false-positives in the initializer list of a record. - -- Improved :doc:`readability-redundant-member-init - ` check to now also - detect redundant in-class initializers. - -- Improved :doc:`readability-simplify-boolean-expr - ` check by adding the - new option `IgnoreMacros` that allows ignoring boolean expressions originating - from expanded macros. - -- Improved :doc:`readability-simplify-subscript-expr - ` check by extending - the default value of the `Types` option to include ``std::span``.
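To illustrate the ``modernize-use-using`` fix for function-pointer ``typedef``\ s noted above, a hedged before/after sketch (the type names are invented):

.. code-block:: c++

  // Before: a function-pointer typedef that the check previously rewrote incorrectly.
  typedef int (*OldCallback)(int, int);

  // After: the alias form now produced by the check's fix-it.
  using NewCallback = int (*)(int, int);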
- - -- Improved :doc:`readability-static-accessed-through-instance - ` check to - identify calls to static member functions with out-of-class inline definitions. - Removed checks ^^^^^^^^^^^^^^ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 060bc7669b72a..fc7511e913673 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -37,139 +37,17 @@ These changes are ones which we think may surprise users when upgrading to Clang |release| because of the opportunity they pose for disruption to existing code bases. -- Fix a bug with reversed arguments for templated operators. - This breaks code in C++20 which was previously accepted in C++17. - Clang did not properly diagnose such cases in C++20 before this change. E.g.: - - .. code-block:: cpp - - struct P {}; - template bool operator==(const P&, const S&); - - struct A : public P {}; - struct B : public P {}; - - // This equality is now ambiguous in C++20. - bool ambiguous(A a, B b) { return a == b; } - - template bool operator!=(const P&, const S&); - // Ok. Found a matching operator!=. - bool fine(A a, B b) { return a == b; } - - To reduce such widespread breakages, as an extension, Clang accepts this code - with the existing ``-Wambiguous-reversed-operator`` warning. - Fixes `GH `_. - -- The CMake variable ``GCC_INSTALL_PREFIX`` (which sets the default - ``--gcc-toolchain=``) is deprecated and will be removed. Specify - ``--gcc-install-dir=`` or ``--gcc-triple=`` in a `configuration file - ` as a - replacement. - (`#77537 `_) - C/C++ Language Potentially Breaking Changes ------------------------------------------- -- The default extension name for PCH generation (``-c -xc-header`` and ``-c - -xc++-header``) is now ``.pch`` instead of ``.gch``. -- ``-include a.h`` probing ``a.h.gch`` will now ignore ``a.h.gch`` if it is not - a Clang PCH file or a directory containing any Clang PCH file. -- Fixed a bug that caused ``__has_cpp_attribute`` and ``__has_c_attribute`` - to return incorrect values for some C++11-style attributes. Below is a complete - list of behavior changes. - - .. csv-table:: - :header: Test, Old value, New value - - ``__has_cpp_attribute(unused)``, 201603, 0 - ``__has_cpp_attribute(gnu::unused)``, 201603, 1 - ``__has_c_attribute(unused)``, 202106, 0 - ``__has_cpp_attribute(clang::fallthrough)``, 201603, 1 - ``__has_cpp_attribute(gnu::fallthrough)``, 201603, 1 - ``__has_c_attribute(gnu::fallthrough)``, 201910, 1 - ``__has_cpp_attribute(warn_unused_result)``, 201907, 0 - ``__has_cpp_attribute(clang::warn_unused_result)``, 201907, 1 - ``__has_cpp_attribute(gnu::warn_unused_result)``, 201907, 1 - ``__has_c_attribute(warn_unused_result)``, 202003, 0 - ``__has_c_attribute(gnu::warn_unused_result)``, 202003, 1 - -- Fixed a bug in finding matching `operator!=` while adding reversed `operator==` as - outlined in "The Equality Operator You Are Looking For" (`P2468 `_). - Fixes (`#68901: `_). - C++ Specific Potentially Breaking Changes ----------------------------------------- -- The name mangling rules for function templates have been changed to take into - account the possibility that functions could be overloaded on their template - parameter lists or requires-clauses. This causes mangled names to change for - function templates in the following cases: - - - When a template parameter in a function template depends on a previous - template parameter, such as ``template void f()``. - - When the function has any constraints, whether from constrained template - parameters or requires-clauses.
- - When the template parameter list includes a deduced type -- either - ``auto``, ``decltype(auto)``, or a deduced class template specialization - type. - - When a template template parameter is given a template template argument - that has a different template parameter list. - - This fixes a number of issues where valid programs would be rejected due to - mangling collisions, or would in some cases be silently miscompiled. Clang - will use the old manglings if ``-fclang-abi-compat=17`` or lower is - specified. - (`#48216 `_), - (`#49884 `_), and - (`#61273 `_) - -- The `ClassScopeFunctionSpecializationDecl` AST node has been removed. - Dependent class scope explicit function template specializations now use - `DependentFunctionTemplateSpecializationInfo` to store candidate primary - templates and explicit template arguments. This should not impact users of - Clang as a compiler, but it may break assumptions in Clang-based tools - iterating over the AST. - -- The warning `-Wenum-constexpr-conversion` is now also enabled by default on - system headers and macros. It will be turned into a hard (non-downgradable) - error in the next Clang release. - -- The flag `-fdelayed-template-parsing` won't be enabled by default with C++20 - when targeting MSVC to match the behavior of MSVC. - (`MSVC Docs `_) - -- Remove the hardcoded path to the imported modules for C++20 named modules. Now we - require all the dependent modules to be specified on the command line. - See (`#62707: `_). - -- Forbid `import XXX;` in C++ from finding a module `XXX` that comes from explicit Clang modules. - See (`#64755: `_). ABI Changes in This Version --------------------------- -- Following the SystemV ABI for x86-64, ``__int128`` arguments will no longer - be split between a register and a stack slot. AST Dumping Potentially Breaking Changes ---------------------------------------- -- When dumping a sugared type, Clang will no longer print the desugared type if - its textual representation is the same as the sugared one. This applies to - both text dumps of the form ``'foo':'foo'`` which will now be dumped as just - ``'foo'``, and JSON dumps of the form: - - .. code-block:: json - - "type": { - "qualType": "foo", - "desugaredQualType": "foo" - } - - which will now be dumped as just: - - .. code-block:: json - - "type": { - "qualType": "foo" - } What's New in Clang |release|? ============================== @@ -183,682 +61,48 @@ C++ Language Changes C++20 Feature Support ^^^^^^^^^^^^^^^^^^^^^ -- Implemented `P1907R1 ` which extends allowed non-type template argument - kinds with e.g. floating point values and pointers and references to subobjects. - This feature is still experimental. Accordingly, ``__cpp_nontype_template_args`` was not updated. - However, its support can be tested with ``__has_extension(cxx_generalized_nttp)``. C++23 Feature Support ^^^^^^^^^^^^^^^^^^^^^ -- Implemented `P0847R7: Deducing this `_. Some related core issues were also - implemented (`CWG2553 `_, `CWG2554 `_, - `CWG2653 `_, `CWG2687 `_). Because the - support for this feature is still experimental, the feature test macro ``__cpp_explicit_this_parameter`` - was not set in this version. - However, its support can be tested with ``__has_extension(cxx_explicit_this_parameter)``. - -- Added a separate warning to warn about the use of attributes on lambdas as a C++23 extension - in previous language versions: ``-Wc++23-lambda-attributes``. C++2c Feature Support ^^^^^^^^^^^^^^^^^^^^^ -- Implemented `P2169R4: A nice placeholder with no name `_.
This allows using ``_`` - as a variable name multiple times in the same scope and is supported in all C++ language modes as an extension. - An extension warning is produced when multiple variables are introduced by ``_`` in the same scope. - Unused warnings are no longer produced for variables named ``_``. - Currently, inspecting placeholder variables in a debugger when more than one is declared in the same scope - is not supported. - - .. code-block:: cpp - - struct S { - int _, _; // Was invalid, now OK - }; - void func() { - int _, _; // Was invalid, now OK - } - void other() { - int _; // Previously diagnosed under -Wunused, no longer diagnosed - } - -- Attributes now expect unevaluated strings in attribute parameters that are string literals. - This is applied to both C++ standard attributes and other attributes supported by Clang. - This completes the implementation of `P2361R6 Unevaluated Strings `_ - -- Implemented `P2864R2 Remove Deprecated Arithmetic Conversion on Enumerations From C++26 `_. - -- Implemented `P2308R1 Template parameter initialization `_. - This change is applied as a DR in all language modes. - - Resolutions to C++ Defect Reports ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- Implemented `CWG2137 `_ which allows - list-initialization from objects of the same type. -- Implemented `CWG2311 `_: given a prvalue ``e`` of object type - ``T``, ``T{e}`` will try to resolve an initializer list constructor and will use it if successful (CWG2137). - Otherwise, if there is no initializer list constructor, the copy will be elided as if it was ``T(e)``. - -- Implemented `CWG2598 `_ and `CWG2096 `_, - making unions (that have either no members or at least one literal member) literal types. - (`#77924: `_). - C Language Changes ------------------ -- ``structs``, ``unions``, and ``arrays`` that are const may now be used as - constant expressions. This change is more consistent with the behavior of - GCC. -- Enums will now be represented in TBAA metadata using their actual underlying - integer type. Previously they were treated as chars, which meant they could - alias with all other types. -- Clang now supports the C-only attribute ``counted_by``. When applied to a - struct's flexible array member, it points to the struct field that holds the - number of elements in the flexible array member. This information can improve - the results of the array bound sanitizer and the - ``__builtin_dynamic_object_size`` builtin. C23 Feature Support ^^^^^^^^^^^^^^^^^^^ -- Clang now accepts ``-std=c23`` and ``-std=gnu23`` as language standard modes, - and the ``__STDC_VERSION__`` macro now expands to ``202311L`` instead of its - previous placeholder value. Clang continues to accept ``-std=c2x`` and - ``-std=gnu2x`` as aliases for C23 and GNU C23, respectively. -- Clang now supports `requires c23` for module maps. -- Clang now supports ``N3007 Type inference for object definitions``. - -- Clang now supports ```` which defines several macros for performing - checked integer arithmetic. It is also exposed in pre-C23 modes. - -- Completed the implementation of - `N2508 `_. We - previously implemented allowing a label at the end of a compound statement, - and now we've implemented allowing a label to be followed by a declaration - instead of a statement. -- Implemented - `N2940 `_ which - removes support for trigraphs in C23 and later. In earlier language modes, - trigraphs remain enabled by default in conforming modes (e.g.
``-std=c17``) - and disabled by default in GNU and Microsoft modes (e.g., ``-std=gnu17`` or - ``-fms-compatibility``). If needed, you can enable trigraphs by passing - ``-ftrigraphs``. Non-comprehensive list of changes in this release ------------------------------------------------- -* Clang now has a ``__builtin_vectorelements()`` function that determines the number of elements in a vector. - For fixed-sized vectors, e.g., defined via ``__attribute__((vector_size(N)))`` or ARM NEON's vector types - (e.g., ``uint16x8_t``), this returns the constant number of elements at compile-time. - For scalable vectors, e.g., SVE or RISC-V V, the number of elements is not known at compile-time and is - determined at runtime. -* The ``__datasizeof`` keyword has been added. It is similar to ``sizeof`` - except that it returns the size of a type ignoring tail padding. -* ``__builtin_classify_type()`` now classifies ``_BitInt`` values as the return value ``18`` - and vector types as return value ``19``, to match GCC 14's behavior. -* The default value of `_MSC_VER` was raised from 1920 to 1933. -* Since MSVC 19.33 added undocumented attribute ``[[msvc::constexpr]]``, this release adds the attribute as well. - -* Added ``#pragma clang fp reciprocal``. - -* The version of Unicode used by Clang (primarily to parse identifiers) has been updated to 15.1. - New Compiler Flags ------------------ -* ``-fverify-intermediate-code`` and its complement ``-fno-verify-intermediate-code``. - Enables or disables verification of the generated LLVM IR. - Users can pass this to turn on extra verification to catch certain types of - compiler bugs at the cost of extra compile time. - Since enabling the verifier adds a non-trivial cost of a few percent to - build times, it's disabled by default, unless your LLVM distribution itself is - compiled with runtime checks enabled. -* ``-fkeep-system-includes`` modifies the behavior of the ``-E`` option, - preserving ``#include`` directives for "system" headers instead of copying - the preprocessed text to the output. This can greatly reduce the size of the - preprocessed output, which can be helpful when trying to reduce a test case. -* ``-fassume-nothrow-exception-dtor`` is added to assume that the destructor of - a thrown exception object will not throw. The generated code for catch - handlers will be smaller. A throw expression of a type with a - potentially-throwing destructor will lead to an error. - -* ``-fopenacc`` was added as a part of the effort to support OpenACC in Clang. - -* ``-fcx-limited-range`` enables the naive mathematical formulas for complex - division and multiplication with no NaN checking of results. The default is - ``-fno-cx-limited-range``, but this option is enabled by ``-ffast-math``. - -* ``-fcx-fortran-rules`` enables the naive mathematical formulas for complex - multiplication and enables application of Smith's algorithm for complex - division. See SMITH, R. L. Algorithm 116: Complex division. Commun. ACM 5, 8 - (1962). The default is ``-fno-cx-fortran-rules``. - -* ``-fvisibility-global-new-delete=`` gives more freedom to users to - control how and whether Clang forces a visibility for the replaceable new and - delete declarations. The option takes 4 values: ``force-hidden``, - ``force-protected``, ``force-default`` and ``source``; ``force-default`` is - the default. Option values with prefix ``force-`` assign such declarations - an implicit visibility attribute with the corresponding visibility.
An option - value of ``source`` implies that no implicit attribute is added. Without the - attribute the replaceable global new and delete operators behave normally - (like other functions) with respect to visibility attributes, pragmas and - options (e.g., ``-fvisibility=``). - Deprecated Compiler Flags ------------------------- Modified Compiler Flags ----------------------- -* ``-Woverriding-t-option`` is renamed to ``-Woverriding-option``. -* ``-Winterrupt-service-routine`` is renamed to ``-Wexcessive-regsave`` as a generalization. -* ``-frewrite-includes`` now guards the original #include directives with - ``__CLANG_REWRITTEN_INCLUDES``, and ``__CLANG_REWRITTEN_SYSTEM_INCLUDES`` as - appropriate. -* Introducing a new default calling convention for ``-fdefault-calling-conv``: - ``rtdcall``. This new default CC only works for M68k and will use the new - ``m68k_rtdcc`` CC on every function that is not variadic. The ``-mrtd`` - driver/frontend flag has the same effect when targeting M68k. -* ``-fvisibility-global-new-delete-hidden`` is now a deprecated spelling of - ``-fvisibility-global-new-delete=force-hidden`` (``-fvisibility-global-new-delete=`` - is new in this release). - Removed Compiler Flags ------------------------- -* ``-enable-trivial-auto-var-init-zero-knowing-it-will-be-removed-from-clang`` has been removed. - It has not been needed to enable ``-ftrivial-auto-var-init=zero`` since Clang 16. - Attribute Changes in Clang -------------------------- -- On X86, a warning is now emitted if a function with ``__attribute__((no_caller_saved_registers))`` - calls a function without ``__attribute__((no_caller_saved_registers))``, and is not compiled with - ``-mgeneral-regs-only`` -- On X86, a function with ``__attribute__((interrupt))`` can now call a function without - ``__attribute__((no_caller_saved_registers))`` provided that it is compiled with - ``-mgeneral-regs-only`` - -- When a non-variadic function is decorated with the ``format`` attribute, - Clang now checks that the format string would match the function's parameters' - types after default argument promotion. As a result, it's no longer an - automatic diagnostic to use parameters of types that the format style - supports but that are never the result of default argument promotion, such as - ``float``. (`#59824: `_) - -- Clang now supports ``[[clang::preferred_type(type-name)]]`` as an attribute - which can be applied to a bit-field. This attribute helps to map a bit-field - back to a particular type that may be better-suited to representing the bit- - field but cannot be used for other reasons and will impact the debug - information generated for the bit-field. This is most useful when mapping a - bit-field of basic integer type back to a ``bool`` or an enumeration type, - e.g., - - .. code-block:: c++ - - enum E { Apple, Orange, Pear }; - struct S { - [[clang::preferred_type(E)]] unsigned FruitKind : 2; - }; - - When viewing ``S::FruitKind`` in a debugger, it will behave as if the member - was declared as type ``E`` rather than ``unsigned``. - -- Clang now warns that the ``_Alignas`` attribute on declaration specifiers - is ignored; previously it incorrectly suggested moving it past - declaration specifiers. (`#58637 `_) - -- Clang now provides the ``[[clang::coro_only_destroy_when_complete]]`` attribute - to reduce the size of the destroy functions for coroutines which are known to - be destroyed after having reached the final suspend point.
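A hedged sketch of the new X86 ``no_caller_saved_registers`` diagnostic described above; the function names are invented, and the warning fires only when the file is not compiled with ``-mgeneral-regs-only``:

.. code-block:: c++

  __attribute__((no_caller_saved_registers)) void helper();
  void plain();

  __attribute__((no_caller_saved_registers)) void handler() {
    helper(); // OK: the callee also preserves all registers
    plain();  // warning: the callee may clobber registers this caller does not save
  }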
- -- Clang now provides the ``[[clang::coro_return_type]]`` and ``[[clang::coro_wrapper]]`` - attributes. A function returning a type marked with ``[[clang::coro_return_type]]`` - should be a coroutine. A non-coroutine function marked with ``[[clang::coro_wrapper]]`` - is still allowed to return such a type. This is helpful for analyzers to recognize coroutines from function signatures. - -- Clang now supports ``[[clang::code_align(N)]]`` as an attribute which can be - applied to a loop and specifies the byte alignment for the loop. This attribute - accepts a positive integer constant initialization expression indicating the - number of bytes for the minimum alignment boundary. Its value must be a power - of 2, between 1 and 4096 (inclusive). - - .. code-block:: c++ - - void Array(int *array, size_t n) { - [[clang::code_align(64)]] for (int i = 0; i < n; ++i) array[i] = 0; - } - - template - void func() { - [[clang::code_align(A)]] for(;;) { } - } - -- Clang now provides the ``[[clang::coro_lifetimebound]]`` attribute. - All parameters of a function are considered to be lifetime bound if the function - returns a type annotated with ``[[clang::coro_lifetimebound]]`` and ``[[clang::coro_return_type]]``. - This analysis can be disabled for a function by annotating the function with ``[[clang::coro_disable_lifetimebound]]``. Improvements to Clang's diagnostics ----------------------------------- -- Clang's constexpr evaluator now prints template arguments when displaying - template-specialization function calls. -- Clang's constexpr evaluator now displays notes as well as an error when a constructor - of a base class is not called in the constructor of its derived class. -- Clang no longer emits ``-Wmissing-variable-declarations`` for variables declared - with the ``register`` storage class. -- Clang's ``-Wswitch-default`` flag now diagnoses whenever a ``switch`` statement - does not have a ``default`` label. -- Clang's ``-Wtautological-negation-compare`` flag now diagnoses logical - tautologies like ``x && !x`` and ``!x || x`` in expressions. This also - makes ``-Winfinite-recursion`` diagnose more cases. - (`#56035: `_). -- Clang's constexpr evaluator now diagnoses compound assignment operators against - uninitialized variables as a read of an uninitialized object. - (`#51536 `_) -- Clang's ``-Wformat-truncation`` now diagnoses ``snprintf`` calls that are known to - result in string truncation. - (`#64871: `_). - Existing warnings that similarly warn about the overflow in ``sprintf`` - now fall under their own warning group ``-Wformat-overflow`` so that they can - be disabled separately from ``-Wfortify-source``. - These two new warning groups have subgroups ``-Wformat-truncation-non-kprintf`` - and ``-Wformat-overflow-non-kprintf``, respectively. These subgroups are used when - the format string contains the ``%p`` format specifier. - Because the Linux kernel's codebase has format extensions for ``%p``, kernel developers - are encouraged to disable these two subgroups by setting ``-Wno-format-truncation-non-kprintf`` - and ``-Wno-format-overflow-non-kprintf`` in order to avoid false positives on - the kernel codebase. - Clang also no longer emits false positive warnings about the output length of - the ``%g`` format specifier and about ``%o, %x, %X`` with the ``#`` flag. -- Clang now emits ``-Wcast-qual`` for functional-style cast expressions. -- Clang no longer emits irrelevant notes about unsatisfied constraint expressions - on the left-hand side of ``||`` when the right-hand side constraint is satisfied.
- (`#54678: `_). -- Clang now prints its 'note' diagnostic in cyan instead of black, to be more compatible - with terminals with dark background colors. This is also more consistent with GCC. -- Clang now displays an improved diagnostic and a note when a defaulted special - member is marked ``constexpr`` in a class with a virtual base class - (`#64843: `_). -- ``-Wfixed-enum-extension`` and ``-Wmicrosoft-fixed-enum`` diagnostics are no longer - emitted when building as C23, since C23 standardizes support for enums with a - fixed underlying type. -- When describing the failure of a static assertion of an `==` expression, Clang prints the integer - representation of the value as well as its character representation when - the user-provided expression is of character type. If the character is - non-printable, Clang now shows the escaped character. - Clang also prints multi-byte characters if the user-provided expression - is of multi-byte character type. - - *Example Code*: - - .. code-block:: c++ - - static_assert("A\n"[1] == U'🌍'); - - *BEFORE*: - - .. code-block:: text - - source:1:15: error: static assertion failed due to requirement '"A\n"[1] == U'\U0001f30d'' - 1 | static_assert("A\n"[1] == U'🌍'); - | ^~~~~~~~~~~~~~~~~ - source:1:24: note: expression evaluates to '' - ' == 127757' - 1 | static_assert("A\n"[1] == U'🌍'); - | ~~~~~~~~~^~~~~~~~ - - *AFTER*: - - .. code-block:: text - - source:1:15: error: static assertion failed due to requirement '"A\n"[1] == U'\U0001f30d'' - 1 | static_assert("A\n"[1] == U'🌍'); - | ^~~~~~~~~~~~~~~~~ - source:1:24: note: expression evaluates to ''\n' (0x0A, 10) == U'🌍' (0x1F30D, 127757)' - 1 | static_assert("A\n"[1] == U'🌍'); - | ~~~~~~~~~^~~~~~~~ -- Clang now always diagnoses when using non-standard layout types in ``offsetof``. - (`#64619: `_) -- Clang now diagnoses a defaulted constructor that is redefined with a - different exception specification. - (`#69094: `_) -- Clang now diagnoses use of variable-length arrays in C++ by default (and - under ``-Wall`` in GNU++ mode). This is an extension supported by Clang and - GCC, but is very easy to accidentally use without realizing it's a - nonportable construct that has different semantics from a constant-sized - array. (`#62836 `_) - -- Clang changed the order in which it displays candidate functions on overloading failures. - Previously, Clang used the definition of ordering from the C++ Standard. The order defined in - the Standard is partial and is not suited for sorting. Instead, Clang now uses a strict - order that still attempts to push more relevant functions to the top by comparing their - corresponding conversions. In some cases, this results in a better order. E.g., for the - following code - - .. code-block:: cpp - - struct Foo { - operator int(); - operator const char*(); - }; - - void test() { Foo() - Foo(); } - - Clang now produces a list with the two most relevant builtin operators at the top, - i.e. ``operator-(int, int)`` and ``operator-(const char*, const char*)``. - Previously ``operator-(const char*, const char*)`` was the first element, - but ``operator-(int, int)`` was only the 13th element in the output. - However, the new implementation does not take into account some aspects of - C++ semantics, e.g. which function template is more specialized. This - can sometimes lead to worse ordering. - - -- Previously, when describing a warning/error in a function-style type conversion, Clang underlined only until - the end of the expression being converted from.
Now Clang underlines until the closing parenthesis. - - Before: - - .. code-block:: text - - warning: cast from 'long (*)(const int &)' to 'decltype(fun_ptr)' (aka 'long (*)(int &)') converts to incompatible function type [-Wcast-function-type-strict] - 24 | return decltype(fun_ptr)( f_ptr /*comment*/); - | ^~~~~~~~~~~~~~~~~~~~~~~~ - - After: - - .. code-block:: text - - warning: cast from 'long (*)(const int &)' to 'decltype(fun_ptr)' (aka 'long (*)(int &)') converts to incompatible function type [-Wcast-function-type-strict] - 24 | return decltype(fun_ptr)( f_ptr /*comment*/); - | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - -- ``-Wzero-as-null-pointer-constant`` diagnostic is no longer emitted when using ``__null`` - (or, more commonly, ``NULL`` when the platform defines it as ``__null``) to be more consistent - with GCC. -- Clang will warn on deprecated specializations used in system headers when their instantiation - is caused by user code. -- Clang will now print ``static_assert`` failure details for arithmetic binary operators. - Example: - - .. code-block:: cpp - - static_assert(1 << 4 == 15); - - will now print: - - .. code-block:: text - - error: static assertion failed due to requirement '1 << 4 == 15' - 48 | static_assert(1 << 4 == 15); - | ^~~~~~~~~~~~ - note: expression evaluates to '16 == 15' - 48 | static_assert(1 << 4 == 15); - | ~~~~~~~^~~~~ - -- Clang now diagnoses definitions of friend function specializations, e.g. ``friend void f<>(int) {}``. -- Clang now diagnoses narrowing conversions involving const references. - (`#63151: `_). -- Clang now diagnoses unexpanded packs within the template argument lists of function template specializations. -- The warning `-Wnan-infinity-disabled` is now emitted when ``INFINITY`` - or ``NAN`` are used in arithmetic operations or function arguments in - floating-point mode where ``INFINITY`` or ``NAN`` don't have the expected - values. - -- Clang now diagnoses attempts to bind a bitfield to an NTTP of a reference type as an erroneous - converted constant expression and not as a reference to a subobject. -- Clang now diagnoses ``auto`` and ``decltype(auto)`` in declarations of conversion function templates - (`CWG1878: `_) -- Clang now diagnoses the requirement that non-template friend declarations with requires clauses - and template friend declarations with a constraint that depends on a template parameter from an - enclosing template must be a definition. -- Clang now diagnoses incorrect usage of ``const`` and ``pure`` attributes, so ``-Wignored-attributes`` diagnoses more cases. -- Clang now emits more descriptive diagnostics for 'unusual' expressions (e.g. incomplete index - expressions on matrix types or builtin functions without an argument list) as placement-args - to new-expressions. - - Before: - - .. code-block:: text - - error: no matching function for call to 'operator new' - 13 | new (__builtin_memset) S {}; - | ^ ~~~~~~~~~~~~~~~~~~ - - note: candidate function not viable: no known conversion from '' to 'int' for 2nd argument - 5 | void* operator new(__SIZE_TYPE__, int); - | ^ - - After: - - .. code-block:: text - - error: builtin functions must be directly called - 13 | new (__builtin_memset) S {}; - | ^ - -- Clang now diagnoses ``import`` statements that appear before the module declaration, but not those in the global - module fragment. - (`#67627: `_). - -- Clang now diagnoses ``#include`` of headers with angle brackets in module purviews, which is - usually not intended.
- (`#68615 `_) - -- Clang no longer mentions an invisible namespace when diagnosing invisible declarations - inside a namespace; the original diagnostic message was confusing. - (`#73893: `_) Improvements to Clang's time-trace ---------------------------------- -- Two time-trace scope variables are added. A time trace scope variable of - ``ParseDeclarationOrFunctionDefinition`` with the function's source location - is added to record the time spent parsing the function's declaration or - definition. Another time trace scope variable of ``ParseFunctionDefinition`` - is also added to record the name of the defined function. Bug Fixes in This Version ------------------------- -- Fixed an issue where a class template specialization whose declaration is - instantiated in one module and whose definition is instantiated in another - module may end up with members associated with the wrong declaration of the - class, which can result in miscompiles in some cases. -- Fix crash on use of a variadic overloaded operator. - (`#42535 `_) -- Fix a hang on valid C code passing a function type as an argument to - ``typeof`` to form a function declaration. - (`#64713 `_) -- Clang now reports a missing-field-initializers warning for missing designated - initializers in C++. - (`#56628 `_) -- Clang now respects ``-fwrapv`` and ``-ftrapv`` for ``__builtin_abs`` and - ``abs`` builtins. - (`#45129 `_, - `#45794 `_) -- Fixed an issue where accesses to the local variables of a coroutine during - ``await_suspend`` could be misoptimized, including accesses to the awaiter - object itself. - (`#56301 `_) - The current solution may bring performance regressions if the awaiters have - non-static data members. See - `#64945 `_ for details. -- Clang now prints unnamed members in diagnostic messages instead of giving an - empty ''. Fixes - (`#63759 `_) -- Fix crash in __builtin_strncmp and related builtins when the size value - exceeded the maximum value representable by int64_t. Fixes - (`#64876 `_) -- Fixed an assertion if a function has cleanups and fatal errors. - (`#48974 `_) -- Clang now emits an error if it is not possible to deduce the array size for a - variable with an incomplete array type. - (`#37257 `_) -- Clang's ``-Wunused-private-field`` no longer warns on fields whose type is - declared with ``[[maybe_unused]]``. - (`#61334 `_) -- For function multi-versioning using the ``target``, ``target_clones``, or - ``target_version`` attributes, remove comdat for internal linkage functions. - (`#65114 `_) -- Clang now reports ``-Wformat`` for bool value and char specifier confusion - in scanf. Fixes - (`#64987 `_) -- Support MSVC predefined macro expressions in constant expressions and in - local structs. -- Correctly parse non-ASCII identifiers that appear immediately after a line splice - (`#65156 `_) -- Clang no longer considers the loss of the ``__unaligned`` qualifier from objects as - an invalid conversion during method function overload resolution. -- Fix lack of comparison of declRefExpr in ASTStructuralEquivalence - (`#66047 `_) -- Fix parser crash when dealing with ill-formed Objective-C++ header code. Fixes - (`#64836 `_) -- Fix crash in implicit conversions from an initializer list to arrays of unknown - bound for C++20. Fixes - (`#62945 `_) -- Clang now allows an ``_Atomic``-qualified integer in a switch statement. Fixes - (`#65557 `_) -- Fixes crash when trying to obtain the common sugared type of - `decltype(instantiation-dependent-expr)`.
- Fixes (`#67603 `_) -- Fixes a crash caused by a multidimensional array being captured by a lambda - (`#67722 `_). -- Fixes a crash when instantiating a lambda with a requires clause. - (`#64462 `_) -- Fixes a regression where the ``UserDefinedLiteral`` was not properly preserved - while evaluating consteval functions. (`#63898 `_). -- Fix a crash when evaluating value-dependent structured binding - variables at compile time. - Fixes (`#67690 `_) -- Fixes a ``clang-17`` regression where ``LLVM_UNREACHABLE_OPTIMIZE=OFF`` - cannot be used with ``Release`` mode builds. (`#68237 `_). -- Fix crash in evaluating a ``constexpr`` value for an invalid template function. - Fixes (`#68542 `_) -- Clang will correctly evaluate ``noexcept`` expressions for template functions - of template classes. Fixes - (`#68543 `_, - `#42496 `_, - `#77071 `_, - `#77411 `_) -- Fixed an issue where a shift count larger than ``__INT64_MAX__``, in a right - shift operation, could result in missing warnings about - ``shift count >= width of type`` or an internal compiler error. -- Fixed an issue with computing the common type for the LHS and RHS of a `?:` - operator in C. No longer issuing a confusing diagnostic along the lines of - "incompatible operand types ('foo' and 'foo')" with extensions such as matrix - types. Fixes (`#69008 `_) -- Clang no longer permits using the `_BitInt` types as an underlying type for an - enumeration as specified in the C23 Standard. - Fixes (`#69619 `_) -- Fixed an issue where a shift count specified by a small constant ``_BitInt()``, - in a left shift operation, could result in faulty warnings about - ``shift count >= width of type``. -- Clang now accepts anonymous members initialized with designated initializers - inside templates. - Fixes (`#65143 `_) -- Fix crash in formatting the real/imaginary part of a complex lvalue. - Fixes (`#69218 `_) -- No longer use C++ ``thread_local`` semantics in C23 when using - ``thread_local`` instead of ``_Thread_local``. - Fixes (`#70068 `_) and - (`#69167 `_) -- Fix crash in evaluating an invalid lambda expression that forgets to capture ``this``. - Fixes (`#67687 `_) -- Fix crash from the constexpr evaluator evaluating uninitialized arrays as rvalues. - Fixes (`#67317 `_) -- Clang now properly diagnoses use of stand-alone OpenMP directives after a - label (including ``case`` or ``default`` labels). - - Before: - - .. code-block:: c++ - - label: - #pragma omp barrier // ok - - After: - - .. code-block:: c++ - - label: - #pragma omp barrier // error: '#pragma omp barrier' cannot be an immediate substatement - -- Fix compiler memory leak for enums with underlying type larger than 64 bits. - Fixes (`#78311 `_) - -- Fixed an issue where a benign assertion might fire when instantiating a pack expansion - inside a lambda. (`#61460 `_) -- Fix crash during instantiation of some class template specializations within class - templates. Fixes (`#70375 `_) -- Fix crash during code generation of C++ coroutine initial suspend when the return - type of await_resume is not trivially destructible. - Fixes (`#63803 `_) -- ``__is_trivially_relocatable`` no longer returns true for non-object types - such as references and functions. - Fixes (`#67498 `_) -- Fix crash when the object used as a ``static_assert`` message has ``size`` or ``data`` members - which are not member functions. -- Support UDLs in the ``static_assert`` message. -- Fixed false positive error emitted by Clang when performing qualified name - lookup and the current class instantiation has dependent bases.
- Fixes (`#13826 `_) -- Fix a ``clang-17`` regression where a templated friend with constraints is not - properly applied when its parameters reference an enclosing non-template class. - Fixes (`#71595 `_) -- Fix the name of the ifunc symbol emitted for multiversion functions declared with the - ``target_clones`` attribute. This addresses a linker error that would otherwise occur - when these functions are referenced from other TUs. -- Fixed a compile error where the double colon operator could not resolve a macro with parentheses. - Fixes (`#64467 `_) -- Clang's ``-Wchar-subscripts`` no longer warns on chars whose values are known non-negative constants. - Fixes (`#18763 `_) -- Fix crash due to incorrectly allowing conversion functions in copy elision. - Fixes (`#39319 `_) and - (`#60182 `_) and - (`#62157 `_) and - (`#64885 `_) and - (`#65568 `_) -- Fix an issue where Clang did not respect default template arguments that - are added in a later redeclaration for CTAD. - Fixes (`#69987 `_) -- Fix an issue where CTAD fails for explicit type conversion. - Fixes (`#64347 `_) -- Fix a crash when using C++-only tokens such as ``::`` when Clang is compiling C. - Fixes (`#73559 `_) -- Clang now accepts recursive non-dependent calls to functions with deduced - return type. - Fixes (`#71015 `_) -- Fix an assertion failure when initializing a union containing a struct with a - flexible array member using an empty initializer list. - Fixes (`#77085 `_) -- Fix assertion crash due to failed scope restoring caused by too-early VarDecl - invalidation by an invalid initializer Expr. - Fixes (`#30908 `_) -- Clang now emits correct source locations for code-coverage regions in `if constexpr` - and `if consteval` branches. Untaken branches are now skipped. - Fixes (`#54419 `_) -- Fix an assertion failure when declaring a template friend function with - a constrained parameter in a template class that declares a class method - or lambda at a different depth. - Fixes (`#75426 `_) -- Fix an issue where Clang could not find a conversion function with a template - parameter during instantiation of a template class. - Fixes (`#77583 `_) -- Fix an issue where CTAD fails for function-type/array-type arguments. - Fixes (`#51710 `_) -- Fix crashes when using the binding decl from an invalid structured binding. - Fixes (`#67495 `_) and - (`#72198 `_) -- Fix an assertion failure when calling a ``noreturn``-attributed function with the - ``musttail`` attribute. - Fixes (`#76631 `_) - - The MS ``__noop`` builtin without an argument list is now accepted - in the placement-args of new-expressions, matching MSVC's behaviour. -- Fix an issue that caused MS ``__declspec(property)`` accesses as well as - Objective-C++ property accesses to not be converted to a function call - to the getter in the placement-args of new-expressions. - Fixes (`#65053 `_) -- Fix an issue with missing symbol definitions when the first coroutine - statement appears in a discarded ``if constexpr`` branch. - Fixes (`#78290 `_) -- Fixed assertion failure with deleted overloaded unary operators. - Fixes (`#78314 `_) Bug Fixes to Compiler Builtins ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -869,363 +113,41 @@ Bug Fixes to Attribute Support Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ -- Clang limits the size of arrays it will try to evaluate at compile time - to avoid memory exhaustion. - This limit can be modified by `-fconstexpr-steps`. - (`#63562 `_) - -- Fix a crash caused by some named Unicode escape sequences designating - a Unicode character whose name contains a ``-``.
- (Fixes `#64161 `_) - -- Fix cases where we ignore ambiguous name lookup when looking up members. - (`#22413 `_), - (`#29942 `_), - (`#35574 `_) and - (`#27224 `_). - -- Clang emits an error on substitution failure within a lambda body inside a - requires-expression. This fixes: - (`#64138 `_) and - (`#71684 `_). - -- Update ``FunctionDeclBitfields.NumFunctionDeclBits``. This fixes: - (`#64171 `_). - -- Expressions producing ``nullptr`` are correctly evaluated - by the constant interpreter when appearing as the operand - of a binary comparison. - (`#64923 `_) - -- Fix a crash when an immediate invocation is not a constant expression - and appears in an implicit cast. - (`#64949 `_). - -- Fix crash when parsing an ill-formed lambda trailing return type. Fixes: - (`#64962 `_) and - (`#28679 `_). - -- Fix a crash caused by substitution failure in expression requirements. - (`#64172 `_) and - (`#64723 `_). - -- Fix crash when parsing the requires clause of some generic lambdas. - (`#64689 `_) - -- Fix crash when the trailing return type of a generic and dependent - lambda refers to an init-capture. - (`#65067 `_ and - `#63675 `_) - -- Clang now properly handles out-of-line template specializations when there is - a non-template inner class between the function and the class template. - (`#65810 `_) - -- Fix a crash when calling a non-constant immediate function - in the initializer of a static data member. - (`#65985 `_). -- Clang now properly converts the static lambda call operator to function - pointers on win32. - (`#62594 `_) - -- Fixed some cases where the source location for an instantiated specialization - of a function template or a member function of a class template was assigned - the location of a non-defining declaration rather than the location of the - definition the specialization was instantiated from. - (`#26057 `_) - -- Fix a crash when a default member initializer of a base aggregate - makes an invalid call to an immediate function. - (`#66324 `_) - -- Fix crash for a lambda attribute with a statement expression - that contains a `return`. - (`#48527 `_) - -- Clang no longer asserts when an UnresolvedLookupExpr is used as an - expression requirement. (`#66612 <https://github.com/llvm/llvm-project/issues/66612>`_) - -- Clang now disambiguates NTTP types when printing diagnostics where the - NTTP types are compared with the 'diff' method. - (`#66744 <https://github.com/llvm/llvm-project/issues/66744>`_) - -- Fix crash caused by a spaceship operator returning a comparison category by - reference. Fixes: - (`#64162 `_) -- Fix a crash when calling a consteval function in an expression used as - the size of an array. - (`#65520 `_) - -- Clang no longer tries to capture non-odr-used variables that appear - in the enclosing expression of a lambda expression with a noexcept specifier. - (`#67492 `_) - -- Fix crash when a fold expression was used in the initialization of a default - argument. Fixes: - (`#67395 `_) - -- Fixed a bug causing destructors of constant-evaluated structured bindings - initialized by array elements to be called in the wrong evaluation context. - -- Fix crash where ill-formed code was being treated as a deduction guide; - we now produce a diagnostic. Fixes: - (`#65522 `_) - -- Fixed a bug where Clang incorrectly considered implicitly generated deduction - guides from a non-templated constructor and a templated constructor as ambiguous, - rather than preferring the non-templated constructor as specified in - [standard.group]p3.
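A minimal reproduction sketch for the consteval-in-array-bound fix mentioned above (#65520); the function name is illustrative, not from the patch:

.. code-block:: c++

  consteval int dimension() { return 4; }

  // Calling a consteval function in the array-bound expression previously
  // crashed Clang; it is now evaluated as a constant expression.
  int values[dimension()];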
- -- Fixed a crash caused by incorrect handling of dependence on variable templates - with non-type template parameters of reference type. Fixes: - (`#65153 `_) - -- Clang now properly compares constraints on an out-of-line class template - declaration definition. Fixes: - (`#61763 `_) - -- Fix a bug where implicit deduction guides were not correctly generated for nested template - classes. Fixes: - (`#46200 `_) - (`#57812 `_) - -- Diagnose use of a variable-length array in a coroutine. The design of - coroutines is such that it is not possible to support VLA use. Fixes: - (`#65858 `_) - -- Fix bug where we were overriding zero-initialization of class members when - default initializing a base class in a constant expression context. Fixes: - (`#69890 `_) - -- Fix a crash when a template class static member is imported into another translation unit. - Fixes: - (`#68769 `_) - -- Clang now rejects incomplete types for ``__builtin_dump_struct``. Fixes: - (`#63506 `_) - -- Fixed a crash for C++98/03 while checking an ill-formed ``_Static_assert`` expression. - Fixes: (`#72025 `_) - -- Clang now defers the instantiation of the explicit specifier until constraint checking - completes (except for deduction guides). Fixes: - (`#59827 `_) - -- Fix crash when parsing a nested requirement. Fixes: - (`#73112 `_) - -- Fixed a crash caused by using a return type requirement in a lambda. Fixes: - (`#63808 `_) - (`#64607 `_) - (`#64086 `_) - -- Fixed a crash where we lost uninstantiated constraints on placeholder NTTP packs. Fixes: - (`#63837 `_) - -- Fixed a regression where Clang forgot how to substitute into constraints on template-template - parameters. Fixes: - (`#57410 `_) and - (`#76604 `_) - -- Fix a bug where Clang would produce inconsistent values when - ``std::source_location::current()`` was used in a function template. - Fixes (`#78128 `_) - -- Clang now allows parenthesized initialization of arrays in `operator new[]`. - Fixes: (`#68198 `_) - -- Fixes CTAD for aggregates on nested template classes. Fixes: - (`#77599 `_) - -- Fix a crash when importing the same module with a dynamic initializer twice - with different visibility. - Fixes (`#67893 `_) - -- Fix a false-positive ODR violation for different definitions of `std::align_val_t`. - Fixes (`#76638 `_) - -- Remove recorded `#pragma once` state for headers included in named modules. - Fixes (`#77995 `_) - -- Set the ``__cpp_auto_cast`` feature test macro in C++23 mode. - -- Fix a crash caused by inconsistent deduction state of function return types - when importing modules. - Fixes (`#78830 `_) - Fixes (`#60085 `_) - Bug Fixes to AST Handling ^^^^^^^^^^^^^^^^^^^^^^^^^ -- Fixed an import failure of a recursive friend class template. - `Issue 64169 `_ -- Remove unnecessary RecordLayout computation when importing UnaryOperator. The - computed RecordLayout is incorrect if fields are not completely imported and - should not be cached. - `Issue 64170 `_ -- Fixed ``hasAnyBase`` not binding nodes in its submatcher. - (`#65421 `_) -- Fixed a bug where RecursiveASTVisitor fails to visit the - initializer of a bitfield. - `Issue 64916 `_ -- Fixed a bug where range-loop-analysis checks for trivial copyability, - rather than trivial copy-constructibility. - `Issue 47355 `_ -- Fixed a bug where template instantiation failed to handle lambda expressions - with certain types of attributes.
- (`#76521 `_) Miscellaneous Bug Fixes ^^^^^^^^^^^^^^^^^^^^^^^ Miscellaneous Clang Crashes Fixed ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ -- Fixed a crash when parsing top-level ObjC blocks that aren't properly - terminated. Clang should now also recover better when an @end is missing - between blocks. - `Issue 64065 `_ -- Fixed a crash when checking array access on a zero-length element. - `Issue 64564 `_ -- Fixed a crash when an ObjC ivar has an invalid type. See - (`#68001 `_) -- Fixed a crash in C when a redefined struct is another nested redefinition. - `Issue 41302 `_ -- Fixed a crash when ``-ast-dump=json`` was used for code using class - template deduction guides. -- Fixed a crash when a lambda marked as ``static`` referenced a captured - variable in an expression. - `Issue 74608 `_ -- Fixed a crash with modules and a ``constexpr`` destructor. - `Issue 68702 `_ - OpenACC Specific Changes ------------------------ -- The OpenACC implementation effort is beginning with semantic analysis and parsing - of OpenACC pragmas. The ``-fopenacc`` flag was added to enable these new, - albeit incomplete, changes. The ``_OPENACC`` macro is currently defined to - ``1``, as support is too incomplete to update to a standards-required value. -- Added ``-fexperimental-openacc-macro-override``, a command line option to - permit overriding the ``_OPENACC`` macro to be any digit-only value specified - by the user, which permits testing the compiler against existing OpenACC - workloads in order to evaluate implementation progress. Target Specific Changes ----------------------- AMDGPU Support ^^^^^^^^^^^^^^ -- Use pass-by-reference (byref) instead of pass-by-value (byval) for struct - arguments in C ABI. The callee is responsible for allocating stack memory and - copying the value of the struct if modified. Note that the AMDGPU backend still - supports byval for struct arguments. -- The default value for ``-mcode-object-version`` is now 5. - See `AMDHSA Code Object V5 Metadata `_ - for more details. X86 Support ^^^^^^^^^^^ -- Added option ``-m[no-]evex512`` to disable ZMM and 64-bit mask instructions - for AVX512 features. -- Support ISA of ``USER_MSR``. - * Support intrinsic of ``_urdmsr``. - * Support intrinsic of ``_uwrmsr``. -- Support ISA of ``AVX10.1``. -- ``-march=pantherlake`` and ``-march=clearwaterforest`` are now supported. -- Added ABI handling for ``__float128`` to match GCC. -- Emit warnings for options to enable KNL/KNM-specific ISAs: AVX512PF, AVX512ER - and PREFETCHWT1. From the next version (LLVM 19), support for these ISAs' intrinsics - will be deprecated: - * intrinsic series of *_exp2a23_* - * intrinsic series of *_rsqrt28_* - * intrinsic series of *_rcp28_* - * intrinsic series of *_prefetch_i[3|6][2|4]gather_* - * intrinsic series of *_prefetch_i[3|6][2|4]scatter_* - Arm and AArch64 Support ^^^^^^^^^^^^^^^^^^^^^^^ -- C++ function name mangling has been changed to align with the specification - (https://github.com/ARM-software/abi-aa/blob/main/aapcs64/aapcs64.rst). - This affects C++ functions with SVE ACLE parameters. Clang will use the old - manglings if ``-fclang-abi-compat=17`` or lower is specified. - -- New AArch64 asm constraints have been added for r8-r11(Uci) and r12-r15(Ucj). - -- Support has been added for the following processors (-mcpu identifiers in parentheses): - - For Arm: - - * Cortex-M52 (cortex-m52). - - For AArch64: - - * Cortex-A520 (cortex-a520). - * Cortex-A720 (cortex-a720). - * Cortex-X4 (cortex-x4).
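As a hedged sketch of the new USER_MSR intrinsics listed above: this assumes they are exposed through ``<immintrin.h>`` on targets with the feature enabled (e.g. via ``-march=clearwaterforest``), and the MSR number used here is made up for illustration:

.. code-block:: c++

  #include <immintrin.h>

  unsigned long long bump_msr() {
    unsigned long long v = _urdmsr(0x1b0ULL); // read a user-mode MSR (number is illustrative)
    _uwrmsr(0x1b0ULL, v + 1);                 // write an updated value back
    return v;
  }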
- Android Support ^^^^^^^^^^^^^^^ -- Android target triples are usually suffixed with a version. Clang searches for - target-specific runtime and standard libraries in directories named after the - target (e.g. if you're building with ``--target=aarch64-none-linux-android21``, - Clang will look for ``lib/aarch64-none-linux-android21`` under its resource - directory to find runtime libraries). If an exact match isn't found, Clang - would previously fall back to a directory without any version (which would be - ``lib/aarch64-none-linux-android`` in our example). Clang will now look for - directories for lower versions and use the newest version it finds instead, - e.g. if you have ``lib/aarch64-none-linux-android21`` and - ``lib/aarch64-none-linux-android29``, ``-target aarch64-none-linux-android23`` - will use the former and ``-target aarch64-none-linux-android30`` will use the - latter. Falling back to a versionless directory will now emit a warning, and - the fallback will be removed in Clang 19. - Windows Support ^^^^^^^^^^^^^^^ -- Fixed an assertion failure that occurred due to a failure to propagate - ``MSInheritanceAttr`` attributes to class template instantiations created - for explicit template instantiation declarations. - -- The ``-fno-auto-import`` option was added for MinGW targets. The option both - affects code generation (inhibiting generating indirection via ``.refptr`` - stubs for potentially auto imported symbols, generating smaller and more - efficient code) and linking (making the linker error out on such cases). - If the option only is used during code generation but not when linking, - linking may succeed but the resulting executables may expose issues at - runtime. LoongArch Support ^^^^^^^^^^^^^^^^^ -- Added builtins support for all LSX (128-bits SIMD) and LASX (256-bits SIMD) - instructions. -- Added builtins support for approximate calculation instructions that were - introduced in LoongArch Reference Manual V1.10. -- Made ``-mcmodel=`` compatible with LoongArch gcc that accepted ``normal``, - ``medium`` and ``extreme``. -- The ``model`` attribute was now supported for overriding the default code - model used to access global variables. The following values were supported: - ``normal``, ``medium`` and ``extreme``. - - *Example Code*: - - .. code-block:: c - - int var __attribute((model("extreme"))); - -- Default to ``-fno-direct-access-external-data`` for non-PIC. -- An ABI mismatch with gcc/g++ about empty structs/unions passing was fixed. -- ``_mcount`` was generated instead of ``mcount``. RISC-V Support ^^^^^^^^^^^^^^ -- Unaligned memory accesses can be toggled by ``-m[no-]unaligned-access`` or the - aliases ``-m[no-]strict-align``. -- CodeGen of RV32E/RV64E was supported experimentally. -- CodeGen of ilp32e/lp64e was supported experimentally. - -- Default ABI with F but without D was changed to ilp32f for RV32 and to lp64f - for RV64. CUDA/HIP Language Changes ^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -1233,18 +155,9 @@ CUDA/HIP Language Changes CUDA Support ^^^^^^^^^^^^ -- Clang now supports CUDA SDK up to 12.3 -- Added support for sm_90a - AIX Support ^^^^^^^^^^^ -- Introduced the ``-maix-small-local-exec-tls`` option to produce a faster - access sequence for local-exec TLS variables where the offset from the TLS - base is encoded as an immediate operand. - This access sequence is not used for TLS variables larger than 32KB, and is - currently only supported on 64-bit mode. 
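To make the AIX note concrete, here is a minimal sketch; the variable, function, and TLS-model flag are illustrative assumptions, and the 32KB limit is the one stated above:

.. code-block:: c++

   // A small local-exec TLS variable: its offset from the TLS base fits in an
   // immediate operand, so -maix-small-local-exec-tls can use the faster
   // access sequence. Variables larger than 32KB keep the default sequence.
   thread_local int Counter = 0;  // assumed local-exec (e.g. -ftls-model=local-exec)

   int bump() { return ++Counter; }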
- WebAssembly Support ^^^^^^^^^^^^^^^^^^^ @@ -1256,232 +169,36 @@ DWARF Support in Clang Floating Point Support in Clang ------------------------------- -- Add ``__builtin_elementwise_log`` builtin for floating point types only. -- Add ``__builtin_elementwise_log10`` builtin for floating point types only. -- Add ``__builtin_elementwise_log2`` builtin for floating point types only. -- Add ``__builtin_elementwise_exp`` builtin for floating point types only. -- Add ``__builtin_elementwise_exp2`` builtin for floating point types only. -- Add ``__builtin_set_flt_rounds`` builtin for X86, x86_64, Arm and AArch64 only. -- Add ``__builtin_elementwise_pow`` builtin for floating point types only. -- Add ``__builtin_elementwise_bitreverse`` builtin for integer types only. -- Add ``__builtin_elementwise_sqrt`` builtin for floating point types only. -- ``__builtin_isfpclass`` builtin now supports vector types. -- ``#pragma float_control(precise,on)`` enables precise floating-point - semantics. If ``math-errno`` is disabled in the current TU, clang will - re-enable ``math-errno`` in the presense of - ``#pragma float_control(precise,on)``. -- Add ``__builtin_exp10``, ``__builtin_exp10f``, - ``__builtin_exp10f16``, ``__builtin_exp10l`` and - ``__builtin_exp10f128`` builtins. -- Add ``__builtin_iszero``, ``__builtin_issignaling`` and - ``__builtin_issubnormal``. -- Add support for C99's ``#pragma STDC CX_LIMITED_RANGE`` feature. This - enables the naive mathematical formulas for complex multiplication and - division, which are faster but do not correctly handle overflow and infinities. AST Matchers ------------ -- Add ``convertVectorExpr``. -- Add ``dependentSizedExtVectorType``. -- Add ``macroQualifiedType``. -- Add ``CXXFoldExpr`` related matchers: ``cxxFoldExpr``, ``callee``, - ``hasInit``, ``hasPattern``, ``isRightFold``, ``isLeftFold``, - ``isUnaryFold``, ``isBinaryFold``, ``hasOperator``, ``hasLHS``, ``hasRHS``, ``hasEitherOperand``. clang-format ------------ -- Add ``AllowBreakBeforeNoexceptSpecifier`` option. -- Add ``AllowShortCompoundRequirementOnASingleLine`` option. -- Change ``BreakAfterAttributes`` from ``Never`` to ``Leave`` in LLVM style. -- Add ``BreakAdjacentStringLiterals`` option. -- Add ``ObjCPropertyAttributeOrder`` which can be used to sort ObjC property - attributes (like ``nonatomic, strong, nullable``). -- Add ``PenaltyBreakScopeResolution`` option. -- Add ``.clang-format-ignore`` files. -- Add ``AlignFunctionPointers`` sub-option for ``AlignConsecutiveDeclarations``. -- Add ``SkipMacroDefinitionBody`` option. libclang -------- -- Exposed arguments of ``clang::annotate``. -- ``clang::getCursorKindForDecl`` now recognizes linkage specifications such as - ``extern "C"`` and reports them as ``CXCursor_LinkageSpec``. - Static Analyzer --------------- New features ^^^^^^^^^^^^ -- Implemented the ``[[clang::suppress]]`` attribute for suppressing diagnostics - of static analysis tools, such as the Clang Static Analyzer. - `Documentation `__. - -- Added support for the ``cleanup`` attribute. - `Documentation `__. - -- Support "Deducing this" (P0847R7). (Worked out of the box) - (`af4751738db8 `__) - -- Added a new checker ``core.BitwiseShift`` which reports situations where - bitwise shift operators produce undefined behavior (because some operand is - negative or too large). - `Documentation `__. - -- Added a new experimental checker ``alpha.core.StdVariant`` to detect variant - accesses via wrong alternatives. - `Documentation `__. 
- (`#66481 `_) - -- Added a new experimental checker ``alpha.cplusplus.ArrayDelete`` to detect - destructions of arrays of polymorphic objects that are destructed as their - base class (`CERT EXP51-CPP `_). - `Documentation `__. - (`0e246bb67573 `_) - -- Added a new checker configuration option ``InvalidatingGetEnv=[true,false]`` to - ``security.cert.env.InvalidPtr``. It's not set by default. - If set, ``getenv`` calls won't invalidate previously returned pointers. - `Documentation `__. - (`#67663 `_) - Crash and bug fixes ^^^^^^^^^^^^^^^^^^^ -- Fixed a crash caused by ``__builtin_bit_cast``. - (`#69922 `_) - -- Fixed a ``core.StackAddressEscape`` crash on temporary object fields. - (`#66221 `_) - -- A few crashes have been found and fixed using randomized testing related - to the use of ``_BitInt()`` in tidy checks and in clang analysis. - (`#67212 `_, - `#66782 `_, - `#65889 `_, - `#65888 `_, - `#65887 `_) - -- Fixed note links of the HTML output. - (`#64054 `_) - -- Allow widening of range-based for loops. - (`#70190 `_) - -- Fixed uninitialized base class with initializer list when the constructor is not - declared in the base class. - (`#70464 `_, - `#59493 `_, - `#54533 `_) - -- Fixed an ``alpha.unix.cstring`` crash on variadic functions. - (`#74269 `_) - -- Fixed a false positive in the mutation check when using a pointer to member function. - (`#66204 `_) - Improvements ^^^^^^^^^^^^ -- Improved the ``unix.StdCLibraryFunctions`` checker by modeling more - functions, such as ``send``, ``recv``, ``readlink``, ``fflush``, ``mkdtemp`` - and ``getcwd``, including their ``errno`` behavior. - (`52ac71f92d38 `_, - `#77040 `_, - `#76671 `_, - `#71373 `_, - `#76557 `_, - `#71392 `_) - -- Fixed a false negative when accessing a nonnull property (ObjC). - (`1dceba3a3684 `_) - -- ``security.insecureAPI.DeprecatedOrUnsafeBufferHandling`` now considers - ``fprintf`` calls unsafe. - `Documentation `__. - -- Improved the diagnostics of the ``optin.core.EnumCastOutOfRange`` checker. - It will display the name and the declaration of the enumeration along with - the concrete value being cast to the enum. - (`#74503 `_) - -- Improved the ``alpha.security.ArrayBoundV2`` checker's detection of buffer - accesses before the beginning of the buffer, and also reworked the diagnostic messages. - (`3e014038b373 `_, - `#70056 `_, - `#72107 `_) - -- Improved the ``alpha.unix.cstring.OutOfBounds`` checker to check both ends of the - buffers in more cases. - (`c3a87ddad62a `_, - `0954dc3fb921 `_) - -- Improved the ``alpha.unix.Stream`` checker by modeling more functions - (``fputs``, ``fputc``, ``fgets``, ``fgetc``, ``fdopen``, ``ungetc``, ``fflush``, - ``getdelim``, ``getline``) and by no longer recognizing alternative - ``fopen`` and ``tmpfile`` implementations. - (`#78693 `_, - `#76776 `_, - `#74296 `_, - `#73335 `_, - `#72627 `_, - `#71518 `_, - `#72016 `_, - `#70540 `_, - `#73638 `_, - `#77331 `_) - -- The ``alpha.security.taint.TaintPropagation`` checker no longer propagates - taint on ``strlen`` and ``strnlen`` calls, unless these are explicitly marked - as propagators in the user-provided taint configuration file. - This removal empirically reduces the number of false-positive reports. - Read the PR for the details. - (`#66086 `_) - -- Other taint-related improvements. - (`#66358 `_, - `#66074 `_, - `#66358 `_) - -- Checkers can query constraint bounds to improve diagnostic messages. - (`#74141 `_) - -- Improved the generated initializers for modules. Now the calls to initialize - functions from imported module units can be omitted if the initialize - function is known to be empty.
- (`#56794 `_) - -- Clang now allow to export declarations within linkage-specification. - (`#71347 `_) - Moved checkers ^^^^^^^^^^^^^^ -- Move checker ``alpha.unix.Errno`` out of the ``alpha`` package - to ``unix.Errno``. - `Documentation `__. - -- Move checker ``alpha.unix.StdCLibraryFunctions`` out of the ``alpha`` package - to ``unix.StdCLibraryFunctions``. - `Documentation `__. - -- Move checker ``alpha.security.cert.env.InvalidPtr`` out of the ``alpha`` - package to ``security.cert.env.InvalidPtr``. - `Documentation `__. - -- Move checker ``alpha.cplusplus.EnumCastOutOfRange`` out of the ``alpha`` - package to ``optin.core.EnumCastOutOfRange``. - `Documentation `__. - .. _release-notes-sanitizers: Sanitizers ---------- -- ``-fsanitize=signed-integer-overflow`` now instruments ``__builtin_abs`` and - ``abs`` builtins. - Python Binding Changes ---------------------- diff --git a/libcxx/include/__config b/libcxx/include/__config index 9557e8e8cf97f..a7de4ca7d5f61 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -62,7 +62,7 @@ // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM. // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 17.0.1 == 17.00.01), _LIBCPP_VERSION is // defined to XXYYZZ. -# define _LIBCPP_VERSION 180000 +# define _LIBCPP_VERSION 190000 # define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y # define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y) diff --git a/lld/docs/ReleaseNotes.rst b/lld/docs/ReleaseNotes.rst index 01669543cd50c..6f60efd87c975 100644 --- a/lld/docs/ReleaseNotes.rst +++ b/lld/docs/ReleaseNotes.rst @@ -26,21 +26,12 @@ Non-comprehensive list of changes in this release ELF Improvements ---------------- -* ``--fat-lto-objects`` option is added to support LLVM FatLTO. - Without ``--fat-lto-objects``, LLD will link LLVM FatLTO objects using the - relocatable object file. (`D146778 `_) -* common-page-size can now be larger than the system page-size. - (`#57618 `_) - Breaking changes ---------------- COFF Improvements ----------------- -* Added support for ``--time-trace`` and associated ``--time-trace-granularity``. - This generates a .json profile trace of the linker execution. - MinGW Improvements ------------------ @@ -50,9 +41,5 @@ MachO Improvements WebAssembly Improvements ------------------------ -* Indexes are no longer required on archive files. Instead symbol information - is read from object files within the archive. This matches the behaviour of - the ELF linker. - Fixes ##### diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 1d230004e6c34..485c76b8bb936 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -16,7 +16,7 @@ if(NOT LLVM_NO_INSTALL_NAME_DIR_FOR_BUILD_TREE) endif() if(NOT DEFINED LLVM_VERSION_MAJOR) - set(LLVM_VERSION_MAJOR 18) + set(LLVM_VERSION_MAJOR 19) endif() if(NOT DEFINED LLVM_VERSION_MINOR) set(LLVM_VERSION_MINOR 0) diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 7b6a3f10d6377..300c9687e8c00 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -50,79 +50,27 @@ Update on required toolchains to build LLVM Changes to the LLVM IR ---------------------- -* The `llvm.stacksave` and `llvm.stackrestore` intrinsics now use - an overloaded pointer type to support non-0 address spaces. 
-* The constant expression variants of the following instructions have been - removed: - - * ``and`` - * ``or`` - * ``lshr`` - * ``ashr`` - * ``zext`` - * ``sext`` - * ``fptrunc`` - * ``fpext`` - * ``fptoui`` - * ``fptosi`` - * ``uitofp`` - * ``sitofp`` - -* Added `llvm.exp10` intrinsic. - -* Added a ``code_model`` attribute for the `global variable `_. - Changes to LLVM infrastructure ------------------------------ -* The minimum Clang version to build LLVM in the C++20 configuration has been updated to clang-17.0.6. - Changes to building LLVM ------------------------ Changes to TableGen ------------------- -* Added constructs for debugging TableGen files: - - * `dump` keyword to dump messages to standard error, see - https://github.com/llvm/llvm-project/pull/68793. - * `!repr` bang operator to inspect the content of values, see - https://github.com/llvm/llvm-project/pull/68716. - Changes to Interprocedural Optimizations ---------------------------------------- Changes to the AArch64 Backend ------------------------------ -* Added support for Cortex-A520, Cortex-A720 and Cortex-X4 CPUs. - -* Neoverse-N2 was incorrectly marked as an Armv8.5a core. This has been - changed to an Armv9.0a core. However, crypto options are not enabled - by default for Armv9 cores, so `-mcpu=neoverse-n2+crypto` is now required - to enable crypto for this core. As far as the compiler is concerned, - Armv9.0a has the same features enabled as Armv8.5a, with the exception - of crypto. - Changes to the AMDGPU Backend ----------------------------- -* `llvm.sqrt.f32` is now lowered correctly. Use `llvm.amdgcn.sqrt.f32` - for raw instruction access. - -* Implemented `llvm.stacksave` and `llvm.stackrestore` intrinsics. - -* Implemented :ref:`llvm.get.rounding ` - -* The default :ref:`AMDHSA code object version ` is now 5. - Changes to the ARM Backend -------------------------- -* Added support for Cortex-M52 CPUs. -* Added execute-only support for Armv6-M. - Changes to the AVR Backend -------------------------- @@ -135,21 +83,6 @@ Changes to the Hexagon Backend Changes to the LoongArch Backend -------------------------------- -* Added intrinsics support for all LSX (128-bit SIMD) and LASX (256-bit SIMD) - instructions. -* Added definition and intrinsics support for new instructions that were - introduced in LoongArch Reference Manual V1.10. -* Emitted adjacent ``pcaddu18i+jirl`` instruction sequence with one relocation - ``R_LARCH_CALL36`` instead of ``pcalau12i+jirl`` with two relocations - ``R_LARCH_PCALA_{HI20,LO12}`` for function calls in the medium code model. -* The code model of global variables can now be overridden by means of the newly - added LLVM IR attribute, ``code_model``. -* Added support for the ``llvm.is.fpclass`` intrinsic. -* ``mulodi4`` and ``muloti4`` libcalls were disabled due to their absence in libgcc. -* Added initial support for auto vectorization. -* Added initial support for linker relaxation. -* Assorted codegen improvements. - Changes to the MIPS Backend --------------------------- @@ -159,146 +92,29 @@ Changes to the PowerPC Backend Changes to the RISC-V Backend ----------------------------- -* The Zfa extension version was upgraded to 1.0 and is no longer experimental. -* The Zihintntl extension version was upgraded to 1.0 and is no longer experimental. -* Intrinsics were added for Zk*, Zbb, and Zbc.
See https://github.com/riscv-non-isa/riscv-c-api-doc/blob/master/riscv-c-api.md#scalar-bit-manipulation-extension-intrinsics -* Default ABI with F but without D was changed to ilp32f for RV32 and to lp64f for RV64. -* The Zvbb, Zvbc, Zvkb, Zvkg, Zvkn, Zvknc, Zvkned, Zvkng, Zvknha, Zvknhb, Zvks, - Zvksc, Zvksed, Zvksg, Zvksh, and Zvkt extension version was upgraded to 1.0 - and is no longer experimental. However, the C intrinsics for these extensions - are still experimental. To use the C intrinsics for these extensions, - ``-menable-experimental-extensions`` needs to be passed to Clang. -* XSfcie extension and SiFive CSRs and instructions that were associated with - it have been removed. None of these CSRs and instructions were part of - "SiFive Custom Instruction Extension" as SiFive defines it. The LLVM project - needs to work with SiFive to define and document real extension names for - individual CSRs and instructions. -* ``-mcpu=sifive-p450`` was added. -* CodeGen of RV32E/RV64E was supported experimentally. -* CodeGen of ilp32e/lp64e was supported experimentally. -* Support was added for the Ziccif, Ziccrse, Ziccamoa, Zicclsm, Za64rs, Za128rs - and Zic64b extensions which were introduced as a part of the RISC-V Profiles - specification. -* The Smepmp 1.0 extension is now supported. -* ``-mcpu=sifive-p670`` was added. - Changes to the WebAssembly Backend ---------------------------------- Changes to the Windows Target ----------------------------- -* The LLVM filesystem class ``UniqueID`` and function ``equivalent()`` - no longer determine that distinct different path names for the same - hard linked file actually are equal. This is an intentional tradeoff in a - bug fix, where the bug used to cause distinct files to be considered - equivalent on some file systems. This change fixed the issues - https://github.com/llvm/llvm-project/issues/61401 and - https://github.com/llvm/llvm-project/issues/22079. - Changes to the X86 Backend -------------------------- -* The ``i128`` type now matches GCC and clang's ``__int128`` type. This mainly - benefits external projects such as Rust which aim to be binary compatible - with C, but also fixes code generation where LLVM already assumed that the - type matched and called into libgcc helper functions. -* Support ISA of ``USER_MSR``. -* Support ISA of ``AVX10.1-256`` and ``AVX10.1-512``. -* ``-mcpu=pantherlake`` and ``-mcpu=clearwaterforest`` are now supported. -* ``-mapxf`` is supported. -* Marking global variables with ``code_model = "small"/"large"`` in the IR now - overrides the global code model to allow 32-bit relocations or require 64-bit - relocations to the global variable. -* The medium code model's code generation was audited to be more similar to the - small code model where possible. - Changes to the OCaml bindings ----------------------------- Changes to the Python bindings ------------------------------ -* The python bindings have been removed. - - Changes to the C API -------------------- -* Added ``LLVMGetTailCallKind`` and ``LLVMSetTailCallKind`` to - allow getting and setting ``tail``, ``musttail``, and ``notail`` - attributes on call instructions. -* The following functions for creating constant expressions have been removed, - because the underlying constant expressions are no longer supported. 
Instead, - an instruction should be created using the ``LLVMBuildXYZ`` APIs, which will - constant fold the operands if possible and create an instruction otherwise: - - * ``LLVMConstAnd`` - * ``LLVMConstOr`` - * ``LLVMConstLShr`` - * ``LLVMConstAShr`` - * ``LLVMConstZExt`` - * ``LLVMConstSExt`` - * ``LLVMConstZExtOrBitCast`` - * ``LLVMConstSExtOrBitCast`` - * ``LLVMConstIntCast`` - * ``LLVMConstFPTrunc`` - * ``LLVMConstFPExt`` - * ``LLVMConstFPToUI`` - * ``LLVMConstFPToSI`` - * ``LLVMConstUIToFP`` - * ``LLVMConstSIToFP`` - * ``LLVMConstFPCast`` - -* Added ``LLVMCreateTargetMachineWithOptions``, along with helper functions for - an opaque option structure, as an alternative to ``LLVMCreateTargetMachine``. - The option structure exposes an additional setting (i.e., the target ABI) and - provides default values for unspecified settings. - -* Added ``LLVMGetNNeg`` and ``LLVMSetNNeg`` for getting/setting the new nneg flag - on zext instructions, and ``LLVMGetIsDisjoint`` and ``LLVMSetIsDisjoint`` - for getting/setting the new disjoint flag on or instructions. - -* Added the following functions for manipulating operand bundles, as well as - building ``call`` and ``invoke`` instructions that use operand bundles: - - * ``LLVMBuildCallWithOperandBundles`` - * ``LLVMBuildInvokeWithOperandBundles`` - * ``LLVMCreateOperandBundle`` - * ``LLVMDisposeOperandBundle`` - * ``LLVMGetNumOperandBundles`` - * ``LLVMGetOperandBundleAtIndex`` - * ``LLVMGetNumOperandBundleArgs`` - * ``LLVMGetOperandBundleArgAtIndex`` - * ``LLVMGetOperandBundleTag`` - -* Added ``LLVMGetFastMathFlags`` and ``LLVMSetFastMathFlags`` for getting/setting - the fast-math flags of an instruction, as well as ``LLVMCanValueUseFastMathFlags`` - for checking if an instruction can use such flags - Changes to the CodeGen infrastructure ------------------------------------- -* A new debug type ``isel-dump`` is added to show only the SelectionDAG dumps - after each ISel phase (i.e. ``-debug-only=isel-dump``). This new debug type - can be filtered by function names using ``-filter-print-funcs=``, - the same flag used to filter IR dumps after each Pass. Note that the existing - ``-debug-only=isel`` will take precedence over the new behavior and - print SelectionDAG dumps of every single function regardless of - ``-filter-print-funcs``'s values. - -* ``PrologEpilogInserter`` no longer supports register scavenging - during forwards frame index elimination. Targets should use - backwards frame index elimination instead. - -* ``RegScavenger`` no longer supports forwards register - scavenging. Clients should use backwards register scavenging - instead, which is preferred because it does not depend on accurate - kill flags. - Changes to the Metadata Info --------------------------------- -* Added a new loop metadata `!{!"llvm.loop.align", i32 64}` Changes to the Debug Info --------------------------------- @@ -306,82 +122,16 @@ Changes to the Debug Info Changes to the LLVM tools --------------------------------- -* llvm-symbolizer now treats invalid input as an address for which source - information is not found. -* llvm-readelf now supports ``--extra-sym-info`` (``-X``) to display extra - information (section name) when showing symbols. - -* ``llvm-nm`` now supports the ``--line-numbers`` (``-l``) option to use - debugging information to print symbols' filenames and line numbers. - -* llvm-symbolizer and llvm-addr2line now support addresses specified as symbol names. 
- -* llvm-objcopy now supports ``--gap-fill`` and ``--pad-to`` options, for - ELF input and binary output files only. - Changes to LLDB --------------------------------- -* ``SBWatchpoint::GetHardwareIndex`` is deprecated and now returns -1 - to indicate the index is unavailable. -* Methods in SBHostOS related to threads have had their implementations - removed. These methods will return a value indicating failure. -* The ``SBType::FindDirectNestedType`` function was added. It's useful - for formatters to quickly find a directly nested type when it's known - where to search for it, avoiding the more expensive global search via - ``SBTarget::FindFirstType``. -* ``lldb-vscode`` was renamed to ``lldb-dap`` and its installation - instructions have been updated to reflect this. The underlying functionality - remains unchanged. -* The ``mte_ctrl`` register can now be read from AArch64 Linux core files. -* LLDB on AArch64 Linux now supports debugging the Scalable Matrix Extension - (SME) and Scalable Matrix Extension 2 (SME2) for both live processes and core - files. For details refer to the - `AArch64 Linux documentation `_. -* LLDB now supports automatic symbol and binary acquisition using the - DEBUGINFOD protocol. The standard mechanism of specifying DEBUGINFOD servers in - the ``DEBUGINFOD_URLS`` environment variable is used by default. In addition, - users can specify servers to request symbols from using the LLDB setting - ``plugin.symbol-locator.debuginfod.server_urls``, overriding or adding to the - environment variable. - - -* When running on AArch64 Linux, ``lldb-server`` now provides register - field information for the following registers: ``cpsr``, ``fpcr``, - ``fpsr``, ``svcr`` and ``mte_ctrl``. :: - - (lldb) register read cpsr - cpsr = 0x80001000 - = (N = 1, Z = 0, C = 0, V = 0, SS = 0, IL = 0, <...> - - This is only available when ``lldb`` is built with XML support. - Where possible the CPU's capabilities are used to decide which - fields are present; however, this is not always possible or entirely - accurate. If in doubt, refer to the numerical value. - Changes to Sanitizers --------------------- -* HWASan now defaults to detecting use-after-scope bugs. Other Changes ------------- -* The ``Flags`` field of ``llvm::opt::Option`` has been split into ``Flags`` - and ``Visibility`` to simplify option sharing between various drivers (such - as ``clang``, ``clang-cl``, or ``flang``) that rely on Clang's Options.td. - Overloads of ``llvm::opt::OptTable`` that use ``FlagsToInclude`` have been - deprecated. There is a script and instructions on how to resolve conflicts - - see https://reviews.llvm.org/D157150 and https://reviews.llvm.org/D157151 for - details. - -* On Linux, FreeBSD, and NetBSD, setting the environment variable - ``LLVM_ENABLE_SYMBOLIZER_MARKUP`` causes tools to print stacktraces using - :doc:`Symbolizer Markup `. - This works even if the tools have no embedded symbol information (i.e. are - fully stripped); :doc:`llvm-symbolizer ` can - symbolize the markup afterwards using ``debuginfod``. - -External Open Source Projects Using LLVM 15 +External Open Source Projects Using LLVM 19 =========================================== * A project...
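Stepping back to the operand-bundle helpers added to the C API earlier in these notes, a hedged sketch of how they compose; the function is hypothetical, and the signatures reflect my reading of ``llvm-c/Core.h``, so treat them as assumptions rather than a reference:

.. code-block:: c++

   #include "llvm-c/Core.h"

   // Build a call carrying a single "deopt" operand bundle (signatures assumed).
   LLVMValueRef emitCallWithDeoptBundle(LLVMBuilderRef B, LLVMTypeRef FnTy,
                                        LLVMValueRef Fn, LLVMValueRef State) {
     LLVMOperandBundleRef Bundle =
         LLVMCreateOperandBundle("deopt", /*TagLen=*/5, &State, /*NumArgs=*/1);
     LLVMValueRef Call = LLVMBuildCallWithOperandBundles(
         B, FnTy, Fn, /*Args=*/nullptr, /*NumArgs=*/0, &Bundle,
         /*NumBundles=*/1, "call");
     LLVMDisposeOperandBundle(Bundle);  // the built instruction keeps its own copy
     return Call;
   }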
diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni index f113759176612..7c02ed396db5f 100644 --- a/llvm/utils/gn/secondary/llvm/version.gni +++ b/llvm/utils/gn/secondary/llvm/version.gni @@ -1,4 +1,4 @@ -llvm_version_major = 18 +llvm_version_major = 19 llvm_version_minor = 0 llvm_version_patch = 0 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch" diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py index 1eea0887f1d11..a5a1ff66bf417 100644 --- a/llvm/utils/lit/lit/__init__.py +++ b/llvm/utils/lit/lit/__init__.py @@ -2,7 +2,7 @@ __author__ = "Daniel Dunbar" __email__ = "daniel@minormatter.com" -__versioninfo__ = (18, 0, 0) +__versioninfo__ = (19, 0, 0) __version__ = ".".join(str(v) for v in __versioninfo__) + "dev" __all__ = [] diff --git a/openmp/docs/ReleaseNotes.rst b/openmp/docs/ReleaseNotes.rst index 3eeaf5c900d80..df944c3be68de 100644 --- a/openmp/docs/ReleaseNotes.rst +++ b/openmp/docs/ReleaseNotes.rst @@ -1,10 +1,10 @@ =========================== -OpenMP 17.0.0 Release Notes +OpenMP 19.0.0 Release Notes =========================== .. warning:: - These are in-progress notes for the upcoming LLVM 17.0.0 release. + These are in-progress notes for the upcoming LLVM 19.0.0 release. Release notes for previous releases can be found on `the Download Page `_. @@ -12,7 +12,7 @@ OpenMP 17.0.0 Release Notes Introduction ============ -This document contains the release notes for the OpenMP runtime, release 17.0.0. +This document contains the release notes for the OpenMP runtime, release 19.0.0. Here we describe the status of OpenMP, including major improvements from the previous release. All OpenMP releases may be downloaded from the `LLVM releases web site `_. diff --git a/pstl/docs/ReleaseNotes.rst b/pstl/docs/ReleaseNotes.rst index 61bee0848536b..342b2eafc2f51 100644 --- a/pstl/docs/ReleaseNotes.rst +++ b/pstl/docs/ReleaseNotes.rst @@ -1,5 +1,5 @@ ======================================= -PSTL 18.0.0 (In-Progress) Release Notes +PSTL 19.0.0 (In-Progress) Release Notes ======================================= .. contents:: @@ -10,7 +10,7 @@ Written by the `PSTL Team `_ .. warning:: - These are in-progress notes for the upcoming pstl 17 release. + These are in-progress notes for the upcoming pstl 19 release. Release notes for previous releases can be found on `the Download Page `_. @@ -30,7 +30,7 @@ web page, this document applies to the *next* release, not the current one. To see the release notes for a specific release, please see the `releases page `_. -What's New in PSTL 18.0.0? +What's New in PSTL 19.0.0? 
========================== New Features From baba7e4175b6ca21e83b1cf8229f29dbba02e979 Mon Sep 17 00:00:00 2001 From: Weining Lu Date: Wed, 24 Jan 2024 11:03:14 +0800 Subject: [PATCH 730/843] [test] Update dwarf-loongarch-relocs.ll Address buildbot faiures: http://45.33.8.238/macm1/77360/step_11.txt http://45.33.8.238/linux/128902/step_12.txt --- .../LoongArch/dwarf-loongarch-relocs.ll | 37 ++++++++++++------- 1 file changed, 23 insertions(+), 14 deletions(-) diff --git a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll index e03b4c1d34de1..07443a62b9339 100644 --- a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll +++ b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll @@ -1,19 +1,22 @@ ; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=-relax %s -o %t.o ; RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-NORL %s -; RUN: llvm-objdump --source %t.o | FileCheck --check-prefix=SOURCE %s -; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefix=DWARF %s +; RUN: llvm-objdump --source %t.o | FileCheck --check-prefixes=SOURCE,SOURCE-NORL %s +; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefixes=DWARF,DWARF-NORL %s ; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=+relax %s -o %t.r.o ; RUN: llvm-readobj -r %t.r.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-ENRL %s -; RUN: llvm-objdump --source %t.r.o | FileCheck --check-prefix=SOURCE %s -; RUN: llvm-dwarfdump --debug-info --debug-line %t.r.o | FileCheck --check-prefix=DWARF %s +; RUN: llvm-objdump --source %t.r.o | FileCheck --check-prefixes=SOURCE,SOURCE-ENRL %s +; RUN: llvm-dwarfdump --debug-info --debug-line %t.r.o | FileCheck --check-prefixes=DWARF,DWARF-ENRL %s ; RELOCS-BOTH: Relocations [ ; RELOCS-BOTH-NEXT: Section ({{.*}}) .rela.text { -; RELOCS-BOTH-NEXT: 0x14 R_LARCH_PCALA_HI20 sym 0x0 -; RELOCS-ENRL-NEXT: 0x14 R_LARCH_RELAX - 0x0 -; RELOCS-BOTH-NEXT: 0x18 R_LARCH_PCALA_LO12 sym 0x0 -; RELOCS-ENRL-NEXT: 0x18 R_LARCH_RELAX - 0x0 +; RELOCS-NORL-NEXT: 0x14 R_LARCH_PCALA_HI20 sym 0x0 +; RELOCS-NORL-NEXT: 0x18 R_LARCH_PCALA_LO12 sym 0x0 +; RELOCS-ENRL-NEXT: 0x0 R_LARCH_ALIGN .Lla-relax-align0 0x5 +; RELOCS-ENRL-NEXT: 0x30 R_LARCH_PCALA_HI20 sym 0x0 +; RELOCS-ENRL-NEXT: 0x30 R_LARCH_RELAX - 0x0 +; RELOCS-ENRL-NEXT: 0x34 R_LARCH_PCALA_LO12 sym 0x0 +; RELOCS-ENRL-NEXT: 0x34 R_LARCH_RELAX - 0x0 ; RELOCS-BOTH-NEXT: } ; RELOCS-BOTH: Section ({{.*}}) .rela.debug_frame { ; RELOCS-NORL-NEXT: 0x1C R_LARCH_32 .debug_frame 0x0 @@ -36,7 +39,8 @@ ; RELOCS-BOTH-NEXT: } ; RELOCS-BOTH-NEXT: ] -; SOURCE: 0000000000000000 : +; SOURCE-NORL: 0000000000000000 : +; SOURCE-ENRL: 000000000000001c : ; SOURCE: ; { ; SOURCE: ; asm volatile( ; SOURCE: ; return 0; @@ -87,11 +91,16 @@ ; DWARF-EMPTY: ; DWARF-NEXT: Address Line Column File ISA Discriminator OpIndex Flags ; DWARF-NEXT: ------------------ ------ ------ ------ --- ------------- ------- ------------- -; DWARF-NEXT: 0x0000000000000000 2 0 0 0 0 0 is_stmt -; DWARF-NEXT: 0x0000000000000010 3 3 0 0 0 0 is_stmt prologue_end -; DWARF-NEXT: 0x0000000000000020 10 3 0 0 0 0 is_stmt -; DWARF-NEXT: 0x000000000000002c 10 3 0 0 0 0 epilogue_begin -; DWARF-NEXT: 0x0000000000000034 10 3 0 0 0 0 end_sequence +; DWARF-NORL-NEXT: 0x0000000000000000 2 0 0 0 0 0 is_stmt +; DWARF-NORL-NEXT: 0x0000000000000010 3 3 0 0 0 0 is_stmt prologue_end +; DWARF-NORL-NEXT: 0x0000000000000020 10 3 0 0 0 0 is_stmt +; DWARF-NORL-NEXT: 0x000000000000002c 10 3 0 0 0 0 epilogue_begin 
+; DWARF-NORL-NEXT: 0x0000000000000034 10 3 0 0 0 0 end_sequence +; DWARF-ENRL-NEXT: 0x000000000000001c 2 0 0 0 0 0 is_stmt +; DWARF-ENRL-NEXT: 0x000000000000002c 3 3 0 0 0 0 is_stmt prologue_end +; DWARF-ENRL-NEXT: 0x000000000000003c 10 3 0 0 0 0 is_stmt +; DWARF-ENRL-NEXT: 0x0000000000000048 10 3 0 0 0 0 epilogue_begin +; DWARF-ENRL-NEXT: 0x0000000000000050 10 3 0 0 0 0 end_sequence ; ModuleID = 'dwarf-loongarch-relocs.c' source_filename = "dwarf-loongarch-relocs.c" From 7251243315ef66f9b3f32e6f8e9536f701aa0d0a Mon Sep 17 00:00:00 2001 From: paperchalice Date: Wed, 24 Jan 2024 11:29:18 +0800 Subject: [PATCH 731/843] [CodeGen][Passes] Move `CodeGenPassBuilder.h` to Passes (#79242) `CodeGenPassBuilder` is not very tightly coupled to CodeGen, it may need to reference some method in pass builder in future, so move `CodeGenPassBuilder.h` to Passes. --- .../llvm/{CodeGen => Passes}/CodeGenPassBuilder.h | 10 +++++----- .../llvm/{CodeGen => Passes}/MachinePassRegistry.def | 0 llvm/include/module.modulemap | 3 --- llvm/lib/CodeGen/CMakeLists.txt | 1 - llvm/lib/Passes/CMakeLists.txt | 1 + llvm/lib/{CodeGen => Passes}/CodeGenPassBuilder.cpp | 6 +++--- llvm/lib/Passes/PassBuilder.cpp | 12 ++++++------ llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp | 2 +- llvm/tools/llc/NewPMDriver.cpp | 2 +- 9 files changed, 17 insertions(+), 20 deletions(-) rename llvm/include/llvm/{CodeGen => Passes}/CodeGenPassBuilder.h (99%) rename llvm/include/llvm/{CodeGen => Passes}/MachinePassRegistry.def (100%) rename llvm/lib/{CodeGen => Passes}/CodeGenPassBuilder.cpp (87%) diff --git a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h similarity index 99% rename from llvm/include/llvm/CodeGen/CodeGenPassBuilder.h rename to llvm/include/llvm/Passes/CodeGenPassBuilder.h index 96d6f891af4e2..07afac3bcf840 100644 --- a/llvm/include/llvm/CodeGen/CodeGenPassBuilder.h +++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h @@ -12,8 +12,8 @@ /// //===----------------------------------------------------------------------===// -#ifndef LLVM_CODEGEN_CODEGENPASSBUILDER_H -#define LLVM_CODEGEN_CODEGENPASSBUILDER_H +#ifndef LLVM_PASSES_CODEGENPASSBUILDER_H +#define LLVM_PASSES_CODEGENPASSBUILDER_H #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -116,7 +116,7 @@ namespace llvm { } \ static AnalysisKey Key; \ }; -#include "llvm/CodeGen/MachinePassRegistry.def" +#include "llvm/Passes/MachinePassRegistry.def" /// This class provides access to building LLVM's passes. 
/// @@ -676,7 +676,7 @@ CodeGenPassBuilder::getPassNameFromLegacyName(StringRef Name) const { #define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ if (Name == NAME) \ Ret = {#PASS_NAME, true}; -#include "llvm/CodeGen/MachinePassRegistry.def" +#include "llvm/Passes/MachinePassRegistry.def" if (Ret.first.empty()) Ret = derived().getTargetPassNameFromLegacyName(Name); @@ -1235,4 +1235,4 @@ void CodeGenPassBuilder::addBlockPlacement( } // namespace llvm -#endif // LLVM_CODEGEN_CODEGENPASSBUILDER_H +#endif // LLVM_PASSES_CODEGENPASSBUILDER_H diff --git a/llvm/include/llvm/CodeGen/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def similarity index 100% rename from llvm/include/llvm/CodeGen/MachinePassRegistry.def rename to llvm/include/llvm/Passes/MachinePassRegistry.def diff --git a/llvm/include/module.modulemap b/llvm/include/module.modulemap index 8930fa8c087cf..b19929fb9adfd 100644 --- a/llvm/include/module.modulemap +++ b/llvm/include/module.modulemap @@ -46,11 +46,8 @@ module LLVM_Backend { exclude header "llvm/CodeGen/LinkAllAsmWriterComponents.h" exclude header "llvm/CodeGen/LinkAllCodegenComponents.h" - exclude header "llvm/CodeGen/CodeGenPassBuilder.h" - // These are intended for (repeated) textual inclusion. textual header "llvm/CodeGen/DIEValue.def" - textual header "llvm/CodeGen/MachinePassRegistry.def" } } diff --git a/llvm/lib/CodeGen/CMakeLists.txt b/llvm/lib/CodeGen/CMakeLists.txt index df2d1831ee5fd..28c32116b2ba5 100644 --- a/llvm/lib/CodeGen/CMakeLists.txt +++ b/llvm/lib/CodeGen/CMakeLists.txt @@ -56,7 +56,6 @@ add_llvm_component_library(LLVMCodeGen CFIInstrInserter.cpp CodeGen.cpp CodeGenCommonISel.cpp - CodeGenPassBuilder.cpp CodeGenPrepare.cpp CommandFlags.cpp ComplexDeinterleavingPass.cpp diff --git a/llvm/lib/Passes/CMakeLists.txt b/llvm/lib/Passes/CMakeLists.txt index 98d2de76c0e11..6425f4934b210 100644 --- a/llvm/lib/Passes/CMakeLists.txt +++ b/llvm/lib/Passes/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_component_library(LLVMPasses + CodeGenPassBuilder.cpp OptimizationLevel.cpp PassBuilder.cpp PassBuilderBindings.cpp diff --git a/llvm/lib/CodeGen/CodeGenPassBuilder.cpp b/llvm/lib/Passes/CodeGenPassBuilder.cpp similarity index 87% rename from llvm/lib/CodeGen/CodeGenPassBuilder.cpp rename to llvm/lib/Passes/CodeGenPassBuilder.cpp index 82945528e7680..c0319e5d6e404 100644 --- a/llvm/lib/CodeGen/CodeGenPassBuilder.cpp +++ b/llvm/lib/Passes/CodeGenPassBuilder.cpp @@ -11,17 +11,17 @@ // //===---------------------------------------------------------------------===// -#include "llvm/CodeGen/CodeGenPassBuilder.h" +#include "llvm/Passes/CodeGenPassBuilder.h" using namespace llvm; namespace llvm { #define DUMMY_MACHINE_MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ MachinePassKey PASS_NAME::Key; -#include "llvm/CodeGen/MachinePassRegistry.def" +#include "llvm/Passes/MachinePassRegistry.def" #define DUMMY_MACHINE_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ MachinePassKey PASS_NAME::Key; #define DUMMY_MACHINE_FUNCTION_ANALYSIS(NAME, PASS_NAME, CONSTRUCTOR) \ AnalysisKey PASS_NAME::Key; -#include "llvm/CodeGen/MachinePassRegistry.def" +#include "llvm/Passes/MachinePassRegistry.def" } // namespace llvm diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 000594f0e7f4b..2fe167bd439d4 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -400,7 +400,7 @@ PassBuilder::PassBuilder(TargetMachine *TM, PipelineTuningOptions PTO, PIC->addClassToPassName(PASS_NAME::name(), NAME); #define 
MACHINE_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ PIC->addClassToPassName(PASS_NAME::name(), NAME); -#include "llvm/CodeGen/MachinePassRegistry.def" +#include "llvm/Passes/MachinePassRegistry.def" } } @@ -441,7 +441,7 @@ void PassBuilder::registerMachineFunctionAnalyses( #define MACHINE_FUNCTION_ANALYSIS(NAME, PASS_NAME, CONSTRUCTOR) \ MFAM.registerPass([&] { return PASS_NAME(); }); -#include "llvm/CodeGen/MachinePassRegistry.def" +#include "llvm/Passes/MachinePassRegistry.def" for (auto &C : MachineFunctionAnalysisRegistrationCallbacks) C(MFAM); @@ -1868,7 +1868,7 @@ Error PassBuilder::parseMachinePass(MachineFunctionPassManager &MFPM, MFPM.addPass(PASS_NAME()); \ return Error::success(); \ } -#include "llvm/CodeGen/MachinePassRegistry.def" +#include "llvm/Passes/MachinePassRegistry.def" for (auto &C : MachinePipelineParsingCallbacks) if (C(Name, MFPM)) @@ -2179,17 +2179,17 @@ void PassBuilder::printPassNames(raw_ostream &OS) { OS << "Machine module passes (WIP):\n"; #define MACHINE_MODULE_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ printPassName(NAME, OS); -#include "llvm/CodeGen/MachinePassRegistry.def" +#include "llvm/Passes/MachinePassRegistry.def" OS << "Machine function passes (WIP):\n"; #define MACHINE_FUNCTION_PASS(NAME, PASS_NAME, CONSTRUCTOR) \ printPassName(NAME, OS); -#include "llvm/CodeGen/MachinePassRegistry.def" +#include "llvm/Passes/MachinePassRegistry.def" OS << "Machine function analyses (WIP):\n"; #define MACHINE_FUNCTION_ANALYSIS(NAME, PASS_NAME, CONSTRUCTOR) \ printPassName(NAME, OS); -#include "llvm/CodeGen/MachinePassRegistry.def" +#include "llvm/Passes/MachinePassRegistry.def" } void PassBuilder::registerParseTopLevelPipelineCallback( diff --git a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp index 616f777833e56..4a11dd2e31acd 100644 --- a/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp +++ b/llvm/lib/Target/X86/X86CodeGenPassBuilder.cpp @@ -12,8 +12,8 @@ #include "X86TargetMachine.h" -#include "llvm/CodeGen/CodeGenPassBuilder.h" #include "llvm/MC/MCStreamer.h" +#include "llvm/Passes/CodeGenPassBuilder.h" using namespace llvm; diff --git a/llvm/tools/llc/NewPMDriver.cpp b/llvm/tools/llc/NewPMDriver.cpp index 13020f3dd07fe..2f979f3081f79 100644 --- a/llvm/tools/llc/NewPMDriver.cpp +++ b/llvm/tools/llc/NewPMDriver.cpp @@ -15,7 +15,6 @@ #include "NewPMDriver.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/CodeGen/CodeGenPassBuilder.h" #include "llvm/CodeGen/CommandFlags.h" #include "llvm/CodeGen/MIRParser/MIRParser.h" #include "llvm/CodeGen/MIRPrinter.h" @@ -30,6 +29,7 @@ #include "llvm/IR/PassManager.h" #include "llvm/IR/Verifier.h" #include "llvm/IRReader/IRReader.h" +#include "llvm/Passes/CodeGenPassBuilder.h" // TODO: Include pass headers properly. #include "llvm/Passes/PassBuilder.h" #include "llvm/Passes/StandardInstrumentations.h" #include "llvm/Support/CommandLine.h" From 33d804c6c2786cbbbc13743060f08d679941e0a4 Mon Sep 17 00:00:00 2001 From: Brandon Wu Date: Wed, 24 Jan 2024 11:30:12 +0800 Subject: [PATCH 732/843] [RISCV] Allow VCIX with SE to reorder (#77049) This patch allows VCIX instructions that have side effect to be reordered with memory and other side effecting instructions. However we don't want VCIX instructions to be reordered with each other, so we propose a dummy register called VCIX_STATE and make these instructions implicitly define and use it. 
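The patch expresses this declaratively in TableGen (`let Defs = [VCIX_STATE], Uses = [VCIX_STATE]` on the SE pseudos, visible in the diff below). As a hedged C++ sketch of the same idea, if one were attaching the operands by hand (`RISCV::VCIX_STATE` is the dummy register this patch introduces; the helper itself is hypothetical, not code from the patch):

    #include "llvm/CodeGen/MachineInstrBuilder.h"
    using namespace llvm;

    // Chain SE instructions through a dummy register: the implicit def-use
    // pair orders them relative to each other, while leaving them free to
    // move past loads, stores, and other side-effecting instructions.
    static MachineInstrBuilder buildVCIXSEPseudo(MachineBasicBlock &MBB,
                                                 MachineBasicBlock::iterator I,
                                                 const DebugLoc &DL,
                                                 const MCInstrDesc &Desc) {
      return BuildMI(MBB, I, DL, Desc)
          .addReg(RISCV::VCIX_STATE, RegState::ImplicitDefine)
          .addReg(RISCV::VCIX_STATE, RegState::Implicit); // implicit use
    }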
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 292 ++++- llvm/lib/Target/RISCV/RISCVISelLowering.h | 71 ++ llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td | 229 +++- llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp | 3 + llvm/lib/Target/RISCV/RISCVRegisterInfo.td | 3 + llvm/test/CodeGen/RISCV/pr69586.ll | 1267 ++++++++++--------- 6 files changed, 1169 insertions(+), 696 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 47c6cd6e5487b..56a5ab14a4a9f 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -8450,25 +8450,63 @@ static SDValue lowerGetVectorLength(SDNode *N, SelectionDAG &DAG, return DAG.getNode(ISD::TRUNCATE, DL, N->getValueType(0), Res); } -static void getVCIXOperands(SDValue &Op, SelectionDAG &DAG, - SmallVector &Ops) { +static inline void promoteVCIXScalar(const SDValue &Op, + SmallVectorImpl &Operands, + SelectionDAG &DAG) { + const RISCVSubtarget &Subtarget = + DAG.getMachineFunction().getSubtarget(); + + bool HasChain = Op.getOpcode() == ISD::INTRINSIC_VOID || + Op.getOpcode() == ISD::INTRINSIC_W_CHAIN; + unsigned IntNo = Op.getConstantOperandVal(HasChain ? 1 : 0); SDLoc DL(Op); + const RISCVVIntrinsicsTable::RISCVVIntrinsicInfo *II = + RISCVVIntrinsicsTable::getRISCVVIntrinsicInfo(IntNo); + if (!II || !II->hasScalarOperand()) + return; + + unsigned SplatOp = II->ScalarOperand + 1; + assert(SplatOp < Op.getNumOperands()); + + SDValue &ScalarOp = Operands[SplatOp]; + MVT OpVT = ScalarOp.getSimpleValueType(); + MVT XLenVT = Subtarget.getXLenVT(); + + // The code below is partially copied from lowerVectorIntrinsicScalars. + // If this isn't a scalar, or its type is XLenVT we're done. + if (!OpVT.isScalarInteger() || OpVT == XLenVT) + return; + + // Manually emit promote operation for scalar operation. + if (OpVT.bitsLT(XLenVT)) { + unsigned ExtOpc = + isa(ScalarOp) ? 
ISD::SIGN_EXTEND : ISD::ANY_EXTEND; + ScalarOp = DAG.getNode(ExtOpc, DL, XLenVT, ScalarOp); + } + + return; +} + +static void processVCIXOperands(SDValue &OrigOp, + SmallVectorImpl &Operands, + SelectionDAG &DAG) { + promoteVCIXScalar(OrigOp, Operands, DAG); const RISCVSubtarget &Subtarget = DAG.getMachineFunction().getSubtarget(); - for (const SDValue &V : Op->op_values()) { + for (SDValue &V : Operands) { EVT ValType = V.getValueType(); - if (ValType.isScalableVector() && ValType.isFloatingPoint()) { + if (ValType.isVector() && ValType.isFloatingPoint()) { MVT InterimIVT = MVT::getVectorVT(MVT::getIntegerVT(ValType.getScalarSizeInBits()), ValType.getVectorElementCount()); - Ops.push_back(DAG.getBitcast(InterimIVT, V)); - } else if (ValType.isFixedLengthVector()) { + V = DAG.getBitcast(InterimIVT, V); + } + if (ValType.isFixedLengthVector()) { MVT OpContainerVT = getContainerForFixedLengthVector( DAG, V.getSimpleValueType(), Subtarget); - Ops.push_back(convertToScalableVector(OpContainerVT, V, DAG, Subtarget)); - } else - Ops.push_back(V); + V = convertToScalableVector(OpContainerVT, V, DAG, Subtarget); + } } } @@ -8702,8 +8740,8 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, case Intrinsic::riscv_sf_vc_v_fvw: { MVT VT = Op.getSimpleValueType(); - SmallVector Ops; - getVCIXOperands(Op, DAG, Ops); + SmallVector Operands{Op->op_values()}; + processVCIXOperands(Op, Operands, DAG); MVT RetVT = VT; if (VT.isFixedLengthVector()) @@ -8712,7 +8750,7 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, RetVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()), VT.getVectorElementCount()); - SDValue NewNode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, RetVT, Ops); + SDValue NewNode = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, RetVT, Operands); if (VT.isFixedLengthVector()) NewNode = convertFromScalableVector(VT, NewNode, DAG, Subtarget); @@ -8729,6 +8767,52 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, return lowerVectorIntrinsicScalars(Op, DAG, Subtarget); } +static inline SDValue getVCIXISDNodeWCHAIN(SDValue &Op, SelectionDAG &DAG, + unsigned Type) { + SDLoc DL(Op); + SmallVector Operands{Op->op_values()}; + Operands.erase(Operands.begin() + 1); + + const RISCVSubtarget &Subtarget = + DAG.getMachineFunction().getSubtarget(); + MVT VT = Op.getSimpleValueType(); + MVT RetVT = VT; + MVT FloatVT = VT; + + if (VT.isFloatingPoint()) { + RetVT = MVT::getVectorVT(MVT::getIntegerVT(VT.getScalarSizeInBits()), + VT.getVectorElementCount()); + FloatVT = RetVT; + } + if (VT.isFixedLengthVector()) + RetVT = getContainerForFixedLengthVector(DAG.getTargetLoweringInfo(), RetVT, + Subtarget); + + processVCIXOperands(Op, Operands, DAG); + + SDVTList VTs = DAG.getVTList({RetVT, MVT::Other}); + SDValue NewNode = DAG.getNode(Type, DL, VTs, Operands); + SDValue Chain = NewNode.getValue(1); + + if (VT.isFixedLengthVector()) + NewNode = convertFromScalableVector(FloatVT, NewNode, DAG, Subtarget); + if (VT.isFloatingPoint()) + NewNode = DAG.getBitcast(VT, NewNode); + + NewNode = DAG.getMergeValues({NewNode, Chain}, DL); + + return NewNode; +} + +static inline SDValue getVCIXISDNodeVOID(SDValue &Op, SelectionDAG &DAG, + unsigned Type) { + SmallVector Operands{Op->op_values()}; + Operands.erase(Operands.begin() + 1); + processVCIXOperands(Op, Operands, DAG); + + return DAG.getNode(Type, SDLoc(Op), Op.getValueType(), Operands); +} + SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const { unsigned IntNo = 
Op.getConstantOperandVal(1); @@ -8846,48 +8930,33 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, return DAG.getMergeValues(Results, DL); } case Intrinsic::riscv_sf_vc_v_x_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_X_SE); case Intrinsic::riscv_sf_vc_v_i_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_I_SE); case Intrinsic::riscv_sf_vc_v_xv_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_XV_SE); case Intrinsic::riscv_sf_vc_v_iv_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_IV_SE); case Intrinsic::riscv_sf_vc_v_vv_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_VV_SE); case Intrinsic::riscv_sf_vc_v_fv_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_FV_SE); case Intrinsic::riscv_sf_vc_v_xvv_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_XVV_SE); case Intrinsic::riscv_sf_vc_v_ivv_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_IVV_SE); case Intrinsic::riscv_sf_vc_v_vvv_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_VVV_SE); case Intrinsic::riscv_sf_vc_v_fvv_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_FVV_SE); case Intrinsic::riscv_sf_vc_v_xvw_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_XVW_SE); case Intrinsic::riscv_sf_vc_v_ivw_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_IVW_SE); case Intrinsic::riscv_sf_vc_v_vvw_se: - case Intrinsic::riscv_sf_vc_v_fvw_se: { - MVT VT = Op.getSimpleValueType(); - SDLoc DL(Op); - SmallVector Ops; - getVCIXOperands(Op, DAG, Ops); - - MVT RetVT = VT; - if (VT.isFixedLengthVector()) - RetVT = getContainerForFixedLengthVector(VT); - else if (VT.isFloatingPoint()) - RetVT = MVT::getVectorVT(MVT::getIntegerVT(RetVT.getScalarSizeInBits()), - RetVT.getVectorElementCount()); - - SDVTList VTs = DAG.getVTList({RetVT, MVT::Other}); - SDValue NewNode = DAG.getNode(ISD::INTRINSIC_W_CHAIN, DL, VTs, Ops); - - if (VT.isFixedLengthVector()) { - SDValue FixedVector = - convertFromScalableVector(VT, NewNode, DAG, Subtarget); - NewNode = DAG.getMergeValues({FixedVector, NewNode.getValue(1)}, DL); - } else if (VT.isFloatingPoint()) { - SDValue BitCast = DAG.getBitcast(VT, NewNode.getValue(0)); - NewNode = DAG.getMergeValues({BitCast, NewNode.getValue(1)}, DL); - } - - if (Op == NewNode) - break; - - return NewNode; - } + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_VVW_SE); + case Intrinsic::riscv_sf_vc_v_fvw_se: + return getVCIXISDNodeWCHAIN(Op, DAG, RISCVISD::SF_VC_V_FVW_SE); } return lowerVectorIntrinsicScalars(Op, DAG, Subtarget); @@ -8977,72 +9046,117 @@ SDValue RISCVTargetLowering::LowerINTRINSIC_VOID(SDValue Op, FixedIntrinsic->getMemoryVT(), FixedIntrinsic->getMemOperand()); } case Intrinsic::riscv_sf_vc_x_se_e8mf8: + return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E8MF8); case Intrinsic::riscv_sf_vc_x_se_e8mf4: + return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E8MF4); case Intrinsic::riscv_sf_vc_x_se_e8mf2: + return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E8MF2); case Intrinsic::riscv_sf_vc_x_se_e8m1: + return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E8M1); case Intrinsic::riscv_sf_vc_x_se_e8m2: + return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E8M2); case Intrinsic::riscv_sf_vc_x_se_e8m4: + return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E8M4); case Intrinsic::riscv_sf_vc_x_se_e8m8: + return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E8M8); case Intrinsic::riscv_sf_vc_x_se_e16mf4: + return getVCIXISDNodeVOID(Op, DAG, 
RISCVISD::SF_VC_X_SE_E16MF4);
   case Intrinsic::riscv_sf_vc_x_se_e16mf2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E16MF2);
   case Intrinsic::riscv_sf_vc_x_se_e16m1:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E16M1);
   case Intrinsic::riscv_sf_vc_x_se_e16m2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E16M2);
   case Intrinsic::riscv_sf_vc_x_se_e16m4:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E16M4);
   case Intrinsic::riscv_sf_vc_x_se_e16m8:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E16M8);
   case Intrinsic::riscv_sf_vc_x_se_e32mf2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E32MF2);
   case Intrinsic::riscv_sf_vc_x_se_e32m1:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E32M1);
   case Intrinsic::riscv_sf_vc_x_se_e32m2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E32M2);
   case Intrinsic::riscv_sf_vc_x_se_e32m4:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E32M4);
   case Intrinsic::riscv_sf_vc_x_se_e32m8:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E32M8);
   case Intrinsic::riscv_sf_vc_x_se_e64m1:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E64M1);
   case Intrinsic::riscv_sf_vc_x_se_e64m2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E64M2);
   case Intrinsic::riscv_sf_vc_x_se_e64m4:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E64M4);
   case Intrinsic::riscv_sf_vc_x_se_e64m8:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_X_SE_E64M8);
   case Intrinsic::riscv_sf_vc_i_se_e8mf8:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E8MF8);
   case Intrinsic::riscv_sf_vc_i_se_e8mf4:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E8MF4);
   case Intrinsic::riscv_sf_vc_i_se_e8mf2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E8MF2);
   case Intrinsic::riscv_sf_vc_i_se_e8m1:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E8M1);
   case Intrinsic::riscv_sf_vc_i_se_e8m2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E8M2);
   case Intrinsic::riscv_sf_vc_i_se_e8m4:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E8M4);
   case Intrinsic::riscv_sf_vc_i_se_e8m8:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E8M8);
   case Intrinsic::riscv_sf_vc_i_se_e16mf4:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E16MF4);
   case Intrinsic::riscv_sf_vc_i_se_e16mf2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E16MF2);
   case Intrinsic::riscv_sf_vc_i_se_e16m1:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E16M1);
   case Intrinsic::riscv_sf_vc_i_se_e16m2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E16M2);
   case Intrinsic::riscv_sf_vc_i_se_e16m4:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E16M4);
   case Intrinsic::riscv_sf_vc_i_se_e16m8:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E16M8);
   case Intrinsic::riscv_sf_vc_i_se_e32mf2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E32MF2);
   case Intrinsic::riscv_sf_vc_i_se_e32m1:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E32M1);
   case Intrinsic::riscv_sf_vc_i_se_e32m2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E32M2);
   case Intrinsic::riscv_sf_vc_i_se_e32m4:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E32M4);
   case Intrinsic::riscv_sf_vc_i_se_e32m8:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E32M8);
   case Intrinsic::riscv_sf_vc_i_se_e64m1:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E64M1);
   case Intrinsic::riscv_sf_vc_i_se_e64m2:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E64M2);
   case Intrinsic::riscv_sf_vc_i_se_e64m4:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E64M4);
   case Intrinsic::riscv_sf_vc_i_se_e64m8:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_I_SE_E64M8);
   case Intrinsic::riscv_sf_vc_xv_se:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_XV_SE);
   case Intrinsic::riscv_sf_vc_iv_se:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_IV_SE);
   case Intrinsic::riscv_sf_vc_vv_se:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_VV_SE);
   case Intrinsic::riscv_sf_vc_fv_se:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_FV_SE);
   case Intrinsic::riscv_sf_vc_xvv_se:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_XVV_SE);
   case Intrinsic::riscv_sf_vc_ivv_se:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_IVV_SE);
   case Intrinsic::riscv_sf_vc_vvv_se:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_VVV_SE);
   case Intrinsic::riscv_sf_vc_fvv_se:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_FVV_SE);
   case Intrinsic::riscv_sf_vc_xvw_se:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_XVW_SE);
   case Intrinsic::riscv_sf_vc_ivw_se:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_IVW_SE);
   case Intrinsic::riscv_sf_vc_vvw_se:
-  case Intrinsic::riscv_sf_vc_fvw_se: {
-    SmallVector<SDValue> Ops;
-    getVCIXOperands(Op, DAG, Ops);
-
-    SDValue NewNode =
-        DAG.getNode(ISD::INTRINSIC_VOID, SDLoc(Op), Op->getVTList(), Ops);
-
-    if (Op == NewNode)
-      break;
-
-    return NewNode;
-  }
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_VVW_SE);
+  case Intrinsic::riscv_sf_vc_fvw_se:
+    return getVCIXISDNodeVOID(Op, DAG, RISCVISD::SF_VC_FVW_SE);
   }
 
   return lowerVectorIntrinsicScalars(Op, DAG, Subtarget);
@@ -19013,6 +19127,76 @@ const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
   NODE_NAME_CASE(SWAP_CSR)
   NODE_NAME_CASE(CZERO_EQZ)
   NODE_NAME_CASE(CZERO_NEZ)
+  NODE_NAME_CASE(SF_VC_X_SE_E8MF8)
+  NODE_NAME_CASE(SF_VC_X_SE_E8MF4)
+  NODE_NAME_CASE(SF_VC_X_SE_E8MF2)
+  NODE_NAME_CASE(SF_VC_X_SE_E8M1)
+  NODE_NAME_CASE(SF_VC_X_SE_E8M2)
+  NODE_NAME_CASE(SF_VC_X_SE_E8M4)
+  NODE_NAME_CASE(SF_VC_X_SE_E8M8)
+  NODE_NAME_CASE(SF_VC_X_SE_E16MF4)
+  NODE_NAME_CASE(SF_VC_X_SE_E16MF2)
+  NODE_NAME_CASE(SF_VC_X_SE_E16M1)
+  NODE_NAME_CASE(SF_VC_X_SE_E16M2)
+  NODE_NAME_CASE(SF_VC_X_SE_E16M4)
+  NODE_NAME_CASE(SF_VC_X_SE_E16M8)
+  NODE_NAME_CASE(SF_VC_X_SE_E32MF2)
+  NODE_NAME_CASE(SF_VC_X_SE_E32M1)
+  NODE_NAME_CASE(SF_VC_X_SE_E32M2)
+  NODE_NAME_CASE(SF_VC_X_SE_E32M4)
+  NODE_NAME_CASE(SF_VC_X_SE_E32M8)
+  NODE_NAME_CASE(SF_VC_X_SE_E64M1)
+  NODE_NAME_CASE(SF_VC_X_SE_E64M2)
+  NODE_NAME_CASE(SF_VC_X_SE_E64M4)
+  NODE_NAME_CASE(SF_VC_X_SE_E64M8)
+  NODE_NAME_CASE(SF_VC_I_SE_E8MF8)
+  NODE_NAME_CASE(SF_VC_I_SE_E8MF4)
+  NODE_NAME_CASE(SF_VC_I_SE_E8MF2)
+  NODE_NAME_CASE(SF_VC_I_SE_E8M1)
+  NODE_NAME_CASE(SF_VC_I_SE_E8M2)
+  NODE_NAME_CASE(SF_VC_I_SE_E8M4)
+  NODE_NAME_CASE(SF_VC_I_SE_E8M8)
+  NODE_NAME_CASE(SF_VC_I_SE_E16MF4)
+  NODE_NAME_CASE(SF_VC_I_SE_E16MF2)
+  NODE_NAME_CASE(SF_VC_I_SE_E16M1)
+  NODE_NAME_CASE(SF_VC_I_SE_E16M2)
+  NODE_NAME_CASE(SF_VC_I_SE_E16M4)
+  NODE_NAME_CASE(SF_VC_I_SE_E16M8)
+  NODE_NAME_CASE(SF_VC_I_SE_E32MF2)
+  NODE_NAME_CASE(SF_VC_I_SE_E32M1)
+  NODE_NAME_CASE(SF_VC_I_SE_E32M2)
+  NODE_NAME_CASE(SF_VC_I_SE_E32M4)
+  NODE_NAME_CASE(SF_VC_I_SE_E32M8)
+  NODE_NAME_CASE(SF_VC_I_SE_E64M1)
+  NODE_NAME_CASE(SF_VC_I_SE_E64M2)
+  NODE_NAME_CASE(SF_VC_I_SE_E64M4)
+  NODE_NAME_CASE(SF_VC_I_SE_E64M8)
+  NODE_NAME_CASE(SF_VC_XV_SE)
+  NODE_NAME_CASE(SF_VC_IV_SE)
+  NODE_NAME_CASE(SF_VC_VV_SE)
+  NODE_NAME_CASE(SF_VC_FV_SE)
+  NODE_NAME_CASE(SF_VC_XVV_SE)
+  NODE_NAME_CASE(SF_VC_IVV_SE)
+  NODE_NAME_CASE(SF_VC_VVV_SE)
+  NODE_NAME_CASE(SF_VC_FVV_SE)
+  NODE_NAME_CASE(SF_VC_XVW_SE)
+  NODE_NAME_CASE(SF_VC_IVW_SE)
+  NODE_NAME_CASE(SF_VC_VVW_SE)
+  NODE_NAME_CASE(SF_VC_FVW_SE)
+  NODE_NAME_CASE(SF_VC_V_X_SE)
+  NODE_NAME_CASE(SF_VC_V_I_SE)
+  NODE_NAME_CASE(SF_VC_V_XV_SE)
+  NODE_NAME_CASE(SF_VC_V_IV_SE)
+  NODE_NAME_CASE(SF_VC_V_VV_SE)
+  NODE_NAME_CASE(SF_VC_V_FV_SE)
+  NODE_NAME_CASE(SF_VC_V_XVV_SE)
+  NODE_NAME_CASE(SF_VC_V_IVV_SE)
+  NODE_NAME_CASE(SF_VC_V_VVV_SE)
+  NODE_NAME_CASE(SF_VC_V_FVV_SE)
+  NODE_NAME_CASE(SF_VC_V_XVW_SE)
+  NODE_NAME_CASE(SF_VC_V_IVW_SE)
+  NODE_NAME_CASE(SF_VC_V_VVW_SE)
+  NODE_NAME_CASE(SF_VC_V_FVW_SE)
   }
   // clang-format on
   return nullptr;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 30b9ad7e6f6f3..8d2fdbb9d40b4 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -415,6 +415,77 @@ enum NodeType : unsigned {
   STRICT_VFROUND_NOEXCEPT_VL,
   LAST_RISCV_STRICTFP_OPCODE = STRICT_VFROUND_NOEXCEPT_VL,
+  SF_VC_X_SE_E8MF8,
+  SF_VC_X_SE_E8MF4,
+  SF_VC_X_SE_E8MF2,
+  SF_VC_X_SE_E8M1,
+  SF_VC_X_SE_E8M2,
+  SF_VC_X_SE_E8M4,
+  SF_VC_X_SE_E8M8,
+  SF_VC_X_SE_E16MF4,
+  SF_VC_X_SE_E16MF2,
+  SF_VC_X_SE_E16M1,
+  SF_VC_X_SE_E16M2,
+  SF_VC_X_SE_E16M4,
+  SF_VC_X_SE_E16M8,
+  SF_VC_X_SE_E32MF2,
+  SF_VC_X_SE_E32M1,
+  SF_VC_X_SE_E32M2,
+  SF_VC_X_SE_E32M4,
+  SF_VC_X_SE_E32M8,
+  SF_VC_X_SE_E64M1,
+  SF_VC_X_SE_E64M2,
+  SF_VC_X_SE_E64M4,
+  SF_VC_X_SE_E64M8,
+  SF_VC_I_SE_E8MF8,
+  SF_VC_I_SE_E8MF4,
+  SF_VC_I_SE_E8MF2,
+  SF_VC_I_SE_E8M1,
+  SF_VC_I_SE_E8M2,
+  SF_VC_I_SE_E8M4,
+  SF_VC_I_SE_E8M8,
+  SF_VC_I_SE_E16MF4,
+  SF_VC_I_SE_E16MF2,
+  SF_VC_I_SE_E16M1,
+  SF_VC_I_SE_E16M2,
+  SF_VC_I_SE_E16M4,
+  SF_VC_I_SE_E16M8,
+  SF_VC_I_SE_E32MF2,
+  SF_VC_I_SE_E32M1,
+  SF_VC_I_SE_E32M2,
+  SF_VC_I_SE_E32M4,
+  SF_VC_I_SE_E32M8,
+  SF_VC_I_SE_E64M1,
+  SF_VC_I_SE_E64M2,
+  SF_VC_I_SE_E64M4,
+  SF_VC_I_SE_E64M8,
+  SF_VC_XV_SE,
+  SF_VC_IV_SE,
+  SF_VC_VV_SE,
+  SF_VC_FV_SE,
+  SF_VC_XVV_SE,
+  SF_VC_IVV_SE,
+  SF_VC_VVV_SE,
+  SF_VC_FVV_SE,
+  SF_VC_XVW_SE,
+  SF_VC_IVW_SE,
+  SF_VC_VVW_SE,
+  SF_VC_FVW_SE,
+  SF_VC_V_X_SE,
+  SF_VC_V_I_SE,
+  SF_VC_V_XV_SE,
+  SF_VC_V_IV_SE,
+  SF_VC_V_VV_SE,
+  SF_VC_V_FV_SE,
+  SF_VC_V_XVV_SE,
+  SF_VC_V_IVV_SE,
+  SF_VC_V_VVV_SE,
+  SF_VC_V_FVV_SE,
+  SF_VC_V_XVW_SE,
+  SF_VC_V_IVW_SE,
+  SF_VC_V_VVW_SE,
+  SF_VC_V_FVW_SE,
+
   // WARNING: Do not add anything in the end unless you want the node to
   // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
   // opcodes will be thought as target memory ops!
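Note on the lowering change above: every strongly-ordered (`_se`) VCIX intrinsic is now funneled through one helper that re-emits the node with a target-specific opcode instead of a generic INTRINSIC_VOID. The patch does not show the helper's definition, so the sketch below is only a plausible reconstruction modeled on the block of code the patch deletes; the exact signature of getVCIXISDNodeVOID and the behavior of getVCIXOperands in the tree may differ.

    // Minimal sketch, assuming getVCIXOperands collects the chain plus the
    // intrinsic's operands while dropping the intrinsic ID, as the deleted
    // INTRINSIC_VOID path did.
    static SDValue getVCIXISDNodeVOID(SDValue &Op, SelectionDAG &DAG,
                                      unsigned Type) {
      SmallVector<SDValue> Ops;
      getVCIXOperands(Op, DAG, Ops);
      // Re-emit the call as a target node so the SEW/LMUL variant lives in
      // the opcode itself and the pseudo can carry Defs/Uses of VCIX_STATE.
      return DAG.getNode(Type, SDLoc(Op), Op->getVTList(), Ops);
    }

The design point is that encoding the element width and LMUL in the ISD opcode lets TableGen patterns select the `_SE_` pseudos directly, and the dummy VCIX_STATE register introduced later in this patch serializes them.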
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
index 31f832dfd84ce..d22f98d693b1b 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXSf.td
@@ -221,8 +221,8 @@ let Predicates = [HasVendorXSfvfnrclipxfqf], DecoderNamespace = "XSfvfnrclipxfqf
 def VFNRCLIP_XU_F_QF : CustomSiFiveVFNRCLIP<0b100010, OPFVF, "sf.vfnrclip.xu.f.qf">;
 def VFNRCLIP_X_F_QF : CustomSiFiveVFNRCLIP<0b100011, OPFVF, "sf.vfnrclip.x.f.qf">;
 }
-class VPseudoVC_X :
+
+class VPseudoVC_X :
   Pseudo<(outs),
          (ins OpClass:$op1, payload5:$rs2, payload5:$rd, RS1Class:$r1,
               AVL:$vl, ixlenimm:$sew), []>,
@@ -231,12 +231,11 @@ class VPseudoVC_X
   let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
 }
-class VPseudoVC_XV :
+class VPseudoVC_XV :
   Pseudo<(outs),
          (ins OpClass:$op1, payload5:$rd, RS2Class:$rs2, RS1Class:$r1,
               AVL:$vl, ixlenimm:$sew), []>,
@@ -245,12 +244,12 @@ class VPseudoVC_XV
   let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
 }
 class VPseudoVC_XVV :
+                     DAGOperand RS1Class> :
   Pseudo<(outs),
          (ins OpClass:$op1, RDClass:$rd, RS2Class:$rs2, RS1Class:$r1,
               AVL:$vl, ixlenimm:$sew), []>,
@@ -259,12 +258,11 @@ class VPseudoVC_XVV
   let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
 }
-class VPseudoVC_V_X :
+class VPseudoVC_V_X :
   Pseudo<(outs RDClass:$rd),
          (ins OpClass:$op1, payload5:$rs2, RS1Class:$r1,
               AVL:$vl, ixlenimm:$sew), []>,
@@ -273,12 +271,12 @@ class VPseudoVC_V_X
   let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
 }
 class VPseudoVC_V_XV :
+                      DAGOperand RS1Class> :
   Pseudo<(outs RDClass:$rd),
          (ins OpClass:$op1, RS2Class:$rs2, RS1Class:$r1,
               AVL:$vl, ixlenimm:$sew), []>,
@@ -287,12 +285,12 @@ class VPseudoVC_V_XV
   let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
 }
 class VPseudoVC_V_XVV :
+                       DAGOperand RS1Class> :
   Pseudo<(outs RDClass:$rd),
          (ins OpClass:$op1, RDClass:$rs3, RS2Class:$rs2, RS1Class:$r1,
               AVL:$vl, ixlenimm:$sew), []>,
@@ -301,44 +299,52 @@ class VPseudoVC_V_XVV
   let BaseInstr = !cast<Instruction>(PseudoToVInst<NAME>.VInst);
 }
 
 multiclass VPseudoVC_X {
   let VLMul = m.value in {
-    def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_X;
-    def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_X;
-    def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_X;
+    let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in {
+      def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_X;
+      def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_X;
+    }
+    def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_X;
   }
 }
 
 multiclass VPseudoVC_XV {
   let VLMul = m.value in {
-    def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XV;
-    def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XV;
-    def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XV;
+    let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in {
+      def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XV;
+      def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XV;
+    }
+    def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XV;
   }
 }
 
 multiclass VPseudoVC_XVV {
   let VLMul = m.value in {
-    def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XVV;
-    def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XVV;
-    def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV;
+    let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in {
+      def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XVV;
+      def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XVV;
+    }
+    def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV;
   }
 }
 
 multiclass VPseudoVC_XVW {
   let VLMul = m.value in {
+    let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in
     def "PseudoVC_" # NAME # "_SE_" # m.MX : VPseudoVC_XVV;
     let Constraints = "@earlyclobber $rd, $rd = $rs3" in {
+      let Defs = [VCIX_STATE], Uses = [VCIX_STATE] in
      def "PseudoVC_V_" # NAME # "_SE_" # m.MX : VPseudoVC_V_XVV;
-      def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV;
+      def "PseudoVC_V_" # NAME # "_" # m.MX : VPseudoVC_V_XVV;
     }
   }
 }
@@ -428,6 +434,149 @@
 let Predicates = [HasVendorXSfvfnrclipxfqf] in {
   defm VFNRCLIP_X_F_QF : VPseudoSiFiveVFNRCLIP;
 }
 
+// SDNode
+def SDT_SF_VC_X : SDTypeProfile<0, 5, [SDTCisSameAs<0, 1>,
+                                       SDTCisVT<0, XLenVT>,
+                                       SDTCisSameAs<0, 2>,
+                                       SDTCisSameAs<0, 3>,
+                                       SDTCisSameAs<0, 4>]>;
+
+def SDT_SF_VC_V_X : SDTypeProfile<1, 4, [SDTCisVec<0>,
+                                         SDTCisVT<1, XLenVT>,
+                                         SDTCisSameAs<1, 2>,
+                                         SDTCisSameAs<1, 3>,
+                                         SDTCisSameAs<1, 4>]>;
+
+def SDT_SF_VC_XV : SDTypeProfile<0, 5, [SDTCisSameAs<0, 1>,
+                                        SDTCisVec<2>,
+                                        SDTCisSameAs<0, 4>,
+                                        SDTCisVT<0, XLenVT>]>;
+
+def SDT_SF_VC_V_XV : SDTypeProfile<1, 4, [SDTCisVec<0>,
+                                          SDTCisVT<1, XLenVT>,
+                                          SDTCisSameAs<0, 2>,
+                                          SDTCisSameAs<1, 4>]>;
+
+def SDT_SF_VC_XVV : SDTypeProfile<0, 5, [SDTCisVT<0, XLenVT>,
+                                         SDTCisVec<1>,
+                                         SDTCisSameAs<1, 2>,
+                                         SDTCisSameAs<0, 4>]>;
+
+def SDT_SF_VC_V_XVV : SDTypeProfile<1, 5, [SDTCisVec<0>,
+                                           SDTCisVT<1, XLenVT>,
+                                           SDTCisSameAs<0, 2>,
+                                           SDTCisSameAs<0, 3>,
+                                           SDTCisSameAs<1, 5>]>;
+
+def SDT_SF_VC_XVW : SDTypeProfile<0, 5, [SDTCisVT<0, XLenVT>,
+                                         SDTCisVec<1>, SDTCisVec<2>,
+                                         SDTCisSameAs<0, 4>]>;
+
+def SDT_SF_VC_V_XVW : SDTypeProfile<1, 5, [SDTCisVec<0>,
+                                           SDTCisVT<1, XLenVT>,
+                                           SDTCisSameAs<0, 2>,
+                                           SDTCisVec<3>,
+                                           SDTCisSameAs<1, 5>]>;
+
+foreach vti = AllIntegerVectors in {
+  def sf_vc_x_e#vti.SEW#!tolower(vti.LMul.MX) : SDNode<"RISCVISD::SF_VC_X_SE_E"#vti.SEW#vti.LMul.MX, SDT_SF_VC_X, [SDNPHasChain]>;
+  def sf_vc_i_e#vti.SEW#!tolower(vti.LMul.MX) : SDNode<"RISCVISD::SF_VC_I_SE_E"#vti.SEW#vti.LMul.MX, SDT_SF_VC_X, [SDNPHasChain]>;
+}
+def sf_vc_v_x_se : SDNode<"RISCVISD::SF_VC_V_X_SE", SDT_SF_VC_V_X, [SDNPHasChain]>;
+def sf_vc_v_i_se : SDNode<"RISCVISD::SF_VC_V_I_SE", SDT_SF_VC_V_X, [SDNPHasChain]>;
+def sf_vc_vv_se : SDNode<"RISCVISD::SF_VC_VV_SE", SDT_SF_VC_XV, [SDNPHasChain]>;
+def sf_vc_xv_se : SDNode<"RISCVISD::SF_VC_XV_SE", SDT_SF_VC_XV, [SDNPHasChain]>;
+def sf_vc_iv_se : SDNode<"RISCVISD::SF_VC_IV_SE", SDT_SF_VC_XV, [SDNPHasChain]>;
+def sf_vc_fv_se : SDNode<"RISCVISD::SF_VC_FV_SE", SDT_SF_VC_XV, [SDNPHasChain]>;
+def sf_vc_v_vv_se : SDNode<"RISCVISD::SF_VC_V_VV_SE", SDT_SF_VC_V_XV, [SDNPHasChain]>;
+def sf_vc_v_xv_se : SDNode<"RISCVISD::SF_VC_V_XV_SE", SDT_SF_VC_V_XV, [SDNPHasChain]>;
+def sf_vc_v_iv_se : SDNode<"RISCVISD::SF_VC_V_IV_SE", SDT_SF_VC_V_XV, [SDNPHasChain]>;
+def sf_vc_v_fv_se : SDNode<"RISCVISD::SF_VC_V_FV_SE", SDT_SF_VC_V_XV, [SDNPHasChain]>;
+def sf_vc_vvv_se : SDNode<"RISCVISD::SF_VC_VVV_SE", SDT_SF_VC_XVV, [SDNPHasChain]>;
+def sf_vc_xvv_se : SDNode<"RISCVISD::SF_VC_XVV_SE", SDT_SF_VC_XVV, [SDNPHasChain]>;
+def sf_vc_ivv_se : SDNode<"RISCVISD::SF_VC_IVV_SE", SDT_SF_VC_XVV, [SDNPHasChain]>;
+def sf_vc_fvv_se : SDNode<"RISCVISD::SF_VC_FVV_SE", SDT_SF_VC_XVV, [SDNPHasChain]>;
+def sf_vc_v_vvv_se : SDNode<"RISCVISD::SF_VC_V_VVV_SE", SDT_SF_VC_V_XVV, [SDNPHasChain]>;
+def sf_vc_v_xvv_se : SDNode<"RISCVISD::SF_VC_V_XVV_SE", SDT_SF_VC_V_XVV, [SDNPHasChain]>;
+def sf_vc_v_ivv_se : SDNode<"RISCVISD::SF_VC_V_IVV_SE", SDT_SF_VC_V_XVV, [SDNPHasChain]>;
+def sf_vc_v_fvv_se : SDNode<"RISCVISD::SF_VC_V_FVV_SE", SDT_SF_VC_V_XVV, [SDNPHasChain]>;
+def sf_vc_vvw_se : SDNode<"RISCVISD::SF_VC_VVW_SE", SDT_SF_VC_XVW, [SDNPHasChain]>;
+def sf_vc_xvw_se : SDNode<"RISCVISD::SF_VC_XVW_SE", SDT_SF_VC_XVW, [SDNPHasChain]>;
+def sf_vc_ivw_se : SDNode<"RISCVISD::SF_VC_IVW_SE", SDT_SF_VC_XVW, [SDNPHasChain]>;
+def sf_vc_fvw_se : SDNode<"RISCVISD::SF_VC_FVW_SE", SDT_SF_VC_XVW, [SDNPHasChain]>;
+def sf_vc_v_vvw_se : SDNode<"RISCVISD::SF_VC_V_VVW_SE", SDT_SF_VC_V_XVW, [SDNPHasChain]>;
+def sf_vc_v_xvw_se : SDNode<"RISCVISD::SF_VC_V_XVW_SE", SDT_SF_VC_V_XVW, [SDNPHasChain]>;
+def sf_vc_v_ivw_se : SDNode<"RISCVISD::SF_VC_V_IVW_SE", SDT_SF_VC_V_XVW, [SDNPHasChain]>;
+def sf_vc_v_fvw_se : SDNode<"RISCVISD::SF_VC_V_FVW_SE", SDT_SF_VC_V_XVW, [SDNPHasChain]>;
+
+class VPatVC_OP4_ISD :
+  Pat<(op
+       (XLenVT op1_kind:$op1),
+       (op2_type op2_kind:$op2),
+       (op3_type op3_kind:$op3),
+       (op4_type op4_kind:$op4),
+       VLOpFrag),
+      (!cast<Instruction>(inst)
+       (XLenVT op1_kind:$op1),
+       (op2_type op2_kind:$op2),
+       (op3_type op3_kind:$op3),
+       (op4_type op4_kind:$op4),
+       GPR:$vl, sew)>;
+
+class VPatVC_V_OP4_ISD :
+  Pat<(result_type (op
+                    (XLenVT op1_kind:$op1),
+                    (op2_type op2_kind:$op2),
+                    (op3_type op3_kind:$op3),
+                    (op4_type op4_kind:$op4),
+                    VLOpFrag)),
+      (!cast<Instruction>(inst)
+       (XLenVT op1_kind:$op1),
+       (op2_type op2_kind:$op2),
+       (op3_type op3_kind:$op3),
+       (op4_type op4_kind:$op4),
+       GPR:$vl, sew)>;
+
+
+class VPatVC_V_OP3_ISD :
+  Pat<(result_type (op
+                    (XLenVT op1_kind:$op1),
+                    (op2_type op2_kind:$op2),
+                    (op3_type op3_kind:$op3),
+                    VLOpFrag)),
+      (!cast<Instruction>(inst)
+       (XLenVT op1_kind:$op1),
+       (op2_type op2_kind:$op2),
+       (op3_type op3_kind:$op3),
+       GPR:$vl, sew)>;
+
 class VPatVC_OP4 {
-  def : VPatVC_OP4<"int_riscv_sf_vc_" # intrinsic_suffix # "_se_e" # vti.SEW # !tolower(vti.LMul.MX),
-                   "PseudoVC_" # instruction_suffix # "_SE_" # vti.LMul.MX,
-                   XLenVT, XLenVT, type, vti.Log2SEW,
-                   payload5, payload5, kind>;
-  def : VPatVC_V_OP3<"int_riscv_sf_vc_v_" # intrinsic_suffix # "_se",
-                     "PseudoVC_V_" # instruction_suffix # "_SE_" # vti.LMul.MX,
-                     vti.Vector, XLenVT, type, vti.Log2SEW,
-                     payload5, kind>;
+  def : VPatVC_OP4_ISD<!cast<SDPatternOperator>("sf_vc_" # intrinsic_suffix # "_e" # vti.SEW # !tolower(vti.LMul.MX)),
+                       "PseudoVC_" # instruction_suffix # "_SE_" # vti.LMul.MX,
+                       XLenVT, XLenVT, type, vti.Log2SEW,
+                       payload5, payload5, kind>;
+  def : VPatVC_V_OP3_ISD<!cast<SDPatternOperator>("sf_vc_v_" # intrinsic_suffix # "_se"),
+                         "PseudoVC_V_" # instruction_suffix # "_SE_" # vti.LMul.MX,
+                         vti.Vector, XLenVT, type, vti.Log2SEW,
+                         payload5, kind>;
   def : VPatVC_V_OP3<"int_riscv_sf_vc_v_" # intrinsic_suffix,
                      "PseudoVC_V_" # instruction_suffix # "_" # vti.LMul.MX,
                      vti.Vector, XLenVT, type, vti.Log2SEW,
@@ -514,14 +663,14 @@ multiclass VPatVC_X {
-  def : VPatVC_OP4<"int_riscv_sf_vc_" # intrinsic_suffix # "_se",
+  def : VPatVC_OP4_ISD<!cast<SDPatternOperator>("sf_vc_" # intrinsic_suffix # "_se"),
                    "PseudoVC_" # instruction_suffix # "_SE_" # vti.LMul.MX,
                    XLenVT, vti.Vector, type, vti.Log2SEW,
                    payload5, vti.RegClass, kind, op1_kind>;
-  def : VPatVC_V_OP3<"int_riscv_sf_vc_v_" # intrinsic_suffix # "_se",
-                     "PseudoVC_V_" # instruction_suffix # "_SE_" # vti.LMul.MX,
-                     vti.Vector, vti.Vector, type, vti.Log2SEW,
-                     vti.RegClass, kind, op1_kind>;
+  def : VPatVC_V_OP3_ISD<!cast<SDPatternOperator>("sf_vc_v_" # intrinsic_suffix # "_se"),
+                         "PseudoVC_V_" # instruction_suffix # "_SE_" # vti.LMul.MX,
+                         vti.Vector, vti.Vector, type, vti.Log2SEW,
+                         vti.RegClass, kind, op1_kind>;
   def : VPatVC_V_OP3<"int_riscv_sf_vc_v_" # intrinsic_suffix,
                      "PseudoVC_V_" # instruction_suffix # "_" # vti.LMul.MX,
                      vti.Vector, vti.Vector, type, vti.Log2SEW,
@@ -531,11 +680,11 @@ multiclass VPatVC_XV {
-  def : VPatVC_OP4<"int_riscv_sf_vc_" # intrinsic_suffix # "_se",
+  def : VPatVC_OP4_ISD<!cast<SDPatternOperator>("sf_vc_" # intrinsic_suffix # "_se"),
                    "PseudoVC_" # instruction_suffix # "_SE_" # vti.LMul.MX,
                    wti.Vector, vti.Vector, type, vti.Log2SEW,
                    wti.RegClass, vti.RegClass, kind, op1_kind>;
-  def : VPatVC_V_OP4<"int_riscv_sf_vc_v_" # intrinsic_suffix # "_se",
+  def : VPatVC_V_OP4_ISD<!cast<SDPatternOperator>("sf_vc_v_" #
intrinsic_suffix # "_se"), "PseudoVC_V_" # instruction_suffix # "_SE_" # vti.LMul.MX, wti.Vector, wti.Vector, vti.Vector, type, vti.Log2SEW, wti.RegClass, vti.RegClass, kind, op1_kind>; diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp index d7bb46a221dd2..30457f528853b 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp @@ -130,6 +130,9 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const { markSuperRegs(Reserved, RISCV::FRM); markSuperRegs(Reserved, RISCV::FFLAGS); + // SiFive VCIX state registers. + markSuperRegs(Reserved, RISCV::VCIX_STATE); + if (MF.getFunction().getCallingConv() == CallingConv::GRAAL) { if (Subtarget.isRVE()) report_fatal_error("Graal reserved registers do not exist in RVE"); diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td index 5a4d8c4cfece7..193b85e281860 100644 --- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td +++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td @@ -607,3 +607,6 @@ def FRM : RISCVReg<0, "frm">; // Shadow Stack register def SSP : RISCVReg<0, "ssp">; + +// Dummy VCIX state register +def VCIX_STATE : RISCVReg<0, "vcix_state">; diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll index ef91334c5ff00..2d5fce2ca4970 100644 --- a/llvm/test/CodeGen/RISCV/pr69586.ll +++ b/llvm/test/CodeGen/RISCV/pr69586.ll @@ -7,21 +7,21 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-LABEL: test: ; NOREMAT: # %bb.0: -; NOREMAT-NEXT: addi sp, sp, -368 -; NOREMAT-NEXT: .cfi_def_cfa_offset 368 -; NOREMAT-NEXT: sd ra, 360(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s0, 352(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s1, 344(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s2, 336(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s3, 328(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s4, 320(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s5, 312(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s6, 304(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s7, 296(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s8, 288(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s9, 280(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s10, 272(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: sd s11, 264(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addi sp, sp, -400 +; NOREMAT-NEXT: .cfi_def_cfa_offset 400 +; NOREMAT-NEXT: sd ra, 392(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s0, 384(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s1, 376(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s2, 368(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s3, 360(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s4, 352(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s5, 344(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s6, 336(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s7, 328(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s8, 320(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s9, 312(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s10, 304(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd s11, 296(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: .cfi_offset ra, -8 ; NOREMAT-NEXT: .cfi_offset s0, -16 ; NOREMAT-NEXT: .cfi_offset s1, -24 @@ -35,6 +35,11 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: .cfi_offset s9, -88 ; NOREMAT-NEXT: .cfi_offset s10, -96 ; NOREMAT-NEXT: .cfi_offset s11, -104 +; NOREMAT-NEXT: csrr a2, vlenb +; NOREMAT-NEXT: li a3, 6 +; NOREMAT-NEXT: mul a2, a2, a3 +; NOREMAT-NEXT: sub sp, sp, a2 +; 
NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0x90, 0x03, 0x22, 0x11, 0x06, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 400 + 6 * vlenb ; NOREMAT-NEXT: li a2, 32 ; NOREMAT-NEXT: vsetvli zero, a2, e32, m2, ta, ma ; NOREMAT-NEXT: vle32.v v8, (a0) @@ -50,670 +55,728 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: vle32.v v10, (a2) ; NOREMAT-NEXT: li a2, 1 ; NOREMAT-NEXT: slli a2, a2, 11 -; NOREMAT-NEXT: sd a2, 256(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a2, 272(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 ; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: li a4, 5 -; NOREMAT-NEXT: slli a2, a4, 9 -; NOREMAT-NEXT: sd a2, 248(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li a5, 5 +; NOREMAT-NEXT: slli a2, a5, 9 +; NOREMAT-NEXT: sd a2, 264(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 ; NOREMAT-NEXT: vle32.v v10, (a2) -; NOREMAT-NEXT: li a5, 3 -; NOREMAT-NEXT: slli a2, a5, 10 -; NOREMAT-NEXT: sd a2, 240(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) +; NOREMAT-NEXT: li a2, 3 +; NOREMAT-NEXT: slli a3, a2, 10 +; NOREMAT-NEXT: sd a3, 256(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a0, a3 +; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: li a3, 7 -; NOREMAT-NEXT: slli a2, a3, 9 -; NOREMAT-NEXT: sd a2, 232(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v14, (a2) +; NOREMAT-NEXT: vle32.v v8, (a3) +; NOREMAT-NEXT: li a4, 7 +; NOREMAT-NEXT: slli a3, a4, 9 +; NOREMAT-NEXT: sd a3, 248(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a3, a0, a3 +; NOREMAT-NEXT: vle32.v v14, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) -; NOREMAT-NEXT: lui a2, 1 -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) +; NOREMAT-NEXT: vle32.v v10, (a3) +; NOREMAT-NEXT: lui a3, 1 +; NOREMAT-NEXT: add a3, a0, a3 +; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: li a2, 9 -; NOREMAT-NEXT: slli a6, a2, 9 -; NOREMAT-NEXT: sd a6, 224(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vle32.v v8, (a3) +; NOREMAT-NEXT: li a3, 9 +; NOREMAT-NEXT: slli a6, a3, 9 +; NOREMAT-NEXT: sd a6, 240(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a6, a0, a6 ; NOREMAT-NEXT: vle32.v v14, (a6) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 ; NOREMAT-NEXT: vle32.v v10, (a6) -; NOREMAT-NEXT: slli a6, a4, 10 -; NOREMAT-NEXT: sd a6, 216(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli a6, a5, 10 +; NOREMAT-NEXT: sd a6, 232(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a6, a0, a6 ; NOREMAT-NEXT: vle32.v v12, (a6) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 ; NOREMAT-NEXT: vle32.v v8, (a6) ; NOREMAT-NEXT: li s8, 11 ; NOREMAT-NEXT: slli a6, s8, 9 -; NOREMAT-NEXT: sd a6, 208(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a6, 224(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a6, a0, a6 ; NOREMAT-NEXT: vle32.v v14, (a6) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 ; NOREMAT-NEXT: vle32.v v10, (a6) -; NOREMAT-NEXT: slli a5, a5, 11 -; NOREMAT-NEXT: sd a5, 200(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v12, (a5) +; NOREMAT-NEXT: slli a2, a2, 11 +; NOREMAT-NEXT: sd a2, 216(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: 
sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a5) +; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: li s2, 13 -; NOREMAT-NEXT: slli a5, s2, 9 -; NOREMAT-NEXT: sd a5, 192(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v14, (a5) +; NOREMAT-NEXT: slli a2, s2, 9 +; NOREMAT-NEXT: sd a2, 208(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a5) -; NOREMAT-NEXT: slli a5, a3, 10 -; NOREMAT-NEXT: sd a5, 184(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v12, (a5) +; NOREMAT-NEXT: vle32.v v10, (a2) +; NOREMAT-NEXT: slli a2, a4, 10 +; NOREMAT-NEXT: sd a2, 200(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a5) -; NOREMAT-NEXT: li t0, 15 -; NOREMAT-NEXT: slli a5, t0, 9 -; NOREMAT-NEXT: sd a5, 176(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v14, (a5) +; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: li a2, 15 +; NOREMAT-NEXT: slli a6, a2, 9 +; NOREMAT-NEXT: sd a6, 192(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a6, a0, a6 +; NOREMAT-NEXT: vle32.v v26, (a6) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a5) -; NOREMAT-NEXT: lui a5, 2 +; NOREMAT-NEXT: vle32.v v16, (a6) +; NOREMAT-NEXT: lui a6, 2 +; NOREMAT-NEXT: add a6, a0, a6 +; NOREMAT-NEXT: vle32.v v28, (a6) +; NOREMAT-NEXT: vle32.v v10, (a6) +; NOREMAT-NEXT: li a6, 17 +; NOREMAT-NEXT: slli a6, a6, 9 +; NOREMAT-NEXT: sd a6, 184(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li t0, 17 +; NOREMAT-NEXT: add a6, a0, a6 +; NOREMAT-NEXT: vle32.v v30, (a6) +; NOREMAT-NEXT: vle32.v v18, (a6) +; NOREMAT-NEXT: slli a6, a3, 10 +; NOREMAT-NEXT: sd a6, 176(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a6, a0, a6 +; NOREMAT-NEXT: vle32.v v0, (a6) +; NOREMAT-NEXT: vle32.v v20, (a6) +; NOREMAT-NEXT: li a6, 19 +; NOREMAT-NEXT: slli a6, a6, 9 +; NOREMAT-NEXT: sd a6, 168(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: li a7, 19 +; NOREMAT-NEXT: add a6, a0, a6 +; NOREMAT-NEXT: vle32.v v2, (a6) +; NOREMAT-NEXT: vle32.v v22, (a6) +; NOREMAT-NEXT: slli a5, a5, 11 +; NOREMAT-NEXT: sd a5, 160(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a5, a0, a5 +; NOREMAT-NEXT: vle32.v v4, (a5) ; NOREMAT-NEXT: vle32.v v12, (a5) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a5) -; NOREMAT-NEXT: li a5, 17 -; NOREMAT-NEXT: slli a5, a5, 9 -; NOREMAT-NEXT: sd a5, 168(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: li a7, 17 +; NOREMAT-NEXT: li s10, 21 +; NOREMAT-NEXT: slli a5, s10, 9 +; NOREMAT-NEXT: sd a5, 152(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a5, a0, a5 +; NOREMAT-NEXT: vle32.v v24, (a5) ; NOREMAT-NEXT: vle32.v v14, (a5) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a5) -; NOREMAT-NEXT: slli a5, a2, 10 -; NOREMAT-NEXT: sd a5, 160(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v26 +; NOREMAT-NEXT: slli a5, s8, 10 +; NOREMAT-NEXT: sd a5, 144(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v12, (a5) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 +; NOREMAT-NEXT: vle32.v v26, (a5) ; NOREMAT-NEXT: vle32.v v8, (a5) -; NOREMAT-NEXT: li a5, 19 -; NOREMAT-NEXT: slli a5, a5, 9 -; NOREMAT-NEXT: sd a5, 152(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: li a6, 19 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v28 +; NOREMAT-NEXT: li s6, 23 +; NOREMAT-NEXT: slli 
a5, s6, 9 +; NOREMAT-NEXT: sd a5, 136(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a5, a0, a5 -; NOREMAT-NEXT: vle32.v v14, (a5) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 +; NOREMAT-NEXT: vle32.v v28, (a5) +; NOREMAT-NEXT: vle32.v v16, (a5) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v30 +; NOREMAT-NEXT: lui a5, 3 +; NOREMAT-NEXT: add a5, a0, a5 +; NOREMAT-NEXT: vle32.v v30, (a5) ; NOREMAT-NEXT: vle32.v v10, (a5) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v18, v0 +; NOREMAT-NEXT: li s3, 25 +; NOREMAT-NEXT: slli a5, s3, 9 +; NOREMAT-NEXT: sd a5, 128(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a5, a0, a5 +; NOREMAT-NEXT: vle32.v v0, (a5) +; NOREMAT-NEXT: vle32.v v18, (a5) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v2 +; NOREMAT-NEXT: slli a5, s2, 10 +; NOREMAT-NEXT: sd a5, 120(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a5, a0, a5 +; NOREMAT-NEXT: vle32.v v2, (a5) +; NOREMAT-NEXT: vle32.v v20, (a5) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v4 +; NOREMAT-NEXT: li t5, 27 +; NOREMAT-NEXT: slli a5, t5, 9 +; NOREMAT-NEXT: sd a5, 112(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a5, a0, a5 +; NOREMAT-NEXT: vle32.v v4, (a5) +; NOREMAT-NEXT: vle32.v v22, (a5) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v24 ; NOREMAT-NEXT: slli a4, a4, 11 -; NOREMAT-NEXT: sd a4, 144(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a4, 104(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a0, a4 +; NOREMAT-NEXT: vle32.v v24, (a4) ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: li s10, 21 -; NOREMAT-NEXT: slli a4, s10, 9 -; NOREMAT-NEXT: sd a4, 136(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v26 +; NOREMAT-NEXT: li t2, 29 +; NOREMAT-NEXT: slli a4, t2, 9 +; NOREMAT-NEXT: sd a4, 96(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a0, a4 +; NOREMAT-NEXT: vle32.v v26, (a4) ; NOREMAT-NEXT: vle32.v v14, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: slli a4, s8, 10 -; NOREMAT-NEXT: sd a4, 128(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v28 +; NOREMAT-NEXT: slli a4, a2, 10 +; NOREMAT-NEXT: sd a4, 88(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a0, a4 -; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 +; NOREMAT-NEXT: vle32.v v28, (a4) ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: li s6, 23 -; NOREMAT-NEXT: slli a4, s6, 9 -; NOREMAT-NEXT: sd a4, 120(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a0, a4 -; NOREMAT-NEXT: vle32.v v14, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: lui a4, 3 +; NOREMAT-NEXT: csrr a4, vlenb +; NOREMAT-NEXT: slli a4, a4, 2 +; NOREMAT-NEXT: add a4, sp, a4 +; NOREMAT-NEXT: addi a4, a4, 288 +; NOREMAT-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v30 +; NOREMAT-NEXT: li a5, 31 +; NOREMAT-NEXT: slli a4, a5, 9 +; NOREMAT-NEXT: sd a4, 80(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a0, a4 -; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 +; NOREMAT-NEXT: vle32.v v30, (a4) +; NOREMAT-NEXT: vle32.v v16, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v0 +; NOREMAT-NEXT: lui a6, 4 +; NOREMAT-NEXT: add a4, a0, a6 +; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: li s3, 25 -; NOREMAT-NEXT: slli a4, s3, 9 -; NOREMAT-NEXT: sd a4, 112(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: csrr a4, vlenb +; NOREMAT-NEXT: slli a4, a4, 1 +; NOREMAT-NEXT: add a4, sp, a4 +; NOREMAT-NEXT: addi a4, a4, 288 +; 
NOREMAT-NEXT: vs2r.v v8, (a4) # Unknown-size Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v18, v2 +; NOREMAT-NEXT: addiw a4, a6, 512 +; NOREMAT-NEXT: sd a4, 72(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a0, a4 -; NOREMAT-NEXT: vle32.v v14, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: slli a4, s2, 10 -; NOREMAT-NEXT: sd a4, 104(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vle32.v v2, (a4) +; NOREMAT-NEXT: vle32.v v18, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v4 +; NOREMAT-NEXT: slli a4, t0, 10 +; NOREMAT-NEXT: sd a4, 64(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a0, a4 -; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: li t5, 27 -; NOREMAT-NEXT: slli a4, t5, 9 -; NOREMAT-NEXT: sd a4, 96(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vle32.v v4, (a4) +; NOREMAT-NEXT: vle32.v v20, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v24 +; NOREMAT-NEXT: addiw a4, a6, 1536 +; NOREMAT-NEXT: sd a4, 56(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a4, a0, a4 -; NOREMAT-NEXT: vle32.v v14, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a4) +; NOREMAT-NEXT: vle32.v v6, (a4) +; NOREMAT-NEXT: vle32.v v22, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v26 ; NOREMAT-NEXT: slli a3, a3, 11 -; NOREMAT-NEXT: sd a3, 88(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sd a3, 48(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 ; NOREMAT-NEXT: vle32.v v8, (a3) -; NOREMAT-NEXT: li t2, 29 -; NOREMAT-NEXT: slli a3, t2, 9 -; NOREMAT-NEXT: sd a3, 80(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 -; NOREMAT-NEXT: vle32.v v14, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a3) -; NOREMAT-NEXT: slli a3, t0, 10 -; NOREMAT-NEXT: sd a3, 72(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: addi a3, sp, 288 +; NOREMAT-NEXT: vs2r.v v8, (a3) # Unknown-size Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v28 +; NOREMAT-NEXT: lui s1, 5 +; NOREMAT-NEXT: addiw a3, s1, -1536 +; NOREMAT-NEXT: sd a3, 40(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a0, a3 -; NOREMAT-NEXT: vle32.v v12, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 ; NOREMAT-NEXT: vle32.v v8, (a3) -; NOREMAT-NEXT: li a5, 31 -; NOREMAT-NEXT: slli a3, a5, 9 -; NOREMAT-NEXT: sd a3, 64(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vle32.v v24, (a3) +; NOREMAT-NEXT: csrr a3, vlenb +; NOREMAT-NEXT: slli a3, a3, 2 +; NOREMAT-NEXT: add a3, sp, a3 +; NOREMAT-NEXT: addi a3, a3, 288 +; NOREMAT-NEXT: vl2r.v v10, (a3) # Unknown-size Folded Reload +; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v30 +; NOREMAT-NEXT: slli a3, a7, 10 +; NOREMAT-NEXT: sd a3, 32(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a0, a3 -; NOREMAT-NEXT: vle32.v v14, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 ; NOREMAT-NEXT: vle32.v v10, (a3) -; NOREMAT-NEXT: lui a4, 4 -; NOREMAT-NEXT: add a3, a0, a4 -; NOREMAT-NEXT: vle32.v v12, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a3) -; NOREMAT-NEXT: addiw a3, a4, 512 -; NOREMAT-NEXT: sd a3, 56(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v14, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a3) -; NOREMAT-NEXT: slli a3, a7, 10 -; NOREMAT-NEXT: sd a3, 48(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a0, a3 -; NOREMAT-NEXT: vle32.v v12, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a3) 
-; NOREMAT-NEXT: addiw a3, a4, 1536 -; NOREMAT-NEXT: sd a3, 40(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v0 +; NOREMAT-NEXT: addiw a3, s1, -512 +; NOREMAT-NEXT: sd a3, 24(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: add a3, a0, a3 -; NOREMAT-NEXT: vle32.v v14, (a3) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a3) -; NOREMAT-NEXT: slli a2, a2, 11 -; NOREMAT-NEXT: sd a2, 32(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: lui s1, 5 -; NOREMAT-NEXT: addiw a2, s1, -1536 -; NOREMAT-NEXT: sd a2, 24(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) -; NOREMAT-NEXT: slli a2, a6, 10 -; NOREMAT-NEXT: sd a2, 16(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: addiw a2, s1, -512 -; NOREMAT-NEXT: sd a2, 8(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a0, a2 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) -; NOREMAT-NEXT: add a2, a0, s1 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v0, (a3) +; NOREMAT-NEXT: vle32.v v16, (a3) +; NOREMAT-NEXT: csrr a3, vlenb +; NOREMAT-NEXT: slli a3, a3, 1 +; NOREMAT-NEXT: add a3, sp, a3 +; NOREMAT-NEXT: addi a3, a3, 288 +; NOREMAT-NEXT: vl2r.v v26, (a3) # Unknown-size Folded Reload +; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v2 +; NOREMAT-NEXT: add a3, a0, s1 +; NOREMAT-NEXT: vle32.v v26, (a3) +; NOREMAT-NEXT: vle32.v v28, (a3) +; NOREMAT-NEXT: csrr a3, vlenb +; NOREMAT-NEXT: slli a3, a3, 2 +; NOREMAT-NEXT: add a3, sp, a3 +; NOREMAT-NEXT: addi a3, a3, 288 +; NOREMAT-NEXT: vs2r.v v28, (a3) # Unknown-size Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v18, v4 ; NOREMAT-NEXT: addiw ra, s1, 512 -; NOREMAT-NEXT: add a2, a0, ra -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) +; NOREMAT-NEXT: add a3, a0, ra +; NOREMAT-NEXT: vle32.v v28, (a3) +; NOREMAT-NEXT: vle32.v v30, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v6 ; NOREMAT-NEXT: slli s11, s10, 10 -; NOREMAT-NEXT: add a2, a0, s11 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: add a3, a0, s11 +; NOREMAT-NEXT: vle32.v v2, (a3) +; NOREMAT-NEXT: vle32.v v18, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v12 ; NOREMAT-NEXT: addiw s10, s1, 1536 -; NOREMAT-NEXT: add a2, a0, s10 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) +; NOREMAT-NEXT: add a3, a0, s10 +; NOREMAT-NEXT: vle32.v v4, (a3) +; NOREMAT-NEXT: vle32.v v20, (a3) +; NOREMAT-NEXT: addi a3, sp, 288 +; NOREMAT-NEXT: vl2r.v v12, (a3) # Unknown-size Folded Reload +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v8 ; NOREMAT-NEXT: slli s9, s8, 11 -; NOREMAT-NEXT: add a2, a0, s9 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: lui t1, 6 -; NOREMAT-NEXT: addiw s8, t1, -1536 -; NOREMAT-NEXT: add a2, a0, s8 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) +; NOREMAT-NEXT: add a3, a0, s9 +; 
NOREMAT-NEXT: vle32.v v6, (a3) +; NOREMAT-NEXT: vle32.v v12, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v10 +; NOREMAT-NEXT: lui t0, 6 +; NOREMAT-NEXT: addiw s8, t0, -1536 +; NOREMAT-NEXT: add a3, a0, s8 +; NOREMAT-NEXT: vle32.v v8, (a3) +; NOREMAT-NEXT: vle32.v v22, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v0 ; NOREMAT-NEXT: slli s7, s6, 10 -; NOREMAT-NEXT: add a2, a0, s7 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: addiw s6, t1, -512 -; NOREMAT-NEXT: add a2, a0, s6 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) -; NOREMAT-NEXT: add a2, a0, t1 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: addiw s5, t1, 512 -; NOREMAT-NEXT: add a2, a0, s5 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) +; NOREMAT-NEXT: add a3, a0, s7 +; NOREMAT-NEXT: vle32.v v10, (a3) +; NOREMAT-NEXT: vle32.v v14, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v26 +; NOREMAT-NEXT: addiw s6, t0, -512 +; NOREMAT-NEXT: add a3, a0, s6 +; NOREMAT-NEXT: vle32.v v0, (a3) +; NOREMAT-NEXT: vle32.v v16, (a3) +; NOREMAT-NEXT: csrr a3, vlenb +; NOREMAT-NEXT: slli a3, a3, 2 +; NOREMAT-NEXT: add a3, sp, a3 +; NOREMAT-NEXT: addi a3, a3, 288 +; NOREMAT-NEXT: vl2r.v v24, (a3) # Unknown-size Folded Reload +; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v28 +; NOREMAT-NEXT: add a3, a0, t0 +; NOREMAT-NEXT: vle32.v v24, (a3) +; NOREMAT-NEXT: vle32.v v26, (a3) +; NOREMAT-NEXT: csrr a3, vlenb +; NOREMAT-NEXT: slli a3, a3, 2 +; NOREMAT-NEXT: add a3, sp, a3 +; NOREMAT-NEXT: addi a3, a3, 288 +; NOREMAT-NEXT: vs2r.v v26, (a3) # Unknown-size Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v2 +; NOREMAT-NEXT: addiw s5, t0, 512 +; NOREMAT-NEXT: add a3, a0, s5 +; NOREMAT-NEXT: vle32.v v26, (a3) +; NOREMAT-NEXT: vle32.v v28, (a3) +; NOREMAT-NEXT: csrr a3, vlenb +; NOREMAT-NEXT: slli a3, a3, 1 +; NOREMAT-NEXT: add a3, sp, a3 +; NOREMAT-NEXT: addi a3, a3, 288 +; NOREMAT-NEXT: vs2r.v v28, (a3) # Unknown-size Folded Spill +; NOREMAT-NEXT: sf.vc.vv 3, 0, v18, v4 ; NOREMAT-NEXT: slli s4, s3, 10 -; NOREMAT-NEXT: add a2, a0, s4 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) -; NOREMAT-NEXT: addiw s3, t1, 1536 -; NOREMAT-NEXT: add a2, a0, s3 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) +; NOREMAT-NEXT: add a3, a0, s4 +; NOREMAT-NEXT: vle32.v v28, (a3) +; NOREMAT-NEXT: vle32.v v18, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v6 +; NOREMAT-NEXT: addiw s3, t0, 1536 +; NOREMAT-NEXT: add a3, a0, s3 +; NOREMAT-NEXT: vle32.v v30, (a3) +; NOREMAT-NEXT: vle32.v v20, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v8 ; NOREMAT-NEXT: slli s2, s2, 11 -; NOREMAT-NEXT: add a2, a0, s2 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: add a3, a0, s2 +; NOREMAT-NEXT: vle32.v v2, (a3) +; NOREMAT-NEXT: vle32.v v12, (a3) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v10 ; NOREMAT-NEXT: lui a3, 7 ; NOREMAT-NEXT: addiw s0, a3, -1536 -; NOREMAT-NEXT: add a2, a0, s0 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) +; NOREMAT-NEXT: add a4, a0, s0 +; NOREMAT-NEXT: vle32.v v4, (a4) +; NOREMAT-NEXT: vle32.v v22, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v0 ; NOREMAT-NEXT: slli 
t6, t5, 10 -; NOREMAT-NEXT: add a2, a0, t6 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: add a4, a0, t6 +; NOREMAT-NEXT: vle32.v v6, (a4) +; NOREMAT-NEXT: vle32.v v14, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24 ; NOREMAT-NEXT: addiw t5, a3, -512 -; NOREMAT-NEXT: add a2, a0, t5 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) -; NOREMAT-NEXT: add a2, a0, a3 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: add a4, a0, t5 +; NOREMAT-NEXT: vle32.v v0, (a4) +; NOREMAT-NEXT: vle32.v v16, (a4) +; NOREMAT-NEXT: csrr a4, vlenb +; NOREMAT-NEXT: slli a4, a4, 2 +; NOREMAT-NEXT: add a4, sp, a4 +; NOREMAT-NEXT: addi a4, a4, 288 +; NOREMAT-NEXT: vl2r.v v8, (a4) # Unknown-size Folded Reload +; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v26 +; NOREMAT-NEXT: add a4, a0, a3 +; NOREMAT-NEXT: vle32.v v26, (a4) +; NOREMAT-NEXT: vle32.v v8, (a4) +; NOREMAT-NEXT: csrr a4, vlenb +; NOREMAT-NEXT: slli a4, a4, 1 +; NOREMAT-NEXT: add a4, sp, a4 +; NOREMAT-NEXT: addi a4, a4, 288 +; NOREMAT-NEXT: vl2r.v v10, (a4) # Unknown-size Folded Reload +; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; NOREMAT-NEXT: addiw t4, a3, 512 -; NOREMAT-NEXT: add a2, a0, t4 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) +; NOREMAT-NEXT: add a4, a0, t4 +; NOREMAT-NEXT: vle32.v v10, (a4) +; NOREMAT-NEXT: vle32.v v24, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v18, v30 ; NOREMAT-NEXT: slli t3, t2, 10 -; NOREMAT-NEXT: add a2, a0, t3 -; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: add a4, a0, t3 +; NOREMAT-NEXT: vle32.v v18, (a4) +; NOREMAT-NEXT: vle32.v v28, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v2 ; NOREMAT-NEXT: addiw t2, a3, 1536 -; NOREMAT-NEXT: add a2, a0, t2 -; NOREMAT-NEXT: vle32.v v14, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a2) -; NOREMAT-NEXT: slli t0, t0, 11 -; NOREMAT-NEXT: add a2, a0, t0 +; NOREMAT-NEXT: add a4, a0, t2 +; NOREMAT-NEXT: vle32.v v20, (a4) +; NOREMAT-NEXT: vle32.v v30, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 +; NOREMAT-NEXT: slli t1, a2, 11 +; NOREMAT-NEXT: add a2, a0, t1 ; NOREMAT-NEXT: vle32.v v12, (a2) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v2, (a2) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v6 ; NOREMAT-NEXT: lui a2, 8 ; NOREMAT-NEXT: addiw a7, a2, -1536 ; NOREMAT-NEXT: add a4, a0, a7 -; NOREMAT-NEXT: vle32.v v14, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a4) +; NOREMAT-NEXT: vle32.v v22, (a4) +; NOREMAT-NEXT: vle32.v v4, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v0 ; NOREMAT-NEXT: slli a6, a5, 10 ; NOREMAT-NEXT: add a4, a0, a6 -; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: vle32.v v8, (a4) +; NOREMAT-NEXT: vle32.v v14, (a4) +; NOREMAT-NEXT: vle32.v v0, (a4) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v26 ; NOREMAT-NEXT: addiw a5, a2, -512 ; NOREMAT-NEXT: add a4, a0, a5 -; NOREMAT-NEXT: vle32.v v14, (a4) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; NOREMAT-NEXT: vle32.v v10, (a4) +; NOREMAT-NEXT: vle32.v v16, (a4) +; NOREMAT-NEXT: vle32.v v26, (a4) ; NOREMAT-NEXT: add a0, a0, a2 -; NOREMAT-NEXT: vle32.v v12, (a0) -; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 +; NOREMAT-NEXT: 
vle32.v v6, (a0) +; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v18 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v20 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v22 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v14 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v0, v16 +; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v6 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: addi a0, a1, 1024 ; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: addi a0, a1, 1536 -; NOREMAT-NEXT: vse32.v v10, (a0) +; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: ld a0, 272(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: ld a0, 264(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 256(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 248(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: ld a0, 240(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: lui a0, 1 ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: ld a0, 232(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: ld a0, 240(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: lui a0, 1 +; NOREMAT-NEXT: ld a0, 232(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 224(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: ld a0, 216(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 208(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: ld a0, 200(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 192(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: ld a0, 184(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: lui a0, 2 ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: ld a0, 176(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: ld a0, 184(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: lui a0, 2 +; 
NOREMAT-NEXT: ld a0, 176(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 168(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: ld a0, 160(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 152(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: ld a0, 144(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 136(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: ld a0, 128(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: lui a0, 3 ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: ld a0, 120(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: ld a0, 128(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: lui a0, 3 +; NOREMAT-NEXT: ld a0, 120(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 112(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: ld a0, 104(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 96(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: ld a0, 88(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 80(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: ld a0, 72(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: lui a0, 4 ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: ld a0, 64(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: ld a0, 72(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: lui a0, 4 +; NOREMAT-NEXT: ld a0, 64(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 56(sp) 
# 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: ld a0, 48(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: ld a0, 40(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: ld a0, 32(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: ld a0, 24(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: ld a0, 16(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld a0, 24(sp) # 8-byte Folded Reload ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: ld a0, 8(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: add s1, a1, s1 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (s1) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: add ra, a1, ra +; NOREMAT-NEXT: vse32.v v8, (ra) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (ra) ; NOREMAT-NEXT: add s11, a1, s11 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (s11) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: add s10, a1, s10 +; NOREMAT-NEXT: vse32.v v8, (s10) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (s10) ; NOREMAT-NEXT: add s9, a1, s9 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (s9) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: add s8, a1, s8 +; NOREMAT-NEXT: vse32.v v8, (s8) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (s8) ; NOREMAT-NEXT: add s7, a1, s7 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (s7) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: add s6, a1, s6 +; NOREMAT-NEXT: vse32.v v8, (s6) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: add t0, a1, t0 +; NOREMAT-NEXT: vse32.v v8, (t0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (s6) -; NOREMAT-NEXT: add t1, a1, t1 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; NOREMAT-NEXT: vse32.v v8, (t1) ; NOREMAT-NEXT: add s5, a1, s5 +; NOREMAT-NEXT: vse32.v v8, (s5) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (s5) ; NOREMAT-NEXT: add s4, a1, s4 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (s4) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: add s3, a1, s3 +; NOREMAT-NEXT: vse32.v v8, (s3) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (s3) ; NOREMAT-NEXT: add s2, a1, s2 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (s2) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: add s0, a1, s0 +; NOREMAT-NEXT: vse32.v v8, (s0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (s0) ; NOREMAT-NEXT: add t6, a1, t6 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (t6) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: add t5, a1, t5 +; NOREMAT-NEXT: vse32.v v8, (t5) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, 
(t5) ; NOREMAT-NEXT: add a3, a1, a3 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a3) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: add t4, a1, t4 +; NOREMAT-NEXT: vse32.v v8, (t4) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (t4) ; NOREMAT-NEXT: add t3, a1, t3 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (t3) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: add t2, a1, t2 +; NOREMAT-NEXT: vse32.v v8, (t2) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: add t1, a1, t1 +; NOREMAT-NEXT: vse32.v v8, (t1) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (t2) -; NOREMAT-NEXT: add t0, a1, t0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; NOREMAT-NEXT: vse32.v v8, (t0) ; NOREMAT-NEXT: add a7, a1, a7 +; NOREMAT-NEXT: vse32.v v8, (a7) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a7) ; NOREMAT-NEXT: add a6, a1, a6 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a6) -; NOREMAT-NEXT: add a5, a1, a5 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a5) -; NOREMAT-NEXT: add a0, a1, a2 +; NOREMAT-NEXT: add a5, a1, a5 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: addiw a0, a2, 512 -; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: vse32.v v8, (a5) +; NOREMAT-NEXT: add a0, a1, a2 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: addiw a0, a2, 1024 +; NOREMAT-NEXT: addiw a0, a2, 512 ; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) -; NOREMAT-NEXT: addiw a0, a2, 1536 +; NOREMAT-NEXT: addiw a0, a2, 1024 ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v10, (a0) -; NOREMAT-NEXT: li a0, 17 -; NOREMAT-NEXT: slli a0, a0, 11 +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: addiw a0, a2, 1536 ; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: li a0, 17 +; NOREMAT-NEXT: slli a0, a0, 11 +; NOREMAT-NEXT: add a0, a1, a0 +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: lui a0, 9 ; NOREMAT-NEXT: addiw a2, a0, -1536 ; NOREMAT-NEXT: add a2, a1, a2 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a2) -; NOREMAT-NEXT: addiw a2, a0, -1024 -; NOREMAT-NEXT: add a2, a1, a2 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a2) -; NOREMAT-NEXT: addiw a2, a0, -512 +; NOREMAT-NEXT: addiw a2, a0, -1024 ; NOREMAT-NEXT: add a2, a1, a2 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v10, (a2) -; NOREMAT-NEXT: add a2, a1, a0 +; NOREMAT-NEXT: addiw a2, a0, -512 +; NOREMAT-NEXT: add a2, a1, a2 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a2) -; NOREMAT-NEXT: addiw a2, a0, 512 -; NOREMAT-NEXT: add a2, a1, a2 +; NOREMAT-NEXT: add a2, a1, a0 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v10, (a2) -; NOREMAT-NEXT: addiw a2, a0, 1024 +; NOREMAT-NEXT: addiw a2, a0, 512 ; NOREMAT-NEXT: add a2, a1, a2 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a2) +; NOREMAT-NEXT: addiw a2, a0, 1024 +; NOREMAT-NEXT: add a2, a1, a2 +; NOREMAT-NEXT: vse32.v v10, (a2) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: addiw a0, a0, 1536 ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 +; 
NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: li a0, 19 ; NOREMAT-NEXT: slli a0, a0, 11 ; NOREMAT-NEXT: add a0, a1, a0 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 -; NOREMAT-NEXT: vse32.v v8, (a0) +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: vse32.v v10, (a0) ; NOREMAT-NEXT: lui a0, 10 ; NOREMAT-NEXT: addiw a2, a0, -1536 ; NOREMAT-NEXT: add a2, a1, a2 -; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a2) -; NOREMAT-NEXT: addiw a2, a0, -1024 -; NOREMAT-NEXT: add a2, a1, a2 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a2) -; NOREMAT-NEXT: addiw a2, a0, -512 +; NOREMAT-NEXT: addiw a2, a0, -1024 ; NOREMAT-NEXT: add a2, a1, a2 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; NOREMAT-NEXT: vse32.v v10, (a2) -; NOREMAT-NEXT: add a2, a1, a0 +; NOREMAT-NEXT: addiw a2, a0, -512 +; NOREMAT-NEXT: add a2, a1, a2 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; NOREMAT-NEXT: vse32.v v8, (a2) +; NOREMAT-NEXT: add a2, a1, a0 +; NOREMAT-NEXT: vse32.v v10, (a2) ; NOREMAT-NEXT: addiw a0, a0, 512 ; NOREMAT-NEXT: add a0, a1, a0 ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: vse32.v v10, (a0) +; NOREMAT-NEXT: vse32.v v8, (a0) ; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; NOREMAT-NEXT: ld ra, 360(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s0, 352(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s1, 344(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s2, 336(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s3, 328(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s4, 320(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s5, 312(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s6, 304(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s7, 296(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s8, 288(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s9, 280(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s10, 272(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: ld s11, 264(sp) # 8-byte Folded Reload -; NOREMAT-NEXT: addi sp, sp, 368 +; NOREMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 +; NOREMAT-NEXT: csrr a0, vlenb +; NOREMAT-NEXT: li a1, 6 +; NOREMAT-NEXT: mul a0, a0, a1 +; NOREMAT-NEXT: add sp, sp, a0 +; NOREMAT-NEXT: ld ra, 392(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s0, 384(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s1, 376(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s2, 368(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s3, 360(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s4, 352(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s5, 344(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s6, 336(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s7, 328(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s8, 320(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s9, 312(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s10, 304(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: ld s11, 296(sp) # 8-byte Folded Reload +; NOREMAT-NEXT: addi sp, sp, 400 ; NOREMAT-NEXT: ret ; ; REMAT-LABEL: test: @@ -864,512 +927,512 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; REMAT-NEXT: add a2, a0, a2 ; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v12, (a2) ; REMAT-NEXT: li a2, 11 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v10, (a2) ; REMAT-NEXT: li a2, 23 ; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; 
REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v16 +; REMAT-NEXT: vle32.v v12, (a2) ; REMAT-NEXT: lui a2, 3 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: vle32.v v14, (a2) ; REMAT-NEXT: li a2, 25 ; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: vle32.v v16, (a2) ; REMAT-NEXT: li a2, 13 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: vle32.v v18, (a2) ; REMAT-NEXT: li a2, 27 ; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: vle32.v v20, (a2) ; REMAT-NEXT: li a2, 7 ; REMAT-NEXT: slli a2, a2, 11 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: vle32.v v22, (a2) ; REMAT-NEXT: li a2, 29 ; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v26 ; REMAT-NEXT: li a2, 15 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v28 ; REMAT-NEXT: li a2, 31 ; REMAT-NEXT: slli a2, a2, 9 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v14, v30 ; REMAT-NEXT: lui a2, 4 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v0 ; REMAT-NEXT: lui a2, 4 ; REMAT-NEXT: addiw a2, a2, 512 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v2 ; REMAT-NEXT: li a2, 17 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v4 ; REMAT-NEXT: lui a2, 4 ; REMAT-NEXT: addiw a2, a2, 1536 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: vle32.v v20, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v24 ; REMAT-NEXT: li a2, 9 ; REMAT-NEXT: slli a2, a2, 11 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, 
(a2) +; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v26 ; REMAT-NEXT: lui a2, 5 ; REMAT-NEXT: addiw a2, a2, -1536 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v28 ; REMAT-NEXT: li a2, 19 ; REMAT-NEXT: slli a2, a2, 10 ; REMAT-NEXT: add a2, a0, a2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v30 ; REMAT-NEXT: lui ra, 5 ; REMAT-NEXT: addiw ra, ra, -512 ; REMAT-NEXT: add a2, a0, ra -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v14, v0 ; REMAT-NEXT: lui s11, 5 ; REMAT-NEXT: add a2, a0, s11 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v2 ; REMAT-NEXT: lui s10, 5 ; REMAT-NEXT: addiw s10, s10, 512 ; REMAT-NEXT: add a2, a0, s10 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v4 ; REMAT-NEXT: li s9, 21 ; REMAT-NEXT: slli s9, s9, 10 ; REMAT-NEXT: add a2, a0, s9 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v24 ; REMAT-NEXT: lui s8, 5 ; REMAT-NEXT: addiw s8, s8, 1536 ; REMAT-NEXT: add a2, a0, s8 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: vle32.v v20, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v26 ; REMAT-NEXT: li s7, 11 ; REMAT-NEXT: slli s7, s7, 11 ; REMAT-NEXT: add a2, a0, s7 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v28 ; REMAT-NEXT: lui s6, 6 ; REMAT-NEXT: addiw s6, s6, -1536 ; REMAT-NEXT: add a2, a0, s6 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v30 ; REMAT-NEXT: li s5, 23 ; REMAT-NEXT: slli s5, s5, 10 ; REMAT-NEXT: add a2, a0, s5 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v0 ; REMAT-NEXT: lui s4, 6 ; REMAT-NEXT: addiw s4, s4, -512 ; REMAT-NEXT: add a2, a0, s4 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v14, v2 ; REMAT-NEXT: lui s3, 6 ; REMAT-NEXT: add a2, a0, s3 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: vle32.v v14, (a2) +; REMAT-NEXT: sf.vc.vv 3, 
0, v16, v4 ; REMAT-NEXT: lui s2, 6 ; REMAT-NEXT: addiw s2, s2, 512 ; REMAT-NEXT: add a2, a0, s2 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v4, (a2) +; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v24 ; REMAT-NEXT: li s1, 25 ; REMAT-NEXT: slli s1, s1, 10 ; REMAT-NEXT: add a2, a0, s1 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v6, (a2) +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v26 ; REMAT-NEXT: lui s0, 6 ; REMAT-NEXT: addiw s0, s0, 1536 ; REMAT-NEXT: add a2, a0, s0 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: vle32.v v20, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v28 ; REMAT-NEXT: li t6, 13 ; REMAT-NEXT: slli t6, t6, 11 ; REMAT-NEXT: add a2, a0, t6 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v8, v30 ; REMAT-NEXT: lui t5, 7 ; REMAT-NEXT: addiw t5, t5, -1536 ; REMAT-NEXT: add a2, a0, t5 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v0 ; REMAT-NEXT: li t4, 27 ; REMAT-NEXT: slli t4, t4, 10 ; REMAT-NEXT: add a2, a0, t4 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v2 ; REMAT-NEXT: lui t3, 7 ; REMAT-NEXT: addiw t3, t3, -512 ; REMAT-NEXT: add a2, a0, t3 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v14, v4 ; REMAT-NEXT: lui t2, 7 ; REMAT-NEXT: add a2, a0, t2 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 +; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v6 ; REMAT-NEXT: lui t1, 7 ; REMAT-NEXT: addiw t1, t1, 512 ; REMAT-NEXT: add a2, a0, t1 ; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v16, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v18, v26 ; REMAT-NEXT: li t0, 29 ; REMAT-NEXT: slli t0, t0, 10 ; REMAT-NEXT: add a2, a0, t0 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v18, (a2) +; REMAT-NEXT: vle32.v v26, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v20, v28 ; REMAT-NEXT: lui a7, 7 ; REMAT-NEXT: addiw a7, a7, 1536 ; REMAT-NEXT: add a2, a0, a7 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v20, (a2) +; REMAT-NEXT: vle32.v v28, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v22, v30 ; REMAT-NEXT: li a6, 15 ; REMAT-NEXT: slli a6, a6, 11 ; REMAT-NEXT: add a2, a0, a6 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v22, (a2) +; REMAT-NEXT: vle32.v v30, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v24, v0 ; REMAT-NEXT: lui a5, 8 ; REMAT-NEXT: addiw a5, a5, -1536 ; REMAT-NEXT: add a2, a0, a5 -; REMAT-NEXT: vle32.v v14, (a2) -; 
REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v24, (a2) +; REMAT-NEXT: vle32.v v0, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v10, v2 ; REMAT-NEXT: li a4, 31 ; REMAT-NEXT: slli a4, a4, 10 ; REMAT-NEXT: add a2, a0, a4 -; REMAT-NEXT: vle32.v v12, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: vle32.v v8, (a2) +; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v2, (a2) +; REMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; REMAT-NEXT: lui a3, 8 ; REMAT-NEXT: addiw a3, a3, -512 ; REMAT-NEXT: add a2, a0, a3 -; REMAT-NEXT: vle32.v v14, (a2) -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 -; REMAT-NEXT: vle32.v v10, (a2) +; REMAT-NEXT: vle32.v v12, (a2) +; REMAT-NEXT: vle32.v v4, (a2) ; REMAT-NEXT: lui a2, 8 ; REMAT-NEXT: add a0, a0, a2 -; REMAT-NEXT: vle32.v v12, (a0) +; REMAT-NEXT: vle32.v v6, (a0) ; REMAT-NEXT: sf.vc.vv 3, 0, v8, v14 -; REMAT-NEXT: sf.vc.vv 3, 0, v10, v12 +; REMAT-NEXT: sf.vc.vv 3, 0, v16, v18 +; REMAT-NEXT: sf.vc.vv 3, 0, v26, v20 +; REMAT-NEXT: sf.vc.vv 3, 0, v28, v22 +; REMAT-NEXT: sf.vc.vv 3, 0, v30, v24 +; REMAT-NEXT: sf.vc.vv 3, 0, v0, v10 +; REMAT-NEXT: sf.vc.vv 3, 0, v2, v12 +; REMAT-NEXT: sf.vc.vv 3, 0, v4, v6 ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: addi a0, a1, 1024 ; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: addi a0, a1, 1536 -; REMAT-NEXT: vse32.v v10, (a0) +; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 1 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 5 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 3 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 7 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: lui a0, 1 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 9 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 5 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 11 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 3 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 13 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 7 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 15 ; REMAT-NEXT: slli a0, a0, 9 ; 
REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: lui a0, 2 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 17 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 9 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 19 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 5 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 21 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 11 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 23 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: lui a0, 3 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 25 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 13 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 27 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 7 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 29 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 15 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: li a0, 31 ; REMAT-NEXT: slli a0, a0, 9 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: lui a0, 4 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: lui a0, 4 ; REMAT-NEXT: addiw a0, a0, 512 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 17 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: 
sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: lui a0, 4 ; REMAT-NEXT: addiw a0, a0, 1536 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 9 ; REMAT-NEXT: slli a0, a0, 11 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: lui a0, 5 ; REMAT-NEXT: addiw a0, a0, -1536 ; REMAT-NEXT: add a0, a1, a0 +; REMAT-NEXT: vse32.v v8, (a0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (a0) ; REMAT-NEXT: li a0, 19 ; REMAT-NEXT: slli a0, a0, 10 ; REMAT-NEXT: add a0, a1, a0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (a0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: add ra, a1, ra +; REMAT-NEXT: vse32.v v8, (ra) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (ra) ; REMAT-NEXT: add s11, a1, s11 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (s11) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: add s10, a1, s10 +; REMAT-NEXT: vse32.v v8, (s10) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (s10) ; REMAT-NEXT: add s9, a1, s9 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (s9) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: add s8, a1, s8 +; REMAT-NEXT: vse32.v v8, (s8) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (s8) ; REMAT-NEXT: add s7, a1, s7 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (s7) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: add s6, a1, s6 +; REMAT-NEXT: vse32.v v8, (s6) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (s6) ; REMAT-NEXT: add s5, a1, s5 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (s5) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: add s4, a1, s4 +; REMAT-NEXT: vse32.v v8, (s4) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (s4) ; REMAT-NEXT: add s3, a1, s3 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (s3) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: add s2, a1, s2 +; REMAT-NEXT: vse32.v v8, (s2) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (s2) ; REMAT-NEXT: add s1, a1, s1 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (s1) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: add s0, a1, s0 +; REMAT-NEXT: vse32.v v8, (s0) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (s0) ; REMAT-NEXT: add t6, a1, t6 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (t6) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: add t5, a1, t5 +; REMAT-NEXT: vse32.v v8, (t5) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (t5) ; REMAT-NEXT: add t4, a1, t4 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (t4) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: add t3, a1, t3 +; REMAT-NEXT: vse32.v v8, (t3) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (t3) ; REMAT-NEXT: add t2, a1, t2 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (t2) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: add t1, a1, t1 +; REMAT-NEXT: vse32.v v8, (t1) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v v10, (t1) ; REMAT-NEXT: add t0, a1, t0 -; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0 ; REMAT-NEXT: vse32.v v8, (t0) +; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 ; REMAT-NEXT: add a7, a1, a7 +; REMAT-NEXT: vse32.v v8, (a7) ; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0 -; REMAT-NEXT: vse32.v 
v10, (a7)
; REMAT-NEXT: add a6, a1, a6
-; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0
; REMAT-NEXT: vse32.v v8, (a6)
+; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: add a5, a1, a5
+; REMAT-NEXT: vse32.v v8, (a5)
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT: vse32.v v10, (a5)
; REMAT-NEXT: add a4, a1, a4
-; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0
; REMAT-NEXT: vse32.v v8, (a4)
+; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: add a3, a1, a3
+; REMAT-NEXT: vse32.v v8, (a3)
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT: vse32.v v10, (a3)
; REMAT-NEXT: add a2, a1, a2
; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0
; REMAT-NEXT: vse32.v v8, (a2)
@@ -1449,13 +1512,13 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; REMAT-NEXT: vse32.v v10, (a0)
; REMAT-NEXT: lui a0, 10
; REMAT-NEXT: add a0, a1, a0
-; REMAT-NEXT: sf.vc.v.i 2, 0, v10, 0
; REMAT-NEXT: vse32.v v8, (a0)
; REMAT-NEXT: lui a0, 10
; REMAT-NEXT: addiw a0, a0, 512
; REMAT-NEXT: add a0, a1, a0
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
-; REMAT-NEXT: vse32.v v10, (a0)
+; REMAT-NEXT: vse32.v v8, (a0)
+; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: sf.vc.v.i 2, 0, v8, 0
; REMAT-NEXT: ld ra, 104(sp) # 8-byte Folded Reload
; REMAT-NEXT: ld s0, 96(sp) # 8-byte Folded Reload

From 502ec2e5a0e5922e09e69b7da08f3e7a8c7aacac Mon Sep 17 00:00:00 2001
From: Weining Lu
Date: Wed, 24 Jan 2024 11:53:31 +0800
Subject: [PATCH 733/843] [test] Make dwarf-loongarch-relocs.ll insensitive to
 function alignment

---
 .../LoongArch/dwarf-loongarch-relocs.ll | 39 +++++++------------
 1 file changed, 15 insertions(+), 24 deletions(-)

diff --git a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll
index 07443a62b9339..d6a1d8d6e1366 100644
--- a/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll
+++ b/llvm/test/DebugInfo/LoongArch/dwarf-loongarch-relocs.ll
@@ -1,22 +1,19 @@
; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=-relax %s -o %t.o
; RUN: llvm-readobj -r %t.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-NORL %s
-; RUN: llvm-objdump --source %t.o | FileCheck --check-prefixes=SOURCE,SOURCE-NORL %s
-; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefixes=DWARF,DWARF-NORL %s
+; RUN: llvm-objdump --source %t.o | FileCheck --check-prefix=SOURCE %s
+; RUN: llvm-dwarfdump --debug-info --debug-line %t.o | FileCheck --check-prefix=DWARF %s

-; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=+relax %s -o %t.r.o
+; RUN: llc --filetype=obj --mtriple=loongarch64 --mattr=+relax --align-all-functions=2 %s -o %t.r.o
; RUN: llvm-readobj -r %t.r.o | FileCheck --check-prefixes=RELOCS-BOTH,RELOCS-ENRL %s
-; RUN: llvm-objdump --source %t.r.o | FileCheck --check-prefixes=SOURCE,SOURCE-ENRL %s
-; RUN: llvm-dwarfdump --debug-info --debug-line %t.r.o | FileCheck --check-prefixes=DWARF,DWARF-ENRL %s
+; RUN: llvm-objdump --source %t.r.o | FileCheck --check-prefix=SOURCE %s
+; RUN: llvm-dwarfdump --debug-info --debug-line %t.r.o | FileCheck --check-prefix=DWARF %s

; RELOCS-BOTH: Relocations [
; RELOCS-BOTH-NEXT: Section ({{.*}}) .rela.text {
-; RELOCS-NORL-NEXT: 0x14 R_LARCH_PCALA_HI20 sym 0x0
-; RELOCS-NORL-NEXT: 0x18 R_LARCH_PCALA_LO12 sym 0x0
-; RELOCS-ENRL-NEXT: 0x0 R_LARCH_ALIGN .Lla-relax-align0 0x5
-; RELOCS-ENRL-NEXT: 0x30 R_LARCH_PCALA_HI20 sym 0x0
-; RELOCS-ENRL-NEXT: 0x30 R_LARCH_RELAX - 0x0
-; RELOCS-ENRL-NEXT: 0x34 R_LARCH_PCALA_LO12 sym 0x0
-; RELOCS-ENRL-NEXT: 0x34 R_LARCH_RELAX - 0x0
+; RELOCS-BOTH-NEXT: 0x14 R_LARCH_PCALA_HI20 sym 0x0
+; RELOCS-ENRL-NEXT: 0x14 R_LARCH_RELAX - 0x0
+; RELOCS-BOTH-NEXT: 0x18 R_LARCH_PCALA_LO12 sym 0x0
+; RELOCS-ENRL-NEXT: 0x18 R_LARCH_RELAX - 0x0
; RELOCS-BOTH-NEXT: }
; RELOCS-BOTH: Section ({{.*}}) .rela.debug_frame {
; RELOCS-NORL-NEXT: 0x1C R_LARCH_32 .debug_frame 0x0
@@ -39,8 +36,7 @@
; RELOCS-BOTH-NEXT: }
; RELOCS-BOTH-NEXT: ]

-; SOURCE-NORL: 0000000000000000 :
-; SOURCE-ENRL: 000000000000001c :
+; SOURCE: 0000000000000000 :
; SOURCE: ; {
; SOURCE: ; asm volatile(
; SOURCE: ; return 0;
@@ -91,16 +87,11 @@
; DWARF-EMPTY:
; DWARF-NEXT: Address Line Column File ISA Discriminator OpIndex Flags
; DWARF-NEXT: ------------------ ------ ------ ------ --- ------------- ------- -------------
-; DWARF-NORL-NEXT: 0x0000000000000000 2 0 0 0 0 0 is_stmt
-; DWARF-NORL-NEXT: 0x0000000000000010 3 3 0 0 0 0 is_stmt prologue_end
-; DWARF-NORL-NEXT: 0x0000000000000020 10 3 0 0 0 0 is_stmt
-; DWARF-NORL-NEXT: 0x000000000000002c 10 3 0 0 0 0 epilogue_begin
-; DWARF-NORL-NEXT: 0x0000000000000034 10 3 0 0 0 0 end_sequence
-; DWARF-ENRL-NEXT: 0x000000000000001c 2 0 0 0 0 0 is_stmt
-; DWARF-ENRL-NEXT: 0x000000000000002c 3 3 0 0 0 0 is_stmt prologue_end
-; DWARF-ENRL-NEXT: 0x000000000000003c 10 3 0 0 0 0 is_stmt
-; DWARF-ENRL-NEXT: 0x0000000000000048 10 3 0 0 0 0 epilogue_begin
-; DWARF-ENRL-NEXT: 0x0000000000000050 10 3 0 0 0 0 end_sequence
+; DWARF-NEXT: 0x0000000000000000 2 0 0 0 0 0 is_stmt
+; DWARF-NEXT: 0x0000000000000010 3 3 0 0 0 0 is_stmt prologue_end
+; DWARF-NEXT: 0x0000000000000020 10 3 0 0 0 0 is_stmt
+; DWARF-NEXT: 0x000000000000002c 10 3 0 0 0 0 epilogue_begin
+; DWARF-NEXT: 0x0000000000000034 10 3 0 0 0 0 end_sequence

; ModuleID = 'dwarf-loongarch-relocs.c'
source_filename = "dwarf-loongarch-relocs.c"

From 4c3de45ecf9eea6b4ad850a042706f7865a2aab2 Mon Sep 17 00:00:00 2001
From: leecheechen
Date: Wed, 24 Jan 2024 13:19:19 +0800
Subject: [PATCH 734/843] [LoongArch][test] Add tests reporting an error if
 the lsx/lasx feature is missing when lsx/lasx builtins are called (#79250)

---
 clang/test/CodeGen/LoongArch/lasx/builtin-error.c | 9 +++++++++
 clang/test/CodeGen/LoongArch/lsx/builtin-error.c | 9 +++++++++
 2 files changed, 18 insertions(+)

diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-error.c b/clang/test/CodeGen/LoongArch/lasx/builtin-error.c
index 724484465769e..dabf34368ea3e 100644
--- a/clang/test/CodeGen/LoongArch/lasx/builtin-error.c
+++ b/clang/test/CodeGen/LoongArch/lasx/builtin-error.c
@@ -1,4 +1,6 @@
// RUN: %clang_cc1 -triple loongarch64 -target-feature +lasx -verify %s
+// RUN: not %clang_cc1 -triple loongarch64 -DFEATURE_CHECK -emit-llvm %s -o /dev/null 2>&1 \
+// RUN: | FileCheck %s

typedef signed char v32i8 __attribute__((vector_size(32), aligned(32)));
typedef signed char v32i8_b __attribute__((vector_size(32), aligned(1)));
@@ -21,6 +23,13 @@ typedef float v8f32_w __attribute__((vector_size(32), aligned(4)));
typedef double v4f64 __attribute__((vector_size(32), aligned(32)));
typedef double v4f64_d __attribute__((vector_size(32), aligned(8)));

+#ifdef FEATURE_CHECK
+void test_feature(v32i8 _1) {
+// CHECK: error: '__builtin_lasx_xvslli_b' needs target feature lasx
+ (void)__builtin_lasx_xvslli_b(_1, 1);
+}
+#endif
+
v32i8 xvslli_b(v32i8 _1, int var) {
v32i8 res = __builtin_lasx_xvslli_b(_1, -1); // expected-error {{argument value 4294967295 is outside the valid range [0, 7]}}
res |= __builtin_lasx_xvslli_b(_1, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}
diff --git a/clang/test/CodeGen/LoongArch/lsx/builtin-error.c b/clang/test/CodeGen/LoongArch/lsx/builtin-error.c
index
3fc5f73f11934..722ba7fe8cd20 100644
--- a/clang/test/CodeGen/LoongArch/lsx/builtin-error.c
+++ b/clang/test/CodeGen/LoongArch/lsx/builtin-error.c
@@ -1,4 +1,6 @@
// RUN: %clang_cc1 -triple loongarch64 -target-feature +lsx -verify %s
+// RUN: not %clang_cc1 -triple loongarch64 -DFEATURE_CHECK -emit-llvm %s -o /dev/null 2>&1 \
+// RUN: | FileCheck %s

typedef signed char v16i8 __attribute__((vector_size(16), aligned(16)));
typedef signed char v16i8_b __attribute__((vector_size(16), aligned(1)));
@@ -25,6 +27,13 @@ typedef long long __m128i __attribute__((__vector_size__(16), __may_alias__));
typedef float __m128 __attribute__((__vector_size__(16), __may_alias__));
typedef double __m128d __attribute__((__vector_size__(16), __may_alias__));

+#ifdef FEATURE_CHECK
+void test_feature(v16i8 _1) {
+// CHECK: error: '__builtin_lsx_vslli_b' needs target feature lsx
+ (void)__builtin_lsx_vslli_b(_1, 1);
+}
+#endif
+
v16i8 vslli_b(v16i8 _1, int var) {
v16i8 res = __builtin_lsx_vslli_b(_1, -1); // expected-error {{argument value 4294967295 is outside the valid range [0, 7]}}
res |= __builtin_lsx_vslli_b(_1, 8); // expected-error {{argument value 8 is outside the valid range [0, 7]}}

From 218bb21eaa2f3c08ec24de1f5d88f4d5265068bf Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 24 Jan 2024 12:53:33 +0700
Subject: [PATCH 735/843] [RISCV] Update performCombineVMergeAndVOps comments.
 NFC (#78472)

The current comment was written back when we had separate TU/TA variants
for each pseudo, and it hasn't been accurate for a while. This method has
grown rather complicated over time, so rather than enumerate every
possible case (of which there are now many), this updates the comment to
list the rules that must hold for us to be able to fold a vmerge.
---
 llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 42 ++++++++++++---------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index 0d8688ba2eaea..4d6f74bc0285c 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -3458,19 +3458,27 @@ static unsigned GetVMSetForLMul(RISCVII::VLMUL LMUL) {
 llvm_unreachable("Unknown VLMUL enum");
}

-// Try to fold away VMERGE_VVM instructions. We handle these cases:
-// -Masked TU VMERGE_VVM combined with an unmasked TA instruction instruction
-// folds to a masked TU instruction. VMERGE_VVM must have have merge operand
-// same as false operand.
-// -Masked TA VMERGE_VVM combined with an unmasked TA instruction fold to a
-// masked TA instruction.
-// -Unmasked TU VMERGE_VVM combined with a masked MU TA instruction folds to
-// masked TU instruction. Both instructions must have the same merge operand.
-// VMERGE_VVM must have have merge operand same as false operand.
-// Note: The VMERGE_VVM forms above (TA, and TU) refer to the policy implied,
-// not the pseudo name. That is, a TA VMERGE_VVM can be either the _TU pseudo
-// form with an IMPLICIT_DEF passthrough operand or the unsuffixed (TA) pseudo
-// form.
+// Try to fold away VMERGE_VVM instructions into their true operands:
+//
+// %true = PseudoVADD_VV ...
+// %x = PseudoVMERGE_VVM %false, %false, %true, %mask
+// ->
+// %x = PseudoVADD_VV_MASK %false, ..., %mask
+//
+// We can only fold if vmerge's merge operand, vmerge's false operand and
+// %true's merge operand (if it has one) are the same. This is because we have
+// to consolidate them into one merge operand in the result.
+//
+// If %true is masked, then we can use its mask instead of vmerge's if vmerge's
+// mask is all ones.
+//
+// We can also fold a VMV_V_V into its true operand, since it is equivalent to a
+// VMERGE_VVM with an all ones mask.
+//
+// The resulting VL is the minimum of the two VLs.
+//
+// The resulting policy is the effective policy the vmerge would have had,
+// i.e. whether or not its merge operand was implicit-def.
bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
SDValue Merge, False, True, VL, Mask, Glue;
// A vmv.v.v is equivalent to a vmerge with an all-ones mask.
@@ -3530,6 +3538,8 @@ bool RISCVDAGToDAGISel::performCombineVMergeAndVOps(SDNode *N) {
if (Info->MaskAffectsResult && Mask && !usesAllOnesMask(Mask, Glue))
return false;

+ // If True has a merge operand then it needs to be the same as vmerge's False,
+ // since False will be used for the result's merge operand.
if (HasTiedDest && !isImplicitDef(True->getOperand(0))) {
// The vmerge instruction must be TU.
// FIXME: This could be relaxed, but we need to handle the policy for the
@@ -3537,19 +3547,17 @@
if (isImplicitDef(Merge))
return false;
SDValue MergeOpTrue = True->getOperand(0);
- // Both the vmerge instruction and the True instruction must have the same
- // merge operand.
if (False != MergeOpTrue)
return false;
}

+ // If True is masked then the vmerge must have an all 1s mask, since we're
+ // going to keep the mask from True.
if (IsMasked) {
assert(HasTiedDest && "Expected tied dest");
// The vmerge instruction must be TU.
if (isImplicitDef(Merge))
return false;
- // The vmerge instruction must have an all 1s mask since we're going to keep
- // the mask from the True instruction.
// FIXME: Support mask agnostic True instruction which would have an
// undef merge operand.
if (Mask && !usesAllOnesMask(Mask, Glue)) From f7b61f81b5c4710fe089c357ef57ac0afcaf4b56 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Wed, 24 Jan 2024 13:58:48 +0800 Subject: [PATCH 736/843] [X86][CodeGen] Transform NDD SUB to CMP if dest reg is dead (#79135) --- llvm/lib/Target/X86/X86InstrInfo.cpp | 208 +++++++++++---------------- llvm/test/CodeGen/X86/apx/adc.ll | 68 ++++----- llvm/test/CodeGen/X86/apx/sbb.ll | 60 ++++---- llvm/test/CodeGen/X86/apx/sub.ll | 4 +- 4 files changed, 146 insertions(+), 194 deletions(-) diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp index d6f9aa6d6acec..c5c6d5a67c16e 100644 --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -2268,72 +2268,44 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI, MachineInstr *WorkingMI = nullptr; unsigned Opc = MI.getOpcode(); +#define CASE_ND(OP) \ + case X86::OP: \ + case X86::OP##_ND: + switch (Opc) { // SHLD B, C, I <-> SHRD C, B, (BitWidth - I) - case X86::SHRD16rri8: - case X86::SHLD16rri8: - case X86::SHRD32rri8: - case X86::SHLD32rri8: - case X86::SHRD64rri8: - case X86::SHLD64rri8: - case X86::SHRD16rri8_ND: - case X86::SHLD16rri8_ND: - case X86::SHRD32rri8_ND: - case X86::SHLD32rri8_ND: - case X86::SHRD64rri8_ND: - case X86::SHLD64rri8_ND: { + CASE_ND(SHRD16rri8) + CASE_ND(SHLD16rri8) + CASE_ND(SHRD32rri8) + CASE_ND(SHLD32rri8) + CASE_ND(SHRD64rri8) + CASE_ND(SHLD64rri8) { unsigned Size; switch (Opc) { default: llvm_unreachable("Unreachable!"); - case X86::SHRD16rri8: - Size = 16; - Opc = X86::SHLD16rri8; - break; - case X86::SHLD16rri8: - Size = 16; - Opc = X86::SHRD16rri8; - break; - case X86::SHRD32rri8: - Size = 32; - Opc = X86::SHLD32rri8; - break; - case X86::SHLD32rri8: - Size = 32; - Opc = X86::SHRD32rri8; - break; - case X86::SHRD64rri8: - Size = 64; - Opc = X86::SHLD64rri8; - break; - case X86::SHLD64rri8: - Size = 64; - Opc = X86::SHRD64rri8; - break; - case X86::SHRD16rri8_ND: - Size = 16; - Opc = X86::SHLD16rri8_ND; - break; - case X86::SHLD16rri8_ND: - Size = 16; - Opc = X86::SHRD16rri8_ND; - break; - case X86::SHRD32rri8_ND: - Size = 32; - Opc = X86::SHLD32rri8_ND; - break; - case X86::SHLD32rri8_ND: - Size = 32; - Opc = X86::SHRD32rri8_ND; - break; - case X86::SHRD64rri8_ND: - Size = 64; - Opc = X86::SHLD64rri8_ND; - break; - case X86::SHLD64rri8_ND: - Size = 64; - Opc = X86::SHRD64rri8_ND; - break; +#define FROM_TO_SIZE(A, B, S) \ + case X86::A: \ + Opc = X86::B; \ + Size = S; \ + break; \ + case X86::A##_ND: \ + Opc = X86::B##_ND; \ + Size = S; \ + break; \ + case X86::B: \ + Opc = X86::A; \ + Size = S; \ + break; \ + case X86::B##_ND: \ + Opc = X86::A##_ND; \ + Size = S; \ + break; + + FROM_TO_SIZE(SHRD16rri8, SHLD16rri8, 16) + FROM_TO_SIZE(SHRD32rri8, SHLD32rri8, 32) + FROM_TO_SIZE(SHRD64rri8, SHLD64rri8, 64) +#undef FROM_TO_SIZE } WorkingMI = CloneIfNew(MI); WorkingMI->setDesc(get(Opc)); @@ -4684,28 +4656,28 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg, } return true; // A SUB can be used to perform comparison. 
- case X86::SUB64rm: - case X86::SUB32rm: - case X86::SUB16rm: - case X86::SUB8rm: + CASE_ND(SUB64rm) + CASE_ND(SUB32rm) + CASE_ND(SUB16rm) + CASE_ND(SUB8rm) SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; CmpMask = 0; CmpValue = 0; return true; - case X86::SUB64rr: - case X86::SUB32rr: - case X86::SUB16rr: - case X86::SUB8rr: + CASE_ND(SUB64rr) + CASE_ND(SUB32rr) + CASE_ND(SUB16rr) + CASE_ND(SUB8rr) SrcReg = MI.getOperand(1).getReg(); SrcReg2 = MI.getOperand(2).getReg(); CmpMask = 0; CmpValue = 0; return true; - case X86::SUB64ri32: - case X86::SUB32ri: - case X86::SUB16ri: - case X86::SUB8ri: + CASE_ND(SUB64ri32) + CASE_ND(SUB32ri) + CASE_ND(SUB16ri) + CASE_ND(SUB8ri) SrcReg = MI.getOperand(1).getReg(); SrcReg2 = 0; if (MI.getOperand(2).isImm()) { @@ -4750,10 +4722,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, case X86::CMP32rr: case X86::CMP16rr: case X86::CMP8rr: - case X86::SUB64rr: - case X86::SUB32rr: - case X86::SUB16rr: - case X86::SUB8rr: { + CASE_ND(SUB64rr) + CASE_ND(SUB32rr) + CASE_ND(SUB16rr) + CASE_ND(SUB8rr) { Register OISrcReg; Register OISrcReg2; int64_t OIMask; @@ -4775,10 +4747,10 @@ bool X86InstrInfo::isRedundantFlagInstr(const MachineInstr &FlagI, case X86::CMP32ri: case X86::CMP16ri: case X86::CMP8ri: - case X86::SUB64ri32: - case X86::SUB32ri: - case X86::SUB16ri: - case X86::SUB8ri: + CASE_ND(SUB64ri32) + CASE_ND(SUB32ri) + CASE_ND(SUB16ri) + CASE_ND(SUB8ri) case X86::TEST64rr: case X86::TEST32rr: case X86::TEST16rr: @@ -5110,62 +5082,42 @@ bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, switch (CmpInstr.getOpcode()) { default: break; - case X86::SUB64ri32: - case X86::SUB32ri: - case X86::SUB16ri: - case X86::SUB8ri: - case X86::SUB64rm: - case X86::SUB32rm: - case X86::SUB16rm: - case X86::SUB8rm: - case X86::SUB64rr: - case X86::SUB32rr: - case X86::SUB16rr: - case X86::SUB8rr: { + CASE_ND(SUB64ri32) + CASE_ND(SUB32ri) + CASE_ND(SUB16ri) + CASE_ND(SUB8ri) + CASE_ND(SUB64rm) + CASE_ND(SUB32rm) + CASE_ND(SUB16rm) + CASE_ND(SUB8rm) + CASE_ND(SUB64rr) + CASE_ND(SUB32rr) + CASE_ND(SUB16rr) + CASE_ND(SUB8rr) { if (!MRI->use_nodbg_empty(CmpInstr.getOperand(0).getReg())) return false; // There is no use of the destination register, we can replace SUB with CMP. 
unsigned NewOpcode = 0; +#define FROM_TO(A, B) \ + CASE_ND(A) NewOpcode = X86::B; \ + break; switch (CmpInstr.getOpcode()) { default: llvm_unreachable("Unreachable!"); - case X86::SUB64rm: - NewOpcode = X86::CMP64rm; - break; - case X86::SUB32rm: - NewOpcode = X86::CMP32rm; - break; - case X86::SUB16rm: - NewOpcode = X86::CMP16rm; - break; - case X86::SUB8rm: - NewOpcode = X86::CMP8rm; - break; - case X86::SUB64rr: - NewOpcode = X86::CMP64rr; - break; - case X86::SUB32rr: - NewOpcode = X86::CMP32rr; - break; - case X86::SUB16rr: - NewOpcode = X86::CMP16rr; - break; - case X86::SUB8rr: - NewOpcode = X86::CMP8rr; - break; - case X86::SUB64ri32: - NewOpcode = X86::CMP64ri32; - break; - case X86::SUB32ri: - NewOpcode = X86::CMP32ri; - break; - case X86::SUB16ri: - NewOpcode = X86::CMP16ri; - break; - case X86::SUB8ri: - NewOpcode = X86::CMP8ri; - break; + FROM_TO(SUB64rm, CMP64rm) + FROM_TO(SUB32rm, CMP32rm) + FROM_TO(SUB16rm, CMP16rm) + FROM_TO(SUB8rm, CMP8rm) + FROM_TO(SUB64rr, CMP64rr) + FROM_TO(SUB32rr, CMP32rr) + FROM_TO(SUB16rr, CMP16rr) + FROM_TO(SUB8rr, CMP8rr) + FROM_TO(SUB64ri32, CMP64ri32) + FROM_TO(SUB32ri, CMP32ri) + FROM_TO(SUB16ri, CMP16ri) + FROM_TO(SUB8ri, CMP8ri) } +#undef FROM_TO CmpInstr.setDesc(get(NewOpcode)); CmpInstr.removeOperand(0); // Mutating this instruction invalidates any debug data associated with it. diff --git a/llvm/test/CodeGen/X86/apx/adc.ll b/llvm/test/CodeGen/X86/apx/adc.ll index e8657acea8ad4..342d64e6bca00 100644 --- a/llvm/test/CodeGen/X86/apx/adc.ll +++ b/llvm/test/CodeGen/X86/apx/adc.ll @@ -4,7 +4,7 @@ define i8 @adc8rr(i8 %a, i8 %b, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: adc8rr: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %dl, %cl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xd1] +; CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1] ; CHECK-NEXT: adcb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x10,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] %s = add i8 %a, %b @@ -17,7 +17,7 @@ define i8 @adc8rr(i8 %a, i8 %b, i8 %x, i8 %y) nounwind { define i16 @adc16rr(i16 %a, i16 %b, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: adc16rr: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %dx, %cx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xd1] +; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1] ; CHECK-NEXT: adcw %si, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x11,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] %s = add i16 %a, %b @@ -30,7 +30,7 @@ define i16 @adc16rr(i16 %a, i16 %b, i16 %x, i16 %y) nounwind { define i32 @adc32rr(i32 %a, i32 %b, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: adc32rr: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %edx, %ecx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xd1] +; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] ; CHECK-NEXT: adcl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x11,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] %s = add i32 %a, %b @@ -43,7 +43,7 @@ define i32 @adc32rr(i32 %a, i32 %b, i32 %x, i32 %y) nounwind { define i64 @adc64rr(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: adc64rr: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rdx, %rcx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xd1] +; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1] ; CHECK-NEXT: adcq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x11,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] %s = add i64 %a, %b @@ -56,7 +56,7 @@ define i64 @adc64rr(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { define i8 @adc8rm(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: adc8rm: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %dl, %cl, %al # encoding: 
[0x62,0xf4,0x7c,0x18,0x28,0xd1] +; CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1] ; CHECK-NEXT: adcb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x12,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i8, ptr %ptr @@ -70,7 +70,7 @@ define i8 @adc8rm(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { define i16 @adc16rm(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: adc16rm: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %dx, %cx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xd1] +; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1] ; CHECK-NEXT: adcw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x13,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i16, ptr %ptr @@ -84,7 +84,7 @@ define i16 @adc16rm(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind { define i32 @adc32rm(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: adc32rm: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %edx, %ecx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xd1] +; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] ; CHECK-NEXT: adcl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x13,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i32, ptr %ptr @@ -98,7 +98,7 @@ define i32 @adc32rm(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { define i64 @adc64rm(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: adc64rm: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rdx, %rcx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xd1] +; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1] ; CHECK-NEXT: adcq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x13,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i64, ptr %ptr @@ -112,7 +112,7 @@ define i64 @adc64rm(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind { define i16 @adc16ri8(i16 %a, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: adc16ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %si, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xf2] +; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2] ; CHECK-NEXT: adcw $0, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xd7,0x00,0x00] ; CHECK-NEXT: addl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x7b] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax @@ -127,7 +127,7 @@ define i16 @adc16ri8(i16 %a, i16 %x, i16 %y) nounwind { define i32 @adc32ri8(i32 %a, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: adc32ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf2] +; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2] ; CHECK-NEXT: adcl $123, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xd7,0x7b,0x00,0x00,0x00] ; CHECK-NEXT: retq # encoding: [0xc3] %s = add i32 %a, 123 @@ -140,7 +140,7 @@ define i32 @adc32ri8(i32 %a, i32 %x, i32 %y) nounwind { define i64 @adc64ri8(i64 %a, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: adc64ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rsi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf2] +; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2] ; CHECK-NEXT: adcq $123, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xd7,0x7b,0x00,0x00,0x00] ; CHECK-NEXT: retq # encoding: [0xc3] %s = add i64 %a, 123 @@ -153,7 +153,7 @@ define i64 @adc64ri8(i64 %a, i64 %x, i64 %y) nounwind { define i8 @adc8ri(i8 %a, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: adc8ri: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %sil, %dl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xf2] +; CHECK-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2] ; CHECK-NEXT: adcb $123, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xd7,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] %s 
= add i8 %a, 123 @@ -166,7 +166,7 @@ define i8 @adc8ri(i8 %a, i8 %x, i8 %y) nounwind { define i16 @adc16ri(i16 %a, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: adc16ri: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %si, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xf2] +; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2] ; CHECK-NEXT: adcw $0, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xd7,0x00,0x00] ; CHECK-NEXT: addl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xd2,0x04,0x00,0x00] ; CHECK-NEXT: # imm = 0x4D2 @@ -182,7 +182,7 @@ define i16 @adc16ri(i16 %a, i16 %x, i16 %y) nounwind { define i32 @adc32ri(i32 %a, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: adc32ri: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf2] +; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2] ; CHECK-NEXT: adcl $123456, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xd7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] @@ -196,7 +196,7 @@ define i32 @adc32ri(i32 %a, i32 %x, i32 %y) nounwind { define i64 @adc64ri(i64 %a, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: adc64ri: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rsi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf2] +; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2] ; CHECK-NEXT: adcq $123456, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xd7,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] @@ -210,7 +210,7 @@ define i64 @adc64ri(i64 %a, i64 %x, i64 %y) nounwind { define i8 @adc8mr(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: adc8mr: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %dl, %cl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xd1] +; CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1] ; CHECK-NEXT: adcb %dil, (%rsi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x10,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i8, ptr %ptr @@ -224,7 +224,7 @@ define i8 @adc8mr(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { define i16 @adc16mr(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: adc16mr: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %dx, %cx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xd1] +; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1] ; CHECK-NEXT: adcw %di, (%rsi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x11,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i16, ptr %ptr @@ -238,7 +238,7 @@ define i16 @adc16mr(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind { define i32 @adc32mr(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: adc32mr: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %edx, %ecx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xd1] +; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] ; CHECK-NEXT: adcl %edi, (%rsi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x11,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i32, ptr %ptr @@ -252,7 +252,7 @@ define i32 @adc32mr(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { define i64 @adc64mr(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: adc64mr: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rdx, %rcx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xd1] +; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1] ; CHECK-NEXT: adcq %rdi, (%rsi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x11,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i64, ptr %ptr @@ -266,7 +266,7 @@ define i64 @adc64mr(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind { define i16 @adc16mi8(ptr %ptr, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: adc16mi8: ; CHECK: # %bb.0: -; 
CHECK-NEXT: subw %si, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xf2] +; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2] ; CHECK-NEXT: adcw $0, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x17,0x00,0x00] ; CHECK-NEXT: addl $123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x7b] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax @@ -282,7 +282,7 @@ define i16 @adc16mi8(ptr %ptr, i16 %x, i16 %y) nounwind { define i32 @adc32mi8(ptr %ptr, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: adc32mi8: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf2] +; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2] ; CHECK-NEXT: adcl $123, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x17,0x7b,0x00,0x00,0x00] ; CHECK-NEXT: retq # encoding: [0xc3] %a = load i32, ptr %ptr @@ -296,7 +296,7 @@ define i32 @adc32mi8(ptr %ptr, i32 %x, i32 %y) nounwind { define i64 @adc64mi8(ptr %ptr, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: adc64mi8: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rsi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf2] +; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2] ; CHECK-NEXT: adcq $123, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x17,0x7b,0x00,0x00,0x00] ; CHECK-NEXT: retq # encoding: [0xc3] %a = load i64, ptr %ptr @@ -310,7 +310,7 @@ define i64 @adc64mi8(ptr %ptr, i64 %x, i64 %y) nounwind { define i8 @adc8mi(ptr %ptr, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: adc8mi: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %sil, %dl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xf2] +; CHECK-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2] ; CHECK-NEXT: adcb $123, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x17,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] %a = load i8, ptr %ptr @@ -324,7 +324,7 @@ define i8 @adc8mi(ptr %ptr, i8 %x, i8 %y) nounwind { define i16 @adc16mi(ptr %ptr, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: adc16mi: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %si, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xf2] +; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2] ; CHECK-NEXT: adcw $0, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x17,0x00,0x00] ; CHECK-NEXT: addl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xd2,0x04,0x00,0x00] ; CHECK-NEXT: # imm = 0x4D2 @@ -341,7 +341,7 @@ define i16 @adc16mi(ptr %ptr, i16 %x, i16 %y) nounwind { define i32 @adc32mi(ptr %ptr, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: adc32mi: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf2] +; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2] ; CHECK-NEXT: adcl $123456, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x17,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] @@ -356,7 +356,7 @@ define i32 @adc32mi(ptr %ptr, i32 %x, i32 %y) nounwind { define i64 @adc64mi(ptr %ptr, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: adc64mi: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rsi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf2] +; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2] ; CHECK-NEXT: adcq $123456, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x17,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] @@ -371,7 +371,7 @@ define i64 @adc64mi(ptr %ptr, i64 %x, i64 %y) nounwind { define void @adc8mr_legacy(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: adc8mr_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %dl, %cl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xd1] +; 
CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1] ; CHECK-NEXT: adcb %dil, (%rsi) # encoding: [0x40,0x10,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i8, ptr %ptr @@ -386,7 +386,7 @@ define void @adc8mr_legacy(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { define void @adc16mr_legacy(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: adc16mr_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %dx, %cx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xd1] +; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1] ; CHECK-NEXT: adcw %di, (%rsi) # encoding: [0x66,0x11,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i16, ptr %ptr @@ -401,7 +401,7 @@ define void @adc16mr_legacy(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind { define void @adc32mr_legacy(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: adc32mr_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %edx, %ecx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xd1] +; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] ; CHECK-NEXT: adcl %edi, (%rsi) # encoding: [0x11,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i32, ptr %ptr @@ -416,7 +416,7 @@ define void @adc32mr_legacy(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { define void @adc64mr_legacy(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: adc64mr_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rdx, %rcx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xd1] +; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1] ; CHECK-NEXT: adcq %rdi, (%rsi) # encoding: [0x48,0x11,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i64, ptr %ptr @@ -431,7 +431,7 @@ define void @adc64mr_legacy(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind { define void @adc8mi_legacy(ptr %ptr, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: adc8mi_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %sil, %dl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xf2] +; CHECK-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2] ; CHECK-NEXT: adcb $123, (%rdi) # encoding: [0x80,0x17,0x7b] ; CHECK-NEXT: retq # encoding: [0xc3] %a = load i8, ptr %ptr @@ -446,7 +446,7 @@ define void @adc8mi_legacy(ptr %ptr, i8 %x, i8 %y) nounwind { define void @adc16mi_legacy(ptr %ptr, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: adc16mi_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %si, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xf2] +; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2] ; CHECK-NEXT: adcw $0, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x17,0x00,0x00] ; CHECK-NEXT: addl $1234, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xd2,0x04,0x00,0x00] ; CHECK-NEXT: # imm = 0x4D2 @@ -464,7 +464,7 @@ define void @adc16mi_legacy(ptr %ptr, i16 %x, i16 %y) nounwind { define void @adc32mi_legacy(ptr %ptr, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: adc32mi_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf2] +; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2] ; CHECK-NEXT: adcl $123456, (%rdi) # encoding: [0x81,0x17,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # encoding: [0xc3] @@ -480,7 +480,7 @@ define void @adc32mi_legacy(ptr %ptr, i32 %x, i32 %y) nounwind { define void @adc64mi_legacy(ptr %ptr, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: adc64mi_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rsi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf2] +; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2] ; CHECK-NEXT: adcq $123456, (%rdi) # encoding: [0x48,0x81,0x17,0x40,0xe2,0x01,0x00] ; CHECK-NEXT: # imm = 0x1E240 ; CHECK-NEXT: retq # 
encoding: [0xc3] diff --git a/llvm/test/CodeGen/X86/apx/sbb.ll b/llvm/test/CodeGen/X86/apx/sbb.ll index 755193588a7b0..778fea04b62b5 100644 --- a/llvm/test/CodeGen/X86/apx/sbb.ll +++ b/llvm/test/CodeGen/X86/apx/sbb.ll @@ -4,7 +4,7 @@ define i8 @sbb8rr(i8 %a, i8 %b, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: sbb8rr: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %dl, %cl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xd1] +; CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1] ; CHECK-NEXT: sbbb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x18,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] %s = sub i8 %a, %b @@ -17,7 +17,7 @@ define i8 @sbb8rr(i8 %a, i8 %b, i8 %x, i8 %y) nounwind { define i16 @sbb16rr(i16 %a, i16 %b, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: sbb16rr: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %dx, %cx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xd1] +; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1] ; CHECK-NEXT: sbbw %si, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x19,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] %s = sub i16 %a, %b @@ -30,7 +30,7 @@ define i16 @sbb16rr(i16 %a, i16 %b, i16 %x, i16 %y) nounwind { define i32 @sbb32rr(i32 %a, i32 %b, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: sbb32rr: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %edx, %ecx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xd1] +; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] ; CHECK-NEXT: sbbl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x19,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] %s = sub i32 %a, %b @@ -43,7 +43,7 @@ define i32 @sbb32rr(i32 %a, i32 %b, i32 %x, i32 %y) nounwind { define i64 @sbb64rr(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: sbb64rr: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rdx, %rcx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xd1] +; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1] ; CHECK-NEXT: sbbq %rsi, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x19,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] %s = sub i64 %a, %b @@ -56,7 +56,7 @@ define i64 @sbb64rr(i64 %a, i64 %b, i64 %x, i64 %y) nounwind { define i8 @sbb8rm(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: sbb8rm: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %dl, %cl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xd1] +; CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1] ; CHECK-NEXT: sbbb (%rsi), %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x1a,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i8, ptr %ptr @@ -70,7 +70,7 @@ define i8 @sbb8rm(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { define i16 @sbb16rm(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: sbb16rm: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %dx, %cx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xd1] +; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1] ; CHECK-NEXT: sbbw (%rsi), %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x1b,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i16, ptr %ptr @@ -84,7 +84,7 @@ define i16 @sbb16rm(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind { define i32 @sbb32rm(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: sbb32rm: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %edx, %ecx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xd1] +; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] ; CHECK-NEXT: sbbl (%rsi), %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x1b,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i32, ptr %ptr @@ -98,7 +98,7 @@ define i32 @sbb32rm(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { define i64 @sbb64rm(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: sbb64rm: ; CHECK: # %bb.0: 
-; CHECK-NEXT: subq %rdx, %rcx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xd1] +; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1] ; CHECK-NEXT: sbbq (%rsi), %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x1b,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i64, ptr %ptr @@ -112,7 +112,7 @@ define i64 @sbb64rm(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind { define i16 @sbb16ri8(i16 %a, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: sbb16ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %si, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xf2] +; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2] ; CHECK-NEXT: sbbw $0, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xdf,0x00,0x00] ; CHECK-NEXT: addl $-123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x85] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax @@ -127,7 +127,7 @@ define i16 @sbb16ri8(i16 %a, i16 %x, i16 %y) nounwind { define i32 @sbb32ri8(i32 %a, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: sbb32ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf2] +; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2] ; CHECK-NEXT: sbbl $0, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xdf,0x00,0x00,0x00,0x00] ; CHECK-NEXT: addl $-123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x85] ; CHECK-NEXT: retq # encoding: [0xc3] @@ -141,7 +141,7 @@ define i32 @sbb32ri8(i32 %a, i32 %x, i32 %y) nounwind { define i64 @sbb64ri8(i64 %a, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: sbb64ri8: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rsi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf2] +; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2] ; CHECK-NEXT: sbbq $0, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xdf,0x00,0x00,0x00,0x00] ; CHECK-NEXT: addq $-123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xc0,0x85] ; CHECK-NEXT: retq # encoding: [0xc3] @@ -155,7 +155,7 @@ define i64 @sbb64ri8(i64 %a, i64 %x, i64 %y) nounwind { define i8 @sbb8ri(i8 %a, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: sbb8ri: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %sil, %dl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xf2] +; CHECK-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2] ; CHECK-NEXT: sbbb $0, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0xdf,0x00] ; CHECK-NEXT: addb $-123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x85] ; CHECK-NEXT: retq # encoding: [0xc3] @@ -169,7 +169,7 @@ define i8 @sbb8ri(i8 %a, i8 %x, i8 %y) nounwind { define i16 @sbb16ri(i16 %a, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: sbb16ri: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %si, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xf2] +; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2] ; CHECK-NEXT: sbbw $0, %di, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0xdf,0x00,0x00] ; CHECK-NEXT: addl $-1234, %eax # EVEX TO LEGACY Compression encoding: [0x05,0x2e,0xfb,0xff,0xff] ; CHECK-NEXT: # imm = 0xFB2E @@ -185,7 +185,7 @@ define i16 @sbb16ri(i16 %a, i16 %x, i16 %y) nounwind { define i32 @sbb32ri(i32 %a, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: sbb32ri: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf2] +; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2] ; CHECK-NEXT: sbbl $0, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0xdf,0x00,0x00,0x00,0x00] ; CHECK-NEXT: addl $-123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xc0,0x1d,0xfe,0xff] ; CHECK-NEXT: # imm = 0xFFFE1DC0 @@ -200,7 +200,7 @@ define i32 @sbb32ri(i32 %a, i32 %x, i32 %y) nounwind { define 
i64 @sbb64ri(i64 %a, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: sbb64ri: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rsi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf2] +; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2] ; CHECK-NEXT: sbbq $0, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xdf,0x00,0x00,0x00,0x00] ; CHECK-NEXT: addq $-123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,0xc0,0x1d,0xfe,0xff] ; CHECK-NEXT: # imm = 0xFFFE1DC0 @@ -215,7 +215,7 @@ define i64 @sbb64ri(i64 %a, i64 %x, i64 %y) nounwind { define i8 @sbb8mr(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: sbb8mr: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %dl, %cl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xd1] +; CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1] ; CHECK-NEXT: sbbb %dil, (%rsi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x18,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i8, ptr %ptr @@ -229,7 +229,7 @@ define i8 @sbb8mr(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { define i16 @sbb16mr(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: sbb16mr: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %dx, %cx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xd1] +; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1] ; CHECK-NEXT: sbbw %di, (%rsi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x19,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i16, ptr %ptr @@ -243,7 +243,7 @@ define i16 @sbb16mr(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind { define i32 @sbb32mr(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: sbb32mr: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %edx, %ecx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xd1] +; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] ; CHECK-NEXT: sbbl %edi, (%rsi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x19,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i32, ptr %ptr @@ -257,7 +257,7 @@ define i32 @sbb32mr(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { define i64 @sbb64mr(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: sbb64mr: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rdx, %rcx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xd1] +; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1] ; CHECK-NEXT: sbbq %rdi, (%rsi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x19,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i64, ptr %ptr @@ -271,7 +271,7 @@ define i64 @sbb64mr(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind { define i16 @sbb16mi8(ptr %ptr, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: sbb16mi8: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %si, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xf2] +; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2] ; CHECK-NEXT: sbbw $0, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x1f,0x00,0x00] ; CHECK-NEXT: addl $-123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x85] ; CHECK-NEXT: # kill: def $ax killed $ax killed $eax @@ -287,7 +287,7 @@ define i16 @sbb16mi8(ptr %ptr, i16 %x, i16 %y) nounwind { define i32 @sbb32mi8(ptr %ptr, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: sbb32mi8: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf2] +; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2] ; CHECK-NEXT: sbbl $0, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x1f,0x00,0x00,0x00,0x00] ; CHECK-NEXT: addl $-123, %eax # EVEX TO LEGACY Compression encoding: [0x83,0xc0,0x85] ; CHECK-NEXT: retq # encoding: [0xc3] @@ -302,7 +302,7 @@ define i32 @sbb32mi8(ptr %ptr, i32 %x, i32 %y) nounwind { define i64 @sbb64mi8(ptr %ptr, i64 %x, i64 %y) nounwind 
{ ; CHECK-LABEL: sbb64mi8: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rsi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf2] +; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2] ; CHECK-NEXT: sbbq $0, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x1f,0x00,0x00,0x00,0x00] ; CHECK-NEXT: addq $-123, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x83,0xc0,0x85] ; CHECK-NEXT: retq # encoding: [0xc3] @@ -317,7 +317,7 @@ define i64 @sbb64mi8(ptr %ptr, i64 %x, i64 %y) nounwind { define i8 @sbb8mi(ptr %ptr, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: sbb8mi: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %sil, %dl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xf2] +; CHECK-NEXT: cmpb %sil, %dl # encoding: [0x40,0x38,0xf2] ; CHECK-NEXT: sbbb $0, (%rdi), %al # encoding: [0x62,0xf4,0x7c,0x18,0x80,0x1f,0x00] ; CHECK-NEXT: addb $-123, %al # EVEX TO LEGACY Compression encoding: [0x04,0x85] ; CHECK-NEXT: retq # encoding: [0xc3] @@ -332,7 +332,7 @@ define i8 @sbb8mi(ptr %ptr, i8 %x, i8 %y) nounwind { define i16 @sbb16mi(ptr %ptr, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: sbb16mi: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %si, %dx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xf2] +; CHECK-NEXT: cmpw %si, %dx # encoding: [0x66,0x39,0xf2] ; CHECK-NEXT: sbbw $0, (%rdi), %ax # encoding: [0x62,0xf4,0x7d,0x18,0x81,0x1f,0x00,0x00] ; CHECK-NEXT: addl $-1234, %eax # EVEX TO LEGACY Compression encoding: [0x05,0x2e,0xfb,0xff,0xff] ; CHECK-NEXT: # imm = 0xFB2E @@ -349,7 +349,7 @@ define i16 @sbb16mi(ptr %ptr, i16 %x, i16 %y) nounwind { define i32 @sbb32mi(ptr %ptr, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: sbb32mi: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %esi, %edx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xf2] +; CHECK-NEXT: cmpl %esi, %edx # encoding: [0x39,0xf2] ; CHECK-NEXT: sbbl $0, (%rdi), %eax # encoding: [0x62,0xf4,0x7c,0x18,0x81,0x1f,0x00,0x00,0x00,0x00] ; CHECK-NEXT: addl $-123456, %eax # EVEX TO LEGACY Compression encoding: [0x05,0xc0,0x1d,0xfe,0xff] ; CHECK-NEXT: # imm = 0xFFFE1DC0 @@ -365,7 +365,7 @@ define i32 @sbb32mi(ptr %ptr, i32 %x, i32 %y) nounwind { define i64 @sbb64mi(ptr %ptr, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: sbb64mi: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rsi, %rdx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xf2] +; CHECK-NEXT: cmpq %rsi, %rdx # encoding: [0x48,0x39,0xf2] ; CHECK-NEXT: sbbq $0, (%rdi), %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0x1f,0x00,0x00,0x00,0x00] ; CHECK-NEXT: addq $-123456, %rax # EVEX TO LEGACY Compression encoding: [0x48,0x05,0xc0,0x1d,0xfe,0xff] ; CHECK-NEXT: # imm = 0xFFFE1DC0 @@ -381,7 +381,7 @@ define i64 @sbb64mi(ptr %ptr, i64 %x, i64 %y) nounwind { define void @sbb8mr_legacy(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { ; CHECK-LABEL: sbb8mr_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subb %dl, %cl, %al # encoding: [0x62,0xf4,0x7c,0x18,0x28,0xd1] +; CHECK-NEXT: cmpb %dl, %cl # encoding: [0x38,0xd1] ; CHECK-NEXT: sbbb %dil, (%rsi) # encoding: [0x40,0x18,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i8, ptr %ptr @@ -396,7 +396,7 @@ define void @sbb8mr_legacy(i8 %a, ptr %ptr, i8 %x, i8 %y) nounwind { define void @sbb16mr_legacy(i16 %a, ptr %ptr, i16 %x, i16 %y) nounwind { ; CHECK-LABEL: sbb16mr_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subw %dx, %cx, %ax # encoding: [0x62,0xf4,0x7d,0x18,0x29,0xd1] +; CHECK-NEXT: cmpw %dx, %cx # encoding: [0x66,0x39,0xd1] ; CHECK-NEXT: sbbw %di, (%rsi) # encoding: [0x66,0x19,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i16, ptr %ptr @@ -411,7 +411,7 @@ define void @sbb16mr_legacy(i16 %a, ptr %ptr, i16 %x, i16 %y) 
nounwind { define void @sbb32mr_legacy(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { ; CHECK-LABEL: sbb32mr_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subl %edx, %ecx, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x29,0xd1] +; CHECK-NEXT: cmpl %edx, %ecx # encoding: [0x39,0xd1] ; CHECK-NEXT: sbbl %edi, (%rsi) # encoding: [0x19,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i32, ptr %ptr @@ -426,7 +426,7 @@ define void @sbb32mr_legacy(i32 %a, ptr %ptr, i32 %x, i32 %y) nounwind { define void @sbb64mr_legacy(i64 %a, ptr %ptr, i64 %x, i64 %y) nounwind { ; CHECK-LABEL: sbb64mr_legacy: ; CHECK: # %bb.0: -; CHECK-NEXT: subq %rdx, %rcx, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x29,0xd1] +; CHECK-NEXT: cmpq %rdx, %rcx # encoding: [0x48,0x39,0xd1] ; CHECK-NEXT: sbbq %rdi, (%rsi) # encoding: [0x48,0x19,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %b = load i64, ptr %ptr diff --git a/llvm/test/CodeGen/X86/apx/sub.ll b/llvm/test/CodeGen/X86/apx/sub.ll index aaf45c6b499c2..c8e48dbb981af 100644 --- a/llvm/test/CodeGen/X86/apx/sub.ll +++ b/llvm/test/CodeGen/X86/apx/sub.ll @@ -500,8 +500,8 @@ declare void @f() define void @sub64ri_reloc(i64 %val) { ; CHECK-LABEL: sub64ri_reloc: ; CHECK: # %bb.0: -; CHECK-NEXT: subq $val, %rdi, %rax # encoding: [0x62,0xf4,0xfc,0x18,0x81,0xef,A,A,A,A] -; CHECK-NEXT: # fixup A - offset: 6, value: val, kind: reloc_signed_4byte +; CHECK-NEXT: cmpq $val, %rdi # encoding: [0x48,0x81,0xff,A,A,A,A] +; CHECK-NEXT: # fixup A - offset: 3, value: val, kind: reloc_signed_4byte ; CHECK-NEXT: jbe .LBB41_2 # encoding: [0x76,A] ; CHECK-NEXT: # fixup A - offset: 1, value: .LBB41_2-1, kind: FK_PCRel_1 ; CHECK-NEXT: # %bb.1: # %t From a01195ff5cc3d7fd084743b1f47007645bb385f4 Mon Sep 17 00:00:00 2001 From: Douglas Yung Date: Tue, 23 Jan 2024 22:02:27 -0800 Subject: [PATCH 737/843] Update the expected compiler version embedded in a CHECK line of the test llvm/test/CodeGen/SystemZ/zos-ppa2.ll. The test contains a CHECK line that verifies an .ascii directive which originally checked for 18001970010100000000. After the compiler version was bumped to 19, the test started to fail because the string is now 19001970010100000000 (the digits are EBCDIC-encoded, so \361\370 for "18" becomes \361\371 for "19"). This should fix the failing test on the bots. --- llvm/test/CodeGen/SystemZ/zos-ppa2.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/CodeGen/SystemZ/zos-ppa2.ll b/llvm/test/CodeGen/SystemZ/zos-ppa2.ll index f54f654b804a2..2a686563388a4 100644 --- a/llvm/test/CodeGen/SystemZ/zos-ppa2.ll +++ b/llvm/test/CodeGen/SystemZ/zos-ppa2.ll @@ -24,7 +24,7 @@ ; CHECK: .byte 0 ; CHECK: .byte 3 ; CHECK: .short 30 -; CHECK: .ascii "\323\323\345\324@@@@@@\361\370\360\360\361\371\367\360\360\361\360\361\360\360\360\360\360\360\360\360" +; CHECK: .ascii "\323\323\345\324@@@@@@\361\371\360\360\361\371\367\360\360\361\360\361\360\360\360\360\360\360\360\360" define void @void_test() { entry: ret void From b4785cebfb0a756e19ff4ae3e41d2563f10cd292 Mon Sep 17 00:00:00 2001 From: Uday Bondhugula Date: Wed, 24 Jan 2024 11:41:44 +0530 Subject: [PATCH 738/843] [MLIR] NFC. Clean up stale TODO comments and style deviations in affine utils (#79079) NFC. Clean up stale TODO comments and style deviations in affine utils and affine fusion utils.
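For reference, the style cleanups in the diff below are of two recurring kinds, sketched here on a small made-up helper (illustrative code, not taken from this patch; 'countStores' is a hypothetical name, and the sketch assumes the MLIR affine dialect headers): pointer-valued conditions drop the redundant '!= nullptr' comparison, and walk() callbacks take the concrete op interface type instead of dispatching with dyn_cast inside an 'Operation *' callback.

  #include "mlir/Dialect/Affine/IR/AffineOps.h"
  #include "llvm/ADT/DenseSet.h"

  using namespace mlir;
  using namespace mlir::affine;

  // Hypothetical helper showing the cleaned-up idioms.
  unsigned countStores(AffineForOp forOp) {
    unsigned storeCount = 0;
    llvm::SmallDenseSet<Value, 4> storeMemrefs;
    // Old style: untyped callback plus manual dispatch, and explicit
    // `ptr != nullptr` tests on pointer-valued conditions:
    //   forOp.walk([&](Operation *op) {
    //     if (auto storeOp = dyn_cast<AffineWriteOpInterface>(op)) { ... }
    //   });
    // New style: walk() filters on the interface type directly, and
    // pointers convert to bool implicitly (`if (ptr)`).
    forOp.walk([&](AffineWriteOpInterface storeOp) {
      storeMemrefs.insert(storeOp.getMemRef());
      ++storeCount;
    });
    return storeCount;
  }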
--- .../mlir/Dialect/Affine/Analysis/Utils.h | 3 -- .../mlir/Dialect/Affine/LoopFusionUtils.h | 6 --- .../Dialect/Affine/Utils/LoopFusionUtils.cpp | 44 +++++++++---------- mlir/lib/Dialect/Affine/Utils/Utils.cpp | 1 - 4 files changed, 20 insertions(+), 34 deletions(-) diff --git a/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h b/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h index 3dea99cd6b3e5..b8f354892ee60 100644 --- a/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h +++ b/mlir/include/mlir/Dialect/Affine/Analysis/Utils.h @@ -50,7 +50,6 @@ struct LoopNestStateCollector { // top-level operations in a `Block` which contain load/store ops, and edges // are memref dependences between the nodes. // TODO: Add a more flexible dependence graph representation. -// TODO: Add a depth parameter to dependence graph construction. struct MemRefDependenceGraph { public: // Node represents a node in the graph. A Node is either an entire loop nest @@ -394,7 +393,6 @@ bool buildSliceTripCountMap( /// nest surrounding ops in 'opsA' at 'loopDepth'. Returns /// 'SliceComputationResult::Success' if union was computed correctly, an /// appropriate 'failure' otherwise. -// TODO: Change this API to take 'forOpA'/'forOpB'. SliceComputationResult computeSliceUnion(ArrayRef<Operation *> opsA, ArrayRef<Operation *> opsB, unsigned loopDepth, unsigned numCommonLoops, @@ -532,7 +530,6 @@ struct MemRefRegion { /// variables since getMemRefRegion() is called with a specific loop depth, /// and thus the region is symbolic in the outer surrounding loops at that /// depth. - // TODO: Replace this to exploit HyperRectangularSet. FlatAffineValueConstraints cst; }; diff --git a/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h index 4cec777f9888c..0ef39fd7d1463 100644 --- a/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h +++ b/mlir/include/mlir/Dialect/Affine/LoopFusionUtils.h @@ -27,9 +27,6 @@ namespace affine { class AffineForOp; struct ComputationSliceState; -// TODO: Extend this module to include utility functions for querying fusion -// cost/storage reduction, and for performing the loop fusion transformation. - struct FusionResult { enum ResultEnum { Success, @@ -49,7 +46,6 @@ struct FusionResult { /// strategies are also limited to scenarios where a single memref is involved /// in the producer-consume or sibling relationship between the candidate /// loops. We use 'memref' to keep track of such a memref. -// TODO: Remove 'memref' when we support more generic scenarios. // TODO: Generalize utilities so that producer-consumer and sibling fusion // strategies can be used without the assumptions made in the AffineLoopFusion // pass. @@ -145,7 +141,6 @@ bool getLoopNestStats(AffineForOp forOp, LoopNestStats *stats); /// Currently, the total cost is computed by counting the total operation /// instance count (i.e. total number of operations in the loop body * loop /// trip count) for the entire loop nest. -// TODO: Improve this cost model. int64_t getComputeCost(AffineForOp forOp, LoopNestStats &stats); /// Computes and returns in 'computeCost', the total compute cost of fusing the @@ -154,7 +149,6 @@ int64_t getComputeCost(AffineForOp forOp, LoopNestStats &stats); /// (i.e. total number of operations in the loop body * loop trip count) for /// the entire loop nest. /// Returns true on success, failure otherwise (e.g. non-constant trip counts). -// TODO: Improve this cost model.
bool getFusionComputeCost(AffineForOp srcForOp, LoopNestStats &srcStats, AffineForOp dstForOp, LoopNestStats &dstStats, const ComputationSliceState &slice, diff --git a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp index 77ee1d4c7a02d..fb45528ad5e7d 100644 --- a/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/LoopFusionUtils.cpp @@ -150,8 +150,8 @@ static Operation *getFusedLoopNestInsertionPoint(AffineForOp srcForOp, // // Valid insertion point range: (lastDepOpB, firstDepOpA) // - if (firstDepOpA != nullptr) { - if (lastDepOpB != nullptr) { + if (firstDepOpA) { + if (lastDepOpB) { if (firstDepOpA->isBeforeInBlock(lastDepOpB) || firstDepOpA == lastDepOpB) // No valid insertion point exists which preserves dependences. return nullptr; @@ -218,7 +218,7 @@ static unsigned getMaxLoopDepth(ArrayRef srcOps, // Check dependences on all pairs of ops in 'targetDstOps' and store the // minimum loop depth at which a dependence is satisfied. for (unsigned i = 0, e = targetDstOps.size(); i < e; ++i) { - auto *srcOpInst = targetDstOps[i]; + Operation *srcOpInst = targetDstOps[i]; MemRefAccess srcAccess(srcOpInst); for (unsigned j = 0; j < e; ++j) { auto *dstOpInst = targetDstOps[j]; @@ -539,7 +539,7 @@ static int64_t getComputeCostHelper( } } // Add in additional op instances from slice (if specified in map). - if (computeCostMap != nullptr) { + if (computeCostMap) { auto it = computeCostMap->find(forOp); if (it != computeCostMap->end()) { opCount += it->second; @@ -547,7 +547,7 @@ static int64_t getComputeCostHelper( } // Override trip count (if specified in map). int64_t tripCount = stats.tripCountMap[forOp]; - if (tripCountOverrideMap != nullptr) { + if (tripCountOverrideMap) { auto it = tripCountOverrideMap->find(forOp); if (it != tripCountOverrideMap->end()) { tripCount = it->second; @@ -591,17 +591,14 @@ bool mlir::affine::getFusionComputeCost(AffineForOp srcForOp, auto *insertPointParent = slice.insertPoint->getParentOp(); // The store and loads to this memref will disappear. - // TODO: Add load coalescing to memref data flow opt pass. if (storeLoadFwdGuaranteed) { // Subtract from operation count the loads/store we expect load/store // forwarding to remove. unsigned storeCount = 0; llvm::SmallDenseSet storeMemrefs; - srcForOp.walk([&](Operation *op) { - if (auto storeOp = dyn_cast(op)) { - storeMemrefs.insert(storeOp.getMemRef()); - ++storeCount; - } + srcForOp.walk([&](AffineWriteOpInterface storeOp) { + storeMemrefs.insert(storeOp.getMemRef()); + ++storeCount; }); // Subtract out any store ops in single-iteration src slice loop nest. if (storeCount > 0) @@ -609,19 +606,18 @@ bool mlir::affine::getFusionComputeCost(AffineForOp srcForOp, // Subtract out any load users of 'storeMemrefs' nested below // 'insertPointParent'. for (Value memref : storeMemrefs) { - for (auto *user : memref.getUsers()) { - if (dyn_cast(user)) { - SmallVector loops; - // Check if any loop in loop nest surrounding 'user' is - // 'insertPointParent'. - getAffineForIVs(*user, &loops); - if (llvm::is_contained(loops, cast(insertPointParent))) { - if (auto forOp = - dyn_cast_or_null(user->getParentOp())) { - if (computeCostMap.count(forOp) == 0) - computeCostMap[forOp] = 0; - computeCostMap[forOp] -= 1; - } + for (Operation *user : memref.getUsers()) { + if (!isa(user)) + continue; + SmallVector loops; + // Check if any loop in loop nest surrounding 'user' is + // 'insertPointParent'. 
+ getAffineForIVs(*user, &loops); + if (llvm::is_contained(loops, cast<AffineForOp>(insertPointParent))) { + if (auto forOp = dyn_cast_or_null<AffineForOp>(user->getParentOp())) { + if (computeCostMap.count(forOp) == 0) + computeCostMap[forOp] = 0; + computeCostMap[forOp] -= 1; } } } diff --git a/mlir/lib/Dialect/Affine/Utils/Utils.cpp b/mlir/lib/Dialect/Affine/Utils/Utils.cpp index 578d03c629285..4d4adb94a9fc8 100644 --- a/mlir/lib/Dialect/Affine/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Affine/Utils/Utils.cpp @@ -1215,7 +1215,6 @@ LogicalResult mlir::affine::replaceAllMemRefUsesWith( // Create new fully composed AffineMap for new op to be created. assert(newMapOperands.size() == newMemRefRank); auto newMap = builder.getMultiDimIdentityMap(newMemRefRank); - // TODO: Avoid creating/deleting temporary AffineApplyOps here. fullyComposeAffineMapAndOperands(&newMap, &newMapOperands); newMap = simplifyAffineMap(newMap); canonicalizeMapAndOperands(&newMap, &newMapOperands); From bc182ea7c2e86aea27268a08311438f8093a6a45 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 24 Jan 2024 06:13:57 +0000 Subject: [PATCH 739/843] [gn build] Port 7251243315ef --- llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn | 7 ++----- llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn | 1 + 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn index 047f6583ec4e8..d35e269115cac 100644 --- a/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/CodeGen/BUILD.gn @@ -8,9 +8,7 @@ static_library("CodeGenTypes") { "//llvm/include/llvm/CodeGen:GenVT", "//llvm/lib/Support", ] - sources = [ - "LowLevelType.cpp" - ] + sources = [ "LowLevelType.cpp" ] } static_library("CodeGen") { @@ -50,13 +48,11 @@ static_library("CodeGen") { "CFGuardLongjmp.cpp", "CFIFixup.cpp", "CFIInstrInserter.cpp", - "GCEmptyBasicBlocks.cpp", "CalcSpillWeights.cpp", "CallBrPrepare.cpp", "CallingConvLower.cpp", "CodeGen.cpp", "CodeGenCommonISel.cpp", - "CodeGenPassBuilder.cpp", "CodeGenPrepare.cpp", "CommandFlags.cpp", "ComplexDeinterleavingPass.cpp", @@ -80,6 +76,7 @@ static_library("CodeGen") { "FinalizeISel.cpp", "FixupStatepointCallerSaved.cpp", "FuncletLayout.cpp", + "GCEmptyBasicBlocks.cpp", "GCMetadata.cpp", "GCMetadataPrinter.cpp", "GCRootLowering.cpp", diff --git a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn index 5cd96737886c5..274f5b54345c7 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Passes/BUILD.gn @@ -20,6 +20,7 @@ static_library("Passes") { "//llvm/lib/Transforms/Vectorize", ] sources = [ + "CodeGenPassBuilder.cpp", "OptimizationLevel.cpp", "PassBuilder.cpp", "PassBuilderBindings.cpp", From 7e3fb372b0e8899958ec7e9241797e7e136a7a23 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?kadir=20=C3=A7etinkaya?= Date: Wed, 24 Jan 2024 07:26:00 +0100 Subject: [PATCH 740/843] [include-cleaner] Check emptiness instead of occurrences (#79154) Our internal integration relies on injecting some default values into the ignore/keep lists. That means we can have filters despite not having occurrences for the flag.
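To make the failure mode concrete, here is a minimal sketch of the predicate change (a plain struct stands in for the cl::list option so the example is self-contained; 'ListOption' and 'shouldFilter' are illustrative names, not code from this patch):

  #include <string>
  #include <vector>

  // Toy model of a command-line list option.
  struct ListOption {
    std::vector<std::string> Values; // may be seeded with defaults by an embedder
    unsigned NumOccurrences = 0;     // bumped only by actual command-line flags
  };

  bool shouldFilter(const ListOption &IgnoreHeaders) {
    // Old check: injected defaults never bump the counter, so they were ignored.
    //   return IgnoreHeaders.NumOccurrences != 0;
    // New check: any values present, injected or parsed, activate the filter.
    return !IgnoreHeaders.Values.empty();
  }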
--- clang-tools-extra/include-cleaner/test/tool.cpp | 8 ++++++++ clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp | 4 ++-- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/clang-tools-extra/include-cleaner/test/tool.cpp b/clang-tools-extra/include-cleaner/test/tool.cpp index de47b2a3f3778..2155eec189d18 100644 --- a/clang-tools-extra/include-cleaner/test/tool.cpp +++ b/clang-tools-extra/include-cleaner/test/tool.cpp @@ -22,10 +22,18 @@ int x = foo(); // IGNORE2-NOT: - "foobar.h" // IGNORE2: + "foo.h" +// RUN: clang-include-cleaner -print=changes %s --ignore-headers= -- -I%S/Inputs/ | FileCheck --allow-empty --check-prefix=IGNORE3 %s +// IGNORE3: - "foobar.h" +// IGNORE3: + "foo.h" + // RUN: clang-include-cleaner -print=changes %s --only-headers="foo\.h" -- -I%S/Inputs/ | FileCheck --match-full-lines --allow-empty --check-prefix=ONLY %s // ONLY-NOT: - "foobar.h" // ONLY: + "foo.h" +// RUN: clang-include-cleaner -print=changes %s --only-headers= -- -I%S/Inputs/ | FileCheck --allow-empty --check-prefix=ONLY2 %s +// ONLY2: - "foobar.h" +// ONLY2: + "foo.h" + // RUN: clang-include-cleaner -print %s -- -I%S/Inputs/ | FileCheck --match-full-lines --check-prefix=PRINT %s // PRINT: #include "foo.h" // PRINT-NOT: {{^}}#include "foobar.h"{{$}} diff --git a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp index 132ad25411509..3bc449b0152bb 100644 --- a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp +++ b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp @@ -262,9 +262,9 @@ std::function headerFilter() { return nullptr; return [OnlyMatches, IgnoreMatches](llvm::StringRef Header) { - if (OnlyHeaders.getNumOccurrences() && !OnlyMatches(Header)) + if (!OnlyHeaders.empty() && !OnlyMatches(Header)) return true; - if (IgnoreHeaders.getNumOccurrences() && IgnoreMatches(Header)) + if (!IgnoreHeaders.empty() && IgnoreMatches(Header)) return true; return false; }; From d119ecb958ebc0a82e7bca01a21115ced8e0b13d Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Wed, 24 Jan 2024 14:42:33 +0800 Subject: [PATCH 741/843] [X86][NFC] Pre-commit test for RA hints for APX NDD instructions --- llvm/test/CodeGen/X86/apx/mul-i1024.ll | 2919 +++++++++++++++--------- 1 file changed, 1889 insertions(+), 1030 deletions(-) diff --git a/llvm/test/CodeGen/X86/apx/mul-i1024.ll b/llvm/test/CodeGen/X86/apx/mul-i1024.ll index 3bffd02cbbab9..2eaa161225e4a 100644 --- a/llvm/test/CodeGen/X86/apx/mul-i1024.ll +++ b/llvm/test/CodeGen/X86/apx/mul-i1024.ll @@ -1,1036 +1,1895 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+egpr | FileCheck %s --check-prefix=APX +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+egpr | FileCheck %s --check-prefix=EGPR +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+egpr,+ndd | FileCheck %s --check-prefix=EGPR-NDD define void @test_1024(ptr %a, ptr %b, ptr %out) nounwind { -; APX-LABEL: test_1024: -; APX: # %bb.0: -; APX-NEXT: pushq %rbp -; APX-NEXT: pushq %r15 -; APX-NEXT: pushq %r14 -; APX-NEXT: pushq %r13 -; APX-NEXT: pushq %r12 -; APX-NEXT: pushq %rbx -; APX-NEXT: subq $104, %rsp -; APX-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: movq %rdi, %r24 -; APX-NEXT: movq (%rdi), %r13 -; APX-NEXT: movq 8(%rdi), %r18 -; APX-NEXT: movq 24(%rdi), %r29 -; APX-NEXT: movq 16(%rdi), %r17 -; APX-NEXT: movq 40(%rdi), %rdi -; APX-NEXT: movq 32(%r24), %r10 -; APX-NEXT: movq 56(%r24), %r15 -; APX-NEXT: movq 
48(%r24), %r12 -; APX-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: movq 24(%rsi), %r23 -; APX-NEXT: movq 16(%rsi), %r11 -; APX-NEXT: movq (%rsi), %r27 -; APX-NEXT: movq 8(%rsi), %r14 -; APX-NEXT: movq %r12, %rax -; APX-NEXT: mulq %r27 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r19 -; APX-NEXT: movq %r15, %rax -; APX-NEXT: mulq %r27 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq %r12, %rax -; APX-NEXT: mulq %r14 -; APX-NEXT: movq %rdx, %r20 -; APX-NEXT: movq %rax, %r8 -; APX-NEXT: addq %r16, %r8 -; APX-NEXT: adcq %r9, %r20 -; APX-NEXT: setb %al -; APX-NEXT: movzbl %al, %ecx -; APX-NEXT: movq %r15, %rax -; APX-NEXT: mulq %r14 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r20, %r16 -; APX-NEXT: adcq %rcx, %r9 -; APX-NEXT: movq %r10, %rax -; APX-NEXT: mulq %r27 -; APX-NEXT: movq %rdx, %r20 -; APX-NEXT: movq %rax, %r25 -; APX-NEXT: movq %rdi, %rax -; APX-NEXT: mulq %r27 -; APX-NEXT: movq %rdx, %r21 -; APX-NEXT: movq %rax, %r22 -; APX-NEXT: addq %r20, %r22 -; APX-NEXT: adcq $0, %r21 -; APX-NEXT: movq %r10, %rax -; APX-NEXT: mulq %r14 -; APX-NEXT: movq %rdx, %r20 -; APX-NEXT: movq %rax, %r28 -; APX-NEXT: addq %r22, %r28 -; APX-NEXT: adcq %r21, %r20 -; APX-NEXT: setb %al -; APX-NEXT: movzbl %al, %ecx -; APX-NEXT: movq %rdi, %rax -; APX-NEXT: mulq %r14 -; APX-NEXT: movq %rdx, %r21 -; APX-NEXT: movq %rax, %r22 -; APX-NEXT: addq %r20, %r22 -; APX-NEXT: adcq %rcx, %r21 -; APX-NEXT: addq %r19, %r22 -; APX-NEXT: adcq %r8, %r21 -; APX-NEXT: adcq $0, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: movq %r10, %rax -; APX-NEXT: mulq %r11 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r30 -; APX-NEXT: movq %rdi, %rax -; APX-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: mulq %r11 -; APX-NEXT: movq %rdx, %r19 -; APX-NEXT: movq %rax, %r20 -; APX-NEXT: addq %r8, %r20 -; APX-NEXT: adcq $0, %r19 -; APX-NEXT: movq %r10, %rax -; APX-NEXT: mulq %r23 -; APX-NEXT: movq %rdx, %rbx -; APX-NEXT: movq %rax, %r31 -; APX-NEXT: addq %r20, %r31 -; APX-NEXT: adcq %r19, %rbx -; APX-NEXT: setb %al -; APX-NEXT: movzbl %al, %ecx -; APX-NEXT: movq %rdi, %rax -; APX-NEXT: mulq %r23 -; APX-NEXT: movq %rdx, %r26 -; APX-NEXT: movq %rax, %r8 -; APX-NEXT: addq %rbx, %r8 -; APX-NEXT: adcq %rcx, %r26 -; APX-NEXT: addq %r22, %r30 -; APX-NEXT: adcq %r21, %r31 -; APX-NEXT: adcq $0, %r8 -; APX-NEXT: adcq $0, %r26 -; APX-NEXT: addq %r16, %r8 -; APX-NEXT: adcq %r9, %r26 -; APX-NEXT: setb %al -; APX-NEXT: movzbl %al, %ecx -; APX-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: movq %r12, %rax -; APX-NEXT: mulq %r11 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %rsi -; APX-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: movq %r15, %rax -; APX-NEXT: mulq %r11 -; APX-NEXT: movq %rdx, %r16 -; APX-NEXT: movq %rax, %r21 -; APX-NEXT: addq %r9, %r21 -; APX-NEXT: adcq $0, %r16 -; APX-NEXT: movq %r12, %rax -; APX-NEXT: mulq %r23 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %rdi -; APX-NEXT: addq %r21, %rdi -; APX-NEXT: adcq %r16, %r9 -; APX-NEXT: setb %al -; APX-NEXT: movzbl %al, %r10d -; APX-NEXT: movq %r15, %rax -; APX-NEXT: mulq %r23 -; APX-NEXT: movq %rdx, %r21 -; APX-NEXT: movq %rax, %r22 -; APX-NEXT: addq %r9, %r22 -; APX-NEXT: adcq %r10, %r21 -; APX-NEXT: addq %r8, %rsi -; APX-NEXT: movq %rsi, %r19 -; APX-NEXT: adcq %r26, %rdi -; APX-NEXT: adcq %rcx, %r22 -; 
APX-NEXT: adcq $0, %r21 -; APX-NEXT: movq %r17, %rax -; APX-NEXT: mulq %r27 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %rbx -; APX-NEXT: movq %r29, %rax -; APX-NEXT: mulq %r27 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq %r17, %rax -; APX-NEXT: mulq %r14 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r26 -; APX-NEXT: addq %r16, %r26 -; APX-NEXT: adcq %r9, %r8 -; APX-NEXT: setb %al -; APX-NEXT: movzbl %al, %ecx -; APX-NEXT: movq %r29, %rax -; APX-NEXT: mulq %r14 -; APX-NEXT: movq %r14, %rsi -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: adcq %rcx, %r9 -; APX-NEXT: movq %r13, %rax -; APX-NEXT: mulq %r27 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: movq %r18, %rax -; APX-NEXT: mulq %r27 -; APX-NEXT: movq %rdx, %r14 -; APX-NEXT: movq %rax, %r15 -; APX-NEXT: addq %r8, %r15 -; APX-NEXT: adcq $0, %r14 -; APX-NEXT: movq %r13, %rax -; APX-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: mulq %rsi -; APX-NEXT: movq %rdx, %r12 -; APX-NEXT: addq %r15, %rax -; APX-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq %r14, %r12 -; APX-NEXT: setb %cl -; APX-NEXT: movq %r18, %rax -; APX-NEXT: mulq %rsi -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r15 -; APX-NEXT: addq %r12, %r15 -; APX-NEXT: movzbl %cl, %eax -; APX-NEXT: adcq %rax, %r8 -; APX-NEXT: addq %rbx, %r15 -; APX-NEXT: adcq %r26, %r8 -; APX-NEXT: adcq $0, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq %r13, %rax -; APX-NEXT: mulq %r11 -; APX-NEXT: movq %rdx, %r26 -; APX-NEXT: movq %rax, %rsi -; APX-NEXT: movq %r18, %rax -; APX-NEXT: mulq %r11 -; APX-NEXT: movq %rdx, %rbx -; APX-NEXT: movq %rax, %r14 -; APX-NEXT: addq %r26, %r14 -; APX-NEXT: adcq $0, %rbx -; APX-NEXT: movq %r13, %rax -; APX-NEXT: mulq %r23 -; APX-NEXT: movq %rdx, %r12 -; APX-NEXT: addq %r14, %rax -; APX-NEXT: movq %rax, %r10 -; APX-NEXT: adcq %rbx, %r12 -; APX-NEXT: setb %cl -; APX-NEXT: movq %r18, %rax -; APX-NEXT: mulq %r23 -; APX-NEXT: movq %rdx, %r14 -; APX-NEXT: movq %rax, %r26 -; APX-NEXT: addq %r12, %r26 -; APX-NEXT: movzbl %cl, %eax -; APX-NEXT: adcq %rax, %r14 -; APX-NEXT: addq %r15, %rsi -; APX-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq %r8, %r10 -; APX-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq $0, %r26 -; APX-NEXT: adcq $0, %r14 -; APX-NEXT: addq %r16, %r26 -; APX-NEXT: adcq %r9, %r14 -; APX-NEXT: setb %cl -; APX-NEXT: movq %r17, %rax -; APX-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: mulq %r11 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %rbx -; APX-NEXT: movq %r29, %rax -; APX-NEXT: mulq %r11 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq %r17, %rax -; APX-NEXT: mulq %r23 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r15 -; APX-NEXT: addq %r16, %r15 -; APX-NEXT: adcq %r9, %r8 -; APX-NEXT: setb %r9b -; APX-NEXT: movq %r29, %rax -; APX-NEXT: mulq %r23 -; APX-NEXT: movq %rdx, %r12 -; APX-NEXT: movq %rax, %rbp -; APX-NEXT: addq %r8, %rbp -; APX-NEXT: movzbl %r9b, %eax -; APX-NEXT: adcq %rax, %r12 -; APX-NEXT: addq %r26, %rbx -; APX-NEXT: adcq %r14, %r15 -; APX-NEXT: movzbl %cl, %eax -; APX-NEXT: adcq %rax, %rbp -; APX-NEXT: adcq $0, %r12 -; APX-NEXT: addq %r25, %rbx -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload 
-; APX-NEXT: movq 32(%rsi), %r25 -; APX-NEXT: adcq %r28, %r15 -; APX-NEXT: adcq %r30, %rbp -; APX-NEXT: adcq %r31, %r12 -; APX-NEXT: adcq $0, %r19 -; APX-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq $0, %rdi -; APX-NEXT: adcq $0, %r22 -; APX-NEXT: adcq $0, %r21 -; APX-NEXT: movq %r17, %rax -; APX-NEXT: mulq %r25 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r28 -; APX-NEXT: movq %r29, %rax -; APX-NEXT: mulq %r25 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq 40(%rsi), %rcx -; APX-NEXT: movq %r17, %rax -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r26 -; APX-NEXT: addq %r16, %r26 -; APX-NEXT: adcq %r9, %r8 -; APX-NEXT: setb %r10b -; APX-NEXT: movq %r29, %rax -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: movzbl %r10b, %eax -; APX-NEXT: adcq %rax, %r9 -; APX-NEXT: movq %r13, %rax -; APX-NEXT: mulq %r25 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r19 -; APX-NEXT: movq %r18, %rax -; APX-NEXT: mulq %r25 -; APX-NEXT: movq %rdx, %r30 -; APX-NEXT: movq %rax, %r31 -; APX-NEXT: addq %r8, %r31 -; APX-NEXT: adcq $0, %r30 -; APX-NEXT: movq %r13, %rax -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r20 -; APX-NEXT: addq %r31, %r20 -; APX-NEXT: adcq %r30, %r8 -; APX-NEXT: setb %r10b -; APX-NEXT: movq %r18, %rax -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %r30 -; APX-NEXT: movq %rax, %r31 -; APX-NEXT: addq %r8, %r31 -; APX-NEXT: movzbl %r10b, %eax -; APX-NEXT: adcq %rax, %r30 -; APX-NEXT: addq %r28, %r31 -; APX-NEXT: adcq %r26, %r30 -; APX-NEXT: adcq $0, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq 48(%rsi), %r28 -; APX-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: movq %r13, %rax -; APX-NEXT: mulq %r28 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r11 -; APX-NEXT: movq %r18, %rax -; APX-NEXT: movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: mulq %r28 -; APX-NEXT: movq %rdx, %r26 -; APX-NEXT: movq %rax, %r14 -; APX-NEXT: addq %r8, %r14 -; APX-NEXT: adcq $0, %r26 -; APX-NEXT: movq 56(%rsi), %r10 -; APX-NEXT: movq %r13, %rax -; APX-NEXT: mulq %r10 -; APX-NEXT: movq %rdx, %r13 -; APX-NEXT: addq %r14, %rax -; APX-NEXT: movq %rax, %r14 -; APX-NEXT: adcq %r26, %r13 -; APX-NEXT: setb %sil -; APX-NEXT: movq %r18, %rax -; APX-NEXT: mulq %r10 -; APX-NEXT: movq %rdx, %r26 -; APX-NEXT: movq %rax, %r8 -; APX-NEXT: addq %r13, %r8 -; APX-NEXT: movzbl %sil, %eax -; APX-NEXT: adcq %rax, %r26 -; APX-NEXT: addq %r31, %r11 -; APX-NEXT: adcq %r30, %r14 -; APX-NEXT: adcq $0, %r8 -; APX-NEXT: adcq $0, %r26 -; APX-NEXT: addq %r16, %r8 -; APX-NEXT: adcq %r9, %r26 -; APX-NEXT: setb %r18b -; APX-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: movq %r17, %rax -; APX-NEXT: mulq %r28 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r30 -; APX-NEXT: movq %r29, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: movq %r29, %rax -; APX-NEXT: mulq %r28 -; APX-NEXT: movq %rdx, %r16 -; APX-NEXT: movq %rax, %r31 -; APX-NEXT: addq %r9, %r31 -; APX-NEXT: adcq $0, %r16 -; APX-NEXT: movq %r17, %rax -; APX-NEXT: mulq %r10 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r17 -; APX-NEXT: addq %r31, %r17 -; APX-NEXT: adcq %r16, %r9 -; APX-NEXT: setb %r16b -; APX-NEXT: movq %r29, %rax -; APX-NEXT: mulq %r10 -; APX-NEXT: movq %rdx, %r13 -; APX-NEXT: movq %rax, %r31 -; APX-NEXT: addq %r9, %r31 -; APX-NEXT: movzbl 
%r16b, %eax -; APX-NEXT: adcq %rax, %r13 -; APX-NEXT: addq %r8, %r30 -; APX-NEXT: adcq %r26, %r17 -; APX-NEXT: movzbl %r18b, %eax -; APX-NEXT: adcq %rax, %r31 -; APX-NEXT: adcq $0, %r13 -; APX-NEXT: addq %rbx, %r19 -; APX-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq %r15, %r20 -; APX-NEXT: movq %r20, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq %rbp, %r11 -; APX-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq %r12, %r14 -; APX-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq $0, %r30 -; APX-NEXT: adcq $0, %r17 -; APX-NEXT: adcq $0, %r31 -; APX-NEXT: adcq $0, %r13 -; APX-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r30 # 8-byte Folded Reload -; APX-NEXT: adcq %rdi, %r17 -; APX-NEXT: adcq %r22, %r31 -; APX-NEXT: adcq %r21, %r13 -; APX-NEXT: setb %r15b -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; APX-NEXT: movq %rsi, %rax -; APX-NEXT: mulq %r25 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r19 -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Reload -; APX-NEXT: movq %r21, %rax -; APX-NEXT: mulq %r25 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq %rsi, %rax -; APX-NEXT: movq %rsi, %r29 -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r20 -; APX-NEXT: addq %r16, %r20 -; APX-NEXT: adcq %r9, %r8 -; APX-NEXT: setb %r18b -; APX-NEXT: movq %r21, %rax -; APX-NEXT: movq %r21, %r14 -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: movzbl %r18b, %eax -; APX-NEXT: adcq %rax, %r9 -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; APX-NEXT: movq %rbx, %rax -; APX-NEXT: mulq %r25 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %rdi -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; APX-NEXT: movq %rsi, %rax -; APX-NEXT: mulq %r25 -; APX-NEXT: movq %rdx, %r21 -; APX-NEXT: movq %rax, %r22 -; APX-NEXT: addq %r8, %r22 -; APX-NEXT: adcq $0, %r21 -; APX-NEXT: movq %rbx, %rax -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: addq %r22, %rax -; APX-NEXT: movq %rax, %r11 -; APX-NEXT: adcq %r21, %r8 -; APX-NEXT: setb %r18b -; APX-NEXT: movq %rsi, %rax -; APX-NEXT: movq %rsi, %r21 -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %r22 -; APX-NEXT: movq %rax, %r26 -; APX-NEXT: addq %r8, %r26 -; APX-NEXT: movzbl %r18b, %eax -; APX-NEXT: adcq %rax, %r22 -; APX-NEXT: addq %r19, %r26 -; APX-NEXT: adcq %r20, %r22 -; APX-NEXT: adcq $0, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq %rbx, %rax -; APX-NEXT: mulq %r28 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %rsi -; APX-NEXT: movq %r21, %rax -; APX-NEXT: mulq %r28 -; APX-NEXT: movq %rdx, %r19 -; APX-NEXT: movq %rax, %r20 -; APX-NEXT: addq %r8, %r20 -; APX-NEXT: adcq $0, %r19 -; APX-NEXT: movq %rbx, %rax -; APX-NEXT: mulq %r10 -; APX-NEXT: movq %rdx, %rbx -; APX-NEXT: addq %r20, %rax -; APX-NEXT: movq %rax, %r20 -; APX-NEXT: adcq %r19, %rbx -; APX-NEXT: setb %r18b -; APX-NEXT: movq %r21, %rax -; APX-NEXT: mulq %r10 -; APX-NEXT: movq %rdx, %r21 -; APX-NEXT: movq %rax, %r8 -; APX-NEXT: addq %rbx, %r8 -; APX-NEXT: movzbl %r18b, %eax -; APX-NEXT: adcq %rax, %r21 -; APX-NEXT: addq %r26, %rsi -; APX-NEXT: adcq %r22, %r20 -; APX-NEXT: adcq $0, %r8 -; APX-NEXT: adcq $0, %r21 -; APX-NEXT: addq %r16, %r8 -; APX-NEXT: adcq %r9, %r21 -; APX-NEXT: setb %r18b -; APX-NEXT: movq %r29, %rax -; APX-NEXT: mulq %r28 -; APX-NEXT: movq 
%rdx, %r9 -; APX-NEXT: movq %rax, %r22 -; APX-NEXT: movq %r14, %rax -; APX-NEXT: mulq %r28 -; APX-NEXT: movq %rdx, %r16 -; APX-NEXT: movq %rax, %r19 -; APX-NEXT: addq %r9, %r19 -; APX-NEXT: adcq $0, %r16 -; APX-NEXT: movq %r29, %rax -; APX-NEXT: mulq %r10 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: addq %r19, %rax -; APX-NEXT: movq %rax, %r19 -; APX-NEXT: adcq %r16, %r9 -; APX-NEXT: setb %r16b -; APX-NEXT: movq %r14, %rax -; APX-NEXT: mulq %r10 -; APX-NEXT: movq %rdx, %rbp -; APX-NEXT: movq %rax, %r12 -; APX-NEXT: addq %r9, %r12 -; APX-NEXT: movzbl %r16b, %eax -; APX-NEXT: adcq %rax, %rbp -; APX-NEXT: addq %r8, %r22 -; APX-NEXT: adcq %r21, %r19 -; APX-NEXT: movzbl %r18b, %eax -; APX-NEXT: adcq %rax, %r12 -; APX-NEXT: adcq $0, %rbp -; APX-NEXT: addq %r30, %rdi -; APX-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq %r17, %r11 -; APX-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq %r31, %rsi -; APX-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq %r13, %r20 -; APX-NEXT: movq %r20, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: movzbl %r15b, %eax -; APX-NEXT: adcq %rax, %r22 -; APX-NEXT: movq %r22, (%rsp) # 8-byte Spill -; APX-NEXT: adcq $0, %r19 -; APX-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq $0, %r12 -; APX-NEXT: adcq $0, %rbp -; APX-NEXT: movq 64(%r24), %r21 -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; APX-NEXT: movq %rdi, %rax -; APX-NEXT: mulq %r21 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r22 -; APX-NEXT: movq %r23, %rax -; APX-NEXT: mulq %r21 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq 72(%r24), %r30 -; APX-NEXT: movq %rdi, %rax -; APX-NEXT: mulq %r30 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r26 -; APX-NEXT: addq %r16, %r26 -; APX-NEXT: adcq %r9, %r8 -; APX-NEXT: setb %r18b -; APX-NEXT: movq %r23, %rax -; APX-NEXT: mulq %r30 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: movzbl %r18b, %eax -; APX-NEXT: adcq %rax, %r9 -; APX-NEXT: movq %r27, %rax -; APX-NEXT: mulq %r21 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; APX-NEXT: movq %r11, %rax -; APX-NEXT: mulq %r21 -; APX-NEXT: movq %rdx, %r31 -; APX-NEXT: movq %rax, %rbx -; APX-NEXT: addq %r8, %rbx -; APX-NEXT: adcq $0, %r31 -; APX-NEXT: movq %r27, %rax -; APX-NEXT: mulq %r30 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: addq %rbx, %rax -; APX-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq %r31, %r8 -; APX-NEXT: setb %r18b -; APX-NEXT: movq %r11, %rax -; APX-NEXT: mulq %r30 -; APX-NEXT: movq %rdx, %r31 -; APX-NEXT: movq %rax, %rbx -; APX-NEXT: addq %r8, %rbx -; APX-NEXT: movzbl %r18b, %eax -; APX-NEXT: adcq %rax, %r31 -; APX-NEXT: addq %r22, %rbx -; APX-NEXT: adcq %r26, %r31 -; APX-NEXT: adcq $0, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq 80(%r24), %r13 -; APX-NEXT: movq %r27, %rax -; APX-NEXT: mulq %r13 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %rsi -; APX-NEXT: movq %r11, %rax -; APX-NEXT: mulq %r13 -; APX-NEXT: movq %rdx, %r26 -; APX-NEXT: movq %rax, %r14 -; APX-NEXT: addq %r8, %r14 -; APX-NEXT: adcq $0, %r26 -; APX-NEXT: movq 88(%r24), %r18 -; APX-NEXT: movq %r27, %rax -; APX-NEXT: mulq %r18 -; APX-NEXT: movq %rdx, %r15 -; APX-NEXT: movq %rax, %r22 -; APX-NEXT: addq %r14, %r22 -; APX-NEXT: 
adcq %r26, %r15 -; APX-NEXT: setb %r14b -; APX-NEXT: movq %r11, %rax -; APX-NEXT: mulq %r18 -; APX-NEXT: movq %rdx, %r26 -; APX-NEXT: movq %rax, %r8 -; APX-NEXT: addq %r15, %r8 -; APX-NEXT: movzbl %r14b, %eax -; APX-NEXT: adcq %rax, %r26 -; APX-NEXT: addq %rbx, %rsi -; APX-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; APX-NEXT: adcq %r31, %r22 -; APX-NEXT: adcq $0, %r8 -; APX-NEXT: adcq $0, %r26 -; APX-NEXT: addq %r16, %r8 -; APX-NEXT: adcq %r9, %r26 -; APX-NEXT: setb %r31b -; APX-NEXT: movq %rdi, %rax -; APX-NEXT: mulq %r13 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %rsi -; APX-NEXT: movq %r23, %rax -; APX-NEXT: mulq %r13 -; APX-NEXT: movq %rdx, %r16 -; APX-NEXT: movq %rax, %r14 -; APX-NEXT: addq %r9, %r14 -; APX-NEXT: adcq $0, %r16 -; APX-NEXT: movq %rdi, %rax -; APX-NEXT: mulq %r18 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %rbx -; APX-NEXT: addq %r14, %rbx -; APX-NEXT: adcq %r16, %r9 -; APX-NEXT: setb %r16b -; APX-NEXT: movq %r23, %rax -; APX-NEXT: mulq %r18 -; APX-NEXT: movq %rdx, %r14 -; APX-NEXT: movq %rax, %r15 -; APX-NEXT: addq %r9, %r15 -; APX-NEXT: movzbl %r16b, %eax -; APX-NEXT: adcq %rax, %r14 -; APX-NEXT: addq %r8, %rsi -; APX-NEXT: adcq %r26, %rbx -; APX-NEXT: movzbl %r31b, %eax -; APX-NEXT: adcq %rax, %r15 -; APX-NEXT: adcq $0, %r14 -; APX-NEXT: imulq %r25, %r18 -; APX-NEXT: movq %r25, %rax -; APX-NEXT: mulq %r13 -; APX-NEXT: movq %rax, %r8 -; APX-NEXT: addq %r18, %rdx -; APX-NEXT: imulq %rcx, %r13 -; APX-NEXT: addq %rdx, %r13 -; APX-NEXT: movq %r28, %r9 -; APX-NEXT: imulq %r30, %r9 -; APX-NEXT: movq %r28, %rax -; APX-NEXT: mulq %r21 -; APX-NEXT: movq %rax, %r26 -; APX-NEXT: addq %r9, %rdx -; APX-NEXT: imulq %r21, %r10 -; APX-NEXT: addq %rdx, %r10 -; APX-NEXT: addq %r8, %r26 -; APX-NEXT: adcq %r13, %r10 -; APX-NEXT: movq %r21, %rax -; APX-NEXT: mulq %r25 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r9 -; APX-NEXT: movq %r30, %rax -; APX-NEXT: mulq %r25 -; APX-NEXT: movq %rdx, %r25 -; APX-NEXT: movq %rax, %r28 -; APX-NEXT: addq %r8, %r28 -; APX-NEXT: adcq $0, %r25 -; APX-NEXT: movq %r21, %rax -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r28, %r16 -; APX-NEXT: adcq %r25, %r8 -; APX-NEXT: setb %r18b -; APX-NEXT: movq %r30, %rax -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %r21 -; APX-NEXT: movq %rax, %r28 -; APX-NEXT: addq %r8, %r28 -; APX-NEXT: movzbl %r18b, %eax -; APX-NEXT: adcq %rax, %r21 -; APX-NEXT: addq %r26, %r28 -; APX-NEXT: adcq %r10, %r21 -; APX-NEXT: movq 112(%r24), %rcx -; APX-NEXT: movq %r27, %rax -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rax, %r8 -; APX-NEXT: imulq %r11, %rcx -; APX-NEXT: addq %rdx, %rcx -; APX-NEXT: movq 120(%r24), %rax -; APX-NEXT: imulq %r27, %rax -; APX-NEXT: addq %rax, %rcx -; APX-NEXT: movq 96(%r24), %r25 -; APX-NEXT: movq 104(%r24), %r26 -; APX-NEXT: movq %rdi, %rax -; APX-NEXT: imulq %r26, %rdi -; APX-NEXT: mulq %r25 -; APX-NEXT: movq %rax, %r29 -; APX-NEXT: addq %rdi, %rdx -; APX-NEXT: imulq %r25, %r23 -; APX-NEXT: addq %rdx, %r23 -; APX-NEXT: addq %r8, %r29 -; APX-NEXT: adcq %rcx, %r23 -; APX-NEXT: movq %r25, %rax -; APX-NEXT: mulq %r27 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r20 -; APX-NEXT: movq %r26, %rax -; APX-NEXT: mulq %r27 -; APX-NEXT: movq %rdx, %r27 -; APX-NEXT: movq %rax, %r30 -; APX-NEXT: addq %r8, %r30 -; APX-NEXT: adcq $0, %r27 -; APX-NEXT: movq %r25, %rax -; APX-NEXT: mulq %r11 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r25 -; APX-NEXT: addq %r30, %r25 -; APX-NEXT: adcq %r27, %r8 -; APX-NEXT: setb %cl -; 
APX-NEXT: movq %r26, %rax -; APX-NEXT: mulq %r11 -; APX-NEXT: movq %rdx, %r24 -; APX-NEXT: movq %rax, %r27 -; APX-NEXT: addq %r8, %r27 -; APX-NEXT: movzbl %cl, %eax -; APX-NEXT: adcq %rax, %r24 -; APX-NEXT: addq %r29, %r27 -; APX-NEXT: adcq %r23, %r24 -; APX-NEXT: addq %r9, %r20 -; APX-NEXT: adcq %r16, %r25 -; APX-NEXT: adcq %r28, %r27 -; APX-NEXT: adcq %r21, %r24 -; APX-NEXT: addq %rsi, %r20 -; APX-NEXT: adcq %rbx, %r25 -; APX-NEXT: adcq %r15, %r27 -; APX-NEXT: adcq %r14, %r24 -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; APX-NEXT: movq 80(%r11), %rbx -; APX-NEXT: movq %rbx, %rax -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload -; APX-NEXT: mulq %r19 -; APX-NEXT: movq %rax, %r21 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq 88(%r11), %r28 -; APX-NEXT: movq %r28, %rax -; APX-NEXT: mulq %r19 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq %rbx, %rax -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r17 # 8-byte Reload -; APX-NEXT: mulq %r17 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r26 -; APX-NEXT: addq %r16, %r26 -; APX-NEXT: adcq %r9, %r8 -; APX-NEXT: setb %cl -; APX-NEXT: movq %r28, %rax -; APX-NEXT: mulq %r17 -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r16 -; APX-NEXT: addq %r8, %r16 -; APX-NEXT: movzbl %cl, %eax -; APX-NEXT: adcq %rax, %r9 -; APX-NEXT: movq 64(%r11), %r15 -; APX-NEXT: movq %r15, %rax -; APX-NEXT: mulq %r19 -; APX-NEXT: movq %rax, %r23 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq 72(%r11), %r14 -; APX-NEXT: movq %r14, %rax -; APX-NEXT: mulq %r19 -; APX-NEXT: movq %rdx, %r30 -; APX-NEXT: movq %rax, %r31 -; APX-NEXT: addq %r8, %r31 -; APX-NEXT: adcq $0, %r30 -; APX-NEXT: movq %r15, %rax -; APX-NEXT: mulq %r17 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r29 -; APX-NEXT: addq %r31, %r29 -; APX-NEXT: adcq %r30, %r8 -; APX-NEXT: setb %cl -; APX-NEXT: movq %r14, %rax -; APX-NEXT: mulq %r17 -; APX-NEXT: movq %rdx, %r31 -; APX-NEXT: movq %rax, %r13 -; APX-NEXT: addq %r8, %r13 -; APX-NEXT: movzbl %cl, %eax -; APX-NEXT: adcq %rax, %r31 -; APX-NEXT: addq %r21, %r13 -; APX-NEXT: adcq %r26, %r31 -; APX-NEXT: adcq $0, %r16 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq %r15, %rax -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; APX-NEXT: mulq %rdi -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r30 -; APX-NEXT: movq %r14, %rax -; APX-NEXT: mulq %rdi -; APX-NEXT: movq %rdx, %r26 -; APX-NEXT: movq %rax, %rcx -; APX-NEXT: addq %r8, %rcx -; APX-NEXT: adcq $0, %r26 -; APX-NEXT: movq %r15, %rax -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Reload -; APX-NEXT: mulq %r18 -; APX-NEXT: movq %rdx, %r10 -; APX-NEXT: movq %rax, %r21 -; APX-NEXT: addq %rcx, %r21 -; APX-NEXT: adcq %r26, %r10 -; APX-NEXT: setb %cl -; APX-NEXT: movq %r14, %rax -; APX-NEXT: mulq %r18 -; APX-NEXT: movq %rdx, %r26 -; APX-NEXT: movq %rax, %r8 -; APX-NEXT: addq %r10, %r8 -; APX-NEXT: movzbl %cl, %eax -; APX-NEXT: adcq %rax, %r26 -; APX-NEXT: addq %r13, %r30 -; APX-NEXT: adcq %r31, %r21 -; APX-NEXT: adcq $0, %r8 -; APX-NEXT: adcq $0, %r26 -; APX-NEXT: addq %r16, %r8 -; APX-NEXT: adcq %r9, %r26 -; APX-NEXT: setb %sil -; APX-NEXT: movq %rbx, %rax -; APX-NEXT: mulq %rdi -; APX-NEXT: movq %rdx, %rcx -; APX-NEXT: movq %rax, %r31 -; APX-NEXT: movq %r28, %rax -; APX-NEXT: mulq %rdi -; APX-NEXT: movq %rdx, %r9 -; APX-NEXT: movq %rax, %r10 -; APX-NEXT: addq %rcx, %r10 -; APX-NEXT: adcq $0, %r9 -; APX-NEXT: movq %rbx, %rax -; APX-NEXT: mulq %r18 -; APX-NEXT: 
movq %rdx, %rcx -; APX-NEXT: movq %rax, %r13 -; APX-NEXT: addq %r10, %r13 -; APX-NEXT: adcq %r9, %rcx -; APX-NEXT: setb %r10b -; APX-NEXT: movq %r28, %rax -; APX-NEXT: mulq %r18 -; APX-NEXT: movq %rdx, %r16 -; APX-NEXT: movq %rax, %r9 -; APX-NEXT: addq %rcx, %r9 -; APX-NEXT: movzbl %r10b, %eax -; APX-NEXT: adcq %rax, %r16 -; APX-NEXT: addq %r8, %r31 -; APX-NEXT: adcq %r26, %r13 -; APX-NEXT: movzbl %sil, %eax -; APX-NEXT: adcq %rax, %r9 -; APX-NEXT: adcq $0, %r16 -; APX-NEXT: movq 96(%r11), %rcx -; APX-NEXT: imulq %rcx, %r18 -; APX-NEXT: movq %rcx, %rax -; APX-NEXT: mulq %rdi -; APX-NEXT: movq %rax, %r8 -; APX-NEXT: addq %r18, %rdx -; APX-NEXT: movq 104(%r11), %r26 -; APX-NEXT: movq %rdi, %rax -; APX-NEXT: imulq %r26, %rax -; APX-NEXT: addq %rdx, %rax -; APX-NEXT: movq %rax, %r10 -; APX-NEXT: movq 112(%r11), %rax -; APX-NEXT: movq %rax, %rsi -; APX-NEXT: imulq %r17, %rsi -; APX-NEXT: mulq %r19 -; APX-NEXT: movq %rax, %rdi -; APX-NEXT: addq %rsi, %rdx -; APX-NEXT: movq 120(%r11), %r18 -; APX-NEXT: imulq %r19, %r18 -; APX-NEXT: addq %rdx, %r18 -; APX-NEXT: addq %r8, %rdi -; APX-NEXT: adcq %r10, %r18 -; APX-NEXT: movq %r19, %rax -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %rsi -; APX-NEXT: movq %r17, %rax -; APX-NEXT: mulq %rcx -; APX-NEXT: movq %rdx, %rcx -; APX-NEXT: movq %rax, %r10 -; APX-NEXT: addq %r8, %r10 -; APX-NEXT: adcq $0, %rcx -; APX-NEXT: movq %r19, %rax -; APX-NEXT: mulq %r26 -; APX-NEXT: movq %rdx, %r8 -; APX-NEXT: movq %rax, %r11 -; APX-NEXT: addq %r10, %r11 -; APX-NEXT: adcq %rcx, %r8 -; APX-NEXT: setb %cl -; APX-NEXT: movq %r17, %rax -; APX-NEXT: mulq %r26 -; APX-NEXT: movq %rdx, %r10 -; APX-NEXT: movq %rax, %r17 -; APX-NEXT: addq %r8, %r17 -; APX-NEXT: movzbl %cl, %eax -; APX-NEXT: adcq %rax, %r10 -; APX-NEXT: addq %rdi, %r17 -; APX-NEXT: adcq %r18, %r10 -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; APX-NEXT: imulq %r15, %rdi -; APX-NEXT: movq %r15, %rax -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; APX-NEXT: mulq %r8 -; APX-NEXT: movq %rax, %rcx -; APX-NEXT: addq %rdi, %rdx -; APX-NEXT: movq %r8, %rax -; APX-NEXT: imulq %r14, %rax -; APX-NEXT: addq %rdx, %rax -; APX-NEXT: movq %rax, %r18 -; APX-NEXT: movq %rbx, %rdi -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload -; APX-NEXT: imulq %r19, %rdi -; APX-NEXT: movq %rbx, %rax -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; APX-NEXT: mulq %r8 -; APX-NEXT: movq %rax, %r26 -; APX-NEXT: addq %rdi, %rdx -; APX-NEXT: imulq %r8, %r28 -; APX-NEXT: addq %rdx, %r28 -; APX-NEXT: addq %rcx, %r26 -; APX-NEXT: adcq %r18, %r28 -; APX-NEXT: movq %r8, %rax -; APX-NEXT: movq %r8, %rdi -; APX-NEXT: mulq %r15 -; APX-NEXT: movq %rdx, %rcx -; APX-NEXT: movq %rax, %r8 -; APX-NEXT: movq %r19, %rax -; APX-NEXT: mulq %r15 -; APX-NEXT: movq %rdx, %rbx -; APX-NEXT: movq %rax, %r15 -; APX-NEXT: addq %rcx, %r15 -; APX-NEXT: adcq $0, %rbx -; APX-NEXT: movq %rdi, %rax -; APX-NEXT: mulq %r14 -; APX-NEXT: movq %rdx, %rcx -; APX-NEXT: movq %rax, %r18 -; APX-NEXT: addq %r15, %r18 -; APX-NEXT: adcq %rbx, %rcx -; APX-NEXT: setb %dil -; APX-NEXT: movq %r19, %rax -; APX-NEXT: mulq %r14 -; APX-NEXT: addq %rcx, %rax -; APX-NEXT: movzbl %dil, %ecx -; APX-NEXT: adcq %rcx, %rdx -; APX-NEXT: addq %r26, %rax -; APX-NEXT: adcq %r28, %rdx -; APX-NEXT: addq %rsi, %r8 -; APX-NEXT: adcq %r11, %r18 -; APX-NEXT: adcq %r17, %rax -; APX-NEXT: adcq %r10, %rdx -; APX-NEXT: addq %r31, %r8 -; APX-NEXT: adcq %r13, %r18 -; APX-NEXT: adcq %r9, %rax -; APX-NEXT: adcq %r16, %rdx -; 
APX-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Folded Reload -; APX-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload -; APX-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r30 # 8-byte Folded Reload -; APX-NEXT: adcq %r22, %r21 -; APX-NEXT: adcq %r20, %r8 -; APX-NEXT: adcq %r25, %r18 -; APX-NEXT: adcq %r27, %rax -; APX-NEXT: adcq %r24, %rdx -; APX-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Folded Reload -; APX-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload -; APX-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r30 # 8-byte Folded Reload -; APX-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload -; APX-NEXT: adcq (%rsp), %r8 # 8-byte Folded Reload -; APX-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Folded Reload -; APX-NEXT: adcq %r12, %rax -; APX-NEXT: adcq %rbp, %rdx -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; APX-NEXT: movq %rsi, (%rcx) -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; APX-NEXT: movq %rsi, 8(%rcx) -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; APX-NEXT: movq %rsi, 16(%rcx) -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; APX-NEXT: movq %rsi, 24(%rcx) -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; APX-NEXT: movq %rsi, 32(%rcx) -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; APX-NEXT: movq %rsi, 40(%rcx) -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; APX-NEXT: movq %rsi, 48(%rcx) -; APX-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload -; APX-NEXT: movq %rsi, 56(%rcx) -; APX-NEXT: movq %r23, 64(%rcx) -; APX-NEXT: movq %r29, 72(%rcx) -; APX-NEXT: movq %r30, 80(%rcx) -; APX-NEXT: movq %r21, 88(%rcx) -; APX-NEXT: movq %r8, 96(%rcx) -; APX-NEXT: movq %r18, 104(%rcx) -; APX-NEXT: movq %rax, 112(%rcx) -; APX-NEXT: movq %rdx, 120(%rcx) -; APX-NEXT: addq $104, %rsp -; APX-NEXT: popq %rbx -; APX-NEXT: popq %r12 -; APX-NEXT: popq %r13 -; APX-NEXT: popq %r14 -; APX-NEXT: popq %r15 -; APX-NEXT: popq %rbp -; APX-NEXT: retq +; EGPR-LABEL: test_1024: +; EGPR: # %bb.0: +; EGPR-NEXT: pushq %rbp +; EGPR-NEXT: pushq %r15 +; EGPR-NEXT: pushq %r14 +; EGPR-NEXT: pushq %r13 +; EGPR-NEXT: pushq %r12 +; EGPR-NEXT: pushq %rbx +; EGPR-NEXT: subq $104, %rsp +; EGPR-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %rdi, %r24 +; EGPR-NEXT: movq (%rdi), %r13 +; EGPR-NEXT: movq 8(%rdi), %r18 +; EGPR-NEXT: movq 24(%rdi), %r29 +; EGPR-NEXT: movq 16(%rdi), %r17 +; EGPR-NEXT: movq 40(%rdi), %rdi +; EGPR-NEXT: movq 32(%r24), %r10 +; EGPR-NEXT: movq 56(%r24), %r15 +; EGPR-NEXT: movq 48(%r24), %r12 +; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq 24(%rsi), %r23 +; EGPR-NEXT: movq 16(%rsi), %r11 +; EGPR-NEXT: movq (%rsi), %r27 +; EGPR-NEXT: movq 8(%rsi), %r14 +; EGPR-NEXT: movq %r12, %rax +; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r19 +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq %r12, %rax +; EGPR-NEXT: mulq %r14 +; EGPR-NEXT: movq %rdx, %r20 +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: addq %r16, %r8 +; EGPR-NEXT: adcq %r9, %r20 +; EGPR-NEXT: setb %al +; EGPR-NEXT: movzbl %al, %ecx +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: mulq %r14 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq 
%r20, %r16 +; EGPR-NEXT: adcq %rcx, %r9 +; EGPR-NEXT: movq %r10, %rax +; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: movq %rdx, %r20 +; EGPR-NEXT: movq %rax, %r25 +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: movq %rdx, %r21 +; EGPR-NEXT: movq %rax, %r22 +; EGPR-NEXT: addq %r20, %r22 +; EGPR-NEXT: adcq $0, %r21 +; EGPR-NEXT: movq %r10, %rax +; EGPR-NEXT: mulq %r14 +; EGPR-NEXT: movq %rdx, %r20 +; EGPR-NEXT: movq %rax, %r28 +; EGPR-NEXT: addq %r22, %r28 +; EGPR-NEXT: adcq %r21, %r20 +; EGPR-NEXT: setb %al +; EGPR-NEXT: movzbl %al, %ecx +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r14 +; EGPR-NEXT: movq %rdx, %r21 +; EGPR-NEXT: movq %rax, %r22 +; EGPR-NEXT: addq %r20, %r22 +; EGPR-NEXT: adcq %rcx, %r21 +; EGPR-NEXT: addq %r19, %r22 +; EGPR-NEXT: adcq %r8, %r21 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %r10, %rax +; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r30 +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %rdx, %r19 +; EGPR-NEXT: movq %rax, %r20 +; EGPR-NEXT: addq %r8, %r20 +; EGPR-NEXT: adcq $0, %r19 +; EGPR-NEXT: movq %r10, %rax +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %rbx +; EGPR-NEXT: movq %rax, %r31 +; EGPR-NEXT: addq %r20, %r31 +; EGPR-NEXT: adcq %r19, %rbx +; EGPR-NEXT: setb %al +; EGPR-NEXT: movzbl %al, %ecx +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %r26 +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: addq %rbx, %r8 +; EGPR-NEXT: adcq %rcx, %r26 +; EGPR-NEXT: addq %r22, %r30 +; EGPR-NEXT: adcq %r21, %r31 +; EGPR-NEXT: adcq $0, %r8 +; EGPR-NEXT: adcq $0, %r26 +; EGPR-NEXT: addq %r16, %r8 +; EGPR-NEXT: adcq %r9, %r26 +; EGPR-NEXT: setb %al +; EGPR-NEXT: movzbl %al, %ecx +; EGPR-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %r12, %rax +; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %rsi +; EGPR-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %rdx, %r16 +; EGPR-NEXT: movq %rax, %r21 +; EGPR-NEXT: addq %r9, %r21 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: movq %r12, %rax +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %rdi +; EGPR-NEXT: addq %r21, %rdi +; EGPR-NEXT: adcq %r16, %r9 +; EGPR-NEXT: setb %al +; EGPR-NEXT: movzbl %al, %r10d +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %r21 +; EGPR-NEXT: movq %rax, %r22 +; EGPR-NEXT: addq %r9, %r22 +; EGPR-NEXT: adcq %r10, %r21 +; EGPR-NEXT: addq %r8, %rsi +; EGPR-NEXT: movq %rsi, %r19 +; EGPR-NEXT: adcq %r26, %rdi +; EGPR-NEXT: adcq %rcx, %r22 +; EGPR-NEXT: adcq $0, %r21 +; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %rbx +; EGPR-NEXT: movq %r29, %rax +; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: mulq %r14 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r26 +; EGPR-NEXT: addq %r16, %r26 +; EGPR-NEXT: adcq %r9, %r8 +; EGPR-NEXT: setb %al +; EGPR-NEXT: movzbl %al, %ecx +; EGPR-NEXT: movq %r29, %rax +; EGPR-NEXT: mulq %r14 +; EGPR-NEXT: movq %r14, %rsi +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: adcq %rcx, %r9 +; 
EGPR-NEXT: movq %r13, %rax +; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %r18, %rax +; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: movq %rdx, %r14 +; EGPR-NEXT: movq %rax, %r15 +; EGPR-NEXT: addq %r8, %r15 +; EGPR-NEXT: adcq $0, %r14 +; EGPR-NEXT: movq %r13, %rax +; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: mulq %rsi +; EGPR-NEXT: movq %rdx, %r12 +; EGPR-NEXT: addq %r15, %rax +; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r14, %r12 +; EGPR-NEXT: setb %cl +; EGPR-NEXT: movq %r18, %rax +; EGPR-NEXT: mulq %rsi +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r15 +; EGPR-NEXT: addq %r12, %r15 +; EGPR-NEXT: movzbl %cl, %eax +; EGPR-NEXT: adcq %rax, %r8 +; EGPR-NEXT: addq %rbx, %r15 +; EGPR-NEXT: adcq %r26, %r8 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq %r13, %rax +; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %rdx, %r26 +; EGPR-NEXT: movq %rax, %rsi +; EGPR-NEXT: movq %r18, %rax +; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %rdx, %rbx +; EGPR-NEXT: movq %rax, %r14 +; EGPR-NEXT: addq %r26, %r14 +; EGPR-NEXT: adcq $0, %rbx +; EGPR-NEXT: movq %r13, %rax +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %r12 +; EGPR-NEXT: addq %r14, %rax +; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: adcq %rbx, %r12 +; EGPR-NEXT: setb %cl +; EGPR-NEXT: movq %r18, %rax +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %r14 +; EGPR-NEXT: movq %rax, %r26 +; EGPR-NEXT: addq %r12, %r26 +; EGPR-NEXT: movzbl %cl, %eax +; EGPR-NEXT: adcq %rax, %r14 +; EGPR-NEXT: addq %r15, %rsi +; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r8, %r10 +; EGPR-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq $0, %r26 +; EGPR-NEXT: adcq $0, %r14 +; EGPR-NEXT: addq %r16, %r26 +; EGPR-NEXT: adcq %r9, %r14 +; EGPR-NEXT: setb %cl +; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %rbx +; EGPR-NEXT: movq %r29, %rax +; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r15 +; EGPR-NEXT: addq %r16, %r15 +; EGPR-NEXT: adcq %r9, %r8 +; EGPR-NEXT: setb %r9b +; EGPR-NEXT: movq %r29, %rax +; EGPR-NEXT: mulq %r23 +; EGPR-NEXT: movq %rdx, %r12 +; EGPR-NEXT: movq %rax, %rbp +; EGPR-NEXT: addq %r8, %rbp +; EGPR-NEXT: movzbl %r9b, %eax +; EGPR-NEXT: adcq %rax, %r12 +; EGPR-NEXT: addq %r26, %rbx +; EGPR-NEXT: adcq %r14, %r15 +; EGPR-NEXT: movzbl %cl, %eax +; EGPR-NEXT: adcq %rax, %rbp +; EGPR-NEXT: adcq $0, %r12 +; EGPR-NEXT: addq %r25, %rbx +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: movq 32(%rsi), %r25 +; EGPR-NEXT: adcq %r28, %r15 +; EGPR-NEXT: adcq %r30, %rbp +; EGPR-NEXT: adcq %r31, %r12 +; EGPR-NEXT: adcq $0, %r19 +; EGPR-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq $0, %rdi +; EGPR-NEXT: adcq $0, %r22 +; EGPR-NEXT: adcq $0, %r21 +; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r28 +; EGPR-NEXT: movq %r29, %rax +; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq 40(%rsi), %rcx 
+; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r26 +; EGPR-NEXT: addq %r16, %r26 +; EGPR-NEXT: adcq %r9, %r8 +; EGPR-NEXT: setb %r10b +; EGPR-NEXT: movq %r29, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: movzbl %r10b, %eax +; EGPR-NEXT: adcq %rax, %r9 +; EGPR-NEXT: movq %r13, %rax +; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r19 +; EGPR-NEXT: movq %r18, %rax +; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rdx, %r30 +; EGPR-NEXT: movq %rax, %r31 +; EGPR-NEXT: addq %r8, %r31 +; EGPR-NEXT: adcq $0, %r30 +; EGPR-NEXT: movq %r13, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r20 +; EGPR-NEXT: addq %r31, %r20 +; EGPR-NEXT: adcq %r30, %r8 +; EGPR-NEXT: setb %r10b +; EGPR-NEXT: movq %r18, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r30 +; EGPR-NEXT: movq %rax, %r31 +; EGPR-NEXT: addq %r8, %r31 +; EGPR-NEXT: movzbl %r10b, %eax +; EGPR-NEXT: adcq %rax, %r30 +; EGPR-NEXT: addq %r28, %r31 +; EGPR-NEXT: adcq %r26, %r30 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq 48(%rsi), %r28 +; EGPR-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %r13, %rax +; EGPR-NEXT: mulq %r28 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r11 +; EGPR-NEXT: movq %r18, %rax +; EGPR-NEXT: movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: mulq %r28 +; EGPR-NEXT: movq %rdx, %r26 +; EGPR-NEXT: movq %rax, %r14 +; EGPR-NEXT: addq %r8, %r14 +; EGPR-NEXT: adcq $0, %r26 +; EGPR-NEXT: movq 56(%rsi), %r10 +; EGPR-NEXT: movq %r13, %rax +; EGPR-NEXT: mulq %r10 +; EGPR-NEXT: movq %rdx, %r13 +; EGPR-NEXT: addq %r14, %rax +; EGPR-NEXT: movq %rax, %r14 +; EGPR-NEXT: adcq %r26, %r13 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %r18, %rax +; EGPR-NEXT: mulq %r10 +; EGPR-NEXT: movq %rdx, %r26 +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: addq %r13, %r8 +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: adcq %rax, %r26 +; EGPR-NEXT: addq %r31, %r11 +; EGPR-NEXT: adcq %r30, %r14 +; EGPR-NEXT: adcq $0, %r8 +; EGPR-NEXT: adcq $0, %r26 +; EGPR-NEXT: addq %r16, %r8 +; EGPR-NEXT: adcq %r9, %r26 +; EGPR-NEXT: setb %r18b +; EGPR-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: mulq %r28 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r30 +; EGPR-NEXT: movq %r29, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq %r29, %rax +; EGPR-NEXT: mulq %r28 +; EGPR-NEXT: movq %rdx, %r16 +; EGPR-NEXT: movq %rax, %r31 +; EGPR-NEXT: addq %r9, %r31 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: mulq %r10 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r17 +; EGPR-NEXT: addq %r31, %r17 +; EGPR-NEXT: adcq %r16, %r9 +; EGPR-NEXT: setb %r16b +; EGPR-NEXT: movq %r29, %rax +; EGPR-NEXT: mulq %r10 +; EGPR-NEXT: movq %rdx, %r13 +; EGPR-NEXT: movq %rax, %r31 +; EGPR-NEXT: addq %r9, %r31 +; EGPR-NEXT: movzbl %r16b, %eax +; EGPR-NEXT: adcq %rax, %r13 +; EGPR-NEXT: addq %r8, %r30 +; EGPR-NEXT: adcq %r26, %r17 +; EGPR-NEXT: movzbl %r18b, %eax +; EGPR-NEXT: adcq %rax, %r31 +; EGPR-NEXT: adcq $0, %r13 +; EGPR-NEXT: addq %rbx, %r19 +; EGPR-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r15, %r20 +; EGPR-NEXT: movq %r20, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %rbp, %r11 +; EGPR-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq 
%r12, %r14 +; EGPR-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq $0, %r30 +; EGPR-NEXT: adcq $0, %r17 +; EGPR-NEXT: adcq $0, %r31 +; EGPR-NEXT: adcq $0, %r13 +; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r30 # 8-byte Folded Reload +; EGPR-NEXT: adcq %rdi, %r17 +; EGPR-NEXT: adcq %r22, %r31 +; EGPR-NEXT: adcq %r21, %r13 +; EGPR-NEXT: setb %r15b +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: movq %rsi, %rax +; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r19 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Reload +; EGPR-NEXT: movq %r21, %rax +; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq %rsi, %rax +; EGPR-NEXT: movq %rsi, %r29 +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r20 +; EGPR-NEXT: addq %r16, %r20 +; EGPR-NEXT: adcq %r9, %r8 +; EGPR-NEXT: setb %r18b +; EGPR-NEXT: movq %r21, %rax +; EGPR-NEXT: movq %r21, %r14 +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: movzbl %r18b, %eax +; EGPR-NEXT: adcq %rax, %r9 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; EGPR-NEXT: movq %rbx, %rax +; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %rdi +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: movq %rsi, %rax +; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rdx, %r21 +; EGPR-NEXT: movq %rax, %r22 +; EGPR-NEXT: addq %r8, %r22 +; EGPR-NEXT: adcq $0, %r21 +; EGPR-NEXT: movq %rbx, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: addq %r22, %rax +; EGPR-NEXT: movq %rax, %r11 +; EGPR-NEXT: adcq %r21, %r8 +; EGPR-NEXT: setb %r18b +; EGPR-NEXT: movq %rsi, %rax +; EGPR-NEXT: movq %rsi, %r21 +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r22 +; EGPR-NEXT: movq %rax, %r26 +; EGPR-NEXT: addq %r8, %r26 +; EGPR-NEXT: movzbl %r18b, %eax +; EGPR-NEXT: adcq %rax, %r22 +; EGPR-NEXT: addq %r19, %r26 +; EGPR-NEXT: adcq %r20, %r22 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq %rbx, %rax +; EGPR-NEXT: mulq %r28 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %rsi +; EGPR-NEXT: movq %r21, %rax +; EGPR-NEXT: mulq %r28 +; EGPR-NEXT: movq %rdx, %r19 +; EGPR-NEXT: movq %rax, %r20 +; EGPR-NEXT: addq %r8, %r20 +; EGPR-NEXT: adcq $0, %r19 +; EGPR-NEXT: movq %rbx, %rax +; EGPR-NEXT: mulq %r10 +; EGPR-NEXT: movq %rdx, %rbx +; EGPR-NEXT: addq %r20, %rax +; EGPR-NEXT: movq %rax, %r20 +; EGPR-NEXT: adcq %r19, %rbx +; EGPR-NEXT: setb %r18b +; EGPR-NEXT: movq %r21, %rax +; EGPR-NEXT: mulq %r10 +; EGPR-NEXT: movq %rdx, %r21 +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: addq %rbx, %r8 +; EGPR-NEXT: movzbl %r18b, %eax +; EGPR-NEXT: adcq %rax, %r21 +; EGPR-NEXT: addq %r26, %rsi +; EGPR-NEXT: adcq %r22, %r20 +; EGPR-NEXT: adcq $0, %r8 +; EGPR-NEXT: adcq $0, %r21 +; EGPR-NEXT: addq %r16, %r8 +; EGPR-NEXT: adcq %r9, %r21 +; EGPR-NEXT: setb %r18b +; EGPR-NEXT: movq %r29, %rax +; EGPR-NEXT: mulq %r28 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r22 +; EGPR-NEXT: movq %r14, %rax +; EGPR-NEXT: mulq %r28 +; EGPR-NEXT: movq %rdx, %r16 +; EGPR-NEXT: movq %rax, %r19 +; EGPR-NEXT: addq %r9, %r19 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: movq %r29, %rax +; EGPR-NEXT: mulq %r10 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: addq %r19, %rax +; EGPR-NEXT: movq %rax, %r19 +; EGPR-NEXT: adcq %r16, 
%r9 +; EGPR-NEXT: setb %r16b +; EGPR-NEXT: movq %r14, %rax +; EGPR-NEXT: mulq %r10 +; EGPR-NEXT: movq %rdx, %rbp +; EGPR-NEXT: movq %rax, %r12 +; EGPR-NEXT: addq %r9, %r12 +; EGPR-NEXT: movzbl %r16b, %eax +; EGPR-NEXT: adcq %rax, %rbp +; EGPR-NEXT: addq %r8, %r22 +; EGPR-NEXT: adcq %r21, %r19 +; EGPR-NEXT: movzbl %r18b, %eax +; EGPR-NEXT: adcq %rax, %r12 +; EGPR-NEXT: adcq $0, %rbp +; EGPR-NEXT: addq %r30, %rdi +; EGPR-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r17, %r11 +; EGPR-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r31, %rsi +; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r13, %r20 +; EGPR-NEXT: movq %r20, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movzbl %r15b, %eax +; EGPR-NEXT: adcq %rax, %r22 +; EGPR-NEXT: movq %r22, (%rsp) # 8-byte Spill +; EGPR-NEXT: adcq $0, %r19 +; EGPR-NEXT: movq %r19, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq $0, %r12 +; EGPR-NEXT: adcq $0, %rbp +; EGPR-NEXT: movq 64(%r24), %r21 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r21 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r22 +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %r21 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq 72(%r24), %r30 +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r26 +; EGPR-NEXT: addq %r16, %r26 +; EGPR-NEXT: adcq %r9, %r8 +; EGPR-NEXT: setb %r18b +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: movzbl %r18b, %eax +; EGPR-NEXT: adcq %rax, %r9 +; EGPR-NEXT: movq %r27, %rax +; EGPR-NEXT: mulq %r21 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; EGPR-NEXT: movq %r11, %rax +; EGPR-NEXT: mulq %r21 +; EGPR-NEXT: movq %rdx, %r31 +; EGPR-NEXT: movq %rax, %rbx +; EGPR-NEXT: addq %r8, %rbx +; EGPR-NEXT: adcq $0, %r31 +; EGPR-NEXT: movq %r27, %rax +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: addq %rbx, %rax +; EGPR-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r31, %r8 +; EGPR-NEXT: setb %r18b +; EGPR-NEXT: movq %r11, %rax +; EGPR-NEXT: mulq %r30 +; EGPR-NEXT: movq %rdx, %r31 +; EGPR-NEXT: movq %rax, %rbx +; EGPR-NEXT: addq %r8, %rbx +; EGPR-NEXT: movzbl %r18b, %eax +; EGPR-NEXT: adcq %rax, %r31 +; EGPR-NEXT: addq %r22, %rbx +; EGPR-NEXT: adcq %r26, %r31 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq 80(%r24), %r13 +; EGPR-NEXT: movq %r27, %rax +; EGPR-NEXT: mulq %r13 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %rsi +; EGPR-NEXT: movq %r11, %rax +; EGPR-NEXT: mulq %r13 +; EGPR-NEXT: movq %rdx, %r26 +; EGPR-NEXT: movq %rax, %r14 +; EGPR-NEXT: addq %r8, %r14 +; EGPR-NEXT: adcq $0, %r26 +; EGPR-NEXT: movq 88(%r24), %r18 +; EGPR-NEXT: movq %r27, %rax +; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdx, %r15 +; EGPR-NEXT: movq %rax, %r22 +; EGPR-NEXT: addq %r14, %r22 +; EGPR-NEXT: adcq %r26, %r15 +; EGPR-NEXT: setb %r14b +; EGPR-NEXT: movq %r11, %rax +; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdx, %r26 +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: addq %r15, %r8 +; EGPR-NEXT: movzbl %r14b, %eax +; EGPR-NEXT: adcq %rax, %r26 +; EGPR-NEXT: addq 
%rbx, %rsi +; EGPR-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NEXT: adcq %r31, %r22 +; EGPR-NEXT: adcq $0, %r8 +; EGPR-NEXT: adcq $0, %r26 +; EGPR-NEXT: addq %r16, %r8 +; EGPR-NEXT: adcq %r9, %r26 +; EGPR-NEXT: setb %r31b +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r13 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %rsi +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %r13 +; EGPR-NEXT: movq %rdx, %r16 +; EGPR-NEXT: movq %rax, %r14 +; EGPR-NEXT: addq %r9, %r14 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %rbx +; EGPR-NEXT: addq %r14, %rbx +; EGPR-NEXT: adcq %r16, %r9 +; EGPR-NEXT: setb %r16b +; EGPR-NEXT: movq %r23, %rax +; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdx, %r14 +; EGPR-NEXT: movq %rax, %r15 +; EGPR-NEXT: addq %r9, %r15 +; EGPR-NEXT: movzbl %r16b, %eax +; EGPR-NEXT: adcq %rax, %r14 +; EGPR-NEXT: addq %r8, %rsi +; EGPR-NEXT: adcq %r26, %rbx +; EGPR-NEXT: movzbl %r31b, %eax +; EGPR-NEXT: adcq %rax, %r15 +; EGPR-NEXT: adcq $0, %r14 +; EGPR-NEXT: imulq %r25, %r18 +; EGPR-NEXT: movq %r25, %rax +; EGPR-NEXT: mulq %r13 +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: addq %r18, %rdx +; EGPR-NEXT: imulq %rcx, %r13 +; EGPR-NEXT: addq %rdx, %r13 +; EGPR-NEXT: movq %r28, %r9 +; EGPR-NEXT: imulq %r30, %r9 +; EGPR-NEXT: movq %r28, %rax +; EGPR-NEXT: mulq %r21 +; EGPR-NEXT: movq %rax, %r26 +; EGPR-NEXT: addq %r9, %rdx +; EGPR-NEXT: imulq %r21, %r10 +; EGPR-NEXT: addq %rdx, %r10 +; EGPR-NEXT: addq %r8, %r26 +; EGPR-NEXT: adcq %r13, %r10 +; EGPR-NEXT: movq %r21, %rax +; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: movq %r30, %rax +; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rdx, %r25 +; EGPR-NEXT: movq %rax, %r28 +; EGPR-NEXT: addq %r8, %r28 +; EGPR-NEXT: adcq $0, %r25 +; EGPR-NEXT: movq %r21, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r28, %r16 +; EGPR-NEXT: adcq %r25, %r8 +; EGPR-NEXT: setb %r18b +; EGPR-NEXT: movq %r30, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r21 +; EGPR-NEXT: movq %rax, %r28 +; EGPR-NEXT: addq %r8, %r28 +; EGPR-NEXT: movzbl %r18b, %eax +; EGPR-NEXT: adcq %rax, %r21 +; EGPR-NEXT: addq %r26, %r28 +; EGPR-NEXT: adcq %r10, %r21 +; EGPR-NEXT: movq 112(%r24), %rcx +; EGPR-NEXT: movq %r27, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: imulq %r11, %rcx +; EGPR-NEXT: addq %rdx, %rcx +; EGPR-NEXT: movq 120(%r24), %rax +; EGPR-NEXT: imulq %r27, %rax +; EGPR-NEXT: addq %rax, %rcx +; EGPR-NEXT: movq 96(%r24), %r25 +; EGPR-NEXT: movq 104(%r24), %r26 +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: imulq %r26, %rdi +; EGPR-NEXT: mulq %r25 +; EGPR-NEXT: movq %rax, %r29 +; EGPR-NEXT: addq %rdi, %rdx +; EGPR-NEXT: imulq %r25, %r23 +; EGPR-NEXT: addq %rdx, %r23 +; EGPR-NEXT: addq %r8, %r29 +; EGPR-NEXT: adcq %rcx, %r23 +; EGPR-NEXT: movq %r25, %rax +; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r20 +; EGPR-NEXT: movq %r26, %rax +; EGPR-NEXT: mulq %r27 +; EGPR-NEXT: movq %rdx, %r27 +; EGPR-NEXT: movq %rax, %r30 +; EGPR-NEXT: addq %r8, %r30 +; EGPR-NEXT: adcq $0, %r27 +; EGPR-NEXT: movq %r25, %rax +; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r25 +; EGPR-NEXT: addq %r30, %r25 +; EGPR-NEXT: adcq %r27, %r8 +; EGPR-NEXT: setb %cl +; EGPR-NEXT: movq %r26, %rax +; EGPR-NEXT: mulq %r11 +; EGPR-NEXT: movq %rdx, %r24 +; EGPR-NEXT: movq %rax, %r27 +; EGPR-NEXT: addq %r8, %r27 
+; EGPR-NEXT: movzbl %cl, %eax +; EGPR-NEXT: adcq %rax, %r24 +; EGPR-NEXT: addq %r29, %r27 +; EGPR-NEXT: adcq %r23, %r24 +; EGPR-NEXT: addq %r9, %r20 +; EGPR-NEXT: adcq %r16, %r25 +; EGPR-NEXT: adcq %r28, %r27 +; EGPR-NEXT: adcq %r21, %r24 +; EGPR-NEXT: addq %rsi, %r20 +; EGPR-NEXT: adcq %rbx, %r25 +; EGPR-NEXT: adcq %r15, %r27 +; EGPR-NEXT: adcq %r14, %r24 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; EGPR-NEXT: movq 80(%r11), %rbx +; EGPR-NEXT: movq %rbx, %rax +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload +; EGPR-NEXT: mulq %r19 +; EGPR-NEXT: movq %rax, %r21 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq 88(%r11), %r28 +; EGPR-NEXT: movq %r28, %rax +; EGPR-NEXT: mulq %r19 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq %rbx, %rax +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r17 # 8-byte Reload +; EGPR-NEXT: mulq %r17 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r26 +; EGPR-NEXT: addq %r16, %r26 +; EGPR-NEXT: adcq %r9, %r8 +; EGPR-NEXT: setb %cl +; EGPR-NEXT: movq %r28, %rax +; EGPR-NEXT: mulq %r17 +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r16 +; EGPR-NEXT: addq %r8, %r16 +; EGPR-NEXT: movzbl %cl, %eax +; EGPR-NEXT: adcq %rax, %r9 +; EGPR-NEXT: movq 64(%r11), %r15 +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: mulq %r19 +; EGPR-NEXT: movq %rax, %r23 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq 72(%r11), %r14 +; EGPR-NEXT: movq %r14, %rax +; EGPR-NEXT: mulq %r19 +; EGPR-NEXT: movq %rdx, %r30 +; EGPR-NEXT: movq %rax, %r31 +; EGPR-NEXT: addq %r8, %r31 +; EGPR-NEXT: adcq $0, %r30 +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: mulq %r17 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r29 +; EGPR-NEXT: addq %r31, %r29 +; EGPR-NEXT: adcq %r30, %r8 +; EGPR-NEXT: setb %cl +; EGPR-NEXT: movq %r14, %rax +; EGPR-NEXT: mulq %r17 +; EGPR-NEXT: movq %rdx, %r31 +; EGPR-NEXT: movq %rax, %r13 +; EGPR-NEXT: addq %r8, %r13 +; EGPR-NEXT: movzbl %cl, %eax +; EGPR-NEXT: adcq %rax, %r31 +; EGPR-NEXT: addq %r21, %r13 +; EGPR-NEXT: adcq %r26, %r31 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NEXT: mulq %rdi +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r30 +; EGPR-NEXT: movq %r14, %rax +; EGPR-NEXT: mulq %rdi +; EGPR-NEXT: movq %rdx, %r26 +; EGPR-NEXT: movq %rax, %rcx +; EGPR-NEXT: addq %r8, %rcx +; EGPR-NEXT: adcq $0, %r26 +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Reload +; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdx, %r10 +; EGPR-NEXT: movq %rax, %r21 +; EGPR-NEXT: addq %rcx, %r21 +; EGPR-NEXT: adcq %r26, %r10 +; EGPR-NEXT: setb %cl +; EGPR-NEXT: movq %r14, %rax +; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdx, %r26 +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: addq %r10, %r8 +; EGPR-NEXT: movzbl %cl, %eax +; EGPR-NEXT: adcq %rax, %r26 +; EGPR-NEXT: addq %r13, %r30 +; EGPR-NEXT: adcq %r31, %r21 +; EGPR-NEXT: adcq $0, %r8 +; EGPR-NEXT: adcq $0, %r26 +; EGPR-NEXT: addq %r16, %r8 +; EGPR-NEXT: adcq %r9, %r26 +; EGPR-NEXT: setb %sil +; EGPR-NEXT: movq %rbx, %rax +; EGPR-NEXT: mulq %rdi +; EGPR-NEXT: movq %rdx, %rcx +; EGPR-NEXT: movq %rax, %r31 +; EGPR-NEXT: movq %r28, %rax +; EGPR-NEXT: mulq %rdi +; EGPR-NEXT: movq %rdx, %r9 +; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: addq %rcx, %r10 +; EGPR-NEXT: adcq $0, %r9 +; EGPR-NEXT: movq %rbx, %rax +; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdx, %rcx +; 
EGPR-NEXT: movq %rax, %r13 +; EGPR-NEXT: addq %r10, %r13 +; EGPR-NEXT: adcq %r9, %rcx +; EGPR-NEXT: setb %r10b +; EGPR-NEXT: movq %r28, %rax +; EGPR-NEXT: mulq %r18 +; EGPR-NEXT: movq %rdx, %r16 +; EGPR-NEXT: movq %rax, %r9 +; EGPR-NEXT: addq %rcx, %r9 +; EGPR-NEXT: movzbl %r10b, %eax +; EGPR-NEXT: adcq %rax, %r16 +; EGPR-NEXT: addq %r8, %r31 +; EGPR-NEXT: adcq %r26, %r13 +; EGPR-NEXT: movzbl %sil, %eax +; EGPR-NEXT: adcq %rax, %r9 +; EGPR-NEXT: adcq $0, %r16 +; EGPR-NEXT: movq 96(%r11), %rcx +; EGPR-NEXT: imulq %rcx, %r18 +; EGPR-NEXT: movq %rcx, %rax +; EGPR-NEXT: mulq %rdi +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: addq %r18, %rdx +; EGPR-NEXT: movq 104(%r11), %r26 +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: imulq %r26, %rax +; EGPR-NEXT: addq %rdx, %rax +; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: movq 112(%r11), %rax +; EGPR-NEXT: movq %rax, %rsi +; EGPR-NEXT: imulq %r17, %rsi +; EGPR-NEXT: mulq %r19 +; EGPR-NEXT: movq %rax, %rdi +; EGPR-NEXT: addq %rsi, %rdx +; EGPR-NEXT: movq 120(%r11), %r18 +; EGPR-NEXT: imulq %r19, %r18 +; EGPR-NEXT: addq %rdx, %r18 +; EGPR-NEXT: addq %r8, %rdi +; EGPR-NEXT: adcq %r10, %r18 +; EGPR-NEXT: movq %r19, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %rsi +; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: mulq %rcx +; EGPR-NEXT: movq %rdx, %rcx +; EGPR-NEXT: movq %rax, %r10 +; EGPR-NEXT: addq %r8, %r10 +; EGPR-NEXT: adcq $0, %rcx +; EGPR-NEXT: movq %r19, %rax +; EGPR-NEXT: mulq %r26 +; EGPR-NEXT: movq %rdx, %r8 +; EGPR-NEXT: movq %rax, %r11 +; EGPR-NEXT: addq %r10, %r11 +; EGPR-NEXT: adcq %rcx, %r8 +; EGPR-NEXT: setb %cl +; EGPR-NEXT: movq %r17, %rax +; EGPR-NEXT: mulq %r26 +; EGPR-NEXT: movq %rdx, %r10 +; EGPR-NEXT: movq %rax, %r17 +; EGPR-NEXT: addq %r8, %r17 +; EGPR-NEXT: movzbl %cl, %eax +; EGPR-NEXT: adcq %rax, %r10 +; EGPR-NEXT: addq %rdi, %r17 +; EGPR-NEXT: adcq %r18, %r10 +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload +; EGPR-NEXT: imulq %r15, %rdi +; EGPR-NEXT: movq %r15, %rax +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; EGPR-NEXT: mulq %r8 +; EGPR-NEXT: movq %rax, %rcx +; EGPR-NEXT: addq %rdi, %rdx +; EGPR-NEXT: movq %r8, %rax +; EGPR-NEXT: imulq %r14, %rax +; EGPR-NEXT: addq %rdx, %rax +; EGPR-NEXT: movq %rax, %r18 +; EGPR-NEXT: movq %rbx, %rdi +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r19 # 8-byte Reload +; EGPR-NEXT: imulq %r19, %rdi +; EGPR-NEXT: movq %rbx, %rax +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; EGPR-NEXT: mulq %r8 +; EGPR-NEXT: movq %rax, %r26 +; EGPR-NEXT: addq %rdi, %rdx +; EGPR-NEXT: imulq %r8, %r28 +; EGPR-NEXT: addq %rdx, %r28 +; EGPR-NEXT: addq %rcx, %r26 +; EGPR-NEXT: adcq %r18, %r28 +; EGPR-NEXT: movq %r8, %rax +; EGPR-NEXT: movq %r8, %rdi +; EGPR-NEXT: mulq %r15 +; EGPR-NEXT: movq %rdx, %rcx +; EGPR-NEXT: movq %rax, %r8 +; EGPR-NEXT: movq %r19, %rax +; EGPR-NEXT: mulq %r15 +; EGPR-NEXT: movq %rdx, %rbx +; EGPR-NEXT: movq %rax, %r15 +; EGPR-NEXT: addq %rcx, %r15 +; EGPR-NEXT: adcq $0, %rbx +; EGPR-NEXT: movq %rdi, %rax +; EGPR-NEXT: mulq %r14 +; EGPR-NEXT: movq %rdx, %rcx +; EGPR-NEXT: movq %rax, %r18 +; EGPR-NEXT: addq %r15, %r18 +; EGPR-NEXT: adcq %rbx, %rcx +; EGPR-NEXT: setb %dil +; EGPR-NEXT: movq %r19, %rax +; EGPR-NEXT: mulq %r14 +; EGPR-NEXT: addq %rcx, %rax +; EGPR-NEXT: movzbl %dil, %ecx +; EGPR-NEXT: adcq %rcx, %rdx +; EGPR-NEXT: addq %r26, %rax +; EGPR-NEXT: adcq %r28, %rdx +; EGPR-NEXT: addq %rsi, %r8 +; EGPR-NEXT: adcq %r11, %r18 +; EGPR-NEXT: adcq %r17, %rax +; EGPR-NEXT: adcq %r10, %rdx +; EGPR-NEXT: addq 
%r31, %r8 +; EGPR-NEXT: adcq %r13, %r18 +; EGPR-NEXT: adcq %r9, %rax +; EGPR-NEXT: adcq %r16, %rdx +; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Folded Reload +; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload +; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r30 # 8-byte Folded Reload +; EGPR-NEXT: adcq %r22, %r21 +; EGPR-NEXT: adcq %r20, %r8 +; EGPR-NEXT: adcq %r25, %r18 +; EGPR-NEXT: adcq %r27, %rax +; EGPR-NEXT: adcq %r24, %rdx +; EGPR-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Folded Reload +; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r29 # 8-byte Folded Reload +; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r30 # 8-byte Folded Reload +; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Folded Reload +; EGPR-NEXT: adcq (%rsp), %r8 # 8-byte Folded Reload +; EGPR-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Folded Reload +; EGPR-NEXT: adcq %r12, %rax +; EGPR-NEXT: adcq %rbp, %rdx +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: movq %rsi, (%rcx) +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: movq %rsi, 8(%rcx) +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: movq %rsi, 16(%rcx) +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: movq %rsi, 24(%rcx) +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: movq %rsi, 32(%rcx) +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: movq %rsi, 40(%rcx) +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: movq %rsi, 48(%rcx) +; EGPR-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload +; EGPR-NEXT: movq %rsi, 56(%rcx) +; EGPR-NEXT: movq %r23, 64(%rcx) +; EGPR-NEXT: movq %r29, 72(%rcx) +; EGPR-NEXT: movq %r30, 80(%rcx) +; EGPR-NEXT: movq %r21, 88(%rcx) +; EGPR-NEXT: movq %r8, 96(%rcx) +; EGPR-NEXT: movq %r18, 104(%rcx) +; EGPR-NEXT: movq %rax, 112(%rcx) +; EGPR-NEXT: movq %rdx, 120(%rcx) +; EGPR-NEXT: addq $104, %rsp +; EGPR-NEXT: popq %rbx +; EGPR-NEXT: popq %r12 +; EGPR-NEXT: popq %r13 +; EGPR-NEXT: popq %r14 +; EGPR-NEXT: popq %r15 +; EGPR-NEXT: popq %rbp +; EGPR-NEXT: retq +; +; EGPR-NDD-LABEL: test_1024: +; EGPR-NDD: # %bb.0: +; EGPR-NDD-NEXT: pushq %rbp +; EGPR-NDD-NEXT: pushq %r15 +; EGPR-NDD-NEXT: pushq %r14 +; EGPR-NDD-NEXT: pushq %r13 +; EGPR-NDD-NEXT: pushq %r12 +; EGPR-NDD-NEXT: pushq %rbx +; EGPR-NDD-NEXT: subq $104, %rsp +; EGPR-NDD-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %rdi, %r20 +; EGPR-NDD-NEXT: movq (%rdi), %r16 +; EGPR-NDD-NEXT: movq 8(%rdi), %r14 +; EGPR-NDD-NEXT: movq 24(%rdi), %r9 +; EGPR-NDD-NEXT: movq 16(%rdi), %r10 +; EGPR-NDD-NEXT: movq 40(%rdi), %rdi +; EGPR-NDD-NEXT: movq 32(%r20), %r11 +; EGPR-NDD-NEXT: movq 56(%r20), %r17 +; EGPR-NDD-NEXT: movq 48(%r20), %r15 +; EGPR-NDD-NEXT: movq 24(%rsi), %r18 +; EGPR-NDD-NEXT: movq 16(%rsi), %r24 +; EGPR-NDD-NEXT: movq (%rsi), %r22 +; EGPR-NDD-NEXT: movq 8(%rsi), %r21 +; EGPR-NDD-NEXT: movq %rsi, %r23 +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: movq %rdx, %r25 +; EGPR-NDD-NEXT: movq %rax, %r19 +; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: addq %r25, %rax, %rcx +; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %rcx, %rax, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %rsi, %rcx +; EGPR-NDD-NEXT: 
setb %al +; EGPR-NDD-NEXT: movzbl %al, %esi +; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %rcx, %rax, %r27 +; EGPR-NDD-NEXT: adcq %rdx, %rsi +; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: movq %rdx, %r26 +; EGPR-NDD-NEXT: movq %rax, %r25 +; EGPR-NDD-NEXT: movq %rdi, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: addq %r26, %rax, %rcx +; EGPR-NDD-NEXT: adcq $0, %rdx, %r26 +; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %rax, %rcx +; EGPR-NDD-NEXT: adcq %rdx, %r26 +; EGPR-NDD-NEXT: setb %al +; EGPR-NDD-NEXT: movzbl %al, %r28d +; EGPR-NDD-NEXT: movq %rdi, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %r26, %rax +; EGPR-NDD-NEXT: adcq %r28, %rdx +; EGPR-NDD-NEXT: addq %rax, %r19, %r28 +; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: adcq $0, %r27 +; EGPR-NDD-NEXT: adcq $0, %rsi, %r29 +; EGPR-NDD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: mulq %r24 +; EGPR-NDD-NEXT: movq %rdx, %r19 +; EGPR-NDD-NEXT: movq %rax, %r26 +; EGPR-NDD-NEXT: movq %rdi, %rax +; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: mulq %r24 +; EGPR-NDD-NEXT: addq %r19, %rax, %rsi +; EGPR-NDD-NEXT: adcq $0, %rdx, %r19 +; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %rsi, %rax, %r30 +; EGPR-NDD-NEXT: adcq %rdx, %r19, %rsi +; EGPR-NDD-NEXT: setb %al +; EGPR-NDD-NEXT: movzbl %al, %r19d +; EGPR-NDD-NEXT: movq %rdi, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %rsi, %rax +; EGPR-NDD-NEXT: adcq %r19, %rdx +; EGPR-NDD-NEXT: addq %r28, %r26, %rsi +; EGPR-NDD-NEXT: adcq %r8, %r30, %r28 +; EGPR-NDD-NEXT: adcq $0, %rax +; EGPR-NDD-NEXT: adcq $0, %rdx +; EGPR-NDD-NEXT: addq %rax, %r27, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %r29, %r27 +; EGPR-NDD-NEXT: setb %al +; EGPR-NDD-NEXT: movzbl %al, %r31d +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %r24 +; EGPR-NDD-NEXT: movq %rdx, %r19 +; EGPR-NDD-NEXT: movq %rax, %r26 +; EGPR-NDD-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: mulq %r24 +; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r29 +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: adcq %rdx, %r29 +; EGPR-NDD-NEXT: setb %al +; EGPR-NDD-NEXT: movzbl %al, %r30d +; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %r29, %rax +; EGPR-NDD-NEXT: adcq %r30, %rdx +; EGPR-NDD-NEXT: addq %r8, %r26, %r29 +; EGPR-NDD-NEXT: adcq %r27, %r19, %r30 +; EGPR-NDD-NEXT: adcq %rax, %r31 +; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi +; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: movq %rdx, %r19 +; EGPR-NDD-NEXT: movq %rax, %r26 +; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: addq %r19, %rax, %r8 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r19 +; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %rax, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %r19 +; EGPR-NDD-NEXT: setb %al +; EGPR-NDD-NEXT: movzbl %al, %r27d +; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %rax, %r19 +; EGPR-NDD-NEXT: adcq %r27, %rdx, %rbx +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: movq %rdx, %r27 +; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r14, %rax +; 
EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: addq %rax, %r27 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r12 +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %r27, %rax +; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %rdx, %r12, %r27 +; EGPR-NDD-NEXT: setb %bpl +; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %r27, %rax +; EGPR-NDD-NEXT: movzbl %bpl, %r27d +; EGPR-NDD-NEXT: adcq %r27, %rdx +; EGPR-NDD-NEXT: addq %rax, %r26, %r12 +; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: adcq $0, %r19 +; EGPR-NDD-NEXT: adcq $0, %rbx +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: mulq %r24 +; EGPR-NDD-NEXT: movq %rdx, %r26 +; EGPR-NDD-NEXT: movq %rax, %r27 +; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: mulq %r24 +; EGPR-NDD-NEXT: addq %rax, %r26 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r13 +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %rax, %r26 +; EGPR-NDD-NEXT: adcq %rdx, %r13 +; EGPR-NDD-NEXT: setb %bpl +; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %r13, %rax +; EGPR-NDD-NEXT: movzbl %bpl, %r13d +; EGPR-NDD-NEXT: adcq %r13, %rdx +; EGPR-NDD-NEXT: addq %r12, %r27, %r11 +; EGPR-NDD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r26, %r8 +; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %rax +; EGPR-NDD-NEXT: adcq $0, %rdx +; EGPR-NDD-NEXT: addq %rax, %r19, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %rbx, %r19 +; EGPR-NDD-NEXT: setb %bl +; EGPR-NDD-NEXT: movq %r10, %r17 +; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: mulq %r24 +; EGPR-NDD-NEXT: movq %rdx, %r26 +; EGPR-NDD-NEXT: movq %rax, %r27 +; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: mulq %r24 +; EGPR-NDD-NEXT: addq %rax, %r26 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r12 +; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: movq %r18, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %rax, %r26 +; EGPR-NDD-NEXT: adcq %rdx, %r12 +; EGPR-NDD-NEXT: setb %bpl +; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %r12, %rax +; EGPR-NDD-NEXT: movzbl %bpl, %r12d +; EGPR-NDD-NEXT: adcq %r12, %rdx +; EGPR-NDD-NEXT: addq %r27, %r8 +; EGPR-NDD-NEXT: adcq %r26, %r19 +; EGPR-NDD-NEXT: movzbl %bl, %r26d +; EGPR-NDD-NEXT: adcq %r26, %rax +; EGPR-NDD-NEXT: adcq $0, %rdx +; EGPR-NDD-NEXT: addq %r8, %r25, %r12 +; EGPR-NDD-NEXT: movq 32(%r23), %r26 +; EGPR-NDD-NEXT: adcq %r19, %rcx, %r13 +; EGPR-NDD-NEXT: adcq %rax, %rsi, %rbp +; EGPR-NDD-NEXT: adcq %rdx, %r28, %rbx +; EGPR-NDD-NEXT: adcq $0, %r29, %rax +; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %r30 +; EGPR-NDD-NEXT: adcq $0, %r31 +; EGPR-NDD-NEXT: adcq $0, %rdi, %rax +; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: movq %rdx, %r25 +; EGPR-NDD-NEXT: movq %rax, %r27 +; EGPR-NDD-NEXT: movq %r9, %r19 +; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: addq %r25, %rax, %rcx +; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 +; EGPR-NDD-NEXT: movq 40(%r23), %r18 +; EGPR-NDD-NEXT: movq %r23, %r11 +; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %rcx, %rax, %rdi +; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: setb %r25b +; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: 
addq %r8, %rax, %r29 +; EGPR-NDD-NEXT: movzbl %r25b, %eax +; EGPR-NDD-NEXT: adcq %rax, %rdx, %rsi +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: movq %rdx, %r28 +; EGPR-NDD-NEXT: movq %rax, %r25 +; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: addq %r28, %rax, %r8 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r28 +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r16, %r10 +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %r8, %rax, %r23 +; EGPR-NDD-NEXT: adcq %rdx, %r28 +; EGPR-NDD-NEXT: setb %cl +; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: movq %r14, %r16 +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %r28, %rax +; EGPR-NDD-NEXT: movzbl %cl, %ecx +; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: addq %rax, %r27 +; EGPR-NDD-NEXT: adcq %rcx, %rdi +; EGPR-NDD-NEXT: adcq $0, %r29, %r8 +; EGPR-NDD-NEXT: adcq $0, %rsi, %r9 +; EGPR-NDD-NEXT: movq %r11, %r14 +; EGPR-NDD-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq 48(%r11), %r11 +; EGPR-NDD-NEXT: movq %r10, %rsi +; EGPR-NDD-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: movq %rdx, %r28 +; EGPR-NDD-NEXT: movq %rax, %r29 +; EGPR-NDD-NEXT: movq %r16, %rax +; EGPR-NDD-NEXT: movq %r16, %r10 +; EGPR-NDD-NEXT: movq %r16, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: addq %rax, %r28 +; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx +; EGPR-NDD-NEXT: movq 56(%r14), %r16 +; EGPR-NDD-NEXT: movq %rsi, %rax +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %rax, %r28 +; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: setb %sil +; EGPR-NDD-NEXT: movq %r10, %rax +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %rcx, %rax +; EGPR-NDD-NEXT: movzbl %sil, %ecx +; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: addq %r27, %r29, %r10 +; EGPR-NDD-NEXT: adcq %r28, %rdi +; EGPR-NDD-NEXT: adcq $0, %rax +; EGPR-NDD-NEXT: adcq $0, %rcx +; EGPR-NDD-NEXT: addq %rax, %r8 +; EGPR-NDD-NEXT: adcq %rcx, %r9, %rsi +; EGPR-NDD-NEXT: setb %r9b +; EGPR-NDD-NEXT: movq %r17, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: movq %rdx, %r28 +; EGPR-NDD-NEXT: movq %rax, %r29 +; EGPR-NDD-NEXT: movq %r19, %rax +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: addq %r28, %rax, %r27 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r28 +; EGPR-NDD-NEXT: movq %r17, %rax +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %rax, %r27 +; EGPR-NDD-NEXT: adcq %rdx, %r28 +; EGPR-NDD-NEXT: setb %cl +; EGPR-NDD-NEXT: movq %r19, %rax +; EGPR-NDD-NEXT: movq %r19, %r17 +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %r28, %rax +; EGPR-NDD-NEXT: movzbl %cl, %ecx +; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: addq %r8, %r29, %rdx +; EGPR-NDD-NEXT: adcq %r27, %rsi +; EGPR-NDD-NEXT: movzbl %r9b, %r8d +; EGPR-NDD-NEXT: adcq %r8, %rax +; EGPR-NDD-NEXT: adcq $0, %rcx +; EGPR-NDD-NEXT: addq %r12, %r25, %r8 +; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r13, %r23, %r8 +; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %rbp, %r10, %r8 +; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %rbx, %rdi +; EGPR-NDD-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %rdx +; EGPR-NDD-NEXT: adcq $0, %rsi +; EGPR-NDD-NEXT: adcq $0, %rax +; EGPR-NDD-NEXT: adcq $0, %rcx, %rdi +; 
EGPR-NDD-NEXT: addq %rdx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %rsi, %r30, %r19 +; EGPR-NDD-NEXT: adcq %rax, %r31, %r30 +; EGPR-NDD-NEXT: adcq %rdi, {{[-0-9]+}}(%r{{[sb]}}p), %r31 # 8-byte Folded Reload +; EGPR-NDD-NEXT: setb %bpl +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: movq %rdx, %r25 +; EGPR-NDD-NEXT: movq %rax, %r28 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: addq %r25, %rax, %rsi +; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: movq %r15, %r13 +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %rax, %rsi +; EGPR-NDD-NEXT: adcq %rdx, %rdi +; EGPR-NDD-NEXT: setb %r8b +; EGPR-NDD-NEXT: movq %r9, %rax +; EGPR-NDD-NEXT: movq %r9, %r23 +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %rax, %rdi +; EGPR-NDD-NEXT: movzbl %r8b, %eax +; EGPR-NDD-NEXT: adcq %rax, %rdx, %r8 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: movq %rdx, %r29 +; EGPR-NDD-NEXT: movq %rax, %r25 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: addq %r29, %rax, %r9 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r10 +; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %r9, %rax, %rbx +; EGPR-NDD-NEXT: adcq %rdx, %r10, %r9 +; EGPR-NDD-NEXT: setb %r10b +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %r9, %rax +; EGPR-NDD-NEXT: movzbl %r10b, %r9d +; EGPR-NDD-NEXT: adcq %r9, %rdx +; EGPR-NDD-NEXT: addq %rax, %r28, %r9 +; EGPR-NDD-NEXT: adcq %rdx, %rsi +; EGPR-NDD-NEXT: adcq $0, %rdi +; EGPR-NDD-NEXT: adcq $0, %r8 +; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: movq %rdx, %r28 +; EGPR-NDD-NEXT: movq %rax, %r29 +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: addq %r28, %rax, %r10 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r27 +; EGPR-NDD-NEXT: movq %r14, %rax +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %rax, %r10 +; EGPR-NDD-NEXT: adcq %rdx, %r27 +; EGPR-NDD-NEXT: setb %r28b +; EGPR-NDD-NEXT: movq %r15, %rax +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %r27, %rax +; EGPR-NDD-NEXT: movzbl %r28b, %r27d +; EGPR-NDD-NEXT: adcq %r27, %rdx +; EGPR-NDD-NEXT: addq %r29, %r9 +; EGPR-NDD-NEXT: adcq %r10, %rsi +; EGPR-NDD-NEXT: adcq $0, %rax +; EGPR-NDD-NEXT: adcq $0, %rdx +; EGPR-NDD-NEXT: addq %rax, %rdi +; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: setb %r10b +; EGPR-NDD-NEXT: movq %r13, %rax +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: movq %rdx, %r28 +; EGPR-NDD-NEXT: movq %rax, %r29 +; EGPR-NDD-NEXT: movq %r23, %r14 +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r11 +; EGPR-NDD-NEXT: addq %r28, %rax, %r27 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r28 +; EGPR-NDD-NEXT: movq %r13, %rax +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %rax, %r27 +; EGPR-NDD-NEXT: adcq %rdx, %r28 +; EGPR-NDD-NEXT: setb %r15b +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %r28, %rax +; EGPR-NDD-NEXT: movzbl %r15b, %r28d +; EGPR-NDD-NEXT: adcq %r28, %rdx +; EGPR-NDD-NEXT: addq %r29, %rdi +; EGPR-NDD-NEXT: adcq %r27, %r8 +; EGPR-NDD-NEXT: movzbl %r10b, %r10d +; EGPR-NDD-NEXT: adcq %r10, %rax +; EGPR-NDD-NEXT: adcq $0, %rdx +; EGPR-NDD-NEXT: addq %r25, %rcx +; 
EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r19, %rbx, %rcx +; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r30, %r9, %rcx +; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r31, %rsi, %rcx +; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movzbl %bpl, %ecx +; EGPR-NDD-NEXT: adcq %rdi, %rcx +; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %r8, %rcx +; EGPR-NDD-NEXT: movq %rcx, (%rsp) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %rax +; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %rdx, %rax +; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq 64(%r20), %r28 +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: mulq %r28 +; EGPR-NDD-NEXT: movq %rdx, %r25 +; EGPR-NDD-NEXT: movq %rax, %r30 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r23 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r28 +; EGPR-NDD-NEXT: addq %r25, %rax, %rcx +; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi +; EGPR-NDD-NEXT: movq 72(%r20), %r29 +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: mulq %r29 +; EGPR-NDD-NEXT: addq %rax, %rcx +; EGPR-NDD-NEXT: adcq %rdx, %rsi +; EGPR-NDD-NEXT: setb %dil +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r29 +; EGPR-NDD-NEXT: addq %rax, %rsi +; EGPR-NDD-NEXT: movzbl %dil, %eax +; EGPR-NDD-NEXT: adcq %rax, %rdx, %rdi +; EGPR-NDD-NEXT: movq %r22, %rax +; EGPR-NDD-NEXT: mulq %r28 +; EGPR-NDD-NEXT: movq %rdx, %r31 +; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: movq %r21, %rax +; EGPR-NDD-NEXT: mulq %r28 +; EGPR-NDD-NEXT: addq %r31, %rax, %r8 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 +; EGPR-NDD-NEXT: movq %r22, %rax +; EGPR-NDD-NEXT: mulq %r29 +; EGPR-NDD-NEXT: addq %r8, %rax +; EGPR-NDD-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %rdx, %r9, %r8 +; EGPR-NDD-NEXT: setb %r9b +; EGPR-NDD-NEXT: movq %r21, %rax +; EGPR-NDD-NEXT: mulq %r29 +; EGPR-NDD-NEXT: addq %r8, %rax +; EGPR-NDD-NEXT: movzbl %r9b, %r8d +; EGPR-NDD-NEXT: adcq %r8, %rdx +; EGPR-NDD-NEXT: addq %rax, %r30, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: adcq $0, %rsi +; EGPR-NDD-NEXT: adcq $0, %rdi +; EGPR-NDD-NEXT: movq 80(%r20), %rbx +; EGPR-NDD-NEXT: movq %r22, %rax +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: movq %rdx, %r30 +; EGPR-NDD-NEXT: movq %rax, %r31 +; EGPR-NDD-NEXT: movq %r21, %rax +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: addq %r30, %rax, %r9 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r10 +; EGPR-NDD-NEXT: movq 88(%r20), %r15 +; EGPR-NDD-NEXT: movq %r22, %rax +; EGPR-NDD-NEXT: mulq %r15 +; EGPR-NDD-NEXT: addq %rax, %r9 +; EGPR-NDD-NEXT: adcq %rdx, %r10 +; EGPR-NDD-NEXT: setb %r19b +; EGPR-NDD-NEXT: movq %r21, %rax +; EGPR-NDD-NEXT: mulq %r15 +; EGPR-NDD-NEXT: addq %r10, %rax +; EGPR-NDD-NEXT: movzbl %r19b, %r10d +; EGPR-NDD-NEXT: adcq %r10, %rdx +; EGPR-NDD-NEXT: addq %r31, %r8 +; EGPR-NDD-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq %r9, %rcx +; EGPR-NDD-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; EGPR-NDD-NEXT: adcq $0, %rax +; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx +; EGPR-NDD-NEXT: addq %rax, %rsi +; EGPR-NDD-NEXT: adcq %rdi, %rcx +; EGPR-NDD-NEXT: setb %dil +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: movq %rdx, %r30 +; 
EGPR-NDD-NEXT: movq %rax, %r31 +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: addq %r30, %rax, %r8 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: mulq %r15 +; EGPR-NDD-NEXT: addq %rax, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %r9 +; EGPR-NDD-NEXT: setb %r10b +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r15 +; EGPR-NDD-NEXT: addq %r9, %rax +; EGPR-NDD-NEXT: movzbl %r10b, %r9d +; EGPR-NDD-NEXT: adcq %r9, %rdx +; EGPR-NDD-NEXT: addq %rsi, %r31, %r25 +; EGPR-NDD-NEXT: adcq %rcx, %r8, %r19 +; EGPR-NDD-NEXT: movzbl %dil, %ecx +; EGPR-NDD-NEXT: adcq %rcx, %rax, %r31 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r12 +; EGPR-NDD-NEXT: imulq %r15, %r26, %rcx +; EGPR-NDD-NEXT: movq %r26, %rax +; EGPR-NDD-NEXT: mulq %rbx +; EGPR-NDD-NEXT: movq %rax, %r30 +; EGPR-NDD-NEXT: addq %rcx, %rdx, %rax +; EGPR-NDD-NEXT: imulq %rbx, %r18, %rcx +; EGPR-NDD-NEXT: addq %rax, %rcx +; EGPR-NDD-NEXT: imulq %r29, %r11, %rsi +; EGPR-NDD-NEXT: movq %r11, %rax +; EGPR-NDD-NEXT: mulq %r28 +; EGPR-NDD-NEXT: addq %rsi, %rdx +; EGPR-NDD-NEXT: imulq %r28, %r16, %rsi +; EGPR-NDD-NEXT: addq %rsi, %rdx +; EGPR-NDD-NEXT: addq %r30, %rax, %rsi +; EGPR-NDD-NEXT: adcq %rcx, %rdx, %rdi +; EGPR-NDD-NEXT: movq %r28, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: movq %rdx, %r30 +; EGPR-NDD-NEXT: movq %rax, %r27 +; EGPR-NDD-NEXT: movq %r29, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: addq %r30, %rax, %rcx +; EGPR-NDD-NEXT: adcq $0, %rdx, %r8 +; EGPR-NDD-NEXT: movq %r28, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %rax, %rcx +; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: setb %r9b +; EGPR-NDD-NEXT: movq %r29, %rax +; EGPR-NDD-NEXT: mulq %r18 +; EGPR-NDD-NEXT: addq %r8, %rax +; EGPR-NDD-NEXT: movzbl %r9b, %r8d +; EGPR-NDD-NEXT: adcq %r8, %rdx +; EGPR-NDD-NEXT: addq %rax, %rsi +; EGPR-NDD-NEXT: adcq %rdi, %rdx, %r29 +; EGPR-NDD-NEXT: movq 112(%r20), %rdi +; EGPR-NDD-NEXT: movq %r22, %rax +; EGPR-NDD-NEXT: mulq %rdi +; EGPR-NDD-NEXT: movq %rax, %r26 +; EGPR-NDD-NEXT: imulq 120(%r20), %r22, %rax +; EGPR-NDD-NEXT: addq %rdx, %rax +; EGPR-NDD-NEXT: imulq %rdi, %r21, %rdx +; EGPR-NDD-NEXT: addq %rdx, %rax, %r8 +; EGPR-NDD-NEXT: movq 96(%r20), %r28 +; EGPR-NDD-NEXT: movq 104(%r20), %rdi +; EGPR-NDD-NEXT: imulq %rdi, %r24, %r9 +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: mulq %r28 +; EGPR-NDD-NEXT: addq %r9, %rdx +; EGPR-NDD-NEXT: imulq %r28, %r23, %r9 +; EGPR-NDD-NEXT: addq %r9, %rdx +; EGPR-NDD-NEXT: addq %r26, %rax, %r9 +; EGPR-NDD-NEXT: adcq %rdx, %r8 +; EGPR-NDD-NEXT: movq %r28, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: movq %rdx, %r23 +; EGPR-NDD-NEXT: movq %rax, %r24 +; EGPR-NDD-NEXT: movq %rdi, %rax +; EGPR-NDD-NEXT: mulq %r22 +; EGPR-NDD-NEXT: addq %r23, %rax, %r10 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r11 +; EGPR-NDD-NEXT: movq %r28, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %rax, %r10 +; EGPR-NDD-NEXT: adcq %rdx, %r11 +; EGPR-NDD-NEXT: setb %r16b +; EGPR-NDD-NEXT: movq %rdi, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %r11, %rax +; EGPR-NDD-NEXT: movzbl %r16b, %edi +; EGPR-NDD-NEXT: adcq %rdi, %rdx +; EGPR-NDD-NEXT: addq %r9, %rax +; EGPR-NDD-NEXT: adcq %r8, %rdx +; EGPR-NDD-NEXT: addq %r27, %r24, %rdi +; EGPR-NDD-NEXT: adcq %r10, %rcx +; EGPR-NDD-NEXT: adcq %rsi, %rax +; EGPR-NDD-NEXT: adcq %r29, %rdx +; EGPR-NDD-NEXT: addq %rdi, %r25, %r15 +; EGPR-NDD-NEXT: adcq %rcx, %r19, %rbx +; EGPR-NDD-NEXT: adcq %rax, %r31, %rbp +; EGPR-NDD-NEXT: adcq %rdx, %r12, %r30 +; EGPR-NDD-NEXT: movq 
{{[-0-9]+}}(%r{{[sb]}}p), %r18 # 8-byte Reload +; EGPR-NDD-NEXT: movq 80(%r18), %r22 +; EGPR-NDD-NEXT: movq %r22, %rax +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: movq %rax, %r26 +; EGPR-NDD-NEXT: movq %rdx, %rdi +; EGPR-NDD-NEXT: movq 88(%r18), %r20 +; EGPR-NDD-NEXT: movq %r20, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %rdi, %rax, %rcx +; EGPR-NDD-NEXT: adcq $0, %rdx, %rsi +; EGPR-NDD-NEXT: movq %r22, %rax +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r12 +; EGPR-NDD-NEXT: addq %rax, %rcx +; EGPR-NDD-NEXT: adcq %rdx, %rsi +; EGPR-NDD-NEXT: setb %dil +; EGPR-NDD-NEXT: movq %r20, %rax +; EGPR-NDD-NEXT: mulq %r12 +; EGPR-NDD-NEXT: addq %rax, %rsi +; EGPR-NDD-NEXT: movzbl %dil, %eax +; EGPR-NDD-NEXT: adcq %rax, %rdx, %rdi +; EGPR-NDD-NEXT: movq 64(%r18), %r24 +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: movq %rax, %r29 +; EGPR-NDD-NEXT: movq %rdx, %r27 +; EGPR-NDD-NEXT: movq 72(%r18), %r23 +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %r27, %rax, %r8 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: mulq %r12 +; EGPR-NDD-NEXT: addq %r8, %rax, %r31 +; EGPR-NDD-NEXT: adcq %rdx, %r9, %r8 +; EGPR-NDD-NEXT: setb %r9b +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r12 +; EGPR-NDD-NEXT: addq %r8, %rax +; EGPR-NDD-NEXT: movzbl %r9b, %r8d +; EGPR-NDD-NEXT: adcq %r8, %rdx +; EGPR-NDD-NEXT: addq %rax, %r26, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %rcx +; EGPR-NDD-NEXT: adcq $0, %rsi +; EGPR-NDD-NEXT: adcq $0, %rdi +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: movq %rdx, %r26 +; EGPR-NDD-NEXT: movq %rax, %r27 +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %r26, %rax, %r9 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r10 +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: addq %rax, %r9 +; EGPR-NDD-NEXT: adcq %rdx, %r10 +; EGPR-NDD-NEXT: setb %r11b +; EGPR-NDD-NEXT: movq %r23, %rax +; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: addq %r10, %rax +; EGPR-NDD-NEXT: movzbl %r11b, %r10d +; EGPR-NDD-NEXT: adcq %r10, %rdx +; EGPR-NDD-NEXT: addq %r8, %r27, %r28 +; EGPR-NDD-NEXT: adcq %rcx, %r9, %r25 +; EGPR-NDD-NEXT: adcq $0, %rax +; EGPR-NDD-NEXT: adcq $0, %rdx, %rcx +; EGPR-NDD-NEXT: addq %rax, %rsi +; EGPR-NDD-NEXT: adcq %rdi, %rcx +; EGPR-NDD-NEXT: setb %dil +; EGPR-NDD-NEXT: movq %r22, %rax +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: movq %rdx, %r26 +; EGPR-NDD-NEXT: movq %rax, %r27 +; EGPR-NDD-NEXT: movq %r20, %rax +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: addq %r26, %rax, %r8 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r9 +; EGPR-NDD-NEXT: movq %r22, %rax +; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: addq %rax, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %r9 +; EGPR-NDD-NEXT: setb %r10b +; EGPR-NDD-NEXT: movq %r20, %rax +; EGPR-NDD-NEXT: mulq %r17 +; EGPR-NDD-NEXT: addq %r9, %rax +; EGPR-NDD-NEXT: movzbl %r10b, %r9d +; EGPR-NDD-NEXT: adcq %r9, %rdx +; EGPR-NDD-NEXT: addq %rsi, %r27 +; EGPR-NDD-NEXT: adcq %rcx, %r8, %r19 +; EGPR-NDD-NEXT: movzbl %dil, %ecx +; EGPR-NDD-NEXT: adcq %rax, %rcx +; EGPR-NDD-NEXT: adcq $0, %rdx, %rdi +; EGPR-NDD-NEXT: movq %r18, %r9 +; EGPR-NDD-NEXT: movq 96(%r18), %r26 +; EGPR-NDD-NEXT: imulq %r17, %r26, %rsi +; EGPR-NDD-NEXT: movq %r26, %rax +; EGPR-NDD-NEXT: mulq %r16 +; EGPR-NDD-NEXT: movq 
%rax, %r18 +; EGPR-NDD-NEXT: addq %rsi, %rdx, %rax +; EGPR-NDD-NEXT: movq 104(%r9), %r8 +; EGPR-NDD-NEXT: imulq %r16, %r8, %rdx +; EGPR-NDD-NEXT: addq %rdx, %rax, %rsi +; EGPR-NDD-NEXT: movq 112(%r9), %rax +; EGPR-NDD-NEXT: movq %r9, %r11 +; EGPR-NDD-NEXT: imulq %r12, %rax, %r9 +; EGPR-NDD-NEXT: mulq %r21 +; EGPR-NDD-NEXT: addq %r9, %rdx +; EGPR-NDD-NEXT: imulq 120(%r11), %r21, %r9 +; EGPR-NDD-NEXT: addq %r9, %rdx +; EGPR-NDD-NEXT: addq %r18, %rax, %r9 +; EGPR-NDD-NEXT: adcq %rsi, %rdx, %r16 +; EGPR-NDD-NEXT: movq %r21, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: movq %rdx, %r17 +; EGPR-NDD-NEXT: movq %rax, %rsi +; EGPR-NDD-NEXT: movq %r12, %rax +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: addq %r17, %rax, %r10 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r17 +; EGPR-NDD-NEXT: movq %r21, %rax +; EGPR-NDD-NEXT: mulq %r8 +; EGPR-NDD-NEXT: addq %r10, %rax, %r11 +; EGPR-NDD-NEXT: adcq %rdx, %r17, %r10 +; EGPR-NDD-NEXT: setb %r17b +; EGPR-NDD-NEXT: movq %r12, %rax +; EGPR-NDD-NEXT: mulq %r8 +; EGPR-NDD-NEXT: addq %r10, %rax +; EGPR-NDD-NEXT: movzbl %r17b, %r8d +; EGPR-NDD-NEXT: adcq %r8, %rdx +; EGPR-NDD-NEXT: addq %r9, %rax, %r10 +; EGPR-NDD-NEXT: adcq %r16, %rdx, %r17 +; EGPR-NDD-NEXT: imulq %r14, %r24, %r8 +; EGPR-NDD-NEXT: movq %r24, %rax +; EGPR-NDD-NEXT: mulq %r13 +; EGPR-NDD-NEXT: movq %rax, %r9 +; EGPR-NDD-NEXT: addq %r8, %rdx, %rax +; EGPR-NDD-NEXT: imulq %r13, %r23, %rdx +; EGPR-NDD-NEXT: addq %rdx, %rax, %r8 +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r21 # 8-byte Reload +; EGPR-NDD-NEXT: imulq %r21, %r22, %r16 +; EGPR-NDD-NEXT: movq %r22, %rax +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r26 # 8-byte Reload +; EGPR-NDD-NEXT: mulq %r26 +; EGPR-NDD-NEXT: addq %r16, %rdx +; EGPR-NDD-NEXT: imulq %r26, %r20, %r16 +; EGPR-NDD-NEXT: addq %r16, %rdx +; EGPR-NDD-NEXT: addq %r9, %rax, %r16 +; EGPR-NDD-NEXT: adcq %r8, %rdx, %r18 +; EGPR-NDD-NEXT: movq %r26, %rax +; EGPR-NDD-NEXT: mulq %r24 +; EGPR-NDD-NEXT: movq %rdx, %r8 +; EGPR-NDD-NEXT: movq %rax, %r9 +; EGPR-NDD-NEXT: movq %r21, %rax +; EGPR-NDD-NEXT: movq %r21, %r22 +; EGPR-NDD-NEXT: mulq %r24 +; EGPR-NDD-NEXT: addq %rax, %r8 +; EGPR-NDD-NEXT: adcq $0, %rdx, %r20 +; EGPR-NDD-NEXT: movq %r26, %rax +; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: addq %rax, %r8 +; EGPR-NDD-NEXT: adcq %rdx, %r20 +; EGPR-NDD-NEXT: setb %r21b +; EGPR-NDD-NEXT: movq %r22, %rax +; EGPR-NDD-NEXT: mulq %r23 +; EGPR-NDD-NEXT: addq %r20, %rax +; EGPR-NDD-NEXT: movzbl %r21b, %r20d +; EGPR-NDD-NEXT: adcq %r20, %rdx +; EGPR-NDD-NEXT: addq %r16, %rax +; EGPR-NDD-NEXT: adcq %r18, %rdx +; EGPR-NDD-NEXT: addq %r9, %rsi +; EGPR-NDD-NEXT: adcq %r11, %r8 +; EGPR-NDD-NEXT: adcq %r10, %rax +; EGPR-NDD-NEXT: adcq %r17, %rdx +; EGPR-NDD-NEXT: addq %r27, %rsi +; EGPR-NDD-NEXT: adcq %r19, %r8 +; EGPR-NDD-NEXT: adcq %rcx, %rax +; EGPR-NDD-NEXT: adcq %rdx, %rdi, %rcx +; EGPR-NDD-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r29, %rdx # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r31, %rdi # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r28, %r9 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r25, %r10 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r15, %rsi +; EGPR-NDD-NEXT: adcq %rbx, %r8 +; EGPR-NDD-NEXT: adcq %rbp, %rax +; EGPR-NDD-NEXT: adcq %r30, %rcx +; EGPR-NDD-NEXT: addq %rdx, {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %rdi, {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r9, {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload +; 
EGPR-NDD-NEXT: adcq %r10, {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %rsi, {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %r8, (%rsp), %r8 # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %rax, {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; EGPR-NDD-NEXT: adcq %rcx, {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r16, (%r11) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r16, 8(%r11) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r16, 16(%r11) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r16, 24(%r11) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r16, 32(%r11) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r16, 40(%r11) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r16, 48(%r11) +; EGPR-NDD-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r16 # 8-byte Reload +; EGPR-NDD-NEXT: movq %r16, 56(%r11) +; EGPR-NDD-NEXT: movq %rdx, 64(%r11) +; EGPR-NDD-NEXT: movq %rdi, 72(%r11) +; EGPR-NDD-NEXT: movq %r9, 80(%r11) +; EGPR-NDD-NEXT: movq %r10, 88(%r11) +; EGPR-NDD-NEXT: movq %rsi, 96(%r11) +; EGPR-NDD-NEXT: movq %r8, 104(%r11) +; EGPR-NDD-NEXT: movq %rax, 112(%r11) +; EGPR-NDD-NEXT: movq %rcx, 120(%r11) +; EGPR-NDD-NEXT: addq $104, %rsp +; EGPR-NDD-NEXT: popq %rbx +; EGPR-NDD-NEXT: popq %r12 +; EGPR-NDD-NEXT: popq %r13 +; EGPR-NDD-NEXT: popq %r14 +; EGPR-NDD-NEXT: popq %r15 +; EGPR-NDD-NEXT: popq %rbp +; EGPR-NDD-NEXT: retq %av = load i1024, ptr %a %bv = load i1024, ptr %b %r = mul i1024 %av, %bv From 5355857038cf6f9e3831504804e973508ffc2d01 Mon Sep 17 00:00:00 2001 From: cor3ntin Date: Wed, 24 Jan 2024 07:45:50 +0100 Subject: [PATCH 742/843] [Clang][NFC] Optimize isAsciiIdentifierContinue (#78699) Precompute the isAsciiIdentifierContinue table which is on the hot path. 
https://llvm-compile-time-tracker.com/compare.php?from=30da0f5a359ab4a684c5fdf0f4dbed20bae10f99&to=cb0e48db2b8193d2ee59c2a6e998317cb220d513&stat=instructions:u --- clang/include/clang/Basic/CharInfo.h | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/clang/include/clang/Basic/CharInfo.h b/clang/include/clang/Basic/CharInfo.h index 7d41193835089..d807955311828 100644 --- a/clang/include/clang/Basic/CharInfo.h +++ b/clang/include/clang/Basic/CharInfo.h @@ -59,12 +59,28 @@ LLVM_READONLY inline bool isAsciiIdentifierStart(unsigned char c, return AllowDollar && c == '$'; } +LLVM_READONLY inline bool isAsciiIdentifierContinue(unsigned char c) { + // Precomputed CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER + static constexpr unsigned char IDContinue[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + return IDContinue[c]; +} + /// Returns true if this is a body character of a C identifier, /// which is [a-zA-Z0-9_]. LLVM_READONLY inline bool isAsciiIdentifierContinue(unsigned char c, - bool AllowDollar = false) { - using namespace charinfo; - if (InfoTable[c] & (CHAR_UPPER|CHAR_LOWER|CHAR_DIGIT|CHAR_UNDER)) + bool AllowDollar) { + if (isAsciiIdentifierContinue(c)) return true; return AllowDollar && c == '$'; } From 7a6c2628e99f70edf6ee46a6b4b4d3d7301353c6 Mon Sep 17 00:00:00 2001 From: martinboehme Date: Wed, 24 Jan 2024 08:06:32 +0100 Subject: [PATCH 743/843] [clang][dataflow] Eliminate two uses of `RecordValue::getLoc()`. (#79163) This is a small step towards eventually eliminating `RecordValue` entirely. 
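The shape of the change, in isolation: a hypothetical helper pair, not code from this patch, assuming the dataflow `Environment` API in which `createValue()` returns a `Value *` and `createObject()` returns a `StorageLocation &`.

#include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
#include "clang/Analysis/FlowSensitive/StorageLocation.h"
#include "clang/Analysis/FlowSensitive/Value.h"
#include "llvm/Support/Casting.h"

using namespace clang;
using namespace clang::dataflow;

// Old pattern: materialize a RecordValue just to reach its backing location.
static RecordStorageLocation &recordLocOld(Environment &Env, QualType Ty) {
  return llvm::cast<RecordValue>(Env.createValue(Ty))->getLoc();
}

// New pattern: create the object directly; for a record type the returned
// StorageLocation already is the RecordStorageLocation we want, so no
// RecordValue needs to exist at all.
static RecordStorageLocation &recordLocNew(Environment &Env, QualType Ty) {
  return llvm::cast<RecordStorageLocation>(Env.createObject(Ty));
}

Each call site converted this way is one fewer place that depends on `RecordValue` existing, which is what makes the patch a step toward removing the class.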
--- clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
index 196a1360a7750..acb38e5764745 100644
--- a/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
+++ b/clang/lib/Analysis/FlowSensitive/DataflowEnvironment.cpp
@@ -412,13 +412,13 @@ void Environment::initialize() {
         QualType ThisPointeeType =
             SurroundingMethodDecl->getFunctionObjectParameterType();
         setThisPointeeStorageLocation(
-            cast<RecordValue>(createValue(ThisPointeeType))->getLoc());
+            cast<RecordStorageLocation>(createObject(ThisPointeeType)));
       }
     }
   } else if (MethodDecl->isImplicitObjectMemberFunction()) {
     QualType ThisPointeeType = MethodDecl->getFunctionObjectParameterType();
     setThisPointeeStorageLocation(
-        cast<RecordValue>(createValue(ThisPointeeType))->getLoc());
+        cast<RecordStorageLocation>(createObject(ThisPointeeType)));
   }
 }

From 78b00c116be8b3b53ff13552e31eb305b11cb169 Mon Sep 17 00:00:00 2001
From: Pavel Labath
Date: Wed, 24 Jan 2024 07:05:37 +0000
Subject: [PATCH 744/843] Revert "[lldb] Improve maintainability and
 readability for ValueObject methods (#75865)"

This reverts commit d657519838e4b2310e13ec5ff52599e041860825 as it breaks
two dozen tests. The breakages are related to variable path expression
parsing and summary string parsing (possibly the same code).
---
 lldb/source/Core/ValueObject.cpp | 330 ++++++++++++++++---------------
 1 file changed, 166 insertions(+), 164 deletions(-)

diff --git a/lldb/source/Core/ValueObject.cpp b/lldb/source/Core/ValueObject.cpp
index d58bf2ca763d9..9208047be3666 100644
--- a/lldb/source/Core/ValueObject.cpp
+++ b/lldb/source/Core/ValueObject.cpp
@@ -1582,64 +1582,62 @@ bool ValueObject::IsUninitializedReference() {
 
 ValueObjectSP ValueObject::GetSyntheticArrayMember(size_t index,
                                                    bool can_create) {
-  if (!IsPointerType() && !IsArrayType())
-    return ValueObjectSP();
-
-  std::string index_str = llvm::formatv("[{0}]", index);
-  ConstString index_const_str(index_str);
-  // Check if we have already created a synthetic array member in this valid
-  // object. If we have we will re-use it.
-  if (auto existing_synthetic_child = GetSyntheticChild(index_const_str))
-    return existing_synthetic_child;
-
-  // We haven't made a synthetic array member for INDEX yet, so lets make
-  // one and cache it for any future reference.
-  ValueObject *synthetic_child = CreateChildAtIndex(0, true, index);
-
-  if (!synthetic_child)
-    return ValueObjectSP();
-
-  // Cache the synthetic child's value because it's valid.
-  AddSyntheticChild(index_const_str, synthetic_child);
-  auto synthetic_child_sp = synthetic_child->GetSP();
-  synthetic_child_sp->SetName(ConstString(index_str));
-  synthetic_child_sp->m_flags.m_is_array_item_for_pointer = true;
+  ValueObjectSP synthetic_child_sp;
+  if (IsPointerType() || IsArrayType()) {
+    std::string index_str = llvm::formatv("[{0}]", index);
+    ConstString index_const_str(index_str);
+    // Check if we have already created a synthetic array member in this valid
+    // object. If we have we will re-use it.
+    synthetic_child_sp = GetSyntheticChild(index_const_str);
+    if (!synthetic_child_sp) {
+      ValueObject *synthetic_child;
+      // We haven't made a synthetic array member for INDEX yet, so lets make
+      // one and cache it for any future reference.
+      synthetic_child = CreateChildAtIndex(0, true, index);
+
+      // Cache the value if we got one back...
+ if (synthetic_child) { + AddSyntheticChild(index_const_str, synthetic_child); + synthetic_child_sp = synthetic_child->GetSP(); + synthetic_child_sp->SetName(ConstString(index_str)); + synthetic_child_sp->m_flags.m_is_array_item_for_pointer = true; + } + } + } return synthetic_child_sp; } ValueObjectSP ValueObject::GetSyntheticBitFieldChild(uint32_t from, uint32_t to, bool can_create) { - if (!IsScalarType()) - return ValueObjectSP(); - - std::string index_str = llvm::formatv("[{0}-{1}]", from, to); - ConstString index_const_str(index_str); - - // Check if we have already created a synthetic array member in this valid - // object. If we have we will re-use it. - if (auto existing_synthetic_child = GetSyntheticChild(index_const_str)) - return existing_synthetic_child; - - uint32_t bit_field_size = to - from + 1; - uint32_t bit_field_offset = from; - if (GetDataExtractor().GetByteOrder() == eByteOrderBig) - bit_field_offset = - GetByteSize().value_or(0) * 8 - bit_field_size - bit_field_offset; - - // We haven't made a synthetic array member for INDEX yet, so lets make - // one and cache it for any future reference. - ValueObjectChild *synthetic_child = new ValueObjectChild( - *this, GetCompilerType(), index_const_str, GetByteSize().value_or(0), 0, - bit_field_size, bit_field_offset, false, false, eAddressTypeInvalid, 0); - - if (!synthetic_child) - return ValueObjectSP(); - - // Cache the synthetic child's value because it's valid. - AddSyntheticChild(index_const_str, synthetic_child); - auto synthetic_child_sp = synthetic_child->GetSP(); - synthetic_child_sp->SetName(ConstString(index_str)); - synthetic_child_sp->m_flags.m_is_bitfield_for_scalar = true; + ValueObjectSP synthetic_child_sp; + if (IsScalarType()) { + std::string index_str = llvm::formatv("[{0}-{1}]", from, to); + ConstString index_const_str(index_str); + // Check if we have already created a synthetic array member in this valid + // object. If we have we will re-use it. + synthetic_child_sp = GetSyntheticChild(index_const_str); + if (!synthetic_child_sp) { + uint32_t bit_field_size = to - from + 1; + uint32_t bit_field_offset = from; + if (GetDataExtractor().GetByteOrder() == eByteOrderBig) + bit_field_offset = + GetByteSize().value_or(0) * 8 - bit_field_size - bit_field_offset; + // We haven't made a synthetic array member for INDEX yet, so lets make + // one and cache it for any future reference. + ValueObjectChild *synthetic_child = new ValueObjectChild( + *this, GetCompilerType(), index_const_str, GetByteSize().value_or(0), + 0, bit_field_size, bit_field_offset, false, false, + eAddressTypeInvalid, 0); + + // Cache the value if we got one back... + if (synthetic_child) { + AddSyntheticChild(index_const_str, synthetic_child); + synthetic_child_sp = synthetic_child->GetSP(); + synthetic_child_sp->SetName(ConstString(index_str)); + synthetic_child_sp->m_flags.m_is_bitfield_for_scalar = true; + } + } + } return synthetic_child_sp; } @@ -1649,8 +1647,9 @@ ValueObjectSP ValueObject::GetSyntheticChildAtOffset( ValueObjectSP synthetic_child_sp; - if (name_const_str.IsEmpty()) + if (name_const_str.IsEmpty()) { name_const_str.SetString("@" + std::to_string(offset)); + } // Check if we have already created a synthetic array member in this valid // object. If we have we will re-use it. 
@@ -1660,13 +1659,13 @@ ValueObjectSP ValueObject::GetSyntheticChildAtOffset(
     return synthetic_child_sp;
 
   if (!can_create)
-    return ValueObjectSP();
+    return {};
 
   ExecutionContext exe_ctx(GetExecutionContextRef());
   std::optional<uint64_t> size =
       type.GetByteSize(exe_ctx.GetBestExecutionContextScope());
   if (!size)
-    return ValueObjectSP();
+    return {};
   ValueObjectChild *synthetic_child =
       new ValueObjectChild(*this, type, name_const_str, *size, offset, 0, 0,
                            false, false, eAddressTypeInvalid, 0);
@@ -1700,7 +1699,7 @@ ValueObjectSP ValueObject::GetSyntheticBase(uint32_t offset,
     return synthetic_child_sp;
 
   if (!can_create)
-    return ValueObjectSP();
+    return {};
 
   const bool is_base_class = true;
 
@@ -1708,7 +1707,7 @@ ValueObjectSP ValueObject::GetSyntheticBase(uint32_t offset,
   std::optional<uint64_t> size =
       type.GetByteSize(exe_ctx.GetBestExecutionContextScope());
   if (!size)
-    return ValueObjectSP();
+    return {};
   ValueObjectChild *synthetic_child =
       new ValueObjectChild(*this, type, name_const_str, *size, offset, 0, 0,
                            is_base_class, false, eAddressTypeInvalid, 0);
@@ -1737,30 +1736,30 @@ static const char *SkipLeadingExpressionPathSeparators(const char *expression) {
 ValueObjectSP
 ValueObject::GetSyntheticExpressionPathChild(const char *expression,
                                              bool can_create) {
+  ValueObjectSP synthetic_child_sp;
   ConstString name_const_string(expression);
   // Check if we have already created a synthetic array member in this valid
   // object. If we have we will re-use it.
-  if (auto existing_synthetic_child = GetSyntheticChild(name_const_string))
-    return existing_synthetic_child;
-
-  // We haven't made a synthetic array member for expression yet, so lets
-  // make one and cache it for any future reference.
-  auto path_options = GetValueForExpressionPathOptions();
-  path_options.SetSyntheticChildrenTraversal(
-      GetValueForExpressionPathOptions::SyntheticChildrenTraversal::None);
-  auto synthetic_child =
-      GetValueForExpressionPath(expression, nullptr, nullptr, path_options);
-
-  if (!synthetic_child)
-    return ValueObjectSP();
-
-  // Cache the synthetic child's value because it's valid.
-  // FIXME: this causes a "real" child to end up with its name changed to
-  // the contents of expression
-  AddSyntheticChild(name_const_string, synthetic_child.get());
-  synthetic_child->SetName(
-      ConstString(SkipLeadingExpressionPathSeparators(expression)));
-  return synthetic_child;
+  synthetic_child_sp = GetSyntheticChild(name_const_string);
+  if (!synthetic_child_sp) {
+    // We haven't made a synthetic array member for expression yet, so lets
+    // make one and cache it for any future reference.
+    synthetic_child_sp = GetValueForExpressionPath(
+        expression, nullptr, nullptr,
+        GetValueForExpressionPathOptions().SetSyntheticChildrenTraversal(
+            GetValueForExpressionPathOptions::SyntheticChildrenTraversal::
+                None));
+
+    // Cache the value if we got one back...
+ if (synthetic_child_sp.get()) { + // FIXME: this causes a "real" child to end up with its name changed to + // the contents of expression + AddSyntheticChild(name_const_string, synthetic_child_sp.get()); + synthetic_child_sp->SetName( + ConstString(SkipLeadingExpressionPathSeparators(expression))); + } + } + return synthetic_child_sp; } void ValueObject::CalculateSyntheticValue() { @@ -1957,55 +1956,66 @@ ValueObjectSP ValueObject::GetValueForExpressionPath( const GetValueForExpressionPathOptions &options, ExpressionPathAftermath *final_task_on_target) { - auto dummy_stop_reason = eExpressionPathScanEndReasonUnknown; - auto dummy_value_type = eExpressionPathEndResultTypeInvalid; - auto dummy_final_task = eExpressionPathAftermathNothing; - - auto proxy_stop_reason = reason_to_stop ? reason_to_stop : &dummy_stop_reason; - auto proxy_value_type = - final_value_type ? final_value_type : &dummy_value_type; - auto proxy_final_task = - final_task_on_target ? final_task_on_target : &dummy_final_task; - - auto ret_value = GetValueForExpressionPath_Impl(expression, proxy_stop_reason, - proxy_value_type, options, - proxy_final_task); - - // The caller knows nothing happened if `final_task_on_target` doesn't change. - if (!ret_value || (*proxy_value_type) != eExpressionPathEndResultTypePlain || - !final_task_on_target) - return ValueObjectSP(); - - ExpressionPathAftermath &final_task_on_target_ref = (*final_task_on_target); - ExpressionPathScanEndReason stop_reason_for_error; - Status error; - // The method can only dereference and take the address of plain objects. - switch (final_task_on_target_ref) { - case eExpressionPathAftermathNothing: - return ret_value; - - case eExpressionPathAftermathDereference: - ret_value = ret_value->Dereference(error); - stop_reason_for_error = eExpressionPathScanEndReasonDereferencingFailed; - break; - - case eExpressionPathAftermathTakeAddress: - ret_value = ret_value->AddressOf(error); - stop_reason_for_error = eExpressionPathScanEndReasonTakingAddressFailed; - break; - } - - if (ret_value && error.Success()) { - final_task_on_target_ref = eExpressionPathAftermathNothing; - return ret_value; + ExpressionPathScanEndReason dummy_reason_to_stop = + ValueObject::eExpressionPathScanEndReasonUnknown; + ExpressionPathEndResultType dummy_final_value_type = + ValueObject::eExpressionPathEndResultTypeInvalid; + ExpressionPathAftermath dummy_final_task_on_target = + ValueObject::eExpressionPathAftermathNothing; + + ValueObjectSP ret_val = GetValueForExpressionPath_Impl( + expression, reason_to_stop ? reason_to_stop : &dummy_reason_to_stop, + final_value_type ? final_value_type : &dummy_final_value_type, options, + final_task_on_target ? final_task_on_target + : &dummy_final_task_on_target); + + if (!final_task_on_target || + *final_task_on_target == ValueObject::eExpressionPathAftermathNothing) + return ret_val; + + if (ret_val.get() && + ((final_value_type ? *final_value_type : dummy_final_value_type) == + eExpressionPathEndResultTypePlain)) // I can only deref and takeaddress + // of plain objects + { + if ((final_task_on_target ? 
*final_task_on_target + : dummy_final_task_on_target) == + ValueObject::eExpressionPathAftermathDereference) { + Status error; + ValueObjectSP final_value = ret_val->Dereference(error); + if (error.Fail() || !final_value.get()) { + if (reason_to_stop) + *reason_to_stop = + ValueObject::eExpressionPathScanEndReasonDereferencingFailed; + if (final_value_type) + *final_value_type = ValueObject::eExpressionPathEndResultTypeInvalid; + return ValueObjectSP(); + } else { + if (final_task_on_target) + *final_task_on_target = ValueObject::eExpressionPathAftermathNothing; + return final_value; + } + } + if (*final_task_on_target == + ValueObject::eExpressionPathAftermathTakeAddress) { + Status error; + ValueObjectSP final_value = ret_val->AddressOf(error); + if (error.Fail() || !final_value.get()) { + if (reason_to_stop) + *reason_to_stop = + ValueObject::eExpressionPathScanEndReasonTakingAddressFailed; + if (final_value_type) + *final_value_type = ValueObject::eExpressionPathEndResultTypeInvalid; + return ValueObjectSP(); + } else { + if (final_task_on_target) + *final_task_on_target = ValueObject::eExpressionPathAftermathNothing; + return final_value; + } + } } - - if (reason_to_stop) - *reason_to_stop = stop_reason_for_error; - - if (final_value_type) - *final_value_type = eExpressionPathEndResultTypeInvalid; - return ValueObjectSP(); + return ret_val; // final_task_on_target will still have its original value, so + // you know I did not do it } ValueObjectSP ValueObject::GetValueForExpressionPath_Impl( @@ -2676,47 +2686,39 @@ ValueObjectSP ValueObject::AddressOf(Status &error) { const bool scalar_is_load_address = false; addr_t addr = GetAddressOf(scalar_is_load_address, &address_type); error.Clear(); + if (addr != LLDB_INVALID_ADDRESS && address_type != eAddressTypeHost) { + switch (address_type) { + case eAddressTypeInvalid: { + StreamString expr_path_strm; + GetExpressionPath(expr_path_strm); + error.SetErrorStringWithFormat("'%s' is not in memory", + expr_path_strm.GetData()); + } break; - StreamString expr_path_strm; - GetExpressionPath(expr_path_strm); - const char *expr_path_str = expr_path_strm.GetData(); - - ExecutionContext exe_ctx(GetExecutionContextRef()); - auto scope = exe_ctx.GetBestExecutionContextScope(); - - if (addr == LLDB_INVALID_ADDRESS) { + case eAddressTypeFile: + case eAddressTypeLoad: { + CompilerType compiler_type = GetCompilerType(); + if (compiler_type) { + std::string name(1, '&'); + name.append(m_name.AsCString("")); + ExecutionContext exe_ctx(GetExecutionContextRef()); + m_addr_of_valobj_sp = ValueObjectConstResult::Create( + exe_ctx.GetBestExecutionContextScope(), + compiler_type.GetPointerType(), ConstString(name.c_str()), addr, + eAddressTypeInvalid, m_data.GetAddressByteSize()); + } + } break; + default: + break; + } + } else { + StreamString expr_path_strm; + GetExpressionPath(expr_path_strm); error.SetErrorStringWithFormat("'%s' doesn't have a valid address", - expr_path_str); - return ValueObjectSP(); + expr_path_strm.GetData()); } - switch (address_type) { - case eAddressTypeInvalid: - error.SetErrorStringWithFormat("'%s' is not in memory", expr_path_str); - return ValueObjectSP(); - - case eAddressTypeHost: - error.SetErrorStringWithFormat("'%s' is in host process (LLDB) memory", - expr_path_str); - return ValueObjectSP(); - - case eAddressTypeFile: - case eAddressTypeLoad: { - CompilerType compiler_type = GetCompilerType(); - if (!compiler_type) { - error.SetErrorStringWithFormat("'%s' doesn't have a compiler type", - expr_path_str); - return 
ValueObjectSP();
-    }
-
-    std::string name(1, '&');
-    name.append(m_name.AsCString(""));
-    m_addr_of_valobj_sp = ValueObjectConstResult::Create(
-        scope, compiler_type.GetPointerType(), ConstString(name.c_str()), addr,
-        eAddressTypeInvalid, m_data.GetAddressByteSize());
-    return m_addr_of_valobj_sp;
-  }
-  }
+  return m_addr_of_valobj_sp;
 }
 
 ValueObjectSP ValueObject::DoCast(const CompilerType &compiler_type) {

From dd3e6c87f3f4affd17d05a4d25fa77d224a98d94 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Wed, 24 Jan 2024 15:45:05 +0800
Subject: [PATCH 745/843] Support C++20 Modules in clang-repl (#79261)

This comes from when I was playing around with clang-repl and modules : )

I succeeded in importing std with https://libcxx.llvm.org/Modules.html and
calling `std::printf` after this patch.

I want to put the documentation part into
https://clang.llvm.org/docs/StandardCPlusPlusModules.html in a separate
commit.
---
 clang/docs/ReleaseNotes.rst               |  2 ++
 clang/include/clang/Basic/LangOptions.def |  2 +-
 clang/lib/Serialization/ASTReaderDecl.cpp |  6 ++---
 clang/test/Interpreter/cxx20-modules.cppm | 31 +++++++++++++++++++++++
 4 files changed, 37 insertions(+), 4 deletions(-)
 create mode 100644 clang/test/Interpreter/cxx20-modules.cppm

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index fc7511e913673..db3d74e124e7d 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -191,6 +191,8 @@ Crash and bug fixes
 Improvements
 ^^^^^^^^^^^^
 
+- Support importing C++20 modules in clang-repl.
+
 Moved checkers
 ^^^^^^^^^^^^^^
 
diff --git a/clang/include/clang/Basic/LangOptions.def b/clang/include/clang/Basic/LangOptions.def
index 8fc75e1cca039..1e671a7c46016 100644
--- a/clang/include/clang/Basic/LangOptions.def
+++ b/clang/include/clang/Basic/LangOptions.def
@@ -485,7 +485,7 @@ VALUE_LANGOPT(FuchsiaAPILevel, 32, 0, "Fuchsia API level")
 // on large _BitInts.
 BENIGN_VALUE_LANGOPT(MaxBitIntWidth, 32, 128, "Maximum width of a _BitInt")
 
-LANGOPT(IncrementalExtensions, 1, 0, " True if we want to process statements"
+COMPATIBLE_LANGOPT(IncrementalExtensions, 1, 0, " True if we want to process statements"
         "on the global scope, ignore EOF token and continue later on (thus "
         "avoid tearing the Lexer and etc. down). Controlled by "
         "-fincremental-extensions.")
diff --git a/clang/lib/Serialization/ASTReaderDecl.cpp b/clang/lib/Serialization/ASTReaderDecl.cpp
index a149d82153037..867f4c47eaece 100644
--- a/clang/lib/Serialization/ASTReaderDecl.cpp
+++ b/clang/lib/Serialization/ASTReaderDecl.cpp
@@ -3286,10 +3286,10 @@ DeclContext *ASTDeclReader::getPrimaryContextForMerging(ASTReader &Reader,
   if (auto *OID = dyn_cast<ObjCInterfaceDecl>(DC))
     return OID->getDefinition();
 
-  // We can see the TU here only if we have no Sema object. In that case,
-  // there's no TU scope to look in, so using the DC alone is sufficient.
+  // We can see the TU here only if we have no Sema object. It is possible
+  // we're in clang-repl so we still need to get the primary context.
   if (auto *TU = dyn_cast<TranslationUnitDecl>(DC))
-    return TU;
+    return TU->getPrimaryContext();
 
   return nullptr;
 }
diff --git a/clang/test/Interpreter/cxx20-modules.cppm b/clang/test/Interpreter/cxx20-modules.cppm
new file mode 100644
index 0000000000000..bc2b722f6b519
--- /dev/null
+++ b/clang/test/Interpreter/cxx20-modules.cppm
@@ -0,0 +1,31 @@
+// UNSUPPORTED: system-aix
+//
+// RUN: rm -rf %t
+// RUN: mkdir -p %t
+// RUN: split-file %s %t
+//
+// RUN: %clang -std=c++20 %t/mod.cppm --precompile \
+// RUN:     -o %t/mod.pcm
+// RUN: %clang %t/mod.pcm -c -o %t/mod.o
+// RUN: %clang -shared %t/mod.o -o %t/libmod.so
+//
+// RUN: cat %t/import.cpp | env LD_LIBRARY_PATH=%t:$LD_LIBRARY_PATH \
+// RUN:     clang-repl -Xcc=-std=c++20 -Xcc=-fmodule-file=M=%t/mod.pcm \
+// RUN:     | FileCheck %t/import.cpp

+//--- mod.cppm
+export module M;
+export const char* Hello() {
+  return "Hello Interpreter for Modules!";
+}
+
+//--- import.cpp
+
+%lib libmod.so
+
+import M;
+
+extern "C" int printf(const char *, ...);
+printf("%s\n", Hello());
+
+// CHECK: Hello Interpreter for Modules!

From 71d64ed80f8b7556be6954b2c4d663c7d89f476d Mon Sep 17 00:00:00 2001
From: Shengchen Kan
Date: Wed, 24 Jan 2024 15:02:37 +0800
Subject: [PATCH 746/843] [X86][Peephole] Add NDD entries for EFLAGS
 optimization

---
 llvm/lib/Target/X86/X86InstrInfo.cpp          | 236 ++++++------
 .../test/CodeGen/X86/apx/optimize-compare.mir |  73 ++++
 llvm/test/CodeGen/X86/apx/shift-eflags.ll     | 364 ++++++++++++++++++
 3 files changed, 555 insertions(+), 118 deletions(-)
 create mode 100644 llvm/test/CodeGen/X86/apx/optimize-compare.mir
 create mode 100644 llvm/test/CodeGen/X86/apx/shift-eflags.ll

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index c5c6d5a67c16e..b42d8aad48b3f 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -4809,96 +4809,96 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag,
 
   // The shift instructions only modify ZF if their shift count is non-zero.
   // N.B.: The processor truncates the shift count depending on the encoding.
-  case X86::SAR8ri:
-  case X86::SAR16ri:
-  case X86::SAR32ri:
-  case X86::SAR64ri:
-  case X86::SHR8ri:
-  case X86::SHR16ri:
-  case X86::SHR32ri:
-  case X86::SHR64ri:
+  CASE_ND(SAR8ri)
+  CASE_ND(SAR16ri)
+  CASE_ND(SAR32ri)
+  CASE_ND(SAR64ri)
+  CASE_ND(SHR8ri)
+  CASE_ND(SHR16ri)
+  CASE_ND(SHR32ri)
+  CASE_ND(SHR64ri)
    return getTruncatedShiftCount(MI, 2) != 0;
 
  // Some left shift instructions can be turned into LEA instructions but only
  // if their flags aren't used. Avoid transforming such instructions.
- case X86::SHL8ri: - case X86::SHL16ri: - case X86::SHL32ri: - case X86::SHL64ri: { + CASE_ND(SHL8ri) + CASE_ND(SHL16ri) + CASE_ND(SHL32ri) + CASE_ND(SHL64ri) { unsigned ShAmt = getTruncatedShiftCount(MI, 2); if (isTruncatedShiftCountForLEA(ShAmt)) return false; return ShAmt != 0; } - case X86::SHRD16rri8: - case X86::SHRD32rri8: - case X86::SHRD64rri8: - case X86::SHLD16rri8: - case X86::SHLD32rri8: - case X86::SHLD64rri8: + CASE_ND(SHRD16rri8) + CASE_ND(SHRD32rri8) + CASE_ND(SHRD64rri8) + CASE_ND(SHLD16rri8) + CASE_ND(SHLD32rri8) + CASE_ND(SHLD64rri8) return getTruncatedShiftCount(MI, 3) != 0; - case X86::SUB64ri32: - case X86::SUB32ri: - case X86::SUB16ri: - case X86::SUB8ri: - case X86::SUB64rr: - case X86::SUB32rr: - case X86::SUB16rr: - case X86::SUB8rr: - case X86::SUB64rm: - case X86::SUB32rm: - case X86::SUB16rm: - case X86::SUB8rm: - case X86::DEC64r: - case X86::DEC32r: - case X86::DEC16r: - case X86::DEC8r: - case X86::ADD64ri32: - case X86::ADD32ri: - case X86::ADD16ri: - case X86::ADD8ri: - case X86::ADD64rr: - case X86::ADD32rr: - case X86::ADD16rr: - case X86::ADD8rr: - case X86::ADD64rm: - case X86::ADD32rm: - case X86::ADD16rm: - case X86::ADD8rm: - case X86::INC64r: - case X86::INC32r: - case X86::INC16r: - case X86::INC8r: - case X86::ADC64ri32: - case X86::ADC32ri: - case X86::ADC16ri: - case X86::ADC8ri: - case X86::ADC64rr: - case X86::ADC32rr: - case X86::ADC16rr: - case X86::ADC8rr: - case X86::ADC64rm: - case X86::ADC32rm: - case X86::ADC16rm: - case X86::ADC8rm: - case X86::SBB64ri32: - case X86::SBB32ri: - case X86::SBB16ri: - case X86::SBB8ri: - case X86::SBB64rr: - case X86::SBB32rr: - case X86::SBB16rr: - case X86::SBB8rr: - case X86::SBB64rm: - case X86::SBB32rm: - case X86::SBB16rm: - case X86::SBB8rm: - case X86::NEG8r: - case X86::NEG16r: - case X86::NEG32r: - case X86::NEG64r: + CASE_ND(SUB64ri32) + CASE_ND(SUB32ri) + CASE_ND(SUB16ri) + CASE_ND(SUB8ri) + CASE_ND(SUB64rr) + CASE_ND(SUB32rr) + CASE_ND(SUB16rr) + CASE_ND(SUB8rr) + CASE_ND(SUB64rm) + CASE_ND(SUB32rm) + CASE_ND(SUB16rm) + CASE_ND(SUB8rm) + CASE_ND(DEC64r) + CASE_ND(DEC32r) + CASE_ND(DEC16r) + CASE_ND(DEC8r) + CASE_ND(ADD64ri32) + CASE_ND(ADD32ri) + CASE_ND(ADD16ri) + CASE_ND(ADD8ri) + CASE_ND(ADD64rr) + CASE_ND(ADD32rr) + CASE_ND(ADD16rr) + CASE_ND(ADD8rr) + CASE_ND(ADD64rm) + CASE_ND(ADD32rm) + CASE_ND(ADD16rm) + CASE_ND(ADD8rm) + CASE_ND(INC64r) + CASE_ND(INC32r) + CASE_ND(INC16r) + CASE_ND(INC8r) + CASE_ND(ADC64ri32) + CASE_ND(ADC32ri) + CASE_ND(ADC16ri) + CASE_ND(ADC8ri) + CASE_ND(ADC64rr) + CASE_ND(ADC32rr) + CASE_ND(ADC16rr) + CASE_ND(ADC8rr) + CASE_ND(ADC64rm) + CASE_ND(ADC32rm) + CASE_ND(ADC16rm) + CASE_ND(ADC8rm) + CASE_ND(SBB64ri32) + CASE_ND(SBB32ri) + CASE_ND(SBB16ri) + CASE_ND(SBB8ri) + CASE_ND(SBB64rr) + CASE_ND(SBB32rr) + CASE_ND(SBB16rr) + CASE_ND(SBB8rr) + CASE_ND(SBB64rm) + CASE_ND(SBB32rm) + CASE_ND(SBB16rm) + CASE_ND(SBB8rm) + CASE_ND(NEG8r) + CASE_ND(NEG16r) + CASE_ND(NEG32r) + CASE_ND(NEG64r) case X86::LZCNT16rr: case X86::LZCNT16rm: case X86::LZCNT32rr: @@ -4918,42 +4918,42 @@ inline static bool isDefConvertible(const MachineInstr &MI, bool &NoSignFlag, case X86::TZCNT64rr: case X86::TZCNT64rm: return true; - case X86::AND64ri32: - case X86::AND32ri: - case X86::AND16ri: - case X86::AND8ri: - case X86::AND64rr: - case X86::AND32rr: - case X86::AND16rr: - case X86::AND8rr: - case X86::AND64rm: - case X86::AND32rm: - case X86::AND16rm: - case X86::AND8rm: - case X86::XOR64ri32: - case X86::XOR32ri: - case X86::XOR16ri: - case X86::XOR8ri: - case X86::XOR64rr: - case 
X86::XOR32rr: - case X86::XOR16rr: - case X86::XOR8rr: - case X86::XOR64rm: - case X86::XOR32rm: - case X86::XOR16rm: - case X86::XOR8rm: - case X86::OR64ri32: - case X86::OR32ri: - case X86::OR16ri: - case X86::OR8ri: - case X86::OR64rr: - case X86::OR32rr: - case X86::OR16rr: - case X86::OR8rr: - case X86::OR64rm: - case X86::OR32rm: - case X86::OR16rm: - case X86::OR8rm: + CASE_ND(AND64ri32) + CASE_ND(AND32ri) + CASE_ND(AND16ri) + CASE_ND(AND8ri) + CASE_ND(AND64rr) + CASE_ND(AND32rr) + CASE_ND(AND16rr) + CASE_ND(AND8rr) + CASE_ND(AND64rm) + CASE_ND(AND32rm) + CASE_ND(AND16rm) + CASE_ND(AND8rm) + CASE_ND(XOR64ri32) + CASE_ND(XOR32ri) + CASE_ND(XOR16ri) + CASE_ND(XOR8ri) + CASE_ND(XOR64rr) + CASE_ND(XOR32rr) + CASE_ND(XOR16rr) + CASE_ND(XOR8rr) + CASE_ND(XOR64rm) + CASE_ND(XOR32rm) + CASE_ND(XOR16rm) + CASE_ND(XOR8rm) + CASE_ND(OR64ri32) + CASE_ND(OR32ri) + CASE_ND(OR16ri) + CASE_ND(OR8ri) + CASE_ND(OR64rr) + CASE_ND(OR32rr) + CASE_ND(OR16rr) + CASE_ND(OR8rr) + CASE_ND(OR64rm) + CASE_ND(OR32rm) + CASE_ND(OR16rm) + CASE_ND(OR8rm) case X86::ANDN32rr: case X86::ANDN32rm: case X86::ANDN64rr: @@ -5035,10 +5035,10 @@ static X86::CondCode isUseDefConvertible(const MachineInstr &MI) { switch (MI.getOpcode()) { default: return X86::COND_INVALID; - case X86::NEG8r: - case X86::NEG16r: - case X86::NEG32r: - case X86::NEG64r: + CASE_ND(NEG8r) + CASE_ND(NEG16r) + CASE_ND(NEG32r) + CASE_ND(NEG64r) return X86::COND_AE; case X86::LZCNT16rr: case X86::LZCNT32rr: diff --git a/llvm/test/CodeGen/X86/apx/optimize-compare.mir b/llvm/test/CodeGen/X86/apx/optimize-compare.mir new file mode 100644 index 0000000000000..7eabeb30f2ee4 --- /dev/null +++ b/llvm/test/CodeGen/X86/apx/optimize-compare.mir @@ -0,0 +1,73 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -o - %s -mtriple=x86_64-- -run-pass peephole-opt -mattr=+ndd | FileCheck %s + +--- +name: opt_redundant_flags_0 +body: | + bb.0: + ; CHECK-LABEL: name: opt_redundant_flags_0 + ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY $esi + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $edi + ; CHECK-NEXT: [[SUB32rr_ND:%[0-9]+]]:gr32 = SUB32rr_ND [[COPY]], [[COPY1]], implicit-def $eflags + ; CHECK-NEXT: $eax = COPY [[SUB32rr_ND]] + ; CHECK-NEXT: $bl = SETCCr 2, implicit $eflags + %0:gr32 = COPY $esi + %1:gr32 = COPY $edi + %2:gr32 = SUB32rr_ND %0, %1, implicit-def dead $eflags + $eax = COPY %2 + ; CMP should be removed. + CMP32rr %0, %1, implicit-def $eflags + $bl = SETCCr 2, implicit $eflags +... +--- +name: opt_redundant_flags_1 +body: | + bb.0: + ; CHECK-LABEL: name: opt_redundant_flags_1 + ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY $esi + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $edi + ; CHECK-NEXT: [[SUB32rr_ND:%[0-9]+]]:gr32 = SUB32rr_ND [[COPY]], [[COPY1]], implicit-def $eflags + ; CHECK-NEXT: $eax = COPY [[SUB32rr_ND]] + ; CHECK-NEXT: $bl = SETCCr 6, implicit $eflags + %0:gr32 = COPY $esi + %1:gr32 = COPY $edi + %2:gr32 = SUB32rr_ND %0, %1, implicit-def dead $eflags + $eax = COPY %2 + ; CMP should be removed. + CMP32rr %1, %0, implicit-def $eflags + $bl = SETCCr 3, implicit $eflags +... 
+---
+name: opt_redundant_flags_2
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: opt_redundant_flags_2
+    ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY $esi
+    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $edi
+    ; CHECK-NEXT: [[SUB32rr_ND:%[0-9]+]]:gr32 = SUB32rr_ND [[COPY]], [[COPY1]], implicit-def $eflags
+    ; CHECK-NEXT: $cl = SETCCr 2, implicit $eflags
+    ; CHECK-NEXT: $eax = COPY [[SUB32rr_ND]]
+    ; CHECK-NEXT: $bl = SETCCr 2, implicit $eflags
+    %0:gr32 = COPY $esi
+    %1:gr32 = COPY $edi
+    %2:gr32 = SUB32rr_ND %0, %1, implicit-def $eflags
+    ; an extra eflags reader shouldn't stop optimization.
+    $cl = SETCCr 2, implicit $eflags
+    $eax = COPY %2
+    CMP32rr %0, %1, implicit-def $eflags
+    $bl = SETCCr 2, implicit $eflags
+...
+---
+name: opt_zerocmp_user_0
+body: |
+  bb.0:
+    ; CHECK-LABEL: name: opt_zerocmp_user_0
+    ; CHECK: [[COPY:%[0-9]+]]:gr32 = COPY $esi
+    ; CHECK-NEXT: [[NEG32r_ND:%[0-9]+]]:gr32 = NEG32r_ND [[COPY]], implicit-def $eflags
+    ; CHECK-NEXT: $al = SETCCr 3, implicit $eflags
+    %0:gr32 = COPY $esi
+    %1:gr32 = NEG32r_ND %0, implicit-def dead $eflags
+    ; TEST should be removed.
+    TEST32rr %0, %0, implicit-def $eflags
+    $al = SETCCr 4, implicit $eflags
+...
diff --git a/llvm/test/CodeGen/X86/apx/shift-eflags.ll b/llvm/test/CodeGen/X86/apx/shift-eflags.ll
new file mode 100644
index 0000000000000..ed181aa116238
--- /dev/null
+++ b/llvm/test/CodeGen/X86/apx/shift-eflags.ll
@@ -0,0 +1,364 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-- -mattr=+ndd | FileCheck %s
+
+; Use shift eflags result when it won't cause stalls
+
+; ashr by constant - use sarl eflags result
+define i32 @ashr_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: ashr_const:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    sarl $14, %edi, %edx
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %s = ashr i32 %a0, 14
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; lshr by constant - simplify to test
+define i32 @lshr_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: lshr_const:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    testl $-16384, %edi # imm = 0xC000
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %s = lshr i32 %a0, 14
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; shl by constant - simplify to test
+define i32 @shl_const(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: shl_const:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    testl $262143, %edi # imm = 0x3FFFF
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %s = shl i32 %a0, 14
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; ashr by constant and using shift result - use sarl eflags result
+define i32 @ashr_const_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: ashr_const_self_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sarl $14, %edi, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = ashr i32 %a0, 14
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}
+
+; lshr by constant and using shift result - use shrl eflags result
+define i32 @lshr_const_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: lshr_const_self_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    shrl $14, %edi, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = lshr i32 %a0, 14
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}
+
+; shl by constant and using result - use shll eflags result
+define i32 @shl_const_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: shl_const_self_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    shll $14, %edi, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = shl i32 %a0, 14
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}
+
+; ashr by 1 - use sarl eflags result
+define i32 @ashr_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: ashr_const1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    sarl $1, %edi, %edx
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %s = ashr i32 %a0, 1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; lshr by 1 - simplify to test
+define i32 @lshr_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: lshr_const1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    testl $-2, %edi
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %s = lshr i32 %a0, 1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; shl by 1 - simplify to test
+define i32 @shl_const1(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: shl_const1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    testl $2147483647, %edi # imm = 0x7FFFFFFF
+; CHECK-NEXT:    cmovnel %ecx, %eax
+; CHECK-NEXT:    retq
+  %s = shl i32 %a0, 1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; ashr by 1 and using shift result - use sarl eflags result
+define i32 @ashr_const1_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: ashr_const1_self_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    sarl $1, %edi, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = ashr i32 %a0, 1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}
+
+; lshr by 1 and using shift result - use shrl eflags result
+define i32 @lshr_const1_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: lshr_const1_self_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    shrl $1, %edi, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = lshr i32 %a0, 1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}
+
+; shl by 1 and using result - use addl eflags result
+define i32 @shl_const1_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: shl_const1_self_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    addl %edi, %edi, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = shl i32 %a0, 1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}
+
+; ashr by variable - use separate test
+define i32 @ashr_var(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: ashr_var:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    sarl %cl, %edi, %ecx
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    cmovel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = ashr i32 %a0, %a1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; lshr by variable - use separate test
+define i32 @lshr_var(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: lshr_var:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    shrl %cl, %edi, %ecx
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    cmovel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = lshr i32 %a0, %a1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; shl by variable - use separate test
+define i32 @shl_var(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: shl_var:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    shll %cl, %edi, %ecx
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    cmovel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = shl i32 %a0, %a1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; ashr by variable and using result - use separate test
+define i32 @ashr_var_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: ashr_var_self_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    sarl %cl, %edi, %eax
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = ashr i32 %a0, %a1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}
+
+; lshr by variable and using result - use separate test
+define i32 @lshr_var_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: lshr_var_self_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    shrl %cl, %edi, %eax
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = lshr i32 %a0, %a1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}
+
+; shl by variable and using result - use separate test
+define i32 @shl_var_self_select(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: shl_var_self_select:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %esi, %ecx
+; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-NEXT:    shll %cl, %edi, %eax
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %s = shl i32 %a0, %a1
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}
+
+; ashr by non-zero variable - use separate test
+define i32 @ashr_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: ashr_var_amt_never_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    orb $1, %sil, %cl
+; CHECK-NEXT:    sarl %cl, %edi, %ecx
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    cmovel %edx, %eax
+; CHECK-NEXT:    retq
+  %a = or i32 %a1, 1
+  %s = ashr i32 %a0, %a
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; lshr by non-zero variable - use separate test
+define i32 @lshr_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: lshr_var_amt_never_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    orb $1, %sil, %cl
+; CHECK-NEXT:    shrl %cl, %edi, %ecx
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    cmovel %edx, %eax
+; CHECK-NEXT:    retq
+  %a = or i32 %a1, 1
+  %s = lshr i32 %a0, %a
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; shl by non-zero variable - use separate test
+define i32 @shl_var_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: shl_var_amt_never_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    movl %ecx, %eax
+; CHECK-NEXT:    orb $1, %sil, %cl
+; CHECK-NEXT:    shll %cl, %edi, %ecx
+; CHECK-NEXT:    testl %ecx, %ecx
+; CHECK-NEXT:    cmovel %edx, %eax
+; CHECK-NEXT:    retq
+  %a = or i32 %a1, 1
+  %s = shl i32 %a0, %a
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %a2, i32 %a3
+  ret i32 %r
+}
+
+; ashr by non-zero variable and using result - use separate test
+define i32 @ashr_var_self_select_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: ashr_var_self_select_amt_never_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    orb $1, %sil, %cl
+; CHECK-NEXT:    shrl %cl, %edi, %eax
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %a = or i32 %a1, 1
+  %s = lshr i32 %a0, %a
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}
+
+; lshr by non-zero variable and using result - use separate test
+define i32 @lshr_var_self_select_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: lshr_var_self_select_amt_never_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    orb $1, %sil, %cl
+; CHECK-NEXT:    shrl %cl, %edi, %eax
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %a = or i32 %a1, 1
+  %s = lshr i32 %a0, %a
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}
+
+; shl by non-zero variable and using result - use separate test
+define i32 @shl_var_self_select_amt_never_zero(i32 %a0, i32 %a1, i32 %a2, i32 %a3) {
+; CHECK-LABEL: shl_var_self_select_amt_never_zero:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    orb $1, %sil, %cl
+; CHECK-NEXT:    shrl %cl, %edi, %eax
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    cmovnel %edx, %eax
+; CHECK-NEXT:    retq
+  %a = or i32 %a1, 1
+  %s = lshr i32 %a0, %a
+  %c = icmp eq i32 %s, 0
+  %r = select i1 %c, i32 %s, i32 %a2
+  ret i32 %r
+}

From c1259650e742e7b3053f6520729b4c1f44c27814 Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Wed, 24 Jan 2024 16:10:47 +0800
Subject: [PATCH 747/843] [NFC] Add more requirements to
 clang/test/Interpreter/cxx20-modules.cppm

The previous test can't work on other platforms like PS4. This addresses
the comments in
https://github.com/llvm/llvm-project/pull/79261#issuecomment-1907589030
---
 clang/test/Interpreter/cxx20-modules.cppm | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/test/Interpreter/cxx20-modules.cppm b/clang/test/Interpreter/cxx20-modules.cppm
index bc2b722f6b519..2c6eba5255191 100644
--- a/clang/test/Interpreter/cxx20-modules.cppm
+++ b/clang/test/Interpreter/cxx20-modules.cppm
@@ -1,3 +1,4 @@
+// REQUIRES: host-supports-jit, x86_64-linux
 // UNSUPPORTED: system-aix
 //
 // RUN: rm -rf %t

From 11ca56eaf1aaf4f7fba5d94562c6b6c0d4444bc3 Mon Sep 17 00:00:00 2001
From: Felix Kellenbenz <107758057+felixkellenbenz@users.noreply.github.com>
Date: Wed, 24 Jan 2024 09:12:16 +0100
Subject: [PATCH 748/843] [llvm-objcopy] Don't remove .gnu_debuglink section
 when using --strip-all (#78919)

This fixes the issue mentioned here:
https://github.com/llvm/llvm-project/issues/57407

It prevents `llvm-objcopy` from removing the `.gnu_debuglink` section when
used with the `--strip-all` flag. Since `--strip-all` is the default of
`llvm-strip`, the patch also prevents `llvm-strip` from removing the
`.gnu_debuglink` section.
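The effect is easy to see end to end. A hypothetical session (the object names are placeholders; the flags below are existing llvm-objcopy, llvm-strip, and llvm-readelf options):

  $ llvm-objcopy --only-keep-debug a.out a.out.debug
  $ llvm-objcopy --add-gnu-debuglink=a.out.debug a.out
  $ llvm-strip --strip-all a.out -o a.out.stripped
  $ llvm-readelf --sections a.out.stripped | grep gnu_debuglink

Before this patch the last command printed nothing, because --strip-all discarded the section; with it, the .gnu_debuglink entry survives stripping.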
--- llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp            | 2 ++
 llvm/test/tools/llvm-objcopy/ELF/strip-all.test | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
index daf03810fd7bf..b6d77d17bae36 100644
--- a/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
+++ b/llvm/lib/ObjCopy/ELF/ELFObjcopy.cpp
@@ -450,6 +450,8 @@ static Error replaceAndRemoveSections(const CommonConfig &Config,
         return false;
       if (StringRef(Sec.Name).starts_with(".gnu.warning"))
         return false;
+      if (StringRef(Sec.Name).starts_with(".gnu_debuglink"))
+        return false;
       // We keep the .ARM.attribute section to maintain compatibility
       // with Debian derived distributions. This is a bug in their
       // patchset as documented here:
diff --git a/llvm/test/tools/llvm-objcopy/ELF/strip-all.test b/llvm/test/tools/llvm-objcopy/ELF/strip-all.test
index 20f3c9addf556..ba27a3509c6a3 100644
--- a/llvm/test/tools/llvm-objcopy/ELF/strip-all.test
+++ b/llvm/test/tools/llvm-objcopy/ELF/strip-all.test
@@ -68,16 +68,19 @@ Sections:
     Flags: [ ]
   - Name: .gnu.warning.foo
     Type: SHT_PROGBITS
+  - Name: .gnu_debuglink
+    Type: SHT_PROGBITS
 ProgramHeaders:
 # Use an arbitrary segment type to show that the segment type is unimportant.
   - Type: 0x61234567
     FirstSec: non_alloc_in_segment
     LastSec: non_alloc_in_segment

-# CHECK: SectionHeaderCount: 6
+# CHECK: SectionHeaderCount: 7

 # CHECK: Name: non_alloc_in_segment
 # CHECK: Name: .bss
 # CHECK: Name: .text
 # CHECK: Name: .gnu.warning.foo
+# CHECK: Name: .gnu_debuglink
 # CHECK: Name: .shstrtab

From bae1adae1c7cdf3b0bd618fc9cd5af251dc901ed Mon Sep 17 00:00:00 2001
From: Chuanqi Xu
Date: Wed, 24 Jan 2024 16:08:15 +0800
Subject: [PATCH 749/843] [docs] [C++20] [Modules] Document how to import
 modules in clang-repl

We now support importing C++20 named modules in clang-repl, as of
https://github.com/llvm/llvm-project/commit/dd3e6c87f3f4affd17d05a4d25fa77d224a98d94.
So we should document how we can do that.
---
 clang/docs/StandardCPlusPlusModules.rst | 39 +++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/clang/docs/StandardCPlusPlusModules.rst b/clang/docs/StandardCPlusPlusModules.rst
index 81043ff25be02..4e853990a7338 100644
--- a/clang/docs/StandardCPlusPlusModules.rst
+++ b/clang/docs/StandardCPlusPlusModules.rst
@@ -1213,6 +1213,45 @@ instead a real binary. There are 4 potential solutions to the problem:
 
       $ clang-scan-deps -format=p1689 -- <path-to-compiler-executable>/clang++ -std=c++20 -resource-dir <resource-dir> mod.cppm -c -o mod.o
 
+Import modules with clang-repl
+==============================
+
+We're able to import C++20 named modules with clang-repl.
+
+Let's start with a simple example:
+
+.. code-block:: c++
+
+  // M.cppm
+  export module M;
+  export const char* Hello() {
+    return "Hello Interpreter for Modules!";
+  }
+
+We still need to compile the named module ahead of time.
+
+.. code-block:: console
+
+  $ clang++ -std=c++20 M.cppm --precompile -o M.pcm
+  $ clang++ M.pcm -c -o M.o
+  $ clang++ -shared M.o -o libM.so
+
+Note that we need to compile the module unit into a dynamic library so that
+clang-repl can load the object files of the module units.
+
+Then we are able to import module ``M`` in clang-repl.
+
+.. code-block:: console
+
+  $ clang-repl -Xcc=-std=c++20 -Xcc=-fprebuilt-module-path=.
+  # We need to load the dynamic library first before importing the modules.
+  clang-repl> %lib libM.so
+  clang-repl> import M;
+  clang-repl> extern "C" int printf(const char *, ...);
+  clang-repl> printf("%s\n", Hello());
+  Hello Interpreter for Modules!
+  clang-repl> %quit
+
 Possible Questions
 ==================
 
From d50705ed5d482ccd9b9afabea2b6358d056b7543 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?=
Date: Wed, 24 Jan 2024 08:18:08 +0000
Subject: [PATCH 750/843] [mlir][vector] Support scalable vec in
 `TransferReadAfterWriteToBroadcast` (#79162)

Makes `TransferReadAfterWriteToBroadcast` correctly propagate
scalability flags.
---
 mlir/lib/Dialect/Vector/IR/VectorOps.cpp   |  9 +++++++--
 mlir/test/Dialect/Vector/canonicalize.mlir | 18 ++++++++++++++++++
 2 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
index 791924f92e8ad..5f9860ef2311e 100644
--- a/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
+++ b/mlir/lib/Dialect/Vector/IR/VectorOps.cpp
@@ -4079,10 +4079,15 @@ struct TransferReadAfterWriteToBroadcast
     // final shape we want.
     ArrayRef<int64_t> destShape = readOp.getVectorType().getShape();
     SmallVector<int64_t> broadcastShape(destShape.size());
-    for (const auto &pos : llvm::enumerate(permutation))
+    SmallVector<bool> broadcastScalableFlags(destShape.size());
+    for (const auto &pos : llvm::enumerate(permutation)) {
       broadcastShape[pos.value()] = destShape[pos.index()];
+      broadcastScalableFlags[pos.value()] =
+          readOp.getVectorType().getScalableDims()[pos.index()];
+    }
     VectorType broadcastedType = VectorType::get(
-        broadcastShape, defWrite.getVectorType().getElementType());
+        broadcastShape, defWrite.getVectorType().getElementType(),
+        broadcastScalableFlags);
     vec = rewriter.create<vector::BroadcastOp>(loc, broadcastedType, vec);
     SmallVector<unsigned> transposePerm(permutation.begin(), permutation.end());
     rewriter.replaceOpWithNewOp<vector::TransposeOp>(readOp, vec,
diff --git a/mlir/test/Dialect/Vector/canonicalize.mlir b/mlir/test/Dialect/Vector/canonicalize.mlir
index feefb0c174aab..e6f045e12e519 100644
--- a/mlir/test/Dialect/Vector/canonicalize.mlir
+++ b/mlir/test/Dialect/Vector/canonicalize.mlir
@@ -1302,6 +1302,24 @@ func.func @store_to_load_tensor_broadcast(%arg0 : tensor<4x4xf32>,
 
 // -----
 
+// CHECK-LABEL: func @store_to_load_tensor_broadcast_scalable
+// CHECK-SAME: (%[[ARG:.*]]: tensor<?xf32>, %[[V0:.*]]: vector<[4]xf32>)
+// CHECK: %[[B:.*]] = vector.broadcast %[[V0]] : vector<[4]xf32> to vector<6x[4]xf32>
+// CHECK: return %[[B]] : vector<6x[4]xf32>
+func.func @store_to_load_tensor_broadcast_scalable(%arg0 : tensor<?xf32>,
+  %v0 : vector<[4]xf32>) -> vector<6x[4]xf32> {
+  %c0 = arith.constant 0 : index
+  %cf0 = arith.constant 0.0 : f32
+  %w0 = vector.transfer_write %v0, %arg0[%c0] {in_bounds = [true]} :
+    vector<[4]xf32>, tensor<?xf32>
+  %0 = vector.transfer_read %w0[%c0], %cf0 {in_bounds = [true, true],
+    permutation_map = affine_map<(d0) -> (0, d0)>} :
+    tensor<?xf32>, vector<6x[4]xf32>
+  return %0 : vector<6x[4]xf32>
+}
+
+// -----
+
 // CHECK-LABEL: func @store_to_load_tensor_perm_broadcast
 // CHECK-SAME: (%[[ARG:.*]]: tensor<4x4x4xf32>, %[[V0:.*]]: vector<4x1xf32>)
 // CHECK: %[[B:.*]] = vector.broadcast %[[V0]] : vector<4x1xf32> to vector<100x5x4x1xf32>
From f3b495f5842bd3c8a5433ba4b784d49c663f84af Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Wed, 24 Jan 2024 15:17:16 +0700
Subject: [PATCH 751/843] [RISCV] Add tests for reverse shuffles of i1 vectors.
 NFC

This is to add test coverage for a change in #73342
---
 .../test/Analysis/CostModel/RISCV/shuffle-reverse.ll | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
index 17deeb2cfafa6..146909cc93df1 100644
--- a/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/shuffle-reverse.ll
@@ -9,6 +9,10 @@
 define void @reverse() {
 ;
 ; CHECK-LABEL: 'reverse'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -34,6 +38,10 @@ define void @reverse() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
 ; SIZE-LABEL: 'reverse'
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+; SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
@@ -58,6 +66,10 @@ define void @reverse() {
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v4f64 = shufflevector <4 x double> undef, <4 x double> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
+  %v2i1 = shufflevector <2 x i1> undef, <2 x i1> undef, <2 x i32> <i32 1, i32 0>
+  %v4i1 = shufflevector <4 x i1> undef, <4 x i1> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+  %v8i1 = shufflevector <8 x i1> undef, <8 x i1> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
+  %v16i1 = shufflevector <16 x i1> undef, <16 x i1> undef, <16 x i32> <i32 15, i32 14, i32 13, i32 12, i32 11, i32 10, i32 9, i32 8, i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
   %v2i8 = shufflevector <2 x i8> undef, <2 x i8> undef, <2 x i32> <i32 1, i32 0>
   %v4i8 = shufflevector <4 x i8> undef, <4 x i8> undef, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
   %v8i8 = shufflevector <8 x i8> undef, <8 x i8> undef, <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
From 34466019e74fe455f6c67bab1d48222a08bede4d Mon Sep 17 00:00:00 2001
From: Adrian Kuegel
Date: Wed, 24 Jan 2024 08:18:57 +0000
Subject: [PATCH 752/843] [mlir][Bazel] Add missing dependency after
 750e90e4403df23d6b271afb90e6b4d463739965

---
utils/bazel/llvm-project-overlay/mlir/BUILD.bazel | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel index 632450f666587..2e75c48510b9c 100644 --- a/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/mlir/BUILD.bazel @@ -8031,6 +8031,7 @@ cc_library( deps = [ ":AMDGPUDialect", ":ArithDialect", + ":ArithUtils", ":ConversionPassIncGen", ":ConvertToLLVMInterface", ":IR", @@ -8053,8 +8054,8 @@ cc_library( ]), includes = ["include"], deps = [ - ":ArmSMEDialect", ":ArithDialect", + ":ArmSMEDialect", ":ConversionPassIncGen", ":IR", ":Pass", From 5404a3792ed58b94b938bbf5cfe6eeb23c664efc Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 24 Jan 2024 00:27:33 -0800 Subject: [PATCH 753/843] [Driver] Use StringRef::consume_front (NFC) --- clang/lib/Driver/Driver.cpp | 10 +++------- clang/lib/Driver/ToolChains/CommonArgs.cpp | 4 +--- clang/lib/Driver/ToolChains/Hexagon.cpp | 3 +-- 3 files changed, 5 insertions(+), 12 deletions(-) diff --git a/clang/lib/Driver/Driver.cpp b/clang/lib/Driver/Driver.cpp index 7109faa1072de..190a73bfd40b6 100644 --- a/clang/lib/Driver/Driver.cpp +++ b/clang/lib/Driver/Driver.cpp @@ -6482,18 +6482,15 @@ bool Driver::GetReleaseVersion(StringRef Str, unsigned &Major, unsigned &Minor, return false; if (Str.empty()) return true; - if (Str[0] != '.') + if (!Str.consume_front(".")) return false; - Str = Str.drop_front(1); - if (Str.consumeInteger(10, Minor)) return false; if (Str.empty()) return true; - if (Str[0] != '.') + if (!Str.consume_front(".")) return false; - Str = Str.drop_front(1); if (Str.consumeInteger(10, Micro)) return false; @@ -6521,9 +6518,8 @@ bool Driver::GetReleaseVersion(StringRef Str, Digits[CurDigit] = Digit; if (Str.empty()) return true; - if (Str[0] != '.') + if (!Str.consume_front(".")) return false; - Str = Str.drop_front(1); CurDigit++; } diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index fadaf3e60c661..48534bc1669e4 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -317,9 +317,7 @@ void tools::handleTargetFeaturesGroup(const Driver &D, continue; } - bool IsNegative = Name.starts_with("no-"); - if (IsNegative) - Name = Name.substr(3); + bool IsNegative = Name.consume_front("no-"); Features.push_back(Args.MakeArgString((IsNegative ? "-" : "+") + Name)); } diff --git a/clang/lib/Driver/ToolChains/Hexagon.cpp b/clang/lib/Driver/ToolChains/Hexagon.cpp index d1eed931be5f1..8a5f8f1489931 100644 --- a/clang/lib/Driver/ToolChains/Hexagon.cpp +++ b/clang/lib/Driver/ToolChains/Hexagon.cpp @@ -801,7 +801,6 @@ StringRef HexagonToolChain::GetTargetCPUVersion(const ArgList &Args) { CpuArg = A; StringRef CPU = CpuArg ? 
CpuArg->getValue() : GetDefaultCPU(); - if (CPU.starts_with("hexagon")) - return CPU.substr(sizeof("hexagon") - 1); + CPU.consume_front("hexagon"); return CPU; } From 873a7bb12949709ea406c8adc82cd69fc372527d Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 24 Jan 2024 00:27:35 -0800 Subject: [PATCH 754/843] [Transforms] Use llvm::pred_size and llvm::predecessors (NFC) --- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 8603c5cf9c022..c58df063d8c53 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1431,16 +1431,14 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { array_pod_sort(AvailablePreds.begin(), AvailablePreds.end()); // Create a PHI node at the start of the block for the PRE'd load value. - pred_iterator PB = pred_begin(LoadBB), PE = pred_end(LoadBB); - PHINode *PN = PHINode::Create(LoadI->getType(), std::distance(PB, PE), ""); + PHINode *PN = PHINode::Create(LoadI->getType(), pred_size(LoadBB), ""); PN->insertBefore(LoadBB->begin()); PN->takeName(LoadI); PN->setDebugLoc(LoadI->getDebugLoc()); // Insert new entries into the PHI for each predecessor. A single block may // have multiple entries here. - for (pred_iterator PI = PB; PI != PE; ++PI) { - BasicBlock *P = *PI; + for (BasicBlock *P : predecessors(LoadBB)) { AvailablePredsTy::iterator I = llvm::lower_bound(AvailablePreds, std::make_pair(P, (Value *)nullptr)); From 18a3c7a01ed983d19e47a03e664200ffff53689c Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 24 Jan 2024 00:27:37 -0800 Subject: [PATCH 755/843] [AMDGPU] Use llvm::none_of (NFC) --- llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp index e5fbcca1e7d16..88429e3f0e218 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp @@ -293,7 +293,7 @@ static bool imageIntrinsicOptimizerImpl(Function &F, const TargetMachine *TM) { Module *M = F.getParent(); // Early test to determine if the intrinsics are used. - if (std::none_of(M->begin(), M->end(), [](Function &F) { + if (llvm::none_of(*M, [](Function &F) { return !F.users().empty() && (F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2dmsaa || F.getIntrinsicID() == Intrinsic::amdgcn_image_load_2darraymsaa); From b0763a1ae940d60d8f558f85216382bc6695a1e3 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 24 Jan 2024 00:27:38 -0800 Subject: [PATCH 756/843] [DebugInfo] Use std::size (NFC) --- llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp index 29949ee021456..b9cf7d22c80d4 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp @@ -761,8 +761,7 @@ DWARFFormValue::getAsFile(DILineInfoSpecifier::FileLineInfoKind Kind) const { bool llvm::dwarf::doesFormBelongToClass(dwarf::Form Form, DWARFFormValue::FormClass FC, uint16_t DwarfVersion) { // First, check DWARF5 form classes. 
-  if (Form < ArrayRef(DWARF5FormClasses).size() &&
-      DWARF5FormClasses[Form] == FC)
+  if (Form < std::size(DWARF5FormClasses) && DWARF5FormClasses[Form] == FC)
     return true;
   // Check more forms from extensions and proposals.
   switch (Form) {

From 33ecef9812e2c9bfadef035b8e34a949acae2abc Mon Sep 17 00:00:00 2001
From: Shengchen Kan
Date: Wed, 24 Jan 2024 17:10:28 +0800
Subject: [PATCH 757/843] [X86][CodeGen] Fix crash when commuting operands of
 an instruction for code size (#79245)

Reported in 134fcc62786d31ab73439201dce2d73808d1785a.

An incorrect opcode was used because there is a `[[fallthrough]]` at line
2386.
---
 llvm/lib/Target/X86/X86InstrInfo.cpp        | 35 +++++++++------------
 llvm/test/CodeGen/X86/commute-blend-avx2.ll |  9 ++++++
 2 files changed, 23 insertions(+), 21 deletions(-)

diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index b42d8aad48b3f..975d12d271d83 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -2326,33 +2326,26 @@ MachineInstr *X86InstrInfo::commuteInstructionImpl(MachineInstr &MI, bool NewMI,
   case X86::VBLENDPSrri:
     // If we're optimizing for size, try to use MOVSD/MOVSS.
     if (MI.getParent()->getParent()->getFunction().hasOptSize()) {
-      unsigned Mask;
-      switch (Opc) {
-      default:
-        llvm_unreachable("Unreachable!");
-      case X86::BLENDPDrri:
-        Opc = X86::MOVSDrr;
-        Mask = 0x03;
-        break;
-      case X86::BLENDPSrri:
-        Opc = X86::MOVSSrr;
-        Mask = 0x0F;
-        break;
-      case X86::VBLENDPDrri:
-        Opc = X86::VMOVSDrr;
-        Mask = 0x03;
-        break;
-      case X86::VBLENDPSrri:
-        Opc = X86::VMOVSSrr;
-        Mask = 0x0F;
-        break;
-      }
+      unsigned Mask = (Opc == X86::BLENDPDrri || Opc == X86::VBLENDPDrri) ? 0x03 : 0x0F;
       if ((MI.getOperand(3).getImm() ^ Mask) == 1) {
+#define FROM_TO(FROM, TO)                                                      \
+  case X86::FROM:                                                              \
+    Opc = X86::TO;                                                             \
+    break;
+        switch (Opc) {
+        default:
+          llvm_unreachable("Unreachable!");
+          FROM_TO(BLENDPDrri, MOVSDrr)
+          FROM_TO(BLENDPSrri, MOVSSrr)
+          FROM_TO(VBLENDPDrri, VMOVSDrr)
+          FROM_TO(VBLENDPSrri, VMOVSSrr)
+        }
         WorkingMI = CloneIfNew(MI);
         WorkingMI->setDesc(get(Opc));
         WorkingMI->removeOperand(3);
         break;
       }
+#undef FROM_TO
     }
     [[fallthrough]];
   case X86::PBLENDWrri:

diff --git a/llvm/test/CodeGen/X86/commute-blend-avx2.ll b/llvm/test/CodeGen/X86/commute-blend-avx2.ll
index b5ffe78d29a61..75511104580e9 100644
--- a/llvm/test/CodeGen/X86/commute-blend-avx2.ll
+++ b/llvm/test/CodeGen/X86/commute-blend-avx2.ll
@@ -88,3 +88,12 @@ define <4 x double> @commute_fold_vblendpd_256(<4 x double> %a, ptr %b) #0 {
   ret <4 x double> %2
 }
 declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+define <4 x float> @commute_vblendpd_128_for_code_size(<4 x float> %a, <4 x float> %b) optsize {
+; CHECK-LABEL: commute_vblendpd_128_for_code_size:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1],xmm1[2,3]
+; CHECK-NEXT:    retq
+  %r = shufflevector <4 x float> %b, <4 x float> %a, <4 x i32>
+  ret <4 x float> %r
+}

From a7a1b8b17e264fb0f2d2b4165cf9a7f5094b08b3 Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Wed, 24 Jan 2024 10:15:42 +0100
Subject: [PATCH 758/843] [MSSAUpdater] Handle simplified accesses when
 updating phis (#78272)

This is a followup to #76819. After those changes, we can still run into
an assertion failure for a slight variation of the test case: When fixing
up MemoryPhis, we map the incoming access to the access of the cloned
instruction -- which may now no longer exist.
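To make the failure mode concrete, the following is a minimal standalone
sketch of the lookup-with-fallback idea involved here. The types and names
are hypothetical and for illustration only; this is not the actual
MemorySSA API:

  // Standalone sketch (hypothetical types, not the real MemorySSA API).
  // Each access points at its defining access, forming a chain that can
  // be walked upwards.
  #include <cassert>
  #include <cstdio>
  #include <map>

  struct Access {
    int Id;           // stand-in for the associated memory instruction
    Access *Defining; // next access up the chain (nullptr = live-on-entry)
  };

  // Records which original accesses received a clone. Accesses whose
  // instruction was simplified away during cloning have no entry.
  using CloneMap = std::map<Access *, Access *>;

  // Mirrors the spirit of getNewDefiningAccessForClone(): if the incoming
  // access itself was not cloned (its instruction was simplified away),
  // fall back to the closest defining access that does have a clone.
  Access *getNewDefiningAccess(Access *Incoming, const CloneMap &VMap) {
    for (Access *A = Incoming; A; A = A->Defining)
      if (auto It = VMap.find(A); It != VMap.end())
        return It->second;
    return nullptr; // reached live-on-entry without finding a clone
  }

  int main() {
    Access LiveOnEntry{0, nullptr}, Def1{1, &LiveOnEntry}, Def2{2, &Def1};
    Access CloneOfDef1{11, nullptr};
    // Def2's instruction was simplified away, so only Def1 has a clone.
    CloneMap VMap = {{&Def1, &CloneOfDef1}};
    Access *New = getNewDefiningAccess(&Def2, VMap);
    assert(New == &CloneOfDef1);
    std::printf("fell back to the clone of access %d\n", New->Id);
  }

In the sketch, a plain map lookup would find nothing for Def2 and leave the
new MemoryPhi with a dangling incoming value; walking up the chain of
defining accesses is what the fix below relies on.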
Fix this by reusing the getNewDefiningAccessForClone() helper, which will
look upwards for a new defining access in that case.
---
 llvm/lib/Analysis/MemorySSAUpdater.cpp        |  22 +---
 .../memssa-readnone-access.ll                 | 104 ++++++++++++++++++
 2 files changed, 107 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp
index e87ae7d71fffe..aa550f0b6a7bf 100644
--- a/llvm/lib/Analysis/MemorySSAUpdater.cpp
+++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp
@@ -692,25 +692,9 @@ void MemorySSAUpdater::updateForClonedLoop(const LoopBlocksRPO &LoopBlocks,
       continue;

     // Determine incoming value and add it as incoming from IncBB.
-    if (MemoryUseOrDef *IncMUD = dyn_cast<MemoryUseOrDef>(IncomingAccess)) {
-      if (!MSSA->isLiveOnEntryDef(IncMUD)) {
-        Instruction *IncI = IncMUD->getMemoryInst();
-        assert(IncI && "Found MemoryUseOrDef with no Instruction.");
-        if (Instruction *NewIncI =
-                cast_or_null<Instruction>(VMap.lookup(IncI))) {
-          IncMUD = MSSA->getMemoryAccess(NewIncI);
-          assert(IncMUD &&
-                 "MemoryUseOrDef cannot be null, all preds processed.");
-        }
-      }
-      NewPhi->addIncoming(IncMUD, IncBB);
-    } else {
-      MemoryPhi *IncPhi = cast<MemoryPhi>(IncomingAccess);
-      if (MemoryAccess *NewDefPhi = MPhiMap.lookup(IncPhi))
-        NewPhi->addIncoming(NewDefPhi, IncBB);
-      else
-        NewPhi->addIncoming(IncPhi, IncBB);
-    }
+    NewPhi->addIncoming(
+        getNewDefiningAccessForClone(IncomingAccess, VMap, MPhiMap, MSSA),
+        IncBB);
   }

   if (auto *SingleAccess = onlySingleValue(NewPhi)) {
     MPhiMap[Phi] = SingleAccess;

diff --git a/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll b/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll
index 2aaf777683e11..c6e6608d4be38 100644
--- a/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll
+++ b/llvm/test/Transforms/SimpleLoopUnswitch/memssa-readnone-access.ll
@@ -115,3 +115,107 @@ split:
 exit:
   ret void
 }
+
+; Variants of the above test with swapped branch destinations.
+
+define void @test1_swapped(i1 %c) {
+; CHECK-LABEL: define void @test1_swapped(
+; CHECK-SAME: i1 [[C:%.*]]) {
+; CHECK-NEXT:  start:
+; CHECK-NEXT:    [[C_FR:%.*]] = freeze i1 [[C]]
+; CHECK-NEXT:    br i1 [[C_FR]], label [[START_SPLIT_US:%.*]], label [[START_SPLIT:%.*]]
+; CHECK:       start.split.us:
+; CHECK-NEXT:    br label [[LOOP_US:%.*]]
+; CHECK:       loop.us:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    br label [[LOOP_US]]
+; CHECK:       start.split:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+start:
+  br label %loop
+
+loop:
+  %fn = load ptr, ptr @vtable, align 8
+  call void %fn()
+  br i1 %c, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test2_swapped(i1 %c, ptr %p) {
+; CHECK-LABEL: define void @test2_swapped(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[C_FR:%.*]] = freeze i1 [[C]]
+; CHECK-NEXT:    br i1 [[C_FR]], label [[DOTSPLIT_US:%.*]], label [[DOTSPLIT:%.*]]
+; CHECK:       .split.us:
+; CHECK-NEXT:    br label [[LOOP_US:%.*]]
+; CHECK:       loop.us:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    br label [[LOOP_US]]
+; CHECK:       .split:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+  br label %loop
+
+loop:
+  %fn = load ptr, ptr @vtable, align 8
+  call void %fn()
+  call void @bar()
+  br i1 %c, label %loop, label %exit
+
+exit:
+  ret void
+}
+
+define void @test3_swapped(i1 %c, ptr %p) {
+; CHECK-LABEL: define void @test3_swapped(
+; CHECK-SAME: i1 [[C:%.*]], ptr [[P:%.*]]) {
+; CHECK-NEXT:    [[C_FR:%.*]] = freeze i1 [[C]]
+; CHECK-NEXT:    br i1 [[C_FR]], label [[DOTSPLIT_US:%.*]], label [[DOTSPLIT:%.*]]
+; CHECK:       .split.us:
+; CHECK-NEXT:    br label [[LOOP_US:%.*]]
+; CHECK:       loop.us:
+; CHECK-NEXT:    br label [[SPLIT_US:%.*]]
+; CHECK:       split.us:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    br label [[LOOP_US]]
+; CHECK:       .split:
+; CHECK-NEXT:    br label [[LOOP:%.*]]
+; CHECK:       loop:
+; CHECK-NEXT:    br label [[SPLIT:%.*]]
+; CHECK:       split:
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    call void @bar()
+; CHECK-NEXT:    br label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+  br label %loop
+
+loop:
+  %fn = load ptr, ptr @vtable, align 8
+  br label %split
+
+split:
+  call void %fn()
+  call void @bar()
+  br i1 %c, label %loop, label %exit
+
+exit:
+  ret void
+}

From 303e64826b79af0c0e67ba06d5d8e1385adc32b5 Mon Sep 17 00:00:00 2001
From: Shengchen Kan
Date: Wed, 24 Jan 2024 17:23:38 +0800
Subject: [PATCH 759/843] [X86][NFC] Remove dead code for "_REV" instructions

ADC/SBB with reverse encoding is never emitted by the compiler before
encoding optimization, which is called after flag-copy lowering.
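For readers unfamiliar with the X-macro style opcode table used by
getMnemonicFromOpcode, here is a minimal standalone sketch of the pattern
(illustrative enum and macro names only, not the real X86 opcode list).
Removing a single LLVM_EXPAND_INSTR_SIZES-style line, as this patch does
for the _REV variants, deletes an entire family of case labels at once:

  // Standalone sketch of the X-macro case-label expansion pattern
  // (illustrative names, not the actual X86 opcode enumeration).
  #include <cstdio>

  enum Opcode { ADC8rr, ADC16rr, ADC32rr, ADC8rm, ADC16rm, ADC32rm, OTHER };

  const char *classify(Opcode Op) {
    switch (Op) {
  // Expands to three case labels via token pasting, e.g. ADC8rr.
  #define EXPAND_INSTR_SIZES(MNEMONIC, SUFFIX)                                 \
    case MNEMONIC##8##SUFFIX:                                                  \
    case MNEMONIC##16##SUFFIX:                                                 \
    case MNEMONIC##32##SUFFIX:
  #define EXPAND_ADC(MNEMONIC)                                                 \
    EXPAND_INSTR_SIZES(MNEMONIC, rr)                                           \
    EXPAND_INSTR_SIZES(MNEMONIC, rm)
      EXPAND_ADC(ADC) // expands to six case labels
      return "add-with-carry";
  #undef EXPAND_ADC
  #undef EXPAND_INSTR_SIZES
    default:
      return "other";
    }
  }

  int main() { std::printf("%s\n", classify(ADC16rm)); } // add-with-carry

In the sketch, dropping EXPAND_INSTR_SIZES(MNEMONIC, rm) from EXPAND_ADC
would route the rm opcodes to the default branch instead, which is the
analogue of what the deletion below does for the never-emitted _REV forms.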
This is a partial reland for 8bbf100799a97f8342bf1a8409c6fb48f03e837f
---
 llvm/lib/Target/X86/X86FlagsCopyLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index b13bf361ab79b..aad839b83ee19 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -173,7 +173,6 @@ static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
 #define LLVM_EXPAND_ADC_SBB_INSTR(MNEMONIC)                                    \
   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr)                                        \
-  LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rr_REV)                                    \
   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, rm)                                        \
   LLVM_EXPAND_INSTR_SIZES(MNEMONIC, mr)                                        \
   case X86::MNEMONIC##8ri:                                                     \

From 543cf08636f3a3bb55dddba2e8cad787601647ba Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Wed, 24 Jan 2024 10:45:20 +0100
Subject: [PATCH 760/843] [PhaseOrdering] Add additional test for #79161 (NFC)

---
 .../X86/loop-vectorizer-noalias.ll            | 147 ++++++++++++++++++
 1 file changed, 147 insertions(+)
 create mode 100644 llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll

diff --git a/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll b/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll
new file mode 100644
index 0000000000000..846787f721ba7
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll
@@ -0,0 +1,147 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -O3 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s
+
+define internal void @acc(ptr noalias noundef %val, ptr noalias noundef %prev) {
+entry:
+  %0 = load i8, ptr %prev, align 1
+  %conv = zext i8 %0 to i32
+  %1 = load i8, ptr %val, align 1
+  %conv1 = zext i8 %1 to i32
+  %add = add nsw i32 %conv1, %conv
+  %conv2 = trunc i32 %add to i8
+  store i8 %conv2, ptr %val, align 1
+  ret void
+}
+
+; This loop should not get vectorized.
+; FIXME: This is a miscompile.
+define void @accsum(ptr noundef %vals, i64 noundef %num) #0 { +; CHECK-LABEL: define void @accsum( +; CHECK-SAME: ptr nocapture noundef [[VALS:%.*]], i64 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[NUM]], 1 +; CHECK-NEXT: br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] +; CHECK: iter.check: +; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[NUM]], -1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM]], 9 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[NUM]], 33 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], -32 +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[VALS]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 -1 +; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]]) +; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 15 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i8> [[WIDE_LOAD5]], [[WIDE_LOAD]] +; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[WIDE_LOAD6]], [[WIDE_LOAD4]] +; CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP1]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP4]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 +; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[IND_END9:%.*]] = or disjoint i64 [[N_VEC]], 1 +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 24 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY_PREHEADER]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; CHECK-NEXT: [[N_VEC8:%.*]] = and i64 [[TMP0]], -8 +; CHECK-NEXT: [[IND_END:%.*]] = or disjoint i64 [[N_VEC8]], 1 +; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] 
+; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX12:%.*]] = or disjoint i64 [[INDEX11]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[VALS]], i64 [[OFFSET_IDX12]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 -1 +; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]]) +; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3]]) +; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i8> [[WIDE_LOAD14]], [[WIDE_LOAD13]] +; CHECK-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP8]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX11]], 8 +; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]] +; CHECK-NEXT: br i1 [[CMP_N10]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]] +; CHECK: for.body.preheader: +; CHECK-NEXT: [[I_02_PH:%.*]] = phi i64 [ 1, [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[I_02_PH]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[VALS]], i64 [[I_02]] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[ARRAYIDX]], i64 -1 +; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]]) +; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3]]) +; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[ADD_I:%.*]] = add i8 [[TMP13]], [[TMP12]] +; CHECK-NEXT: store i8 [[ADD_I]], ptr [[ARRAYIDX]], align 1, !alias.scope [[META0]], !noalias [[META3]] +; CHECK-NEXT: [[INC]] = add nuw i64 [[I_02]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUM]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK: for.end: +; CHECK-NEXT: ret void +; +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %i.0 = phi i64 [ 1, %entry ], [ %inc, %for.inc ] + %cmp = icmp ult i64 %i.0, %num + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + br label %for.end + +for.body: ; preds = %for.cond + %arrayidx = getelementptr inbounds i8, ptr %vals, i64 %i.0 + %sub = sub i64 %i.0, 1 + %arrayidx1 = getelementptr inbounds i8, ptr %vals, i64 %sub + call void @acc(ptr noundef %arrayidx, ptr noundef %arrayidx1) + br label %for.inc + +for.inc: ; preds = %for.body + %inc = add i64 %i.0, 1 + br label %for.cond + +for.end: ; preds = %for.cond.cleanup + ret void +} + +; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: 
readwrite)
+declare void @llvm.lifetime.start.p0(i64 immarg, ptr nocapture) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: readwrite)
+declare void @llvm.lifetime.end.p0(i64 immarg, ptr nocapture) #1
+
+attributes #0 = { "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87"}
+;.
+; CHECK: [[META0]] = !{[[META1:![0-9]+]]}
+; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]], !"acc: %val"}
+; CHECK: [[META2]] = distinct !{[[META2]], !"acc"}
+; CHECK: [[META3]] = !{[[META4:![0-9]+]]}
+; CHECK: [[META4]] = distinct !{[[META4]], [[META2]], !"acc: %prev"}
+; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]}
+; CHECK: [[META6]] = !{!"llvm.loop.isvectorized", i32 1}
+; CHECK: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"}
+; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]], [[META7]]}
+; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]], [[META6]]}
+;.

From cd7ea4ea657ea41b42fcbd0e6b33faa46608d18e Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Wed, 24 Jan 2024 11:20:16 +0100
Subject: [PATCH 761/843] [LAA] Drop alias scope metadata that is not valid
 across iterations (#79161)

LAA currently adds memory locations with their original AATags to AST.
However, scoped alias AATags may be valid only within one loop iteration,
while LAA reasons across iterations.

Fix this by determining which alias scopes are defined inside the loop,
and dropping AATags that reference these scopes.

Fixes https://github.com/llvm/llvm-project/issues/79137.
---
 llvm/lib/Analysis/LoopAccessAnalysis.cpp      | 53 ++++++++++--
 .../LoopAccessAnalysis/noalias-scope-decl.ll  | 11 ++-
 .../X86/loop-vectorizer-noalias.ll            | 83 ++-----------------
 3 files changed, 63 insertions(+), 84 deletions(-)

diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 7e67c90152829..dd6b88fee415a 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -657,16 +657,18 @@ class AccessAnalysis {

   AccessAnalysis(Loop *TheLoop, AAResults *AA, LoopInfo *LI,
                  MemoryDepChecker::DepCandidates &DA,
-                 PredicatedScalarEvolution &PSE)
-      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE) {
+                 PredicatedScalarEvolution &PSE,
+                 SmallPtrSetImpl<MDNode *> &LoopAliasScopes)
+      : TheLoop(TheLoop), BAA(*AA), AST(BAA), LI(LI), DepCands(DA), PSE(PSE),
+        LoopAliasScopes(LoopAliasScopes) {
     // We're analyzing dependences across loop iterations.
     BAA.enableCrossIterationMode();
   }

   /// Register a load and whether it is only read from.
   void addLoad(MemoryLocation &Loc, Type *AccessTy, bool IsReadOnly) {
-    Value *Ptr = const_cast<Value *>(Loc.Ptr);
-    AST.add(Loc.getWithNewSize(LocationSize::beforeOrAfterPointer()));
+    Value *Ptr = const_cast<Value *>(Loc.Ptr);
+    AST.add(adjustLoc(Loc));
     Accesses[MemAccessInfo(Ptr, false)].insert(AccessTy);
     if (IsReadOnly)
       ReadOnlyPtr.insert(Ptr);
@@ -674,8 +676,8 @@ class AccessAnalysis {

   /// Register a store.
   void addStore(MemoryLocation &Loc, Type *AccessTy) {
-    Value *Ptr = const_cast<Value *>(Loc.Ptr);
-    AST.add(Loc.getWithNewSize(LocationSize::beforeOrAfterPointer()));
+    Value *Ptr = const_cast<Value *>(Loc.Ptr);
+    AST.add(adjustLoc(Loc));
     Accesses[MemAccessInfo(Ptr, true)].insert(AccessTy);
   }

@@ -731,6 +733,32 @@ class AccessAnalysis {
 private:
   typedef MapVector> PtrAccessMap;

+  /// Adjust the MemoryLocation so that it represents accesses to this
+  /// location across all iterations, rather than a single one.
+  MemoryLocation adjustLoc(MemoryLocation Loc) const {
+    // The accessed location varies within the loop, but remains within the
+    // underlying object.
+    Loc.Size = LocationSize::beforeOrAfterPointer();
+    Loc.AATags.Scope = adjustAliasScopeList(Loc.AATags.Scope);
+    Loc.AATags.NoAlias = adjustAliasScopeList(Loc.AATags.NoAlias);
+    return Loc;
+  }
+
+  /// Drop alias scopes that are only valid within a single loop iteration.
+  MDNode *adjustAliasScopeList(MDNode *ScopeList) const {
+    if (!ScopeList)
+      return nullptr;
+
+    // For the sake of simplicity, drop the whole scope list if any scope is
+    // iteration-local.
+    if (any_of(ScopeList->operands(), [&](Metadata *Scope) {
+          return LoopAliasScopes.contains(cast<MDNode>(Scope));
+        }))
+      return nullptr;
+
+    return ScopeList;
+  }
+
   /// Go over all memory access and check whether runtime pointer checks
   /// are needed and build sets of dependency check candidates.
   void processMemAccesses();
@@ -775,6 +803,10 @@ class AccessAnalysis {
   PredicatedScalarEvolution &PSE;

   DenseMap> UnderlyingObjects;
+
+  /// Alias scopes that are declared inside the loop, and as such not valid
+  /// across iterations.
+  SmallPtrSetImpl<MDNode *> &LoopAliasScopes;
 };

 } // end anonymous namespace

@@ -2283,6 +2315,7 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
   // Holds the Load and Store instructions.
   SmallVector Loads;
   SmallVector Stores;
+  SmallPtrSet LoopAliasScopes;

   // Holds all the different accesses in the loop.
   unsigned NumReads = 0;
@@ -2326,6 +2359,11 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
       if (HasComplexMemInst)
        continue;

+      // Record alias scopes defined inside the loop.
+      if (auto *Decl = dyn_cast<NoAliasScopeDeclInst>(&I))
+        for (Metadata *Op : Decl->getScopeList()->operands())
+          LoopAliasScopes.insert(cast<MDNode>(Op));
+
       // Many math library functions read the rounding mode. We will only
       // vectorize a loop if it contains known function calls that don't set
       // the flag. Therefore, it is safe to ignore this read from memory.
@@ -2407,7 +2445,8 @@ void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
   }

   MemoryDepChecker::DepCandidates DependentAccesses;
-  AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE);
+  AccessAnalysis Accesses(TheLoop, AA, LI, DependentAccesses, *PSE,
+                          LoopAliasScopes);

   // Holds the analyzed pointers. We don't want to call getUnderlyingObjects
   // multiple times on the same object. If the ptr is accessed twice, once

diff --git a/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll b/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll
index 98bb5f99a40a1..fb296f5089422 100644
--- a/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll
+++ b/llvm/test/Analysis/LoopAccessAnalysis/noalias-scope-decl.ll
@@ -7,8 +7,17 @@ define void @test_scope_in_loop(ptr %arg, i64 %num) {
 ; CHECK-LABEL: 'test_scope_in_loop'
 ; CHECK-NEXT:    loop:
-; CHECK-NEXT:      Memory dependences are safe
+; CHECK-NEXT:      Report: unsafe dependent memory operations in loop. Use #pragma clang loop distribute(enable) to allow loop distribution to attempt to isolate the offending operations into a separate loop
+; CHECK-NEXT:      Backward loop carried data dependence.
; CHECK-NEXT: Dependences: +; CHECK-NEXT: Backward: +; CHECK-NEXT: %load.prev = load i8, ptr %prev.ptr, align 1, !alias.scope !0, !noalias !3 -> +; CHECK-NEXT: store i8 %add, ptr %cur.ptr, align 1, !alias.scope !3 +; CHECK-EMPTY: +; CHECK-NEXT: Forward: +; CHECK-NEXT: %load.cur = load i8, ptr %cur.ptr, align 1, !alias.scope !3 -> +; CHECK-NEXT: store i8 %add, ptr %cur.ptr, align 1, !alias.scope !3 +; CHECK-EMPTY: ; CHECK-NEXT: Run-time memory checks: ; CHECK-NEXT: Grouped accesses: ; CHECK-EMPTY: diff --git a/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll b/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll index 846787f721ba7..5c85c0d21f59f 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/loop-vectorizer-noalias.ll @@ -14,89 +14,25 @@ entry: } ; This loop should not get vectorized. -; FIXME: This is a miscompile. define void @accsum(ptr noundef %vals, i64 noundef %num) #0 { ; CHECK-LABEL: define void @accsum( ; CHECK-SAME: ptr nocapture noundef [[VALS:%.*]], i64 noundef [[NUM:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i64 [[NUM]], 1 -; CHECK-NEXT: br i1 [[CMP1]], label [[ITER_CHECK:%.*]], label [[FOR_END:%.*]] -; CHECK: iter.check: -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[NUM]], -1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUM]], 9 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] -; CHECK: vector.main.loop.iter.check: -; CHECK-NEXT: [[MIN_ITERS_CHECK3:%.*]] = icmp ult i64 [[NUM]], 33 -; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK3]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] -; CHECK: vector.ph: -; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP0]], -32 -; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] -; CHECK: vector.body: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[VALS]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 -1 -; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0:![0-9]+]]) -; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]]) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 15 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[TMP2]], align 1, !alias.scope [[META3]], !noalias [[META0]] -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <16 x i8>, ptr [[TMP3]], align 1, !alias.scope [[META3]], !noalias [[META0]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <16 x i8>, ptr [[TMP1]], align 1, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <16 x i8>, ptr [[TMP4]], align 1, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: [[TMP5:%.*]] = add <16 x i8> [[WIDE_LOAD5]], [[WIDE_LOAD]] -; CHECK-NEXT: [[TMP6:%.*]] = add <16 x i8> [[WIDE_LOAD6]], [[WIDE_LOAD4]] -; CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP1]], align 1, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: store <16 x i8> [[TMP6]], ptr [[TMP4]], align 1, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], 
label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] -; CHECK: middle.block: -; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END]], label [[VEC_EPILOG_ITER_CHECK:%.*]] -; CHECK: vec.epilog.iter.check: -; CHECK-NEXT: [[IND_END9:%.*]] = or disjoint i64 [[N_VEC]], 1 -; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = and i64 [[TMP0]], 24 -; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp eq i64 [[N_VEC_REMAINING]], 0 -; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[FOR_BODY_PREHEADER]], label [[VEC_EPILOG_PH]] -; CHECK: vec.epilog.ph: -; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] -; CHECK-NEXT: [[N_VEC8:%.*]] = and i64 [[TMP0]], -8 -; CHECK-NEXT: [[IND_END:%.*]] = or disjoint i64 [[N_VEC8]], 1 -; CHECK-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] -; CHECK: vec.epilog.vector.body: -; CHECK-NEXT: [[INDEX11:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT15:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] -; CHECK-NEXT: [[OFFSET_IDX12:%.*]] = or disjoint i64 [[INDEX11]], 1 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[VALS]], i64 [[OFFSET_IDX12]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 -1 -; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]]) -; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3]]) -; CHECK-NEXT: [[WIDE_LOAD13:%.*]] = load <8 x i8>, ptr [[TMP9]], align 1, !alias.scope [[META3]], !noalias [[META0]] -; CHECK-NEXT: [[WIDE_LOAD14:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: [[TMP10:%.*]] = add <8 x i8> [[WIDE_LOAD14]], [[WIDE_LOAD13]] -; CHECK-NEXT: store <8 x i8> [[TMP10]], ptr [[TMP8]], align 1, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: [[INDEX_NEXT15]] = add nuw i64 [[INDEX11]], 8 -; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT15]], [[N_VEC8]] -; CHECK-NEXT: br i1 [[TMP11]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] -; CHECK: vec.epilog.middle.block: -; CHECK-NEXT: [[CMP_N10:%.*]] = icmp eq i64 [[TMP0]], [[N_VEC8]] -; CHECK-NEXT: br i1 [[CMP_N10]], label [[FOR_END]], label [[FOR_BODY_PREHEADER]] +; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.preheader: -; CHECK-NEXT: [[I_02_PH:%.*]] = phi i64 [ 1, [[ITER_CHECK]] ], [ [[IND_END9]], [[VEC_EPILOG_ITER_CHECK]] ], [ [[IND_END]], [[VEC_EPILOG_MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[LOAD_INITIAL:%.*]] = load i8, ptr [[VALS]], align 1 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[I_02_PH]], [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[STORE_FORWARDED:%.*]] = phi i8 [ [[LOAD_INITIAL]], [[FOR_BODY_PREHEADER]] ], [ [[ADD_I:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ 1, [[FOR_BODY_PREHEADER]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[VALS]], i64 [[I_02]] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr i8, ptr [[ARRAYIDX]], i64 -1 -; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META0]]) -; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3]]) -; CHECK-NEXT: [[TMP12:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1, !alias.scope [[META3]], 
!noalias [[META0]] -; CHECK-NEXT: [[TMP13:%.*]] = load i8, ptr [[ARRAYIDX]], align 1, !alias.scope [[META0]], !noalias [[META3]] -; CHECK-NEXT: [[ADD_I:%.*]] = add i8 [[TMP13]], [[TMP12]] +; CHECK-NEXT: [[TMP0:%.*]] = load i8, ptr [[ARRAYIDX]], align 1, !alias.scope [[META0:![0-9]+]], !noalias [[META3:![0-9]+]] +; CHECK-NEXT: [[ADD_I]] = add i8 [[TMP0]], [[STORE_FORWARDED]] ; CHECK-NEXT: store i8 [[ADD_I]], ptr [[ARRAYIDX]], align 1, !alias.scope [[META0]], !noalias [[META3]] ; CHECK-NEXT: [[INC]] = add nuw i64 [[I_02]], 1 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUM]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void ; @@ -139,9 +75,4 @@ attributes #0 = { "target-features"="+cmov,+cx8,+fxsr,+mmx,+sse,+sse2,+x87"} ; CHECK: [[META2]] = distinct !{[[META2]], !"acc"} ; CHECK: [[META3]] = !{[[META4:![0-9]+]]} ; CHECK: [[META4]] = distinct !{[[META4]], [[META2]], !"acc: %prev"} -; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META6:![0-9]+]], [[META7:![0-9]+]]} -; CHECK: [[META6]] = !{!"llvm.loop.isvectorized", i32 1} -; CHECK: [[META7]] = !{!"llvm.loop.unroll.runtime.disable"} -; CHECK: [[LOOP8]] = distinct !{[[LOOP8]], [[META6]], [[META7]]} -; CHECK: [[LOOP9]] = distinct !{[[LOOP9]], [[META7]], [[META6]]} ;. From 4a582845597e97d245e8ffdc14281f922b835e56 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 24 Jan 2024 11:22:43 +0100 Subject: [PATCH 762/843] [clang] Refactor Builtins.def to be a tablegen file (#68324) This makes the builtins list quite a bit more verbose, but IMO this is a huge win in terms of readability. --- clang/include/clang/AST/Expr.h | 17 +- clang/include/clang/Basic/Builtins.def | 1703 +------ clang/include/clang/Basic/Builtins.h | 2 +- clang/include/clang/Basic/Builtins.td | 4537 +++++++++++++++++ clang/include/clang/Basic/BuiltinsBPF.def | 33 - clang/include/clang/Basic/BuiltinsBPF.td | 37 + clang/include/clang/Basic/BuiltinsBase.td | 121 + clang/include/clang/Basic/CMakeLists.txt | 8 + clang/include/clang/Basic/TargetBuiltins.h | 2 +- clang/include/module.modulemap | 2 - clang/lib/AST/StmtPrinter.cpp | 2 +- clang/lib/Basic/Builtins.cpp | 2 +- clang/lib/Basic/Targets/BPF.cpp | 2 +- clang/lib/Sema/OpenCLBuiltins.td | 12 +- clang/lib/Sema/SemaChecking.cpp | 24 +- clang/lib/Sema/SemaExpr.cpp | 2 +- clang/test/Analysis/bstring.c | 10 +- clang/test/CodeGen/callback_pthread_create.c | 2 +- clang/utils/TableGen/CMakeLists.txt | 1 + clang/utils/TableGen/ClangBuiltinsEmitter.cpp | 334 ++ clang/utils/TableGen/MveEmitter.cpp | 2 +- clang/utils/TableGen/TableGen.cpp | 6 + clang/utils/TableGen/TableGenBackends.h | 2 + 23 files changed, 5089 insertions(+), 1774 deletions(-) create mode 100644 clang/include/clang/Basic/Builtins.td delete mode 100644 clang/include/clang/Basic/BuiltinsBPF.def create mode 100644 clang/include/clang/Basic/BuiltinsBPF.td create mode 100644 clang/include/clang/Basic/BuiltinsBase.td create mode 100644 clang/utils/TableGen/ClangBuiltinsEmitter.cpp diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h index 9820bd11da860..59f0aee2c0ced 100644 --- a/clang/include/clang/AST/Expr.h +++ b/clang/include/clang/AST/Expr.h @@ -6410,7 +6410,7 @@ class AtomicExpr : public Expr { enum AtomicOp { #define BUILTIN(ID, TYPE, ATTRS) #define ATOMIC_BUILTIN(ID, TYPE, ATTRS) AO ## ID, -#include "clang/Basic/Builtins.def" +#include "clang/Basic/Builtins.inc" // 
Avoid trailing comma BI_First = 0 }; @@ -6476,7 +6476,7 @@ class AtomicExpr : public Expr { #define ATOMIC_BUILTIN(ID, TYPE, ATTRS) \ case AO##ID: \ return #ID; -#include "clang/Basic/Builtins.def" +#include "clang/Basic/Builtins.inc" } llvm_unreachable("not an atomic operator?"); } @@ -6505,8 +6505,8 @@ class AtomicExpr : public Expr { } bool isOpenCL() const { - return getOp() >= AO__opencl_atomic_init && - getOp() <= AO__opencl_atomic_fetch_max; + return getOp() >= AO__opencl_atomic_compare_exchange_strong && + getOp() <= AO__opencl_atomic_store; } SourceLocation getBuiltinLoc() const { return BuiltinLoc; } @@ -6531,11 +6531,14 @@ class AtomicExpr : public Expr { /// \return empty atomic scope model if the atomic op code does not have /// scope operand. static std::unique_ptr getScopeModel(AtomicOp Op) { - if (Op >= AO__opencl_atomic_load && Op <= AO__opencl_atomic_fetch_max) + // FIXME: Allow grouping of builtins to be able to only check >= and <= + if (Op >= AO__opencl_atomic_compare_exchange_strong && + Op <= AO__opencl_atomic_store && Op != AO__opencl_atomic_init) return AtomicScopeModel::create(AtomicScopeModelKind::OpenCL); - else if (Op >= AO__hip_atomic_load && Op <= AO__hip_atomic_fetch_max) + if (Op >= AO__hip_atomic_compare_exchange_strong && + Op <= AO__hip_atomic_store) return AtomicScopeModel::create(AtomicScopeModelKind::HIP); - else if (Op >= AO__scoped_atomic_load && Op <= AO__scoped_atomic_fetch_max) + if (Op >= AO__scoped_atomic_add_fetch && Op <= AO__scoped_atomic_xor_fetch) return AtomicScopeModel::create(AtomicScopeModelKind::Generic); return AtomicScopeModel::create(AtomicScopeModelKind::None); } diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def index 4dcbaf8a7beaa..f356f881d5ef9 100644 --- a/clang/include/clang/Basic/Builtins.def +++ b/clang/include/clang/Basic/Builtins.def @@ -5,17 +5,9 @@ // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// -// -// This file defines the standard builtin function database. Users of this file -// must define the BUILTIN macro to make use of this information. -// -//===----------------------------------------------------------------------===// -// FIXME: This should really be a .td file, but that requires modifying tblgen. -// Perhaps tblgen should have plugins. - -// The first value provided to the macro specifies the function name of the -// builtin, and results in a clang::builtin::BIXX enum value for XX. +// This is only documentation for the database layout. 
This will be removed once +// all builtin databases are converted to tablegen files // The second value provided to the macro specifies the type of the function // (result value, then each argument) as follows: @@ -108,1694 +100,3 @@ // M_0, ..., M_k as payload // z -> this is a function in (possibly-versioned) namespace std // E -> this function can be constant evaluated by Clang frontend -// FIXME: gcc has nonnull - -#if defined(BUILTIN) && !defined(LIBBUILTIN) -# define LIBBUILTIN(ID, TYPE, ATTRS, HEADER, BUILTIN_LANG) BUILTIN(ID, TYPE, ATTRS) -#endif - -#if defined(BUILTIN) && !defined(LANGBUILTIN) -# define LANGBUILTIN(ID, TYPE, ATTRS, BUILTIN_LANG) BUILTIN(ID, TYPE, ATTRS) -#endif - -// Standard libc/libm functions: -BUILTIN(__builtin_atan2 , "ddd" , "Fne") -BUILTIN(__builtin_atan2f, "fff" , "Fne") -BUILTIN(__builtin_atan2l, "LdLdLd", "Fne") -BUILTIN(__builtin_atan2f128, "LLdLLdLLd", "Fne") -BUILTIN(__builtin_abs , "ii" , "ncF") -BUILTIN(__builtin_copysign, "ddd", "ncFE") -BUILTIN(__builtin_copysignf, "fff", "ncFE") -BUILTIN(__builtin_copysignf16, "hhh", "ncF") -BUILTIN(__builtin_copysignl, "LdLdLd", "ncFE") -BUILTIN(__builtin_copysignf128, "LLdLLdLLd", "ncFE") -BUILTIN(__builtin_fabs , "dd" , "ncFE") -BUILTIN(__builtin_fabsf, "ff" , "ncFE") -BUILTIN(__builtin_fabsl, "LdLd", "ncFE") -BUILTIN(__builtin_fabsf16, "hh" , "ncF") -BUILTIN(__builtin_fabsf128, "LLdLLd", "ncFE") -BUILTIN(__builtin_fmod , "ddd" , "Fne") -BUILTIN(__builtin_fmodf, "fff" , "Fne") -BUILTIN(__builtin_fmodf16, "hhh" , "Fne") -BUILTIN(__builtin_fmodl, "LdLdLd", "Fne") -BUILTIN(__builtin_fmodf128, "LLdLLdLLd", "Fne") -BUILTIN(__builtin_frexp , "ddi*" , "Fn") -BUILTIN(__builtin_frexpf, "ffi*" , "Fn") -BUILTIN(__builtin_frexpl, "LdLdi*", "Fn") -BUILTIN(__builtin_frexpf128, "LLdLLdi*", "Fn") -BUILTIN(__builtin_frexpf16, "hhi*" , "Fn") -BUILTIN(__builtin_huge_val, "d", "ncE") -BUILTIN(__builtin_huge_valf, "f", "ncE") -BUILTIN(__builtin_huge_vall, "Ld", "ncE") -BUILTIN(__builtin_huge_valf16, "x", "ncE") -BUILTIN(__builtin_huge_valf128, "LLd", "ncE") -BUILTIN(__builtin_inf , "d" , "ncE") -BUILTIN(__builtin_inff , "f" , "ncE") -BUILTIN(__builtin_infl , "Ld" , "ncE") -BUILTIN(__builtin_inff16 , "x" , "ncE") -BUILTIN(__builtin_inff128 , "LLd" , "ncE") -BUILTIN(__builtin_labs , "LiLi" , "Fnc") -BUILTIN(__builtin_llabs, "LLiLLi", "Fnc") -BUILTIN(__builtin_ldexp , "ddi" , "Fne") -BUILTIN(__builtin_ldexpf, "ffi" , "Fne") -BUILTIN(__builtin_ldexpl, "LdLdi", "Fne") -BUILTIN(__builtin_ldexpf128, "LLdLLdi", "Fne") -BUILTIN(__builtin_ldexpf16, "hhi", "Fne") -BUILTIN(__builtin_modf , "ddd*" , "Fn") -BUILTIN(__builtin_modff, "fff*" , "Fn") -BUILTIN(__builtin_modfl, "LdLdLd*", "Fn") -BUILTIN(__builtin_modff128, "LLdLLdLLd*", "Fn") -BUILTIN(__builtin_nan, "dcC*" , "FnUE") -BUILTIN(__builtin_nanf, "fcC*" , "FnUE") -BUILTIN(__builtin_nanl, "LdcC*", "FnUE") -BUILTIN(__builtin_nanf16, "xcC*", "FnUE") -BUILTIN(__builtin_nanf128, "LLdcC*", "FnUE") -BUILTIN(__builtin_nans, "dcC*" , "FnUE") -BUILTIN(__builtin_nansf, "fcC*" , "FnUE") -BUILTIN(__builtin_nansl, "LdcC*", "FnUE") -BUILTIN(__builtin_nansf16, "xcC*", "FnUE") -BUILTIN(__builtin_nansf128, "LLdcC*", "FnUE") -BUILTIN(__builtin_powi , "ddi" , "Fnc") -BUILTIN(__builtin_powif, "ffi" , "Fnc") -BUILTIN(__builtin_powil, "LdLdi", "Fnc") -BUILTIN(__builtin_pow , "ddd" , "Fne") -BUILTIN(__builtin_powf, "fff" , "Fne") -BUILTIN(__builtin_powf16, "hhh" , "Fne") -BUILTIN(__builtin_powl, "LdLdLd", "Fne") -BUILTIN(__builtin_powf128, "LLdLLdLLd", "Fne") - -// Standard unary libc/libm functions with 
double/float/long double variants: -BUILTIN(__builtin_acos , "dd" , "Fne") -BUILTIN(__builtin_acosf, "ff" , "Fne") -BUILTIN(__builtin_acosl, "LdLd", "Fne") -BUILTIN(__builtin_acosf128, "LLdLLd", "Fne") -BUILTIN(__builtin_acosh , "dd" , "Fne") -BUILTIN(__builtin_acoshf, "ff" , "Fne") -BUILTIN(__builtin_acoshl, "LdLd", "Fne") -BUILTIN(__builtin_acoshf128, "LLdLLd", "Fne") -BUILTIN(__builtin_asin , "dd" , "Fne") -BUILTIN(__builtin_asinf, "ff" , "Fne") -BUILTIN(__builtin_asinl, "LdLd", "Fne") -BUILTIN(__builtin_asinf128, "LLdLLd", "Fne") -BUILTIN(__builtin_asinh , "dd" , "Fne") -BUILTIN(__builtin_asinhf, "ff" , "Fne") -BUILTIN(__builtin_asinhl, "LdLd", "Fne") -BUILTIN(__builtin_asinhf128, "LLdLLd", "Fne") -BUILTIN(__builtin_atan , "dd" , "Fne") -BUILTIN(__builtin_atanf, "ff" , "Fne") -BUILTIN(__builtin_atanl, "LdLd", "Fne") -BUILTIN(__builtin_atanf128, "LLdLLd", "Fne") -BUILTIN(__builtin_atanh , "dd", "Fne") -BUILTIN(__builtin_atanhf, "ff", "Fne") -BUILTIN(__builtin_atanhl, "LdLd", "Fne") -BUILTIN(__builtin_atanhf128, "LLdLLd", "Fne") -BUILTIN(__builtin_cbrt , "dd", "Fnc") -BUILTIN(__builtin_cbrtf, "ff", "Fnc") -BUILTIN(__builtin_cbrtl, "LdLd", "Fnc") -BUILTIN(__builtin_cbrtf128, "LLdLLd", "Fnc") -BUILTIN(__builtin_ceil , "dd" , "Fnc") -BUILTIN(__builtin_ceilf, "ff" , "Fnc") -BUILTIN(__builtin_ceilf16, "hh" , "Fnc") -BUILTIN(__builtin_ceill, "LdLd", "Fnc") -BUILTIN(__builtin_ceilf128, "LLdLLd", "Fnc") -BUILTIN(__builtin_cos , "dd" , "Fne") -BUILTIN(__builtin_cosf, "ff" , "Fne") -BUILTIN(__builtin_cosf16, "hh" , "Fne") -BUILTIN(__builtin_cosh , "dd" , "Fne") -BUILTIN(__builtin_coshf, "ff" , "Fne") -BUILTIN(__builtin_coshl, "LdLd", "Fne") -BUILTIN(__builtin_coshf128, "LLdLLd", "Fne") -BUILTIN(__builtin_cosl, "LdLd", "Fne") -BUILTIN(__builtin_cosf128, "LLdLLd" , "Fne") -BUILTIN(__builtin_erf , "dd", "Fne") -BUILTIN(__builtin_erff, "ff", "Fne") -BUILTIN(__builtin_erfl, "LdLd", "Fne") -BUILTIN(__builtin_erff128, "LLdLLd", "Fne") -BUILTIN(__builtin_erfc , "dd", "Fne") -BUILTIN(__builtin_erfcf, "ff", "Fne") -BUILTIN(__builtin_erfcl, "LdLd", "Fne") -BUILTIN(__builtin_erfcf128, "LLdLLd", "Fne") -BUILTIN(__builtin_exp , "dd" , "Fne") -BUILTIN(__builtin_expf, "ff" , "Fne") -BUILTIN(__builtin_expf16, "hh" , "Fne") -BUILTIN(__builtin_expl, "LdLd", "Fne") -BUILTIN(__builtin_expf128, "LLdLLd", "Fne") -BUILTIN(__builtin_exp2 , "dd" , "Fne") -BUILTIN(__builtin_exp2f, "ff" , "Fne") -BUILTIN(__builtin_exp2f16, "hh" , "Fne") -BUILTIN(__builtin_exp2l, "LdLd", "Fne") -BUILTIN(__builtin_exp2f128, "LLdLLd" , "Fne") -BUILTIN(__builtin_exp10 , "dd" , "Fne") -BUILTIN(__builtin_exp10f, "ff" , "Fne") -BUILTIN(__builtin_exp10f16, "hh" , "Fne") -BUILTIN(__builtin_exp10l, "LdLd", "Fne") -BUILTIN(__builtin_exp10f128, "LLdLLd" , "Fne") -BUILTIN(__builtin_expm1 , "dd", "Fne") -BUILTIN(__builtin_expm1f, "ff", "Fne") -BUILTIN(__builtin_expm1l, "LdLd", "Fne") -BUILTIN(__builtin_expm1f128, "LLdLLd", "Fne") -BUILTIN(__builtin_fdim, "ddd", "Fne") -BUILTIN(__builtin_fdimf, "fff", "Fne") -BUILTIN(__builtin_fdiml, "LdLdLd", "Fne") -BUILTIN(__builtin_fdimf128, "LLdLLdLLd", "Fne") -BUILTIN(__builtin_floor , "dd" , "Fnc") -BUILTIN(__builtin_floorf, "ff" , "Fnc") -BUILTIN(__builtin_floorf16, "hh" , "Fnc") -BUILTIN(__builtin_floorl, "LdLd", "Fnc") -BUILTIN(__builtin_floorf128, "LLdLLd", "Fnc") -BUILTIN(__builtin_fma, "dddd", "Fne") -BUILTIN(__builtin_fmaf, "ffff", "Fne") -BUILTIN(__builtin_fmaf16, "hhhh", "Fne") -BUILTIN(__builtin_fmal, "LdLdLdLd", "Fne") -BUILTIN(__builtin_fmaf128, "LLdLLdLLdLLd", "Fne") -BUILTIN(__builtin_fmax, "ddd", 
"FncE") -BUILTIN(__builtin_fmaxf, "fff", "FncE") -BUILTIN(__builtin_fmaxf16, "hhh", "FncE") -BUILTIN(__builtin_fmaxl, "LdLdLd", "FncE") -BUILTIN(__builtin_fmaxf128, "LLdLLdLLd", "FncE") -BUILTIN(__builtin_fmin, "ddd", "FncE") -BUILTIN(__builtin_fminf, "fff", "FncE") -BUILTIN(__builtin_fminf16, "hhh", "FncE") -BUILTIN(__builtin_fminl, "LdLdLd", "FncE") -BUILTIN(__builtin_fminf128, "LLdLLdLLd", "FncE") -BUILTIN(__builtin_hypot , "ddd" , "Fne") -BUILTIN(__builtin_hypotf, "fff" , "Fne") -BUILTIN(__builtin_hypotl, "LdLdLd", "Fne") -BUILTIN(__builtin_hypotf128, "LLdLLdLLd", "Fne") -BUILTIN(__builtin_ilogb , "id", "Fne") -BUILTIN(__builtin_ilogbf, "if", "Fne") -BUILTIN(__builtin_ilogbl, "iLd", "Fne") -BUILTIN(__builtin_ilogbf128, "iLLd", "Fne") -BUILTIN(__builtin_lgamma , "dd", "Fn") -BUILTIN(__builtin_lgammaf, "ff", "Fn") -BUILTIN(__builtin_lgammal, "LdLd", "Fn") -BUILTIN(__builtin_lgammaf128, "LLdLLd", "Fn") -BUILTIN(__builtin_llrint, "LLid", "Fne") -BUILTIN(__builtin_llrintf, "LLif", "Fne") -BUILTIN(__builtin_llrintl, "LLiLd", "Fne") -BUILTIN(__builtin_llrintf128, "LLiLLd", "Fne") -BUILTIN(__builtin_llround , "LLid", "Fne") -BUILTIN(__builtin_llroundf, "LLif", "Fne") -BUILTIN(__builtin_llroundl, "LLiLd", "Fne") -BUILTIN(__builtin_llroundf128, "LLiLLd", "Fne") -BUILTIN(__builtin_log , "dd" , "Fne") -BUILTIN(__builtin_log10 , "dd" , "Fne") -BUILTIN(__builtin_log10f, "ff" , "Fne") -BUILTIN(__builtin_log10f16, "hh" , "Fne") -BUILTIN(__builtin_log10l, "LdLd", "Fne") -BUILTIN(__builtin_log10f128, "LLdLLd" , "Fne") -BUILTIN(__builtin_log1p , "dd" , "Fne") -BUILTIN(__builtin_log1pf, "ff" , "Fne") -BUILTIN(__builtin_log1pl, "LdLd", "Fne") -BUILTIN(__builtin_log1pf128, "LLdLLd", "Fne") -BUILTIN(__builtin_log2, "dd" , "Fne") -BUILTIN(__builtin_log2f, "ff" , "Fne") -BUILTIN(__builtin_log2f16, "hh" , "Fne") -BUILTIN(__builtin_log2l, "LdLd" , "Fne") -BUILTIN(__builtin_log2f128, "LLdLLd" , "Fne") -BUILTIN(__builtin_logb , "dd", "Fne") -BUILTIN(__builtin_logbf, "ff", "Fne") -BUILTIN(__builtin_logbl, "LdLd", "Fne") -BUILTIN(__builtin_logbf128, "LLdLLd", "Fne") -BUILTIN(__builtin_logf, "ff" , "Fne") -BUILTIN(__builtin_logf16, "hh" , "Fne") -BUILTIN(__builtin_logl, "LdLd", "Fne") -BUILTIN(__builtin_logf128, "LLdLLd", "Fne") -BUILTIN(__builtin_lrint , "Lid", "Fne") -BUILTIN(__builtin_lrintf, "Lif", "Fne") -BUILTIN(__builtin_lrintl, "LiLd", "Fne") -BUILTIN(__builtin_lrintf128, "LiLLd", "Fne") -BUILTIN(__builtin_lround , "Lid", "Fne") -BUILTIN(__builtin_lroundf, "Lif", "Fne") -BUILTIN(__builtin_lroundl, "LiLd", "Fne") -BUILTIN(__builtin_lroundf128, "LiLLd", "Fne") -BUILTIN(__builtin_nearbyint , "dd", "Fnc") -BUILTIN(__builtin_nearbyintf, "ff", "Fnc") -BUILTIN(__builtin_nearbyintl, "LdLd", "Fnc") -BUILTIN(__builtin_nearbyintf128, "LLdLLd", "Fnc") -BUILTIN(__builtin_nextafter , "ddd", "Fne") -BUILTIN(__builtin_nextafterf, "fff", "Fne") -BUILTIN(__builtin_nextafterl, "LdLdLd", "Fne") -BUILTIN(__builtin_nextafterf128, "LLdLLdLLd", "Fne") -BUILTIN(__builtin_nexttoward , "ddLd", "Fne") -BUILTIN(__builtin_nexttowardf, "ffLd", "Fne") -BUILTIN(__builtin_nexttowardl, "LdLdLd", "Fne") -BUILTIN(__builtin_nexttowardf128, "LLdLLdLLd", "Fne") -BUILTIN(__builtin_remainder , "ddd", "Fne") -BUILTIN(__builtin_remainderf, "fff", "Fne") -BUILTIN(__builtin_remainderl, "LdLdLd", "Fne") -BUILTIN(__builtin_remainderf128, "LLdLLdLLd", "Fne") -BUILTIN(__builtin_remquo , "dddi*", "Fn") -BUILTIN(__builtin_remquof, "fffi*", "Fn") -BUILTIN(__builtin_remquol, "LdLdLdi*", "Fn") -BUILTIN(__builtin_remquof128, "LLdLLdLLdi*", "Fn") 
-BUILTIN(__builtin_rint , "dd", "Fnc")
-BUILTIN(__builtin_rintf, "ff", "Fnc")
-BUILTIN(__builtin_rintf16, "hh", "Fnc")
-BUILTIN(__builtin_rintl, "LdLd", "Fnc")
-BUILTIN(__builtin_rintf128, "LLdLLd", "Fnc")
-BUILTIN(__builtin_round, "dd" , "Fnc")
-BUILTIN(__builtin_roundf, "ff" , "Fnc")
-BUILTIN(__builtin_roundf16, "hh" , "Fnc")
-BUILTIN(__builtin_roundl, "LdLd" , "Fnc")
-BUILTIN(__builtin_roundf128, "LLdLLd" , "Fnc")
-BUILTIN(__builtin_roundeven, "dd" , "Fnc")
-BUILTIN(__builtin_roundevenf, "ff" , "Fnc")
-BUILTIN(__builtin_roundevenf16, "hh" , "Fnc")
-BUILTIN(__builtin_roundevenl, "LdLd" , "Fnc")
-BUILTIN(__builtin_roundevenf128, "LLdLLd" , "Fnc")
-BUILTIN(__builtin_scalbln , "ddLi", "Fne")
-BUILTIN(__builtin_scalblnf, "ffLi", "Fne")
-BUILTIN(__builtin_scalblnl, "LdLdLi", "Fne")
-BUILTIN(__builtin_scalblnf128, "LLdLLdLi", "Fne")
-BUILTIN(__builtin_scalbn , "ddi", "Fne")
-BUILTIN(__builtin_scalbnf, "ffi", "Fne")
-BUILTIN(__builtin_scalbnl, "LdLdi", "Fne")
-BUILTIN(__builtin_scalbnf128, "LLdLLdi", "Fne")
-BUILTIN(__builtin_sin , "dd" , "Fne")
-BUILTIN(__builtin_sinf, "ff" , "Fne")
-BUILTIN(__builtin_sinf16, "hh" , "Fne")
-BUILTIN(__builtin_sinh , "dd" , "Fne")
-BUILTIN(__builtin_sinhf, "ff" , "Fne")
-BUILTIN(__builtin_sinhl, "LdLd", "Fne")
-BUILTIN(__builtin_sinhf128, "LLdLLd", "Fne")
-BUILTIN(__builtin_sinl, "LdLd", "Fne")
-BUILTIN(__builtin_sinf128, "LLdLLd" , "Fne")
-BUILTIN(__builtin_sqrt , "dd" , "Fne")
-BUILTIN(__builtin_sqrtf, "ff" , "Fne")
-BUILTIN(__builtin_sqrtf16, "hh" , "Fne")
-BUILTIN(__builtin_sqrtl, "LdLd", "Fne")
-BUILTIN(__builtin_sqrtf128, "LLdLLd", "Fne")
-BUILTIN(__builtin_tan , "dd" , "Fne")
-BUILTIN(__builtin_tanf, "ff" , "Fne")
-BUILTIN(__builtin_tanh , "dd" , "Fne")
-BUILTIN(__builtin_tanhf, "ff" , "Fne")
-BUILTIN(__builtin_tanhl, "LdLd", "Fne")
-BUILTIN(__builtin_tanhf128, "LLdLLd", "Fne")
-BUILTIN(__builtin_tanl, "LdLd", "Fne")
-BUILTIN(__builtin_tanf128, "LLdLLd" , "Fne")
-BUILTIN(__builtin_tgamma , "dd", "Fne")
-BUILTIN(__builtin_tgammaf, "ff", "Fne")
-BUILTIN(__builtin_tgammal, "LdLd", "Fne")
-BUILTIN(__builtin_tgammaf128, "LLdLLd", "Fne")
-BUILTIN(__builtin_trunc , "dd", "Fnc")
-BUILTIN(__builtin_truncf, "ff", "Fnc")
-BUILTIN(__builtin_truncl, "LdLd", "Fnc")
-BUILTIN(__builtin_truncf128, "LLdLLd", "Fnc")
-BUILTIN(__builtin_truncf16, "hh", "Fnc")
-
-// Access to floating point environment
-BUILTIN(__builtin_flt_rounds, "i", "n")
-BUILTIN(__builtin_set_flt_rounds, "vi", "n")
-
-// C99 complex builtins
-BUILTIN(__builtin_cabs, "dXd", "Fne")
-BUILTIN(__builtin_cabsf, "fXf", "Fne")
-BUILTIN(__builtin_cabsl, "LdXLd", "Fne")
-BUILTIN(__builtin_cacos, "XdXd", "Fne")
-BUILTIN(__builtin_cacosf, "XfXf", "Fne")
-BUILTIN(__builtin_cacosh, "XdXd", "Fne")
-BUILTIN(__builtin_cacoshf, "XfXf", "Fne")
-BUILTIN(__builtin_cacoshl, "XLdXLd", "Fne")
-BUILTIN(__builtin_cacosl, "XLdXLd", "Fne")
-BUILTIN(__builtin_carg, "dXd", "Fne")
-BUILTIN(__builtin_cargf, "fXf", "Fne")
-BUILTIN(__builtin_cargl, "LdXLd", "Fne")
-BUILTIN(__builtin_casin, "XdXd", "Fne")
-BUILTIN(__builtin_casinf, "XfXf", "Fne")
-BUILTIN(__builtin_casinh, "XdXd", "Fne")
-BUILTIN(__builtin_casinhf, "XfXf", "Fne")
-BUILTIN(__builtin_casinhl, "XLdXLd", "Fne")
-BUILTIN(__builtin_casinl, "XLdXLd", "Fne")
-BUILTIN(__builtin_catan, "XdXd", "Fne")
-BUILTIN(__builtin_catanf, "XfXf", "Fne")
-BUILTIN(__builtin_catanh, "XdXd", "Fne")
-BUILTIN(__builtin_catanhf, "XfXf", "Fne")
-BUILTIN(__builtin_catanhl, "XLdXLd", "Fne")
-BUILTIN(__builtin_catanl, "XLdXLd", "Fne")
-BUILTIN(__builtin_ccos, "XdXd", "Fne")
-BUILTIN(__builtin_ccosf, "XfXf", "Fne")
-BUILTIN(__builtin_ccosl, "XLdXLd", "Fne")
-BUILTIN(__builtin_ccosh, "XdXd", "Fne")
-BUILTIN(__builtin_ccoshf, "XfXf", "Fne")
-BUILTIN(__builtin_ccoshl, "XLdXLd", "Fne")
-BUILTIN(__builtin_cexp, "XdXd", "Fne")
-BUILTIN(__builtin_cexpf, "XfXf", "Fne")
-BUILTIN(__builtin_cexpl, "XLdXLd", "Fne")
-BUILTIN(__builtin_cimag, "dXd", "Fnc")
-BUILTIN(__builtin_cimagf, "fXf", "Fnc")
-BUILTIN(__builtin_cimagl, "LdXLd", "Fnc")
-BUILTIN(__builtin_conj, "XdXd", "Fnc")
-BUILTIN(__builtin_conjf, "XfXf", "Fnc")
-BUILTIN(__builtin_conjl, "XLdXLd", "Fnc")
-BUILTIN(__builtin_clog, "XdXd", "Fne")
-BUILTIN(__builtin_clogf, "XfXf", "Fne")
-BUILTIN(__builtin_clogl, "XLdXLd", "Fne")
-BUILTIN(__builtin_cproj, "XdXd", "Fnc")
-BUILTIN(__builtin_cprojf, "XfXf", "Fnc")
-BUILTIN(__builtin_cprojl, "XLdXLd", "Fnc")
-BUILTIN(__builtin_cpow, "XdXdXd", "Fne")
-BUILTIN(__builtin_cpowf, "XfXfXf", "Fne")
-BUILTIN(__builtin_cpowl, "XLdXLdXLd", "Fne")
-BUILTIN(__builtin_creal, "dXd", "Fnc")
-BUILTIN(__builtin_crealf, "fXf", "Fnc")
-BUILTIN(__builtin_creall, "LdXLd", "Fnc")
-BUILTIN(__builtin_csin, "XdXd", "Fne")
-BUILTIN(__builtin_csinf, "XfXf", "Fne")
-BUILTIN(__builtin_csinl, "XLdXLd", "Fne")
-BUILTIN(__builtin_csinh, "XdXd", "Fne")
-BUILTIN(__builtin_csinhf, "XfXf", "Fne")
-BUILTIN(__builtin_csinhl, "XLdXLd", "Fne")
-BUILTIN(__builtin_csqrt, "XdXd", "Fne")
-BUILTIN(__builtin_csqrtf, "XfXf", "Fne")
-BUILTIN(__builtin_csqrtl, "XLdXLd", "Fne")
-BUILTIN(__builtin_ctan, "XdXd", "Fne")
-BUILTIN(__builtin_ctanf, "XfXf", "Fne")
-BUILTIN(__builtin_ctanl, "XLdXLd", "Fne")
-BUILTIN(__builtin_ctanh, "XdXd", "Fne")
-BUILTIN(__builtin_ctanhf, "XfXf", "Fne")
-BUILTIN(__builtin_ctanhl, "XLdXLd", "Fne")
-
-// GCC-compatible C99 CMPLX implementation.
-BUILTIN(__builtin_complex, "v.", "nctE")
-
-// FP Comparisons.
-BUILTIN(__builtin_isgreater , "i.", "Fnct")
-BUILTIN(__builtin_isgreaterequal, "i.", "Fnct")
-BUILTIN(__builtin_isless , "i.", "Fnct")
-BUILTIN(__builtin_islessequal , "i.", "Fnct")
-BUILTIN(__builtin_islessgreater , "i.", "Fnct")
-BUILTIN(__builtin_isunordered , "i.", "Fnct")
-
-// Unary FP classification
-BUILTIN(__builtin_fpclassify, "iiiiii.", "FnctE")
-BUILTIN(__builtin_isfinite, "i.", "FnctE")
-BUILTIN(__builtin_isinf, "i.", "FnctE")
-BUILTIN(__builtin_isinf_sign, "i.", "FnctE")
-BUILTIN(__builtin_isnan, "i.", "FnctE")
-BUILTIN(__builtin_isnormal, "i.", "FnctE")
-BUILTIN(__builtin_issubnormal,"i.", "FnctE")
-BUILTIN(__builtin_iszero, "i.", "FnctE")
-BUILTIN(__builtin_issignaling,"i.", "FnctE")
-BUILTIN(__builtin_isfpclass, "i.", "nctE")
-
-// FP signbit builtins
-BUILTIN(__builtin_signbit, "i.", "Fnct")
-BUILTIN(__builtin_signbitf, "if", "Fnc")
-BUILTIN(__builtin_signbitl, "iLd", "Fnc")
-
-// Special FP builtins.
-BUILTIN(__builtin_canonicalize, "dd", "nc")
-BUILTIN(__builtin_canonicalizef, "ff", "nc")
-BUILTIN(__builtin_canonicalizef16, "hh", "nc")
-BUILTIN(__builtin_canonicalizel, "LdLd", "nc")
-
-// Builtins for arithmetic.
-BUILTIN(__builtin_clzs , "iUs" , "ncE")
-BUILTIN(__builtin_clz , "iUi" , "ncE")
-BUILTIN(__builtin_clzl , "iULi" , "ncE")
-BUILTIN(__builtin_clzll, "iULLi", "ncE")
-// TODO: int clzimax(uintmax_t)
-BUILTIN(__builtin_ctzs , "iUs" , "ncE")
-BUILTIN(__builtin_ctz , "iUi" , "ncE")
-BUILTIN(__builtin_ctzl , "iULi" , "ncE")
-BUILTIN(__builtin_ctzll, "iULLi", "ncE")
-// TODO: int ctzimax(uintmax_t)
-BUILTIN(__builtin_ffs , "ii" , "FncE")
-BUILTIN(__builtin_ffsl , "iLi" , "FncE")
-BUILTIN(__builtin_ffsll, "iLLi", "FncE")
-BUILTIN(__builtin_parity , "iUi" , "ncE")
-BUILTIN(__builtin_parityl , "iULi" , "ncE")
-BUILTIN(__builtin_parityll, "iULLi", "ncE")
-BUILTIN(__builtin_popcount , "iUi" , "ncE")
-BUILTIN(__builtin_popcountl , "iULi" , "ncE")
-BUILTIN(__builtin_popcountll, "iULLi", "ncE")
-BUILTIN(__builtin_clrsb , "ii" , "ncE")
-BUILTIN(__builtin_clrsbl , "iLi" , "ncE")
-BUILTIN(__builtin_clrsbll, "iLLi", "ncE")
-
-// The following builtins rely on that char == 8 bits, short == 16 bits and that
-// there exists native types on the target that are 32- and 64-bits wide, unless
-// these conditions are fulfilled these builtins will operate on a not intended
-// bitwidth.
-BUILTIN(__builtin_bswap16, "UsUs", "ncE")
-BUILTIN(__builtin_bswap32, "UZiUZi", "ncE")
-BUILTIN(__builtin_bswap64, "UWiUWi", "ncE")
-
-BUILTIN(__builtin_bitreverse8, "UcUc", "ncE")
-BUILTIN(__builtin_bitreverse16, "UsUs", "ncE")
-BUILTIN(__builtin_bitreverse32, "UZiUZi", "ncE")
-BUILTIN(__builtin_bitreverse64, "UWiUWi", "ncE")
-
-BUILTIN(__builtin_rotateleft8, "UcUcUc", "ncE")
-BUILTIN(__builtin_rotateleft16, "UsUsUs", "ncE")
-BUILTIN(__builtin_rotateleft32, "UZiUZiUZi", "ncE")
-BUILTIN(__builtin_rotateleft64, "UWiUWiUWi", "ncE")
-BUILTIN(__builtin_rotateright8, "UcUcUc", "ncE")
-BUILTIN(__builtin_rotateright16, "UsUsUs", "ncE")
-BUILTIN(__builtin_rotateright32, "UZiUZiUZi", "ncE")
-BUILTIN(__builtin_rotateright64, "UWiUWiUWi", "ncE")
-
-// Random GCC builtins
-BUILTIN(__builtin_calloc, "v*zz", "nF")
-BUILTIN(__builtin_constant_p, "i.", "nctuE")
-BUILTIN(__builtin_classify_type, "i.", "nctuE")
-BUILTIN(__builtin___CFStringMakeConstantString, "FC*cC*", "ncE")
-BUILTIN(__builtin___NSStringMakeConstantString, "FC*cC*", "ncE")
-BUILTIN(__builtin_va_start, "vA.", "nt")
-BUILTIN(__builtin_va_end, "vA", "n")
-BUILTIN(__builtin_va_copy, "vAA", "n")
-BUILTIN(__builtin_stdarg_start, "vA.", "nt")
-BUILTIN(__builtin_assume_aligned, "v*vC*z.", "nctE")
-BUILTIN(__builtin_bcmp, "ivC*vC*z", "FnE")
-BUILTIN(__builtin_bcopy, "vvC*v*z", "nF")
-BUILTIN(__builtin_bzero, "vv*z", "nF")
-BUILTIN(__builtin_free, "vv*", "nF")
-BUILTIN(__builtin_malloc, "v*z", "nF")
-BUILTIN(__builtin_memchr, "v*vC*iz", "nFE")
-BUILTIN(__builtin_memcmp, "ivC*vC*z", "nFE")
-BUILTIN(__builtin_memcpy, "v*v*vC*z", "nFE")
-BUILTIN(__builtin_memcpy_inline, "vv*vC*Iz", "n")
-BUILTIN(__builtin_memmove, "v*v*vC*z", "nFE")
-BUILTIN(__builtin_mempcpy, "v*v*vC*z", "nF")
-BUILTIN(__builtin_memset, "v*v*iz", "nF")
-BUILTIN(__builtin_memset_inline, "vv*iIz", "n")
-BUILTIN(__builtin_stpcpy, "c*c*cC*", "nF")
-BUILTIN(__builtin_stpncpy, "c*c*cC*z", "nF")
-BUILTIN(__builtin_strcasecmp, "icC*cC*", "nF")
-BUILTIN(__builtin_strcat, "c*c*cC*", "nF")
-BUILTIN(__builtin_strchr, "c*cC*i", "nFE")
-BUILTIN(__builtin_strcmp, "icC*cC*", "nFE")
-BUILTIN(__builtin_strcpy, "c*c*cC*", "nF")
-BUILTIN(__builtin_strcspn, "zcC*cC*", "nF")
-BUILTIN(__builtin_strdup, "c*cC*", "nF")
-BUILTIN(__builtin_strlen, "zcC*", "nFE")
-BUILTIN(__builtin_strncasecmp, "icC*cC*z", "nF")
-BUILTIN(__builtin_strncat, "c*c*cC*z", "nF")
-BUILTIN(__builtin_strncmp, "icC*cC*z", "nFE")
-BUILTIN(__builtin_strncpy, "c*c*cC*z", "nF")
-BUILTIN(__builtin_strndup, "c*cC*z", "nF")
-BUILTIN(__builtin_strpbrk, "c*cC*cC*", "nF")
-BUILTIN(__builtin_strrchr, "c*cC*i", "nF")
-BUILTIN(__builtin_strspn, "zcC*cC*", "nF")
-BUILTIN(__builtin_strstr, "c*cC*cC*", "nF")
-BUILTIN(__builtin_wcschr, "w*wC*w", "nFE")
-BUILTIN(__builtin_wcscmp, "iwC*wC*", "nFE")
-BUILTIN(__builtin_wcslen, "zwC*", "nFE")
-BUILTIN(__builtin_wcsncmp, "iwC*wC*z", "nFE")
-BUILTIN(__builtin_wmemchr, "w*wC*wz", "nFE")
-BUILTIN(__builtin_wmemcmp, "iwC*wC*z", "nFE")
-BUILTIN(__builtin_wmemcpy, "w*w*wC*z", "nFE")
-BUILTIN(__builtin_wmemmove, "w*w*wC*z", "nFE")
-BUILTIN(__builtin_realloc, "v*v*z", "nF")
-BUILTIN(__builtin_return_address, "v*IUi", "n")
-BUILTIN(__builtin_extract_return_addr, "v*v*", "n")
-BUILTIN(__builtin_frame_address, "v*IUi", "n")
-BUILTIN(__builtin___clear_cache, "vc*c*", "n")
-BUILTIN(__builtin_setjmp, "iv**", "j")
-BUILTIN(__builtin_longjmp, "vv**i", "r")
-BUILTIN(__builtin_unwind_init, "v", "")
-BUILTIN(__builtin_eh_return_data_regno, "iIi", "ncE")
-BUILTIN(__builtin_fprintf, "iP*RcC*R.", "nFp:1:")
-BUILTIN(__builtin_printf, "icC*R.", "nFp:0:")
-BUILTIN(__builtin_sprintf, "ic*RcC*R.", "nFp:1:")
-BUILTIN(__builtin_snprintf, "ic*RzcC*R.", "nFp:2:")
-BUILTIN(__builtin_vprintf, "icC*Ra", "nFP:0:")
-BUILTIN(__builtin_vfprintf, "iP*RcC*Ra", "nFP:1:")
-BUILTIN(__builtin_vsprintf, "ic*RcC*Ra", "nFP:1:")
-BUILTIN(__builtin_vsnprintf, "ic*RzcC*Ra", "nFP:2:")
-BUILTIN(__builtin_fscanf, "iP*RcC*R.", "Fs:1:")
-BUILTIN(__builtin_scanf, "icC*R.", "Fs:0:")
-BUILTIN(__builtin_sscanf, "icC*RcC*R.", "Fs:1:")
-BUILTIN(__builtin_vfscanf, "iP*RcC*Ra", "FS:1:")
-BUILTIN(__builtin_vscanf, "icC*Ra", "FS:0:")
-BUILTIN(__builtin_vsscanf, "icC*RcC*Ra", "FS:1:")
-BUILTIN(__builtin_thread_pointer, "v*", "nc")
-BUILTIN(__builtin_launder, "v*v*", "ntE")
-LANGBUILTIN(__builtin_is_constant_evaluated, "b", "nE", CXX_LANG)
-
-// GCC exception builtins
-BUILTIN(__builtin_eh_return, "vzv*", "r") // FIXME: Takes intptr_t, not size_t!
-BUILTIN(__builtin_frob_return_addr, "v*v*", "n")
-BUILTIN(__builtin_dwarf_cfa, "v*", "n")
-BUILTIN(__builtin_init_dwarf_reg_size_table, "vv*", "n")
-BUILTIN(__builtin_dwarf_sp_column, "Ui", "n")
-BUILTIN(__builtin_extend_pointer, "ULLiv*", "n") // _Unwind_Word == uint64_t
-
-// GCC Object size checking builtins
-BUILTIN(__builtin_object_size, "zvC*i", "nuE")
-BUILTIN(__builtin_dynamic_object_size, "zvC*i", "nuE") // Clang only.
-BUILTIN(__builtin___memcpy_chk, "v*v*vC*zz", "nF")
-BUILTIN(__builtin___memccpy_chk, "v*v*vC*izz", "nF")
-BUILTIN(__builtin___memmove_chk, "v*v*vC*zz", "nF")
-BUILTIN(__builtin___mempcpy_chk, "v*v*vC*zz", "nF")
-BUILTIN(__builtin___memset_chk, "v*v*izz", "nF")
-BUILTIN(__builtin___stpcpy_chk, "c*c*cC*z", "nF")
-BUILTIN(__builtin___strcat_chk, "c*c*cC*z", "nF")
-BUILTIN(__builtin___strcpy_chk, "c*c*cC*z", "nF")
-BUILTIN(__builtin___strlcat_chk, "zc*cC*zz", "nF")
-BUILTIN(__builtin___strlcpy_chk, "zc*cC*zz", "nF")
-BUILTIN(__builtin___strncat_chk, "c*c*cC*zz", "nF")
-BUILTIN(__builtin___strncpy_chk, "c*c*cC*zz", "nF")
-BUILTIN(__builtin___stpncpy_chk, "c*c*cC*zz", "nF")
-BUILTIN(__builtin___snprintf_chk, "ic*RzizcC*R.", "Fp:4:")
-BUILTIN(__builtin___sprintf_chk, "ic*RizcC*R.", "Fp:3:")
-BUILTIN(__builtin___vsnprintf_chk, "ic*RzizcC*Ra", "FP:4:")
-BUILTIN(__builtin___vsprintf_chk, "ic*RizcC*Ra", "FP:3:")
-BUILTIN(__builtin___fprintf_chk, "iP*RicC*R.", "Fp:2:")
-BUILTIN(__builtin___printf_chk, "iicC*R.", "Fp:1:")
-BUILTIN(__builtin___vfprintf_chk, "iP*RicC*Ra", "FP:2:")
-BUILTIN(__builtin___vprintf_chk, "iicC*Ra", "FP:1:")
-
-BUILTIN(__builtin_unpredictable, "LiLi" , "nc")
-BUILTIN(__builtin_expect, "LiLiLi" , "ncE")
-BUILTIN(__builtin_expect_with_probability, "LiLiLid", "ncE")
-BUILTIN(__builtin_prefetch, "vvC*.", "nc")
-BUILTIN(__builtin_readcyclecounter, "ULLi", "n")
-BUILTIN(__builtin_trap, "v", "nr")
-BUILTIN(__builtin_debugtrap, "v", "n")
-BUILTIN(__builtin_unreachable, "v", "nr")
-BUILTIN(__builtin_shufflevector, "v." , "nct")
-BUILTIN(__builtin_convertvector, "v." , "nct")
-BUILTIN(__builtin_vectorelements, "v." , "nct")
-BUILTIN(__builtin_alloca, "v*z" , "Fn")
-BUILTIN(__builtin_alloca_uninitialized, "v*z", "Fn")
-BUILTIN(__builtin_alloca_with_align, "v*zIz", "Fn")
-BUILTIN(__builtin_alloca_with_align_uninitialized, "v*zIz", "Fn")
-BUILTIN(__builtin_call_with_static_chain, "v.", "nt")
-BUILTIN(__builtin_nondeterministic_value, "v.", "nt")
-
-BUILTIN(__builtin_elementwise_abs, "v.", "nct")
-BUILTIN(__builtin_elementwise_bitreverse, "v.", "nct")
-BUILTIN(__builtin_elementwise_max, "v.", "nct")
-BUILTIN(__builtin_elementwise_min, "v.", "nct")
-BUILTIN(__builtin_elementwise_ceil, "v.", "nct")
-BUILTIN(__builtin_elementwise_cos, "v.", "nct")
-BUILTIN(__builtin_elementwise_exp, "v.", "nct")
-BUILTIN(__builtin_elementwise_exp2, "v.", "nct")
-BUILTIN(__builtin_elementwise_floor, "v.", "nct")
-BUILTIN(__builtin_elementwise_log, "v.", "nct")
-BUILTIN(__builtin_elementwise_log2, "v.", "nct")
-BUILTIN(__builtin_elementwise_log10, "v.", "nct")
-BUILTIN(__builtin_elementwise_pow, "v.", "nct")
-BUILTIN(__builtin_elementwise_roundeven, "v.", "nct")
-BUILTIN(__builtin_elementwise_round, "v.", "nct")
-BUILTIN(__builtin_elementwise_rint, "v.", "nct")
-BUILTIN(__builtin_elementwise_nearbyint, "v.", "nct")
-BUILTIN(__builtin_elementwise_sin, "v.", "nct")
-BUILTIN(__builtin_elementwise_sqrt, "v.", "nct")
-BUILTIN(__builtin_elementwise_trunc, "v.", "nct")
-BUILTIN(__builtin_elementwise_canonicalize, "v.", "nct")
-BUILTIN(__builtin_elementwise_copysign, "v.", "nct")
-BUILTIN(__builtin_elementwise_fma, "v.", "nct")
-BUILTIN(__builtin_elementwise_add_sat, "v.", "nct")
-BUILTIN(__builtin_elementwise_sub_sat, "v.", "nct")
-BUILTIN(__builtin_reduce_max, "v.", "nct")
-BUILTIN(__builtin_reduce_min, "v.", "nct")
-BUILTIN(__builtin_reduce_xor, "v.", "nct")
-BUILTIN(__builtin_reduce_or, "v.", "nct")
-BUILTIN(__builtin_reduce_and, "v.", "nct")
-BUILTIN(__builtin_reduce_add, "v.", "nct")
-BUILTIN(__builtin_reduce_mul, "v.", "nct")
-
-BUILTIN(__builtin_matrix_transpose, "v.", "nFt")
-BUILTIN(__builtin_matrix_column_major_load, "v.", "nFt")
-BUILTIN(__builtin_matrix_column_major_store, "v.", "nFt")
-
-// "Overloaded" Atomic operator builtins. These are overloaded to support data
-// types of i8, i16, i32, i64, and i128. The front-end sees calls to the
-// non-suffixed version of these (which has a bogus type) and transforms them to
-// the right overloaded version in Sema (plus casts).
-
-// FIXME: These assume that char -> i8, short -> i16, int -> i32,
-// long long -> i64.
-
-BUILTIN(__sync_fetch_and_add, "v.", "t")
-BUILTIN(__sync_fetch_and_add_1, "ccD*c.", "nt")
-BUILTIN(__sync_fetch_and_add_2, "ssD*s.", "nt")
-BUILTIN(__sync_fetch_and_add_4, "iiD*i.", "nt")
-BUILTIN(__sync_fetch_and_add_8, "LLiLLiD*LLi.", "nt")
-BUILTIN(__sync_fetch_and_add_16, "LLLiLLLiD*LLLi.", "nt")
-
-BUILTIN(__sync_fetch_and_sub, "v.", "t")
-BUILTIN(__sync_fetch_and_sub_1, "ccD*c.", "nt")
-BUILTIN(__sync_fetch_and_sub_2, "ssD*s.", "nt")
-BUILTIN(__sync_fetch_and_sub_4, "iiD*i.", "nt")
-BUILTIN(__sync_fetch_and_sub_8, "LLiLLiD*LLi.", "nt")
-BUILTIN(__sync_fetch_and_sub_16, "LLLiLLLiD*LLLi.", "nt")
-
-BUILTIN(__sync_fetch_and_or, "v.", "t")
-BUILTIN(__sync_fetch_and_or_1, "ccD*c.", "nt")
-BUILTIN(__sync_fetch_and_or_2, "ssD*s.", "nt")
-BUILTIN(__sync_fetch_and_or_4, "iiD*i.", "nt")
-BUILTIN(__sync_fetch_and_or_8, "LLiLLiD*LLi.", "nt")
-BUILTIN(__sync_fetch_and_or_16, "LLLiLLLiD*LLLi.", "nt")
-
-BUILTIN(__sync_fetch_and_and, "v.", "t")
-BUILTIN(__sync_fetch_and_and_1, "ccD*c.", "tn")
-BUILTIN(__sync_fetch_and_and_2, "ssD*s.", "tn")
-BUILTIN(__sync_fetch_and_and_4, "iiD*i.", "tn")
-BUILTIN(__sync_fetch_and_and_8, "LLiLLiD*LLi.", "tn")
-BUILTIN(__sync_fetch_and_and_16, "LLLiLLLiD*LLLi.", "tn")
-
-BUILTIN(__sync_fetch_and_xor, "v.", "t")
-BUILTIN(__sync_fetch_and_xor_1, "ccD*c.", "tn")
-BUILTIN(__sync_fetch_and_xor_2, "ssD*s.", "tn")
-BUILTIN(__sync_fetch_and_xor_4, "iiD*i.", "tn")
-BUILTIN(__sync_fetch_and_xor_8, "LLiLLiD*LLi.", "tn")
-BUILTIN(__sync_fetch_and_xor_16, "LLLiLLLiD*LLLi.", "tn")
-
-BUILTIN(__sync_fetch_and_nand, "v.", "t")
-BUILTIN(__sync_fetch_and_nand_1, "ccD*c.", "tn")
-BUILTIN(__sync_fetch_and_nand_2, "ssD*s.", "tn")
-BUILTIN(__sync_fetch_and_nand_4, "iiD*i.", "tn")
-BUILTIN(__sync_fetch_and_nand_8, "LLiLLiD*LLi.", "tn")
-BUILTIN(__sync_fetch_and_nand_16, "LLLiLLLiD*LLLi.", "tn")
-
-BUILTIN(__sync_add_and_fetch, "v.", "t")
-BUILTIN(__sync_add_and_fetch_1, "ccD*c.", "tn")
-BUILTIN(__sync_add_and_fetch_2, "ssD*s.", "tn")
-BUILTIN(__sync_add_and_fetch_4, "iiD*i.", "tn")
-BUILTIN(__sync_add_and_fetch_8, "LLiLLiD*LLi.", "tn")
-BUILTIN(__sync_add_and_fetch_16, "LLLiLLLiD*LLLi.", "tn")
-
-BUILTIN(__sync_sub_and_fetch, "v.", "t")
-BUILTIN(__sync_sub_and_fetch_1, "ccD*c.", "tn")
-BUILTIN(__sync_sub_and_fetch_2, "ssD*s.", "tn")
-BUILTIN(__sync_sub_and_fetch_4, "iiD*i.", "tn")
-BUILTIN(__sync_sub_and_fetch_8, "LLiLLiD*LLi.", "tn")
-BUILTIN(__sync_sub_and_fetch_16, "LLLiLLLiD*LLLi.", "tn")
-
-BUILTIN(__sync_or_and_fetch, "v.", "t")
-BUILTIN(__sync_or_and_fetch_1, "ccD*c.", "tn")
-BUILTIN(__sync_or_and_fetch_2, "ssD*s.", "tn")
-BUILTIN(__sync_or_and_fetch_4, "iiD*i.", "tn")
-BUILTIN(__sync_or_and_fetch_8, "LLiLLiD*LLi.", "tn")
-BUILTIN(__sync_or_and_fetch_16, "LLLiLLLiD*LLLi.", "tn")
-
-BUILTIN(__sync_and_and_fetch, "v.", "t")
-BUILTIN(__sync_and_and_fetch_1, "ccD*c.", "tn")
-BUILTIN(__sync_and_and_fetch_2, "ssD*s.", "tn")
-BUILTIN(__sync_and_and_fetch_4, "iiD*i.", "tn")
-BUILTIN(__sync_and_and_fetch_8, "LLiLLiD*LLi.", "tn")
-BUILTIN(__sync_and_and_fetch_16, "LLLiLLLiD*LLLi.", "tn")
-
-BUILTIN(__sync_xor_and_fetch, "v.", "t")
-BUILTIN(__sync_xor_and_fetch_1, "ccD*c.", "tn")
-BUILTIN(__sync_xor_and_fetch_2, "ssD*s.", "tn")
-BUILTIN(__sync_xor_and_fetch_4, "iiD*i.", "tn")
-BUILTIN(__sync_xor_and_fetch_8, "LLiLLiD*LLi.", "tn")
-BUILTIN(__sync_xor_and_fetch_16, "LLLiLLLiD*LLLi.", "tn")
-
-BUILTIN(__sync_nand_and_fetch, "v.", "t")
-BUILTIN(__sync_nand_and_fetch_1, "ccD*c.", "tn")
-BUILTIN(__sync_nand_and_fetch_2, "ssD*s.", "tn")
-BUILTIN(__sync_nand_and_fetch_4, "iiD*i.", "tn")
-BUILTIN(__sync_nand_and_fetch_8, "LLiLLiD*LLi.", "tn")
-BUILTIN(__sync_nand_and_fetch_16, "LLLiLLLiD*LLLi.", "tn")
-
-BUILTIN(__sync_bool_compare_and_swap, "v.", "t")
-BUILTIN(__sync_bool_compare_and_swap_1, "bcD*cc.", "tn")
-BUILTIN(__sync_bool_compare_and_swap_2, "bsD*ss.", "tn")
-BUILTIN(__sync_bool_compare_and_swap_4, "biD*ii.", "tn")
-BUILTIN(__sync_bool_compare_and_swap_8, "bLLiD*LLiLLi.", "tn")
-BUILTIN(__sync_bool_compare_and_swap_16, "bLLLiD*LLLiLLLi.", "tn")
-
-BUILTIN(__sync_val_compare_and_swap, "v.", "t")
-BUILTIN(__sync_val_compare_and_swap_1, "ccD*cc.", "tn")
-BUILTIN(__sync_val_compare_and_swap_2, "ssD*ss.", "tn")
-BUILTIN(__sync_val_compare_and_swap_4, "iiD*ii.", "tn")
-BUILTIN(__sync_val_compare_and_swap_8, "LLiLLiD*LLiLLi.", "tn")
-BUILTIN(__sync_val_compare_and_swap_16, "LLLiLLLiD*LLLiLLLi.", "tn")
-
-BUILTIN(__sync_lock_test_and_set, "v.", "t")
-BUILTIN(__sync_lock_test_and_set_1, "ccD*c.", "tn")
-BUILTIN(__sync_lock_test_and_set_2, "ssD*s.", "tn")
-BUILTIN(__sync_lock_test_and_set_4, "iiD*i.", "tn")
-BUILTIN(__sync_lock_test_and_set_8, "LLiLLiD*LLi.", "tn")
-BUILTIN(__sync_lock_test_and_set_16, "LLLiLLLiD*LLLi.", "tn")
-
-BUILTIN(__sync_lock_release, "v.", "t")
-BUILTIN(__sync_lock_release_1, "vcD*.", "tn")
-BUILTIN(__sync_lock_release_2, "vsD*.", "tn")
-BUILTIN(__sync_lock_release_4, "viD*.", "tn")
-BUILTIN(__sync_lock_release_8, "vLLiD*.", "tn")
-BUILTIN(__sync_lock_release_16, "vLLLiD*.", "tn")
-
-BUILTIN(__sync_swap, "v.", "t")
-BUILTIN(__sync_swap_1, "ccD*c.", "tn")
-BUILTIN(__sync_swap_2, "ssD*s.", "tn")
-BUILTIN(__sync_swap_4, "iiD*i.", "tn")
-BUILTIN(__sync_swap_8, "LLiLLiD*LLi.", "tn")
-BUILTIN(__sync_swap_16, "LLLiLLLiD*LLLi.", "tn")
-
-// Some of our atomics builtins are handled by AtomicExpr rather than
-// as normal builtin CallExprs. This macro is used for such builtins.
-#ifndef ATOMIC_BUILTIN
-#define ATOMIC_BUILTIN(ID, TYPE, ATTRS) BUILTIN(ID, TYPE, ATTRS)
-#endif
-
-// C11 _Atomic operations for <stdatomic.h>.
-ATOMIC_BUILTIN(__c11_atomic_init, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_load, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_store, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_exchange, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_compare_exchange_strong, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_compare_exchange_weak, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_fetch_add, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_fetch_sub, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_fetch_and, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_fetch_or, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_fetch_xor, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_fetch_nand, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_fetch_max, "v.", "t")
-ATOMIC_BUILTIN(__c11_atomic_fetch_min, "v.", "t")
-BUILTIN(__c11_atomic_thread_fence, "vi", "n")
-BUILTIN(__c11_atomic_signal_fence, "vi", "n")
-BUILTIN(__c11_atomic_is_lock_free, "bz", "nE")
-
-// GNU atomic builtins.
-ATOMIC_BUILTIN(__atomic_load, "v.", "t")
-ATOMIC_BUILTIN(__atomic_load_n, "v.", "t")
-ATOMIC_BUILTIN(__atomic_store, "v.", "t")
-ATOMIC_BUILTIN(__atomic_store_n, "v.", "t")
-ATOMIC_BUILTIN(__atomic_exchange, "v.", "t")
-ATOMIC_BUILTIN(__atomic_exchange_n, "v.", "t")
-ATOMIC_BUILTIN(__atomic_compare_exchange, "v.", "t")
-ATOMIC_BUILTIN(__atomic_compare_exchange_n, "v.", "t")
-ATOMIC_BUILTIN(__atomic_fetch_add, "v.", "t")
-ATOMIC_BUILTIN(__atomic_fetch_sub, "v.", "t")
-ATOMIC_BUILTIN(__atomic_fetch_and, "v.", "t")
-ATOMIC_BUILTIN(__atomic_fetch_or, "v.", "t")
-ATOMIC_BUILTIN(__atomic_fetch_xor, "v.", "t")
-ATOMIC_BUILTIN(__atomic_fetch_nand, "v.", "t")
-ATOMIC_BUILTIN(__atomic_add_fetch, "v.", "t")
-ATOMIC_BUILTIN(__atomic_sub_fetch, "v.", "t")
-ATOMIC_BUILTIN(__atomic_and_fetch, "v.", "t")
-ATOMIC_BUILTIN(__atomic_or_fetch, "v.", "t")
-ATOMIC_BUILTIN(__atomic_xor_fetch, "v.", "t")
-ATOMIC_BUILTIN(__atomic_max_fetch, "v.", "t")
-ATOMIC_BUILTIN(__atomic_min_fetch, "v.", "t")
-ATOMIC_BUILTIN(__atomic_nand_fetch, "v.", "t")
-BUILTIN(__atomic_test_and_set, "bvD*i", "n")
-BUILTIN(__atomic_clear, "vvD*i", "n")
-BUILTIN(__atomic_thread_fence, "vi", "n")
-BUILTIN(__atomic_signal_fence, "vi", "n")
-BUILTIN(__atomic_always_lock_free, "bzvCD*", "nE")
-BUILTIN(__atomic_is_lock_free, "bzvCD*", "nE")
-
-// GNU atomic builtins with atomic scopes.
-ATOMIC_BUILTIN(__scoped_atomic_load, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_load_n, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_store, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_store_n, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_exchange, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_exchange_n, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_compare_exchange, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_compare_exchange_n, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_fetch_add, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_fetch_sub, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_fetch_and, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_fetch_or, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_fetch_xor, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_fetch_nand, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_add_fetch, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_sub_fetch, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_and_fetch, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_or_fetch, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_xor_fetch, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_max_fetch, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_min_fetch, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_nand_fetch, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_fetch_min, "v.", "t")
-ATOMIC_BUILTIN(__scoped_atomic_fetch_max, "v.", "t")
-
-// OpenCL 2.0 atomic builtins.
-ATOMIC_BUILTIN(__opencl_atomic_init, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_load, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_store, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_exchange, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_compare_exchange_strong, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_compare_exchange_weak, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_fetch_add, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_fetch_sub, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_fetch_and, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_fetch_or, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_fetch_xor, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_fetch_min, "v.", "t")
-ATOMIC_BUILTIN(__opencl_atomic_fetch_max, "v.", "t")
-
-// GCC does not support these, they are a Clang extension.
-ATOMIC_BUILTIN(__atomic_fetch_min, "v.", "t")
-ATOMIC_BUILTIN(__atomic_fetch_max, "v.", "t")
-
-// HIP atomic builtins.
-ATOMIC_BUILTIN(__hip_atomic_load, "v.", "t")
-ATOMIC_BUILTIN(__hip_atomic_store, "v.", "t")
-ATOMIC_BUILTIN(__hip_atomic_compare_exchange_weak, "v.", "t")
-ATOMIC_BUILTIN(__hip_atomic_compare_exchange_strong, "v.", "t")
-ATOMIC_BUILTIN(__hip_atomic_exchange, "v.", "t")
-ATOMIC_BUILTIN(__hip_atomic_fetch_add, "v.", "t")
-ATOMIC_BUILTIN(__hip_atomic_fetch_sub, "v.", "t")
-ATOMIC_BUILTIN(__hip_atomic_fetch_and, "v.", "t")
-ATOMIC_BUILTIN(__hip_atomic_fetch_or, "v.", "t")
-ATOMIC_BUILTIN(__hip_atomic_fetch_xor, "v.", "t")
-ATOMIC_BUILTIN(__hip_atomic_fetch_min, "v.", "t")
-ATOMIC_BUILTIN(__hip_atomic_fetch_max, "v.", "t")
-
-#undef ATOMIC_BUILTIN
-
-// Non-overloaded atomic builtins.
-BUILTIN(__sync_synchronize, "v", "n")
-// GCC does not support these, they are a Clang extension.
-BUILTIN(__sync_fetch_and_min, "iiD*i", "n")
-BUILTIN(__sync_fetch_and_max, "iiD*i", "n")
-BUILTIN(__sync_fetch_and_umin, "UiUiD*Ui", "n")
-BUILTIN(__sync_fetch_and_umax, "UiUiD*Ui", "n")
-
-// Random libc builtins.
-BUILTIN(__builtin_abort, "v", "Fnr")
-BUILTIN(__builtin_index, "c*cC*i", "Fn")
-BUILTIN(__builtin_rindex, "c*cC*i", "Fn")
-
-// ignored glibc builtin, see https://sourceware.org/bugzilla/show_bug.cgi?id=25399
-BUILTIN(__warn_memset_zero_len, "v", "nU")
-
-// Microsoft builtins. These are only active with -fms-extensions.
-LANGBUILTIN(_alloca, "v*z", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__annotation, "wC*.","n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__assume, "vb", "nE", ALL_MS_LANGUAGES)
-LANGBUILTIN(_bittest, "UcNiC*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_bittestandcomplement, "UcNi*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_bittestandreset, "UcNi*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_bittestandset, "UcNi*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_bittest64, "UcWiC*Wi", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_bittestandcomplement64, "UcWi*Wi", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_bittestandreset64, "UcWi*Wi", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_bittestandset64, "UcWi*Wi", "n", ALL_MS_LANGUAGES)
-LIBBUILTIN(_byteswap_ushort, "UsUs", "fnc", STDLIB_H, ALL_MS_LANGUAGES)
-LIBBUILTIN(_byteswap_ulong, "UNiUNi", "fnc", STDLIB_H, ALL_MS_LANGUAGES)
-LIBBUILTIN(_byteswap_uint64, "ULLiULLi", "fnc", STDLIB_H, ALL_MS_LANGUAGES)
-LANGBUILTIN(__debugbreak, "v", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__exception_code, "UNi", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_exception_code, "UNi", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__exception_info, "v*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_exception_info, "v*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__abnormal_termination, "i", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_abnormal_termination, "i", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__GetExceptionInfo, "v*.", "zntu", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedAnd8, "ccD*c", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedAnd16, "ssD*s", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedAnd, "NiNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedCompareExchange8, "ccD*cc", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedCompareExchange16, "ssD*ss", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedCompareExchange, "NiNiD*NiNi", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedCompareExchange64, "LLiLLiD*LLiLLi", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedCompareExchangePointer, "v*v*D*v*v*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedCompareExchangePointer_nf, "v*v*D*v*v*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedDecrement16, "ssD*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedDecrement, "NiNiD*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedExchange, "NiNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedExchange8, "ccD*c", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedExchange16, "ssD*s", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedExchangeAdd8, "ccD*c", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedExchangeAdd16, "ssD*s", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedExchangeAdd, "NiNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedExchangePointer, "v*v*D*v*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedExchangeSub8, "ccD*c", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedExchangeSub16, "ssD*s", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedExchangeSub, "NiNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedIncrement16, "ssD*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedIncrement, "NiNiD*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedOr8, "ccD*c", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedOr16, "ssD*s", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedOr, "NiNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedXor8, "ccD*c", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedXor16, "ssD*s", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_InterlockedXor, "NiNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_interlockedbittestandreset, "UcNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_interlockedbittestandreset64, "UcWiD*Wi", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_interlockedbittestandreset_acq, "UcNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_interlockedbittestandreset_nf, "UcNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_interlockedbittestandreset_rel, "UcNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_interlockedbittestandset, "UcNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_interlockedbittestandset64, "UcWiD*Wi", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_interlockedbittestandset_acq, "UcNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_interlockedbittestandset_nf, "UcNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_interlockedbittestandset_rel, "UcNiD*Ni", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__iso_volatile_load8, "ccCD*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__iso_volatile_load16, "ssCD*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__iso_volatile_load32, "iiCD*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__iso_volatile_load64, "LLiLLiCD*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__iso_volatile_store8, "vcD*c", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__iso_volatile_store16, "vsD*s", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__iso_volatile_store32, "viD*i", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__iso_volatile_store64, "vLLiD*LLi", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__noop, "i.", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(__lzcnt16, "UsUs", "ncE", ALL_MS_LANGUAGES)
-LANGBUILTIN(__lzcnt, "UiUi", "ncE", ALL_MS_LANGUAGES)
-LANGBUILTIN(__lzcnt64, "UWiUWi", "ncE", ALL_MS_LANGUAGES)
-LANGBUILTIN(__popcnt16, "UsUs", "ncE", ALL_MS_LANGUAGES)
-LANGBUILTIN(__popcnt, "UiUi", "ncE", ALL_MS_LANGUAGES)
-LANGBUILTIN(__popcnt64, "UWiUWi", "ncE", ALL_MS_LANGUAGES)
-LANGBUILTIN(_ReturnAddress, "v*", "n", ALL_MS_LANGUAGES)
-LANGBUILTIN(_rotl8, "UcUcUc", "nE", ALL_MS_LANGUAGES)
-LANGBUILTIN(_rotl16, "UsUsUc", "nE", ALL_MS_LANGUAGES)
-LANGBUILTIN(_rotl, "UiUii", "nE", ALL_MS_LANGUAGES)
-LANGBUILTIN(_lrotl, "ULiULii", "nE", ALL_MS_LANGUAGES)
-LANGBUILTIN(_rotl64, "UWiUWii", "nE", ALL_MS_LANGUAGES)
-LANGBUILTIN(_rotr8, "UcUcUc", "nE", ALL_MS_LANGUAGES)
-LANGBUILTIN(_rotr16, "UsUsUc", "nE", ALL_MS_LANGUAGES)
-LANGBUILTIN(_rotr, "UiUii", "nE", ALL_MS_LANGUAGES)
-LANGBUILTIN(_lrotr, "ULiULii", "nE", ALL_MS_LANGUAGES)
-LANGBUILTIN(_rotr64, "UWiUWii", "nE", ALL_MS_LANGUAGES)
-LANGBUILTIN(__va_start, "vc**.", "nt", ALL_MS_LANGUAGES)
-LANGBUILTIN(__fastfail, "vUi", "nr", ALL_MS_LANGUAGES)
-
-// Microsoft library builtins.
-LIBBUILTIN(_setjmpex, "iJ", "fjT", SETJMPEX_H, ALL_MS_LANGUAGES)
-
-// C99 library functions
-// C99 stdarg.h
-LIBBUILTIN(va_start, "vA.", "fn", STDARG_H, ALL_LANGUAGES)
-LIBBUILTIN(va_end, "vA", "fn", STDARG_H, ALL_LANGUAGES)
-LIBBUILTIN(va_copy, "vAA", "fn", STDARG_H, ALL_LANGUAGES)
-// C99 stdlib.h
-LIBBUILTIN(abort, "v", "fr", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(calloc, "v*zz", "f", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(exit, "vi", "fr", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(_Exit, "vi", "fr", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(malloc, "v*z", "f", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(realloc, "v*v*z", "f", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(free, "vv*", "f", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(strtod, "dcC*c**", "f", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(strtof, "fcC*c**", "f", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(strtold, "LdcC*c**", "f", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(strtol, "LicC*c**i", "f", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(strtoll, "LLicC*c**i", "f", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(strtoul, "ULicC*c**i", "f", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(strtoull, "ULLicC*c**i", "f", STDLIB_H, ALL_LANGUAGES)
-// C11 stdlib.h
-LIBBUILTIN(aligned_alloc, "v*zz", "f", STDLIB_H, ALL_LANGUAGES)
-// C99 string.h
-LIBBUILTIN(memcpy, "v*v*vC*z", "fE", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(memcmp, "ivC*vC*z", "fE", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(memmove, "v*v*vC*z", "fE", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strcpy, "c*c*cC*", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strncpy, "c*c*cC*z", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strcmp, "icC*cC*", "fE", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strncmp, "icC*cC*z", "fE", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strcat, "c*c*cC*", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strncat, "c*c*cC*z", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strxfrm, "zc*cC*z", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(memchr, "v*vC*iz", "fE", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strchr, "c*cC*i", "fE", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strcspn, "zcC*cC*", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strpbrk, "c*cC*cC*", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strrchr, "c*cC*i", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strspn, "zcC*cC*", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strstr, "c*cC*cC*", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strtok, "c*c*cC*", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(memset, "v*v*iz", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strerror, "c*i", "f", STRING_H, ALL_LANGUAGES)
-LIBBUILTIN(strlen, "zcC*", "fE", STRING_H, ALL_LANGUAGES)
-// C99 stdio.h
-// FIXME: This list is incomplete.
-LIBBUILTIN(printf, "icC*.", "fp:0:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(fprintf, "iP*cC*.", "fp:1:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(snprintf, "ic*zcC*.", "fp:2:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(sprintf, "ic*cC*.", "fp:1:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(vprintf, "icC*a", "fP:0:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(vfprintf, "iP*cC*a", "fP:1:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(vsnprintf, "ic*zcC*a", "fP:2:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(vsprintf, "ic*cC*a", "fP:1:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(scanf, "icC*R.", "fs:0:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(fscanf, "iP*RcC*R.", "fs:1:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(sscanf, "icC*RcC*R.", "fs:1:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(vscanf, "icC*Ra", "fS:0:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(vfscanf, "iP*RcC*Ra", "fS:1:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(vsscanf, "icC*RcC*Ra", "fS:1:", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(fopen, "P*cC*cC*", "f", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(fread, "zv*zzP*", "f", STDIO_H, ALL_LANGUAGES)
-LIBBUILTIN(fwrite, "zvC*zzP*", "f", STDIO_H, ALL_LANGUAGES)
-
-// C99 ctype.h
-LIBBUILTIN(isalnum, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(isalpha, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(isblank, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(iscntrl, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(isdigit, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(isgraph, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(islower, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(isprint, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(ispunct, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(isspace, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(isupper, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(isxdigit, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(tolower, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-LIBBUILTIN(toupper, "ii", "fnU", CTYPE_H, ALL_LANGUAGES)
-// C99 wchar.h
-// FIXME: This list is incomplete. We should cover at least the functions that
-// take format strings.
-LIBBUILTIN(wcschr, "w*wC*w", "fE", WCHAR_H, ALL_LANGUAGES)
-LIBBUILTIN(wcscmp, "iwC*wC*", "fE", WCHAR_H, ALL_LANGUAGES)
-LIBBUILTIN(wcslen, "zwC*", "fE", WCHAR_H, ALL_LANGUAGES)
-LIBBUILTIN(wcsncmp, "iwC*wC*z", "fE", WCHAR_H, ALL_LANGUAGES)
-LIBBUILTIN(wmemchr, "w*wC*wz", "fE", WCHAR_H, ALL_LANGUAGES)
-LIBBUILTIN(wmemcmp, "iwC*wC*z", "fE", WCHAR_H, ALL_LANGUAGES)
-LIBBUILTIN(wmemcpy, "w*w*wC*z", "fE", WCHAR_H, ALL_LANGUAGES)
-LIBBUILTIN(wmemmove,"w*w*wC*z", "fE", WCHAR_H, ALL_LANGUAGES)
-
-// C99
-// In some systems setjmp is a macro that expands to _setjmp. We undefine
-// it here to avoid having two identical LIBBUILTIN entries.
-#undef setjmp
-LIBBUILTIN(setjmp, "iJ", "fjT", SETJMP_H, ALL_LANGUAGES)
-LIBBUILTIN(longjmp, "vJi", "frT", SETJMP_H, ALL_LANGUAGES)
-
-// Non-C library functions, active in GNU mode only.
-// Functions with (returns_twice) attribute (marked as "j") are still active in
-// all languages, because losing this attribute would result in miscompilation
-// when these functions are used in non-GNU mode. PR16138.
-LIBBUILTIN(alloca, "v*z", "f", STDLIB_H, ALL_GNU_LANGUAGES)
-// POSIX malloc.h
-LIBBUILTIN(memalign, "v*zz", "f", MALLOC_H, ALL_GNU_LANGUAGES)
-// POSIX string.h
-LIBBUILTIN(memccpy, "v*v*vC*iz", "f", STRING_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(mempcpy, "v*v*vC*z", "f", STRING_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(stpcpy, "c*c*cC*", "f", STRING_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(stpncpy, "c*c*cC*z", "f", STRING_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(strdup, "c*cC*", "f", STRING_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(strndup, "c*cC*z", "f", STRING_H, ALL_GNU_LANGUAGES)
-// POSIX strings.h
-LIBBUILTIN(index, "c*cC*i", "f", STRINGS_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(rindex, "c*cC*i", "f", STRINGS_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(bzero, "vv*z", "f", STRINGS_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(bcopy, "vvC*v*z", "f", STRINGS_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(bcmp, "ivC*vC*z", "fE", STRINGS_H, ALL_GNU_LANGUAGES)
-// In some systems str[n]casejmp is a macro that expands to _str[n]icmp.
-// We undefine then here to avoid wrong name.
-#undef strcasecmp
-#undef strncasecmp
-LIBBUILTIN(strcasecmp, "icC*cC*", "f", STRINGS_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(strncasecmp, "icC*cC*z", "f", STRINGS_H, ALL_GNU_LANGUAGES)
-// POSIX unistd.h
-LIBBUILTIN(_exit, "vi", "fr", UNISTD_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(vfork, "p", "fjT", UNISTD_H, ALL_LANGUAGES)
-// POSIX pthread.h
-// FIXME: Should specify argument types.
-LIBBUILTIN(pthread_create, "", "fC<2,3>", PTHREAD_H, ALL_GNU_LANGUAGES)
-
-// POSIX setjmp.h
-
-// FIXME: MinGW _setjmp has an additional void* parameter.
-LIBBUILTIN(_setjmp, "iJ", "fjT", SETJMP_H, ALL_LANGUAGES)
-LIBBUILTIN(__sigsetjmp, "iSJi", "fjT", SETJMP_H, ALL_LANGUAGES)
-LIBBUILTIN(sigsetjmp, "iSJi", "fjT", SETJMP_H, ALL_LANGUAGES)
-LIBBUILTIN(savectx, "iJ", "fjT", SETJMP_H, ALL_LANGUAGES)
-LIBBUILTIN(getcontext, "iK*", "fjT", SETJMP_H, ALL_LANGUAGES)
-
-LIBBUILTIN(_longjmp, "vJi", "frT", SETJMP_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(siglongjmp, "vSJi", "frT", SETJMP_H, ALL_GNU_LANGUAGES)
-// non-standard but very common
-LIBBUILTIN(strlcpy, "zc*cC*z", "f", STRING_H, ALL_GNU_LANGUAGES)
-LIBBUILTIN(strlcat, "zc*cC*z", "f", STRING_H, ALL_GNU_LANGUAGES)
-// id objc_msgSend(id, SEL, ...)
-LIBBUILTIN(objc_msgSend, "GGH.", "f", OBJC_MESSAGE_H, OBJC_LANG)
-// long double objc_msgSend_fpret(id self, SEL op, ...)
-LIBBUILTIN(objc_msgSend_fpret, "LdGH.", "f", OBJC_MESSAGE_H, OBJC_LANG)
-// _Complex long double objc_msgSend_fp2ret(id self, SEL op, ...)
-LIBBUILTIN(objc_msgSend_fp2ret, "XLdGH.", "f", OBJC_MESSAGE_H, OBJC_LANG)
-// void objc_msgSend_stret (id, SEL, ...)
-LIBBUILTIN(objc_msgSend_stret, "vGH.", "f", OBJC_MESSAGE_H, OBJC_LANG)
-// id objc_msgSendSuper(struct objc_super *super, SEL op, ...)
-LIBBUILTIN(objc_msgSendSuper, "GM*H.", "f", OBJC_MESSAGE_H, OBJC_LANG)
-// void objc_msgSendSuper_stret(struct objc_super *super, SEL op, ...)
-LIBBUILTIN(objc_msgSendSuper_stret, "vM*H.", "f", OBJC_MESSAGE_H, OBJC_LANG)
-// id objc_getClass(const char *name)
-LIBBUILTIN(objc_getClass, "GcC*", "f", OBJC_RUNTIME_H, OBJC_LANG)
-// id objc_getMetaClass(const char *name)
-LIBBUILTIN(objc_getMetaClass, "GcC*", "f", OBJC_RUNTIME_H, OBJC_LANG)
-// void objc_enumerationMutation(id)
-LIBBUILTIN(objc_enumerationMutation, "vG", "f", OBJC_RUNTIME_H, OBJC_LANG)
-
-// id objc_read_weak(id *location)
-LIBBUILTIN(objc_read_weak, "GG*", "f", OBJC_OBJC_AUTO_H, OBJC_LANG)
-// id objc_assign_weak(id value, id *location)
-LIBBUILTIN(objc_assign_weak, "GGG*", "f", OBJC_OBJC_AUTO_H, OBJC_LANG)
-// id objc_assign_ivar(id value, id dest, ptrdiff_t offset)
-LIBBUILTIN(objc_assign_ivar, "GGGY", "f", OBJC_OBJC_AUTO_H, OBJC_LANG)
-// id objc_assign_global(id val, id *dest)
-LIBBUILTIN(objc_assign_global, "GGG*", "f", OBJC_OBJC_AUTO_H, OBJC_LANG)
-// id objc_assign_strongCast(id val, id *dest
-LIBBUILTIN(objc_assign_strongCast, "GGG*", "f", OBJC_OBJC_AUTO_H, OBJC_LANG)
-
-// id objc_exception_extract(void *localExceptionData)
-LIBBUILTIN(objc_exception_extract, "Gv*", "f", OBJC_OBJC_EXCEPTION_H, OBJC_LANG)
-// void objc_exception_try_enter(void *localExceptionData)
-LIBBUILTIN(objc_exception_try_enter, "vv*", "f", OBJC_OBJC_EXCEPTION_H, OBJC_LANG)
-// void objc_exception_try_exit(void *localExceptionData)
-LIBBUILTIN(objc_exception_try_exit, "vv*", "f", OBJC_OBJC_EXCEPTION_H, OBJC_LANG)
-// int objc_exception_match(Class exceptionClass, id exception)
-LIBBUILTIN(objc_exception_match, "iGG", "f", OBJC_OBJC_EXCEPTION_H, OBJC_LANG)
-// void objc_exception_throw(id exception)
-LIBBUILTIN(objc_exception_throw, "vG", "f", OBJC_OBJC_EXCEPTION_H, OBJC_LANG)
-
-// int objc_sync_enter(id obj)
-LIBBUILTIN(objc_sync_enter, "iG", "f", OBJC_OBJC_SYNC_H, OBJC_LANG)
-// int objc_sync_exit(id obj)
-LIBBUILTIN(objc_sync_exit, "iG", "f", OBJC_OBJC_SYNC_H, OBJC_LANG)
-
-BUILTIN(__builtin_objc_memmove_collectable, "v*v*vC*z", "nF")
-
-// void NSLog(NSString *fmt, ...)
-LIBBUILTIN(NSLog, "vG.", "fp:0:", FOUNDATION_NSOBJCRUNTIME_H, OBJC_LANG)
-// void NSLogv(NSString *fmt, va_list args)
-LIBBUILTIN(NSLogv, "vGa", "fP:0:", FOUNDATION_NSOBJCRUNTIME_H, OBJC_LANG)
-
-// Builtin math library functions
-LIBBUILTIN(atan2, "ddd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(atan2f, "fff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(atan2l, "LdLdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(abs, "ii", "fnc", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(labs, "LiLi", "fnc", STDLIB_H, ALL_LANGUAGES)
-LIBBUILTIN(llabs, "LLiLLi", "fnc", STDLIB_H, ALL_LANGUAGES)
-
-LIBBUILTIN(copysign, "ddd", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(copysignf, "fff", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(copysignl, "LdLdLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(fabs, "dd", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fabsf, "ff", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fabsl, "LdLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-// Some systems define finitef as alias of _finitef.
-#if defined (finitef)
-#undef finitef
-#endif
-LIBBUILTIN(finite, "id", "fnc", MATH_H, GNU_LANG)
-LIBBUILTIN(finitef, "if", "fnc", MATH_H, GNU_LANG)
-LIBBUILTIN(finitel, "iLd", "fnc", MATH_H, GNU_LANG)
-// glibc's math.h generates calls to __finite
-LIBBUILTIN(__finite, "id", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(__finitef, "if", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(__finitel, "iLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(fmod, "ddd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fmodf, "fff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fmodl, "LdLdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(frexp, "ddi*", "fn", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(frexpf, "ffi*", "fn", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(frexpl, "LdLdi*", "fn", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(ldexp, "ddi", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(ldexpf, "ffi", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(ldexpl, "LdLdi", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(modf, "ddd*", "fn", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(modff, "fff*", "fn", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(modfl, "LdLdLd*", "fn", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(nan, "dcC*", "fUn", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(nanf, "fcC*", "fUn", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(nanl, "LdcC*", "fUn", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(pow, "ddd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(powf, "fff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(powl, "LdLdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(acos, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(acosf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(acosl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(acosh, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(acoshf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(acoshl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(asin, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(asinf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(asinl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(asinh, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(asinhf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(asinhl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(atan, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(atanf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(atanl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(atanh, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(atanhf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(atanhl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(cbrt, "dd", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(cbrtf, "ff", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(cbrtl, "LdLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(ceil, "dd", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(ceilf, "ff", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(ceill, "LdLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(cos, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(cosf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(cosl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(cosh, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(coshf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(coshl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(erf, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(erff, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(erfl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(erfc, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(erfcf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(erfcl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(exp, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(expf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(expl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(exp2, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(exp2f, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(exp2l, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(expm1, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(expm1f, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(expm1l, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(fdim, "ddd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fdimf, "fff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fdiml, "LdLdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(floor, "dd", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(floorf, "ff", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(floorl, "LdLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(fma, "dddd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fmaf, "ffff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fmal, "LdLdLdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(fmax, "ddd", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fmaxf, "fff", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fmaxl, "LdLdLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(fmin, "ddd", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fminf, "fff", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(fminl, "LdLdLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(hypot, "ddd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(hypotf, "fff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(hypotl, "LdLdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(ilogb, "id", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(ilogbf, "if", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(ilogbl, "iLd", "fne", MATH_H, ALL_LANGUAGES)
-
-// POSIX math.h declares a global, signgam, that lgamma writes to, so these
-// shouldn't have "e", "c" or "g" attributes
-LIBBUILTIN(lgamma, "dd", "fn", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(lgammaf, "ff", "fn", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(lgammal, "LdLd", "fn", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(llrint, "LLid", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(llrintf, "LLif", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(llrintl, "LLiLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(llround, "LLid", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(llroundf, "LLif", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(llroundl, "LLiLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(log, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(logf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(logl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(log10, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(log10f, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(log10l, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(log1p, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(log1pf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(log1pl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(log2, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(log2f, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(log2l, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(logb, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(logbf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(logbl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(lrint, "Lid", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(lrintf, "Lif", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(lrintl, "LiLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(lround, "Lid", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(lroundf, "Lif", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(lroundl, "LiLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(nearbyint, "dd", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(nearbyintf, "ff", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(nearbyintl, "LdLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(nextafter, "ddd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(nextafterf, "fff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(nextafterl, "LdLdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(nexttoward, "ddLd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(nexttowardf, "ffLd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(nexttowardl, "LdLdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(remainder, "ddd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(remainderf, "fff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(remainderl, "LdLdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(remquo, "dddi*", "fn", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(remquof, "fffi*", "fn", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(remquol, "LdLdLdi*", "fn", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(rint, "dd", "fng", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(rintf, "ff", "fng", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(rintl, "LdLd", "fng", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(round, "dd", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(roundf, "ff", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(roundl, "LdLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(roundeven, "dd", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(roundevenf, "ff", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(roundevenl, "LdLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(scalbln, "ddLi", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(scalblnf, "ffLi", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(scalblnl, "LdLdLi", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(scalbn, "ddi", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(scalbnf, "ffi", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(scalbnl, "LdLdi", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(sin, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(sinf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(sinl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(sinh, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(sinhf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(sinhl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(sqrt, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(sqrtf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(sqrtl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(tan, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(tanf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(tanl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(tanh, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(tanhf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(tanhl, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(tgamma, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(tgammaf, "ff", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(tgammal, "LdLd", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(trunc, "dd", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(truncf, "ff", "fnc", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(truncl, "LdLd", "fnc", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(cabs, "dXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cabsf, "fXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cabsl, "LdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(cacos, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cacosf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cacosl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(cacosh, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cacoshf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cacoshl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(carg, "dXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cargf, "fXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cargl, "LdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(casin, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(casinf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(casinl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(casinh, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(casinhf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(casinhl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(catan, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(catanf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(catanl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(catanh, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(catanhf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(catanhl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(ccos, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(ccosf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(ccosl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(ccosh, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(ccoshf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(ccoshl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(cexp, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cexpf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cexpl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(cimag, "dXd", "fnc", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cimagf, "fXf", "fnc", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cimagl, "LdXLd", "fnc", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(conj, "XdXd", "fnc", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(conjf, "XfXf", "fnc", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(conjl, "XLdXLd", "fnc", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(clog, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(clogf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(clogl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(cproj, "XdXd", "fnc", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cprojf, "XfXf", "fnc", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cprojl, "XLdXLd", "fnc", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(cpow, "XdXdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cpowf, "XfXfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(cpowl, "XLdXLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(creal, "dXd", "fnc", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(crealf, "fXf", "fnc", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(creall, "LdXLd", "fnc", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(csin, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(csinf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(csinl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(csinh, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(csinhf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(csinhl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(csqrt, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(csqrtf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(csqrtl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(ctan, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(ctanf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(ctanl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-LIBBUILTIN(ctanh, "XdXd", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(ctanhf, "XfXf", "fne", COMPLEX_H, ALL_LANGUAGES)
-LIBBUILTIN(ctanhl, "XLdXLd", "fne", COMPLEX_H, ALL_LANGUAGES)
-
-// __sinpi and friends are OS X specific library functions, but otherwise much
-// like the standard (non-complex) sin (etc).
-LIBBUILTIN(__sinpi, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(__sinpif, "ff", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(__cospi, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(__cospif, "ff", "fne", MATH_H, ALL_LANGUAGES)
-
-LIBBUILTIN(__tanpi, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(__tanpif, "ff", "fne", MATH_H, ALL_LANGUAGES)
-
-// Similarly, __exp10 is OS X only
-LIBBUILTIN(__exp10, "dd", "fne", MATH_H, ALL_LANGUAGES)
-LIBBUILTIN(__exp10f, "ff", "fne", MATH_H, ALL_LANGUAGES)
-
-// Blocks runtime Builtin math library functions
-LIBBUILTIN(_Block_object_assign, "vv*vC*iC", "f", BLOCKS_H, ALL_LANGUAGES)
-LIBBUILTIN(_Block_object_dispose, "vvC*iC", "f", BLOCKS_H, ALL_LANGUAGES)
-// FIXME: Also declare NSConcreteGlobalBlock and NSConcreteStackBlock.
-
-// C++ standard library builtins in namespace 'std'.
-LIBBUILTIN(addressof, "v*v&", "zfncThE", MEMORY, CXX_LANG)
-// Synonym for addressof used internally by libstdc++.
-LANGBUILTIN(__addressof, "v*v&", "zfncTE", CXX_LANG)
-LIBBUILTIN(as_const, "v&v&", "zfncThE", UTILITY, CXX_LANG)
-LIBBUILTIN(forward, "v&v&", "zfncThE", UTILITY, CXX_LANG)
-LIBBUILTIN(forward_like, "v&v&", "zfncThE", UTILITY, CXX_LANG)
-LIBBUILTIN(move, "v&v&", "zfncThE", UTILITY, CXX_LANG)
-LIBBUILTIN(move_if_noexcept, "v&v&", "zfncThE", UTILITY, CXX_LANG)
-
-// Annotation function
-BUILTIN(__builtin_annotation, "v.", "tn")
-
-// Invariants
-BUILTIN(__builtin_assume, "vb", "nE")
-BUILTIN(__builtin_assume_separate_storage, "vvCD*vCD*", "nE")
-
-// Multiprecision Arithmetic Builtins.
-BUILTIN(__builtin_addcb, "UcUcCUcCUcCUc*", "n")
-BUILTIN(__builtin_addcs, "UsUsCUsCUsCUs*", "n")
-BUILTIN(__builtin_addc, "UiUiCUiCUiCUi*", "n")
-BUILTIN(__builtin_addcl, "ULiULiCULiCULiCULi*", "n")
-BUILTIN(__builtin_addcll, "ULLiULLiCULLiCULLiCULLi*", "n")
-BUILTIN(__builtin_subcb, "UcUcCUcCUcCUc*", "n")
-BUILTIN(__builtin_subcs, "UsUsCUsCUsCUs*", "n")
-BUILTIN(__builtin_subc, "UiUiCUiCUiCUi*", "n")
-BUILTIN(__builtin_subcl, "ULiULiCULiCULiCULi*", "n")
-BUILTIN(__builtin_subcll, "ULLiULLiCULLiCULLiCULLi*", "n")
-
-// Checked Arithmetic Builtins for Security.
-BUILTIN(__builtin_add_overflow, "b.", "ntE")
-BUILTIN(__builtin_sub_overflow, "b.", "ntE")
-BUILTIN(__builtin_mul_overflow, "b.", "ntE")
-BUILTIN(__builtin_uadd_overflow, "bUiCUiCUi*", "nE")
-BUILTIN(__builtin_uaddl_overflow, "bULiCULiCULi*", "nE")
-BUILTIN(__builtin_uaddll_overflow, "bULLiCULLiCULLi*", "nE")
-BUILTIN(__builtin_usub_overflow, "bUiCUiCUi*", "nE")
-BUILTIN(__builtin_usubl_overflow, "bULiCULiCULi*", "nE")
-BUILTIN(__builtin_usubll_overflow, "bULLiCULLiCULLi*", "nE")
-BUILTIN(__builtin_umul_overflow, "bUiCUiCUi*", "nE")
-BUILTIN(__builtin_umull_overflow, "bULiCULiCULi*", "nE")
-BUILTIN(__builtin_umulll_overflow, "bULLiCULLiCULLi*", "nE")
-BUILTIN(__builtin_sadd_overflow, "bSiCSiCSi*", "nE")
-BUILTIN(__builtin_saddl_overflow, "bSLiCSLiCSLi*", "nE")
-BUILTIN(__builtin_saddll_overflow, "bSLLiCSLLiCSLLi*", "nE")
-BUILTIN(__builtin_ssub_overflow, "bSiCSiCSi*", "nE")
-BUILTIN(__builtin_ssubl_overflow, "bSLiCSLiCSLi*", "nE")
-BUILTIN(__builtin_ssubll_overflow, "bSLLiCSLLiCSLLi*", "nE")
-BUILTIN(__builtin_smul_overflow, "bSiCSiCSi*", "nE")
-BUILTIN(__builtin_smull_overflow, "bSLiCSLiCSLi*", "nE")
-BUILTIN(__builtin_smulll_overflow, "bSLLiCSLLiCSLLi*", "nE")
-
-// Clang builtins (not available in GCC).
-BUILTIN(__builtin_addressof, "v*v&", "nctE") -BUILTIN(__builtin_function_start, "v*v&", "nctE") -BUILTIN(__builtin_operator_new, "v*z", "tcE") -BUILTIN(__builtin_operator_delete, "vv*", "tnE") -BUILTIN(__builtin_char_memchr, "c*cC*iz", "nE") -BUILTIN(__builtin_dump_struct, "v.", "t") -BUILTIN(__builtin_preserve_access_index, "v.", "t") - -// Alignment builtins (uses custom parsing to support pointers and integers) -BUILTIN(__builtin_is_aligned, "bvC*z", "nctE") -BUILTIN(__builtin_align_up, "v*vC*z", "nctE") -BUILTIN(__builtin_align_down, "v*vC*z", "nctE") - -// Safestack builtins -BUILTIN(__builtin___get_unsafe_stack_start, "v*", "Fn") -BUILTIN(__builtin___get_unsafe_stack_bottom, "v*", "Fn") -BUILTIN(__builtin___get_unsafe_stack_top, "v*", "Fn") -BUILTIN(__builtin___get_unsafe_stack_ptr, "v*", "Fn") - -// Nontemporal loads/stores builtins -BUILTIN(__builtin_nontemporal_store, "v.", "t") -BUILTIN(__builtin_nontemporal_load, "v.", "t") - -// Coroutine intrinsics. -LANGBUILTIN(__builtin_coro_resume, "vv*", "", COR_LANG) -LANGBUILTIN(__builtin_coro_destroy, "vv*", "", COR_LANG) -LANGBUILTIN(__builtin_coro_done, "bv*", "n", COR_LANG) -LANGBUILTIN(__builtin_coro_promise, "v*v*IiIb", "n", COR_LANG) - -LANGBUILTIN(__builtin_coro_size, "z", "n", COR_LANG) -LANGBUILTIN(__builtin_coro_align, "z", "n", COR_LANG) -LANGBUILTIN(__builtin_coro_frame, "v*", "n", COR_LANG) -LANGBUILTIN(__builtin_coro_noop, "v*", "n", COR_LANG) -LANGBUILTIN(__builtin_coro_free, "v*v*", "n", COR_LANG) - -LANGBUILTIN(__builtin_coro_id, "v*Iiv*v*v*", "n", COR_LANG) -LANGBUILTIN(__builtin_coro_alloc, "b", "n", COR_LANG) -LANGBUILTIN(__builtin_coro_begin, "v*v*", "n", COR_LANG) -LANGBUILTIN(__builtin_coro_end, "bv*Ib", "n", COR_LANG) -LANGBUILTIN(__builtin_coro_suspend, "cIb", "n", COR_LANG) - -// OpenCL v2.0 s6.13.16, s9.17.3.5 - Pipe functions. -// We need the generic prototype, since the packet type could be anything. -LANGBUILTIN(read_pipe, "i.", "tn", OCL_PIPE) -LANGBUILTIN(write_pipe, "i.", "tn", OCL_PIPE) - -LANGBUILTIN(reserve_read_pipe, "i.", "tn", OCL_PIPE) -LANGBUILTIN(reserve_write_pipe, "i.", "tn", OCL_PIPE) - -LANGBUILTIN(commit_write_pipe, "v.", "tn", OCL_PIPE) -LANGBUILTIN(commit_read_pipe, "v.", "tn", OCL_PIPE) - -LANGBUILTIN(sub_group_reserve_read_pipe, "i.", "tn", OCL_PIPE) -LANGBUILTIN(sub_group_reserve_write_pipe, "i.", "tn", OCL_PIPE) - -LANGBUILTIN(sub_group_commit_read_pipe, "v.", "tn", OCL_PIPE) -LANGBUILTIN(sub_group_commit_write_pipe, "v.", "tn", OCL_PIPE) - -LANGBUILTIN(work_group_reserve_read_pipe, "i.", "tn", OCL_PIPE) -LANGBUILTIN(work_group_reserve_write_pipe, "i.", "tn", OCL_PIPE) - -LANGBUILTIN(work_group_commit_read_pipe, "v.", "tn", OCL_PIPE) -LANGBUILTIN(work_group_commit_write_pipe, "v.", "tn", OCL_PIPE) - -LANGBUILTIN(get_pipe_num_packets, "Ui.", "tn", OCL_PIPE) -LANGBUILTIN(get_pipe_max_packets, "Ui.", "tn", OCL_PIPE) - -// OpenCL v2.0 s6.13.17 - Enqueue kernel functions. -// Custom builtin check allows to perform special check of passed block arguments. -LANGBUILTIN(enqueue_kernel, "i.", "tn", OCL_DSE) -LANGBUILTIN(get_kernel_work_group_size, "Ui.", "tn", OCL_DSE) -LANGBUILTIN(get_kernel_preferred_work_group_size_multiple, "Ui.", "tn", OCL_DSE) -LANGBUILTIN(get_kernel_max_sub_group_size_for_ndrange, "Ui.", "tn", OCL_DSE) -LANGBUILTIN(get_kernel_sub_group_count_for_ndrange, "Ui.", "tn", OCL_DSE) - -// OpenCL v2.0 s6.13.9 - Address space qualifier functions. -// FIXME: Pointer parameters of OpenCL builtins should have their address space -// requirement defined. 
-LANGBUILTIN(to_global, "v*v*", "tn", OCL_GAS) -LANGBUILTIN(to_local, "v*v*", "tn", OCL_GAS) -LANGBUILTIN(to_private, "v*v*", "tn", OCL_GAS) - -// OpenCL half load/store builtin -LANGBUILTIN(__builtin_store_half, "vdh*", "n", ALL_OCL_LANGUAGES) -LANGBUILTIN(__builtin_store_halff, "vfh*", "n", ALL_OCL_LANGUAGES) -LANGBUILTIN(__builtin_load_half, "dhC*", "nc", ALL_OCL_LANGUAGES) -LANGBUILTIN(__builtin_load_halff, "fhC*", "nc", ALL_OCL_LANGUAGES) - -// Builtins for os_log/os_trace -BUILTIN(__builtin_os_log_format_buffer_size, "zcC*.", "p:0:nutE") -BUILTIN(__builtin_os_log_format, "v*v*cC*.", "p:0:nt") - -// CUDA/HIP -LANGBUILTIN(__builtin_get_device_side_mangled_name, "cC*.", "ncT", CUDA_LANG) - -// HLSL -LANGBUILTIN(__builtin_hlsl_wave_active_count_bits, "Uib", "nc", HLSL_LANG) -LANGBUILTIN(__builtin_hlsl_create_handle, "v*Uc", "nc", HLSL_LANG) - -// Builtins for XRay -BUILTIN(__xray_customevent, "vcC*z", "") -BUILTIN(__xray_typedevent, "vzcC*z", "") - -// Win64-compatible va_list functions -BUILTIN(__builtin_ms_va_start, "vc*&.", "nt") -BUILTIN(__builtin_ms_va_end, "vc*&", "n") -BUILTIN(__builtin_ms_va_copy, "vc*&c*&", "n") - -// Arithmetic Fence: to prevent FP reordering and reassociation optimizations -LANGBUILTIN(__arithmetic_fence, "v.", "tE", ALL_LANGUAGES) - -#undef BUILTIN -#undef LIBBUILTIN -#undef LANGBUILTIN diff --git a/clang/include/clang/Basic/Builtins.h b/clang/include/clang/Basic/Builtins.h index 3fd5b02b5aa58..f955d21169556 100644 --- a/clang/include/clang/Basic/Builtins.h +++ b/clang/include/clang/Basic/Builtins.h @@ -64,7 +64,7 @@ namespace Builtin { enum ID { NotBuiltin = 0, // This is not a builtin function. #define BUILTIN(ID, TYPE, ATTRS) BI##ID, -#include "clang/Basic/Builtins.def" +#include "clang/Basic/Builtins.inc" FirstTSBuiltin }; diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td new file mode 100644 index 0000000000000..fdd1cf8221dcc --- /dev/null +++ b/clang/include/clang/Basic/Builtins.td @@ -0,0 +1,4537 @@ +//===--- Builtins.td - Builtins function info database-----------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +include "clang/Basic/BuiltinsBase.td" + +class FPMathTemplate : Template<["float", "double", "long double"], + ["f", "", "l"]>; + +class FPMathWithF16Template : + Template<["float", "double", "long double", "__fp16", "__float128"], + ["f", "", "l", "f16", "f128"]>; + +class FPMathWithF16F128Template : + Template<["float", "double", "long double", "__fp16", "__float128"], + ["f", "", "l", "f16", "f128"]>; + +class F16F128MathTemplate : Template<["__fp16", "__float128"], + ["f16", "f128"]>; + +class IntMathTemplate : Template<["int", "long int", "long long int"], + ["", "l", "ll"], /*AsPrefix=*/1>; + +class MSInt8_16_32Template : Template<["char", "short", "msint32_t"], + ["8", "16", ""]>; + +class Int8_16_32_64Template + : Template<["char", "short", "int", "long long int"], + ["8", "16", "32", "64"]>; + +class MSInt8_16_32_64Template + : Template<["char", "short", "msint32_t", "long long int"], + ["8", "16", "", "64"]>; + +class MSInt16_32Template : Template<["short", "msint32_t"], + ["16", ""]>; + +class MSUInt16_32_64Template : + Template<["unsigned short", "unsigned int", "uint64_t"], + ["16", "", "64"]>; + +class MSInt32_64Template : Template<["msint32_t", "int64_t"], + ["", "64"]>; + +class FloatDoubleTemplate : Template<["float", "double"], + ["f", ""]>; + +// FIXME: These assume that char -> i8, short -> i16, int -> i32, +// long long -> i64. +class SyncBuiltinsTemplate : + Template<["char", "short", "int", "long long int", "__int128_t"], + ["1", "2", "4", "8", "16"]>; + +class BitInt8_16_32_64BuiltinsTemplate : + Template<["unsigned char", "unsigned short", "uint32_t", "uint64_t"], + ["8", "16", "32", "64"]>; + +class BitShort_Int_Long_LongLongTemplate : + Template<["short", "int", "long int", "long long int"], + ["s", "", "l", "ll"]>; + +class BitInt_Long_LongLongTemplate : + Template<["int", "long int", "long long int"], + ["", "l", "ll"]>; + +// Most of the types used in the prototypes are types from C, C++ or ObjC. There +// are a few builtin-specific types and qualifiers. +// +// builtin-specific types: +// - __builtin_va_list: This is the internal representation for va_lists +// - __builtin_va_list_ref: A reference-like type to __builtin_va_list +// - msint32_t: 'int' size if target is LP64, 'L' otherwise. +// +// builtin-specific qualifiers: +// - _Constant: Argument has to constant-fold to an integer constant expression + +// __fp16 and __float128 builtin variants of libc/libm functions. 
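To make the template machinery concrete: a def that inherits F16F128MathTemplate and spells itself "__builtin_ceil" instantiates __builtin_ceilf16 and __builtin_ceilf128, substituting T with __fp16 and __float128. A minimal C++ usage sketch of the definitions that follow (illustrative only, not part of this patch; the function names are hypothetical and __fp16/__float128 availability is target-dependent, e.g. x86_64 Linux):

    // Calls resolve to the builtins the templates below instantiate.
    float ceil_half(float x) { return __builtin_ceilf16((__fp16)x); }    // CeilF16F128, T = __fp16
    __float128 ceil_quad(__float128 x) { return __builtin_ceilf128(x); } // CeilF16F128, T = __float128
    __float128 sqrt_quad(__float128 x) { return __builtin_sqrtf128(x); } // SqrtF16F128, T = __float128
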
+def AcosF128 : Builtin { + let Spellings = ["__builtin_acosf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def AcoshF128 : Builtin { + let Spellings = ["__builtin_acoshf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def AsinF128 : Builtin { + let Spellings = ["__builtin_asinf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def AsinhF128 : Builtin { + let Spellings = ["__builtin_asinhf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def AtanF128 : Builtin { + let Spellings = ["__builtin_atanf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def AtanhF128 : Builtin { + let Spellings = ["__builtin_atanhf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def CbrtF128 : Builtin { + let Spellings = ["__builtin_cbrtf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "__float128(__float128)"; +} + +def CeilF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_ceil"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "T(T)"; +} + +def CosF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_cos"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def CoshF128 : Builtin { + let Spellings = ["__builtin_coshf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def ErfF128 : Builtin { + let Spellings = ["__builtin_erff128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def ErfcF128 : Builtin { + let Spellings = ["__builtin_erfcf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def ExpF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_exp"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def Exp2F16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_exp2"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def Exp10F16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_exp10"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def Expm1F128 : Builtin { + let Spellings = ["__builtin_expm1f128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def FdimF128 : Builtin { + let Spellings = ["__builtin_fdimf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128, __float128)"; +} + +def FloorF16F128 : Builtin, F16F128MathTemplate 
{ + let Spellings = ["__builtin_floor"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "T(T)"; +} + +def FmaF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_fma"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, T, T)"; +} + +def FmaxF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_fmax"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, Constexpr]; + let Prototype = "T(T, T)"; +} + +def FminF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_fmin"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, Constexpr]; + let Prototype = "T(T, T)"; +} + +def Atan2F128 : Builtin { + let Spellings = ["__builtin_atan2f128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128, __float128)"; +} + +def CopysignF16 : Builtin { + let Spellings = ["__builtin_copysignf16"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "__fp16(__fp16, __fp16)"; +} + +def CopysignF128 : Builtin { + let Spellings = ["__builtin_copysignf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, Constexpr]; + let Prototype = "__float128(__float128, __float128)"; +} + +def FabsF16 : Builtin { + let Spellings = ["__builtin_fabsf16"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "__fp16(__fp16)"; +} + +def FabsF128 : Builtin { + let Spellings = ["__builtin_fabsf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, Constexpr]; + let Prototype = "__float128(__float128)"; +} + +def FmodF16F128 : F16F128MathTemplate, Builtin { + let Spellings = ["__builtin_fmod"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, T)"; +} + +def FrexpF16F128 : F16F128MathTemplate, Builtin { + let Spellings = ["__builtin_frexp"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "T(T, int*)"; +} + +def HugeVal : Builtin, FPMathWithF16F128Template { + let Spellings = ["__builtin_huge_val"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "T()"; +} + +def Inf : Builtin, FPMathWithF16F128Template { + let Spellings = ["__builtin_inf"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "T()"; +} + +def LdexpF16F128 : F16F128MathTemplate, Builtin { + let Spellings = ["__builtin_ldexp"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, int)"; +} + +def ModfF128 : Builtin { + let Spellings = ["__builtin_modff128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "__float128(__float128, __float128*)"; +} + +// This isn't a FPMathWithF16F128Template because the f16 +// version takes a _Float16 for some reason. 
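Concretely, per the comment above, the f16 NaN builtin traffics in _Float16 rather than __fp16. A short sketch of the definitions that follow (illustrative only; assumes a target where clang provides _Float16 and __float128):

    _Float16 qnan_h = __builtin_nanf16("");      // quiet NaN; note _Float16, not __fp16
    __float128 qnan_q = __builtin_nanf128("");   // quiet NaN as __float128
    __float128 snan_q = __builtin_nansf128("");  // signaling NaN, via the Nans template
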
+def NanF16 : Builtin { + let Spellings = ["__builtin_nanf16"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Pure, Constexpr]; + let Prototype = "_Float16(char const*)"; +} + +def NanF128 : Builtin { + let Spellings = ["__builtin_nanf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Pure, Constexpr]; + let Prototype = "__float128(char const*)"; +} + +def Nans : Builtin, + Template<["float", "double", "long double", "_Float16", "__float128"], + ["f", "", "l", "f16", "f128"]> { + let Spellings = ["__builtin_nans"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Pure, Constexpr]; + let Prototype = "T(char const*)"; +} + +def PowI : Builtin, FPMathTemplate { + let Spellings = ["__builtin_powi"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "T(T, int)"; +} + +def PowF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_pow"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, T)"; +} + +def HypotF128 : Builtin { + let Spellings = ["__builtin_hypotf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128, __float128)"; +} + +def ILogbF128 : Builtin { + let Spellings = ["__builtin_ilogbf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "int(__float128)"; +} + +def LgammaF128 : Builtin { + let Spellings = ["__builtin_lgammaf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "__float128(__float128)"; +} + +def LLrintF128 : Builtin { + let Spellings = ["__builtin_llrintf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "long long int(__float128)"; +} + +def LLroundF128 : Builtin { + let Spellings = ["__builtin_llroundf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "long long int(__float128)"; +} + +def Log10F16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_log10"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def Log1pF128 : Builtin { + let Spellings = ["__builtin_log1pf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def Log2F16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_log2"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def LogbF128 : Builtin { + let Spellings = ["__builtin_logbf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def LogF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_log"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def LrintF128 : Builtin { + let Spellings = ["__builtin_lrintf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "long int(__float128)"; +} + +def LroundF128 : Builtin { + let Spellings = ["__builtin_lroundf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "long int(__float128)"; +} + +def NearbyintF128 : 
Builtin { + let Spellings = ["__builtin_nearbyintf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "__float128(__float128)"; +} + +def NextafterF128 : Builtin { + let Spellings = ["__builtin_nextafterf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128, __float128)"; +} + +def NexttowardF128 : Builtin { + let Spellings = ["__builtin_nexttowardf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128, __float128)"; +} + +def RemainderF128 : Builtin { + let Spellings = ["__builtin_remainderf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128, __float128)"; +} + +def RemquoF128 : Builtin { + let Spellings = ["__builtin_remquof128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "__float128(__float128, __float128, int*)"; +} + +def RintF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_rint"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "T(T)"; +} + +def RoundF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_round"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "T(T)"; +} + +def RoundevenF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_roundeven"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "T(T)"; +} + +def ScalblnF128 : Builtin { + let Spellings = ["__builtin_scalblnf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128, long int)"; +} + +def ScalbnF128 : Builtin { + let Spellings = ["__builtin_scalbnf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128, int)"; +} + +def SinF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_sin"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def SinhF128 : Builtin { + let Spellings = ["__builtin_sinhf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def SqrtF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_sqrt"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def TanF128 : Builtin { + let Spellings = ["__builtin_tanf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def TanhF128 : Builtin { + let Spellings = ["__builtin_tanhf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def TgammaF128 : Builtin { + let Spellings = ["__builtin_tgammaf128"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "__float128(__float128)"; +} + +def TruncF16F128 : Builtin, F16F128MathTemplate { + let Spellings = ["__builtin_trunc"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "T(T)"; +} + +// Access to floating point environment.
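A usage sketch for this section (illustrative only, not part of the patch): __builtin_flt_rounds reports the current rounding mode in the FLT_ROUNDS encoding, and __builtin_set_flt_rounds changes it on targets that support doing so.

    // FLT_ROUNDS encoding: 0 toward zero, 1 to nearest, 2 toward +inf, 3 toward -inf.
    int current_mode() { return __builtin_flt_rounds(); }
    void use_truncation() { __builtin_set_flt_rounds(0); } // only where the target allows it
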
+def BuiltinFltRounds : Builtin { + let Spellings = ["__builtin_flt_rounds"]; + let Attributes = [NoThrow]; + let Prototype = "int()"; +} + +def BuiltinSetFltRounds : Builtin { + let Spellings = ["__builtin_set_flt_rounds"]; + let Attributes = [NoThrow]; + let Prototype = "void(int)"; +} + +// GCC-compatible C99 CMPLX implementation. +def BuiltinComplex : Builtin { + let Spellings = ["__builtin_complex"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; + let Prototype = "void(...)"; +} + +// FP Comparison functions. +def IsGreater : Builtin { + let Spellings = ["__builtin_isgreater"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking]; + let Prototype = "int(...)"; +} + +def IsGreaterEqual : Builtin { + let Spellings = ["__builtin_isgreaterequal"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking]; + let Prototype = "int(...)"; +} + +def IsLess : Builtin { + let Spellings = ["__builtin_isless"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking]; + let Prototype = "int(...)"; +} + +def IsLessEqual : Builtin { + let Spellings = ["__builtin_islessequal"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking]; + let Prototype = "int(...)"; +} + +def IsLessGreater : Builtin { + let Spellings = ["__builtin_islessgreater"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking]; + let Prototype = "int(...)"; +} + +def IsUnordered : Builtin { + let Spellings = ["__builtin_isunordered"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking]; + let Prototype = "int(...)"; +} + +// Unary FP classification. +def FPClassify : Builtin { + let Spellings = ["__builtin_fpclassify"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking, Constexpr]; + let Prototype = "int(int, int, int, int, int, ...)"; +} + +def IsFinite : Builtin { + let Spellings = ["__builtin_isfinite"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking, Constexpr]; + let Prototype = "int(...)"; +} + +def IsInf : Builtin { + let Spellings = ["__builtin_isinf"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking, Constexpr]; + let Prototype = "int(...)"; +} + +def IsInfSign : Builtin { + let Spellings = ["__builtin_isinf_sign"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking, Constexpr]; + let Prototype = "int(...)"; +} + +def IsNan : Builtin { + let Spellings = ["__builtin_isnan"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking, Constexpr]; + let Prototype = "int(...)"; +} + +def IsNormal : Builtin { + let Spellings = ["__builtin_isnormal"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking, Constexpr]; + let Prototype = "int(...)"; +} + +def IsSubnormal : Builtin { + let Spellings = ["__builtin_issubnormal"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking, Constexpr]; + let Prototype = "int(...)"; +} + +def IsZero : Builtin { + let Spellings = ["__builtin_iszero"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking, Constexpr]; + let Prototype = "int(...)"; +} + +def IsSignaling : Builtin { + let Spellings = ["__builtin_issignaling"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking, Constexpr]; + let Prototype = 
"int(...)"; +} + +def IsFPClass : Builtin { + let Spellings = ["__builtin_isfpclass"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; + let Prototype = "int(...)"; +} + +// FP signbit builtins. +def Signbit : Builtin { + let Spellings = ["__builtin_signbit"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, + CustomTypeChecking]; + let Prototype = "int(...)"; +} + +def SignbitF : Builtin { + let Spellings = ["__builtin_signbitf"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "int(float)"; +} + +def SignbitL : Builtin { + let Spellings = ["__builtin_signbitl"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const]; + let Prototype = "int(long double)"; +} + +// Special FP builtins. +def Canonicalize : Builtin, FPMathWithF16Template { + let Spellings = ["__builtin_canonicalize"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T)"; +} + +// Builtins for arithmetic. +def Clz : Builtin, BitShort_Int_Long_LongLongTemplate { + let Spellings = ["__builtin_clz"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "int(unsigned T)"; +} + +// FIXME: Add int clzimax(uintmax_t) + +def Ctz : Builtin, BitShort_Int_Long_LongLongTemplate { + let Spellings = ["__builtin_ctz"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "int(unsigned T)"; +} + +// FIXME: Add int ctzimax(uintmax_t) + +def FFS : Builtin, BitInt_Long_LongLongTemplate { + let Spellings = ["__builtin_ffs"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Const, Constexpr]; + let Prototype = "int(T)"; +} + +def Parity : Builtin, BitInt_Long_LongLongTemplate { + let Spellings = ["__builtin_parity"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "int(unsigned T)"; +} + +def Popcount : Builtin, BitInt_Long_LongLongTemplate { + let Spellings = ["__builtin_popcount"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "int(unsigned T)"; +} + +def Clrsb : Builtin, BitInt_Long_LongLongTemplate { + let Spellings = ["__builtin_clrsb"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "int(T)"; +} + +// The following builtins rely on that char == 8 bits, short == 16 bits and that +// there exists native types on the target that are 32- and 64-bits wide, unless +// these conditions are fulfilled these builtins will operate on a not intended +// bitwidth. +def BSwap : Builtin, Template<["unsigned short", "uint32_t", "uint64_t"], + ["16", "32", "64"]> { + let Spellings = ["__builtin_bswap"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "T(T)"; +} + +def Bitreverse : BitInt8_16_32_64BuiltinsTemplate, Builtin { + let Spellings = ["__builtin_bitreverse"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "T(T)"; +} + +def RotateLeft : BitInt8_16_32_64BuiltinsTemplate, Builtin { + let Spellings = ["__builtin_rotateleft"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "T(T, T)"; +} + +def RotateRight : BitInt8_16_32_64BuiltinsTemplate, Builtin { + let Spellings = ["__builtin_rotateright"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "T(T, T)"; +} + +// Random GCC builtins +// FIXME: The builtins marked FunctionWithBuiltinPrefix below should be +// merged with the library definitions. They are currently not because +// the attributes are different. 
+def BuiltinCalloc : Builtin { + let Spellings = ["__builtin_calloc"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(size_t, size_t)"; +} + +def BuiltinConstantP : Builtin { + let Spellings = ["__builtin_constant_p"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, UnevaluatedArguments, Constexpr]; + let Prototype = "int(...)"; +} + +def BuiltinClassifyType : Builtin { + let Spellings = ["__builtin_classify_type"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, UnevaluatedArguments, Constexpr]; + let Prototype = "int(...)"; +} + +def BuiltinCFStringMakeConstantString : Builtin { + let Spellings = ["__builtin___CFStringMakeConstantString"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "constant_CFString const*(char const*)"; +} + +def BuiltinNSStringMakeConstantString : Builtin { + let Spellings = ["__builtin___NSStringMakeConstantString"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "constant_CFString const*(char const*)"; +} + +def BuiltinVaStart : Builtin { + let Spellings = ["__builtin_va_start"]; + let Attributes = [NoThrow, CustomTypeChecking]; + let Prototype = "void(__builtin_va_list_ref, ...)"; +} + +def BuiltinStdargStart : Builtin { + let Spellings = ["__builtin_stdarg_start"]; + let Attributes = [NoThrow, CustomTypeChecking]; + let Prototype = "void(__builtin_va_list_ref, ...)"; +} + +def BuiltinAssumeAligned : Builtin { + let Spellings = ["__builtin_assume_aligned"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; + let Prototype = "void*(void const*, size_t, ...)"; +} + +def BuiltinFree : Builtin { + let Spellings = ["__builtin_free"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void(void*)"; +} + +def BuiltinMalloc : Builtin { + let Spellings = ["__builtin_malloc"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(size_t)"; +} + +def BuiltinMemcpyInline : Builtin { + let Spellings = ["__builtin_memcpy_inline"]; + let Attributes = [NoThrow]; + let Prototype = "void(void*, void const*, _Constant size_t)"; +} + +def BuiltinMempcpy : Builtin { + let Spellings = ["__builtin_mempcpy"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(void*, void const*, size_t)"; +} + +def BuiltinMemsetInline : Builtin { + let Spellings = ["__builtin_memset_inline"]; + let Attributes = [NoThrow]; + let Prototype = "void(void*, int, _Constant size_t)"; +} + +def BuiltinStrcspn : Builtin { + let Spellings = ["__builtin_strcspn"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "size_t(char const*, char const*)"; +} + +def BuiltinRealloc : Builtin { + let Spellings = ["__builtin_realloc"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(void*, size_t)"; +} + +def BuiltinReturnAddress : Builtin { + let Spellings = ["__builtin_return_address"]; + let Attributes = [NoThrow]; + let Prototype = "void*(_Constant unsigned int)"; +} + +def ExtractReturnAddr : Builtin { + let Spellings = ["__builtin_extract_return_addr"]; + let Attributes = [NoThrow]; + let Prototype = "void*(void*)"; +} + +def FrameAddress : Builtin { + let Spellings = ["__builtin_frame_address"]; + let Attributes = [NoThrow]; + let Prototype = "void*(_Constant unsigned int)"; +} + +def ClearCache : Builtin { + let Spellings = ["__builtin___clear_cache"]; + let Attributes = [NoThrow]; + let Prototype = "void(char*, char*)"; +} + +def BuiltinSetjmp : Builtin { + let 
Spellings = ["__builtin_setjmp"]; + let Attributes = [ReturnsTwice]; + let Prototype = "int(void**)"; +} + +def BuiltinLongjmp : Builtin { + let Spellings = ["__builtin_longjmp"]; + let Attributes = [NoReturn]; + let Prototype = "void(void**, int)"; +} + +def UnwindInit : Builtin { + let Spellings = ["__builtin_unwind_init"]; + let Prototype = "void()"; +} + +def EHReturnDataRegNo : Builtin { + let Spellings = ["__builtin_eh_return_data_regno"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "int(_Constant int)"; +} + +def ThreadPointer : Builtin { + let Spellings = ["__builtin_thread_pointer"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void*()"; +} + +def Launder : Builtin { + let Spellings = ["__builtin_launder"]; + let Attributes = [NoThrow, CustomTypeChecking, Constexpr]; + let Prototype = "void*(void*)"; +} + +def IsConstantEvaluated : LangBuiltin<"CXX_LANG"> { + let Spellings = ["__builtin_is_constant_evaluated"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "bool()"; +} + +// GCC exception builtins +def EHReturn : Builtin { + let Spellings = ["__builtin_eh_return"]; + let Attributes = [NoReturn]; + // FIXME: Takes intptr_t, not size_t! + let Prototype = "void(size_t, void*)"; +} + +def FrobReturnAddr : Builtin { + let Spellings = ["__builtin_frob_return_addr"]; + let Attributes = [NoThrow]; + let Prototype = "void*(void*)"; +} + +def DWARF_CFA : Builtin { + let Spellings = ["__builtin_dwarf_cfa"]; + let Attributes = [NoThrow]; + let Prototype = "void*()"; +} + +def InitDWARFRegSizeTable : Builtin { + let Spellings = ["__builtin_init_dwarf_reg_size_table"]; + let Attributes = [NoThrow]; + let Prototype = "void(void*)"; +} + +def DWARFSpColumn : Builtin { + let Spellings = ["__builtin_dwarf_sp_column"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned int()"; +} + +def ExtendPointer : Builtin { + let Spellings = ["__builtin_extend_pointer"]; + let Attributes = [NoThrow]; + // _Unwind_Word == uint64_t + let Prototype = "unsigned long long int(void*)"; +} + +// GCC Object size checking builtins. 
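A usage sketch for the checked functions below (illustrative only, not part of the patch): __builtin_object_size(p, 0) reports the maximum number of bytes writable through p when the compiler can tell, and the __builtin___*_chk variants take that bound as their trailing size_t argument.

    #include <cstddef>
    char buf[8];
    std::size_t room = __builtin_object_size(buf, 0); // 8 here; (std::size_t)-1 when unknown
    void copy_checked(const char *src, std::size_t n) {
      __builtin___memcpy_chk(buf, src, n, __builtin_object_size(buf, 0));
    }
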
+def ObjectSize : Builtin { + let Spellings = ["__builtin_object_size"]; + let Attributes = [NoThrow, UnevaluatedArguments, Constexpr]; + let Prototype = "size_t(void const*, int)"; +} + +def DynamicObjectSize : Builtin { // Clang only + let Spellings = ["__builtin_dynamic_object_size"]; + let Attributes = [NoThrow, UnevaluatedArguments, Constexpr]; + let Prototype = "size_t(void const*, int)"; +} + +def MemcpyChk : Builtin { + let Spellings = ["__builtin___memcpy_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(void*, void const*, size_t, size_t)"; +} + +def MemccpyChk : Builtin { + let Spellings = ["__builtin___memccpy_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(void*, void const*, int, size_t, size_t)"; +} + +def MemmoveChk : Builtin { + let Spellings = ["__builtin___memmove_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(void*, void const*, size_t, size_t)"; +} + +def MempcpyChk : Builtin { + let Spellings = ["__builtin___mempcpy_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(void*, void const*, size_t, size_t)"; +} + +def MemsetChk : Builtin { + let Spellings = ["__builtin___memset_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(void*, int, size_t, size_t)"; +} + +def StpcpyChk : Builtin { + let Spellings = ["__builtin___stpcpy_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "char*(char*, char const*, size_t)"; +} + +def StrcatChk : Builtin { + let Spellings = ["__builtin___strcat_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "char*(char*, char const*, size_t)"; +} + +def StrcpyChk : Builtin { + let Spellings = ["__builtin___strcpy_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "char*(char*, char const*, size_t)"; +} + +def StrlcatChk : Builtin { + let Spellings = ["__builtin___strlcat_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "size_t(char*, char const*, size_t, size_t)"; +} + +def StrlcpyChk : Builtin { + let Spellings = ["__builtin___strlcpy_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "size_t(char*, char const*, size_t, size_t)"; +} + +def StrncatChk : Builtin { + let Spellings = ["__builtin___strncat_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "char*(char*, char const*, size_t, size_t)"; +} + +def StrncpyChk : Builtin { + let Spellings = ["__builtin___strncpy_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "char*(char*, char const*, size_t, size_t)"; +} + +def StpncpyChk : Builtin { + let Spellings = ["__builtin___stpncpy_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "int(char*, char*, char const*, size_t, size_t)"; +} + +def SNPrintfChk : Builtin { + let Spellings = ["__builtin___snprintf_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, PrintfFormat<4>]; + let Prototype = "int(char* restrict, size_t, int, size_t, char const* restrict, ...)"; +} + +def SPrintfChk : Builtin { + let Spellings = ["__builtin___sprintf_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, PrintfFormat<3>]; + let Prototype = "int(char* restrict, int, size_t, char const* restrict, ...)"; +} + +def VSNPrintfChk : Builtin { + let Spellings = ["__builtin___vsnprintf_chk"]; + let Attributes = 
[FunctionWithBuiltinPrefix, VPrintfFormat<4>]; + let Prototype = "int(char* restrict, size_t, int, size_t, char const* restrict, __builtin_va_list)"; +} + +def VSPrintfChk : Builtin { + let Spellings = ["__builtin___vsprintf_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, VPrintfFormat<3>]; + let Prototype = "int(char* restrict, int, size_t, char const* restrict, __builtin_va_list)"; +} + +def FPrintfChk : Builtin { + let Spellings = ["__builtin___fprintf_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, PrintfFormat<2>]; + let Prototype = "int(FILE* restrict, int, char const* restrict, ...)"; +} + +def PrintfChk : Builtin { + let Spellings = ["__builtin___printf_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, PrintfFormat<1>]; + let Prototype = "int(int, int, char const* restrict, ...)"; +} + +def VFPrintfChk : Builtin { + let Spellings = ["__builtin___vfprintf_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, VPrintfFormat<2>]; + let Prototype = "int(FILE* restrict, int, char const* restrict, __builtin_va_list)"; +} + +def VPrintfChk : Builtin { + let Spellings = ["__builtin___vprintf_chk"]; + let Attributes = [FunctionWithBuiltinPrefix, VPrintfFormat<1>]; + let Prototype = "int(int, char const* restrict, __builtin_va_list)"; +} + +def Unpredictable : Builtin { + let Spellings = ["__builtin_unpredictable"]; + let Attributes = [NoThrow, Const]; + let Prototype = "long int(long int)"; +} + +def Expect : Builtin { + let Spellings = ["__builtin_expect"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "long int(long int, long int)"; +} + +def ExpectWithProbability : Builtin { + let Spellings = ["__builtin_expect_with_probability"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "long int(long int, long int, double)"; +} + +def Prefetch : Builtin { + let Spellings = ["__builtin_prefetch"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(void const*, ...)"; +} + +def ReadCycleCounter : Builtin { + let Spellings = ["__builtin_readcyclecounter"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned long long int()"; +} + +def Trap : Builtin { + let Spellings = ["__builtin_trap"]; + let Attributes = [NoThrow, NoReturn]; + let Prototype = "void()"; +} + +def Debugtrap : Builtin { + let Spellings = ["__builtin_debugtrap"]; + let Attributes = [NoThrow]; + let Prototype = "void()"; +} + +def Unreachable : Builtin { + let Spellings = ["__builtin_unreachable"]; + let Attributes = [NoThrow, NoReturn]; + let Prototype = "void()"; +} + +def ShuffleVector : Builtin { + let Spellings = ["__builtin_shufflevector"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ConvertVector : Builtin { + let Spellings = ["__builtin_convertvector"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AllocaUninitialized : Builtin { + let Spellings = ["__builtin_alloca_uninitialized"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(size_t)"; +} + +def AllocaWithAlign : Builtin { + let Spellings = ["__builtin_alloca_with_align"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(size_t, _Constant size_t)"; +} + +def AllocaWithAlignUninitialized : Builtin { + let Spellings = ["__builtin_alloca_with_align_uninitialized"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; + let Prototype = "void*(size_t, _Constant size_t)"; +} + +def CallWithStaticChain : Builtin { + let Spellings = 
["__builtin_call_with_static_chain"]; + let Attributes = [NoThrow, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def NondetermenisticValue : Builtin { + let Spellings = ["__builtin_nondeterministic_value"]; + let Attributes = [NoThrow, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseAbs : Builtin { + let Spellings = ["__builtin_elementwise_abs"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseBitreverse : Builtin { + let Spellings = ["__builtin_elementwise_bitreverse"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseMax : Builtin { + let Spellings = ["__builtin_elementwise_max"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseMin : Builtin { + let Spellings = ["__builtin_elementwise_min"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseCeil : Builtin { + let Spellings = ["__builtin_elementwise_ceil"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseCos : Builtin { + let Spellings = ["__builtin_elementwise_cos"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseExp : Builtin { + let Spellings = ["__builtin_elementwise_exp"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseExp2 : Builtin { + let Spellings = ["__builtin_elementwise_exp2"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseFloor : Builtin { + let Spellings = ["__builtin_elementwise_floor"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseLog : Builtin { + let Spellings = ["__builtin_elementwise_log"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseLog2 : Builtin { + let Spellings = ["__builtin_elementwise_log2"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseLog10 : Builtin { + let Spellings = ["__builtin_elementwise_log10"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwisePow : Builtin { + let Spellings = ["__builtin_elementwise_pow"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseRoundEven : Builtin { + let Spellings = ["__builtin_elementwise_roundeven"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseRound : Builtin { + let Spellings = ["__builtin_elementwise_round"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseRint : Builtin { + let Spellings = ["__builtin_elementwise_rint"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseNearbyInt : Builtin { + let Spellings = ["__builtin_elementwise_nearbyint"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseSin : Builtin { + let Spellings = ["__builtin_elementwise_sin"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseSqrt : Builtin { + let Spellings = 
["__builtin_elementwise_sqrt"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseTrunc : Builtin { + let Spellings = ["__builtin_elementwise_trunc"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseCanonicalize : Builtin { + let Spellings = ["__builtin_elementwise_canonicalize"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseCopysign : Builtin { + let Spellings = ["__builtin_elementwise_copysign"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseFma : Builtin { + let Spellings = ["__builtin_elementwise_fma"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseAddSat : Builtin { + let Spellings = ["__builtin_elementwise_add_sat"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ElementwiseSubSat : Builtin { + let Spellings = ["__builtin_elementwise_sub_sat"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ReduceMax : Builtin { + let Spellings = ["__builtin_reduce_max"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ReduceMin : Builtin { + let Spellings = ["__builtin_reduce_min"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ReduceXor : Builtin { + let Spellings = ["__builtin_reduce_xor"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ReduceOr : Builtin { + let Spellings = ["__builtin_reduce_or"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ReduceAnd : Builtin { + let Spellings = ["__builtin_reduce_and"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ReduceAdd : Builtin { + let Spellings = ["__builtin_reduce_add"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ReduceMul : Builtin { + let Spellings = ["__builtin_reduce_mul"]; + let Attributes = [NoThrow, Const, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def MatrixTranspose : Builtin { + let Spellings = ["__builtin_matrix_transpose"]; + let Attributes = [NoThrow, FunctionWithBuiltinPrefix, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def MatrixColumnMajorLoad : Builtin { + let Spellings = ["__builtin_matrix_column_major_load"]; + let Attributes = [NoThrow, FunctionWithBuiltinPrefix, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def MatrixColumnMajorStore : Builtin { + let Spellings = ["__builtin_matrix_column_major_store"]; + let Attributes = [NoThrow, FunctionWithBuiltinPrefix, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +// "Overloaded" Atomic operator builtins. These are overloaded to support data +// types of i8, i16, i32, i64, and i128. The front-end sees calls to the +// non-suffixed version of these (which has a bogus type) and transforms them to +// the right overloaded version in Sema (plus casts). 
+ +def SyncFetchAndAdd : Builtin { + let Spellings = ["__sync_fetch_and_add"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncFetchAndAddN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_fetch_and_add_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncFetchAndSub : Builtin { + let Spellings = ["__sync_fetch_and_sub"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncFetchAndSubN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_fetch_and_sub_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncFetchAndOr : Builtin { + let Spellings = ["__sync_fetch_and_or"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncFetchAndOrN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_fetch_and_or_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncFetchAndAnd : Builtin { + let Spellings = ["__sync_fetch_and_and"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncFetchAndAndN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_fetch_and_and_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncFetchAndXor : Builtin { + let Spellings = ["__sync_fetch_and_xor"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncFetchAndXorN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_fetch_and_xor_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncFetchAndNand : Builtin { + let Spellings = ["__sync_fetch_and_nand"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncFetchAndNandN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_fetch_and_nand_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncAddAndFetch : Builtin { + let Spellings = ["__sync_add_and_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncAddAndFetchN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_add_and_fetch_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncSubAndFetch : Builtin { + let Spellings = ["__sync_sub_and_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncSubAndFetchN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_sub_and_fetch_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncOrAndFetch : Builtin { + let Spellings = ["__sync_or_and_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncOrAndFetchN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_or_and_fetch_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncAndAndFetch : Builtin { + let Spellings = ["__sync_and_and_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncAndAndFetchN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_and_and_fetch_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def 
SyncXorAndFetch : Builtin { + let Spellings = ["__sync_xor_and_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncXorAndFetchN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_xor_and_fetch_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncNandAndFetch : Builtin { + let Spellings = ["__sync_nand_and_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncNandAndFetchN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_nand_and_fetch_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncBoolCompareAndSwap : Builtin { + let Spellings = ["__sync_bool_compare_and_swap"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncBoolCompareAndSwapN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_bool_compare_and_swap_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncValCompareAndSwap : Builtin { + let Spellings = ["__sync_val_compare_and_swap"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncValCompareAndSwapN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_val_compare_and_swap_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncLockTestAndSet : Builtin { + let Spellings = ["__sync_lock_test_and_set"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncLockTestAndSetN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_lock_test_and_set_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncLockRelease : Builtin { + let Spellings = ["__sync_lock_release"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncLockReleaseN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_lock_release_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +def SyncSwap : Builtin { + let Spellings = ["__sync_swap"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def SyncSwapN : Builtin, SyncBuiltinsTemplate { + let Spellings = ["__sync_swap_"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "T(T volatile*, T, ...)"; +} + +// C11 _Atomic operations for <stdatomic.h>.
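A usage sketch for these (illustrative only, not part of the patch): the __c11_atomic_* builtins underpin <stdatomic.h> and take an explicit memory order. _Atomic is the C11 qualifier, which clang also accepts in C++ as an extension.

    _Atomic(int) counter;
    void reset() { __c11_atomic_init(&counter, 0); }
    int bump() {
      __c11_atomic_fetch_add(&counter, 1, __ATOMIC_SEQ_CST);
      return __c11_atomic_load(&counter, __ATOMIC_SEQ_CST);
    }
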
+def C11AtomicInit : AtomicBuiltin { + let Spellings = ["__c11_atomic_init"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicLoad : AtomicBuiltin { + let Spellings = ["__c11_atomic_load"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicStore : AtomicBuiltin { + let Spellings = ["__c11_atomic_store"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicExchange : AtomicBuiltin { + let Spellings = ["__c11_atomic_exchange"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicCompareExchangeStrong : AtomicBuiltin { + let Spellings = ["__c11_atomic_compare_exchange_strong"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicCompareExchangeWeak : AtomicBuiltin { + let Spellings = ["__c11_atomic_compare_exchange_weak"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicFetchAdd : AtomicBuiltin { + let Spellings = ["__c11_atomic_fetch_add"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicFetchSub : AtomicBuiltin { + let Spellings = ["__c11_atomic_fetch_sub"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicFetchAnd : AtomicBuiltin { + let Spellings = ["__c11_atomic_fetch_and"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicFetchOr : AtomicBuiltin { + let Spellings = ["__c11_atomic_fetch_or"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicFetchXor : AtomicBuiltin { + let Spellings = ["__c11_atomic_fetch_xor"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicFetchNand : AtomicBuiltin { + let Spellings = ["__c11_atomic_fetch_nand"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicFetchMax : AtomicBuiltin { + let Spellings = ["__c11_atomic_fetch_max"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicFetchMin : AtomicBuiltin { + let Spellings = ["__c11_atomic_fetch_min"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def C11AtomicThreadFence : Builtin { + let Spellings = ["__c11_atomic_thread_fence"]; + let Attributes = [NoThrow]; + let Prototype = "void(int)"; +} + +def C11AtomicSignalFence : Builtin { + let Spellings = ["__c11_atomic_signal_fence"]; + let Attributes = [NoThrow]; + let Prototype = "void(int)"; +} + +def C11AtomicIsLockFree : Builtin { + let Spellings = ["__c11_atomic_is_lock_free"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "bool(size_t)"; +} + +// GNU atomic builtins. 
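Likewise for the GNU flavor (illustrative only, not part of the patch): the pointer forms move values through memory, the _n forms take and return the value directly, and both take an explicit memory order.

    long value;
    long load_acquire() { return __atomic_load_n(&value, __ATOMIC_ACQUIRE); }
    void store_release(long x) { __atomic_store_n(&value, x, __ATOMIC_RELEASE); }
    long fetch_inc() { return __atomic_fetch_add(&value, 1, __ATOMIC_SEQ_CST); }
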
+def AtomicLoad : AtomicBuiltin { + let Spellings = ["__atomic_load"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicLoadN : AtomicBuiltin { + let Spellings = ["__atomic_load_n"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicStore : AtomicBuiltin { + let Spellings = ["__atomic_store"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicStoreN : AtomicBuiltin { + let Spellings = ["__atomic_store_n"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicExchange : AtomicBuiltin { + let Spellings = ["__atomic_exchange"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicExchangeN : AtomicBuiltin { + let Spellings = ["__atomic_exchange_n"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicCompareExchange : AtomicBuiltin { + let Spellings = ["__atomic_compare_exchange"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicCompareExchangeN : AtomicBuiltin { + let Spellings = ["__atomic_compare_exchange_n"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicFetchAdd : AtomicBuiltin { + let Spellings = ["__atomic_fetch_add"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicFetchSub : AtomicBuiltin { + let Spellings = ["__atomic_fetch_sub"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicFetchAnd : AtomicBuiltin { + let Spellings = ["__atomic_fetch_and"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicFetchOr : AtomicBuiltin { + let Spellings = ["__atomic_fetch_or"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicFetchXor : AtomicBuiltin { + let Spellings = ["__atomic_fetch_xor"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicFetchNand : AtomicBuiltin { + let Spellings = ["__atomic_fetch_nand"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicAddFetch : AtomicBuiltin { + let Spellings = ["__atomic_add_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicSubFetch : AtomicBuiltin { + let Spellings = ["__atomic_sub_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicAndFetch : AtomicBuiltin { + let Spellings = ["__atomic_and_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicOrFetch : AtomicBuiltin { + let Spellings = ["__atomic_or_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicXorFetch : AtomicBuiltin { + let Spellings = ["__atomic_xor_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicMaxFetch : AtomicBuiltin { + let Spellings = ["__atomic_max_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicMinFetch : AtomicBuiltin { + let Spellings = ["__atomic_min_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicNandFetch : AtomicBuiltin { + let Spellings = ["__atomic_nand_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicTestAndSet : Builtin { + let Spellings = ["__atomic_test_and_set"]; + let 
Attributes = [NoThrow]; + let Prototype = "bool(void volatile*, int)"; +} + +def AtomicClear : Builtin { + let Spellings = ["__atomic_clear"]; + let Attributes = [NoThrow]; + let Prototype = "void(void volatile*, int)"; +} + +def AtomicThreadFence : Builtin { + let Spellings = ["__atomic_thread_fence"]; + let Attributes = [NoThrow]; + let Prototype = "void(int)"; +} + +def AtomicSignalFence : Builtin { + let Spellings = ["__atomic_signal_fence"]; + let Attributes = [NoThrow]; + let Prototype = "void(int)"; +} + +def AtomicAlwaysLockFree : Builtin { + let Spellings = ["__atomic_always_lock_free"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "bool(size_t, void const volatile*)"; +} + +def AtomicIsLockFree : Builtin { + let Spellings = ["__atomic_is_lock_free"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "bool(size_t, void const volatile*)"; +} + +// GNU atomic builtins with atomic scopes. +def ScopedAtomicLoad : AtomicBuiltin { + let Spellings = ["__scoped_atomic_load"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicLoadN : AtomicBuiltin { + let Spellings = ["__scoped_atomic_load_n"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicStore : AtomicBuiltin { + let Spellings = ["__scoped_atomic_store"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicStoreN : AtomicBuiltin { + let Spellings = ["__scoped_atomic_store_n"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicExchange : AtomicBuiltin { + let Spellings = ["__scoped_atomic_exchange"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicExchangeN : AtomicBuiltin { + let Spellings = ["__scoped_atomic_exchange_n"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicCompareExchange : AtomicBuiltin { + let Spellings = ["__scoped_atomic_compare_exchange"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicCompareExchangeN : AtomicBuiltin { + let Spellings = ["__scoped_atomic_compare_exchange_n"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicFetchAdd : AtomicBuiltin { + let Spellings = ["__scoped_atomic_fetch_add"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicFetchSub : AtomicBuiltin { + let Spellings = ["__scoped_atomic_fetch_sub"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicFetchAnd : AtomicBuiltin { + let Spellings = ["__scoped_atomic_fetch_and"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicFetchOr : AtomicBuiltin { + let Spellings = ["__scoped_atomic_fetch_or"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicFetchXor : AtomicBuiltin { + let Spellings = ["__scoped_atomic_fetch_xor"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicFetchNand : AtomicBuiltin { + let Spellings = ["__scoped_atomic_fetch_nand"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicFetchMin : AtomicBuiltin { + let Spellings = ["__scoped_atomic_fetch_min"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicFetchMax : AtomicBuiltin { + let Spellings = 
["__scoped_atomic_fetch_max"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicAddFetch : AtomicBuiltin { + let Spellings = ["__scoped_atomic_add_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicSubFetch : AtomicBuiltin { + let Spellings = ["__scoped_atomic_sub_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicAndFetch : AtomicBuiltin { + let Spellings = ["__scoped_atomic_and_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicOrFetch : AtomicBuiltin { + let Spellings = ["__scoped_atomic_or_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicXorFetch : AtomicBuiltin { + let Spellings = ["__scoped_atomic_xor_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicNandFetch : AtomicBuiltin { + let Spellings = ["__scoped_atomic_nand_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicMinFetch : AtomicBuiltin { + let Spellings = ["__scoped_atomic_min_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def ScopedAtomicMaxFetch : AtomicBuiltin { + let Spellings = ["__scoped_atomic_max_fetch"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +// OpenCL 2.0 atomic builtins. +def OpenCLAtomicInit : AtomicBuiltin { + let Spellings = ["__opencl_atomic_init"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def OpenCLAtomicLoad : AtomicBuiltin { + let Spellings = ["__opencl_atomic_load"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def OpenCLAtomicStore : AtomicBuiltin { + let Spellings = ["__opencl_atomic_store"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def OpenCLAtomicCompareExchangeWeak : AtomicBuiltin { + let Spellings = ["__opencl_atomic_compare_exchange_weak"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def OpenCLAtomicCompareExchangeStrong : AtomicBuiltin { + let Spellings = ["__opencl_atomic_compare_exchange_strong"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def OpenCLAtomicExchange : AtomicBuiltin { + let Spellings = ["__opencl_atomic_exchange"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def OpenCLAtomicFetchAdd : AtomicBuiltin { + let Spellings = ["__opencl_atomic_fetch_add"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def OpenCLAtomicFetchSub : AtomicBuiltin { + let Spellings = ["__opencl_atomic_fetch_sub"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def OpenCLAtomicFetchAnd : AtomicBuiltin { + let Spellings = ["__opencl_atomic_fetch_and"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def OpenCLAtomicFetchOr : AtomicBuiltin { + let Spellings = ["__opencl_atomic_fetch_or"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def OpenCLAtomicFetchXor : AtomicBuiltin { + let Spellings = ["__opencl_atomic_fetch_xor"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def OpenCLAtomicFetchMin : AtomicBuiltin { + let Spellings = ["__opencl_atomic_fetch_min"]; + let Attributes = [CustomTypeChecking]; + let Prototype = 
"void(...)"; +} + +def OpenCLAtomicFetchMax : AtomicBuiltin { + let Spellings = ["__opencl_atomic_fetch_max"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +// GCC does not support these, they are a Clang extension. +def AtomicFetchMax : AtomicBuiltin { + let Spellings = ["__atomic_fetch_max"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def AtomicFetchMin : AtomicBuiltin { + let Spellings = ["__atomic_fetch_min"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +// HIP atomic builtins. +def HipAtomicLoad : AtomicBuiltin { + let Spellings = ["__hip_atomic_load"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HipAtomicStore : AtomicBuiltin { + let Spellings = ["__hip_atomic_store"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HipAtomicCompareExchangeWeak : AtomicBuiltin { + let Spellings = ["__hip_atomic_compare_exchange_weak"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HipAtomicCompareExchangeStrong : AtomicBuiltin { + let Spellings = ["__hip_atomic_compare_exchange_strong"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HipAtomicExchange : AtomicBuiltin { + let Spellings = ["__hip_atomic_exchange"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HipAtomicFetchAdd : AtomicBuiltin { + let Spellings = ["__hip_atomic_fetch_add"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HipAtomicFetchSub : AtomicBuiltin { + let Spellings = ["__hip_atomic_fetch_sub"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HipAtomicFetchAnd : AtomicBuiltin { + let Spellings = ["__hip_atomic_fetch_and"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HipAtomicFetchOr : AtomicBuiltin { + let Spellings = ["__hip_atomic_fetch_or"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HipAtomicFetchXor : AtomicBuiltin { + let Spellings = ["__hip_atomic_fetch_xor"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HipAtomicFetchMin : AtomicBuiltin { + let Spellings = ["__hip_atomic_fetch_min"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def HipAtomicFetchMax : AtomicBuiltin { + let Spellings = ["__hip_atomic_fetch_max"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +// Non-overloaded atomic builtins. +def SyncSynchronize : Builtin { + let Spellings = ["__sync_synchronize"]; + let Attributes = [NoThrow]; + let Prototype = "void()"; +} + +// GCC does not support these, they are a Clang extension. 
+def SyncFetchAndMin : Builtin { + let Spellings = ["__sync_fetch_and_min"]; + let Attributes = [NoThrow]; + let Prototype = "int(int volatile*, int)"; +} + +def SyncFetchAndMax : Builtin { + let Spellings = ["__sync_fetch_and_max"]; + let Attributes = [NoThrow]; + let Prototype = "int(int volatile*, int)"; +} + +def SyncFetchAndUMin : Builtin { + let Spellings = ["__sync_fetch_and_umin"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned int(unsigned int volatile*, unsigned int)"; +} + +def SyncFetchAndUMax : Builtin { + let Spellings = ["__sync_fetch_and_umax"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned int(unsigned int volatile*, unsigned int)"; +} + +// ignored glibc builtin, see https://sourceware.org/bugzilla/show_bug.cgi?id=25399 +def WarnMemsetZeroLen : Builtin { + let Spellings = ["__warn_memset_zero_len"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "void()"; +} + +// Microsoft builtins. These are only active with -fms-extensions. +def Alloca : MSLangBuiltin { + let Spellings = ["_alloca"]; + let Attributes = [NoThrow]; + let Prototype = "void*(size_t)"; +} + +def MSAnnotation : MSLangBuiltin { + let Spellings = ["__annotation"]; + let Attributes = [NoThrow]; + let Prototype = "wchar_t const*(...)"; +} + +def MSAssume : MSLangBuiltin { + let Spellings = ["__assume"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "void(bool)"; +} + +def Bittest : MSLangBuiltin, MSInt32_64Template { + let Spellings = ["_bittest"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(T const*, T)"; +} + +def BittestAndComplement : MSLangBuiltin, MSInt32_64Template { + let Spellings = ["_bittestandcomplement"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(T*, T)"; +} + +def BittestAndReset : MSLangBuiltin, MSInt32_64Template { + let Spellings = ["_bittestandreset"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(T*, T)"; +} + +def BittestAndSet : MSLangBuiltin, MSInt32_64Template { + let Spellings = ["_bittestandset"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(T*, T)"; +} + +def MSByteswap : MSLibBuiltin<"stdlib.h">, + Template<["unsigned short", "msuint32_t", "unsigned long long int"], + ["_ushort", "_ulong", "_uint64"]> { + let Spellings = ["_byteswap"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T)"; +} + +def Debugbreak : MSLangBuiltin { + let Spellings = ["__debugbreak"]; + let Attributes = [NoThrow]; + let Prototype = "void()"; +} + +def ExceptionCode : MSLangBuiltin { + let Spellings = ["__exception_code", "_exception_code"]; + let Attributes = [NoThrow]; + let Prototype = "msuint32_t()"; +} + +def ExceptionInfo : MSLangBuiltin { + let Spellings = ["__exception_info", "_exception_info"]; + let Attributes = [NoThrow]; + let Prototype = "void*()"; +} + +def AbnormalTermination : MSLangBuiltin { + let Spellings = ["__abnormal_termination", "_abnormal_termination"]; + let Attributes = [NoThrow]; + let Prototype = "int()"; +} + +def GetExceptionInfo : MSLangBuiltin { + let Spellings = ["__GetExceptionInfo"]; + let Attributes = [NoThrow, CustomTypeChecking, UnevaluatedArguments]; + let Prototype = "void*(...)"; + let Namespace = "std"; +} + +def InterlockedAnd : MSLangBuiltin, MSInt8_16_32Template { + let Spellings = ["_InterlockedAnd"]; + let Attributes = [NoThrow]; + let Prototype = "T(T volatile*, T)"; +} + +def InterlockedCompareExchange : MSLangBuiltin, MSInt8_16_32_64Template { + let Spellings = ["_InterlockedCompareExchange"]; + let Attributes = [NoThrow]; + let 
Prototype = "T(T volatile*, T, T)"; +} + +def InterlockedCompareExchangePointer : MSLangBuiltin { + let Spellings = ["_InterlockedCompareExchangePointer"]; + let Attributes = [NoThrow]; + let Prototype = "void*(void* volatile*, void*, void*)"; +} + +def InterlockedCompareExchangePointer_nf : MSLangBuiltin { + let Spellings = ["_InterlockedCompareExchangePointer_nf"]; + let Attributes = [NoThrow]; + let Prototype = "void*(void* volatile*, void*, void*)"; +} + +def InterlockedDecrement : MSLangBuiltin, MSInt16_32Template { + let Spellings = ["_InterlockedDecrement"]; + let Attributes = [NoThrow]; + let Prototype = "T(T volatile*)"; +} + +def InterlockedExchange : MSLangBuiltin, MSInt8_16_32Template { + let Spellings = ["_InterlockedExchange"]; + let Attributes = [NoThrow]; + let Prototype = "T(T volatile*, T)"; +} + +def InterlockedExchangeAdd : MSLangBuiltin, MSInt8_16_32Template { + let Spellings = ["_InterlockedExchangeAdd"]; + let Attributes = [NoThrow]; + let Prototype = "T(T volatile*, T)"; +} + +def InterlockedExchangePointer : MSLangBuiltin { + let Spellings = ["_InterlockedExchangePointer"]; + let Attributes = [NoThrow]; + let Prototype = "void*(void* volatile*, void*)"; +} + +def InterlockedExchangeSub : MSLangBuiltin, MSInt8_16_32Template { + let Spellings = ["_InterlockedExchangeSub"]; + let Attributes = [NoThrow]; + let Prototype = "T(T volatile*, T)"; +} + +def InterlockedIncrement : MSLangBuiltin, MSInt16_32Template { + let Spellings = ["_InterlockedIncrement"]; + let Attributes = [NoThrow]; + let Prototype = "T(T volatile*)"; +} + +def InterlockedOr : MSLangBuiltin, MSInt8_16_32Template { + let Spellings = ["_InterlockedOr"]; + let Attributes = [NoThrow]; + let Prototype = "T(T volatile*, T)"; +} + +def InterlockedXor : MSLangBuiltin, MSInt8_16_32Template { + let Spellings = ["_InterlockedXor"]; + let Attributes = [NoThrow]; + let Prototype = "T(T volatile*, T)"; +} + +def InterlockedBittestAndReset : MSLangBuiltin, MSInt32_64Template { + let Spellings = ["_interlockedbittestandreset"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(T volatile*, T)"; +} + +def InterlockedBittestAndReset_acq : MSLangBuiltin { + let Spellings = ["_interlockedbittestandreset_acq"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(msint32_t volatile*, msint32_t)"; +} + +def InterlockedBittestAndReset_nf : MSLangBuiltin { + let Spellings = ["_interlockedbittestandreset_nf"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(msint32_t volatile*, msint32_t)"; +} + +def InterlockedBittestAndReset_rel : MSLangBuiltin { + let Spellings = ["_interlockedbittestandreset_rel"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(msint32_t volatile*, msint32_t)"; +} + +def InterlockedBittestAndSet : MSLangBuiltin, MSInt32_64Template { + let Spellings = ["_interlockedbittestandset"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(T volatile*, T)"; +} + +def InterlockedBittestAndSet_acq : MSLangBuiltin { + let Spellings = ["_interlockedbittestandset_acq"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(msint32_t volatile*, msint32_t)"; +} + +def InterlockedBittestAndSet_nf : MSLangBuiltin { + let Spellings = ["_interlockedbittestandset_nf"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(msint32_t volatile*, msint32_t)"; +} + +def InterlockedBittestAndSet_rel : MSLangBuiltin { + let Spellings = ["_interlockedbittestandset_rel"]; + let Attributes = [NoThrow]; + let Prototype = "unsigned char(msint32_t 
volatile*, msint32_t)"; +} + +def IsoVolatileLoad : MSLangBuiltin, Int8_16_32_64Template { + let Spellings = ["__iso_volatile_load"]; + let Attributes = [NoThrow]; + let Prototype = "T(T const volatile*)"; +} + +def IsoVolatileStore : MSLangBuiltin, Int8_16_32_64Template { + let Spellings = ["__iso_volatile_store"]; + let Attributes = [NoThrow]; + let Prototype = "void(T volatile*, T)"; +} + +def Noop : MSLangBuiltin { + let Spellings = ["__noop"]; + let Attributes = [NoThrow]; + let Prototype = "int(...)"; +} + +def MSCountLeadingZeroes : MSLangBuiltin, MSUInt16_32_64Template { + let Spellings = ["__lzcnt"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "T(T)"; +} + +def MSPopCount : MSLangBuiltin, MSUInt16_32_64Template { + let Spellings = ["__popcnt"]; + let Attributes = [NoThrow, Const, Constexpr]; + let Prototype = "T(T)"; +} + +def MSReturnAddress : MSLangBuiltin { + let Spellings = ["_ReturnAddress"]; + let Attributes = [NoThrow]; + let Prototype = "void*()"; +} + +def Rotl8 : MSLangBuiltin { + let Spellings = ["_rotl8"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "unsigned char(unsigned char, unsigned char)"; +} + +def Rotl16 : MSLangBuiltin { + let Spellings = ["_rotl16"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "unsigned short(unsigned short, unsigned char)"; +} + +def Rotl : MSLangBuiltin { + let Spellings = ["_rotl"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "unsigned int(unsigned int, int)"; +} + +def Lrotl : MSLangBuiltin { + let Spellings = ["_lrotl"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "unsigned long int(unsigned long int, int)"; +} + +def Rotl64 : MSLangBuiltin { + let Spellings = ["_rotl64"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "uint64_t(uint64_t, int)"; +} + +def Rotr8 : MSLangBuiltin { + let Spellings = ["_rotr8"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "unsigned char(unsigned char, unsigned char)"; +} + +def Rotr16 : MSLangBuiltin { + let Spellings = ["_rotr16"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "unsigned short(unsigned short, unsigned char)"; +} + +def Rotr : MSLangBuiltin { + let Spellings = ["_rotr"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "unsigned int(unsigned int, int)"; +} + +def Lrotr : MSLangBuiltin { + let Spellings = ["_lrotr"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "unsigned long int(unsigned long int, int)"; +} + +def Rotr64 : MSLangBuiltin { + let Spellings = ["_rotr64"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "uint64_t(uint64_t, int)"; +} + +def MSva_start : MSLangBuiltin { + let Spellings = ["__va_start"]; + let Attributes = [NoThrow, CustomTypeChecking]; + let Prototype = "void(char**, ...)"; +} + +def FastFail : MSLangBuiltin { + let Spellings = ["__fastfail"]; + let Attributes = [NoThrow, NoReturn]; + let Prototype = "void(unsigned int)"; +} + +// Microsoft library builtins. 
+def SetJmpEx : MSLibBuiltin<"setjmpex.h"> { + let Spellings = ["_setjmpex"]; + let Attributes = [IgnoreSignature, ReturnsTwice]; + let Prototype = "int(jmp_buf)"; +} + +// C99 library functions +// C99 stdarg.h +def VaStart : LibBuiltin<"stdarg.h"> { + let Spellings = ["va_start"]; + let Attributes = [NoThrow]; + let Prototype = "void(__builtin_va_list_ref, ...)"; +} + +def VaEnd : LibBuiltin<"stdarg.h"> { + let Spellings = ["va_end"]; + let Attributes = [NoThrow]; + let Prototype = "void(__builtin_va_list_ref)"; + let AddBuiltinPrefixedAlias = 1; +} + +def VaCopy : LibBuiltin<"stdarg.h"> { + let Spellings = ["va_copy"]; + let Attributes = [NoThrow]; + let Prototype = "void(__builtin_va_list_ref, __builtin_va_list_ref)"; + let AddBuiltinPrefixedAlias = 1; +} + +// C99 stdlib.h +def Abort : LibBuiltin<"stdlib.h"> { + let Spellings = ["abort"]; + let Attributes = [NoThrow, NoReturn]; + let Prototype = "void()"; + let AddBuiltinPrefixedAlias = 1; +} + +def Calloc : LibBuiltin<"stdlib.h"> { + let Spellings = ["calloc"]; + let Prototype = "void*(size_t, size_t)"; +} + +def Exit : LibBuiltin<"stdlib.h"> { + let Spellings = ["exit", "_Exit"]; + let Attributes = [NoReturn]; + let Prototype = "void(int)"; +} + +def Malloc : LibBuiltin<"stdlib.h"> { + let Spellings = ["malloc"]; + let Prototype = "void*(size_t)"; +} + +def Realloc : LibBuiltin<"stdlib.h"> { + let Spellings = ["realloc"]; + let Prototype = "void*(void*, size_t)"; +} + +def Free : LibBuiltin<"stdlib.h"> { + let Spellings = ["free"]; + let Prototype = "void(void*)"; +} + +def StrToD : LibBuiltin<"stdlib.h"> { + let Spellings = ["strtod"]; + let Prototype = "double(char const*, char**)"; +} + +def StrToF : LibBuiltin<"stdlib.h"> { + let Spellings = ["strtof"]; + let Prototype = "float(char const*, char**)"; +} + +def StrToLd : LibBuiltin<"stdlib.h"> { + let Spellings = ["strtold"]; + let Prototype = "long double(char const*, char**)"; +} + +def StrToL : LibBuiltin<"stdlib.h"> { + let Spellings = ["strtol"]; + let Prototype = "long int(char const*, char**, int)"; +} + +def StrToLL : LibBuiltin<"stdlib.h"> { + let Spellings = ["strtoll"]; + let Prototype = "long long int(char const*, char**, int)"; +} + +def StrToUL : LibBuiltin<"stdlib.h"> { + let Spellings = ["strtoul"]; + let Prototype = "unsigned long int(char const*, char**, int)"; +} + +def StrToULL : LibBuiltin<"stdlib.h"> { + let Spellings = ["strtoull"]; + let Prototype = "unsigned long long int(char const*, char**, int)"; +} + +// C11 stdlib.h +def AlignedAlloc : LibBuiltin<"stdlib.h"> { + let Spellings = ["aligned_alloc"]; + let Prototype = "void*(size_t, size_t)"; +} + +// C99 string.h +def MemCpy : LibBuiltin<"string.h"> { + let Spellings = ["memcpy"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "void*(void*, void const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def BuiltinMemCmp : Builtin { + let Spellings = ["__builtin_memcmp"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Constexpr]; + let Prototype = "int(void const*, void const*, size_t)"; +} + +def MemCmp : LibBuiltin<"string.h"> { + let Spellings = ["memcmp"]; + let Attributes = [Constexpr]; + let Prototype = "int(void const*, void const*, size_t)"; +} + +def MemMove : LibBuiltin<"string.h"> { + let Spellings = ["memmove"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "void*(void*, void const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrCpy : LibBuiltin<"string.h"> { + let Spellings = ["strcpy"]; + let Attributes = [NoThrow]; + let Prototype = 
"char*(char*, char const*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrNCpy : LibBuiltin<"string.h"> { + let Spellings = ["strncpy"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char*, char const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrCmp : LibBuiltin<"string.h"> { + let Spellings = ["strcmp"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "int(char const*, char const*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrNCmp : LibBuiltin<"string.h"> { + let Spellings = ["strncmp"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "int(char const*, char const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrCat : LibBuiltin<"string.h"> { + let Spellings = ["strcat"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char*, char const*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrNCat : LibBuiltin<"string.h"> { + let Spellings = ["strncat"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char*, char const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrxFrm : LibBuiltin<"string.h"> { + let Spellings = ["strxfrm"]; + let Prototype = "size_t(char*, char const*, size_t)"; +} + +def MemChr : LibBuiltin<"string.h"> { + let Spellings = ["memchr"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "void*(void const*, int, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrChr : LibBuiltin<"string.h"> { + let Spellings = ["strchr"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "char*(char const*, int)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrcSpn : LibBuiltin<"string.h"> { + let Spellings = ["strcspn"]; + let Prototype = "size_t(char const*, char const*)"; +} + +def StrpBrk : LibBuiltin<"string.h"> { + let Spellings = ["strpbrk"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char const*, char const*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrrChr : LibBuiltin<"string.h"> { + let Spellings = ["strrchr"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char const*, int)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrSpn : LibBuiltin<"string.h"> { + let Spellings = ["strspn"]; + let Attributes = [NoThrow]; + let Prototype = "size_t(char const*, char const*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrStr : LibBuiltin<"string.h"> { + let Spellings = ["strstr"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char const*, char const*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrTok : LibBuiltin<"string.h"> { + let Spellings = ["strtok"]; + let Prototype = "char*(char*, char const*)"; +} + +def MemSet : LibBuiltin<"string.h"> { + let Spellings = ["memset"]; + let Attributes = [NoThrow]; + let Prototype = "void*(void*, int, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrError : LibBuiltin<"string.h"> { + let Spellings = ["strerror"]; + let Prototype = "char*(int)"; +} + +def StrLen : LibBuiltin<"string.h"> { + let Spellings = ["strlen"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "size_t(char const*)"; + let AddBuiltinPrefixedAlias = 1; +} + +// C99 stdio.h +// FIXME: This list is incomplete. +def Printf : LibBuiltin<"stdio.h"> { + let Spellings = ["printf"]; + let Attributes = [PrintfFormat<0>]; + let Prototype = "int(char const*, ...)"; +} + +// FIXME: The builtin and library function should have the same signature. 
+def BuiltinPrintf : Builtin { + let Spellings = ["__builtin_printf"]; + let Attributes = [NoThrow, PrintfFormat<0>, FunctionWithBuiltinPrefix]; + let Prototype = "int(char const* restrict, ...)"; +} + +def FPrintf : LibBuiltin<"stdio.h"> { + let Spellings = ["fprintf"]; + let Attributes = [NoThrow, PrintfFormat<1>]; + let Prototype = "int(FILE* restrict, char const* restrict, ...)"; + let AddBuiltinPrefixedAlias = 1; +} + +def SnPrintf : LibBuiltin<"stdio.h"> { + let Spellings = ["snprintf"]; + let Attributes = [NoThrow, PrintfFormat<2>]; + let Prototype = "int(char* restrict, size_t, char const* restrict, ...)"; + let AddBuiltinPrefixedAlias = 1; +} + +def SPrintf : LibBuiltin<"stdio.h"> { + let Spellings = ["sprintf"]; + let Attributes = [NoThrow, PrintfFormat<1>]; + let Prototype = "int(char* restrict, char const* restrict, ...)"; + let AddBuiltinPrefixedAlias = 1; +} + +def VPrintf : LibBuiltin<"stdio.h"> { + let Spellings = ["vprintf"]; + let Attributes = [NoThrow, VPrintfFormat<0>]; + let Prototype = "int(char const* restrict, __builtin_va_list)"; + let AddBuiltinPrefixedAlias = 1; +} + +def VfPrintf : LibBuiltin<"stdio.h"> { + let Spellings = ["vfprintf"]; + let Attributes = [NoThrow, VPrintfFormat<1>]; + let Prototype = "int(FILE* restrict, char const* restrict, __builtin_va_list)"; + let AddBuiltinPrefixedAlias = 1; +} + +def VsnPrintf : LibBuiltin<"stdio.h"> { + let Spellings = ["vsnprintf"]; + let Attributes = [NoThrow, VPrintfFormat<2>]; + let Prototype = "int(char* restrict, size_t, char const* restrict, __builtin_va_list)"; + let AddBuiltinPrefixedAlias = 1; +} + +def VsPrintf : LibBuiltin<"stdio.h"> { + let Spellings = ["vsprintf"]; + let Attributes = [NoThrow, VPrintfFormat<1>]; + let Prototype = "int(char* restrict, char const* restrict, __builtin_va_list)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Scanf : LibBuiltin<"stdio.h"> { + let Spellings = ["scanf"]; + let Attributes = [ScanfFormat<0>]; + let Prototype = "int(char const* restrict, ...)"; + let AddBuiltinPrefixedAlias = 1; +} + +def FScanf : LibBuiltin<"stdio.h"> { + let Spellings = ["fscanf"]; + let Attributes = [ScanfFormat<1>]; + let Prototype = "int(FILE* restrict, char const* restrict, ...)"; + let AddBuiltinPrefixedAlias = 1; +} + +def SScanf : LibBuiltin<"stdio.h"> { + let Spellings = ["sscanf"]; + let Attributes = [ScanfFormat<1>]; + let Prototype = "int(char const* restrict, char const* restrict, ...)"; + let AddBuiltinPrefixedAlias = 1; +} + +def VScanf : LibBuiltin<"stdio.h"> { + let Spellings = ["vscanf"]; + let Attributes = [VScanfFormat<0>]; + let Prototype = "int(char const* restrict, __builtin_va_list)"; + let AddBuiltinPrefixedAlias = 1; +} + +def VFScanf : LibBuiltin<"stdio.h"> { + let Spellings = ["vfscanf"]; + let Attributes = [VScanfFormat<1>]; + let Prototype = "int(FILE* restrict, char const* restrict, __builtin_va_list)"; + let AddBuiltinPrefixedAlias = 1; +} + +def VSScanf : LibBuiltin<"stdio.h"> { + let Spellings = ["vsscanf"]; + let Attributes = [VScanfFormat<1>]; + let Prototype = "int(char const* restrict, char const* restrict, __builtin_va_list)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Fopen : LibBuiltin<"stdio.h"> { + let Spellings = ["fopen"]; + let Prototype = "FILE*(char const*, char const*)"; +} + +def Fread : LibBuiltin<"stdio.h"> { + let Spellings = ["fread"]; + let Prototype = "size_t(void*, size_t, size_t, FILE*)"; +} + +def Fwrite : LibBuiltin<"stdio.h"> { + let Spellings = ["fwrite"]; + let Prototype = "size_t(void const*, size_t, size_t, FILE*)"; +} + +// C99 
ctype.h + +def IsAlNum : LibBuiltin<"ctype.h"> { + let Spellings = ["isalnum"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def IsAlpha : LibBuiltin<"ctype.h"> { + let Spellings = ["isalpha"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def IsBlank : LibBuiltin<"ctype.h"> { + let Spellings = ["isblank"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def IsCntrl : LibBuiltin<"ctype.h"> { + let Spellings = ["iscntrl"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def IsDigit : LibBuiltin<"ctype.h"> { + let Spellings = ["isdigit"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def IsGraph : LibBuiltin<"ctype.h"> { + let Spellings = ["isgraph"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def IsLower : LibBuiltin<"ctype.h"> { + let Spellings = ["islower"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def IsPrint : LibBuiltin<"ctype.h"> { + let Spellings = ["isprint"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def IsPunct : LibBuiltin<"ctype.h"> { + let Spellings = ["ispunct"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def IsSpace : LibBuiltin<"ctype.h"> { + let Spellings = ["isspace"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def IsUpper : LibBuiltin<"ctype.h"> { + let Spellings = ["isupper"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def IsXDigit : LibBuiltin<"ctype.h"> { + let Spellings = ["isxdigit"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def ToLower : LibBuiltin<"ctype.h"> { + let Spellings = ["tolower"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +def ToUpper : LibBuiltin<"ctype.h"> { + let Spellings = ["toupper"]; + let Attributes = [NoThrow, Pure]; + let Prototype = "int(int)"; +} + +// C99 wchar.h +// FIXME: This list is incomplete. We should cover at least the functions that +// take format strings. 
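The wchar.h records below mirror their narrow-string counterparts, including compile-time folding through the __builtin_ alias. Sketch (C11; assumes Clang's constant folding of the builtin):

  _Static_assert(__builtin_wcslen(L"wide") == 4, "wcslen folded at compile time");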
+ +def WcsChr : LibBuiltin<"wchar.h"> { + let Spellings = ["wcschr"]; + let Attributes = [NoThrow, Pure, Constexpr]; + let Prototype = "wchar_t*(wchar_t const*, wchar_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def WcsCmp : LibBuiltin<"wchar.h"> { + let Spellings = ["wcscmp"]; + let Attributes = [NoThrow, Pure, Constexpr]; + let Prototype = "int(wchar_t const*, wchar_t const*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def WcsLen : LibBuiltin<"wchar.h"> { + let Spellings = ["wcslen"]; + let Attributes = [NoThrow, Pure, Constexpr]; + let Prototype = "size_t(wchar_t const*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def WcsnCmp : LibBuiltin<"wchar.h"> { + let Spellings = ["wcsncmp"]; + let Attributes = [NoThrow, Pure, Constexpr]; + let Prototype = "int(wchar_t const*, wchar_t const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def WMemChr : LibBuiltin<"wchar.h"> { + let Spellings = ["wmemchr"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "wchar_t*(wchar_t const*, wchar_t, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def WMemCmp : LibBuiltin<"wchar.h"> { + let Spellings = ["wmemcmp"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "int(wchar_t const*, wchar_t const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def WMemCpy : LibBuiltin<"wchar.h"> { + let Spellings = ["wmemcpy"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "wchar_t*(wchar_t*, wchar_t const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def WMemMove : LibBuiltin<"wchar.h"> { + let Spellings = ["wmemmove"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "wchar_t*(wchar_t*, wchar_t const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +// C99 + +// FIXME: MinGW _setjmp has an additional void* parameter. +def SetJmp : LibBuiltin<"setjmp.h"> { + let Spellings = ["setjmp", "_setjmp"]; + let Attributes = [ReturnsTwice, IgnoreSignature]; + let Prototype = "int(jmp_buf)"; + let RequiresUndef = 1; +} + +def LongJmp : LibBuiltin<"setjmp.h"> { + let Spellings = ["longjmp"]; + let Attributes = [NoReturn, IgnoreSignature]; + let Prototype = "void(jmp_buf, int)"; +} + +// Non-C library functions, active in GNU mode only. +// Functions with [[gnu::returns_twice]] attribute are still active in +// all languages, because losing this attribute would result in miscompilation +// when these functions are used in non-GNU mode. PR16138. 
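A sketch of why ReturnsTwice (on SetJmp above and the sigsetjmp family further down) matters: the optimizer must assume control re-enters after the call, so values live across it cannot be cached carelessly (illustrative; `twice` is our name):

  #include <setjmp.h>

  static jmp_buf env;

  int twice(void) {
    volatile int phase = 0;
    if (setjmp(env) == 0) { /* first return: 0 */
      phase = 1;
      longjmp(env, 1);      /* setjmp "returns" again, now with 1 */
    }
    return phase;           /* 1 */
  }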
+ +def AllocA : GNULibBuiltin<"stdlib.h"> { + let Spellings = ["alloca"]; + let Attributes = [NoThrow]; + let Prototype = "void*(size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +// POSIX malloc.h + +def MemAlign : GNULibBuiltin<"malloc.h"> { + let Spellings = ["memalign"]; + let Prototype = "void*(size_t, size_t)"; +} + +// POSIX string.h + +def MemcCpy : GNULibBuiltin<"string.h"> { + let Spellings = ["memccpy"]; + let Prototype = "void*(void*, void const*, int, size_t)"; +} + +def MempCpy : GNULibBuiltin<"string.h"> { + let Spellings = ["mempcpy"]; + let Prototype = "void*(void*, void const*, size_t)"; +} + +def StpCpy : GNULibBuiltin<"string.h"> { + let Spellings = ["stpcpy"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char*, char const*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StpnCpy : GNULibBuiltin<"string.h"> { + let Spellings = ["stpncpy"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char*, char const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrDup : GNULibBuiltin<"string.h"> { + let Spellings = ["strdup"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char const*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def StrnDup : GNULibBuiltin<"string.h"> { + let Spellings = ["strndup"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +// POSIX strings.h + +def Index : GNULibBuiltin<"strings.h"> { + let Spellings = ["index"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char const*, int)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Rindex : GNULibBuiltin<"strings.h"> { + let Spellings = ["rindex"]; + let Attributes = [NoThrow]; + let Prototype = "char*(char const*, int)"; + let AddBuiltinPrefixedAlias = 1; +} + +def BZero : GNULibBuiltin<"strings.h"> { + let Spellings = ["bzero"]; + let Attributes = [NoThrow]; + let Prototype = "void(void*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Bcopy : GNULibBuiltin<"strings.h"> { + let Spellings = ["bcopy"]; + let Attributes = [NoThrow]; + let Prototype = "void(void const*, void*, size_t)"; + let AddBuiltinPrefixedAlias = 1; +} + +// FIXME: This should be part of BCmp. +def BuiltinBCmp : Builtin { + let Spellings = ["__builtin_bcmp"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, Constexpr]; + let Prototype = "int(void const*, void const*, size_t)"; +} + +def BCmp : GNULibBuiltin<"strings.h"> { + let Spellings = ["bcmp"]; + let Attributes = [Constexpr]; + let Prototype = "int(void const*, void const*, size_t)"; +} + +def StrCaseCmp : GNULibBuiltin<"strings.h"> { + let Spellings = ["strcasecmp"]; + let Prototype = "int(char const*, char const*)"; + let AddBuiltinPrefixedAlias = 1; + let RequiresUndef = 1; +} + +def StrnCaseCmp : GNULibBuiltin<"strings.h"> { + let Spellings = ["strncasecmp"]; + let Prototype = "int(char const*, char const*, size_t)"; + let AddBuiltinPrefixedAlias = 1; + let RequiresUndef = 1; +} + +def GNU_Exit : GNULibBuiltin<"unistd.h"> { + let Spellings = ["_exit"]; + let Attributes = [NoReturn]; + let Prototype = "void(int)"; +} + +def VFork : LibBuiltin<"unistd.h"> { + let Spellings = ["vfork"]; + let Attributes = [ReturnsTwice, IgnoreSignature]; + let Prototype = "pid_t()"; +} + +// POSIX pthread.h +// FIXME: This should be a GNULibBuiltin, but it's currently missing the prototype.
+ +def PthreadCreate : CustomEntry { + let Entry = "LIBBUILTIN(pthread_create, \"\", \"fC<2,3>\", PTHREAD_H, ALL_GNU_LANGUAGES)"; +} + +def SigSetJmp : LibBuiltin<"setjmp.h"> { + let Spellings = ["sigsetjmp", "__sigsetjmp"]; + let Attributes = [ReturnsTwice, IgnoreSignature]; + let Prototype = "int(sigjmp_buf, int)"; +} + +def SaveCtx : LibBuiltin<"setjmp.h"> { + let Spellings = ["savectx"]; + let Attributes = [ReturnsTwice, IgnoreSignature]; + let Prototype = "int(sigjmp_buf)"; +} + +def GetContext : LibBuiltin<"setjmp.h"> { + let Spellings = ["getcontext"]; + let Attributes = [ReturnsTwice, IgnoreSignature]; + let Prototype = "int(ucontext_t*)"; +} + +def GNU_LongJmp : GNULibBuiltin<"setjmp.h"> { + let Spellings = ["_longjmp"]; + let Attributes = [NoReturn, IgnoreSignature]; + let Prototype = "void(jmp_buf, int)"; +} + +def SigLongJmp : GNULibBuiltin<"setjmp.h"> { + let Spellings = ["siglongjmp"]; + let Attributes = [NoReturn, IgnoreSignature]; + let Prototype = "void(sigjmp_buf, int)"; +} + +// non-standard but very common + +def StrlCpy : GNULibBuiltin<"string.h"> { + let Spellings = ["strlcpy"]; + let Prototype = "size_t(char*, char const*, size_t)"; +} + +def StrlCat : GNULibBuiltin<"string.h"> { + let Spellings = ["strlcat"]; + let Prototype = "size_t(char*, char const*, size_t)"; +} + +def ObjcMsgSend : ObjCLibBuiltin<"objc/message.h"> { + let Spellings = ["objc_msgSend"]; + let Prototype = "id(id, SEL, ...)"; +} + +def ObjcMsgSendFpret : ObjCLibBuiltin<"objc/message.h"> { + let Spellings = ["objc_msgSend_fpret"]; + let Prototype = "long double(id, SEL, ...)"; +} + +def ObjcMsgSendFp2ret : ObjCLibBuiltin<"objc/message.h"> { + let Spellings = ["objc_msgSend_fp2ret"]; + let Prototype = "_Complex long double(id, SEL, ...)"; +} + +def ObjcMsgSendStret : ObjCLibBuiltin<"objc/message.h"> { + let Spellings = ["objc_msgSend_stret"]; + let Prototype = "void(id, SEL, ...)"; +} + +def ObjcMsgSendSuper : ObjCLibBuiltin<"objc/message.h"> { + let Spellings = ["objc_msgSendSuper"]; + let Prototype = "id(objc_super*, SEL, ...)"; +} + +def ObjcMsgSendSuperStret : ObjCLibBuiltin<"objc/message.h"> { + let Spellings = ["objc_msgSendSuper_stret"]; + let Prototype = "void(objc_super*, SEL, ...)"; +} + +def ObjcGetClass : ObjCLibBuiltin<"objc/runtime.h"> { + let Spellings = ["objc_getClass"]; + let Prototype = "id(char const*)"; +} + +def ObjcGetMetaClass : ObjCLibBuiltin<"objc/runtime.h"> { + let Spellings = ["objc_getMetaClass"]; + let Prototype = "id(char const*)"; +} + +def ObjcEnumerationMutation : ObjCLibBuiltin<"objc/runtime.h"> { + let Spellings = ["objc_enumerationMutation"]; + let Prototype = "void(id)"; +} + +def ObjcReadWeak : ObjCLibBuiltin<"objc/message.h"> { + let Spellings = ["objc_read_weak"]; + let Prototype = "id(id*)"; +} + +def ObjcAssignWeak : ObjCLibBuiltin<"objc/message.h"> { + let Spellings = ["objc_assign_weak"]; + let Prototype = "id(id, id*)"; +} + +def ObjcAssignIvar : ObjCLibBuiltin<"objc/message.h"> { + let Spellings = ["objc_assign_ivar"]; + let Prototype = "id(id, id, ptrdiff_t)"; +} + +def ObjcAssignGlobal : ObjCLibBuiltin<"objc/message.h"> { + let Spellings = ["objc_assign_global"]; + let Prototype = "id(id, id*)"; +} + +def ObjcAssignStrongCast : ObjCLibBuiltin<"objc/objc-auto.h"> { + let Spellings = ["objc_assign_strongCast"]; + let Prototype = "id(id, id*)"; +} + +def ObjcExceptionExtract : ObjCLibBuiltin<"objc/objc-exception.h"> { + let Spellings = ["objc_exception_extract"]; + let Prototype = "id(void*)"; +} + +def ObjcExceptionTryEnter :
ObjCLibBuiltin<"objc/objc_exception.h"> { + let Spellings = ["objc_exception_try_enter"]; + let Prototype = "void(void*)"; +} + +def ObjcExceptionTryExit : ObjCLibBuiltin<"objc/objc_exception.h"> { + let Spellings = ["objc_exception_try_exit"]; + let Prototype = "void(void*)"; +} + +def ObjcExceptionMatch : ObjCLibBuiltin<"objc/objc_exception.h"> { + let Spellings = ["objc_exception_match"]; + let Prototype = "int(id, id)"; +} + +def ObjcExceptionThrow : ObjCLibBuiltin<"objc/objc_exception.h"> { + let Spellings = ["objc_exception_throw"]; + let Prototype = "void(id)"; +} + +def ObjcSyncEnter : ObjCLibBuiltin<"objc_objc_sync.h"> { + let Spellings = ["objc_sync_enter"]; + let Prototype = "int(id)"; +} + +def ObjcSyncExit : ObjCLibBuiltin<"objc_objc_sync.h"> { + let Spellings = ["objc_sync_exit"]; + let Prototype = "int(id)"; +} + +def ObjcMemmoveCollectable : Builtin { + let Spellings = ["__builtin_objc_memmove_collectable"]; + let Attributes = [NoThrow, FunctionWithBuiltinPrefix]; + let Prototype = "void*(void*, void const*, size_t)"; +} + +def NSLog : ObjCLibBuiltin<"foundation_nsobjcruntime.h"> { + let Spellings = ["NSLog"]; + let Attributes = [PrintfFormat<0>]; + let Prototype = "void(id, ...)"; +} + +def NSLogv : ObjCLibBuiltin<"foundation_nsobjcruntime.h"> { + let Spellings = ["NSLogv"]; + let Attributes = [VPrintfFormat<0>]; + let Prototype = "void(id, __builtin_va_list)"; +} + +def Atan2 : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["atan2"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Abs : IntMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["abs"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Copysign : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["copysign"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T, T)"; + let AddBuiltinPrefixedAlias = 1; + let OnlyBuiltinPrefixedAliasIsConstexpr = 1; +} + +def Fabs : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["fabs"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; + let OnlyBuiltinPrefixedAliasIsConstexpr = 1; +} + +def Finite : FPMathTemplate, GNULibBuiltin<"math.h"> { + let Spellings = ["finite"]; + let Attributes = [NoThrow, Const]; + let Prototype = "int(T)"; + let RequiresUndef = 1; +} + +def OSXFinite : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["__finite"]; + let Attributes = [NoThrow, Const]; + let Prototype = "int(T)"; +} + +def Fmod : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["fmod"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Frexp : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["frexp"]; + let Attributes = [NoThrow]; + let Prototype = "T(T, int*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Ldexp : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["ldexp"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, int)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Modf : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["modf"]; + let Attributes = [NoThrow]; + let Prototype = "T(T, T*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Nan : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["nan"]; + let Attributes = [Pure, NoThrow]; + let Prototype = "T(char const*)"; + let 
AddBuiltinPrefixedAlias = 1; + let OnlyBuiltinPrefixedAliasIsConstexpr = 1; +} + +def Pow : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["pow"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Acos : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["acos"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Acosh : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["acosh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Asin : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["asin"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Asinh : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["asinh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Atan : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["atan"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Atanh : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["atanh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Cbrt : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["cbrt"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Ceil : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["ceil"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Cos : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["cos"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Cosh : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["cosh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Erf : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["erf"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Erfc : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["erfc"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Exp : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["exp"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Exp2 : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["exp2"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +// FIXME: Why is there no builtin without the prefix? 
+def Exp10 : FPMathTemplate, Builtin { + let Spellings = ["__builtin_exp10"]; + let Attributes = [FunctionWithBuiltinPrefix, NoThrow, + ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def Expm1 : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["expm1"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Fdim : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["fdim"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Floor : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["floor"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Fma : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["fma"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, T, T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Fmax : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["fmax"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T, T)"; + let AddBuiltinPrefixedAlias = 1; + let OnlyBuiltinPrefixedAliasIsConstexpr = 1; +} + +def Fmin : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["fmin"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T, T)"; + let AddBuiltinPrefixedAlias = 1; + let OnlyBuiltinPrefixedAliasIsConstexpr = 1; +} + +def Hypot : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["hypot"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Ilogb : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["ilogb"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "int(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Lgamma : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["lgamma"]; + let Attributes = [NoThrow]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Llrint : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["llrint"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "long long int(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Llround : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["llround"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "long long int(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Log : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["log"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Log10 : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["log10"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Log1p : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["log1p"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Log2 : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["log2"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Logb : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["logb"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let 
AddBuiltinPrefixedAlias = 1; +} + +def Lrint : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["lrint"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "long int(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Lround : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["lround"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "long int(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Nearbyint : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["nearbyint"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Nextafter : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["nextafter"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Nexttoward : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["nexttoward"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, long double)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Remainder : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["remainder"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Remquo : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["remquo"]; + let Attributes = [NoThrow]; + let Prototype = "T(T, T, int*)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Rint : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["rint"]; + let Attributes = [NoThrow, ConstIgnoringExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Round : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["round"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def RoundEven : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["roundeven"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Scalbln : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["scalbln"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, long int)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Scalbn : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["scalbn"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T, int)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Sin : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["sin"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Sinh : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["sinh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Sqrt : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["sqrt"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Tan : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["tan"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Tanh : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["tanh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let 
AddBuiltinPrefixedAlias = 1; +} + +def Tgamma : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["tgamma"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Trunc : FPMathTemplate, LibBuiltin<"math.h"> { + let Spellings = ["trunc"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Cabs : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["cabs"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Cacos : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["cacos"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Cacosh : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["cacosh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Carg : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["carg"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Casin : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["casin"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Casinh : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["casinh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Catan : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["catan"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Catanh : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["catanh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Ccos : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["ccos"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Ccosh : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["ccosh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Cexp : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["cexp"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Cimag : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["cimag"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Conj : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["conj"]; + let Attributes = [NoThrow, Const]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Clog : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["clog"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let 
AddBuiltinPrefixedAlias = 1; +} + +def Cproj : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["cproj"]; + let Attributes = [NoThrow, Const]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Cpow : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["cpow"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T, _Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Creal : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["creal"]; + let Attributes = [NoThrow, Const]; + let Prototype = "T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Csin : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["csin"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Csinh : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["csinh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Csqrt : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["csqrt"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Ctan : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["ctan"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +def Ctanh : FPMathTemplate, LibBuiltin<"complex.h"> { + let Spellings = ["ctanh"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "_Complex T(_Complex T)"; + let AddBuiltinPrefixedAlias = 1; +} + +// __sinpi and friends are OS X specific library functions, but otherwise much +// like the standard (non-complex) sin (etc). +def Sinpi : LibBuiltin<"math.h">, FloatDoubleTemplate { + let Spellings = ["__sinpi"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def Cospi : LibBuiltin<"math.h">, FloatDoubleTemplate { + let Spellings = ["__cospi"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +def Tanpi : LibBuiltin<"math.h">, FloatDoubleTemplate { + let Spellings = ["__tanpi"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +// Similarly, __exp10 is OS X only +def OSXExp10 : LibBuiltin<"math.h">, FloatDoubleTemplate { + let Spellings = ["__exp10"]; + let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions]; + let Prototype = "T(T)"; +} + +// Blocks runtime Builtin math library functions. +def BlockObjectAssign : LibBuiltin<"blocks.h"> { + let Spellings = ["_Block_object_assign"]; + let Prototype = "void(void*, void const*, int const)"; +} + +def BlockObjectDispose : LibBuiltin<"blocks.h"> { + let Spellings = ["_Block_object_dispose"]; + let Prototype = "void(void const*, int const)"; +} +// FIXME: Also declare NSConcreteGlobalBlock and NSConcreteStackBlock. 
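To make the recurring pattern above concrete: each FPMathTemplate def is stamped out once per floating-point type, with the substitution replacing T in the prototype and the affix appended to the spelling, while AddBuiltinPrefixedAlias = 1 additionally emits a __builtin_-prefixed twin of every instance. A rough sketch of such an entry (the def below is a hypothetical illustration, not one added by this patch):

def Acme : FPMathTemplate, LibBuiltin<"math.h"> {
  let Spellings = ["acme"];          // instantiated as acmef, acme, acmel
  let Attributes = [NoThrow, ConstIgnoringErrnoAndExceptions];
  let Prototype = "T(T)";            // T is replaced by each substitution type
  let AddBuiltinPrefixedAlias = 1;   // also declares __builtin_acmef and friends
}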
+ +def Addressof : CxxLibBuiltin<"memory"> { + let Spellings = ["addressof", "__addressof"]; + let Attributes = [NoThrow, Const, IgnoreSignature, RequireDeclaration, + Constexpr]; + let Prototype = "void*(void&)"; + let Namespace = "std"; +} + +def AsConst : CxxLibBuiltin<"utility"> { + let Spellings = ["as_const"]; + let Attributes = [NoThrow, Const, IgnoreSignature, RequireDeclaration, + Constexpr]; + let Prototype = "void&(void&)"; + let Namespace = "std"; +} + +def Forward : CxxLibBuiltin<"utility"> { + let Spellings = ["forward"]; + let Attributes = [NoThrow, Const, IgnoreSignature, RequireDeclaration, + Constexpr]; + let Prototype = "void&(void&)"; + let Namespace = "std"; +} + +def ForwardLike : CxxLibBuiltin<"utility"> { + let Spellings = ["forward_like"]; + let Attributes = [NoThrow, Const, IgnoreSignature, RequireDeclaration, + Constexpr]; + let Prototype = "void&(void&)"; + let Namespace = "std"; +} + +def Move : CxxLibBuiltin<"utility"> { + let Spellings = ["move"]; + let Attributes = [NoThrow, Const, IgnoreSignature, RequireDeclaration, + Constexpr]; + let Prototype = "void&(void&)"; + let Namespace = "std"; +} + +def MoveIfNsoexcept : CxxLibBuiltin<"memory"> { + let Spellings = ["move_if_noexcept"]; + let Attributes = [NoThrow, Const, IgnoreSignature, RequireDeclaration, + Constexpr]; + let Prototype = "void&(void&)"; + let Namespace = "std"; +} + +def Annotation : Builtin { + let Spellings = ["__builtin_annotation"]; + let Attributes = [NoThrow, CustomTypeChecking]; + let Prototype = "void(...)"; +} + +// Invariants +def Assume : Builtin { + let Spellings = ["__builtin_assume"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "void(bool)"; +} + +def AssumeSeparateStorage : Builtin { + let Spellings = ["__builtin_assume_separate_storage"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "void(void const volatile*, void const volatile*)"; +} + +// Multiprecision Arithmetic Builtins. + +class MPATemplate : Template< + ["unsigned char", "unsigned short", "unsigned int", + "unsigned long int", "unsigned long long int"], + ["b", "s", "", + "l", "ll"]>; + +def Addc : Builtin, MPATemplate { + let Spellings = ["__builtin_addc"]; + let Attributes = [NoThrow]; + // FIXME: Why are these arguments marked const? + let Prototype = "T(T const, T const, T const, T*)"; +} + +def Subc : Builtin, MPATemplate { + let Spellings = ["__builtin_subc"]; + let Attributes = [NoThrow]; + // FIXME: Why are these arguments marked const? + let Prototype = "T(T const, T const, T const, T*)"; +} + +// Checked Arithmetic Builtins for Security.
+def AddOverflow : Builtin { + let Spellings = ["__builtin_add_overflow"]; + let Attributes = [NoThrow, CustomTypeChecking, Constexpr]; + let Prototype = "bool(...)"; +} + +def SubOverflow : Builtin { + let Spellings = ["__builtin_sub_overflow"]; + let Attributes = [NoThrow, CustomTypeChecking, Constexpr]; + let Prototype = "bool(...)"; +} + +def MulOverflow : Builtin { + let Spellings = ["__builtin_mul_overflow"]; + let Attributes = [NoThrow, CustomTypeChecking, Constexpr]; + let Prototype = "bool(...)"; +} + +class UOverflowTemplate : + Template<["unsigned int", "unsigned long int", "unsigned long long int"], + ["_overflow", "l_overflow", "ll_overflow"]>; + +def UaddOverflow : Builtin, UOverflowTemplate { + let Spellings = ["__builtin_uadd"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "bool(T const, T const, T*)"; +} + +def UsubOverflow : Builtin, UOverflowTemplate { + let Spellings = ["__builtin_usub"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "bool(T const, T const, T*)"; +} + +def UmulOverflow : Builtin, UOverflowTemplate { + let Spellings = ["__builtin_umul"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "bool(T const, T const, T*)"; +} + +class SOverflowTemplate : + Template<["int", "long int", "long long int"], + ["_overflow", "l_overflow", "ll_overflow"]>; + +def SaddOverflow : Builtin, SOverflowTemplate { + let Spellings = ["__builtin_sadd"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "bool(T const, T const, T*)"; +} + +def SsubOverflow : Builtin, SOverflowTemplate { + let Spellings = ["__builtin_ssub"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "bool(T const, T const, T*)"; +} + +def SmulOverflow : Builtin, SOverflowTemplate { + let Spellings = ["__builtin_smul"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "bool(T const, T const, T*)"; +} + +// Clang builtins (not available in GCC). 
+def BuiltinAddressof : Builtin { + let Spellings = ["__builtin_addressof"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; + let Prototype = "void*(void&)"; +} + +def BuiltinFunctionStart : Builtin { + let Spellings = ["__builtin_function_start"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; + let Prototype = "void*(void&)"; +} + +def BuiltinOperatorNew : Builtin { + let Spellings = ["__builtin_operator_new"]; + let Attributes = [Const, CustomTypeChecking, Constexpr]; + let Prototype = "void*(size_t)"; +} + +def BuiltinOperatorDelete : Builtin { + let Spellings = ["__builtin_operator_delete"]; + let Attributes = [NoThrow, CustomTypeChecking, Constexpr]; + let Prototype = "void(void*)"; +} + +def BuiltinCharMemchr : Builtin { + let Spellings = ["__builtin_char_memchr"]; + let Attributes = [NoThrow, Constexpr]; + let Prototype = "char*(char const*, int, size_t)"; +} + +def BuiltinDumpStruct : Builtin { + let Spellings = ["__builtin_dump_struct"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def BuiltinPreserveAccessIndex : Builtin { + let Spellings = ["__builtin_preserve_access_index"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def IsAligned : Builtin { + let Spellings = ["__builtin_is_aligned"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; + let Prototype = "bool(void const*, size_t)"; +} + +def AlignUp : Builtin { + let Spellings = ["__builtin_align_up"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; + let Prototype = "void*(void const*, size_t)"; +} + +def AlignDown : Builtin { + let Spellings = ["__builtin_align_down"]; + let Attributes = [NoThrow, Const, CustomTypeChecking, Constexpr]; + let Prototype = "void*(void const*, size_t)"; +} + +// Safestack builtins. +def GetUnsafeStackStart : Builtin { + let Spellings = ["__builtin___get_unsafe_stack_start"]; + let Attributes = [NoThrow, FunctionWithBuiltinPrefix]; + let Prototype = "void*()"; +} + +def GetUnsafeStackBottom : Builtin { + let Spellings = ["__builtin___get_unsafe_stack_bottom"]; + let Attributes = [NoThrow, FunctionWithBuiltinPrefix]; + let Prototype = "void*()"; +} + +def GetUnsafeStackTop : Builtin { + let Spellings = ["__builtin___get_unsafe_stack_top"]; + let Attributes = [NoThrow, FunctionWithBuiltinPrefix]; + let Prototype = "void*()"; +} + +def GetUnsafeStackPtr : Builtin { + let Spellings = ["__builtin___get_unsafe_stack_ptr"]; + let Attributes = [NoThrow, FunctionWithBuiltinPrefix]; + let Prototype = "void*()"; +} + +// Nontemporal loads/stores builtins. 
+def NontemporalStore : Builtin { + let Spellings = ["__builtin_nontemporal_store"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +def NontemporalLoad : Builtin { + let Spellings = ["__builtin_nontemporal_load"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "void(...)"; +} + +// Coroutine intrinsics +def CoroResume : CoroLangBuiltin { + let Spellings = ["__builtin_coro_resume"]; + let Prototype = "void(void*)"; +} + +def CoroDestroy : CoroLangBuiltin { + let Spellings = ["__builtin_coro_destroy"]; + let Prototype = "void(void*)"; +} + +def CoroDone : CoroLangBuiltin { + let Spellings = ["__builtin_coro_done"]; + let Attributes = [NoThrow]; + let Prototype = "bool(void*)"; +} + +def CoroPromise : CoroLangBuiltin { + let Spellings = ["__builtin_coro_promise"]; + let Attributes = [NoThrow]; + let Prototype = "void*(void*, _Constant int, _Constant bool)"; +} + +def CoroSize : CoroLangBuiltin { + let Spellings = ["__builtin_coro_size"]; + let Attributes = [NoThrow]; + let Prototype = "size_t()"; +} + +def CoroAlign : CoroLangBuiltin { + let Spellings = ["__builtin_coro_align"]; + let Attributes = [NoThrow]; + let Prototype = "size_t()"; +} + +def CoroFrame : CoroLangBuiltin { + let Spellings = ["__builtin_coro_frame"]; + let Attributes = [NoThrow]; + let Prototype = "void*()"; +} + +def CoroNoop : CoroLangBuiltin { + let Spellings = ["__builtin_coro_noop"]; + let Attributes = [NoThrow]; + let Prototype = "void*()"; +} + +def CoroFree : CoroLangBuiltin { + let Spellings = ["__builtin_coro_free"]; + let Attributes = [NoThrow]; + let Prototype = "void*(void*)"; +} + +def CoroId : CoroLangBuiltin { + let Spellings = ["__builtin_coro_id"]; + let Attributes = [NoThrow]; + let Prototype = "void*(_Constant int, void*, void*, void*)"; +} + +def CoroAlloc : CoroLangBuiltin { + let Spellings = ["__builtin_coro_alloc"]; + let Attributes = [NoThrow]; + let Prototype = "bool()"; +} + +def CoroBegin : CoroLangBuiltin { + let Spellings = ["__builtin_coro_begin"]; + let Attributes = [NoThrow]; + let Prototype = "void*(void*)"; +} + +def CoroEnd : CoroLangBuiltin { + let Spellings = ["__builtin_coro_end"]; + let Attributes = [NoThrow]; + let Prototype = "bool(void*, _Constant bool)"; +} + +def CoroSuspend : CoroLangBuiltin { + let Spellings = ["__builtin_coro_suspend"]; + let Attributes = [NoThrow]; + let Prototype = "char(_Constant bool)"; +} + +// OpenCL v2.0 s6.13.16, s9.17.3.5 - Pipe functions. +// We need the generic prototype, since the packet type could be anything. 
+def ReadPipe : OCLPipeLangBuiltin { + let Spellings = ["read_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "int(...)"; +} + +def WritePipe : OCLPipeLangBuiltin { + let Spellings = ["write_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "int(...)"; +} + +def ReserveReadPipe : OCLPipeLangBuiltin { + let Spellings = ["reserve_read_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "int(...)"; +} + +def ReserveWritePipe : OCLPipeLangBuiltin { + let Spellings = ["reserve_write_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "int(...)"; +} + +def CommitWritePipe : OCLPipeLangBuiltin { + let Spellings = ["commit_write_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "void(...)"; +} + +def CommitReadPipe : OCLPipeLangBuiltin { + let Spellings = ["commit_read_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "void(...)"; +} + +def SubGroupReserveReadPipe : OCLPipeLangBuiltin { + let Spellings = ["sub_group_reserve_read_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "int(...)"; +} + +def SubGroupReserveWritePipe : OCLPipeLangBuiltin { + let Spellings = ["sub_group_reserve_write_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "int(...)"; +} + +def SubGroupCommitWritePipe : OCLPipeLangBuiltin { + let Spellings = ["sub_group_commit_write_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "void(...)"; +} + +def SubGroupCommitReadPipe : OCLPipeLangBuiltin { + let Spellings = ["sub_group_commit_read_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "void(...)"; +} + +def WorkGroupReserveReadPipe : OCLPipeLangBuiltin { + let Spellings = ["work_group_reserve_read_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "int(...)"; +} + +def WorkGroupReserveWritePipe : OCLPipeLangBuiltin { + let Spellings = ["work_group_reserve_write_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "int(...)"; +} + +def WorkGroupCommitWritePipe : OCLPipeLangBuiltin { + let Spellings = ["work_group_commit_write_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "void(...)"; +} + +def WorkGroupCommitReadPipe : OCLPipeLangBuiltin { + let Spellings = ["work_group_commit_read_pipe"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "void(...)"; +} + +def GetPipeNumPackets : OCLPipeLangBuiltin { + let Spellings = ["get_pipe_num_packets"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "unsigned int(...)"; +} + +def GetPipeMaxPackets : OCLPipeLangBuiltin { + let Spellings = ["get_pipe_max_packets"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "unsigned int(...)"; +} + +// OpenCL v2.0 s6.13.17 - Enqueue kernel functions. +// Custom builtin check allows to perform special check of passed block arguments. 
+def EnqueueKernel : OCL_DSELangBuiltin { + let Spellings = ["enqueue_kernel"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "int(...)"; +} + +def GetKernelWorkGroupSize : OCL_DSELangBuiltin { + let Spellings = ["get_kernel_work_group_size"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "unsigned int(...)"; +} + +def GetKernelPreferredWorkGroupSizeMultiple : OCL_DSELangBuiltin { + let Spellings = ["get_kernel_preferred_work_group_size_multiple"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "unsigned int(...)"; +} + +def GetKernelMaxSubGroupSizeForNDRange : OCL_DSELangBuiltin { + let Spellings = ["get_kernel_max_sub_group_size_for_ndrange"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "unsigned int(...)"; +} + +def GetKernelSubGroupCountForNDRange : OCL_DSELangBuiltin { + let Spellings = ["get_kernel_sub_group_count_for_ndrange"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "unsigned int(...)"; +} + +// OpenCL v2.0 s6.13.9 - Address space qualifier functions. +// FIXME: Pointer parameters of OpenCL builtins should have their address space +// requirement defined. +def ToGlobal : OCL_GASLangBuiltin { + let Spellings = ["to_global"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "void*(void*)"; +} + +def ToLocal : OCL_GASLangBuiltin { + let Spellings = ["to_local"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "void*(void*)"; +} + +def ToPrivate : OCL_GASLangBuiltin { + let Spellings = ["to_private"]; + let Attributes = [CustomTypeChecking, NoThrow]; + let Prototype = "void*(void*)"; +} + +// OpenCL half load/store builtin. +def StoreHalf : OCLLangBuiltin, FloatDoubleTemplate { + let Spellings = ["__builtin_store_half"]; + let Attributes = [NoThrow]; + let Prototype = "void(T, __fp16*)"; +} + +def LoadHalf : OCLLangBuiltin, FloatDoubleTemplate { + let Spellings = ["__builtin_load_half"]; + // FIXME: Is this actually Const? This looks like it should be Pure. + let Attributes = [NoThrow, Const]; + let Prototype = "T(__fp16 const*)"; +} + +// Builtins for os_log/os_trace. +def OSLogFormatBufferSize : Builtin { + let Spellings = ["__builtin_os_log_format_buffer_size"]; + let Attributes = [PrintfFormat<0>, NoThrow, UnevaluatedArguments, + CustomTypeChecking, Constexpr]; + let Prototype = "size_t(char const*, ...)"; +} + +def OSLogFormat : Builtin { + let Spellings = ["__builtin_os_log_format"]; + // FIXME: The printf attribute looks suspiciously like it should be argument #1. + let Attributes = [PrintfFormat<0>, NoThrow, CustomTypeChecking]; + let Prototype = "void*(void*, char const*, ...)"; +} + +// CUDA/HIP +def GetDeviceSideMangledName : LangBuiltin<"CUDA_LANG"> { + let Spellings = ["__builtin_get_device_side_mangled_name"]; + let Attributes = [NoThrow, Const, IgnoreSignature]; + let Prototype = "char const*(...)"; +} + +// HLSL +def HLSLWaveActiveCountBits : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_wave_active_count_bits"]; + let Attributes = [NoThrow, Const]; + let Prototype = "unsigned int(bool)"; +} + +def HLSLCreateHandle : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_create_handle"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void*(unsigned char)"; +} + +// Builtins for XRay.
+def XRayCustomEvent : Builtin { + let Spellings = ["__xray_customevent"]; + let Prototype = "void(char const*, size_t)"; +} + +def XRayTypedEvent : Builtin { + let Spellings = ["__xray_typedevent"]; + let Prototype = "void(size_t, char const*, size_t)"; +} + +// Win64-compatible va_list functions. +def MSVaStart : Builtin { + let Spellings = ["__builtin_ms_va_start"]; + let Attributes = [NoThrow, CustomTypeChecking]; + let Prototype = "void(char*&, ...)"; +} + +def MSVaEnd : Builtin { + let Spellings = ["__builtin_ms_va_end"]; + let Attributes = [NoThrow]; + let Prototype = "void(char*&)"; +} + +def MSVaCopy : Builtin { + let Spellings = ["__builtin_ms_va_copy"]; + let Attributes = [NoThrow]; + let Prototype = "void(char*&, char*&)"; +} + +// Arithmetic Fence: to prevent FP reordering and reassociation optimizations +// FIXME: Should this just be a Builtin? +def ArithmeticFence : LangBuiltin<"ALL_LANGUAGES"> { + let Spellings = ["__arithmetic_fence"]; + let Attributes = [CustomTypeChecking, Constexpr]; + let Prototype = "void(...)"; +} diff --git a/clang/include/clang/Basic/BuiltinsBPF.def b/clang/include/clang/Basic/BuiltinsBPF.def deleted file mode 100644 index 190062645ec4d..0000000000000 --- a/clang/include/clang/Basic/BuiltinsBPF.def +++ /dev/null @@ -1,33 +0,0 @@ -//===--- BuiltinsBPF.def - BPF Builtin function database --------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This file defines the BPF-specific builtin function database. Users of -// this file must define the BUILTIN macro to make use of this information. -// -//===----------------------------------------------------------------------===// - -// The format of this database matches clang/Basic/Builtins.def. - -#if defined(BUILTIN) && !defined(TARGET_BUILTIN) -# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS) -#endif - -// Get record field information. -TARGET_BUILTIN(__builtin_preserve_field_info, "Ui.", "t", "") - -// Get BTF type id. -TARGET_BUILTIN(__builtin_btf_type_id, "LUi.", "t", "") - -// Get type information. -TARGET_BUILTIN(__builtin_preserve_type_info, "Ui.", "t", "") - -// Preserve enum value. -TARGET_BUILTIN(__builtin_preserve_enum_value, "Li.", "t", "") - -#undef BUILTIN -#undef TARGET_BUILTIN diff --git a/clang/include/clang/Basic/BuiltinsBPF.td b/clang/include/clang/Basic/BuiltinsBPF.td new file mode 100644 index 0000000000000..169d05c870998 --- /dev/null +++ b/clang/include/clang/Basic/BuiltinsBPF.td @@ -0,0 +1,37 @@ +//===--- BuiltinsBPF.td - BPF Builtin function database ---------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +include "clang/Basic/BuiltinsBase.td" + +// Get record field information +def PreserveFieldInfo : TargetBuiltin { + let Spellings = ["__builtin_preserve_field_info"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "unsigned int(...)"; +} + +// Get BTF type id +def BtfTypeID : TargetBuiltin { + let Spellings = ["__builtin_btf_type_id"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "long unsigned int(...)"; +} + +// Get type information +def PreserveTypeInfo : TargetBuiltin { + let Spellings = ["__builtin_preserve_type_info"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "long unsigned int(...)"; +} + +// Preserve enum value +def PreserveEnumValue : TargetBuiltin { + let Spellings = ["__builtin_preserve_enum_value"]; + let Attributes = [CustomTypeChecking]; + let Prototype = "long int(...)"; +} diff --git a/clang/include/clang/Basic/BuiltinsBase.td b/clang/include/clang/Basic/BuiltinsBase.td new file mode 100644 index 0000000000000..b65b41be03265 --- /dev/null +++ b/clang/include/clang/Basic/BuiltinsBase.td @@ -0,0 +1,121 @@ +//===--- BuiltinsBase.td - common structures used by builtins ---*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Attributes +// ========== + +class Attribute<string mangling> { + string Mangling = mangling; +} + +class IndexedAttribute<string mangling, int I> : Attribute<mangling> { + int Index = I; +} + +// Standard Attributes +// ------------------- +def NoReturn : Attribute<"r">; + +// Attributes from the gnu:: namespace +// ----------------------------------- +def Const : Attribute<"c">; +def NoThrow : Attribute<"n">; +def Pure : Attribute<"U">; +def ReturnsTwice : Attribute<"j">; +// FIXME: gcc has nonnull + +// builtin-specific attributes +// --------------------------- + +// Signature is meaningless, use custom typechecking. +def CustomTypeChecking : Attribute<"t">; + +// Type is not important to semantic analysis and codegen; recognize as builtin +// even if type doesn't match signature, and don't warn if we can't be sure the +// type is right. +def IgnoreSignature : Attribute<"T">; + +// Arguments are not evaluated for their side-effects. +def UnevaluatedArguments : Attribute<"u">; + +// FIXME: This is misused in a lot of the places it is used currently. +// This function is equivalent to a library function without the __builtin_ +// prefix. This is relevant for CodeGen; it should not be used if custom CodeGen +// is required for a builtin. +def FunctionWithBuiltinPrefix : Attribute<"F">; + +// const, but only when -fno-math-errno and FP exceptions are ignored. +def ConstIgnoringErrnoAndExceptions : Attribute<"e">; + +// const when FP exceptions are ignored. +def ConstIgnoringExceptions : Attribute<"g">; + +// This function requires a specific header or an explicit declaration.
+def RequireDeclaration : Attribute<"h">; + +class PrintfFormat<int I> : IndexedAttribute<"p", I>; +class VPrintfFormat<int I> : IndexedAttribute<"P", I>; +class ScanfFormat<int I> : IndexedAttribute<"s", I>; +class VScanfFormat<int I> : IndexedAttribute<"S", I>; + +// Other Attributes +// ---------------- + +// Builtin can be constant evaluated +def Constexpr : Attribute<"E">; + +// Builtin kinds +// ============= + +class Builtin { + list<string> Spellings; + list<Attribute> Attributes = []; + string Prototype; + string Namespace; + // On some platforms, some functions are actually macros. In that case we need + // to #undef them. + bit RequiresUndef = 0; +} + +class CustomEntry { + string Entry; +} + +class AtomicBuiltin : Builtin; +class TargetBuiltin : Builtin; + +class LibBuiltin<string header, string languages = "ALL_LANGUAGES"> : Builtin { + string Header = header; + string Languages = languages; + bit AddBuiltinPrefixedAlias = 0; + bit OnlyBuiltinPrefixedAliasIsConstexpr = 0; +} + +class MSLibBuiltin<string header> : LibBuiltin<header, "ALL_MS_LANGUAGES">; +class GNULibBuiltin<string header> : LibBuiltin<header, "ALL_GNU_LANGUAGES">; +class ObjCLibBuiltin<string header> : LibBuiltin<header, "OBJC_LANG">; +class CxxLibBuiltin<string header> : LibBuiltin<header, "CXX_LANG">; + +class LangBuiltin<string languages> : Builtin { + string Languages = languages; +} + +class MSLangBuiltin : LangBuiltin<"ALL_MS_LANGUAGES">; +class CoroLangBuiltin : LangBuiltin<"COR_LANG">; +class OCLPipeLangBuiltin : LangBuiltin<"OCL_PIPE">; +class OCL_DSELangBuiltin : LangBuiltin<"OCL_DSE">; +class OCL_GASLangBuiltin : LangBuiltin<"OCL_GAS">; +class OCLLangBuiltin : LangBuiltin<"ALL_OCL_LANGUAGES">; + +class Template<list<string> substitutions, + list<string> affixes, + bit as_prefix = 0> { + list<string> Substitutions = substitutions; + list<string> Affixes = affixes; + bit AsPrefix = as_prefix; } diff --git a/clang/include/clang/Basic/CMakeLists.txt b/clang/include/clang/Basic/CMakeLists.txt index be216150c71bb..9689a0f48c3ca 100644 --- a/clang/include/clang/Basic/CMakeLists.txt +++ b/clang/include/clang/Basic/CMakeLists.txt @@ -57,6 +57,14 @@ clang_tablegen(AttrHasAttributeImpl.inc -gen-clang-attr-has-attribute-impl TARGET ClangAttrHasAttributeImpl ) +clang_tablegen(Builtins.inc -gen-clang-builtins + SOURCE Builtins.td + TARGET ClangBuiltins) + +clang_tablegen(BuiltinsBPF.inc -gen-clang-builtins + SOURCE BuiltinsBPF.td + TARGET ClangBuiltinsBPF) + # ARM NEON and MVE clang_tablegen(arm_neon.inc -gen-arm-neon-sema SOURCE arm_neon.td diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h index c31834fb52a90..a4abaaef44c06 100644 --- a/clang/include/clang/Basic/TargetBuiltins.h +++ b/clang/include/clang/Basic/TargetBuiltins.h @@ -84,7 +84,7 @@ namespace clang { enum { LastTIBuiltin = clang::Builtin::FirstTSBuiltin - 1, #define BUILTIN(ID, TYPE, ATTRS) BI##ID, - #include "clang/Basic/BuiltinsBPF.def" + #include "clang/Basic/BuiltinsBPF.inc" LastTSBuiltin }; } diff --git a/clang/include/module.modulemap b/clang/include/module.modulemap index 52395ee9b0fc9..794526bc289c0 100644 --- a/clang/include/module.modulemap +++ b/clang/include/module.modulemap @@ -42,8 +42,6 @@ module Clang_Basic { textual header "clang/Basic/BuiltinsAArch64NeonSVEBridge.def" textual header "clang/Basic/BuiltinsAArch64NeonSVEBridge_cg.def" textual header "clang/Basic/BuiltinsARM.def" - textual header "clang/Basic/BuiltinsBPF.def" - textual header "clang/Basic/Builtins.def" textual header "clang/Basic/BuiltinHeaders.def" textual header "clang/Basic/BuiltinsHexagon.def" textual header "clang/Basic/BuiltinsHexagonDep.def" diff --git a/clang/lib/AST/StmtPrinter.cpp b/clang/lib/AST/StmtPrinter.cpp index 9d4aa07ec4da7..9b5bf436c13be 100644 --- a/clang/lib/AST/StmtPrinter.cpp +++
b/clang/lib/AST/StmtPrinter.cpp @@ -1833,7 +1833,7 @@ void StmtPrinter::VisitAtomicExpr(AtomicExpr *Node) { case AtomicExpr::AO ## ID: \ Name = #ID "("; \ break; -#include "clang/Basic/Builtins.def" +#include "clang/Basic/Builtins.inc" } OS << Name; diff --git a/clang/lib/Basic/Builtins.cpp b/clang/lib/Basic/Builtins.cpp index d366989bafc56..3467847ac1672 100644 --- a/clang/lib/Basic/Builtins.cpp +++ b/clang/lib/Basic/Builtins.cpp @@ -38,7 +38,7 @@ static constexpr Builtin::Info BuiltinInfo[] = { {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, LANGS}, #define LIBBUILTIN(ID, TYPE, ATTRS, HEADER, LANGS) \ {#ID, TYPE, ATTRS, nullptr, HeaderDesc::HEADER, LANGS}, -#include "clang/Basic/Builtins.def" +#include "clang/Basic/Builtins.inc" }; const Builtin::Info &Builtin::Context::getRecord(unsigned ID) const { diff --git a/clang/lib/Basic/Targets/BPF.cpp b/clang/lib/Basic/Targets/BPF.cpp index e713e08479223..e3fbbb720d069 100644 --- a/clang/lib/Basic/Targets/BPF.cpp +++ b/clang/lib/Basic/Targets/BPF.cpp @@ -22,7 +22,7 @@ using namespace clang::targets; static constexpr Builtin::Info BuiltinInfo[] = { #define BUILTIN(ID, TYPE, ATTRS) \ {#ID, TYPE, ATTRS, nullptr, HeaderDesc::NO_HEADER, ALL_LANGUAGES}, -#include "clang/Basic/BuiltinsBPF.def" +#include "clang/Basic/BuiltinsBPF.inc" }; void BPFTargetInfo::getTargetDefines(const LangOptions &Opts, diff --git a/clang/lib/Sema/OpenCLBuiltins.td b/clang/lib/Sema/OpenCLBuiltins.td index 0cceba090bd8f..a7bdfe20b9828 100644 --- a/clang/lib/Sema/OpenCLBuiltins.td +++ b/clang/lib/Sema/OpenCLBuiltins.td @@ -936,7 +936,7 @@ def : Builtin<"read_mem_fence", [Void, MemFenceFlags]>; def : Builtin<"write_mem_fence", [Void, MemFenceFlags]>; // OpenCL v3.0 s6.15.10 - Address Space Qualifier Functions. -// to_global, to_local, to_private are declared in Builtins.def. +// to_global, to_local, to_private are declared in Builtins.td. 
let Extension = FuncExtOpenCLCGenericAddressSpace in { // The OpenCL 3.0 specification defines these with a "gentype" argument indicating any builtin @@ -1448,25 +1448,25 @@ let Extension = FuncExtOpenCLCWGCollectiveFunctions in { //-------------------------------------------------------------------- // OpenCL2.0 : 6.13.16 : Pipe Functions // --- Table 27 --- -// Defined in Builtins.def +// Defined in Builtins.td // --- Table 28 --- -// Builtins taking pipe arguments are defined in Builtins.def +// Builtins taking pipe arguments are defined in Builtins.td let Extension = FuncExtOpenCLCPipes in { def : Builtin<"is_valid_reserve_id", [Bool, ReserveId]>; } // --- Table 29 --- -// Defined in Builtins.def +// Defined in Builtins.td //-------------------------------------------------------------------- // OpenCL2.0 : 6.13.17 : Enqueuing Kernels // --- Table 30 --- -// Defined in Builtins.def +// Defined in Builtins.td // --- Table 32 --- -// Defined in Builtins.def +// Defined in Builtins.td // --- Table 33 --- let Extension = FuncExtOpenCLCDeviceEnqueue in { diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index 7833d5a2ea20e..4d280f25cc04c 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -2471,7 +2471,7 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, #define ATOMIC_BUILTIN(ID, TYPE, ATTRS) \ case Builtin::BI##ID: \ return SemaAtomicOpsOverloaded(TheCallResult, AtomicExpr::AO##ID); -#include "clang/Basic/Builtins.def" +#include "clang/Basic/Builtins.inc" case Builtin::BI__annotation: if (SemaBuiltinMSVCAnnotation(*this, TheCall)) return ExprError(); @@ -7869,18 +7869,18 @@ ExprResult Sema::BuildAtomicExpr(SourceRange CallRange, SourceRange ExprRange, static_assert(sizeof(NumArgs)/sizeof(NumArgs[0]) == NumForm && sizeof(NumVals)/sizeof(NumVals[0]) == NumForm, "need to update code for modified forms"); - static_assert(AtomicExpr::AO__c11_atomic_init == 0 && - AtomicExpr::AO__c11_atomic_fetch_min + 1 == - AtomicExpr::AO__atomic_load, + static_assert(AtomicExpr::AO__atomic_add_fetch == 0 && + AtomicExpr::AO__atomic_xor_fetch + 1 == + AtomicExpr::AO__c11_atomic_compare_exchange_strong, "need to update code for modified C11 atomics"); - bool IsOpenCL = Op >= AtomicExpr::AO__opencl_atomic_init && - Op <= AtomicExpr::AO__opencl_atomic_fetch_max; - bool IsHIP = Op >= AtomicExpr::AO__hip_atomic_load && - Op <= AtomicExpr::AO__hip_atomic_fetch_max; - bool IsScoped = Op >= AtomicExpr::AO__scoped_atomic_load && - Op <= AtomicExpr::AO__scoped_atomic_fetch_max; - bool IsC11 = (Op >= AtomicExpr::AO__c11_atomic_init && - Op <= AtomicExpr::AO__c11_atomic_fetch_min) || + bool IsOpenCL = Op >= AtomicExpr::AO__opencl_atomic_compare_exchange_strong && + Op <= AtomicExpr::AO__opencl_atomic_store; + bool IsHIP = Op >= AtomicExpr::AO__hip_atomic_compare_exchange_strong && + Op <= AtomicExpr::AO__hip_atomic_store; + bool IsScoped = Op >= AtomicExpr::AO__scoped_atomic_add_fetch && + Op <= AtomicExpr::AO__scoped_atomic_xor_fetch; + bool IsC11 = (Op >= AtomicExpr::AO__c11_atomic_compare_exchange_strong && + Op <= AtomicExpr::AO__c11_atomic_store) || IsOpenCL; bool IsN = Op == AtomicExpr::AO__atomic_load_n || Op == AtomicExpr::AO__atomic_store_n || diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 6413a48f809ac..695b871d53abf 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -7504,7 +7504,7 @@ ExprResult Sema::BuildResolvedCallExpr(Expr *Fn, NamedDecl *NDecl, // Extract the 
return type from the (builtin) function pointer type. // FIXME Several builtins still have setType in // Sema::CheckBuiltinFunctionCall. One should review their definitions in - // Builtins.def to ensure they are correct before removing setType calls. + // Builtins.td to ensure they are correct before removing setType calls. QualType FnPtrTy = Context.getPointerType(FDecl->getType()); Result = ImpCastExprToType(Fn, FnPtrTy, CK_BuiltinFnToFnPtr).get(); ResultTy = FDecl->getCallResultType(); diff --git a/clang/test/Analysis/bstring.c b/clang/test/Analysis/bstring.c index 5d86241a4ac9a..f015e0b5d9fb7 100644 --- a/clang/test/Analysis/bstring.c +++ b/clang/test/Analysis/bstring.c @@ -4,7 +4,7 @@ // RUN: -analyzer-checker=alpha.unix.cstring \ // RUN: -analyzer-disable-checker=alpha.unix.cstring.UninitializedRead \ // RUN: -analyzer-checker=debug.ExprInspection \ -// RUN: -analyzer-config eagerly-assume=false +// RUN: -analyzer-config eagerly-assume=false // // RUN: %clang_analyze_cc1 -verify %s -DUSE_BUILTINS \ // RUN: -analyzer-checker=core \ @@ -131,7 +131,7 @@ void memcpy5(void) { void memcpy6(void) { int a[4] = {0}; - memcpy(a, a, 8); // expected-warning{{overlapping}} + memcpy(a, a, 8); // expected-warning{{overlapping}} } void memcpy7(void) { @@ -257,7 +257,7 @@ void mempcpy5(void) { void mempcpy6(void) { int a[4] = {0}; - mempcpy(a, a, 8); // expected-warning{{overlapping}} + mempcpy(a, a, 8); // expected-warning{{overlapping}} } void mempcpy7(void) { @@ -319,7 +319,7 @@ void mempcpy15(void) { p1 = (&s2) + 1; p2 = mempcpy(&s2, &s1, sizeof(struct st)); - + clang_analyzer_eval(p1 == p2); // expected-warning{{TRUE}} } @@ -405,7 +405,7 @@ void memmove2 (void) { #define bcmp BUILTIN(bcmp) int bcmp(const void *s1, const void *s2, size_t n); #define memcmp bcmp -// +// #else /* VARIANT */ #define memcmp BUILTIN(memcmp) diff --git a/clang/test/CodeGen/callback_pthread_create.c b/clang/test/CodeGen/callback_pthread_create.c index 80457cb3ade3b..31bc2e9098f45 100644 --- a/clang/test/CodeGen/callback_pthread_create.c +++ b/clang/test/CodeGen/callback_pthread_create.c @@ -1,4 +1,4 @@ -// FIXME: pthread_create() definition in Builtins.def doesn't match the real one, so it doesn't get recognized as a builtin and attributes aren't added. +// FIXME: pthread_create() definition in Builtins.td doesn't match the real one, so it doesn't get recognized as a builtin and attributes aren't added. // RUN: false // XFAIL: * diff --git a/clang/utils/TableGen/CMakeLists.txt b/clang/utils/TableGen/CMakeLists.txt index f136cbfee5fa1..2ca4a96cadb67 100644 --- a/clang/utils/TableGen/CMakeLists.txt +++ b/clang/utils/TableGen/CMakeLists.txt @@ -7,6 +7,7 @@ add_tablegen(clang-tblgen CLANG ClangASTNodesEmitter.cpp ClangASTPropertiesEmitter.cpp ClangAttrEmitter.cpp + ClangBuiltinsEmitter.cpp ClangCommentCommandInfoEmitter.cpp ClangCommentHTMLNamedCharacterReferenceEmitter.cpp ClangCommentHTMLTagsEmitter.cpp diff --git a/clang/utils/TableGen/ClangBuiltinsEmitter.cpp b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp new file mode 100644 index 0000000000000..dc10fa14c5959 --- /dev/null +++ b/clang/utils/TableGen/ClangBuiltinsEmitter.cpp @@ -0,0 +1,334 @@ +//=- ClangBuiltinsEmitter.cpp - Generate Clang builtins tables -*- C++ -*-====// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This tablegen backend emits Clang's builtins tables. +// +//===----------------------------------------------------------------------===// + +#include "TableGenBackends.h" +#include "llvm/ADT/StringSwitch.h" +#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" +#include "llvm/TableGen/TableGenBackend.h" + +using namespace llvm; + +namespace { +enum class BuiltinType { + Builtin, + AtomicBuiltin, + LibBuiltin, + LangBuiltin, + TargetBuiltin, +}; + +class PrototypeParser { +public: + PrototypeParser(StringRef Substitution, const Record *Builtin) + : Loc(Builtin->getFieldLoc("Prototype")), Substitution(Substitution) { + ParsePrototype(Builtin->getValueAsString("Prototype")); + } + +private: + void ParsePrototype(StringRef Prototype) { + Prototype = Prototype.trim(); + ParseTypes(Prototype); + } + + void ParseTypes(StringRef &Prototype) { + auto ReturnType = Prototype.take_until([](char c) { return c == '('; }); + ParseType(ReturnType); + Prototype = Prototype.drop_front(ReturnType.size() + 1); + if (!Prototype.ends_with(")")) + PrintFatalError(Loc, "Expected closing brace at end of prototype"); + Prototype = Prototype.drop_back(); + for (auto T = Prototype.split(','); !T.first.empty(); + Prototype = T.second, T = Prototype.split(',')) + ParseType(T.first); + } + + void ParseType(StringRef T) { + T = T.trim(); + if (T.consume_back("*")) { + ParseType(T); + Type += "*"; + } else if (T.consume_back("const")) { + ParseType(T); + Type += "C"; + } else if (T.consume_back("volatile")) { + ParseType(T); + Type += "D"; + } else if (T.consume_back("restrict")) { + ParseType(T); + Type += "R"; + } else if (T.consume_back("&")) { + ParseType(T); + Type += "&"; + } else if (T.consume_front("long")) { + Type += "L"; + ParseType(T); + } else if (T.consume_front("unsigned")) { + Type += "U"; + ParseType(T); + } else if (T.consume_front("_Complex")) { + Type += "X"; + ParseType(T); + } else if (T.consume_front("_Constant")) { + Type += "I"; + ParseType(T); + } else if (T.consume_front("T")) { + if (Substitution.empty()) + PrintFatalError(Loc, "Not a template"); + ParseType(Substitution); + } else { + auto ReturnTypeVal = StringSwitch<std::string>(T) + .Case("__builtin_va_list_ref", "A") + .Case("__builtin_va_list", "a") + .Case("__float128", "LLd") + .Case("__fp16", "h") + .Case("__int128_t", "LLLi") + .Case("_Float16", "x") + .Case("bool", "b") + .Case("char", "c") + .Case("constant_CFString", "F") + .Case("double", "d") + .Case("FILE", "P") + .Case("float", "f") + .Case("id", "G") + .Case("int", "i") + .Case("int32_t", "Zi") + .Case("int64_t", "Wi") + .Case("jmp_buf", "J") + .Case("msint32_t", "Ni") + .Case("msuint32_t", "UNi") + .Case("objc_super", "M") + .Case("pid_t", "p") + .Case("ptrdiff_t", "Y") + .Case("SEL", "H") + .Case("short", "s") + .Case("sigjmp_buf", "SJ") + .Case("size_t", "z") + .Case("ucontext_t", "K") + .Case("uint32_t", "UZi") + .Case("uint64_t", "UWi") + .Case("void", "v") + .Case("wchar_t", "w") + .Case("...", ".") + .Default("error"); + if (ReturnTypeVal == "error") + PrintFatalError(Loc, "Unknown Type: " + T); + Type += ReturnTypeVal; + } + } + +public: + void Print(llvm::raw_ostream &OS) const { OS << ", \"" << Type << '\"'; } + +private: + SMLoc Loc; + StringRef Substitution; + std::string Type; +}; + +class HeaderNameParser { +public: + HeaderNameParser(const Record *Builtin) { + for (char c :
Builtin->getValueAsString("Header")) { + if (std::islower(c)) + HeaderName += static_cast<char>(std::toupper(c)); + else if (c == '.' || c == '_' || c == '/' || c == '-') + HeaderName += '_'; + else + PrintFatalError(Builtin->getLoc(), "Unexpected header name"); + } + } + + void Print(llvm::raw_ostream &OS) const { OS << HeaderName; } + +private: + std::string HeaderName; +}; + +void PrintAttributes(const Record *Builtin, BuiltinType BT, + llvm::raw_ostream &OS) { + OS << '\"'; + if (Builtin->isSubClassOf("LibBuiltin")) { + if (BT == BuiltinType::LibBuiltin) { + OS << 'f'; + } else { + OS << 'F'; + if (Builtin->getValueAsBit("OnlyBuiltinPrefixedAliasIsConstexpr")) + OS << 'E'; + } + } + + if (auto NS = Builtin->getValueAsOptionalString("Namespace")) { + if (NS != "std") + PrintFatalError(Builtin->getFieldLoc("Namespace"), "Unknown namespace: "); + OS << "z"; + } + + for (const auto *Attr : Builtin->getValueAsListOfDefs("Attributes")) { + OS << Attr->getValueAsString("Mangling"); + if (Attr->isSubClassOf("IndexedAttribute")) + OS << ':' << Attr->getValueAsInt("Index") << ':'; + } + OS << '\"'; +} + +void EmitBuiltinDef(llvm::raw_ostream &OS, StringRef Substitution, + const Record *Builtin, Twine Spelling, BuiltinType BT) { + if (Builtin->getValueAsBit("RequiresUndef")) + OS << "#undef " << Spelling << '\n'; + switch (BT) { + case BuiltinType::LibBuiltin: + OS << "LIBBUILTIN"; + break; + case BuiltinType::LangBuiltin: + OS << "LANGBUILTIN"; + break; + case BuiltinType::Builtin: + OS << "BUILTIN"; + break; + case BuiltinType::AtomicBuiltin: + OS << "ATOMIC_BUILTIN"; + break; + case BuiltinType::TargetBuiltin: + OS << "TARGET_BUILTIN"; + break; + } + + OS << "(" << Spelling; + PrototypeParser{Substitution, Builtin}.Print(OS); + OS << ", "; + PrintAttributes(Builtin, BT, OS); + + switch (BT) { + case BuiltinType::LibBuiltin: { + OS << ", "; + HeaderNameParser{Builtin}.Print(OS); + [[fallthrough]]; + } + case BuiltinType::LangBuiltin: { + OS << ", " << Builtin->getValueAsString("Languages"); + break; + } + case BuiltinType::TargetBuiltin: + OS << ", \"\""; + break; + case BuiltinType::AtomicBuiltin: + case BuiltinType::Builtin: + break; + } + OS << ")\n"; +} + +struct TemplateInsts { + std::vector<std::string> Substitution; + std::vector<std::string> Affix; + bool IsPrefix; +}; + +TemplateInsts getTemplateInsts(const Record *R) { + TemplateInsts temp; + auto Substitutions = R->getValueAsListOfStrings("Substitutions"); + auto Affixes = R->getValueAsListOfStrings("Affixes"); + temp.IsPrefix = R->getValueAsBit("AsPrefix"); + + if (Substitutions.size() != Affixes.size()) + PrintFatalError(R->getLoc(), "Substitutions and affixes " + "don't have the same length"); + + for (auto [Affix, Substitution] : llvm::zip(Affixes, Substitutions)) { + temp.Substitution.emplace_back(Substitution); + temp.Affix.emplace_back(Affix); + } + return temp; +} + +void EmitBuiltin(llvm::raw_ostream &OS, const Record *Builtin) { + TemplateInsts Templates = {}; + if (Builtin->isSubClassOf("Template")) { + Templates = getTemplateInsts(Builtin); + } else { + Templates.Affix.emplace_back(); + Templates.Substitution.emplace_back(); + } + + for (auto [Substitution, Affix] : + llvm::zip(Templates.Substitution, Templates.Affix)) { + for (StringRef Spelling : Builtin->getValueAsListOfStrings("Spellings")) { + auto FullSpelling = + (Templates.IsPrefix ?
Affix + Spelling : Spelling + Affix).str(); + BuiltinType BT = BuiltinType::Builtin; + if (Builtin->isSubClassOf("AtomicBuiltin")) { + BT = BuiltinType::AtomicBuiltin; + } else if (Builtin->isSubClassOf("LangBuiltin")) { + BT = BuiltinType::LangBuiltin; + } else if (Builtin->isSubClassOf("TargetBuiltin")) { + BT = BuiltinType::TargetBuiltin; + } else if (Builtin->isSubClassOf("LibBuiltin")) { + BT = BuiltinType::LibBuiltin; + if (Builtin->getValueAsBit("AddBuiltinPrefixedAlias")) + EmitBuiltinDef(OS, Substitution, Builtin, + std::string("__builtin_") + FullSpelling, + BuiltinType::Builtin); + } + EmitBuiltinDef(OS, Substitution, Builtin, FullSpelling, BT); + } + } +} +} // namespace + +void clang::EmitClangBuiltins(llvm::RecordKeeper &Records, + llvm::raw_ostream &OS) { + emitSourceFileHeader("List of builtins that Clang recognizes", OS); + + OS << R"c++( +#if defined(BUILTIN) && !defined(LIBBUILTIN) +# define LIBBUILTIN(ID, TYPE, ATTRS, HEADER, BUILTIN_LANG) BUILTIN(ID, TYPE, ATTRS) +#endif + +#if defined(BUILTIN) && !defined(LANGBUILTIN) +# define LANGBUILTIN(ID, TYPE, ATTRS, BUILTIN_LANG) BUILTIN(ID, TYPE, ATTRS) +#endif + +// Some of our atomics builtins are handled by AtomicExpr rather than +// as normal builtin CallExprs. This macro is used for such builtins. +#ifndef ATOMIC_BUILTIN +# define ATOMIC_BUILTIN(ID, TYPE, ATTRS) BUILTIN(ID, TYPE, ATTRS) +#endif + +#if defined(BUILTIN) && !defined(TARGET_BUILTIN) +# define TARGET_BUILTIN(ID, TYPE, ATTRS, FEATURE) BUILTIN(ID, TYPE, ATTRS) +#endif +)c++"; + + // AtomicBuiltins are order dependent + // emit them first to make manual checking easier + for (const auto *Builtin : Records.getAllDerivedDefinitions("AtomicBuiltin")) + EmitBuiltin(OS, Builtin); + + for (const auto *Builtin : Records.getAllDerivedDefinitions("Builtin")) { + if (Builtin->isSubClassOf("AtomicBuiltin")) + continue; + EmitBuiltin(OS, Builtin); + } + + for (const auto *Entry : Records.getAllDerivedDefinitions("CustomEntry")) { + OS << Entry->getValueAsString("Entry") << '\n'; + } + + OS << R"c++( +#undef ATOMIC_BUILTIN +#undef BUILTIN +#undef LIBBUILTIN +#undef LANGBUILTIN +#undef TARGET_BUILTIN +)c++"; +} diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index 496cb10d14f2d..3a90eee5f1c92 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -1846,7 +1846,7 @@ void MveEmitter::EmitHeader(raw_ostream &OS) { // declared 'static inline' without a body, which is fine // provided clang recognizes them as builtins, and has the // effect that this type signature is used in place of the one - // that Builtins.def didn't provide. That's how we can get + // that Builtins.td didn't provide. That's how we can get // structure types that weren't defined until this header was // included to be part of the type signature of a builtin that // was known to clang already. 
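For a rough sense of the new backend end to end, consider a hypothetical record (not one from this patch) and the Builtins.inc entry it should produce, assuming the type table and attribute manglings defined above. A def such as

def Foo : Builtin {
  let Spellings = ["__builtin_foo"];
  let Attributes = [NoThrow, Const];
  let Prototype = "int(double, char const*)";
}

would be printed by EmitBuiltinDef roughly as

BUILTIN(__builtin_foo, "idcC*", "nc")

where "i" and "d" encode the int return and double parameter, "cC*" encodes char const* (ParseType recurses first and appends the const and pointer markers on the way back out), and "nc" concatenates the Mangling strings of NoThrow and Const.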
diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp index 158d10e2b3d6b..3f22e43b97ecd 100644 --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -49,6 +49,7 @@ enum ActionType { GenClangAttrNodeTraverse, GenClangBasicReader, GenClangBasicWriter, + GenClangBuiltins, GenClangDiagsDefs, GenClangDiagGroups, GenClangDiagsIndexName, @@ -178,6 +179,8 @@ cl::opt<ActionType> Action( "Generate clang attribute text node dumper"), clEnumValN(GenClangAttrNodeTraverse, "gen-clang-attr-node-traverse", "Generate clang attribute traverser"), + clEnumValN(GenClangBuiltins, "gen-clang-builtins", + "Generate clang builtins list"), clEnumValN(GenClangDiagsDefs, "gen-clang-diags-defs", "Generate Clang diagnostics definitions"), clEnumValN(GenClangDiagGroups, "gen-clang-diag-groups", @@ -390,6 +393,9 @@ bool ClangTableGenMain(raw_ostream &OS, RecordKeeper &Records) { case GenClangAttrNodeTraverse: EmitClangAttrNodeTraverse(Records, OS); break; + case GenClangBuiltins: + EmitClangBuiltins(Records, OS); + break; case GenClangDiagsDefs: EmitClangDiagsDefs(Records, OS, ClangComponent); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h index 58a4af3c23a67..ecd506b7b6b50 100644 --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -75,6 +75,8 @@ void EmitClangAttrNodeTraverse(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitClangAttrDocTable(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitClangBuiltins(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); + void EmitClangDiagsDefs(llvm::RecordKeeper &Records, llvm::raw_ostream &OS, const std::string &Component); void EmitClangDiagGroups(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); From 416b079336c6d6e48858f951cd494a7a3577deb8 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 24 Jan 2024 11:23:28 +0100 Subject: [PATCH 763/843] Fix release issue workflow (#79268) Remove the `--phab-token` argument (which currently eats the subsequent "auto" argument, since the token no longer exists) and related code. I think this will fix the workflow failure in https://github.com/llvm/llvm-project/issues/79253#issuecomment-1907679229.
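The "eats the subsequent auto" failure is ordinary argparse behavior: once the secret expands to nothing, the flag's value slot consumes the next command-line token and the positional argument goes unfilled. A minimal sketch of the failure mode (a stand-in script, not the real github-automation.py):

import argparse

parser = argparse.ArgumentParser(prog="release-workflow")
parser.add_argument("--phab-token", type=str)  # optional flag taking one value
parser.add_argument("action")                  # positional, e.g. "auto"

# With RELEASE_WORKFLOW_PHAB_TOKEN gone, the invocation degenerates to
#   release-workflow --phab-token auto
# so "auto" is bound to --phab-token and no positional remains.
try:
    parser.parse_args(["--phab-token", "auto"])
except SystemExit:
    print("argparse error: 'action' was consumed by --phab-token")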
--- .github/workflows/issue-release-workflow.yml | 2 - llvm/utils/git/github-automation.py | 91 -------------------- 2 files changed, 93 deletions(-) diff --git a/.github/workflows/issue-release-workflow.yml b/.github/workflows/issue-release-workflow.yml index b09ef6555fa90..1f45799af1695 100644 --- a/.github/workflows/issue-release-workflow.yml +++ b/.github/workflows/issue-release-workflow.yml @@ -61,7 +61,6 @@ jobs: --token ${{ secrets.RELEASE_WORKFLOW_PUSH_SECRET }} \ release-workflow \ --issue-number ${{ github.event.issue.number }} \ - --phab-token ${{ secrets.RELEASE_WORKFLOW_PHAB_TOKEN }} \ auto create-pull-request: @@ -90,5 +89,4 @@ jobs: --token ${{ secrets.RELEASE_WORKFLOW_PUSH_SECRET }} \ release-workflow \ --issue-number ${{ github.event.issue.number }} \ - --phab-token ${{ secrets.RELEASE_WORKFLOW_PHAB_TOKEN }} \ auto diff --git a/llvm/utils/git/github-automation.py b/llvm/utils/git/github-automation.py index f78d91059ecd3..ffe4e54484fd5 100755 --- a/llvm/utils/git/github-automation.py +++ b/llvm/utils/git/github-automation.py @@ -251,85 +251,6 @@ def setup_llvmbot_git(git_dir="."): config.set_value("user", "email", "llvmbot@llvm.org") -def phab_api_call(phab_token: str, url: str, args: dict) -> dict: - """ - Make an API call to the Phabricator web service and return a dictionary - containing the json response. - """ - data = {"api.token": phab_token} - data.update(args) - response = requests.post(url, data=data) - return response.json() - - -def phab_login_to_github_login( - phab_token: str, repo: github.Repository.Repository, phab_login: str -) -> Optional[str]: - """ - Tries to translate a Phabricator login to a github login by - finding a commit made in Phabricator's Differential. - The commit's SHA1 is then looked up in the github repo and - the committer's login associated with that commit is returned. - - :param str phab_token: The Conduit API token to use for communication with Pabricator - :param github.Repository.Repository repo: The github repo to use when looking for the SHA1 found in Differential - :param str phab_login: The Phabricator login to be translated. - """ - - args = { - "constraints[authors][0]": phab_login, - # PHID for "LLVM Github Monorepo" repository - "constraints[repositories][0]": "PHID-REPO-f4scjekhnkmh7qilxlcy", - "limit": 1, - } - # API documentation: https://reviews.llvm.org/conduit/method/diffusion.commit.search/ - r = phab_api_call( - phab_token, "https://reviews.llvm.org/api/diffusion.commit.search", args - ) - data = r["result"]["data"] - if len(data) == 0: - # Can't find any commits associated with this user - return None - - commit_sha = data[0]["fields"]["identifier"] - committer = repo.get_commit(commit_sha).committer - if not committer: - # This committer had an email address GitHub could not recognize, so - # it can't link the user to a GitHub account. 
- print(f"Warning: Can't find github account for {phab_login}") - return None - return committer.login - - -def phab_get_commit_approvers(phab_token: str, commit: github.Commit.Commit) -> list: - args = {"corpus": commit.commit.message} - # API documentation: https://reviews.llvm.org/conduit/method/differential.parsecommitmessage/ - r = phab_api_call( - phab_token, "https://reviews.llvm.org/api/differential.parsecommitmessage", args - ) - review_id = r["result"]["revisionIDFieldInfo"]["value"] - if not review_id: - # No Phabricator revision for this commit - return [] - - args = {"constraints[ids][0]": review_id, "attachments[reviewers]": True} - # API documentation: https://reviews.llvm.org/conduit/method/differential.revision.search/ - r = phab_api_call( - phab_token, "https://reviews.llvm.org/api/differential.revision.search", args - ) - reviewers = r["result"]["data"][0]["attachments"]["reviewers"]["reviewers"] - accepted = [] - for reviewer in reviewers: - if reviewer["status"] != "accepted": - continue - phid = reviewer["reviewerPHID"] - args = {"constraints[phids][0]": phid} - # API documentation: https://reviews.llvm.org/conduit/method/user.search/ - r = phab_api_call(phab_token, "https://reviews.llvm.org/api/user.search", args) - accepted.append(r["result"]["data"][0]["fields"]["username"]) - return accepted - - def extract_commit_hash(arg: str): """ Extract the commit hash from the argument passed to /action github @@ -364,7 +285,6 @@ def __init__( branch_repo_name: str, branch_repo_token: str, llvm_project_dir: str, - phab_token: str, ) -> None: self._token = token self._repo_name = repo @@ -375,7 +295,6 @@ def __init__( else: self._branch_repo_token = self.token self._llvm_project_dir = llvm_project_dir - self._phab_token = phab_token @property def token(self) -> str: @@ -401,10 +320,6 @@ def branch_repo_token(self) -> str: def llvm_project_dir(self) -> str: return self._llvm_project_dir - @property - def phab_token(self) -> str: - return self._phab_token - @property def repo(self) -> github.Repository.Repository: return github.Github(self.token).get_repo(self.repo_name) @@ -708,11 +623,6 @@ def execute_command(self) -> bool: release_workflow_parser.add_argument( "--issue-number", type=int, required=True, help="The issue number to update" ) -release_workflow_parser.add_argument( - "--phab-token", - type=str, - help="Phabricator conduit API token. See https://reviews.llvm.org/settings/user//page/apitokens/", -) release_workflow_parser.add_argument( "--branch-repo-token", type=str, @@ -759,7 +669,6 @@ def execute_command(self) -> bool: args.branch_repo, args.branch_repo_token, args.llvm_project_dir, - args.phab_token, ) if not release_workflow.release_branch_for_issue: release_workflow.issue_notify_no_milestone(sys.stdin.readlines()) From fe0e632b00e63bda75155a8d1aa16d271d4af728 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 24 Jan 2024 10:38:35 +0000 Subject: [PATCH 764/843] [DebugInfo][RemoveDIs] Support DPValues in HWAsan (#78731) This patch extends HWASAN to support maintenance of debug-info that isn't stored as intrinsics, but is instead in a DPValue object. This is straight-forwards: we collect any such objects in StackInfoBuilder, and apply the same operations to them as we would to dbg.value and similar intrinsics. I've also replaced some calls to getNextNode with debug-info skipping next calls, and use iterators for instruction insertion rather than instruction pointers. 
This avoids any difference in output between intrinsic / non-intrinsic debug-info, but also means that any debug-info comes before code inserted by HWAsan, rather than afterwards. See the test modifications, where the variable assignment (presented as a dbg.value) jumps up over all the code inserted by HWAsan. Seeing how the code inserted by HWAsan is always (AFAIUI) given the source-location of the instruction being instrumented, I don't believe this will have any effect on which lines variable assignments become visible on; it may extend the number of instructions covered by the assignments though. --- .../Transforms/Utils/MemoryTaggingSupport.h | 2 ++ .../Target/AArch64/AArch64StackTagging.cpp | 2 ++ .../Instrumentation/HWAddressSanitizer.cpp | 29 +++++++++++-------- .../Transforms/Utils/MemoryTaggingSupport.cpp | 15 ++++++++++ .../HWAddressSanitizer/RISCV/alloca.ll | 16 +++++----- .../HWAddressSanitizer/alloca.ll | 19 +++++++----- 6 files changed, 55 insertions(+), 28 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h index 990cf60a09c27..eb00e6c4e856d 100644 --- a/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h +++ b/llvm/include/llvm/Transforms/Utils/MemoryTaggingSupport.h @@ -52,6 +52,8 @@ struct AllocaInfo { SmallVector LifetimeStart; SmallVector LifetimeEnd; SmallVector DbgVariableIntrinsics; + // Non-intrinsic records of variable locations. + SmallVector DbgVariableRecords; }; struct StackInfo { diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp index b5b15022cda4f..73bee2f5bfb1c 100644 --- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp +++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp @@ -578,6 +578,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) { // Fixup debug intrinsics to point to the new alloca. for (auto *DVI : Info.DbgVariableIntrinsics) DVI->replaceVariableLocationOp(OldAI, Info.AI); + for (auto *DPV : Info.DbgVariableRecords) + DPV->replaceVariableLocationOp(OldAI, Info.AI); } // If we have instrumented at least one alloca, all unrecognized lifetime diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index efb621cde9065..66c7a2edeb118 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1357,7 +1357,7 @@ Value *HWAddressSanitizer::readRegister(IRBuilder<> &IRB, StringRef Name) { bool HWAddressSanitizer::instrumentLandingPads( SmallVectorImpl &LandingPadVec) { for (auto *LP : LandingPadVec) { - IRBuilder<> IRB(LP->getNextNode()); + IRBuilder<> IRB(LP->getNextNonDebugInstruction()); IRB.CreateCall( HwasanHandleVfork, {readRegister(IRB, (TargetTriple.getArch() == Triple::x86_64) ? "rsp" @@ -1387,7 +1387,7 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, auto N = I++; auto *AI = KV.first; memtag::AllocaInfo &Info = KV.second; - IRBuilder<> IRB(AI->getNextNode()); + IRBuilder<> IRB(AI->getNextNonDebugInstruction()); // Replace uses of the alloca with tagged address. 
Value *Tag = getAllocaTag(IRB, StackTag, N); @@ -1425,17 +1425,22 @@ bool HWAddressSanitizer::instrumentStack(memtag::StackInfo &SInfo, return User != AILong && User != AICast && !isLifetimeIntrinsic(User); }); - for (auto *DDI : Info.DbgVariableIntrinsics) { + // Helper utility for adding DW_OP_LLVM_tag_offset to debug-info records, + // abstracted over whether they're intrinsic-stored or DPValue stored. + auto AnnotateDbgRecord = [&](auto *DPtr) { // Prepend "tag_offset, N" to the dwarf expression. // Tag offset logically applies to the alloca pointer, and it makes sense // to put it at the beginning of the expression. SmallVector NewOps = {dwarf::DW_OP_LLVM_tag_offset, retagMask(N)}; - for (size_t LocNo = 0; LocNo < DDI->getNumVariableLocationOps(); ++LocNo) - if (DDI->getVariableLocationOp(LocNo) == AI) - DDI->setExpression(DIExpression::appendOpsToArg(DDI->getExpression(), - NewOps, LocNo)); - } + for (size_t LocNo = 0; LocNo < DPtr->getNumVariableLocationOps(); ++LocNo) + if (DPtr->getVariableLocationOp(LocNo) == AI) + DPtr->setExpression(DIExpression::appendOpsToArg( + DPtr->getExpression(), NewOps, LocNo)); + }; + + llvm::for_each(Info.DbgVariableIntrinsics, AnnotateDbgRecord); + llvm::for_each(Info.DbgVariableRecords, AnnotateDbgRecord); auto TagEnd = [&](Instruction *Node) { IRB.SetInsertPoint(Node); @@ -1532,8 +1537,8 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, assert(!ShadowBase); - Instruction *InsertPt = &*F.getEntryBlock().begin(); - IRBuilder<> EntryIRB(InsertPt); + BasicBlock::iterator InsertPt = F.getEntryBlock().begin(); + IRBuilder<> EntryIRB(&F.getEntryBlock(), InsertPt); emitPrologue(EntryIRB, /*WithFrameRecord*/ ClRecordStackHistory != none && Mapping.WithFrameRecord && @@ -1552,12 +1557,12 @@ void HWAddressSanitizer::sanitizeFunction(Function &F, // entry block back into the entry block so that they aren't treated as // dynamic allocas. if (EntryIRB.GetInsertBlock() != &F.getEntryBlock()) { - InsertPt = &*F.getEntryBlock().begin(); + InsertPt = F.getEntryBlock().begin(); for (Instruction &I : llvm::make_early_inc_range(*EntryIRB.GetInsertBlock())) { if (auto *AI = dyn_cast(&I)) if (isa(AI->getArraySize())) - I.moveBefore(InsertPt); + I.moveBefore(F.getEntryBlock(), InsertPt); } } diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index f94047633022c..648a5276a3d60 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -149,6 +149,21 @@ void StackInfoBuilder::visit(Instruction &Inst) { } } } + + // Check for non-intrinsic debug-info records. 
+ for (auto &DPV : Inst.getDbgValueRange()) { + for (Value *V : DPV.location_ops()) { + if (auto *AI = dyn_cast_or_null(V)) { + if (!isInterestingAlloca(*AI)) + continue; + AllocaInfo &AInfo = Info.AllocasToInstrument[AI]; + auto &DPVVec = AInfo.DbgVariableRecords; + if (DPVVec.empty() || DPVVec.back() != &DPV) + DPVVec.push_back(&DPV); + } + } + } + Instruction *ExitUntag = getUntagLocationIfFunctionExit(Inst); if (ExitUntag) Info.RetVec.push_back(ExitUntag); diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll index de0bd84e2c579..7c899095ffdef 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/RISCV/alloca.ll @@ -42,6 +42,7 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 +; DYNAMIC-SHADOW-NEXT: call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META11:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG13:![0-9]+]] ; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG10:![0-9]+]] ; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG10]] ; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG10]] @@ -57,7 +58,6 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG10]] ; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG10]] ; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG10]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META11:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG13:![0-9]+]] ; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG10]] ; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] ; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] @@ -77,6 +77,7 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 +; ZERO-BASED-SHADOW-NEXT: call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META11:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG13:![0-9]+]] ; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG10:![0-9]+]] ; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG10]] ; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG10]] @@ -92,7 +93,6 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG10]] ; ZERO-BASED-SHADOW-NEXT: 
[[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG10]] ; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG10]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META11:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG13:![0-9]+]] ; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG10]] ; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] ; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] @@ -153,10 +153,10 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: !2, file: !2, line: 4, type: !8, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !3) ; DYNAMIC-SHADOW: [[META8:![0-9]+]] = !DISubroutineType(types: !9) ; DYNAMIC-SHADOW: [[META9:![0-9]+]] = !{null} -; DYNAMIC-SHADOW: [[DBG10]] = !DILocation(line: 7, column: 5, scope: !7) -; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: !7, file: !2, line: 5, type: !12) -; DYNAMIC-SHADOW: [[META12:![0-9]+]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: !7, file: !2, line: 5, type: [[META12:![0-9]+]]) +; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) ; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 0, scope: !7) +; DYNAMIC-SHADOW: [[DBG10]] = !DILocation(line: 7, column: 5, scope: !7) ; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: !7) ;. ; ZERO-BASED-SHADOW: [[META0:![0-9]+]] = !{ptr @hwasan.note} @@ -169,9 +169,9 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: !2, file: !2, line: 4, type: !8, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !3) ; ZERO-BASED-SHADOW: [[META8:![0-9]+]] = !DISubroutineType(types: !9) ; ZERO-BASED-SHADOW: [[META9:![0-9]+]] = !{null} -; ZERO-BASED-SHADOW: [[DBG10]] = !DILocation(line: 7, column: 5, scope: !7) -; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: !7, file: !2, line: 5, type: !12) -; ZERO-BASED-SHADOW: [[META12:![0-9]+]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: !7, file: !2, line: 5, type: [[META12:![0-9]+]]) +; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) ; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 0, scope: !7) +; ZERO-BASED-SHADOW: [[DBG10]] = !DILocation(line: 7, column: 5, scope: !7) ; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: !7) ;. 
diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll index 7707c90506e3e..8db14222652f5 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/alloca.ll @@ -5,6 +5,9 @@ ; RUN: opt < %s -passes=hwasan -hwasan-with-ifunc=1 -S | FileCheck %s --check-prefixes=DYNAMIC-SHADOW ; RUN: opt < %s -passes=hwasan -hwasan-mapping-offset=0 -S | FileCheck %s --check-prefixes=ZERO-BASED-SHADOW +; RUN: opt < %s -passes=hwasan -hwasan-with-ifunc=1 -S --try-experimental-debuginfo-iterators | FileCheck %s --check-prefixes=DYNAMIC-SHADOW +; RUN: opt < %s -passes=hwasan -hwasan-mapping-offset=0 -S --try-experimental-debuginfo-iterators | FileCheck %s --check-prefixes=ZERO-BASED-SHADOW + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android10000" @@ -40,6 +43,7 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; DYNAMIC-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; DYNAMIC-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 +; DYNAMIC-SHADOW-NEXT: call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META11:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG13:![0-9]+]] ; DYNAMIC-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG10:![0-9]+]] ; DYNAMIC-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG10]] ; DYNAMIC-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG10]] @@ -55,7 +59,6 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; DYNAMIC-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG10]] ; DYNAMIC-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG10]] ; DYNAMIC-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG10]] -; DYNAMIC-SHADOW-NEXT: call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META11:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG13:![0-9]+]] ; DYNAMIC-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG10]] ; DYNAMIC-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] ; DYNAMIC-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] @@ -75,6 +78,7 @@ define void @test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_STACK_BASE_TAG:%.*]] = xor i64 [[TMP1]], [[TMP2]] ; ZERO-BASED-SHADOW-NEXT: [[HWASAN_UAR_TAG:%.*]] = lshr i64 [[TMP1]], 56 ; ZERO-BASED-SHADOW-NEXT: [[X:%.*]] = alloca { i32, [12 x i8] }, align 16 +; ZERO-BASED-SHADOW-NEXT: call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META11:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG13:![0-9]+]] ; ZERO-BASED-SHADOW-NEXT: [[TMP3:%.*]] = xor i64 [[HWASAN_STACK_BASE_TAG]], 0, !dbg [[DBG10:![0-9]+]] ; ZERO-BASED-SHADOW-NEXT: [[TMP4:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG10]] ; ZERO-BASED-SHADOW-NEXT: [[TMP5:%.*]] = and i64 [[TMP4]], 72057594037927935, !dbg [[DBG10]] @@ -90,7 +94,6 @@ define void 
@test_alloca() sanitize_hwaddress !dbg !15 { ; ZERO-BASED-SHADOW-NEXT: store i8 4, ptr [[TMP13]], align 1, !dbg [[DBG10]] ; ZERO-BASED-SHADOW-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[X]], i32 15, !dbg [[DBG10]] ; ZERO-BASED-SHADOW-NEXT: store i8 [[TMP8]], ptr [[TMP14]], align 1, !dbg [[DBG10]] -; ZERO-BASED-SHADOW-NEXT: call void @llvm.dbg.value(metadata !DIArgList(ptr [[X]], ptr [[X]]), metadata [[META11:![0-9]+]], metadata !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_LLVM_tag_offset, 0, DW_OP_LLVM_arg, 1, DW_OP_LLVM_tag_offset, 0, DW_OP_plus, DW_OP_deref)), !dbg [[DBG13:![0-9]+]] ; ZERO-BASED-SHADOW-NEXT: call void @use32(ptr nonnull [[X_HWASAN]]), !dbg [[DBG10]] ; ZERO-BASED-SHADOW-NEXT: [[TMP15:%.*]] = trunc i64 [[HWASAN_UAR_TAG]] to i8, !dbg [[DBG14:![0-9]+]] ; ZERO-BASED-SHADOW-NEXT: [[TMP16:%.*]] = ptrtoint ptr [[X]] to i64, !dbg [[DBG14]] @@ -151,10 +154,10 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; DYNAMIC-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: !2, file: !2, line: 4, type: !8, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !3) ; DYNAMIC-SHADOW: [[META8:![0-9]+]] = !DISubroutineType(types: !9) ; DYNAMIC-SHADOW: [[META9:![0-9]+]] = !{null} -; DYNAMIC-SHADOW: [[DBG10]] = !DILocation(line: 7, column: 5, scope: !7) -; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: !7, file: !2, line: 5, type: !12) -; DYNAMIC-SHADOW: [[META12:![0-9]+]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; DYNAMIC-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: !7, file: !2, line: 5, type: ![[META12:[0-9]+]]) +; DYNAMIC-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) ; DYNAMIC-SHADOW: [[DBG13]] = !DILocation(line: 0, scope: !7) +; DYNAMIC-SHADOW: [[DBG10]] = !DILocation(line: 7, column: 5, scope: !7) ; DYNAMIC-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: !7) ;. ; ZERO-BASED-SHADOW: [[META0:![0-9]+]] = !{ptr @hwasan.note} @@ -167,9 +170,9 @@ declare void @llvm.dbg.value(metadata, metadata, metadata) ; ZERO-BASED-SHADOW: [[DBG7]] = distinct !DISubprogram(name: "test_alloca", linkageName: "_Z11test_allocav", scope: !2, file: !2, line: 4, type: !8, scopeLine: 4, flags: DIFlagPrototyped, spFlags: DISPFlagDefinition, unit: !1, retainedNodes: !3) ; ZERO-BASED-SHADOW: [[META8:![0-9]+]] = !DISubroutineType(types: !9) ; ZERO-BASED-SHADOW: [[META9:![0-9]+]] = !{null} -; ZERO-BASED-SHADOW: [[DBG10]] = !DILocation(line: 7, column: 5, scope: !7) -; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: !7, file: !2, line: 5, type: !12) -; ZERO-BASED-SHADOW: [[META12:![0-9]+]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +; ZERO-BASED-SHADOW: [[META11]] = !DILocalVariable(name: "x", scope: !7, file: !2, line: 5, type: ![[META12:[0-9]+]]) +; ZERO-BASED-SHADOW: [[META12]] = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) ; ZERO-BASED-SHADOW: [[DBG13]] = !DILocation(line: 0, scope: !7) +; ZERO-BASED-SHADOW: [[DBG10]] = !DILocation(line: 7, column: 5, scope: !7) ; ZERO-BASED-SHADOW: [[DBG14]] = !DILocation(line: 8, column: 1, scope: !7) ;. 
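
A condensed sketch of the dispatch pattern referenced in the commit
message above: one generic lambda serves both the intrinsic-stored and
the DPValue-stored records, so no common base class is needed. The two
record types here are hypothetical stand-ins for DbgVariableIntrinsic
and DPValue, reduced to the duck-typed interface the lambda uses; this
illustrates the pattern only and is not the pass code.

  #include <cstdio>
  #include <vector>

  // Hypothetical stand-ins for DbgVariableIntrinsic and DPValue, reduced to
  // the duck-typed interface the annotation lambda needs.
  struct IntrinsicRecord {
    unsigned getNumVariableLocationOps() const { return 2; }
    void appendTagOffset(unsigned LocNo) {
      std::printf("intrinsic: tag location op %u\n", LocNo);
    }
  };
  struct DPValueRecord {
    unsigned getNumVariableLocationOps() const { return 1; }
    void appendTagOffset(unsigned LocNo) {
      std::printf("DPValue: tag location op %u\n", LocNo);
    }
  };

  int main() {
    IntrinsicRecord DVI;
    DPValueRecord DPV;
    std::vector<IntrinsicRecord *> Intrinsics{&DVI};
    std::vector<DPValueRecord *> Records{&DPV};

    // One generic lambda, instantiated once per pointee type; the two record
    // kinds need no common base class. This mirrors the role AnnotateDbgRecord
    // plays in the patch.
    auto Annotate = [](auto *R) {
      for (unsigned LocNo = 0; LocNo < R->getNumVariableLocationOps(); ++LocNo)
        R->appendTagOffset(LocNo);
    };
    for (auto *R : Intrinsics)
      Annotate(R);
    for (auto *R : Records)
      Annotate(R);
    return 0;
  }
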
From fb9a82b0235713782c1cf9d1eba20ce8d95766f7 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?=
Date: Tue, 23 Jan 2024 19:36:12 +0100
Subject: [PATCH 765/843] [clang-repl] Refine fix for linker error: PLT offset
 too large

This is a follow-up improvement after the discussion in #78959

---
 clang/tools/clang-repl/CMakeLists.txt | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/clang/tools/clang-repl/CMakeLists.txt b/clang/tools/clang-repl/CMakeLists.txt
index 031dcaba5e446..d3dec1984b78d 100644
--- a/clang/tools/clang-repl/CMakeLists.txt
+++ b/clang/tools/clang-repl/CMakeLists.txt
@@ -23,12 +23,13 @@ if(CLANG_PLUGIN_SUPPORT)
   export_executable_symbols_for_plugins(clang-repl)
 endif()
 
-string(TOUPPER "${CMAKE_SYSTEM_PROCESSOR}" system_processor)
-if(system_processor MATCHES "ARM")
-  set(FLAG_LONG_PLT "-Wl,--long-plt")
-  llvm_check_linker_flag(CXX ${FLAG_LONG_PLT} LINKER_HAS_FLAG_LONG_PLT)
-  # Linkers without this flag are assumed to have long PLTs by default
-  if(LINKER_HAS_FLAG_LONG_PLT)
-    target_link_options(clang-repl PRIVATE ${FLAG_LONG_PLT})
-  endif()
+# The clang-repl binary can get huge with static linking in debug mode.
+# Some 32-bit targets use PLT slots with limited branch range by default and we
+# start to exceed this limit, e.g. when linking for arm-linux-gnueabihf with
+# gold. This flag tells the linker to build a PLT for the full address range.
+# Linkers without this flag are assumed to support proper PLTs by default.
+set(flag_long_plt "-Wl,--long-plt")
+llvm_check_linker_flag(CXX ${flag_long_plt} HAVE_LINKER_FLAG_LONG_PLT)
+if(HAVE_LINKER_FLAG_LONG_PLT)
+  target_link_options(clang-repl PRIVATE ${flag_long_plt})
 endif()

From 383d488b0bd68f1abd58c2d0114f82c54ee286d1 Mon Sep 17 00:00:00 2001
From: Kareem Ergawy
Date: Wed, 24 Jan 2024 11:52:06 +0100
Subject: [PATCH 766/843] [openmp][flang][offloading] Do not use fixed device
 IDs in checks (#78973)

Fixes a small issue in an offloading test where the test depended on
the host and device being assigned certain numeric IDs. This, however,
is not stable and fails in situations where any of the devices is
assigned an ID different from the expected value. The fix just checks
that offloading succeeded by making sure the IDs are different.

The test was failing locally for me.

---
 .../test/offloading/fortran/target_map_common_block1.f90      | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/openmp/libomptarget/test/offloading/fortran/target_map_common_block1.f90 b/openmp/libomptarget/test/offloading/fortran/target_map_common_block1.f90
index 35bbe511d9314..6aaa66d6449b6 100644
--- a/openmp/libomptarget/test/offloading/fortran/target_map_common_block1.f90
+++ b/openmp/libomptarget/test/offloading/fortran/target_map_common_block1.f90
@@ -20,9 +20,9 @@ program main
     devices(2) = omp_get_device_num()
   !$omp end target
   print *, "var1 after target = ", var1
-  print *, "devices: ", devices
+  print *, "devices are different? ", (devices(1) /= devices(2))
 end program
 
 ! CHECK: var1 before target = 10
 ! CHECK: var1 after target = 20
-! CHECK: devices: 1 0
+! CHECK: devices are different? T

From 91ddcba83ae4385fe771e918c096e6074b411de3 Mon Sep 17 00:00:00 2001
From: Petar Avramovic
Date: Wed, 24 Jan 2024 11:58:32 +0100
Subject: [PATCH 767/843] AMDGPU/GlobalISelDivergenceLowering: select divergent
 i1 phis (#78482)

Implement PhiLoweringHelper for GlobalISel in DivergenceLoweringHelper.
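At the core of the lowering, incoming lane-mask values are merged as
DstReg = (PrevReg & ~EXEC) | (CurReg & EXEC): active lanes take the bit
from CurReg, inactive lanes keep the bit from PrevReg. A minimal
standalone model of that merge with plain 32-bit wave masks follows (an
illustration only, not the pass code; see buildMergeLaneMasks in the
diff for the real thing):

  #include <cassert>
  #include <cstdint>

  // Wave32 lane-mask merge as emitted by the pass
  // (S_ANDN2_B32 / S_AND_B32 / S_OR_B32): inactive lanes keep PrevReg,
  // active lanes take CurReg.
  static uint32_t mergeLaneMasks(uint32_t PrevReg, uint32_t CurReg,
                                 uint32_t Exec) {
    uint32_t PrevMasked = PrevReg & ~Exec; // S_ANDN2_B32
    uint32_t CurMasked = Exec & CurReg;    // S_AND_B32
    return PrevMasked | CurMasked;         // S_OR_B32
  }

  int main() {
    uint32_t Prev = 0xF0F0F0F0, Cur = 0x0000FFFF, Exec = 0x000000FF;
    // Lanes 0-7 are active and take Cur's bits; the rest keep Prev's bits.
    assert(mergeLaneMasks(Prev, Cur, Exec) == 0xF0F0F0FF);
    return 0;
  }
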
Use machine uniformity analysis to find divergent i1 phis and select
them as lane mask phis in the same way SILowerI1Copies selects VReg_1
phis. Note that divergent i1 phis include phis created by LCSSA, and
all cases of uses outside of a cycle are actually covered by "lowering
LCSSA phis". GlobalISel lane masks are registers with sgpr register
class and S1 LLT.

TODO: The general goal is that instructions created in this pass are
fully instruction-selected so that selection of lane mask phis is not
split across multiple passes.

patch 3 from: https://github.com/llvm/llvm-project/pull/73337

---
 .../llvm/CodeGen/MachineRegisterInfo.h        |  11 +
 .../llvm/CodeGen/MachineUniformityAnalysis.h  |  19 +
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      |  11 +
 .../lib/CodeGen/MachineUniformityAnalysis.cpp |  19 -
 .../AMDGPUGlobalISelDivergenceLowering.cpp    | 145 +++++++-
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |   5 +-
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp    |  30 +-
 llvm/lib/Target/AMDGPU/SILowerI1Copies.h      |  11 +-
 ...-divergent-i1-phis-no-lane-mask-merging.ll |   3 +-
 ...divergent-i1-phis-no-lane-mask-merging.mir |  76 ++--
 ...vergence-divergent-i1-used-outside-loop.ll |   3 +-
 ...ergence-divergent-i1-used-outside-loop.mir | 340 +++++++++++++-----
 .../GlobalISel/divergence-structurizer.ll     |   3 +-
 .../GlobalISel/divergence-structurizer.mir    | 292 +++++++++++----
 .../divergence-temporal-divergent-i1.ll       |   3 +-
 .../divergence-temporal-divergent-i1.mir      | 102 ++++--
 .../divergence-temporal-divergent-reg.ll      |   2 +-
 .../divergence-temporal-divergent-reg.mir     |   2 +-
 .../GlobalISel/divergent-control-flow.ll      |   1 +
 .../AMDGPU/GlobalISel/inst-select-phi.mir     |   4 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |   1 +
 21 files changed, 824 insertions(+), 259 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 9bca74a1d4fc8..03ba952c354ae 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -752,6 +752,17 @@ class MachineRegisterInfo {
   Register createVirtualRegister(const TargetRegisterClass *RegClass,
                                  StringRef Name = "");
 
+  /// All available attributes a virtual register can have.
+  struct RegisterAttributes {
+    const RegClassOrRegBank *RCOrRB;
+    LLT Ty;
+  };
+
+  /// createVirtualRegister - Create and return a new virtual register in the
+  /// function with the specified register attributes.
+  Register createVirtualRegister(RegisterAttributes RegAttr,
+                                 StringRef Name = "");
+
   /// Create and return a new virtual register in the function with the same
   /// attributes as the given register.
   Register cloneVirtualRegister(Register VReg, StringRef Name = "");
diff --git a/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
index e6da099751e7a..1039ac4e5189b 100644
--- a/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
+++ b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h
@@ -32,6 +32,25 @@ MachineUniformityInfo computeMachineUniformityInfo(
     MachineFunction &F, const MachineCycleInfo &cycleInfo,
     const MachineDomTree &domTree, bool HasBranchDivergence);
 
+/// Legacy analysis pass which computes a \ref MachineUniformityInfo.
+class MachineUniformityAnalysisPass : public MachineFunctionPass { + MachineUniformityInfo UI; + +public: + static char ID; + + MachineUniformityAnalysisPass(); + + MachineUniformityInfo &getUniformityInfo() { return UI; } + const MachineUniformityInfo &getUniformityInfo() const { return UI; } + + bool runOnMachineFunction(MachineFunction &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + void print(raw_ostream &OS, const Module *M = nullptr) const override; + + // TODO: verify analysis +}; + } // namespace llvm #endif // LLVM_CODEGEN_MACHINEUNIFORMITYANALYSIS_H diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index 087604af6a718..d286128cc89be 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -167,6 +167,17 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass, return Reg; } +/// createVirtualRegister - Create and return a new virtual register in the +/// function with the specified register attributes. +Register MachineRegisterInfo::createVirtualRegister(RegisterAttributes RegAttr, + StringRef Name) { + Register Reg = createIncompleteVirtualRegister(Name); + VRegInfo[Reg].first = *RegAttr.RCOrRB; + setType(Reg, RegAttr.Ty); + noteNewVirtualRegister(Reg); + return Reg; +} + Register MachineRegisterInfo::cloneVirtualRegister(Register VReg, StringRef Name) { Register Reg = createIncompleteVirtualRegister(Name); diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp index 3e0fe2b1ba087..131138e0649e4 100644 --- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp @@ -165,25 +165,6 @@ MachineUniformityInfo llvm::computeMachineUniformityInfo( namespace { -/// Legacy analysis pass which computes a \ref MachineUniformityInfo. 
-class MachineUniformityAnalysisPass : public MachineFunctionPass { - MachineUniformityInfo UI; - -public: - static char ID; - - MachineUniformityAnalysisPass(); - - MachineUniformityInfo &getUniformityInfo() { return UI; } - const MachineUniformityInfo &getUniformityInfo() const { return UI; } - - bool runOnMachineFunction(MachineFunction &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - void print(raw_ostream &OS, const Module *M = nullptr) const override; - - // TODO: verify analysis -}; - class MachineUniformityInfoPrinterPass : public MachineFunctionPass { public: static char ID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp index 4cd8b1ec1051f..4f65a95de82ac 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp @@ -16,7 +16,11 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" +#include "SILowerI1Copies.h" +#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineUniformityAnalysis.h" +#include "llvm/InitializePasses.h" #define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering" @@ -42,14 +46,146 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); + AU.addRequired(); + AU.addRequired(); + AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } }; +class DivergenceLoweringHelper : public PhiLoweringHelper { +public: + DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, + MachineUniformityInfo *MUI); + +private: + MachineUniformityInfo *MUI = nullptr; + MachineIRBuilder B; + Register buildRegCopyToLaneMask(Register Reg); + +public: + void markAsLaneMask(Register DstReg) const override; + void getCandidatesForLowering( + SmallVectorImpl &Vreg1Phis) const override; + void collectIncomingValuesFromPhi( + const MachineInstr *MI, + SmallVectorImpl &Incomings) const override; + void replaceDstReg(Register NewReg, Register OldReg, + MachineBasicBlock *MBB) override; + void buildMergeLaneMasks(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + Register DstReg, Register PrevReg, + Register CurReg) override; + void constrainAsLaneMask(Incoming &In) override; +}; + +DivergenceLoweringHelper::DivergenceLoweringHelper( + MachineFunction *MF, MachineDominatorTree *DT, + MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI) + : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {} + +// _(s1) -> SReg_32/64(s1) +void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const { + assert(MRI->getType(DstReg) == LLT::scalar(1)); + + if (MRI->getRegClassOrNull(DstReg)) { + if (MRI->constrainRegClass(DstReg, ST->getBoolRC())) + return; + llvm_unreachable("Failed to constrain register class"); + } + + MRI->setRegClass(DstReg, ST->getBoolRC()); +} + +void DivergenceLoweringHelper::getCandidatesForLowering( + SmallVectorImpl &Vreg1Phis) const { + LLT S1 = LLT::scalar(1); + + // Add divergent i1 phis to the list + for (MachineBasicBlock &MBB : *MF) { + for (MachineInstr &MI : MBB.phis()) { + Register Dst = MI.getOperand(0).getReg(); + if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst)) + Vreg1Phis.push_back(&MI); + } + } +} + +void 
DivergenceLoweringHelper::collectIncomingValuesFromPhi( + const MachineInstr *MI, SmallVectorImpl &Incomings) const { + for (unsigned i = 1; i < MI->getNumOperands(); i += 2) { + Incomings.emplace_back(MI->getOperand(i).getReg(), + MI->getOperand(i + 1).getMBB(), Register()); + } +} + +void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg, + MachineBasicBlock *MBB) { + BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg) + .addReg(NewReg); +} + +// Copy Reg to new lane mask register, insert a copy after instruction that +// defines Reg while skipping phis if needed. +Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) { + Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs); + MachineInstr *Instr = MRI->getVRegDef(Reg); + MachineBasicBlock *MBB = Instr->getParent(); + B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator()))); + B.buildCopy(LaneMask, Reg); + return LaneMask; +} + +// bb.previous +// %PrevReg = ... +// +// bb.current +// %CurReg = ... +// +// %DstReg - not defined +// +// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT) +// +// bb.previous +// %PrevReg = ... +// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg +// +// bb.current +// %CurReg = ... +// %CurRegCopy:sreg_32(s1) = COPY %CurReg +// ... +// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0 +// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0 +// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg +// +// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg +void DivergenceLoweringHelper::buildMergeLaneMasks( + MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, + Register DstReg, Register PrevReg, Register CurReg) { + // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC) + // TODO: check if inputs are constants or results of a compare. + + Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg); + Register CurRegCopy = buildRegCopyToLaneMask(CurReg); + Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); + Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); + + B.setInsertPt(MBB, I); + B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg}); + B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy}); + B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg}); +} + +void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { return; } + } // End anonymous namespace. 
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, "AMDGPU GlobalISel divergence lowering", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) +INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass) INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, "AMDGPU GlobalISel divergence lowering", false, false) @@ -64,5 +200,12 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() { bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction( MachineFunction &MF) { - return false; + MachineDominatorTree &DT = getAnalysis(); + MachinePostDominatorTree &PDT = getAnalysis(); + MachineUniformityInfo &MUI = + getAnalysis().getUniformityInfo(); + + DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI); + + return Helper.lowerPhis(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index fdee74d58d269..610cedd4ea6fc 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -210,6 +210,7 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { const Register DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI->getType(DefReg); + if (DefTy == LLT::scalar(1)) { if (!AllowRiskySelect) { LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n"); @@ -3552,8 +3553,6 @@ bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const { } bool AMDGPUInstructionSelector::select(MachineInstr &I) { - if (I.isPHI()) - return selectPHI(I); if (!I.isPreISelOpcode()) { if (I.isCopy()) @@ -3696,6 +3695,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return selectWaveAddress(I); case AMDGPU::G_STACKRESTORE: return selectStackRestore(I); + case AMDGPU::G_PHI: + return selectPHI(I); default: return selectImpl(I, *CoverageInfo); } diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index cfa0c21def791..59843438950ac 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -31,9 +31,9 @@ using namespace llvm; -static Register insertUndefLaneMask(MachineBasicBlock *MBB, - MachineRegisterInfo *MRI, - Register LaneMaskRegAttrs); +static Register +insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI, + MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs); namespace { @@ -78,7 +78,7 @@ class Vreg1LoweringHelper : public PhiLoweringHelper { MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg) override; - void constrainIncomingRegisterTakenAsIs(Incoming &In) override; + void constrainAsLaneMask(Incoming &In) override; bool lowerCopiesFromI1(); bool lowerCopiesToI1(); @@ -304,7 +304,8 @@ class LoopFinder { /// blocks, so that the SSA updater doesn't have to search all the way to the /// function entry. 
void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater, - MachineRegisterInfo &MRI, Register LaneMaskRegAttrs, + MachineRegisterInfo &MRI, + MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs, ArrayRef Incomings = {}) { assert(LoopLevel < CommonDominators.size()); @@ -411,14 +412,15 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { return new SILowerI1Copies(); } -Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI, - Register LaneMaskRegAttrs) { - return MRI->cloneVirtualRegister(LaneMaskRegAttrs); +Register llvm::createLaneMaskReg( + MachineRegisterInfo *MRI, + MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs) { + return MRI->createVirtualRegister(LaneMaskRegAttrs); } -static Register insertUndefLaneMask(MachineBasicBlock *MBB, - MachineRegisterInfo *MRI, - Register LaneMaskRegAttrs) { +static Register +insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI, + MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs) { MachineFunction &MF = *MBB->getParent(); const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -619,7 +621,7 @@ bool PhiLoweringHelper::lowerPhis() { for (auto &Incoming : Incomings) { MachineBasicBlock &IMBB = *Incoming.Block; if (PIA.isSource(IMBB)) { - constrainIncomingRegisterTakenAsIs(Incoming); + constrainAsLaneMask(Incoming); SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg); } else { Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); @@ -911,6 +913,4 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB, } } -void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) { - return; -} +void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {} diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h index 5099d39c2d141..80ea0536b4cbf 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h @@ -31,7 +31,9 @@ struct Incoming { : Reg(Reg), Block(Block), UpdatedReg(UpdatedReg) {} }; -Register createLaneMaskReg(MachineRegisterInfo *MRI, Register LaneMaskRegAttrs); +Register +createLaneMaskReg(MachineRegisterInfo *MRI, + MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs); class PhiLoweringHelper { public: @@ -47,7 +49,7 @@ class PhiLoweringHelper { MachineRegisterInfo *MRI = nullptr; const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; - Register LaneMaskRegAttrs; + MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs; #ifndef NDEBUG DenseSet PhiRegisters; @@ -68,7 +70,8 @@ class PhiLoweringHelper { getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; void initializeLaneMaskRegisterAttributes(Register LaneMask) { - LaneMaskRegAttrs = LaneMask; + LaneMaskRegAttrs.RCOrRB = &MRI->getRegClassOrRegBank(LaneMask); + LaneMaskRegAttrs.Ty = MRI->getType(LaneMask); } bool isLaneMaskReg(Register Reg) const { @@ -91,7 +94,7 @@ class PhiLoweringHelper { MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg) = 0; - virtual void constrainIncomingRegisterTakenAsIs(Incoming &In) = 0; + virtual void constrainAsLaneMask(Incoming &In) = 0; }; } // end namespace llvm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index 7a68aec1a1c55..06a8f80e6aa34 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; REQUIRES: do-not-run-me ; Divergent phis that don't require lowering using lane mask merging diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir index d314ebe355f51..55f22b0bbb4df 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir @@ -1,5 +1,9 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s +# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s + +# Test is updated but copies between S1-register-with-reg-class and +# register-with-reg-class-no-LLT fail machine verification +# REQUIRES: do-not-run-me-with-machine-verifier --- | define void @divergent_i1_phi_uniform_branch() {ret void} @@ -46,7 +50,7 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.4(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0 ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: @@ -126,6 +130,7 @@ body: | ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]] + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]] ; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2 @@ -136,12 +141,17 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]] + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: 
[[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C4]], [[C3]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; GFX10-NEXT: S_ENDPGM 0 bb.0: @@ -191,30 +201,37 @@ body: | ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]] - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32) + ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]] + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]] - ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32) + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]] + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -279,25 +296,28 @@ body: | ; GFX10-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]] + ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5 + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000 - ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI1]](s32), [[C3]] + ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C3]] ; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.4 ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C2]](s1), %bb.1 + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C2]](s1), %bb.1 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C4]] + ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI4]], [[C4]] ; GFX10-NEXT: G_BRCOND [[XOR]](s1), %bb.5 ; GFX10-NEXT: G_BR %bb.3 ; GFX10-NEXT: {{ $}} @@ -320,22 +340,26 @@ body: | ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C8]] - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32) + ; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C8]] + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1) + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C9]] - ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32) + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]] + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: - ; GFX10-NEXT: 
[[PHI4:%[0-9]+]]:_(s1) = G_PHI [[XOR1]](s1), %bb.5 ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5 + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32) ; GFX10-NEXT: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C11]], [[C10]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY10]](s1), [[C11]], [[C10]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -468,7 +492,7 @@ body: | ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0 ; GFX10-NEXT: G_BRCOND [[PHI1]](s1), %bb.5 ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 34dedfe10365f..808c2c2ded201 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; REQUIRES: do-not-run-me ; This file contains various tests that have divergent i1s used outside of ; the loop. 
These are lane masks in sgpr and need to have correct values in

diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
index 92463714ec694..d84ccece81c78 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir
@@ -1,5 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
+
+# Test is updated but copies between S1-register-with-reg-class and
+# register-with-reg-class-no-LLT fail machine verification
+# REQUIRES: do-not-run-me-with-machine-verifier
 
 ---
 name: divergent_i1_phi_used_outside_loop
@@ -19,30 +23,49 @@ body: |
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %36(s1), %bb.1
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.0, %24(s1), %bb.1
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
- ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY7]], [[C2]]
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32)
 ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C3]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.1
 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_2]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY11]](s1), [[C5]], [[C4]]
 ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
 ; GFX10-NEXT: SI_RETURN
 bb.0:
@@ -103,42 +126,67 @@ body: |
 ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1)
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[DEF]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.3
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(p1) = G_PHI [[MV]](p1), %bb.0, %11(p1), %bb.3
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C1]](s1), %bb.0, %13(s1), %bb.3
- ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI2]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %41, %bb.3
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_OR_B32_]](s1), %bb.0, %27(s1), %bb.3
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.3
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(p1) = G_PHI [[MV]](p1), %bb.0, %11(p1), %bb.3
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
 ; GFX10-NEXT: successors: %bb.3(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PHI1]](p1) :: (load (s32), addrspace 1)
+ ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PHI3]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C2]]
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.2, [[PHI2]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]]
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY12]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4
- ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[PHI1]], [[C3]](s64)
+ ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[PHI3]], [[C3]](s64)
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[C4]]
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI2]], [[C4]]
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 10
 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[ADD]](s32), [[C5]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.1
 ; GFX10-NEXT: G_BR %bb.4
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.3
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
 ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C7]], [[C6]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY14]](s1), [[C7]], [[C6]]
 ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
 ; GFX10-NEXT: SI_RETURN
 bb.0:
@@ -211,30 +259,37 @@ body: |
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %24(s1), %bb.1
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
- ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
 ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C5]], [[C4]]
 ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
 ; GFX10-NEXT: SI_RETURN
 bb.0:
@@ -298,6 +353,7 @@ body: |
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.1
 ; GFX10-NEXT: {{ $}}
@@ -305,29 +361,49 @@ body: |
 ; GFX10-NEXT: successors: %bb.3(0x80000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: G_BR %bb.3
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
 ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.8, [[C1]](s1), %bb.0
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %39(s1), %bb.8
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY6]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.5
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.1, %61, %bb.7
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.1, %48, %bb.7
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI5]](s32)
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C4]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C5]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
 ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.4
 ; GFX10-NEXT: {{ $}}
@@ -335,9 +411,17 @@ body: |
 ; GFX10-NEXT: successors: %bb.7(0x80000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1)
 ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C7]]
- ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI2]](s32), [[COPY]]
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI5]], [[C7]]
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI5]](s32), [[COPY]]
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.7
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.5:
@@ -353,22 +437,32 @@ body: |
 ; GFX10-NEXT: bb.7:
 ; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.3(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[C6]](s1), %bb.4, [[C3]](s1), %bb.3
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4, [[C3]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
 ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI4]], [[C9]]
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI5]](s1), [[PHI1]](s32)
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY17]], [[C9]]
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY16]](s1), [[PHI4]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.8
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.8:
 ; GFX10-NEXT: successors: %bb.2(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.7
- ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.7
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
+ ; GFX10-NEXT: [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.7
+ ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1)
+ ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY19]](s1)
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.2
 bb.0:
 successors: %bb.1(0x40000000), %bb.2(0x40000000)
@@ -479,35 +573,40 @@ body: |
 ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
 ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[PHI1]]
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[PHI2]]
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.3
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.4(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
- ; GFX10-NEXT: G_STORE [[PHI1]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: G_STORE [[PHI2]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
 ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI1]]
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI2]]
 ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.5
 ; GFX10-NEXT: {{ $}}
@@ -515,27 +614,35 @@ body: |
 ; GFX10-NEXT: successors: %bb.6(0x80000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C4]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.6:
 ; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C3]](s1), %bb.5, [[C2]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[C2]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.7
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.7:
 ; GFX10-NEXT: successors: %bb.8(0x40000000), %bb.9(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.6
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[ICMP]](s1), %bb.6
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI1]](s32), %bb.6
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
- ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.6
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI2]](s32), %bb.6
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_1]](s1)
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+ ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY11]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.8
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.8:
@@ -648,47 +755,75 @@ body: |
 ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C1]](s1), %bb.0, %14(s1), %bb.3
- ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI2]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %42, %bb.3
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %32(s1), %bb.3
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]]
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
 ; GFX10-NEXT: successors: %bb.3(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
+ ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.2, [[PHI2]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[PHI2]], %bb.1, [[DEF2]](s1), %bb.2
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]]
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[PHI3]]
+ ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY11]]
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1)
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1)
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
- ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI1]](s32), [[COPY]]
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C4]]
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI4]](s32), [[COPY]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.4
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[FREEZE]](s1), %bb.3
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1)
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C6]], [[C5]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[C6]], [[C5]]
 ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
 ; GFX10-NEXT: S_ENDPGM 0
 bb.0:
@@ -770,20 +905,39 @@ body: |
 ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.3
 ; GFX10-NEXT: {{ $}}
@@ -798,6 +952,7 @@ body: |
 ; GFX10-NEXT: successors: %bb.5(0x80000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1)
 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32)
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
@@ -805,9 +960,16 @@ body: |
 ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
 ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C7]]
 ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
- ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C8]]
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C8]]
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.5
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
@@ -817,21 +979,27 @@ body: |
 ; GFX10-NEXT: bb.5:
 ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.6
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.6:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
- ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1)
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
+ ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 bb.0:
 successors: %bb.1(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index c1f3924e466d5..fb1253c7a17d9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; REQUIRES: do-not-run-me
 
 ; Simples case, if - then, that requires lane mask merging,
 ; %phi lane mask will hold %val_A at %A. Lanes that are active in %B
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
index 9461d558684e8..0e6588b4593c9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -1,5 +1,9 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
+
+# Test is updated but copies between S1-register-with-reg-class and
+# register-with-reg-class-no-LLT fail machine verification
+# REQUIRES: do-not-run-me-with-machine-verifier
 
 ---
 name: divergent_i1_phi_if_then
@@ -18,6 +22,7 @@ body: |
 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -28,13 +33,18 @@ body: |
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C4]], [[C3]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
 ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
 ; GFX10-NEXT: S_ENDPGM 0
 bb.0:
@@ -85,6 +95,7 @@ body: |
 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C]]
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -93,7 +104,9 @@ body: |
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI %10(s1), %bb.3, [[DEF]](s1), %bb.0
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.3
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[COPY5]](s1)
 ; GFX10-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
@@ -102,6 +115,10 @@ body: |
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C1]]
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.4
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
@@ -109,14 +126,19 @@ body: |
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.1
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s1) = G_PHI [[PHI]](s1), %bb.1, [[ICMP1]](s1), %bb.2
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[COPY5]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI1]](s1), [[C3]], [[C4]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY9]](s1), [[C3]], [[C4]]
 ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
 ; GFX10-NEXT: S_ENDPGM 0
 bb.0:
@@ -183,20 +205,28 @@ body: |
 ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %35, %bb.3
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3
+ ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
@@ -210,23 +240,28 @@ body: |
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C5]]
 ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C5]]
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C5]]
 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
- ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C6]]
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C6]]
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.2, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), [[PHI1]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.4
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
 ; GFX10-NEXT: S_ENDPGM 0
 bb.0:
 successors: %bb.1(0x80000000)
@@ -308,20 +343,28 @@ body: |
 ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %48, %bb.3
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3
+ ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
@@ -329,6 +372,7 @@ body: |
 ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
@@ -341,10 +385,11 @@ body: |
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %34(s1), %bb.5, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.6
 ; GFX10-NEXT: {{ $}}
@@ -358,21 +403,30 @@ body: |
 ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD2]], [[C8]]
 ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1)
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C8]]
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C8]]
 ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
- ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C9]]
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C9]]
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.5:
 ; GFX10-NEXT: successors: %bb.3(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4, [[C4]](s1), %bb.2
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[COPY12]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.3
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.6:
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
 ; GFX10-NEXT: S_ENDPGM 0
 bb.0:
 successors: %bb.1(0x80000000)
@@ -481,20 +535,28 @@ body: |
 ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %61, %bb.3
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
+ ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
+ ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
+ ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
@@ -502,6 +564,7 @@ body: |
 ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
@@ -514,10 +577,11 @@ body: |
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %37(s1), %bb.5, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY12]](s1), [[PHI1]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.8
 ; GFX10-NEXT: {{ $}}
@@ -525,6 +589,7 @@ body: |
 ; GFX10-NEXT: successors: %bb.6(0x40000000), %bb.7(0x40000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
+ ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1)
 ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C8]](s32)
 ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV3]], [[SHL2]](s64)
@@ -537,9 +602,14 @@ body: |
 ; GFX10-NEXT: bb.5:
 ; GFX10-NEXT: successors: %bb.3(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s1) = G_PHI %47(s1), %bb.7, [[C4]](s1), %bb.2
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, %71(s1), %bb.7
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
+ ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
+ ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.3
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.6:
@@ -552,21 +622,30 @@ body: |
 ; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD3]], [[C11]]
 ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD3]](p1) :: (store (s32), addrspace 1)
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C11]]
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C11]]
 ; GFX10-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
- ; GFX10-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C12]]
+ ; GFX10-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C12]]
+ ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP3]](s1)
+ ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.7:
 ; GFX10-NEXT: successors: %bb.5(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
- ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s1) = G_PHI [[ICMP3]](s1), %bb.6, [[C7]](s1), %bb.4
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[C7]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6
+ ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
+ ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
+ ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[COPY17]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
+ ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
+ ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.5
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.8:
- ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
+ ; GFX10-NEXT: [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
 ; GFX10-NEXT:
S_ENDPGM 0 bb.0: successors: %bb.1(0x80000000) @@ -696,20 +775,39 @@ body: | ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5 + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]] ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32) + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1) + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1) + ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32) ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64) ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]] + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1) ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.3 ; GFX10-NEXT: {{ $}} @@ -724,6 +822,7 @@ body: | ; GFX10-NEXT: successors: %bb.5(0x80000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1) ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32) ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64) @@ -731,9 +830,16 @@ 
body: | ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]] ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1) - ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]] + ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C7]] ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100 - ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C8]] + ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C8]] + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: @@ -743,21 +849,27 @@ body: | ; GFX10-NEXT: bb.5: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1 - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1 + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]] + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]] + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) - ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32) + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5 - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5 - ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS 
intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32) - ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5 + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32) + ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 bb.0: successors: %bb.1(0x80000000) @@ -857,7 +969,11 @@ body: | ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1) ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY4]](s32), [[COPY1]] + ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: G_BR %bb.7 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: @@ -870,18 +986,27 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI %12(s1), %bb.6, [[DEF]](s1), %bb.7 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s1) = G_PHI %12(s1), %bb.6, %14(s1), %bb.7 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI %67(s1), %bb.6, %70, %bb.7 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI %49(s1), %bb.6, %48(s1), %bb.7 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI %35(s1), %bb.6, %34(s1), %bb.7 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]] + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY9]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32) - ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI1]](s1), %17(s32) + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), %17(s32) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1) ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.3(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %19(s32), %bb.3 - ; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI2]](s32) + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %19(s32), %bb.3 + ; GFX10-NEXT: 
[[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32) ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT1]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} @@ -890,18 +1015,28 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_CONVERGENT]](s32) ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]] + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C2]](s1) ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]] ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP2]], [[XOR]] ; GFX10-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %25(s32) + ; GFX10-NEXT: [[DEF4:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF5:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %63(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT2]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.5: - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[COPY3]], [[COPY2]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY14]](s1), [[COPY3]], [[COPY2]] ; GFX10-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32) ; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT3]](s32) ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0 @@ -911,17 +1046,42 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT1]](s32), %bb.3 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false + ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) + ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %42(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc + ; 
GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %56(s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc + ; GFX10-NEXT: [[DEF6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.7: ; GFX10-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI6]](s32), %bb.2, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[ICMP]](s1), %bb.0, [[PHI]](s1), %bb.2, [[C2]](s1), %bb.4 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.0, [[PHI7]], %bb.2, [[S_OR_B32_1]](s1), %bb.4 + ; GFX10-NEXT: [[PHI8:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, [[PHI1]], %bb.2, [[DEF5]](s1), %bb.4 + ; GFX10-NEXT: [[PHI9:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, [[PHI2]], %bb.2, [[DEF4]](s1), %bb.4 + ; GFX10-NEXT: [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]] + ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]] + ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]] + ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]] ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI8]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY20]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_5:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_5:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_5]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_6:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY19]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_6:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_6:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_6]](s1), [[S_AND_B32_6]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_6]](s1) + ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY17]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.1 bb.0: successors: %bb.7(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index 54881f59096c1..431da1de1fd48 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -1,5 
+1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s +; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s +; REQUIRES: do-not-run-me define void @temporal_divergent_i1_phi(float %val, ptr %addr) { ; GFX10-LABEL: temporal_divergent_i1_phi: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir index 9c2d083d0aa1d..64d740287d9fc 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir @@ -1,5 +1,9 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s +# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s + +# Test is updated but copies between S1-register-with-reg-class and +# register-with-reg-class-no-LLT fail machine verification +# REQUIRES: do-not-run-me-with-machine-verifier --- name: temporal_divergent_i1_phi @@ -17,30 +21,37 @@ body: | ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1) + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]] - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32) + ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]] + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]] - ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32) + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]] + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), 
[[FCMP]](s1), [[PHI1]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY3]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.1 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -97,30 +108,37 @@ body: | ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 + ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 + ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]] - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32) + ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]] + ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]] - ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32) + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]] + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 
[[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1 + ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -183,20 +201,31 @@ body: | ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF + ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF + ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.3(0x50000000), %bb.5(0x30000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF2]](s1), %bb.0, %53(s1), %bb.5 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %42, %bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5 + ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32) + ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1) + ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI3]](s32) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32) ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64) ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]] + ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) ; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.3 ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} @@ -218,8 +247,12 @@ body: | ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]] ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1) - ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]] - ; GFX10-NEXT: 
[[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[COPY2]] + ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C7]] + ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI3]](s32), [[COPY2]] + ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1) + ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc ; GFX10-NEXT: G_BR %bb.5 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: @@ -229,20 +262,25 @@ body: | ; GFX10-NEXT: bb.5: ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1 - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1 - ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32) + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.3 + ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1 + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1 + ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1) + ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]] + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI2]](s32) + ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc + ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc + ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5 - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5 - ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32) - ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5 + ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_2]](s1) + ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32) + ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY14]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 bb.0: successors: %bb.1(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll index d065f228a4e4a..b21e6a729dbc2 100644 --- 
a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
 
 define void @temporal_divergent_i32(float %val, ptr %addr) {
 ; GFX10-LABEL: temporal_divergent_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
index 4cc68029489e9..cd6a0f8297a5a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
 
 ---
 name: temporal_divergent_i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 25e2267fdee89..6384c47398fce 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -1,5 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
+; REQUIRES: do-not-run-me
 
 ; Make sure the branch targets are correct after lowering llvm.amdgcn.if
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
index c7b7a84e821f9..c7d45f062d0d2 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
@@ -422,7 +422,7 @@ body: |
     G_BR %bb.2
 
   bb.2:
-    %6:sgpr(s32) = PHI %0(s32), %bb.0, %5(s32), %bb.1
+    %6:sgpr(s32) = G_PHI %0(s32), %bb.0, %5(s32), %bb.1
     $sgpr0 = COPY %6(s32)
     S_SETPC_B64 undef $sgpr30_sgpr31
 
@@ -476,7 +476,7 @@ body: |
     G_BR %bb.2
 
   bb.2:
-    %6:vgpr(s32) = PHI %0(s32), %bb.0, %5(s32), %bb.1
+    %6:vgpr(s32) = G_PHI %0(s32), %bb.0, %5(s32), %bb.1
     $vgpr0 = COPY %6
     S_SETPC_B64 undef $sgpr30_sgpr31
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index a5482bd5b79a9..4caf83774bbba 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -5,6 +5,7 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX10_W64 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX11_W32 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX11_W64 %s
+; REQUIRES: do-not-run-me
 
 define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f32:

From 149ed9d2c58095f87745f8696ef96b5076c91fca Mon Sep 17 00:00:00 2001
From: Petar Avramovic
Date: Wed, 24 Jan 2024 12:00:35 +0100
Subject: [PATCH 768/843] AMDGPU: update GFX11 wmma hazards (#76143)

One V_NOP or one unrelated VALU instruction in between is required for
correctness when matrix A or B of the current WMMA instruction overlaps
with matrix D of the previous WMMA instruction. The remaining cases of
WMMA operand overlap are handled by the hardware and do not require
handling in the hazard recognizer.

Hardware may stall in cases where:
- matrix C of the current WMMA instruction overlaps with matrix D of the
  previous WMMA instruction
- a VALU instruction reads matrix D of the previous WMMA instruction
- matrix A, B, or C of a WMMA instruction reads the result of a previous
  VALU instruction
---
 .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 23 +--------
 llvm/test/CodeGen/AMDGPU/wmma-hazards.mir     | 48 +++++++++++++------
 2 files changed, 35 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index b6e4e65ff5b03..102ecb5e7680e 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -1726,8 +1726,8 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
     if (!SIInstrInfo::isWMMA(I))
       return false;
 
-    // Src0 or Src1 of the current wmma instruction overlaps with the dest of
-    // the previous wmma.
+    // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps
+    // with the dest(matrix D) of the previous wmma.
     const Register CurSrc0Reg =
         TII->getNamedOperand(*MI, AMDGPU::OpName::src0)->getReg();
     const Register CurSrc1Reg =
@@ -1741,25 +1741,6 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) {
       return true;
     }
 
-    // Src2 of the current wmma instruction overlaps with the dest of the
-    // previous wmma.
-    const MachineOperand *Src2 =
-        TII->getNamedOperand(*MI, AMDGPU::OpName::src2);
-    const Register CurSrc2Reg = Src2->isReg() ? Src2->getReg() : Register();
-
-    if (CurSrc2Reg != AMDGPU::NoRegister &&
-        TRI->regsOverlap(PrevDstReg, CurSrc2Reg)) {
-
-      const MachineOperand *Src2Mods =
-          TII->getNamedOperand(*MI, AMDGPU::OpName::src2_modifiers);
-      const bool NoSrc2Mods =
-          (Src2Mods->getImm() & (SISrcMods::NEG | SISrcMods::NEG_HI)) == 0;
-      // Exception: there is no hazard if the wmma instructions are of the same
-      // type and there is no input modifier on src2 of the current instruction.
-      return !(NoSrc2Mods && (TII->pseudoToMCOpcode(I.getOpcode()) ==
-                              TII->pseudoToMCOpcode(MI->getOpcode())));
-    }
-
     return false;
   };
 
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards.mir
index 40a58fd3ffa2a..8d8cef0b6c730 100644
--- a/llvm/test/CodeGen/AMDGPU/wmma-hazards.mir
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards.mir
@@ -7,7 +7,9 @@ body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
 
     ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_A
-    ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14,
$vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec ; GCN-NEXT: V_NOP_e32 implicit $exec ; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec @@ -35,7 +39,9 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 ; GCN-LABEL: name: valu_inbetween_WMMA1_D_overlaps_WMMA2_A - ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec ; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed 
$vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec @@ -50,7 +56,9 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 ; GCN-LABEL: name: value_inbetween_WMMA1_D_overlaps_WMMA2_B - ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec ; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: early-clobber renamable $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec @@ -65,8 +73,9 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C - ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec - ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; 
GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec @@ -79,7 +88,9 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 ; GCN-LABEL: name: valu_inbetween_WMMA1_D_overlaps_WMMA2_C - ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec ; GCN-NEXT: $vgpr40 = V_MOV_B32_e32 0, implicit $exec ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed 
$vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec @@ -94,7 +105,9 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_no_imod - ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec @@ -107,7 +120,9 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_real_instruction_no_imod - ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit 
$exec + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec ; GCN-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_WMMA_F32_16X16X16_F16_threeaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec @@ -120,8 +135,9 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_same_pseudo_instruction_imod - ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec - ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed 
$vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec @@ -135,8 +151,9 @@ body: | bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_no_imod - ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec - ; GCN-NEXT: V_NOP_e32 implicit $exec + ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GCN-NEXT: {{ $}} + ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec @@ -150,8 +167,9 @@ 
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
     ; GCN-LABEL: name: back_to_back_WMMA1_D_overlaps_WMMA2_C_diff_pseudo_instruction_imod
-    ; GCN: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
-    ; GCN-NEXT: V_NOP_e32 implicit $exec
+    ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39
+    ; GCN-NEXT: {{ $}}
+    ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
     ; GCN-NEXT: early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
     early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_twoaddr_w32 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 8, killed $vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, 8, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec
     early-clobber renamable $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_twoaddr_w32 8, killed $vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 11, killed $vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec

From 9dddb3d5f3bf323b7b7f8281bb848731f69fddfa Mon Sep 17 00:00:00 2001
From: Haojian Wu
Date: Wed, 24 Jan 2024 12:11:00 +0100
Subject: [PATCH 769/843] [AST] Mark the fallthrough coreturn statement
 implicit. (#77465)

This is a follow-up to #77311.
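For reference, a minimal sketch of the distinction this patch encodes in the AST. The `Task` coroutine type below is a stand-in assumed for illustration (it is not the one used by the lit test):

```cpp
#include <coroutine>

struct Task {
  struct promise_type {
    Task get_return_object() { return {}; }
    std::suspend_always initial_suspend() { return {}; }
    std::suspend_always final_suspend() noexcept { return {}; }
    void return_void() {}
    void unhandled_exception() {}
  };
};

// Control flows off the end of the coroutine body: because the promise
// declares return_void(), Sema synthesizes a trailing `co_return;`
// (CoroutineStmtBuilder::makeOnFallthrough), which is now built with
// IsImplicit = true.
Task falls_through() {
  co_await std::suspend_always{};
}

// The user spelled `co_return;` themselves, so this CoreturnStmt does not
// get the implicit bit.
Task written_return() {
  co_await std::suspend_always{};
  co_return;
}
```

With the change, an AST dump of `falls_through()` prints the synthesized statement as `CoreturnStmt ... implicit`, which is exactly what the updated CHECK lines in the test below match.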
---
 clang/lib/Sema/SemaCoroutine.cpp      | 2 +-
 clang/test/AST/ast-dump-coroutine.cpp | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/clang/lib/Sema/SemaCoroutine.cpp b/clang/lib/Sema/SemaCoroutine.cpp
index 4e600fd29ee73..a969b9383563b 100644
--- a/clang/lib/Sema/SemaCoroutine.cpp
+++ b/clang/lib/Sema/SemaCoroutine.cpp
@@ -1748,7 +1748,7 @@ bool CoroutineStmtBuilder::makeOnFallthrough() {
     return false;
   } else if (HasRVoid) {
     Fallthrough = S.BuildCoreturnStmt(FD.getLocation(), nullptr,
-                                      /*IsImplicit*/false);
+                                      /*IsImplicit=*/true);
     Fallthrough = S.ActOnFinishFullStmt(Fallthrough.get());
     if (Fallthrough.isInvalid())
       return false;
diff --git a/clang/test/AST/ast-dump-coroutine.cpp b/clang/test/AST/ast-dump-coroutine.cpp
index 5e7736300f9fe..8741c7b35b155 100644
--- a/clang/test/AST/ast-dump-coroutine.cpp
+++ b/clang/test/AST/ast-dump-coroutine.cpp
@@ -52,8 +52,7 @@ Task test() {
 // CHECK-NEXT: |-CXXMemberCallExpr {{.*}} 'std::suspend_always'
 // CHECK-NEXT: | | `-MemberExpr {{.*}} .initial_suspend
 // ...
-// FIXME: the CoreturnStmt should be marked as implicit
-// CHECK: CoreturnStmt {{.*}} {{$}}
+// CHECK: CoreturnStmt {{.*}} implicit

 Task test2() {
   // Written source code, verify no implicit bit for the co_return expr.
@@ -65,5 +64,4 @@ Task test2() {
 // CHECK: |-DeclStmt {{.*}}
 // CHECK-NEXT: | `-VarDecl {{.*}} implicit used __promise
 // ...
-// FIXME: the CoreturnStmt should be marked as implicit
-// CHECK: CoreturnStmt {{.*}} {{$}}
+// CHECK: CoreturnStmt {{.*}} implicit

From c46109d0d78863ff5e4e23c8f9fd85eb1220a42e Mon Sep 17 00:00:00 2001
From: Petar Avramovic
Date: Wed, 24 Jan 2024 12:18:34 +0100
Subject: [PATCH 770/843] Revert "AMDGPU/GlobalISelDivergenceLowering: select
 divergent i1 phis" (#79274)

Reverts llvm/llvm-project#78482

---
 .../llvm/CodeGen/MachineRegisterInfo.h        |  11 -
 .../llvm/CodeGen/MachineUniformityAnalysis.h  |  19 -
 llvm/lib/CodeGen/MachineRegisterInfo.cpp      |  11 -
 .../lib/CodeGen/MachineUniformityAnalysis.cpp |  19 +
 .../AMDGPUGlobalISelDivergenceLowering.cpp    | 145 +-------
 .../AMDGPU/AMDGPUInstructionSelector.cpp      |   5 +-
 llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp    |  30 +-
 llvm/lib/Target/AMDGPU/SILowerI1Copies.h      |  11 +-
 ...-divergent-i1-phis-no-lane-mask-merging.ll |   3 +-
 ...divergent-i1-phis-no-lane-mask-merging.mir |  76 ++--
 ...vergence-divergent-i1-used-outside-loop.ll |   3 +-
 ...ergence-divergent-i1-used-outside-loop.mir | 340 +++++-------------
 .../GlobalISel/divergence-structurizer.ll     |   3 +-
 .../GlobalISel/divergence-structurizer.mir    | 292 ++++-----------
 .../divergence-temporal-divergent-i1.ll       |   3 +-
 .../divergence-temporal-divergent-i1.mir      | 102 ++----
 .../divergence-temporal-divergent-reg.ll      |   2 +-
 .../divergence-temporal-divergent-reg.mir     |   2 +-
 .../GlobalISel/divergent-control-flow.ll      |   1 -
 .../AMDGPU/GlobalISel/inst-select-phi.mir     |   4 +-
 .../AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll |   1 -
 21 files changed, 259 insertions(+), 824 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
index 03ba952c354ae..9bca74a1d4fc8 100644
--- a/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
+++ b/llvm/include/llvm/CodeGen/MachineRegisterInfo.h
@@ -752,17 +752,6 @@ class MachineRegisterInfo {
   Register createVirtualRegister(const TargetRegisterClass *RegClass,
                                  StringRef Name = "");

-  /// All available attributes a virtual register can have.
- struct RegisterAttributes { - const RegClassOrRegBank *RCOrRB; - LLT Ty; - }; - - /// createVirtualRegister - Create and return a new virtual register in the - /// function with the specified register attributes. - Register createVirtualRegister(RegisterAttributes RegAttr, - StringRef Name = ""); - /// Create and return a new virtual register in the function with the same /// attributes as the given register. Register cloneVirtualRegister(Register VReg, StringRef Name = ""); diff --git a/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h index 1039ac4e5189b..e6da099751e7a 100644 --- a/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h +++ b/llvm/include/llvm/CodeGen/MachineUniformityAnalysis.h @@ -32,25 +32,6 @@ MachineUniformityInfo computeMachineUniformityInfo( MachineFunction &F, const MachineCycleInfo &cycleInfo, const MachineDomTree &domTree, bool HasBranchDivergence); -/// Legacy analysis pass which computes a \ref MachineUniformityInfo. -class MachineUniformityAnalysisPass : public MachineFunctionPass { - MachineUniformityInfo UI; - -public: - static char ID; - - MachineUniformityAnalysisPass(); - - MachineUniformityInfo &getUniformityInfo() { return UI; } - const MachineUniformityInfo &getUniformityInfo() const { return UI; } - - bool runOnMachineFunction(MachineFunction &F) override; - void getAnalysisUsage(AnalysisUsage &AU) const override; - void print(raw_ostream &OS, const Module *M = nullptr) const override; - - // TODO: verify analysis -}; - } // namespace llvm #endif // LLVM_CODEGEN_MACHINEUNIFORMITYANALYSIS_H diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp index d286128cc89be..087604af6a718 100644 --- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp +++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp @@ -167,17 +167,6 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass, return Reg; } -/// createVirtualRegister - Create and return a new virtual register in the -/// function with the specified register attributes. -Register MachineRegisterInfo::createVirtualRegister(RegisterAttributes RegAttr, - StringRef Name) { - Register Reg = createIncompleteVirtualRegister(Name); - VRegInfo[Reg].first = *RegAttr.RCOrRB; - setType(Reg, RegAttr.Ty); - noteNewVirtualRegister(Reg); - return Reg; -} - Register MachineRegisterInfo::cloneVirtualRegister(Register VReg, StringRef Name) { Register Reg = createIncompleteVirtualRegister(Name); diff --git a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp index 131138e0649e4..3e0fe2b1ba087 100644 --- a/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp +++ b/llvm/lib/CodeGen/MachineUniformityAnalysis.cpp @@ -165,6 +165,25 @@ MachineUniformityInfo llvm::computeMachineUniformityInfo( namespace { +/// Legacy analysis pass which computes a \ref MachineUniformityInfo. 
+class MachineUniformityAnalysisPass : public MachineFunctionPass { + MachineUniformityInfo UI; + +public: + static char ID; + + MachineUniformityAnalysisPass(); + + MachineUniformityInfo &getUniformityInfo() { return UI; } + const MachineUniformityInfo &getUniformityInfo() const { return UI; } + + bool runOnMachineFunction(MachineFunction &F) override; + void getAnalysisUsage(AnalysisUsage &AU) const override; + void print(raw_ostream &OS, const Module *M = nullptr) const override; + + // TODO: verify analysis +}; + class MachineUniformityInfoPrinterPass : public MachineFunctionPass { public: static char ID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp index 4f65a95de82ac..4cd8b1ec1051f 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelDivergenceLowering.cpp @@ -16,11 +16,7 @@ //===----------------------------------------------------------------------===// #include "AMDGPU.h" -#include "SILowerI1Copies.h" -#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineUniformityAnalysis.h" -#include "llvm/InitializePasses.h" #define DEBUG_TYPE "amdgpu-global-isel-divergence-lowering" @@ -46,146 +42,14 @@ class AMDGPUGlobalISelDivergenceLowering : public MachineFunctionPass { void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired(); - AU.addRequired(); - AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } }; -class DivergenceLoweringHelper : public PhiLoweringHelper { -public: - DivergenceLoweringHelper(MachineFunction *MF, MachineDominatorTree *DT, - MachinePostDominatorTree *PDT, - MachineUniformityInfo *MUI); - -private: - MachineUniformityInfo *MUI = nullptr; - MachineIRBuilder B; - Register buildRegCopyToLaneMask(Register Reg); - -public: - void markAsLaneMask(Register DstReg) const override; - void getCandidatesForLowering( - SmallVectorImpl &Vreg1Phis) const override; - void collectIncomingValuesFromPhi( - const MachineInstr *MI, - SmallVectorImpl &Incomings) const override; - void replaceDstReg(Register NewReg, Register OldReg, - MachineBasicBlock *MBB) override; - void buildMergeLaneMasks(MachineBasicBlock &MBB, - MachineBasicBlock::iterator I, const DebugLoc &DL, - Register DstReg, Register PrevReg, - Register CurReg) override; - void constrainAsLaneMask(Incoming &In) override; -}; - -DivergenceLoweringHelper::DivergenceLoweringHelper( - MachineFunction *MF, MachineDominatorTree *DT, - MachinePostDominatorTree *PDT, MachineUniformityInfo *MUI) - : PhiLoweringHelper(MF, DT, PDT), MUI(MUI), B(*MF) {} - -// _(s1) -> SReg_32/64(s1) -void DivergenceLoweringHelper::markAsLaneMask(Register DstReg) const { - assert(MRI->getType(DstReg) == LLT::scalar(1)); - - if (MRI->getRegClassOrNull(DstReg)) { - if (MRI->constrainRegClass(DstReg, ST->getBoolRC())) - return; - llvm_unreachable("Failed to constrain register class"); - } - - MRI->setRegClass(DstReg, ST->getBoolRC()); -} - -void DivergenceLoweringHelper::getCandidatesForLowering( - SmallVectorImpl &Vreg1Phis) const { - LLT S1 = LLT::scalar(1); - - // Add divergent i1 phis to the list - for (MachineBasicBlock &MBB : *MF) { - for (MachineInstr &MI : MBB.phis()) { - Register Dst = MI.getOperand(0).getReg(); - if (MRI->getType(Dst) == S1 && MUI->isDivergent(Dst)) - Vreg1Phis.push_back(&MI); - } - } -} - -void 
DivergenceLoweringHelper::collectIncomingValuesFromPhi( - const MachineInstr *MI, SmallVectorImpl &Incomings) const { - for (unsigned i = 1; i < MI->getNumOperands(); i += 2) { - Incomings.emplace_back(MI->getOperand(i).getReg(), - MI->getOperand(i + 1).getMBB(), Register()); - } -} - -void DivergenceLoweringHelper::replaceDstReg(Register NewReg, Register OldReg, - MachineBasicBlock *MBB) { - BuildMI(*MBB, MBB->getFirstNonPHI(), {}, TII->get(AMDGPU::COPY), OldReg) - .addReg(NewReg); -} - -// Copy Reg to new lane mask register, insert a copy after instruction that -// defines Reg while skipping phis if needed. -Register DivergenceLoweringHelper::buildRegCopyToLaneMask(Register Reg) { - Register LaneMask = createLaneMaskReg(MRI, LaneMaskRegAttrs); - MachineInstr *Instr = MRI->getVRegDef(Reg); - MachineBasicBlock *MBB = Instr->getParent(); - B.setInsertPt(*MBB, MBB->SkipPHIsAndLabels(std::next(Instr->getIterator()))); - B.buildCopy(LaneMask, Reg); - return LaneMask; -} - -// bb.previous -// %PrevReg = ... -// -// bb.current -// %CurReg = ... -// -// %DstReg - not defined -// -// -> (wave32 example, new registers have sreg_32 reg class and S1 LLT) -// -// bb.previous -// %PrevReg = ... -// %PrevRegCopy:sreg_32(s1) = COPY %PrevReg -// -// bb.current -// %CurReg = ... -// %CurRegCopy:sreg_32(s1) = COPY %CurReg -// ... -// %PrevMaskedReg:sreg_32(s1) = ANDN2 %PrevRegCopy, ExecReg - active lanes 0 -// %CurMaskedReg:sreg_32(s1) = AND %ExecReg, CurRegCopy - inactive lanes to 0 -// %DstReg:sreg_32(s1) = OR %PrevMaskedReg, CurMaskedReg -// -// DstReg = for active lanes rewrite bit in PrevReg with bit from CurReg -void DivergenceLoweringHelper::buildMergeLaneMasks( - MachineBasicBlock &MBB, MachineBasicBlock::iterator I, const DebugLoc &DL, - Register DstReg, Register PrevReg, Register CurReg) { - // DstReg = (PrevReg & !EXEC) | (CurReg & EXEC) - // TODO: check if inputs are constants or results of a compare. - - Register PrevRegCopy = buildRegCopyToLaneMask(PrevReg); - Register CurRegCopy = buildRegCopyToLaneMask(CurReg); - Register PrevMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); - Register CurMaskedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); - - B.setInsertPt(MBB, I); - B.buildInstr(AndN2Op, {PrevMaskedReg}, {PrevRegCopy, ExecReg}); - B.buildInstr(AndOp, {CurMaskedReg}, {ExecReg, CurRegCopy}); - B.buildInstr(OrOp, {DstReg}, {PrevMaskedReg, CurMaskedReg}); -} - -void DivergenceLoweringHelper::constrainAsLaneMask(Incoming &In) { return; } - } // End anonymous namespace. 
INITIALIZE_PASS_BEGIN(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, "AMDGPU GlobalISel divergence lowering", false, false) -INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachinePostDominatorTree) -INITIALIZE_PASS_DEPENDENCY(MachineUniformityAnalysisPass) INITIALIZE_PASS_END(AMDGPUGlobalISelDivergenceLowering, DEBUG_TYPE, "AMDGPU GlobalISel divergence lowering", false, false) @@ -200,12 +64,5 @@ FunctionPass *llvm::createAMDGPUGlobalISelDivergenceLoweringPass() { bool AMDGPUGlobalISelDivergenceLowering::runOnMachineFunction( MachineFunction &MF) { - MachineDominatorTree &DT = getAnalysis(); - MachinePostDominatorTree &PDT = getAnalysis(); - MachineUniformityInfo &MUI = - getAnalysis().getUniformityInfo(); - - DivergenceLoweringHelper Helper(&MF, &DT, &PDT, &MUI); - - return Helper.lowerPhis(); + return false; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 610cedd4ea6fc..fdee74d58d269 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -210,7 +210,6 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const { bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const { const Register DefReg = I.getOperand(0).getReg(); const LLT DefTy = MRI->getType(DefReg); - if (DefTy == LLT::scalar(1)) { if (!AllowRiskySelect) { LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n"); @@ -3553,6 +3552,8 @@ bool AMDGPUInstructionSelector::selectStackRestore(MachineInstr &MI) const { } bool AMDGPUInstructionSelector::select(MachineInstr &I) { + if (I.isPHI()) + return selectPHI(I); if (!I.isPreISelOpcode()) { if (I.isCopy()) @@ -3695,8 +3696,6 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) { return selectWaveAddress(I); case AMDGPU::G_STACKRESTORE: return selectStackRestore(I); - case AMDGPU::G_PHI: - return selectPHI(I); default: return selectImpl(I, *CoverageInfo); } diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp index 59843438950ac..cfa0c21def791 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp @@ -31,9 +31,9 @@ using namespace llvm; -static Register -insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI, - MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs); +static Register insertUndefLaneMask(MachineBasicBlock *MBB, + MachineRegisterInfo *MRI, + Register LaneMaskRegAttrs); namespace { @@ -78,7 +78,7 @@ class Vreg1LoweringHelper : public PhiLoweringHelper { MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg) override; - void constrainAsLaneMask(Incoming &In) override; + void constrainIncomingRegisterTakenAsIs(Incoming &In) override; bool lowerCopiesFromI1(); bool lowerCopiesToI1(); @@ -304,8 +304,7 @@ class LoopFinder { /// blocks, so that the SSA updater doesn't have to search all the way to the /// function entry. 
void addLoopEntries(unsigned LoopLevel, MachineSSAUpdater &SSAUpdater, - MachineRegisterInfo &MRI, - MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs, + MachineRegisterInfo &MRI, Register LaneMaskRegAttrs, ArrayRef Incomings = {}) { assert(LoopLevel < CommonDominators.size()); @@ -412,15 +411,14 @@ FunctionPass *llvm::createSILowerI1CopiesPass() { return new SILowerI1Copies(); } -Register llvm::createLaneMaskReg( - MachineRegisterInfo *MRI, - MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs) { - return MRI->createVirtualRegister(LaneMaskRegAttrs); +Register llvm::createLaneMaskReg(MachineRegisterInfo *MRI, + Register LaneMaskRegAttrs) { + return MRI->cloneVirtualRegister(LaneMaskRegAttrs); } -static Register -insertUndefLaneMask(MachineBasicBlock *MBB, MachineRegisterInfo *MRI, - MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs) { +static Register insertUndefLaneMask(MachineBasicBlock *MBB, + MachineRegisterInfo *MRI, + Register LaneMaskRegAttrs) { MachineFunction &MF = *MBB->getParent(); const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); @@ -621,7 +619,7 @@ bool PhiLoweringHelper::lowerPhis() { for (auto &Incoming : Incomings) { MachineBasicBlock &IMBB = *Incoming.Block; if (PIA.isSource(IMBB)) { - constrainAsLaneMask(Incoming); + constrainIncomingRegisterTakenAsIs(Incoming); SSAUpdater.AddAvailableValue(&IMBB, Incoming.Reg); } else { Incoming.UpdatedReg = createLaneMaskReg(MRI, LaneMaskRegAttrs); @@ -913,4 +911,6 @@ void Vreg1LoweringHelper::buildMergeLaneMasks(MachineBasicBlock &MBB, } } -void Vreg1LoweringHelper::constrainAsLaneMask(Incoming &In) {} +void Vreg1LoweringHelper::constrainIncomingRegisterTakenAsIs(Incoming &In) { + return; +} diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h index 80ea0536b4cbf..5099d39c2d141 100644 --- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.h +++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.h @@ -31,9 +31,7 @@ struct Incoming { : Reg(Reg), Block(Block), UpdatedReg(UpdatedReg) {} }; -Register -createLaneMaskReg(MachineRegisterInfo *MRI, - MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs); +Register createLaneMaskReg(MachineRegisterInfo *MRI, Register LaneMaskRegAttrs); class PhiLoweringHelper { public: @@ -49,7 +47,7 @@ class PhiLoweringHelper { MachineRegisterInfo *MRI = nullptr; const GCNSubtarget *ST = nullptr; const SIInstrInfo *TII = nullptr; - MachineRegisterInfo::RegisterAttributes LaneMaskRegAttrs; + Register LaneMaskRegAttrs; #ifndef NDEBUG DenseSet PhiRegisters; @@ -70,8 +68,7 @@ class PhiLoweringHelper { getSaluInsertionAtEnd(MachineBasicBlock &MBB) const; void initializeLaneMaskRegisterAttributes(Register LaneMask) { - LaneMaskRegAttrs.RCOrRB = &MRI->getRegClassOrRegBank(LaneMask); - LaneMaskRegAttrs.Ty = MRI->getType(LaneMask); + LaneMaskRegAttrs = LaneMask; } bool isLaneMaskReg(Register Reg) const { @@ -94,7 +91,7 @@ class PhiLoweringHelper { MachineBasicBlock::iterator I, const DebugLoc &DL, Register DstReg, Register PrevReg, Register CurReg) = 0; - virtual void constrainAsLaneMask(Incoming &In) = 0; + virtual void constrainIncomingRegisterTakenAsIs(Incoming &In) = 0; }; } // end namespace llvm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll index 06a8f80e6aa34..7a68aec1a1c55 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; REQUIRES: do-not-run-me +; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; Divergent phis that don't require lowering using lane mask merging diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir index 55f22b0bbb4df..d314ebe355f51 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-phis-no-lane-mask-merging.mir @@ -1,9 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s - -# Test is updated but copies between S1-register-with-reg-class and -# register-with-reg-class-no-LLT fail machine verification -# REQUIRES: do-not-run-me-with-machine-verifier +# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s --- | define void @divergent_i1_phi_uniform_branch() {ret void} @@ -50,7 +46,7 @@ body: | ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.4(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0 + ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI %14(s1), %bb.3, [[ICMP]](s1), %bb.0 ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: @@ -130,7 +126,6 @@ body: | ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $sgpr0 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]] - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C1]] ; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.2 @@ -141,17 +136,12 @@ body: | ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]] - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: 
[[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C4]], [[C3]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1) ; GFX10-NEXT: S_ENDPGM 0 bb.0: @@ -201,37 +191,30 @@ body: | ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 - ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 - ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]] - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) + ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]] + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32) ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]] - ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32) - ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]] + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32) ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1 - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -296,28 +279,25 @@ body: | ; GFX10-NEXT: 
[[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]] - ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %39(s1), %bb.5 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5 - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] + ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.5, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.5 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %19(s1), %bb.5 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1000 - ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI2]](s32), [[C3]] + ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sle), [[PHI1]](s32), [[C3]] ; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.4 ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C2]](s1), %bb.1 + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %24(s1), %bb.4, [[C2]](s1), %bb.1 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI4]], [[C4]] + ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C4]] ; GFX10-NEXT: G_BRCOND [[XOR]](s1), %bb.5 ; GFX10-NEXT: G_BR %bb.3 ; GFX10-NEXT: {{ $}} @@ -340,26 +320,22 @@ body: | ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C8]] - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[XOR1]](s1) - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32) + ; GFX10-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C8]] + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32) ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C9]] - ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32) - ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C9]] + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32) ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.6: + ; GFX10-NEXT: 
[[PHI4:%[0-9]+]]:_(s1) = G_PHI [[XOR1]](s1), %bb.5 ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5 - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32) ; GFX10-NEXT: [[C10:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY10]](s1), [[C11]], [[C10]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C11]], [[C10]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -492,7 +468,7 @@ body: | ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000) ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %30(s32), %bb.4, [[DEF]](s32), %bb.0 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s1) = G_PHI %32(s1), %bb.4, [[C5]](s1), %bb.0 ; GFX10-NEXT: G_BRCOND [[PHI1]](s1), %bb.5 ; GFX10-NEXT: G_BR %bb.6 ; GFX10-NEXT: {{ $}} diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll index 808c2c2ded201..34dedfe10365f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.ll @@ -1,6 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s -; REQUIRES: do-not-run-me +; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s ; This file contains various tests that have divergent i1s used outside of ; the loop. 
These are lane masks is sgpr and need to have correct value in diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir index d84ccece81c78..92463714ec694 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-divergent-i1-used-outside-loop.mir @@ -1,9 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s - -# Test is updated but copies between S1-register-with-reg-class and -# register-with-reg-class-no-LLT fail machine verification -# REQUIRES: do-not-run-me-with-machine-verifier +# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s --- name: divergent_i1_phi_used_outside_loop @@ -23,49 +19,30 @@ body: | ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]] - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[FCMP]](s1) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]] - ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc - ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %36(s1), %bb.1 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.0, %24(s1), %bb.1 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1) - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]] + ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY7]], [[C2]] - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1) - ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI3]](s32) + ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]] + ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32) ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]] ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C3]] - ; GFX10-NEXT: 
[[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI2]](s32) - ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]] + ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32) ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.1 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1 - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_2]](s1) ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00 - ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY11]](s1), [[C5]], [[C4]] + ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]] ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32)) ; GFX10-NEXT: SI_RETURN bb.0: @@ -126,67 +103,42 @@ body: | ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32) ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 -1 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1) - ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32_xm0_xexec = IMPLICIT_DEF - ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[DEF]] - ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc - ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.1: ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %41, %bb.3 - ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[S_OR_B32_]](s1), %bb.0, %27(s1), %bb.3 - ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.3 - ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(p1) = G_PHI [[MV]](p1), %bb.0, %11(p1), %bb.3 - ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]] - ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]] - ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1) - ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI1]] - 
; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc - ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1) - ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %9(s32), %bb.3 + ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(p1) = G_PHI [[MV]](p1), %bb.0, %11(p1), %bb.3 + ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C1]](s1), %bb.0, %13(s1), %bb.3 + ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI2]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.2: ; GFX10-NEXT: successors: %bb.3(0x80000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PHI3]](p1) :: (load (s32), addrspace 1) + ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PHI1]](p1) :: (load (s32), addrspace 1) ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C2]] - ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.3: ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.2 - ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]] - ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY12]](s1) + ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.2, [[PHI2]](s1), %bb.1 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32) ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 4 - ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[PHI3]], [[C3]](s64) + ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[PHI1]], [[C3]](s64) ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1 - ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI2]], [[C4]] + ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = nsw G_ADD [[PHI]], [[C4]] ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 10 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sge), [[ADD]](s32), [[C5]] - ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc ; GFX10-NEXT: G_BRCOND [[ICMP1]](s1), %bb.1 ; GFX10-NEXT: G_BR %bb.4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.4: - ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1) + ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.3 ; GFX10-NEXT: 
[[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
 ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY14]](s1), [[C7]], [[C6]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C7]], [[C6]]
 ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
 ; GFX10-NEXT: SI_RETURN
 bb.0:
@@ -259,37 +211,30 @@ body: |
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
 ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[COPY1]](s32), [[C1]]
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %24(s1), %bb.1
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.1, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.1
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[FCMP]](s1), %bb.0, %13(s1), %bb.1
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
- ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+ ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
 ; GFX10-NEXT: [[FCMP1:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI1]](s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP1]](s1), [[PHI]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C5]], [[C4]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
 ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
 ; GFX10-NEXT: SI_RETURN
 bb.0:
@@ -353,7 +298,6 @@ body: |
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[C]]
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C1]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.1
 ; GFX10-NEXT: {{ $}}
@@ -361,49 +305,29 @@ body: |
 ; GFX10-NEXT: successors: %bb.3(0x80000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
- ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: G_BR %bb.3
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
 ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %39(s1), %bb.8
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI %14(s1), %bb.8, [[C1]](s1), %bb.0
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY6]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.5
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.1, %72(s1), %bb.7
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.1, %61, %bb.7
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.1, %48, %bb.7
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C2]](s32), %bb.1, %17(s32), %bb.7
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %19(s32), %bb.7, [[C2]](s32), %bb.1
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
- ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI5]](s32)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C4]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C5]]
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
- ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
- ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
 ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.4
 ; GFX10-NEXT: {{ $}}
@@ -411,17 +335,9 @@ body: |
 ; GFX10-NEXT: successors: %bb.7(0x80000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
- ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[C6]](s1)
 ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI5]], [[C7]]
- ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI5]](s32), [[COPY]]
- ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C7]]
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI2]](s32), [[COPY]]
 ; GFX10-NEXT: G_BR %bb.7
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.5:
@@ -437,32 +353,22 @@ body: |
 ; GFX10-NEXT: bb.7:
 ; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.3(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.3, [[S_OR_B32_3]](s1), %bb.4
- ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.3, [[S_OR_B32_2]](s1), %bb.4
- ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
- ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
- ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.4, [[DEF]](s32), %bb.3
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[C6]](s1), %bb.4, [[C3]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4, [[C3]](s1), %bb.3
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
 ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[COPY17]], [[C9]]
- ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY16]](s1), [[PHI4]](s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI4]], [[C9]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI5]](s1), [[PHI1]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.8
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.8:
 ; GFX10-NEXT: successors: %bb.2(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.7
- ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_4]](s1)
- ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY19]](s1)
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY20]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.7
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.7
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
 ; GFX10-NEXT: G_BR %bb.2
 bb.0:
 successors: %bb.1(0x40000000), %bb.2(0x40000000)
@@ -573,40 +479,35 @@ body: |
 ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF1]](s1), %bb.0, %38(s1), %bb.6
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %11(s32), %bb.6, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %13(s32), %bb.6
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
 ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.4(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[PHI2]]
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[ICMP]](s1)
+ ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY]](s32), [[PHI1]]
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.3
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.4(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C1]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
- ; GFX10-NEXT: G_STORE [[PHI2]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1)
+ ; GFX10-NEXT: G_STORE [[PHI1]](s32), [[PTR_ADD]](p1) :: (store (s32), addrspace 1)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
 ; GFX10-NEXT: successors: %bb.5(0x40000000), %bb.6(0x40000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[C2]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI2]]
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY1]](s32), [[PHI1]]
 ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.6, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.5
 ; GFX10-NEXT: {{ $}}
@@ -614,35 +515,27 @@ body: |
 ; GFX10-NEXT: successors: %bb.6(0x80000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C4]]
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.6:
 ; GFX10-NEXT: successors: %bb.7(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[C2]](s1), %bb.4, [[S_OR_B32_]](s1), %bb.5
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
- ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD]](s32), %bb.5, [[DEF]](s32), %bb.4
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C3]](s1), %bb.5, [[C2]](s1), %bb.4
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.7
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.7:
 ; GFX10-NEXT: successors: %bb.8(0x40000000), %bb.9(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.6
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI2]](s32), %bb.6
- ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_1]](s1)
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
- ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY11]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.6
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[ICMP]](s1), %bb.6
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[PHI1]](s32), %bb.6
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
+ ; GFX10-NEXT: [[SI_IF2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.9, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.8
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.8:
@@ -755,75 +648,47 @@ body: |
 ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY3]](s32), [[COPY4]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %53(s1), %bb.3
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %42, %bb.3
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[C1]](s1), %bb.0, %32(s1), %bb.3
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI2]]
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[COPY7]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
- ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY7]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %10(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %12(s32), %bb.3
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[C1]](s1), %bb.0, %14(s1), %bb.3
+ ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI2]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
 ; GFX10-NEXT: successors: %bb.3(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[LOAD]](s32), [[C3]]
- ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
- ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[PHI2]], %bb.1, [[DEF2]](s1), %bb.2
- ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
- ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]]
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.2, [[PHI2]](s1), %bb.1
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[COPY11]]
- ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[FREEZE]](s1)
- ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[FREEZE]](s1)
+ ; GFX10-NEXT: [[FREEZE:%[0-9]+]]:_(s1) = G_FREEZE [[PHI3]]
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C4]]
- ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI4]](s32), [[COPY]]
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY5]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C4]]
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[PHI1]](s32), [[COPY]]
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.4
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
- ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
- ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_3]](s1)
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[FREEZE]](s1), %bb.3
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY15]](s1), [[C6]], [[C5]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI4]](s1), [[C6]], [[C5]]
 ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV1]](p0) :: (store (s32))
 ; GFX10-NEXT: S_ENDPGM 0
 bb.0:
@@ -905,39 +770,20 @@ body: |
 ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
- ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
- ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
- ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.3
 ; GFX10-NEXT: {{ $}}
@@ -952,7 +798,6 @@ body: |
 ; GFX10-NEXT: successors: %bb.5(0x80000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
- ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1)
 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32)
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
@@ -960,16 +805,9 @@ body: |
 ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
 ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C7]]
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
 ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
- ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C8]]
- ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C8]]
 ; GFX10-NEXT: G_BR %bb.5
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
@@ -979,27 +817,21 @@ body: |
 ; GFX10-NEXT: bb.5:
 ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
- ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
- ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
- ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1)
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.6
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.6:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
- ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1)
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
- ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+ ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 bb.0:
 successors: %bb.1(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
index fb1253c7a17d9..c1f3924e466d5 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; REQUIRES: do-not-run-me
+; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -amdgpu-global-isel-risky-select -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 
 ; Simples case, if - then, that requires lane mask merging,
 ; %phi lane mask will hold %val_A at %A. Lanes that are active in %B
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
index 0e6588b4593c9..9461d558684e8 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-structurizer.mir
@@ -1,9 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
-
-# Test is updated but copies between S1-register-with-reg-class and
-# register-with-reg-class-no-LLT fail machine verification
-# REQUIRES: do-not-run-me-with-machine-verifier
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
 
 ---
 name: divergent_i1_phi_if_then
@@ -22,7 +18,6 @@ body: |
 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 6
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C]]
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP]](s1)
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(eq), [[COPY3]](s32), [[C1]]
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP1]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -33,18 +28,13 @@ body: |
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.2:
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.1
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI [[ICMP]](s1), %bb.0, [[ICMP2]](s1), %bb.1
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY6]](s1), [[C4]], [[C3]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI]](s1), [[C4]], [[C3]]
 ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
 ; GFX10-NEXT: S_ENDPGM 0
 bb.0:
@@ -95,7 +85,6 @@ body: |
 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
 ; GFX10-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $vgpr3
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[COPY3]](s32), [[C]]
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
@@ -104,9 +93,7 @@ body: |
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %19(s1), %bb.3
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[COPY5]](s1)
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI %10(s1), %bb.3, [[DEF]](s1), %bb.0
 ; GFX10-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_ELSE [[SI_IF]](s32), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
@@ -115,10 +102,6 @@ body: |
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(uge), [[COPY2]](s32), [[C1]]
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.4
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
@@ -126,19 +109,14 @@ body: |
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[COPY2]](s32), [[C2]]
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY8]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.1
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[COPY5]](s1), %bb.1, [[S_OR_B32_]](s1), %bb.2
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s1) = G_PHI [[PHI]](s1), %bb.1, [[ICMP1]](s1), %bb.2
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_ELSE]](s32)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY9]](s1), [[C3]], [[C4]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI1]](s1), [[C3]], [[C4]]
 ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p1) :: (store (s32), addrspace 1)
 ; GFX10-NEXT: S_ENDPGM 0
 bb.0:
@@ -205,28 +183,20 @@ body: |
 ; GFX10-NEXT: [[MV1:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY2]](s32), [[COPY3]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %35, %bb.3
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3
- ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %9(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %11(s32), %bb.3
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY5]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
@@ -240,28 +210,23 @@ body: |
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C5]]
 ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C5]]
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C5]]
 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
- ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C6]]
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C6]]
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.4(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.2
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.2, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.2, [[C1]](s1), %bb.1
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), [[PHI1]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.4
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
 ; GFX10-NEXT: S_ENDPGM 0
 bb.0:
 successors: %bb.1(0x80000000)
@@ -343,28 +308,20 @@ body: |
 ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %48, %bb.3
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.3
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY7]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
@@ -372,7 +329,6 @@ body: |
 ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
@@ -385,11 +341,10 @@ body: |
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %47(s1), %bb.5
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %32(s32), %bb.5, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %34(s1), %bb.5, [[C1]](s1), %bb.1
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY10]](s1), [[PHI1]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.6
 ; GFX10-NEXT: {{ $}}
@@ -403,30 +358,21 @@ body: |
 ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD2]], [[C8]]
 ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD2]](p1) :: (store (s32), addrspace 1)
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C8]]
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C8]]
 ; GFX10-NEXT: [[C9:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
- ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C9]]
- ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY9]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C9]]
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.5:
 ; GFX10-NEXT: successors: %bb.3(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, [[S_OR_B32_1]](s1), %bb.4
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
- ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
- ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[COPY12]](s1)
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.4, [[DEF]](s32), %bb.2
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4, [[C4]](s1), %bb.2
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.3
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.6:
- ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
 ; GFX10-NEXT: S_ENDPGM 0
 bb.0:
 successors: %bb.1(0x80000000)
@@ -535,28 +481,20 @@ body: |
 ; GFX10-NEXT: [[MV3:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY6]](s32), [[COPY7]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.3(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %61, %bb.3
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %15(s32), %bb.3, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %17(s32), %bb.3
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI2]](s32)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
- ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 ; GFX10-NEXT: {{ $}}
@@ -564,7 +502,6 @@ body: |
 ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.5(0x40000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1)
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C5]](s32)
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV2]], [[SHL1]](s64)
@@ -577,11 +514,10 @@ body: |
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.8(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, %60(s1), %bb.5
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]]
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %35(s32), %bb.5, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI %37(s1), %bb.5, [[C1]](s1), %bb.1
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY12]](s1), [[PHI1]](s32)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI3]](s1), [[PHI]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.8
 ; GFX10-NEXT: {{ $}}
@@ -589,7 +525,6 @@ body: |
 ; GFX10-NEXT: successors: %bb.6(0x40000000), %bb.7(0x40000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C7]](s1)
 ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL2:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C8]](s32)
 ; GFX10-NEXT: [[PTR_ADD2:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV3]], [[SHL2]](s64)
@@ -602,14 +537,9 @@ body: |
 ; GFX10-NEXT: bb.5:
 ; GFX10-NEXT: successors: %bb.3(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[C4]](s1), %bb.2, %71(s1), %bb.7
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
- ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
- ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[COPY14]](s1)
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI %46(s32), %bb.7, [[DEF]](s32), %bb.2
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s1) = G_PHI %47(s1), %bb.7, [[C4]](s1), %bb.2
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF1]](s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.3
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.6:
@@ -622,30 +552,21 @@ body: |
 ; GFX10-NEXT: [[C11:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD3]], [[C11]]
 ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD3]](p1) :: (store (s32), addrspace 1)
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C11]]
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C11]]
 ; GFX10-NEXT: [[C12:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
- ; GFX10-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI2]](s32), [[C12]]
- ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP3]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY13]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C12]]
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.7:
 ; GFX10-NEXT: successors: %bb.5(0x80000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[C7]](s1), %bb.4, [[S_OR_B32_2]](s1), %bb.6
- ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
- ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]]
- ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[COPY17]](s1)
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.6, [[DEF]](s32), %bb.4
+ ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s1) = G_PHI [[ICMP3]](s1), %bb.6, [[C7]](s1), %bb.4
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF2]](s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY18]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
 ; GFX10-NEXT: G_BR %bb.5
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.8:
- ; GFX10-NEXT: [[PHI9:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI9]](s32)
+ ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.3
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
 ; GFX10-NEXT: S_ENDPGM 0
 bb.0:
 successors: %bb.1(0x80000000)
@@ -775,39 +696,20 @@ body: |
 ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY4]](s32), [[COPY5]](s32)
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
 ; GFX10-NEXT: successors: %bb.3(0x40000000), %bb.5(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF3]](s1), %bb.0, %67(s1), %bb.5
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, %56, %bb.5
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %43, %bb.5
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
- ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %12(s32), %bb.5, [[C]](s32), %bb.0
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %14(s32), %bb.5
 ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
- ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
- ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI4]](s32)
+ ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
 ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
 ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
- ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
- ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
 ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[ICMP]](s1), %bb.5, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.3
 ; GFX10-NEXT: {{ $}}
@@ -822,7 +724,6 @@ body: |
 ; GFX10-NEXT: successors: %bb.5(0x80000000)
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
- ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[C5]](s1)
 ; GFX10-NEXT: [[C6:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
 ; GFX10-NEXT: [[SHL1:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C6]](s32)
 ; GFX10-NEXT: [[PTR_ADD1:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV]], [[SHL1]](s64)
@@ -830,16 +731,9 @@ body: |
 ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
 ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
 ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
- ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI4]], [[C7]]
+ ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
 ; GFX10-NEXT: [[C8:%[0-9]+]]:_(s32) = G_CONSTANT i32 100
- ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI4]](s32), [[C8]]
- ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
- ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY11]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY13]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY12]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY14]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[C8]]
 ; GFX10-NEXT: G_BR %bb.5
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.4:
@@ -849,27 +743,21 @@ body: |
 ; GFX10-NEXT: bb.5:
 ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_1]](s1), %bb.1, [[S_OR_B32_3]](s1), %bb.3
- ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_2]](s1), %bb.3
- ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
- ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[PHI5]]
- ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[PHI6]]
- ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY16]](s1)
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
+ ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[SI_IF]](s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY15]](s1), [[PHI3]](s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY6]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY17]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.6
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.6:
 ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI8:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
- ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_4]](s1)
- ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI8]](s32)
- ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY18]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+ ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
+ ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+ ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+ ; GFX10-NEXT: [[SI_IF1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.2
 bb.0:
 successors: %bb.1(0x80000000)
@@ -969,11 +857,7 @@ body: |
 ; GFX10-NEXT: [[COPY5:%[0-9]+]]:_(s32) = COPY $vgpr5
 ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
 ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s1) = G_IMPLICIT_DEF
- ; GFX10-NEXT: [[COPY6:%[0-9]+]]:sreg_32(s1) = COPY [[DEF]](s1)
 ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY4]](s32), [[COPY1]]
- ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF3:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
 ; GFX10-NEXT: G_BR %bb.7
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.1:
@@ -986,27 +870,18 @@ body: |
 ; GFX10-NEXT: bb.2:
 ; GFX10-NEXT: successors: %bb.4(0x40000000), %bb.7(0x40000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI %67(s1), %bb.6, %70, %bb.7
- ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI %49(s1), %bb.6, %48(s1), %bb.7
- ; GFX10-NEXT: [[PHI2:%[0-9]+]]:sreg_32 = PHI %35(s1), %bb.6, %34(s1), %bb.7
- ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
- ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
- ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[PHI2]]
- ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[COPY9]](s1)
+ ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s1) = G_PHI %12(s1), %bb.6, [[DEF]](s1), %bb.7
+ ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s1) = G_PHI %12(s1), %bb.6, %14(s1), %bb.7
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), %15(s32)
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY8]](s1), %17(s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY10]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
- ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_]](s1)
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI1]](s1), %17(s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.4
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.3:
 ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.3(0x7c000000)
 ; GFX10-NEXT: {{ $}}
- ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %19(s32), %bb.3
- ; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI3]](s32)
+ ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.1, %19(s32), %bb.3
+ ; GFX10-NEXT: [[INTRINSIC_CONVERGENT1:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[ICMP1]](s1), [[PHI2]](s32)
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT1]](s32), %bb.3, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.6
 ; GFX10-NEXT: {{ $}}
@@ -1015,28 +890,18 @@ body: |
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[INTRINSIC_CONVERGENT]](s32)
 ; GFX10-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY5]](s32), [[COPY]]
- ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP2]](s1)
 ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
- ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[C2]](s1)
 ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP]], [[C2]]
 ; GFX10-NEXT: [[OR:%[0-9]+]]:_(s1) = G_OR [[ICMP2]], [[XOR]]
 ; GFX10-NEXT: [[INTRINSIC_CONVERGENT2:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[OR]](s1), %25(s32)
- ; GFX10-NEXT: [[DEF4:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[DEF5:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
- ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %63(s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
 ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT2]](s32), %bb.7, implicit-def $exec, implicit-def $scc, implicit $exec
 ; GFX10-NEXT: G_BR %bb.5
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: bb.5:
+ ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[ICMP2]](s1), %bb.4
 ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4
- ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_1]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
- ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY14]](s1), [[COPY3]], [[COPY2]]
+ ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[COPY3]], [[COPY2]]
 ; GFX10-NEXT: [[INTRINSIC_CONVERGENT3:%[0-9]+]]:_(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane), [[SELECT]](s32)
 ; GFX10-NEXT: $sgpr0 = COPY [[INTRINSIC_CONVERGENT3]](s32)
 ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $sgpr0
@@ -1046,42 +911,17 @@ body: |
 ; GFX10-NEXT: {{ $}}
 ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT1]](s32), %bb.3
 ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s1) = G_CONSTANT i1 false
- ; GFX10-NEXT: [[COPY15:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
- ; GFX10-NEXT: [[COPY16:%[0-9]+]]:sreg_32(s1) = COPY [[C3]](s1)
 ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI5]](s32)
- ; GFX10-NEXT: [[S_ANDN2_B32_3:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %42(s1), $exec_lo, implicit-def $scc
- ; GFX10-NEXT: [[S_AND_B32_3:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY16]](s1), implicit-def $scc
- ; GFX10-NEXT: [[S_OR_B32_3:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_3]](s1), [[S_AND_B32_3]](s1), implicit-def $scc
- ;
GFX10-NEXT: [[S_ANDN2_B32_4:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 %56(s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_4:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY15]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_4:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_4]](s1), [[S_AND_B32_4]](s1), implicit-def $scc - ; GFX10-NEXT: [[DEF6:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF ; GFX10-NEXT: G_BR %bb.2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: bb.7: ; GFX10-NEXT: successors: %bb.1(0x40000000), %bb.2(0x40000000) ; GFX10-NEXT: {{ $}} - ; GFX10-NEXT: [[PHI6:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[ICMP]](s1), %bb.0, [[S_OR_B32_]](s1), %bb.2, [[S_OR_B32_2]](s1), %bb.4 - ; GFX10-NEXT: [[PHI7:%[0-9]+]]:sreg_32 = PHI [[DEF3]](s1), %bb.0, [[PHI7]], %bb.2, [[S_OR_B32_1]](s1), %bb.4 - ; GFX10-NEXT: [[PHI8:%[0-9]+]]:sreg_32 = PHI [[DEF2]](s1), %bb.0, [[PHI1]], %bb.2, [[DEF5]](s1), %bb.4 - ; GFX10-NEXT: [[PHI9:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, [[PHI2]], %bb.2, [[DEF4]](s1), %bb.4 - ; GFX10-NEXT: [[PHI10:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI10]](s32), %bb.2, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[PHI11:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0 - ; GFX10-NEXT: [[COPY17:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]] - ; GFX10-NEXT: [[COPY18:%[0-9]+]]:sreg_32(s1) = COPY [[PHI7]] - ; GFX10-NEXT: [[COPY19:%[0-9]+]]:sreg_32(s1) = COPY [[PHI8]] - ; GFX10-NEXT: [[COPY20:%[0-9]+]]:sreg_32(s1) = COPY [[PHI9]] + ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT2]](s32), %bb.4, [[PHI6]](s32), %bb.2, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.4, [[INTRINSIC_CONVERGENT]](s32), %bb.2, [[C]](s32), %bb.0 + ; GFX10-NEXT: [[PHI8:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[ICMP]](s1), %bb.0, [[PHI]](s1), %bb.2, [[C2]](s1), %bb.4 ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s1) = G_CONSTANT i1 true - ; GFX10-NEXT: [[COPY21:%[0-9]+]]:sreg_32(s1) = COPY [[C4]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_5:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY20]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_5:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY6]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_5:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_5]](s1), [[S_AND_B32_5]](s1), implicit-def $scc - ; GFX10-NEXT: [[COPY22:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_5]](s1) - ; GFX10-NEXT: [[S_ANDN2_B32_6:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY19]](s1), $exec_lo, implicit-def $scc - ; GFX10-NEXT: [[S_AND_B32_6:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY21]](s1), implicit-def $scc - ; GFX10-NEXT: [[S_OR_B32_6:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_6]](s1), [[S_AND_B32_6]](s1), implicit-def $scc - ; GFX10-NEXT: [[COPY23:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_6]](s1) - ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY17]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec + ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI8]](s1), %bb.2, implicit-def $exec, implicit-def $scc, implicit $exec ; GFX10-NEXT: G_BR %bb.1 bb.0: successors: %bb.7(0x80000000) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll index 431da1de1fd48..54881f59096c1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.ll @@ -1,6 
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
-; REQUIRES: do-not-run-me
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 
 define void @temporal_divergent_i1_phi(float %val, ptr %addr) {
 ; GFX10-LABEL: temporal_divergent_i1_phi:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
index 64d740287d9fc..9c2d083d0aa1d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-i1.mir
@@ -1,9 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
-
-# Test is updated but copies between S1-register-with-reg-class and
-# register-with-reg-class-no-LLT fail machine verification
-# REQUIRES: do-not-run-me-with-machine-verifier
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
 
 ---
 name: temporal_divergent_i1_phi
@@ -21,37 +17,30 @@ body: |
    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-   ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
    ; GFX10-NEXT: {{ $}}
    ; GFX10-NEXT: bb.1:
    ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
    ; GFX10-NEXT: {{ $}}
-   ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
-   ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
-   ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
-   ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
-   ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI3]](s1)
-   ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+   ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+   ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+   ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-   ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
-   ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
+   ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+   ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
    ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-   ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
-   ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
-   ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY4]](s1), $exec_lo, implicit-def $scc
-   ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY3]](s1), implicit-def $scc
-   ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+   ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+   ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
    ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
    ; GFX10-NEXT: G_BR %bb.2
    ; GFX10-NEXT: {{ $}}
    ; GFX10-NEXT: bb.2:
+   ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[PHI2]](s1), %bb.1
    ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
-   ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
    ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
    ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-   ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]]
+   ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
    ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
    ; GFX10-NEXT: SI_RETURN
  bb.0:
@@ -108,37 +97,30 @@ body: |
    ; GFX10-NEXT: [[MV:%[0-9]+]]:_(p0) = G_MERGE_VALUES [[COPY1]](s32), [[COPY2]](s32)
    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
-   ; GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
    ; GFX10-NEXT: {{ $}}
    ; GFX10-NEXT: bb.1:
    ; GFX10-NEXT: successors: %bb.2(0x04000000), %bb.1(0x7c000000)
    ; GFX10-NEXT: {{ $}}
-   ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32 = PHI [[DEF]](s1), %bb.0, %22(s1), %bb.1
-   ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
-   ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
-   ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
-   ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sreg_32(s1) = COPY [[PHI]]
+   ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %7(s32), %bb.1, [[C1]](s32), %bb.0
+   ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C1]](s32), %bb.0, %9(s32), %bb.1
+   ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s1) = G_PHI [[C]](s1), %bb.0, %11(s1), %bb.1
    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-   ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI3]], [[C2]]
-   ; GFX10-NEXT: [[COPY4:%[0-9]+]]:sreg_32(s1) = COPY [[XOR]](s1)
-   ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI2]](s32)
+   ; GFX10-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[PHI2]], [[C2]]
+   ; GFX10-NEXT: [[UITOFP:%[0-9]+]]:_(s32) = G_UITOFP [[PHI1]](s32)
    ; GFX10-NEXT: [[FCMP:%[0-9]+]]:_(s1) = G_FCMP floatpred(ogt), [[UITOFP]](s32), [[COPY]]
    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
-   ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI2]], [[C3]]
-   ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI1]](s32)
-   ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY3]](s1), $exec_lo, implicit-def $scc
-   ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY4]](s1), implicit-def $scc
-   ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
+   ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C3]]
+   ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[FCMP]](s1), [[PHI]](s32)
    ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
    ; GFX10-NEXT: G_BR %bb.2
    ; GFX10-NEXT: {{ $}}
    ; GFX10-NEXT: bb.2:
+   ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[XOR]](s1), %bb.1
    ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.1
-   ; GFX10-NEXT: [[COPY5:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
    ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI4]](s32)
    ; GFX10-NEXT: [[C4:%[0-9]+]]:_(s32) = G_FCONSTANT float 0.000000e+00
    ; GFX10-NEXT: [[C5:%[0-9]+]]:_(s32) = G_FCONSTANT float 1.000000e+00
-   ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[COPY5]](s1), [[C5]], [[C4]]
+   ; GFX10-NEXT: [[SELECT:%[0-9]+]]:_(s32) = G_SELECT [[PHI3]](s1), [[C5]], [[C4]]
    ; GFX10-NEXT: G_STORE [[SELECT]](s32), [[MV]](p0) :: (store (s32))
    ; GFX10-NEXT: SI_RETURN
  bb.0:
@@ -201,31 +183,20 @@ body: |
    ; GFX10-NEXT: [[MV2:%[0-9]+]]:_(p1) = G_MERGE_VALUES [[COPY5]](s32), [[COPY6]](s32)
    ; GFX10-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; GFX10-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-   ; GFX10-NEXT: [[DEF1:%[0-9]+]]:sreg_32(s1) = IMPLICIT_DEF
-   ; GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = IMPLICIT_DEF
    ; GFX10-NEXT: {{ $}}
    ; GFX10-NEXT: bb.1:
    ; GFX10-NEXT: successors: %bb.3(0x50000000), %bb.5(0x30000000)
    ; GFX10-NEXT: {{ $}}
-   ; GFX10-NEXT: [[PHI:%[0-9]+]]:sreg_32_xm0_xexec = PHI [[DEF2]](s1), %bb.0, %53(s1), %bb.5
-   ; GFX10-NEXT: [[PHI1:%[0-9]+]]:sreg_32 = PHI [[DEF1]](s1), %bb.0, %42, %bb.5
-   ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0
-   ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5
-   ; GFX10-NEXT: [[COPY7:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI]]
-   ; GFX10-NEXT: [[COPY8:%[0-9]+]]:sreg_32(s1) = COPY [[PHI1]]
+   ; GFX10-NEXT: [[PHI:%[0-9]+]]:_(s32) = G_PHI %13(s32), %bb.5, [[C]](s32), %bb.0
+   ; GFX10-NEXT: [[PHI1:%[0-9]+]]:_(s32) = G_PHI [[C]](s32), %bb.0, %15(s32), %bb.5
    ; GFX10-NEXT: [[C1:%[0-9]+]]:_(s1) = G_CONSTANT i1 true
-   ; GFX10-NEXT: [[COPY9:%[0-9]+]]:sreg_32(s1) = COPY [[C1]](s1)
-   ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI3]](s32)
+   ; GFX10-NEXT: [[SEXT:%[0-9]+]]:_(s64) = G_SEXT [[PHI1]](s32)
    ; GFX10-NEXT: [[C2:%[0-9]+]]:_(s32) = G_CONSTANT i32 2
    ; GFX10-NEXT: [[SHL:%[0-9]+]]:_(s64) = G_SHL [[SEXT]], [[C2]](s32)
    ; GFX10-NEXT: [[PTR_ADD:%[0-9]+]]:_(p1) = G_PTR_ADD [[MV1]], [[SHL]](s64)
    ; GFX10-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[PTR_ADD]](p1) :: (load (s32), addrspace 1)
    ; GFX10-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 0
    ; GFX10-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(ne), [[LOAD]](s32), [[C3]]
-   ; GFX10-NEXT: [[S_ANDN2_B32_:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY8]](s1), $exec_lo, implicit-def $scc
-   ; GFX10-NEXT: [[S_AND_B32_:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY9]](s1), implicit-def $scc
-   ; GFX10-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_]](s1), [[S_AND_B32_]](s1), implicit-def $scc
-   ; GFX10-NEXT: [[COPY10:%[0-9]+]]:sreg_32(s1) = COPY [[S_OR_B32_]](s1)
    ; GFX10-NEXT: G_BRCOND [[ICMP]](s1), %bb.3
    ; GFX10-NEXT: G_BR %bb.5
    ; GFX10-NEXT: {{ $}}
@@ -247,12 +218,8 @@ body: |
    ; GFX10-NEXT: [[C7:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
    ; GFX10-NEXT: [[ADD:%[0-9]+]]:_(s32) = G_ADD [[LOAD1]], [[C7]]
    ; GFX10-NEXT: G_STORE [[ADD]](s32), [[PTR_ADD1]](p1) :: (store (s32), addrspace 1)
-   ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI3]], [[C7]]
-   ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI3]](s32), [[COPY2]]
-   ; GFX10-NEXT: [[COPY11:%[0-9]+]]:sreg_32(s1) = COPY [[ICMP1]](s1)
-   ; GFX10-NEXT: [[S_ANDN2_B32_1:%[0-9]+]]:sreg_32(s1) = S_ANDN2_B32 [[COPY10]](s1), $exec_lo, implicit-def $scc
-   ; GFX10-NEXT: [[S_AND_B32_1:%[0-9]+]]:sreg_32(s1) = S_AND_B32 $exec_lo, [[COPY11]](s1), implicit-def $scc
-   ; GFX10-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32(s1) = S_OR_B32 [[S_ANDN2_B32_1]](s1), [[S_AND_B32_1]](s1), implicit-def $scc
+   ; GFX10-NEXT: [[ADD1:%[0-9]+]]:_(s32) = G_ADD [[PHI1]], [[C7]]
+   ; GFX10-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(ult), [[PHI1]](s32), [[COPY2]]
    ; GFX10-NEXT: G_BR %bb.5
    ; GFX10-NEXT: {{ $}}
    ; GFX10-NEXT: bb.4:
@@ -262,25 +229,20 @@ body: |
    ; GFX10-NEXT: bb.5:
    ; GFX10-NEXT: successors: %bb.6(0x04000000), %bb.1(0x7c000000)
    ; GFX10-NEXT: {{ $}}
-   ; GFX10-NEXT: [[PHI4:%[0-9]+]]:sreg_32 = PHI [[S_OR_B32_]](s1), %bb.1, [[S_OR_B32_1]](s1), %bb.3
-   ; GFX10-NEXT: [[PHI5:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
-   ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
-   ; GFX10-NEXT: [[COPY12:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[PHI6]](s1)
-   ; GFX10-NEXT: [[COPY13:%[0-9]+]]:sreg_32(s1) = COPY [[PHI4]]
-   ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[COPY13]](s1), [[PHI2]](s32)
-   ; GFX10-NEXT: [[S_ANDN2_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_ANDN2_B32 [[COPY7]](s1), $exec_lo, implicit-def $scc
-   ; GFX10-NEXT: [[S_AND_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_AND_B32 $exec_lo, [[COPY12]](s1), implicit-def $scc
-   ; GFX10-NEXT: [[S_OR_B32_2:%[0-9]+]]:sreg_32_xm0_xexec(s1) = S_OR_B32 [[S_ANDN2_B32_2]](s1), [[S_AND_B32_2]](s1), implicit-def $scc
+   ; GFX10-NEXT: [[PHI2:%[0-9]+]]:_(s32) = G_PHI [[ADD1]](s32), %bb.3, [[DEF]](s32), %bb.1
+   ; GFX10-NEXT: [[PHI3:%[0-9]+]]:_(s1) = G_PHI [[C5]](s1), %bb.3, [[C1]](s1), %bb.1
+   ; GFX10-NEXT: [[PHI4:%[0-9]+]]:_(s1) = G_PHI [[ICMP1]](s1), %bb.3, [[C1]](s1), %bb.1
+   ; GFX10-NEXT: [[INTRINSIC_CONVERGENT:%[0-9]+]]:sreg_32_xm0_xexec(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.if.break), [[PHI4]](s1), [[PHI]](s32)
    ; GFX10-NEXT: SI_LOOP [[INTRINSIC_CONVERGENT]](s32), %bb.1, implicit-def $exec, implicit-def $scc, implicit $exec
    ; GFX10-NEXT: G_BR %bb.6
    ; GFX10-NEXT: {{ $}}
    ; GFX10-NEXT: bb.6:
    ; GFX10-NEXT: successors: %bb.2(0x40000000), %bb.4(0x40000000)
    ; GFX10-NEXT: {{ $}}
-   ; GFX10-NEXT: [[PHI7:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
-   ; GFX10-NEXT: [[COPY14:%[0-9]+]]:sreg_32_xm0_xexec(s1) = COPY [[S_OR_B32_2]](s1)
-   ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI7]](s32)
-   ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[COPY14]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
+   ; GFX10-NEXT: [[PHI5:%[0-9]+]]:sreg_32_xm0_xexec(s1) = G_PHI [[PHI3]](s1), %bb.5
+   ; GFX10-NEXT: [[PHI6:%[0-9]+]]:_(s32) = G_PHI [[INTRINSIC_CONVERGENT]](s32), %bb.5
+   ; GFX10-NEXT: G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS intrinsic(@llvm.amdgcn.end.cf), [[PHI6]](s32)
+   ; GFX10-NEXT: [[SI_IF:%[0-9]+]]:sreg_32_xm0_xexec(s32) = SI_IF [[PHI5]](s1), %bb.4, implicit-def $exec, implicit-def $scc, implicit $exec
    ; GFX10-NEXT: G_BR %bb.2
  bb.0:
    successors: %bb.1(0x80000000)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
index b21e6a729dbc2..d065f228a4e4a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.ll
@@ -1,5 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3
-; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10 %s
+; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 < %s | FileCheck -check-prefix=GFX10 %s
 
 define void @temporal_divergent_i32(float %val, ptr %addr) {
 ; GFX10-LABEL: temporal_divergent_i32:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
index cd6a0f8297a5a..4cc68029489e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergence-temporal-divergent-reg.mir
@@ -1,5 +1,5 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4
-# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -global-isel -mtriple=amdgcn-mesa-amdpal -mcpu=gfx1010 -run-pass=amdgpu-global-isel-divergence-lowering %s -o - | FileCheck -check-prefix=GFX10 %s
 
 ---
 name: temporal_divergent_i32
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
index 6384c47398fce..25e2267fdee89 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/divergent-control-flow.ll
@@ -1,6 +1,5 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -amdgpu-global-isel-risky-select -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s
-; REQUIRES: do-not-run-me
 
 ; Make sure the branch targets are correct after lowering llvm.amdgcn.if
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
index c7d45f062d0d2..c7b7a84e821f9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-phi.mir
@@ -422,7 +422,7 @@ body: |
    G_BR %bb.2
 
  bb.2:
-    %6:sgpr(s32) = G_PHI %0(s32), %bb.0, %5(s32), %bb.1
+    %6:sgpr(s32) = PHI %0(s32), %bb.0, %5(s32), %bb.1
    $sgpr0 = COPY %6(s32)
    S_SETPC_B64 undef $sgpr30_sgpr31
 
@@ -476,7 +476,7 @@ body: |
    G_BR %bb.2
 
  bb.2:
-    %6:vgpr(s32) = G_PHI %0(s32), %bb.0, %5(s32), %bb.1
+    %6:vgpr(s32) = PHI %0(s32), %bb.0, %5(s32), %bb.1
    $vgpr0 = COPY %6
    S_SETPC_B64 undef $sgpr30_sgpr31
 
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
index 4caf83774bbba..a5482bd5b79a9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll
@@ -5,7 +5,6 @@
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1010 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX10_W64 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 < %s | FileCheck --check-prefix=GFX11_W32 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 < %s | FileCheck --check-prefix=GFX11_W64 %s
-; REQUIRES: do-not-run-me
 
 define float @v_div_fmas_f32(float %a, float %b, float %c, i1 %d) {
 ; GFX7-LABEL: v_div_fmas_f32:

From cfddb59be2124f7ec615f48a2d0395c6fdb1bb56 Mon Sep 17 00:00:00 2001
From: Mariusz Sikora
Date: Wed, 24 Jan 2024 12:21:15 +0100
Subject: [PATCH 771/843] [AMDGPU][GFX12] VOP encoding and codegen - add
 support for v_cvt fp8/… (#78414)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

…bf8 instructions

Add VOP1, VOP1_DPP8, VOP1_DPP16, VOP3, VOP3_DPP8, VOP3_DPP16 instructions
that were supported on GFX940 (MI300):
- V_CVT_F32_FP8
- V_CVT_F32_BF8
- V_CVT_PK_F32_FP8
- V_CVT_PK_F32_BF8
- V_CVT_PK_FP8_F32
- V_CVT_PK_BF8_F32
- V_CVT_SR_FP8_F32
- V_CVT_SR_BF8_F32

---------

Co-authored-by: Mateja Marjanovic
Co-authored-by: Mirko Brkušanin
---
 clang/test/CodeGenOpenCL/amdgpu-features.cl   |   4 +-
 .../test/CodeGenOpenCL/builtins-amdgcn-fp8.cl |  35 +-
 llvm/lib/Target/AMDGPU/AMDGPU.td              |   1 +
 .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp      |  31 +-
 .../Disassembler/AMDGPUDisassembler.cpp       |  26 +
 .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp |  10 +
 llvm/lib/Target/AMDGPU/SIInstrInfo.td         |   7 +-
 .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp    |  11 +
 llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h |   3 +
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |  93 +++-
 llvm/lib/Target/AMDGPU/VOP3Instructions.td    |  53 +-
 llvm/lib/Target/AMDGPU/VOPInstructions.td     |  29 +-
 llvm/lib/TargetParser/TargetParser.cpp        |   1 +
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll | 142 ++++++
 .../AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir        | 197 ++++++++
 .../CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll     | 466 +++++++++++++++---
 llvm/test/MC/AMDGPU/gfx12_asm_vop1.s          |  45 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s    |  12 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s     |  12 +
 llvm/test/MC/AMDGPU/gfx12_asm_vop3.s          |  36 ++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s    | 108 ++++
 llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s     |  48 ++
 .../test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s | 138 ++++++
 .../AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s   |  12 +
 .../MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s |  12 +
 .../Disassembler/AMDGPU/gfx12_dasm_vop1.txt   |  36 ++
 .../AMDGPU/gfx12_dasm_vop1_dpp16.txt          |  12 +
 .../AMDGPU/gfx12_dasm_vop1_dpp8.txt           |  12 +
 .../Disassembler/AMDGPU/gfx12_dasm_vop3.txt   |  36 ++
 .../AMDGPU/gfx12_dasm_vop3_dpp16.txt          | 108 ++++
 .../AMDGPU/gfx12_dasm_vop3_dpp8.txt           |  48 ++
 .../AMDGPU/gfx12_dasm_vop3_from_vop1.txt      |  36 ++
 .../gfx12_dasm_vop3_from_vop1_dpp16.txt       |  12 +
 .../AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt |  12 +
 34 files changed, 1742 insertions(+), 102 deletions(-)
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll
 create mode 100644 llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir

diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index 1ba2b129f6895..9c8ca0bb96f61 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -100,8 +100,8 @@
 // GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 // GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 
 // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
 
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
index 56d757012a5e7..4e3a56b4201bb 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-fp8.cl
@@ -1,59 +1,60 @@
 // REQUIRES: amdgpu-registered-target
-// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX940
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx940 -S -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S -emit-llvm -o - %s | FileCheck %s
 
 typedef float v2f __attribute__((ext_vector_type(2)));
 
-// CHECK-GFX940-LABEL: @test_cvt_f32_bf8
-// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
+// CHECK-LABEL: @test_cvt_f32_bf8
+// CHECK: call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0)
 void test_cvt_f32_bf8(global int* out, int a)
 {
   *out = __builtin_amdgcn_cvt_f32_bf8(a, 0);
 }
 
-// CHECK-GFX940-LABEL: @test_cvt_f32_fp8
-// CHECK-GFX940: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
+// CHECK-LABEL: @test_cvt_f32_fp8
+// CHECK: call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1)
 void test_cvt_f32_fp8(global int* out, int a)
 {
   *out = __builtin_amdgcn_cvt_f32_fp8(a, 1);
}
 
-// CHECK-GFX940-LABEL: @test_cvt_pk_f32_bf8
-// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
+// CHECK-LABEL: @test_cvt_pk_f32_bf8
+// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false)
 void test_cvt_pk_f32_bf8(global v2f* out, int a)
 {
   *out = __builtin_amdgcn_cvt_pk_f32_bf8(a, false);
 }
 
-// CHECK-GFX940-LABEL: @test_cvt_pk_f32_fp8
-// CHECK-GFX940: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
+// CHECK-LABEL: @test_cvt_pk_f32_fp8
+// CHECK: call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true)
 void test_cvt_pk_f32_fp8(global v2f* out, int a)
 {
   *out = __builtin_amdgcn_cvt_pk_f32_fp8(a, true);
 }
 
-// CHECK-GFX940-LABEL: @test_cvt_pk_bf8_f32
-// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
+// CHECK-LABEL: @test_cvt_pk_bf8_f32
+// CHECK: call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %a, float %b, i32 %old, i1 false)
 void test_cvt_pk_bf8_f32(global int* out, int old, float a, float b)
 {
   *out = __builtin_amdgcn_cvt_pk_bf8_f32(a, b, old, false);
 }
 
-// CHECK-GFX940-LABEL: @test_cvt_pk_fp8_f32
-// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
+// CHECK-LABEL: @test_cvt_pk_fp8_f32
+// CHECK: call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %a, float %b, i32 %old, i1 true)
 void test_cvt_pk_fp8_f32(global int* out, int old, float a, float b)
 {
   *out = __builtin_amdgcn_cvt_pk_fp8_f32(a, b, old, true);
 }
 
-// CHECK-GFX940-LABEL: @test_cvt_sr_bf8_f32
-// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
+// CHECK-LABEL: @test_cvt_sr_bf8_f32
+// CHECK: call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %a, i32 %b, i32 %old, i32 2)
 void test_cvt_sr_bf8_f32(global int* out, int old, float a, int b)
 {
   *out = __builtin_amdgcn_cvt_sr_bf8_f32(a, b, old, 2);
 }
 
-// CHECK-GFX940-LABEL: @test_cvt_sr_fp8_f32
-// CHECK-GFX940: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
+// CHECK-LABEL: @test_cvt_sr_fp8_f32
+// CHECK: call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %a, i32 %b, i32 %old, i32 3)
 void test_cvt_sr_fp8_f32(global int* out, int old, float a, int b)
 {
   *out = __builtin_amdgcn_cvt_sr_fp8_f32(a, b, old, 3);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index cb29d5d947598..250e3e350c02e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -1506,6 +1506,7 @@ def FeatureISAVersion12 : FeatureSet<
   FeatureFlatAtomicFaddF32Inst,
   FeatureImageInsts,
   FeatureExtendedImageInsts,
+  FeatureFP8ConversionInsts,
   FeaturePackedTID,
   FeatureVcmpxPermlaneHazard,
   FeatureSALUFloatInsts,
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index 489cf85693edb..6b1d04b04066f 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -3500,6 +3500,9 @@ bool AMDGPUAsmParser::usesConstantBus(const MCInst &Inst, unsigned OpIdx) {
     return !isInlineConstant(Inst, OpIdx);
   } else if (MO.isReg()) {
     auto Reg = MO.getReg();
+    if (!Reg) {
+      return false;
+    }
     const MCRegisterInfo *TRI = getContext().getRegisterInfo();
     auto PReg = mc2PseudoReg(Reg);
     return isSGPR(PReg, TRI) && PReg != SGPR_NULL;
@@ -8303,12 +8306,20 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands,
   const bool IsPacked = (Desc.TSFlags & SIInstrFlags::IsPacked) != 0;
 
   if (Opc == AMDGPU::V_CVT_SR_BF8_F32_vi ||
-      Opc == AMDGPU::V_CVT_SR_FP8_F32_vi) {
+      Opc == AMDGPU::V_CVT_SR_FP8_F32_vi ||
+      Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_gfx12 ||
+      Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_gfx12) {
     Inst.addOperand(MCOperand::createImm(0)); // Placeholder for src2_mods
     Inst.addOperand(Inst.getOperand(0));
   }
 
-  if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in)) {
+  // Adding vdst_in operand is already covered for these DPP instructions in
+  // cvtVOP3DPP.
+  if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::vdst_in) &&
+      !(Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp_gfx12 ||
+        Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp_gfx12 ||
+        Opc == AMDGPU::V_CVT_PK_BF8_F32_e64_dpp8_gfx12 ||
+        Opc == AMDGPU::V_CVT_PK_FP8_F32_e64_dpp8_gfx12)) {
     assert(!IsPacked);
     Inst.addOperand(Inst.getOperand(0));
   }
@@ -8770,6 +8781,22 @@ void AMDGPUAsmParser::cvtVOP3DPP(MCInst &Inst, const OperandVector &Operands,
     }
   }
 
+  int VdstInIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst_in);
+  if (VdstInIdx == static_cast<int>(Inst.getNumOperands())) {
+    Inst.addOperand(Inst.getOperand(0));
+  }
+
+  bool IsVOP3CvtSrDpp = Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
+                        Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12 ||
+                        Opc == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
+                        Opc == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12;
+  if (IsVOP3CvtSrDpp) {
+    if (Src2ModIdx == static_cast<int>(Inst.getNumOperands())) {
+      Inst.addOperand(MCOperand::createImm(0));
+      Inst.addOperand(MCOperand::createReg(0));
+    }
+  }
+
   auto TiedTo = Desc.getOperandConstraint(Inst.getNumOperands(),
                                           MCOI::TIED_TO);
   if (TiedTo != -1) {
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 86096b0d80b42..2c6aa2ee348a1 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -712,6 +712,13 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
                          AMDGPU::OpName::src2_modifiers);
   }
 
+  if (Res && (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp ||
+              MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp)) {
+    // Insert dummy unused src2_modifiers.
+    insertNamedMCOperand(MI, MCOperand::createImm(0),
+                         AMDGPU::OpName::src2_modifiers);
+  }
+
   if (Res && (MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::DS) &&
       !AMDGPU::hasGDS(STI)) {
     insertNamedMCOperand(MI, MCOperand::createImm(0), AMDGPU::OpName::gds);
@@ -942,6 +949,7 @@ void AMDGPUDisassembler::convertMacDPPInst(MCInst &MI) const {
 // first add optional MI operands to check FI
 DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
   unsigned Opc = MI.getOpcode();
+
   if (MCII->get(Opc).TSFlags & SIInstrFlags::VOP3P) {
     convertVOP3PDPPInst(MI);
   } else if ((MCII->get(Opc).TSFlags & SIInstrFlags::VOPC) ||
@@ -951,6 +959,15 @@ DecodeStatus AMDGPUDisassembler::convertDPP8Inst(MCInst &MI) const {
     if (isMacDPP(MI))
       convertMacDPPInst(MI);
 
+    int VDstInIdx =
+        AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
+    if (VDstInIdx != -1)
+      insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
+
+    if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp8_gfx12 ||
+        MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp8_gfx12)
+      insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
+
     unsigned DescNumOps = MCII->get(Opc).getNumOperands();
     if (MI.getNumOperands() < DescNumOps &&
         AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::op_sel)) {
@@ -977,6 +994,15 @@ DecodeStatus AMDGPUDisassembler::convertVOP3DPPInst(MCInst &MI) const {
   if (isMacDPP(MI))
     convertMacDPPInst(MI);
 
+  int VDstInIdx =
+      AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::vdst_in);
+  if (VDstInIdx != -1)
+    insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::vdst_in);
+
+  if (MI.getOpcode() == AMDGPU::V_CVT_SR_BF8_F32_e64_dpp_gfx12 ||
+      MI.getOpcode() == AMDGPU::V_CVT_SR_FP8_F32_e64_dpp_gfx12)
+    insertNamedMCOperand(MI, MI.getOperand(0), AMDGPU::OpName::src2);
+
   unsigned Opc = MI.getOpcode();
   unsigned DescNumOps = MCII->get(Opc).getNumOperands();
   if (MI.getNumOperands() < DescNumOps &&
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index e73e53aa270f9..9e64e3fd79576 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -1305,6 +1305,16 @@ void AMDGPUInstPrinter::printOpSel(const MCInst *MI, unsigned,
                                    const MCSubtargetInfo &STI,
                                    raw_ostream &O) {
   unsigned Opc = MI->getOpcode();
+  if (isCvt_F32_Fp8_Bf8_e64(Opc)) {
+    auto SrcMod =
+        AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
+    unsigned Mod = MI->getOperand(SrcMod).getImm();
+    unsigned Index0 = !!(Mod & SISrcMods::OP_SEL_0);
+    unsigned Index1 = !!(Mod & SISrcMods::OP_SEL_1);
+    if (Index0 || Index1)
+      O << " op_sel:[" << Index0 << ',' << Index1 << ']';
+    return;
+  }
   if (isPermlane16(Opc)) {
     auto FIN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0_modifiers);
     auto BCN = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src1_modifiers);
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index a6820544f4b4d..4ab840a4c8485 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -1684,8 +1684,9 @@ class getIns64 _ArgVT, bit _EnableClamp = 0> {
   field bit IsSingle = 0;
   field bit IsWMMA = 0;
+  field bit IsFP8 = 0;
+
   field bit HasDst = !ne(DstVT.Value, untyped.Value);
   field bit HasDst32 = HasDst;
   field bit EmitDst = HasDst; // force dst encoding, see v_movreld_b32 special case
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 0bf9452d822e9..106fdb19f2789 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -529,6 +529,17 @@ bool isPermlane16(unsigned Opc) {
          Opc == AMDGPU::V_PERMLANEX16_VAR_B32_e64_gfx12;
 }
 
+bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc) {
+  return Opc == AMDGPU::V_CVT_F32_BF8_e64_gfx12 ||
+         Opc == AMDGPU::V_CVT_F32_FP8_e64_gfx12 ||
+         Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp_gfx12 ||
+         Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp_gfx12 ||
+         Opc == AMDGPU::V_CVT_F32_BF8_e64_dpp8_gfx12 ||
+         Opc == AMDGPU::V_CVT_F32_FP8_e64_dpp8_gfx12 ||
+         Opc == AMDGPU::V_CVT_PK_F32_BF8_e64_gfx12 ||
+         Opc == AMDGPU::V_CVT_PK_F32_FP8_e64_gfx12;
+}
+
 bool isGenericAtomic(unsigned Opc) {
   return Opc == AMDGPU::G_AMDGPU_ATOMIC_FMIN ||
          Opc == AMDGPU::G_AMDGPU_ATOMIC_FMAX ||
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index d3f55c7920174..11b0bc5c81711 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -535,6 +535,9 @@ bool isPermlane16(unsigned Opc);
 LLVM_READNONE
 bool isGenericAtomic(unsigned Opc);
 
+LLVM_READNONE
+bool isCvt_F32_Fp8_Bf8_e64(unsigned Opc);
+
 namespace VOPD {
 
 enum Component : unsigned {
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index 95a1d86963473..ef652fce65482 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -571,6 +571,7 @@ let SubtargetPredicate = isGFX9Only in {
 } // End SubtargetPredicate = isGFX9Only
 
 class VOPProfile_Base_CVT_F32_F8 : VOPProfileI2F {
+  let HasExtDPP = 1;
   let HasExtSDWA = 1;
   let HasExtSDWA9 = 1;
   let HasExt = 1;
@@ -599,6 +600,7 @@ class Cvt_F32_F8_Pat
 ;
 
+let SubtargetPredicate = isGFX9Only in {
 let OtherPredicates = [HasCvtFP8VOP1Bug] in {
   def : GCNPat<(f32 (int_amdgcn_cvt_f32_fp8 i32:$src, 0)),
                (V_CVT_F32_FP8_sdwa 0, $src, 0, 0, 0)>;
@@ -617,6 +619,7 @@ foreach Index = [1, 2, 3] in {
   def : Cvt_F32_F8_Pat;
   def : Cvt_F32_F8_Pat;
 }
+} // End SubtargetPredicate = isGFX9Only
 
 class Cvt_PK_F32_F8_Pat : GCNPat<
@@ -626,11 +629,77 @@ class Cvt_PK_F32_F8_Pat
 ;
 
-foreach Index = [0, -1] in {
-  def : Cvt_PK_F32_F8_Pat;
-  def : Cvt_PK_F32_F8_Pat;
+let SubtargetPredicate = isGFX9Only in {
+  foreach Index = [0, -1] in {
+    def : Cvt_PK_F32_F8_Pat;
+    def : Cvt_PK_F32_F8_Pat;
+  }
+}
+
+
+// Similar to VOPProfile_Base_CVT_F32_F8, but for VOP3 instructions.
+def VOPProfile_Base_CVT_PK_F32_F8_OpSel : VOPProfileI2F {
+  let HasOpSel = 1;
+  let HasExtVOP3DPP = 0;
+}
+
+def VOPProfile_Base_CVT_F32_F8_OpSel : VOPProfile<[f32, i32, untyped, untyped]> {
+  let HasOpSel = 1;
+  let HasExtDPP = 1;
+  let HasExtVOP3DPP = 1;
+  let IsFP8 = 1;
+  let HasClamp = 0;
+  let HasOMod = 0;
+  let HasModifiers = 1;
+  let Src1VOP3DPP = Src1RC64;
+}
+
+let SubtargetPredicate = isGFX12Plus, mayRaiseFPException = 0,
+    SchedRW = [WriteFloatCvt] in {
+  defm V_CVT_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_f32_fp8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
+  defm V_CVT_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_f32_bf8_op_sel", VOPProfile_Base_CVT_F32_F8_OpSel>;
+  defm V_CVT_PK_F32_FP8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_fp8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
+  defm V_CVT_PK_F32_BF8_OP_SEL : VOP1Inst<"v_cvt_pk_f32_bf8_op_sel", VOPProfile_Base_CVT_PK_F32_F8_OpSel>;
+}
+
+class Cvt_F32_F8_Pat_OpSel index,
+    VOP1_Pseudo inst_e32, VOP3_Pseudo inst_e64> : GCNPat<
+  (f32 (node i32:$src, index)),
+  !if (index,
+       (inst_e64 !if(index{0},
+                     !if(index{1}, !or(SRCMODS.OP_SEL_0, SRCMODS.OP_SEL_1),
+                         SRCMODS.OP_SEL_0),
+                     !if(index{1}, SRCMODS.OP_SEL_1, 0)),
+                 $src, 0),
+       (inst_e32 $src))
+>;
+
+let SubtargetPredicate = isGFX12Plus in {
+  foreach Index = [0, 1, 2, 3] in {
+    def : Cvt_F32_F8_Pat_OpSel;
+    def : Cvt_F32_F8_Pat_OpSel;
+  }
+}
+
+class Cvt_PK_F32_F8_Pat_OpSel : GCNPat<
+  (v2f32 (node i32:$src, index)),
+  !if (index,
+       (inst_e64 SRCMODS.OP_SEL_0, $src, 0, 0, SRCMODS.NONE),
+       (inst_e32 $src))
+>;
+
+let SubtargetPredicate = isGFX12Plus in {
+  foreach Index = [0, -1] in {
+    def : Cvt_PK_F32_F8_Pat_OpSel;
+    def : Cvt_PK_F32_F8_Pat_OpSel;
+  }
+}
 
 let SubtargetPredicate = isGFX10Plus in {
@@ -853,6 +922,20 @@ multiclass VOP1_Real_NO_DPP_OP_SEL_with_name op,
     VOP3_Real_with_name;
 
+// Define VOP1 instructions using the pseudo instruction with its old profile and
+// VOP3 using the OpSel profile for the pseudo instruction.
+defm V_CVT_F32_FP8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06c, "V_CVT_F32_FP8", "v_cvt_f32_fp8">;
+defm V_CVT_F32_FP8 : VOP1_Realtriple_e64_with_name;
+
+defm V_CVT_F32_BF8 : VOP1_Real_NO_VOP3_with_name_gfx12<0x06d, "V_CVT_F32_BF8", "v_cvt_f32_bf8">;
+defm V_CVT_F32_BF8 : VOP1_Realtriple_e64_with_name;
+
+defm V_CVT_PK_F32_FP8 : VOP1_Real_e32_with_name;
+defm V_CVT_PK_F32_FP8 : VOP3_Real_with_name;
+
+defm V_CVT_PK_F32_BF8 : VOP1_Real_e32_with_name;
+defm V_CVT_PK_F32_BF8 : VOP3_Real_with_name;
+
 defm V_CVT_NEAREST_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00c,
   "V_CVT_RPI_I32_F32", "v_cvt_nearest_i32_f32">;
 defm V_CVT_FLOOR_I32_F32 : VOP1_Real_FULL_with_name_gfx11_gfx12<0x00d,
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 713b4712d563c..14db522102148 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -520,8 +520,26 @@ def VOP3_CVT_PK_F8_F32_Profile : VOP3_Profile {
   let InsVOP3OpSel = (ins FP32InputMods:$src0_modifiers, Src0RC64:$src0,
                           FP32InputMods:$src1_modifiers, Src1RC64:$src1,
                           VGPR_32:$vdst_in, op_sel0:$op_sel);
+  let InsVOP3DPP = (ins VGPR_32:$old,
+                        FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
+                        FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
+                        VGPR_32:$vdst_in, op_sel0:$op_sel,
+                        dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                        bank_mask:$bank_mask, bound_ctrl:$bound_ctrl);
+
+  let InsVOP3DPP16 = (ins VGPR_32:$old,
+                          FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
+                          FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
+                          VGPR_32:$vdst_in, op_sel0:$op_sel,
+                          dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                          bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi);
+  let InsVOP3DPP8 = (ins VGPR_32:$old,
+                         FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
+                         FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
+                         VGPR_32:$vdst_in, op_sel0:$op_sel, dpp8:$dpp8, FI:$fi);
+
   let HasClamp = 0;
-  let HasExtVOP3DPP = 0;
+  let HasExtVOP3DPP = 1;
 }
 
 def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile,
@@ -530,14 +548,36 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile,
                     FP32InputMods:$src1_modifiers, Src1RC64:$src1,
                     FP32InputMods:$src2_modifiers, VGPR_32:$src2,
                     op_sel0:$op_sel);
+  let InsVOP3DPP16 = (ins VGPR_32:$old,
+                          FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
+                          FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
+                          FP32InputMods:$src2_modifiers, VGPR_32:$src2,
+                          op_sel0:$op_sel, dpp_ctrl:$dpp_ctrl, row_mask:$row_mask,
+                          bank_mask:$bank_mask, bound_ctrl:$bound_ctrl, FI:$fi);
+  let InsVOP3DPP8 = (ins VGPR_32:$old,
+                         FP32InputMods:$src0_modifiers, Src0VOP3DPP:$src0,
+                         FP32InputMods:$src1_modifiers, Src1VOP3DPP:$src1,
+                         FP32InputMods:$src2_modifiers, VGPR_32:$src2,
+                         op_sel0:$op_sel, dpp8:$dpp8, FI:$fi);
   let HasClamp = 0;
   let HasSrc2 = 0;
   let HasSrc2Mods = 1;
+  let HasExtVOP3DPP = 1;
+  let HasOpSel = 1;
   let AsmVOP3OpSel = !subst(", $src2_modifiers", "",
                             getAsmVOP3OpSel<3, HasClamp, HasOMod,
                                             HasSrc0FloatMods, HasSrc1FloatMods,
                                             HasSrc2FloatMods>.ret);
-  let HasExtVOP3DPP = 0;
+  let AsmVOP3DPP16 = !subst(", $src2_modifiers", "",
+                            getAsmVOP3DPP16.ret>.ret);
+  let AsmVOP3DPP8 = !subst(", $src2_modifiers", "",
+                           getAsmVOP3DPP8.ret>.ret);
 }
 
 def IsPow2Plus1: PatLeaf<(i32 imm), [{
@@ -618,13 +658,13 @@ let SubtargetPredicate = HasFP8ConversionInsts, mayRaiseFPException = 0,
 class Cvt_PK_F8_F32_Pat : GCNPat<
     (i32 (node f32:$src0, f32:$src1, i32:$old, index)),
-    (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, !if(index, SRCMODS.OP_SEL_0, 0))
+    (inst !if(index, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1, $old, 0)
 >;
 
 class Cvt_SR_F8_F32_Pat index, VOP3_Pseudo inst> : GCNPat<
     (i32 (node f32:$src0, i32:$src1, i32:$old, index)),
     (inst !if(index{1}, SRCMODS.DST_OP_SEL, 0), $src0, 0, $src1,
-          !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, !if(index{1}, SRCMODS.OP_SEL_0, 0))
+          !if(index{0}, SRCMODS.OP_SEL_0, 0), $old, 0)
 >;
 
 foreach Index = [0, -1] in {
@@ -998,6 +1038,11 @@ defm V_MAXIMUM_F16 : VOP3Only_Realtriple_t16_gfx12<0x368>;
 defm V_PERMLANE16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x30f>;
 defm V_PERMLANEX16_VAR_B32 : VOP3Only_Real_Base_gfx12<0x310>;
 
+defm V_CVT_PK_FP8_F32 : VOP3Only_Realtriple_gfx12<0x369>;
+defm V_CVT_PK_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36a>;
+defm V_CVT_SR_FP8_F32 : VOP3Only_Realtriple_gfx12<0x36b>;
+defm V_CVT_SR_BF8_F32 : VOP3Only_Realtriple_gfx12<0x36c>;
+
 //===----------------------------------------------------------------------===//
 // GFX11, GFX12
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index df505c3365cbd..8b6d1ddba595e 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -305,6 +305,11 @@ class VOP3OpSel_gfx10 op, VOPProfile p> : VOP3e_gfx10 {
 
 class VOP3OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx10;
 
+class VOP3FP8OpSel_gfx11_gfx12 op, VOPProfile p> : VOP3e_gfx10 {
+  let Inst{11} = !if(p.HasSrc0, src0_modifiers{2}, 0);
+  let Inst{12} = !if(p.HasSrc0, src0_modifiers{3}, 0);
+}
+
 class VOP3DotOpSel_gfx11_gfx12 op, VOPProfile p> : VOP3OpSel_gfx11_gfx12{
   let Inst{11} = ?;
   let Inst{12} = ?;
@@ -738,7 +743,7 @@ class VOP3_DPPe_Common_Base op, VOPProfile P> : Enc96 {
   let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0);
   // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs.
let Inst{11} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{2}, 0),?); - let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, 0),?); + let Inst{12} = !if(P.HasOpSel,!if(P.HasSrc1Mods, src1_modifiers{2}, !if((P.IsFP8), src0_modifiers{3}, 0)), ?); let Inst{13} = !if(P.HasOpSel,!if(P.HasSrc2Mods, src2_modifiers{2}, 0),?); let Inst{14} = !if(P.HasOpSel,!if(P.HasSrc0Mods, src0_modifiers{3}, 0),?); let Inst{15} = !if(P.HasClamp, clamp, 0); @@ -1406,14 +1411,20 @@ multiclass VOP3_Real_with_name op, string opName, defvar ps = !cast(opName#"_e64"); let AsmString = asmName # ps.AsmOperands, IsSingle = !or(isSingle, ps.Pfl.IsSingle) in { - if ps.Pfl.HasOpSel then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3OpSel_gfx11_gfx12; - if !not(ps.Pfl.HasOpSel) then - def _e64#Gen.Suffix : - VOP3_Real_Gen, - VOP3e_gfx11_gfx12; + if ps.Pfl.IsFP8 then { + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3FP8OpSel_gfx11_gfx12; + } else { + if ps.Pfl.HasOpSel then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3OpSel_gfx11_gfx12; + if !not(ps.Pfl.HasOpSel) then + def _e64#Gen.Suffix : + VOP3_Real_Gen, + VOP3e_gfx11_gfx12; + } } def Gen.Suffix#"_VOP3_alias" : MnemonicAlias, Requires<[Gen.AssemblerPredicate]>, LetDummies; } diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp index 3cbe974ff3142..20f324604aa52 100644 --- a/llvm/lib/TargetParser/TargetParser.cpp +++ b/llvm/lib/TargetParser/TargetParser.cpp @@ -294,6 +294,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T, Features["gfx12-insts"] = true; Features["atomic-fadd-rtn-insts"] = true; Features["image-insts"] = true; + Features["fp8-conversion-insts"] = true; break; case GK_GFX1151: case GK_GFX1150: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll new file mode 100644 index 0000000000000..f49fec60892cd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s + +define amdgpu_cs float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_f32_bf8_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 0) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 1) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 
@llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %tmp0, i32 2) + ret float %ret +} + +define amdgpu_cs float @test_cvt_f32_fp8_byte3(i32 %a) { +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: ; return to shader part epilog + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %tmp0, i32 3) + ret float %ret +} + +define amdgpu_cs void @test_cvt_pk_bf8_f32_word0(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_pk_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %tmp1, float %y, i32 %old, i1 false) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_pk_fp8_f32_word1(i32 %a, float %y, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %tmp1, float %y, i32 %old, i1 true) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_bf8_f32_byte0(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_cvt_sr_bf8_f32_e64_dpp v2, v0, v1 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %tmp1, i32 %r, i32 %old, i32 0) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte1(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, 
i32 %old, i32 1) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +define amdgpu_cs void @test_cvt_sr_fp8_f32_byte2(i32 %a, i32 %r, i32 %old, ptr addrspace(1) %out) { +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: global_store_b32 v[3:4], v2, off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm + %tmp0 = call i32 @llvm.amdgcn.mov.dpp.i32(i32 %a, i32 228, i32 15, i32 15, i1 1) + %tmp1 = bitcast i32 %tmp0 to float + %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %tmp1, i32 %r, i32 %old, i32 2) + store i32 %ret, ptr addrspace(1) %out + ret void +} + +declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) +declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) +declare i32 @llvm.amdgcn.cvt.pk.bf8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1) +declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) +declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) + +declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 +declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 + +attributes #0 = { nounwind convergent } +attributes #1 = { nounwind readnone convergent } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir new file mode 100644 index 0000000000000..d11fb27640ee7 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.dpp.mir @@ -0,0 +1,197 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass=gcn-dpp-combine %s -o - | FileCheck -check-prefix=GFX12 %s + +--- +name: test_cvt_f32_bf8_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte0 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_F32_BF8_dpp:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_dpp [[DEF]], [[COPY]], 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_dpp]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_e32 killed %1, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_f32_bf8_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_bf8_byte2 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_BF8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_BF8_OP_SEL_e64_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_BF8_OP_SEL_e64 8, killed %1, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... 
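
As a reading aid for the MIR cases above and below, here is a minimal sketch (Python, illustrative only, not part of the patch; helper name is ad hoc) of how a 0-3 source byte index for v_cvt_f32_{fp8,bf8} turns into src0_modifiers op_sel bits. It assumes OP_SEL_0 = 1 << 2 and OP_SEL_1 = 1 << 3, mirroring SRCMODS in SIInstrInfo.td; the MIR encodes byte 2 as modifier value 8 and byte 3 as 12, consistent with this mapping.

OP_SEL_0 = 1 << 2  # src0_modifiers{2}, becomes Inst{11} in VOP3FP8OpSel_gfx11_gfx12
OP_SEL_1 = 1 << 3  # src0_modifiers{3}, becomes Inst{12} in VOP3FP8OpSel_gfx11_gfx12

def src0_mods_for_byte(index: int) -> int:
    """Return the src0_modifiers immediate that selects source byte `index`."""
    assert 0 <= index <= 3
    return (OP_SEL_0 if index & 1 else 0) | (OP_SEL_1 if index & 2 else 0)

# Matches the MIR above: byte 2 encodes as 8, byte 3 as 12.
assert src0_mods_for_byte(2) == 8
assert src0_mods_for_byte(3) == 12
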
+--- +name: test_cvt_f32_fp8_byte3 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0 + + ; GFX12-LABEL: name: test_cvt_f32_fp8_byte3 + ; GFX12: liveins: $vgpr0 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY]], [[COPY]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_F32_FP8_OP_SEL_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed [[V_MOV_B32_dpp]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: $vgpr0 = COPY [[V_CVT_F32_FP8_OP_SEL_e64_]] + ; GFX12-NEXT: SI_RETURN_TO_EPILOG $vgpr0 + %0:vgpr_32 = COPY $vgpr0 + %1:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %2:vgpr_32 = V_CVT_F32_FP8_OP_SEL_e64 12, killed %1, 0, implicit $mode, implicit $exec + $vgpr0 = COPY %2 + SI_RETURN_TO_EPILOG $vgpr0 + +... +--- +name: test_cvt_pk_bf8_f32_word0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_bf8_f32_word0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_PK_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_PK_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_BF8_F32_e64 0, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
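
One pattern worth noting in these MIR cases: gcn-dpp-combine folds the V_MOV_B32_dpp into the convert only when the byte/word select is 0; once any op_sel modifier bit is set, the mov survives and the *_OP_SEL_e64 form carries the select. A rough predicate for that behavior, inferred from the checks here rather than from the pass source (illustrative Python, ad hoc name):

def can_fold_dpp_mov(src0_modifiers: int) -> bool:
    """True when no op_sel bits are set, i.e. byte/word select is 0."""
    OP_SEL_MASK = (1 << 2) | (1 << 3)
    return (src0_modifiers & OP_SEL_MASK) == 0
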
+--- +name: test_cvt_pk_fp8_f32_word1 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_pk_fp8_f32_word1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY4]], [[COPY4]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_PK_FP8_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_PK_FP8_F32_e64 8, killed [[V_MOV_B32_dpp]], 0, [[COPY3]], [[COPY2]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_PK_FP8_F32_e64_]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_PK_FP8_F32_e64 8, killed %6, 0, %1, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... +--- +name: test_cvt_sr_bf8_f32_byte0 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_bf8_f32_byte0 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF + ; GFX12-NEXT: [[V_CVT_SR_BF8_F32_e64_dpp:%[0-9]+]]:vgpr_32 = V_CVT_SR_BF8_F32_e64_dpp [[DEF]], 0, [[COPY4]], 0, [[COPY3]], 0, [[COPY2]], 0, 228, 15, 15, 1, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_BF8_F32_e64_dpp]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_BF8_F32_e64 0, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... 
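
For readers decoding the recurring `228` in these tests: it is the DPP control word for the identity quad_perm [0,1,2,3], packed two bits per lane. A quick check (illustrative Python, not part of the patch):

def quad_perm_ctrl(sel: list) -> int:
    """Pack a 4-lane quad_perm selector into the 8-bit dpp_ctrl value."""
    assert len(sel) == 4 and all(0 <= s <= 3 for s in sel)
    return sel[0] | (sel[1] << 2) | (sel[2] << 4) | (sel[3] << 6)

assert quad_perm_ctrl([0, 1, 2, 3]) == 228   # 0xe4, the identity permute used here
assert quad_perm_ctrl([3, 2, 1, 0]) == 0x1b  # reversed quad, as in the MC tests below
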
+--- +name: test_cvt_sr_fp8_f32_byte2 +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + + ; GFX12-LABEL: name: test_cvt_sr_fp8_f32_byte2 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr4 + ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX12-NEXT: [[V_MOV_B32_dpp:%[0-9]+]]:vgpr_32 = V_MOV_B32_dpp [[COPY4]], [[COPY4]], 228, 15, 15, -1, implicit $exec + ; GFX12-NEXT: [[V_CVT_SR_FP8_F32_e64_:%[0-9]+]]:vgpr_32 = V_CVT_SR_FP8_F32_e64 8, killed [[V_MOV_B32_dpp]], 0, [[COPY3]], 0, [[COPY2]], 0, implicit $mode, implicit $exec + ; GFX12-NEXT: GLOBAL_STORE_DWORD [[REG_SEQUENCE]], killed [[V_CVT_SR_FP8_F32_e64_]], 0, 0, implicit $exec + ; GFX12-NEXT: S_ENDPGM 0 + %4:vgpr_32 = COPY $vgpr4 + %3:vgpr_32 = COPY $vgpr3 + %2:vgpr_32 = COPY $vgpr2 + %1:vgpr_32 = COPY $vgpr1 + %0:vgpr_32 = COPY $vgpr0 + %11:vreg_64 = REG_SEQUENCE %3, %subreg.sub0, %4, %subreg.sub1 + %6:vgpr_32 = V_MOV_B32_dpp %0, %0, 228, 15, 15, -1, implicit $exec + %7:vgpr_32 = V_CVT_SR_FP8_F32_e64 8, killed %6, 0, %1, 0, %2, 0, implicit $mode, implicit $exec + GLOBAL_STORE_DWORD %11, killed %7, 0, 0, implicit $exec + S_ENDPGM 0 + +... diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll index 26d0d702d99db..17b1fcf865e94 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.fp8.ll @@ -1,4 +1,6 @@ -; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX940 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX12 %s declare float @llvm.amdgcn.cvt.f32.bf8(i32, i32) declare float @llvm.amdgcn.cvt.f32.fp8(i32, i32) @@ -9,182 +11,524 @@ declare i32 @llvm.amdgcn.cvt.pk.fp8.f32(float, float, i32, i1) declare i32 @llvm.amdgcn.cvt.sr.bf8.f32(float, i32, i32, i32) declare i32 @llvm.amdgcn.cvt.sr.fp8.f32(float, i32, i32, i32) -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte0: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_bf8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte1: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_bf8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 
src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte2: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 define float @test_cvt_f32_bf8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_bf8_byte3: -; GCN: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_bf8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_bf8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_bf8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_bf8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_bf8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.bf8(i32 %a, i32 3) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte0: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0{{$}} define float @test_cvt_f32_fp8_byte0(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e32 v0, v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 0) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte1: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 define float @test_cvt_f32_fp8_byte1(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 1) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte2: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 
src0_sel:BYTE_2 define float @test_cvt_f32_fp8_byte2(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[0,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 2) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_f32_fp8_byte3: -; GCN: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 define float @test_cvt_f32_fp8_byte3(i32 %a) { +; GFX940-LABEL: test_cvt_f32_fp8_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_f32_fp8_sdwa v0, v0 src0_sel:BYTE_3 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_f32_fp8_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_f32_fp8_e64 v0, v0 op_sel:[1,1] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call float @llvm.amdgcn.cvt.f32.fp8(i32 %a, i32 3) ret float %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word0: -; GCN: v_cvt_pk_f32_bf8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_bf8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_bf8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_bf8_word1: -; GCN: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_bf8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_bf8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_bf8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_bf8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.bf8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word0: -; GCN: v_cvt_pk_f32_fp8_e32 v[0:1], v0{{$}} define <2 x float> @test_cvt_pk_f32_fp8_word0(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; 
GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_fp8_e32 v[0:1], v0 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 false) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_f32_fp8_word1: -; GCN: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 define <2 x float> @test_cvt_pk_f32_fp8_word1(i32 %a) { +; GFX940-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_f32_fp8_sdwa v[0:1], v0 src0_sel:WORD_1 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_f32_fp8_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_f32_fp8_e64 v[0:1], v0 op_sel:[1,0] +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call <2 x float> @llvm.amdgcn.cvt.pk.f32.fp8(i32 %a, i1 true) ret <2 x float> %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word0: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_bf8_f32_word1: -; GCN: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_bf8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_bf8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_bf8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.bf8.f32(float %x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word0: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word0(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: 
s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 false) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_pk_fp8_f32_word1: -; GCN: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_pk_fp8_f32_word1(float %x, float %y, i32 %old) { +; GFX940-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_pk_fp8_f32_word1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_pk_fp8_f32 v2, v0, v1 op_sel:[0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.pk.fp8.f32(float %x, float %y, i32 %old, i1 true) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte0: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte1: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte2: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte2(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_bf8_f32_byte3: -; GCN: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_bf8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_bf8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_bf8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.bf8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte0: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1{{$}} -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte0(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte0: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 0) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte1: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte1(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte1: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,0] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 1) ret i32 %ret } 
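
The destination-select convention for the stochastic converts shows up directly in the op_sel syntax checked above: v_cvt_sr_* carries the destination byte in op_sel[2] (low bit) and op_sel[3] (high bit), while v_cvt_pk_* carries the destination word in op_sel[2] alone. A small consistency check against these assertions (illustrative Python, ad hoc name):

def sr_dst_op_sel(byte_index: int) -> list:
    """op_sel fields printed for the v_cvt_sr_{fp8,bf8}_f32 dst byte select."""
    assert 0 <= byte_index <= 3
    return [0, 0, byte_index & 1, (byte_index >> 1) & 1]

assert sr_dst_op_sel(1) == [0, 0, 1, 0]
assert sr_dst_op_sel(2) == [0, 0, 0, 1]
assert sr_dst_op_sel(3) == [0, 0, 1, 1]
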
-; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte2: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte2(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte2: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,0,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 2) ret i32 %ret } -; GCN-LABEL: {{^}}test_cvt_sr_fp8_f32_byte3: -; GCN: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] -; GCN: v_mov_b32_e32 v0, v2 define i32 @test_cvt_sr_fp8_f32_byte3(float %x, i32 %r, i32 %old) { +; GFX940-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX940: ; %bb.0: +; GFX940-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX940-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX940-NEXT: s_nop 0 +; GFX940-NEXT: v_mov_b32_e32 v0, v2 +; GFX940-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: test_cvt_sr_fp8_f32_byte3: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_cvt_sr_fp8_f32 v2, v0, v1 op_sel:[0,0,1,1] +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v0, v2 +; GFX12-NEXT: s_setpc_b64 s[30:31] %ret = tail call i32 @llvm.amdgcn.cvt.sr.fp8.f32(float %x, i32 %r, i32 %old, i32 3) ret i32 %ret } diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s index 35411ee0ba2a5..c9c4fceffaeb0 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1.s @@ -397,6 +397,51 @@ v_ctz_i32_b32 v5, src_scc v_ctz_i32_b32 v255, 0xaf123456 // GFX12: encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e32 v1, s3 +// GFX12: encoding: [0x03,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, 3 +// GFX12: encoding: [0x83,0xda,0x02,0x7e] + +v_cvt_f32_bf8_e32 v1, v3 +// GFX12: encoding: [0x03,0xdb,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, s3 +// GFX12: encoding: [0x03,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, 3 +// GFX12: encoding: [0x83,0xd8,0x02,0x7e] + +v_cvt_f32_fp8_e32 v1, v3 +// GFX12: encoding: [0x03,0xd9,0x02,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], s5 +// GFX12: encoding: [0x05,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xde,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], 3 +// GFX12: encoding: [0x83,0xde,0x06,0x7e] + +v_cvt_pk_f32_bf8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdf,0x04,0x7e] + +v_cvt_pk_f32_bf8_e32 v[3:4], v3 +// GFX12: encoding: [0x03,0xdf,0x06,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], s3 +// GFX12: encoding: [0x03,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], 3 +// GFX12: encoding: [0x83,0xdc,0x04,0x7e] + +v_cvt_pk_f32_fp8_e32 v[2:3], v3 +// GFX12: encoding: [0x03,0xdd,0x04,0x7e] + v_cvt_f16_f32 v5, v1 // GFX12: encoding: [0x01,0x15,0x0a,0x7e] diff --git 
a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s index dd6afb28c396a..5e0e1b688bc58 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp16.s @@ -337,6 +337,18 @@ v_ctz_i32_b32 v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 v_ctz_i32_b32 v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x05,0x30] +v_cvt_f32_fp8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_fp8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] + +v_cvt_f32_bf8 v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] + +v_cvt_f32_bf8 v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe +// GFX12: encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] + v_cvt_f16_f32 v5, v1 quad_perm:[3,2,1,0] // GFX12: encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s index 6530de0268456..36c89710ce8f8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop1_dpp8.s @@ -73,6 +73,18 @@ v_ctz_i32_b32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ctz_i32_b32 v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: encoding: [0xe9,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8 v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8 v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] + v_cvt_f16_f32 v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index cf3f9c45bdcc8..beb57999b855e 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -1099,6 +1099,42 @@ v_cubetc_f32 v5, -src_scc, |vcc_lo|, -1 mul:4 v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 // GFX12: encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] +v_cvt_pk_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_fp8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_fp8_f32 v1, s2, 3 +// GFX12: encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_pk_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_pk_bf8_f32 v1, -v2, |v3| +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] + +v_cvt_pk_bf8_f32 v1, s2, 3 +// GFX12: encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] + +v_cvt_sr_fp8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_fp8_f32 v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] + +v_cvt_sr_fp8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] + +v_cvt_sr_bf8_f32 v1, v2, v3 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] + +v_cvt_sr_bf8_f32 v10, s2, v5 +// GFX12: encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] + 
+v_cvt_sr_bf8_f32 v5, -|v255|, v4 +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] + v_cvt_pk_i16_f32 v5, v1, v2 // GFX12: encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index 26f63102df950..df3430f376f69 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -1015,6 +1015,114 @@ v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 row_xmask:0 row_mask:0x1 bank_m v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x05,0x30] +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,1,2,3] +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: 
[0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] + +v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] + +v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 +// GFX12: encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index 
de294b1ff2a22..09dd6df618c5b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -570,6 +570,54 @@ v_cubetc_f32_e64_dpp v5, v1, -|v2|, -|0.5| mul:4 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x87,0x0e,0xd6,0xe9,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] +v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] +// GFX12: encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] + +v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] + +v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + +v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] + +v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] +// GFX12: encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] + v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s index e35bb63290672..7ee60262a5c1b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1.s @@ -396,6 +396,144 @@ v_ctz_i32_b32_e64 v5, src_scc v_ctz_i32_b32_e64 v255, 0xaf123456 // GFX12: encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] +v_cvt_f32_bf8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x00,0x00,0x00] + 
+v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_bf8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xed,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, s3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, 3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[0,1] +// GFX12: encoding: [0x01,0x10,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,0] +// GFX12: encoding: [0x01,0x08,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_f32_fp8_e64 v1, v3 op_sel:[1,1] +// GFX12: encoding: [0x01,0x18,0xec,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 +// GFX12: encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], s3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], 3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 +// GFX12: encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[2:3], v3 op_sel:[1,0] +// GFX12: encoding: [0x02,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 +// GFX12: encoding: 
[0x03,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], s3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_bf8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xef,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], s3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], 3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x83,0x00,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 +// GFX12: encoding: [0x03,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] + +v_cvt_pk_f32_fp8_e64 v[3:4], v3 op_sel:[1,0] +// GFX12: encoding: [0x03,0x08,0xee,0xd5,0x03,0x01,0x00,0x00] + v_cvt_f16_f32_e64 v5, v1 // GFX12: encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s index 6b915bd14683a..808f941197c42 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp16.s @@ -336,6 +336,18 @@ v_ctz_i32_b32_e64_dpp v5, v1 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x05,0x30] +V_CVT_F32_FP8_e64_dpp v5, v1 quad_perm:[3,1,2,0] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0x27,0x00,0x2d] + +V_CVT_F32_FP8_e64_dpp v1, v3 quad_perm:[2,1,0,3] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0xc6,0x00,0x5e] + +V_CVT_F32_BF8_e64_dpp v5, v1 quad_perm:[0,3,2,1] row_mask:0x2 bank_mask:0xd +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0x6c,0x00,0x2d] + +V_CVT_F32_BF8_e64_dpp v1, v3 quad_perm:[0,1,3,2] row_mask:0x5 bank_mask:0xe +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0xb4,0x00,0x5e] + v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s index 61266f3776c28..f7b51cfb6bda8 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_from_vop1_dpp8.s @@ -84,6 +84,18 @@ v_ctz_i32_b32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x00,0xba,0xd5,0xe9,0x00,0x00,0x00,0xff,0x00,0x00,0x00] +v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + +v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] +// GFX12: encoding: [0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] + +v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] +// GFX12: encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] + v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05] 
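
Before the disassembler tests, one encoding detail that the dpp8 checks above exercise: the eight 3-bit lane selectors pack little-endian into the trailing three bytes of the encoding. This can be verified against the byte patterns in these tests (illustrative Python, not part of the patch):

def dpp8_sel_bytes(sel: list) -> list:
    """Pack eight 3-bit DPP8 lane selectors into three bytes, LSB first."""
    assert len(sel) == 8 and all(0 <= s <= 7 for s in sel)
    word = 0
    for i, s in enumerate(sel):
        word |= s << (3 * i)
    return [word & 0xff, (word >> 8) & 0xff, (word >> 16) & 0xff]

assert dpp8_sel_bytes([7, 6, 5, 4, 3, 2, 1, 0]) == [0x77, 0x39, 0x05]
assert dpp8_sel_bytes([0, 1, 2, 3, 4, 5, 6, 7]) == [0x88, 0xc6, 0xfa]
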
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt index a839f03c42ba1..39bb7338c8074 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1.txt @@ -397,6 +397,42 @@ # GFX12: v_ctz_i32_b32_e32 v255, 0xaf123456 ; encoding: [0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf] 0xff,0x74,0xfe,0x7f,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e32 v1, s3 ; encoding: [0x03,0xda,0x02,0x7e] +0x03,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, 3 ; encoding: [0x83,0xda,0x02,0x7e] +0x83,0xda,0x02,0x7e + +# GFX12: v_cvt_f32_bf8_e32 v1, v3 ; encoding: [0x03,0xdb,0x02,0x7e] +0x03,0xdb,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, s3 ; encoding: [0x03,0xd8,0x02,0x7e] +0x03,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, 3 ; encoding: [0x83,0xd8,0x02,0x7e] +0x83,0xd8,0x02,0x7e + +# GFX12: v_cvt_f32_fp8_e32 v1, v3 ; encoding: [0x03,0xd9,0x02,0x7e] +0x03,0xd9,0x02,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], s3 ; encoding: [0x03,0xde,0x04,0x7e] +0x03,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], 3 ; encoding: [0x83,0xde,0x04,0x7e] +0x83,0xde,0x04,0x7e + +# GFX12: v_cvt_pk_f32_bf8_e32 v[2:3], v3 ; encoding: [0x03,0xdf,0x04,0x7e] +0x03,0xdf,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], s3 ; encoding: [0x03,0xdc,0x04,0x7e] +0x03,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], 3 ; encoding: [0x83,0xdc,0x04,0x7e] +0x83,0xdc,0x04,0x7e + +# GFX12: v_cvt_pk_f32_fp8_e32 v[2:3], v3 ; encoding: [0x03,0xdd,0x04,0x7e] +0x03,0xdd,0x04,0x7e + # GFX12: v_cvt_f16_f32_e32 v5, v1 ; encoding: [0x01,0x15,0x0a,0x7e] 0x01,0x15,0x0a,0x7e diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt index bcb9ad9febb96..5848333f41ef7 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp16.txt @@ -337,6 +337,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30] 0xfa,0x74,0xfe,0x7f,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: [0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xd8,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xd8,0x02,0x7e,0x03,0x1b,0x00,0x2e + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[0,1,2,3] row_mask:0xa bank_mask:0xc ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac] +0xfa,0xda,0x02,0x7e,0x03,0xe4,0x00,0xac + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xe ; encoding: [0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e] +0xfa,0xda,0x02,0x7e,0x03,0x1b,0x00,0x2e + # GFX12: v_cvt_f16_f32_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff] 0xfa,0x14,0x0a,0x7e,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt index 928165997dd91..d42e9ae25039b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop1_dpp8.txt @@ -49,6 +49,18 @@ # GFX12: v_ctz_i32_b32_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00] 
0xea,0x74,0xfe,0x7f,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xd8,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xd8,0x02,0x7e,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa] +0xe9,0xda,0x0a,0x7e,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_bf8_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05] +0xe9,0xda,0x02,0x7e,0x03,0x77,0x39,0x05 + # GFX12: v_cvt_f16_f32_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05] 0xe9,0x14,0x0a,0x7e,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index db690aa99e4ab..f86903b8de44b 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -993,6 +993,42 @@ # GFX12: v_cubetc_f32 v255, -|0xaf123456|, -|vcc_hi|, null clamp div:2 ; encoding: [0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf] 0xff,0x83,0x0e,0xd6,0xff,0xd6,0xf0,0x79,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_pk_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x69,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x69,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32 v1, s2, 3 ; encoding: [0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00] +0x01,0x00,0x6a,0xd7,0x02,0x06,0x01,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6b,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6b,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_fp8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20] +0x05,0x01,0x6b,0xd7,0xff,0x09,0x02,0x20 + +# GFX12: v_cvt_sr_bf8_f32 v1, v2, v3 ; encoding: [0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00] +0x01,0x00,0x6c,0xd7,0x02,0x07,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v10, s2, v5 ; encoding: [0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00] +0x0a,0x00,0x6c,0xd7,0x02,0x0a,0x02,0x00 + +# GFX12: v_cvt_sr_bf8_f32 v5, -|v255|, v4 ; encoding: [0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20] +0x05,0x01,0x6c,0xd7,0xff,0x09,0x02,0x20 + # GFX12: v_cvt_pk_i16_f32 v5, v1, v2 ; encoding: [0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x06,0xd7,0x01,0x05,0x02,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index 69f61c7eb8030..1be1d6e91ad8a 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -825,6 +825,114 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30] 0xff,0x87,0x0e,0xd6,0xfa,0xfe,0xf7,0xfb,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x6a,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x6a,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32 v1, -v2, |v3| ; encoding: [0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20] +0x01,0x02,0x69,0xd7,0x02,0x07,0x02,0x20 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v6, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v6, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v255| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: 
[0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v1, -v2, |v3| quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x02,0x69,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6c,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xe4,0x00,0xff + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v6, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed] +0x06,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v6, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed] 
+0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x06,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v255 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0xfe,0x03,0x20,0x02,0x1b,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[0,2,1,3] row_mask:0xe bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0xd8,0x00,0xed + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0x2 bank_mask:0xd ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0x2d + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0x5 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x00,0xe5 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v1, -v2, v3 quad_perm:[3,2,1,0] row_mask:0xe bank_mask:0xd fi:1 ; encoding: [0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed] +0x01,0x00,0x6b,0xd7,0xfa,0x06,0x02,0x20,0x02,0x1b,0x04,0xed + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x06,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index a7f0183016147..44b3f7594029f 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -495,6 +495,54 @@ # GFX12: v_cubetc_f32_e64_dpp v255, -|v255|, -|v255|, -|src_scc| clamp div:2 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00] 0xff,0x87,0x0e,0xd6,0xea,0xfe,0xf7,0xfb,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,2,3,0,1] ; encoding: [0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21] +0x05,0x00,0x69,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0xa9,0x21 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x69,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x69,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_fp8_f32_e64_dpp v255, -|v255|, -|v255| dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0xff,0x03,0x69,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6a,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, |v1|, -v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05] +0x05,0x01,0x6a,0xd7,0xe9,0x04,0x02,0x40,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v5, -v1, |v2| dpp8:[7,6,5,4,3,2,1,0] fi:1 ; encoding: [0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x02,0x6a,0xd7,0xea,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_pk_bf8_f32_e64_dpp v255, -|v255|, -|v255| 
dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] +0xff,0x03,0x6a,0xd7,0xe9,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6b,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6b,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_fp8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6b,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, |v1|, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x01,0x6c,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v5, -v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] +0x05,0x00,0x6c,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05 + +# GFX12: v_cvt_sr_bf8_f32_e64_dpp v255, -|v255|, v255 dpp8:[0,0,0,0,0,0,0,0] ; encoding: [0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00] +0xff,0x01,0x6c,0xd7,0xe9,0xfe,0x03,0x20,0xff,0x00,0x00,0x00 + # GFX12: v_cvt_pk_i16_f32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x06,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt index 4fe4284e8eb4e..9a8368a65f3d3 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1.txt @@ -396,6 +396,42 @@ # GFX12: v_ctz_i32_b32_e64 v255, 0xaf123456 ; encoding: [0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x00,0xba,0xd5,0xff,0x00,0x00,0x00,0x56,0x34,0x12,0xaf +# GFX12: v_cvt_f32_bf8_e64 v1, s3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, 3 ; encoding: [0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xed,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_bf8_e64 v1, v3 ; encoding: [0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xed,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, s3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, 3 ; encoding: [0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00] +0x01,0x00,0xec,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_f32_fp8_e64 v1, v3 ; encoding: [0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00] +0x01,0x00,0xec,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], 3 ; encoding: 
[0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xef,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_bf8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xef,0xd5,0x03,0x01,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], s3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], 3 ; encoding: [0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00] +0x02,0x00,0xee,0xd5,0x83,0x00,0x00,0x00 + +# GFX12: v_cvt_pk_f32_fp8_e64 v[2:3], v3 ; encoding: [0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00] +0x02,0x00,0xee,0xd5,0x03,0x01,0x00,0x00 + # GFX12: v_cvt_f16_f32_e64 v5, v1 ; encoding: [0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00] 0x05,0x00,0x8a,0xd5,0x01,0x01,0x00,0x00 diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt index e914d139e240e..8af274e0b4028 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp16.txt @@ -336,6 +336,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0xba,0xd5,0xfa,0x00,0x00,0x00,0xff,0x6f,0x0d,0x30 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xec,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 quad_perm:[0,1,2,3] row_mask:0x2 bank_mask:0xd ; encoding: [0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d] +0x05,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x01,0xe4,0x00,0x2d + +# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 quad_perm:[0,2,1,1] row_mask:0x5 bank_mask:0xe ; encoding: [0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e] +0x01,0x00,0xed,0xd5,0xfa,0x00,0x00,0x00,0x03,0x58,0x00,0x5e + # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x8a,0xd5,0xfa,0x00,0x00,0x00,0x01,0x1b,0x00,0xff diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt index 2a4b677620d38..3d48d58c775b1 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_from_vop1_dpp8.txt @@ -72,6 +72,18 @@ # GFX12: v_ctz_i32_b32_e64_dpp v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0xba,0xd5,0xea,0x00,0x00,0x00,0xff,0x00,0x00,0x00 +# GFX12: v_cvt_f32_fp8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: [0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa] +0x05,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa + +# GFX12: v_cvt_f32_fp8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05] +0x01,0x00,0xec,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05 + +# GFX12: v_cvt_f32_bf8_e64_dpp v5, v1 dpp8:[0,1,2,3,4,5,6,7] ; encoding: 
[0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa]
+0x05,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x01,0x88,0xc6,0xfa
+
+# GFX12: v_cvt_f32_bf8_e64_dpp v1, v3 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05]
+0x01,0x00,0xed,0xd5,0xe9,0x00,0x00,0x00,0x03,0x77,0x39,0x05
+
 # GFX12: v_cvt_f16_f32_e64_dpp v5, v1 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05]
 0x05,0x00,0x8a,0xd5,0xe9,0x00,0x00,0x00,0x01,0x77,0x39,0x05

From 78d8ce316ff6f06f58a7f3eb7f633c4bf3bf3285 Mon Sep 17 00:00:00 2001
From: Ivan Kosarev
Date: Wed, 24 Jan 2024 13:46:05 +0200
Subject: [PATCH 772/843] [AMDGPU] Require explicit immediate offsets for
 SGPR+IMM SMEM instructions. (#79131)

Otherwise, SGPR+IMM instructions are not distinguishable from SGPR-only
ones in the AsmParser, leading to ambiguities.

GFX12 doesn't have special SGPR-only variants, so we still allow
optional immediate offsets for that subtarget.

Also rename the offset operand classes while we are at it.

Part of .
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.td    | 32 ++++++++++++------------
 llvm/lib/Target/AMDGPU/SMInstructions.td | 23 +++++++++++------
 2 files changed, 32 insertions(+), 23 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 4ab840a4c8485..903a04718b644 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -994,9 +994,9 @@ def SDWAVopcDst : BoolRC {
   let PrintMethod = "printVOPDst";
 }

-class NamedIntOperand
-  : CustomOperand {
+class NamedIntOperand
+  : CustomOperand {
   let ParserMethod =
     "[this](OperandVector &Operands) -> ParseStatus { "#
     "return parseIntWithPrefix(\""#Prefix#"\", Operands, "#
@@ -1039,9 +1039,9 @@ class ArrayOperand0
 let ImmTy = "ImmTyOffset" in
 def flat_offset : CustomOperand;

-def offset : NamedIntOperand;
-def offset0 : NamedIntOperand;
-def offset1 : NamedIntOperand;
+def offset : NamedIntOperand;
+def offset0 : NamedIntOperand;
+def offset1 : NamedIntOperand;

 def gds : NamedBitOperand<"gds", "GDS">;

@@ -1092,25 +1092,25 @@ def dpp8 : CustomOperand;
 def dpp_ctrl : CustomOperand;

 let DefaultValue = "0xf" in {
-def row_mask : NamedIntOperand;
-def bank_mask : NamedIntOperand;
+def row_mask : NamedIntOperand;
+def bank_mask : NamedIntOperand;
 }
-def bound_ctrl : NamedIntOperand bool { return convertDppBoundCtrl(BC); }">;
+def bound_ctrl : NamedIntOperand bool { return convertDppBoundCtrl(BC); }">;
-def FI : NamedIntOperand;
+def FI : NamedIntOperand;

 def blgp : CustomOperand;
-def cbsz : NamedIntOperand;
-def abid : NamedIntOperand;
+def cbsz : NamedIntOperand;
+def abid : NamedIntOperand;

 def hwreg : CustomOperand;

 def exp_tgt : CustomOperand;

-def wait_vdst : NamedIntOperand;
-def wait_exp : NamedIntOperand;
-def wait_va_vdst : NamedIntOperand;
-def wait_va_vsrc : NamedIntOperand;
+def wait_vdst : NamedIntOperand;
+def wait_exp : NamedIntOperand;
+def wait_va_vdst : NamedIntOperand;
+def wait_va_vsrc : NamedIntOperand;

 class KImmFPOperand : ImmOperand {
   let OperandNamespace = "AMDGPU";
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 9a27d22d585ec..f082de35b6ae9 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -10,8 +10,13 @@ def smrd_offset_8 : ImmOperand;

 let EncoderMethod = "getSMEMOffsetEncoding",
     DecoderMethod = "decodeSMEMOffset" in {
-def smem_offset : ImmOperand;
-def smem_offset_mod : NamedIntOperand;
+def SMEMOffset : ImmOperand;
+def SMEMOffsetMod : NamedIntOperand;
+def OptSMEMOffsetMod : NamedIntOperand {
+  let ImmTy = SMEMOffsetMod.ImmTy;
+  let PredicateMethod = SMEMOffsetMod.PredicateMethod;
+  let PrintMethod = SMEMOffsetMod.PrintMethod;
+}
 }

 //===----------------------------------------------------------------------===//
@@ -87,11 +92,14 @@ class OffsetMode;
+def IMM_Offset : OffsetMode<1, 0, "_IMM", (ins SMEMOffset:$offset), "$offset">;
 def SGPR_Offset : OffsetMode<0, 1, "_SGPR", (ins SReg_32:$soffset), "$soffset">;
 def SGPR_IMM_Offset : OffsetMode<1, 1, "_SGPR_IMM",
-                                 (ins SReg_32:$soffset, smem_offset_mod:$offset),
+                                 (ins SReg_32:$soffset, SMEMOffsetMod:$offset),
                                  "$soffset$offset">;
+def SGPR_IMM_OptOffset : OffsetMode<1, 1, "_SGPR_IMM",
+                                    (ins SReg_32:$soffset, OptSMEMOffsetMod:$offset),
+                                    "$soffset$offset">;

 class SM_Probe_Pseudo : SM_Pseudo {
   def _IMM : SM_Probe_Pseudo ;
   def _SGPR : SM_Probe_Pseudo ;
   def _SGPR_IMM : SM_Probe_Pseudo ;
+  def _SGPR_OPT_IMM : SM_Probe_Pseudo ;
 }

 class SM_WaveId_Pseudo : SM_Pseudo<
@@ -214,7 +223,7 @@ class SM_WaveId_Pseudo : SM_Pseudo<
 class SM_Prefetch_Pseudo : SM_Pseudo {
   // Mark prefetches as both load and store to prevent reordering with loads
   // and stores. This is also needed for pattern to match prefetch intrinsic.
@@ -1410,7 +1419,7 @@ class SMEM_Real_Load_gfx12 op, string ps, string opName, OffsetMode offs
 multiclass SM_Real_Loads_gfx12 op, string ps = NAME> {
   defvar opName = !tolower(NAME);
   def _IMM_gfx12 : SMEM_Real_Load_gfx12;
-  def _SGPR_IMM_gfx12 : SMEM_Real_Load_gfx12;
+  def _SGPR_IMM_gfx12 : SMEM_Real_Load_gfx12;
 }

 defm S_LOAD_B32 : SM_Real_Loads_gfx12<0x00, "S_LOAD_DWORD">;
@@ -1448,7 +1457,7 @@ def S_PREFETCH_DATA_PC_REL_gfx12 : SMEM_Real_Prefetch_gfx12<0x28, S_PREFETCH_DAT
 multiclass SMEM_Real_Probe_gfx12 op> {
   defvar ps = NAME;
   def _IMM_gfx12 : SMEM_Real_Prefetch_gfx12(ps#_IMM)>;
-  def _SGPR_IMM_gfx12 : SMEM_Real_Prefetch_gfx12(ps#_SGPR_IMM)>;
+  def _SGPR_IMM_gfx12 : SMEM_Real_Prefetch_gfx12(ps#_SGPR_OPT_IMM)>;
 }

 defm S_ATC_PROBE : SMEM_Real_Probe_gfx12<0x22>;

From c83180c1248615cf6ea8842eb4e0cebebba4ab57 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 24 Jan 2024 11:56:02 +0000
Subject: [PATCH 773/843] [ConstraintElim] Add tests for #78621.

Tests with umin where the result may be poison for
https://github.com/llvm/llvm-project/issues/78621.
---
 .../umin-result-may-be-poison.ll              | 60 +++++++++++++++++++
 1 file changed, 60 insertions(+)
 create mode 100644 llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll

diff --git a/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll b/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll
new file mode 100644
index 0000000000000..35ac72e54d189
--- /dev/null
+++ b/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll
@@ -0,0 +1,60 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -p constraint-elimination -S %s | FileCheck %s
+
+; Tests for https://github.com/llvm/llvm-project/issues/78621.
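; Editorial sketch (illustrative; not part of the original commit): the result
; of llvm.umin may itself be poison when one of its operands is poison, so
; constraints derived from it are only safe to use when that poison would
; trigger undefined behavior -- for example when the result is passed to a
; noundef call argument, as exercised by the functions that follow.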
+ +define i1 @umin_not_used(i32 %arg) { +; CHECK-LABEL: define i1 @umin_not_used( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.umin.i32(i32 [[SHL]], i32 80) +; CHECK-NEXT: [[CMP2:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: ret i1 false +; + %icmp = icmp slt i32 %arg, 0 + %shl = shl nuw nsw i32 %arg, 3 + call i32 @llvm.umin.i32(i32 %shl, i32 80) + %cmp2 = shl nuw nsw i32 %arg, 3 + ret i1 %icmp +} + +define i1 @umin_poison_is_UB_via_call(i32 %arg) { +; CHECK-LABEL: define i1 @umin_poison_is_UB_via_call( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SHL]], i32 80) +; CHECK-NEXT: call void @noundef(i32 noundef [[MIN]]) +; CHECK-NEXT: [[CMP2:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: ret i1 false +; + %icmp = icmp slt i32 %arg, 0 + %shl = shl nuw nsw i32 %arg, 3 + %min = call i32 @llvm.umin.i32(i32 %shl, i32 80) + call void @noundef(i32 noundef %min) + %cmp2 = shl nuw nsw i32 %arg, 3 + ret i1 %icmp +} + +define i1 @umin_poison_call_before_UB(i32 %arg) { +; CHECK-LABEL: define i1 @umin_poison_call_before_UB( +; CHECK-SAME: i32 [[ARG:%.*]]) { +; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SHL]], i32 80) +; CHECK-NEXT: call void @fn() +; CHECK-NEXT: call void @noundef(i32 noundef [[MIN]]) +; CHECK-NEXT: [[CMP2:%.*]] = shl nuw nsw i32 [[ARG]], 3 +; CHECK-NEXT: ret i1 false +; + %icmp = icmp slt i32 %arg, 0 + %shl = shl nuw nsw i32 %arg, 3 + %min = call i32 @llvm.umin.i32(i32 %shl, i32 80) + call void @fn() + call void @noundef(i32 noundef %min) + %cmp2 = shl nuw nsw i32 %arg, 3 + ret i1 %icmp +} + +declare i32 @llvm.umin.i32(i32, i32) #0 + +declare void @noundef(i32 noundef) +declare void @fn() From dee02ee9f8ffc74fea6c54f4c00df16e7ca4c8a1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 24 Jan 2024 12:47:51 +0100 Subject: [PATCH 774/843] [clang][Interp][NFC] Complex elements can only be primitives So, return a PrimType directly from classifyComplexElementType(). --- clang/lib/AST/Interp/ByteCodeExprGen.cpp | 6 +++--- clang/lib/AST/Interp/ByteCodeExprGen.h | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.cpp b/clang/lib/AST/Interp/ByteCodeExprGen.cpp index cfcef067b92bd..24a33c32df140 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.cpp +++ b/clang/lib/AST/Interp/ByteCodeExprGen.cpp @@ -601,8 +601,8 @@ bool ByteCodeExprGen::VisitComplexBinOp(const BinaryOperator *E) { const Expr *LHS = E->getLHS(); const Expr *RHS = E->getRHS(); - PrimType LHSElemT = *this->classifyComplexElementType(LHS->getType()); - PrimType RHSElemT = *this->classifyComplexElementType(RHS->getType()); + PrimType LHSElemT = this->classifyComplexElementType(LHS->getType()); + PrimType RHSElemT = this->classifyComplexElementType(RHS->getType()); unsigned LHSOffset = this->allocateLocalPrimitive(LHS, PT_Ptr, true, false); unsigned RHSOffset = this->allocateLocalPrimitive(RHS, PT_Ptr, true, false); @@ -2992,7 +2992,7 @@ bool ByteCodeExprGen::emitComplexReal(const Expr *SubExpr) { // Since our _Complex implementation does not map to a primitive type, // we sometimes have to do the lvalue-to-rvalue conversion here manually. 
if (!SubExpr->isLValue()) - return this->emitLoadPop(*classifyComplexElementType(SubExpr->getType()), + return this->emitLoadPop(classifyComplexElementType(SubExpr->getType()), SubExpr); return true; } diff --git a/clang/lib/AST/Interp/ByteCodeExprGen.h b/clang/lib/AST/Interp/ByteCodeExprGen.h index df4cb736299cb..63ea8292b5876 100644 --- a/clang/lib/AST/Interp/ByteCodeExprGen.h +++ b/clang/lib/AST/Interp/ByteCodeExprGen.h @@ -282,12 +282,12 @@ class ByteCodeExprGen : public ConstStmtVisitor, bool>, } bool emitPrimCast(PrimType FromT, PrimType ToT, QualType ToQT, const Expr *E); - std::optional classifyComplexElementType(QualType T) const { + PrimType classifyComplexElementType(QualType T) const { assert(T->isAnyComplexType()); QualType ElemType = T->getAs()->getElementType(); - return this->classify(ElemType); + return *this->classify(ElemType); } bool emitComplexReal(const Expr *SubExpr); From 17cfc15d6b9b3773db8353937aac9878d7777b21 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 24 Jan 2024 10:13:52 +0000 Subject: [PATCH 775/843] Fix spelling typo. NFC commutatvity -> commutativity --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index e158312caffde..86de5dd0c3f09 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -52193,7 +52193,7 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG, if (SDValue Not = IsNOT(N0, DAG)) return DAG.getNode(ISD::AND, DL, VT, DAG.getBitcast(VT, Not), N1); - // Fold for better commutatvity: + // Fold for better commutativity: // ANDNP(x,NOT(y)) -> AND(NOT(x),NOT(y)) -> NOT(OR(X,Y)). if (N1->hasOneUse()) if (SDValue Not = IsNOT(N1, DAG)) From 6255bae6c9afe89470f264f903051f64bc15135f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 24 Jan 2024 10:55:22 +0000 Subject: [PATCH 776/843] [X86] Add test coverage based on #78888 --- llvm/test/CodeGen/X86/icmp-pow2-mask.ll | 142 ++++++++++++++++++++++++ 1 file changed, 142 insertions(+) create mode 100644 llvm/test/CodeGen/X86/icmp-pow2-mask.ll diff --git a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll new file mode 100644 index 0000000000000..5eeb45607c8c8 --- /dev/null +++ b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll @@ -0,0 +1,142 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=SSE,SSE41 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=AVX2 +; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=AVX512 + +define <8 x i16> @pow2_mask_v16i8(i8 zeroext %0) { +; SSE2-LABEL: pow2_mask_v16i8: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %edi, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE41-LABEL: pow2_mask_v16i8: +; SSE41: # %bb.0: +; SSE41-NEXT: movd %edi, %xmm0 +; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm1 +; SSE41-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE41-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE41-NEXT: pxor %xmm1, %xmm0 +; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 +; SSE41-NEXT: retq +; +; AVX2-LABEL: pow2_mask_v16i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: pow2_mask_v16i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastb %edi, %xmm0 +; AVX512-NEXT: vptestmb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %k0 +; AVX512-NEXT: vpmovm2w %k0, %xmm0 +; AVX512-NEXT: retq + %vec = insertelement <1 x i8> poison, i8 %0, i64 0 + %splat = shufflevector <1 x i8> %vec, <1 x i8> poison, <8 x i32> zeroinitializer + %mask = and <8 x i8> %splat, + %not = icmp ne <8 x i8> %mask, zeroinitializer + %ext = sext <8 x i1> %not to <8 x i16> + ret <8 x i16> %ext +} + +define <16 x i16> @pow2_mask_v16i16(i16 zeroext %0) { +; SSE-LABEL: pow2_mask_v16i16: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm0 +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1] +; SSE-NEXT: movdqa %xmm0, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: movdqa {{.*#+}} xmm3 = [32768,16384,8192,4096,2048,1024,512,256] +; SSE-NEXT: pand %xmm3, %xmm0 +; SSE-NEXT: pcmpeqw %xmm3, %xmm0 +; SSE-NEXT: pcmpeqw %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX2-LABEL: pow2_mask_v16i16: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2,1] +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: pow2_mask_v16i16: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastw %edi, %ymm0 +; AVX512-NEXT: vmovdqa {{.*#+}} ymm1 = [32768,16384,8192,4096,2048,1024,512,256,128,64,32,16,8,4,2,1] +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpcmpeqw %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: retq + %vec = insertelement <1 x i16> poison, i16 %0, i64 0 + %splat = shufflevector <1 x i16> %vec, <1 x i16> poison, <16 x i32> zeroinitializer + %mask = and <16 x i16> %splat, + %not = icmp ne <16 x i16> %mask, zeroinitializer + %ext = sext <16 x i1> %not to <16 x i16> + ret <16 x i16> %ext +} + +; PR78888 +define i64 @pow2_mask_v8i8(i8 zeroext %0) { +; SSE-LABEL: pow2_mask_v8i8: +; SSE: # %bb.0: +; SSE-NEXT: movd %edi, %xmm0 +; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm1 +; SSE-NEXT: pcmpeqb %xmm0, %xmm1 +; SSE-NEXT: pcmpeqd %xmm0, %xmm0 +; SSE-NEXT: pxor %xmm1, %xmm0 +; SSE-NEXT: movq %xmm0, %rax +; SSE-NEXT: retq +; +; AVX2-LABEL: pow2_mask_v8i8: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovd %edi, %xmm0 +; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: 
vpcmpeqd %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vmovq %xmm0, %rax +; AVX2-NEXT: retq +; +; AVX512-LABEL: pow2_mask_v8i8: +; AVX512: # %bb.0: +; AVX512-NEXT: vpbroadcastb %edi, %xmm0 +; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovq %xmm0, %rax +; AVX512-NEXT: retq + %vec = insertelement <1 x i8> poison, i8 %0, i64 0 + %splat = shufflevector <1 x i8> %vec, <1 x i8> poison, <8 x i32> zeroinitializer + %mask = and <8 x i8> %splat, + %not = icmp ne <8 x i8> %mask, zeroinitializer + %ext = sext <8 x i1> %not to <8 x i8> + %res = bitcast <8 x i8> %ext to i64 + ret i64 %res +} From 72f10f7eb536da58cb79e13974895cd97d4e1a5f Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 24 Jan 2024 11:39:21 +0000 Subject: [PATCH 777/843] [X86] Fold not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2) Fixes #78888 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 20 ++++++++++++ llvm/test/CodeGen/X86/icmp-pow2-mask.ll | 41 +++++++++---------------- 2 files changed, 35 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 86de5dd0c3f09..5cc2803b28087 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -49328,6 +49328,26 @@ static SDValue combineOrXorWithSETCC(SDNode *N, SDValue N0, SDValue N1, } } + // not(pcmpeq(and(X,CstPow2),0)) -> pcmpeq(and(X,CstPow2),CstPow2) + if (N->getOpcode() == ISD::XOR && N0.getOpcode() == X86ISD::PCMPEQ && + N0.getOperand(0).getOpcode() == ISD::AND && + ISD::isBuildVectorAllOnes(N1.getNode())) { + MVT VT = N->getSimpleValueType(0); + APInt UndefElts; + SmallVector EltBits; + if (getTargetConstantBitsFromNode(N0.getOperand(0).getOperand(1), + VT.getScalarSizeInBits(), UndefElts, + EltBits)) { + bool IsPow2OrUndef = true; + for (unsigned I = 0, E = EltBits.size(); I != E; ++I) + IsPow2OrUndef &= UndefElts[I] || EltBits[I].isPowerOf2(); + + if (IsPow2OrUndef) + return DAG.getNode(X86ISD::PCMPEQ, SDLoc(N), VT, N0.getOperand(0), + N0.getOperand(0).getOperand(1)); + } + } + return SDValue(); } diff --git a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll index 5eeb45607c8c8..e2b3a23827a01 100644 --- a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll @@ -11,11 +11,9 @@ define <8 x i16> @pow2_mask_v16i8(i8 zeroext %0) { ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pcmpeqb %xmm0, %xmm1 -; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [128,128,64,64,32,32,16,16,8,8,4,4,2,2,1,1] +; SSE2-NEXT: pand %xmm1, %xmm0 +; SSE2-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: pow2_mask_v16i8: @@ -23,11 +21,9 @@ define <8 x i16> @pow2_mask_v16i8(i8 zeroext %0) { ; SSE41-NEXT: movd %edi, %xmm0 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE41-NEXT: pxor %xmm1, %xmm1 -; SSE41-NEXT: pcmpeqb %xmm0, 
%xmm1
-; SSE41-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE41-NEXT: pxor %xmm1, %xmm0
+; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,u,u,u,u,u,u,u,u]
+; SSE41-NEXT: pand %xmm1, %xmm0
+; SSE41-NEXT: pcmpeqb %xmm1, %xmm0
 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0
 ; SSE41-NEXT: retq
 ;
@@ -35,11 +31,9 @@ define <8 x i16> @pow2_mask_v16i8(i8 zeroext %0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovd %edi, %xmm0
 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpmovsxbw %xmm0, %xmm0
 ; AVX2-NEXT: retq
 ;
@@ -103,11 +97,9 @@ define i64 @pow2_mask_v8i8(i8 zeroext %0) {
 ; SSE-NEXT: movd %edi, %xmm0
 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm1
-; SSE-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE-NEXT: pcmpeqd %xmm0, %xmm0
-; SSE-NEXT: pxor %xmm1, %xmm0
+; SSE-NEXT: movdqa {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,u,u,u,u,u,u,u,u]
+; SSE-NEXT: pand %xmm1, %xmm0
+; SSE-NEXT: pcmpeqb %xmm1, %xmm0
 ; SSE-NEXT: movq %xmm0, %rax
 ; SSE-NEXT: retq
 ;
@@ -115,21 +107,18 @@ define i64 @pow2_mask_v8i8(i8 zeroext %0) {
 ; AVX2: # %bb.0:
 ; AVX2-NEXT: vmovd %edi, %xmm0
 ; AVX2-NEXT: vpbroadcastb %xmm0, %xmm0
-; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; AVX2-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vmovq %xmm0, %rax
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: pow2_mask_v8i8:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vpbroadcastb %edi, %xmm0
-; AVX512-NEXT: vpandq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm0, %xmm0
-; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,128,64,32,16,8,4,2,1]
+; AVX512-NEXT: vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $15, %xmm0, %xmm0, %xmm0
 ; AVX512-NEXT: vmovq %xmm0, %rax
 ; AVX512-NEXT: retq
 %vec = insertelement <1 x i8> poison, i8 %0, i64 0

From 27cfe7a07fc858bd890f2e0980f530a8573748b0 Mon Sep 17 00:00:00 2001
From: jeanPerier
Date: Wed, 24 Jan 2024 13:23:55 +0100
Subject: [PATCH 778/843] [flang] Set assumed-size last extent to -1 (#79156)

Currently, lowering sets the extents of assumed-size arrays to "undef",
which was OK as long as the value was not expected to be read. But when
interfacing with the runtime, and when passing an assumed-size array to
an assumed-rank dummy, this last extent may be read and must be -1 as
specified for the BIND(C) case in 18.5.3 point 5.

Set this value to -1, and update all the lowering code that was looking
for an undef defining op to identify assumed-size arrays: it is much
safer to propagate and use the semantic information here, and the
previous check did not work when the array was used in an internal
procedure (the defining op is no longer visible there).

@clementval and @agozillon, I left the assumed-size extent at zero in
the acc/omp bounds ops as it was; please double-check that this is what
you want (I can imagine that -1 may cause trouble here, while 0 makes
some sense as it would lead to no data transfer).

This also allows removing special cases in UBOUND/LBOUND lowering.

Also disable allocation of cray pointees. This was never intended and
would now lead to crashes with the -1 value for assumed-size cray
pointees.
---
 flang/include/flang/Optimizer/Builder/Array.h | 27 --------
 .../flang/Optimizer/Builder/BoxValue.h        |  3 -
 flang/lib/Lower/ConvertVariable.cpp           | 29 ++++++---
 flang/lib/Lower/DirectivesCommon.h            | 29 ++++++---
 flang/lib/Lower/OpenMP.cpp                    |  7 ++-
 flang/lib/Optimizer/Builder/BoxValue.cpp      | 16 -----
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 63 ++++++-------------
 .../Optimizer/Transforms/ArrayValueCopy.cpp   | 13 +++-
 .../Lower/HLFIR/assumed-size-cray-pointee.f90 | 14 +++++
 flang/test/Lower/HLFIR/cray-pointers.f90      |  6 --
 flang/test/Lower/Intrinsics/lbound.f90        |  2 +-
 flang/test/Lower/Intrinsics/ubound.f90        |  2 +-
 .../Lower/array-expression-assumed-size.f90   |  8 +--
 flang/test/Lower/cray-pointer.f90             |  6 --
 14 files changed, 97 insertions(+), 128 deletions(-)
 delete mode 100644 flang/include/flang/Optimizer/Builder/Array.h
 create mode 100644 flang/test/Lower/HLFIR/assumed-size-cray-pointee.f90

diff --git a/flang/include/flang/Optimizer/Builder/Array.h b/flang/include/flang/Optimizer/Builder/Array.h
deleted file mode 100644
index c508042bc20f2..0000000000000
--- a/flang/include/flang/Optimizer/Builder/Array.h
+++ /dev/null
@@ -1,27 +0,0 @@
-//===-- Array.h -------------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef FORTRAN_OPTIMIZER_BUILDER_ARRAY_H
-#define FORTRAN_OPTIMIZER_BUILDER_ARRAY_H
-
-#include "flang/Optimizer/Dialect/FIROps.h"
-
-namespace fir::factory {
-
-/// Return true if and only if the extents are those of an assumed-size array.
-/// An assumed-size array in Fortran is declared with `*` as the upper bound of
-/// the last dimension of the array. Lowering converts the asterisk to an
-/// undefined value.
-inline bool isAssumedSize(const llvm::SmallVectorImpl<mlir::Value> &extents) {
-  return !extents.empty() &&
-         mlir::isa_and_nonnull<fir::UndefOp>(extents.back().getDefiningOp());
-}
-
-} // namespace fir::factory
-
-#endif // FORTRAN_OPTIMIZER_BUILDER_ARRAY_H
diff --git a/flang/include/flang/Optimizer/Builder/BoxValue.h b/flang/include/flang/Optimizer/Builder/BoxValue.h
index 040555f3d907c..2fed2d48a7a08 100644
--- a/flang/include/flang/Optimizer/Builder/BoxValue.h
+++ b/flang/include/flang/Optimizer/Builder/BoxValue.h
@@ -527,9 +527,6 @@ class ExtendedValue : public details::matcher {
                  [](const auto &box) -> bool { return false; });
   }

-  /// Is this an assumed size array ?
@clementval and @agozillon, I left assumed-size extent to zero in the acc/omp bounds op as it was, please double check that is what you want (I can imagine -1 may create troubles here, and 0 makes some sense as it would lead to no data transfer). This also allows removing special cases in UBOUND/LBOUND lowering. Also disable allocation of cray pointee. This was never intended and would now lead to crashes with the -1 value for assumed-size cray pointee. --- flang/include/flang/Optimizer/Builder/Array.h | 27 -------- .../flang/Optimizer/Builder/BoxValue.h | 3 - flang/lib/Lower/ConvertVariable.cpp | 29 ++++++--- flang/lib/Lower/DirectivesCommon.h | 29 ++++++--- flang/lib/Lower/OpenMP.cpp | 7 ++- flang/lib/Optimizer/Builder/BoxValue.cpp | 16 ----- flang/lib/Optimizer/Builder/IntrinsicCall.cpp | 63 ++++++------------- .../Optimizer/Transforms/ArrayValueCopy.cpp | 13 +++- .../Lower/HLFIR/assumed-size-cray-pointee.f90 | 14 +++++ flang/test/Lower/HLFIR/cray-pointers.f90 | 6 -- flang/test/Lower/Intrinsics/lbound.f90 | 2 +- flang/test/Lower/Intrinsics/ubound.f90 | 2 +- .../Lower/array-expression-assumed-size.f90 | 8 +-- flang/test/Lower/cray-pointer.f90 | 6 -- 14 files changed, 97 insertions(+), 128 deletions(-) delete mode 100644 flang/include/flang/Optimizer/Builder/Array.h create mode 100644 flang/test/Lower/HLFIR/assumed-size-cray-pointee.f90 diff --git a/flang/include/flang/Optimizer/Builder/Array.h b/flang/include/flang/Optimizer/Builder/Array.h deleted file mode 100644 index c508042bc20f2..0000000000000 --- a/flang/include/flang/Optimizer/Builder/Array.h +++ /dev/null @@ -1,27 +0,0 @@ -//===-- Array.h -------------------------------------------------*- C++ -*-===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#ifndef FORTRAN_OPTIMIZER_BUILDER_ARRAY_H -#define FORTRAN_OPTIMIZER_BUILDER_ARRAY_H - -#include "flang/Optimizer/Dialect/FIROps.h" - -namespace fir::factory { - -/// Return true if and only if the extents are those of an assumed-size array. -/// An assumed-size array in Fortran is declared with `*` as the upper bound of -/// the last dimension of the array. Lowering converts the asterisk to an -/// undefined value. -inline bool isAssumedSize(const llvm::SmallVectorImpl &extents) { - return !extents.empty() && - mlir::isa_and_nonnull(extents.back().getDefiningOp()); -} - -} // namespace fir::factory - -#endif // FORTRAN_OPTIMIZER_BUILDER_ARRAY_H diff --git a/flang/include/flang/Optimizer/Builder/BoxValue.h b/flang/include/flang/Optimizer/Builder/BoxValue.h index 040555f3d907c..2fed2d48a7a08 100644 --- a/flang/include/flang/Optimizer/Builder/BoxValue.h +++ b/flang/include/flang/Optimizer/Builder/BoxValue.h @@ -527,9 +527,6 @@ class ExtendedValue : public details::matcher { [](const auto &box) -> bool { return false; }); } - /// Is this an assumed size array ? 
- bool isAssumedSize() const; - /// LLVM style debugging of extended values LLVM_DUMP_METHOD void dump() const { llvm::errs() << *this << '\n'; } diff --git a/flang/lib/Lower/ConvertVariable.cpp b/flang/lib/Lower/ConvertVariable.cpp index dd024a0a1ec79..006cc1417b63d 100644 --- a/flang/lib/Lower/ConvertVariable.cpp +++ b/flang/lib/Lower/ConvertVariable.cpp @@ -684,6 +684,13 @@ static mlir::Value createNewLocal(Fortran::lower::AbstractConverter &converter, llvm::StringRef symNm = toStringRef(ultimateSymbol.name()); bool isTarg = var.isTarget(); + // Do not allocate storage for cray pointee. The address inside the cray + // pointer will be used instead when using the pointee. Allocating space + // would be a waste of space, and incorrect if the pointee is a non dummy + // assumed-size (possible with cray pointee). + if (ultimateSymbol.test(Fortran::semantics::Symbol::Flag::CrayPointee)) + return builder.create(loc, fir::ReferenceType::get(ty)); + // Let the builder do all the heavy lifting. if (!Fortran::semantics::IsProcedurePointer(ultimateSymbol)) return builder.allocateLocal(loc, ty, nm, symNm, shape, lenParams, isTarg); @@ -1454,6 +1461,15 @@ static void lowerExplicitLowerBounds( assert(result.empty() || result.size() == box.dynamicBound().size()); } +/// Return -1 for the last dimension extent/upper bound of assumed-size arrays. +/// This value is required to fulfill the requirements for assumed-rank +/// associated with assumed-size (see for instance UBOUND in 16.9.196, and +/// CFI_desc_t requirements in 18.5.3 point 5.). +static mlir::Value getAssumedSizeExtent(mlir::Location loc, + fir::FirOpBuilder &builder) { + return builder.createIntegerConstant(loc, builder.getIndexType(), -1); +} + /// Lower explicit extents into \p result if this is an explicit-shape or /// assumed-size array. Does nothing if this is not an explicit-shape or /// assumed-size array. @@ -1484,8 +1500,7 @@ lowerExplicitExtents(Fortran::lower::AbstractConverter &converter, result.emplace_back( computeExtent(builder, loc, lowerBounds[spec.index()], ub)); } else if (spec.value()->ubound().isStar()) { - // Assumed extent is undefined. Must be provided by user's code. - result.emplace_back(builder.create(loc, idxTy)); + result.emplace_back(getAssumedSizeExtent(loc, builder)); } } assert(result.empty() || result.size() == box.dynamicBound().size()); @@ -1513,15 +1528,13 @@ lowerExplicitCharLen(Fortran::lower::AbstractConverter &converter, return mlir::Value{}; } -/// Treat negative values as undefined. Assumed size arrays will return -1 from -/// the front end for example. Using negative values can produce hard to find -/// bugs much further along in the compilation. +/// Assumed size arrays last extent is -1 in the front end. 
static mlir::Value genExtentValue(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Type idxTy, long frontEndExtent) { if (frontEndExtent >= 0) return builder.createIntegerConstant(loc, idxTy, frontEndExtent); - return builder.create(loc, idxTy); + return getAssumedSizeExtent(loc, builder); } /// If a symbol is an array, it may have been declared with unknown extent @@ -2000,7 +2013,7 @@ void Fortran::lower::mapSymbolAttributes( builder.create(loc, idxTy, idxTy, idxTy, box, dim); shapes.emplace_back(dimInfo.getResult(1)); } else if (spec->ubound().isStar()) { - shapes.emplace_back(builder.create(loc, idxTy)); + shapes.emplace_back(getAssumedSizeExtent(loc, builder)); } else { llvm::report_fatal_error("unknown bound category"); } @@ -2047,7 +2060,7 @@ void Fortran::lower::mapSymbolAttributes( } else { // An assumed size array. The extent is not computed. assert(spec->ubound().isStar() && "expected assumed size"); - extents.emplace_back(builder.create(loc, idxTy)); + extents.emplace_back(getAssumedSizeExtent(loc, builder)); } } } diff --git a/flang/lib/Lower/DirectivesCommon.h b/flang/lib/Lower/DirectivesCommon.h index ffbd8ae1558ed..0920ff80f487b 100644 --- a/flang/lib/Lower/DirectivesCommon.h +++ b/flang/lib/Lower/DirectivesCommon.h @@ -761,7 +761,7 @@ template llvm::SmallVector genBaseBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, Fortran::lower::AbstractConverter &converter, - fir::ExtendedValue dataExv) { + fir::ExtendedValue dataExv, bool isAssumedSize) { mlir::Type idxTy = builder.getIndexType(); mlir::Type boundTy = builder.getType(); llvm::SmallVector bounds; @@ -770,14 +770,15 @@ genBaseBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, return bounds; mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); - for (std::size_t dim = 0; dim < dataExv.rank(); ++dim) { + const unsigned rank = dataExv.rank(); + for (unsigned dim = 0; dim < rank; ++dim) { mlir::Value baseLb = fir::factory::readLowerBound(builder, loc, dataExv, dim, one); mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); mlir::Value ub; mlir::Value lb = zero; mlir::Value ext = fir::factory::readExtent(builder, loc, dataExv, dim); - if (mlir::isa(ext.getDefiningOp())) { + if (isAssumedSize && dim + 1 == rank) { ext = zero; ub = lb; } else { @@ -801,7 +802,8 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, Fortran::lower::StatementContext &stmtCtx, const std::list &subscripts, std::stringstream &asFortran, fir::ExtendedValue &dataExv, - mlir::Value baseAddr, bool treatIndexAsSection = false) { + bool dataExvIsAssumedSize, mlir::Value baseAddr, + bool treatIndexAsSection = false) { int dimension = 0; mlir::Type idxTy = builder.getIndexType(); mlir::Type boundTy = builder.getType(); @@ -809,6 +811,7 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, mlir::Value zero = builder.createIntegerConstant(loc, idxTy, 0); mlir::Value one = builder.createIntegerConstant(loc, idxTy, 1); + const int dataExvRank = static_cast(dataExv.rank()); for (const auto &subscript : subscripts) { const auto *triplet{ std::get_if(&subscript.u)}; @@ -912,7 +915,7 @@ genBoundsOps(fir::FirOpBuilder &builder, mlir::Location loc, } extent = fir::factory::readExtent(builder, loc, dataExv, dimension); - if (mlir::isa(extent.getDefiningOp())) { + if (dataExvIsAssumedSize && dimension + 1 == dataExvRank) { extent = zero; if (ubound && lbound) { mlir::Value diff = @@ -959,6 +962,7 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( const auto *dataRef = std::get_if(&designator.u); 
fir::ExtendedValue dataExv; + bool dataExvIsAssumedSize = false; if (Fortran::parser::Unwrap< Fortran::parser::StructureComponent>( arrayElement->base)) { @@ -971,6 +975,8 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( } else { const Fortran::parser::Name &name = Fortran::parser::GetLastName(*dataRef); + dataExvIsAssumedSize = Fortran::semantics::IsAssumedSizeArray( + name.symbol->GetUltimate()); info = getDataOperandBaseAddr(converter, builder, *name.symbol, operandLocation); dataExv = converter.getSymbolExtendedValue(*name.symbol); @@ -981,8 +987,8 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( asFortran << '('; bounds = genBoundsOps( builder, operandLocation, converter, stmtCtx, - arrayElement->subscripts, asFortran, dataExv, info.addr, - treatIndexAsSection); + arrayElement->subscripts, asFortran, dataExv, + dataExvIsAssumedSize, info.addr, treatIndexAsSection); } asFortran << ')'; } else if (auto structComp = Fortran::parser::Unwrap< @@ -993,7 +999,8 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( if (fir::unwrapRefType(info.addr.getType()) .isa()) bounds = genBaseBoundsOps( - builder, operandLocation, converter, compExv); + builder, operandLocation, converter, compExv, + /*isAssumedSize=*/false); asFortran << (*expr).AsFortran(); bool isOptional = Fortran::semantics::IsOptional( @@ -1047,10 +1054,14 @@ AddrAndBoundsInfo gatherDataOperandAddrAndBounds( bounds = genBoundsOpsFromBox( builder, operandLocation, converter, dataExv, info); } + bool dataExvIsAssumedSize = + Fortran::semantics::IsAssumedSizeArray( + name.symbol->GetUltimate()); if (fir::unwrapRefType(info.addr.getType()) .isa()) bounds = genBaseBoundsOps( - builder, operandLocation, converter, dataExv); + builder, operandLocation, converter, dataExv, + dataExvIsAssumedSize); asFortran << name.ToString(); } else { // Unsupported llvm::report_fatal_error( diff --git a/flang/lib/Lower/OpenMP.cpp b/flang/lib/Lower/OpenMP.cpp index bebae92f2f9f0..d2215f4d1bf1c 100644 --- a/flang/lib/Lower/OpenMP.cpp +++ b/flang/lib/Lower/OpenMP.cpp @@ -2915,11 +2915,14 @@ genTargetOp(Fortran::lower::AbstractConverter &converter, mlir::omp::DataBoundsType>( converter.getFirOpBuilder(), converter.getCurrentLocation(), converter, dataExv, info); - if (fir::unwrapRefType(info.addr.getType()).isa()) + if (fir::unwrapRefType(info.addr.getType()).isa()) { + bool dataExvIsAssumedSize = + Fortran::semantics::IsAssumedSizeArray(sym.GetUltimate()); bounds = Fortran::lower::genBaseBoundsOps( converter.getFirOpBuilder(), converter.getCurrentLocation(), - converter, dataExv); + converter, dataExv, dataExvIsAssumedSize); + } llvm::omp::OpenMPOffloadMappingFlags mapFlag = llvm::omp::OpenMPOffloadMappingFlags::OMP_MAP_IMPLICIT; diff --git a/flang/lib/Optimizer/Builder/BoxValue.cpp b/flang/lib/Optimizer/Builder/BoxValue.cpp index ada315493ba45..361fa59e20403 100644 --- a/flang/lib/Optimizer/Builder/BoxValue.cpp +++ b/flang/lib/Optimizer/Builder/BoxValue.cpp @@ -232,19 +232,3 @@ mlir::Value fir::factory::getExtentAtDimension(mlir::Location loc, return extents[dim]; return {}; } - -static inline bool isUndefOp(mlir::Value v) { - return mlir::isa_and_nonnull(v.getDefiningOp()); -} - -bool fir::ExtendedValue::isAssumedSize() const { - return match( - [](const fir::ArrayBoxValue &box) -> bool { - return !box.getExtents().empty() && isUndefOp(box.getExtents().back()); - ; - }, - [](const fir::CharArrayBoxValue &box) -> bool { - return !box.getExtents().empty() && isUndefOp(box.getExtents().back()); - }, - [](const auto &box) -> bool { return false; }); 
-} diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp index d781650069442..a0baa409fe44b 100644 --- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp +++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp @@ -5691,10 +5691,10 @@ static mlir::Value computeLBOUND(fir::FirOpBuilder &builder, mlir::Location loc, if (hasDefaultLowerBound(array)) return one; mlir::Value lb = fir::factory::readLowerBound(builder, loc, array, dim, one); - if (dim + 1 == array.rank() && array.isAssumedSize()) - return lb; mlir::Value extent = fir::factory::readExtent(builder, loc, array, dim); zero = builder.createConvert(loc, extent.getType(), zero); + // Note: for assumed size, the extent is -1, and the lower bound should + // be returned. It is important to test extent == 0 and not extent > 0. auto dimIsEmpty = builder.create( loc, mlir::arith::CmpIPredicate::eq, extent, zero); one = builder.createConvert(loc, lb.getType(), one); @@ -5703,52 +5703,29 @@ static mlir::Value computeLBOUND(fir::FirOpBuilder &builder, mlir::Location loc, /// Create a fir.box to be passed to the LBOUND/UBOUND runtime. /// This ensure that local lower bounds of assumed shape are propagated and that -/// a fir.box with equivalent LBOUNDs but an explicit shape is created for -/// assumed size arrays to avoid undefined behaviors in codegen or the runtime. +/// a fir.box with equivalent LBOUNDs. static mlir::Value createBoxForRuntimeBoundInquiry(mlir::Location loc, fir::FirOpBuilder &builder, const fir::ExtendedValue &array) { - if (!array.isAssumedSize()) - return array.match( - [&](const fir::BoxValue &boxValue) -> mlir::Value { - // This entity is mapped to a fir.box that may not contain the local - // lower bound information if it is a dummy. Rebox it with the local - // shape information. - mlir::Value localShape = builder.createShape(loc, array); - mlir::Value oldBox = boxValue.getAddr(); - return builder.create(loc, oldBox.getType(), oldBox, - localShape, - /*slice=*/mlir::Value{}); - }, - [&](const auto &) -> mlir::Value { - // This a pointer/allocatable, or an entity not yet tracked with a - // fir.box. For pointer/allocatable, createBox will forward the - // descriptor that contains the correct lower bound information. For - // other entities, a new fir.box will be made with the local lower - // bounds. - return builder.createBox(loc, array); - }); - // Assumed sized are not meant to be emboxed. This could cause the undefined - // extent cannot safely be understood by the runtime/codegen that will - // consider that the dimension is empty and that the related LBOUND value must - // be one. Pretend that the related extent is one to get the correct LBOUND - // value. - llvm::SmallVector shape = - fir::factory::getExtents(loc, builder, array); - assert(!shape.empty() && "assumed size must have at least one dimension"); - shape.back() = builder.createIntegerConstant(loc, builder.getIndexType(), 1); - auto safeToEmbox = array.match( - [&](const fir::CharArrayBoxValue &x) -> fir::ExtendedValue { - return fir::CharArrayBoxValue{x.getAddr(), x.getLen(), shape, - x.getLBounds()}; - }, - [&](const fir::ArrayBoxValue &x) -> fir::ExtendedValue { - return fir::ArrayBoxValue{x.getAddr(), shape, x.getLBounds()}; + return array.match( + [&](const fir::BoxValue &boxValue) -> mlir::Value { + // This entity is mapped to a fir.box that may not contain the local + // lower bound information if it is a dummy. Rebox it with the local + // shape information. 
+ mlir::Value localShape = builder.createShape(loc, array); + mlir::Value oldBox = boxValue.getAddr(); + return builder.create(loc, oldBox.getType(), oldBox, + localShape, + /*slice=*/mlir::Value{}); }, - [&](const auto &) -> fir::ExtendedValue { - fir::emitFatalError(loc, "not an assumed size array"); + [&](const auto &) -> mlir::Value { + // This is a pointer/allocatable, or an entity not yet tracked with a + // fir.box. For pointer/allocatable, createBox will forward the + // descriptor that contains the correct lower bound information. For + // other entities, a new fir.box will be made with the local lower + // bounds. + return builder.createBox(loc, array); }); - return builder.createBox(loc, safeToEmbox); } // LBOUND diff --git a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp index 5a4997052b18a..29717353da963 100644 --- a/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp +++ b/flang/lib/Optimizer/Transforms/ArrayValueCopy.cpp @@ -6,7 +6,6 @@ // //===----------------------------------------------------------------------===// -#include "flang/Optimizer/Builder/Array.h" #include "flang/Optimizer/Builder/BoxValue.h" #include "flang/Optimizer/Builder/FIRBuilder.h" #include "flang/Optimizer/Builder/Factory.h" @@ -822,6 +821,16 @@ static mlir::Type getEleTy(mlir::Type ty) { return ReferenceType::get(eleTy); } +// This is an unsafe way to deduce this (won't be true in internal +// procedure or inside select-rank for assumed-size). Only here to satisfy +// legacy code until removed. +static bool isAssumedSize(llvm::SmallVectorImpl &extents) { + if (extents.empty()) + return false; + auto cstLen = fir::getIntIfConstant(extents.back()); + return cstLen.has_value() && *cstLen == -1; +} + // Extract extents from the ShapeOp/ShapeShiftOp into the result vector. static bool getAdjustedExtents(mlir::Location loc, mlir::PatternRewriter &rewriter, @@ -840,7 +849,7 @@ static bool getAdjustedExtents(mlir::Location loc, emitFatalError(loc, "not a fir.shape/fir.shape_shift op"); } auto idxTy = rewriter.getIndexType(); - if (factory::isAssumedSize(result)) { + if (isAssumedSize(result)) { // Use slice information to compute the extent of the column. auto one = rewriter.create(loc, 1); mlir::Value size = one; diff --git a/flang/test/Lower/HLFIR/assumed-size-cray-pointee.f90 b/flang/test/Lower/HLFIR/assumed-size-cray-pointee.f90 new file mode 100644 index 0000000000000..6e3138fbe3f4d --- /dev/null +++ b/flang/test/Lower/HLFIR/assumed-size-cray-pointee.f90 @@ -0,0 +1,14 @@ +! Test lowering of assumed-size cray pointee. This is an +! odd case where an assumed-size symbol is not a dummy. +! Test that no bogus stack allocation is created for it +! (it will take its address from the cray pointer when used). +! RUN: bbc -emit-hlfir -o - %s | FileCheck %s + +subroutine assumed_size_cray_ptr + implicit none + pointer(ivar,var) + real :: var(*) +end subroutine +! CHECK-LABEL: func.func @_QPassumed_size_cray_ptr +! CHECK-NOT: fir.alloca !fir.array +! CHECK: return diff --git a/flang/test/Lower/HLFIR/cray-pointers.f90 b/flang/test/Lower/HLFIR/cray-pointers.f90 index 1ee0b2e0b0b94..d1f1a5647ff1c 100644 --- a/flang/test/Lower/HLFIR/cray-pointers.f90 +++ b/flang/test/Lower/HLFIR/cray-pointers.f90 @@ -125,7 +125,6 @@ end subroutine test5 ! CHECK: %[[VAL_2:.*]]:2 = hlfir.declare %[[VAL_1]] {uniq_name = "_QFtest5Ecp"} : (!fir.ref) -> (!fir.ref, !fir.ref) ! CHECK: %[[VAL_5:.*]] = arith.constant 3 : index ! CHECK: %[[VAL_6:.*]] = arith.constant 9 : index -! 
CHECK: %[[VAL_7:.*]] = fir.alloca !fir.array<9x!fir.type<_QFtest5Tt{r:f32,i:i32}>> {bindc_name = "v", uniq_name = "_QFtest5Ev"} ! CHECK: %[[VAL_8:.*]] = fir.shape_shift %[[VAL_5]], %[[VAL_6]] : (index, index) -> !fir.shapeshift<1> ! CHECK: %[[VAL_13:.*]]:2 = hlfir.declare %[[VAL_0]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest5Ev"} : (!fir.ref>>>>) -> (!fir.ref>>>>, !fir.ref>>>>) ! CHECK: %[[VAL_14:.*]] = fir.zero_bits !fir.ptr>> @@ -206,7 +205,6 @@ end subroutine test7 ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_3]](%[[VAL_4]]) {uniq_name = "_QFtest7Earr"} : (!fir.ref>, !fir.shape<1>) -> (!fir.ref>, !fir.ref>) ! CHECK: %[[VAL_6:.*]] = arith.constant 5 : index -! CHECK: %[[VAL_7:.*]] = fir.alloca !fir.array<5xi32> {bindc_name = "pte", uniq_name = "_QFtest7Epte"} ! CHECK: %[[VAL_8:.*]] = fir.shape %[[VAL_6]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_9:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest7Epte"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[VAL_10:.*]] = fir.zero_bits !fir.ptr> @@ -229,7 +227,6 @@ end subroutine test8 ! CHECK-LABEL: func.func @_QPtest8( ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box>> ! CHECK: %[[VAL_2:.*]] = arith.constant 5 : index -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array<5xi32> {bindc_name = "pte", uniq_name = "_QFtest8Epte"} ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest8Epte"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[VAL_6:.*]] = fir.zero_bits !fir.ptr> @@ -260,7 +257,6 @@ end subroutine test9 ! CHECK-LABEL: func.func @_QPtest9( ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box>> ! CHECK: %[[VAL_2:.*]] = arith.constant 5 : index -! CHECK: %[[VAL_3:.*]] = fir.alloca !fir.array<5xi32> {bindc_name = "pte", uniq_name = "_QFtest9Epte"} ! CHECK: %[[VAL_4:.*]] = fir.shape %[[VAL_2]] : (index) -> !fir.shape<1> ! CHECK: %[[VAL_5:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest9Epte"} : (!fir.ref>>>) -> (!fir.ref>>>, !fir.ref>>>) ! CHECK: %[[VAL_6:.*]] = fir.zero_bits !fir.ptr> @@ -291,7 +287,6 @@ subroutine test10() end subroutine test10 ! CHECK-LABEL: func.func @_QPtest10( ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box> -! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "pte", uniq_name = "_QFtest10Epte"} ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest10Epte"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: %[[VAL_4:.*]] = fir.zero_bits !fir.ptr ! CHECK: %[[VAL_5:.*]] = fir.embox %[[VAL_4]] : (!fir.ptr) -> !fir.box> @@ -320,7 +315,6 @@ subroutine sub2(x) end subroutine test11 ! CHECK-LABEL: func.func @_QPtest11( ! CHECK: %[[VAL_1:.*]] = fir.alloca !fir.box> -! CHECK: %[[VAL_2:.*]] = fir.alloca i32 {bindc_name = "pte", uniq_name = "_QFtest11Epte"} ! CHECK: %[[VAL_3:.*]]:2 = hlfir.declare %[[VAL_1]] {fortran_attrs = #fir.var_attrs, uniq_name = "_QFtest11Epte"} : (!fir.ref>>) -> (!fir.ref>>, !fir.ref>>) ! CHECK: %[[VAL_4:.*]] = fir.zero_bits !fir.ptr ! 
CHECK: %[[VAL_5:.*]] = fir.embox %[[VAL_4]] : (!fir.ptr) -> !fir.box> diff --git a/flang/test/Lower/Intrinsics/lbound.f90 b/flang/test/Lower/Intrinsics/lbound.f90 index 54d2683e14f1c..a5ca2d39a5ee4 100644 --- a/flang/test/Lower/Intrinsics/lbound.f90 +++ b/flang/test/Lower/Intrinsics/lbound.f90 @@ -40,8 +40,8 @@ subroutine lbound_test_2(a, dim, res) subroutine lbound_test_3(a, dim, res) real, dimension(2:10, 3:*) :: a integer(8):: dim, res +! CHECK: %[[VAL_0:.*]] = arith.constant -1 : index ! CHECK: %[[VAL_1:.*]] = fir.load %arg1 : !fir.ref -! CHECK: %[[VAL_0:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_2:.*]] = fir.shape_shift %{{.*}}, %{{.*}}, %{{.*}}, %[[VAL_0]] : (index, index, index, index) -> !fir.shapeshift<2> ! CHECK: %[[VAL_3:.*]] = fir.embox %arg0(%[[VAL_2]]) : (!fir.ref>, !fir.shapeshift<2>) -> !fir.box> ! CHECK: %[[VAL_4:.*]] = fir.address_of( diff --git a/flang/test/Lower/Intrinsics/ubound.f90 b/flang/test/Lower/Intrinsics/ubound.f90 index 889414d1dd769..dae21acda170d 100644 --- a/flang/test/Lower/Intrinsics/ubound.f90 +++ b/flang/test/Lower/Intrinsics/ubound.f90 @@ -48,7 +48,7 @@ subroutine ubound_test_2(a, dim, res) subroutine ubound_test_3(a, dim, res) real, dimension(10, 20, *) :: a integer(8):: dim, res -! CHECK: %[[VAL_0:.*]] = fir.undefined index +! CHECK: %[[VAL_0:.*]] = arith.constant -1 : index ! CHECK: %[[VAL_1:.*]] = fir.shape %{{.*}}, %{{.*}}, %[[VAL_0]] : (index, index, index) -> !fir.shape<3> ! CHECK: %[[VAL_2:.*]] = fir.embox %{{.*}}(%[[VAL_1]]) : (!fir.ref>, !fir.shape<3>) -> !fir.box> ! CHECK: %[[VAL_3:.*]] = fir.load %{{.*}} : !fir.ref diff --git a/flang/test/Lower/array-expression-assumed-size.f90 b/flang/test/Lower/array-expression-assumed-size.f90 index b5fd09103aa0b..ae35da951538b 100644 --- a/flang/test/Lower/array-expression-assumed-size.f90 +++ b/flang/test/Lower/array-expression-assumed-size.f90 @@ -19,7 +19,7 @@ end subroutine assumed_size_forall_test ! CHECK: %[[VAL_1A:.*]] = fir.convert %c10{{.*}} : (i64) -> index ! CHECK: %[[VAL_1B:.*]] = arith.cmpi sgt, %[[VAL_1A]], %c0{{.*}} : index ! CHECK: %[[VAL_1:.*]] = arith.select %[[VAL_1B]], %[[VAL_1A]], %c0{{.*}} : index -! CHECK: %[[VAL_2:.*]] = fir.undefined index +! CHECK: %[[VAL_2:.*]] = arith.constant -1 : index ! CHECK: %[[VAL_3:.*]] = arith.constant 1 : index ! CHECK: %[[VAL_4:.*]] = arith.constant 1 : i64 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i64) -> index @@ -82,7 +82,7 @@ end subroutine assumed_size_forall_test ! CHECK: %[[VAL_2A:.*]] = fir.convert %c10{{.*}} : (i64) -> index ! CHECK: %[[VAL_2B:.*]] = arith.cmpi sgt, %[[VAL_2A]], %c0{{.*}} : index ! CHECK: %[[VAL_2:.*]] = arith.select %[[VAL_2B]], %[[VAL_2A]], %c0{{.*}} : index -! CHECK: %[[VAL_3:.*]] = fir.undefined index +! CHECK: %[[VAL_3:.*]] = arith.constant -1 : index ! CHECK: %[[VAL_4:.*]] = arith.constant 2 : i32 ! CHECK: %[[VAL_5:.*]] = fir.convert %[[VAL_4]] : (i32) -> index ! CHECK: %[[VAL_6:.*]] = arith.constant 6 : i32 @@ -149,7 +149,7 @@ end subroutine assumed_size_forall_test ! PostOpt-DAG: %[[VAL_4:.*]] = arith.constant 0 : index ! PostOpt-DAG: %[[VAL_5:.*]] = arith.constant 3 : index ! PostOpt-DAG: %[[VAL_6:.*]] = arith.constant 4 : index -! PostOpt: %[[VAL_7:.*]] = fir.undefined index +! PostOpt-DAG: %[[VAL_7:.*]] = arith.constant -1 : index ! PostOpt: %[[VAL_8:.*]] = fir.shape %[[VAL_1]], %[[VAL_7]] : (index, index) -> !fir.shape<2> ! PostOpt: %[[VAL_9:.*]] = fir.slice %[[VAL_2]], %[[VAL_1]], %[[VAL_2]], %[[VAL_2]], %[[VAL_3]], %[[VAL_2]] : (index, index, index, index, index, index) -> !fir.slice<2> ! 
PostOpt: %[[VAL_10:.*]] = fir.allocmem !fir.array<10x?xi32>, %[[VAL_3]] @@ -227,8 +227,8 @@ end subroutine assumed_size_forall_test ! PostOpt-DAG: %[[VAL_4:.*]] = arith.constant 1 : index ! PostOpt-DAG: %[[VAL_5:.*]] = arith.constant 0 : index ! PostOpt-DAG: %[[VAL_6:.*]] = arith.constant 5 : index +! PostOpt-DAG: %[[VAL_8:.*]] = arith.constant -1 : index ! PostOpt: %[[VAL_7:.*]] = fir.alloca i32 {adapt.valuebyref, bindc_name = "i"} -! PostOpt: %[[VAL_8:.*]] = fir.undefined index ! PostOpt: %[[VAL_9:.*]] = fir.shape %[[VAL_2]], %[[VAL_8]] : (index, index) -> !fir.shape<2> ! PostOpt: %[[VAL_10:.*]] = fir.allocmem !fir.array<10x?xi32>, %[[VAL_4]] ! PostOpt: br ^bb1(%[[VAL_5]], %[[VAL_4]] : index, index) diff --git a/flang/test/Lower/cray-pointer.f90 b/flang/test/Lower/cray-pointer.f90 index d13d3c542e771..4e9f49daab4e9 100644 --- a/flang/test/Lower/cray-pointer.f90 +++ b/flang/test/Lower/cray-pointer.f90 @@ -59,7 +59,6 @@ subroutine cray_derivedType() ! CHECK: %[[dt:.*]] = fir.alloca !fir.type<_QFcray_derivedtypeTdt{i:i32,j:i32}> ! CHECK: %[[k:.*]] = fir.alloca i32 {{.*}} -! CHECK: %[[pte:.*]] = fir.alloca i32 {{.*}} ! CHECK: %[[ptr:.*]] = fir.alloca i64 {{.*}} ! CHECK: %[[xdt:.*]] = fir.alloca !fir.type<_QFcray_derivedtypeTdt{i:i32,j:i32}> {{.*}} ! CHECK: %[[xdtbox:.*]] = fir.embox %[[xdt]] : (!fir.ref>) -> !fir.box> @@ -107,7 +106,6 @@ subroutine cray_ptrArth() ! CHECK: %[[dt:.*]] = fir.alloca !fir.type<_QFcray_ptrarthTdt{x:i32,y:i32,z:i32}> ! CHECK: %[[i:.*]] = fir.alloca i32 {{.*}} -! CHECK: %[[pte:.*]] = fir.alloca i32 {{.*}} ! CHECK: %[[ptr:.*]] = fir.alloca i64 {{.*}} ! CHECK: %[[xdt:.*]] = fir.alloca !fir.type<_QFcray_ptrarthTdt{x:i32,y:i32,z:i32}> {{.*}} ! CHECK: %[[xdtbox:.*]] = fir.embox %[[xdt]] : (!fir.ref>) -> !fir.box> @@ -154,7 +152,6 @@ subroutine cray_arrayElement() ! CHECK: %[[data:.*]] = fir.alloca !fir.array<5xi32> {{.*}} ! CHECK: %[[k:.*]] = fir.alloca i32 {{.*}} -! CHECK: %[[pte:.*]] = fir.alloca !fir.array<3xi32> {{.*}} ! CHECK: %[[ptr:.*]] = fir.alloca i64 {{.*}} ! CHECK: %[[c2:.*]] = arith.constant 2 : i64 ! CHECK: %[[c1:.*]] = arith.constant 1 : i64 @@ -206,7 +203,6 @@ subroutine cray_2darrayElement() ! CHECK: %[[data:.*]] = fir.alloca !fir.array<2x4xi32> {{.*}} ! CHECK: %[[k:.*]] = fir.alloca i32 {{.*}} -! CHECK: %[[pte:.*]] = fir.alloca !fir.array<2x3xi32> {{.*}} ! CHECK: %[[ptr:.*]] = fir.alloca i64 {{.*}} ! CHECK: %[[c2:.*]] = arith.constant 2 : i64 ! CHECK: %[[c1:.*]] = arith.constant 1 : i64 @@ -269,7 +265,6 @@ subroutine cray_array() ! CHECK: %[[c3:.*]] = arith.constant 3 : index ! CHECK: %[[k:.*]] = fir.alloca !fir.array<3xi32> {{.*}} ! CHECK: %[[c31:.*]] = arith.constant 3 : index -! CHECK: %[[pte:.*]] = fir.alloca !fir.array<3xi32> {{.*}} ! CHECK: %[[ptr:.*]] = fir.alloca i64 {{.*}} ! CHECK: %[[c2:.*]] = arith.constant 2 : i64 ! CHECK: %[[c1:.*]] = arith.constant 1 : i64 @@ -333,7 +328,6 @@ subroutine cray_arraySection() ! CHECK: %[[c2:.*]] = arith.constant 2 : index ! CHECK: %[[k:.*]] = fir.alloca !fir.array<2xi32> {{.*}} ! CHECK: %[[c3:.*]] = arith.constant 3 : index -! CHECK: %[[pte:.*]] = fir.alloca !fir.array<3xi32> {{.*}} ! CHECK: %[[ptr:.*]] = fir.alloca i64 {{.*}} ! CHECK: %[[c1:.*]] = arith.constant 2 : i64 ! 
CHECK: %[[c0:.*]] = arith.constant 1 : i64 From 7fdf608cefa0d9051eb3146ee19c3750e237c799 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirko=20Brku=C5=A1anin?= Date: Wed, 24 Jan 2024 13:43:07 +0100 Subject: [PATCH 779/843] [AMDGPU] Add GFX12 WMMA and SWMMAC instructions (#77795) Co-authored-by: Petar Avramovic Co-authored-by: Piotr Sobczak --- clang/include/clang/Basic/BuiltinsAMDGPU.def | 62 + clang/lib/CodeGen/CGBuiltin.cpp | 177 +- .../builtins-amdgcn-gfx12-wmma-w32.cl | 156 ++ .../builtins-amdgcn-gfx12-wmma-w64.cl | 155 ++ .../builtins-amdgcn-swmmac-w32.cl | 135 ++ .../builtins-amdgcn-swmmac-w64.cl | 134 ++ .../builtins-amdgcn-wmma-w32-gfx10-err.cl | 2 +- .../CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl | 17 +- .../CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl | 18 +- .../amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl | 107 ++ .../amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl | 104 ++ .../amdgpu/builtins-amdgcn-swmmac-w32.cl | 110 ++ .../amdgpu/builtins-amdgcn-swmmac-w64.cl | 109 ++ llvm/include/llvm/IR/IntrinsicsAMDGPU.td | 115 +- llvm/lib/Target/AMDGPU/AMDGPUGISel.td | 24 + llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 330 ++++ llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h | 10 + .../AMDGPU/AMDGPUInstructionSelector.cpp | 213 +++ .../Target/AMDGPU/AMDGPUInstructionSelector.h | 13 + .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 23 + .../Target/AMDGPU/AMDGPURegisterBankInfo.cpp | 16 + .../Target/AMDGPU/AMDGPUSearchableTables.td | 16 + .../AMDGPU/AsmParser/AMDGPUAsmParser.cpp | 111 +- .../Disassembler/AMDGPUDisassembler.cpp | 8 + .../lib/Target/AMDGPU/GCNHazardRecognizer.cpp | 18 +- .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp | 37 + .../AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h | 4 + llvm/lib/Target/AMDGPU/SIDefines.h | 3 + llvm/lib/Target/AMDGPU/SIFoldOperands.cpp | 1 + llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 30 + llvm/lib/Target/AMDGPU/SIInstrFormats.td | 5 + llvm/lib/Target/AMDGPU/SIInstrInfo.h | 8 + llvm/lib/Target/AMDGPU/SIInstrInfo.td | 11 + llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 5 + llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 500 ++++- llvm/lib/Target/AMDGPU/VOPInstructions.td | 3 + .../UniformityAnalysis/AMDGPU/intrinsics.ll | 135 +- ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll | 504 +++++ .../AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll | 519 ++++++ .../GlobalISel/wmma-gfx12-w32-iu-modifiers.ll | 309 ++++ .../wmma-gfx12-w32-swmmac-index_key.ll | 321 ++++ .../AMDGPU/GlobalISel/wmma-gfx12-w32.ll | 370 ++++ ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 459 +++++ .../AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll | 430 +++++ .../GlobalISel/wmma-gfx12-w64-iu-modifiers.ll | 274 +++ .../wmma-gfx12-w64-swmmac-index_key.ll | 472 +++++ .../AMDGPU/GlobalISel/wmma-gfx12-w64.ll | 333 ++++ ...wmma-gfx12-w32-f16-f32-matrix-modifiers.ll | 499 +++++ .../test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll | 431 +++++ .../AMDGPU/wmma-gfx12-w32-iu-modifiers.ll | 309 ++++ .../AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll | 321 ++++ llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll | 370 ++++ ...wmma-gfx12-w64-f16-f32-matrix-modifiers.ll | 456 +++++ .../test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll | 373 ++++ .../AMDGPU/wmma-gfx12-w64-iu-modifiers.ll | 274 +++ .../AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll | 472 +++++ llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll | 333 ++++ .../CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir | 351 ++++ .../CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir | 352 ++++ llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s | 1529 ++++++++++++++++ llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s | 1529 ++++++++++++++++ .../AMDGPU/gfx12_dasm_wmma_w32.txt 
| 1628 +++++++++++++++++ .../AMDGPU/gfx12_dasm_wmma_w64.txt | 1628 +++++++++++++++++ mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 17 +- mlir/test/Target/LLVMIR/rocdl.mlir | 24 +- 65 files changed, 17701 insertions(+), 111 deletions(-) create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl create mode 100644 cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl create mode 100644 cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl create mode 100644 cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl create mode 100644 cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir create mode 100644 llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s create mode 100644 llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt create mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index d208342d9c516..74dfd1d214e84 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -436,5 +436,67 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_i32, "ii*1", "nc", "gfx12-insts,w TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4i16, "V4sV4s*1", "nc", "gfx12-insts,wavefrontsize64") TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_v4f16, "V4hV4h*1", "nc", "gfx12-insts,wavefrontsize64") +//===----------------------------------------------------------------------===// +// WMMA builtins. 
+// Postfix w32 indicates the builtin requires wavefront size of 32. +// Postfix w64 indicates the builtin requires wavefront size of 64. +// +// Some of these are very similar to their GFX11 counterparts, but they don't +// require replication of the A,B matrices, so they use fewer vector elements. +// Therefore, we add an "_gfx12" suffix to distinguish them from the existing +// builtins. +//===----------------------------------------------------------------------===// +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32") +// These are gfx12-only, but for consistency with the other WMMA variants we're +// keeping the "_gfx12" suffix. +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") +// These are gfx12-only, but for consistency with the other WMMA variants we're +// keeping the "_gfx12" suffix. 
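+// As a reading aid for the prototype strings (standard Builtins.def
+// encoding): in "V4fiiV4f" below, V4f is a <4 x float> vector, i is int, and
+// a leading I (as in "Ib") marks an argument that must be a constant
+// expression.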
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12, "V4fiiV4f", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64") + +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32, "V8fV8hV16hV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32, "V8fV8sV16sV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32, "V8hV8hV16hV8hs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32, "V8sV8sV16sV8ss", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32, "V8iIbiIbV2iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32, "V8iIbV2iIbV4iV8isIb", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32, "V8fV2iV4iV8fs", "nc", "gfx12-insts,wavefrontsize32") + +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64, "V4fV4hV8hV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64, "V4fV4sV8sV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64, "V4hV4hV8hV4hs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64, "V4sV4sV8sV4ss", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64, "V4iIbiIbiV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64, "V4iIbiIbV2iV4isIb", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") +TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64, "V4fiV2iV4fs", "nc", "gfx12-insts,wavefrontsize64") + #undef BUILTIN #undef TARGET_BUILTIN diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 7ef764b8e1ac8..a4f26a6f0eb19 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18279,65 +18279,216 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID, case 
AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: - case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: { + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: { // These operations perform a matrix multiplication and accumulation of // the form: // D = A * B + C - // The return type always matches the type of matrix C. - unsigned ArgForMatchingRetType; + // We need to specify one type for matrices AB and one for matrices CD. 
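+    // For example, the wave32 f16 variant is emitted as
+    // llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16 (see the tests added in
+    // this change), i.e. it is overloaded on the <8 x float> accumulator (CD)
+    // and the <8 x half> inputs (AB).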
+ // Sparse matrix operations can have different types for A and B as well as + // an additional type for sparsity index. + // Destination type should be put before types used for source operands. + SmallVector ArgsForMatchingMatrixTypes; + // On GFX12, the intrinsics with 16-bit accumulator use a packed layout. + // There is no need for the variable opsel argument, so always set it to + // "false". + bool AppendFalseForOpselArg = false; unsigned BuiltinWMMAOp; switch (BuiltinID) { case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64: - ArgForMatchingRetType = 2; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_f16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64: - ArgForMatchingRetType = 2; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf16; break; + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12: + AppendFalseForOpselArg = true; + LLVM_FALLTHROUGH; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16; break; + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12: + AppendFalseForOpselArg = true; + LLVM_FALLTHROUGH; case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16; break; case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32: case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x16_f16_tied; break; case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32: case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64: - ArgForMatchingRetType = 2; + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x16_bf16_tied; break; case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64: - ArgForMatchingRetType = 4; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu8; break; case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32: case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64: - ArgForMatchingRetType = 4; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x16_iu4; break; + case 
AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12: + ArgsForMatchingMatrixTypes = {2, 0}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12: + case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12: + ArgsForMatchingMatrixTypes = {4, 1}; // CD, AB + BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x32_f16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x32_f16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64: + ArgsForMatchingMatrixTypes = {4, 1, 3, 5}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64: + 
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8; + break; + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: + case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: + ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index + BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8; + break; } SmallVector Args; for (int i = 0, e = E->getNumArgs(); i != e; ++i) Args.push_back(EmitScalarExpr(E->getArg(i))); + if (AppendFalseForOpselArg) + Args.push_back(Builder.getFalse()); - Function *F = CGM.getIntrinsic(BuiltinWMMAOp, - {Args[ArgForMatchingRetType]->getType()}); + SmallVector ArgTypes; + for (auto ArgIdx : ArgsForMatchingMatrixTypes) + ArgTypes.push_back(Args[ArgIdx]->getType()); + Function *F = CGM.getIntrinsic(BuiltinWMMAOp, ArgTypes); return Builder.CreateCall(F, Args); } diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl new file mode 100644 index 0000000000000..11747af7ea74f --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w32.cl @@ -0,0 +1,156 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); + +// Wave32 + +// +// amdgcn_wmma_f32_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[A:%.*]], <8 x half> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f32_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f16_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[A:%.*]], 
<8 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_bf16_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16(<8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c); +} + +// +// amdgcn_wmma_i32_16x16x16_iu8 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false); +} + +// +// amdgcn_wmma_i32_16x16x16_iu4 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: 
entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32(<2 x i32> [[A:%.*]], <2 x i32> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl new file mode 100644 index 0000000000000..ef32648743ca6 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx12-wmma-w64.cl @@ -0,0 +1,155 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// Wave64 + +// +// amdgcn_wmma_f32_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f32_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] 
+// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_f16_16x16x16_f16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[A:%.*]], <4 x half> [[B:%.*]], <4 x half> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_bf16_16x16x16_bf16 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16(<4 x i16> [[A:%.*]], <4 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c); +} + +// +// amdgcn_wmma_i32_16x16x16_iu8 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false); +} + +// +// amdgcn_wmma_i32_16x16x16_iu4 +// + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x 
float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32(i32 [[A:%.*]], i32 [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_wmma_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl new file mode 100644 index 0000000000000..b303c2f25ddda --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w32.cl @@ -0,0 +1,135 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef short v16s __attribute__((ext_vector_type(16))); + +// Wave32 + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void 
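For orientation while reading the autogenerated checks below: in every SWMMAC builtin in this file the A operand is half the width of B (v8h against v16h, v2i against v4i), because A carries only the stored elements of a sparse matrix, and the trailing short argument is the sparsity index used to expand A to its dense positions (per the intrinsic comments later in this patch). A minimal illustrative kernel — not one of the generated tests; the kernel name is ours, the typedefs and builtin come from this file:

typedef float v8f __attribute__((ext_vector_type(8)));
typedef half v8h __attribute__((ext_vector_type(8)));
typedef half v16h __attribute__((ext_vector_type(16)));

// A: 8 stored halves (sparse), B: 16 dense halves, C/D: 8 floats per lane.
void sparse_mac_example(global v8f* out, v8h a, v16h b, v8f c, short index)
{
  // index encodes where the stored elements of A sit in the dense matrix.
  *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index);
}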
+// +void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 true, <2 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl new file mode 100644 index 0000000000000..855fa7351e155 --- /dev/null +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-swmmac-w64.cl @@ -0,0 +1,134 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: amdgpu-registered-target +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 
-S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +// Wave64 + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f16_16x16x32_f16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.i16(<4 x half> [[A:%.*]], <8 x half> [[B:%.*]], <4 x half> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_bf16_16x16x32_bf16_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.i16(<4 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <4 x i16> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 8, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = 
__builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x32_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.i16(i1 true, i32 [[A:%.*]], i1 true, i32 [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_i32_16x16x64_iu4_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.i16(i1 true, i32 [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i16 [[INDEX:%.*]], i1 true) +// CHECK-GFX1200-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> [[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: @test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64( +// CHECK-GFX1200-NEXT: entry: +// CHECK-GFX1200-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.i16(i32 [[A:%.*]], <2 x i32> [[B:%.*]], <4 x float> 
[[C:%.*]], i16 [[INDEX:%.*]]) +// CHECK-GFX1200-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] +// CHECK-GFX1200-NEXT: ret void +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index); +} diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl index 41a78ae268be5..49cb797df4233 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32-gfx10-err.cl @@ -16,7 +16,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // Wave32 -void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f, +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out8f, v16s a16s, v16s b16s, v8f c8f, global v16h* out16h, v16h a16h, v16h b16h, v16h c16h, global v16s* out16s, v2i a2i, v2i b2i, v16s c16s, global v8i* out8i, v4i a4i, v4i b4i, v8i c8i) diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl index 4f13c75e5e81f..3c6aaf5e38281 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w32.cl @@ -2,7 +2,6 @@ // REQUIRES: amdgpu-registered-target // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 -typedef float v4f __attribute__((ext_vector_type(4))); typedef float v8f __attribute__((ext_vector_type(8))); typedef half v16h __attribute__((ext_vector_type(16))); typedef int v2i __attribute__((ext_vector_type(2))); @@ -20,7 +19,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // @@ -35,7 +34,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v16h a, v16h b, v8f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -50,7 +49,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v16s a, v16s b, v8f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) +// 
CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -65,7 +64,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v16h* out, v16h a, v16h b, v16 // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -80,7 +79,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v16s* out, v16s a, v16s b, v // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v16f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <16 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -95,7 +94,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w32(global v16h* out, v16h a, v16h b // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v16i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <16 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -110,7 +109,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32(global v16s* out, v16s a, v16s // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -125,7 +124,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v4i a, v4i b, v8i c) // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w32( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> 
[[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl index 4797675f50d42..1490f14fd17b6 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl +++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-wmma-w64.cl @@ -3,12 +3,10 @@ // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1100 -DWMMA_GFX1100_TESTS -S -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1100 typedef float v4f __attribute__((ext_vector_type(4))); -typedef float v8f __attribute__((ext_vector_type(8))); typedef half v8h __attribute__((ext_vector_type(8))); typedef half v16h __attribute__((ext_vector_type(16))); typedef int v2i __attribute__((ext_vector_type(2))); typedef int v4i __attribute__((ext_vector_type(4))); -typedef int v8i __attribute__((ext_vector_type(8))); typedef short v8s __attribute__((ext_vector_type(8))); typedef short v16s __attribute__((ext_vector_type(16))); @@ -22,7 +20,7 @@ typedef short v16s __attribute__((ext_vector_type(16))); // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_f16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <4 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4:![0-9]+]] // CHECK-GFX1100-NEXT: ret void // @@ -37,7 +35,7 @@ void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v16h a, v16h b, v4f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f32_16x16x16_bf16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <4 x float> [[C:%.*]]) // CHECK-GFX1100-NEXT: store <4 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -52,7 +50,7 @@ void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v16s a, v16s b, v4f // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -67,7 +65,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v8h* out, v16h a, v16h b, v8h // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_w64( // CHECK-GFX1100-NEXT: entry: -// 
CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -82,7 +80,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v8s* out, v16s a, v16s b, v8 // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_f16_16x16x16_f16_tied_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.tied.v8f16.v16f16(<16 x half> [[A:%.*]], <16 x half> [[B:%.*]], <8 x half> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -97,7 +95,7 @@ void test_amdgcn_wmma_f16_16x16x16_f16_tied_w64(global v8h* out, v16h a, v16h b, // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.tied.v8i16.v16i16(<16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <8 x i16> [[C:%.*]], i1 true) // CHECK-GFX1100-NEXT: store <8 x i16> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -112,7 +110,7 @@ void test_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64(global v8s* out, v16s a, v16s // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu8_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 true, <4 x i32> [[A:%.*]], i1 true, <4 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // @@ -127,7 +125,7 @@ void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, v4i a, v4i b, v4i c) // CHECK-GFX1100-LABEL: @test_amdgcn_wmma_i32_16x16x16_iu4_w64( // CHECK-GFX1100-NEXT: entry: -// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) +// CHECK-GFX1100-NEXT: [[TMP0:%.*]] = tail call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 true, <2 x i32> [[A:%.*]], i1 true, <2 x i32> [[B:%.*]], <4 x i32> [[C:%.*]], i1 false) // CHECK-GFX1100-NEXT: store <4 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]] // CHECK-GFX1100-NEXT: ret void // diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl new file mode 100644 index 0000000000000..867273126f8fd 
--- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w32.cl @@ -0,0 +1,107 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); + +// Wave32 + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_f16_w32(global v8f* out, v8h a, v8h b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w32(global v8f* out, v8s a, v8s b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w32: +// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f16_16x16x16_f16_w32(global v8h* out, v8h a, v8h b, v8h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w32: +// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +void test_amdgcn_wmma_bf16_16x16x16_bf16_w32(global v8s* out, v8s a, v8s b, v8s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w32(global v8i* out, int a, int b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: 
test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v8f* out, v2i a, v2i b, v8f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v8i* out, v2i a, v2i b, v8i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12(true, a, true, b, c, false); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl new file mode 100644 index 0000000000000..ad01450c35a76 --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-gfx12-wmma-w64.cl @@ -0,0 +1,104 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef int v4i __attribute__((ext_vector_type(4))); + +// Wave64 + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_f16_w64: +// CHECK-GFX1200: v_wmma_f32_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_f16_w64(global v4f* out, v4h a, v4h b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf16_w64: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf16_w64(global v4f* out, v4s a, v4s b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f16_16x16x16_f16_w64: +// CHECK-GFX1200: v_wmma_f16_16x16x16_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f16_16x16x16_f16_w64(global v4h* out, v4h a, v4h b, v4h c) +{ + *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_bf16_16x16x16_bf16_w64: +// CHECK-GFX1200: v_wmma_bf16_16x16x16_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_bf16_16x16x16_bf16_w64(global v4s* out, v4s a, v4s b, v4s c) +{ + *out = __builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu8_w64: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu8_w64(global v4i* out, int a, int b, v4i c) +{ + *out = 
__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x16_iu4_w64: +// CHECK-GFX1200: v_wmma_i32_16x16x16_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x16_iu4_w64(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12(true, a, true, b, c, false); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w64_gfx12(a, b, c); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_fp8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32: +// CHECK-GFX1200: v_wmma_f32_16x16x16_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] +// +void test_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32(global v4f* out, int a, int b, v4f c) +{ + *out = __builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w64_gfx12(a, b, c); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_wmma_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_wmma_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}] neg_lo:[1,1,0] +// +void test_amdgcn_wmma_i32_16x16x32_iu4_w32(global v4i* out, int a, int b, v4i c) +{ + *out = __builtin_amdgcn_wmma_i32_16x16x32_iu4_w64_gfx12(true, a, true, b, c, false); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl new file mode 100644 index 0000000000000..317d9a1102ccf --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w32.cl @@ -0,0 +1,110 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize32 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v8f __attribute__((ext_vector_type(8))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); +typedef int v8i __attribute__((ext_vector_type(8))); +typedef half v16h __attribute__((ext_vector_type(16))); +typedef short v16s __attribute__((ext_vector_type(16))); + +// Wave32 + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w32(global v8f* out, v8h a, v16h b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: 
test_amdgcn_swmmac_f32_16x16x32_bf16_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w32(global v8f* out, v8s a, v16s b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w32: +// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w32(global v8h* out, v8h a, v16h b, v8h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w32: +// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w32(global v8s* out, v8s a, v16s b, v8s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w32(global v8i* out, int a, v2i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w32: +// CHECK-GFX1200: v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x64_iu4_w32(global v8i* out, v2i a, v4i b, v8i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w32(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w32(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32(a, b, c, index); +} + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32: 
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(global v8f* out, v2i a, v4i b, v8f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32(a, b, c, index); +} diff --git a/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl new file mode 100644 index 0000000000000..eb81234f53a66 --- /dev/null +++ b/cross-project-tests/amdgpu/builtins-amdgcn-swmmac-w64.cl @@ -0,0 +1,109 @@ +// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -target-feature +wavefrontsize64 -S -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200 + +typedef int v2i __attribute__((ext_vector_type(2))); +typedef int v4i __attribute__((ext_vector_type(4))); +typedef float v4f __attribute__((ext_vector_type(4))); +typedef half v4h __attribute__((ext_vector_type(4))); +typedef short v4s __attribute__((ext_vector_type(4))); +typedef half v8h __attribute__((ext_vector_type(8))); +typedef short v8s __attribute__((ext_vector_type(8))); + +// Wave64 + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_f16_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_f16_w64(global v4f* out, v4h a, v8h b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_f16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf16_w64: +// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f32_16x16x32_bf16_w64(global v4f* out, v4s a, v8s b, v4f c, short index) +{ + *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f16_16x16x32_f16_w64: +// CHECK-GFX1200: v_swmmac_f16_16x16x32_f16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_f16_16x16x32_f16_w64(global v4h* out, v4h a, v8h b, v4h c, short index) +{ + *out = __builtin_amdgcn_swmmac_f16_16x16x32_f16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_bf16_16x16x32_bf16_w64: +// CHECK-GFX1200: v_swmmac_bf16_16x16x32_bf16 v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} +// +void test_amdgcn_swmmac_bf16_16x16x32_bf16_w64(global v4s* out, v4s a, v8s b, v4s c, short index) +{ + *out = __builtin_amdgcn_swmmac_bf16_16x16x32_bf16_w64(a, b, c, index); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu8_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu8_w64(global v4i* out, int a, v2i b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu8_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x32_iu4_w64: +// CHECK-GFX1200: v_swmmac_i32_16x16x32_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v{{[0-9]+}}, v{{[0-9]+}} neg_lo:[1,1,0] clamp +// +void test_amdgcn_swmmac_i32_16x16x32_iu4_w64(global v4i* out, int a, int b, v4i c, short index) +{ + *out = __builtin_amdgcn_swmmac_i32_16x16x32_iu4_w64(true, a, true, b, c, index, true); +} + + +// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_i32_16x16x64_iu4_w64: +// CHECK-GFX1200: 
v_swmmac_i32_16x16x64_iu4 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}} neg_lo:[1,1,0] clamp
+//
+void test_amdgcn_swmmac_i32_16x16x64_iu4_w64(global v4i* out, int a, v2i b, v4i c, short index)
+{
+  *out = __builtin_amdgcn_swmmac_i32_16x16x64_iu4_w64(true, a, true, b, c, index, true);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_fp8_w64(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_fp8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x32_fp8_bf8_w64(a, b, c, index);
+}
+
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_fp8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64(a, b, c, index);
+}
+
+// CHECK-GFX1200-LABEL: test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
+// CHECK-GFX1200: v_swmmac_f32_16x16x32_bf8_bf8 v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}, v[{{[0-9]+:[0-9]+}}], v{{[0-9]+}}
+//
+void test_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(global v4f* out, int a, v2i b, v4f c, short index)
+{
+  *out = __builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64(a, b, c, index);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
index e6db9da5526aa..9eb1ac8e27bef 100644
--- a/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
+++ b/llvm/include/llvm/IR/IntrinsicsAMDGPU.td
@@ -2601,6 +2601,11 @@ def int_amdgcn_ds_bvh_stack_rtn :
   [ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
+def int_amdgcn_s_wait_event_export_ready :
+  ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
+  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
+>;
+
 // WMMA (Wave Matrix Multiply-Accumulate) intrinsics
 //
 // These operations perform a matrix multiplication and accumulation of
@@ -2608,10 +2613,10 @@
 class AMDGPUWmmaIntrinsic<LLVMType AB, LLVMType CD> :
   Intrinsic<
-    [CD], // %D
+    [CD],               // %D
     [
      AB,               // %A
-     AB,               // %B
+     LLVMMatchType<1>, // %B
      LLVMMatchType<0>, // %C
     ],
     [IntrNoMem, IntrConvergent, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
 class AMDGPUWmmaIntrinsicOPSEL<LLVMType AB, LLVMType CD> :
   Intrinsic<
-    [CD], // %D
+    [CD],               // %D
     [
      AB,               // %A
-     AB,               // %B
+     LLVMMatchType<1>, // %B
      LLVMMatchType<0>, // %C
-     llvm_i1_ty, // %high
+     llvm_i1_ty,       // %high (op_sel) for GFX11, 0 for GFX12
     ],
     [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<3>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
 class AMDGPUWmmaIntrinsicIU<LLVMType AB, LLVMType CD> :
   Intrinsic<
-    [CD], // %D
+    [CD],               // %D
     [
      llvm_i1_ty,       // %A_sign
      AB,               // %A
      llvm_i1_ty,       // %B_sign
-     AB,               // %B
+     LLVMMatchType<1>, // %B
      LLVMMatchType<0>, // %C
      llvm_i1_ty,       // %clamp
     ],
     [IntrNoMem, IntrConvergent, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<5>>, IntrWillReturn, IntrNoCallback, IntrNoFree]
 >;
 
-def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_v16f16_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_v16i16_ty, llvm_anyfloat_ty>;
-// The regular, untied f16/bf16 wmma intrinsics only write to one half
-// of the registers (set via the op_sel bit).
-// The content of the other 16-bit of the registers is undefined.
-def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
-// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix
-// registers to the input accumulator registers.
-// Essentially, the content of the other 16-bit is preserved from the input.
-def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16f16_ty, llvm_anyfloat_ty>;
-def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_v16i16_ty, llvm_anyint_ty>;
-def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_v4i32_ty, llvm_anyint_ty>;
-def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_v2i32_ty, llvm_anyint_ty>;
+// WMMA GFX11Only
 
-def int_amdgcn_s_wait_event_export_ready :
-  ClangBuiltin<"__builtin_amdgcn_s_wait_event_export_ready">,
-  Intrinsic<[], [], [IntrNoMem, IntrHasSideEffects, IntrWillReturn]
->;
+// The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
+// The tied versions of the f16/bf16 wmma intrinsics tie the destination matrix registers to the input accumulator registers.
+// The content of the other 16-bit half is preserved from the input.
+def int_amdgcn_wmma_f16_16x16x16_f16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_bf16_16x16x16_bf16_tied : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
+
+// WMMA GFX11Plus
+
+def int_amdgcn_wmma_f32_16x16x16_f16 : AMDGPUWmmaIntrinsic<llvm_anyfloat_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf16 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_i32_16x16x16_iu8 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_wmma_i32_16x16x16_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+
+// GFX11: The OPSEL intrinsics read from and write to one half of the registers, selected by the op_sel bit.
+// The content of the other 16-bit half is undefined.
+// GFX12: The op_sel bit must be 0.
+def int_amdgcn_wmma_f16_16x16x16_f16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyfloat_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_bf16_16x16x16_bf16 : AMDGPUWmmaIntrinsicOPSEL<llvm_anyint_ty, llvm_anyint_ty>;
 
 //===----------------------------------------------------------------------===//
 // GFX12 Intrinsics
@@ -2681,6 +2687,65 @@ def int_amdgcn_permlanex16_var : ClangBuiltin<"__builtin_amdgcn_permlanex16_var"
   [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<3>>, ImmArg<ArgIndex<4>>, IntrNoCallback, IntrNoFree]>;
+
+// WMMA (Wave Matrix Multiply-Accumulate) intrinsics
+//
+// These operations perform a matrix multiplication and accumulation of
+// the form: D = A * B + C.
+
+// A and B are <8 x fp8> or <8 x bf8>, but since fp8 and bf8 are not supported by llvm we use <2 x i32>.
+def int_amdgcn_wmma_f32_16x16x16_fp8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_fp8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_fp8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+def int_amdgcn_wmma_f32_16x16x16_bf8_bf8 : AMDGPUWmmaIntrinsic<llvm_anyint_ty, llvm_anyfloat_ty>;
+// A and B are <16 x iu4>.
+def int_amdgcn_wmma_i32_16x16x32_iu4 : AMDGPUWmmaIntrinsicIU<llvm_anyint_ty, llvm_anyint_ty>;
+
+// SWMMAC (Wave Matrix (sparse) Multiply-Accumulate) intrinsics
+//
+// These operations perform a sparse matrix multiplication and accumulation of
+// the form: D = A * B + C.
+// A is a sparse matrix, half the size of B, and is expanded using the sparsity index.
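To make the fp8 comment above concrete: because LLVM has no fp8/bf8 element type, a row of eight fp8 values travels as <2 x i32>, and each overloaded type parameter of the classes defined below is mangled into the intrinsic name (D then A for WMMA; D, A, B, Index for SWMMAC), which is exactly the suffix visible in the autogenerated wave32 checks earlier in this patch. A small source-to-IR sketch — the kernel name is ours; the typedefs, builtin, and the quoted IR name are taken from those tests:

typedef int v2i __attribute__((ext_vector_type(2)));
typedef float v8f __attribute__((ext_vector_type(8)));

void fp8_wmma_example(global v8f* out, v2i a, v2i b, v8f c)
{
  // a and b each carry 8 fp8 values packed into <2 x i32>; per the wave32
  // checks, this call lowers to
  //   <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32(...)
  // i.e. D = v8f32 and A (with B matched to it) = v2i32 in the mangled name.
  *out = __builtin_amdgcn_wmma_f32_16x16x16_fp8_fp8_w32_gfx12(a, b, c);
}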
+class AMDGPUSWmmacIntrinsicIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
+  Intrinsic<
+    [CD],               // %D
+    [
+     A,                 // %A
+     B,                 // %B
+     LLVMMatchType<0>,  // %C
+     Index              // %Sparsity index for A
+    ],
+    [IntrNoMem, IntrConvergent, IntrWillReturn]
+>;
+
+class AMDGPUSWmmacIntrinsicIUIdx<LLVMType A, LLVMType B, LLVMType CD, LLVMType Index> :
+  Intrinsic<
+    [CD],               // %D
+    [
+     llvm_i1_ty,        // %A_sign
+     A,                 // %A
+     llvm_i1_ty,        // %B_sign
+     B,                 // %B
+     LLVMMatchType<0>,  // %C
+     Index,             // %Sparsity index for A
+     llvm_i1_ty,        // %clamp
+    ],
+    [IntrNoMem, IntrConvergent, IntrWillReturn, ImmArg<ArgIndex<0>>, ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<6>>]
+>;
+
+def int_amdgcn_swmmac_f32_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f16_16x16x32_f16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_bf16_16x16x32_bf16 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_i32_16x16x32_iu8 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_i32_16x16x32_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_i32_16x16x64_iu4 : AMDGPUSWmmacIntrinsicIUIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_fp8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_fp8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_bf8_fp8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+def int_amdgcn_swmmac_f32_16x16x32_bf8_bf8 : AMDGPUSWmmacIntrinsicIdx<llvm_anyint_ty, llvm_anyint_ty, llvm_anyfloat_ty, llvm_anyint_ty>;
+
 def int_amdgcn_global_atomic_ordered_add_b64 : AMDGPUAtomicRtn<llvm_i64_ty, global_ptr_ty>;
 
 def int_amdgcn_flat_atomic_fmin_num : AMDGPUAtomicRtn<llvm_anyfloat_ty>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index a19b03b929233..152f495a452ba 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -59,6 +59,30 @@
 def gi_wmmaopselvop3pmods :
     GIComplexOperandMatcher<s32, "selectWMMAOpSelVOP3PMods">,
     GIComplexPatternEquiv<WMMAOpSelVOP3PMods>;
 
+def gi_wmmavisrc :
+    GIComplexOperandMatcher<s32, "selectWMMAVISrc">,
+    GIComplexPatternEquiv<WMMAVISrc>;
+
+def gi_wmmamods :
+    GIComplexOperandMatcher<s32, "selectWMMAModsF32NegAbs">,
+    GIComplexPatternEquiv<WMMAModsF32NegAbs>;
+
+def gi_wmmamodsf16Neg :
+    GIComplexOperandMatcher<s32, "selectWMMAModsF16Neg">,
+    GIComplexPatternEquiv<WMMAModsF16Neg>;
+
+def gi_wmmamodsf16NegAbs :
+    GIComplexOperandMatcher<s32, "selectWMMAModsF16NegAbs">,
+    GIComplexPatternEquiv<WMMAModsF16NegAbs>;
+
+def gi_swmmacindex8 :
+    GIComplexOperandMatcher<s32, "selectSWMMACIndex8">,
+    GIComplexPatternEquiv<SWMMACIndex8>;
+
+def gi_swmmacindex16 :
+    GIComplexOperandMatcher<s32, "selectSWMMACIndex16">,
+    GIComplexPatternEquiv<SWMMACIndex16>;
+
 def gi_vop3opselmods :
     GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
     GIComplexPatternEquiv<VOP3OpSelMods>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 4c35649cec6c8..4f7bf3f7d35e7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3048,6 +3048,336 @@ bool AMDGPUDAGToDAGISel::SelectWMMAOpSelVOP3PMods(SDValue In,
   return true;
 }
 
+static MachineSDNode *buildRegSequence32(SmallVectorImpl<SDValue> &Elts,
+                                         llvm::SelectionDAG *CurDAG,
+                                         const SDLoc &DL) {
+  unsigned DstRegClass;
+  EVT DstTy;
+  switch (Elts.size()) {
+  case 8:
+    DstRegClass = AMDGPU::VReg_256RegClassID;
+    DstTy = MVT::v8i32;
+    break;
+  case 4:
+    DstRegClass = AMDGPU::VReg_128RegClassID;
+    DstTy = MVT::v4i32;
+    break;
+  case 2:
+    DstRegClass = AMDGPU::VReg_64RegClassID;
+    DstTy = MVT::v2i32;
+    break;
+  default:
+    llvm_unreachable("unhandled Reg sequence size");
+  }
+
+  SmallVector<SDValue> Ops;
+  Ops.push_back(CurDAG->getTargetConstant(DstRegClass, DL, MVT::i32));
+  for (unsigned i = 0; i < Elts.size(); ++i) {
+    Ops.push_back(Elts[i]);
+    Ops.push_back(CurDAG->getTargetConstant(
+        SIRegisterInfo::getSubRegFromChannel(i), DL, MVT::i32));
+  }
+  return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, DstTy, Ops);
+}
+
+static MachineSDNode *buildRegSequence16(SmallVectorImpl<SDValue> &Elts,
+                                         llvm::SelectionDAG *CurDAG,
*CurDAG, + const SDLoc &DL) { + SmallVector PackedElts; + assert("unhandled Reg sequence size" && + (Elts.size() == 8 || Elts.size() == 16)); + + // Pack 16-bit elements in pairs into 32-bit register. If both elements are + // unpacked from 32-bit source use it, otherwise pack them using v_perm. + for (unsigned i = 0; i < Elts.size(); i += 2) { + SDValue LoSrc = stripExtractLoElt(stripBitcast(Elts[i])); + SDValue HiSrc; + if (isExtractHiElt(Elts[i + 1], HiSrc) && LoSrc == HiSrc) { + PackedElts.push_back(HiSrc); + } else { + SDValue PackLoLo = CurDAG->getTargetConstant(0x05040100, DL, MVT::i32); + MachineSDNode *Packed = + CurDAG->getMachineNode(AMDGPU::V_PERM_B32_e64, DL, MVT::i32, + {Elts[i + 1], Elts[i], PackLoLo}); + PackedElts.push_back(SDValue(Packed, 0)); + } + } + + return buildRegSequence32(PackedElts, CurDAG, DL); +} + +static MachineSDNode *buildRegSequence(SmallVectorImpl &Elts, + llvm::SelectionDAG *CurDAG, + const SDLoc &DL, unsigned ElementSize) { + if (ElementSize == 16) + return buildRegSequence16(Elts, CurDAG, DL); + if (ElementSize == 32) + return buildRegSequence32(Elts, CurDAG, DL); + llvm_unreachable("Unhandled element size"); +} + +static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, + SmallVectorImpl &Elts, SDValue &Src, + llvm::SelectionDAG *CurDAG, const SDLoc &DL, + unsigned ElementSize) { + if (ModOpcode == ISD::FNEG) { + Mods |= SISrcMods::NEG; + // Check if all elements also have abs modifier + SmallVector NegAbsElts; + for (auto El : Elts) { + if (El.getOpcode() != ISD::FABS) + break; + NegAbsElts.push_back(El->getOperand(0)); + } + if (Elts.size() != NegAbsElts.size()) { + // Neg + Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0); + } else { + // Neg and Abs + Mods |= SISrcMods::NEG_HI; + Src = SDValue(buildRegSequence(NegAbsElts, CurDAG, DL, ElementSize), 0); + } + } else { + assert(ModOpcode == ISD::FABS); + // Abs + Mods |= SISrcMods::NEG_HI; + Src = SDValue(buildRegSequence(Elts, CurDAG, DL, ElementSize), 0); + } +} + +// Check all f16 elements for modifiers while looking through b32 and v2b16 +// build vector, stop if element does not satisfy ModifierCheck. 
+static void +checkWMMAElementsModifiersF16(BuildVectorSDNode *BV, + std::function ModifierCheck) { + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + if (auto *F16Pair = + dyn_cast(stripBitcast(BV->getOperand(i)))) { + for (unsigned i = 0; i < F16Pair->getNumOperands(); ++i) { + SDValue ElF16 = stripBitcast(F16Pair->getOperand(i)); + if (!ModifierCheck(ElF16)) + break; + } + } + } +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF16Neg(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + + // mods are on f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsF16; + + checkWMMAElementsModifiersF16(BV, [&](SDValue Element) -> bool { + if (Element.getOpcode() != ISD::FNEG) + return false; + EltsF16.push_back(Element.getOperand(0)); + return true; + }); + + // All elements have neg modifier + if (BV->getNumOperands() * 2 == EltsF16.size()) { + Src = SDValue(buildRegSequence16(EltsF16, CurDAG, SDLoc(In)), 0); + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + } + } + + // mods are on v2f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsV2F16; + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElV2f16 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (ElV2f16.getOpcode() != ISD::FNEG) + break; + EltsV2F16.push_back(ElV2f16.getOperand(0)); + } + + // All pairs of elements have neg modifier + if (BV->getNumOperands() == EltsV2F16.size()) { + Src = SDValue(buildRegSequence32(EltsV2F16, CurDAG, SDLoc(In)), 0); + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + } + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + + // mods are on f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsF16; + checkWMMAElementsModifiersF16(BV, [&](SDValue ElF16) -> bool { + // Based on first element decide which mod we match, neg or abs + if (EltsF16.empty()) + ModOpcode = (ElF16.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElF16.getOpcode() != ModOpcode) + return false; + EltsF16.push_back(ElF16.getOperand(0)); + return true; + }); + + // All elements have ModOpcode modifier + if (BV->getNumOperands() * 2 == EltsF16.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF16, Src, CurDAG, SDLoc(In), + 16); + } + + // mods are on v2f16 elements + if (auto *BV = dyn_cast(stripBitcast(In))) { + SmallVector EltsV2F16; + + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElV2f16 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsV2F16.empty()) + ModOpcode = (ElV2f16.getOpcode() == ISD::FNEG) ? 
ISD::FNEG : ISD::FABS; + if (ElV2f16->getOpcode() != ModOpcode) + break; + EltsV2F16.push_back(ElV2f16->getOperand(0)); + } + + // All elements have ModOpcode modifier + if (BV->getNumOperands() == EltsV2F16.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, CurDAG, SDLoc(In), + 32); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const { + Src = In; + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsF32; + + if (auto *BV = dyn_cast(stripBitcast(In))) { + for (unsigned i = 0; i < BV->getNumOperands(); ++i) { + SDValue ElF32 = stripBitcast(BV->getOperand(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsF32.empty()) + ModOpcode = (ElF32.getOpcode() == ISD::FNEG) ? ISD::FNEG : ISD::FABS; + if (ElF32.getOpcode() != ModOpcode) + break; + EltsF32.push_back(ElF32.getOperand(0)); + } + + // All elements had ModOpcode modifier + if (BV->getNumOperands() == EltsF32.size()) + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, CurDAG, SDLoc(In), + 32); + } + + SrcMods = CurDAG->getTargetConstant(Mods, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectWMMAVISrc(SDValue In, SDValue &Src) const { + if (auto *BV = dyn_cast(In)) { + BitVector UndefElements; + if (SDValue Splat = BV->getSplatValue(&UndefElements)) + if (isInlineImmediate(Splat.getNode())) { + if (const ConstantSDNode *C = dyn_cast(Splat)) { + unsigned Imm = C->getAPIntValue().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + if (const ConstantFPSDNode *C = dyn_cast(Splat)) { + unsigned Imm = C->getValueAPF().bitcastToAPInt().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + llvm_unreachable("unhandled Constant node"); + } + } + + // 16 bit splat + SDValue SplatSrc32 = stripBitcast(In); + if (auto *SplatSrc32BV = dyn_cast(SplatSrc32)) { + if (SDValue Splat32 = SplatSrc32BV->getSplatValue()) { + SDValue SplatSrc16 = stripBitcast(Splat32); + if (auto *SplatSrc16BV = dyn_cast(SplatSrc16)) { + if (SDValue Splat = SplatSrc16BV->getSplatValue()) { + + // f16 + if (isInlineImmediate(Splat.getNode())) { + const ConstantFPSDNode *C = dyn_cast(Splat); + int64_t Imm = C->getValueAPF().bitcastToAPInt().getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i16); + return true; + } + + // bf16 + if (const ConstantSDNode *C = dyn_cast(Splat)) { + const SIInstrInfo *TII = Subtarget->getInstrInfo(); + APInt BF16Value = C->getAPIntValue(); + APInt F32Value = BF16Value.zext(32).shl(16); + if (TII->isInlineConstant(F32Value)) { + int64_t Imm = F32Value.getSExtValue(); + Src = CurDAG->getTargetConstant(Imm, SDLoc(In), MVT::i32); + return true; + } + } + } + } + } + } + + return false; +} + +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex8(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + unsigned Key = 0; + Src = In; + + if (In.getOpcode() == ISD::SRL) { + const llvm::SDValue &ShiftSrc = In.getOperand(0); + ConstantSDNode *ShiftAmt = dyn_cast(In.getOperand(1)); + if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt && + ShiftAmt->getZExtValue() % 8 == 0) { + Key = ShiftAmt->getZExtValue() / 8; + Src = ShiftSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + +bool AMDGPUDAGToDAGISel::SelectSWMMACIndex16(SDValue In, SDValue &Src, + SDValue &IndexKey) const { + 
unsigned Key = 0; + Src = In; + + if (In.getOpcode() == ISD::SRL) { + const llvm::SDValue &ShiftSrc = In.getOperand(0); + ConstantSDNode *ShiftAmt = dyn_cast(In.getOperand(1)); + if (ShiftSrc.getValueType().getSizeInBits() == 32 && ShiftAmt && + ShiftAmt->getZExtValue() == 16) { + Key = 1; + Src = ShiftSrc; + } + } + + IndexKey = CurDAG->getTargetConstant(Key, SDLoc(In), MVT::i32); + return true; +} + bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const { Src = In; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h index 8645490f0b16f..3b42d88df0c24 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.h @@ -240,6 +240,16 @@ class AMDGPUDAGToDAGISel : public SelectionDAGISel { bool SelectVOP3PModsNeg(SDValue In, SDValue &Src) const; bool SelectWMMAOpSelVOP3PMods(SDValue In, SDValue &Src) const; + bool SelectWMMAModsF32NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectWMMAModsF16Neg(SDValue In, SDValue &Src, SDValue &SrcMods) const; + bool SelectWMMAModsF16NegAbs(SDValue In, SDValue &Src, + SDValue &SrcMods) const; + bool SelectWMMAVISrc(SDValue In, SDValue &Src) const; + + bool SelectSWMMACIndex8(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectSWMMACIndex16(SDValue In, SDValue &Src, SDValue &IndexKey) const; + bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const; bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index fdee74d58d269..f255d098b631c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -3956,6 +3956,219 @@ AMDGPUInstructionSelector::selectWMMAOpSelVOP3PMods( }}; } +static Register buildRegSequence(SmallVectorImpl &Elts, + MachineInstr *InsertPt, + MachineRegisterInfo &MRI) { + const TargetRegisterClass *DstRegClass; + switch (Elts.size()) { + case 8: + DstRegClass = &AMDGPU::VReg_256RegClass; + break; + case 4: + DstRegClass = &AMDGPU::VReg_128RegClass; + break; + case 2: + DstRegClass = &AMDGPU::VReg_64RegClass; + break; + default: + llvm_unreachable("unhandled Reg sequence size"); + } + + MachineIRBuilder B(*InsertPt); + auto MIB = B.buildInstr(AMDGPU::REG_SEQUENCE) + .addDef(MRI.createVirtualRegister(DstRegClass)); + for (unsigned i = 0; i < Elts.size(); ++i) { + MIB.addReg(Elts[i]); + MIB.addImm(SIRegisterInfo::getSubRegFromChannel(i)); + } + return MIB->getOperand(0).getReg(); +} + +static void selectWMMAModsNegAbs(unsigned ModOpcode, unsigned &Mods, + SmallVectorImpl &Elts, Register &Src, + MachineInstr *InsertPt, + MachineRegisterInfo &MRI) { + if (ModOpcode == TargetOpcode::G_FNEG) { + Mods |= SISrcMods::NEG; + // Check if all elements also have abs modifier + SmallVector NegAbsElts; + for (auto El : Elts) { + Register FabsSrc; + if (!mi_match(El, MRI, m_GFabs(m_Reg(FabsSrc)))) + break; + NegAbsElts.push_back(FabsSrc); + } + if (Elts.size() != NegAbsElts.size()) { + // Neg + Src = buildRegSequence(Elts, InsertPt, MRI); + } else { + // Neg and Abs + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(NegAbsElts, InsertPt, MRI); + } + } else { + assert(ModOpcode == TargetOpcode::G_FABS); + // Abs + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(Elts, InsertPt, MRI); + } +} + +InstructionSelector::ComplexRendererFns 
+AMDGPUInstructionSelector::selectWMMAModsF32NegAbs(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsF32; + + if (GBuildVector *BV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < BV->getNumSources(); ++i) { + MachineInstr *ElF32 = MRI->getVRegDef(BV->getSourceReg(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsF32.empty()) + ModOpcode = (ElF32->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG + : AMDGPU::G_FABS; + if (ElF32->getOpcode() != ModOpcode) + break; + EltsF32.push_back(ElF32->getOperand(1).getReg()); + } + + // All elements had ModOpcode modifier + if (BV->getNumSources() == EltsF32.size()) { + selectWMMAModsNegAbs(ModOpcode, Mods, EltsF32, Src, Root.getParent(), + *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF16Neg(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + SmallVector EltsV2F16; + + if (GConcatVectors *CV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < CV->getNumSources(); ++i) { + Register FNegSrc; + if (!mi_match(CV->getSourceReg(i), *MRI, m_GFNeg(m_Reg(FNegSrc)))) + break; + EltsV2F16.push_back(FNegSrc); + } + + // All elements had ModOpcode modifier + if (CV->getNumSources() == EltsV2F16.size()) { + Mods |= SISrcMods::NEG; + Mods |= SISrcMods::NEG_HI; + Src = buildRegSequence(EltsV2F16, Root.getParent(), *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAModsF16NegAbs(MachineOperand &Root) const { + Register Src = Root.getReg(); + unsigned Mods = SISrcMods::OP_SEL_1; + unsigned ModOpcode; + SmallVector EltsV2F16; + + if (GConcatVectors *CV = dyn_cast(MRI->getVRegDef(Src))) { + for (unsigned i = 0; i < CV->getNumSources(); ++i) { + MachineInstr *ElV2F16 = MRI->getVRegDef(CV->getSourceReg(i)); + // Based on first element decide which mod we match, neg or abs + if (EltsV2F16.empty()) + ModOpcode = (ElV2F16->getOpcode() == AMDGPU::G_FNEG) ? AMDGPU::G_FNEG + : AMDGPU::G_FABS; + if (ElV2F16->getOpcode() != ModOpcode) + break; + EltsV2F16.push_back(ElV2F16->getOperand(1).getReg()); + } + + // All elements had ModOpcode modifier + if (CV->getNumSources() == EltsV2F16.size()) { + MachineIRBuilder B(*Root.getParent()); + selectWMMAModsNegAbs(ModOpcode, Mods, EltsV2F16, Src, Root.getParent(), + *MRI); + } + } + + return {{[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); }}}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectWMMAVISrc(MachineOperand &Root) const { + std::optional FPValReg; + if (mi_match(Root.getReg(), *MRI, m_GFCstOrSplat(FPValReg))) { + if (TII.isInlineConstant(FPValReg->Value.bitcastToAPInt())) { + return {{[=](MachineInstrBuilder &MIB) { + MIB.addImm(FPValReg->Value.bitcastToAPInt().getSExtValue()); + }}}; + } + // Non-inlineable splat floats should not fall-through for integer immediate + // checks. 
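+  // For example, a splat of 2.0f (0x40000000) is an inline FP constant and is
+  // rendered above, while a non-inlineable splat such as 1234.5f must stop
+  // here: letting it reach the integer-splat check below could wrongly treat
+  // its bit pattern as an inline integer immediate.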
+ return {}; + } + + APInt ICst; + if (mi_match(Root.getReg(), *MRI, m_ICstOrSplat(ICst))) { + if (TII.isInlineConstant(ICst)) { + return { + {[=](MachineInstrBuilder &MIB) { MIB.addImm(ICst.getSExtValue()); }}}; + } + } + + return {}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex8(MachineOperand &Root) const { + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && + MRI->getType(ShiftSrc).getSizeInBits() == 32 && + ShiftAmt->Value.getZExtValue() % 8 == 0) { + Key = ShiftAmt->Value.getZExtValue() / 8; + Src = ShiftSrc; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + +InstructionSelector::ComplexRendererFns +AMDGPUInstructionSelector::selectSWMMACIndex16(MachineOperand &Root) const { + + Register Src = + getDefIgnoringCopies(Root.getReg(), *MRI)->getOperand(0).getReg(); + unsigned Key = 0; + + Register ShiftSrc; + std::optional ShiftAmt; + if (mi_match(Src, *MRI, m_GLShr(m_Reg(ShiftSrc), m_GCst(ShiftAmt))) && + MRI->getType(ShiftSrc).getSizeInBits() == 32 && + ShiftAmt->Value.getZExtValue() == 16) { + Src = ShiftSrc; + Key = 1; + } + + return {{ + [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); }, + [=](MachineInstrBuilder &MIB) { MIB.addImm(Key); } // index_key + }}; +} + InstructionSelector::ComplexRendererFns AMDGPUInstructionSelector::selectVOP3OpSelMods(MachineOperand &Root) const { Register Src; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h index 12ea46c2895b0..ef7630f137aca 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h @@ -199,6 +199,19 @@ class AMDGPUInstructionSelector final : public InstructionSelector { InstructionSelector::ComplexRendererFns selectWMMAOpSelVOP3PMods(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF32NegAbs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF16Neg(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAModsF16NegAbs(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectWMMAVISrc(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex8(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns + selectSWMMACIndex16(MachineOperand &Root) const; + InstructionSelector::ComplexRendererFns selectVOP3OpSelMods(MachineOperand &Root) const; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 8e74d4c0e9459..32921bb248caf 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -7134,6 +7134,29 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper, return legalizeDSAtomicFPIntrinsic(Helper, MI, IntrID); case Intrinsic::amdgcn_image_bvh_intersect_ray: return legalizeBVHIntrinsic(MI, B); + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + 
case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { + Register Index = MI.getOperand(5).getReg(); + LLT S32 = LLT::scalar(32); + if (MRI.getType(Index) != S32) + MI.getOperand(5).setReg(B.buildAnyExt(S32, Index).getReg(0)); + return true; + } + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { + Register Index = MI.getOperand(7).getReg(); + LLT S32 = LLT::scalar(32); + if (MRI.getType(Index) != S32) + MI.getOperand(7).setReg(B.buildAnyExt(S32, Index).getReg(0)); + return true; + } case Intrinsic::amdgcn_fmed3: { GISelChangeObserver &Observer = Helper.Observer; diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp index bdd4e891f1589..09fac963d222d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp @@ -4505,6 +4505,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case Intrinsic::amdgcn_wmma_f32_16x16x16_f16: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu4: case Intrinsic::amdgcn_wmma_i32_16x16x16_iu8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_fp8_bf8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_fp8: + case Intrinsic::amdgcn_wmma_f32_16x16x16_bf8_bf8: + case Intrinsic::amdgcn_wmma_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: return getDefaultMappingVOP(MI); case Intrinsic::amdgcn_log: case Intrinsic::amdgcn_exp2: diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td index 67263f23b9831..bb1c6b7337299 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td @@ -414,6 +414,22 @@ def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; +def : SourceOfDivergence; def : SourceOfDivergence; // The dummy boolean output is divergent from the IR's perspective, diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp index 6b1d04b04066f..9ab657f4e7bb4 100644 --- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp +++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp @@ -151,6 +151,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { ImmTyOpSelHi, ImmTyNegLo, ImmTyNegHi, + ImmTyIndexKey8bit, + ImmTyIndexKey16bit, ImmTyDPP8, ImmTyDppCtrl, ImmTyDppRowMask, @@ -383,6 +385,8 @@ 
class AMDGPUOperand : public MCParsedAsmOperand { bool isGDS() const { return isImmTy(ImmTyGDS); } bool isLDS() const { return isImmTy(ImmTyLDS); } bool isCPol() const { return isImmTy(ImmTyCPol); } + bool isIndexKey8bit() const { return isImmTy(ImmTyIndexKey8bit); } + bool isIndexKey16bit() const { return isImmTy(ImmTyIndexKey16bit); } bool isTFE() const { return isImmTy(ImmTyTFE); } bool isFORMAT() const { return isImmTy(ImmTyFORMAT) && isUInt<7>(getImm()); } bool isDppBankMask() const { return isImmTy(ImmTyDppBankMask); } @@ -656,6 +660,14 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isVISrcF16() || isVISrcB32(); } + bool isVISrc_64F16() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::f16); + } + + bool isVISrc_64B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); + } + bool isVISrc_64B64() const { return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i64); } @@ -672,6 +684,14 @@ class AMDGPUOperand : public MCParsedAsmOperand { return isRegOrInlineNoMods(AMDGPU::VReg_64RegClassID, MVT::i32); } + bool isVISrc_256B32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i32); + } + + bool isVISrc_256F32() const { + return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::f32); + } + bool isVISrc_256B64() const { return isRegOrInlineNoMods(AMDGPU::VReg_256RegClassID, MVT::i64); } @@ -1047,6 +1067,8 @@ class AMDGPUOperand : public MCParsedAsmOperand { case ImmTyOffset1: OS << "Offset1"; break; case ImmTySMEMOffsetMod: OS << "SMEMOffsetMod"; break; case ImmTyCPol: OS << "CPol"; break; + case ImmTyIndexKey8bit: OS << "index_key"; break; + case ImmTyIndexKey16bit: OS << "index_key"; break; case ImmTyTFE: OS << "TFE"; break; case ImmTyD16: OS << "D16"; break; case ImmTyFORMAT: OS << "FORMAT"; break; @@ -1604,6 +1626,11 @@ class AMDGPUAsmParser : public MCTargetAsmParser { ParseStatus parseRegWithFPInputMods(OperandVector &Operands); ParseStatus parseRegWithIntInputMods(OperandVector &Operands); ParseStatus parseVReg32OrOff(OperandVector &Operands); + ParseStatus tryParseIndexKey(OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy); + ParseStatus parseIndexKey8bit(OperandVector &Operands); + ParseStatus parseIndexKey16bit(OperandVector &Operands); + ParseStatus parseDfmtNfmt(int64_t &Format); ParseStatus parseUfmt(int64_t &Format); ParseStatus parseSymbolicSplitFormat(StringRef FormatStr, SMLoc Loc, @@ -1784,6 +1811,8 @@ class AMDGPUAsmParser : public MCTargetAsmParser { void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands); void cvtVOP3(MCInst &Inst, const OperandVector &Operands); void cvtVOP3P(MCInst &Inst, const OperandVector &Operands); + void cvtSWMMAC(MCInst &Inst, const OperandVector &Operands); + void cvtVOPD(MCInst &Inst, const OperandVector &Operands); void cvtVOP3OpSel(MCInst &Inst, const OperandVector &Operands, OptionalImmIndexMap &OptionalIdx); @@ -4367,7 +4396,11 @@ bool AMDGPUAsmParser::validateNeg(const MCInst &Inst, int OpName) { uint64_t TSFlags = MII.get(Opc).TSFlags; // v_dot4 fp8/bf8 neg_lo/neg_hi not allowed on src0 and src1 (allowed on src2) - if (!(TSFlags & SIInstrFlags::IsDOT)) + // v_wmma iu4/iu8 neg_lo not allowed on src2 (allowed on src0, src1) + // v_swmmac f16/bf16 neg_lo/neg_hi not allowed on src2 (allowed on src0, src1) + // other wmma/swmmac instructions don't have neg_lo/neg_hi operand. 
+ if (!(TSFlags & SIInstrFlags::IsDOT) && !(TSFlags & SIInstrFlags::IsWMMA) && + !(TSFlags & SIInstrFlags::IsSWMMAC)) return true; int NegIdx = AMDGPU::getNamedOperandIdx(Opc, OpName); @@ -6468,6 +6501,33 @@ bool AMDGPUAsmParser::tryParseFmt(const char *Pref, return true; } +ParseStatus AMDGPUAsmParser::tryParseIndexKey(OperandVector &Operands, + AMDGPUOperand::ImmTy ImmTy) { + const char *Pref = "index_key"; + int64_t ImmVal = 0; + SMLoc Loc = getLoc(); + auto Res = parseIntWithPrefix(Pref, ImmVal); + if (!Res.isSuccess()) + return Res; + + if (ImmTy == AMDGPUOperand::ImmTyIndexKey16bit && (ImmVal < 0 || ImmVal > 1)) + return Error(Loc, Twine("out of range ", StringRef(Pref))); + + if (ImmTy == AMDGPUOperand::ImmTyIndexKey8bit && (ImmVal < 0 || ImmVal > 3)) + return Error(Loc, Twine("out of range ", StringRef(Pref))); + + Operands.push_back(AMDGPUOperand::CreateImm(this, ImmVal, Loc, ImmTy)); + return ParseStatus::Success; +} + +ParseStatus AMDGPUAsmParser::parseIndexKey8bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey8bit); +} + +ParseStatus AMDGPUAsmParser::parseIndexKey16bit(OperandVector &Operands) { + return tryParseIndexKey(Operands, AMDGPUOperand::ImmTyIndexKey16bit); +} + // dfmt and nfmt (in a tbuffer instruction) are parsed as one to allow their // values to live in a joint format operand in the MCInst encoding. ParseStatus AMDGPUAsmParser::parseDfmtNfmt(int64_t &Format) { @@ -8340,10 +8400,12 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, } int NegLoIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_lo); - if (NegLoIdx != -1) { + if (NegLoIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegLo); + + int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + if (NegHiIdx != -1) addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyNegHi); - } const int Ops[] = { AMDGPU::OpName::src0, AMDGPU::OpName::src1, @@ -8363,11 +8425,11 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands, if (OpSelHiIdx != -1) OpSelHi = Inst.getOperand(OpSelHiIdx).getImm(); - if (NegLoIdx != -1) { - int NegHiIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::neg_hi); + if (NegLoIdx != -1) NegLo = Inst.getOperand(NegLoIdx).getImm(); + + if (NegHiIdx != -1) NegHi = Inst.getOperand(NegHiIdx).getImm(); - } for (int J = 0; J < 3; ++J) { int OpIdx = AMDGPU::getNamedOperandIdx(Opc, Ops[J]); @@ -8403,6 +8465,43 @@ void AMDGPUAsmParser::cvtVOP3P(MCInst &Inst, const OperandVector &Operands) { cvtVOP3P(Inst, Operands, OptIdx); } +static void addSrcModifiersAndSrc(MCInst &Inst, const OperandVector &Operands, + unsigned i, unsigned Opc, unsigned OpName) { + if (AMDGPU::getNamedOperandIdx(Opc, OpName) != -1) + ((AMDGPUOperand &)*Operands[i]).addRegOrImmWithFPInputModsOperands(Inst, 2); + else + ((AMDGPUOperand &)*Operands[i]).addRegOperands(Inst, 1); +} + +void AMDGPUAsmParser::cvtSWMMAC(MCInst &Inst, const OperandVector &Operands) { + unsigned Opc = Inst.getOpcode(); + + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); + addSrcModifiersAndSrc(Inst, Operands, 2, Opc, AMDGPU::OpName::src0_modifiers); + addSrcModifiersAndSrc(Inst, Operands, 3, Opc, AMDGPU::OpName::src1_modifiers); + ((AMDGPUOperand &)*Operands[1]).addRegOperands(Inst, 1); // srcTiedDef + ((AMDGPUOperand &)*Operands[4]).addRegOperands(Inst, 1); // src2 + + OptionalImmIndexMap OptIdx; + for (unsigned i = 5; i < Operands.size(); ++i) { + AMDGPUOperand &Op = ((AMDGPUOperand 
&)*Operands[i]); + OptIdx[Op.getImmTy()] = i; + } + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_8bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey8bit); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::index_key_16bit)) + addOptionalImmOperand(Inst, Operands, OptIdx, + AMDGPUOperand::ImmTyIndexKey16bit); + + if (AMDGPU::hasNamedOperand(Opc, AMDGPU::OpName::clamp)) + addOptionalImmOperand(Inst, Operands, OptIdx, AMDGPUOperand::ImmTyClampSI); + + cvtVOP3P(Inst, Operands, OptIdx); +} + //===----------------------------------------------------------------------===// // VOPD //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index 2c6aa2ee348a1..a9968cfe25b46 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -260,8 +260,12 @@ DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_32, OPW32, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 64) DECODE_OPERAND_SRC_REG_OR_IMM_9(VS_64, OPW64, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_64, OPW64, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 32) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_128, OPW128, 16) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 64) +DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_256, OPW256, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_512, OPW512, 32) DECODE_OPERAND_SRC_REG_OR_IMM_9(VReg_1024, OPW1024, 32) @@ -704,6 +708,10 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size, break; Res = tryDecodeInst(DecoderTableWMMAGFX1164, MI, QW, Address, CS); + if (Res) + break; + + Res = tryDecodeInst(DecoderTableWMMAGFX1264, MI, QW, Address, CS); } while (false); if (Res && AMDGPU::isMAC(MI.getOpcode())) { diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp index 102ecb5e7680e..1c013cc25b296 100644 --- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp +++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp @@ -1716,14 +1716,14 @@ bool GCNHazardRecognizer::fixVALUTransUseHazard(MachineInstr *MI) { } bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { - if (!SIInstrInfo::isWMMA(*MI)) + if (!SIInstrInfo::isWMMA(*MI) && !SIInstrInfo::isSWMMAC(*MI)) return false; const SIInstrInfo *TII = ST.getInstrInfo(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); - auto IsHazardFn = [MI, TII, TRI](const MachineInstr &I) { - if (!SIInstrInfo::isWMMA(I)) + auto IsHazardFn = [MI, TII, TRI, this](const MachineInstr &I) { + if (!SIInstrInfo::isWMMA(I) && !SIInstrInfo::isSWMMAC(I)) return false; // Src0(matrix A) or Src1(matrix B) of the current wmma instruction overlaps @@ -1741,6 +1741,18 @@ bool GCNHazardRecognizer::fixWMMAHazards(MachineInstr *MI) { return true; } + // GFX12+ allows overlap of matrix C with PrevDstReg (hardware will stall) + // but Index can't overlap with PrevDstReg. 
+ if (AMDGPU::isGFX12Plus(ST)) { + if (SIInstrInfo::isSWMMAC(*MI)) { + const Register CurIndex = + TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->getReg(); + if (TRI->regsOverlap(PrevDstReg, CurIndex)) + return true; + } + return false; + } + return false; }; diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp index 9e64e3fd79576..abfa4a3531e8e 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp @@ -1275,6 +1275,23 @@ void AMDGPUInstPrinter::printPackedModifier(const MCInst *MI, (ModIdx != -1) ? MI->getOperand(ModIdx).getImm() : DefaultValue; } + // Print three values of neg/opsel for wmma instructions (prints 0 when there + // is no src_modifier operand instead of not printing anything). + if (MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsSWMMAC || + MII.get(MI->getOpcode()).TSFlags & SIInstrFlags::IsWMMA) { + NumOps = 0; + int DefaultValue = Mod == SISrcMods::OP_SEL_1; + for (int OpName : + {AMDGPU::OpName::src0_modifiers, AMDGPU::OpName::src1_modifiers, + AMDGPU::OpName::src2_modifiers}) { + int Idx = AMDGPU::getNamedOperandIdx(Opc, OpName); + if (Idx != -1) + Ops[NumOps++] = MI->getOperand(Idx).getImm(); + else + Ops[NumOps++] = DefaultValue; + } + } + const bool HasDstSel = NumOps > 0 && Mod == SISrcMods::OP_SEL_0 && @@ -1346,6 +1363,26 @@ void AMDGPUInstPrinter::printNegHi(const MCInst *MI, unsigned OpNo, printPackedModifier(MI, " neg_hi:[", SISrcMods::NEG_HI, O); } +void AMDGPUInstPrinter::printIndexKey8bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + +void AMDGPUInstPrinter::printIndexKey16bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, + raw_ostream &O) { + auto Imm = MI->getOperand(OpNo).getImm() & 0x7; + if (Imm == 0) + return; + + O << " index_key:" << Imm; +} + void AMDGPUInstPrinter::printInterpSlot(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h index e3958f88277da..e91ff86b219a0 100644 --- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h +++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h @@ -139,6 +139,10 @@ class AMDGPUInstPrinter : public MCInstPrinter { const MCSubtargetInfo &STI, raw_ostream &O); void printNegHi(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey8bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); + void printIndexKey16bit(const MCInst *MI, unsigned OpNo, + const MCSubtargetInfo &STI, raw_ostream &O); void printInterpSlot(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI, raw_ostream &O); void printInterpAttr(const MCInst *MI, unsigned OpNo, diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h index 8ab66d4fd5b86..19596d53b4532 100644 --- a/llvm/lib/Target/AMDGPU/SIDefines.h +++ b/llvm/lib/Target/AMDGPU/SIDefines.h @@ -167,6 +167,9 @@ enum : uint64_t { // ds_gws_* instructions. GWS = UINT64_C(1) << 62, + + // Is a SWMMAC instruction. + IsSWMMAC = UINT64_C(1) << 63, }; // v_cmp_class_* etc. use a 10-bit mask for what operation is checked. 
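The hazard-recognizer hunk above calls SIInstrInfo::isSWMMAC, and the SIFoldOperands hunk below tests the same bit directly. As a minimal sketch of what such a TSFlags predicate looks like, modeled on the existing isWMMA-style helpers rather than copied from this patch:

    // Illustrative only: a TSFlags-based predicate in the style of the
    // existing SIInstrInfo helpers.
    static bool isSWMMAC(const MachineInstr &MI) {
      return MI.getDesc().TSFlags & SIInstrFlags::IsSWMMAC;
    }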
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp index 2862a7787e75a..a812cdc61500c 100644 --- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp +++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp @@ -208,6 +208,7 @@ bool SIFoldOperands::canUseImmWithOpSel(FoldCandidate &Fold) const { assert(Old.isReg() && Fold.isImm()); if (!(TSFlags & SIInstrFlags::IsPacked) || (TSFlags & SIInstrFlags::IsMAI) || + (TSFlags & SIInstrFlags::IsWMMA) || (TSFlags & SIInstrFlags::IsSWMMAC) || (ST->hasDOTOpSelHazard() && (TSFlags & SIInstrFlags::IsDOT))) return false; diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index cf947dccafac5..d35b76c8ad54e 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8242,6 +8242,36 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SIInstrInfo::MO_ABS32_LO); return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; } + case Intrinsic::amdgcn_swmmac_f16_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_bf16_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_f16: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_fp8_bf8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_fp8: + case Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8: { + if (Op.getOperand(4).getValueType() == MVT::i32) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(4), SL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), IndexKeyi32); + } + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu4: + case Intrinsic::amdgcn_swmmac_i32_16x16x32_iu8: + case Intrinsic::amdgcn_swmmac_i32_16x16x64_iu4: { + if (Op.getOperand(6).getValueType() == MVT::i32) + return SDValue(); + + SDLoc SL(Op); + auto IndexKeyi32 = DAG.getAnyExtOrTrunc(Op.getOperand(6), SL, MVT::i32); + return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, SL, Op.getValueType(), + {Op.getOperand(0), Op.getOperand(1), Op.getOperand(2), + Op.getOperand(3), Op.getOperand(4), Op.getOperand(5), + IndexKeyi32, Op.getOperand(7)}); + } default: if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr = AMDGPU::getImageDimIntrinsicInfo(IntrinsicID)) diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td index 1b66d163714fb..ab536f8f49d53 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td +++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td @@ -161,6 +161,9 @@ class InstSI ; def neg_lo0 : ArrayOperand0<"neg_lo", "NegLo">; def neg_hi0 : ArrayOperand0<"neg_hi", "NegHi">; +def IndexKey16bit : CustomOperand; +def IndexKey8bit : CustomOperand; + def dpp8 : CustomOperand; def dpp_ctrl : CustomOperand; @@ -1344,6 +1347,13 @@ def VOP3PModsDOT : ComplexPattern; def VOP3PModsNeg : ComplexPattern; def WMMAOpSelVOP3PMods : ComplexPattern; +def WMMAModsF32NegAbs : ComplexPattern; +def WMMAModsF16Neg : ComplexPattern; +def WMMAModsF16NegAbs : ComplexPattern; +def WMMAVISrc : ComplexPattern; +def SWMMACIndex8 : ComplexPattern; +def SWMMACIndex16 : ComplexPattern; + def VOP3OpSel : ComplexPattern; def VOP3OpSelMods : ComplexPattern; @@ -2279,6 +2289,7 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field bit IsDOT = 0; field bit IsSingle = 0; field bit IsWMMA = 0; + field bit IsSWMMAC = 0; field bit IsFP8 = 0; diff --git 
a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index f42af89cf5e6d..b3265b73fa7e1 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1341,9 +1341,14 @@ def VCSrc_v2f16 : RegOrV2F16 <"VS_32", "OPERAND_REG_INLINE_C">; // VISrc_* Operands with a VGPR or an inline constant //===----------------------------------------------------------------------===// +def VISrc_64_f16 : RegOrF16 <"VReg_64", "OPERAND_REG_INLINE_C">; +def VISrc_64_b32 : RegOrB32 <"VReg_64", "OPERAND_REG_INLINE_C">; def VISrc_64_f64 : RegOrF64 <"VReg_64", "OPERAND_REG_INLINE_C">; +def VISrc_128_f16 : RegOrF16 <"VReg_128", "OPERAND_REG_INLINE_C">; def VISrc_128_b32 : RegOrB32 <"VReg_128", "OPERAND_REG_INLINE_C">; def VISrc_128_f32 : RegOrF32 <"VReg_128", "OPERAND_REG_INLINE_C">; +def VISrc_256_b32 : RegOrB32 <"VReg_256", "OPERAND_REG_INLINE_C">; +def VISrc_256_f32 : RegOrF32 <"VReg_256", "OPERAND_REG_INLINE_C">; def VISrc_256_f64 : RegOrF64 <"VReg_256", "OPERAND_REG_INLINE_C">; def VISrc_512_b32 : RegOrB32 <"VReg_512", "OPERAND_REG_INLINE_C">; def VISrc_512_f32 : RegOrF32 <"VReg_512", "OPERAND_REG_INLINE_C">; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index 0c7a08cd4bc91..107b95a9ca8eb 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -936,16 +936,19 @@ multiclass WMMAInst(NAME # _threeaddr # Suffix)>; } - if !eq(Type, WMMAOpSel) then { - def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; - } else if !eq(Type, WMMAUIClamp) then { - def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; - } else { - def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + let SubtargetPredicate = isGFX11Only in { + if !eq(Type, WMMAOpSel) then { + def : WMMAOpSelPat(NAME # _twoaddr # Suffix), node, P>; + } else if !eq(Type, WMMAUIClamp) then { + def : WMMAUIClampPat(NAME # _twoaddr # Suffix), node, P>; + } else { + def : WMMARegularPat(NAME # _twoaddr # Suffix), node, P>; + } } } + let WaveSizePredicate = isWave32 in { defm V_WMMA_F32_16X16X16_F16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_f16", VOP_V8F32_V16F16_V16F16_V8F32, int_amdgcn_wmma_f32_16x16x16_f16, VRegSrc_256, WMMARegular, 1>; defm V_WMMA_F32_16X16X16_BF16 : WMMAInst<"_w32", "v_wmma_f32_16x16x16_bf16", VOP_V8F32_V16I16_V16I16_V8F32, int_amdgcn_wmma_f32_16x16x16_bf16, VRegSrc_256, WMMARegular, 1>; @@ -969,6 +972,398 @@ let WaveSizePredicate = isWave64 in { } +class VOP3PWMMA_Profile ArgTy, bit _IsSWMMAC, int _IndexType, + bit _IsIU, bit _IsFP8BF8> + : VOP3P_Profile> { + bit IsIU = _IsIU; + bit IsFP8BF8 = _IsFP8BF8; + bit IsF16BF16 = !not(!or(IsIU, IsFP8BF8)); + + int IndexType = _IndexType; + + let IsPacked = 1; + let IsWMMA = !not(_IsSWMMAC); + let IsSWMMAC = _IsSWMMAC; + + bit IsAB_F16 = !and(IsF16BF16, ArgTy[1].isFP); + bit IsAB_BF16 = !and(IsF16BF16, isIntType.ret); + bit IsC_F32 = !or(!eq(ArgTy[3], v8f32), !eq(ArgTy[3], v4f32)); + bit IsC_BF16 = !or(!eq(ArgTy[3], v8i16), !eq(ArgTy[3], v4i16)); + bit IsC_F16 = !or(!eq(ArgTy[3], v8f16), !eq(ArgTy[3], v4f16)); + + bit NegLo01 = !or(IsF16BF16, IsIU); + bit NegLo2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegHi01 = IsF16BF16; + bit NegHi2 = !and(!or(IsF16BF16, IsFP8BF8), IsWMMA); + bit NegLoAny = !or(NegLo01, NegLo2); + bit NegHiAny = !or(NegHi01, NegHi2); + + let DstRC = !cond(!eq(ArgTy[0], v8f32): VDst_256, + !eq(ArgTy[0], v8i32): VDst_256, + !eq(ArgTy[0], v8f16): VDst_128, + !eq(ArgTy[0], v8i16): VDst_128, + 
!eq(ArgTy[0], v4f32): VDst_128, + !eq(ArgTy[0], v4i32): VDst_128, + !eq(ArgTy[0], v4f16): VDst_64, + !eq(ArgTy[0], v4i16): VDst_64); + let Src0RC64 = !cond(!eq(ArgTy[1], v8f16): VRegSrc_128, + !eq(ArgTy[1], v4f16): VRegSrc_64, + !eq(ArgTy[1], v4i16): VRegSrc_64, + !eq(ArgTy[1], v8i16): VRegSrc_128, + !eq(ArgTy[1], v4i32): VRegSrc_128, + !eq(ArgTy[1], v2i32): VRegSrc_64, + !eq(ArgTy[1], i32) : VRegSrc_32); + let Src1RC64 = !cond(!eq(ArgTy[2], v16f16): VRegSrc_256, + !eq(ArgTy[2], v16i16): VRegSrc_256, + !eq(ArgTy[2], v8f16): VRegSrc_128, + !eq(ArgTy[2], v8i16): VRegSrc_128, + !eq(ArgTy[2], v4i32): VRegSrc_128, + !eq(ArgTy[1], v4i16): VRegSrc_64, + !eq(ArgTy[1], v4f16): VRegSrc_64, + !eq(ArgTy[2], v2i32): VRegSrc_64, + !eq(ArgTy[2], i32) : VRegSrc_32); + let Src2RC64 = !if(IsSWMMAC, DstRC, + !cond(!eq(ArgTy[3], v8f32): VISrc_256_f32, + !eq(ArgTy[3], v8i32): VISrc_256_b32, + !eq(ArgTy[3], v8f16): VISrc_128_f16, + !eq(ArgTy[3], v8i16): VISrc_128_f32, // bf16 + !eq(ArgTy[3], v4f16): VISrc_64_f16, + !eq(ArgTy[3], v4i16): VISrc_64_b32, + !eq(ArgTy[3], v4i32): VISrc_128_b32, + !eq(ArgTy[3], v4f32): VISrc_128_f32)); + + // For f16 and bf16 matrices A and B, each element can be modified by + // fneg(neg_lo,neg_hi = 1). For iu4 and iu8 matrices A and B neg_lo is + // overloaded to mean unsigned/signed: neg_lo = 0 (u4 and u8) unsigned(zext) + // neg_lo = 1 (i4 and i8) signed(sext). For f16, bf16 and f32 matrix C each + // element can be modified by fneg(neg_lo = 1) or fabs(neg_hi = 1). + + // Opcode | src0/src1 - matrix A/B | src2 - matrix C or Index + // --------------------------------------------------------------------------- + // wmma f32_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f32) + // wmma f32_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // wmma f16_f16 | both neg_lo,neg_hi = 1 | neg_lo = 1 neg C(f16 or bf16) + // wmma bf16_bf16 | neg A/B (f16 or bf16) | neg_hi = 1 abs C(f16 or bf16) + // --------------------------------------------------------------------------- + // wmma i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for + // | neg_lo = 1 i4/i8(sext) | i32 matrices + // --------------------------------------------------------------------------- + // wmma f32_fp8/bf8 | not allowed for | neg_lo = 1 neg C(f32) + // (4 instructions) | f8 and bf8 matrices | neg_hi = 1 abs C(f32) + // --------------------------------------------------------------------------- + // swmmac f32_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix + // swmmac f32_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac f16_f16 | both neg_lo,neg_hi = 1 | not allowed for sparse matrix + // swmmac bf16_bf16 | neg A/B (f16 or bf16) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac i32_iu8/iu4 | neg_lo = 0 u4/u8(zext) | not allowed for sparse matrix + // | neg_lo = 1 i4/i8(sext) | A Index - matrix C is in dst + // --------------------------------------------------------------------------- + // swmmac f32_fp8/bf8 | not allowed for | not allowed for sparse matrix + // (4 instructions) | f8 and bf8 matrices | A Index - matrix C is in dst + + // pseudo + + // fp8bf8 wmmas don't use src (0 and 1) modifiers, iu use neg_lo, f16 and bf16 + // use neg_lo and neg_hi. 
iu wmmas (C is i32) don't use src 2 modifiers, + // remaining wmmas(f16, bf16 and f8bf8) use neg_lo and neg_hi for C (C is f32 + // f16 or bf16). swmmac use index_key and don't use src 2 modifiers. + + dag Src0Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src0_modifiers)); + dag Src1Mods = !if(IsFP8BF8, (ins), (ins PackedF16InputMods:$src1_modifiers)); + dag Src2Mods = !if(IsIU, (ins), (ins PackedF16InputMods:$src2_modifiers)); + dag IndexKey = !cond(!eq(IndexType, 0) : (ins), + !eq(IndexType, 8) : (ins IndexKey8bit:$index_key_8bit), + !eq(IndexType, 16): (ins IndexKey16bit:$index_key_16bit)); + dag Clamp = !if(IsIU, (ins clampmod0:$clamp), (ins)); + dag Neg = !cond(!and(NegLoAny, NegHiAny) : (ins neg_lo0:$neg_lo, neg_hi0:$neg_hi), + !and(NegLoAny, !not(NegHiAny)) : (ins neg_lo0:$neg_lo), + !and(!not(NegLoAny), !not(NegHiAny)) : (ins)); + + let InsVOP3P = !con(Src0Mods, (ins Src0RC64:$src0), Src1Mods, (ins Src1RC64:$src1), + !cond(IsWMMA : !con(Src2Mods, (ins Src2RC64:$src2)), + IsSWMMAC : !con((ins DstRC:$srcTiedDef), (ins VRegSrc_32:$src2), IndexKey)), + Clamp, Neg); + + // asm + + string IndexKeyAsm = !cond(!eq(IndexType, 0) : "", + !eq(IndexType, 8) : "$index_key_8bit", + !eq(IndexType, 16) : "$index_key_16bit"); + string ClampAsm = !if(IsIU, "$clamp", ""); + string NegAsm = !cond(!and(NegLoAny, NegHiAny) : "$neg_lo$neg_hi", + !and(NegLoAny, !not(NegHiAny)) : "$neg_lo", + !and(!not(NegLoAny), !not(NegHiAny)) : ""); + + let AsmVOP3P = "$vdst, $src0, $src1, $src2"#IndexKeyAsm#NegAsm#ClampAsm; + + // isel patterns + + dag Src0InPat = !cond(IsAB_F16 : (ins (Src0VT (WMMAModsF16Neg Src0VT:$src0, i32:$src0_modifiers))), + IsAB_BF16 : (ins Src0VT:$src0), + IsIU : (ins (VOP3PModsNeg i32:$src0_modifiers), Src0VT:$src0), + IsFP8BF8 : (ins Src0VT:$src0)); + dag Src0OutPat = !cond(IsAB_F16 : (ins i32:$src0_modifiers, Src0VT:$src0), + IsAB_BF16 : (ins (i32 8), Src0VT:$src0), + IsIU : (ins i32:$src0_modifiers, Src0VT:$src0), + IsFP8BF8 : (ins Src0VT:$src0)); + dag Src1InPat = !cond(IsAB_F16 : (ins (Src1VT (WMMAModsF16Neg Src1VT:$src1, i32:$src1_modifiers))), + IsAB_BF16 : (ins Src1VT:$src1), + IsIU : (ins (VOP3PModsNeg i32:$src1_modifiers), Src1VT:$src1), + IsFP8BF8 : (ins Src1VT:$src1)); + dag Src1OutPat = !cond(IsAB_F16 : (ins i32:$src1_modifiers, Src1VT:$src1), + IsAB_BF16 : (ins (i32 8), Src1VT:$src1), + IsIU : (ins i32:$src1_modifiers, Src1VT:$src1), + IsFP8BF8 : (ins Src1VT:$src1)); + dag Src2InPatWmma = !cond(IsC_F32 : (ins (Src2VT (WMMAModsF32NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_F16 : (ins (Src2VT (WMMAModsF16NegAbs Src2VT:$src2, i32:$src2_modifiers))), + IsC_BF16 : (ins Src2VT:$src2), + IsIU : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag Src2OutPatWmma = !cond(IsC_F32 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_F16 : (ins i32:$src2_modifiers, Src2VT:$src2), + IsC_BF16 : (ins (i32 8), Src2VT:$src2), + IsIU : (ins Src2VT:$src2), + IsSWMMAC : (ins)); + dag ClampPat = !if(IsIU, (ins i1:$clamp), (ins)); + dag IndexInPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), + !eq(IndexType, 8) : (ins (i32 (SWMMACIndex8 i32:$src2, i32:$index_key_8bit))), + !eq(IndexType, 16): (ins (i32 (SWMMACIndex16 i32:$src2, i32:$index_key_16bit)))); + dag IndexOutPat = !cond(!eq(IndexType, 0) : (ins i32:$src2), + !eq(IndexType, 8) : (ins i32:$src2, i32:$index_key_8bit), + !eq(IndexType, 16): (ins i32:$src2, i32:$index_key_16bit)); + dag Src2InlineInPat = (ins (Src2VT (WMMAVISrc Src2VT:$src2))); + dag Src2InlineOutPat = !con(!if(IsIU, (ins), (ins (i32 8))), (ins Src2VT:$src2)); + + + dag 
WmmaInPat  = !con(Src0InPat, Src1InPat, Src2InPatWmma, ClampPat);
+  dag WmmaOutPat = !con(Src0OutPat, Src1OutPat, Src2OutPatWmma, ClampPat);
+
+  dag SwmmacInPat  = !con(Src0InPat, Src1InPat, (ins Src2VT:$srcTiedDef), IndexInPat, ClampPat);
+  dag SwmmacOutPat = !con(Src0OutPat, Src1OutPat, (ins Src2VT:$srcTiedDef), IndexOutPat, ClampPat);
+
+  // A wmma pattern where src2 is an inline imm uses the _threeaddr pseudo;
+  // it can't use _twoaddr since that would violate the src2-tied-to-vdst
+  // constraint.
+  dag WmmaInlineInPat  = !con(Src0InPat, Src1InPat, Src2InlineInPat, ClampPat);
+  dag WmmaInlineOutPat = !con(Src0OutPat, Src1OutPat, Src2InlineOutPat, ClampPat);
+}
+
+multiclass WMMAInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile,
+                         string PseudoInstrSuffix> {
+  let Mnemonic = Instr, mayRaiseFPException = 0, ReadsModeReg = 0 in {
+    let Constraints = "@earlyclobber $vdst,$vdst = $src2", isConvertibleToThreeAddress = 1 in
+      def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile> {
+        let PseudoInstr = Instr#PseudoInstrSuffix;
+      }
+
+    let Constraints = "@earlyclobber $vdst", SchedRW = [Write32Bit, Write32Bit] in
+      def _threeaddr : VOP3P_Pseudo<Instr, WMMAProfile> {
+        let PseudoInstr = Instr#PseudoInstrSuffix;
+      }
+
+  }
+  def : WMMAOpcodeMapping<!cast<Instruction>(NAME # _twoaddr),
+                          !cast<Instruction>(NAME # _threeaddr)>;
+}
+
+multiclass SWMMACInstGFX12<string Instr, VOP3PWMMA_Profile WMMAProfile,
+                           string PseudoInstrSuffix> {
+  def _twoaddr : VOP3P_Pseudo<Instr, WMMAProfile> {
+    let Mnemonic = Instr;
+    let PseudoInstr = Instr#PseudoInstrSuffix;
+    let mayRaiseFPException = 0;
+    let ReadsModeReg = 0;
+    let AsmMatchConverter = "cvtSWMMAC";
+
+    let Constraints = "@earlyclobber $vdst,$vdst = $srcTiedDef";
+  }
+}
+
+// The first argument in the Profile is the list of types for matrices D, A, B
+// and C (D = A * B + C), as used by LLVM IR; the types are vectors of matrix
+// elements.
+// wave32:
+// For 16x16 matrices, lanes 0 to 31 will each have 8 matrix elts;
+// for 16x32, 16 elts; and for 16x64, 32 elts per lane.
+// wave64:
+// Lanes hold half as many elements as in wave32, with the exception of
+// 16x16_iu4: lanes 0-31 will have 8xi4, remaining lanes are ignored.
+
+// General idea of the element distribution difference:
+// wave32: lane n has 8 matrix elements
+// wave64: lane n has the first 4, lane n+32 has the other 4 elements
+
+// Index size: for every 2 elements in a lane you need 4 bits of index.
+
+// Non-standard types (iu8, iu4, fp8, bf8) are packed in vectors of i32s.
+// Their original type is given in the comment on the right and refers to A
+// and B.
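To make the distribution comment above concrete, here is the arithmetic for a 16x16 f16 operand. This is an illustrative sketch, not part of the patch; it matches the v8f16 wave32 and v4f16 wave64 profiles defined just below:

    // 16x16 operand: 256 matrix elements spread across the wave.
    constexpr unsigned Elts      = 16 * 16;
    constexpr unsigned PerLane32 = Elts / 32; // wave32: 8 elts/lane -> v8f16, 128-bit tuple
    constexpr unsigned PerLane64 = Elts / 64; // wave64: 4 elts/lane -> v4f16, 64-bit tuple
    // 2 index bits per element: 16 index bits per lane in wave32, so one i32
    // src2 holds two 16-bit indexes and index_key selects which one is used.
    constexpr unsigned IdxBits32 = PerLane32 * 2;
    static_assert(PerLane32 == 8 && PerLane64 == 4 && IdxBits32 == 16, "");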
+ +def F32_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v8f16, v8f32], 0, 0, 0, 0>; +def F32_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v8i16, v8f32], 0, 0, 0, 0>; +def F16_F16_WMMA_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v8f16, v8f16], 0, 0, 0, 0>; +def BF16_BF16_WMMA_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v8i16, v8i16], 0, 0, 0, 0>; +def I32_IU8_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 8xi8 +def I32_IU4X16_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, i32, i32, v8i32], 0, 0, 1, 0>; // 8xi4 +def F32_FP8BF8_WMMA_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v2i32, v8f32], 0, 0, 0, 1>; // 8xf8 +def I32_IU4X32_WMMA_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v2i32, v8i32], 0, 0, 1, 0>; // 16xi4 + +def F32_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v4f16, v4f32], 0, 0, 0, 0>; +def F32_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v4i16, v4f32], 0, 0, 0, 0>; +def F16_F16_WMMA_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v4f16, v4f16], 0, 0, 0, 0>; +def BF16_BF16_WMMA_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v4i16, v4i16], 0, 0, 0, 0>; +def I32_IU8_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 4xi8 +def I32_IU4X16_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 * +def F32_FP8BF8_WMMA_w64 : VOP3PWMMA_Profile<[v4f32, i32, i32, v4f32], 0, 0, 0, 1>; // 4xf8 +def I32_IU4X32_WMMA_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 0, 0, 1, 0>; // 8xi4 + +def F32_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8f16, v16f16, v8f32], 1, 16, 0, 0>; +def F32_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v8i16, v16i16, v8f32], 1, 16, 0, 0>; +def F16_F16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f16, v8f16, v16f16, v8f16], 1, 16, 0, 0>; +def BF16_BF16_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i16, v8i16, v16i16, v8i16], 1, 16, 0, 0>; +def I32_IU8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 16, 1, 0>; // 8xi8, 16xi8 +def I32_IU4X32_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, i32, v2i32, v8i32], 1, 16, 1, 0>; // 8xi4, 16xi4 +def I32_IU4X64_SWMMAC_w32 : VOP3PWMMA_Profile<[v8i32, v2i32, v4i32, v8i32], 1, 0, 1, 0>; // 16xi4, 32xi4 ** +def F32_FP8BF8_SWMMAC_w32 : VOP3PWMMA_Profile<[v8f32, v2i32, v4i32, v8f32], 1, 16, 0, 1>; // 8xf8, 16xf8 + +def F32_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4f16, v8f16, v4f32], 1, 8, 0, 0>; +def F32_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, v4i16, v8i16, v4f32], 1, 8, 0, 0>; +def F16_F16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f16, v4f16, v8f16, v4f16], 1, 8, 0, 0>; +def BF16_BF16_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i16, v4i16, v8i16, v4i16], 1, 8, 0, 0>; +def I32_IU8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 8, 1, 0>; // 4xi8, 8xi8 +def I32_IU4X32_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, i32, v4i32], 1, 16, 1, 0>; // 8xi4, 8xi4 *** +def I32_IU4X64_SWMMAC_w64 : VOP3PWMMA_Profile<[v4i32, i32, v2i32, v4i32], 1, 16, 1, 0>; // 8xi4, 16xi4 +def F32_FP8BF8_SWMMAC_w64 : VOP3PWMMA_Profile<[v4f32, i32, v2i32, v4f32], 1, 8, 0, 1>; // 4xf8, 8xf8 + +// * IU4X16_WMMA_w64 lanes 0-31 will have 8xi4, remaining lanes are ignored +// ** IU4X64_SWMMAC_w32 index is i32, index_key is not used +// *** IU4X32_SWMMAC_w64 lanes 0-31 will have 8xi4 remaining lanes are ignored +// for matrix A, index is i16; Matrix B uses all lanes + +let WaveSizePredicate = isWave32 in { +defm V_WMMA_F32_16X16X16_F16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w32, "_w32">; +defm V_WMMA_F16_16X16X16_F16_w32 : 
WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w32, "_w32">; +defm V_WMMA_BF16_16X16X16_BF16_w32 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X16_IU8_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X16_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w32, "_w32">; +defm V_WMMA_I32_16X16X32_IU4_w32 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w32, "_w32">; + +defm V_SWMMAC_F32_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F16_16X16X32_F16_w32 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_BF16_16X16X32_BF16_w32 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X32_IU8_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X32_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w32, "_w32">; +defm V_SWMMAC_I32_16X16X64_IU4_w32 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w32, "_w32">; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w32, "_w32">; +} + +let WaveSizePredicate = isWave64 in { +defm V_WMMA_F32_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_f16", F32_F16_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf16", F32_BF16_WMMA_w64, "_w64">; +defm V_WMMA_F16_16X16X16_F16_w64 : WMMAInstGFX12<"v_wmma_f16_16x16x16_f16", F16_F16_WMMA_w64, "_w64">; +defm V_WMMA_BF16_16X16X16_BF16_w64 : WMMAInstGFX12<"v_wmma_bf16_16x16x16_bf16", BF16_BF16_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X16_IU8_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu8", I32_IU8_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X16_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x16_iu4", I32_IU4X16_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_fp8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_fp8_bf8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_fp8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : WMMAInstGFX12<"v_wmma_f32_16x16x16_bf8_bf8", F32_FP8BF8_WMMA_w64, "_w64">; +defm V_WMMA_I32_16X16X32_IU4_w64 : WMMAInstGFX12<"v_wmma_i32_16x16x32_iu4", I32_IU4X32_WMMA_w64, "_w64">; + +defm V_SWMMAC_F32_16X16X32_F16_w64 : 
SWMMACInstGFX12<"v_swmmac_f32_16x16x32_f16", F32_F16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf16", F32_BF16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F16_16X16X32_F16_w64 : SWMMACInstGFX12<"v_swmmac_f16_16x16x32_f16", F16_F16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_BF16_16X16X32_BF16_w64 : SWMMACInstGFX12<"v_swmmac_bf16_16x16x32_bf16", BF16_BF16_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_I32_16X16X32_IU8_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu8", I32_IU8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_I32_16X16X32_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x32_iu4", I32_IU4X32_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_I32_16X16X64_IU4_w64 : SWMMACInstGFX12<"v_swmmac_i32_16x16x64_iu4", I32_IU4X64_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_fp8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_fp8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : SWMMACInstGFX12<"v_swmmac_f32_16x16x32_bf8_bf8", F32_FP8BF8_SWMMAC_w64, "_w64">;
+}
+
+// IsGFX11OpselIntrinsic: f16_f16 and bf16_bf16 intrinsics have an imm operand
+// that controls opsel. Used by gfx11, removed in gfx12 (operand must be 0).
+multiclass WMMAPat<string Inst, SDPatternOperator node, VOP3PWMMA_Profile P, bit IsGFX11OpselIntrinsic = 0> {
+ def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)),
+ (P.DstVT !setdagop(P.WmmaOutPat, !cast<Instruction>(Inst#"_twoaddr")))>;
+ let AddedComplexity = 4 in
+ def : GCNPat <(P.DstVT !setdagop(!con(P.WmmaInlineInPat, !if(IsGFX11OpselIntrinsic, (ins 0), (ins))), node)),
+ (P.DstVT !setdagop(P.WmmaInlineOutPat, !cast<Instruction>(Inst#"_threeaddr")))>;
+}
+
+class SWMMACPat<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> :
+ GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)),
+ (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>;
+
+class SWMMACPat_w64<Instruction Inst, SDPatternOperator node, VOP3PWMMA_Profile P> :
+ GCNPat <(P.DstVT !setdagop(P.SwmmacInPat, node)),
+ (P.DstVT !setdagop(P.SwmmacOutPat, Inst))>{
+ let WaveSizePredicate = isWave64;
+ }
+
+let WaveSizePredicate = isWave32, SubtargetPredicate = isGFX12Plus in {
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w32", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w32", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w32", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w32,1>;
+ defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w32", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w32,1>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w32", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w32", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w32", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w32>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w32", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w32>;
+
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w32_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w32>;
+ def : GCNPat <(I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacInPat, 
int_amdgcn_swmmac_i32_16x16x64_iu4)),
+ (I32_IU4X64_SWMMAC_w32.DstVT !setdagop(I32_IU4X64_SWMMAC_w32.SwmmacOutPat, V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr))>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w32>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w32>;
+}
+
+let WaveSizePredicate = isWave64, SubtargetPredicate = isGFX12Plus in {
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_F16_w64", int_amdgcn_wmma_f32_16x16x16_f16, F32_F16_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF16_w64", int_amdgcn_wmma_f32_16x16x16_bf16, F32_BF16_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F16_16X16X16_F16_w64", int_amdgcn_wmma_f16_16x16x16_f16, F16_F16_WMMA_w64,1>;
+ defm : WMMAPat<"V_WMMA_BF16_16X16X16_BF16_w64", int_amdgcn_wmma_bf16_16x16x16_bf16, BF16_BF16_WMMA_w64,1>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU8_w64", int_amdgcn_wmma_i32_16x16x16_iu8, I32_IU8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X16_IU4_w64", int_amdgcn_wmma_i32_16x16x16_iu4, I32_IU4X16_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_fp8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_FP8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_fp8_bf8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_FP8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_fp8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_F32_16X16X16_BF8_BF8_w64", int_amdgcn_wmma_f32_16x16x16_bf8_bf8, F32_FP8BF8_WMMA_w64>;
+ defm : WMMAPat<"V_WMMA_I32_16X16X32_IU4_w64", int_amdgcn_wmma_i32_16x16x32_iu4, I32_IU4X32_WMMA_w64>;
+
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_f16, F32_F16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf16, F32_BF16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F16_16X16X32_F16_w64_twoaddr, int_amdgcn_swmmac_f16_16x16x32_f16, F16_F16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr, int_amdgcn_swmmac_bf16_16x16x32_bf16, BF16_BF16_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu8, I32_IU8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x32_iu4, I32_IU4X32_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr, int_amdgcn_swmmac_i32_16x16x64_iu4, I32_IU4X64_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_fp8, F32_FP8BF8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_fp8_bf8, F32_FP8BF8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_fp8, F32_FP8BF8_SWMMAC_w64>;
+ def : SWMMACPat<V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr, int_amdgcn_swmmac_f32_16x16x32_bf8_bf8, F32_FP8BF8_SWMMAC_w64>;
+}
+
+
 //===----------------------------------------------------------------------===//
 // Begin Real Encodings
 //===----------------------------------------------------------------------===//
@@ -1005,6 +1400,99 @@ multiclass VOP3P_Real_Base<GFXGen Gen, bits<8> op, string backing_ps_name = NAME
 VOP3Pe_gfx11_gfx12<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl>;
 }
+class VOP3PeWmma<bits<8> op, VOPProfile P, VOP3PWMMA_Profile WMMAP>
+ : VOP3Pe_gfx11_gfx12<op, P>{
+ // opsel
+ let Inst{11} = !cond(!eq(WMMAP.IndexType, 0) : 0,
+ !eq(WMMAP.IndexType, 8) : index_key_8bit{0},
+ !eq(WMMAP.IndexType, 16) : index_key_16bit{0});
+ let Inst{12} = !if(!eq(WMMAP.IndexType, 8), index_key_8bit{1}, 0);
+ let Inst{13} = 0;
+ // opsel_hi
+ let Inst{59} = 1;
+ let Inst{60} = 1;
+ let Inst{14} = 1;
+ // neg_lo
+ let Inst{61} = !if(WMMAP.NegLo01, src0_modifiers{0}, 0);
+ let Inst{62} = !if(WMMAP.NegLo01, src1_modifiers{0}, 0);
+ let Inst{63} = !if(WMMAP.NegLo2, src2_modifiers{0}, 0);
+ // neg_hi
+ let Inst{8} = !if(WMMAP.NegHi01, src0_modifiers{1}, 0);
+ let Inst{9} = !if(WMMAP.NegHi01, src1_modifiers{1}, 0);
+ let Inst{10} = !if(WMMAP.NegHi2, src2_modifiers{1}, 0);
+ // clamp
+ let Inst{15} = !if(WMMAP.IsIU, clamp{0}, 0);
+}
+
+multiclass VOP3P_WMMA_Real_Base<GFXGen Gen, bits<8> op, VOP3PWMMA_Profile WMMAP,
+ string backing_ps_name = NAME,
+ string asmName = !cast<VOP3P_Pseudo>(NAME).Mnemonic> {
+ def Gen.Suffix :
+ VOP3P_Real_Gen<!cast<VOP3P_Pseudo>(backing_ps_name), Gen, asmName>,
+ VOP3PeWmma<op, !cast<VOP3P_Pseudo>(backing_ps_name).Pfl, WMMAP>;
+}
+
+multiclass VOP3P_Real_WMMA_gfx12<bits<8> op, VOP3PWMMA_Profile WMMAP> {
+ let WaveSizePredicate = isWave32, DecoderNamespace = "GFX12" in {
+ defm _twoaddr : VOP3P_WMMA_Real_Base<GFX12Gen, op, WMMAP>;
+ }
+}
+
+multiclass VOP3P_Real_WMMA_gfx12w64<bits<8> op, VOP3PWMMA_Profile WMMAP> {
+ let WaveSizePredicate = isWave64, DecoderNamespace = "WMMAGFX12" in {
+ defm _twoaddr : VOP3P_WMMA_Real_Base<GFX12Gen, op, WMMAP>;
+ }
+}
+
+defm V_WMMA_F32_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 
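+// The WMMA/SWMMAC reals reuse the VOP3Pe fields: index_key is routed through
+// the opsel bits (Inst{11} for 16-bit keys, Inst{11-12} for 8-bit keys),
+// neg_lo/neg_hi carry the matrix sign modifiers, and Inst{15} carries clamp
+// for the integer (IsIU) variants, as VOP3PeWmma above encodes.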
<0x040, F32_F16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x041, F32_BF16_WMMA_w32>; +defm V_WMMA_F16_16X16X16_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x042, F16_F16_WMMA_w32>; +defm V_WMMA_BF16_16X16X16_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x043, BF16_BF16_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x044, I32_IU8_WMMA_w32>; +defm V_WMMA_I32_16X16X16_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x045, I32_IU4X16_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x046, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x047, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x048, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x049, F32_FP8BF8_WMMA_w32>; +defm V_WMMA_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x04a, I32_IU4X32_WMMA_w32>; + +defm V_WMMA_F32_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x040, F32_F16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x041, F32_BF16_WMMA_w64>; +defm V_WMMA_F16_16X16X16_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x042, F16_F16_WMMA_w64>; +defm V_WMMA_BF16_16X16X16_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x043, BF16_BF16_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x044, I32_IU8_WMMA_w64>; +defm V_WMMA_I32_16X16X16_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x045, I32_IU4X16_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x046, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x047, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x048, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_F32_16X16X16_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x049, F32_FP8BF8_WMMA_w64>; +defm V_WMMA_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x04a, I32_IU4X32_WMMA_w64>; + + +defm V_SWMMAC_F32_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x050, F32_F16_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x051, F32_BF16_SWMMAC_w32>; +defm V_SWMMAC_F16_16X16X32_F16_w32 : VOP3P_Real_WMMA_gfx12 <0x052, F16_F16_SWMMAC_w32>; +defm V_SWMMAC_BF16_16X16X32_BF16_w32 : VOP3P_Real_WMMA_gfx12 <0x053, BF16_BF16_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU8_w32 : VOP3P_Real_WMMA_gfx12 <0x054, I32_IU8_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X32_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x055, I32_IU4X32_SWMMAC_w32>; +defm V_SWMMAC_I32_16X16X64_IU4_w32 : VOP3P_Real_WMMA_gfx12 <0x056, I32_IU4X64_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x057, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x058, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w32 : VOP3P_Real_WMMA_gfx12 <0x059, F32_FP8BF8_SWMMAC_w32>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w32 : VOP3P_Real_WMMA_gfx12 <0x05a, F32_FP8BF8_SWMMAC_w32>; + +defm V_SWMMAC_F32_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x050, F32_F16_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x051, F32_BF16_SWMMAC_w64>; +defm V_SWMMAC_F16_16X16X32_F16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x052, F16_F16_SWMMAC_w64>; +defm V_SWMMAC_BF16_16X16X32_BF16_w64 : VOP3P_Real_WMMA_gfx12w64 <0x053, BF16_BF16_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x054, I32_IU8_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X32_IU4_w64 : VOP3P_Real_WMMA_gfx12w64 <0x055, I32_IU4X32_SWMMAC_w64>; +defm V_SWMMAC_I32_16X16X64_IU4_w64 : 
VOP3P_Real_WMMA_gfx12w64 <0x056, I32_IU4X64_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x057, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_FP8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x058, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_FP8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x059, F32_FP8BF8_SWMMAC_w64>; +defm V_SWMMAC_F32_16X16X32_BF8_BF8_w64 : VOP3P_Real_WMMA_gfx12w64 <0x05a, F32_FP8BF8_SWMMAC_w64>; + multiclass VOP3P_Real_with_name op, string backing_ps_name = NAME, string asmName = !cast(NAME).Mnemonic> { diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 8b6d1ddba595e..20d7c88fb7e59 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -124,6 +124,7 @@ class VOP3_Pseudo pattern = [], let IsPacked = P.IsPacked; let IsMAI = P.IsMAI; let IsWMMA = P.IsWMMA; + let IsSWMMAC = P.IsSWMMAC; let AsmOperands = !if(isVop3OpSel, P.AsmVOP3OpSel, @@ -383,6 +384,8 @@ class VOP3Pe op, VOPProfile P> : Enc64 { bits<4> src2_modifiers; bits<9> src2; bits<1> clamp; + bits<2> index_key_8bit; + bits<1> index_key_16bit; let Inst{7-0} = vdst; let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 diff --git a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll index a08ca86c8a619..4b6f6159ff23f 100644 --- a/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll +++ b/llvm/test/Analysis/UniformityAnalysis/AMDGPU/intrinsics.ll @@ -63,52 +63,140 @@ define amdgpu_kernel void @writelane(ptr addrspace(1) %out) #0 { ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C) +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) define amdgpu_kernel void @wmma_f32_16x16x16_f16(<16 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { - %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %A, <16 x half> %B, <8 x float> %C) + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %A, <16 x half> %B, <8 x float> %C) store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) define amdgpu_kernel void @wmma_f32_16x16x16_ibf16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { - %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) + %tmp0 = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %A, <16 x i16> %B, <8 x float> %C) store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) define amdgpu_kernel void @wmma_f16_16x16x16_f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <16 x half> 
@llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) + %tmp0 = call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %A, <16 x half> %B, <16 x half> %C, i1 false) store <16 x half> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) define amdgpu_kernel void @wmma_f16_16x16x16_bf16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) + %tmp0 = call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %A, <16 x i16> %B, <16 x i16> %C, i1 false) store <16 x i16> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) define amdgpu_kernel void @wmma_i32_16x16x16_ui8(<4 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 false, <4 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i1 false) store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 ret void } -; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) define amdgpu_kernel void @wmma_i32_16x16x16_ui4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { bb: - %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) + %tmp0 = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 false, <2 x i32> %A, i1 false, <2 x i32> %B, <8 x i32> %C, i1 false) store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 ret void } +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + 
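+; wmma and swmmac results are produced per lane into vector registers, so
+; uniformity analysis has to report every such call as DIVERGENT, which is
+; what the CHECK lines in this file assert.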
store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) +define amdgpu_kernel void @swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.i16(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 false, i32 %A, i1 false, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.i16(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) +define amdgpu_kernel void @swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 false, <2 x i32> %A, i1 false, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 false) + store <8 x i32> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_fp8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x 
i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + +; CHECK: DIVERGENT: %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) +define amdgpu_kernel void @swmmac_f32_16x16x32_bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +bb: + %tmp0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %tmp0, ptr addrspace(1) %out, align 32 + ret void +} + ; CHECK: DIVERGENT: %tmp0 = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1) %gep) define amdgpu_kernel void @global_load_tr_b64_v2i32(ptr addrspace(1) %addr, ptr addrspace(1) %out) { bb: @@ -190,12 +278,23 @@ declare i32 @llvm.amdgcn.mov.dpp.i32(i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.mov.dpp8.i32(i32, i32) #1 declare i32 @llvm.amdgcn.update.dpp.i32(i32, i32, i32, i32, i32, i1) #1 declare i32 @llvm.amdgcn.writelane(i32, i32, i32) #1 -declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half>, <16 x half> , <8 x float>) #1 -declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16>, <16 x i16> , <8 x float>) #1 -declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 -declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1 -declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 -declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half>, <16 x half> , <8 x float>) #1 +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16>, <16 x i16> , <8 x float>) #1 +declare <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half>, <16 x half> , <16 x half>, i1 immarg) #1 +declare <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16>, <16 x i16> , <16 x i16>, i1 immarg) #1 +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 immarg, <4 x i32>, i1 immarg, <4 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32> , <8 x i32>, i1 immarg) #1 +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, 
i16, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16, i1) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16, i1) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8(<2 x i32>, <4 x i32>, <8 x float>, i16) declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32(ptr addrspace(1)) declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16(ptr addrspace(1)) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll new file mode 100644 index 0000000000000..ea377531df2ae --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll @@ -0,0 +1,504 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> 
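+; A source-level fneg folds into the sign modifiers: on the A or B input it
+; sets both neg_lo and neg_hi for that slot (fneg %A -> neg_lo:[1,0,0]
+; neg_hi:[1,0,0]), while on the accumulator fneg maps to neg_lo:[0,0,1] and
+; fabs to neg_hi:[0,0,1], as the CHECK lines in these tests show.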
@llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] 
neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x half> %C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> 
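+; The fp8/bf8 inputs are packed eight 8-bit floats per <2 x i32>, which is why
+; all four fp8/bf8 source combinations below share the same v2i32-based
+; intrinsic signatures and differ only in the selected instruction.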
addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off 
offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> 
addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %fneg.fabs.C = fneg <8 x float> %fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + %fneg.fabs.C = fneg <8 x half> %fabs.C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <8 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) + store <8 x float> 
%res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+; A or B matrix modifier and constant in C
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+; pack f16 elements with v_perm_b32 since they don't come from same b32
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9]
+; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] offset:16
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101
+; GFX12-NEXT: v_and_b32_e32 v8, 0xffff, v12
+; GFX12-NEXT: v_and_b32_e32 v9, 0xffff, v14
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v14, 0xffff, v16
+; GFX12-NEXT: v_and_b32_e32 v16, 0xffff, v18
+; GFX12-NEXT: v_lshl_or_b32 v12, v13, 16, v8
+; GFX12-NEXT: v_lshl_or_b32 v13, v15, 16, v9
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4)
+; GFX12-NEXT: v_lshl_or_b32 v14, v17, 16, v14
+; GFX12-NEXT: v_lshl_or_b32 v15, v19, 16, v16
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %C = load <16 x half>, ptr %Caddr
+ %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+ %fneg.C_shuffle = fneg <8 x half> %C_shuffle
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+declare <8 x half> @llvm.fabs.v8f16(<8 x half>)
+declare <8 x float> @llvm.fabs.v8f32(<8 x float>)
+declare float @llvm.fabs.f32(float)
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
+declare <8 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll new file mode 100644 index 0000000000000..6251dfdc392eb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-imm.ll @@ -0,0 +1,519 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_mov_b32 s0, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_mov_b32 s7, s0 +; GFX12-NEXT: s_mov_b32 s1, s0 +; GFX12-NEXT: s_mov_b32 s2, s0 +; GFX12-NEXT: s_mov_b32 s3, s0 +; GFX12-NEXT: s_mov_b32 s4, s0 +; GFX12-NEXT: s_mov_b32 s5, s0 +; GFX12-NEXT: s_mov_b32 s6, s0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6 +; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4 +; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2 +; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> ) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; 
GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v17, s7 :: v_dual_mov_b32 v16, s6
+; GFX12-NEXT: v_dual_mov_b32 v15, s5 :: v_dual_mov_b32 v14, s4
+; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x42004200
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
+  store <8 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_dual_mov_b32 v13, s3 :: v_dual_mov_b32 v12, s2
+; GFX12-NEXT: v_dual_mov_b32 v11, s1 :: v_dual_mov_b32 v10, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13]
+; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
+  store <8 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v11, s7 :: v_dual_mov_b32 v10, s6
+; GFX12-NEXT: v_dual_mov_b32 v9, s5 :: v_dual_mov_b32 v8, s4
+; GFX12-NEXT: v_dual_mov_b32 v7, s3 :: v_dual_mov_b32 v6, s2
+; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s7, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s4, s0
+; GFX12-NEXT: s_mov_b32 s5, s0
+; GFX12-NEXT: s_mov_b32 s6, s0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v13, s7 :: v_dual_mov_b32 v12, s6
+; GFX12-NEXT: v_dual_mov_b32 v11, s5 :: v_dual_mov_b32 v10, s4
+; GFX12-NEXT: v_dual_mov_b32 v9, s3 :: v_dual_mov_b32 v8, s2
+; GFX12-NEXT: v_dual_mov_b32 v7, s1 :: v_dual_mov_b32 v6, s0
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
+declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
new file mode 100644
index 0000000000000..fe6d16bd8b5ea
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-iu-modifiers.ll
@@ -0,0 +1,309 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
new file mode 100644
index 0000000000000..c80d7a6d9a836
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32-swmmac-index_key.ll
@@ -0,0 +1,321 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v20, v[20:21], off
+; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
+; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+  %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0)
+  store <8 x float> %res0, ptr addrspace(1) %out0
+  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+  %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1)
+  store <8 x float> %res1, ptr addrspace(1) %out1
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v20, v[20:21], off
+; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18
+; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16
+; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off
+; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+  %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0)
+  store <8 x float> %res0, ptr addrspace(1) %out0
+  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+  %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1)
+  store <8 x float> %res1, ptr addrspace(1) %out1
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v16, v[16:17], off
+; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
+; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+  %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0)
+  store <8 x half> %res0, ptr addrspace(1) %out0
+  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+  %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1)
+  store <8 x half> %res1, ptr addrspace(1) %out1
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v16, v[16:17], off
+; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14
+; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off
+; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+  %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0)
+  store <8 x i16> %res0, ptr addrspace(1) %out0
+  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+  %res1 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1)
+  store <8 x i16> %res1, ptr addrspace(1) %out1
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+  %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
+  store <8 x i32> %res0, ptr addrspace(1) %out0
+  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+  %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
+  store <8 x i32> %res1, ptr addrspace(1) %out1
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v11, v[11:12], off
+; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9
+; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7
+; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5
+; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+  %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0)
+  store <8 x i32> %res0, ptr addrspace(1) %out0
+  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+  %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0)
+  store <8 x i32> %res1, ptr addrspace(1) %out1
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+  %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+  store <8 x float> %res0, ptr addrspace(1) %out0
+  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+  %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+  store <8 x float> %res1, ptr addrspace(1) %out1
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+  %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+  store <8 x float> %res0, ptr addrspace(1) %out0
+  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+  %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+  store <8 x float> %res1, ptr addrspace(1) %out1
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+  %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+  store <8 x float> %res0, ptr addrspace(1) %out0
+  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+  %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+  store <8 x float> %res1, ptr addrspace(1) %out1
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v14, v[14:15], off
+; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12
+; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10
+; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8
+; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+  %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+  %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0)
+  store <8 x float> %res0, ptr addrspace(1) %out0
+  %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+  %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1)
+  store <8 x float> %res1, ptr addrspace(1) %out1
+  ret void
+}
+
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16)
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16)
+declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
+declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
new file mode 100644
index 0000000000000..c4edc5b72b2fb
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w32.ll
@@ -0,0 +1,370 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0)
+  store <8 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
+; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index)
+  store <8 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
+; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index)
+  store <8 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off
+; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0)
+  store <8 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index)
+  store <8 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>)
+declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>)
+declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg)
+declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg)
+declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg)
+declare <8 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll new file mode 100644 index 0000000000000..eafbfb6d1eeb5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -0,0 +1,459 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void 
@test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> 
%C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x half> %C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; 
GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; 
GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %fneg.fabs.C = fneg <4 x float> %fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %fneg.fabs.C = fneg <4 x half> %fabs.C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <4 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <4 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x 
float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %fneg.B = fneg <4 x half> %B
+  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+; pack f16 elements with v_perm_b32 since they don't come from same b32
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_and_b32_e32 v4, 0xffff, v8
+; GFX12-NEXT: v_and_b32_e32 v5, 0xffff, v10
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GFX12-NEXT: v_lshl_or_b32 v4, v9, 16, v4
+; GFX12-NEXT: v_lshl_or_b32 v5, v11, 16, v5
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %C = load <8 x half>, ptr %Caddr
+  %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %fneg.C_shuffle = fneg <4 x half> %C_shuffle
+  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle , i1 0)
+  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+declare float @llvm.fabs.f32(float)
+
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
+declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16)
+declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
new file mode 100644
index 0000000000000..c4d70fd5f0637
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-imm.ll
@@ -0,0 +1,430 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v9, s3
+; GFX12-NEXT: v_mov_b32_e32 v8, s2
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v9, s3
+; GFX12-NEXT: v_mov_b32_e32 v8, s2
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+  store <4 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x42004200
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
+  store <4 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x3f803f80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
+  store <4 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x3fc03fc0
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s1
+; GFX12-NEXT: v_mov_b32_e32 v6, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
+  store <4 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
+  store <4 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
+  store <4 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
+  store <4 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
+  store <4 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_mov_b32 s0, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
+  store <4 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: s_movk_i32 s0, 0x80
+; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX12-NEXT: s_mov_b32 s3, s0
+; GFX12-NEXT: s_mov_b32 s1, s0
+; GFX12-NEXT: s_mov_b32 s2, s0
+; GFX12-NEXT: v_mov_b32_e32 v7, s3
+; GFX12-NEXT: v_mov_b32_e32 v6, s2
+; GFX12-NEXT: v_mov_b32_e32 v5, s1
+; GFX12-NEXT: v_mov_b32_e32 v4, s0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
+  store <4 x
i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll new file mode 100644 index 0000000000000..7e1d09805df3f --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-iu-modifiers.ll @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> 
@llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off 
+; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr 
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp
+; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
new file mode 100644
index 0000000000000..b6f1828dce257
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64-swmmac-index_key.ll
@@ -0,0 +1,472 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v10, v[10:11], off
+; GFX12-NEXT: v_mov_b32_e32 v23, v9
+; GFX12-NEXT: v_mov_b32_e32 v22, v8
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v9
+; GFX12-NEXT: v_mov_b32_e32 v26, v8
+; GFX12-NEXT: v_mov_b32_e32 v25, v7
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v31, v9
+; GFX12-NEXT: v_mov_b32_e32 v30, v8
+; GFX12-NEXT: v_mov_b32_e32 v29, v7
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v10, v[10:11], off
+; GFX12-NEXT: v_mov_b32_e32 v23, v9
+; GFX12-NEXT: v_mov_b32_e32 v22, v8
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v9
+; GFX12-NEXT: v_mov_b32_e32 v26, v8
+; GFX12-NEXT: v_mov_b32_e32 v25, v7
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v31, v9
+; GFX12-NEXT: v_mov_b32_e32 v30, v8
+; GFX12-NEXT: v_mov_b32_e32 v29, v7
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3
+; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off
+; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off
+; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off
+; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v22, v[8:9], off
+; GFX12-NEXT: v_mov_b32_e32 v9, v7
+; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v7
+; GFX12-NEXT: v_mov_b32_e32 v18, v6
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
+; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
+; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
+; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0)
+ store <4 x half> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1)
+ store <4 x half> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2)
+ store <4 x half> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3)
+ store <4 x half> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v22, v[8:9], off
+; GFX12-NEXT: v_mov_b32_e32 v9, v7
+; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v7
+; GFX12-NEXT: v_mov_b32_e32 v18, v6
+; GFX12-NEXT: v_mov_b32_e32 v21, v7
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3
+; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off
+; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off
+; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off
+; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0)
+ store <4 x i16> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1)
+ store <4 x i16> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2)
+ store <4 x i16> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3)
+ store <4 x i16> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0)
+ store <4 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0)
+ store <4 x i32> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0)
+ store <4 x i32> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0)
+ store <4 x i32> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v6, v[6:7], off
+; GFX12-NEXT: v_mov_b32_e32 v15, v5
+; GFX12-NEXT: v_mov_b32_e32 v14, v4
+; GFX12-NEXT: v_mov_b32_e32 v13, v3
+; GFX12-NEXT: v_mov_b32_e32 v12, v2
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1
+; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off
+; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0)
+ store <4 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0)
+ store <4 x i32> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v16, v6
+; GFX12-NEXT: v_mov_b32_e32 v15, v5
+; GFX12-NEXT: v_mov_b32_e32 v14, v4
+; GFX12-NEXT: v_mov_b32_e32 v13, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <2 x i16> %IndexVec, i32 0
+ %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0)
+ store <4 x i32> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <2 x i16> %IndexVec, i32 1
+ %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0)
+ store <4 x i32> %res1, ptr addrspace(1) %out1
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: global_load_b32 v7, v[7:8], off
+; GFX12-NEXT: v_mov_b32_e32 v20, v6
+; GFX12-NEXT: v_mov_b32_e32 v19, v5
+; GFX12-NEXT: v_mov_b32_e32 v18, v4
+; GFX12-NEXT: v_mov_b32_e32 v17, v3
+; GFX12-NEXT: v_mov_b32_e32 v24, v6
+; GFX12-NEXT: v_mov_b32_e32 v23, v5
+; GFX12-NEXT: v_mov_b32_e32 v22, v4
+; GFX12-NEXT: v_mov_b32_e32 v21, v3
+; GFX12-NEXT: v_mov_b32_e32 v28, v6
+; GFX12-NEXT: v_mov_b32_e32 v27, v5
+; GFX12-NEXT: v_mov_b32_e32 v26, v4
+; GFX12-NEXT: v_mov_b32_e32 v25, v3
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3)
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3
+; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off
+; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off
+; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off
+; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4
+ %Index0 = extractelement <4 x i8> %IndexVec, i32 0
+ %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0)
+ store <4 x float> %res0, ptr addrspace(1) %out0
+ %Index1 = extractelement <4 x i8> %IndexVec, i32 1
+ %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1)
+ store <4 x float> %res1, ptr addrspace(1) %out1
+ %Index2 = extractelement <4 x i8> %IndexVec, i32 2
+ %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2)
+ store <4 x float> %res2, ptr addrspace(1) %out2
+ %Index3 = extractelement <4 x i8> %IndexVec, i32 3
+ %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3)
+ store <4 x float> %res3, ptr addrspace(1) %out3
+ ret void
+}
+
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
+declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
+declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
new file mode 100644
index 0000000000000..0d1871a18d405
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/wmma-gfx12-w64.ll
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
+; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0)
+ store <4 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0)
+ store <4 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5]
+; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10
+; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10
+; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f16_16x16x32_f16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8
+; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index)
+ store <4 x half> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
+; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index)
+ store <4 x i16> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6
+; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
+ store <4 x i32> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7
+; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index)
+ store <4 x float> %res, ptr addrspace(1) %out
+ ret void
+}
+
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
+declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
+declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8)
+declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8)
+declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
new file mode 100644
index 0000000000000..fb2af36839b5d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-f16-f32-matrix-modifiers.ll
@@ -0,0 +1,499 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> %C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %fneg.B, <8 x float> %C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16
+; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.A = fneg <8 x half> %A
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %fneg.A, <8 x half> %B, <8 x half> %C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.B = fneg <8 x half> %B
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> %C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x half> %C
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1]
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C)
+ %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fabs.C, i1 0)
+ store <8 x half> %res, <8 x half> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C)
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
+ ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+; GFX12-NEXT: s_clause 0x1
+; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16
+; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+ %fneg.C = fneg <8 x float> %C
+ %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C)
+ store <8 x float> %res, <8 x float> addrspace(1)* %out
addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <8 x float> %C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fneg.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: 
v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x float> %C, i16 %Index) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %fneg.A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <16 x half> %B + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %fneg.B, <8 x half> %C, i16 %Index) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x float> @llvm.fabs.v8f32(<8 x float> %C) + %fneg.fabs.C = fneg <8 x float> %fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <8 x half> @llvm.fabs.v8f16(<8 x half> %C) + 
%fneg.fabs.C = fneg <8 x half> %fabs.C + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.fabs.C, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v11, 0x7fffffff, v11 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <8 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <8 x float> %C, float %el3.fabs, i32 3 + %fneg.partial.fabs.C = fneg <8 x float> %partial.fabs.C + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %fneg.partial.fabs.C) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +; A or B matrix modifier and constant in C + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<8 x half> %A, <8 x half> %B, <8 x float> %C, <8 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <8 x half> %A + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half> %fneg.A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, <8 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<8 x half> %A, <8 x half> %B, <8 x half> %C, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %fneg.B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +; pack f16 elements with v_perm_b32 since they don't come from same b32 + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<8 x half> %A, <8 x half> %B, ptr %Caddr, <8 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: flat_load_b128 v[12:15], v[8:9] offset:16 +; GFX12-NEXT: flat_load_b128 v[16:19], v[8:9] +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x101 +; GFX12-NEXT: v_perm_b32 v15, v15, v14, 0x5040100 +; GFX12-NEXT: v_perm_b32 v14, v13, v12, 0x5040100 +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT:
v_perm_b32 v13, v19, v18, 0x5040100 +; GFX12-NEXT: v_perm_b32 v12, v17, v16, 0x5040100 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[12:15], v[0:3], v[4:7], v[12:15] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[10:11], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %C = load <16 x half>, ptr %Caddr + %C_shuffle = shufflevector <16 x half> %C, <16 x half> undef, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %fneg.C_shuffle = fneg <8 x half> %C_shuffle + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %fneg.C_shuffle , i1 0) + store <8 x half> %res, <8 x half> addrspace(1)* %out + ret void +} + +declare <8 x half> @llvm.fabs.v8f16(<8 x half>) +declare <8 x float> @llvm.fabs.v8f32(<8 x float>) +declare float @llvm.fabs.f32(float) + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll new file mode 100644 index 0000000000000..51f93b57f38e9 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-imm.ll @@ -0,0 +1,431 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 +; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32
v16, v10 +; GFX12-NEXT: v_mov_b32_e32 v17, v10 +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_dual_mov_b32 v13, v10 :: v_dual_mov_b32 v14, v10 +; GFX12-NEXT: v_dual_mov_b32 v15, v10 :: v_dual_mov_b32 v16, v10 +; GFX12-NEXT: v_mov_b32_e32 v17, v10 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[10:17], v[0:3], v[4:7], v[10:17] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[8:9], v[14:17], off offset:16 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>, i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<8 x half> %A, <8 x half> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x42004200 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_mov_b32_e32 v13, v10 +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +;
GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> <half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0, half 3.0>, i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], 1.0 +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256, i16 16256>, i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<8 x i16> %A, <8 x i16> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v10, 0x3fc03fc0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v11, v10 :: v_dual_mov_b32 v12, v10 +; GFX12-NEXT: v_mov_b32_e32 v13, v10 +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[10:13], v[0:3], v[4:7], v[10:13] +; GFX12-NEXT: global_store_b128 v[8:9], v[10:13], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> <i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320, i16 16320>, i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0) + store <8 x i32> %res, ptr addrspace(1)
%out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v5, v4 :: v_dual_mov_b32 v6, v4 +; GFX12-NEXT: v_dual_mov_b32 v7, v4 :: v_dual_mov_b32 v8, v4 +; GFX12-NEXT: v_dual_mov_b32 v9, v4 :: v_dual_mov_b32 v10, v4 +; GFX12-NEXT: v_mov_b32_e32 v11, v4 +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:11], v0, v1, v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[2:3], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps
void @test_wmma_f32_16x16x16_bf8_fp8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>) + store <8 x float> %res,
ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], 1.0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> <float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0, float 3.0>) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], 1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(<2 x i32> %A, <2 x i32> %B, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v6, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 +; GFX12-NEXT: v_dual_mov_b32 v9, v6 :: v_dual_mov_b32 v10, v6 +; GFX12-NEXT: v_dual_mov_b32 v11, v6 :: v_dual_mov_b32 v12, v6 +; GFX12-NEXT: v_mov_b32_e32 v13, v6 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[6:13], v[0:1], v[2:3], v[6:13] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[4:5], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x
i32> <i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128, i32 128>, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll new file mode 100644 index 0000000000000..48297932aa598 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-iu-modifiers.ll @@ -0,0 +1,309 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +;
GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 1, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 1, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define 
amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 1, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 1, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> 
@llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 
v[1:2], v11 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 1, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 1, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 1) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 
immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll new file mode 100644 index 0000000000000..43538768f00fd --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32-swmmac-index_key.ll @@ -0,0 +1,321 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v20, v[20:21], off +; GFX12-NEXT: v_dual_mov_b32 v33, v19 :: v_dual_mov_b32 v32, v18 +; GFX12-NEXT: v_dual_mov_b32 v31, v17 :: v_dual_mov_b32 v30, v16 +; GFX12-NEXT: v_dual_mov_b32 v29, v15 :: v_dual_mov_b32 v28, v14 +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[26:33], v[0:3], v[4:11], v20 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[22:23], v[30:33], off offset:16 +; GFX12-NEXT: global_store_b128 v[22:23], v[26:29], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[24:25], v[16:19], off offset:16 +; 
GFX12-NEXT: global_store_b128 v[24:25], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<8 x half> %A, <16 x half> %B, <8 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index0) + store <8 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index1) + store <8 x half> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v16, v[16:17], off +; GFX12-NEXT: v_dual_mov_b32 v25, v15 :: v_dual_mov_b32 v24, v14 +; GFX12-NEXT: v_dual_mov_b32 v23, v13 :: v_dual_mov_b32 v22, v12 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[22:25], v[0:3], v[4:11], v16 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 +; GFX12-NEXT: global_store_b128 v[18:19], v[22:25], off +; GFX12-NEXT: global_store_b128 v[20:21], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index0) + store <8 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = 
call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index1) + store <8 x i16> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v11, v[11:12], off +; GFX12-NEXT: v_dual_mov_b32 v24, v10 :: v_dual_mov_b32 v23, v9 +; GFX12-NEXT: v_dual_mov_b32 v22, v8 :: v_dual_mov_b32 v21, v7 +; GFX12-NEXT: v_dual_mov_b32 v20, v6 :: v_dual_mov_b32 v19, v5 +; GFX12-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_mov_b32 v17, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[17:24], v0, v[1:2], v11 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[13:14], v[21:24], off offset:16 +; GFX12-NEXT: global_store_b128 v[13:14], v[17:20], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index0, i1 0) + store <8 x i32> %res0, ptr addrspace(1) 
%out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> %C, i16 %Index1, i1 0) + store <8 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> 
@llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v14, v[14:15], off +; GFX12-NEXT: v_dual_mov_b32 v27, v13 :: v_dual_mov_b32 v26, v12 +; GFX12-NEXT: v_dual_mov_b32 v25, v11 :: v_dual_mov_b32 v24, v10 +; GFX12-NEXT: v_dual_mov_b32 v23, v9 :: v_dual_mov_b32 v22, v8 +; GFX12-NEXT: v_dual_mov_b32 v21, v7 :: v_dual_mov_b32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[20:27], v[0:1], v[2:5], v14 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[24:27], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[20:23], off +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[18:19], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr 
addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index0) + store <8 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index1) + store <8 x float> %res1, ptr addrspace(1) %out1 + ret void +} + +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll new file mode 100644 index 0000000000000..2db3b07de54d0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w32.ll @@ -0,0 +1,370 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half> %A, <8 x half> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[16:17], v[12:15], off offset:16 +; GFX12-NEXT: global_store_b128 v[16:17], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: 
s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16> %A, <8 x i16> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half> %A, <8 x half> %B, <8 x half> %C, i1 0) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, i1 0) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[10:11], v[6:9], off offset:16 +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 0, i32 %A, i1 0, i32 %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + 
ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(<2 x i32> %A, <2 x i32> %B, <8 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32> %A, <2 x i32> %B, <8 x float> %C) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(<2 x i32> %A, <2 x i32> %B, <8 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[8:11], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 0, <2 x i32> %A, i1 0, <2 x i32> %B, <8 x i32> %C, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half> %A, <16 x 
half> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[21:22], v[16:19], off offset:16 +; GFX12-NEXT: global_store_b128 v[21:22], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16> %A, <16 x i16> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half> %A, <16 x half> %B, <8 x half> %C, i16 %Index) + store <8 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 +; GFX12-NEXT: global_store_b128 v[17:18], v[12:15], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16> %A, <16 x i16> %B, <8 x i16> %C, i16 %Index) + store <8 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, <2 x i32> %B, <8 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[12:13], v[7:10], off offset:16 +; GFX12-NEXT: global_store_b128 v[12:13], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <8 x i32> 
%C, i16 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(<2 x i32> %A, <4 x i32> %B, <8 x i32> %C, i32 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 0, <2 x i32> %A, i1 0, <4 x i32> %B, <8 x i32> %C, i32 %Index, i1 0) + store <8 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: global_store_b128 v[15:16], v[10:13], off offset:16 +; GFX12-NEXT: global_store_b128 v[15:16], v[6:9], off +; GFX12-NEXT: 
s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32> %A, <4 x i32> %B, <8 x float> %C, i16 %Index) + store <8 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16.v8f16.v8f32(<8 x half>, <8 x half>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v8i16.v8i16.v8f32(<8 x i16>, <8 x i16>, <8 x float>) +declare <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16.v8f16.v8f16(<8 x half>, <8 x half>, <8 x half>, i1 immarg) +declare <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v8i16.v8i16.v8i16(<8 x i16>, <8 x i16>, <8 x i16>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.i32.i32.v8i32(i1 immarg, i32, i1 immarg, i32, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v8f32.v2i32.v2i32.v8f32(<2 x i32>, <2 x i32>, <8 x float>) +declare <8 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v8i32.v2i32.v2i32.v8i32(i1 immarg, <2 x i32>, i1 immarg, <2 x i32>, <8 x i32>, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v8f32.v8f16.v16f16.v8f32.i16(<8 x half>, <16 x half>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v8f32.v8i16.v16i16.v8f32.i16(<8 x i16>, <16 x i16>, <8 x float>, i16) +declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.v8f16.i16(<8 x half>, <16 x half>, <8 x half>, i16) +declare <8 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v8i16.v8i16.v16i16.v8i16.i16(<8 x i16>, <16 x i16>, <8 x i16>, i16) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v8i32.v2i32.v4i32.v8i32.i16(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v8i32.i32.v2i32.v8i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <8 x i32>, i16 %Index, i1 immarg) +declare <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v8i32.v2i32.v4i32.v8i32.i32(i1 immarg, <2 x i32>, i1 immarg, <4 x i32>, <8 x i32>, i32 %Index, i1 immarg) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) +declare <8 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v8f32.v2i32.v4i32.v8f32.i16(<2 x i32>, <4 x i32>, <8 x float>, i16) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll new file mode 100644 index 0000000000000..1f2ffaf762712 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-f16-f32-matrix-modifiers.ll @@ -0,0 +1,456 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %fneg.B, <4 x float> %C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_negC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> 
%fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_absC(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negA(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %fneg.A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <4 x half> %B + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> %C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x half> %C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_absC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + 
ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_absC: +; 
GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_negC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_negC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.C = fneg <4 x float> %C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fneg.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_absC(i32 %A, i32 %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_absC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32 %A, i32 %B, <4 x float> %fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x float> %C, i16 %Index, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x float> %C, i16 %Index) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negA(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negA: +; GFX12: ; %bb.0: ; %bb +; 
GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] neg_hi:[1,0,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.A = fneg <4 x half> %A + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %fneg.A, <8 x half> %B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_negB(<4 x half> %A, <8 x half> %B, <4 x half> %C, i16 %Index, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_negB: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] neg_hi:[0,1,0] +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fneg.B = fneg <8 x half> %B + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half> %A, <8 x half> %fneg.B, <4 x half> %C, i16 %Index) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +; both neg and abs patterns (wmma matrix C f32 or f16 ) + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x float> @llvm.fabs.v4f32(<4 x float> %C) + %fneg.fabs.C = fneg <4 x float> %fabs.C + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.fabs.C) + store <4 x float> %res, <4 x float> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negabsC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negabsC: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] neg_hi:[0,0,1] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %fabs.C = call <4 x half> @llvm.fabs.v4f16(<4 x half> %C) + %fneg.fabs.C = fneg <4 x half> %fabs.C + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.fabs.C, i1 0) + store <4 x half> %res, <4 x half> addrspace(1)* %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16_neg_partial_fabsA(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16_neg_partial_fabsA: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_and_b32_e32 v7, 0x7fffffff, v7 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %el3 = extractelement <4 x float> %C, i32 3 + %el3.fabs = call float @llvm.fabs.f32(float %el3) + %partial.fabs.C = insertelement <4 x 
float> %C, float %el3.fabs, i32 3
+  %fneg.partial.fabs.C = fneg <4 x float> %partial.fabs.C
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %fneg.partial.fabs.C)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+; A or B matrix modifier and constant in C
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_negA_constantC(<4 x half> %A, <4 x half> %B, <4 x float> %C, <4 x float> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_negA_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0 neg_lo:[1,0,0] neg_hi:[1,0,0]
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %fneg.A = fneg <4 x half> %A
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half> %fneg.A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, <4 x float> addrspace(1)* %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negB_constantC(<4 x half> %A, <4 x half> %B, <4 x half> %C, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negB_constantC:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0 neg_lo:[0,1,0] neg_hi:[0,1,0]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %fneg.B = fneg <4 x half> %B
+  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %fneg.B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+; pack f16 elements with v_perm_b32 since they don't come from same b32
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_negC_pack(<4 x half> %A, <4 x half> %B, ptr %Caddr, <4 x half> addrspace(1)* %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_negC_pack:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: flat_load_b128 v[8:11], v[4:5]
+; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX12-NEXT: v_perm_b32 v5, v11, v10, 0x5040100
+; GFX12-NEXT: v_perm_b32 v4, v9, v8, 0x5040100
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %C = load <8 x half>, ptr %Caddr
+  %C_shuffle = shufflevector <8 x half> %C, <8 x half> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+  %fneg.C_shuffle = fneg <4 x half> %C_shuffle
+  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half> %A, <4 x half> %B, <4 x half> %fneg.C_shuffle, i1 0)
+  store <4 x half> %res, <4 x half> addrspace(1)* %out
+  ret void
+}
+
+declare <4 x half> @llvm.fabs.v4f16(<4 x half>)
+declare <4 x float> @llvm.fabs.v4f32(<4 x float>)
+declare float @llvm.fabs.f32(float)
+
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>)
+declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.i32.v4f32(i32, i32, <4 x float>)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f16.v8f16.v4f32.i16(<4 x half>, <8 x half>, <4 x float>, i16)
+declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v8f16.v4f16.i16(<4 x half>, <8 x half>, <4 x half>, i16)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
new file mode 100644
index 0000000000000..fa0a7c98cea32
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-imm.ll
@@ -0,0 +1,373 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v7, v6
+; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_mov_b32_e32 v9, v6
+; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v7, v6
+; GFX12-NEXT: v_mov_b32_e32 v8, v6
+; GFX12-NEXT: v_mov_b32_e32 v9, v6
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[6:9], v[0:1], v[2:3], v[6:9]
+; GFX12-NEXT: global_store_b128 v[4:5], v[6:9], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 1.0, half 1.0, half 1.0, half 1.0>, i1 0)
+  store <4 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f16_16x16x16_f16_imm_non_inlineable(<4 x half> %A, <4 x half> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f16_16x16x16_f16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x42004200
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v7, v6
+; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> <half 3.0, half 3.0, half 3.0, half 3.0>, i1 0)
+  store <4 x half> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], 1.0
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16256, i16 16256, i16 16256, i16 16256>, i1 0)
+  store <4 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16_imm_non_inlineable(<4 x i16> %A, <4 x i16> %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v6, 0x3fc03fc0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v7, v6
+; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[6:7], v[0:1], v[2:3], v[6:7]
+; GFX12-NEXT: global_store_b64 v[4:5], v[6:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> <i16 16320, i16 16320, i16 16320, i16 16320>, i1 0)
+  store <4 x i16> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, 1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
+  store <4 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
+  store <4 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, 1
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 1, i32 1, i32 1, i32 1>, i1 0)
+  store <4 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_imm_non_inlineable(i32 %A, i32 %B, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x80
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> <i32 128, i32 128, i32 128, i32 128>, i1 0)
+  store <4 x i32> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8_imm_non_inlineable:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v5, v4
+; GFX12-NEXT: v_mov_b32_e32 v6, v4
+; GFX12-NEXT: v_mov_b32_e32 v7, v4
+; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[4:7], v0, v1, v[4:7]
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
+bb:
+  %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> <float 3.0, float 3.0, float 3.0, float 3.0>)
+  store <4 x float> %res, ptr addrspace(1) %out
+  ret void
+}
+
+define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
+; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm:
+; GFX12: ; %bb.0: ; %bb
+; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, 1.0
+; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x40400000 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> ) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, 1 +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_imm_non_inlineable(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_imm_non_inlineable: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_mov_b32_e32 v4, 0x80 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_mov_b32_e32 v5, v4 +; GFX12-NEXT: v_mov_b32_e32 v6, v4 +; GFX12-NEXT: v_mov_b32_e32 v7, v4 +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[4:7], v0, v1, v[4:7] +; GFX12-NEXT: global_store_b128 v[2:3], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> , i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x 
float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll new file mode 100644 index 0000000000000..803453e45ced0 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-iu-modifiers.ll @@ -0,0 +1,274 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; 
GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: 
test_swmmac_i32_16x16x32_iu8_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src0(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 1, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_zext_src1(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 1, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_clamp(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + + + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src0(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src0: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm 
+bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 1, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_zext_src1(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_zext_src1: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 1, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_clamp(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_clamp: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 1) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll new file mode 100644 index 0000000000000..0c5b19a8416fb --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64-swmmac-index_key.ll @@ -0,0 +1,472 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 
+; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v10, v[10:11], off +; GFX12-NEXT: v_mov_b32_e32 v23, v9 +; GFX12-NEXT: v_mov_b32_e32 v22, v8 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v9 +; GFX12-NEXT: v_mov_b32_e32 v26, v8 +; GFX12-NEXT: v_mov_b32_e32 v25, v7 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v31, v9 +; GFX12-NEXT: v_mov_b32_e32 v30, v8 +; GFX12-NEXT: v_mov_b32_e32 v29, v7 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[20:23], v[0:1], v[2:5], v10 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[24:27], v[0:1], v[2:5], v10 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[28:31], v[0:1], v[2:5], v10 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +; GFX12-NEXT: global_store_b128 v[12:13], v[20:23], off +; GFX12-NEXT: global_store_b128 v[14:15], v[24:27], off +; GFX12-NEXT: global_store_b128 v[16:17], v[28:31], off +; GFX12-NEXT: global_store_b128 v[18:19], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr 
addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16_index_key(<4 x half> %A, <8 x half> %B, <4 x half> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index0) + store <4 x half> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index1) + store <4 x half> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index2) + store <4 x half> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index3) + store <4 x half> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16_index_key(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, ptr 
addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v22, v[8:9], off +; GFX12-NEXT: v_mov_b32_e32 v9, v7 +; GFX12-NEXT: v_mov_b32_e32 v8, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v7 +; GFX12-NEXT: v_mov_b32_e32 v18, v6 +; GFX12-NEXT: v_mov_b32_e32 v21, v7 +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[8:9], v[0:1], v[2:5], v22 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[18:19], v[0:1], v[2:5], v22 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[20:21], v[0:1], v[2:5], v22 index_key:2 +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v22 index_key:3 +; GFX12-NEXT: global_store_b64 v[10:11], v[8:9], off +; GFX12-NEXT: global_store_b64 v[12:13], v[18:19], off +; GFX12-NEXT: global_store_b64 v[14:15], v[20:21], off +; GFX12-NEXT: global_store_b64 v[16:17], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index0) + store <4 x i16> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index1) + store <4 x i16> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index2) + store <4 x i16> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index3) + store <4 x i16> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], 
v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index2, i1 0) + store <4 x i32> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index3, i1 0) + store <4 x i32> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4_index_key(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v6, v[6:7], off +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: v_mov_b32_e32 v12, v2 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[12:15], v0, v1, v6 +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +; GFX12-NEXT: global_store_b128 v[8:9], v[12:15], off +; GFX12-NEXT: global_store_b128 v[10:11], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4_index_key(i32 %A, <2 x i32> %B, <4 x i32> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v16, v6 +; GFX12-NEXT: v_mov_b32_e32 v15, v5 +; GFX12-NEXT: v_mov_b32_e32 v14, v4 +; GFX12-NEXT: v_mov_b32_e32 v13, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[13:16], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: global_store_b128 v[9:10], v[13:16], off +; GFX12-NEXT: 
global_store_b128 v[11:12], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <2 x i16>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <2 x i16> %IndexVec, i32 0 + %res0 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index0, i1 0) + store <4 x i32> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <2 x i16> %IndexVec, i32 1 + %res1 = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index1, i1 0) + store <4 x i32> %res1, ptr addrspace(1) %out1 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, 
ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: 
v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) %out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8_index_key(i32 %A, <2 x i32> %B, <4 x float> %C, ptr addrspace(1) %IndexVecPtr, ptr addrspace(1) %out0, ptr addrspace(1) %out1, ptr addrspace(1) %out2, ptr addrspace(1) %out3) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8_index_key: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: global_load_b32 v7, v[7:8], off +; GFX12-NEXT: v_mov_b32_e32 v20, v6 +; GFX12-NEXT: v_mov_b32_e32 v19, v5 +; GFX12-NEXT: v_mov_b32_e32 v18, v4 +; GFX12-NEXT: v_mov_b32_e32 v17, v3 +; GFX12-NEXT: v_mov_b32_e32 v24, v6 +; GFX12-NEXT: v_mov_b32_e32 v23, v5 +; GFX12-NEXT: v_mov_b32_e32 v22, v4 +; GFX12-NEXT: v_mov_b32_e32 v21, v3 +; GFX12-NEXT: v_mov_b32_e32 v28, v6 +; GFX12-NEXT: v_mov_b32_e32 v27, v5 +; GFX12-NEXT: v_mov_b32_e32 v26, v4 +; GFX12-NEXT: v_mov_b32_e32 v25, v3 +; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[17:20], v0, v[1:2], v7 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[21:24], v0, v[1:2], v7 index_key:1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[25:28], v0, v[1:2], v7 index_key:2 +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +; GFX12-NEXT: global_store_b128 v[9:10], v[17:20], off +; GFX12-NEXT: global_store_b128 v[11:12], v[21:24], off +; GFX12-NEXT: global_store_b128 v[13:14], v[25:28], off +; GFX12-NEXT: global_store_b128 v[15:16], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %IndexVec = load <4 x i8>, ptr addrspace(1) %IndexVecPtr, align 4 + %Index0 = extractelement <4 x i8> %IndexVec, i32 0 + %res0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index0) + store <4 x float> %res0, ptr addrspace(1) %out0 + %Index1 = extractelement <4 x i8> %IndexVec, i32 1 + %res1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index1) + store <4 x float> %res1, ptr addrspace(1) 
%out1 + %Index2 = extractelement <4 x i8> %IndexVec, i32 2 + %res2 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index2) + store <4 x float> %res2, ptr addrspace(1) %out2 + %Index3 = extractelement <4 x i8> %IndexVec, i32 3 + %res3 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index3) + store <4 x float> %res3, ptr addrspace(1) %out3 + ret void +} + +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x i16>, <4 x i16>, i8) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8) diff --git a/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll new file mode 100644 index 0000000000000..b25f278513331 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/wmma-gfx12-w64.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -march=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -verify-machineinstrs < %s | FileCheck %s --check-prefix=GFX12 + +define amdgpu_ps void @test_wmma_f32_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float>@llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A, <4 x half> %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] +; GFX12-NEXT: global_store_b128 v[8:9], v[4:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16> %A, <4 x i16> %B, <4 
x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f16_16x16x16_f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f16_16x16x16_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half> %A, <4 x half> %B, <4 x half> %C, i1 0) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_bf16_16x16x16_bf16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_bf16_16x16x16_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] +; GFX12-NEXT: global_store_b64 v[6:7], v[4:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, i1 0) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x16_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x16_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> 
@llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_fp8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_f32_16x16x16_bf8_bf8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_f32_16x16x16_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_wmma_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) { +; GFX12-LABEL: test_wmma_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +; GFX12-NEXT: global_store_b128 v[6:7], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +; GFX12-NEXT: global_store_b128 v[11:12], v[6:9], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16> %A, <8 x i16> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f16_16x16x32_f16(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f16_16x16x32_f16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half> %A, <8 x half> %B, <4 x half> %C, i8 %Index) + store <4 x half> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_bf16_16x16x32_bf16(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_bf16_16x16x32_bf16: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 +; GFX12-NEXT: global_store_b64 v[9:10], v[6:7], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16> %A, <8 x i16> %B, <4 x i16> %C, i8 %Index) + store <4 x i16> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu8(i32 %A, <2 x i32> %B, <4 x i32> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i8 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x32_iu4(i32 %A, i32 %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x32_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +; GFX12-NEXT: global_store_b128 v[7:8], v[2:5], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_i32_16x16x64_iu4(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_i32_16x16x64_iu4: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0) + store <4 x i32> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_fp8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr 
addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_fp8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_fp8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_fp8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +define amdgpu_ps void @test_swmmac_f32_16x16x32_bf8_bf8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) { +; GFX12-LABEL: test_swmmac_f32_16x16x32_bf8_bf8: +; GFX12: ; %bb.0: ; %bb +; GFX12-NEXT: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +; GFX12-NEXT: global_store_b128 v[8:9], v[3:6], off +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm +bb: + %res = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32 %A, <2 x i32> %B, <4 x float> %C, i8 %Index) + store <4 x float> %res, ptr addrspace(1) %out + ret void +} + +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v4i16.v4i16.v4f32(<4 x i16>, <4 x i16>, <4 x float>) +declare <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16.v4f16.v4f16(<4 x half>, <4 x half>, <4 x half>, i1 immarg) +declare <4 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v4i16.v4i16.v4i16.v4i16(<4 x i16>, <4 x i16>, <4 x i16>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf8.bf8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>) +declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x32.iu4.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8) +declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf16.v4f32.v4i16.v8i16.v4f32.i8(<4 x i16>, <8 x i16>, <4 x float>, i8) +declare <4 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v4f16.v4f16.v8f16.v4f16.i8(<4 x half>, <8 x half>, <4 x half>, i8) +declare <4 x i16> @llvm.amdgcn.swmmac.bf16.16x16x32.bf16.v4i16.v4i16.v8i16.v4i16.i8(<4 x i16>, <8 x 
i16>, <4 x i16>, i8)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu8.v4i32.i32.v2i32.v4i32.i8(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i8 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x32.iu4.v4i32.i32.i32.v4i32.i16(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.fp8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.fp8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
+declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.bf8.bf8.v4f32.i32.v2i32.v4f32.i8(i32, <2 x i32>, <4 x float>, i8)
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
new file mode 100644
index 0000000000000..47a1e06c5d7dc
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w32.mir
@@ -0,0 +1,351 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+
+# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
+# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0
+# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1
+
+---
+name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+
+    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1
+    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+    ; GFX12-NEXT: {{ $}}
+    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GFX12-NEXT: V_NOP_e32 implicit $exec
+    ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+    early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+...
+
+---
+name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+
+    ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1
+    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45
+    ; GFX12-NEXT: {{ $}}
+    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GFX12-NEXT: V_NOP_e32 implicit $exec
+    ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+    early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43 = V_WMMA_F32_16X16X16_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43, 0, 0, implicit $exec
+...
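The shape these hazard tests reduce to is easiest to see at the IR level, before register allocation. Below is a minimal illustrative sketch, not part of the patch: the function name @wmma_chain is hypothetical, and the intrinsic signature is copied verbatim from the declares in the .ll test above. The second wmma consumes the first one's result, so after register allocation D0 can be assigned VGPRs that the following WMMA reads, which is precisely the overlap the post-RA hazard recognizer breaks with V_NOP_e32.

; Illustrative sketch (not from this patch): D0 of the first WMMA feeds
; the second WMMA, mirroring the *_D0_overlaps_* cases in this file.
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half>, <4 x half>, <4 x float>)

define amdgpu_ps void @wmma_chain(<4 x half> %A0, <4 x half> %B0, <4 x float> %C0, <4 x half> %A1, <4 x half> %B1, ptr addrspace(1) %out) {
bb:
  %D0 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A0, <4 x half> %B0, <4 x float> %C0)
  %D1 = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16.v4f16.v4f32(<4 x half> %A1, <4 x half> %B1, <4 x float> %D0)
  store <4 x float> %D1, ptr addrspace(1) %out
  ret void
}

Note the pattern in the checks: overlaps on A1, B1 or Index1 draw a V_NOP_e32 between the two instructions, while the D0-overlaps-C1 cases below are emitted back to back, C being the two-address accumulator input.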
+ +--- +name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F16_16X16X16_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_BF16_16X16X16_BF16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35, 8, killed $vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... 
+--- +name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X16_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_I32_16X16X16_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec +... 
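For reference, the integer variants exercised by the two tests above carry i1 immarg flags that the floating-point forms lack. Judging from the declares in the .ll test earlier in this patch, the two leading bits select signed interpretation of A and B and the trailing bit requests clamping, though that reading is an assumption here. A minimal sketch with a hypothetical function name, passing the same i1 0 flags the tests use:

; Illustrative sketch (not from this patch): unsigned iu8 WMMA, no clamp.
declare <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 immarg, i32, i1 immarg, i32, <4 x i32>, i1 immarg)

define amdgpu_ps void @wmma_iu8(i32 %A, i32 %B, <4 x i32> %C, ptr addrspace(1) %out) {
bb:
  %D = call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.i32.i32.v4i32(i1 0, i32 %A, i1 0, i32 %B, <4 x i32> %C, i1 0)
  store <4 x i32> %D, ptr addrspace(1) %out
  ret void
}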
+ +--- +name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_FP8_FP8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_F32_16X16X16_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... 
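The fp8/bf8 variants exercised above take their A and B fragments as packed 32-bit words, four 8-bit elements per i32, which is why the wave32 machine form reads paired registers such as $vgpr0_vgpr1 where the f16 form reads four. A minimal sketch (hypothetical function name; the signature is copied from the declares earlier in this patch):

; Illustrative sketch (not from this patch): fp8 fragments travel packed
; into i32 words rather than as vectors of a dedicated 8-bit float type.
declare <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32, i32, <4 x float>)

define amdgpu_ps void @wmma_fp8(i32 %A, i32 %B, <4 x float> %C, ptr addrspace(1) %out) {
bb:
  %D = call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.fp8.fp8.v4f32.i32.i32.v4f32(i32 %A, i32 %B, <4 x float> %C)
  store <4 x float> %D, ptr addrspace(1) %out
  ret void
}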
+ +--- +name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39 = V_WMMA_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr0_vgpr1, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_F16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr44, 0, 0, 0, implicit $exec +... 
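The swmmac cases add a fourth input to track: the index operand, which presumably carries the sparsity selection for the compressed A fragment (A is half the width of B in the signatures above) and which can itself be overlapped by D0, as the Index1 tests below show. A minimal sketch of the D0-feeds-C1 shape the preceding test covers, with a hypothetical function name and the signature copied verbatim from the declares:

; Illustrative sketch (not from this patch): two dependent SWMMACs;
; %Index is taken here to hold the metadata for the compressed A operand.
declare <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half>, <8 x half>, <4 x float>, i8)

define amdgpu_ps void @swmmac_chain(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index, ptr addrspace(1) %out) {
bb:
  %D0 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %C, i8 %Index)
  %D1 = call <4 x float> @llvm.amdgcn.swmmac.f32.16x16x32.f16.v4f32.v4f16.v8f16.v4f32.i8(<4 x half> %A, <8 x half> %B, <4 x float> %D0, i8 %Index)
  store <4 x float> %D1, ptr addrspace(1) %out
  ret void
}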
+ +--- +name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46, $vgpr47, $vgpr48, $vgpr49, $vgpr50 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47 = V_SWMMAC_F32_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47, killed $vgpr0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + + ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_F16_16X16X32_F16_w32_twoaddr 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 8, killed $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + + ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44, $vgpr45, $vgpr46 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr40_vgpr41_vgpr42_vgpr43 = V_SWMMAC_BF16_16X16X32_BF16_w32_twoaddr 8, killed $vgpr28_vgpr29_vgpr30_vgpr31, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr40_vgpr41_vgpr42_vgpr43, killed $vgpr44, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_I32_16X16X32_IU8_w32_twoaddr 8, killed $vgpr28_vgpr29, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38 = V_SWMMAC_I32_16X16X32_IU4_w32_twoaddr 8, killed $vgpr28, 8, killed $vgpr28_vgpr29, killed $vgpr31_vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38, killed $vgpr0, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_I32_16X16X64_IU4_w32_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, 0, implicit $exec +... 
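Alongside the 16x16x32 sparse iu4 form, the test above exercises a 16x16x64 variant that doubles K by widening B to <2 x i32> and the index to i16. A minimal sketch (hypothetical function name; signature copied verbatim from the declares earlier in this patch, with the same unsigned/no-clamp i1 0 flags the tests use):

; Illustrative sketch (not from this patch): sparse 4-bit multiply at K=64.
declare <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 immarg, i32, i1 immarg, <2 x i32>, <4 x i32>, i16 %Index, i1 immarg)

define amdgpu_ps void @swmmac_iu4_k64(i32 %A, <2 x i32> %B, <4 x i32> %C, i16 %Index, ptr addrspace(1) %out) {
bb:
  %D = call <4 x i32> @llvm.amdgcn.swmmac.i32.16x16x64.iu4.v4i32.i32.v2i32.v4i32.i16(i1 0, i32 %A, i1 0, <2 x i32> %B, <4 x i32> %C, i16 %Index, i1 0)
  store <4 x i32> %D, ptr addrspace(1) %out
  ret void
}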
+ +--- +name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_FP8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_SWMMAC_F32_16X16X32_FP8_BF8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, killed $vgpr42, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec + early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_FP8_w32_twoaddr killed $vgpr28_vgpr29, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr0, 0, implicit $exec +... 
+
+---
+name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
+body: |
+  bb.0:
+    liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+
+    ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1
+    ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28, $vgpr29, $vgpr30, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36, $vgpr37, $vgpr38, $vgpr39, $vgpr40, $vgpr41, $vgpr42, $vgpr43, $vgpr44
+    ; GFX12-NEXT: {{ $}}
+    ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    ; GFX12-NEXT: V_NOP_e32 implicit $exec
+    ; GFX12-NEXT: early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
+    early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = V_WMMA_F32_16X16X16_F16_w32_twoaddr 8, killed $vgpr10_vgpr11_vgpr12_vgpr13, 8, killed $vgpr14_vgpr15_vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, 0, 0, implicit $exec
+    early-clobber renamable $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41 = V_SWMMAC_F32_16X16X32_BF8_BF8_w32_twoaddr killed $vgpr0_vgpr1, killed $vgpr30_vgpr31_vgpr32_vgpr33, killed $vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41, killed $vgpr42, 0, implicit $exec
+...
diff --git a/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
new file mode 100644
index 0000000000000..34c37aa91ab80
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/wmma-hazards-gfx12-w64.mir
@@ -0,0 +1,352 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -march=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=GFX12 %s
+
+# D0 overlaps A1, B1, C1 or Index1. Overlap starts at vgpr0.
+# $D0 = wmma0 $A0, $B0, $C0 or $D0 = swmmac0 $A0, $B0, $C0, $Index0 +# $D1 = wmma1 $A1, $B1, $C1 or $D1 = swmmac1 $A1, $B1, $C1, $Index1 + +--- +name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21_vgpr22_vgpr23 = V_WMMA_F32_16X16X16_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1, 8, killed $vgpr20_vgpr21_vgpr22_vgpr23, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f16_16x16x16_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1 = V_WMMA_F16_16X16X16_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19, 8, killed $vgpr0_vgpr1, 0, 0, implicit $exec +... + +--- +name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_bf16_16x16x16_bf16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr20_vgpr21 = V_WMMA_BF16_16X16X16_BF16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19, 8, killed $vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X16_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x16_iu4_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_I32_16X16X16_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_fp8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_FP8_FP8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec +... + +--- +name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_f32_16x16x16_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_F32_16X16X16_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... 
+ +--- +name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + + ; GFX12-LABEL: name: test_wmma_i32_16x16x32_iu4_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_WMMA_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr0, killed $vgpr18_vgpr19_vgpr20_vgpr21, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_f16_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_F16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr26, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf16_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26, $vgpr27, $vgpr28 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23_vgpr24_vgpr25 = V_SWMMAC_F32_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23_vgpr24_vgpr25, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + + ; GFX12-LABEL: name: test_swmmac_f16_16x16x32_f16_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_F16_16X16X32_F16_w64_twoaddr 8, killed $vgpr0_vgpr1, 8, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + + ; GFX12-LABEL: name: test_swmmac_bf16_16x16x32_bf16_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25, $vgpr26 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr22_vgpr23 = V_SWMMAC_BF16_16X16X32_BF16_w64_twoaddr 8, killed $vgpr16_vgpr17, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr22_vgpr23, killed $vgpr24, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_I32_16X16X32_IU8_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x32_iu4_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr18_vgpr19_vgpr20_vgpr21 = V_SWMMAC_I32_16X16X32_IU4_w64_twoaddr 8, killed $vgpr16, 8, killed $vgpr16, killed $vgpr18_vgpr19_vgpr20_vgpr21, killed $vgpr0, 0, 0, 0, implicit $exec +... + +--- +name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_i32_16x16x64_iu4_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_I32_16X16X64_IU4_w64_twoaddr 8, killed $vgpr0, 8, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, 0, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_fp8_D0_overlaps_B1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_FP8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr0_vgpr1, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_fp8_bf8_D0_overlaps_C1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_SWMMAC_F32_16X16X32_FP8_BF8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr0_vgpr1_vgpr2_vgpr3, killed $vgpr23, 0, implicit $exec +... 
+ +--- +name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_fp8_D0_overlaps_Index1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_FP8_w64_twoaddr killed $vgpr16, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr0, 0, implicit $exec +... + +--- +name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 +body: | + bb.0: + liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + + ; GFX12-LABEL: name: test_swmmac_f32_16x16x32_bf8_bf8_D0_overlaps_A1 + ; GFX12: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24, $vgpr25 + ; GFX12-NEXT: {{ $}} + ; GFX12-NEXT: early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + ; GFX12-NEXT: V_NOP_e32 implicit $exec + ; GFX12-NEXT: early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec + early-clobber renamable $vgpr0_vgpr1_vgpr2_vgpr3 = V_WMMA_F32_16X16X16_F16_w64_twoaddr 8, killed $vgpr6_vgpr7, 8, killed $vgpr8_vgpr9, 8, killed $vgpr0_vgpr1_vgpr2_vgpr3, 0, 0, implicit $exec + early-clobber renamable $vgpr19_vgpr20_vgpr21_vgpr22 = V_SWMMAC_F32_16X16X32_BF8_BF8_w64_twoaddr killed $vgpr0, killed $vgpr17_vgpr18, killed $vgpr19_vgpr20_vgpr21_vgpr22, killed $vgpr23, 0, implicit $exec +... 
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s new file mode 100644 index 0000000000000..e1cd0cab66347 --- /dev/null +++ b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w32.s @@ -0,0 +1,1529 @@ +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_f16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0, v[8:15] +// GFX12-ERR: 
:[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3], v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], 1.0, v[4:7], v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0, v[8:15] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: 
error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_f16_16x16x16_f16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
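+// SGPRs are rejected in every matrix operand, and inline constants are
+// rejected for the A and B matrices; only the accumulator (src2) accepts an
+// inline constant, as the next two checks show.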
+v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 +// GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3], v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0, v[4:7], v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0, v[8:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + 
+v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], 1, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
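+
+// Unlike the floating-point variants, the integer iu8/iu4 forms accept the
+// clamp modifier; they reject neg_hi entirely and allow neg_lo only on the
+// A and B matrices, as the checks above reflect.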
+ +v_wmma_i32_16x16x16_iu4 v[2:9], s0, v1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], 1, v1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1, v[2:9] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
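+
+// For the fp8/bf8 variants, source negation applies only to the accumulator:
+// neg_lo/neg_hi bits set for the A or B matrices are rejected, while [0,0,1]
+// is accepted, as the following checks show.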
+ +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1], v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0, v[2:3], v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0, v[4:11] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0, v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0, v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1
+// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11]
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0]
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0]
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1], v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], 1, v[2:3], v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1, v[4:11]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a]
+
+v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b]
+
+
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3], v[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], 1.0, v[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0, v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0]
+// GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c]
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3], v[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0, v[4:11], v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0, v20
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3], v[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], 1.0, v[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0, v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3], v[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0, v[4:11], v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0, v16
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c]
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], 1, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0]
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0]
+// GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c]
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], s0, v[1:2], v11
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1], v11
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], 1, v[1:2], v11
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1, v11
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c]
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], 1, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14
+// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1
+// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:2
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:3
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1], v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0, v[2:5], v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0, v14
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s
new file mode 100644
index 0000000000000..8bd9e5039b720
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx12_asm_wmma_w64.s
@@ -0,0 +1,1529 @@
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
+// RUN: not llvm-mc -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 %s 2>&1 | FileCheck --check-prefix=GFX12-ERR --implicit-check-not=error: %s
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_f16 v[4:7], s[0:1], v[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], 1.0, v[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0, v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1
+// GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1]
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1], v[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], 1.0, v[2:3], v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0, v[4:7]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1
+// GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1]
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_f16_16x16x16_f16 v[4:5], s[0:1], v[2:3], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[2:3], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], 1.0, v[2:3], v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0, v[4:5]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b]
+
+v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1
+// GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5]
+// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+ +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] + +v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1], v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0, v[2:3], v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0, v[4:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] + +v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 +// GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] + + + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], 1, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x16_iu4 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], 1, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 +// GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] + + + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_wmma_i32_16x16x32_iu4 v[2:5], s0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, s1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], 1.0, v1, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1.0, v[2:5] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + +v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + + + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1], v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], 1.0, v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0, v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] +// GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand + +v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1], v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0, v[2:5], v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0, v10 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 +// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0]
+// GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1], v[2:5], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], 1.0, v[2:5], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0, v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 clamp
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0]
+// GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c]
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_hi operand
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1], v[2:5], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0, v[2:5], v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0, v8
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+
+
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0]
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0]
+// GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c]
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+
+v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1]
+// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand.
+ +v_swmmac_i32_16x16x32_iu8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], 1, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x32_iu4 v[2:5], s0, v1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], 1, v1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1, v6 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: out of range index_key + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid neg_lo operand + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_i32_16x16x64_iu4 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], 1, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + + + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 clamp +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[0,1,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 op_sel_hi:[1,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 +// GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_lo:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[1,0,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,1,0] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 neg_hi:[0,0,1] +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: not a valid operand. 
+ +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0, v[1:2], v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0, v7 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1 +// GFX12-ERR: :[[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt new file mode 100644 index 0000000000000..5079d2f089656 --- /dev/null +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w32.txt @@ -0,0 +1,1628 @@ +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s | FileCheck --check-prefix=GFX12 %s +# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x40,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x40,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x58,0x40,0xcc,0x00,0x09,0x22,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: 
[0x08,0x41,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x40,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x40,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_f16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x40,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x41,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x41,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,1,0] ; encoding: 
[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], v[8:15] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x41,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], s[8:15]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x41,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], 1.0/*Invalid immediate*/, v[4:7], v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], 1.0/*Invalid immediate*/, v[8:15] ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf16 v[8:15], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x41,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x42,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x00,0x42,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: 
warning: invalid instruction encoding + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x42,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x42,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f16_16x16x16_f16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x42,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0xc0,0x43,0xcc,0x00,0x09,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x48,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x50,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x60,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + 
+[0x08,0x00,0x43,0xcc,0x00,0x09,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[1,0,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x3c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,1,0] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x5c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_lo:[0,0,1] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x9c] + +[0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[1,0,0] ; encoding: [0x08,0x41,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,1,0] ; encoding: [0x08,0x42,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], v[8:11] neg_hi:[0,0,1] ; encoding: [0x08,0x44,0x43,0xcc,0x00,0x09,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] # sgpr src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] # sgpr src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], s[4:7]/*Invalid register, operand has 'VReg_128' register class*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] # sgpr src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], s[8:11]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x22,0x18] + +[0x08,0x40,0x43,0xcc,0xf2,0x08,0x22,0x1c] # 1.0 src0 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], 1.0/*Invalid immediate*/, v[4:7], v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x08,0x22,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], 1.0/*Invalid immediate*/, v[8:11] ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x01,0x20,0x1c] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1.0 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0xca,0x1b] + +[0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_bf16_16x16x16_bf16 v[8:11], v[0:3], v[4:7], 1 ; encoding: [0x08,0x40,0x43,0xcc,0x00,0x09,0x06,0x1a] + + + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x44,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x04,0x50,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x44,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x44,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x44,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0x06,0x1a] + +[0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x44,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c] + +[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] + 
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, v[2:9] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, s1/*Invalid register, operand has 'VGPR_32' register class*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, s[0:7]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], 1/*Invalid immediate*/, v1, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, 1/*Invalid immediate*/, v[2:9] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x16_iu4 v[2:9], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x46,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # 
op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x46,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x46,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x46,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x46,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x46,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x48,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: 
warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x48,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x48,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x48,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x48,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x48,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: 
[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x47,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x47,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x47,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x47,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x47,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x47,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] +# 
GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x49,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x49,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x49,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x9c] + +[0x04,0x41,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], v[4:11] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x49,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x49,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], 1.0/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], 1.0/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x01,0x10,0x1c] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x49,0xcc,0x00,0x05,0xca,0x1b] + +[0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[4:11], v[0:1], v[2:3], 1 ; encoding: 
[0x04,0x40,0x49,0xcc,0x00,0x05,0x06,0x1a] + + + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] # clamp +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] clamp ; encoding: [0x04,0xc0,0x4a,0xcc,0x00,0x05,0x12,0x1c] + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x60,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x00,0x4a,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x48,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:1 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x50,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x58,0x4a,0xcc,0x00,0x05,0x12,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x3c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0] +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], v[4:11] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x5c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x41,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x42,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x44,0x4a,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] # sgpr src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] # sgpr src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], s[2:3]/*Invalid register, operand has 'VReg_64' register class*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] # sgpr src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/ ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x12,0x18] + +[0x04,0x40,0x4a,0xcc,0x81,0x04,0x12,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], 1/*Invalid immediate*/, v[2:3], v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x01,0x04,0x12,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x03,0x11,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], 1/*Invalid immediate*/, v[4:11] ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x03,0x10,0x1c] + +[0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0x06,0x1a] + 
+[0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[4:11], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x4a,0xcc,0x00,0x05,0xca,0x1b] + + + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x50,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x50,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x50,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x50,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x50,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] # sgpr src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x52,0x18] + +[0x0c,0x40,0x50,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: 
[0x0c,0x40,0x50,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x01,0x50,0x1c] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x02,0x18] + +[0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x50,0xcc,0x00,0x09,0x06,0x18] + + + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0xc0,0x51,0xcc,0x00,0x09,0x52,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x60,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x00,0x51,0xcc,0x00,0x09,0x52,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 index_key:1 ; encoding: [0x0c,0x48,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x50,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x58,0x51,0xcc,0x00,0x09,0x52,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x3c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x5c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], v20 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x51,0xcc,0x00,0x09,0x52,0x1c] + +[0x0c,0x44,0x51,0xcc,0x00,0x09,0x52,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] # sgpr src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c] + +[0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c] # sgpr src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v20 ; encoding: 
[0x0c,0x40,0x51,0xcc,0x00,0x09,0x50,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18] # sgpr src2
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], s20/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x52,0x18]
+
+[0x0c,0x40,0x51,0xcc,0xf2,0x08,0x52,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], 1.0/*Invalid immediate*/, v[4:11], v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x08,0x52,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0xe5,0x51,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], 1.0/*Invalid immediate*/, v20 ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x01,0x50,0x1c]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_bf16 v[12:19], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x51,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0xc0,0x52,0xcc,0x00,0x09,0x42,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x52,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:1
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x52,0xcc,0x00,0x09,0x42,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x3c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x5c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x52,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x44,0x52,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x40,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18] # sgpr src2
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x42,0x18]
+
+[0x0c,0x40,0x52,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x01,0x40,0x1c]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f16_16x16x32_f16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x52,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0xc0,0x53,0xcc,0x00,0x09,0x42,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x60,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x00,0x53,0xcc,0x00,0x09,0x42,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:1
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 index_key:1 ; encoding: [0x0c,0x48,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x50,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x58,0x53,0xcc,0x00,0x09,0x42,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[1,0,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x3c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_lo:[0,1,0] ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x5c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[1,0,0] ; encoding: [0x0c,0x41,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], v16 neg_hi:[0,1,0] ; encoding: [0x0c,0x42,0x53,0xcc,0x00,0x09,0x42,0x1c]
+
+[0x0c,0x44,0x53,0xcc,0x00,0x09,0x42,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c] # sgpr src0
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c] # sgpr src1
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], s[4:11]/*Invalid register, operand has 'VReg_256' register class*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x40,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18] # sgpr src2
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], s16/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x42,0x18]
+
+[0x0c,0x40,0x53,0xcc,0xf2,0x08,0x42,0x1c] # 1.0 src0
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], 1.0/*Invalid immediate*/, v[4:11], v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x08,0x42,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0xe5,0x41,0x1c] # 1.0 src1
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], 1.0/*Invalid immediate*/, v16 ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x01,0x40,0x1c]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1.0/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x02,0x18]
+
+[0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_bf16_16x16x32_bf16 v[12:15], v[0:3], v[4:11], 1/*Invalid immediate*/ ; encoding: [0x0c,0x40,0x53,0xcc,0x00,0x09,0x06,0x18]
+
+
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x54,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x54,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x54,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x3c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x5c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x54,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x54,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x54,0xcc,0x01,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x03,0x39,0x1c] # 1 src1
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x03,0x38,0x1c]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x06,0x18]
+
+[0x06,0x40,0x54,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12:v_swmmac_i32_16x16x32_iu8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x54,0xcc,0x00,0x05,0x02,0x18]
+
+
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c] # clamp
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 clamp ; encoding: [0x03,0xc0,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x60,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x00,0x55,0xcc,0x00,0x03,0x2e,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:1
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 index_key:1 ; encoding: [0x03,0x48,0x55,0xcc,0x00,0x03,0x2e,0x1c]
+
+[0x03,0x50,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x58,0x55,0xcc,0x00,0x03,0x2e,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x3c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], v11 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x5c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x55,0xcc,0x00,0x03,0x2e,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c] # sgpr src0
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x02,0x2e,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c] # sgpr src1
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x01,0x2c,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18] # sgpr src2
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], s11/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2e,0x18]
+
+[0x03,0x40,0x55,0xcc,0x81,0x02,0x2e,0x1c] # 1 src0
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], 1/*Invalid immediate*/, v[1:2], v11 ; encoding: [0x03,0x40,0x55,0xcc,0x01,0x02,0x2e,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x2d,0x1c] # 1 src1
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, 1/*Invalid immediate*/, v11 ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x2c,0x1c]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x06,0x18]
+
+[0x03,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_i32_16x16x32_iu4 v[3:10], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x55,0xcc,0x00,0x03,0x02,0x18]
+
+
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 clamp ; encoding: [0x06,0xc0,0x56,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x56,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x50,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x56,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x3c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], v14 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x5c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x56,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x56,0xcc,0x81,0x04,0x3a,0x1c] # 1 src0
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], 1/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x56,0xcc,0x01,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x03,0x39,0x1c] # 1 src1
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], 1/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x03,0x38,0x1c]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x06,0x18]
+
+[0x06,0x40,0x56,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_i32_16x16x64_iu4 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x56,0xcc,0x00,0x05,0x02,0x18]
+
+
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x57,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x57,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x57,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x57,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x57,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x57,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x57,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x58,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x58,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x58,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x58,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x58,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x58,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x58,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x59,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x59,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x59,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x59,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x59,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x59,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x59,0xcc,0x00,0x05,0x06,0x18]
+
+
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0xc0,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[1,0,0]
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x60,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x00,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], v14 index_key:1 ; encoding: [0x06,0x48,0x5a,0xcc,0x00,0x05,0x3a,0x1c]
+
+[0x06,0x50,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x58,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x41,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x42,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x44,0x5a,0xcc,0x00,0x05,0x3a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c] # sgpr src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x38,0x1c] # sgpr src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18] # sgpr src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], s14/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x3a,0x18]
+
+[0x06,0x40,0x5a,0xcc,0xf2,0x04,0x3a,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], 1.0/*Invalid immediate*/, v[2:5], v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x04,0x3a,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0xe5,0x39,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], 1.0/*Invalid immediate*/, v14 ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x01,0x38,0x1c]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x02,0x18]
+
+[0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[6:13], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x5a,0xcc,0x00,0x05,0x06,0x18]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt
new file mode 100644
index 0000000000000..61700faa8e607
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_wmma_w64.txt
@@ -0,0 +1,1628 @@
+# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s | FileCheck --check-prefix=GFX12 %s
+# RUN: not llvm-mc -disassemble -arch=amdgcn -mcpu=gfx1200 -mattr=-wavefrontsize32,+wavefrontsize64 -show-encoding %s 2>&1 | FileCheck --implicit-check-not=warning: --check-prefix=GFX12-ERR %s
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x40,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x40,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x40,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x40,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x40,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_f16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x40,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x41,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x41,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x41,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], v[4:7] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x41,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x41,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], 1.0/*Invalid immediate*/, v[2:3], v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], 1.0/*Invalid immediate*/, v[4:7] ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_bf16 v[4:7], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x41,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x42,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x42,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x42,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x42,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x42,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f16_16x16x16_f16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x42,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0xc0,0x43,0xcc,0x00,0x05,0x12,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x60,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x00,0x43,0xcc,0x00,0x05,0x12,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x48,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x50,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x58,0x43,0xcc,0x00,0x05,0x12,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[1,0,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x3c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,1,0] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x5c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_lo:[0,0,1] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x12,0x9c]
+
+[0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[1,0,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[1,0,0] ; encoding: [0x04,0x41,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,1,0]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,1,0] ; encoding: [0x04,0x42,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], v[4:5] neg_hi:[0,0,1] ; encoding: [0x04,0x44,0x43,0xcc,0x00,0x05,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/ ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x02,0x18]
+
+[0x04,0x40,0x43,0xcc,0xf2,0x04,0x12,0x1c] # 1.0 src0
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], 1.0/*Invalid immediate*/, v[2:3], v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x04,0x12,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0xe5,0x11,0x1c] # 1.0 src1
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], 1.0/*Invalid immediate*/, v[4:5] ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x01,0x10,0x1c]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1.0 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0xca,0x1b]
+
+[0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_bf16_16x16x16_bf16 v[4:5], v[0:1], v[2:3], 1 ; encoding: [0x04,0x40,0x43,0xcc,0x00,0x05,0x06,0x1a]
+
+
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x44,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x44,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x44,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x44,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x44,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_i32_16x16x16_iu8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x44,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x45,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x45,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x45,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x41,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x45,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x45,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x01,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x09,0x1c] # 1 src1
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x08,0x1c]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0x06,0x1a]
+
+[0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_i32_16x16x16_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x45,0xcc,0x00,0x03,0xca,0x1b]
+
+
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x46,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x46,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x46,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x46,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x46,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x46,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x47,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x47,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x47,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x47,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x47,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_fp8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x47,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x48,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x48,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x48,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x48,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x48,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_fp8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x48,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x49,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x49,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x49,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_lo:[0,0,1] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x0a,0x9c]
+
+[0x02,0x41,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x42,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1]
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, v[2:5] neg_hi:[0,0,1] ; encoding: [0x02,0x44,0x49,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x02,0x18]
+
+[0x02,0x40,0x49,0xcc,0xf2,0x02,0x0a,0x1c] # 1.0 src0
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], 1.0/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x02,0x0a,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0xe5,0x09,0x1c] # 1.0 src1
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, 1.0/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x01,0x08,0x1c]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0xca,0x1b]
+
+[0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_wmma_f32_16x16x16_bf8_bf8 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x49,0xcc,0x00,0x03,0x06,0x1a]
+
+
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # clamp
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] clamp ; encoding: [0x02,0xc0,0x4a,0xcc,0x00,0x03,0x0a,0x1c]
+
+[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x60,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x14] # op_sel_hi:[0,1,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x0c] # op_sel_hi:[1,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x00,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # op_sel_hi:[1,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x48,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:1
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x50,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:2
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x58,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # index_key:3
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c] # neg_lo:[1,0,0]
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x3c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c] # neg_lo:[0,1,0]
+# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, v[2:5] neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x5c]
+
+[0x02,0x40,0x4a,0xcc,0x00,0x03,0x0a,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR:
warning: invalid instruction encoding + +[0x02,0x41,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x4a,0xcc,0x00,0x03,0x0a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] # sgpr_0 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] # sgpr_0 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x01,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, s[0:3]/*Invalid register, operand has 'VReg_128' register class*/ ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x4a,0xcc,0x81,0x02,0x0a,0x1c] # 1 src0 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x01,0x02,0x0a,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x09,0x1c] # 1 src1 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v[2:5] ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x08,0x1c] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0x06,0x1a] + +[0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_wmma_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0 ; encoding: [0x02,0x40,0x4a,0xcc,0x00,0x03,0xca,0x1b] + + + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x50,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x50,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], 
v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x50,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x50,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_f16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x50,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0xc0,0x51,0xcc,0x00,0x05,0x2a,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x60,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x51,0xcc,0x00,0x05,0x2a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] # 
index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:1 ; encoding: [0x06,0x48,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:2 ; encoding: [0x06,0x50,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 index_key:3 ; encoding: [0x06,0x58,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x3c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x5c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x2a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], v10 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x51,0xcc,0x00,0x05,0x2a,0x1c] + +[0x06,0x44,0x51,0xcc,0x00,0x05,0x2a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0xf2,0x04,0x2a,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], 1.0/*Invalid immediate*/, v[2:5], v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x04,0x2a,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0xe5,0x29,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], 1.0/*Invalid immediate*/, v10 ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x01,0x28,0x1c] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf16 v[6:9], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x51,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x52,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX12: 
v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x52,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x52,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x44,0x52,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f16_16x16x32_f16 v[6:7], v[0:1], v[2:5], 
1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x52,0xcc,0x00,0x05,0x06,0x18] + + + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0xc0,0x53,0xcc,0x00,0x05,0x22,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x60,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x00,0x53,0xcc,0x00,0x05,0x22,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:1 ; encoding: [0x06,0x48,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:2 ; encoding: [0x06,0x50,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] # index_key:3 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 index_key:3 ; encoding: [0x06,0x58,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[1,0,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x3c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_lo:[0,1,0] ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x5c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x22,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[1,0,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[1,0,0] ; encoding: [0x06,0x41,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,1,0] +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], v8 neg_hi:[0,1,0] ; encoding: [0x06,0x42,0x53,0xcc,0x00,0x05,0x22,0x1c] + +[0x06,0x44,0x53,0xcc,0x00,0x05,0x22,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], s[0:3]/*Invalid register, operand has 'VReg_128' register class*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0xf2,0x04,0x22,0x1c] # 1.0 src0 +# 
GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], 1.0/*Invalid immediate*/, v[2:5], v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x04,0x22,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0xe5,0x21,0x1c] # 1.0 src1 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], 1.0/*Invalid immediate*/, v8 ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x01,0x20,0x1c] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1.0/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x02,0x18] + +[0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_bf16_16x16x32_bf16 v[6:7], v[0:1], v[2:5], 1/*Invalid immediate*/ ; encoding: [0x06,0x40,0x53,0xcc,0x00,0x05,0x06,0x18] + + + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x54,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x54,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x54,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] # 
sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x54,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x54,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x54,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x54,0xcc,0x00,0x03,0x02,0x18] + + + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 clamp ; encoding: [0x02,0xc0,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x60,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x00,0x55,0xcc,0x00,0x03,0x1a,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 index_key:1 ; encoding: [0x02,0x48,0x55,0xcc,0x00,0x03,0x1a,0x1c] + +[0x02,0x50,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x58,0x55,0xcc,0x00,0x03,0x1a,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[1,0,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x3c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, v6 neg_lo:[0,1,0] ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x5c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x1a,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x41,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x42,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x44,0x55,0xcc,0x00,0x03,0x1a,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] # sgpr_0 src0 +# GFX12: 
v_swmmac_i32_16x16x32_iu4 v[2:5], s0/*Invalid register, operand has 'VGPR_32' register class*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, s0/*Invalid register, operand has 'VGPR_32' register class*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x01,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + +[0x02,0x40,0x55,0xcc,0x81,0x02,0x1a,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], 1/*Invalid immediate*/, v1, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x01,0x02,0x1a,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x19,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, 1/*Invalid immediate*/, v6 ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x18,0x1c] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x06,0x18] + +[0x02,0x40,0x55,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x32_iu4 v[2:5], v0, v1, 1.0/*Invalid immediate*/ ; encoding: [0x02,0x40,0x55,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 clamp ; encoding: [0x03,0xc0,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x60,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x56,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x56,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x58,0x56,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[1,0,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x3c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], v7 neg_lo:[0,1,0] ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x5c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid 
instruction encoding + +[0x03,0x44,0x56,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x56,0xcc,0x81,0x02,0x1e,0x1c] # 1 src0 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], 1/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x56,0xcc,0x01,0x02,0x1e,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x1d,0x1c] # 1 src1 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, 1/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x1c,0x1c] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x06,0x18] + +[0x03,0x40,0x56,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_i32_16x16x64_iu4 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x56,0xcc,0x00,0x03,0x02,0x18] + + + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x57,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x57,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x57,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# 
GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x57,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x57,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x58,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x58,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x58,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x58,0xcc,0x00,0x03,0x1e,0x1c] + 
+[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x58,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_fp8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x58,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x59,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x59,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2 +# GFX12: 
v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x59,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x41,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x42,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x44,0x59,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x02,0x1e,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x01,0x1c,0x1c] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x02,0x18] + +[0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x1a] # 1 src2 +# GFX12: v_swmmac_f32_16x16x32_bf8_fp8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x59,0xcc,0x00,0x03,0x06,0x18] + + + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0xc0,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # clamp +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[1,0,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,1,0] +# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] + +[0x03,0x60,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel:[0,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x14] # op_sel_hi:[0,1,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x0c] # op_sel_hi:[1,0,1] +# GFX12-ERR: warning: invalid instruction encoding + +[0x03,0x00,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # op_sel_hi:[1,1,0] +# GFX12-ERR: warning: 
invalid instruction encoding
+
+[0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:1 ; encoding: [0x03,0x48,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:2 ; encoding: [0x03,0x50,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # index_key:3
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], v7 index_key:3 ; encoding: [0x03,0x58,0x5a,0xcc,0x00,0x03,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x3c] # neg_lo:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x5c] # neg_lo:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x1e,0x9c] # neg_lo:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x41,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[1,0,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x42,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,1,0]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x44,0x5a,0xcc,0x00,0x03,0x1e,0x1c] # neg_hi:[0,0,1]
+# GFX12-ERR: warning: invalid instruction encoding
+
+[0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c] # sgpr_0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], s0/*Invalid register, operand has 'VGPR_32' register class*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c] # sgpr_0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, s[0:1]/*Invalid register, operand has 'VReg_64' register class*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18] # sgpr_0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], s0/*Invalid register, operand has 'VGPR_32' register class*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x5a,0xcc,0xf2,0x02,0x1e,0x1c] # 1.0 src0
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], 1.0/*Invalid immediate*/, v[1:2], v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x02,0x1e,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0xe5,0x1d,0x1c] # 1.0 src1
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, 1.0/*Invalid immediate*/, v7 ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x01,0x1c,0x1c]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0xca,0x1b] # 1.0 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1.0/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x02,0x18]
+
+[0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x1a] # 1 src2
+# GFX12: v_swmmac_f32_16x16x32_bf8_bf8 v[3:6], v0, v[1:2], 1/*Invalid immediate*/ ; encoding: [0x03,0x40,0x5a,0xcc,0x00,0x03,0x06,0x18]
diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
index 516a984399ff8..638e46a2f9c75 100644
--- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
+++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td
@@ -253,22 +253,23 @@ def ROCDL_mfma_f32_32x32x16_fp8_fp8 : ROCDL_Mfma_IntrOp<"mfma.f32.32x32x16.fp8.f
 //===---------------------------------------------------------------------===//
 // WMMA intrinsics
-class ROCDL_Wmma_IntrOp<string mnemonic, list<Trait> traits = []> :
+class ROCDL_Wmma_IntrOp<string mnemonic, list<int> overloadedOperands,
+                        list<Trait> traits = []> :
   LLVM_IntrOpBase<ROCDL_Dialect, mnemonic,
                   "amdgcn_" # !subst(".", "_", mnemonic),
-                  [0], [0], traits, 1>,
+                  [0], overloadedOperands, traits, 1>,
   Arguments<(ins Variadic<LLVM_Type>:$args)> {
   let assemblyFormat = "$args attr-dict `:` functional-type($args, $res)";
 }
 
 // Available on RDNA3
-def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16">;
-def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16">;
-def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16">;
-def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16">;
-def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8">;
-def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4">;
+def ROCDL_wmma_f32_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.f16", [0]>;
+def ROCDL_wmma_f32_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.f32.16x16x16.bf16", [0]>;
+def ROCDL_wmma_f16_16x16x16_f16 : ROCDL_Wmma_IntrOp<"wmma.f16.16x16x16.f16", [0]>;
+def ROCDL_wmma_bf16_16x16x16_bf16 : ROCDL_Wmma_IntrOp<"wmma.bf16.16x16x16.bf16", [0]>;
+def ROCDL_wmma_i32_16x16x16_iu8 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu8", [1]>;
+def ROCDL_wmma_i32_16x16x16_iu4 : ROCDL_Wmma_IntrOp<"wmma.i32.16x16x16.iu4", [1]>;
 
 //===---------------------------------------------------------------------===//
 // Operations on raw buffer resources (stride of 0, bounds checks either off or in
diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir
index 3c9c70711ae23..26123300d7488 100644
--- a/mlir/test/Target/LLVMIR/rocdl.mlir
+++ b/mlir/test/Target/LLVMIR/rocdl.mlir
@@ -248,53 +248,53 @@ llvm.func @rocdl.wmma(%arg0 : vector<8xf32>, %arg1 : vector<16 x f16>, %arg2 : v
   // ---- Wave32 -----

   // f16 -> f32
-  // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x float> %{{.*}})
   %r0 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg0 : (vector<16xf16>, vector<16xf16>, vector<8xf32>) -> vector<8xf32>

   // bf16 -> f32
-  // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}})
+  // CHECK: call <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v8f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x float> %{{.*}})
   %r1 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg0 : (vector<16xi16>, vector<16xi16>, vector<8xf32>) -> vector<8xf32>

   // f16 -> f16 (OPSEL = {0,1})
-  // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}})
+  // CHECK: call <16 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v16f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <16 x half> %{{.*}}, i1 {{.*}})
   %r2 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg1, %zero : (vector<16xf16>, vector<16xf16>, vector<16xf16>, i1) -> vector<16xf16>

   // bf16 -> bf16 (OPSEL = {0,1})
-  // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}})
+  // CHECK: call <16 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v16i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <16 x i16> %{{.*}}, i1 {{.*}})
   %r4 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg2, %zero : (vector<16xi16>, vector<16xi16>, vector<16xi16>, i1) -> vector<16xi16>

   // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
-  // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
+  // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
   %r5 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg3, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<8xi32>, i1) -> vector<8xi32>

   // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
-  // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
+  // CHECK: call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v8i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <8 x i32> %{{.*}}, i1 {{.*}})
   %r6 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg3, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<8xi32>, i1) -> vector<8xi32>

   // ---- Wave64 -----

   // f16 -> f32
-  // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <4 x float> %{{.*}})
   %r7 = rocdl.wmma.f32.16x16x16.f16 %arg1, %arg1, %arg6 : (vector<16xf16>, vector<16xf16>, vector<4xf32>) -> vector<4xf32>

   // bf16 -> f32
-  // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}})
+  // CHECK: call <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.bf16.v4f32.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <4 x float> %{{.*}})
   %r8 = rocdl.wmma.f32.16x16x16.bf16 %arg2, %arg2, %arg6 : (vector<16xi16>, vector<16xi16>, vector<4xf32>) -> vector<4xf32>

   // f16 -> f16 (OPSEL = {0,1})
-  // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}})
+  // CHECK: call <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v16f16(<16 x half> %{{.*}}, <16 x half> %{{.*}}, <8 x half> %{{.*}}, i1 {{.*}})
   %r9 = rocdl.wmma.f16.16x16x16.f16 %arg1, %arg1, %arg7, %zero : (vector<16xf16>, vector<16xf16>, vector<8xf16>, i1) -> vector<8xf16>

   // bf16 -> bf16 (OPSEL = {0,1})
-  // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}})
+  // CHECK: call <8 x i16> @llvm.amdgcn.wmma.bf16.16x16x16.bf16.v8i16.v16i16(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}, <8 x i16> %{{.*}}, i1 {{.*}})
   %r11 = rocdl.wmma.bf16.16x16x16.bf16 %arg2, %arg2, %arg8, %zero : (vector<16xi16>, vector<16xi16>, vector<8xi16>, i1) -> vector<8xi16>

   // int8 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
-  // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
+  // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu8.v4i32.v4i32(i1 {{.*}}, <4 x i32> %{{.*}}, i1 {{.*}}, <4 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
   %r12 = rocdl.wmma.i32.16x16x16.iu8 %zero, %arg5, %zero, %arg5, %arg5, %zero : (i1, vector<4xi32>, i1, vector<4xi32>, vector<4xi32>, i1) -> vector<4xi32>

   // int4 -> int32 (signA = {0,1}, signB = {0,1}, clamp = {0,1})
-  // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
+  // CHECK: call <4 x i32> @llvm.amdgcn.wmma.i32.16x16x16.iu4.v4i32.v2i32(i1 {{.*}}, <2 x i32> %{{.*}}, i1 {{.*}}, <2 x i32> %{{.*}}, <4 x i32> %{{.*}}, i1 {{.*}})
   %r13 = rocdl.wmma.i32.16x16x16.iu4 %zero, %arg4, %zero, %arg4, %arg5, %zero : (i1, vector<2xi32>, i1, vector<2xi32>, vector<4xi32>, i1) -> vector<4xi32>

   llvm.return %r0 : vector<8xf32>
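
Note: the ROCDLOps.td change above lets each WMMA op list which of its operands, besides result 0, resolve the LLVM intrinsic's overloaded types; the new ".v8i32.v4i32"-style suffixes in the rocdl.mlir checks are exactly that mangling. The following is a minimal C++ sketch of the same resolution on the LLVM side; the helper name and the wave32 vector widths are illustrative assumptions, not part of the patch.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/Module.h"

using namespace llvm;

// For wmma.i32.16x16x16.iu8 the op sets overloadedOperands = [1]: the
// intrinsic declaration is overloaded on result 0 (the accumulator) and on
// operand 1 (the A-matrix vector), so the wave32 form mangles to
// @llvm.amdgcn.wmma.i32.16x16x16.iu8.v8i32.v4i32, matching the CHECK lines
// above. The helper name is hypothetical.
Function *getWmmaIU8Decl(Module &M) {
  LLVMContext &Ctx = M.getContext();
  Type *I32 = Type::getInt32Ty(Ctx);
  Type *AccTy = FixedVectorType::get(I32, 8); // result overload (wave32)
  Type *ATy = FixedVectorType::get(I32, 4);   // operand-1 overload
  return Intrinsic::getDeclaration(
      &M, Intrinsic::amdgcn_wmma_i32_16x16x16_iu8, {AccTy, ATy});
}

The f16/bf16 variants pass [0] because their first operand is already the A-matrix vector, while the iu8/iu4 variants overload operand 1, since operand 0 is the leading i1 sign flag visible in their CHECK lines.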
vector<8xf32> From 89dae798cc77789a43e9a60173f647dae03a65fe Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 24 Jan 2024 12:33:57 +0100 Subject: [PATCH 780/843] [Loads] Use BatchAAResults for available value APIs (NFCI) This allows caching AA queries both within and across the calls, and enables us to use a custom AAQI configuration. --- llvm/include/llvm/Analysis/Loads.h | 12 ++++++------ llvm/lib/Analysis/Lint.cpp | 3 ++- llvm/lib/Analysis/Loads.cpp | 9 ++++----- .../InstCombine/InstCombineLoadStoreAlloca.cpp | 3 ++- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 11 ++++++----- 5 files changed, 20 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/Analysis/Loads.h b/llvm/include/llvm/Analysis/Loads.h index 2880ed33a34cb..0926093bba99d 100644 --- a/llvm/include/llvm/Analysis/Loads.h +++ b/llvm/include/llvm/Analysis/Loads.h @@ -18,7 +18,7 @@ namespace llvm { -class AAResults; +class BatchAAResults; class AssumptionCache; class DataLayout; class DominatorTree; @@ -129,11 +129,10 @@ extern cl::opt DefMaxInstsToScan; /// location in memory, as opposed to the value operand of a store. /// /// \returns The found value, or nullptr if no value is found. -Value *FindAvailableLoadedValue(LoadInst *Load, - BasicBlock *ScanBB, +Value *FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan = DefMaxInstsToScan, - AAResults *AA = nullptr, + BatchAAResults *AA = nullptr, bool *IsLoadCSE = nullptr, unsigned *NumScanedInst = nullptr); @@ -141,7 +140,8 @@ Value *FindAvailableLoadedValue(LoadInst *Load, /// FindAvailableLoadedValue() for the case where we are not interested in /// finding the closest clobbering instruction if no available load is found. /// This overload cannot be used to scan across multiple blocks. 
-Value *FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, bool *IsLoadCSE, +Value *FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA, + bool *IsLoadCSE, unsigned MaxInstsToScan = DefMaxInstsToScan); /// Scan backwards to see if we have the value of the given pointer available @@ -170,7 +170,7 @@ Value *FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, bool *IsLoadCSE, Value *findAvailablePtrLoadStore(const MemoryLocation &Loc, Type *AccessTy, bool AtLeastAtomic, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, - unsigned MaxInstsToScan, AAResults *AA, + unsigned MaxInstsToScan, BatchAAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst); /// Returns true if a pointer value \p A can be replace with another pointer diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp index 1ebc593016bc0..16635097d20af 100644 --- a/llvm/lib/Analysis/Lint.cpp +++ b/llvm/lib/Analysis/Lint.cpp @@ -657,11 +657,12 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk, BasicBlock::iterator BBI = L->getIterator(); BasicBlock *BB = L->getParent(); SmallPtrSet VisitedBlocks; + BatchAAResults BatchAA(*AA); for (;;) { if (!VisitedBlocks.insert(BB).second) break; if (Value *U = - FindAvailableLoadedValue(L, BB, BBI, DefMaxInstsToScan, AA)) + FindAvailableLoadedValue(L, BB, BBI, DefMaxInstsToScan, &BatchAA)) return findValueImpl(U, OffsetOk, Visited); if (BBI != BB->begin()) break; diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index 97d21db86abf2..6bf0d2f56eb4e 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -450,11 +450,10 @@ llvm::DefMaxInstsToScan("available-load-scan-limit", cl::init(6), cl::Hidden, "to scan backward from a given instruction, when searching for " "available loaded value")); -Value *llvm::FindAvailableLoadedValue(LoadInst *Load, - BasicBlock *ScanBB, +Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan, - AAResults *AA, bool *IsLoad, + BatchAAResults *AA, bool *IsLoad, unsigned *NumScanedInst) { // Don't CSE load that is volatile or anything stronger than unordered. if (!Load->isUnordered()) @@ -583,7 +582,7 @@ static Value *getAvailableLoadStore(Instruction *Inst, const Value *Ptr, Value *llvm::findAvailablePtrLoadStore( const MemoryLocation &Loc, Type *AccessTy, bool AtLeastAtomic, BasicBlock *ScanBB, BasicBlock::iterator &ScanFrom, unsigned MaxInstsToScan, - AAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst) { + BatchAAResults *AA, bool *IsLoadCSE, unsigned *NumScanedInst) { if (MaxInstsToScan == 0) MaxInstsToScan = ~0U; @@ -664,7 +663,7 @@ Value *llvm::findAvailablePtrLoadStore( return nullptr; } -Value *llvm::FindAvailableLoadedValue(LoadInst *Load, AAResults &AA, +Value *llvm::FindAvailableLoadedValue(LoadInst *Load, BatchAAResults &AA, bool *IsLoadCSE, unsigned MaxInstsToScan) { const DataLayout &DL = Load->getModule()->getDataLayout(); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index bb2a77daa60a7..1254a050027a4 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -1032,7 +1032,8 @@ Instruction *InstCombinerImpl::visitLoadInst(LoadInst &LI) { // where there are several consecutive memory accesses to the same location, // separated by a few arithmetic operations. 
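// Illustrative sketch only (not part of the original file or of this diff):
// the call-site pattern this patch applies everywhere is to wrap the pass's
// AAResults in a stack-local BatchAAResults, so repeated alias queries made
// during a single scan are answered from a cache:
//
//   BatchAAResults BatchAA(*AA);
//   if (Value *V = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) {
//     // ... reuse V instead of re-loading ...
//   }
//
// The cache assumes the underlying IR is not modified between queries,
// which holds for these read-only scans.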
bool IsLoadCSE = false; - if (Value *AvailableVal = FindAvailableLoadedValue(&LI, *AA, &IsLoadCSE)) { + BatchAAResults BatchAA(*AA); + if (Value *AvailableVal = FindAvailableLoadedValue(&LI, BatchAA, &IsLoadCSE)) { if (IsLoadCSE) combineMetadataForCSE(cast(AvailableVal), &LI, false); diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index c58df063d8c53..b7cf024896315 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -1260,8 +1260,9 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { // the entry to its block. BasicBlock::iterator BBIt(LoadI); bool IsLoadCSE; + BatchAAResults BatchAA(*AA); if (Value *AvailableVal = FindAvailableLoadedValue( - LoadI, LoadBB, BBIt, DefMaxInstsToScan, AA, &IsLoadCSE)) { + LoadI, LoadBB, BBIt, DefMaxInstsToScan, &BatchAA, &IsLoadCSE)) { // If the value of the load is locally available within the block, just use // it. This frequently occurs for reg2mem'd allocas. @@ -1322,9 +1323,9 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { MemoryLocation Loc(LoadedPtr->DoPHITranslation(LoadBB, PredBB), LocationSize::precise(DL.getTypeStoreSize(AccessTy)), AATags); - PredAvailable = findAvailablePtrLoadStore(Loc, AccessTy, LoadI->isAtomic(), - PredBB, BBIt, DefMaxInstsToScan, - AA, &IsLoadCSE, &NumScanedInst); + PredAvailable = findAvailablePtrLoadStore( + Loc, AccessTy, LoadI->isAtomic(), PredBB, BBIt, DefMaxInstsToScan, + &BatchAA, &IsLoadCSE, &NumScanedInst); // If PredBB has a single predecessor, continue scanning through the // single predecessor. @@ -1336,7 +1337,7 @@ bool JumpThreadingPass::simplifyPartiallyRedundantLoad(LoadInst *LoadI) { BBIt = SinglePredBB->end(); PredAvailable = findAvailablePtrLoadStore( Loc, AccessTy, LoadI->isAtomic(), SinglePredBB, BBIt, - (DefMaxInstsToScan - NumScanedInst), AA, &IsLoadCSE, + (DefMaxInstsToScan - NumScanedInst), &BatchAA, &IsLoadCSE, &NumScanedInst); } } From d1b473c7956c080fe4d784bb89f720fbd28024a6 Mon Sep 17 00:00:00 2001 From: Danial Klimkin Date: Wed, 24 Jan 2024 14:06:14 +0100 Subject: [PATCH 781/843] Fix bazel build past 7251243315ef66f9b3f32e6f8e9536f701aa0d0a (#79282) Fix bazel build past 7251243315ef66f9b3f32e6f8e9536f701aa0d0a --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index b22ec01376be6..00d3c9733b8fd 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -2577,7 +2577,10 @@ cc_library( "lib/Passes/*.cpp", "lib/Passes/*.h", ]), - hdrs = glob(["include/llvm/Passes/*.h"]) + ["include/llvm-c/Transforms/PassBuilder.h"], + hdrs = glob([ + "include/llvm/Passes/*.h", + "include/llvm/Passes/*.def", + ]) + ["include/llvm-c/Transforms/PassBuilder.h"], copts = llvm_copts, deps = [ ":AggressiveInstCombine", @@ -2591,6 +2594,7 @@ cc_library( ":IRPrinter", ":InstCombine", ":Instrumentation", + ":MC", ":MLPolicies", ":ObjCARC", ":Scalar", From 5469010ba73701d47498576a433aea5c6e16ba2c Mon Sep 17 00:00:00 2001 From: ostannard Date: Wed, 24 Jan 2024 13:12:03 +0000 Subject: [PATCH 782/843] [AArch64] FP/SIMD is not mandatory for v8-R (#79004) The FP/SIMD instructions are optional for v8-R, so they should not be marked as a dependency of HasV8_0rOps. 
This had the effect of disabling some v8R-specific system registers when any of these features was disabled. I've moved these features to be enabled by default for Cortex-R82 (currently the only v8-R AArch64 core), matching the previous behavior, and clang's default. Based on a patch by Simi Pallipurath --- llvm/lib/Target/AArch64/AArch64.td | 17 +++++++++-------- llvm/test/MC/AArch64/armv8.1a-rdma.s | 2 +- llvm/test/MC/AArch64/armv8.2a-dotprod-errors.s | 2 +- llvm/test/MC/AArch64/armv8.2a-dotprod.s | 4 ++-- llvm/test/MC/AArch64/armv8r-sysreg.s | 3 +++ .../Disassembler/AArch64/armv8.3a-complex.txt | 2 +- .../MC/Disassembler/AArch64/armv8.3a-js.txt | 2 +- 7 files changed, 18 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td index 36700f73df4b2..02fb01caf7e80 100644 --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -81,7 +81,8 @@ def FeatureFMV : SubtargetFeature<"fmv", "HasFMV", "true", "Enable Function Multi Versioning support.">; def FeatureRDM : SubtargetFeature<"rdm", "HasRDM", "true", - "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions (FEAT_RDM)">; + "Enable ARMv8.1 Rounding Double Multiply Add/Subtract instructions (FEAT_RDM)", + [FeatureNEON]>; def FeaturePAN : SubtargetFeature< "pan", "HasPAN", "true", @@ -321,7 +322,7 @@ def FeatureUseRSqrt : SubtargetFeature< def FeatureDotProd : SubtargetFeature< "dotprod", "HasDotProd", "true", - "Enable dot product support (FEAT_DotProd)">; + "Enable dot product support (FEAT_DotProd)", [FeatureNEON]>; def FeaturePAuth : SubtargetFeature< "pauth", "HasPAuth", "true", @@ -708,15 +709,14 @@ def HasV9_5aOps : SubtargetFeature< def HasV8_0rOps : SubtargetFeature< "v8r", "HasV8_0rOps", "true", "Support ARM v8r instructions", [//v8.1 - FeatureCRC, FeaturePAN, FeatureRDM, FeatureLSE, FeatureCONTEXTIDREL2, + FeatureCRC, FeaturePAN, FeatureLSE, FeatureCONTEXTIDREL2, //v8.2 FeatureRAS, FeaturePsUAO, FeatureCCPP, FeaturePAN_RWV, //v8.3 - FeatureComplxNum, FeatureCCIDX, FeatureJS, - FeaturePAuth, FeatureRCPC, + FeatureCCIDX, FeaturePAuth, FeatureRCPC, //v8.4 - FeatureDotProd, FeatureTRACEV8_4, FeatureTLB_RMI, - FeatureFlagM, FeatureDIT, FeatureSEL2, FeatureRCPC_IMMO, + FeatureTRACEV8_4, FeatureTLB_RMI, FeatureFlagM, FeatureDIT, FeatureSEL2, + FeatureRCPC_IMMO, // Not mandatory in v8.0-R, but included here on the grounds that it // only enables names of system registers FeatureSpecRestrict @@ -1420,7 +1420,8 @@ def ProcessorFeatures { FeaturePerfMon, FeatureSPE, FeatureSPE_EEF]; list R82 = [HasV8_0rOps, FeaturePerfMon, FeatureFullFP16, FeatureFP16FML, FeatureSSBS, FeaturePredRes, - FeatureSB]; + FeatureSB, FeatureRDM, FeatureDotProd, + FeatureComplxNum, FeatureJS]; list X1 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeatureRCPC, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureDotProd, diff --git a/llvm/test/MC/AArch64/armv8.1a-rdma.s b/llvm/test/MC/AArch64/armv8.1a-rdma.s index 907e4149454ae..6e9e97a528fb9 100644 --- a/llvm/test/MC/AArch64/armv8.1a-rdma.s +++ b/llvm/test/MC/AArch64/armv8.1a-rdma.s @@ -1,6 +1,6 @@ // RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a -show-encoding < %s 2> %t | FileCheck %s // RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s -// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8r -show-encoding < %s 2> %t | FileCheck %s +// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8r,+rdm -show-encoding < %s 2> %t | FileCheck %s // RUN: FileCheck 
--check-prefix=CHECK-ERROR < %t %s .text diff --git a/llvm/test/MC/AArch64/armv8.2a-dotprod-errors.s b/llvm/test/MC/AArch64/armv8.2a-dotprod-errors.s index 3c2fca56dd6b8..b3736a281122a 100644 --- a/llvm/test/MC/AArch64/armv8.2a-dotprod-errors.s +++ b/llvm/test/MC/AArch64/armv8.2a-dotprod-errors.s @@ -1,6 +1,6 @@ // RUN: not llvm-mc -triple aarch64 -mattr=+dotprod -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s -// RUN: not llvm-mc -triple aarch64 -mattr=+v8r -show-encoding < %s 2> %t +// RUN: not llvm-mc -triple aarch64 -mattr=+v8r,+dotprod -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s udot v0.2s, v1.8b, v2.4b[4] diff --git a/llvm/test/MC/AArch64/armv8.2a-dotprod.s b/llvm/test/MC/AArch64/armv8.2a-dotprod.s index 9c4a6cad7e07a..a49ed14d8677a 100644 --- a/llvm/test/MC/AArch64/armv8.2a-dotprod.s +++ b/llvm/test/MC/AArch64/armv8.2a-dotprod.s @@ -12,13 +12,13 @@ // RUN: llvm-mc -triple aarch64 -mcpu=neoverse-n2 -show-encoding < %s| FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=tsv110 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=cortex-r82 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD -// RUN: llvm-mc -triple aarch64 -mattr=+v8r -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD +// RUN: llvm-mc -triple aarch64 -mattr=+v8r,+dotprod -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=ampere1 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=ampere1a -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: not llvm-mc -triple aarch64 -mattr=+v8.2a -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s -// RUN: not llvm-mc -triple aarch64 -mattr=+v8r,-dotprod -show-encoding < %s 2> %t +// RUN: not llvm-mc -triple aarch64 -mattr=+v8r -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s // RUN: not llvm-mc -triple aarch64 -mcpu=cortex-r82 -mattr=-dotprod -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s diff --git a/llvm/test/MC/AArch64/armv8r-sysreg.s b/llvm/test/MC/AArch64/armv8r-sysreg.s index 0f9cb05622d2a..46a23d5827b29 100644 --- a/llvm/test/MC/AArch64/armv8r-sysreg.s +++ b/llvm/test/MC/AArch64/armv8r-sysreg.s @@ -1,4 +1,7 @@ // RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+v8r -o - %s | FileCheck %s +// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+v8r,-fp-armv8,-rdm,-dotprod,-complxnum,-jsconv -o - %s | FileCheck %s +// RUN: llvm-mc -triple aarch64 -show-encoding -mcpu=cortex-r82 -o - %s | FileCheck %s +// RUN: llvm-mc -triple aarch64 -show-encoding -mcpu=cortex-r82 -mattr=-fp-armv8,-rdm,-dotprod,-complxnum,-jsconv -o - %s | FileCheck %s .text mrs x0, VSCTLR_EL2 mrs x0, MPUIR_EL1 diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.3a-complex.txt b/llvm/test/MC/Disassembler/AArch64/armv8.3a-complex.txt index f920639173ad2..ea8b498fdc6e7 100644 --- a/llvm/test/MC/Disassembler/AArch64/armv8.3a-complex.txt +++ b/llvm/test/MC/Disassembler/AArch64/armv8.3a-complex.txt @@ -1,7 +1,7 @@ # RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.3a,-fullfp16 --disassemble < %s 2>%t | FileCheck %s --check-prefix=CHECK # RUN: FileCheck %s < %t --check-prefix=NO-FP16 # RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.3a,+fullfp16 --disassemble < %s 2>%t | FileCheck %s --check-prefix=CHECK 
--check-prefix=FP16 -# RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8r --disassemble < %s 2>%t | FileCheck %s --check-prefix=CHECK +# RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8r,+complxnum --disassemble < %s 2>%t | FileCheck %s --check-prefix=CHECK # RUN: FileCheck %s < %t --check-prefix=NO-FP16 # RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=-v8.3a,+fullfp16 --disassemble < %s 2>&1 | FileCheck %s --check-prefix=NO-V83A diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.3a-js.txt b/llvm/test/MC/Disassembler/AArch64/armv8.3a-js.txt index 66c8c3001e982..22144cf64d965 100644 --- a/llvm/test/MC/Disassembler/AArch64/armv8.3a-js.txt +++ b/llvm/test/MC/Disassembler/AArch64/armv8.3a-js.txt @@ -1,4 +1,4 @@ # RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.3a --disassemble < %s | FileCheck %s -# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8r --disassemble < %s | FileCheck %s +# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8r,+jsconv --disassemble < %s | FileCheck %s # CHECK: fjcvtzs w0, d0 [0x00,0x00,0x7e,0x1e] From 182ab1c7034b951433fb8831b67e7758fe61d4e8 Mon Sep 17 00:00:00 2001 From: Rainer Orth Date: Wed, 24 Jan 2024 14:33:45 +0100 Subject: [PATCH 783/843] [Support] Adjust .note.GNU-stack guard in Support/BLAKE3/blake3_*_x86-64_unix.S (#76229) When using GNU ld 2.41 on FreeBSD 14.0/amd64, there are linker warnings like ``` /vol/gcc/bin/gld-2.41: warning: blake3_avx512_x86-64_unix.S.o: missing .note.GNU-stack section implies executable stack /vol/gcc/bin/gld-2.41: NOTE: This behaviour is deprecated and will be removed in a future version of the linker ``` This can be fixed by adjusting the guard of the `.note.GNU-stack` sections in `blake3_*_x86-64_unix.S` to match `llvm/lib/MC/MCAsmInfoELF.cpp:MCAsmInfoELF::getNonexecutableStackSection` which emits the section on all ELF targets but Solaris. Tested on `amd64-pc-freebsd14.0`. 
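For reference, the adjusted guard below is self-contained and can be reused
verbatim in other hand-written ELF assembly files (a minimal sketch; it skips
only Solaris, where the section is not emitted):

  #if defined(__ELF__) && !(defined(__sun__) && defined(__svr4__))
  .section .note.GNU-stack,"",%progbits
  #endif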
---
 llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S   | 2 +-
 llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S | 2 +-
 llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S   | 2 +-
 llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S  | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
index 69fc0936d73c5..e98893c7ef8b8 100644
--- a/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx2_x86-64_unix.S
@@ -2,7 +2,7 @@
 
 #include "llvm_blake3_prefix.h"
 
-#if defined(__ELF__) && defined(__linux__)
+#if defined(__ELF__) && !(defined(__sun__) && defined(__svr4__))
 .section .note.GNU-stack,"",%progbits
 #endif
 
diff --git a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
index f04a135dd1bc4..224605147c3d7 100644
--- a/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_avx512_x86-64_unix.S
@@ -2,7 +2,7 @@
 
 #include "llvm_blake3_prefix.h"
 
-#if defined(__ELF__) && defined(__linux__)
+#if defined(__ELF__) && !(defined(__sun__) && defined(__svr4__))
 .section .note.GNU-stack,"",%progbits
 #endif
 
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
index 9a4f5eb7318bf..d69a1706fefe7 100644
--- a/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_sse2_x86-64_unix.S
@@ -2,7 +2,7 @@
 
 #include "llvm_blake3_prefix.h"
 
-#if defined(__ELF__) && defined(__linux__)
+#if defined(__ELF__) && !(defined(__sun__) && defined(__svr4__))
 .section .note.GNU-stack,"",%progbits
 #endif
 
diff --git a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
index 1be4ed744426b..c5b103af61c4f 100644
--- a/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
+++ b/llvm/lib/Support/BLAKE3/blake3_sse41_x86-64_unix.S
@@ -2,7 +2,7 @@
 
 #include "llvm_blake3_prefix.h"
 
-#if defined(__ELF__) && defined(__linux__)
+#if defined(__ELF__) && !(defined(__sun__) && defined(__svr4__))
 .section .note.GNU-stack,"",%progbits
 #endif
 

From 8b43c1be23119c1024bed0a8ce392bc73727e2e2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 24 Jan 2024 14:00:51 +0000
Subject: [PATCH 784/843] [X86] X86FixupVectorConstants - shrink vector load to
 movsd/movss/movd/movq 'zero upper' instructions (#79000)

If we're loading a vector constant that is known to be zero in the upper
elements, then attempt to shrink the constant and just scalar load the lower
32/64 bits.

Always choose the vzload/broadcast with the smallest constant load, and
prefer vzload over broadcasts for the same bitwidth to avoid domain flips
(mainly an AVX1 issue).
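For illustration, one of the test deltas below shows the effect on a vector
constant whose upper 96 bits are zero: the full 16-byte constant-pool load
becomes a 4-byte scalar load that implicitly zeroes the upper lanes.

  -; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [4,0,0,0]
  +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = [4,0,0,0]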
Fixes #73783 --- .../Target/X86/X86FixupVectorConstants.cpp | 177 ++++++++++------ .../test/CodeGen/X86/2011-20-21-zext-ui2fp.ll | 2 +- .../any_extend_vector_inreg_of_broadcast.ll | 5 +- ...d_vector_inreg_of_broadcast_from_memory.ll | 5 +- llvm/test/CodeGen/X86/avx-load-store.ll | 2 +- llvm/test/CodeGen/X86/avx2-arith.ll | 2 +- .../avx512-shuffles/shuffle-chained-bf16.ll | 2 +- llvm/test/CodeGen/X86/bitreverse.ll | 6 +- llvm/test/CodeGen/X86/combine-srl.ll | 4 +- llvm/test/CodeGen/X86/combine-subo.ll | 4 +- .../test/CodeGen/X86/constant-pool-sharing.ll | 4 +- llvm/test/CodeGen/X86/dpbusd.ll | 2 +- llvm/test/CodeGen/X86/dpbusd_const.ll | 6 +- .../CodeGen/X86/expand-vp-cast-intrinsics.ll | 2 +- llvm/test/CodeGen/X86/fcmp-constant.ll | 2 +- .../test/CodeGen/X86/fold-vector-sext-zext.ll | 24 +-- llvm/test/CodeGen/X86/half.ll | 4 +- llvm/test/CodeGen/X86/icmp-pow2-mask.ll | 4 +- .../X86/insert-into-constant-vector.ll | 8 +- llvm/test/CodeGen/X86/insertelement-ones.ll | 4 +- .../X86/masked_gather_scatter_widen.ll | 2 +- llvm/test/CodeGen/X86/masked_store_trunc.ll | 8 +- .../CodeGen/X86/memcmp-more-load-pairs.ll | 2 +- llvm/test/CodeGen/X86/memcmp.ll | 2 +- llvm/test/CodeGen/X86/pmaddubsw.ll | 7 +- llvm/test/CodeGen/X86/pr46532.ll | 2 +- llvm/test/CodeGen/X86/pr63108.ll | 12 +- llvm/test/CodeGen/X86/pr74736.ll | 2 +- llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll | 2 +- llvm/test/CodeGen/X86/pshufb-mask-comments.ll | 2 +- llvm/test/CodeGen/X86/ret-mmx.ll | 2 +- llvm/test/CodeGen/X86/sad.ll | 2 +- llvm/test/CodeGen/X86/sext-vsetcc.ll | 4 +- llvm/test/CodeGen/X86/shrink_vmul.ll | 4 +- llvm/test/CodeGen/X86/shuffle-half.ll | 2 +- .../X86/shuffle-strided-with-offset-256.ll | 143 +++++-------- llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll | 76 +++---- llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll | 4 +- llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll | 56 ++--- llvm/test/CodeGen/X86/vec_anyext.ll | 8 +- llvm/test/CodeGen/X86/vec_fp_to_int.ll | 10 +- llvm/test/CodeGen/X86/vec_set-A.ll | 4 +- llvm/test/CodeGen/X86/vector-blend.ll | 18 +- .../X86/vector-constrained-fp-intrinsics.ll | 2 +- llvm/test/CodeGen/X86/vector-fshl-128.ll | 16 +- llvm/test/CodeGen/X86/vector-fshl-256.ll | 6 +- llvm/test/CodeGen/X86/vector-fshl-rot-128.ll | 12 +- llvm/test/CodeGen/X86/vector-fshl-rot-256.ll | 2 +- .../CodeGen/X86/vector-fshl-rot-sub128.ll | 6 +- llvm/test/CodeGen/X86/vector-fshl-sub128.ll | 2 +- llvm/test/CodeGen/X86/vector-fshr-128.ll | 16 +- llvm/test/CodeGen/X86/vector-fshr-256.ll | 4 +- llvm/test/CodeGen/X86/vector-fshr-rot-128.ll | 12 +- llvm/test/CodeGen/X86/vector-fshr-rot-256.ll | 2 +- .../CodeGen/X86/vector-fshr-rot-sub128.ll | 6 +- llvm/test/CodeGen/X86/vector-fshr-sub128.ll | 2 +- .../CodeGen/X86/vector-half-conversions.ll | 4 +- .../vector-interleaved-load-i16-stride-3.ll | 16 +- .../vector-interleaved-load-i16-stride-5.ll | 14 +- .../vector-interleaved-load-i16-stride-6.ll | 14 +- .../vector-interleaved-load-i16-stride-7.ll | 42 ++-- .../vector-interleaved-load-i16-stride-8.ll | 50 ++--- .../vector-interleaved-load-i32-stride-3.ll | 2 +- .../vector-interleaved-load-i32-stride-4.ll | 8 +- .../vector-interleaved-load-i32-stride-6.ll | 24 +-- .../vector-interleaved-load-i32-stride-7.ll | 45 ++-- .../vector-interleaved-load-i32-stride-8.ll | 2 +- .../vector-interleaved-load-i8-stride-2.ll | 13 +- .../vector-interleaved-load-i8-stride-4.ll | 108 ++++------ .../vector-interleaved-load-i8-stride-5.ll | 82 +++---- .../vector-interleaved-load-i8-stride-6.ll | 91 +++----- .../vector-interleaved-load-i8-stride-7.ll | 
122 +++++------ .../vector-interleaved-load-i8-stride-8.ll | 200 +++++++++--------- .../vector-interleaved-store-i8-stride-5.ll | 6 +- .../vector-interleaved-store-i8-stride-7.ll | 3 +- .../vector-interleaved-store-i8-stride-8.ll | 30 +-- llvm/test/CodeGen/X86/vector-lzcnt-128.ll | 112 +++++----- llvm/test/CodeGen/X86/vector-lzcnt-256.ll | 16 +- .../CodeGen/X86/vector-mulfix-legalize.ll | 4 +- .../CodeGen/X86/vector-reduce-add-mask.ll | 4 +- llvm/test/CodeGen/X86/vector-rotate-128.ll | 14 +- llvm/test/CodeGen/X86/vector-rotate-256.ll | 2 +- .../CodeGen/X86/vector-shift-ashr-sub128.ll | 6 +- .../CodeGen/X86/vector-shift-lshr-sub128.ll | 6 +- .../CodeGen/X86/vector-shift-shl-sub128.ll | 4 +- .../CodeGen/X86/vector-shuffle-128-v16.ll | 11 +- .../CodeGen/X86/vector-shuffle-256-v16.ll | 8 +- .../CodeGen/X86/vector-shuffle-256-v32.ll | 34 ++- .../test/CodeGen/X86/vector-shuffle-256-v8.ll | 8 +- .../CodeGen/X86/vector-shuffle-512-v32.ll | 4 +- .../CodeGen/X86/vector-shuffle-512-v64.ll | 8 +- .../test/CodeGen/X86/vector-shuffle-512-v8.ll | 4 +- .../CodeGen/X86/vector-shuffle-combining.ll | 56 ++--- llvm/test/CodeGen/X86/vector-shuffle-v1.ll | 6 +- llvm/test/CodeGen/X86/vector-trunc.ll | 4 +- llvm/test/CodeGen/X86/vector-tzcnt-128.ll | 28 +-- llvm/test/CodeGen/X86/vselect-constants.ll | 4 +- llvm/test/CodeGen/X86/vselect-post-combine.ll | 2 +- llvm/test/CodeGen/X86/widen_bitcnt.ll | 20 +- .../CodeGen/X86/x86-interleaved-access.ll | 24 +-- .../CodeGen/X86/zero_extend_vector_inreg.ll | 64 +++--- .../zero_extend_vector_inreg_of_broadcast.ll | 17 +- ...d_vector_inreg_of_broadcast_from_memory.ll | 17 +- 103 files changed, 959 insertions(+), 1135 deletions(-) diff --git a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp index 9da8d7338ea36..ca52cb384a7ee 100644 --- a/llvm/lib/Target/X86/X86FixupVectorConstants.cpp +++ b/llvm/lib/Target/X86/X86FixupVectorConstants.cpp @@ -67,6 +67,9 @@ FunctionPass *llvm::createX86FixupVectorConstants() { static std::optional extractConstantBits(const Constant *C) { unsigned NumBits = C->getType()->getPrimitiveSizeInBits(); + if (auto *CUndef = dyn_cast(C)) + return APInt::getZero(NumBits); + if (auto *CInt = dyn_cast(C)) return CInt->getValue(); @@ -80,6 +83,18 @@ static std::optional extractConstantBits(const Constant *C) { return APInt::getSplat(NumBits, *Bits); } } + + APInt Bits = APInt::getZero(NumBits); + for (unsigned I = 0, E = CV->getNumOperands(); I != E; ++I) { + Constant *Elt = CV->getOperand(I); + std::optional SubBits = extractConstantBits(Elt); + if (!SubBits) + return std::nullopt; + assert(NumBits == (E * SubBits->getBitWidth()) && + "Illegal vector element size"); + Bits.insertBits(*SubBits, I * SubBits->getBitWidth()); + } + return Bits; } if (auto *CDS = dyn_cast(C)) { @@ -223,6 +238,35 @@ static Constant *rebuildSplatableConstant(const Constant *C, return rebuildConstant(OriginalType->getContext(), SclTy, *Splat, NumSclBits); } +static Constant *rebuildZeroUpperConstant(const Constant *C, + unsigned ScalarBitWidth) { + Type *Ty = C->getType(); + Type *SclTy = Ty->getScalarType(); + unsigned NumBits = Ty->getPrimitiveSizeInBits(); + unsigned NumSclBits = SclTy->getPrimitiveSizeInBits(); + LLVMContext &Ctx = C->getContext(); + + if (NumBits > ScalarBitWidth) { + // Determine if the upper bits are all zero. 
+    if (std::optional<APInt> Bits = extractConstantBits(C)) {
+      if (Bits->countLeadingZeros() >= (NumBits - ScalarBitWidth)) {
+        // If the original constant was made of smaller elements, try to retain
+        // those types.
+        if (ScalarBitWidth > NumSclBits && (ScalarBitWidth % NumSclBits) == 0)
+          return rebuildConstant(Ctx, SclTy, *Bits, NumSclBits);
+
+        // Fallback to raw integer bits.
+        APInt RawBits = Bits->zextOrTrunc(ScalarBitWidth);
+        return ConstantInt::get(Ctx, RawBits);
+      }
+    }
+  }
+
+  return nullptr;
+}
+
+typedef std::function<Constant *(const Constant *, unsigned)> RebuildFn;
+
 bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
                                                      MachineBasicBlock &MBB,
                                                      MachineInstr &MI) {
@@ -233,37 +277,45 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF,
   bool HasBWI = ST->hasBWI();
   bool HasVLX = ST->hasVLX();
 
-  auto ConvertToBroadcast = [&](unsigned OpBcst256, unsigned OpBcst128,
-                                unsigned OpBcst64, unsigned OpBcst32,
-                                unsigned OpBcst16, unsigned OpBcst8,
-                                unsigned OperandNo) {
-    assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
-           "Unexpected number of operands!");
-
-    if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
-      // Attempt to detect a suitable splat from increasing splat widths.
-      std::pair<unsigned, unsigned> Broadcasts[] = {
-          {8, OpBcst8},   {16, OpBcst16},   {32, OpBcst32},
-          {64, OpBcst64}, {128, OpBcst128}, {256, OpBcst256},
-      };
-      for (auto [BitWidth, OpBcst] : Broadcasts) {
-        if (OpBcst) {
-          // Construct a suitable splat constant and adjust the MI to
-          // use the new constant pool entry.
-          if (Constant *NewCst = rebuildSplatableConstant(C, BitWidth)) {
-            unsigned NewCPI =
-                CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
-            MI.setDesc(TII->get(OpBcst));
-            MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
-            return true;
+  auto FixupConstant =
+      [&](unsigned OpBcst256, unsigned OpBcst128, unsigned OpBcst64,
+          unsigned OpBcst32, unsigned OpBcst16, unsigned OpBcst8,
+          unsigned OpUpper64, unsigned OpUpper32, unsigned OperandNo) {
+        assert(MI.getNumOperands() >= (OperandNo + X86::AddrNumOperands) &&
+               "Unexpected number of operands!");
+
+        if (auto *C = X86::getConstantFromPool(MI, OperandNo)) {
+          // Attempt to detect a suitable splat/vzload from increasing constant
+          // bitwidths.
+          // Prefer vzload vs broadcast for same bitwidth to avoid domain flips.
+          std::tuple<unsigned, unsigned, RebuildFn> FixupLoad[] = {
+              {8, OpBcst8, rebuildSplatableConstant},
+              {16, OpBcst16, rebuildSplatableConstant},
+              {32, OpUpper32, rebuildZeroUpperConstant},
+              {32, OpBcst32, rebuildSplatableConstant},
+              {64, OpUpper64, rebuildZeroUpperConstant},
+              {64, OpBcst64, rebuildSplatableConstant},
+              {128, OpBcst128, rebuildSplatableConstant},
+              {256, OpBcst256, rebuildSplatableConstant},
+          };
+          for (auto [BitWidth, Op, RebuildConstant] : FixupLoad) {
+            if (Op) {
+              // Construct a suitable constant and adjust the MI to use the new
+              // constant pool entry.
+              if (Constant *NewCst = RebuildConstant(C, BitWidth)) {
+                unsigned NewCPI =
+                    CP->getConstantPoolIndex(NewCst, Align(BitWidth / 8));
+                MI.setDesc(TII->get(Op));
+                MI.getOperand(OperandNo + X86::AddrDisp).setIndex(NewCPI);
+                return true;
+              }
+            }
           }
         }
-      }
-    }
-    return false;
-  };
+        return false;
+      };
 
-  // Attempt to convert full width vector loads into broadcast loads.
+  // Attempt to convert full width vector loads into broadcast/vzload loads.
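// Illustrative note, not part of the patch: given the table above, a 128-bit
// constant-pool load such as MOVAPSrm of <4,0,0,0> can now be rewritten to
// MOVSSrm with a 4-byte pool entry, because the scalar load zeroes bits
// 127:32 of the destination register; the 32-bit vzload entry is tried
// before any broadcast of the same width.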
switch (Opc) { /* FP Loads */ case X86::MOVAPDrm: @@ -271,79 +323,82 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, case X86::MOVUPDrm: case X86::MOVUPSrm: // TODO: SSE3 MOVDDUP Handling - return false; + return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVSDrm, X86::MOVSSrm, 1); case X86::VMOVAPDrm: case X86::VMOVAPSrm: case X86::VMOVUPDrm: case X86::VMOVUPSrm: - return ConvertToBroadcast(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0, - 1); + return FixupConstant(0, 0, X86::VMOVDDUPrm, X86::VBROADCASTSSrm, 0, 0, + X86::VMOVSDrm, X86::VMOVSSrm, 1); case X86::VMOVAPDYrm: case X86::VMOVAPSYrm: case X86::VMOVUPDYrm: case X86::VMOVUPSYrm: - return ConvertToBroadcast(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm, - X86::VBROADCASTSSYrm, 0, 0, 1); + return FixupConstant(0, X86::VBROADCASTF128rm, X86::VBROADCASTSDYrm, + X86::VBROADCASTSSYrm, 0, 0, 0, 0, 1); case X86::VMOVAPDZ128rm: case X86::VMOVAPSZ128rm: case X86::VMOVUPDZ128rm: case X86::VMOVUPSZ128rm: - return ConvertToBroadcast(0, 0, X86::VMOVDDUPZ128rm, - X86::VBROADCASTSSZ128rm, 0, 0, 1); + return FixupConstant(0, 0, X86::VMOVDDUPZ128rm, X86::VBROADCASTSSZ128rm, 0, + 0, X86::VMOVSDZrm, X86::VMOVSSZrm, 1); case X86::VMOVAPDZ256rm: case X86::VMOVAPSZ256rm: case X86::VMOVUPDZ256rm: case X86::VMOVUPSZ256rm: - return ConvertToBroadcast(0, X86::VBROADCASTF32X4Z256rm, - X86::VBROADCASTSDZ256rm, X86::VBROADCASTSSZ256rm, - 0, 0, 1); + return FixupConstant(0, X86::VBROADCASTF32X4Z256rm, X86::VBROADCASTSDZ256rm, + X86::VBROADCASTSSZ256rm, 0, 0, 0, 0, 1); case X86::VMOVAPDZrm: case X86::VMOVAPSZrm: case X86::VMOVUPDZrm: case X86::VMOVUPSZrm: - return ConvertToBroadcast(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm, - X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, - 1); + return FixupConstant(X86::VBROADCASTF64X4rm, X86::VBROADCASTF32X4rm, + X86::VBROADCASTSDZrm, X86::VBROADCASTSSZrm, 0, 0, 0, 0, + 1); /* Integer Loads */ + case X86::MOVDQArm: + case X86::MOVDQUrm: + return FixupConstant(0, 0, 0, 0, 0, 0, X86::MOVQI2PQIrm, X86::MOVDI2PDIrm, + 1); case X86::VMOVDQArm: case X86::VMOVDQUrm: - return ConvertToBroadcast( - 0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm, - HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm, - HasAVX2 ? X86::VPBROADCASTWrm : 0, HasAVX2 ? X86::VPBROADCASTBrm : 0, - 1); + return FixupConstant(0, 0, HasAVX2 ? X86::VPBROADCASTQrm : X86::VMOVDDUPrm, + HasAVX2 ? X86::VPBROADCASTDrm : X86::VBROADCASTSSrm, + HasAVX2 ? X86::VPBROADCASTWrm : 0, + HasAVX2 ? X86::VPBROADCASTBrm : 0, X86::VMOVQI2PQIrm, + X86::VMOVDI2PDIrm, 1); case X86::VMOVDQAYrm: case X86::VMOVDQUYrm: - return ConvertToBroadcast( + return FixupConstant( 0, HasAVX2 ? X86::VBROADCASTI128rm : X86::VBROADCASTF128rm, HasAVX2 ? X86::VPBROADCASTQYrm : X86::VBROADCASTSDYrm, HasAVX2 ? X86::VPBROADCASTDYrm : X86::VBROADCASTSSYrm, HasAVX2 ? X86::VPBROADCASTWYrm : 0, HasAVX2 ? X86::VPBROADCASTBYrm : 0, - 1); + 0, 0, 1); case X86::VMOVDQA32Z128rm: case X86::VMOVDQA64Z128rm: case X86::VMOVDQU32Z128rm: case X86::VMOVDQU64Z128rm: - return ConvertToBroadcast(0, 0, X86::VPBROADCASTQZ128rm, - X86::VPBROADCASTDZ128rm, - HasBWI ? X86::VPBROADCASTWZ128rm : 0, - HasBWI ? X86::VPBROADCASTBZ128rm : 0, 1); + return FixupConstant(0, 0, X86::VPBROADCASTQZ128rm, X86::VPBROADCASTDZ128rm, + HasBWI ? X86::VPBROADCASTWZ128rm : 0, + HasBWI ? 
X86::VPBROADCASTBZ128rm : 0, + X86::VMOVQI2PQIZrm, X86::VMOVDI2PDIZrm, 1); case X86::VMOVDQA32Z256rm: case X86::VMOVDQA64Z256rm: case X86::VMOVDQU32Z256rm: case X86::VMOVDQU64Z256rm: - return ConvertToBroadcast(0, X86::VBROADCASTI32X4Z256rm, - X86::VPBROADCASTQZ256rm, X86::VPBROADCASTDZ256rm, - HasBWI ? X86::VPBROADCASTWZ256rm : 0, - HasBWI ? X86::VPBROADCASTBZ256rm : 0, 1); + return FixupConstant(0, X86::VBROADCASTI32X4Z256rm, X86::VPBROADCASTQZ256rm, + X86::VPBROADCASTDZ256rm, + HasBWI ? X86::VPBROADCASTWZ256rm : 0, + HasBWI ? X86::VPBROADCASTBZ256rm : 0, 0, 0, 1); case X86::VMOVDQA32Zrm: case X86::VMOVDQA64Zrm: case X86::VMOVDQU32Zrm: case X86::VMOVDQU64Zrm: - return ConvertToBroadcast(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm, - X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm, - HasBWI ? X86::VPBROADCASTWZrm : 0, - HasBWI ? X86::VPBROADCASTBZrm : 0, 1); + return FixupConstant(X86::VBROADCASTI64X4rm, X86::VBROADCASTI32X4rm, + X86::VPBROADCASTQZrm, X86::VPBROADCASTDZrm, + HasBWI ? X86::VPBROADCASTWZrm : 0, + HasBWI ? X86::VPBROADCASTBZrm : 0, 0, 0, 1); } auto ConvertToBroadcastAVX512 = [&](unsigned OpSrc32, unsigned OpSrc64) { @@ -368,7 +423,7 @@ bool X86FixupVectorConstantsPass::processInstruction(MachineFunction &MF, if (OpBcst32 || OpBcst64) { unsigned OpNo = OpBcst32 == 0 ? OpNoBcst64 : OpNoBcst32; - return ConvertToBroadcast(0, 0, OpBcst64, OpBcst32, 0, 0, OpNo); + return FixupConstant(0, 0, OpBcst64, OpBcst32, 0, 0, 0, 0, OpNo); } return false; }; diff --git a/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll b/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll index e4e5d51d272d6..1ae51ee975638 100644 --- a/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll +++ b/llvm/test/CodeGen/X86/2011-20-21-zext-ui2fp.ll @@ -7,7 +7,7 @@ define void @ui_to_fp_conv(ptr nocapture %aFOO, ptr nocapture %RET) nounwind { ; CHECK-LABEL: ui_to_fp_conv: ; CHECK: # %bb.0: # %allocas -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [1.0E+0,1.0E+0,0.0E+0,0.0E+0] ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: movups %xmm1, 16(%rsi) ; CHECK-NEXT: movups %xmm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index fe48059f9d0e6..e592b714a05dc 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = 
xmm0[0,0,0,0,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 2f576fe671590..d3f6bd20a0127 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/avx-load-store.ll b/llvm/test/CodeGen/X86/avx-load-store.ll index 33eb704788740..3f856d33145d8 100644 --- a/llvm/test/CodeGen/X86/avx-load-store.ll +++ b/llvm/test/CodeGen/X86/avx-load-store.ll @@ -220,7 +220,7 @@ define void @f_f() nounwind { ; CHECK-NEXT: testb %al, %al ; CHECK-NEXT: jne .LBB9_4 ; CHECK-NEXT: # %bb.3: # %cif_mixed_test_all -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0] +; CHECK-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0] ; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax) ; CHECK-NEXT: .LBB9_4: # %cif_mixed_test_any_check ; diff --git a/llvm/test/CodeGen/X86/avx2-arith.ll b/llvm/test/CodeGen/X86/avx2-arith.ll index 3e69581171944..d32143cf33f2f 100644 --- a/llvm/test/CodeGen/X86/avx2-arith.ll +++ b/llvm/test/CodeGen/X86/avx2-arith.ll @@ -234,7 +234,7 @@ define <8 x i16> @mul_const8(<8 x i16> %x) { define <8 x i32> @mul_const9(<8 x i32> %x) { ; CHECK-LABEL: mul_const9: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0] +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0] ; CHECK-NEXT: vpmulld %ymm1, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} %y = mul <8 x i32> %x, diff --git a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll index 99d6049fc1d86..12ce721b8c5d5 100644 --- a/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll +++ b/llvm/test/CodeGen/X86/avx512-shuffles/shuffle-chained-bf16.ll @@ -13,7 +13,7 @@ define <2 x bfloat> @shuffle_chained_v32bf16_v2bf16(<32 x bfloat> %a) { ; CHECK-NEXT: .cfi_def_cfa_register %rbp ; CHECK-NEXT: andq $-64, %rsp ; CHECK-NEXT: subq $128, %rsp -; CHECK-NEXT: vpbroadcastd {{.*#+}} xmm1 = [0,16,0,16,0,16,0,16] +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [0,16,0,0,0,0,0,0] ; CHECK-NEXT: vpermw %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: vmovdqa64 %zmm0, (%rsp) ; CHECK-NEXT: vmovaps (%rsp), %xmm0 diff --git a/llvm/test/CodeGen/X86/bitreverse.ll b/llvm/test/CodeGen/X86/bitreverse.ll index 2d978b5e991c9..26b1d64874e59 100644 --- a/llvm/test/CodeGen/X86/bitreverse.ll +++ 
b/llvm/test/CodeGen/X86/bitreverse.ll @@ -587,17 +587,17 @@ define <2 x i16> @fold_v2i16() { ; ; X64-LABEL: fold_v2i16: ; X64: # %bb.0: -; X64-NEXT: movaps {{.*#+}} xmm0 = [61440,240,u,u,u,u,u,u] +; X64-NEXT: movss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0] ; X64-NEXT: retq ; ; X86XOP-LABEL: fold_v2i16: ; X86XOP: # %bb.0: -; X86XOP-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240] +; X86XOP-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0] ; X86XOP-NEXT: retl ; ; GFNI-LABEL: fold_v2i16: ; GFNI: # %bb.0: -; GFNI-NEXT: vbroadcastss {{.*#+}} xmm0 = [61440,240,61440,240,61440,240,61440,240] +; GFNI-NEXT: vmovss {{.*#+}} xmm0 = [61440,240,0,0,0,0,0,0] ; GFNI-NEXT: retq %b = call <2 x i16> @llvm.bitreverse.v2i16(<2 x i16> ) ret <2 x i16> %b diff --git a/llvm/test/CodeGen/X86/combine-srl.ll b/llvm/test/CodeGen/X86/combine-srl.ll index 3e0f581ea01a0..125196a0819b6 100644 --- a/llvm/test/CodeGen/X86/combine-srl.ll +++ b/llvm/test/CodeGen/X86/combine-srl.ll @@ -356,7 +356,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) { ; SSE-LABEL: combine_vec_lshr_lzcnt_bit1: ; SSE: # %bb.0: ; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: pshufb %xmm0, %xmm2 ; SSE-NEXT: psrlw $4, %xmm0 @@ -378,7 +378,7 @@ define <4 x i32> @combine_vec_lshr_lzcnt_bit1(<4 x i32> %x) { ; AVX-LABEL: combine_vec_lshr_lzcnt_bit1: ; AVX: # %bb.0: ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 diff --git a/llvm/test/CodeGen/X86/combine-subo.ll b/llvm/test/CodeGen/X86/combine-subo.ll index 235df0a666ee9..5e4bba6e0fd35 100644 --- a/llvm/test/CodeGen/X86/combine-subo.ll +++ b/llvm/test/CodeGen/X86/combine-subo.ll @@ -217,13 +217,13 @@ define { <4 x i8>, <4 x i1> } @always_usub_const_vector() nounwind { define { <4 x i8>, <4 x i1> } @never_usub_const_vector() nounwind { ; SSE-LABEL: never_usub_const_vector: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [127,255,0,254,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE-NEXT: movss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE-NEXT: xorps %xmm1, %xmm1 ; SSE-NEXT: retq ; ; AVX-LABEL: never_usub_const_vector: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastss {{.*#+}} xmm0 = [127,255,0,254,127,255,0,254,127,255,0,254,127,255,0,254] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [127,255,0,254,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; AVX-NEXT: retq %x = call { <4 x i8>, <4 x i1> } @llvm.usub.with.overflow.v4i8(<4 x i8> , <4 x i8> ) diff --git a/llvm/test/CodeGen/X86/constant-pool-sharing.ll b/llvm/test/CodeGen/X86/constant-pool-sharing.ll index db5a7810974c9..062d87ed035fd 100644 --- a/llvm/test/CodeGen/X86/constant-pool-sharing.ll +++ b/llvm/test/CodeGen/X86/constant-pool-sharing.ll @@ -77,7 +77,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) { ; SSE-LINUX: # %bb.0: ; SSE-LINUX-NEXT: xorps %xmm0, %xmm0 ; SSE-LINUX-NEXT: movaps %xmm0, 48(%rdi) -; SSE-LINUX-NEXT: movaps {{.*#+}} xmm1 = [18446744073709551615,0] +; SSE-LINUX-NEXT: movsd {{.*#+}} xmm1 = [18446744073709551615,0] ; SSE-LINUX-NEXT: movaps %xmm1, 32(%rdi) ; SSE-LINUX-NEXT: movaps %xmm1, 16(%rdi) ; SSE-LINUX-NEXT: movaps 
%xmm1, (%rdi) @@ -92,7 +92,7 @@ define void @store_repeated_constants(ptr %lo, ptr %hi) { ; SSE-MSVC: # %bb.0: ; SSE-MSVC-NEXT: xorps %xmm0, %xmm0 ; SSE-MSVC-NEXT: movaps %xmm0, 48(%rcx) -; SSE-MSVC-NEXT: movaps {{.*#+}} xmm1 = [18446744073709551615,0] +; SSE-MSVC-NEXT: movsd {{.*#+}} xmm1 = [18446744073709551615,0] ; SSE-MSVC-NEXT: movaps %xmm1, 32(%rcx) ; SSE-MSVC-NEXT: movaps %xmm1, 16(%rcx) ; SSE-MSVC-NEXT: movaps %xmm1, (%rcx) diff --git a/llvm/test/CodeGen/X86/dpbusd.ll b/llvm/test/CodeGen/X86/dpbusd.ll index 06f11e6d527bd..fbea08eb1e550 100644 --- a/llvm/test/CodeGen/X86/dpbusd.ll +++ b/llvm/test/CodeGen/X86/dpbusd.ll @@ -379,7 +379,7 @@ define i32 @vpdpbusd_2xi32(ptr%a, ptr%b, i32 %c, i32 %n) { ; AVX512VNNI-LABEL: vpdpbusd_2xi32: ; AVX512VNNI: # %bb.0: # %entry ; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero -; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512VNNI-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VNNI-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX512VNNI-NEXT: vpandq %zmm1, %zmm2, %zmm1 diff --git a/llvm/test/CodeGen/X86/dpbusd_const.ll b/llvm/test/CodeGen/X86/dpbusd_const.ll index 2fe9feb06dff6..5862e614265b1 100644 --- a/llvm/test/CodeGen/X86/dpbusd_const.ll +++ b/llvm/test/CodeGen/X86/dpbusd_const.ll @@ -108,7 +108,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVXVNNI: # %bb.0: # %entry ; AVXVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVXVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVXVNNI-NEXT: vmovdqa {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVXVNNI-NEXT: vmovd {{.*#+}} xmm2 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVXVNNI-NEXT: {vex} vpdpbusd %xmm0, %xmm2, %xmm1 ; AVXVNNI-NEXT: vmovd %xmm1, %eax ; AVXVNNI-NEXT: addl %edi, %eax @@ -118,7 +118,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVX512VNNI: # %bb.0: # %entry ; AVX512VNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VNNI-NEXT: vpdpbusd %zmm0, %zmm1, %zmm2 ; AVX512VNNI-NEXT: vmovd %xmm2, %eax @@ -130,7 +130,7 @@ define i32 @mul_4xi8_cs(<4 x i8> %a, i32 %c) { ; AVX512VLVNNI: # %bb.0: # %entry ; AVX512VLVNNI-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX512VLVNNI-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX512VLVNNI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVNNI-NEXT: vmovd {{.*#+}} xmm1 = [0,1,2,255,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVNNI-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512VLVNNI-NEXT: vpdpbusd %xmm0, %xmm1, %xmm2 ; AVX512VLVNNI-NEXT: vmovd %xmm2, %eax diff --git a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll index 3b015acb69bd2..0a52dfff71eda 100644 --- a/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll +++ b/llvm/test/CodeGen/X86/expand-vp-cast-intrinsics.ll @@ -532,7 +532,7 @@ define <2 x half> @vfptrunc_v2f16_v2f64(<2 x double> %a, <2 x i1> %m, i32 zeroex ; AVX512-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512-NEXT: callq __truncdfhf2@PLT ; AVX512-NEXT: vpbroadcastw %xmm0, %xmm1 -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,0,0] +; AVX512-NEXT: vmovss {{.*#+}} xmm0 = [4,0,0,0] ; AVX512-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload ; AVX512-NEXT: addq $40, %rsp ; AVX512-NEXT: .cfi_def_cfa_offset 8 
diff --git a/llvm/test/CodeGen/X86/fcmp-constant.ll b/llvm/test/CodeGen/X86/fcmp-constant.ll index 481a32b39dd37..335cb28213f92 100644 --- a/llvm/test/CodeGen/X86/fcmp-constant.ll +++ b/llvm/test/CodeGen/X86/fcmp-constant.ll @@ -92,7 +92,7 @@ define <2 x i64> @fcmp_ueq_v2f64_undef() { define <2 x i64> @fcmp_ueq_v2f64_undef_elt() { ; CHECK-LABEL: fcmp_ueq_v2f64_undef_elt: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [18446744073709551615,0] +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [18446744073709551615,0] ; CHECK-NEXT: retq %1 = fcmp ueq <2 x double> , %2 = sext <2 x i1> %1 to <2 x i64> diff --git a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll index 3f8bd24c38049..d31168f407890 100644 --- a/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll +++ b/llvm/test/CodeGen/X86/fold-vector-sext-zext.ll @@ -11,14 +11,12 @@ define <4 x i16> @test_sext_4i8_4i16() { ; X32-LABEL: test_sext_4i8_4i16: ; X32: # %bb.0: -; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533] -; X32-NEXT: # xmm0 = mem[0,0] +; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,2,65533,0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i16: ; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,2,65533,0,65535,2,65533] -; X64-NEXT: # xmm0 = mem[0,0] +; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,2,65533,0,0,0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -31,14 +29,12 @@ define <4 x i16> @test_sext_4i8_4i16() { define <4 x i16> @test_sext_4i8_4i16_undef() { ; X32-LABEL: test_sext_4i8_4i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533] -; X32-NEXT: # xmm0 = mem[0,0] +; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,0,65533,0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_sext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,65535,0,65533,0,65535,0,65533] -; X64-NEXT: # xmm0 = mem[0,0] +; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,65535,0,65533,0,0,0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -211,14 +207,12 @@ define <8 x i32> @test_sext_8i8_8i32_undef() { define <4 x i16> @test_zext_4i8_4i16() { ; X32-LABEL: test_zext_4i8_4i16: ; X32: # %bb.0: -; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253] -; X32-NEXT: # xmm0 = mem[0,0] +; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,2,253,0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i16: ; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,2,253,0,255,2,253] -; X64-NEXT: # xmm0 = mem[0,0] +; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,2,253,0,0,0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 0, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 @@ -267,14 +261,12 @@ define <4 x i64> @test_zext_4i8_4i64() { define <4 x i16> @test_zext_4i8_4i16_undef() { ; X32-LABEL: test_zext_4i8_4i16_undef: ; X32: # %bb.0: -; X32-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253] -; X32-NEXT: # xmm0 = mem[0,0] +; X32-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,0,253,0,0,0,0] ; X32-NEXT: retl ; ; X64-LABEL: test_zext_4i8_4i16_undef: ; X64: # %bb.0: -; X64-NEXT: vmovddup {{.*#+}} xmm0 = [0,255,0,253,0,255,0,253] -; X64-NEXT: # xmm0 = mem[0,0] +; X64-NEXT: vmovsd {{.*#+}} xmm0 = [0,255,0,253,0,0,0,0] ; X64-NEXT: retq %1 = insertelement <4 x i8> undef, i8 undef, i32 0 %2 = insertelement <4 x i8> %1, i8 -1, i32 1 diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index 
b42f6fdea34b6..d0853fdc748d2 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -2116,7 +2116,7 @@ define void @pr63114() { ; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-LIBCALL-NEXT: pand %xmm1, %xmm0 -; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] +; CHECK-LIBCALL-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] ; CHECK-LIBCALL-NEXT: por %xmm2, %xmm0 ; CHECK-LIBCALL-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] ; CHECK-LIBCALL-NEXT: pand %xmm3, %xmm0 @@ -2181,7 +2181,7 @@ define void @pr63114() { ; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,2,1] ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm1 = [65535,65535,65535,0,65535,65535,65535,65535] ; CHECK-I686-NEXT: pand %xmm1, %xmm0 -; CHECK-I686-NEXT: movdqa {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] +; CHECK-I686-NEXT: movq {{.*#+}} xmm2 = [0,0,0,15360,0,0,0,0] ; CHECK-I686-NEXT: por %xmm2, %xmm0 ; CHECK-I686-NEXT: movdqa {{.*#+}} xmm3 = [65535,65535,65535,65535,65535,65535,65535,0] ; CHECK-I686-NEXT: pand %xmm3, %xmm0 diff --git a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll index e2b3a23827a01..6d2866f50c6c7 100644 --- a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll @@ -21,7 +21,7 @@ define <8 x i16> @pow2_mask_v16i8(i8 zeroext %0) { ; SSE41-NEXT: movd %edi, %xmm0 ; SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE41-NEXT: pmovsxbw %xmm0, %xmm0 @@ -97,7 +97,7 @@ define i64 @pow2_mask_v8i8(i8 zeroext %0) { ; SSE-NEXT: movd %edi, %xmm0 ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,u,u,u,u,u,u,u,u] +; SSE-NEXT: movq {{.*#+}} xmm1 = [128,64,32,16,8,4,2,1,0,0,0,0,0,0,0,0] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pcmpeqb %xmm1, %xmm0 ; SSE-NEXT: movq %xmm0, %rax diff --git a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll index 418d632480328..bc977e006606e 100644 --- a/llvm/test/CodeGen/X86/insert-into-constant-vector.ll +++ b/llvm/test/CodeGen/X86/insert-into-constant-vector.ll @@ -375,7 +375,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; X86-SSE-LABEL: elt5_v8i64: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X86-SSE-NEXT: movaps {{.*#+}} xmm2 = [4,0,0,0] +; X86-SSE-NEXT: movss {{.*#+}} xmm2 = [4,0,0,0] ; X86-SSE-NEXT: movlhps {{.*#+}} xmm2 = xmm2[0],xmm0[0] ; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [42,0,1,0] ; X86-SSE-NEXT: movaps {{.*#+}} xmm1 = [2,0,3,0] @@ -404,7 +404,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; X86-AVX1-LABEL: elt5_v8i64: ; X86-AVX1: # %bb.0: ; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0] +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm1 = [4,0,0,0] ; X86-AVX1-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X86-AVX1-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 ; X86-AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] @@ -421,7 +421,7 @@ define <8 x i64> @elt5_v8i64(i64 
%x) { ; X86-AVX2-LABEL: elt5_v8i64: ; X86-AVX2: # %bb.0: ; X86-AVX2-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X86-AVX2-NEXT: vmovaps {{.*#+}} xmm1 = [4,0,0,0] +; X86-AVX2-NEXT: vmovss {{.*#+}} xmm1 = [4,0,0,0] ; X86-AVX2-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0] ; X86-AVX2-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm1 ; X86-AVX2-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] @@ -439,7 +439,7 @@ define <8 x i64> @elt5_v8i64(i64 %x) { ; X86-AVX512F: # %bb.0: ; X86-AVX512F-NEXT: vmovaps {{.*#+}} ymm0 = [42,0,1,0,2,0,3,0] ; X86-AVX512F-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X86-AVX512F-NEXT: vmovaps {{.*#+}} xmm2 = [4,0,0,0] +; X86-AVX512F-NEXT: vmovss {{.*#+}} xmm2 = [4,0,0,0] ; X86-AVX512F-NEXT: vmovlhps {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X86-AVX512F-NEXT: vinsertf128 $1, {{\.?LCPI[0-9]+_[0-9]+}}, %ymm1, %ymm1 ; X86-AVX512F-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll index ed7cbb0a8430d..cfcb8798e0b9c 100644 --- a/llvm/test/CodeGen/X86/insertelement-ones.ll +++ b/llvm/test/CodeGen/X86/insertelement-ones.ll @@ -280,7 +280,7 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) { ; ; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0] ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 @@ -385,7 +385,7 @@ define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) { ; ; AVX1-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0] +; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0] ; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll index f63545bcd178e..aad1b44344850 100644 --- a/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll +++ b/llvm/test/CodeGen/X86/masked_gather_scatter_widen.ll @@ -739,7 +739,7 @@ define <17 x float> @test_mgather_v17f32(ptr %base, <17 x i32> %index) ; WIDEN_AVX2-NEXT: vgatherdps %ymm5, (%rsi,%ymm1,4), %ymm6 ; WIDEN_AVX2-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; WIDEN_AVX2-NEXT: vgatherdps %ymm3, (%rsi,%ymm0,4), %ymm1 -; WIDEN_AVX2-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,0,0,0] +; WIDEN_AVX2-NEXT: vmovss {{.*#+}} xmm0 = [4294967295,0,0,0] ; WIDEN_AVX2-NEXT: vgatherdps %ymm0, (%rsi,%ymm2,4), %ymm4 ; WIDEN_AVX2-NEXT: vmovss %xmm4, 64(%rdi) ; WIDEN_AVX2-NEXT: vmovaps %ymm1, 32(%rdi) diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index f78646afccb34..6c21bb4d99e74 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -1395,7 +1395,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-LABEL: truncstore_v4i64_v4i8: ; SSE4: # %bb.0: ; SSE4-NEXT: pxor %xmm3, %xmm3 -; SSE4-NEXT: movdqa {{.*#+}} xmm4 = [0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE4-NEXT: movd {{.*#+}} xmm4 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE4-NEXT: pshufb %xmm4, %xmm1 ; SSE4-NEXT: pshufb %xmm4, %xmm0 ; SSE4-NEXT: punpcklwd {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1435,7 +1435,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1: # %bb.0: ; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] @@ -3791,7 +3791,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-LABEL: truncstore_v8i32_v8i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] @@ -3865,7 +3865,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX2: # %bb.0: ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm4 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll index 6eb02bfc1fd0c..da46ea4065579 100644 --- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll +++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll @@ -947,7 +947,7 @@ define i1 @length24_eq_const(ptr %X) nounwind { ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0] +; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm2 = [959985462,858927408,0,0] ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 ; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426] ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll index f5e7384362a92..83cb0d6f973be 100644 --- a/llvm/test/CodeGen/X86/memcmp.ll +++ b/llvm/test/CodeGen/X86/memcmp.ll @@ -1015,7 +1015,7 @@ define i1 @length24_eq_const(ptr %X) nounwind { ; X64-MIC-AVX: # %bb.0: ; X64-MIC-AVX-NEXT: vmovdqu (%rdi), %xmm0 ; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [959985462,858927408,0,0] +; X64-MIC-AVX-NEXT: vmovq {{.*#+}} xmm2 = [959985462,858927408,0,0] ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm2, %zmm1, %k0 ; X64-MIC-AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [858927408,926299444,825243960,892613426] ; X64-MIC-AVX-NEXT: vpcmpneqd %zmm1, %zmm0, %k1 diff --git a/llvm/test/CodeGen/X86/pmaddubsw.ll b/llvm/test/CodeGen/X86/pmaddubsw.ll index ea0b4e4b21c77..e46a14673a517 100644 --- a/llvm/test/CodeGen/X86/pmaddubsw.ll +++ b/llvm/test/CodeGen/X86/pmaddubsw.ll @@ -320,8 +320,7 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa (%rsi), %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] -; AVX1-NEXT: # 
xmm2 = mem[0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] @@ -349,9 +348,9 @@ define <8 x i16> @pmaddubsw_bad_extend(ptr %Aptr, ptr %Bptr) { ; AVX256: # %bb.0: ; AVX256-NEXT: vmovdqa (%rdi), %xmm0 ; AVX256-NEXT: vmovdqa (%rsi), %xmm1 -; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,2,4,6,8,10,12,14] +; AVX256-NEXT: vmovq {{.*#+}} xmm2 = [0,2,4,6,8,10,12,14,0,0,0,0,0,0,0,0] ; AVX256-NEXT: vpshufb %xmm2, %xmm0, %xmm3 -; AVX256-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX256-NEXT: vmovq {{.*#+}} xmm4 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX256-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX256-NEXT: vpshufb %xmm2, %xmm1, %xmm2 ; AVX256-NEXT: vpshufb %xmm4, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/pr46532.ll b/llvm/test/CodeGen/X86/pr46532.ll index ffaa131d5cbb1..cbc677229ede6 100644 --- a/llvm/test/CodeGen/X86/pr46532.ll +++ b/llvm/test/CodeGen/X86/pr46532.ll @@ -7,7 +7,7 @@ define void @WhileWithLoopInvariantOperation.21() { ; CHECK-NEXT: movq (%rax), %rax ; CHECK-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; CHECK-NEXT: vmovaps %xmm0, 32(%rax) -; CHECK-NEXT: vmovaps {{.*#+}} xmm0 = [4294967295,4294967295,0,0] +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,4294967295,0,0] ; CHECK-NEXT: vmaskmovps %ymm0, %ymm0, (%rax) while.1.body.preheader: %0 = load ptr, ptr undef, align 8, !invariant.load !0, !dereferenceable !1, !align !2 diff --git a/llvm/test/CodeGen/X86/pr63108.ll b/llvm/test/CodeGen/X86/pr63108.ll index 38e45595a8846..b1576851ed023 100644 --- a/llvm/test/CodeGen/X86/pr63108.ll +++ b/llvm/test/CodeGen/X86/pr63108.ll @@ -11,11 +11,11 @@ define i32 @PR63108() { ; SSE-NEXT: testb %al, %al ; SSE-NEXT: je .LBB0_2 ; SSE-NEXT: # %bb.1: -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [251,223,u,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE-NEXT: movd {{.*#+}} xmm0 = [57339,0,0,0] ; SSE-NEXT: jmp .LBB0_5 ; SSE-NEXT: .LBB0_2: # %vector.body.preheader ; SSE-NEXT: pxor %xmm0, %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [57339,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm1 = [57339,0,0,0] ; SSE-NEXT: xorl %eax, %eax ; SSE-NEXT: .p2align 4, 0x90 ; SSE-NEXT: .LBB0_3: # %vector.body @@ -47,10 +47,10 @@ define i32 @PR63108() { ; AVX1-NEXT: testb %al, %al ; AVX1-NEXT: je .LBB0_2 ; AVX1-NEXT: # %bb.1: -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm0 = [251,223,0,0,251,223,0,0,251,223,0,0,251,223,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0] ; AVX1-NEXT: jmp .LBB0_5 ; AVX1-NEXT: .LBB0_2: # %vector.body.preheader -; AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [57339,0,0,0] +; AVX1-NEXT: vmovss {{.*#+}} xmm0 = [57339,0,0,0] ; AVX1-NEXT: xorl %eax, %eax ; AVX1-NEXT: .p2align 4, 0x90 ; AVX1-NEXT: .LBB0_3: # %vector.body @@ -87,7 +87,7 @@ define i32 @PR63108() { ; AVX2-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223] ; AVX2-NEXT: jmp .LBB0_5 ; AVX2-NEXT: .LBB0_2: # %vector.body.preheader -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [57339,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0] ; AVX2-NEXT: xorl %eax, %eax ; AVX2-NEXT: .p2align 4, 0x90 ; AVX2-NEXT: .LBB0_3: # %vector.body @@ -124,7 +124,7 @@ define i32 @PR63108() { ; AVX512-NEXT: vpbroadcastw {{.*#+}} xmm0 = [251,223,251,223,251,223,251,223,251,223,251,223,251,223,251,223] ; AVX512-NEXT: jmp .LBB0_5 ; AVX512-NEXT: .LBB0_2: # %vector.body.preheader -; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [57339,0,0,0] +; 
AVX512-NEXT: vmovd {{.*#+}} xmm0 = [57339,0,0,0] ; AVX512-NEXT: xorl %eax, %eax ; AVX512-NEXT: .p2align 4, 0x90 ; AVX512-NEXT: .LBB0_3: # %vector.body diff --git a/llvm/test/CodeGen/X86/pr74736.ll b/llvm/test/CodeGen/X86/pr74736.ll index 3dfdbf102c953..1c3b4bd4971c1 100644 --- a/llvm/test/CodeGen/X86/pr74736.ll +++ b/llvm/test/CodeGen/X86/pr74736.ll @@ -6,7 +6,7 @@ define void @main(<16 x i32> %0, i32 %1) { ; SSE-LABEL: main: ; SSE: # %bb.0: # %entry ; SSE-NEXT: movd %edi, %xmm4 -; SSE-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm4[1,0] ; SSE-NEXT: paddd %xmm0, %xmm0 ; SSE-NEXT: paddd %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll index bbe46a99ffa41..5f13e97487435 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-lzcnt.ll @@ -38,7 +38,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) { define <16 x i8> @testv16i8(<16 x i8> %in) { ; AVX256-LABEL: testv16i8: ; AVX256: # %bb.0: -; AVX256-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX256-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX256-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX256-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX256-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll index ef91e7a3f9107..b96338984d6f5 100644 --- a/llvm/test/CodeGen/X86/pshufb-mask-comments.ll +++ b/llvm/test/CodeGen/X86/pshufb-mask-comments.ll @@ -54,7 +54,7 @@ define <16 x i8> @test4(<16 x i8> %V, ptr %P) { define <16 x i8> @test5(<16 x i8> %V) { ; CHECK-LABEL: test5: ; CHECK: # %bb.0: -; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,0,0,0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [1,0,0,0] ; CHECK-NEXT: movaps %xmm1, (%rax) ; CHECK-NEXT: movaps {{.*#+}} xmm1 = [1,1] ; CHECK-NEXT: movaps %xmm1, (%rax) diff --git a/llvm/test/CodeGen/X86/ret-mmx.ll b/llvm/test/CodeGen/X86/ret-mmx.ll index 815f95e64496b..81dd73363c1fb 100644 --- a/llvm/test/CodeGen/X86/ret-mmx.ll +++ b/llvm/test/CodeGen/X86/ret-mmx.ll @@ -32,7 +32,7 @@ define <1 x i64> @t2() nounwind { define <2 x i32> @t3() nounwind { ; CHECK-LABEL: t3: ; CHECK: ## %bb.0: -; CHECK-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0] +; CHECK-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0] ; CHECK-NEXT: retq ret <2 x i32> <i32 1, i32 0> } diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll index 3d3e935045475..d043234705aa0 100644 --- a/llvm/test/CodeGen/X86/sad.ll +++ b/llvm/test/CodeGen/X86/sad.ll @@ -544,7 +544,7 @@ define dso_local i32 @sad_2i8() nounwind { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB3_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll index 839c972b23dba..de9d47526b166 100644 --- a/llvm/test/CodeGen/X86/sext-vsetcc.ll +++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll @@ -216,7 +216,7 @@ define <4 x i32> @cmp_ult_load_const(ptr %x) nounwind { ; SSE-LABEL: cmp_ult_load_const: ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [42,214,0,255,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE-NEXT: movd {{.*#+}} xmm1 = 
[42,214,0,255,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE-NEXT: pmaxub %xmm0, %xmm1 ; SSE-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] @@ -243,7 +243,7 @@ define <3 x i32> @cmp_ult_load_const_bad_type(ptr %x) nounwind { ; SSE-LABEL: cmp_ult_load_const_bad_type: ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [42,214,0,u,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE-NEXT: movd {{.*#+}} xmm1 = [42,214,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE-NEXT: pmaxub %xmm0, %xmm1 ; SSE-NEXT: pcmpeqb %xmm0, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] diff --git a/llvm/test/CodeGen/X86/shrink_vmul.ll b/llvm/test/CodeGen/X86/shrink_vmul.ll index 8dd06e0d848d3..2610f4322c8e2 100644 --- a/llvm/test/CodeGen/X86/shrink_vmul.ll +++ b/llvm/test/CodeGen/X86/shrink_vmul.ll @@ -1745,7 +1745,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) { ; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-SSE-NEXT: movl c, %edx ; X86-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,u,u,u,u,u,u] +; X86-SSE-NEXT: movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm0, %xmm2 ; X86-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X86-SSE-NEXT: pmullw %xmm1, %xmm0 @@ -1768,7 +1768,7 @@ define void @mul_2xi16_varconst1(ptr nocapture readonly %a, i64 %index) { ; X64-SSE: # %bb.0: # %entry ; X64-SSE-NEXT: movq c(%rip), %rax ; X64-SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X64-SSE-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,u,u,u,u,u,u] +; X64-SSE-NEXT: movd {{.*#+}} xmm1 = [0,65535,0,0,0,0,0,0] ; X64-SSE-NEXT: movdqa %xmm0, %xmm2 ; X64-SSE-NEXT: pmulhuw %xmm1, %xmm2 ; X64-SSE-NEXT: pmullw %xmm1, %xmm0 diff --git a/llvm/test/CodeGen/X86/shuffle-half.ll b/llvm/test/CodeGen/X86/shuffle-half.ll index 0d27fc3896766..291fe841043ed 100644 --- a/llvm/test/CodeGen/X86/shuffle-half.ll +++ b/llvm/test/CodeGen/X86/shuffle-half.ll @@ -10,7 +10,7 @@ define <32 x half> @dump_vec() { ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %cond.load ; CHECK-NEXT: vpinsrw $0, (%rax), %xmm0, %xmm0 -; CHECK-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; CHECK-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; CHECK-NEXT: vpand %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; CHECK-NEXT: vinserti64x4 $0, %ymm0, %zmm1, %zmm0 diff --git a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll index 42f1bd7824909..f632654f89e04 100644 --- a/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-strided-with-offset-256.ll @@ -12,34 +12,22 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw,+avx512vl,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512,AVX512BWVL define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v16i8_1: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-NEXT: # xmm2 = mem[0,0] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX1-NEXT: vmovdqa %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v16i8_1: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastq 
{{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX2-NEXT: vmovdqa %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v16i8_1: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] +; AVX-NEXT: vmovdqa %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v16i8_1: ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512F-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512F-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512F-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -50,7 +38,7 @@ define void @shuffle_v32i8_to_v16i8_1(ptr %L, ptr %S) nounwind { ; AVX512VL: # %bb.0: ; AVX512VL-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512VL-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VL-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -147,27 +135,16 @@ define void @shuffle_v8i32_to_v4i32_1(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8_1: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8_1: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v8i8_1: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_1: ; AVX512F: # %bb.0: @@ -207,27 +184,16 @@ define void @shuffle_v32i8_to_v8i8_1(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8_2: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} 
xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8_2: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v8i8_2: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_2: ; AVX512F: # %bb.0: @@ -267,27 +233,16 @@ define void @shuffle_v32i8_to_v8i8_2(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8_3(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8_3: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8_3: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v8i8_3: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8_3: ; AVX512F: # %bb.0: @@ -538,7 +493,7 @@ define void @shuffle_v32i8_to_v4i8_1(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -598,7 +553,7 @@ define void @shuffle_v32i8_to_v4i8_2(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -658,7 +613,7 @@ define void @shuffle_v32i8_to_v4i8_3(ptr %L, ptr 
%S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -718,7 +673,7 @@ define void @shuffle_v32i8_to_v4i8_4(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -778,7 +733,7 @@ define void @shuffle_v32i8_to_v4i8_5(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -838,7 +793,7 @@ define void @shuffle_v32i8_to_v4i8_6(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -898,7 +853,7 @@ define void @shuffle_v32i8_to_v4i8_7(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll index 0ab141f4c2022..05dd2344d30f7 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-256.ll @@ -382,27 +382,16 @@ define void @trunc_v4i64_to_v4i32(ptr %L, ptr %S) nounwind { } define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: shuffle_v32i8_to_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: shuffle_v32i8_to_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm2, 
%xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: shuffle_v32i8_to_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: shuffle_v32i8_to_v8i8: ; AVX512F: # %bb.0: @@ -447,27 +436,16 @@ define void @shuffle_v32i8_to_v8i8(ptr %L, ptr %S) nounwind { } define void @trunc_v8i32_to_v8i8(ptr %L, ptr %S) nounwind { -; AVX1-LABEL: trunc_v8i32_to_v8i8: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa (%rdi), %xmm0 -; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX1-NEXT: vmovq %xmm0, (%rsi) -; AVX1-NEXT: retq -; -; AVX2-LABEL: trunc_v8i32_to_v8i8: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 -; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX2-NEXT: vmovq %xmm0, (%rsi) -; AVX2-NEXT: retq +; AVX-LABEL: trunc_v8i32_to_v8i8: +; AVX: # %bb.0: +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] +; AVX-NEXT: vmovq %xmm0, (%rsi) +; AVX-NEXT: retq ; ; AVX512F-LABEL: trunc_v8i32_to_v8i8: ; AVX512F: # %bb.0: @@ -519,7 +497,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind { ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -530,7 +508,7 @@ define <2 x i64> @trunc_v8i32_to_v8i8_return_v2i64(<8 x i32> %vec) nounwind { ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v2i64: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -692,7 +670,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind { ; AVX1-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -703,7 +681,7 @@ define <16 x i8> @trunc_v8i32_to_v8i8_return_v16i8(<8 x i32> %vec) nounwind { ; AVX2-LABEL: trunc_v8i32_to_v8i8_return_v16i8: ; AVX2: # %bb.0: ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -1021,7 +999,7 @@ define <16 x i8> @trunc_v4i64_to_v4i8_return_v16i8(<4 x i64> %vec) nounwind { ; AVX1-LABEL: trunc_v4i64_to_v4i8_return_v16i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1194,7 +1172,7 @@ define void @shuffle_v32i8_to_v4i8(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1259,7 +1237,7 @@ define void @trunc_v4i64_to_v4i8(ptr %L, ptr %S) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll index 6e357a5fb34f5..85e160e497172 100644 --- a/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll +++ b/llvm/test/CodeGen/X86/shuffle-vs-trunc-512.ll @@ -392,7 +392,7 @@ define <4 x double> @PR34175(ptr %p) { ; ; AVX512BWVL-LABEL: PR34175: ; AVX512BWVL: # %bb.0: -; AVX512BWVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24] +; AVX512BWVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512BWVL-NEXT: vmovdqu (%rdi), %ymm1 ; AVX512BWVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512BWVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero @@ -411,7 +411,7 @@ define <4 x double> @PR34175(ptr %p) { ; ; AVX512VBMIVL-LABEL: PR34175: ; AVX512VBMIVL: # %bb.0: -; AVX512VBMIVL-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24] +; AVX512VBMIVL-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512VBMIVL-NEXT: vmovdqu (%rdi), %ymm1 ; AVX512VBMIVL-NEXT: vpermt2w 32(%rdi), %ymm0, %ymm1 ; AVX512VBMIVL-NEXT: vpmovzxwd {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll index 348501caf619a..777beea886f56 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -834,43 +834,43 @@ declare <16 
x i8> @llvm.x86.sse2.packsswb.128(<8 x i16>, <8 x i16>) nounwind rea define <16 x i8> @test_x86_sse2_packsswb_128_fold() { ; X86-SSE-LABEL: test_x86_sse2_packsswb_128_fold: ; X86-SSE: ## %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A] -; X86-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: ## encoding: [0xf2,0x0f,0x10,0x05,A,A,A,A] +; X86-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: test_x86_sse2_packsswb_128_fold: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX1-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A] ; X86-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: test_x86_sse2_packsswb_128_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: test_x86_sse2_packsswb_128_fold: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A] -; X64-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-SSE-NEXT: movsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-SSE-NEXT: ## encoding: [0xf2,0x0f,0x10,0x05,A,A,A,A] +; X64-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: test_x86_sse2_packsswb_128_fold: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX1-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX1-NEXT: ## encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A] ; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: test_x86_sse2_packsswb_128_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] -; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [0,127,127,255,255,128,128,128,0,0,0,0,0,0,0,0] +; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfb,0x10,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, 
<8 x i16> zeroinitializer) @@ -902,43 +902,43 @@ declare <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16>, <8 x i16>) nounwind rea define <16 x i8> @test_x86_sse2_packuswb_128_fold() { ; X86-SSE-LABEL: test_x86_sse2_packuswb_128_fold: ; X86-SSE: ## %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A] -; X86-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A] +; X86-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-SSE-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX1-LABEL: test_x86_sse2_packuswb_128_fold: ; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX1-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] ; X86-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; ; X86-AVX512-LABEL: test_x86_sse2_packuswb_128_fold: ; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X86-AVX512-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] ; X86-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 ; X86-AVX512-NEXT: retl ## encoding: [0xc3] ; ; X64-SSE-LABEL: test_x86_sse2_packuswb_128_fold: ; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-SSE-NEXT: ## encoding: [0x0f,0x28,0x05,A,A,A,A] -; X64-SSE-NEXT: ## fixup A - offset: 3, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-SSE-NEXT: movss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-SSE-NEXT: ## encoding: [0xf3,0x0f,0x10,0x05,A,A,A,A] +; X64-SSE-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-SSE-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX1-LABEL: test_x86_sse2_packuswb_128_fold: ; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX1-NEXT: ## encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX1-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX1-NEXT: ## encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] ; X64-AVX1-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX1-NEXT: retq ## encoding: [0xc3] ; ; X64-AVX512-LABEL: test_x86_sse2_packuswb_128_fold: ; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] -; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x28,0x05,A,A,A,A] +; X64-AVX512-NEXT: vmovss {{.*#+}} xmm0 = [0,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0] +; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc5,0xfa,0x10,0x05,A,A,A,A] ; X64-AVX512-NEXT: ## fixup A - offset: 4, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte ; X64-AVX512-NEXT: retq ## encoding: [0xc3] %res = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 255, i16 256, i16 65535, i16 -1, i16 -255, i16 -256, i16 -32678>, <8 x i16> zeroinitializer) diff --git 
a/llvm/test/CodeGen/X86/vec_anyext.ll b/llvm/test/CodeGen/X86/vec_anyext.ll index cdd30165a99bc..09e4a4b3a773d 100644 --- a/llvm/test/CodeGen/X86/vec_anyext.ll +++ b/llvm/test/CodeGen/X86/vec_anyext.ll @@ -173,7 +173,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind { ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: vmovdqa (%ecx), %xmm0 ; X86-NEXT: vmovdqa 16(%ecx), %xmm1 -; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; X86-NEXT: vmovd {{.*#+}} xmm2 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; X86-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; X86-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; X86-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -211,8 +211,7 @@ define <4 x i8> @func_8_64(ptr %a, ptr %b) nounwind { define <4 x i16> @const_16_32() nounwind { ; CHECK-LABEL: const_16_32: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7] -; CHECK-NEXT: # xmm0 = mem[0,0] +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [0,3,8,7,0,0,0,0] ; CHECK-NEXT: ret{{[l|q]}} %G = trunc <4 x i32> <i32 0, i32 3, i32 8, i32 7> to <4 x i16> ret <4 x i16> %G @@ -221,8 +220,7 @@ define <4 x i16> @const_16_32() nounwind { define <4 x i16> @const_16_64() nounwind { ; CHECK-LABEL: const_16_64: ; CHECK: # %bb.0: -; CHECK-NEXT: vmovddup {{.*#+}} xmm0 = [0,3,8,7,0,3,8,7] -; CHECK-NEXT: # xmm0 = mem[0,0] +; CHECK-NEXT: vmovsd {{.*#+}} xmm0 = [0,3,8,7,0,0,0,0] ; CHECK-NEXT: ret{{[l|q]}} %G = trunc <4 x i64> <i64 0, i64 3, i64 8, i64 7> to <4 x i16> ret <4 x i16> %G diff --git a/llvm/test/CodeGen/X86/vec_fp_to_int.ll b/llvm/test/CodeGen/X86/vec_fp_to_int.ll index 1d81cb3773080..a0e9f33483b69 100644 --- a/llvm/test/CodeGen/X86/vec_fp_to_int.ll +++ b/llvm/test/CodeGen/X86/vec_fp_to_int.ll @@ -1907,13 +1907,12 @@ define <2 x i64> @fptosi_2f64_to_2i64_const() { define <4 x i32> @fptosi_2f64_to_2i32_const() { ; SSE-LABEL: fptosi_2f64_to_2i32_const: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [4294967295,1,u,u] +; SSE-NEXT: movsd {{.*#+}} xmm0 = [4294967295,1,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: fptosi_2f64_to_2i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [4294967295,1,4294967295,1] -; AVX-NEXT: # xmm0 = mem[0,0] +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4294967295,1,0,0] ; AVX-NEXT: retq %cvt = fptosi <2 x double> <double -1.0, double 1.0> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> @@ -1966,13 +1965,12 @@ define <2 x i64> @fptoui_2f64_to_2i64_const() { define <4 x i32> @fptoui_2f64_to_2i32_const(<2 x double> %a) { ; SSE-LABEL: fptoui_2f64_to_2i32_const: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [2,4,u,u] +; SSE-NEXT: movsd {{.*#+}} xmm0 = [2,4,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: fptoui_2f64_to_2i32_const: ; AVX: # %bb.0: -; AVX-NEXT: vmovddup {{.*#+}} xmm0 = [2,4,2,4] -; AVX-NEXT: # xmm0 = mem[0,0] +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [2,4,0,0] ; AVX-NEXT: retq %cvt = fptoui <2 x double> <double 2.0, double 4.0> to <2 x i32> %ext = shufflevector <2 x i32> %cvt, <2 x i32> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef> diff --git a/llvm/test/CodeGen/X86/vec_set-A.ll b/llvm/test/CodeGen/X86/vec_set-A.ll index c8ff250b5bfbc..a288579bda34e 100644 --- a/llvm/test/CodeGen/X86/vec_set-A.ll +++ b/llvm/test/CodeGen/X86/vec_set-A.ll @@ -5,12 +5,12 @@ define <2 x i64> @test1() nounwind { ; X86-LABEL: test1: ; X86: # %bb.0: -; X86-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0] +; X86-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0] ; X86-NEXT: retl ; ; X64-LABEL: test1: ; X64: # %bb.0: -; X64-NEXT: movaps {{.*#+}} xmm0 = [1,0,0,0] +; X64-NEXT: movss {{.*#+}} xmm0 = [1,0,0,0] ; X64-NEXT: retq ret <2 x i64> < i64 1, i64 0 > } diff --git 
a/llvm/test/CodeGen/X86/vector-blend.ll b/llvm/test/CodeGen/X86/vector-blend.ll index 73c2f4ca5b10b..bd5c9363794aa 100644 --- a/llvm/test/CodeGen/X86/vector-blend.ll +++ b/llvm/test/CodeGen/X86/vector-blend.ll @@ -79,22 +79,16 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) { ; SSE41-LABEL: vsel_4xi8: ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movdqa %xmm0, %xmm2 -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,0,255,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movss {{.*#+}} xmm0 = [255,255,0,255,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: vsel_4xi8: -; AVX1: # %bb.0: # %entry -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: vsel_4xi8: -; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,255,0,255,255,255,0,255,255,255,0,255,255,255,0,255] -; AVX2-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: vsel_4xi8: +; AVX: # %bb.0: # %entry +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [255,255,0,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 +; AVX-NEXT: retq entry: %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2 ret <4 x i8> %vsel diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index acf45fc4bbeba..0adb9ddfc426a 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -4379,7 +4379,7 @@ define <2 x i32> @constrained_vector_fptoui_v2i32_v2f32() #0 { ; ; AVX512-LABEL: constrained_vector_fptoui_v2i32_v2f32: ; AVX512: # %bb.0: # %entry -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [4.2E+1,4.3E+1,0.0E+0,0.0E+0] +; AVX512-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,4.3E+1,0.0E+0,0.0E+0] ; AVX512-NEXT: vcvttps2udq %zmm0, %zmm0 ; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshl-128.ll b/llvm/test/CodeGen/X86/vector-fshl-128.ll index 4f100cd3e0530..550b2e0655438 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-128.ll @@ -1212,7 +1212,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { ; SSE-LABEL: splatvar_funnnel_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: psrlw $1, %xmm1 @@ -1224,7 +1224,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1235,7 +1235,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1246,7 +1246,7 @@ define <8 x i16> 
@splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1257,7 +1257,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1278,7 +1278,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1295,7 +1295,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; XOP-LABEL: splatvar_funnnel_v8i16: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlw $1, %xmm1, %xmm1 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 @@ -1306,7 +1306,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; X86-SSE2-LABEL: splatvar_funnnel_v8i16: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0] +; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pandn %xmm3, %xmm4 ; X86-SSE2-NEXT: psrlw $1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-256.ll b/llvm/test/CodeGen/X86/vector-fshl-256.ll index 05be4e1ee928e..683fdf15cdea4 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-256.ll @@ -1000,7 +1000,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 @@ -1088,7 +1088,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOPAVX1-NEXT: vpandn %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlw $1, %xmm5, %xmm5 @@ -1278,7 +1278,7 @@ define void @fancierRotate2(ptr %arr, ptr %control, i32 %rot0, i32 %rot1) { ; AVX1-NEXT: vmovd %ecx, %xmm2 ; AVX1-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [31,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [31,0,0,0] ; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: .p2align 4, 0x90 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll 
index 9ddd171b4db69..6a8d9d73f138b 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -945,7 +945,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0] +; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pandn %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -958,7 +958,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -969,7 +969,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -980,7 +980,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -991,7 +991,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -1002,7 +1002,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 58719e6bd8e0c..6fc95cc7780ff 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -757,7 +757,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll index 825ca727b624e..3452b33ada2a9 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-sub128.ll @@ -324,7 +324,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512F-LABEL: 
constant_funnnel_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512F-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i32: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512BW-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512VBMI2-NEXT: vprolvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll index 0b6361ffd4fae..64deaf0e75966 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-sub128.ll @@ -390,7 +390,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4,5,4,5] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0] ; AVX512VBMI2-NEXT: vpshldvd %zmm2, %zmm1, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshr-128.ll b/llvm/test/CodeGen/X86/vector-fshr-128.ll index 1f51f02a197ac..70e3a02516743 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-128.ll @@ -1337,7 +1337,7 @@ define <4 x i32> @splatvar_funnnel_v4i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> % define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %amt) nounwind { ; SSE-LABEL: splatvar_funnnel_v8i16: ; SSE: # %bb.0: -; SSE-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pand %xmm3, %xmm4 ; SSE-NEXT: psrlw %xmm4, %xmm1 @@ -1349,7 +1349,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1360,7 +1360,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512F-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512F-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1371,7 +1371,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; 
AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1382,7 +1382,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512BW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512BW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1403,7 +1403,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX512VLBW-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1421,7 +1421,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; XOP-LABEL: splatvar_funnnel_v8i16: ; XOP: # %bb.0: -; XOP-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOP-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOP-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOP-NEXT: vpsrlw %xmm4, %xmm1, %xmm1 ; XOP-NEXT: vpandn %xmm3, %xmm2, %xmm2 @@ -1432,7 +1432,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> % ; ; X86-SSE2-LABEL: splatvar_funnnel_v8i16: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm3 = [15,0,0,0] +; X86-SSE2-NEXT: movd {{.*#+}} xmm3 = [15,0,0,0] ; X86-SSE2-NEXT: movdqa %xmm2, %xmm4 ; X86-SSE2-NEXT: pand %xmm3, %xmm4 ; X86-SSE2-NEXT: psrlw %xmm4, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-256.ll b/llvm/test/CodeGen/X86/vector-fshr-256.ll index 1a6ecea596563..61aea6ad4d595 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-256.ll @@ -1032,7 +1032,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> % define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; AVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; AVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 @@ -1121,7 +1121,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i ; ; XOPAVX1-LABEL: splatvar_funnnel_v16i16: ; XOPAVX1: # %bb.0: -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,0,0,0] +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm3 = [15,0,0,0] ; XOPAVX1-NEXT: vpand %xmm3, %xmm2, %xmm4 ; XOPAVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 ; XOPAVX1-NEXT: vpsrlw %xmm4, %xmm5, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 402eb73e18101..3fa9994312e45 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -982,7 +982,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; SSE41-LABEL: splatvar_funnnel_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0] +; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pand %xmm2, %xmm3 ; 
SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -995,7 +995,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX-LABEL: splatvar_funnnel_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1006,7 +1006,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512F-LABEL: splatvar_funnnel_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512F-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1017,7 +1017,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VL-LABEL: splatvar_funnnel_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1028,7 +1028,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512BW-LABEL: splatvar_funnnel_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512BW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm1 @@ -1039,7 +1039,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VLBW-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm0, %xmm3 ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index bb311468ce913..b2047a04f163e 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -796,7 +796,7 @@ define <8 x i32> @splatvar_funnnel_v8i32(<8 x i32> %x, <8 x i32> %amt) nounwind define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounwind { ; AVX1-LABEL: splatvar_funnnel_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw %xmm3, %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll index b8c356711921d..d78aa4e049e0a 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-sub128.ll @@ -338,7 +338,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512F-LABEL: constant_funnnel_v2i32: ; AVX512F: # %bb.0: ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512F-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512F-NEXT: vzeroupper @@ -352,7 +352,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512BW-LABEL: constant_funnnel_v2i32: ; 
AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512BW-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -366,7 +366,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x) nounwind { ; AVX512VBMI2-LABEL: constant_funnnel_v2i32: ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,5,4,5] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm1 = [4,5,0,0] ; AVX512VBMI2-NEXT: vprorvd %zmm1, %zmm0, %zmm0 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll index 56896927e7e5a..1add344e3e41f 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-sub128.ll @@ -454,7 +454,7 @@ define <2 x i32> @constant_funnnel_v2i32(<2 x i32> %x, <2 x i32> %y) nounwind { ; AVX512VBMI2: # %bb.0: ; AVX512VBMI2-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1 ; AVX512VBMI2-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512VBMI2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [4,5,4,5] +; AVX512VBMI2-NEXT: vmovq {{.*#+}} xmm2 = [4,5,0,0] ; AVX512VBMI2-NEXT: vpshrdvd %zmm2, %zmm0, %zmm1 ; AVX512VBMI2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX512VBMI2-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-half-conversions.ll b/llvm/test/CodeGen/X86/vector-half-conversions.ll index 21533818bac11..f59960f06f4a1 100644 --- a/llvm/test/CodeGen/X86/vector-half-conversions.ll +++ b/llvm/test/CodeGen/X86/vector-half-conversions.ll @@ -3173,7 +3173,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { ; AVX512F-NEXT: callq __truncdfhf2@PLT ; AVX512F-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; AVX512F-NEXT: vmovaps {{.*#+}} xmm1 = [16,0,0,0] +; AVX512F-NEXT: vmovss {{.*#+}} xmm1 = [16,0,0,0] ; AVX512F-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %zmm0 # 64-byte Reload ; AVX512F-NEXT: vpermt2ps %zmm2, %zmm1, %zmm0 ; AVX512F-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 @@ -3195,7 +3195,7 @@ define <2 x i16> @cvt_2f64_to_2i16(<2 x double> %a0) nounwind { ; AVX512-FASTLANE-NEXT: vmovdqa %xmm0, (%rsp) # 16-byte Spill ; AVX512-FASTLANE-NEXT: callq __truncdfhf2@PLT ; AVX512-FASTLANE-NEXT: vpbroadcastw %xmm0, %xmm1 -; AVX512-FASTLANE-NEXT: vmovaps {{.*#+}} xmm0 = [4,0,0,0] +; AVX512-FASTLANE-NEXT: vmovss {{.*#+}} xmm0 = [4,0,0,0] ; AVX512-FASTLANE-NEXT: vpermi2ps (%rsp), %xmm1, %xmm0 # 16-byte Folded Reload ; AVX512-FASTLANE-NEXT: addq $40, %rsp ; AVX512-FASTLANE-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll index d2bef6b234e38..47aab9e264c19 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-3.ll @@ -240,11 +240,11 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-SLOW-LABEL: load_i16_stride3_vf4: ; AVX512BW-SLOW: # %bb.0: -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9] +; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] ; AVX512BW-SLOW-NEXT: vmovdqa (%rdi), %xmm1 ; 
AVX512BW-SLOW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512BW-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10] +; AVX512BW-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] ; AVX512BW-SLOW-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 ; AVX512BW-SLOW-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,3,2,3,4,5,6,7] ; AVX512BW-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,1,2,3] @@ -257,13 +257,13 @@ define void @load_i16_stride3_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-FAST-LABEL: load_i16_stride3_vf4: ; AVX512BW-FAST: # %bb.0: -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,3,6,9,0,3,6,9] +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm0 = [0,3,6,9,0,0,0,0] ; AVX512BW-FAST-NEXT: vmovdqa (%rdi), %xmm1 ; AVX512BW-FAST-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm0 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,4,7,10,1,4,7,10] +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,4,7,10,0,0,0,0] ; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm3 -; AVX512BW-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,5,8,11,2,5,8,11] +; AVX512BW-FAST-NEXT: vmovq {{.*#+}} xmm4 = [2,5,8,11,0,0,0,0] ; AVX512BW-FAST-NEXT: vpermi2w %xmm2, %xmm1, %xmm4 ; AVX512BW-FAST-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-FAST-NEXT: vmovq %xmm3, (%rdx) @@ -967,8 +967,7 @@ define void @load_i16_stride3_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm0[0,1],xmm1[2],xmm0[3,4],xmm1[5],xmm0[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm10 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm11, %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm15[0,1,2],xmm11[3,4,5,6,7] ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -1807,8 +1806,7 @@ define void @load_i16_stride3_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm3 = xmm8[0,1],xmm13[2],xmm8[3,4],xmm13[5],xmm8[6,7] ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm1 = [4,5,4,5,4,5,4,5,10,11,0,1,6,7,12,13] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,2,3,8,9,14,15,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [2,3,8,9,14,15,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm13[0,1,2],xmm14[3,4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll index 904f8ddd2e7f4..0d5445dc194ce 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-5.ll @@ -409,22 +409,22 @@ define void @load_i16_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; ; AVX512BW-LABEL: load_i16_stride5_vf4: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [1,6,11,0,1,6,11,0] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [1,6,11,0,0,0,0,0] ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm1 ; AVX512BW-NEXT: vpermw %zmm1, %zmm0, %zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [0,5,10,0,0,5,10,0] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = 
[0,5,10,0,0,0,0,0] ; AVX512BW-NEXT: vpermw %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX512BW-NEXT: vpextrw $7, %xmm2, %eax ; AVX512BW-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 ; AVX512BW-NEXT: vpinsrw $3, 32(%rdi), %xmm0, %xmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,7,12,17,2,7,12,17] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = [2,7,12,17,0,0,0,0] ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm3 ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm4 ; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm2 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,8,13,18,3,8,13,18] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,8,13,18,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,9,14,19,4,9,14,19] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,19,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm3, %ymm4, %ymm6 ; AVX512BW-NEXT: vmovq %xmm1, (%rsi) ; AVX512BW-NEXT: vmovq %xmm0, (%rdx) @@ -7449,7 +7449,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm15 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm20, %xmm7 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm1 = xmm15[0],xmm7[1],xmm15[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,6,7,0,1,10,11,0,0] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = [6,7,0,1,10,11,0,0,0,0,0,0,0,0,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm18 = [1,3,6,0,5,u,u,u] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm30, %ymm9 @@ -7482,7 +7482,7 @@ define void @load_i16_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm24 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm15[0,1],xmm7[2],xmm15[3] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm9[0],ymm10[1,2],ymm9[3],ymm10[4],ymm9[5],ymm10[6,7],ymm9[8],ymm10[9,10],ymm9[11],ymm10[12],ymm9[13],ymm10[14,15] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,8,9,2,3,12,13,0,0] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm2 = [8,9,2,3,12,13,0,0,0,0,0,0,0,0,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm2, %xmm0, %xmm3 ; AVX512F-FAST-NEXT: vmovdqa64 {{.*#+}} ymm27 = [1,4,6,3,6,u,u,u] ; AVX512F-FAST-NEXT: vpermd %ymm1, %ymm27, %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll index fffe46d3cbed1..3efae5c115045 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-6.ll @@ -213,7 +213,7 @@ define void @load_i16_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] ; AVX512BW-FAST-NEXT: vpbroadcastw 4(%rdi), %xmm4 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,9,3,9,3,9,3,9] +; AVX512BW-FAST-NEXT: vmovd {{.*#+}} xmm5 = [3,9,0,0,0,0,0,0] ; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm5 ; AVX512BW-FAST-NEXT: vpbroadcastw 20(%rdi), %xmm6 ; AVX512BW-FAST-NEXT: vpbroadcastw 8(%rdi), %xmm7 @@ -501,19 +501,19 @@ define void @load_i16_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-LABEL: load_i16_stride6_vf4: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,6,12,18,0,6,12,18] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,6,12,18,0,0,0,0] ; AVX512BW-NEXT: vmovdqa 
(%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,7,13,19,1,7,13,19] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,7,13,19,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,8,14,20,2,8,14,20] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,8,14,20,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,9,15,21,3,9,15,21] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,9,15,21,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,10,16,22,4,10,16,22] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,10,16,22,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,11,17,23,5,11,17,23] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,11,17,23,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll index 4b32647f6fb3e..853f94c84d1a8 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-7.ll @@ -240,7 +240,7 @@ define void @load_i16_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-FAST-NEXT: vpsrlq $48, %xmm1, %xmm8 ; AVX512BW-FAST-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512BW-FAST-NEXT: vpunpckhwd {{.*#+}} xmm3 = xmm3[4],xmm1[4],xmm3[5],xmm1[5],xmm3[6],xmm1[6],xmm3[7],xmm1[7] -; AVX512BW-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [6,13,6,13,6,13,6,13] +; AVX512BW-FAST-NEXT: vmovd {{.*#+}} xmm8 = [6,13,0,0,0,0,0,0] ; AVX512BW-FAST-NEXT: vpermi2w %xmm1, %xmm0, %xmm8 ; AVX512BW-FAST-NEXT: vmovd %xmm2, (%rsi) ; AVX512BW-FAST-NEXT: vmovd %xmm4, (%rdx) @@ -658,21 +658,21 @@ define void @load_i16_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,7,14,21,0,7,14,21] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,7,14,21,0,0,0,0] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,8,15,22,1,8,15,22] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,8,15,22,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,9,16,23,2,9,16,23] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,9,16,23,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,10,17,24,3,10,17,24] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,10,17,24,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,11,18,25,4,11,18,25] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,11,18,25,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,12,19,26,5,12,19,26] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = [5,12,19,26,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,13,20,27,6,13,20,27] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,13,20,27,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 ; 
AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) @@ -1343,7 +1343,7 @@ define void @load_i16_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[u,u,u,u,u,u,u,u,0,1,14,15,u,u,10,11] ; AVX512F-FAST-NEXT: vpblendw {{.*#+}} xmm9 = xmm9[0,1,2,3,4,5],xmm2[6],xmm9[7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm10 = ymm5[0,1],ymm4[2],ymm5[3,4],ymm4[5],ymm5[6,7] -; AVX512F-FAST-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX512F-FAST-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-FAST-NEXT: vpshufb %xmm11, %xmm10, %xmm12 ; AVX512F-FAST-NEXT: vextracti128 $1, %ymm10, %xmm10 ; AVX512F-FAST-NEXT: vpshuflw {{.*#+}} xmm10 = xmm10[3,1,2,3,4,5,6,7] @@ -2380,7 +2380,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm11[0],ymm12[1,2,3,4,5,6,7],ymm11[8],ymm12[9,10,11,12,13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm11 = ymm11[0,1,2,3],ymm12[4,5,6,7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm14 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm12 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm12 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm12, %xmm14, %xmm15 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm14, %xmm14 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm14 = xmm14[3,1,2,3,4,5,6,7] @@ -2546,7 +2546,7 @@ define void @load_i16_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm10 = ymm11[0],ymm10[1,2,3,4,5,6,7],ymm11[8],ymm10[9,10,11,12,13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm11[0,1,2,3],ymm10[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1],ymm0[2],ymm1[3,4],ymm0[5],ymm1[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm11 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm11 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm11, %xmm13, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm13, %xmm13 ; AVX2-FAST-PERLANE-NEXT: vpshuflw {{.*#+}} xmm13 = xmm13[3,1,2,3,4,5,6,7] @@ -5161,7 +5161,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm1 = ymm6[0,1,2,3,4],ymm1[5,6,7],ymm6[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm6 = ymm15[0,1],ymm11[2,3],ymm15[4,5],ymm11[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm6, %xmm15 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm15, %xmm15 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm15[0],xmm6[1],xmm15[1],xmm6[2],xmm15[2],xmm6[3],xmm15[3] @@ -5496,7 +5496,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX2-FAST-PERLANE-NEXT: vpblendd $36, {{[-0-9]+}}(%r{{[sb]}}p), %ymm15, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = ymm15[0,1],mem[2],ymm15[3,4],mem[5],ymm15[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; 
AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm1, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 @@ -5566,7 +5566,7 @@ define void @load_i16_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} ymm8 = ymm8[0,1,2,3,4],ymm10[5,6,7],ymm8[8,9,10,11,12],ymm10[13,14,15] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm15[0,1],ymm12[2,3],ymm15[4,5],ymm12[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm10, %xmm14 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm1, %xmm14, %xmm14 ; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3] @@ -11374,7 +11374,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1],ymm1[2],mem[3,4],ymm1[5],mem[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm1, %xmm3 ; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm10 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm1 @@ -11431,7 +11431,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm0 = mem[0,1],ymm0[2],mem[3,4],ymm0[5],mem[6,7] -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm1 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm1 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm1, %xmm0, %xmm3 ; AVX2-FAST-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-FAST-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,1,2,3,4,5,6,7] @@ -11493,7 +11493,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpblendw {{.*#+}} ymm12 = ymm2[0,1,2,3,4],ymm1[5,6,7],ymm2[8,9,10,11,12],ymm1[13,14,15] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm13[0,1],ymm7[2,3],ymm13[4,5],ymm7[6,7] ; AVX2-FAST-NEXT: vextracti128 $1, %ymm1, %xmm2 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm4 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm4 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[10,11,6,7,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -11522,7 +11522,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vextracti128 $1, %ymm12, %xmm13 ; AVX2-FAST-NEXT: vpshufb %xmm4, %xmm13, %xmm13 ; AVX2-FAST-NEXT: vmovdqa %xmm4, %xmm6 -; AVX2-FAST-NEXT: vpbroadcastd {{.*#+}} xmm2 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-NEXT: vmovd {{.*#+}} xmm2 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpshufb %xmm2, %xmm12, %xmm12 ; AVX2-FAST-NEXT: vpunpcklwd {{.*#+}} xmm12 = 
xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm10 # 32-byte Reload @@ -12194,7 +12194,7 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $219, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2],mem[3,4],ymm2[5],mem[6,7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm6 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm6 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm6, %xmm2, %xmm4 ; AVX2-FAST-PERLANE-NEXT: vmovdqa %xmm6, %xmm12 ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm2 @@ -12324,9 +12324,9 @@ define void @load_i16_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpblendd $51, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1],ymm2[2,3],mem[4,5],ymm2[6,7] ; AVX2-FAST-PERLANE-NEXT: vextracti128 $1, %ymm2, %xmm4 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm13 = [8,9,4,5,8,9,4,5,8,9,4,5,8,9,4,5] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm13 = [8,9,4,5,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm13, %xmm4, %xmm8 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastd {{.*#+}} xmm4 = [10,11,6,7,10,11,6,7,10,11,6,7,10,11,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovd {{.*#+}} xmm4 = [10,11,6,7,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpshufb %xmm4, %xmm2, %xmm2 ; AVX2-FAST-PERLANE-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm2[0],xmm8[0],xmm2[1],xmm8[1],xmm2[2],xmm8[2],xmm2[3],xmm8[3] ; AVX2-FAST-PERLANE-NEXT: vmovdqa {{.*#+}} ymm2 = [u,u,u,u,u,u,u,u,2,3,0,1,14,15,12,13,26,27,26,27,26,27,26,27,18,19,16,17,30,31,28,29] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll index 78065bc73c1d3..054afe2dfdf2e 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i16-stride-8.ll @@ -271,23 +271,23 @@ define void @load_i16_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX512BW-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [0,8,16,24,0,8,16,24] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = [0,8,16,24,0,0,0,0] ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm1 ; AVX512BW-NEXT: vmovdqa 32(%rdi), %ymm2 ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,9,17,25,1,9,17,25] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm3 = [1,9,17,25,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm3 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm4 = [2,10,18,26,2,10,18,26] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm4 = [2,10,18,26,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm4 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,11,19,27,3,11,19,27] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm5 = [3,11,19,27,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm5 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm6 = [4,12,20,28,4,12,20,28] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm6 = [4,12,20,28,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm6 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,13,21,29,5,13,21,29] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm7 = 
[5,13,21,29,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm7 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm8 = [6,14,22,30,6,14,22,30] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm8 = [6,14,22,30,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm8 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm9 = [7,15,23,31,7,15,23,31] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm9 = [7,15,23,31,0,0,0,0] ; AVX512BW-NEXT: vpermi2w %ymm2, %ymm1, %ymm9 ; AVX512BW-NEXT: vmovq %xmm0, (%rsi) ; AVX512BW-NEXT: vmovq %xmm3, (%rdx) @@ -560,7 +560,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm7 = xmm15[0,1],xmm7[2,3] ; AVX512F-SLOW-NEXT: vpunpckhdq {{.*#+}} xmm15 = xmm14[2],xmm13[2],xmm14[3],xmm13[3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm8 = xmm15[0,1],xmm8[2,3] -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm15 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm15 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vpermt2d %xmm13, %xmm15, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm9 = xmm14[0,1],xmm9[2,3] ; AVX512F-SLOW-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] @@ -615,7 +615,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm15[0],xmm14[0],xmm15[1],xmm14[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0] ; AVX512F-FAST-NEXT: vmovdqa %xmm15, %xmm2 ; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm9, %xmm2 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] @@ -626,7 +626,7 @@ define void @load_i16_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-FAST-NEXT: vpermt2d %xmm10, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vpunpckhdq {{.*#+}} xmm10 = xmm15[2],xmm14[2],xmm15[3],xmm14[3] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm10[0,1],xmm0[2,3] -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm10 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm10 = [3,7,0,0] ; AVX512F-FAST-NEXT: vpermt2d %xmm14, %xmm10, %xmm15 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm11 = xmm15[0,1],xmm11[2,3] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm14 @@ -1353,7 +1353,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1],xmm6[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm5[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-SLOW-NEXT: vmovdqa64 %ymm1, %ymm19 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vpermt2d %xmm4, %xmm17, %xmm14 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm1 = xmm14[0,1],xmm11[2,3] ; AVX512F-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -1490,7 +1490,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm15[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm1, %ymm24 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] ; AVX512F-FAST-NEXT: vmovdqa %xmm11, %xmm1 ; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm15, %xmm1 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm12[0],xmm4[0],xmm12[1],xmm4[1] @@ 
-1523,7 +1523,7 @@ define void @load_i16_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm8[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX512F-FAST-NEXT: vmovdqa64 %ymm0, %ymm21 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm18 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm18 = [3,7,0,0] ; AVX512F-FAST-NEXT: vpermt2d %xmm5, %xmm18, %xmm11 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm11[0,1],xmm2[2,3] ; AVX512F-FAST-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,4,5,5,7,8,9,10,11,12,13,13,15] @@ -3238,7 +3238,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm6 = ymm13[0,1,2,3],ymm6[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm6, %zmm0, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vpermt2d %xmm11, %xmm0, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm6 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm0 @@ -3409,7 +3409,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm12 = xmm14[0,1],xmm12[2,3] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm11 = ymm12[0,1,2,3],ymm11[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm11, %zmm1, %zmm1 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm12, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1],xmm11[2,3] @@ -3563,7 +3563,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm5[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm11, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm1 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm1 = [1,5,0,0] ; AVX512F-FAST-NEXT: vmovdqa %xmm9, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm1, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa %xmm1, %xmm11 @@ -3630,7 +3630,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm7[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm3, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm27 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm27 = [3,7,0,0] ; AVX512F-FAST-NEXT: vpermt2d %xmm20, %xmm27, %xmm9 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm16, %xmm0 ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm9[0,1],xmm0[2,3] @@ -3733,7 +3733,7 @@ define void @load_i16_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm11[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm31, %zmm31 ; AVX512F-FAST-NEXT: vmovdqa %xmm5, %xmm0 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm4 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm4 = [1,5,0,0] ; AVX512F-FAST-NEXT: vpermt2d %xmm18, %xmm4, %xmm0 ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm12[0],xmm24[0],xmm12[1],xmm24[1] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] @@ -7297,7 +7297,7 @@ 
define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 ; AVX512F-SLOW-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512F-SLOW-NEXT: vpermt2d %xmm16, %xmm0, %xmm1 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm16 @@ -7711,7 +7711,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-SLOW-NEXT: vinserti64x4 $0, %ymm0, %zmm20, %zmm0 -; AVX512F-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm16 = [3,7,3,7] +; AVX512F-SLOW-NEXT: vmovq {{.*#+}} xmm16 = [3,7,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm1 ; AVX512F-SLOW-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm16, %xmm1 # 16-byte Folded Reload ; AVX512F-SLOW-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload @@ -8050,7 +8050,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm12[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm9, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm9 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm9 = [1,5,0,0] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm30, %xmm0 ; AVX512F-FAST-NEXT: vpermt2d %xmm17, %xmm9, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm17, %xmm30 @@ -8225,7 +8225,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm3[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm24, %zmm0 ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm0 = [3,7,0,0] ; AVX512F-FAST-NEXT: vmovdqa64 %xmm31, %xmm1 ; AVX512F-FAST-NEXT: vpermt2d %xmm28, %xmm0, %xmm1 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm0, %xmm16 @@ -8477,7 +8477,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vmovdqu64 %zmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Spill ; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm23, (%rsp) # 16-byte Spill -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm15 = [1,5,1,5] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm15 = [1,5,0,0] ; AVX512F-FAST-NEXT: vpermt2d %xmm19, %xmm15, %xmm0 ; AVX512F-FAST-NEXT: vmovdqa64 %xmm19, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX512F-FAST-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm9[0],xmm29[0],xmm9[1],xmm29[1] @@ -8642,7 +8642,7 @@ define void @load_i16_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] ; AVX512F-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7] ; AVX512F-FAST-NEXT: vinserti64x4 $0, %ymm0, %zmm21, %zmm21 -; AVX512F-FAST-NEXT: vpbroadcastq {{.*#+}} xmm17 = [3,7,3,7] +; AVX512F-FAST-NEXT: vmovq {{.*#+}} xmm17 = [3,7,0,0] ; AVX512F-FAST-NEXT: vmovdqa (%rsp), %xmm0 # 16-byte Reload ; AVX512F-FAST-NEXT: vpermt2d {{[-0-9]+}}(%r{{[sb]}}p), %xmm17, %xmm0 # 16-byte Folded 
Reload ; AVX512F-FAST-NEXT: vpblendd $12, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm1 # 16-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll index 4ed9a99c58c3f..cce2340d214aa 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-3.ll @@ -78,7 +78,7 @@ define void @load_i32_stride3_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,3,2,3] -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [5,0,5,0] +; AVX512-FAST-NEXT: vmovd {{.*#+}} xmm3 = [5,0,0,0] ; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm3 ; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm0 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll index 40355e3f7686f..30ac7dc443182 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-4.ll @@ -78,7 +78,7 @@ define void @load_i32_stride4_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FAST-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] +; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] ; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FAST-NEXT: vmovq %xmm2, (%rsi) @@ -160,8 +160,7 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm0 = xmm5[0,1],xmm0[2,3] ; AVX2-ONLY-NEXT: vmovaps 48(%rdi), %xmm5 ; AVX2-ONLY-NEXT: vunpcklps {{.*#+}} xmm6 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [1,5,1,5] -; AVX2-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm7 = [1,5,0,0] ; AVX2-ONLY-NEXT: vmovaps (%rdi), %ymm8 ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm7, %ymm7 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3] @@ -171,8 +170,7 @@ define void @load_i32_stride4_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm2[2],xmm3[2],xmm2[3],xmm3[3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3] ; AVX2-ONLY-NEXT: vunpckhps {{.*#+}} xmm2 = xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [3,7,3,7] -; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = [3,7,0,0] ; AVX2-ONLY-NEXT: vpermps %ymm8, %ymm3, %ymm3 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm2 = xmm3[0,1],xmm2[2,3] ; AVX2-ONLY-NEXT: vmovaps %xmm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll index d3f9b1c4e15a7..317875ca0ba7a 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-6.ll @@ -86,12 +86,10 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm5 = xmm5[2,0,2,3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3] ; AVX2-ONLY-NEXT: vblendps 
{{.*#+}} xmm1 = xmm1[0],xmm3[1],xmm1[2,3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,2,4,2] -; AVX2-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm3 = [4,2,0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm3, %ymm3 -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] -; AVX2-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] ; AVX2-ONLY-NEXT: vpermps %ymm0, %ymm6, %ymm0 ; AVX2-ONLY-NEXT: vmovlps %xmm4, (%rsi) ; AVX2-ONLY-NEXT: vmovlps %xmm2, (%rdx) @@ -118,13 +116,11 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-SLOW-NEXT: vpinsrd $1, %r10d, %xmm4, %xmm4 ; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,3,3,3] ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3] -; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm2 = [4,2,4,2] -; AVX512-SLOW-NEXT: # xmm2 = mem[0,0] +; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm2 = [4,2,0,0] ; AVX512-SLOW-NEXT: vmovaps 32(%rdi), %ymm5 ; AVX512-SLOW-NEXT: vblendps {{.*#+}} ymm5 = ymm5[0,1,2,3],mem[4,5,6,7] ; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm2, %ymm2 -; AVX512-SLOW-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] -; AVX512-SLOW-NEXT: # xmm6 = mem[0,0] +; AVX512-SLOW-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512-SLOW-NEXT: vpermps %ymm5, %ymm6, %ymm5 ; AVX512-SLOW-NEXT: vmovq %xmm3, (%rsi) ; AVX512-SLOW-NEXT: vmovq %xmm1, (%rdx) @@ -147,15 +143,13 @@ define void @load_i32_stride6_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST-NEXT: vpermi2d %xmm2, %xmm1, %xmm4 ; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2,4,2,4] ; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm2 -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [3,5,3,5] +; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm5 = [3,5,0,0] ; AVX512-FAST-NEXT: vpermi2d %xmm3, %xmm1, %xmm5 -; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm1 = [4,2,4,2] -; AVX512-FAST-NEXT: # xmm1 = mem[0,0] +; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm1 = [4,2,0,0] ; AVX512-FAST-NEXT: vmovaps 32(%rdi), %ymm3 ; AVX512-FAST-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],mem[4,5,6,7] ; AVX512-FAST-NEXT: vpermps %ymm3, %ymm1, %ymm1 -; AVX512-FAST-NEXT: vmovddup {{.*#+}} xmm6 = [5,3,5,3] -; AVX512-FAST-NEXT: # xmm6 = mem[0,0] +; AVX512-FAST-NEXT: vmovsd {{.*#+}} xmm6 = [5,3,0,0] ; AVX512-FAST-NEXT: vpermps %ymm3, %ymm6, %ymm3 ; AVX512-FAST-NEXT: vmovq %xmm0, (%rsi) ; AVX512-FAST-NEXT: vmovq %xmm4, (%rdx) @@ -308,13 +302,13 @@ define void @load_i32_stride6_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm5 = xmm5[0,1,2],xmm8[3] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm8 = xmm4[0,1],xmm7[2,3] ; AVX2-ONLY-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[0,1,0,2] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm9 = [4,2,4,2] +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [4,2,0,0] ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5,6,7] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm9, %ymm2 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm2 = xmm2[0,1],xmm8[2,3] ; AVX2-ONLY-NEXT: vpmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[3] -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm7 = [5,3,5,3] +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [5,3,0,0] ; AVX2-ONLY-NEXT: vpermd %ymm1, %ymm7, %ymm1 ; AVX2-ONLY-NEXT: vpblendd {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3] ; AVX2-ONLY-NEXT: vmovdqa %xmm0, (%rsi) diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll 
b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll index e4f94b368b7fd..abd4525a67737 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-7.ll @@ -100,8 +100,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm6 = xmm6[0],xmm4[1],xmm6[2,3] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} xmm3 = xmm4[0,1,2],xmm3[3] ; AVX2-ONLY-NEXT: vshufps {{.*#+}} xmm3 = xmm3[3,2,2,3] -; AVX2-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3] -; AVX2-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX2-ONLY-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0] ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-ONLY-NEXT: vpermps %ymm7, %ymm4, %ymm4 ; AVX2-ONLY-NEXT: vblendps {{.*#+}} ymm7 = ymm1[0],ymm0[1],ymm1[2,3,4],ymm0[5],ymm1[6,7] @@ -134,7 +133,7 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] ; AVX512-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] ; AVX512-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[3,2,2,3] -; AVX512-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,11,4,11] +; AVX512-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [4,11,0,0] ; AVX512-SLOW-NEXT: vmovdqa 32(%rdi), %ymm5 ; AVX512-SLOW-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512-SLOW-NEXT: vpermi2d %ymm5, %ymm6, %ymm1 @@ -165,9 +164,9 @@ define void @load_i32_stride7_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FAST-NEXT: vpbroadcastd 8(%rdi), %xmm4 ; AVX512-FAST-NEXT: vpblendd {{.*#+}} xmm4 = xmm4[0],xmm1[1],xmm4[2,3] -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm5 = [7,2,7,2] +; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm5 = [7,2,0,0] ; AVX512-FAST-NEXT: vpermi2d %xmm0, %xmm1, %xmm5 -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,11,4,11] +; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm0 = [4,11,0,0] ; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX512-FAST-NEXT: vmovdqa (%rdi), %ymm6 ; AVX512-FAST-NEXT: vpermi2d %ymm1, %ymm6, %ymm0 @@ -355,8 +354,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] -; AVX2-SLOW-NEXT: # xmm10 = mem[0,0] +; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] @@ -414,8 +412,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] -; AVX2-FAST-NEXT: # xmm10 = mem[0,0] +; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0] ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] @@ -472,8 +469,7 @@ define void @load_i32_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm7 = xmm7[0,1],xmm9[2,3] ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm9 ; 
AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm8[0,1,2],xmm9[3] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm10 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: # xmm10 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm10 = [4,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm11 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermps %ymm11, %ymm10, %ymm10 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3] @@ -850,7 +846,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-SLOW-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -954,7 +950,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1058,7 +1054,7 @@ define void @load_i32_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm10 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm11 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm11[0,1,2],xmm10[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm12 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm12 = [4,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm13 = ymm1[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm13, %ymm12, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm10 = xmm12[0,1],xmm10[2,3] @@ -1880,7 +1876,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm5 = [4,3,4,3] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0] ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-SLOW-NEXT: vmovdqa %ymm7, %ymm12 ; AVX2-SLOW-NEXT: vpermd %ymm10, %ymm5, %ymm10 @@ -2104,7 +2100,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovdqa 64(%rdi), %xmm1 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [4,3,4,3] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [4,3,0,0] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm4 = ymm6[0,1,2,3],ymm5[4,5,6,7] ; AVX2-FAST-NEXT: vmovdqa %ymm5, %ymm11 ; AVX2-FAST-NEXT: vmovdqa %ymm6, %ymm12 @@ -2332,7 +2328,7 @@ define void @load_i32_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vpbroadcastd 100(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 64(%rdi), %xmm3 ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} xmm0 = xmm3[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} 
xmm5 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm5 = [4,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm10 = ymm7[0,1,2,3],ymm9[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm7, %ymm12 ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm10, %ymm5, %ymm10 @@ -4253,7 +4249,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-SLOW-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-SLOW-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0] ; AVX2-SLOW-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-SLOW-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -4763,7 +4759,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm0 = [4,3,4,3] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm0 = [4,3,0,0] ; AVX2-FAST-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] @@ -5274,7 +5270,7 @@ define void @load_i32_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7] ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm1[5,6,7] ; AVX2-FAST-PERLANE-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm1 = [4,3,4,3] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm1 = [4,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vmovdqu {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vpblendd $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm0, %ymm0 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] @@ -9154,8 +9150,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-SLOW-NEXT: vbroadcastss 100(%rdi), %xmm0 ; AVX2-SLOW-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-SLOW-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] -; AVX2-SLOW-NEXT: # xmm7 = mem[0,0] +; AVX2-SLOW-NEXT: vmovsd {{.*#+}} xmm7 = [4,3,0,0] ; AVX2-SLOW-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-SLOW-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-SLOW-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] @@ -10195,8 +10190,7 @@ define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-NEXT: vbroadcastss 100(%rdi), %xmm0 ; AVX2-FAST-NEXT: vmovaps 64(%rdi), %xmm9 ; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm0 = xmm9[0,1,2],xmm0[3] -; AVX2-FAST-NEXT: vmovddup {{.*#+}} xmm4 = [4,3,4,3] -; AVX2-FAST-NEXT: # xmm4 = mem[0,0] +; AVX2-FAST-NEXT: vmovsd {{.*#+}} xmm4 = [4,3,0,0] ; AVX2-FAST-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm1 # 32-byte Reload ; AVX2-FAST-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm1 # 32-byte Folded Reload ; AVX2-FAST-NEXT: # ymm1 = mem[0,1,2,3],ymm1[4,5,6,7] @@ -11245,8 +11239,7 @@ 
define void @load_i32_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, pt ; AVX2-FAST-PERLANE-NEXT: vbroadcastss 100(%rdi), %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovaps 64(%rdi), %xmm1 ; AVX2-FAST-PERLANE-NEXT: vblendps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[3] -; AVX2-FAST-PERLANE-NEXT: vmovddup {{.*#+}} xmm7 = [4,3,4,3] -; AVX2-FAST-PERLANE-NEXT: # xmm7 = mem[0,0] +; AVX2-FAST-PERLANE-NEXT: vmovsd {{.*#+}} xmm7 = [4,3,0,0] ; AVX2-FAST-PERLANE-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload ; AVX2-FAST-PERLANE-NEXT: vblendps $15, {{[-0-9]+}}(%r{{[sb]}}p), %ymm2, %ymm2 # 32-byte Folded Reload ; AVX2-FAST-PERLANE-NEXT: # ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll index faa1642831c15..8f0c4b9b726a5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i32-stride-8.ll @@ -153,7 +153,7 @@ define void @load_i32_stride8_vf2(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512-FAST-NEXT: vmovdqa (%rdi), %xmm0 ; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %xmm1 ; AVX512-FAST-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; AVX512-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,5,1,5] +; AVX512-FAST-NEXT: vmovq {{.*#+}} xmm3 = [1,5,0,0] ; AVX512-FAST-NEXT: vpermi2d %xmm1, %xmm0, %xmm3 ; AVX512-FAST-NEXT: vpunpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX512-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll index 88ebda3622cc9..ec029c3f9c033 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-2.ll @@ -182,8 +182,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -199,7 +198,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX2-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm3 ; AVX2-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX2-ONLY-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX2-ONLY-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -215,7 +214,7 @@ define void @load_i8_stride2_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX512F-NEXT: vpand %xmm0, %xmm2, %xmm3 ; AVX512F-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX512F-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 -; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512F-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX512F-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] @@ -281,8 
+280,7 @@ define void @load_i8_stride2_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpand %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpackuswb %xmm6, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm4[0] @@ -438,8 +436,7 @@ define void @load_i8_stride2_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1) nou ; AVX1-ONLY-NEXT: vpand %xmm1, %xmm8, %xmm12 ; AVX1-ONLY-NEXT: vpand %xmm1, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vpackuswb %xmm12, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll index 40894a14a681c..fb9cb361ff863 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-4.ll @@ -214,57 +214,31 @@ define void @load_i8_stride4_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movq %xmm1, (%r8) ; SSE-NEXT: retq ; -; AVX1-ONLY-LABEL: load_i8_stride4_vf8: -; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX1-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX1-ONLY-NEXT: vmovq %xmm3, (%rdx) -; AVX1-ONLY-NEXT: vmovq %xmm4, (%rcx) -; AVX1-ONLY-NEXT: vmovq %xmm1, (%r8) -; AVX1-ONLY-NEXT: retq -; -; AVX2-ONLY-LABEL: load_i8_stride4_vf8: -; AVX2-ONLY: # %bb.0: -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] -; AVX2-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 -; AVX2-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] -; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 -; 
AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 -; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm2 -; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm1 -; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] -; AVX2-ONLY-NEXT: vmovq %xmm0, (%rsi) -; AVX2-ONLY-NEXT: vmovq %xmm3, (%rdx) -; AVX2-ONLY-NEXT: vmovq %xmm4, (%rcx) -; AVX2-ONLY-NEXT: vmovq %xmm1, (%r8) -; AVX2-ONLY-NEXT: retq +; AVX1-LABEL: load_i8_stride4_vf8: +; AVX1: # %bb.0: +; AVX1-NEXT: vmovd {{.*#+}} xmm0 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovdqa (%rdi), %xmm1 +; AVX1-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX1-NEXT: vpshufb %xmm0, %xmm2, %xmm3 +; AVX1-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm4 +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm3 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] +; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm5 +; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm4 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] +; AVX1-NEXT: vmovd {{.*#+}} xmm5 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vpshufb %xmm5, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm1 +; AVX1-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; AVX1-NEXT: vmovq %xmm0, (%rsi) +; AVX1-NEXT: vmovq %xmm3, (%rdx) +; AVX1-NEXT: vmovq %xmm4, (%rcx) +; AVX1-NEXT: vmovq %xmm1, (%r8) +; AVX1-NEXT: retq ; ; AVX512-LABEL: load_i8_stride4_vf8: ; AVX512: # %bb.0: @@ -413,7 +387,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] @@ -422,7 +396,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm4, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -431,7 +405,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; 
AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] @@ -440,7 +414,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -461,7 +435,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] @@ -470,7 +444,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -479,7 +453,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7 ; AVX2-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm7 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1] @@ -488,7 +462,7 @@ define void @load_i8_stride4_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm3 ; AVX2-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm2 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-ONLY-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -780,7 +754,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm3 ; 
AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa 64(%rdi), %xmm5 @@ -804,7 +778,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm10 = xmm11[0],xmm10[0],xmm11[1],xmm10[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm13 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1] @@ -822,7 +796,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm12 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] @@ -840,7 +814,7 @@ define void @load_i8_stride4_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] @@ -1545,7 +1519,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm8 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm5 @@ -1611,7 +1585,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -1662,7 +1636,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr 
%out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm5 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] @@ -1705,7 +1679,7 @@ define void @load_i8_stride4_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll index 130207fd9c2eb..03139923ae5c4 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-5.ll @@ -150,47 +150,26 @@ define void @load_i8_stride5_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; SSE-NEXT: movd %xmm3, (%r9) ; SSE-NEXT: retq ; -; AVX1-ONLY-LABEL: load_i8_stride5_vf4: -; AVX1-ONLY: # %bb.0: -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] -; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 -; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] -; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX1-ONLY-NEXT: vmovd %xmm3, (%rsi) -; AVX1-ONLY-NEXT: vmovd %xmm4, (%rdx) -; AVX1-ONLY-NEXT: vmovd %xmm5, (%rcx) -; AVX1-ONLY-NEXT: vmovd %xmm6, (%r8) -; AVX1-ONLY-NEXT: vmovd %xmm0, (%r9) -; AVX1-ONLY-NEXT: retq -; -; AVX2-LABEL: load_i8_stride5_vf4: -; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm0 = [0,5,10,15,0,5,10,15,0,5,10,15,0,5,10,15] -; AVX2-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-NEXT: vmovdqa 16(%rdi), %xmm2 -; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm3 -; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] -; AVX2-NEXT: vpshufb %xmm0, %xmm4, %xmm4 -; AVX2-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] -; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm5 -; AVX2-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] -; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm6 -; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] -; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vmovd %xmm3, (%rsi) -; AVX2-NEXT: vmovd 
%xmm4, (%rdx) -; AVX2-NEXT: vmovd %xmm5, (%rcx) -; AVX2-NEXT: vmovd %xmm6, (%r8) -; AVX2-NEXT: vmovd %xmm0, (%r9) -; AVX2-NEXT: retq +; AVX-LABEL: load_i8_stride5_vf4: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = [0,5,10,15,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm2 +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm3 +; AVX-NEXT: vpalignr {{.*#+}} xmm4 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0] +; AVX-NEXT: vpshufb %xmm0, %xmm4, %xmm4 +; AVX-NEXT: vpalignr {{.*#+}} xmm5 = xmm1[2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1] +; AVX-NEXT: vpshufb %xmm0, %xmm5, %xmm5 +; AVX-NEXT: vpalignr {{.*#+}} xmm6 = xmm1[3,4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2] +; AVX-NEXT: vpshufb %xmm0, %xmm6, %xmm6 +; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[4,5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3] +; AVX-NEXT: vpshufb %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vmovd %xmm3, (%rsi) +; AVX-NEXT: vmovd %xmm4, (%rdx) +; AVX-NEXT: vmovd %xmm5, (%rcx) +; AVX-NEXT: vmovd %xmm6, (%r8) +; AVX-NEXT: vmovd %xmm0, (%r9) +; AVX-NEXT: retq %wide.vec = load <20 x i8>, ptr %in.vec, align 64 %strided.vec0 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> %strided.vec1 = shufflevector <20 x i8> %wide.vec, <20 x i8> poison, <4 x i32> @@ -707,7 +686,7 @@ define void @load_i8_stride5_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm3[u,u,u,u,u,u,u],zero,zero,zero,xmm3[2,7,12,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm1[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm4, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,2,3,4,5,6,7,8,9,10,11,12,128,128,128] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm4, %xmm5 @@ -1551,7 +1530,7 @@ define void @load_i8_stride5_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[u,u,u,u,u,u,u],zero,zero,zero,xmm10[2,7,12,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm8[u,u,u,u,u,u,u,3,8,13],zero,zero,zero,xmm8[u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm11, %xmm12, %xmm11 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm5, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm12 = [255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255] ; AVX1-ONLY-NEXT: vandps %ymm5, %ymm12, %ymm11 @@ -3169,7 +3148,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm6, %xmm6 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 176(%rdi), %xmm1 @@ -3191,11 +3170,9 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm2, 
%xmm3, %xmm2 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,1,6,11,128,128,128,128,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [1,6,11,128,128,128,128,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,128,128,128,0,5,10,15,0] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [128,128,128,0,5,10,15,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm6, %xmm4 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,3,8,13,0,0,128,128,128,3,8,13,0,0,128] @@ -3351,8 +3328,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm14[0,1,2],xmm12[3,4,5,6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,2,7,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [2,7,12,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm15, %xmm0 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [10,15,128,128,128,0,0,5,10,15,128,128,128,0,0,5] @@ -3485,8 +3461,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa (%rsp), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm6[u,u,u,u,u,u],zero,zero,zero,xmm6[1,6,11,u,u,u,u] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,128,128,128,3,8,13,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [128,128,128,3,8,13,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm12, %xmm12 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm12[0,1,2],xmm10[3,4,5],xmm12[6,7] @@ -3494,8 +3469,7 @@ define void @load_i8_stride5_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,4,9,14,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [4,9,14,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm15, %xmm15 ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm15[0,1,2],xmm13[3,4,5],xmm15[6,7] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll index 899f38951342e..ed179d9959546 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-6.ll @@ -380,8 +380,7 @@ define void @load_i8_stride6_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm2[4,10,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[2,8,14],zero,zero,xmm1[u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm5, %xmm6, %xmm5 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = 
[0,1,2,3,4,128,128,128,0,1,2,3,4,128,128,128] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,1,2,3,4,128,128,128,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,zero,zero,zero,xmm0[0,6,12,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm5, %xmm5 @@ -825,7 +824,7 @@ define void @load_i8_stride6_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm3[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm12 = xmm0[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm11 = xmm12[1],xmm11[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm12, %xmm10, %xmm11, %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm4[u,u,u,u,u,u,u,u,u,u],zero,zero,zero,xmm4[2,8,14] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm13 = xmm5[u,u,u,u,u,u,u,u,u,u,0,6,12],zero,zero,zero @@ -1728,7 +1727,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm8[u,u,u,u,u,u,u,u,4,10,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,u,u,u,u,u,u,2,8,14] ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[5,11],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u,u,u] @@ -1743,13 +1742,11 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 @@ -1763,7 +1760,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm10 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm10, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm4, %ymm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm6, %xmm4 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm5, %xmm10 @@ -1804,7 +1801,7 @@ define void @load_i8_stride6_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: 
vpshufb {{.*#+}} xmm15 = xmm14[u,u,u,u,u,u,u,u,u,u,u,3,9,15],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm15, %xmm0 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm15 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] @@ -3597,11 +3594,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm9 ; AVX1-ONLY-NEXT: vmovdqa 208(%rdi), %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 192(%rdi), %xmm11 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm1 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm13 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] @@ -3625,16 +3620,14 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm15, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm15 @@ -3652,12 +3645,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm12 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = 
[128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm14, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 @@ -3668,7 +3659,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm2[1],xmm0[1] -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm1 @@ -3682,13 +3673,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm8 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm3, %xmm14 @@ -3715,13 +3704,11 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpunpckhqdq {{.*#+}} xmm2 = xmm3[1],xmm2[1] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,128,128,128,2,8,14,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,128,2,8,14,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,6,12,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm15 = [0,6,12,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 96(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm1 @@ -3737,7 +3724,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u,u,u,u,u,u,u,u,u,u,u,2,8,14],zero,zero ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm2 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,4,10,0,0,4,10,0,0,4,10,0,0,4,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,4,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,6,12,0,0,6,12,0,0,6,12,0,0,6,12,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm5 @@ -3825,12 +3812,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr 
%out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm7, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm1, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,128,128,128,3,9,15,0,0] -; AVX1-ONLY-NEXT: # xmm8 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm8 = [128,128,128,3,9,15,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,1,7,13,128,128,128,0,0] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [1,7,13,128,128,128,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 @@ -3842,7 +3827,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm14, %xmm1 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm14 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,5,11,0,0,5,11,0,0,5,11,0,0,5,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [1,7,13,0,1,7,13,0,1,7,13,0,1,7,13,0] @@ -3919,12 +3904,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm15, %ymm1 ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,128,128,128,4,10,0,0,0] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [128,128,128,4,10,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,2,8,14,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [2,8,14,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm1 @@ -3941,7 +3924,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm1, %ymm8, %ymm1 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm8, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vorps %ymm1, %ymm4, %ymm4 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [2,8,14,0,2,8,14,0,2,8,14,0,2,8,14,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [2,8,14,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm11, %xmm1 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,6,12,0,0,6,12,0,0,6,12,0,0,6,12] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm12, %xmm5 @@ -3987,11 +3970,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm5, %ymm0 ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,128,5,11,0,0,0,128,128,128,5,11,0,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} 
xmm14 = [128,128,128,5,11,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm10, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,3,9,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm13 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm13 = [3,9,15,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm14, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm0 @@ -4009,7 +3990,7 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm0, %ymm9, %ymm0 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm9, %ymm4 # 32-byte Folded Reload ; AVX1-ONLY-NEXT: vorps %ymm0, %ymm4, %ymm0 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,9,15,0,3,9,15,0,3,9,15,0,3,9,15,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,9,15,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,1,7,13,0,1,7,13,0,1,7,13,0,1,7,13] @@ -4054,11 +4035,9 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm2, %ymm7, %ymm2 ; AVX1-ONLY-NEXT: vorps %ymm2, %ymm0, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,4,10,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [4,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,128,128,0,6,12,0,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [128,128,0,6,12,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm13, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm0 @@ -4121,12 +4100,10 @@ define void @load_i8_stride6_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm3, %ymm0, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm3, %ymm4, %ymm2 ; AVX1-ONLY-NEXT: vmovups %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,5,11,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm15 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm15 = [5,11,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm2, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,128,128,1,7,13,0,0,0] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [128,128,1,7,13,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm5, %xmm3 ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,128,128,128,3,9,15,0,0,128,128,128,3,9,15] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll index 718d4973f97e2..a7a468d15c402 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-7.ll @@ -225,16 +225,16 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} 
xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -303,16 +303,16 @@ define void @load_i8_stride7_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm1[6,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,8,15],zero,xmm0[u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpor %xmm3, %xmm4, %xmm3 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm4 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm4 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpshufb %xmm4, %xmm0, %xmm5 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[0,7,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm6 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm6 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpshufb %xmm6, %xmm0, %xmm7 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm8 = xmm1[1,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX512F-NEXT: vpshufb %xmm4, %xmm1, %xmm4 -; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-NEXT: vpshufb %xmm8, %xmm0, %xmm9 ; AVX512F-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm9[0],xmm4[0],xmm9[1],xmm4[1],xmm9[2],xmm4[2],xmm9[3],xmm4[3] ; AVX512F-NEXT: vpshufb %xmm6, %xmm1, %xmm6 @@ -654,7 +654,7 @@ define void @load_i8_stride7_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[u,u,u,u],zero,zero,zero,xmm3[5,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm2[u,u,u,u,0,7,14],zero,xmm2[u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm9, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm11 = xmm1[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] @@ -1403,7 +1403,7 @@ define void 
@load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm3[u,u,u,u,u,3,10],zero,zero,zero,xmm3[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm5, %xmm1 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm5, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 80(%rdi), %xmm1 @@ -1415,7 +1415,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm3[u,u,u,u,u,4,11],zero,zero,xmm3[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm9, %xmm10, %xmm9 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm11 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm11, %xmm8, %xmm9, %xmm8 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm9 = xmm1[u,u,u,u,5,12,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm5[u,u,u,u,u,0,7,14,u,u,u,u,u,u,u,u] @@ -1455,7 +1455,7 @@ define void @load_i8_stride7_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm14 = xmm6[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm6[5,12] ; AVX1-ONLY-NEXT: vpor %xmm14, %xmm13, %xmm13 ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm11, %xmm13, %xmm11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm2, %xmm14 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm15 = xmm7[2,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] @@ -2935,7 +2935,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,3,10],zero,zero,zero,xmm15[u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm4[u,u,u,u,u],zero,zero,xmm4[1,8,15,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[u,u,u,u,u,u,u,2,9],zero,zero,zero,xmm6[u,u,u,u] @@ -2952,7 +2952,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm4[u,u,u,u,u],zero,zero,xmm4[2,9,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm15[u,u,u,u,u,4,11],zero,zero,xmm15[u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm8[u,u],zero,zero,zero,xmm8[5,12,u,u,u,u,u,u,u,u,u] @@ -2961,7 +2961,7 
@@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,3,10],zero,zero,zero,xmm6[u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[1,8,15,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm3 = [u,u,255,255,255,255,255,0,0,0,0,0,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm10[2,9],zero,zero,zero,xmm10[u,u,u,u,u,u,u,u,u,u,u] @@ -2988,7 +2988,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm0 = xmm7[u,u,u,u,u,u,u],zero,zero,xmm7[2,9,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm2 = xmm6[u,u,u,u,u,u,u,4,11],zero,zero,xmm6[u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm2, %xmm2 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm0 = [u,u,255,255,255,255,255,0,0,0,0,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm1 = xmm9[u,u,2,9],zero,zero,zero,xmm9[u,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm3 = xmm8[u,u],zero,zero,xmm8[0,7,14,u,u,u,u,u,u,u,u,u] @@ -3096,7 +3096,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm3[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[4,11] ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm1, %ymm4 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -3115,7 +3115,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3,4,5,6],xmm3[7] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm8[u,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zero,xmm8[5,12] ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm4, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm10, %ymm7, %ymm7 ; AVX1-ONLY-NEXT: vandps {{[-0-9]+}}(%r{{[sb]}}p), %ymm1, %ymm10 # 32-byte Folded Reload @@ -3127,7 +3127,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm10, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm11, %xmm7 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm10[0],xmm7[0],xmm10[1],xmm7[1],xmm10[2],xmm7[2],xmm10[3],xmm7[3] @@ -3153,7 +3153,7 @@ define void 
@load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vorps %ymm7, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm11, %xmm4 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm5, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] @@ -3179,7 +3179,7 @@ define void @load_i8_stride7_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandnps %ymm5, %ymm13, %ymm5 ; AVX1-ONLY-NEXT: vorps %ymm5, %ymm4, %ymm4 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm5 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm10[0],xmm5[0],xmm10[1],xmm5[1],xmm10[2],xmm5[2],xmm10[3],xmm5[3] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm10 = xmm15[u,u,u,u,2,9],zero,zero,zero,xmm15[u,u,u,u,u,u,u] @@ -6763,11 +6763,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vmovdqa 48(%rdi), %xmm8 ; AVX1-ONLY-NEXT: vmovdqa %xmm8, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,128,128,128,5,12,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [128,128,128,5,12,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,7,14,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,7,14,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 @@ -6781,7 +6779,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm8 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm8, %xmm8 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm7, %xmm8, %xmm7 ; AVX1-ONLY-NEXT: vmovdqu %ymm7, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 240(%rdi), %xmm7 @@ -6863,11 +6861,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm15, %xmm0, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,128,128,128,5,12,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,5,12,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm12, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,7,14,128,128,0] -; AVX1-ONLY-NEXT: # xmm12 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm12 = [0,0,0,7,14,128,128,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm5 ; 
AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [10,128,128,128,0,0,0,3,10,128,128,128,0,0,0,3] @@ -6877,7 +6873,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm4, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm4 = [u,u,255,255,255,255,255,0,0,0,0,0,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm5, %xmm13, %xmm5 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm8, %xmm0 @@ -6888,12 +6884,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,128,128,128,6,13,0,0,0] -; AVX1-ONLY-NEXT: # xmm14 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm14 = [128,128,128,6,13,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm11, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,1,8,15,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [1,8,15,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm10, %xmm3 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm3, %xmm3 @@ -6904,7 +6898,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm7, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,u,u,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm3, %xmm13, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload @@ -6920,11 +6914,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm9, %xmm14, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm9, %xmm12 ; AVX1-ONLY-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,2,9,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm9 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm9 = [2,9,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm10, %xmm2 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,128,128,0,7,14,0,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [128,128,0,7,14,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm4, %xmm2 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [10,128,128,3,10,128,128,3,10,128,128,3,10,128,128,3] @@ -6946,11 +6938,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpblendvb %xmm14, %xmm1, %xmm2, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm6 ; AVX1-ONLY-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = 
[3,10,128,128,128,0,0,0,3,10,128,128,128,0,0,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [3,10,128,128,128,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm10, %xmm3 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,128,128,1,8,15,0,0,0] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [128,128,1,8,15,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [11,128,128,4,11,128,128,4,11,128,128,4,11,128,128,4] @@ -6968,12 +6958,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm3, %xmm2 ; AVX1-ONLY-NEXT: vpblendvb %xmm6, %xmm1, %xmm2, %xmm0 ; AVX1-ONLY-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,128,128,128,6,13,0] -; AVX1-ONLY-NEXT: # xmm0 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm0 = [0,0,128,128,128,6,13,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm1 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,1,8,15,128,128,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,1,8,15,128,128,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm14, %xmm2 ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm2, %xmm1 @@ -6984,7 +6972,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm13 ; AVX1-ONLY-NEXT: vpor %xmm2, %xmm13, %xmm13 -; AVX1-ONLY-NEXT: vmovdqa {{.*#+}} xmm2 = [u,u,255,255,255,255,255,0,0,0,0,u,u,u,u,u] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [0,0,255,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm1, %xmm13, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -6999,12 +6987,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm1, %xmm3, %xmm1 ; AVX1-ONLY-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,2,9,128,128,128,0] -; AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,2,9,128,128,128,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm14, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm14, %xmm15 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,128,128,0,7,14,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,0,7,14,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm5 ; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [128,3,10,128,128,3,10,128,128,3,10,128,128,3,10,128] @@ -7025,11 +7011,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm6, %xmm0 ; AVX1-ONLY-NEXT: vpor %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpblendvb %xmm8, %xmm1, %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,3,10,128,128,128,0] -; 
AVX1-ONLY-NEXT: # xmm1 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm1 = [0,0,3,10,128,128,128,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm15, %xmm0 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,128,128,1,8,15,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,0,128,128,1,8,15,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm4 ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm4, %xmm4 @@ -7098,10 +7082,10 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpor %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpblendvb %xmm13, %xmm2, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm7, %xmm1 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, %xmm14 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,0,4,11,0,0,4,11,0,0,4,11,0,0,4,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [0,0,4,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm7 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] @@ -7167,7 +7151,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm11 = [0,0,0,1,2,3,8,15,0,0,0,1,2,3,8,15] ; AVX1-ONLY-NEXT: # xmm11 = mem[0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm10 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,7,14,0,0,7,14,0,0,7,14,0,0,7,14,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [0,7,14,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 112(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm12 @@ -7286,7 +7270,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm4, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm4, %xmm7 ; AVX1-ONLY-NEXT: vpor %xmm7, %xmm1, %xmm1 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm7 ; AVX1-ONLY-NEXT: vmovaps {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,255,255,255] @@ -7341,7 +7325,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm12, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, %xmm11 ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm2, %xmm6 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm7, %ymm6, %ymm6 @@ -7371,9 +7355,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm0 ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload -; AVX1-ONLY-NEXT: vbroadcastss 
{{.*#+}} xmm14 = [2,9,0,0,2,9,0,0,2,9,0,0,2,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [2,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -7442,9 +7426,9 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vandps %ymm3, %ymm15, %ymm3 ; AVX1-ONLY-NEXT: vorps %ymm4, %ymm3, %ymm3 ; AVX1-ONLY-NEXT: vmovups %ymm3, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,10,0,0,3,10,0,0,3,10,0,0,3,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [3,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm3 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,12,0,0,5,12,0,0,5,12,0,0,5,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [5,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa %xmm5, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -7512,7 +7496,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovups %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[4,11,u,u,u,u,u,u,u,u,u,u,u,u,u,u] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [6,13,0,0,6,13,0,0,6,13,0,0,6,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [6,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm6[0],xmm4[0],xmm6[1],xmm4[1],xmm6[2],xmm4[2],xmm6[3],xmm4[3] @@ -9561,7 +9545,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{.*#+}} ymm1 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm1, %ymm2 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $226, %ymm17, %ymm7, %ymm28 -; AVX512F-ONLY-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512F-ONLY-SLOW-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm29, %ymm21, %ymm15 ; AVX512F-ONLY-SLOW-NEXT: vpternlogq $202, %ymm21, %ymm29, %ymm7 ; AVX512F-ONLY-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -9977,7 +9961,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm3 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16 -; AVX512F-ONLY-FAST-NEXT: vpbroadcastd {{.*#+}} xmm8 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512F-ONLY-FAST-NEXT: vmovd {{.*#+}} xmm8 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm29, 
%ymm31, %ymm13 ; AVX512F-ONLY-FAST-NEXT: vpternlogq $202, %ymm31, %ymm29, %ymm14 ; AVX512F-ONLY-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -10412,7 +10396,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-SLOW-NEXT: vmovdqa64 {{.*#+}} ymm18 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512DQ-SLOW-NEXT: vpternlogq $184, %ymm0, %ymm18, %ymm2 ; AVX512DQ-SLOW-NEXT: vpternlogq $226, %ymm21, %ymm7, %ymm16 -; AVX512DQ-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm10 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512DQ-SLOW-NEXT: vmovd {{.*#+}} xmm10 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm6, %ymm13, %ymm15 ; AVX512DQ-SLOW-NEXT: vpternlogq $202, %ymm13, %ymm6, %ymm7 ; AVX512DQ-SLOW-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -10829,7 +10813,7 @@ define void @load_i8_stride7_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512DQ-FAST-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0] ; AVX512DQ-FAST-NEXT: vpternlogq $184, %ymm2, %ymm11, %ymm8 ; AVX512DQ-FAST-NEXT: vpternlogq $226, %ymm15, %ymm14, %ymm16 -; AVX512DQ-FAST-NEXT: vpbroadcastd {{.*#+}} xmm6 = [4,11,0,0,4,11,0,0,4,11,0,0,4,11,0,0] +; AVX512DQ-FAST-NEXT: vmovd {{.*#+}} xmm6 = [4,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm31, %ymm0, %ymm13 ; AVX512DQ-FAST-NEXT: vpternlogq $202, %ymm0, %ymm31, %ymm14 ; AVX512DQ-FAST-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload diff --git a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll index f2133b9e42d30..0f779732c0fd5 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-load-i8-stride-8.ll @@ -240,37 +240,37 @@ define void @load_i8_stride8_vf4(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm1, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm1, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = 
[3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] @@ -592,7 +592,7 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r10 ; AVX1-ONLY-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm2 @@ -600,70 +600,70 @@ define void @load_i8_stride8_vf8(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm6 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm6 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm1, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm0, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm7 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm7 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm3, %xmm8 ; AVX1-ONLY-NEXT: vpshufb %xmm7, %xmm2, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm7 = xmm7[0],xmm8[0],xmm7[1],xmm8[1],xmm7[2],xmm8[2],xmm7[3],xmm8[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm1, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm0, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm8 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm8, %xmm2, %xmm8 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm9[0],xmm8[1],xmm9[1],xmm8[2],xmm9[2],xmm8[3],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm0, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm3, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm2, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm0, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1],xmm9[2,3],xmm10[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = 
[0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm2, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm0, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1],xmm10[2,3],xmm11[4,5,6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] @@ -1287,7 +1287,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm3 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm6[0],xmm3[1],xmm6[1],xmm3[2],xmm6[2],xmm3[3],xmm6[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm1 ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm3 ; AVX1-ONLY-NEXT: vmovdqa 32(%rdi), %xmm6 @@ -1295,7 +1295,7 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm7, %xmm10 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm6, %xmm9 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm10[0],xmm9[1],xmm10[1],xmm9[2],xmm10[2],xmm9[3],xmm10[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm1, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] @@ -1311,11 +1311,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm5, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm9 = xmm10[0,1,2,3,4,5],xmm9[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm10 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm10 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm7, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm10, %xmm6, %xmm10 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-ONLY-NEXT: vmovd 
{{.*#+}} xmm11 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm3, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] @@ -1331,11 +1331,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm5, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm10 = xmm11[0,1,2,3,4,5],xmm10[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm11 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm11 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm7, %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm11, %xmm6, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm11[0],xmm12[0],xmm11[1],xmm12[1],xmm11[2],xmm12[2],xmm11[3],xmm12[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm3, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm1, %xmm12 ; AVX1-ONLY-NEXT: vmovdqa %xmm1, %xmm9 @@ -1351,11 +1351,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm5, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1,2,3,4,5],xmm11[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm12 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm12 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm7, %xmm13 ; AVX1-ONLY-NEXT: vpshufb %xmm12, %xmm6, %xmm12 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm3, %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm1, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] @@ -1370,11 +1370,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm5, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm12 = xmm13[0,1,2,3,4,5],xmm12[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm13 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm13 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm7, %xmm14 ; AVX1-ONLY-NEXT: vpshufb %xmm13, %xmm6, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm13 = xmm13[0],xmm14[0],xmm13[1],xmm14[1],xmm13[2],xmm14[2],xmm13[3],xmm14[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm1, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] @@ -1389,11 +1389,11 @@ define void @load_i8_stride8_vf16(ptr 
%in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm13 = xmm14[0,1,2,3,4,5],xmm13[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm1, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] @@ -1408,11 +1408,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm5, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm14[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm7, %xmm15 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm6, %xmm14 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm15 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm15 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm3, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm15, %xmm9, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] @@ -1427,11 +1427,11 @@ define void @load_i8_stride8_vf16(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm6, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm4 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm4 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] @@ -2850,7 +2850,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm6[0],xmm3[0],xmm6[1],xmm3[1],xmm6[2],xmm3[2],xmm6[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm8 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; 
AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm14 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm14 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa (%rdi), %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vmovdqa 16(%rdi), %xmm1 @@ -2862,7 +2862,7 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm3, %xmm9 ; AVX1-ONLY-NEXT: vpshufb %xmm14, %xmm2, %xmm11 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm13 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm11 = xmm13[0],xmm11[0],xmm13[1],xmm11[1],xmm13[2],xmm11[2],xmm13[3],xmm11[3] @@ -2914,13 +2914,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -2965,11 +2965,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm13, %xmm7 @@ -3011,13 +3011,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; 
AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm13, %xmm8 @@ -3062,12 +3062,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm9, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm8, %xmm7 @@ -3108,13 +3108,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm10 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -3160,12 +3160,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = 
[0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm10, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3] @@ -3208,12 +3208,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0,1,2,3,4,5],xmm1[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm4 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload @@ -4529,14 +4529,14 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm12 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3] ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm12, %ymm0, %ymm12 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm12[0,1,2,3,4,5,6],ymm4[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm12 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm12 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm6, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm12, %xmm0, %xmm12 ; AVX512F-SLOW-NEXT: vmovdqa %xmm0, %xmm10 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm15 = xmm12[0],xmm13[0],xmm12[1],xmm13[1],xmm12[2],xmm13[2],xmm12[3],xmm13[3] ; AVX512F-SLOW-NEXT: vmovdqa 128(%rdi), %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa 144(%rdi), %xmm9 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm1 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm9, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm13 ; AVX512F-SLOW-NEXT: vpshufb %xmm1, %xmm8, %xmm1 @@ -4577,12 +4577,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: 
vpshufb %xmm4, %xmm6, %xmm14 ; AVX512F-SLOW-NEXT: vmovdqa %xmm6, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm6, %xmm14 @@ -4619,12 +4619,12 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm14 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm27 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm15 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm26, %xmm12 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 @@ -4662,13 +4662,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm4[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm0, %xmm14 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm10, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm22 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm14 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm14 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm13, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm13, %xmm25 ; AVX512F-SLOW-NEXT: vpshufb %xmm14, %xmm12, %xmm14 @@ -4705,13 +4705,13 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm4 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; 
AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm25, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm9, %xmm0 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm12, %xmm4 @@ -4749,11 +4749,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm1 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm3[0,1,2,3,4,5,6],ymm1[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm6, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm3 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm9, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa %xmm9, %xmm14 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 @@ -4789,11 +4789,11 @@ define void @load_i8_stride8_vf32(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm4, %ymm0, %ymm4 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm4[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm4 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm4 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm6, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm4, %xmm7, %xmm4 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm5 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm5 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm14, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm5, %xmm12, %xmm5 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3] @@ -7495,7 +7495,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm2[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,0,8,0,0,0,8,0,0,0,8,0,0,0,8] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,0,8,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 304(%rdi), %xmm12 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa %xmm12, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill @@ -7503,7 +7503,7 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm9 = [0,8,0,0,0,8,0,0,0,8,0,0,0,8,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm9 = [0,8,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa 272(%rdi), %xmm11 ; AVX1-ONLY-NEXT: vpshufb %xmm9, %xmm11, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa %xmm11, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte 
Spill @@ -7625,12 +7625,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm12, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm11, %xmm0 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm10, %xmm15 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] @@ -7724,12 +7724,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm13, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload @@ -7822,13 +7822,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm7, %xmm14 @@ -7919,13 +7919,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; 
AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm7 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm8 # 16-byte Reload @@ -8019,12 +8019,12 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm7, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm8, %xmm14 @@ -8116,13 +8116,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm4, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm3[0,1,2,3,4,5],xmm2[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm5, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -8212,13 +8212,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm4, %xmm4 ; 
AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[1],xmm1[1],xmm4[2],xmm1[2],xmm4[3],xmm1[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm1[0,1,2,3,4,5],xmm0[6,7] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm1 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm1 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm1, %xmm6, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; AVX1-ONLY-NEXT: vbroadcastss {{.*#+}} xmm0 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX1-ONLY-NEXT: vmovd {{.*#+}} xmm0 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; AVX1-ONLY-NEXT: vpshufb %xmm0, %xmm11, %xmm14 ; AVX1-ONLY-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload @@ -10910,14 +10910,14 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] ; AVX512F-SLOW-NEXT: vmovdqa 416(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 432(%rdi), %xmm11 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,1,9,0,0,1,9,0,0,1,9,0,0,1,9] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,1,9,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm0, %xmm5 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm22 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3] ; AVX512F-SLOW-NEXT: vmovdqa 384(%rdi), %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa 400(%rdi), %xmm12 -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [1,9,0,0,1,9,0,0,1,9,0,0,1,9,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [1,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm6 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm0, %xmm7 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm0, %xmm31 @@ -11008,13 +11008,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm8 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [0,0,2,10,0,0,2,10,0,0,2,10,0,0,2,10] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [0,0,2,10,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm2 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm11, %xmm24 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm11 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm11, %xmm9 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm9 = xmm9[0],xmm2[0],xmm9[1],xmm2[1],xmm9[2],xmm2[2],xmm9[3],xmm2[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [2,10,0,0,2,10,0,0,2,10,0,0,2,10,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [2,10,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm12, %xmm0 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm12, %xmm30 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 @@ -11100,13 +11100,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd 
{{.*#+}} xmm2 = [0,0,3,11,0,0,3,11,0,0,3,11,0,0,3,11] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,3,11,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm22, %xmm28 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,11,0,0,3,11,0,0,3,11,0,0,3,11,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [3,11,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 @@ -11189,13 +11189,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,4,12,0,0,4,12,0,0,4,12,0,0,4,12] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,4,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm11 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm7, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [4,12,0,0,4,12,0,0,4,12,0,0,4,12,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [4,12,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm7 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm7, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm12 @@ -11278,13 +11278,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,5,13,0,0,5,13,0,0,5,13,0,0,5,13] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,5,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm11, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm24, %xmm27 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm28, %xmm5 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [5,13,0,0,5,13,0,0,5,13,0,0,5,13,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [5,13,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm30, %xmm10 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm12, %xmm15 @@ -11372,13 +11372,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,6,14,0,0,6,14,0,0,6,14,0,0,6,14] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,6,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm5, %xmm8 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm5, %xmm21 ; 
AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [6,14,0,0,6,14,0,0,6,14,0,0,6,14,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [6,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm10, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm10, %xmm18 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm5 @@ -11462,13 +11462,13 @@ define void @load_i8_stride8_vf64(ptr %in.vec, ptr %out.vec0, ptr %out.vec1, ptr ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm2 ; AVX512F-SLOW-NEXT: vinserti128 $1, %xmm3, %ymm0, %ymm3 ; AVX512F-SLOW-NEXT: vpblendd {{.*#+}} ymm4 = ymm3[0,1,2,3,4,5,6],ymm2[7] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,0,7,15,0,0,7,15,0,0,7,15,0,0,7,15] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm2 = [0,0,7,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm27, %xmm3 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm3, %xmm3 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm21, %xmm8 ; AVX512F-SLOW-NEXT: vpshufb %xmm2, %xmm8, %xmm8 ; AVX512F-SLOW-NEXT: vpunpcklwd {{.*#+}} xmm8 = xmm8[0],xmm3[0],xmm8[1],xmm3[1],xmm8[2],xmm3[2],xmm8[3],xmm3[3] -; AVX512F-SLOW-NEXT: vpbroadcastd {{.*#+}} xmm3 = [7,15,0,0,7,15,0,0,7,15,0,0,7,15,0,0] +; AVX512F-SLOW-NEXT: vmovd {{.*#+}} xmm3 = [7,15,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm18, %xmm9 ; AVX512F-SLOW-NEXT: vpshufb %xmm3, %xmm9, %xmm9 ; AVX512F-SLOW-NEXT: vmovdqa64 %xmm31, %xmm11 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index c4bf0c3630e83..9ceb3a250bc41 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -342,7 +342,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-SLOW-NEXT: shrq $48, %rax ; AVX2-SLOW-NEXT: vmovd %eax, %xmm1 ; AVX2-SLOW-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-SLOW-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX2-SLOW-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 ; AVX2-SLOW-NEXT: vmovq %xmm0, 32(%r9) ; AVX2-SLOW-NEXT: vmovdqa %ymm2, (%r9) @@ -374,7 +374,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FAST-NEXT: shrq $48, %rax ; AVX2-FAST-NEXT: vmovd %eax, %xmm1 ; AVX2-FAST-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-FAST-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-NEXT: vmovq %xmm0, 32(%r9) ; AVX2-FAST-NEXT: vmovdqa %ymm2, (%r9) @@ -406,7 +406,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX2-FAST-PERLANE-NEXT: shrq $48, %rax ; AVX2-FAST-PERLANE-NEXT: vmovd %eax, %xmm1 ; AVX2-FAST-PERLANE-NEXT: vpbroadcastw %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,255,255,0,255,255,255,255,0] +; AVX2-FAST-PERLANE-NEXT: vmovq {{.*#+}} xmm3 = [255,255,0,255,255,255,255,0,0,0,0,0,0,0,0,0] ; AVX2-FAST-PERLANE-NEXT: vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 ; AVX2-FAST-PERLANE-NEXT: vmovq %xmm0, 32(%r9) ; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm2, (%r9) diff --git 
a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 482da013d741b..3aab872fa4a91 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -475,8 +475,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm0[u,u,u,u,7,15],zero,xmm0[u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm3[6,u,u,u,u],zero,zero,xmm3[7,u,u,u,u,u,u,u,u] ; AVX1-ONLY-NEXT: vpor %xmm6, %xmm7, %xmm6 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,255,255,255,255,0,0,0] -; AVX1-ONLY-NEXT: # xmm7 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm7 = [0,255,255,255,255,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5 ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm2[0,8,u,u,u],zero,zero,xmm2[1,9,u,u,u],zero,zero ; AVX1-ONLY-NEXT: vpshufb {{.*#+}} xmm7 = xmm1[0,8],zero,zero,xmm1[u,u,u,1,9],zero,zero,xmm1[u,u,u,2,10] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll index e60034f7d6b75..dcafb52679063 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-8.ll @@ -187,13 +187,11 @@ define void @store_i8_stride8_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm2[0],mem[0],xmm2[1],mem[1] ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm3[0],mem[0],xmm3[1],mem[1] ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,2,6,10,14,3,7,11,15] -; AVX1-ONLY-NEXT: # xmm2 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm2 = [2,6,10,14,3,7,11,15,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm1, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm2, %xmm0, %xmm2 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,4,8,12,1,5,9,13] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [0,4,8,12,1,5,9,13,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -361,47 +359,39 @@ define void @store_i8_stride8_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = mem[0],zero ; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = mem[0],zero ; AVX1-ONLY-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,2,10,0,0,3,11] -; AVX1-ONLY-NEXT: # xmm4 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm4 = [0,0,2,10,0,0,3,11,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX1-ONLY-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,2,10,0,0,3,11,0,0] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [2,10,0,0,3,11,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] ; 
AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1],xmm4[2,3],xmm5[4,5],xmm4[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,8,0,0,1,9] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,0,8,0,0,1,9,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,8,0,0,1,9,0,0] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,8,0,0,1,9,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] ; AVX1-ONLY-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4 -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,6,14,0,0,7,15] -; AVX1-ONLY-NEXT: # xmm5 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm5 = [0,0,6,14,0,0,7,15,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX1-ONLY-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,6,14,0,0,7,15,0,0] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [6,14,0,0,7,15,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] ; AVX1-ONLY-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,4,12,0,0,5,13] -; AVX1-ONLY-NEXT: # xmm6 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm6 = [0,0,4,12,0,0,5,13,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-ONLY-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3] -; AVX1-ONLY-NEXT: vmovddup {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,4,12,0,0,5,13,0,0] -; AVX1-ONLY-NEXT: # xmm3 = mem[0,0] +; AVX1-ONLY-NEXT: vmovq {{.*#+}} xmm3 = [4,12,0,0,5,13,0,0,0,0,0,0,0,0,0,0] ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-ONLY-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX1-ONLY-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll index 122b478577fbf..5d02bb8b05f18 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-128.ll @@ -96,7 +96,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; SSSE3-LABEL: testv2i64: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 @@ -128,7 +128,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; SSE41-LABEL: testv2i64: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa 
%xmm2, %xmm3 ; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 @@ -160,7 +160,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; AVX1OR2-LABEL: testv2i64: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -188,7 +188,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; AVX512VL-LABEL: testv2i64: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -216,7 +216,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv2i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -257,7 +257,7 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; ; X86-SSE-LABEL: testv2i64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -374,7 +374,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; SSSE3-LABEL: testv2i64u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm2, %xmm3 ; SSSE3-NEXT: pshufb %xmm0, %xmm3 ; SSSE3-NEXT: movdqa %xmm0, %xmm1 @@ -406,7 +406,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; SSE41-LABEL: testv2i64u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm2, %xmm3 ; SSE41-NEXT: pshufb %xmm0, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm1 @@ -438,7 +438,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; AVX1OR2-LABEL: testv2i64u: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -466,7 +466,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; AVX512VL-LABEL: testv2i64u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -494,7 +494,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv2i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} 
xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -535,7 +535,7 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; ; X86-SSE-LABEL: testv2i64u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -656,7 +656,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; SSSE3-LABEL: testv4i32: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 @@ -682,7 +682,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; SSE41-LABEL: testv4i32: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 @@ -708,7 +708,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX1OR2-LABEL: testv4i32: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -731,7 +731,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX512VL-LABEL: testv4i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -754,7 +754,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv4i32: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -790,7 +790,7 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; ; X86-SSE-LABEL: testv4i32: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -905,7 +905,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; SSSE3-LABEL: testv4i32u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 @@ -931,7 +931,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; 
SSE41-LABEL: testv4i32u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 @@ -957,7 +957,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX1OR2-LABEL: testv4i32u: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -980,7 +980,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX512VL-LABEL: testv4i32u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1003,7 +1003,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv4i32u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1039,7 +1039,7 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; ; X86-SSE-LABEL: testv4i32u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm3, %xmm4 ; X86-SSE-NEXT: pshufb %xmm0, %xmm4 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1142,7 +1142,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; SSSE3-LABEL: testv8i16: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 @@ -1162,7 +1162,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; SSE41-LABEL: testv8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 @@ -1182,7 +1182,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; AVX1OR2-LABEL: testv8i16: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -1200,7 +1200,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; AVX512VL-LABEL: testv8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandd 
{{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1218,7 +1218,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv8i16: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1254,7 +1254,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; ; X86-SSE-LABEL: testv8i16: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE-NEXT: pshufb %xmm0, %xmm3 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1350,7 +1350,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; SSSE3-LABEL: testv8i16u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: movdqa %xmm0, %xmm3 @@ -1370,7 +1370,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; SSE41-LABEL: testv8i16u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: movdqa %xmm0, %xmm3 @@ -1390,7 +1390,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; AVX1OR2-LABEL: testv8i16u: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 @@ -1408,7 +1408,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; AVX512VL-LABEL: testv8i16u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1426,7 +1426,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv8i16u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm3 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm3, %xmm3 @@ -1462,7 +1462,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; ; X86-SSE-LABEL: testv8i16u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm2, %xmm3 ; X86-SSE-NEXT: pshufb %xmm0, %xmm3 ; X86-SSE-NEXT: movdqa %xmm0, %xmm1 @@ -1552,7 +1552,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; SSSE3-LABEL: testv16i8: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; 
SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: psrlw $4, %xmm0 @@ -1567,7 +1567,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; SSE41-LABEL: testv16i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: psrlw $4, %xmm0 @@ -1582,7 +1582,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; AVX1OR2-LABEL: testv16i8: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -1595,7 +1595,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; AVX512VL-LABEL: testv16i8: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 @@ -1608,7 +1608,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv16i8: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 @@ -1630,7 +1630,7 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; ; X86-SSE-LABEL: testv16i8: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE-NEXT: pshufb %xmm0, %xmm2 ; X86-SSE-NEXT: psrlw $4, %xmm0 @@ -1715,7 +1715,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; SSSE3-LABEL: testv16i8u: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSSE3-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: movdqa %xmm1, %xmm2 ; SSSE3-NEXT: pshufb %xmm0, %xmm2 ; SSSE3-NEXT: psrlw $4, %xmm0 @@ -1730,7 +1730,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; SSE41-LABEL: testv16i8u: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm2 ; SSE41-NEXT: pshufb %xmm0, %xmm2 ; SSE41-NEXT: psrlw $4, %xmm0 @@ -1745,7 +1745,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; AVX1OR2-LABEL: testv16i8u: ; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1OR2-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1OR2-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX1OR2-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 @@ -1758,7 +1758,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; AVX512VL-LABEL: testv16i8u: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} 
xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VL-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512VL-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 @@ -1771,7 +1771,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; AVX512VLBWDQ-LABEL: testv16i8u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vmovq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm1, %xmm2 ; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 ; AVX512VLBWDQ-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm0 @@ -1793,7 +1793,7 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; ; X86-SSE-LABEL: testv16i8u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movdqa {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; X86-SSE-NEXT: movq {{.*#+}} xmm1 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; X86-SSE-NEXT: movdqa %xmm1, %xmm2 ; X86-SSE-NEXT: pshufb %xmm0, %xmm2 ; X86-SSE-NEXT: psrlw $4, %xmm0 @@ -1812,22 +1812,22 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { define <2 x i64> @foldv2i64() nounwind { ; SSE-LABEL: foldv2i64: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; SSE-NEXT: retq ; ; NOBW-LABEL: foldv2i64: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; X86-SSE-NEXT: retl %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 0) ret <2 x i64> %out @@ -1836,22 +1836,22 @@ define <2 x i64> @foldv2i64() nounwind { define <2 x i64> @foldv2i64u() nounwind { ; SSE-LABEL: foldv2i64u: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; SSE-NEXT: retq ; ; NOBW-LABEL: foldv2i64u: ; NOBW: # %bb.0: -; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; NOBW-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] ; NOBW-NEXT: retq ; ; AVX512VLBWDQ-LABEL: foldv2i64u: ; AVX512VLBWDQ: # %bb.0: -; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [55,0,0,0] +; AVX512VLBWDQ-NEXT: vmovss {{.*#+}} xmm0 = [55,0,0,0] ; AVX512VLBWDQ-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [55,0,0,0] +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [55,0,0,0] ; X86-SSE-NEXT: retl %out = call <2 x i64> @llvm.ctlz.v2i64(<2 x i64> <i64 256, i64 -1>, i1 -1) ret <2 x i64> %out diff --git a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll index fe6836c045f3b..8a0d9a6134cea 100644 --- a/llvm/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/llvm/test/CodeGen/X86/vector-lzcnt-256.ll @@ -13,7 +13,7 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 =
[15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -199,7 +199,7 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm4 ; AVX1-NEXT: vpsrlw $4, %xmm2, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -385,7 +385,7 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -541,7 +541,7 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX1-LABEL: testv8i32u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -697,7 +697,7 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -818,7 +818,7 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX1-LABEL: testv16i16u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm4 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -939,7 +939,7 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX1-LABEL: testv32i8: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -1035,7 +1035,7 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX1-LABEL: testv32i8u: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm1, %xmm2, %xmm3 ; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1 ; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] diff --git a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll index 82afa15079182..b233855029c58 100644 --- 
a/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll +++ b/llvm/test/CodeGen/X86/vector-mulfix-legalize.ll @@ -13,7 +13,7 @@ declare <4 x i16> @llvm.umul.fix.sat.v4i16(<4 x i16>, <4 x i16>, i32 immarg) define <4 x i16> @smulfix(<4 x i16> %a) { ; CHECK-LABEL: smulfix: ; CHECK: # %bb.0: -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u] +; CHECK-NEXT: movq {{.*#+}} xmm1 = [1,2,3,4,0,0,0,0] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 @@ -28,7 +28,7 @@ define <4 x i16> @smulfix(<4 x i16> %a) { define <4 x i16> @umulfix(<4 x i16> %a) { ; CHECK-LABEL: umulfix: ; CHECK: # %bb.0: -; CHECK-NEXT: movdqa {{.*#+}} xmm1 = [1,2,3,4,u,u,u,u] +; CHECK-NEXT: movq {{.*#+}} xmm1 = [1,2,3,4,0,0,0,0] ; CHECK-NEXT: movdqa %xmm0, %xmm2 ; CHECK-NEXT: pmullw %xmm1, %xmm2 ; CHECK-NEXT: psrlw $15, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll index dbdc3e09fcef0..8056b9a2963c3 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-add-mask.ll @@ -864,7 +864,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) { ; ; SSE41-LABEL: test_v4i16_v4i8: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0] ; SSE41-NEXT: pmulhuw %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] @@ -914,7 +914,7 @@ define i16 @test_v4i16_v4i8(<4 x i16> %a0) { ; AVX512BW-LABEL: test_v4i16_v4i8: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] ; AVX512BW-NEXT: vpaddw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index 43c9be2dc6f97..ad810c092bf55 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -719,7 +719,7 @@ define <2 x i64> @splatvar_rotate_v2i64(<2 x i64> %a, <2 x i64> %b) nounwind { ; ; X86-SSE2-LABEL: splatvar_rotate_v2i64: ; X86-SSE2: # %bb.0: -; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [64,0,0,0] +; X86-SSE2-NEXT: movd {{.*#+}} xmm2 = [64,0,0,0] ; X86-SSE2-NEXT: psubq %xmm1, %xmm2 ; X86-SSE2-NEXT: movdqa %xmm0, %xmm3 ; X86-SSE2-NEXT: psllq %xmm1, %xmm3 @@ -815,7 +815,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; SSE41-LABEL: splatvar_rotate_v8i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [15,0,0,0] +; SSE41-NEXT: movd {{.*#+}} xmm2 = [15,0,0,0] ; SSE41-NEXT: movdqa %xmm1, %xmm3 ; SSE41-NEXT: pandn %xmm2, %xmm3 ; SSE41-NEXT: movdqa %xmm0, %xmm4 @@ -828,7 +828,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX-LABEL: splatvar_rotate_v8i16: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -839,7 +839,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512F-LABEL: splatvar_rotate_v8i16: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; 
AVX512F-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512F-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512F-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -850,7 +850,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512VL-LABEL: splatvar_rotate_v8i16: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VL-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VL-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -861,7 +861,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512BW-LABEL: splatvar_rotate_v8i16: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512BW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512BW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512BW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 @@ -872,7 +872,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; ; AVX512VLBW-LABEL: splatvar_rotate_v8i16: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX512VLBW-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX512VLBW-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX512VLBW-NEXT: vpsrlw $1, %xmm0, %xmm4 ; AVX512VLBW-NEXT: vpsrlw %xmm3, %xmm4, %xmm3 diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index c55335f849569..dae2cf382b820 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -649,7 +649,7 @@ define <8 x i32> @splatvar_rotate_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX1-LABEL: splatvar_rotate_v16i16: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpandn %xmm2, %xmm1, %xmm3 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 ; AVX1-NEXT: vpsrlw $1, %xmm4, %xmm5 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index e15be224bec8c..ff41d883380a8 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1786,7 +1786,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,u,16384,8192,u,u,u,u] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,0,16384,8192,0,0,0,0] ; SSE41-NEXT: pmulhw %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: psraw $1, %xmm0 @@ -1818,7 +1818,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0] ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1896,7 +1896,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0] ; AVX512BW-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def 
$xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index f340615464cfa..71719e03c7c6d 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1486,7 +1486,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; ; SSE41-LABEL: constant_shift_v4i16: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [u,32768,16384,8192,u,u,u,u] +; SSE41-NEXT: movq {{.*#+}} xmm1 = [0,32768,16384,8192,0,0,0,0] ; SSE41-NEXT: pmulhuw %xmm0, %xmm1 ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE41-NEXT: retq @@ -1511,7 +1511,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1581,7 +1581,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0] ; AVX512BW-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index 4d4642b18878e..19645fd08c946 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -1339,7 +1339,7 @@ define <4 x i16> @constant_shift_v4i16(<4 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v4i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastq {{.*#+}} xmm1 = [0,1,2,3,0,1,2,3] +; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = [0,1,2,3,0,0,0,0] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper @@ -1399,7 +1399,7 @@ define <2 x i16> @constant_shift_v2i16(<2 x i16> %a) nounwind { ; AVX512BW-LABEL: constant_shift_v2i16: ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0 -; AVX512BW-NEXT: vpbroadcastd {{.*#+}} xmm1 = [2,3,2,3,2,3,2,3] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [2,3,0,0,0,0,0,0] ; AVX512BW-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0 ; AVX512BW-NEXT: vzeroupper diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index b40be9452ddd7..e298091bfb983 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -1003,7 +1003,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; SSSE3-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: movq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSSE3-NEXT: pshufb %xmm2, %xmm1 ; SSSE3-NEXT: pshufb %xmm2, %xmm0 ; SSSE3-NEXT: punpcklqdq {{.*#+}} xmm0 
= xmm0[0],xmm1[0] @@ -1011,7 +1011,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; SSE41-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; SSE41: # %bb.0: -; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pshufb %xmm2, %xmm1 ; SSE41-NEXT: pshufb %xmm2, %xmm0 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1019,8 +1019,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; AVX1-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; AVX1: # %bb.0: -; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX1-NEXT: # xmm2 = mem[0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1028,7 +1027,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; AVX2-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; AVX2: # %bb.0: -; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] @@ -1036,7 +1035,7 @@ define <16 x i8> @shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31( ; ; AVX512VLBW-LABEL: shuffle_v16i8_01_03_05_07_09_11_13_15_17_19_21_23_25_27_29_31: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpbroadcastq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] +; AVX512VLBW-NEXT: vmovq {{.*#+}} xmm2 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX512VLBW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 5b4f15b51ec00..bac63f2ddb505 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -539,7 +539,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,12,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -580,7 +580,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm1 = [0,0,13,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -621,7 +621,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [0,14,0,0,0,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; @@ -662,7 +662,7 @@ define <16 x i16> 
@shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; ; AVX512VL-LABEL: shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm1 = [15,0,0,0] +; AVX512VL-NEXT: vmovd {{.*#+}} xmm1 = [15,0,0,0] ; AVX512VL-NEXT: vpermw %ymm0, %ymm1, %ymm0 ; AVX512VL-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index baa942794ccd8..1cfa5e6dfdff5 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1281,7 +1281,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_24_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,0,24,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1328,7 +1328,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_00_25_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,0,25,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1375,7 +1375,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_00_26_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,0,26,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1422,7 +1422,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_00_27_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovq {{.*#+}} xmm1 = [0,0,0,0,27,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1469,7 +1469,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_00_28_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,0,0,28,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1516,7 +1516,7 @@ define <32 x i8> @shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_00_29_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,0,29,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, 
%ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1563,7 +1563,7 @@ define <32 x i8> @shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_00_30_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [0,30,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -1610,7 +1610,7 @@ define <32 x i8> @shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; ; AVX512VLVBMI-LABEL: shuffle_v32i8_31_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX512VLVBMI: # %bb.0: -; AVX512VLVBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [31,0,0,0] +; AVX512VLVBMI-NEXT: vmovd {{.*#+}} xmm1 = [31,0,0,0] ; AVX512VLVBMI-NEXT: vpermb %ymm0, %ymm1, %ymm0 ; AVX512VLVBMI-NEXT: retq ; @@ -2565,12 +2565,10 @@ define <32 x i8> @shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_ ; AVX1-LABEL: shuffle_v32i8_39_38_37_36_35_34_33_32_15_14_13_12_11_10_09_08_55_54_53_52_51_50_49_48_31_30_29_28_27_26_25_24: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 -; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,15,14,13,12,11,10,9,8] -; AVX1-NEXT: # xmm3 = mem[0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm3 = [15,14,13,12,11,10,9,8,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 -; AVX1-NEXT: vmovddup {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,7,6,5,4,3,2,1,0] -; AVX1-NEXT: # xmm5 = mem[0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm5 = [7,6,5,4,3,2,1,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm4 ; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0] ; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 @@ -2771,7 +2769,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_ ; AVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2785,7 +2783,7 @@ define <32 x i8> @shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_ ; XOPAVX1-LABEL: shuffle_v32i8_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_00_16_16_16_16_16_16_16_24_16_16_16_16_16_16_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] +; XOPAVX1-NEXT: vmovq {{.*#+}} xmm2 = [0,0,0,0,0,0,0,8,0,0,0,0,0,0,0,0] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2803,7 +2801,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: 
vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2817,7 +2815,7 @@ define <32 x i8> @shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; XOPAVX1-LABEL: shuffle_v32i8_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_00_16_30_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2835,7 +2833,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; AVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -2849,7 +2847,7 @@ define <32 x i8> @shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_ ; XOPAVX1-LABEL: shuffle_v32i8_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_31_16_16_16_16_16_16_16_16_16_16_16_16_16_16_16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; XOPAVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [15,0,0,0] +; XOPAVX1-NEXT: vmovd {{.*#+}} xmm2 = [15,0,0,0] ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; XOPAVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll index a69a1a18f26e5..02cad49d29fdb 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v8.ll @@ -208,7 +208,7 @@ define <8 x float> @shuffle_v8f32_06000000(<8 x float> %a, <8 x float> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8f32_06000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0] +; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -225,7 +225,7 @@ define <8 x float> @shuffle_v8f32_70000000(<8 x float> %a, <8 x float> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8f32_70000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] +; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> @@ -1808,7 +1808,7 @@ define <8 x i32> @shuffle_v8i32_06000000(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_06000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [0,6,0,0] +; AVX2OR512VL-NEXT: vmovsd {{.*#+}} xmm1 = [0,6,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> @@ -1825,7 +1825,7 @@ define <8 x i32> @shuffle_v8i32_70000000(<8 x i32> %a, <8 x i32> %b) { ; ; AVX2OR512VL-LABEL: shuffle_v8i32_70000000: ; AVX2OR512VL: # %bb.0: -; AVX2OR512VL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] +; AVX2OR512VL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] ; AVX2OR512VL-NEXT: vpermps %ymm0, %ymm1, %ymm0 ; AVX2OR512VL-NEXT: retq %shuffle = shufflevector <8 x 
i32> %a, <8 x i32> %b, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll index fa0ec33bf3408..24b1b42c2dc05 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -199,13 +199,13 @@ define <32 x i16> @shuffle_v32i16_03_00_01_02_07_04_05_06_11_08_09_10_15_12_13_1 define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) { ; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; KNL: ## %bb.0: -; KNL-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; KNL-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; KNL-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; KNL-NEXT: retq ; ; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; SKX: ## %bb.0: -; SKX-NEXT: vmovaps {{.*#+}} xmm1 = [65535,0,0,0] +; SKX-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0] ; SKX-NEXT: vandps %zmm1, %zmm0, %zmm0 ; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll index 4df5307316a42..f4cc8522adec5 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v64.ll @@ -109,25 +109,25 @@ define <64 x i8> @shuffle_v64i8_79_00_01_02_03_04_05_06_07_08_09_10_11_12_13_14_ define <64 x i8> @shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<64 x i8> %a) { ; AVX512F-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512F: # %bb.0: -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: retq ; ; AVX512DQ-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512DQ: # %bb.0: -; AVX512DQ-NEXT: vmovaps {{.*#+}} xmm1 = [255,0,0,0] +; AVX512DQ-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0] ; AVX512DQ-NEXT: vandps %zmm1, %zmm0, %zmm0 ; AVX512DQ-NEXT: retq ; ; AVX512VBMI-LABEL: shuffle_v64i8_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: ; AVX512VBMI: # %bb.0: -; AVX512VBMI-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512VBMI-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512VBMI-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512VBMI-NEXT: retq %shuffle = shufflevector <64 x i8> %a, <64 x i8> zeroinitializer, <64 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll index 9bfca824fb71a..008593a239f86 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-512-v8.ll @@ -142,7 +142,7 @@ define <8 x double> @shuffle_v8f64_06000000(<8 x double> %a, <8 x double> %b) { define <8 x double> @shuffle_v8f64_70000000(<8 x double> %a, <8 x double> %b) { ; ALL-LABEL: shuffle_v8f64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: vmovaps {{.*#+}} xmm1 = [7,0,0,0] +; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] ; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> @@ -960,7 +960,7 @@ define <8 x i64> @shuffle_v8i64_06000000(<8 x i64> %a, <8 x i64> %b) { define <8 x i64> @shuffle_v8i64_70000000(<8 x i64> %a, <8 x i64> %b) { ; ALL-LABEL: shuffle_v8i64_70000000: ; ALL: # %bb.0: -; ALL-NEXT: 
vmovaps {{.*#+}} xmm1 = [7,0,0,0] +; ALL-NEXT: vmovss {{.*#+}} xmm1 = [7,0,0,0] ; ALL-NEXT: vpermpd %zmm0, %zmm1, %zmm0 ; ALL-NEXT: ret{{[l|q]}} %shuffle = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll index 1e2ee7b99a608..d285d07e66049 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll @@ -1744,26 +1744,18 @@ define <4 x i8> @combine_test1c(ptr %a, ptr %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movss {{.*#+}} xmm0 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: combine_test1c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_test1c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,255,255,255,0,255,255,255,0,255,255,255,0,255,255,255] -; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_test1c: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [0,255,255,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 = shufflevector <4 x i8> %A, <4 x i8> %B, <4 x i32> @@ -1838,26 +1830,18 @@ define <4 x i8> @combine_test4c(ptr %a, ptr %b) { ; SSE41: # %bb.0: ; SSE41-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE41-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,255,u,u,u,u,u,u,u,u,u,u,u,u] +; SSE41-NEXT: movss {{.*#+}} xmm0 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0] ; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm1 ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: retq ; -; AVX1-LABEL: combine_test4c: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255] -; AVX1-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: combine_test4c: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [255,0,255,255,255,0,255,255,255,0,255,255,255,0,255,255] -; AVX2-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: retq +; AVX-LABEL: combine_test4c: +; AVX: # %bb.0: +; AVX-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX-NEXT: vmovd {{.*#+}} xmm2 = [255,0,255,255,0,0,0,0,0,0,0,0,0,0,0,0] +; AVX-NEXT: vpblendvb %xmm2, %xmm1, %xmm0, %xmm0 +; AVX-NEXT: retq %A = load <4 x i8>, ptr %a %B = load <4 x i8>, ptr %b %1 = shufflevector <4 x i8> %A, <4 x 
i8> %B, <4 x i32> @@ -3187,7 +3171,7 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) define void @PR43024() { ; SSE2-LABEL: PR43024: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; SSE2-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; SSE2-NEXT: movaps %xmm0, (%rax) ; SSE2-NEXT: addss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: xorps %xmm1, %xmm1 @@ -3198,7 +3182,7 @@ define void @PR43024() { ; ; SSSE3-LABEL: PR43024: ; SSSE3: # %bb.0: -; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; SSSE3-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; SSSE3-NEXT: movaps %xmm0, (%rax) ; SSSE3-NEXT: addss %xmm0, %xmm0 ; SSSE3-NEXT: xorps %xmm1, %xmm1 @@ -3209,7 +3193,7 @@ define void @PR43024() { ; ; SSE41-LABEL: PR43024: ; SSE41: # %bb.0: -; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; SSE41-NEXT: movsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; SSE41-NEXT: movaps %xmm0, (%rax) ; SSE41-NEXT: addss %xmm0, %xmm0 ; SSE41-NEXT: xorps %xmm1, %xmm1 @@ -3220,7 +3204,7 @@ define void @PR43024() { ; ; AVX-LABEL: PR43024: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0] ; AVX-NEXT: vmovaps %xmm0, (%rax) ; AVX-NEXT: vaddss {{\.?LCPI[0-9]+_[0-9]+}}+4(%rip), %xmm0, %xmm0 ; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll index 95a57d1bdf331..aca50c461a7a1 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-v1.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-v1.ll @@ -42,7 +42,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512F-NEXT: vpsllq $63, %xmm0, %xmm0 ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0] +; AVX512F-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0] ; AVX512F-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z} @@ -56,7 +56,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; AVX512VL-NEXT: vptestmq %xmm0, %xmm0, %k1 ; AVX512VL-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm1 {%k1} {z} -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm2 = [18446744073709551615,0] +; AVX512VL-NEXT: vmovq {{.*#+}} xmm2 = [18446744073709551615,0] ; AVX512VL-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX512VL-NEXT: vptestmq %xmm1, %xmm1, %k1 ; AVX512VL-NEXT: vmovdqa64 %xmm0, %xmm0 {%k1} {z} @@ -67,7 +67,7 @@ define <2 x i1> @shuf2i1_1_2(<2 x i1> %a) { ; VL_BW_DQ-NEXT: vpsllq $63, %xmm0, %xmm0 ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 -; VL_BW_DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [18446744073709551615,0] +; VL_BW_DQ-NEXT: vmovq {{.*#+}} xmm1 = [18446744073709551615,0] ; VL_BW_DQ-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7] ; VL_BW_DQ-NEXT: vpmovq2m %xmm0, %k0 ; VL_BW_DQ-NEXT: vpmovm2q %k0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index f8e3a7a23056f..3294c7ffee40d 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -516,7 +516,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) { ; AVX1-LABEL: trunc8i32_8i8: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; AVX1-NEXT: 
vbroadcastss {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -527,7 +527,7 @@ define void @trunc8i32_8i8(<8 x i32> %a) { ; AVX2-LABEL: trunc8i32_8i8: ; AVX2: # %bb.0: # %entry ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm2 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm2, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] diff --git a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll index 3d5947d8e59bd..3dc43031cea9e 100644 --- a/llvm/test/CodeGen/X86/vector-tzcnt-128.ll +++ b/llvm/test/CodeGen/X86/vector-tzcnt-128.ll @@ -1762,37 +1762,37 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { define <2 x i64> @foldv2i64() nounwind { ; SSE-LABEL: foldv2i64: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: foldv2i64: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64: ; BITALG: # %bb.0: -; BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0] +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; X86-SSE-NEXT: retl %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> , i1 0) ret <2 x i64> %out @@ -1801,37 +1801,37 @@ define <2 x i64> @foldv2i64() nounwind { define <2 x i64> @foldv2i64u() nounwind { ; SSE-LABEL: foldv2i64u: ; SSE: # %bb.0: -; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0] +; SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; SSE-NEXT: retq ; ; AVX-LABEL: foldv2i64u: ; AVX: # %bb.0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX-NEXT: retq ; ; AVX512VPOPCNTDQ-LABEL: foldv2i64u: ; AVX512VPOPCNTDQ: # %bb.0: -; AVX512VPOPCNTDQ-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQ-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQ-NEXT: retq ; ; AVX512VPOPCNTDQVL-LABEL: foldv2i64u: ; AVX512VPOPCNTDQVL: # %bb.0: -; AVX512VPOPCNTDQVL-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; AVX512VPOPCNTDQVL-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; AVX512VPOPCNTDQVL-NEXT: retq ; ; BITALG_NOVLX-LABEL: foldv2i64u: ; BITALG_NOVLX: # %bb.0: -; BITALG_NOVLX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG_NOVLX-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG_NOVLX-NEXT: retq ; ; BITALG-LABEL: foldv2i64u: ; BITALG: # %bb.0: -; 
BITALG-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,0,0] +; BITALG-NEXT: vmovss {{.*#+}} xmm0 = [8,0,0,0] ; BITALG-NEXT: retq ; ; X86-SSE-LABEL: foldv2i64u: ; X86-SSE: # %bb.0: -; X86-SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,0,0] +; X86-SSE-NEXT: movss {{.*#+}} xmm0 = [8,0,0,0] ; X86-SSE-NEXT: retl %out = call <2 x i64> @llvm.cttz.v2i64(<2 x i64> , i1 -1) ret <2 x i64> %out diff --git a/llvm/test/CodeGen/X86/vselect-constants.ll b/llvm/test/CodeGen/X86/vselect-constants.ll index 0630a40b88099..050b3329a4abb 100644 --- a/llvm/test/CodeGen/X86/vselect-constants.ll +++ b/llvm/test/CodeGen/X86/vselect-constants.ll @@ -281,7 +281,7 @@ define i32 @wrong_min_signbits(<2 x i16> %x) { ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm1, %xmm1 ; SSE-NEXT: pcmpeqw %xmm0, %xmm1 -; SSE-NEXT: movdqa {{.*#+}} xmm0 = [1,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm0 = [1,0,0,0] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 ; SSE-NEXT: por %xmm0, %xmm1 @@ -292,7 +292,7 @@ define i32 @wrong_min_signbits(<2 x i16> %x) { ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vmovdqa {{.*#+}} xmm1 = [2,0,0,0] +; AVX-NEXT: vmovd {{.*#+}} xmm1 = [2,0,0,0] ; AVX-NEXT: vpblendvb %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 ; AVX-NEXT: vmovd %xmm0, %eax ; AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vselect-post-combine.ll b/llvm/test/CodeGen/X86/vselect-post-combine.ll index e91b8d029bcb4..474f70f78937e 100644 --- a/llvm/test/CodeGen/X86/vselect-post-combine.ll +++ b/llvm/test/CodeGen/X86/vselect-post-combine.ll @@ -4,7 +4,7 @@ define ptr @test_mul(ptr %addr) { ; AVX2-LABEL: test_mul: ; AVX2: # %bb.0: # %entry -; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = [255,0,0,0] ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm1 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX2-NEXT: vpblendvb %xmm0, (%rdi), %xmm1, %xmm0 ; AVX2-NEXT: vpmovzxbw {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero,xmm0[8],zero,xmm0[9],zero,xmm0[10],zero,xmm0[11],zero,xmm0[12],zero,xmm0[13],zero,xmm0[14],zero,xmm0[15],zero diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll index 0f121d88b3573..18cb5e1b86ec3 100644 --- a/llvm/test/CodeGen/X86/widen_bitcnt.ll +++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll @@ -345,7 +345,7 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32 define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_v2i32_v4i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm3, %xmm6 ; SSE42-NEXT: pshufb %xmm0, %xmm6 ; SSE42-NEXT: movdqa %xmm0, %xmm5 @@ -394,7 +394,7 @@ define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; ; AVX2-LABEL: widen_ctlz_v2i32_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -448,7 +448,7 @@ define <4 x i32> @widen_ctlz_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { define <8 x i32> @widen_ctlz_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_v4i32_v8i32: ; SSE42: # %bb.0: -; 
SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm3, %xmm6 ; SSE42-NEXT: pshufb %xmm0, %xmm6 ; SSE42-NEXT: movdqa %xmm0, %xmm5 @@ -535,7 +535,7 @@ define <8 x i32> @widen_ctlz_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) { ; SSE42-LABEL: widen_ctlz_v2i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm5, %xmm8 ; SSE42-NEXT: pshufb %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm0, %xmm7 @@ -629,7 +629,7 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm6 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -724,7 +724,7 @@ define <8 x i32> @widen_ctlz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_undef_v2i32_v4i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm3, %xmm6 ; SSE42-NEXT: pshufb %xmm0, %xmm6 ; SSE42-NEXT: movdqa %xmm0, %xmm5 @@ -773,7 +773,7 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { ; ; AVX2-LABEL: widen_ctlz_undef_v2i32_v4i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm2, %xmm3 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] @@ -827,7 +827,7 @@ define <4 x i32> @widen_ctlz_undef_v2i32_v4i32(<2 x i32> %a0, <2 x i32> %a1) { define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { ; SSE42-LABEL: widen_ctlz_undef_v4i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm3, %xmm6 ; SSE42-NEXT: pshufb %xmm0, %xmm6 ; SSE42-NEXT: movdqa %xmm0, %xmm5 @@ -914,7 +914,7 @@ define <8 x i32> @widen_ctlz_undef_v4i32_v8i32(<4 x i32> %a0, <4 x i32> %a1) { define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32> %a2, <2 x i32> %a3) { ; SSE42-LABEL: widen_ctlz_undef_v2i32_v8i32: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; SSE42-NEXT: movq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; SSE42-NEXT: movdqa %xmm5, %xmm8 ; SSE42-NEXT: pshufb %xmm0, %xmm8 ; SSE42-NEXT: movdqa %xmm0, %xmm7 @@ -1008,7 +1008,7 @@ define <8 x i32> @widen_ctlz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; AVX2: # %bb.0: ; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 ; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX2-NEXT: 
vmovq {{.*#+}} xmm5 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm0, %xmm5, %xmm4 ; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm6 ; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm7 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 742a3a0c66a2a..06140b2395fca 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -444,7 +444,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm2, %xmm4, %xmm5 ; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX1-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] @@ -453,7 +453,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX1-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -463,7 +463,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm6 ; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX1-NEXT: vmovd {{.*#+}} xmm6 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX1-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -472,7 +472,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm6, %xmm4, %xmm4 ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm4 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-NEXT: vmovd {{.*#+}} xmm4 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm4, %xmm0, %xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -492,7 +492,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vpshufb %xmm4, %xmm3, %xmm5 ; AVX2-NEXT: vpshufb %xmm4, %xmm2, %xmm4 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm5 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX2-NEXT: vmovd {{.*#+}} xmm5 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm5, %xmm1, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm0, %xmm5 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] @@ -501,7 +501,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = 
[1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -511,7 +511,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vpshufb %xmm5, %xmm3, %xmm6 ; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm5 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm6 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX2-NEXT: vmovd {{.*#+}} xmm6 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm7 ; AVX2-NEXT: vpshufb %xmm6, %xmm0, %xmm6 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm6 = xmm6[0],xmm7[0],xmm6[1],xmm7[1] @@ -520,7 +520,7 @@ define <16 x i1> @interleaved_load_vf16_i8_stride4(ptr %ptr) nounwind { ; AVX2-NEXT: vpshufb %xmm6, %xmm3, %xmm3 ; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX2-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -571,7 +571,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm6, %xmm3, %xmm4 ; AVX1-NEXT: vpshufb %xmm6, %xmm2, %xmm5 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm8 = [0,4,8,12,0,4,8,12,0,4,8,12,0,4,8,12] +; AVX1-NEXT: vmovd {{.*#+}} xmm8 = [0,4,8,12,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm8, %xmm1, %xmm5 ; AVX1-NEXT: vpshufb %xmm8, %xmm0, %xmm7 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm5 = xmm7[0],xmm5[0],xmm7[1],xmm5[1] @@ -591,7 +591,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm11, %xmm3, %xmm8 ; AVX1-NEXT: vpshufb %xmm11, %xmm2, %xmm12 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm8 = xmm12[0],xmm8[0],xmm12[1],xmm8[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [1,5,9,13,1,5,9,13,1,5,9,13,1,5,9,13] +; AVX1-NEXT: vmovd {{.*#+}} xmm12 = [1,5,9,13,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13 ; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] @@ -609,7 +609,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm10, %xmm3, %xmm11 ; AVX1-NEXT: vpshufb %xmm10, %xmm2, %xmm12 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm11 = xmm12[0],xmm11[0],xmm12[1],xmm11[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm12 = [2,6,10,14,2,6,10,14,2,6,10,14,2,6,10,14] +; AVX1-NEXT: vmovd {{.*#+}} xmm12 = [2,6,10,14,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm12, %xmm1, %xmm13 ; AVX1-NEXT: vpshufb %xmm12, %xmm0, %xmm14 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm13 = xmm14[0],xmm13[0],xmm14[1],xmm13[1] @@ -625,7 +625,7 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind { ; AVX1-NEXT: vpshufb %xmm12, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm12, %xmm2, %xmm2 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; AVX1-NEXT: vbroadcastss {{.*#+}} xmm3 = [3,7,11,15,3,7,11,15,3,7,11,15,3,7,11,15] +; AVX1-NEXT: vmovd {{.*#+}} xmm3 = [3,7,11,15,0,0,0,0,0,0,0,0,0,0,0,0] ; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vpshufb %xmm3, %xmm0, 
%xmm0 ; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll index 023ac96181d3c..8c9dc90d2a71d 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg.ll @@ -1686,7 +1686,7 @@ define void @vec256_v32i8_to_v2i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -1783,7 +1783,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -1794,7 +1794,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -1805,7 +1805,7 @@ define void @vec256_v32i8_to_v1i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -1997,7 +1997,7 @@ define void @vec256_v16i16_to_v2i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -2126,7 +2126,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rcx) @@ -2137,7 +2137,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. 
; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovdqa %ymm0, (%rcx) @@ -2148,7 +2148,7 @@ define void @vec256_v16i16_to_v1i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -3360,7 +3360,7 @@ define void @vec384_v48i8_to_v3i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] @@ -3483,7 +3483,7 @@ define void @vec384_v48i8_to_v2i192_factor24(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -3609,7 +3609,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -3622,7 +3622,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 @@ -3635,7 +3635,7 @@ define void @vec384_v48i8_to_v1i384_factor48(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -4261,7 +4261,7 @@ define void @vec384_v24i16_to_v3i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] @@ 
-4442,7 +4442,7 @@ define void @vec384_v24i16_to_v2i192_factor12(ptr %in.vec.base.ptr, ptr %in.vec. ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -4589,7 +4589,7 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -4602,7 +4602,7 @@ define void @vec384_v24i16_to_v1i384_factor24(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 @@ -5998,7 +5998,7 @@ define void @vec512_v64i8_to_v4i128_factor16(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm0, %xmm2 ; SSE-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2] @@ -6182,7 +6182,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE: # %bb.0: ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 -; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,0,0,0] +; SSE-NEXT: movd {{.*#+}} xmm1 = [255,0,0,0] ; SSE-NEXT: pand %xmm0, %xmm1 ; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] ; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -6217,7 +6217,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6231,7 +6231,7 @@ define void @vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6245,7 +6245,7 @@ define void 
@vec512_v64i8_to_v2i256_factor32(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpand %ymm1, %ymm0, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -6297,7 +6297,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6310,7 +6310,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 @@ -6323,7 +6323,7 @@ define void @vec512_v64i8_to_v1i512_factor64(ptr %in.vec.base.ptr, ptr %in.vec.b ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa64 (%rdi), %zmm0 ; AVX512BW-NEXT: vpaddb (%rsi), %zmm0, %zmm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512BW-NEXT: vpandq %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpaddb (%rdx), %zmm0, %zmm0 ; AVX512BW-NEXT: vmovdqa64 %zmm0, (%rcx) @@ -6573,7 +6573,7 @@ define void @vec512_v32i16_to_v4i128_factor8(ptr %in.vec.base.ptr, ptr %in.vec.b ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] @@ -6755,7 +6755,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [65535,0,0,0] +; SSE2-NEXT: movd {{.*#+}} xmm1 = [65535,0,0,0] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero @@ -6809,7 +6809,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6823,7 +6823,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. 
; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm0, %ymm1, %ymm1 ; AVX512F-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 @@ -6837,7 +6837,7 @@ define void @vec512_v32i16_to_v2i256_factor16(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512BW: # %bb.0: ; AVX512BW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512BW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512BW-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512BW-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512BW-NEXT: vpand %ymm0, %ymm1, %ymm1 ; AVX512BW-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX512BW-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0 @@ -6908,7 +6908,7 @@ define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vmovaps 32(%rdx), %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6921,7 +6921,7 @@ define void @vec512_v32i16_to_v1i512_factor32(ptr %in.vec.base.ptr, ptr %in.vec. ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 ; AVX512F-NEXT: vmovaps 32(%rdx), %ymm1 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index b4c79aff5cef1..8ab53140eb911 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -1053,7 +1053,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -1075,8 +1075,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15] -; AVX-NEXT: # xmm3 = mem[0,0] +; AVX-NEXT: vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; AVX-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -6055,7 +6054,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), 
%ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6068,7 +6067,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6081,7 +6080,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [255,0,0,0] +; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [255,0,0,0] ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6520,7 +6519,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr % ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX2-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6533,7 +6532,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr % ; AVX512F: # %bb.0: ; AVX512F-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512F-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512F-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512F-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512F-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512F-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX512F-NEXT: vpaddb (%rdx), %ymm0, %ymm0 @@ -6546,7 +6545,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr % ; AVX512DQ: # %bb.0: ; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm0 ; AVX512DQ-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX512DQ-NEXT: vmovdqa {{.*#+}} xmm1 = [65535,0,0,0] +; AVX512DQ-NEXT: vmovd {{.*#+}} xmm1 = [65535,0,0,0] ; AVX512DQ-NEXT: vpand %ymm1, %ymm0, %ymm0 ; AVX512DQ-NEXT: vpaddb 32(%rdx), %ymm0, %ymm1 ; AVX512DQ-NEXT: vpaddb (%rdx), %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index 23f02e9245eed..c362bdaa3217d 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -875,7 +875,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 32(%rdi), %xmm1 ; SSE42-NEXT: movdqa 48(%rdi), %xmm2 -; SSE42-NEXT: movdqa {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,u,u,u,u,u,u,u,u] +; SSE42-NEXT: movq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0] ; SSE42-NEXT: pshufb %xmm3, %xmm1 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] @@ -894,8 +894,7 @@ define void @vec256_i8_widen_to_i16_factor2_broadcast_to_v16i16_factor16(ptr %in ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 32(%rdi), %xmm1 ; 
AVX-NEXT:    vmovdqa 48(%rdi), %xmm2
-; AVX-NEXT:    vmovddup {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,1,3,5,7,9,11,13,15]
-; AVX-NEXT:    # xmm3 = mem[0,0]
+; AVX-NEXT:    vmovq {{.*#+}} xmm3 = [1,3,5,7,9,11,13,15,0,0,0,0,0,0,0,0]
 ; AVX-NEXT:    vpshufb %xmm3, %xmm1, %xmm1
 ; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
@@ -4884,7 +4883,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
 ;
 ; AVX2-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = [255,0,0,0]
 ; AVX2-NEXT:    vpand (%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
@@ -4895,7 +4894,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
 ;
 ; AVX512F-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = [255,0,0,0]
 ; AVX512F-NEXT:    vpand (%rdi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
@@ -4906,7 +4905,7 @@ define void @vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2(ptr %i
 ;
 ; AVX512DQ-LABEL: vec512_i8_widen_to_i256_factor32_broadcast_to_v2i256_factor2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [255,0,0,0]
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm0 = [255,0,0,0]
 ; AVX512DQ-NEXT:    vpand (%rdi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
@@ -5291,7 +5290,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
 ;
 ; AVX2-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [65535,0,0,0]
+; AVX2-NEXT:    vmovd {{.*#+}} xmm0 = [65535,0,0,0]
 ; AVX2-NEXT:    vpand (%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm1
 ; AVX2-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
@@ -5302,7 +5301,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
 ;
 ; AVX512F-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqa {{.*#+}} xmm0 = [65535,0,0,0]
+; AVX512F-NEXT:    vmovd {{.*#+}} xmm0 = [65535,0,0,0]
 ; AVX512F-NEXT:    vpand (%rdi), %ymm0, %ymm0
 ; AVX512F-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm1
 ; AVX512F-NEXT:    vpaddb (%rsi), %ymm0, %ymm0
@@ -5313,7 +5312,7 @@ define void @vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2(ptr %
 ;
 ; AVX512DQ-LABEL: vec512_i16_widen_to_i256_factor16_broadcast_to_v2i256_factor2:
 ; AVX512DQ:       # %bb.0:
-; AVX512DQ-NEXT:    vmovdqa {{.*#+}} xmm0 = [65535,0,0,0]
+; AVX512DQ-NEXT:    vmovd {{.*#+}} xmm0 = [65535,0,0,0]
 ; AVX512DQ-NEXT:    vpand (%rdi), %ymm0, %ymm0
 ; AVX512DQ-NEXT:    vpaddb 32(%rsi), %ymm0, %ymm1
 ; AVX512DQ-NEXT:    vpaddb (%rsi), %ymm0, %ymm0

From 7143b451d71fe314730f7610d7908e3b9611815c Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Wed, 24 Jan 2024 15:03:24 +0100
Subject: [PATCH 785/843] [JumpThreading] Add test for #79175 (NFC)
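
For readers following the series, a rough C++ analogue of the IR test below
may help; the code is a hypothetical sketch of the reduced pattern from
#79175, not code from the LLVM tree. The store through `f.idx` can alias
`@f` itself when `idx == 0`, so the value feeding the final comparison has
to be re-read after the store; a threaded path that forwards the pre-store
load instead is what the FIXME in the test calls a miscompile.

int f;

int test(long idx, int val) {
  if (idx < 1) {
    int phi = (f == 0) ? 1 : val;  // the for.body/cond.end phi
    int sel = (phi > 0) ? 0 : phi; // the select that jump threading unfolds
    (&f)[idx] = sel;               // may store to f itself when idx == 0
    return f < 1 ? 1 : 0;          // f must be reloaded after the store
  }
  return 0;
}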
---
 llvm/test/Transforms/JumpThreading/pr79175.ll | 64 +++++++++++++++++++
 1 file changed, 64 insertions(+)
 create mode 100644 llvm/test/Transforms/JumpThreading/pr79175.ll

diff --git a/llvm/test/Transforms/JumpThreading/pr79175.ll b/llvm/test/Transforms/JumpThreading/pr79175.ll
new file mode 100644
index 0000000000000..6815aabb26dfc
--- /dev/null
+++ b/llvm/test/Transforms/JumpThreading/pr79175.ll
@@ -0,0 +1,64 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -passes=jump-threading < %s | FileCheck %s
+
+@f = external global i32
+
+; Make sure the value of @f is reloaded prior to the final comparison.
+; FIXME: This is a miscompile.
+define i32 @test(i64 %idx, i32 %val) {
+; CHECK-LABEL: define i32 @test(
+; CHECK-SAME: i64 [[IDX:%.*]], i32 [[VAL:%.*]]) {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP:%.*]] = icmp slt i64 [[IDX]], 1
+; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[RETURN:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[F:%.*]] = load i32, ptr @f, align 4
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp eq i32 [[F]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label [[COND_END_THREAD:%.*]], label [[COND_END:%.*]]
+; CHECK:       cond.end:
+; CHECK-NEXT:    [[CMP_I:%.*]] = icmp sgt i32 [[VAL]], 0
+; CHECK-NEXT:    [[COND_FR:%.*]] = freeze i1 [[CMP_I]]
+; CHECK-NEXT:    br i1 [[COND_FR]], label [[COND_END_THREAD]], label [[TMP0:%.*]]
+; CHECK:       cond.end.thread:
+; CHECK-NEXT:    [[F_RELOAD_PR:%.*]] = load i32, ptr @f, align 4
+; CHECK-NEXT:    br label [[TMP0]]
+; CHECK:       0:
+; CHECK-NEXT:    [[F_RELOAD:%.*]] = phi i32 [ [[F]], [[COND_END]] ], [ [[F_RELOAD_PR]], [[COND_END_THREAD]] ]
+; CHECK-NEXT:    [[TMP1:%.*]] = phi i32 [ 0, [[COND_END_THREAD]] ], [ [[VAL]], [[COND_END]] ]
+; CHECK-NEXT:    [[F_IDX:%.*]] = getelementptr inbounds i32, ptr @f, i64 [[IDX]]
+; CHECK-NEXT:    store i32 [[TMP1]], ptr [[F_IDX]], align 4
+; CHECK-NEXT:    [[CMP3:%.*]] = icmp slt i32 [[F_RELOAD]], 1
+; CHECK-NEXT:    br i1 [[CMP3]], label [[RETURN2:%.*]], label [[RETURN]]
+; CHECK:       return:
+; CHECK-NEXT:    ret i32 0
+; CHECK:       return2:
+; CHECK-NEXT:    ret i32 1
+;
+entry:
+  %cmp = icmp slt i64 %idx, 1
+  br i1 %cmp, label %for.body, label %return
+
+for.body:
+  %f = load i32, ptr @f, align 4
+  %cmp1 = icmp eq i32 %f, 0
+  br i1 %cmp1, label %cond.end, label %cond.false
+
+cond.false:
+  br label %cond.end
+
+cond.end:
+  %phi = phi i32 [ %val, %cond.false ], [ 1, %for.body ]
+  %cmp.i = icmp sgt i32 %phi, 0
+  %sel = select i1 %cmp.i, i32 0, i32 %phi
+  %f.idx = getelementptr inbounds i32, ptr @f, i64 %idx
+  store i32 %sel, ptr %f.idx, align 4
+  %f.reload = load i32, ptr @f, align 4
+  %cmp3 = icmp slt i32 %f.reload, 1
+  br i1 %cmp3, label %return2, label %return
+
+return:
+  ret i32 0
+
+return2:
+  ret i32 1
+}

From c3e77070489979788788ef479f8932ac460b675b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Timm=20B=C3=A4der?=
Date: Wed, 24 Jan 2024 15:16:56 +0100
Subject: [PATCH 786/843] [clang][AST][NFC] Turn an isa<> + cast<> into dyn_cast<>
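
For context, a minimal sketch of the idiom, using a hypothetical
Shape/Circle hierarchy in place of the CXXConstructExpr and
CXXTemporaryObjectExpr classes this patch touches: isa<> followed by
cast<> inspects the dynamic type twice, while dyn_cast<> performs the
check once and returns nullptr on failure.

#include "llvm/Support/Casting.h"

struct Shape {
  enum Kind { SK_Circle, SK_Other };
  Shape(Kind K) : TheKind(K) {}
  Kind getKind() const { return TheKind; }

private:
  const Kind TheKind;
};

struct Circle : Shape {
  Circle() : Shape(SK_Circle) {}
  double Radius = 1.0;
  // Hook consumed by isa<>, cast<> and dyn_cast<>.
  static bool classof(const Shape *S) { return S->getKind() == SK_Circle; }
};

// Before: the type check runs in isa<>, then again inside cast<>
// (which asserts on mismatch in debug builds).
double diameterBefore(const Shape *S) {
  if (llvm::isa<Circle>(S))
    return 2 * llvm::cast<Circle>(S)->Radius;
  return 0;
}

// After: a single dyn_cast<> both checks and converts, yielding
// nullptr when the dynamic type does not match.
double diameterAfter(const Shape *S) {
  if (const auto *C = llvm::dyn_cast<Circle>(S))
    return 2 * C->Radius;
  return 0;
}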
14:19:32 +0000
Subject: [PATCH 787/843] [AArch64] Add vec3 tests with different load/store
 alignments.

Add extra tests with different load/store alignments for
https://github.com/llvm/llvm-project/pull/78637.

---
 .../AArch64/vec3-loads-ext-trunc-stores.ll | 256 +++++++++++++++++-
 1 file changed, 248 insertions(+), 8 deletions(-)

diff --git a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
index 2cb103bd0b3e9..0ef87c3293055 100644
--- a/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
+++ b/llvm/test/CodeGen/AArch64/vec3-loads-ext-trunc-stores.ll
@@ -87,6 +87,76 @@ define <4 x i32> @load_v3i8_to_4xi32(ptr %src) {
   ret <4 x i32> %e
 }
 
+define <4 x i32> @load_v3i8_to_4xi32_align_2(ptr %src) {
+; CHECK-LABEL: load_v3i8_to_4xi32_align_2:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldrh w8, [x0]
+; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
+; CHECK-NEXT: strh w8, [sp, #12]
+; CHECK-NEXT: ldr s0, [sp, #12]
+; CHECK-NEXT: ldrsb w8, [x0, #2]
+; CHECK-NEXT: ushll.8h v0, v0, #0
+; CHECK-NEXT: mov.h v0[1], v0[1]
+; CHECK-NEXT: mov.h v0[2], w8
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_v3i8_to_4xi32_align_2:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ldrh w8, [x0]
+; BE-NEXT: movi v1.2d, #0x0000ff000000ff
+; BE-NEXT: strh w8, [sp, #12]
+; BE-NEXT: ldr s0, [sp, #12]
+; BE-NEXT: ldrsb w8, [x0, #2]
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: ushll v0.8h, v0.8b, #0
+; BE-NEXT: mov v0.h[1], v0.h[1]
+; BE-NEXT: mov v0.h[2], w8
+; BE-NEXT: ushll v0.4s, v0.4h, #0
+; BE-NEXT: and v0.16b, v0.16b, v1.16b
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+ %l = load <3 x i8>, ptr %src, align 2
+ %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32>
+ %e = zext <4 x i8> %s to <4 x i32>
+ ret <4 x i32> %e
+}
+
+define <4 x i32> @load_v3i8_to_4xi32_align_4(ptr %src) {
+; CHECK-LABEL: load_v3i8_to_4xi32_align_4:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: movi.2d v1, #0x0000ff000000ff
+; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: ushll.4s v0, v0, #0
+; CHECK-NEXT: and.16b v0, v0, v1
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_v3i8_to_4xi32_align_4:
+; BE: // %bb.0:
+; BE-NEXT: ldr s0, [x0]
+; BE-NEXT: movi v1.2d, #0x0000ff000000ff
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; BE-NEXT: ushll v0.4s, v0.4h, #0
+; BE-NEXT: and v0.16b, v0.16b, v1.16b
+; BE-NEXT: rev64 v0.4s, v0.4s
+; BE-NEXT: ext v0.16b, v0.16b, v0.16b, #8
+; BE-NEXT: ret
+ %l = load <3 x i8>, ptr %src, align 4
+ %s = shufflevector <3 x i8> poison, <3 x i8> %l, <4 x i32>
+ %e = zext <4 x i8> %s to <4 x i32>
+ ret <4 x i32> %e
+}
+
 define <4 x i32> @load_v3i8_to_4xi32_const_offset_1(ptr %src) {
 ; CHECK-LABEL: load_v3i8_to_4xi32_const_offset_1:
 ; CHECK: ; %bb.0:
@@ -286,9 +392,9 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
 ; CHECK-NEXT: ldr s0, [x0]
 ; CHECK-NEXT: add x9, x0, #4
 ; CHECK-NEXT: Lloh0:
-; CHECK-NEXT: adrp x8, lCPI7_0@PAGE
+; CHECK-NEXT: adrp x8, lCPI9_0@PAGE
 ; CHECK-NEXT: Lloh1:
-; CHECK-NEXT: ldr d1, [x8, lCPI7_0@PAGEOFF]
+; CHECK-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF]
 ; CHECK-NEXT: ld1.h { v0 }[2], [x9]
 ; CHECK-NEXT: add.4h v0, v0, v1
 ; CHECK-NEXT: xtn.8b v1, v0
@@ -309,8 +415,8 @@ define void @store_trunc_add_from_64bits(ptr %src, ptr %dst) {
 ; BE-NEXT: add x8, x0, #4
 ; BE-NEXT: rev32 v0.4h, v0.4h
 ; BE-NEXT: ld1 { v0.h }[2], [x8]
-; BE-NEXT: adrp x8, .LCPI7_0
-; BE-NEXT: add x8, x8, :lo12:.LCPI7_0
+; BE-NEXT: adrp x8, .LCPI9_0
+; BE-NEXT: add x8, x8, :lo12:.LCPI9_0
 ; BE-NEXT: ld1 { v1.4h }, [x8]
 ; BE-NEXT: add v0.4h, v0.4h, v1.4h
 ; BE-NEXT: xtn v1.8b, v0.8h
@@ -373,6 +479,64 @@ entry:
   ret void
 }
 
+define void @load_ext_to_64bits_default_align(ptr %src, ptr %dst) {
+; CHECK-LABEL: load_ext_to_64bits_default_align:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: add x8, x1, #4
+; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: st1.h { v0 }[2], [x8]
+; CHECK-NEXT: str s0, [x1]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_ext_to_64bits_default_align:
+; BE: // %bb.0: // %entry
+; BE-NEXT: ldr s0, [x0]
+; BE-NEXT: add x8, x1, #4
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; BE-NEXT: bic v0.4h, #255, lsl #8
+; BE-NEXT: rev32 v1.8h, v0.8h
+; BE-NEXT: st1 { v0.h }[2], [x8]
+; BE-NEXT: str s1, [x1]
+; BE-NEXT: ret
+entry:
+ %l = load <3 x i8>, ptr %src
+ %e = zext <3 x i8> %l to <3 x i16>
+ store <3 x i16> %e, ptr %dst, align 1
+ ret void
+}
+
+define void @load_ext_to_64bits_align_4(ptr %src, ptr %dst) {
+; CHECK-LABEL: load_ext_to_64bits_align_4:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: add x8, x1, #4
+; CHECK-NEXT: zip1.8b v0, v0, v0
+; CHECK-NEXT: bic.4h v0, #255, lsl #8
+; CHECK-NEXT: st1.h { v0 }[2], [x8]
+; CHECK-NEXT: str s0, [x1]
+; CHECK-NEXT: ret
+;
+; BE-LABEL: load_ext_to_64bits_align_4:
+; BE: // %bb.0: // %entry
+; BE-NEXT: ldr s0, [x0]
+; BE-NEXT: add x8, x1, #4
+; BE-NEXT: rev32 v0.8b, v0.8b
+; BE-NEXT: zip1 v0.8b, v0.8b, v0.8b
+; BE-NEXT: bic v0.4h, #255, lsl #8
+; BE-NEXT: rev32 v1.8h, v0.8h
+; BE-NEXT: st1 { v0.h }[2], [x8]
+; BE-NEXT: str s1, [x1]
+; BE-NEXT: ret
+entry:
+ %l = load <3 x i8>, ptr %src, align 4
+ %e = zext <3 x i8> %l to <3 x i16>
+ store <3 x i16> %e, ptr %dst, align 1
+ ret void
+}
+
 define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
 ; CHECK-LABEL: load_ext_add_to_64bits:
 ; CHECK: ; %bb.0: ; %entry
@@ -380,9 +544,9 @@ define void
@load_ext_add_to_64bits(ptr %src, ptr %dst) {
 ; CHECK-NEXT: sub sp, sp, #16
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: ldrh w9, [x0]
 ; CHECK-NEXT: Lloh2:
-; CHECK-NEXT: adrp x8, lCPI9_0@PAGE
+; CHECK-NEXT: adrp x8, lCPI13_0@PAGE
 ; CHECK-NEXT: Lloh3:
-; CHECK-NEXT: ldr d1, [x8, lCPI9_0@PAGEOFF]
+; CHECK-NEXT: ldr d1, [x8, lCPI13_0@PAGEOFF]
 ; CHECK-NEXT: add x8, x1, #4
 ; CHECK-NEXT: strh w9, [sp, #12]
 ; CHECK-NEXT: add x9, x0, #2
@@ -408,8 +572,8 @@ define void @load_ext_add_to_64bits(ptr %src, ptr %dst) {
 ; BE-NEXT: rev32 v0.8b, v0.8b
 ; BE-NEXT: ushll v0.8h, v0.8b, #0
 ; BE-NEXT: ld1 { v0.b }[4], [x8]
-; BE-NEXT: adrp x8, .LCPI9_0
-; BE-NEXT: add x8, x8, :lo12:.LCPI9_0
+; BE-NEXT: adrp x8, .LCPI13_0
+; BE-NEXT: add x8, x8, :lo12:.LCPI13_0
 ; BE-NEXT: ld1 { v1.4h }, [x8]
 ; BE-NEXT: add x8, x1, #4
 ; BE-NEXT: bic v0.4h, #255, lsl #8
@@ -465,6 +629,82 @@ define void @shift_trunc_store(ptr %src, ptr %dst) {
   ret void
 }
 
+define void @shift_trunc_store_default_align(ptr %src, ptr %dst) {
+; CHECK-LABEL: shift_trunc_store_default_align:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: shrn.4h v0, v0, #16
+; CHECK-NEXT: xtn.8b v1, v0
+; CHECK-NEXT: umov.h w8, v0[2]
+; CHECK-NEXT: str s1, [sp, #12]
+; CHECK-NEXT: ldrh w9, [sp, #12]
+; CHECK-NEXT: strb w8, [x1, #2]
+; CHECK-NEXT: strh w9, [x1]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; BE-LABEL: shift_trunc_store_default_align:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ld1 { v0.4s }, [x0]
+; BE-NEXT: shrn v0.4h, v0.4s, #16
+; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: rev32 v1.16b, v1.16b
+; BE-NEXT: str s1, [sp, #12]
+; BE-NEXT: ldrh w9, [sp, #12]
+; BE-NEXT: strb w8, [x1, #2]
+; BE-NEXT: strh w9, [x1]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+ %l = load <3 x i32>, ptr %src
+ %s = lshr <3 x i32> %l, <i32 16, i32 16, i32 16>
+ %t = trunc <3 x i32> %s to <3 x i8>
+ store <3 x i8> %t, ptr %dst
+ ret void
+}
+
+define void @shift_trunc_store_align_4(ptr %src, ptr %dst) {
+; CHECK-LABEL: shift_trunc_store_align_4:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: sub sp, sp, #16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: ldr q0, [x0]
+; CHECK-NEXT: shrn.4h v0, v0, #16
+; CHECK-NEXT: xtn.8b v1, v0
+; CHECK-NEXT: umov.h w8, v0[2]
+; CHECK-NEXT: str s1, [sp, #12]
+; CHECK-NEXT: ldrh w9, [sp, #12]
+; CHECK-NEXT: strb w8, [x1, #2]
+; CHECK-NEXT: strh w9, [x1]
+; CHECK-NEXT: add sp, sp, #16
+; CHECK-NEXT: ret
+;
+; BE-LABEL: shift_trunc_store_align_4:
+; BE: // %bb.0:
+; BE-NEXT: sub sp, sp, #16
+; BE-NEXT: .cfi_def_cfa_offset 16
+; BE-NEXT: ld1 { v0.4s }, [x0]
+; BE-NEXT: shrn v0.4h, v0.4s, #16
+; BE-NEXT: xtn v1.8b, v0.8h
+; BE-NEXT: umov w8, v0.h[2]
+; BE-NEXT: rev32 v1.16b, v1.16b
+; BE-NEXT: str s1, [sp, #12]
+; BE-NEXT: ldrh w9, [sp, #12]
+; BE-NEXT: strb w8, [x1, #2]
+; BE-NEXT: strh w9, [x1]
+; BE-NEXT: add sp, sp, #16
+; BE-NEXT: ret
+ %l = load <3 x i32>, ptr %src
+ %s = lshr <3 x i32> %l, <i32 16, i32 16, i32 16>
+ %t = trunc <3 x i32> %s to <3 x i8>
+ store <3 x i8> %t, ptr %dst, align 4
+ ret void
+}
+
 define void @shift_trunc_store_const_offset_1(ptr %src, ptr %dst) {
 ; CHECK-LABEL: shift_trunc_store_const_offset_1:
 ; CHECK: ; %bb.0:
From 90ba33099cbb17e7c159e9ebc5a512037db99d6d Mon Sep 17 00:00:00 2001
From: Nikita Popov
Date: Wed, 24 Jan 2024 15:25:29 +0100
Subject: [PATCH 788/843] [InstCombine] Canonicalize constant GEPs to i8
 source element type (#68882)

This patch canonicalizes getelementptr instructions with constant indices to
use the `i8` source element type. This makes it easier for optimizations to
recognize that two GEPs are identical, because they don't need to see past
many different ways to express the same offset.

This is a first step towards
https://discourse.llvm.org/t/rfc-replacing-getelementptr-with-ptradd/68699.
This is limited to constant GEPs only for now, as they have a clear canonical
form, while we're not yet sure how exactly to deal with variable indices.

The test llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll gives two
representative examples of the kind of optimization improvement we expect
from this change. In the first test SimplifyCFG can now realize that all
switch branches are actually the same. In the second test it can convert the
switch into simple arithmetic. These are representative of common
optimization failures we see in Rust.

Fixes https://github.com/llvm/llvm-project/issues/69841.
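To make the canonical form concrete, here is a minimal illustrative sketch
(hypothetical IR, not taken from this patch's tests): a constant-index GEP
such as

  %gep = getelementptr inbounds i32, ptr %p, i64 4    ; offset in i32 elements (16 bytes)

is now rewritten to the equivalent byte-based form

  %gep = getelementptr inbounds i8, ptr %p, i64 16    ; the same offset, expressed in bytes

so two GEPs that compute the same constant offset become syntactically
identical regardless of the source element type they were originally written
with.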
---
 .../CodeGen/PowerPC/builtins-ppc-pair-mma.c | 10 +-
 clang/test/CodeGen/aarch64-ls64-inline-asm.c | 18 +-
 .../attr-arm-sve-vector-bits-bitcast.c | 48 +-
 clang/test/CodeGen/attr-counted-by.c | 286 +++---
 .../attr-riscv-rvv-vector-bits-bitcast.c | 24 +-
 clang/test/CodeGen/cleanup-destslot-simple.c | 4 +-
 .../test/CodeGen/hexagon-brev-ld-ptr-incdec.c | 6 +-
 clang/test/CodeGen/ms-intrinsics.c | 12 +-
 clang/test/CodeGen/nofpclass.c | 8 +-
 clang/test/CodeGen/union-tbaa1.c | 4 +-
 .../RelativeVTablesABI/dynamic-cast.cpp | 2 +-
 .../RelativeVTablesABI/type-info.cpp | 4 +-
 .../CodeGenCXX/microsoft-abi-dynamic-cast.cpp | 12 +-
 .../test/CodeGenCXX/microsoft-abi-typeid.cpp | 2 +-
 clang/test/CodeGenObjC/arc-foreach.m | 4 +-
 .../test/CodeGenObjCXX/arc-cxx11-init-list.mm | 2 +-
 clang/test/Headers/__clang_hip_math.hip | 24 +-
 clang/test/OpenMP/bug57757.cpp | 12 +-
 flang/test/HLFIR/no-block-merging.fir | 2 +-
 .../InstCombine/InstructionCombining.cpp | 9 +
 llvm/test/Analysis/BasicAA/featuretest.ll | 6 +-
 .../CodeGen/AMDGPU/vector-alloca-bitcast.ll | 12 +-
 .../BPF/preserve-static-offset/load-inline.ll | 4 +-
 .../load-unroll-inline.ll | 4 +-
 .../BPF/preserve-static-offset/load-unroll.ll | 8 +-
 .../store-unroll-inline.ll | 4 +-
 .../Hexagon/autohvx/vector-align-tbaa.ll | 54 +-
 llvm/test/Transforms/Coroutines/coro-async.ll | 2 +-
 .../coro-retcon-alloca-opaque-ptr.ll | 2 +-
 .../Coroutines/coro-retcon-alloca.ll | 2 +-
 .../Coroutines/coro-retcon-once-value.ll | 6 +-
 .../Coroutines/coro-retcon-resume-values.ll | 8 +-
 .../Transforms/Coroutines/coro-swifterror.ll | 2 +-
 .../InstCombine/2007-03-25-BadShiftMask.ll | 2 +-
 .../InstCombine/2009-01-08-AlignAlloca.ll | 2 +-
 .../2009-02-20-InstCombine-SROA.ll | 32 +-
 .../X86/x86-addsub-inseltpoison.ll | 2 +-
 .../Transforms/InstCombine/X86/x86-addsub.ll | 2 +-
 llvm/test/Transforms/InstCombine/add3.ll | 2 +-
 llvm/test/Transforms/InstCombine/array.ll | 4 +-
 llvm/test/Transforms/InstCombine/assume.ll | 2 +-
 llvm/test/Transforms/InstCombine/cast_phi.ll | 4 +-
 .../Transforms/InstCombine/catchswitch-phi.ll | 4 +-
 .../Transforms/InstCombine/compare-alloca.ll | 2 +-
 .../Transforms/InstCombine/extractvalue.ll | 4 +-
 .../Transforms/InstCombine/gep-addrspace.ll | 2 +-
 .../gep-canonicalize-constant-indices.ll | 18 +-
 .../InstCombine/gep-combine-loop-invariant.ll | 6 +-
 .../Transforms/InstCombine/gep-custom-dl.ll | 4 +-
 .../InstCombine/gep-merge-constant-indices.ll | 14 +-
 .../InstCombine/gep-vector-indices.ll | 8 +-
 .../test/Transforms/InstCombine/gep-vector.ll | 2 +-
 llvm/test/Transforms/InstCombine/gepphigep.ll | 2 +-
 .../Transforms/InstCombine/getelementptr.ll | 49 +-
 .../Transforms/InstCombine/icmp-custom-dl.ll | 
4 +- llvm/test/Transforms/InstCombine/icmp-gep.ll | 8 +- .../InstCombine/indexed-gep-compares.ll | 16 +- llvm/test/Transforms/InstCombine/intptr1.ll | 20 +- llvm/test/Transforms/InstCombine/intptr2.ll | 4 +- llvm/test/Transforms/InstCombine/intptr3.ll | 4 +- llvm/test/Transforms/InstCombine/intptr4.ll | 4 +- llvm/test/Transforms/InstCombine/intptr5.ll | 4 +- llvm/test/Transforms/InstCombine/intptr7.ll | 4 +- .../InstCombine/load-store-forward.ll | 2 +- llvm/test/Transforms/InstCombine/load.ll | 2 +- .../InstCombine/loadstore-metadata.ll | 2 +- llvm/test/Transforms/InstCombine/memchr-5.ll | 48 +- llvm/test/Transforms/InstCombine/memchr-9.ll | 46 +- llvm/test/Transforms/InstCombine/memcmp-3.ll | 56 +- llvm/test/Transforms/InstCombine/memcmp-4.ll | 8 +- llvm/test/Transforms/InstCombine/memcmp-5.ll | 26 +- llvm/test/Transforms/InstCombine/memcmp-6.ll | 12 +- llvm/test/Transforms/InstCombine/memcmp-7.ll | 22 +- .../Transforms/InstCombine/memcpy_alloca.ll | 2 +- llvm/test/Transforms/InstCombine/memrchr-5.ll | 64 +- llvm/test/Transforms/InstCombine/memset2.ll | 2 +- .../multi-size-address-space-pointer.ll | 14 +- .../InstCombine/non-integral-pointers.ll | 4 +- .../test/Transforms/InstCombine/opaque-ptr.ll | 42 +- .../phi-equal-incoming-pointers.ll | 2 +- .../Transforms/InstCombine/phi-timeout.ll | 2 +- llvm/test/Transforms/InstCombine/phi.ll | 4 +- llvm/test/Transforms/InstCombine/pr39908.ll | 6 +- llvm/test/Transforms/InstCombine/pr44242.ll | 2 +- llvm/test/Transforms/InstCombine/pr58901.ll | 7 +- .../InstCombine/ptr-replace-alloca.ll | 10 +- .../Transforms/InstCombine/select-cmp-br.ll | 16 +- .../test/Transforms/InstCombine/select-gep.ll | 16 +- llvm/test/Transforms/InstCombine/shift.ll | 8 +- .../sink_sideeffecting_instruction.ll | 2 +- llvm/test/Transforms/InstCombine/sprintf-2.ll | 16 +- .../InstCombine/statepoint-cleanup.ll | 8 +- llvm/test/Transforms/InstCombine/str-int-3.ll | 46 +- llvm/test/Transforms/InstCombine/str-int-4.ll | 68 +- llvm/test/Transforms/InstCombine/str-int-5.ll | 54 +- llvm/test/Transforms/InstCombine/str-int.ll | 2 +- .../Transforms/InstCombine/strcall-bad-sig.ll | 8 +- .../Transforms/InstCombine/strcall-no-nul.ll | 26 +- llvm/test/Transforms/InstCombine/strlen-7.ll | 40 +- llvm/test/Transforms/InstCombine/strlen-9.ll | 12 +- llvm/test/Transforms/InstCombine/strncmp-4.ll | 28 +- llvm/test/Transforms/InstCombine/strncmp-5.ll | 42 +- llvm/test/Transforms/InstCombine/strncmp-6.ll | 12 +- llvm/test/Transforms/InstCombine/sub.ll | 6 +- .../test/Transforms/InstCombine/unpack-fca.ll | 54 +- .../vec_demanded_elts-inseltpoison.ll | 4 +- .../InstCombine/vec_demanded_elts.ll | 4 +- .../vec_gep_scalar_arg-inseltpoison.ll | 2 +- .../InstCombine/vec_gep_scalar_arg.ll | 2 +- .../test/Transforms/InstCombine/vscale_gep.ll | 2 +- llvm/test/Transforms/InstCombine/wcslen-5.ll | 2 +- .../Transforms/LoopUnroll/ARM/upperbound.ll | 4 +- llvm/test/Transforms/LoopUnroll/peel-loop.ll | 16 +- .../AArch64/deterministic-type-shrinkage.ll | 2 +- .../LoopVectorize/AArch64/intrinsiccost.ll | 8 +- .../AArch64/sve-cond-inv-loads.ll | 4 +- .../AArch64/sve-interleaved-accesses.ll | 626 ++++++------- .../LoopVectorize/AArch64/sve-widen-phi.ll | 166 ++-- .../AArch64/vector-reverse-mask4.ll | 8 +- .../LoopVectorize/AMDGPU/packed-math.ll | 44 +- .../Transforms/LoopVectorize/ARM/mve-qabs.ll | 18 +- .../LoopVectorize/ARM/mve-reductions.ll | 2 +- .../LoopVectorize/ARM/mve-selectandorcost.ll | 6 +- .../LoopVectorize/ARM/pointer_iv.ll | 60 +- .../LoopVectorize/X86/float-induction-x86.ll | 18 +- 
.../LoopVectorize/X86/interleaving.ll | 60 +- .../LoopVectorize/X86/intrinsiccost.ll | 27 +- .../X86/invariant-store-vectorization.ll | 6 +- .../LoopVectorize/X86/metadata-enable.ll | 824 +++++++++--------- .../Transforms/LoopVectorize/X86/pr23997.ll | 28 +- .../LoopVectorize/X86/small-size.ll | 8 +- ...86-interleaved-store-accesses-with-gaps.ll | 4 +- .../LoopVectorize/consecutive-ptr-uniforms.ll | 4 +- .../LoopVectorize/extract-last-veclane.ll | 2 +- .../LoopVectorize/float-induction.ll | 16 +- .../Transforms/LoopVectorize/induction.ll | 52 +- .../LoopVectorize/interleaved-accesses.ll | 20 +- .../LoopVectorize/reduction-inloop-uf4.ll | 6 +- .../Transforms/LoopVectorize/runtime-check.ll | 2 +- .../scalar_after_vectorization.ll | 2 +- .../Transforms/LoopVectorize/vector-geps.ll | 4 +- .../multiply-fused-dominance.ll | 112 +-- .../multiply-fused-loops.ll | 24 +- .../multiply-fused-multiple-blocks.ll | 72 +- .../LowerMatrixIntrinsics/multiply-fused.ll | 134 +-- .../LowerMatrixIntrinsics/multiply-minimal.ll | 10 +- ...ting-sinking-required-for-vectorization.ll | 12 +- ...ple-unreachable-exits-for-vectorization.ll | 22 +- .../PhaseOrdering/AArch64/quant_4x4.ll | 96 +- .../AArch64/sinking-vs-if-conversion.ll | 8 +- .../PhaseOrdering/ARM/arm_mult_q15.ll | 12 +- .../PhaseOrdering/X86/excessive-unrolling.ll | 6 +- .../X86/hoist-load-of-baseptr.ll | 10 +- .../PhaseOrdering/X86/pixel-splat.ll | 2 +- .../X86/pr48844-br-to-switch-vectorization.ll | 2 +- .../Transforms/PhaseOrdering/X86/pr50555.ll | 4 +- .../PhaseOrdering/X86/speculation-vs-tbaa.ll | 2 +- .../PhaseOrdering/X86/spurious-peeling.ll | 12 +- .../test/Transforms/PhaseOrdering/X86/vdiv.ll | 12 +- .../Transforms/PhaseOrdering/X86/vec-shift.ll | 8 +- llvm/test/Transforms/PhaseOrdering/basic.ll | 8 +- .../PhaseOrdering/loop-access-checks.ll | 6 +- llvm/test/Transforms/PhaseOrdering/pr39282.ll | 4 +- .../PhaseOrdering/simplifycfg-options.ll | 2 +- .../PhaseOrdering/switch_with_geps.ll | 56 +- .../SLPVectorizer/AArch64/gather-cost.ll | 12 +- .../SLPVectorizer/AArch64/gather-reduce.ll | 8 +- .../SLPVectorizer/AArch64/loadorder.ll | 32 +- .../WebAssembly/no-vectorize-rotate.ll | 2 +- .../SLPVectorizer/X86/operandorder.ll | 4 +- llvm/test/Transforms/SLPVectorizer/X86/opt.ll | 14 +- .../Transforms/SLPVectorizer/X86/pr46983.ll | 32 +- .../SLPVectorizer/X86/pr47629-inseltpoison.ll | 272 +++--- .../Transforms/SLPVectorizer/X86/pr47629.ll | 272 +++--- .../SampleProfile/pseudo-probe-instcombine.ll | 10 +- .../Transforms/Util/strip-gc-relocates.ll | 4 +- 176 files changed, 2520 insertions(+), 2532 deletions(-) diff --git a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c index 3922513e22469..5422d993ff157 100644 --- a/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c +++ b/clang/test/CodeGen/PowerPC/builtins-ppc-pair-mma.c @@ -25,13 +25,13 @@ void test1(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi // CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP1]], 0 // CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[RESP:%.*]], align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP1]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RESP]], i64 1 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[RESP]], i64 16 // CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 16 // CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } 
[[TMP1]], 2 -// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RESP]], i64 2 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[RESP]], i64 32 // CHECK-NEXT: store <16 x i8> [[TMP5]], ptr [[TMP6]], align 16 // CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } [[TMP1]], 3 -// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RESP]], i64 3 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[RESP]], i64 48 // CHECK-NEXT: store <16 x i8> [[TMP7]], ptr [[TMP8]], align 16 // CHECK-NEXT: ret void // @@ -60,7 +60,7 @@ void test3(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, unsi // CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP1]], 0 // CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[RESP:%.*]], align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP1]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RESP]], i64 1 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[RESP]], i64 16 // CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 16 // CHECK-NEXT: ret void // @@ -1072,7 +1072,7 @@ void test76(unsigned char *vqp, unsigned char *vpp, vector unsigned char vc, uns // CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP1]], 0 // CHECK-NEXT: store <16 x i8> [[TMP2]], ptr [[RESP:%.*]], align 16 // CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[TMP1]], 1 -// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <16 x i8>, ptr [[RESP]], i64 1 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[RESP]], i64 16 // CHECK-NEXT: store <16 x i8> [[TMP3]], ptr [[TMP4]], align 16 // CHECK-NEXT: ret void // diff --git a/clang/test/CodeGen/aarch64-ls64-inline-asm.c b/clang/test/CodeGen/aarch64-ls64-inline-asm.c index ac2dbe1fa1b31..744d6919b05ee 100644 --- a/clang/test/CodeGen/aarch64-ls64-inline-asm.c +++ b/clang/test/CodeGen/aarch64-ls64-inline-asm.c @@ -16,8 +16,8 @@ void load(struct foo *output, void *addr) // CHECK-LABEL: @store( // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP1:%.*]] = load i512, ptr [[INPUT:%.*]], align 8 -// CHECK-NEXT: tail call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP1]], ptr [[ADDR:%.*]]) #[[ATTR1]], !srcloc !3 +// CHECK-NEXT: [[TMP0:%.*]] = load i512, ptr [[INPUT:%.*]], align 8 +// CHECK-NEXT: tail call void asm sideeffect "st64b $0,[$1]", "r,r,~{memory}"(i512 [[TMP0]], ptr [[ADDR:%.*]]) #[[ATTR1]], !srcloc !3 // CHECK-NEXT: ret void // void store(const struct foo *input, void *addr) @@ -29,25 +29,25 @@ void store(const struct foo *input, void *addr) // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[IN:%.*]], align 4, !tbaa [[TBAA4:![0-9]+]] // CHECK-NEXT: [[CONV:%.*]] = sext i32 [[TMP0]] to i64 -// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 1 +// CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 4 // CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[TBAA4]] // CHECK-NEXT: [[CONV2:%.*]] = sext i32 [[TMP1]] to i64 -// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 4 +// CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 16 // CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX4]], align 4, !tbaa [[TBAA4]] // CHECK-NEXT: [[CONV5:%.*]] = sext i32 [[TMP2]] to i64 -// CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 16 +// CHECK-NEXT: 
[[ARRAYIDX7:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 64 // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4, !tbaa [[TBAA4]] // CHECK-NEXT: [[CONV8:%.*]] = sext i32 [[TMP3]] to i64 -// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 25 +// CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 100 // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX10]], align 4, !tbaa [[TBAA4]] // CHECK-NEXT: [[CONV11:%.*]] = sext i32 [[TMP4]] to i64 -// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 36 +// CHECK-NEXT: [[ARRAYIDX13:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 144 // CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX13]], align 4, !tbaa [[TBAA4]] // CHECK-NEXT: [[CONV14:%.*]] = sext i32 [[TMP5]] to i64 -// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 49 +// CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 196 // CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX16]], align 4, !tbaa [[TBAA4]] // CHECK-NEXT: [[CONV17:%.*]] = sext i32 [[TMP6]] to i64 -// CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i32, ptr [[IN]], i64 64 +// CHECK-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds i8, ptr [[IN]], i64 256 // CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[ARRAYIDX19]], align 4, !tbaa [[TBAA4]] // CHECK-NEXT: [[CONV20:%.*]] = sext i32 [[TMP7]] to i64 // CHECK-NEXT: [[S_SROA_10_0_INSERT_EXT:%.*]] = zext i64 [[CONV20]] to i512 diff --git a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c index 22e2e0c2ff102..323afb6459124 100644 --- a/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c +++ b/clang/test/CodeGen/attr-arm-sve-vector-bits-bitcast.c @@ -30,21 +30,21 @@ DEFINE_STRUCT(bool) // CHECK-128-LABEL: @read_int64( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16 // CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 16, !tbaa [[TBAA2:![0-9]+]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i64.v2i64( undef, <2 x i64> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // // CHECK-256-LABEL: @read_int64( // CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32 // CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[Y]], align 16, !tbaa [[TBAA2:![0-9]+]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // // CHECK-512-LABEL: @read_int64( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 64 // CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i64>, ptr [[Y]], align 16, !tbaa [[TBAA2:![0-9]+]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i64.v8i64( undef, <8 x i64> [[TMP0]], i64 0) // CHECK-512-NEXT: ret [[CAST_SCALABLE]] @@ -56,21 +56,21 @@ svint64_t read_int64(struct struct_int64 *s) { // CHECK-128-LABEL: @write_int64( // CHECK-128-NEXT: entry: // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail 
call <2 x i64> @llvm.vector.extract.v2i64.nxv2i64( [[X:%.*]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16 // CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: @write_int64( // CHECK-256-NEXT: entry: // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv2i64( [[X:%.*]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32 // CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-256-NEXT: ret void // // CHECK-512-LABEL: @write_int64( // CHECK-512-NEXT: entry: // CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i64> @llvm.vector.extract.v8i64.nxv2i64( [[X:%.*]], i64 0) -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 64 // CHECK-512-NEXT: store <8 x i64> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-512-NEXT: ret void // @@ -84,21 +84,21 @@ void write_int64(struct struct_int64 *s, svint64_t x) { // CHECK-128-LABEL: @read_float64( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16 // CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2f64.v2f64( undef, <2 x double> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // // CHECK-256-LABEL: @read_float64( // CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32 // CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2f64.v4f64( undef, <4 x double> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // // CHECK-512-LABEL: @read_float64( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 64 // CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x double>, ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2f64.v8f64( undef, <8 x double> [[TMP0]], i64 0) // CHECK-512-NEXT: ret [[CAST_SCALABLE]] @@ -110,21 +110,21 @@ svfloat64_t read_float64(struct struct_float64 *s) { // CHECK-128-LABEL: @write_float64( // CHECK-128-NEXT: entry: // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x double> @llvm.vector.extract.v2f64.nxv2f64( [[X:%.*]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16 // CHECK-128-NEXT: store <2 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-128-NEXT: ret void // // 
CHECK-256-LABEL: @write_float64( // CHECK-256-NEXT: entry: // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x double> @llvm.vector.extract.v4f64.nxv2f64( [[X:%.*]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32 // CHECK-256-NEXT: store <4 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-256-NEXT: ret void // // CHECK-512-LABEL: @write_float64( // CHECK-512-NEXT: entry: // CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x double> @llvm.vector.extract.v8f64.nxv2f64( [[X:%.*]], i64 0) -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 64 // CHECK-512-NEXT: store <8 x double> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-512-NEXT: ret void // @@ -138,21 +138,21 @@ void write_float64(struct struct_float64 *s, svfloat64_t x) { // CHECK-128-LABEL: @read_bfloat16( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16 // CHECK-128-NEXT: [[TMP0:%.*]] = load <8 x bfloat>, ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v8bf16( undef, <8 x bfloat> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // // CHECK-256-LABEL: @read_bfloat16( // CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32 // CHECK-256-NEXT: [[TMP0:%.*]] = load <16 x bfloat>, ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v16bf16( undef, <16 x bfloat> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] // // CHECK-512-LABEL: @read_bfloat16( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 64 // CHECK-512-NEXT: [[TMP0:%.*]] = load <32 x bfloat>, ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv8bf16.v32bf16( undef, <32 x bfloat> [[TMP0]], i64 0) // CHECK-512-NEXT: ret [[CAST_SCALABLE]] @@ -164,21 +164,21 @@ svbfloat16_t read_bfloat16(struct struct_bfloat16 *s) { // CHECK-128-LABEL: @write_bfloat16( // CHECK-128-NEXT: entry: // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x bfloat> @llvm.vector.extract.v8bf16.nxv8bf16( [[X:%.*]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16 // CHECK-128-NEXT: store <8 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: @write_bfloat16( // CHECK-256-NEXT: entry: // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <16 x bfloat> @llvm.vector.extract.v16bf16.nxv8bf16( [[X:%.*]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = 
getelementptr inbounds i8, ptr [[S:%.*]], i64 32 // CHECK-256-NEXT: store <16 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-256-NEXT: ret void // // CHECK-512-LABEL: @write_bfloat16( // CHECK-512-NEXT: entry: // CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <32 x bfloat> @llvm.vector.extract.v32bf16.nxv8bf16( [[X:%.*]], i64 0) -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BFLOAT16:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 64 // CHECK-512-NEXT: store <32 x bfloat> [[CAST_FIXED]], ptr [[Y]], align 16, !tbaa [[TBAA2]] // CHECK-512-NEXT: ret void // @@ -192,7 +192,7 @@ void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) { // CHECK-128-LABEL: @read_bool( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 2 // CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i8>, ptr [[Y]], align 2, !tbaa [[TBAA2]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v2i8( undef, <2 x i8> [[TMP0]], i64 0) // CHECK-128-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to @@ -200,7 +200,7 @@ void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) { // // CHECK-256-LABEL: @read_bool( // CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 4 // CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[Y]], align 2, !tbaa [[TBAA2]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v4i8( undef, <4 x i8> [[TMP0]], i64 0) // CHECK-256-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to @@ -208,7 +208,7 @@ void write_bfloat16(struct struct_bfloat16 *s, svbfloat16_t x) { // // CHECK-512-LABEL: @read_bool( // CHECK-512-NEXT: entry: -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 8 // CHECK-512-NEXT: [[TMP0:%.*]] = load <8 x i8>, ptr [[Y]], align 2, !tbaa [[TBAA2]] // CHECK-512-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv2i8.v8i8( undef, <8 x i8> [[TMP0]], i64 0) // CHECK-512-NEXT: [[TMP1:%.*]] = bitcast [[CAST_SCALABLE]] to @@ -222,7 +222,7 @@ svbool_t read_bool(struct struct_bool *s) { // CHECK-128-NEXT: entry: // CHECK-128-NEXT: [[TMP0:%.*]] = bitcast [[X:%.*]] to // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i8> @llvm.vector.extract.v2i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 2 // CHECK-128-NEXT: store <2 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[TBAA2]] // CHECK-128-NEXT: ret void // @@ -230,7 +230,7 @@ svbool_t read_bool(struct struct_bool *s) { // CHECK-256-NEXT: entry: // CHECK-256-NEXT: [[TMP0:%.*]] = bitcast [[X:%.*]] to // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i8> @llvm.vector.extract.v4i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 4 // CHECK-256-NEXT: store <4 x i8> [[CAST_FIXED]], ptr [[Y]], 
align 2, !tbaa [[TBAA2]] // CHECK-256-NEXT: ret void // @@ -238,7 +238,7 @@ svbool_t read_bool(struct struct_bool *s) { // CHECK-512-NEXT: entry: // CHECK-512-NEXT: [[TMP0:%.*]] = bitcast [[X:%.*]] to // CHECK-512-NEXT: [[CAST_FIXED:%.*]] = tail call <8 x i8> @llvm.vector.extract.v8i8.nxv2i8( [[TMP0]], i64 0) -// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_BOOL:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-512-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 8 // CHECK-512-NEXT: store <8 x i8> [[CAST_FIXED]], ptr [[Y]], align 2, !tbaa [[TBAA2]] // CHECK-512-NEXT: ret void // diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index 74d5457e398b9..e5685e39173b0 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -60,7 +60,7 @@ struct anon_struct { // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] @@ -69,31 +69,35 @@ struct anon_struct { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB2:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10:[0-9]+]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test1( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef writeonly [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: +// 
SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test1( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr nocapture noundef writeonly [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[VAL:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 [[VAL]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -104,7 +108,7 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], [[INDEX]], !nosanitize [[META2]] @@ -113,7 +117,8 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP2]] @@ -123,26 +128,29 @@ void test1(struct annotated *p, int index, int val) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test2( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = 
getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP0]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test2( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -153,7 +161,7 @@ void test2(struct annotated *p, size_t index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test2_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 @@ -164,7 +172,7 @@ void test2(struct annotated *p, size_t index) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test2_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = 
getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 @@ -189,7 +197,7 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i64 [[TMP0]], [[INDEX]], !nosanitize [[META2]] @@ -198,7 +206,8 @@ size_t test2_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP2]], 2 // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP3]], i64 4) @@ -212,7 +221,7 @@ size_t test2_bdos(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 @@ -221,21 +230,24 @@ size_t test2_bdos(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr 
[[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[INDEX]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -248,7 +260,7 @@ void test3(struct annotated *p, size_t index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 @@ -261,7 +273,7 @@ void test3(struct annotated *p, size_t index) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 @@ -288,8 +300,9 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// 
SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] @@ -303,7 +316,7 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 244 // SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = and i32 [[TMP4]], 252 // SANITIZE-WITH-ATTR-NEXT: [[CONV1:%.*]] = select i1 [[TMP2]], i32 [[TMP5]], i32 0 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV1]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD7:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[ADD:%.*]] = add nsw i32 [[INDEX]], 1 @@ -320,7 +333,7 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP10:%.*]] = add i32 [[TMP9]], 240 // SANITIZE-WITH-ATTR-NEXT: [[TMP11:%.*]] = and i32 [[TMP10]], 252 // SANITIZE-WITH-ATTR-NEXT: [[CONV9:%.*]] = select i1 [[TMP8]], i32 [[TMP11]], i32 0 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM13]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM13]] // SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV9]], ptr [[ARRAYIDX18]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD23:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[ADD29:%.*]] = add nsw i32 [[INDEX]], 2 @@ -332,7 +345,7 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM30]]) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont37: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM30]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM30]] // SANITIZE-WITH-ATTR-NEXT: [[TMP14:%.*]] = icmp sgt i32 [[FAM_IDX]], -1 // SANITIZE-WITH-ATTR-NEXT: [[TMP15:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD23]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP16:%.*]] = sext i32 [[FAM_IDX]] to i64 @@ -349,15 +362,16 @@ size_t test3_bdos(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // 
NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 244 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = and i32 [[TMP1]], 252 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV1:%.*]] = select i1 [[TMP2]], i32 [[TMP3]], i32 0 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV1]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD4:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD4]], 2 @@ -367,7 +381,7 @@ size_t test3_bdos(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV6:%.*]] = select i1 [[TMP6]], i32 [[TMP7]], i32 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[ADD:%.*]] = add nsw i32 [[INDEX]], 1 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM8:%.*]] = sext i32 [[ADD]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM8]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX9:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM8]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV6]], ptr [[ARRAYIDX9]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD12:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD12]] to i64 @@ -382,39 +396,41 @@ size_t test3_bdos(struct annotated *p) { // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV14:%.*]] = select i1 [[TMP13]], i32 [[TMP15]], i32 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[ADD16:%.*]] = add nsw i32 [[INDEX]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM17:%.*]] = sext i32 [[ADD16]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM17]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM17]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV14]], ptr [[ARRAYIDX18]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test4( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX5]], align 4, !tbaa [[TBAA2]] // 
SANITIZE-WITHOUT-ATTR-NEXT: [[ADD:%.*]] = add nsw i32 [[INDEX]], 1 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM17:%.*]] = sext i32 [[ADD]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM17]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM17]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX18]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: [[ADD31:%.*]] = add nsw i32 [[INDEX]], 2 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM32:%.*]] = sext i32 [[ADD31]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM32]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM32]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX33]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test4( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]], i32 noundef [[FAM_IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX3]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ADD:%.*]] = add nsw i32 [[INDEX]], 1 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM9:%.*]] = sext i32 [[ADD]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM9]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM9]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX10]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ADD17:%.*]] = add nsw i32 [[INDEX]], 2 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM18:%.*]] = sext i32 [[ADD17]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM18]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM18]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 255, ptr [[ARRAYIDX19]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -428,7 +444,7 @@ void test4(struct annotated *p, int index, int fam_idx) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test4_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr 
[[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 @@ -443,7 +459,7 @@ void test4(struct annotated *p, int index, int fam_idx) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test4_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 @@ -472,7 +488,7 @@ size_t test4_bdos(struct annotated *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test5( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOT_COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] @@ -481,47 +497,47 @@ size_t test4_bdos(struct annotated *p, int index) { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT]], ptr [[P]], i64 1 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP1]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD_TR:%.*]] = trunc i64 [[DOT_COUNTED_BY_LOAD]] to i32 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD_TR]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = add i32 [[TMP1]], 16 -// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP2]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD_TR]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 16 +// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP3]] // SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test5( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { // 
NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD_TR:%.*]] = trunc i64 [[DOT_COUNTED_BY_LOAD]] to i32 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD_TR]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP1]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT]], ptr [[P]], i64 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP2]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test5( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 1 +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test5( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 1 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -532,7 +548,7 @@ void test5(struct anon_struct *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: 
[[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl nuw i64 [[DOT_COUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 16 @@ -543,7 +559,7 @@ void test5(struct anon_struct *p, int index) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl nuw i64 [[DOT_COUNTED_BY_LOAD]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 16 @@ -568,7 +584,7 @@ size_t test5_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test6( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOT_COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] @@ -577,45 +593,45 @@ size_t test5_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB10:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT]], ptr [[P]], i64 1 -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP1]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD_TR:%.*]] = trunc i64 [[DOT_COUNTED_BY_LOAD]] to i32 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD_TR]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP1]] +// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD_TR]], 2 +// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP2]] // SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test6( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture 
noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD_TR:%.*]] = trunc i64 [[DOT_COUNTED_BY_LOAD]] to i32 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD_TR]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP0]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT]], ptr [[P]], i64 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP1]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test6( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 1 +// SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test6( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 1 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 16 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -626,7 +642,7 @@ void test6(struct anon_struct *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test6_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// 
SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl nuw i64 [[DOT_COUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 @@ -636,7 +652,7 @@ void test6(struct anon_struct *p, int index) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test6_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANON_STRUCT:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl nuw i64 [[DOT_COUNTED_BY_LOAD]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 @@ -660,7 +676,7 @@ size_t test6_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test7( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_UNION_OF_FAMS:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i8, ptr [[TMP0]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i8 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] @@ -733,7 +749,7 @@ size_t test7_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test8( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_UNION_OF_FAMS:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i8, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] @@ -751,7 +767,7 @@ size_t test7_bdos(struct union_of_fams *p) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test8( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_UNION_OF_FAMS:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i8, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 9 // 
NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 @@ -784,7 +800,7 @@ void test8(struct union_of_fams *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test8_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_UNION_OF_FAMS:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i8, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[DOT_COUNTED_BY_LOAD]] to i64 // SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]] @@ -792,7 +808,7 @@ void test8(struct union_of_fams *p, int index) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test8_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_UNION_OF_FAMS:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i8, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i8 [[DOT_COUNTED_BY_LOAD]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP0]] @@ -814,7 +830,7 @@ size_t test8_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test9( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_UNION_OF_FAMS:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[TMP0]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] @@ -887,7 +903,7 @@ size_t test9_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test10( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_UNION_OF_FAMS:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] @@ -907,7 +923,7 @@ size_t test9_bdos(struct union_of_fams *p) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test10( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_UNION_OF_FAMS:%.*]], ptr [[P]], i64 0, i32 1 +// 
NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0) // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = trunc i32 [[NARROW]] to i8 @@ -942,7 +958,7 @@ void test10(struct union_of_fams *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test10_bdos( // SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_UNION_OF_FAMS:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0) // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext nneg i32 [[NARROW]] to i64 @@ -951,7 +967,7 @@ void test10(struct union_of_fams *p, int index) { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test10_bdos( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_UNION_OF_FAMS:%.*]], ptr [[P]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[NARROW:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 0) // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext nneg i32 [[NARROW]] to i64 @@ -975,7 +991,7 @@ size_t test10_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] @@ -984,31 +1000,35 @@ size_t test10_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB16:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: store i32 4, ptr 
[[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test11( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef writeonly [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 4, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test11( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 4, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test11( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr nocapture noundef writeonly [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[P]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 4, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -1162,7 +1182,7 @@ struct test13_bar { // SANITIZE-WITH-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[TBAA11:![0-9]+]] -// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_TEST13_BAR:%.*]], ptr [[TMP0]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], [[INDEX]], !nosanitize [[META2]] @@ -1171,7 +1191,8 @@ struct test13_bar { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB23:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // 
SANITIZE-WITH-ATTR: cont5: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST13_BAR]], ptr [[TMP0]], i64 0, i32 2, i64 [[INDEX]] +// SANITIZE-WITH-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x ptr], ptr [[REVMAP]], i64 0, i64 [[INDEX]] // SANITIZE-WITH-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA14:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: ret i32 0 // @@ -1179,7 +1200,8 @@ struct test13_bar { // NO-SANITIZE-WITH-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR8:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[TBAA8:![0-9]+]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST13_BAR:%.*]], ptr [[TMP0]], i64 0, i32 2, i64 [[INDEX]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x ptr], ptr [[REVMAP]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITH-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 0 // @@ -1187,7 +1209,7 @@ struct test13_bar { // SANITIZE-WITHOUT-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[TBAA11:![0-9]+]] -// SANITIZE-WITHOUT-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_TEST13_BAR:%.*]], ptr [[TMP0]], i64 0, i32 1 +// SANITIZE-WITHOUT-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 // SANITIZE-WITHOUT-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[TMP1]], [[INDEX]], !nosanitize [[META9]] @@ -1196,7 +1218,8 @@ struct test13_bar { // SANITIZE-WITHOUT-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[INDEX]]) #[[ATTR8]], !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR-NEXT: unreachable, !nosanitize [[META9]] // SANITIZE-WITHOUT-ATTR: cont5: -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST13_BAR]], ptr [[TMP0]], i64 0, i32 2, i64 [[INDEX]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x ptr], ptr [[REVMAP]], i64 0, i64 [[INDEX]] // SANITIZE-WITHOUT-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA14:![0-9]+]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 0 // @@ -1204,7 +1227,8 @@ struct test13_bar { // NO-SANITIZE-WITHOUT-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[TBAA8:![0-9]+]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST13_BAR:%.*]], ptr [[TMP0]], i64 0, i32 2, i64 [[INDEX]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x 
ptr], ptr [[REVMAP]], i64 0, i64 [[INDEX]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store ptr null, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11:![0-9]+]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 0 // @@ -1236,10 +1260,11 @@ struct test14_foo { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca [[STRUCT_TEST14_FOO:%.*]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: store i32 1, ptr [[DOTCOMPOUNDLITERAL]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TEST14_FOO]], ptr [[DOTCOMPOUNDLITERAL]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[DOTCOMPOUNDLITERAL]], i64 4 // NO-SANITIZE-WITH-ATTR-NEXT: store i32 2, ptr [[Y]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[BLAH:%.*]] = getelementptr inbounds i8, ptr [[DOTCOMPOUNDLITERAL]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST14_FOO]], ptr [[DOTCOMPOUNDLITERAL]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[BLAH]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP0]] // @@ -1261,10 +1286,11 @@ struct test14_foo { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca [[STRUCT_TEST14_FOO:%.*]], align 4 // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 1, ptr [[DOTCOMPOUNDLITERAL]], align 4, !tbaa [[TBAA2]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_TEST14_FOO]], ptr [[DOTCOMPOUNDLITERAL]], i64 0, i32 1 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[DOTCOMPOUNDLITERAL]], i64 4 // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 2, ptr [[Y]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BLAH:%.*]] = getelementptr inbounds i8, ptr [[DOTCOMPOUNDLITERAL]], i64 8 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST14_FOO]], ptr [[DOTCOMPOUNDLITERAL]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[BLAH]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]] // @@ -1293,8 +1319,9 @@ int test14(int idx) { // NO-SANITIZE-WITH-ATTR-NEXT: store i32 1, ptr [[FOO]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4 // NO-SANITIZE-WITH-ATTR-NEXT: store i32 2, ptr [[TMP0]], align 4 +// NO-SANITIZE-WITH-ATTR-NEXT: [[BLAH:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANON_8]], ptr [[FOO]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[BLAH]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR12]] // 
NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP1]] @@ -1320,8 +1347,9 @@ int test14(int idx) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 1, ptr [[FOO]], align 4 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4 // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 2, ptr [[TMP0]], align 4 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[BLAH:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 8 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_ANON_8]], ptr [[FOO]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[BLAH]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: call void @llvm.lifetime.end.p0(i64 8, ptr nonnull [[FOO]]) #[[ATTR9]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP1]] @@ -1468,7 +1496,7 @@ struct tests_foo { // SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test24( // SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef [[VAR:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TESTS_FOO:%.*]], ptr [[VAR]], i64 10 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[VAR]], i64 40 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] @@ -1476,28 +1504,28 @@ struct tests_foo { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB26:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont4: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_TESTS_FOO]], ptr [[VAR]], i64 21 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[VAR]], i64 84 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP1]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test24( // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr nocapture noundef readonly [[VAR:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_TESTS_FOO:%.*]], ptr [[VAR]], i64 21 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[VAR]], i64 84 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP0]] // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test24( // SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef [[VAR:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_TESTS_FOO:%.*]], ptr [[VAR]], i64 21 +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[VAR]], i64 84 // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa 
[[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]] // // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test24( // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr nocapture noundef readonly [[VAR:%.*]]) local_unnamed_addr #[[ATTR6:[0-9]+]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_TESTS_FOO:%.*]], ptr [[VAR]], i64 21 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[VAR]], i64 84 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP0]] // @@ -1517,7 +1545,7 @@ int test24(int c, struct tests_foo *var) { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB27:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont5: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TESTS_FOO:%.*]], ptr [[TMP0]], i64 11 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 44 // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP2]] // @@ -1525,7 +1553,7 @@ int test24(int c, struct tests_foo *var) { // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr nocapture noundef readonly [[VAR:%.*]]) local_unnamed_addr #[[ATTR9:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[TBAA11]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TESTS_FOO:%.*]], ptr [[TMP0]], i64 11 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 44 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP1]] // @@ -1533,7 +1561,7 @@ int test24(int c, struct tests_foo *var) { // SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef [[VAR:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[TBAA14]] -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TESTS_FOO:%.*]], ptr [[TMP0]], i64 11 +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 44 // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP1]] // @@ -1541,7 +1569,7 @@ int test24(int c, struct tests_foo *var) { // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr nocapture noundef readonly [[VAR:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[TBAA11]] -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TESTS_FOO:%.*]], ptr [[TMP0]], i64 11 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 44 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP1]] // @@ -1559,7 +1587,7 @@ struct test26_foo { // SANITIZE-WITH-ATTR-LABEL: define 
dso_local i32 @test26( // SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef [[FOO:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[S:%.*]] = getelementptr inbounds [[STRUCT_TEST26_FOO:%.*]], ptr [[FOO]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[S:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[C]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[S]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] @@ -1569,7 +1597,7 @@ struct test26_foo { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont5: -// SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds [[STRUCT_TEST26_FOO]], ptr [[FOO]], i64 1 +// SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARR]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP2]] @@ -1577,7 +1605,7 @@ struct test26_foo { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test26( // NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr nocapture noundef readonly [[FOO:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds [[STRUCT_TEST26_FOO:%.*]], ptr [[FOO]], i64 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[C]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARR]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] @@ -1586,7 +1614,7 @@ struct test26_foo { // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test26( // SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr noundef [[FOO:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds [[STRUCT_TEST26_FOO:%.*]], ptr [[FOO]], i64 1 +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 8 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[C]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARR]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] @@ -1595,7 +1623,7 @@ struct test26_foo { // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test26( // NO-SANITIZE-WITHOUT-ATTR-SAME: i32 noundef [[C:%.*]], ptr nocapture noundef readonly [[FOO:%.*]]) local_unnamed_addr #[[ATTR6]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds [[STRUCT_TEST26_FOO:%.*]], ptr [[FOO]], i64 1 +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 8 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[C]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x 
i32], ptr [[ARR]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] @@ -1631,7 +1659,7 @@ struct test27_foo { // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_TEST27_FOO:%.*]], ptr [[P]], i64 0, i32 2 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] @@ -1640,7 +1668,8 @@ struct test27_foo { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB30:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST27_FOO]], ptr [[P]], i64 0, i32 4, i64 [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 24 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x ptr], ptr [[ENTRIES]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA14]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM4:%.*]] = sext i32 [[J]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_TEST27_BAR:%.*]], ptr [[TMP2]], i64 [[IDXPROM4]] @@ -1649,8 +1678,9 @@ struct test27_foo { // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local ptr @test27( // NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) local_unnamed_addr #[[ATTR2]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: +// NO-SANITIZE-WITH-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 24 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST27_FOO:%.*]], ptr [[P]], i64 0, i32 4, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x ptr], ptr [[ENTRIES]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM1:%.*]] = sext i32 [[J]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_TEST27_BAR:%.*]], ptr [[TMP0]], i64 [[IDXPROM1]] @@ -1659,8 +1689,9 @@ struct test27_foo { // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local ptr @test27( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITHOUT-ATTR-NEXT: entry: +// SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 24 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST27_FOO:%.*]], ptr [[P]], i64 0, i32 4, i64 [[IDXPROM]] +// SANITIZE-WITHOUT-ATTR-NEXT: 
[[ARRAYIDX:%.*]] = getelementptr inbounds [0 x ptr], ptr [[ENTRIES]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA14]] // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM3:%.*]] = sext i32 [[J]] to i64 // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds [[STRUCT_TEST27_BAR:%.*]], ptr [[TMP0]], i64 [[IDXPROM3]] @@ -1669,8 +1700,9 @@ struct test27_foo { // NO-SANITIZE-WITHOUT-ATTR-LABEL: define dso_local ptr @test27( // NO-SANITIZE-WITHOUT-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[I:%.*]], i32 noundef [[J:%.*]]) local_unnamed_addr #[[ATTR6]] { // NO-SANITIZE-WITHOUT-ATTR-NEXT: entry: +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 24 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST27_FOO:%.*]], ptr [[P]], i64 0, i32 4, i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x ptr], ptr [[ENTRIES]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM1:%.*]] = sext i32 [[J]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_TEST27_BAR:%.*]], ptr [[TMP0]], i64 [[IDXPROM1]] @@ -1693,7 +1725,7 @@ struct test28_foo { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA14]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA14]] // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_TEST28_FOO:%.*]], ptr [[TMP2]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP3]], !nosanitize [[META2]] @@ -1702,7 +1734,8 @@ struct test28_foo { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB31:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont17: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST28_FOO]], ptr [[TMP2]], i64 0, i32 2, i64 [[IDXPROM]] +// SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 12 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARR]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP5]] // @@ -1712,8 +1745,9 @@ struct test28_foo { // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[TBAA11]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA11]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA11]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: 
[[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST28_FOO:%.*]], ptr [[TMP2]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARR]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP3]] // @@ -1723,8 +1757,9 @@ struct test28_foo { // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[TBAA14]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA14]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA14]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST28_FOO:%.*]], ptr [[TMP2]], i64 0, i32 2, i64 [[IDXPROM]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARR]], i64 0, i64 [[IDXPROM]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP3]] // @@ -1734,8 +1769,9 @@ struct test28_foo { // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[TBAA11]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA11]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 8, !tbaa [[TBAA11]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[I]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_TEST28_FOO:%.*]], ptr [[TMP2]], i64 0, i32 2, i64 [[IDXPROM]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARR]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP3:%.*]] = load i32, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i32 [[TMP3]] // @@ -1762,7 +1798,7 @@ struct annotated_struct_array { // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[TMP1]] // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA14]] -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[TMP2]], i64 0, i32 1 +// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 8 // SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM15:%.*]] = sext i32 [[IDX2]] to i64 // SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] @@ -1772,7 +1808,8 @@ struct annotated_struct_array { // SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 [[IDXPROM15]]) #[[ATTR10]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont20: -// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[TMP2]], i64 0, i32 2, 
i64 [[IDXPROM15]] +// SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 12 +// SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX18:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM15]] // SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD]], 2 // SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP5]] @@ -1785,13 +1822,14 @@ struct annotated_struct_array { // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX1]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[TMP0]], i64 0, i32 1 +// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD]], 2 // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 // NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP1]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM4:%.*]] = sext i32 [[IDX2]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED]], ptr [[TMP0]], i64 0, i32 2, i64 [[IDXPROM4]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM4]] // NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX5]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // @@ -1807,8 +1845,9 @@ struct annotated_struct_array { // SANITIZE-WITHOUT-ATTR: cont21: // SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[TMP1]] // SANITIZE-WITHOUT-ATTR-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA14]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 12 // SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM18:%.*]] = sext i32 [[IDX2]] to i64 -// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr [[TMP2]], i64 0, i32 2, i64 [[IDXPROM18]] +// SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX19:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM18]] // SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX19]], align 4, !tbaa [[TBAA2]] // SANITIZE-WITHOUT-ATTR-NEXT: ret void // @@ -1818,8 +1857,9 @@ struct annotated_struct_array { // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX1]] to i64 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[IDXPROM]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[ARRAYIDX]], align 8, !tbaa [[TBAA11]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 12 // NO-SANITIZE-WITHOUT-ATTR-NEXT: [[IDXPROM5:%.*]] = sext i32 [[IDX2]] to i64 -// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [[STRUCT_ANNOTATED:%.*]], ptr 
[[TMP0]], i64 0, i32 2, i64 [[IDXPROM5]] +// NO-SANITIZE-WITHOUT-ATTR-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM5]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX6]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret void // diff --git a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c index 886af083f1c00..6d290f9cb47be 100644 --- a/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c +++ b/clang/test/CodeGen/attr-riscv-rvv-vector-bits-bitcast.c @@ -35,21 +35,21 @@ DEFINE_STRUCT(float64m1) // CHECK-64-LABEL: @read_int64m1( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 8 // CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x i64>, ptr [[Y]], align 8, !tbaa [[TBAA4:![0-9]+]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v1i64( undef, <1 x i64> [[TMP0]], i64 0) // CHECK-64-NEXT: ret [[CAST_SCALABLE]] // // CHECK-128-LABEL: @read_int64m1( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16 // CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x i64>, ptr [[Y]], align 8, !tbaa [[TBAA4:![0-9]+]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v2i64( undef, <2 x i64> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // // CHECK-256-LABEL: @read_int64m1( // CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32 // CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x i64>, ptr [[Y]], align 8, !tbaa [[TBAA4:![0-9]+]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1i64.v4i64( undef, <4 x i64> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] @@ -61,21 +61,21 @@ vint64m1_t read_int64m1(struct struct_int64m1 *s) { // CHECK-64-LABEL: @write_int64m1( // CHECK-64-NEXT: entry: // CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x i64> @llvm.vector.extract.v1i64.nxv1i64( [[X:%.*]], i64 0) -// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 8 // CHECK-64-NEXT: store <1 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]] // CHECK-64-NEXT: ret void // // CHECK-128-LABEL: @write_int64m1( // CHECK-128-NEXT: entry: // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x i64> @llvm.vector.extract.v2i64.nxv1i64( [[X:%.*]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16 // CHECK-128-NEXT: store <2 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: @write_int64m1( // CHECK-256-NEXT: entry: // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x i64> @llvm.vector.extract.v4i64.nxv1i64( [[X:%.*]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_INT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = 
getelementptr inbounds i8, ptr [[S:%.*]], i64 32 // CHECK-256-NEXT: store <4 x i64> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]] // CHECK-256-NEXT: ret void // @@ -89,21 +89,21 @@ void write_int64m1(struct struct_int64m1 *s, vint64m1_t x) { // CHECK-64-LABEL: @read_float64m1( // CHECK-64-NEXT: entry: -// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 8 // CHECK-64-NEXT: [[TMP0:%.*]] = load <1 x double>, ptr [[Y]], align 8, !tbaa [[TBAA4]] // CHECK-64-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1f64.v1f64( undef, <1 x double> [[TMP0]], i64 0) // CHECK-64-NEXT: ret [[CAST_SCALABLE]] // // CHECK-128-LABEL: @read_float64m1( // CHECK-128-NEXT: entry: -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16 // CHECK-128-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[Y]], align 8, !tbaa [[TBAA4]] // CHECK-128-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1f64.v2f64( undef, <2 x double> [[TMP0]], i64 0) // CHECK-128-NEXT: ret [[CAST_SCALABLE]] // // CHECK-256-LABEL: @read_float64m1( // CHECK-256-NEXT: entry: -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32 // CHECK-256-NEXT: [[TMP0:%.*]] = load <4 x double>, ptr [[Y]], align 8, !tbaa [[TBAA4]] // CHECK-256-NEXT: [[CAST_SCALABLE:%.*]] = tail call @llvm.vector.insert.nxv1f64.v4f64( undef, <4 x double> [[TMP0]], i64 0) // CHECK-256-NEXT: ret [[CAST_SCALABLE]] @@ -115,21 +115,21 @@ vfloat64m1_t read_float64m1(struct struct_float64m1 *s) { // CHECK-64-LABEL: @write_float64m1( // CHECK-64-NEXT: entry: // CHECK-64-NEXT: [[CAST_FIXED:%.*]] = tail call <1 x double> @llvm.vector.extract.v1f64.nxv1f64( [[X:%.*]], i64 0) -// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-64-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 8 // CHECK-64-NEXT: store <1 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]] // CHECK-64-NEXT: ret void // // CHECK-128-LABEL: @write_float64m1( // CHECK-128-NEXT: entry: // CHECK-128-NEXT: [[CAST_FIXED:%.*]] = tail call <2 x double> @llvm.vector.extract.v2f64.nxv1f64( [[X:%.*]], i64 0) -// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-128-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 16 // CHECK-128-NEXT: store <2 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]] // CHECK-128-NEXT: ret void // // CHECK-256-LABEL: @write_float64m1( // CHECK-256-NEXT: entry: // CHECK-256-NEXT: [[CAST_FIXED:%.*]] = tail call <4 x double> @llvm.vector.extract.v4f64.nxv1f64( [[X:%.*]], i64 0) -// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds [[STRUCT_STRUCT_FLOAT64M1:%.*]], ptr [[S:%.*]], i64 0, i32 1 +// CHECK-256-NEXT: [[Y:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 32 // CHECK-256-NEXT: store <4 x double> [[CAST_FIXED]], ptr [[Y]], align 8, !tbaa [[TBAA4]] // CHECK-256-NEXT: ret void // diff --git a/clang/test/CodeGen/cleanup-destslot-simple.c b/clang/test/CodeGen/cleanup-destslot-simple.c index 075afbba18cdb..a02841a5e2e2e 100644 --- 
a/clang/test/CodeGen/cleanup-destslot-simple.c +++ b/clang/test/CodeGen/cleanup-destslot-simple.c @@ -101,8 +101,8 @@ // CHECK-KMSAN-NEXT: call void @__msan_warning(i32 [[TMP9]]) #[[ATTR3:[0-9]+]], !dbg [[DBG20]] // CHECK-KMSAN-NEXT: br label [[TMP10]], !dbg [[DBG20]] // CHECK-KMSAN: 10: -// CHECK-KMSAN-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i64 0, i32 6 -// CHECK-KMSAN-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr { [100 x i64], [100 x i64], [100 x i64], [100 x i64], i64, [200 x i32], i32, i32 }, ptr [[TMP0]], i64 0, i32 1 +// CHECK-KMSAN-NEXT: [[RETVAL_ORIGIN:%.*]] = getelementptr i8, ptr [[TMP0]], i64 4008 +// CHECK-KMSAN-NEXT: [[RETVAL_SHADOW:%.*]] = getelementptr i8, ptr [[TMP0]], i64 800 // CHECK-KMSAN-NEXT: [[TMP11:%.*]] = load i32, ptr [[P_0_P_0_P_0_P_0_]], align 4, !dbg [[DBG20]], !tbaa [[TBAA11]] // CHECK-KMSAN-NEXT: [[TMP12:%.*]] = call { ptr, ptr } @__msan_metadata_ptr_for_load_4(ptr nonnull [[P_0_P_0_P_0_P_0_]]) #[[ATTR2]], !dbg [[DBG20]] // CHECK-KMSAN-NEXT: [[TMP13:%.*]] = extractvalue { ptr, ptr } [[TMP12]], 0, !dbg [[DBG20]] diff --git a/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c b/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c index c9b97da34d59e..7802168de4d75 100644 --- a/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c +++ b/clang/test/CodeGen/hexagon-brev-ld-ptr-incdec.c @@ -6,9 +6,9 @@ // the return value will be the value in A[2] // CHECK: @brev_ptr_inc // CHECK-DAG: llvm.hexagon.L2.loadri.pbr -// CHECK-DAG: getelementptr{{.*}}i32 1 -// CHECK-NOT: getelementptr{{.*}}i32 2 -// CHECK-NOT: getelementptr{{.*}}i32 1 +// CHECK-DAG: getelementptr inbounds i8, {{.*}}i32 4 +// CHECK-NOT: getelementptr inbounds i8, {{.*}}i32 8 +// CHECK-NOT: getelementptr inbounds i8, {{.*}}i32 4 int brev_ptr_inc(int A[], int B[]) { int *p0 = &B[0]; int *p1 = &A[0]; diff --git a/clang/test/CodeGen/ms-intrinsics.c b/clang/test/CodeGen/ms-intrinsics.c index ffab8b998d8bd..5bb003d1f91fc 100644 --- a/clang/test/CodeGen/ms-intrinsics.c +++ b/clang/test/CodeGen/ms-intrinsics.c @@ -156,7 +156,7 @@ unsigned char test_BitScanForward(unsigned long *Index, unsigned long Mask) { // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds i32, ptr %Index, {{i64|i32}} 1 +// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds i8, ptr %Index, {{i64|i32}} 4 // CHECK: [[INDEX:%[0-9]+]] = tail call i32 @llvm.cttz.i32(i32 %Mask, i1 true) // CHECK: store i32 [[INDEX]], ptr [[IDXGEP]], align 4 // CHECK: br label %[[END_LABEL]] @@ -171,7 +171,7 @@ unsigned char test_BitScanReverse(unsigned long *Index, unsigned long Mask) { // CHECK: [[RESULT:%[a-z0-9._]+]] = phi i8 [ 0, %[[ISZERO_LABEL:[a-z0-9._]+]] ], [ 1, %[[ISNOTZERO_LABEL]] ] // CHECK: ret i8 [[RESULT]] // CHECK: [[ISNOTZERO_LABEL]]: -// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds i32, ptr %Index, {{i64|i32}} 1 +// CHECK: [[IDXGEP:%[a-z0-9._]+]] = getelementptr inbounds i8, ptr %Index, {{i64|i32}} 4 // CHECK: [[REVINDEX:%[0-9]+]] = tail call i32 @llvm.ctlz.i32(i32 %Mask, i1 true) // CHECK: [[INDEX:%[0-9]+]] = xor i32 [[REVINDEX]], 31 // CHECK: store i32 [[INDEX]], ptr [[IDXGEP]], align 4 @@ -437,10 +437,10 @@ unsigned char test_InterlockedCompareExchange128( ++ExchangeLow, ++ComparandResult); } // CHECK-64: define{{.*}}i8 @test_InterlockedCompareExchange128(ptr{{[a-z_ 
]*}}%Destination, i64{{[a-z_ ]*}}%ExchangeHigh, i64{{[a-z_ ]*}}%ExchangeLow, ptr{{[a-z_ ]*}}%ComparandResult){{.*}}{ -// CHECK-64: %incdec.ptr = getelementptr inbounds i64, ptr %Destination, i64 1 +// CHECK-64: %incdec.ptr = getelementptr inbounds i8, ptr %Destination, i64 8 // CHECK-64: %inc = add nsw i64 %ExchangeHigh, 1 // CHECK-64: %inc1 = add nsw i64 %ExchangeLow, 1 -// CHECK-64: %incdec.ptr2 = getelementptr inbounds i64, ptr %ComparandResult, i64 1 +// CHECK-64: %incdec.ptr2 = getelementptr inbounds i8, ptr %ComparandResult, i64 8 // CHECK-64: [[EH:%[0-9]+]] = zext i64 %inc to i128 // CHECK-64: [[EL:%[0-9]+]] = zext i64 %inc1 to i128 // CHECK-64: [[EHS:%[0-9]+]] = shl nuw i128 [[EH]], 64 @@ -486,7 +486,7 @@ short test_InterlockedIncrement16(short volatile *Addend) { return _InterlockedIncrement16(++Addend); } // CHECK: define{{.*}}i16 @test_InterlockedIncrement16(ptr{{[a-z_ ]*}}%Addend){{.*}}{ -// CHECK: %incdec.ptr = getelementptr inbounds i16, ptr %Addend, {{i64|i32}} 1 +// CHECK: %incdec.ptr = getelementptr inbounds i8, ptr %Addend, {{i64|i32}} 2 // CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %incdec.ptr, i16 1 seq_cst, align 2 // CHECK: [[RESULT:%[0-9]+]] = add i16 [[TMP]], 1 // CHECK: ret i16 [[RESULT]] @@ -496,7 +496,7 @@ long test_InterlockedIncrement(long volatile *Addend) { return _InterlockedIncrement(++Addend); } // CHECK: define{{.*}}i32 @test_InterlockedIncrement(ptr{{[a-z_ ]*}}%Addend){{.*}}{ -// CHECK: %incdec.ptr = getelementptr inbounds i32, ptr %Addend, {{i64|i32}} 1 +// CHECK: %incdec.ptr = getelementptr inbounds i8, ptr %Addend, {{i64|i32}} 4 // CHECK: [[TMP:%[0-9]+]] = atomicrmw add ptr %incdec.ptr, i32 1 seq_cst, align 4 // CHECK: [[RESULT:%[0-9]+]] = add i32 [[TMP]], 1 // CHECK: ret i32 [[RESULT]] diff --git a/clang/test/CodeGen/nofpclass.c b/clang/test/CodeGen/nofpclass.c index daaf24e77b09d..dd90d02f7759b 100644 --- a/clang/test/CodeGen/nofpclass.c +++ b/clang/test/CodeGen/nofpclass.c @@ -925,9 +925,9 @@ _Complex _Float16 defined_complex_func_f16_ret(_Complex _Float16 c) { // CLFINITEONLY-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca { double, double }, align 8 // CLFINITEONLY-NEXT: [[CONV:%.*]] = fpext float [[F32]] to double // CLFINITEONLY-NEXT: [[CF16_REAL:%.*]] = load half, ptr [[CF16]], align 8 -// CLFINITEONLY-NEXT: [[CF16_IMAGP:%.*]] = getelementptr inbounds { half, half }, ptr [[CF16]], i64 0, i32 1 +// CLFINITEONLY-NEXT: [[CF16_IMAGP:%.*]] = getelementptr inbounds i8, ptr [[CF16]], i64 2 // CLFINITEONLY-NEXT: [[CF16_IMAG:%.*]] = load half, ptr [[CF16_IMAGP]], align 2 -// CLFINITEONLY-NEXT: [[INDIRECT_ARG_TEMP_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[INDIRECT_ARG_TEMP]], i64 0, i32 1 +// CLFINITEONLY-NEXT: [[INDIRECT_ARG_TEMP_IMAGP:%.*]] = getelementptr inbounds i8, ptr [[INDIRECT_ARG_TEMP]], i64 8 // CLFINITEONLY-NEXT: store double [[CF64_COERCE0]], ptr [[INDIRECT_ARG_TEMP]], align 8 // CLFINITEONLY-NEXT: store double [[CF64_COERCE1]], ptr [[INDIRECT_ARG_TEMP_IMAGP]], align 8 // CLFINITEONLY-NEXT: [[COERCE5_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x half> poison, half [[CF16_REAL]], i64 0 @@ -1176,9 +1176,9 @@ float call_variadic(float f32, double f64, _Float16 f16, // CLFINITEONLY-NEXT: [[INDIRECT_ARG_TEMP:%.*]] = alloca { double, double }, align 8 // CLFINITEONLY-NEXT: [[CONV:%.*]] = fpext float [[F32]] to double // CLFINITEONLY-NEXT: [[CF16_REAL:%.*]] = load half, ptr [[CF16]], align 8 -// CLFINITEONLY-NEXT: [[CF16_IMAGP:%.*]] = getelementptr inbounds { half, half }, ptr [[CF16]], i64 0, i32 1 +// CLFINITEONLY-NEXT: 
[[CF16_IMAGP:%.*]] = getelementptr inbounds i8, ptr [[CF16]], i64 2 // CLFINITEONLY-NEXT: [[CF16_IMAG:%.*]] = load half, ptr [[CF16_IMAGP]], align 2 -// CLFINITEONLY-NEXT: [[INDIRECT_ARG_TEMP_IMAGP:%.*]] = getelementptr inbounds { double, double }, ptr [[INDIRECT_ARG_TEMP]], i64 0, i32 1 +// CLFINITEONLY-NEXT: [[INDIRECT_ARG_TEMP_IMAGP:%.*]] = getelementptr inbounds i8, ptr [[INDIRECT_ARG_TEMP]], i64 8 // CLFINITEONLY-NEXT: store double [[CF64_COERCE0]], ptr [[INDIRECT_ARG_TEMP]], align 8 // CLFINITEONLY-NEXT: store double [[CF64_COERCE1]], ptr [[INDIRECT_ARG_TEMP_IMAGP]], align 8 // CLFINITEONLY-NEXT: [[COERCE5_SROA_0_0_VEC_INSERT:%.*]] = insertelement <2 x half> poison, half [[CF16_REAL]], i64 0 diff --git a/clang/test/CodeGen/union-tbaa1.c b/clang/test/CodeGen/union-tbaa1.c index f6f10b3e5b1e1..a5faa8269aed6 100644 --- a/clang/test/CodeGen/union-tbaa1.c +++ b/clang/test/CodeGen/union-tbaa1.c @@ -27,10 +27,10 @@ void bar(vect32 p[][2]); // CHECK-NEXT: store i32 [[TMP3]], ptr [[VEC]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[INDEX]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds [4 x [2 x %union.vect32]], ptr [[TMP]], i32 0, i32 [[TMP4]], i32 1 -// CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds [2 x i16], ptr [[ARRAYIDX14]], i32 0, i32 1 +// CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX14]], i32 2 // CHECK-NEXT: [[TMP5:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2, !tbaa [[TBAA6]] // CHECK-NEXT: [[CONV16:%.*]] = zext i16 [[TMP5]] to i32 -// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i32, ptr [[VEC]], i32 1 +// CHECK-NEXT: [[ARRAYIDX17:%.*]] = getelementptr inbounds i8, ptr [[VEC]], i32 4 // CHECK-NEXT: store i32 [[CONV16]], ptr [[ARRAYIDX17]], align 4, !tbaa [[TBAA2]] // CHECK-NEXT: call void @bar(ptr noundef nonnull [[TMP]]) #[[ATTR3]] // CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 32, ptr nonnull [[TMP]]) #[[ATTR3]] diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp index 20b9557bcc1a5..4fef80f051ac9 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/dynamic-cast.cpp @@ -33,7 +33,7 @@ // CHECK-NEXT: br i1 [[isnull]], label %[[dynamic_cast_end:[a-z0-9._]+]], label %[[dynamic_cast_notnull:[a-z0-9._]+]] // CHECK: [[dynamic_cast_notnull]]: // CHECK-DAG: [[vtable:%[a-z0-9]+]] = load ptr, ptr %b, align 8 -// CHECK-DAG: [[offset_ptr:%.+]] = getelementptr inbounds i32, ptr [[vtable]], i64 -2 +// CHECK-DAG: [[offset_ptr:%.+]] = getelementptr inbounds i8, ptr [[vtable]], i64 -8 // CHECK-DAG: [[offset_to_top:%.+]] = load i32, ptr [[offset_ptr]], align 4 // CHECK-DAG: [[offset_to_top2:%.+]] = sext i32 [[offset_to_top]] to i64 // CHECK-DAG: [[casted:%.+]] = getelementptr inbounds i8, ptr %b, i64 [[offset_to_top2]] diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp index bc10461187b75..83c8956f9ff16 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp @@ -2,8 +2,6 @@ // RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O3 -S -o - -emit-llvm -fcxx-exceptions -fexceptions | FileCheck %s -// CHECK: %"class.std::type_info" = type { ptr, ptr } - // CHECK: $_ZTI1A.rtti_proxy = comdat any // CHECK: $_ZTI1B.rtti_proxy = comdat any @@ -37,7 +35,7 @@ // CHECK-NEXT: [[vtable:%[a-z0-9]+]] = load 
ptr, ptr %a // CHECK-NEXT: [[type_info_ptr:%[0-9]+]] = tail call ptr @llvm.load.relative.i32(ptr [[vtable]], i32 -4) // CHECK-NEXT: [[type_info_ptr2:%[0-9]+]] = load ptr, ptr [[type_info_ptr]], align 8 -// CHECK-NEXT: [[name_ptr:%[a-z0-9._]+]] = getelementptr inbounds %"class.std::type_info", ptr [[type_info_ptr2]], i64 0, i32 1 +// CHECK-NEXT: [[name_ptr:%[a-z0-9._]+]] = getelementptr inbounds i8, ptr [[type_info_ptr2]], i64 8 // CHECK-NEXT: [[name:%[0-9]+]] = load ptr, ptr [[name_ptr]], align 8 // CHECK-NEXT: [[eq:%[a-z0-9.]+]] = icmp eq ptr [[name]], @_ZTS1B // CHECK-NEXT: ret i1 [[eq]] diff --git a/clang/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp b/clang/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp index 5c4b86744287e..4b046acf0d878 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-dynamic-cast.cpp @@ -18,7 +18,7 @@ T* test1(V* x) { return &dynamic_cast(*x); } T* test2(A* x) { return &dynamic_cast(*x); } // CHECK-LABEL: define dso_local noundef ptr @"?test2@@YAPAUT@@PAUA@@@Z"(ptr noundef %x) // CHECK: [[VBTBL:%.*]] = load ptr, ptr %x, align 4 -// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i32, ptr [[VBTBL]], i32 1 +// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i8, ptr [[VBTBL]], i32 4 // CHECK-NEXT: [[VBOFFS:%.*]] = load i32, ptr [[VBOFFP]], align 4 // CHECK-NEXT: [[ADJ:%.*]] = getelementptr inbounds i8, ptr %x, i32 [[VBOFFS]] // CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__RTDynamicCast(ptr nonnull [[ADJ]], i32 [[VBOFFS]], ptr nonnull @"??_R0?AUA@@@8", ptr nonnull @"??_R0?AUT@@@8", i32 1) @@ -28,7 +28,7 @@ T* test3(B* x) { return &dynamic_cast(*x); } // CHECK-LABEL: define dso_local noundef ptr @"?test3@@YAPAUT@@PAUB@@@Z"(ptr noundef %x) // CHECK: [[VBPTR:%.*]] = getelementptr inbounds i8, ptr %x, i32 4 // CHECK-NEXT: [[VBTBL:%.*]] = load ptr, ptr [[VBPTR:%.*]], align 4 -// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i32, ptr [[VBTBL]], i32 1 +// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i8, ptr [[VBTBL]], i32 4 // CHECK-NEXT: [[VBOFFS:%.*]] = load i32, ptr [[VBOFFP]], align 4 // CHECK-NEXT: [[DELTA:%.*]] = add nsw i32 [[VBOFFS]], 4 // CHECK-NEXT: [[ADJ:%.*]] = getelementptr inbounds i8, ptr %x, i32 [[DELTA]] @@ -45,7 +45,7 @@ T* test5(A* x) { return dynamic_cast(x); } // CHECK: [[CHECK:%.*]] = icmp eq ptr %x, null // CHECK-NEXT: br i1 [[CHECK]] // CHECK: [[VBTBL:%.*]] = load ptr, ptr %x, align 4 -// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i32, ptr [[VBTBL]], i32 1 +// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i8, ptr [[VBTBL]], i32 4 // CHECK-NEXT: [[VBOFFS:%.*]] = load i32, ptr [[VBOFFP]], align 4 // CHECK-NEXT: [[ADJ:%.*]] = getelementptr inbounds i8, ptr %x, i32 [[VBOFFS]] // CHECK-NEXT: [[CALL:%.*]] = tail call ptr @__RTDynamicCast(ptr nonnull [[ADJ]], i32 [[VBOFFS]], ptr {{.*}}@"??_R0?AUA@@@8", ptr {{.*}}@"??_R0?AUT@@@8", i32 0) @@ -59,7 +59,7 @@ T* test6(B* x) { return dynamic_cast(x); } // CHECK-NEXT: br i1 [[CHECK]] // CHECK: [[VBPTR:%.*]] = getelementptr inbounds i8, ptr %x, i32 4 // CHECK-NEXT: [[VBTBL:%.*]] = load ptr, ptr [[VBPTR]], align 4 -// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i32, ptr [[VBTBL]], i32 1 +// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i8, ptr [[VBTBL]], i32 4 // CHECK-NEXT: [[VBOFFS:%.*]] = load i32, ptr [[VBOFFP]], align 4 // CHECK-NEXT: [[DELTA:%.*]] = add nsw i32 [[VBOFFS]], 4 // CHECK-NEXT: [[ADJ:%.*]] = getelementptr inbounds i8, ptr %x, i32 [[DELTA]] @@ -78,7 +78,7 @@ void* test8(A* x) { return 
dynamic_cast(x); } // CHECK: [[CHECK:%.*]] = icmp eq ptr %x, null // CHECK-NEXT: br i1 [[CHECK]] // CHECK: [[VBTBL:%.*]] = load ptr, ptr %x, align 4 -// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i32, ptr [[VBTBL]], i32 1 +// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i8, ptr [[VBTBL]], i32 4 // CHECK-NEXT: [[VBOFFS:%.*]] = load i32, ptr [[VBOFFP]], align 4 // CHECK-NEXT: [[ADJ:%.*]] = getelementptr inbounds i8, ptr %x, i32 [[VBOFFS]] // CHECK-NEXT: [[RES:%.*]] = tail call ptr @__RTCastToVoid(ptr nonnull [[ADJ]]) @@ -92,7 +92,7 @@ void* test9(B* x) { return dynamic_cast(x); } // CHECK-NEXT: br i1 [[CHECK]] // CHECK: [[VBPTR:%.*]] = getelementptr inbounds i8, ptr %x, i32 4 // CHECK-NEXT: [[VBTBL:%.*]] = load ptr, ptr [[VBPTR]], align 4 -// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i32, ptr [[VBTBL]], i32 1 +// CHECK-NEXT: [[VBOFFP:%.*]] = getelementptr inbounds i8, ptr [[VBTBL]], i32 4 // CHECK-NEXT: [[VBOFFS:%.*]] = load i32, ptr [[VBOFFP]], align 4 // CHECK-NEXT: [[BASE:%.*]] = getelementptr i8, ptr %x, i32 [[VBOFFS]] // CHECK-NEXT: [[ADJ:%.*]] = getelementptr i8, ptr [[BASE]], i32 4 diff --git a/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp b/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp index 5b3c812c07846..31d7d23fd6fc5 100644 --- a/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp +++ b/clang/test/CodeGenCXX/microsoft-abi-typeid.cpp @@ -31,7 +31,7 @@ const std::type_info* test3_typeid() { return &typeid(*fn()); } // CHECK: call ptr @__RTtypeid(ptr null) // CHECK-NEXT: unreachable // CHECK: [[VBTBL:%.*]] = load ptr, ptr [[CALL]], align 4 -// CHECK-NEXT: [[VBSLOT:%.*]] = getelementptr inbounds i32, ptr [[VBTBL]], i32 1 +// CHECK-NEXT: [[VBSLOT:%.*]] = getelementptr inbounds i8, ptr [[VBTBL]], i32 4 // CHECK-NEXT: [[VBASE_OFFS:%.*]] = load i32, ptr [[VBSLOT]], align 4 // CHECK-NEXT: [[ADJ:%.*]] = getelementptr inbounds i8, ptr [[CALL]], i32 [[VBASE_OFFS]] // CHECK-NEXT: [[RT:%.*]] = tail call ptr @__RTtypeid(ptr nonnull [[ADJ]]) diff --git a/clang/test/CodeGenObjC/arc-foreach.m b/clang/test/CodeGenObjC/arc-foreach.m index 64099e04d6d03..71edc5161303c 100644 --- a/clang/test/CodeGenObjC/arc-foreach.m +++ b/clang/test/CodeGenObjC/arc-foreach.m @@ -65,7 +65,7 @@ void test0(NSArray *array) { // CHECK-LP64-NEXT: call void @llvm.objc.storeStrong(ptr [[CAPTURED]], ptr null) // CHECK-LP64-NOT: call void (...) @llvm.objc.clang.arc.use( -// CHECK-LP64-OPT: [[D0:%.*]] = getelementptr inbounds [[BLOCK_T]], ptr [[BLOCK]], i64 0, i32 5 +// CHECK-LP64-OPT: [[D0:%.*]] = getelementptr inbounds i8, ptr [[BLOCK]], i64 32 // CHECK-LP64-OPT: [[CAPTURE:%.*]] = load ptr, ptr [[D0]] // CHECK-LP64-OPT: call void (...) @llvm.objc.clang.arc.use(ptr [[CAPTURE]]) @@ -189,7 +189,7 @@ - (void) foo2 { // CHECK-LP64-OPT-LABEL: define internal void @"\01-[I1 foo2]"( // CHECK-LP64-OPT: ptr %self // CHECK-LP64-OPT: [[BLOCK:%.*]] = alloca [[BLOCK_T:<{.*}>]], -// CHECK-LP64-OPT: [[T0:%.*]] = getelementptr inbounds [[BLOCK_T]], ptr [[BLOCK]], i64 0, i32 5 +// CHECK-LP64-OPT: [[T0:%.*]] = getelementptr inbounds i8, ptr [[BLOCK]], i64 32 // CHECK-LP64: call void @llvm.objc.storeStrong(ptr [[BC]], ptr null) // CHECK-LP64-NOT: call void (...) 
@llvm.objc.clang.arc.use(ptr [[BC]]) diff --git a/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm b/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm index c2d4f8685d7ef..fcf47c7faa9ce 100644 --- a/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm +++ b/clang/test/CodeGenObjCXX/arc-cxx11-init-list.mm @@ -42,7 +42,7 @@ + (instancetype) new; // CHECK: define{{.*}} void @_Z4foo1v(ptr {{.*}} %[[AGG_RESULT:.*]]) // CHECK: store ptr @[[REFTMP]], ptr %[[AGG_RESULT]] -// CHECK: %[[SIZE:.*]] = getelementptr inbounds %"class.std::initializer_list.0", ptr %[[AGG_RESULT]], i32 0, i32 1 +// CHECK: %[[SIZE:.*]] = getelementptr inbounds i8, ptr %[[AGG_RESULT]], i32 4 // CHECK: store i32 2, ptr %[[SIZE]] // CHECK: ret void diff --git a/clang/test/Headers/__clang_hip_math.hip b/clang/test/Headers/__clang_hip_math.hip index fa13ddba425bd..5230c360d0bef 100644 --- a/clang/test/Headers/__clang_hip_math.hip +++ b/clang/test/Headers/__clang_hip_math.hip @@ -2829,7 +2829,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds float, ptr [[__A_ADDR_0_I3]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // DEFAULT: _ZL5normfiPKf.exit: @@ -2849,7 +2849,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract float [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds float, ptr [[__A_ADDR_0_I3]], i64 1 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // FINITEONLY: _ZL5normfiPKf.exit: @@ -2869,7 +2869,7 @@ extern "C" __device__ double test_normcdfinv(double x) { // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds float, ptr [[__A_ADDR_0_I3]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5NORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP20:![0-9]+]] // APPROX: _ZL5normfiPKf.exit: @@ -2893,7 +2893,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds 
double, ptr [[__A_ADDR_0_I3]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // DEFAULT: _ZL4normiPKd.exit: @@ -2913,7 +2913,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract double [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds double, ptr [[__A_ADDR_0_I3]], i64 1 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // FINITEONLY: _ZL4normiPKd.exit: @@ -2933,7 +2933,7 @@ extern "C" __device__ float test_normf(int x, const float *y) { // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds double, ptr [[__A_ADDR_0_I3]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL4NORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP21:![0-9]+]] // APPROX: _ZL4normiPKd.exit: @@ -3253,7 +3253,7 @@ extern "C" __device__ double test_rint(double x) { // DEFAULT-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract float [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds float, ptr [[__A_ADDR_0_I3]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // DEFAULT: _ZL6rnormfiPKf.exit: @@ -3273,7 +3273,7 @@ extern "C" __device__ double test_rint(double x) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract float [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract float [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds float, ptr [[__A_ADDR_0_I3]], i64 1 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // FINITEONLY: _ZL6rnormfiPKf.exit: @@ -3293,7 +3293,7 @@ extern "C" __device__ double test_rint(double x) { // APPROX-NEXT: [[TMP0:%.*]] = load float, ptr [[__A_ADDR_0_I3]], align 4, !tbaa [[TBAA16]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul 
contract float [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract float [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds float, ptr [[__A_ADDR_0_I3]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 4 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL6RNORMFIPKF_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP22:![0-9]+]] // APPROX: _ZL6rnormfiPKf.exit: @@ -3317,7 +3317,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // DEFAULT-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // DEFAULT-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // DEFAULT-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds double, ptr [[__A_ADDR_0_I3]], i64 1 +// DEFAULT-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 // DEFAULT-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // DEFAULT-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // DEFAULT: _ZL5rnormiPKd.exit: @@ -3337,7 +3337,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // FINITEONLY-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // FINITEONLY-NEXT: [[MUL_I:%.*]] = fmul nnan ninf contract double [[TMP0]], [[TMP0]] // FINITEONLY-NEXT: [[ADD_I]] = fadd nnan ninf contract double [[__R_0_I4]], [[MUL_I]] -// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds double, ptr [[__A_ADDR_0_I3]], i64 1 +// FINITEONLY-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 // FINITEONLY-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // FINITEONLY-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // FINITEONLY: _ZL5rnormiPKd.exit: @@ -3357,7 +3357,7 @@ extern "C" __device__ float test_rnormf(int x, const float* y) { // APPROX-NEXT: [[TMP0:%.*]] = load double, ptr [[__A_ADDR_0_I3]], align 8, !tbaa [[TBAA18]] // APPROX-NEXT: [[MUL_I:%.*]] = fmul contract double [[TMP0]], [[TMP0]] // APPROX-NEXT: [[ADD_I]] = fadd contract double [[__R_0_I4]], [[MUL_I]] -// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds double, ptr [[__A_ADDR_0_I3]], i64 1 +// APPROX-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__A_ADDR_0_I3]], i64 8 // APPROX-NEXT: [[TOBOOL_NOT_I:%.*]] = icmp eq i32 [[DEC_I]], 0 // APPROX-NEXT: br i1 [[TOBOOL_NOT_I]], label [[_ZL5RNORMIPKD_EXIT]], label [[WHILE_BODY_I]], !llvm.loop [[LOOP23:![0-9]+]] // APPROX: _ZL5rnormiPKd.exit: diff --git a/clang/test/OpenMP/bug57757.cpp b/clang/test/OpenMP/bug57757.cpp index 4aa5c2639c51a..e1f646e2b141a 100644 --- a/clang/test/OpenMP/bug57757.cpp +++ b/clang/test/OpenMP/bug57757.cpp @@ -19,9 +19,9 @@ void foo() { // CHECK-NEXT: entry: // CHECK-NEXT: [[TMP0:%.*]] = tail call i32 @__kmpc_global_thread_num(ptr nonnull @[[GLOB1:[0-9]+]]) // CHECK-NEXT: [[TMP1:%.*]] = tail call ptr @__kmpc_omp_task_alloc(ptr nonnull @[[GLOB1]], i32 [[TMP0]], i32 0, i64 56, i64 1, ptr nonnull @.omp_task_entry.) 
-// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i64 0, i32 1 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 // CHECK-NEXT: store ptr @_Z3barif, ptr [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] -// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP1]], i64 0, i32 2 +// CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 // CHECK-NEXT: store i32 0, ptr [[TMP3]], align 8, !tbaa [[TBAA12:![0-9]+]] // CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]) // CHECK-NEXT: ret void @@ -30,7 +30,7 @@ void foo() { // CHECK-LABEL: define {{[^@]+}}@.omp_task_entry. // CHECK-SAME: (i32 noundef [[TMP0:%.*]], ptr noalias noundef [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] { // CHECK-NEXT: entry: -// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T:%.*]], ptr [[TMP1]], i64 0, i32 2 +// CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 // CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META13:![0-9]+]]) // CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP2]], align 4, !tbaa [[TBAA16:![0-9]+]], !alias.scope [[META13]], !noalias [[META17:![0-9]+]] // CHECK-NEXT: switch i32 [[TMP3]], label [[DOTOMP_OUTLINED__EXIT:%.*]] [ @@ -42,9 +42,9 @@ void foo() { // CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias [[META13]] // CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK: .untied.next..i: -// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES:%.*]], ptr [[TMP1]], i64 0, i32 1 -// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 2 -// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_KMP_TASK_T_WITH_PRIVATES]], ptr [[TMP1]], i64 0, i32 1, i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 +// CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 48 // CHECK-NEXT: [[TMP8:%.*]] = load ptr, ptr [[TMP5]], align 8, !tbaa [[TBAA19:![0-9]+]], !noalias [[META13]] // CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA16]], !noalias [[META13]] // CHECK-NEXT: [[TMP10:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA20:![0-9]+]], !noalias [[META13]] diff --git a/flang/test/HLFIR/no-block-merging.fir b/flang/test/HLFIR/no-block-merging.fir index 987cf8bf01e8a..75803bf010884 100644 --- a/flang/test/HLFIR/no-block-merging.fir +++ b/flang/test/HLFIR/no-block-merging.fir @@ -27,7 +27,7 @@ func.func @no_shape_merge(%cdt: i1, %from: !fir.ref>, %to : !f // Note: block merging happens in the output below, but after FIR codegen. 
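// Illustrative sketch (not part of the original test): the i8 GEP expected
// below follows from InstCombine now folding all-constant GEP indices into a
// single byte offset. For a hypothetical constant index it looks like:
//   %gep = getelementptr double, ptr %base, i64 2
// which is canonicalized (8 bytes per double) to:
//   %gep = getelementptr i8, ptr %base, i64 16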
// CHECK-LABEL: define void @no_shape_merge( -// CHECK: %[[GEP:.*]] = getelementptr double, ptr %{{.*}} +// CHECK: %[[GEP:.*]] = getelementptr i8, ptr %{{.*}} // CHECK: %[[LOAD:.*]] = load double, ptr %[[GEP]] // CHECK: store double %[[LOAD]], ptr %{{.*}} // CHECK: ret void diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp index 249f4a7710e04..738d35c5b52e3 100644 --- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2450,6 +2450,15 @@ Instruction *InstCombinerImpl::visitGetElementPtrInst(GetElementPtrInst &GEP) { if (MadeChange) return &GEP; + // Canonicalize constant GEPs to i8 type. + if (!GEPEltType->isIntegerTy(8) && GEP.hasAllConstantIndices()) { + APInt Offset(DL.getIndexTypeSizeInBits(GEPType), 0); + if (GEP.accumulateConstantOffset(DL, Offset)) + return replaceInstUsesWith( + GEP, Builder.CreatePtrAdd(PtrOp, Builder.getInt(Offset), "", + GEP.isInBounds())); + } + // Check to see if the inputs to the PHI node are getelementptr instructions. if (auto *PN = dyn_cast(PtrOp)) { auto *Op1 = dyn_cast(PN->getOperand(0)); diff --git a/llvm/test/Analysis/BasicAA/featuretest.ll b/llvm/test/Analysis/BasicAA/featuretest.ll index f556c95747a19..5ef94cd8c7fda 100644 --- a/llvm/test/Analysis/BasicAA/featuretest.ll +++ b/llvm/test/Analysis/BasicAA/featuretest.ll @@ -59,7 +59,7 @@ define i32 @constant_array_index_test() { ; CHECK-LABEL: @constant_array_index_test( ; CHECK-NEXT: [[ARRAY1:%.*]] = alloca [100 x i32], align 4 ; CHECK-NEXT: call void @external(ptr nonnull [[ARRAY1]]) -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i32, ptr [[ARRAY1]], i64 6 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[ARRAY1]], i64 24 ; CHECK-NEXT: store i32 1, ptr [[P2]], align 4 ; CHECK-NEXT: ret i32 0 ; @@ -80,12 +80,12 @@ define i32 @constant_array_index_test() { ; they cannot alias. 
define i32 @gep_distance_test(ptr %A) { ; NO_ASSUME-LABEL: @gep_distance_test( -; NO_ASSUME-NEXT: [[B:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 2 +; NO_ASSUME-NEXT: [[B:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 8 ; NO_ASSUME-NEXT: store i32 7, ptr [[B]], align 4 ; NO_ASSUME-NEXT: ret i32 0 ; ; USE_ASSUME-LABEL: @gep_distance_test( -; USE_ASSUME-NEXT: [[B:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 2 +; USE_ASSUME-NEXT: [[B:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 8 ; USE_ASSUME-NEXT: store i32 7, ptr [[B]], align 4 ; USE_ASSUME-NEXT: call void @llvm.assume(i1 true) [ "dereferenceable"(ptr [[A]], i64 4), "nonnull"(ptr [[A]]), "align"(ptr [[A]], i64 4) ] ; USE_ASSUME-NEXT: ret i32 0 diff --git a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll index f779c9a468f47..769db215cf034 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-alloca-bitcast.ll @@ -369,11 +369,11 @@ define amdgpu_kernel void @vector_bitcast_from_alloca_array(ptr addrspace(1) %ou ; OPT-LABEL: @vector_bitcast_to_array_from_alloca_array( ; OPT-NOT: alloca ; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4 -; OPT-NEXT: %out.repack1 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 1 +; OPT-NEXT: %out.repack1 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 4 ; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4 -; OPT-NEXT: %out.repack2 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 2 +; OPT-NEXT: %out.repack2 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 8 ; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4 -; OPT-NEXT: %out.repack3 = getelementptr inbounds [4 x i32], ptr addrspace(1) %out, i64 0, i64 3 +; OPT-NEXT: %out.repack3 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 12 ; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4 ; GCN-LABEL: {{^}}vector_bitcast_to_array_from_alloca_array: @@ -395,11 +395,11 @@ define amdgpu_kernel void @vector_bitcast_to_array_from_alloca_array(ptr addrspa ; OPT-LABEL: @vector_bitcast_to_struct_from_alloca_array( ; OPT-NOT: alloca ; OPT-NEXT: store i32 1, ptr addrspace(1) %out, align 4 -; OPT-NEXT: %out.repack1 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 1 +; OPT-NEXT: %out.repack1 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 4 ; OPT-NEXT: store i32 2, ptr addrspace(1) %out.repack1, align 4 -; OPT-NEXT: %out.repack2 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 2 +; OPT-NEXT: %out.repack2 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 8 ; OPT-NEXT: store i32 3, ptr addrspace(1) %out.repack2, align 4 -; OPT-NEXT: %out.repack3 = getelementptr inbounds %struct.v4, ptr addrspace(1) %out, i64 0, i32 3 +; OPT-NEXT: %out.repack3 = getelementptr inbounds i8, ptr addrspace(1) %out, i64 12 ; OPT-NEXT: store i32 4, ptr addrspace(1) %out.repack3, align 4 ; GCN-LABEL: {{^}}vector_bitcast_to_struct_from_alloca_array: diff --git a/llvm/test/CodeGen/BPF/preserve-static-offset/load-inline.ll b/llvm/test/CodeGen/BPF/preserve-static-offset/load-inline.ll index 9149b350dd895..b43c73035b332 100644 --- a/llvm/test/CodeGen/BPF/preserve-static-offset/load-inline.ll +++ b/llvm/test/CodeGen/BPF/preserve-static-offset/load-inline.ll @@ -57,8 +57,8 @@ entry: ; CHECK: define dso_local void @quux(ptr nocapture noundef readonly %[[p:.*]]) ; CHECK: %[[bb_i1:.*]] = tail call i32 (ptr, i1, i8, i8, i8, i1, ...) 
; CHECK-SAME: @llvm.bpf.getelementptr.and.load.i32 -; CHECK-SAME: (ptr readonly elementtype(%struct.foo) %[[p]], -; CHECK-SAME: i1 false, i8 0, i8 1, i8 2, i1 true, i64 immarg 0, i32 immarg 1, i32 immarg 1) +; CHECK-SAME: (ptr readonly elementtype(i8) %[[p]], +; CHECK-SAME: i1 false, i8 0, i8 1, i8 2, i1 true, i64 immarg 8) ; CHECK-SAME: #[[v2:.*]], !tbaa ; CHECK-NEXT: tail call void @consume(i32 noundef %[[bb_i1]]) ; CHECK: attributes #[[v2]] = { memory(argmem: read) } diff --git a/llvm/test/CodeGen/BPF/preserve-static-offset/load-unroll-inline.ll b/llvm/test/CodeGen/BPF/preserve-static-offset/load-unroll-inline.ll index 78172cd17dca4..1f0be06755b44 100644 --- a/llvm/test/CodeGen/BPF/preserve-static-offset/load-unroll-inline.ll +++ b/llvm/test/CodeGen/BPF/preserve-static-offset/load-unroll-inline.ll @@ -75,8 +75,8 @@ entry: ; CHECK: define dso_local void @quux(ptr nocapture noundef readonly %[[p:.*]]) ; CHECK: %[[v1:.*]] = tail call i32 (ptr, i1, i8, i8, i8, i1, ...) ; CHECK-SAME: @llvm.bpf.getelementptr.and.load.i32 -; CHECK-SAME: (ptr readonly elementtype(%struct.foo) %[[p]], -; CHECK-SAME: i1 false, i8 0, i8 1, i8 2, i1 true, i64 immarg 0, i32 immarg 1, i64 immarg 1) +; CHECK-SAME: (ptr readonly elementtype(i8) %[[p]], +; CHECK-SAME: i1 false, i8 0, i8 1, i8 2, i1 true, i64 immarg 8) ; CHECK: tail call void @consume(i32 noundef %[[v1]]) ; CHECK: tail call void @consume(i32 noundef %[[v1]]) diff --git a/llvm/test/CodeGen/BPF/preserve-static-offset/load-unroll.ll b/llvm/test/CodeGen/BPF/preserve-static-offset/load-unroll.ll index 7c3303342bb6d..2409fbc8b7e85 100644 --- a/llvm/test/CodeGen/BPF/preserve-static-offset/load-unroll.ll +++ b/llvm/test/CodeGen/BPF/preserve-static-offset/load-unroll.ll @@ -53,14 +53,14 @@ while.end: ; preds = %while.cond ; CHECK: define dso_local void @bar(ptr nocapture noundef readonly %[[p:.*]]) ; CHECK: %[[v1:.*]] = tail call i32 (ptr, i1, i8, i8, i8, i1, ...) ; CHECK-SAME: @llvm.bpf.getelementptr.and.load.i32 -; CHECK-SAME: (ptr readonly elementtype(%struct.foo) %[[p]], -; CHECK-SAME: i1 false, i8 0, i8 1, i8 2, i1 true, i64 immarg 0, i32 immarg 1, i64 immarg 0) +; CHECK-SAME: (ptr readonly elementtype(i8) %[[p]], +; CHECK-SAME: i1 false, i8 0, i8 1, i8 2, i1 true, i64 immarg 4) ; CHECK-SAME: #[[attrs:.*]], !tbaa ; CHECK-NEXT: tail call void @consume(i32 noundef %[[v1]]) ; CHECK-NEXT: %[[v2:.*]] = tail call i32 (ptr, i1, i8, i8, i8, i1, ...) ; CHECK-SAME: @llvm.bpf.getelementptr.and.load.i32 -; CHECK-SAME: (ptr readonly elementtype(%struct.foo) %[[p]], -; CHECK-SAME: i1 false, i8 0, i8 1, i8 2, i1 true, i64 immarg 0, i32 immarg 1, i64 immarg 1) +; CHECK-SAME: (ptr readonly elementtype(i8) %[[p]], +; CHECK-SAME: i1 false, i8 0, i8 1, i8 2, i1 true, i64 immarg 8) ; CHECK-SAME: #[[attrs]], !tbaa ; CHECK-NEXT: tail call void @consume(i32 noundef %[[v2]]) ; CHECK: attributes #[[attrs]] = { memory(argmem: read) } diff --git a/llvm/test/CodeGen/BPF/preserve-static-offset/store-unroll-inline.ll b/llvm/test/CodeGen/BPF/preserve-static-offset/store-unroll-inline.ll index 161aded79eac3..3c6280ca75b64 100644 --- a/llvm/test/CodeGen/BPF/preserve-static-offset/store-unroll-inline.ll +++ b/llvm/test/CodeGen/BPF/preserve-static-offset/store-unroll-inline.ll @@ -75,8 +75,8 @@ entry: ; CHECK-NEXT: tail call void (i32, ptr, i1, i8, i8, i8, i1, ...) 
; CHECK-SAME: @llvm.bpf.getelementptr.and.store.i32 ; CHECK-SAME: (i32 1, -; CHECK-SAME: ptr writeonly elementtype(%struct.foo) %[[p]], -; CHECK-SAME: i1 false, i8 0, i8 1, i8 2, i1 true, i64 immarg 0, i32 immarg 1) +; CHECK-SAME: ptr writeonly elementtype(i8) %[[p]], +; CHECK-SAME: i1 false, i8 0, i8 1, i8 2, i1 true, i64 immarg 4) ; CHECK-NEXT: ret void ; Function Attrs: nocallback nofree nosync nounwind speculatable willreturn memory(none) diff --git a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll index 24c5e1c2f789a..73279844e91b3 100644 --- a/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll +++ b/llvm/test/CodeGen/Hexagon/autohvx/vector-align-tbaa.ll @@ -13,12 +13,12 @@ define <64 x i16> @f0(ptr %a0, i32 %a1) #0 { ; CHECK-LABEL: @f0( ; CHECK-NEXT: b0: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[A0:%.*]], i32 [[A1:%.*]] -; CHECK-NEXT: [[V1:%.*]] = getelementptr i16, ptr [[TMP0]], i32 64 +; CHECK-NEXT: [[V1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 128 ; CHECK-NEXT: [[PTI:%.*]] = ptrtoint ptr [[V1]] to i32 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[PTI]], -128 ; CHECK-NEXT: [[ITP:%.*]] = inttoptr i32 [[AND]] to ptr ; CHECK-NEXT: [[PTI1:%.*]] = ptrtoint ptr [[V1]] to i32 -; CHECK-NEXT: [[ALD15:%.*]] = load <32 x i32>, ptr [[ITP]], align 128, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[ALD14:%.*]] = load <32 x i32>, ptr [[ITP]], align 128, !tbaa [[TBAA0:![0-9]+]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[ITP]], i32 128 ; CHECK-NEXT: [[ALD2:%.*]] = load <128 x i8>, ptr [[GEP]], align 128, !tbaa [[TBAA0]] ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[ITP]], i32 256 @@ -26,12 +26,12 @@ define <64 x i16> @f0(ptr %a0, i32 %a1) #0 { ; CHECK-NEXT: [[ISZ:%.*]] = icmp ne i32 [[AND4]], 0 ; CHECK-NEXT: [[CUP:%.*]] = call <32 x i32> @llvm.hexagon.V6.vL32b.pred.ai.128B(i1 [[ISZ]], ptr [[GEP3]], i32 0), !tbaa [[TBAA0]] ; CHECK-NEXT: [[CST5:%.*]] = bitcast <128 x i8> [[ALD2]] to <32 x i32> -; CHECK-NEXT: [[CUP7:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[CST5]], <32 x i32> [[ALD15]], i32 [[PTI1]]) -; CHECK-NEXT: [[CST13:%.*]] = bitcast <32 x i32> [[CUP7]] to <64 x i16> -; CHECK-NEXT: [[CST10:%.*]] = bitcast <128 x i8> [[ALD2]] to <32 x i32> -; CHECK-NEXT: [[CUP11:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[CUP]], <32 x i32> [[CST10]], i32 [[PTI1]]) -; CHECK-NEXT: [[CST14:%.*]] = bitcast <32 x i32> [[CUP11]] to <64 x i16> -; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[CST13]], [[CST14]] +; CHECK-NEXT: [[CUP7:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[CST5]], <32 x i32> [[ALD14]], i32 [[PTI1]]) +; CHECK-NEXT: [[CST12:%.*]] = bitcast <32 x i32> [[CUP7]] to <64 x i16> +; CHECK-NEXT: [[CST9:%.*]] = bitcast <128 x i8> [[ALD2]] to <32 x i32> +; CHECK-NEXT: [[CUP10:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[CUP]], <32 x i32> [[CST9]], i32 [[PTI1]]) +; CHECK-NEXT: [[CST13:%.*]] = bitcast <32 x i32> [[CUP10]] to <64 x i16> +; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[CST12]], [[CST13]] ; CHECK-NEXT: ret <64 x i16> [[V8]] ; b0: @@ -51,12 +51,12 @@ define <64 x i16> @f1(ptr %a0, i32 %a1) #0 { ; CHECK-LABEL: @f1( ; CHECK-NEXT: b0: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[A0:%.*]], i32 [[A1:%.*]] -; CHECK-NEXT: [[V1:%.*]] = getelementptr i16, ptr [[TMP0]], i32 64 +; CHECK-NEXT: [[V1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 128 ; CHECK-NEXT: [[PTI:%.*]] = ptrtoint ptr [[V1]] to i32 ; CHECK-NEXT: [[AND:%.*]] = and i32 
[[PTI]], -128 ; CHECK-NEXT: [[ITP:%.*]] = inttoptr i32 [[AND]] to ptr ; CHECK-NEXT: [[PTI1:%.*]] = ptrtoint ptr [[V1]] to i32 -; CHECK-NEXT: [[ALD15:%.*]] = load <32 x i32>, ptr [[ITP]], align 128, !tbaa [[TBAA0]] +; CHECK-NEXT: [[ALD14:%.*]] = load <32 x i32>, ptr [[ITP]], align 128, !tbaa [[TBAA0]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[ITP]], i32 128 ; CHECK-NEXT: [[ALD2:%.*]] = load <128 x i8>, ptr [[GEP]], align 128 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[ITP]], i32 256 @@ -64,12 +64,12 @@ define <64 x i16> @f1(ptr %a0, i32 %a1) #0 { ; CHECK-NEXT: [[ISZ:%.*]] = icmp ne i32 [[AND4]], 0 ; CHECK-NEXT: [[CUP:%.*]] = call <32 x i32> @llvm.hexagon.V6.vL32b.pred.ai.128B(i1 [[ISZ]], ptr [[GEP3]], i32 0) ; CHECK-NEXT: [[CST5:%.*]] = bitcast <128 x i8> [[ALD2]] to <32 x i32> -; CHECK-NEXT: [[CUP7:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[CST5]], <32 x i32> [[ALD15]], i32 [[PTI1]]) -; CHECK-NEXT: [[CST13:%.*]] = bitcast <32 x i32> [[CUP7]] to <64 x i16> -; CHECK-NEXT: [[CST10:%.*]] = bitcast <128 x i8> [[ALD2]] to <32 x i32> -; CHECK-NEXT: [[CUP11:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[CUP]], <32 x i32> [[CST10]], i32 [[PTI1]]) -; CHECK-NEXT: [[CST14:%.*]] = bitcast <32 x i32> [[CUP11]] to <64 x i16> -; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[CST13]], [[CST14]] +; CHECK-NEXT: [[CUP7:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[CST5]], <32 x i32> [[ALD14]], i32 [[PTI1]]) +; CHECK-NEXT: [[CST12:%.*]] = bitcast <32 x i32> [[CUP7]] to <64 x i16> +; CHECK-NEXT: [[CST9:%.*]] = bitcast <128 x i8> [[ALD2]] to <32 x i32> +; CHECK-NEXT: [[CUP10:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[CUP]], <32 x i32> [[CST9]], i32 [[PTI1]]) +; CHECK-NEXT: [[CST13:%.*]] = bitcast <32 x i32> [[CUP10]] to <64 x i16> +; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[CST12]], [[CST13]] ; CHECK-NEXT: ret <64 x i16> [[V8]] ; b0: @@ -89,12 +89,12 @@ define <64 x i16> @f2(ptr %a0, i32 %a1) #0 { ; CHECK-LABEL: @f2( ; CHECK-NEXT: b0: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[A0:%.*]], i32 [[A1:%.*]] -; CHECK-NEXT: [[V1:%.*]] = getelementptr i16, ptr [[TMP0]], i32 64 +; CHECK-NEXT: [[V1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 128 ; CHECK-NEXT: [[PTI:%.*]] = ptrtoint ptr [[V1]] to i32 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[PTI]], -128 ; CHECK-NEXT: [[ITP:%.*]] = inttoptr i32 [[AND]] to ptr ; CHECK-NEXT: [[PTI1:%.*]] = ptrtoint ptr [[V1]] to i32 -; CHECK-NEXT: [[ALD15:%.*]] = load <32 x i32>, ptr [[ITP]], align 128, !tbaa [[TBAA0]] +; CHECK-NEXT: [[ALD14:%.*]] = load <32 x i32>, ptr [[ITP]], align 128, !tbaa [[TBAA0]] ; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[ITP]], i32 128 ; CHECK-NEXT: [[ALD2:%.*]] = load <128 x i8>, ptr [[GEP]], align 128 ; CHECK-NEXT: [[GEP3:%.*]] = getelementptr i8, ptr [[ITP]], i32 256 @@ -102,12 +102,12 @@ define <64 x i16> @f2(ptr %a0, i32 %a1) #0 { ; CHECK-NEXT: [[ISZ:%.*]] = icmp ne i32 [[AND4]], 0 ; CHECK-NEXT: [[CUP:%.*]] = call <32 x i32> @llvm.hexagon.V6.vL32b.pred.ai.128B(i1 [[ISZ]], ptr [[GEP3]], i32 0), !tbaa [[TBAA3:![0-9]+]] ; CHECK-NEXT: [[CST5:%.*]] = bitcast <128 x i8> [[ALD2]] to <32 x i32> -; CHECK-NEXT: [[CUP7:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[CST5]], <32 x i32> [[ALD15]], i32 [[PTI1]]) -; CHECK-NEXT: [[CST13:%.*]] = bitcast <32 x i32> [[CUP7]] to <64 x i16> -; CHECK-NEXT: [[CST10:%.*]] = bitcast <128 x i8> [[ALD2]] to <32 x i32> -; CHECK-NEXT: [[CUP11:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x 
i32> [[CUP]], <32 x i32> [[CST10]], i32 [[PTI1]]) -; CHECK-NEXT: [[CST14:%.*]] = bitcast <32 x i32> [[CUP11]] to <64 x i16> -; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[CST13]], [[CST14]] +; CHECK-NEXT: [[CUP7:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[CST5]], <32 x i32> [[ALD14]], i32 [[PTI1]]) +; CHECK-NEXT: [[CST12:%.*]] = bitcast <32 x i32> [[CUP7]] to <64 x i16> +; CHECK-NEXT: [[CST9:%.*]] = bitcast <128 x i8> [[ALD2]] to <32 x i32> +; CHECK-NEXT: [[CUP10:%.*]] = call <32 x i32> @llvm.hexagon.V6.valignb.128B(<32 x i32> [[CUP]], <32 x i32> [[CST9]], i32 [[PTI1]]) +; CHECK-NEXT: [[CST13:%.*]] = bitcast <32 x i32> [[CUP10]] to <64 x i16> +; CHECK-NEXT: [[V8:%.*]] = add <64 x i16> [[CST12]], [[CST13]] ; CHECK-NEXT: ret <64 x i16> [[V8]] ; b0: @@ -127,7 +127,7 @@ define void @f3(ptr %a0, i32 %a1, <64 x i16> %a2, <64 x i16> %a3) #0 { ; CHECK-LABEL: @f3( ; CHECK-NEXT: b0: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[A0:%.*]], i32 [[A1:%.*]] -; CHECK-NEXT: [[V1:%.*]] = getelementptr i16, ptr [[TMP0]], i32 64 +; CHECK-NEXT: [[V1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 128 ; CHECK-NEXT: [[PTI:%.*]] = ptrtoint ptr [[V1]] to i32 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[PTI]], -128 ; CHECK-NEXT: [[ITP:%.*]] = inttoptr i32 [[AND]] to ptr @@ -180,7 +180,7 @@ define void @f4(ptr %a0, i32 %a1, <64 x i16> %a2, <64 x i16> %a3) #0 { ; CHECK-LABEL: @f4( ; CHECK-NEXT: b0: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[A0:%.*]], i32 [[A1:%.*]] -; CHECK-NEXT: [[V1:%.*]] = getelementptr i16, ptr [[TMP0]], i32 64 +; CHECK-NEXT: [[V1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 128 ; CHECK-NEXT: [[PTI:%.*]] = ptrtoint ptr [[V1]] to i32 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[PTI]], -128 ; CHECK-NEXT: [[ITP:%.*]] = inttoptr i32 [[AND]] to ptr @@ -233,7 +233,7 @@ define void @f5(ptr %a0, i32 %a1, <64 x i16> %a2, <64 x i16> %a3) #0 { ; CHECK-LABEL: @f5( ; CHECK-NEXT: b0: ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i16, ptr [[A0:%.*]], i32 [[A1:%.*]] -; CHECK-NEXT: [[V1:%.*]] = getelementptr i16, ptr [[TMP0]], i32 64 +; CHECK-NEXT: [[V1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 128 ; CHECK-NEXT: [[PTI:%.*]] = ptrtoint ptr [[V1]] to i32 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[PTI]], -128 ; CHECK-NEXT: [[ITP:%.*]] = inttoptr i32 [[AND]] to ptr diff --git a/llvm/test/Transforms/Coroutines/coro-async.ll b/llvm/test/Transforms/Coroutines/coro-async.ll index f796ce3dbb0d8..3740c3d1d8387 100644 --- a/llvm/test/Transforms/Coroutines/coro-async.ll +++ b/llvm/test/Transforms/Coroutines/coro-async.ll @@ -132,7 +132,7 @@ define void @my_async_function_pa(ptr %ctxt, ptr %task, ptr %actor) { ; CHECK: [[CALLEE_CTXT:%.*]] = tail call ptr @llvm.coro.async.context.alloc(ptr %task, ptr nonnull @my_other_async_function_fp) ; CHECK: [[CALLEE_CTXT_SPILL:%.*]] = getelementptr inbounds i8, ptr %async.ctxt, i64 160 ; CHECK: store ptr [[CALLEE_CTXT]], ptr [[CALLEE_CTXT_SPILL]] -; CHECK: [[TYPED_RETURN_TO_CALLER_ADDR:%.*]] = getelementptr inbounds %async.ctxt, ptr [[CALLEE_CTXT]], i64 0, i32 1 +; CHECK: [[TYPED_RETURN_TO_CALLER_ADDR:%.*]] = getelementptr inbounds i8, ptr [[CALLEE_CTXT]], i64 8 ; CHECK: store ptr @my_async_functionTQ0_, ptr [[TYPED_RETURN_TO_CALLER_ADDR]] ; CHECK: store ptr %async.ctxt, ptr [[CALLEE_CTXT]] ; Make sure the spill is underaligned to the max context alignment (16). 
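The updated CHECK lines across all of these files track one underlying change: a getelementptr whose indices are all constants is now printed in its canonical form, a single-index getelementptr over i8 whose operand is the accumulated byte offset from the data layout, while GEPs that still carry a variable index keep their original element type. A minimal before/after sketch of the shapes involved; the %struct.pair type and function names here are illustrative, not taken from any of the tests in this patch:

  %struct.pair = type { i32, i32 }

  define ptr @field1_before(ptr %p) {
    ; struct-typed form: the second field selected by indices (0, 1)
    %f = getelementptr inbounds %struct.pair, ptr %p, i64 0, i32 1
    ret ptr %f
  }

  define ptr @field1_after(ptr %p) {
    ; canonical form: the same address as a plain byte offset
    ; (field 1 of %struct.pair starts at byte 4)
    %f = getelementptr inbounds i8, ptr %p, i64 4
    ret ptr %f
  }

Under that reading, each "-"/"+" pair below is the same address computation with the type-based indices folded into one byte offset.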
diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-alloca-opaque-ptr.ll b/llvm/test/Transforms/Coroutines/coro-retcon-alloca-opaque-ptr.ll index 112ffd215b1be..2dabc72a51014 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-alloca-opaque-ptr.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-alloca-opaque-ptr.ll @@ -8,7 +8,7 @@ declare {ptr, ptr, i32} @prototype_f(ptr, i1) define {ptr, ptr, i32} @f(ptr %buffer, i32 %n, { i32 } %dummy) { ; CHECK-LABEL: @f( ; CHECK-NEXT: coro.return: -; CHECK-NEXT: [[N_VAL_SPILL_ADDR:%.*]] = getelementptr inbounds [[F_FRAME:%.*]], ptr [[BUFFER:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[N_VAL_SPILL_ADDR:%.*]] = getelementptr inbounds i8, ptr [[BUFFER:%.*]], i64 8 ; CHECK-NEXT: store i32 [[N:%.*]], ptr [[N_VAL_SPILL_ADDR]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = tail call ptr @allocate(i32 [[N]]) ; CHECK-NEXT: store ptr [[TMP0]], ptr [[BUFFER]], align 8 diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll b/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll index fc009520fe4fa..9b00d7929684f 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-alloca.ll @@ -7,7 +7,7 @@ declare {ptr, ptr, i32} @prototype_f(ptr, i1) define {ptr, ptr, i32} @f(ptr %buffer, i32 %n) { ; CHECK-LABEL: @f( ; CHECK-NEXT: coro.return: -; CHECK-NEXT: [[N_VAL_SPILL_ADDR:%.*]] = getelementptr inbounds [[F_FRAME:%.*]], ptr [[BUFFER:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[N_VAL_SPILL_ADDR:%.*]] = getelementptr inbounds i8, ptr [[BUFFER:%.*]], i64 8 ; CHECK-NEXT: store i32 [[N:%.*]], ptr [[N_VAL_SPILL_ADDR]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = tail call ptr @allocate(i32 [[N]]) ; CHECK-NEXT: store ptr [[TMP0]], ptr [[BUFFER]], align 8 diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll b/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll index b67abfb051f35..2ee515f0a1a91 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-once-value.ll @@ -159,7 +159,7 @@ declare void @print(i32) ; CHECK-NEXT: PostSpill: ; CHECK-NEXT: [[TMP0:%.*]] = tail call ptr @allocate(i32 16) ; CHECK-NEXT: store ptr [[TMP0]], ptr [[BUFFER:%.*]], align 8 -; CHECK-NEXT: [[VAL_SPILL_ADDR:%.*]] = getelementptr inbounds [[G_FRAME:%.*]], ptr [[TMP0]], i64 0, i32 1 +; CHECK-NEXT: [[VAL_SPILL_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 ; CHECK-NEXT: store i32 [[VAL:%.*]], ptr [[VAL_SPILL_ADDR]], align 4 ; CHECK-NEXT: store ptr [[ARRAY:%.*]], ptr [[TMP0]], align 8 ; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[ARRAY]], align 4 @@ -180,7 +180,7 @@ declare void @print(i32) ; CHECK-NEXT: store i32 0, ptr [[ARRAY_RELOAD]], align 4 ; CHECK-NEXT: br label [[COROEND]] ; CHECK: CoroEnd: -; CHECK-NEXT: [[VAL_RELOAD_ADDR:%.*]] = getelementptr inbounds [[G_FRAME:%.*]], ptr [[TMP2]], i64 0, i32 1 +; CHECK-NEXT: [[VAL_RELOAD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 8 ; CHECK-NEXT: [[VAL_RELOAD:%.*]] = load i32, ptr [[VAL_RELOAD_ADDR]], align 4 ; CHECK-NEXT: [[NEW_VAL:%.*]] = add i32 [[VAL_RELOAD]], 123 ; CHECK-NEXT: tail call void @deallocate(ptr [[TMP2]]) @@ -198,7 +198,7 @@ declare void @print(i32) ; CHECK-NEXT: store i32 10, ptr [[ARRAY_RELOAD]], align 4 ; CHECK-NEXT: br label [[COROEND]] ; CHECK: CoroEnd: -; CHECK-NEXT: [[VAL_RELOAD_ADDR:%.*]] = getelementptr inbounds [[G_FRAME:%.*]], ptr [[TMP2]], i64 0, i32 1 +; CHECK-NEXT: [[VAL_RELOAD_ADDR:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 8 ; CHECK-NEXT: [[VAL_RELOAD:%.*]] 
= load i32, ptr [[VAL_RELOAD_ADDR]], align 4 ; CHECK-NEXT: [[NEW_VAL:%.*]] = add i32 [[VAL_RELOAD]], 123 ; CHECK-NEXT: tail call void @deallocate(ptr [[TMP2]]) diff --git a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll index f4333a12bb34a..4c10cb3805c7c 100644 --- a/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll +++ b/llvm/test/Transforms/Coroutines/coro-retcon-resume-values.ll @@ -38,12 +38,12 @@ define i32 @main() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = tail call ptr @allocate(i32 12) ; CHECK-NEXT: store i32 1, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[N_VAL3_SPILL_ADDR_I:%.*]] = getelementptr inbounds [[F_FRAME:%.*]], ptr [[TMP0]], i64 0, i32 1 +; CHECK-NEXT: [[N_VAL3_SPILL_ADDR_I:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 ; CHECK-NEXT: store i32 1, ptr [[N_VAL3_SPILL_ADDR_I]], align 4, !noalias !0 -; CHECK-NEXT: [[INPUT_SPILL_ADDR_I:%.*]] = getelementptr inbounds [[F_FRAME]], ptr [[TMP0]], i64 0, i32 2 +; CHECK-NEXT: [[INPUT_SPILL_ADDR_I:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 ; CHECK-NEXT: store i32 2, ptr [[INPUT_SPILL_ADDR_I]], align 4, !noalias !0 -; CHECK-NEXT: [[INPUT_RELOAD_ADDR13_I:%.*]] = getelementptr inbounds [[F_FRAME]], ptr [[TMP0]], i64 0, i32 2 -; CHECK-NEXT: [[N_VAL3_RELOAD_ADDR11_I:%.*]] = getelementptr inbounds [[F_FRAME]], ptr [[TMP0]], i64 0, i32 1 +; CHECK-NEXT: [[INPUT_RELOAD_ADDR13_I:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 +; CHECK-NEXT: [[N_VAL3_RELOAD_ADDR11_I:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 4 ; CHECK-NEXT: store i32 3, ptr [[N_VAL3_RELOAD_ADDR11_I]], align 4, !noalias !3 ; CHECK-NEXT: store i32 4, ptr [[INPUT_RELOAD_ADDR13_I]], align 4, !noalias !3 ; CHECK-NEXT: tail call void @print(i32 7), !noalias !6 diff --git a/llvm/test/Transforms/Coroutines/coro-swifterror.ll b/llvm/test/Transforms/Coroutines/coro-swifterror.ll index f192dbac3197e..de28ed5608168 100644 --- a/llvm/test/Transforms/Coroutines/coro-swifterror.ll +++ b/llvm/test/Transforms/Coroutines/coro-swifterror.ll @@ -50,7 +50,7 @@ define ptr @g(ptr %buffer, i32 %n) { ; CHECK-NEXT: store ptr null, ptr [[TMP0]], align 4 ; CHECK-NEXT: call void @maybeThrow(ptr nonnull swifterror [[TMP0]]) ; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -; CHECK-NEXT: [[DOTSPILL_ADDR:%.*]] = getelementptr inbounds [[G_FRAME:%.*]], ptr [[BUFFER]], i32 0, i32 1 +; CHECK-NEXT: [[DOTSPILL_ADDR:%.*]] = getelementptr inbounds i8, ptr [[BUFFER]], i32 4 ; CHECK-NEXT: store ptr [[TMP1]], ptr [[DOTSPILL_ADDR]], align 4 ; CHECK-NEXT: call void @logError(ptr [[TMP1]]) ; CHECK-NEXT: ret ptr @g.resume.0 diff --git a/llvm/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll b/llvm/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll index ffc05a40865d3..27005f38f29f1 100644 --- a/llvm/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll +++ b/llvm/test/Transforms/InstCombine/2007-03-25-BadShiftMask.ll @@ -10,7 +10,7 @@ define i32 @main() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[U:%.*]] = alloca [[STRUCT__1ANON:%.*]], align 8 ; CHECK-NEXT: store double 0x7FF0000000000000, ptr [[U]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT__0ANON:%.*]], ptr [[U]], i64 0, i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 4 ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = and i32 [[TMP6]], 2146435072 ; CHECK-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[TMP0]], 2146435072 diff --git 
a/llvm/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll b/llvm/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll index dee71b2290acf..c31f6d4b4a1fe 100644 --- a/llvm/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll +++ b/llvm/test/Transforms/InstCombine/2009-01-08-AlignAlloca.ll @@ -13,7 +13,7 @@ define i32 @bar(i64 %key_token2) nounwind { ; CHECK-NEXT: [[IOSPEC:%.*]] = alloca [[STRUCT_KEY:%.*]], align 8 ; CHECK-NEXT: [[RET:%.*]] = alloca i32, align 4 ; CHECK-NEXT: store i32 0, ptr [[IOSPEC]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds { i32, i32 }, ptr [[IOSPEC]], i32 0, i32 1 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[IOSPEC]], i32 4 ; CHECK-NEXT: store i32 0, ptr [[TMP0]], align 4 ; CHECK-NEXT: store i64 [[KEY_TOKEN2:%.*]], ptr [[IOSPEC]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 (...) @foo(ptr nonnull byval([[STRUCT_KEY]]) align 4 [[IOSPEC]], ptr nonnull [[RET]]) #[[ATTR0:[0-9]+]] diff --git a/llvm/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll b/llvm/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll index 9f69d67f61e11..b532c81556738 100644 --- a/llvm/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll +++ b/llvm/test/Transforms/InstCombine/2009-02-20-InstCombine-SROA.ll @@ -24,7 +24,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC-NEXT: [[__LAST_ADDR_I_I:%.*]] = alloca %"struct.__gnu_cxx::__normal_iterator<int*,std::vector<int, std::allocator<int> > >", align 8 ; IC-NEXT: [[TMP0:%.*]] = alloca i32, align 4 ; IC-NEXT: store i32 42, ptr [[TMP0]], align 4 -; IC-NEXT: [[TMP1:%.*]] = getelementptr %"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl", ptr [[X:%.*]], i32 0, i32 1 +; IC-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[X:%.*]], i32 4 ; IC-NEXT: [[TMP2:%.*]] = load ptr, ptr [[TMP1]], align 4 ; IC-NEXT: [[TMP3:%.*]] = load ptr, ptr [[X]], align 4 ; IC-NEXT: store ptr [[TMP3]], ptr [[__FIRST_ADDR_I_I]], align 4 @@ -45,7 +45,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT:%.*]] ; IC: bb2.i.i: ; IC-NEXT: [[TMP13:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 -; IC-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[TMP13]], i32 1 +; IC-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP13]], i32 4 ; IC-NEXT: store ptr [[TMP14]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4 ; IC-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -56,7 +56,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb5.i.i: ; IC-NEXT: [[TMP19:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 -; IC-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[TMP19]], i32 1 +; IC-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[TMP19]], i32 4 ; IC-NEXT: store ptr [[TMP20]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP21:%.*]] = load i32, ptr [[TMP20]], align 4 ; IC-NEXT: [[TMP22:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -67,7 +67,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb8.i.i: ; IC-NEXT: [[TMP25:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 -; IC-NEXT: [[TMP26:%.*]] = getelementptr i32, ptr [[TMP25]], i32 1 +; IC-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[TMP25]], i32 4 ; IC-NEXT: store ptr [[TMP26]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP27:%.*]] = load i32, ptr
[[TMP26]], align 4 ; IC-NEXT: [[TMP28:%.*]] = load i32, ptr [[TMP0]], align 4 @@ -78,7 +78,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb11.i.i: ; IC-NEXT: [[TMP31:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 -; IC-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[TMP31]], i32 1 +; IC-NEXT: [[TMP32:%.*]] = getelementptr i8, ptr [[TMP31]], i32 4 ; IC-NEXT: store ptr [[TMP32]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: [[TMP33:%.*]] = add nsw i32 [[__TRIP_COUNT_0_I_I:%.*]], -1 ; IC-NEXT: br label [[BB12_I_I]] @@ -109,7 +109,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb17.i.i: ; IC-NEXT: [[TMP46:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 -; IC-NEXT: [[TMP47:%.*]] = getelementptr i32, ptr [[TMP46]], i32 1 +; IC-NEXT: [[TMP47:%.*]] = getelementptr i8, ptr [[TMP46]], i32 4 ; IC-NEXT: store ptr [[TMP47]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[BB18_I_I]] ; IC: bb18.i.i: @@ -123,7 +123,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb21.i.i: ; IC-NEXT: [[TMP53:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 -; IC-NEXT: [[TMP54:%.*]] = getelementptr i32, ptr [[TMP53]], i32 1 +; IC-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[TMP53]], i32 4 ; IC-NEXT: store ptr [[TMP54]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[BB22_I_I]] ; IC: bb22.i.i: @@ -137,7 +137,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC: bb25.i.i: ; IC-NEXT: [[TMP60:%.*]] = load ptr, ptr [[__FIRST_ADDR_I_I]], align 4 -; IC-NEXT: [[TMP61:%.*]] = getelementptr i32, ptr [[TMP60]], i32 1 +; IC-NEXT: [[TMP61:%.*]] = getelementptr i8, ptr [[TMP60]], i32 4 ; IC-NEXT: store ptr [[TMP61]], ptr [[__FIRST_ADDR_I_I]], align 4 ; IC-NEXT: br label [[BB26_I_I]] ; IC: bb26.i.i: @@ -151,7 +151,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; ; IC_SROA-LABEL: @_Z3fooRSt6vectorIiSaIiEE( ; IC_SROA-NEXT: entry: -; IC_SROA-NEXT: [[TMP0:%.*]] = getelementptr %"struct.std::_Vector_base<int, std::allocator<int> >::_Vector_impl", ptr [[X:%.*]], i32 0, i32 1 +; IC_SROA-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[X:%.*]], i32 4 ; IC_SROA-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 ; IC_SROA-NEXT: [[TMP2:%.*]] = load ptr, ptr [[X]], align 4 ; IC_SROA-NEXT: [[TMP3:%.*]] = ptrtoint ptr [[TMP1]] to i32 @@ -166,28 +166,28 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC_SROA: bb1.i.i: ; IC_SROA-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT:%.*]] ; IC_SROA: bb2.i.i: -; IC_SROA-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[__FIRST_ADDR_I_I_SROA_0_0]], i32 1 +; IC_SROA-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[__FIRST_ADDR_I_I_SROA_0_0]], i32 4 ; IC_SROA-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4 ; IC_SROA-NEXT: [[TMP11:%.*]] = icmp eq i32 [[TMP10]], 42 ; IC_SROA-NEXT: br i1 [[TMP11]], label [[BB4_I_I:%.*]], label [[BB5_I_I:%.*]] ; IC_SROA: bb4.i.i: ; IC_SROA-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC_SROA: bb5.i.i: -; IC_SROA-NEXT: [[TMP12:%.*]] =
getelementptr i8, ptr [[TMP9]], i32 4 ; IC_SROA-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4 ; IC_SROA-NEXT: [[TMP14:%.*]] = icmp eq i32 [[TMP13]], 42 ; IC_SROA-NEXT: br i1 [[TMP14]], label [[BB7_I_I:%.*]], label [[BB8_I_I:%.*]] ; IC_SROA: bb7.i.i: ; IC_SROA-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC_SROA: bb8.i.i: -; IC_SROA-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP12]], i32 1 +; IC_SROA-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[TMP12]], i32 4 ; IC_SROA-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4 ; IC_SROA-NEXT: [[TMP17:%.*]] = icmp eq i32 [[TMP16]], 42 ; IC_SROA-NEXT: br i1 [[TMP17]], label [[BB10_I_I:%.*]], label [[BB11_I_I:%.*]] ; IC_SROA: bb10.i.i: ; IC_SROA-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC_SROA: bb11.i.i: -; IC_SROA-NEXT: [[TMP18:%.*]] = getelementptr i32, ptr [[TMP15]], i32 1 +; IC_SROA-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP15]], i32 4 ; IC_SROA-NEXT: [[TMP19:%.*]] = add nsw i32 [[__TRIP_COUNT_0_I_I:%.*]], -1 ; IC_SROA-NEXT: br label [[BB12_I_I]] ; IC_SROA: bb12.i.i: @@ -212,7 +212,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC_SROA: bb16.i.i: ; IC_SROA-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC_SROA: bb17.i.i: -; IC_SROA-NEXT: [[TMP27:%.*]] = getelementptr i32, ptr [[__FIRST_ADDR_I_I_SROA_0_0]], i32 1 +; IC_SROA-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[__FIRST_ADDR_I_I_SROA_0_0]], i32 4 ; IC_SROA-NEXT: br label [[BB18_I_I]] ; IC_SROA: bb18.i.i: ; IC_SROA-NEXT: [[__FIRST_ADDR_I_I_SROA_0_1:%.*]] = phi ptr [ [[TMP27]], [[BB17_I_I]] ], [ [[__FIRST_ADDR_I_I_SROA_0_0]], [[BB13_I_I]] ] @@ -222,7 +222,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC_SROA: bb20.i.i: ; IC_SROA-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC_SROA: bb21.i.i: -; IC_SROA-NEXT: [[TMP30:%.*]] = getelementptr i32, ptr [[__FIRST_ADDR_I_I_SROA_0_1]], i32 1 +; IC_SROA-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[__FIRST_ADDR_I_I_SROA_0_1]], i32 4 ; IC_SROA-NEXT: br label [[BB22_I_I]] ; IC_SROA: bb22.i.i: ; IC_SROA-NEXT: [[__FIRST_ADDR_I_I_SROA_0_2:%.*]] = phi ptr [ [[TMP30]], [[BB21_I_I]] ], [ [[__FIRST_ADDR_I_I_SROA_0_0]], [[BB13_I_I]] ] @@ -232,7 +232,7 @@ define ptr @_Z3fooRSt6vectorIiSaIiEE(ptr %X) { ; IC_SROA: bb24.i.i: ; IC_SROA-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] ; IC_SROA: bb25.i.i: -; IC_SROA-NEXT: [[TMP33:%.*]] = getelementptr i32, ptr [[__FIRST_ADDR_I_I_SROA_0_2]], i32 1 +; IC_SROA-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[__FIRST_ADDR_I_I_SROA_0_2]], i32 4 ; IC_SROA-NEXT: br label [[BB26_I_I]] ; IC_SROA: bb26.i.i: ; IC_SROA-NEXT: br label [[_ZST4FINDIN9__GNU_CXX17__NORMAL_ITERATORIPIST6VECTORIISAIIEEEEIET_S7_S7_RKT0__EXIT]] diff --git a/llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll b/llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll index ad4088c63e761..f03b633c581d6 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-addsub-inseltpoison.ll @@ -146,7 +146,7 @@ define void @PR46277(float %0, float %1, float %2, float %3, <4 x float> %4, ptr ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP1:%.*]], i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x 
float> [[TMP8]], <4 x float> [[TMP4:%.*]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP5:%.*]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP5:%.*]], i64 4 ; CHECK-NEXT: store float [[TMP10]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP9]], i64 1 ; CHECK-NEXT: store float [[TMP12]], ptr [[TMP11]], align 4 diff --git a/llvm/test/Transforms/InstCombine/X86/x86-addsub.ll b/llvm/test/Transforms/InstCombine/X86/x86-addsub.ll index c788e4ea81efc..9b0387e51eead 100644 --- a/llvm/test/Transforms/InstCombine/X86/x86-addsub.ll +++ b/llvm/test/Transforms/InstCombine/X86/x86-addsub.ll @@ -146,7 +146,7 @@ define void @PR46277(float %0, float %1, float %2, float %3, <4 x float> %4, ptr ; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[TMP1:%.*]], i64 1 ; CHECK-NEXT: [[TMP9:%.*]] = tail call <4 x float> @llvm.x86.sse3.addsub.ps(<4 x float> [[TMP8]], <4 x float> [[TMP4:%.*]]) ; CHECK-NEXT: [[TMP10:%.*]] = extractelement <4 x float> [[TMP9]], i64 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds float, ptr [[TMP5:%.*]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP5:%.*]], i64 4 ; CHECK-NEXT: store float [[TMP10]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[TMP12:%.*]] = extractelement <4 x float> [[TMP9]], i64 1 ; CHECK-NEXT: store float [[TMP12]], ptr [[TMP11]], align 4 diff --git a/llvm/test/Transforms/InstCombine/add3.ll b/llvm/test/Transforms/InstCombine/add3.ll index ba9ce8e64438d..87e1208870394 100644 --- a/llvm/test/Transforms/InstCombine/add3.ll +++ b/llvm/test/Transforms/InstCombine/add3.ll @@ -15,7 +15,7 @@ define void @test2(i32 %.val24) { ; CHECK-NEXT: store i32 1, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[DOTVAL24]], -16 ; CHECK-NEXT: [[TMP3:%.*]] = inttoptr i32 [[TMP2]] to ptr -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i32, ptr [[TMP3]], i32 1 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP3]], i32 4 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = tail call i32 @callee(i32 [[TMP5]]) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/InstCombine/array.ll b/llvm/test/Transforms/InstCombine/array.ll index 396a7aa340f6b..236821d8ba4c0 100644 --- a/llvm/test/Transforms/InstCombine/array.ll +++ b/llvm/test/Transforms/InstCombine/array.ll @@ -7,7 +7,7 @@ define void @test(ptr %ptr, i32 %a, i32 %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[A]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP0]] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 10 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP1]], i64 40 ; CHECK-NEXT: store i32 [[B]], ptr [[GEP]], align 4 ; CHECK-NEXT: ret void ; @@ -79,7 +79,7 @@ define void @test_zext_nneg(ptr %ptr, i32 %a, i32 %b) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[A]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 [[TMP0]] -; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[TMP1]], i64 10 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP1]], i64 40 ; CHECK-NEXT: store i32 [[B]], ptr [[GEP]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/assume.ll b/llvm/test/Transforms/InstCombine/assume.ll index c7609564685fe..927f0a86b0a25 100644 --- a/llvm/test/Transforms/InstCombine/assume.ll +++ 
b/llvm/test/Transforms/InstCombine/assume.ll @@ -774,7 +774,7 @@ exit: define void @canonicalize_assume(ptr %0) { ; DEFAULT-LABEL: @canonicalize_assume( -; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 2 +; DEFAULT-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 8 ; DEFAULT-NEXT: call void @llvm.assume(i1 true) [ "align"(ptr [[TMP2]], i64 16) ] ; DEFAULT-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/cast_phi.ll b/llvm/test/Transforms/InstCombine/cast_phi.ll index 5b4425b4d8308..68847e73ac5d2 100644 --- a/llvm/test/Transforms/InstCombine/cast_phi.ll +++ b/llvm/test/Transforms/InstCombine/cast_phi.ll @@ -16,9 +16,9 @@ define void @MainKernel(i32 %iNumSteps, i32 %tid, i32 %base) { ; CHECK-NEXT: [[CMP7:%.*]] = icmp eq i32 [[TID]], 0 ; CHECK-NEXT: br i1 [[CMP7]], label [[DOTBB1:%.*]], label [[DOTBB2:%.*]] ; CHECK: .bb1: -; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds [258 x float], ptr [[CALLA]], i64 0, i64 256 +; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i8, ptr [[CALLA]], i64 1024 ; CHECK-NEXT: store float [[CONV_I]], ptr [[ARRAYIDX10]], align 4 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds [258 x float], ptr [[CALLB]], i64 0, i64 256 +; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[CALLB]], i64 1024 ; CHECK-NEXT: store float 0.000000e+00, ptr [[ARRAYIDX11]], align 4 ; CHECK-NEXT: br label [[DOTBB2]] ; CHECK: .bb2: diff --git a/llvm/test/Transforms/InstCombine/catchswitch-phi.ll b/llvm/test/Transforms/InstCombine/catchswitch-phi.ll index 9a12e8bebabf6..038847609b0f9 100644 --- a/llvm/test/Transforms/InstCombine/catchswitch-phi.ll +++ b/llvm/test/Transforms/InstCombine/catchswitch-phi.ll @@ -22,11 +22,11 @@ define void @test0(i1 %c1) personality ptr @__gxx_wasm_personality_v0 { ; CHECK-NEXT: [[TMP0:%.*]] = alloca [[STRUCT_BLAM:%.*]], align 4 ; CHECK-NEXT: br i1 [[C1:%.*]], label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_BLAM]], ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4 ; CHECK-NEXT: invoke void @foo() ; CHECK-NEXT: to label [[BB3:%.*]] unwind label [[BB4:%.*]] ; CHECK: bb2: -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_BLAM]], ptr [[TMP0]], i32 0, i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i32 4 ; CHECK-NEXT: invoke void @foo() ; CHECK-NEXT: to label [[BB3]] unwind label [[BB4]] ; CHECK: bb3: diff --git a/llvm/test/Transforms/InstCombine/compare-alloca.ll b/llvm/test/Transforms/InstCombine/compare-alloca.ll index 9ca8dc866e565..c2639d39bc656 100644 --- a/llvm/test/Transforms/InstCombine/compare-alloca.ll +++ b/llvm/test/Transforms/InstCombine/compare-alloca.ll @@ -75,7 +75,7 @@ define i1 @alloca_argument_compare_escaped_through_store(ptr %arg, ptr %ptr) { ; CHECK-LABEL: @alloca_argument_compare_escaped_through_store( ; CHECK-NEXT: [[ALLOC:%.*]] = alloca i64, align 8 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[ALLOC]], [[ARG:%.*]] -; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds i64, ptr [[ALLOC]], i32 1 +; CHECK-NEXT: [[P:%.*]] = getelementptr inbounds i8, ptr [[ALLOC]], i32 8 ; CHECK-NEXT: store ptr [[P]], ptr [[PTR:%.*]], align 4 ; CHECK-NEXT: ret i1 [[CMP]] ; diff --git a/llvm/test/Transforms/InstCombine/extractvalue.ll b/llvm/test/Transforms/InstCombine/extractvalue.ll index fce85596cf5f2..0b7c649839063 100644 --- a/llvm/test/Transforms/InstCombine/extractvalue.ll +++ 
b/llvm/test/Transforms/InstCombine/extractvalue.ll @@ -51,7 +51,7 @@ define i32 @foo(i32 %a, i32 %b) { ; The new load should be in the same spot as the old load. define i32 @extract2gep(ptr %pair, ptr %P) { ; CHECK-LABEL: @extract2gep( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds { i16, i32 }, ptr [[PAIR:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[PAIR:%.*]], i64 4 ; CHECK-NEXT: [[E:%.*]] = load i32, ptr [[TMP1]], align 4 ; CHECK-NEXT: store i32 0, ptr [[P:%.*]], align 4 ; CHECK-NEXT: br label [[LOOP:%.*]] @@ -82,7 +82,7 @@ end: ; to a 3-index inbounds gep + smaller load. define i16 @doubleextract2gep(ptr %arg) { ; CHECK-LABEL: @doubleextract2gep( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds { i16, { i32, i16 } }, ptr [[ARG:%.*]], i64 0, i32 1, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 8 ; CHECK-NEXT: [[E2:%.*]] = load i16, ptr [[TMP1]], align 2 ; CHECK-NEXT: ret i16 [[E2]] ; diff --git a/llvm/test/Transforms/InstCombine/gep-addrspace.ll b/llvm/test/Transforms/InstCombine/gep-addrspace.ll index 64813264e9766..789a31e593946 100644 --- a/llvm/test/Transforms/InstCombine/gep-addrspace.ll +++ b/llvm/test/Transforms/InstCombine/gep-addrspace.ll @@ -45,7 +45,7 @@ define { i8, i8 } @inbounds_after_addrspacecast() { ; CHECK-NEXT: [[T0:%.*]] = alloca i16, align 2 ; CHECK-NEXT: call void @escape_alloca(ptr nonnull [[T0]]) ; CHECK-NEXT: [[T1:%.*]] = addrspacecast ptr [[T0]] to ptr addrspace(11) -; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds [2 x i8], ptr addrspace(11) [[T1]], i64 0, i64 1 +; CHECK-NEXT: [[T2:%.*]] = getelementptr inbounds i8, ptr addrspace(11) [[T1]], i64 1 ; CHECK-NEXT: [[T3:%.*]] = load i8, ptr addrspace(11) [[T2]], align 1 ; CHECK-NEXT: [[INSERT:%.*]] = insertvalue { i8, i8 } zeroinitializer, i8 [[T3]], 1 ; CHECK-NEXT: ret { i8, i8 } [[INSERT]] diff --git a/llvm/test/Transforms/InstCombine/gep-canonicalize-constant-indices.ll b/llvm/test/Transforms/InstCombine/gep-canonicalize-constant-indices.ll index 2d5721b8c3656..849793c7a85af 100644 --- a/llvm/test/Transforms/InstCombine/gep-canonicalize-constant-indices.ll +++ b/llvm/test/Transforms/InstCombine/gep-canonicalize-constant-indices.ll @@ -13,7 +13,7 @@ declare void @use(i1) ; result = (((ptr) p + a) + b) + 1 define ptr @basic(ptr %p, i64 %a, i64 %b) { ; CHECK-LABEL: @basic( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[A:%.*]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[B:%.*]] ; CHECK-NEXT: ret ptr [[TMP3]] @@ -50,9 +50,9 @@ define ptr @partialConstant2(ptr %p, i64 %a, i64 %b) { ; result = ((ptr) p + a) + 3 define ptr @merge(ptr %p, i64 %a) { ; CHECK-LABEL: @merge( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[A:%.*]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 8 ; CHECK-NEXT: ret ptr [[TMP3]] ; %1 = getelementptr inbounds i32, ptr %p, i64 1 @@ -67,12 +67,12 @@ define ptr @merge(ptr %p, i64 %a) { ; result = (ptr) ((ptr) ((ptr) ptr + a) + (a * b)) + 9 define ptr @nested(ptr %p, i64 %a, i64 %b) { ; CHECK-LABEL: @nested( -; 
CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <3 x i32>, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 16 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 [[A:%.*]] ; CHECK-NEXT: [[TMP3:%.*]] = mul i64 [[A]], [[B:%.*]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <5 x i32>, ptr [[TMP2]], i64 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 128 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i16, ptr [[TMP4]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <4 x i32>, ptr [[TMP5]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 16 ; CHECK-NEXT: ret ptr [[TMP6]] ; %1 = getelementptr inbounds <3 x i32>, ptr %p, i64 1 @@ -87,7 +87,7 @@ define ptr @nested(ptr %p, i64 %a, i64 %b) { ; It is valid to swap if the source operand of the first GEP has multiple uses. define ptr @multipleUses1(ptr %p) { ; CHECK-LABEL: @multipleUses1( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[P]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[TMP2]] ; CHECK-NEXT: ret ptr [[TMP3]] @@ -101,7 +101,7 @@ define ptr @multipleUses1(ptr %p) { ; It is valid to swap if the second GEP has multiple uses. define ptr @multipleUses2(ptr %p, i64 %a) { ; CHECK-LABEL: @multipleUses2( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[A:%.*]] ; CHECK-NEXT: call void @use(ptr nonnull [[TMP2]]) ; CHECK-NEXT: ret ptr [[TMP2]] @@ -115,7 +115,7 @@ define ptr @multipleUses2(ptr %p, i64 %a) { ; Negative test. It is not valid to swap if the first GEP has multiple uses. 
define ptr @multipleUses3(ptr %p) { ; CHECK-LABEL: @multipleUses3( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 [[TMP2]] ; CHECK-NEXT: ret ptr [[TMP3]] diff --git a/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll b/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll index d0e5919b09f1c..99cdb6bc760b4 100644 --- a/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll +++ b/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll @@ -94,7 +94,7 @@ define void @PR37005(ptr %base, ptr %in) { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[E1:%.*]] = getelementptr inbounds ptr, ptr [[IN:%.*]], i64 undef -; CHECK-NEXT: [[E2:%.*]] = getelementptr inbounds ptr, ptr [[E1]], i64 6 +; CHECK-NEXT: [[E2:%.*]] = getelementptr inbounds i8, ptr [[E1]], i64 48 ; CHECK-NEXT: [[E4:%.*]] = getelementptr inbounds ptr, ptr [[E2]], <2 x i64> ; CHECK-NEXT: [[PI1:%.*]] = ptrtoint <2 x ptr> [[E4]] to <2 x i64> ; CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[PI1]], @@ -126,7 +126,7 @@ define void @PR37005_2(ptr %base, ptr %in) { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[E1:%.*]] = getelementptr inbounds ptr, ptr [[IN:%.*]], i64 undef -; CHECK-NEXT: [[E2:%.*]] = getelementptr inbounds ptr, ptr [[E1]], i64 6 +; CHECK-NEXT: [[E2:%.*]] = getelementptr inbounds i8, ptr [[E1]], i64 48 ; CHECK-NEXT: [[PI1:%.*]] = ptrtoint ptr [[E2]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = lshr i64 [[PI1]], 14 ; CHECK-NEXT: [[SL1:%.*]] = and i64 [[TMP0]], 1125899906842496 @@ -156,7 +156,7 @@ define void @PR37005_3(<2 x ptr> %base, ptr %in) { ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: ; CHECK-NEXT: [[E1:%.*]] = getelementptr inbounds ptr, ptr [[IN:%.*]], i64 undef -; CHECK-NEXT: [[E2:%.*]] = getelementptr inbounds ptr, ptr [[E1]], i64 6 +; CHECK-NEXT: [[E2:%.*]] = getelementptr inbounds i8, ptr [[E1]], i64 48 ; CHECK-NEXT: [[E4:%.*]] = getelementptr inbounds ptr, ptr [[E2]], <2 x i64> ; CHECK-NEXT: [[PI1:%.*]] = ptrtoint <2 x ptr> [[E4]] to <2 x i64> ; CHECK-NEXT: [[TMP0:%.*]] = lshr <2 x i64> [[PI1]], diff --git a/llvm/test/Transforms/InstCombine/gep-custom-dl.ll b/llvm/test/Transforms/InstCombine/gep-custom-dl.ll index 7be15e38d44a7..d9449e05612c3 100644 --- a/llvm/test/Transforms/InstCombine/gep-custom-dl.ll +++ b/llvm/test/Transforms/InstCombine/gep-custom-dl.ll @@ -13,7 +13,7 @@ target datalayout = "e-m:m-p:40:64:64:32-i32:32-i16:16-i8:8-n32" ; Test that two array indexing geps fold define ptr @test1(ptr %I) { ; CHECK-LABEL: @test1( -; CHECK-NEXT: [[B:%.*]] = getelementptr i32, ptr [[I:%.*]], i32 21 +; CHECK-NEXT: [[B:%.*]] = getelementptr i8, ptr [[I:%.*]], i32 84 ; CHECK-NEXT: ret ptr [[B]] ; %A = getelementptr i32, ptr %I, i8 17 @@ -24,7 +24,7 @@ define ptr @test1(ptr %I) { ; Test that two getelementptr insts fold define ptr @test2(ptr %I) { ; CHECK-LABEL: @test2( -; CHECK-NEXT: [[A:%.*]] = getelementptr { i32 }, ptr [[I:%.*]], i32 1 +; CHECK-NEXT: [[A:%.*]] = getelementptr i8, ptr [[I:%.*]], i32 4 ; CHECK-NEXT: ret ptr [[A]] ; %A = getelementptr { i32 }, ptr %I, i32 1 diff --git a/llvm/test/Transforms/InstCombine/gep-merge-constant-indices.ll b/llvm/test/Transforms/InstCombine/gep-merge-constant-indices.ll index bcba8eadb1f96..509d19c8c00c7 100644 --- 
a/llvm/test/Transforms/InstCombine/gep-merge-constant-indices.ll +++ b/llvm/test/Transforms/InstCombine/gep-merge-constant-indices.ll @@ -12,7 +12,7 @@ target datalayout = "i24:8:8" ; result = (ptr) p + 3 define ptr @mergeBasic(ptr %p) { ; CHECK-LABEL: @mergeBasic( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 3 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 12 ; CHECK-NEXT: ret ptr [[TMP1]] ; %1 = getelementptr inbounds i32, ptr %p, i64 1 @@ -58,7 +58,7 @@ define ptr @zeroSum(ptr %p) { ; result = (ptr) ((ptr) p + 1) + 17 define ptr @array1(ptr %p) { ; CHECK-LABEL: @array1( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [20 x i8], ptr [[P:%.*]], i64 1, i64 17 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 37 ; CHECK-NEXT: ret ptr [[TMP1]] ; %1 = getelementptr inbounds [20 x i8], ptr %p, i64 1, i64 1 @@ -93,7 +93,7 @@ define ptr @struct1(ptr %p) { ; result = &((struct.A*) p - 1).member1 define ptr @struct2(ptr %p) { ; CHECK-LABEL: @struct2( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [[STRUCT_A:%.*]], ptr [[P:%.*]], i64 -1, i32 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 -4 ; CHECK-NEXT: ret ptr [[TMP1]] ; %1 = getelementptr inbounds %struct.A, ptr %p, i64 0, i32 1 @@ -104,7 +104,7 @@ define ptr @struct2(ptr %p) { ; result = (ptr) &((struct.B) p)[0].member2.member0 + 7 define ptr @structStruct(ptr %p) { ; CHECK-LABEL: @structStruct( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_B:%.*]], ptr [[P:%.*]], i64 0, i32 2, i32 0, i64 7 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 15 ; CHECK-NEXT: ret ptr [[TMP1]] ; %1 = getelementptr inbounds %struct.B, ptr %p, i64 0, i32 2, i32 0, i64 3 @@ -144,7 +144,7 @@ define ptr @notDivisible(ptr %p) { ; or divisible by the other's size. define ptr @partialConstant2(ptr %p, i64 %a) { ; CHECK-LABEL: @partialConstant2( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [4 x i64], ptr [[TMP1]], i64 [[A:%.*]], i64 2 ; CHECK-NEXT: ret ptr [[TMP2]] ; @@ -157,7 +157,7 @@ define ptr @partialConstant2(ptr %p, i64 %a) { ; first GEP by the second GEP. 
define ptr @partialConstant3(ptr %p) { ; CHECK-LABEL: @partialConstant3( -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[P:%.*]], i64 4 ; CHECK-NEXT: [[TMP2:%.*]] = ptrtoint ptr [[TMP1]] to i64 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [4 x i64], ptr [[TMP1]], i64 [[TMP2]], i64 2 ; CHECK-NEXT: ret ptr [[TMP3]] @@ -200,7 +200,7 @@ define ptr @partialConstantMemberAliasing2(ptr %p, i64 %a) { define ptr @partialConstantMemberAliasing3(ptr %p, i64 %a) { ; CHECK-LABEL: @partialConstantMemberAliasing3( ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[STRUCT_C:%.*]], ptr [[P:%.*]], i64 [[A:%.*]], i32 2 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 1 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 4 ; CHECK-NEXT: ret ptr [[TMP2]] ; %1 = getelementptr inbounds %struct.C, ptr %p, i64 %a, i32 2 diff --git a/llvm/test/Transforms/InstCombine/gep-vector-indices.ll b/llvm/test/Transforms/InstCombine/gep-vector-indices.ll index d9f7e4ff9f701..e9534e45ec141 100644 --- a/llvm/test/Transforms/InstCombine/gep-vector-indices.ll +++ b/llvm/test/Transforms/InstCombine/gep-vector-indices.ll @@ -3,7 +3,7 @@ define ptr @vector_splat_indices_v2i64_ext0(ptr %a) { ; CHECK-LABEL: @vector_splat_indices_v2i64_ext0( -; CHECK-NEXT: [[RES:%.*]] = getelementptr i32, ptr [[A:%.*]], i64 4 +; CHECK-NEXT: [[RES:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 16 ; CHECK-NEXT: ret ptr [[RES]] ; %gep = getelementptr i32, ptr %a, <2 x i64> <i64 4, i64 4> @@ -13,7 +13,7 @@ define ptr @vector_splat_indices_nxv2i64_ext0(ptr %a) { ; CHECK-LABEL: @vector_splat_indices_nxv2i64_ext0( -; CHECK-NEXT: [[RES:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 4 +; CHECK-NEXT: [[RES:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 16 ; CHECK-NEXT: ret ptr [[RES]] ; %tmp = insertelement <vscale x 2 x i64> poison, i64 4, i32 0 @@ -74,7 +74,7 @@ define ptr @vector_splat_ptrs_nxv2i64_ext0(ptr %a, i64 %index) { define ptr @vector_struct1_splat_indices_v4i64_ext1(ptr %a) { ; CHECK-LABEL: @vector_struct1_splat_indices_v4i64_ext1( -; CHECK-NEXT: [[RES:%.*]] = getelementptr { float, float }, ptr [[A:%.*]], i64 4, i32 0 +; CHECK-NEXT: [[RES:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 32 ; CHECK-NEXT: ret ptr [[RES]] ; %gep = getelementptr {float, float}, ptr %a, <4 x i32> <i32 4, i32 4, i32 4, i32 4>, i32 0 @@ -85,7 +85,7 @@ define ptr @vector_struct1_splat_indices_v4i64_ext1(ptr %a) { define ptr @vector_struct2_splat_indices_v4i64_ext1(ptr %a) { ; CHECK-LABEL: @vector_struct2_splat_indices_v4i64_ext1( -; CHECK-NEXT: [[RES:%.*]] = getelementptr { float, [8 x float] }, ptr [[A:%.*]], i64 2, i32 1, i64 4 +; CHECK-NEXT: [[RES:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 92 ; CHECK-NEXT: ret ptr [[RES]] ; %gep = getelementptr {float, [8 x float]}, ptr %a, i32 2, i32 1, <4 x i32> <i32 4, i32 4, i32 4, i32 4> diff --git a/llvm/test/Transforms/InstCombine/gep-vector.ll b/llvm/test/Transforms/InstCombine/gep-vector.ll index 9ebb245325b14..3465c620b70fe 100644 --- a/llvm/test/Transforms/InstCombine/gep-vector.ll +++ b/llvm/test/Transforms/InstCombine/gep-vector.ll @@ -136,7 +136,7 @@ define ptr @test_accumulate_constant_offset_vscale_nonzero(<vscale x 16 x i1> %p define ptr @test_accumulate_constant_offset_vscale_zero(<vscale x 16 x i1> %pg, ptr %base) { ; CHECK-LABEL: @test_accumulate_constant_offset_vscale_zero( -; CHECK-NEXT: [[GEP:%.*]] = getelementptr <vscale x 16 x i8>, ptr [[BASE:%.*]], i64 0, i64 4 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 4 ;
CHECK-NEXT: ret ptr [[GEP]] ; %gep = getelementptr <vscale x 16 x i8>, ptr %base, i64 0, i64 4 diff --git a/llvm/test/Transforms/InstCombine/gepphigep.ll b/llvm/test/Transforms/InstCombine/gepphigep.ll index bc2d8201a36a3..93266c6754396 100644 --- a/llvm/test/Transforms/InstCombine/gepphigep.ll +++ b/llvm/test/Transforms/InstCombine/gepphigep.ll @@ -54,7 +54,7 @@ define i32 @test2(ptr %dm, i1 %tmp4, i64 %tmp9, i64 %tmp19) { ; CHECK-NEXT: store i32 0, ptr [[TMP10]], align 4 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds [[STRUCT2]], ptr [[TMP1]], i64 [[TMP19:%.*]] ; CHECK-NEXT: store i32 0, ptr [[TMP20]], align 4 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds [[STRUCT2]], ptr [[TMP1]], i64 [[TMP9]], i32 1 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 4 ; CHECK-NEXT: [[TMP25:%.*]] = load i32, ptr [[TMP24]], align 4 ; CHECK-NEXT: ret i32 [[TMP25]] ; diff --git a/llvm/test/Transforms/InstCombine/getelementptr.ll b/llvm/test/Transforms/InstCombine/getelementptr.ll index 642c3eb2a0e41..08931eb7f02b8 100644 --- a/llvm/test/Transforms/InstCombine/getelementptr.ll +++ b/llvm/test/Transforms/InstCombine/getelementptr.ll @@ -42,7 +42,7 @@ define ptr @test2(ptr %I) { ; Test that two array indexing geps fold define ptr @test3(ptr %I) { ; CHECK-LABEL: @test3( -; CHECK-NEXT: [[B:%.*]] = getelementptr i32, ptr [[I:%.*]], i64 21 +; CHECK-NEXT: [[B:%.*]] = getelementptr i8, ptr [[I:%.*]], i64 84 ; CHECK-NEXT: ret ptr [[B]] ; %A = getelementptr i32, ptr %I, i64 17 @@ -53,7 +53,7 @@ define ptr @test3(ptr %I) { ; Test that two getelementptr insts fold define ptr @test4(ptr %I) { ; CHECK-LABEL: @test4( -; CHECK-NEXT: [[A:%.*]] = getelementptr { i32 }, ptr [[I:%.*]], i64 1 +; CHECK-NEXT: [[A:%.*]] = getelementptr i8, ptr [[I:%.*]], i64 4 ; CHECK-NEXT: ret ptr [[A]] ; %A = getelementptr { i32 }, ptr %I, i64 1 @@ -162,8 +162,8 @@ define i1 @test10(ptr %x, ptr %y) { define i1 @test10_addrspacecast(ptr %x, ptr addrspace(3) %y) { ; CHECK-LABEL: @test10_addrspacecast( -; CHECK-NEXT: [[T1:%.*]] = getelementptr { i32, i32 }, ptr [[X:%.*]], i64 0, i32 1 -; CHECK-NEXT: [[T3:%.*]] = getelementptr { i32, i32 }, ptr addrspace(3) [[Y:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[T1:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 4 +; CHECK-NEXT: [[T3:%.*]] = getelementptr i8, ptr addrspace(3) [[Y:%.*]], i64 4 ; CHECK-NEXT: [[T3_C:%.*]] = addrspacecast ptr addrspace(3) [[T3]] to ptr ; CHECK-NEXT: [[T4:%.*]] = icmp eq ptr [[T1]], [[T3_C]] ; CHECK-NEXT: ret i1 [[T4]] @@ -190,7 +190,7 @@ define i1 @test11(ptr %X) { define i32 @test12(ptr %a) { ; CHECK-LABEL: @test12( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[G3:%.*]] = getelementptr [[STRUCT_A:%.*]], ptr [[A:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[G3:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 8 ; CHECK-NEXT: store i32 10, ptr [[G3]], align 4 ; CHECK-NEXT: ret i32 10 ; @@ -596,7 +596,7 @@ define i32 @test27(ptr %to, ptr %from) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FROM_ADDR:%.*]] = alloca ptr, align 8 ; CHECK-NEXT: [[T344:%.*]] = load ptr, ptr [[FROM_ADDR]], align 8 -; CHECK-NEXT: [[T348:%.*]] = getelementptr [[STRUCT_SIGINFO_T:%.*]], ptr [[T344]], i64 0, i32 3, i32 0, i32 3 +; CHECK-NEXT: [[T348:%.*]] = getelementptr i8, ptr [[T344]], i64 24 ; CHECK-NEXT: [[T351:%.*]] = load i32, ptr [[T348]], align 8 ; CHECK-NEXT: [[T360:%.*]] = call i32 asm sideeffect "...", "=r,ir,*m,i,0,~{dirflag},~{fpsr},~{flags}"(i32 [[T351]], ptr elementtype([[STRUCT___LARGE_STRUCT:%.*]]) null, i32 -14, i32 0) #[[ATTR0:[0-9]+]] ; CHECK-NEXT: unreachable @@ -623,14 +623,15 @@ define i32 @test28()
nounwind { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ORIENTATIONS:%.*]] = alloca [1 x [1 x %struct.x]], align 8 ; CHECK-NEXT: [[T3:%.*]] = call i32 @puts(ptr noundef nonnull dereferenceable(1) @.str) #[[ATTR0]] +; CHECK-NEXT: [[T45:%.*]] = getelementptr inbounds i8, ptr [[ORIENTATIONS]], i64 1 ; CHECK-NEXT: br label [[BB10:%.*]] ; CHECK: bb10: ; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[BB10]] ] ; CHECK-NEXT: [[T12_REC:%.*]] = xor i32 [[INDVAR]], -1 ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[T12_REC]] to i64 -; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds [1 x [1 x %struct.x]], ptr [[ORIENTATIONS]], i64 1, i64 0, i64 [[TMP0]] +; CHECK-NEXT: [[T12:%.*]] = getelementptr inbounds [[STRUCT_X:%.*]], ptr [[T45]], i64 [[TMP0]] ; CHECK-NEXT: [[T16:%.*]] = call i32 (ptr, ...) @printf(ptr noundef nonnull dereferenceable(1) @.str1, ptr nonnull [[T12]]) #[[ATTR0]] -; CHECK-NEXT: [[T84:%.*]] = icmp eq i32 [[INDVAR]], 0 +; CHECK-NEXT: [[T84:%.*]] = icmp eq ptr [[T12]], [[ORIENTATIONS]] ; CHECK-NEXT: [[INDVAR_NEXT]] = add i32 [[INDVAR]], 1 ; CHECK-NEXT: br i1 [[T84]], label [[BB17:%.*]], label [[BB10]] ; CHECK: bb17: @@ -732,9 +733,9 @@ define ptr @test32(ptr %v) { ; CHECK-LABEL: @test32( ; CHECK-NEXT: [[A:%.*]] = alloca [4 x ptr], align 16 ; CHECK-NEXT: store ptr null, ptr [[A]], align 8 -; CHECK-NEXT: [[D:%.*]] = getelementptr inbounds { [16 x i8] }, ptr [[A]], i64 0, i32 0, i64 8 +; CHECK-NEXT: [[D:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8 ; CHECK-NEXT: store ptr [[V:%.*]], ptr [[D]], align 8 -; CHECK-NEXT: [[F:%.*]] = getelementptr inbounds [4 x ptr], ptr [[A]], i64 0, i64 2 +; CHECK-NEXT: [[F:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[G:%.*]] = load ptr, ptr [[F]], align 8 ; CHECK-NEXT: ret ptr [[G]] ; @@ -753,7 +754,7 @@ define ptr @test32(ptr %v) { define ptr @test33(ptr %A) { ; CHECK-LABEL: @test33( -; CHECK-NEXT: [[C:%.*]] = getelementptr [[STRUCT_ANON:%.*]], ptr [[A:%.*]], i64 0, i32 2 +; CHECK-NEXT: [[C:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4 ; CHECK-NEXT: ret ptr [[C]] ; %C = getelementptr %struct.anon, ptr %A, i32 0, i32 2 @@ -762,7 +763,7 @@ define ptr @test33(ptr %A) { define ptr addrspace(1) @test33_as1(ptr addrspace(1) %A) { ; CHECK-LABEL: @test33_as1( -; CHECK-NEXT: [[C:%.*]] = getelementptr [[STRUCT_ANON:%.*]], ptr addrspace(1) [[A:%.*]], i16 0, i32 2 +; CHECK-NEXT: [[C:%.*]] = getelementptr i8, ptr addrspace(1) [[A:%.*]], i16 4 ; CHECK-NEXT: ret ptr addrspace(1) [[C]] ; %C = getelementptr %struct.anon, ptr addrspace(1) %A, i32 0, i32 2 @@ -771,7 +772,7 @@ define ptr addrspace(1) @test33_as1(ptr addrspace(1) %A) { define ptr addrspace(1) @test33_array_as1(ptr addrspace(1) %A) { ; CHECK-LABEL: @test33_array_as1( -; CHECK-NEXT: [[C:%.*]] = getelementptr [5 x i32], ptr addrspace(1) [[A:%.*]], i16 0, i16 2 +; CHECK-NEXT: [[C:%.*]] = getelementptr i8, ptr addrspace(1) [[A:%.*]], i16 8 ; CHECK-NEXT: ret ptr addrspace(1) [[C]] ; %C = getelementptr [5 x i32], ptr addrspace(1) %A, i32 0, i32 2 @@ -781,7 +782,7 @@ define ptr addrspace(1) @test33_array_as1(ptr addrspace(1) %A) { ; Make sure the GEP indices use the right pointer sized integer define ptr addrspace(1) @test33_array_struct_as1(ptr addrspace(1) %A) { ; CHECK-LABEL: @test33_array_struct_as1( -; CHECK-NEXT: [[C:%.*]] = getelementptr [20 x i32], ptr addrspace(1) [[A:%.*]], i16 0, i16 2 +; CHECK-NEXT: [[C:%.*]] = getelementptr i8, ptr addrspace(1) [[A:%.*]], i16 8 ; CHECK-NEXT: ret ptr addrspace(1) [[C]] ; %C = getelementptr [20 x i32], ptr addrspace(1) %A, 
i32 0, i32 2 @@ -791,7 +792,7 @@ define ptr addrspace(1) @test33_array_struct_as1(ptr addrspace(1) %A) { define ptr addrspace(1) @test33_addrspacecast(ptr %A) { ; CHECK-LABEL: @test33_addrspacecast( ; CHECK-NEXT: [[B:%.*]] = addrspacecast ptr [[A:%.*]] to ptr addrspace(1) -; CHECK-NEXT: [[C:%.*]] = getelementptr [[STRUCT_ANON:%.*]], ptr addrspace(1) [[B]], i16 0, i32 2 +; CHECK-NEXT: [[C:%.*]] = getelementptr i8, ptr addrspace(1) [[B]], i16 4 ; CHECK-NEXT: ret ptr addrspace(1) [[C]] ; %B = addrspacecast ptr %A to ptr addrspace(1) @@ -871,7 +872,7 @@ declare void @pr10322_f3(ptr) define void @pr10322_f1(ptr %foo) { ; CHECK-LABEL: @pr10322_f1( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds [[PR10322_T:%.*]], ptr [[FOO:%.*]], i64 2 +; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 16 ; CHECK-NEXT: call void @pr10322_f2(ptr nonnull [[ARRAYIDX8]]) #[[ATTR0]] ; CHECK-NEXT: call void @pr10322_f3(ptr nonnull [[ARRAYIDX8]]) #[[ATTR0]] ; CHECK-NEXT: ret void @@ -891,7 +892,7 @@ entry: define void @three_gep_f(ptr %x) { ; CHECK-LABEL: @three_gep_f( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr [[THREE_GEP_T2:%.*]], ptr [[X:%.*]], i64 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[X:%.*]], i64 8 ; CHECK-NEXT: call void @three_gep_h(ptr [[GEP1]]) ; CHECK-NEXT: call void @three_gep_g(ptr [[GEP1]]) ; CHECK-NEXT: ret void @@ -911,7 +912,7 @@ declare void @three_gep_h(ptr) define void @test39(ptr %arg, i8 %arg1) nounwind { ; CHECK-LABEL: @test39( -; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds [[STRUCT_HAM:%.*]], ptr [[ARG:%.*]], i64 0, i32 2 +; CHECK-NEXT: [[T:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i64 16 ; CHECK-NEXT: [[T2:%.*]] = load ptr, ptr [[T]], align 8 ; CHECK-NEXT: [[T4:%.*]] = getelementptr inbounds i8, ptr [[T2]], i64 -8 ; CHECK-NEXT: store i8 [[ARG1:%.*]], ptr [[T4]], align 8 @@ -1162,7 +1163,7 @@ define ptr @test48(ptr %I, i64 %C, i64 %D) { define ptr @test49(ptr %I, i64 %C) { ; CHECK-LABEL: @test49( -; CHECK-NEXT: [[B:%.*]] = getelementptr i32, ptr [[I:%.*]], i64 -1 +; CHECK-NEXT: [[B:%.*]] = getelementptr i8, ptr [[I:%.*]], i64 -4 ; CHECK-NEXT: ret ptr [[B]] ; %notC = xor i64 -1, %C @@ -1229,7 +1230,7 @@ define ptr @test_nzgep_zgep(ptr %base, i64 %idx) { define ptr @test_gep_inbounds_of_gep(ptr %base) { ; CHECK-LABEL: @test_gep_inbounds_of_gep( -; CHECK-NEXT: [[PTR2:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 8 +; CHECK-NEXT: [[PTR2:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 32 ; CHECK-NEXT: ret ptr [[PTR2]] ; %ptr1 = getelementptr i32, ptr %base, i64 4 @@ -1318,7 +1319,7 @@ define i32 @test_gep_bitcast_malloc(ptr %a) { ; CHECK-LABEL: @test_gep_bitcast_malloc( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[CALL:%.*]] = call noalias dereferenceable_or_null(16) ptr @malloc(i64 16) -; CHECK-NEXT: [[G3:%.*]] = getelementptr [[STRUCT_A:%.*]], ptr [[CALL]], i64 0, i32 2 +; CHECK-NEXT: [[G3:%.*]] = getelementptr i8, ptr [[CALL]], i64 12 ; CHECK-NEXT: [[A_C:%.*]] = load i32, ptr [[G3]], align 4 ; CHECK-NEXT: ret i32 [[A_C]] ; @@ -1331,9 +1332,9 @@ entry: define ptr @gep_of_gep_multiuse_const_and_const(ptr %p, i64 %idx) { ; CHECK-LABEL: @gep_of_gep_multiuse_const_and_const( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr { i32, i32 }, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 8 ; CHECK-NEXT: call void @use(ptr [[GEP1]]) -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr { i32, i32 }, ptr [[P]], i64 1, i32 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[P]], i64 12 ; 
CHECK-NEXT: ret ptr [[GEP2]] ; %gep1 = getelementptr { i32, i32 }, ptr %p, i64 1 @@ -1346,7 +1347,7 @@ define ptr @gep_of_gep_multiuse_var_and_const(ptr %p, i64 %idx) { ; CHECK-LABEL: @gep_of_gep_multiuse_var_and_const( ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr { i32, i32 }, ptr [[P:%.*]], i64 [[IDX:%.*]] ; CHECK-NEXT: call void @use(ptr [[GEP1]]) -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr { i32, i32 }, ptr [[P]], i64 [[IDX]], i32 1 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[GEP1]], i64 4 ; CHECK-NEXT: ret ptr [[GEP2]] ; %gep1 = getelementptr { i32, i32 }, ptr %p, i64 %idx diff --git a/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll b/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll index 330825e1055ea..491f214671675 100644 --- a/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll +++ b/llvm/test/Transforms/InstCombine/icmp-custom-dl.ll @@ -7,7 +7,7 @@ declare i32 @test58_d(i64 ) define i1 @test59(ptr %foo) { ; CHECK-LABEL: @test59( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i32 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i32 8 ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr [[GEP1]] to i40 ; CHECK-NEXT: [[USE:%.*]] = zext i40 [[TMP1]] to i64 ; CHECK-NEXT: [[CALL:%.*]] = call i32 @test58_d(i64 [[USE]]) @@ -23,7 +23,7 @@ define i1 @test59(ptr %foo) { define i1 @test59_as1(ptr addrspace(1) %foo) { ; CHECK-LABEL: @test59_as1( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[FOO:%.*]], i16 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[FOO:%.*]], i16 8 ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[GEP1]] to i16 ; CHECK-NEXT: [[USE:%.*]] = zext i16 [[TMP1]] to i64 ; CHECK-NEXT: [[CALL:%.*]] = call i32 @test58_d(i64 [[USE]]) diff --git a/llvm/test/Transforms/InstCombine/icmp-gep.ll b/llvm/test/Transforms/InstCombine/icmp-gep.ll index fb297e04ceb06..d912f962354fb 100644 --- a/llvm/test/Transforms/InstCombine/icmp-gep.ll +++ b/llvm/test/Transforms/InstCombine/icmp-gep.ll @@ -244,7 +244,7 @@ declare i32 @test58_d(i64) define i1 @test59(ptr %foo) { ; CHECK-LABEL: @test59( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 8 ; CHECK-NEXT: [[USE:%.*]] = ptrtoint ptr [[GEP1]] to i64 ; CHECK-NEXT: [[CALL:%.*]] = call i32 @test58_d(i64 [[USE]]) ; CHECK-NEXT: ret i1 true @@ -259,7 +259,7 @@ define i1 @test59(ptr %foo) { define i1 @test59_as1(ptr addrspace(1) %foo) { ; CHECK-LABEL: @test59_as1( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[FOO:%.*]], i16 2 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[FOO:%.*]], i16 8 ; CHECK-NEXT: [[TMP1:%.*]] = ptrtoint ptr addrspace(1) [[GEP1]] to i16 ; CHECK-NEXT: [[USE:%.*]] = zext i16 [[TMP1]] to i64 ; CHECK-NEXT: [[CALL:%.*]] = call i32 @test58_d(i64 [[USE]]) @@ -417,7 +417,7 @@ define i1 @test60_extra_use(ptr %foo, i64 %i, i64 %j) { define i1 @test60_extra_use_const_operands_inbounds(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60_extra_use_const_operands_inbounds( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[FOO:%.*]], i64 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 4 ; CHECK-NEXT: call void @use(ptr nonnull [[GEP1]]) ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[J:%.*]], 2 ; CHECK-NEXT: ret i1 [[CMP]] @@ -431,7 +431,7 @@ define i1 @test60_extra_use_const_operands_inbounds(ptr %foo, i64 %i, i64 %j) { 
define i1 @test60_extra_use_const_operands_no_inbounds(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60_extra_use_const_operands_no_inbounds( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[FOO:%.*]], i64 1 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[FOO:%.*]], i64 4 ; CHECK-NEXT: call void @use(ptr [[GEP1]]) ; CHECK-NEXT: [[GEP2_IDX_MASK:%.*]] = and i64 [[J:%.*]], 9223372036854775807 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[GEP2_IDX_MASK]], 2 diff --git a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll index 61cf727fb6c8c..2b5b3fce70535 100644 --- a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll +++ b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll @@ -73,8 +73,8 @@ define ptr @test3_no_inbounds1(ptr %A, i32 %Offset) { ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: ; CHECK-NEXT: [[RHS:%.*]] = phi ptr [ [[RHS_NEXT:%.*]], [[BB]] ], [ [[TMP]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[LHS:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 100 -; CHECK-NEXT: [[RHS_NEXT]] = getelementptr inbounds i32, ptr [[RHS]], i32 1 +; CHECK-NEXT: [[LHS:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 400 +; CHECK-NEXT: [[RHS_NEXT]] = getelementptr inbounds i8, ptr [[RHS]], i32 4 ; CHECK-NEXT: [[COND:%.*]] = icmp ult ptr [[LHS]], [[RHS]] ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: @@ -102,8 +102,8 @@ define ptr @test3_no_inbounds2(ptr %A, i32 %Offset) { ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: ; CHECK-NEXT: [[RHS:%.*]] = phi ptr [ [[RHS_NEXT:%.*]], [[BB]] ], [ [[TMP]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[LHS:%.*]] = getelementptr inbounds i32, ptr [[A]], i32 100 -; CHECK-NEXT: [[RHS_NEXT]] = getelementptr i32, ptr [[RHS]], i32 1 +; CHECK-NEXT: [[LHS:%.*]] = getelementptr inbounds i8, ptr [[A]], i32 400 +; CHECK-NEXT: [[RHS_NEXT]] = getelementptr i8, ptr [[RHS]], i32 4 ; CHECK-NEXT: [[COND:%.*]] = icmp ult ptr [[LHS]], [[RHS]] ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: @@ -131,8 +131,8 @@ define ptr @test3_no_inbounds3(ptr %A, i32 %Offset) { ; CHECK-NEXT: br label [[BB:%.*]] ; CHECK: bb: ; CHECK-NEXT: [[RHS:%.*]] = phi ptr [ [[RHS_NEXT:%.*]], [[BB]] ], [ [[TMP]], [[ENTRY:%.*]] ] -; CHECK-NEXT: [[LHS:%.*]] = getelementptr i32, ptr [[A]], i32 100 -; CHECK-NEXT: [[RHS_NEXT]] = getelementptr inbounds i32, ptr [[RHS]], i32 1 +; CHECK-NEXT: [[LHS:%.*]] = getelementptr i8, ptr [[A]], i32 400 +; CHECK-NEXT: [[RHS_NEXT]] = getelementptr inbounds i8, ptr [[RHS]], i32 4 ; CHECK-NEXT: [[COND:%.*]] = icmp ult ptr [[LHS]], [[RHS]] ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: @@ -318,7 +318,7 @@ define i1 @test8(ptr %in, i64 %offset) { ; CHECK-NEXT: [[GEPI8:%.*]] = getelementptr inbounds i8, ptr [[CASTI8]], i32 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = trunc i64 [[LD]] to i32 ; CHECK-NEXT: [[PTRCAST:%.*]] = inttoptr i32 [[TMP2]] to ptr -; CHECK-NEXT: [[GEPI32:%.*]] = getelementptr inbounds ptr, ptr [[PTRCAST]], i32 1 +; CHECK-NEXT: [[GEPI32:%.*]] = getelementptr inbounds i8, ptr [[PTRCAST]], i32 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[GEPI32]], [[GEPI8]] ; CHECK-NEXT: ret i1 [[CMP]] ; @@ -335,7 +335,7 @@ entry: define void @test_zero_offset_cycle(ptr %arg) { ; CHECK-LABEL: @test_zero_offset_cycle( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds { i64, i64 }, ptr [[ARG:%.*]], i32 0, i32 1 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[ARG:%.*]], i32 8 ; CHECK-NEXT: [[GEP_INT:%.*]] = 
ptrtoint ptr [[GEP]] to i32 ; CHECK-NEXT: br label [[LOOP:%.*]] ; CHECK: loop: diff --git a/llvm/test/Transforms/InstCombine/intptr1.ll b/llvm/test/Transforms/InstCombine/intptr1.ll index fcd97b0cf8b0b..2bf255341f4bb 100644 --- a/llvm/test/Transforms/InstCombine/intptr1.ll +++ b/llvm/test/Transforms/InstCombine/intptr1.ll @@ -16,8 +16,8 @@ define void @test1(ptr %a, ptr readnone %a_end, ptr %b.i64) { ; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[B_ADDR_02_PTR]], align 4 ; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[I1]], 4.200000e+01 ; CHECK-NEXT: store float [[MUL_I]], ptr [[A_ADDR_03]], align 4 -; CHECK-NEXT: [[ADD]] = getelementptr inbounds float, ptr [[B_ADDR_02_PTR]], i64 1 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[A_ADDR_03]], i64 1 +; CHECK-NEXT: [[ADD]] = getelementptr inbounds i8, ptr [[B_ADDR_02_PTR]], i64 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_03]], i64 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[INCDEC_PTR]], [[A_END]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: @@ -68,8 +68,8 @@ define void @test1_neg(ptr %a, ptr readnone %a_end, ptr %b.i64) { ; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[A]], align 4 ; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[I1]], 4.200000e+01 ; CHECK-NEXT: store float [[MUL_I]], ptr [[A_ADDR_03]], align 4 -; CHECK-NEXT: [[ADD]] = getelementptr inbounds float, ptr [[A]], i64 1 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[A_ADDR_03]], i64 1 +; CHECK-NEXT: [[ADD]] = getelementptr inbounds i8, ptr [[A]], i64 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_03]], i64 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[INCDEC_PTR]], [[A_END]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: @@ -122,8 +122,8 @@ define void @test2(ptr %a, ptr readnone %a_end, ptr %b.float) { ; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[B_ADDR_02_PTR]], align 4 ; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[I1]], 4.200000e+01 ; CHECK-NEXT: store float [[MUL_I]], ptr [[A_ADDR_03]], align 4 -; CHECK-NEXT: [[ADD]] = getelementptr inbounds float, ptr [[B_ADDR_02_PTR]], i64 1 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[A_ADDR_03]], i64 1 +; CHECK-NEXT: [[ADD]] = getelementptr inbounds i8, ptr [[B_ADDR_02_PTR]], i64 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_03]], i64 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[INCDEC_PTR]], [[A_END]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: @@ -172,8 +172,8 @@ define void @test3(ptr %a, ptr readnone %a_end, ptr %b.i8p) { ; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[B_ADDR_02_PTR]], align 4 ; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[I1]], 4.200000e+01 ; CHECK-NEXT: store float [[MUL_I]], ptr [[A_ADDR_03]], align 4 -; CHECK-NEXT: [[ADD]] = getelementptr inbounds float, ptr [[B_ADDR_02_PTR]], i64 1 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[A_ADDR_03]], i64 1 +; CHECK-NEXT: [[ADD]] = getelementptr inbounds i8, ptr [[B_ADDR_02_PTR]], i64 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_03]], i64 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[INCDEC_PTR]], [[A_END]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: @@ -221,8 +221,8 @@ define void @test4(ptr %a, ptr readnone %a_end, ptr %b.float) { ; CHECK-NEXT: [[I1:%.*]] = load float, ptr [[B_ADDR_02_IN]], align 4 ; CHECK-NEXT: [[MUL_I:%.*]] = fmul float 
[[I1]], 4.200000e+01 ; CHECK-NEXT: store float [[MUL_I]], ptr [[A_ADDR_03]], align 4 -; CHECK-NEXT: [[ADD]] = getelementptr inbounds float, ptr [[B_ADDR_02_IN]], i64 1 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[A_ADDR_03]], i64 1 +; CHECK-NEXT: [[ADD]] = getelementptr inbounds i8, ptr [[B_ADDR_02_IN]], i64 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_03]], i64 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[INCDEC_PTR]], [[A_END]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/InstCombine/intptr2.ll b/llvm/test/Transforms/InstCombine/intptr2.ll index 6ace82bc8e09e..c256171142b91 100644 --- a/llvm/test/Transforms/InstCombine/intptr2.ll +++ b/llvm/test/Transforms/InstCombine/intptr2.ll @@ -15,8 +15,8 @@ define void @test1(ptr %a, ptr readnone %a_end, ptr %b.i) { ; CHECK-NEXT: [[TMP1:%.*]] = load float, ptr [[B_ADDR_02_IN]], align 4 ; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[TMP1]], 4.200000e+01 ; CHECK-NEXT: store float [[MUL_I]], ptr [[A_ADDR_03]], align 4 -; CHECK-NEXT: [[ADD]] = getelementptr inbounds float, ptr [[B_ADDR_02_IN]], i64 1 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[A_ADDR_03]], i64 1 +; CHECK-NEXT: [[ADD]] = getelementptr inbounds i8, ptr [[B_ADDR_02_IN]], i64 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_03]], i64 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[INCDEC_PTR]], [[A_END]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/InstCombine/intptr3.ll b/llvm/test/Transforms/InstCombine/intptr3.ll index 0937d6d377b1e..3bb3c7dcb4e1a 100644 --- a/llvm/test/Transforms/InstCombine/intptr3.ll +++ b/llvm/test/Transforms/InstCombine/intptr3.ll @@ -15,8 +15,8 @@ define void @test(ptr %a, ptr readnone %a_end, i64 %b) unnamed_addr { ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[B_ADDR_FLOAT]], align 4 ; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[L]], 4.200000e+01 ; CHECK-NEXT: store float [[MUL_I]], ptr [[A_ADDR_03]], align 4 -; CHECK-NEXT: [[B_ADDR_FLOAT_INC]] = getelementptr inbounds float, ptr [[B_ADDR_FLOAT]], i64 1 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[A_ADDR_03]], i64 1 +; CHECK-NEXT: [[B_ADDR_FLOAT_INC]] = getelementptr inbounds i8, ptr [[B_ADDR_FLOAT]], i64 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_03]], i64 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[INCDEC_PTR]], [[A_END]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/InstCombine/intptr4.ll b/llvm/test/Transforms/InstCombine/intptr4.ll index 612fef0c1837d..5998501d82f4f 100644 --- a/llvm/test/Transforms/InstCombine/intptr4.ll +++ b/llvm/test/Transforms/InstCombine/intptr4.ll @@ -22,8 +22,8 @@ define void @test(ptr %a, ptr readnone %a_end, i64 %b, ptr %bf) unnamed_addr { ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[B_ADDR_FLOAT]], align 4 ; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[L]], 4.200000e+01 ; CHECK-NEXT: store float [[MUL_I]], ptr [[A_ADDR_03]], align 4 -; CHECK-NEXT: [[B_ADDR_FLOAT_INC]] = getelementptr inbounds float, ptr [[B_ADDR_I64_PTR]], i64 1 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[A_ADDR_03]], i64 1 +; CHECK-NEXT: [[B_ADDR_FLOAT_INC]] = getelementptr inbounds i8, ptr [[B_ADDR_I64_PTR]], i64 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_03]], i64 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult 
ptr [[INCDEC_PTR]], [[A_END]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/InstCombine/intptr5.ll b/llvm/test/Transforms/InstCombine/intptr5.ll index 1f493d6a3a135..f992886208f67 100644 --- a/llvm/test/Transforms/InstCombine/intptr5.ll +++ b/llvm/test/Transforms/InstCombine/intptr5.ll @@ -24,8 +24,8 @@ define void @test(ptr %a, ptr readnone %a_end, i64 %b, ptr %bf) unnamed_addr { ; CHECK-NEXT: [[L:%.*]] = load float, ptr [[B_ADDR_FLOAT]], align 4 ; CHECK-NEXT: [[MUL_I:%.*]] = fmul float [[L]], 4.200000e+01 ; CHECK-NEXT: store float [[MUL_I]], ptr [[A_ADDR_03]], align 4 -; CHECK-NEXT: [[B_ADDR_FLOAT_INC]] = getelementptr inbounds float, ptr [[B_ADDR_I64_PTR]], i64 1 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[A_ADDR_03]], i64 1 +; CHECK-NEXT: [[B_ADDR_FLOAT_INC]] = getelementptr inbounds i8, ptr [[B_ADDR_I64_PTR]], i64 4 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_03]], i64 4 ; CHECK-NEXT: [[CMP:%.*]] = icmp ult ptr [[INCDEC_PTR]], [[A_END]] ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_END:%.*]] ; CHECK: for.end: diff --git a/llvm/test/Transforms/InstCombine/intptr7.ll b/llvm/test/Transforms/InstCombine/intptr7.ll index 78e0b59ee0e23..18fdd208e99a0 100644 --- a/llvm/test/Transforms/InstCombine/intptr7.ll +++ b/llvm/test/Transforms/InstCombine/intptr7.ll @@ -6,7 +6,7 @@ define void @matching_phi(i64 %a, ptr %b, i1 %cond) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[COND:%.*]], label [[BB2:%.*]], label [[BB1:%.*]] ; CHECK: bb1: -; CHECK-NEXT: [[ADDB:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 2 +; CHECK-NEXT: [[ADDB:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 8 ; CHECK-NEXT: br label [[BB3:%.*]] ; CHECK: bb2: ; CHECK-NEXT: [[ADD_INT:%.*]] = add i64 [[A:%.*]], 1 @@ -48,7 +48,7 @@ define void @no_matching_phi(i64 %a, ptr %b, i1 %cond) { ; CHECK-LABEL: @no_matching_phi( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ADD_INT:%.*]] = add i64 [[A:%.*]], 1 -; CHECK-NEXT: [[ADDB:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 2 +; CHECK-NEXT: [[ADDB:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 8 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[B:%.*]], label [[A:%.*]] ; CHECK: A: ; CHECK-NEXT: [[TMP0:%.*]] = inttoptr i64 [[ADD_INT]] to ptr diff --git a/llvm/test/Transforms/InstCombine/load-store-forward.ll b/llvm/test/Transforms/InstCombine/load-store-forward.ll index 6be5f6ed42d53..aa593b95b3601 100644 --- a/llvm/test/Transforms/InstCombine/load-store-forward.ll +++ b/llvm/test/Transforms/InstCombine/load-store-forward.ll @@ -63,7 +63,7 @@ define i32 @vec_store_load_first_constexpr(ptr %p) { define i32 @vec_store_load_second(ptr %p) { ; CHECK-LABEL: @vec_store_load_second( ; CHECK-NEXT: store <2 x i32> , ptr [[P:%.*]], align 8 -; CHECK-NEXT: [[P3:%.*]] = getelementptr i32, ptr [[P]], i64 1 +; CHECK-NEXT: [[P3:%.*]] = getelementptr i8, ptr [[P]], i64 4 ; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[P3]], align 4 ; CHECK-NEXT: ret i32 [[LOAD]] ; diff --git a/llvm/test/Transforms/InstCombine/load.ll b/llvm/test/Transforms/InstCombine/load.ll index fb6c50d5fd8fc..7d53c8ee35684 100644 --- a/llvm/test/Transforms/InstCombine/load.ll +++ b/llvm/test/Transforms/InstCombine/load.ll @@ -136,7 +136,7 @@ C: ; preds = %F, %T define double @test11(ptr %p) { ; CHECK-LABEL: @test11( -; CHECK-NEXT: [[T0:%.*]] = getelementptr double, ptr [[P:%.*]], i64 1 +; CHECK-NEXT: [[T0:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 8 ; CHECK-NEXT: store double 
2.000000e+00, ptr [[T0]], align 8 ; CHECK-NEXT: ret double 2.000000e+00 ; diff --git a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll index e7ad7600a3f5f..b9a96937e57c7 100644 --- a/llvm/test/Transforms/InstCombine/loadstore-metadata.ll +++ b/llvm/test/Transforms/InstCombine/loadstore-metadata.ll @@ -152,7 +152,7 @@ define void @test_load_cast_combine_nonnull(ptr %ptr) { ; CHECK-LABEL: @test_load_cast_combine_nonnull( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P:%.*]] = load ptr, ptr [[PTR:%.*]], align 8, !nonnull !6 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr ptr, ptr [[PTR]], i64 42 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[PTR]], i64 336 ; CHECK-NEXT: store ptr [[P]], ptr [[GEP]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/memchr-5.ll b/llvm/test/Transforms/InstCombine/memchr-5.ll index 40c6b28a30136..010c83719b8b6 100644 --- a/llvm/test/Transforms/InstCombine/memchr-5.ll +++ b/llvm/test/Transforms/InstCombine/memchr-5.ll @@ -16,37 +16,37 @@ declare ptr @memchr(ptr, i32, i64) define void @fold_memchr_a(ptr %pcmp) { ; BE-LABEL: @fold_memchr_a( ; BE-NEXT: store i64 0, ptr [[PCMP:%.*]], align 4 -; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i64, ptr [[PCMP]], i64 1 +; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; BE-NEXT: store i64 1, ptr [[PSTOR1]], align 4 -; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i64, ptr [[PCMP]], i64 2 +; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; BE-NEXT: store i64 2, ptr [[PSTOR2]], align 4 -; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i64, ptr [[PCMP]], i64 3 +; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; BE-NEXT: store i64 3, ptr [[PSTOR3]], align 4 -; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i64, ptr [[PCMP]], i64 4 +; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32 ; BE-NEXT: store i64 13, ptr [[PSTOR4]], align 4 -; BE-NEXT: [[PSTOR6:%.*]] = getelementptr i64, ptr [[PCMP]], i64 6 +; BE-NEXT: [[PSTOR6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 48 ; BE-NEXT: store i64 14, ptr [[PSTOR6]], align 4 -; BE-NEXT: [[PSTOR7:%.*]] = getelementptr i64, ptr [[PCMP]], i64 7 +; BE-NEXT: [[PSTOR7:%.*]] = getelementptr i8, ptr [[PCMP]], i64 56 ; BE-NEXT: store i64 15, ptr [[PSTOR7]], align 4 -; BE-NEXT: [[PSTOR8:%.*]] = getelementptr i64, ptr [[PCMP]], i64 8 +; BE-NEXT: [[PSTOR8:%.*]] = getelementptr i8, ptr [[PCMP]], i64 64 ; BE-NEXT: store i64 0, ptr [[PSTOR8]], align 4 ; BE-NEXT: ret void ; ; LE-LABEL: @fold_memchr_a( ; LE-NEXT: store i64 3, ptr [[PCMP:%.*]], align 4 -; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i64, ptr [[PCMP]], i64 1 +; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; LE-NEXT: store i64 2, ptr [[PSTOR1]], align 4 -; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i64, ptr [[PCMP]], i64 2 +; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; LE-NEXT: store i64 1, ptr [[PSTOR2]], align 4 -; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i64, ptr [[PCMP]], i64 3 +; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; LE-NEXT: store i64 0, ptr [[PSTOR3]], align 4 -; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i64, ptr [[PCMP]], i64 4 +; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32 ; LE-NEXT: store i64 14, ptr [[PSTOR4]], align 4 -; LE-NEXT: [[PSTOR6:%.*]] = getelementptr i64, ptr [[PCMP]], i64 6 +; LE-NEXT: [[PSTOR6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 48 ; LE-NEXT: store i64 13, ptr [[PSTOR6]], align 4 -; LE-NEXT: [[PSTOR7:%.*]] 
= getelementptr i64, ptr [[PCMP]], i64 7 +; LE-NEXT: [[PSTOR7:%.*]] = getelementptr i8, ptr [[PCMP]], i64 56 ; LE-NEXT: store i64 12, ptr [[PSTOR7]], align 4 -; LE-NEXT: [[PSTOR8:%.*]] = getelementptr i64, ptr [[PCMP]], i64 8 +; LE-NEXT: [[PSTOR8:%.*]] = getelementptr i8, ptr [[PCMP]], i64 64 ; LE-NEXT: store i64 0, ptr [[PSTOR8]], align 4 ; LE-NEXT: ret void ; @@ -123,29 +123,29 @@ define void @fold_memchr_a(ptr %pcmp) { define void @fold_memchr_a_p1(ptr %pcmp) { ; BE-LABEL: @fold_memchr_a_p1( ; BE-NEXT: store i64 0, ptr [[PCMP:%.*]], align 4 -; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i64, ptr [[PCMP]], i64 1 +; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; BE-NEXT: store i64 1, ptr [[PSTOR1]], align 4 -; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i64, ptr [[PCMP]], i64 2 +; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; BE-NEXT: store i64 2, ptr [[PSTOR2]], align 4 -; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i64, ptr [[PCMP]], i64 3 +; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; BE-NEXT: store i64 3, ptr [[PSTOR3]], align 4 -; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i64, ptr [[PCMP]], i64 4 +; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32 ; BE-NEXT: store i64 0, ptr [[PSTOR4]], align 4 -; BE-NEXT: [[PSTOR5:%.*]] = getelementptr i64, ptr [[PCMP]], i64 5 +; BE-NEXT: [[PSTOR5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 40 ; BE-NEXT: store i64 0, ptr [[PSTOR5]], align 4 ; BE-NEXT: ret void ; ; LE-LABEL: @fold_memchr_a_p1( ; LE-NEXT: store i64 3, ptr [[PCMP:%.*]], align 4 -; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i64, ptr [[PCMP]], i64 1 +; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; LE-NEXT: store i64 2, ptr [[PSTOR1]], align 4 -; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i64, ptr [[PCMP]], i64 2 +; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; LE-NEXT: store i64 1, ptr [[PSTOR2]], align 4 -; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i64, ptr [[PCMP]], i64 3 +; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; LE-NEXT: store i64 0, ptr [[PSTOR3]], align 4 -; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i64, ptr [[PCMP]], i64 4 +; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32 ; LE-NEXT: store i64 0, ptr [[PSTOR4]], align 4 -; LE-NEXT: [[PSTOR5:%.*]] = getelementptr i64, ptr [[PCMP]], i64 5 +; LE-NEXT: [[PSTOR5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 40 ; LE-NEXT: store i64 0, ptr [[PSTOR5]], align 4 ; LE-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/memchr-9.ll b/llvm/test/Transforms/InstCombine/memchr-9.ll index e2c2e6839817c..fe80c282eed54 100644 --- a/llvm/test/Transforms/InstCombine/memchr-9.ll +++ b/llvm/test/Transforms/InstCombine/memchr-9.ll @@ -19,23 +19,23 @@ declare ptr @memchr(ptr, i32, i64) define void @fold_memchr_A_pIb_cst_cst(ptr %pchr) { ; CHECK-LABEL: @fold_memchr_A_pIb_cst_cst( ; CHECK-NEXT: store ptr @a, ptr [[PCHR:%.*]], align 8 -; CHECK-NEXT: [[PST_0_1_1:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 1 +; CHECK-NEXT: [[PST_0_1_1:%.*]] = getelementptr i8, ptr [[PCHR]], i64 8 ; CHECK-NEXT: store ptr null, ptr [[PST_0_1_1]], align 8 -; CHECK-NEXT: [[PST_0_4_4:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 2 +; CHECK-NEXT: [[PST_0_4_4:%.*]] = getelementptr i8, ptr [[PCHR]], i64 16 ; CHECK-NEXT: store ptr null, ptr [[PST_0_4_4]], align 8 -; CHECK-NEXT: [[PST_1_0_1:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 3 +; CHECK-NEXT: [[PST_1_0_1:%.*]] = getelementptr i8, ptr [[PCHR]], i64 24 ; CHECK-NEXT: store ptr 
getelementptr (i8, ptr @a, i64 1), ptr [[PST_1_0_1]], align 8 -; CHECK-NEXT: [[PST_1_0_3:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 4 +; CHECK-NEXT: [[PST_1_0_3:%.*]] = getelementptr i8, ptr [[PCHR]], i64 32 ; CHECK-NEXT: store ptr getelementptr (i8, ptr @a, i64 1), ptr [[PST_1_0_3]], align 8 -; CHECK-NEXT: [[PST_1_1_1:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 5 +; CHECK-NEXT: [[PST_1_1_1:%.*]] = getelementptr i8, ptr [[PCHR]], i64 40 ; CHECK-NEXT: store ptr null, ptr [[PST_1_1_1]], align 8 -; CHECK-NEXT: [[PST_1_1_2:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 6 +; CHECK-NEXT: [[PST_1_1_2:%.*]] = getelementptr i8, ptr [[PCHR]], i64 48 ; CHECK-NEXT: store ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 0, i64 1), ptr [[PST_1_1_2]], align 8 -; CHECK-NEXT: [[PST_1_3_3:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 7 +; CHECK-NEXT: [[PST_1_3_3:%.*]] = getelementptr i8, ptr [[PCHR]], i64 56 ; CHECK-NEXT: store ptr null, ptr [[PST_1_3_3]], align 8 -; CHECK-NEXT: [[PST_1_3_4:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 8 +; CHECK-NEXT: [[PST_1_3_4:%.*]] = getelementptr i8, ptr [[PCHR]], i64 64 ; CHECK-NEXT: store ptr null, ptr [[PST_1_3_4]], align 8 -; CHECK-NEXT: [[PST_1_3_6:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 10 +; CHECK-NEXT: [[PST_1_3_6:%.*]] = getelementptr i8, ptr [[PCHR]], i64 80 ; CHECK-NEXT: store ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 1, i64 1), ptr [[PST_1_3_6]], align 8 ; CHECK-NEXT: ret void ; @@ -108,45 +108,45 @@ define void @fold_memchr_A_pIb_cst_N(i64 %N, ptr %pchr) { ; CHECK-NEXT: [[MEMCHR_CMP:%.*]] = icmp eq i64 [[N:%.*]], 0 ; CHECK-NEXT: [[CHR_0_0_N:%.*]] = select i1 [[MEMCHR_CMP]], ptr null, ptr @a ; CHECK-NEXT: store ptr [[CHR_0_0_N]], ptr [[PCHR:%.*]], align 8 -; CHECK-NEXT: [[PST_0_1_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 1 +; CHECK-NEXT: [[PST_0_1_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 8 ; CHECK-NEXT: [[MEMCHR_CMP1:%.*]] = icmp ult i64 [[N]], 3 ; CHECK-NEXT: [[CHR_0_1_N:%.*]] = select i1 [[MEMCHR_CMP1]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 0, i64 1) ; CHECK-NEXT: store ptr [[CHR_0_1_N]], ptr [[PST_0_1_N]], align 8 -; CHECK-NEXT: [[PST_0_4_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 2 +; CHECK-NEXT: [[PST_0_4_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 16 ; CHECK-NEXT: store ptr null, ptr [[PST_0_4_N]], align 8 -; CHECK-NEXT: [[PST_1_0_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 3 +; CHECK-NEXT: [[PST_1_0_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 24 ; CHECK-NEXT: [[MEMCHR_CMP2:%.*]] = icmp eq i64 [[N]], 0 ; CHECK-NEXT: [[CHR_1_0_N:%.*]] = select i1 [[MEMCHR_CMP2]], ptr null, ptr getelementptr (i8, ptr @a, i64 1) ; CHECK-NEXT: store ptr [[CHR_1_0_N]], ptr [[PST_1_0_N]], align 8 -; CHECK-NEXT: [[PST_1_1_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 4 +; CHECK-NEXT: [[PST_1_1_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 32 ; CHECK-NEXT: [[MEMCHR_CMP3:%.*]] = icmp ult i64 [[N]], 2 ; CHECK-NEXT: [[CHR_1_1_N:%.*]] = select i1 [[MEMCHR_CMP3]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 0, i64 1) ; CHECK-NEXT: store ptr [[CHR_1_1_N]], ptr [[PST_1_1_N]], align 8 -; CHECK-NEXT: [[PST_1_2_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 5 +; CHECK-NEXT: [[PST_1_2_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 40 ; CHECK-NEXT: [[MEMCHR_CMP4:%.*]] = icmp ult i64 [[N]], 4 ; CHECK-NEXT: [[CHR_1_2_N:%.*]] = select i1 [[MEMCHR_CMP4]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 
1, i64 0) ; CHECK-NEXT: store ptr [[CHR_1_2_N]], ptr [[PST_1_2_N]], align 8 -; CHECK-NEXT: [[PST_1_3_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 6 +; CHECK-NEXT: [[PST_1_3_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 48 ; CHECK-NEXT: [[MEMCHR_CMP5:%.*]] = icmp ult i64 [[N]], 6 ; CHECK-NEXT: [[CHR_1_3_N:%.*]] = select i1 [[MEMCHR_CMP5]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 1, i64 1) ; CHECK-NEXT: store ptr [[CHR_1_3_N]], ptr [[PST_1_3_N]], align 8 -; CHECK-NEXT: [[PST_1_4_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 7 +; CHECK-NEXT: [[PST_1_4_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 56 ; CHECK-NEXT: store ptr null, ptr [[PST_1_4_N]], align 8 -; CHECK-NEXT: [[PST_2_0_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 8 +; CHECK-NEXT: [[PST_2_0_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 64 ; CHECK-NEXT: store ptr null, ptr [[PST_2_0_N]], align 8 -; CHECK-NEXT: [[PST_2_1_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 9 +; CHECK-NEXT: [[PST_2_1_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 72 ; CHECK-NEXT: [[MEMCHR_CMP6:%.*]] = icmp eq i64 [[N]], 0 ; CHECK-NEXT: [[CHR_2_1_N:%.*]] = select i1 [[MEMCHR_CMP6]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 0, i64 1) ; CHECK-NEXT: store ptr [[CHR_2_1_N]], ptr [[PST_2_1_N]], align 8 -; CHECK-NEXT: [[PST_2_2_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 10 +; CHECK-NEXT: [[PST_2_2_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 80 ; CHECK-NEXT: [[MEMCHR_CMP7:%.*]] = icmp ult i64 [[N]], 3 ; CHECK-NEXT: [[CHR_2_2_N:%.*]] = select i1 [[MEMCHR_CMP7]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 1, i64 0) ; CHECK-NEXT: store ptr [[CHR_2_2_N]], ptr [[PST_2_2_N]], align 8 -; CHECK-NEXT: [[PST_2_3_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 11 +; CHECK-NEXT: [[PST_2_3_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 88 ; CHECK-NEXT: [[MEMCHR_CMP8:%.*]] = icmp ult i64 [[N]], 5 ; CHECK-NEXT: [[CHR_2_3_N:%.*]] = select i1 [[MEMCHR_CMP8]], ptr null, ptr getelementptr inbounds ([1 x %struct.A], ptr @a, i64 0, i64 0, i32 1, i64 1) ; CHECK-NEXT: store ptr [[CHR_2_3_N]], ptr [[PST_2_3_N]], align 8 -; CHECK-NEXT: [[PST_2_4_N:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 12 +; CHECK-NEXT: [[PST_2_4_N:%.*]] = getelementptr i8, ptr [[PCHR]], i64 96 ; CHECK-NEXT: store ptr null, ptr [[PST_2_4_N]], align 8 ; CHECK-NEXT: ret void ; @@ -232,10 +232,10 @@ define void @call_memchr_A_pIb_xs_cst(ptr %pchr) { ; CHECK-LABEL: @call_memchr_A_pIb_xs_cst( ; CHECK-NEXT: [[CHR_1_0_0_2:%.*]] = call ptr @memchr(ptr noundef nonnull dereferenceable(1) getelementptr inbounds ([1 x %struct.A], ptr @a, i64 1, i64 0), i32 0, i64 2) ; CHECK-NEXT: store ptr [[CHR_1_0_0_2]], ptr [[PCHR:%.*]], align 8 -; CHECK-NEXT: [[PST_1_0_1_2:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 1 +; CHECK-NEXT: [[PST_1_0_1_2:%.*]] = getelementptr i8, ptr [[PCHR]], i64 8 ; CHECK-NEXT: [[CHR_1_0_1_2:%.*]] = call ptr @memchr(ptr noundef nonnull dereferenceable(1) getelementptr inbounds ([1 x %struct.A], ptr @a, i64 1, i64 0), i32 0, i64 2) ; CHECK-NEXT: store ptr [[CHR_1_0_1_2]], ptr [[PST_1_0_1_2]], align 8 -; CHECK-NEXT: [[PST_0_0_8_2:%.*]] = getelementptr ptr, ptr [[PCHR]], i64 2 +; CHECK-NEXT: [[PST_0_0_8_2:%.*]] = getelementptr i8, ptr [[PCHR]], i64 16 ; CHECK-NEXT: [[CHR_0_0_8_2:%.*]] = call ptr @memchr(ptr noundef nonnull dereferenceable(1) getelementptr inbounds ([1 x %struct.A], ptr @a, i64 1, i64 0, i32 0, i64 0), i32 0, i64 2) ; CHECK-NEXT: store ptr [[CHR_0_0_8_2]], ptr [[PST_0_0_8_2]], 
align 8 ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/InstCombine/memcmp-3.ll b/llvm/test/Transforms/InstCombine/memcmp-3.ll index fd4d54ebd7c1c..ae7c2fc330481 100644 --- a/llvm/test/Transforms/InstCombine/memcmp-3.ll +++ b/llvm/test/Transforms/InstCombine/memcmp-3.ll @@ -18,41 +18,41 @@ declare i32 @memcmp(ptr, ptr, i64) define void @fold_memcmp_ia16a_i8a(ptr %pcmp) { ; BE-LABEL: @fold_memcmp_ia16a_i8a( ; BE-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 -; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; BE-NEXT: store i32 0, ptr [[PSTOR1]], align 4 -; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; BE-NEXT: store i32 0, ptr [[PSTOR2]], align 4 -; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; BE-NEXT: store i32 0, ptr [[PSTOR3]], align 4 -; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; BE-NEXT: store i32 0, ptr [[PSTOR4]], align 4 -; BE-NEXT: [[PSTOR5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; BE-NEXT: [[PSTOR5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; BE-NEXT: store i32 0, ptr [[PSTOR5]], align 4 -; BE-NEXT: [[PSTOR6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 6 +; BE-NEXT: [[PSTOR6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; BE-NEXT: store i32 0, ptr [[PSTOR6]], align 4 -; BE-NEXT: [[PSTOR7:%.*]] = getelementptr i32, ptr [[PCMP]], i64 7 +; BE-NEXT: [[PSTOR7:%.*]] = getelementptr i8, ptr [[PCMP]], i64 28 ; BE-NEXT: store i32 0, ptr [[PSTOR7]], align 4 -; BE-NEXT: [[PSTOR8:%.*]] = getelementptr i32, ptr [[PCMP]], i64 8 +; BE-NEXT: [[PSTOR8:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32 ; BE-NEXT: store i32 1, ptr [[PSTOR8]], align 4 ; BE-NEXT: ret void ; ; LE-LABEL: @fold_memcmp_ia16a_i8a( ; LE-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 -; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; LE-NEXT: store i32 1, ptr [[PSTOR1]], align 4 -; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; LE-NEXT: store i32 1, ptr [[PSTOR2]], align 4 -; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; LE-NEXT: store i32 1, ptr [[PSTOR3]], align 4 -; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; LE-NEXT: store i32 1, ptr [[PSTOR4]], align 4 -; LE-NEXT: [[PSTOR5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; LE-NEXT: [[PSTOR5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; LE-NEXT: store i32 1, ptr [[PSTOR5]], align 4 -; LE-NEXT: [[PSTOR6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 6 +; LE-NEXT: [[PSTOR6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; LE-NEXT: store i32 1, ptr [[PSTOR6]], align 4 -; LE-NEXT: [[PSTOR7:%.*]] = getelementptr i32, ptr [[PCMP]], i64 7 +; LE-NEXT: [[PSTOR7:%.*]] = getelementptr i8, ptr [[PCMP]], i64 28 ; LE-NEXT: store i32 1, ptr [[PSTOR7]], align 4 -; LE-NEXT: [[PSTOR8:%.*]] = getelementptr i32, ptr [[PCMP]], i64 8 +; LE-NEXT: [[PSTOR8:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32 ; LE-NEXT: store i32 1, ptr [[PSTOR8]], align 4 ; LE-NEXT: ret void ; @@ -101,33 
+101,33 @@ define void @fold_memcmp_ia16a_i8a(ptr %pcmp) { define void @fold_memcmp_ia16a_p1_i8a_p1(ptr %pcmp) { ; BE-LABEL: @fold_memcmp_ia16a_p1_i8a_p1( ; BE-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 -; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; BE-NEXT: store i32 1, ptr [[PSTOR1]], align 4 -; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; BE-NEXT: store i32 1, ptr [[PSTOR2]], align 4 -; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; BE-NEXT: store i32 1, ptr [[PSTOR3]], align 4 -; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; BE-NEXT: store i32 1, ptr [[PSTOR4]], align 4 -; BE-NEXT: [[PSTOR5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; BE-NEXT: [[PSTOR5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; BE-NEXT: store i32 1, ptr [[PSTOR5]], align 4 -; BE-NEXT: [[PSTOR6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 6 +; BE-NEXT: [[PSTOR6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; BE-NEXT: store i32 1, ptr [[PSTOR6]], align 4 ; BE-NEXT: ret void ; ; LE-LABEL: @fold_memcmp_ia16a_p1_i8a_p1( ; LE-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 -; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; LE-NEXT: store i32 1, ptr [[PSTOR1]], align 4 -; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; LE-NEXT: store i32 1, ptr [[PSTOR2]], align 4 -; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; LE-NEXT: store i32 1, ptr [[PSTOR3]], align 4 -; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; LE-NEXT: store i32 1, ptr [[PSTOR4]], align 4 -; LE-NEXT: [[PSTOR5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; LE-NEXT: [[PSTOR5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; LE-NEXT: store i32 1, ptr [[PSTOR5]], align 4 -; LE-NEXT: [[PSTOR6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 6 +; LE-NEXT: [[PSTOR6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; LE-NEXT: store i32 1, ptr [[PSTOR6]], align 4 ; LE-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/memcmp-4.ll b/llvm/test/Transforms/InstCombine/memcmp-4.ll index c63f305581b03..8c839bbfc47bd 100644 --- a/llvm/test/Transforms/InstCombine/memcmp-4.ll +++ b/llvm/test/Transforms/InstCombine/memcmp-4.ll @@ -20,13 +20,13 @@ declare i32 @memcmp(ptr, ptr, i64) define void @fold_memcmp_mismatch_too_big(ptr %pcmp) { ; BE-LABEL: @fold_memcmp_mismatch_too_big( ; BE-NEXT: store i32 -1, ptr [[PCMP:%.*]], align 4 -; BE-NEXT: [[PSTOR_CB:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; BE-NEXT: [[PSTOR_CB:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; BE-NEXT: store i32 1, ptr [[PSTOR_CB]], align 4 ; BE-NEXT: ret void ; ; LE-LABEL: @fold_memcmp_mismatch_too_big( ; LE-NEXT: store i32 -1, ptr [[PCMP:%.*]], align 4 -; LE-NEXT: [[PSTOR_CB:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; LE-NEXT: [[PSTOR_CB:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; LE-NEXT: store i32 1, ptr [[PSTOR_CB]], align 4 ; LE-NEXT: ret void ; @@ -49,13 +49,13 @@ define void 
@fold_memcmp_mismatch_too_big(ptr %pcmp) { define void @fold_memcmp_match_too_big(ptr %pcmp) { ; BE-LABEL: @fold_memcmp_match_too_big( ; BE-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 -; BE-NEXT: [[PSTOR_AB_M1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; BE-NEXT: [[PSTOR_AB_M1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; BE-NEXT: store i32 0, ptr [[PSTOR_AB_M1]], align 4 ; BE-NEXT: ret void ; ; LE-LABEL: @fold_memcmp_match_too_big( ; LE-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 -; LE-NEXT: [[PSTOR_AB_M1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; LE-NEXT: [[PSTOR_AB_M1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; LE-NEXT: store i32 0, ptr [[PSTOR_AB_M1]], align 4 ; LE-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/memcmp-5.ll b/llvm/test/Transforms/InstCombine/memcmp-5.ll index 591131cdb7907..fa97a206b45ba 100644 --- a/llvm/test/Transforms/InstCombine/memcmp-5.ll +++ b/llvm/test/Transforms/InstCombine/memcmp-5.ll @@ -20,21 +20,21 @@ define void @fold_memcmp_a_b_n(ptr %pcmp, i64 %n) { ; CHECK-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[N:%.*]], 0 ; CHECK-NEXT: [[C0_1:%.*]] = sext i1 [[TMP1]] to i32 -; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 [[C0_1]], ptr [[S0_1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_2:%.*]] = sext i1 [[TMP2]] to i32 -; CHECK-NEXT: [[S0_2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[S0_2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 [[C0_2]], ptr [[S0_2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_3:%.*]] = sext i1 [[TMP3]] to i32 -; CHECK-NEXT: [[S0_3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[S0_3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 [[C0_3]], ptr [[S0_3]], align 4 -; CHECK-NEXT: [[S0_4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; CHECK-NEXT: [[S0_4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; CHECK-NEXT: store i32 0, ptr [[S0_4]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_5:%.*]] = sext i1 [[TMP4]] to i32 -; CHECK-NEXT: [[S0_5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; CHECK-NEXT: [[S0_5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; CHECK-NEXT: store i32 [[C0_5]], ptr [[S0_5]], align 4 ; CHECK-NEXT: ret void ; @@ -107,23 +107,23 @@ define void @fold_memcmp_a_c_n(ptr %pcmp, i64 %n) { ; CHECK-NEXT: store i32 [[C0_0]], ptr [[PCMP:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_1:%.*]] = sext i1 [[TMP2]] to i32 -; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 [[C0_1]], ptr [[S0_1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_2:%.*]] = sext i1 [[TMP3]] to i32 -; CHECK-NEXT: [[S0_2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[S0_2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 [[C0_2]], ptr [[S0_2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_3:%.*]] = sext i1 [[TMP4]] to i32 -; CHECK-NEXT: [[S0_3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[S0_3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 [[C0_3]], ptr [[S0_3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt 
i64 [[N]], 3 ; CHECK-NEXT: [[C0_4:%.*]] = sext i1 [[TMP5]] to i32 -; CHECK-NEXT: [[S0_4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; CHECK-NEXT: [[S0_4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; CHECK-NEXT: store i32 [[C0_4]], ptr [[S0_4]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[N]], 3 ; CHECK-NEXT: [[C0_5:%.*]] = sext i1 [[TMP6]] to i32 -; CHECK-NEXT: [[S0_5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; CHECK-NEXT: [[S0_5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; CHECK-NEXT: store i32 [[C0_5]], ptr [[S0_5]], align 4 ; CHECK-NEXT: ret void ; @@ -178,11 +178,11 @@ define void @fold_memcmp_a_d_n(ptr %pcmp, i64 %n) { ; CHECK-NEXT: store i32 [[C0_0]], ptr [[PCMP:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_1:%.*]] = sext i1 [[TMP2]] to i32 -; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 [[C0_1]], ptr [[S0_1]], align 4 -; CHECK-NEXT: [[S1_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[S1_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 0, ptr [[S1_1]], align 4 -; CHECK-NEXT: [[S6_6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[S6_6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 0, ptr [[S6_6]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/memcmp-6.ll b/llvm/test/Transforms/InstCombine/memcmp-6.ll index e5e304895135c..be704bec0a8d0 100644 --- a/llvm/test/Transforms/InstCombine/memcmp-6.ll +++ b/llvm/test/Transforms/InstCombine/memcmp-6.ll @@ -16,11 +16,11 @@ declare i32 @memcmp(ptr, ptr, i64) define void @fold_memcmp_cst_cst(ptr %pcmp) { ; CHECK-LABEL: @fold_memcmp_cst_cst( ; CHECK-NEXT: store i32 -1, ptr [[PCMP:%.*]], align 4 -; CHECK-NEXT: [[SB5_A5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[SB5_A5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 1, ptr [[SB5_A5]], align 4 -; CHECK-NEXT: [[SA6_B6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[SA6_B6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 -1, ptr [[SA6_B6]], align 4 -; CHECK-NEXT: [[SB6_A6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[SB6_A6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 1, ptr [[SB6_A6]], align 4 ; CHECK-NEXT: ret void ; @@ -63,15 +63,15 @@ define void @fold_memcmp_cst_var(ptr %pcmp, i64 %n) { ; CHECK-NEXT: store i32 [[CA0_B0]], ptr [[PCMP:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[N]], 6 ; CHECK-NEXT: [[CB0_A0:%.*]] = zext i1 [[TMP2]] to i32 -; CHECK-NEXT: [[SB0_A0:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[SB0_A0:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 [[CB0_A0]], ptr [[SB0_A0]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[CA6_B6:%.*]] = sext i1 [[TMP3]] to i32 -; CHECK-NEXT: [[SA6_B6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[SA6_B6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 [[CA6_B6]], ptr [[SA6_B6]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[CB6_A6:%.*]] = zext i1 [[TMP4]] to i32 -; CHECK-NEXT: [[SB6_A6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[SB6_A6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 [[CB6_A6]], ptr [[SB6_A6]], align 4 ; CHECK-NEXT: ret void ; diff --git 
a/llvm/test/Transforms/InstCombine/memcmp-7.ll b/llvm/test/Transforms/InstCombine/memcmp-7.ll index 72dccfd86f801..8a6ce565f502d 100644 --- a/llvm/test/Transforms/InstCombine/memcmp-7.ll +++ b/llvm/test/Transforms/InstCombine/memcmp-7.ll @@ -14,9 +14,9 @@ declare i32 @memcmp(ptr, ptr, i64) define void @fold_memcmp_i32a_i32b_pIb(i32 %I, ptr %pcmp) ; CHECK-LABEL: @fold_memcmp_i32a_i32b_pIb( ; CHECK-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 -; CHECK-NEXT: [[PST_1_1_2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[PST_1_1_2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 0, ptr [[PST_1_1_2]], align 4 -; CHECK-NEXT: [[PST_1_1_3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[PST_1_1_3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 0, ptr [[PST_1_1_3]], align 4 ; CHECK-NEXT: ret void ; @@ -50,23 +50,23 @@ define void @fold_memcmp_i32a_i32b_pIb(i32 %I, ptr %pcmp) define void @fold_memcmp_A_B_pIb(i32 %I, ptr %pcmp) { ; CHECK-LABEL: @fold_memcmp_A_B_pIb( ; CHECK-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 -; CHECK-NEXT: [[PST_0_0_2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[PST_0_0_2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 0, ptr [[PST_0_0_2]], align 4 -; CHECK-NEXT: [[PST_0_0_3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[PST_0_0_3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 0, ptr [[PST_0_0_3]], align 4 -; CHECK-NEXT: [[PST_0_0_4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[PST_0_0_4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 0, ptr [[PST_0_0_4]], align 4 -; CHECK-NEXT: [[PST_0_1_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; CHECK-NEXT: [[PST_0_1_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; CHECK-NEXT: store i32 -1, ptr [[PST_0_1_1]], align 4 -; CHECK-NEXT: [[PST_0_1_2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; CHECK-NEXT: [[PST_0_1_2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; CHECK-NEXT: store i32 -1, ptr [[PST_0_1_2]], align 4 -; CHECK-NEXT: [[PST_0_1_3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 6 +; CHECK-NEXT: [[PST_0_1_3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; CHECK-NEXT: store i32 -1, ptr [[PST_0_1_3]], align 4 -; CHECK-NEXT: [[PST_1_0_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; CHECK-NEXT: [[PST_1_0_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; CHECK-NEXT: store i32 1, ptr [[PST_1_0_1]], align 4 -; CHECK-NEXT: [[PST_1_0_2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; CHECK-NEXT: [[PST_1_0_2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; CHECK-NEXT: store i32 1, ptr [[PST_1_0_2]], align 4 -; CHECK-NEXT: [[PST_1_0_3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 6 +; CHECK-NEXT: [[PST_1_0_3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; CHECK-NEXT: store i32 1, ptr [[PST_1_0_3]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/memcpy_alloca.ll b/llvm/test/Transforms/InstCombine/memcpy_alloca.ll index b40f4b8ca5a30..e58e357684e18 100644 --- a/llvm/test/Transforms/InstCombine/memcpy_alloca.ll +++ b/llvm/test/Transforms/InstCombine/memcpy_alloca.ll @@ -59,7 +59,7 @@ define void @test5(ptr %dest) { define void @test6(ptr %dest) { ; CHECK-LABEL: @test6( ; CHECK-NEXT: [[A:%.*]] = alloca [7 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i16, ptr [[A]], i64 1 +; CHECK-NEXT: [[P2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 2 ; CHECK-NEXT: store i16 42, ptr [[P2]], align 
2 ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 1 dereferenceable(7) [[DEST:%.*]], ptr noundef nonnull align 1 dereferenceable(7) [[P2]], i64 7, i1 false) ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/InstCombine/memrchr-5.ll b/llvm/test/Transforms/InstCombine/memrchr-5.ll index 535c01fd6fe79..2646f8d2e9408 100644 --- a/llvm/test/Transforms/InstCombine/memrchr-5.ll +++ b/llvm/test/Transforms/InstCombine/memrchr-5.ll @@ -16,37 +16,37 @@ declare ptr @memrchr(ptr, i32, i64) define void @fold_memrchr_a_16(ptr %pcmp) { ; BE-LABEL: @fold_memrchr_a_16( ; BE-NEXT: store i64 0, ptr [[PCMP:%.*]], align 4 -; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i64, ptr [[PCMP]], i64 1 +; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; BE-NEXT: store i64 1, ptr [[PSTOR1]], align 4 -; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i64, ptr [[PCMP]], i64 2 +; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; BE-NEXT: store i64 2, ptr [[PSTOR2]], align 4 -; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i64, ptr [[PCMP]], i64 3 +; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; BE-NEXT: store i64 3, ptr [[PSTOR3]], align 4 -; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i64, ptr [[PCMP]], i64 4 +; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32 ; BE-NEXT: store i64 13, ptr [[PSTOR4]], align 4 -; BE-NEXT: [[PSTOR6:%.*]] = getelementptr i64, ptr [[PCMP]], i64 6 +; BE-NEXT: [[PSTOR6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 48 ; BE-NEXT: store i64 14, ptr [[PSTOR6]], align 4 -; BE-NEXT: [[PSTOR7:%.*]] = getelementptr i64, ptr [[PCMP]], i64 7 +; BE-NEXT: [[PSTOR7:%.*]] = getelementptr i8, ptr [[PCMP]], i64 56 ; BE-NEXT: store i64 15, ptr [[PSTOR7]], align 4 -; BE-NEXT: [[PSTOR8:%.*]] = getelementptr i64, ptr [[PCMP]], i64 8 +; BE-NEXT: [[PSTOR8:%.*]] = getelementptr i8, ptr [[PCMP]], i64 64 ; BE-NEXT: store i64 0, ptr [[PSTOR8]], align 4 ; BE-NEXT: ret void ; ; LE-LABEL: @fold_memrchr_a_16( ; LE-NEXT: store i64 3, ptr [[PCMP:%.*]], align 4 -; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i64, ptr [[PCMP]], i64 1 +; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; LE-NEXT: store i64 2, ptr [[PSTOR1]], align 4 -; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i64, ptr [[PCMP]], i64 2 +; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; LE-NEXT: store i64 1, ptr [[PSTOR2]], align 4 -; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i64, ptr [[PCMP]], i64 3 +; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; LE-NEXT: store i64 0, ptr [[PSTOR3]], align 4 -; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i64, ptr [[PCMP]], i64 4 +; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32 ; LE-NEXT: store i64 14, ptr [[PSTOR4]], align 4 -; LE-NEXT: [[PSTOR6:%.*]] = getelementptr i64, ptr [[PCMP]], i64 6 +; LE-NEXT: [[PSTOR6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 48 ; LE-NEXT: store i64 13, ptr [[PSTOR6]], align 4 -; LE-NEXT: [[PSTOR7:%.*]] = getelementptr i64, ptr [[PCMP]], i64 7 +; LE-NEXT: [[PSTOR7:%.*]] = getelementptr i8, ptr [[PCMP]], i64 56 ; LE-NEXT: store i64 12, ptr [[PSTOR7]], align 4 -; LE-NEXT: [[PSTOR8:%.*]] = getelementptr i64, ptr [[PCMP]], i64 8 +; LE-NEXT: [[PSTOR8:%.*]] = getelementptr i8, ptr [[PCMP]], i64 64 ; LE-NEXT: store i64 0, ptr [[PSTOR8]], align 4 ; LE-NEXT: ret void ; @@ -123,29 +123,29 @@ define void @fold_memrchr_a_16(ptr %pcmp) { define void @fold_memrchr_a_p1_16(ptr %pcmp) { ; BE-LABEL: @fold_memrchr_a_p1_16( ; BE-NEXT: store i64 0, ptr [[PCMP:%.*]], align 4 -; 
BE-NEXT: [[PSTOR1:%.*]] = getelementptr i64, ptr [[PCMP]], i64 1
+; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8
 ; BE-NEXT: store i64 1, ptr [[PSTOR1]], align 4
-; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i64, ptr [[PCMP]], i64 2
+; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16
 ; BE-NEXT: store i64 2, ptr [[PSTOR2]], align 4
-; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i64, ptr [[PCMP]], i64 3
+; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24
 ; BE-NEXT: store i64 3, ptr [[PSTOR3]], align 4
-; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i64, ptr [[PCMP]], i64 4
+; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32
 ; BE-NEXT: store i64 0, ptr [[PSTOR4]], align 4
-; BE-NEXT: [[PSTOR5:%.*]] = getelementptr i64, ptr [[PCMP]], i64 5
+; BE-NEXT: [[PSTOR5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 40
 ; BE-NEXT: store i64 0, ptr [[PSTOR5]], align 4
 ; BE-NEXT: ret void
 ;
 ; LE-LABEL: @fold_memrchr_a_p1_16(
 ; LE-NEXT: store i64 3, ptr [[PCMP:%.*]], align 4
-; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i64, ptr [[PCMP]], i64 1
+; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8
 ; LE-NEXT: store i64 2, ptr [[PSTOR1]], align 4
-; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i64, ptr [[PCMP]], i64 2
+; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16
 ; LE-NEXT: store i64 1, ptr [[PSTOR2]], align 4
-; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i64, ptr [[PCMP]], i64 3
+; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24
 ; LE-NEXT: store i64 0, ptr [[PSTOR3]], align 4
-; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i64, ptr [[PCMP]], i64 4
+; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32
 ; LE-NEXT: store i64 0, ptr [[PSTOR4]], align 4
-; LE-NEXT: [[PSTOR5:%.*]] = getelementptr i64, ptr [[PCMP]], i64 5
+; LE-NEXT: [[PSTOR5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 40
 ; LE-NEXT: store i64 0, ptr [[PSTOR5]], align 4
 ; LE-NEXT: ret void
 ;
@@ -206,25 +206,25 @@ define void @fold_memrchr_a_p1_16(ptr %pcmp) {
 define void @fold_memrchr_a_20(ptr %pcmp) {
 ; BE-LABEL: @fold_memrchr_a_20(
 ; BE-NEXT: store i64 16, ptr [[PCMP:%.*]], align 4
-; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i64, ptr [[PCMP]], i64 1
+; BE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8
 ; BE-NEXT: store i64 17, ptr [[PSTOR1]], align 4
-; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i64, ptr [[PCMP]], i64 2
+; BE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16
 ; BE-NEXT: store i64 18, ptr [[PSTOR2]], align 4
-; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i64, ptr [[PCMP]], i64 3
+; BE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24
 ; BE-NEXT: store i64 19, ptr [[PSTOR3]], align 4
-; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i64, ptr [[PCMP]], i64 4
+; BE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32
 ; BE-NEXT: store i64 4, ptr [[PSTOR4]], align 4
 ; BE-NEXT: ret void
 ;
 ; LE-LABEL: @fold_memrchr_a_20(
 ; LE-NEXT: store i64 19, ptr [[PCMP:%.*]], align 4
-; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i64, ptr [[PCMP]], i64 1
+; LE-NEXT: [[PSTOR1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8
 ; LE-NEXT: store i64 18, ptr [[PSTOR1]], align 4
-; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i64, ptr [[PCMP]], i64 2
+; LE-NEXT: [[PSTOR2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16
 ; LE-NEXT: store i64 17, ptr [[PSTOR2]], align 4
-; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i64, ptr [[PCMP]], i64 3
+; LE-NEXT: [[PSTOR3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24
 ; LE-NEXT: store i64 16, ptr [[PSTOR3]], align 4
-; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i64, ptr [[PCMP]], i64 4
+; LE-NEXT: [[PSTOR4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32
 ; LE-NEXT: store i64 7, ptr [[PSTOR4]], align 4
 ; LE-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/InstCombine/memset2.ll b/llvm/test/Transforms/InstCombine/memset2.ll
index d81abf659335a..be5faccca3c43 100644
--- a/llvm/test/Transforms/InstCombine/memset2.ll
+++ b/llvm/test/Transforms/InstCombine/memset2.ll
@@ -9,7 +9,7 @@ define i32 @test(ptr addrspace(1) nocapture %moves) {
 ; CHECK-LABEL: define i32 @test
 ; CHECK-SAME: (ptr addrspace(1) nocapture [[MOVES:%.*]]) {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds [[STRUCT_MOVES:%.*]], ptr addrspace(1) [[MOVES]], i64 1, i32 0, i64 9
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[MOVES]], i64 26
 ; CHECK-NEXT: store i64 0, ptr addrspace(1) [[GEP]], align 1
 ; CHECK-NEXT: ret i32 0
 ;
diff --git a/llvm/test/Transforms/InstCombine/multi-size-address-space-pointer.ll b/llvm/test/Transforms/InstCombine/multi-size-address-space-pointer.ll
index 99eb53edab786..df3152a6a4353 100644
--- a/llvm/test/Transforms/InstCombine/multi-size-address-space-pointer.ll
+++ b/llvm/test/Transforms/InstCombine/multi-size-address-space-pointer.ll
@@ -5,7 +5,7 @@ target datalayout = "e-p:32:32:32-p1:64:64:64-p2:8:8:8-p3:16:16:16-p4:16:16:16-i
 define i32 @test_as0(ptr addrspace(0) %a) {
 ; CHECK-LABEL: @test_as0(
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr [[A:%.*]], i32 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr [[A:%.*]], i32 4
 ; CHECK-NEXT: [[Y:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: ret i32 [[Y]]
 ;
@@ -16,7 +16,7 @@ define i32 @test_as0(ptr addrspace(0) %a) {
 define i32 @test_as1(ptr addrspace(1) %a) {
 ; CHECK-LABEL: @test_as1(
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr addrspace(1) [[A:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr addrspace(1) [[A:%.*]], i64 4
 ; CHECK-NEXT: [[Y:%.*]] = load i32, ptr addrspace(1) [[ARRAYIDX]], align 4
 ; CHECK-NEXT: ret i32 [[Y]]
 ;
@@ -27,7 +27,7 @@ define i32 @test_as1(ptr addrspace(1) %a) {
 define i32 @test_as2(ptr addrspace(2) %a) {
 ; CHECK-LABEL: @test_as2(
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr addrspace(2) [[A:%.*]], i8 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr addrspace(2) [[A:%.*]], i8 4
 ; CHECK-NEXT: [[Y:%.*]] = load i32, ptr addrspace(2) [[ARRAYIDX]], align 4
 ; CHECK-NEXT: ret i32 [[Y]]
 ;
@@ -38,7 +38,7 @@ define i32 @test_as2(ptr addrspace(2) %a) {
 define i32 @test_as3(ptr addrspace(3) %a) {
 ; CHECK-LABEL: @test_as3(
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i32, ptr addrspace(3) [[A:%.*]], i16 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr i8, ptr addrspace(3) [[A:%.*]], i16 4
 ; CHECK-NEXT: [[Y:%.*]] = load i32, ptr addrspace(3) [[ARRAYIDX]], align 4
 ; CHECK-NEXT: ret i32 [[Y]]
 ;
@@ -92,7 +92,7 @@ define <2 x i8> @test_combine_vector_inttoptr(<2 x i8> %a) {
 ; Check that the GEP index is changed to the address space integer type (i64 -> i8)
 define ptr addrspace(2) @shrink_gep_constant_index_64_as2(ptr addrspace(2) %p) {
 ; CHECK-LABEL: @shrink_gep_constant_index_64_as2(
-; CHECK-NEXT: [[RET:%.*]] = getelementptr i32, ptr addrspace(2) [[P:%.*]], i8 1
+; CHECK-NEXT: [[RET:%.*]] = getelementptr i8, ptr addrspace(2) [[P:%.*]], i8 4
 ; CHECK-NEXT: ret ptr addrspace(2) [[RET]]
 ;
 %ret = getelementptr i32, ptr addrspace(2) %p, i64 1
@@ -101,7 +101,7 @@ define ptr addrspace(2) @shrink_gep_constant_index_64_as2(ptr addrspace(2) %p) {
 define ptr addrspace(2) @shrink_gep_constant_index_32_as2(ptr addrspace(2) %p) {
 ; CHECK-LABEL: @shrink_gep_constant_index_32_as2(
-; CHECK-NEXT: [[RET:%.*]] = getelementptr i32, ptr addrspace(2) [[P:%.*]], i8 1
+; CHECK-NEXT: [[RET:%.*]] = getelementptr i8, ptr addrspace(2) [[P:%.*]], i8 4
 ; CHECK-NEXT: ret ptr addrspace(2) [[RET]]
 ;
 %ret = getelementptr i32, ptr addrspace(2) %p, i32 1
@@ -110,7 +110,7 @@ define ptr addrspace(2) @shrink_gep_constant_index_32_as2(ptr addrspace(2) %p) {
 define ptr addrspace(3) @shrink_gep_constant_index_64_as3(ptr addrspace(3) %p) {
 ; CHECK-LABEL: @shrink_gep_constant_index_64_as3(
-; CHECK-NEXT: [[RET:%.*]] = getelementptr i32, ptr addrspace(3) [[P:%.*]], i16 1
+; CHECK-NEXT: [[RET:%.*]] = getelementptr i8, ptr addrspace(3) [[P:%.*]], i16 4
 ; CHECK-NEXT: ret ptr addrspace(3) [[RET]]
 ;
 %ret = getelementptr i32, ptr addrspace(3) %p, i64 1
diff --git a/llvm/test/Transforms/InstCombine/non-integral-pointers.ll b/llvm/test/Transforms/InstCombine/non-integral-pointers.ll
index 7c82b7ff36f5d..11e2108f60078 100644
--- a/llvm/test/Transforms/InstCombine/non-integral-pointers.ll
+++ b/llvm/test/Transforms/InstCombine/non-integral-pointers.ll
@@ -56,7 +56,7 @@ define i64 @g(ptr %gp) {
 ; CHECK-NEXT: [[DOTPRE:%.*]] = load ptr addrspace(4), ptr [[GP:%.*]], align 8
 ; CHECK-NEXT: [[V74:%.*]] = call ptr addrspace(4) @alloc()
 ; CHECK-NEXT: [[V75:%.*]] = addrspacecast ptr addrspace(4) [[V74]] to ptr
-; CHECK-NEXT: [[V77:%.*]] = getelementptr ptr addrspace(4), ptr [[V75]], i64 -1
+; CHECK-NEXT: [[V77:%.*]] = getelementptr i8, ptr [[V75]], i64 -8
 ; CHECK-NEXT: store ptr addrspace(4) [[DOTPRE]], ptr [[V77]], align 8
 ; CHECK-NEXT: [[V81:%.*]] = load i64, ptr [[V77]], align 8
 ; CHECK-NEXT: ret i64 [[V81]]
@@ -74,7 +74,7 @@ define i64 @g2(ptr addrspace(4) %gp) {
 ; CHECK-LABEL: @g2(
 ; CHECK-NEXT: [[DOTPRE:%.*]] = load ptr, ptr addrspace(4) [[GP:%.*]], align 8
 ; CHECK-NEXT: [[V74:%.*]] = call ptr addrspace(4) @alloc()
-; CHECK-NEXT: [[V77:%.*]] = getelementptr ptr, ptr addrspace(4) [[V74]], i64 -1
+; CHECK-NEXT: [[V77:%.*]] = getelementptr i8, ptr addrspace(4) [[V74]], i64 -8
 ; CHECK-NEXT: store ptr [[DOTPRE]], ptr addrspace(4) [[V77]], align 8
 ; CHECK-NEXT: [[V81_CAST:%.*]] = ptrtoint ptr [[DOTPRE]] to i64
 ; CHECK-NEXT: ret i64 [[V81_CAST]]
diff --git a/llvm/test/Transforms/InstCombine/opaque-ptr.ll b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
index 4448a49ad92bf..4d38e2cd37c95 100644
--- a/llvm/test/Transforms/InstCombine/opaque-ptr.ll
+++ b/llvm/test/Transforms/InstCombine/opaque-ptr.ll
@@ -160,7 +160,7 @@ define void @varargs_cast_opaque_to_typed(ptr %a) {
 define ptr @geps_combinable(ptr %a) {
 ; CHECK-LABEL: @geps_combinable(
-; CHECK-NEXT: [[A3:%.*]] = getelementptr { i32, { i32, i32 } }, ptr [[A:%.*]], i64 0, i32 1, i32 1
+; CHECK-NEXT: [[A3:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 8
 ; CHECK-NEXT: ret ptr [[A3]]
 ;
 %a2 = getelementptr { i32, { i32, i32 } }, ptr %a, i32 0, i32 1
@@ -170,7 +170,7 @@ define ptr @geps_combinable(ptr %a) {
 define ptr @geps_combinable_different_elem_type1(ptr %a) {
 ; CHECK-LABEL: @geps_combinable_different_elem_type1(
-; CHECK-NEXT: [[A3:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 1
+; CHECK-NEXT: [[A3:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 8
 ; CHECK-NEXT: ret ptr [[A3]]
 ;
 %a2 = getelementptr { i32, i32 }, ptr %a, i32 0, i32 1
@@ -180,7 +180,7 @@ define ptr @geps_combinable_different_elem_type1(ptr %a) {
 define ptr @geps_combinable_different_elem_type2(ptr %a) {
 ; CHECK-LABEL: @geps_combinable_different_elem_type2(
-; CHECK-NEXT: [[A3:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 1
+; CHECK-NEXT: [[A3:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 8
 ; CHECK-NEXT: ret ptr [[A3]]
 ;
 %a2 = getelementptr { i32, i32 }, ptr %a, i32 0, i32 1
@@ -190,7 +190,7 @@ define ptr @geps_combinable_different_elem_type2(ptr %a) {
 define ptr @geps_combinable_different_elem_type3(ptr %a) {
 ; CHECK-LABEL: @geps_combinable_different_elem_type3(
-; CHECK-NEXT: [[A3:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 1, i32 1
+; CHECK-NEXT: [[A3:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 12
 ; CHECK-NEXT: ret ptr [[A3]]
 ;
 %a2 = getelementptr { i32, i32 }, ptr %a, i32 0, i32 1
@@ -263,9 +263,9 @@ declare void @use(ptr)
 define ptr @geps_combinable_different_elem_type_extra_use1(ptr %a) {
 ; CHECK-LABEL: @geps_combinable_different_elem_type_extra_use1(
-; CHECK-NEXT: [[A2:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 0, i32 1
+; CHECK-NEXT: [[A2:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
 ; CHECK-NEXT: call void @use(ptr [[A2]])
-; CHECK-NEXT: [[A3:%.*]] = getelementptr { i32, i32 }, ptr [[A]], i64 1
+; CHECK-NEXT: [[A3:%.*]] = getelementptr i8, ptr [[A]], i64 8
 ; CHECK-NEXT: ret ptr [[A3]]
 ;
 %a2 = getelementptr { i32, i32 }, ptr %a, i32 0, i32 1
@@ -440,7 +440,7 @@ bb2:
 define ptr addrspace(1) @gep_of_addrspace_cast(ptr %ptr) {
 ; CHECK-LABEL: @gep_of_addrspace_cast(
 ; CHECK-NEXT: [[CAST1:%.*]] = addrspacecast ptr [[PTR:%.*]] to ptr addrspace(1)
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr addrspace(1) [[CAST1]], i64 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[CAST1]], i64 4
 ; CHECK-NEXT: ret ptr addrspace(1) [[GEP]]
 ;
 %cast1 = addrspacecast ptr %ptr to ptr addrspace(1)
@@ -519,7 +519,7 @@ define ptr @phi_of_gep(i1 %c, ptr %p) {
 ; CHECK: else:
 ; CHECK-NEXT: br label [[JOIN]]
 ; CHECK: join:
-; CHECK-NEXT: [[PHI:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1
+; CHECK-NEXT: [[PHI:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4
 ; CHECK-NEXT: ret ptr [[PHI]]
 ;
 br i1 %c, label %if, label %else
@@ -541,10 +541,10 @@ define ptr @phi_of_gep_different_type(i1 %c, ptr %p) {
 ; CHECK-LABEL: @phi_of_gep_different_type(
 ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK: if:
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1
+; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 4
 ; CHECK-NEXT: br label [[JOIN:%.*]]
 ; CHECK: else:
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i64, ptr [[P]], i64 1
+; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i8, ptr [[P]], i64 8
 ; CHECK-NEXT: br label [[JOIN]]
 ; CHECK: join:
 ; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[GEP1]], [[IF]] ], [ [[GEP2]], [[ELSE]] ]
@@ -573,9 +573,9 @@ define ptr @gep_of_phi_of_gep(i1 %c, ptr %p) {
 ; CHECK: else:
 ; CHECK-NEXT: br label [[JOIN]]
 ; CHECK: join:
-; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 1, [[IF]] ], [ 2, [[ELSE]] ]
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[TMP2]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 8, [[ELSE]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
 ; CHECK-NEXT: ret ptr [[GEP]]
 ;
 br i1 %c, label %if, label %else
@@ -598,14 +598,13 @@ define ptr @gep_of_phi_of_gep_different_type(i1 %c, ptr %p) {
 ; CHECK-LABEL: @gep_of_phi_of_gep_different_type(
 ; CHECK-NEXT: br i1 [[C:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK: if:
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1
 ; CHECK-NEXT: br label [[JOIN:%.*]]
 ; CHECK: else:
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i64, ptr [[P]], i64 2
 ; CHECK-NEXT: br label [[JOIN]]
 ; CHECK: join:
-; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[GEP1]], [[IF]] ], [ [[GEP2]], [[ELSE]] ]
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr i32, ptr [[PHI]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = phi i64 [ 4, [[IF]] ], [ 16, [[ELSE]] ]
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[TMP1]]
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 4
 ; CHECK-NEXT: ret ptr [[GEP]]
 ;
 br i1 %c, label %if, label %else
@@ -626,8 +625,8 @@ join:
 define ptr @select_of_gep(i1 %c, ptr %p) {
 ; CHECK-LABEL: @select_of_gep(
-; CHECK-NEXT: [[S_V:%.*]] = select i1 [[C:%.*]], i64 1, i64 2
-; CHECK-NEXT: [[S:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 [[S_V]]
+; CHECK-NEXT: [[S_V:%.*]] = select i1 [[C:%.*]], i64 4, i64 8
+; CHECK-NEXT: [[S:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[S_V]]
 ; CHECK-NEXT: ret ptr [[S]]
 ;
 %gep1 = getelementptr i32, ptr %p, i64 1
@@ -638,9 +637,8 @@ define ptr @select_of_gep(i1 %c, ptr %p) {
 define ptr @select_of_gep_different_type(i1 %c, ptr %p) {
 ; CHECK-LABEL: @select_of_gep_different_type(
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr i32, ptr [[P:%.*]], i64 1
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr i64, ptr [[P]], i64 2
-; CHECK-NEXT: [[S:%.*]] = select i1 [[C:%.*]], ptr [[GEP1]], ptr [[GEP2]]
+; CHECK-NEXT: [[S_V:%.*]] = select i1 [[C:%.*]], i64 4, i64 16
+; CHECK-NEXT: [[S:%.*]] = getelementptr i8, ptr [[P:%.*]], i64 [[S_V]]
 ; CHECK-NEXT: ret ptr [[S]]
 ;
 %gep1 = getelementptr i32, ptr %p, i64 1
diff --git a/llvm/test/Transforms/InstCombine/phi-equal-incoming-pointers.ll b/llvm/test/Transforms/InstCombine/phi-equal-incoming-pointers.ll
index ea75de4e99f88..998169ec2fe26 100644
--- a/llvm/test/Transforms/InstCombine/phi-equal-incoming-pointers.ll
+++ b/llvm/test/Transforms/InstCombine/phi-equal-incoming-pointers.ll
@@ -157,7 +157,7 @@ define i32 @test_gep_i32ptr(i1 %cond, i1 %cond2) {
 ; ALL: bb2:
 ; ALL-NEXT: br label [[EXIT]]
 ; ALL: exit:
-; ALL-NEXT: [[PTR_TYPED:%.*]] = getelementptr inbounds i32, ptr [[OBJ]], i64 16
+; ALL-NEXT: [[PTR_TYPED:%.*]] = getelementptr inbounds i8, ptr [[OBJ]], i64 64
 ; ALL-NEXT: [[RES_PHI:%.*]] = load i32, ptr [[PTR_TYPED]], align 4
 ; ALL-NEXT: store i32 1, ptr [[PTR_TYPED]], align 4
 ; ALL-NEXT: [[RES:%.*]] = select i1 [[COND2:%.*]], i32 [[RES_PHI]], i32 1
diff --git a/llvm/test/Transforms/InstCombine/phi-timeout.ll b/llvm/test/Transforms/InstCombine/phi-timeout.ll
index cb28502fdbb60..f30750ef0455c 100644
--- a/llvm/test/Transforms/InstCombine/phi-timeout.ll
+++ b/llvm/test/Transforms/InstCombine/phi-timeout.ll
@@ -10,7 +10,7 @@ define void @timeout(ptr nocapture readonly %cinfo, ptr %ptr) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br label [[FOR_BODY:%.*]]
 ; CHECK: for.body:
-; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[CINFO:%.*]], i32 2
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, ptr [[CINFO:%.*]], i32 4
 ; CHECK-NEXT: [[L:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
 ; CHECK-NEXT: [[CMP17:%.*]] = icmp eq i16 [[L]], 0
 ; CHECK-NEXT: [[EXTRACT_T1:%.*]] = trunc i16 [[L]] to i8
diff --git a/llvm/test/Transforms/InstCombine/phi.ll b/llvm/test/Transforms/InstCombine/phi.ll
index 717f4c682a153..e1ae6c1ea4757 100644
--- a/llvm/test/Transforms/InstCombine/phi.ll
+++ b/llvm/test/Transforms/InstCombine/phi.ll
@@ -232,7 +232,7 @@ define ptr @test8(ptr %A, i1 %b) {
 ; CHECK: BB1:
 ; CHECK-NEXT: br label [[BB2]]
 ; CHECK: BB2:
-; CHECK-NEXT: [[B:%.*]] = getelementptr { i32, i32 }, ptr [[A:%.*]], i64 0, i32 1
+; CHECK-NEXT: [[B:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 4
 ; CHECK-NEXT: ret ptr [[B]]
 ;
 BB0:
@@ -1735,7 +1735,7 @@ define i1 @pr57488_icmp_of_phi(ptr %ptr.base, i64 %len) {
 ; CHECK: loop:
 ; CHECK-NEXT: [[ACCUM:%.*]] = phi i1 [ [[AND:%.*]], [[LOOP]] ], [ true, [[START:%.*]] ]
 ; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[PTR_NEXT:%.*]], [[LOOP]] ], [ [[PTR_BASE]], [[START]] ]
-; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i64, ptr [[PTR]], i64 1
+; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
 ; CHECK-NEXT: [[VAL:%.*]] = load i64, ptr [[PTR]], align 8
 ; CHECK-NEXT: [[VAL_ZERO:%.*]] = icmp eq i64 [[VAL]], 0
 ; CHECK-NEXT: [[AND]] = and i1 [[ACCUM]], [[VAL_ZERO]]
diff --git a/llvm/test/Transforms/InstCombine/pr39908.ll b/llvm/test/Transforms/InstCombine/pr39908.ll
index 0f0eae68439d7..ca143f417fb27 100644
--- a/llvm/test/Transforms/InstCombine/pr39908.ll
+++ b/llvm/test/Transforms/InstCombine/pr39908.ll
@@ -8,7 +8,7 @@ target datalayout = "p:32:32"
 define i1 @test(ptr %p, i32 %n) {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: [[END:%.*]] = getelementptr inbounds [0 x %S], ptr [[P:%.*]], i32 0, i32 [[N:%.*]], i32 0, i32 0
-; CHECK-NEXT: [[LAST:%.*]] = getelementptr inbounds [[S:%.*]], ptr [[END]], i32 -1
+; CHECK-NEXT: [[LAST:%.*]] = getelementptr inbounds i8, ptr [[END]], i32 -8
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[LAST]], [[P]]
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
@@ -23,7 +23,7 @@ define i1 @test64(ptr %p, i64 %n) {
 ; CHECK-LABEL: @test64(
 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[N:%.*]] to i32
 ; CHECK-NEXT: [[END:%.*]] = getelementptr inbounds [0 x %S], ptr [[P:%.*]], i32 0, i32 [[TMP1]], i32 0, i32 0
-; CHECK-NEXT: [[LAST:%.*]] = getelementptr inbounds [[S:%.*]], ptr [[END]], i32 -1
+; CHECK-NEXT: [[LAST:%.*]] = getelementptr inbounds i8, ptr [[END]], i32 -8
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[LAST]], [[P]]
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
@@ -38,7 +38,7 @@ define i1 @test64_overflow(ptr %p, i64 %n) {
 ; CHECK-LABEL: @test64_overflow(
 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i64 [[N:%.*]] to i32
 ; CHECK-NEXT: [[END:%.*]] = getelementptr inbounds [0 x %S], ptr [[P:%.*]], i32 0, i32 [[TMP1]], i32 0, i32 0
-; CHECK-NEXT: [[LAST:%.*]] = getelementptr inbounds [[S:%.*]], ptr [[END]], i32 -1
+; CHECK-NEXT: [[LAST:%.*]] = getelementptr inbounds i8, ptr [[END]], i32 -8
 ; CHECK-NEXT: [[CMP:%.*]] = icmp eq ptr [[LAST]], [[P]]
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/pr44242.ll b/llvm/test/Transforms/InstCombine/pr44242.ll
index e7b0f7da03476..e86c17057fe27 100644
--- a/llvm/test/Transforms/InstCombine/pr44242.ll
+++ b/llvm/test/Transforms/InstCombine/pr44242.ll
@@ -118,7 +118,7 @@ define void @store_address(i32 %x) {
 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[X:%.*]], 0
 ; CHECK-NEXT: br i1 [[CMP]], label [[END:%.*]], label [[LOOP]]
 ; CHECK: loop:
-; CHECK-NEXT: [[VAL_INCR]] = getelementptr float, ptr [[VAL]], i64 1
+; CHECK-NEXT: [[VAL_INCR]] = getelementptr i8, ptr [[VAL]], i64 4
 ; CHECK-NEXT: br label [[LOOP_HEADER]]
 ; CHECK: end:
 ; CHECK-NEXT: store i32 0, ptr [[VAL]], align 4
diff --git a/llvm/test/Transforms/InstCombine/pr58901.ll b/llvm/test/Transforms/InstCombine/pr58901.ll
index f94c3f131b2be..1eea4b5305545 100644
--- a/llvm/test/Transforms/InstCombine/pr58901.ll
+++ b/llvm/test/Transforms/InstCombine/pr58901.ll
@@ -3,8 +3,9 @@
 define ptr @f1(ptr %arg, i64 %arg1) {
 ; CHECK-LABEL: @f1(
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [6 x i32], ptr [[ARG:%.*]], i64 3, i64 [[ARG1:%.*]]
-; CHECK-NEXT: ret ptr [[TMP1]]
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 72
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [6 x i32], ptr [[TMP1]], i64 0, i64 [[ARG1:%.*]]
+; CHECK-NEXT: ret ptr [[TMP2]]
 ;
 %1 = getelementptr [6 x i32], ptr %arg, i64 3
 %2 = getelementptr [6 x i32], ptr %1, i64 0, i64 %arg1
@@ -13,7 +14,7 @@ define ptr @f1(ptr %arg, i64 %arg1) {
 define ptr @f2(ptr %arg, i64 %arg1) {
 ; CHECK-LABEL: @f2(
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr [6 x i32], ptr [[ARG:%.*]], i64 3
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[ARG:%.*]], i64 72
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr [6 x i32], ptr [[TMP1]], i64 [[ARG1:%.*]], i64 [[ARG1]]
 ; CHECK-NEXT: ret ptr [[TMP2]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
index 6a0e7098ab744..a7aa3a8ef1beb 100644
--- a/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
+++ b/llvm/test/Transforms/InstCombine/ptr-replace-alloca.ll
@@ -45,10 +45,10 @@ define i8 @volatile_load_keep_alloca(i1 %cond) {
 ; CHECK-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef align 4 dereferenceable(32) [[ALLOCA]], ptr noundef nonnull align 16 dereferenceable(32) @g1, i64 32, i1 false)
 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK: if:
-; CHECK-NEXT: [[VAL_IF:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(1) [[ALLOCA]], i64 0, i64 1
+; CHECK-NEXT: [[VAL_IF:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ALLOCA]], i64 1
 ; CHECK-NEXT: br label [[SINK:%.*]]
 ; CHECK: else:
-; CHECK-NEXT: [[VAL_ELSE:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(1) [[ALLOCA]], i64 0, i64 2
+; CHECK-NEXT: [[VAL_ELSE:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ALLOCA]], i64 2
 ; CHECK-NEXT: br label [[SINK]]
 ; CHECK: sink:
 ; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ [[VAL_IF]], [[IF]] ], [ [[VAL_ELSE]], [[ELSE]] ]
@@ -81,10 +81,10 @@ define i8 @no_memcpy_keep_alloca(i1 %cond) {
 ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [32 x i8], align 4, addrspace(1)
 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[IF:%.*]], label [[ELSE:%.*]]
 ; CHECK: if:
-; CHECK-NEXT: [[VAL_IF:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(1) [[ALLOCA]], i64 0, i64 1
+; CHECK-NEXT: [[VAL_IF:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ALLOCA]], i64 1
 ; CHECK-NEXT: br label [[SINK:%.*]]
 ; CHECK: else:
-; CHECK-NEXT: [[VAL_ELSE:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(1) [[ALLOCA]], i64 0, i64 2
+; CHECK-NEXT: [[VAL_ELSE:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ALLOCA]], i64 2
 ; CHECK-NEXT: br label [[SINK]]
 ; CHECK: sink:
 ; CHECK-NEXT: [[PTR:%.*]] = phi ptr addrspace(1) [ [[VAL_IF]], [[IF]] ], [ [[VAL_ELSE]], [[ELSE]] ]
@@ -445,7 +445,7 @@ define i8 @call_readonly_keep_alloca2() {
 ; CHECK-LABEL: @call_readonly_keep_alloca2(
 ; CHECK-NEXT: [[ALLOCA:%.*]] = alloca [32 x i8], align 1, addrspace(1)
 ; CHECK-NEXT: call void @llvm.memcpy.p1.p0.i64(ptr addrspace(1) noundef align 1 dereferenceable(16) [[ALLOCA]], ptr noundef nonnull align 16 dereferenceable(16) @g1, i64 16, i1 false)
-; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds [32 x i8], ptr addrspace(1) [[ALLOCA]], i64 0, i64 16
+; CHECK-NEXT: [[A1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[ALLOCA]], i64 16
 ; CHECK-NEXT: call void @llvm.memcpy.p1.p1.i64(ptr addrspace(1) noundef align 1 dereferenceable(16) [[A1]], ptr addrspace(1) noundef align 16 dereferenceable(16) @g2, i64 16, i1 false)
 ; CHECK-NEXT: [[P:%.*]] = addrspacecast ptr addrspace(1) [[ALLOCA]] to ptr
 ; CHECK-NEXT: [[V:%.*]] = call i8 @readonly_callee(ptr [[P]])
diff --git a/llvm/test/Transforms/InstCombine/select-cmp-br.ll b/llvm/test/Transforms/InstCombine/select-cmp-br.ll
index 92dc562281b81..f1c62a5593f0f 100644
--- a/llvm/test/Transforms/InstCombine/select-cmp-br.ll
+++ b/llvm/test/Transforms/InstCombine/select-cmp-br.ll
@@ -12,7 +12,7 @@ define void @test1(ptr %arg) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[M:%.*]] = load ptr, ptr [[ARG:%.*]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[C:%.*]], ptr [[ARG]], i64 1, i32 0, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 16
 ; CHECK-NEXT: [[N:%.*]] = load ptr, ptr [[TMP1]], align 8
 ; CHECK-NEXT: [[TMP5_NOT:%.*]] = icmp eq ptr [[M]], [[N]]
 ; CHECK-NEXT: br i1 [[TMP5_NOT]], label [[BB8:%.*]], label [[BB10:%.*]]
@@ -22,7 +22,7 @@ define void @test1(ptr %arg) {
 ; CHECK-NEXT: tail call void @bar(ptr nonnull [[ARG]])
 ; CHECK-NEXT: br label [[BB:%.*]]
 ; CHECK: bb10:
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[M]], i64 9
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[M]], i64 72
 ; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP2]], align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 [[TMP4]](ptr nonnull [[ARG]])
 ; CHECK-NEXT: br label [[BB]]
@@ -54,7 +54,7 @@ define void @test2(ptr %arg) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[M:%.*]] = load ptr, ptr [[ARG:%.*]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[C:%.*]], ptr [[ARG]], i64 1, i32 0, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 16
 ; CHECK-NEXT: [[N:%.*]] = load ptr, ptr [[TMP1]], align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[M]], [[N]]
 ; CHECK-NEXT: br i1 [[TMP5]], label [[BB10:%.*]], label [[BB8:%.*]]
@@ -64,7 +64,7 @@ define void @test2(ptr %arg) {
 ; CHECK-NEXT: tail call void @bar(ptr nonnull [[ARG]])
 ; CHECK-NEXT: br label [[BB:%.*]]
 ; CHECK: bb10:
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[M]], i64 9
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[M]], i64 72
 ; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP2]], align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 [[TMP4]](ptr nonnull [[ARG]])
 ; CHECK-NEXT: br label [[BB]]
@@ -96,7 +96,7 @@ define void @test3(ptr %arg) {
 ; CHECK-LABEL: @test3(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[M:%.*]] = load ptr, ptr [[ARG:%.*]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[C:%.*]], ptr [[ARG]], i64 1, i32 0, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 16
 ; CHECK-NEXT: [[N:%.*]] = load ptr, ptr [[TMP1]], align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[M]], [[N]]
 ; CHECK-NEXT: br i1 [[TMP5]], label [[BB8:%.*]], label [[BB10:%.*]]
@@ -106,7 +106,7 @@ define void @test3(ptr %arg) {
 ; CHECK-NEXT: tail call void @bar(ptr nonnull [[ARG]])
 ; CHECK-NEXT: br label [[BB:%.*]]
 ; CHECK: bb10:
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[M]], i64 9
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[M]], i64 72
 ; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP2]], align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 [[TMP4]](ptr nonnull [[ARG]])
 ; CHECK-NEXT: br label [[BB]]
@@ -138,7 +138,7 @@ define void @test4(ptr %arg) {
 ; CHECK-LABEL: @test4(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[M:%.*]] = load ptr, ptr [[ARG:%.*]], align 8
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds [[C:%.*]], ptr [[ARG]], i64 1, i32 0, i32 0
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[ARG]], i64 16
 ; CHECK-NEXT: [[N:%.*]] = load ptr, ptr [[TMP1]], align 8
 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq ptr [[M]], [[N]]
 ; CHECK-NEXT: br i1 [[TMP5]], label [[BB10:%.*]], label [[BB8:%.*]]
@@ -148,7 +148,7 @@ define void @test4(ptr %arg) {
 ; CHECK-NEXT: tail call void @bar(ptr nonnull [[ARG]])
 ; CHECK-NEXT: br label [[BB:%.*]]
 ; CHECK: bb10:
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[M]], i64 9
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[M]], i64 72
 ; CHECK-NEXT: [[TMP4:%.*]] = load ptr, ptr [[TMP2]], align 8
 ; CHECK-NEXT: [[TMP11:%.*]] = tail call i64 [[TMP4]](ptr nonnull [[ARG]])
 ; CHECK-NEXT: br label [[BB]]
diff --git a/llvm/test/Transforms/InstCombine/select-gep.ll b/llvm/test/Transforms/InstCombine/select-gep.ll
index ad2b029c51764..53bbcda85bb22 100644
--- a/llvm/test/Transforms/InstCombine/select-gep.ll
+++ b/llvm/test/Transforms/InstCombine/select-gep.ll
@@ -5,7 +5,7 @@ define ptr @test1a(ptr %p, ptr %q) {
 ; CHECK-LABEL: @test1a(
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt ptr [[P:%.*]], [[Q:%.*]]
 ; CHECK-NEXT: [[SELECT_V:%.*]] = select i1 [[CMP]], ptr [[P]], ptr [[Q]]
-; CHECK-NEXT: [[SELECT:%.*]] = getelementptr i32, ptr [[SELECT_V]], i64 4
+; CHECK-NEXT: [[SELECT:%.*]] = getelementptr i8, ptr [[SELECT_V]], i64 16
 ; CHECK-NEXT: ret ptr [[SELECT]]
 ;
 %gep1 = getelementptr i32, ptr %p, i64 4
@@ -19,7 +19,7 @@ define ptr @test1b(ptr %p, ptr %q) {
 ; CHECK-LABEL: @test1b(
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt ptr [[P:%.*]], [[Q:%.*]]
 ; CHECK-NEXT: [[SELECT_V:%.*]] = select i1 [[CMP]], ptr [[P]], ptr [[Q]]
-; CHECK-NEXT: [[SELECT:%.*]] = getelementptr i32, ptr [[SELECT_V]], i64 4
+; CHECK-NEXT: [[SELECT:%.*]] = getelementptr i8, ptr [[SELECT_V]], i64 16
 ; CHECK-NEXT: ret ptr [[SELECT]]
 ;
 %gep1 = getelementptr inbounds i32, ptr %p, i64 4
@@ -33,7 +33,7 @@ define ptr @test1c(ptr %p, ptr %q) {
 ; CHECK-LABEL: @test1c(
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt ptr [[P:%.*]], [[Q:%.*]]
 ; CHECK-NEXT: [[SELECT_V:%.*]] = select i1 [[CMP]], ptr [[P]], ptr [[Q]]
-; CHECK-NEXT: [[SELECT:%.*]] = getelementptr i32, ptr [[SELECT_V]], i64 4
+; CHECK-NEXT: [[SELECT:%.*]] = getelementptr i8, ptr [[SELECT_V]], i64 16
 ; CHECK-NEXT: ret ptr [[SELECT]]
 ;
 %gep1 = getelementptr i32, ptr %p, i64 4
@@ -47,7 +47,7 @@ define ptr @test1d(ptr %p, ptr %q) {
 ; CHECK-LABEL: @test1d(
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ugt ptr [[P:%.*]], [[Q:%.*]]
 ; CHECK-NEXT: [[SELECT_V:%.*]] = select i1 [[CMP]], ptr [[P]], ptr [[Q]]
-; CHECK-NEXT: [[SELECT:%.*]] = getelementptr inbounds i32, ptr [[SELECT_V]], i64 4
+; CHECK-NEXT: [[SELECT:%.*]] = getelementptr inbounds i8, ptr [[SELECT_V]], i64 16
 ; CHECK-NEXT: ret ptr [[SELECT]]
 ;
 %gep1 = getelementptr inbounds i32, ptr %p, i64 4
@@ -103,8 +103,8 @@ define ptr @test2c(ptr %p, i64 %x, i64 %y) {
 ; CHECK-LABEL: @test2c(
 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[X:%.*]]
 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[X]], [[Y:%.*]]
-; CHECK-NEXT: [[SEL_IDX:%.*]] = select i1 [[ICMP]], i64 0, i64 6
-; CHECK-NEXT: [[SEL:%.*]] = getelementptr inbounds i32, ptr [[GEP1]], i64 [[SEL_IDX]]
+; CHECK-NEXT: [[SEL_IDX:%.*]] = select i1 [[ICMP]], i64 0, i64 24
+; CHECK-NEXT: [[SEL:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 [[SEL_IDX]]
 ; CHECK-NEXT: ret ptr [[SEL]]
 ;
 %gep1 = getelementptr inbounds i32, ptr %p, i64 %x
@@ -119,8 +119,8 @@ define ptr @test2d(ptr %p, i64 %x, i64 %y) {
 ; CHECK-LABEL: @test2d(
 ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr [[P:%.*]], i64 [[X:%.*]]
 ; CHECK-NEXT: [[ICMP:%.*]] = icmp ugt i64 [[X]], [[Y:%.*]]
-; CHECK-NEXT: [[SEL_IDX:%.*]] = select i1 [[ICMP]], i64 6, i64 0
-; CHECK-NEXT: [[SEL:%.*]] = getelementptr inbounds i32, ptr [[GEP1]], i64 [[SEL_IDX]]
+; CHECK-NEXT: [[SEL_IDX:%.*]] = select i1 [[ICMP]], i64 24, i64 0
+; CHECK-NEXT: [[SEL:%.*]] = getelementptr inbounds i8, ptr [[GEP1]], i64 [[SEL_IDX]]
 ; CHECK-NEXT: ret ptr [[SEL]]
 ;
 %gep1 = getelementptr inbounds i32, ptr %p, i64 %x
diff --git a/llvm/test/Transforms/InstCombine/shift.ll b/llvm/test/Transforms/InstCombine/shift.ll
index 6f9ce1544cc0a..d783adbe93863 100644
--- a/llvm/test/Transforms/InstCombine/shift.ll
+++ b/llvm/test/Transforms/InstCombine/shift.ll
@@ -119,8 +119,8 @@ define i8 @test11a(i8 %A) {
 ;; (A >> 8) << 8 === A & -256
 define i32 @test12(i32 %A) {
 ; CHECK-LABEL: @test12(
-; CHECK-NEXT: [[C:%.*]] = and i32 [[A:%.*]], -256
-; CHECK-NEXT: ret i32 [[C]]
+; CHECK-NEXT: [[B:%.*]] = and i32 [[A:%.*]], -256
+; CHECK-NEXT: ret i32 [[B]]
 ;
 %B = ashr i32 %A, 8
 %C = shl i32 %B, 8
@@ -1754,7 +1754,7 @@ define void @ashr_out_of_range_1(ptr %A) {
 ; CHECK-NEXT: [[B:%.*]] = select i1 [[TMP1]], i177 0, i177 [[L_FROZEN]]
 ; CHECK-NEXT: [[TMP2:%.*]] = trunc i177 [[B]] to i64
 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i177, ptr [[A]], i64 [[TMP2]]
-; CHECK-NEXT: [[G11:%.*]] = getelementptr i177, ptr [[TMP3]], i64 -1
+; CHECK-NEXT: [[G11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 -24
 ; CHECK-NEXT: [[C17:%.*]] = icmp sgt i177 [[B]], [[L_FROZEN]]
 ; CHECK-NEXT: [[TMP4:%.*]] = sext i1 [[C17]] to i64
 ; CHECK-NEXT: [[G62:%.*]] = getelementptr i177, ptr [[G11]], i64 [[TMP4]]
@@ -1783,7 +1783,7 @@ define void @ossfuzz_38078(i32 %arg, i32 %arg1, ptr %ptr, ptr %ptr2, ptr %ptr3, ptr %ptr4, ptr %ptr5, ptr %ptr6, ptr %ptr7) {
 ; CHECK-LABEL: @ossfuzz_38078(
 ; CHECK-NEXT: bb:
-; CHECK-NEXT: [[G1:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 -1
+; CHECK-NEXT: [[G1:%.*]] = getelementptr i8, ptr [[PTR:%.*]], i64 -4
 ; CHECK-NEXT: [[I2:%.*]] = sub i32 0, [[ARG1:%.*]]
 ; CHECK-NEXT: [[I5:%.*]] = icmp eq i32 [[I2]], [[ARG:%.*]]
 ; CHECK-NEXT: call void @llvm.assume(i1 [[I5]])
diff --git a/llvm/test/Transforms/InstCombine/sink_sideeffecting_instruction.ll b/llvm/test/Transforms/InstCombine/sink_sideeffecting_instruction.ll
index f6262fd3c353a..1da6903ae4666 100644
--- a/llvm/test/Transforms/InstCombine/sink_sideeffecting_instruction.ll
+++ b/llvm/test/Transforms/InstCombine/sink_sideeffecting_instruction.ll
@@ -187,7 +187,7 @@ define i32 @sink_gep1(i1 %c) {
 ; CHECK: early_return:
 ; CHECK-NEXT: ret i32 0
 ; CHECK: use_block:
-; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[VAR]], i64 1
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[VAR]], i64 4
 ; CHECK-NEXT: [[VAR3:%.*]] = call i32 @unknown(ptr nonnull [[GEP]]) #[[ATTR1]]
 ; CHECK-NEXT: ret i32 [[VAR3]]
 ;
diff --git a/llvm/test/Transforms/InstCombine/sprintf-2.ll b/llvm/test/Transforms/InstCombine/sprintf-2.ll
index 086be81bca9ec..6caca2cc17825 100644
--- a/llvm/test/Transforms/InstCombine/sprintf-2.ll
+++ b/llvm/test/Transforms/InstCombine/sprintf-2.ll
@@ -18,21 +18,21 @@ declare i32 @snprintf(ptr, i64, ptr, ...)
 define void @fold_snprintf_member_pC(ptr %pi) {
 ; CHECK-LABEL: @fold_snprintf_member_pC(
 ; CHECK-NEXT: store i32 1, ptr [[PI:%.*]], align 4
-; CHECK-NEXT: [[PIA0AP1:%.*]] = getelementptr i32, ptr [[PI]], i64 1
+; CHECK-NEXT: [[PIA0AP1:%.*]] = getelementptr i8, ptr [[PI]], i64 4
 ; CHECK-NEXT: store i32 0, ptr [[PIA0AP1]], align 4
-; CHECK-NEXT: [[PIA0B:%.*]] = getelementptr i32, ptr [[PI]], i64 2
+; CHECK-NEXT: [[PIA0B:%.*]] = getelementptr i8, ptr [[PI]], i64 8
 ; CHECK-NEXT: store i32 2, ptr [[PIA0B]], align 4
-; CHECK-NEXT: [[PIA0BP1:%.*]] = getelementptr i32, ptr [[PI]], i64 3
+; CHECK-NEXT: [[PIA0BP1:%.*]] = getelementptr i8, ptr [[PI]], i64 12
 ; CHECK-NEXT: store i32 1, ptr [[PIA0BP1]], align 4
-; CHECK-NEXT: [[PIA0BP2:%.*]] = getelementptr i32, ptr [[PI]], i64 4
+; CHECK-NEXT: [[PIA0BP2:%.*]] = getelementptr i8, ptr [[PI]], i64 16
 ; CHECK-NEXT: store i32 0, ptr [[PIA0BP2]], align 4
-; CHECK-NEXT: [[PIA0C:%.*]] = getelementptr i32, ptr [[PI]], i64 5
+; CHECK-NEXT: [[PIA0C:%.*]] = getelementptr i8, ptr [[PI]], i64 20
 ; CHECK-NEXT: store i32 3, ptr [[PIA0C]], align 4
-; CHECK-NEXT: [[PIA1A:%.*]] = getelementptr i32, ptr [[PI]], i64 6
+; CHECK-NEXT: [[PIA1A:%.*]] = getelementptr i8, ptr [[PI]], i64 24
 ; CHECK-NEXT: store i32 4, ptr [[PIA1A]], align 4
-; CHECK-NEXT: [[PIA1B:%.*]] = getelementptr i32, ptr [[PI]], i64 7
+; CHECK-NEXT: [[PIA1B:%.*]] = getelementptr i8, ptr [[PI]], i64 28
 ; CHECK-NEXT: store i32 5, ptr [[PIA1B]], align 4
-; CHECK-NEXT: [[PIA1C:%.*]] = getelementptr i32, ptr [[PI]], i64 8
+; CHECK-NEXT: [[PIA1C:%.*]] = getelementptr i8, ptr [[PI]], i64 32
 ; CHECK-NEXT: store i32 6, ptr [[PIA1C]], align 4
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/InstCombine/statepoint-cleanup.ll b/llvm/test/Transforms/InstCombine/statepoint-cleanup.ll
index 61c2b85553fd9..95906f0f456d6 100644
--- a/llvm/test/Transforms/InstCombine/statepoint-cleanup.ll
+++ b/llvm/test/Transforms/InstCombine/statepoint-cleanup.ll
@@ -10,7 +10,7 @@ declare void @func()
 define void @test(ptr addrspace(1) %b) gc "statepoint-example" {
 ; CHECK-LABEL: @test(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[D:%.*]] = getelementptr i32, ptr addrspace(1) [[B:%.*]], i64 16
+; CHECK-NEXT: [[D:%.*]] = getelementptr i8, ptr addrspace(1) [[B:%.*]], i64 64
 ; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr nonnull elementtype(void ()) @func, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(ptr addrspace(1) [[B]], ptr addrspace(1) [[D]]) ]
 ; CHECK-NEXT: [[B_NEW_1:%.*]] = call ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[SAFEPOINT_TOKEN]], i32 0, i32 0)
 ; CHECK-NEXT: [[B_NEW_2:%.*]] = call ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[SAFEPOINT_TOKEN]], i32 0, i32 0)
@@ -68,7 +68,7 @@ entry:
 define void @test_no_base_use(ptr addrspace(1) %b) gc "statepoint-example" {
 ; CHECK-LABEL: @test_no_base_use(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[D:%.*]] = getelementptr i32, ptr addrspace(1) [[B:%.*]], i64 16
+; CHECK-NEXT: [[D:%.*]] = getelementptr i8, ptr addrspace(1) [[B:%.*]], i64 64
 ; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = tail call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr nonnull elementtype(void ()) @func, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(ptr addrspace(1) [[B]], ptr addrspace(1) [[D]]) ]
 ; CHECK-NEXT: [[D_NEW_1:%.*]] = call ptr addrspace(1) @llvm.experimental.gc.relocate.p1(token [[SAFEPOINT_TOKEN]], i32 0, i32 1)
 ; CHECK-NEXT: store i32 1, ptr addrspace(1) [[D_NEW_1]], align 4
@@ -90,7 +90,7 @@ entry:
 define void @test_invoke(ptr addrspace(1) %b) gc "statepoint-example" personality ptr @fake_personality_function {
 ; CHECK-LABEL: @test_invoke(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[D:%.*]] = getelementptr i32, ptr addrspace(1) [[B:%.*]], i64 16
+; CHECK-NEXT: [[D:%.*]] = getelementptr i8, ptr addrspace(1) [[B:%.*]], i64 64
 ; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = invoke token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr nonnull elementtype(void ()) @func, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(ptr addrspace(1) [[B]], ptr addrspace(1) [[D]]) ]
 ; CHECK-NEXT: to label [[NORMAL_DEST:%.*]] unwind label [[UNWIND_DEST:%.*]]
 ; CHECK: normal_dest:
@@ -209,7 +209,7 @@ unwind_dest:
 define void @test_no_base_use_invoke(ptr addrspace(1) %b) gc "statepoint-example" personality ptr @fake_personality_function {
 ; CHECK-LABEL: @test_no_base_use_invoke(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[D:%.*]] = getelementptr i32, ptr addrspace(1) [[B:%.*]], i64 16
+; CHECK-NEXT: [[D:%.*]] = getelementptr i8, ptr addrspace(1) [[B:%.*]], i64 64
 ; CHECK-NEXT: [[SAFEPOINT_TOKEN:%.*]] = invoke token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 0, i32 0, ptr nonnull elementtype(void ()) @func, i32 0, i32 0, i32 0, i32 0) [ "gc-live"(ptr addrspace(1) [[B]], ptr addrspace(1) [[D]]) ]
 ; CHECK-NEXT: to label [[NORMAL_DEST:%.*]] unwind label [[UNWIND_DEST:%.*]]
 ; CHECK: normal_dest:
diff --git a/llvm/test/Transforms/InstCombine/str-int-3.ll b/llvm/test/Transforms/InstCombine/str-int-3.ll
index 7d46297b5d143..f319a16d211f9 100644
--- a/llvm/test/Transforms/InstCombine/str-int-3.ll
+++ b/llvm/test/Transforms/InstCombine/str-int-3.ll
@@ -21,11 +21,11 @@ declare i64 @strtoll(ptr, ptr, i32)
 define void @fold_atoi_member(ptr %pi) {
 ; CHECK-LABEL: @fold_atoi_member(
 ; CHECK-NEXT: store i32 1, ptr [[PI:%.*]], align 4
-; CHECK-NEXT: [[PIA0B:%.*]] = getelementptr i32, ptr [[PI]], i64 1
+; CHECK-NEXT: [[PIA0B:%.*]] = getelementptr i8, ptr [[PI]], i64 4
 ; CHECK-NEXT: store i32 12, ptr [[PIA0B]], align 4
-; CHECK-NEXT: [[PIA1A:%.*]] = getelementptr i32, ptr [[PI]], i64 2
+; CHECK-NEXT: [[PIA1A:%.*]] = getelementptr i8, ptr [[PI]], i64 8
 ; CHECK-NEXT: store i32 123, ptr [[PIA1A]], align 4
-; CHECK-NEXT: [[PIA1B:%.*]] = getelementptr i32, ptr [[PI]], i64 3
+; CHECK-NEXT: [[PIA1B:%.*]] = getelementptr i8, ptr [[PI]], i64 12
 ; CHECK-NEXT: store i32 1234, ptr [[PIA1B]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -92,15 +92,15 @@ define void @fold_atoi_offset_out_of_bounds(ptr %pi) {
 define void @fold_atol_member(ptr %pi) {
 ; CHECK-LABEL: @fold_atol_member(
 ; CHECK-NEXT: store i64 1, ptr [[PI:%.*]], align 4
-; CHECK-NEXT: [[PIA0B:%.*]] = getelementptr i64, ptr [[PI]], i64 1
+; CHECK-NEXT: [[PIA0B:%.*]] = getelementptr i8, ptr [[PI]], i64 8
 ; CHECK-NEXT: store i64 12, ptr [[PIA0B]], align 4
-; CHECK-NEXT: [[PIA0C:%.*]] = getelementptr i64, ptr [[PI]], i64 2
+; CHECK-NEXT: [[PIA0C:%.*]] = getelementptr i8, ptr [[PI]], i64 16
 ; CHECK-NEXT: store i64 56789, ptr [[PIA0C]], align 4
-; CHECK-NEXT: [[PIA1A:%.*]] = getelementptr i64, ptr [[PI]], i64 3
+; CHECK-NEXT: [[PIA1A:%.*]] = getelementptr i8, ptr [[PI]], i64 24
 ; CHECK-NEXT: store i64 123, ptr [[PIA1A]], align 4
-; CHECK-NEXT: [[PIA1B:%.*]] = getelementptr i64, ptr [[PI]], i64 4
+; CHECK-NEXT: [[PIA1B:%.*]] = getelementptr i8, ptr [[PI]], i64 32
 ; CHECK-NEXT: store i64 1234, ptr [[PIA1B]], align 4
-; CHECK-NEXT: [[PIA1C:%.*]] = getelementptr i64, ptr [[PI]], i64 5
+; CHECK-NEXT: [[PIA1C:%.*]] = getelementptr i8, ptr [[PI]], i64 40
 ; CHECK-NEXT: store i64 67890, ptr [[PIA1C]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -148,15 +148,15 @@ define void @fold_atol_member(ptr %pi) {
 define void @fold_atoll_member_pC(ptr %pi) {
 ; CHECK-LABEL: @fold_atoll_member_pC(
 ; CHECK-NEXT: store i64 1, ptr [[PI:%.*]], align 4
-; CHECK-NEXT: [[PIA0BP1:%.*]] = getelementptr i64, ptr [[PI]], i64 1
+; CHECK-NEXT: [[PIA0BP1:%.*]] = getelementptr i8, ptr [[PI]], i64 8
 ; CHECK-NEXT: store i64 2, ptr [[PIA0BP1]], align 4
-; CHECK-NEXT: [[PIA0CP3:%.*]] = getelementptr i64, ptr [[PI]], i64 2
+; CHECK-NEXT: [[PIA0CP3:%.*]] = getelementptr i8, ptr [[PI]], i64 16
 ; CHECK-NEXT: store i64 89, ptr [[PIA0CP3]], align 4
-; CHECK-NEXT: [[PIA1AP2:%.*]] = getelementptr i64, ptr [[PI]], i64 3
+; CHECK-NEXT: [[PIA1AP2:%.*]] = getelementptr i8, ptr [[PI]], i64 24
 ; CHECK-NEXT: store i64 3, ptr [[PIA1AP2]], align 4
-; CHECK-NEXT: [[PIA1BP3:%.*]] = getelementptr i64, ptr [[PI]], i64 4
+; CHECK-NEXT: [[PIA1BP3:%.*]] = getelementptr i8, ptr [[PI]], i64 32
 ; CHECK-NEXT: store i64 4, ptr [[PIA1BP3]], align 4
-; CHECK-NEXT: [[PIA1CP4:%.*]] = getelementptr i64, ptr [[PI]], i64 5
+; CHECK-NEXT: [[PIA1CP4:%.*]] = getelementptr i8, ptr [[PI]], i64 40
 ; CHECK-NEXT: store i64 0, ptr [[PIA1CP4]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -204,15 +204,15 @@ define void @fold_atoll_member_pC(ptr %pi) {
 define void @fold_strtol_member_pC(ptr %pi) {
 ; CHECK-LABEL: @fold_strtol_member_pC(
 ; CHECK-NEXT: store i64 1, ptr [[PI:%.*]], align 4
-; CHECK-NEXT: [[PIA0BP1:%.*]] = getelementptr i64, ptr [[PI]], i64 1
+; CHECK-NEXT: [[PIA0BP1:%.*]] = getelementptr i8, ptr [[PI]], i64 8
 ; CHECK-NEXT: store i64 2, ptr [[PIA0BP1]], align 4
-; CHECK-NEXT: [[PIA0CP3:%.*]] = getelementptr i64, ptr [[PI]], i64 2
+; CHECK-NEXT: [[PIA0CP3:%.*]] = getelementptr i8, ptr [[PI]], i64 16
 ; CHECK-NEXT: store i64 89, ptr [[PIA0CP3]], align 4
-; CHECK-NEXT: [[PIA1AP2:%.*]] = getelementptr i64, ptr [[PI]], i64 3
+; CHECK-NEXT: [[PIA1AP2:%.*]] = getelementptr i8, ptr [[PI]], i64 24
 ; CHECK-NEXT: store i64 3, ptr [[PIA1AP2]], align 4
-; CHECK-NEXT: [[PIA1BP3:%.*]] = getelementptr i64, ptr [[PI]], i64 4
+; CHECK-NEXT: [[PIA1BP3:%.*]] = getelementptr i8, ptr [[PI]], i64 32
 ; CHECK-NEXT: store i64 4, ptr [[PIA1BP3]], align 4
-; CHECK-NEXT: [[PIA1CP4:%.*]] = getelementptr i64, ptr [[PI]], i64 5
+; CHECK-NEXT: [[PIA1CP4:%.*]] = getelementptr i8, ptr [[PI]], i64 40
 ; CHECK-NEXT: store i64 0, ptr [[PIA1CP4]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -260,15 +260,15 @@ define void @fold_strtol_member_pC(ptr %pi) {
 define void @fold_strtoll_member_pC(ptr %pi) {
 ; CHECK-LABEL: @fold_strtoll_member_pC(
 ; CHECK-NEXT: store i64 1, ptr [[PI:%.*]], align 4
-; CHECK-NEXT: [[PIA0BP1:%.*]] = getelementptr i64, ptr [[PI]], i64 1
+; CHECK-NEXT: [[PIA0BP1:%.*]] = getelementptr i8, ptr [[PI]], i64 8
 ; CHECK-NEXT: store i64 2, ptr [[PIA0BP1]], align 4
-; CHECK-NEXT: [[PIA0CP3:%.*]] = getelementptr i64, ptr [[PI]], i64 2
+; CHECK-NEXT: [[PIA0CP3:%.*]] = getelementptr i8, ptr [[PI]], i64 16
 ; CHECK-NEXT: store i64 89, ptr [[PIA0CP3]], align 4
-; CHECK-NEXT: [[PIA1AP2:%.*]] = getelementptr i64, ptr [[PI]], i64 3
+; CHECK-NEXT: [[PIA1AP2:%.*]] = getelementptr i8, ptr [[PI]], i64 24
 ; CHECK-NEXT: store i64 3, ptr [[PIA1AP2]], align 4
-; CHECK-NEXT: [[PIA1BP3:%.*]] = getelementptr i64, ptr [[PI]], i64 4
+; CHECK-NEXT: [[PIA1BP3:%.*]] = getelementptr i8, ptr [[PI]], i64 32
 ; CHECK-NEXT: store i64 4, ptr [[PIA1BP3]], align 4
-; CHECK-NEXT: [[PIA1CP4:%.*]] = getelementptr i64, ptr [[PI]], i64 5
+; CHECK-NEXT: [[PIA1CP4:%.*]] = getelementptr i8, ptr [[PI]], i64 40
 ; CHECK-NEXT: store i64 0, ptr [[PIA1CP4]], align 4
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/InstCombine/str-int-4.ll b/llvm/test/Transforms/InstCombine/str-int-4.ll
index 9d3302ca16085..6efc5fb4ed1fb 100644
--- a/llvm/test/Transforms/InstCombine/str-int-4.ll
+++ b/llvm/test/Transforms/InstCombine/str-int-4.ll
@@ -45,37 +45,37 @@ define void @fold_strtol(ptr %ps) {
 ; CHECK-NEXT: store ptr getelementptr inbounds ([11 x i8], ptr @ws_im123, i64 0, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT: store i32 -123, ptr [[PS:%.*]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([11 x i8], ptr @ws_ip234, i64 0, i64 10), ptr @endptr, align 8
-; CHECK-NEXT: [[PS1:%.*]] = getelementptr i32, ptr [[PS]], i64 1
+; CHECK-NEXT: [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 4
 ; CHECK-NEXT: store i32 234, ptr [[PS1]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([3 x i8], ptr @i0, i64 0, i64 2), ptr @endptr, align 8
-; CHECK-NEXT: [[PS2:%.*]] = getelementptr i32, ptr [[PS]], i64 2
+; CHECK-NEXT: [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT: store i32 0, ptr [[PS2]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([3 x i8], ptr @i9, i64 0, i64 2), ptr @endptr, align 8
-; CHECK-NEXT: [[PS3:%.*]] = getelementptr i32, ptr [[PS]], i64 3
+; CHECK-NEXT: [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 12
 ; CHECK-NEXT: store i32 9, ptr [[PS3]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([3 x i8], ptr @ia, i64 0, i64 2), ptr @endptr, align 8
-; CHECK-NEXT: [[PS4:%.*]] = getelementptr i32, ptr [[PS]], i64 4
+; CHECK-NEXT: [[PS4:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT: store i32 10, ptr [[PS4]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([7 x i8], ptr @i19azAZ, i64 0, i64 6), ptr @endptr, align 8
-; CHECK-NEXT: [[PS5:%.*]] = getelementptr i32, ptr [[PS]], i64 5
+; CHECK-NEXT: [[PS5:%.*]] = getelementptr i8, ptr [[PS]], i64 20
 ; CHECK-NEXT: store i32 76095035, ptr [[PS5]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([13 x i8], ptr @i32min, i64 0, i64 12), ptr @endptr, align 8
-; CHECK-NEXT: [[PS6:%.*]] = getelementptr i32, ptr [[PS]], i64 6
+; CHECK-NEXT: [[PS6:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT: store i32 -2147483648, ptr [[PS6]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([15 x i8], ptr @mo32min, i64 0, i64 14), ptr @endptr, align 8
-; CHECK-NEXT: [[PS7:%.*]] = getelementptr i32, ptr [[PS]], i64 7
+; CHECK-NEXT: [[PS7:%.*]] = getelementptr i8, ptr [[PS]], i64 28
 ; CHECK-NEXT: store i32 -2147483648, ptr [[PS7]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([13 x i8], ptr @mx32min, i64 0, i64 12), ptr @endptr, align 8
-; CHECK-NEXT: [[PS8:%.*]] = getelementptr i32, ptr [[PS]], i64 8
+; CHECK-NEXT: [[PS8:%.*]] = getelementptr i8, ptr [[PS]], i64 32
 ; CHECK-NEXT: store i32 -2147483648, ptr [[PS8]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([13 x i8], ptr @mx32min, i64 0, i64 12), ptr @endptr, align 8
-; CHECK-NEXT: [[PS9:%.*]] = getelementptr i32, ptr [[PS]], i64 9
+; CHECK-NEXT: [[PS9:%.*]] = getelementptr i8, ptr [[PS]], i64 36
 ; CHECK-NEXT: store i32 -2147483648, ptr [[PS9]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([12 x i8], ptr @i32max, i64 0, i64 11), ptr @endptr, align 8
-; CHECK-NEXT: [[PS10:%.*]] = getelementptr i32, ptr [[PS]], i64 10
+; CHECK-NEXT: [[PS10:%.*]] = getelementptr i8, ptr [[PS]], i64 40
 ; CHECK-NEXT: store i32 2147483647, ptr [[PS10]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([12 x i8], ptr @x32max, i64 0, i64 11), ptr @endptr, align 8
-; CHECK-NEXT: [[PS11:%.*]] = getelementptr i32, ptr [[PS]], i64 11
+; CHECK-NEXT: [[PS11:%.*]] = getelementptr i8, ptr [[PS]], i64 44
 ; CHECK-NEXT: store i32 2147483647, ptr [[PS11]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -149,46 +149,46 @@ define void @call_strtol(ptr %ps) {
 ; CHECK-NEXT: [[MINM1:%.*]] = call i32 @strtol(ptr nonnull @i32min_m1, ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT: store i32 [[MINM1]], ptr [[PS:%.*]], align 4
 ; CHECK-NEXT: [[MAXP1:%.*]] = call i32 @strtol(ptr nonnull @i32max_p1, ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS1:%.*]] = getelementptr i32, ptr [[PS]], i64 1
+; CHECK-NEXT: [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 4
 ; CHECK-NEXT: store i32 [[MAXP1]], ptr [[PS1]], align 4
 ; CHECK-NEXT: [[IPLUS:%.*]] = call i32 @strtol(ptr nonnull @wsplus, ptr nonnull @endptr, i32 0)
-; CHECK-NEXT: [[PS2:%.*]] = getelementptr i32, ptr [[PS]], i64 2
+; CHECK-NEXT: [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT: store i32 [[IPLUS]], ptr [[PS2]], align 4
 ; CHECK-NEXT: [[IA:%.*]] = call i32 @strtol(ptr nonnull @ia, ptr nonnull @endptr, i32 0)
-; CHECK-NEXT: [[PS3:%.*]] = getelementptr i32, ptr [[PS]], i64 3
+; CHECK-NEXT: [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 12
 ; CHECK-NEXT: store i32 [[IA]], ptr [[PS3]], align 4
 ; CHECK-NEXT: [[I8:%.*]] = call i32 @strtol(ptr nonnull @i8, ptr nonnull @endptr, i32 8)
-; CHECK-NEXT: [[PS4:%.*]] = getelementptr i32, ptr [[PS]], i64 4
+; CHECK-NEXT: [[PS4:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT: store i32 [[I8]], ptr [[PS4]], align 4
 ; CHECK-NEXT: [[I0X:%.*]] = call i32 @strtol(ptr nonnull @x0x, ptr nonnull @endptr, i32 0)
-; CHECK-NEXT: [[PS5:%.*]] = getelementptr i32, ptr [[PS]], i64 5
+; CHECK-NEXT: [[PS5:%.*]] = getelementptr i8, ptr [[PS]], i64 20
 ; CHECK-NEXT: store i32 [[I0X]], ptr [[PS5]], align 4
 ; CHECK-NEXT: [[IWSPWS0:%.*]] = call i32 @strtol(ptr nonnull @wsplusws0, ptr nonnull @endptr, i32 0)
-; CHECK-NEXT: [[PS6:%.*]] = getelementptr i32, ptr [[PS]], i64 6
+; CHECK-NEXT: [[PS6:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT: store i32 [[IWSPWS0]], ptr [[PS6]], align 4
 ; CHECK-NEXT: [[I19AZAZ:%.*]] = call i32 @strtol(ptr nonnull @i19azAZ, ptr nonnull @endptr, i32 35)
-; CHECK-NEXT: [[PS7:%.*]] = getelementptr i32, ptr [[PS]], i64 7
+; CHECK-NEXT: [[PS7:%.*]] = getelementptr i8, ptr [[PS]], i64 28
 ; CHECK-NEXT: store i32 [[I19AZAZ]], ptr [[PS7]], align 4
 ; CHECK-NEXT: [[O32MIN:%.*]] = call i32 @strtol(ptr nonnull @o32min, ptr nonnull @endptr, i32 0)
-; CHECK-NEXT: [[PS8:%.*]] = getelementptr i32, ptr [[PS]], i64 8
+; CHECK-NEXT: [[PS8:%.*]] = getelementptr i8, ptr [[PS]], i64 32
 ; CHECK-NEXT: store i32 [[O32MIN]], ptr [[PS8]], align 4
 ; CHECK-NEXT: [[X32MIN:%.*]] = call i32 @strtol(ptr nonnull @x32min, ptr nonnull @endptr, i32 0)
-; CHECK-NEXT: [[PS9:%.*]] = getelementptr i32, ptr [[PS]], i64 9
+; CHECK-NEXT: [[PS9:%.*]] = getelementptr i8, ptr [[PS]], i64 36
 ; CHECK-NEXT: store i32 [[X32MIN]], ptr [[PS9]], align 4
 ; CHECK-NEXT: [[X32MIN_10:%.*]] = call i32 @strtol(ptr nonnull @x32min, ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS10:%.*]] = getelementptr i32, ptr [[PS]], i64 10
+; CHECK-NEXT: [[PS10:%.*]] = getelementptr i8, ptr [[PS]], i64 40
 ; CHECK-NEXT: store i32 [[X32MIN_10]], ptr [[PS10]], align 4
 ; CHECK-NEXT: [[NWS:%.*]] = call i32 @strtol(ptr nonnull @ws, ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS11:%.*]] = getelementptr i32, ptr [[PS]], i64 11
+; CHECK-NEXT: [[PS11:%.*]] = getelementptr i8, ptr [[PS]], i64 44
 ; CHECK-NEXT: store i32 [[NWS]], ptr [[PS11]], align 4
 ; CHECK-NEXT: [[NWSP6:%.*]] = call i32 @strtol(ptr nonnull getelementptr inbounds ([7 x i8], ptr @ws, i64 0, i64 6), ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS12:%.*]] = getelementptr i32, ptr [[PS]], i64 12
+; CHECK-NEXT: [[PS12:%.*]] = getelementptr i8, ptr [[PS]], i64 48
 ; CHECK-NEXT: store i32 [[NWSP6]], ptr [[PS12]], align 4
 ; CHECK-NEXT: [[I0B1:%.*]] = call i32 @strtol(ptr nonnull @i0, ptr nonnull @endptr, i32 1)
-; CHECK-NEXT: [[PS13:%.*]] = getelementptr i32, ptr [[PS]], i64 13
+; CHECK-NEXT: [[PS13:%.*]] = getelementptr i8, ptr [[PS]], i64 52
 ; CHECK-NEXT: store i32 [[I0B1]], ptr [[PS13]], align 4
 ; CHECK-NEXT: [[I0B256:%.*]] = call i32 @strtol(ptr nonnull @i0, ptr nonnull @endptr, i32 256)
-; CHECK-NEXT: [[PS14:%.*]] = getelementptr i32, ptr [[PS]], i64 14
+; CHECK-NEXT: [[PS14:%.*]] = getelementptr i8, ptr [[PS]], i64 56
 ; CHECK-NEXT: store i32 [[I0B256]], ptr [[PS14]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -290,13 +290,13 @@ define void @fold_strtoll(ptr %ps) {
 ; CHECK-NEXT: store ptr getelementptr inbounds ([11 x i8], ptr @ws_im123, i64 0, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT: store i64 -123, ptr [[PS:%.*]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([11 x i8], ptr @ws_ip234, i64 0, i64 10), ptr @endptr, align 8
-; CHECK-NEXT: [[PS1:%.*]] = getelementptr i64, ptr [[PS]], i64 1
+; CHECK-NEXT: [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT: store i64 234, ptr [[PS1]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([22 x i8], ptr @i64min, i64 0, i64 21), ptr @endptr, align 8
-; CHECK-NEXT: [[PS2:%.*]] = getelementptr i64, ptr [[PS]], i64 2
+; CHECK-NEXT: [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT: store i64 -9223372036854775808, ptr [[PS2]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([21 x i8], ptr @i64max, i64 0, i64 20), ptr @endptr, align 8
-; CHECK-NEXT: [[PS3:%.*]] = getelementptr i64, ptr [[PS]], i64 3
+; CHECK-NEXT: [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT: store i64 9223372036854775807, ptr [[PS3]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -330,13 +330,13 @@ define void @call_strtoll(ptr %ps) {
 ; CHECK-NEXT: [[MINM1:%.*]] = call i64 @strtoll(ptr nonnull @i64min_m1, ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT: store i64 [[MINM1]], ptr [[PS:%.*]], align 4
 ; CHECK-NEXT: [[MAXP1:%.*]] = call i64 @strtoll(ptr nonnull @i64max_p1, ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS1:%.*]] = getelementptr i64, ptr [[PS]], i64 1
+; CHECK-NEXT: [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT: store i64 [[MAXP1]], ptr [[PS1]], align 4
 ; CHECK-NEXT: [[NWS:%.*]] = call i64 @strtoll(ptr nonnull @ws, ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS2:%.*]] = getelementptr i64, ptr [[PS]], i64 2
+; CHECK-NEXT: [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT: store i64 [[NWS]], ptr [[PS2]], align 4
 ; CHECK-NEXT: [[NWSP6:%.*]] = call i64 @strtoll(ptr nonnull getelementptr inbounds ([7 x i8], ptr @ws, i64 0, i64 6), ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS3:%.*]] = getelementptr i64, ptr [[PS]], i64 3
+; CHECK-NEXT: [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT: store i64 [[NWSP6]], ptr [[PS3]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -373,13 +373,13 @@ define void @call_strtoll(ptr %ps) {
 define void @call_strtol_trailing_space(ptr %ps) {
 ; CHECK-LABEL: @call_strtol_trailing_space(
 ; CHECK-NEXT: [[N1:%.*]] = call i32 @strtol(ptr nonnull @i_1_2_3_, ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS1:%.*]] = getelementptr i32, ptr [[PS:%.*]], i64 1
+; CHECK-NEXT: [[PS1:%.*]] = getelementptr i8, ptr [[PS:%.*]], i64 4
 ; CHECK-NEXT: store i32 [[N1]], ptr [[PS1]], align 4
 ; CHECK-NEXT: [[N2:%.*]] = call i32 @strtol(ptr nonnull getelementptr inbounds ([9 x i8], ptr @i_1_2_3_, i64 0, i64 2), ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS2:%.*]] = getelementptr i32, ptr [[PS]], i64 2
+; CHECK-NEXT: [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT: store i32 [[N2]], ptr [[PS2]], align 4
 ; CHECK-NEXT: [[N3:%.*]] = call i32 @strtol(ptr nonnull getelementptr inbounds ([9 x i8], ptr @i_1_2_3_, i64 0, i64 4), ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS3:%.*]] = getelementptr i32, ptr [[PS]], i64 3
+; CHECK-NEXT: [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 12
 ; CHECK-NEXT: store i32 [[N3]], ptr [[PS3]], align 4
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/InstCombine/str-int-5.ll b/llvm/test/Transforms/InstCombine/str-int-5.ll
index f8976bcb82757..ff4f2bffd9776 100644
--- a/llvm/test/Transforms/InstCombine/str-int-5.ll
+++ b/llvm/test/Transforms/InstCombine/str-int-5.ll
@@ -49,37 +49,37 @@ define void @fold_strtoul(ptr %ps) {
 ; CHECK-NEXT: store ptr getelementptr inbounds ([11 x i8], ptr @ws_im123, i64 0, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT: store i32 -123, ptr [[PS:%.*]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([11 x i8], ptr @ws_ip234, i64 0, i64 10), ptr @endptr, align 8
-; CHECK-NEXT: [[PS1:%.*]] = getelementptr i32, ptr [[PS]], i64 1
+; CHECK-NEXT: [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 4
 ; CHECK-NEXT: store i32 234, ptr [[PS1]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([13 x i8], ptr @i32min_m1, i64 0, i64 12), ptr @endptr, align 8
-; CHECK-NEXT: [[PS2:%.*]] = getelementptr i32, ptr [[PS]], i64 2
+; CHECK-NEXT: [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT: store i32 2147483647, ptr [[PS2]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([13 x i8], ptr @i32min, i64 0, i64 12), ptr @endptr, align 8
-; CHECK-NEXT: [[PS3:%.*]] = getelementptr i32, ptr [[PS]], i64 3
+; CHECK-NEXT: [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 12
 ; CHECK-NEXT: store i32 -2147483648, ptr [[PS3]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([15 x i8], ptr @o32min, i64 0, i64 14), ptr @endptr, align 8
-; CHECK-NEXT: [[PS4:%.*]] = getelementptr i32, ptr [[PS]], i64 4
+; CHECK-NEXT: [[PS4:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT: store i32 -2147483648, ptr [[PS4]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([15 x i8], ptr @mo32min, i64 0, i64 14), ptr @endptr, align 8
-; CHECK-NEXT: [[PS5:%.*]] = getelementptr i32, ptr [[PS]], i64 5
+; CHECK-NEXT: [[PS5:%.*]] = getelementptr i8, ptr [[PS]], i64 20
 ; CHECK-NEXT: store i32 -2147483648, ptr [[PS5]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([13 x i8], ptr @x32min, i64 0, i64 12), ptr @endptr, align 8
-; CHECK-NEXT: [[PS6:%.*]] = getelementptr i32, ptr [[PS]], i64 6
+; CHECK-NEXT: [[PS6:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT: store i32 -2147483648, ptr [[PS6]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([13 x i8], ptr @mx32min, i64 0, i64 12), ptr @endptr, align 8
-; CHECK-NEXT: [[PS7:%.*]] = getelementptr i32, ptr [[PS]], i64 7
+; CHECK-NEXT: [[PS7:%.*]] = getelementptr i8, ptr [[PS]], i64 28
 ; CHECK-NEXT: store i32 -2147483648, ptr [[PS7]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([12 x i8], ptr @i32max, i64 0, i64 11), ptr @endptr, align 8
-; CHECK-NEXT: [[PS8:%.*]] = getelementptr i32, ptr [[PS]], i64 8
+; CHECK-NEXT: [[PS8:%.*]] = getelementptr i8, ptr [[PS]], i64 32
 ; CHECK-NEXT: store i32 2147483647, ptr [[PS8]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([6 x i8], ptr @mX01, i64 0, i64 5), ptr @endptr, align 8
-; CHECK-NEXT: [[PS9:%.*]] = getelementptr i32, ptr [[PS]], i64 9
+; CHECK-NEXT: [[PS9:%.*]] = getelementptr i8, ptr [[PS]], i64 36
 ; CHECK-NEXT: store i32 -1, ptr [[PS9]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([12 x i8], ptr @i32max_p1, i64 0, i64 11), ptr @endptr, align 8
-; CHECK-NEXT: [[PS10:%.*]] = getelementptr i32, ptr [[PS]], i64 10
+; CHECK-NEXT: [[PS10:%.*]] = getelementptr i8, ptr [[PS]], i64 40
 ; CHECK-NEXT: store i32 -2147483648, ptr [[PS10]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([12 x i8], ptr @ui32max, i64 0, i64 11), ptr @endptr, align 8
-; CHECK-NEXT: [[PS11:%.*]] = getelementptr i32, ptr [[PS]], i64 11
+; CHECK-NEXT: [[PS11:%.*]] = getelementptr i8, ptr [[PS]], i64 44
 ; CHECK-NEXT: store i32 -1, ptr [[PS11]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -154,13 +154,13 @@ define void @call_strtoul(ptr %ps) {
 ; CHECK-NEXT: [[MINM1:%.*]] = call i32 @strtoul(ptr nonnull @i64min_m1, ptr nonnull @endptr, i32 10)
 ; CHECK-NEXT: store i32 [[MINM1]], ptr [[PS:%.*]], align 4
 ; CHECK-NEXT: [[MAXP1:%.*]] = call i32 @strtoul(ptr nonnull @ui32max_p1, ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS1:%.*]] = getelementptr i32, ptr [[PS]], i64 1
+; CHECK-NEXT: [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 4
 ; CHECK-NEXT: store i32 [[MAXP1]], ptr [[PS1]], align 4
 ; CHECK-NEXT: [[NWS:%.*]] = call i32 @strtoul(ptr nonnull @ws, ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS2:%.*]] = getelementptr i32, ptr [[PS]], i64 2
+; CHECK-NEXT: [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT: store i32 [[NWS]], ptr [[PS2]], align 4
 ; CHECK-NEXT: [[NWSP6:%.*]] = call i32 @strtoul(ptr nonnull getelementptr inbounds ([7 x i8], ptr @ws, i64 0, i64 6), ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS3:%.*]] = getelementptr i32, ptr [[PS]], i64 3
+; CHECK-NEXT: [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 12
 ; CHECK-NEXT: store i32 [[NWSP6]], ptr [[PS3]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -198,34 +198,34 @@ define void @fold_strtoull(ptr %ps) {
 ; CHECK-NEXT: store ptr getelementptr inbounds ([11 x i8], ptr @ws_im123, i64 0, i64 10), ptr @endptr, align 8
 ; CHECK-NEXT: store i64 -123, ptr [[PS:%.*]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([11 x i8], ptr @ws_ip234, i64 0, i64 10), ptr @endptr, align 8
-; CHECK-NEXT: [[PS1:%.*]] = getelementptr i64, ptr [[PS]], i64 1
+; CHECK-NEXT: [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT: store i64 234, ptr [[PS1]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([22 x i8], ptr @i64min_m1, i64 0, i64 21), ptr @endptr, align 8
-; CHECK-NEXT: [[PS2:%.*]] = getelementptr i64, ptr [[PS]], i64 2
+; CHECK-NEXT: [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT: store i64 9223372036854775807, ptr [[PS2]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([13 x i8], ptr @i32min, i64 0, i64 12), ptr @endptr, align 8
-; CHECK-NEXT: [[PS3:%.*]] = getelementptr i64, ptr [[PS]], i64 3
+; CHECK-NEXT: [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT: store i64 -2147483648, ptr [[PS3]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([15 x i8], ptr @o32min, i64 0, i64 14), ptr @endptr, align 8
-; CHECK-NEXT: [[PS4:%.*]] = getelementptr i64, ptr [[PS]], i64 4
+; CHECK-NEXT: [[PS4:%.*]] = getelementptr i8, ptr [[PS]], i64 32
 ; CHECK-NEXT: store i64 2147483648, ptr [[PS4]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([13 x i8], ptr @x32min, i64 0, i64 12), ptr @endptr, align 8
-; CHECK-NEXT: [[PS5:%.*]] = getelementptr i64, ptr [[PS]], i64 5
+; CHECK-NEXT: [[PS5:%.*]] = getelementptr i8, ptr [[PS]], i64 40
 ; CHECK-NEXT: store i64 2147483648, ptr [[PS5]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([22 x i8], ptr @i64min, i64 0, i64 21), ptr @endptr, align 8
-; CHECK-NEXT: [[PS6:%.*]] = getelementptr i64, ptr [[PS]], i64 6
+; CHECK-NEXT: [[PS6:%.*]] = getelementptr i8, ptr [[PS]], i64 48
 ; CHECK-NEXT: store i64 -9223372036854775808, ptr [[PS6]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([21 x i8], ptr @i64max, i64 0, i64 20), ptr @endptr, align 8
-; CHECK-NEXT: [[PS7:%.*]] = getelementptr i64, ptr [[PS]], i64 7
+; CHECK-NEXT: [[PS7:%.*]] = getelementptr i8, ptr [[PS]], i64 56
 ; CHECK-NEXT: store i64 9223372036854775807, ptr [[PS7]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([21 x i8], ptr @i64max_p1, i64 0, i64 20), ptr @endptr, align 8
-; CHECK-NEXT: [[PS8:%.*]] = getelementptr i64, ptr [[PS]], i64 8
+; CHECK-NEXT: [[PS8:%.*]] = getelementptr i8, ptr [[PS]], i64 64
 ; CHECK-NEXT: store i64 -9223372036854775808, ptr [[PS8]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([22 x i8], ptr @ui64max, i64 0, i64 21), ptr @endptr, align 8
-; CHECK-NEXT: [[PS9:%.*]] = getelementptr i64, ptr [[PS]], i64 9
+; CHECK-NEXT: [[PS9:%.*]] = getelementptr i8, ptr [[PS]], i64 72
 ; CHECK-NEXT: store i64 -1, ptr [[PS9]], align 4
 ; CHECK-NEXT: store ptr getelementptr inbounds ([20 x i8], ptr @x64max, i64 0, i64 19), ptr @endptr, align 8
-; CHECK-NEXT: [[PS10:%.*]] = getelementptr i64, ptr [[PS]], i64 10
+; CHECK-NEXT: [[PS10:%.*]] = getelementptr i8, ptr [[PS]], i64 80
 ; CHECK-NEXT: store i64 -1, ptr [[PS10]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -293,13 +293,13 @@ define void @fold_strtoull(ptr %ps) {
 define void @call_strtoull(ptr %ps) {
 ; CHECK-LABEL: @call_strtoull(
 ; CHECK-NEXT: [[MAXP1:%.*]] = call i64 @strtoull(ptr nonnull @ui64max_p1, ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS1:%.*]] = getelementptr i64, ptr [[PS:%.*]], i64 1
+; CHECK-NEXT: [[PS1:%.*]] = getelementptr i8, ptr [[PS:%.*]], i64 8
 ; CHECK-NEXT: store i64 [[MAXP1]], ptr [[PS1]], align 4
 ; CHECK-NEXT: [[NWS:%.*]] = call i64 @strtoull(ptr nonnull @ws, ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS2:%.*]] = getelementptr i64, ptr [[PS]], i64 2
+; CHECK-NEXT: [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT: store i64 [[NWS]], ptr [[PS2]], align 4
 ; CHECK-NEXT: [[NWSP6:%.*]] = call i64 @strtoull(ptr nonnull getelementptr inbounds ([7 x i8], ptr @ws, i64 0, i64 6), ptr nonnull @endptr, i32 10)
-; CHECK-NEXT: [[PS3:%.*]] = getelementptr i64, ptr [[PS]], i64 3
+; CHECK-NEXT: [[PS3:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT: store i64 [[NWSP6]], ptr [[PS3]], align 4
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/InstCombine/str-int.ll b/llvm/test/Transforms/InstCombine/str-int.ll
index 8b4126b5c9576..718bfe4133332 100644
--- a/llvm/test/Transforms/InstCombine/str-int.ll
+++ b/llvm/test/Transforms/InstCombine/str-int.ll
@@ -45,7 +45,7 @@ define i32 @strtol_hex() #0 {
 define i32 @strtol_endptr_not_null(ptr %pend) {
 ; CHECK-LABEL: @strtol_endptr_not_null(
-; CHECK-NEXT: [[ENDP1:%.*]] = getelementptr inbounds ptr, ptr [[PEND:%.*]], i64 1
+; CHECK-NEXT: [[ENDP1:%.*]] = getelementptr inbounds i8, ptr [[PEND:%.*]], i64 8
 ; CHECK-NEXT: store ptr getelementptr inbounds ([3 x i8], ptr @.str, i64 0, i64 2), ptr [[ENDP1]], align 8
 ; CHECK-NEXT: ret i32 12
 ;
diff --git a/llvm/test/Transforms/InstCombine/strcall-bad-sig.ll b/llvm/test/Transforms/InstCombine/strcall-bad-sig.ll
index 98f5e6ea1b27c..5e59db5ef88aa 100644
--- a/llvm/test/Transforms/InstCombine/strcall-bad-sig.ll
+++ b/llvm/test/Transforms/InstCombine/strcall-bad-sig.ll
@@ -15,10 +15,10 @@ define void @call_bad_ato(ptr %ps) {
 ; CHECK-NEXT: [[IR:%.*]] = call ptr @atoi(ptr nonnull @a)
 ; CHECK-NEXT: store ptr [[IR]], ptr [[PS:%.*]], align 8
 ; CHECK-NEXT: [[LR:%.*]] = call ptr @atol(ptr nonnull @a)
-; CHECK-NEXT: [[PS1:%.*]] = getelementptr ptr, ptr [[PS]], i64 1
+; CHECK-NEXT: [[PS1:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT: store ptr [[LR]], ptr [[PS1]], align 8
 ; CHECK-NEXT: [[LLR:%.*]] = call ptr @atol(ptr nonnull @a)
-; CHECK-NEXT: [[PS2:%.*]] = getelementptr ptr, ptr [[PS]], i64 2
+; CHECK-NEXT: [[PS2:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT: store ptr [[LLR]], ptr [[PS2]], align 8
 ; CHECK-NEXT: ret void
 ;
@@ -114,12 +114,12 @@ define void @call_bad_strto(ptr %psi32, ptr %psi64) {
 ; CHECK-NEXT: [[LR:%.*]] = call i32 @strtol(ptr nonnull @a, ptr null)
 ; CHECK-NEXT: store i32 [[LR]], ptr [[PSI32:%.*]], align 4
 ; CHECK-NEXT: [[ULR:%.*]] = call i32 @strtoul(ptr nonnull @a, ptr null)
-; CHECK-NEXT: [[PS1:%.*]] = getelementptr i32, ptr [[PSI32]], i64 1
+; CHECK-NEXT: [[PS1:%.*]] = getelementptr i8, ptr [[PSI32]], i64 4
 ; CHECK-NEXT: store i32 [[ULR]], ptr [[PS1]], align 4
 ; CHECK-NEXT: [[LLR:%.*]] = call i64 @strtoll(ptr nonnull @a, ptr null)
 ; CHECK-NEXT: store i64 [[LLR]], ptr [[PSI64:%.*]], align 4
 ; CHECK-NEXT: [[ULLR:%.*]] = call i64 @strtoull(ptr nonnull @a, ptr null)
-; CHECK-NEXT: [[PS3:%.*]] = getelementptr i64, ptr [[PSI64]], i64 3
+; CHECK-NEXT: [[PS3:%.*]] = getelementptr i8, ptr [[PSI64]], i64 24
 ; CHECK-NEXT: store i64 [[ULLR]], ptr [[PS3]], align 4
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/InstCombine/strcall-no-nul.ll b/llvm/test/Transforms/InstCombine/strcall-no-nul.ll
index fef06a03639c7..30221ad5b0962 100644
--- a/llvm/test/Transforms/InstCombine/strcall-no-nul.ll
+++ b/llvm/test/Transforms/InstCombine/strcall-no-nul.ll
@@ -62,7 +62,7 @@ define ptr @fold_strchr_past_end() {
 define void @fold_strcmp_past_end(ptr %pcmp) {
 ; CHECK-LABEL: @fold_strcmp_past_end(
 ; CHECK-NEXT: store i32 1, ptr [[PCMP:%.*]], align 4
-; CHECK-NEXT: [[PC50:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1
+; CHECK-NEXT: [[PC50:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4
 ; CHECK-NEXT: store i32 -1, ptr [[PC50]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -84,7 +84,7 @@ define void @fold_strcmp_past_end(ptr %pcmp) {
 define void @fold_strncmp_past_end(ptr %pcmp) {
 ; CHECK-LABEL: @fold_strncmp_past_end(
 ; CHECK-NEXT: store i32 1, ptr [[PCMP:%.*]], align 4
-; CHECK-NEXT: [[PC50:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1
+; CHECK-NEXT: [[PC50:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4
 ; CHECK-NEXT: store i32 -1, ptr [[PC50]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -118,7 +118,7 @@ define ptr @fold_strrchr_past_end(i32 %c) {
 define void @fold_strstr_past_end(ptr %psub) {
 ; CHECK-LABEL: @fold_strstr_past_end(
 ; CHECK-NEXT: store ptr @a5, ptr [[PSUB:%.*]], align 8
-; CHECK-NEXT: [[PS50:%.*]] = getelementptr ptr, ptr [[PSUB]], i64 1
+; CHECK-NEXT: [[PS50:%.*]] = getelementptr i8, ptr [[PSUB]], i64 8
 ; CHECK-NEXT: store ptr null, ptr [[PS50]], align 8
 ; CHECK-NEXT: ret void
 ;
@@ -202,7 +202,7 @@ define ptr @fold_strncpy_past_end(ptr %dst) {
 define void @fold_strpbrk_past_end(ptr %psub) {
 ; CHECK-LABEL: @fold_strpbrk_past_end(
 ; CHECK-NEXT: store ptr null, ptr [[PSUB:%.*]], align 8
-; CHECK-NEXT: [[PS50:%.*]] = getelementptr ptr, ptr [[PSUB]], i64 1
+; CHECK-NEXT: [[PS50:%.*]] = getelementptr i8, ptr [[PSUB]], i64 8
 ; CHECK-NEXT: store ptr null, ptr [[PS50]], align 8
 ; CHECK-NEXT: ret void
 ;
@@ -224,7 +224,7 @@ define void @fold_strpbrk_past_end(ptr %psub) {
 define void @fold_strspn_past_end(ptr %poff) {
 ; CHECK-LABEL: @fold_strspn_past_end(
 ; CHECK-NEXT: store i64 0, ptr [[POFF:%.*]], align 4
-; CHECK-NEXT: [[PO50:%.*]] = getelementptr i64, ptr [[POFF]], i64 1
+; CHECK-NEXT: [[PO50:%.*]] = getelementptr i8, ptr [[POFF]], i64 8
 ; CHECK-NEXT: store i64 0, ptr [[PO50]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -246,7 +246,7 @@ define void @fold_strspn_past_end(ptr %poff) {
 define void @fold_strcspn_past_end(ptr %poff) {
 ; CHECK-LABEL: @fold_strcspn_past_end(
 ; CHECK-NEXT: store i64 2, ptr [[POFF:%.*]], align 4
-; CHECK-NEXT: [[PO50:%.*]] = getelementptr i64, ptr [[POFF]], i64 1
+; CHECK-NEXT: [[PO50:%.*]] = getelementptr i8, ptr [[POFF]], i64 8
 ; CHECK-NEXT: store i64 0, ptr [[PO50]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -285,19 +285,19 @@ define void @fold_atol_strtol_past_end(ptr %ps) {
 ; CHECK-NEXT: [[I0:%.*]] = call i64 @atol(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0))
 ; CHECK-NEXT: store i64 [[I0]], ptr [[PS:%.*]], align 4
 ; CHECK-NEXT: [[I1:%.*]] = call i64 @atoll(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0))
-; CHECK-NEXT: [[P1:%.*]] = getelementptr i64, ptr [[PS]], i64 1
+; CHECK-NEXT: [[P1:%.*]] = getelementptr i8, ptr [[PS]], i64 8
 ; CHECK-NEXT: store i64 [[I1]], ptr [[P1]], align 4
 ; CHECK-NEXT: [[I2:%.*]] = call i64 @strtol(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0), ptr null, i32 0)
-; CHECK-NEXT: [[P2:%.*]] = getelementptr i64, ptr [[PS]], i64 2
+; CHECK-NEXT: [[P2:%.*]] = getelementptr i8, ptr [[PS]], i64 16
 ; CHECK-NEXT: store i64 [[I2]], ptr [[P2]], align 4
 ; CHECK-NEXT: [[I3:%.*]] = call i64 @strtoul(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0), ptr null, i32 8)
-; CHECK-NEXT: [[P3:%.*]] = getelementptr i64, ptr [[PS]], i64 3
+; CHECK-NEXT: [[P3:%.*]] = getelementptr i8, ptr [[PS]], i64 24
 ; CHECK-NEXT: store i64 [[I3]], ptr [[P3]], align 4
 ; CHECK-NEXT: [[I4:%.*]] = call i64 @strtoll(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0), ptr null, i32 10)
-; CHECK-NEXT: [[P4:%.*]] = getelementptr i64, ptr [[PS]], i64 4
+; CHECK-NEXT: [[P4:%.*]] = getelementptr i8, ptr [[PS]], i64 32
 ; CHECK-NEXT: store i64 [[I4]], ptr [[P4]], align 4
 ; CHECK-NEXT: [[I5:%.*]] = call i64 @strtoul(ptr nocapture nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0), ptr null, i32 16)
-; CHECK-NEXT: [[P5:%.*]] = getelementptr i64, ptr [[PS]], i64 5
+; CHECK-NEXT: [[P5:%.*]] = getelementptr i8, ptr [[PS]], i64 40
 ; CHECK-NEXT: store i64 [[I5]], ptr [[P5]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -336,7 +336,7 @@ define void
@fold_atol_strtol_past_end(ptr %ps) { define void @fold_sprintf_past_end(ptr %pcnt, ptr %dst) { ; CHECK-LABEL: @fold_sprintf_past_end( ; CHECK-NEXT: store i32 0, ptr [[PCNT:%.*]], align 4 -; CHECK-NEXT: [[PN05:%.*]] = getelementptr i32, ptr [[PCNT]], i64 1 +; CHECK-NEXT: [[PN05:%.*]] = getelementptr i8, ptr [[PCNT]], i64 4 ; CHECK-NEXT: store i32 0, ptr [[PN05]], align 4 ; CHECK-NEXT: ret void ; @@ -361,7 +361,7 @@ define void @fold_snprintf_past_end(ptr %pcnt, ptr %dst, i64 %n) { ; CHECK-NEXT: [[N5_:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr [[DST:%.*]], i64 [[N:%.*]], ptr nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0)) ; CHECK-NEXT: store i32 [[N5_]], ptr [[PCNT:%.*]], align 4 ; CHECK-NEXT: [[N05:%.*]] = call i32 (ptr, i64, ptr, ...) @snprintf(ptr [[DST]], i64 [[N]], ptr nonnull @a5, ptr nonnull getelementptr inbounds ([5 x i8], ptr @a5, i64 1, i64 0)) -; CHECK-NEXT: [[PN05:%.*]] = getelementptr i32, ptr [[PCNT]], i64 1 +; CHECK-NEXT: [[PN05:%.*]] = getelementptr i8, ptr [[PCNT]], i64 4 ; CHECK-NEXT: store i32 [[N05]], ptr [[PN05]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/strlen-7.ll b/llvm/test/Transforms/InstCombine/strlen-7.ll index 1a12a791329b5..6eb7885abd041 100644 --- a/llvm/test/Transforms/InstCombine/strlen-7.ll +++ b/llvm/test/Transforms/InstCombine/strlen-7.ll @@ -16,39 +16,39 @@ declare i64 @strlen(ptr) define void @fold_strlen_A(ptr %plen) { ; CHECK-LABEL: @fold_strlen_A( ; CHECK-NEXT: store i64 1, ptr [[PLEN:%.*]], align 4 -; CHECK-NEXT: [[PLEN1:%.*]] = getelementptr i64, ptr [[PLEN]], i64 1 +; CHECK-NEXT: [[PLEN1:%.*]] = getelementptr i8, ptr [[PLEN]], i64 8 ; CHECK-NEXT: store i64 0, ptr [[PLEN1]], align 4 -; CHECK-NEXT: [[PLEN2:%.*]] = getelementptr i64, ptr [[PLEN]], i64 2 +; CHECK-NEXT: [[PLEN2:%.*]] = getelementptr i8, ptr [[PLEN]], i64 16 ; CHECK-NEXT: store i64 0, ptr [[PLEN2]], align 4 -; CHECK-NEXT: [[PLEN3:%.*]] = getelementptr i64, ptr [[PLEN]], i64 3 +; CHECK-NEXT: [[PLEN3:%.*]] = getelementptr i8, ptr [[PLEN]], i64 24 ; CHECK-NEXT: store i64 0, ptr [[PLEN3]], align 4 -; CHECK-NEXT: [[PLEN4:%.*]] = getelementptr i64, ptr [[PLEN]], i64 4 +; CHECK-NEXT: [[PLEN4:%.*]] = getelementptr i8, ptr [[PLEN]], i64 32 ; CHECK-NEXT: store i64 2, ptr [[PLEN4]], align 4 -; CHECK-NEXT: [[PLEN5:%.*]] = getelementptr i64, ptr [[PLEN]], i64 5 +; CHECK-NEXT: [[PLEN5:%.*]] = getelementptr i8, ptr [[PLEN]], i64 40 ; CHECK-NEXT: store i64 1, ptr [[PLEN5]], align 4 -; CHECK-NEXT: [[PLEN6:%.*]] = getelementptr i64, ptr [[PLEN]], i64 6 +; CHECK-NEXT: [[PLEN6:%.*]] = getelementptr i8, ptr [[PLEN]], i64 48 ; CHECK-NEXT: store i64 0, ptr [[PLEN6]], align 4 -; CHECK-NEXT: [[PLEN7:%.*]] = getelementptr i64, ptr [[PLEN]], i64 7 +; CHECK-NEXT: [[PLEN7:%.*]] = getelementptr i8, ptr [[PLEN]], i64 56 ; CHECK-NEXT: store i64 0, ptr [[PLEN7]], align 4 -; CHECK-NEXT: [[PLEN8:%.*]] = getelementptr i64, ptr [[PLEN]], i64 8 +; CHECK-NEXT: [[PLEN8:%.*]] = getelementptr i8, ptr [[PLEN]], i64 64 ; CHECK-NEXT: store i64 0, ptr [[PLEN8]], align 4 -; CHECK-NEXT: [[PLEN9:%.*]] = getelementptr i64, ptr [[PLEN]], i64 9 +; CHECK-NEXT: [[PLEN9:%.*]] = getelementptr i8, ptr [[PLEN]], i64 72 ; CHECK-NEXT: store i64 3, ptr [[PLEN9]], align 4 -; CHECK-NEXT: [[PLEN10:%.*]] = getelementptr i64, ptr [[PLEN]], i64 10 +; CHECK-NEXT: [[PLEN10:%.*]] = getelementptr i8, ptr [[PLEN]], i64 80 ; CHECK-NEXT: store i64 2, ptr [[PLEN10]], align 4 -; CHECK-NEXT: [[PLEN11:%.*]] = getelementptr i64, ptr [[PLEN]], i64 11 +; CHECK-NEXT: [[PLEN11:%.*]] = getelementptr i8, 
ptr [[PLEN]], i64 88 ; CHECK-NEXT: store i64 1, ptr [[PLEN11]], align 4 -; CHECK-NEXT: [[PLEN12:%.*]] = getelementptr i64, ptr [[PLEN]], i64 12 +; CHECK-NEXT: [[PLEN12:%.*]] = getelementptr i8, ptr [[PLEN]], i64 96 ; CHECK-NEXT: store i64 0, ptr [[PLEN12]], align 4 -; CHECK-NEXT: [[PLEN14:%.*]] = getelementptr i64, ptr [[PLEN]], i64 14 +; CHECK-NEXT: [[PLEN14:%.*]] = getelementptr i8, ptr [[PLEN]], i64 112 ; CHECK-NEXT: store i64 4, ptr [[PLEN14]], align 4 -; CHECK-NEXT: [[PLEN15:%.*]] = getelementptr i64, ptr [[PLEN]], i64 15 +; CHECK-NEXT: [[PLEN15:%.*]] = getelementptr i8, ptr [[PLEN]], i64 120 ; CHECK-NEXT: store i64 3, ptr [[PLEN15]], align 4 -; CHECK-NEXT: [[PLEN16:%.*]] = getelementptr i64, ptr [[PLEN]], i64 16 +; CHECK-NEXT: [[PLEN16:%.*]] = getelementptr i8, ptr [[PLEN]], i64 128 ; CHECK-NEXT: store i64 2, ptr [[PLEN16]], align 4 -; CHECK-NEXT: [[PLEN17:%.*]] = getelementptr i64, ptr [[PLEN]], i64 17 +; CHECK-NEXT: [[PLEN17:%.*]] = getelementptr i8, ptr [[PLEN]], i64 136 ; CHECK-NEXT: store i64 1, ptr [[PLEN17]], align 4 -; CHECK-NEXT: [[PLEN18:%.*]] = getelementptr i64, ptr [[PLEN]], i64 18 +; CHECK-NEXT: [[PLEN18:%.*]] = getelementptr i8, ptr [[PLEN]], i64 144 ; CHECK-NEXT: store i64 0, ptr [[PLEN18]], align 4 ; CHECK-NEXT: ret void ; @@ -172,15 +172,15 @@ define void @fold_strlen_A_pI(ptr %plen, i64 %I) { ; CHECK-NEXT: store i64 [[LENA0A]], ptr [[PLEN:%.*]], align 4 ; CHECK-NEXT: [[PA0B:%.*]] = getelementptr [2 x %struct.A], ptr @a, i64 0, i64 0, i32 1, i64 [[I]] ; CHECK-NEXT: [[LENA0B:%.*]] = call i64 @strlen(ptr noundef nonnull dereferenceable(1) [[PA0B]]) -; CHECK-NEXT: [[PLEN1:%.*]] = getelementptr i64, ptr [[PLEN]], i64 1 +; CHECK-NEXT: [[PLEN1:%.*]] = getelementptr i8, ptr [[PLEN]], i64 8 ; CHECK-NEXT: store i64 [[LENA0B]], ptr [[PLEN1]], align 4 ; CHECK-NEXT: [[PA1A:%.*]] = getelementptr [2 x %struct.A], ptr @a, i64 0, i64 1, i32 0, i64 [[I]] ; CHECK-NEXT: [[LENA1A:%.*]] = call i64 @strlen(ptr noundef nonnull dereferenceable(1) [[PA1A]]) -; CHECK-NEXT: [[PLEN2:%.*]] = getelementptr i64, ptr [[PLEN]], i64 2 +; CHECK-NEXT: [[PLEN2:%.*]] = getelementptr i8, ptr [[PLEN]], i64 16 ; CHECK-NEXT: store i64 [[LENA1A]], ptr [[PLEN2]], align 4 ; CHECK-NEXT: [[PA1B:%.*]] = getelementptr [2 x %struct.A], ptr @a, i64 0, i64 1, i32 1, i64 [[I]] ; CHECK-NEXT: [[LENA1B:%.*]] = call i64 @strlen(ptr noundef nonnull dereferenceable(1) [[PA1B]]) -; CHECK-NEXT: [[PLEN3:%.*]] = getelementptr i64, ptr [[PLEN]], i64 3 +; CHECK-NEXT: [[PLEN3:%.*]] = getelementptr i8, ptr [[PLEN]], i64 24 ; CHECK-NEXT: store i64 [[LENA1B]], ptr [[PLEN3]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/strlen-9.ll b/llvm/test/Transforms/InstCombine/strlen-9.ll index 4e0d728fb184e..e29e426c6a6a3 100644 --- a/llvm/test/Transforms/InstCombine/strlen-9.ll +++ b/llvm/test/Transforms/InstCombine/strlen-9.ll @@ -20,23 +20,23 @@ declare i64 @strlen(ptr) define void @fold_strlen_no_nul(ptr %plen, i32 %i) { ; CHECK-LABEL: @fold_strlen_no_nul( ; CHECK-NEXT: store i64 5, ptr [[PLEN:%.*]], align 4 -; CHECK-NEXT: [[PNA5_P5:%.*]] = getelementptr i64, ptr [[PLEN]], i64 1 +; CHECK-NEXT: [[PNA5_P5:%.*]] = getelementptr i8, ptr [[PLEN]], i64 8 ; CHECK-NEXT: store i64 0, ptr [[PNA5_P5]], align 4 -; CHECK-NEXT: [[PNS5_P6:%.*]] = getelementptr i64, ptr [[PLEN]], i64 2 +; CHECK-NEXT: [[PNS5_P6:%.*]] = getelementptr i8, ptr [[PLEN]], i64 16 ; CHECK-NEXT: store i64 0, ptr [[PNS5_P6]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = sext i32 [[I:%.*]] to i64 ; CHECK-NEXT: [[PA5_PI:%.*]] = getelementptr [5 x i8], ptr 
@a5, i64 0, i64 [[TMP1]] ; CHECK-NEXT: [[NA5_PI:%.*]] = call i64 @strlen(ptr noundef nonnull dereferenceable(1) [[PA5_PI]]) -; CHECK-NEXT: [[PNA5_PI:%.*]] = getelementptr i64, ptr [[PLEN]], i64 3 +; CHECK-NEXT: [[PNA5_PI:%.*]] = getelementptr i8, ptr [[PLEN]], i64 24 ; CHECK-NEXT: store i64 [[NA5_PI]], ptr [[PNA5_PI]], align 4 -; CHECK-NEXT: [[PNZ0_P0:%.*]] = getelementptr i64, ptr [[PLEN]], i64 4 +; CHECK-NEXT: [[PNZ0_P0:%.*]] = getelementptr i8, ptr [[PLEN]], i64 32 ; CHECK-NEXT: store i64 0, ptr [[PNZ0_P0]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = sext i32 [[I]] to i64 ; CHECK-NEXT: [[PZ0_PI:%.*]] = getelementptr [0 x i8], ptr @z0, i64 0, i64 [[TMP2]] ; CHECK-NEXT: [[NZ0_PI:%.*]] = call i64 @strlen(ptr noundef nonnull dereferenceable(1) [[PZ0_PI]]) -; CHECK-NEXT: [[PNZ0_PI:%.*]] = getelementptr i64, ptr [[PLEN]], i64 5 +; CHECK-NEXT: [[PNZ0_PI:%.*]] = getelementptr i8, ptr [[PLEN]], i64 40 ; CHECK-NEXT: store i64 [[NZ0_PI]], ptr [[PNZ0_PI]], align 4 -; CHECK-NEXT: [[PNZ5_P5:%.*]] = getelementptr i64, ptr [[PLEN]], i64 6 +; CHECK-NEXT: [[PNZ5_P5:%.*]] = getelementptr i8, ptr [[PLEN]], i64 48 ; CHECK-NEXT: store i64 0, ptr [[PNZ5_P5]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/strncmp-4.ll b/llvm/test/Transforms/InstCombine/strncmp-4.ll index 0e019dad3a02a..b22650176b0d0 100644 --- a/llvm/test/Transforms/InstCombine/strncmp-4.ll +++ b/llvm/test/Transforms/InstCombine/strncmp-4.ll @@ -13,23 +13,23 @@ declare i32 @strncmp(ptr, ptr, i64) define void @fold_strncmp_Aa_b(ptr %pcmp) { ; CHECK-LABEL: @fold_strncmp_Aa_b( ; CHECK-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 -; CHECK-NEXT: [[PCMP1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[PCMP1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 0, ptr [[PCMP1]], align 4 -; CHECK-NEXT: [[PCMP2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[PCMP2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 0, ptr [[PCMP2]], align 4 -; CHECK-NEXT: [[PCMP3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[PCMP3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 0, ptr [[PCMP3]], align 4 -; CHECK-NEXT: [[PCMP4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; CHECK-NEXT: [[PCMP4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; CHECK-NEXT: store i32 0, ptr [[PCMP4]], align 4 -; CHECK-NEXT: [[PCMP5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; CHECK-NEXT: [[PCMP5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; CHECK-NEXT: store i32 0, ptr [[PCMP5]], align 4 -; CHECK-NEXT: [[PCMP6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 6 +; CHECK-NEXT: [[PCMP6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; CHECK-NEXT: store i32 0, ptr [[PCMP6]], align 4 -; CHECK-NEXT: [[PCMP7:%.*]] = getelementptr i32, ptr [[PCMP]], i64 7 +; CHECK-NEXT: [[PCMP7:%.*]] = getelementptr i8, ptr [[PCMP]], i64 28 ; CHECK-NEXT: store i32 -1, ptr [[PCMP7]], align 4 -; CHECK-NEXT: [[PCMP8:%.*]] = getelementptr i32, ptr [[PCMP]], i64 8 +; CHECK-NEXT: [[PCMP8:%.*]] = getelementptr i8, ptr [[PCMP]], i64 32 ; CHECK-NEXT: store i32 -1, ptr [[PCMP8]], align 4 -; CHECK-NEXT: [[PCMP9:%.*]] = getelementptr i32, ptr [[PCMP]], i64 9 +; CHECK-NEXT: [[PCMP9:%.*]] = getelementptr i8, ptr [[PCMP]], i64 36 ; CHECK-NEXT: store i32 -1, ptr [[PCMP9]], align 4 ; CHECK-NEXT: ret void ; @@ -98,15 +98,15 @@ define void @fold_strncmp_Aa_b(ptr %pcmp) { define void @fold_strncmp_Ab_a(ptr %pcmp) { ; CHECK-LABEL: @fold_strncmp_Ab_a( ; CHECK-NEXT: store i32 0, ptr [[PCMP:%.*]], 
align 4 -; CHECK-NEXT: [[PCMP1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[PCMP1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 0, ptr [[PCMP1]], align 4 -; CHECK-NEXT: [[PCMP2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[PCMP2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 0, ptr [[PCMP2]], align 4 -; CHECK-NEXT: [[PCMP3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[PCMP3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 0, ptr [[PCMP3]], align 4 -; CHECK-NEXT: [[PCMP4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; CHECK-NEXT: [[PCMP4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; CHECK-NEXT: store i32 1, ptr [[PCMP4]], align 4 -; CHECK-NEXT: [[PCMP5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; CHECK-NEXT: [[PCMP5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; CHECK-NEXT: store i32 1, ptr [[PCMP5]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/strncmp-5.ll b/llvm/test/Transforms/InstCombine/strncmp-5.ll index 1065091fc4c71..4c6585ef2a983 100644 --- a/llvm/test/Transforms/InstCombine/strncmp-5.ll +++ b/llvm/test/Transforms/InstCombine/strncmp-5.ll @@ -21,25 +21,25 @@ define void @fold_strncmp_a_b_n(ptr %pcmp, i64 %n) { ; CHECK-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[N:%.*]], 0 ; CHECK-NEXT: [[C0_1:%.*]] = sext i1 [[TMP1]] to i32 -; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 [[C0_1]], ptr [[S0_1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_2:%.*]] = sext i1 [[TMP2]] to i32 -; CHECK-NEXT: [[S0_2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[S0_2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 [[C0_2]], ptr [[S0_2]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_3:%.*]] = sext i1 [[TMP3]] to i32 -; CHECK-NEXT: [[S0_3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[S0_3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 [[C0_3]], ptr [[S0_3]], align 4 -; CHECK-NEXT: [[S0_4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; CHECK-NEXT: [[S0_4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; CHECK-NEXT: store i32 0, ptr [[S0_4]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_5:%.*]] = sext i1 [[TMP4]] to i32 -; CHECK-NEXT: [[S0_5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; CHECK-NEXT: [[S0_5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; CHECK-NEXT: store i32 [[C0_5]], ptr [[S0_5]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C5_0:%.*]] = zext i1 [[TMP5]] to i32 -; CHECK-NEXT: [[S5_0:%.*]] = getelementptr i32, ptr [[PCMP]], i64 6 +; CHECK-NEXT: [[S5_0:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; CHECK-NEXT: store i32 [[C5_0]], ptr [[S5_0]], align 4 ; CHECK-NEXT: ret void ; @@ -117,23 +117,23 @@ define void @fold_strncmp_a_c_n(ptr %pcmp, i64 %n) { ; CHECK-NEXT: store i32 [[C0_0]], ptr [[PCMP:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_1:%.*]] = sext i1 [[TMP2]] to i32 -; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 [[C0_1]], ptr [[S0_1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: 
[[C0_2:%.*]] = sext i1 [[TMP3]] to i32 -; CHECK-NEXT: [[S0_2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[S0_2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 [[C0_2]], ptr [[S0_2]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_3:%.*]] = sext i1 [[TMP4]] to i32 -; CHECK-NEXT: [[S0_3:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[S0_3:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 [[C0_3]], ptr [[S0_3]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ugt i64 [[N]], 3 ; CHECK-NEXT: [[C0_4:%.*]] = sext i1 [[TMP5]] to i32 -; CHECK-NEXT: [[S0_4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; CHECK-NEXT: [[S0_4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; CHECK-NEXT: store i32 [[C0_4]], ptr [[S0_4]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ugt i64 [[N]], 3 ; CHECK-NEXT: [[C0_5:%.*]] = sext i1 [[TMP6]] to i32 -; CHECK-NEXT: [[S0_5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; CHECK-NEXT: [[S0_5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; CHECK-NEXT: store i32 [[C0_5]], ptr [[S0_5]], align 4 ; CHECK-NEXT: ret void ; @@ -188,27 +188,27 @@ define void @fold_strncmp_a_d_n(ptr %pcmp, i64 %n) { ; CHECK-NEXT: store i32 [[C0_0]], ptr [[PCMP:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C0_1:%.*]] = sext i1 [[TMP2]] to i32 -; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 [[C0_1]], ptr [[S0_1]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ugt i64 [[N]], 3 ; CHECK-NEXT: [[C1_1:%.*]] = zext i1 [[TMP3]] to i32 -; CHECK-NEXT: [[S1_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[S1_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 [[C1_1]], ptr [[S1_1]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ugt i64 [[N]], 2 ; CHECK-NEXT: [[C2_2:%.*]] = zext i1 [[TMP4]] to i32 -; CHECK-NEXT: [[S2_2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[S2_2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 [[C2_2]], ptr [[S2_2]], align 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C4_4:%.*]] = zext i1 [[TMP5]] to i32 -; CHECK-NEXT: [[S4_4:%.*]] = getelementptr i32, ptr [[PCMP]], i64 4 +; CHECK-NEXT: [[S4_4:%.*]] = getelementptr i8, ptr [[PCMP]], i64 16 ; CHECK-NEXT: store i32 [[C4_4]], ptr [[S4_4]], align 4 ; CHECK-NEXT: [[TMP6:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C4_4_2:%.*]] = sext i1 [[TMP6]] to i32 -; CHECK-NEXT: [[S4_4_2:%.*]] = getelementptr i32, ptr [[PCMP]], i64 5 +; CHECK-NEXT: [[S4_4_2:%.*]] = getelementptr i8, ptr [[PCMP]], i64 20 ; CHECK-NEXT: store i32 [[C4_4_2]], ptr [[S4_4_2]], align 4 -; CHECK-NEXT: [[S5_5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 6 +; CHECK-NEXT: [[S5_5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 24 ; CHECK-NEXT: store i32 0, ptr [[S5_5]], align 4 -; CHECK-NEXT: [[S6_6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 7 +; CHECK-NEXT: [[S6_6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 28 ; CHECK-NEXT: store i32 0, ptr [[S6_6]], align 4 ; CHECK-NEXT: ret void ; @@ -301,13 +301,13 @@ define void @fold_strncmp_d_e_n(ptr %pcmp, i64 %n) { ; CHECK-NEXT: store i32 0, ptr [[PCMP:%.*]], align 4 ; CHECK-NEXT: [[TMP1:%.*]] = icmp ne i64 [[N:%.*]], 0 ; CHECK-NEXT: [[C0_1:%.*]] = zext i1 [[TMP1]] to i32 -; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[S0_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; 
CHECK-NEXT: store i32 [[C0_1]], ptr [[S0_1]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[C1_0:%.*]] = sext i1 [[TMP2]] to i32 -; CHECK-NEXT: [[S1_0:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[S1_0:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 [[C1_0]], ptr [[S1_0]], align 4 -; CHECK-NEXT: [[S1_1:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[S1_1:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 0, ptr [[S1_1]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/strncmp-6.ll b/llvm/test/Transforms/InstCombine/strncmp-6.ll index e4ab9e78583f4..1fb462be2221f 100644 --- a/llvm/test/Transforms/InstCombine/strncmp-6.ll +++ b/llvm/test/Transforms/InstCombine/strncmp-6.ll @@ -16,11 +16,11 @@ declare i32 @strncmp(ptr, ptr, i64) define void @fold_strncmp_cst_cst(ptr %pcmp) { ; CHECK-LABEL: @fold_strncmp_cst_cst( ; CHECK-NEXT: store i32 -1, ptr [[PCMP:%.*]], align 4 -; CHECK-NEXT: [[SB5_A5:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[SB5_A5:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 1, ptr [[SB5_A5]], align 4 -; CHECK-NEXT: [[SA6_B6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[SA6_B6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 -1, ptr [[SA6_B6]], align 4 -; CHECK-NEXT: [[SB6_A6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[SB6_A6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 1, ptr [[SB6_A6]], align 4 ; CHECK-NEXT: ret void ; @@ -63,15 +63,15 @@ define void @fold_strncmp_cst_var(ptr %pcmp, i64 %n) { ; CHECK-NEXT: store i32 [[CA0_B0]], ptr [[PCMP:%.*]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = icmp ugt i64 [[N]], 6 ; CHECK-NEXT: [[CB0_A0:%.*]] = zext i1 [[TMP2]] to i32 -; CHECK-NEXT: [[SB0_A0:%.*]] = getelementptr i32, ptr [[PCMP]], i64 1 +; CHECK-NEXT: [[SB0_A0:%.*]] = getelementptr i8, ptr [[PCMP]], i64 4 ; CHECK-NEXT: store i32 [[CB0_A0]], ptr [[SB0_A0]], align 4 ; CHECK-NEXT: [[TMP3:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[CA6_B6:%.*]] = sext i1 [[TMP3]] to i32 -; CHECK-NEXT: [[SA6_B6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 2 +; CHECK-NEXT: [[SA6_B6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 8 ; CHECK-NEXT: store i32 [[CA6_B6]], ptr [[SA6_B6]], align 4 ; CHECK-NEXT: [[TMP4:%.*]] = icmp ne i64 [[N]], 0 ; CHECK-NEXT: [[CB6_A6:%.*]] = zext i1 [[TMP4]] to i32 -; CHECK-NEXT: [[SB6_A6:%.*]] = getelementptr i32, ptr [[PCMP]], i64 3 +; CHECK-NEXT: [[SB6_A6:%.*]] = getelementptr i8, ptr [[PCMP]], i64 12 ; CHECK-NEXT: store i32 [[CB6_A6]], ptr [[SB6_A6]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll index 1fceca321da12..494cdf62c7975 100644 --- a/llvm/test/Transforms/InstCombine/sub.ll +++ b/llvm/test/Transforms/InstCombine/sub.ll @@ -1124,7 +1124,7 @@ define i64 @test58(ptr %foo, i64 %i, i64 %j) { define i64 @test59(ptr %foo, i64 %i) { ; CHECK-LABEL: @test59( ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO:%.*]], i64 0, i64 42, i64 [[I:%.*]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO]], i64 0, i64 42, i64 0 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4200 ; CHECK-NEXT: store ptr [[GEP1]], ptr @dummy_global1, align 8 ; CHECK-NEXT: store ptr [[GEP2]], ptr @dummy_global2, align 8 ; CHECK-NEXT: ret i64 [[I]] @@ -1143,7 +1143,7 @@ define i64 
@test59(ptr %foo, i64 %i) { define i64 @test60(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test60( ; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO:%.*]], i64 0, i64 [[J:%.*]], i64 [[I:%.*]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO]], i64 0, i64 42, i64 0 +; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[FOO]], i64 4200 ; CHECK-NEXT: [[CAST1:%.*]] = ptrtoint ptr [[GEP1]] to i64 ; CHECK-NEXT: [[CAST2:%.*]] = ptrtoint ptr [[GEP2]] to i64 ; CHECK-NEXT: [[SUB:%.*]] = sub i64 [[CAST1]], [[CAST2]] @@ -1162,7 +1162,7 @@ define i64 @test60(ptr %foo, i64 %i, i64 %j) { define i64 @test61(ptr %foo, i64 %i, i64 %j) { ; CHECK-LABEL: @test61( -; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO:%.*]], i64 0, i64 42, i64 0 +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i8, ptr [[FOO:%.*]], i64 4200 ; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [100 x [100 x i8]], ptr [[FOO]], i64 0, i64 [[J:%.*]], i64 [[I:%.*]] ; CHECK-NEXT: [[CAST1:%.*]] = ptrtoint ptr [[GEP1]] to i64 ; CHECK-NEXT: [[CAST2:%.*]] = ptrtoint ptr [[GEP2]] to i64 diff --git a/llvm/test/Transforms/InstCombine/unpack-fca.ll b/llvm/test/Transforms/InstCombine/unpack-fca.ll index 4ac78b9e32ce9..b0cd3823dcca9 100644 --- a/llvm/test/Transforms/InstCombine/unpack-fca.ll +++ b/llvm/test/Transforms/InstCombine/unpack-fca.ll @@ -24,7 +24,7 @@ define void @storeA(ptr %a.ptr) { define void @storeB(ptr %b.ptr) { ; CHECK-LABEL: @storeB( ; CHECK-NEXT: store ptr null, ptr [[B_PTR:%.*]], align 8 -; CHECK-NEXT: [[B_PTR_REPACK1:%.*]] = getelementptr inbounds [[B:%.*]], ptr [[B_PTR]], i64 0, i32 1 +; CHECK-NEXT: [[B_PTR_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[B_PTR]], i64 8 ; CHECK-NEXT: store i64 42, ptr [[B_PTR_REPACK1]], align 8 ; CHECK-NEXT: ret void ; @@ -76,14 +76,14 @@ define void @storeArrayOfB(ptr %ab.ptr, [2 x %B] %ab) { ; CHECK-NEXT: [[AB_ELT:%.*]] = extractvalue [2 x %B] [[AB:%.*]], 0 ; CHECK-NEXT: [[AB_ELT_ELT:%.*]] = extractvalue [[B:%.*]] [[AB_ELT]], 0 ; CHECK-NEXT: store ptr [[AB_ELT_ELT]], ptr [[AB_PTR:%.*]], align 8 -; CHECK-NEXT: [[AB_PTR_REPACK3:%.*]] = getelementptr inbounds [[B]], ptr [[AB_PTR]], i64 0, i32 1 +; CHECK-NEXT: [[AB_PTR_REPACK3:%.*]] = getelementptr inbounds i8, ptr [[AB_PTR]], i64 8 ; CHECK-NEXT: [[AB_ELT_ELT4:%.*]] = extractvalue [[B]] [[AB_ELT]], 1 ; CHECK-NEXT: store i64 [[AB_ELT_ELT4]], ptr [[AB_PTR_REPACK3]], align 8 -; CHECK-NEXT: [[AB_PTR_REPACK1:%.*]] = getelementptr inbounds [2 x %B], ptr [[AB_PTR]], i64 0, i64 1 +; CHECK-NEXT: [[AB_PTR_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[AB_PTR]], i64 16 ; CHECK-NEXT: [[AB_ELT2:%.*]] = extractvalue [2 x %B] [[AB]], 1 ; CHECK-NEXT: [[AB_ELT2_ELT:%.*]] = extractvalue [[B]] [[AB_ELT2]], 0 ; CHECK-NEXT: store ptr [[AB_ELT2_ELT]], ptr [[AB_PTR_REPACK1]], align 8 -; CHECK-NEXT: [[AB_PTR_REPACK1_REPACK5:%.*]] = getelementptr inbounds [2 x %B], ptr [[AB_PTR]], i64 0, i64 1, i32 1 +; CHECK-NEXT: [[AB_PTR_REPACK1_REPACK5:%.*]] = getelementptr inbounds i8, ptr [[AB_PTR]], i64 24 ; CHECK-NEXT: [[AB_ELT2_ELT6:%.*]] = extractvalue [[B]] [[AB_ELT2]], 1 ; CHECK-NEXT: store i64 [[AB_ELT2_ELT6]], ptr [[AB_PTR_REPACK1_REPACK5]], align 8 ; CHECK-NEXT: ret void @@ -106,7 +106,7 @@ define %B @loadB(ptr %b.ptr) { ; CHECK-LABEL: @loadB( ; CHECK-NEXT: [[DOTUNPACK:%.*]] = load ptr, ptr [[B_PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[B:%.*]] poison, ptr [[DOTUNPACK]], 0 -; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds [[B]], ptr 
[[B_PTR]], i64 0, i32 1 +; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds i8, ptr [[B_PTR]], i64 8 ; CHECK-NEXT: [[DOTUNPACK2:%.*]] = load i64, ptr [[DOTELT1]], align 8 ; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [[B]] [[TMP1]], i64 [[DOTUNPACK2]], 1 ; CHECK-NEXT: ret [[B]] [[TMP2]] @@ -162,9 +162,9 @@ define { %A } @structOfA(ptr %sa.ptr) { define %B @structB(ptr %b.ptr) { ; CHECK-LABEL: @structB( ; CHECK-NEXT: store ptr null, ptr [[B_PTR:%.*]], align 8 -; CHECK-NEXT: [[B_PTR_REPACK1:%.*]] = getelementptr inbounds [[B:%.*]], ptr [[B_PTR]], i64 0, i32 1 +; CHECK-NEXT: [[B_PTR_REPACK1:%.*]] = getelementptr inbounds i8, ptr [[B_PTR]], i64 8 ; CHECK-NEXT: store i64 42, ptr [[B_PTR_REPACK1]], align 8 -; CHECK-NEXT: ret [[B]] { ptr null, i64 42 } +; CHECK-NEXT: ret [[B:%.*]] { ptr null, i64 42 } ; store %B { ptr null, i64 42 }, ptr %b.ptr, align 8 %1 = load %B, ptr %b.ptr, align 8 @@ -175,14 +175,14 @@ define [2 x %B] @loadArrayOfB(ptr %ab.ptr) { ; CHECK-LABEL: @loadArrayOfB( ; CHECK-NEXT: [[DOTUNPACK_UNPACK:%.*]] = load ptr, ptr [[AB_PTR:%.*]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = insertvalue [[B:%.*]] poison, ptr [[DOTUNPACK_UNPACK]], 0 -; CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds [[B]], ptr [[AB_PTR]], i64 0, i32 1 +; CHECK-NEXT: [[DOTUNPACK_ELT3:%.*]] = getelementptr inbounds i8, ptr [[AB_PTR]], i64 8 ; CHECK-NEXT: [[DOTUNPACK_UNPACK4:%.*]] = load i64, ptr [[DOTUNPACK_ELT3]], align 8 ; CHECK-NEXT: [[DOTUNPACK5:%.*]] = insertvalue [[B]] [[TMP1]], i64 [[DOTUNPACK_UNPACK4]], 1 ; CHECK-NEXT: [[TMP2:%.*]] = insertvalue [2 x %B] poison, [[B]] [[DOTUNPACK5]], 0 -; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds [2 x %B], ptr [[AB_PTR]], i64 0, i64 1 +; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds i8, ptr [[AB_PTR]], i64 16 ; CHECK-NEXT: [[DOTUNPACK2_UNPACK:%.*]] = load ptr, ptr [[DOTELT1]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = insertvalue [[B]] poison, ptr [[DOTUNPACK2_UNPACK]], 0 -; CHECK-NEXT: [[DOTUNPACK2_ELT6:%.*]] = getelementptr inbounds [2 x %B], ptr [[AB_PTR]], i64 0, i64 1, i32 1 +; CHECK-NEXT: [[DOTUNPACK2_ELT6:%.*]] = getelementptr inbounds i8, ptr [[AB_PTR]], i64 24 ; CHECK-NEXT: [[DOTUNPACK2_UNPACK7:%.*]] = load i64, ptr [[DOTUNPACK2_ELT6]], align 8 ; CHECK-NEXT: [[DOTUNPACK28:%.*]] = insertvalue [[B]] [[TMP3]], i64 [[DOTUNPACK2_UNPACK7]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = insertvalue [2 x %B] [[TMP2]], [[B]] [[DOTUNPACK28]], 1 @@ -207,7 +207,7 @@ define [2000 x %B] @loadLargeArrayOfB(ptr %ab.ptr) { ; Make sure that we do not increase alignment of packed struct element define i32 @packed_alignment(ptr dereferenceable(9) %s) { ; CHECK-LABEL: @packed_alignment( -; CHECK-NEXT: [[TV_ELT1:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], ptr [[S:%.*]], i64 0, i32 1, i32 1 +; CHECK-NEXT: [[TV_ELT1:%.*]] = getelementptr inbounds i8, ptr [[S:%.*]], i64 5 ; CHECK-NEXT: [[TV_UNPACK2:%.*]] = load i32, ptr [[TV_ELT1]], align 1 ; CHECK-NEXT: ret i32 [[TV_UNPACK2]] ; @@ -222,38 +222,38 @@ define i32 @packed_alignment(ptr dereferenceable(9) %s) { define void @check_alignment(ptr %u, ptr %v) { ; CHECK-LABEL: @check_alignment( ; CHECK-NEXT: [[DOTUNPACK:%.*]] = load i8, ptr [[U:%.*]], align 8 -; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds [[STRUCT_U:%.*]], ptr [[U]], i64 0, i32 1 +; CHECK-NEXT: [[DOTELT1:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 1 ; CHECK-NEXT: [[DOTUNPACK2:%.*]] = load i8, ptr [[DOTELT1]], align 1 -; CHECK-NEXT: [[DOTELT3:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[U]], i64 0, i32 2 +; CHECK-NEXT: [[DOTELT3:%.*]] = getelementptr 
inbounds i8, ptr [[U]], i64 2 ; CHECK-NEXT: [[DOTUNPACK4:%.*]] = load i8, ptr [[DOTELT3]], align 2 -; CHECK-NEXT: [[DOTELT5:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[U]], i64 0, i32 3 +; CHECK-NEXT: [[DOTELT5:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 3 ; CHECK-NEXT: [[DOTUNPACK6:%.*]] = load i8, ptr [[DOTELT5]], align 1 -; CHECK-NEXT: [[DOTELT7:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[U]], i64 0, i32 4 +; CHECK-NEXT: [[DOTELT7:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 4 ; CHECK-NEXT: [[DOTUNPACK8:%.*]] = load i8, ptr [[DOTELT7]], align 4 -; CHECK-NEXT: [[DOTELT9:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[U]], i64 0, i32 5 +; CHECK-NEXT: [[DOTELT9:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 5 ; CHECK-NEXT: [[DOTUNPACK10:%.*]] = load i8, ptr [[DOTELT9]], align 1 -; CHECK-NEXT: [[DOTELT11:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[U]], i64 0, i32 6 +; CHECK-NEXT: [[DOTELT11:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 6 ; CHECK-NEXT: [[DOTUNPACK12:%.*]] = load i8, ptr [[DOTELT11]], align 2 -; CHECK-NEXT: [[DOTELT13:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[U]], i64 0, i32 7 +; CHECK-NEXT: [[DOTELT13:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 7 ; CHECK-NEXT: [[DOTUNPACK14:%.*]] = load i8, ptr [[DOTELT13]], align 1 -; CHECK-NEXT: [[DOTELT15:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[U]], i64 0, i32 8 +; CHECK-NEXT: [[DOTELT15:%.*]] = getelementptr inbounds i8, ptr [[U]], i64 8 ; CHECK-NEXT: [[DOTUNPACK16:%.*]] = load i64, ptr [[DOTELT15]], align 8 ; CHECK-NEXT: store i8 [[DOTUNPACK]], ptr [[V:%.*]], align 8 -; CHECK-NEXT: [[V_REPACK17:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[V]], i64 0, i32 1 +; CHECK-NEXT: [[V_REPACK17:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 1 ; CHECK-NEXT: store i8 [[DOTUNPACK2]], ptr [[V_REPACK17]], align 1 -; CHECK-NEXT: [[V_REPACK19:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[V]], i64 0, i32 2 +; CHECK-NEXT: [[V_REPACK19:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 2 ; CHECK-NEXT: store i8 [[DOTUNPACK4]], ptr [[V_REPACK19]], align 2 -; CHECK-NEXT: [[V_REPACK21:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[V]], i64 0, i32 3 +; CHECK-NEXT: [[V_REPACK21:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 3 ; CHECK-NEXT: store i8 [[DOTUNPACK6]], ptr [[V_REPACK21]], align 1 -; CHECK-NEXT: [[V_REPACK23:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[V]], i64 0, i32 4 +; CHECK-NEXT: [[V_REPACK23:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 4 ; CHECK-NEXT: store i8 [[DOTUNPACK8]], ptr [[V_REPACK23]], align 4 -; CHECK-NEXT: [[V_REPACK25:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[V]], i64 0, i32 5 +; CHECK-NEXT: [[V_REPACK25:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 5 ; CHECK-NEXT: store i8 [[DOTUNPACK10]], ptr [[V_REPACK25]], align 1 -; CHECK-NEXT: [[V_REPACK27:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[V]], i64 0, i32 6 +; CHECK-NEXT: [[V_REPACK27:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 6 ; CHECK-NEXT: store i8 [[DOTUNPACK12]], ptr [[V_REPACK27]], align 2 -; CHECK-NEXT: [[V_REPACK29:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[V]], i64 0, i32 7 +; CHECK-NEXT: [[V_REPACK29:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 7 ; CHECK-NEXT: store i8 [[DOTUNPACK14]], ptr [[V_REPACK29]], align 1 -; CHECK-NEXT: [[V_REPACK31:%.*]] = getelementptr inbounds [[STRUCT_U]], ptr [[V]], i64 0, i32 8 +; CHECK-NEXT: [[V_REPACK31:%.*]] = getelementptr inbounds i8, ptr [[V]], i64 8 ; CHECK-NEXT: store i64 [[DOTUNPACK16]], ptr 
[[V_REPACK31]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll index 25f5c39c2c948..738ef1bc1ad2f 100644 --- a/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts-inseltpoison.ll @@ -512,7 +512,7 @@ define ptr @gep_vbase_w_s_idx(<2 x ptr> %base, i64 %index) { define ptr @gep_splat_base_w_s_idx(ptr %base) { ; CHECK-LABEL: @gep_splat_base_w_s_idx( -; CHECK-NEXT: [[EE:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 1 +; CHECK-NEXT: [[EE:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 4 ; CHECK-NEXT: ret ptr [[EE]] ; %basevec1 = insertelement <2 x ptr> poison, ptr %base, i32 0 @@ -576,7 +576,7 @@ define ptr @gep_cvbase_w_cv_idx(<2 x ptr> %base, i64 %raw_addr) { define ptr @gep_sbase_w_cv_idx(ptr %base) { ; CHECK-LABEL: @gep_sbase_w_cv_idx( -; CHECK-NEXT: [[EE:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 1 +; CHECK-NEXT: [[EE:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 4 ; CHECK-NEXT: ret ptr [[EE]] ; %gep = getelementptr i32, ptr %base, <2 x i64> diff --git a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll index c84d39c5aa9dd..4e5fdbb1eab99 100644 --- a/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll +++ b/llvm/test/Transforms/InstCombine/vec_demanded_elts.ll @@ -515,7 +515,7 @@ define ptr @gep_vbase_w_s_idx(<2 x ptr> %base, i64 %index) { define ptr @gep_splat_base_w_s_idx(ptr %base) { ; CHECK-LABEL: @gep_splat_base_w_s_idx( -; CHECK-NEXT: [[EE:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 1 +; CHECK-NEXT: [[EE:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 4 ; CHECK-NEXT: ret ptr [[EE]] ; %basevec1 = insertelement <2 x ptr> undef, ptr %base, i32 0 @@ -579,7 +579,7 @@ define ptr @gep_cvbase_w_cv_idx(<2 x ptr> %base, i64 %raw_addr) { define ptr @gep_sbase_w_cv_idx(ptr %base) { ; CHECK-LABEL: @gep_sbase_w_cv_idx( -; CHECK-NEXT: [[EE:%.*]] = getelementptr i32, ptr [[BASE:%.*]], i64 1 +; CHECK-NEXT: [[EE:%.*]] = getelementptr i8, ptr [[BASE:%.*]], i64 4 ; CHECK-NEXT: ret ptr [[EE]] ; %gep = getelementptr i32, ptr %base, <2 x i64> diff --git a/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg-inseltpoison.ll index 40b76d140b5a1..4ac1d084cda9a 100644 --- a/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg-inseltpoison.ll +++ b/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg-inseltpoison.ll @@ -3,7 +3,7 @@ define <4 x ptr> @PR41270(ptr %x) { ; CHECK-LABEL: @PR41270( -; CHECK-NEXT: [[T3:%.*]] = getelementptr inbounds [4 x i16], ptr [[X:%.*]], i64 0, i64 3 +; CHECK-NEXT: [[T3:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 6 ; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x ptr> poison, ptr [[T3]], i64 0 ; CHECK-NEXT: ret <4 x ptr> [[INS2]] ; diff --git a/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg.ll b/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg.ll index 4e4fa7defa8cf..8b44213333542 100644 --- a/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg.ll +++ b/llvm/test/Transforms/InstCombine/vec_gep_scalar_arg.ll @@ -3,7 +3,7 @@ define <4 x ptr> @PR41270(ptr %x) { ; CHECK-LABEL: @PR41270( -; CHECK-NEXT: [[T3:%.*]] = getelementptr inbounds [4 x i16], ptr [[X:%.*]], i64 0, i64 3 +; CHECK-NEXT: [[T3:%.*]] = getelementptr inbounds i8, ptr [[X:%.*]], i64 6 ; CHECK-NEXT: [[INS2:%.*]] = insertelement <4 x ptr> , ptr [[T3]], i64 0 ; 
CHECK-NEXT: ret <4 x ptr> [[INS2]] ; diff --git a/llvm/test/Transforms/InstCombine/vscale_gep.ll b/llvm/test/Transforms/InstCombine/vscale_gep.ll index 2a1865f69fe30..534888dc1a1c4 100644 --- a/llvm/test/Transforms/InstCombine/vscale_gep.ll +++ b/llvm/test/Transforms/InstCombine/vscale_gep.ll @@ -40,7 +40,7 @@ define void @gep_bitcast(ptr %p) { define i32 @gep_alloca_inbounds_vscale_zero() { ; CHECK-LABEL: @gep_alloca_inbounds_vscale_zero( ; CHECK-NEXT: [[A:%.*]] = alloca <vscale x 4 x i32>, align 16 -; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds <vscale x 4 x i32>, ptr [[A]], i64 0, i64 2 +; CHECK-NEXT: [[TMP:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[LOAD:%.*]] = load i32, ptr [[TMP]], align 4 ; CHECK-NEXT: ret i32 [[LOAD]] ; diff --git a/llvm/test/Transforms/InstCombine/wcslen-5.ll b/llvm/test/Transforms/InstCombine/wcslen-5.ll index 29874e7de5928..3e0cdf34a6422 100644 --- a/llvm/test/Transforms/InstCombine/wcslen-5.ll +++ b/llvm/test/Transforms/InstCombine/wcslen-5.ll @@ -42,7 +42,7 @@ define dso_local i64 @fold_wcslen_s3_pi_p1_s5(i1 zeroext %0, i64 %1) { ; XFAIL-CHECK-NEXT: ret i64 [[SEL]] ; CHECK-LABEL: @fold_wcslen_s3_pi_p1_s5( ; CHECK-NEXT: [[PS3_PI:%.*]] = getelementptr inbounds [4 x i32], ptr @ws3, i64 0, i64 [[TMP1:%.*]] -; CHECK-NEXT: [[PS3_PI_P1:%.*]] = getelementptr inbounds i32, ptr [[PS3_PI]], i64 1 +; CHECK-NEXT: [[PS3_PI_P1:%.*]] = getelementptr inbounds i8, ptr [[PS3_PI]], i64 4 ; CHECK-NEXT: [[SEL:%.*]] = select i1 [[TMP0:%.*]], ptr [[PS3_PI_P1]], ptr @ws5 ; CHECK-NEXT: [[LEN:%.*]] = tail call i64 @wcslen(ptr nonnull [[SEL]]) ; CHECK-NEXT: ret i64 [[LEN]] diff --git a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll index b364116e78e4f..141af3e66949a 100644 --- a/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll +++ b/llvm/test/Transforms/LoopUnroll/ARM/upperbound.ll @@ -17,7 +17,7 @@ define void @test(ptr %x, i32 %n) { ; CHECK-NEXT: store i32 0, ptr [[X]], align 4 ; CHECK-NEXT: br label [[IF_END]] ; CHECK: if.end: -; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, ptr [[X]], i32 4 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[REM]], 1 ; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY_1:%.*]] ; CHECK: while.body.1: @@ -28,7 +28,7 @@ define void @test(ptr %x, i32 %n) { ; CHECK-NEXT: store i32 0, ptr [[INCDEC_PTR]], align 4 ; CHECK-NEXT: br label [[IF_END_1]] ; CHECK: if.end.1: -; CHECK-NEXT: [[INCDEC_PTR_1:%.*]] = getelementptr inbounds i32, ptr [[X]], i32 2 +; CHECK-NEXT: [[INCDEC_PTR_1:%.*]] = getelementptr inbounds i8, ptr [[X]], i32 8 ; CHECK-NEXT: [[CMP_1:%.*]] = icmp sgt i32 [[REM]], 2 ; CHECK-NEXT: br i1 [[CMP_1]], label [[WHILE_BODY_2:%.*]], label [[WHILE_END]] ; CHECK: while.body.2: diff --git a/llvm/test/Transforms/LoopUnroll/peel-loop.ll b/llvm/test/Transforms/LoopUnroll/peel-loop.ll index 88ffe60075f9b..a76facae3cb9d 100644 --- a/llvm/test/Transforms/LoopUnroll/peel-loop.ll +++ b/llvm/test/Transforms/LoopUnroll/peel-loop.ll @@ -16,20 +16,20 @@ define void @basic(ptr %p, i32 %k) #0 { ; CHECK-NEXT: [[CMP_PEEL_NOT:%.*]] = icmp eq i32 [[K]], 1 ; CHECK-NEXT: br i1 [[CMP_PEEL_NOT]], label [[FOR_END]], label [[FOR_BODY_PEEL2:%.*]] ; CHECK: for.body.peel2: -; CHECK-NEXT: [[INCDEC_PTR_PEEL:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR_PEEL:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 4 ; CHECK-NEXT: store i32 1, ptr [[INCDEC_PTR_PEEL]], align 4 ; CHECK-NEXT: 
[[CMP_PEEL5:%.*]] = icmp sgt i32 [[K]], 2 ; CHECK-NEXT: br i1 [[CMP_PEEL5]], label [[FOR_BODY_PEEL7:%.*]], label [[FOR_END]] ; CHECK: for.body.peel7: -; CHECK-NEXT: [[INCDEC_PTR_PEEL3:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 2 -; CHECK-NEXT: [[INCDEC_PTR_PEEL8:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 3 +; CHECK-NEXT: [[INCDEC_PTR_PEEL3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 +; CHECK-NEXT: [[INCDEC_PTR_PEEL8:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 ; CHECK-NEXT: store i32 2, ptr [[INCDEC_PTR_PEEL3]], align 4 ; CHECK-NEXT: [[CMP_PEEL10_NOT:%.*]] = icmp eq i32 [[K]], 3 ; CHECK-NEXT: br i1 [[CMP_PEEL10_NOT]], label [[FOR_END]], label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 3, [[FOR_BODY_PEEL7]] ] ; CHECK-NEXT: [[P_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[INCDEC_PTR_PEEL8]], [[FOR_BODY_PEEL7]] ] -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[P_ADDR_04]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_ADDR_04]], i64 4 ; CHECK-NEXT: store i32 [[I_05]], ptr [[P_ADDR_04]], align 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_05]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[K]] @@ -76,20 +76,20 @@ define i32 @output(ptr %p, i32 %k) #0 { ; CHECK-NEXT: [[CMP_PEEL_NOT:%.*]] = icmp eq i32 [[K]], 1 ; CHECK-NEXT: br i1 [[CMP_PEEL_NOT]], label [[FOR_END]], label [[FOR_BODY_PEEL2:%.*]] ; CHECK: for.body.peel2: -; CHECK-NEXT: [[INCDEC_PTR_PEEL:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR_PEEL:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 4 ; CHECK-NEXT: store i32 1, ptr [[INCDEC_PTR_PEEL]], align 4 ; CHECK-NEXT: [[CMP_PEEL5:%.*]] = icmp sgt i32 [[K]], 2 ; CHECK-NEXT: br i1 [[CMP_PEEL5]], label [[FOR_BODY_PEEL7:%.*]], label [[FOR_END]] ; CHECK: for.body.peel7: -; CHECK-NEXT: [[INCDEC_PTR_PEEL3:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 2 -; CHECK-NEXT: [[INCDEC_PTR_PEEL8:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 3 +; CHECK-NEXT: [[INCDEC_PTR_PEEL3:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 +; CHECK-NEXT: [[INCDEC_PTR_PEEL8:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 ; CHECK-NEXT: store i32 2, ptr [[INCDEC_PTR_PEEL3]], align 4 ; CHECK-NEXT: [[CMP_PEEL10_NOT:%.*]] = icmp eq i32 [[K]], 3 ; CHECK-NEXT: br i1 [[CMP_PEEL10_NOT]], label [[FOR_END]], label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I_05:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 3, [[FOR_BODY_PEEL7]] ] ; CHECK-NEXT: [[P_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[INCDEC_PTR_PEEL8]], [[FOR_BODY_PEEL7]] ] -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[P_ADDR_04]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[P_ADDR_04]], i64 4 ; CHECK-NEXT: store i32 [[I_05]], ptr [[P_ADDR_04]], align 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_05]], 1 ; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[K]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll index bcf5c1888786a..03f055413583f 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/deterministic-type-shrinkage.ll @@ -459,7 +459,7 @@ define void @old_and_new_size_equalko(ptr noalias %src, ptr noalias %dst) { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ 
[[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; CHECK-NEXT: store <4 x i32> , ptr [[TMP1]], align 4 ; CHECK-NEXT: store <4 x i32> , ptr [[TMP2]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll index 95dc010811e6c..b772a3814a64a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/intrinsiccost.ll @@ -38,12 +38,12 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[NEXT_GEP6:%.*]] = getelementptr i8, ptr [[PDST]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 8 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <8 x i16>, ptr [[NEXT_GEP]], align 2 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <8 x i16>, ptr [[TMP5]], align 2 ; CHECK-NEXT: [[TMP6:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD]], <8 x i16> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[TMP7:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> [[WIDE_LOAD8]], <8 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i16, ptr [[NEXT_GEP6]], i64 8 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[NEXT_GEP6]], i64 16 ; CHECK-NEXT: store <8 x i16> [[TMP6]], ptr [[NEXT_GEP6]], align 2 ; CHECK-NEXT: store <8 x i16> [[TMP7]], ptr [[TMP8]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 @@ -61,10 +61,10 @@ define void @saddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PSRC_ADDR_08]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 2 ; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 ; CHECK-NEXT: [[TMP11:%.*]] = tail call i16 @llvm.sadd.sat.i16(i16 [[TMP10]], i16 [[OFFSET]]) -; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, ptr [[PDST_ADDR_07]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, ptr [[PDST_ADDR_07]], i64 2 ; CHECK-NEXT: store i16 [[TMP11]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll index 7848d0836d546..6f62f2f2096f1 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-cond-inv-loads.ll @@ -173,7 +173,7 @@ define void @invariant_load_cond(ptr noalias nocapture %a, ptr nocapture 
readonl ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 42 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 168 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x ptr> poison, ptr [[TMP5]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x ptr> [[BROADCAST_SPLATINSERT]], <vscale x 4 x ptr> poison, <vscale x 4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[COND:%.*]], i64 [[INDEX]] @@ -201,7 +201,7 @@ define void @invariant_load_cond(ptr noalias nocapture %a, ptr nocapture readonl ; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[TMP12]], 0 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[IF_THEN:%.*]] ; CHECK: if.then: -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 42 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 168 ; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[IV]] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX3]], align 4 ; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll index fa8e35f4697e9..16718bb41751b 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll @@ -26,8 +26,8 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[D:%.*]], i64 0 @@ -36,19 +36,19 @@ define void @test_array_load2_store2(i32 %C, i32 %D) #1 { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = add nsw <vscale x 4 x i32> [[TMP1]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5:%.*]] = mul nsw <vscale x 4 x i32> [[TMP2]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP3]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i64 -1 -; CHECK-NEXT: 
[[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP4]], <vscale x 4 x i32> [[TMP5]]) -; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP7]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP9]] +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = add nsw <vscale x 4 x i32> [[TMP3]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP7:%.*]] = mul nsw <vscale x 4 x i32> [[TMP4]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP5]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 -4 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[TMP6]], <vscale x 4 x i32> [[TMP7]]) +; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK: middle.block: @@ -106,13 +106,13 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 -; CHECK-NEXT: [[TMP0:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl <vscale x 4 x i64> [[TMP0]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl <vscale x 4 x i64> [[TMP2]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP5]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer @@ -121,22 +121,22 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, <vscale x 4 x i64> [[VEC_IND]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP4]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison) -; CHECK-NEXT: [[TMP5:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, 
i64 0, [[TMP5]] -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( [[TMP6]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP7:%.*]] = sext [[WIDE_MASKED_GATHER]] to -; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[BROADCAST_SPLAT]], [[TMP7]] -; CHECK-NEXT: [[TMP9:%.*]] = sext [[WIDE_MASKED_GATHER1]] to -; CHECK-NEXT: [[TMP10:%.*]] = mul nsw [[BROADCAST_SPLAT3]], [[TMP9]] -; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[TMP5]], i64 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 -1 -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP8]], [[TMP10]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP13]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, [[VEC_IND]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( [[TMP6]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = or disjoint [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, [[TMP7]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i16.nxv4p0( [[TMP8]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = sext [[WIDE_MASKED_GATHER]] to +; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[BROADCAST_SPLAT]], [[TMP9]] +; CHECK-NEXT: [[TMP11:%.*]] = sext [[WIDE_MASKED_GATHER1]] to +; CHECK-NEXT: [[TMP12:%.*]] = mul nsw [[BROADCAST_SPLAT3]], [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[TMP7]], i64 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP10]], [[TMP12]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] @@ -197,13 +197,13 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 -; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP1:%.*]] = shl [[TMP0]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP3]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl [[TMP2]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP5]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[C:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -212,23 +212,23 @@ define void @test_array_load2_store2_i16(i32 noundef %C, i32 noundef %D) #1 { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP1]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP4]], align 4 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i32], ptr @AB, i64 0, i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP6]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = or disjoint [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP8:%.*]] = add nsw [[TMP5]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP9:%.*]] = trunc [[TMP8]] to -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP9]], [[TMP10]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP11:%.*]] = mul nsw [[TMP6]], [[BROADCAST_SPLAT2]] -; CHECK-NEXT: [[TMP12:%.*]] = trunc [[TMP11]] to -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[TMP7]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP12]], [[TMP13]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = or disjoint [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[TMP7]], [[BROADCAST_SPLAT]] +; CHECK-NEXT: [[TMP11:%.*]] = trunc [[TMP10]] to +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP11]], [[TMP12]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP13:%.*]] = mul nsw 
[[TMP8]], [[BROADCAST_SPLAT2]] +; CHECK-NEXT: [[TMP14:%.*]] = trunc [[TMP13]] to +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [1024 x i16], ptr @CD_i16, i64 0, [[TMP9]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i16.nxv4p0( [[TMP14]], [[TMP15]], i32 2, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -277,42 +277,42 @@ define i32 @test_struct_load6(%struct.ST6* %S) #1 { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 -; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 2 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP2]], i64 0 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP4]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP0]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP14:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[S:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP3]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP4]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 2 -; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP5]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 3 -; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 4 -; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP7]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, 
zeroinitializer), poison) -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 5 -; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP9:%.*]] = add [[WIDE_MASKED_GATHER]], [[VEC_PHI]] -; CHECK-NEXT: [[TMP10:%.*]] = add [[TMP9]], [[WIDE_MASKED_GATHER2]] -; CHECK-NEXT: [[TMP11:%.*]] = add [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER3]] -; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP11]], [[WIDE_MASKED_GATHER4]] -; CHECK-NEXT: [[TMP13:%.*]] = add [[TMP12]], [[WIDE_MASKED_GATHER5]] -; CHECK-NEXT: [[TMP14]] = sub [[TMP10]], [[TMP13]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP2]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[STRUCT_ST6:%.*]], ptr [[S:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP5]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER1:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP6]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 2 +; CHECK-NEXT: [[WIDE_MASKED_GATHER2:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP7]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 3 +; CHECK-NEXT: [[WIDE_MASKED_GATHER3:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP8]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 4 +; CHECK-NEXT: [[WIDE_MASKED_GATHER4:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP9]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[STRUCT_ST6]], ptr [[S]], [[VEC_IND]], i32 5 +; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call @llvm.masked.gather.nxv4i32.nxv4p0( [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP11:%.*]] = add [[WIDE_MASKED_GATHER]], [[VEC_PHI]] +; CHECK-NEXT: [[TMP12:%.*]] = add [[TMP11]], [[WIDE_MASKED_GATHER2]] +; CHECK-NEXT: [[TMP13:%.*]] = add [[WIDE_MASKED_GATHER1]], [[WIDE_MASKED_GATHER3]] +; CHECK-NEXT: [[TMP14:%.*]] = add [[TMP13]], [[WIDE_MASKED_GATHER4]] +; CHECK-NEXT: [[TMP15:%.*]] = add [[TMP14]], [[WIDE_MASKED_GATHER5]] +; CHECK-NEXT: [[TMP16]] = sub [[TMP12]], [[TMP15]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP8:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP18:%.*]] = 
call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP14]]) +; CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -381,12 +381,12 @@ define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 2 -; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.experimental.stepvector.nxv4i32() -; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i32 1023, i64 0), poison, zeroinitializer), [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i32 [[TMP1]], -4 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i32() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i32 1023, i64 0), poison, zeroinitializer), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i32 [[TMP3]], -4 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[DOTNEG]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] @@ -394,31 +394,31 @@ define void @test_reversed_load2_store2(%struct.ST2* noalias nocapture readonly ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i32 [[TMP3]], 3 -; CHECK-NEXT: [[TMP5:%.*]] = sub nsw i32 2, [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = sext i32 [[TMP5]] to i64 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[TMP6]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP7]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 +; CHECK-NEXT: [[TMP5:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i32 [[TMP5]], 3 +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw i32 2, [[TMP6]] +; CHECK-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 [[TMP8]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP8]]) -; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP9]]) -; CHECK-NEXT: [[TMP10:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] -; CHECK-NEXT: [[TMP11:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds 
[[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.vscale.i32() -; CHECK-NEXT: [[TMP14:%.*]] = shl nuw nsw i32 [[TMP13]], 3 -; CHECK-NEXT: [[TMP15:%.*]] = sub nsw i32 1, [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = sext i32 [[TMP15]] to i64 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 [[TMP16]] -; CHECK-NEXT: [[REVERSE2:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP10]]) -; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP11]]) +; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[REVERSE:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP10]]) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[REVERSE1:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = add nsw [[REVERSE]], [[VEC_IND]] +; CHECK-NEXT: [[TMP13:%.*]] = sub nsw [[REVERSE1]], [[VEC_IND]] +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() +; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i32 [[TMP15]], 3 +; CHECK-NEXT: [[TMP17:%.*]] = sub nsw i32 1, [[TMP16]] +; CHECK-NEXT: [[TMP18:%.*]] = sext i32 [[TMP17]] to i64 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[TMP14]], i64 [[TMP18]] +; CHECK-NEXT: [[REVERSE2:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP12]]) +; CHECK-NEXT: [[REVERSE3:%.*]] = call @llvm.experimental.vector.reverse.nxv4i32( [[TMP13]]) ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[REVERSE2]], [[REVERSE3]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP17]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP19]] +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP19]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP20:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP20]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] @@ -475,21 +475,21 @@ define void @even_load_static_tc(i32* noalias nocapture readonly %A, i32* noalia ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: [[N_VEC:%.*]] = sub nuw nsw i64 512, [[TMP1]] ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw nsw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP4]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP4:%.*]] = shl nsw [[TMP3]], shufflevector ( insertelement ( 
poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP5:%.*]] = and i64 [[INDEX]], 9223372036854775804 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] -; CHECK-NEXT: store [[TMP4]], ptr [[TMP6]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = shl nsw [[TMP5]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP7:%.*]] = and i64 [[INDEX]], 9223372036854775804 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP7]] +; CHECK-NEXT: store [[TMP6]], ptr [[TMP8]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP3]] ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: middle.block: @@ -561,21 +561,21 @@ define void @even_load_dynamic_tc(i32* noalias nocapture readonly %A, i32* noali ; CHECK-NEXT: [[TMP9:%.*]] = select i1 [[TMP8]], i64 [[TMP6]], i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP9]] ; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 2 +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP12]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP12:%.*]] = shl nsw [[TMP11]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP13:%.*]] = and i64 [[INDEX]], 9223372036854775804 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP13]] -; CHECK-NEXT: store [[TMP12]], ptr [[TMP14]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]] +; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = shl nsw [[TMP13]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP15:%.*]] = and i64 [[INDEX]], 9223372036854775804 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP15]] +; CHECK-NEXT: store [[TMP14]], ptr [[TMP16]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] ; CHECK: middle.block: @@ -640,12 +640,12 @@ define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noal ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; 
CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 -; CHECK-NEXT: [[TMP0:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i64 1023, i64 0), poison, zeroinitializer), [[TMP0]] -; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP1]], -4 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[INDUCTION:%.*]] = sub shufflevector ( insertelement ( poison, i64 1023, i64 0), poison, zeroinitializer), [[TMP2]] +; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP3]], -4 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[DOTNEG]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[X:%.*]], i64 0 @@ -654,14 +654,14 @@ define void @load_gap_reverse(%pair* noalias nocapture readonly %P1, %pair* noal ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add nsw [[BROADCAST_SPLAT]], [[VEC_IND]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2:%.*]], [[VEC_IND]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i64.nxv4p0( [[TMP4]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) -; CHECK-NEXT: [[TMP5:%.*]] = sub nsw [[WIDE_MASKED_GATHER]], [[VEC_IND]] -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0( [[TMP2]], [[TMP3]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0( [[TMP5]], [[TMP4]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] +; CHECK-NEXT: [[TMP4:%.*]] = add nsw [[BROADCAST_SPLAT]], [[VEC_IND]] +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds [[PAIR:%.*]], ptr [[P1:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR]], ptr [[P2:%.*]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call @llvm.masked.gather.nxv4i64.nxv4p0( [[TMP6]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), poison) +; CHECK-NEXT: [[TMP7:%.*]] = sub nsw [[WIDE_MASKED_GATHER]], [[VEC_IND]] +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0( [[TMP4]], [[TMP5]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i64.nxv4p0( [[TMP7]], [[TMP6]], i32 8, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label 
[[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -709,28 +709,28 @@ define void @mixed_load2_store2(i32* noalias nocapture readonly %A, i32* noalias ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP0]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[OFFSET_IDX]] +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1 -; CHECK-NEXT: [[TMP4:%.*]] = mul nsw [[TMP2]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = mul nsw [[TMP4]], [[TMP3]] ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 -; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 -; CHECK-NEXT: [[TMP7:%.*]] = add nsw [[TMP6]], [[TMP5]] -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 -1 -; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP4]], [[TMP7]]) -; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP9]], align 4 -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 1 +; CHECK-NEXT: [[TMP9:%.*]] = add nsw [[TMP8]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP10]], i64 -4 +; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.experimental.vector.interleave2.nxv8i32( [[TMP6]], [[TMP9]]) +; CHECK-NEXT: store [[INTERLEAVED_VEC]], ptr [[TMP11]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 512 ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: @@ -802,33 +802,33 @@ define void @int_float_struct(%struct.IntFloat* nocapture readonly %p) #0 { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: -; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-NEXT: [[TMP0:%.*]] = call i64 
@llvm.vscale.i64() +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ insertelement ( zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP0]], align 4 +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ insertelement ( zeroinitializer, float undef, i32 0), [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi [ insertelement ( zeroinitializer, i32 undef, i32 0), [[VECTOR_PH]] ], [ [[TMP6:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_INTFLOAT:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP2]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP1:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; CHECK-NEXT: [[TMP3:%.*]] = bitcast [[TMP2]] to -; CHECK-NEXT: [[TMP4]] = add [[TMP1]], [[VEC_PHI1]] -; CHECK-NEXT: [[TMP5]] = fadd fast [[VEC_PHI]], [[TMP3]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = bitcast [[TMP4]] to +; CHECK-NEXT: [[TMP6]] = add [[TMP3]], [[VEC_PHI1]] +; CHECK-NEXT: [[TMP7]] = fadd fast [[VEC_PHI]], [[TMP5]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP10:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP5]]) -; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP4]]) +; CHECK-NEXT: [[TMP9:%.*]] = call fast float @llvm.vector.reduce.fadd.nxv4f32(float -0.000000e+00, [[TMP7]]) +; CHECK-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP6]]) ; CHECK-NEXT: br i1 true, label [[FOR_COND_CLEANUP:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.cond.cleanup: -; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ poison, [[FOR_BODY]] ], [ [[TMP10]], [[MIDDLE_BLOCK]] ] +; CHECK-NEXT: [[ADD3_LCSSA:%.*]] = phi float [ poison, [[FOR_BODY]] ], [ [[TMP9]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: store i32 [[ADD_LCSSA]], ptr @SA, align 4 ; CHECK-NEXT: store float [[ADD3_LCSSA]], ptr @SB, align 4 ; CHECK-NEXT: ret void @@ -892,28 +892,28 @@ define void @PR27626_0(%pair.i32 *%p, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 
[[SMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP12:%.*]] = extractelement [[TMP10]], i64 0 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP14:%.*]] = extractelement [[TMP12]], i64 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP14]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP13]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP15]], [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] @@ -979,35 +979,35 @@ define i32 @PR27626_1(%pair.i32 *%p, i64 %n) #1 { ; CHECK-NEXT: 
[[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP15:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP10]], align 4 +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP12]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP12]], [[TMP11]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[TMP13:%.*]] = extractelement [[TMP11]], i64 0 -; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP13]], align 4 +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP14]], [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[TMP15:%.*]] = extractelement [[TMP13]], i64 0 +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP15]], align 4 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC1]]) -; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 -; CHECK-NEXT: [[TMP15]] = add [[TMP14]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]] +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP17]] = add [[TMP16]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: 
[[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP15]]) +; CHECK-NEXT: [[TMP19:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP17]]) ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1074,28 +1074,28 @@ define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[Z:%.*]], i64 0 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP10]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP11]], align 4 +; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], [[VEC_IND]], i32 0 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 -8 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[VEC_IND]], i32 1 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[BROADCAST_SPLAT]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP13]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { , } 
[[STRIDED_VEC]], 0 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP13]], [[TMP12]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]] +; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP15]], [[TMP14]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] @@ -1107,7 +1107,7 @@ define void @PR27626_2(%pair.i32 *%p, i64 %n, i32 %z) #1 { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0 -; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0 +; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 -8 ; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 ; CHECK-NEXT: store i32 [[Z]], ptr [[P_I_X]], align 4 ; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[P_I_MINUS_1_X]], align 4 @@ -1164,36 +1164,36 @@ define i32 @PR27626_3(%pair.i32 *%p, i64 %n, i32 %z) #1 { ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 ; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 [[TMP3]], i64 [[N_MOD_VF]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub nsw i64 [[SMAX]], [[TMP6]] -; CHECK-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP18:%.*]] = shl nuw nsw i64 [[TMP17]], 2 -; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 2 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP11]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP16:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[TMP10]], i32 1 -; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP11]], align 4 
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP18:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP12:%.*]] = add nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds [[PAIR_I32:%.*]], ptr [[P:%.*]], i64 [[INDEX]], i32 0 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], [[TMP12]], i32 1 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load , ptr [[TMP13]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC]]) -; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 -; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP14]], [[TMP13]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) -; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP12]], align 4 +; CHECK-NEXT: [[TMP16:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 +; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0( [[TMP16]], [[TMP15]], i32 4, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_VEC1:%.*]] = load , ptr [[TMP14]], align 4 ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( [[WIDE_VEC1]]) -; CHECK-NEXT: [[TMP15:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 -; CHECK-NEXT: [[TMP16]] = add [[TMP15]], [[VEC_PHI]] -; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP18]] +; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[STRIDED_VEC2]], 0 +; CHECK-NEXT: [[TMP18]] = add [[TMP17]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: -; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP16]]) +; CHECK-NEXT: [[TMP20:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32( [[TMP18]]) ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] @@ -1267,13 +1267,13 @@ define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 { ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP5]], -4 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNEG]] ; CHECK-NEXT: [[IND_END:%.*]] = shl nuw i64 [[N_VEC]], 1 -; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.experimental.stepvector.nxv4i64() -; CHECK-NEXT: [[TMP7:%.*]] = shl [[TMP6]], shufflevector ( insertelement ( poison, i64 1, i64 0), poison, zeroinitializer) -; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP8]], 3 -; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i64 [[TMP9]], i64 0 +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.experimental.stepvector.nxv4i64() +; CHECK-NEXT: [[TMP9:%.*]] = shl [[TMP8]], 
shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP11:%.*]] = shl nuw nsw i64 [[TMP10]], 3
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP11]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -1284,16 +1284,16 @@ define void @PR27626_4(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP7]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
-; CHECK-NEXT: [[TMP10:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP10]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP11]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i64 -1
+; CHECK-NEXT: [[TMP12:%.*]] = or disjoint i64 [[OFFSET_IDX]], 1
+; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP12]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 -4
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
-; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP13]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP15]]
+; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP15]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]]
@@ -1363,14 +1363,14 @@ define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP2]], [[DOTNEG]]
 ; CHECK-NEXT: [[TMP6:%.*]] = shl nuw i64 [[N_VEC]], 1
 ; CHECK-NEXT: [[IND_END:%.*]] = or disjoint i64 [[TMP6]], 3
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP8:%.*]] = shl <vscale x 4 x i64> [[TMP7]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[TMP8]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 3, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 3
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP10]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP10:%.*]] = shl <vscale x 4 x i64> [[TMP9]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[INDUCTION:%.*]] = add <vscale x 4 x i64> [[TMP10]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 3, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 3
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP12]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[X:%.*]], i64 0
 ; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
@@ -1382,15 +1382,15 @@ define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP11:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 -1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP12:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 -3, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP11]]
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP12]]
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP14]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT4]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP17]]
+; CHECK-NEXT: [[TMP13:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 -1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP14:%.*]] = add <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 -3, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP13]]
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A]], <vscale x 4 x i64> [[TMP14]]
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP16]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x ptr> [[TMP17]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT4]], <vscale x 4 x ptr> [[TMP15]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP18:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP18]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]]
@@ -1404,9 +1404,9 @@ define void @PR27626_5(i32 *%a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]]
 ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i32, ptr [[A]], i64 [[I]]
-; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr i32, ptr [[TMP19]], i64 -1
+; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr i8, ptr [[TMP19]], i64 -4
 ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i32, ptr [[A]], i64 [[I]]
-; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr i32, ptr [[TMP20]], i64 -3
+; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr i8, ptr [[TMP20]], i64 -12
 ; CHECK-NEXT: store i32 [[X]], ptr [[A_I_MINUS_1]], align 4
 ; CHECK-NEXT: store i32 [[Y]], ptr [[A_I_MINUS_3]], align 4
 ; CHECK-NEXT: store i32 [[Z]], ptr [[A_I]], align 4
@@ -1473,38 +1473,38 @@ define void @PR34743(i16* %a, i32* %b, i64 %n) #1 {
 ; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP8]], -4
 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[TMP1]], [[DOTNEG]]
 ; CHECK-NEXT: [[IND_END:%.*]] = shl i64 [[N_VEC]], 1
-; CHECK-NEXT: [[TMP27:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP28:%.*]] = shl nuw nsw i64 [[TMP27]], 2
-; CHECK-NEXT: [[TMP9:%.*]] = call i32 @llvm.vscale.i32()
-; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i32 [[TMP9]], 2
-; CHECK-NEXT: [[TMP11:%.*]] = add nsw i32 [[TMP10]], -1
-; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP11]]
-; CHECK-NEXT: [[TMP12:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
-; CHECK-NEXT: [[TMP13:%.*]] = shl <vscale x 4 x i64> [[TMP12]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 3
-; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP15]], i64 0
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = call i32 @llvm.vscale.i32()
+; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i32 [[TMP11]], 2
+; CHECK-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], -1
+; CHECK-NEXT: [[VECTOR_RECUR_INIT:%.*]] = insertelement <vscale x 4 x i16> poison, i16 [[DOTPRE]], i32 [[TMP13]]
+; CHECK-NEXT: [[TMP14:%.*]] = call <vscale x 4 x i64> @llvm.experimental.stepvector.nxv4i64()
+; CHECK-NEXT: [[TMP15:%.*]] = shl <vscale x 4 x i64> [[TMP14]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 3
+; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 4 x i64> poison, i64 [[TMP17]], i64 0
 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 4 x i64> [[DOTSPLATINSERT]], <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <vscale x 4 x i16> [ [[VECTOR_RECUR_INIT]], [[VECTOR_PH]] ], [ [[WIDE_MASKED_GATHER4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP13]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP16:%.*]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP17:%.*]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[A]], <vscale x 4 x i64> [[TMP16]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP18]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison), !alias.scope [[META34:![0-9]+]]
-; CHECK-NEXT: [[TMP19:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[A]], <vscale x 4 x i64> [[TMP17]]
-; CHECK-NEXT: [[WIDE_MASKED_GATHER4]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison), !alias.scope [[META34]]
-; CHECK-NEXT: [[TMP21:%.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]], i32 -1)
-; CHECK-NEXT: [[TMP22:%.*]] = sext <vscale x 4 x i16> [[TMP21]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP23:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]] to <vscale x 4 x i32>
-; CHECK-NEXT: [[TMP24:%.*]] = mul nsw <vscale x 4 x i32> [[TMP22]], [[TMP19]]
-; CHECK-NEXT: [[TMP25:%.*]] = mul nsw <vscale x 4 x i32> [[TMP24]], [[TMP23]]
-; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP25]], ptr [[TMP26]], align 4, !alias.scope [[META37:![0-9]+]], !noalias [[META34]]
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP28]]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP15]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP18:%.*]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP19:%.*]] = add nuw nsw <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 2, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i16, ptr [[A]], <vscale x 4 x i64> [[TMP18]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP20]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison), !alias.scope [[META34:![0-9]+]]
+; CHECK-NEXT: [[TMP21:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP22:%.*]] = getelementptr inbounds i16, ptr [[A]], <vscale x 4 x i64> [[TMP19]]
+; CHECK-NEXT: [[WIDE_MASKED_GATHER4]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP22]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison), !alias.scope [[META34]]
+; CHECK-NEXT: [[TMP23:%.*]] = call <vscale x 4 x i16> @llvm.experimental.vector.splice.nxv4i16(<vscale x 4 x i16> [[VECTOR_RECUR]], <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]], i32 -1)
+; CHECK-NEXT: [[TMP24:%.*]] = sext <vscale x 4 x i16> [[TMP23]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP25:%.*]] = sext <vscale x 4 x i16> [[WIDE_MASKED_GATHER4]] to <vscale x 4 x i32>
+; CHECK-NEXT: [[TMP26:%.*]] = mul nsw <vscale x 4 x i32> [[TMP24]], [[TMP21]]
+; CHECK-NEXT: [[TMP27:%.*]] = mul nsw <vscale x 4 x i32> [[TMP26]], [[TMP25]]
+; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP27]], ptr [[TMP28]], align 4, !alias.scope [[META37:![0-9]+]], !noalias [[META34]]
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP10]]
 ; CHECK-NEXT: [[VEC_IND_NEXT]] = add <vscale x 4 x i64> [[VEC_IND]], [[DOTSPLAT]]
 ; CHECK-NEXT: [[TMP29:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP29]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP39:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
index 1682d0740d0f5..7226048c478d4 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll
@@ -27,43 +27,43 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[DOTNEG]], [[N]]
 ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[N_VEC]], 3
 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 [[TMP3]]
-; CHECK-NEXT: [[TMP25:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP26:%.*]] = shl nuw nsw i64 [[TMP25]], 3
+; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[TMP4]], 3
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[INDEX]], 3
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 5
-; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[INDEX]], 3
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP6]]
-; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP8]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 3
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP6]]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 5
+; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[INDEX]], 3
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[C]], i64 [[TMP8]]
+; CHECK-NEXT: [[NEXT_GEP2:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP9]]
 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <vscale x 8 x i32>, ptr [[NEXT_GEP]], align 4
 ; CHECK-NEXT: [[WIDE_VEC3:%.*]] = load <vscale x 8 x i32>, ptr [[NEXT_GEP2]], align 4
 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC]])
-; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
-; CHECK-NEXT: [[TMP10:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
+; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 0
+; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC]], 1
 ; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> [[WIDE_VEC3]])
-; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC4]], 0
-; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC4]], 1
-; CHECK-NEXT: [[TMP13:%.*]] = add nsw <vscale x 4 x i32> [[TMP9]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP14:%.*]] = add nsw <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP17:%.*]] = shl nuw nsw i64 [[TMP16]], 2
-; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[TMP17]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP13]], ptr [[TMP15]], align 4
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP14]], ptr [[TMP18]], align 4
-; CHECK-NEXT: [[TMP19:%.*]] = add nsw <vscale x 4 x i32> [[TMP10]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP20:%.*]] = add nsw <vscale x 4 x i32> [[TMP12]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP22:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP23:%.*]] = shl nuw nsw i64 [[TMP22]], 2
-; CHECK-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[TMP21]], i64 [[TMP23]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP19]], ptr [[TMP21]], align 4
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP20]], ptr [[TMP24]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP26]]
+; CHECK-NEXT: [[TMP13:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC4]], 0
+; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[STRIDED_VEC4]], 1
+; CHECK-NEXT: [[TMP15:%.*]] = add nsw <vscale x 4 x i32> [[TMP11]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP16:%.*]] = add nsw <vscale x 4 x i32> [[TMP13]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP19:%.*]] = shl nuw nsw i64 [[TMP18]], 2
+; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i64 [[TMP19]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP15]], ptr [[TMP17]], align 4
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP16]], ptr [[TMP20]], align 4
+; CHECK-NEXT: [[TMP21:%.*]] = add nsw <vscale x 4 x i32> [[TMP12]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP22:%.*]] = add nsw <vscale x 4 x i32> [[TMP14]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[INDEX]]
+; CHECK-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP25:%.*]] = shl nuw nsw i64 [[TMP24]], 2
+; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP23]], i64 [[TMP25]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP21]], ptr [[TMP23]], align 4
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP22]], ptr [[TMP26]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
 ; CHECK-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK: middle.block:
@@ -76,9 +76,9 @@ define void @widen_ptr_phi_unrolled(ptr noalias nocapture %a, ptr noalias nocapt
 ; CHECK: for.body:
 ; CHECK-NEXT: [[PTR_014:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
 ; CHECK-NEXT: [[I_013:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL1]], [[SCALAR_PH]] ]
-; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i32, ptr [[PTR_014]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR:%.*]] = getelementptr inbounds i8, ptr [[PTR_014]], i64 4
 ; CHECK-NEXT: [[TMP28:%.*]] = load i32, ptr [[PTR_014]], align 4
-; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, ptr [[PTR_014]], i64 2
+; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i8, ptr [[PTR_014]], i64 8
 ; CHECK-NEXT: [[TMP29:%.*]] = load i32, ptr [[INCDEC_PTR]], align 4
 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP28]], 1
 ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I_013]]
@@ -143,28 +143,28 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no
 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[SRC:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N_VEC]], 2
 ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[DST:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP16:%.*]] = shl nuw nsw i64 [[TMP15]], 3
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 3
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = shl i64 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP5]]
-; CHECK-NEXT: [[TMP6:%.*]] = shl i64 [[INDEX]], 2
-; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP6]]
-; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 2
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP7:%.*]] = shl i64 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 [[TMP7]]
+; CHECK-NEXT: [[TMP8:%.*]] = shl i64 [[INDEX]], 2
+; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP8]]
+; CHECK-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP10:%.*]] = shl nuw nsw i64 [[TMP9]], 2
+; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i64 [[TMP10]]
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i32>, ptr [[NEXT_GEP]], align 4
-; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i32>, ptr [[TMP9]], align 4
-; CHECK-NEXT: [[TMP10:%.*]] = shl nsw <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP11:%.*]] = shl nsw <vscale x 4 x i32> [[WIDE_LOAD7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 2
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i32, ptr [[NEXT_GEP5]], i64 [[TMP13]]
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP10]], ptr [[NEXT_GEP5]], align 4
-; CHECK-NEXT: store <vscale x 4 x i32> [[TMP11]], ptr [[TMP14]], align 4
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP16]]
+; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <vscale x 4 x i32>, ptr [[TMP11]], align 4
+; CHECK-NEXT: [[TMP12:%.*]] = shl nsw <vscale x 4 x i32> [[WIDE_LOAD]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP13:%.*]] = shl nsw <vscale x 4 x i32> [[WIDE_LOAD7]], shufflevector (<vscale x 4 x i32> insertelement (<vscale x 4 x i32> poison, i32 1, i64 0), <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 2
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i32, ptr [[NEXT_GEP5]], i64 [[TMP15]]
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP12]], ptr [[NEXT_GEP5]], align 4
+; CHECK-NEXT: store <vscale x 4 x i32> [[TMP13]], ptr [[TMP16]], align 4
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
 ; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]]
 ; CHECK: middle.block:
@@ -182,8 +182,8 @@ define void @widen_2ptrs_phi_unrolled(ptr noalias nocapture %dst, ptr noalias no
 ; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[S_010]], align 4
 ; CHECK-NEXT: [[MUL:%.*]] = shl nsw i32 [[TMP18]], 1
 ; CHECK-NEXT: store i32 [[MUL]], ptr [[D_09]], align 4
-; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[D_09]], i64 1
-; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i32, ptr [[S_010]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[D_09]], i64 4
+; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i8, ptr [[S_010]], i64 4
 ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_011]], 1
 ; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[N]]
 ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]]
@@ -234,30 +234,30 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[TMP3]]
 ; CHECK-NEXT: [[TMP4:%.*]] = shl i64 [[N_VEC]], 3
 ; CHECK-NEXT: [[IND_END2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 [[TMP4]]
-; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP13:%.*]] = shl nuw nsw i64 [[TMP12]], 1
+; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 1
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[A]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP6:%.*]] = shl nuw nsw i64 [[TMP5]], 3
-; CHECK-NEXT: [[TMP7:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT: [[VECTOR_GEP:%.*]] = shl <vscale x 2 x i64> [[TMP7]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[VECTOR_GEP]]
-; CHECK-NEXT: [[TMP9:%.*]] = shl i64 [[INDEX]], 3
-; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP9]]
-; CHECK-NEXT: [[TMP10:%.*]] = extractelement <vscale x 2 x ptr> [[TMP8]], i64 0
-; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP10]], align 8
-; CHECK-NEXT: [[TMP11]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
-; CHECK-NEXT: store <vscale x 2 x ptr> [[TMP8]], ptr [[NEXT_GEP]], align 8
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP13]]
-; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP6]]
+; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 2 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP8:%.*]] = shl nuw nsw i64 [[TMP7]], 3
+; CHECK-NEXT: [[TMP9:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[VECTOR_GEP:%.*]] = shl <vscale x 2 x i64> [[TMP9]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 2, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[VECTOR_GEP]]
+; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[INDEX]], 3
+; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[B]], i64 [[TMP11]]
+; CHECK-NEXT: [[TMP12:%.*]] = extractelement <vscale x 2 x ptr> [[TMP10]], i64 0
+; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 2 x i32>, ptr [[TMP12]], align 8
+; CHECK-NEXT: [[TMP13]] = add <vscale x 2 x i32> [[WIDE_LOAD]], [[VEC_PHI]]
+; CHECK-NEXT: store <vscale x 2 x ptr> [[TMP10]], ptr [[NEXT_GEP]], align 8
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]]
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP8]]
 ; CHECK-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK: middle.block:
-; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[TMP11]])
+; CHECK-NEXT: [[TMP15:%.*]] = call i32 @llvm.vector.reduce.add.nxv2i32(<vscale x 2 x i32> [[TMP13]])
 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[SMAX]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]]
 ; CHECK: scalar.ph:
@@ -274,8 +274,8 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 {
 ; CHECK-NEXT: [[VAR1:%.*]] = load i32, ptr [[P]], align 8
 ; CHECK-NEXT: [[VAR2]] = add i32 [[VAR1]], [[VAR0]]
 ; CHECK-NEXT: store ptr [[P]], ptr [[Q]], align 8
-; CHECK-NEXT: [[VAR3]] = getelementptr inbounds i32, ptr [[P]], i64 1
-; CHECK-NEXT: [[VAR4]] = getelementptr inbounds ptr, ptr [[Q]], i64 1
+; CHECK-NEXT: [[VAR3]] = getelementptr inbounds i8, ptr [[P]], i64 4
+; CHECK-NEXT: [[VAR4]] = getelementptr inbounds i8, ptr [[Q]], i64 8
 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1
 ; CHECK-NEXT: [[COND:%.*]] = icmp slt i64 [[I_NEXT]], [[N]]
 ; CHECK-NEXT: br i1 [[COND]], label [[FOR_BODY]], label [[FOR_END]], !llvm.loop [[LOOP8:![0-9]+]]
@@ -310,22 +310,22 @@ define void @phi_used_in_vector_compare_and_scalar_indvar_update_and_store(ptr %
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
-; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 1
+; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1
 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[POINTER_PHI:%.*]] = phi ptr [ [[PTR:%.*]], [[VECTOR_PH]] ], [ [[PTR_IND:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT: [[TMP2:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
-; CHECK-NEXT: [[VECTOR_GEP:%.*]] = shl <vscale x 2 x i64> [[TMP2]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[VECTOR_GEP]]
-; CHECK-NEXT: [[TMP4:%.*]] = icmp ne <vscale x 2 x ptr> [[TMP3]], zeroinitializer
-; CHECK-NEXT: [[TMP5:%.*]] = extractelement <vscale x 2 x ptr> [[TMP3]], i64 0
-; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> zeroinitializer, ptr [[TMP5]], i32 2, <vscale x 2 x i1> [[TMP4]])
-; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
-; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP1]]
+; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
+; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
+; CHECK-NEXT: [[TMP4:%.*]] = call <vscale x 2 x i64> @llvm.experimental.stepvector.nxv2i64()
+; CHECK-NEXT: [[VECTOR_GEP:%.*]] = shl <vscale x 2 x i64> [[TMP4]], shufflevector (<vscale x 2 x i64> insertelement (<vscale x 2 x i64> poison, i64 1, i64 0), <vscale x 2 x i64> poison, <vscale x 2 x i32> zeroinitializer)
+; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <vscale x 2 x i64> [[VECTOR_GEP]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp ne <vscale x 2 x ptr> [[TMP5]], zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = extractelement <vscale x 2 x ptr> [[TMP5]], i64 0
+; CHECK-NEXT: call void @llvm.masked.store.nxv2i16.p0(<vscale x 2 x i16> zeroinitializer, ptr [[TMP7]], i32 2, <vscale x 2 x i1> [[TMP6]])
+; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP1]]
+; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i64 [[TMP3]]
 ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
 ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]]
 ; CHECK: middle.block:
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
index 347ec55f6a992..d5ace655fdcc1 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/vector-reverse-mask4.ll
@@ -34,8 +34,8 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N)
 ; CHECK-NEXT: [[TMP0:%.*]] = xor i64 [[INDEX]], -1
 ; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[TMP0]], [[N]]
 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[COND:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 -3
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 -7
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -24
+; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 -56
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP3]], align 8
 ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x double> [[WIDE_LOAD]], <4 x double> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT: [[WIDE_LOAD1:%.*]] = load <4 x double>, ptr [[TMP4]], align 8
@@ -43,8 +43,8 @@ define void @vector_reverse_mask_v4i1(ptr noalias %a, ptr noalias %cond, i64 %N)
 ; CHECK-NEXT: [[TMP5:%.*]] = fcmp une <4 x double> [[REVERSE]], zeroinitializer
 ; CHECK-NEXT: [[TMP6:%.*]] = fcmp une <4 x double> [[REVERSE2]], zeroinitializer
 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[TMP1]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[TMP7]], i64 -3
-; CHECK-NEXT: [[TMP9:%.*]] = getelementptr double, ptr [[TMP7]], i64 -7
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -24
+; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP7]], i64 -56
 ; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i1> [[TMP5]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i1> [[TMP6]], <4 x i1> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
 ; CHECK-NEXT: [[WIDE_MASKED_LOAD:%.*]] = call <4 x double> @llvm.masked.load.v4f64.p0(ptr [[TMP8]], i32 8, <4 x i1> [[REVERSE3]], <4 x double> poison)
diff --git a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
index 4988ba0d7d37a..8f768d959d8b1 100644
--- a/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
+++ b/llvm/test/Transforms/LoopVectorize/AMDGPU/packed-math.ll
@@ -11,27 +11,27 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) {
 ; GFX9-NEXT: br label [[VECTOR_BODY:%.*]]
 ; GFX9: vector.body:
 ; GFX9-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; GFX9-NEXT: [[VEC_PHI:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; GFX9-NEXT: [[VEC_PHI1:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ]
+; GFX9-NEXT: [[VEC_PHI:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ]
+; GFX9-NEXT: [[VEC_PHI1:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ]
 ; GFX9-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[S:%.*]], i64 [[INDEX]]
-; GFX9-NEXT: [[TMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP0]], i64 2
+; GFX9-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 4
 ; GFX9-NEXT: [[WIDE_LOAD:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 2
-; GFX9-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP2]], align 2
-; GFX9-NEXT: [[TMP4]] = fadd fast <2 x half> [[VEC_PHI]], [[WIDE_LOAD]]
-; GFX9-NEXT: [[TMP5]] = fadd fast <2 x half> [[VEC_PHI1]], [[WIDE_LOAD2]]
+; GFX9-NEXT: [[WIDE_LOAD2:%.*]] =
load <2 x half>, ptr addrspace(1) [[TMP1]], align 2 +; GFX9-NEXT: [[TMP2]] = fadd fast <2 x half> [[VEC_PHI]], [[WIDE_LOAD]] +; GFX9-NEXT: [[TMP3]] = fadd fast <2 x half> [[VEC_PHI1]], [[WIDE_LOAD2]] ; GFX9-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; GFX9-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; GFX9-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; GFX9-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; GFX9-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; GFX9: middle.block: -; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP5]], [[TMP4]] -; GFX9-NEXT: [[TMP7:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX]]) +; GFX9-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP3]], [[TMP2]] +; GFX9-NEXT: [[TMP5:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX]]) ; GFX9-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; GFX9: scalar.ph: ; GFX9-NEXT: br label [[FOR_BODY:%.*]] ; GFX9: for.body: ; GFX9-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; GFX9: for.end: -; GFX9-NEXT: [[ADD_LCSSA:%.*]] = phi half [ poison, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; GFX9-NEXT: [[ADD_LCSSA:%.*]] = phi half [ poison, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; GFX9-NEXT: ret half [[ADD_LCSSA]] ; ; VI-LABEL: @vectorize_v2f16_loop( @@ -41,27 +41,27 @@ define half @vectorize_v2f16_loop(ptr addrspace(1) noalias %s) { ; VI-NEXT: br label [[VECTOR_BODY:%.*]] ; VI: vector.body: ; VI-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; VI-NEXT: [[VEC_PHI:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ] -; VI-NEXT: [[VEC_PHI1:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP5:%.*]], [[VECTOR_BODY]] ] +; VI-NEXT: [[VEC_PHI:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] +; VI-NEXT: [[VEC_PHI1:%.*]] = phi <2 x half> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; VI-NEXT: [[TMP0:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[S:%.*]], i64 [[INDEX]] -; VI-NEXT: [[TMP2:%.*]] = getelementptr inbounds half, ptr addrspace(1) [[TMP0]], i64 2 +; VI-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP0]], i64 4 ; VI-NEXT: [[WIDE_LOAD:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP0]], align 2 -; VI-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP2]], align 2 -; VI-NEXT: [[TMP4]] = fadd fast <2 x half> [[VEC_PHI]], [[WIDE_LOAD]] -; VI-NEXT: [[TMP5]] = fadd fast <2 x half> [[VEC_PHI1]], [[WIDE_LOAD2]] +; VI-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x half>, ptr addrspace(1) [[TMP1]], align 2 +; VI-NEXT: [[TMP2]] = fadd fast <2 x half> [[VEC_PHI]], [[WIDE_LOAD]] +; VI-NEXT: [[TMP3]] = fadd fast <2 x half> [[VEC_PHI1]], [[WIDE_LOAD2]] ; VI-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 -; VI-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; VI-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; VI-NEXT: [[TMP4:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 +; VI-NEXT: br i1 [[TMP4]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; VI: middle.block: -; VI-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP5]], [[TMP4]] -; VI-NEXT: [[TMP7:%.*]] = call fast half 
@llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX]]) +; VI-NEXT: [[BIN_RDX:%.*]] = fadd fast <2 x half> [[TMP3]], [[TMP2]] +; VI-NEXT: [[TMP5:%.*]] = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH8000, <2 x half> [[BIN_RDX]]) ; VI-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; VI: scalar.ph: ; VI-NEXT: br label [[FOR_BODY:%.*]] ; VI: for.body: ; VI-NEXT: br i1 poison, label [[FOR_END]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]] ; VI: for.end: -; VI-NEXT: [[ADD_LCSSA:%.*]] = phi half [ poison, [[FOR_BODY]] ], [ [[TMP7]], [[MIDDLE_BLOCK]] ] +; VI-NEXT: [[ADD_LCSSA:%.*]] = phi half [ poison, [[FOR_BODY]] ], [ [[TMP5]], [[MIDDLE_BLOCK]] ] ; VI-NEXT: ret half [[ADD_LCSSA]] ; ; CI-LABEL: @vectorize_v2f16_loop( diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll index c1492ec5dbb74..45b84a0b5e856 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-qabs.ll @@ -60,7 +60,7 @@ define void @arm_abs_q7(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 % ; CHECK-NEXT: store i8 [[COND11]], ptr [[PDST_ADDR_020]], align 1 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_021]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: while.end: ; CHECK-NEXT: ret void ; @@ -131,7 +131,7 @@ define void @arm_abs_q15(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 ; CHECK-NEXT: store <8 x i16> [[TMP9]], ptr [[NEXT_GEP7]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] @@ -144,18 +144,18 @@ define void @arm_abs_q15(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 ; CHECK-NEXT: [[PSRC_ADDR_023:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[BLKCNT_022:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_021:%.*]] = phi ptr [ [[INCDEC_PTR13:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL6]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PSRC_ADDR_023]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_023]], i32 2 ; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[PSRC_ADDR_023]], align 2 ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i16 [[TMP11]], 0 ; CHECK-NEXT: [[CMP5:%.*]] = icmp eq i16 [[TMP11]], -32768 ; CHECK-NEXT: [[SUB:%.*]] = sub i16 0, [[TMP11]] ; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP5]], i16 32767, i16 [[SUB]] ; CHECK-NEXT: [[COND11:%.*]] = select i1 [[CMP1]], i16 [[TMP11]], i16 [[COND]] -; CHECK-NEXT: [[INCDEC_PTR13]] = getelementptr inbounds i16, ptr [[PDST_ADDR_021]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR13]] = getelementptr inbounds i8, ptr [[PDST_ADDR_021]], i32 2 ; CHECK-NEXT: store i16 [[COND11]], ptr [[PDST_ADDR_021]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 
[[BLKCNT_022]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP5:![0-9]+]] ; CHECK: while.end: ; CHECK-NEXT: ret void ; @@ -226,7 +226,7 @@ define void @arm_abs_q31(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 ; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[NEXT_GEP7]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_VEC]], [[BLOCKSIZE]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[WHILE_END]], label [[SCALAR_PH]] @@ -239,18 +239,18 @@ define void @arm_abs_q31(ptr nocapture readonly %pSrc, ptr nocapture %pDst, i32 ; CHECK-NEXT: [[PSRC_ADDR_017:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[BLKCNT_016:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_015:%.*]] = phi ptr [ [[INCDEC_PTR7:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL6]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[PSRC_ADDR_017]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_017]], i32 4 ; CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[PSRC_ADDR_017]], align 4 ; CHECK-NEXT: [[CMP1:%.*]] = icmp sgt i32 [[TMP11]], 0 ; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP11]], -2147483648 ; CHECK-NEXT: [[SUB:%.*]] = sub nsw i32 0, [[TMP11]] ; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP2]], i32 2147483647, i32 [[SUB]] ; CHECK-NEXT: [[COND6:%.*]] = select i1 [[CMP1]], i32 [[TMP11]], i32 [[COND]] -; CHECK-NEXT: [[INCDEC_PTR7]] = getelementptr inbounds i32, ptr [[PDST_ADDR_015]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR7]] = getelementptr inbounds i8, ptr [[PDST_ADDR_015]], i32 4 ; CHECK-NEXT: store i32 [[COND6]], ptr [[PDST_ADDR_015]], align 4 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_016]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP7:![0-9]+]] ; CHECK: while.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll index 1dd0347e4d286..18caa9cc16f35 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-reductions.ll @@ -1338,7 +1338,7 @@ define i32 @reduction_interleave_group(i32 %n, ptr %arr) #0 { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i32 [[INDEX]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = or disjoint i32 [[OFFSET_IDX]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[ARR:%.*]], i32 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 -1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i32 -4 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP5]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> 
poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC1:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> diff --git a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll index 5d85a4cd73fdd..938e1deab0b6c 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/mve-selectandorcost.ll @@ -65,9 +65,9 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 % ; CHECK-NEXT: [[PB_ADDR_019:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[IF_END]] ], [ [[BC_RESUME_VAL2]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[BLOCKSIZE_ADDR_018:%.*]] = phi i32 [ [[DEC:%.*]], [[IF_END]] ], [ [[BC_RESUME_VAL4]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[ACCUM_017:%.*]] = phi float [ [[ACCUM_1:%.*]], [[IF_END]] ], [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[PA_ADDR_020]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PA_ADDR_020]], i32 4 ; CHECK-NEXT: [[TMP15:%.*]] = load float, ptr [[PA_ADDR_020]], align 4 -; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds float, ptr [[PB_ADDR_019]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i8, ptr [[PB_ADDR_019]], i32 4 ; CHECK-NEXT: [[TMP16:%.*]] = load float, ptr [[PB_ADDR_019]], align 4 ; CHECK-NEXT: [[CMP2:%.*]] = fcmp fast une float [[TMP15]], 0.000000e+00 ; CHECK-NEXT: [[CMP3:%.*]] = fcmp fast une float [[TMP16]], 0.000000e+00 @@ -86,7 +86,7 @@ define float @test(ptr nocapture readonly %pA, ptr nocapture readonly %pB, i32 % ; CHECK-NEXT: [[ACCUM_1]] = phi float [ [[ADD4]], [[IF_THEN]] ], [ [[ACCUM_017]], [[WHILE_BODY]] ] ; CHECK-NEXT: [[DEC]] = add i32 [[BLOCKSIZE_ADDR_018]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: while.end: ; CHECK-NEXT: [[ACCUM_0_LCSSA:%.*]] = phi float [ 0.000000e+00, [[ENTRY:%.*]] ], [ [[ACCUM_1]], [[IF_END]] ], [ [[TMP14]], [[MIDDLE_BLOCK]] ] ; CHECK-NEXT: ret float [[ACCUM_0_LCSSA]] diff --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll index c3f307c234ed9..2269b774d9f31 100644 --- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll +++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll @@ -69,10 +69,10 @@ define hidden void @pointer_phi_v4i32_add2(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR_09]], align 4 -; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, ptr [[A_ADDR_09]], i32 2 +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 8 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[Y]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_07]], align 4 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[B_ADDR_07]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop 
[[LOOP4:![0-9]+]] @@ -123,10 +123,10 @@ define hidden void @pointer_phi_v4i32_add3(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[A_ADDR_09]], align 4 -; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, ptr [[A_ADDR_09]], i32 3 +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 12 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP4]], [[Y]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_07]], align 4 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[B_ADDR_07]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] @@ -219,10 +219,10 @@ define hidden void @pointer_phi_v8i16_add2(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_09:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[L1:%.*]] = load i16, ptr [[A_ADDR_011]], align 2 -; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, ptr [[A_ADDR_011]], i32 2 +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_011]], i32 4 ; CHECK-NEXT: [[CONV1:%.*]] = add i16 [[L1]], [[TMP0]] ; CHECK-NEXT: store i16 [[CONV1]], ptr [[B_ADDR_09]], align 2 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[B_ADDR_09]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_09]], i32 2 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] @@ -258,10 +258,10 @@ define hidden void @pointer_phi_v8i16_add3(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: [[I_010:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_09:%.*]] = phi ptr [ [[B:%.*]], [[ENTRY]] ], [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[L1:%.*]] = load i16, ptr [[A_ADDR_011]], align 2 -; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i16, ptr [[A_ADDR_011]], i32 3 +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_011]], i32 6 ; CHECK-NEXT: [[CONV1:%.*]] = add i16 [[L1]], [[TMP0]] ; CHECK-NEXT: store i16 [[CONV1]], ptr [[B_ADDR_09]], align 2 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[B_ADDR_09]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_09]], i32 2 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_010]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]] @@ -502,10 +502,10 @@ define hidden void @pointer_phi_v4f32_add2(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[A_ADDR_09]], align 4 -; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, ptr [[A_ADDR_09]], i32 2 
+; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 8 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[Y]] ; CHECK-NEXT: store float [[ADD]], ptr [[B_ADDR_07]], align 4 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[B_ADDR_07]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -556,10 +556,10 @@ define hidden void @pointer_phi_v4f32_add3(ptr noalias nocapture readonly %A, pt ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 996, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[A_ADDR_09]], align 4 -; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds float, ptr [[A_ADDR_09]], i32 3 +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 12 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast float [[TMP4]], [[Y]] ; CHECK-NEXT: store float [[ADD]], ptr [[B_ADDR_07]], align 4 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds float, ptr [[B_ADDR_07]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] @@ -649,10 +649,10 @@ define hidden void @pointer_phi_v4half_add2(ptr noalias nocapture readonly %A, p ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[A_ADDR_09]], align 4 -; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds half, ptr [[A_ADDR_09]], i32 2 +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 4 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP4]], [[Y]] ; CHECK-NEXT: store half [[ADD]], ptr [[B_ADDR_07]], align 4 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds half, ptr [[B_ADDR_07]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 2 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] @@ -703,10 +703,10 @@ define hidden void @pointer_phi_v4half_add3(ptr noalias nocapture readonly %A, p ; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 992, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP4:%.*]] = load half, ptr [[A_ADDR_09]], align 4 -; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds half, ptr [[A_ADDR_09]], i32 3 +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_09]], i32 6 ; CHECK-NEXT: [[ADD:%.*]] = fadd fast half [[TMP4]], [[Y]] ; CHECK-NEXT: store half [[ADD]], ptr [[B_ADDR_07]], align 4 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds half, ptr [[B_ADDR_07]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_07]], i32 2 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 
[[I_08]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 1000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[END:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] @@ -753,7 +753,7 @@ define hidden void @pointer_phi_v4i32_uf2(ptr noalias nocapture readonly %A, ptr ; CHECK-NEXT: [[WIDE_MASKED_GATHER5:%.*]] = call <4 x i32> @llvm.masked.gather.v4i32.v4p0(<4 x ptr> [[TMP1]], i32 4, <4 x i1> , <4 x i32> poison) ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP4:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER5]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 ; CHECK-NEXT: store <4 x i32> [[TMP3]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP4]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -767,10 +767,10 @@ define hidden void @pointer_phi_v4i32_uf2(ptr noalias nocapture readonly %A, ptr ; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9992, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_06:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[A_ADDR_08]], align 4 -; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, ptr [[A_ADDR_08]], i32 6 +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_08]], i32 24 ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[TMP7]], [[Y]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_06]], align 4 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[B_ADDR_06]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_06]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 10000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] @@ -824,9 +824,9 @@ define hidden void @pointer_phi_v4i32_uf4(ptr noalias nocapture readonly %A, ptr ; CHECK-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER7]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER8]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_MASKED_GATHER9]], [[BROADCAST_SPLAT]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 8 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 12 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 16 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i32 48 ; CHECK-NEXT: store <4 x i32> [[TMP5]], ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP9]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP10]], align 4 @@ -842,10 +842,10 @@ define hidden void @pointer_phi_v4i32_uf4(ptr noalias nocapture readonly %A, ptr ; CHECK-NEXT: [[I_07:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 9984, [[VECTOR_BODY]] ] ; CHECK-NEXT: [[B_ADDR_06:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[FOR_BODY]] ], [ [[IND_END2]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[A_ADDR_08]], align 4 -; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i32, ptr [[A_ADDR_08]], i32 6 +; CHECK-NEXT: [[ADD_PTR]] = getelementptr inbounds i8, ptr [[A_ADDR_08]], i32 24 ; CHECK-NEXT: 
[[ADD:%.*]] = add nsw i32 [[TMP13]], [[Y]] ; CHECK-NEXT: store i32 [[ADD]], ptr [[B_ADDR_06]], align 4 -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i32, ptr [[B_ADDR_06]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[B_ADDR_06]], i32 4 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_07]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[INC]], 10000 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP27:![0-9]+]] @@ -886,18 +886,18 @@ define hidden void @mult_ptr_iv(ptr noalias nocapture readonly %x, ptr noalias n ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[POINTER_PHI]], <4 x i32> ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[POINTER_PHI5]], <4 x i32> ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP0]], i32 1 -; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP0]], i32 1, <4 x i1> , <4 x i8> poison), !alias.scope !28 +; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP0]], i32 1, <4 x i1> , <4 x i8> poison), !alias.scope [[META28:![0-9]+]] ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP0]], i32 2 -; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP2]], i32 1, <4 x i1> , <4 x i8> poison), !alias.scope !28 -; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP3]], i32 1, <4 x i1> , <4 x i8> poison), !alias.scope !28 +; CHECK-NEXT: [[WIDE_MASKED_GATHER7:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP2]], i32 1, <4 x i1> , <4 x i8> poison), !alias.scope [[META28]] +; CHECK-NEXT: [[WIDE_MASKED_GATHER8:%.*]] = call <4 x i8> @llvm.masked.gather.v4i8.v4p0(<4 x ptr> [[TMP3]], i32 1, <4 x i1> , <4 x i8> poison), !alias.scope [[META28]] ; CHECK-NEXT: [[TMP4:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], ; CHECK-NEXT: [[TMP5:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER7]] ; CHECK-NEXT: [[TMP6:%.*]] = mul <4 x i8> [[WIDE_MASKED_GATHER]], [[WIDE_MASKED_GATHER8]] ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP1]], i32 1 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP4]], <4 x ptr> [[TMP1]], i32 1, <4 x i1> ), !alias.scope !31, !noalias !28 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP4]], <4 x ptr> [[TMP1]], i32 1, <4 x i1> ), !alias.scope [[META31:![0-9]+]], !noalias [[META28]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, <4 x ptr> [[TMP1]], i32 2 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP5]], <4 x ptr> [[TMP7]], i32 1, <4 x i1> ), !alias.scope !31, !noalias !28 -; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP6]], <4 x ptr> [[TMP8]], i32 1, <4 x i1> ), !alias.scope !31, !noalias !28 +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP5]], <4 x ptr> [[TMP7]], i32 1, <4 x i1> ), !alias.scope [[META31]], !noalias [[META28]] +; CHECK-NEXT: call void @llvm.masked.scatter.v4i8.v4p0(<4 x i8> [[TMP6]], <4 x ptr> [[TMP8]], i32 1, <4 x i1> ), !alias.scope [[META31]], !noalias [[META28]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 ; CHECK-NEXT: [[PTR_IND]] = getelementptr i8, ptr [[POINTER_PHI]], i32 12 ; CHECK-NEXT: [[PTR_IND6]] = getelementptr i8, ptr [[POINTER_PHI5]], i32 12 diff --git a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll 
b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll index 88186584c2053..c55e732c90147 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/float-induction-x86.ll @@ -36,9 +36,9 @@ define void @fp_iv_loop1(ptr noalias nocapture %A, i32 %N) #0 { ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <8 x float> [[VEC_IND]], ; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 -; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 16 -; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 24 +; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 +; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 64 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 96 ; AUTO_VEC-NEXT: store <8 x float> [[VEC_IND]], ptr [[TMP1]], align 4 ; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD]], ptr [[TMP2]], align 4 ; AUTO_VEC-NEXT: store <8 x float> [[STEP_ADD2]], ptr [[TMP3]], align 4 @@ -211,9 +211,9 @@ define double @external_use_with_fast_math(ptr %a, i64 %n) { ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd fast <4 x double> [[VEC_IND]], ; AUTO_VEC-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[INDEX]] -; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i64 4 -; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr double, ptr [[TMP1]], i64 8 -; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[TMP1]], i64 12 +; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[TMP1]], i64 32 +; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[TMP1]], i64 64 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[TMP1]], i64 96 ; AUTO_VEC-NEXT: store <4 x double> [[VEC_IND]], ptr [[TMP1]], align 8 ; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD]], ptr [[TMP2]], align 8 ; AUTO_VEC-NEXT: store <4 x double> [[STEP_ADD2]], ptr [[TMP3]], align 8 @@ -377,9 +377,9 @@ define void @fadd_reassoc_FMF(ptr nocapture %p, i32 %N) { ; AUTO_VEC-NEXT: [[STEP_ADD2:%.*]] = fadd reassoc <8 x float> [[STEP_ADD]], ; AUTO_VEC-NEXT: [[STEP_ADD3:%.*]] = fadd reassoc <8 x float> [[STEP_ADD2]], ; AUTO_VEC-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[P:%.*]], i64 [[INDEX]] -; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 8 -; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 16 -; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 24 +; AUTO_VEC-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 32 +; AUTO_VEC-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 64 +; AUTO_VEC-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 96 ; AUTO_VEC-NEXT: [[WIDE_LOAD:%.*]] = load <8 x float>, ptr [[TMP2]], align 4 ; AUTO_VEC-NEXT: [[WIDE_LOAD5:%.*]] = load <8 x float>, ptr [[TMP3]], align 4 ; AUTO_VEC-NEXT: [[WIDE_LOAD6:%.*]] = load <8 x float>, ptr [[TMP4]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll b/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll index 55757157fce98..e64b02f00dfc1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/interleaving.ll +++ 
@@ -24,12 +24,12 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
; SSE-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
; SSE-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; SSE-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; SSE-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
-; SSE-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
-; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP9]], i64 4
-; SSE-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4
-; SSE-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP11]], align 4
+; SSE-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], [[STRIDED_VEC]]
+; SSE-NEXT: [[TMP6:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC2]]
+; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP7]], i64 16
+; SSE-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP7]], align 4
+; SSE-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4
; SSE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
; SSE-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; SSE-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -72,18 +72,18 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
; AVX1-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <8 x i32> [[WIDE_VEC1]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <8 x i32> [[WIDE_VEC2]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
; AVX1-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <8 x i32> [[WIDE_VEC3]], <8 x i32> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
-; AVX1-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
-; AVX1-NEXT: [[TMP16:%.*]] = add nsw <4 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
-; AVX1-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
-; AVX1-NEXT: [[TMP18:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
-; AVX1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; AVX1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 4
-; AVX1-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 8
-; AVX1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 12
-; AVX1-NEXT: store <4 x i32> [[TMP15]], ptr [[TMP19]], align 4
-; AVX1-NEXT: store <4 x i32> [[TMP16]], ptr [[TMP21]], align 4
-; AVX1-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP23]], align 4
-; AVX1-NEXT: store <4 x i32> [[TMP18]], ptr [[TMP25]], align 4
+; AVX1-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
+; AVX1-NEXT: [[TMP12:%.*]] = add nsw <4 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
+; AVX1-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
+; AVX1-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
+; AVX1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; AVX1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 16
+; AVX1-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 32
+; AVX1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 48
+; AVX1-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP15]], align 4
+; AVX1-NEXT: store <4 x i32> [[TMP12]], ptr [[TMP16]], align 4
+; AVX1-NEXT: store <4 x i32> [[TMP13]], ptr [[TMP17]], align 4
+; AVX1-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP18]], align 4
; AVX1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
; AVX1-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; AVX1-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
@@ -126,18 +126,18 @@ define void @foo(ptr noalias nocapture %a, ptr noalias nocapture readonly %b) {
; AVX2-NEXT: [[STRIDED_VEC8:%.*]] = shufflevector <16 x i32> [[WIDE_VEC1]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; AVX2-NEXT: [[STRIDED_VEC9:%.*]] = shufflevector <16 x i32> [[WIDE_VEC2]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
; AVX2-NEXT: [[STRIDED_VEC10:%.*]] = shufflevector <16 x i32> [[WIDE_VEC3]], <16 x i32> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
-; AVX2-NEXT: [[TMP15:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
-; AVX2-NEXT: [[TMP16:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
-; AVX2-NEXT: [[TMP17:%.*]] = add nsw <8 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
-; AVX2-NEXT: [[TMP18:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
-; AVX2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
-; AVX2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 8
-; AVX2-NEXT: [[TMP23:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 16
-; AVX2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 24
-; AVX2-NEXT: store <8 x i32> [[TMP15]], ptr [[TMP19]], align 4
-; AVX2-NEXT: store <8 x i32> [[TMP16]], ptr [[TMP21]], align 4
-; AVX2-NEXT: store <8 x i32> [[TMP17]], ptr [[TMP23]], align 4
-; AVX2-NEXT: store <8 x i32> [[TMP18]], ptr [[TMP25]], align 4
+; AVX2-NEXT: [[TMP11:%.*]] = add nsw <8 x i32> [[STRIDED_VEC7]], [[STRIDED_VEC]]
+; AVX2-NEXT: [[TMP12:%.*]] = add nsw <8 x i32> [[STRIDED_VEC8]], [[STRIDED_VEC4]]
+; AVX2-NEXT: [[TMP13:%.*]] = add nsw <8 x i32> [[STRIDED_VEC9]], [[STRIDED_VEC5]]
+; AVX2-NEXT: [[TMP14:%.*]] = add nsw <8 x i32> [[STRIDED_VEC10]], [[STRIDED_VEC6]]
+; AVX2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]]
+; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 32
+; AVX2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 64
+; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP15]], i64 96
+; AVX2-NEXT: store <8 x i32> [[TMP11]], ptr [[TMP15]], align 4
+; AVX2-NEXT: store <8 x i32> [[TMP12]], ptr [[TMP16]], align 4
+; AVX2-NEXT: store <8 x i32> [[TMP13]], ptr [[TMP17]], align 4
+; AVX2-NEXT: store <8 x i32> [[TMP14]], ptr [[TMP18]], align 4
; AVX2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32
; AVX2-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024
; AVX2-NEXT: br i1 [[TMP19]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
index 8527fb433c638..23b653bbda380 100644
--- a/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
+++ b/llvm/test/Transforms/LoopVectorize/X86/intrinsiccost.ll
@@ -7,11 +7,6 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"

; CHECK-COST-LABEL: uaddsat
-; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i16 @llvm.uadd.sat.i16(i16 %0, i16 %offset)
-; CHECK-COST: Found an
estimated cost of 1 for VF 2 For instruction: %1 = tail call i16 @llvm.uadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i16 @llvm.uadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i16 @llvm.uadd.sat.i16(i16 %0, i16 %offset) -; CHECK-COST: Found an estimated cost of 4 for VF 16 For instruction: %1 = tail call i16 @llvm.uadd.sat.i16(i16 %0, i16 %offset) define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @uaddsat( @@ -36,9 +31,9 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[PSRC:%.*]], i64 [[TMP1]] ; CHECK-NEXT: [[TMP2:%.*]] = shl i64 [[INDEX]], 1 ; CHECK-NEXT: [[NEXT_GEP5:%.*]] = getelementptr i8, ptr [[PDST:%.*]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 16 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 32 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i16, ptr [[NEXT_GEP]], i64 48 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 32 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 64 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 96 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i16>, ptr [[NEXT_GEP]], align 2 ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <16 x i16>, ptr [[TMP3]], align 2 ; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <16 x i16>, ptr [[TMP4]], align 2 @@ -47,9 +42,9 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: [[TMP7:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD9]], <16 x i16> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD10]], <16 x i16> [[BROADCAST_SPLAT]]) ; CHECK-NEXT: [[TMP9:%.*]] = call <16 x i16> @llvm.uadd.sat.v16i16(<16 x i16> [[WIDE_LOAD11]], <16 x i16> [[BROADCAST_SPLAT]]) -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 16 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 32 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i16, ptr [[NEXT_GEP5]], i64 48 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 64 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[NEXT_GEP5]], i64 96 ; CHECK-NEXT: store <16 x i16> [[TMP6]], ptr [[NEXT_GEP5]], align 2 ; CHECK-NEXT: store <16 x i16> [[TMP7]], ptr [[TMP10]], align 2 ; CHECK-NEXT: store <16 x i16> [[TMP8]], ptr [[TMP11]], align 2 @@ -106,10 +101,10 @@ define void @uaddsat(ptr nocapture readonly %pSrc, i16 signext %offset, ptr noca ; CHECK-NEXT: [[BLKCNT_09:%.*]] = phi i32 [ [[DEC:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL19]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[PSRC_ADDR_08:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL22]], [[VEC_EPILOG_SCALAR_PH]] ] ; CHECK-NEXT: [[PDST_ADDR_07:%.*]] = phi ptr [ [[INCDEC_PTR3:%.*]], [[WHILE_BODY]] ], [ [[BC_RESUME_VAL25]], [[VEC_EPILOG_SCALAR_PH]] ] -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PSRC_ADDR_08]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRC_ADDR_08]], i64 2 ; CHECK-NEXT: [[TMP22:%.*]] = load i16, ptr [[PSRC_ADDR_08]], align 2 ; CHECK-NEXT: [[TMP23:%.*]] = tail call i16 
@llvm.uadd.sat.i16(i16 [[TMP22]], i16 [[OFFSET]]) -; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i16, ptr [[PDST_ADDR_07]], i64 1 +; CHECK-NEXT: [[INCDEC_PTR3]] = getelementptr inbounds i8, ptr [[PDST_ADDR_07]], i64 2 ; CHECK-NEXT: store i16 [[TMP23]], ptr [[PDST_ADDR_07]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_09]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 @@ -139,12 +134,6 @@ while.end: ; preds = %while.body, %entry } ; CHECK-COST-LABEL: fshl -; CHECK-COST: Found an estimated cost of 2 for VF 1 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 1 for VF 2 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 1 for VF 4 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 1 for VF 8 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 1 for VF 16 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) -; CHECK-COST: Found an estimated cost of 4 for VF 32 For instruction: %1 = tail call i8 @llvm.fshl.i8(i8 %0, i8 %0, i8 %offset) define void @fshl(ptr nocapture readonly %pSrc, i8 signext %offset, ptr nocapture noalias %pDst, i32 %blockSize) #0 { ; CHECK-LABEL: @fshl( diff --git a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll index 330acb008d0dd..6575c5a288f21 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/invariant-store-vectorization.ll @@ -35,9 +35,9 @@ define i32 @inv_val_store_to_inv_address_with_reduction(ptr %a, i64 %n, ptr %b) ; CHECK-NEXT: [[VEC_PHI5:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP7:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI6:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 16 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 32 -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 48 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 64 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 128 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 192 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i32>, ptr [[TMP1]], align 8, !alias.scope [[META0:![0-9]+]] ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <16 x i32>, ptr [[TMP2]], align 8, !alias.scope [[META0]] ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <16 x i32>, ptr [[TMP3]], align 8, !alias.scope [[META0]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll index a1dd890212b76..cc36c2ba6f758 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/metadata-enable.ll @@ -24,80 +24,80 @@ define i32 @enabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ; O1-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 ; O1-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O1-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 
-; O1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; O1-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16 ; O1-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; O1-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; O1-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 ; O1-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 -; O1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; O1-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32 ; O1-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 ; O1-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; O1-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32 ; O1-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 -; O1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; O1-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48 ; O1-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 ; O1-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; O1-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48 ; O1-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 -; O1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; O1-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64 ; O1-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 ; O1-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; O1-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64 ; O1-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 -; O1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; O1-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80 ; O1-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 ; O1-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; O1-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80 ; O1-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 -; O1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; O1-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96 ; O1-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 ; O1-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; O1-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96 ; O1-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 -; O1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; O1-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112 ; O1-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 ; O1-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; O1-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112 ; O1-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 -; O1-NEXT: 
[[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; O1-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128 ; O1-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 ; O1-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; O1-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128 ; O1-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 -; O1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; O1-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144 ; O1-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 ; O1-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; O1-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144 ; O1-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 -; O1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; O1-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160 ; O1-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 ; O1-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; O1-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160 ; O1-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 -; O1-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; O1-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176 ; O1-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 ; O1-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; O1-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176 ; O1-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 -; O1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; O1-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192 ; O1-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 ; O1-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; O1-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192 ; O1-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 -; O1-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 +; O1-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208 ; O1-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 ; O1-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; O1-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208 ; O1-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 -; O1-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; O1-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224 ; O1-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 ; O1-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; O1-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224 ; O1-NEXT: store <4 x i32> [[TMP41]], 
ptr [[TMP42]], align 4 -; O1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; O1-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240 ; O1-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 ; O1-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O1-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; O1-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240 ; O1-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 ; O1-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 ; O1-NEXT: ret i32 [[TMP46]] @@ -109,80 +109,80 @@ define i32 @enabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 ; O2-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O2-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 -; O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16 ; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; O2-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 ; O2-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 -; O2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; O2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32 ; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 ; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32 ; O2-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 -; O2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; O2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48 ; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 ; O2-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; O2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48 ; O2-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 -; O2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; O2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64 ; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 ; O2-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; O2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64 ; O2-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 -; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80 ; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 ; O2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; O2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80 ; O2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 -; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; O2-NEXT: [[TMP16:%.*]] = 
getelementptr inbounds i8, ptr [[B]], i64 96 ; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 ; O2-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96 ; O2-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 -; O2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; O2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112 ; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 ; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112 ; O2-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 -; O2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; O2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128 ; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 ; O2-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; O2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128 ; O2-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 -; O2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; O2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144 ; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 ; O2-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; O2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144 ; O2-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 -; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160 ; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 ; O2-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; O2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160 ; O2-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 -; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176 ; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 ; O2-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176 ; O2-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 -; O2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; O2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192 ; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 ; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192 ; O2-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 -; O2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 
+; O2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208 ; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 ; O2-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; O2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208 ; O2-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 -; O2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; O2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224 ; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 ; O2-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; O2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224 ; O2-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 -; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240 ; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 ; O2-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; O2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240 ; O2-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 ; O2-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 ; O2-NEXT: ret i32 [[TMP46]] @@ -194,80 +194,80 @@ define i32 @enabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 ; O3-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O3-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 -; O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16 ; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; O3-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 ; O3-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 -; O3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; O3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32 ; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 ; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32 ; O3-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 -; O3-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; O3-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48 ; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 ; O3-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; O3-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48 ; O3-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 -; O3-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; O3-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64 ; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x 
i32>, ptr [[TMP10]], align 4 ; O3-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; O3-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64 ; O3-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 -; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80 ; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 ; O3-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; O3-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80 ; O3-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 -; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96 ; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 ; O3-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96 ; O3-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 -; O3-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; O3-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112 ; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 ; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112 ; O3-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 -; O3-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; O3-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128 ; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 ; O3-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; O3-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128 ; O3-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 -; O3-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; O3-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144 ; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 ; O3-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; O3-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144 ; O3-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 -; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160 ; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 ; O3-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; O3-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160 ; O3-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 -; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176 ; O3-NEXT: 
[[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 ; O3-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176 ; O3-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 -; O3-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; O3-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192 ; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 ; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192 ; O3-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 -; O3-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 +; O3-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208 ; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 ; O3-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; O3-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208 ; O3-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 -; O3-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; O3-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224 ; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 ; O3-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; O3-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224 ; O3-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 -; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240 ; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 ; O3-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O3-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; O3-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240 ; O3-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 ; O3-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 ; O3-NEXT: ret i32 [[TMP46]] @@ -279,80 +279,80 @@ define i32 @enabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 ; O3DEFAULT-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; O3DEFAULT-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 -; O3DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; O3DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16 ; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; O3DEFAULT-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 -; O3DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; O3DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32 ; O3DEFAULT-NEXT: 
[[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 ; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 -; O3DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; O3DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48 ; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 ; O3DEFAULT-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; O3DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 -; O3DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; O3DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64 ; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 ; O3DEFAULT-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; O3DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 -; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80 ; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 ; O3DEFAULT-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; O3DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 -; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96 ; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 ; O3DEFAULT-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 -; O3DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; O3DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112 ; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 ; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 -; O3DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; O3DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128 ; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 ; O3DEFAULT-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr 
[[A]], i64 32 +; O3DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 -; O3DEFAULT-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; O3DEFAULT-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144 ; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 ; O3DEFAULT-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; O3DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 -; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160 ; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 ; O3DEFAULT-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; O3DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 -; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176 ; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 ; O3DEFAULT-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 -; O3DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; O3DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192 ; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 ; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 -; O3DEFAULT-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52 +; O3DEFAULT-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208 ; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 ; O3DEFAULT-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; O3DEFAULT-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 -; O3DEFAULT-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; O3DEFAULT-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224 ; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 ; O3DEFAULT-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; O3DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 -; O3DEFAULT-NEXT: 
[[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240 ; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 ; O3DEFAULT-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; O3DEFAULT-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; O3DEFAULT-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 ; O3DEFAULT-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 ; O3DEFAULT-NEXT: ret i32 [[TMP46]] @@ -364,80 +364,80 @@ define i32 @enabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 ; Os-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; Os-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 -; Os-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; Os-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16 ; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; Os-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 ; Os-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 -; Os-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; Os-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32 ; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 ; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32 ; Os-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 -; Os-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; Os-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48 ; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 ; Os-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; Os-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48 ; Os-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 -; Os-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; Os-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64 ; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4 ; Os-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16 +; Os-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64 ; Os-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4 -; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20 +; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80 ; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4 ; Os-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20 +; Os-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80 ; Os-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4 -; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24 +; 
Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96 ; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4 ; Os-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24 +; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96 ; Os-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4 -; Os-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28 +; Os-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112 ; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4 ; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28 +; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112 ; Os-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4 -; Os-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32 +; Os-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128 ; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4 ; Os-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32 +; Os-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128 ; Os-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4 -; Os-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36 +; Os-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144 ; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4 ; Os-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36 +; Os-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144 ; Os-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4 -; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40 +; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160 ; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4 ; Os-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40 +; Os-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160 ; Os-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4 -; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44 +; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176 ; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4 ; Os-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44 +; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176 ; Os-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4 -; Os-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48 +; Os-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192 ; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4 ; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48 +; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192 ; Os-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4 -; Os-NEXT: [[TMP37:%.*]] = getelementptr inbounds 
i32, ptr [[B]], i64 52 +; Os-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208 ; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4 ; Os-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52 +; Os-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208 ; Os-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4 -; Os-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56 +; Os-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224 ; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4 ; Os-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56 +; Os-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224 ; Os-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4 -; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60 +; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240 ; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4 ; Os-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]] -; Os-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60 +; Os-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240 ; Os-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4 ; Os-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4 ; Os-NEXT: ret i32 [[TMP46]] @@ -449,80 +449,80 @@ define i32 @enabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b, ; Oz-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4 ; Oz-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]] ; Oz-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4 -; Oz-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4 +; Oz-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16 ; Oz-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; Oz-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]] -; Oz-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4 +; Oz-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 ; Oz-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4 -; Oz-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8 +; Oz-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32 ; Oz-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4 ; Oz-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]] -; Oz-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8 +; Oz-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32 ; Oz-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4 -; Oz-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12 +; Oz-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48 ; Oz-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4 ; Oz-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]] -; Oz-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12 +; Oz-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48 ; Oz-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 -; Oz-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16 +; Oz-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64 ; Oz-NEXT: 
[[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
 ; Oz-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; Oz-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
 ; Oz-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
-; Oz-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; Oz-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80
 ; Oz-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
 ; Oz-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; Oz-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80
 ; Oz-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
-; Oz-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; Oz-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96
 ; Oz-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
 ; Oz-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; Oz-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96
 ; Oz-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
-; Oz-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; Oz-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112
 ; Oz-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
 ; Oz-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; Oz-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112
 ; Oz-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
-; Oz-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; Oz-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128
 ; Oz-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
 ; Oz-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; Oz-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128
 ; Oz-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
-; Oz-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; Oz-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144
 ; Oz-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
 ; Oz-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; Oz-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144
 ; Oz-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
-; Oz-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; Oz-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160
 ; Oz-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
 ; Oz-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; Oz-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160
 ; Oz-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
-; Oz-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; Oz-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176
 ; Oz-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
 ; Oz-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; Oz-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176
 ; Oz-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
-; Oz-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; Oz-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192
 ; Oz-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
 ; Oz-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; Oz-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192
 ; Oz-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
-; Oz-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; Oz-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208
 ; Oz-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
 ; Oz-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; Oz-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208
 ; Oz-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
-; Oz-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; Oz-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224
 ; Oz-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
 ; Oz-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; Oz-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224
 ; Oz-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
-; Oz-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; Oz-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240
 ; Oz-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
 ; Oz-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; Oz-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; Oz-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240
 ; Oz-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
 ; Oz-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
 ; Oz-NEXT: ret i32 [[TMP46]]
@@ -534,80 +534,80 @@ define i32 @enabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b,
 ; O1VEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
 ; O1VEC2-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; O1VEC2-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
-; O1VEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O1VEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16
 ; O1VEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
 ; O1VEC2-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O1VEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
 ; O1VEC2-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
-; O1VEC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O1VEC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32
 ; O1VEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
 ; O1VEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O1VEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32
 ; O1VEC2-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
-; O1VEC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O1VEC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48
 ; O1VEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
 ; O1VEC2-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O1VEC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48
 ; O1VEC2-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
-; O1VEC2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O1VEC2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64
 ; O1VEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
 ; O1VEC2-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O1VEC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
 ; O1VEC2-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
-; O1VEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O1VEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80
 ; O1VEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
 ; O1VEC2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O1VEC2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80
 ; O1VEC2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
-; O1VEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O1VEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96
 ; O1VEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
 ; O1VEC2-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O1VEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96
 ; O1VEC2-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
-; O1VEC2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O1VEC2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112
 ; O1VEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
 ; O1VEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O1VEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112
 ; O1VEC2-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
-; O1VEC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O1VEC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128
 ; O1VEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
 ; O1VEC2-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O1VEC2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128
 ; O1VEC2-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
-; O1VEC2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O1VEC2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144
 ; O1VEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
 ; O1VEC2-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O1VEC2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144
 ; O1VEC2-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
-; O1VEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O1VEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160
 ; O1VEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
 ; O1VEC2-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O1VEC2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160
 ; O1VEC2-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
-; O1VEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O1VEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176
 ; O1VEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
 ; O1VEC2-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O1VEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176
 ; O1VEC2-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
-; O1VEC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O1VEC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192
 ; O1VEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
 ; O1VEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O1VEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192
 ; O1VEC2-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
-; O1VEC2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O1VEC2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208
 ; O1VEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
 ; O1VEC2-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O1VEC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208
 ; O1VEC2-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
-; O1VEC2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O1VEC2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224
 ; O1VEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
 ; O1VEC2-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O1VEC2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224
 ; O1VEC2-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
-; O1VEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O1VEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240
 ; O1VEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
 ; O1VEC2-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O1VEC2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O1VEC2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240
 ; O1VEC2-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
 ; O1VEC2-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
 ; O1VEC2-NEXT: ret i32 [[TMP46]]
@@ -619,80 +619,80 @@ define i32 @enabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b,
 ; OzVEC2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
 ; OzVEC2-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; OzVEC2-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
-; OzVEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; OzVEC2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16
 ; OzVEC2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
 ; OzVEC2-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; OzVEC2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
 ; OzVEC2-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
-; OzVEC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; OzVEC2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32
 ; OzVEC2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
 ; OzVEC2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; OzVEC2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32
 ; OzVEC2-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
-; OzVEC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; OzVEC2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48
 ; OzVEC2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
 ; OzVEC2-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; OzVEC2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48
 ; OzVEC2-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
-; OzVEC2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; OzVEC2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64
 ; OzVEC2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
 ; OzVEC2-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; OzVEC2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
 ; OzVEC2-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
-; OzVEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; OzVEC2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80
 ; OzVEC2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
 ; OzVEC2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; OzVEC2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80
 ; OzVEC2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
-; OzVEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; OzVEC2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96
 ; OzVEC2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
 ; OzVEC2-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; OzVEC2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96
 ; OzVEC2-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
-; OzVEC2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; OzVEC2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112
 ; OzVEC2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
 ; OzVEC2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; OzVEC2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112
 ; OzVEC2-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
-; OzVEC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; OzVEC2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128
 ; OzVEC2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
 ; OzVEC2-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; OzVEC2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128
 ; OzVEC2-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
-; OzVEC2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; OzVEC2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144
 ; OzVEC2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
 ; OzVEC2-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; OzVEC2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144
 ; OzVEC2-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
-; OzVEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; OzVEC2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160
 ; OzVEC2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
 ; OzVEC2-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; OzVEC2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160
 ; OzVEC2-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
-; OzVEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; OzVEC2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176
 ; OzVEC2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
 ; OzVEC2-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; OzVEC2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176
 ; OzVEC2-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
-; OzVEC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; OzVEC2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192
 ; OzVEC2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
 ; OzVEC2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; OzVEC2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192
 ; OzVEC2-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
-; OzVEC2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; OzVEC2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208
 ; OzVEC2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
 ; OzVEC2-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; OzVEC2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208
 ; OzVEC2-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
-; OzVEC2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; OzVEC2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224
 ; OzVEC2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
 ; OzVEC2-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; OzVEC2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224
 ; OzVEC2-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
-; OzVEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; OzVEC2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240
 ; OzVEC2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
 ; OzVEC2-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; OzVEC2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; OzVEC2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240
 ; OzVEC2-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
 ; OzVEC2-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
 ; OzVEC2-NEXT: ret i32 [[TMP46]]
@@ -704,80 +704,80 @@ define i32 @enabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b,
 ; O3DIS-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
 ; O3DIS-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; O3DIS-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
-; O3DIS-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O3DIS-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16
 ; O3DIS-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
 ; O3DIS-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3DIS-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
 ; O3DIS-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
-; O3DIS-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O3DIS-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32
 ; O3DIS-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
 ; O3DIS-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3DIS-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32
 ; O3DIS-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
-; O3DIS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O3DIS-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48
 ; O3DIS-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
 ; O3DIS-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3DIS-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48
 ; O3DIS-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
-; O3DIS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O3DIS-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64
 ; O3DIS-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
 ; O3DIS-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3DIS-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
 ; O3DIS-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
-; O3DIS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O3DIS-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80
 ; O3DIS-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
 ; O3DIS-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3DIS-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80
 ; O3DIS-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
-; O3DIS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O3DIS-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96
 ; O3DIS-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
 ; O3DIS-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3DIS-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96
 ; O3DIS-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
-; O3DIS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O3DIS-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112
 ; O3DIS-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
 ; O3DIS-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3DIS-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112
 ; O3DIS-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
-; O3DIS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O3DIS-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128
 ; O3DIS-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
 ; O3DIS-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3DIS-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128
 ; O3DIS-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
-; O3DIS-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O3DIS-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144
 ; O3DIS-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
 ; O3DIS-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3DIS-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144
 ; O3DIS-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
-; O3DIS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O3DIS-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160
 ; O3DIS-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
 ; O3DIS-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3DIS-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160
 ; O3DIS-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
-; O3DIS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O3DIS-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176
 ; O3DIS-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
 ; O3DIS-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3DIS-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176
 ; O3DIS-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
-; O3DIS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O3DIS-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192
 ; O3DIS-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
 ; O3DIS-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O3DIS-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192
 ; O3DIS-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
-; O3DIS-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O3DIS-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208
 ; O3DIS-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
 ; O3DIS-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O3DIS-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208
 ; O3DIS-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
-; O3DIS-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O3DIS-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224
 ; O3DIS-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
 ; O3DIS-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O3DIS-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224
 ; O3DIS-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
-; O3DIS-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O3DIS-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240
 ; O3DIS-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
 ; O3DIS-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O3DIS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O3DIS-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240
 ; O3DIS-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
 ; O3DIS-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
 ; O3DIS-NEXT: ret i32 [[TMP46]]
@@ -826,80 +826,80 @@ define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b
 ; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
 ; O2-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; O2-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
-; O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16
 ; O2-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
 ; O2-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
 ; O2-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
-; O2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32
 ; O2-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
 ; O2-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32
 ; O2-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
-; O2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48
 ; O2-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
 ; O2-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48
 ; O2-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
-; O2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64
 ; O2-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
 ; O2-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
 ; O2-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
-; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O2-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80
 ; O2-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
 ; O2-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80
 ; O2-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
-; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96
 ; O2-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
 ; O2-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96
 ; O2-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
-; O2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O2-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112
 ; O2-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
 ; O2-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O2-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112
 ; O2-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
-; O2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128
 ; O2-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
 ; O2-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128
 ; O2-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
-; O2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O2-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144
 ; O2-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
 ; O2-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O2-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144
 ; O2-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
-; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160
 ; O2-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
 ; O2-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160
 ; O2-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
-; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O2-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176
 ; O2-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
 ; O2-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O2-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176
 ; O2-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
-; O2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O2-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192
 ; O2-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
 ; O2-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O2-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192
 ; O2-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
-; O2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O2-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208
 ; O2-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
 ; O2-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O2-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208
 ; O2-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
-; O2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O2-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224
 ; O2-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
 ; O2-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O2-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224
 ; O2-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
-; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O2-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240
 ; O2-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
 ; O2-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O2-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240
 ; O2-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
 ; O2-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
 ; O2-NEXT: ret i32 [[TMP46]]
@@ -911,80 +911,80 @@ define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b
 ; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
 ; O3-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; O3-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
-; O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16
 ; O3-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
 ; O3-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
 ; O3-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
-; O3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O3-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32
 ; O3-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
 ; O3-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32
 ; O3-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
-; O3-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O3-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48
 ; O3-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
 ; O3-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48
 ; O3-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
-; O3-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O3-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64
 ; O3-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
 ; O3-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
 ; O3-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
-; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O3-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80
 ; O3-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
 ; O3-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80
 ; O3-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
-; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O3-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96
 ; O3-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
 ; O3-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96
 ; O3-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
-; O3-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O3-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112
 ; O3-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
 ; O3-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112
 ; O3-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
-; O3-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O3-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128
 ; O3-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
 ; O3-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128
 ; O3-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
-; O3-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O3-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144
 ; O3-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
 ; O3-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144
 ; O3-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
-; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O3-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160
 ; O3-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
 ; O3-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160
 ; O3-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
-; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O3-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176
 ; O3-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
 ; O3-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176
 ; O3-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
-; O3-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O3-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192
 ; O3-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
 ; O3-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O3-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192
 ; O3-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
-; O3-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O3-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208
 ; O3-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
 ; O3-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O3-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208
 ; O3-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
-; O3-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O3-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224
 ; O3-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
 ; O3-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O3-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224
 ; O3-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
-; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O3-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240
 ; O3-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
 ; O3-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O3-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O3-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240
 ; O3-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
 ; O3-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
 ; O3-NEXT: ret i32 [[TMP46]]
@@ -996,80 +996,80 @@ define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b
 ; O3DEFAULT-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
 ; O3DEFAULT-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
-; O3DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; O3DEFAULT-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
 ; O3DEFAULT-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3DEFAULT-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
-; O3DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; O3DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
 ; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3DEFAULT-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
-; O3DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; O3DEFAULT-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
 ; O3DEFAULT-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3DEFAULT-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
-; O3DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; O3DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
 ; O3DEFAULT-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
-; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; O3DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
 ; O3DEFAULT-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3DEFAULT-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
-; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; O3DEFAULT-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
 ; O3DEFAULT-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3DEFAULT-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
-; O3DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; O3DEFAULT-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
 ; O3DEFAULT-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3DEFAULT-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
-; O3DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; O3DEFAULT-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
 ; O3DEFAULT-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
-; O3DEFAULT-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; O3DEFAULT-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
 ; O3DEFAULT-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
-; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; O3DEFAULT-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
 ; O3DEFAULT-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3DEFAULT-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
-; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; O3DEFAULT-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
 ; O3DEFAULT-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3DEFAULT-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
-; O3DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; O3DEFAULT-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
 ; O3DEFAULT-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; O3DEFAULT-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
-; O3DEFAULT-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; O3DEFAULT-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
 ; O3DEFAULT-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; O3DEFAULT-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
-; O3DEFAULT-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; O3DEFAULT-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
 ; O3DEFAULT-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; O3DEFAULT-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
-; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; O3DEFAULT-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240
 ; O3DEFAULT-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
 ; O3DEFAULT-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; O3DEFAULT-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; O3DEFAULT-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
 ; O3DEFAULT-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
 ; O3DEFAULT-NEXT: ret i32 [[TMP46]]
@@ -1081,80 +1081,80 @@ define i32 @nopragma(ptr noalias nocapture %a, ptr noalias nocapture readonly %b
 ; Os-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[B:%.*]], align 4
 ; Os-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
 ; Os-NEXT: store <4 x i32> [[TMP0]], ptr [[A:%.*]], align 4
-; Os-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
+; Os-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16
 ; Os-NEXT: [[WIDE_LOAD_1:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4
 ; Os-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_1]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; Os-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
 ; Os-NEXT: store <4 x i32> [[TMP2]], ptr [[TMP3]], align 4
-; Os-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
+; Os-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32
 ; Os-NEXT: [[WIDE_LOAD_2:%.*]] = load <4 x i32>, ptr [[TMP4]], align 4
 ; Os-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_2]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; Os-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32
 ; Os-NEXT: store <4 x i32> [[TMP5]], ptr [[TMP6]], align 4
-; Os-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
+; Os-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48
 ; Os-NEXT: [[WIDE_LOAD_3:%.*]] = load <4 x i32>, ptr [[TMP7]], align 4
 ; Os-NEXT: [[TMP8:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_3]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; Os-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48
 ; Os-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4
-; Os-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
+; Os-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64
 ; Os-NEXT: [[WIDE_LOAD_4:%.*]] = load <4 x i32>, ptr [[TMP10]], align 4
 ; Os-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_4]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; Os-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
 ; Os-NEXT: store <4 x i32> [[TMP11]], ptr [[TMP12]], align 4
-; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
+; Os-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80
 ; Os-NEXT: [[WIDE_LOAD_5:%.*]] = load <4 x i32>, ptr [[TMP13]], align 4
 ; Os-NEXT: [[TMP14:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_5]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; Os-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80
 ; Os-NEXT: store <4 x i32> [[TMP14]], ptr [[TMP15]], align 4
-; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
+; Os-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96
 ; Os-NEXT: [[WIDE_LOAD_6:%.*]] = load <4 x i32>, ptr [[TMP16]], align 4
 ; Os-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_6]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; Os-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96
 ; Os-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP18]], align 4
-; Os-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
+; Os-NEXT: [[TMP19:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112
 ; Os-NEXT: [[WIDE_LOAD_7:%.*]] = load <4 x i32>, ptr [[TMP19]], align 4
 ; Os-NEXT: [[TMP20:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_7]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; Os-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112
 ; Os-NEXT: store <4 x i32> [[TMP20]], ptr [[TMP21]], align 4
-; Os-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
+; Os-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128
 ; Os-NEXT: [[WIDE_LOAD_8:%.*]] = load <4 x i32>, ptr [[TMP22]], align 4
 ; Os-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_8]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; Os-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128
 ; Os-NEXT: store <4 x i32> [[TMP23]], ptr [[TMP24]], align 4
-; Os-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
+; Os-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144
 ; Os-NEXT: [[WIDE_LOAD_9:%.*]] = load <4 x i32>, ptr [[TMP25]], align 4
 ; Os-NEXT: [[TMP26:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_9]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP27:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; Os-NEXT: [[TMP27:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144
 ; Os-NEXT: store <4 x i32> [[TMP26]], ptr [[TMP27]], align 4
-; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
+; Os-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160
 ; Os-NEXT: [[WIDE_LOAD_10:%.*]] = load <4 x i32>, ptr [[TMP28]], align 4
 ; Os-NEXT: [[TMP29:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_10]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; Os-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160
 ; Os-NEXT: store <4 x i32> [[TMP29]], ptr [[TMP30]], align 4
-; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
+; Os-NEXT: [[TMP31:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176
 ; Os-NEXT: [[WIDE_LOAD_11:%.*]] = load <4 x i32>, ptr [[TMP31]], align 4
 ; Os-NEXT: [[TMP32:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_11]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; Os-NEXT: [[TMP33:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176
 ; Os-NEXT: store <4 x i32> [[TMP32]], ptr [[TMP33]], align 4
-; Os-NEXT: [[TMP34:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 48
+; Os-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 192
 ; Os-NEXT: [[WIDE_LOAD_12:%.*]] = load <4 x i32>, ptr [[TMP34]], align 4
 ; Os-NEXT: [[TMP35:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_12]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 48
+; Os-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 192
 ; Os-NEXT: store <4 x i32> [[TMP35]], ptr [[TMP36]], align 4
-; Os-NEXT: [[TMP37:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 52
+; Os-NEXT: [[TMP37:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 208
 ; Os-NEXT: [[WIDE_LOAD_13:%.*]] = load <4 x i32>, ptr [[TMP37]], align 4
 ; Os-NEXT: [[TMP38:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_13]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP39:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 52
+; Os-NEXT: [[TMP39:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 208
 ; Os-NEXT: store <4 x i32> [[TMP38]], ptr [[TMP39]], align 4
-; Os-NEXT: [[TMP40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 56
+; Os-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 224
 ; Os-NEXT: [[WIDE_LOAD_14:%.*]] = load <4 x i32>, ptr [[TMP40]], align 4
 ; Os-NEXT: [[TMP41:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_14]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP42:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 56
+; Os-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 224
 ; Os-NEXT: store <4 x i32> [[TMP41]], ptr [[TMP42]], align 4
-; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 60
+; Os-NEXT: [[TMP43:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 240
 ; Os-NEXT: [[WIDE_LOAD_15:%.*]] = load <4 x i32>, ptr [[TMP43]], align 4
 ; Os-NEXT: [[TMP44:%.*]] = add nsw <4 x i32> [[WIDE_LOAD_15]], [[BROADCAST_SPLAT]]
-; Os-NEXT: [[TMP45:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 60
+; Os-NEXT: [[TMP45:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 240
 ; Os-NEXT: store <4 x i32> [[TMP44]], ptr [[TMP45]], align 4
 ; Os-NEXT: [[TMP46:%.*]] = load i32, ptr [[A]], align 4
 ; Os-NEXT: ret i32 [[TMP46]]
@@ -1349,58 +1349,58 @@ define i32 @disabled(ptr noalias nocapture %a, ptr noalias nocapture readonly %b
 ; O3DEFAULT-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> zeroinitializer
 ; O3DEFAULT-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[TMP0]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP3]], ptr [[A:%.*]], align 4
-; O3DEFAULT-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 4
-; O3DEFAULT-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 4
+; O3DEFAULT-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 16
+; O3DEFAULT-NEXT: [[ARRAYIDX2_4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16
 ; O3DEFAULT-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_4]], align 4
 ; O3DEFAULT-NEXT: [[TMP5:%.*]] = add nsw <4 x i32> [[TMP4]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP5]], ptr [[ARRAYIDX2_4]], align 4
-; O3DEFAULT-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 8
-; O3DEFAULT-NEXT: [[ARRAYIDX2_8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 8
+; O3DEFAULT-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 32
+; O3DEFAULT-NEXT: [[ARRAYIDX2_8:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 32
 ; O3DEFAULT-NEXT: [[TMP6:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_8]], align 4
 ; O3DEFAULT-NEXT: [[TMP7:%.*]] = add nsw <4 x i32> [[TMP6]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP7]], ptr [[ARRAYIDX2_8]], align 4
-; O3DEFAULT-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 12
-; O3DEFAULT-NEXT: [[ARRAYIDX2_12:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 12
+; O3DEFAULT-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 48
+; O3DEFAULT-NEXT: [[ARRAYIDX2_12:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 48
 ; O3DEFAULT-NEXT: [[TMP8:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_12]], align 4
 ; O3DEFAULT-NEXT: [[TMP9:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP9]], ptr [[ARRAYIDX2_12]], align 4
-; O3DEFAULT-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 16
-; O3DEFAULT-NEXT: [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 16
+; O3DEFAULT-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 64
+; O3DEFAULT-NEXT: [[ARRAYIDX2_16:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 64
 ; O3DEFAULT-NEXT: [[TMP10:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_16]], align 4
 ; O3DEFAULT-NEXT: [[TMP11:%.*]] = add nsw <4 x i32> [[TMP10]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP11]], ptr [[ARRAYIDX2_16]], align 4
-; O3DEFAULT-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 20
-; O3DEFAULT-NEXT: [[ARRAYIDX2_20:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 20
+; O3DEFAULT-NEXT: [[ARRAYIDX_20:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 80
+; O3DEFAULT-NEXT: [[ARRAYIDX2_20:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 80
 ; O3DEFAULT-NEXT: [[TMP12:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_20]], align 4
 ; O3DEFAULT-NEXT: [[TMP13:%.*]] = add nsw <4 x i32> [[TMP12]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP13]], ptr [[ARRAYIDX2_20]], align 4
-; O3DEFAULT-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 24
-; O3DEFAULT-NEXT: [[ARRAYIDX2_24:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 24
+; O3DEFAULT-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 96
+; O3DEFAULT-NEXT: [[ARRAYIDX2_24:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 96
 ; O3DEFAULT-NEXT: [[TMP14:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_24]], align 4
 ; O3DEFAULT-NEXT: [[TMP15:%.*]] = add nsw <4 x i32> [[TMP14]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP15]], ptr [[ARRAYIDX2_24]], align 4
-; O3DEFAULT-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 28
-; O3DEFAULT-NEXT: [[ARRAYIDX2_28:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 28
+; O3DEFAULT-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 112
+; O3DEFAULT-NEXT: [[ARRAYIDX2_28:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 112
 ; O3DEFAULT-NEXT: [[TMP16:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_28]], align 4
 ; O3DEFAULT-NEXT: [[TMP17:%.*]] = add nsw <4 x i32> [[TMP16]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP17]], ptr [[ARRAYIDX2_28]], align 4
-; O3DEFAULT-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 32
-; O3DEFAULT-NEXT: [[ARRAYIDX2_32:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 32
+; O3DEFAULT-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 128
+; O3DEFAULT-NEXT: [[ARRAYIDX2_32:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 128
 ; O3DEFAULT-NEXT: [[TMP18:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_32]], align 4
 ; O3DEFAULT-NEXT: [[TMP19:%.*]] = add nsw <4 x i32> [[TMP18]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP19]], ptr [[ARRAYIDX2_32]], align 4
-; O3DEFAULT-NEXT: [[ARRAYIDX_36:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 36
-; O3DEFAULT-NEXT: [[ARRAYIDX2_36:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 36
+; O3DEFAULT-NEXT: [[ARRAYIDX_36:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 144
+; O3DEFAULT-NEXT: [[ARRAYIDX2_36:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 144
 ; O3DEFAULT-NEXT: [[TMP20:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_36]], align 4
 ; O3DEFAULT-NEXT: [[TMP21:%.*]] = add nsw <4 x i32> [[TMP20]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP21]], ptr [[ARRAYIDX2_36]], align 4
-; O3DEFAULT-NEXT: [[ARRAYIDX_40:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 40
-; O3DEFAULT-NEXT: [[ARRAYIDX2_40:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 40
+; O3DEFAULT-NEXT: [[ARRAYIDX_40:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 160
+; O3DEFAULT-NEXT: [[ARRAYIDX2_40:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 160
 ; O3DEFAULT-NEXT: [[TMP22:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_40]], align 4
 ; O3DEFAULT-NEXT: [[TMP23:%.*]] = add nsw <4 x i32> [[TMP22]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP23]], ptr [[ARRAYIDX2_40]], align 4
-; O3DEFAULT-NEXT: [[ARRAYIDX_44:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 44
-; O3DEFAULT-NEXT: [[ARRAYIDX2_44:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 44
+; O3DEFAULT-NEXT: [[ARRAYIDX_44:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 176
+; O3DEFAULT-NEXT: [[ARRAYIDX2_44:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 176
 ; O3DEFAULT-NEXT: [[TMP24:%.*]] = load <4 x i32>, ptr [[ARRAYIDX_44]], align 4
 ; O3DEFAULT-NEXT: [[TMP25:%.*]] = add nsw <4 x i32> [[TMP24]], [[TMP2]]
 ; O3DEFAULT-NEXT: store <4 x i32> [[TMP25]], ptr [[ARRAYIDX2_44]], align 4
diff --git a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll
b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll index 4472ae4fecd38..0b16d80a4adbc 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/pr23997.ll @@ -30,21 +30,21 @@ define void @foo(ptr addrspace(1) align 8 dereferenceable_or_null(16), ptr addrs ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT12]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP5]], i64 4 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP5]], i64 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP5]], i64 12 -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP5]], align 8, !alias.scope !0 -; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP6]], align 8, !alias.scope !0 -; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP7]], align 8, !alias.scope !0 -; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP8]], align 8, !alias.scope !0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP5]], i64 32 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP5]], i64 64 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP5]], i64 96 +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP5]], align 8, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP6]], align 8, !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP7]], align 8, !alias.scope [[META0]] +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x ptr addrspace(1)>, ptr addrspace(1) [[TMP8]], align 8, !alias.scope [[META0]] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[DOT10]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP9]], i64 4 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP9]], i64 8 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds ptr addrspace(1), ptr addrspace(1) [[TMP9]], i64 12 -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD]], ptr addrspace(1) [[TMP9]], align 8, !alias.scope !3, !noalias !0 -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD3]], ptr addrspace(1) [[TMP10]], align 8, !alias.scope !3, !noalias !0 -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD4]], ptr addrspace(1) [[TMP11]], align 8, !alias.scope !3, !noalias !0 -; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD5]], ptr addrspace(1) [[TMP12]], align 8, !alias.scope !3, !noalias !0 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP9]], i64 32 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP9]], i64 64 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr addrspace(1) [[TMP9]], i64 96 +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD]], ptr addrspace(1) [[TMP9]], align 8, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD3]], ptr addrspace(1) [[TMP10]], align 8, 
!alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD4]], ptr addrspace(1) [[TMP11]], align 8, !alias.scope [[META3]], !noalias [[META0]] +; CHECK-NEXT: store <4 x ptr addrspace(1)> [[WIDE_LOAD5]], ptr addrspace(1) [[TMP12]], align 8, !alias.scope [[META3]], !noalias [[META0]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP5:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll index fcd41fd188ce6..be83329d30fef 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/small-size.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/small-size.ll @@ -369,11 +369,11 @@ define void @example23(ptr nocapture %src, ptr nocapture %dst) optsize { ; CHECK-NEXT: [[DOT04:%.*]] = phi ptr [ [[SRC:%.*]], [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[TMP1]] ] ; CHECK-NEXT: [[DOT013:%.*]] = phi ptr [ [[DST:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[TMP1]] ] ; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ 0, [[TMP0]] ], [ [[TMP7:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[TMP2]] = getelementptr inbounds i16, ptr [[DOT04]], i64 1 +; CHECK-NEXT: [[TMP2]] = getelementptr inbounds i8, ptr [[DOT04]], i64 2 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[DOT04]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 7 -; CHECK-NEXT: [[TMP6]] = getelementptr inbounds i32, ptr [[DOT013]], i64 1 +; CHECK-NEXT: [[TMP6]] = getelementptr inbounds i8, ptr [[DOT013]], i64 4 ; CHECK-NEXT: store i32 [[TMP5]], ptr [[DOT013]], align 4 ; CHECK-NEXT: [[TMP7]] = add nuw nsw i32 [[I_02]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i32 [[TMP7]], 256 @@ -561,11 +561,11 @@ define i64 @example23d(ptr noalias nocapture %src, ptr noalias nocapture %dst) o ; CHECK-NEXT: [[DOT04:%.*]] = phi ptr [ [[SRC:%.*]], [[TMP0:%.*]] ], [ [[TMP2:%.*]], [[TMP1]] ] ; CHECK-NEXT: [[DOT013:%.*]] = phi ptr [ [[DST:%.*]], [[TMP0]] ], [ [[TMP6:%.*]], [[TMP1]] ] ; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ 0, [[TMP0]] ], [ [[TMP7:%.*]], [[TMP1]] ] -; CHECK-NEXT: [[TMP2]] = getelementptr inbounds i16, ptr [[DOT04]], i64 1 +; CHECK-NEXT: [[TMP2]] = getelementptr inbounds i8, ptr [[DOT04]], i64 2 ; CHECK-NEXT: [[TMP3:%.*]] = load i16, ptr [[DOT04]], align 2 ; CHECK-NEXT: [[TMP4:%.*]] = zext i16 [[TMP3]] to i32 ; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i32 [[TMP4]], 7 -; CHECK-NEXT: [[TMP6]] = getelementptr inbounds i32, ptr [[DOT013]], i64 1 +; CHECK-NEXT: [[TMP6]] = getelementptr inbounds i8, ptr [[DOT013]], i64 4 ; CHECK-NEXT: store i32 [[TMP5]], ptr [[DOT013]], align 4 ; CHECK-NEXT: [[TMP7]] = add nuw nsw i64 [[I_02]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[TMP7]], 257 diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll index ffe4e13b187a2..fabe2eb8062bb 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-store-accesses-with-gaps.ll @@ -72,7 +72,7 @@ define dso_local void @test1(ptr noalias nocapture %points, ptr noalias nocaptur ; ; ENABLED_MASKED_STRIDED-LABEL: @test1( ; ENABLED_MASKED_STRIDED-NEXT: entry: -; ENABLED_MASKED_STRIDED-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i16, ptr [[POINTS:%.*]], i64 -1 +; 
ENABLED_MASKED_STRIDED-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[POINTS:%.*]], i64 -2 ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -239,7 +239,7 @@ define dso_local void @test2(ptr noalias nocapture %points, i32 %numPoints, ptr ; ENABLED_MASKED_STRIDED-NEXT: [[TRIP_COUNT_MINUS_1:%.*]] = add nsw i64 [[WIDE_TRIP_COUNT]], -1 ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i64> poison, i64 [[TRIP_COUNT_MINUS_1]], i64 0 ; ENABLED_MASKED_STRIDED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i64> [[BROADCAST_SPLATINSERT]], <4 x i64> poison, <4 x i32> zeroinitializer -; ENABLED_MASKED_STRIDED-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i16, ptr [[POINTS:%.*]], i64 -1 +; ENABLED_MASKED_STRIDED-NEXT: [[INVARIANT_GEP:%.*]] = getelementptr i8, ptr [[POINTS:%.*]], i64 -2 ; ENABLED_MASKED_STRIDED-NEXT: br label [[VECTOR_BODY:%.*]] ; ENABLED_MASKED_STRIDED: vector.body: ; ENABLED_MASKED_STRIDED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] diff --git a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll index 5f2f83e19ab5b..c7eaac315fd86 100644 --- a/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll +++ b/llvm/test/Transforms/LoopVectorize/consecutive-ptr-uniforms.ll @@ -49,7 +49,7 @@ for.end: ; CHECK: %offset.idx = sub i64 %n, %index ; CHECK-NOT: getelementptr ; CHECK: %[[G0:.+]] = getelementptr inbounds i32, ptr %a, i64 %offset.idx -; CHECK: getelementptr inbounds i32, ptr %[[G0]], i64 -3 +; CHECK: getelementptr inbounds i8, ptr %[[G0]], i64 -12 ; CHECK-NOT: getelementptr ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body ; @@ -162,7 +162,7 @@ for.end: ; INTER: %offset.idx = sub i64 %n, %index ; INTER-NOT: getelementptr ; INTER: %[[G0:.+]] = getelementptr inbounds %pair, ptr %p, i64 %offset.idx, i32 0 -; INTER: getelementptr inbounds i32, ptr %[[G0]], i64 -6 +; INTER: getelementptr inbounds i8, ptr %[[G0]], i64 -24 ; INTER-NOT: getelementptr ; INTER: br i1 {{.*}}, label %middle.block, label %vector.body ; diff --git a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll index 6de93c381d0ed..fe6d9b3ec690e 100644 --- a/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll +++ b/llvm/test/Transforms/LoopVectorize/extract-last-veclane.ll @@ -38,7 +38,7 @@ define void @inv_store_last_lane(ptr noalias nocapture %a, ptr noalias nocapture ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[MUL_LCSSA:%.*]] = phi i32 [ [[MUL]], [[FOR_BODY]] ], [ [[TMP4]], [[MIDDLE_BLOCK]] ] -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[INV:%.*]], i64 42 +; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[INV:%.*]], i64 168 ; CHECK-NEXT: store i32 [[MUL_LCSSA]], ptr [[ARRAYIDX5]], align 4 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index 736b0598043e6..c2ba85d71c0e7 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -102,7 +102,7 @@ define void @fp_iv_loop1_fast_FMF(float %init, ptr noalias 
nocapture %A, i32 %N) ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub fast <4 x float> [[VEC_IND]], [[DOTSPLAT5]] ; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 16 ; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP5]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -349,7 +349,7 @@ define void @fp_iv_loop1_reassoc_FMF(float %init, ptr noalias nocapture %A, i32 ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fsub reassoc <4 x float> [[VEC_IND]], [[DOTSPLAT5]] ; VEC4_INTERL2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 16 ; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP4]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP5]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -586,7 +586,7 @@ define void @fp_iv_loop2(float %init, ptr noalias nocapture %A, i32 %N) #0 { ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 16 ; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP3]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -859,7 +859,7 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: [[VEC_IND10:%.*]] = phi <4 x float> [ [[INDUCTION]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT13:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD11:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[DOTSPLAT9]] ; VEC4_INTERL2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 16 ; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND10]], ptr [[TMP6]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD11]], ptr [[TMP7]], align 4 ; VEC4_INTERL2-NEXT: [[TMP8:%.*]] = fadd fast <4 x float> [[VEC_IND10]], [[BROADCAST_SPLAT]] @@ -869,11 +869,11 @@ define void @fp_iv_loop3(float %init, ptr noalias nocapture %A, ptr noalias noca ; VEC4_INTERL2-NEXT: [[TMP12:%.*]] = fadd fast <4 x float> [[TMP10]], [[TMP8]] ; VEC4_INTERL2-NEXT: [[TMP13:%.*]] = fadd fast <4 x float> [[TMP11]], [[TMP9]] ; VEC4_INTERL2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[B:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: [[TMP15:%.*]] = 
getelementptr inbounds float, ptr [[TMP14]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP14]], i64 16 ; VEC4_INTERL2-NEXT: store <4 x float> [[TMP12]], ptr [[TMP14]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[TMP13]], ptr [[TMP15]], align 4 ; VEC4_INTERL2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[C:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: [[TMP17:%.*]] = getelementptr inbounds float, ptr [[TMP16]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP16]], i64 16 ; VEC4_INTERL2-NEXT: store <4 x float> [[TMP10]], ptr [[TMP16]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[TMP11]], ptr [[TMP17]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -1167,7 +1167,7 @@ define void @fp_iv_loop4(ptr noalias nocapture %A, i32 %N) { ; VEC4_INTERL2-NEXT: [[VEC_IND:%.*]] = phi <4 x float> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; VEC4_INTERL2-NEXT: [[STEP_ADD:%.*]] = fadd fast <4 x float> [[VEC_IND]], ; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 16 ; VEC4_INTERL2-NEXT: store <4 x float> [[VEC_IND]], ptr [[TMP2]], align 4 ; VEC4_INTERL2-NEXT: store <4 x float> [[STEP_ADD]], ptr [[TMP3]], align 4 ; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -1403,7 +1403,7 @@ define void @non_primary_iv_float_scalar(ptr %A, i64 %N) { ; VEC4_INTERL2-NEXT: [[DOTCAST2:%.*]] = sitofp i64 [[INDEX]] to float ; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 4 ; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = getelementptr inbounds float, ptr [[A:%.*]], i64 [[INDEX]] -; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4 +; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; VEC4_INTERL2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP1]], align 4 ; VEC4_INTERL2-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x float>, ptr [[TMP2]], align 4 ; VEC4_INTERL2-NEXT: [[TMP3:%.*]] = fcmp fast oeq <4 x float> [[WIDE_LOAD]], zeroinitializer diff --git a/llvm/test/Transforms/LoopVectorize/induction.ll b/llvm/test/Transforms/LoopVectorize/induction.ll index 119c0aefaa73f..29d8719db9b29 100644 --- a/llvm/test/Transforms/LoopVectorize/induction.ll +++ b/llvm/test/Transforms/LoopVectorize/induction.ll @@ -112,7 +112,7 @@ define void @multi_int_induction(ptr %A, i32 %N) { ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 2 +; UNROLL-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 8 ; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP3]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -205,7 +205,7 @@ define void @multi_int_induction(ptr %A, i32 %N) { ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; 
INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP3]], i64 4 +; INTERLEAVE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 16 ; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP3]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP4]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -410,12 +410,12 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; UNROLL-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[TMP5]], i64 [[OFFSET]] -; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 2 +; UNROLL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 8 ; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x float>, ptr [[TMP6]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] ; UNROLL-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x float>, ptr [[TMP7]], align 4, !alias.scope [[META4]], !noalias [[META7]] ; UNROLL-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] ; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[OFFSET2]] -; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 2 +; UNROLL-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 8 ; UNROLL-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x float>, ptr [[TMP9]], align 4, !alias.scope [[META7]] ; UNROLL-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x float>, ptr [[TMP10]], align 4, !alias.scope [[META7]] ; UNROLL-NEXT: [[TMP11:%.*]] = fmul fast <2 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]] @@ -553,12 +553,12 @@ define void @scalar_use(ptr %a, float %b, i64 %offset, i64 %offset2, i64 %n) { ; INTERLEAVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[TMP5:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr float, ptr [[TMP5]], i64 [[OFFSET]] -; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds float, ptr [[TMP6]], i64 4 +; INTERLEAVE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 16 ; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x float>, ptr [[TMP6]], align 4, !alias.scope [[META4:![0-9]+]], !noalias [[META7:![0-9]+]] ; INTERLEAVE-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x float>, ptr [[TMP7]], align 4, !alias.scope [[META4]], !noalias [[META7]] ; INTERLEAVE-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[A]], i64 [[INDEX]] ; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i64 [[OFFSET2]] -; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP9]], i64 4 +; INTERLEAVE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP9]], i64 16 ; INTERLEAVE-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope [[META7]] ; INTERLEAVE-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x float>, ptr [[TMP10]], align 4, !alias.scope [[META7]] ; INTERLEAVE-NEXT: [[TMP11:%.*]] = fmul fast <4 x float> [[BROADCAST_SPLAT]], [[WIDE_LOAD5]] @@ -718,7 +718,7 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) { ; UNROLL-NEXT: [[VEC_PHI:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[VEC_PHI1:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds 
i64, ptr [[A:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 2 +; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 ; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP0]], align 8 ; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP1]], align 8 ; UNROLL-NEXT: [[TMP2]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] @@ -809,7 +809,7 @@ define i64 @scalarize_induction_variable_01(ptr %a, i64 %n) { ; INTERLEAVE-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP2:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP3:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i64, ptr [[A:%.*]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 4 +; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 32 ; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i64>, ptr [[TMP0]], align 8 ; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i64>, ptr [[TMP1]], align 8 ; INTERLEAVE-NEXT: [[TMP2]] = add <4 x i64> [[WIDE_LOAD]], [[VEC_PHI]] @@ -2115,7 +2115,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) { ; UNROLL-NEXT: [[TMP0:%.*]] = or disjoint i32 [[INDEX]], 2 ; UNROLL-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 ; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] -; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 2 +; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 8 ; UNROLL-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i32>, ptr [[TMP2]], align 4 ; UNROLL-NEXT: [[WIDE_LOAD2:%.*]] = load <2 x i32>, ptr [[TMP3]], align 4 ; UNROLL-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] @@ -2303,7 +2303,7 @@ define i32 @scalarize_induction_variable_05(ptr %a, i32 %x, i1 %c, i32 %n) { ; INTERLEAVE-NEXT: [[TMP0:%.*]] = or disjoint i32 [[INDEX]], 4 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = sext i32 [[INDEX]] to i64 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] -; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 4 +; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 16 ; INTERLEAVE-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 ; INTERLEAVE-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 ; INTERLEAVE-NEXT: br i1 [[C]], label [[PRED_UDIV_IF:%.*]], label [[PRED_UDIV_CONTINUE:%.*]] @@ -3605,7 +3605,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] ; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2 +; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 8 ; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -3749,7 +3749,7 @@ define void @wrappingindvars1(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] ; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr 
[[A:%.*]], i64 [[TMP10]] -; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4 +; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 16 ; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -3993,7 +3993,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] ; UNROLL-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; UNROLL-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 2 +; UNROLL-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 8 ; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -4143,7 +4143,7 @@ define void @wrappingindvars2(i8 %t, i32 %len, ptr %A) { ; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i8 [[DOTCAST5]], [[T]] ; INTERLEAVE-NEXT: [[TMP10:%.*]] = sext i8 [[OFFSET_IDX]] to i64 ; INTERLEAVE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP10]] -; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 4 +; INTERLEAVE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 16 ; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP11]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP12]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -4291,7 +4291,7 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) { ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] -; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 2 +; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 ; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -4368,7 +4368,7 @@ define void @veciv(ptr nocapture %a, i32 %start, i32 %k) { ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; INTERLEAVE-NEXT: [[TMP0:%.*]] = sext i32 [[INDEX]] to i64 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] -; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -4509,7 +4509,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; UNROLL-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32 ; UNROLL-NEXT: [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32 ; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] -; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 2 +; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8 ; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; UNROLL-NEXT: store <2 x i32> 
[[STEP_ADD]], ptr [[TMP2]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 @@ -4601,7 +4601,7 @@ define void @trunciv(ptr nocapture %a, i32 %start, i64 %k) { ; INTERLEAVE-NEXT: [[SEXT:%.*]] = shl i64 [[OFFSET_IDX]], 32 ; INTERLEAVE-NEXT: [[TMP0:%.*]] = ashr exact i64 [[SEXT]], 32 ; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP0]] -; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP1]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP2]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8 @@ -4746,7 +4746,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) { ; UNROLL-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]] ; UNROLL-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64 ; UNROLL-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] -; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 2 +; UNROLL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 8 ; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP2]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4 @@ -4835,7 +4835,7 @@ define void @nonprimary(ptr nocapture %a, i32 %start, i32 %i, i32 %k) { ; INTERLEAVE-NEXT: [[OFFSET_IDX:%.*]] = add i32 [[INDEX]], [[I]] ; INTERLEAVE-NEXT: [[TMP1:%.*]] = sext i32 [[OFFSET_IDX]] to i64 ; INTERLEAVE-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[TMP1]] -; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 4 +; INTERLEAVE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 16 ; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP2]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP3]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -4968,7 +4968,7 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) { ; UNROLL-NEXT: [[VEC_IND:%.*]] = phi <2 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; UNROLL-NEXT: [[STEP_ADD:%.*]] = add <2 x i32> [[VEC_IND]], ; UNROLL-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; UNROLL-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 ; UNROLL-NEXT: store <2 x i32> [[VEC_IND]], ptr [[TMP0]], align 4 ; UNROLL-NEXT: store <2 x i32> [[STEP_ADD]], ptr [[TMP1]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -5055,7 +5055,7 @@ define void @non_primary_iv_trunc(ptr %a, i64 %n) { ; INTERLEAVE-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; INTERLEAVE-NEXT: [[STEP_ADD:%.*]] = add <4 x i32> [[VEC_IND]], ; INTERLEAVE-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 +; INTERLEAVE-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 ; INTERLEAVE-NEXT: store <4 x i32> [[VEC_IND]], ptr [[TMP0]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[STEP_ADD]], ptr [[TMP1]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 @@ -6048,7 +6048,7 @@ define void 
@pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr % ; UNROLL-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]] ; UNROLL-NEXT: [[TMP7:%.*]] = add <2 x i32> [[VEC_IND]], [[TMP3]] ; UNROLL-NEXT: [[TMP8:%.*]] = add <2 x i32> [[STEP_ADD]], [[TMP4]] -; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP6]], i64 2 +; UNROLL-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP6]], i64 8 ; UNROLL-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP6]], align 4 ; UNROLL-NEXT: store <2 x i32> [[TMP8]], ptr [[TMP9]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 4 @@ -6143,7 +6143,7 @@ define void @pr52460_first_order_recurrence_truncated_iv(ptr noalias %src, ptr % ; INTERLEAVE-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[DST:%.*]], i64 [[TMP5]] ; INTERLEAVE-NEXT: [[TMP7:%.*]] = add <4 x i32> [[VEC_IND]], [[TMP3]] ; INTERLEAVE-NEXT: [[TMP8:%.*]] = add <4 x i32> [[STEP_ADD]], [[TMP4]] -; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i32, ptr [[TMP6]], i64 4 +; INTERLEAVE-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP6]], i64 16 ; INTERLEAVE-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP6]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[TMP8]], ptr [[TMP9]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[OFFSET_IDX]], 8 @@ -6394,7 +6394,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; UNROLL-NEXT: [[TMP17:%.*]] = shufflevector <2 x i32> [[VECTOR_RECUR]], <2 x i32> [[VEC_IND]], <2 x i32> ; UNROLL-NEXT: [[TMP18:%.*]] = shufflevector <2 x i32> [[VEC_IND]], <2 x i32> [[STEP_ADD]], <2 x i32> ; UNROLL-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]] -; UNROLL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 2 +; UNROLL-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i64 8 ; UNROLL-NEXT: store <2 x i32> [[TMP17]], ptr [[TMP19]], align 4 ; UNROLL-NEXT: store <2 x i32> [[TMP18]], ptr [[TMP20]], align 4 ; UNROLL-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -6552,7 +6552,7 @@ define void @test_optimized_cast_induction_feeding_first_order_recurrence(i64 %n ; INTERLEAVE-NEXT: [[TMP17:%.*]] = shufflevector <4 x i32> [[VECTOR_RECUR]], <4 x i32> [[VEC_IND]], <4 x i32> ; INTERLEAVE-NEXT: [[TMP18:%.*]] = shufflevector <4 x i32> [[VEC_IND]], <4 x i32> [[STEP_ADD]], <4 x i32> ; INTERLEAVE-NEXT: [[TMP19:%.*]] = getelementptr inbounds i32, ptr [[PTR:%.*]], i64 [[INDEX]] -; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i32, ptr [[TMP19]], i64 4 +; INTERLEAVE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP19]], i64 16 ; INTERLEAVE-NEXT: store <4 x i32> [[TMP17]], ptr [[TMP19]], align 4 ; INTERLEAVE-NEXT: store <4 x i32> [[TMP18]], ptr [[TMP20]], align 4 ; INTERLEAVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index 9a27c294f1384..4c3377255b21a 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -42,7 +42,7 @@ define void @test_array_load2_store2(i32 %C, i32 %D) { ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = mul nsw <4 x i32> [[STRIDED_VEC1]], [[BROADCAST_SPLAT3]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [1024 x i32], ptr @CD, i64 0, i64 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 -1 +; CHECK-NEXT: 
[[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -4 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -349,7 +349,7 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = sub i64 1023, [[INDEX]] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [[STRUCT_ST2:%.*]], ptr [[A:%.*]], i64 [[OFFSET_IDX]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 -6 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 -24 ; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <8 x i32>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[REVERSE:%.*]] = shufflevector <4 x i32> [[STRIDED_VEC]], <4 x i32> poison, <4 x i32> @@ -358,7 +358,7 @@ define void @test_reversed_load2_store2(ptr noalias nocapture readonly %A, ptr n ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[REVERSE]], [[VEC_IND]] ; CHECK-NEXT: [[TMP3:%.*]] = sub nsw <4 x i32> [[REVERSE2]], [[VEC_IND]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST2]], ptr [[B:%.*]], i64 [[OFFSET_IDX]], i32 1 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 -7 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -28 ; CHECK-NEXT: [[REVERSE3:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[REVERSE4:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> poison, <4 x i32> ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[REVERSE3]], <4 x i32> [[REVERSE4]], <8 x i32> @@ -675,7 +675,7 @@ define void @mixed_load2_store2(ptr noalias nocapture readonly %A, ptr noalias n ; CHECK-NEXT: [[STRIDED_VEC4:%.*]] = shufflevector <8 x i32> [[WIDE_VEC]], <8 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[STRIDED_VEC4]], [[STRIDED_VEC3]] ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 [[TMP1]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i64 -1 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 -4 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP3]], <8 x i32> ; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP5]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -1090,7 +1090,7 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) { ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 0 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP4]], i32 0 -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 -8 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[INDEX]], i32 1 ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP2]], i32 1 ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[TMP3]], i32 1 @@ -1119,7 +1119,7 @@ define void 
@PR27626_2(ptr %p, i64 %n, i32 %z) { ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[P_I_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 0 -; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 -1, i32 0 +; CHECK-NEXT: [[P_I_MINUS_1_X:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 -8 ; CHECK-NEXT: [[P_I_Y:%.*]] = getelementptr inbounds [[PAIR_I32]], ptr [[P]], i64 [[I]], i32 1 ; CHECK-NEXT: store i32 [[Z]], ptr [[P_I_X]], align 4 ; CHECK-NEXT: [[TMP19:%.*]] = load i32, ptr [[P_I_MINUS_1_X]], align 4 @@ -1296,7 +1296,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) { ; CHECK-NEXT: store i32 [[X]], ptr [[TMP8]], align 4 ; CHECK-NEXT: store i32 [[X]], ptr [[TMP9]], align 4 ; CHECK-NEXT: store i32 [[X]], ptr [[TMP10]], align 4 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP11]], i64 -1 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 -4 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = shufflevector <4 x i32> [[BROADCAST_SPLAT]], <4 x i32> [[BROADCAST_SPLAT2]], <8 x i32> ; CHECK-NEXT: store <8 x i32> [[INTERLEAVED_VEC]], ptr [[TMP12]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -1379,7 +1379,7 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) { ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP6]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i32, ptr [[A]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i32, ptr [[TMP12]], i64 9 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP12]], i64 36 ; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP7]], i64 0 ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = extractelement <4 x i64> [[TMP7]], i64 1 @@ -1422,9 +1422,9 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) { ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] ; CHECK-NEXT: [[A_I:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[I]] ; CHECK-NEXT: [[TMP31:%.*]] = getelementptr i32, ptr [[A]], i64 [[I]] -; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr i32, ptr [[TMP31]], i64 -1 +; CHECK-NEXT: [[A_I_MINUS_1:%.*]] = getelementptr i8, ptr [[TMP31]], i64 -4 ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr i32, ptr [[A]], i64 [[I]] -; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr i32, ptr [[TMP32]], i64 -3 +; CHECK-NEXT: [[A_I_MINUS_3:%.*]] = getelementptr i8, ptr [[TMP32]], i64 -12 ; CHECK-NEXT: store i32 [[X]], ptr [[A_I_MINUS_1]], align 4 ; CHECK-NEXT: store i32 [[Y]], ptr [[A_I_MINUS_3]], align 4 ; CHECK-NEXT: store i32 [[Z]], ptr [[A_I]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll index e4c183906865a..306ec125dc202 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop-uf4.ll @@ -16,9 +16,9 @@ define i32 @reduction_sum_single(ptr noalias nocapture %A) { ; CHECK-NEXT: [[VEC_PHI2:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI3:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr 
[[A:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 8 -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 12 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 32 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 48 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4 ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/runtime-check.ll b/llvm/test/Transforms/LoopVectorize/runtime-check.ll index 108580556f532..d5df8afc80a79 100644 --- a/llvm/test/Transforms/LoopVectorize/runtime-check.ll +++ b/llvm/test/Transforms/LoopVectorize/runtime-check.ll @@ -235,7 +235,7 @@ define void @test_runtime_check2(ptr %a, float %b, i64 %offset, i64 %offset2, i6 ; CHECK-NEXT: [[AD:%.*]] = fadd fast float [[L1]], [[M]] ; CHECK-NEXT: store float [[AD]], ptr [[ARR_IDX]], align 4 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr float, ptr [[C:%.*]], i64 [[IV]] -; CHECK-NEXT: [[C_IDX:%.*]] = getelementptr float, ptr [[TMP2]], i64 -1 +; CHECK-NEXT: [[C_IDX:%.*]] = getelementptr i8, ptr [[TMP2]], i64 -4 ; CHECK-NEXT: [[LC:%.*]] = load float, ptr [[C_IDX]], align 4 ; CHECK-NEXT: [[VC:%.*]] = fadd float [[LC]], 1.000000e+00 ; CHECK-NEXT: [[C_IDX2:%.*]] = getelementptr inbounds float, ptr [[C]], i64 [[IV]] diff --git a/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll b/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll index 4a6523f2dabf5..f791b956bcf54 100644 --- a/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/scalar_after_vectorization.ll @@ -11,7 +11,7 @@ target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" ; CHECK: %[[T2:.+]] = add nuw nsw i64 %offset.idx, %tmp0 ; CHECK: %[[T3:.+]] = sub nsw i64 %[[T2]], %x ; CHECK: %[[T4:.+]] = getelementptr inbounds i32, ptr %a, i64 %[[T3]] -; CHECK: %[[T6:.+]] = getelementptr inbounds i32, ptr %[[T4]], i64 4 +; CHECK: %[[T6:.+]] = getelementptr inbounds i8, ptr %[[T4]], i64 16 ; CHECK: load <4 x i32>, ptr %[[T4]], align 4 ; CHECK: load <4 x i32>, ptr %[[T6]], align 4 ; CHECK: br {{.*}}, label %middle.block, label %vector.body diff --git a/llvm/test/Transforms/LoopVectorize/vector-geps.ll b/llvm/test/Transforms/LoopVectorize/vector-geps.ll index ec0b4864ac3fe..7aff527a5a799 100644 --- a/llvm/test/Transforms/LoopVectorize/vector-geps.ll +++ b/llvm/test/Transforms/LoopVectorize/vector-geps.ll @@ -68,7 +68,7 @@ define void @uniform_vector_gep_stored(ptr %a, ptr %b, i64 %n) { ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 1 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 4 ; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <4 x ptr> poison, ptr [[TMP0]], i64 0 ; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <4 x ptr> [[DOTSPLATINSERT]], <4 x ptr> poison, <4 x i32> zeroinitializer ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds ptr, ptr [[A:%.*]], i64 [[INDEX]] @@ -84,7 +84,7 @@ define void @uniform_vector_gep_stored(ptr %a, ptr %b, i64 %n) { ; 
CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: ; CHECK-NEXT: [[I:%.*]] = phi i64 [ [[I_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] -; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 1 +; CHECK-NEXT: [[VAR0:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 4 ; CHECK-NEXT: [[VAR1:%.*]] = getelementptr inbounds ptr, ptr [[A]], i64 [[I]] ; CHECK-NEXT: store ptr [[VAR0]], ptr [[VAR1]], align 8 ; CHECK-NEXT: [[I_NEXT]] = add nuw nsw i64 [[I]], 1 diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll index 2b5ee9a76047f..2af9909ee396b 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-dominance.ll @@ -27,45 +27,45 @@ define void @multiply_can_hoist_cast(ptr noalias %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x double>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <1 x double>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD1]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <1 x double>, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 ; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x double>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD2]], <1 x double> [[COL_LOAD3]], <1 x double> [[TMP4]]) ; CHECK-NEXT: store <1 x double> [[TMP7]], ptr [[C]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[A]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <1 x double>, ptr [[TMP8]], align 8 ; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x double>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[COL_LOAD8]], [[COL_LOAD9]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[A]], i64 3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 24 ; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <1 x double>, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 ; CHECK-NEXT: [[COL_LOAD14:%.*]] = load <1 x double>, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD13]], <1 x double> [[COL_LOAD14]], <1 x double> [[TMP9]]) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[C]], i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[C]], i64 8 ; CHECK-NEXT: store <1 x double> [[TMP12]], ptr [[TMP13]], align 8 ; CHECK-NEXT: [[COL_LOAD19:%.*]] = load <1 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 ; CHECK-NEXT: [[COL_LOAD20:%.*]] = load <1 x double>, ptr [[TMP14]], align 8 ; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[COL_LOAD19]], [[COL_LOAD20]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD24:%.*]] = load <1 x double>, 
ptr [[TMP16]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP3]], i64 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24 ; CHECK-NEXT: [[COL_LOAD25:%.*]] = load <1 x double>, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[TMP18:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD24]], <1 x double> [[COL_LOAD25]], <1 x double> [[TMP15]]) -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr double, ptr [[C]], i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[C]], i64 16 ; CHECK-NEXT: store <1 x double> [[TMP18]], ptr [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[A]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[COL_LOAD30:%.*]] = load <1 x double>, ptr [[TMP20]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 ; CHECK-NEXT: [[COL_LOAD31:%.*]] = load <1 x double>, ptr [[TMP21]], align 8 ; CHECK-NEXT: [[TMP22:%.*]] = fmul contract <1 x double> [[COL_LOAD30]], [[COL_LOAD31]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 3 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[A]], i64 24 ; CHECK-NEXT: [[COL_LOAD35:%.*]] = load <1 x double>, ptr [[TMP23]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP3]], i64 3 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24 ; CHECK-NEXT: [[COL_LOAD36:%.*]] = load <1 x double>, ptr [[TMP24]], align 8 ; CHECK-NEXT: [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[COL_LOAD36]], <1 x double> [[TMP22]]) -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[C]], i64 3 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 24 ; CHECK-NEXT: store <1 x double> [[TMP25]], ptr [[TMP26]], align 8 ; CHECK-NEXT: ret void ; @@ -80,7 +80,7 @@ entry: define void @multiply_can_hoist_multiple_insts(ptr noalias %A, ptr %B, ptr %C) { ; CHECK-LABEL: @multiply_can_hoist_multiple_insts( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP:%.*]] = getelementptr [4 x double], ptr [[C:%.*]], i64 2 +; CHECK-NEXT: [[GEP:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 64 ; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[GEP]] to i64 ; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 32 ; CHECK-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[B:%.*]] to i64 @@ -99,45 +99,45 @@ define void @multiply_can_hoist_multiple_insts(ptr noalias %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x double>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <1 x double>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD1]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <1 x double>, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 ; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x double>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD2]], <1 x double> [[COL_LOAD3]], <1 x double> [[TMP4]]) ; CHECK-NEXT: store <1 x double> [[TMP7]], ptr [[GEP]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[A]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = 
getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <1 x double>, ptr [[TMP8]], align 8 ; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x double>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[COL_LOAD8]], [[COL_LOAD9]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[A]], i64 3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 24 ; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <1 x double>, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 ; CHECK-NEXT: [[COL_LOAD14:%.*]] = load <1 x double>, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD13]], <1 x double> [[COL_LOAD14]], <1 x double> [[TMP9]]) -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr [4 x double], ptr [[C]], i64 2, i64 1 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[C]], i64 72 ; CHECK-NEXT: store <1 x double> [[TMP12]], ptr [[TMP13]], align 8 ; CHECK-NEXT: [[COL_LOAD19:%.*]] = load <1 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 ; CHECK-NEXT: [[COL_LOAD20:%.*]] = load <1 x double>, ptr [[TMP14]], align 8 ; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[COL_LOAD19]], [[COL_LOAD20]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD24:%.*]] = load <1 x double>, ptr [[TMP16]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP3]], i64 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24 ; CHECK-NEXT: [[COL_LOAD25:%.*]] = load <1 x double>, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[TMP18:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD24]], <1 x double> [[COL_LOAD25]], <1 x double> [[TMP15]]) -; CHECK-NEXT: [[TMP19:%.*]] = getelementptr [4 x double], ptr [[C]], i64 2, i64 2 +; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[C]], i64 80 ; CHECK-NEXT: store <1 x double> [[TMP18]], ptr [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[A]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[COL_LOAD30:%.*]] = load <1 x double>, ptr [[TMP20]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 ; CHECK-NEXT: [[COL_LOAD31:%.*]] = load <1 x double>, ptr [[TMP21]], align 8 ; CHECK-NEXT: [[TMP22:%.*]] = fmul contract <1 x double> [[COL_LOAD30]], [[COL_LOAD31]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 3 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[A]], i64 24 ; CHECK-NEXT: [[COL_LOAD35:%.*]] = load <1 x double>, ptr [[TMP23]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP3]], i64 3 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24 ; CHECK-NEXT: [[COL_LOAD36:%.*]] = load <1 x double>, ptr [[TMP24]], align 8 ; CHECK-NEXT: [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[COL_LOAD36]], <1 x double> [[TMP22]]) -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr [4 x double], ptr [[C]], i64 2, i64 3 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 88 ; CHECK-NEXT: store 
<1 x double> [[TMP25]], ptr [[TMP26]], align 8 ; CHECK-NEXT: ret void ; @@ -154,7 +154,7 @@ entry: define void @multiply_can_hoist_multiple_insts2(ptr noalias %A, ptr %B, ptr %C) { ; CHECK-LABEL: @multiply_can_hoist_multiple_insts2( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr <4 x double>, ptr [[C:%.*]], i64 42 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 1344 ; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[GEP_1]] to i64 ; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 32 ; CHECK-NEXT: [[LOAD_BEGIN:%.*]] = ptrtoint ptr [[B:%.*]] to i64 @@ -173,42 +173,42 @@ define void @multiply_can_hoist_multiple_insts2(ptr noalias %A, ptr %B, ptr %C) ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <1 x double>, ptr [[A:%.*]], align 8 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <1 x double>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP4:%.*]] = fmul contract <1 x double> [[COL_LOAD]], [[COL_LOAD1]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <1 x double>, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 ; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <1 x double>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[TMP7:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD2]], <1 x double> [[COL_LOAD3]], <1 x double> [[TMP4]]) ; CHECK-NEXT: store <1 x double> [[TMP7]], ptr [[GEP_1]], align 8 -; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[A]], i64 1 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <1 x double>, ptr [[TMP8]], align 8 ; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <1 x double>, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[TMP9:%.*]] = fmul contract <1 x double> [[COL_LOAD8]], [[COL_LOAD9]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[A]], i64 3 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 24 ; CHECK-NEXT: [[COL_LOAD13:%.*]] = load <1 x double>, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[TMP3]], i64 1 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP3]], i64 8 ; CHECK-NEXT: [[COL_LOAD14:%.*]] = load <1 x double>, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD13]], <1 x double> [[COL_LOAD14]], <1 x double> [[TMP9]]) ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[C]], i64 1352 ; CHECK-NEXT: store <1 x double> [[TMP12]], ptr [[TMP13]], align 8 ; CHECK-NEXT: [[COL_LOAD19:%.*]] = load <1 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[TMP14:%.*]] = getelementptr double, ptr [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP14:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 ; CHECK-NEXT: [[COL_LOAD20:%.*]] = load <1 x double>, ptr [[TMP14]], align 8 ; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[COL_LOAD19]], [[COL_LOAD20]] -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD24:%.*]] = load <1 x double>, ptr [[TMP16]], align 8 -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[TMP3]], i64 3 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24 ; CHECK-NEXT: [[COL_LOAD25:%.*]] = load <1 x double>, ptr [[TMP17]], align 8 ; CHECK-NEXT: [[TMP18:%.*]] = call contract <1 x 
double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD24]], <1 x double> [[COL_LOAD25]], <1 x double> [[TMP15]]) ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr i8, ptr [[C]], i64 1360 ; CHECK-NEXT: store <1 x double> [[TMP18]], ptr [[TMP19]], align 8 -; CHECK-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[A]], i64 1 +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[COL_LOAD30:%.*]] = load <1 x double>, ptr [[TMP20]], align 8 -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 ; CHECK-NEXT: [[COL_LOAD31:%.*]] = load <1 x double>, ptr [[TMP21]], align 8 ; CHECK-NEXT: [[TMP22:%.*]] = fmul contract <1 x double> [[COL_LOAD30]], [[COL_LOAD31]] -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[A]], i64 3 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[A]], i64 24 ; CHECK-NEXT: [[COL_LOAD35:%.*]] = load <1 x double>, ptr [[TMP23]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP3]], i64 3 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24 ; CHECK-NEXT: [[COL_LOAD36:%.*]] = load <1 x double>, ptr [[TMP24]], align 8 ; CHECK-NEXT: [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD35]], <1 x double> [[COL_LOAD36]], <1 x double> [[TMP22]]) ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 1368 @@ -232,9 +232,9 @@ define void @multiply_dont_hoist_phi(ptr noalias %A, ptr %B, ptr %C) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br label [[NEXT:%.*]] ; CHECK: next: -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[A:%.*]], i64 2 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 16 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[B:%.*]], i64 2 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 16 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD4]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A]], align 8 @@ -246,7 +246,7 @@ define void @multiply_dont_hoist_phi(ptr noalias %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] ; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT7]], <2 x double> [[TMP2]]) -; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr <4 x double>, ptr [[C:%.*]], i64 26 +; CHECK-NEXT: [[GEP_1:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 832 ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[GEP_1]], align 8 ; CHECK-NEXT: [[VEC_GEP14:%.*]] = getelementptr i8, ptr [[C]], i64 848 ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[VEC_GEP14]], align 8 @@ -273,10 +273,10 @@ define void @multiply_dont_hoist_cast_due_to_operand(ptr noalias %A, ptr %B, ptr ; CHECK-LABEL: @multiply_dont_hoist_cast_due_to_operand( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 ; CHECK-NEXT: 
[[COL_LOAD2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[B]], i64 2 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 16 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] @@ -288,7 +288,7 @@ define void @multiply_dont_hoist_cast_due_to_operand(ptr noalias %A, ptr %B, ptr ; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP2]]) ; CHECK-NEXT: [[C:%.*]] = load ptr, ptr [[C_PTR:%.*]], align 8 ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[C]], align 8 -; CHECK-NEXT: [[VEC_GEP14:%.*]] = getelementptr double, ptr [[C]], i64 2 +; CHECK-NEXT: [[VEC_GEP14:%.*]] = getelementptr i8, ptr [[C]], i64 16 ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[VEC_GEP14]], align 8 ; CHECK-NEXT: ret void ; @@ -306,10 +306,10 @@ define void @multiply_dont_hoist_load(ptr noalias %A, ptr %B, ptr %C.ptr) { ; CHECK-LABEL: @multiply_dont_hoist_load( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[B]], i64 2 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 16 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] @@ -321,7 +321,7 @@ define void @multiply_dont_hoist_load(ptr noalias %A, ptr %B, ptr %C.ptr) { ; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP2]]) ; CHECK-NEXT: [[C:%.*]] = load ptr, ptr [[C_PTR:%.*]], align 8 ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[C]], align 8 -; CHECK-NEXT: [[VEC_GEP14:%.*]] = getelementptr double, ptr [[C]], i64 2 +; CHECK-NEXT: [[VEC_GEP14:%.*]] = getelementptr i8, ptr [[C]], i64 16 ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[VEC_GEP14]], align 8 ; CHECK-NEXT: ret void ; @@ -339,10 +339,10 @@ define void @multiply_dont_hoist_call(ptr noalias %A, ptr %B) { ; CHECK-LABEL: @multiply_dont_hoist_call( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[B]], i64 2 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i8, ptr [[B]], i64 16 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> 
zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] @@ -354,7 +354,7 @@ define void @multiply_dont_hoist_call(ptr noalias %A, ptr %B) { ; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP2]]) ; CHECK-NEXT: [[C:%.*]] = call ptr @get_address() ; CHECK-NEXT: store <2 x double> [[TMP1]], ptr [[C]], align 8 -; CHECK-NEXT: [[VEC_GEP14:%.*]] = getelementptr double, ptr [[C]], i64 2 +; CHECK-NEXT: [[VEC_GEP14:%.*]] = getelementptr i8, ptr [[C]], i64 16 ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[VEC_GEP14]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll index e0dbb21cac08b..1cc886e68d8e6 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-loops.ll @@ -30,13 +30,13 @@ define void @multiply_noalias_4x4(ptr noalias %A, ptr noalias %B, ptr noalias %C ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr double, ptr [[TMP1]], i64 [[ROWS_IV]] ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP2]], i64 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 32 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[COLS_IV]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[B:%.*]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[TMP4]], i64 [[INNER_IV]] ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x double>, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr double, ptr [[TMP5]], i64 4 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i8, ptr [[TMP5]], i64 32 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x double>, ptr [[VEC_GEP3]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD2]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD]], <2 x double> [[SPLAT_SPLAT]], <2 x double> [[RESULT_VEC_0]]) @@ -58,7 +58,7 @@ define void @multiply_noalias_4x4(ptr noalias %A, ptr noalias %B, ptr noalias %C ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr double, ptr [[C:%.*]], i64 [[TMP10]] ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP11]], i64 [[ROWS_IV]] ; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[TMP12]], align 8 -; CHECK-NEXT: [[VEC_GEP16:%.*]] = getelementptr double, ptr [[TMP12]], i64 4 +; CHECK-NEXT: [[VEC_GEP16:%.*]] = getelementptr i8, ptr [[TMP12]], i64 32 ; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[VEC_GEP16]], align 8 ; CHECK-NEXT: br i1 [[ROWS_COND_NOT]], label [[COLS_LATCH]], label [[ROWS_HEADER]] ; CHECK: cols.latch: @@ -106,13 +106,13 @@ define void @multiply_noalias_2x4(ptr noalias %A, ptr noalias %B, ptr noalias %C ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i64 [[ROWS_IV]] ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i64, ptr [[TMP2]], i64 2 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 16 ; CHECK-NEXT: 
[[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[COLS_IV]], 2 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i64 [[INNER_IV]] ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i64, ptr [[TMP5]], i64 4 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i8, ptr [[TMP5]], i64 32 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i64>, ptr [[VEC_GEP3]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x i64> [[COL_LOAD2]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[COL_LOAD]], [[SPLAT_SPLAT]] @@ -138,7 +138,7 @@ define void @multiply_noalias_2x4(ptr noalias %A, ptr noalias %B, ptr noalias %C ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[C:%.*]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP15]], i64 [[ROWS_IV]] ; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP16]], align 8 -; CHECK-NEXT: [[VEC_GEP16:%.*]] = getelementptr i64, ptr [[TMP16]], i64 2 +; CHECK-NEXT: [[VEC_GEP16:%.*]] = getelementptr i8, ptr [[TMP16]], i64 16 ; CHECK-NEXT: store <2 x i64> [[TMP13]], ptr [[VEC_GEP16]], align 8 ; CHECK-NEXT: br i1 [[ROWS_COND_NOT]], label [[COLS_LATCH]], label [[ROWS_HEADER]] ; CHECK: cols.latch: @@ -192,13 +192,13 @@ define void @multiply_noalias_4x2_2x8(ptr noalias %A, ptr noalias %B, ptr noalia ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i64, ptr [[A:%.*]], i64 [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i64, ptr [[TMP1]], i64 [[ROWS_IV]] ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x i64>, ptr [[TMP2]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i64, ptr [[TMP2]], i64 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP2]], i64 32 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x i64>, ptr [[VEC_GEP]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = shl i64 [[COLS_IV]], 1 ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, ptr [[B:%.*]], i64 [[TMP3]] ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i64, ptr [[TMP4]], i64 [[INNER_IV]] ; CHECK-NEXT: [[COL_LOAD2:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i64, ptr [[TMP5]], i64 2 +; CHECK-NEXT: [[VEC_GEP3:%.*]] = getelementptr i8, ptr [[TMP5]], i64 16 ; CHECK-NEXT: [[COL_LOAD4:%.*]] = load <2 x i64>, ptr [[VEC_GEP3]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x i64> [[COL_LOAD2]], <2 x i64> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = mul <2 x i64> [[COL_LOAD]], [[SPLAT_SPLAT]] @@ -224,7 +224,7 @@ define void @multiply_noalias_4x2_2x8(ptr noalias %A, ptr noalias %B, ptr noalia ; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i64, ptr [[C:%.*]], i64 [[TMP14]] ; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, ptr [[TMP15]], i64 [[ROWS_IV]] ; CHECK-NEXT: store <2 x i64> [[TMP9]], ptr [[TMP16]], align 8 -; CHECK-NEXT: [[VEC_GEP16:%.*]] = getelementptr i64, ptr [[TMP16]], i64 4 +; CHECK-NEXT: [[VEC_GEP16:%.*]] = getelementptr i8, ptr [[TMP16]], i64 32 ; CHECK-NEXT: store <2 x i64> [[TMP13]], ptr [[VEC_GEP16]], align 8 ; CHECK-NEXT: br i1 [[ROWS_COND_NOT]], label [[COLS_LATCH]], label [[ROWS_HEADER]] ; CHECK: cols.latch: @@ -309,13 +309,13 @@ define void @multiply_alias_2x2(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[TMP9:%.*]] = add i64 [[TMP8]], [[ROWS_IV]] ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr float, ptr [[TMP3]], i64 [[TMP9]] ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x float>, ptr 
[[TMP10]], align 4 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr float, ptr [[TMP10]], i64 2 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP10]], i64 8 ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <2 x float>, ptr [[VEC_GEP]], align 4 ; CHECK-NEXT: [[TMP11:%.*]] = shl i64 [[COLS_IV]], 1 ; CHECK-NEXT: [[TMP12:%.*]] = add i64 [[TMP11]], [[INNER_IV]] ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr float, ptr [[TMP7]], i64 [[TMP12]] ; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <2 x float>, ptr [[TMP13]], align 4 -; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr float, ptr [[TMP13]], i64 2 +; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP13]], i64 8 ; CHECK-NEXT: [[COL_LOAD11:%.*]] = load <2 x float>, ptr [[VEC_GEP10]], align 4 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x float> [[COL_LOAD9]], <2 x float> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = call contract <2 x float> @llvm.fmuladd.v2f32(<2 x float> [[COL_LOAD]], <2 x float> [[SPLAT_SPLAT]], <2 x float> [[RESULT_VEC_0]]) @@ -337,7 +337,7 @@ define void @multiply_alias_2x2(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr float, ptr [[C]], i64 [[TMP18]] ; CHECK-NEXT: [[TMP20:%.*]] = getelementptr float, ptr [[TMP19]], i64 [[ROWS_IV]] ; CHECK-NEXT: store <2 x float> [[TMP15]], ptr [[TMP20]], align 8 -; CHECK-NEXT: [[VEC_GEP23:%.*]] = getelementptr float, ptr [[TMP20]], i64 2 +; CHECK-NEXT: [[VEC_GEP23:%.*]] = getelementptr i8, ptr [[TMP20]], i64 8 ; CHECK-NEXT: store <2 x float> [[TMP17]], ptr [[VEC_GEP23]], align 8 ; CHECK-NEXT: br i1 [[ROWS_COND_NOT]], label [[COLS_LATCH]], label [[ROWS_HEADER]] ; CHECK: cols.latch: diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll index f7ac2d321439a..aab3907bd622b 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused-multiple-blocks.ll @@ -10,12 +10,12 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) { ; CHECK-LABEL: @test( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COL_LOAD133:%.*]] = load <3 x double>, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP134:%.*]] = getelementptr double, ptr [[A]], i64 3 +; CHECK-NEXT: [[VEC_GEP134:%.*]] = getelementptr i8, ptr [[A]], i64 24 ; CHECK-NEXT: [[COL_LOAD135:%.*]] = load <3 x double>, ptr [[VEC_GEP134]], align 8 ; CHECK-NEXT: [[COL_LOAD136:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP137:%.*]] = getelementptr double, ptr [[B]], i64 2 +; CHECK-NEXT: [[VEC_GEP137:%.*]] = getelementptr i8, ptr [[B]], i64 16 ; CHECK-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[VEC_GEP137]], align 8 -; CHECK-NEXT: [[VEC_GEP139:%.*]] = getelementptr double, ptr [[B]], i64 4 +; CHECK-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[B]], i64 32 ; CHECK-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8 ; CHECK-NEXT: [[STORE_BEGIN:%.*]] = ptrtoint ptr [[C:%.*]] to i64 ; CHECK-NEXT: [[STORE_END:%.*]] = add nuw nsw i64 [[STORE_BEGIN]], 72 @@ -48,10 +48,10 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) { ; CHECK: no_alias3: ; CHECK-NEXT: [[TMP7:%.*]] = phi ptr [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ] ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP3]], i64 3 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], 
i64 24 ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 ; CHECK-NEXT: [[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr double, ptr [[TMP7]], i64 2 +; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16 ; CHECK-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] @@ -62,14 +62,14 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) { ; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]]) ; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[C]], align 8 -; CHECK-NEXT: [[VEC_GEP21:%.*]] = getelementptr double, ptr [[C]], i64 3 +; CHECK-NEXT: [[VEC_GEP21:%.*]] = getelementptr i8, ptr [[C]], i64 24 ; CHECK-NEXT: store <2 x double> [[TMP11]], ptr [[VEC_GEP21]], align 8 -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 ; CHECK-NEXT: [[COL_LOAD22:%.*]] = load <1 x double>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[VEC_GEP23:%.*]] = getelementptr double, ptr [[TMP3]], i64 5 +; CHECK-NEXT: [[VEC_GEP23:%.*]] = getelementptr i8, ptr [[TMP3]], i64 40 ; CHECK-NEXT: [[COL_LOAD24:%.*]] = load <1 x double>, ptr [[VEC_GEP23]], align 8 ; CHECK-NEXT: [[COL_LOAD25:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[VEC_GEP26:%.*]] = getelementptr double, ptr [[TMP7]], i64 2 +; CHECK-NEXT: [[VEC_GEP26:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16 ; CHECK-NEXT: [[COL_LOAD27:%.*]] = load <2 x double>, ptr [[VEC_GEP26]], align 8 ; CHECK-NEXT: [[SPLAT_SPLATINSERT29:%.*]] = shufflevector <2 x double> [[COL_LOAD25]], <2 x double> poison, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP13:%.*]] = fmul contract <1 x double> [[COL_LOAD22]], [[SPLAT_SPLATINSERT29]] @@ -79,39 +79,39 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) { ; CHECK-NEXT: [[TMP15:%.*]] = fmul contract <1 x double> [[COL_LOAD22]], [[SPLAT_SPLATINSERT35]] ; CHECK-NEXT: [[SPLAT_SPLATINSERT38:%.*]] = shufflevector <2 x double> [[COL_LOAD27]], <2 x double> poison, <1 x i32> <i32 1> ; CHECK-NEXT: [[TMP16:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD24]], <1 x double> [[SPLAT_SPLATINSERT38]], <1 x double> [[TMP15]]) -; CHECK-NEXT: [[TMP17:%.*]] = getelementptr double, ptr [[C]], i64 2 +; CHECK-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[C]], i64 16 ; CHECK-NEXT: store <1 x double> [[TMP14]], ptr [[TMP17]], align 8 -; CHECK-NEXT: [[VEC_GEP40:%.*]] = getelementptr double, ptr [[C]], i64 5 +; CHECK-NEXT: [[VEC_GEP40:%.*]] = getelementptr i8, ptr [[C]], i64 40 ; CHECK-NEXT: store <1 x double> [[TMP16]], ptr [[VEC_GEP40]], align 8 ; CHECK-NEXT: [[COL_LOAD41:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[VEC_GEP42:%.*]] = getelementptr double, ptr [[TMP3]], i64 3 +; CHECK-NEXT: [[VEC_GEP42:%.*]] = getelementptr i8, ptr [[TMP3]], i64 24 ; CHECK-NEXT: [[COL_LOAD43:%.*]] = load <2 x double>, ptr [[VEC_GEP42]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP7]], i64 4 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32 ; 
CHECK-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[TMP18]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT47:%.*]] = shufflevector <2 x double> [[COL_LOAD44]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD41]], [[SPLAT_SPLAT47]] ; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = shufflevector <2 x double> [[COL_LOAD44]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD43]], <2 x double> [[SPLAT_SPLAT50]], <2 x double> [[TMP19]]) -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[C]], i64 6 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[C]], i64 48 ; CHECK-NEXT: store <2 x double> [[TMP20]], ptr [[TMP21]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 ; CHECK-NEXT: [[COL_LOAD51:%.*]] = load <1 x double>, ptr [[TMP22]], align 8 -; CHECK-NEXT: [[VEC_GEP52:%.*]] = getelementptr double, ptr [[TMP3]], i64 5 +; CHECK-NEXT: [[VEC_GEP52:%.*]] = getelementptr i8, ptr [[TMP3]], i64 40 ; CHECK-NEXT: [[COL_LOAD53:%.*]] = load <1 x double>, ptr [[VEC_GEP52]], align 8 -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP7]], i64 4 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32 ; CHECK-NEXT: [[COL_LOAD54:%.*]] = load <2 x double>, ptr [[TMP23]], align 8 ; CHECK-NEXT: [[SPLAT_SPLATINSERT56:%.*]] = shufflevector <2 x double> [[COL_LOAD54]], <2 x double> poison, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP24:%.*]] = fmul contract <1 x double> [[COL_LOAD51]], [[SPLAT_SPLATINSERT56]] ; CHECK-NEXT: [[SPLAT_SPLATINSERT59:%.*]] = shufflevector <2 x double> [[COL_LOAD54]], <2 x double> poison, <1 x i32> <i32 1> ; CHECK-NEXT: [[TMP25:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD53]], <1 x double> [[SPLAT_SPLATINSERT59]], <1 x double> [[TMP24]]) -; CHECK-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[C]], i64 8 +; CHECK-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[C]], i64 64 ; CHECK-NEXT: store <1 x double> [[TMP25]], ptr [[TMP26]], align 8 ; CHECK-NEXT: br i1 [[COND:%.*]], label [[TRUE:%.*]], label [[FALSE:%.*]] ; CHECK: true: ; CHECK-NEXT: [[TMP27:%.*]] = fadd contract <3 x double> [[COL_LOAD133]], [[COL_LOAD133]] ; CHECK-NEXT: [[TMP28:%.*]] = fadd contract <3 x double> [[COL_LOAD135]], [[COL_LOAD135]] ; CHECK-NEXT: store <3 x double> [[TMP27]], ptr [[A]], align 8 -; CHECK-NEXT: [[VEC_GEP143:%.*]] = getelementptr double, ptr [[A]], i64 3 +; CHECK-NEXT: [[VEC_GEP143:%.*]] = getelementptr i8, ptr [[A]], i64 24 ; CHECK-NEXT: store <3 x double> [[TMP28]], ptr [[VEC_GEP143]], align 8 ; CHECK-NEXT: br label [[END:%.*]] ; CHECK: false: @@ -119,9 +119,9 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) { ; CHECK-NEXT: [[TMP30:%.*]] = fadd contract <2 x double> [[COL_LOAD138]], [[COL_LOAD138]] ; CHECK-NEXT: [[TMP31:%.*]] = fadd contract <2 x double> [[COL_LOAD140]], [[COL_LOAD140]] ; CHECK-NEXT: store <2 x double> [[TMP29]], ptr [[B]], align 8 -; CHECK-NEXT: [[VEC_GEP141:%.*]] = getelementptr double, ptr [[B]], i64 2 +; CHECK-NEXT: [[VEC_GEP141:%.*]] = getelementptr i8, ptr [[B]], i64 16 ; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[VEC_GEP141]], align 8 -; CHECK-NEXT: [[VEC_GEP142:%.*]] = getelementptr double, ptr [[B]], i64 4 +; CHECK-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[B]], i64 32 ; CHECK-NEXT: store <2 x double> [[TMP31]], ptr [[VEC_GEP142]], align 8 ; CHECK-NEXT: br label [[END]] ; 
CHECK: end: @@ -156,10 +156,10 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) { ; CHECK: no_alias70: ; CHECK-NEXT: [[TMP39:%.*]] = phi ptr [ [[B]], [[NO_ALIAS63]] ], [ [[B]], [[ALIAS_CONT68]] ], [ [[TMP38]], [[COPY69]] ] ; CHECK-NEXT: [[COL_LOAD75:%.*]] = load <2 x double>, ptr [[TMP35]], align 8 -; CHECK-NEXT: [[VEC_GEP76:%.*]] = getelementptr double, ptr [[TMP35]], i64 3 +; CHECK-NEXT: [[VEC_GEP76:%.*]] = getelementptr i8, ptr [[TMP35]], i64 24 ; CHECK-NEXT: [[COL_LOAD77:%.*]] = load <2 x double>, ptr [[VEC_GEP76]], align 8 ; CHECK-NEXT: [[COL_LOAD78:%.*]] = load <2 x double>, ptr [[TMP39]], align 8 -; CHECK-NEXT: [[VEC_GEP79:%.*]] = getelementptr double, ptr [[TMP39]], i64 2 +; CHECK-NEXT: [[VEC_GEP79:%.*]] = getelementptr i8, ptr [[TMP39]], i64 16 ; CHECK-NEXT: [[COL_LOAD80:%.*]] = load <2 x double>, ptr [[VEC_GEP79]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT83:%.*]] = shufflevector <2 x double> [[COL_LOAD78]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP40:%.*]] = fmul contract <2 x double> [[COL_LOAD75]], [[SPLAT_SPLAT83]] @@ -170,14 +170,14 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) { ; CHECK-NEXT: [[SPLAT_SPLAT92:%.*]] = shufflevector <2 x double> [[COL_LOAD80]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP43:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD77]], <2 x double> [[SPLAT_SPLAT92]], <2 x double> [[TMP42]]) ; CHECK-NEXT: store <2 x double> [[TMP41]], ptr [[C]], align 8 -; CHECK-NEXT: [[VEC_GEP93:%.*]] = getelementptr double, ptr [[C]], i64 3 +; CHECK-NEXT: [[VEC_GEP93:%.*]] = getelementptr i8, ptr [[C]], i64 24 ; CHECK-NEXT: store <2 x double> [[TMP43]], ptr [[VEC_GEP93]], align 8 -; CHECK-NEXT: [[TMP44:%.*]] = getelementptr double, ptr [[TMP35]], i64 2 +; CHECK-NEXT: [[TMP44:%.*]] = getelementptr i8, ptr [[TMP35]], i64 16 ; CHECK-NEXT: [[COL_LOAD94:%.*]] = load <1 x double>, ptr [[TMP44]], align 8 -; CHECK-NEXT: [[VEC_GEP95:%.*]] = getelementptr double, ptr [[TMP35]], i64 5 +; CHECK-NEXT: [[VEC_GEP95:%.*]] = getelementptr i8, ptr [[TMP35]], i64 40 ; CHECK-NEXT: [[COL_LOAD96:%.*]] = load <1 x double>, ptr [[VEC_GEP95]], align 8 ; CHECK-NEXT: [[COL_LOAD97:%.*]] = load <2 x double>, ptr [[TMP39]], align 8 -; CHECK-NEXT: [[VEC_GEP98:%.*]] = getelementptr double, ptr [[TMP39]], i64 2 +; CHECK-NEXT: [[VEC_GEP98:%.*]] = getelementptr i8, ptr [[TMP39]], i64 16 ; CHECK-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[VEC_GEP98]], align 8 ; CHECK-NEXT: [[SPLAT_SPLATINSERT101:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP45:%.*]] = fmul contract <1 x double> [[COL_LOAD94]], [[SPLAT_SPLATINSERT101]] @@ -187,32 +187,32 @@ define void @test(ptr %A, ptr %B, ptr %C, i1 %cond) { ; CHECK-NEXT: [[TMP47:%.*]] = fmul contract <1 x double> [[COL_LOAD94]], [[SPLAT_SPLATINSERT107]] ; CHECK-NEXT: [[SPLAT_SPLATINSERT110:%.*]] = shufflevector <2 x double> [[COL_LOAD99]], <2 x double> poison, <1 x i32> <i32 1> ; CHECK-NEXT: [[TMP48:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD96]], <1 x double> [[SPLAT_SPLATINSERT110]], <1 x double> [[TMP47]]) -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[C]], i64 2 +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[C]], i64 16 ; CHECK-NEXT: store <1 x double> [[TMP46]], ptr [[TMP49]], align 8 -; CHECK-NEXT: [[VEC_GEP112:%.*]] = getelementptr double, ptr [[C]], i64 5 +; CHECK-NEXT: [[VEC_GEP112:%.*]] = getelementptr i8, ptr [[C]], i64 40 ; CHECK-NEXT: store <1 x double> 
[[TMP48]], ptr [[VEC_GEP112]], align 8 ; CHECK-NEXT: [[COL_LOAD113:%.*]] = load <2 x double>, ptr [[TMP35]], align 8 -; CHECK-NEXT: [[VEC_GEP114:%.*]] = getelementptr double, ptr [[TMP35]], i64 3 +; CHECK-NEXT: [[VEC_GEP114:%.*]] = getelementptr i8, ptr [[TMP35]], i64 24 ; CHECK-NEXT: [[COL_LOAD115:%.*]] = load <2 x double>, ptr [[VEC_GEP114]], align 8 -; CHECK-NEXT: [[TMP50:%.*]] = getelementptr double, ptr [[TMP39]], i64 4 +; CHECK-NEXT: [[TMP50:%.*]] = getelementptr i8, ptr [[TMP39]], i64 32 ; CHECK-NEXT: [[COL_LOAD116:%.*]] = load <2 x double>, ptr [[TMP50]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT119:%.*]] = shufflevector <2 x double> [[COL_LOAD116]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP51:%.*]] = fmul contract <2 x double> [[COL_LOAD113]], [[SPLAT_SPLAT119]] ; CHECK-NEXT: [[SPLAT_SPLAT122:%.*]] = shufflevector <2 x double> [[COL_LOAD116]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT122]], <2 x double> [[TMP51]]) -; CHECK-NEXT: [[TMP53:%.*]] = getelementptr double, ptr [[C]], i64 6 +; CHECK-NEXT: [[TMP53:%.*]] = getelementptr i8, ptr [[C]], i64 48 ; CHECK-NEXT: store <2 x double> [[TMP52]], ptr [[TMP53]], align 8 -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr double, ptr [[TMP35]], i64 2 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[TMP35]], i64 16 ; CHECK-NEXT: [[COL_LOAD123:%.*]] = load <1 x double>, ptr [[TMP54]], align 8 -; CHECK-NEXT: [[VEC_GEP124:%.*]] = getelementptr double, ptr [[TMP35]], i64 5 +; CHECK-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, ptr [[TMP35]], i64 40 ; CHECK-NEXT: [[COL_LOAD125:%.*]] = load <1 x double>, ptr [[VEC_GEP124]], align 8 -; CHECK-NEXT: [[TMP55:%.*]] = getelementptr double, ptr [[TMP39]], i64 4 +; CHECK-NEXT: [[TMP55:%.*]] = getelementptr i8, ptr [[TMP39]], i64 32 ; CHECK-NEXT: [[COL_LOAD126:%.*]] = load <2 x double>, ptr [[TMP55]], align 8 ; CHECK-NEXT: [[SPLAT_SPLATINSERT128:%.*]] = shufflevector <2 x double> [[COL_LOAD126]], <2 x double> poison, <1 x i32> zeroinitializer ; CHECK-NEXT: [[TMP56:%.*]] = fmul contract <1 x double> [[COL_LOAD123]], [[SPLAT_SPLATINSERT128]] ; CHECK-NEXT: [[SPLAT_SPLATINSERT131:%.*]] = shufflevector <2 x double> [[COL_LOAD126]], <2 x double> poison, <1 x i32> <i32 1> ; CHECK-NEXT: [[TMP57:%.*]] = call contract <1 x double> @llvm.fmuladd.v1f64(<1 x double> [[COL_LOAD125]], <1 x double> [[SPLAT_SPLATINSERT131]], <1 x double> [[TMP56]]) -; CHECK-NEXT: [[TMP58:%.*]] = getelementptr double, ptr [[C]], i64 8 +; CHECK-NEXT: [[TMP58:%.*]] = getelementptr i8, ptr [[C]], i64 64 ; CHECK-NEXT: store <1 x double> [[TMP57]], ptr [[TMP58]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll index 81d1fc5861511..155f7755c2095 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-fused.ll @@ -42,10 +42,10 @@ define void @multiply(ptr %A, ptr %B, ptr %C) { ; CHECK: no_alias3: ; CHECK-NEXT: [[TMP7:%.*]] = phi ptr [ [[B]], [[NO_ALIAS]] ], [ [[B]], [[ALIAS_CONT1]] ], [ [[TMP6]], [[COPY2]] ] ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[TMP3]], i64 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32 ; CHECK-NEXT: [[COL_LOAD8:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 ; CHECK-NEXT: 
[[COL_LOAD9:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr double, ptr [[TMP7]], i64 4 +; CHECK-NEXT: [[VEC_GEP10:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32 ; CHECK-NEXT: [[COL_LOAD11:%.*]] = load <2 x double>, ptr [[VEC_GEP10]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD9]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP8:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] @@ -55,13 +55,13 @@ define void @multiply(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[TMP10:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT17]] ; CHECK-NEXT: [[SPLAT_SPLAT20:%.*]] = shufflevector <2 x double> [[COL_LOAD11]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP11:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD8]], <2 x double> [[SPLAT_SPLAT20]], <2 x double> [[TMP10]]) -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr double, ptr [[TMP3]], i64 8 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64 ; CHECK-NEXT: [[COL_LOAD21:%.*]] = load <2 x double>, ptr [[TMP12]], align 8 -; CHECK-NEXT: [[VEC_GEP22:%.*]] = getelementptr double, ptr [[TMP3]], i64 12 +; CHECK-NEXT: [[VEC_GEP22:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96 ; CHECK-NEXT: [[COL_LOAD23:%.*]] = load <2 x double>, ptr [[VEC_GEP22]], align 8 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr double, ptr [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16 ; CHECK-NEXT: [[COL_LOAD24:%.*]] = load <2 x double>, ptr [[TMP13]], align 8 -; CHECK-NEXT: [[VEC_GEP25:%.*]] = getelementptr double, ptr [[TMP7]], i64 6 +; CHECK-NEXT: [[VEC_GEP25:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48 ; CHECK-NEXT: [[COL_LOAD26:%.*]] = load <2 x double>, ptr [[VEC_GEP25]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT30:%.*]] = shufflevector <2 x double> [[COL_LOAD24]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD21]], <2 x double> [[SPLAT_SPLAT30]], <2 x double> [[TMP9]]) @@ -72,14 +72,14 @@ define void @multiply(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[SPLAT_SPLAT40:%.*]] = shufflevector <2 x double> [[COL_LOAD26]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD23]], <2 x double> [[SPLAT_SPLAT40]], <2 x double> [[TMP16]]) ; CHECK-NEXT: store <2 x double> [[TMP15]], ptr [[C]], align 8 -; CHECK-NEXT: [[VEC_GEP41:%.*]] = getelementptr double, ptr [[C]], i64 4 +; CHECK-NEXT: [[VEC_GEP41:%.*]] = getelementptr i8, ptr [[C]], i64 32 ; CHECK-NEXT: store <2 x double> [[TMP17]], ptr [[VEC_GEP41]], align 8 -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr double, ptr [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 ; CHECK-NEXT: [[COL_LOAD42:%.*]] = load <2 x double>, ptr [[TMP18]], align 8 -; CHECK-NEXT: [[VEC_GEP43:%.*]] = getelementptr double, ptr [[TMP3]], i64 6 +; CHECK-NEXT: [[VEC_GEP43:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48 ; CHECK-NEXT: [[COL_LOAD44:%.*]] = load <2 x double>, ptr [[VEC_GEP43]], align 8 ; CHECK-NEXT: [[COL_LOAD45:%.*]] = load <2 x double>, ptr [[TMP7]], align 8 -; CHECK-NEXT: [[VEC_GEP46:%.*]] = getelementptr double, ptr [[TMP7]], i64 4 +; CHECK-NEXT: [[VEC_GEP46:%.*]] = getelementptr i8, ptr [[TMP7]], i64 32 ; CHECK-NEXT: [[COL_LOAD47:%.*]] = load <2 x double>, ptr [[VEC_GEP46]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT50:%.*]] = 
shufflevector <2 x double> [[COL_LOAD45]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP19:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT50]] @@ -89,13 +89,13 @@ define void @multiply(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[TMP21:%.*]] = fmul contract <2 x double> [[COL_LOAD42]], [[SPLAT_SPLAT56]] ; CHECK-NEXT: [[SPLAT_SPLAT59:%.*]] = shufflevector <2 x double> [[COL_LOAD47]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP22:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD44]], <2 x double> [[SPLAT_SPLAT59]], <2 x double> [[TMP21]]) -; CHECK-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[TMP3]], i64 10 +; CHECK-NEXT: [[TMP23:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80 ; CHECK-NEXT: [[COL_LOAD60:%.*]] = load <2 x double>, ptr [[TMP23]], align 8 -; CHECK-NEXT: [[VEC_GEP61:%.*]] = getelementptr double, ptr [[TMP3]], i64 14 +; CHECK-NEXT: [[VEC_GEP61:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112 ; CHECK-NEXT: [[COL_LOAD62:%.*]] = load <2 x double>, ptr [[VEC_GEP61]], align 8 -; CHECK-NEXT: [[TMP24:%.*]] = getelementptr double, ptr [[TMP7]], i64 2 +; CHECK-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP7]], i64 16 ; CHECK-NEXT: [[COL_LOAD63:%.*]] = load <2 x double>, ptr [[TMP24]], align 8 -; CHECK-NEXT: [[VEC_GEP64:%.*]] = getelementptr double, ptr [[TMP7]], i64 6 +; CHECK-NEXT: [[VEC_GEP64:%.*]] = getelementptr i8, ptr [[TMP7]], i64 48 ; CHECK-NEXT: [[COL_LOAD65:%.*]] = load <2 x double>, ptr [[VEC_GEP64]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT69:%.*]] = shufflevector <2 x double> [[COL_LOAD63]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP25:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP20]]) @@ -105,16 +105,16 @@ define void @multiply(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[TMP27:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD60]], <2 x double> [[SPLAT_SPLAT76]], <2 x double> [[TMP22]]) ; CHECK-NEXT: [[SPLAT_SPLAT79:%.*]] = shufflevector <2 x double> [[COL_LOAD65]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP28:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD62]], <2 x double> [[SPLAT_SPLAT79]], <2 x double> [[TMP27]]) -; CHECK-NEXT: [[TMP29:%.*]] = getelementptr double, ptr [[C]], i64 2 +; CHECK-NEXT: [[TMP29:%.*]] = getelementptr i8, ptr [[C]], i64 16 ; CHECK-NEXT: store <2 x double> [[TMP26]], ptr [[TMP29]], align 8 -; CHECK-NEXT: [[VEC_GEP80:%.*]] = getelementptr double, ptr [[C]], i64 6 +; CHECK-NEXT: [[VEC_GEP80:%.*]] = getelementptr i8, ptr [[C]], i64 48 ; CHECK-NEXT: store <2 x double> [[TMP28]], ptr [[VEC_GEP80]], align 8 ; CHECK-NEXT: [[COL_LOAD81:%.*]] = load <2 x double>, ptr [[TMP3]], align 8 -; CHECK-NEXT: [[VEC_GEP82:%.*]] = getelementptr double, ptr [[TMP3]], i64 4 +; CHECK-NEXT: [[VEC_GEP82:%.*]] = getelementptr i8, ptr [[TMP3]], i64 32 ; CHECK-NEXT: [[COL_LOAD83:%.*]] = load <2 x double>, ptr [[VEC_GEP82]], align 8 -; CHECK-NEXT: [[TMP30:%.*]] = getelementptr double, ptr [[TMP7]], i64 8 +; CHECK-NEXT: [[TMP30:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64 ; CHECK-NEXT: [[COL_LOAD84:%.*]] = load <2 x double>, ptr [[TMP30]], align 8 -; CHECK-NEXT: [[VEC_GEP85:%.*]] = getelementptr double, ptr [[TMP7]], i64 12 +; CHECK-NEXT: [[VEC_GEP85:%.*]] = getelementptr i8, ptr [[TMP7]], i64 96 ; CHECK-NEXT: [[COL_LOAD86:%.*]] = load <2 x double>, ptr [[VEC_GEP85]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT89:%.*]] = shufflevector <2 x 
double> [[COL_LOAD84]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP31:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT89]] @@ -124,13 +124,13 @@ define void @multiply(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[TMP33:%.*]] = fmul contract <2 x double> [[COL_LOAD81]], [[SPLAT_SPLAT95]] ; CHECK-NEXT: [[SPLAT_SPLAT98:%.*]] = shufflevector <2 x double> [[COL_LOAD86]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP34:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD83]], <2 x double> [[SPLAT_SPLAT98]], <2 x double> [[TMP33]]) -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[TMP3]], i64 8 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[TMP3]], i64 64 ; CHECK-NEXT: [[COL_LOAD99:%.*]] = load <2 x double>, ptr [[TMP35]], align 8 -; CHECK-NEXT: [[VEC_GEP100:%.*]] = getelementptr double, ptr [[TMP3]], i64 12 +; CHECK-NEXT: [[VEC_GEP100:%.*]] = getelementptr i8, ptr [[TMP3]], i64 96 ; CHECK-NEXT: [[COL_LOAD101:%.*]] = load <2 x double>, ptr [[VEC_GEP100]], align 8 -; CHECK-NEXT: [[TMP36:%.*]] = getelementptr double, ptr [[TMP7]], i64 10 +; CHECK-NEXT: [[TMP36:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80 ; CHECK-NEXT: [[COL_LOAD102:%.*]] = load <2 x double>, ptr [[TMP36]], align 8 -; CHECK-NEXT: [[VEC_GEP103:%.*]] = getelementptr double, ptr [[TMP7]], i64 14 +; CHECK-NEXT: [[VEC_GEP103:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112 ; CHECK-NEXT: [[COL_LOAD104:%.*]] = load <2 x double>, ptr [[VEC_GEP103]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT108:%.*]] = shufflevector <2 x double> [[COL_LOAD102]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP37:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP32]]) @@ -140,17 +140,17 @@ define void @multiply(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD99]], <2 x double> [[SPLAT_SPLAT115]], <2 x double> [[TMP34]]) ; CHECK-NEXT: [[SPLAT_SPLAT118:%.*]] = shufflevector <2 x double> [[COL_LOAD104]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP40:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD101]], <2 x double> [[SPLAT_SPLAT118]], <2 x double> [[TMP39]]) -; CHECK-NEXT: [[TMP41:%.*]] = getelementptr double, ptr [[C]], i64 8 +; CHECK-NEXT: [[TMP41:%.*]] = getelementptr i8, ptr [[C]], i64 64 ; CHECK-NEXT: store <2 x double> [[TMP38]], ptr [[TMP41]], align 8 -; CHECK-NEXT: [[VEC_GEP119:%.*]] = getelementptr double, ptr [[C]], i64 12 +; CHECK-NEXT: [[VEC_GEP119:%.*]] = getelementptr i8, ptr [[C]], i64 96 ; CHECK-NEXT: store <2 x double> [[TMP40]], ptr [[VEC_GEP119]], align 8 -; CHECK-NEXT: [[TMP42:%.*]] = getelementptr double, ptr [[TMP3]], i64 2 +; CHECK-NEXT: [[TMP42:%.*]] = getelementptr i8, ptr [[TMP3]], i64 16 ; CHECK-NEXT: [[COL_LOAD120:%.*]] = load <2 x double>, ptr [[TMP42]], align 8 -; CHECK-NEXT: [[VEC_GEP121:%.*]] = getelementptr double, ptr [[TMP3]], i64 6 +; CHECK-NEXT: [[VEC_GEP121:%.*]] = getelementptr i8, ptr [[TMP3]], i64 48 ; CHECK-NEXT: [[COL_LOAD122:%.*]] = load <2 x double>, ptr [[VEC_GEP121]], align 8 -; CHECK-NEXT: [[TMP43:%.*]] = getelementptr double, ptr [[TMP7]], i64 8 +; CHECK-NEXT: [[TMP43:%.*]] = getelementptr i8, ptr [[TMP7]], i64 64 ; CHECK-NEXT: [[COL_LOAD123:%.*]] = load <2 x double>, ptr [[TMP43]], align 8 -; CHECK-NEXT: [[VEC_GEP124:%.*]] = getelementptr double, ptr [[TMP7]], i64 12 +; CHECK-NEXT: [[VEC_GEP124:%.*]] = getelementptr i8, 
ptr [[TMP7]], i64 96 ; CHECK-NEXT: [[COL_LOAD125:%.*]] = load <2 x double>, ptr [[VEC_GEP124]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT128:%.*]] = shufflevector <2 x double> [[COL_LOAD123]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP44:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT128]] @@ -160,13 +160,13 @@ define void @multiply(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[TMP46:%.*]] = fmul contract <2 x double> [[COL_LOAD120]], [[SPLAT_SPLAT134]] ; CHECK-NEXT: [[SPLAT_SPLAT137:%.*]] = shufflevector <2 x double> [[COL_LOAD125]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP47:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD122]], <2 x double> [[SPLAT_SPLAT137]], <2 x double> [[TMP46]]) -; CHECK-NEXT: [[TMP48:%.*]] = getelementptr double, ptr [[TMP3]], i64 10 +; CHECK-NEXT: [[TMP48:%.*]] = getelementptr i8, ptr [[TMP3]], i64 80 ; CHECK-NEXT: [[COL_LOAD138:%.*]] = load <2 x double>, ptr [[TMP48]], align 8 -; CHECK-NEXT: [[VEC_GEP139:%.*]] = getelementptr double, ptr [[TMP3]], i64 14 +; CHECK-NEXT: [[VEC_GEP139:%.*]] = getelementptr i8, ptr [[TMP3]], i64 112 ; CHECK-NEXT: [[COL_LOAD140:%.*]] = load <2 x double>, ptr [[VEC_GEP139]], align 8 -; CHECK-NEXT: [[TMP49:%.*]] = getelementptr double, ptr [[TMP7]], i64 10 +; CHECK-NEXT: [[TMP49:%.*]] = getelementptr i8, ptr [[TMP7]], i64 80 ; CHECK-NEXT: [[COL_LOAD141:%.*]] = load <2 x double>, ptr [[TMP49]], align 8 -; CHECK-NEXT: [[VEC_GEP142:%.*]] = getelementptr double, ptr [[TMP7]], i64 14 +; CHECK-NEXT: [[VEC_GEP142:%.*]] = getelementptr i8, ptr [[TMP7]], i64 112 ; CHECK-NEXT: [[COL_LOAD143:%.*]] = load <2 x double>, ptr [[VEC_GEP142]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT147:%.*]] = shufflevector <2 x double> [[COL_LOAD141]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP50:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP45]]) @@ -176,9 +176,9 @@ define void @multiply(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[TMP52:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD138]], <2 x double> [[SPLAT_SPLAT154]], <2 x double> [[TMP47]]) ; CHECK-NEXT: [[SPLAT_SPLAT157:%.*]] = shufflevector <2 x double> [[COL_LOAD143]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP53:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD140]], <2 x double> [[SPLAT_SPLAT157]], <2 x double> [[TMP52]]) -; CHECK-NEXT: [[TMP54:%.*]] = getelementptr double, ptr [[C]], i64 10 +; CHECK-NEXT: [[TMP54:%.*]] = getelementptr i8, ptr [[C]], i64 80 ; CHECK-NEXT: store <2 x double> [[TMP51]], ptr [[TMP54]], align 8 -; CHECK-NEXT: [[VEC_GEP158:%.*]] = getelementptr double, ptr [[C]], i64 14 +; CHECK-NEXT: [[VEC_GEP158:%.*]] = getelementptr i8, ptr [[C]], i64 112 ; CHECK-NEXT: store <2 x double> [[TMP53]], ptr [[VEC_GEP158]], align 8 ; CHECK-NEXT: ret void ; @@ -234,7 +234,7 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) ; CHECK-LABEL: @multiply_reuse_load( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[A]], i64 4 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 32 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <2 x double> [[COL_LOAD]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = fmul 
contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] @@ -244,13 +244,13 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) ; CHECK-NEXT: [[TMP2:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT10]] ; CHECK-NEXT: [[SPLAT_SPLAT13:%.*]] = shufflevector <2 x double> [[COL_LOAD1]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP3:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD1]], <2 x double> [[SPLAT_SPLAT13]], <2 x double> [[TMP2]]) -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[A]], i64 8 +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i8, ptr [[A]], i64 64 ; CHECK-NEXT: [[COL_LOAD14:%.*]] = load <2 x double>, ptr [[TMP4]], align 8 -; CHECK-NEXT: [[VEC_GEP15:%.*]] = getelementptr double, ptr [[A]], i64 12 +; CHECK-NEXT: [[VEC_GEP15:%.*]] = getelementptr i8, ptr [[A]], i64 96 ; CHECK-NEXT: [[COL_LOAD16:%.*]] = load <2 x double>, ptr [[VEC_GEP15]], align 8 -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD17:%.*]] = load <2 x double>, ptr [[TMP5]], align 8 -; CHECK-NEXT: [[VEC_GEP18:%.*]] = getelementptr double, ptr [[A]], i64 6 +; CHECK-NEXT: [[VEC_GEP18:%.*]] = getelementptr i8, ptr [[A]], i64 48 ; CHECK-NEXT: [[COL_LOAD19:%.*]] = load <2 x double>, ptr [[VEC_GEP18]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT23:%.*]] = shufflevector <2 x double> [[COL_LOAD17]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP6:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD14]], <2 x double> [[SPLAT_SPLAT23]], <2 x double> [[TMP1]]) @@ -261,14 +261,14 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) ; CHECK-NEXT: [[SPLAT_SPLAT33:%.*]] = shufflevector <2 x double> [[COL_LOAD19]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP9:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD16]], <2 x double> [[SPLAT_SPLAT33]], <2 x double> [[TMP8]]) ; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[C:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP34:%.*]] = getelementptr double, ptr [[C]], i64 4 +; CHECK-NEXT: [[VEC_GEP34:%.*]] = getelementptr i8, ptr [[C]], i64 32 ; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[VEC_GEP34]], align 8 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD35:%.*]] = load <2 x double>, ptr [[TMP10]], align 8 -; CHECK-NEXT: [[VEC_GEP36:%.*]] = getelementptr double, ptr [[A]], i64 6 +; CHECK-NEXT: [[VEC_GEP36:%.*]] = getelementptr i8, ptr [[A]], i64 48 ; CHECK-NEXT: [[COL_LOAD37:%.*]] = load <2 x double>, ptr [[VEC_GEP36]], align 8 ; CHECK-NEXT: [[COL_LOAD38:%.*]] = load <2 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[VEC_GEP39:%.*]] = getelementptr double, ptr [[A]], i64 4 +; CHECK-NEXT: [[VEC_GEP39:%.*]] = getelementptr i8, ptr [[A]], i64 32 ; CHECK-NEXT: [[COL_LOAD40:%.*]] = load <2 x double>, ptr [[VEC_GEP39]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT43:%.*]] = shufflevector <2 x double> [[COL_LOAD38]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP11:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT43]] @@ -278,13 +278,13 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) ; CHECK-NEXT: [[TMP13:%.*]] = fmul contract <2 x double> [[COL_LOAD35]], [[SPLAT_SPLAT49]] ; CHECK-NEXT: [[SPLAT_SPLAT52:%.*]] = shufflevector <2 x double> 
[[COL_LOAD40]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP14:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD37]], <2 x double> [[SPLAT_SPLAT52]], <2 x double> [[TMP13]]) -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[A]], i64 10 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[A]], i64 80 ; CHECK-NEXT: [[COL_LOAD53:%.*]] = load <2 x double>, ptr [[TMP15]], align 8 -; CHECK-NEXT: [[VEC_GEP54:%.*]] = getelementptr double, ptr [[A]], i64 14 +; CHECK-NEXT: [[VEC_GEP54:%.*]] = getelementptr i8, ptr [[A]], i64 112 ; CHECK-NEXT: [[COL_LOAD55:%.*]] = load <2 x double>, ptr [[VEC_GEP54]], align 8 -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD56:%.*]] = load <2 x double>, ptr [[TMP16]], align 8 -; CHECK-NEXT: [[VEC_GEP57:%.*]] = getelementptr double, ptr [[A]], i64 6 +; CHECK-NEXT: [[VEC_GEP57:%.*]] = getelementptr i8, ptr [[A]], i64 48 ; CHECK-NEXT: [[COL_LOAD58:%.*]] = load <2 x double>, ptr [[VEC_GEP57]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT62:%.*]] = shufflevector <2 x double> [[COL_LOAD56]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP17:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD53]], <2 x double> [[SPLAT_SPLAT62]], <2 x double> [[TMP12]]) @@ -294,16 +294,16 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) ; CHECK-NEXT: [[TMP19:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD53]], <2 x double> [[SPLAT_SPLAT69]], <2 x double> [[TMP14]]) ; CHECK-NEXT: [[SPLAT_SPLAT72:%.*]] = shufflevector <2 x double> [[COL_LOAD58]], <2 x double> poison, <2 x i32> <i32 1, i32 1> ; CHECK-NEXT: [[TMP20:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD55]], <2 x double> [[SPLAT_SPLAT72]], <2 x double> [[TMP19]]) -; CHECK-NEXT: [[TMP21:%.*]] = getelementptr double, ptr [[C]], i64 2 +; CHECK-NEXT: [[TMP21:%.*]] = getelementptr i8, ptr [[C]], i64 16 ; CHECK-NEXT: store <2 x double> [[TMP18]], ptr [[TMP21]], align 8 -; CHECK-NEXT: [[VEC_GEP73:%.*]] = getelementptr double, ptr [[C]], i64 6 +; CHECK-NEXT: [[VEC_GEP73:%.*]] = getelementptr i8, ptr [[C]], i64 48 ; CHECK-NEXT: store <2 x double> [[TMP20]], ptr [[VEC_GEP73]], align 8 ; CHECK-NEXT: [[COL_LOAD74:%.*]] = load <2 x double>, ptr [[A]], align 8 -; CHECK-NEXT: [[VEC_GEP75:%.*]] = getelementptr double, ptr [[A]], i64 4 +; CHECK-NEXT: [[VEC_GEP75:%.*]] = getelementptr i8, ptr [[A]], i64 32 ; CHECK-NEXT: [[COL_LOAD76:%.*]] = load <2 x double>, ptr [[VEC_GEP75]], align 8 -; CHECK-NEXT: [[TMP22:%.*]] = getelementptr double, ptr [[A]], i64 8 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i8, ptr [[A]], i64 64 ; CHECK-NEXT: [[COL_LOAD77:%.*]] = load <2 x double>, ptr [[TMP22]], align 8 -; CHECK-NEXT: [[VEC_GEP78:%.*]] = getelementptr double, ptr [[A]], i64 12 +; CHECK-NEXT: [[VEC_GEP78:%.*]] = getelementptr i8, ptr [[A]], i64 96 ; CHECK-NEXT: [[COL_LOAD79:%.*]] = load <2 x double>, ptr [[VEC_GEP78]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT82:%.*]] = shufflevector <2 x double> [[COL_LOAD77]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP23:%.*]] = fmul contract <2 x double> [[COL_LOAD74]], [[SPLAT_SPLAT82]] @@ -313,13 +313,13 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) ; CHECK-NEXT: [[TMP25:%.*]] = fmul contract <2 x double> [[COL_LOAD74]], [[SPLAT_SPLAT88]] ; CHECK-NEXT: [[SPLAT_SPLAT91:%.*]] = shufflevector <2 x 
double> [[COL_LOAD79]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP26:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD76]], <2 x double> [[SPLAT_SPLAT91]], <2 x double> [[TMP25]]) -; CHECK-NEXT: [[TMP27:%.*]] = getelementptr double, ptr [[A]], i64 8 +; CHECK-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[A]], i64 64 ; CHECK-NEXT: [[COL_LOAD92:%.*]] = load <2 x double>, ptr [[TMP27]], align 8 -; CHECK-NEXT: [[VEC_GEP93:%.*]] = getelementptr double, ptr [[A]], i64 12 +; CHECK-NEXT: [[VEC_GEP93:%.*]] = getelementptr i8, ptr [[A]], i64 96 ; CHECK-NEXT: [[COL_LOAD94:%.*]] = load <2 x double>, ptr [[VEC_GEP93]], align 8 -; CHECK-NEXT: [[TMP28:%.*]] = getelementptr double, ptr [[A]], i64 10 +; CHECK-NEXT: [[TMP28:%.*]] = getelementptr i8, ptr [[A]], i64 80 ; CHECK-NEXT: [[COL_LOAD95:%.*]] = load <2 x double>, ptr [[TMP28]], align 8 -; CHECK-NEXT: [[VEC_GEP96:%.*]] = getelementptr double, ptr [[A]], i64 14 +; CHECK-NEXT: [[VEC_GEP96:%.*]] = getelementptr i8, ptr [[A]], i64 112 ; CHECK-NEXT: [[COL_LOAD97:%.*]] = load <2 x double>, ptr [[VEC_GEP96]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT101:%.*]] = shufflevector <2 x double> [[COL_LOAD95]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP29:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT101]], <2 x double> [[TMP24]]) @@ -329,17 +329,17 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) ; CHECK-NEXT: [[TMP31:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD92]], <2 x double> [[SPLAT_SPLAT108]], <2 x double> [[TMP26]]) ; CHECK-NEXT: [[SPLAT_SPLAT111:%.*]] = shufflevector <2 x double> [[COL_LOAD97]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP32:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD94]], <2 x double> [[SPLAT_SPLAT111]], <2 x double> [[TMP31]]) -; CHECK-NEXT: [[TMP33:%.*]] = getelementptr double, ptr [[C]], i64 8 +; CHECK-NEXT: [[TMP33:%.*]] = getelementptr i8, ptr [[C]], i64 64 ; CHECK-NEXT: store <2 x double> [[TMP30]], ptr [[TMP33]], align 8 -; CHECK-NEXT: [[VEC_GEP112:%.*]] = getelementptr double, ptr [[C]], i64 12 +; CHECK-NEXT: [[VEC_GEP112:%.*]] = getelementptr i8, ptr [[C]], i64 96 ; CHECK-NEXT: store <2 x double> [[TMP32]], ptr [[VEC_GEP112]], align 8 -; CHECK-NEXT: [[TMP34:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[TMP34:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD113:%.*]] = load <2 x double>, ptr [[TMP34]], align 8 -; CHECK-NEXT: [[VEC_GEP114:%.*]] = getelementptr double, ptr [[A]], i64 6 +; CHECK-NEXT: [[VEC_GEP114:%.*]] = getelementptr i8, ptr [[A]], i64 48 ; CHECK-NEXT: [[COL_LOAD115:%.*]] = load <2 x double>, ptr [[VEC_GEP114]], align 8 -; CHECK-NEXT: [[TMP35:%.*]] = getelementptr double, ptr [[A]], i64 8 +; CHECK-NEXT: [[TMP35:%.*]] = getelementptr i8, ptr [[A]], i64 64 ; CHECK-NEXT: [[COL_LOAD116:%.*]] = load <2 x double>, ptr [[TMP35]], align 8 -; CHECK-NEXT: [[VEC_GEP117:%.*]] = getelementptr double, ptr [[A]], i64 12 +; CHECK-NEXT: [[VEC_GEP117:%.*]] = getelementptr i8, ptr [[A]], i64 96 ; CHECK-NEXT: [[COL_LOAD118:%.*]] = load <2 x double>, ptr [[VEC_GEP117]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT121:%.*]] = shufflevector <2 x double> [[COL_LOAD116]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP36:%.*]] = fmul contract <2 x double> [[COL_LOAD113]], [[SPLAT_SPLAT121]] @@ -349,9 +349,9 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias 
%B, ptr noalias %C) ; CHECK-NEXT: [[TMP38:%.*]] = fmul contract <2 x double> [[COL_LOAD113]], [[SPLAT_SPLAT127]] ; CHECK-NEXT: [[SPLAT_SPLAT130:%.*]] = shufflevector <2 x double> [[COL_LOAD118]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP39:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD115]], <2 x double> [[SPLAT_SPLAT130]], <2 x double> [[TMP38]]) -; CHECK-NEXT: [[TMP40:%.*]] = getelementptr double, ptr [[A]], i64 10 +; CHECK-NEXT: [[TMP40:%.*]] = getelementptr i8, ptr [[A]], i64 80 ; CHECK-NEXT: [[COL_LOAD131:%.*]] = load <2 x double>, ptr [[TMP40]], align 8 -; CHECK-NEXT: [[VEC_GEP132:%.*]] = getelementptr double, ptr [[A]], i64 14 +; CHECK-NEXT: [[VEC_GEP132:%.*]] = getelementptr i8, ptr [[A]], i64 112 ; CHECK-NEXT: [[COL_LOAD133:%.*]] = load <2 x double>, ptr [[VEC_GEP132]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT140:%.*]] = shufflevector <2 x double> [[COL_LOAD131]], <2 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP41:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD131]], <2 x double> [[SPLAT_SPLAT140]], <2 x double> [[TMP37]]) @@ -361,9 +361,9 @@ define void @multiply_reuse_load(ptr noalias %A, ptr noalias %B, ptr noalias %C) ; CHECK-NEXT: [[TMP43:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD131]], <2 x double> [[SPLAT_SPLAT147]], <2 x double> [[TMP39]]) ; CHECK-NEXT: [[SPLAT_SPLAT150:%.*]] = shufflevector <2 x double> [[COL_LOAD133]], <2 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP44:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD133]], <2 x double> [[SPLAT_SPLAT150]], <2 x double> [[TMP43]]) -; CHECK-NEXT: [[TMP45:%.*]] = getelementptr double, ptr [[C]], i64 10 +; CHECK-NEXT: [[TMP45:%.*]] = getelementptr i8, ptr [[C]], i64 80 ; CHECK-NEXT: store <2 x double> [[TMP42]], ptr [[TMP45]], align 8 -; CHECK-NEXT: [[VEC_GEP151:%.*]] = getelementptr double, ptr [[C]], i64 14 +; CHECK-NEXT: [[VEC_GEP151:%.*]] = getelementptr i8, ptr [[C]], i64 112 ; CHECK-NEXT: store <2 x double> [[TMP44]], ptr [[VEC_GEP151]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-minimal.ll b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-minimal.ll index a02ef276e43e4..fa52ce39aad9b 100644 --- a/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-minimal.ll +++ b/llvm/test/Transforms/LowerMatrixIntrinsics/multiply-minimal.ll @@ -14,14 +14,14 @@ define void @multiply(ptr %A, ptr %B, ptr %C) { ; CHECK-LABEL: @multiply( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[COL_LOAD:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr double, ptr [[A]], i64 2 +; CHECK-NEXT: [[VEC_GEP:%.*]] = getelementptr i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[COL_LOAD1:%.*]] = load <2 x double>, ptr [[VEC_GEP]], align 8 -; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr double, ptr [[A]], i64 4 +; CHECK-NEXT: [[VEC_GEP2:%.*]] = getelementptr i8, ptr [[A]], i64 32 ; CHECK-NEXT: [[COL_LOAD3:%.*]] = load <2 x double>, ptr [[VEC_GEP2]], align 8 -; CHECK-NEXT: [[VEC_GEP4:%.*]] = getelementptr double, ptr [[A]], i64 6 +; CHECK-NEXT: [[VEC_GEP4:%.*]] = getelementptr i8, ptr [[A]], i64 48 ; CHECK-NEXT: [[COL_LOAD5:%.*]] = load <2 x double>, ptr [[VEC_GEP4]], align 8 ; CHECK-NEXT: [[COL_LOAD6:%.*]] = load <4 x double>, ptr [[B:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP7:%.*]] = getelementptr double, ptr [[B]], i64 4 +; CHECK-NEXT: [[VEC_GEP7:%.*]] = getelementptr i8, ptr [[B]], i64 32 ; CHECK-NEXT: [[COL_LOAD8:%.*]] = 
load <4 x double>, ptr [[VEC_GEP7]], align 8 ; CHECK-NEXT: [[SPLAT_SPLAT:%.*]] = shufflevector <4 x double> [[COL_LOAD6]], <4 x double> poison, <2 x i32> zeroinitializer ; CHECK-NEXT: [[TMP0:%.*]] = fmul contract <2 x double> [[COL_LOAD]], [[SPLAT_SPLAT]] @@ -40,7 +40,7 @@ define void @multiply(ptr %A, ptr %B, ptr %C) { ; CHECK-NEXT: [[SPLAT_SPLAT29:%.*]] = shufflevector <4 x double> [[COL_LOAD8]], <4 x double> poison, <2 x i32> ; CHECK-NEXT: [[TMP7:%.*]] = call contract <2 x double> @llvm.fmuladd.v2f64(<2 x double> [[COL_LOAD5]], <2 x double> [[SPLAT_SPLAT29]], <2 x double> [[TMP6]]) ; CHECK-NEXT: store <2 x double> [[TMP3]], ptr [[C:%.*]], align 8 -; CHECK-NEXT: [[VEC_GEP30:%.*]] = getelementptr double, ptr [[C]], i64 2 +; CHECK-NEXT: [[VEC_GEP30:%.*]] = getelementptr i8, ptr [[C]], i64 16 ; CHECK-NEXT: store <2 x double> [[TMP7]], ptr [[VEC_GEP30]], align 8 ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll index cf4dfc0b76ac9..33cfd0aa8574e 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/hoisting-sinking-required-for-vectorization.ll @@ -49,7 +49,7 @@ define void @loop(ptr %X, ptr %Y) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] ; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 2 +; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP1]], align 8 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <2 x double>, ptr [[TMP2]], align 8 ; CHECK-NEXT: [[TMP3:%.*]] = fcmp olt <2 x double> [[WIDE_LOAD]], zeroinitializer @@ -61,7 +61,7 @@ define void @loop(ptr %X, ptr %Y) { ; CHECK-NEXT: [[TMP9:%.*]] = select <2 x i1> [[TMP3]], <2 x double> zeroinitializer, <2 x double> [[TMP7]] ; CHECK-NEXT: [[TMP10:%.*]] = select <2 x i1> [[TMP4]], <2 x double> zeroinitializer, <2 x double> [[TMP8]] ; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds double, ptr [[TMP11]], i64 2 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP11]], i64 16 ; CHECK-NEXT: store <2 x double> [[TMP9]], ptr [[TMP11]], align 8 ; CHECK-NEXT: store <2 x double> [[TMP10]], ptr [[TMP12]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 @@ -149,26 +149,26 @@ define void @loop2(ptr %A, ptr %B, ptr %C, float %x) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds i32, ptr [[C]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP0]], align 4, !alias.scope [[META4:![0-9]+]] ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !alias.scope [[META4]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP3:%.*]] = icmp eq <4 x i32> [[WIDE_LOAD7]], ; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[A]], i64 [[INDEX]] -; CHECK-NEXT: 
[[TMP5:%.*]] = getelementptr inbounds float, ptr [[TMP4]], i64 4 +; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP4]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x float>, ptr [[TMP4]], align 4, !alias.scope [[META7:![0-9]+]] ; CHECK-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x float>, ptr [[TMP5]], align 4, !alias.scope [[META7]] ; CHECK-NEXT: [[TMP6:%.*]] = fmul <4 x float> [[WIDE_LOAD8]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP7:%.*]] = fmul <4 x float> [[WIDE_LOAD9]], [[BROADCAST_SPLAT]] ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr float, ptr [[B]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr float, ptr [[TMP8]], i64 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD10:%.*]] = load <4 x float>, ptr [[TMP8]], align 4, !alias.scope [[META9:![0-9]+]], !noalias [[META11:![0-9]+]] ; CHECK-NEXT: [[WIDE_LOAD11:%.*]] = load <4 x float>, ptr [[TMP9]], align 4, !alias.scope [[META9]], !noalias [[META11]] ; CHECK-NEXT: [[TMP10:%.*]] = select <4 x i1> [[TMP2]], <4 x float> , <4 x float> [[WIDE_LOAD10]] ; CHECK-NEXT: [[PREDPHI:%.*]] = fadd <4 x float> [[TMP6]], [[TMP10]] ; CHECK-NEXT: [[TMP11:%.*]] = select <4 x i1> [[TMP3]], <4 x float> , <4 x float> [[WIDE_LOAD11]] ; CHECK-NEXT: [[PREDPHI12:%.*]] = fadd <4 x float> [[TMP7]], [[TMP11]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP8]], i64 4 +; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 16 ; CHECK-NEXT: store <4 x float> [[PREDPHI]], ptr [[TMP8]], align 4, !alias.scope [[META9]], !noalias [[META11]] ; CHECK-NEXT: store <4 x float> [[PREDPHI12]], ptr [[TMP12]], align 4, !alias.scope [[META9]], !noalias [[META11]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll index e346bef27bbd0..8fc5189e8bc79 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/peel-multiple-unreachable-exits-for-vectorization.ll @@ -11,13 +11,13 @@ define i64 @sum_2_at_with_int_conversion(ptr %A, ptr %B, i64 %N) { ; CHECK-LABEL: @sum_2_at_with_int_conversion( ; CHECK-NEXT: at_with_int_conversion.exit11.peel: ; CHECK-NEXT: [[START_I:%.*]] = load ptr, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr [[VEC:%.*]], ptr [[A]], i64 0, i32 1 +; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[END_I:%.*]] = load ptr, ptr [[GEP_END_I]], align 8 ; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint ptr [[START_I]] to i64 ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] ; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) -; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr [[VEC]], ptr [[B:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 8 ; CHECK-NEXT: [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8 ; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 ; CHECK-NEXT: [[START_INT_I4_PEEL:%.*]] = ptrtoint ptr [[START_I1_PEEL]] to i64 @@ -50,11 +50,11 @@ define i64 @sum_2_at_with_int_conversion(ptr %A, ptr %B, i64 %N) { ; CHECK-NEXT: [[VEC_PHI16:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP13:%.*]], 
[[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[WIDE_LOAD17:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD18:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 ; CHECK-NEXT: [[WIDE_LOAD19:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] @@ -122,14 +122,14 @@ define i64 @sum_3_at_with_int_conversion(ptr %A, ptr %B, ptr %C, i64 %N) { ; CHECK-LABEL: @sum_3_at_with_int_conversion( ; CHECK-NEXT: at_with_int_conversion.exit22.peel: ; CHECK-NEXT: [[START_I:%.*]] = load ptr, ptr [[A:%.*]], align 8 -; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr [[VEC:%.*]], ptr [[A]], i64 0, i32 1 +; CHECK-NEXT: [[GEP_END_I:%.*]] = getelementptr i8, ptr [[A]], i64 8 ; CHECK-NEXT: [[END_I:%.*]] = load ptr, ptr [[GEP_END_I]], align 8 ; CHECK-NEXT: [[START_INT_I:%.*]] = ptrtoint ptr [[START_I]] to i64 ; CHECK-NEXT: [[END_INT_I:%.*]] = ptrtoint ptr [[END_I]] to i64 ; CHECK-NEXT: [[SUB_I:%.*]] = sub i64 [[END_INT_I]], [[START_INT_I]] -; CHECK-NEXT: [[GEP_END_I13:%.*]] = getelementptr [[VEC]], ptr [[C:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[GEP_END_I13:%.*]] = getelementptr i8, ptr [[C:%.*]], i64 8 ; CHECK-NEXT: [[SMAX:%.*]] = tail call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 0) -; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr [[VEC]], ptr [[B:%.*]], i64 0, i32 1 +; CHECK-NEXT: [[GEP_END_I2:%.*]] = getelementptr i8, ptr [[B:%.*]], i64 8 ; CHECK-NEXT: [[LV_I_PEEL:%.*]] = load i64, ptr [[START_I]], align 8 ; CHECK-NEXT: [[START_I1_PEEL:%.*]] = load ptr, ptr [[B]], align 8 ; CHECK-NEXT: [[END_I3_PEEL:%.*]] = load ptr, ptr [[GEP_END_I2]], align 8 @@ -171,15 +171,15 @@ define i64 @sum_3_at_with_int_conversion(ptr %A, ptr %B, ptr %C, i64 %N) { ; CHECK-NEXT: [[VEC_PHI28:%.*]] = phi <2 x i64> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP17:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = or disjoint i64 [[INDEX]], 1 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, ptr [[START_I]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i64, ptr [[TMP6]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[TMP6]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x i64>, ptr [[TMP6]], align 8 ; CHECK-NEXT: [[WIDE_LOAD29:%.*]] = load <2 x i64>, ptr [[TMP7]], align 8 ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, ptr [[START_I1_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i64, ptr [[TMP8]], i64 2 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[TMP8]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD30:%.*]] = load <2 x i64>, ptr [[TMP8]], align 8 ; CHECK-NEXT: [[WIDE_LOAD31:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8 ; CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, ptr [[START_I12_PEEL]], i64 [[OFFSET_IDX]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i64, ptr [[TMP10]], i64 2 +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr i8, ptr [[TMP10]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD32:%.*]] = load <2 x i64>, ptr [[TMP10]], align 8 ; CHECK-NEXT: 
[[WIDE_LOAD33:%.*]] = load <2 x i64>, ptr [[TMP11]], align 8 ; CHECK-NEXT: [[TMP12:%.*]] = add <2 x i64> [[WIDE_LOAD]], [[VEC_PHI]] @@ -260,7 +260,7 @@ exit: define i64 @at_with_int_conversion(ptr %ptr, i64 %idx) { ; CHECK-LABEL: @at_with_int_conversion( ; CHECK-NEXT: [[START:%.*]] = load ptr, ptr [[PTR:%.*]], align 8 -; CHECK-NEXT: [[GEP_END:%.*]] = getelementptr [[VEC:%.*]], ptr [[PTR]], i64 0, i32 1 +; CHECK-NEXT: [[GEP_END:%.*]] = getelementptr i8, ptr [[PTR]], i64 8 ; CHECK-NEXT: [[END:%.*]] = load ptr, ptr [[GEP_END]], align 8 ; CHECK-NEXT: [[START_INT:%.*]] = ptrtoint ptr [[START]] to i64 ; CHECK-NEXT: [[END_INT:%.*]] = ptrtoint ptr [[END]] to i64 diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll index d60b232796767..7b9f3a2e13a64 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/quant_4x4.ll @@ -40,14 +40,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[TMP12:%.*]] = trunc <8 x i32> [[TMP11]] to <8 x i16> ; CHECK-NEXT: [[PREDPHI:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[TMP12]], <8 x i16> [[TMP8]] ; CHECK-NEXT: store <8 x i16> [[PREDPHI]], ptr [[DCT]], align 2, !alias.scope !0, !noalias !3 -; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 8 +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD_1:%.*]] = load <8 x i16>, ptr [[TMP13]], align 2, !alias.scope !0, !noalias !3 ; CHECK-NEXT: [[TMP14:%.*]] = sext <8 x i16> [[WIDE_LOAD_1]] to <8 x i32> ; CHECK-NEXT: [[TMP15:%.*]] = icmp sgt <8 x i16> [[WIDE_LOAD_1]], zeroinitializer -; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 8 +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD28_1:%.*]] = load <8 x i16>, ptr [[TMP16]], align 2, !alias.scope !6 ; CHECK-NEXT: [[TMP17:%.*]] = zext <8 x i16> [[WIDE_LOAD28_1]] to <8 x i32> -; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 8 +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD29_1:%.*]] = load <8 x i16>, ptr [[TMP18]], align 2, !alias.scope !7 ; CHECK-NEXT: [[TMP19:%.*]] = zext <8 x i16> [[WIDE_LOAD29_1]] to <8 x i32> ; CHECK-NEXT: [[TMP20:%.*]] = sub nsw <8 x i32> [[TMP17]], [[TMP14]] @@ -95,14 +95,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK: if.end: ; CHECK-NEXT: [[STOREMERGE:%.*]] = phi i16 [ [[CONV28]], [[IF_ELSE]] ], [ [[CONV12]], [[IF_THEN]] ] ; CHECK-NEXT: store i16 [[STOREMERGE]], ptr [[DCT]], align 2 -; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 1 +; CHECK-NEXT: [[ARRAYIDX_1:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 2 ; CHECK-NEXT: [[TMP36:%.*]] = load i16, ptr [[ARRAYIDX_1]], align 2 ; CHECK-NEXT: [[CONV_1:%.*]] = sext i16 [[TMP36]] to i32 ; CHECK-NEXT: [[CMP1_1:%.*]] = icmp sgt i16 [[TMP36]], 0 -; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 1 +; CHECK-NEXT: [[ARRAYIDX4_1:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 2 ; CHECK-NEXT: [[TMP37:%.*]] = load i16, ptr [[ARRAYIDX4_1]], align 2 ; CHECK-NEXT: [[CONV5_1:%.*]] = zext i16 [[TMP37]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 1 +; CHECK-NEXT: [[ARRAYIDX10_1:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 2 ; CHECK-NEXT: 
[[TMP38:%.*]] = load i16, ptr [[ARRAYIDX10_1]], align 2 ; CHECK-NEXT: [[CONV11_1:%.*]] = zext i16 [[TMP38]] to i32 ; CHECK-NEXT: br i1 [[CMP1_1]], label [[IF_THEN_1:%.*]], label [[IF_ELSE_1:%.*]] @@ -123,14 +123,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_1:%.*]] = phi i16 [ [[CONV28_1]], [[IF_ELSE_1]] ], [ [[CONV12_1]], [[IF_THEN_1]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_1]], ptr [[ARRAYIDX_1]], align 2 ; CHECK-NEXT: [[OR_131:%.*]] = or i16 [[STOREMERGE]], [[STOREMERGE_1]] -; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 2 +; CHECK-NEXT: [[ARRAYIDX_2:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 4 ; CHECK-NEXT: [[TMP40:%.*]] = load i16, ptr [[ARRAYIDX_2]], align 2 ; CHECK-NEXT: [[CONV_2:%.*]] = sext i16 [[TMP40]] to i32 ; CHECK-NEXT: [[CMP1_2:%.*]] = icmp sgt i16 [[TMP40]], 0 -; CHECK-NEXT: [[ARRAYIDX4_2:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 2 +; CHECK-NEXT: [[ARRAYIDX4_2:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 4 ; CHECK-NEXT: [[TMP41:%.*]] = load i16, ptr [[ARRAYIDX4_2]], align 2 ; CHECK-NEXT: [[CONV5_2:%.*]] = zext i16 [[TMP41]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 2 +; CHECK-NEXT: [[ARRAYIDX10_2:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 4 ; CHECK-NEXT: [[TMP42:%.*]] = load i16, ptr [[ARRAYIDX10_2]], align 2 ; CHECK-NEXT: [[CONV11_2:%.*]] = zext i16 [[TMP42]] to i32 ; CHECK-NEXT: br i1 [[CMP1_2]], label [[IF_THEN_2:%.*]], label [[IF_ELSE_2:%.*]] @@ -151,14 +151,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_2:%.*]] = phi i16 [ [[CONV28_2]], [[IF_ELSE_2]] ], [ [[CONV12_2]], [[IF_THEN_2]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_2]], ptr [[ARRAYIDX_2]], align 2 ; CHECK-NEXT: [[OR_232:%.*]] = or i16 [[OR_131]], [[STOREMERGE_2]] -; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 3 +; CHECK-NEXT: [[ARRAYIDX_3:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 6 ; CHECK-NEXT: [[TMP44:%.*]] = load i16, ptr [[ARRAYIDX_3]], align 2 ; CHECK-NEXT: [[CONV_3:%.*]] = sext i16 [[TMP44]] to i32 ; CHECK-NEXT: [[CMP1_3:%.*]] = icmp sgt i16 [[TMP44]], 0 -; CHECK-NEXT: [[ARRAYIDX4_3:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 3 +; CHECK-NEXT: [[ARRAYIDX4_3:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 6 ; CHECK-NEXT: [[TMP45:%.*]] = load i16, ptr [[ARRAYIDX4_3]], align 2 ; CHECK-NEXT: [[CONV5_3:%.*]] = zext i16 [[TMP45]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 3 +; CHECK-NEXT: [[ARRAYIDX10_3:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 6 ; CHECK-NEXT: [[TMP46:%.*]] = load i16, ptr [[ARRAYIDX10_3]], align 2 ; CHECK-NEXT: [[CONV11_3:%.*]] = zext i16 [[TMP46]] to i32 ; CHECK-NEXT: br i1 [[CMP1_3]], label [[IF_THEN_3:%.*]], label [[IF_ELSE_3:%.*]] @@ -179,14 +179,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_3:%.*]] = phi i16 [ [[CONV28_3]], [[IF_ELSE_3]] ], [ [[CONV12_3]], [[IF_THEN_3]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_3]], ptr [[ARRAYIDX_3]], align 2 ; CHECK-NEXT: [[OR_333:%.*]] = or i16 [[OR_232]], [[STOREMERGE_3]] -; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 4 +; CHECK-NEXT: [[ARRAYIDX_4:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 8 ; CHECK-NEXT: [[TMP48:%.*]] = load i16, ptr [[ARRAYIDX_4]], align 2 ; CHECK-NEXT: [[CONV_4:%.*]] = sext i16 
[[TMP48]] to i32 ; CHECK-NEXT: [[CMP1_4:%.*]] = icmp sgt i16 [[TMP48]], 0 -; CHECK-NEXT: [[ARRAYIDX4_4:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 4 +; CHECK-NEXT: [[ARRAYIDX4_4:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 8 ; CHECK-NEXT: [[TMP49:%.*]] = load i16, ptr [[ARRAYIDX4_4]], align 2 ; CHECK-NEXT: [[CONV5_4:%.*]] = zext i16 [[TMP49]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_4:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 4 +; CHECK-NEXT: [[ARRAYIDX10_4:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 8 ; CHECK-NEXT: [[TMP50:%.*]] = load i16, ptr [[ARRAYIDX10_4]], align 2 ; CHECK-NEXT: [[CONV11_4:%.*]] = zext i16 [[TMP50]] to i32 ; CHECK-NEXT: br i1 [[CMP1_4]], label [[IF_THEN_4:%.*]], label [[IF_ELSE_4:%.*]] @@ -207,14 +207,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_4:%.*]] = phi i16 [ [[CONV28_4]], [[IF_ELSE_4]] ], [ [[CONV12_4]], [[IF_THEN_4]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_4]], ptr [[ARRAYIDX_4]], align 2 ; CHECK-NEXT: [[OR_434:%.*]] = or i16 [[OR_333]], [[STOREMERGE_4]] -; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 5 +; CHECK-NEXT: [[ARRAYIDX_5:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 10 ; CHECK-NEXT: [[TMP52:%.*]] = load i16, ptr [[ARRAYIDX_5]], align 2 ; CHECK-NEXT: [[CONV_5:%.*]] = sext i16 [[TMP52]] to i32 ; CHECK-NEXT: [[CMP1_5:%.*]] = icmp sgt i16 [[TMP52]], 0 -; CHECK-NEXT: [[ARRAYIDX4_5:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 5 +; CHECK-NEXT: [[ARRAYIDX4_5:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 10 ; CHECK-NEXT: [[TMP53:%.*]] = load i16, ptr [[ARRAYIDX4_5]], align 2 ; CHECK-NEXT: [[CONV5_5:%.*]] = zext i16 [[TMP53]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_5:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 5 +; CHECK-NEXT: [[ARRAYIDX10_5:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 10 ; CHECK-NEXT: [[TMP54:%.*]] = load i16, ptr [[ARRAYIDX10_5]], align 2 ; CHECK-NEXT: [[CONV11_5:%.*]] = zext i16 [[TMP54]] to i32 ; CHECK-NEXT: br i1 [[CMP1_5]], label [[IF_THEN_5:%.*]], label [[IF_ELSE_5:%.*]] @@ -235,14 +235,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_5:%.*]] = phi i16 [ [[CONV28_5]], [[IF_ELSE_5]] ], [ [[CONV12_5]], [[IF_THEN_5]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_5]], ptr [[ARRAYIDX_5]], align 2 ; CHECK-NEXT: [[OR_535:%.*]] = or i16 [[OR_434]], [[STOREMERGE_5]] -; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 6 +; CHECK-NEXT: [[ARRAYIDX_6:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 12 ; CHECK-NEXT: [[TMP56:%.*]] = load i16, ptr [[ARRAYIDX_6]], align 2 ; CHECK-NEXT: [[CONV_6:%.*]] = sext i16 [[TMP56]] to i32 ; CHECK-NEXT: [[CMP1_6:%.*]] = icmp sgt i16 [[TMP56]], 0 -; CHECK-NEXT: [[ARRAYIDX4_6:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 6 +; CHECK-NEXT: [[ARRAYIDX4_6:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 12 ; CHECK-NEXT: [[TMP57:%.*]] = load i16, ptr [[ARRAYIDX4_6]], align 2 ; CHECK-NEXT: [[CONV5_6:%.*]] = zext i16 [[TMP57]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_6:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 6 +; CHECK-NEXT: [[ARRAYIDX10_6:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 12 ; CHECK-NEXT: [[TMP58:%.*]] = load i16, ptr [[ARRAYIDX10_6]], align 2 ; CHECK-NEXT: [[CONV11_6:%.*]] = zext i16 [[TMP58]] to i32 ; CHECK-NEXT: br i1 [[CMP1_6]], label [[IF_THEN_6:%.*]], label [[IF_ELSE_6:%.*]] @@ -263,14 +263,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr 
noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_6:%.*]] = phi i16 [ [[CONV28_6]], [[IF_ELSE_6]] ], [ [[CONV12_6]], [[IF_THEN_6]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_6]], ptr [[ARRAYIDX_6]], align 2 ; CHECK-NEXT: [[OR_636:%.*]] = or i16 [[OR_535]], [[STOREMERGE_6]] -; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 7 +; CHECK-NEXT: [[ARRAYIDX_7:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 14 ; CHECK-NEXT: [[TMP60:%.*]] = load i16, ptr [[ARRAYIDX_7]], align 2 ; CHECK-NEXT: [[CONV_7:%.*]] = sext i16 [[TMP60]] to i32 ; CHECK-NEXT: [[CMP1_7:%.*]] = icmp sgt i16 [[TMP60]], 0 -; CHECK-NEXT: [[ARRAYIDX4_7:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 7 +; CHECK-NEXT: [[ARRAYIDX4_7:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 14 ; CHECK-NEXT: [[TMP61:%.*]] = load i16, ptr [[ARRAYIDX4_7]], align 2 ; CHECK-NEXT: [[CONV5_7:%.*]] = zext i16 [[TMP61]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_7:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 7 +; CHECK-NEXT: [[ARRAYIDX10_7:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 14 ; CHECK-NEXT: [[TMP62:%.*]] = load i16, ptr [[ARRAYIDX10_7]], align 2 ; CHECK-NEXT: [[CONV11_7:%.*]] = zext i16 [[TMP62]] to i32 ; CHECK-NEXT: br i1 [[CMP1_7]], label [[IF_THEN_7:%.*]], label [[IF_ELSE_7:%.*]] @@ -291,14 +291,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_7:%.*]] = phi i16 [ [[CONV28_7]], [[IF_ELSE_7]] ], [ [[CONV12_7]], [[IF_THEN_7]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_7]], ptr [[ARRAYIDX_7]], align 2 ; CHECK-NEXT: [[OR_737:%.*]] = or i16 [[OR_636]], [[STOREMERGE_7]] -; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 8 +; CHECK-NEXT: [[ARRAYIDX_8:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 16 ; CHECK-NEXT: [[TMP64:%.*]] = load i16, ptr [[ARRAYIDX_8]], align 2 ; CHECK-NEXT: [[CONV_8:%.*]] = sext i16 [[TMP64]] to i32 ; CHECK-NEXT: [[CMP1_8:%.*]] = icmp sgt i16 [[TMP64]], 0 -; CHECK-NEXT: [[ARRAYIDX4_8:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 8 +; CHECK-NEXT: [[ARRAYIDX4_8:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 16 ; CHECK-NEXT: [[TMP65:%.*]] = load i16, ptr [[ARRAYIDX4_8]], align 2 ; CHECK-NEXT: [[CONV5_8:%.*]] = zext i16 [[TMP65]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_8:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 8 +; CHECK-NEXT: [[ARRAYIDX10_8:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 16 ; CHECK-NEXT: [[TMP66:%.*]] = load i16, ptr [[ARRAYIDX10_8]], align 2 ; CHECK-NEXT: [[CONV11_8:%.*]] = zext i16 [[TMP66]] to i32 ; CHECK-NEXT: br i1 [[CMP1_8]], label [[IF_THEN_8:%.*]], label [[IF_ELSE_8:%.*]] @@ -319,14 +319,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_8:%.*]] = phi i16 [ [[CONV28_8]], [[IF_ELSE_8]] ], [ [[CONV12_8]], [[IF_THEN_8]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_8]], ptr [[ARRAYIDX_8]], align 2 ; CHECK-NEXT: [[OR_838:%.*]] = or i16 [[OR_737]], [[STOREMERGE_8]] -; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 9 +; CHECK-NEXT: [[ARRAYIDX_9:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 18 ; CHECK-NEXT: [[TMP68:%.*]] = load i16, ptr [[ARRAYIDX_9]], align 2 ; CHECK-NEXT: [[CONV_9:%.*]] = sext i16 [[TMP68]] to i32 ; CHECK-NEXT: [[CMP1_9:%.*]] = icmp sgt i16 [[TMP68]], 0 -; CHECK-NEXT: [[ARRAYIDX4_9:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 9 +; CHECK-NEXT: [[ARRAYIDX4_9:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 18 ; CHECK-NEXT: 
[[TMP69:%.*]] = load i16, ptr [[ARRAYIDX4_9]], align 2 ; CHECK-NEXT: [[CONV5_9:%.*]] = zext i16 [[TMP69]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_9:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 9 +; CHECK-NEXT: [[ARRAYIDX10_9:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 18 ; CHECK-NEXT: [[TMP70:%.*]] = load i16, ptr [[ARRAYIDX10_9]], align 2 ; CHECK-NEXT: [[CONV11_9:%.*]] = zext i16 [[TMP70]] to i32 ; CHECK-NEXT: br i1 [[CMP1_9]], label [[IF_THEN_9:%.*]], label [[IF_ELSE_9:%.*]] @@ -347,14 +347,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_9:%.*]] = phi i16 [ [[CONV28_9]], [[IF_ELSE_9]] ], [ [[CONV12_9]], [[IF_THEN_9]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_9]], ptr [[ARRAYIDX_9]], align 2 ; CHECK-NEXT: [[OR_939:%.*]] = or i16 [[OR_838]], [[STOREMERGE_9]] -; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 10 +; CHECK-NEXT: [[ARRAYIDX_10:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 20 ; CHECK-NEXT: [[TMP72:%.*]] = load i16, ptr [[ARRAYIDX_10]], align 2 ; CHECK-NEXT: [[CONV_10:%.*]] = sext i16 [[TMP72]] to i32 ; CHECK-NEXT: [[CMP1_10:%.*]] = icmp sgt i16 [[TMP72]], 0 -; CHECK-NEXT: [[ARRAYIDX4_10:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 10 +; CHECK-NEXT: [[ARRAYIDX4_10:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 20 ; CHECK-NEXT: [[TMP73:%.*]] = load i16, ptr [[ARRAYIDX4_10]], align 2 ; CHECK-NEXT: [[CONV5_10:%.*]] = zext i16 [[TMP73]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_10:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 10 +; CHECK-NEXT: [[ARRAYIDX10_10:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 20 ; CHECK-NEXT: [[TMP74:%.*]] = load i16, ptr [[ARRAYIDX10_10]], align 2 ; CHECK-NEXT: [[CONV11_10:%.*]] = zext i16 [[TMP74]] to i32 ; CHECK-NEXT: br i1 [[CMP1_10]], label [[IF_THEN_10:%.*]], label [[IF_ELSE_10:%.*]] @@ -375,14 +375,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_10:%.*]] = phi i16 [ [[CONV28_10]], [[IF_ELSE_10]] ], [ [[CONV12_10]], [[IF_THEN_10]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_10]], ptr [[ARRAYIDX_10]], align 2 ; CHECK-NEXT: [[OR_1040:%.*]] = or i16 [[OR_939]], [[STOREMERGE_10]] -; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 11 +; CHECK-NEXT: [[ARRAYIDX_11:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 22 ; CHECK-NEXT: [[TMP76:%.*]] = load i16, ptr [[ARRAYIDX_11]], align 2 ; CHECK-NEXT: [[CONV_11:%.*]] = sext i16 [[TMP76]] to i32 ; CHECK-NEXT: [[CMP1_11:%.*]] = icmp sgt i16 [[TMP76]], 0 -; CHECK-NEXT: [[ARRAYIDX4_11:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 11 +; CHECK-NEXT: [[ARRAYIDX4_11:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 22 ; CHECK-NEXT: [[TMP77:%.*]] = load i16, ptr [[ARRAYIDX4_11]], align 2 ; CHECK-NEXT: [[CONV5_11:%.*]] = zext i16 [[TMP77]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_11:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 11 +; CHECK-NEXT: [[ARRAYIDX10_11:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 22 ; CHECK-NEXT: [[TMP78:%.*]] = load i16, ptr [[ARRAYIDX10_11]], align 2 ; CHECK-NEXT: [[CONV11_11:%.*]] = zext i16 [[TMP78]] to i32 ; CHECK-NEXT: br i1 [[CMP1_11]], label [[IF_THEN_11:%.*]], label [[IF_ELSE_11:%.*]] @@ -403,14 +403,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_11:%.*]] = phi i16 [ [[CONV28_11]], [[IF_ELSE_11]] ], [ [[CONV12_11]], [[IF_THEN_11]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_11]], ptr 
[[ARRAYIDX_11]], align 2 ; CHECK-NEXT: [[OR_1141:%.*]] = or i16 [[OR_1040]], [[STOREMERGE_11]] -; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 12 +; CHECK-NEXT: [[ARRAYIDX_12:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 24 ; CHECK-NEXT: [[TMP80:%.*]] = load i16, ptr [[ARRAYIDX_12]], align 2 ; CHECK-NEXT: [[CONV_12:%.*]] = sext i16 [[TMP80]] to i32 ; CHECK-NEXT: [[CMP1_12:%.*]] = icmp sgt i16 [[TMP80]], 0 -; CHECK-NEXT: [[ARRAYIDX4_12:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 12 +; CHECK-NEXT: [[ARRAYIDX4_12:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 24 ; CHECK-NEXT: [[TMP81:%.*]] = load i16, ptr [[ARRAYIDX4_12]], align 2 ; CHECK-NEXT: [[CONV5_12:%.*]] = zext i16 [[TMP81]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_12:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 12 +; CHECK-NEXT: [[ARRAYIDX10_12:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 24 ; CHECK-NEXT: [[TMP82:%.*]] = load i16, ptr [[ARRAYIDX10_12]], align 2 ; CHECK-NEXT: [[CONV11_12:%.*]] = zext i16 [[TMP82]] to i32 ; CHECK-NEXT: br i1 [[CMP1_12]], label [[IF_THEN_12:%.*]], label [[IF_ELSE_12:%.*]] @@ -431,14 +431,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_12:%.*]] = phi i16 [ [[CONV28_12]], [[IF_ELSE_12]] ], [ [[CONV12_12]], [[IF_THEN_12]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_12]], ptr [[ARRAYIDX_12]], align 2 ; CHECK-NEXT: [[OR_1242:%.*]] = or i16 [[OR_1141]], [[STOREMERGE_12]] -; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 13 +; CHECK-NEXT: [[ARRAYIDX_13:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 26 ; CHECK-NEXT: [[TMP84:%.*]] = load i16, ptr [[ARRAYIDX_13]], align 2 ; CHECK-NEXT: [[CONV_13:%.*]] = sext i16 [[TMP84]] to i32 ; CHECK-NEXT: [[CMP1_13:%.*]] = icmp sgt i16 [[TMP84]], 0 -; CHECK-NEXT: [[ARRAYIDX4_13:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 13 +; CHECK-NEXT: [[ARRAYIDX4_13:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 26 ; CHECK-NEXT: [[TMP85:%.*]] = load i16, ptr [[ARRAYIDX4_13]], align 2 ; CHECK-NEXT: [[CONV5_13:%.*]] = zext i16 [[TMP85]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_13:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 13 +; CHECK-NEXT: [[ARRAYIDX10_13:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 26 ; CHECK-NEXT: [[TMP86:%.*]] = load i16, ptr [[ARRAYIDX10_13]], align 2 ; CHECK-NEXT: [[CONV11_13:%.*]] = zext i16 [[TMP86]] to i32 ; CHECK-NEXT: br i1 [[CMP1_13]], label [[IF_THEN_13:%.*]], label [[IF_ELSE_13:%.*]] @@ -459,14 +459,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_13:%.*]] = phi i16 [ [[CONV28_13]], [[IF_ELSE_13]] ], [ [[CONV12_13]], [[IF_THEN_13]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_13]], ptr [[ARRAYIDX_13]], align 2 ; CHECK-NEXT: [[OR_1343:%.*]] = or i16 [[OR_1242]], [[STOREMERGE_13]] -; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 14 +; CHECK-NEXT: [[ARRAYIDX_14:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 28 ; CHECK-NEXT: [[TMP88:%.*]] = load i16, ptr [[ARRAYIDX_14]], align 2 ; CHECK-NEXT: [[CONV_14:%.*]] = sext i16 [[TMP88]] to i32 ; CHECK-NEXT: [[CMP1_14:%.*]] = icmp sgt i16 [[TMP88]], 0 -; CHECK-NEXT: [[ARRAYIDX4_14:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 14 +; CHECK-NEXT: [[ARRAYIDX4_14:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 28 ; CHECK-NEXT: [[TMP89:%.*]] = load i16, ptr [[ARRAYIDX4_14]], align 2 ; CHECK-NEXT: [[CONV5_14:%.*]] = zext i16 [[TMP89]] to i32 -; 
CHECK-NEXT: [[ARRAYIDX10_14:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 14 +; CHECK-NEXT: [[ARRAYIDX10_14:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 28 ; CHECK-NEXT: [[TMP90:%.*]] = load i16, ptr [[ARRAYIDX10_14]], align 2 ; CHECK-NEXT: [[CONV11_14:%.*]] = zext i16 [[TMP90]] to i32 ; CHECK-NEXT: br i1 [[CMP1_14]], label [[IF_THEN_14:%.*]], label [[IF_ELSE_14:%.*]] @@ -487,14 +487,14 @@ define i32 @quant_4x4(ptr noundef %dct, ptr noundef %mf, ptr noundef %bias) { ; CHECK-NEXT: [[STOREMERGE_14:%.*]] = phi i16 [ [[CONV28_14]], [[IF_ELSE_14]] ], [ [[CONV12_14]], [[IF_THEN_14]] ] ; CHECK-NEXT: store i16 [[STOREMERGE_14]], ptr [[ARRAYIDX_14]], align 2 ; CHECK-NEXT: [[OR_1444:%.*]] = or i16 [[OR_1343]], [[STOREMERGE_14]] -; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i16, ptr [[DCT]], i64 15 +; CHECK-NEXT: [[ARRAYIDX_15:%.*]] = getelementptr inbounds i8, ptr [[DCT]], i64 30 ; CHECK-NEXT: [[TMP92:%.*]] = load i16, ptr [[ARRAYIDX_15]], align 2 ; CHECK-NEXT: [[CONV_15:%.*]] = sext i16 [[TMP92]] to i32 ; CHECK-NEXT: [[CMP1_15:%.*]] = icmp sgt i16 [[TMP92]], 0 -; CHECK-NEXT: [[ARRAYIDX4_15:%.*]] = getelementptr inbounds i16, ptr [[BIAS]], i64 15 +; CHECK-NEXT: [[ARRAYIDX4_15:%.*]] = getelementptr inbounds i8, ptr [[BIAS]], i64 30 ; CHECK-NEXT: [[TMP93:%.*]] = load i16, ptr [[ARRAYIDX4_15]], align 2 ; CHECK-NEXT: [[CONV5_15:%.*]] = zext i16 [[TMP93]] to i32 -; CHECK-NEXT: [[ARRAYIDX10_15:%.*]] = getelementptr inbounds i16, ptr [[MF]], i64 15 +; CHECK-NEXT: [[ARRAYIDX10_15:%.*]] = getelementptr inbounds i8, ptr [[MF]], i64 30 ; CHECK-NEXT: [[TMP94:%.*]] = load i16, ptr [[ARRAYIDX10_15]], align 2 ; CHECK-NEXT: [[CONV11_15:%.*]] = zext i16 [[TMP94]] to i32 ; CHECK-NEXT: br i1 [[CMP1_15]], label [[IF_THEN_15:%.*]], label [[IF_ELSE_15:%.*]] diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/sinking-vs-if-conversion.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/sinking-vs-if-conversion.ll index 06d7b36509a52..885529926fc4b 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/sinking-vs-if-conversion.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/sinking-vs-if-conversion.ll @@ -12,13 +12,13 @@ define void @test_find_min(ptr noundef nonnull align 8 dereferenceable(24) %this ; CHECK-LABEL: define void @test_find_min( ; CHECK-SAME: ptr nocapture noundef nonnull align 8 dereferenceable(24) [[THIS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[NUM_:%.*]] = getelementptr inbounds [[STRUCT_TREE:%.*]], ptr [[THIS]], i64 0, i32 2 +; CHECK-NEXT: [[NUM_:%.*]] = getelementptr inbounds i8, ptr [[THIS]], i64 16 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[NUM_]], align 8 ; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[THIS]], align 8 ; CHECK-NEXT: [[CMP8:%.*]] = icmp sgt i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[CMP8]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_END:%.*]] ; CHECK: for.body.lr.ph: -; CHECK-NEXT: [[ARRAY_:%.*]] = getelementptr inbounds [[STRUCT_TREE]], ptr [[THIS]], i64 0, i32 1 +; CHECK-NEXT: [[ARRAY_:%.*]] = getelementptr inbounds i8, ptr [[THIS]], i64 8 ; CHECK-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAY_]], align 8 ; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext nneg i32 [[TMP0]] to i64 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] @@ -30,9 +30,9 @@ define void @test_find_min(ptr noundef nonnull align 8 dereferenceable(24) %this ; CHECK-NEXT: [[CMP3:%.*]] = icmp eq ptr [[MIN_010]], null ; CHECK-NEXT: br i1 [[CMP3]], label [[COND_END7]], label [[COND_FALSE:%.*]] ; CHECK: cond.false: -; CHECK-NEXT: [[KEY2:%.*]] = getelementptr inbounds 
[[STRUCT_NODE:%.*]], ptr [[MIN_010]], i64 0, i32 1 +; CHECK-NEXT: [[KEY2:%.*]] = getelementptr inbounds i8, ptr [[MIN_010]], i64 4 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[KEY2]], align 4 -; CHECK-NEXT: [[KEY:%.*]] = getelementptr inbounds [[STRUCT_NODE]], ptr [[TMP3]], i64 0, i32 1 +; CHECK-NEXT: [[KEY:%.*]] = getelementptr inbounds i8, ptr [[TMP3]], i64 4 ; CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[KEY]], align 4 ; CHECK-NEXT: [[CMP4:%.*]] = icmp slt i32 [[TMP5]], [[TMP4]] ; CHECK-NEXT: [[COND:%.*]] = select i1 [[CMP4]], ptr [[TMP3]], ptr [[MIN_010]] diff --git a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll index 7cbce461a4924..73bcee5fb74f0 100644 --- a/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll +++ b/llvm/test/Transforms/PhaseOrdering/ARM/arm_mult_q15.ll @@ -60,21 +60,21 @@ define void @arm_mult_q15(ptr %pSrcA, ptr %pSrcB, ptr noalias %pDst, i32 %blockS ; CHECK-NEXT: [[PSRCA_ADDR_05:%.*]] = phi ptr [ [[INCDEC_PTR:%.*]], [[WHILE_BODY]] ], [ [[PSRCA_ADDR_05_PH]], [[WHILE_BODY_PREHEADER16]] ] ; CHECK-NEXT: [[PDST_ADDR_04:%.*]] = phi ptr [ [[INCDEC_PTR4:%.*]], [[WHILE_BODY]] ], [ [[PDST_ADDR_04_PH]], [[WHILE_BODY_PREHEADER16]] ] ; CHECK-NEXT: [[PSRCB_ADDR_03:%.*]] = phi ptr [ [[INCDEC_PTR1:%.*]], [[WHILE_BODY]] ], [ [[PSRCB_ADDR_03_PH]], [[WHILE_BODY_PREHEADER16]] ] -; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i16, ptr [[PSRCA_ADDR_05]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds i8, ptr [[PSRCA_ADDR_05]], i32 2 ; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[PSRCA_ADDR_05]], align 2 ; CHECK-NEXT: [[CONV:%.*]] = sext i16 [[TMP13]] to i32 -; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i16, ptr [[PSRCB_ADDR_03]], i32 1 +; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds i8, ptr [[PSRCB_ADDR_03]], i32 2 ; CHECK-NEXT: [[TMP14:%.*]] = load i16, ptr [[PSRCB_ADDR_03]], align 2 ; CHECK-NEXT: [[CONV2:%.*]] = sext i16 [[TMP14]] to i32 ; CHECK-NEXT: [[MUL:%.*]] = mul nsw i32 [[CONV2]], [[CONV]] ; CHECK-NEXT: [[SHR:%.*]] = ashr i32 [[MUL]], 15 -; CHECK-NEXT: [[TMP15:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767) -; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[TMP15]] to i16 -; CHECK-NEXT: [[INCDEC_PTR4]] = getelementptr inbounds i16, ptr [[PDST_ADDR_04]], i32 1 +; CHECK-NEXT: [[SPEC_SELECT_I:%.*]] = tail call i32 @llvm.smin.i32(i32 [[SHR]], i32 32767) +; CHECK-NEXT: [[CONV3:%.*]] = trunc i32 [[SPEC_SELECT_I]] to i16 +; CHECK-NEXT: [[INCDEC_PTR4]] = getelementptr inbounds i8, ptr [[PDST_ADDR_04]], i32 2 ; CHECK-NEXT: store i16 [[CONV3]], ptr [[PDST_ADDR_04]], align 2 ; CHECK-NEXT: [[DEC]] = add i32 [[BLKCNT_06]], -1 ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[DEC]], 0 -; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP2:![0-9]+]] +; CHECK-NEXT: br i1 [[CMP_NOT]], label [[WHILE_END]], label [[WHILE_BODY]], !llvm.loop [[LOOP3:![0-9]+]] ; CHECK: while.end: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll index 8f37e79539b8c..fd6bb61d0369d 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/excessive-unrolling.ll @@ -180,17 +180,17 @@ define void @test_runtime_trip_count(i32 %N) { ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds 
[58 x double], ptr @b, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 2 +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <2 x double>, ptr [[TMP0]], align 16 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x double>, ptr [[TMP1]], align 16 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [58 x double], ptr @c, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, ptr [[TMP2]], i64 2 +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 16 ; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load <2 x double>, ptr [[TMP2]], align 16 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <2 x double>, ptr [[TMP3]], align 16 ; CHECK-NEXT: [[TMP4:%.*]] = fadd <2 x double> [[WIDE_LOAD]], [[WIDE_LOAD5]] ; CHECK-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[WIDE_LOAD4]], [[WIDE_LOAD6]] ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [58 x double], ptr @a, i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP6]], i64 2 +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP6]], i64 16 ; CHECK-NEXT: store <2 x double> [[TMP4]], ptr [[TMP6]], align 16 ; CHECK-NEXT: store <2 x double> [[TMP5]], ptr [[TMP7]], align 16 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll index eb8fe10274521..f0a1846389580 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll @@ -56,7 +56,7 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8 ; O2: vector.body: ; O2-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_BODY4_PREHEADER]] ] ; O2-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[INDEX]] -; O2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; O2-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; O2-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] ; O2-NEXT: [[WIDE_LOAD8:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA0]] ; O2-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], @@ -76,7 +76,7 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8 ; O2: for.cond.cleanup3: ; O2-NEXT: [[INC7]] = add nuw nsw i64 [[I_06]], 1 ; O2-NEXT: [[EXITCOND7_NOT:%.*]] = icmp eq i64 [[INC7]], 100 -; O2-NEXT: br i1 [[EXITCOND7_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP7:![0-9]+]] +; O2-NEXT: br i1 [[EXITCOND7_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER]], !llvm.loop [[LOOP8:![0-9]+]] ; O2: for.body4: ; O2-NEXT: [[J_05:%.*]] = phi i64 [ [[INC5:%.*]], [[FOR_BODY4]] ], [ [[J_05_PH]], [[FOR_BODY4_PREHEADER9]] ] ; O2-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[J_05]] @@ -85,7 +85,7 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8 ; O2-NEXT: store i32 [[INC]], ptr [[ADD_PTR_I]], align 4, !tbaa [[TBAA0]] ; O2-NEXT: [[INC5]] = add nuw i64 [[J_05]], 1 ; O2-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC5]], [[NUMELEMS]] -; O2-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4]], !llvm.loop [[LOOP8:![0-9]+]] +; O2-NEXT: br i1 [[EXITCOND_NOT]], label 
[[FOR_COND_CLEANUP3]], label [[FOR_BODY4]], !llvm.loop [[LOOP9:![0-9]+]] ; ; O3-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy ; O3-SAME: (ptr nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { @@ -104,7 +104,7 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8 ; O3: vector.body: ; O3-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_COND1_PREHEADER_US]] ] ; O3-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 [[INDEX]] -; O3-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; O3-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; O3-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] ; O3-NEXT: [[WIDE_LOAD9:%.*]] = load <4 x i32>, ptr [[TMP2]], align 4, !tbaa [[TBAA0]] ; O3-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], @@ -127,7 +127,7 @@ define dso_local void @_Z7computeRSt6vectorIiSaIiEEy(ptr noundef nonnull align 8 ; O3-NEXT: store i32 [[INC_US]], ptr [[ADD_PTR_I_US]], align 4, !tbaa [[TBAA0]] ; O3-NEXT: [[INC5_US]] = add nuw i64 [[J_05_US]], 1 ; O3-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC5_US]], [[NUMELEMS]] -; O3-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US]], !llvm.loop [[LOOP7:![0-9]+]] +; O3-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US]], !llvm.loop [[LOOP8:![0-9]+]] ; O3: for.cond1.for.cond.cleanup3_crit_edge.us: ; O3-NEXT: [[INC7_US]] = add nuw nsw i64 [[I_06_US]], 1 ; O3-NEXT: [[EXITCOND8_NOT:%.*]] = icmp eq i64 [[INC7_US]], 100 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll b/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll index c50f164046301..05e39efc7745a 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll @@ -43,7 +43,7 @@ define void @loop_or(ptr noalias %pIn, ptr noalias %pOut, i32 %s) { ; CHECK-NEXT: [[TMP6:%.*]] = or disjoint <4 x i32> [[TMP4]], ; CHECK-NEXT: [[TMP7:%.*]] = or disjoint <4 x i32> [[TMP5]], ; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[POUT:%.*]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i64 4 +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP8]], i64 16 ; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[TMP8]], align 4 ; CHECK-NEXT: store <4 x i32> [[TMP7]], ptr [[TMP9]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll index b52dd6743649a..40c42ffdfd107 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/pr48844-br-to-switch-vectorization.ll @@ -23,7 +23,7 @@ define dso_local void @test(ptr %start, ptr %end) #0 { ; CHECK-NEXT: store i32 42, ptr [[PTR2]], align 4 ; CHECK-NEXT: br label [[LATCH]] ; CHECK: latch: -; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i32, ptr [[PTR2]], i64 1 +; CHECK-NEXT: [[PTR_NEXT]] = getelementptr inbounds i8, ptr [[PTR2]], i64 4 ; CHECK-NEXT: [[I11_NOT:%.*]] = icmp eq ptr [[PTR_NEXT]], [[END]] ; CHECK-NEXT: br i1 [[I11_NOT]], label [[EXIT]], label [[BB12]] ; CHECK: exit: diff --git 
a/llvm/test/Transforms/PhaseOrdering/X86/pr50555.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50555.ll
index 18d1c64268550..d9101272ba3d0 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr50555.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50555.ll
@@ -11,7 +11,7 @@ define void @trunc_through_one_add(ptr noalias %0, ptr noalias readonly %1) {
 ; SSE-NEXT: [[TMP7:%.*]] = lshr <8 x i16> [[TMP6]],
 ; SSE-NEXT: store <8 x i16> [[TMP7]], ptr [[TMP0:%.*]], align 2
 ; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8
-; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 8
+; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
 ; SSE-NEXT: [[TMP10:%.*]] = load <8 x i8>, ptr [[TMP8]], align 1
 ; SSE-NEXT: [[TMP11:%.*]] = zext <8 x i8> [[TMP10]] to <8 x i16>
 ; SSE-NEXT: [[TMP12:%.*]] = lshr <8 x i16> [[TMP11]],
@@ -187,7 +187,7 @@ define void @trunc_through_two_adds(ptr noalias %0, ptr noalias readonly %1, ptr
 ; SSE-NEXT: store <8 x i16> [[TMP11]], ptr [[TMP0:%.*]], align 2
 ; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 8
 ; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP2]], i64 8
-; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i16, ptr [[TMP0]], i64 8
+; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
 ; SSE-NEXT: [[TMP15:%.*]] = load <8 x i8>, ptr [[TMP12]], align 1
 ; SSE-NEXT: [[TMP16:%.*]] = zext <8 x i8> [[TMP15]] to <8 x i16>
 ; SSE-NEXT: [[TMP17:%.*]] = load <8 x i8>, ptr [[TMP13]], align 1
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll
index 24cc18625af82..c5deb716d8030 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll
@@ -40,7 +40,7 @@ define void @licm(ptr align 8 dereferenceable(8) %_M_start.i, i64 %numElem) {
 ; O23: vector.body:
 ; O23-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; O23-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, ptr [[TMP0]], i64 [[INDEX]]
-; O23-NEXT: [[TMP2:%.*]] = getelementptr inbounds double, ptr [[TMP1]], i64 2
+; O23-NEXT: [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; O23-NEXT: store <2 x double> , ptr [[TMP1]], align 8, !tbaa [[TBAA8:![0-9]+]]
 ; O23-NEXT: store <2 x double> , ptr [[TMP2]], align 8, !tbaa [[TBAA8]]
 ; O23-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll b/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll
index 57951d7754269..e645a309b4793 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll
@@ -16,14 +16,14 @@ define dso_local void @_Z13vecIncFromPtrP12FloatVecPair(ptr %FVP) {
 ; O1-LABEL: define {{[^@]+}}@_Z13vecIncFromPtrP12FloatVecPair
 ; O1-SAME: (ptr nocapture readonly [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; O1-NEXT: entry:
-; O1-NEXT: [[VSRC23_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR:%.*]], ptr [[FVP]], i64 0, i32 1
+; O1-NEXT: [[VSRC23_I:%.*]] = getelementptr inbounds i8, ptr [[FVP]], i64 16
 ; O1-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VSRC23_I]], align 8, !tbaa [[TBAA0:![0-9]+]]
-; O1-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], ptr [[TMP0]], i64 undef, i32 1
+; O1-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], ptr [[TMP0]], i64 undef
+; O1-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX_I_I]], i64 8
 ; O1-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIZE4_I]], align 8, !tbaa [[TBAA6:![0-9]+]]
 ; O1-NEXT: [[CMP56_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0
 ; O1-NEXT: br i1 [[CMP56_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]]
 ; O1: for.body7.lr.ph.i:
-; O1-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], ptr [[TMP0]], i64 undef
 ; O1-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX_I_I]], align 8, !tbaa [[TBAA8:![0-9]+]]
 ; O1-NEXT: [[ARRAYIDX_I3_I:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 undef
 ; O1-NEXT: [[TMP3:%.*]] = load ptr, ptr [[FVP]], align 8, !tbaa [[TBAA0]]
@@ -46,14 +46,14 @@ define dso_local void @_Z13vecIncFromPtrP12FloatVecPair(ptr %FVP) {
 ; O23-LABEL: define {{[^@]+}}@_Z13vecIncFromPtrP12FloatVecPair
 ; O23-SAME: (ptr nocapture readonly [[FVP:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; O23-NEXT: entry:
-; O23-NEXT: [[VSRC23_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR:%.*]], ptr [[FVP]], i64 0, i32 1
+; O23-NEXT: [[VSRC23_I:%.*]] = getelementptr inbounds i8, ptr [[FVP]], i64 16
 ; O23-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VSRC23_I]], align 8, !tbaa [[TBAA0:![0-9]+]]
-; O23-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], ptr [[TMP0]], i64 undef, i32 1
+; O23-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], ptr [[TMP0]], i64 undef
+; O23-NEXT: [[SIZE4_I:%.*]] = getelementptr inbounds i8, ptr [[ARRAYIDX_I_I]], i64 8
 ; O23-NEXT: [[TMP1:%.*]] = load i32, ptr [[SIZE4_I]], align 8, !tbaa [[TBAA6:![0-9]+]]
 ; O23-NEXT: [[CMP56_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0
 ; O23-NEXT: br i1 [[CMP56_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]]
 ; O23: for.body7.lr.ph.i:
-; O23-NEXT: [[ARRAYIDX_I_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], ptr [[TMP0]], i64 undef
 ; O23-NEXT: [[TMP2:%.*]] = load ptr, ptr [[ARRAYIDX_I_I]], align 8, !tbaa [[TBAA8:![0-9]+]]
 ; O23-NEXT: [[ARRAYIDX_I3_I:%.*]] = getelementptr inbounds float, ptr [[TMP2]], i64 undef
 ; O23-NEXT: [[TMP3:%.*]] = load ptr, ptr [[FVP]], align 8, !tbaa [[TBAA0]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
index 5053fd64f59ca..87d3900765490 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vdiv.ll
@@ -37,9 +37,9 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds double, ptr [[Y]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i64 4
-; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i64 8
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds double, ptr [[TMP5]], i64 12
+; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 32
+; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 64
+; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP5]], i64 96
 ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x double>, ptr [[TMP5]], align 8, !tbaa [[TBAA3:![0-9]+]]
 ; CHECK-NEXT: [[WIDE_LOAD6:%.*]] = load <4 x double>, ptr [[TMP6]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: [[WIDE_LOAD7:%.*]] = load <4 x double>, ptr [[TMP7]], align 8, !tbaa [[TBAA3]]
@@ -49,9 +49,9 @@ define void @vdiv(ptr %x, ptr %y, double %a, i32 %N) #0 {
 ; CHECK-NEXT: [[TMP11:%.*]] = fmul fast <4 x double> [[WIDE_LOAD7]], [[TMP3]]
 ; CHECK-NEXT: [[TMP12:%.*]] = fmul fast <4 x double> [[WIDE_LOAD8]], [[TMP4]]
 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds double, ptr [[X]], i64 [[INDEX]]
-; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 4
-; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 8
-; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds double, ptr [[TMP13]], i64 12
+; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i64 32
+; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i64 64
+; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP13]], i64 96
 ; CHECK-NEXT: store <4 x double> [[TMP9]], ptr [[TMP13]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: store <4 x double> [[TMP10]], ptr [[TMP14]], align 8, !tbaa [[TBAA3]]
 ; CHECK-NEXT: store <4 x double> [[TMP11]], ptr [[TMP15]], align 8, !tbaa [[TBAA3]]
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-shift.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-shift.ll
index ba7f8009349ad..7ee9812b3e74c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-shift.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-shift.ll
@@ -37,17 +37,17 @@ define void @bar(ptr noundef %0) {
 ; SSE-NEXT: [[TMP3:%.*]] = shl <2 x i64> [[TMP2]],
 ; SSE-NEXT: [[TMP4:%.*]] = sub nuw nsw <2 x i64> , [[TMP3]]
 ; SSE-NEXT: store <2 x i64> [[TMP4]], ptr [[TMP0]], align 8
-; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 2
+; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
 ; SSE-NEXT: [[TMP6:%.*]] = load <2 x i64>, ptr [[TMP5]], align 8
 ; SSE-NEXT: [[TMP7:%.*]] = shl <2 x i64> [[TMP6]],
 ; SSE-NEXT: [[TMP8:%.*]] = sub nuw nsw <2 x i64> , [[TMP7]]
 ; SSE-NEXT: store <2 x i64> [[TMP8]], ptr [[TMP5]], align 8
-; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 4
+; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 32
 ; SSE-NEXT: [[TMP10:%.*]] = load <2 x i64>, ptr [[TMP9]], align 8
 ; SSE-NEXT: [[TMP11:%.*]] = shl <2 x i64> [[TMP10]],
 ; SSE-NEXT: [[TMP12:%.*]] = sub nuw nsw <2 x i64> , [[TMP11]]
 ; SSE-NEXT: store <2 x i64> [[TMP12]], ptr [[TMP9]], align 8
-; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 6
+; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 48
 ; SSE-NEXT: [[TMP14:%.*]] = load <2 x i64>, ptr [[TMP13]], align 8
 ; SSE-NEXT: [[TMP15:%.*]] = shl <2 x i64> [[TMP14]],
 ; SSE-NEXT: [[TMP16:%.*]] = sub nuw nsw <2 x i64> , [[TMP15]]
@@ -59,7 +59,7 @@ define void @bar(ptr noundef %0) {
 ; AVX-NEXT: [[TMP3:%.*]] = shl <4 x i64> [[TMP2]],
 ; AVX-NEXT: [[TMP4:%.*]] = sub nuw nsw <4 x i64> , [[TMP3]]
 ; AVX-NEXT: store <4 x i64> [[TMP4]], ptr [[TMP0]], align 8
-; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 4
+; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 32
 ; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr [[TMP5]], align 8
 ; AVX-NEXT: [[TMP7:%.*]] = shl <4 x i64> [[TMP6]],
 ; AVX-NEXT: [[TMP8:%.*]] = sub nuw nsw <4 x i64> , [[TMP7]]
diff --git a/llvm/test/Transforms/PhaseOrdering/basic.ll b/llvm/test/Transforms/PhaseOrdering/basic.ll
index 43d2e522e96e3..343afedff1f85 100644
--- a/llvm/test/Transforms/PhaseOrdering/basic.ll
+++ b/llvm/test/Transforms/PhaseOrdering/basic.ll
@@ -32,10 +32,10 @@ define void @test1() nounwind ssp {
 define i32 @test2(i32 %a, ptr %p) nounwind uwtable ssp {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[DIV:%.*]] = lshr i32 [[A:%.*]], 2
-; CHECK-NEXT: store i32 [[DIV]], ptr [[P:%.*]], align 4
-; CHECK-NEXT: [[ADD:%.*]] = shl nuw nsw i32 [[DIV]], 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P]], i64 1
+; CHECK-NEXT: [[DIV1:%.*]] = lshr i32 [[A:%.*]], 2
+; CHECK-NEXT: store i32 [[DIV1]], ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[ADD:%.*]] = shl nuw nsw i32 [[DIV1]], 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 4
 ; CHECK-NEXT: store i32 [[ADD]], ptr [[ARRAYIDX1]], align 4
 ; CHECK-NEXT: ret i32 0
 ;
diff --git a/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll b/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll
index dbe29a071092e..bd509509c321f 100644
--- a/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll
+++ b/llvm/test/Transforms/PhaseOrdering/loop-access-checks.ll
@@ -38,7 +38,7 @@ define void @test_fill_with_foreach([2 x i64] %elems.coerce) {
 ; CHECK: for.body:
 ; CHECK-NEXT: [[__BEGIN1_SROA_0_03:%.*]] = phi ptr [ [[INCDEC_PTR_I:%.*]], [[FOR_BODY]] ], [ [[TMP0]], [[FOR_COND_PREHEADER_SPLIT]] ]
 ; CHECK-NEXT: tail call void @use(ptr noundef nonnull align 4 dereferenceable(4) [[__BEGIN1_SROA_0_03]])
-; CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i32, ptr [[__BEGIN1_SROA_0_03]], i64 1
+; CHECK-NEXT: [[INCDEC_PTR_I]] = getelementptr inbounds i8, ptr [[__BEGIN1_SROA_0_03]], i64 4
 ; CHECK-NEXT: [[CMP_I_NOT:%.*]] = icmp eq ptr [[INCDEC_PTR_I]], [[ADD_PTR_I]]
 ; CHECK-NEXT: br i1 [[CMP_I_NOT]], label [[COMMON_RET]], label [[FOR_BODY]]
 ;
@@ -133,7 +133,7 @@ define void @foo(ptr noundef nonnull align 8 dereferenceable(24) noalias %vec) #
 ; CHECK-LABEL: define void @foo
 ; CHECK-SAME: (ptr noalias nocapture noundef nonnull readonly align 8 dereferenceable(24) [[VEC:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[_M_FINISH_I_I:%.*]] = getelementptr inbounds [[VECTOR_IMPL_DATA:%.*]], ptr [[VEC]], i64 0, i32 1
+; CHECK-NEXT: [[_M_FINISH_I_I:%.*]] = getelementptr inbounds i8, ptr [[VEC]], i64 8
 ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[_M_FINISH_I_I]], align 8, !tbaa [[TBAA0:![0-9]+]]
 ; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC]], align 8, !tbaa [[TBAA5:![0-9]+]]
 ; CHECK-NEXT: [[SUB_PTR_LHS_CAST_I_I:%.*]] = ptrtoint ptr [[TMP0]] to i64
@@ -272,7 +272,7 @@ define void @loop_with_signed_induction(ptr noundef nonnull align 8 dereferencea
 ; CHECK-LABEL: define void @loop_with_signed_induction
 ; CHECK-SAME: (ptr nocapture noundef nonnull readonly align 8 dereferenceable(24) [[VEC:%.*]]) local_unnamed_addr #[[ATTR0]] {
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[_M_FINISH_I_I:%.*]] = getelementptr inbounds [[VECTOR_IMPL_DATA:%.*]], ptr [[VEC]], i64 0, i32 1
+; CHECK-NEXT: [[_M_FINISH_I_I:%.*]] = getelementptr inbounds i8, ptr [[VEC]], i64 8
 ; CHECK-NEXT: [[TMP0:%.*]] = load ptr, ptr [[_M_FINISH_I_I]], align 8, !tbaa [[TBAA0]]
 ; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[VEC]], align 8, !tbaa [[TBAA5]]
 ; CHECK-NEXT: [[SUB_PTR_LHS_CAST_I_I:%.*]] = ptrtoint ptr [[TMP0]] to i64
diff --git a/llvm/test/Transforms/PhaseOrdering/pr39282.ll b/llvm/test/Transforms/PhaseOrdering/pr39282.ll
index 776c5c482bab5..2bbc01b677bb6 100644
--- a/llvm/test/Transforms/PhaseOrdering/pr39282.ll
+++ b/llvm/test/Transforms/PhaseOrdering/pr39282.ll
@@ -21,8 +21,8 @@ define void @pr39282(ptr %addr1, ptr %addr2) {
 ; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META3:![0-9]+]])
 ; CHECK-NEXT: [[X_I:%.*]] = load i32, ptr [[ADDR1:%.*]], align 4, !alias.scope !3, !noalias !0
 ; CHECK-NEXT: store i32 [[X_I]], ptr [[ADDR2:%.*]], align 4, !alias.scope !0, !noalias !3
-; CHECK-NEXT: [[ADDR1I_1:%.*]] = getelementptr inbounds i32, ptr [[ADDR1]], i64 1
-; CHECK-NEXT: [[ADDR2I_1:%.*]] = getelementptr inbounds i32, ptr [[ADDR2]], i64 1
+; CHECK-NEXT: [[ADDR1I_1:%.*]] = getelementptr inbounds i8, ptr [[ADDR1]], i64 4
+; CHECK-NEXT: [[ADDR2I_1:%.*]] = getelementptr inbounds i8, ptr [[ADDR2]], i64 4
 ; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META5:![0-9]+]])
 ; CHECK-NEXT: tail call void @llvm.experimental.noalias.scope.decl(metadata [[META7:![0-9]+]])
 ; CHECK-NEXT: [[X_I_1:%.*]] = load i32, ptr [[ADDR1I_1]], align 4, !alias.scope !7, !noalias !5
diff --git a/llvm/test/Transforms/PhaseOrdering/simplifycfg-options.ll b/llvm/test/Transforms/PhaseOrdering/simplifycfg-options.ll
index f186aceca4993..f5807c2105abe 100644
--- a/llvm/test/Transforms/PhaseOrdering/simplifycfg-options.ll
+++ b/llvm/test/Transforms/PhaseOrdering/simplifycfg-options.ll
@@ -9,7 +9,7 @@ define i1 @PR33605(i32 %a, i32 %b, ptr %c) {
 ; CHECK-LABEL: @PR33605(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[OR:%.*]] = or i32 [[B:%.*]], [[A:%.*]]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[C:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[C:%.*]], i64 4
 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4
 ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[OR]], [[TMP0]]
 ; CHECK-NEXT: br i1 [[CMP]], label [[IF_THEN:%.*]], label [[IF_END:%.*]]
diff --git a/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll b/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
index 8f497e92e4ea1..aba003b13b459 100644
--- a/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
+++ b/llvm/test/Transforms/PhaseOrdering/switch_with_geps.ll
@@ -10,34 +10,11 @@ target datalayout = "n64"
 %"OpKind::Two" = type { [1 x i32], i32, i16, i16 }
 %"OpKind::Three" = type { [1 x i32], i32, i16, i16, i16, [1 x i16] }
-; FIXME: The switch should be optimized away.
 define i32 @test(ptr %ptr) {
 ; CHECK-LABEL: define i32 @test(
 ; CHECK-SAME: ptr nocapture readonly [[PTR:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] {
 ; CHECK-NEXT: start:
-; CHECK-NEXT: [[T:%.*]] = load i32, ptr [[PTR]], align 4
-; CHECK-NEXT: switch i32 [[T]], label [[DEFAULT:%.*]] [
-; CHECK-NEXT: i32 0, label [[BB4:%.*]]
-; CHECK-NEXT: i32 1, label [[BB5:%.*]]
-; CHECK-NEXT: i32 2, label [[BB6:%.*]]
-; CHECK-NEXT: i32 3, label [[BB7:%.*]]
-; CHECK-NEXT: ]
-; CHECK: default:
-; CHECK-NEXT: unreachable
-; CHECK: bb4:
-; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds %"OpKind::Zero", ptr [[PTR]], i64 0, i32 1
-; CHECK-NEXT: br label [[EXIT:%.*]]
-; CHECK: bb5:
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds %"OpKind::One", ptr [[PTR]], i64 0, i32 1
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: bb6:
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds %"OpKind::Two", ptr [[PTR]], i64 0, i32 1
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: bb7:
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds %"OpKind::Three", ptr [[PTR]], i64 0, i32 1
-; CHECK-NEXT: br label [[EXIT]]
-; CHECK: exit:
-; CHECK-NEXT: [[PHI:%.*]] = phi ptr [ [[GEP3]], [[BB7]] ], [ [[GEP2]], [[BB6]] ], [ [[GEP1]], [[BB5]] ], [ [[GEP0]], [[BB4]] ]
+; CHECK-NEXT: [[PHI:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 4
 ; CHECK-NEXT: [[RET:%.*]] = load i32, ptr [[PHI]], align 4
 ; CHECK-NEXT: ret i32 [[RET]]
 ;
@@ -77,38 +54,13 @@ exit:
 %X = type { i64, i64, i64, i64, i64, i64 }
-; FIXME: The switch should be optimized away.
 define void @test2(ptr %self, i64 %v, i64 %ix) {
 ; CHECK-LABEL: define void @test2(
 ; CHECK-SAME: ptr nocapture writeonly [[SELF:%.*]], i64 [[V:%.*]], i64 [[IX:%.*]]) local_unnamed_addr #[[ATTR1:[0-9]+]] {
 ; CHECK-NEXT: start:
-; CHECK-NEXT: switch i64 [[IX]], label [[DEFAULT:%.*]] [
-; CHECK-NEXT: i64 1, label [[BB3:%.*]]
-; CHECK-NEXT: i64 2, label [[BB4:%.*]]
-; CHECK-NEXT: i64 3, label [[BB5:%.*]]
-; CHECK-NEXT: i64 4, label [[BB6:%.*]]
-; CHECK-NEXT: i64 5, label [[BB7:%.*]]
-; CHECK-NEXT: ]
-; CHECK: default:
-; CHECK-NEXT: unreachable
-; CHECK: bb3:
-; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds [[X:%.*]], ptr [[SELF]], i64 0, i32 1
-; CHECK-NEXT: br label [[BB8:%.*]]
-; CHECK: bb4:
-; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds [[X]], ptr [[SELF]], i64 0, i32 2
-; CHECK-NEXT: br label [[BB8]]
-; CHECK: bb5:
-; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds [[X]], ptr [[SELF]], i64 0, i32 3
-; CHECK-NEXT: br label [[BB8]]
-; CHECK: bb6:
-; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds [[X]], ptr [[SELF]], i64 0, i32 4
-; CHECK-NEXT: br label [[BB8]]
-; CHECK: bb7:
-; CHECK-NEXT: [[GEP5:%.*]] = getelementptr inbounds [[X]], ptr [[SELF]], i64 0, i32 5
-; CHECK-NEXT: br label [[BB8]]
-; CHECK: bb8:
-; CHECK-NEXT: [[PTR:%.*]] = phi ptr [ [[GEP5]], [[BB7]] ], [ [[GEP4]], [[BB6]] ], [ [[GEP3]], [[BB5]] ], [ [[GEP2]], [[BB4]] ], [ [[GEP1]], [[BB3]] ]
-; CHECK-NEXT: store i64 [[V]], ptr [[PTR]], align 8
+; CHECK-NEXT: [[SWITCH_TABLEIDX:%.*]] = shl i64 [[IX]], 3
+; CHECK-NEXT: [[GEP5:%.*]] = getelementptr inbounds i8, ptr [[SELF]], i64 [[SWITCH_TABLEIDX]]
+; CHECK-NEXT: store i64 [[V]], ptr [[GEP5]], align 8
 ; CHECK-NEXT: ret void
 ;
 start:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
index 3a2a482a1ae7f..2ea4721692503 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-cost.ll
@@ -16,8 +16,8 @@ target triple = "aarch64--linux-gnu"
 define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
 ; CHECK-LABEL: @gather_multiple_use(
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[C:%.*]], i64 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[A:%.*]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[A:%.*]], i64 0
+; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[C:%.*]], i64 1
 ; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[B:%.*]], i64 2
 ; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[D:%.*]], i64 3
 ; CHECK-NEXT: [[TMP5:%.*]] = lshr <4 x i32> [[TMP4]],
@@ -57,10 +57,10 @@ define internal i32 @gather_multiple_use(i32 %a, i32 %b, i32 %c, i32 %d) {
 @data = global [6 x [258 x i8]] zeroinitializer, align 1
 define void @gather_load(ptr noalias %ptr) {
 ; CHECK-LABEL: @gather_load(
-; CHECK-NEXT: [[ARRAYIDX182:%.*]] = getelementptr inbounds i16, ptr [[PTR:%.*]], i64 1
-; CHECK-NEXT: [[ARRAYIDX183:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 2
-; CHECK-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 3
-; CHECK-NEXT: [[ARRAYIDX185:%.*]] = getelementptr inbounds i16, ptr [[PTR]], i64 4
+; CHECK-NEXT: [[ARRAYIDX182:%.*]] = getelementptr inbounds i8, ptr [[PTR:%.*]], i64 2
+; CHECK-NEXT: [[ARRAYIDX183:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 4
+; CHECK-NEXT: [[ARRAYIDX184:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 6
+; CHECK-NEXT: [[ARRAYIDX185:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8
 ; CHECK-NEXT: [[L0:%.*]] = load i8, ptr getelementptr inbounds ([6 x [258 x i8]], ptr @data, i64 0, i64 1, i64 0), align 1
 ; CHECK-NEXT: [[CONV150:%.*]] = zext i8 [[L0]] to i16
 ; CHECK-NEXT: [[ADD152:%.*]] = add nuw nsw i16 [[CONV150]], 10
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
index d13597b43f9f7..51c6673a14353 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/gather-reduce.ll
@@ -36,7 +36,7 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read
 ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i8, ptr [[A_ADDR_0101]], i64 16
 ; GENERIC-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; GENERIC-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; GENERIC-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
@@ -109,7 +109,7 @@ define i32 @gather_reduce_8x16_i32(ptr nocapture readonly %a, ptr nocapture read
 ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i8, ptr [[A_ADDR_0101]], i64 16
 ; KRYO-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; KRYO-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; KRYO-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
@@ -293,7 +293,7 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read
 ; GENERIC-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; GENERIC-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; GENERIC-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i8, ptr [[A_ADDR_0101]], i64 16
 ; GENERIC-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; GENERIC-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; GENERIC-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
@@ -366,7 +366,7 @@ define i32 @gather_reduce_8x16_i64(ptr nocapture readonly %a, ptr nocapture read
 ; KRYO-NEXT: [[I_0103:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT: [[SUM_0102:%.*]] = phi i32 [ [[ADD66]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
 ; KRYO-NEXT: [[A_ADDR_0101:%.*]] = phi ptr [ [[INCDEC_PTR58:%.*]], [[FOR_BODY]] ], [ [[A:%.*]], [[FOR_BODY_PREHEADER]] ]
-; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i16, ptr [[A_ADDR_0101]], i64 8
+; KRYO-NEXT: [[INCDEC_PTR58]] = getelementptr inbounds i8, ptr [[A_ADDR_0101]], i64 16
 ; KRYO-NEXT: [[TMP0:%.*]] = load <8 x i16>, ptr [[A_ADDR_0101]], align 2
 ; KRYO-NEXT: [[TMP1:%.*]] = zext <8 x i16> [[TMP0]] to <8 x i32>
 ; KRYO-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr [[B:%.*]], align 2
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
index efe8bd9e610bb..f98d0ad0527c2 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/loadorder.ll
@@ -133,7 +133,7 @@ define i16 @reduce_blockstrided2(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-LABEL: @reduce_blockstrided2(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load i16, ptr [[X:%.*]], align 2
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 2
 ; CHECK-NEXT: [[TMP1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]]
@@ -165,7 +165,7 @@ define i16 @reduce_blockstrided2(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT: [[TMP10:%.*]] = load i16, ptr [[ARRAYIDX28]], align 2
 ; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM15]]
 ; CHECK-NEXT: [[TMP11:%.*]] = load i16, ptr [[ARRAYIDX32]], align 2
-; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 1
+; CHECK-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 2
 ; CHECK-NEXT: [[TMP12:%.*]] = load i16, ptr [[ARRAYIDX33]], align 2
 ; CHECK-NEXT: [[ARRAYIDX36:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM4]]
 ; CHECK-NEXT: [[TMP13:%.*]] = load i16, ptr [[ARRAYIDX36]], align 2
@@ -254,9 +254,9 @@ define i16 @reduce_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-LABEL: @reduce_blockstrided3(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[L0:%.*]] = load i16, ptr [[X:%.*]], align 2
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 2
 ; CHECK-NEXT: [[L1:%.*]] = load i16, ptr [[ARRAYIDX1]], align 2
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 2
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 4
 ; CHECK-NEXT: [[L2:%.*]] = load i16, ptr [[ARRAYIDX2]], align 2
 ; CHECK-NEXT: [[IDXPROM:%.*]] = sext i32 [[STRIDE:%.*]] to i64
 ; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM]]
@@ -270,9 +270,9 @@ define i16 @reduce_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, ptr [[X]], i64 [[IDXPROM9]]
 ; CHECK-NEXT: [[L6:%.*]] = load i16, ptr [[ARRAYIDX10]], align 2
 ; CHECK-NEXT: [[L8:%.*]] = load i16, ptr [[Y:%.*]], align 2
-; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 1
+; CHECK-NEXT: [[ARRAYIDX15:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 2
 ; CHECK-NEXT: [[L9:%.*]] = load i16, ptr [[ARRAYIDX15]], align 2
-; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 2
+; CHECK-NEXT: [[ARRAYIDX16:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 4
 ; CHECK-NEXT: [[L10:%.*]] = load i16, ptr [[ARRAYIDX16]], align 2
 ; CHECK-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i16, ptr [[Y]], i64 [[IDXPROM]]
 ; CHECK-NEXT: [[L12:%.*]] = load i16, ptr [[ARRAYIDX20]], align 2
@@ -678,7 +678,7 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-LABEL: @store_blockstrided3(
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[X:%.*]], align 4
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 2
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[X]], i64 8
 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4
 ; CHECK-NEXT: [[ADD4:%.*]] = add nsw i32 [[STRIDE:%.*]], 1
 ; CHECK-NEXT: [[IDXPROM5:%.*]] = sext i32 [[ADD4]] to i64
@@ -699,7 +699,7 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT: [[ARRAYIDX32:%.*]] = getelementptr inbounds i32, ptr [[X]], i64 [[IDXPROM31]]
 ; CHECK-NEXT: [[TMP4:%.*]] = load i32, ptr [[ARRAYIDX32]], align 4
 ; CHECK-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr [[Y:%.*]], align 4
-; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 2
+; CHECK-NEXT: [[ARRAYIDX35:%.*]] = getelementptr inbounds i8, ptr [[Y]], i64 8
 ; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX35]], align 4
 ; CHECK-NEXT: [[ARRAYIDX41:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM5]]
 ; CHECK-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr [[ARRAYIDX41]], align 4
@@ -709,19 +709,19 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT: [[ARRAYIDX60:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM23]]
 ; CHECK-NEXT: [[ARRAYIDX68:%.*]] = getelementptr inbounds i32, ptr [[Y]], i64 [[IDXPROM31]]
 ; CHECK-NEXT: [[TMP9:%.*]] = load i32, ptr [[ARRAYIDX68]], align 4
-; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i32, ptr [[Z:%.*]], i64 1
+; CHECK-NEXT: [[ARRAYIDX72:%.*]] = getelementptr inbounds i8, ptr [[Z:%.*]], i64 4
 ; CHECK-NEXT: [[MUL73:%.*]] = mul nsw i32 [[TMP6]], [[TMP1]]
-; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 6
+; CHECK-NEXT: [[ARRAYIDX76:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 24
 ; CHECK-NEXT: [[TMP10:%.*]] = mul nsw <2 x i32> [[TMP5]], [[TMP0]]
 ; CHECK-NEXT: [[TMP11:%.*]] = mul nsw <2 x i32> [[TMP7]], [[TMP2]]
 ; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i32> [[TMP10]], <2 x i32> [[TMP11]], <4 x i32>
 ; CHECK-NEXT: [[MUL81:%.*]] = mul nsw i32 [[TMP8]], [[TMP3]]
-; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 8
+; CHECK-NEXT: [[ARRAYIDX82:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 32
 ; CHECK-NEXT: [[TMP13:%.*]] = load <2 x i32>, ptr [[ARRAYIDX16]], align 4
 ; CHECK-NEXT: [[TMP14:%.*]] = load <2 x i32>, ptr [[ARRAYIDX52]], align 4
 ; CHECK-NEXT: [[TMP15:%.*]] = mul nsw <2 x i32> [[TMP14]], [[TMP13]]
 ; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <2 x i32> [[TMP15]], <2 x i32> poison, <2 x i32>
-; CHECK-NEXT: [[ARRAYIDX90:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 10
+; CHECK-NEXT: [[ARRAYIDX90:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 40
 ; CHECK-NEXT: [[TMP17:%.*]] = load <2 x i32>, ptr [[ARRAYIDX24]], align 4
 ; CHECK-NEXT: [[TMP18:%.*]] = load <2 x i32>, ptr [[ARRAYIDX60]], align 4
 ; CHECK-NEXT: store i32 [[MUL73]], ptr [[Z]], align 4
@@ -732,7 +732,7 @@ define void @store_blockstrided3(ptr nocapture noundef readonly %x, ptr nocaptur
 ; CHECK-NEXT: [[TMP20:%.*]] = shufflevector <2 x i32> [[TMP19]], <2 x i32> poison, <2 x i32>
 ; CHECK-NEXT: store <2 x i32> [[TMP20]], ptr [[ARRAYIDX90]], align 4
 ; CHECK-NEXT: [[MUL91:%.*]] = mul nsw i32 [[TMP9]], [[TMP4]]
-; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i32, ptr [[Z]], i64 9
+; CHECK-NEXT: [[ARRAYIDX92:%.*]] = getelementptr inbounds i8, ptr [[Z]], i64 36
 ; CHECK-NEXT: store i32 [[MUL91]], ptr [[ARRAYIDX92]], align 4
 ; CHECK-NEXT: ret void
 ;
@@ -921,9 +921,9 @@ define void @store_blockstrided4x4(ptr nocapture noundef readonly %p1, i32 nound
 ; CHECK-NEXT: [[ADD_PTR64:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[IDX_EXT63]]
 ; CHECK-NEXT: [[ARRAYIDX3_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR]], i64 4
 ; CHECK-NEXT: [[ARRAYIDX5_1:%.*]] = getelementptr inbounds i8, ptr [[ADD_PTR64]], i64 4
-; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i32, ptr [[DST0:%.*]], i64 4
-; CHECK-NEXT: [[DST8:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 8
-; CHECK-NEXT: [[DST12:%.*]] = getelementptr inbounds i32, ptr [[DST0]], i64 12
+; CHECK-NEXT: [[DST4:%.*]] = getelementptr inbounds i8, ptr [[DST0:%.*]], i64 16
+; CHECK-NEXT: [[DST8:%.*]] = getelementptr inbounds i8, ptr [[DST0]], i64 32
+; CHECK-NEXT: [[DST12:%.*]] = getelementptr inbounds i8, ptr [[DST0]], i64 48
 ; CHECK-NEXT: [[TMP0:%.*]] = load <4 x i8>, ptr [[P1]], align 1
 ; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[TMP0]] to <4 x i32>
 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i8>, ptr [[ARRAYIDX3]], align 1
diff --git a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
index 24365a850d91d..1c408f19a8a32 100644
--- a/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
+++ b/llvm/test/Transforms/SLPVectorizer/WebAssembly/no-vectorize-rotate.ll
@@ -18,7 +18,7 @@ define void @foo(<2 x i64> %x, <4 x i32> %y, ptr %out) #0 {
 ; CHECK-NEXT: [[E:%.*]] = extractelement <4 x i32> [[Y]], i64 3
 ; CHECK-NEXT: [[CONV17:%.*]] = zext i32 [[E]] to i64
 ; CHECK-NEXT: [[F:%.*]] = tail call i64 @llvm.fshl.i64(i64 [[D]], i64 [[D]], i64 [[CONV17]])
-; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[OUT]], i32 1
+; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[OUT]], i32 8
 ; CHECK-NEXT: store i64 [[F]], ptr [[ARRAYIDX2]], align 8
 ; CHECK-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
index b58e5f06d78da..593aad82ad5d8 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/operandorder.ll
@@ -233,7 +233,7 @@ define void @vecload_vs_broadcast4(ptr noalias %from, ptr noalias %to, double %v
 ; CHECK-NEXT: br label [[LP:%.*]]
 ; CHECK: lp:
 ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, ptr [[FROM:%.*]], i32 1
+; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
 ; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
 ; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_2]], i64 0
@@ -286,7 +286,7 @@ define void @shuffle_nodes_match2(ptr noalias %from, ptr noalias %to, double %v1
 ; CHECK-NEXT: br label [[LP:%.*]]
 ; CHECK: lp:
 ; CHECK-NEXT: [[P:%.*]] = phi double [ 1.000000e+00, [[LP]] ], [ 0.000000e+00, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr double, ptr [[FROM:%.*]], i32 1
+; CHECK-NEXT: [[FROM_1:%.*]] = getelementptr i8, ptr [[FROM:%.*]], i32 8
 ; CHECK-NEXT: [[V0_1:%.*]] = load double, ptr [[FROM]], align 4
 ; CHECK-NEXT: [[V0_2:%.*]] = load double, ptr [[FROM_1]], align 4
 ; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x double> poison, double [[V0_1]], i64 0
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/opt.ll b/llvm/test/Transforms/SLPVectorizer/X86/opt.ll
index 85447902618c8..95da098b8b143 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/opt.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/opt.ll
@@ -10,10 +10,10 @@ target triple = "x86_64-apple-macosx10.8.0"
 define void @test1(ptr %a, ptr %b, ptr %c) {
 ; SLP-LABEL: @test1(
 ; SLP-NEXT: entry:
-; SLP-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
-; SLP-NEXT: [[TMP3:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
-; SLP-NEXT: [[TMP4:%.*]] = fmul <2 x double> [[TMP1]], [[TMP3]]
-; SLP-NEXT: store <2 x double> [[TMP4]], ptr [[C:%.*]], align 8
+; SLP-NEXT: [[TMP0:%.*]] = load <2 x double>, ptr [[A:%.*]], align 8
+; SLP-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr [[B:%.*]], align 8
+; SLP-NEXT: [[TMP2:%.*]] = fmul <2 x double> [[TMP0]], [[TMP1]]
+; SLP-NEXT: store <2 x double> [[TMP2]], ptr [[C:%.*]], align 8
 ; SLP-NEXT: ret void
 ;
 ; NOSLP-LABEL: @test1(
@@ -21,13 +21,13 @@ define void @test1(ptr %a, ptr %b, ptr %c) {
 ; NOSLP-NEXT: [[I0:%.*]] = load double, ptr [[A:%.*]], align 8
 ; NOSLP-NEXT: [[I1:%.*]] = load double, ptr [[B:%.*]], align 8
 ; NOSLP-NEXT: [[MUL:%.*]] = fmul double [[I0]], [[I1]]
-; NOSLP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds double, ptr [[A]], i64 1
+; NOSLP-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8
 ; NOSLP-NEXT: [[I3:%.*]] = load double, ptr [[ARRAYIDX3]], align 8
-; NOSLP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds double, ptr [[B]], i64 1
+; NOSLP-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[B]], i64 8
 ; NOSLP-NEXT: [[I4:%.*]] = load double, ptr [[ARRAYIDX4]], align 8
 ; NOSLP-NEXT: [[MUL5:%.*]] = fmul double [[I3]], [[I4]]
 ; NOSLP-NEXT: store double [[MUL]], ptr [[C:%.*]], align 8
-; NOSLP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds double, ptr [[C]], i64 1
+; NOSLP-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i8, ptr [[C]], i64 8
 ; NOSLP-NEXT: store double [[MUL5]], ptr [[ARRAYIDX5]], align 8
 ; NOSLP-NEXT: ret void
 ;
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
index 48d8de2a96186..c38f2748a9763 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr46983.ll
@@ -7,10 +7,10 @@ define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i32(
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i32>, ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i64 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP5]], [[SHUFFLE]]
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i32>, ptr [[TMP0:%.*]], align 4, !tbaa [[TBAA0:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i64 0
+; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP7:%.*]] = mul <4 x i32> [[TMP4]], [[TMP6]]
 ; CHECK-NEXT: [[TMP8:%.*]] = lshr <4 x i32> [[TMP7]],
 ; CHECK-NEXT: [[TMP9:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP8]], <4 x i32> )
 ; CHECK-NEXT: store <4 x i32> [[TMP9]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
@@ -48,11 +48,11 @@ define void @store_i32(ptr nocapture %0, i32 %1, i32 %2) {
 define void @store_i8(ptr nocapture %0, i32 %1, i32 %2) {
 ; CHECK-LABEL: @store_i8(
-; CHECK-NEXT: [[TMP5:%.*]] = load <4 x i8>, ptr [[TMP0:%.*]], align 1, !tbaa [[TBAA4:![0-9]+]]
-; CHECK-NEXT: [[TMP6:%.*]] = zext <4 x i8> [[TMP5]] to <4 x i32>
-; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i64 0
-; CHECK-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i32> [[TMP7]], <4 x i32> poison, <4 x i32> zeroinitializer
-; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[SHUFFLE]], [[TMP6]]
+; CHECK-NEXT: [[TMP4:%.*]] = load <4 x i8>, ptr [[TMP0:%.*]], align 1, !tbaa [[TBAA4:![0-9]+]]
+; CHECK-NEXT: [[TMP5:%.*]] = zext <4 x i8> [[TMP4]] to <4 x i32>
+; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> poison, i32 [[TMP1:%.*]], i64 0
+; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP6]], <4 x i32> poison, <4 x i32> zeroinitializer
+; CHECK-NEXT: [[TMP8:%.*]] = mul <4 x i32> [[TMP7]], [[TMP5]]
 ; CHECK-NEXT: [[TMP9:%.*]] = lshr <4 x i32> [[TMP8]],
 ; CHECK-NEXT: [[TMP10:%.*]] = call <4 x i32> @llvm.umin.v4i32(<4 x i32> [[TMP9]], <4 x i32> )
 ; CHECK-NEXT: [[TMP11:%.*]] = trunc <4 x i32> [[TMP10]] to <4 x i8>
@@ -108,7 +108,7 @@ define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
 ; SSE-NEXT: [[TMP10:%.*]] = and i64 [[TMP7]], 4294967295
 ; SSE-NEXT: [[TMP11:%.*]] = select i1 [[TMP9]], i64 [[TMP10]], i64 255
 ; SSE-NEXT: store i64 [[TMP11]], ptr [[TMP0]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 1
+; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
 ; SSE-NEXT: [[TMP13:%.*]] = load i64, ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], [[TMP4]]
 ; SSE-NEXT: [[TMP15:%.*]] = lshr i64 [[TMP14]], 15
@@ -117,7 +117,7 @@ define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
 ; SSE-NEXT: [[TMP18:%.*]] = and i64 [[TMP15]], 4294967295
 ; SSE-NEXT: [[TMP19:%.*]] = select i1 [[TMP17]], i64 [[TMP18]], i64 255
 ; SSE-NEXT: store i64 [[TMP19]], ptr [[TMP12]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 2
+; SSE-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
 ; SSE-NEXT: [[TMP21:%.*]] = load i64, ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], [[TMP4]]
 ; SSE-NEXT: [[TMP23:%.*]] = lshr i64 [[TMP22]], 15
@@ -126,7 +126,7 @@ define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
 ; SSE-NEXT: [[TMP26:%.*]] = and i64 [[TMP23]], 4294967295
 ; SSE-NEXT: [[TMP27:%.*]] = select i1 [[TMP25]], i64 [[TMP26]], i64 255
 ; SSE-NEXT: store i64 [[TMP27]], ptr [[TMP20]], align 8, !tbaa [[TBAA5]]
-; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i64, ptr [[TMP0]], i64 3
+; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
 ; SSE-NEXT: [[TMP29:%.*]] = load i64, ptr [[TMP28]], align 8, !tbaa [[TBAA5]]
 ; SSE-NEXT: [[TMP30:%.*]] = mul i64 [[TMP29]], [[TMP4]]
 ; SSE-NEXT: [[TMP31:%.*]] = lshr i64 [[TMP30]], 15
@@ -139,10 +139,10 @@ define void @store_i64(ptr nocapture %0, i32 %1, i32 %2) {
 ;
 ; AVX-LABEL: @store_i64(
 ; AVX-NEXT: [[TMP4:%.*]] = zext i32 [[TMP1:%.*]] to i64
-; AVX-NEXT: [[TMP6:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
-; AVX-NEXT: [[TMP7:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
-; AVX-NEXT: [[SHUFFLE:%.*]] = shufflevector <4 x i64> [[TMP7]], <4 x i64> poison, <4 x i32> zeroinitializer
-; AVX-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP6]], [[SHUFFLE]]
+; AVX-NEXT: [[TMP5:%.*]] = load <4 x i64>, ptr [[TMP0:%.*]], align 8, !tbaa [[TBAA5:![0-9]+]]
+; AVX-NEXT: [[TMP6:%.*]] = insertelement <4 x i64> poison, i64 [[TMP4]], i64 0
+; AVX-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[TMP6]], <4 x i64> poison, <4 x i32> zeroinitializer
+; AVX-NEXT: [[TMP8:%.*]] = mul <4 x i64> [[TMP5]], [[TMP7]]
 ; AVX-NEXT: [[TMP9:%.*]] = lshr <4 x i64> [[TMP8]],
 ; AVX-NEXT: [[TMP10:%.*]] = trunc <4 x i64> [[TMP9]] to <4 x i32>
 ; AVX-NEXT: [[TMP11:%.*]] = icmp ult <4 x i32> [[TMP10]],
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
index 8e17088b27569..5b33c6e889363 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629-inseltpoison.ll
@@ -7,11 +7,11 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load(
-; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
 ; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
 ; SSE-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
@@ -23,11 +23,11 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @gather_load(
-; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
 ; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
@@ -39,11 +39,11 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl
 ; AVX-NEXT: ret void
 ;
 ; AVX2-LABEL: @gather_load(
-; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
 ; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
@@ -55,11 +55,11 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl
 ; AVX2-NEXT: ret void
 ;
 ; AVX512F-LABEL: @gather_load(
-; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
 ; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
 ; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
@@ -71,11 +71,11 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl
 ; AVX512F-NEXT: ret void
 ;
 ; AVX512VL-LABEL: @gather_load(
-; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
 ; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]]
-; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
 ; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
@@ -107,35 +107,35 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl
 define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_2(
-; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
 ; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1
-; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 4
 ; SSE-NEXT: store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
+; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
 ; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2
-; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
 ; SSE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
+; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
 ; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3
-; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 12
 ; SSE-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
+; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
 ; SSE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4
 ; SSE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @gather_load_2(
-; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
+; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
 ; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
+; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
 ; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
+; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
 ; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
 ; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
@@ -146,13 +146,13 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; AVX-NEXT: ret void
 ;
 ; AVX2-LABEL: @gather_load_2(
-; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
+; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
 ; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
+; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
 ; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
+; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
 ; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
 ; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
@@ -163,13 +163,13 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; AVX2-NEXT: ret void
 ;
 ; AVX512F-LABEL: @gather_load_2(
-; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1
+; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4
 ; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10
+; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
 ; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3
+; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
 ; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]]
-; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5
+; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
 ; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
 ; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0
 ; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1
@@ -215,39 +215,39 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 ; SSE-LABEL: @gather_load_3(
 ; SSE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1
-; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1
+; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 4
 ; SSE-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
 ; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2
-; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2
+; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8
 ; SSE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; SSE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3
-; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3
+; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 12
 ; SSE-NEXT: store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
+; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 60
 ; SSE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4
-; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4
+; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16
 ; SSE-NEXT: store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
+; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 72
 ; SSE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1
-; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 5
+; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 20
 ; SSE-NEXT: store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
+; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 36
 ; SSE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP24:%.*]] = add i32 [[TMP23]], 2
-; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 6
+; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24
 ; SSE-NEXT: store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
+; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24
 ; SSE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3
-; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 7
+; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 28
 ; SSE-NEXT: store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
+; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 84
 ; SSE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4
 ; SSE-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[TBAA0]]
@@ -255,19 +255,19 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 ;
 ; AVX-LABEL: @gather_load_3(
 ; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
 ; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
+; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 60
 ; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
+; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 72
 ; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
+; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 36
 ; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
+; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24
 ; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
+; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 84
 ; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
 ; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
@@ -283,19 +283,19 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 ;
 ; AVX2-LABEL: @gather_load_3(
 ; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11
+; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
 ; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4
+; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15
+; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 60
 ; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18
+; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 72
 ; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9
+; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 36
 ; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6
+; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24
 ; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21
+; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 84
 ; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0
 ; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1
@@ -370,20 +370,20 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado
 define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) {
 ; SSE-LABEL: @gather_load_4(
-; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0:%.*]], i64 1
-; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11
-; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 2
-; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4
-; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 3
-; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15
-; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 4
-; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18
-; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 5
-; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9
-; SSE-NEXT: [[T25:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 6
-; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6
-; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 7
-; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21
+; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[T0:%.*]], i64 4
+; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i8, ptr [[T1:%.*]], i64 44
+; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 8
+; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 16
+; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 12
+; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 60
+; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 16
+; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 72
+; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 20
+; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 36
+; SSE-NEXT: [[T25:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 24
+; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 24
+; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 28
+; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 84
 ; SSE-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
@@ -411,13 +411,13 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 ; SSE-NEXT: ret void
 ;
 ; AVX-LABEL: @gather_load_4(
-; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11
-; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4
-; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15
-; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18
-; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9
-; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6
-; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21
+; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i8, ptr [[T1:%.*]], i64 44
+; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 16
+; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 60
+; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 72
+; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 36
+; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 24
+; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 84
 ; AVX-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
 ; AVX-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
@@ -439,13 +439,13 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 ; AVX-NEXT: ret void
 ;
 ; AVX2-LABEL: @gather_load_4(
-; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11
-; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4
-; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15
-; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18
-; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9
-; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6
-; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21
+; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i8, ptr [[T1:%.*]], i64 44
+; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 16
+; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 60
+; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 72
+; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 36
+; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 24
+; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 84
 ; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]]
 ; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]]
@@ -533,21 +533,21 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read
 define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) {
 ; SSE-LABEL: @gather_load_div(
 ; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4
+; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; SSE-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40
 ; SSE-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13
+; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52
 ; SSE-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3
+; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12
 ; SSE-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11
+; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44
 ; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14
+; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56
 ; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44
+; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176
 ; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4
+; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16
 ; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0
 ; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1
 ; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2
@@ -558,21 +558,21 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3
 ; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]]
 ; SSE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17
+; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68
 ; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33
+; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132
 ; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8
+; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32
 ; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30
+; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120
 ; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5
+; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20
 ; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27
+; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108
 ; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20
+; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80
 ; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]]
-; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23
+; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92
 ; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]]
 ; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0
 ; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1
@@ -588,35 +588,35 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea
 ;
 ; AVX-LABEL: @gather_load_div(
 ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4
+; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16
 ; AVX-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]]
-; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10
+; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds 
i8, ptr [[TMP1]], i64 40 ; AVX-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 ; AVX-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 ; AVX-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; AVX-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 ; AVX-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 ; AVX-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 ; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 ; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 ; AVX-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 ; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 +; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 ; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 ; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 ; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 ; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 @@ -640,35 +640,35 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 
4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; AVX2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 ; AVX2-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 ; AVX2-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 ; AVX2-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; AVX2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 ; AVX2-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 ; AVX2-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 ; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 ; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 ; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 ; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 ; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 ; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 ; AVX2-NEXT: [[TMP33:%.*]] = 
load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 ; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll index 500856a0d66a9..09d6c77557efa 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/pr47629.ll @@ -7,11 +7,11 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 ; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; SSE-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 @@ -23,11 +23,11 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 @@ -39,11 +39,11 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 @@ -55,11 +55,11 @@ define void 
@gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 ; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 @@ -71,11 +71,11 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl ; AVX512F-NEXT: ret void ; ; AVX512VL-LABEL: @gather_load( -; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; AVX512VL-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 ; AVX512VL-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP1]], align 4, !tbaa [[TBAA0:![0-9]+]] -; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX512VL-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; AVX512VL-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX512VL-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; AVX512VL-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] ; AVX512VL-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 @@ -107,35 +107,35 @@ define void @gather_load(ptr noalias nocapture %0, ptr noalias nocapture readonl define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; SSE-LABEL: @gather_load_2( -; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; SSE-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 ; SSE-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP5:%.*]] = add nsw i32 [[TMP4]], 1 -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 4 ; SSE-NEXT: store i32 [[TMP5]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 +; SSE-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 ; SSE-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP9:%.*]] = add nsw i32 [[TMP8]], 2 -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 ; SSE-NEXT: store i32 [[TMP9]], ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 +; SSE-NEXT: [[TMP11:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 ; SSE-NEXT: [[TMP12:%.*]] = load i32, ptr [[TMP11]], align 4, 
!tbaa [[TBAA0]] ; SSE-NEXT: [[TMP13:%.*]] = add nsw i32 [[TMP12]], 3 -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 12 ; SSE-NEXT: store i32 [[TMP13]], ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 +; SSE-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; SSE-NEXT: [[TMP16:%.*]] = load i32, ptr [[TMP15]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP17:%.*]] = add nsw i32 [[TMP16]], 4 ; SSE-NEXT: store i32 [[TMP17]], ptr [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_2( -; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; AVX-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 ; AVX-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 +; AVX-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 ; AVX-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 +; AVX-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 ; AVX-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 +; AVX-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; AVX-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 ; AVX-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 @@ -146,13 +146,13 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_2( -; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; AVX2-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 ; AVX2-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 ; AVX2-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 ; AVX2-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; AVX2-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 ; AVX2-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 @@ -163,13 +163,13 @@ define void @gather_load_2(ptr noalias nocapture %0, ptr noalias nocapture reado ; AVX2-NEXT: ret void ; ; AVX512F-LABEL: @gather_load_2( -; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[TMP1:%.*]], i64 1 +; AVX512F-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[TMP1:%.*]], i64 4 ; AVX512F-NEXT: [[TMP4:%.*]] = load i32, ptr [[TMP3]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 10 +; AVX512F-NEXT: 
[[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 ; AVX512F-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 3 +; AVX512F-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 ; AVX512F-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4, !tbaa [[TBAA0]] -; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 5 +; AVX512F-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; AVX512F-NEXT: [[TMP10:%.*]] = load i32, ptr [[TMP9]], align 4, !tbaa [[TBAA0]] ; AVX512F-NEXT: [[TMP11:%.*]] = insertelement <4 x i32> poison, i32 [[TMP4]], i64 0 ; AVX512F-NEXT: [[TMP12:%.*]] = insertelement <4 x i32> [[TMP11]], i32 [[TMP6]], i64 1 @@ -215,39 +215,39 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; SSE-LABEL: @gather_load_3( ; SSE-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 1 -; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP0:%.*]], i64 1 +; SSE-NEXT: [[TMP5:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 4 ; SSE-NEXT: store i32 [[TMP4]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; SSE-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP8:%.*]] = add i32 [[TMP7]], 2 -; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 2 +; SSE-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 8 ; SSE-NEXT: store i32 [[TMP8]], ptr [[TMP5]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; SSE-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP12:%.*]] = add i32 [[TMP11]], 3 -; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 3 +; SSE-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 12 ; SSE-NEXT: store i32 [[TMP12]], ptr [[TMP9]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 60 ; SSE-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP16:%.*]] = add i32 [[TMP15]], 4 -; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 4 +; SSE-NEXT: [[TMP17:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 16 ; SSE-NEXT: store i32 [[TMP16]], ptr [[TMP13]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 +; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 72 ; SSE-NEXT: [[TMP19:%.*]] = load i32, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP20:%.*]] = add i32 [[TMP19]], 1 -; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 5 +; SSE-NEXT: [[TMP21:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 20 ; SSE-NEXT: store i32 [[TMP20]], ptr [[TMP17]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 +; SSE-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 36 ; SSE-NEXT: [[TMP23:%.*]] = load i32, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: 
[[TMP24:%.*]] = add i32 [[TMP23]], 2 -; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 6 +; SSE-NEXT: [[TMP25:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 24 ; SSE-NEXT: store i32 [[TMP24]], ptr [[TMP21]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 +; SSE-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24 ; SSE-NEXT: [[TMP27:%.*]] = load i32, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP28:%.*]] = add i32 [[TMP27]], 3 -; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i32, ptr [[TMP0]], i64 7 +; SSE-NEXT: [[TMP29:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 28 ; SSE-NEXT: store i32 [[TMP28]], ptr [[TMP25]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 84 ; SSE-NEXT: [[TMP31:%.*]] = load i32, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP32:%.*]] = add i32 [[TMP31]], 4 ; SSE-NEXT: store i32 [[TMP32]], ptr [[TMP29]], align 4, !tbaa [[TBAA0]] @@ -255,19 +255,19 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX-LABEL: @gather_load_3( ; AVX-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; AVX-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; AVX-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 60 ; AVX-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 72 ; AVX-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 36 ; AVX-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24 ; AVX-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 84 ; AVX-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 ; AVX-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 @@ -283,19 +283,19 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado ; ; AVX2-LABEL: @gather_load_3( ; AVX2-NEXT: [[TMP3:%.*]] = load i32, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; AVX2-NEXT: [[TMP5:%.*]] = load i32, ptr [[TMP4]], align 4, 
!tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; AVX2-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 15 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 60 ; AVX2-NEXT: [[TMP9:%.*]] = load i32, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 18 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 72 ; AVX2-NEXT: [[TMP11:%.*]] = load i32, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 9 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 36 ; AVX2-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 6 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 24 ; AVX2-NEXT: [[TMP15:%.*]] = load i32, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i64 21 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 84 ; AVX2-NEXT: [[TMP17:%.*]] = load i32, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP18:%.*]] = insertelement <8 x i32> poison, i32 [[TMP3]], i64 0 ; AVX2-NEXT: [[TMP19:%.*]] = insertelement <8 x i32> [[TMP18]], i32 [[TMP5]], i64 1 @@ -370,20 +370,20 @@ define void @gather_load_3(ptr noalias nocapture %0, ptr noalias nocapture reado define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture readonly %t1) { ; SSE-LABEL: @gather_load_4( -; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i32, ptr [[T0:%.*]], i64 1 -; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11 -; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 2 -; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4 -; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 3 -; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15 -; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 4 -; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18 -; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 5 -; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9 -; SSE-NEXT: [[T25:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 6 -; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6 -; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i32, ptr [[T0]], i64 7 -; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21 +; SSE-NEXT: [[T5:%.*]] = getelementptr inbounds i8, ptr [[T0:%.*]], i64 4 +; SSE-NEXT: [[T6:%.*]] = getelementptr inbounds i8, ptr [[T1:%.*]], i64 44 +; SSE-NEXT: [[T9:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 8 +; SSE-NEXT: [[T10:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 16 +; SSE-NEXT: [[T13:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 12 +; SSE-NEXT: [[T14:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 60 +; SSE-NEXT: [[T17:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 16 +; SSE-NEXT: [[T18:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 72 +; SSE-NEXT: [[T21:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 20 +; SSE-NEXT: [[T22:%.*]] = getelementptr inbounds i8, 
ptr [[T1]], i64 36 +; SSE-NEXT: [[T25:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 24 +; SSE-NEXT: [[T26:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 24 +; SSE-NEXT: [[T29:%.*]] = getelementptr inbounds i8, ptr [[T0]], i64 28 +; SSE-NEXT: [[T30:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 84 ; SSE-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] @@ -411,13 +411,13 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; SSE-NEXT: ret void ; ; AVX-LABEL: @gather_load_4( -; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11 -; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4 -; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15 -; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18 -; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9 -; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6 -; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21 +; AVX-NEXT: [[T6:%.*]] = getelementptr inbounds i8, ptr [[T1:%.*]], i64 44 +; AVX-NEXT: [[T10:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 16 +; AVX-NEXT: [[T14:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 60 +; AVX-NEXT: [[T18:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 72 +; AVX-NEXT: [[T22:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 36 +; AVX-NEXT: [[T26:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 24 +; AVX-NEXT: [[T30:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 84 ; AVX-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] @@ -439,13 +439,13 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read ; AVX-NEXT: ret void ; ; AVX2-LABEL: @gather_load_4( -; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i32, ptr [[T1:%.*]], i64 11 -; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 4 -; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 15 -; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 18 -; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 9 -; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 6 -; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i32, ptr [[T1]], i64 21 +; AVX2-NEXT: [[T6:%.*]] = getelementptr inbounds i8, ptr [[T1:%.*]], i64 44 +; AVX2-NEXT: [[T10:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 16 +; AVX2-NEXT: [[T14:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 60 +; AVX2-NEXT: [[T18:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 72 +; AVX2-NEXT: [[T22:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 36 +; AVX2-NEXT: [[T26:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 24 +; AVX2-NEXT: [[T30:%.*]] = getelementptr inbounds i8, ptr [[T1]], i64 84 ; AVX2-NEXT: [[T3:%.*]] = load i32, ptr [[T1]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[T7:%.*]] = load i32, ptr [[T6]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[T11:%.*]] = load i32, ptr [[T10]], align 4, !tbaa [[TBAA0]] @@ -533,21 +533,21 @@ define void @gather_load_4(ptr noalias nocapture %t0, ptr noalias nocapture read define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture readonly %1) { ; 
SSE-LABEL: @gather_load_div( ; SSE-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4 +; SSE-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; SSE-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; SSE-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 ; SSE-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; SSE-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 ; SSE-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; SSE-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 ; SSE-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; SSE-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; SSE-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 +; SSE-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 ; SSE-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 +; SSE-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 ; SSE-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP0:%.*]], i64 4 +; SSE-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP0:%.*]], i64 16 ; SSE-NEXT: [[TMP19:%.*]] = insertelement <4 x float> poison, float [[TMP3]], i64 0 ; SSE-NEXT: [[TMP20:%.*]] = insertelement <4 x float> [[TMP19]], float [[TMP7]], i64 1 ; SSE-NEXT: [[TMP21:%.*]] = insertelement <4 x float> [[TMP20]], float [[TMP11]], i64 2 @@ -558,21 +558,21 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; SSE-NEXT: [[TMP26:%.*]] = insertelement <4 x float> [[TMP25]], float [[TMP17]], i64 3 ; SSE-NEXT: [[TMP27:%.*]] = fdiv <4 x float> [[TMP22]], [[TMP26]] ; SSE-NEXT: store <4 x float> [[TMP27]], ptr [[TMP0]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 +; SSE-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 ; SSE-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 +; SSE-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 ; SSE-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 +; SSE-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 ; SSE-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 +; SSE-NEXT: [[TMP34:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 ; SSE-NEXT: [[TMP35:%.*]] = load float, ptr [[TMP34]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 +; 
SSE-NEXT: [[TMP36:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; SSE-NEXT: [[TMP37:%.*]] = load float, ptr [[TMP36]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 +; SSE-NEXT: [[TMP38:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 ; SSE-NEXT: [[TMP39:%.*]] = load float, ptr [[TMP38]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 +; SSE-NEXT: [[TMP40:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 ; SSE-NEXT: [[TMP41:%.*]] = load float, ptr [[TMP40]], align 4, !tbaa [[TBAA0]] -; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 +; SSE-NEXT: [[TMP42:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 ; SSE-NEXT: [[TMP43:%.*]] = load float, ptr [[TMP42]], align 4, !tbaa [[TBAA0]] ; SSE-NEXT: [[TMP44:%.*]] = insertelement <4 x float> poison, float [[TMP29]], i64 0 ; SSE-NEXT: [[TMP45:%.*]] = insertelement <4 x float> [[TMP44]], float [[TMP33]], i64 1 @@ -588,35 +588,35 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX-LABEL: @gather_load_div( ; AVX-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4 +; AVX-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; AVX-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; AVX-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 ; AVX-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; AVX-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 ; AVX-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; AVX-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 ; AVX-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; AVX-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 +; AVX-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 ; AVX-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 +; AVX-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 ; AVX-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 +; AVX-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 ; AVX-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 +; AVX-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 ; AVX-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 +; AVX-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 ; AVX-NEXT: 
[[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 +; AVX-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 ; AVX-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 +; AVX-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; AVX-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 +; AVX-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 ; AVX-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 +; AVX-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 ; AVX-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 +; AVX-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 ; AVX-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] ; AVX-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 ; AVX-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 @@ -640,35 +640,35 @@ define void @gather_load_div(ptr noalias nocapture %0, ptr noalias nocapture rea ; ; AVX2-LABEL: @gather_load_div( ; AVX2-NEXT: [[TMP3:%.*]] = load float, ptr [[TMP1:%.*]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 4 +; AVX2-NEXT: [[TMP4:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 16 ; AVX2-NEXT: [[TMP5:%.*]] = load float, ptr [[TMP4]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 10 +; AVX2-NEXT: [[TMP6:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 40 ; AVX2-NEXT: [[TMP7:%.*]] = load float, ptr [[TMP6]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 13 +; AVX2-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 52 ; AVX2-NEXT: [[TMP9:%.*]] = load float, ptr [[TMP8]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 3 +; AVX2-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 12 ; AVX2-NEXT: [[TMP11:%.*]] = load float, ptr [[TMP10]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 11 +; AVX2-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 44 ; AVX2-NEXT: [[TMP13:%.*]] = load float, ptr [[TMP12]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 14 +; AVX2-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 56 ; AVX2-NEXT: [[TMP15:%.*]] = load float, ptr [[TMP14]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 44 +; AVX2-NEXT: [[TMP16:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 176 ; AVX2-NEXT: [[TMP17:%.*]] = load float, ptr [[TMP16]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 17 +; AVX2-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 68 ; AVX2-NEXT: [[TMP19:%.*]] = load float, ptr [[TMP18]], align 4, !tbaa [[TBAA0]] -; 
AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 33 +; AVX2-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 132 ; AVX2-NEXT: [[TMP21:%.*]] = load float, ptr [[TMP20]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 8 +; AVX2-NEXT: [[TMP22:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 32 ; AVX2-NEXT: [[TMP23:%.*]] = load float, ptr [[TMP22]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 30 +; AVX2-NEXT: [[TMP24:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 120 ; AVX2-NEXT: [[TMP25:%.*]] = load float, ptr [[TMP24]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 5 +; AVX2-NEXT: [[TMP26:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 20 ; AVX2-NEXT: [[TMP27:%.*]] = load float, ptr [[TMP26]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 27 +; AVX2-NEXT: [[TMP28:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 108 ; AVX2-NEXT: [[TMP29:%.*]] = load float, ptr [[TMP28]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 20 +; AVX2-NEXT: [[TMP30:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 80 ; AVX2-NEXT: [[TMP31:%.*]] = load float, ptr [[TMP30]], align 4, !tbaa [[TBAA0]] -; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds float, ptr [[TMP1]], i64 23 +; AVX2-NEXT: [[TMP32:%.*]] = getelementptr inbounds i8, ptr [[TMP1]], i64 92 ; AVX2-NEXT: [[TMP33:%.*]] = load float, ptr [[TMP32]], align 4, !tbaa [[TBAA0]] ; AVX2-NEXT: [[TMP34:%.*]] = insertelement <8 x float> poison, float [[TMP3]], i64 0 ; AVX2-NEXT: [[TMP35:%.*]] = insertelement <8 x float> [[TMP34]], float [[TMP7]], i64 1 diff --git a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll index f015bae700754..d1c15306a72d0 100644 --- a/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll +++ b/llvm/test/Transforms/SampleProfile/pseudo-probe-instcombine.ll @@ -18,9 +18,9 @@ define dso_local void @merge(ptr nocapture readonly %params) local_unnamed_addr align 2 { ; CHECK-LABEL: @merge( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[SAVEPAIRLISTS3:%.*]] = getelementptr inbounds [[STRUCT_NONBONDED:%.*]], ptr [[PARAMS:%.*]], i64 0, i32 11 +; CHECK-NEXT: [[SAVEPAIRLISTS3:%.*]] = getelementptr inbounds i8, ptr [[PARAMS:%.*]], i64 144 ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[SAVEPAIRLISTS3]], align 8 -; CHECK-NEXT: [[USEPAIRLISTS4:%.*]] = getelementptr inbounds [[STRUCT_NONBONDED]], ptr [[PARAMS]], i64 0, i32 12 +; CHECK-NEXT: [[USEPAIRLISTS4:%.*]] = getelementptr inbounds i8, ptr [[PARAMS]], i64 148 ; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[USEPAIRLISTS4]], align 4 ; CHECK-NEXT: [[TOBOOL54_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 ; CHECK-NEXT: br i1 [[TOBOOL54_NOT]], label [[LOR_LHS_FALSE55:%.*]], label [[IF_END109:%.*]] @@ -37,7 +37,7 @@ define dso_local void @merge(ptr nocapture readonly %params) local_unnamed_addr ; CHECK: if.then117: ; CHECK-NEXT: ret void ; CHECK: if.then138: -; CHECK-NEXT: [[DOTIN:%.*]] = getelementptr inbounds [[STRUCT_NONBONDED]], ptr [[PARAMS]], i64 0, i32 16 +; CHECK-NEXT: [[DOTIN:%.*]] = getelementptr inbounds i8, ptr [[PARAMS]], i64 172 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[DOTIN]], align 4 ; CHECK-NEXT: [[TOBOOL139_NOT:%.*]] = icmp eq i32 [[TMP2]], 0 ; CHECK-NEXT: br i1 [[TOBOOL139_NOT]], 
label [[IF_ELSE147:%.*]], label [[IF_THEN140:%.*]]
@@ -89,9 +89,9 @@ if.else147: ; preds = %if.then138
 ;; Check the last store is deleted.
 define i32 @load(ptr nocapture %a, ptr nocapture %b) {
 ; CHECK-LABEL: @load(
-; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds i8, ptr [[A:%.*]], i64 4
 ; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[TMP1]], align 8
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[B:%.*]], i64 1
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i8, ptr [[B:%.*]], i64 4
 ; CHECK-NEXT: store i32 [[TMP2]], ptr [[TMP3]], align 8
 ; CHECK-NEXT: call void @llvm.pseudoprobe(i64 5116412291814990879, i64 1, i32 0, i64 -1)
 ; CHECK-NEXT: ret i32 [[TMP2]]
diff --git a/llvm/test/Transforms/Util/strip-gc-relocates.ll b/llvm/test/Transforms/Util/strip-gc-relocates.ll
index 552e3b49dd493..c6013bc54fbe2 100644
--- a/llvm/test/Transforms/Util/strip-gc-relocates.ll
+++ b/llvm/test/Transforms/Util/strip-gc-relocates.ll
@@ -28,7 +28,7 @@ entry:
 define void @test2(ptr addrspace(1) %base) gc "statepoint-example" {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT: entry:
-; CHECK-NEXT: [[PTR_GEP1:%.*]] = getelementptr i32, ptr addrspace(1) [[BASE:%.*]], i64 30
+; CHECK-NEXT: [[PTR_GEP1:%.*]] = getelementptr i8, ptr addrspace(1) [[BASE:%.*]], i64 120
 ; CHECK-NEXT: [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr nonnull elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "gc-live"() ]
 ; CHECK-NEXT: call void @use_obj32(ptr addrspace(1) [[PTR_GEP1]])
 ; CHECK-NEXT: [[STATEPOINT_TOKEN1:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr nonnull elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "gc-live"() ]
@@ -91,7 +91,7 @@ define void @test4(i1 %cond) gc "statepoint-example" {
 ; CHECK: merge:
 ; CHECK-NEXT: [[BASEPHI_BASE:%.*]] = phi ptr addrspace(1) [ [[BASE1]], [[HERE]] ], [ [[BASE2]], [[THERE]] ], !is_base_value !0
 ; CHECK-NEXT: [[STATEPOINT_TOKEN:%.*]] = call token (i64, i32, ptr, i32, i32, ...) @llvm.experimental.gc.statepoint.p0(i64 2882400000, i32 0, ptr nonnull elementtype(void ()) @do_safepoint, i32 0, i32 0, i32 0, i32 0) [ "gc-live"() ]
-; CHECK-NEXT: [[PTR_GEP_REMAT:%.*]] = getelementptr i32, ptr addrspace(1) [[BASEPHI_BASE]], i64 15
+; CHECK-NEXT: [[PTR_GEP_REMAT:%.*]] = getelementptr i8, ptr addrspace(1) [[BASEPHI_BASE]], i64 60
 ; CHECK-NEXT: call void @use_obj32(ptr addrspace(1) [[PTR_GEP_REMAT]])
 ; CHECK-NEXT: ret void
 ;

From 3d91d9613e294b242d853039209b40a0cb7853f2 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 24 Jan 2024 14:25:54 +0000
Subject: [PATCH 789/843] [ConstraintElim] Make sure min/max intrinsic results
 are not poison.

The result of umin may be poison, and in that case the added constraints
are not valid in contexts where poison doesn't cause UB.

Only queue facts for min/max intrinsics if the result is guaranteed to
not be poison.

This could be improved in the future, by only adding the fact when
solving conditions using the result value.

Fixes https://github.com/llvm/llvm-project/issues/78621.
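
For illustration, a minimal sketch of the problematic pattern (distilled
from the umin_not_used regression test updated below; the function name
is illustrative):

  define i1 @example(i32 %arg) {
    %icmp = icmp slt i32 %arg, 0
    %shl = shl nuw nsw i32 %arg, 3   ; poison whenever %arg is negative
    %min = call i32 @llvm.umin.i32(i32 %shl, i32 80)
    ret i1 %icmp
  }

The facts implied by %min (e.g. %min u<= %shl) only hold if %min is not
poison, and %min is never used here in a position where poison would be
immediate UB, so folding %icmp to false based on those facts was unsound.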
---
 llvm/lib/Transforms/Scalar/ConstraintElimination.cpp   | 7 ++++++-
 llvm/test/Transforms/ConstraintElimination/minmax.ll   | 4 +++-
 .../ConstraintElimination/umin-result-may-be-poison.ll | 6 ++++--
 3 files changed, 13 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
index 8f09569d0d9cc..7b672e89b67aa 100644
--- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp
@@ -1061,11 +1061,16 @@ void State::addInfoFor(BasicBlock &BB) {
         FactOrCheck::getCheck(DT.getNode(&BB), cast<ReturnInst>(&I)));
     break;
   // Enqueue the intrinsics to add extra info.
-  case Intrinsic::abs:
   case Intrinsic::umin:
   case Intrinsic::umax:
   case Intrinsic::smin:
   case Intrinsic::smax:
+    // TODO: Check if it is possible to instead only add the min/max facts
+    // when simplifying uses of the min/max intrinsics.
+    if (!isGuaranteedNotToBePoison(&I))
+      break;
+    [[fallthrough]];
+  case Intrinsic::abs:
     WorkList.push_back(FactOrCheck::getInstFact(DT.getNode(&BB), &I));
     break;
   }
diff --git a/llvm/test/Transforms/ConstraintElimination/minmax.ll b/llvm/test/Transforms/ConstraintElimination/minmax.ll
index a31cf6845ad67..82b932f14c4ff 100644
--- a/llvm/test/Transforms/ConstraintElimination/minmax.ll
+++ b/llvm/test/Transforms/ConstraintElimination/minmax.ll
@@ -306,7 +306,9 @@ define i1 @smin_branchless(i32 %x, i32 %y) {
 ; CHECK-SAME: (i32 [[X:%.*]], i32 [[Y:%.*]]) {
 ; CHECK-NEXT: entry:
 ; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.smin.i32(i32 [[X]], i32 [[Y]])
-; CHECK-NEXT: [[RET:%.*]] = xor i1 true, false
+; CHECK-NEXT: [[CMP1:%.*]] = icmp sle i32 [[MIN]], [[X]]
+; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[MIN]], [[X]]
+; CHECK-NEXT: [[RET:%.*]] = xor i1 [[CMP1]], [[CMP2]]
 ; CHECK-NEXT: ret i1 [[RET]]
 ;
 entry:
diff --git a/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll b/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll
index 35ac72e54d189..6d1d95ec4fdba 100644
--- a/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll
+++ b/llvm/test/Transforms/ConstraintElimination/umin-result-may-be-poison.ll
@@ -6,10 +6,11 @@
 define i1 @umin_not_used(i32 %arg) {
 ; CHECK-LABEL: define i1 @umin_not_used(
 ; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i32 [[ARG]], 0
 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[ARG]], 3
 ; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.umin.i32(i32 [[SHL]], i32 80)
 ; CHECK-NEXT: [[CMP2:%.*]] = shl nuw nsw i32 [[ARG]], 3
-; CHECK-NEXT: ret i1 false
+; CHECK-NEXT: ret i1 [[ICMP]]
 ;
 %icmp = icmp slt i32 %arg, 0
 %shl = shl nuw nsw i32 %arg, 3
@@ -38,12 +39,13 @@ define i1 @umin_poison_is_UB_via_call(i32 %arg) {
 define i1 @umin_poison_call_before_UB(i32 %arg) {
 ; CHECK-LABEL: define i1 @umin_poison_call_before_UB(
 ; CHECK-SAME: i32 [[ARG:%.*]]) {
+; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i32 [[ARG]], 0
 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw nsw i32 [[ARG]], 3
 ; CHECK-NEXT: [[MIN:%.*]] = call i32 @llvm.umin.i32(i32 [[SHL]], i32 80)
 ; CHECK-NEXT: call void @fn()
 ; CHECK-NEXT: call void @noundef(i32 noundef [[MIN]])
 ; CHECK-NEXT: [[CMP2:%.*]] = shl nuw nsw i32 [[ARG]], 3
-; CHECK-NEXT: ret i1 false
+; CHECK-NEXT: ret i1 [[ICMP]]
 ;
 %icmp = icmp slt i32 %arg, 0
 %shl = shl nuw nsw i32 %arg, 3

From 382f70a877f00ab71f3cb5ba461b52e1b59cd292 Mon Sep 17 00:00:00 2001
From: Louis Dionne
Date: Wed, 24 Jan 2024 09:28:58 -0500
Subject: [PATCH 790/843] [libc++][NFC] Rewrite function call on two lines for
clarity (#79141) Previously, there was a ternary conditional with a less-than comparison appearing inside a template argument, which was really confusing because of the <...> of the function template. This patch rewrites the same statement on two lines for clarity. --- libcxx/include/string | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/include/string b/libcxx/include/string index e97139206d4fa..085178fe1a9ed 100644 --- a/libcxx/include/string +++ b/libcxx/include/string @@ -1948,8 +1948,8 @@ private: if (__s < __min_cap) { return static_cast<size_type>(__min_cap) - 1; } - size_type __guess = - __align_it < sizeof(value_type) < __alignment ? __alignment / sizeof(value_type) : 1 > (__s + 1) - 1; + const size_type __boundary = sizeof(value_type) < __alignment ? __alignment / sizeof(value_type) : 1; + size_type __guess = __align_it<__boundary>(__s + 1) - 1; if (__guess == __min_cap) ++__guess; return __guess; From 03a9f07e189db792b001c4001981d6e2da880221 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 24 Jan 2024 09:41:02 -0500 Subject: [PATCH 791/843] [libc++][NFC] Fix leftover && in comment --- libcxx/include/__algorithm/ranges_max.h | 2 +- libcxx/include/__algorithm/ranges_min.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/libcxx/include/__algorithm/ranges_max.h b/libcxx/include/__algorithm/ranges_max.h index 0f89cb2ff5bf2..c63656de51349 100644 --- a/libcxx/include/__algorithm/ranges_max.h +++ b/libcxx/include/__algorithm/ranges_max.h @@ -98,6 +98,6 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS -#endif // _LIBCPP_STD_VER >= 20 && +#endif // _LIBCPP_STD_VER >= 20 #endif // _LIBCPP___ALGORITHM_RANGES_MAX_H diff --git a/libcxx/include/__algorithm/ranges_min.h b/libcxx/include/__algorithm/ranges_min.h index 8757358cdf37d..e8f97f2754aca 100644 --- a/libcxx/include/__algorithm/ranges_min.h +++ b/libcxx/include/__algorithm/ranges_min.h @@ -90,6 +90,6 @@ _LIBCPP_END_NAMESPACE_STD _LIBCPP_POP_MACROS -#endif // _LIBCPP_STD_VER >= 20 && +#endif // _LIBCPP_STD_VER >= 20 #endif // _LIBCPP___ALGORITHM_RANGES_MIN_H From 5db2e5801dfec79bad3c804da0f53871a2664373 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 24 Jan 2024 15:42:48 +0100 Subject: [PATCH 792/843] Add necessary permissions to release issue workflow (#79272) The `/cherry-pick` command needs `issues: write` to post a comment on the issue. The `/branch` command also posts a comment, and also needs `pull-requests: write` to open a PR. This should fix the failure encountered at https://github.com/llvm/llvm-project/issues/79253#issuecomment-1907850027.
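For background (not part of the commit itself): GitHub Actions derives the job token's access rights from a job-level `permissions` block, and once any permission is listed, every unlisted scope drops to no access, so each job must name everything it needs. A minimal sketch of that shape, with an illustrative job name:

  jobs:
    example-job:             # illustrative name, not one of the jobs below
      runs-on: ubuntu-latest
      permissions:
        issues: write        # allows posting comments on issues
        pull-requests: write # allows opening pull requests
      steps:
        - run: echo "github.token now carries only the scopes listed above"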
--- .github/workflows/issue-release-workflow.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/issue-release-workflow.yml b/.github/workflows/issue-release-workflow.yml index 1f45799af1695..3c089c5465472 100644 --- a/.github/workflows/issue-release-workflow.yml +++ b/.github/workflows/issue-release-workflow.yml @@ -33,6 +33,8 @@ jobs: backport-commits: name: Backport Commits runs-on: ubuntu-latest + permissions: + issues: write if: >- (github.repository == 'llvm/llvm-project') && !startswith(github.event.comment.body, '') && @@ -66,6 +68,9 @@ jobs: create-pull-request: name: Create Pull Request runs-on: ubuntu-latest + permissions: + issues: write + pull-requests: write if: >- (github.repository == 'llvm/llvm-project') && !startswith(github.event.comment.body, '') && From 380ac53dfa05792c6f9fd0a4aba542f8c7e5e17c Mon Sep 17 00:00:00 2001 From: Felipe de Azevedo Piovezan Date: Wed, 24 Jan 2024 06:44:03 -0800 Subject: [PATCH 793/843] [DebugNames] Implement Entry::GetParentEntry query (#78760) This commit introduces a helper function to DWARFAcceleratorTable::Entry which follows DW_IDX_Parent attributes to return the corresponding parent Entry in the table. It is tested by enhancing dwarfdump so that it now prints: 1. When data is corrupt. 2. When parent information is present, but the parent is not indexed. 3. The parent entry offset, when the parent is present and indexed. This is printed in terms of a real entry offset (the same that gets printed at the start of each entry: "Entry @ 0x..."), instead of the encoded number in the table (which is an offset from the start of the Entry list). This makes it easy to visually inspect the dwarfdump output and check what the parent is. --- .../DebugInfo/DWARF/DWARFAcceleratorTable.h | 18 ++++++++ .../DebugInfo/DWARF/DWARFAcceleratorTable.cpp | 43 ++++++++++++++++++- llvm/test/CodeGen/X86/dwarf-headers.o | 0 .../DebugInfo/Generic/debug-names-one-cu.ll | 4 +- .../test/DebugInfo/X86/debug-names-dwarf64.ll | 10 ++--- .../X86/debug-names-parents-same-offset.ll | 24 ++++++++--- llvm/test/DebugInfo/X86/debug-names-types.ll | 20 ++++----- .../ARM/accel-imported-declarations.test | 4 +- 8 files changed, 97 insertions(+), 26 deletions(-) create mode 100644 llvm/test/CodeGen/X86/dwarf-headers.o diff --git a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h index b89536bc0c723..44a19c7b13f9a 100644 --- a/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h +++ b/llvm/include/llvm/DebugInfo/DWARF/DWARFAcceleratorTable.h @@ -460,6 +460,16 @@ class DWARFDebugNames : public DWARFAcceleratorTable { /// Returns the Offset of the DIE within the containing CU or TU. std::optional<uint64_t> getDIEUnitOffset() const; + /// Returns true if this Entry has information about its parent DIE (i.e. if + /// it has an IDX_parent attribute) + bool hasParentInformation() const; + + /// Returns the Entry corresponding to the parent of the DIE represented by + /// `this` Entry. If the parent is not in the table, nullopt is returned. + /// Precondition: hasParentInformation() == true. + /// An error is returned for ill-formed tables. + Expected<std::optional<DWARFDebugNames::Entry>> getParentDIEEntry() const; + /// Return the Abbreviation that can be used to interpret the raw values of /// this Accelerator Entry.
const Abbrev &getAbbrev() const { return *Abbr; } @@ -469,6 +479,7 @@ class DWARFDebugNames : public DWARFAcceleratorTable { std::optional<DWARFFormValue> lookup(dwarf::Index Index) const; void dump(ScopedPrinter &W) const; + void dumpParentIdx(ScopedPrinter &W, const DWARFFormValue &FormValue) const; friend class NameIndex; friend class ValueIterator; @@ -609,6 +620,13 @@ class DWARFDebugNames : public DWARFAcceleratorTable { Expected<Entry> getEntry(uint64_t *Offset) const; + /// Returns the Entry at the relative `Offset` from the start of the Entry + /// pool. + Expected<Entry> getEntryAtRelativeOffset(uint64_t Offset) const { + auto OffsetFromSection = Offset + this->EntriesBase; + return getEntry(&OffsetFromSection); + } + /// Look up all entries in this Name Index matching \c Key. iterator_range<ValueIterator> equal_range(StringRef Key) const; diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp index 0f9c8ef485d45..03ad5d133cadd 100644 --- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp +++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp @@ -611,6 +611,10 @@ DWARFDebugNames::Entry::lookup(dwarf::Index Index) const { return std::nullopt; } +bool DWARFDebugNames::Entry::hasParentInformation() const { + return lookup(dwarf::DW_IDX_parent).has_value(); +} + std::optional<uint64_t> DWARFDebugNames::Entry::getDIEUnitOffset() const { if (std::optional<DWARFFormValue> Off = lookup(dwarf::DW_IDX_die_offset)) return Off->getAsReferenceUVal(); @@ -650,13 +654,48 @@ std::optional<uint64_t> DWARFDebugNames::Entry::getLocalTUIndex() const { return std::nullopt; } +Expected<std::optional<DWARFDebugNames::Entry>> +DWARFDebugNames::Entry::getParentDIEEntry() const { + // The offset of the accelerator table entry for the parent. + std::optional<DWARFFormValue> ParentEntryOff = lookup(dwarf::DW_IDX_parent); + assert(ParentEntryOff.has_value() && "hasParentInformation() must be called"); + + if (ParentEntryOff->getForm() == dwarf::Form::DW_FORM_flag_present) + return std::nullopt; + return NameIdx->getEntryAtRelativeOffset(ParentEntryOff->getRawUValue()); +} + +void DWARFDebugNames::Entry::dumpParentIdx( + ScopedPrinter &W, const DWARFFormValue &FormValue) const { + Expected<std::optional<Entry>> ParentEntry = getParentDIEEntry(); + if (!ParentEntry) { + W.getOStream() << ""; + consumeError(ParentEntry.takeError()); + return; + } + + if (!ParentEntry->has_value()) { + W.getOStream() << "<parent not indexed>"; + return; + } + + auto AbsoluteOffset = NameIdx->EntriesBase + FormValue.getRawUValue(); + W.getOStream() << "Entry @ 0x" + Twine::utohexstr(AbsoluteOffset); +} + void DWARFDebugNames::Entry::dump(ScopedPrinter &W) const { W.startLine() << formatv("Abbrev: {0:x}\n", Abbr->Code); W.startLine() << formatv("Tag: {0}\n", Abbr->Tag); assert(Abbr->Attributes.size() == Values.size()); for (auto Tuple : zip_first(Abbr->Attributes, Values)) { - W.startLine() << formatv("{0}: ", std::get<0>(Tuple).Index); - std::get<1>(Tuple).dump(W.getOStream()); + auto Index = std::get<0>(Tuple).Index; + W.startLine() << formatv("{0}: ", Index); + + auto FormValue = std::get<1>(Tuple); + if (Index == dwarf::Index::DW_IDX_parent) + dumpParentIdx(W, FormValue); + else + FormValue.dump(W.getOStream()); W.getOStream() << '\n'; } } diff --git a/llvm/test/CodeGen/X86/dwarf-headers.o b/llvm/test/CodeGen/X86/dwarf-headers.o new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/llvm/test/DebugInfo/Generic/debug-names-one-cu.ll b/llvm/test/DebugInfo/Generic/debug-names-one-cu.ll index 92212a24bd153..46b753e9e2def 100644 --- a/llvm/test/DebugInfo/Generic/debug-names-one-cu.ll +++
b/llvm/test/DebugInfo/Generic/debug-names-one-cu.ll @@ -35,7 +35,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV]] ; CHECK-NEXT: Tag: DW_TAG_variable ; CHECK-NEXT: DW_IDX_die_offset: 0x{{[0-9a-f]*}} -; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-NEXT: } ; CHECK: String: 0x{{[0-9a-f]*}} "foobar" @@ -43,7 +43,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV]] ; CHECK-NEXT: Tag: DW_TAG_variable ; CHECK-NEXT: DW_IDX_die_offset: 0x{{[0-9a-f]*}} -; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-NEXT: } ; VERIFY: No errors. diff --git a/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll b/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll index a6855d77a34ac..c15e2ad1d56b0 100644 --- a/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll +++ b/llvm/test/DebugInfo/X86/debug-names-dwarf64.ll @@ -59,7 +59,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV]] ; CHECK-NEXT: Tag: DW_TAG_base_type ; CHECK-NEXT: DW_IDX_die_offset: [[TYPEDIE]] -; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] @@ -71,17 +71,17 @@ ; CHECK-NEXT: Abbrev: [[ABBREV1]] ; CHECK-NEXT: Tag: DW_TAG_variable ; CHECK-NEXT: DW_IDX_die_offset: [[VARDIE]] -; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: Name 3 { ; CHECK-NEXT: Hash: 0x7C96FE71 ; CHECK-NEXT: String: {{.+}} "func" -; CHECK-NEXT: Entry @ {{.+}} { +; CHECK-NEXT: Entry @ [[FUNC_ENTRY:0x.+]] { ; CHECK-NEXT: Abbrev: [[ABBREV_SP]] ; CHECK-NEXT: Tag: DW_TAG_subprogram ; CHECK-NEXT: DW_IDX_die_offset: [[SPDIE]] -; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] @@ -96,7 +96,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV_LABEL]] ; CHECK-NEXT: Tag: DW_TAG_label ; CHECK-NEXT: DW_IDX_die_offset: [[LABELDIE]] -; CHECK-NEXT: DW_IDX_parent: 0x{{.*}} +; CHECK-NEXT: DW_IDX_parent: Entry @ [[FUNC_ENTRY]] ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] diff --git a/llvm/test/DebugInfo/X86/debug-names-parents-same-offset.ll b/llvm/test/DebugInfo/X86/debug-names-parents-same-offset.ll index 9170cfc8508fa..95d85ab0ffe0f 100644 --- a/llvm/test/DebugInfo/X86/debug-names-parents-same-offset.ll +++ b/llvm/test/DebugInfo/X86/debug-names-parents-same-offset.ll @@ -21,16 +21,30 @@ ; CHECK-NEXT: Abbrev: [[ABBREV:0x.*]] ; CHECK-NEXT: Tag: DW_TAG_structure_type ; CHECK-NEXT: DW_IDX_type_unit: 0x01 -; CHECK-NEXT: DW_IDX_die_offset: [[DieOffset:0x.*]] -; CHECK-NEXT: DW_IDX_parent: [[Parent1:0x.*]] +; CHECK-NEXT: DW_IDX_die_offset: [[DieOffsetStruct:0x.*]] +; CHECK-NEXT: DW_IDX_parent: Entry @ [[Parent2:0x.*]] +; CHECK: String: {{.*}} "MyNamespace" +; CHECK-NEXT: Entry @ [[Parent1:0x.*]] { +; CHECK-NEXT: Abbrev: {{.*}} +; CHECK-NEXT: Tag: DW_TAG_namespace +; CHECK-NEXT: DW_IDX_type_unit: 0x00 +; CHECK-NEXT: DW_IDX_die_offset: [[DieOffsetNamespace:0x.*]] +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> +; CHECK-NEXT: } +; CHECK-NEXT: Entry @ [[Parent2]] { +; CHECK-NEXT: Abbrev: {{.*}} +; CHECK-NEXT: Tag: DW_TAG_namespace +; CHECK-NEXT: DW_IDX_type_unit: 0x01 +; CHECK-NEXT: DW_IDX_die_offset: [[DieOffsetNamespace:0x.*]] +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> +; CHECK-NEXT: } ; CHECK: String: {{.*}} "MyStruct1" ; CHECK-NEXT: Entry @ {{.*}} { ; CHECK-NEXT: Abbrev: [[ABBREV]] ; CHECK-NEXT: Tag: DW_TAG_structure_type ; CHECK-NEXT: DW_IDX_type_unit: 0x00 -; CHECK-NEXT: DW_IDX_die_offset: [[DieOffset:0x.*]] -; CHECK-NOT: DW_IDX_parent: [[Parent1]] -; CHECK-NEXT: DW_IDX_parent: 0x{{.*}} +; CHECK-NEXT: DW_IDX_die_offset: [[DieOffsetStruct:0x.*]] +; CHECK-NEXT: DW_IDX_parent:
Entry @ [[Parent1]] %"struct.MyNamespace::MyStruct1" = type { i8 } diff --git a/llvm/test/DebugInfo/X86/debug-names-types.ll b/llvm/test/DebugInfo/X86/debug-names-types.ll index c44e82578f14a..f41bb5524b9c3 100644 --- a/llvm/test/DebugInfo/X86/debug-names-types.ll +++ b/llvm/test/DebugInfo/X86/debug-names-types.ll @@ -73,7 +73,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV]] ; CHECK-NEXT: Tag: DW_TAG_base_type ; CHECK-NEXT: DW_IDX_die_offset: 0x0000003e -; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] @@ -86,13 +86,13 @@ ; CHECK-NEXT: Tag: DW_TAG_structure_type ; CHECK-NEXT: DW_IDX_type_unit: 0x00 ; CHECK-NEXT: DW_IDX_die_offset: 0x00000023 -; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-NEXT: } ; CHECK-NEXT: Entry @ {{.+}} { ; CHECK-NEXT: Abbrev: [[ABBREV1]] ; CHECK-NEXT: Tag: DW_TAG_structure_type ; CHECK-NEXT: DW_IDX_die_offset: 0x00000042 -; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] @@ -104,7 +104,7 @@ ; CHECK-NEXT: Abbrev: [[ABBREV2]] ; CHECK-NEXT: Tag: DW_TAG_subprogram ; CHECK-NEXT: DW_IDX_die_offset: 0x00000023 -; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] @@ -117,7 +117,7 @@ ; CHECK-NEXT: Tag: DW_TAG_base_type ; CHECK-NEXT: DW_IDX_type_unit: 0x00 ; CHECK-NEXT: DW_IDX_die_offset: 0x00000038 -; CHECK-NEXT: DW_IDX_parent: true +; CHECK-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-NEXT: } ; CHECK-NEXT: } ; CHECK-NEXT: ] @@ -176,7 +176,7 @@ ; CHECK-SPLIT-NEXT: Abbrev: [[ABBREV2]] ; CHECK-SPLIT-NEXT: Tag: DW_TAG_base_type ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000035 -; CHECK-SPLIT-NEXT: DW_IDX_parent: true +; CHECK-SPLIT-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: ] @@ -189,13 +189,13 @@ ; CHECK-SPLIT-NEXT: Tag: DW_TAG_structure_type ; CHECK-SPLIT-NEXT: DW_IDX_type_unit: 0x00 ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000021 -; CHECK-SPLIT-NEXT: DW_IDX_parent: true +; CHECK-SPLIT-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: Entry @ {{.*}} { ; CHECK-SPLIT-NEXT: Abbrev: [[ABBREV]] ; CHECK-SPLIT-NEXT: Tag: DW_TAG_structure_type ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000039 -; CHECK-SPLIT-NEXT: DW_IDX_parent: true +; CHECK-SPLIT-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: ] @@ -207,7 +207,7 @@ ; CHECK-SPLIT-NEXT: Abbrev: [[ABBREV3]] ; CHECK-SPLIT-NEXT: Tag: DW_TAG_subprogram ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x0000001a -; CHECK-SPLIT-NEXT: DW_IDX_parent: true +; CHECK-SPLIT-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: ] @@ -220,7 +220,7 @@ ; CHECK-SPLIT-NEXT: Tag: DW_TAG_base_type ; CHECK-SPLIT-NEXT: DW_IDX_type_unit: 0x00 ; CHECK-SPLIT-NEXT: DW_IDX_die_offset: 0x00000036 -; CHECK-SPLIT-NEXT: DW_IDX_parent: true +; CHECK-SPLIT-NEXT: DW_IDX_parent: <parent not indexed> ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: } ; CHECK-SPLIT-NEXT: ] diff --git a/llvm/test/tools/dsymutil/ARM/accel-imported-declarations.test b/llvm/test/tools/dsymutil/ARM/accel-imported-declarations.test index a8bf191e036f7..8afbe34dcdbfb 100644 --- a/llvm/test/tools/dsymutil/ARM/accel-imported-declarations.test +++ b/llvm/test/tools/dsymutil/ARM/accel-imported-declarations.test @@ -23,13 +23,13 @@ DWARF-NEXT: Entry {{.*}} { DWARF-NEXT: Abbrev: {{.*}} DWARF-NEXT: Tag: DW_TAG_namespace DWARF: DW_IDX_die_offset: [[NAMESPACE]] -DWARF-NEXT: DW_IDX_parent: 0x{{.*}} +DWARF-NEXT: DW_IDX_parent: Entry @
0x{{.*}} DWARF-NEXT: } DWARF-NEXT: Entry {{.*}} { DWARF-NEXT: Abbrev: {{.*}} DWARF: Tag: DW_TAG_imported_declaration DWARF: DW_IDX_die_offset: 0x0000005c -DWARF-NEXT: DW_IDX_parent: 0x{{.*}} +DWARF-NEXT: DW_IDX_parent: Entry @ 0x{{.*}} DWARF-NEXT: } DWARF-NEXT: } From f03a60d4d2fc687059c8bb667d1de37713a5a64b Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 24 Jan 2024 16:05:56 +0100 Subject: [PATCH 794/843] Use correct tokens in release issue workflow (#79300) We should use the normal github.token for interacting with issues/PRs on the repo, and separately pass the `--branch-repo-token` for creating the branch in the llvmbot repo. --- .github/workflows/issue-release-workflow.yml | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.github/workflows/issue-release-workflow.yml b/.github/workflows/issue-release-workflow.yml index 3c089c5465472..17209ec055f85 100644 --- a/.github/workflows/issue-release-workflow.yml +++ b/.github/workflows/issue-release-workflow.yml @@ -60,8 +60,9 @@ jobs: printf "%s" "$COMMENT_BODY" | ./llvm/utils/git/github-automation.py \ --repo "$GITHUB_REPOSITORY" \ - --token ${{ secrets.RELEASE_WORKFLOW_PUSH_SECRET }} \ + --token ${{ github.token }} \ release-workflow \ + --branch-repo-token ${{ secrets.RELEASE_WORKFLOW_PUSH_SECRET }} \ --issue-number ${{ github.event.issue.number }} \ auto @@ -91,7 +92,8 @@ jobs: printf "%s" "$COMMENT_BODY" | ./llvm/utils/git/github-automation.py \ --repo "$GITHUB_REPOSITORY" \ - --token ${{ secrets.RELEASE_WORKFLOW_PUSH_SECRET }} \ + --token ${{ github.token }} \ release-workflow \ + --branch-repo-token ${{ secrets.RELEASE_WORKFLOW_PUSH_SECRET }} \ --issue-number ${{ github.event.issue.number }} \ auto From 70fc9703788e8965813c5b677a85cb84b66671b6 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 24 Jan 2024 15:06:20 +0000 Subject: [PATCH 795/843] [AMDGPU] Move architected SGPR implementation into isel (#79120) --- .../lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 39 ++- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 68 ++-- .../lib/Target/AMDGPU/SIMachineFunctionInfo.h | 32 +- .../AMDGPU/indirect-call-known-callees.ll | 1 - .../lower-work-group-id-intrinsics-hsa.ll | 295 ++++++++++++++++++ .../lower-work-group-id-intrinsics-pal.ll | 187 +++++++++++ .../AMDGPU/lower-work-group-id-intrinsics.ll | 128 -------- .../AMDGPU/workgroup-id-in-arch-sgprs.ll | 129 +++----- 8 files changed, 627 insertions(+), 252 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll create mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll delete mode 100644 llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index 32921bb248caf..615685822f91e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -4178,10 +4178,45 @@ bool AMDGPULegalizerInfo::loadInputValue( Register DstReg, MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const { const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>(); - const ArgDescriptor *Arg; + const ArgDescriptor *Arg = nullptr; const TargetRegisterClass *ArgRC; LLT ArgTy; - std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); + + CallingConv::ID CC = B.getMF().getFunction().getCallingConv(); + const ArgDescriptor WorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP9); + // If GridZ is not programmed in an entry function then the hardware will set
+ // it to all zeros, so there is no need to mask the GridY value in the low + // order bits. + const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( + AMDGPU::TTMP7, + AMDGPU::isEntryFunctionCC(CC) && !MFI->hasWorkGroupIDZ() ? ~0u : 0xFFFFu); + const ArgDescriptor WorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + if (ST.hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { + switch (ArgType) { + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + Arg = &WorkGroupIDX; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + Arg = &WorkGroupIDY; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + Arg = &WorkGroupIDZ; + ArgRC = &AMDGPU::SReg_32RegClass; + ArgTy = LLT::scalar(32); + break; + default: + break; + } + } + + if (!Arg) + std::tie(Arg, ArgRC, ArgTy) = MFI->getPreloadedValue(ArgType); if (!Arg) { if (ArgType == AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR) { diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index d35b76c8ad54e..d60f511302613 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -2072,11 +2072,45 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG, const SIMachineFunctionInfo &MFI, EVT VT, AMDGPUFunctionArgInfo::PreloadedValue PVID) const { - const ArgDescriptor *Reg; + const ArgDescriptor *Reg = nullptr; const TargetRegisterClass *RC; LLT Ty; - std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); + CallingConv::ID CC = DAG.getMachineFunction().getFunction().getCallingConv(); + const ArgDescriptor WorkGroupIDX = + ArgDescriptor::createRegister(AMDGPU::TTMP9); + // If GridZ is not programmed in an entry function then the hardware will set + // it to all zeros, so there is no need to mask the GridY value in the low + // order bits. + const ArgDescriptor WorkGroupIDY = ArgDescriptor::createRegister( + AMDGPU::TTMP7, + AMDGPU::isEntryFunctionCC(CC) && !MFI.hasWorkGroupIDZ() ? 
~0u : 0xFFFFu); + const ArgDescriptor WorkGroupIDZ = + ArgDescriptor::createRegister(AMDGPU::TTMP7, 0xFFFF0000u); + if (Subtarget->hasArchitectedSGPRs() && AMDGPU::isCompute(CC)) { + switch (PVID) { + case AMDGPUFunctionArgInfo::WORKGROUP_ID_X: + Reg = &WorkGroupIDX; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y: + Reg = &WorkGroupIDY; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z: + Reg = &WorkGroupIDZ; + RC = &AMDGPU::SReg_32RegClass; + Ty = LLT::scalar(32); + break; + default: + break; + } + } + + if (!Reg) + std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID); if (!Reg) { if (PVID == AMDGPUFunctionArgInfo::PreloadedValue::KERNARG_SEGMENT_PTR) { // It's possible for a kernarg intrinsic call to appear in a kernel with @@ -2505,28 +2539,24 @@ void SITargetLowering::allocateSystemSGPRs(CCState &CCInfo, } } - if (Info.hasWorkGroupIDX()) { - Register Reg = Info.addWorkGroupIDX(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (!HasArchitectedSGPRs) { + if (Info.hasWorkGroupIDX()) { + Register Reg = Info.addWorkGroupIDX(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDY()) { - Register Reg = Info.addWorkGroupIDY(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (Info.hasWorkGroupIDY()) { + Register Reg = Info.addWorkGroupIDY(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); + CCInfo.AllocateReg(Reg); + } - CCInfo.AllocateReg(Reg); - } - - if (Info.hasWorkGroupIDZ()) { - Register Reg = Info.addWorkGroupIDZ(HasArchitectedSGPRs); - if (!HasArchitectedSGPRs) + if (Info.hasWorkGroupIDZ()) { + Register Reg = Info.addWorkGroupIDZ(); MF.addLiveIn(Reg, &AMDGPU::SGPR_32RegClass); - - CCInfo.AllocateReg(Reg); + CCInfo.AllocateReg(Reg); + } } if (Info.hasWorkGroupInfo()) { diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h index 9ff66a094f991..0336ec4985ea7 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -751,35 +751,21 @@ class SIMachineFunctionInfo final : public AMDGPUMachineFunction, } // Add system SGPRs. - Register addWorkGroupIDX(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP9 : getNextSystemSGPR(); - ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(Reg); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDX() { + ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDX.getRegister(); } - Register addWorkGroupIDY(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); - unsigned Mask = HasArchitectedSGPRs && hasWorkGroupIDZ() ? 0xffff : ~0u; - ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(Reg, Mask); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDY() { + ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDY.getRegister(); } - Register addWorkGroupIDZ(bool HasArchitectedSGPRs) { - Register Reg = - HasArchitectedSGPRs ? (MCPhysReg)AMDGPU::TTMP7 : getNextSystemSGPR(); - unsigned Mask = HasArchitectedSGPRs ? 
0xffff << 16 : ~0u; - ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(Reg, Mask); - if (!HasArchitectedSGPRs) - NumSystemSGPRs += 1; - + Register addWorkGroupIDZ() { + ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR()); + NumSystemSGPRs += 1; return ArgInfo.WorkGroupIDZ.getRegister(); } diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll index 380a13ed16128..47110d9491887 100644 --- a/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call-known-callees.ll @@ -55,7 +55,6 @@ define amdgpu_kernel void @indirect_call_known_no_special_inputs() { ; GFX12-NEXT: s_load_b64 s[2:3], s[4:5], 0x0 ; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 0 ; GFX12-NEXT: v_mov_b32_e32 v31, v0 -; GFX12-NEXT: s_mov_b32 s12, ttmp9 ; GFX12-NEXT: s_mov_b64 s[8:9], 0 ; GFX12-NEXT: s_mov_b32 s32, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll new file mode 100644 index 0000000000000..afa914c8375f6 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-hsa.ll @@ -0,0 +1,295 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s + +define amdgpu_kernel void @workgroup_ids_kernel() { +; GFX9-LABEL: workgroup_ids_kernel: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: v_mov_b32_e32 v2, s2 +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: workgroup_ids_kernel: +; GFX9ARCH-SDAG: ; %bb.0: ; %.entry +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: workgroup_ids_kernel: +; GFX9ARCH-GISEL: ; %bb.0: ; %.entry +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: workgroup_ids_kernel: +; 
GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: workgroup_ids_kernel: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_kernel void @caller() { +; GFX9-SDAG-LABEL: caller: +; GFX9-SDAG: ; %bb.0: +; GFX9-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-SDAG-NEXT: s_add_u32 s36, s36, s7 +; GFX9-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9-SDAG-NEXT: s_load_dwordx2 s[14:15], s[2:3], 0x0 +; GFX9-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-SDAG-NEXT: s_mov_b32 s12, s6 +; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX9-SDAG-NEXT: s_endpgm +; +; GFX9-GISEL-LABEL: caller: +; GFX9-GISEL: ; %bb.0: +; GFX9-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9-GISEL-NEXT: s_add_u32 s36, s36, s7 +; GFX9-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9-GISEL-NEXT: s_load_dwordx2 s[14:15], s[0:1], 0x0 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9-GISEL-NEXT: v_mov_b32_e32 
v0, s6 +; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9-GISEL-NEXT: s_mov_b32 s12, s6 +; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[14:15] +; GFX9-GISEL-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: caller: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[2:3] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s2, s2, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s3, s3, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9ARCH-SDAG-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-SDAG-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: caller: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s38, -1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s39, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s36, s36, s6 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s37, s37, 0 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s2, 36 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s3, 0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v1, 10, v1 +; GFX9ARCH-GISEL-NEXT: v_lshlrev_b32_e32 v2, 20, v2 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[36:37] +; GFX9ARCH-GISEL-NEXT: v_or3_b32 v31, v0, v1, v2 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-SDAG-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b64 s[4:5], s[0:1] +; GFX12-SDAG-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_dual_mov_b32 v31, v0 :: v_dual_mov_b32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX12-GISEL-NEXT: s_mov_b32 s6, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s7, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b64 
s[4:5], s[0:1] +; GFX12-GISEL-NEXT: s_mov_b64 s[8:9], s[2:3] +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[6:7] +; GFX12-GISEL-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + call void @callee(i32 %idx) #0 + ret void +} + +declare void @callee(i32) #0 + +define void @workgroup_ids_device_func(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) { +; GFX9-LABEL: workgroup_ids_device_func: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-NEXT: global_store_dword v[0:1], v6, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s13 +; GFX9-NEXT: global_store_dword v[2:3], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s14 +; GFX9-NEXT: global_store_dword v[4:5], v0, off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-SDAG-LABEL: workgroup_ids_device_func: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v6, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: global_store_dword v[0:1], v6, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s4, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: global_store_dword v[2:3], v0, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-SDAG-NEXT: global_store_dword v[4:5], v0, off +; GFX9ARCH-SDAG-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-GISEL-LABEL: workgroup_ids_device_func: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v6, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s4, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s5, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[0:1], v6, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s4 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[2:3], v0, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s5 +; GFX9ARCH-GISEL-NEXT: global_store_dword v[4:5], v0, off +; GFX9ARCH-GISEL-NEXT: s_waitcnt vmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: workgroup_ids_device_func: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v6, ttmp9 :: v_dual_mov_b32 v7, s0 +; GFX12-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-NEXT: v_mov_b32_e32 v8, s1 +; GFX12-NEXT: global_store_b32 v[0:1], v6, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v[2:3], v7, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: global_store_b32 v[4:5], v8, off scope:SCOPE_SYS +; GFX12-NEXT: s_wait_storecnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %id.x = call i32 @llvm.amdgcn.workgroup.id.x() + %id.y = call i32 @llvm.amdgcn.workgroup.id.y() + %id.z = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %id.x, ptr addrspace(1) %outx + store volatile i32 %id.y, ptr addrspace(1) %outy + store volatile i32 %id.z, ptr addrspace(1) %outz + ret void +} + 
+declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() +declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) + +attributes #0 = { nounwind "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX9ARCH: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll new file mode 100644 index 0000000000000..cfff0a969da9e --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics-pal.ll @@ -0,0 +1,187 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=-architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-hsa -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9ARCH,GFX9ARCH-GISEL %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s + +define amdgpu_cs void @_amdgpu_cs_main() { +; GFX9-LABEL: _amdgpu_cs_main: +; GFX9: ; %bb.0: ; %.entry +; GFX9-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: _amdgpu_cs_main: +; GFX9ARCH-SDAG: ; %bb.0: ; %.entry +; GFX9ARCH-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9ARCH-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX9ARCH-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: _amdgpu_cs_main: +; GFX9ARCH-GISEL: ; %bb.0: ; %.entry +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX9ARCH-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX9ARCH-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: _amdgpu_cs_main: +; GFX12-SDAG: ; %bb.0: ; %.entry +; GFX12-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff +; GFX12-SDAG-NEXT: s_lshr_b32 s1, ttmp7, 16 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s0 +; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s1 +; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm +; +; GFX12-GISEL-LABEL: _amdgpu_cs_main: +; GFX12-GISEL: ; %bb.0: ; %.entry +; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 +; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff +; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 +; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm +.entry: + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + %idy = call i32 @llvm.amdgcn.workgroup.id.y() + %idz = call i32 @llvm.amdgcn.workgroup.id.z() + %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 + %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 + %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 + call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) + ret void +} + +define amdgpu_cs void @caller() { +; GFX9-LABEL: caller: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9-NEXT: s_mov_b32 s10, -1 +; GFX9-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9-NEXT: s_add_u32 s8, s8, s0 +; GFX9-NEXT: s_addc_u32 s9, s9, 0 +; GFX9-NEXT: s_getpc_b64 s[0:1] +; GFX9-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9-NEXT: s_mov_b32 s32, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9-NEXT: s_endpgm +; +; GFX9ARCH-SDAG-LABEL: caller: +; GFX9ARCH-SDAG: ; %bb.0: +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s10, -1 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9ARCH-SDAG-NEXT: s_add_u32 s8, s8, s0 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s9, s9, 0 +; GFX9ARCH-SDAG-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-SDAG-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-SDAG-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-SDAG-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9ARCH-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9ARCH-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9ARCH-SDAG-NEXT: s_endpgm +; +; GFX9ARCH-GISEL-LABEL: caller: +; GFX9ARCH-GISEL: ; %bb.0: +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s10, -1 +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s11, 0xe00000 +; GFX9ARCH-GISEL-NEXT: s_add_u32 s8, s8, s0 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s9, s9, 0 +; GFX9ARCH-GISEL-NEXT: s_getpc_b64 s[0:1] +; GFX9ARCH-GISEL-NEXT: s_add_u32 s0, s0, callee@gotpcrel32@lo+4 +; GFX9ARCH-GISEL-NEXT: s_addc_u32 s1, s1, callee@gotpcrel32@hi+12 +; GFX9ARCH-GISEL-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] +; GFX9ARCH-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9ARCH-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] +; GFX9ARCH-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX9ARCH-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9ARCH-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] +; GFX9ARCH-GISEL-NEXT: s_endpgm +; +; GFX12-SDAG-LABEL: caller: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 +; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-SDAG-NEXT: 
s_endpgm +; +; GFX12-GISEL-LABEL: caller: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo +; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi +; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 +; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] +; GFX12-GISEL-NEXT: s_endpgm + %idx = call i32 @llvm.amdgcn.workgroup.id.x() + call amdgpu_gfx void @callee(i32 %idx) + ret void +} + +declare amdgpu_gfx void @callee(i32) + +define amdgpu_gfx void @workgroup_ids_gfx(ptr addrspace(1) %outx, ptr addrspace(1) %outy, ptr addrspace(1) %outz) { +; GFX9-LABEL: workgroup_ids_gfx: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: s_setpc_b64 s[30:31] +; +; GFX9ARCH-LABEL: workgroup_ids_gfx: +; GFX9ARCH: ; %bb.0: +; GFX9ARCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9ARCH-NEXT: s_setpc_b64 s[30:31] +; +; GFX12-LABEL: workgroup_ids_gfx: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %id.x = call i32 @llvm.amdgcn.workgroup.id.x() + %id.y = call i32 @llvm.amdgcn.workgroup.id.y() + %id.z = call i32 @llvm.amdgcn.workgroup.id.z() + store volatile i32 %id.x, ptr addrspace(1) %outx + store volatile i32 %id.y, ptr addrspace(1) %outy + store volatile i32 %id.z, ptr addrspace(1) %outz + ret void +} + +declare i32 @llvm.amdgcn.workgroup.id.x() +declare i32 @llvm.amdgcn.workgroup.id.y() +declare i32 @llvm.amdgcn.workgroup.id.z() +declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll deleted file mode 100644 index 495b54758de04..0000000000000 --- a/llvm/test/CodeGen/AMDGPU/lower-work-group-id-intrinsics.ll +++ /dev/null @@ -1,128 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx900 -mattr=+architected-sgprs -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9,GFX9-GISEL %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s -; RUN: llc -mtriple=amdgcn-amd-amdpal -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s - -define amdgpu_cs void @_amdgpu_cs_main() { -; GFX9-SDAG-LABEL: _amdgpu_cs_main: -; GFX9-SDAG: ; %bb.0: ; %.entry -; GFX9-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX9-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-SDAG-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: _amdgpu_cs_main: -; GFX9-GISEL: ; %bb.0: ; %.entry -; GFX9-GISEL-NEXT: s_mov_b32 s0, ttmp9 -; GFX9-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX9-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX9-GISEL-NEXT: buffer_store_dwordx3 v[0:2], off, s[0:3], 0 -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: _amdgpu_cs_main: -; GFX12-SDAG: ; %bb.0: ; %.entry -; GFX12-SDAG-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-SDAG-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, s1 -; GFX12-SDAG-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-SDAG-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: _amdgpu_cs_main: -; GFX12-GISEL: ; %bb.0: ; %.entry -; GFX12-GISEL-NEXT: s_mov_b32 s0, ttmp9 -; GFX12-GISEL-NEXT: s_and_b32 s1, ttmp7, 0xffff -; GFX12-GISEL-NEXT: s_lshr_b32 s2, ttmp7, 16 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GFX12-GISEL-NEXT: buffer_store_b96 v[0:2], off, s[0:3], null -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm -.entry: - %idx = call i32 @llvm.amdgcn.workgroup.id.x() - %idy = call i32 @llvm.amdgcn.workgroup.id.y() - %idz = call i32 @llvm.amdgcn.workgroup.id.z() - %ielemx = insertelement <3 x i32> undef, i32 %idx, i64 0 - %ielemy = insertelement <3 x i32> %ielemx, i32 %idy, i64 1 - %ielemz = insertelement <3 x i32> %ielemy, i32 %idz, i64 2 - call void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32> %ielemz, ptr addrspace(8) undef, i32 0, i32 0, i32 0) - ret void -} - -define amdgpu_cs void @caller() { -; GFX9-SDAG-LABEL: caller: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_getpc_b64 s[8:9] -; GFX9-SDAG-NEXT: s_mov_b32 s8, s0 -; 
GFX9-SDAG-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 -; GFX9-SDAG-NEXT: s_mov_b32 s5, callee@abs32@hi -; GFX9-SDAG-NEXT: s_mov_b32 s4, callee@abs32@lo -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: s_add_u32 s8, s8, s0 -; GFX9-SDAG-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-SDAG-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9-SDAG-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX9-SDAG-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: caller: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_getpc_b64 s[8:9] -; GFX9-GISEL-NEXT: s_mov_b32 s8, s0 -; GFX9-GISEL-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x10 -; GFX9-GISEL-NEXT: s_mov_b32 s4, callee@abs32@lo -; GFX9-GISEL-NEXT: s_mov_b32 s5, callee@abs32@hi -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: s_add_u32 s8, s8, s0 -; GFX9-GISEL-NEXT: s_addc_u32 s9, s9, 0 -; GFX9-GISEL-NEXT: s_mov_b64 s[0:1], s[8:9] -; GFX9-GISEL-NEXT: s_mov_b64 s[2:3], s[10:11] -; GFX9-GISEL-NEXT: s_swappc_b64 s[30:31], s[4:5] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: caller: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX12-SDAG-NEXT: s_mov_b32 s1, callee@abs32@hi -; GFX12-SDAG-NEXT: s_mov_b32 s0, callee@abs32@lo -; GFX12-SDAG-NEXT: s_mov_b32 s32, 0 -; GFX12-SDAG-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: caller: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX12-GISEL-NEXT: s_mov_b32 s0, callee@abs32@lo -; GFX12-GISEL-NEXT: s_mov_b32 s1, callee@abs32@hi -; GFX12-GISEL-NEXT: s_mov_b32 s32, 0 -; GFX12-GISEL-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX12-GISEL-NEXT: s_endpgm - %idx = call i32 @llvm.amdgcn.workgroup.id.x() - call amdgpu_gfx void @callee(i32 %idx) - ret void -} - -declare amdgpu_gfx void @callee(i32) - -declare i32 @llvm.amdgcn.workgroup.id.x() -declare i32 @llvm.amdgcn.workgroup.id.y() -declare i32 @llvm.amdgcn.workgroup.id.z() -declare void @llvm.amdgcn.raw.ptr.buffer.store.v3i32(<3 x i32>, ptr addrspace(8), i32, i32, i32 immarg) -;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: -; GFX12: {{.*}} -; GFX9: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll index 769e6b0964abd..40e4692a18ec7 100644 --- a/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/workgroup-id-in-arch-sgprs.ll @@ -5,43 +5,25 @@ ; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -global-isel=1 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @workgroup_id_x(ptr addrspace(1) %ptrx) { -; GFX9-SDAG-LABEL: workgroup_id_x: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_endpgm ; -; GFX9-GISEL-LABEL: workgroup_id_x: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_endpgm -; -; GFX12-SDAG-LABEL: workgroup_id_x: -; GFX12-SDAG: ; %bb.0: -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_x: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: s_endpgm ; -; GFX12-GISEL-LABEL: workgroup_id_x: -; GFX12-GISEL: ; %bb.0: -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: workgroup_id_x: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm %idx = call i32 @llvm.amdgcn.workgroup.id.x() store i32 %idx, ptr addrspace(1) %ptrx @@ -52,23 +34,23 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace ; GFX9-LABEL: workgroup_id_xy: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, ttmp7 -; GFX9-NEXT: global_store_dword v0, v1, s[2:3] +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v2, v0, s[0:1] +; GFX9-NEXT: global_store_dword v2, v1, s[2:3] ; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xy: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 -; GFX12-NEXT: 
v_mov_b32_e32 v2, ttmp7 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, ttmp7 +; GFX12-NEXT: v_mov_b32_e32 v2, 0 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-NEXT: global_store_b32 v0, v2, s[2:3] +; GFX12-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-NEXT: global_store_b32 v2, v1, s[2:3] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -81,37 +63,21 @@ define amdgpu_kernel void @workgroup_id_xy(ptr addrspace(1) %ptrx, ptr addrspace } define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspace(1) %ptry, ptr addrspace(1) %ptrz) { -; GFX9-SDAG-LABEL: workgroup_id_xyz: -; GFX9-SDAG: ; %bb.0: -; GFX9-SDAG-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-SDAG-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, ttmp9 -; GFX9-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[0:1] -; GFX9-SDAG-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[2:3] -; GFX9-SDAG-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-SDAG-NEXT: global_store_dword v0, v1, s[6:7] -; GFX9-SDAG-NEXT: s_endpgm -; -; GFX9-GISEL-LABEL: workgroup_id_xyz: -; GFX9-GISEL: ; %bb.0: -; GFX9-GISEL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GFX9-GISEL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x10 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, ttmp9 -; GFX9-GISEL-NEXT: v_mov_b32_e32 v1, 0 -; GFX9-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[0:1] -; GFX9-GISEL-NEXT: s_and_b32 s0, ttmp7, 0xffff -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: s_lshr_b32 s0, ttmp7, 16 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[2:3] -; GFX9-GISEL-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-GISEL-NEXT: global_store_dword v1, v0, s[6:7] -; GFX9-GISEL-NEXT: s_endpgm +; GFX9-LABEL: workgroup_id_xyz: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 +; GFX9-NEXT: v_mov_b32_e32 v0, ttmp9 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 +; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: s_and_b32 s6, ttmp7, 0xffff +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_dword v1, v0, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: s_lshr_b32 s0, ttmp7, 16 +; GFX9-NEXT: global_store_dword v1, v0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: global_store_dword v1, v0, s[4:5] +; GFX9-NEXT: s_endpgm ; ; GFX12-LABEL: workgroup_id_xyz: ; GFX12: ; %bb.0: @@ -119,15 +85,15 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac ; GFX12-NEXT: s_load_b128 s[4:7], s[0:1], 0x0 ; GFX12-NEXT: s_load_b64 s[0:1], s[0:1], 0x10 ; GFX12-NEXT: s_and_b32 s2, ttmp7, 0xffff -; GFX12-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, ttmp9 +; GFX12-NEXT: v_dual_mov_b32 v0, ttmp9 :: v_dual_mov_b32 v1, 0 ; GFX12-NEXT: s_lshr_b32 s3, ttmp7, 16 ; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: s_wait_kmcnt 0x0 ; GFX12-NEXT: s_clause 0x2 -; GFX12-NEXT: global_store_b32 v0, v1, s[4:5] -; GFX12-NEXT: global_store_b32 v0, v2, s[6:7] -; GFX12-NEXT: global_store_b32 v0, v3, s[0:1] +; GFX12-NEXT: global_store_b32 v1, v0, s[4:5] +; GFX12-NEXT: global_store_b32 v1, v2, s[6:7] +; GFX12-NEXT: global_store_b32 v1, v3, s[0:1] ; GFX12-NEXT: s_nop 0 ; GFX12-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-NEXT: s_endpgm @@ -144,3 +110,8 @@ define amdgpu_kernel void @workgroup_id_xyz(ptr addrspace(1) %ptrx, ptr addrspac declare i32 @llvm.amdgcn.workgroup.id.x() declare i32 @llvm.amdgcn.workgroup.id.y() declare i32 @llvm.amdgcn.workgroup.id.z() +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX12-GISEL: {{.*}} +; GFX12-SDAG: {{.*}} +; GFX9-GISEL: {{.*}} +; GFX9-SDAG: {{.*}} From aaa93ce7323332d8290b8f563d4d71689c1094c5 Mon Sep 17 00:00:00 2001 From: Tom Stellard Date: Tue, 29 Aug 2023 15:49:55 -0700 Subject: [PATCH 796/843] compiler-rt: Fix FLOAT16 feature detection CMAKE_TRY_COMPILE_TARGET_TYPE defaults to EXECUTABLE, which causes any feature detection code snippet without a main function to fail, so we need to make sure it gets explicitly set to STATIC_LIBRARY. Bug: https://github.com/ROCm/rocFFT/issues/439 Bug: https://github.com/ROCm/rocBLAS/issues/1350 Bug: https://bugs.gentoo.org/916069 Closes: https://github.com/llvm/llvm-project/pull/69842 Reviewed by: thesamesam, mgorny --- compiler-rt/lib/builtins/CMakeLists.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/builtins/CMakeLists.txt b/compiler-rt/lib/builtins/CMakeLists.txt index 28ded8766f253..8b6dc7ce0c378 100644 --- a/compiler-rt/lib/builtins/CMakeLists.txt +++ b/compiler-rt/lib/builtins/CMakeLists.txt @@ -5,7 +5,6 @@ if (CMAKE_SOURCE_DIR STREQUAL CMAKE_CURRENT_SOURCE_DIR) cmake_minimum_required(VERSION 3.20.0) - set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) project(CompilerRTBuiltins C ASM) set(COMPILER_RT_STANDALONE_BUILD TRUE) set(COMPILER_RT_BUILTINS_STANDALONE_BUILD TRUE) @@ -57,6 +56,8 @@ if (COMPILER_RT_STANDALONE_BUILD) ON) endif() +set(CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) + include(builtin-config-ix) include(CMakeDependentOption) include(CMakePushCheckState) From 777eb35614eff30f8fe8ca7729b9c04846a09476 Mon Sep 17 00:00:00 2001 From: felixh5678 <157516335+felixh5678@users.noreply.github.com> Date: Wed, 24 Jan 2024 10:16:12 -0500 Subject: [PATCH 797/843] [libc] Add sqrtf128 implementation for Linux x86_64. 
(#79195) Co-authored-by: Tue Ly Co-authored-by: Felix --- libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 12 ++++---- libc/spec/stdc.td | 1 + libc/src/math/CMakeLists.txt | 1 + libc/src/math/generic/CMakeLists.txt | 12 ++++++++ libc/src/math/generic/sqrtf128.cpp | 17 +++++++++++ libc/src/math/sqrtf128.h | 20 +++++++++++++ libc/test/src/math/smoke/CMakeLists.txt | 30 +++++++++++++++++++ .../src/math/smoke/generic_sqrtf128_test.cpp | 13 ++++++++ libc/test/src/math/smoke/sqrtf128_test.cpp | 13 ++++++++ 10 files changed, 115 insertions(+), 5 deletions(-) create mode 100644 libc/src/math/generic/sqrtf128.cpp create mode 100644 libc/src/math/sqrtf128.h create mode 100644 libc/test/src/math/smoke/generic_sqrtf128_test.cpp create mode 100644 libc/test/src/math/smoke/sqrtf128_test.cpp diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index d5ab891674a2d..a1486ac689812 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -374,6 +374,7 @@ if(LIBC_COMPILER_HAS_FLOAT128) # math.h C23 _Float128 entrypoints libc.src.math.copysignf128 libc.src.math.fabsf128 + libc.src.math.sqrtf128 ) endif() diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 49593dfe01c81..e28b8e8798844 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -452,6 +452,8 @@ Higher Math Functions +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | sqrtl | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ +| sqrtf128 | |check| | | | | | | | | | | | | ++------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | tan | |check| | | | | | | | | | | | | +------------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+---------+ | tanf | |check| | |check| | |check| | |check| | |check| | | | |check| | |check| | |check| | | | @@ -475,9 +477,9 @@ Higher Math Functions Accuracy of Higher Math Functions ================================= -============== ================ =============== ====================== - (float) (double) (long double) -============== ================ =============== ====================== +============== ================ =============== ====================== ====================== + (float) (double) (long double) (float128) +============== ================ =============== ====================== ====================== acos |check| acosh |check| asin |check| @@ -501,10 +503,10 @@ pow |check| sin |check| large sincos |check| large sinh |check| -sqrt |check| |check| |check| +sqrt |check| |check| |check| |check| tan |check| tanh |check| -============== ================ =============== ====================== +============== ================ =============== ====================== ====================== Legends: diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index 99e9b3ca65ca5..8c2feba23adbf 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -480,6 +480,7 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"sqrt", RetValSpec, [ArgSpec]>, FunctionSpec<"sqrtf", RetValSpec, [ArgSpec]>, FunctionSpec<"sqrtl", RetValSpec, [ArgSpec]>, + FunctionSpec<"sqrtf128", RetValSpec, [ArgSpec]>, 
FunctionSpec<"trunc", RetValSpec, [ArgSpec]>, FunctionSpec<"truncf", RetValSpec, [ArgSpec]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index a8d3fbd475f07..3097c4b3a82cc 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -231,6 +231,7 @@ add_math_entrypoint_object(sinhf) add_math_entrypoint_object(sqrt) add_math_entrypoint_object(sqrtf) add_math_entrypoint_object(sqrtl) +add_math_entrypoint_object(sqrtf128) add_math_entrypoint_object(tan) add_math_entrypoint_object(tanf) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index 887a8e6038a49..8b32a3296bb87 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -1315,6 +1315,18 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + sqrtf128 + SRCS + sqrtf128.cpp + HDRS + ../sqrtf128.h + DEPENDS + libc.src.__support.FPUtil.sqrt + COMPILE_OPTIONS + -O3 + ) + add_entrypoint_object( remquof SRCS diff --git a/libc/src/math/generic/sqrtf128.cpp b/libc/src/math/generic/sqrtf128.cpp new file mode 100644 index 0000000000000..0196c3e0a96ae --- /dev/null +++ b/libc/src/math/generic/sqrtf128.cpp @@ -0,0 +1,17 @@ +//===-- Implementation of sqrtf128 function -------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sqrtf128.h" +#include "src/__support/FPUtil/sqrt.h" +#include "src/__support/common.h" + +namespace LIBC_NAMESPACE { + +LLVM_LIBC_FUNCTION(float128, sqrtf128, (float128 x)) { return fputil::sqrt(x); } + +} // namespace LIBC_NAMESPACE diff --git a/libc/src/math/sqrtf128.h b/libc/src/math/sqrtf128.h new file mode 100644 index 0000000000000..bccb6bbb6332d --- /dev/null +++ b/libc/src/math/sqrtf128.h @@ -0,0 +1,20 @@ +//===-- Implementation header for sqrtf128 ----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SQRTF128_H +#define LLVM_LIBC_SRC_MATH_SQRTF128_H + +#include "src/__support/macros/properties/float.h" + +namespace LIBC_NAMESPACE { + +float128 sqrtf128(float128 x); + +} // namespace LIBC_NAMESPACE + +#endif // LLVM_LIBC_SRC_MATH_SQRTF128_H diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 163fa924b243a..03526e019c3e9 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -1102,6 +1102,20 @@ add_fp_unittest( UNIT_TEST_ONLY ) +add_fp_unittest( + sqrtf128_test + SUITE + libc-math-smoke-tests + SRCS + sqrtf128_test.cpp + DEPENDS + libc.include.math + libc.src.math.sqrtf128 + libc.src.__support.FPUtil.fp_bits + # FIXME: Currently fails on the GPU build. 
+ UNIT_TEST_ONLY +) + add_fp_unittest( generic_sqrtf_test SUITE @@ -1150,6 +1164,22 @@ add_fp_unittest( UNIT_TEST_ONLY ) +add_fp_unittest( + generic_sqrtf128_test + SUITE + libc-math-smoke-tests + SRCS + generic_sqrtf128_test.cpp + DEPENDS + libc.src.math.sqrtf128 + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.generic.sqrt + COMPILE_OPTIONS + -O3 + # FIXME: Currently fails on the GPU build. + UNIT_TEST_ONLY +) + add_fp_unittest( remquof_test SUITE diff --git a/libc/test/src/math/smoke/generic_sqrtf128_test.cpp b/libc/test/src/math/smoke/generic_sqrtf128_test.cpp new file mode 100644 index 0000000000000..edba114adf06c --- /dev/null +++ b/libc/test/src/math/smoke/generic_sqrtf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for generic implementation of sqrtf128-------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/__support/FPUtil/generic/sqrt.h" + +LIST_SQRT_TESTS(float128, LIBC_NAMESPACE::fputil::sqrt) diff --git a/libc/test/src/math/smoke/sqrtf128_test.cpp b/libc/test/src/math/smoke/sqrtf128_test.cpp new file mode 100644 index 0000000000000..23397b0623ce5 --- /dev/null +++ b/libc/test/src/math/smoke/sqrtf128_test.cpp @@ -0,0 +1,13 @@ +//===-- Unittests for sqrtf128---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "SqrtTest.h" + +#include "src/math/sqrtf128.h" + +LIST_SQRT_TESTS(float128, LIBC_NAMESPACE::sqrtf128) From 9fc890b5a06c5e4a014951a1dd7ad7ba592ceaaf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Wed, 24 Jan 2024 16:18:15 +0100 Subject: [PATCH 798/843] [clang][Parse][NFC] Make a local variable const --- clang/lib/Parse/ParseDeclCXX.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Parse/ParseDeclCXX.cpp b/clang/lib/Parse/ParseDeclCXX.cpp index c0d771dc93dae..ea79917cfc8e9 100644 --- a/clang/lib/Parse/ParseDeclCXX.cpp +++ b/clang/lib/Parse/ParseDeclCXX.cpp @@ -2404,7 +2404,7 @@ VirtSpecifiers::Specifier Parser::isCXX11VirtSpecifier(const Token &Tok) const { if (!getLangOpts().CPlusPlus || Tok.isNot(tok::identifier)) return VirtSpecifiers::VS_None; - IdentifierInfo *II = Tok.getIdentifierInfo(); + const IdentifierInfo *II = Tok.getIdentifierInfo(); // Initialize the contextual keywords. if (!Ident_final) { From 0065d06760c0fba786b7b5ff061b3b3efa08bfbc Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 24 Jan 2024 15:20:05 +0000 Subject: [PATCH 799/843] [NFC][DebugInfo] Maintain RemoveDIs flag when attributor creates functions (#79143) We're using this flag (IsNewDbgInfoFormat) to detect the boundaries in LLVM of what's treating debug-info as intrinsics (i.e. dbg.value), and what's using DPValue objects (the non-intrinsic replacement). The attributor tends to create new wrapper functions and doesn't insert them into Modules in the usual way, thus we have to manually update that flag to signal what debug-info mode it's using. 
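
For illustration, the pattern applied at each creation site boils down to
the following sketch (not the exact patch code; createDetachedFunction is
an illustrative name, and the LLVM-internal types are assumed):

  // A function materialized outside the usual module insertion path does
  // not pick up the module's debug-info mode, so the flag is copied by
  // hand right after creation.
  Function *createDetachedFunction(Function &F, Module &M) {
    Function *NewFn = Function::Create(F.getFunctionType(), F.getLinkage(),
                                       F.getAddressSpace(), F.getName());
    M.getFunctionList().insert(F.getIterator(), NewFn);
    // Flag whether the function is using new-debug-info or not.
    NewFn->IsNewDbgInfoFormat = M.IsNewDbgInfoFormat;
    return NewFn;
  }
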
I've added some --try-experimental-debuginfo-iterators RUN lines to tests that would otherwise crash because of this, so that they're exercised by our new-debuginfo-iterators buildbot. NB: there's an attributor test with a dbg.value in it, however attributes re-order themselves in RemoveDIs mode for various reasons, so we're going to address that in a different patch. --- llvm/lib/Transforms/IPO/Attributor.cpp | 6 ++++++ .../Transforms/Attributor/ArgumentPromotion/alignment.ll | 4 ++++ 2 files changed, 10 insertions(+) diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp index d8e290cbc8a4d..5d1a783b2996d 100644 --- a/llvm/lib/Transforms/IPO/Attributor.cpp +++ b/llvm/lib/Transforms/IPO/Attributor.cpp @@ -2738,6 +2738,8 @@ void Attributor::createShallowWrapper(Function &F) { Function::Create(FnTy, F.getLinkage(), F.getAddressSpace(), F.getName()); F.setName(""); // set the inside function anonymous M.getFunctionList().insert(F.getIterator(), Wrapper); + // Flag whether the function is using new-debug-info or not. + Wrapper->IsNewDbgInfoFormat = M.IsNewDbgInfoFormat; F.setLinkage(GlobalValue::InternalLinkage); @@ -2818,6 +2820,8 @@ bool Attributor::internalizeFunctions(SmallPtrSetImpl &FnSet, VMap[&Arg] = &(*NewFArgIt++); } SmallVector Returns; + // Flag whether the function is using new-debug-info or not. + Copied->IsNewDbgInfoFormat = F->IsNewDbgInfoFormat; // Copy the body of the original function to the new one CloneFunctionInto(Copied, F, VMap, @@ -3035,6 +3039,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures( OldFn->getParent()->getFunctionList().insert(OldFn->getIterator(), NewFn); NewFn->takeName(OldFn); NewFn->copyAttributesFrom(OldFn); + // Flag whether the function is using new-debug-info or not. + NewFn->IsNewDbgInfoFormat = OldFn->IsNewDbgInfoFormat; // Patch the pointer to LLVM function in debug info descriptor. NewFn->setSubprogram(OldFn->getSubprogram()); diff --git a/llvm/test/Transforms/Attributor/ArgumentPromotion/alignment.ll b/llvm/test/Transforms/Attributor/ArgumentPromotion/alignment.ll index 9ae21ca44ee8c..54a5b8c564077 100644 --- a/llvm/test/Transforms/Attributor/ArgumentPromotion/alignment.ll +++ b/llvm/test/Transforms/Attributor/ArgumentPromotion/alignment.ll @@ -2,6 +2,10 @@ ; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,TUNIT ; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s | FileCheck %s --check-prefixes=CHECK,CGSCC +;; Test with RemoveDIs debug-info mode to exercise a potential crash path. +; RUN: opt -aa-pipeline=basic-aa -passes=attributor -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s --try-experimental-debuginfo-iterators | FileCheck %s --check-prefixes=CHECK,TUNIT +; RUN: opt -aa-pipeline=basic-aa -passes=attributor-cgscc -attributor-manifest-internal -attributor-annotate-decl-cs -S < %s --try-experimental-debuginfo-iterators | FileCheck %s --check-prefixes=CHECK,CGSCC + define void @f() { ; TUNIT-LABEL: define {{[^@]+}}@f() { ; TUNIT-NEXT: entry: From dc5b4daae7077b644753e53f175d0f5785fede49 Mon Sep 17 00:00:00 2001 From: quic-asaravan <156995626+quic-asaravan@users.noreply.github.com> Date: Wed, 24 Jan 2024 21:00:33 +0530 Subject: [PATCH 800/843] [HEXAGON] Inlining Division (#79021) This patch inlines float division function calls for hexagon. 
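
In scalar terms, the selected sequence performs a Newton-Raphson
refinement of a hardware reciprocal estimate. Roughly, as a sketch of the
math only (approx_recip is a hypothetical stand-in for the F2_sfrecipa
estimate, and the fixup and scaling steps are omitted):

  extern float approx_recip(float); // hypothetical: hardware estimate

  float fdiv_sketch(float n, float d) {
    float r = approx_recip(d); // initial estimate r ~= 1/d (F2_sfrecipa)
    float e = 1.0f - d * r;    // error term               (F2_sffms_lib)
    r = r + e * r;             // first refinement         (F2_sffma_lib)
    e = 1.0f - d * r;          // recompute the error
    r = r + e * r;             // second refinement
    return n * r;              // fast path: plain multiply (F2_sfmpy);
                               // the precise path further refines the
                               // quotient and finishes with F2_sffma_sc
  }
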
Co-authored-by: Awanish Pandey --- .../Target/Hexagon/HexagonISelDAGToDAG.cpp | 94 +++++++++++++++++++ llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h | 3 + .../Target/Hexagon/HexagonISelLowering.cpp | 11 +++ llvm/lib/Target/Hexagon/HexagonISelLowering.h | 1 + .../CodeGen/Hexagon/inline-division-space.ll | 30 ++++++ llvm/test/CodeGen/Hexagon/inline-division.ll | 29 ++++++ 6 files changed, 168 insertions(+) create mode 100644 llvm/test/CodeGen/Hexagon/inline-division-space.ll create mode 100644 llvm/test/CodeGen/Hexagon/inline-division.ll diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp index e4127b0b94c62..6fe3fe0d36b9e 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp @@ -904,6 +904,98 @@ void HexagonDAGToDAGISel::SelectQ2V(SDNode *N) { ReplaceNode(N, T); } +void HexagonDAGToDAGISel::FDiv(SDNode *N) { + const SDLoc &dl(N); + ArrayRef ResultType(N->value_begin(), N->value_end()); + SmallVector Ops; + Ops = {N->getOperand(0), N->getOperand(1)}; + SDVTList VTs; + VTs = CurDAG->getVTList(MVT::f32, MVT::f32); + SDNode *ResScale = CurDAG->getMachineNode(Hexagon::F2_sfrecipa, dl, VTs, Ops); + SDNode *D = CurDAG->getMachineNode(Hexagon::F2_sffixupd, dl, MVT::f32, Ops); + + SDValue C = CurDAG->getTargetConstant(0x3f800000, dl, MVT::i32); + SDNode *constNode = + CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::f32, C); + + SDNode *n = CurDAG->getMachineNode(Hexagon::F2_sffixupn, dl, MVT::f32, Ops); + SDNode *Err = CurDAG->getMachineNode(Hexagon::F2_sffms_lib, dl, MVT::f32, + SDValue(constNode, 0), SDValue(D, 0), + SDValue(ResScale, 0)); + SDNode *NewRec = CurDAG->getMachineNode(Hexagon::F2_sffma_lib, dl, MVT::f32, + SDValue(ResScale, 0), SDValue(Err, 0), + SDValue(ResScale, 0)); + SDNode *newErr = CurDAG->getMachineNode(Hexagon::F2_sffms_lib, dl, MVT::f32, + SDValue(constNode, 0), SDValue(D, 0), + SDValue(NewRec, 0)); + SDNode *q = CurDAG->getMachineNode( + Hexagon::A2_andir, dl, MVT::f32, SDValue(n, 0), + CurDAG->getTargetConstant(0x80000000, dl, MVT::i32)); + SDNode *NewQ = + CurDAG->getMachineNode(Hexagon::F2_sffma_lib, dl, MVT::f32, SDValue(q, 0), + SDValue(n, 0), SDValue(NewRec, 0)); + SDNode *NNewRec = CurDAG->getMachineNode( + Hexagon::F2_sffma_lib, dl, MVT::f32, SDValue(NewRec, 0), + SDValue(newErr, 0), SDValue(NewRec, 0)); + SDNode *qErr = + CurDAG->getMachineNode(Hexagon::F2_sffms_lib, dl, MVT::f32, SDValue(n, 0), + SDValue(D, 0), SDValue(NewQ, 0)); + SDNode *NNewQ = CurDAG->getMachineNode(Hexagon::F2_sffma_lib, dl, MVT::f32, + SDValue(NewQ, 0), SDValue(qErr, 0), + SDValue(NNewRec, 0)); + + SDNode *NqErr = + CurDAG->getMachineNode(Hexagon::F2_sffms_lib, dl, MVT::f32, SDValue(n, 0), + SDValue(NNewQ, 0), SDValue(D, 0)); + std::array temp1 = {SDValue(NNewQ, 0), SDValue(NqErr, 0), + SDValue(NNewRec, 0), SDValue(ResScale, 1)}; + ArrayRef OpValue1(temp1); + SDNode *FinalNewQ = + CurDAG->getMachineNode(Hexagon::F2_sffma_sc, dl, MVT::f32, OpValue1); + ReplaceNode(N, FinalNewQ); +} + +void HexagonDAGToDAGISel::FastFDiv(SDNode *N) { + const SDLoc &dl(N); + ArrayRef ResultType(N->value_begin(), N->value_end()); + SmallVector Ops; + Ops = {N->getOperand(0), N->getOperand(1)}; + SDVTList VTs; + VTs = CurDAG->getVTList(MVT::f32, MVT::f32); + SDNode *ResScale = CurDAG->getMachineNode(Hexagon::F2_sfrecipa, dl, VTs, Ops); + SDNode *D = CurDAG->getMachineNode(Hexagon::F2_sffixupd, dl, MVT::f32, Ops); + + SDValue C = CurDAG->getTargetConstant(0x3f800000, dl, MVT::i32); + SDNode 
*constNode = + CurDAG->getMachineNode(Hexagon::A2_tfrsi, dl, MVT::f32, C); + + SDNode *n = CurDAG->getMachineNode(Hexagon::F2_sffixupn, dl, MVT::f32, Ops); + SDNode *Err = CurDAG->getMachineNode(Hexagon::F2_sffms_lib, dl, MVT::f32, + SDValue(constNode, 0), SDValue(D, 0), + SDValue(ResScale, 0)); + SDNode *NewRec = CurDAG->getMachineNode(Hexagon::F2_sffma_lib, dl, MVT::f32, + SDValue(ResScale, 0), SDValue(Err, 0), + SDValue(ResScale, 0)); + SDNode *newErr = CurDAG->getMachineNode(Hexagon::F2_sffms_lib, dl, MVT::f32, + SDValue(constNode, 0), SDValue(D, 0), + SDValue(NewRec, 0)); + + SDNode *NNewRec = CurDAG->getMachineNode( + Hexagon::F2_sffma_lib, dl, MVT::f32, SDValue(NewRec, 0), + SDValue(newErr, 0), SDValue(NewRec, 0)); + SDNode *FinalNewQ = CurDAG->getMachineNode( + Hexagon::F2_sfmpy, dl, MVT::f32, SDValue(NNewRec, 0), SDValue(n, 0)); + ReplaceNode(N, FinalNewQ); +} + +void HexagonDAGToDAGISel::SelectFDiv(SDNode *N) { + if (N->getFlags().hasAllowReassociation()) + FastFDiv(N); + else + FDiv(N); + return; +} + void HexagonDAGToDAGISel::Select(SDNode *N) { if (N->isMachineOpcode()) return N->setNodeId(-1); // Already selected. @@ -949,6 +1041,8 @@ void HexagonDAGToDAGISel::Select(SDNode *N) { case HexagonISD::D2P: return SelectD2P(N); case HexagonISD::Q2V: return SelectQ2V(N); case HexagonISD::V2Q: return SelectV2Q(N); + case ISD::FDIV: + return SelectFDiv(N); } SelectCode(N); diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h index 8fb1760936e81..50162b1079964 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h +++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h @@ -110,6 +110,9 @@ class HexagonDAGToDAGISel : public SelectionDAGISel { void SelectD2P(SDNode *N); void SelectQ2V(SDNode *N); void SelectV2Q(SDNode *N); + void SelectFDiv(SDNode *N); + void FDiv(SDNode *N); + void FastFDiv(SDNode *N); // Include the declarations autogenerated from the selection patterns. 
#define GET_DAGISEL_DECL diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp index 51138091f4a55..13691053ddd70 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -653,6 +653,13 @@ bool HexagonTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op, return Subtarget.getInstrInfo()->isValidAutoIncImm(VT, V); } +SDValue HexagonTargetLowering::LowerFDIV(SDValue Op, SelectionDAG &DAG) const { + if (DAG.getMachineFunction().getFunction().hasOptSize()) + return SDValue(); + else + return Op; +} + SDValue HexagonTargetLowering::LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const { MachineFunction &MF = DAG.getMachineFunction(); @@ -1765,6 +1772,7 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FADD, MVT::f64, Expand); setOperationAction(ISD::FSUB, MVT::f64, Expand); setOperationAction(ISD::FMUL, MVT::f64, Expand); + setOperationAction(ISD::FDIV, MVT::f32, Custom); setOperationAction(ISD::FMINNUM, MVT::f32, Legal); setOperationAction(ISD::FMAXNUM, MVT::f32, Legal); @@ -3341,6 +3349,9 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { errs() << "Error: check for a non-legal type in this operation\n"; #endif llvm_unreachable("Should not custom lower this!"); + + case ISD::FDIV: + return LowerFDIV(Op, DAG); case ISD::CONCAT_VECTORS: return LowerCONCAT_VECTORS(Op, DAG); case ISD::INSERT_SUBVECTOR: return LowerINSERT_SUBVECTOR(Op, DAG); case ISD::INSERT_VECTOR_ELT: return LowerINSERT_VECTOR_ELT(Op, DAG); diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h index 8c7d0b70f3857..cb09e5b17843e 100644 --- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h @@ -204,6 +204,7 @@ class HexagonTargetLowering : public TargetLowering { SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINLINEASM(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFDIV(SDValue Op, SelectionDAG &DAG) const; SDValue LowerPREFETCH(SDValue Op, SelectionDAG &DAG) const; SDValue LowerREADCYCLECOUNTER(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_LABEL(SDValue Op, SelectionDAG &DAG) const; diff --git a/llvm/test/CodeGen/Hexagon/inline-division-space.ll b/llvm/test/CodeGen/Hexagon/inline-division-space.ll new file mode 100644 index 0000000000000..9cf3c5c8b2b84 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/inline-division-space.ll @@ -0,0 +1,30 @@ +; Test for checking division is inlined or not in case of Os. 
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+
+; Function Attrs: optsize
+define dso_local i32 @testInt(i32 %a, i32 %b) local_unnamed_addr #0 {
+entry:
+;CHECK: call __hexagon_divsi3
+  %div = sdiv i32 %a, %b
+  %conv = sitofp i32 %div to float
+  %conv1 = fptosi float %conv to i32
+  ret i32 %conv1
+}
+
+; Function Attrs: optsize
+define dso_local float @testFloat(float %a, float %b) local_unnamed_addr #0 {
+entry:
+;CHECK: call __hexagon_divsf3
+  %div = fdiv float %a, %b
+  ret float %div
+}
+
+; Function Attrs: optsize
+define dso_local double @testDouble(double %a, double %b) local_unnamed_addr #0 {
+entry:
+;CHECK: call __hexagon_divdf3
+  %div = fdiv double %a, %b
+  ret double %div
+}
+
+attributes #0 = { optsize }
diff --git a/llvm/test/CodeGen/Hexagon/inline-division.ll b/llvm/test/CodeGen/Hexagon/inline-division.ll
new file mode 100644
index 0000000000000..7249a3f55e868
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/inline-division.ll
@@ -0,0 +1,29 @@
+; Test for checking whether division is inlined when optsize is absent.
+; RUN: llc -O2 -march=hexagon < %s | FileCheck %s
+
+define dso_local i32 @testInt(i32 %a, i32 %b) local_unnamed_addr {
+entry:
+;CHECK: call __hexagon_divsi3
+  %div = sdiv i32 %a, %b
+  %conv = sitofp i32 %div to float
+  %conv1 = fptosi float %conv to i32
+  ret i32 %conv1
+}
+
+define dso_local float @testFloat(float %a, float %b) local_unnamed_addr {
+entry:
+;CHECK-NOT: call __hexagon_divsf3
+;CHECK: sfrecipa
+;CHECK: sffixupn
+;CHECK: and
+;CHECK: sfmpy
+  %div = fdiv float %a, %b
+  ret float %div
+}
+
+define dso_local double @testDouble(double %a, double %b) local_unnamed_addr {
+entry:
+;CHECK: call __hexagon_divdf3
+  %div = fdiv double %a, %b
+  ret double %div
+}

From 31f41f0984303655acf4eaa8d09643a624b1ccb0 Mon Sep 17 00:00:00 2001
From: Christian Sigg
Date: Wed, 24 Jan 2024 16:34:57 +0100
Subject: [PATCH 801/843] [clang][bazel] Fix BUILD after
 4a582845597e97d245e8ffdc14281f922b835e56.
--- .../llvm-project-overlay/clang/BUILD.bazel | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index a0c547b558448..92bf8718145c4 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -477,6 +477,38 @@ gentbl( ], ) +gentbl( + name = "basic_builtins_gen", + tbl_outs = [ + ( + "-gen-clang-builtins", + "include/clang/Basic/Builtins.inc", + ), + ], + tblgen = ":clang-tblgen", + td_file = "include/clang/Basic/Builtins.td", + td_srcs = [ + "include/clang/Basic/Builtins.td", + "include/clang/Basic/BuiltinsBase.td", + ], +) + +gentbl( + name = "basic_builtins_bpf_gen", + tbl_outs = [ + ( + "-gen-clang-builtins", + "include/clang/Basic/BuiltinsBPF.inc", + ), + ], + tblgen = ":clang-tblgen", + td_file = "include/clang/Basic/BuiltinsBPF.td", + td_srcs = [ + "include/clang/Basic/BuiltinsBase.td", + "include/clang/Basic/BuiltinsBPF.td", + ], +) + gentbl( name = "libsema_openclbuiltins_inc_gen", strip_include_prefix = "lib/Sema", @@ -626,6 +658,8 @@ cc_library( ":basic_arm_sve_builtins_gen", ":basic_arm_sve_typeflags_gen", ":basic_attr_gen", + ":basic_builtins_gen", + ":basic_builtins_bpf_gen", ":basic_internal_headers", ":basic_riscv_sifive_vector_builtins_gen", ":basic_riscv_vector_builtin_cg_gen", From 2e81ac25b4e2bfdc71aac19a911525a7f35680be Mon Sep 17 00:00:00 2001 From: Ivan Kosarev Date: Wed, 24 Jan 2024 17:38:16 +0200 Subject: [PATCH 802/843] [AMDGPU][NFC] Simplify AGPR/VGPR load/store operand definitions. (#79289) Part of . --- .../Disassembler/AMDGPUDisassembler.cpp | 48 ++++--------------- llvm/lib/Target/AMDGPU/SIRegisterInfo.td | 29 ++++------- 2 files changed, 18 insertions(+), 59 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp index a9968cfe25b46..4e2e57e943196 100644 --- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp +++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp @@ -365,9 +365,9 @@ static bool IsAGPROperand(const MCInst &Inst, int OpIdx, return Reg >= AMDGPU::AGPR0 && Reg <= AMDGPU::AGPR255; } -static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm, - AMDGPUDisassembler::OpWidthTy Opw, - const MCDisassembler *Decoder) { +static DecodeStatus decodeAVLdSt(MCInst &Inst, unsigned Imm, + AMDGPUDisassembler::OpWidthTy Opw, + const MCDisassembler *Decoder) { auto DAsm = static_cast(Decoder); if (!DAsm->isGFX90A()) { Imm &= 511; @@ -399,6 +399,13 @@ static DecodeStatus decodeOperand_AVLdSt_Any(MCInst &Inst, unsigned Imm, return addOperand(Inst, DAsm->decodeSrcOp(Opw, Imm | 256)); } +template +static DecodeStatus decodeAVLdSt(MCInst &Inst, unsigned Imm, + uint64_t /* Addr */, + const MCDisassembler *Decoder) { + return decodeAVLdSt(Inst, Imm, Opw, Decoder); +} + static DecodeStatus decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm, uint64_t Addr, const MCDisassembler *Decoder) { @@ -408,41 +415,6 @@ static DecodeStatus decodeOperand_VSrc_f64(MCInst &Inst, unsigned Imm, Inst, DAsm->decodeSrcOp(AMDGPUDisassembler::OPW64, Imm, false, 64, true)); } -static DecodeStatus -DecodeAVLdSt_32RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, - const MCDisassembler *Decoder) { - return decodeOperand_AVLdSt_Any(Inst, Imm, - AMDGPUDisassembler::OPW32, Decoder); -} - -static DecodeStatus -DecodeAVLdSt_64RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, - const 
MCDisassembler *Decoder) { - return decodeOperand_AVLdSt_Any(Inst, Imm, - AMDGPUDisassembler::OPW64, Decoder); -} - -static DecodeStatus -DecodeAVLdSt_96RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, - const MCDisassembler *Decoder) { - return decodeOperand_AVLdSt_Any(Inst, Imm, - AMDGPUDisassembler::OPW96, Decoder); -} - -static DecodeStatus -DecodeAVLdSt_128RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, - const MCDisassembler *Decoder) { - return decodeOperand_AVLdSt_Any(Inst, Imm, - AMDGPUDisassembler::OPW128, Decoder); -} - -static DecodeStatus -DecodeAVLdSt_160RegisterClass(MCInst &Inst, unsigned Imm, uint64_t Addr, - const MCDisassembler *Decoder) { - return decodeOperand_AVLdSt_Any(Inst, Imm, AMDGPUDisassembler::OPW160, - Decoder); -} - #define DECODE_SDWA(DecName) \ DECODE_OPERAND(decodeSDWA##DecName, decodeSDWA##DecName) diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td index b3265b73fa7e1..404601d981138 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td @@ -1384,30 +1384,17 @@ def AVDst_512 : RegisterOperand { let EncoderMethod = "getAVOperandEncoding"; } -def AVLdSt_32 : RegisterOperand { - let DecoderMethod = "DecodeAVLdSt_32RegisterClass"; +class AVLdStOperand + : RegisterOperand { + let DecoderMethod = "decodeAVLdSt"; let EncoderMethod = "getAVOperandEncoding"; } -def AVLdSt_64 : RegisterOperand { - let DecoderMethod = "DecodeAVLdSt_64RegisterClass"; - let EncoderMethod = "getAVOperandEncoding"; -} - -def AVLdSt_96 : RegisterOperand { - let DecoderMethod = "DecodeAVLdSt_96RegisterClass"; - let EncoderMethod = "getAVOperandEncoding"; -} - -def AVLdSt_128 : RegisterOperand { - let DecoderMethod = "DecodeAVLdSt_128RegisterClass"; - let EncoderMethod = "getAVOperandEncoding"; -} - -def AVLdSt_160 : RegisterOperand { - let DecoderMethod = "DecodeAVLdSt_160RegisterClass"; - let EncoderMethod = "getAVOperandEncoding"; -} +def AVLdSt_32 : AVLdStOperand; +def AVLdSt_64 : AVLdStOperand; +def AVLdSt_96 : AVLdStOperand; +def AVLdSt_128 : AVLdStOperand; +def AVLdSt_160 : AVLdStOperand; //===----------------------------------------------------------------------===// // ACSrc_* Operands with an AGPR or an inline constant From ca8605a78b8dd531c164f6a48a180ccf2770d042 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 24 Jan 2024 10:44:35 -0500 Subject: [PATCH 803/843] [ci] Remove bits that are unused since we stopped using Phabricator --- .ci/generate-buildkite-pipeline-premerge | 29 +----------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/.ci/generate-buildkite-pipeline-premerge b/.ci/generate-buildkite-pipeline-premerge index f32eb7213b940..34d914696018d 100755 --- a/.ci/generate-buildkite-pipeline-premerge +++ b/.ci/generate-buildkite-pipeline-premerge @@ -9,7 +9,7 @@ # # This file generates a Buildkite pipeline that triggers the various CI jobs for -# the LLVM project during pre-commit CI (each time a Phabricator diff is uploaded). +# the LLVM project during pre-commit CI. # # See https://buildkite.com/docs/agent/v3/cli-pipeline#pipeline-format. # @@ -30,8 +30,6 @@ git fetch origin main:main : ${WINDOWS_AGENTS:='{"queue": "windows"}'} # Filter rules for generic linux tests : ${LINUX_AGENTS:='{"queue": "linux"}'} -# Service agents, for interacting with Phabricator. 
-: ${SERVICE_AGENTS:='{"queue": "service"}'}
 # Set by buildkite
 : ${BUILDKITE_COMMIT:=}
 : ${BUILDKITE_BRANCH:=}
@@ -284,28 +282,3 @@ if [[ "${windows_projects}" != "" ]]; then
     - 'bash .ci/monolithic-windows.sh "$(echo ${windows_projects} | tr ' ' ';')" "$(echo ${windows_check_targets})"'
 EOF
 fi
-
-# If build was triggered from a Phabricator review - send an update back.
-if [[ -n "${ph_target_phid:-}" ]]; then
-  cat << EOF
-- continue_on_failure: true
-  wait: '~'
-- label: ':phabricator: update build status on Phabricator'
-  agents: ${SERVICE_AGENTS}
-  artifact_paths:
-    - 'artifacts/**/*'
-  commands:
-    - export SRC=\$\${BUILDKITE_BUILD_PATH}/llvm-premerge-checks
-    - rm -rf \$\${SRC}
-    - git clone --depth 1 https://github.com/google/llvm-premerge-checks.git "\$\${SRC}"
-    - cd \$\${SRC}
-    - git fetch origin "main":x
-    - git checkout x
-    - echo "llvm-premerge-checks commit"
-    - git rev-parse HEAD
-    - pip install -q -r \$\${SRC}/scripts/requirements.txt
-    - cd "\$\$BUILDKITE_BUILD_CHECKOUT_PATH"
-    - \$\${SRC}/scripts/summary.py
-  timeout_in_minutes: 10
-EOF
-fi

From 17db9efe9274e72f42e7e68103dab920ee494ac8 Mon Sep 17 00:00:00 2001
From: Jan Leyonberg
Date: Wed, 24 Jan 2024 10:51:47 -0500
Subject: [PATCH 804/843] [OpenMP][MLIR] Add omp.distribute op to the OMP
 dialect (#67720)

This patch adds the omp.distribute operation to the OMP dialect. The
purpose is to be able to represent the distribute construct in OpenMP
with the associated clauses. The effect of the operation is to
distribute the loop iterations of the loop(s) contained inside the
region across multiple teams.
---
 mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 54 +++++++++++++++++++
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  | 16 ++++++
 mlir/test/Dialect/OpenMP/invalid.mlir         |  9 ++++
 mlir/test/Dialect/OpenMP/ops.mlir             | 30 +++++++++++
 4 files changed, 109 insertions(+)

diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
index d614f2666a85a..96c15e775a302 100644
--- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
+++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td
@@ -638,6 +638,60 @@ def YieldOp : OpenMP_Op<"yield",
   let assemblyFormat = [{ ( `(` $results^ `:` type($results) `)` )? attr-dict}];
 }
 
+//===----------------------------------------------------------------------===//
+// Distribute construct [2.9.4.1]
+//===----------------------------------------------------------------------===//
+def DistributeOp : OpenMP_Op<"distribute", [AttrSizedOperandSegments,
+                             MemoryEffects<[MemWrite]>]> {
+  let summary = "distribute construct";
+  let description = [{
+    The distribute construct specifies that the iterations of one or more loops
+    (optionally specified using the collapse clause) will be executed by the
+    initial teams in the context of their implicit tasks. The loops that the
+    distribute op is associated with start with the outermost loop enclosed by
+    the distribute op region and go down the loop nest toward the innermost
+    loop. The iterations are distributed across the initial threads of all
+    initial teams that execute the teams region to which the distribute region
+    binds.
+
+    The distribute loop construct specifies that the iterations of the loop(s)
+    will be executed in parallel by threads in the current context. These
+    iterations are spread across threads that already exist in the enclosing
+    region. The lower and upper bounds specify a half-open range: the
+    range includes the lower bound but does not include the upper bound. If the
+    `inclusive` attribute is specified then the upper bound is also included.
+
+    The `dist_schedule_static` attribute specifies the schedule for this
+    loop, determining how the loop is distributed across the parallel threads.
+    The optional `schedule_chunk` associated with it further controls this
+    distribution.
+
+    // TODO: private_var, firstprivate_var, lastprivate_var, collapse
+  }];
+  let arguments = (ins
+             UnitAttr:$dist_schedule_static,
+             Optional<IntLikeType>:$chunk_size,
+             Variadic<AnyType>:$allocate_vars,
+             Variadic<AnyType>:$allocators_vars,
+             OptionalAttr<OrderKindAttr>:$order_val);
+
+  let regions = (region AnyRegion:$region);
+
+  let assemblyFormat = [{
+    oilist(`dist_schedule_static` $dist_schedule_static
+          |`chunk_size` `(` $chunk_size `:` type($chunk_size) `)`
+          |`order` `(` custom<ClauseAttr>($order_val) `)`
+          |`allocate` `(`
+             custom<AllocateAndAllocator>(
+               $allocate_vars, type($allocate_vars),
+               $allocators_vars, type($allocators_vars)
+             ) `)`
+    ) $region attr-dict
+  }];
+
+  let hasVerifier = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // 2.10.1 task Construct
 //===----------------------------------------------------------------------===//
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index 2d3be76c65e81..13cc16125a273 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -1153,6 +1153,22 @@ LogicalResult SimdLoopOp::verify() {
   return success();
 }
 
+//===----------------------------------------------------------------------===//
+// Verifier for Distribute construct [2.9.4.1]
+//===----------------------------------------------------------------------===//
+
+LogicalResult DistributeOp::verify() {
+  if (this->getChunkSize() && !this->getDistScheduleStatic())
+    return emitOpError() << "chunk size set without "
+                            "dist_schedule_static being present";
+
+  if (getAllocateVars().size() != getAllocatorsVars().size())
+    return emitError(
+        "expected equal sizes for allocate and allocator variables");
+
+  return success();
+}
+
 //===----------------------------------------------------------------------===//
 // ReductionOp
 //===----------------------------------------------------------------------===//
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index 2b0e86ddd22bb..812b79e35595f 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -1729,3 +1729,12 @@ func.func @omp_target_update_invalid_motion_modifier_5(%map1 : memref<?xi32>) {
   return
 }
 llvm.mlir.global internal @_QFsubEx() : i32
+
+// -----
+
+func.func @omp_distribute(%data_var : memref<i32>) -> () {
+  // expected-error @below {{expected equal sizes for allocate and allocator variables}}
+  "omp.distribute"(%data_var) <{operandSegmentSizes = array<i32: 0, 1, 0>}> ({
+    "omp.terminator"() : () -> ()
+  }) : (memref<i32>) -> ()
+}
diff --git a/mlir/test/Dialect/OpenMP/ops.mlir b/mlir/test/Dialect/OpenMP/ops.mlir
index 3d4f6435572f7..ccf72ae31d439 100644
--- a/mlir/test/Dialect/OpenMP/ops.mlir
+++ b/mlir/test/Dialect/OpenMP/ops.mlir
@@ -479,6 +479,36 @@ func.func @omp_simdloop_pretty_multiple(%lb1 : index, %ub1 : index, %step1 : ind
   return
 }
 
+// CHECK-LABEL: omp_distribute
+func.func @omp_distribute(%chunk_size : i32, %data_var : memref<i32>) -> () {
+  // CHECK: omp.distribute
+  "omp.distribute" () ({
+    omp.terminator
+  }) {} : () -> ()
+  // CHECK: omp.distribute
+  omp.distribute {
+    omp.terminator
+  }
+  // CHECK: omp.distribute dist_schedule_static
+  omp.distribute
dist_schedule_static { + omp.terminator + } + // CHECK: omp.distribute dist_schedule_static chunk_size(%{{.+}} : i32) + omp.distribute dist_schedule_static chunk_size(%chunk_size : i32) { + omp.terminator + } + // CHECK: omp.distribute order(concurrent) + omp.distribute order(concurrent) { + omp.terminator + } + // CHECK: omp.distribute allocate(%{{.+}} : memref -> %{{.+}} : memref) + omp.distribute allocate(%data_var : memref -> %data_var : memref) { + omp.terminator + } +return +} + + // CHECK-LABEL: omp_target func.func @omp_target(%if_cond : i1, %device : si32, %num_threads : i32, %map1: memref, %map2: memref) -> () { From 8d43dad9b86ad0f72100b6f75450f2982f2663b9 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Wed, 24 Jan 2024 16:54:22 +0100 Subject: [PATCH 805/843] [clang][bazel] Fix BUILD after 4a582845597e97d245e8ffdc14281f922b835e56. --- utils/bazel/llvm-project-overlay/clang/BUILD.bazel | 2 ++ 1 file changed, 2 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 92bf8718145c4..63838b2b98041 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -610,6 +610,8 @@ cc_library( ]), hdrs = glob([ "include/clang/Basic/*.h", + "include/clang/Basic/Builtins.inc", + "include/clang/Basic/BuiltinsBPF.inc", ]), copts = [ "-DHAVE_VCS_VERSION_INC", From 6a0118cec079f5963dc5a7a3d9423c55f08b6dad Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 24 Jan 2024 10:55:40 -0500 Subject: [PATCH 806/843] [libc++][docs] Remove mention of Phabricator on the landing page --- libcxx/docs/index.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libcxx/docs/index.rst b/libcxx/docs/index.rst index ab7472071ebb9..aa1bd4b83b265 100644 --- a/libcxx/docs/index.rst +++ b/libcxx/docs/index.rst @@ -221,9 +221,8 @@ can ask for support on the `libcxx forum`_ or on IRC. **Patches** -If you want to contribute a patch to libc++, the best place for that is -`Phabricator `_. Please add `libcxx-commits` as a subscriber. -Also make sure you are subscribed to the `libcxx-commits mailing list`_. +If you want to contribute a patch to libc++, please start by reviewing our +:ref:`documentation about contributing `. **Discussion and Questions** From fc364e26845ce5529caf9f88abcc5a5531d1f59f Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 24 Jan 2024 11:02:51 -0500 Subject: [PATCH 807/843] [libunwind][doc] Remove reference to Phabricator from the landing page --- libunwind/docs/index.rst | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/libunwind/docs/index.rst b/libunwind/docs/index.rst index 7126d02abd296..e1283e7acfcc2 100644 --- a/libunwind/docs/index.rst +++ b/libunwind/docs/index.rst @@ -83,9 +83,8 @@ Please use the tag "libunwind" for new threads. **Patches** -If you want to contribute a patch to libunwind, the best place for that is -`Phabricator `_. Please include [libunwind] in the subject and -add `cfe-commits` as a subscriber. Also make sure you are subscribed to the `cfe-commits mailing list`_. +If you want to contribute a patch to libunwind, please start by reading the LLVM +`documentation about contributing `__. **Discussion and Questions** From 56aa77e1193b7abe65bf3ec16e0f37972345b9f2 Mon Sep 17 00:00:00 2001 From: Danial Klimkin Date: Wed, 24 Jan 2024 17:17:17 +0100 Subject: [PATCH 808/843] Fix bazel build past 4a582845597e97d245e8ffdc14281f922b835e56 (#79318) and keep things sorted. 
--- .../llvm-project-overlay/clang/BUILD.bazel | 66 +++++++++---------- 1 file changed, 31 insertions(+), 35 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel index 63838b2b98041..dda6d9460d582 100644 --- a/utils/bazel/llvm-project-overlay/clang/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/clang/BUILD.bazel @@ -306,6 +306,34 @@ gentbl( ], ) +gentbl( + name = "basic_builtins_bpf_gen", + tbl_outs = [( + "-gen-clang-builtins", + "include/clang/Basic/BuiltinsBPF.inc", + )], + tblgen = ":clang-tblgen", + td_file = "include/clang/Basic/BuiltinsBPF.td", + td_srcs = [ + "include/clang/Basic/BuiltinsBPF.td", + "include/clang/Basic/BuiltinsBase.td", + ], +) + +gentbl( + name = "basic_builtins_gen", + tbl_outs = [( + "-gen-clang-builtins", + "include/clang/Basic/Builtins.inc", + )], + tblgen = ":clang-tblgen", + td_file = "include/clang/Basic/Builtins.td", + td_srcs = [ + "include/clang/Basic/Builtins.td", + "include/clang/Basic/BuiltinsBase.td", + ], +) + gentbl( name = "basic_riscv_vector_builtins_gen", tbl_outs = [( @@ -477,38 +505,6 @@ gentbl( ], ) -gentbl( - name = "basic_builtins_gen", - tbl_outs = [ - ( - "-gen-clang-builtins", - "include/clang/Basic/Builtins.inc", - ), - ], - tblgen = ":clang-tblgen", - td_file = "include/clang/Basic/Builtins.td", - td_srcs = [ - "include/clang/Basic/Builtins.td", - "include/clang/Basic/BuiltinsBase.td", - ], -) - -gentbl( - name = "basic_builtins_bpf_gen", - tbl_outs = [ - ( - "-gen-clang-builtins", - "include/clang/Basic/BuiltinsBPF.inc", - ), - ], - tblgen = ":clang-tblgen", - td_file = "include/clang/Basic/BuiltinsBPF.td", - td_srcs = [ - "include/clang/Basic/BuiltinsBase.td", - "include/clang/Basic/BuiltinsBPF.td", - ], -) - gentbl( name = "libsema_openclbuiltins_inc_gen", strip_include_prefix = "lib/Sema", @@ -610,8 +606,6 @@ cc_library( ]), hdrs = glob([ "include/clang/Basic/*.h", - "include/clang/Basic/Builtins.inc", - "include/clang/Basic/BuiltinsBPF.inc", ]), copts = [ "-DHAVE_VCS_VERSION_INC", @@ -660,8 +654,8 @@ cc_library( ":basic_arm_sve_builtins_gen", ":basic_arm_sve_typeflags_gen", ":basic_attr_gen", - ":basic_builtins_gen", ":basic_builtins_bpf_gen", + ":basic_builtins_gen", ":basic_internal_headers", ":basic_riscv_sifive_vector_builtins_gen", ":basic_riscv_vector_builtin_cg_gen", @@ -978,6 +972,7 @@ cc_library( ":ast_stmt_nodes_gen", ":ast_type_properties_gen", ":basic", + ":basic_builtins_gen", ":lex", ":type_nodes_gen", "//llvm:BinaryFormat", @@ -1123,6 +1118,7 @@ cc_library( ":basic_arm_sve_builtins_gen", ":basic_arm_sve_sema_rangechecks_gen", ":basic_arm_sve_streaming_attrs_gen", + ":basic_builtins_gen", ":basic_riscv_sifive_vector_builtin_sema_gen", ":basic_riscv_vector_builtin_sema_gen", ":edit", From 396b6bbc5ecef93fce09d6463f47b44dc501d2aa Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 24 Jan 2024 08:29:28 -0800 Subject: [PATCH 809/843] [RISCV] Recurse on second operand of two operand shuffles (#79197) This builds on bdc41106ee48dce59c500c9a3957af947f30c8c3. This change completes the migration to a recursive shuffle lowering strategy where when we encounter an unknown two argument shuffle, we lower each operand as a single source permute, and then use a vselect (i.e. a vmerge) to combine the results. This relies for code quality on the post-isel combine which will aggressively fold that vmerge back into the materialization of the second operand if possible. 
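
As a hand-written illustration of the strategy (the mask is chosen for
exposition and does not come from the tests):

  ;; shufflevector <4 x i8> %a, <4 x i8> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
  ;; decomposes into two single-source permutes plus a select:
  ;;   ShuffleMaskLHS = {0, -1, 2, -1}   ; lanes taken from %a -> %a.perm
  ;;   ShuffleMaskRHS = {-1, 1, -1, 3}   ; lanes taken from %b (indices - 4) -> %b.perm
  ;;   %r = vselect <0, 1, 0, 1>, %b.perm, %a.perm   ; emitted as a vmerge

where the post-isel combine can then fold the vmerge into whatever
instruction materializes %b.perm.
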
Note: The change includes only the most immediately obvious of the stylistic cleanup. There's a bunch of code movement that this enables that I'll do as a separate patch as rolling it into this creates an unreadable diff. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 79 +---- .../RISCV/rvv/fixed-vectors-fp-interleave.ll | 36 +- .../RISCV/rvv/fixed-vectors-int-interleave.ll | 36 +- .../RISCV/rvv/fixed-vectors-int-shuffles.ll | 23 +- .../rvv/fixed-vectors-interleaved-access.ll | 333 ++++++++---------- .../rvv/fixed-vectors-shuffle-exact-vlen.ll | 7 +- .../rvv/fixed-vectors-shuffle-transpose.ll | 129 +++---- .../rvv/fixed-vectors-shufflevector-vnsrl.ll | 36 +- .../RISCV/rvv/vector-deinterleave-fixed.ll | 8 +- 9 files changed, 296 insertions(+), 391 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 56a5ab14a4a9f..5d61baae0ce4e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4975,12 +4975,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // As a backup, shuffles can be lowered via a vrgather instruction, possibly // merged with a second vrgather. - SmallVector GatherIndicesLHS, GatherIndicesRHS; - - // Keep a track of which non-undef indices are used by each LHS/RHS shuffle - // half. - DenseMap LHSIndexCounts, RHSIndexCounts; - + SmallVector ShuffleMaskLHS, ShuffleMaskRHS; SmallVector MaskVals; // Now construct the mask that will be used by the blended vrgather operation. @@ -4989,28 +4984,20 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, bool SelectMaskVal = (MaskIndex < (int)NumElts) ^ !SwapOps; MaskVals.push_back(DAG.getConstant(SelectMaskVal, DL, XLenVT)); bool IsLHSOrUndefIndex = MaskIndex < (int)NumElts; - GatherIndicesLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0 - ? DAG.getConstant(MaskIndex, DL, XLenVT) - : DAG.getUNDEF(XLenVT)); - GatherIndicesRHS.push_back( - IsLHSOrUndefIndex ? DAG.getUNDEF(XLenVT) - : DAG.getConstant(MaskIndex - NumElts, DL, XLenVT)); - if (IsLHSOrUndefIndex && MaskIndex >= 0) - ++LHSIndexCounts[MaskIndex]; - if (!IsLHSOrUndefIndex) - ++RHSIndexCounts[MaskIndex - NumElts]; + ShuffleMaskLHS.push_back(IsLHSOrUndefIndex && MaskIndex >= 0 + ? MaskIndex : -1); + ShuffleMaskRHS.push_back(IsLHSOrUndefIndex ? -1 : (MaskIndex - NumElts)); } if (SwapOps) { std::swap(V1, V2); - std::swap(GatherIndicesLHS, GatherIndicesRHS); + std::swap(ShuffleMaskLHS, ShuffleMaskRHS); } assert(MaskVals.size() == NumElts && "Unexpected select-like shuffle"); MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals); - unsigned GatherVXOpc = RISCVISD::VRGATHER_VX_VL; unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL; MVT IndexVT = VT.changeTypeToInteger(); // Since we can't introduce illegal index types at this stage, use i16 and @@ -5038,6 +5025,11 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // are handled above. if (V2.isUndef()) { V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); + SmallVector GatherIndicesLHS; + for (int ShuffleIdx : ShuffleMaskLHS) + GatherIndicesLHS.push_back(ShuffleIdx != -1 + ? 
DAG.getConstant(ShuffleIdx, DL, XLenVT) + : DAG.getUNDEF(XLenVT)); SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS); LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG, Subtarget); @@ -5046,50 +5038,13 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return convertFromScalableVector(VT, Gather, DAG, Subtarget); } - // Translate the gather index we computed above (and possibly swapped) - // back to a shuffle mask. This step should disappear once we complete - // the migration to recursive design. - SmallVector ShuffleMaskLHS; - ShuffleMaskLHS.reserve(GatherIndicesLHS.size()); - for (SDValue GatherIndex : GatherIndicesLHS) { - if (GatherIndex.isUndef()) { - ShuffleMaskLHS.push_back(-1); - continue; - } - auto *IdxC = cast(GatherIndex); - ShuffleMaskLHS.push_back(IdxC->getZExtValue()); - } - - // Recursively invoke lowering for the LHS as if there were no RHS. - // This allows us to leverage all of our single source permute tricks. - SDValue Gather = - DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS); - Gather = convertToScalableVector(ContainerVT, Gather, DAG, Subtarget); - - // Blend in second vector source with an additional vrgather. - V2 = convertToScalableVector(ContainerVT, V2, DAG, Subtarget); - - MVT MaskContainerVT = ContainerVT.changeVectorElementType(MVT::i1); - SelectMask = - convertToScalableVector(MaskContainerVT, SelectMask, DAG, Subtarget); - - // If only one index is used, we can use a "splat" vrgather. - // TODO: We can splat the most-common index and fix-up any stragglers, if - // that's beneficial. - if (RHSIndexCounts.size() == 1) { - int SplatIndex = RHSIndexCounts.begin()->getFirst(); - Gather = DAG.getNode(GatherVXOpc, DL, ContainerVT, V2, - DAG.getConstant(SplatIndex, DL, XLenVT), Gather, - SelectMask, VL); - } else { - SDValue RHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesRHS); - RHSIndices = - convertToScalableVector(IndexContainerVT, RHSIndices, DAG, Subtarget); - Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V2, RHSIndices, Gather, - SelectMask, VL); - } - - return convertFromScalableVector(VT, Gather, DAG, Subtarget); + // Recursively invoke lowering for each operand if we had two + // independent single source permutes, and then combine the result via a + // vselect. Note that the vselect will likely be folded back into the + // second permute (vrgather, or other) by the post-isel combine. 
+ V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS); + V2 = DAG.getVectorShuffle(VT, DL, V2, DAG.getUNDEF(VT), ShuffleMaskRHS); + return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V2, V1); } bool RISCVTargetLowering::isShuffleMaskLegal(ArrayRef M, EVT VT) const { diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll index dab530751ef96..6bfd0ac932672 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp-interleave.ll @@ -238,26 +238,44 @@ define <64 x half> @interleave_v32f16(<32 x half> %x, <32 x half> %y) { define <64 x float> @interleave_v32f32(<32 x float> %x, <32 x float> %y) { ; V128-LABEL: interleave_v32f32: ; V128: # %bb.0: +; V128-NEXT: addi sp, sp, -16 +; V128-NEXT: .cfi_def_cfa_offset 16 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 3 +; V128-NEXT: sub sp, sp, a0 +; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; V128-NEXT: vmv8r.v v0, v16 +; V128-NEXT: addi a0, sp, 16 +; V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; V128-NEXT: vmv8r.v v16, v8 ; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; V128-NEXT: vslidedown.vi v0, v8, 16 +; V128-NEXT: vslidedown.vi v8, v0, 16 ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v24, v0, v8 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v24, a0, v8 -; V128-NEXT: lui a1, %hi(.LCPI10_0) -; V128-NEXT: addi a1, a1, %lo(.LCPI10_0) -; V128-NEXT: li a2, 32 -; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; V128-NEXT: vle16.v v12, (a1) +; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; V128-NEXT: vslidedown.vi v0, v16, 16 +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; V128-NEXT: vwaddu.vv v8, v0, v16 +; V128-NEXT: vwmaccu.vx v8, a0, v16 ; V128-NEXT: lui a1, 699051 ; V128-NEXT: addi a1, a1, -1366 +; V128-NEXT: li a2, 32 ; V128-NEXT: vmv.s.x v0, a1 -; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; V128-NEXT: vmerge.vvm v24, v8, v24, v0 ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; V128-NEXT: vwaddu.vv v0, v8, v16 -; V128-NEXT: vwmaccu.vx v0, a0, v16 +; V128-NEXT: addi a1, sp, 16 +; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; V128-NEXT: vwaddu.vv v0, v16, v8 +; V128-NEXT: vwmaccu.vx v0, a0, v8 ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 3 +; V128-NEXT: add sp, sp, a0 +; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32f32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll index 9e21cc9e3d624..6da83644413bc 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-interleave.ll @@ -403,26 +403,44 @@ define <64 x i16> @interleave_v32i16(<32 x i16> %x, <32 x i16> %y) { define <64 x i32> @interleave_v32i32(<32 x i32> %x, <32 x i32> %y) { ; V128-LABEL: interleave_v32i32: ; V128: # %bb.0: +; V128-NEXT: addi sp, sp, -16 +; V128-NEXT: .cfi_def_cfa_offset 16 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 3 +; V128-NEXT: sub sp, sp, a0 +; V128-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 8 * vlenb +; V128-NEXT: vmv8r.v v0, v16 +; V128-NEXT: addi a0, sp, 16 +; 
V128-NEXT: vs8r.v v16, (a0) # Unknown-size Folded Spill +; V128-NEXT: vmv8r.v v16, v8 ; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; V128-NEXT: vslidedown.vi v0, v8, 16 +; V128-NEXT: vslidedown.vi v8, v0, 16 ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma ; V128-NEXT: vwaddu.vv v24, v0, v8 ; V128-NEXT: li a0, -1 ; V128-NEXT: vwmaccu.vx v24, a0, v8 -; V128-NEXT: lui a1, %hi(.LCPI17_0) -; V128-NEXT: addi a1, a1, %lo(.LCPI17_0) -; V128-NEXT: li a2, 32 -; V128-NEXT: vsetvli zero, a2, e32, m8, ta, mu -; V128-NEXT: vle16.v v12, (a1) +; V128-NEXT: vsetivli zero, 16, e32, m8, ta, ma +; V128-NEXT: vslidedown.vi v0, v16, 16 +; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; V128-NEXT: vwaddu.vv v8, v0, v16 +; V128-NEXT: vwmaccu.vx v8, a0, v16 ; V128-NEXT: lui a1, 699051 ; V128-NEXT: addi a1, a1, -1366 +; V128-NEXT: li a2, 32 ; V128-NEXT: vmv.s.x v0, a1 -; V128-NEXT: vrgatherei16.vv v24, v16, v12, v0.t +; V128-NEXT: vsetvli zero, a2, e32, m8, ta, ma +; V128-NEXT: vmerge.vvm v24, v8, v24, v0 ; V128-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; V128-NEXT: vwaddu.vv v0, v8, v16 -; V128-NEXT: vwmaccu.vx v0, a0, v16 +; V128-NEXT: addi a1, sp, 16 +; V128-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; V128-NEXT: vwaddu.vv v0, v16, v8 +; V128-NEXT: vwmaccu.vx v0, a0, v8 ; V128-NEXT: vmv8r.v v8, v0 ; V128-NEXT: vmv8r.v v16, v24 +; V128-NEXT: csrr a0, vlenb +; V128-NEXT: slli a0, a0, 3 +; V128-NEXT: add sp, sp, a0 +; V128-NEXT: addi sp, sp, 16 ; V128-NEXT: ret ; ; V512-LABEL: interleave_v32i32: diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll index a26a87a1f3c13..23629ace4cf48 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-int-shuffles.ll @@ -611,12 +611,10 @@ define <8 x i8> @concat_4xi8_start_undef(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: concat_4xi8_start_undef_at_start: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 224 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -4 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -625,12 +623,10 @@ define <8 x i8> @concat_4xi8_start_undef_at_start(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @merge_start_into_end_non_contiguous(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_start_into_end_non_contiguous: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 144 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -4 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslideup.vi v8, v9, 4, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -670,12 +666,11 @@ define <8 x i8> @merge_start_into_start(<8 x i8> %v, <8 x i8> %w) { define <8 x i8> @merge_slidedown(<8 x i8> %v, <8 x i8> %w) { ; CHECK-LABEL: merge_slidedown: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: li a0, 195 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; 
CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res @@ -686,12 +681,10 @@ define <8 x i8> @merge_non_contiguous_slideup_slidedown(<8 x i8> %v, <8 x i8> %w ; CHECK-LABEL: merge_non_contiguous_slideup_slidedown: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vi v10, v10, -1 ; CHECK-NEXT: li a0, 234 ; CHECK-NEXT: vmv.s.x v0, a0 ; CHECK-NEXT: vslidedown.vi v8, v8, 2 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %res = shufflevector <8 x i8> %v, <8 x i8> %w, <8 x i32> ret <8 x i8> %res diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll index f889041647b23..e27ff0a573d5f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-interleaved-access.ll @@ -159,69 +159,57 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi sp, sp, -16 ; RV32-NEXT: .cfi_def_cfa_offset 16 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: li a3, 58 +; RV32-NEXT: li a3, 56 ; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: sub sp, sp, a2 -; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x3a, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 58 * vlenb +; RV32-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x38, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 56 * vlenb ; RV32-NEXT: addi a3, a1, 256 ; RV32-NEXT: li a2, 32 ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-NEXT: vle32.v v8, (a3) +; RV32-NEXT: vle32.v v16, (a3) ; RV32-NEXT: csrr a3, vlenb -; RV32-NEXT: li a4, 25 -; RV32-NEXT: mul a3, a3, a4 +; RV32-NEXT: slli a3, a3, 5 ; RV32-NEXT: add a3, sp, a3 ; RV32-NEXT: addi a3, a3, 16 -; RV32-NEXT: vs8r.v v8, (a3) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a3) # Unknown-size Folded Spill ; RV32-NEXT: addi a3, a1, 128 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vslideup.vi v16, v8, 4 +; RV32-NEXT: vslideup.vi v8, v16, 4 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 12 -; RV32-NEXT: mul a4, a4, a5 -; RV32-NEXT: add a4, sp, a4 -; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vid.v v20 -; RV32-NEXT: vadd.vi v4, v20, -10 -; RV32-NEXT: vmv.v.v v2, v20 -; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 4 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: slli a4, a4, 4 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs2r.v v20, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, 12 ; RV32-NEXT: vmv.s.x v1, a4 ; RV32-NEXT: vsetivli zero, 16, e32, m8, ta, ma -; RV32-NEXT: vslidedown.vi v8, v8, 16 +; RV32-NEXT: vslidedown.vi v16, v16, 16 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a5, a4, 5 -; RV32-NEXT: add a4, a5, a4 +; RV32-NEXT: li a5, 24 +; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs8r.v v8, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs8r.v v16, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 4 +; RV32-NEXT: slli a4, a4, 2 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: 
addi a4, a4, 16 ; RV32-NEXT: vs1r.v v1, (a4) # Unknown-size Folded Spill -; RV32-NEXT: vrgatherei16.vv v16, v8, v4, v0.t +; RV32-NEXT: vslideup.vi v8, v16, 10, v0.t ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: li a5, 21 +; RV32-NEXT: li a5, 20 ; RV32-NEXT: mul a4, a4, a5 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 -; RV32-NEXT: vs4r.v v16, (a4) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: lui a4, %hi(.LCPI6_0) ; RV32-NEXT: addi a4, a4, %lo(.LCPI6_0) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: vle16.v v8, (a4) ; RV32-NEXT: csrr a4, vlenb -; RV32-NEXT: slli a4, a4, 2 +; RV32-NEXT: slli a4, a4, 3 ; RV32-NEXT: add a4, sp, a4 ; RV32-NEXT: addi a4, a4, 16 ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill @@ -233,14 +221,14 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vs4r.v v8, (a4) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v16, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a4, 49 +; RV32-NEXT: li a4, 40 ; RV32-NEXT: mul a1, a1, a4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs8r.v v16, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vle32.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 @@ -248,12 +236,13 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: addi a1, a5, -64 ; RV32-NEXT: vmv.s.x v0, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a3, 12 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload @@ -263,366 +252,330 @@ define {<8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>, <8 x i64>} @load_ ; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 20 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v12, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vmv.v.v v12, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 21 +; RV32-NEXT: li a3, 20 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v12, (a1) # Unknown-size Folded Spill -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v12, v8, 2 -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: vadd.vi v8, v2, -8 -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v8, v16, 2 ; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: 
vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v20, v12 +; RV32-NEXT: vslideup.vi v8, v16, 8, v0.t +; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: lui a1, %hi(.LCPI6_2) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_2) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_3) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_3) -; RV32-NEXT: vle16.v v16, (a1) +; RV32-NEXT: vle16.v v24, (a1) ; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 +; RV32-NEXT: vrgatherei16.vv v8, v0, v24 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a3, 12 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t ; RV32-NEXT: vsetivli zero, 12, e32, m4, tu, ma ; RV32-NEXT: vmv.v.v v20, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 3 +; RV32-NEXT: li a3, 12 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v20, (a1) # Unknown-size Folded Spill ; RV32-NEXT: lui a1, %hi(.LCPI6_4) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_4) -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 -; RV32-NEXT: mul a1, a1, a3 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8 -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vi v8, v8, -6 -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v12, v24, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t +; RV32-NEXT: vl1r.v v1, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vmv1r.v v0, v1 +; RV32-NEXT: vslideup.vi v12, v16, 6, v0.t +; 
RV32-NEXT: vmv.v.v v4, v12 ; RV32-NEXT: lui a1, %hi(.LCPI6_5) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_5) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_6) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_6) -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v28, (a3) -; RV32-NEXT: li a1, 960 -; RV32-NEXT: vmv.s.x v20, a1 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vs1r.v v20, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v8, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 -; RV32-NEXT: vmv1r.v v0, v20 +; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill +; RV32-NEXT: li a1, 960 +; RV32-NEXT: vmv.s.x v2, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t -; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v8 +; RV32-NEXT: vrgatherei16.vv v8, v16, v24 +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 2 +; RV32-NEXT: li a3, 48 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill -; RV32-NEXT: lui a1, %hi(.LCPI6_7) -; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) -; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8 -; RV32-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma +; RV32-NEXT: vmv.v.v v4, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl2r.v v8, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vadd.vi v8, v8, -4 -; RV32-NEXT: vsetvli zero, zero, e32, m4, ta, mu +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill +; RV32-NEXT: lui a1, %hi(.LCPI6_7) +; RV32-NEXT: addi a1, a1, %lo(.LCPI6_7) +; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu +; RV32-NEXT: vle16.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v28, v24, v8 +; RV32-NEXT: vmv1r.v v0, v1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: li a3, 24 +; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v12, v16, v8, v0.t -; RV32-NEXT: vmv.v.v v24, v12 +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: 
vslideup.vi v28, v8, 4, v0.t +; RV32-NEXT: vmv.v.v v4, v28 ; RV32-NEXT: lui a1, %hi(.LCPI6_8) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_8) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_9) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_9) -; RV32-NEXT: vle16.v v16, (a1) -; RV32-NEXT: vle16.v v28, (a3) +; RV32-NEXT: vle16.v v28, (a1) +; RV32-NEXT: vle16.v v24, (a3) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v16 -; RV32-NEXT: addi a1, sp, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v28 +; RV32-NEXT: vmv1r.v v0, v2 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v16, v28, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v24, v8 +; RV32-NEXT: vmv.v.v v4, v8 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 4 -; RV32-NEXT: add a1, a3, a1 +; RV32-NEXT: slli a1, a1, 2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs4r.v v24, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs4r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: lui a1, %hi(.LCPI6_10) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_10) -; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vle16.v v2, (a1) ; RV32-NEXT: lui a1, 15 -; RV32-NEXT: vmv.s.x v0, a1 +; RV32-NEXT: vmv.s.x v5, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a1, a1, 4 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v0, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vslideup.vi v28, v24, 6 +; RV32-NEXT: vmv1r.v v0, v5 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vslideup.vi v20, v16, 6 -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v20, v24, v8, v0.t +; RV32-NEXT: vl8r.v v8, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v28, v8, v2, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_11) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_11) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a3, %hi(.LCPI6_12) ; RV32-NEXT: addi a3, a3, %lo(.LCPI6_12) -; RV32-NEXT: vle16.v v24, (a1) -; RV32-NEXT: vle16.v v16, (a3) +; RV32-NEXT: vle16.v v0, (a1) +; RV32-NEXT: vle16.v v24, (a3) ; RV32-NEXT: li a1, 1008 -; RV32-NEXT: vmv.s.x v28, a1 +; RV32-NEXT: vmv.s.x v4, a1 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 25 -; RV32-NEXT: mul a1, a1, a3 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vs1r.v v28, (a1) # Unknown-size Folded Spill +; RV32-NEXT: vs1r.v v4, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 49 +; 
RV32-NEXT: li a3, 40 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 -; RV32-NEXT: vmv1r.v v0, v28 +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v0 +; RV32-NEXT: vmv1r.v v0, v4 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 41 +; RV32-NEXT: li a3, 48 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v16, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v8, v16, v24, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v20, v8 +; RV32-NEXT: vmv.v.v v28, v8 ; RV32-NEXT: lui a1, %hi(.LCPI6_13) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_13) ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, mu ; RV32-NEXT: vle16.v v8, (a1) +; RV32-NEXT: vmv1r.v v0, v5 ; RV32-NEXT: csrr a1, vlenb ; RV32-NEXT: slli a1, a1, 4 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl4r.v v24, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a3, 12 +; RV32-NEXT: li a3, 24 ; RV32-NEXT: mul a1, a1, a3 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl4r.v v16, (a1) # Unknown-size Folded Reload -; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a3, a1, 5 -; RV32-NEXT: add a1, a3, a1 -; RV32-NEXT: add a1, sp, a1 -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v16, v24, v8, v0.t +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vrgatherei16.vv v24, v16, v8, v0.t ; RV32-NEXT: lui a1, %hi(.LCPI6_14) ; RV32-NEXT: addi a1, a1, %lo(.LCPI6_14) ; RV32-NEXT: vsetvli zero, a2, e32, m8, ta, mu ; RV32-NEXT: lui a2, %hi(.LCPI6_15) ; RV32-NEXT: addi a2, a2, %lo(.LCPI6_15) -; RV32-NEXT: vle16.v v24, (a1) +; RV32-NEXT: vle16.v v16, (a1) ; RV32-NEXT: vle16.v v8, (a2) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 5 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vs4r.v v8, (a1) # Unknown-size Folded Spill ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 49 +; RV32-NEXT: li a2, 40 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl8r.v v0, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v0, v24 +; RV32-NEXT: vrgatherei16.vv v8, v0, v16 ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 25 -; RV32-NEXT: mul a1, a1, a2 +; RV32-NEXT: slli a1, a1, 5 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl1r.v v0, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 41 +; RV32-NEXT: li a2, 48 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vl8r.v v24, (a1) # Unknown-size Folded Reload +; RV32-NEXT: vl8r.v v16, (a1) # Unknown-size Folded Reload ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: slli a2, a1, 5 -; RV32-NEXT: add a1, a2, a1 +; RV32-NEXT: li a2, 24 +; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v4, (a1) # Unknown-size Folded Reload -; RV32-NEXT: vrgatherei16.vv v8, v24, v4, v0.t +; RV32-NEXT: vrgatherei16.vv v8, v16, 
v4, v0.t ; RV32-NEXT: vsetivli zero, 10, e32, m4, tu, ma -; RV32-NEXT: vmv.v.v v16, v8 +; RV32-NEXT: vmv.v.v v24, v8 ; RV32-NEXT: addi a1, a0, 320 ; RV32-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-NEXT: vse32.v v16, (a1) +; RV32-NEXT: vse32.v v24, (a1) ; RV32-NEXT: addi a1, a0, 256 -; RV32-NEXT: vse32.v v20, (a1) +; RV32-NEXT: vse32.v v28, (a1) ; RV32-NEXT: addi a1, a0, 192 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a3, a2, 4 -; RV32-NEXT: add a2, a3, a2 +; RV32-NEXT: slli a2, a2, 2 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 128 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 2 +; RV32-NEXT: slli a2, a2, 3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: addi a1, a0, 64 ; RV32-NEXT: csrr a2, vlenb -; RV32-NEXT: slli a2, a2, 3 +; RV32-NEXT: li a3, 12 +; RV32-NEXT: mul a2, a2, a3 ; RV32-NEXT: add a2, sp, a2 ; RV32-NEXT: addi a2, a2, 16 ; RV32-NEXT: vl4r.v v8, (a2) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a1) ; RV32-NEXT: csrr a1, vlenb -; RV32-NEXT: li a2, 21 +; RV32-NEXT: li a2, 20 ; RV32-NEXT: mul a1, a1, a2 ; RV32-NEXT: add a1, sp, a1 ; RV32-NEXT: addi a1, a1, 16 ; RV32-NEXT: vl4r.v v8, (a1) # Unknown-size Folded Reload ; RV32-NEXT: vse32.v v8, (a0) ; RV32-NEXT: csrr a0, vlenb -; RV32-NEXT: li a1, 58 +; RV32-NEXT: li a1, 56 ; RV32-NEXT: mul a0, a0, a1 ; RV32-NEXT: add sp, sp, a0 ; RV32-NEXT: addi sp, sp, 16 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll index f53b51e05c572..c8c08ab1a987b 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-exact-vlen.ll @@ -152,15 +152,12 @@ define <4 x i64> @m2_splat_into_identity_two_source(<4 x i64> %v1, <4 x i64> %v2 define <4 x i64> @m2_splat_into_slide_two_source(<4 x i64> %v1, <4 x i64> %v2) vscale_range(2,2) { ; CHECK-LABEL: m2_splat_into_slide_two_source: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma -; CHECK-NEXT: vid.v v12 -; CHECK-NEXT: vadd.vi v14, v12, -1 -; CHECK-NEXT: vsetvli zero, zero, e64, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, ma ; CHECK-NEXT: vrgather.vi v12, v8, 0 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 12 ; CHECK-NEXT: vsetivli zero, 4, e64, m2, ta, mu -; CHECK-NEXT: vrgatherei16.vv v12, v10, v14, v0.t +; CHECK-NEXT: vslideup.vi v12, v10, 1, v0.t ; CHECK-NEXT: vmv.v.v v8, v12 ; CHECK-NEXT: ret %res = shufflevector <4 x i64> %v1, <4 x i64> %v2, <4 x i32> diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll index a34fa9502d93b..038fead011d89 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shuffle-transpose.ll @@ -7,12 +7,10 @@ target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn1.v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: 
vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -21,12 +19,11 @@ define <8 x i8> @trn1.v8i8(<8 x i8> %v0, <8 x i8> %v1) { define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { ; CHECK-LABEL: trn2.v8i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 8, e8, mf2, ta, ma ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i8> %v0, <8 x i8> %v1, <8 x i32> ret <8 x i8> %tmp0 @@ -35,15 +32,12 @@ define <8 x i8> @trn2.v8i8(<8 x i8> %v0, <8 x i8> %v1) { define <16 x i8> @trn1.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-LABEL: trn1.v16i8: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, ma -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vadd.vi v10, v10, -1 ; CHECK-NEXT: lui a0, 11 ; CHECK-NEXT: addi a0, a0, -1366 -; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vsetivli zero, 1, e16, m1, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetivli zero, 16, e8, m1, ta, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -58,9 +52,8 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { ; CHECK-NEXT: addi a0, a0, -1366 ; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, mu -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e8, m1, ta, ma +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <16 x i8> %v0, <16 x i8> %v1, <16 x i32> ret <16 x i8> %tmp0 @@ -69,11 +62,10 @@ define <16 x i8> @trn2.v16i8(<16 x i8> %v0, <16 x i8> %v1) { define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn1.v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -82,11 +74,10 @@ define <4 x i16> @trn1.v4i16(<4 x i16> %v0, <4 x i16> %v1) { define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { ; CHECK-LABEL: trn2.v4i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i16> %v0, <4 x i16> %v1, <4 x i32> ret <4 x i16> %tmp0 @@ -95,12 +86,10 @@ define <4 x i16> @trn2.v4i16(<4 x i16> %v0, <4 x i16> %v1) { define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn1.v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 +; CHECK-NEXT: vsetivli zero, 8, e16, 
m1, ta, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -109,12 +98,11 @@ define <8 x i16> @trn1.v8i16(<8 x i16> %v0, <8 x i16> %v1) { define <8 x i16> @trn2.v8i16(<8 x i16> %v0, <8 x i16> %v1) { ; CHECK-LABEL: trn2.v8i16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x i16> %v0, <8 x i16> %v1, <8 x i32> ret <8 x i16> %tmp0 @@ -133,11 +121,10 @@ define <2 x i32> @trn1.v2i32(<2 x i32> %v0, <2 x i32> %v1) { define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) { ; CHECK-LABEL: trn2.v2i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vrgather.vi v10, v8, 1 -; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x i32> %v0, <2 x i32> %v1, <2 x i32> ret <2 x i32> %tmp0 @@ -146,11 +133,10 @@ define <2 x i32> @trn2.v2i32(<2 x i32> %v0, <2 x i32> %v1) { define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn1.v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -159,11 +145,10 @@ define <4 x i32> @trn1.v4i32(<4 x i32> %v0, <4 x i32> %v1) { define <4 x i32> @trn2.v4i32(<4 x i32> %v0, <4 x i32> %v1) { ; CHECK-LABEL: trn2.v4i32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x i32> %v0, <4 x i32> %v1, <4 x i32> ret <4 x i32> %tmp0 @@ -182,11 +167,10 @@ define <2 x i64> @trn1.v2i64(<2 x i64> %v0, <2 x i64> %v1) { define <2 x i64> @trn2.v2i64(<2 x i64> %v0, <2 x i64> %v1) { ; CHECK-LABEL: trn2.v2i64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vrgather.vi v10, v8, 1 -; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x i64> %v0, <2 x i64> %v1, <2 x i32> ret <2 x i64> %tmp0 @@ -205,11 +189,10 @@ define <2 x float> @trn1.v2f32(<2 x float> %v0, <2 x float> %v1) { define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) { ; CHECK-LABEL: trn2.v2f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e32, mf2, ta, ma ; 
CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vrgather.vi v10, v8, 1 -; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t -; CHECK-NEXT: vmv1r.v v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x float> %v0, <2 x float> %v1, <2 x i32> ret <2 x float> %tmp0 @@ -218,11 +201,10 @@ define <2 x float> @trn2.v2f32(<2 x float> %v0, <2 x float> %v1) { define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn1.v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -231,11 +213,10 @@ define <4 x float> @trn1.v4f32(<4 x float> %v0, <4 x float> %v1) { define <4 x float> @trn2.v4f32(<4 x float> %v0, <4 x float> %v1) { ; CHECK-LABEL: trn2.v4f32: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x float> %v0, <4 x float> %v1, <4 x i32> ret <4 x float> %tmp0 @@ -254,11 +235,10 @@ define <2 x double> @trn1.v2f64(<2 x double> %v0, <2 x double> %v1) { define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) { ; CHECK-LABEL: trn2.v2f64: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vmv.v.i v0, 2 ; CHECK-NEXT: vrgather.vi v10, v8, 1 -; CHECK-NEXT: vrgather.vi v10, v9, 1, v0.t -; CHECK-NEXT: vmv.v.v v8, v10 +; CHECK-NEXT: vmerge.vvm v8, v10, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <2 x double> %v0, <2 x double> %v1, <2 x i32> ret <2 x double> %tmp0 @@ -267,11 +247,10 @@ define <2 x double> @trn2.v2f64(<2 x double> %v0, <2 x double> %v1) { define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn1.v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vid.v v10 +; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -280,11 +259,10 @@ define <4 x half> @trn1.v4f16(<4 x half> %v0, <4 x half> %v1) { define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { ; CHECK-LABEL: trn2.v4f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma ; CHECK-NEXT: vmv.v.i v0, 10 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <4 x half> %v0, <4 x half> %v1, <4 x i32> ret <4 x half> %tmp0 @@ -293,12 +271,10 @@ define <4 x half> @trn2.v4f16(<4 x half> %v0, <4 x half> %v1) { define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: 
trn1.v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vid.v v10 ; CHECK-NEXT: li a0, 170 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vadd.vi v10, v10, -1 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslideup.vi v8, v9, 1, v0.t ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 @@ -307,12 +283,11 @@ define <8 x half> @trn1.v8f16(<8 x half> %v0, <8 x half> %v1) { define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) { ; CHECK-LABEL: trn2.v8f16: ; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, mu -; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; CHECK-NEXT: li a0, 170 ; CHECK-NEXT: vmv.s.x v0, a0 -; CHECK-NEXT: vid.v v10 -; CHECK-NEXT: vrgather.vv v8, v9, v10, v0.t +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vmerge.vvm v8, v8, v9, v0 ; CHECK-NEXT: ret %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> ret <8 x half> %tmp0 diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll index ff6a9dc38999c..25af5ba2f079f 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-shufflevector-vnsrl.ll @@ -179,12 +179,11 @@ define void @vnsrl_32_i32(ptr %in, ptr %out) { ; ZVE32F: # %bb.0: # %entry ; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; ZVE32F-NEXT: vle32.v v8, (a0) -; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; ZVE32F-NEXT: vmv.v.i v0, 2 -; ZVE32F-NEXT: vrgather.vi v10, v8, 1 -; ZVE32F-NEXT: vrgather.vi v10, v9, 1, v0.t -; ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; ZVE32F-NEXT: vrgather.vi v9, v8, 1 +; ZVE32F-NEXT: vslidedown.vi v9, v8, 2, v0.t +; ZVE32F-NEXT: vse32.v v9, (a1) ; ZVE32F-NEXT: ret entry: %0 = load <4 x i32>, ptr %in, align 4 @@ -234,12 +233,11 @@ define void @vnsrl_32_float(ptr %in, ptr %out) { ; ZVE32F: # %bb.0: # %entry ; ZVE32F-NEXT: vsetivli zero, 4, e32, m1, ta, ma ; ZVE32F-NEXT: vle32.v v8, (a0) -; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu -; ZVE32F-NEXT: vslidedown.vi v9, v8, 2 ; ZVE32F-NEXT: vmv.v.i v0, 2 -; ZVE32F-NEXT: vrgather.vi v10, v8, 1 -; ZVE32F-NEXT: vrgather.vi v10, v9, 1, v0.t -; ZVE32F-NEXT: vse32.v v10, (a1) +; ZVE32F-NEXT: vsetivli zero, 2, e32, m1, ta, mu +; ZVE32F-NEXT: vrgather.vi v9, v8, 1 +; ZVE32F-NEXT: vslidedown.vi v9, v8, 2, v0.t +; ZVE32F-NEXT: vse32.v v9, (a1) ; ZVE32F-NEXT: ret entry: %0 = load <4 x float>, ptr %in, align 4 @@ -278,12 +276,11 @@ define void @vnsrl_64_i64(ptr %in, ptr %out) { ; V: # %bb.0: # %entry ; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma ; V-NEXT: vle64.v v8, (a0) -; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; V-NEXT: vslidedown.vi v9, v8, 2 ; V-NEXT: vmv.v.i v0, 2 -; V-NEXT: vrgather.vi v10, v8, 1 -; V-NEXT: vrgather.vi v10, v9, 1, v0.t -; V-NEXT: vse64.v v10, (a1) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: vrgather.vi v9, v8, 1 +; V-NEXT: vslidedown.vi v9, v8, 2, v0.t +; V-NEXT: vse64.v v9, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_64_i64: @@ -330,12 +327,11 @@ define void @vnsrl_64_double(ptr %in, ptr %out) { ; V: # %bb.0: # %entry ; V-NEXT: vsetivli zero, 4, e64, m1, ta, ma ; V-NEXT: vle64.v v8, (a0) -; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu -; V-NEXT: vslidedown.vi v9, v8, 2 ; V-NEXT: vmv.v.i v0, 2 -; V-NEXT: vrgather.vi 
v10, v8, 1 -; V-NEXT: vrgather.vi v10, v9, 1, v0.t -; V-NEXT: vse64.v v10, (a1) +; V-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; V-NEXT: vrgather.vi v9, v8, 1 +; V-NEXT: vslidedown.vi v9, v8, 2, v0.t +; V-NEXT: vse64.v v9, (a1) ; V-NEXT: ret ; ; ZVE32F-LABEL: vnsrl_64_double: diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll index 84e9e2801ff6d..f3c70ed78c747 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-deinterleave-fixed.ll @@ -82,9 +82,9 @@ define {<2 x i64>, <2 x i64>} @vector_deinterleave_v2i64_v4i64(<4 x i64> %vec) { ; CHECK-NEXT: vslidedown.vi v10, v8, 2 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vrgather.vi v9, v8, 1 -; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vslideup.vi v8, v10, 1 ; CHECK-NEXT: ret %retval = call {<2 x i64>, <2 x i64>} @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %vec) @@ -169,9 +169,9 @@ define {<2 x double>, <2 x double>} @vector_deinterleave_v2f64_v4f64(<4 x double ; CHECK-NEXT: vslidedown.vi v10, v8, 2 ; CHECK-NEXT: vsetivli zero, 1, e8, mf8, ta, ma ; CHECK-NEXT: vmv.v.i v0, 2 -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, mu +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vrgather.vi v9, v8, 1 -; CHECK-NEXT: vrgather.vi v9, v10, 1, v0.t +; CHECK-NEXT: vmerge.vvm v9, v9, v10, v0 ; CHECK-NEXT: vslideup.vi v8, v10, 1 ; CHECK-NEXT: ret %retval = call {<2 x double>, <2 x double>} @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %vec) From 611843d24bd1cfa8b6bf62b48635dbd3a6281759 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Wed, 24 Jan 2024 17:31:27 +0100 Subject: [PATCH 810/843] [llvm][bazel] Fix BUILD. --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 00d3c9733b8fd..10f5ca593fbd5 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -3257,6 +3257,7 @@ cc_binary( ":IRPrinter", ":IRReader", ":MC", + ":Passes", ":Remarks", ":Support", ":Target", From 4079aab8d80233586a9cd3f7be27bece4c21ea16 Mon Sep 17 00:00:00 2001 From: Christian Sigg Date: Wed, 24 Jan 2024 17:36:37 +0100 Subject: [PATCH 811/843] [llvm][bazel] Fix BUILD --- utils/bazel/llvm-project-overlay/llvm/BUILD.bazel | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index 10f5ca593fbd5..d4cb0cdb55b52 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -3259,6 +3259,7 @@ cc_binary( ":MC", ":Passes", ":Remarks", + ":Scalar", ":Support", ":Target", ":TargetParser", From 56444d5687818938a6ce798e7221aa920c54098e Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Wed, 24 Jan 2024 17:39:10 +0100 Subject: [PATCH 812/843] Remove fork handling from release issue workflow (#79310) This is currently broken, because the check is performed on the wrong repository. 
The `repo` here is llvm/llvm-project, which is not a fork (so this will always trigger); we'll then push a new branch to llvmbot/llvm-project, and then again set the wrong owner, so we'll look for the branch in llvm/llvm-project rather than llvmbot/llvm-project. Rather than fixing this, I'm removing the code entirely, as it shouldn't be needed anymore (llvmbot/llvm-project is a fork of llvm/llvm-project). --- llvm/utils/git/github-automation.py | 29 ++--------------------------- 1 file changed, 2 insertions(+), 27 deletions(-) diff --git a/llvm/utils/git/github-automation.py b/llvm/utils/git/github-automation.py index ffe4e54484fd5..72db9995b0e8a 100755 --- a/llvm/utils/git/github-automation.py +++ b/llvm/utils/git/github-automation.py @@ -498,33 +498,8 @@ def create_pull_request(self, owner: str, repo_name: str, branch: str) -> bool: release_branch_for_issue = self.release_branch_for_issue if release_branch_for_issue is None: return False - head_branch = branch - if not repo.fork: - # If the target repo is not a fork of llvm-project, we need to copy - # the branch into the target repo. GitHub only supports cross-repo pull - # requests on forked repos. - head_branch = f"{owner}-{branch}" - local_repo = Repo(self.llvm_project_dir) - push_done = False - for _ in range(0, 5): - try: - local_repo.git.fetch( - f"https://github.com/{owner}/{repo_name}", f"{branch}:{branch}" - ) - local_repo.git.push( - self.push_url, f"{branch}:{head_branch}", force=True - ) - push_done = True - break - except Exception as e: - print(e) - time.sleep(30) - continue - if not push_done: - raise Exception("Failed to mirror branch into {}".format(self.push_url)) - owner = repo.owner.login - - head = f"{owner}:{head_branch}" + + head = f"{owner}:{branch}" if self.check_if_pull_request_exists(repo, head): print("PR already exists...") return True From 6c1dbd5359c4336d03b11faeaea8459b421f2c5c Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Wed, 24 Jan 2024 17:41:14 +0100 Subject: [PATCH 813/843] [clang] NFC: Remove `{File,Directory}Entry::getName()` (#74910) The files and directories that Clang accesses are uniqued by their inode. For each inode `FileManager` will create exactly one `FileEntry` or `DirectoryEntry` object, which makes answering the question _"Are these two files/directories the same?"_ a simple pointer equality check. However, since the same inode can be accessed through multiple different paths, asking the `FileEntry` or `DirectoryEntry` object _"What is your name?"_ doesn't have clear semantics. In c0ff9908 we started reporting the most recent name used to access the entry, which turned out to be necessary for Clang modules. However, the long-term solution has always been to explicitly track the as-requested name. This has been implemented in 4dc5573a as `FileEntryRef` and `DirectoryEntryRef`. The `DirectoryEntry::getName()` interface has been deprecated since the Clang 17 release and `FileEntry::getName()` since Clang 18. We have replaced uses of these deprecated APIs in `main` with `DirectoryEntryRef::getName()` and `FileEntryRef::getName()` respectively. This makes it possible to remove `{File,Directory}Entry::getName()` for good along with the `FileManager` code that implements them.
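For illustration, a minimal sketch of the surviving API (not part of this patch; the include, lookup path, and output stream here are illustrative assumptions):

```
// Sketch only, assuming an already-constructed clang::FileManager FM.
// #include "clang/Basic/FileManager.h"
llvm::Expected<clang::FileEntryRef> Ref = FM.getFileRef("dir/a.h");
if (!Ref) {
  llvm::consumeError(Ref.takeError()); // lookup failed; the error must be consumed
} else {
  // Prints "dir/a.h" -- the as-requested path for this lookup -- even if the
  // same inode was previously reached through a different path.
  llvm::errs() << Ref->getName() << "\n";
}
```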
--- clang/include/clang/Basic/DirectoryEntry.h | 7 ----- clang/include/clang/Basic/FileEntry.h | 10 ------- clang/lib/Basic/FileManager.cpp | 26 ------------------- clang/unittests/Basic/FileManagerTest.cpp | 6 ----- llvm/include/llvm/Support/VirtualFileSystem.h | 3 --- llvm/lib/Support/VirtualFileSystem.cpp | 1 - .../Support/VirtualFileSystemTest.cpp | 18 ------------- 7 files changed, 71 deletions(-) diff --git a/clang/include/clang/Basic/DirectoryEntry.h b/clang/include/clang/Basic/DirectoryEntry.h index 906c2e9af23b3..35fe529ba79df 100644 --- a/clang/include/clang/Basic/DirectoryEntry.h +++ b/clang/include/clang/Basic/DirectoryEntry.h @@ -41,13 +41,6 @@ class DirectoryEntry { DirectoryEntry &operator=(const DirectoryEntry &) = delete; friend class FileManager; friend class FileEntryTestHelper; - - // FIXME: We should not be storing a directory entry name here. - StringRef Name; // Name of the directory. - -public: - LLVM_DEPRECATED("Use DirectoryEntryRef::getName() instead.", "") - StringRef getName() const { return Name; } }; /// A reference to a \c DirectoryEntry that includes the name of the directory diff --git a/clang/include/clang/Basic/FileEntry.h b/clang/include/clang/Basic/FileEntry.h index 35efa147950f0..68d4bf6093003 100644 --- a/clang/include/clang/Basic/FileEntry.h +++ b/clang/include/clang/Basic/FileEntry.h @@ -318,18 +318,8 @@ class FileEntry { /// The file content, if it is owned by the \p FileEntry. std::unique_ptr Content; - // First access name for this FileEntry. - // - // This is Optional only to allow delayed construction (FileEntryRef has no - // default constructor). It should always have a value in practice. - // - // TODO: remove this once everyone that needs a name uses FileEntryRef. - OptionalFileEntryRef LastRef; - public: ~FileEntry(); - LLVM_DEPRECATED("Use FileEntryRef::getName() instead.", "") - StringRef getName() const { return LastRef->getName(); } StringRef tryGetRealPathName() const { return RealPathName; } off_t getSize() const { return Size; } diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index 974c8c22598f6..4e77b178a86b9 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -107,7 +107,6 @@ void FileManager::addAncestorsAsVirtualDirs(StringRef Path) { // Add the virtual directory to the cache. auto *UDE = new (DirsAlloc.Allocate()) DirectoryEntry(); - UDE->Name = NamedDirEnt.first(); NamedDirEnt.second = *UDE; VirtualDirectoryEntries.push_back(UDE); @@ -179,7 +178,6 @@ FileManager::getDirectoryRef(StringRef DirName, bool CacheFailure) { // We don't have this directory yet, add it. We use the string // key from the SeenDirEntries map as the string. UDE = new (DirsAlloc.Allocate()) DirectoryEntry(); - UDE->Name = InterndDirName; } NamedDirEnt.second = *UDE; @@ -324,32 +322,10 @@ FileManager::getFileRef(StringRef Filename, bool openFile, bool CacheFailure) { FileEntryRef ReturnedRef(*NamedFileEnt); if (ReusingEntry) { // Already have an entry with this inode, return it. - - // FIXME: This hack ensures that `getDir()` will use the path that was - // used to lookup this file, even if we found a file by different path - // first. This is required in order to find a module's structure when its - // headers/module map are mapped in the VFS. - // - // See above for how this will eventually be removed. `IsVFSMapped` - // *cannot* be narrowed to `ExposesExternalVFSPath` as crash reproducers - // also depend on this logic and they have `use-external-paths: false`. 
- if (&DirInfo.getDirEntry() != UFE->Dir && Status.IsVFSMapped) - UFE->Dir = &DirInfo.getDirEntry(); - - // Always update LastRef to the last name by which a file was accessed. - // FIXME: Neither this nor always using the first reference is correct; we - // want to switch towards a design where we return a FileName object that - // encapsulates both the name by which the file was accessed and the - // corresponding FileEntry. - // FIXME: LastRef should be removed from FileEntry once all clients adopt - // FileEntryRef. - UFE->LastRef = ReturnedRef; - return ReturnedRef; } // Otherwise, we don't have this file yet, add it. - UFE->LastRef = ReturnedRef; UFE->Size = Status.getSize(); UFE->ModTime = llvm::sys::toTimeT(Status.getLastModificationTime()); UFE->Dir = &DirInfo.getDirEntry(); @@ -461,7 +437,6 @@ FileEntryRef FileManager::getVirtualFileRef(StringRef Filename, off_t Size, } NamedFileEnt.second = FileEntryRef::MapValue(*UFE, *DirInfo); - UFE->LastRef = FileEntryRef(NamedFileEnt); UFE->Size = Size; UFE->ModTime = ModificationTime; UFE->Dir = &DirInfo->getDirEntry(); @@ -490,7 +465,6 @@ OptionalFileEntryRef FileManager::getBypassFile(FileEntryRef VF) { FileEntry *BFE = new (FilesAlloc.Allocate()) FileEntry(); BypassFileEntries.push_back(BFE); Insertion.first->second = FileEntryRef::MapValue(*BFE, VF.getDir()); - BFE->LastRef = FileEntryRef(*Insertion.first); BFE->Size = Status.getSize(); BFE->Dir = VF.getFileEntry().Dir; BFE->ModTime = llvm::sys::toTimeT(Status.getLastModificationTime()); diff --git a/clang/unittests/Basic/FileManagerTest.cpp b/clang/unittests/Basic/FileManagerTest.cpp index 43339676c4a6a..d32036d975ce9 100644 --- a/clang/unittests/Basic/FileManagerTest.cpp +++ b/clang/unittests/Basic/FileManagerTest.cpp @@ -284,9 +284,6 @@ TEST_F(FileManagerTest, getFileRefReturnsCorrectNameForDifferentStatPath) { ASSERT_FALSE(!F1Alias); ASSERT_FALSE(!F1Alias2); EXPECT_EQ("dir/f1.cpp", F1->getName()); - LLVM_SUPPRESS_DEPRECATED_DECLARATIONS_PUSH - EXPECT_EQ("dir/f1.cpp", F1->getFileEntry().getName()); - LLVM_SUPPRESS_DEPRECATED_DECLARATIONS_POP EXPECT_EQ("dir/f1.cpp", F1Alias->getName()); EXPECT_EQ("dir/f1.cpp", F1Alias2->getName()); EXPECT_EQ(&F1->getFileEntry(), &F1Alias->getFileEntry()); @@ -305,9 +302,6 @@ TEST_F(FileManagerTest, getFileRefReturnsCorrectNameForDifferentStatPath) { ASSERT_FALSE(!F2Alias); ASSERT_FALSE(!F2Alias2); EXPECT_EQ("dir/f2.cpp", F2->getName()); - LLVM_SUPPRESS_DEPRECATED_DECLARATIONS_PUSH - EXPECT_EQ("dir/f2.cpp", F2->getFileEntry().getName()); - LLVM_SUPPRESS_DEPRECATED_DECLARATIONS_POP EXPECT_EQ("dir/f2.cpp", F2Alias->getName()); EXPECT_EQ("dir/f2.cpp", F2Alias2->getName()); EXPECT_EQ(&F2->getFileEntry(), &F2Alias->getFileEntry()); diff --git a/llvm/include/llvm/Support/VirtualFileSystem.h b/llvm/include/llvm/Support/VirtualFileSystem.h index 44a56e54a0a09..1c3d8e985592b 100644 --- a/llvm/include/llvm/Support/VirtualFileSystem.h +++ b/llvm/include/llvm/Support/VirtualFileSystem.h @@ -55,9 +55,6 @@ class Status { llvm::sys::fs::perms Perms; public: - // FIXME: remove when files support multiple names - bool IsVFSMapped = false; - /// Whether this entity has an external path different from the virtual path, /// and the external path is exposed by leaking it through the abstraction. 
/// For example, a RedirectingFileSystem will set this for paths where diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp index c43b70e239e07..d6c787cb91b34 100644 --- a/llvm/lib/Support/VirtualFileSystem.cpp +++ b/llvm/lib/Support/VirtualFileSystem.cpp @@ -2334,7 +2334,6 @@ static Status getRedirectedFileStatus(const Twine &OriginalPath, S = Status::copyWithNewName(S, OriginalPath); else S.ExposesExternalVFSPath = true; - S.IsVFSMapped = true; return S; } diff --git a/llvm/unittests/Support/VirtualFileSystemTest.cpp b/llvm/unittests/Support/VirtualFileSystemTest.cpp index 15f5204e7e804..54a6e0fbc0760 100644 --- a/llvm/unittests/Support/VirtualFileSystemTest.cpp +++ b/llvm/unittests/Support/VirtualFileSystemTest.cpp @@ -1563,13 +1563,11 @@ TEST_F(VFSFromYAMLTest, MappedFiles) { ErrorOr S = O->status("//root/file1"); ASSERT_FALSE(S.getError()); EXPECT_EQ("//root/foo/bar/a", S->getName()); - EXPECT_TRUE(S->IsVFSMapped); EXPECT_TRUE(S->ExposesExternalVFSPath); ErrorOr SLower = O->status("//root/foo/bar/a"); EXPECT_EQ("//root/foo/bar/a", SLower->getName()); EXPECT_TRUE(S->equivalent(*SLower)); - EXPECT_FALSE(SLower->IsVFSMapped); EXPECT_FALSE(SLower->ExposesExternalVFSPath); // file after opening @@ -1578,7 +1576,6 @@ TEST_F(VFSFromYAMLTest, MappedFiles) { auto OpenedS = (*OpenedF)->status(); ASSERT_FALSE(OpenedS.getError()); EXPECT_EQ("//root/foo/bar/a", OpenedS->getName()); - EXPECT_TRUE(OpenedS->IsVFSMapped); EXPECT_TRUE(OpenedS->ExposesExternalVFSPath); // directory @@ -1591,21 +1588,18 @@ TEST_F(VFSFromYAMLTest, MappedFiles) { S = O->status("//root/mappeddir"); ASSERT_FALSE(S.getError()); EXPECT_TRUE(S->isDirectory()); - EXPECT_TRUE(S->IsVFSMapped); EXPECT_TRUE(S->ExposesExternalVFSPath); EXPECT_TRUE(S->equivalent(*O->status("//root/foo/bar"))); SLower = O->status("//root/foo/bar"); EXPECT_EQ("//root/foo/bar", SLower->getName()); EXPECT_TRUE(S->equivalent(*SLower)); - EXPECT_FALSE(SLower->IsVFSMapped); EXPECT_FALSE(SLower->ExposesExternalVFSPath); // file in remapped directory S = O->status("//root/mappeddir/a"); ASSERT_FALSE(S.getError()); EXPECT_FALSE(S->isDirectory()); - EXPECT_TRUE(S->IsVFSMapped); EXPECT_TRUE(S->ExposesExternalVFSPath); EXPECT_EQ("//root/foo/bar/a", S->getName()); @@ -1613,7 +1607,6 @@ TEST_F(VFSFromYAMLTest, MappedFiles) { S = O->status("//root/mappeddir2/a"); ASSERT_FALSE(S.getError()); EXPECT_FALSE(S->isDirectory()); - EXPECT_TRUE(S->IsVFSMapped); EXPECT_FALSE(S->ExposesExternalVFSPath); EXPECT_EQ("//root/mappeddir2/a", S->getName()); @@ -1623,7 +1616,6 @@ TEST_F(VFSFromYAMLTest, MappedFiles) { OpenedS = (*OpenedF)->status(); ASSERT_FALSE(OpenedS.getError()); EXPECT_EQ("//root/foo/bar/a", OpenedS->getName()); - EXPECT_TRUE(OpenedS->IsVFSMapped); EXPECT_TRUE(OpenedS->ExposesExternalVFSPath); // file contents in remapped directory, with use-external-name=false @@ -1632,7 +1624,6 @@ TEST_F(VFSFromYAMLTest, MappedFiles) { OpenedS = (*OpenedF)->status(); ASSERT_FALSE(OpenedS.getError()); EXPECT_EQ("//root/mappeddir2/a", OpenedS->getName()); - EXPECT_TRUE(OpenedS->IsVFSMapped); EXPECT_FALSE(OpenedS->ExposesExternalVFSPath); // broken mapping @@ -1665,13 +1656,11 @@ TEST_F(VFSFromYAMLTest, MappedRoot) { ErrorOr S = O->status("//mappedroot/a"); ASSERT_FALSE(S.getError()); EXPECT_EQ("//root/foo/bar/a", S->getName()); - EXPECT_TRUE(S->IsVFSMapped); EXPECT_TRUE(S->ExposesExternalVFSPath); ErrorOr SLower = O->status("//root/foo/bar/a"); EXPECT_EQ("//root/foo/bar/a", SLower->getName()); EXPECT_TRUE(S->equivalent(*SLower)); - 
EXPECT_FALSE(SLower->IsVFSMapped); EXPECT_FALSE(SLower->ExposesExternalVFSPath); // file after opening @@ -1680,7 +1669,6 @@ TEST_F(VFSFromYAMLTest, MappedRoot) { auto OpenedS = (*OpenedF)->status(); ASSERT_FALSE(OpenedS.getError()); EXPECT_EQ("//root/foo/bar/a", OpenedS->getName()); - EXPECT_TRUE(OpenedS->IsVFSMapped); EXPECT_TRUE(OpenedS->ExposesExternalVFSPath); EXPECT_EQ(0, NumDiagnostics); @@ -1829,13 +1817,11 @@ TEST_F(VFSFromYAMLTest, ReturnsRequestedPathVFSMiss) { auto OpenedS = (*OpenedF)->status(); ASSERT_FALSE(OpenedS.getError()); EXPECT_EQ("a", OpenedS->getName()); - EXPECT_FALSE(OpenedS->IsVFSMapped); EXPECT_FALSE(OpenedS->ExposesExternalVFSPath); auto DirectS = RemappedFS->status("a"); ASSERT_FALSE(DirectS.getError()); EXPECT_EQ("a", DirectS->getName()); - EXPECT_FALSE(DirectS->IsVFSMapped); EXPECT_FALSE(DirectS->ExposesExternalVFSPath); EXPECT_EQ(0, NumDiagnostics); @@ -1871,13 +1857,11 @@ TEST_F(VFSFromYAMLTest, ReturnsExternalPathVFSHit) { auto OpenedS = (*OpenedF)->status(); ASSERT_FALSE(OpenedS.getError()); EXPECT_EQ("realname", OpenedS->getName()); - EXPECT_TRUE(OpenedS->IsVFSMapped); EXPECT_TRUE(OpenedS->ExposesExternalVFSPath); auto DirectS = FS->status("vfsname"); ASSERT_FALSE(DirectS.getError()); EXPECT_EQ("realname", DirectS->getName()); - EXPECT_TRUE(DirectS->IsVFSMapped); EXPECT_TRUE(DirectS->ExposesExternalVFSPath); EXPECT_EQ(0, NumDiagnostics); @@ -1976,13 +1960,11 @@ TEST_F(VFSFromYAMLTest, ReturnsInternalPathVFSHit) { auto OpenedS = (*OpenedF)->status(); ASSERT_FALSE(OpenedS.getError()); EXPECT_EQ("vfsname", OpenedS->getName()); - EXPECT_TRUE(OpenedS->IsVFSMapped); EXPECT_FALSE(OpenedS->ExposesExternalVFSPath); auto DirectS = FS->status("vfsname"); ASSERT_FALSE(DirectS.getError()); EXPECT_EQ("vfsname", DirectS->getName()); - EXPECT_TRUE(DirectS->IsVFSMapped); EXPECT_FALSE(DirectS->ExposesExternalVFSPath); EXPECT_EQ(0, NumDiagnostics); From fd817249f4d50caa7a8986a37cdf712dfdf5ad70 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 24 Jan 2024 08:50:18 -0800 Subject: [PATCH 814/843] [RISCV] Sink code into using branch in shuffle lowering [nfc] Follow up to 396b6bbc, sink code into consuming branch, and fix one comment I realized used the misleading wording. (Permute is a specific sub-type of single source shuffle.) --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 46 ++++++++++----------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 5d61baae0ce4e..a38352e8e87f2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4998,32 +4998,32 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals); - unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL; - MVT IndexVT = VT.changeTypeToInteger(); - // Since we can't introduce illegal index types at this stage, use i16 and - // vrgatherei16 if the corresponding index type for plain vrgather is greater - // than XLenVT. - if (IndexVT.getScalarType().bitsGT(XLenVT)) { - GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL; - IndexVT = IndexVT.changeVectorElementType(MVT::i16); - } - - // If the mask allows, we can do all the index computation in 16 bits. This - // requires less work and less register pressure at high LMUL, and creates - // smaller constants which may be cheaper to materialize. 
- if (IndexVT.getScalarType().bitsGT(MVT::i16) && isUInt<16>(NumElts - 1) && - (IndexVT.getSizeInBits() / Subtarget.getRealMinVLen()) > 1) { - GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL; - IndexVT = IndexVT.changeVectorElementType(MVT::i16); - } - - MVT IndexContainerVT = - ContainerVT.changeVectorElementType(IndexVT.getScalarType()); - // Base case for the recursion just below - handle the worst case // single source permutation. Note that all the splat variants // are handled above. if (V2.isUndef()) { + unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL; + MVT IndexVT = VT.changeTypeToInteger(); + // Since we can't introduce illegal index types at this stage, use i16 and + // vrgatherei16 if the corresponding index type for plain vrgather is greater + // than XLenVT. + if (IndexVT.getScalarType().bitsGT(XLenVT)) { + GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL; + IndexVT = IndexVT.changeVectorElementType(MVT::i16); + } + + // If the mask allows, we can do all the index computation in 16 bits. This + // requires less work and less register pressure at high LMUL, and creates + // smaller constants which may be cheaper to materialize. + if (IndexVT.getScalarType().bitsGT(MVT::i16) && isUInt<16>(NumElts - 1) && + (IndexVT.getSizeInBits() / Subtarget.getRealMinVLen()) > 1) { + GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL; + IndexVT = IndexVT.changeVectorElementType(MVT::i16); + } + + MVT IndexContainerVT = + ContainerVT.changeVectorElementType(IndexVT.getScalarType()); + V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); SmallVector GatherIndicesLHS; for (int ShuffleIdx : ShuffleMaskLHS) @@ -5039,7 +5039,7 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, } // Recursively invoke lowering for each operand if we had two - // independent single source permutes, and then combine the result via a + // independent single source shuffles, and then combine the result via a // vselect. Note that the vselect will likely be folded back into the // second permute (vrgather, or other) by the post-isel combine. V1 = DAG.getVectorShuffle(VT, DL, V1, DAG.getUNDEF(VT), ShuffleMaskLHS); From 8abf8d124ae346016c56209de7f57b85671d4367 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 24 Jan 2024 08:53:36 -0800 Subject: [PATCH 815/843] [ELF] Don't resolve relocations referencing SHN_ABS to tombstone in non-SHF_ALLOC sections (#79238) A SHN_ABS symbol has never been considered for InputSection::relocateNonAlloc. Before #74686, the code did make it work in the absence of `-z dead-reloc-in-nonalloc=`. There is now a report about such SHN_ABS uses (https://github.com/llvm/llvm-project/pull/74686#issuecomment-1904101711) and I think it makes sense for non-SHF_ALLOC to support SHN_ABS, like SHF_ALLOC sections do. ``` // clang -g __attribute__((weak)) int symbol; int *foo() { return &symbol; } 0x00000023: DW_TAG_variable [2] (0x0000000c) ... DW_AT_location [DW_FORM_exprloc] (DW_OP_addrx 0x0) ``` .debug_addr references `symbol`, which can be redefined by a symbol assignment or --defsym to become a SHN_ABS symbol. The problem is that `!sym.getOutputSection()` cannot discern SHN_ABS from a symbol whose section has been discarded. Since commit 1981b1b6b92f7579a30c9ed32dbdf3bc749c1b40, a symbol relative to a discarded section is changed to `Undefined`, so the `SHN_ABS` check becomes trivial. We currently apply the tombstone for a relocation referencing `SharedSymbol`. This patch does not change the behavior.
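For example, `symbol` above can be turned into a SHN_ABS symbol as follows (the file name and address are hypothetical, shown only to illustrate the scenario):
```
// a.c -- the same source as above; build with: clang -g -c a.c -o a.o
__attribute__((weak)) int symbol;
int *foo() { return &symbol; }
// Linking with `ld.lld a.o --defsym=symbol=0x1000 -o a.out` redefines
// `symbol` as absolute (SHN_ABS); the .debug_addr entry for it should then
// resolve to 0x1000 rather than the tombstone value.
```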
--- lld/ELF/InputSection.cpp | 13 ++++++------- lld/test/ELF/dead-reloc-in-nonalloc.s | 2 +- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/lld/ELF/InputSection.cpp b/lld/ELF/InputSection.cpp index c728dd6c6306a..0e0b9783bd88a 100644 --- a/lld/ELF/InputSection.cpp +++ b/lld/ELF/InputSection.cpp @@ -961,12 +961,11 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef rels) { // vector. The computed value is st_value plus a non-negative offset. // Negative values are invalid, so -1 can be used as the tombstone value. // - // If the referenced symbol is discarded (made Undefined), or the - // section defining the referenced symbol is garbage collected, - // sym.getOutputSection() is nullptr. `ds->folded` catches the ICF folded - // case. However, resolving a relocation in .debug_line to -1 would stop - // debugger users from setting breakpoints on the folded-in function, so - // exclude .debug_line. + // If the referenced symbol is relative to a discarded section (due to + // --gc-sections, COMDAT, etc), it has been converted to a Undefined. + // `ds->folded` catches the ICF folded case. However, resolving a + // relocation in .debug_line to -1 would stop debugger users from setting + // breakpoints on the folded-in function, so exclude .debug_line. // // For pre-DWARF-v5 .debug_loc and .debug_ranges, -1 is a reserved value // (base address selection entry), use 1 (which is used by GNU ld for @@ -974,7 +973,7 @@ void InputSection::relocateNonAlloc(uint8_t *buf, ArrayRef rels) { // // TODO To reduce disruption, we use 0 instead of -1 as the tombstone // value. Enable -1 in a future release. - if (!sym.getOutputSection() || (ds && ds->folded && !isDebugLine)) { + if (!ds || (ds->folded && !isDebugLine)) { // If -z dead-reloc-in-nonalloc= is specified, respect it. uint64_t value = SignExtend64(*tombstone); // For a 32-bit local TU reference in .debug_names, X86_64::relocate diff --git a/lld/test/ELF/dead-reloc-in-nonalloc.s b/lld/test/ELF/dead-reloc-in-nonalloc.s index 145604eb883a9..b675fc50fc2ea 100644 --- a/lld/test/ELF/dead-reloc-in-nonalloc.s +++ b/lld/test/ELF/dead-reloc-in-nonalloc.s @@ -17,7 +17,7 @@ # AA: Contents of section .debug_info: # AA-NEXT: 0000 [[ADDR]] 00000000 aaaaaaaa 00000000 # AA: Contents of section .not_debug: -# AA-NEXT: 0000 bbbbbbbb bbbbbbbb 00000000 . +# AA-NEXT: 0000 bbbbbbbb 2a000000 00000000 . ## Specifying zero can get a behavior similar to GNU ld. # RUN: ld.lld --icf=all -z dead-reloc-in-nonalloc=.debug_info=0 %t.o %tabs.o -o %tzero From e99c8aef5d618f5fe0baf643e3a31ee911e909f1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Wed, 24 Jan 2024 08:55:15 -0800 Subject: [PATCH 816/843] [flang][openacc] Lower DO CONCURRENT with acc loop (#79223) Lower basic DO CONCURRENT with acc loop construct. The DO CONCURRENT is lowered to an acc.loop operation. This does not currently cover the DO CONCURRENT with locality specs. 
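For reference, a minimal sketch of a loop that this now lowers (illustrative names; locality specs remain unsupported, as noted above):
```
subroutine sketch(a, n)
  integer :: n, i
  real :: a(n)
  !$acc parallel loop
  do concurrent (i = 1:n)
    a(i) = a(i) + 1.0
  end do
end subroutine
```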
--- flang/lib/Lower/OpenACC.cpp | 174 +++++++++++++++++--------- flang/test/Lower/OpenACC/acc-loop.f90 | 20 +++ 2 files changed, 137 insertions(+), 57 deletions(-) diff --git a/flang/lib/Lower/OpenACC.cpp b/flang/lib/Lower/OpenACC.cpp index d619d47fc2359..a2823fb3e27d4 100644 --- a/flang/lib/Lower/OpenACC.cpp +++ b/flang/lib/Lower/OpenACC.cpp @@ -1607,6 +1607,48 @@ mlir::Type getTypeFromIvTypeSize(fir::FirOpBuilder &builder, return builder.getIntegerType(ivTypeSize * 8); } +static void privatizeIv(Fortran::lower::AbstractConverter &converter, + const Fortran::semantics::Symbol &sym, + mlir::Location loc, + llvm::SmallVector &ivTypes, + llvm::SmallVector &ivLocs, + llvm::SmallVector &privateOperands, + llvm::SmallVector &ivPrivate, + llvm::SmallVector &privatizations, + bool isDoConcurrent = false) { + fir::FirOpBuilder &builder = converter.getFirOpBuilder(); + + mlir::Type ivTy = getTypeFromIvTypeSize(builder, sym); + ivTypes.push_back(ivTy); + ivLocs.push_back(loc); + mlir::Value ivValue = converter.getSymbolAddress(sym); + if (!ivValue && isDoConcurrent) { + // DO CONCURRENT induction variables are not mapped yet since they are local + // to the DO CONCURRENT scope. + mlir::OpBuilder::InsertPoint insPt = builder.saveInsertionPoint(); + builder.setInsertionPointToStart(builder.getAllocaBlock()); + ivValue = builder.createTemporaryAlloc(loc, ivTy, toStringRef(sym.name())); + builder.restoreInsertionPoint(insPt); + } + + std::string recipeName = + fir::getTypeAsString(ivValue.getType(), converter.getKindMap(), + Fortran::lower::privatizationRecipePrefix); + auto recipe = Fortran::lower::createOrGetPrivateRecipe( + builder, recipeName, loc, ivValue.getType()); + + std::stringstream asFortran; + auto op = createDataEntryOp( + builder, loc, ivValue, asFortran, {}, true, /*implicit=*/true, + mlir::acc::DataClause::acc_private, ivValue.getType()); + + privateOperands.push_back(op.getAccPtr()); + ivPrivate.push_back(op.getAccPtr()); + privatizations.push_back(mlir::SymbolRefAttr::get(builder.getContext(), + recipe.getSymName().str())); + converter.bindSymbol(sym, op.getAccPtr()); +} + static mlir::acc::LoopOp createLoopOp(Fortran::lower::AbstractConverter &converter, mlir::Location currentLocation, @@ -1640,68 +1682,86 @@ createLoopOp(Fortran::lower::AbstractConverter &converter, llvm::SmallVector ivLocs; llvm::SmallVector inclusiveBounds; - if (outerDoConstruct.IsDoConcurrent()) - TODO(currentLocation, "OpenACC loop with DO CONCURRENT"); - llvm::SmallVector locs; locs.push_back(currentLocation); // Location of the directive - - int64_t collapseValue = Fortran::lower::getCollapseValue(accClauseList); Fortran::lower::pft::Evaluation *crtEval = &eval.getFirstNestedEvaluation(); - for (unsigned i = 0; i < collapseValue; ++i) { - const Fortran::parser::LoopControl *loopControl; - if (i == 0) { - loopControl = &*outerDoConstruct.GetLoopControl(); - locs.push_back(converter.genLocation( - Fortran::parser::FindSourceLocation(outerDoConstruct))); - } else { - auto *doCons = crtEval->getIf(); - assert(doCons && "expect do construct"); - loopControl = &*doCons->GetLoopControl(); - locs.push_back( - converter.genLocation(Fortran::parser::FindSourceLocation(*doCons))); + bool isDoConcurrent = outerDoConstruct.IsDoConcurrent(); + if (isDoConcurrent) { + locs.push_back(converter.genLocation( + Fortran::parser::FindSourceLocation(outerDoConstruct))); + const Fortran::parser::LoopControl *loopControl = + &*outerDoConstruct.GetLoopControl(); + const auto &concurrent = + std::get(loopControl->u); + if 
(!std::get>(concurrent.t) + .empty()) + TODO(currentLocation, "DO CONCURRENT with locality spec"); + + const auto &concurrentHeader = + std::get(concurrent.t); + const auto &controls = + std::get>( + concurrentHeader.t); + for (const auto &control : controls) { + lowerbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(std::get<1>(control.t)), stmtCtx))); + upperbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(std::get<2>(control.t)), stmtCtx))); + if (const auto &expr = + std::get>( + control.t)) + steps.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(*expr), stmtCtx))); + else // If `step` is not present, assume it is `1`. + steps.push_back(builder.createIntegerConstant( + currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); + + const auto &name = std::get(control.t); + privatizeIv(converter, *name.symbol, currentLocation, ivTypes, ivLocs, + privateOperands, ivPrivate, privatizations, isDoConcurrent); + + inclusiveBounds.push_back(true); } + } else { + int64_t collapseValue = Fortran::lower::getCollapseValue(accClauseList); + for (unsigned i = 0; i < collapseValue; ++i) { + const Fortran::parser::LoopControl *loopControl; + if (i == 0) { + loopControl = &*outerDoConstruct.GetLoopControl(); + locs.push_back(converter.genLocation( + Fortran::parser::FindSourceLocation(outerDoConstruct))); + } else { + auto *doCons = crtEval->getIf(); + assert(doCons && "expect do construct"); + loopControl = &*doCons->GetLoopControl(); + locs.push_back(converter.genLocation( + Fortran::parser::FindSourceLocation(*doCons))); + } - const Fortran::parser::LoopControl::Bounds *bounds = - std::get_if(&loopControl->u); - assert(bounds && "Expected bounds on the loop construct"); - lowerbounds.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->lower), stmtCtx))); - upperbounds.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->upper), stmtCtx))); - if (bounds->step) - steps.push_back(fir::getBase(converter.genExprValue( - *Fortran::semantics::GetExpr(bounds->step), stmtCtx))); - else // If `step` is not present, assume it as `1`. 
- steps.push_back(builder.createIntegerConstant( - currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); - - Fortran::semantics::Symbol &ivSym = - bounds->name.thing.symbol->GetUltimate(); - - mlir::Type ivTy = getTypeFromIvTypeSize(builder, ivSym); - mlir::Value ivValue = converter.getSymbolAddress(ivSym); - ivTypes.push_back(ivTy); - ivLocs.push_back(currentLocation); - std::string recipeName = - fir::getTypeAsString(ivValue.getType(), converter.getKindMap(), - Fortran::lower::privatizationRecipePrefix); - auto recipe = Fortran::lower::createOrGetPrivateRecipe( - builder, recipeName, currentLocation, ivValue.getType()); - std::stringstream asFortran; - auto op = createDataEntryOp( - builder, currentLocation, ivValue, asFortran, {}, true, - /*implicit=*/true, mlir::acc::DataClause::acc_private, - ivValue.getType()); - - privateOperands.push_back(op.getAccPtr()); - ivPrivate.push_back(op.getAccPtr()); - privatizations.push_back(mlir::SymbolRefAttr::get( - builder.getContext(), recipe.getSymName().str())); - inclusiveBounds.push_back(true); - converter.bindSymbol(ivSym, op.getAccPtr()); - if (i < collapseValue - 1) - crtEval = &*std::next(crtEval->getNestedEvaluations().begin()); + const Fortran::parser::LoopControl::Bounds *bounds = + std::get_if(&loopControl->u); + assert(bounds && "Expected bounds on the loop construct"); + lowerbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->lower), stmtCtx))); + upperbounds.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->upper), stmtCtx))); + if (bounds->step) + steps.push_back(fir::getBase(converter.genExprValue( + *Fortran::semantics::GetExpr(bounds->step), stmtCtx))); + else // If `step` is not present, assume it is `1`. + steps.push_back(builder.createIntegerConstant( + currentLocation, upperbounds[upperbounds.size() - 1].getType(), 1)); + + Fortran::semantics::Symbol &ivSym = + bounds->name.thing.symbol->GetUltimate(); + privatizeIv(converter, ivSym, currentLocation, ivTypes, ivLocs, + privateOperands, ivPrivate, privatizations); + + inclusiveBounds.push_back(true); + + if (i < collapseValue - 1) + crtEval = &*std::next(crtEval->getNestedEvaluations().begin()); + } } for (const Fortran::parser::AccClause &clause : accClauseList.v) { diff --git a/flang/test/Lower/OpenACC/acc-loop.f90 b/flang/test/Lower/OpenACC/acc-loop.f90 index 6bd4dd61bd893..28da9f99282ce 100644 --- a/flang/test/Lower/OpenACC/acc-loop.f90 +++ b/flang/test/Lower/OpenACC/acc-loop.f90 @@ -306,3 +306,23 @@ program acc_loop ! CHECK: acc.loop gang([#acc.device_type, #acc.device_type]) end program + +subroutine sub1(i, j, k) + integer :: i,j,k + integer :: a(i,j,k) + !$acc parallel loop + do concurrent (i=1:10,j=1:100,k=1:200) + a(i,j,k) = a(i,j,k) + 1 + end do +end subroutine + +! CHECK: func.func @_QPsub1 +! CHECK: %[[DC_K:.*]] = fir.alloca i32 {bindc_name = "k"} +! CHECK: %[[DC_J:.*]] = fir.alloca i32 {bindc_name = "j"} +! CHECK: %[[DC_I:.*]] = fir.alloca i32 {bindc_name = "i"} +! CHECK: acc.parallel +! CHECK: %[[P_I:.*]] = acc.private varPtr(%[[DC_I]] : !fir.ref) -> !fir.ref {implicit = true, name = ""} +! CHECK: %[[P_J:.*]] = acc.private varPtr(%[[DC_J]] : !fir.ref) -> !fir.ref {implicit = true, name = ""} +! CHECK: %[[P_K:.*]] = acc.private varPtr(%[[DC_K]] : !fir.ref) -> !fir.ref {implicit = true, name = ""} +! 
CHECK: acc.loop private(@privatization_ref_i32 -> %[[P_I]] : !fir.ref, @privatization_ref_i32 -> %[[P_J]] : !fir.ref, @privatization_ref_i32 -> %[[P_K]] : !fir.ref) (%{{.*}} : i32, %{{.*}} : i32, %{{.*}} : i32) = (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) to (%c10{{.*}}, %c100{{.*}}, %c200{{.*}} : i32, i32, i32) step (%c1{{.*}}, %c1{{.*}}, %c1{{.*}} : i32, i32, i32) +! CHECK: } attributes {inclusiveUpperbound = array} From 5e894771d9517a0cec3bc12480c1549080aaf5b2 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Wed, 24 Jan 2024 11:58:03 -0500 Subject: [PATCH 817/843] [ci] Remove unused generate-buildkite-pipeline-scheduled script (#79320) The "scheduled build" pipeline on BuildKite had been disabled for months and doesn't exist anymore, so this script is effectively dead code. When we set up a cron-activated build again, we should do it using Github actions (which could trigger a BK pipeline if needed). Keeping this script around just creates additional confusion about what's used and what's not used for doing CI. --- .ci/generate-buildkite-pipeline-scheduled | 81 ----------------------- 1 file changed, 81 deletions(-) delete mode 100755 .ci/generate-buildkite-pipeline-scheduled diff --git a/.ci/generate-buildkite-pipeline-scheduled b/.ci/generate-buildkite-pipeline-scheduled deleted file mode 100755 index d5c7c5965ef6a..0000000000000 --- a/.ci/generate-buildkite-pipeline-scheduled +++ /dev/null @@ -1,81 +0,0 @@ -#!/usr/bin/env bash -#===----------------------------------------------------------------------===## -# -# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -# -#===----------------------------------------------------------------------===## - -# -# This file generates a Buildkite pipeline that triggers the various CI jobs for -# the LLVM project on scheduled builds. -# -# See https://buildkite.com/docs/agent/v3/cli-pipeline#pipeline-format. -# - -set -eu -set -o pipefail - -# Filter rules for generic windows tests -: ${WINDOWS_AGENTS:='{"queue": "windows"}'} -# Filter rules for generic linux tests -: ${LINUX_AGENTS:='{"queue": "linux"}'} -# Set by buildkite -: ${BUILDKITE_MESSAGE:=} -: ${BUILDKITE_COMMIT:=} -: ${BUILDKITE_BRANCH:=} - -cat < Date: Wed, 24 Jan 2024 17:07:20 +0000 Subject: [PATCH 818/843] [TableGen] Include source location in JSON dump (#79028) This adds a '!loc' field to each record containing the file name and line number of the record declaration. --- llvm/docs/TableGen/BackEnds.rst | 6 +++++ llvm/lib/TableGen/JSONBackend.cpp | 6 +++++ llvm/test/TableGen/JSON-locs.td | 44 +++++++++++++++++++++++++++++++ llvm/test/TableGen/JSON.td | 3 +++ 4 files changed, 59 insertions(+) create mode 100644 llvm/test/TableGen/JSON-locs.td diff --git a/llvm/docs/TableGen/BackEnds.rst b/llvm/docs/TableGen/BackEnds.rst index 5cbb3232ef023..742fea51bcf32 100644 --- a/llvm/docs/TableGen/BackEnds.rst +++ b/llvm/docs/TableGen/BackEnds.rst @@ -506,6 +506,12 @@ following fixed keys: specified by the TableGen input (if it is ``false``), or invented by TableGen itself (if ``true``). +* ``!locs``: an array of strings giving the source locations associated with + this record. For records instantiated from a ``multiclass``, this gives the + location of each ``def`` or ``defm``, starting with the inner-most + ``multiclass``, and ending with the top-level ``defm``. Each string contains + the file name and line number, separated by a colon. 
+ For each variable defined in a record, the ``def`` object for that record also has a key for the variable name. The corresponding value is a translation into JSON of the variable's value, using the diff --git a/llvm/lib/TableGen/JSONBackend.cpp b/llvm/lib/TableGen/JSONBackend.cpp index 2a3f522a9c0ef..cd10c22094e45 100644 --- a/llvm/lib/TableGen/JSONBackend.cpp +++ b/llvm/lib/TableGen/JSONBackend.cpp @@ -15,6 +15,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/JSON.h" +#include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #define DEBUG_TYPE "json-emitter" @@ -159,6 +160,11 @@ void JSONEmitter::run(raw_ostream &OS) { obj["!name"] = Name; obj["!anonymous"] = Def.isAnonymous(); + json::Array locs; + for (const SMLoc Loc : Def.getLoc()) + locs.push_back(SrcMgr.getFormattedLocationNoOffset(Loc)); + obj["!locs"] = std::move(locs); + root[Name] = std::move(obj); // Add this def to the instance list for each of its superclasses. diff --git a/llvm/test/TableGen/JSON-locs.td b/llvm/test/TableGen/JSON-locs.td new file mode 100644 index 0000000000000..38baf46931489 --- /dev/null +++ b/llvm/test/TableGen/JSON-locs.td @@ -0,0 +1,44 @@ +// RUN: llvm-tblgen -dump-json %s | %python %S/JSON-check.py %s + +def Simple {} +// CHECK: data['Simple']['!locs'] == ['JSON-locs.td:3'] + +multiclass Multiclass1 { + def Instance1 {} + def Instance2 {} +} + +defm DefM1 : Multiclass1; + +// CHECK: data['DefM1Instance1']['!locs'] == ['JSON-locs.td:7', 'JSON-locs.td:11'] +// CHECK: data['DefM1Instance2']['!locs'] == ['JSON-locs.td:8', 'JSON-locs.td:11'] + +multiclass Multiclass2 { + def Instance3 {} + def Instance4 {} +} + +defm DefM2 : Multiclass1, Multiclass2; +// CHECK: data['DefM2Instance1']['!locs'] == ['JSON-locs.td:7', 'JSON-locs.td:21'] +// CHECK: data['DefM2Instance2']['!locs'] == ['JSON-locs.td:8', 'JSON-locs.td:21'] +// CHECK: data['DefM2Instance3']['!locs'] == ['JSON-locs.td:17', 'JSON-locs.td:21'] +// CHECK: data['DefM2Instance4']['!locs'] == ['JSON-locs.td:18', 'JSON-locs.td:21'] + +multiclass Multiclass3 { + defm InnerDefM : Multiclass1; + def Instance5 {} +} + +defm DefM3: Multiclass3; +// CHECK: data['DefM3InnerDefMInstance1']['!locs'] == ['JSON-locs.td:7', 'JSON-locs.td:28', 'JSON-locs.td:32'] +// CHECK: data['DefM3InnerDefMInstance2']['!locs'] == ['JSON-locs.td:8', 'JSON-locs.td:28', 'JSON-locs.td:32'] +// CHECK: data['DefM3Instance5']['!locs'] == ['JSON-locs.td:29', 'JSON-locs.td:32'] + +class BaseClass {} +class DerivedClass : BaseClass {} +// Classes do not appear in the JSON, so do not get locations. 
+// CHECK: 'BaseClass' not in data +// CHECK: 'DerivedClass' not in data + +def ClassInstance : DerivedClass {} +// CHECK: data['ClassInstance']['!locs'] == ['JSON-locs.td:43'] diff --git a/llvm/test/TableGen/JSON.td b/llvm/test/TableGen/JSON.td index 3fb2ec4014fbc..4d8c3a466bfd0 100644 --- a/llvm/test/TableGen/JSON.td +++ b/llvm/test/TableGen/JSON.td @@ -3,12 +3,15 @@ // CHECK: data['!tablegen_json_version'] == 1 // CHECK: all(data[s]['!name'] == s for s in data if not s.startswith("!")) +// CHECK: all('!locs' in data[s] for s in data if not s.startswith("!")) +// CHECK: all(all(loc.startswith("JSON.td:") for loc in data[s]['!locs']) for s in data if not s.startswith("!")) class Base {} class Intermediate : Base {} class Derived : Intermediate {} def D : Intermediate {} +// CHECK: data['D']['!locs'] == ['JSON.td:13'] // CHECK: 'D' in data['!instanceof']['Base'] // CHECK: 'D' in data['!instanceof']['Intermediate'] // CHECK: 'D' not in data['!instanceof']['Derived'] From b801b607e38ca9113394a918f475e5a521b7ec13 Mon Sep 17 00:00:00 2001 From: Jan Svoboda Date: Wed, 24 Jan 2024 18:12:00 +0100 Subject: [PATCH 819/843] [clangd] Make sure ninja can clean "ClangdXPC.framework" (#75669) After building the ClangdXPC target, `ninja clean` fails with the following error: ``` ninja: error: remove(lib/ClangdXPC.framework): Directory not empty ninja: error: remove(/lib/ClangdXPC.framework): Directory not empty ``` I did not find a better way to make this work. I guess we could list all generated files (and directories) in `OUTPUT` of the custom command, but that seems fairly tedious/fragile. --- .../clangd/xpc/cmake/modules/CreateClangdXPCFramework.cmake | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/clang-tools-extra/clangd/xpc/cmake/modules/CreateClangdXPCFramework.cmake b/clang-tools-extra/clangd/xpc/cmake/modules/CreateClangdXPCFramework.cmake index 46738a204ace1..d5ba44962dd52 100644 --- a/clang-tools-extra/clangd/xpc/cmake/modules/CreateClangdXPCFramework.cmake +++ b/clang-tools-extra/clangd/xpc/cmake/modules/CreateClangdXPCFramework.cmake @@ -71,6 +71,12 @@ macro(create_clangd_xpc_framework target name) ${CLANGD_FRAMEWORK_LOCATION} ) + set_property( + TARGET ClangdXPC + APPEND + PROPERTY ADDITIONAL_CLEAN_FILES ${CLANGD_FRAMEWORK_LOCATION} + ) + # clangd is already signed as a standalone executable, so it must be forced. llvm_codesign(ClangdXPC BUNDLE_PATH "${CLANGD_FRAMEWORK_OUT_LOCATION}/XPCServices/${CLANGD_XPC_SERVICE_NAME}.xpc/" FORCE) # ClangdXPC library is already signed as a standalone library, so it must be forced. From e9311f9c5acd3d9f282c351721bb911e5b519531 Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Wed, 24 Jan 2024 09:14:33 -0800 Subject: [PATCH 820/843] [RISCV] Separate single source and dual source lowering code [nfc] The two single source cases aren't affected by the swap or select matching as those are dual operand specific. Similarly, a two source shuffle can't be a rotate. We can extend this idea for some of the shuffle types above, but some of them are validly either single or dual source. We don't want to lose that and the code complexity of versioning early and having to repeat some shuffle kinds doesn't (currently) seem worth it.
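As a rough illustration (examples ours, not taken from the patch), these are the two shapes now handled on separate paths:
```
define <4 x i32> @single(<4 x i32> %a) {
  ; single source: the second operand is poison, e.g. a reverse; handled by
  ; the vrgather base case (or a rotate when the mask allows one)
  %r = shufflevector <4 x i32> %a, <4 x i32> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
  ret <4 x i32> %r
}
define <4 x i32> @dual(<4 x i32> %a, <4 x i32> %b) {
  ; dual source: elements drawn from both operands, e.g. an interleave;
  ; still subject to the swap/select matching and the two-operand recursion
  %r = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 4, i32 1, i32 5>
  ret <4 x i32> %r
}
```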
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 107 ++++++++++---------- 1 file changed, 54 insertions(+), 53 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index a38352e8e87f2..dc1c87fcdc0dd 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4926,15 +4926,55 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return getWideningInterleave(EvenV, OddV, DL, DAG, Subtarget); } - // Detect shuffles which can be re-expressed as vector selects; these are - // shuffles in which each element in the destination is taken from an element - // at the corresponding index in either source vectors. - bool IsSelect = all_of(enumerate(Mask), [&](const auto &MaskIdx) { - int MaskIndex = MaskIdx.value(); - return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts; - }); + // Handle any remaining single source shuffles assert(!V1.isUndef() && "Unexpected shuffle canonicalization"); + if (V2.isUndef()) { + // We might be able to express the shuffle as a bitrotate. But even if we + // don't have Zvkb and have to expand, the expanded sequence of approx. 2 + // shifts and a vor will have a higher throughput than a vrgather. + if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget)) + return V; + + // Base case for the two operand recursion below - handle the worst case + // single source shuffle. + unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL; + MVT IndexVT = VT.changeTypeToInteger(); + // Since we can't introduce illegal index types at this stage, use i16 and + // vrgatherei16 if the corresponding index type for plain vrgather is greater + // than XLenVT. + if (IndexVT.getScalarType().bitsGT(XLenVT)) { + GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL; + IndexVT = IndexVT.changeVectorElementType(MVT::i16); + } + + // If the mask allows, we can do all the index computation in 16 bits. This + // requires less work and less register pressure at high LMUL, and creates + // smaller constants which may be cheaper to materialize. + if (IndexVT.getScalarType().bitsGT(MVT::i16) && isUInt<16>(NumElts - 1) && + (IndexVT.getSizeInBits() / Subtarget.getRealMinVLen()) > 1) { + GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL; + IndexVT = IndexVT.changeVectorElementType(MVT::i16); + } + + MVT IndexContainerVT = + ContainerVT.changeVectorElementType(IndexVT.getScalarType()); + + V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); + SmallVector GatherIndicesLHS; + for (int MaskIndex : Mask) { + bool IsLHSIndex = MaskIndex < (int)NumElts && MaskIndex >= 0; + GatherIndicesLHS.push_back(IsLHSIndex + ? DAG.getConstant(MaskIndex, DL, XLenVT) + : DAG.getUNDEF(XLenVT)); + } + SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS); + LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG, + Subtarget); + SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices, + DAG.getUNDEF(ContainerVT), TrueMask, VL); + return convertFromScalableVector(VT, Gather, DAG, Subtarget); + } // By default we preserve the original operand order, and use a mask to // select LHS as true and RHS as false. However, since RVV vector selects may @@ -4942,6 +4982,13 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, // instead select between RHS and LHS. 
bool SwapOps = DAG.isSplatValue(V2) && !DAG.isSplatValue(V1); + // Detect shuffles which can be re-expressed as vector selects; these are + // shuffles in which each element in the destination is taken from an element + // at the corresponding index in either source vectors. + bool IsSelect = all_of(enumerate(Mask), [&](const auto &MaskIdx) { + int MaskIndex = MaskIdx.value(); + return MaskIndex < 0 || MaskIdx.index() == (unsigned)MaskIndex % NumElts; + }); if (IsSelect) { // Now construct the mask that will be used by the vselect operation. SmallVector MaskVals; @@ -4959,12 +5006,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, return DAG.getNode(ISD::VSELECT, DL, VT, SelectMask, V1, V2); } - // We might be able to express the shuffle as a bitrotate. But even if we - // don't have Zvkb and have to expand, the expanded sequence of approx. 2 - // shifts and a vor will have a higher throughput than a vrgather. - if (SDValue V = lowerVECTOR_SHUFFLEAsRotate(SVN, DAG, Subtarget)) - return V; - if (VT.getScalarSizeInBits() == 8 && VT.getVectorNumElements() > 256) { // On such a large vector we're unable to use i8 as the index type. // FIXME: We could promote the index to i16 and use vrgatherei16, but that @@ -4998,46 +5039,6 @@ static SDValue lowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG, MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts); SDValue SelectMask = DAG.getBuildVector(MaskVT, DL, MaskVals); - // Base case for the recursion just below - handle the worst case - // single source permutation. Note that all the splat variants - // are handled above. - if (V2.isUndef()) { - unsigned GatherVVOpc = RISCVISD::VRGATHER_VV_VL; - MVT IndexVT = VT.changeTypeToInteger(); - // Since we can't introduce illegal index types at this stage, use i16 and - // vrgatherei16 if the corresponding index type for plain vrgather is greater - // than XLenVT. - if (IndexVT.getScalarType().bitsGT(XLenVT)) { - GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL; - IndexVT = IndexVT.changeVectorElementType(MVT::i16); - } - - // If the mask allows, we can do all the index computation in 16 bits. This - // requires less work and less register pressure at high LMUL, and creates - // smaller constants which may be cheaper to materialize. - if (IndexVT.getScalarType().bitsGT(MVT::i16) && isUInt<16>(NumElts - 1) && - (IndexVT.getSizeInBits() / Subtarget.getRealMinVLen()) > 1) { - GatherVVOpc = RISCVISD::VRGATHEREI16_VV_VL; - IndexVT = IndexVT.changeVectorElementType(MVT::i16); - } - - MVT IndexContainerVT = - ContainerVT.changeVectorElementType(IndexVT.getScalarType()); - - V1 = convertToScalableVector(ContainerVT, V1, DAG, Subtarget); - SmallVector GatherIndicesLHS; - for (int ShuffleIdx : ShuffleMaskLHS) - GatherIndicesLHS.push_back(ShuffleIdx != -1 - ? DAG.getConstant(ShuffleIdx, DL, XLenVT) - : DAG.getUNDEF(XLenVT)); - SDValue LHSIndices = DAG.getBuildVector(IndexVT, DL, GatherIndicesLHS); - LHSIndices = convertToScalableVector(IndexContainerVT, LHSIndices, DAG, - Subtarget); - SDValue Gather = DAG.getNode(GatherVVOpc, DL, ContainerVT, V1, LHSIndices, - DAG.getUNDEF(ContainerVT), TrueMask, VL); - return convertFromScalableVector(VT, Gather, DAG, Subtarget); - } - // Recursively invoke lowering for each operand if we had two // independent single source shuffles, and then combine the result via a // vselect. 
Note that the vselect will likely be folded back into the From 1605bf58151bc357780fee5553beee2b404772a7 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 24 Jan 2024 09:18:57 -0800 Subject: [PATCH 821/843] [ConstraintElimination] Use std::move in the constructor (NFC) (#79259) Moving the contents of Coefficients saves 0.43% of heap allocations during the compilation of a large preprocessed file, namely X86ISelLowering.cpp, for the X86 target. --- llvm/lib/Transforms/Scalar/ConstraintElimination.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp index 7b672e89b67aa..ae8e2292519cb 100644 --- a/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp +++ b/llvm/lib/Transforms/Scalar/ConstraintElimination.cpp @@ -231,8 +231,8 @@ struct ConstraintTy { ConstraintTy(SmallVector Coefficients, bool IsSigned, bool IsEq, bool IsNe) - : Coefficients(Coefficients), IsSigned(IsSigned), IsEq(IsEq), IsNe(IsNe) { - } + : Coefficients(std::move(Coefficients)), IsSigned(IsSigned), IsEq(IsEq), + IsNe(IsNe) {} unsigned size() const { return Coefficients.size(); } From 3b8539c9dc0bc38ebea903e038257ed4328f290b Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Wed, 24 Jan 2024 09:24:28 -0800 Subject: [PATCH 822/843] [NVPTX] use incomplete aggregate initializers (#79062) The PTX ISA specifies that initializers may be incomplete ([5.4.4. Initializers](https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#initializers)) > As in C, array initializers may be incomplete, i.e., the number of initializer elements may be less than the extent of the corresponding array dimension, with remaining array locations initialized to the default value for the specified array type. Emitting initializers in this form is preferable because it reduces the size of the PTX, in some cases significantly, and can improve compile time of ptxas as a result. --- llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp | 15 +++++++++++-- .../CodeGen/NVPTX/global-incomplete-init.ll | 22 +++++++++++++++++++ llvm/test/CodeGen/NVPTX/globals_init.ll | 2 +- llvm/test/CodeGen/NVPTX/i128-global.ll | 2 +- 4 files changed, 37 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/global-incomplete-init.ll diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp index 6a03c7b0abc34..9534ad9b4f83d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp @@ -1292,10 +1292,21 @@ void NVPTXAsmPrinter::AggBuffer::printSymbol(unsigned nSym, raw_ostream &os) { void NVPTXAsmPrinter::AggBuffer::printBytes(raw_ostream &os) { unsigned int ptrSize = AP.MAI->getCodePointerSize(); - symbolPosInBuffer.push_back(size); + // Do not emit trailing zero initializers. They will be zero-initialized by + // ptxas. This saves on both space requirements for the generated PTX and on + // memory use by ptxas. (See: + // https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#global-state-space) + unsigned int InitializerCount = size; + // TODO: symbols make this harder, but it would still be good to trim trailing + // 0s for aggs with symbols as well. 
+ if (numSymbols() == 0) + while (InitializerCount >= 1 && !buffer[InitializerCount - 1]) + InitializerCount--; + + symbolPosInBuffer.push_back(InitializerCount); unsigned int nSym = 0; unsigned int nextSymbolPos = symbolPosInBuffer[nSym]; - for (unsigned int pos = 0; pos < size;) { + for (unsigned int pos = 0; pos < InitializerCount;) { if (pos) os << ", "; if (pos != nextSymbolPos) { diff --git a/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll b/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll new file mode 100644 index 0000000000000..d9c08b8dad57f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/global-incomplete-init.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s -march=nvptx64 -mcpu=sm_50 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_50 | %ptxas-verify %} + +; Make sure the globals constants have trailing zeros properly trimmed + +; basic case +; CHECK-DAG: .b8 A[8] = {3, 4, 0, 0, 5}; +@A = global [8 x i8] [i8 3, i8 4, i8 0, i8 0, i8 5, i8 0, i8 0, i8 0] + +; all-zeros +; CHECK-DAG: .b8 B[2]; +@B = global [2 x i8] [i8 0, i8 0] + +; all-non-zeros +; CHECK-DAG: .b8 C[4] = {1, 2, 3, 4}; +@C = global [4 x i8] [i8 1, i8 2, i8 3, i8 4] + +; initializer with a symbol, the last 0 could be default initialized +; CHECK-DAG: .u8 e = 1; +; CHECK-DAG: .u64 D[4] = {e, 0, e, 0}; +@e = addrspace(1) global i8 1 +@D = addrspace(1) global [4 x ptr addrspace(1)] [ptr addrspace(1) @e, ptr addrspace(1) null, ptr addrspace(1) @e, ptr addrspace(1) null] diff --git a/llvm/test/CodeGen/NVPTX/globals_init.ll b/llvm/test/CodeGen/NVPTX/globals_init.ll index bdfba6b602aec..d607c6e822a94 100644 --- a/llvm/test/CodeGen/NVPTX/globals_init.ll +++ b/llvm/test/CodeGen/NVPTX/globals_init.ll @@ -26,6 +26,6 @@ @Gblf64 = global [2 x double] [double 5.75e-25, double 12.25e+56] ; Make sure we fill in alignment gaps correctly. -; CHECK-DAG: .b8 GblU[12] = {7, 6, 0, 0, 5, 4, 3, 2, 1, 0, 0, 0}; +; CHECK-DAG: .b8 GblU[12] = {7, 6, 0, 0, 5, 4, 3, 2, 1}; @GblU = global {i16, i32, i8} {i16 1543, i32 33752069, i8 1} diff --git a/llvm/test/CodeGen/NVPTX/i128-global.ll b/llvm/test/CodeGen/NVPTX/i128-global.ll index cdebc65272f86..c87087484da1f 100644 --- a/llvm/test/CodeGen/NVPTX/i128-global.ll +++ b/llvm/test/CodeGen/NVPTX/i128-global.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -O0 -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; CHECK: .visible .global .align 16 .b8 G1[16] = {1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +; CHECK: .visible .global .align 16 .b8 G1[16] = {1}; @G1 = global i128 1 ; CHECK: .visible .global .align 16 .b8 G2[16]; From ca654acc16c43191228eadfec8f7241dca10b0c3 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Wed, 24 Jan 2024 09:14:06 -0800 Subject: [PATCH 823/843] [SLP]Fix PR79321: SLPVectorizer's PHICompare doesn't provide a strict weak ordering. Compare NumUses to meet the requirements of the strict weak ordering.
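A self-contained sketch of the invariant being restored (hypothetical types, not the LLVM API): the old `hasOneUse()` early-out made every multi-use value compare "equivalent" to every other value, so the induced equivalence was not transitive; comparing use counts first restores a strict weak ordering.
```
#include <cassert>

struct Val { unsigned NumUses; unsigned FirstUserPos; };

// Lexicographic comparison over (NumUses, FirstUserPos) is a strict weak
// ordering, as std::sort and friends require.
bool phiLess(const Val &A, const Val &B) {
  if (A.NumUses != B.NumUses)
    return A.NumUses < B.NumUses;          // fewer uses sorts first
  return A.FirstUserPos < B.FirstUserPos;  // deterministic tie-break
}

int main() {
  Val X{1, 5}, Y{2, 1};
  assert(phiLess(X, Y) && !phiLess(Y, X)); // asymmetric, as required
  return 0;
}
```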
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp | 6 ++++-- .../SLPVectorizer/X86/extract-subvector-long-input.ll | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 601d2454c1e16..5c17cd5427357 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -4138,9 +4138,11 @@ BoUpSLP::getReorderingData(const TreeEntry &TE, bool TopToBottom) { auto PHICompare = [&](unsigned I1, unsigned I2) { Value *V1 = TE.Scalars[I1]; Value *V2 = TE.Scalars[I2]; - if (V1 == V2) + if (V1 == V2 || (V1->getNumUses() == 0 && V2->getNumUses() == 0)) return false; - if (!V1->hasOneUse() || !V2->hasOneUse()) + if (V1->getNumUses() < V2->getNumUses()) + return true; + if (V1->getNumUses() > V2->getNumUses()) return false; auto *FirstUserOfPhi1 = cast(*V1->user_begin()); auto *FirstUserOfPhi2 = cast(*V2->user_begin()); diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll index 3c22804453dff..1b54a604cd6f3 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/extract-subvector-long-input.ll @@ -7,7 +7,7 @@ define void @test() { ; CHECK-NEXT: br label [[BB1:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[TMP0:%.*]] = phi <8 x i32> [ poison, [[BB10:%.*]] ], [ zeroinitializer, [[BB:%.*]] ] -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <2 x i32> +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[TMP0]], <8 x i32> poison, <2 x i32> ; CHECK-NEXT: switch i32 0, label [[BB16:%.*]] [ ; CHECK-NEXT: i32 0, label [[BB14:%.*]] ; CHECK-NEXT: i32 1, label [[BB11:%.*]] From 2a77d92e2e942fb1c7e23b046581531ec464cce5 Mon Sep 17 00:00:00 2001 From: Vojislav Tomasevic Date: Wed, 24 Jan 2024 18:39:36 +0100 Subject: [PATCH 824/843] [clang] Incorrect IR involving the use of bcopy (#79298) This patch addresses the issue regarding the call of bcopy function in a conditional expression. It is analogous to the already accepted patch which deals with the same problem, just regarding the bzero function [0]. Here is the testcase which illustrates the issue: ``` void bcopy(const void *, void *, unsigned long); void foo(void); void test_bcopy() { char dst[20]; char src[20]; int _sz = 20, len = 20; return (_sz ? ((_sz >= len) ? bcopy(src, dst, len) : foo()) : bcopy(src, dst, len)); } ``` When processing it with clang, following issue occurs: Instruction does not dominate all uses! %arraydecay2 = getelementptr inbounds [20 x i8], ptr %dst, i64 0, i64 0, !dbg !38 %cond = phi ptr [ %arraydecay2, %cond.end ], [ %arraydecay5, %cond.false3 ], !dbg !33 fatal error: error in backend: Broken module found, compilation aborted! This happens because an incorrect phi node is created. It is created because bcopy function call is lowered to the call of llvm.memmove intrinsic and function memmove returns void *. Since llvm.memmove is called in two places in the same return statement, clang creates a phi node in the final basic block for the return value and that phi node is incorrect. However, bcopy function should return void in the first place, so this phi node is unnecessary. This is what this patch addresses. An appropriate test is also added and no existing tests fail when applying this patch. 
Also, this crash only happens when LLVM is configured with -DLLVM_ENABLE_ASSERTIONS=On option. [0] https://reviews.llvm.org/D39746 --- clang/lib/CodeGen/CGBuiltin.cpp | 2 +- clang/test/CodeGen/builtins.c | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index a4f26a6f0eb19..27183c69b7ea3 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -4011,7 +4011,7 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(1)->getType(), E->getArg(1)->getExprLoc(), FD, 0); Builder.CreateMemMove(Dest, Src, SizeVal, false); - return RValue::get(Dest.getPointer()); + return RValue::get(nullptr); } case Builtin::BImemcpy: diff --git a/clang/test/CodeGen/builtins.c b/clang/test/CodeGen/builtins.c index ce1182b724dcc..ed03233b6f1a9 100644 --- a/clang/test/CodeGen/builtins.c +++ b/clang/test/CodeGen/builtins.c @@ -202,6 +202,21 @@ void test_conditional_bzero(void) { // CHECK-NOT: phi } +// CHECK-LABEL: define{{.*}} void @test_conditional_bcopy +void test_conditional_bcopy(void) { + char dst[20]; + char src[20]; + int _sz = 20, len = 20; + return (_sz + ? ((_sz >= len) + ? __builtin_bcopy(src, dst, len) + : foo()) + : __builtin_bcopy(src, dst, len)); + // CHECK: call void @llvm.memmove + // CHECK: call void @llvm.memmove + // CHECK-NOT: phi +} + // CHECK-LABEL: define{{.*}} void @test_float_builtins void test_float_builtins(__fp16 *H, float F, double D, long double LD) { volatile int res; From 4a9a1d83bee91418c649eccc9537dcbf6a745a65 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 24 Jan 2024 12:35:15 -0500 Subject: [PATCH 825/843] [gn build] port 4a582845597e (tablegen'd clang builtins) --- .../utils/gn/secondary/clang/include/clang/Basic/BUILD.gn | 8 ++++++++ llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn | 2 ++ llvm/utils/gn/secondary/clang/utils/TableGen/BUILD.gn | 1 + 3 files changed, 11 insertions(+) diff --git a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn index 935a4f1b82937..4babd37e94853 100644 --- a/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/include/clang/Basic/BUILD.gn @@ -92,6 +92,14 @@ clang_tablegen("AttrHasAttributeImpl") { td_file = "Attr.td" } +clang_tablegen("Builtins") { + args = [ "-gen-clang-builtins" ] +} + +clang_tablegen("BuiltinsBPF") { + args = [ "-gen-clang-builtins" ] +} + # ARM CDE, MVE, and NEON. 
clang_tablegen("arm_neon") { diff --git a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn index b5bd8c158761f..1486d16a64616 100644 --- a/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/lib/Basic/BUILD.gn @@ -25,6 +25,8 @@ static_library("Basic") { "//clang/include/clang/Basic:AttrLeftSideMustPrintList", "//clang/include/clang/Basic:AttrList", "//clang/include/clang/Basic:AttrSubMatchRulesList", + "//clang/include/clang/Basic:Builtins", + "//clang/include/clang/Basic:BuiltinsBPF", "//clang/include/clang/Basic:DiagnosticGroups", "//clang/include/clang/Basic:RegularKeywordAttrInfo", "//clang/include/clang/Basic:arm_cde_builtins", diff --git a/llvm/utils/gn/secondary/clang/utils/TableGen/BUILD.gn b/llvm/utils/gn/secondary/clang/utils/TableGen/BUILD.gn index 64f1b94a3544b..b5f3bc0a7baf5 100644 --- a/llvm/utils/gn/secondary/clang/utils/TableGen/BUILD.gn +++ b/llvm/utils/gn/secondary/clang/utils/TableGen/BUILD.gn @@ -10,6 +10,7 @@ executable("clang-tblgen") { "ClangASTNodesEmitter.cpp", "ClangASTPropertiesEmitter.cpp", "ClangAttrEmitter.cpp", + "ClangBuiltinsEmitter.cpp", "ClangCommentCommandInfoEmitter.cpp", "ClangCommentHTMLNamedCharacterReferenceEmitter.cpp", "ClangCommentHTMLTagsEmitter.cpp", From 604a6c409e8473b212952b8633d92bbdb22a45c9 Mon Sep 17 00:00:00 2001 From: Jeremy Morse Date: Wed, 24 Jan 2024 17:45:43 +0000 Subject: [PATCH 826/843] [BPI] Transfer value-handles when assign/move constructing BPI (#77774) Background: BPI stores a collection of edge branch-probabilities, and also a set of Callback value-handles for the blocks in the edge-collection. When a block is deleted, BPI's eraseBlock method is called to clear the edge-collection of references to that block, to avoid dangling pointers. However, when move-constructing or assigning a BPI object, the edge-collection gets moved, but the value-handles are discarded. This can lead to to stale entries in the edge-collection when blocks are deleted without the callback -- not normally a problem, but if a new block is allocated with the same address as an old block, spurious branch probabilities will be recorded about it. The fix is to transfer the handles from the source BPI object. This was exposed by an unrelated debug-info change, it probably just shifted around allocation orders to expose this. 
Detected as nondeterminism and reduced by Zequan Wu: https://github.com/llvm/llvm-project/commit/f1b0a544514f3d343f32a41de9d6fb0b6cbb6021#commitcomment-136737090 (No test because IMHO testing for a behaviour that varies with memory allocators is likely futile; I can add the reproducer with a CHECK for the relevant branch weights if it's desired though) --- llvm/include/llvm/Analysis/BranchProbabilityInfo.h | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h index 6b9d178182011..91e1872e9bd6f 100644 --- a/llvm/include/llvm/Analysis/BranchProbabilityInfo.h +++ b/llvm/include/llvm/Analysis/BranchProbabilityInfo.h @@ -122,16 +122,23 @@ class BranchProbabilityInfo { } BranchProbabilityInfo(BranchProbabilityInfo &&Arg) - : Probs(std::move(Arg.Probs)), LastF(Arg.LastF), - EstimatedBlockWeight(std::move(Arg.EstimatedBlockWeight)) {} + : Handles(std::move(Arg.Handles)), Probs(std::move(Arg.Probs)), + LastF(Arg.LastF), + EstimatedBlockWeight(std::move(Arg.EstimatedBlockWeight)) { + for (auto &Handle : Handles) + Handle.setBPI(this); + } BranchProbabilityInfo(const BranchProbabilityInfo &) = delete; BranchProbabilityInfo &operator=(const BranchProbabilityInfo &) = delete; BranchProbabilityInfo &operator=(BranchProbabilityInfo &&RHS) { releaseMemory(); + Handles = std::move(RHS.Handles); Probs = std::move(RHS.Probs); EstimatedBlockWeight = std::move(RHS.EstimatedBlockWeight); + for (auto &Handle : Handles) + Handle.setBPI(this); return *this; } @@ -279,6 +286,8 @@ class BranchProbabilityInfo { } public: + void setBPI(BranchProbabilityInfo *BPI) { this->BPI = BPI; } + BasicBlockCallbackVH(const Value *V, BranchProbabilityInfo *BPI = nullptr) : CallbackVH(const_cast(V)), BPI(BPI) {} }; From ed7cee90f7849594ba9a3ebc2271cb9b5d172004 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 24 Jan 2024 10:06:27 -0800 Subject: [PATCH 827/843] [Driver] Test ignored target-specific options for AMDGPU/NVPTX (#79222) Fix missing test coverage after #70740 #70760 When compiling for CUDA/HIP, the driver creates a cc1 job to compile for amdgcn/nvptx triple using most options. Certain target-specific options should be ignored, not lead to an error (`err_drv_unsupported_opt_for_target`). --- clang/test/Driver/unsupported-option-gpu.c | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 clang/test/Driver/unsupported-option-gpu.c diff --git a/clang/test/Driver/unsupported-option-gpu.c b/clang/test/Driver/unsupported-option-gpu.c new file mode 100644 index 0000000000000..225f3be7c7624 --- /dev/null +++ b/clang/test/Driver/unsupported-option-gpu.c @@ -0,0 +1,5 @@ +/// Some target-specific options are ignored for GPU, so %clang exits with code 0. 
+// DEFINE: %{check} = %clang -### -c -mcmodel=medium + +// RUN: %{check} -x cuda %s --cuda-path=%S/Inputs/CUDA/usr/local/cuda --offload-arch=sm_60 --no-cuda-version-check -fbasic-block-sections=all +// RUN: %{check} -x hip %s --rocm-path=%S/Inputs/rocm -nogpulib -nogpuinc From e099e7b278c774338c7019e38fbbca9ef2c8dd74 Mon Sep 17 00:00:00 2001 From: Nikolas Klauser Date: Wed, 24 Jan 2024 19:25:01 +0100 Subject: [PATCH 828/843] [Clang] Fix the signature of __builtin___stpncpy_chk --- clang/include/clang/Basic/Builtins.td | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index fdd1cf8221dcc..22e616e6cde59 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -1009,7 +1009,7 @@ def StrncpyChk : Builtin { def StpncpyChk : Builtin { let Spellings = ["__builtin___stpncpy_chk"]; let Attributes = [FunctionWithBuiltinPrefix, NoThrow]; - let Prototype = "int(char*, char*, char const*, size_t, size_t)"; + let Prototype = "char*(char*, char const*, size_t, size_t)"; } def SNPrintfChk : Builtin { From c1cb0b80f00888920050068a8c6b4f78ca40dd43 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Wed, 24 Jan 2024 13:34:13 -0500 Subject: [PATCH 829/843] [libc][NFC] Fix `-DSHOW_INTERMEDIATE_OBJECTS=DEPS` to work properly for entry points and unit tests. (#79254) --- libc/cmake/modules/LLVMLibCObjectRules.cmake | 2 +- libc/cmake/modules/LLVMLibCTestRules.cmake | 23 ++++++++++---------- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/libc/cmake/modules/LLVMLibCObjectRules.cmake b/libc/cmake/modules/LLVMLibCObjectRules.cmake index b6c2f2d53e88d..667d7548f7831 100644 --- a/libc/cmake/modules/LLVMLibCObjectRules.cmake +++ b/libc/cmake/modules/LLVMLibCObjectRules.cmake @@ -652,7 +652,7 @@ function(create_entrypoint_object fq_target_name) if(SHOW_INTERMEDIATE_OBJECTS) message(STATUS "Adding entrypoint object ${fq_target_name}") if(${SHOW_INTERMEDIATE_OBJECTS} STREQUAL "DEPS") - foreach(dep IN LISTS ADD_OBJECT_DEPENDS) + foreach(dep IN LISTS ADD_ENTRYPOINT_OBJ_DEPENDS) message(STATUS " ${fq_target_name} depends on ${dep}") endforeach() endif() diff --git a/libc/cmake/modules/LLVMLibCTestRules.cmake b/libc/cmake/modules/LLVMLibCTestRules.cmake index 46ffc8316fc82..0d0a47b33aaeb 100644 --- a/libc/cmake/modules/LLVMLibCTestRules.cmake +++ b/libc/cmake/modules/LLVMLibCTestRules.cmake @@ -129,6 +129,16 @@ function(create_libc_unittest fq_target_name) list(APPEND fq_deps_list libc.src.__support.StringUtil.error_to_string libc.test.UnitTest.ErrnoSetterMatcher) list(REMOVE_DUPLICATES fq_deps_list) + + if(SHOW_INTERMEDIATE_OBJECTS) + message(STATUS "Adding unit test ${fq_target_name}") + if(${SHOW_INTERMEDIATE_OBJECTS} STREQUAL "DEPS") + foreach(dep IN LISTS LIBC_UNITTEST_DEPENDS) + message(STATUS " ${fq_target_name} depends on ${dep}") + endforeach() + endif() + endif() + get_object_files_for_test( link_object_files skipped_entrypoints_list ${fq_deps_list}) if(skipped_entrypoints_list) @@ -157,15 +167,6 @@ function(create_libc_unittest fq_target_name) return() endif() - if(SHOW_INTERMEDIATE_OBJECTS) - message(STATUS "Adding unit test ${fq_target_name}") - if(${SHOW_INTERMEDIATE_OBJECTS} STREQUAL "DEPS") - foreach(dep IN LISTS ADD_OBJECT_DEPENDS) - message(STATUS " ${fq_target_name} depends on ${dep}") - endforeach() - endif() - endif() - if(LIBC_UNITTEST_NO_RUN_POSTBUILD) set(fq_build_target_name ${fq_target_name}) else() @@ -521,7 +522,7 @@ 
function(add_integration_test test_name) link_object_files skipped_entrypoints_list ${fq_deps_list}) if(skipped_entrypoints_list) if(LIBC_CMAKE_VERBOSE_LOGGING) - set(msg "Skipping unittest ${fq_target_name} as it has missing deps: " + set(msg "Skipping integration test ${fq_target_name} as it has missing deps: " "${skipped_entrypoints_list}.") message(STATUS ${msg}) endif() @@ -704,7 +705,7 @@ function(add_libc_hermetic_test test_name) get_object_files_for_test( link_object_files skipped_entrypoints_list ${fq_deps_list}) if(skipped_entrypoints_list) - set(msg "Skipping unittest ${fq_target_name} as it has missing deps: " + set(msg "Skipping hermetic test ${fq_target_name} as it has missing deps: " "${skipped_entrypoints_list}.") message(STATUS ${msg}) return() From 32f7922646d5903f63d16c9fbfe3d508b0f8cda7 Mon Sep 17 00:00:00 2001 From: William Moses Date: Wed, 24 Jan 2024 13:39:27 -0500 Subject: [PATCH 830/843] [CMake/Bazel] Support usage of opt driver as a library (#79205) In Bazel, Clang currently separates the clang executable into a clang-driver library, and the actual clang executable. This allows downstream users to make their own variations of clang, without having to redo/maintain separate build pipelines. This adds the same for opt for both CMake and Bazel. --- llvm/tools/opt/CMakeLists.txt | 16 +- llvm/tools/opt/NewPMDriver.cpp | 9 +- llvm/tools/opt/NewPMDriver.h | 22 +- llvm/tools/opt/opt.cpp | 913 +---------------- llvm/tools/opt/optdriver.cpp | 923 ++++++++++++++++++ .../llvm-project-overlay/llvm/BUILD.bazel | 11 +- 6 files changed, 972 insertions(+), 922 deletions(-) diff --git a/llvm/tools/opt/CMakeLists.txt b/llvm/tools/opt/CMakeLists.txt index 2cdb99e936287..f3dd95e178e31 100644 --- a/llvm/tools/opt/CMakeLists.txt +++ b/llvm/tools/opt/CMakeLists.txt @@ -29,12 +29,24 @@ set(LLVM_LINK_COMPONENTS Passes ) -add_llvm_tool(opt +# We don't want to link this into libLLVM +add_llvm_library(LLVMOptDriver + STATIC NewPMDriver.cpp - opt.cpp + optdriver.cpp + PARTIAL_SOURCES_INTENDED + DEPENDS + intrinsics_gen +) +add_llvm_tool(opt + PARTIAL_SOURCES_INTENDED + opt.cpp DEPENDS intrinsics_gen SUPPORT_PLUGINS + ) +target_link_libraries(opt PRIVATE LLVMOptDriver) + export_executable_symbols_for_plugins(opt) diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp index 6ae3f87099afd..fdfb4df53273f 100644 --- a/llvm/tools/opt/NewPMDriver.cpp +++ b/llvm/tools/opt/NewPMDriver.cpp @@ -328,8 +328,9 @@ bool llvm::runPassPipeline( StringRef Arg0, Module &M, TargetMachine *TM, TargetLibraryInfoImpl *TLII, ToolOutputFile *Out, ToolOutputFile *ThinLTOLinkOut, ToolOutputFile *OptRemarkFile, StringRef PassPipeline, - ArrayRef PassPlugins, OutputKind OK, VerifierKind VK, - bool ShouldPreserveAssemblyUseListOrder, + ArrayRef PassPlugins, + ArrayRef> PassBuilderCallbacks, + OutputKind OK, VerifierKind VK, bool ShouldPreserveAssemblyUseListOrder, bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex, bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve, bool UnifiedLTO) { @@ -428,6 +429,10 @@ bool llvm::runPassPipeline( for (auto &PassPlugin : PassPlugins) PassPlugin.registerPassBuilderCallbacks(PB); + // Load any explicitly specified plugins.
+ for (auto &PassCallback : PassBuilderCallbacks) + PassCallback(PB); + #define HANDLE_EXTENSION(Ext) \ get##Ext##PluginInfo().RegisterPassBuilderCallbacks(PB); #include "llvm/Support/Extension.def" diff --git a/llvm/tools/opt/NewPMDriver.h b/llvm/tools/opt/NewPMDriver.h index 3230c27c2d7a8..19cabd15436eb 100644 --- a/llvm/tools/opt/NewPMDriver.h +++ b/llvm/tools/opt/NewPMDriver.h @@ -23,6 +23,7 @@ #include "llvm/Support/CommandLine.h" namespace llvm { +class PassBuilder; class StringRef; class Module; class PassPlugin; @@ -64,16 +65,17 @@ void printPasses(raw_ostream &OS); /// /// ThinLTOLinkOut is only used when OK is OK_OutputThinLTOBitcode, and can be /// nullptr. -bool runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, - TargetLibraryInfoImpl *TLII, ToolOutputFile *Out, - ToolOutputFile *ThinLinkOut, ToolOutputFile *OptRemarkFile, - StringRef PassPipeline, ArrayRef PassPlugins, - opt_tool::OutputKind OK, opt_tool::VerifierKind VK, - bool ShouldPreserveAssemblyUseListOrder, - bool ShouldPreserveBitcodeUseListOrder, - bool EmitSummaryIndex, bool EmitModuleHash, - bool EnableDebugify, bool VerifyDIPreserve, - bool UnifiedLTO = false); +bool runPassPipeline( + StringRef Arg0, Module &M, TargetMachine *TM, TargetLibraryInfoImpl *TLII, + ToolOutputFile *Out, ToolOutputFile *ThinLinkOut, + ToolOutputFile *OptRemarkFile, StringRef PassPipeline, + ArrayRef PassPlugins, + ArrayRef> PassBuilderCallbacks, + opt_tool::OutputKind OK, opt_tool::VerifierKind VK, + bool ShouldPreserveAssemblyUseListOrder, + bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex, + bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve, + bool UnifiedLTO = false); } // namespace llvm #endif diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index 8c0e98976e602..fc8bd665cbbe9 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -11,912 +11,15 @@ // //===----------------------------------------------------------------------===// -#include "NewPMDriver.h" -#include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/RegionPass.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/AsmParser/Parser.h" -#include "llvm/CodeGen/CommandFlags.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/Config/llvm-config.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LLVMRemarkStreamer.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/LegacyPassNameParser.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/ModuleSummaryIndex.h" -#include "llvm/IR/Verifier.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/InitializePasses.h" -#include "llvm/LinkAllIR.h" -#include "llvm/LinkAllPasses.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Passes/PassPlugin.h" -#include "llvm/Remarks/HotnessThresholdParser.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/PluginLoader.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/SystemUtils.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/ToolOutputFile.h" -#include "llvm/Support/YAMLTraits.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/TargetParser/Host.h" -#include "llvm/TargetParser/SubtargetFeature.h" -#include "llvm/TargetParser/Triple.h" 
-#include "llvm/Transforms/IPO/WholeProgramDevirt.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Debugify.h" -#include -#include -#include -using namespace llvm; -using namespace opt_tool; +#include "llvm/ADT/ArrayRef.h" +#include -static codegen::RegisterCodeGenFlags CFG; - -// The OptimizationList is automatically populated with registered Passes by the -// PassNameParser. -static cl::list PassList(cl::desc( - "Optimizations available (use '-passes=' for the new pass manager)")); - -static cl::opt EnableLegacyPassManager( - "bugpoint-enable-legacy-pm", - cl::desc( - "Enable the legacy pass manager. This is strictly for bugpoint " - "due to it not working with the new PM, please do not use otherwise."), - cl::init(false)); - -// This flag specifies a textual description of the optimization pass pipeline -// to run over the module. This flag switches opt to use the new pass manager -// infrastructure, completely disabling all of the flags specific to the old -// pass management. -static cl::opt PassPipeline( - "passes", - cl::desc( - "A textual description of the pass pipeline. To have analysis passes " - "available before a certain pass, add 'require'.")); -static cl::alias PassPipeline2("p", cl::aliasopt(PassPipeline), - cl::desc("Alias for -passes")); - -static cl::opt PrintPasses("print-passes", - cl::desc("Print available passes that can be " - "specified in -passes=foo and exit")); - -static cl::opt -InputFilename(cl::Positional, cl::desc(""), - cl::init("-"), cl::value_desc("filename")); - -static cl::opt -OutputFilename("o", cl::desc("Override output filename"), - cl::value_desc("filename")); - -static cl::opt -Force("f", cl::desc("Enable binary output on terminals")); - -static cl::opt -NoOutput("disable-output", - cl::desc("Do not write result bitcode file"), cl::Hidden); - -static cl::opt -OutputAssembly("S", cl::desc("Write output as LLVM assembly")); - -static cl::opt - OutputThinLTOBC("thinlto-bc", - cl::desc("Write output as ThinLTO-ready bitcode")); - -static cl::opt - SplitLTOUnit("thinlto-split-lto-unit", - cl::desc("Enable splitting of a ThinLTO LTOUnit")); - -static cl::opt - UnifiedLTO("unified-lto", - cl::desc("Use unified LTO piplines. Ignored unless -thinlto-bc " - "is also specified."), - cl::Hidden, cl::init(false)); - -static cl::opt ThinLinkBitcodeFile( - "thin-link-bitcode-file", cl::value_desc("filename"), - cl::desc( - "A file in which to write minimized bitcode for the thin link only")); - -static cl::opt -NoVerify("disable-verify", cl::desc("Do not run the verifier"), cl::Hidden); - -static cl::opt NoUpgradeDebugInfo("disable-upgrade-debug-info", - cl::desc("Generate invalid output"), - cl::ReallyHidden); - -static cl::opt VerifyEach("verify-each", - cl::desc("Verify after each transform")); - -static cl::opt - DisableDITypeMap("disable-debug-info-type-map", - cl::desc("Don't use a uniquing type map for debug info")); - -static cl::opt -StripDebug("strip-debug", - cl::desc("Strip debugger symbol info from translation unit")); - -static cl::opt - StripNamedMetadata("strip-named-metadata", - cl::desc("Strip module-level named metadata")); - -static cl::opt - OptLevelO0("O0", cl::desc("Optimization level 0. Similar to clang -O0. " - "Same as -passes='default'")); - -static cl::opt - OptLevelO1("O1", cl::desc("Optimization level 1. Similar to clang -O1. " - "Same as -passes='default'")); - -static cl::opt - OptLevelO2("O2", cl::desc("Optimization level 2. Similar to clang -O2. 
" - "Same as -passes='default'")); - -static cl::opt - OptLevelOs("Os", cl::desc("Like -O2 but size-conscious. Similar to clang " - "-Os. Same as -passes='default'")); - -static cl::opt OptLevelOz( - "Oz", - cl::desc("Like -O2 but optimize for code size above all else. Similar to " - "clang -Oz. Same as -passes='default'")); - -static cl::opt - OptLevelO3("O3", cl::desc("Optimization level 3. Similar to clang -O3. " - "Same as -passes='default'")); - -static cl::opt CodeGenOptLevelCL( - "codegen-opt-level", - cl::desc("Override optimization level for codegen hooks, legacy PM only")); - -static cl::opt -TargetTriple("mtriple", cl::desc("Override target triple for module")); - -static cl::opt EmitSummaryIndex("module-summary", - cl::desc("Emit module summary index"), - cl::init(false)); - -static cl::opt EmitModuleHash("module-hash", cl::desc("Emit module hash"), - cl::init(false)); - -static cl::opt -DisableSimplifyLibCalls("disable-simplify-libcalls", - cl::desc("Disable simplify-libcalls")); - -static cl::list DisableBuiltins( - "disable-builtin", - cl::desc("Disable specific target library builtin function")); - -static cl::opt EnableDebugify( - "enable-debugify", - cl::desc( - "Start the pipeline with debugify and end it with check-debugify")); - -static cl::opt VerifyDebugInfoPreserve( - "verify-debuginfo-preserve", - cl::desc("Start the pipeline with collecting and end it with checking of " - "debug info preservation.")); - -static cl::opt ClDataLayout("data-layout", - cl::desc("data layout string to use"), - cl::value_desc("layout-string"), - cl::init("")); - -static cl::opt PreserveBitcodeUseListOrder( - "preserve-bc-uselistorder", - cl::desc("Preserve use-list order when writing LLVM bitcode."), - cl::init(true), cl::Hidden); - -static cl::opt PreserveAssemblyUseListOrder( - "preserve-ll-uselistorder", - cl::desc("Preserve use-list order when writing LLVM assembly."), - cl::init(false), cl::Hidden); - -static cl::opt RunTwice("run-twice", - cl::desc("Run all passes twice, re-using the " - "same pass manager (legacy PM only)."), - cl::init(false), cl::Hidden); - -static cl::opt DiscardValueNames( - "discard-value-names", - cl::desc("Discard names from Value (other than GlobalValue)."), - cl::init(false), cl::Hidden); - -static cl::opt TimeTrace( - "time-trace", - cl::desc("Record time trace")); - -static cl::opt TimeTraceGranularity( - "time-trace-granularity", - cl::desc("Minimum time granularity (in microseconds) traced by time profiler"), - cl::init(500), cl::Hidden); - -static cl::opt - TimeTraceFile("time-trace-file", - cl::desc("Specify time trace file destination"), - cl::value_desc("filename")); - -static cl::opt RemarksWithHotness( - "pass-remarks-with-hotness", - cl::desc("With PGO, include profile count in optimization remarks"), - cl::Hidden); - -static cl::opt, false, remarks::HotnessThresholdParser> - RemarksHotnessThreshold( - "pass-remarks-hotness-threshold", - cl::desc("Minimum profile count required for " - "an optimization remark to be output. 
" - "Use 'auto' to apply the threshold from profile summary"), - cl::value_desc("N or 'auto'"), cl::init(0), cl::Hidden); - -static cl::opt - RemarksFilename("pass-remarks-output", - cl::desc("Output filename for pass remarks"), - cl::value_desc("filename")); - -static cl::opt - RemarksPasses("pass-remarks-filter", - cl::desc("Only record optimization remarks from passes whose " - "names match the given regular expression"), - cl::value_desc("regex")); - -static cl::opt RemarksFormat( - "pass-remarks-format", - cl::desc("The format used for serializing remarks (default: YAML)"), - cl::value_desc("format"), cl::init("yaml")); - -static cl::list - PassPlugins("load-pass-plugin", - cl::desc("Load passes from plugin library")); - -static cl::opt TryUseNewDbgInfoFormat( - "try-experimental-debuginfo-iterators", - cl::desc("Enable debuginfo iterator positions, if they're built in"), - cl::init(false), cl::Hidden); - -extern cl::opt UseNewDbgInfoFormat; - -//===----------------------------------------------------------------------===// -// CodeGen-related helper functions. -// - -static CodeGenOptLevel GetCodeGenOptLevel() { - return static_cast(unsigned(CodeGenOptLevelCL)); +namespace llvm { +class PassBuilder; } -struct TimeTracerRAII { - TimeTracerRAII(StringRef ProgramName) { - if (TimeTrace) - timeTraceProfilerInitialize(TimeTraceGranularity, ProgramName); - } - ~TimeTracerRAII() { - if (TimeTrace) { - if (auto E = timeTraceProfilerWrite(TimeTraceFile, OutputFilename)) { - handleAllErrors(std::move(E), [&](const StringError &SE) { - errs() << SE.getMessage() << "\n"; - }); - return; - } - timeTraceProfilerCleanup(); - } - } -}; +extern "C" int optMain(int argc, char **argv, + llvm::ArrayRef> + PassBuilderCallbacks); -// For use in NPM transition. Currently this contains most codegen-specific -// passes. Remove passes from here when porting to the NPM. -// TODO: use a codegen version of PassRegistry.def/PassBuilder::is*Pass() once -// it exists. 
-static bool shouldPinPassToLegacyPM(StringRef Pass) { - std::vector PassNameExactToIgnore = { - "nvvm-reflect", - "nvvm-intr-range", - "amdgpu-simplifylib", - "amdgpu-image-intrinsic-opt", - "amdgpu-usenative", - "amdgpu-promote-alloca", - "amdgpu-promote-alloca-to-vector", - "amdgpu-lower-kernel-attributes", - "amdgpu-propagate-attributes-early", - "amdgpu-propagate-attributes-late", - "amdgpu-unify-metadata", - "amdgpu-printf-runtime-binding", - "amdgpu-always-inline"}; - if (llvm::is_contained(PassNameExactToIgnore, Pass)) - return false; - - std::vector PassNamePrefix = { - "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", - "nvptx-", "mips-", "lanai-", "hexagon-", "bpf-", "avr-", - "thumb2-", "arm-", "si-", "gcn-", "amdgpu-", "aarch64-", - "amdgcn-", "polly-", "riscv-", "dxil-"}; - std::vector PassNameContain = {"-eh-prepare"}; - std::vector PassNameExact = { - "safe-stack", - "cost-model", - "codegenprepare", - "interleaved-load-combine", - "unreachableblockelim", - "verify-safepoint-ir", - "atomic-expand", - "expandvp", - "mve-tail-predication", - "interleaved-access", - "global-merge", - "pre-isel-intrinsic-lowering", - "expand-reductions", - "indirectbr-expand", - "generic-to-nvvm", - "expand-memcmp", - "loop-reduce", - "lower-amx-type", - "lower-amx-intrinsics", - "polyhedral-info", - "print-polyhedral-info", - "replace-with-veclib", - "jmc-instrumenter", - "dot-regions", - "dot-regions-only", - "view-regions", - "view-regions-only", - "select-optimize", - "expand-large-div-rem", - "structurizecfg", - "fix-irreducible", - "expand-large-fp-convert", - "callbrprepare", - }; - for (const auto &P : PassNamePrefix) - if (Pass.starts_with(P)) - return true; - for (const auto &P : PassNameContain) - if (Pass.contains(P)) - return true; - return llvm::is_contained(PassNameExact, Pass); -} - -// For use in NPM transition. -static bool shouldForceLegacyPM() { - for (const auto &P : PassList) { - StringRef Arg = P->getPassArgument(); - if (shouldPinPassToLegacyPM(Arg)) - return true; - } - return false; -} - -//===----------------------------------------------------------------------===// -// main for opt -// -int main(int argc, char **argv) { - InitLLVM X(argc, argv); - - // Enable debug stream buffering. - EnableDebugBuffering = true; - - InitializeAllTargets(); - InitializeAllTargetMCs(); - InitializeAllAsmPrinters(); - InitializeAllAsmParsers(); - - // Initialize passes - PassRegistry &Registry = *PassRegistry::getPassRegistry(); - initializeCore(Registry); - initializeScalarOpts(Registry); - initializeVectorization(Registry); - initializeIPO(Registry); - initializeAnalysis(Registry); - initializeTransformUtils(Registry); - initializeInstCombine(Registry); - initializeTarget(Registry); - // For codegen passes, only passes that do IR to IR transformation are - // supported. 
- initializeExpandLargeDivRemLegacyPassPass(Registry); - initializeExpandLargeFpConvertLegacyPassPass(Registry); - initializeExpandMemCmpLegacyPassPass(Registry); - initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry); - initializeSelectOptimizePass(Registry); - initializeCallBrPreparePass(Registry); - initializeCodeGenPrepareLegacyPassPass(Registry); - initializeAtomicExpandPass(Registry); - initializeWinEHPreparePass(Registry); - initializeDwarfEHPrepareLegacyPassPass(Registry); - initializeSafeStackLegacyPassPass(Registry); - initializeSjLjEHPreparePass(Registry); - initializePreISelIntrinsicLoweringLegacyPassPass(Registry); - initializeGlobalMergePass(Registry); - initializeIndirectBrExpandLegacyPassPass(Registry); - initializeInterleavedLoadCombinePass(Registry); - initializeInterleavedAccessPass(Registry); - initializeUnreachableBlockElimLegacyPassPass(Registry); - initializeExpandReductionsPass(Registry); - initializeExpandVectorPredicationPass(Registry); - initializeWasmEHPreparePass(Registry); - initializeWriteBitcodePassPass(Registry); - initializeReplaceWithVeclibLegacyPass(Registry); - initializeJMCInstrumenterPass(Registry); - - SmallVector PluginList; - PassPlugins.setCallback([&](const std::string &PluginPath) { - auto Plugin = PassPlugin::Load(PluginPath); - if (!Plugin) - report_fatal_error(Plugin.takeError(), /*gen_crash_diag=*/false); - PluginList.emplace_back(Plugin.get()); - }); - - // Register the Target and CPU printer for --version. - cl::AddExtraVersionPrinter(sys::printDefaultTargetAndDetectedCPU); - - cl::ParseCommandLineOptions(argc, argv, - "llvm .bc -> .bc modular optimizer and analysis printer\n"); - - // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS - if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. - UseNewDbgInfoFormat = true; - } -#endif - (void)TryUseNewDbgInfoFormat; - - LLVMContext Context; - - // TODO: remove shouldForceLegacyPM(). - const bool UseNPM = (!EnableLegacyPassManager && !shouldForceLegacyPM()) || - PassPipeline.getNumOccurrences() > 0; - - if (UseNPM && !PassList.empty()) { - errs() << "The `opt -passname` syntax for the new pass manager is " - "not supported, please use `opt -passes=` (or the `-p` " - "alias for a more concise version).\n"; - errs() << "See https://llvm.org/docs/NewPassManager.html#invoking-opt " - "for more details on the pass pipeline syntax.\n\n"; - return 1; - } - - if (!UseNPM && PluginList.size()) { - errs() << argv[0] << ": " << PassPlugins.ArgStr - << " specified with legacy PM.\n"; - return 1; - } - - // FIXME: once the legacy PM code is deleted, move runPassPipeline() here and - // construct the PassBuilder before parsing IR so we can reuse the same - // PassBuilder for print passes. - if (PrintPasses) { - printPasses(outs()); - return 0; - } - - TimeTracerRAII TimeTracer(argv[0]); - - SMDiagnostic Err; - - Context.setDiscardValueNames(DiscardValueNames); - if (!DisableDITypeMap) - Context.enableDebugTypeODRUniquing(); - - Expected> RemarksFileOrErr = - setupLLVMOptimizationRemarks(Context, RemarksFilename, RemarksPasses, - RemarksFormat, RemarksWithHotness, - RemarksHotnessThreshold); - if (Error E = RemarksFileOrErr.takeError()) { - errs() << toString(std::move(E)) << '\n'; - return 1; - } - std::unique_ptr RemarksFile = std::move(*RemarksFileOrErr); - - // Load the input module... 
- auto SetDataLayout = [&](StringRef IRTriple, - StringRef IRLayout) -> std::optional { - // Data layout specified on the command line has the highest priority. - if (!ClDataLayout.empty()) - return ClDataLayout; - // If an explicit data layout is already defined in the IR, don't infer. - if (!IRLayout.empty()) - return std::nullopt; - - // If an explicit triple was specified (either in the IR or on the - // command line), use that to infer the default data layout. However, the - // command line target triple should override the IR file target triple. - std::string TripleStr = - TargetTriple.empty() ? IRTriple.str() : Triple::normalize(TargetTriple); - // If the triple string is still empty, we don't fall back to - // sys::getDefaultTargetTriple() since we do not want to have differing - // behaviour dependent on the configured default triple. Therefore, if the - // user did not pass -mtriple or define an explicit triple/datalayout in - // the IR, we should default to an empty (default) DataLayout. - if (TripleStr.empty()) - return std::nullopt; - // Otherwise we infer the DataLayout from the target machine. - Expected> ExpectedTM = - codegen::createTargetMachineForTriple(TripleStr, GetCodeGenOptLevel()); - if (!ExpectedTM) { - errs() << argv[0] << ": warning: failed to infer data layout: " - << toString(ExpectedTM.takeError()) << "\n"; - return std::nullopt; - } - return (*ExpectedTM)->createDataLayout().getStringRepresentation(); - }; - std::unique_ptr M; - if (NoUpgradeDebugInfo) - M = parseAssemblyFileWithIndexNoUpgradeDebugInfo( - InputFilename, Err, Context, nullptr, SetDataLayout) - .Mod; - else - M = parseIRFile(InputFilename, Err, Context, - ParserCallbacks(SetDataLayout)); - - if (!M) { - Err.print(argv[0], errs()); - return 1; - } - - // Strip debug info before running the verifier. - if (StripDebug) - StripDebugInfo(*M); - - // Erase module-level named metadata, if requested. - if (StripNamedMetadata) { - while (!M->named_metadata_empty()) { - NamedMDNode *NMD = &*M->named_metadata_begin(); - M->eraseNamedMetadata(NMD); - } - } - - // If we are supposed to override the target triple, do so now. - if (!TargetTriple.empty()) - M->setTargetTriple(Triple::normalize(TargetTriple)); - - // Immediately run the verifier to catch any problems before starting up the - // pass pipelines. Otherwise we can crash on broken code during - // doInitialization(). - if (!NoVerify && verifyModule(*M, &errs())) { - errs() << argv[0] << ": " << InputFilename - << ": error: input module is broken!\n"; - return 1; - } - - // Enable testing of whole program devirtualization on this module by invoking - // the facility for updating public visibility to linkage unit visibility when - // specified by an internal option. This is normally done during LTO which is - // not performed via opt. - updateVCallVisibilityInModule( - *M, - /*WholeProgramVisibilityEnabledInLTO=*/false, - // FIXME: These need linker information via a - // TBD new interface. - /*DynamicExportSymbols=*/{}, - /*ValidateAllVtablesHaveTypeInfos=*/false, - /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); - - // Figure out what stream we are supposed to write to... - std::unique_ptr Out; - std::unique_ptr ThinLinkOut; - if (NoOutput) { - if (!OutputFilename.empty()) - errs() << "WARNING: The -o (output filename) option is ignored when\n" - "the --disable-output option is used.\n"; - } else { - // Default to standard output. 
- if (OutputFilename.empty()) - OutputFilename = "-"; - - std::error_code EC; - sys::fs::OpenFlags Flags = - OutputAssembly ? sys::fs::OF_TextWithCRLF : sys::fs::OF_None; - Out.reset(new ToolOutputFile(OutputFilename, EC, Flags)); - if (EC) { - errs() << EC.message() << '\n'; - return 1; - } - - if (!ThinLinkBitcodeFile.empty()) { - ThinLinkOut.reset( - new ToolOutputFile(ThinLinkBitcodeFile, EC, sys::fs::OF_None)); - if (EC) { - errs() << EC.message() << '\n'; - return 1; - } - } - } - - Triple ModuleTriple(M->getTargetTriple()); - std::string CPUStr, FeaturesStr; - std::unique_ptr TM; - if (ModuleTriple.getArch()) { - CPUStr = codegen::getCPUStr(); - FeaturesStr = codegen::getFeaturesStr(); - Expected> ExpectedTM = - codegen::createTargetMachineForTriple(ModuleTriple.str(), - GetCodeGenOptLevel()); - if (auto E = ExpectedTM.takeError()) { - errs() << argv[0] << ": WARNING: failed to create target machine for '" - << ModuleTriple.str() << "': " << toString(std::move(E)) << "\n"; - } else { - TM = std::move(*ExpectedTM); - } - } else if (ModuleTriple.getArchName() != "unknown" && - ModuleTriple.getArchName() != "") { - errs() << argv[0] << ": unrecognized architecture '" - << ModuleTriple.getArchName() << "' provided.\n"; - return 1; - } - - // Override function attributes based on CPUStr, FeaturesStr, and command line - // flags. - codegen::setFunctionAttributes(CPUStr, FeaturesStr, *M); - - // If the output is set to be emitted to standard out, and standard out is a - // console, print out a warning message and refuse to do it. We don't - // impress anyone by spewing tons of binary goo to a terminal. - if (!Force && !NoOutput && !OutputAssembly) - if (CheckBitcodeOutputToConsole(Out->os())) - NoOutput = true; - - if (OutputThinLTOBC) { - M->addModuleFlag(Module::Error, "EnableSplitLTOUnit", SplitLTOUnit); - if (UnifiedLTO) - M->addModuleFlag(Module::Error, "UnifiedLTO", 1); - } - - // Add an appropriate TargetLibraryInfo pass for the module's triple. - TargetLibraryInfoImpl TLII(ModuleTriple); - - // The -disable-simplify-libcalls flag actually disables all builtin optzns. - if (DisableSimplifyLibCalls) - TLII.disableAllFunctions(); - else { - // Disable individual builtin functions in TargetLibraryInfo. - LibFunc F; - for (auto &FuncName : DisableBuiltins) - if (TLII.getLibFunc(FuncName, F)) - TLII.setUnavailable(F); - else { - errs() << argv[0] << ": cannot disable nonexistent builtin function " - << FuncName << '\n'; - return 1; - } - } - - if (UseNPM) { - if (legacy::debugPassSpecified()) { - errs() << "-debug-pass does not work with the new PM, either use " - "-debug-pass-manager, or use the legacy PM\n"; - return 1; - } - auto NumOLevel = OptLevelO0 + OptLevelO1 + OptLevelO2 + OptLevelO3 + - OptLevelOs + OptLevelOz; - if (NumOLevel > 1) { - errs() << "Cannot specify multiple -O#\n"; - return 1; - } - if (NumOLevel > 0 && (PassPipeline.getNumOccurrences() > 0)) { - errs() << "Cannot specify -O# and --passes=/--foo-pass, use " - "-passes='default,other-pass'\n"; - return 1; - } - std::string Pipeline = PassPipeline; - - if (OptLevelO0) - Pipeline = "default"; - if (OptLevelO1) - Pipeline = "default"; - if (OptLevelO2) - Pipeline = "default"; - if (OptLevelO3) - Pipeline = "default"; - if (OptLevelOs) - Pipeline = "default"; - if (OptLevelOz) - Pipeline = "default"; - OutputKind OK = OK_NoOutput; - if (!NoOutput) - OK = OutputAssembly - ? OK_OutputAssembly - : (OutputThinLTOBC ? 
OK_OutputThinLTOBitcode : OK_OutputBitcode); - - VerifierKind VK = VK_VerifyOut; - if (NoVerify) - VK = VK_NoVerifier; - else if (VerifyEach) - VK = VK_VerifyEachPass; - - // The user has asked to use the new pass manager and provided a pipeline - // string. Hand off the rest of the functionality to the new code for that - // layer. - return runPassPipeline(argv[0], *M, TM.get(), &TLII, Out.get(), - ThinLinkOut.get(), RemarksFile.get(), Pipeline, - PluginList, OK, VK, PreserveAssemblyUseListOrder, - PreserveBitcodeUseListOrder, EmitSummaryIndex, - EmitModuleHash, EnableDebugify, - VerifyDebugInfoPreserve, UnifiedLTO) - ? 0 - : 1; - } - - if (OptLevelO0 || OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || - OptLevelO3) { - errs() << "Cannot use -O# with legacy PM.\n"; - return 1; - } - if (EmitSummaryIndex) { - errs() << "Cannot use -module-summary with legacy PM.\n"; - return 1; - } - if (EmitModuleHash) { - errs() << "Cannot use -module-hash with legacy PM.\n"; - return 1; - } - if (OutputThinLTOBC) { - errs() << "Cannot use -thinlto-bc with legacy PM.\n"; - return 1; - } - // Create a PassManager to hold and optimize the collection of passes we are - // about to build. If the -debugify-each option is set, wrap each pass with - // the (-check)-debugify passes. - DebugifyCustomPassManager Passes; - DebugifyStatsMap DIStatsMap; - DebugInfoPerPass DebugInfoBeforePass; - if (DebugifyEach) { - Passes.setDebugifyMode(DebugifyMode::SyntheticDebugInfo); - Passes.setDIStatsMap(DIStatsMap); - } else if (VerifyEachDebugInfoPreserve) { - Passes.setDebugifyMode(DebugifyMode::OriginalDebugInfo); - Passes.setDebugInfoBeforePass(DebugInfoBeforePass); - if (!VerifyDIPreserveExport.empty()) - Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); - } - - bool AddOneTimeDebugifyPasses = - (EnableDebugify && !DebugifyEach) || - (VerifyDebugInfoPreserve && !VerifyEachDebugInfoPreserve); - - Passes.add(new TargetLibraryInfoWrapperPass(TLII)); - - // Add internal analysis passes from the target machine. - Passes.add(createTargetTransformInfoWrapperPass(TM ? TM->getTargetIRAnalysis() - : TargetIRAnalysis())); - - if (AddOneTimeDebugifyPasses) { - if (EnableDebugify) { - Passes.setDIStatsMap(DIStatsMap); - Passes.add(createDebugifyModulePass()); - } else if (VerifyDebugInfoPreserve) { - Passes.setDebugInfoBeforePass(DebugInfoBeforePass); - Passes.add(createDebugifyModulePass( - DebugifyMode::OriginalDebugInfo, "", - &(Passes.getDebugInfoPerPass()))); - } - } - - if (TM) { - // FIXME: We should dyn_cast this when supported. - auto <M = static_cast(*TM); - Pass *TPC = LTM.createPassConfig(Passes); - Passes.add(TPC); - } - - // Create a new optimization pass for each one specified on the command line - for (unsigned i = 0; i < PassList.size(); ++i) { - const PassInfo *PassInf = PassList[i]; - if (PassInf->getNormalCtor()) { - Pass *P = PassInf->getNormalCtor()(); - if (P) { - // Add the pass to the pass manager. - Passes.add(P); - // If we are verifying all of the intermediate steps, add the verifier. 
- if (VerifyEach) - Passes.add(createVerifierPass()); - } - } else - errs() << argv[0] << ": cannot create pass: " - << PassInf->getPassName() << "\n"; - } - - // Check that the module is well formed on completion of optimization - if (!NoVerify && !VerifyEach) - Passes.add(createVerifierPass()); - - if (AddOneTimeDebugifyPasses) { - if (EnableDebugify) - Passes.add(createCheckDebugifyModulePass(false)); - else if (VerifyDebugInfoPreserve) { - if (!VerifyDIPreserveExport.empty()) - Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); - Passes.add(createCheckDebugifyModulePass( - false, "", nullptr, DebugifyMode::OriginalDebugInfo, - &(Passes.getDebugInfoPerPass()), VerifyDIPreserveExport)); - } - } - - // In run twice mode, we want to make sure the output is bit-by-bit - // equivalent if we run the pass manager again, so setup two buffers and - // a stream to write to them. Note that llc does something similar and it - // may be worth to abstract this out in the future. - SmallVector Buffer; - SmallVector FirstRunBuffer; - std::unique_ptr BOS; - raw_ostream *OS = nullptr; - - const bool ShouldEmitOutput = !NoOutput; - - // Write bitcode or assembly to the output as the last step... - if (ShouldEmitOutput || RunTwice) { - assert(Out); - OS = &Out->os(); - if (RunTwice) { - BOS = std::make_unique(Buffer); - OS = BOS.get(); - } - if (OutputAssembly) - Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder)); - else - Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder)); - } - - // Before executing passes, print the final values of the LLVM options. - cl::PrintOptionValues(); - - if (!RunTwice) { - // Now that we have all of the passes ready, run them. - Passes.run(*M); - } else { - // If requested, run all passes twice with the same pass manager to catch - // bugs caused by persistent state in the passes. - std::unique_ptr M2(CloneModule(*M)); - // Run all passes on the original module first, so the second run processes - // the clone to catch CloneModule bugs. - Passes.run(*M); - FirstRunBuffer = Buffer; - Buffer.clear(); - - Passes.run(*M2); - - // Compare the two outputs and make sure they're the same - assert(Out); - if (Buffer.size() != FirstRunBuffer.size() || - (memcmp(Buffer.data(), FirstRunBuffer.data(), Buffer.size()) != 0)) { - errs() - << "Running the pass manager twice changed the output.\n" - "Writing the result of the second run to the specified output.\n" - "To generate the one-run comparison binary, just run without\n" - "the compile-twice option\n"; - if (ShouldEmitOutput) { - Out->os() << BOS->str(); - Out->keep(); - } - if (RemarksFile) - RemarksFile->keep(); - return 1; - } - if (ShouldEmitOutput) - Out->os() << BOS->str(); - } - - if (DebugifyEach && !DebugifyExport.empty()) - exportDebugifyStats(DebugifyExport, Passes.getDebugifyStatsMap()); - - // Declare success. - if (!NoOutput) - Out->keep(); - - if (RemarksFile) - RemarksFile->keep(); - - if (ThinLinkOut) - ThinLinkOut->keep(); - - return 0; -} +int main(int argc, char **argv) { return optMain(argc, argv, {}); } diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp new file mode 100644 index 0000000000000..3f66bfc9f0176 --- /dev/null +++ b/llvm/tools/opt/optdriver.cpp @@ -0,0 +1,923 @@ +//===- optdriver.cpp - The LLVM Modular Optimizer -------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Optimizations may be specified an arbitrary number of times on the command
+// line; they are run in the order specified. Common driver library for re-use
+// by potential downstream opt-variants.
+//
+//===----------------------------------------------------------------------===//
+
+#include "NewPMDriver.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/RegionPass.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/AsmParser/Parser.h"
+#include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LLVMRemarkStreamer.h"
+#include "llvm/IR/LegacyPassManager.h"
+#include "llvm/IR/LegacyPassNameParser.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
+#include "llvm/IR/Verifier.h"
+#include "llvm/IRReader/IRReader.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/LinkAllIR.h"
+#include "llvm/LinkAllPasses.h"
+#include "llvm/MC/TargetRegistry.h"
+#include "llvm/Passes/PassPlugin.h"
+#include "llvm/Remarks/HotnessThresholdParser.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/InitLLVM.h"
+#include "llvm/Support/PluginLoader.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/SystemUtils.h"
+#include "llvm/Support/TargetSelect.h"
+#include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/YAMLTraits.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/TargetParser/Host.h"
+#include "llvm/TargetParser/SubtargetFeature.h"
+#include "llvm/TargetParser/Triple.h"
+#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Debugify.h"
+#include <algorithm>
+#include <memory>
+#include <optional>
+using namespace llvm;
+using namespace opt_tool;
+
+static codegen::RegisterCodeGenFlags CFG;
+
+// The OptimizationList is automatically populated with registered Passes by the
+// PassNameParser.
+static cl::list<const PassInfo *, bool, PassNameParser> PassList(cl::desc(
+    "Optimizations available (use '-passes=' for the new pass manager)"));
+
+static cl::opt<bool> EnableLegacyPassManager(
+    "bugpoint-enable-legacy-pm",
+    cl::desc(
+        "Enable the legacy pass manager. This is strictly for bugpoint "
+        "due to it not working with the new PM, please do not use otherwise."),
+    cl::init(false));
+
+// This flag specifies a textual description of the optimization pass pipeline
+// to run over the module. This flag switches opt to use the new pass manager
+// infrastructure, completely disabling all of the flags specific to the old
+// pass management.
+static cl::opt<std::string> PassPipeline(
+    "passes",
+    cl::desc(
+        "A textual description of the pass pipeline. 
To have analysis passes " + "available before a certain pass, add 'require'.")); +static cl::alias PassPipeline2("p", cl::aliasopt(PassPipeline), + cl::desc("Alias for -passes")); + +static cl::opt PrintPasses("print-passes", + cl::desc("Print available passes that can be " + "specified in -passes=foo and exit")); + +static cl::opt InputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +static cl::opt OutputFilename("o", + cl::desc("Override output filename"), + cl::value_desc("filename")); + +static cl::opt Force("f", cl::desc("Enable binary output on terminals")); + +static cl::opt NoOutput("disable-output", + cl::desc("Do not write result bitcode file"), + cl::Hidden); + +static cl::opt OutputAssembly("S", + cl::desc("Write output as LLVM assembly")); + +static cl::opt + OutputThinLTOBC("thinlto-bc", + cl::desc("Write output as ThinLTO-ready bitcode")); + +static cl::opt + SplitLTOUnit("thinlto-split-lto-unit", + cl::desc("Enable splitting of a ThinLTO LTOUnit")); + +static cl::opt + UnifiedLTO("unified-lto", + cl::desc("Use unified LTO piplines. Ignored unless -thinlto-bc " + "is also specified."), + cl::Hidden, cl::init(false)); + +static cl::opt ThinLinkBitcodeFile( + "thin-link-bitcode-file", cl::value_desc("filename"), + cl::desc( + "A file in which to write minimized bitcode for the thin link only")); + +static cl::opt NoVerify("disable-verify", + cl::desc("Do not run the verifier"), cl::Hidden); + +static cl::opt NoUpgradeDebugInfo("disable-upgrade-debug-info", + cl::desc("Generate invalid output"), + cl::ReallyHidden); + +static cl::opt VerifyEach("verify-each", + cl::desc("Verify after each transform")); + +static cl::opt + DisableDITypeMap("disable-debug-info-type-map", + cl::desc("Don't use a uniquing type map for debug info")); + +static cl::opt + StripDebug("strip-debug", + cl::desc("Strip debugger symbol info from translation unit")); + +static cl::opt + StripNamedMetadata("strip-named-metadata", + cl::desc("Strip module-level named metadata")); + +static cl::opt + OptLevelO0("O0", cl::desc("Optimization level 0. Similar to clang -O0. " + "Same as -passes='default'")); + +static cl::opt + OptLevelO1("O1", cl::desc("Optimization level 1. Similar to clang -O1. " + "Same as -passes='default'")); + +static cl::opt + OptLevelO2("O2", cl::desc("Optimization level 2. Similar to clang -O2. " + "Same as -passes='default'")); + +static cl::opt + OptLevelOs("Os", cl::desc("Like -O2 but size-conscious. Similar to clang " + "-Os. Same as -passes='default'")); + +static cl::opt OptLevelOz( + "Oz", + cl::desc("Like -O2 but optimize for code size above all else. Similar to " + "clang -Oz. Same as -passes='default'")); + +static cl::opt + OptLevelO3("O3", cl::desc("Optimization level 3. Similar to clang -O3. 
" + "Same as -passes='default'")); + +static cl::opt CodeGenOptLevelCL( + "codegen-opt-level", + cl::desc("Override optimization level for codegen hooks, legacy PM only")); + +static cl::opt + TargetTriple("mtriple", cl::desc("Override target triple for module")); + +static cl::opt EmitSummaryIndex("module-summary", + cl::desc("Emit module summary index"), + cl::init(false)); + +static cl::opt EmitModuleHash("module-hash", cl::desc("Emit module hash"), + cl::init(false)); + +static cl::opt + DisableSimplifyLibCalls("disable-simplify-libcalls", + cl::desc("Disable simplify-libcalls")); + +static cl::list DisableBuiltins( + "disable-builtin", + cl::desc("Disable specific target library builtin function")); + +static cl::opt EnableDebugify( + "enable-debugify", + cl::desc( + "Start the pipeline with debugify and end it with check-debugify")); + +static cl::opt VerifyDebugInfoPreserve( + "verify-debuginfo-preserve", + cl::desc("Start the pipeline with collecting and end it with checking of " + "debug info preservation.")); + +static cl::opt ClDataLayout("data-layout", + cl::desc("data layout string to use"), + cl::value_desc("layout-string"), + cl::init("")); + +static cl::opt PreserveBitcodeUseListOrder( + "preserve-bc-uselistorder", + cl::desc("Preserve use-list order when writing LLVM bitcode."), + cl::init(true), cl::Hidden); + +static cl::opt PreserveAssemblyUseListOrder( + "preserve-ll-uselistorder", + cl::desc("Preserve use-list order when writing LLVM assembly."), + cl::init(false), cl::Hidden); + +static cl::opt RunTwice("run-twice", + cl::desc("Run all passes twice, re-using the " + "same pass manager (legacy PM only)."), + cl::init(false), cl::Hidden); + +static cl::opt DiscardValueNames( + "discard-value-names", + cl::desc("Discard names from Value (other than GlobalValue)."), + cl::init(false), cl::Hidden); + +static cl::opt TimeTrace("time-trace", cl::desc("Record time trace")); + +static cl::opt TimeTraceGranularity( + "time-trace-granularity", + cl::desc( + "Minimum time granularity (in microseconds) traced by time profiler"), + cl::init(500), cl::Hidden); + +static cl::opt + TimeTraceFile("time-trace-file", + cl::desc("Specify time trace file destination"), + cl::value_desc("filename")); + +static cl::opt RemarksWithHotness( + "pass-remarks-with-hotness", + cl::desc("With PGO, include profile count in optimization remarks"), + cl::Hidden); + +static cl::opt, false, remarks::HotnessThresholdParser> + RemarksHotnessThreshold( + "pass-remarks-hotness-threshold", + cl::desc("Minimum profile count required for " + "an optimization remark to be output. 
" + "Use 'auto' to apply the threshold from profile summary"), + cl::value_desc("N or 'auto'"), cl::init(0), cl::Hidden); + +static cl::opt + RemarksFilename("pass-remarks-output", + cl::desc("Output filename for pass remarks"), + cl::value_desc("filename")); + +static cl::opt + RemarksPasses("pass-remarks-filter", + cl::desc("Only record optimization remarks from passes whose " + "names match the given regular expression"), + cl::value_desc("regex")); + +static cl::opt RemarksFormat( + "pass-remarks-format", + cl::desc("The format used for serializing remarks (default: YAML)"), + cl::value_desc("format"), cl::init("yaml")); + +static cl::list + PassPlugins("load-pass-plugin", + cl::desc("Load passes from plugin library")); + +static cl::opt TryUseNewDbgInfoFormat( + "try-experimental-debuginfo-iterators", + cl::desc("Enable debuginfo iterator positions, if they're built in"), + cl::init(false), cl::Hidden); + +extern cl::opt UseNewDbgInfoFormat; + +//===----------------------------------------------------------------------===// +// CodeGen-related helper functions. +// + +static CodeGenOptLevel GetCodeGenOptLevel() { + return static_cast(unsigned(CodeGenOptLevelCL)); +} + +struct TimeTracerRAII { + TimeTracerRAII(StringRef ProgramName) { + if (TimeTrace) + timeTraceProfilerInitialize(TimeTraceGranularity, ProgramName); + } + ~TimeTracerRAII() { + if (TimeTrace) { + if (auto E = timeTraceProfilerWrite(TimeTraceFile, OutputFilename)) { + handleAllErrors(std::move(E), [&](const StringError &SE) { + errs() << SE.getMessage() << "\n"; + }); + return; + } + timeTraceProfilerCleanup(); + } + } +}; + +// For use in NPM transition. Currently this contains most codegen-specific +// passes. Remove passes from here when porting to the NPM. +// TODO: use a codegen version of PassRegistry.def/PassBuilder::is*Pass() once +// it exists. 
+static bool shouldPinPassToLegacyPM(StringRef Pass) { + std::vector PassNameExactToIgnore = { + "nvvm-reflect", + "nvvm-intr-range", + "amdgpu-simplifylib", + "amdgpu-image-intrinsic-opt", + "amdgpu-usenative", + "amdgpu-promote-alloca", + "amdgpu-promote-alloca-to-vector", + "amdgpu-lower-kernel-attributes", + "amdgpu-propagate-attributes-early", + "amdgpu-propagate-attributes-late", + "amdgpu-unify-metadata", + "amdgpu-printf-runtime-binding", + "amdgpu-always-inline"}; + if (llvm::is_contained(PassNameExactToIgnore, Pass)) + return false; + + std::vector PassNamePrefix = { + "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", + "nvptx-", "mips-", "lanai-", "hexagon-", "bpf-", "avr-", + "thumb2-", "arm-", "si-", "gcn-", "amdgpu-", "aarch64-", + "amdgcn-", "polly-", "riscv-", "dxil-"}; + std::vector PassNameContain = {"-eh-prepare"}; + std::vector PassNameExact = { + "safe-stack", + "cost-model", + "codegenprepare", + "interleaved-load-combine", + "unreachableblockelim", + "verify-safepoint-ir", + "atomic-expand", + "expandvp", + "mve-tail-predication", + "interleaved-access", + "global-merge", + "pre-isel-intrinsic-lowering", + "expand-reductions", + "indirectbr-expand", + "generic-to-nvvm", + "expand-memcmp", + "loop-reduce", + "lower-amx-type", + "lower-amx-intrinsics", + "polyhedral-info", + "print-polyhedral-info", + "replace-with-veclib", + "jmc-instrumenter", + "dot-regions", + "dot-regions-only", + "view-regions", + "view-regions-only", + "select-optimize", + "expand-large-div-rem", + "structurizecfg", + "fix-irreducible", + "expand-large-fp-convert", + "callbrprepare", + }; + for (const auto &P : PassNamePrefix) + if (Pass.starts_with(P)) + return true; + for (const auto &P : PassNameContain) + if (Pass.contains(P)) + return true; + return llvm::is_contained(PassNameExact, Pass); +} + +// For use in NPM transition. +static bool shouldForceLegacyPM() { + for (const auto &P : PassList) { + StringRef Arg = P->getPassArgument(); + if (shouldPinPassToLegacyPM(Arg)) + return true; + } + return false; +} + +//===----------------------------------------------------------------------===// +// main for opt +// +extern "C" int optMain( + int argc, char **argv, + ArrayRef> PassBuilderCallbacks) { + InitLLVM X(argc, argv); + + // Enable debug stream buffering. + EnableDebugBuffering = true; + + InitializeAllTargets(); + InitializeAllTargetMCs(); + InitializeAllAsmPrinters(); + InitializeAllAsmParsers(); + + // Initialize passes + PassRegistry &Registry = *PassRegistry::getPassRegistry(); + initializeCore(Registry); + initializeScalarOpts(Registry); + initializeVectorization(Registry); + initializeIPO(Registry); + initializeAnalysis(Registry); + initializeTransformUtils(Registry); + initializeInstCombine(Registry); + initializeTarget(Registry); + // For codegen passes, only passes that do IR to IR transformation are + // supported. 
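+  // (The initialize* calls below register that IR-level subset with the
+  // legacy PassRegistry so the pass names can be parsed from the command
+  // line; passes needing MachineFunction state are left to llc.)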
+ initializeExpandLargeDivRemLegacyPassPass(Registry); + initializeExpandLargeFpConvertLegacyPassPass(Registry); + initializeExpandMemCmpLegacyPassPass(Registry); + initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry); + initializeSelectOptimizePass(Registry); + initializeCallBrPreparePass(Registry); + initializeCodeGenPrepareLegacyPassPass(Registry); + initializeAtomicExpandPass(Registry); + initializeWinEHPreparePass(Registry); + initializeDwarfEHPrepareLegacyPassPass(Registry); + initializeSafeStackLegacyPassPass(Registry); + initializeSjLjEHPreparePass(Registry); + initializePreISelIntrinsicLoweringLegacyPassPass(Registry); + initializeGlobalMergePass(Registry); + initializeIndirectBrExpandLegacyPassPass(Registry); + initializeInterleavedLoadCombinePass(Registry); + initializeInterleavedAccessPass(Registry); + initializeUnreachableBlockElimLegacyPassPass(Registry); + initializeExpandReductionsPass(Registry); + initializeExpandVectorPredicationPass(Registry); + initializeWasmEHPreparePass(Registry); + initializeWriteBitcodePassPass(Registry); + initializeReplaceWithVeclibLegacyPass(Registry); + initializeJMCInstrumenterPass(Registry); + + SmallVector PluginList; + PassPlugins.setCallback([&](const std::string &PluginPath) { + auto Plugin = PassPlugin::Load(PluginPath); + if (!Plugin) + report_fatal_error(Plugin.takeError(), /*gen_crash_diag=*/false); + PluginList.emplace_back(Plugin.get()); + }); + + // Register the Target and CPU printer for --version. + cl::AddExtraVersionPrinter(sys::printDefaultTargetAndDetectedCPU); + + cl::ParseCommandLineOptions( + argc, argv, "llvm .bc -> .bc modular optimizer and analysis printer\n"); + + // RemoveDIs debug-info transition: tests may request that we /try/ to use the + // new debug-info format, if it's built in. +#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + if (TryUseNewDbgInfoFormat) { + // If LLVM was built with support for this, turn the new debug-info format + // on. + UseNewDbgInfoFormat = true; + } +#endif + (void)TryUseNewDbgInfoFormat; + + LLVMContext Context; + + // TODO: remove shouldForceLegacyPM(). + const bool UseNPM = (!EnableLegacyPassManager && !shouldForceLegacyPM()) || + PassPipeline.getNumOccurrences() > 0; + + if (UseNPM && !PassList.empty()) { + errs() << "The `opt -passname` syntax for the new pass manager is " + "not supported, please use `opt -passes=` (or the `-p` " + "alias for a more concise version).\n"; + errs() << "See https://llvm.org/docs/NewPassManager.html#invoking-opt " + "for more details on the pass pipeline syntax.\n\n"; + return 1; + } + + if (!UseNPM && PluginList.size()) { + errs() << argv[0] << ": " << PassPlugins.ArgStr + << " specified with legacy PM.\n"; + return 1; + } + + // FIXME: once the legacy PM code is deleted, move runPassPipeline() here and + // construct the PassBuilder before parsing IR so we can reuse the same + // PassBuilder for print passes. + if (PrintPasses) { + printPasses(outs()); + return 0; + } + + TimeTracerRAII TimeTracer(argv[0]); + + SMDiagnostic Err; + + Context.setDiscardValueNames(DiscardValueNames); + if (!DisableDITypeMap) + Context.enableDebugTypeODRUniquing(); + + Expected> RemarksFileOrErr = + setupLLVMOptimizationRemarks(Context, RemarksFilename, RemarksPasses, + RemarksFormat, RemarksWithHotness, + RemarksHotnessThreshold); + if (Error E = RemarksFileOrErr.takeError()) { + errs() << toString(std::move(E)) << '\n'; + return 1; + } + std::unique_ptr RemarksFile = std::move(*RemarksFileOrErr); + + // Load the input module... 
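+  // The SetDataLayout callback below resolves the data layout in priority
+  // order: the -data-layout flag first, then a layout already present in the
+  // IR, then one inferred from the command-line or IR target triple. If none
+  // apply, the module keeps an empty (default) DataLayout.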
+ auto SetDataLayout = [&](StringRef IRTriple, + StringRef IRLayout) -> std::optional { + // Data layout specified on the command line has the highest priority. + if (!ClDataLayout.empty()) + return ClDataLayout; + // If an explicit data layout is already defined in the IR, don't infer. + if (!IRLayout.empty()) + return std::nullopt; + + // If an explicit triple was specified (either in the IR or on the + // command line), use that to infer the default data layout. However, the + // command line target triple should override the IR file target triple. + std::string TripleStr = + TargetTriple.empty() ? IRTriple.str() : Triple::normalize(TargetTriple); + // If the triple string is still empty, we don't fall back to + // sys::getDefaultTargetTriple() since we do not want to have differing + // behaviour dependent on the configured default triple. Therefore, if the + // user did not pass -mtriple or define an explicit triple/datalayout in + // the IR, we should default to an empty (default) DataLayout. + if (TripleStr.empty()) + return std::nullopt; + // Otherwise we infer the DataLayout from the target machine. + Expected> ExpectedTM = + codegen::createTargetMachineForTriple(TripleStr, GetCodeGenOptLevel()); + if (!ExpectedTM) { + errs() << argv[0] << ": warning: failed to infer data layout: " + << toString(ExpectedTM.takeError()) << "\n"; + return std::nullopt; + } + return (*ExpectedTM)->createDataLayout().getStringRepresentation(); + }; + std::unique_ptr M; + if (NoUpgradeDebugInfo) + M = parseAssemblyFileWithIndexNoUpgradeDebugInfo( + InputFilename, Err, Context, nullptr, SetDataLayout) + .Mod; + else + M = parseIRFile(InputFilename, Err, Context, + ParserCallbacks(SetDataLayout)); + + if (!M) { + Err.print(argv[0], errs()); + return 1; + } + + // Strip debug info before running the verifier. + if (StripDebug) + StripDebugInfo(*M); + + // Erase module-level named metadata, if requested. + if (StripNamedMetadata) { + while (!M->named_metadata_empty()) { + NamedMDNode *NMD = &*M->named_metadata_begin(); + M->eraseNamedMetadata(NMD); + } + } + + // If we are supposed to override the target triple, do so now. + if (!TargetTriple.empty()) + M->setTargetTriple(Triple::normalize(TargetTriple)); + + // Immediately run the verifier to catch any problems before starting up the + // pass pipelines. Otherwise we can crash on broken code during + // doInitialization(). + if (!NoVerify && verifyModule(*M, &errs())) { + errs() << argv[0] << ": " << InputFilename + << ": error: input module is broken!\n"; + return 1; + } + + // Enable testing of whole program devirtualization on this module by invoking + // the facility for updating public visibility to linkage unit visibility when + // specified by an internal option. This is normally done during LTO which is + // not performed via opt. + updateVCallVisibilityInModule( + *M, + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); + + // Figure out what stream we are supposed to write to... + std::unique_ptr Out; + std::unique_ptr ThinLinkOut; + if (NoOutput) { + if (!OutputFilename.empty()) + errs() << "WARNING: The -o (output filename) option is ignored when\n" + "the --disable-output option is used.\n"; + } else { + // Default to standard output. 
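+    // ("-" routes the ToolOutputFile to stdout. A ToolOutputFile deletes its
+    // file on destruction unless keep() is called, so the early error returns
+    // below cannot leave a partial output behind.)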
+ if (OutputFilename.empty()) + OutputFilename = "-"; + + std::error_code EC; + sys::fs::OpenFlags Flags = + OutputAssembly ? sys::fs::OF_TextWithCRLF : sys::fs::OF_None; + Out.reset(new ToolOutputFile(OutputFilename, EC, Flags)); + if (EC) { + errs() << EC.message() << '\n'; + return 1; + } + + if (!ThinLinkBitcodeFile.empty()) { + ThinLinkOut.reset( + new ToolOutputFile(ThinLinkBitcodeFile, EC, sys::fs::OF_None)); + if (EC) { + errs() << EC.message() << '\n'; + return 1; + } + } + } + + Triple ModuleTriple(M->getTargetTriple()); + std::string CPUStr, FeaturesStr; + std::unique_ptr TM; + if (ModuleTriple.getArch()) { + CPUStr = codegen::getCPUStr(); + FeaturesStr = codegen::getFeaturesStr(); + Expected> ExpectedTM = + codegen::createTargetMachineForTriple(ModuleTriple.str(), + GetCodeGenOptLevel()); + if (auto E = ExpectedTM.takeError()) { + errs() << argv[0] << ": WARNING: failed to create target machine for '" + << ModuleTriple.str() << "': " << toString(std::move(E)) << "\n"; + } else { + TM = std::move(*ExpectedTM); + } + } else if (ModuleTriple.getArchName() != "unknown" && + ModuleTriple.getArchName() != "") { + errs() << argv[0] << ": unrecognized architecture '" + << ModuleTriple.getArchName() << "' provided.\n"; + return 1; + } + + // Override function attributes based on CPUStr, FeaturesStr, and command line + // flags. + codegen::setFunctionAttributes(CPUStr, FeaturesStr, *M); + + // If the output is set to be emitted to standard out, and standard out is a + // console, print out a warning message and refuse to do it. We don't + // impress anyone by spewing tons of binary goo to a terminal. + if (!Force && !NoOutput && !OutputAssembly) + if (CheckBitcodeOutputToConsole(Out->os())) + NoOutput = true; + + if (OutputThinLTOBC) { + M->addModuleFlag(Module::Error, "EnableSplitLTOUnit", SplitLTOUnit); + if (UnifiedLTO) + M->addModuleFlag(Module::Error, "UnifiedLTO", 1); + } + + // Add an appropriate TargetLibraryInfo pass for the module's triple. + TargetLibraryInfoImpl TLII(ModuleTriple); + + // The -disable-simplify-libcalls flag actually disables all builtin optzns. + if (DisableSimplifyLibCalls) + TLII.disableAllFunctions(); + else { + // Disable individual builtin functions in TargetLibraryInfo. + LibFunc F; + for (auto &FuncName : DisableBuiltins) + if (TLII.getLibFunc(FuncName, F)) + TLII.setUnavailable(F); + else { + errs() << argv[0] << ": cannot disable nonexistent builtin function " + << FuncName << '\n'; + return 1; + } + } + + if (UseNPM) { + if (legacy::debugPassSpecified()) { + errs() << "-debug-pass does not work with the new PM, either use " + "-debug-pass-manager, or use the legacy PM\n"; + return 1; + } + auto NumOLevel = OptLevelO0 + OptLevelO1 + OptLevelO2 + OptLevelO3 + + OptLevelOs + OptLevelOz; + if (NumOLevel > 1) { + errs() << "Cannot specify multiple -O#\n"; + return 1; + } + if (NumOLevel > 0 && (PassPipeline.getNumOccurrences() > 0)) { + errs() << "Cannot specify -O# and --passes=/--foo-pass, use " + "-passes='default,other-pass'\n"; + return 1; + } + std::string Pipeline = PassPipeline; + + if (OptLevelO0) + Pipeline = "default"; + if (OptLevelO1) + Pipeline = "default"; + if (OptLevelO2) + Pipeline = "default"; + if (OptLevelO3) + Pipeline = "default"; + if (OptLevelOs) + Pipeline = "default"; + if (OptLevelOz) + Pipeline = "default"; + OutputKind OK = OK_NoOutput; + if (!NoOutput) + OK = OutputAssembly + ? OK_OutputAssembly + : (OutputThinLTOBC ? 
OK_OutputThinLTOBitcode : OK_OutputBitcode); + + VerifierKind VK = VK_VerifyOut; + if (NoVerify) + VK = VK_NoVerifier; + else if (VerifyEach) + VK = VK_VerifyEachPass; + + // The user has asked to use the new pass manager and provided a pipeline + // string. Hand off the rest of the functionality to the new code for that + // layer. + return runPassPipeline( + argv[0], *M, TM.get(), &TLII, Out.get(), ThinLinkOut.get(), + RemarksFile.get(), Pipeline, PluginList, PassBuilderCallbacks, + OK, VK, PreserveAssemblyUseListOrder, + PreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash, + EnableDebugify, VerifyDebugInfoPreserve, UnifiedLTO) + ? 0 + : 1; + } + + if (OptLevelO0 || OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || + OptLevelO3) { + errs() << "Cannot use -O# with legacy PM.\n"; + return 1; + } + if (EmitSummaryIndex) { + errs() << "Cannot use -module-summary with legacy PM.\n"; + return 1; + } + if (EmitModuleHash) { + errs() << "Cannot use -module-hash with legacy PM.\n"; + return 1; + } + if (OutputThinLTOBC) { + errs() << "Cannot use -thinlto-bc with legacy PM.\n"; + return 1; + } + // Create a PassManager to hold and optimize the collection of passes we are + // about to build. If the -debugify-each option is set, wrap each pass with + // the (-check)-debugify passes. + DebugifyCustomPassManager Passes; + DebugifyStatsMap DIStatsMap; + DebugInfoPerPass DebugInfoBeforePass; + if (DebugifyEach) { + Passes.setDebugifyMode(DebugifyMode::SyntheticDebugInfo); + Passes.setDIStatsMap(DIStatsMap); + } else if (VerifyEachDebugInfoPreserve) { + Passes.setDebugifyMode(DebugifyMode::OriginalDebugInfo); + Passes.setDebugInfoBeforePass(DebugInfoBeforePass); + if (!VerifyDIPreserveExport.empty()) + Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); + } + + bool AddOneTimeDebugifyPasses = + (EnableDebugify && !DebugifyEach) || + (VerifyDebugInfoPreserve && !VerifyEachDebugInfoPreserve); + + Passes.add(new TargetLibraryInfoWrapperPass(TLII)); + + // Add internal analysis passes from the target machine. + Passes.add(createTargetTransformInfoWrapperPass(TM ? TM->getTargetIRAnalysis() + : TargetIRAnalysis())); + + if (AddOneTimeDebugifyPasses) { + if (EnableDebugify) { + Passes.setDIStatsMap(DIStatsMap); + Passes.add(createDebugifyModulePass()); + } else if (VerifyDebugInfoPreserve) { + Passes.setDebugInfoBeforePass(DebugInfoBeforePass); + Passes.add(createDebugifyModulePass(DebugifyMode::OriginalDebugInfo, "", + &(Passes.getDebugInfoPerPass()))); + } + } + + if (TM) { + // FIXME: We should dyn_cast this when supported. + auto <M = static_cast(*TM); + Pass *TPC = LTM.createPassConfig(Passes); + Passes.add(TPC); + } + + // Create a new optimization pass for each one specified on the command line + for (unsigned i = 0; i < PassList.size(); ++i) { + const PassInfo *PassInf = PassList[i]; + if (PassInf->getNormalCtor()) { + Pass *P = PassInf->getNormalCtor()(); + if (P) { + // Add the pass to the pass manager. + Passes.add(P); + // If we are verifying all of the intermediate steps, add the verifier. 
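+        // (-verify-each therefore interleaves a verifier run after every pass
+        // named on the command line, pinpointing the first transform that
+        // breaks the module instead of reporting a failure only at the end.)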
+ if (VerifyEach) + Passes.add(createVerifierPass()); + } + } else + errs() << argv[0] << ": cannot create pass: " << PassInf->getPassName() + << "\n"; + } + + // Check that the module is well formed on completion of optimization + if (!NoVerify && !VerifyEach) + Passes.add(createVerifierPass()); + + if (AddOneTimeDebugifyPasses) { + if (EnableDebugify) + Passes.add(createCheckDebugifyModulePass(false)); + else if (VerifyDebugInfoPreserve) { + if (!VerifyDIPreserveExport.empty()) + Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); + Passes.add(createCheckDebugifyModulePass( + false, "", nullptr, DebugifyMode::OriginalDebugInfo, + &(Passes.getDebugInfoPerPass()), VerifyDIPreserveExport)); + } + } + + // In run twice mode, we want to make sure the output is bit-by-bit + // equivalent if we run the pass manager again, so setup two buffers and + // a stream to write to them. Note that llc does something similar and it + // may be worth to abstract this out in the future. + SmallVector Buffer; + SmallVector FirstRunBuffer; + std::unique_ptr BOS; + raw_ostream *OS = nullptr; + + const bool ShouldEmitOutput = !NoOutput; + + // Write bitcode or assembly to the output as the last step... + if (ShouldEmitOutput || RunTwice) { + assert(Out); + OS = &Out->os(); + if (RunTwice) { + BOS = std::make_unique(Buffer); + OS = BOS.get(); + } + if (OutputAssembly) + Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder)); + else + Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder)); + } + + // Before executing passes, print the final values of the LLVM options. + cl::PrintOptionValues(); + + if (!RunTwice) { + // Now that we have all of the passes ready, run them. + Passes.run(*M); + } else { + // If requested, run all passes twice with the same pass manager to catch + // bugs caused by persistent state in the passes. + std::unique_ptr M2(CloneModule(*M)); + // Run all passes on the original module first, so the second run processes + // the clone to catch CloneModule bugs. + Passes.run(*M); + FirstRunBuffer = Buffer; + Buffer.clear(); + + Passes.run(*M2); + + // Compare the two outputs and make sure they're the same + assert(Out); + if (Buffer.size() != FirstRunBuffer.size() || + (memcmp(Buffer.data(), FirstRunBuffer.data(), Buffer.size()) != 0)) { + errs() + << "Running the pass manager twice changed the output.\n" + "Writing the result of the second run to the specified output.\n" + "To generate the one-run comparison binary, just run without\n" + "the compile-twice option\n"; + if (ShouldEmitOutput) { + Out->os() << BOS->str(); + Out->keep(); + } + if (RemarksFile) + RemarksFile->keep(); + return 1; + } + if (ShouldEmitOutput) + Out->os() << BOS->str(); + } + + if (DebugifyEach && !DebugifyExport.empty()) + exportDebugifyStats(DebugifyExport, Passes.getDebugifyStatsMap()); + + // Declare success. 
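+  // Keeping is explicit: a ToolOutputFile removes its file on destruction
+  // unless keep() is called, so a failed run leaves no partial outputs behind.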
+  if (!NoOutput)
+    Out->keep();
+
+  if (RemarksFile)
+    RemarksFile->keep();
+
+  if (ThinLinkOut)
+    ThinLinkOut->keep();
+
+  return 0;
+}
diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
index d4cb0cdb55b52..af219dd743ce7 100644
--- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel
@@ -4814,8 +4814,8 @@ cc_binary(
     ],
 )
 
-cc_binary(
-    name = "opt",
+cc_library(
+    name = "opt-driver",
     srcs = glob([
         "tools/opt/*.cpp",
         "tools/opt/*.h",
@@ -4826,7 +4826,6 @@ cc_binary(
         "@platforms//os:macos": [],
         "//conditions:default": ["-Wl,--export-dynamic"],
     }),
-    stamp = 0,
     deps = [
         ":AllTargetsAsmParsers",
         ":AllTargetsCodeGens",
@@ -4853,6 +4852,12 @@ cc_binary(
     ],
 )
 
+cc_binary(
+    name = "opt",
+    stamp = 0,
+    deps = [":opt-driver"]
+)
+
 gentbl(
     name = "SancovOptsTableGen",
     strip_include_prefix = "tools/sancov",

From 7e09239e24b339f45f63a670e2e831150826bf70 Mon Sep 17 00:00:00 2001
From: Michael Maitland
Date: Wed, 24 Jan 2024 11:40:23 -0700
Subject: [PATCH 831/843] [CodeGen][MISched] Handle empty sized resource usage. (#75951)

TargetSchedule.td explicitly allows a ProcResource to be used for zero
cycles, to represent that the ProcResource must be available but is not
consumed by the instruction. ResourceSegments, on the other hand,
explicitly disallows a zero-sized interval. This patch remedies that by
handling the special case of zero-cycle resource usage: an empty
interval is simply never added. We ran into this issue downstream, but
it makes sense to fix it upstream since TargetSchedule.td explicitly
allows the construct.

---
 llvm/lib/CodeGen/MachineScheduler.cpp    | 16 +++++++++++++++-
 llvm/unittests/CodeGen/SchedBoundary.cpp |  3 ++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index f40e91819a48f..f6d03f59f8564 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -4275,6 +4275,12 @@ unsigned ResourceSegments::getFirstAvailableAt(
   assert(std::is_sorted(std::begin(_Intervals), std::end(_Intervals),
                         sortIntervals) &&
          "Cannot execute on an un-sorted set of intervals.");
+
+  // Zero resource usage is allowed by TargetSchedule.td but we do not construct
+  // a ResourceSegment interval for that situation.
+  if (AcquireAtCycle == ReleaseAtCycle)
+    return CurrCycle;
+
   unsigned RetCycle = CurrCycle;
   ResourceSegments::IntervalTy NewInterval =
       IntervalBuilder(RetCycle, AcquireAtCycle, ReleaseAtCycle);
@@ -4294,8 +4300,16 @@ unsigned ResourceSegments::getFirstAvailableAt(
 
 void ResourceSegments::add(ResourceSegments::IntervalTy A,
                            const unsigned CutOff) {
-  assert(A.first < A.second && "Cannot add empty resource usage");
+  assert(A.first <= A.second && "Cannot add negative resource usage");
   assert(CutOff > 0 && "0-size interval history has no use.");
+  // Zero resource usage is allowed by TargetSchedule.td, in the case that the
+  // instruction needed the resource to be available but does not use it.
+  // However, ResourceSegment represents an interval that is closed on the left
+  // and open on the right. It is impossible to represent an empty interval when
+  // the left is closed. Do not add it to Intervals.
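+  // For example, a resource acquired and released at the same cycle C would
+  // correspond to the half-open interval [C, C), which contains no cycles.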
+  if (A.first == A.second)
+    return;
+
   assert(all_of(_Intervals,
                 [&A](const ResourceSegments::IntervalTy &Interval) -> bool {
                   return !intersects(A, Interval);
diff --git a/llvm/unittests/CodeGen/SchedBoundary.cpp b/llvm/unittests/CodeGen/SchedBoundary.cpp
index 5ca6543f8e814..dbcb2e024ebf6 100644
--- a/llvm/unittests/CodeGen/SchedBoundary.cpp
+++ b/llvm/unittests/CodeGen/SchedBoundary.cpp
@@ -85,7 +85,8 @@ TEST(ResourceSegments, add_02) {
 #ifndef NDEBUG
 TEST(ResourceSegmentsDeath, add_empty) {
   auto X = ResourceSegments({{10, 20}, {30, 40}});
-  EXPECT_DEATH(X.add({22, 22}), "Cannot add empty resource usage");
+  X.add({22, 22});
+  EXPECT_EQ(X, ResourceSegments({{10, 20}, {30, 40}}));
 }
 #endif

From 048041f19719758976f454cb02b662f7d5ded1dd Mon Sep 17 00:00:00 2001
From: Schrodinger ZHU Yifan
Date: Wed, 24 Jan 2024 13:47:56 -0500
Subject: [PATCH 832/843] [libc] reland mincore (#79309)

---
 libc/config/linux/aarch64/entrypoints.txt     |   1 +
 libc/config/linux/riscv/entrypoints.txt       |   1 +
 libc/config/linux/x86_64/entrypoints.txt      |   1 +
 libc/spec/linux.td                            |  15 ++-
 libc/spec/spec.td                             |   2 +
 libc/src/sys/mman/CMakeLists.txt              |   7 +
 libc/src/sys/mman/linux/CMakeLists.txt        |  13 ++
 libc/src/sys/mman/linux/mincore.cpp           |  28 ++++
 libc/src/sys/mman/mincore.h                   |  20 +++
 libc/test/src/sys/mman/linux/CMakeLists.txt   |  18 +++
 libc/test/src/sys/mman/linux/mincore_test.cpp | 127 ++++++++++++++++++
 11 files changed, 232 insertions(+), 1 deletion(-)
 create mode 100644 libc/src/sys/mman/linux/mincore.cpp
 create mode 100644 libc/src/sys/mman/mincore.h
 create mode 100644 libc/test/src/sys/mman/linux/mincore_test.cpp

diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt
index 3f66a582f5e3e..52f1d2bce34f4 100644
--- a/libc/config/linux/aarch64/entrypoints.txt
+++ b/libc/config/linux/aarch64/entrypoints.txt
@@ -136,6 +136,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.mman.mprotect
     libc.src.sys.mman.munmap
     libc.src.sys.mman.posix_madvise
+    libc.src.sys.mman.mincore
 
     # sys/random.h entrypoints
     libc.src.sys.random.getrandom
diff --git a/libc/config/linux/riscv/entrypoints.txt b/libc/config/linux/riscv/entrypoints.txt
index 0331ef782cf74..a257f3f8d64ab 100644
--- a/libc/config/linux/riscv/entrypoints.txt
+++ b/libc/config/linux/riscv/entrypoints.txt
@@ -142,6 +142,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.mman.mprotect
     libc.src.sys.mman.munmap
     libc.src.sys.mman.posix_madvise
+    libc.src.sys.mman.mincore
 
     # sys/random.h entrypoints
     libc.src.sys.random.getrandom
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index a1486ac689812..5b43188e75cd7 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -142,6 +142,7 @@ set(TARGET_LIBC_ENTRYPOINTS
     libc.src.sys.mman.mprotect
     libc.src.sys.mman.munmap
     libc.src.sys.mman.posix_madvise
+    libc.src.sys.mman.mincore
 
     # sys/random.h entrypoints
     libc.src.sys.random.getrandom
diff --git a/libc/spec/linux.td b/libc/spec/linux.td
index ba5f99c12ecd1..eab0a987b920c 100644
--- a/libc/spec/linux.td
+++ b/libc/spec/linux.td
@@ -76,7 +76,20 @@ def Linux : StandardSpec<"Linux"> {
 
   HeaderSpec SysMMan = HeaderSpec<
       "sys/mman.h",
-      [Macro<"MAP_ANONYMOUS">]
+      [Macro<"MAP_ANONYMOUS">],
+      [], // Types
+      [], // Enumerations
+      [
+        FunctionSpec<
+          "mincore",
+          RetValSpec<IntType>,
+          [
+            ArgSpec<VoidPtr>,
+            ArgSpec<SizeTType>,
+            ArgSpec<UnsignedCharPtr>,
+          ]
+        >,
+      ] // Functions
   >;
 
diff --git a/libc/spec/spec.td b/libc/spec/spec.td
index 9b689b5eb502a..818cfaee6b61c 100644
--- a/libc/spec/spec.td
+++ b/libc/spec/spec.td
@@ -49,6 +49,7 @@ def FloatType : NamedType<"float">;
 def DoubleType : NamedType<"double">;
 def LongDoubleType : NamedType<"long double">;
 def CharType : NamedType<"char">;
+def UnsignedCharType : NamedType<"unsigned char">;
 
 // TODO: Add compatibility layer to use C23 type _Float128 if possible.
 def Float128Type : NamedType<"__float128">;
@@ -109,6 +110,7 @@ def IntPtr : PtrType<IntType>;
 def RestrictedIntPtr : RestrictedPtrType<IntType>;
 def FloatPtr : PtrType<FloatType>;
 def DoublePtr : PtrType<DoubleType>;
+def UnsignedCharPtr : PtrType<UnsignedCharType>;
 
 def SigHandlerT : NamedType<"__sighandler_t">;
 
diff --git a/libc/src/sys/mman/CMakeLists.txt b/libc/src/sys/mman/CMakeLists.txt
index e336bfd5d6dbc..2d17429a26b45 100644
--- a/libc/src/sys/mman/CMakeLists.txt
+++ b/libc/src/sys/mman/CMakeLists.txt
@@ -36,3 +36,10 @@ add_entrypoint_object(
   DEPENDS
     .${LIBC_TARGET_OS}.posix_madvise
 )
+
+add_entrypoint_object(
+  mincore
+  ALIAS
+  DEPENDS
+    .${LIBC_TARGET_OS}.mincore
+)
diff --git a/libc/src/sys/mman/linux/CMakeLists.txt b/libc/src/sys/mman/linux/CMakeLists.txt
index 163e7dead8887..ce0cda7f22277 100644
--- a/libc/src/sys/mman/linux/CMakeLists.txt
+++ b/libc/src/sys/mman/linux/CMakeLists.txt
@@ -61,3 +61,16 @@ add_entrypoint_object(
     libc.include.sys_syscall
     libc.src.__support.OSUtil.osutil
 )
+
+add_entrypoint_object(
+  mincore
+  SRCS
+    mincore.cpp
+  HDRS
+    ../mincore.h
+  DEPENDS
+    libc.include.sys_mman
+    libc.include.sys_syscall
+    libc.src.__support.OSUtil.osutil
+    libc.src.errno.errno
+)
diff --git a/libc/src/sys/mman/linux/mincore.cpp b/libc/src/sys/mman/linux/mincore.cpp
new file mode 100644
index 0000000000000..8220c69ef2cb7
--- /dev/null
+++ b/libc/src/sys/mman/linux/mincore.cpp
@@ -0,0 +1,28 @@
+//===---------- Linux implementation of the mincore function --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/sys/mman/mincore.h"
+
+#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
+
+#include "src/errno/libc_errno.h"
+#include <sys/syscall.h> // For syscall numbers.
+
+namespace LIBC_NAMESPACE {
+
+LLVM_LIBC_FUNCTION(int, mincore, (void *addr, size_t len, unsigned char *vec)) {
+  long ret = syscall_impl<long>(SYS_mincore, reinterpret_cast<long>(addr), len,
+                                reinterpret_cast<long>(vec));
+  if (ret < 0) {
+    libc_errno = static_cast<int>(-ret);
+    return -1;
+  }
+  return 0;
+}
+
+} // namespace LIBC_NAMESPACE
diff --git a/libc/src/sys/mman/mincore.h b/libc/src/sys/mman/mincore.h
new file mode 100644
index 0000000000000..403afaeb6af97
--- /dev/null
+++ b/libc/src/sys/mman/mincore.h
@@ -0,0 +1,20 @@
+//===-- Implementation header for mincore function --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_SYS_MMAN_MINCORE_H
+#define LLVM_LIBC_SRC_SYS_MMAN_MINCORE_H
+
+#include <stddef.h> // For size_t
+
+namespace LIBC_NAMESPACE {
+
+int mincore(void *addr, size_t len, unsigned char *vec);
+
+} // namespace LIBC_NAMESPACE
+
+#endif // LLVM_LIBC_SRC_SYS_MMAN_MINCORE_H
diff --git a/libc/test/src/sys/mman/linux/CMakeLists.txt b/libc/test/src/sys/mman/linux/CMakeLists.txt
index 66743be175fed..36557695e503c 100644
--- a/libc/test/src/sys/mman/linux/CMakeLists.txt
+++ b/libc/test/src/sys/mman/linux/CMakeLists.txt
@@ -62,3 +62,21 @@ add_libc_unittest(
     libc.src.sys.mman.posix_madvise
     libc.test.UnitTest.ErrnoSetterMatcher
 )
+
+add_libc_unittest(
+  mincore_test
+  SUITE
+    libc_sys_mman_unittests
+  SRCS
+    mincore_test.cpp
+  DEPENDS
+    libc.include.sys_mman
+    libc.include.unistd
+    libc.src.errno.errno
+    libc.src.sys.mman.mmap
+    libc.src.sys.mman.munmap
+    libc.src.sys.mman.madvise
+    libc.src.sys.mman.mincore
+    libc.src.unistd.sysconf
+    libc.test.UnitTest.ErrnoSetterMatcher
+)
diff --git a/libc/test/src/sys/mman/linux/mincore_test.cpp b/libc/test/src/sys/mman/linux/mincore_test.cpp
new file mode 100644
index 0000000000000..52fc7bcf44f78
--- /dev/null
+++ b/libc/test/src/sys/mman/linux/mincore_test.cpp
@@ -0,0 +1,127 @@
+//===-- Unittests for mincore ---------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/OSUtil/syscall.h" // For internal syscall function.
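+// The PageOut test below drives mlock/munlock through raw syscalls for now;
+// see the TODO in that test about moving to proper wrappers once they exist.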
+#include "src/errno/libc_errno.h" +#include "src/sys/mman/madvise.h" +#include "src/sys/mman/mincore.h" +#include "src/sys/mman/mmap.h" +#include "src/sys/mman/munmap.h" +#include "src/unistd/sysconf.h" +#include "test/UnitTest/ErrnoSetterMatcher.h" +#include "test/UnitTest/LibcTest.h" +#include "test/UnitTest/Test.h" + +#include +#include +#include + +using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Fails; +using LIBC_NAMESPACE::testing::ErrnoSetterMatcher::Succeeds; + +TEST(LlvmLibcMincoreTest, UnMappedMemory) { + libc_errno = 0; + unsigned char vec; + int res = LIBC_NAMESPACE::mincore(nullptr, 1, &vec); + EXPECT_THAT(res, Fails(ENOMEM, -1)); +} + +TEST(LlvmLibcMincoreTest, UnalignedAddr) { + unsigned long page_size = LIBC_NAMESPACE::sysconf(_SC_PAGESIZE); + void *addr = LIBC_NAMESPACE::mmap(nullptr, page_size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + EXPECT_NE(addr, MAP_FAILED); + EXPECT_EQ(reinterpret_cast(addr) % page_size, 0ul); + libc_errno = 0; + int res = LIBC_NAMESPACE::mincore(static_cast(addr) + 1, 1, nullptr); + EXPECT_THAT(res, Fails(EINVAL, -1)); + EXPECT_THAT(LIBC_NAMESPACE::munmap(addr, page_size), Succeeds()); +} + +TEST(LlvmLibcMincoreTest, InvalidVec) { + unsigned long page_size = LIBC_NAMESPACE::sysconf(_SC_PAGESIZE); + void *addr = LIBC_NAMESPACE::mmap(nullptr, 4 * page_size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + EXPECT_NE(addr, MAP_FAILED); + EXPECT_EQ(reinterpret_cast(addr) % page_size, 0ul); + libc_errno = 0; + int res = LIBC_NAMESPACE::mincore(addr, 1, nullptr); + EXPECT_THAT(res, Fails(EFAULT, -1)); + void *area = LIBC_NAMESPACE::mmap(nullptr, page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + EXPECT_NE(area, MAP_FAILED); + unsigned char *ptr = static_cast(area) + page_size - 3; + res = LIBC_NAMESPACE::mincore(addr, 4 * page_size, ptr); + EXPECT_THAT(res, Fails(EFAULT, -1)); + EXPECT_THAT(LIBC_NAMESPACE::munmap(addr, page_size), Succeeds()); + EXPECT_THAT(LIBC_NAMESPACE::munmap(area, 2), Succeeds()); +} + +TEST(LlvmLibcMincoreTest, NoError) { + unsigned long page_size = LIBC_NAMESPACE::sysconf(_SC_PAGESIZE); + void *addr = LIBC_NAMESPACE::mmap(nullptr, page_size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + EXPECT_NE(addr, MAP_FAILED); + EXPECT_EQ(reinterpret_cast(addr) % page_size, 0ul); + unsigned char vec; + libc_errno = 0; + int res = LIBC_NAMESPACE::mincore(addr, 1, &vec); + EXPECT_THAT(res, Succeeds()); + EXPECT_THAT(LIBC_NAMESPACE::munmap(addr, page_size), Succeeds()); +} + +TEST(LlvmLibcMincoreTest, NegativeLength) { + unsigned long page_size = LIBC_NAMESPACE::sysconf(_SC_PAGESIZE); + void *addr = LIBC_NAMESPACE::mmap(nullptr, page_size, PROT_READ, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + EXPECT_NE(addr, MAP_FAILED); + EXPECT_EQ(reinterpret_cast(addr) % page_size, 0ul); + unsigned char vec; + libc_errno = 0; + int res = LIBC_NAMESPACE::mincore(addr, -1, &vec); + EXPECT_THAT(res, Fails(ENOMEM, -1)); + EXPECT_THAT(LIBC_NAMESPACE::munmap(addr, page_size), Succeeds()); +} + +TEST(LlvmLibcMincoreTest, PageOut) { + unsigned long page_size = LIBC_NAMESPACE::sysconf(_SC_PAGESIZE); + unsigned char vec; + void *addr = LIBC_NAMESPACE::mmap(nullptr, page_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + EXPECT_NE(addr, MAP_FAILED); + EXPECT_EQ(reinterpret_cast(addr) % page_size, 0ul); + + // touch the page + { + static_cast(addr)[0] = 0; + // TODO: use wrapper functions for mlock/munlock once implemented. 
+    LIBC_NAMESPACE::syscall_impl<long>(
+        SYS_mlock, reinterpret_cast<long>(addr), page_size);
+    libc_errno = 0;
+    int res = LIBC_NAMESPACE::mincore(addr, 1, &vec);
+    EXPECT_EQ(vec & 1u, 1u);
+    EXPECT_THAT(res, Succeeds());
+    LIBC_NAMESPACE::syscall_impl<long>(
+        SYS_munlock, reinterpret_cast<long>(addr), page_size);
+  }
+
+  // page out the memory
+  {
+    libc_errno = 0;
+    EXPECT_THAT(LIBC_NAMESPACE::madvise(addr, page_size, MADV_DONTNEED),
+                Succeeds());
+
+    libc_errno = 0;
+    int res = LIBC_NAMESPACE::mincore(addr, page_size, &vec);
+    EXPECT_EQ(vec & 1u, 0u);
+    EXPECT_THAT(res, Succeeds());
+  }
+
+  EXPECT_THAT(LIBC_NAMESPACE::munmap(addr, page_size), Succeeds());
+}

From be08be5d5de97cd593fb99affa1fa994d104eb70 Mon Sep 17 00:00:00 2001
From: Nico Weber
Date: Wed, 24 Jan 2024 13:52:10 -0500
Subject: [PATCH 833/843] Revert "[CMake/Bazel] Support usage of opt driver as a library (#79205)"

This reverts commit 32f7922646d5903f63d16c9fbfe3d508b0f8cda7.
Doesn't build, see
https://github.com/llvm/llvm-project/pull/79205#issuecomment-1908730527

---
 llvm/tools/opt/CMakeLists.txt                 |  16 +-
 llvm/tools/opt/NewPMDriver.cpp                |   9 +-
 llvm/tools/opt/NewPMDriver.h                  |  22 +-
 llvm/tools/opt/opt.cpp                        | 913 ++++++++++++++++-
 llvm/tools/opt/optdriver.cpp                  | 923 ------------------
 .../llvm-project-overlay/llvm/BUILD.bazel     |  11 +-
 6 files changed, 922 insertions(+), 972 deletions(-)
 delete mode 100644 llvm/tools/opt/optdriver.cpp

diff --git a/llvm/tools/opt/CMakeLists.txt b/llvm/tools/opt/CMakeLists.txt
index f3dd95e178e31..2cdb99e936287 100644
--- a/llvm/tools/opt/CMakeLists.txt
+++ b/llvm/tools/opt/CMakeLists.txt
@@ -29,24 +29,12 @@ set(LLVM_LINK_COMPONENTS
   Passes
 )
 
-# We don't want to link this into libLLVM
-add_llvm_library(LLVMOptDriver
-  STATIC
+add_llvm_tool(opt
   NewPMDriver.cpp
-  optdriver.cpp
-  PARTIAL_SOURCES_INTENDED
-  DEPENDS
-  intrinsics_gen
-)
+  opt.cpp
 
-add_llvm_tool(opt
-  PARTIAL_SOURCES_INTENDED
-  opt.cpp
   DEPENDS
   intrinsics_gen
   SUPPORT_PLUGINS
-
   )
-target_link_libraries(opt PRIVATE LLVMOptDriver)
-
 export_executable_symbols_for_plugins(opt)
diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index fdfb4df53273f..6ae3f87099afd 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -328,9 +328,8 @@ bool llvm::runPassPipeline(
     StringRef Arg0, Module &M, TargetMachine *TM, TargetLibraryInfoImpl *TLII,
     ToolOutputFile *Out, ToolOutputFile *ThinLTOLinkOut,
     ToolOutputFile *OptRemarkFile, StringRef PassPipeline,
-    ArrayRef<PassPlugin> PassPlugins,
-    ArrayRef> PassBuilderCallbacks,
-    OutputKind OK, VerifierKind VK, bool ShouldPreserveAssemblyUseListOrder,
+    ArrayRef<PassPlugin> PassPlugins, OutputKind OK, VerifierKind VK,
+    bool ShouldPreserveAssemblyUseListOrder,
     bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex,
     bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve,
     bool UnifiedLTO) {
@@ -429,10 +428,6 @@ bool llvm::runPassPipeline(
   for (auto &PassPlugin : PassPlugins)
     PassPlugin.registerPassBuilderCallbacks(PB);
 
-  // Load any explicitly specified plugins.
- for (auto &PassCallback : PassBuilderCallbacks) - PassCallback(PB); - #define HANDLE_EXTENSION(Ext) \ get##Ext##PluginInfo().RegisterPassBuilderCallbacks(PB); #include "llvm/Support/Extension.def" diff --git a/llvm/tools/opt/NewPMDriver.h b/llvm/tools/opt/NewPMDriver.h index 19cabd15436eb..3230c27c2d7a8 100644 --- a/llvm/tools/opt/NewPMDriver.h +++ b/llvm/tools/opt/NewPMDriver.h @@ -23,7 +23,6 @@ #include "llvm/Support/CommandLine.h" namespace llvm { -class PassBuilder; class StringRef; class Module; class PassPlugin; @@ -65,17 +64,16 @@ void printPasses(raw_ostream &OS); /// /// ThinLTOLinkOut is only used when OK is OK_OutputThinLTOBitcode, and can be /// nullptr. -bool runPassPipeline( - StringRef Arg0, Module &M, TargetMachine *TM, TargetLibraryInfoImpl *TLII, - ToolOutputFile *Out, ToolOutputFile *ThinLinkOut, - ToolOutputFile *OptRemarkFile, StringRef PassPipeline, - ArrayRef PassPlugins, - ArrayRef> PassBuilderCallbacks, - opt_tool::OutputKind OK, opt_tool::VerifierKind VK, - bool ShouldPreserveAssemblyUseListOrder, - bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex, - bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve, - bool UnifiedLTO = false); +bool runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM, + TargetLibraryInfoImpl *TLII, ToolOutputFile *Out, + ToolOutputFile *ThinLinkOut, ToolOutputFile *OptRemarkFile, + StringRef PassPipeline, ArrayRef PassPlugins, + opt_tool::OutputKind OK, opt_tool::VerifierKind VK, + bool ShouldPreserveAssemblyUseListOrder, + bool ShouldPreserveBitcodeUseListOrder, + bool EmitSummaryIndex, bool EmitModuleHash, + bool EnableDebugify, bool VerifyDIPreserve, + bool UnifiedLTO = false); } // namespace llvm #endif diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index fc8bd665cbbe9..8c0e98976e602 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -11,15 +11,912 @@ // //===----------------------------------------------------------------------===// -#include "llvm/ADT/ArrayRef.h" -#include +#include "NewPMDriver.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/RegionPass.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/CodeGen/CommandFlags.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LLVMRemarkStreamer.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/LegacyPassNameParser.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndex.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/InitializePasses.h" +#include "llvm/LinkAllIR.h" +#include "llvm/LinkAllPasses.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Passes/PassPlugin.h" +#include "llvm/Remarks/HotnessThresholdParser.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/PluginLoader.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/SystemUtils.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/SubtargetFeature.h" 
+#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/IPO/WholeProgramDevirt.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Debugify.h" +#include +#include +#include +using namespace llvm; +using namespace opt_tool; -namespace llvm { -class PassBuilder; +static codegen::RegisterCodeGenFlags CFG; + +// The OptimizationList is automatically populated with registered Passes by the +// PassNameParser. +static cl::list PassList(cl::desc( + "Optimizations available (use '-passes=' for the new pass manager)")); + +static cl::opt EnableLegacyPassManager( + "bugpoint-enable-legacy-pm", + cl::desc( + "Enable the legacy pass manager. This is strictly for bugpoint " + "due to it not working with the new PM, please do not use otherwise."), + cl::init(false)); + +// This flag specifies a textual description of the optimization pass pipeline +// to run over the module. This flag switches opt to use the new pass manager +// infrastructure, completely disabling all of the flags specific to the old +// pass management. +static cl::opt PassPipeline( + "passes", + cl::desc( + "A textual description of the pass pipeline. To have analysis passes " + "available before a certain pass, add 'require'.")); +static cl::alias PassPipeline2("p", cl::aliasopt(PassPipeline), + cl::desc("Alias for -passes")); + +static cl::opt PrintPasses("print-passes", + cl::desc("Print available passes that can be " + "specified in -passes=foo and exit")); + +static cl::opt +InputFilename(cl::Positional, cl::desc(""), + cl::init("-"), cl::value_desc("filename")); + +static cl::opt +OutputFilename("o", cl::desc("Override output filename"), + cl::value_desc("filename")); + +static cl::opt +Force("f", cl::desc("Enable binary output on terminals")); + +static cl::opt +NoOutput("disable-output", + cl::desc("Do not write result bitcode file"), cl::Hidden); + +static cl::opt +OutputAssembly("S", cl::desc("Write output as LLVM assembly")); + +static cl::opt + OutputThinLTOBC("thinlto-bc", + cl::desc("Write output as ThinLTO-ready bitcode")); + +static cl::opt + SplitLTOUnit("thinlto-split-lto-unit", + cl::desc("Enable splitting of a ThinLTO LTOUnit")); + +static cl::opt + UnifiedLTO("unified-lto", + cl::desc("Use unified LTO piplines. Ignored unless -thinlto-bc " + "is also specified."), + cl::Hidden, cl::init(false)); + +static cl::opt ThinLinkBitcodeFile( + "thin-link-bitcode-file", cl::value_desc("filename"), + cl::desc( + "A file in which to write minimized bitcode for the thin link only")); + +static cl::opt +NoVerify("disable-verify", cl::desc("Do not run the verifier"), cl::Hidden); + +static cl::opt NoUpgradeDebugInfo("disable-upgrade-debug-info", + cl::desc("Generate invalid output"), + cl::ReallyHidden); + +static cl::opt VerifyEach("verify-each", + cl::desc("Verify after each transform")); + +static cl::opt + DisableDITypeMap("disable-debug-info-type-map", + cl::desc("Don't use a uniquing type map for debug info")); + +static cl::opt +StripDebug("strip-debug", + cl::desc("Strip debugger symbol info from translation unit")); + +static cl::opt + StripNamedMetadata("strip-named-metadata", + cl::desc("Strip module-level named metadata")); + +static cl::opt + OptLevelO0("O0", cl::desc("Optimization level 0. Similar to clang -O0. " + "Same as -passes='default'")); + +static cl::opt + OptLevelO1("O1", cl::desc("Optimization level 1. Similar to clang -O1. " + "Same as -passes='default'")); + +static cl::opt + OptLevelO2("O2", cl::desc("Optimization level 2. Similar to clang -O2. 
" + "Same as -passes='default'")); + +static cl::opt + OptLevelOs("Os", cl::desc("Like -O2 but size-conscious. Similar to clang " + "-Os. Same as -passes='default'")); + +static cl::opt OptLevelOz( + "Oz", + cl::desc("Like -O2 but optimize for code size above all else. Similar to " + "clang -Oz. Same as -passes='default'")); + +static cl::opt + OptLevelO3("O3", cl::desc("Optimization level 3. Similar to clang -O3. " + "Same as -passes='default'")); + +static cl::opt CodeGenOptLevelCL( + "codegen-opt-level", + cl::desc("Override optimization level for codegen hooks, legacy PM only")); + +static cl::opt +TargetTriple("mtriple", cl::desc("Override target triple for module")); + +static cl::opt EmitSummaryIndex("module-summary", + cl::desc("Emit module summary index"), + cl::init(false)); + +static cl::opt EmitModuleHash("module-hash", cl::desc("Emit module hash"), + cl::init(false)); + +static cl::opt +DisableSimplifyLibCalls("disable-simplify-libcalls", + cl::desc("Disable simplify-libcalls")); + +static cl::list DisableBuiltins( + "disable-builtin", + cl::desc("Disable specific target library builtin function")); + +static cl::opt EnableDebugify( + "enable-debugify", + cl::desc( + "Start the pipeline with debugify and end it with check-debugify")); + +static cl::opt VerifyDebugInfoPreserve( + "verify-debuginfo-preserve", + cl::desc("Start the pipeline with collecting and end it with checking of " + "debug info preservation.")); + +static cl::opt ClDataLayout("data-layout", + cl::desc("data layout string to use"), + cl::value_desc("layout-string"), + cl::init("")); + +static cl::opt PreserveBitcodeUseListOrder( + "preserve-bc-uselistorder", + cl::desc("Preserve use-list order when writing LLVM bitcode."), + cl::init(true), cl::Hidden); + +static cl::opt PreserveAssemblyUseListOrder( + "preserve-ll-uselistorder", + cl::desc("Preserve use-list order when writing LLVM assembly."), + cl::init(false), cl::Hidden); + +static cl::opt RunTwice("run-twice", + cl::desc("Run all passes twice, re-using the " + "same pass manager (legacy PM only)."), + cl::init(false), cl::Hidden); + +static cl::opt DiscardValueNames( + "discard-value-names", + cl::desc("Discard names from Value (other than GlobalValue)."), + cl::init(false), cl::Hidden); + +static cl::opt TimeTrace( + "time-trace", + cl::desc("Record time trace")); + +static cl::opt TimeTraceGranularity( + "time-trace-granularity", + cl::desc("Minimum time granularity (in microseconds) traced by time profiler"), + cl::init(500), cl::Hidden); + +static cl::opt + TimeTraceFile("time-trace-file", + cl::desc("Specify time trace file destination"), + cl::value_desc("filename")); + +static cl::opt RemarksWithHotness( + "pass-remarks-with-hotness", + cl::desc("With PGO, include profile count in optimization remarks"), + cl::Hidden); + +static cl::opt, false, remarks::HotnessThresholdParser> + RemarksHotnessThreshold( + "pass-remarks-hotness-threshold", + cl::desc("Minimum profile count required for " + "an optimization remark to be output. 
" + "Use 'auto' to apply the threshold from profile summary"), + cl::value_desc("N or 'auto'"), cl::init(0), cl::Hidden); + +static cl::opt + RemarksFilename("pass-remarks-output", + cl::desc("Output filename for pass remarks"), + cl::value_desc("filename")); + +static cl::opt + RemarksPasses("pass-remarks-filter", + cl::desc("Only record optimization remarks from passes whose " + "names match the given regular expression"), + cl::value_desc("regex")); + +static cl::opt RemarksFormat( + "pass-remarks-format", + cl::desc("The format used for serializing remarks (default: YAML)"), + cl::value_desc("format"), cl::init("yaml")); + +static cl::list + PassPlugins("load-pass-plugin", + cl::desc("Load passes from plugin library")); + +static cl::opt TryUseNewDbgInfoFormat( + "try-experimental-debuginfo-iterators", + cl::desc("Enable debuginfo iterator positions, if they're built in"), + cl::init(false), cl::Hidden); + +extern cl::opt UseNewDbgInfoFormat; + +//===----------------------------------------------------------------------===// +// CodeGen-related helper functions. +// + +static CodeGenOptLevel GetCodeGenOptLevel() { + return static_cast(unsigned(CodeGenOptLevelCL)); } -extern "C" int optMain(int argc, char **argv, - llvm::ArrayRef> - PassBuilderCallbacks); +struct TimeTracerRAII { + TimeTracerRAII(StringRef ProgramName) { + if (TimeTrace) + timeTraceProfilerInitialize(TimeTraceGranularity, ProgramName); + } + ~TimeTracerRAII() { + if (TimeTrace) { + if (auto E = timeTraceProfilerWrite(TimeTraceFile, OutputFilename)) { + handleAllErrors(std::move(E), [&](const StringError &SE) { + errs() << SE.getMessage() << "\n"; + }); + return; + } + timeTraceProfilerCleanup(); + } + } +}; -int main(int argc, char **argv) { return optMain(argc, argv, {}); } +// For use in NPM transition. Currently this contains most codegen-specific +// passes. Remove passes from here when porting to the NPM. +// TODO: use a codegen version of PassRegistry.def/PassBuilder::is*Pass() once +// it exists. 
+static bool shouldPinPassToLegacyPM(StringRef Pass) { + std::vector PassNameExactToIgnore = { + "nvvm-reflect", + "nvvm-intr-range", + "amdgpu-simplifylib", + "amdgpu-image-intrinsic-opt", + "amdgpu-usenative", + "amdgpu-promote-alloca", + "amdgpu-promote-alloca-to-vector", + "amdgpu-lower-kernel-attributes", + "amdgpu-propagate-attributes-early", + "amdgpu-propagate-attributes-late", + "amdgpu-unify-metadata", + "amdgpu-printf-runtime-binding", + "amdgpu-always-inline"}; + if (llvm::is_contained(PassNameExactToIgnore, Pass)) + return false; + + std::vector PassNamePrefix = { + "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", + "nvptx-", "mips-", "lanai-", "hexagon-", "bpf-", "avr-", + "thumb2-", "arm-", "si-", "gcn-", "amdgpu-", "aarch64-", + "amdgcn-", "polly-", "riscv-", "dxil-"}; + std::vector PassNameContain = {"-eh-prepare"}; + std::vector PassNameExact = { + "safe-stack", + "cost-model", + "codegenprepare", + "interleaved-load-combine", + "unreachableblockelim", + "verify-safepoint-ir", + "atomic-expand", + "expandvp", + "mve-tail-predication", + "interleaved-access", + "global-merge", + "pre-isel-intrinsic-lowering", + "expand-reductions", + "indirectbr-expand", + "generic-to-nvvm", + "expand-memcmp", + "loop-reduce", + "lower-amx-type", + "lower-amx-intrinsics", + "polyhedral-info", + "print-polyhedral-info", + "replace-with-veclib", + "jmc-instrumenter", + "dot-regions", + "dot-regions-only", + "view-regions", + "view-regions-only", + "select-optimize", + "expand-large-div-rem", + "structurizecfg", + "fix-irreducible", + "expand-large-fp-convert", + "callbrprepare", + }; + for (const auto &P : PassNamePrefix) + if (Pass.starts_with(P)) + return true; + for (const auto &P : PassNameContain) + if (Pass.contains(P)) + return true; + return llvm::is_contained(PassNameExact, Pass); +} + +// For use in NPM transition. +static bool shouldForceLegacyPM() { + for (const auto &P : PassList) { + StringRef Arg = P->getPassArgument(); + if (shouldPinPassToLegacyPM(Arg)) + return true; + } + return false; +} + +//===----------------------------------------------------------------------===// +// main for opt +// +int main(int argc, char **argv) { + InitLLVM X(argc, argv); + + // Enable debug stream buffering. + EnableDebugBuffering = true; + + InitializeAllTargets(); + InitializeAllTargetMCs(); + InitializeAllAsmPrinters(); + InitializeAllAsmParsers(); + + // Initialize passes + PassRegistry &Registry = *PassRegistry::getPassRegistry(); + initializeCore(Registry); + initializeScalarOpts(Registry); + initializeVectorization(Registry); + initializeIPO(Registry); + initializeAnalysis(Registry); + initializeTransformUtils(Registry); + initializeInstCombine(Registry); + initializeTarget(Registry); + // For codegen passes, only passes that do IR to IR transformation are + // supported. 
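+  // Each initialize*Pass() call below registers one such IR-to-IR pass with
+  // the global PassRegistry so that it can be named on the command line.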
+ initializeExpandLargeDivRemLegacyPassPass(Registry); + initializeExpandLargeFpConvertLegacyPassPass(Registry); + initializeExpandMemCmpLegacyPassPass(Registry); + initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry); + initializeSelectOptimizePass(Registry); + initializeCallBrPreparePass(Registry); + initializeCodeGenPrepareLegacyPassPass(Registry); + initializeAtomicExpandPass(Registry); + initializeWinEHPreparePass(Registry); + initializeDwarfEHPrepareLegacyPassPass(Registry); + initializeSafeStackLegacyPassPass(Registry); + initializeSjLjEHPreparePass(Registry); + initializePreISelIntrinsicLoweringLegacyPassPass(Registry); + initializeGlobalMergePass(Registry); + initializeIndirectBrExpandLegacyPassPass(Registry); + initializeInterleavedLoadCombinePass(Registry); + initializeInterleavedAccessPass(Registry); + initializeUnreachableBlockElimLegacyPassPass(Registry); + initializeExpandReductionsPass(Registry); + initializeExpandVectorPredicationPass(Registry); + initializeWasmEHPreparePass(Registry); + initializeWriteBitcodePassPass(Registry); + initializeReplaceWithVeclibLegacyPass(Registry); + initializeJMCInstrumenterPass(Registry); + + SmallVector PluginList; + PassPlugins.setCallback([&](const std::string &PluginPath) { + auto Plugin = PassPlugin::Load(PluginPath); + if (!Plugin) + report_fatal_error(Plugin.takeError(), /*gen_crash_diag=*/false); + PluginList.emplace_back(Plugin.get()); + }); + + // Register the Target and CPU printer for --version. + cl::AddExtraVersionPrinter(sys::printDefaultTargetAndDetectedCPU); + + cl::ParseCommandLineOptions(argc, argv, + "llvm .bc -> .bc modular optimizer and analysis printer\n"); + + // RemoveDIs debug-info transition: tests may request that we /try/ to use the + // new debug-info format, if it's built in. +#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + if (TryUseNewDbgInfoFormat) { + // If LLVM was built with support for this, turn the new debug-info format + // on. + UseNewDbgInfoFormat = true; + } +#endif + (void)TryUseNewDbgInfoFormat; + + LLVMContext Context; + + // TODO: remove shouldForceLegacyPM(). + const bool UseNPM = (!EnableLegacyPassManager && !shouldForceLegacyPM()) || + PassPipeline.getNumOccurrences() > 0; + + if (UseNPM && !PassList.empty()) { + errs() << "The `opt -passname` syntax for the new pass manager is " + "not supported, please use `opt -passes=` (or the `-p` " + "alias for a more concise version).\n"; + errs() << "See https://llvm.org/docs/NewPassManager.html#invoking-opt " + "for more details on the pass pipeline syntax.\n\n"; + return 1; + } + + if (!UseNPM && PluginList.size()) { + errs() << argv[0] << ": " << PassPlugins.ArgStr + << " specified with legacy PM.\n"; + return 1; + } + + // FIXME: once the legacy PM code is deleted, move runPassPipeline() here and + // construct the PassBuilder before parsing IR so we can reuse the same + // PassBuilder for print passes. + if (PrintPasses) { + printPasses(outs()); + return 0; + } + + TimeTracerRAII TimeTracer(argv[0]); + + SMDiagnostic Err; + + Context.setDiscardValueNames(DiscardValueNames); + if (!DisableDITypeMap) + Context.enableDebugTypeODRUniquing(); + + Expected> RemarksFileOrErr = + setupLLVMOptimizationRemarks(Context, RemarksFilename, RemarksPasses, + RemarksFormat, RemarksWithHotness, + RemarksHotnessThreshold); + if (Error E = RemarksFileOrErr.takeError()) { + errs() << toString(std::move(E)) << '\n'; + return 1; + } + std::unique_ptr RemarksFile = std::move(*RemarksFileOrErr); + + // Load the input module... 
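+  // The data layout is resolved in priority order: an explicit -data-layout
+  // string, then a layout already recorded in the IR, then one inferred from
+  // the (command-line or IR) target triple; otherwise it is left empty.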
+ auto SetDataLayout = [&](StringRef IRTriple, + StringRef IRLayout) -> std::optional { + // Data layout specified on the command line has the highest priority. + if (!ClDataLayout.empty()) + return ClDataLayout; + // If an explicit data layout is already defined in the IR, don't infer. + if (!IRLayout.empty()) + return std::nullopt; + + // If an explicit triple was specified (either in the IR or on the + // command line), use that to infer the default data layout. However, the + // command line target triple should override the IR file target triple. + std::string TripleStr = + TargetTriple.empty() ? IRTriple.str() : Triple::normalize(TargetTriple); + // If the triple string is still empty, we don't fall back to + // sys::getDefaultTargetTriple() since we do not want to have differing + // behaviour dependent on the configured default triple. Therefore, if the + // user did not pass -mtriple or define an explicit triple/datalayout in + // the IR, we should default to an empty (default) DataLayout. + if (TripleStr.empty()) + return std::nullopt; + // Otherwise we infer the DataLayout from the target machine. + Expected> ExpectedTM = + codegen::createTargetMachineForTriple(TripleStr, GetCodeGenOptLevel()); + if (!ExpectedTM) { + errs() << argv[0] << ": warning: failed to infer data layout: " + << toString(ExpectedTM.takeError()) << "\n"; + return std::nullopt; + } + return (*ExpectedTM)->createDataLayout().getStringRepresentation(); + }; + std::unique_ptr M; + if (NoUpgradeDebugInfo) + M = parseAssemblyFileWithIndexNoUpgradeDebugInfo( + InputFilename, Err, Context, nullptr, SetDataLayout) + .Mod; + else + M = parseIRFile(InputFilename, Err, Context, + ParserCallbacks(SetDataLayout)); + + if (!M) { + Err.print(argv[0], errs()); + return 1; + } + + // Strip debug info before running the verifier. + if (StripDebug) + StripDebugInfo(*M); + + // Erase module-level named metadata, if requested. + if (StripNamedMetadata) { + while (!M->named_metadata_empty()) { + NamedMDNode *NMD = &*M->named_metadata_begin(); + M->eraseNamedMetadata(NMD); + } + } + + // If we are supposed to override the target triple, do so now. + if (!TargetTriple.empty()) + M->setTargetTriple(Triple::normalize(TargetTriple)); + + // Immediately run the verifier to catch any problems before starting up the + // pass pipelines. Otherwise we can crash on broken code during + // doInitialization(). + if (!NoVerify && verifyModule(*M, &errs())) { + errs() << argv[0] << ": " << InputFilename + << ": error: input module is broken!\n"; + return 1; + } + + // Enable testing of whole program devirtualization on this module by invoking + // the facility for updating public visibility to linkage unit visibility when + // specified by an internal option. This is normally done during LTO which is + // not performed via opt. + updateVCallVisibilityInModule( + *M, + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); + + // Figure out what stream we are supposed to write to... + std::unique_ptr Out; + std::unique_ptr ThinLinkOut; + if (NoOutput) { + if (!OutputFilename.empty()) + errs() << "WARNING: The -o (output filename) option is ignored when\n" + "the --disable-output option is used.\n"; + } else { + // Default to standard output. 
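+    // An output filename of "-" is the conventional spelling for stdout and is
+    // handled that way by ToolOutputFile.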
+ if (OutputFilename.empty()) + OutputFilename = "-"; + + std::error_code EC; + sys::fs::OpenFlags Flags = + OutputAssembly ? sys::fs::OF_TextWithCRLF : sys::fs::OF_None; + Out.reset(new ToolOutputFile(OutputFilename, EC, Flags)); + if (EC) { + errs() << EC.message() << '\n'; + return 1; + } + + if (!ThinLinkBitcodeFile.empty()) { + ThinLinkOut.reset( + new ToolOutputFile(ThinLinkBitcodeFile, EC, sys::fs::OF_None)); + if (EC) { + errs() << EC.message() << '\n'; + return 1; + } + } + } + + Triple ModuleTriple(M->getTargetTriple()); + std::string CPUStr, FeaturesStr; + std::unique_ptr TM; + if (ModuleTriple.getArch()) { + CPUStr = codegen::getCPUStr(); + FeaturesStr = codegen::getFeaturesStr(); + Expected> ExpectedTM = + codegen::createTargetMachineForTriple(ModuleTriple.str(), + GetCodeGenOptLevel()); + if (auto E = ExpectedTM.takeError()) { + errs() << argv[0] << ": WARNING: failed to create target machine for '" + << ModuleTriple.str() << "': " << toString(std::move(E)) << "\n"; + } else { + TM = std::move(*ExpectedTM); + } + } else if (ModuleTriple.getArchName() != "unknown" && + ModuleTriple.getArchName() != "") { + errs() << argv[0] << ": unrecognized architecture '" + << ModuleTriple.getArchName() << "' provided.\n"; + return 1; + } + + // Override function attributes based on CPUStr, FeaturesStr, and command line + // flags. + codegen::setFunctionAttributes(CPUStr, FeaturesStr, *M); + + // If the output is set to be emitted to standard out, and standard out is a + // console, print out a warning message and refuse to do it. We don't + // impress anyone by spewing tons of binary goo to a terminal. + if (!Force && !NoOutput && !OutputAssembly) + if (CheckBitcodeOutputToConsole(Out->os())) + NoOutput = true; + + if (OutputThinLTOBC) { + M->addModuleFlag(Module::Error, "EnableSplitLTOUnit", SplitLTOUnit); + if (UnifiedLTO) + M->addModuleFlag(Module::Error, "UnifiedLTO", 1); + } + + // Add an appropriate TargetLibraryInfo pass for the module's triple. + TargetLibraryInfoImpl TLII(ModuleTriple); + + // The -disable-simplify-libcalls flag actually disables all builtin optzns. + if (DisableSimplifyLibCalls) + TLII.disableAllFunctions(); + else { + // Disable individual builtin functions in TargetLibraryInfo. + LibFunc F; + for (auto &FuncName : DisableBuiltins) + if (TLII.getLibFunc(FuncName, F)) + TLII.setUnavailable(F); + else { + errs() << argv[0] << ": cannot disable nonexistent builtin function " + << FuncName << '\n'; + return 1; + } + } + + if (UseNPM) { + if (legacy::debugPassSpecified()) { + errs() << "-debug-pass does not work with the new PM, either use " + "-debug-pass-manager, or use the legacy PM\n"; + return 1; + } + auto NumOLevel = OptLevelO0 + OptLevelO1 + OptLevelO2 + OptLevelO3 + + OptLevelOs + OptLevelOz; + if (NumOLevel > 1) { + errs() << "Cannot specify multiple -O#\n"; + return 1; + } + if (NumOLevel > 0 && (PassPipeline.getNumOccurrences() > 0)) { + errs() << "Cannot specify -O# and --passes=/--foo-pass, use " + "-passes='default,other-pass'\n"; + return 1; + } + std::string Pipeline = PassPipeline; + + if (OptLevelO0) + Pipeline = "default"; + if (OptLevelO1) + Pipeline = "default"; + if (OptLevelO2) + Pipeline = "default"; + if (OptLevelO3) + Pipeline = "default"; + if (OptLevelOs) + Pipeline = "default"; + if (OptLevelOz) + Pipeline = "default"; + OutputKind OK = OK_NoOutput; + if (!NoOutput) + OK = OutputAssembly + ? OK_OutputAssembly + : (OutputThinLTOBC ? 
OK_OutputThinLTOBitcode : OK_OutputBitcode); + + VerifierKind VK = VK_VerifyOut; + if (NoVerify) + VK = VK_NoVerifier; + else if (VerifyEach) + VK = VK_VerifyEachPass; + + // The user has asked to use the new pass manager and provided a pipeline + // string. Hand off the rest of the functionality to the new code for that + // layer. + return runPassPipeline(argv[0], *M, TM.get(), &TLII, Out.get(), + ThinLinkOut.get(), RemarksFile.get(), Pipeline, + PluginList, OK, VK, PreserveAssemblyUseListOrder, + PreserveBitcodeUseListOrder, EmitSummaryIndex, + EmitModuleHash, EnableDebugify, + VerifyDebugInfoPreserve, UnifiedLTO) + ? 0 + : 1; + } + + if (OptLevelO0 || OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || + OptLevelO3) { + errs() << "Cannot use -O# with legacy PM.\n"; + return 1; + } + if (EmitSummaryIndex) { + errs() << "Cannot use -module-summary with legacy PM.\n"; + return 1; + } + if (EmitModuleHash) { + errs() << "Cannot use -module-hash with legacy PM.\n"; + return 1; + } + if (OutputThinLTOBC) { + errs() << "Cannot use -thinlto-bc with legacy PM.\n"; + return 1; + } + // Create a PassManager to hold and optimize the collection of passes we are + // about to build. If the -debugify-each option is set, wrap each pass with + // the (-check)-debugify passes. + DebugifyCustomPassManager Passes; + DebugifyStatsMap DIStatsMap; + DebugInfoPerPass DebugInfoBeforePass; + if (DebugifyEach) { + Passes.setDebugifyMode(DebugifyMode::SyntheticDebugInfo); + Passes.setDIStatsMap(DIStatsMap); + } else if (VerifyEachDebugInfoPreserve) { + Passes.setDebugifyMode(DebugifyMode::OriginalDebugInfo); + Passes.setDebugInfoBeforePass(DebugInfoBeforePass); + if (!VerifyDIPreserveExport.empty()) + Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); + } + + bool AddOneTimeDebugifyPasses = + (EnableDebugify && !DebugifyEach) || + (VerifyDebugInfoPreserve && !VerifyEachDebugInfoPreserve); + + Passes.add(new TargetLibraryInfoWrapperPass(TLII)); + + // Add internal analysis passes from the target machine. + Passes.add(createTargetTransformInfoWrapperPass(TM ? TM->getTargetIRAnalysis() + : TargetIRAnalysis())); + + if (AddOneTimeDebugifyPasses) { + if (EnableDebugify) { + Passes.setDIStatsMap(DIStatsMap); + Passes.add(createDebugifyModulePass()); + } else if (VerifyDebugInfoPreserve) { + Passes.setDebugInfoBeforePass(DebugInfoBeforePass); + Passes.add(createDebugifyModulePass( + DebugifyMode::OriginalDebugInfo, "", + &(Passes.getDebugInfoPerPass()))); + } + } + + if (TM) { + // FIXME: We should dyn_cast this when supported. + auto <M = static_cast(*TM); + Pass *TPC = LTM.createPassConfig(Passes); + Passes.add(TPC); + } + + // Create a new optimization pass for each one specified on the command line + for (unsigned i = 0; i < PassList.size(); ++i) { + const PassInfo *PassInf = PassList[i]; + if (PassInf->getNormalCtor()) { + Pass *P = PassInf->getNormalCtor()(); + if (P) { + // Add the pass to the pass manager. + Passes.add(P); + // If we are verifying all of the intermediate steps, add the verifier. 
+ if (VerifyEach) + Passes.add(createVerifierPass()); + } + } else + errs() << argv[0] << ": cannot create pass: " + << PassInf->getPassName() << "\n"; + } + + // Check that the module is well formed on completion of optimization + if (!NoVerify && !VerifyEach) + Passes.add(createVerifierPass()); + + if (AddOneTimeDebugifyPasses) { + if (EnableDebugify) + Passes.add(createCheckDebugifyModulePass(false)); + else if (VerifyDebugInfoPreserve) { + if (!VerifyDIPreserveExport.empty()) + Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); + Passes.add(createCheckDebugifyModulePass( + false, "", nullptr, DebugifyMode::OriginalDebugInfo, + &(Passes.getDebugInfoPerPass()), VerifyDIPreserveExport)); + } + } + + // In run twice mode, we want to make sure the output is bit-by-bit + // equivalent if we run the pass manager again, so setup two buffers and + // a stream to write to them. Note that llc does something similar and it + // may be worth to abstract this out in the future. + SmallVector Buffer; + SmallVector FirstRunBuffer; + std::unique_ptr BOS; + raw_ostream *OS = nullptr; + + const bool ShouldEmitOutput = !NoOutput; + + // Write bitcode or assembly to the output as the last step... + if (ShouldEmitOutput || RunTwice) { + assert(Out); + OS = &Out->os(); + if (RunTwice) { + BOS = std::make_unique(Buffer); + OS = BOS.get(); + } + if (OutputAssembly) + Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder)); + else + Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder)); + } + + // Before executing passes, print the final values of the LLVM options. + cl::PrintOptionValues(); + + if (!RunTwice) { + // Now that we have all of the passes ready, run them. + Passes.run(*M); + } else { + // If requested, run all passes twice with the same pass manager to catch + // bugs caused by persistent state in the passes. + std::unique_ptr M2(CloneModule(*M)); + // Run all passes on the original module first, so the second run processes + // the clone to catch CloneModule bugs. + Passes.run(*M); + FirstRunBuffer = Buffer; + Buffer.clear(); + + Passes.run(*M2); + + // Compare the two outputs and make sure they're the same + assert(Out); + if (Buffer.size() != FirstRunBuffer.size() || + (memcmp(Buffer.data(), FirstRunBuffer.data(), Buffer.size()) != 0)) { + errs() + << "Running the pass manager twice changed the output.\n" + "Writing the result of the second run to the specified output.\n" + "To generate the one-run comparison binary, just run without\n" + "the compile-twice option\n"; + if (ShouldEmitOutput) { + Out->os() << BOS->str(); + Out->keep(); + } + if (RemarksFile) + RemarksFile->keep(); + return 1; + } + if (ShouldEmitOutput) + Out->os() << BOS->str(); + } + + if (DebugifyEach && !DebugifyExport.empty()) + exportDebugifyStats(DebugifyExport, Passes.getDebugifyStatsMap()); + + // Declare success. + if (!NoOutput) + Out->keep(); + + if (RemarksFile) + RemarksFile->keep(); + + if (ThinLinkOut) + ThinLinkOut->keep(); + + return 0; +} diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp deleted file mode 100644 index 3f66bfc9f0176..0000000000000 --- a/llvm/tools/opt/optdriver.cpp +++ /dev/null @@ -1,923 +0,0 @@ -//===- optdriver.cpp - The LLVM Modular Optimizer -------------------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// Optimizations may be specified an arbitrary number of times on the command -// line, They are run in the order specified. Common driver library for re-use -// by potential downstream opt-variants. -// -//===----------------------------------------------------------------------===// - -#include "NewPMDriver.h" -#include "llvm/Analysis/CallGraph.h" -#include "llvm/Analysis/CallGraphSCCPass.h" -#include "llvm/Analysis/LoopPass.h" -#include "llvm/Analysis/RegionPass.h" -#include "llvm/Analysis/TargetLibraryInfo.h" -#include "llvm/Analysis/TargetTransformInfo.h" -#include "llvm/AsmParser/Parser.h" -#include "llvm/CodeGen/CommandFlags.h" -#include "llvm/CodeGen/TargetPassConfig.h" -#include "llvm/Config/llvm-config.h" -#include "llvm/IR/DataLayout.h" -#include "llvm/IR/DebugInfo.h" -#include "llvm/IR/LLVMContext.h" -#include "llvm/IR/LLVMRemarkStreamer.h" -#include "llvm/IR/LegacyPassManager.h" -#include "llvm/IR/LegacyPassNameParser.h" -#include "llvm/IR/Module.h" -#include "llvm/IR/ModuleSummaryIndex.h" -#include "llvm/IR/Verifier.h" -#include "llvm/IRReader/IRReader.h" -#include "llvm/InitializePasses.h" -#include "llvm/LinkAllIR.h" -#include "llvm/LinkAllPasses.h" -#include "llvm/MC/TargetRegistry.h" -#include "llvm/Passes/PassPlugin.h" -#include "llvm/Remarks/HotnessThresholdParser.h" -#include "llvm/Support/Debug.h" -#include "llvm/Support/ErrorHandling.h" -#include "llvm/Support/FileSystem.h" -#include "llvm/Support/InitLLVM.h" -#include "llvm/Support/PluginLoader.h" -#include "llvm/Support/SourceMgr.h" -#include "llvm/Support/SystemUtils.h" -#include "llvm/Support/TargetSelect.h" -#include "llvm/Support/ToolOutputFile.h" -#include "llvm/Support/YAMLTraits.h" -#include "llvm/Target/TargetMachine.h" -#include "llvm/TargetParser/Host.h" -#include "llvm/TargetParser/SubtargetFeature.h" -#include "llvm/TargetParser/Triple.h" -#include "llvm/Transforms/IPO/WholeProgramDevirt.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/Debugify.h" -#include -#include -#include -using namespace llvm; -using namespace opt_tool; - -static codegen::RegisterCodeGenFlags CFG; - -// The OptimizationList is automatically populated with registered Passes by the -// PassNameParser. -static cl::list PassList(cl::desc( - "Optimizations available (use '-passes=' for the new pass manager)")); - -static cl::opt EnableLegacyPassManager( - "bugpoint-enable-legacy-pm", - cl::desc( - "Enable the legacy pass manager. This is strictly for bugpoint " - "due to it not working with the new PM, please do not use otherwise."), - cl::init(false)); - -// This flag specifies a textual description of the optimization pass pipeline -// to run over the module. This flag switches opt to use the new pass manager -// infrastructure, completely disabling all of the flags specific to the old -// pass management. -static cl::opt PassPipeline( - "passes", - cl::desc( - "A textual description of the pass pipeline. 
To have analysis passes " - "available before a certain pass, add 'require'.")); -static cl::alias PassPipeline2("p", cl::aliasopt(PassPipeline), - cl::desc("Alias for -passes")); - -static cl::opt PrintPasses("print-passes", - cl::desc("Print available passes that can be " - "specified in -passes=foo and exit")); - -static cl::opt InputFilename(cl::Positional, - cl::desc(""), - cl::init("-"), - cl::value_desc("filename")); - -static cl::opt OutputFilename("o", - cl::desc("Override output filename"), - cl::value_desc("filename")); - -static cl::opt Force("f", cl::desc("Enable binary output on terminals")); - -static cl::opt NoOutput("disable-output", - cl::desc("Do not write result bitcode file"), - cl::Hidden); - -static cl::opt OutputAssembly("S", - cl::desc("Write output as LLVM assembly")); - -static cl::opt - OutputThinLTOBC("thinlto-bc", - cl::desc("Write output as ThinLTO-ready bitcode")); - -static cl::opt - SplitLTOUnit("thinlto-split-lto-unit", - cl::desc("Enable splitting of a ThinLTO LTOUnit")); - -static cl::opt - UnifiedLTO("unified-lto", - cl::desc("Use unified LTO piplines. Ignored unless -thinlto-bc " - "is also specified."), - cl::Hidden, cl::init(false)); - -static cl::opt ThinLinkBitcodeFile( - "thin-link-bitcode-file", cl::value_desc("filename"), - cl::desc( - "A file in which to write minimized bitcode for the thin link only")); - -static cl::opt NoVerify("disable-verify", - cl::desc("Do not run the verifier"), cl::Hidden); - -static cl::opt NoUpgradeDebugInfo("disable-upgrade-debug-info", - cl::desc("Generate invalid output"), - cl::ReallyHidden); - -static cl::opt VerifyEach("verify-each", - cl::desc("Verify after each transform")); - -static cl::opt - DisableDITypeMap("disable-debug-info-type-map", - cl::desc("Don't use a uniquing type map for debug info")); - -static cl::opt - StripDebug("strip-debug", - cl::desc("Strip debugger symbol info from translation unit")); - -static cl::opt - StripNamedMetadata("strip-named-metadata", - cl::desc("Strip module-level named metadata")); - -static cl::opt - OptLevelO0("O0", cl::desc("Optimization level 0. Similar to clang -O0. " - "Same as -passes='default'")); - -static cl::opt - OptLevelO1("O1", cl::desc("Optimization level 1. Similar to clang -O1. " - "Same as -passes='default'")); - -static cl::opt - OptLevelO2("O2", cl::desc("Optimization level 2. Similar to clang -O2. " - "Same as -passes='default'")); - -static cl::opt - OptLevelOs("Os", cl::desc("Like -O2 but size-conscious. Similar to clang " - "-Os. Same as -passes='default'")); - -static cl::opt OptLevelOz( - "Oz", - cl::desc("Like -O2 but optimize for code size above all else. Similar to " - "clang -Oz. Same as -passes='default'")); - -static cl::opt - OptLevelO3("O3", cl::desc("Optimization level 3. Similar to clang -O3. 
" - "Same as -passes='default'")); - -static cl::opt CodeGenOptLevelCL( - "codegen-opt-level", - cl::desc("Override optimization level for codegen hooks, legacy PM only")); - -static cl::opt - TargetTriple("mtriple", cl::desc("Override target triple for module")); - -static cl::opt EmitSummaryIndex("module-summary", - cl::desc("Emit module summary index"), - cl::init(false)); - -static cl::opt EmitModuleHash("module-hash", cl::desc("Emit module hash"), - cl::init(false)); - -static cl::opt - DisableSimplifyLibCalls("disable-simplify-libcalls", - cl::desc("Disable simplify-libcalls")); - -static cl::list DisableBuiltins( - "disable-builtin", - cl::desc("Disable specific target library builtin function")); - -static cl::opt EnableDebugify( - "enable-debugify", - cl::desc( - "Start the pipeline with debugify and end it with check-debugify")); - -static cl::opt VerifyDebugInfoPreserve( - "verify-debuginfo-preserve", - cl::desc("Start the pipeline with collecting and end it with checking of " - "debug info preservation.")); - -static cl::opt ClDataLayout("data-layout", - cl::desc("data layout string to use"), - cl::value_desc("layout-string"), - cl::init("")); - -static cl::opt PreserveBitcodeUseListOrder( - "preserve-bc-uselistorder", - cl::desc("Preserve use-list order when writing LLVM bitcode."), - cl::init(true), cl::Hidden); - -static cl::opt PreserveAssemblyUseListOrder( - "preserve-ll-uselistorder", - cl::desc("Preserve use-list order when writing LLVM assembly."), - cl::init(false), cl::Hidden); - -static cl::opt RunTwice("run-twice", - cl::desc("Run all passes twice, re-using the " - "same pass manager (legacy PM only)."), - cl::init(false), cl::Hidden); - -static cl::opt DiscardValueNames( - "discard-value-names", - cl::desc("Discard names from Value (other than GlobalValue)."), - cl::init(false), cl::Hidden); - -static cl::opt TimeTrace("time-trace", cl::desc("Record time trace")); - -static cl::opt TimeTraceGranularity( - "time-trace-granularity", - cl::desc( - "Minimum time granularity (in microseconds) traced by time profiler"), - cl::init(500), cl::Hidden); - -static cl::opt - TimeTraceFile("time-trace-file", - cl::desc("Specify time trace file destination"), - cl::value_desc("filename")); - -static cl::opt RemarksWithHotness( - "pass-remarks-with-hotness", - cl::desc("With PGO, include profile count in optimization remarks"), - cl::Hidden); - -static cl::opt, false, remarks::HotnessThresholdParser> - RemarksHotnessThreshold( - "pass-remarks-hotness-threshold", - cl::desc("Minimum profile count required for " - "an optimization remark to be output. 
" - "Use 'auto' to apply the threshold from profile summary"), - cl::value_desc("N or 'auto'"), cl::init(0), cl::Hidden); - -static cl::opt - RemarksFilename("pass-remarks-output", - cl::desc("Output filename for pass remarks"), - cl::value_desc("filename")); - -static cl::opt - RemarksPasses("pass-remarks-filter", - cl::desc("Only record optimization remarks from passes whose " - "names match the given regular expression"), - cl::value_desc("regex")); - -static cl::opt RemarksFormat( - "pass-remarks-format", - cl::desc("The format used for serializing remarks (default: YAML)"), - cl::value_desc("format"), cl::init("yaml")); - -static cl::list - PassPlugins("load-pass-plugin", - cl::desc("Load passes from plugin library")); - -static cl::opt TryUseNewDbgInfoFormat( - "try-experimental-debuginfo-iterators", - cl::desc("Enable debuginfo iterator positions, if they're built in"), - cl::init(false), cl::Hidden); - -extern cl::opt UseNewDbgInfoFormat; - -//===----------------------------------------------------------------------===// -// CodeGen-related helper functions. -// - -static CodeGenOptLevel GetCodeGenOptLevel() { - return static_cast(unsigned(CodeGenOptLevelCL)); -} - -struct TimeTracerRAII { - TimeTracerRAII(StringRef ProgramName) { - if (TimeTrace) - timeTraceProfilerInitialize(TimeTraceGranularity, ProgramName); - } - ~TimeTracerRAII() { - if (TimeTrace) { - if (auto E = timeTraceProfilerWrite(TimeTraceFile, OutputFilename)) { - handleAllErrors(std::move(E), [&](const StringError &SE) { - errs() << SE.getMessage() << "\n"; - }); - return; - } - timeTraceProfilerCleanup(); - } - } -}; - -// For use in NPM transition. Currently this contains most codegen-specific -// passes. Remove passes from here when porting to the NPM. -// TODO: use a codegen version of PassRegistry.def/PassBuilder::is*Pass() once -// it exists. 
-static bool shouldPinPassToLegacyPM(StringRef Pass) { - std::vector PassNameExactToIgnore = { - "nvvm-reflect", - "nvvm-intr-range", - "amdgpu-simplifylib", - "amdgpu-image-intrinsic-opt", - "amdgpu-usenative", - "amdgpu-promote-alloca", - "amdgpu-promote-alloca-to-vector", - "amdgpu-lower-kernel-attributes", - "amdgpu-propagate-attributes-early", - "amdgpu-propagate-attributes-late", - "amdgpu-unify-metadata", - "amdgpu-printf-runtime-binding", - "amdgpu-always-inline"}; - if (llvm::is_contained(PassNameExactToIgnore, Pass)) - return false; - - std::vector PassNamePrefix = { - "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", - "nvptx-", "mips-", "lanai-", "hexagon-", "bpf-", "avr-", - "thumb2-", "arm-", "si-", "gcn-", "amdgpu-", "aarch64-", - "amdgcn-", "polly-", "riscv-", "dxil-"}; - std::vector PassNameContain = {"-eh-prepare"}; - std::vector PassNameExact = { - "safe-stack", - "cost-model", - "codegenprepare", - "interleaved-load-combine", - "unreachableblockelim", - "verify-safepoint-ir", - "atomic-expand", - "expandvp", - "mve-tail-predication", - "interleaved-access", - "global-merge", - "pre-isel-intrinsic-lowering", - "expand-reductions", - "indirectbr-expand", - "generic-to-nvvm", - "expand-memcmp", - "loop-reduce", - "lower-amx-type", - "lower-amx-intrinsics", - "polyhedral-info", - "print-polyhedral-info", - "replace-with-veclib", - "jmc-instrumenter", - "dot-regions", - "dot-regions-only", - "view-regions", - "view-regions-only", - "select-optimize", - "expand-large-div-rem", - "structurizecfg", - "fix-irreducible", - "expand-large-fp-convert", - "callbrprepare", - }; - for (const auto &P : PassNamePrefix) - if (Pass.starts_with(P)) - return true; - for (const auto &P : PassNameContain) - if (Pass.contains(P)) - return true; - return llvm::is_contained(PassNameExact, Pass); -} - -// For use in NPM transition. -static bool shouldForceLegacyPM() { - for (const auto &P : PassList) { - StringRef Arg = P->getPassArgument(); - if (shouldPinPassToLegacyPM(Arg)) - return true; - } - return false; -} - -//===----------------------------------------------------------------------===// -// main for opt -// -extern "C" int optMain( - int argc, char **argv, - ArrayRef> PassBuilderCallbacks) { - InitLLVM X(argc, argv); - - // Enable debug stream buffering. - EnableDebugBuffering = true; - - InitializeAllTargets(); - InitializeAllTargetMCs(); - InitializeAllAsmPrinters(); - InitializeAllAsmParsers(); - - // Initialize passes - PassRegistry &Registry = *PassRegistry::getPassRegistry(); - initializeCore(Registry); - initializeScalarOpts(Registry); - initializeVectorization(Registry); - initializeIPO(Registry); - initializeAnalysis(Registry); - initializeTransformUtils(Registry); - initializeInstCombine(Registry); - initializeTarget(Registry); - // For codegen passes, only passes that do IR to IR transformation are - // supported. 
- initializeExpandLargeDivRemLegacyPassPass(Registry); - initializeExpandLargeFpConvertLegacyPassPass(Registry); - initializeExpandMemCmpLegacyPassPass(Registry); - initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry); - initializeSelectOptimizePass(Registry); - initializeCallBrPreparePass(Registry); - initializeCodeGenPrepareLegacyPassPass(Registry); - initializeAtomicExpandPass(Registry); - initializeWinEHPreparePass(Registry); - initializeDwarfEHPrepareLegacyPassPass(Registry); - initializeSafeStackLegacyPassPass(Registry); - initializeSjLjEHPreparePass(Registry); - initializePreISelIntrinsicLoweringLegacyPassPass(Registry); - initializeGlobalMergePass(Registry); - initializeIndirectBrExpandLegacyPassPass(Registry); - initializeInterleavedLoadCombinePass(Registry); - initializeInterleavedAccessPass(Registry); - initializeUnreachableBlockElimLegacyPassPass(Registry); - initializeExpandReductionsPass(Registry); - initializeExpandVectorPredicationPass(Registry); - initializeWasmEHPreparePass(Registry); - initializeWriteBitcodePassPass(Registry); - initializeReplaceWithVeclibLegacyPass(Registry); - initializeJMCInstrumenterPass(Registry); - - SmallVector PluginList; - PassPlugins.setCallback([&](const std::string &PluginPath) { - auto Plugin = PassPlugin::Load(PluginPath); - if (!Plugin) - report_fatal_error(Plugin.takeError(), /*gen_crash_diag=*/false); - PluginList.emplace_back(Plugin.get()); - }); - - // Register the Target and CPU printer for --version. - cl::AddExtraVersionPrinter(sys::printDefaultTargetAndDetectedCPU); - - cl::ParseCommandLineOptions( - argc, argv, "llvm .bc -> .bc modular optimizer and analysis printer\n"); - - // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS - if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. - UseNewDbgInfoFormat = true; - } -#endif - (void)TryUseNewDbgInfoFormat; - - LLVMContext Context; - - // TODO: remove shouldForceLegacyPM(). - const bool UseNPM = (!EnableLegacyPassManager && !shouldForceLegacyPM()) || - PassPipeline.getNumOccurrences() > 0; - - if (UseNPM && !PassList.empty()) { - errs() << "The `opt -passname` syntax for the new pass manager is " - "not supported, please use `opt -passes=` (or the `-p` " - "alias for a more concise version).\n"; - errs() << "See https://llvm.org/docs/NewPassManager.html#invoking-opt " - "for more details on the pass pipeline syntax.\n\n"; - return 1; - } - - if (!UseNPM && PluginList.size()) { - errs() << argv[0] << ": " << PassPlugins.ArgStr - << " specified with legacy PM.\n"; - return 1; - } - - // FIXME: once the legacy PM code is deleted, move runPassPipeline() here and - // construct the PassBuilder before parsing IR so we can reuse the same - // PassBuilder for print passes. - if (PrintPasses) { - printPasses(outs()); - return 0; - } - - TimeTracerRAII TimeTracer(argv[0]); - - SMDiagnostic Err; - - Context.setDiscardValueNames(DiscardValueNames); - if (!DisableDITypeMap) - Context.enableDebugTypeODRUniquing(); - - Expected> RemarksFileOrErr = - setupLLVMOptimizationRemarks(Context, RemarksFilename, RemarksPasses, - RemarksFormat, RemarksWithHotness, - RemarksHotnessThreshold); - if (Error E = RemarksFileOrErr.takeError()) { - errs() << toString(std::move(E)) << '\n'; - return 1; - } - std::unique_ptr RemarksFile = std::move(*RemarksFileOrErr); - - // Load the input module... 
- auto SetDataLayout = [&](StringRef IRTriple, - StringRef IRLayout) -> std::optional { - // Data layout specified on the command line has the highest priority. - if (!ClDataLayout.empty()) - return ClDataLayout; - // If an explicit data layout is already defined in the IR, don't infer. - if (!IRLayout.empty()) - return std::nullopt; - - // If an explicit triple was specified (either in the IR or on the - // command line), use that to infer the default data layout. However, the - // command line target triple should override the IR file target triple. - std::string TripleStr = - TargetTriple.empty() ? IRTriple.str() : Triple::normalize(TargetTriple); - // If the triple string is still empty, we don't fall back to - // sys::getDefaultTargetTriple() since we do not want to have differing - // behaviour dependent on the configured default triple. Therefore, if the - // user did not pass -mtriple or define an explicit triple/datalayout in - // the IR, we should default to an empty (default) DataLayout. - if (TripleStr.empty()) - return std::nullopt; - // Otherwise we infer the DataLayout from the target machine. - Expected> ExpectedTM = - codegen::createTargetMachineForTriple(TripleStr, GetCodeGenOptLevel()); - if (!ExpectedTM) { - errs() << argv[0] << ": warning: failed to infer data layout: " - << toString(ExpectedTM.takeError()) << "\n"; - return std::nullopt; - } - return (*ExpectedTM)->createDataLayout().getStringRepresentation(); - }; - std::unique_ptr M; - if (NoUpgradeDebugInfo) - M = parseAssemblyFileWithIndexNoUpgradeDebugInfo( - InputFilename, Err, Context, nullptr, SetDataLayout) - .Mod; - else - M = parseIRFile(InputFilename, Err, Context, - ParserCallbacks(SetDataLayout)); - - if (!M) { - Err.print(argv[0], errs()); - return 1; - } - - // Strip debug info before running the verifier. - if (StripDebug) - StripDebugInfo(*M); - - // Erase module-level named metadata, if requested. - if (StripNamedMetadata) { - while (!M->named_metadata_empty()) { - NamedMDNode *NMD = &*M->named_metadata_begin(); - M->eraseNamedMetadata(NMD); - } - } - - // If we are supposed to override the target triple, do so now. - if (!TargetTriple.empty()) - M->setTargetTriple(Triple::normalize(TargetTriple)); - - // Immediately run the verifier to catch any problems before starting up the - // pass pipelines. Otherwise we can crash on broken code during - // doInitialization(). - if (!NoVerify && verifyModule(*M, &errs())) { - errs() << argv[0] << ": " << InputFilename - << ": error: input module is broken!\n"; - return 1; - } - - // Enable testing of whole program devirtualization on this module by invoking - // the facility for updating public visibility to linkage unit visibility when - // specified by an internal option. This is normally done during LTO which is - // not performed via opt. - updateVCallVisibilityInModule( - *M, - /*WholeProgramVisibilityEnabledInLTO=*/false, - // FIXME: These need linker information via a - // TBD new interface. - /*DynamicExportSymbols=*/{}, - /*ValidateAllVtablesHaveTypeInfos=*/false, - /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); - - // Figure out what stream we are supposed to write to... - std::unique_ptr Out; - std::unique_ptr ThinLinkOut; - if (NoOutput) { - if (!OutputFilename.empty()) - errs() << "WARNING: The -o (output filename) option is ignored when\n" - "the --disable-output option is used.\n"; - } else { - // Default to standard output. 
- if (OutputFilename.empty()) - OutputFilename = "-"; - - std::error_code EC; - sys::fs::OpenFlags Flags = - OutputAssembly ? sys::fs::OF_TextWithCRLF : sys::fs::OF_None; - Out.reset(new ToolOutputFile(OutputFilename, EC, Flags)); - if (EC) { - errs() << EC.message() << '\n'; - return 1; - } - - if (!ThinLinkBitcodeFile.empty()) { - ThinLinkOut.reset( - new ToolOutputFile(ThinLinkBitcodeFile, EC, sys::fs::OF_None)); - if (EC) { - errs() << EC.message() << '\n'; - return 1; - } - } - } - - Triple ModuleTriple(M->getTargetTriple()); - std::string CPUStr, FeaturesStr; - std::unique_ptr TM; - if (ModuleTriple.getArch()) { - CPUStr = codegen::getCPUStr(); - FeaturesStr = codegen::getFeaturesStr(); - Expected> ExpectedTM = - codegen::createTargetMachineForTriple(ModuleTriple.str(), - GetCodeGenOptLevel()); - if (auto E = ExpectedTM.takeError()) { - errs() << argv[0] << ": WARNING: failed to create target machine for '" - << ModuleTriple.str() << "': " << toString(std::move(E)) << "\n"; - } else { - TM = std::move(*ExpectedTM); - } - } else if (ModuleTriple.getArchName() != "unknown" && - ModuleTriple.getArchName() != "") { - errs() << argv[0] << ": unrecognized architecture '" - << ModuleTriple.getArchName() << "' provided.\n"; - return 1; - } - - // Override function attributes based on CPUStr, FeaturesStr, and command line - // flags. - codegen::setFunctionAttributes(CPUStr, FeaturesStr, *M); - - // If the output is set to be emitted to standard out, and standard out is a - // console, print out a warning message and refuse to do it. We don't - // impress anyone by spewing tons of binary goo to a terminal. - if (!Force && !NoOutput && !OutputAssembly) - if (CheckBitcodeOutputToConsole(Out->os())) - NoOutput = true; - - if (OutputThinLTOBC) { - M->addModuleFlag(Module::Error, "EnableSplitLTOUnit", SplitLTOUnit); - if (UnifiedLTO) - M->addModuleFlag(Module::Error, "UnifiedLTO", 1); - } - - // Add an appropriate TargetLibraryInfo pass for the module's triple. - TargetLibraryInfoImpl TLII(ModuleTriple); - - // The -disable-simplify-libcalls flag actually disables all builtin optzns. - if (DisableSimplifyLibCalls) - TLII.disableAllFunctions(); - else { - // Disable individual builtin functions in TargetLibraryInfo. - LibFunc F; - for (auto &FuncName : DisableBuiltins) - if (TLII.getLibFunc(FuncName, F)) - TLII.setUnavailable(F); - else { - errs() << argv[0] << ": cannot disable nonexistent builtin function " - << FuncName << '\n'; - return 1; - } - } - - if (UseNPM) { - if (legacy::debugPassSpecified()) { - errs() << "-debug-pass does not work with the new PM, either use " - "-debug-pass-manager, or use the legacy PM\n"; - return 1; - } - auto NumOLevel = OptLevelO0 + OptLevelO1 + OptLevelO2 + OptLevelO3 + - OptLevelOs + OptLevelOz; - if (NumOLevel > 1) { - errs() << "Cannot specify multiple -O#\n"; - return 1; - } - if (NumOLevel > 0 && (PassPipeline.getNumOccurrences() > 0)) { - errs() << "Cannot specify -O# and --passes=/--foo-pass, use " - "-passes='default,other-pass'\n"; - return 1; - } - std::string Pipeline = PassPipeline; - - if (OptLevelO0) - Pipeline = "default"; - if (OptLevelO1) - Pipeline = "default"; - if (OptLevelO2) - Pipeline = "default"; - if (OptLevelO3) - Pipeline = "default"; - if (OptLevelOs) - Pipeline = "default"; - if (OptLevelOz) - Pipeline = "default"; - OutputKind OK = OK_NoOutput; - if (!NoOutput) - OK = OutputAssembly - ? OK_OutputAssembly - : (OutputThinLTOBC ? 
OK_OutputThinLTOBitcode : OK_OutputBitcode); - - VerifierKind VK = VK_VerifyOut; - if (NoVerify) - VK = VK_NoVerifier; - else if (VerifyEach) - VK = VK_VerifyEachPass; - - // The user has asked to use the new pass manager and provided a pipeline - // string. Hand off the rest of the functionality to the new code for that - // layer. - return runPassPipeline( - argv[0], *M, TM.get(), &TLII, Out.get(), ThinLinkOut.get(), - RemarksFile.get(), Pipeline, PluginList, PassBuilderCallbacks, - OK, VK, PreserveAssemblyUseListOrder, - PreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash, - EnableDebugify, VerifyDebugInfoPreserve, UnifiedLTO) - ? 0 - : 1; - } - - if (OptLevelO0 || OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || - OptLevelO3) { - errs() << "Cannot use -O# with legacy PM.\n"; - return 1; - } - if (EmitSummaryIndex) { - errs() << "Cannot use -module-summary with legacy PM.\n"; - return 1; - } - if (EmitModuleHash) { - errs() << "Cannot use -module-hash with legacy PM.\n"; - return 1; - } - if (OutputThinLTOBC) { - errs() << "Cannot use -thinlto-bc with legacy PM.\n"; - return 1; - } - // Create a PassManager to hold and optimize the collection of passes we are - // about to build. If the -debugify-each option is set, wrap each pass with - // the (-check)-debugify passes. - DebugifyCustomPassManager Passes; - DebugifyStatsMap DIStatsMap; - DebugInfoPerPass DebugInfoBeforePass; - if (DebugifyEach) { - Passes.setDebugifyMode(DebugifyMode::SyntheticDebugInfo); - Passes.setDIStatsMap(DIStatsMap); - } else if (VerifyEachDebugInfoPreserve) { - Passes.setDebugifyMode(DebugifyMode::OriginalDebugInfo); - Passes.setDebugInfoBeforePass(DebugInfoBeforePass); - if (!VerifyDIPreserveExport.empty()) - Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); - } - - bool AddOneTimeDebugifyPasses = - (EnableDebugify && !DebugifyEach) || - (VerifyDebugInfoPreserve && !VerifyEachDebugInfoPreserve); - - Passes.add(new TargetLibraryInfoWrapperPass(TLII)); - - // Add internal analysis passes from the target machine. - Passes.add(createTargetTransformInfoWrapperPass(TM ? TM->getTargetIRAnalysis() - : TargetIRAnalysis())); - - if (AddOneTimeDebugifyPasses) { - if (EnableDebugify) { - Passes.setDIStatsMap(DIStatsMap); - Passes.add(createDebugifyModulePass()); - } else if (VerifyDebugInfoPreserve) { - Passes.setDebugInfoBeforePass(DebugInfoBeforePass); - Passes.add(createDebugifyModulePass(DebugifyMode::OriginalDebugInfo, "", - &(Passes.getDebugInfoPerPass()))); - } - } - - if (TM) { - // FIXME: We should dyn_cast this when supported. - auto <M = static_cast(*TM); - Pass *TPC = LTM.createPassConfig(Passes); - Passes.add(TPC); - } - - // Create a new optimization pass for each one specified on the command line - for (unsigned i = 0; i < PassList.size(); ++i) { - const PassInfo *PassInf = PassList[i]; - if (PassInf->getNormalCtor()) { - Pass *P = PassInf->getNormalCtor()(); - if (P) { - // Add the pass to the pass manager. - Passes.add(P); - // If we are verifying all of the intermediate steps, add the verifier. 
- if (VerifyEach) - Passes.add(createVerifierPass()); - } - } else - errs() << argv[0] << ": cannot create pass: " << PassInf->getPassName() - << "\n"; - } - - // Check that the module is well formed on completion of optimization - if (!NoVerify && !VerifyEach) - Passes.add(createVerifierPass()); - - if (AddOneTimeDebugifyPasses) { - if (EnableDebugify) - Passes.add(createCheckDebugifyModulePass(false)); - else if (VerifyDebugInfoPreserve) { - if (!VerifyDIPreserveExport.empty()) - Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); - Passes.add(createCheckDebugifyModulePass( - false, "", nullptr, DebugifyMode::OriginalDebugInfo, - &(Passes.getDebugInfoPerPass()), VerifyDIPreserveExport)); - } - } - - // In run twice mode, we want to make sure the output is bit-by-bit - // equivalent if we run the pass manager again, so setup two buffers and - // a stream to write to them. Note that llc does something similar and it - // may be worth to abstract this out in the future. - SmallVector Buffer; - SmallVector FirstRunBuffer; - std::unique_ptr BOS; - raw_ostream *OS = nullptr; - - const bool ShouldEmitOutput = !NoOutput; - - // Write bitcode or assembly to the output as the last step... - if (ShouldEmitOutput || RunTwice) { - assert(Out); - OS = &Out->os(); - if (RunTwice) { - BOS = std::make_unique(Buffer); - OS = BOS.get(); - } - if (OutputAssembly) - Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder)); - else - Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder)); - } - - // Before executing passes, print the final values of the LLVM options. - cl::PrintOptionValues(); - - if (!RunTwice) { - // Now that we have all of the passes ready, run them. - Passes.run(*M); - } else { - // If requested, run all passes twice with the same pass manager to catch - // bugs caused by persistent state in the passes. - std::unique_ptr M2(CloneModule(*M)); - // Run all passes on the original module first, so the second run processes - // the clone to catch CloneModule bugs. - Passes.run(*M); - FirstRunBuffer = Buffer; - Buffer.clear(); - - Passes.run(*M2); - - // Compare the two outputs and make sure they're the same - assert(Out); - if (Buffer.size() != FirstRunBuffer.size() || - (memcmp(Buffer.data(), FirstRunBuffer.data(), Buffer.size()) != 0)) { - errs() - << "Running the pass manager twice changed the output.\n" - "Writing the result of the second run to the specified output.\n" - "To generate the one-run comparison binary, just run without\n" - "the compile-twice option\n"; - if (ShouldEmitOutput) { - Out->os() << BOS->str(); - Out->keep(); - } - if (RemarksFile) - RemarksFile->keep(); - return 1; - } - if (ShouldEmitOutput) - Out->os() << BOS->str(); - } - - if (DebugifyEach && !DebugifyExport.empty()) - exportDebugifyStats(DebugifyExport, Passes.getDebugifyStatsMap()); - - // Declare success. 
- if (!NoOutput) - Out->keep(); - - if (RemarksFile) - RemarksFile->keep(); - - if (ThinLinkOut) - ThinLinkOut->keep(); - - return 0; -} diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index af219dd743ce7..d4cb0cdb55b52 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -4814,8 +4814,8 @@ cc_binary( ], ) -cc_library( - name = "opt-driver", +cc_binary( + name = "opt", srcs = glob([ "tools/opt/*.cpp", "tools/opt/*.h", @@ -4826,6 +4826,7 @@ cc_library( "@platforms//os:macos": [], "//conditions:default": ["-Wl,--export-dynamic"], }), + stamp = 0, deps = [ ":AllTargetsAsmParsers", ":AllTargetsCodeGens", @@ -4852,12 +4853,6 @@ cc_library( ], ) -cc_binary( - name = "opt", - stamp = 0, - deps = [":opt-driver"] -) - gentbl( name = "SancovOptsTableGen", strip_include_prefix = "tools/sancov", From d2d42dcfde7e9bf58e49b376ae7488ffac97f4c8 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Wed, 24 Jan 2024 10:51:23 -0800 Subject: [PATCH 834/843] [CodeGen][MISched] Rename instance of Cycle -> ReleaseAtCycle b1ae461a5358932851de42b66ffde8748da51a83 renamed Cycle -> ReleaseAtCycle. 7e09239e24b339f45f63a670e2e831150826bf70 was committed without rebasing but used the old Cycle syntax. This caused a build failure when 7e09239e24b339f45f63a670e2e831150826bf70 was squash-and-merged. This patch fixes this problem. --- llvm/lib/CodeGen/MachineScheduler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index f6d03f59f8564..363dd3aca6fe4 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -4278,7 +4278,7 @@ unsigned ResourceSegments::getFirstAvailableAt( // Zero resource usage is allowed by TargetSchedule.td but we do not construct // a ResourceSegment interval for that situation. - if (AcquireAtCycle == Cycle) + if (AcquireAtCycle == ReleaseAtCycle) return CurrCycle; unsigned RetCycle = CurrCycle; From 31359840247b7d458ea8104373eea3f0cef95e16 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 24 Jan 2024 13:56:49 -0500 Subject: [PATCH 835/843] Reland "[CMake/Bazel] Support usage of opt driver as a library (#79205)" This reverts commit be08be5d5de97cd593fb99affa1fa994d104eb70. The build error was due to a different change, apologies! 
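With LLVMOptDriver relanded, a downstream tool can link the static driver library and reuse opt's whole command line while injecting its own PassBuilder customizations through the new callback parameter. A minimal sketch of such an opt-variant follows; the callback body and the idea of registering extra pipeline parsing are illustrative only, while optMain and its signature come from this patch:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/Passes/PassBuilder.h"
  #include <functional>

  // Entry point exported by LLVMOptDriver (defined in optdriver.cpp below).
  extern "C" int optMain(
      int argc, char **argv,
      llvm::ArrayRef<std::function<void(llvm::PassBuilder &)>>
          PassBuilderCallbacks);

  int main(int argc, char **argv) {
    // Hypothetical downstream customization: hook pipeline parsing so that
    // custom pass names could be recognized in -passes= strings.
    std::function<void(llvm::PassBuilder &)> Callback =
        [](llvm::PassBuilder &PB) {
          PB.registerPipelineParsingCallback(
              [](llvm::StringRef Name, llvm::ModulePassManager &MPM,
                 llvm::ArrayRef<llvm::PassBuilder::PipelineElement>) {
                return false; // Match custom pass names and add passes here.
              });
        };
    return optMain(argc, argv, {Callback});
  }

Stock opt itself passes an empty callback list, as the new opt.cpp in this diff shows.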
---
 llvm/tools/opt/CMakeLists.txt                 |  16 +-
 llvm/tools/opt/NewPMDriver.cpp                |   9 +-
 llvm/tools/opt/NewPMDriver.h                  |  22 +-
 llvm/tools/opt/opt.cpp                        | 913 +----------------
 llvm/tools/opt/optdriver.cpp                  | 923 ++++++++++++++++++
 .../llvm-project-overlay/llvm/BUILD.bazel     |  11 +-
 6 files changed, 972 insertions(+), 922 deletions(-)
 create mode 100644 llvm/tools/opt/optdriver.cpp

diff --git a/llvm/tools/opt/CMakeLists.txt b/llvm/tools/opt/CMakeLists.txt
index 2cdb99e936287..f3dd95e178e31 100644
--- a/llvm/tools/opt/CMakeLists.txt
+++ b/llvm/tools/opt/CMakeLists.txt
@@ -29,12 +29,24 @@ set(LLVM_LINK_COMPONENTS
   Passes
   )
 
-add_llvm_tool(opt
+# We don't want to link this into libLLVM
+add_llvm_library(LLVMOptDriver
+  STATIC
   NewPMDriver.cpp
-  opt.cpp
+  optdriver.cpp
+  PARTIAL_SOURCES_INTENDED
+  DEPENDS
+  intrinsics_gen
+)
+add_llvm_tool(opt
+  PARTIAL_SOURCES_INTENDED
+  opt.cpp
   DEPENDS
   intrinsics_gen
   SUPPORT_PLUGINS
+  )
+target_link_libraries(opt PRIVATE LLVMOptDriver)
+
 export_executable_symbols_for_plugins(opt)

diff --git a/llvm/tools/opt/NewPMDriver.cpp b/llvm/tools/opt/NewPMDriver.cpp
index 6ae3f87099afd..fdfb4df53273f 100644
--- a/llvm/tools/opt/NewPMDriver.cpp
+++ b/llvm/tools/opt/NewPMDriver.cpp
@@ -328,8 +328,9 @@ bool llvm::runPassPipeline(
     StringRef Arg0, Module &M, TargetMachine *TM, TargetLibraryInfoImpl *TLII,
     ToolOutputFile *Out, ToolOutputFile *ThinLTOLinkOut,
     ToolOutputFile *OptRemarkFile, StringRef PassPipeline,
-    ArrayRef<PassPlugin> PassPlugins, OutputKind OK, VerifierKind VK,
-    bool ShouldPreserveAssemblyUseListOrder,
+    ArrayRef<PassPlugin> PassPlugins,
+    ArrayRef<std::function<void(PassBuilder &)>> PassBuilderCallbacks,
+    OutputKind OK, VerifierKind VK, bool ShouldPreserveAssemblyUseListOrder,
     bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex,
     bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve,
     bool UnifiedLTO) {
@@ -428,6 +429,10 @@ bool llvm::runPassPipeline(
   for (auto &PassPlugin : PassPlugins)
     PassPlugin.registerPassBuilderCallbacks(PB);
 
+  // Load any explicitly specified plugins.
+  for (auto &PassCallback : PassBuilderCallbacks)
+    PassCallback(PB);
+
 #define HANDLE_EXTENSION(Ext)                                                  \
   get##Ext##PluginInfo().RegisterPassBuilderCallbacks(PB);
 #include "llvm/Support/Extension.def"

diff --git a/llvm/tools/opt/NewPMDriver.h b/llvm/tools/opt/NewPMDriver.h
index 3230c27c2d7a8..19cabd15436eb 100644
--- a/llvm/tools/opt/NewPMDriver.h
+++ b/llvm/tools/opt/NewPMDriver.h
@@ -23,6 +23,7 @@
 #include "llvm/Support/CommandLine.h"
 
 namespace llvm {
+class PassBuilder;
 class StringRef;
 class Module;
 class PassPlugin;
@@ -64,16 +65,17 @@ void printPasses(raw_ostream &OS);
 ///
 /// ThinLTOLinkOut is only used when OK is OK_OutputThinLTOBitcode, and can be
 /// nullptr.
-bool runPassPipeline(StringRef Arg0, Module &M, TargetMachine *TM,
-                     TargetLibraryInfoImpl *TLII, ToolOutputFile *Out,
-                     ToolOutputFile *ThinLinkOut, ToolOutputFile *OptRemarkFile,
-                     StringRef PassPipeline, ArrayRef<PassPlugin> PassPlugins,
-                     opt_tool::OutputKind OK, opt_tool::VerifierKind VK,
-                     bool ShouldPreserveAssemblyUseListOrder,
-                     bool ShouldPreserveBitcodeUseListOrder,
-                     bool EmitSummaryIndex, bool EmitModuleHash,
-                     bool EnableDebugify, bool VerifyDIPreserve,
-                     bool UnifiedLTO = false);
+bool runPassPipeline(
+    StringRef Arg0, Module &M, TargetMachine *TM, TargetLibraryInfoImpl *TLII,
+    ToolOutputFile *Out, ToolOutputFile *ThinLinkOut,
+    ToolOutputFile *OptRemarkFile, StringRef PassPipeline,
+    ArrayRef<PassPlugin> PassPlugins,
+    ArrayRef<std::function<void(PassBuilder &)>> PassBuilderCallbacks,
+    opt_tool::OutputKind OK, opt_tool::VerifierKind VK,
+    bool ShouldPreserveAssemblyUseListOrder,
+    bool ShouldPreserveBitcodeUseListOrder, bool EmitSummaryIndex,
+    bool EmitModuleHash, bool EnableDebugify, bool VerifyDIPreserve,
+    bool UnifiedLTO = false);
 } // namespace llvm
 
 #endif

diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index 8c0e98976e602..fc8bd665cbbe9 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -11,912 +11,15 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "NewPMDriver.h"
-#include "llvm/Analysis/CallGraph.h"
-#include "llvm/Analysis/CallGraphSCCPass.h"
-#include "llvm/Analysis/LoopPass.h"
-#include "llvm/Analysis/RegionPass.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/AsmParser/Parser.h"
-#include "llvm/CodeGen/CommandFlags.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/Config/llvm-config.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/LLVMContext.h"
-#include "llvm/IR/LLVMRemarkStreamer.h"
-#include "llvm/IR/LegacyPassManager.h"
-#include "llvm/IR/LegacyPassNameParser.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/ModuleSummaryIndex.h"
-#include "llvm/IR/Verifier.h"
-#include "llvm/IRReader/IRReader.h"
-#include "llvm/InitializePasses.h"
-#include "llvm/LinkAllIR.h"
-#include "llvm/LinkAllPasses.h"
-#include "llvm/MC/TargetRegistry.h"
-#include "llvm/Passes/PassPlugin.h"
-#include "llvm/Remarks/HotnessThresholdParser.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/FileSystem.h"
-#include "llvm/Support/InitLLVM.h"
-#include "llvm/Support/PluginLoader.h"
-#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/SystemUtils.h"
-#include "llvm/Support/TargetSelect.h"
-#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/YAMLTraits.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/TargetParser/Host.h"
-#include "llvm/TargetParser/SubtargetFeature.h"
-#include "llvm/TargetParser/Triple.h"
-#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
-#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/Debugify.h"
-#include <algorithm>
-#include <memory>
-#include <optional>
-using namespace llvm;
-using namespace opt_tool;
+#include "llvm/ADT/ArrayRef.h"
+#include <functional>
 
-static codegen::RegisterCodeGenFlags CFG;
-
-// The OptimizationList is automatically populated with registered Passes by the
-// PassNameParser.
-static cl::list<const PassInfo *, bool, PassNameParser> PassList(cl::desc(
-    "Optimizations available (use '-passes=' for the new pass manager)"));
-
-static cl::opt<bool> EnableLegacyPassManager(
-    "bugpoint-enable-legacy-pm",
-    cl::desc(
-        "Enable the legacy pass manager. 
This is strictly for bugpoint " - "due to it not working with the new PM, please do not use otherwise."), - cl::init(false)); - -// This flag specifies a textual description of the optimization pass pipeline -// to run over the module. This flag switches opt to use the new pass manager -// infrastructure, completely disabling all of the flags specific to the old -// pass management. -static cl::opt PassPipeline( - "passes", - cl::desc( - "A textual description of the pass pipeline. To have analysis passes " - "available before a certain pass, add 'require'.")); -static cl::alias PassPipeline2("p", cl::aliasopt(PassPipeline), - cl::desc("Alias for -passes")); - -static cl::opt PrintPasses("print-passes", - cl::desc("Print available passes that can be " - "specified in -passes=foo and exit")); - -static cl::opt -InputFilename(cl::Positional, cl::desc(""), - cl::init("-"), cl::value_desc("filename")); - -static cl::opt -OutputFilename("o", cl::desc("Override output filename"), - cl::value_desc("filename")); - -static cl::opt -Force("f", cl::desc("Enable binary output on terminals")); - -static cl::opt -NoOutput("disable-output", - cl::desc("Do not write result bitcode file"), cl::Hidden); - -static cl::opt -OutputAssembly("S", cl::desc("Write output as LLVM assembly")); - -static cl::opt - OutputThinLTOBC("thinlto-bc", - cl::desc("Write output as ThinLTO-ready bitcode")); - -static cl::opt - SplitLTOUnit("thinlto-split-lto-unit", - cl::desc("Enable splitting of a ThinLTO LTOUnit")); - -static cl::opt - UnifiedLTO("unified-lto", - cl::desc("Use unified LTO piplines. Ignored unless -thinlto-bc " - "is also specified."), - cl::Hidden, cl::init(false)); - -static cl::opt ThinLinkBitcodeFile( - "thin-link-bitcode-file", cl::value_desc("filename"), - cl::desc( - "A file in which to write minimized bitcode for the thin link only")); - -static cl::opt -NoVerify("disable-verify", cl::desc("Do not run the verifier"), cl::Hidden); - -static cl::opt NoUpgradeDebugInfo("disable-upgrade-debug-info", - cl::desc("Generate invalid output"), - cl::ReallyHidden); - -static cl::opt VerifyEach("verify-each", - cl::desc("Verify after each transform")); - -static cl::opt - DisableDITypeMap("disable-debug-info-type-map", - cl::desc("Don't use a uniquing type map for debug info")); - -static cl::opt -StripDebug("strip-debug", - cl::desc("Strip debugger symbol info from translation unit")); - -static cl::opt - StripNamedMetadata("strip-named-metadata", - cl::desc("Strip module-level named metadata")); - -static cl::opt - OptLevelO0("O0", cl::desc("Optimization level 0. Similar to clang -O0. " - "Same as -passes='default'")); - -static cl::opt - OptLevelO1("O1", cl::desc("Optimization level 1. Similar to clang -O1. " - "Same as -passes='default'")); - -static cl::opt - OptLevelO2("O2", cl::desc("Optimization level 2. Similar to clang -O2. " - "Same as -passes='default'")); - -static cl::opt - OptLevelOs("Os", cl::desc("Like -O2 but size-conscious. Similar to clang " - "-Os. Same as -passes='default'")); - -static cl::opt OptLevelOz( - "Oz", - cl::desc("Like -O2 but optimize for code size above all else. Similar to " - "clang -Oz. Same as -passes='default'")); - -static cl::opt - OptLevelO3("O3", cl::desc("Optimization level 3. Similar to clang -O3. 
" - "Same as -passes='default'")); - -static cl::opt CodeGenOptLevelCL( - "codegen-opt-level", - cl::desc("Override optimization level for codegen hooks, legacy PM only")); - -static cl::opt -TargetTriple("mtriple", cl::desc("Override target triple for module")); - -static cl::opt EmitSummaryIndex("module-summary", - cl::desc("Emit module summary index"), - cl::init(false)); - -static cl::opt EmitModuleHash("module-hash", cl::desc("Emit module hash"), - cl::init(false)); - -static cl::opt -DisableSimplifyLibCalls("disable-simplify-libcalls", - cl::desc("Disable simplify-libcalls")); - -static cl::list DisableBuiltins( - "disable-builtin", - cl::desc("Disable specific target library builtin function")); - -static cl::opt EnableDebugify( - "enable-debugify", - cl::desc( - "Start the pipeline with debugify and end it with check-debugify")); - -static cl::opt VerifyDebugInfoPreserve( - "verify-debuginfo-preserve", - cl::desc("Start the pipeline with collecting and end it with checking of " - "debug info preservation.")); - -static cl::opt ClDataLayout("data-layout", - cl::desc("data layout string to use"), - cl::value_desc("layout-string"), - cl::init("")); - -static cl::opt PreserveBitcodeUseListOrder( - "preserve-bc-uselistorder", - cl::desc("Preserve use-list order when writing LLVM bitcode."), - cl::init(true), cl::Hidden); - -static cl::opt PreserveAssemblyUseListOrder( - "preserve-ll-uselistorder", - cl::desc("Preserve use-list order when writing LLVM assembly."), - cl::init(false), cl::Hidden); - -static cl::opt RunTwice("run-twice", - cl::desc("Run all passes twice, re-using the " - "same pass manager (legacy PM only)."), - cl::init(false), cl::Hidden); - -static cl::opt DiscardValueNames( - "discard-value-names", - cl::desc("Discard names from Value (other than GlobalValue)."), - cl::init(false), cl::Hidden); - -static cl::opt TimeTrace( - "time-trace", - cl::desc("Record time trace")); - -static cl::opt TimeTraceGranularity( - "time-trace-granularity", - cl::desc("Minimum time granularity (in microseconds) traced by time profiler"), - cl::init(500), cl::Hidden); - -static cl::opt - TimeTraceFile("time-trace-file", - cl::desc("Specify time trace file destination"), - cl::value_desc("filename")); - -static cl::opt RemarksWithHotness( - "pass-remarks-with-hotness", - cl::desc("With PGO, include profile count in optimization remarks"), - cl::Hidden); - -static cl::opt, false, remarks::HotnessThresholdParser> - RemarksHotnessThreshold( - "pass-remarks-hotness-threshold", - cl::desc("Minimum profile count required for " - "an optimization remark to be output. 
" - "Use 'auto' to apply the threshold from profile summary"), - cl::value_desc("N or 'auto'"), cl::init(0), cl::Hidden); - -static cl::opt - RemarksFilename("pass-remarks-output", - cl::desc("Output filename for pass remarks"), - cl::value_desc("filename")); - -static cl::opt - RemarksPasses("pass-remarks-filter", - cl::desc("Only record optimization remarks from passes whose " - "names match the given regular expression"), - cl::value_desc("regex")); - -static cl::opt RemarksFormat( - "pass-remarks-format", - cl::desc("The format used for serializing remarks (default: YAML)"), - cl::value_desc("format"), cl::init("yaml")); - -static cl::list - PassPlugins("load-pass-plugin", - cl::desc("Load passes from plugin library")); - -static cl::opt TryUseNewDbgInfoFormat( - "try-experimental-debuginfo-iterators", - cl::desc("Enable debuginfo iterator positions, if they're built in"), - cl::init(false), cl::Hidden); - -extern cl::opt UseNewDbgInfoFormat; - -//===----------------------------------------------------------------------===// -// CodeGen-related helper functions. -// - -static CodeGenOptLevel GetCodeGenOptLevel() { - return static_cast(unsigned(CodeGenOptLevelCL)); +namespace llvm { +class PassBuilder; } -struct TimeTracerRAII { - TimeTracerRAII(StringRef ProgramName) { - if (TimeTrace) - timeTraceProfilerInitialize(TimeTraceGranularity, ProgramName); - } - ~TimeTracerRAII() { - if (TimeTrace) { - if (auto E = timeTraceProfilerWrite(TimeTraceFile, OutputFilename)) { - handleAllErrors(std::move(E), [&](const StringError &SE) { - errs() << SE.getMessage() << "\n"; - }); - return; - } - timeTraceProfilerCleanup(); - } - } -}; +extern "C" int optMain(int argc, char **argv, + llvm::ArrayRef> + PassBuilderCallbacks); -// For use in NPM transition. Currently this contains most codegen-specific -// passes. Remove passes from here when porting to the NPM. -// TODO: use a codegen version of PassRegistry.def/PassBuilder::is*Pass() once -// it exists. 
-static bool shouldPinPassToLegacyPM(StringRef Pass) { - std::vector PassNameExactToIgnore = { - "nvvm-reflect", - "nvvm-intr-range", - "amdgpu-simplifylib", - "amdgpu-image-intrinsic-opt", - "amdgpu-usenative", - "amdgpu-promote-alloca", - "amdgpu-promote-alloca-to-vector", - "amdgpu-lower-kernel-attributes", - "amdgpu-propagate-attributes-early", - "amdgpu-propagate-attributes-late", - "amdgpu-unify-metadata", - "amdgpu-printf-runtime-binding", - "amdgpu-always-inline"}; - if (llvm::is_contained(PassNameExactToIgnore, Pass)) - return false; - - std::vector PassNamePrefix = { - "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", - "nvptx-", "mips-", "lanai-", "hexagon-", "bpf-", "avr-", - "thumb2-", "arm-", "si-", "gcn-", "amdgpu-", "aarch64-", - "amdgcn-", "polly-", "riscv-", "dxil-"}; - std::vector PassNameContain = {"-eh-prepare"}; - std::vector PassNameExact = { - "safe-stack", - "cost-model", - "codegenprepare", - "interleaved-load-combine", - "unreachableblockelim", - "verify-safepoint-ir", - "atomic-expand", - "expandvp", - "mve-tail-predication", - "interleaved-access", - "global-merge", - "pre-isel-intrinsic-lowering", - "expand-reductions", - "indirectbr-expand", - "generic-to-nvvm", - "expand-memcmp", - "loop-reduce", - "lower-amx-type", - "lower-amx-intrinsics", - "polyhedral-info", - "print-polyhedral-info", - "replace-with-veclib", - "jmc-instrumenter", - "dot-regions", - "dot-regions-only", - "view-regions", - "view-regions-only", - "select-optimize", - "expand-large-div-rem", - "structurizecfg", - "fix-irreducible", - "expand-large-fp-convert", - "callbrprepare", - }; - for (const auto &P : PassNamePrefix) - if (Pass.starts_with(P)) - return true; - for (const auto &P : PassNameContain) - if (Pass.contains(P)) - return true; - return llvm::is_contained(PassNameExact, Pass); -} - -// For use in NPM transition. -static bool shouldForceLegacyPM() { - for (const auto &P : PassList) { - StringRef Arg = P->getPassArgument(); - if (shouldPinPassToLegacyPM(Arg)) - return true; - } - return false; -} - -//===----------------------------------------------------------------------===// -// main for opt -// -int main(int argc, char **argv) { - InitLLVM X(argc, argv); - - // Enable debug stream buffering. - EnableDebugBuffering = true; - - InitializeAllTargets(); - InitializeAllTargetMCs(); - InitializeAllAsmPrinters(); - InitializeAllAsmParsers(); - - // Initialize passes - PassRegistry &Registry = *PassRegistry::getPassRegistry(); - initializeCore(Registry); - initializeScalarOpts(Registry); - initializeVectorization(Registry); - initializeIPO(Registry); - initializeAnalysis(Registry); - initializeTransformUtils(Registry); - initializeInstCombine(Registry); - initializeTarget(Registry); - // For codegen passes, only passes that do IR to IR transformation are - // supported. 
- initializeExpandLargeDivRemLegacyPassPass(Registry); - initializeExpandLargeFpConvertLegacyPassPass(Registry); - initializeExpandMemCmpLegacyPassPass(Registry); - initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry); - initializeSelectOptimizePass(Registry); - initializeCallBrPreparePass(Registry); - initializeCodeGenPrepareLegacyPassPass(Registry); - initializeAtomicExpandPass(Registry); - initializeWinEHPreparePass(Registry); - initializeDwarfEHPrepareLegacyPassPass(Registry); - initializeSafeStackLegacyPassPass(Registry); - initializeSjLjEHPreparePass(Registry); - initializePreISelIntrinsicLoweringLegacyPassPass(Registry); - initializeGlobalMergePass(Registry); - initializeIndirectBrExpandLegacyPassPass(Registry); - initializeInterleavedLoadCombinePass(Registry); - initializeInterleavedAccessPass(Registry); - initializeUnreachableBlockElimLegacyPassPass(Registry); - initializeExpandReductionsPass(Registry); - initializeExpandVectorPredicationPass(Registry); - initializeWasmEHPreparePass(Registry); - initializeWriteBitcodePassPass(Registry); - initializeReplaceWithVeclibLegacyPass(Registry); - initializeJMCInstrumenterPass(Registry); - - SmallVector PluginList; - PassPlugins.setCallback([&](const std::string &PluginPath) { - auto Plugin = PassPlugin::Load(PluginPath); - if (!Plugin) - report_fatal_error(Plugin.takeError(), /*gen_crash_diag=*/false); - PluginList.emplace_back(Plugin.get()); - }); - - // Register the Target and CPU printer for --version. - cl::AddExtraVersionPrinter(sys::printDefaultTargetAndDetectedCPU); - - cl::ParseCommandLineOptions(argc, argv, - "llvm .bc -> .bc modular optimizer and analysis printer\n"); - - // RemoveDIs debug-info transition: tests may request that we /try/ to use the - // new debug-info format, if it's built in. -#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS - if (TryUseNewDbgInfoFormat) { - // If LLVM was built with support for this, turn the new debug-info format - // on. - UseNewDbgInfoFormat = true; - } -#endif - (void)TryUseNewDbgInfoFormat; - - LLVMContext Context; - - // TODO: remove shouldForceLegacyPM(). - const bool UseNPM = (!EnableLegacyPassManager && !shouldForceLegacyPM()) || - PassPipeline.getNumOccurrences() > 0; - - if (UseNPM && !PassList.empty()) { - errs() << "The `opt -passname` syntax for the new pass manager is " - "not supported, please use `opt -passes=` (or the `-p` " - "alias for a more concise version).\n"; - errs() << "See https://llvm.org/docs/NewPassManager.html#invoking-opt " - "for more details on the pass pipeline syntax.\n\n"; - return 1; - } - - if (!UseNPM && PluginList.size()) { - errs() << argv[0] << ": " << PassPlugins.ArgStr - << " specified with legacy PM.\n"; - return 1; - } - - // FIXME: once the legacy PM code is deleted, move runPassPipeline() here and - // construct the PassBuilder before parsing IR so we can reuse the same - // PassBuilder for print passes. - if (PrintPasses) { - printPasses(outs()); - return 0; - } - - TimeTracerRAII TimeTracer(argv[0]); - - SMDiagnostic Err; - - Context.setDiscardValueNames(DiscardValueNames); - if (!DisableDITypeMap) - Context.enableDebugTypeODRUniquing(); - - Expected> RemarksFileOrErr = - setupLLVMOptimizationRemarks(Context, RemarksFilename, RemarksPasses, - RemarksFormat, RemarksWithHotness, - RemarksHotnessThreshold); - if (Error E = RemarksFileOrErr.takeError()) { - errs() << toString(std::move(E)) << '\n'; - return 1; - } - std::unique_ptr RemarksFile = std::move(*RemarksFileOrErr); - - // Load the input module... 
- auto SetDataLayout = [&](StringRef IRTriple, - StringRef IRLayout) -> std::optional { - // Data layout specified on the command line has the highest priority. - if (!ClDataLayout.empty()) - return ClDataLayout; - // If an explicit data layout is already defined in the IR, don't infer. - if (!IRLayout.empty()) - return std::nullopt; - - // If an explicit triple was specified (either in the IR or on the - // command line), use that to infer the default data layout. However, the - // command line target triple should override the IR file target triple. - std::string TripleStr = - TargetTriple.empty() ? IRTriple.str() : Triple::normalize(TargetTriple); - // If the triple string is still empty, we don't fall back to - // sys::getDefaultTargetTriple() since we do not want to have differing - // behaviour dependent on the configured default triple. Therefore, if the - // user did not pass -mtriple or define an explicit triple/datalayout in - // the IR, we should default to an empty (default) DataLayout. - if (TripleStr.empty()) - return std::nullopt; - // Otherwise we infer the DataLayout from the target machine. - Expected> ExpectedTM = - codegen::createTargetMachineForTriple(TripleStr, GetCodeGenOptLevel()); - if (!ExpectedTM) { - errs() << argv[0] << ": warning: failed to infer data layout: " - << toString(ExpectedTM.takeError()) << "\n"; - return std::nullopt; - } - return (*ExpectedTM)->createDataLayout().getStringRepresentation(); - }; - std::unique_ptr M; - if (NoUpgradeDebugInfo) - M = parseAssemblyFileWithIndexNoUpgradeDebugInfo( - InputFilename, Err, Context, nullptr, SetDataLayout) - .Mod; - else - M = parseIRFile(InputFilename, Err, Context, - ParserCallbacks(SetDataLayout)); - - if (!M) { - Err.print(argv[0], errs()); - return 1; - } - - // Strip debug info before running the verifier. - if (StripDebug) - StripDebugInfo(*M); - - // Erase module-level named metadata, if requested. - if (StripNamedMetadata) { - while (!M->named_metadata_empty()) { - NamedMDNode *NMD = &*M->named_metadata_begin(); - M->eraseNamedMetadata(NMD); - } - } - - // If we are supposed to override the target triple, do so now. - if (!TargetTriple.empty()) - M->setTargetTriple(Triple::normalize(TargetTriple)); - - // Immediately run the verifier to catch any problems before starting up the - // pass pipelines. Otherwise we can crash on broken code during - // doInitialization(). - if (!NoVerify && verifyModule(*M, &errs())) { - errs() << argv[0] << ": " << InputFilename - << ": error: input module is broken!\n"; - return 1; - } - - // Enable testing of whole program devirtualization on this module by invoking - // the facility for updating public visibility to linkage unit visibility when - // specified by an internal option. This is normally done during LTO which is - // not performed via opt. - updateVCallVisibilityInModule( - *M, - /*WholeProgramVisibilityEnabledInLTO=*/false, - // FIXME: These need linker information via a - // TBD new interface. - /*DynamicExportSymbols=*/{}, - /*ValidateAllVtablesHaveTypeInfos=*/false, - /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); - - // Figure out what stream we are supposed to write to... - std::unique_ptr Out; - std::unique_ptr ThinLinkOut; - if (NoOutput) { - if (!OutputFilename.empty()) - errs() << "WARNING: The -o (output filename) option is ignored when\n" - "the --disable-output option is used.\n"; - } else { - // Default to standard output. 
- if (OutputFilename.empty()) - OutputFilename = "-"; - - std::error_code EC; - sys::fs::OpenFlags Flags = - OutputAssembly ? sys::fs::OF_TextWithCRLF : sys::fs::OF_None; - Out.reset(new ToolOutputFile(OutputFilename, EC, Flags)); - if (EC) { - errs() << EC.message() << '\n'; - return 1; - } - - if (!ThinLinkBitcodeFile.empty()) { - ThinLinkOut.reset( - new ToolOutputFile(ThinLinkBitcodeFile, EC, sys::fs::OF_None)); - if (EC) { - errs() << EC.message() << '\n'; - return 1; - } - } - } - - Triple ModuleTriple(M->getTargetTriple()); - std::string CPUStr, FeaturesStr; - std::unique_ptr TM; - if (ModuleTriple.getArch()) { - CPUStr = codegen::getCPUStr(); - FeaturesStr = codegen::getFeaturesStr(); - Expected> ExpectedTM = - codegen::createTargetMachineForTriple(ModuleTriple.str(), - GetCodeGenOptLevel()); - if (auto E = ExpectedTM.takeError()) { - errs() << argv[0] << ": WARNING: failed to create target machine for '" - << ModuleTriple.str() << "': " << toString(std::move(E)) << "\n"; - } else { - TM = std::move(*ExpectedTM); - } - } else if (ModuleTriple.getArchName() != "unknown" && - ModuleTriple.getArchName() != "") { - errs() << argv[0] << ": unrecognized architecture '" - << ModuleTriple.getArchName() << "' provided.\n"; - return 1; - } - - // Override function attributes based on CPUStr, FeaturesStr, and command line - // flags. - codegen::setFunctionAttributes(CPUStr, FeaturesStr, *M); - - // If the output is set to be emitted to standard out, and standard out is a - // console, print out a warning message and refuse to do it. We don't - // impress anyone by spewing tons of binary goo to a terminal. - if (!Force && !NoOutput && !OutputAssembly) - if (CheckBitcodeOutputToConsole(Out->os())) - NoOutput = true; - - if (OutputThinLTOBC) { - M->addModuleFlag(Module::Error, "EnableSplitLTOUnit", SplitLTOUnit); - if (UnifiedLTO) - M->addModuleFlag(Module::Error, "UnifiedLTO", 1); - } - - // Add an appropriate TargetLibraryInfo pass for the module's triple. - TargetLibraryInfoImpl TLII(ModuleTriple); - - // The -disable-simplify-libcalls flag actually disables all builtin optzns. - if (DisableSimplifyLibCalls) - TLII.disableAllFunctions(); - else { - // Disable individual builtin functions in TargetLibraryInfo. - LibFunc F; - for (auto &FuncName : DisableBuiltins) - if (TLII.getLibFunc(FuncName, F)) - TLII.setUnavailable(F); - else { - errs() << argv[0] << ": cannot disable nonexistent builtin function " - << FuncName << '\n'; - return 1; - } - } - - if (UseNPM) { - if (legacy::debugPassSpecified()) { - errs() << "-debug-pass does not work with the new PM, either use " - "-debug-pass-manager, or use the legacy PM\n"; - return 1; - } - auto NumOLevel = OptLevelO0 + OptLevelO1 + OptLevelO2 + OptLevelO3 + - OptLevelOs + OptLevelOz; - if (NumOLevel > 1) { - errs() << "Cannot specify multiple -O#\n"; - return 1; - } - if (NumOLevel > 0 && (PassPipeline.getNumOccurrences() > 0)) { - errs() << "Cannot specify -O# and --passes=/--foo-pass, use " - "-passes='default,other-pass'\n"; - return 1; - } - std::string Pipeline = PassPipeline; - - if (OptLevelO0) - Pipeline = "default"; - if (OptLevelO1) - Pipeline = "default"; - if (OptLevelO2) - Pipeline = "default"; - if (OptLevelO3) - Pipeline = "default"; - if (OptLevelOs) - Pipeline = "default"; - if (OptLevelOz) - Pipeline = "default"; - OutputKind OK = OK_NoOutput; - if (!NoOutput) - OK = OutputAssembly - ? OK_OutputAssembly - : (OutputThinLTOBC ? 
OK_OutputThinLTOBitcode : OK_OutputBitcode); - - VerifierKind VK = VK_VerifyOut; - if (NoVerify) - VK = VK_NoVerifier; - else if (VerifyEach) - VK = VK_VerifyEachPass; - - // The user has asked to use the new pass manager and provided a pipeline - // string. Hand off the rest of the functionality to the new code for that - // layer. - return runPassPipeline(argv[0], *M, TM.get(), &TLII, Out.get(), - ThinLinkOut.get(), RemarksFile.get(), Pipeline, - PluginList, OK, VK, PreserveAssemblyUseListOrder, - PreserveBitcodeUseListOrder, EmitSummaryIndex, - EmitModuleHash, EnableDebugify, - VerifyDebugInfoPreserve, UnifiedLTO) - ? 0 - : 1; - } - - if (OptLevelO0 || OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || - OptLevelO3) { - errs() << "Cannot use -O# with legacy PM.\n"; - return 1; - } - if (EmitSummaryIndex) { - errs() << "Cannot use -module-summary with legacy PM.\n"; - return 1; - } - if (EmitModuleHash) { - errs() << "Cannot use -module-hash with legacy PM.\n"; - return 1; - } - if (OutputThinLTOBC) { - errs() << "Cannot use -thinlto-bc with legacy PM.\n"; - return 1; - } - // Create a PassManager to hold and optimize the collection of passes we are - // about to build. If the -debugify-each option is set, wrap each pass with - // the (-check)-debugify passes. - DebugifyCustomPassManager Passes; - DebugifyStatsMap DIStatsMap; - DebugInfoPerPass DebugInfoBeforePass; - if (DebugifyEach) { - Passes.setDebugifyMode(DebugifyMode::SyntheticDebugInfo); - Passes.setDIStatsMap(DIStatsMap); - } else if (VerifyEachDebugInfoPreserve) { - Passes.setDebugifyMode(DebugifyMode::OriginalDebugInfo); - Passes.setDebugInfoBeforePass(DebugInfoBeforePass); - if (!VerifyDIPreserveExport.empty()) - Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); - } - - bool AddOneTimeDebugifyPasses = - (EnableDebugify && !DebugifyEach) || - (VerifyDebugInfoPreserve && !VerifyEachDebugInfoPreserve); - - Passes.add(new TargetLibraryInfoWrapperPass(TLII)); - - // Add internal analysis passes from the target machine. - Passes.add(createTargetTransformInfoWrapperPass(TM ? TM->getTargetIRAnalysis() - : TargetIRAnalysis())); - - if (AddOneTimeDebugifyPasses) { - if (EnableDebugify) { - Passes.setDIStatsMap(DIStatsMap); - Passes.add(createDebugifyModulePass()); - } else if (VerifyDebugInfoPreserve) { - Passes.setDebugInfoBeforePass(DebugInfoBeforePass); - Passes.add(createDebugifyModulePass( - DebugifyMode::OriginalDebugInfo, "", - &(Passes.getDebugInfoPerPass()))); - } - } - - if (TM) { - // FIXME: We should dyn_cast this when supported. - auto <M = static_cast(*TM); - Pass *TPC = LTM.createPassConfig(Passes); - Passes.add(TPC); - } - - // Create a new optimization pass for each one specified on the command line - for (unsigned i = 0; i < PassList.size(); ++i) { - const PassInfo *PassInf = PassList[i]; - if (PassInf->getNormalCtor()) { - Pass *P = PassInf->getNormalCtor()(); - if (P) { - // Add the pass to the pass manager. - Passes.add(P); - // If we are verifying all of the intermediate steps, add the verifier. 
-        if (VerifyEach)
-          Passes.add(createVerifierPass());
-      }
-    } else
-      errs() << argv[0] << ": cannot create pass: "
-             << PassInf->getPassName() << "\n";
-  }
-
-  // Check that the module is well formed on completion of optimization
-  if (!NoVerify && !VerifyEach)
-    Passes.add(createVerifierPass());
-
-  if (AddOneTimeDebugifyPasses) {
-    if (EnableDebugify)
-      Passes.add(createCheckDebugifyModulePass(false));
-    else if (VerifyDebugInfoPreserve) {
-      if (!VerifyDIPreserveExport.empty())
-        Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport);
-      Passes.add(createCheckDebugifyModulePass(
-          false, "", nullptr, DebugifyMode::OriginalDebugInfo,
-          &(Passes.getDebugInfoPerPass()), VerifyDIPreserveExport));
-    }
-  }
-
-  // In run twice mode, we want to make sure the output is bit-by-bit
-  // equivalent if we run the pass manager again, so setup two buffers and
-  // a stream to write to them. Note that llc does something similar and it
-  // may be worth to abstract this out in the future.
-  SmallVector<char, 0> Buffer;
-  SmallVector<char, 0> FirstRunBuffer;
-  std::unique_ptr<raw_svector_ostream> BOS;
-  raw_ostream *OS = nullptr;
-
-  const bool ShouldEmitOutput = !NoOutput;
-
-  // Write bitcode or assembly to the output as the last step...
-  if (ShouldEmitOutput || RunTwice) {
-    assert(Out);
-    OS = &Out->os();
-    if (RunTwice) {
-      BOS = std::make_unique<raw_svector_ostream>(Buffer);
-      OS = BOS.get();
-    }
-    if (OutputAssembly)
-      Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder));
-    else
-      Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder));
-  }
-
-  // Before executing passes, print the final values of the LLVM options.
-  cl::PrintOptionValues();
-
-  if (!RunTwice) {
-    // Now that we have all of the passes ready, run them.
-    Passes.run(*M);
-  } else {
-    // If requested, run all passes twice with the same pass manager to catch
-    // bugs caused by persistent state in the passes.
-    std::unique_ptr<Module> M2(CloneModule(*M));
-    // Run all passes on the original module first, so the second run processes
-    // the clone to catch CloneModule bugs.
-    Passes.run(*M);
-    FirstRunBuffer = Buffer;
-    Buffer.clear();
-
-    Passes.run(*M2);
-
-    // Compare the two outputs and make sure they're the same
-    assert(Out);
-    if (Buffer.size() != FirstRunBuffer.size() ||
-        (memcmp(Buffer.data(), FirstRunBuffer.data(), Buffer.size()) != 0)) {
-      errs()
-          << "Running the pass manager twice changed the output.\n"
-             "Writing the result of the second run to the specified output.\n"
-             "To generate the one-run comparison binary, just run without\n"
-             "the compile-twice option\n";
-      if (ShouldEmitOutput) {
-        Out->os() << BOS->str();
-        Out->keep();
-      }
-      if (RemarksFile)
-        RemarksFile->keep();
-      return 1;
-    }
-    if (ShouldEmitOutput)
-      Out->os() << BOS->str();
-  }
-
-  if (DebugifyEach && !DebugifyExport.empty())
-    exportDebugifyStats(DebugifyExport, Passes.getDebugifyStatsMap());
-
-  // Declare success.
-  if (!NoOutput)
-    Out->keep();
-
-  if (RemarksFile)
-    RemarksFile->keep();
-
-  if (ThinLinkOut)
-    ThinLinkOut->keep();
-
-  return 0;
-}
+int main(int argc, char **argv) { return optMain(argc, argv, {}); }
diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp
new file mode 100644
index 0000000000000..3f66bfc9f0176
--- /dev/null
+++ b/llvm/tools/opt/optdriver.cpp
@@ -0,0 +1,923 @@
+//===- optdriver.cpp - The LLVM Modular Optimizer -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Optimizations may be specified an arbitrary number of times on the command +// line, They are run in the order specified. Common driver library for re-use +// by potential downstream opt-variants. +// +//===----------------------------------------------------------------------===// + +#include "NewPMDriver.h" +#include "llvm/Analysis/CallGraph.h" +#include "llvm/Analysis/CallGraphSCCPass.h" +#include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/RegionPass.h" +#include "llvm/Analysis/TargetLibraryInfo.h" +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/AsmParser/Parser.h" +#include "llvm/CodeGen/CommandFlags.h" +#include "llvm/CodeGen/TargetPassConfig.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/LLVMRemarkStreamer.h" +#include "llvm/IR/LegacyPassManager.h" +#include "llvm/IR/LegacyPassNameParser.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/ModuleSummaryIndex.h" +#include "llvm/IR/Verifier.h" +#include "llvm/IRReader/IRReader.h" +#include "llvm/InitializePasses.h" +#include "llvm/LinkAllIR.h" +#include "llvm/LinkAllPasses.h" +#include "llvm/MC/TargetRegistry.h" +#include "llvm/Passes/PassPlugin.h" +#include "llvm/Remarks/HotnessThresholdParser.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/FileSystem.h" +#include "llvm/Support/InitLLVM.h" +#include "llvm/Support/PluginLoader.h" +#include "llvm/Support/SourceMgr.h" +#include "llvm/Support/SystemUtils.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/YAMLTraits.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/TargetParser/Host.h" +#include "llvm/TargetParser/SubtargetFeature.h" +#include "llvm/TargetParser/Triple.h" +#include "llvm/Transforms/IPO/WholeProgramDevirt.h" +#include "llvm/Transforms/Utils/Cloning.h" +#include "llvm/Transforms/Utils/Debugify.h" +#include +#include +#include +using namespace llvm; +using namespace opt_tool; + +static codegen::RegisterCodeGenFlags CFG; + +// The OptimizationList is automatically populated with registered Passes by the +// PassNameParser. +static cl::list PassList(cl::desc( + "Optimizations available (use '-passes=' for the new pass manager)")); + +static cl::opt EnableLegacyPassManager( + "bugpoint-enable-legacy-pm", + cl::desc( + "Enable the legacy pass manager. This is strictly for bugpoint " + "due to it not working with the new PM, please do not use otherwise."), + cl::init(false)); + +// This flag specifies a textual description of the optimization pass pipeline +// to run over the module. This flag switches opt to use the new pass manager +// infrastructure, completely disabling all of the flags specific to the old +// pass management. +static cl::opt PassPipeline( + "passes", + cl::desc( + "A textual description of the pass pipeline. 
To have analysis passes " + "available before a certain pass, add 'require'.")); +static cl::alias PassPipeline2("p", cl::aliasopt(PassPipeline), + cl::desc("Alias for -passes")); + +static cl::opt PrintPasses("print-passes", + cl::desc("Print available passes that can be " + "specified in -passes=foo and exit")); + +static cl::opt InputFilename(cl::Positional, + cl::desc(""), + cl::init("-"), + cl::value_desc("filename")); + +static cl::opt OutputFilename("o", + cl::desc("Override output filename"), + cl::value_desc("filename")); + +static cl::opt Force("f", cl::desc("Enable binary output on terminals")); + +static cl::opt NoOutput("disable-output", + cl::desc("Do not write result bitcode file"), + cl::Hidden); + +static cl::opt OutputAssembly("S", + cl::desc("Write output as LLVM assembly")); + +static cl::opt + OutputThinLTOBC("thinlto-bc", + cl::desc("Write output as ThinLTO-ready bitcode")); + +static cl::opt + SplitLTOUnit("thinlto-split-lto-unit", + cl::desc("Enable splitting of a ThinLTO LTOUnit")); + +static cl::opt + UnifiedLTO("unified-lto", + cl::desc("Use unified LTO piplines. Ignored unless -thinlto-bc " + "is also specified."), + cl::Hidden, cl::init(false)); + +static cl::opt ThinLinkBitcodeFile( + "thin-link-bitcode-file", cl::value_desc("filename"), + cl::desc( + "A file in which to write minimized bitcode for the thin link only")); + +static cl::opt NoVerify("disable-verify", + cl::desc("Do not run the verifier"), cl::Hidden); + +static cl::opt NoUpgradeDebugInfo("disable-upgrade-debug-info", + cl::desc("Generate invalid output"), + cl::ReallyHidden); + +static cl::opt VerifyEach("verify-each", + cl::desc("Verify after each transform")); + +static cl::opt + DisableDITypeMap("disable-debug-info-type-map", + cl::desc("Don't use a uniquing type map for debug info")); + +static cl::opt + StripDebug("strip-debug", + cl::desc("Strip debugger symbol info from translation unit")); + +static cl::opt + StripNamedMetadata("strip-named-metadata", + cl::desc("Strip module-level named metadata")); + +static cl::opt + OptLevelO0("O0", cl::desc("Optimization level 0. Similar to clang -O0. " + "Same as -passes='default'")); + +static cl::opt + OptLevelO1("O1", cl::desc("Optimization level 1. Similar to clang -O1. " + "Same as -passes='default'")); + +static cl::opt + OptLevelO2("O2", cl::desc("Optimization level 2. Similar to clang -O2. " + "Same as -passes='default'")); + +static cl::opt + OptLevelOs("Os", cl::desc("Like -O2 but size-conscious. Similar to clang " + "-Os. Same as -passes='default'")); + +static cl::opt OptLevelOz( + "Oz", + cl::desc("Like -O2 but optimize for code size above all else. Similar to " + "clang -Oz. Same as -passes='default'")); + +static cl::opt + OptLevelO3("O3", cl::desc("Optimization level 3. Similar to clang -O3. 
" + "Same as -passes='default'")); + +static cl::opt CodeGenOptLevelCL( + "codegen-opt-level", + cl::desc("Override optimization level for codegen hooks, legacy PM only")); + +static cl::opt + TargetTriple("mtriple", cl::desc("Override target triple for module")); + +static cl::opt EmitSummaryIndex("module-summary", + cl::desc("Emit module summary index"), + cl::init(false)); + +static cl::opt EmitModuleHash("module-hash", cl::desc("Emit module hash"), + cl::init(false)); + +static cl::opt + DisableSimplifyLibCalls("disable-simplify-libcalls", + cl::desc("Disable simplify-libcalls")); + +static cl::list DisableBuiltins( + "disable-builtin", + cl::desc("Disable specific target library builtin function")); + +static cl::opt EnableDebugify( + "enable-debugify", + cl::desc( + "Start the pipeline with debugify and end it with check-debugify")); + +static cl::opt VerifyDebugInfoPreserve( + "verify-debuginfo-preserve", + cl::desc("Start the pipeline with collecting and end it with checking of " + "debug info preservation.")); + +static cl::opt ClDataLayout("data-layout", + cl::desc("data layout string to use"), + cl::value_desc("layout-string"), + cl::init("")); + +static cl::opt PreserveBitcodeUseListOrder( + "preserve-bc-uselistorder", + cl::desc("Preserve use-list order when writing LLVM bitcode."), + cl::init(true), cl::Hidden); + +static cl::opt PreserveAssemblyUseListOrder( + "preserve-ll-uselistorder", + cl::desc("Preserve use-list order when writing LLVM assembly."), + cl::init(false), cl::Hidden); + +static cl::opt RunTwice("run-twice", + cl::desc("Run all passes twice, re-using the " + "same pass manager (legacy PM only)."), + cl::init(false), cl::Hidden); + +static cl::opt DiscardValueNames( + "discard-value-names", + cl::desc("Discard names from Value (other than GlobalValue)."), + cl::init(false), cl::Hidden); + +static cl::opt TimeTrace("time-trace", cl::desc("Record time trace")); + +static cl::opt TimeTraceGranularity( + "time-trace-granularity", + cl::desc( + "Minimum time granularity (in microseconds) traced by time profiler"), + cl::init(500), cl::Hidden); + +static cl::opt + TimeTraceFile("time-trace-file", + cl::desc("Specify time trace file destination"), + cl::value_desc("filename")); + +static cl::opt RemarksWithHotness( + "pass-remarks-with-hotness", + cl::desc("With PGO, include profile count in optimization remarks"), + cl::Hidden); + +static cl::opt, false, remarks::HotnessThresholdParser> + RemarksHotnessThreshold( + "pass-remarks-hotness-threshold", + cl::desc("Minimum profile count required for " + "an optimization remark to be output. 
" + "Use 'auto' to apply the threshold from profile summary"), + cl::value_desc("N or 'auto'"), cl::init(0), cl::Hidden); + +static cl::opt + RemarksFilename("pass-remarks-output", + cl::desc("Output filename for pass remarks"), + cl::value_desc("filename")); + +static cl::opt + RemarksPasses("pass-remarks-filter", + cl::desc("Only record optimization remarks from passes whose " + "names match the given regular expression"), + cl::value_desc("regex")); + +static cl::opt RemarksFormat( + "pass-remarks-format", + cl::desc("The format used for serializing remarks (default: YAML)"), + cl::value_desc("format"), cl::init("yaml")); + +static cl::list + PassPlugins("load-pass-plugin", + cl::desc("Load passes from plugin library")); + +static cl::opt TryUseNewDbgInfoFormat( + "try-experimental-debuginfo-iterators", + cl::desc("Enable debuginfo iterator positions, if they're built in"), + cl::init(false), cl::Hidden); + +extern cl::opt UseNewDbgInfoFormat; + +//===----------------------------------------------------------------------===// +// CodeGen-related helper functions. +// + +static CodeGenOptLevel GetCodeGenOptLevel() { + return static_cast(unsigned(CodeGenOptLevelCL)); +} + +struct TimeTracerRAII { + TimeTracerRAII(StringRef ProgramName) { + if (TimeTrace) + timeTraceProfilerInitialize(TimeTraceGranularity, ProgramName); + } + ~TimeTracerRAII() { + if (TimeTrace) { + if (auto E = timeTraceProfilerWrite(TimeTraceFile, OutputFilename)) { + handleAllErrors(std::move(E), [&](const StringError &SE) { + errs() << SE.getMessage() << "\n"; + }); + return; + } + timeTraceProfilerCleanup(); + } + } +}; + +// For use in NPM transition. Currently this contains most codegen-specific +// passes. Remove passes from here when porting to the NPM. +// TODO: use a codegen version of PassRegistry.def/PassBuilder::is*Pass() once +// it exists. 
+static bool shouldPinPassToLegacyPM(StringRef Pass) { + std::vector PassNameExactToIgnore = { + "nvvm-reflect", + "nvvm-intr-range", + "amdgpu-simplifylib", + "amdgpu-image-intrinsic-opt", + "amdgpu-usenative", + "amdgpu-promote-alloca", + "amdgpu-promote-alloca-to-vector", + "amdgpu-lower-kernel-attributes", + "amdgpu-propagate-attributes-early", + "amdgpu-propagate-attributes-late", + "amdgpu-unify-metadata", + "amdgpu-printf-runtime-binding", + "amdgpu-always-inline"}; + if (llvm::is_contained(PassNameExactToIgnore, Pass)) + return false; + + std::vector PassNamePrefix = { + "x86-", "xcore-", "wasm-", "systemz-", "ppc-", "nvvm-", + "nvptx-", "mips-", "lanai-", "hexagon-", "bpf-", "avr-", + "thumb2-", "arm-", "si-", "gcn-", "amdgpu-", "aarch64-", + "amdgcn-", "polly-", "riscv-", "dxil-"}; + std::vector PassNameContain = {"-eh-prepare"}; + std::vector PassNameExact = { + "safe-stack", + "cost-model", + "codegenprepare", + "interleaved-load-combine", + "unreachableblockelim", + "verify-safepoint-ir", + "atomic-expand", + "expandvp", + "mve-tail-predication", + "interleaved-access", + "global-merge", + "pre-isel-intrinsic-lowering", + "expand-reductions", + "indirectbr-expand", + "generic-to-nvvm", + "expand-memcmp", + "loop-reduce", + "lower-amx-type", + "lower-amx-intrinsics", + "polyhedral-info", + "print-polyhedral-info", + "replace-with-veclib", + "jmc-instrumenter", + "dot-regions", + "dot-regions-only", + "view-regions", + "view-regions-only", + "select-optimize", + "expand-large-div-rem", + "structurizecfg", + "fix-irreducible", + "expand-large-fp-convert", + "callbrprepare", + }; + for (const auto &P : PassNamePrefix) + if (Pass.starts_with(P)) + return true; + for (const auto &P : PassNameContain) + if (Pass.contains(P)) + return true; + return llvm::is_contained(PassNameExact, Pass); +} + +// For use in NPM transition. +static bool shouldForceLegacyPM() { + for (const auto &P : PassList) { + StringRef Arg = P->getPassArgument(); + if (shouldPinPassToLegacyPM(Arg)) + return true; + } + return false; +} + +//===----------------------------------------------------------------------===// +// main for opt +// +extern "C" int optMain( + int argc, char **argv, + ArrayRef> PassBuilderCallbacks) { + InitLLVM X(argc, argv); + + // Enable debug stream buffering. + EnableDebugBuffering = true; + + InitializeAllTargets(); + InitializeAllTargetMCs(); + InitializeAllAsmPrinters(); + InitializeAllAsmParsers(); + + // Initialize passes + PassRegistry &Registry = *PassRegistry::getPassRegistry(); + initializeCore(Registry); + initializeScalarOpts(Registry); + initializeVectorization(Registry); + initializeIPO(Registry); + initializeAnalysis(Registry); + initializeTransformUtils(Registry); + initializeInstCombine(Registry); + initializeTarget(Registry); + // For codegen passes, only passes that do IR to IR transformation are + // supported. 
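+  // Each initialize*Pass call below registers the pass's static metadata
+  // with the global PassRegistry so the legacy 'opt -passname' syntax can
+  // construct the pass by name.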
+ initializeExpandLargeDivRemLegacyPassPass(Registry); + initializeExpandLargeFpConvertLegacyPassPass(Registry); + initializeExpandMemCmpLegacyPassPass(Registry); + initializeScalarizeMaskedMemIntrinLegacyPassPass(Registry); + initializeSelectOptimizePass(Registry); + initializeCallBrPreparePass(Registry); + initializeCodeGenPrepareLegacyPassPass(Registry); + initializeAtomicExpandPass(Registry); + initializeWinEHPreparePass(Registry); + initializeDwarfEHPrepareLegacyPassPass(Registry); + initializeSafeStackLegacyPassPass(Registry); + initializeSjLjEHPreparePass(Registry); + initializePreISelIntrinsicLoweringLegacyPassPass(Registry); + initializeGlobalMergePass(Registry); + initializeIndirectBrExpandLegacyPassPass(Registry); + initializeInterleavedLoadCombinePass(Registry); + initializeInterleavedAccessPass(Registry); + initializeUnreachableBlockElimLegacyPassPass(Registry); + initializeExpandReductionsPass(Registry); + initializeExpandVectorPredicationPass(Registry); + initializeWasmEHPreparePass(Registry); + initializeWriteBitcodePassPass(Registry); + initializeReplaceWithVeclibLegacyPass(Registry); + initializeJMCInstrumenterPass(Registry); + + SmallVector PluginList; + PassPlugins.setCallback([&](const std::string &PluginPath) { + auto Plugin = PassPlugin::Load(PluginPath); + if (!Plugin) + report_fatal_error(Plugin.takeError(), /*gen_crash_diag=*/false); + PluginList.emplace_back(Plugin.get()); + }); + + // Register the Target and CPU printer for --version. + cl::AddExtraVersionPrinter(sys::printDefaultTargetAndDetectedCPU); + + cl::ParseCommandLineOptions( + argc, argv, "llvm .bc -> .bc modular optimizer and analysis printer\n"); + + // RemoveDIs debug-info transition: tests may request that we /try/ to use the + // new debug-info format, if it's built in. +#ifdef EXPERIMENTAL_DEBUGINFO_ITERATORS + if (TryUseNewDbgInfoFormat) { + // If LLVM was built with support for this, turn the new debug-info format + // on. + UseNewDbgInfoFormat = true; + } +#endif + (void)TryUseNewDbgInfoFormat; + + LLVMContext Context; + + // TODO: remove shouldForceLegacyPM(). + const bool UseNPM = (!EnableLegacyPassManager && !shouldForceLegacyPM()) || + PassPipeline.getNumOccurrences() > 0; + + if (UseNPM && !PassList.empty()) { + errs() << "The `opt -passname` syntax for the new pass manager is " + "not supported, please use `opt -passes=` (or the `-p` " + "alias for a more concise version).\n"; + errs() << "See https://llvm.org/docs/NewPassManager.html#invoking-opt " + "for more details on the pass pipeline syntax.\n\n"; + return 1; + } + + if (!UseNPM && PluginList.size()) { + errs() << argv[0] << ": " << PassPlugins.ArgStr + << " specified with legacy PM.\n"; + return 1; + } + + // FIXME: once the legacy PM code is deleted, move runPassPipeline() here and + // construct the PassBuilder before parsing IR so we can reuse the same + // PassBuilder for print passes. + if (PrintPasses) { + printPasses(outs()); + return 0; + } + + TimeTracerRAII TimeTracer(argv[0]); + + SMDiagnostic Err; + + Context.setDiscardValueNames(DiscardValueNames); + if (!DisableDITypeMap) + Context.enableDebugTypeODRUniquing(); + + Expected> RemarksFileOrErr = + setupLLVMOptimizationRemarks(Context, RemarksFilename, RemarksPasses, + RemarksFormat, RemarksWithHotness, + RemarksHotnessThreshold); + if (Error E = RemarksFileOrErr.takeError()) { + errs() << toString(std::move(E)) << '\n'; + return 1; + } + std::unique_ptr RemarksFile = std::move(*RemarksFileOrErr); + + // Load the input module... 
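+  // Priority order implemented by the SetDataLayout callback below: an
+  // explicit -data-layout string wins, then a layout already present in the
+  // IR, then a layout inferred from the target triple (with -mtriple
+  // overriding the IR triple); with no triple at all, the layout stays empty.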
+ auto SetDataLayout = [&](StringRef IRTriple, + StringRef IRLayout) -> std::optional { + // Data layout specified on the command line has the highest priority. + if (!ClDataLayout.empty()) + return ClDataLayout; + // If an explicit data layout is already defined in the IR, don't infer. + if (!IRLayout.empty()) + return std::nullopt; + + // If an explicit triple was specified (either in the IR or on the + // command line), use that to infer the default data layout. However, the + // command line target triple should override the IR file target triple. + std::string TripleStr = + TargetTriple.empty() ? IRTriple.str() : Triple::normalize(TargetTriple); + // If the triple string is still empty, we don't fall back to + // sys::getDefaultTargetTriple() since we do not want to have differing + // behaviour dependent on the configured default triple. Therefore, if the + // user did not pass -mtriple or define an explicit triple/datalayout in + // the IR, we should default to an empty (default) DataLayout. + if (TripleStr.empty()) + return std::nullopt; + // Otherwise we infer the DataLayout from the target machine. + Expected> ExpectedTM = + codegen::createTargetMachineForTriple(TripleStr, GetCodeGenOptLevel()); + if (!ExpectedTM) { + errs() << argv[0] << ": warning: failed to infer data layout: " + << toString(ExpectedTM.takeError()) << "\n"; + return std::nullopt; + } + return (*ExpectedTM)->createDataLayout().getStringRepresentation(); + }; + std::unique_ptr M; + if (NoUpgradeDebugInfo) + M = parseAssemblyFileWithIndexNoUpgradeDebugInfo( + InputFilename, Err, Context, nullptr, SetDataLayout) + .Mod; + else + M = parseIRFile(InputFilename, Err, Context, + ParserCallbacks(SetDataLayout)); + + if (!M) { + Err.print(argv[0], errs()); + return 1; + } + + // Strip debug info before running the verifier. + if (StripDebug) + StripDebugInfo(*M); + + // Erase module-level named metadata, if requested. + if (StripNamedMetadata) { + while (!M->named_metadata_empty()) { + NamedMDNode *NMD = &*M->named_metadata_begin(); + M->eraseNamedMetadata(NMD); + } + } + + // If we are supposed to override the target triple, do so now. + if (!TargetTriple.empty()) + M->setTargetTriple(Triple::normalize(TargetTriple)); + + // Immediately run the verifier to catch any problems before starting up the + // pass pipelines. Otherwise we can crash on broken code during + // doInitialization(). + if (!NoVerify && verifyModule(*M, &errs())) { + errs() << argv[0] << ": " << InputFilename + << ": error: input module is broken!\n"; + return 1; + } + + // Enable testing of whole program devirtualization on this module by invoking + // the facility for updating public visibility to linkage unit visibility when + // specified by an internal option. This is normally done during LTO which is + // not performed via opt. + updateVCallVisibilityInModule( + *M, + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); + + // Figure out what stream we are supposed to write to... + std::unique_ptr Out; + std::unique_ptr ThinLinkOut; + if (NoOutput) { + if (!OutputFilename.empty()) + errs() << "WARNING: The -o (output filename) option is ignored when\n" + "the --disable-output option is used.\n"; + } else { + // Default to standard output. 
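+    // Note that ToolOutputFile treats the filename "-" as standard output
+    // rather than creating a file on disk.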
+ if (OutputFilename.empty()) + OutputFilename = "-"; + + std::error_code EC; + sys::fs::OpenFlags Flags = + OutputAssembly ? sys::fs::OF_TextWithCRLF : sys::fs::OF_None; + Out.reset(new ToolOutputFile(OutputFilename, EC, Flags)); + if (EC) { + errs() << EC.message() << '\n'; + return 1; + } + + if (!ThinLinkBitcodeFile.empty()) { + ThinLinkOut.reset( + new ToolOutputFile(ThinLinkBitcodeFile, EC, sys::fs::OF_None)); + if (EC) { + errs() << EC.message() << '\n'; + return 1; + } + } + } + + Triple ModuleTriple(M->getTargetTriple()); + std::string CPUStr, FeaturesStr; + std::unique_ptr TM; + if (ModuleTriple.getArch()) { + CPUStr = codegen::getCPUStr(); + FeaturesStr = codegen::getFeaturesStr(); + Expected> ExpectedTM = + codegen::createTargetMachineForTriple(ModuleTriple.str(), + GetCodeGenOptLevel()); + if (auto E = ExpectedTM.takeError()) { + errs() << argv[0] << ": WARNING: failed to create target machine for '" + << ModuleTriple.str() << "': " << toString(std::move(E)) << "\n"; + } else { + TM = std::move(*ExpectedTM); + } + } else if (ModuleTriple.getArchName() != "unknown" && + ModuleTriple.getArchName() != "") { + errs() << argv[0] << ": unrecognized architecture '" + << ModuleTriple.getArchName() << "' provided.\n"; + return 1; + } + + // Override function attributes based on CPUStr, FeaturesStr, and command line + // flags. + codegen::setFunctionAttributes(CPUStr, FeaturesStr, *M); + + // If the output is set to be emitted to standard out, and standard out is a + // console, print out a warning message and refuse to do it. We don't + // impress anyone by spewing tons of binary goo to a terminal. + if (!Force && !NoOutput && !OutputAssembly) + if (CheckBitcodeOutputToConsole(Out->os())) + NoOutput = true; + + if (OutputThinLTOBC) { + M->addModuleFlag(Module::Error, "EnableSplitLTOUnit", SplitLTOUnit); + if (UnifiedLTO) + M->addModuleFlag(Module::Error, "UnifiedLTO", 1); + } + + // Add an appropriate TargetLibraryInfo pass for the module's triple. + TargetLibraryInfoImpl TLII(ModuleTriple); + + // The -disable-simplify-libcalls flag actually disables all builtin optzns. + if (DisableSimplifyLibCalls) + TLII.disableAllFunctions(); + else { + // Disable individual builtin functions in TargetLibraryInfo. + LibFunc F; + for (auto &FuncName : DisableBuiltins) + if (TLII.getLibFunc(FuncName, F)) + TLII.setUnavailable(F); + else { + errs() << argv[0] << ": cannot disable nonexistent builtin function " + << FuncName << '\n'; + return 1; + } + } + + if (UseNPM) { + if (legacy::debugPassSpecified()) { + errs() << "-debug-pass does not work with the new PM, either use " + "-debug-pass-manager, or use the legacy PM\n"; + return 1; + } + auto NumOLevel = OptLevelO0 + OptLevelO1 + OptLevelO2 + OptLevelO3 + + OptLevelOs + OptLevelOz; + if (NumOLevel > 1) { + errs() << "Cannot specify multiple -O#\n"; + return 1; + } + if (NumOLevel > 0 && (PassPipeline.getNumOccurrences() > 0)) { + errs() << "Cannot specify -O# and --passes=/--foo-pass, use " + "-passes='default,other-pass'\n"; + return 1; + } + std::string Pipeline = PassPipeline; + + if (OptLevelO0) + Pipeline = "default"; + if (OptLevelO1) + Pipeline = "default"; + if (OptLevelO2) + Pipeline = "default"; + if (OptLevelO3) + Pipeline = "default"; + if (OptLevelOs) + Pipeline = "default"; + if (OptLevelOz) + Pipeline = "default"; + OutputKind OK = OK_NoOutput; + if (!NoOutput) + OK = OutputAssembly + ? OK_OutputAssembly + : (OutputThinLTOBC ? 
OK_OutputThinLTOBitcode : OK_OutputBitcode); + + VerifierKind VK = VK_VerifyOut; + if (NoVerify) + VK = VK_NoVerifier; + else if (VerifyEach) + VK = VK_VerifyEachPass; + + // The user has asked to use the new pass manager and provided a pipeline + // string. Hand off the rest of the functionality to the new code for that + // layer. + return runPassPipeline( + argv[0], *M, TM.get(), &TLII, Out.get(), ThinLinkOut.get(), + RemarksFile.get(), Pipeline, PluginList, PassBuilderCallbacks, + OK, VK, PreserveAssemblyUseListOrder, + PreserveBitcodeUseListOrder, EmitSummaryIndex, EmitModuleHash, + EnableDebugify, VerifyDebugInfoPreserve, UnifiedLTO) + ? 0 + : 1; + } + + if (OptLevelO0 || OptLevelO1 || OptLevelO2 || OptLevelOs || OptLevelOz || + OptLevelO3) { + errs() << "Cannot use -O# with legacy PM.\n"; + return 1; + } + if (EmitSummaryIndex) { + errs() << "Cannot use -module-summary with legacy PM.\n"; + return 1; + } + if (EmitModuleHash) { + errs() << "Cannot use -module-hash with legacy PM.\n"; + return 1; + } + if (OutputThinLTOBC) { + errs() << "Cannot use -thinlto-bc with legacy PM.\n"; + return 1; + } + // Create a PassManager to hold and optimize the collection of passes we are + // about to build. If the -debugify-each option is set, wrap each pass with + // the (-check)-debugify passes. + DebugifyCustomPassManager Passes; + DebugifyStatsMap DIStatsMap; + DebugInfoPerPass DebugInfoBeforePass; + if (DebugifyEach) { + Passes.setDebugifyMode(DebugifyMode::SyntheticDebugInfo); + Passes.setDIStatsMap(DIStatsMap); + } else if (VerifyEachDebugInfoPreserve) { + Passes.setDebugifyMode(DebugifyMode::OriginalDebugInfo); + Passes.setDebugInfoBeforePass(DebugInfoBeforePass); + if (!VerifyDIPreserveExport.empty()) + Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); + } + + bool AddOneTimeDebugifyPasses = + (EnableDebugify && !DebugifyEach) || + (VerifyDebugInfoPreserve && !VerifyEachDebugInfoPreserve); + + Passes.add(new TargetLibraryInfoWrapperPass(TLII)); + + // Add internal analysis passes from the target machine. + Passes.add(createTargetTransformInfoWrapperPass(TM ? TM->getTargetIRAnalysis() + : TargetIRAnalysis())); + + if (AddOneTimeDebugifyPasses) { + if (EnableDebugify) { + Passes.setDIStatsMap(DIStatsMap); + Passes.add(createDebugifyModulePass()); + } else if (VerifyDebugInfoPreserve) { + Passes.setDebugInfoBeforePass(DebugInfoBeforePass); + Passes.add(createDebugifyModulePass(DebugifyMode::OriginalDebugInfo, "", + &(Passes.getDebugInfoPerPass()))); + } + } + + if (TM) { + // FIXME: We should dyn_cast this when supported. + auto <M = static_cast(*TM); + Pass *TPC = LTM.createPassConfig(Passes); + Passes.add(TPC); + } + + // Create a new optimization pass for each one specified on the command line + for (unsigned i = 0; i < PassList.size(); ++i) { + const PassInfo *PassInf = PassList[i]; + if (PassInf->getNormalCtor()) { + Pass *P = PassInf->getNormalCtor()(); + if (P) { + // Add the pass to the pass manager. + Passes.add(P); + // If we are verifying all of the intermediate steps, add the verifier. 
+ if (VerifyEach) + Passes.add(createVerifierPass()); + } + } else + errs() << argv[0] << ": cannot create pass: " << PassInf->getPassName() + << "\n"; + } + + // Check that the module is well formed on completion of optimization + if (!NoVerify && !VerifyEach) + Passes.add(createVerifierPass()); + + if (AddOneTimeDebugifyPasses) { + if (EnableDebugify) + Passes.add(createCheckDebugifyModulePass(false)); + else if (VerifyDebugInfoPreserve) { + if (!VerifyDIPreserveExport.empty()) + Passes.setOrigDIVerifyBugsReportFilePath(VerifyDIPreserveExport); + Passes.add(createCheckDebugifyModulePass( + false, "", nullptr, DebugifyMode::OriginalDebugInfo, + &(Passes.getDebugInfoPerPass()), VerifyDIPreserveExport)); + } + } + + // In run twice mode, we want to make sure the output is bit-by-bit + // equivalent if we run the pass manager again, so setup two buffers and + // a stream to write to them. Note that llc does something similar and it + // may be worth to abstract this out in the future. + SmallVector Buffer; + SmallVector FirstRunBuffer; + std::unique_ptr BOS; + raw_ostream *OS = nullptr; + + const bool ShouldEmitOutput = !NoOutput; + + // Write bitcode or assembly to the output as the last step... + if (ShouldEmitOutput || RunTwice) { + assert(Out); + OS = &Out->os(); + if (RunTwice) { + BOS = std::make_unique(Buffer); + OS = BOS.get(); + } + if (OutputAssembly) + Passes.add(createPrintModulePass(*OS, "", PreserveAssemblyUseListOrder)); + else + Passes.add(createBitcodeWriterPass(*OS, PreserveBitcodeUseListOrder)); + } + + // Before executing passes, print the final values of the LLVM options. + cl::PrintOptionValues(); + + if (!RunTwice) { + // Now that we have all of the passes ready, run them. + Passes.run(*M); + } else { + // If requested, run all passes twice with the same pass manager to catch + // bugs caused by persistent state in the passes. + std::unique_ptr M2(CloneModule(*M)); + // Run all passes on the original module first, so the second run processes + // the clone to catch CloneModule bugs. + Passes.run(*M); + FirstRunBuffer = Buffer; + Buffer.clear(); + + Passes.run(*M2); + + // Compare the two outputs and make sure they're the same + assert(Out); + if (Buffer.size() != FirstRunBuffer.size() || + (memcmp(Buffer.data(), FirstRunBuffer.data(), Buffer.size()) != 0)) { + errs() + << "Running the pass manager twice changed the output.\n" + "Writing the result of the second run to the specified output.\n" + "To generate the one-run comparison binary, just run without\n" + "the compile-twice option\n"; + if (ShouldEmitOutput) { + Out->os() << BOS->str(); + Out->keep(); + } + if (RemarksFile) + RemarksFile->keep(); + return 1; + } + if (ShouldEmitOutput) + Out->os() << BOS->str(); + } + + if (DebugifyEach && !DebugifyExport.empty()) + exportDebugifyStats(DebugifyExport, Passes.getDebugifyStatsMap()); + + // Declare success. 
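+  // ToolOutputFile removes its file in the destructor unless keep() is
+  // called, so each output that should survive the tool's exit is kept
+  // explicitly below.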
+ if (!NoOutput) + Out->keep(); + + if (RemarksFile) + RemarksFile->keep(); + + if (ThinLinkOut) + ThinLinkOut->keep(); + + return 0; +} diff --git a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel index d4cb0cdb55b52..af219dd743ce7 100644 --- a/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/llvm/BUILD.bazel @@ -4814,8 +4814,8 @@ cc_binary( ], ) -cc_binary( - name = "opt", +cc_library( + name = "opt-driver", srcs = glob([ "tools/opt/*.cpp", "tools/opt/*.h", @@ -4826,7 +4826,6 @@ cc_binary( "@platforms//os:macos": [], "//conditions:default": ["-Wl,--export-dynamic"], }), - stamp = 0, deps = [ ":AllTargetsAsmParsers", ":AllTargetsCodeGens", @@ -4853,6 +4852,12 @@ cc_binary( ], ) +cc_binary( + name = "opt", + stamp = 0, + deps = [":opt-driver"] +) + gentbl( name = "SancovOptsTableGen", strip_include_prefix = "tools/sancov", From 982996a8b413815733a3871e6d753f46d32e0e07 Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Wed, 24 Jan 2024 19:39:36 -0800 Subject: [PATCH 836/843] Fix bad conflict resolution --- llvm/tools/opt/optdriver.cpp | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/llvm/tools/opt/optdriver.cpp b/llvm/tools/opt/optdriver.cpp index 3f66bfc9f0176..722e56de4ee8b 100644 --- a/llvm/tools/opt/optdriver.cpp +++ b/llvm/tools/opt/optdriver.cpp @@ -318,6 +318,8 @@ struct TimeTracerRAII { // it exists. static bool shouldPinPassToLegacyPM(StringRef Pass) { std::vector PassNameExactToIgnore = { + "globaloffset", + "localaccessortosharedmemory", "nvvm-reflect", "nvvm-intr-range", "amdgpu-simplifylib", @@ -373,6 +375,7 @@ static bool shouldPinPassToLegacyPM(StringRef Pass) { "structurizecfg", "fix-irreducible", "expand-large-fp-convert", + "fpbuiltin-fn-selection", "callbrprepare", }; for (const auto &P : PassNamePrefix) @@ -446,6 +449,16 @@ extern "C" int optMain( initializeWriteBitcodePassPass(Registry); initializeReplaceWithVeclibLegacyPass(Registry); initializeJMCInstrumenterPass(Registry); + initializeSYCLLowerWGScopeLegacyPassPass(Registry); + initializeSYCLLowerESIMDLegacyPassPass(Registry); + initializeSYCLLowerInvokeSimdLegacyPassPass(Registry); + initializeSPIRITTAnnotationsLegacyPassPass(Registry); + initializeESIMDLowerLoadStorePass(Registry); + initializeESIMDVerifierPass(Registry); + initializeSYCLLowerWGLocalMemoryLegacyPass(Registry); + initializeSYCLMutatePrintfAddrspaceLegacyPassPass(Registry); + initializeFPBuiltinFnSelectionLegacyPassPass(Registry); + SmallVector PluginList; PassPlugins.setCallback([&](const std::string &PluginPath) { From ba6699dd6b234a10f5731e8850f9a81fa45121b9 Mon Sep 17 00:00:00 2001 From: LU-JOHN <111294400+LU-JOHN@users.noreply.github.com> Date: Mon, 22 Jan 2024 04:23:18 -0600 Subject: [PATCH 837/843] Revert change to Fortran array test (#2307) Revert change to Fortran array test. Also rewrite it to DAG checks to avoid output order problems in future. Previous change was in: > Author: Viktoria Maximova > Date: Tue Jan 16 17:16:33 2024 +0100 > > Fix DebugInfo/NonSemantic/Shader200/FortranArray.ll (#2303) > > The change is needed after llvm/llvm-project@fc6faa11 > The LLVM change that motivated the change to FortranArray.ll has been reverted: > Author: Davide Italiano > Date: Tue Jan 16 16:56:24 2024 -0800 > > Revert "[CloneFunction][DebugInfo] Avoid cloning DILocalVariables of inlined functions (#75385)" > > This reverts commit fc6faa1113e9069f41b5500db051210af0eea843. 
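As background on the directive semantics: plain CHECK lines must match the
tool's output in the order they are written, while a run of consecutive
CHECK-DAG lines may match in any order, which is what makes the rewritten
test robust against reordering. A minimal sketch of the difference, borrowing
two labels from the test below rather than reproducing it:

    ; Brittle: fails if DebugTypeSubrange is emitted before DebugTypeBasic.
    ; CHECK: DebugTypeBasic
    ; CHECK: DebugTypeSubrange

    ; Robust: accepts either emission order.
    ; CHECK-DAG: DebugTypeBasic
    ; CHECK-DAG: DebugTypeSubrange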
Original commit:
https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/7bf3fb4
---
 .../test/DebugInfo/NonSemantic/Shader200/FortranArray.ll | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/llvm-spirv/test/DebugInfo/NonSemantic/Shader200/FortranArray.ll b/llvm-spirv/test/DebugInfo/NonSemantic/Shader200/FortranArray.ll
index 2f098251d31cc..e42ea45bb6785 100644
--- a/llvm-spirv/test/DebugInfo/NonSemantic/Shader200/FortranArray.ll
+++ b/llvm-spirv/test/DebugInfo/NonSemantic/Shader200/FortranArray.ll
@@ -7,11 +7,11 @@
 ; RUN: llvm-dis %t.rev.bc -o - | FileCheck %s --check-prefix=CHECK-LLVM

 ; CHECK-SPIRV: [[#CompUnit:]] [[#]] DebugCompilationUnit
-; CHECK-SPIRV-DAG: [[#EntryFunc:]] [[#]] DebugFunction [[#]]
 ; CHECK-SPIRV-DAG: [[#None:]] [[#]] DebugInfoNone
-; CHECK-SPIRV: [[#BaseTy:]] [[#]] DebugTypeBasic
-; CHECK-SPIRV: [[#Subrange:]] [[#]] DebugTypeSubrange
-; CHECK-SPIRV: DebugTypeArrayDynamic [[#BaseTy]] [[#]] [[#]] [[#None]] [[#None]] [[#Subrange]]
+; CHECK-SPIRV-DAG: [[#BaseTy:]] [[#]] DebugTypeBasic
+; CHECK-SPIRV-DAG: [[#Subrange:]] [[#]] DebugTypeSubrange
+; CHECK-SPIRV-DAG: DebugTypeArrayDynamic [[#BaseTy]] [[#]] [[#]] [[#None]] [[#None]] [[#Subrange]]
+; CHECK-SPIRV-DAG: [[#EntryFunc:]] [[#]] DebugFunction [[#]]
 ; CHECK-SPIRV: DebugEntryPoint [[#EntryFunc]] [[#CompUnit]] [[#]] [[#]] {{$}}

 ; CHECK-LLVM: !DICompileUnit(language: DW_LANG_Fortran95

From a130112567aff3ddec4eca155ae374894628931a Mon Sep 17 00:00:00 2001
From: Vyacheslav Levytskyy <89994100+VyacheslavLevytskyy@users.noreply.github.com>
Date: Tue, 23 Jan 2024 13:44:30 +0100
Subject: [PATCH 838/843] add API call to display general information about
 the module (#2298)

Partially load a SPIR-V module from the stream and decode only the
instructions needed to retrieve general information about it:
capabilities, extensions, version, memory model and addressing model.

Beyond the lists of capabilities and extensions declared in the SPIR-V
module, which are immediately useful to back-ends, the intent is to
extend the report in the future with further potentially useful
analyses, statistics, etc.

Original commit:
https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/918036c
---
 llvm-spirv/include/LLVMSPIRVLib.h             | 29 ++++
 llvm-spirv/include/LLVMSPIRVOpts.h            | 21 +++
 llvm-spirv/lib/SPIRV/SPIRVReader.cpp          | 131 ++++++++++++++++++
 .../lib/SPIRV/libSPIRV/SPIRVErrorEnum.h       | 6 +
 llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp | 26 +---
 .../test/negative/spirv_report_bad_input.spt  | 33 +++++
 llvm-spirv/test/spirv_report.spt              | 43 ++++++
 llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp    | 42 +++++-
 8 files changed, 306 insertions(+), 25 deletions(-)
 create mode 100644 llvm-spirv/test/negative/spirv_report_bad_input.spt
 create mode 100644 llvm-spirv/test/spirv_report.spt

diff --git a/llvm-spirv/include/LLVMSPIRVLib.h b/llvm-spirv/include/LLVMSPIRVLib.h
index 599c3a201ef9a..48253374bc0e1 100644
--- a/llvm-spirv/include/LLVMSPIRVLib.h
+++ b/llvm-spirv/include/LLVMSPIRVLib.h
@@ -106,6 +106,35 @@ std::unique_ptr<SPIRVModule> readSpirvModule(std::istream &IS,
 const SPIRV::TranslatorOpts &Opts,
 std::string &ErrMsg);

+struct SPIRVModuleReport {
+  SPIRV::VersionNumber Version;
+  uint32_t MemoryModel;
+  uint32_t AddrModel;
+  std::vector<std::string> Extensions;
+  std::vector<std::string> ExtendedInstructionSets;
+  std::vector<uint32_t> Capabilities;
+};
+/// \brief Partially load SPIR-V from the stream and decode only selected
+/// instructions that are needed to retrieve general information
+/// about the module.
If this call fails, readSPIRVModule is +/// expected to fail as well. +/// \returns nullopt on failure. +std::optional getSpirvReport(std::istream &IS); +std::optional getSpirvReport(std::istream &IS, int &ErrCode); + +struct SPIRVModuleTextReport { + std::string Version; + std::string MemoryModel; + std::string AddrModel; + std::vector Extensions; + std::vector ExtendedInstructionSets; + std::vector Capabilities; +}; +/// \brief Create a human-readable form of the report returned by a call to +/// getSpirvReport by decoding its binary fields. +/// \returns String with the human-readable report. +SPIRVModuleTextReport formatSpirvReport(const SPIRVModuleReport &Report); + } // End namespace SPIRV namespace llvm { diff --git a/llvm-spirv/include/LLVMSPIRVOpts.h b/llvm-spirv/include/LLVMSPIRVOpts.h index 33cf162098331..e9706930d1b7e 100644 --- a/llvm-spirv/include/LLVMSPIRVOpts.h +++ b/llvm-spirv/include/LLVMSPIRVOpts.h @@ -68,6 +68,27 @@ enum class VersionNumber : uint32_t { MaximumVersion = SPIRV_1_4 }; +inline constexpr std::string_view formatVersionNumber(uint32_t Version) { + switch (Version) { + case static_cast(VersionNumber::SPIRV_1_0): + return "1.0"; + case static_cast(VersionNumber::SPIRV_1_1): + return "1.1"; + case static_cast(VersionNumber::SPIRV_1_2): + return "1.2"; + case static_cast(VersionNumber::SPIRV_1_3): + return "1.3"; + case static_cast(VersionNumber::SPIRV_1_4): + return "1.4"; + } + return "unknown"; +} + +inline bool isSPIRVVersionKnown(uint32_t Ver) { + return Ver >= static_cast(VersionNumber::MinimumVersion) && + Ver <= static_cast(VersionNumber::MaximumVersion); +} + enum class ExtensionID : uint32_t { First, #define EXT(X) X, diff --git a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp index 7b087ed0a33ce..d93197e8dc7e1 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVReader.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVReader.cpp @@ -4848,6 +4848,137 @@ Instruction *SPIRVToLLVM::transRelational(SPIRVInstruction *I, BasicBlock *BB) { return cast(Mutator.getMutated()); } +std::optional getSpirvReport(std::istream &IS) { + int IgnoreErrCode; + return getSpirvReport(IS, IgnoreErrCode); +} + +std::optional getSpirvReport(std::istream &IS, + int &ErrCode) { + SPIRVWord Word; + std::string Name; + std::unique_ptr BM(SPIRVModule::createSPIRVModule()); + SPIRVDecoder D(IS, *BM); + D >> Word; + if (Word != MagicNumber) { + ErrCode = SPIRVEC_InvalidMagicNumber; + return {}; + } + D >> Word; + if (!isSPIRVVersionKnown(Word)) { + ErrCode = SPIRVEC_InvalidVersionNumber; + return {}; + } + SPIRVModuleReport Report; + Report.Version = static_cast(Word); + // Skip: Generator’s magic number, Bound and Reserved word + D.ignore(3); + + bool IsReportGenCompleted = false, IsMemoryModelDefined = false; + while (!IS.bad() && !IsReportGenCompleted && D.getWordCountAndOpCode()) { + switch (D.OpCode) { + case OpCapability: + D >> Word; + Report.Capabilities.push_back(Word); + break; + case OpExtension: + Name.clear(); + D >> Name; + Report.Extensions.push_back(Name); + break; + case OpExtInstImport: + Name.clear(); + D >> Word >> Name; + Report.ExtendedInstructionSets.push_back(Name); + break; + case OpMemoryModel: + if (IsMemoryModelDefined) { + ErrCode = SPIRVEC_RepeatedMemoryModel; + return {}; + } + SPIRVAddressingModelKind AddrModel; + SPIRVMemoryModelKind MemoryModel; + D >> AddrModel >> MemoryModel; + if (!isValid(AddrModel)) { + ErrCode = SPIRVEC_InvalidAddressingModel; + return {}; + } + if (!isValid(MemoryModel)) { + ErrCode = SPIRVEC_InvalidMemoryModel; + return {}; + 
} + Report.MemoryModel = MemoryModel; + Report.AddrModel = AddrModel; + IsMemoryModelDefined = true; + // In this report we don't analyze instructions after OpMemoryModel + IsReportGenCompleted = true; + break; + default: + // No more instructions to gather information about + IsReportGenCompleted = true; + } + } + if (IS.bad()) { + ErrCode = SPIRVEC_InvalidModule; + return {}; + } + if (!IsMemoryModelDefined) { + ErrCode = SPIRVEC_UnspecifiedMemoryModel; + return {}; + } + ErrCode = SPIRVEC_Success; + return std::make_optional(std::move(Report)); +} + +constexpr std::string_view formatAddressingModel(uint32_t AddrModel) { + switch (AddrModel) { + case AddressingModelLogical: + return "Logical"; + case AddressingModelPhysical32: + return "Physical32"; + case AddressingModelPhysical64: + return "Physical64"; + case AddressingModelPhysicalStorageBuffer64: + return "PhysicalStorageBuffer64"; + default: + return "Unknown"; + } +} + +constexpr std::string_view formatMemoryModel(uint32_t MemoryModel) { + switch (MemoryModel) { + case MemoryModelSimple: + return "Simple"; + case MemoryModelGLSL450: + return "GLSL450"; + case MemoryModelOpenCL: + return "OpenCL"; + case MemoryModelVulkan: + return "Vulkan"; + default: + return "Unknown"; + } +} + +SPIRVModuleTextReport formatSpirvReport(const SPIRVModuleReport &Report) { + SPIRVModuleTextReport TextReport; + TextReport.Version = + formatVersionNumber(static_cast(Report.Version)); + TextReport.AddrModel = formatAddressingModel(Report.AddrModel); + TextReport.MemoryModel = formatMemoryModel(Report.MemoryModel); + // format capability codes as strings + std::string Name; + for (auto Capability : Report.Capabilities) { + bool Found = SPIRVCapabilityNameMap::find( + static_cast(Capability), &Name); + TextReport.Capabilities.push_back(Found ? 
Name : "Unknown"); + } + // other fields with string content can be copied as is + TextReport.Extensions = Report.Extensions; + TextReport.ExtendedInstructionSets = Report.ExtendedInstructionSets; + return TextReport; +} + std::unique_ptr readSpirvModule(std::istream &IS, const SPIRV::TranslatorOpts &Opts, std::string &ErrMsg) { diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVErrorEnum.h b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVErrorEnum.h index 68dc0bf6f1f5a..04251d0ebd6d3 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVErrorEnum.h +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVErrorEnum.h @@ -22,3 +22,9 @@ _SPIRV_OP(InvalidWordCount, _SPIRV_OP(Requires1_1, "Feature requires SPIR-V 1.1 or greater:") _SPIRV_OP(RequiresExtension, "Feature requires the following SPIR-V extension:\n") +_SPIRV_OP(InvalidMagicNumber, + "Invalid Magic Number.") +_SPIRV_OP(InvalidVersionNumber, + "Invalid Version Number.") +_SPIRV_OP(UnspecifiedMemoryModel, "Unspecified Memory Model.") +_SPIRV_OP(RepeatedMemoryModel, "Expects a single OpMemoryModel instruction.") diff --git a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp index 4b16148145b96..0fac0ee8f0b7c 100644 --- a/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp +++ b/llvm-spirv/lib/SPIRV/libSPIRV/SPIRVModule.cpp @@ -2094,27 +2094,7 @@ void SPIRVModuleImpl::addUnknownStructField(SPIRVTypeStruct *Struct, unsigned I, } static std::string to_string(uint32_t Version) { - std::string Res; - switch (Version) { - case static_cast(VersionNumber::SPIRV_1_0): - Res = "1.0"; - break; - case static_cast(VersionNumber::SPIRV_1_1): - Res = "1.1"; - break; - case static_cast(VersionNumber::SPIRV_1_2): - Res = "1.2"; - break; - case static_cast(VersionNumber::SPIRV_1_3): - Res = "1.3"; - break; - case static_cast(VersionNumber::SPIRV_1_4): - Res = "1.4"; - break; - default: - Res = "unknown"; - } - + std::string Res(formatVersionNumber(Version)); Res += " (" + std::to_string(Version) + ")"; return Res; } @@ -2139,9 +2119,7 @@ std::istream &operator>>(std::istream &I, SPIRVModule &M) { } Decoder >> MI.SPIRVVersion; - bool SPIRVVersionIsKnown = - static_cast(VersionNumber::MinimumVersion) <= MI.SPIRVVersion && - MI.SPIRVVersion <= static_cast(VersionNumber::MaximumVersion); + bool SPIRVVersionIsKnown = isSPIRVVersionKnown(MI.SPIRVVersion); if (!M.getErrorLog().checkError( SPIRVVersionIsKnown, SPIRVEC_InvalidModule, "unsupported SPIR-V version number '" + to_string(MI.SPIRVVersion) + diff --git a/llvm-spirv/test/negative/spirv_report_bad_input.spt b/llvm-spirv/test/negative/spirv_report_bad_input.spt new file mode 100644 index 0000000000000..202cd702a64c6 --- /dev/null +++ b/llvm-spirv/test/negative/spirv_report_bad_input.spt @@ -0,0 +1,33 @@ +; RUN: llvm-spirv %s -to-binary -o %t.spv +; The next line is to corrupt the binary file by changing its Magic Number +; RUN: echo "0" > %t_corrupted.spv && cat %t.spv >> %t_corrupted.spv +; RUN: not llvm-spirv --spirv-print-report %t_corrupted.spv 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; +; CHECK-ERROR: Invalid SPIR-V binary + +119734787 65536 393230 10 0 +2 Capability Addresses +2 Capability Kernel +2 Capability LoopFuseINTEL +2 Capability BitInstructions +6 Extension "SPV_INTEL_loop_fuse" +8 Extension "SPV_KHR_bit_instructions" +5 ExtInstImport 1 "OpenCL.std" +3 MemoryModel 1 2 +7 EntryPoint 6 5 "TestSatPacked" +3 Source 3 102000 + +5 Decorate 5 FuseLoopsInFunctionINTEL 3 1 +4 TypeInt 3 32 0 +2 TypeVoid 2 +5 TypeFunction 4 2 3 3 + +5 Function 2 5 0 4 +3 FunctionParameter 3 6 +3 
FunctionParameter 3 7 + +2 Label 8 +4 BitReverse 3 9 6 +1 Return + +1 FunctionEnd diff --git a/llvm-spirv/test/spirv_report.spt b/llvm-spirv/test/spirv_report.spt new file mode 100644 index 0000000000000..a8d89abee353b --- /dev/null +++ b/llvm-spirv/test/spirv_report.spt @@ -0,0 +1,43 @@ +; RUN: llvm-spirv %s -to-binary -o %t.spv +; RUN: llvm-spirv --spirv-print-report %t.spv | FileCheck %s --check-prefix=CHECK-DAG + +; CHECK-DAG: Version: 1.0 +; CHECK-DAG: Memory model: OpenCL +; CHECK-DAG: Addressing model: Physical32 +; CHECK-DAG: Number of capabilities: 4 +; CHECK-DAG: Capability: Addresses +; CHECK-DAG: Capability: Kernel +; CHECK-DAG: Capability: LoopFuseINTEL +; CHECK-DAG: Capability: BitInstructions +; CHECK-DAG: Number of extensions: 2 +; CHECK-DAG: Extension: SPV_INTEL_loop_fuse +; CHECK-DAG: Extension: SPV_KHR_bit_instructions +; CHECK-DAG: Number of extended instruction sets: 1 +; CHECK-DAG: Extended Instruction Set: OpenCL.std + +119734787 65536 393230 10 0 +2 Capability Addresses +2 Capability Kernel +2 Capability LoopFuseINTEL +2 Capability BitInstructions +6 Extension "SPV_INTEL_loop_fuse" +8 Extension "SPV_KHR_bit_instructions" +5 ExtInstImport 1 "OpenCL.std" +3 MemoryModel 1 2 +7 EntryPoint 6 5 "TestSatPacked" +3 Source 3 102000 + +5 Decorate 5 FuseLoopsInFunctionINTEL 3 1 +4 TypeInt 3 32 0 +2 TypeVoid 2 +5 TypeFunction 4 2 3 3 + +5 Function 2 5 0 4 +3 FunctionParameter 3 6 +3 FunctionParameter 3 7 + +2 Label 8 +4 BitReverse 3 9 6 +1 Return + +1 FunctionEnd diff --git a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp index cecf8333dea75..d8eb2fd786945 100644 --- a/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp +++ b/llvm-spirv/tools/llvm-spirv/llvm-spirv.cpp @@ -196,6 +196,12 @@ static cl::opt SpecConstInfo( cl::desc("Display id of constants available for specializaion and their " "size in bytes")); +static cl::opt + SPIRVPrintReport("spirv-print-report", cl::init(false), + cl::desc("Display general information about the module " + "(capabilities, extensions, version, memory model" + " and addressing model)")); + static cl::opt FPCMode( "spirv-fp-contract", cl::desc("Set FP Contraction mode:"), cl::init(SPIRV::FPContractMode::On), @@ -797,7 +803,7 @@ int main(int Ac, char **Av) { return convertSPIRV(); #endif - if (!IsReverse && !IsRegularization && !SpecConstInfo) + if (!IsReverse && !IsRegularization && !SpecConstInfo && !SPIRVPrintReport) return convertLLVMToSPIRV(Opts); if (IsReverse && IsRegularization) { @@ -824,5 +830,39 @@ int main(int Ac, char **Av) { << ", size in bytes = " << SpecConst.Size << ", type = " << SpecConst.Type << "\n"; } + + if (SPIRVPrintReport) { + std::ifstream IFS(InputFile, std::ios::binary); + int ErrCode = 0; + std::optional BinReport = + SPIRV::getSpirvReport(IFS, ErrCode); + if (!BinReport) { + std::cerr << "Invalid SPIR-V binary, error code is " << ErrCode << "\n"; + return -1; + } + + SPIRV::SPIRVModuleTextReport TextReport = + SPIRV::formatSpirvReport(BinReport.value()); + + std::cout << "SPIR-V module report:" + << "\n Version: " << TextReport.Version + << "\n Memory model: " << TextReport.MemoryModel + << "\n Addressing model: " << TextReport.AddrModel << "\n"; + + std::cout << " Number of capabilities: " << TextReport.Capabilities.size() + << "\n"; + for (auto &Capability : TextReport.Capabilities) + std::cout << " Capability: " << Capability << "\n"; + + std::cout << " Number of extensions: " << TextReport.Extensions.size() + << "\n"; + for (auto &Extension : TextReport.Extensions) + std::cout << 
" Extension: " << Extension << "\n"; + + std::cout << " Number of extended instruction sets: " + << TextReport.ExtendedInstructionSets.size() << "\n"; + for (auto &ExtendedInstructionSet : TextReport.ExtendedInstructionSets) + std::cout << " Extended Instruction Set: " << ExtendedInstructionSet << "\n"; + } return 0; } From d671e8d64996c952fa6ab5226fcb71b5e5fb2f72 Mon Sep 17 00:00:00 2001 From: LU-JOHN <111294400+LU-JOHN@users.noreply.github.com> Date: Wed, 24 Jan 2024 04:45:49 -0600 Subject: [PATCH 839/843] Test behavior of common global variables (#2306) Add tests to ensure that: A common global without addrspace is reported as an error A common addrspace global is not transformed to a locally allocated variable Updated error message from "can not" to "cannot". Otherwise this is a NFC. Signed-off-by: Lu, John Original commit: https://github.com/KhronosGroup/SPIRV-LLVM-Translator/commit/8c357de --- llvm-spirv/lib/SPIRV/SPIRVWriter.cpp | 2 +- .../negative/CommonGlobalVarNoAddrspace.ll | 24 +++++++++++++++++++ .../{ => negative}/GlobalVarNoAddrspace.ll | 10 +++++++- .../transcoding/CommonAddrspaceGlobalVar.ll | 20 ++++++++++++++++ 4 files changed, 54 insertions(+), 2 deletions(-) create mode 100644 llvm-spirv/test/negative/CommonGlobalVarNoAddrspace.ll rename llvm-spirv/test/{ => negative}/GlobalVarNoAddrspace.ll (51%) create mode 100644 llvm-spirv/test/transcoding/CommonAddrspaceGlobalVar.ll diff --git a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp index 8decf3c34d386..6065650703269 100644 --- a/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp +++ b/llvm-spirv/lib/SPIRV/SPIRVWriter.cpp @@ -2005,7 +2005,7 @@ LLVMToSPIRVBase::transValueWithoutDecoration(Value *V, SPIRVBasicBlock *BB, StorageClass = SPIRSPIRVAddrSpaceMap::map(AddressSpace); if (StorageClass == StorageClassFunction) { std::stringstream SS; - SS << "Global variable can not have Function storage class. " + SS << "Global variable cannot have Function storage class. " << "Consider setting a proper address space.\n " << "Original LLVM value:\n" << toString(GV); diff --git a/llvm-spirv/test/negative/CommonGlobalVarNoAddrspace.ll b/llvm-spirv/test/negative/CommonGlobalVarNoAddrspace.ll new file mode 100644 index 0000000000000..2550d31e1c4f8 --- /dev/null +++ b/llvm-spirv/test/negative/CommonGlobalVarNoAddrspace.ll @@ -0,0 +1,24 @@ +; Ensure "common global" with no addrspace is reported as an error +; since the SPIR-V spec states: +; +; global variable declarations must always have an address space +; specified and that address space cannot be `0` + +; RUN: llvm-as %s -o %t.bc +; RUN: not llvm-spirv %t.bc 2>&1 \ +; RUN: | FileCheck %s --check-prefix=CHECK-ERROR + +; CHECK-ERROR: InvalidInstruction: Can't translate llvm instruction: +; CHECK-ERROR-NEXT: Global variable cannot have Function storage class. Consider setting a proper address space. 
+; CHECK-ERROR-NEXT: Original LLVM value: +; CHECK-ERROR-NEXT: @CG = common global i32 0, align 4 + +target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" +target triple = "spir64-unknown-unknown" + +@CG = common global i32 0, align 4 + +define i32 @f() #0 { + %1 = load i32, i32* @CG, align 4 + ret i32 %1 +} diff --git a/llvm-spirv/test/GlobalVarNoAddrspace.ll b/llvm-spirv/test/negative/GlobalVarNoAddrspace.ll similarity index 51% rename from llvm-spirv/test/GlobalVarNoAddrspace.ll rename to llvm-spirv/test/negative/GlobalVarNoAddrspace.ll index c6a68694170fe..fa86402fe845a 100644 --- a/llvm-spirv/test/GlobalVarNoAddrspace.ll +++ b/llvm-spirv/test/negative/GlobalVarNoAddrspace.ll @@ -1,9 +1,17 @@ +; Ensure "global" with no addrspace is reported as an error +; since the SPIR-V spec states: +; +; global variable declarations must always have an address space +; specified and that address space cannot be `0` + ; RUN: llvm-as %s -o %t.bc ; RUN: not llvm-spirv %t.bc 2>&1 \ ; RUN: | FileCheck %s --check-prefix=CHECK-ERROR ; CHECK-ERROR: InvalidInstruction: Can't translate llvm instruction: -; CHECK-ERROR-NEXT: Global variable can not have Function storage class. Consider setting a proper address space. +; CHECK-ERROR-NEXT: Global variable cannot have Function storage class. Consider setting a proper address space. +; CHECK-ERROR-NEXT: Original LLVM value: +; CHECK-ERROR-NEXT: @G = global i1 true target datalayout = "e-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-n8:16:32:64" target triple = "spir64-unknown-unknown" diff --git a/llvm-spirv/test/transcoding/CommonAddrspaceGlobalVar.ll b/llvm-spirv/test/transcoding/CommonAddrspaceGlobalVar.ll new file mode 100644 index 0000000000000..dfec2e006df33 --- /dev/null +++ b/llvm-spirv/test/transcoding/CommonAddrspaceGlobalVar.ll @@ -0,0 +1,20 @@ +; Ensure that a "common global" is not converted to a locally allocated +; variable when translated to SPIR-V and back to LLVM. 
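+; A failure here would typically show up as the global being rewritten into
+; a function-local alloca during the round trip, which the CHECK-LLVM-NOT
+; lines below rule out.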
+ +; RUN: llvm-as %s -o %t.bc +; RUN: llvm-spirv %t.bc -o %t.spv +; RUN: llvm-spirv -r %t.spv -o %t.rev.bc +; RUN: llvm-dis < %t.rev.bc | FileCheck %s --check-prefix=CHECK-LLVM + +; CHECK-LLVM-NOT: alloca +; CHECK-LLVM: @CAG = common addrspace(1) global i32 0, align 4 +; CHECK-LLVM-NOT: alloca + +target triple = "spir64-unknown-unknown" + +@CAG = common addrspace(1) global i32 0, align 4 + +define i32 @f() #0 { + %1 = load i32, i32 addrspace(1) * @CAG, align 4 + ret i32 %1 +} From 3d4c6c7631ed91039ffae824182ba5aeb5bcb657 Mon Sep 17 00:00:00 2001 From: sys_ce_bb Date: Thu, 25 Jan 2024 07:33:36 -0800 Subject: [PATCH 840/843] Update add-ir-annotations tests after 5518a9d7673b --- .../AST/ast-attr-add-ir-annotations-pack.cpp | 378 +++++++++--------- .../attr-add-ir-annotations-packs.cpp | 8 +- 2 files changed, 186 insertions(+), 200 deletions(-) diff --git a/clang/test/AST/ast-attr-add-ir-annotations-pack.cpp b/clang/test/AST/ast-attr-add-ir-annotations-pack.cpp index 05ba9fc34bb14..10798849357fb 100644 --- a/clang/test/AST/ast-attr-add-ir-annotations-pack.cpp +++ b/clang/test/AST/ast-attr-add-ir-annotations-pack.cpp @@ -252,12 +252,9 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName2' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName3' 'const char[6]' + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate3 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: SYCLAddIRAnnotationsMemberAttr @@ -265,20 +262,23 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Names - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName1' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Names - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName2' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... 
Names - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName3' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'int' // CHECK-NEXT: value: Int 1 // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 1 @@ -290,10 +290,10 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3 // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate3 'void () noexcept' // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate3 'void (const ClassWithAnnotFieldTemplate3 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate3 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate3 'void (ClassWithAnnotFieldTemplate3 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate3 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate3 'void (const ClassWithAnnotFieldTemplate3<&AttrName1[0], &AttrName2[0], &AttrName3[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate3<&AttrName1[0], &AttrName2[0], &AttrName3[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate3 'void (ClassWithAnnotFieldTemplate3<&AttrName1[0], &AttrName2[0], &AttrName3[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate3<&AttrName1[0], &AttrName2[0], &AttrName3[0]> &&' // CHECK-NEXT: ClassTemplateSpecializationDecl {{.*}} struct ClassWithAnnotFieldTemplate3 definition // CHECK-NEXT: DefinitionData // CHECK-NEXT: DefaultConstructor @@ -303,18 +303,16 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName2' 'const char[6]' + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate3 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate3 'void () noexcept' // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate3 'void (const ClassWithAnnotFieldTemplate3 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate3 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate3 'void (ClassWithAnnotFieldTemplate3 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate3 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate3 'void (const ClassWithAnnotFieldTemplate3<&AttrName1[0], &AttrName2[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate3<&AttrName1[0], &AttrName2[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate3 'void (ClassWithAnnotFieldTemplate3<&AttrName1[0], &AttrName2[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate3<&AttrName1[0], &AttrName2[0]> &&' ClassWithAnnotFieldTemplate3 InstantiatedCWAFS5; ClassWithAnnotFieldTemplate3 
InstantiatedCWAFS6; @@ -354,12 +352,9 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName2' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName3' 'const char[6]' + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate4 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: SYCLAddIRAnnotationsMemberAttr @@ -370,20 +365,23 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Names - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName1' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Names - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName2' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... 
Names - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName3' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'int' // CHECK-NEXT: value: Int 1 // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 1 @@ -395,10 +393,10 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: IntegerLiteral {{.*}} 'int' 3 // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate4 'void () noexcept' // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate4 'void (const ClassWithAnnotFieldTemplate4 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate4 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate4 'void (ClassWithAnnotFieldTemplate4 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate4 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate4 'void (const ClassWithAnnotFieldTemplate4<&AttrName1[0], &AttrName2[0], &AttrName3[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate4<&AttrName1[0], &AttrName2[0], &AttrName3[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate4 'void (ClassWithAnnotFieldTemplate4<&AttrName1[0], &AttrName2[0], &AttrName3[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate4<&AttrName1[0], &AttrName2[0], &AttrName3[0]> &&' // CHECK-NEXT: ClassTemplateSpecializationDecl {{.*}} struct ClassWithAnnotFieldTemplate4 definition // CHECK-NEXT: DefinitionData // CHECK-NEXT: DefaultConstructor @@ -408,18 +406,16 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName2' 'const char[6]' + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate4 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate4 'void () noexcept' inline default trivial // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate4 'void (const ClassWithAnnotFieldTemplate4 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate4 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate4 'void (ClassWithAnnotFieldTemplate4 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate4 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate4 'void (const ClassWithAnnotFieldTemplate4<&AttrName1[0], &AttrName2[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate4<&AttrName1[0], &AttrName2[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate4 'void (ClassWithAnnotFieldTemplate4<&AttrName1[0], &AttrName2[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate4<&AttrName1[0], &AttrName2[0]> &&' ClassWithAnnotFieldTemplate4 InstantiatedCWAFS7; 
ClassWithAnnotFieldTemplate4 InstantiatedCWAFS8; @@ -447,10 +443,8 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal1' 'const char[5]' + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate5 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: SYCLAddIRAnnotationsMemberAttr @@ -458,20 +452,22 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName1' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal1' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate5 'void () noexcept' // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (const ClassWithAnnotFieldTemplate5 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate5 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (ClassWithAnnotFieldTemplate5 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate5 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (const ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrVal1[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrVal1[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrVal1[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrVal1[0]> &&' // CHECK-NEXT: ClassTemplateSpecializationDecl {{.*}} struct ClassWithAnnotFieldTemplate5 definition // CHECK-NEXT: DefinitionData // CHECK-NEXT: DefaultConstructor @@ -481,14 +477,10 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName2' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal1' 'const char[5]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal2' 'const char[5]' + // CHECK-NEXT: TemplateArgument 
+ // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate5 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: SYCLAddIRAnnotationsMemberAttr @@ -496,32 +488,36 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName1' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName2' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal1' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... 
Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal2' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate5 'void () noexcept' // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (const ClassWithAnnotFieldTemplate5 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate5 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (ClassWithAnnotFieldTemplate5 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate5 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (const ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrVal1[0], &AttrVal2[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrVal1[0], &AttrVal2[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrVal1[0], &AttrVal2[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrVal1[0], &AttrVal2[0]> &&' // CHECK-NEXT: ClassTemplateSpecializationDecl {{.*}} struct ClassWithAnnotFieldTemplate5 definition // CHECK-NEXT: DefinitionData // CHECK-NEXT: DefaultConstructor @@ -531,18 +527,12 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName2' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName3' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal1' 'const char[5]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal2' 'const char[5]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal3' 'const char[5]' + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate5 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: SYCLAddIRAnnotationsMemberAttr @@ -550,44 +540,50 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... 
Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName1' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName2' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName3' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal1' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal2' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... 
Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal3' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate5 'void () noexcept' // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (const ClassWithAnnotFieldTemplate5 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate5 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (ClassWithAnnotFieldTemplate5 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate5 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (const ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0], &AttrVal3[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0], &AttrVal3[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0], &AttrVal3[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0], &AttrVal3[0]> &&' // CHECK-NEXT: ClassTemplateSpecializationDecl {{.*}} struct ClassWithAnnotFieldTemplate5 definition // CHECK-NEXT: DefinitionData // CHECK-NEXT: DefaultConstructor @@ -597,24 +593,19 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName2' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName3' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal1' 'const char[5]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal2' 'const char[5]' + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate5 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate5 'void () noexcept' // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (const ClassWithAnnotFieldTemplate5 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate5 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (ClassWithAnnotFieldTemplate5 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate5 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (const ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate5<&AttrName1[0], 
&AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate5 'void (ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0]> &&' ClassWithAnnotFieldTemplate5 InstantiatedCWAFS9; ClassWithAnnotFieldTemplate5 InstantiatedCWAFS10; ClassWithAnnotFieldTemplate5 InstantiatedCWAFS11; @@ -647,10 +638,8 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal1' 'const char[5]' + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate6 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: SYCLAddIRAnnotationsMemberAttr @@ -661,20 +650,22 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName1' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... 
Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal1' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate6 'void () noexcept' // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (const ClassWithAnnotFieldTemplate6 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate6 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (ClassWithAnnotFieldTemplate6 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate6 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (const ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrVal1[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrVal1[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrVal1[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrVal1[0]> &&' // CHECK-NEXT: ClassTemplateSpecializationDecl {{.*}} struct ClassWithAnnotFieldTemplate6 definition // CHECK-NEXT: DefinitionData // CHECK-NEXT: DefaultConstructor @@ -684,14 +675,10 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName2' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal1' 'const char[5]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal2' 'const char[5]' + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate6 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: SYCLAddIRAnnotationsMemberAttr @@ -702,32 +689,36 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName1' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... 
Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName2' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal1' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} col:26 referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal2' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate6 'void () noexcept' // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (const ClassWithAnnotFieldTemplate6 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate6 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (ClassWithAnnotFieldTemplate6 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate6 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (const ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrVal1[0], &AttrVal2[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrVal1[0], &AttrVal2[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrVal1[0], &AttrVal2[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrVal1[0], &AttrVal2[0]> &&' // CHECK-NEXT: ClassTemplateSpecializationDecl {{.*}} struct ClassWithAnnotFieldTemplate6 definition // CHECK-NEXT: DefinitionData // CHECK-NEXT: DefaultConstructor @@ -737,18 +728,12 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName2' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName3' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal1' 'const char[5]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal2' 'const char[5]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal3' 'const char[5]' + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: 
TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate6 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: SYCLAddIRAnnotationsMemberAttr @@ -759,44 +744,50 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName1' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName2' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[6]' lvalue Var {{.*}} 'AttrName3' 'const char[6]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal1' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal2' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' // CHECK-NEXT: value: LValue // CHECK-NEXT: SubstNonTypeTemplateParmExpr {{.*}} 'const char *' // CHECK-NEXT: NonTypeTemplateParmDecl {{.*}} referenced 'const char *' depth 0 index 0 ... 
Strs - // CHECK-NEXT: ImplicitCastExpr {{.*}} 'const char *' - // CHECK-NEXT: DeclRefExpr {{.*}} 'const char[5]' lvalue Var {{.*}} 'AttrVal3' 'const char[5]' + // CHECK-NEXT: ConstantExpr {{.*}} 'const char *' + // CHECK-NEXT: value: LValue + // CHECK-NEXT: OpaqueValueExpr {{.*}} 'const char *' // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate6 'void () noexcept' // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (const ClassWithAnnotFieldTemplate6 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate6 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (ClassWithAnnotFieldTemplate6 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate6 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (const ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0], &AttrVal3[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0], &AttrVal3[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0], &AttrVal3[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0], &AttrVal3[0]> &&' // CHECK-NEXT: ClassTemplateSpecializationDecl {{.*}} struct ClassWithAnnotFieldTemplate6 definition // CHECK-NEXT: DefinitionData // CHECK-NEXT: DefaultConstructor @@ -806,24 +797,19 @@ void InstantiateClassWithAnnotFieldTemplates() { // CHECK-NEXT: MoveAssignment // CHECK-NEXT: Destructor // CHECK-NEXT: TemplateArgument pack - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName1' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName2' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrName3' 'const char[6]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal1' 'const char[5]' - // CHECK-NEXT: TemplateArgument decl - // CHECK-NEXT: Var {{.*}} 'AttrVal2' 'const char[5]' + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument + // CHECK-NEXT: TemplateArgument // CHECK-NEXT: CXXRecordDecl {{.*}} implicit struct ClassWithAnnotFieldTemplate6 // CHECK-NEXT: FieldDecl {{.*}} referenced ptr 'int *' // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit used ClassWithAnnotFieldTemplate6 'void () noexcept' // CHECK-NEXT: CompoundStmt - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (const ClassWithAnnotFieldTemplate6 &)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate6 &' - // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (ClassWithAnnotFieldTemplate6 &&)' - // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate6 &&' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (const ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0]> &)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'const ClassWithAnnotFieldTemplate6<&AttrName1[0], 
&AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0]> &' + // CHECK-NEXT: CXXConstructorDecl {{.*}} implicit constexpr ClassWithAnnotFieldTemplate6 'void (ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0]> &&)' + // CHECK-NEXT: ParmVarDecl {{.*}} 'ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0]> &&' ClassWithAnnotFieldTemplate6 InstantiatedCWAFS13; ClassWithAnnotFieldTemplate6 InstantiatedCWAFS14; ClassWithAnnotFieldTemplate6 InstantiatedCWAFS15; diff --git a/clang/test/SemaSYCL/attr-add-ir-annotations-packs.cpp b/clang/test/SemaSYCL/attr-add-ir-annotations-packs.cpp index 33e80417bb390..b2681774cce41 100644 --- a/clang/test/SemaSYCL/attr-add-ir-annotations-packs.cpp +++ b/clang/test/SemaSYCL/attr-add-ir-annotations-packs.cpp @@ -37,20 +37,20 @@ void InstantiateClassWithAnnotFieldTemplates() { ClassWithAnnotFieldTemplate2<1, 2> InstantiatedCWAFS4; // expected-note {{in instantiation of template class 'ClassWithAnnotFieldTemplate2<1, 2>' requested here}} ClassWithAnnotFieldTemplate3 InstantiatedCWAFS5; - ClassWithAnnotFieldTemplate3 InstantiatedCWAFS6; // expected-note {{in instantiation of template class 'ClassWithAnnotFieldTemplate3' requested here}} + ClassWithAnnotFieldTemplate3 InstantiatedCWAFS6; // expected-note {{in instantiation of template class 'ClassWithAnnotFieldTemplate3<&AttrName1[0], &AttrName2[0]>' requested here}} ClassWithAnnotFieldTemplate4 InstantiatedCWAFS7; - ClassWithAnnotFieldTemplate4 InstantiatedCWAFS8; // expected-note {{in instantiation of template class 'ClassWithAnnotFieldTemplate4' requested here}} + ClassWithAnnotFieldTemplate4 InstantiatedCWAFS8; // expected-note {{in instantiation of template class 'ClassWithAnnotFieldTemplate4<&AttrName1[0], &AttrName2[0]>' requested here}} ClassWithAnnotFieldTemplate5 InstantiatedCWAFS9; ClassWithAnnotFieldTemplate5 InstantiatedCWAFS10; ClassWithAnnotFieldTemplate5 InstantiatedCWAFS11; - ClassWithAnnotFieldTemplate5 InstantiatedCWAFS12; // expected-note {{in instantiation of template class 'ClassWithAnnotFieldTemplate5' requested here}} + ClassWithAnnotFieldTemplate5 InstantiatedCWAFS12; // expected-note {{in instantiation of template class 'ClassWithAnnotFieldTemplate5<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0]>' requested here}} ClassWithAnnotFieldTemplate6 InstantiatedCWAFS13; ClassWithAnnotFieldTemplate6 InstantiatedCWAFS14; ClassWithAnnotFieldTemplate6 InstantiatedCWAFS15; - ClassWithAnnotFieldTemplate6 InstantiatedCWAFS16; // expected-note {{in instantiation of template class 'ClassWithAnnotFieldTemplate6' requested here}} + ClassWithAnnotFieldTemplate6 InstantiatedCWAFS16; // expected-note {{in instantiation of template class 'ClassWithAnnotFieldTemplate6<&AttrName1[0], &AttrName2[0], &AttrName3[0], &AttrVal1[0], &AttrVal2[0]>' requested here}} (void)*InstantiatedCWAFS1.ptr; (void)*InstantiatedCWAFS2.ptr; From 93bcfa2314014e3b8ef88756ec0ba2317a9ac54e Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Wed, 24 Jan 2024 12:32:10 -0800 Subject: [PATCH 841/843] [Driver,test] Add --target= to unsupported-option-gpu.c --- clang/test/Driver/unsupported-option-gpu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/test/Driver/unsupported-option-gpu.c b/clang/test/Driver/unsupported-option-gpu.c index 225f3be7c7624..f23cb71ebfb08 100644 --- a/clang/test/Driver/unsupported-option-gpu.c +++ b/clang/test/Driver/unsupported-option-gpu.c @@ -1,5 +1,5 @@ /// Some 
target-specific options are ignored for GPU, so %clang exits with code 0.
-// DEFINE: %{check} = %clang -### -c -mcmodel=medium
+// DEFINE: %{check} = %clang -### --target=x86_64-linux-gnu -c -mcmodel=medium

 // RUN: %{check} -x cuda %s --cuda-path=%S/Inputs/CUDA/usr/local/cuda --offload-arch=sm_60 --no-cuda-version-check -fbasic-block-sections=all
 // RUN: %{check} -x hip %s --rocm-path=%S/Inputs/rocm -nogpulib -nogpuinc

From ece5ab16b89dd9eae7259ba8cbb5e26d77b5fdff Mon Sep 17 00:00:00 2001
From: sys_ce_bb
Date: Mon, 29 Jan 2024 07:14:03 -0800
Subject: [PATCH 842/843] XFAIL joint_matrix_bf16_fill_k_cache_unroll.cpp on GPU

Passes without https://github.com/llvm/llvm-project/commit/90ba33099cbb
---
 .../Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp
index 18238e4896ccb..90678f3f414b1 100644
--- a/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp
+++ b/sycl/test-e2e/Matrix/XMX8/joint_matrix_bf16_fill_k_cache_unroll.cpp
@@ -6,6 +6,7 @@
 //
 //===----------------------------------------------------------------------===//
 // REQUIRES: matrix-xmx8
+// XFAIL: gpu

 // RUN: %{build} -mllvm -inline-threshold=2000 -ffp-model=precise -o %t.out -DMANUAL_UNROLL
 // RUN: %{run} %t.out

From e2a8b3e510bd7c3b8b7d0b2c8a36d08ce1fd8ffe Mon Sep 17 00:00:00 2001
From: sys_ce_bb
Date: Mon, 29 Jan 2024 07:09:37 -0800
Subject: [PATCH 843/843] XFAIL internalize_vfunc.cpp on AMD HIP and CUDA

Passes without https://github.com/llvm/llvm-project/commit/90ba33099cbb
---
 sycl/test-e2e/KernelFusion/internalize_vfunc.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
index 278395f2a90fd..a538099a2aa8f 100644
--- a/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
+++ b/sycl/test-e2e/KernelFusion/internalize_vfunc.cpp
@@ -1,5 +1,6 @@
 // RUN: %{build} -fsycl-embed-ir -O2 -o %t.out
 // RUN: %{run} %t.out
+// XFAIL: hip,cuda

 // Test complete fusion with private internalization specified on the
 // accessors for a device kernel with sycl::vec::load and sycl::vec::store.
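
For context on the test touched by PATCH 843/843: in the sycl_ext_codeplay_kernel_fusion
extension, private internalization is requested by attaching the promote_private property
to the accessors of an intermediate buffer, and the test's comment says its kernels move
data through sycl::vec::load and sycl::vec::store. The sketch below shows roughly what
such a fusion test looks like. It is a minimal illustration based on the documented
extension API, not the actual body of internalize_vfunc.cpp; the kernel names, sizes, and
arithmetic are invented, and it assumes a DPC++ build with the fusion extension enabled
(hence -fsycl-embed-ir in the test's RUN line).

// Sketch only: private internalization with vec load/store across two kernels
// that the fusion JIT may combine. Names and values are illustrative.
#include <sycl/sycl.hpp>
#include <vector>

namespace fusion = sycl::ext::codeplay::experimental;

int main() {
  constexpr size_t N = 1024; // Assumed size, not taken from the real test.
  std::vector<int> In(N, 1), Out(N, 0);
  sycl::queue Q{fusion::property::queue::enable_fusion{}};
  {
    sycl::buffer<int> BIn{In}, BTmp{sycl::range<1>{N}}, BOut{Out};
    fusion::fusion_wrapper FW{Q};
    FW.start_fusion();
    Q.submit([&](sycl::handler &CGH) {
      sycl::accessor AIn{BIn, CGH, sycl::read_only};
      // promote_private asks the fusion JIT to internalize this intermediate
      // accessor into private memory instead of global memory.
      sycl::accessor ATmp{BTmp, CGH, sycl::write_only,
                          sycl::property_list{fusion::property::promote_private{}}};
      CGH.parallel_for<class Producer>(sycl::range<1>{N / 4}, [=](sycl::id<1> I) {
        sycl::vec<int, 4> V;
        V.load(I[0], AIn.get_multi_ptr<sycl::access::decorated::no>());
        V = V * 2;
        V.store(I[0], ATmp.get_multi_ptr<sycl::access::decorated::no>());
      });
    });
    Q.submit([&](sycl::handler &CGH) {
      sycl::accessor ATmp{BTmp, CGH, sycl::read_only,
                          sycl::property_list{fusion::property::promote_private{}}};
      sycl::accessor AOut{BOut, CGH, sycl::write_only};
      CGH.parallel_for<class Consumer>(sycl::range<1>{N / 4}, [=](sycl::id<1> I) {
        sycl::vec<int, 4> V;
        V.load(I[0], ATmp.get_multi_ptr<sycl::access::decorated::no>());
        V = V + 1;
        V.store(I[0], AOut.get_multi_ptr<sycl::access::decorated::no>());
      });
    });
    // Fuse the two submitted kernels into one; when internalization succeeds,
    // the intermediate buffer never materializes in global memory.
    FW.complete_fusion({fusion::property::no_barriers{}});
  }
  // Each element goes 1 -> 2 -> 3 across the two fused kernels.
  return Out[0] == 3 ? 0 : 1;
}

When a backend cannot JIT the fused kernel (as on the HIP and CUDA targets the patch
marks XFAIL), the runtime falls back to running the kernels unfused, which is what the
test's CHECK lines are expected to catch.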